diff --git a/.bazelrc b/.bazelrc
index 49e9fdb83b9e..8664d43f8680 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -137,9 +137,6 @@ build:windows --experimental_strict_action_env=true
 # Verbose failure logs when something goes wrong
 build:windows --verbose_failures
 
-# On windows, we never cross compile
-build:windows --distinct_host_configuration=false
-
 # Suppress all warning messages.
 build:short_logs --output_filter=DONT_MATCH_ANYTHING
 build:verbose_logs --output_filter=
diff --git a/.bazelversion b/.bazelversion
new file mode 100644
index 000000000000..1e20ec35c642
--- /dev/null
+++ b/.bazelversion
@@ -0,0 +1 @@
+5.4.0
\ No newline at end of file
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 17b6f699a330..a200d9d64547 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,29 +1,8 @@
-FROM python:3.9
-
-# https://code.visualstudio.com/docs/remote/containers-advanced#_creating-a-nonroot-user
-ARG USERNAME=keras-vscode
-ARG USER_UID=1000
-ARG USER_GID=$USER_UID
-
-# Create the user
-RUN groupadd --gid $USER_GID $USERNAME \
-    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
-    #
-    # [Optional] Add sudo support. Omit if you don't need to install software after connecting.
-    && apt-get update \
-    && apt-get install -y sudo bash \
-    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
-    && chmod 0440 /etc/sudoers.d/$USERNAME
+FROM mcr.microsoft.com/vscode/devcontainers/python:3.9
+COPY setup.sh /setup.sh
 
 # Install Bazel
-RUN apt update
-RUN apt install curl gnupg -y
-RUN curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor > bazel.gpg
-RUN mv bazel.gpg /etc/apt/trusted.gpg.d/
-RUN echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
-RUN apt update && apt install bazel -y
-
-USER $USERNAME
-ENV PATH="/home/$USERNAME/.local/bin:${PATH}"
-
-CMD ["/bin/bash"]
\ No newline at end of file
+RUN sudo apt install wget -y
+RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-linux-amd64
+RUN chmod a+x bazelisk-linux-amd64
+RUN mv bazelisk-linux-amd64 /usr/bin/bazel
\ No newline at end of file
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index cc164d3f85c2..9c7b688f524d 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
 {
     "dockerFile": "Dockerfile",
-    "postCreateCommand": "pip install -r requirements.txt && pip uninstall keras-nightly -y",
+    "postCreateCommand": "sh /setup.sh",
     "extensions": ["ms-python.python"],
     "settings": {
         "files.watcherExclude": {
@@ -8,8 +8,6 @@
         },
         "search.exclude": {
             "**/bazel-*/**": true
-        },
-        "terminal.integrated.defaultProfile.linux": "bash"
-    },
-    "remoteUser": "keras-vscode"
-}
+        }
+    }
+}
\ No newline at end of file
diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh
new file mode 100644
index 000000000000..dc6232affd6e
--- /dev/null
+++ b/.devcontainer/setup.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+sudo pip install -r requirements.txt
+sudo pip uninstall keras-nightly -y
+
+wget https://github.com/cli/cli/releases/download/v2.17.0/gh_2.17.0_linux_amd64.deb -P /tmp
+sudo apt install /tmp/gh_2.17.0_linux_amd64.deb -y
\ No newline at end of file
diff --git a/.github/bot_config.yml b/.github/bot_config.yml
index 11cb9eb6cccf..758d1c24fce9 100644
--- a/.github/bot_config.yml
+++ b/.github/bot_config.yml
@@ -16,4 +16,3 @@
 # A list of assignees
 assignees:
    - tilakrayal
-   - sushreebarsa
diff --git a/.github/stale.yml b/.github/stale.yml
deleted file mode 100644
index f0432f4a8d56..000000000000
--- a/.github/stale.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-# Number of days of inactivity before an Issue or Pull Request becomes stale
-daysUntilStale: 7
-# Number of days of inactivity before a stale Issue or Pull Request is closed
-daysUntilClose: 7
-# Only issues or pull requests with all of these labels are checked if stale. Defaults to `[]` (disabled)
-onlyLabels:
-  - stat:awaiting response
-# Comment to post when marking as stale. Set to `false` to disable
-markComment: >
-  This issue has been automatically marked as stale because it has no
-  recent activity. It will be closed if no further activity occurs. Thank you.
-# Comment to post when removing the stale label. Set to `false` to disable
-unmarkComment: false
-closeComment: >
-  Closing as stale. Please reopen if you'd like to work on this further.
-limitPerRun: 30
-# Limit to only `issues` or `pulls`
-only: issues
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index b1b8fc1866ae..68e0256ba2b3 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -3,8 +3,13 @@ name: Format the code
 on:
   workflow_dispatch:
 
+permissions: {}
 jobs:
   createPullRequest:
+    permissions:
+      contents: write  # to create branch (peter-evans/create-pull-request)
+      pull-requests: write  # to create a PR (peter-evans/create-pull-request)
+
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
@@ -23,17 +28,17 @@ jobs:
             ${{ runner.os }}-pip-
       - name: Install dependencies
         run: |
-          pip install -r requirements.txt && pip uninstall keras-nightly -y
+          pip install black==22.3.0 isort==5.10.1 flake8==4.0.1
       - name: Format the code
-        run: black --line-length 80 keras
+        run: sh shell/format.sh
       - name: Create Pull Request
         id: cpr
         uses: peter-evans/create-pull-request@v4
         with:
           commit-message: format the code
-          committer: TensorFlower Gardener
-          author: TensorFlower Gardener
+          committer: A. Unique TensorFlower
+          author: A. Unique TensorFlower
           branch: format
           delete-branch: true
           title: 'Format the code'
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000000..66388041bc5b
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,34 @@
+name: Lint
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+permissions:
+  contents: read  # to fetch code (actions/checkout)
+
+jobs:
+  lint:
+    name: Check the code format
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Get pip cache dir
+        id: pip-cache
+        run: |
+          python -m pip install --upgrade pip setuptools
+          echo "::set-output name=dir::$(pip cache dir)"
+      - name: pip cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ steps.pip-cache.outputs.dir }}
+          key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+      - name: Install dependencies
+        run: |
+          pip install black==22.3.0 isort==5.10.1 flake8==4.0.1
+      - name: Lint the code
+        run: sh shell/lint.sh
diff --git a/.github/workflows/stale-issues-pr.yml b/.github/workflows/stale-issues-pr.yml
new file mode 100644
index 000000000000..3eab7a47959f
--- /dev/null
+++ b/.github/workflows/stale-issues-pr.yml
@@ -0,0 +1,47 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "30 1 * * *"
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - name: Awaiting response issues
+        uses: actions/stale@v5
+        with:
+          days-before-issue-stale: 14
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          # Reason for closing the issue; the default value is not_planned
+          close-issue-reason: completed
+          only-labels: "stat:awaiting response from contributor"
+          stale-issue-message: >
+            This issue is stale because it has been open for 14 days with no activity.
+            It will be closed if no further activity occurs. Thank you.
+          close-issue-message: >
+            This issue was closed because it has been inactive for 28 days.
+            Please reopen if you'd like to work on this further.
+          days-before-pr-stale: 14
+          days-before-pr-close: 14
+          stale-pr-message: "This PR is stale because it has been open for 14 days with no activity. It will be closed if no further activity occurs. Thank you."
+          close-pr-message: "This PR was closed because it has been inactive for 28 days. Please reopen if you'd like to work on this further."
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Contribution issues
+        uses: actions/stale@v5
+        with:
+          days-before-issue-stale: 180
+          days-before-issue-close: 365
+          stale-issue-label: "stale"
+          # Reason for closing the issue; the default value is not_planned
+          close-issue-reason: not_planned
+          any-of-labels: "stat:contributions welcome,good first issue"
+          stale-issue-message: >
+            This issue is stale because it has been open for 180 days with no activity.
+            It will be closed if no further activity occurs. Thank you.
+          close-issue-message: >
+            This issue was closed because it has been inactive for more than 1 year.
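+          # GITHUB_TOKEN is supplied automatically by the Actions runtime; no manually-created secret is needed.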
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index d23c516b846e..000000000000
--- a/.pylintrc
+++ /dev/null
@@ -1,38 +0,0 @@
-[MESSAGES CONTROL]
-
-disable=
-    abstract-method,
-    access-member-before-definition,
-    arguments-differ,
-    attribute-defined-outside-init,
-    bad-continuation,
-    bad-option-value,
-    bad-whitespace,
-    c-extension-no-member,
-    design,
-    file-ignored,
-    fixme,
-    global-statement,
-    import-error,
-    import-outside-toplevel,
-    import-self,
-    interface-is-not-class,
-    invalid-metaclass,
-    invalid-name,
-    locally-disabled,
-    locally-enabled,
-    maybe-no-member,
-    method-hidden,
-    misplaced-comparison-constant,
-    missing-interface-method,
-    multiple-imports,
-    multiple-statements,
-    no-else-break,
-    no-else-continue,
-    no-else-raise,
-    no-else-return,
-    no-init,
-    no-member,
-    no-name-in-module,
-    no-self-use,
-    pointless-except,
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 000000000000..4c3bb7528b99
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,27 @@
+{
+    "python.linting.flake8Enabled": true,
+    "python.linting.pylintEnabled": false,
+    "python.linting.enabled": true,
+    "editor.rulers": [
+        80
+    ],
+    "editor.formatOnSave": true,
+    "python.formatting.provider": "black",
+    "python.formatting.blackArgs": [
+        "--line-length",
+        "80"
+    ],
+    "python.sortImports.args": [
+        "--profile",
+        "black",
+        "--sl"
+    ],
+    "[python]": {
+        "editor.codeActionsOnSave": {
+            "source.organizeImports": true
+        }
+    },
+    "python.analysis.diagnosticSeverityOverrides": {
+        "reportMissingImports": "none"
+    }
+}
diff --git a/BUILD b/BUILD
index 37d69b2d69be..73742ab2ae12 100644
--- a/BUILD
+++ b/BUILD
@@ -106,3 +106,13 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [],
 )
+
+# Note that this dependency is for testing only.
+py_library(
+    name = "expect_tensorflow_io_installed",
+    # This is a dummy rule used as a tensorflow_io dependency in open-source.
+    # We expect tensorflow_io to already be installed on the system, e.g. via
+    # `pip install tensorflow-io`
+    visibility = ["//visibility:public"],
+    deps = [],
+)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0e314a4e256d..7dc9fe96eeb3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -14,7 +14,7 @@ to open a PR without discussion.
 ### Step 2. Make code changes
 
 To make code changes, you need to fork the repository. You will need to setup a
-development environment and run the unit tests. This is covered in section
+development environment and run the unit tests. This is covered in the section
 "Setup environment".
 
 ### Step 3. Create a pull request
@@ -39,7 +39,7 @@ add a `kokoro:force-run` label to trigger the continuous integration tests.
 
 ![CI tests tag](https://i.imgur.com/58NOCB0.png)
 
-If the tests fail, look into the error messages and try to fix it.
+If the tests fail, look into the error messages and try to fix them.
 
 ![CI tests](https://i.imgur.com/vVY0dZD.png)
 
@@ -63,7 +63,7 @@ for your reference.
 
 To setup the development environment, We provide two options. One is to use our
 Dockerfile, which builds into a container the required dev tools. Another one is
-to setup a local environment by install the dev tools needed.
+to setup a local environment by installing the dev tools needed.
 
 ### Option 1: Use a Docker container
 
@@ -99,7 +99,7 @@ You may modify the Dockerfile to your specific needs, like installing your own
 dev tools. You may also mount more volumes with the `-v` option, like your SSH
 credentials.
 
-Many popular editors today support developing in a container. Here is list of
+Many popular editors today support developing in a container. Here is the list of
 [supported editors](https://discuss.tensorflow.org/t/setup-your-favorite-editor-to-develop-keras)
 with setup instructions.
 
@@ -113,7 +113,7 @@ To setup your local dev environment, you will need the following tools.
 2. [git](https://github.com/) for code repository management.
 3. [python](https://www.python.org/) to build and code in Keras.
 
-The following commands checks the tools above are successfully installed. Note
+The following commands check the tools above are successfully installed. Note
 that Keras requires at least Python 3.7 to run.
 
 ```shell
 git --version
 python --version
 ```
 
 A [Python virtual environment](https://docs.python.org/3/tutorial/venv.html)
 (venv) is a powerful tool to create a self-contained environment that isolates
 any change from the system level config. It is highly recommended to avoid any
-unexpected dependency or version issue.
+unexpected dependency or version issues.
 
 With the following commands, you create a new venv, named `venv_dir`.
 
@@ -139,14 +139,14 @@ tests with the venv activated.
 You need to activate the venv every time you open a new shell.
 
 ```shell
-source venv_dir/bin/activate  # for linux or MacOS
+source venv_dir/bin/activate  # for Linux or MacOS
 venv_dir\Scripts\activate.bat  # for Windows
 ```
 
 Clone your forked repo to your local machine. Go to the cloned directory to
 install the dependencies into the venv. Since `tf-nightly` uses `keras-nightly`
 as a dependency, we need to uninstall `keras-nightly` so that tests will run
-against Keras code in local workspace.
+against Keras code in the local workspace.
 
 ```shell
 git clone https://github.com/YOUR_GITHUB_USERNAME/keras.git
@@ -165,18 +165,54 @@ pip install --upgrade tf-nightly
 
 ## Code style
 
-The Keras codebase uses the PEP 8 Python style conventions -- with the
-exception that it uses 2 spaces for indentation instead of 4.
-To check code style, please run the `pylint` command from the repo's
-root directory so that the configuration in
-`.pylintrc` is taken into account.
+Keras uses [Black](https://black.readthedocs.io/en/stable/) and
+[isort](https://pycqa.github.io/isort/) to format the code. Please refer to
+[requirements.txt](https://github.com/keras-team/keras/blob/master/requirements.txt)
+for the required versions. Run the following command **at the root directory of
+the repo** to format your code.
 
-```shell
-pylint path/to/changed_file.py
 ```
+sh shell/format.sh
+```
+
+It will also display the errors that cannot be resolved by autoformatting. You
+need to follow the output of the command to resolve them manually.
+
+If you do not want to auto-format the code but only show the lint errors, you
+can run `sh shell/lint.sh` **at the root directory of the repo**.
+
+### Docstrings
+
+We do not have an automated way to check docstring style, so if you write
+or edit any docstring, please make sure to check them manually.
+Keras docstrings follow the conventions below:
+
+A **class docstring** may contain the following items:
+
+* A one-line description of the class.
+* Paragraph(s) of more detailed information.
+* Optional `Examples` section.
+* `Args` section for arguments in `__init__()`.
+* If it's a layer:
+    * `Call arguments` section for arguments in `Layer.call()`.
+    * `Returns` section for the return values of `Layer.call()`.
+    * Optional `Raises` section for possible errors.
+
+You can check out `MultiHeadAttention` as an example
+[(link)](https://github.com/keras-team/keras/blob/v2.12.0-rc1/keras/layers/attention/multi_head_attention.py#L131).
+
+A **function docstring** may contain the following items:
+
+* One-line description of the function.
+* Paragraph(s) of more detailed information.
+* Optional `Examples` section.
+* `Args` section for the function arguments.
+* `Returns` section for the return values.
+* Optional `Raises` section for possible errors.
+
+You can check out `text_dataset_from_directory` as an example
+[(link)](https://github.com/keras-team/keras/blob/v2.12.0-rc1/keras/utils/text_dataset.py#L31).
 
-Please ignore the errors in the rest of the codebase and only fix the ones
-relevant to your changes.
 
 ## Run tests
 
@@ -198,7 +234,7 @@ defining the test. `base_layer_test` is the test target name defined with
 ### Run a single test case
 
 To run a single test, you can use `--test_filter=`
-to use regular expression to match the test you want to run. For example, you
+to use a regular expression to match the test you want to run. For example, you
 can use the following command to run all the tests in `activations_test.py`,
 whose names contain `test_serialization`.
 
@@ -212,7 +248,7 @@ You can run all the tests locally by running the following command in the repo
 root directory.
 
 ```
-bazel test --test_timeout 300,450,1200,3600 --test_output=errors --keep_going --define=use_fast_cpp_protos=false --build_tests_only --build_tag_filters=-no_oss --test_tag_filters=-no_oss keras/...
+bazel test --test_timeout 300,450,1200,3600 --test_output=errors --keep_going --define=use_fast_cpp_protos=false --build_tests_only --build_tag_filters=-no_oss,-oss_excluded --test_tag_filters=-no_oss,-oss_excluded keras/...
 ```
 
 ### Useful configs
 
@@ -258,7 +294,7 @@ mind.
 - You should add any new applications to the unit tests defined in
   `applications_test.py` and `applications_load_weight_test.py`.
 - For backwards compatibility, all applications should provide a
-  `preprocess_input()` function. For new applciations, you should leave the
+  `preprocess_input()` function. For new applications, you should leave the
   function empty (pass through inputs unaltered), and write the model so it can
   handle raw inputs directly. Adding
   [preprocessing layers](https://keras.io/guides/preprocessing_layers/) to the
@@ -272,4 +308,9 @@ mind.
 - As every PR requires several CPU/GPU hours of CI testing, we discourage
   submitting PRs to fix one typo, one warning,etc. We recommend fixing the
   same issue at the file level at least (e.g.: fix all typos in a file, fix
-  all compiler warning in a file, etc.)
+  all compiler warnings in a file, etc.)
+
+## Security vulnerability reports
+
+Since Keras is the high-level API of TensorFlow 2, Keras follows the same security practices as TensorFlow.
+For details and guidelines on vulnerabilities and how to report them, refer to [Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
diff --git a/README.md b/README.md
index 37675e0a4c9c..dfbe1608883d 100644
--- a/README.md
+++ b/README.md
@@ -9,20 +9,25 @@ Read the documentation at [keras.io](https://keras.io/).
 
 Keras is a deep learning API written in Python,
 running on top of the machine learning platform [TensorFlow](https://github.com/tensorflow/tensorflow).
-It was developed with a focus on enabling fast experimentation.
-*Being able to go from idea to result as fast as possible is key to doing good research.*
+It was developed with a focus on enabling fast experimentation and
+providing a delightful developer experience.
+
+**The purpose of Keras is to give an *unfair advantage* to any developer looking to ship ML-powered apps.**
 
 Keras is:
 
 - **Simple** -- but not simplistic. Keras reduces developer *cognitive load* to
   free you to focus on the parts of the problem that really matter.
+  Keras focuses on ease of use, debugging speed, code elegance & conciseness,
+  maintainability, and deployability (via TFServing, TFLite, TF.js).
 - **Flexible** -- Keras adopts the principle of *progressive disclosure of complexity*:
   simple workflows should be quick and easy, while arbitrarily advanced workflows
   should be *possible* via a clear path that builds upon what you've already learned.
 - **Powerful** -- Keras provides industry-strength performance and scalability:
   it is used by organizations and companies including NASA,
-  YouTube, and Waymo.
+  YouTube, and Waymo. That's right -- your YouTube recommendations are
+  powered by Keras, and so is the world's most advanced driverless vehicle.
 
 ---
 
@@ -52,9 +57,9 @@ and you can export your Keras models to run in the browser or on a mobile device
 ## First contact with Keras
 
 The core data structures of Keras are __layers__ and __models__.
-The simplest type of model is the [`Sequential` model](/guides/sequential_model/), a linear stack of layers.
-For more complex architectures, you should use the [Keras functional API](/guides/functional_api/),
-which allows to build arbitrary graphs of layers, or [write models entirely from scratch via subclasssing](/guides/making_new_layers_and_models_via_subclassing/).
+The simplest type of model is the [`Sequential` model](https://keras.io/guides/sequential_model/), a linear stack of layers.
+For more complex architectures, you should use the [Keras functional API](https://keras.io/guides/functional_api/),
+which allows you to build arbitrary graphs of layers or [write models entirely from scratch via subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/).
 
 Here is the `Sequential` model:
 
@@ -82,7 +87,7 @@ model.compile(loss='categorical_crossentropy',
 ```
 
 If you need to, you can further configure your optimizer. The Keras philosophy is to keep simple things simple,
-while allowing the user to be fully in control when they need to (the ultimate control being the easy extensibility of the source code via subclassing).
+while allowing the user to be fully in control when they need to be (the ultimate control being the easy extensibility of the source code via subclassing).
 
 ```python
 model.compile(loss=tf.keras.losses.categorical_crossentropy,
@@ -116,7 +121,7 @@ Keras follows the principle of **progressive disclosure of complexity**: it make
 yet it makes it possible to handle arbitrarily advanced use cases,
 only requiring incremental learning at each step.
 
-In much the same way that you were able to train & evaluate a simple neural network above in a few lines,
+In pretty much the same way that you were able to train & evaluate a simple neural network above in a few lines,
 you can use Keras to quickly develop new training procedures or exotic model architectures.
 Here's a low-level training loop example, combining Keras functionality with the TensorFlow `GradientTape`:
 
@@ -156,6 +161,11 @@ For more in-depth tutorials about Keras, you can check out:
 
 Keras comes packaged with TensorFlow 2 as `tensorflow.keras`.
 To start using Keras, simply [install TensorFlow 2](https://www.tensorflow.org/install).
+You can then import Keras as follows:
+
+```python
+from tensorflow import keras
+```
 
 ---
 
@@ -174,26 +184,17 @@ version maps to a specific stable version of TensorFlow. The table below shows
 the compatibility version mapping between TensorFlow versions and Keras
 versions.
 
-All the release branches can be found on [Github](https://github.com/keras-team/keras/releases).
+All the release branches can be found on [GitHub](https://github.com/keras-team/keras/releases).
 All the release binaries can be found on [Pypi](https://pypi.org/project/keras/#history).
 
-| Keras release | Note | Compatible Tensorflow version |
-| ----------- | ----------- | ----------- |
-| [2.4](https://github.com/keras-team/keras/releases/tag/2.4.0) | Last stable release of multi-backend Keras | < 2.5
-| 2.5-pre| Pre-release (not formal) for standalone Keras repo | >= 2.5 < 2.6
-| [2.6](https://github.com/keras-team/keras/releases/tag/v2.6.0) | First formal release of standalone Keras. | >= 2.6 < 2.7
-| [2.7](https://github.com/keras-team/keras/releases/tag/v2.7.0-rc0) | (Upcoming release) | >= 2.7 < 2.8
-| nightly| | tf-nightly
-
 ---
 
 ## Support
 
 You can ask questions and join the development discussion:
 
 - In the [TensorFlow forum](https://discuss.tensorflow.org/).
-- On the [Keras Google group](https://groups.google.com/forum/#!forum/keras-users).
-- On the [Keras Slack channel](https://kerasteam.slack.com). Use [this link](https://keras-slack-autojoin.herokuapp.com/) to request an invitation to the channel.
+- On the [Keras mailing list](https://groups.google.com/forum/#!forum/keras-users).
 
 ---
 
diff --git a/WORKSPACE b/WORKSPACE
index 898b5b6dffce..c0ebc4e52ac5 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -6,8 +6,11 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 http_archive(
     name = "bazel_skylib",
-    url = "https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.1/bazel-skylib-1.0.1.tar.gz",
-    sha256 = "f1c8360c01fcf276778d3519394805dc2a71a64274a3a0908bc9edff7b5aebc8",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+    ],
+    sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506",
 )
 load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
 bazel_skylib_workspace()
@@ -16,12 +19,9 @@ bazel_skylib_workspace()
 http_archive(
     name = "six_archive",
     build_file = "//third_party:six.BUILD",
-    sha256 = "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73",
-    strip_prefix = "six-1.12.0",
-    urls = [
-        "http://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.12.0.tar.gz",
-        "https://pypi.python.org/packages/source/s/six/six-1.12.0.tar.gz",  # 2018-12-10
-    ],
+    sha256 = "1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+    strip_prefix = "six-1.16.0",
+    urls = ["https://pypi.python.org/packages/source/s/six/six-1.16.0.tar.gz"],
 )
 
 bind(
@@ -31,18 +31,21 @@ bind(
 
 http_archive(
     name = "com_google_protobuf",
-    sha256 = "1fbf1c2962af287607232b2eddeaec9b4f4a7a6f5934e1a9276e9af76952f7e0",
-    strip_prefix = "protobuf-3.9.2",
-    urls = ["https://github.com/protocolbuffers/protobuf/archive/v3.9.2.tar.gz"],
+    sha256 = "f66073dee0bc159157b0bd7f502d7d1ee0bc76b3c1eac9836927511bdc4b3fc1",
+    strip_prefix = "protobuf-3.21.9",
+    urls = ["https://github.com/protocolbuffers/protobuf/archive/v3.21.9.zip"],
 )
 
 # ZLIB. Need by com_google_protobuf.
 http_archive(
     name = "zlib",
     build_file = "@com_google_protobuf//:third_party/zlib.BUILD",
-    sha256 = "91844808532e5ce316b3c010929493c0244f3d37593afd6de04f71821d5136d9",
-    strip_prefix = "zlib-1.2.12",
-    urls = ["https://zlib.net/zlib-1.2.12.tar.gz"],
+    sha256 = "b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30",
+    strip_prefix = "zlib-1.2.13",
+    urls = [
+        "https://storage.googleapis.com/mirror.tensorflow.org/zlib.net/zlib-1.2.13.tar.gz",
+        "https://zlib.net/zlib-1.2.13.tar.gz",
+    ],
 )
diff --git a/keras/BUILD b/keras/BUILD
index 6d94758b2b6c..d31fcbc2b0e3 100644
--- a/keras/BUILD
+++ b/keras/BUILD
@@ -1,16 +1,16 @@
 # Description:
 #   Contains the Keras API (internal TensorFlow version).
 
+# Placeholder: load unaliased py_library
 load("@org_keras//keras:keras.bzl", "tf_py_test")
 
+# copybara:uncomment_begin(google-only)
+# load("//tools/build_defs/license:license.bzl", "license")
+# copybara:uncomment_end
+
 package(
-    default_visibility = [
-        ":friends",
-        "//third_party/py/tensorflow:__subpackages__",
-        "//third_party/tensorflow/python/feature_column:__subpackages__",  # For unit test
-        "//third_party/tensorflow/python/tpu:__subpackages__",  # For unit test
-        "//third_party/tensorflow_estimator:__subpackages__",
-    ],
+    # copybara:uncomment default_applicable_licenses = [":license"],
+    default_visibility = [":friends"],
     licenses = ["notice"],
 )
 
@@ -48,7 +48,6 @@ py_library(
         "//keras/applications",
         "//keras/datasets",
         "//keras/distribute",
-        "//keras/dtensor:optimizers",
         "//keras/estimator",
         "//keras/feature_column",
         "//keras/layers",
@@ -64,7 +63,6 @@ py_library(
         "//keras/testing_infra:keras_doctest_lib",
         "//keras/testing_infra:test_utils",  # For keras.__internal__ API
         "//keras/utils",
-        "//keras/wrappers",
     ],
 )
 
@@ -183,7 +181,7 @@ py_library(
     deps = [
         ":backend",
         "//:expect_tensorflow_installed",
-        "//keras/saving/experimental",
+        "//keras/saving:saving_lib",
         "//keras/utils:engine_utils",
         "//keras/utils:generic_utils",
         "//keras/utils:tf_utils",
@@ -213,6 +211,20 @@ py_library(
 # )
 # copybara:uncomment_end
 
+# Some tf.distribute related feature requires detecting platform.
+# Internally we'd like to recognize Borg, which is not needed in OSS.
+# copybara:uncomment_begin(google-only)
+# py_library(
+#     name = "distribute_utils",
+#     srcs = ["google/distribute_utils.py"],
+#     deps = [
+#         "//:expect_six_installed",
+#         "//:expect_tensorflow_installed",
+#         "//third_party/py/requests",
+#     ],
+# )
+# copybara:uncomment_end
+
 tf_py_test(
     name = "activations_test",
     size = "small",
@@ -264,6 +276,7 @@ tf_py_test(
     size = "small",
     srcs = ["losses_test.py"],
     python_version = "PY3",
+    shard_count = 4,
     tags = [
         "noasan",  # b/186128525
     ],
@@ -284,6 +297,7 @@ tf_py_test(
     python_version = "PY3",
     shard_count = 6,
     tags = [
+        "no_pip",  # TODO(b/276923757)
        "no_tfrt",  # TODO(b/179690526)
        "notsan",
     ],
@@ -301,7 +315,10 @@ tf_py_test(
     size = "medium",
     srcs = ["callbacks_v1_test.py"],
     python_version = "PY3",
-    tags = ["notsan"],
+    tags = [
+        "nomac",  # Using profiler causes segfault in MacOS runs.
+ "notsan", + ], deps = [ ":callbacks", ":callbacks_v1", @@ -357,4 +374,21 @@ tf_py_test( # "//testing/pymocks:matchers", # ], # ) +# +# tf_py_test( +# name = "distribute_utils_test", +# srcs = ["google/distribute_utils_test.py"], +# python_version = "PY3", +# deps = [ +# ":distribute_utils", +# "//:expect_tensorflow_installed", +# "//keras/distribute", +# "//testing/pymocks:matchers", +# ], +# ) +# +# license( +# name = "license", +# package_name = "keras", +# ) # copybara:uncomment_end diff --git a/keras/__init__.py b/keras/__init__.py index 9dbe10b3e4f0..f4a25e8f3447 100644 --- a/keras/__init__.py +++ b/keras/__init__.py @@ -17,18 +17,17 @@ Detailed documentation and user guides are available at [keras.io](https://keras.io). """ -# pylint: disable=unused-import -from tensorflow.python import tf2 from keras import distribute - from keras import models - from keras.engine.input_layer import Input from keras.engine.sequential import Sequential from keras.engine.training import Model +# isort: off + +from tensorflow.python import tf2 from tensorflow.python.util.tf_export import keras_export -__version__ = '2.10.0' +__version__ = "2.15.0" -keras_export('keras.__version__').export_constant(__name__, '__version__') +keras_export("keras.__version__").export_constant(__name__, "__version__") diff --git a/keras/activations.py b/keras/activations.py index 7499adea7df8..776f8e0322ab 100644 --- a/keras/activations.py +++ b/keras/activations.py @@ -15,12 +15,19 @@ """Built-in activation functions.""" import sys +import types import tensorflow.compat.v2 as tf -from keras import backend import keras.layers.activation as activation_layers +from keras import backend +from keras.saving import object_registration +from keras.saving import serialization_lib +from keras.saving.legacy import serialization as legacy_serialization +from keras.saving.legacy.saved_model import utils as saved_model_utils from keras.utils import generic_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export # b/123041942 @@ -32,484 +39,546 @@ # This dict maps the activation function name from its v2 version to its # canonical name. _TF_ACTIVATIONS_V2 = { - 'softmax_v2': 'softmax', + "softmax_v2": "softmax", } -@keras_export('keras.activations.softmax') +@keras_export("keras.activations.softmax") @tf.__internal__.dispatch.add_dispatch_support def softmax(x, axis=-1): - """Softmax converts a vector of values to a probability distribution. + """Softmax converts a vector of values to a probability distribution. - The elements of the output vector are in range (0, 1) and sum to 1. + The elements of the output vector are in range (0, 1) and sum to 1. - Each vector is handled independently. The `axis` argument sets which axis - of the input the function is applied along. + Each vector is handled independently. The `axis` argument sets which axis + of the input the function is applied along. - Softmax is often used as the activation for the last - layer of a classification network because the result could be interpreted as - a probability distribution. + Softmax is often used as the activation for the last + layer of a classification network because the result could be interpreted as + a probability distribution. - The softmax of each vector x is computed as - `exp(x) / tf.reduce_sum(exp(x))`. + The softmax of each vector x is computed as + `exp(x) / tf.reduce_sum(exp(x))`. - The input values in are the log-odds of the resulting probability. + The input values in are the log-odds of the resulting probability. 
 
-  Args:
-    x : Input tensor.
-    axis: Integer, axis along which the softmax normalization is applied.
+    Args:
+        x : Input tensor.
+        axis: Integer, axis along which the softmax normalization is applied.
 
-  Returns:
-    Tensor, output of softmax transformation (all values are non-negative
-      and sum to 1).
+    Returns:
+        Tensor, output of softmax transformation (all values are non-negative
+        and sum to 1).
 
-  Examples:
+    Examples:
 
-  **Example 1: standalone usage**
+    **Example 1: standalone usage**
 
-  >>> inputs = tf.random.normal(shape=(32, 10))
-  >>> outputs = tf.keras.activations.softmax(inputs)
-  >>> tf.reduce_sum(outputs[0, :])  # Each sample in the batch now sums to 1
-  <tf.Tensor: shape=(), dtype=float32, numpy=1.0>
+    >>> inputs = tf.random.normal(shape=(32, 10))
+    >>> outputs = tf.keras.activations.softmax(inputs)
+    >>> tf.reduce_sum(outputs[0, :])  # Each sample in the batch now sums to 1
+    <tf.Tensor: shape=(), dtype=float32, numpy=1.0>
 
-  **Example 2: usage in a `Dense` layer**
+    **Example 2: usage in a `Dense` layer**
 
-  >>> layer = tf.keras.layers.Dense(32, activation=tf.keras.activations.softmax)
-  """
-  if x.shape.rank > 1:
-    if isinstance(axis, int):
-      output = tf.nn.softmax(x, axis=axis)
-    else:
-      # nn.softmax does not support tuple axis.
-      e = tf.exp(x - tf.reduce_max(x, axis=axis, keepdims=True))
-      s = tf.reduce_sum(e, axis=axis, keepdims=True)
-      output = e / s
-  else:
-    raise ValueError('Cannot apply softmax to a tensor that is 1D. '
-                     f'Received input: {x}')
+    >>> layer = tf.keras.layers.Dense(32,
+    ...                               activation=tf.keras.activations.softmax)
+    """
+    return backend.softmax(x, axis)
 
-  # Cache the logits to use for crossentropy loss.
-  output._keras_logits = x  # pylint: disable=protected-access
-  return output
 
-
-@keras_export('keras.activations.elu')
+@keras_export("keras.activations.elu")
 @tf.__internal__.dispatch.add_dispatch_support
 def elu(x, alpha=1.0):
-  """Exponential Linear Unit.
-
-  The exponential linear unit (ELU) with `alpha > 0` is:
-  `x` if `x > 0` and
-  `alpha * (exp(x) - 1)` if `x < 0`
-  The ELU hyperparameter `alpha` controls the value to which an
-  ELU saturates for negative net inputs. ELUs diminish the
-  vanishing gradient effect.
-
-  ELUs have negative values which pushes the mean of the activations
-  closer to zero.
-  Mean activations that are closer to zero enable faster learning as they
-  bring the gradient closer to the natural gradient.
-  ELUs saturate to a negative value when the argument gets smaller.
-  Saturation means a small derivative which decreases the variation
-  and the information that is propagated to the next layer.
-
-  Example Usage:
-
-  >>> import tensorflow as tf
-  >>> model = tf.keras.Sequential()
-  >>> model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='elu',
-  ...          input_shape=(28, 28, 1)))
-  >>> model.add(tf.keras.layers.MaxPooling2D((2, 2)))
-  >>> model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='elu'))
-  >>> model.add(tf.keras.layers.MaxPooling2D((2, 2)))
-  >>> model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='elu'))
-
-
-
-  Args:
-      x: Input tensor.
-      alpha: A scalar, slope of negative section. `alpha` controls the value to
-        which an ELU saturates for negative net inputs.
-
-  Returns:
-      The exponential linear unit (ELU) activation function: `x` if `x > 0` and
-      `alpha * (exp(x) - 1)` if `x < 0`.
-
-
-  Reference:
-      [Fast and Accurate Deep Network Learning by Exponential Linear Units
-      (ELUs) (Clevert et al, 2016)](https://arxiv.org/abs/1511.07289)
-  """
-  return backend.elu(x, alpha)
-
-
-@keras_export('keras.activations.selu')
+    """Exponential Linear Unit.
+
+    The exponential linear unit (ELU) with `alpha > 0` is:
+    `x` if `x > 0` and
+    `alpha * (exp(x) - 1)` if `x < 0`
+    The ELU hyperparameter `alpha` controls the value to which an
+    ELU saturates for negative net inputs. ELUs diminish the
+    vanishing gradient effect.
+
+    ELUs have negative values which pushes the mean of the activations
+    closer to zero.
+    Mean activations that are closer to zero enable faster learning as they
+    bring the gradient closer to the natural gradient.
+    ELUs saturate to a negative value when the argument gets smaller.
+    Saturation means a small derivative which decreases the variation
+    and the information that is propagated to the next layer.
+
+    Example Usage:
+
+    >>> import tensorflow as tf
+    >>> model = tf.keras.Sequential()
+    >>> model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='elu',
+    ...          input_shape=(28, 28, 1)))
+    >>> model.add(tf.keras.layers.MaxPooling2D((2, 2)))
+    >>> model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='elu'))
+    >>> model.add(tf.keras.layers.MaxPooling2D((2, 2)))
+    >>> model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='elu'))
+
+
+
+    Args:
+        x: Input tensor.
+        alpha: A scalar, slope of negative section. `alpha` controls the value
+            to which an ELU saturates for negative net inputs.
+
+    Returns:
+        The exponential linear unit (ELU) activation function: `x` if `x > 0`
+        and `alpha * (exp(x) - 1)` if `x < 0`.
+
+
+    Reference:
+        - [Fast and Accurate Deep Network Learning by Exponential Linear Units
+          (ELUs) (Clevert et al, 2016)](https://arxiv.org/abs/1511.07289)
+    """
+    return backend.elu(x, alpha)
+
+
+@keras_export("keras.activations.selu")
 @tf.__internal__.dispatch.add_dispatch_support
 def selu(x):
-  """Scaled Exponential Linear Unit (SELU).
+    """Scaled Exponential Linear Unit (SELU).
 
-  The Scaled Exponential Linear Unit (SELU) activation function is defined as:
+    The Scaled Exponential Linear Unit (SELU) activation function is defined as:
 
-  - `if x > 0: return scale * x`
-  - `if x < 0: return scale * alpha * (exp(x) - 1)`
+    - `if x > 0: return scale * x`
+    - `if x < 0: return scale * alpha * (exp(x) - 1)`
 
-  where `alpha` and `scale` are pre-defined constants
-  (`alpha=1.67326324` and `scale=1.05070098`).
+    where `alpha` and `scale` are pre-defined constants
+    (`alpha=1.67326324` and `scale=1.05070098`).
 
-  Basically, the SELU activation function multiplies `scale` (> 1) with the
-  output of the `tf.keras.activations.elu` function to ensure a slope larger
-  than one for positive inputs.
+    Basically, the SELU activation function multiplies `scale` (> 1) with the
+    output of the `tf.keras.activations.elu` function to ensure a slope larger
+    than one for positive inputs.
 
-  The values of `alpha` and `scale` are
-  chosen so that the mean and variance of the inputs are preserved
-  between two consecutive layers as long as the weights are initialized
-  correctly (see `tf.keras.initializers.LecunNormal` initializer)
-  and the number of input units is "large enough"
-  (see reference paper for more information).
+    The values of `alpha` and `scale` are
+    chosen so that the mean and variance of the inputs are preserved
+    between two consecutive layers as long as the weights are initialized
+    correctly (see `tf.keras.initializers.LecunNormal` initializer)
+    and the number of input units is "large enough"
+    (see reference paper for more information).
 
-  Example Usage:
+    Example Usage:
 
-  >>> num_classes = 10  # 10-class problem
-  >>> model = tf.keras.Sequential()
-  >>> model.add(tf.keras.layers.Dense(64, kernel_initializer='lecun_normal',
-  ...                                 activation='selu'))
-  >>> model.add(tf.keras.layers.Dense(32, kernel_initializer='lecun_normal',
-  ...                                 activation='selu'))
-  >>> model.add(tf.keras.layers.Dense(16, kernel_initializer='lecun_normal',
-  ...                                 activation='selu'))
-  >>> model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
+    >>> num_classes = 10  # 10-class problem
+    >>> model = tf.keras.Sequential()
+    >>> model.add(tf.keras.layers.Dense(64, kernel_initializer='lecun_normal',
+    ...                                 activation='selu'))
+    >>> model.add(tf.keras.layers.Dense(32, kernel_initializer='lecun_normal',
+    ...                                 activation='selu'))
+    >>> model.add(tf.keras.layers.Dense(16, kernel_initializer='lecun_normal',
+    ...                                 activation='selu'))
+    >>> model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
 
-  Args:
-    x: A tensor or variable to compute the activation function for.
+    Args:
+        x: A tensor or variable to compute the activation function for.
 
-  Returns:
-    The scaled exponential unit activation: `scale * elu(x, alpha)`.
+    Returns:
+        The scaled exponential unit activation: `scale * elu(x, alpha)`.
 
-  Notes:
-    - To be used together with the
-      `tf.keras.initializers.LecunNormal` initializer.
-    - To be used together with the dropout variant
-      `tf.keras.layers.AlphaDropout` (not regular dropout).
+    Notes:
+        - To be used together with the
+          `tf.keras.initializers.LecunNormal` initializer.
+        - To be used together with the dropout variant
+          `tf.keras.layers.AlphaDropout` (not regular dropout).
 
-  References:
-    - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515)
-  """
-  return tf.nn.selu(x)
+    References:
+        - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515)
+    """
+    return tf.nn.selu(x)
 
 
-@keras_export('keras.activations.softplus')
+@keras_export("keras.activations.softplus")
 @tf.__internal__.dispatch.add_dispatch_support
 def softplus(x):
-  """Softplus activation function, `softplus(x) = log(exp(x) + 1)`.
+    """Softplus activation function, `softplus(x) = log(exp(x) + 1)`.
 
-  Example Usage:
+    Example Usage:
 
-  >>> a = tf.constant([-20, -1.0, 0.0, 1.0, 20], dtype = tf.float32)
-  >>> b = tf.keras.activations.softplus(a)
-  >>> b.numpy()
-  array([2.0611537e-09, 3.1326166e-01, 6.9314718e-01, 1.3132616e+00,
-           2.0000000e+01], dtype=float32)
+    >>> a = tf.constant([-20, -1.0, 0.0, 1.0, 20], dtype = tf.float32)
+    >>> b = tf.keras.activations.softplus(a)
+    >>> b.numpy()
+    array([2.0611537e-09, 3.1326166e-01, 6.9314718e-01, 1.3132616e+00,
+             2.0000000e+01], dtype=float32)
 
-  Args:
-    x: Input tensor.
+    Args:
+        x: Input tensor.
 
-  Returns:
-    The softplus activation: `log(exp(x) + 1)`.
-  """
-  return tf.math.softplus(x)
+    Returns:
+        The softplus activation: `log(exp(x) + 1)`.
+    """
+    return tf.math.softplus(x)
 
 
-@keras_export('keras.activations.softsign')
+@keras_export("keras.activations.softsign")
 @tf.__internal__.dispatch.add_dispatch_support
 def softsign(x):
-  """Softsign activation function, `softsign(x) = x / (abs(x) + 1)`.
+    """Softsign activation function, `softsign(x) = x / (abs(x) + 1)`.
 
-  Example Usage:
+    Example Usage:
 
-  >>> a = tf.constant([-1.0, 0.0, 1.0], dtype = tf.float32)
-  >>> b = tf.keras.activations.softsign(a)
-  >>> b.numpy()
-  array([-0.5,  0. ,  0.5], dtype=float32)
+    >>> a = tf.constant([-1.0, 0.0, 1.0], dtype = tf.float32)
+    >>> b = tf.keras.activations.softsign(a)
+    >>> b.numpy()
+    array([-0.5,  0. ,  0.5], dtype=float32)
 
-  Args:
-    x: Input tensor.
+    Args:
+        x: Input tensor.
 
-  Returns:
-    The softsign activation: `x / (abs(x) + 1)`.
-  """
-  return tf.math.softsign(x)
+    Returns:
+        The softsign activation: `x / (abs(x) + 1)`.
+    """
+    return tf.math.softsign(x)
 
 
-@keras_export('keras.activations.swish')
+@keras_export("keras.activations.swish")
 @tf.__internal__.dispatch.add_dispatch_support
 def swish(x):
-  """Swish activation function, `swish(x) = x * sigmoid(x)`.
+    """Swish activation function, `swish(x) = x * sigmoid(x)`.
 
-  Swish activation function which returns `x*sigmoid(x)`.
-  It is a smooth, non-monotonic function that consistently matches
-  or outperforms ReLU on deep networks, it is unbounded above and
-  bounded below.
+    Swish activation function which returns `x*sigmoid(x)`.
+    It is a smooth, non-monotonic function that consistently matches
+    or outperforms ReLU on deep networks, it is unbounded above and
+    bounded below.
 
-  Example Usage:
+    Example Usage:
 
-  >>> a = tf.constant([-20, -1.0, 0.0, 1.0, 20], dtype = tf.float32)
-  >>> b = tf.keras.activations.swish(a)
-  >>> b.numpy()
-  array([-4.1223075e-08, -2.6894143e-01,  0.0000000e+00,  7.3105860e-01,
-          2.0000000e+01], dtype=float32)
+    >>> a = tf.constant([-20, -1.0, 0.0, 1.0, 20], dtype = tf.float32)
+    >>> b = tf.keras.activations.swish(a)
+    >>> b.numpy()
+    array([-4.1223075e-08, -2.6894143e-01,  0.0000000e+00,  7.3105860e-01,
+            2.0000000e+01], dtype=float32)
 
-  Args:
-    x: Input tensor.
+    Args:
+        x: Input tensor.
 
-  Returns:
-    The swish activation applied to `x` (see reference paper for details).
+    Returns:
+        The swish activation applied to `x` (see reference paper for details).
 
-  Reference:
-    - [Ramachandran et al., 2017](https://arxiv.org/abs/1710.05941)
-  """
-  return tf.nn.silu(x)
+    Reference:
+        - [Ramachandran et al., 2017](https://arxiv.org/abs/1710.05941)
+    """
+    return tf.nn.silu(x)
 
 
-@keras_export('keras.activations.relu')
+@keras_export("keras.activations.relu")
 @tf.__internal__.dispatch.add_dispatch_support
-def relu(x, alpha=0., max_value=None, threshold=0.):
-  """Applies the rectified linear unit activation function.
-
-  With default values, this returns the standard ReLU activation:
-  `max(x, 0)`, the element-wise maximum of 0 and the input tensor.
-
-  Modifying default parameters allows you to use non-zero thresholds,
-  change the max value of the activation,
-  and to use a non-zero multiple of the input for values below the threshold.
-
-  For example:
-
-  >>> foo = tf.constant([-10, -5, 0.0, 5, 10], dtype = tf.float32)
-  >>> tf.keras.activations.relu(foo).numpy()
-  array([ 0.,  0.,  0.,  5., 10.], dtype=float32)
-  >>> tf.keras.activations.relu(foo, alpha=0.5).numpy()
-  array([-5. , -2.5,  0. ,  5. , 10. ], dtype=float32)
-  >>> tf.keras.activations.relu(foo, max_value=5.).numpy()
-  array([0., 0., 0., 5., 5.], dtype=float32)
-  >>> tf.keras.activations.relu(foo, threshold=5.).numpy()
-  array([-0., -0.,  0.,  0., 10.], dtype=float32)
-
-  Args:
-      x: Input `tensor` or `variable`.
-      alpha: A `float` that governs the slope for values lower than the
-        threshold.
-      max_value: A `float` that sets the saturation threshold (the largest value
-        the function will return).
-      threshold: A `float` giving the threshold value of the activation function
-        below which values will be damped or set to zero.
-
-  Returns:
-      A `Tensor` representing the input tensor,
-      transformed by the relu activation function.
-      Tensor will be of the same shape and dtype of input `x`.
- """ - return backend.relu(x, alpha=alpha, max_value=max_value, threshold=threshold) - - -@keras_export('keras.activations.gelu', v1=[]) +def relu(x, alpha=0.0, max_value=None, threshold=0.0): + """Applies the rectified linear unit activation function. + + With default values, this returns the standard ReLU activation: + `max(x, 0)`, the element-wise maximum of 0 and the input tensor. + + Modifying default parameters allows you to use non-zero thresholds, + change the max value of the activation, + and to use a non-zero multiple of the input for values below the threshold. + + Example: + + >>> foo = tf.constant([-10, -5, 0.0, 5, 10], dtype = tf.float32) + >>> tf.keras.activations.relu(foo).numpy() + array([ 0., 0., 0., 5., 10.], dtype=float32) + >>> tf.keras.activations.relu(foo, alpha=0.5).numpy() + array([-5. , -2.5, 0. , 5. , 10. ], dtype=float32) + >>> tf.keras.activations.relu(foo, max_value=5.).numpy() + array([0., 0., 0., 5., 5.], dtype=float32) + >>> tf.keras.activations.relu(foo, threshold=5.).numpy() + array([-0., -0., 0., 0., 10.], dtype=float32) + + Args: + x: Input `tensor` or `variable`. + alpha: A `float` that governs the slope for values lower than the + threshold. + max_value: A `float` that sets the saturation threshold (the largest + value the function will return). + threshold: A `float` giving the threshold value of the activation + function below which values will be damped or set to zero. + + Returns: + A `Tensor` representing the input tensor, transformed by the relu + activation function. Tensor will be of the same shape and dtype of + input `x`. + """ + return backend.relu( + x, alpha=alpha, max_value=max_value, threshold=threshold + ) + + +@keras_export("keras.activations.gelu", v1=[]) @tf.__internal__.dispatch.add_dispatch_support def gelu(x, approximate=False): - """Applies the Gaussian error linear unit (GELU) activation function. - - Gaussian error linear unit (GELU) computes - `x * P(X <= x)`, where `P(X) ~ N(0, 1)`. - The (GELU) nonlinearity weights inputs by their value, rather than gates - inputs by their sign as in ReLU. - - For example: - - >>> x = tf.constant([-3.0, -1.0, 0.0, 1.0, 3.0], dtype=tf.float32) - >>> y = tf.keras.activations.gelu(x) - >>> y.numpy() - array([-0.00404951, -0.15865529, 0. , 0.8413447 , 2.9959507 ], - dtype=float32) - >>> y = tf.keras.activations.gelu(x, approximate=True) - >>> y.numpy() - array([-0.00363752, -0.15880796, 0. , 0.841192 , 2.9963627 ], - dtype=float32) - - Args: - x: Input tensor. - approximate: A `bool`, whether to enable approximation. - - Returns: - The gaussian error linear activation: - `0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))` - if `approximate` is `True` or - `x * P(X <= x) = 0.5 * x * (1 + erf(x / sqrt(2)))`, - where `P(X) ~ N(0, 1)`, - if `approximate` is `False`. - - Reference: - - [Gaussian Error Linear Units (GELUs)](https://arxiv.org/abs/1606.08415) - """ - return tf.nn.gelu(x, approximate) - - -@keras_export('keras.activations.tanh') + """Applies the Gaussian error linear unit (GELU) activation function. + + Gaussian error linear unit (GELU) computes + `x * P(X <= x)`, where `P(X) ~ N(0, 1)`. + The (GELU) nonlinearity weights inputs by their value, rather than gates + inputs by their sign as in ReLU. + + Example: + + >>> x = tf.constant([-3.0, -1.0, 0.0, 1.0, 3.0], dtype=tf.float32) + >>> y = tf.keras.activations.gelu(x) + >>> y.numpy() + array([-0.00404951, -0.15865529, 0. 
, 0.8413447 , 2.9959507 ], + dtype=float32) + >>> y = tf.keras.activations.gelu(x, approximate=True) + >>> y.numpy() + array([-0.00363752, -0.15880796, 0. , 0.841192 , 2.9963627 ], + dtype=float32) + + Args: + x: Input tensor. + approximate: A `bool`, whether to enable approximation. + + Returns: + The gaussian error linear activation: + `0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))` + if `approximate` is `True` or + `x * P(X <= x) = 0.5 * x * (1 + erf(x / sqrt(2)))`, + where `P(X) ~ N(0, 1)`, + if `approximate` is `False`. + + Reference: + - [Gaussian Error Linear Units (GELUs)](https://arxiv.org/abs/1606.08415) + """ # noqa: E501 + return tf.nn.gelu(x, approximate) + + +@keras_export("keras.activations.tanh") @tf.__internal__.dispatch.add_dispatch_support def tanh(x): - """Hyperbolic tangent activation function. + """Hyperbolic tangent activation function. - For example: + Example: - >>> a = tf.constant([-3.0,-1.0, 0.0,1.0,3.0], dtype = tf.float32) - >>> b = tf.keras.activations.tanh(a) - >>> b.numpy() - array([-0.9950547, -0.7615942, 0., 0.7615942, 0.9950547], dtype=float32) + >>> a = tf.constant([-3.0, -1.0, 0.0, 1.0, 3.0], dtype = tf.float32) + >>> b = tf.keras.activations.tanh(a) + >>> b.numpy() + array([-0.9950547, -0.7615942, 0., 0.7615942, 0.9950547], dtype=float32) - Args: - x: Input tensor. + Args: + x: Input tensor. - Returns: - Tensor of same shape and dtype of input `x`, with tanh activation: - `tanh(x) = sinh(x)/cosh(x) = ((exp(x) - exp(-x))/(exp(x) + exp(-x)))`. - """ - return tf.tanh(x) + Returns: + Tensor of same shape and dtype of input `x`, with tanh activation: + `tanh(x) = sinh(x)/cosh(x) = ((exp(x) - exp(-x))/(exp(x) + exp(-x)))`. + """ + return tf.tanh(x) -@keras_export('keras.activations.sigmoid') +@keras_export("keras.activations.sigmoid") @tf.__internal__.dispatch.add_dispatch_support def sigmoid(x): - """Sigmoid activation function, `sigmoid(x) = 1 / (1 + exp(-x))`. + """Sigmoid activation function, `sigmoid(x) = 1 / (1 + exp(-x))`. - Applies the sigmoid activation function. For small values (<-5), - `sigmoid` returns a value close to zero, and for large values (>5) - the result of the function gets close to 1. + Applies the sigmoid activation function. For small values (<-5), + `sigmoid` returns a value close to zero, and for large values (>5) + the result of the function gets close to 1. - Sigmoid is equivalent to a 2-element Softmax, where the second element is - assumed to be zero. The sigmoid function always returns a value between - 0 and 1. + Sigmoid is equivalent to a 2-element Softmax, where the second element is + assumed to be zero. The sigmoid function always returns a value between + 0 and 1. - For example: + Example: - >>> a = tf.constant([-20, -1.0, 0.0, 1.0, 20], dtype = tf.float32) - >>> b = tf.keras.activations.sigmoid(a) - >>> b.numpy() - array([2.0611537e-09, 2.6894143e-01, 5.0000000e-01, 7.3105860e-01, - 1.0000000e+00], dtype=float32) + >>> a = tf.constant([-20, -1.0, 0.0, 1.0, 20], dtype = tf.float32) + >>> b = tf.keras.activations.sigmoid(a) + >>> b.numpy() + array([2.0611537e-09, 2.6894143e-01, 5.0000000e-01, 7.3105860e-01, + 1.0000000e+00], dtype=float32) - Args: - x: Input tensor. + Args: + x: Input tensor. - Returns: - Tensor with the sigmoid activation: `1 / (1 + exp(-x))`. - """ - output = tf.sigmoid(x) - # Cache the logits to use for crossentropy loss. - output._keras_logits = x # pylint: disable=protected-access - return output + Returns: + Tensor with the sigmoid activation: `1 / (1 + exp(-x))`. 
+ """ + return backend.sigmoid(x) -@keras_export('keras.activations.exponential') +@keras_export("keras.activations.exponential") @tf.__internal__.dispatch.add_dispatch_support def exponential(x): - """Exponential activation function. + """Exponential activation function. - For example: + Example: - >>> a = tf.constant([-3.0,-1.0, 0.0,1.0,3.0], dtype = tf.float32) - >>> b = tf.keras.activations.exponential(a) - >>> b.numpy() - array([0.04978707, 0.36787945, 1., 2.7182817 , 20.085537], dtype=float32) + >>> a = tf.constant([-3.0, -1.0, 0.0, 1.0, 3.0], dtype = tf.float32) + >>> b = tf.keras.activations.exponential(a) + >>> b.numpy() + array([0.04978707, 0.36787945, 1., 2.7182817 , 20.085537], dtype=float32) - Args: - x: Input tensor. + Args: + x: Input tensor. - Returns: - Tensor with exponential activation: `exp(x)`. - """ - return tf.exp(x) + Returns: + Tensor with exponential activation: `exp(x)`. + """ + return tf.exp(x) -@keras_export('keras.activations.hard_sigmoid') +@keras_export("keras.activations.hard_sigmoid") @tf.__internal__.dispatch.add_dispatch_support def hard_sigmoid(x): - """Hard sigmoid activation function. + """Hard sigmoid activation function. - A faster approximation of the sigmoid activation. - Piecewise linear approximation of the sigmoid function. - Ref: 'https://en.wikipedia.org/wiki/Hard_sigmoid' + A faster approximation of the sigmoid activation. + Piecewise linear approximation of the sigmoid function. + Ref: 'https://en.wikipedia.org/wiki/Hard_sigmoid' - For example: + Example: - >>> a = tf.constant([-3.0,-1.0, 0.0,1.0,3.0], dtype = tf.float32) - >>> b = tf.keras.activations.hard_sigmoid(a) - >>> b.numpy() - array([0. , 0.3, 0.5, 0.7, 1. ], dtype=float32) + >>> a = tf.constant([-3.0, -1.0, 0.0, 1.0, 3.0], dtype = tf.float32) + >>> b = tf.keras.activations.hard_sigmoid(a) + >>> b.numpy() + array([0. , 0.3, 0.5, 0.7, 1. ], dtype=float32) - Args: - x: Input tensor. + Args: + x: Input tensor. - Returns: - The hard sigmoid activation, defined as: + Returns: + The hard sigmoid activation, defined as: - - `if x < -2.5: return 0` - - `if x > 2.5: return 1` - - `if -2.5 <= x <= 2.5: return 0.2 * x + 0.5` - """ - return backend.hard_sigmoid(x) + - `if x < -2.5: return 0` + - `if x > 2.5: return 1` + - `if -2.5 <= x <= 2.5: return 0.2 * x + 0.5` + """ + return backend.hard_sigmoid(x) -@keras_export('keras.activations.linear') +@keras_export("keras.activations.linear") @tf.__internal__.dispatch.add_dispatch_support def linear(x): - """Linear activation function (pass-through). + """Linear activation function (pass-through). - For example: + Example: - >>> a = tf.constant([-3.0,-1.0, 0.0,1.0,3.0], dtype = tf.float32) - >>> b = tf.keras.activations.linear(a) - >>> b.numpy() - array([-3., -1., 0., 1., 3.], dtype=float32) + >>> a = tf.constant([-3.0, -1.0, 0.0, 1.0, 3.0], dtype = tf.float32) + >>> b = tf.keras.activations.linear(a) + >>> b.numpy() + array([-3., -1., 0., 1., 3.], dtype=float32) - Args: - x: Input tensor. + Args: + x: Input tensor. - Returns: - The input, unmodified. - """ - return x + Returns: + The input, unmodified. + """ + return x -@keras_export('keras.activations.serialize') +@keras_export("keras.activations.mish") @tf.__internal__.dispatch.add_dispatch_support -def serialize(activation): - """Returns the string identifier of an activation function. +def mish(x): + """Mish activation function. + + It is defined as: - Args: - activation : Function object. 
+ ```python + def mish(x): + return x * tanh(softplus(x)) + ``` - Returns: - String denoting the name attribute of the input function + where `softplus` is defined as: - For example: + ```python + def softplus(x): + return log(exp(x) + 1) + ``` - >>> tf.keras.activations.serialize(tf.keras.activations.tanh) - 'tanh' - >>> tf.keras.activations.serialize(tf.keras.activations.sigmoid) - 'sigmoid' - >>> tf.keras.activations.serialize('abcd') - Traceback (most recent call last): - ... - ValueError: ('Cannot serialize', 'abcd') + Example: - Raises: - ValueError: The input function is not a valid one. - """ - if (hasattr(activation, '__name__') and - activation.__name__ in _TF_ACTIVATIONS_V2): - return _TF_ACTIVATIONS_V2[activation.__name__] - return generic_utils.serialize_keras_object(activation) + >>> a = tf.constant([-3.0, -1.0, 0.0, 1.0], dtype = tf.float32) + >>> b = tf.keras.activations.mish(a) + >>> b.numpy() + array([-0.14564745, -0.30340144, 0., 0.86509836], dtype=float32) + Args: + x: Input tensor. -# Add additional globals so that deserialize can find these common activation + Returns: + The mish activation. + + Reference: + - [Mish: A Self Regularized Non-Monotonic + Activation Function](https://arxiv.org/abs/1908.08681) + """ + return x * tf.math.tanh(tf.math.softplus(x)) + + +@keras_export("keras.activations.serialize") +@tf.__internal__.dispatch.add_dispatch_support +def serialize(activation, use_legacy_format=False): + """Returns the string identifier of an activation function. + + Args: + activation : Function object. + use_legacy_format: Boolean, whether to use the legacy format for + serialization. Defaults to False. + + Returns: + String denoting the name attribute of the input function + + Example: + + >>> tf.keras.activations.serialize(tf.keras.activations.tanh) + 'tanh' + >>> tf.keras.activations.serialize(tf.keras.activations.sigmoid) + 'sigmoid' + >>> tf.keras.activations.serialize('abcd') + Traceback (most recent call last): + ... + ValueError: Unknown activation function 'abcd' cannot be serialized. + + Raises: + ValueError: The input function is not a valid one. + """ + if ( + hasattr(activation, "__name__") + and activation.__name__ in _TF_ACTIVATIONS_V2 + ): + return _TF_ACTIVATIONS_V2[activation.__name__] + + if use_legacy_format: + return legacy_serialization.serialize_keras_object(activation) + + fn_config = serialization_lib.serialize_keras_object(activation) + if ( + not tf.__internal__.tf2.enabled() + or saved_model_utils.in_tf_saved_model_scope() + ): + return fn_config + if "config" not in fn_config: + raise ValueError( + f"Unknown activation function '{activation}' cannot be " + "serialized due to invalid function name. Make sure to use " + "an activation name that matches the references defined in " + "activations.py or use " + "`@keras.saving.register_keras_serializable()` " + "to register any custom activations. 
" + f"config={fn_config}" + ) + if not isinstance(activation, types.FunctionType): + # Case for additional custom activations represented by objects + return fn_config + if ( + isinstance(fn_config["config"], str) + and fn_config["config"] not in globals() + ): + # Case for custom activation functions from external activations modules + fn_config["config"] = object_registration.get_registered_name( + activation + ) + return fn_config + return fn_config["config"] + # Case for keras.activations builtins (simply return name) + + +# Add additional globals so that deserialize() can find these common activation # functions leaky_relu = tf.nn.leaky_relu log_softmax = tf.nn.log_softmax @@ -517,87 +586,111 @@ def serialize(activation): silu = tf.nn.silu -@keras_export('keras.activations.deserialize') +@keras_export("keras.activations.deserialize") @tf.__internal__.dispatch.add_dispatch_support -def deserialize(name, custom_objects=None): - """Returns activation function given a string identifier. - - Args: - name: The name of the activation function. - custom_objects: Optional `{function_name: function_obj}` - dictionary listing user-provided activation functions. - - Returns: - Corresponding activation function. - - For example: - - >>> tf.keras.activations.deserialize('linear') - - >>> tf.keras.activations.deserialize('sigmoid') - - >>> tf.keras.activations.deserialize('abcd') - Traceback (most recent call last): - ... - ValueError: Unknown activation function:abcd - - Raises: - ValueError: `Unknown activation function` if the input string does not - denote any defined Tensorflow activation function. - """ - activation_functions = {} - current_module = sys.modules[__name__] - - # we put 'current_module' after 'activation_layers' to prefer the local one - # if there is a collision - generic_utils.populate_dict_with_module_objects( - activation_functions, - (activation_layers, current_module), - obj_filter=callable) - - return generic_utils.deserialize_keras_object( - name, - module_objects=activation_functions, - custom_objects=custom_objects, - printable_module_name='activation function') - - -@keras_export('keras.activations.get') +def deserialize(name, custom_objects=None, use_legacy_format=False): + """Returns activation function given a string identifier. + + Args: + name: The name of the activation function. + custom_objects: Optional `{function_name: function_obj}` + dictionary listing user-provided activation functions. + use_legacy_format: Boolean, whether to use the legacy format for + deserialization. Defaults to False. + + Returns: + Corresponding activation function. + + Example: + + >>> tf.keras.activations.deserialize('linear') + + >>> tf.keras.activations.deserialize('sigmoid') + + >>> tf.keras.activations.deserialize('abcd') + Traceback (most recent call last): + ... + ValueError: Unknown activation function 'abcd' cannot be deserialized. + + Raises: + ValueError: `Unknown activation function` if the input string does not + denote any defined Tensorflow activation function. 
+ """ + activation_functions = {} + current_module = sys.modules[__name__] + + # we put 'current_module' after 'activation_layers' to prefer the local one + # if there is a collision + generic_utils.populate_dict_with_module_objects( + activation_functions, + (activation_layers, current_module), + obj_filter=callable, + ) + + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + name, + module_objects=activation_functions, + custom_objects=custom_objects, + printable_module_name="activation function", + ) + + returned_fn = serialization_lib.deserialize_keras_object( + name, + module_objects=activation_functions, + custom_objects=custom_objects, + printable_module_name="activation function", + ) + + if isinstance(returned_fn, str): + raise ValueError( + f"Unknown activation function '{name}' cannot be deserialized." + ) + + return returned_fn + + +@keras_export("keras.activations.get") @tf.__internal__.dispatch.add_dispatch_support def get(identifier): - """Returns function. - - Args: - identifier: Function or string - - Returns: - Function corresponding to the input string or input function. - - For example: - - >>> tf.keras.activations.get('softmax') - - >>> tf.keras.activations.get(tf.keras.activations.softmax) - - >>> tf.keras.activations.get(None) - - >>> tf.keras.activations.get(abs) - - >>> tf.keras.activations.get('abcd') - Traceback (most recent call last): - ... - ValueError: Unknown activation function:abcd - - Raises: - ValueError: Input is an unknown function or string, i.e., the input does - not denote any defined function. - """ - if identifier is None: - return linear - if isinstance(identifier, (str, dict)): - return deserialize(identifier) - elif callable(identifier): - return identifier - else: + """Returns function. + + Args: + identifier: Function or string + + Returns: + Function corresponding to the input string or input function. + + Example: + + >>> tf.keras.activations.get('softmax') + + >>> tf.keras.activations.get(tf.keras.activations.softmax) + + >>> tf.keras.activations.get(None) + + >>> tf.keras.activations.get(abs) + + >>> tf.keras.activations.get('abcd') + Traceback (most recent call last): + ... + ValueError: Unknown activation function:abcd + + Raises: + ValueError: Input is an unknown function or string, i.e., the input does + not denote any defined function. 
+ """ + if identifier is None: + return linear + if isinstance(identifier, (str, dict)): + use_legacy_format = ( + "module" not in identifier + if isinstance(identifier, dict) + else False + ) + return deserialize(identifier, use_legacy_format=use_legacy_format) + elif callable(identifier): + return identifier raise TypeError( - f'Could not interpret activation function identifier: {identifier}') + f"Could not interpret activation function identifier: {identifier}" + ) diff --git a/keras/activations_test.py b/keras/activations_test.py index 81b7e6fb702b..2222d1574ec3 100644 --- a/keras/activations_test.py +++ b/keras/activations_test.py @@ -14,249 +14,286 @@ # ============================================================================== """Tests for Keras activation functions.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np +import keras.layers.activation as activation_layers from keras import activations from keras import backend -from keras.testing_infra import test_combinations -import keras.layers.activation as activation_layers from keras.layers import core from keras.layers import serialization +from keras.testing_infra import test_combinations def _ref_softmax(values): - m = np.max(values) - e = np.exp(values - m) - return e / np.sum(e) + m = np.max(values) + e = np.exp(values - m) + return e / np.sum(e) -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class KerasActivationsTest(tf.test.TestCase, parameterized.TestCase): +def _ref_softplus(x): + return np.log(np.ones_like(x) + np.exp(x)) - def test_serialization(self): - all_activations = [ - 'softmax', 'relu', 'elu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear', - 'softplus', 'softsign', 'selu', 'gelu', 'relu6' - ] - for name in all_activations: - fn = activations.get(name) - ref_fn = getattr(activations, name) - assert fn == ref_fn - config = activations.serialize(fn) - fn = activations.deserialize(config) - assert fn == ref_fn - - def test_serialization_v2(self): - activation_map = {tf.math.softmax: 'softmax'} - for fn_v2_key in activation_map: - fn_v2 = activations.get(fn_v2_key) - config = activations.serialize(fn_v2) - fn = activations.deserialize(config) - assert fn.__name__ == activation_map[fn_v2_key] - - def test_serialization_with_layers(self): - activation = activation_layers.LeakyReLU(alpha=0.1) - layer = core.Dense(3, activation=activation) - config = serialization.serialize(layer) - # with custom objects - deserialized_layer = serialization.deserialize( - config, custom_objects={'LeakyReLU': activation}) - self.assertEqual(deserialized_layer.__class__.__name__, - layer.__class__.__name__) - self.assertEqual(deserialized_layer.activation.__class__.__name__, - activation.__class__.__name__) - # without custom objects - deserialized_layer = serialization.deserialize(config) - self.assertEqual(deserialized_layer.__class__.__name__, - layer.__class__.__name__) - self.assertEqual(deserialized_layer.activation.__class__.__name__, - activation.__class__.__name__) - - def test_softmax(self): - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.softmax(x)]) - test_values = np.random.random((2, 5)) - - result = f([test_values])[0] - expected = _ref_softmax(test_values[0]) - self.assertAllClose(result[0], expected, rtol=1e-05) - - x = backend.placeholder(ndim=1) - with self.assertRaises(ValueError): - activations.softmax(x) - - def test_softmax_2d_axis_0(self): - x = backend.placeholder(ndim=2) - f = 
backend.function([x], [activations.softmax(x, axis=0)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - expected = np.zeros((2, 5)) - for i in range(5): - expected[:, i] = _ref_softmax(test_values[:, i]) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_softmax_3d_axis_tuple(self): - x = backend.placeholder(ndim=3) - f = backend.function([x], [activations.softmax(x, axis=(1, 2))]) - test_values = np.random.random((2, 3, 5)) - result = f([test_values])[0] - expected = np.zeros((2, 3, 5)) - for i in range(2): - expected[i, :, :] = _ref_softmax(test_values[i, :, :]) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_temporal_softmax(self): - x = backend.placeholder(shape=(2, 2, 3)) - f = backend.function([x], [activations.softmax(x)]) - test_values = np.random.random((2, 2, 3)) * 10 - result = f([test_values])[0] - expected = _ref_softmax(test_values[0, 0]) - self.assertAllClose(result[0, 0], expected, rtol=1e-05) - - def test_selu(self): - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.selu(x)]) - alpha = 1.6732632423543772848170429916717 - scale = 1.0507009873554804934193349852946 - - positive_values = np.array([[1, 2]], dtype=backend.floatx()) - result = f([positive_values])[0] - self.assertAllClose(result, positive_values * scale, rtol=1e-05) - - negative_values = np.array([[-1, -2]], dtype=backend.floatx()) - result = f([negative_values])[0] - true_result = (np.exp(negative_values) - 1) * scale * alpha - self.assertAllClose(result, true_result) - - def test_softplus(self): - def softplus(x): - return np.log(np.ones_like(x) + np.exp(x)) - - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.softplus(x)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - expected = softplus(test_values) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_softsign(self): - def softsign(x): - return np.divide(x, np.ones_like(x) + np.absolute(x)) - - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.softsign(x)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - expected = softsign(test_values) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_sigmoid(self): - def ref_sigmoid(x): - if x >= 0: - return 1 / (1 + np.exp(-x)) - else: - z = np.exp(x) - return z / (1 + z) - sigmoid = np.vectorize(ref_sigmoid) - - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.sigmoid(x)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - expected = sigmoid(test_values) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_hard_sigmoid(self): - def ref_hard_sigmoid(x): - x = (x * 0.2) + 0.5 - z = 0.0 if x <= 0 else (1.0 if x >= 1 else x) - return z - hard_sigmoid = np.vectorize(ref_hard_sigmoid) - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.hard_sigmoid(x)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - expected = hard_sigmoid(test_values) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_relu(self): - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.relu(x)]) - positive_values = np.random.random((2, 5)) - result = f([positive_values])[0] - self.assertAllClose(result, positive_values, rtol=1e-05) - - negative_values = np.random.uniform(-1, 0, (2, 5)) - result = f([negative_values])[0] - expected = np.zeros((2, 5)) - self.assertAllClose(result, expected, rtol=1e-05) - - def 
test_gelu(self): - - def gelu(x, approximate=False): - if approximate: - return 0.5 * x * (1.0 + np.tanh( - np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3)))) - else: - from scipy.stats import norm # pylint: disable=g-import-not-at-top - return x * norm.cdf(x) - - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.gelu(x)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - expected = gelu(test_values) - self.assertAllClose(result, expected, rtol=1e-05) - - f = backend.function([x], [activations.gelu(x, True)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - expected = gelu(test_values, True) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_elu(self): - x = backend.placeholder(ndim=2) - f = backend.function([x], [activations.elu(x, 0.5)]) - test_values = np.random.random((2, 5)) - result = f([test_values])[0] - self.assertAllClose(result, test_values, rtol=1e-05) - negative_values = np.array([[-1, -2]], dtype=backend.floatx()) - result = f([negative_values])[0] - true_result = (np.exp(negative_values) - 1) / 2 - self.assertAllClose(result, true_result) - - def test_tanh(self): - test_values = np.random.random((2, 5)) - x = backend.placeholder(ndim=2) - exp = activations.tanh(x) - f = backend.function([x], [exp]) - result = f([test_values])[0] - expected = np.tanh(test_values) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_exponential(self): - test_values = np.random.random((2, 5)) - x = backend.placeholder(ndim=2) - exp = activations.exponential(x) - f = backend.function([x], [exp]) - result = f([test_values])[0] - expected = np.exp(test_values) - self.assertAllClose(result, expected, rtol=1e-05) - - def test_linear(self): - x = np.random.random((10, 5)) - self.assertAllClose(x, activations.linear(x)) - - def test_invalid_usage(self): - with self.assertRaises(ValueError): - activations.get('unknown') - - # The following should be possible but should raise a warning: - activations.get(activation_layers.LeakyReLU()) - - -if __name__ == '__main__': - tf.test.main() + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class KerasActivationsTest(tf.test.TestCase, parameterized.TestCase): + def test_serialization(self): + all_activations = [ + "softmax", + "relu", + "elu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear", + "softplus", + "softsign", + "selu", + "gelu", + "relu6", + "mish", + ] + for name in all_activations: + fn = activations.get(name) + ref_fn = getattr(activations, name) + assert fn == ref_fn + config = activations.serialize(fn) + fn = activations.deserialize(config) + assert fn == ref_fn + + def test_serialization_v2(self): + activation_map = {tf.math.softmax: "softmax"} + for fn_v2_key in activation_map: + fn_v2 = activations.get(fn_v2_key) + config = activations.serialize(fn_v2) + fn = activations.deserialize(config) + assert fn.__name__ == activation_map[fn_v2_key] + + def test_serialization_with_layers(self): + activation = activation_layers.LeakyReLU(alpha=0.1) + layer = core.Dense(3, activation=activation) + config = serialization.serialize(layer) + # with custom objects + deserialized_layer = serialization.deserialize( + config, custom_objects={"LeakyReLU": activation} + ) + self.assertEqual( + deserialized_layer.__class__.__name__, layer.__class__.__name__ + ) + self.assertEqual( + deserialized_layer.activation.__class__.__name__, + activation.__class__.__name__, + ) + # without custom objects + deserialized_layer = 
serialization.deserialize(config) + self.assertEqual( + deserialized_layer.__class__.__name__, layer.__class__.__name__ + ) + self.assertEqual( + deserialized_layer.activation.__class__.__name__, + activation.__class__.__name__, + ) + + def test_softmax(self): + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.softmax(x)]) + test_values = np.random.random((2, 5)) + + result = f([test_values])[0] + expected = _ref_softmax(test_values[0]) + self.assertAllClose(result[0], expected, rtol=1e-05) + + x = backend.placeholder(ndim=1) + with self.assertRaises(ValueError): + activations.softmax(x) + + def test_softmax_2d_axis_0(self): + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.softmax(x, axis=0)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + expected = np.zeros((2, 5)) + for i in range(5): + expected[:, i] = _ref_softmax(test_values[:, i]) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_softmax_3d_axis_tuple(self): + x = backend.placeholder(ndim=3) + f = backend.function([x], [activations.softmax(x, axis=(1, 2))]) + test_values = np.random.random((2, 3, 5)) + result = f([test_values])[0] + expected = np.zeros((2, 3, 5)) + for i in range(2): + expected[i, :, :] = _ref_softmax(test_values[i, :, :]) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_temporal_softmax(self): + x = backend.placeholder(shape=(2, 2, 3)) + f = backend.function([x], [activations.softmax(x)]) + test_values = np.random.random((2, 2, 3)) * 10 + result = f([test_values])[0] + expected = _ref_softmax(test_values[0, 0]) + self.assertAllClose(result[0, 0], expected, rtol=1e-05) + + def test_selu(self): + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.selu(x)]) + alpha = 1.6732632423543772848170429916717 + scale = 1.0507009873554804934193349852946 + + positive_values = np.array([[1, 2]], dtype=backend.floatx()) + result = f([positive_values])[0] + self.assertAllClose(result, positive_values * scale, rtol=1e-05) + + negative_values = np.array([[-1, -2]], dtype=backend.floatx()) + result = f([negative_values])[0] + true_result = (np.exp(negative_values) - 1) * scale * alpha + self.assertAllClose(result, true_result) + + def test_softplus(self): + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.softplus(x)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + expected = _ref_softplus(test_values) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_softsign(self): + def softsign(x): + return np.divide(x, np.ones_like(x) + np.absolute(x)) + + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.softsign(x)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + expected = softsign(test_values) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_sigmoid(self): + def ref_sigmoid(x): + if x >= 0: + return 1 / (1 + np.exp(-x)) + else: + z = np.exp(x) + return z / (1 + z) + + sigmoid = np.vectorize(ref_sigmoid) + + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.sigmoid(x)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + expected = sigmoid(test_values) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_hard_sigmoid(self): + def ref_hard_sigmoid(x): + x = (x * 0.2) + 0.5 + z = 0.0 if x <= 0 else (1.0 if x >= 1 else x) + return z + + hard_sigmoid = np.vectorize(ref_hard_sigmoid) + x = backend.placeholder(ndim=2) + f = 
backend.function([x], [activations.hard_sigmoid(x)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + expected = hard_sigmoid(test_values) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_relu(self): + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.relu(x)]) + positive_values = np.random.random((2, 5)) + result = f([positive_values])[0] + self.assertAllClose(result, positive_values, rtol=1e-05) + + negative_values = np.random.uniform(-1, 0, (2, 5)) + result = f([negative_values])[0] + expected = np.zeros((2, 5)) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_gelu(self): + def gelu(x, approximate=False): + if approximate: + return ( + 0.5 + * x + * ( + 1.0 + + np.tanh( + np.sqrt(2.0 / np.pi) + * (x + 0.044715 * np.power(x, 3)) + ) + ) + ) + else: + from scipy.stats import norm + + return x * norm.cdf(x) + + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.gelu(x)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + expected = gelu(test_values) + self.assertAllClose(result, expected, rtol=1e-05) + + f = backend.function([x], [activations.gelu(x, True)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + expected = gelu(test_values, True) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_elu(self): + x = backend.placeholder(ndim=2) + f = backend.function([x], [activations.elu(x, 0.5)]) + test_values = np.random.random((2, 5)) + result = f([test_values])[0] + self.assertAllClose(result, test_values, rtol=1e-05) + negative_values = np.array([[-1, -2]], dtype=backend.floatx()) + result = f([negative_values])[0] + true_result = (np.exp(negative_values) - 1) / 2 + self.assertAllClose(result, true_result) + + def test_tanh(self): + test_values = np.random.random((2, 5)) + x = backend.placeholder(ndim=2) + exp = activations.tanh(x) + f = backend.function([x], [exp]) + result = f([test_values])[0] + expected = np.tanh(test_values) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_exponential(self): + test_values = np.random.random((2, 5)) + x = backend.placeholder(ndim=2) + exp = activations.exponential(x) + f = backend.function([x], [exp]) + result = f([test_values])[0] + expected = np.exp(test_values) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_mish(self): + test_values = np.random.random((2, 5)) + x = backend.placeholder(ndim=2) + output = activations.mish(x) + f = backend.function([x], [output]) + result = f([test_values])[0] + expected = test_values * np.tanh(_ref_softplus(test_values)) + self.assertAllClose(result, expected, rtol=1e-05) + + def test_linear(self): + x = np.random.random((10, 5)) + self.assertAllClose(x, activations.linear(x)) + + def test_invalid_usage(self): + with self.assertRaises(ValueError): + activations.get("unknown") + + # The following should be possible but should raise a warning: + activations.get(activation_layers.LeakyReLU()) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/api/BUILD b/keras/api/BUILD index 3707baa50007..3bcfc7a2d61b 100644 --- a/keras/api/BUILD +++ b/keras/api/BUILD @@ -1,10 +1,12 @@ # Description: # Package for Keras. 
+# Placeholder: load unaliased py_library load("//keras/api:api_gen.bzl", "gen_api_init_files") load("//keras/api:api_init_files.bzl", "KERAS_API_INIT_FILES", "KERAS_API_INIT_FILES_V1") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/py/tensorflow:__subpackages__", @@ -52,7 +54,6 @@ keras_packages = [ "keras.datasets.mnist", "keras.datasets.reuters", "keras.dtensor.layout_map", - "keras.dtensor.optimizers", "keras.engine.base_layer", "keras.engine.data_adapter", "keras.engine.input_layer", @@ -60,22 +61,21 @@ keras_packages = [ "keras.engine.sequential", "keras.engine.training", "keras.estimator", + "keras.export.export_lib", "keras.feature_column.dense_features", "keras.feature_column.dense_features_v2", "keras.feature_column.sequence_feature_column", # Placeholder for internal API "keras.initializers", + "keras.initializers.initializers", "keras.initializers.initializers_v1", - "keras.initializers.initializers_v2", "keras.layers.activation", "keras.layers.attention", "keras.layers.convolutional", "keras.layers.core", "keras.layers.locally_connected", "keras.layers.merging", - "keras.layers.normalization.batch_normalization", - "keras.layers.normalization.batch_normalization_v1", - "keras.layers.normalization.layer_normalization", + "keras.layers.normalization", "keras.layers.preprocessing", "keras.layers.pooling", "keras.layers.regularization", @@ -93,33 +93,24 @@ keras_packages = [ "keras.mixed_precision.loss_scale_optimizer", "keras.mixed_precision.policy", "keras.models", + "keras.optimizers.adadelta", + "keras.optimizers.adagrad", + "keras.optimizers.adam", + "keras.optimizers.adamax", + "keras.optimizers.ftrl", + "keras.optimizers.nadam", + "keras.optimizers.sgd", + "keras.optimizers.optimizer", + "keras.optimizers.rmsprop", "keras.optimizers.legacy.adadelta", "keras.optimizers.legacy.adagrad", "keras.optimizers.legacy.adam", "keras.optimizers.legacy.adamax", "keras.optimizers.legacy.ftrl", + "keras.optimizers.legacy.gradient_descent", "keras.optimizers.legacy.nadam", - "keras.optimizers.legacy.optimizer", + "keras.optimizers.legacy.optimizer_v2", "keras.optimizers.legacy.rmsprop", - "keras.optimizers.legacy.sgd", - "keras.optimizers.optimizer_experimental.adadelta", - "keras.optimizers.optimizer_experimental.adagrad", - "keras.optimizers.optimizer_experimental.adam", - "keras.optimizers.optimizer_experimental.adamax", - "keras.optimizers.optimizer_experimental.ftrl", - "keras.optimizers.optimizer_experimental.nadam", - "keras.optimizers.optimizer_experimental.sgd", - "keras.optimizers.optimizer_experimental.optimizer", - "keras.optimizers.optimizer_experimental.rmsprop", - "keras.optimizers.optimizer_v2.adadelta", - "keras.optimizers.optimizer_v2.adagrad", - "keras.optimizers.optimizer_v2.adam", - "keras.optimizers.optimizer_v2.adamax", - "keras.optimizers.optimizer_v2.ftrl", - "keras.optimizers.optimizer_v2.gradient_descent", - "keras.optimizers.optimizer_v2.nadam", - "keras.optimizers.optimizer_v2.optimizer_v2", - "keras.optimizers.optimizer_v2.rmsprop", "keras.optimizers.schedules.learning_rate_schedule", "keras.optimizers", "keras.premade_models.linear", @@ -128,9 +119,9 @@ keras_packages = [ "keras.preprocessing.sequence", "keras.preprocessing.text", "keras.regularizers", - "keras.saving.model_config", - "keras.saving.save", - "keras.saving.saved_model_experimental", + "keras.saving.legacy.model_config", + "keras.saving.legacy.save", + "keras.saving.legacy.serialization", 
"keras.testing_infra.test_utils", "keras.utils.data_utils", "keras.utils.generic_utils", @@ -140,7 +131,6 @@ keras_packages = [ "keras.utils.np_utils", "keras.utils.tf_utils", "keras.utils.vis_utils", - "keras.wrappers.scikit_learn", ] # The target used by PIP package which need to generate API init files during OSS build. @@ -167,6 +157,9 @@ gen_api_init_files( package_deps = [ "//keras", "//:expect_tensorflow_installed", + # "//third_party/tensorflow/lite/python:analyzer", + # "//third_party/tensorflow/lite/python:lite", + # "//third_party/tensorflow/lite/python/authoring", ], packages = keras_packages, ) @@ -181,6 +174,9 @@ gen_api_init_files( package_deps = [ "//keras", "//:expect_tensorflow_installed", + # "//third_party/tensorflow/lite/python:analyzer", + # "//third_party/tensorflow/lite/python:lite", + # "//third_party/tensorflow/lite/python/authoring", ], packages = keras_packages, ) @@ -195,6 +191,9 @@ gen_api_init_files( package_deps = [ "//keras", "//:expect_tensorflow_installed", + # "//third_party/tensorflow/lite/python:analyzer", + # "//third_party/tensorflow/lite/python:lite", + # "//third_party/tensorflow/lite/python/authoring", ], packages = keras_packages, ) diff --git a/keras/api/api_gen.bzl b/keras/api/api_gen.bzl index 225c0900e0b2..f0d0cc067eba 100644 --- a/keras/api/api_gen.bzl +++ b/keras/api/api_gen.bzl @@ -9,6 +9,9 @@ and it imports TensorFlow code, that installing TensorFlow python package is required to Bazel build Keras. """ +load("@org_keras//keras:keras.bzl", "if_indexing_source_code") +# Placeholder: load aliased py_binary + def gen_api_init_files( name, output_files, @@ -19,7 +22,9 @@ def gen_api_init_files( compat_api_versions = [], compat_init_templates = [], packages = ["keras"], - package_deps = ["//keras:keras"], + package_deps = [ + "//keras:keras", + ], output_package = "keras.api", output_dir = "", root_file_name = "__init__.py"): @@ -94,19 +99,32 @@ def gen_api_init_files( # Disable them for now so that we don't get SymbolExposedTwiceError # from create_python_api.py packages_to_ignore = ["tensorflow.python.keras", "tensorflow.keras"] + + flags = [ + root_init_template_flag, + "--apidir=$(@D)" + output_dir, + "--apiname=" + api_name, + "--apiversion=" + str(api_version), + compat_api_version_flags, + compat_init_template_flags, + "--packages=" + ",".join(packages), + "--packages_to_ignore=" + ",".join(packages_to_ignore), + "--output_package=" + output_package, + ] + native.genrule( name = name, outs = all_output_files, - cmd = ( - "$(location :" + api_gen_binary_target + ") " + - root_init_template_flag + " --apidir=$(@D)" + output_dir + - " --apiname=" + api_name + " --apiversion=" + str(api_version) + - compat_api_version_flags + " " + compat_init_template_flags + - " --packages=" + ",".join(packages) + - " --packages_to_ignore=" + ",".join(packages_to_ignore) + - " --output_package=" + output_package + " $(OUTS)" + cmd = if_indexing_source_code( + _make_cmd(api_gen_binary_target, flags, loading = "static"), + _make_cmd(api_gen_binary_target, flags, loading = "default"), ), srcs = srcs, - exec_tools = [":" + api_gen_binary_target], + tools = [":" + api_gen_binary_target], visibility = ["//visibility:public"], ) + +def _make_cmd(api_gen_binary_target, flags, loading = "default"): + binary = "$(location :" + api_gen_binary_target + ")" + flags.append("--loading=" + loading) + return " ".join([binary] + flags + ["$(OUTS)"]) diff --git a/keras/api/api_init_files.bzl b/keras/api/api_init_files.bzl index a7007e1dd235..48cfef198d73 100644 --- 
diff --git a/keras/api/api_init_files.bzl b/keras/api/api_init_files.bzl index a7007e1dd235..48cfef198d73 100644 --- a/keras/api/api_init_files.bzl +++ b/keras/api/api_init_files.bzl @@ -9,6 +9,7 @@ KERAS_API_INIT_FILES = [ "keras/__internal__/layers/__init__.py", "keras/__internal__/losses/__init__.py", "keras/__internal__/models/__init__.py", + "keras/__internal__/optimizers/__init__.py", "keras/__internal__/utils/__init__.py", "keras/activations/__init__.py", "keras/applications/__init__.py", @@ -49,6 +50,7 @@ KERAS_API_INIT_FILES = [ "keras/dtensor/experimental/optimizers/__init__.py", "keras/estimator/__init__.py", "keras/experimental/__init__.py", + "keras/export/__init__.py", # Placeholder for internal API "keras/initializers/__init__.py", "keras/layers/__init__.py", @@ -56,6 +58,7 @@ KERAS_API_INIT_FILES = [ "keras/layers/experimental/preprocessing/__init__.py", "keras/losses/__init__.py", "keras/metrics/__init__.py", + "keras/metrics/experimental/__init__.py", "keras/mixed_precision/__init__.py", "keras/models/__init__.py", "keras/models/experimental/__init__.py", @@ -69,8 +72,10 @@ KERAS_API_INIT_FILES = [ "keras/preprocessing/sequence/__init__.py", "keras/preprocessing/text/__init__.py", "keras/regularizers/__init__.py", + "keras/saving/__init__.py", "keras/utils/__init__.py", "keras/utils/experimental/__init__.py", + "keras/utils/legacy/__init__.py", "keras/wrappers/__init__.py", "keras/wrappers/scikit_learn/__init__.py", ] @@ -119,6 +124,7 @@ KERAS_API_INIT_FILES_V1 = [ "keras/datasets/reuters/__init__.py", "keras/estimator/__init__.py", "keras/experimental/__init__.py", + "keras/export/__init__.py", "keras/initializers/__init__.py", "keras/layers/__init__.py", "keras/layers/experimental/__init__.py", @@ -136,7 +142,9 @@ KERAS_API_INIT_FILES_V1 = [ "keras/preprocessing/sequence/__init__.py", "keras/preprocessing/text/__init__.py", "keras/regularizers/__init__.py", + "keras/saving/__init__.py", "keras/utils/__init__.py", + "keras/utils/legacy/__init__.py", "keras/wrappers/__init__.py", "keras/wrappers/scikit_learn/__init__.py", ] diff --git a/keras/api/create_python_api_wrapper.py b/keras/api/create_python_api_wrapper.py index 83602c3aace3..c02c26e2cf99 100644 --- a/keras/api/create_python_api_wrapper.py +++ b/keras/api/create_python_api_wrapper.py @@ -23,8 +23,12 @@ from __future__ import division from __future__ import print_function -import keras # pylint: disable=unused-import -from tensorflow.python.tools.api.generator import create_python_api +import keras # noqa: F401 -if __name__ == '__main__': - create_python_api.main() +# isort: off +from tensorflow.python.tools.api.generator import ( + create_python_api, +) + +if __name__ == "__main__": + create_python_api.main() diff --git a/keras/api/golden/BUILD b/keras/api/golden/BUILD index 5c2a24c0669e..68d1e26f28fe 100644 --- a/keras/api/golden/BUILD +++ b/keras/api/golden/BUILD @@ -1,6 +1,7 @@ # TensorFlow API backwards compatibility test goldens.
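The golden `.pbtxt` updates that follow pin the new public signatures. As a reading aid (a hypothetical decoding, not code from the patch), an `argspec` entry such as the updated `compile` one below corresponds to a Python signature along these lines:

```python
# argspec: args=['self', 'optimizer', ..., 'pss_evaluation_shards'],
#          varargs=None, keywords=kwargs, defaults=['rmsprop', ..., '0']
def compile(self, optimizer="rmsprop", loss=None, metrics=None,
            loss_weights=None, weighted_metrics=None, run_eagerly=None,
            steps_per_execution=None, jit_compile=None,
            pss_evaluation_shards=0, **kwargs):
    ...
```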
package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 ) diff --git a/keras/api/golden/v1/tensorflow.keras.-model.pbtxt b/keras/api/golden/v1/tensorflow.keras.-model.pbtxt index 679bc3d70094..a867fb43ebd1 100644 --- a/keras/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -12,10 +12,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -52,6 +60,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -112,6 +124,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -176,13 +192,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -216,6 +240,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +260,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -252,6 +288,10 @@ tf_class 
{ name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -264,13 +304,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -310,7 +358,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -326,7 +378,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/keras/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 9c322a1e659a..fc9edeb88c5f 100644 --- a/keras/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -14,10 +14,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -54,6 +62,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -114,6 +126,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ 
-182,13 +198,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -222,6 +246,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +266,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -258,6 +294,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -270,13 +310,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, 
defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -320,7 +368,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -336,7 +388,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt index 7f8976f0c0bf..68aa8fd65565 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -129,7 +129,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'seed\', \'force_generator\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " + argspec: "args=[\'self\', \'seed\', \'force_generator\', \'rng_type\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\', \'None\'], " } member_method { name: "add_loss" @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
a/keras/api/golden/v1/tensorflow.keras.__internal__.layers.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.layers.pbtxt index 429049587d64..1a3ec3c07eb7 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.layers.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.layers.pbtxt @@ -1,9 +1,5 @@ path: "tensorflow.keras.__internal__.layers" tf_module { - member { - name: "BaseImageAugmentationLayer" - mtype: "" - } member { name: "BaseRandomLayer" mtype: "" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling1-d.pbtxt index 32026fb12491..b724000004d0 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling1-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling2-d.pbtxt index eb8ca29e8d1d..509a218c1f55 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling2-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling3-d.pbtxt index 6f813150220b..4a4f882460b4 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-average-pooling3-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-batch-normalization.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-batch-normalization.pbtxt index 38842e3849c2..77ae4cffed0c 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-batch-normalization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-batch-normalization.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,9 +171,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, 
keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "compute_mask" @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv1-d.pbtxt index 699e2f4e8eeb..70cef7d5638a 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv1-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -203,6 +207,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -243,6 +251,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d-transpose.pbtxt index f899e0e7a152..2a1dc3989ad9 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d-transpose.pbtxt @@ -8,8 +8,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -172,6 +172,10 @@ 
tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -204,6 +208,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -244,6 +252,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d.pbtxt index 4986cbfc2c67..3562610db383 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv2-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -203,6 +207,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -243,6 +251,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d-transpose.pbtxt index 6739698fb60c..743619e0478e 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d-transpose.pbtxt @@ -8,8 +8,8 @@ tf_class { is_instance: "" is_instance: 
"" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -172,6 +172,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -204,6 +208,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -244,6 +252,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d.pbtxt index 36f8e63244ae..1975283a7815 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-conv3-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -203,6 +207,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -243,6 +251,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dense.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dense.pbtxt index 93db8f2a0118..9c3540980571 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dense.pbtxt +++ 
b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dense.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -170,6 +170,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -198,6 +202,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +246,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dropout.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dropout.pbtxt index 177e51b470b9..99f55801f524 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dropout.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-dropout.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], " @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-flatten.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-flatten.pbtxt index 0b5594ac61c7..d390aade084f 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-flatten.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-flatten.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -170,6 +170,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -198,6 +202,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +246,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-layer.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-layer.pbtxt index b04e90fe0aa1..fa5c90d9b193 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-layer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-layer.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -168,6 +168,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +244,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, 
defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling1-d.pbtxt index 3c40a6c2a881..5a57d0d4f744 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling1-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling2-d.pbtxt index fc781675d783..f0a9659a69de 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling2-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: 
"args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling3-d.pbtxt index cc87c1d42329..dd0436a5821a 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-max-pooling3-d.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -171,6 +171,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -199,6 +203,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -239,6 +247,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv1-d.pbtxt index 69a8b2e51d19..f1169e363e5f 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv1-d.pbtxt @@ -8,8 +8,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -172,6 +172,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -204,6 +208,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -244,6 +252,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + 
member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv2-d.pbtxt index 7fe6d5194b2f..9815b88f8fa0 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.layers.-separable-conv2-d.pbtxt @@ -8,8 +8,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -172,6 +172,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -204,6 +208,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -244,6 +252,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-l-s-t-m-cell.pbtxt index 7924e21ee229..91129cd63d4c 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-l-s-t-m-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-l-s-t-m-cell.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -179,6 +179,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" @@ -207,6 +211,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method 
{ name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -251,6 +259,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-r-n-n-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-r-n-n-cell.pbtxt index 78bc3cad1b66..a056b2db71ee 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-r-n-n-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-basic-r-n-n-cell.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -179,6 +179,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" @@ -207,6 +211,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -251,6 +259,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-device-wrapper.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-device-wrapper.pbtxt index 8ba415e602e8..06e5a0742dcb 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-device-wrapper.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-device-wrapper.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -179,6 +179,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" @@ -207,6 +211,10 @@ tf_class { name: "from_config" argspec: 
"args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -251,6 +259,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-dropout-wrapper.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-dropout-wrapper.pbtxt index 3bcd4f8b03e0..560abe76df77 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-dropout-wrapper.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-dropout-wrapper.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -183,6 +183,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" @@ -211,6 +215,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -255,6 +263,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-g-r-u-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-g-r-u-cell.pbtxt index ff5a9c974c42..f047c7b161cc 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-g-r-u-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-g-r-u-cell.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -179,6 +179,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', 
\'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" @@ -207,6 +211,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -251,6 +259,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-l-s-t-m-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-l-s-t-m-cell.pbtxt index 91f2d4ea5d12..917b7da630f8 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-l-s-t-m-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-l-s-t-m-cell.pbtxt @@ -7,8 +7,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -179,6 +179,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" @@ -207,6 +211,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -251,6 +259,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-multi-r-n-n-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-multi-r-n-n-cell.pbtxt index 99c6ee484e32..b87a1077437e 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-multi-r-n-n-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-multi-r-n-n-cell.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { 
@@ -178,6 +178,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None" @@ -206,6 +210,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -250,6 +258,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-r-n-n-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-r-n-n-cell.pbtxt index 931f25495034..b12bdab443b1 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-r-n-n-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-r-n-n-cell.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -177,6 +177,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" @@ -205,6 +209,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -249,6 +257,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-residual-wrapper.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-residual-wrapper.pbtxt index 584643d04a57..0c537a8bdea9 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-residual-wrapper.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.legacy.rnn_cell.-residual-wrapper.pbtxt @@ -7,8 +7,8 @@ 
tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -179,6 +179,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" @@ -207,6 +211,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -251,6 +259,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.pbtxt b/keras/api/golden/v1/tensorflow.keras.__internal__.pbtxt index 2d6a3892f43b..6b25413391c4 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.__internal__.pbtxt @@ -8,4 +8,8 @@ tf_module { name: "legacy" mtype: "" } + member_method { + name: "enable_unsafe_deserialization" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } } diff --git a/keras/api/golden/v1/tensorflow.keras.activations.pbtxt b/keras/api/golden/v1/tensorflow.keras.activations.pbtxt index 28814e567e8d..ab982a5c4e4a 100644 --- a/keras/api/golden/v1/tensorflow.keras.activations.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.activations.pbtxt @@ -2,7 +2,7 @@ path: "tensorflow.keras.activations" tf_module { member_method { name: "deserialize" - argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'name\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "elu" @@ -24,6 +24,10 @@ tf_module { name: "linear" argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "mish" + argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "relu" argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0.0\'], " @@ -34,7 +38,7 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'activation\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'activation\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "sigmoid" diff --git a/keras/api/golden/v1/tensorflow.keras.backend.pbtxt b/keras/api/golden/v1/tensorflow.keras.backend.pbtxt index a66ad258c8e0..6cc28ec691ae 100644 --- a/keras/api/golden/v1/tensorflow.keras.backend.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.backend.pbtxt @@ -62,11 +62,7 @@ 
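
Two module-level additions sit in the hunks above: tf.keras.__internal__ gains enable_unsafe_deserialization(), and the activations module gains mish plus a use_legacy_format switch on serialize/deserialize. A quick illustration grounded in those argspecs; the input values are arbitrary:

import tensorflow as tf

# mish(x) = x * tanh(softplus(x))
x = tf.constant([-1.0, 0.0, 1.0])
y = tf.keras.activations.mish(x)

# Round-trip through the legacy (pre-saving-v3) serialization format.
cfg = tf.keras.activations.serialize(
    tf.keras.activations.mish, use_legacy_format=True)
mish_fn = tf.keras.activations.deserialize(cfg, use_legacy_format=True)

# Opt-in escape hatch for loading saved models that contain arbitrary
# lambdas; only call this for files you trust.
tf.keras.__internal__.enable_unsafe_deserialization()
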
tf_module { } member_method { name: "binary_focal_crossentropy" - argspec: "args=[\'target\', \'output\', \'gamma\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\'], " - } - member_method { - name: "binary_weighted_focal_crossentropy" - argspec: "args=[\'target\', \'output\', \'alpha\', \'gamma\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\'], " + argspec: "args=[\'target\', \'output\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\'], " } member_method { name: "cast" @@ -80,6 +76,10 @@ tf_module { name: "categorical_crossentropy" argspec: "args=[\'target\', \'output\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " } + member_method { + name: "categorical_focal_crossentropy" + argspec: "args=[\'target\', \'output\', \'alpha\', \'gamma\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'-1\'], " + } member_method { name: "clear_session" argspec: "args=[], varargs=None, keywords=None, defaults=None" @@ -502,7 +502,7 @@ tf_module { } member_method { name: "sparse_categorical_crossentropy" - argspec: "args=[\'target\', \'output\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " + argspec: "args=[\'target\', \'output\', \'from_logits\', \'axis\', \'ignore_class\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " } member_method { name: "spatial_2d_padding" diff --git a/keras/api/golden/v1/tensorflow.keras.callbacks.-callback-list.pbtxt b/keras/api/golden/v1/tensorflow.keras.callbacks.-callback-list.pbtxt index 3835ea4c944a..d3b5171b22c1 100644 --- a/keras/api/golden/v1/tensorflow.keras.callbacks.-callback-list.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.callbacks.-callback-list.pbtxt @@ -10,6 +10,10 @@ tf_class { name: "append" argspec: "args=[\'self\', \'callback\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "make_logs" + argspec: "args=[\'self\', \'model\', \'logs\', \'outputs\', \'mode\', \'prefix\'], varargs=None, keywords=None, defaults=[\'\'], " + } member_method { name: "on_batch_begin" argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt b/keras/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt index 75512300c8ab..2f6f3059b9b0 100644 --- a/keras/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\', \'restore_best_weights\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\', \'restore_best_weights\', \'start_from_epoch\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\', \'False\', \'0\'], " } member_method { name: "get_monitor_value" diff --git a/keras/api/golden/v1/tensorflow.keras.callbacks.-sidecar-evaluator-model-export.pbtxt b/keras/api/golden/v1/tensorflow.keras.callbacks.-sidecar-evaluator-model-export.pbtxt new file 
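
The backend hunk above folds the removed binary_weighted_focal_crossentropy into binary_focal_crossentropy behind an apply_class_balancing flag, adds categorical_focal_crossentropy, and gives sparse_categorical_crossentropy an ignore_class argument. A sketch using the new signatures; all tensors are toy values:

import tensorflow as tf
from tensorflow.keras import backend as K

y_true = tf.constant([[1.0], [0.0]])
y_pred = tf.constant([[0.9], [0.2]])

# apply_class_balancing=True reproduces the removed "weighted" variant:
# positives are weighted by alpha, negatives by 1 - alpha.
bfc = K.binary_focal_crossentropy(
    y_true, y_pred, apply_class_balancing=True, alpha=0.25, gamma=2.0)

# New multi-class counterpart.
cfc = K.categorical_focal_crossentropy(
    tf.constant([[0.0, 1.0, 0.0]]), tf.constant([[0.1, 0.8, 0.1]]),
    alpha=0.25, gamma=2.0)

# ignore_class masks a label out of the loss, e.g. a "void" class in
# segmentation targets; here the second example is ignored.
scc = K.sparse_categorical_crossentropy(
    tf.constant([0, 2]),
    tf.constant([[0.9, 0.05, 0.05], [0.3, 0.3, 0.4]]),
    ignore_class=2)
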
mode 100644 index 000000000000..0a33bbb4e389 --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.callbacks.-sidecar-evaluator-model-export.pbtxt @@ -0,0 +1,83 @@ +path: "tensorflow.keras.callbacks.SidecarEvaluatorModelExport" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'export_filepath\', \'checkpoint_filepath\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "on_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_epoch_begin" + argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_epoch_end" + argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "on_train_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "set_model" + argspec: "args=[\'self\', \'model\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_params" + argspec: "args=[\'self\', \'params\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.callbacks.pbtxt b/keras/api/golden/v1/tensorflow.keras.callbacks.pbtxt index 31716a24407a..1d92b38192a5 100644 --- a/keras/api/golden/v1/tensorflow.keras.callbacks.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.callbacks.pbtxt @@ -48,6 +48,10 @@ tf_module { name: "RemoteMonitor" mtype: "" } + member { + name: "SidecarEvaluatorModelExport" + mtype: "" + } member { name: "TensorBoard" mtype: "" diff --git 
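
Alongside the new SidecarEvaluatorModelExport golden file added here, the EarlyStopping hunk above grows a start_from_epoch argument. A usage sketch; the monitor choice, patience values, and all filesystem paths are illustrative, and the description of the new callback is paraphrased from its name and constructor arguments:

import tensorflow as tf

# Skip the first 10 warm-up epochs before early stopping starts counting.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, start_from_epoch=10,
    restore_best_weights=True)

# Meant to pair with SidecarEvaluator-style training: reloads the best
# checkpoint from checkpoint_filepath and exports it as a SavedModel.
export_best = tf.keras.callbacks.SidecarEvaluatorModelExport(
    export_filepath="/tmp/export/best",
    checkpoint_filepath="/tmp/ckpt/best")

# model.fit(..., callbacks=[early_stop, export_best])
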
a/keras/api/golden/v1/tensorflow.keras.constraints.-constraint.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.-constraint.pbtxt index b13e4c558f14..ebce5a630d42 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.-constraint.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.-constraint.pbtxt @@ -5,6 +5,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.-max-norm.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.-max-norm.pbtxt index b96e2fdc7649..751357a36cbf 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.-max-norm.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.-max-norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'max_value\', \'axis\'], varargs=None, keywords=None, defaults=[\'2\', \'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.-min-max-norm.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.-min-max-norm.pbtxt index 85017a5ab9fa..f385c813ca5c 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.-min-max-norm.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.-min-max-norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'min_value\', \'max_value\', \'rate\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'1.0\', \'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.-non-neg.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.-non-neg.pbtxt index 278f33d15b82..ab3251209eff 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.-non-neg.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.-non-neg.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.-radial-constraint.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.-radial-constraint.pbtxt index 9fa92b2ccc62..54e6adf3e719 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.-radial-constraint.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.-radial-constraint.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.-unit-norm.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.-unit-norm.pbtxt index 
a8ebd4eb371b..b821bbb8acc0 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.-unit-norm.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.-unit-norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.max_norm.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.max_norm.pbtxt index bc201d9df1fb..42aeaf7e0f02 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.max_norm.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.max_norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'max_value\', \'axis\'], varargs=None, keywords=None, defaults=[\'2\', \'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.min_max_norm.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.min_max_norm.pbtxt index e260340d0c25..47ab0d1105bf 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.min_max_norm.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.min_max_norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'min_value\', \'max_value\', \'rate\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'1.0\', \'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.non_neg.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.non_neg.pbtxt index 4f8c1d767db8..0a8c23153108 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.non_neg.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.non_neg.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.pbtxt index 29444ef3405f..be3658a12225 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.pbtxt @@ -46,7 +46,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -54,6 +54,6 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'constraint\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'constraint\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git 
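
Every constraint class in these hunks gains a from_config classmethod, and the constraints module's serialize/deserialize pick up the same use_legacy_format switch seen elsewhere in this diff. A round-trip sketch; the max_value and axis are arbitrary:

import tensorflow as tf

c = tf.keras.constraints.MaxNorm(max_value=2, axis=0)

# Class-level round trip, now uniform with other Keras objects.
c2 = tf.keras.constraints.MaxNorm.from_config(c.get_config())

# Module-level round trip in the legacy serialization format.
blob = tf.keras.constraints.serialize(c, use_legacy_format=True)
c3 = tf.keras.constraints.deserialize(blob, use_legacy_format=True)
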
a/keras/api/golden/v1/tensorflow.keras.constraints.radial_constraint.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.radial_constraint.pbtxt index 8dca693a318b..78d401b280ff 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.radial_constraint.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.radial_constraint.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.constraints.unit_norm.pbtxt b/keras/api/golden/v1/tensorflow.keras.constraints.unit_norm.pbtxt index 1aa9da9db057..137cb505e73c 100644 --- a/keras/api/golden/v1/tensorflow.keras.constraints.unit_norm.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.constraints.unit_norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.datasets.reuters.pbtxt b/keras/api/golden/v1/tensorflow.keras.datasets.reuters.pbtxt index 2da4a13067f2..6f6446eb4296 100644 --- a/keras/api/golden/v1/tensorflow.keras.datasets.reuters.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.datasets.reuters.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.keras.datasets.reuters" tf_module { + member_method { + name: "get_label_names" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_word_index" argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=[\'reuters_word_index.json\'], " diff --git a/keras/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt b/keras/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt index cd4acbef5375..81bdedcb4e2e 100644 --- a/keras/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], " + argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\', \'warmup_target\', \'warmup_steps\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'None\', \'0\'], " } member_method { name: "from_config" diff --git a/keras/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/keras/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index 4324f56e2fc7..8301a65833d6 100644 --- a/keras/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -13,10 +13,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: 
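Two functional additions surface in the hunks above: `datasets.reuters.get_label_names()` and linear-warmup support on `CosineDecay` via `warmup_target`/`warmup_steps` (defaults `None`/`0`, i.e. warmup disabled). A sketch of the warmup form, assuming the schedule ramps from `initial_learning_rate` to `warmup_target` over `warmup_steps` before the cosine decay begins:

    import tensorflow as tf

    schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=0.0,   # warmup starting point
        decay_steps=10_000,          # cosine phase length, after warmup
        warmup_target=1e-3,          # peak learning rate reached by warmup
        warmup_steps=1_000,
    )
    optimizer = tf.keras.optimizers.SGD(learning_rate=schedule)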
"distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -53,6 +61,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -113,6 +125,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -177,13 +193,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -217,6 +241,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +261,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -253,6 +289,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -265,13 +305,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, 
keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -311,7 +359,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -327,7 +379,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt b/keras/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt index 5f0bfddb6bb1..e87a1ec3ddc6 100644 --- a/keras/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'features\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt 
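Three behavior-relevant signature changes land in the hunk above: `load_weights` swaps the order of `by_name` and `skip_mismatch` (callers passing them positionally need updating), `save` shrinks to `(filepath, overwrite, save_format, **kwargs)`, and `summary` gains `layer_range`. A sketch of the latter two (the layer names assume the defaults of a fresh session):

    import tensorflow as tf

    model = tf.keras.Sequential(
        [tf.keras.layers.Dense(8, input_shape=(4,)), tf.keras.layers.Dense(1)]
    )

    model.save("model_dir", save_format="tf")        # explicit SavedModel
    model.summary(layer_range=["dense", "dense_1"])  # print only that slice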
b/keras/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index ed849f0c4597..44e02e9b4cad 100644 --- a/keras/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -13,10 +13,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -53,6 +61,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -113,6 +125,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -177,13 +193,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -217,6 +241,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +261,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -253,6 +289,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + 
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -265,13 +305,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -311,7 +359,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -327,7 +379,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.experimental.pbtxt b/keras/api/golden/v1/tensorflow.keras.experimental.pbtxt index d719121da99f..c658bcdc5b69 100644 --- a/keras/api/golden/v1/tensorflow.keras.experimental.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.experimental.pbtxt @@ -20,12 +20,4 @@ tf_module { name: "WideDeepModel" mtype: "" } - member_method { - name: "export_saved_model" - argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " - } - member_method { - name: "load_from_saved_model" - argspec: "args=[\'saved_model_path\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " - } } diff --git a/keras/api/golden/v1/tensorflow.keras.export.-export-archive.pbtxt b/keras/api/golden/v1/tensorflow.keras.export.-export-archive.pbtxt new file mode 100644 index 000000000000..4b245b4b999e --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.export.-export-archive.pbtxt @@ -0,0 +1,27 @@ +path: "tensorflow.keras.export.ExportArchive" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + 
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_endpoint" + argspec: "args=[\'self\', \'name\', \'fn\', \'input_signature\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable_collection" + argspec: "args=[\'self\', \'name\', \'variables\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "track" + argspec: "args=[\'self\', \'resource\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "write_out" + argspec: "args=[\'self\', \'filepath\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.export.pbtxt b/keras/api/golden/v1/tensorflow.keras.export.pbtxt new file mode 100644 index 000000000000..ee81034d6104 --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.export.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.keras.export" +tf_module { + member { + name: "ExportArchive" + mtype: "" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.initializers.-initializer.pbtxt b/keras/api/golden/v1/tensorflow.keras.initializers.-initializer.pbtxt index bbbf17dcface..848e5d352657 100644 --- a/keras/api/golden/v1/tensorflow.keras.initializers.-initializer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.initializers.-initializer.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.initializers.Initializer" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v1/tensorflow.keras.initializers.pbtxt b/keras/api/golden/v1/tensorflow.keras.initializers.pbtxt index 11794d5005ad..b8832017c3c3 100644 --- a/keras/api/golden/v1/tensorflow.keras.initializers.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.initializers.pbtxt @@ -106,7 +106,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -114,6 +114,6 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'initializer\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'initializer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt index bb63c66b2c51..d7238394f940 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -163,6 +163,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None" @@ -191,6 +195,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -227,6 +235,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt index 22ac65768a1c..d1ee21e3e902 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt index 4b2adcb785c0..8c47a61250e0 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-add.pbtxt index d6fc58c323b4..5127ff3dfaf2 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-add.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-add.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt index a182400aba45..8ed84a4a760b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,9 +157,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\', 
\'use_causal_mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], " } member_method { name: "compute_mask" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt index f6ae42888aa8..b65b0c1c182c 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt index dfba79459a37..c8c3027e9f66 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,9 +157,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], 
varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\', \'use_causal_mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], " } member_method { name: "compute_mask" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt index 5a2274e65da3..d1d687125d83 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt index 0758cd27ac34..c3c3f70274a7 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" 
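As the `call` argspecs above show, `Attention` and `AdditiveAttention` gain `use_causal_mask` (default `False`). A sketch of decoder-style self-attention with the new flag:

    import tensorflow as tf

    query = tf.random.normal((2, 8, 16))
    value = tf.random.normal((2, 8, 16))

    attn = tf.keras.layers.Attention()
    # Applies a lower-triangular mask so position i attends only to j <= i.
    out = attn([query, value], use_causal_mask=True)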
is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt index bcf4b5d80bf0..cdd976ab992b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-average.pbtxt index 85dabd3a64c1..5552bd555473 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-average.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-average.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" 
is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt index 43b071dc39ac..0fb5acc44d0a 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt index 003b77ca6d25..b46848ddfc0d 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: 
"" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt index 3eec8aea498d..c5f4a9b9b827 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt index fc3a6fca4d7a..81ab7531f219 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: 
"" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -130,7 +130,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\', \'trainable\', \'virtual_batch_size\', \'adjustment\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'None\', \'True\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" } member_method { name: "add_loss" @@ -156,9 +156,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compute_mask" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt index 19f50844e54d..4df4de9226d7 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -160,6 +160,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\', \'initial_state\', \'constants\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], 
varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,9 +228,17 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-category-encoding.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-category-encoding.pbtxt index 0df48cefb4b3..dfa0cbabae9c 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-category-encoding.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-category-encoding.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'count_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-center-crop.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-center-crop.pbtxt index c52a54221059..c4a5aa0e3c9a 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-center-crop.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-center-crop.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: 
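`Bidirectional.reset_states` now matches the wrapped RNN's signature, taking an optional `states` argument instead of always zeroing. A sketch for a stateful wrapper; the explicit-states form is left commented because the required nesting of forward/backward states is version-dependent:

    import numpy as np
    import tensorflow as tf

    layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4, stateful=True))
    inputs = tf.keras.Input(batch_shape=(2, 5, 3))  # stateful RNNs need a fixed batch
    model = tf.keras.Model(inputs, layer(inputs))

    model.predict(np.zeros((2, 5, 3)), verbose=0)  # populates the states

    layer.reset_states()          # zeros, the previous behavior
    # layer.reset_states(states)  # new: supply explicit state arrays instead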
"args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt index 60920b75bbd6..229006d485a4 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt index c47f2afd7e18..13da3b785c9f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -246,6 +246,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" 
argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -274,6 +278,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -310,10 +318,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index 341952bb31f3..341d73a2cc91 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -246,6 +246,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -274,6 +278,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -310,10 +318,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt index 2fb22764b37b..e6257107a1d1 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" 
is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -246,6 +246,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -274,6 +278,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -310,10 +318,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d-transpose.pbtxt index 2ee4dbc50c27..5b3beb8b16d3 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt index 
af41da6af123..5dff50a6f509 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt index 1989036fe4c0..67f03d1ce309 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt index 
ae13a9283a5f..7413b8674afa 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt index 64875c946786..c66d6ffb327b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt index 
7ab3a6d14952..5c0774f967b4 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt index ba7e168af377..7484ce7ebb52 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt index 497bfe47f8b3..418e5d2b6bde 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt index 54a19a815066..dc4369ec905b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
a/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt index a277662f5333..47258f5833e4 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt index 5f7efd7d6859..8219381a59ec 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], 
varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt index 9dc46686425b..b334463bb54e 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt index a049e4297da2..1d516ece0c4f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], 
varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt index c7b804272d5e..569ff8d26659 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt index 95d47a6b9c23..0d1f2865f73d 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, 
keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt index cdb54fafc989..1827cda0cf38 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -165,6 +165,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -193,6 +197,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,10 +241,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt index 66519a796e59..cdad1bfac324 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -165,6 +165,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -193,6 +197,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,10 +241,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: 
"args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt index 128f7e636d27..4e91e6e6709a 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt index 5adb1b1ebce6..b29161038bc4 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" 
@@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt index bdf88e8ca557..5d3179479b72 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt index 531c33aaa3a5..42f987270aaf 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], 
varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-discretization.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-discretization.pbtxt index 7a127fa7b94c..5563d613800d 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-discretization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-discretization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt index be8dd47922f4..a43e3ea8e126 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, 
keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt index 7b5db859f05f..0c504b38714b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt index bc6cae7d82bf..338f8569be21 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + 
member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-einsum-dense.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-einsum-dense.pbtxt index e29b94e2fe12..0d878e1b6c76 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-einsum-dense.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-einsum-dense.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt index db9812c187b4..d0acb29f450e 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -129,7 +129,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'input_dim\', \'output_dim\', \'embeddings_initializer\', \'embeddings_regularizer\', \'activity_regularizer\', \'embeddings_constraint\', \'mask_zero\', \'input_length\'], varargs=None, keywords=kwargs, defaults=[\'uniform\', \'None\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'input_dim\', \'output_dim\', \'embeddings_initializer\', \'embeddings_regularizer\', \'activity_regularizer\', 
\'embeddings_constraint\', \'mask_zero\', \'input_length\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'uniform\', \'None\', \'None\', \'None\', \'False\', \'None\', \'False\'], " } member_method { name: "add_loss" @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt index 496304ff4865..26ff207938f5 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt index 2643beaa5715..f6fe569b9525 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt +++ 
b/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -230,6 +238,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +250,10 @@ tf_class { name: "reset_recurrent_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt index 4a3099b0d687..a6e6dec7d7b7 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -228,6 +228,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -256,6 +260,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -292,10 +300,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + 
argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt index f57338d6e9b0..cfafd9e73d29 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt index f9ffe97e40e2..03c265aeb58b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, 
defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt index bb0ca41b58cb..aaffbb42402c 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt index 78c5b4570884..5a5d64006850 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + 
member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt index f767993ce840..d211a3a0ac13 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt index d13f9da6e9e9..f98c5fe73db4 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: 
"get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt index b3c9acb03564..93ccb22cc8ac 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt index aed9b8ebb0f7..f8a2802d8e5f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, 
defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt index eceeb2398af5..0c9d82c99469 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt index 0770d689735b..6aa97dfdc59e 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt index 4b61d5b49001..80177870bba2 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt index 99304d23491f..8b9a4c6e7c68 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, 
defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt index b4b2e891654f..8f4bf30b4514 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt index 6f8359590304..b165d98428f1 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-hashing.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-hashing.pbtxt index 866f602987d8..ef1b9e56c2b2 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-hashing.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-hashing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-identity.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-identity.pbtxt new file mode 100644 index 000000000000..3c3e39996588 --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.layers.-identity.pbtxt @@ -0,0 +1,242 @@ +path: "tensorflow.keras.layers.Identity" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } 
+ member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, 
defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt index 796d62350d8f..7564a7f8bc7c 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt index b52a8fee62b1..b86e2487a1ea 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - 
is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -230,6 +238,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +250,10 @@ tf_class { name: "reset_recurrent_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt index adf9bc7ca5ba..07d70ebe6935 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -228,6 +228,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -256,6 +260,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -292,10 +300,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', 
\'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt index a20e5aaa6404..bb97d088dad2 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt index 40f56df8297f..1a81ce6f16e0 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" 
+ } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt index a4b82d09fc3c..b50481b62f7b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt @@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Layer" tf_class { is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -154,6 +154,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -182,6 +186,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -214,6 +222,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt index 6999a0d8ec4c..96cc14f91e00 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } 
member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt index 3b1a787ccda2..f8b6b11e281f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt index b078db2d0529..fb34dfb1c8e0 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: 
"args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt index 5021731d2885..cb3ac42a4afa 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt index 8dc902d78f47..0d9dc7499d58 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], 
varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt index 0d0d4841e616..e1092bf07672 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt index 514ca738be10..4696c58634a4 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, 
keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt index e6b925656d73..a021d15e3615 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt index 456185fa892b..8bea460ac28f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], 
varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt index a3267fed10f6..14a7d00de1cd 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt index cdbe440dedee..cc8218f7a9db 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, 
keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt index 5285f5c3220e..709c847a6953 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-multi-head-attention.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-multi-head-attention.pbtxt index e8cb5e7f8a68..4b8080a1b78b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-multi-head-attention.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-multi-head-attention.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,9 +155,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'query\', \'value\', \'key\', \'attention_mask\', \'return_attention_scores\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'query\', \'value\', \'key\', \'attention_mask\', \'return_attention_scores\', \'training\', \'use_causal_mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\'], " } member_method { name: "compute_mask" @@ -165,7 +169,7 @@ tf_class { } member_method { name: "compute_output_shape" - argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'query_shape\', \'value_shape\', \'key_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { 
name: "compute_output_signature" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt index d0c3cbb0d595..3ef05dd0015f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-normalization.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-normalization.pbtxt index f43dcd2f9b27..baa8fba13bdd 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-normalization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-normalization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, 
keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt index 21589d6bb696..899af13f3363 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt index 22c083ff6d12..e08c6381543c 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { 
+ name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt index 703f18bbe89f..4dc7b8c60319 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\', \'constants\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -223,10 +231,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt index 1687d54efa2a..831131154f98 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" 
is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt index fd5601eddeb2..a401a54ae021 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-rescaling.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-rescaling.pbtxt index 238b54fb3e7e..2b52e5fa301f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-rescaling.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-rescaling.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" 
is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt index 55b178a767a7..8af2743e9061 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-resizing.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-resizing.pbtxt index 3bf862774281..f04ecffd3a19 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-resizing.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-resizing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 
@@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt index 730d65cdc6e7..6922c5910055 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt index bfa77c16d89c..b4d943239992 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" 
is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt index f91360016768..d21d6693bcc2 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt index 94962ea83281..312c27f69b33 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" 
- is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt index d0fae29f2f6c..20da793c2a37 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -229,6 +237,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -237,6 +249,10 @@ tf_class { name: "reset_recurrent_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt index b5f215070dc0..60a8f5172402 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -216,6 +216,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -244,6 +248,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -280,10 +288,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt index b9be91a03f91..e8e05a00ece5 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } 
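
Note on the golden updates above: every layer class in these files gains the same four methods, `get_build_config`, `build_from_config`, `save_own_variables`, and `load_own_variables`. These are the base-`Layer` hooks behind the new idempotent saving path: the first pair lets a reloaded model re-create a layer's variables without tracing `call()`, the second pair reads and writes the layer's own weights through a dict-like `store`. Below is a minimal sketch of how a custom layer might override them; `ScaledDense`, its weight names, and the `"0"`/`"1"` key layout in `store` are illustrative assumptions for this sketch, not part of this diff or a documented contract.

```python
import tensorflow as tf


class ScaledDense(tf.keras.layers.Layer):
    """Hypothetical layer illustrating the four new serialization hooks."""

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        # Remember the shape ourselves so get_build_config() can return it.
        self._saved_input_shape = input_shape
        self.w = self.add_weight(name="w", shape=(input_shape[-1], self.units))
        self.scale = self.add_weight(
            name="scale", shape=(), trainable=False, initializer="ones"
        )
        super().build(input_shape)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) * self.scale

    # Build-config pair: lets deserialization rebuild the variables
    # directly from the recorded input shape, without running call().
    def get_build_config(self):
        return {"input_shape": self._saved_input_shape}

    def build_from_config(self, config):
        self.build(config["input_shape"])

    # Own-variables pair: `store` behaves like a dict mapping string keys
    # to arrays; the indices used here are an assumption for the sketch.
    def save_own_variables(self, store):
        store["0"] = self.w.numpy()
        store["1"] = self.scale.numpy()

    def load_own_variables(self, store):
        self.w.assign(store["0"])
        self.scale.assign(store["1"])
```

The default base-class implementations appear to round-trip the built input shape and enumerate `self.weights` in order, so most layers get sensible behavior without overriding anything; overriding is only needed when a layer stores state outside its tracked weights.
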
member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt index 606b7bc5f895..0f926be02b9b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt index db4d2c885fc5..1bb81438fca3 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + 
argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt index 1137eac88299..f31ec33f7cfd 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt index 1c7dda9c0dc6..747de047f96c 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -163,6 +163,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'constants\', \'training\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " @@ -191,6 +195,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -227,6 +235,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt index dec895ec98ee..d6bba621d770 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt index 9e04347d2a22..835f784b295f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + 
name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt index 3e13ed5ab652..814d7168679b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt index c2f1d3d12cc2..ff61b890ceef 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" 
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt index 00cc45f498f3..383e28967517 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt index 89a07682e536..b2a2d89c1748 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: 
"get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt index a05086a1651d..149f9e61613f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt index eeb09f5a6a85..2ef8d53b6940 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 
+223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt index 86805c95d9d0..5f5c510ec23f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt index 1789d6ec811c..03fc8519bb09 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, 
defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt index 82e611df04e5..0da8e034e5a8 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt index ba2ad738ee29..fb529f555a8c 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], 
varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt index 63f019cf6868..a741778c72dd 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'count_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt index a5358c4b811a..b2b7d584a5fc 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], 
varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt index 06c517cf9c26..f61c4f82c5bb 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt index 5f9c8f541ac5..a608049a6d8a 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: 
"" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt index c93956fe0e79..e6f797f63416 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt index ce3100e121f0..942ce222c3e9 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -163,6 +163,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -195,6 +199,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -227,6 +235,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index fb98877a03cc..fdbab246741b 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt index 6135cdea2bbe..c11fb59691fb 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index bf1243851874..a624e03a4d94 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -12,10 +12,6 @@ tf_module { name: "Discretization" mtype: "" } - member { - name: "HashedCrossing" - mtype: "" - } member { name: "Hashing" mtype: "" diff --git a/keras/api/golden/v1/tensorflow.keras.layers.pbtxt b/keras/api/golden/v1/tensorflow.keras.layers.pbtxt index 3596baa6505d..6ae37c06b75f 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.layers.pbtxt @@ -268,6 +268,10 @@ tf_module { name: "Hashing" mtype: "" } + member { + name: "Identity" + mtype: "" + } member { name: "InputLayer" mtype: "" @@ -494,7 +498,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', 
\'False\'], " } member_method { name: "disable_v2_dtype_behavior" @@ -522,7 +526,7 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'layer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "subtract" diff --git a/keras/api/golden/v1/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt b/keras/api/golden/v1/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt index 2c2a286f740e..ac49b8fc8701 100644 --- a/keras/api/golden/v1/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt @@ -6,7 +6,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\', \'0.0\', \'-1\', \'auto\', \'binary_focal_crossentropy\'], " + argspec: "args=[\'self\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\', \'auto\', \'binary_focal_crossentropy\'], " } member_method { name: "call" diff --git a/keras/api/golden/v1/tensorflow.keras.losses.-categorical-focal-crossentropy.pbtxt b/keras/api/golden/v1/tensorflow.keras.losses.-categorical-focal-crossentropy.pbtxt new file mode 100644 index 000000000000..f06b44ec8765 --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.losses.-categorical-focal-crossentropy.pbtxt @@ -0,0 +1,23 @@ +path: "tensorflow.keras.losses.CategoricalFocalCrossentropy" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\', \'auto\', \'categorical_focal_crossentropy\'], " + } + member_method { + name: "call" + argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/keras/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt index 06d9cfe145ee..389b05c75d5d 100644 --- a/keras/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt @@ -6,7 +6,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'auto\', \'sparse_categorical_crossentropy\'], " + argspec: "args=[\'self\', \'from_logits\', \'ignore_class\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'auto\', \'sparse_categorical_crossentropy\'], " } member_method { name: "call" diff --git a/keras/api/golden/v1/tensorflow.keras.losses.pbtxt b/keras/api/golden/v1/tensorflow.keras.losses.pbtxt index 
diff --git a/keras/api/golden/v1/tensorflow.keras.losses.pbtxt b/keras/api/golden/v1/tensorflow.keras.losses.pbtxt index b3294965eeff..2b628cdc7943 100644 --- a/keras/api/golden/v1/tensorflow.keras.losses.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.losses.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "CategoricalCrossentropy" mtype: "" } + member { + name: "CategoricalFocalCrossentropy" + mtype: "" + } member { name: "CategoricalHinge" mtype: "" @@ -94,12 +98,16 @@ tf_module { } member_method { name: "binary_focal_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\', \'0.0\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " } member_method { name: "categorical_crossentropy" argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.0\', \'-1\'], " } + member_method { + name: "categorical_focal_crossentropy" + argspec: "args=[\'y_true\', \'y_pred\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " + } member_method { name: "categorical_hinge" argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None" @@ -118,7 +126,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'name\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -186,11 +194,11 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'loss\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'loss\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "sparse_categorical_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\', \'ignore_class\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " } member_method { name: "squared_hinge" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt index 35f9a429b865..171da23f3bc1 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt @@ -1,11 +1,11 @@ path: "tensorflow.keras.metrics.AUC" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -160,6 +160,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec:
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -224,6 +232,10 @@ tf_class { name: "interpolate_pr_auc" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -240,6 +252,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt index 26fbd0b585bf..863b948441e9 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.Accuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt index 789c93e9c821..4b8759cf7628 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.BinaryAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" 
is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt index 4e88a2ad5ddd..16228d4229f2 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.BinaryCrossentropy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-binary-io-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-binary-io-u.pbtxt index 590f84d1e583..49e4ac2946e7 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-binary-io-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-binary-io-u.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.BinaryIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt index f910dc4b0696..c56abceaeb13 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, 
defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt index 27abc004b332..92d50ec7a5f1 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CategoricalCrossentropy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -133,7 +133,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], " + argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\', \'-1\'], " } member_method { name: "add_loss" @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, 
keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt index 4bb20d940f1f..f4386171e6f5 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CategoricalHinge" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt index 0dbf94fa93df..221cbe34edd0 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CosineSimilarity" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { 
name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-f-beta-score.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-f-beta-score.pbtxt new file mode 100644 index 000000000000..37847a1f933d --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-f-beta-score.pbtxt @@ -0,0 +1,263 @@ +path: "tensorflow.keras.metrics.FBetaScore" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'average\', \'beta\', \'threshold\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'None\', \'fbeta_score\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, 
defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregationV2.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'y_true_shape\', \'y_pred_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_state" + argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_states" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "result" + argspec: "args=[\'self\'], varargs=None, 
keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_state" + argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-f1-score.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-f1-score.pbtxt new file mode 100644 index 000000000000..56d233b0b5fc --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-f1-score.pbtxt @@ -0,0 +1,264 @@ +path: "tensorflow.keras.metrics.F1Score" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'average\', \'threshold\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'f1_score\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregationV2.SUM\', 
\'VariableSynchronization.ON_READ\', \'None\', \'None\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'y_true_shape\', \'y_pred_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_state" + argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_states" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_state" + argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +}
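
The two new golden files above add tf.keras.metrics.FBetaScore and its beta=1 special case F1Score (previously available in TensorFlow Addons) to the public metrics API. Both expect one-hot or multilabel (batch, num_classes) inputs; average selects per-class scores (None), "micro", "macro", or "weighted" reduction, and threshold=None falls back to an argmax of y_pred. A hedged sketch with invented tensors:

    import tensorflow as tf

    f1 = tf.keras.metrics.F1Score(average="macro", threshold=0.5)
    fbeta = tf.keras.metrics.FBetaScore(average="macro", beta=2.0, threshold=0.5)

    y_true = tf.constant([[1.0, 0.0, 0.0],
                          [0.0, 1.0, 0.0],
                          [0.0, 0.0, 1.0]])
    y_pred = tf.constant([[0.9, 0.05, 0.05],
                          [0.2, 0.7, 0.1],
                          [0.1, 0.2, 0.7]])

    f1.update_state(y_true, y_pred)
    fbeta.update_state(y_true, y_pred)
    print(f1.result().numpy(), fbeta.result().numpy())
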
argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt index ad1ffb7d5e1d..12518c046e4d 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.FalseNegatives" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt index 0dfa8b5ee1a6..d3a260bc7f5f 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.FalsePositives" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { 
name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt index b9ef8b808189..c01adca8b432 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.Hinge" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-io-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-io-u.pbtxt index c8e3cac66dac..3b3e4ed1e707 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-io-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-io-u.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.IoU" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { 
@@ -131,7 +131,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'target_class_ids\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', \'target_class_ids\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_true\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'True\', \'-1\'], " } member_method { name: "add_loss" @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt index 2c31b5fccac2..8fe4028c968d 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.KLDivergence" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + 
member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt index 81ff9033cdac..862a2c127f69 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.LogCoshError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt index 50832f259e8d..4db047358108 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.MeanAbsoluteError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', 
\'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt index dfc975031555..c1a4285ba95d 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.MeanAbsolutePercentageError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } 
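Every metric golden in this change picks up the new `load_own_variables`/`save_own_variables` hooks (argspec `args=['self', 'store']`). A minimal sketch of the intended round-trip, assuming a Keras build that ships these hooks; the dict-like `store` and its exact key format are inherited base-`Layer` behavior, not something these goldens spell out:

```python
import tensorflow as tf

m = tf.keras.metrics.MeanAbsoluteError()
m.update_state([[0.0, 1.0]], [[1.0, 1.0]])  # total=0.5, count=1

store = {}
m.save_own_variables(store)  # writes the metric's variable values into the store

m2 = tf.keras.metrics.MeanAbsoluteError()
m2.update_state([[0.0, 0.0]], [[0.0, 0.0]])  # ensure the variables exist first
m2.load_own_variables(store)  # restores total/count from the store

print(float(m.result()), float(m2.result()))  # 0.5 0.5
```

This is the mechanism the new saving format uses to move metric state in and out of a checkpoint without going through `get_weights`/`set_weights`.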
member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt index a2c1fbea4afa..eb8b2c471f44 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.MeanIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -132,7 +132,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_true\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'True\', \'-1\'], " } member_method { name: "add_loss" @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt index 951c151fdc79..d84345e14e31 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } 
member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt index 10b3a82a0c8c..697c4e0bb74b 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.MeanRelativeError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt index ec4d424986b5..ceb5282f0746 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.MeanSquaredError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt index ecfebc72ad3b..2d5cf64c2c3d 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, 
keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt index a4ee5fc8e909..6e8ba1767c97 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -192,6 +196,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -224,6 +232,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -240,6 +252,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt index 80d830fb7efc..c31d49e14b7f 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } 
member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt index 905c92a33ab9..916ae93096e5 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -231,6 +243,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-io-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-io-u.pbtxt index 853ae3bcf38e..23fd50224c5c 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-io-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-io-u.pbtxt 
@@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.OneHotIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -132,7 +132,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'target_class_ids\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', \'target_class_ids\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'-1\'], " } member_method { name: "add_loss" @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt index e20224e9b14e..98b63a62da97 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.OneHotMeanIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -133,7 +133,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'-1\'], " } member_method { name: "add_loss" @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: 
"args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt index 29ccceda1abe..1d5f8c6efcb7 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.Poisson" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, 
keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-precision-at-recall.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-precision-at-recall.pbtxt index ec505dc742e9..21f1c36bdc1b 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-precision-at-recall.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-precision-at-recall.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.PrecisionAtRecall" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt index fe1822fc8d53..d9c49540edcb 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt @@ -1,11 +1,11 @@ path: "tensorflow.keras.metrics.Precision" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,10 
@@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +244,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-r2-score.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-r2-score.pbtxt new file mode 100644 index 000000000000..1e76ffb29ad4 --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-r2-score.pbtxt @@ -0,0 +1,263 @@ +path: "tensorflow.keras.metrics.R2Score" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'class_aggregation\', \'num_regressors\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'uniform_average\', \'0\', \'r2_score\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', 
\'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregationV2.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'y_true_shape\', \'y_pred_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_state" + argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_states" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_state" + argspec: "args=[\'self\', 
\'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-recall-at-precision.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-recall-at-precision.pbtxt index e8ab0f6ce1c6..5aa668718b0e 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-recall-at-precision.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-recall-at-precision.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.RecallAtPrecision" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt index 52e9879a3446..e7c4864a1bbd 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt @@ -1,11 +1,11 @@ path: "tensorflow.keras.metrics.Recall" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +244,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt index cd99b1e8e29e..64671f63b4c0 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.RootMeanSquaredError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt index 0da727a14110..9b35e4f14197 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt @@ -1,12 +1,12 @@ path: 
"tensorflow.keras.metrics.SensitivityAtSpecificity" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt index d47d06739b2a..d960b99eccb4 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SparseCategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: 
"merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt index 4fdc705aa389..c5bd4c6f59db 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SparseCategoricalCrossentropy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -133,7 +133,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], " + argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'ignore_class\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'None\', \'-1\'], " } member_method { name: "add_loss" @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt index dd386c6cba5d..069a3e3b2727 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt +++ 
b/keras/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt index 15dfa9412558..9f42d1f0b3c2 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.SpecificityAtSensitivity" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + 
argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt index 0f76c4a43b47..83437f332258 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SquaredHinge" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt index ccd3ac0c8752..6cb46d1f93e4 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], 
varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt index dd26258eb1bb..6355e88e1858 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.TopKCategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt index 
af0fb7936462..95bc523abd0c 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.TrueNegatives" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt index 0e1124fbc296..863fb2911873 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.TruePositives" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.metrics.pbtxt b/keras/api/golden/v1/tensorflow.keras.metrics.pbtxt index b9b466ae381b..40356586b0ad 100644 --- a/keras/api/golden/v1/tensorflow.keras.metrics.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.metrics.pbtxt @@ -36,6 +36,14 @@ tf_module { name: "CosineSimilarity" mtype: "" } + member { + name: "F1Score" + mtype: "" + } + member { + name: "FBetaScore" + mtype: "" + } member { name: "FalseNegatives" mtype: "" @@ -120,6 +128,10 @@ tf_module { name: "PrecisionAtRecall" mtype: "" } + member { + name: "R2Score" + mtype: "" + } member { name: "Recall" mtype: "" @@ -202,7 +214,7 @@ tf_module { } member_method { name: "binary_focal_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\', \'0.0\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " } member_method { name: "categorical_accuracy" @@ -212,6 +224,10 @@ tf_module { name: "categorical_crossentropy" argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.0\', \'-1\'], " } + member_method { + name: "categorical_focal_crossentropy" + argspec: "args=[\'y_true\', \'y_pred\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " + } member_method { name: "cosine" argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], " @@ -222,7 +238,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -290,7 +306,7 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'metric\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'metric\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "sparse_categorical_accuracy" @@ -298,7 +314,7 @@ tf_module { } member_method { name: "sparse_categorical_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\', \'ignore_class\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " } member_method { name: "sparse_top_k_categorical_accuracy" diff --git 
a/keras/api/golden/v1/tensorflow.keras.models.-linear-model.pbtxt b/keras/api/golden/v1/tensorflow.keras.models.-linear-model.pbtxt index d98738fda8cd..a7e40b8a197c 100644 --- a/keras/api/golden/v1/tensorflow.keras.models.-linear-model.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.models.-linear-model.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -13,10 +13,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -53,6 +61,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -113,6 +125,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -177,13 +193,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -217,6 +241,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +261,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -253,6 +289,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: 
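
Every model class in these hunks picks up the same trainer-level surface: `jit_compile`, `steps_per_execution`, `distribute_reduction_method` and `autotune_steps_per_execution` become readable properties, and `compile()` accepts `pss_evaluation_shards` (sharded exact evaluation under `ParameterServerStrategy`; the default `0` above disables it). A sketch of the parts usable on any setup:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    model.compile(
        optimizer="adam",
        loss="mse",
        steps_per_execution=32,   # batches executed per tf.function call
        jit_compile=True,         # XLA-compile the train/test/predict steps
    )
    print(model.jit_compile, model.steps_per_execution)
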
"get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -265,13 +305,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -311,7 +359,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -327,7 +379,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/keras/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 8f0115b30ac0..af5a892ca740 100644 --- a/keras/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -12,10 +12,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -52,6 +60,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -112,6 +124,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -176,13 +192,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + 
name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -216,6 +240,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +260,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -252,6 +288,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -264,13 +304,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -310,7 +358,11 @@ tf_class { } member_method { name: "save" - 
argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -326,7 +378,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/keras/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 8b85b77488b4..a6f046c2e06a 100644 --- a/keras/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -14,10 +14,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -54,6 +62,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -114,6 +126,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -182,13 +198,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -222,6 +246,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', 
\'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +266,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -258,6 +294,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -270,13 +310,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -320,7 +368,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -336,7 +388,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.models.-wide-deep-model.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.models.-wide-deep-model.pbtxt index 337ec78ac8f7..ee3b09f7c98d 100644 --- a/keras/api/golden/v1/tensorflow.keras.models.-wide-deep-model.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.models.-wide-deep-model.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -13,10 +13,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -53,6 +61,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -113,6 +125,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -177,13 +193,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -217,6 +241,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +261,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -253,6 +289,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -265,13 +305,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -311,7 +359,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -327,7 +379,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v1/tensorflow.keras.models.pbtxt b/keras/api/golden/v1/tensorflow.keras.models.pbtxt index f2a185c0b9d1..8d5fd58f2776 100644 --- a/keras/api/golden/v1/tensorflow.keras.models.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.models.pbtxt @@ -22,7 +22,7 @@ tf_module { } member_method { name: "load_model" - argspec: "args=[\'filepath\', \'custom_objects\', \'compile\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " + argspec: "args=[\'filepath\', \'custom_objects\', \'compile\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\', \'True\'], " } member_method { name: "model_from_config" @@ -38,6 +38,6 @@ tf_module { } member_method { name: "save_model" - argspec: "args=[\'model\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'model\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " } } diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt index d0856c75be4a..ff4531cd44fb 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.Adadelta" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt index 17f68fd67db0..4e35fed07fd1 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.Adagrad" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt index 74fa9869ad54..697ca03f6150 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.Adam" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt index ae0d88760eb5..c488d88b72e8 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.Adamax" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt index 2cfd1ca6b71c..e75a11b74f4b 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.Ftrl" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt index 2d18b1b4774b..a09e7ac9a467 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.Nadam" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt index 5a9d33eea359..43c247557a69 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.optimizers.Optimizer" tf_class { - is_instance: "" - 
is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt index d53b8c656ddc..8b093190fb74 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.RMSprop" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt index f354c71298ce..78fdecf4d12d 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.optimizers.SGD" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt index 9c1b406a1d6f..05ae2888d367 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adadelta" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt index 736ee08e4efb..507148f08dbb 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adagrad" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adam.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adam.pbtxt index 7d0d3b23614c..d79093442bd9 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adam.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adam.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adam" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adamax.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adamax.pbtxt index 149d0f213893..b18db03163b8 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adamax.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-adamax.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adamax" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt index 9ce47c161678..b852c98df0e6 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Ftrl" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-nadam.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-nadam.pbtxt index 8a612f6b89b2..ef505faade82 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-nadam.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-nadam.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Nadam" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt index 6b4bf1701f22..f28c01037044 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt @@ -1,8 +1,7 @@ path: "tensorflow.keras.optimizers.legacy.Optimizer" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt index 77a6e72a9411..f53b0568fe11 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.RMSprop" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt index f6a6dd836e72..ab1041592075 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.SGD" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.pbtxt index 94bf1bf82da6..a06dbfc73903 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.pbtxt @@ -46,14 +46,14 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "get" - argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'identifier\'], varargs=None, keywords=kwargs, defaults=None" 
} member_method { name: "serialize" - argspec: "args=[\'optimizer\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'optimizer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt index 13a711fe288b..6df561f3342e 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], " + argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\', \'warmup_target\', \'warmup_steps\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'None\', \'0\'], " } member_method { name: "from_config" diff --git a/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt b/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt index 3ecc437199f6..8ed0edccf925 100644 --- a/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt @@ -30,10 +30,10 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "serialize" - argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'learning_rate_schedule\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v1/tensorflow.keras.pbtxt b/keras/api/golden/v1/tensorflow.keras.pbtxt index c83d9ad57524..a5592a0f08b7 100644 --- a/keras/api/golden/v1/tensorflow.keras.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.pbtxt @@ -40,6 +40,10 @@ tf_module { name: "experimental" mtype: "" } + member { + name: "export" + mtype: "" + } member { name: "initializers" mtype: "" @@ -77,11 +81,11 @@ tf_module { mtype: "" } member { - name: "utils" + name: "saving" mtype: "" } member { - name: "wrappers" + name: "utils" mtype: "" } member_method { diff --git a/keras/api/golden/v1/tensorflow.keras.regularizers.pbtxt b/keras/api/golden/v1/tensorflow.keras.regularizers.pbtxt index 96a4b193b1bd..f424d54785b0 100644 --- a/keras/api/golden/v1/tensorflow.keras.regularizers.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.regularizers.pbtxt @@ -26,7 +26,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -38,6 +38,6 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'regularizer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v1/tensorflow.keras.saving.custom_object_scope.pbtxt 
b/keras/api/golden/v1/tensorflow.keras.saving.custom_object_scope.pbtxt new file mode 100644 index 000000000000..cf877e5ae4dd --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.saving.custom_object_scope.pbtxt @@ -0,0 +1,9 @@ +path: "tensorflow.keras.saving.custom_object_scope" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=args, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.saving.pbtxt b/keras/api/golden/v1/tensorflow.keras.saving.pbtxt new file mode 100644 index 000000000000..e1df1e64293c --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.saving.pbtxt @@ -0,0 +1,39 @@ +path: "tensorflow.keras.saving" +tf_module { + member { + name: "custom_object_scope" + mtype: "" + } + member_method { + name: "deserialize_keras_object" + argspec: "args=[\'config\', \'custom_objects\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\'], " + } + member_method { + name: "get_custom_objects" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_registered_name" + argspec: "args=[\'obj\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_registered_object" + argspec: "args=[\'name\', \'custom_objects\', \'module_objects\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "load_model" + argspec: "args=[\'filepath\', \'custom_objects\', \'compile\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\', \'True\'], " + } + member_method { + name: "register_keras_serializable" + argspec: "args=[\'package\', \'name\'], varargs=None, keywords=None, defaults=[\'Custom\', \'None\'], " + } + member_method { + name: "save_model" + argspec: "args=[\'model\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "serialize_keras_object" + argspec: "args=[\'obj\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.utils.-custom-object-scope.pbtxt b/keras/api/golden/v1/tensorflow.keras.utils.-custom-object-scope.pbtxt index 9e9370be68f8..3ccf719d8c8c 100644 --- a/keras/api/golden/v1/tensorflow.keras.utils.-custom-object-scope.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.utils.-custom-object-scope.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.utils.CustomObjectScope" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v1/tensorflow.keras.utils.-steps-per-execution-tuner.pbtxt b/keras/api/golden/v1/tensorflow.keras.utils.-steps-per-execution-tuner.pbtxt new file mode 100644 index 000000000000..1363d2190e1e --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.utils.-steps-per-execution-tuner.pbtxt @@ -0,0 +1,21 @@ +path: "tensorflow.keras.utils.StepsPerExecutionTuner" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "steps_per_execution" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'optimizer\', \'spe_variable\', \'interval\', \'change_spe_interval\', \'change_threshold\'], varargs=None, keywords=None, defaults=[\'5\', \'10\', \'0.1\'], " + } + member_method { + name: "start" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} 
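The new tensorflow.keras.saving golden above pins down the promoted public saving surface: load_model/save_model grow a safe_mode switch, and register_keras_serializable and the (de)serialization helpers are exposed alongside them. A short usage sketch, assuming this API; the Scale layer and package name are invented for illustration:

import tensorflow as tf

# Register a custom layer so it reloads without a custom_objects dict;
# package/name follow the register_keras_serializable argspec above.
@tf.keras.saving.register_keras_serializable(package="Demo")
class Scale(tf.keras.layers.Layer):
    def __init__(self, factor=2.0, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, x):
        return x * self.factor

    def get_config(self):
        return {**super().get_config(), "factor": self.factor}

model = tf.keras.Sequential([tf.keras.Input(shape=(4,)), Scale()])
tf.keras.saving.save_model(model, "demo.keras")
# safe_mode=True (the default in the argspec) refuses to deserialize
# arbitrary lambdas; registered classes load cleanly.
restored = tf.keras.saving.load_model("demo.keras", safe_mode=True)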
diff --git a/keras/api/golden/v1/tensorflow.keras.utils.custom_object_scope.pbtxt b/keras/api/golden/v1/tensorflow.keras.utils.custom_object_scope.pbtxt index 4fa8c7af04e4..08f84e0f825f 100644 --- a/keras/api/golden/v1/tensorflow.keras.utils.custom_object_scope.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.utils.custom_object_scope.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.utils.custom_object_scope" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v1/tensorflow.keras.utils.legacy.pbtxt b/keras/api/golden/v1/tensorflow.keras.utils.legacy.pbtxt new file mode 100644 index 000000000000..267629bf49c2 --- /dev/null +++ b/keras/api/golden/v1/tensorflow.keras.utils.legacy.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.keras.utils.legacy" +tf_module { + member_method { + name: "deserialize_keras_object" + argspec: "args=[\'identifier\', \'module_objects\', \'custom_objects\', \'printable_module_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'object\'], " + } + member_method { + name: "serialize_keras_object" + argspec: "args=[\'instance\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.utils.pbtxt b/keras/api/golden/v1/tensorflow.keras.utils.pbtxt index 675db2735114..09a7c4059fae 100644 --- a/keras/api/golden/v1/tensorflow.keras.utils.pbtxt +++ b/keras/api/golden/v1/tensorflow.keras.utils.pbtxt @@ -28,17 +28,25 @@ tf_module { name: "SequenceEnqueuer" mtype: "" } + member { + name: "StepsPerExecutionTuner" + mtype: "" + } member { name: "custom_object_scope" mtype: "" } + member { + name: "legacy" + mtype: "" + } member_method { name: "array_to_img" argspec: "args=[\'x\', \'data_format\', \'scale\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " } member_method { name: "deserialize_keras_object" - argspec: "args=[\'identifier\', \'module_objects\', \'custom_objects\', \'printable_module_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'object\'], " + argspec: "args=[\'config\', \'custom_objects\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\'], " } member_method { name: "disable_interactive_logging" @@ -86,7 +94,7 @@ tf_module { } member_method { name: "model_to_dot" - argspec: "args=[\'model\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', \'expand_nested\', \'dpi\', \'subgraph\', \'layer_range\', \'show_layer_activations\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'False\', \'None\', \'False\'], " + argspec: "args=[\'model\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', \'expand_nested\', \'dpi\', \'subgraph\', \'layer_range\', \'show_layer_activations\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'False\', \'None\', \'False\', \'False\'], " } member_method { name: "normalize" @@ -98,7 +106,7 @@ tf_module { } member_method { name: "plot_model" - argspec: "args=[\'model\', \'to_file\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', \'expand_nested\', \'dpi\', \'layer_range\', \'show_layer_activations\'], varargs=None, keywords=None, defaults=[\'model.png\', \'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'None\', \'False\'], " + argspec: "args=[\'model\', \'to_file\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', 
\'expand_nested\', \'dpi\', \'layer_range\', \'show_layer_activations\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'model.png\', \'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'None\', \'False\', \'False\'], " } member_method { name: "register_keras_serializable" @@ -110,14 +118,22 @@ tf_module { } member_method { name: "serialize_keras_object" - argspec: "args=[\'instance\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'obj\'], varargs=None, keywords=None, defaults=None" } member_method { name: "to_categorical" argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], " } + member_method { + name: "to_ordinal" + argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], " + } member_method { name: "track_tf1_style_variables" argspec: "args=[\'method\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "warmstart_embedding_matrix" + argspec: "args=[\'base_vocabulary\', \'new_vocabulary\', \'base_embeddings\', \'new_embeddings_initializer\'], varargs=None, keywords=None, defaults=[\'uniform\'], " + } } diff --git a/keras/api/golden/v1/tensorflow.keras.wrappers.pbtxt b/keras/api/golden/v1/tensorflow.keras.wrappers.pbtxt deleted file mode 100644 index 0b2fac9b7d99..000000000000 --- a/keras/api/golden/v1/tensorflow.keras.wrappers.pbtxt +++ /dev/null @@ -1,7 +0,0 @@ -path: "tensorflow.keras.wrappers" -tf_module { - member { - name: "scikit_learn" - mtype: "" - } -} diff --git a/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt b/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt deleted file mode 100644 index 180e05527f31..000000000000 --- a/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt +++ /dev/null @@ -1,42 +0,0 @@ -path: "tensorflow.keras.wrappers.scikit_learn.KerasClassifier" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'build_fn\'], varargs=None, keywords=sk_params, defaults=[\'None\'], " - } - member_method { - name: "check_params" - argspec: "args=[\'self\', \'params\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "filter_sk_params" - argspec: "args=[\'self\', \'fn\', \'override\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "fit" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "get_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } - member_method { - name: "predict" - argspec: "args=[\'self\', \'x\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "predict_proba" - argspec: "args=[\'self\', \'x\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "score" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "set_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } -} diff --git a/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt b/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt deleted file mode 100644 index 0dfc03fb05e5..000000000000 --- a/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt +++ 
/dev/null @@ -1,38 +0,0 @@ -path: "tensorflow.keras.wrappers.scikit_learn.KerasRegressor" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'build_fn\'], varargs=None, keywords=sk_params, defaults=[\'None\'], " - } - member_method { - name: "check_params" - argspec: "args=[\'self\', \'params\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "filter_sk_params" - argspec: "args=[\'self\', \'fn\', \'override\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "fit" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "get_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } - member_method { - name: "predict" - argspec: "args=[\'self\', \'x\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "score" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "set_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } -} diff --git a/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.pbtxt b/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.pbtxt deleted file mode 100644 index fbd4d13387a9..000000000000 --- a/keras/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.pbtxt +++ /dev/null @@ -1,11 +0,0 @@ -path: "tensorflow.keras.wrappers.scikit_learn" -tf_module { - member { - name: "KerasClassifier" - mtype: "" - } - member { - name: "KerasRegressor" - mtype: "" - } -} diff --git a/keras/api/golden/v2/tensorflow.keras.-model.pbtxt b/keras/api/golden/v2/tensorflow.keras.-model.pbtxt index 679bc3d70094..a867fb43ebd1 100644 --- a/keras/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -12,10 +12,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -52,6 +60,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -112,6 +124,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -176,13 +192,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', 
\'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -216,6 +240,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +260,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -252,6 +288,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -264,13 +304,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -310,7 +358,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -326,7 +378,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', 
\'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/keras/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 9c322a1e659a..fc9edeb88c5f 100644 --- a/keras/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -14,10 +14,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -54,6 +62,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -114,6 +126,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -182,13 +198,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -222,6 +246,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +266,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], 
varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -258,6 +294,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -270,13 +310,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -320,7 +368,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -336,7 +388,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.-keras-tensor.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.-keras-tensor.pbtxt new file mode 100644 index 000000000000..9b09b44a8bfb --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.-keras-tensor.pbtxt @@ -0,0 +1,61 @@ +path: "tensorflow.keras.__internal__.KerasTensor" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "dtype" + mtype: "" + } + member { + name: "is_tensor_like" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "node" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member { + name: "type_spec" + mtype: "" + } + member_method { + name: "__init__" + argspec: 
"args=[\'self\', \'type_spec\', \'inferred_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "experimental_ref" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_tensor" + argspec: "args=[\'cls\', \'tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_type_spec" + argspec: "args=[\'cls\', \'type_spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_shape" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "ref" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_shape" + argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.-ragged-keras-tensor.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.-ragged-keras-tensor.pbtxt new file mode 100644 index 000000000000..7c91676b2f7e --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.-ragged-keras-tensor.pbtxt @@ -0,0 +1,138 @@ +path: "tensorflow.keras.__internal__.RaggedKerasTensor" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dtype" + mtype: "" + } + member { + name: "flat_values" + mtype: "" + } + member { + name: "is_tensor_like" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "nested_row_splits" + mtype: "" + } + member { + name: "node" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member { + name: "ragged_rank" + mtype: "" + } + member { + name: "row_splits" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member { + name: "type_spec" + mtype: "" + } + member { + name: "values" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'type_spec\', \'inferred_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "bounding_shape" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "experimental_ref" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_tensor" + argspec: "args=[\'cls\', \'tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_type_spec" + argspec: "args=[\'cls\', \'type_spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_shape" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_dims" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "nested_row_lengths" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "nested_value_rowids" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "nrows" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "ref" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "row_lengths" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "row_limits" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + 
member_method { + name: "row_starts" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "set_shape" + argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "to_sparse" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "to_tensor" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "value_rowids" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "with_flat_values" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "with_row_splits_dtype" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "with_values" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.-sparse-keras-tensor.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.-sparse-keras-tensor.pbtxt new file mode 100644 index 000000000000..c25a8784dd48 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.-sparse-keras-tensor.pbtxt @@ -0,0 +1,78 @@ +path: "tensorflow.keras.__internal__.SparseKerasTensor" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "dense_shape" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "indices" + mtype: "" + } + member { + name: "is_tensor_like" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "node" + mtype: "" + } + member { + name: "op" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member { + name: "type_spec" + mtype: "" + } + member { + name: "values" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'type_spec\', \'inferred_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "experimental_ref" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_tensor" + argspec: "args=[\'cls\', \'tensor\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_type_spec" + argspec: "args=[\'cls\', \'type_spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_shape" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "ref" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_shape" + argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_values" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-image-augmentation-layer.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-dense-attention.pbtxt similarity index 80% rename from keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-image-augmentation-layer.pbtxt rename to keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-dense-attention.pbtxt index 8e7c54168a7c..bb4b16600324 100644 --- a/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-image-augmentation-layer.pbtxt +++ 
b/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-dense-attention.pbtxt @@ -1,21 +1,17 @@ -path: "tensorflow.keras.__internal__.layers.BaseImageAugmentationLayer" +path: "tensorflow.keras.__internal__.layers.BaseDenseAttention" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -134,7 +130,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'rate\', \'seed\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'None\'], " + argspec: "args=[\'self\', \'dropout\'], varargs=None, keywords=kwargs, defaults=[\'0.0\'], " } member_method { name: "add_loss" @@ -156,29 +152,17 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\', \'use_causal_mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], " } member_method { name: "compute_mask" @@ -204,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -232,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + 
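The load_own_variables/save_own_variables pair being added across these layer goldens (together with get_build_config/build_from_config) is the per-layer state hook used by the newer saving path. A hedged sketch of overriding the pair in a custom layer; by default, weights are keyed by string indices:

import tensorflow as tf

class Doubler(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.w = self.add_weight(name="w", shape=(), initializer="ones")

    def call(self, x):
        return x * self.w

    # Mirror the default contract: a dict-like store keyed by strings.
    def save_own_variables(self, store):
        store["0"] = self.w.numpy()

    def load_own_variables(self, store):
        self.w.assign(store["0"])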
member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt index 7f8976f0c0bf..68aa8fd65565 100644 --- a/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.layers.-base-random-layer.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -129,7 +129,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'seed\', \'force_generator\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " + argspec: "args=[\'self\', \'seed\', \'force_generator\', \'rng_type\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\', \'None\'], " } member_method { name: "add_loss" @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.layers.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.layers.pbtxt index 429049587d64..8f5b1b170689 100644 --- a/keras/api/golden/v2/tensorflow.keras.__internal__.layers.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.layers.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.__internal__.layers" tf_module { member { - name: "BaseImageAugmentationLayer" + name: "BaseDenseAttention" mtype: "" } member { diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.losses.-loss-function-wrapper.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.losses.-loss-function-wrapper.pbtxt new file mode 100644 index 000000000000..b59c57da8ce6 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.losses.-loss-function-wrapper.pbtxt @@ -0,0 +1,22 @@ +path: "tensorflow.keras.__internal__.losses.LossFunctionWrapper" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'fn\', \'reduction\', 
\'name\'], varargs=None, keywords=kwargs, defaults=[\'auto\', \'None\'], " + } + member_method { + name: "call" + argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.losses.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.losses.pbtxt index 02bc17e14dc5..d2b2abf80f42 100644 --- a/keras/api/golden/v2/tensorflow.keras.__internal__.losses.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.losses.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.keras.__internal__.losses" tf_module { + member { + name: "LossFunctionWrapper" + mtype: "" + } member_method { name: "compute_weighted_loss" argspec: "args=[\'losses\', \'sample_weight\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'sum_over_batch_size\', \'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.optimizers.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.optimizers.pbtxt new file mode 100644 index 000000000000..5afce7e73dd1 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.optimizers.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.keras.__internal__.optimizers" +tf_module { + member_method { + name: "convert_to_legacy_optimizer" + argspec: "args=[\'optimizer\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.pbtxt index eca0637f5fb8..aadf3076c120 100644 --- a/keras/api/golden/v2/tensorflow.keras.__internal__.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.pbtxt @@ -1,5 +1,17 @@ path: "tensorflow.keras.__internal__" tf_module { + member { + name: "KerasTensor" + mtype: "" + } + member { + name: "RaggedKerasTensor" + mtype: "" + } + member { + name: "SparseKerasTensor" + mtype: "" + } member { name: "backend" mtype: "" @@ -16,6 +28,10 @@ tf_module { name: "models" mtype: "" } + member { + name: "optimizers" + mtype: "" + } member { name: "utils" mtype: "" @@ -24,4 +40,8 @@ tf_module { name: "apply_name_scope_on_model_declaration" argspec: "args=[\'enable\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "enable_unsafe_deserialization" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } } diff --git a/keras/api/golden/v2/tensorflow.keras.__internal__.utils.pbtxt b/keras/api/golden/v2/tensorflow.keras.__internal__.utils.pbtxt index f604525fb8f0..ab38e0f70014 100644 --- a/keras/api/golden/v2/tensorflow.keras.__internal__.utils.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.__internal__.utils.pbtxt @@ -6,7 +6,7 @@ tf_module { } member_method { name: "layer_test" - argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + argspec: "args=[\'layer_cls\', \'kwargs\', \'input_shape\', \'input_dtype\', \'input_data\', \'expected_output\', \'expected_output_dtype\', \'expected_output_shape\', \'validate_training\', \'adapt_data\', \'custom_objects\', \'test_harness\', \'supports_masking\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "register_symbolic_tensor_type" diff --git 
a/keras/api/golden/v2/tensorflow.keras.activations.pbtxt b/keras/api/golden/v2/tensorflow.keras.activations.pbtxt index 7acce4f5f6fa..863800e05306 100644 --- a/keras/api/golden/v2/tensorflow.keras.activations.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.activations.pbtxt @@ -2,7 +2,7 @@ path: "tensorflow.keras.activations" tf_module { member_method { name: "deserialize" - argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'name\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "elu" @@ -28,6 +28,10 @@ tf_module { name: "linear" argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "mish" + argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "relu" argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0.0\'], " @@ -38,7 +42,7 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'activation\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'activation\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "sigmoid" diff --git a/keras/api/golden/v2/tensorflow.keras.backend.pbtxt b/keras/api/golden/v2/tensorflow.keras.backend.pbtxt index 0e1be9b5ad83..6e29da804c4d 100644 --- a/keras/api/golden/v2/tensorflow.keras.backend.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.backend.pbtxt @@ -62,11 +62,7 @@ tf_module { } member_method { name: "binary_focal_crossentropy" - argspec: "args=[\'target\', \'output\', \'gamma\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\'], " - } - member_method { - name: "binary_weighted_focal_crossentropy" - argspec: "args=[\'target\', \'output\', \'alpha\', \'gamma\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\'], " + argspec: "args=[\'target\', \'output\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\'], " } member_method { name: "cast" @@ -80,6 +76,10 @@ tf_module { name: "categorical_crossentropy" argspec: "args=[\'target\', \'output\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " } + member_method { + name: "categorical_focal_crossentropy" + argspec: "args=[\'target\', \'output\', \'alpha\', \'gamma\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'-1\'], " + } member_method { name: "clear_session" argspec: "args=[], varargs=None, keywords=None, defaults=None" @@ -498,7 +498,7 @@ tf_module { } member_method { name: "sparse_categorical_crossentropy" - argspec: "args=[\'target\', \'output\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " + argspec: "args=[\'target\', \'output\', \'from_logits\', \'axis\', \'ignore_class\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " } member_method { name: "spatial_2d_padding" diff --git a/keras/api/golden/v2/tensorflow.keras.callbacks.-backup-and-restore.pbtxt b/keras/api/golden/v2/tensorflow.keras.callbacks.-backup-and-restore.pbtxt index 55ee0aae41d2..ea38be4adcd1 100644 --- a/keras/api/golden/v2/tensorflow.keras.callbacks.-backup-and-restore.pbtxt +++ 
b/keras/api/golden/v2/tensorflow.keras.callbacks.-backup-and-restore.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'backup_dir\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'backup_dir\', \'save_freq\', \'delete_checkpoint\', \'save_before_preemption\'], varargs=None, keywords=None, defaults=[\'epoch\', \'True\', \'False\'], " } member_method { name: "on_batch_begin" diff --git a/keras/api/golden/v2/tensorflow.keras.callbacks.-callback-list.pbtxt b/keras/api/golden/v2/tensorflow.keras.callbacks.-callback-list.pbtxt index 3835ea4c944a..d3b5171b22c1 100644 --- a/keras/api/golden/v2/tensorflow.keras.callbacks.-callback-list.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.callbacks.-callback-list.pbtxt @@ -10,6 +10,10 @@ tf_class { name: "append" argspec: "args=[\'self\', \'callback\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "make_logs" + argspec: "args=[\'self\', \'model\', \'logs\', \'outputs\', \'mode\', \'prefix\'], varargs=None, keywords=None, defaults=[\'\'], " + } member_method { name: "on_batch_begin" argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt b/keras/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt index 75512300c8ab..2f6f3059b9b0 100644 --- a/keras/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\', \'restore_best_weights\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\', \'restore_best_weights\', \'start_from_epoch\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\', \'False\', \'0\'], " } member_method { name: "get_monitor_value" diff --git a/keras/api/golden/v2/tensorflow.keras.callbacks.-sidecar-evaluator-model-export.pbtxt b/keras/api/golden/v2/tensorflow.keras.callbacks.-sidecar-evaluator-model-export.pbtxt new file mode 100644 index 000000000000..0a33bbb4e389 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.callbacks.-sidecar-evaluator-model-export.pbtxt @@ -0,0 +1,83 @@ +path: "tensorflow.keras.callbacks.SidecarEvaluatorModelExport" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'export_filepath\', \'checkpoint_filepath\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "on_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_epoch_begin" + argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_epoch_end" + argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_batch_begin" + argspec: 
"args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "on_train_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "set_model" + argspec: "args=[\'self\', \'model\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_params" + argspec: "args=[\'self\', \'params\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.callbacks.pbtxt b/keras/api/golden/v2/tensorflow.keras.callbacks.pbtxt index 1ae71bfee1af..6b162ce1e347 100644 --- a/keras/api/golden/v2/tensorflow.keras.callbacks.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.callbacks.pbtxt @@ -52,6 +52,10 @@ tf_module { name: "RemoteMonitor" mtype: "" } + member { + name: "SidecarEvaluatorModelExport" + mtype: "" + } member { name: "TensorBoard" mtype: "" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.-constraint.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.-constraint.pbtxt index b13e4c558f14..ebce5a630d42 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.-constraint.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.-constraint.pbtxt @@ -5,6 +5,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.-max-norm.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.-max-norm.pbtxt index b96e2fdc7649..751357a36cbf 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.-max-norm.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.-max-norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'max_value\', \'axis\'], varargs=None, keywords=None, defaults=[\'2\', \'0\'], " } + member_method { + name: "from_config" + argspec: 
"args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.-min-max-norm.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.-min-max-norm.pbtxt index 85017a5ab9fa..f385c813ca5c 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.-min-max-norm.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.-min-max-norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'min_value\', \'max_value\', \'rate\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'1.0\', \'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.-non-neg.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.-non-neg.pbtxt index 278f33d15b82..ab3251209eff 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.-non-neg.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.-non-neg.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.-radial-constraint.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.-radial-constraint.pbtxt index 9fa92b2ccc62..54e6adf3e719 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.-radial-constraint.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.-radial-constraint.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.-unit-norm.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.-unit-norm.pbtxt index a8ebd4eb371b..b821bbb8acc0 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.-unit-norm.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.-unit-norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.max_norm.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.max_norm.pbtxt index bc201d9df1fb..42aeaf7e0f02 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.max_norm.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.max_norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'max_value\', \'axis\'], varargs=None, keywords=None, defaults=[\'2\', \'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } 
member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.min_max_norm.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.min_max_norm.pbtxt index e260340d0c25..47ab0d1105bf 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.min_max_norm.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.min_max_norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'min_value\', \'max_value\', \'rate\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'1.0\', \'0\'], " } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.non_neg.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.non_neg.pbtxt index 4f8c1d767db8..0a8c23153108 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.non_neg.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.non_neg.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.pbtxt index 29444ef3405f..be3658a12225 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.pbtxt @@ -46,7 +46,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -54,6 +54,6 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'constraint\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'constraint\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.radial_constraint.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.radial_constraint.pbtxt index 8dca693a318b..78d401b280ff 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.radial_constraint.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.radial_constraint.pbtxt @@ -6,6 +6,10 @@ tf_class { member_method { name: "__init__" } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.constraints.unit_norm.pbtxt b/keras/api/golden/v2/tensorflow.keras.constraints.unit_norm.pbtxt index 1aa9da9db057..137cb505e73c 100644 --- a/keras/api/golden/v2/tensorflow.keras.constraints.unit_norm.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.constraints.unit_norm.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "__init__" argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], " } + member_method { + name: "from_config" + argspec: 
"args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.datasets.reuters.pbtxt b/keras/api/golden/v2/tensorflow.keras.datasets.reuters.pbtxt index 2da4a13067f2..6f6446eb4296 100644 --- a/keras/api/golden/v2/tensorflow.keras.datasets.reuters.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.datasets.reuters.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.keras.datasets.reuters" tf_module { + member_method { + name: "get_label_names" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_word_index" argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=[\'reuters_word_index.json\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.-layout-map.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.-layout-map.pbtxt index bcc7983c5da7..15402cd02143 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.-layout-map.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.-layout-map.pbtxt @@ -34,6 +34,10 @@ tf_class { name: "popitem" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "scope" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "setdefault" argspec: "args=[\'self\', \'key\', \'default\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt index eda0ec11e3ed..1bde9e5882c5 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt @@ -1,13 +1,10 @@ path: "tensorflow.keras.dtensor.experimental.optimizers.Adadelta" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -21,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'gradients_clip_option\', \'ema_option\', \'name\', \'mesh\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'None\', \'None\', \'Adadelta\', \'None\'], " + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adadelta\'], " } member_method { name: "add_variable" @@ -55,7 +32,7 @@ tf_class { } member_method { name: "add_variable_from_reference" - argspec: "args=[\'self\', \'model_variable\', \'variable_name\', 
diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt index eda0ec11e3ed..1bde9e5882c5 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adadelta.pbtxt @@ -1,13 +1,10 @@ path: "tensorflow.keras.dtensor.experimental.optimizers.Adadelta" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -21,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'gradients_clip_option\', \'ema_option\', \'name\', \'mesh\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'None\', \'None\', \'Adadelta\', \'None\'], " + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adadelta\'], " } member_method { name: "add_variable" @@ -55,7 +32,7 @@ tf_class { } member_method { name: "add_variable_from_reference" - argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "aggregate_gradients" @@ -63,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -73,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adagrad.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adagrad.pbtxt index d4cf31e80321..792f67240803 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adagrad.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adagrad.pbtxt @@ -1,13 +1,10 @@ path: "tensorflow.keras.dtensor.experimental.optimizers.Adagrad" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -21,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', 
\'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'gradients_clip_option\', \'ema_option\', \'name\', \'mesh\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'None\', \'None\', \'Adagrad\', \'None\'], " + argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adagrad\'], " } member_method { name: "add_variable" @@ -55,7 +32,7 @@ tf_class { } member_method { name: "add_variable_from_reference" - argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "aggregate_gradients" @@ -63,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -73,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adam-w.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adam-w.pbtxt new file mode 100644 index 000000000000..2e5c929d6d21 --- /dev/null +++ 
b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adam-w.pbtxt @@ -0,0 +1,89 @@ +path: "tensorflow.keras.dtensor.experimental.optimizers.AdamW" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "iterations" + mtype: "" + } + member { + name: "learning_rate" + mtype: "" + } + member { + name: "lr" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'weight_decay\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.004\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'AdamW\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " + } + member_method { + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adam.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adam.pbtxt index e2e9b31c73e8..93fe2d44bd9f 100644 --- 
a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adam.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-adam.pbtxt @@ -1,13 +1,10 @@ path: "tensorflow.keras.dtensor.experimental.optimizers.Adam" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -21,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'gradients_clip_option\', \'ema_option\', \'name\', \'mesh\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'Adam\', \'None\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adam\'], " } member_method { name: "add_variable" @@ -55,7 +32,7 @@ tf_class { } member_method { name: "add_variable_from_reference" - argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "aggregate_gradients" @@ -63,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -73,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, 
keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-r-m-sprop.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-r-m-sprop.pbtxt index ad3117262b0e..16efcd4fc38f 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-r-m-sprop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-r-m-sprop.pbtxt @@ -1,13 +1,10 @@ path: "tensorflow.keras.dtensor.experimental.optimizers.RMSprop" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -21,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'gradients_clip_option\', \'ema_option\', \'jit_compile\', \'name\', \'mesh\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'None\', \'None\', \'False\', \'RMSprop\', \'None\'], " + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'100\', \'True\', \'RMSprop\'], " } member_method { name: "add_variable" @@ -55,7 +32,7 @@ tf_class { } member_method { name: "add_variable_from_reference" - argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "aggregate_gradients" @@ -63,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -73,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], 
varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-s-g-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-s-g-d.pbtxt index ad465d8a168f..e994213fe416 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-s-g-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.-s-g-d.pbtxt @@ -1,13 +1,10 @@ path: "tensorflow.keras.dtensor.experimental.optimizers.SGD" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -21,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'amsgrad\', \'gradients_clip_option\', \'ema_option\', \'jit_compile\', \'name\', \'mesh\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'False\', \'SGD\', \'None\'], " + argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'SGD\'], " } member_method { name: "add_variable" @@ -55,7 +32,7 @@ tf_class { } 
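The Adadelta, Adagrad, AdamW, Adam, RMSprop and SGD goldens above and below all encode one migration: the dtensor-specific gradients_clip_option/ema_option/mesh constructor arguments are replaced by the shared weight_decay, clipnorm/clipvalue/global_clipnorm, use_ema/ema_momentum/ema_overwrite_frequency and jit_compile kwargs; apply_gradients gains name and skip_gradients_aggregation; and exclude_from_weight_decay, save_own_variables and load_own_variables are new. A sketch of the resulting surface (variable names and values are illustrative, and this assumes the same API is exported under tf.keras.optimizers):

    import tensorflow as tf

    opt = tf.keras.optimizers.AdamW(
        learning_rate=1e-3,
        weight_decay=0.004,   # default pinned in the new AdamW golden
        global_clipnorm=1.0,  # shared clipping kwarg
        use_ema=True,
        ema_momentum=0.99,
    )

    w = tf.Variable([1.0, 2.0], name="kernel")
    b = tf.Variable([0.5], name="bias")
    # Bias-like variables are typically excluded from decay; call this
    # before the optimizer builds its slot variables.
    opt.exclude_from_weight_decay(var_list=[b])

    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(w * w) + tf.reduce_sum(b * b)
    grads = tape.gradient(loss, [w, b])
    opt.apply_gradients(zip(grads, [w, b]), skip_gradients_aggregation=False)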
member_method { name: "add_variable_from_reference" - argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "aggregate_gradients" @@ -63,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -73,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.pbtxt index aac7440b4a86..18bd1acf13e1 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.optimizers.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "Adam" mtype: "" } + member { + name: "AdamW" + mtype: "" + } member { name: "RMSprop" mtype: "" diff --git a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.pbtxt b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.pbtxt index 20f3bd29b566..dd963f6657dc 100644 --- a/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.dtensor.experimental.pbtxt @@ -8,8 +8,4 @@ tf_module { name: "optimizers" mtype: "" } - member_method { - name: "layout_map_scope" - argspec: "args=[], varargs=args, keywords=kwds, defaults=None" - } } diff --git 
a/keras/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt b/keras/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt index cd4acbef5375..81bdedcb4e2e 100644 --- a/keras/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], " + argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\', \'warmup_target\', \'warmup_steps\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'None\', \'0\'], " } member_method { name: "from_config"
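The CosineDecay hunk above folds linear warmup into the schedule: warmup_target and warmup_steps are appended to __init__, with warmup_steps defaulting to 0 so the existing no-warmup behavior is unchanged. A sketch of a warmup-then-decay schedule under that argspec (step counts and rates are illustrative):

    import tensorflow as tf

    # Ramp linearly from initial_learning_rate to warmup_target over the
    # first 1_000 steps, then decay along a cosine curve toward
    # alpha * warmup_target across the remaining decay_steps.
    schedule = tf.keras.experimental.CosineDecay(
        initial_learning_rate=0.0,
        decay_steps=10_000,
        alpha=0.0,
        warmup_target=1e-3,
        warmup_steps=1_000,
    )
    opt = tf.keras.optimizers.SGD(learning_rate=schedule)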
diff --git a/keras/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt b/keras/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index 4324f56e2fc7..8301a65833d6 100644 --- a/keras/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -13,10 +13,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -53,6 +61,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -113,6 +125,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -177,13 +193,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -217,6 +241,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +261,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -253,6 +289,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -265,13 +305,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -311,7 +359,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -327,7 +379,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt b/keras/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt index 5f0bfddb6bb1..e87a1ec3ddc6 100644 --- a/keras/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt @@ -4,8 +4,8 @@ tf_class 
{ is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'features\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.experimental.-sidecar-evaluator.pbtxt b/keras/api/golden/v2/tensorflow.keras.experimental.-sidecar-evaluator.pbtxt index 605736dd4938..9ca14da2e737 100644 --- a/keras/api/golden/v2/tensorflow.keras.experimental.-sidecar-evaluator.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.experimental.-sidecar-evaluator.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.experimental.SidecarEvaluator" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/keras/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt index ed849f0c4597..44e02e9b4cad 100644 --- a/keras/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -13,10 +13,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -53,6 +61,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -113,6 +125,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -177,13 +193,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, 
keywords=None, defaults=[\'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -217,6 +241,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +261,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -253,6 +289,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -265,13 +305,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -311,7 +359,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', 
\'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -327,7 +379,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v2/tensorflow.keras.export.-export-archive.pbtxt b/keras/api/golden/v2/tensorflow.keras.export.-export-archive.pbtxt new file mode 100644 index 000000000000..4b245b4b999e --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.export.-export-archive.pbtxt @@ -0,0 +1,27 @@ +path: "tensorflow.keras.export.ExportArchive" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_endpoint" + argspec: "args=[\'self\', \'name\', \'fn\', \'input_signature\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable_collection" + argspec: "args=[\'self\', \'name\', \'variables\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "track" + argspec: "args=[\'self\', \'resource\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "write_out" + argspec: "args=[\'self\', \'filepath\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.export.pbtxt b/keras/api/golden/v2/tensorflow.keras.export.pbtxt new file mode 100644 index 000000000000..ee81034d6104 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.export.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.keras.export" +tf_module { + member { + name: "ExportArchive" + mtype: "" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt index cd56d7c7027b..026836fe4606 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.Constant" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt index 7a4f2f695b19..570cb6015a70 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.GlorotNormal" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git 
diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt index cd56d7c7027b..026836fe4606 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.Constant" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt index 7a4f2f695b19..570cb6015a70 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.GlorotNormal" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt index 39e8dceebd21..4f6b5719e75c 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.GlorotUniform" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-he-normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-he-normal.pbtxt index e2392a1de059..af6f28ad7bd9 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-he-normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-he-normal.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.HeNormal" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-he-uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-he-uniform.pbtxt index a1d0b78df694..a3ae35b25e82 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-he-uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-he-uniform.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.HeUniform" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt index bdf11c0d346b..11d9180d0e45 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.Identity" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt index bbbf17dcface..848e5d352657 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.initializers.Initializer" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-normal.pbtxt index 4dc8579c6726..1a3b20240c36 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-normal.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.LecunNormal" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-uniform.pbtxt index 1cf25acc880c..cb09e8963051 100644 --- 
a/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-lecun-uniform.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.LecunUniform" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt index 949254b493fe..78065e847a27 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.Ones" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt index 7cf7a32a86c8..1623468564f8 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.Orthogonal" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt index 8301dbbf2ecc..d56e2e30d60f 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.RandomNormal" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt index 809b742218b2..a80f1ea48f5e 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.RandomUniform" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt index 9ea077f5e2b2..38c1b18ae58d 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.TruncatedNormal" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt index bf6aecad7088..52b639a1ac21 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.VarianceScaling" tf_class { - 
is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt index 40b430b1a17e..263040949a2d 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.Zeros" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt index e560d7e5a529..fedf0b9a178e 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.constant" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt index a2aaabf88dd4..35bbb24fa5d4 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.glorot_normal" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt index 841e2648282c..76eb02bbf5bd 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.glorot_uniform" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.he_normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.he_normal.pbtxt index cc9a8717cdc2..59ee38972d47 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.he_normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.he_normal.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.he_normal" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.he_uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.he_uniform.pbtxt index e3228e20d552..f1b7ce285b21 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.he_uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.he_uniform.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.he_uniform" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt 
b/keras/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt index abf9a4d3c025..6b4b4cee8083 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.identity" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.lecun_normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.lecun_normal.pbtxt index df5b58e28453..e6802630101b 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.lecun_normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.lecun_normal.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.lecun_normal" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.lecun_uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.lecun_uniform.pbtxt index 741054185c4f..1d8f833fcfcd 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.lecun_uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.lecun_uniform.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.initializers.lecun_uniform" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt index 73fb315ecc4f..4b6fccb960ff 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.ones" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt index 94025290bc98..5e9e3cad98a1 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.orthogonal" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.pbtxt index f39b701806a2..7c3b8f1f8d4f 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.pbtxt @@ -126,7 +126,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -134,6 +134,6 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'initializer\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'initializer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git 
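Review note on the `tensorflow.keras.initializers.pbtxt` hunk just above: `serialize` and `deserialize` each gain a `use_legacy_format` keyword with default `False`, which suggests the newer serialization format becomes the default unless callers opt back in. A minimal round-trip sketch; the flag name and position come from this diff, while the exact semantics (legacy JSON-style config vs. the newer format) are an assumption:

```python
import tensorflow as tf

init = tf.keras.initializers.GlorotUniform(seed=7)

# Serialize/deserialize with the new keyword. use_legacy_format=True is
# assumed to reproduce the pre-change config layout; False (the default)
# selects the newer format.
cfg = tf.keras.initializers.serialize(init, use_legacy_format=True)
restored = tf.keras.initializers.deserialize(cfg, use_legacy_format=True)

assert isinstance(restored, tf.keras.initializers.GlorotUniform)
```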
a/keras/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt index d445f96f8c99..15ab42e95575 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.random_normal" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt index b02d8cd54bd0..3e54ce21b24e 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.random_uniform" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt index a9d0650a5742..65d698377d32 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.truncated_normal" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.variance_scaling.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.variance_scaling.pbtxt index eaa0ed75dc95..f598610395f2 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.variance_scaling.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.variance_scaling.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.variance_scaling" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt b/keras/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt index 88770d1be604..2c4213342440 100644 --- a/keras/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.initializers.zeros" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt index bb63c66b2c51..d7238394f940 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -163,6 +163,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: 
"call" argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None" @@ -191,6 +195,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -227,6 +235,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt index 22ac65768a1c..d1ee21e3e902 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt index 4b2adcb785c0..8c47a61250e0 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, 
defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-add.pbtxt index d6fc58c323b4..5127ff3dfaf2 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-add.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-add.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt index a182400aba45..8ed84a4a760b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,9 +157,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" 
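Review note: every layer file in this sync gains the `get_build_config` / `build_from_config` pair, visible in the `AbstractRNNCell`, `Activation`, and `ActivityRegularization` hunks above. A sketch of the intended flow, assuming `get_build_config` returns a small dict describing how the layer was built (for example its input shape) that `build_from_config` can replay, so variables can be recreated without a dummy forward pass:

```python
import tensorflow as tf

layer = tf.keras.layers.Dense(4)
layer.build((None, 8))                 # create variables for a known shape

build_cfg = layer.get_build_config()   # assumed shape: {"input_shape": (None, 8)}

# Reconstruct an identically configured layer and rebuild its variables
# directly from the recorded build config, without calling it on data.
clone = tf.keras.layers.Dense.from_config(layer.get_config())
clone.build_from_config(build_cfg)
assert clone.built
```

This pairs with the `save_own_variables` / `load_own_variables` hooks that the same hunks introduce; an example of those follows further down.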
+ } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\', \'use_causal_mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], " } member_method { name: "compute_mask" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt index f6ae42888aa8..b65b0c1c182c 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt index dfba79459a37..c8c3027e9f66 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: 
"" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,9 +157,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'return_attention_scores\', \'use_causal_mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], " } member_method { name: "compute_mask" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt index 5a2274e65da3..d1d687125d83 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt 
b/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt index 0758cd27ac34..c3c3f70274a7 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt index bcf4b5d80bf0..cdd976ab992b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
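Review note on the `Attention` and `AdditiveAttention` hunks a little above: `call` gains a `use_causal_mask` keyword (default `False`), moving causal masking to call time. A short sketch; the shapes are illustrative only:

```python
import tensorflow as tf

attn = tf.keras.layers.Attention()
q = tf.random.normal((2, 6, 8))   # (batch, target_len, dim) query
v = tf.random.normal((2, 6, 8))   # (batch, source_len, dim) value

# New call-time keyword from this diff: each target position can only
# attend to source positions at or before it.
out = attn([q, v], use_causal_mask=True)
print(out.shape)                  # (2, 6, 8)
```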
a/keras/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-average.pbtxt index 85dabd3a64c1..5552bd555473 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-average.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-average.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt index 43b071dc39ac..0fb5acc44d0a 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
a/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt index 003b77ca6d25..b46848ddfc0d 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt index 3eec8aea498d..c5f4a9b9b827 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
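Review note: the `save_own_variables` / `load_own_variables` pair recurs in every layer hunk here (pooling, merge, attention, and so on). These appear to be per-layer hooks that write a layer's variables into, and read them back from, a dict-like store keyed by strings. A sketch with a plain `dict`, which is enough for this simple case; the string-index key scheme is an assumption:

```python
import tensorflow as tf

src = tf.keras.layers.Dense(2)
src.build((None, 3))

store = {}
src.save_own_variables(store)   # assumed to write e.g. store["0"], store["1"]

dst = tf.keras.layers.Dense(2)
dst.build((None, 3))            # must be built to the same shapes first
dst.load_own_variables(store)   # copies kernel and bias back in

tf.debugging.assert_near(src.kernel, dst.kernel)
```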
a/keras/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt index 53892cff4a58..0429225779da 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -130,7 +130,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'synchronized\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'False\'], " } member_method { name: "add_loss" @@ -156,9 +156,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compute_mask" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt index 19f50844e54d..4df4de9226d7 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -160,6 
+160,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\', \'initial_state\', \'constants\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,9 +228,17 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-category-encoding.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-category-encoding.pbtxt index 0df48cefb4b3..dfa0cbabae9c 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-category-encoding.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-category-encoding.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'count_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-center-crop.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-center-crop.pbtxt index c52a54221059..c4a5aa0e3c9a 100644 --- 
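Review note on the two behavioural hunks above: `BatchNormalization.__init__` gains a `synchronized` flag (default `False`) and its `call` now accepts a `mask`; `Bidirectional.reset_states` takes an optional `states` argument, matching the wrapped RNN's signature, and calling it with no argument presumably keeps the old zero-reset behaviour. A sketch of the new constructor flag, assuming `synchronized=True` aggregates batch statistics across replicas and therefore only changes anything under a `tf.distribute` strategy:

```python
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # synchronized=True: per-step mean/variance are assumed to be reduced
    # across replicas instead of computed per replica, along the lines of
    # the old SyncBatchNormalization layer.
    bn = tf.keras.layers.BatchNormalization(synchronized=True)
    model = tf.keras.Sequential([tf.keras.layers.Dense(8), bn])
```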
a/keras/api/golden/v2/tensorflow.keras.layers.-center-crop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-center-crop.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt index 60920b75bbd6..229006d485a4 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt index c47f2afd7e18..13da3b785c9f 100644 --- 
a/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m1-d.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -246,6 +246,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -274,6 +278,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -310,10 +318,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index 341952bb31f3..341d73a2cc91 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -246,6 +246,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -274,6 +278,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -310,10 +318,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + 
name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt index 2fb22764b37b..e6257107a1d1 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m3-d.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -246,6 +246,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -274,6 +278,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -310,10 +318,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d-transpose.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d-transpose.pbtxt index 2ee4dbc50c27..5b3beb8b16d3 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d-transpose.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: 
"get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt index af41da6af123..5dff50a6f509 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt index 1989036fe4c0..67f03d1ce309 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: 
"get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt index ae13a9283a5f..7413b8674afa 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt index 64875c946786..c66d6ffb327b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: 
"get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt index 7ab3a6d14952..5c0774f967b4 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt index ba7e168af377..7484ce7ebb52 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 
@@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt index 497bfe47f8b3..418e5d2b6bde 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt index 54a19a815066..dc4369ec905b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, 
keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt index a277662f5333..47258f5833e4 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt index 5f7efd7d6859..8219381a59ec 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: 
"get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt index 9dc46686425b..b334463bb54e 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -220,6 +228,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt index a049e4297da2..1d516ece0c4f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: 
"get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt index c7b804272d5e..569ff8d26659 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt index 95d47a6b9c23..0d1f2865f73d 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" 
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt index ec3fb1e2c1ff..cb71ae4d69c9 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt index 5adb1b1ebce6..b29161038bc4 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, 
defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt index bdf88e8ca557..5d3179479b72 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt index 531c33aaa3a5..42f987270aaf 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-discretization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-discretization.pbtxt index 7a127fa7b94c..5563d613800d 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-discretization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-discretization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt index be8dd47922f4..a43e3ea8e126 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, 
defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt index 7b5db859f05f..0c504b38714b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt index bc6cae7d82bf..338f8569be21 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 
@@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-einsum-dense.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-einsum-dense.pbtxt index e29b94e2fe12..0d878e1b6c76 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-einsum-dense.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-einsum-dense.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt index db9812c187b4..d0acb29f450e 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -129,7 +129,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'input_dim\', \'output_dim\', \'embeddings_initializer\', \'embeddings_regularizer\', \'activity_regularizer\', \'embeddings_constraint\', \'mask_zero\', \'input_length\'], varargs=None, keywords=kwargs, defaults=[\'uniform\', \'None\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'input_dim\', 
\'output_dim\', \'embeddings_initializer\', \'embeddings_regularizer\', \'activity_regularizer\', \'embeddings_constraint\', \'mask_zero\', \'input_length\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'uniform\', \'None\', \'None\', \'None\', \'False\', \'None\', \'False\'], " } member_method { name: "add_loss" @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt index 496304ff4865..26ff207938f5 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt index fc0c048df50a..0ecc1109cfac 100644 --- 
a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -229,6 +237,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -237,6 +249,10 @@ tf_class { name: "reset_recurrent_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt index cd3eb6a40001..cabd8b355be3 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -230,6 +230,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -258,6 +262,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -302,6 +310,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ 
diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index 496304ff4865..26ff207938f5 100644
--- a/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/keras/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -3,8 +3,8 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
-  is_instance: ""
-  is_instance: ""
+  is_instance: ""
+  is_instance: ""
   is_instance: ""
   is_instance: ""
   member {
@@ -155,6 +155,10 @@ tf_class {
     name: "build"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "build_from_config"
+    argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -183,6 +187,10 @@ tf_class {
     name: "from_config"
     argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_build_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -215,6 +223,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "load_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index fc0c048df50a..0ecc1109cfac 100644
--- a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -5,8 +5,8 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
-  is_instance: ""
-  is_instance: ""
+  is_instance: ""
+  is_instance: ""
   is_instance: ""
   is_instance: ""
   member {
@@ -157,6 +157,10 @@ tf_class {
     name: "build"
     argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "build_from_config"
+    argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -185,6 +189,10 @@ tf_class {
     name: "from_config"
     argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_build_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -229,6 +237,10 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "load_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_dropout_mask"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -237,6 +249,10 @@ tf_class {
     name: "reset_recurrent_dropout_mask"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "save_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index cd3eb6a40001..cabd8b355be3 100644
--- a/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/keras/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -6,8 +6,8 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
-  is_instance: ""
-  is_instance: ""
+  is_instance: ""
+  is_instance: ""
   is_instance: ""
   is_instance: ""
   member {
@@ -230,6 +230,10 @@ tf_class {
     name: "build"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "build_from_config"
+    argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -258,6 +262,10 @@ tf_class {
     name: "from_config"
     argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_build_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -302,6 +310,10 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "load_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_dropout_mask"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -314,6 +326,10 @@ tf_class {
     name: "reset_states"
     argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "save_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index f57338d6e9b0..cfafd9e73d29 100644
--- a/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
-  is_instance: ""
-  is_instance: ""
+  is_instance: ""
+  is_instance: ""
   is_instance: ""
   is_instance: ""
   member {
@@ -156,6 +156,10 @@ tf_class {
     name: "build"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "build_from_config"
+    argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -184,6 +188,10 @@ tf_class {
     name: "from_config"
     argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_build_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -216,6 +224,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "load_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index f9ffe97e40e2..03c265aeb58b 100644
--- a/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/keras/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
-  is_instance: ""
-  is_instance: ""
+  is_instance: ""
+  is_instance: ""
   is_instance: ""
   is_instance: ""
   member {
@@ -156,6 +156,10 @@ tf_class {
     name: "build"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "build_from_config"
+    argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -184,6 +188,10 @@ tf_class {
     name: "from_config"
     argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_build_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -216,6 +224,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "load_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index bb0ca41b58cb..aaffbb42402c 100644
--- a/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
-  is_instance: ""
-  is_instance: ""
+  is_instance: ""
+  is_instance: ""
   is_instance: ""
   is_instance: ""
   member {
@@ -156,6 +156,10 @@ tf_class {
     name: "build"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "build_from_config"
+    argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -184,6 +188,10 @@ tf_class {
     name: "from_config"
     argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
  }
+  member_method {
+    name: "get_build_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -216,6 +224,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "load_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 78c5b4570884..5a5d64006850 100644
--- a/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -4,8 +4,8 @@ tf_class {
   is_instance: ""
   is_instance: ""
   is_instance: ""
-  is_instance: ""
-  is_instance: ""
+  is_instance: ""
+  is_instance: ""
   is_instance: ""
   is_instance: ""
   member {
@@ -156,6 +156,10 @@ tf_class {
     name: "build"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "build_from_config"
+    argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -184,6 +188,10 @@ tf_class {
     name: "from_config"
     argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_build_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -216,6 +224,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "load_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save_own_variables"
+    argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt index b3c9acb03564..93ccb22cc8ac 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt index aed9b8ebb0f7..f8a2802d8e5f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt index eceeb2398af5..0c9d82c99469 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt index 0770d689735b..6aa97dfdc59e 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, 
defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt index 4b61d5b49001..80177870bba2 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt index 99304d23491f..8b9a4c6e7c68 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: 
"args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt index b4b2e891654f..8f4bf30b4514 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt index 6f8359590304..b165d98428f1 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, 
defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-group-normalization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-group-normalization.pbtxt new file mode 100644 index 000000000000..fced5da8192b --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.layers.-group-normalization.pbtxt @@ -0,0 +1,242 @@ +path: "tensorflow.keras.layers.GroupNormalization" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'groups\', \'axis\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\'], varargs=None, keywords=kwargs, defaults=[\'32\', \'-1\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, 
defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: 
"args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-hashed-crossing.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-hashed-crossing.pbtxt similarity index 89% rename from keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-hashed-crossing.pbtxt rename to keras/api/golden/v2/tensorflow.keras.layers.-hashed-crossing.pbtxt index 071f3088f661..eab9f207e7bb 100644 --- a/keras/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-hashed-crossing.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-hashed-crossing.pbtxt @@ -1,10 +1,10 @@ -path: "tensorflow.keras.layers.experimental.preprocessing.HashedCrossing" +path: "tensorflow.keras.layers.HashedCrossing" tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-hashing.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-hashing.pbtxt index 866f602987d8..ef1b9e56c2b2 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-hashing.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-hashing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-identity.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-identity.pbtxt new file mode 100644 index 000000000000..3c3e39996588 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.layers.-identity.pbtxt @@ -0,0 +1,242 @@ +path: "tensorflow.keras.layers.Identity" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + } + 
member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt index 796d62350d8f..7564a7f8bc7c 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + 
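The other brand-new golden file above is tensorflow.keras.layers.Identity, a stateless pass-through layer (its __init__ argspec carries only the generic trainable/name/dtype/dynamic arguments). One plausible use is as a structural placeholder, e.g. swapping a real layer in or out by configuration; use_dropout below is a hypothetical flag for illustration:

    import tensorflow as tf

    use_dropout = False   # hypothetical config flag
    maybe_dropout = (tf.keras.layers.Dropout(0.5) if use_dropout
                     else tf.keras.layers.Identity())

    x = tf.random.normal((2, 3))
    y = maybe_dropout(x)
    print(bool(tf.reduce_all(y == x)))   # True: inputs pass through unchanged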
member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-integer-lookup.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-integer-lookup.pbtxt index 68bdae207b82..60e70390c051 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-integer-lookup.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-integer-lookup.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -165,6 +165,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -197,6 +201,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +241,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -241,6 +257,14 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_vocabulary" argspec: "args=[\'self\', \'vocabulary\', \'idf_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt 
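IntegerLookup (above) additionally gains load_assets/save_assets, directory-based hooks for vocabularies that were learned via adapt rather than passed to the constructor. A rough sketch, assuming the hooks round-trip the adapted vocabulary through a text file inside the given directory; the exact file layout is an implementation detail:

    import tempfile
    import tensorflow as tf

    lookup = tf.keras.layers.IntegerLookup()
    lookup.adapt([12, 36, 1138, 42, 42])

    asset_dir = tempfile.mkdtemp()
    lookup.save_assets(asset_dir)        # spills the learned vocabulary to disk

    restored = tf.keras.layers.IntegerLookup()
    restored.load_assets(asset_dir)      # rebuilds the lookup table from the file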
b/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt index 92842b09bb2e..d038c1493fc7 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -229,6 +237,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -237,6 +249,10 @@ tf_class { name: "reset_recurrent_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt index 5b3dbd75a9d2..893a35071d8e 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -230,6 +230,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -258,6 +262,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -302,6 +310,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" 
+ } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -314,6 +326,10 @@ tf_class { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt index a20e5aaa6404..bb97d088dad2 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt index 40f56df8297f..1a81ce6f16e0 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" 
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt index a4b82d09fc3c..b50481b62f7b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt @@ -2,8 +2,8 @@ path: "tensorflow.keras.layers.Layer" tf_class { is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -154,6 +154,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -182,6 +186,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -214,6 +222,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt index 6999a0d8ec4c..96cc14f91e00 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt index 3b1a787ccda2..f8b6b11e281f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt index b078db2d0529..fb34dfb1c8e0 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, 
defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt index 5021731d2885..cb3ac42a4afa 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt index 8dc902d78f47..0d9dc7499d58 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method 
{ name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt index 0d0d4841e616..e1092bf07672 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt index 514ca738be10..4696c58634a4 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: 
"get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt index e6b925656d73..a021d15e3615 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt index 456185fa892b..8bea460ac28f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method 
{ name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt index a3267fed10f6..14a7d00de1cd 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt index cdbe440dedee..cc8218f7a9db 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: 
"get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt index 5285f5c3220e..709c847a6953 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-multi-head-attention.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-multi-head-attention.pbtxt index e8cb5e7f8a68..4b8080a1b78b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-multi-head-attention.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-multi-head-attention.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,9 +155,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'query\', \'value\', \'key\', \'attention_mask\', \'return_attention_scores\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'query\', \'value\', \'key\', \'attention_mask\', \'return_attention_scores\', \'training\', \'use_causal_mask\'], 
varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\'], " } member_method { name: "compute_mask" @@ -165,7 +169,7 @@ tf_class { } member_method { name: "compute_output_shape" - argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'query_shape\', \'value_shape\', \'key_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_signature" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt index d0c3cbb0d595..3ef05dd0015f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-normalization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-normalization.pbtxt index f43dcd2f9b27..baa8fba13bdd 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-normalization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-normalization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - 
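Beyond the common hooks, the MultiHeadAttention hunks above change two signatures: call() grows a use_causal_mask flag (default False), and compute_output_shape now takes separate query/value/key shapes instead of a single input_shape. A sketch of both, with arbitrary sizes:

    import tensorflow as tf

    mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=8)
    x = tf.random.normal((2, 5, 16))     # (batch, seq, features)

    # Each position may now attend only to itself and earlier positions,
    # without hand-building a lower-triangular attention_mask.
    y = mha(query=x, value=x, use_causal_mask=True)

    # New signature: query/value(/key) shapes are passed separately.
    print(mha.compute_output_shape((2, 5, 16), (2, 5, 16)))   # (2, 5, 16)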
is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt index 21589d6bb696..899af13f3363 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt index 22c083ff6d12..e08c6381543c 100644 --- 
a/keras/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt index 703f18bbe89f..4dc7b8c60319 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\', \'constants\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -223,10 +231,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, 
keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-brightness.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-brightness.pbtxt index 817053c52aef..d246250fbd2a 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-brightness.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-brightness.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomBrightness" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-contrast.pbtxt 
b/keras/api/golden/v2/tensorflow.keras.layers.-random-contrast.pbtxt index 608d7216123c..85454d842005 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-contrast.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-contrast.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomContrast" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-crop.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-crop.pbtxt index b196d62db2af..23f80ad15a04 100644 --- 
a/keras/api/golden/v2/tensorflow.keras.layers.-random-crop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-crop.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomCrop" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-flip.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-flip.pbtxt index b03109243455..0807d1d10d8d 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-flip.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-flip.pbtxt @@ -1,22 +1,17 @@ path: 
"tensorflow.keras.layers.RandomFlip" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-height.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-height.pbtxt index 2cfb51b0eb9d..9ce1de081c0f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-height.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-height.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomHeight" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - 
is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-rotation.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-rotation.pbtxt index 6335724e4784..df4e253ee924 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-rotation.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-rotation.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomRotation" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - 
name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-translation.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-translation.pbtxt index 802d6fefb05c..97cbab083bbb 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-translation.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-translation.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomTranslation" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: 
"args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-width.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-width.pbtxt index e5cb35110730..2f566e0cf939 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-width.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-width.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomWidth" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', 
\'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-random-zoom.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-random-zoom.pbtxt index 3a7099acf4b2..9997add64fd2 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-random-zoom.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-random-zoom.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.RandomZoom" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', 
\'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt index 1687d54efa2a..831131154f98 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 
+223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt index fd5601eddeb2..a401a54ae021 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-rescaling.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-rescaling.pbtxt index 238b54fb3e7e..2b52e5fa301f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-rescaling.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-rescaling.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ 
tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt index 55b178a767a7..8af2743e9061 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-resizing.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-resizing.pbtxt index 3bf862774281..f04ecffd3a19 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-resizing.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-resizing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt index 730d65cdc6e7..6922c5910055 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt index bfa77c16d89c..b4d943239992 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class 
{ name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt index f91360016768..d21d6693bcc2 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt index 94962ea83281..312c27f69b33 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -189,6 +193,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -221,6 +229,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt index d0fae29f2f6c..20da793c2a37 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -229,6 +237,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -237,6 +249,10 @@ tf_class { name: "reset_recurrent_dropout_mask" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt index b5f215070dc0..60a8f5172402 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -216,6 +216,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', 
\'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " @@ -244,6 +248,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -280,10 +288,18 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "reset_states" argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt index b9be91a03f91..e8e05a00ece5 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt index 606b7bc5f895..0f926be02b9b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + 
member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt index db4d2c885fc5..1bb81438fca3 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt index 1137eac88299..f31ec33f7cfd 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: 
"args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v1/tensorflow.keras.__internal__.layers.-base-image-augmentation-layer.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-spectral-normalization.pbtxt similarity index 80% rename from keras/api/golden/v1/tensorflow.keras.__internal__.layers.-base-image-augmentation-layer.pbtxt rename to keras/api/golden/v2/tensorflow.keras.layers.-spectral-normalization.pbtxt index 8e7c54168a7c..72a04d40b891 100644 --- a/keras/api/golden/v1/tensorflow.keras.__internal__.layers.-base-image-augmentation-layer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-spectral-normalization.pbtxt @@ -1,21 +1,17 @@ -path: "tensorflow.keras.__internal__.layers.BaseImageAugmentationLayer" +path: "tensorflow.keras.layers.SpectralNormalization" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -134,7 +130,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'rate\', \'seed\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'None\'], " + argspec: "args=[\'self\', \'layer\', \'power_iterations\'], varargs=None, keywords=kwargs, defaults=[\'1\'], " } member_method { name: "add_loss" @@ -156,29 +152,17 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, 
keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " + argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "compute_mask" @@ -202,7 +186,11 @@ tf_class { } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } member_method { name: "get_config" @@ -233,13 +221,21 @@ tf_class { argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_weights" + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "normalize_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt index 1c7dda9c0dc6..747de047f96c 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -163,6 +163,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'states\', \'constants\', \'training\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " @@ -191,6 +195,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -227,6 +235,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], 
varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-string-lookup.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-string-lookup.pbtxt index d176221ddd2d..2b3d513fef9e 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-string-lookup.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-string-lookup.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -135,7 +135,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'max_tokens\', \'num_oov_indices\', \'mask_token\', \'oov_token\', \'vocabulary\', \'idf_weights\', \'encoding\', \'invert\', \'output_mode\', \'sparse\', \'pad_to_max_tokens\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'None\', \'[UNK]\', \'None\', \'None\', \'None\', \'False\', \'int\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'max_tokens\', \'num_oov_indices\', \'mask_token\', \'oov_token\', \'vocabulary\', \'idf_weights\', \'encoding\', \'invert\', \'output_mode\', \'sparse\', \'pad_to_max_tokens\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'None\', \'[UNK]\', \'None\', \'None\', \'utf-8\', \'False\', \'int\', \'False\', \'False\'], " } member_method { name: "adapt" @@ -165,6 +165,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -197,6 +201,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +241,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -241,6 +257,14 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_vocabulary" argspec: "args=[\'self\', \'vocabulary\', \'idf_weights\'], varargs=None, keywords=None, 
defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt index dec895ec98ee..d6bba621d770 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-text-vectorization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-text-vectorization.pbtxt index 00f3338c59fb..8824c0eac147 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-text-vectorization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-text-vectorization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -134,7 +134,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'max_tokens\', \'standardize\', \'split\', \'ngrams\', \'output_mode\', \'output_sequence_length\', \'pad_to_max_tokens\', \'vocabulary\', \'idf_weights\', \'sparse\', \'ragged\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'lower_and_strip_punctuation\', \'whitespace\', \'None\', \'int\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'max_tokens\', \'standardize\', \'split\', \'ngrams\', \'output_mode\', \'output_sequence_length\', \'pad_to_max_tokens\', \'vocabulary\', \'idf_weights\', \'sparse\', \'ragged\', \'encoding\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'lower_and_strip_punctuation\', \'whitespace\', \'None\', \'int\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'utf-8\'], " } member_method { name: "adapt" @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: 
"args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +240,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -240,6 +256,14 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_vocabulary" argspec: "args=[\'self\', \'vocabulary\', \'idf_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt index 9e04347d2a22..835f784b295f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt index 3e13ed5ab652..814d7168679b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt +++ 
b/keras/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-unit-normalization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-unit-normalization.pbtxt index 0bcb985a0b59..ae5f06b382a7 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-unit-normalization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-unit-normalization.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt index 
c2f1d3d12cc2..ff61b890ceef 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt index 00cc45f498f3..383e28967517 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt 
b/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt index 89a07682e536..b2a2d89c1748 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt index a05086a1651d..149f9e61613f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
a/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt index eeb09f5a6a85..2ef8d53b6940 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt index 86805c95d9d0..5f5c510ec23f 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, 
defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt index 1789d6ec811c..03fc8519bb09 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt index 82e611df04e5..0da8e034e5a8 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.-einsum-dense.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: 
"args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt index ba2ad738ee29..fb529f555a8c 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.-random-fourier-features.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.-sync-batch-normalization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.-sync-batch-normalization.pbtxt index b848b7bea001..63b1be08dc46 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.-sync-batch-normalization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.-sync-batch-normalization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,9 +156,13 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compute_mask" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,14 @@ tf_class { name: "get_weights" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt index 63f019cf6868..a741778c72dd 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'count_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt index a5358c4b811a..b2b7d584a5fc 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-center-crop.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt index 06c517cf9c26..f61c4f82c5bb 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashed-crossing.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashed-crossing.pbtxt index 071f3088f661..9a9602229b26 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashed-crossing.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashed-crossing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', 
\'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt index 5f9c8f541ac5..a608049a6d8a 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-hashing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt index 5170b3b1fb65..d221e8bc29be 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" 
is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -165,6 +165,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -197,6 +201,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +241,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -241,6 +257,14 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_vocabulary" argspec: "args=[\'self\', \'vocabulary\', \'idf_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt index c93956fe0e79..e6f797f63416 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-normalization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -228,6 +236,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } 
member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -236,6 +248,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt index ce3100e121f0..942ce222c3e9 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-preprocessing-layer.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -163,6 +163,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -195,6 +199,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -227,6 +235,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-contrast.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-contrast.pbtxt index 0c0ebcb55fa6..4a98b7dc741d 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-contrast.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-contrast.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomContrast" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { 
name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-crop.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-crop.pbtxt index bf2d56e3eb5b..ff0e93b7a3d7 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-crop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-crop.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomCrop" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: 
"args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-flip.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-flip.pbtxt index 025dd55fd6f2..dcd4bc07bb1c 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-flip.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-flip.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomFlip" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', 
\'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-height.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-height.pbtxt index c5fa5143983f..2d5ada3de9cb 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-height.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-height.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomHeight" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', 
\'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-rotation.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-rotation.pbtxt index 69b8e2a539b4..634d29f45055 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-rotation.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-rotation.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomRotation" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', 
\'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-translation.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-translation.pbtxt index ad1098a6d246..bfb7693580b4 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-translation.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-translation.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomTranslation" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', 
\'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-width.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-width.pbtxt index 6fcd5815b885..c2d3ef92be9a 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-width.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-width.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomWidth" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', 
\'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt index bc9cfaca33ff..ff3f05b1f9cc 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt @@ -1,22 +1,17 @@ path: "tensorflow.keras.layers.experimental.preprocessing.RandomZoom" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { name: "activity_regularizer" mtype: "" } - member { - name: "auto_vectorize" - mtype: "" - } member { name: "compute_dtype" mtype: "" @@ -157,26 +152,14 @@ tf_class { name: "add_weight" argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', 
\'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " } - member_method { - name: "augment_bounding_boxes" - argspec: "args=[\'self\', \'image\', \'bounding_boxes\', \'transformation\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "augment_image" - argspec: "args=[\'self\', \'image\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_label" - argspec: "args=[\'self\', \'label\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "augment_target" - argspec: "args=[\'self\', \'target\', \'transformation\'], varargs=None, keywords=None, defaults=None" - } member_method { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " @@ -205,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,14 +220,18 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "get_random_transformation" - argspec: "args=[\'self\', \'image\', \'label\', \'bounding_box\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index fb98877a03cc..fdbab246741b 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" 
argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt index 6135cdea2bbe..c11fb59691fb 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-resizing.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt index 076f8c3681ab..f7ee995f2eaa 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -135,7 +135,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'max_tokens\', \'num_oov_indices\', \'mask_token\', \'oov_token\', \'vocabulary\', \'idf_weights\', \'encoding\', \'invert\', \'output_mode\', 
\'sparse\', \'pad_to_max_tokens\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'None\', \'[UNK]\', \'None\', \'None\', \'None\', \'False\', \'int\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'max_tokens\', \'num_oov_indices\', \'mask_token\', \'oov_token\', \'vocabulary\', \'idf_weights\', \'encoding\', \'invert\', \'output_mode\', \'sparse\', \'pad_to_max_tokens\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'None\', \'[UNK]\', \'None\', \'None\', \'utf-8\', \'False\', \'int\', \'False\', \'False\'], " } member_method { name: "adapt" @@ -165,6 +165,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -197,6 +201,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +241,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -241,6 +257,14 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_vocabulary" argspec: "args=[\'self\', \'vocabulary\', \'idf_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 3e2f9b7e68b4..d9c28d3a36d7 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -134,7 +134,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'max_tokens\', \'standardize\', \'split\', \'ngrams\', \'output_mode\', \'output_sequence_length\', \'pad_to_max_tokens\', \'vocabulary\', \'idf_weights\', \'sparse\', \'ragged\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'lower_and_strip_punctuation\', \'whitespace\', \'None\', \'int\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'max_tokens\', \'standardize\', 
\'split\', \'ngrams\', \'output_mode\', \'output_sequence_length\', \'pad_to_max_tokens\', \'vocabulary\', \'idf_weights\', \'sparse\', \'ragged\', \'encoding\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'lower_and_strip_punctuation\', \'whitespace\', \'None\', \'int\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'utf-8\'], " } member_method { name: "adapt" @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" @@ -196,6 +200,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +240,14 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "make_adapt_function" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -240,6 +256,14 @@ tf_class { name: "reset_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_assets" + argspec: "args=[\'self\', \'dir_path\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_vocabulary" argspec: "args=[\'self\', \'vocabulary\', \'idf_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/keras/api/golden/v2/tensorflow.keras.layers.pbtxt b/keras/api/golden/v2/tensorflow.keras.layers.pbtxt index eef6e02c9efe..a2b218a4c0d5 100644 --- a/keras/api/golden/v2/tensorflow.keras.layers.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.layers.pbtxt @@ -256,10 +256,22 @@ tf_module { name: "GlobalMaxPooling3D" mtype: "" } + member { + name: "GroupNormalization" + mtype: "" + } + member { + name: "HashedCrossing" + mtype: "" + } member { name: "Hashing" mtype: "" } + member { + name: "Identity" + mtype: "" + } member { name: "InputLayer" mtype: "" @@ -460,6 +472,10 @@ tf_module { name: "SpatialDropout3D" mtype: "" } + member { + name: "SpectralNormalization" + mtype: "" + } member { name: "StackedRNNCells" mtype: "" @@ -538,7 +554,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "dot" @@ -558,7 +574,7 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'layer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "subtract" diff 
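The tensorflow.keras.layers golden above registers four new layer classes: GroupNormalization, HashedCrossing, Identity, and SpectralNormalization. A minimal usage sketch, assuming a tf.keras build that ships these symbols; all shapes and argument values below are illustrative and not taken from this diff:

import tensorflow as tf

inputs = tf.keras.Input(shape=(8,))
x = tf.keras.layers.Identity()(inputs)  # no-op passthrough layer
x = tf.keras.layers.Dense(16)(x)
# groups must evenly divide the channel axis (16 / 4 here).
x = tf.keras.layers.GroupNormalization(groups=4)(x)
# SpectralNormalization wraps another layer and constrains the spectral
# norm of its kernel during training.
outputs = tf.keras.layers.SpectralNormalization(tf.keras.layers.Dense(8))(x)
model = tf.keras.Model(inputs, outputs)

# HashedCrossing hashes a feature cross of two categorical inputs into
# num_bins buckets.
cross = tf.keras.layers.HashedCrossing(num_bins=10)
print(cross((tf.constant(["a", "b"]), tf.constant([101, 102]))))
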
--git a/keras/api/golden/v2/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt b/keras/api/golden/v2/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt index 2c2a286f740e..ac49b8fc8701 100644 --- a/keras/api/golden/v2/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.losses.-binary-focal-crossentropy.pbtxt @@ -6,7 +6,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\', \'0.0\', \'-1\', \'auto\', \'binary_focal_crossentropy\'], " + argspec: "args=[\'self\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\', \'auto\', \'binary_focal_crossentropy\'], " } member_method { name: "call" diff --git a/keras/api/golden/v2/tensorflow.keras.losses.-categorical-focal-crossentropy.pbtxt b/keras/api/golden/v2/tensorflow.keras.losses.-categorical-focal-crossentropy.pbtxt new file mode 100644 index 000000000000..f06b44ec8765 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.losses.-categorical-focal-crossentropy.pbtxt @@ -0,0 +1,23 @@ +path: "tensorflow.keras.losses.CategoricalFocalCrossentropy" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\', \'auto\', \'categorical_focal_crossentropy\'], " + } + member_method { + name: "call" + argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/keras/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt index 06d9cfe145ee..389b05c75d5d 100644 --- a/keras/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt @@ -6,7 +6,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'auto\', \'sparse_categorical_crossentropy\'], " + argspec: "args=[\'self\', \'from_logits\', \'ignore_class\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'auto\', \'sparse_categorical_crossentropy\'], " } member_method { name: "call" diff --git a/keras/api/golden/v2/tensorflow.keras.losses.pbtxt b/keras/api/golden/v2/tensorflow.keras.losses.pbtxt index e64d82d71eae..8fb5dcb54f79 100644 --- a/keras/api/golden/v2/tensorflow.keras.losses.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.losses.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "CategoricalCrossentropy" mtype: "" } + member { + name: "CategoricalFocalCrossentropy" + mtype: "" + } member { name: "CategoricalHinge" mtype: "" @@ -98,12 +102,16 @@ tf_module { } 
member_method { name: "binary_focal_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\', \'0.0\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " } member_method { name: "categorical_crossentropy" argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.0\', \'-1\'], " } + member_method { + name: "categorical_focal_crossentropy" + argspec: "args=[\'y_true\', \'y_pred\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " + } member_method { name: "categorical_hinge" argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None" @@ -114,7 +122,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'name\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -186,11 +194,11 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'loss\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'loss\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "sparse_categorical_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\', \'ignore_class\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " } member_method { name: "squared_hinge" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt index 35f9a429b865..171da23f3bc1 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt @@ -1,11 +1,11 @@ path: "tensorflow.keras.metrics.AUC" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -160,6 +160,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -188,6 +192,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -224,6 +232,10 @@ tf_class { name: "interpolate_pr_auc" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: 
"args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -240,6 +252,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt index 26fbd0b585bf..863b948441e9 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.Accuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt index 789c93e9c821..4b8759cf7628 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.BinaryAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + 
argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt index 4e88a2ad5ddd..16228d4229f2 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.BinaryCrossentropy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
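The losses goldens earlier in the diff add CategoricalFocalCrossentropy, extend binary_focal_crossentropy with apply_class_balancing and alpha, and give sparse_categorical_crossentropy an ignore_class argument. A sketch of how those signatures are called; the keyword names mirror the argspecs above, while the tensors are made up and -1 is an assumed sentinel label:

import tensorflow as tf

y_true = tf.constant([[0.0, 1.0], [1.0, 0.0]])
y_pred = tf.constant([[0.1, 0.9], [0.8, 0.2]])

# apply_class_balancing weights positives by alpha and negatives by 1 - alpha.
bfce = tf.keras.losses.BinaryFocalCrossentropy(
    apply_class_balancing=True, alpha=0.25, gamma=2.0)
print(bfce(y_true, y_pred).numpy())

# New loss class registered by this change.
cfce = tf.keras.losses.CategoricalFocalCrossentropy(alpha=0.25, gamma=2.0)
print(cfce(y_true, y_pred).numpy())

# Labels equal to ignore_class are masked out of the reduction.
scce = tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-1)
print(scce(tf.constant([1, -1]), y_pred).numpy())
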
a/keras/api/golden/v2/tensorflow.keras.metrics.-binary-io-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-binary-io-u.pbtxt index 590f84d1e583..49e4ac2946e7 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-binary-io-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-binary-io-u.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.BinaryIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt index f910dc4b0696..c56abceaeb13 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], 
varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt index 27abc004b332..92d50ec7a5f1 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CategoricalCrossentropy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -133,7 +133,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], " + argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\', \'-1\'], " } member_method { name: "add_loss" @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git 
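Nearly every class in this section gains the same four hooks: get_build_config / build_from_config, for recreating a layer's weights from a stored input shape, and save_own_variables / load_own_variables, for writing and reading the weight values themselves. A minimal sketch of overriding the variable hooks in a custom layer; the Scale layer is hypothetical, and the "0", "1", ... key scheme is assumed here to be the default enumeration order of the layer's variable list:

import tensorflow as tf

class Scale(tf.keras.layers.Layer):
    # Hypothetical one-weight layer used only to illustrate the hooks.
    def build(self, input_shape):
        self.factor = self.add_weight(
            name="factor", shape=(), initializer="ones")

    def call(self, inputs):
        return inputs * self.factor

    def save_own_variables(self, store):
        # `store` is dict-like; by default each weight is written under
        # its index in the layer's variable list.
        store["0"] = self.factor.numpy()

    def load_own_variables(self, store):
        # Called after build_from_config has recreated the weights.
        self.factor.assign(store["0"])

With these hooks a layer controls exactly what is written to and restored from the weights store used by model saving, without touching get_config.
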
a/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt index 4bb20d940f1f..f4386171e6f5 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CategoricalHinge" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt index 0dbf94fa93df..221cbe34edd0 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.CosineSimilarity" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, 
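The other new pair, `get_build_config()` / `build_from_config(config)`, round-trips the information needed to re-build a stateful object before its variables are restored; by default the config is just the input shape the object was built with. A sketch of the intended round trip on an ordinary layer (metrics subclass `Layer`, so they inherit the same behavior); the exact dict contents are an assumption based on the default implementation:

```python
import tensorflow as tf

layer = tf.keras.layers.Dense(4)
layer.build((None, 8))

# Serialize the build information (by default roughly {"input_shape": (None, 8)}).
cfg = layer.get_build_config()

# Re-create a clone from its config, build it to the same shapes,
# then load weights into the now-existing variables.
clone = tf.keras.layers.Dense.from_config(layer.get_config())
clone.build_from_config(cfg)
clone.set_weights(layer.get_weights())
```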
keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-f-beta-score.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-f-beta-score.pbtxt new file mode 100644 index 000000000000..37847a1f933d --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-f-beta-score.pbtxt @@ -0,0 +1,263 @@ +path: "tensorflow.keras.metrics.FBetaScore" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'average\', \'beta\', \'threshold\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'None\', \'fbeta_score\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" 
+ argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregationV2.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'y_true_shape\', \'y_pred_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_state" + argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_states" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + 
name: "update_state" + argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-f1-score.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-f1-score.pbtxt new file mode 100644 index 000000000000..56d233b0b5fc --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-f1-score.pbtxt @@ -0,0 +1,264 @@ +path: "tensorflow.keras.metrics.F1Score" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'average\', \'threshold\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'f1_score\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregationV2.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'y_true_shape\', \'y_pred_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, 
defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_state" + argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_states" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_state" + argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt index ad1ffb7d5e1d..12518c046e4d 100644 --- 
a/keras/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.FalseNegatives" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt index 0dfa8b5ee1a6..d3a260bc7f5f 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.FalsePositives" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: 
"args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt index b9ef8b808189..c01adca8b432 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.Hinge" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-io-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-io-u.pbtxt index c8e3cac66dac..3b3e4ed1e707 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-io-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-io-u.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.IoU" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -131,7 +131,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'target_class_ids\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', 
\'target_class_ids\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_true\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'True\', \'-1\'], " } member_method { name: "add_loss" @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt index 2c31b5fccac2..8fe4028c968d 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.KLDivergence" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ 
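Per the `__init__` argspec above, `IoU` (and `MeanIoU` further below) grows four keyword arguments: `ignore_class` drops one label id from the confusion matrix (useful for "void" pixels in segmentation), while `sparse_y_true` / `sparse_y_pred` (both defaulting to `True`) declare whether targets and predictions arrive as integer class ids or as scores to be argmax-ed along `axis`. A sketch, assuming integer ground truth, dense per-class prediction scores, and a hypothetical void label of 255:

```python
import tensorflow as tf

# Ground truth is sparse ids; predictions are per-class scores, so ask the
# metric to argmax them along the last axis. Label 255 marks void pixels,
# which are masked out before the confusion matrix is built.
miou = tf.keras.metrics.MeanIoU(
    num_classes=3, ignore_class=255,
    sparse_y_true=True, sparse_y_pred=False, axis=-1)

y_true = tf.constant([0, 1, 2, 255])
y_pred = tf.constant([[0.9, 0.1, 0.0],
                      [0.1, 0.8, 0.1],
                      [0.2, 0.2, 0.6],
                      [1.0, 0.0, 0.0]])
miou.update_state(y_true, y_pred)
print(float(miou.result()))  # expected 1.0: the void pixel is ignored
```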
tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt index 81ff9033cdac..862a2c127f69 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.LogCoshError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt index 50832f259e8d..4db047358108 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.MeanAbsoluteError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, 
keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt index dfc975031555..c1a4285ba95d 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.MeanAbsolutePercentageError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt index 
a2c1fbea4afa..eb8b2c471f44 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.MeanIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -132,7 +132,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_true\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'True\', \'-1\'], " } member_method { name: "add_loss" @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt index 951c151fdc79..d84345e14e31 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-metric-wrapper.pbtxt @@ -6,8 +6,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + 
member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt index 10b3a82a0c8c..697c4e0bb74b 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.MeanRelativeError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt index ec4d424986b5..ceb5282f0746 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt @@ -1,14 +1,14 @@ path: 
"tensorflow.keras.metrics.MeanSquaredError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt index ecfebc72ad3b..2d5cf64c2c3d 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } 
member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt index a4ee5fc8e909..6e8ba1767c97 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -164,6 +164,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -192,6 +196,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -224,6 +232,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -240,6 +252,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt index 80d830fb7efc..c31d49e14b7f 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt index 905c92a33ab9..916ae93096e5 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -155,6 +155,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -183,6 +187,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -215,6 +223,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -231,6 +243,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-io-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-io-u.pbtxt index 853ae3bcf38e..23fd50224c5c 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-io-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-io-u.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.OneHotIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + 
is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -132,7 +132,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'target_class_ids\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', \'target_class_ids\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'-1\'], " } member_method { name: "add_loss" @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt index e20224e9b14e..98b63a62da97 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-one-hot-mean-io-u.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.OneHotMeanIoU" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -133,7 +133,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\', \'ignore_class\', \'sparse_y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'-1\'], " } member_method { name: "add_loss" @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', 
\'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt index 29ccceda1abe..1d5f8c6efcb7 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.Poisson" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-precision-at-recall.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-precision-at-recall.pbtxt index ec505dc742e9..21f1c36bdc1b 100644 --- 
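The one-hot variants in the hunks above gain the same knobs minus `sparse_y_true`, since their ground truth is one-hot by definition, and `sparse_y_pred` defaults to `False` here because predictions are typically probabilities. A short sketch under those defaults:

```python
import tensorflow as tf

# One-hot targets, probability predictions (sparse_y_pred=False is the default).
ohm = tf.keras.metrics.OneHotMeanIoU(num_classes=3)
y_true = tf.constant([[1.0, 0.0, 0.0],
                      [0.0, 0.0, 1.0]])
y_pred = tf.constant([[0.7, 0.2, 0.1],
                      [0.1, 0.1, 0.8]])
ohm.update_state(y_true, y_pred)
print(float(ohm.result()))  # perfect on the classes that appear
```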
a/keras/api/golden/v2/tensorflow.keras.metrics.-precision-at-recall.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-precision-at-recall.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.PrecisionAtRecall" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt index fe1822fc8d53..d9c49540edcb 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt @@ -1,11 +1,11 @@ path: "tensorflow.keras.metrics.Precision" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, 
defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +244,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-r2-score.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-r2-score.pbtxt new file mode 100644 index 000000000000..1e76ffb29ad4 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-r2-score.pbtxt @@ -0,0 +1,263 @@ +path: "tensorflow.keras.metrics.R2Score" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'class_aggregation\', \'num_regressors\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'uniform_average\', \'0\', \'r2_score\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregationV2.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], " + } + member_method { + name: 
"build" + argspec: "args=[\'self\', \'y_true_shape\', \'y_pred_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_state" + argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_states" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_state" + argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} 
diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-recall-at-precision.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-recall-at-precision.pbtxt index e8ab0f6ce1c6..5aa668718b0e 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-recall-at-precision.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-recall-at-precision.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.RecallAtPrecision" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt index 52e9879a3446..e7c4864a1bbd 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt @@ -1,11 +1,11 @@ path: "tensorflow.keras.metrics.Recall" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -156,6 +156,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -184,6 +188,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -216,6 +224,10 @@ tf_class { name: "get_weights" argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +244,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt index cd99b1e8e29e..64671f63b4c0 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt @@ -1,13 +1,13 @@ path: "tensorflow.keras.metrics.RootMeanSquaredError" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -158,6 +158,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -186,6 +190,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -218,6 +226,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -234,6 +246,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt index 0da727a14110..9b35e4f14197 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.SensitivityAtSpecificity" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" 
is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt index d47d06739b2a..d960b99eccb4 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SparseCategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt index 4fdc705aa389..c5bd4c6f59db 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SparseCategoricalCrossentropy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -133,7 +133,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], " + argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'ignore_class\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'None\', \'-1\'], " } member_method { name: "add_loss" @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt index dd386c6cba5d..069a3e3b2727 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" 
is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt index 15dfa9412558..9f42d1f0b3c2 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.SpecificityAtSensitivity" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: 
"result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt index 0f76c4a43b47..83437f332258 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.SquaredHinge" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt index ccd3ac0c8752..6cb46d1f93e4 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt index dd26258eb1bb..6355e88e1858 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt @@ -1,14 +1,14 @@ path: "tensorflow.keras.metrics.TopKCategoricalAccuracy" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -159,6 +159,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -187,6 +191,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -219,6 +227,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -235,6 +247,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt index af0fb7936462..95bc523abd0c 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.TrueNegatives" tf_class 
{ - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: "result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt index 0e1124fbc296..863fb2911873 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt @@ -1,12 +1,12 @@ path: "tensorflow.keras.metrics.TruePositives" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -157,6 +157,10 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" @@ -185,6 +189,10 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -217,6 +225,10 @@ tf_class { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "merge_state" argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +245,10 @@ tf_class { name: 
"result" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.experimental.-py-metric.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.experimental.-py-metric.pbtxt new file mode 100644 index 000000000000..468898868b32 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.metrics.experimental.-py-metric.pbtxt @@ -0,0 +1,263 @@ +path: "tensorflow.keras.metrics.experimental.PyMetric" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregationV2.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], 
varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "merge_state" + argspec: "args=[\'self\', \'metrics\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "reset_states" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_state" + argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.experimental.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.experimental.pbtxt new file mode 100644 index 000000000000..f5614c4b76ae --- 
/dev/null +++ b/keras/api/golden/v2/tensorflow.keras.metrics.experimental.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.keras.metrics.experimental" +tf_module { + member { + name: "PyMetric" + mtype: "" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.metrics.pbtxt b/keras/api/golden/v2/tensorflow.keras.metrics.pbtxt index f05d1a6f89c5..1ab81fed6868 100644 --- a/keras/api/golden/v2/tensorflow.keras.metrics.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.metrics.pbtxt @@ -36,6 +36,14 @@ tf_module { name: "CosineSimilarity" mtype: "" } + member { + name: "F1Score" + mtype: "" + } + member { + name: "FBetaScore" + mtype: "" + } member { name: "FalseNegatives" mtype: "" @@ -120,6 +128,10 @@ tf_module { name: "PrecisionAtRecall" mtype: "" } + member { + name: "R2Score" + mtype: "" + } member { name: "Recall" mtype: "" @@ -172,6 +184,10 @@ tf_module { name: "TruePositives" mtype: "" } + member { + name: "experimental" + mtype: "" + } member_method { name: "KLD" argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None" @@ -202,7 +218,7 @@ tf_module { } member_method { name: "binary_focal_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'2.0\', \'False\', \'0.0\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'apply_class_balancing\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " } member_method { name: "categorical_accuracy" @@ -212,9 +228,13 @@ tf_module { name: "categorical_crossentropy" argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'0.0\', \'-1\'], " } + member_method { + name: "categorical_focal_crossentropy" + argspec: "args=[\'y_true\', \'y_pred\', \'alpha\', \'gamma\', \'from_logits\', \'label_smoothing\', \'axis\'], varargs=None, keywords=None, defaults=[\'0.25\', \'2.0\', \'False\', \'0.0\', \'-1\'], " + } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -282,7 +302,7 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'metric\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'metric\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } member_method { name: "sparse_categorical_accuracy" @@ -290,7 +310,7 @@ tf_module { } member_method { name: "sparse_categorical_crossentropy" - argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], " + argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\', \'ignore_class\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " } member_method { name: "sparse_top_k_categorical_accuracy" diff --git a/keras/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/keras/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 8f0115b30ac0..af5a892ca740 100644 --- a/keras/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -3,8 +3,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - 
is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -12,10 +12,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -52,6 +60,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -112,6 +124,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -176,13 +192,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -216,6 +240,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -232,6 +260,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -252,6 +288,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -264,13 +304,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" 
} + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -310,7 +358,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -326,7 +378,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/keras/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 8b85b77488b4..a6f046c2e06a 100644 --- a/keras/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -5,8 +5,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -14,10 +14,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -54,6 +62,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -114,6 +126,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -182,13 +198,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compile" - argspec: "args=[\'self\', 
\'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -222,6 +246,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -238,6 +266,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -258,6 +294,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -270,13 +310,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -320,7 +368,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, 
keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -336,7 +388,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v2/tensorflow.keras.models.experimental.-sharpness-aware-minimization.pbtxt b/keras/api/golden/v2/tensorflow.keras.models.experimental.-sharpness-aware-minimization.pbtxt index 84a7524beb47..65e117c4573c 100644 --- a/keras/api/golden/v2/tensorflow.keras.models.experimental.-sharpness-aware-minimization.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.models.experimental.-sharpness-aware-minimization.pbtxt @@ -4,8 +4,8 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -13,10 +13,18 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "autotune_steps_per_execution" + mtype: "" + } member { name: "compute_dtype" mtype: "" } + member { + name: "distribute_reduction_method" + mtype: "" + } member { name: "distribute_strategy" mtype: "" @@ -53,6 +61,10 @@ tf_class { name: "input_spec" mtype: "" } + member { + name: "jit_compile" + mtype: "" + } member { name: "layers" mtype: "" @@ -113,6 +125,10 @@ tf_class { name: "stateful" mtype: "" } + member { + name: "steps_per_execution" + mtype: "" + } member { name: "submodules" mtype: "" @@ -177,13 +193,21 @@ tf_class { name: "build" argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "call" argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'weighted_metrics\', \'run_eagerly\', \'steps_per_execution\', \'jit_compile\', \'pss_evaluation_shards\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0\'], " + } + member_method { + name: "compile_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_loss" @@ -217,6 +241,10 @@ tf_class { name: "evaluate_generator" argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], " } + member_method { + name: "export" + argspec: 
"args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "finalize_state" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -233,6 +261,14 @@ tf_class { name: "from_config" argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_compile_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" @@ -253,6 +289,10 @@ tf_class { name: "get_layer" argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "get_metrics_result" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_output_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" @@ -265,13 +305,21 @@ tf_class { name: "get_output_shape_at" argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_weight_paths" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "get_weights" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "load_weights" - argspec: "args=[\'self\', \'filepath\', \'by_name\', \'skip_mismatch\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'self\', \'filepath\', \'skip_mismatch\', \'by_name\', \'options\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } member_method { name: "make_predict_function" @@ -311,7 +359,11 @@ tf_class { } member_method { name: "save" - argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "save_spec" @@ -327,7 +379,7 @@ tf_class { } member_method { name: "summary" - argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\'], " + argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\', \'expand_nested\', \'show_trainable\', \'layer_range\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "test_on_batch" diff --git a/keras/api/golden/v2/tensorflow.keras.models.pbtxt b/keras/api/golden/v2/tensorflow.keras.models.pbtxt index a12db424d210..49ba3fbf4642 100644 --- a/keras/api/golden/v2/tensorflow.keras.models.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.models.pbtxt @@ -18,7 +18,7 @@ tf_module { } member_method { name: "load_model" - argspec: 
"args=[\'filepath\', \'custom_objects\', \'compile\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " + argspec: "args=[\'filepath\', \'custom_objects\', \'compile\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\', \'True\'], " } member_method { name: "model_from_config" @@ -34,6 +34,6 @@ tf_module { } member_method { name: "save_model" - argspec: "args=[\'model\', \'filepath\', \'overwrite\', \'include_optimizer\', \'save_format\', \'signatures\', \'options\', \'save_traces\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'model\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt index d0856c75be4a..bc24d928cb41 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.Adadelta" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], " + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adadelta\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, 
keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-adafactor.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-adafactor.pbtxt new file mode 100644 index 000000000000..fb3952d2b260 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-adafactor.pbtxt @@ -0,0 +1,89 @@ +path: "tensorflow.keras.optimizers.Adafactor" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "iterations" + mtype: "" + } + member { + name: "learning_rate" + mtype: "" + } + member { + name: "lr" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'beta_2_decay\', \'epsilon_1\', \'epsilon_2\', \'clip_threshold\', \'relative_step\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', 
\'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'-0.8\', \'1e-30\', \'0.001\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adafactor\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " + } + member_method { + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt index 17f68fd67db0..4e6b8a67982b 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.Adagrad" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], 
varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], " + argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adagrad\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, 
keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-adam-w.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-adam-w.pbtxt new file mode 100644 index 000000000000..12b1548926be --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-adam-w.pbtxt @@ -0,0 +1,89 @@ +path: "tensorflow.keras.optimizers.AdamW" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "iterations" + mtype: "" + } + member { + name: "learning_rate" + mtype: "" + } + member { + name: "lr" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'weight_decay\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.004\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'AdamW\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " + } + member_method { + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_config" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt index 74fa9869ad54..978f3b874892 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.Adam" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adam\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', 
\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt index ae0d88760eb5..302da145cd5d 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.Adamax" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', 
\'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adamax\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: 
"load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt index 2cfd1ca6b71c..be804558c675 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.Ftrl" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\', \'beta\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\', \'0.0\'], " + argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'l2_shrinkage_regularization_strength\', \'beta\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Ftrl\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', 
\'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-lion.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-lion.pbtxt new file mode 100644 index 000000000000..5d4faf4150be --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-lion.pbtxt @@ -0,0 +1,89 @@ +path: "tensorflow.keras.optimizers.Lion" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "iterations" + mtype: "" + } + member { + name: 
"learning_rate" + mtype: "" + } + member { + name: "lr" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.0001\', \'0.9\', \'0.99\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Lion\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " + } + member_method { + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt index 2d18b1b4774b..b6c91c10e99d 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.Nadam" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" 
} member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Nadam\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: 
"args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt index 5a9d33eea359..d30f25489a37 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt @@ -1,82 +1,88 @@ path: "tensorflow.keras.optimizers.Optimizer" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'name\', \'gradient_aggregator\', \'gradient_transformers\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'name\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'0\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + 
name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt index d53b8c656ddc..9bcb35ea798a 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.RMSprop" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: 
"" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], " + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'100\', \'True\', \'RMSprop\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', 
\'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt index f354c71298ce..73dc46d85980 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt @@ -1,83 +1,89 @@ path: "tensorflow.keras.optimizers.SGD" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { - name: "clipnorm" - mtype: "" - } - member { - name: "clipvalue" + name: "iterations" mtype: "" } member { - name: "global_clipnorm" + name: "learning_rate" mtype: "" } member { - name: "iterations" + name: "lr" mtype: "" } member { - name: "weights" + name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'SGD\'], " + argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'SGD\'], " } member_method { - name: "add_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\', \'shape\'], varargs=None, keywords=None, defaults=[\'zeros\', \'None\'], " + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " } member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\', 
\'experimental_aggregate_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_gradients" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_slot" - argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None" + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { - name: "get_slot_names" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "get_updates" - argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None" + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "get_weights" + name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" - argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_weights" argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adadelta.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adadelta.pbtxt index d9b8cf3c3065..2ada86ac054e 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adadelta.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adadelta.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.Adadelta" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + 
is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adadelta\'], " + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adadelta\'], " } member_method { name: "add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adafactor.pbtxt 
b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adafactor.pbtxt new file mode 100644 index 000000000000..30a77095af10 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adafactor.pbtxt @@ -0,0 +1,89 @@ +path: "tensorflow.keras.optimizers.experimental.Adafactor" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "iterations" + mtype: "" + } + member { + name: "learning_rate" + mtype: "" + } + member { + name: "lr" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'learning_rate\', \'beta_2_decay\', \'epsilon_1\', \'epsilon_2\', \'clip_threshold\', \'relative_step\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'-0.8\', \'1e-30\', \'0.001\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adafactor\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\', \'shape\', \'dtype\', \'initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\'], " + } + member_method { + name: "add_variable_from_reference" + argspec: "args=[\'self\', \'model_variable\', \'variable_name\', \'shape\', \'initial_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "aggregate_gradients" + argspec: "args=[\'self\', \'grads_and_vars\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "apply_gradients" + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_gradients" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "finalize_variable_values" + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "minimize" + argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + } +} diff --git 
a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adagrad.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adagrad.pbtxt index 222cc5cb0621..bcdc12926a78 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adagrad.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adagrad.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.Adagrad" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adagrad\'], " + argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adagrad\'], " } member_method { name: "add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" + 
name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'grad\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam-w.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam-w.pbtxt index 496446ac3c60..240e92cf9621 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam-w.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam-w.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.AdamW" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,26 +18,6 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" @@ -61,11 +40,11 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" - argspec: "args=[\'self\', \'var_list\', \'exclude_from_weight_decay\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "compute_gradients" @@ -73,7 +52,7 @@ tf_class { } member_method { name: "exclude_from_weight_decay" - argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "finalize_variable_values" @@ -81,22 +60,30 @@ tf_class { } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, 
defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam.pbtxt index b468f301f986..a36751778545 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adam.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.Adam" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adam\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adam\'], " } member_method { name: "add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, 
keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adamax.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adamax.pbtxt index 2421170c4641..f8b070a6b707 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adamax.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-adamax.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.Adamax" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adamax\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adamax\'], " } member_method { name: "add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: 
"args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-ftrl.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-ftrl.pbtxt index 096106ba41d4..892d407e86ed 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-ftrl.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-ftrl.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.Ftrl" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'l2_shrinkage_regularization_strength\', \'beta\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Ftrl\'], " + argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'l2_shrinkage_regularization_strength\', \'beta\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Ftrl\'], " } member_method { name: 
"add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-nadam.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-nadam.pbtxt index d6b8adfcc788..887e8bb52784 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-nadam.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-nadam.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.Nadam" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'None\', \'None\', 
\'None\', \'False\', \'0.99\', \'None\', \'True\', \'Nadam\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Nadam\'], " } member_method { name: "add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-optimizer.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-optimizer.pbtxt index f8add2a3e1e9..f4a84d454881 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-optimizer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-optimizer.pbtxt @@ -1,10 +1,9 @@ path: "tensorflow.keras.optimizers.experimental.Optimizer" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -18,33 +17,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - 
name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'name\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'name\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\'], varargs=None, keywords=kwargs, defaults=[\'0\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\'], " } member_method { name: "add_variable" @@ -60,7 +39,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -70,28 +49,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-r-m-sprop.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-r-m-sprop.pbtxt index 1c5325a505e5..c8998cffcf40 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-r-m-sprop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-r-m-sprop.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.RMSprop" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: 
"" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'100\', \'True\', \'RMSprop\'], " + argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'100\', \'True\', \'RMSprop\'], " } member_method { name: "add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git 
a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-s-g-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-s-g-d.pbtxt index 3ac6cdda7aa6..7a73dc7f4238 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-s-g-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.-s-g-d.pbtxt @@ -1,11 +1,10 @@ path: "tensorflow.keras.optimizers.experimental.SGD" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "iterations" @@ -19,33 +18,13 @@ tf_class { name: "lr" mtype: "" } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } member { name: "variables" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'amsgrad\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'SGD\'], " + argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'SGD\'], " } member_method { name: "add_variable" @@ -61,7 +40,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'skip_gradients_aggregation\'], varargs=None, keywords=None, defaults=[\'False\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'skip_gradients_aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'False\'], " } member_method { name: "build" @@ -71,28 +50,40 @@ tf_class { name: "compute_gradients" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "exclude_from_weight_decay" + argspec: "args=[\'self\', \'var_list\', \'var_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } member_method { name: "finalize_variable_values" argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None" } member_method { name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "get_config" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "minimize" argspec: "args=[\'self\', \'loss\', \'var_list\', \'tape\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { - name: "update_step" - argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" + name: "save_own_variables" 
+ argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" } member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + name: "update_step" + argspec: "args=[\'self\', \'gradient\', \'variable\'], varargs=None, keywords=None, defaults=None" } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.pbtxt index 95a90dcaea0a..9d9f9cfe72da 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.experimental.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "Adadelta" mtype: "" } + member { + name: "Adafactor" + mtype: "" + } member { name: "Adagrad" mtype: "" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt index 9c1b406a1d6f..05ae2888d367 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adadelta.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adadelta" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt index 736ee08e4efb..507148f08dbb 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adagrad.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adagrad" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adam.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adam.pbtxt index 7d0d3b23614c..d79093442bd9 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adam.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adam.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adam" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adamax.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adamax.pbtxt index 149d0f213893..b18db03163b8 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adamax.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-adamax.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Adamax" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt index 9ce47c161678..b852c98df0e6 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt +++ 
b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-ftrl.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Ftrl" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-nadam.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-nadam.pbtxt index 8a612f6b89b2..ef505faade82 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-nadam.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-nadam.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.Nadam" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt index 6b4bf1701f22..f28c01037044 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-optimizer.pbtxt @@ -1,8 +1,7 @@ path: "tensorflow.keras.optimizers.legacy.Optimizer" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt index 77a6e72a9411..f53b0568fe11 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-r-m-sprop.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.RMSprop" tf_class { is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt index f6a6dd836e72..ab1041592075 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt @@ -1,9 +1,8 @@ path: "tensorflow.keras.optimizers.legacy.SGD" tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" member { name: "clipnorm" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.pbtxt index f12ace047ee2..00b8c8fd3425 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "Adadelta" mtype: "" } + member { + name: "Adafactor" + mtype: "" + } member { name: "Adagrad" mtype: "" @@ -12,6 +16,10 @@ tf_module { name: "Adam" mtype: "" } + member { + name: "AdamW" + mtype: "" + } member { name: "Adamax" mtype: "" @@ -20,6 +28,10 @@ tf_module { name: "Ftrl" mtype: "" } + member { + name: "Lion" + mtype: "" + } member { name: "Nadam" mtype: "" @@ -50,14 +62,14 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=kwargs, 
defaults=[\'None\', \'False\'], " } member_method { name: "get" - argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'identifier\'], varargs=None, keywords=kwargs, defaults=None" } member_method { name: "serialize" - argspec: "args=[\'optimizer\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'optimizer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt index 13a711fe288b..6df561f3342e 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.-cosine-decay.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], " + argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\', \'warmup_target\', \'warmup_steps\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'None\', \'0\'], " } member_method { name: "from_config" diff --git a/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt b/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt index 3ecc437199f6..8ed0edccf925 100644 --- a/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt @@ -30,10 +30,10 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "serialize" - argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'learning_rate_schedule\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v2/tensorflow.keras.pbtxt b/keras/api/golden/v2/tensorflow.keras.pbtxt index cdaeea7f8244..c080bc27539a 100644 --- a/keras/api/golden/v2/tensorflow.keras.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.pbtxt @@ -44,6 +44,10 @@ tf_module { name: "experimental" mtype: "" } + member { + name: "export" + mtype: "" + } # Placeholder for internal API member { name: "initializers" @@ -82,11 +86,11 @@ tf_module { mtype: "" } member { - name: "utils" + name: "saving" mtype: "" } member { - name: "wrappers" + name: "utils" mtype: "" } member_method { diff --git a/keras/api/golden/v2/tensorflow.keras.regularizers.pbtxt b/keras/api/golden/v2/tensorflow.keras.regularizers.pbtxt index 48f1ec4fa1b7..7272c0fb6702 100644 --- a/keras/api/golden/v2/tensorflow.keras.regularizers.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.regularizers.pbtxt @@ -34,7 +34,7 @@ tf_module { } member_method { name: "deserialize" - argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'config\', \'custom_objects\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], " } member_method { name: "get" @@ -46,6 +46,6 @@ tf_module { } member_method { name: "serialize" - argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" + 
argspec: "args=[\'regularizer\', \'use_legacy_format\'], varargs=None, keywords=None, defaults=[\'False\'], " } } diff --git a/keras/api/golden/v2/tensorflow.keras.saving.custom_object_scope.pbtxt b/keras/api/golden/v2/tensorflow.keras.saving.custom_object_scope.pbtxt new file mode 100644 index 000000000000..cf877e5ae4dd --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.saving.custom_object_scope.pbtxt @@ -0,0 +1,9 @@ +path: "tensorflow.keras.saving.custom_object_scope" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=args, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.saving.pbtxt b/keras/api/golden/v2/tensorflow.keras.saving.pbtxt new file mode 100644 index 000000000000..e1df1e64293c --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.saving.pbtxt @@ -0,0 +1,39 @@ +path: "tensorflow.keras.saving" +tf_module { + member { + name: "custom_object_scope" + mtype: "" + } + member_method { + name: "deserialize_keras_object" + argspec: "args=[\'config\', \'custom_objects\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\'], " + } + member_method { + name: "get_custom_objects" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_registered_name" + argspec: "args=[\'obj\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_registered_object" + argspec: "args=[\'name\', \'custom_objects\', \'module_objects\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } + member_method { + name: "load_model" + argspec: "args=[\'filepath\', \'custom_objects\', \'compile\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\', \'True\'], " + } + member_method { + name: "register_keras_serializable" + argspec: "args=[\'package\', \'name\'], varargs=None, keywords=None, defaults=[\'Custom\', \'None\'], " + } + member_method { + name: "save_model" + argspec: "args=[\'model\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], " + } + member_method { + name: "serialize_keras_object" + argspec: "args=[\'obj\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.utils.-custom-object-scope.pbtxt b/keras/api/golden/v2/tensorflow.keras.utils.-custom-object-scope.pbtxt index 9e9370be68f8..3ccf719d8c8c 100644 --- a/keras/api/golden/v2/tensorflow.keras.utils.-custom-object-scope.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.utils.-custom-object-scope.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.utils.CustomObjectScope" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.utils.-feature-space.pbtxt b/keras/api/golden/v2/tensorflow.keras.utils.-feature-space.pbtxt new file mode 100644 index 000000000000..1ae0313d8ecd --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.utils.-feature-space.pbtxt @@ -0,0 +1,298 @@ +path: "tensorflow.keras.utils.FeatureSpace" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "compute_dtype" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dtype_policy" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: 
"inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "supports_masking" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variable_dtype" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'features\', \'output_mode\', \'crosses\', \'crossing_dim\', \'hashing_dim\', \'num_discretization_bins\'], varargs=None, keywords=None, defaults=[\'concat\', \'None\', \'32\', \'32\', \'32\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregationV2.NONE\'], " + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "build_from_config" + argspec: "args=[\'self\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "cross" + argspec: "args=[\'cls\', 
\'feature_names\', \'crossing_dim\', \'output_mode\'], varargs=None, keywords=None, defaults=[\'one_hot\'], " + } + member_method { + name: "feature" + argspec: "args=[\'cls\', \'dtype\', \'preprocessor\', \'output_mode\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "finalize_state" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "float" + argspec: "args=[\'cls\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "float_discretized" + argspec: "args=[\'cls\', \'num_bins\', \'bin_boundaries\', \'output_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'one_hot\', \'None\'], " + } + member_method { + name: "float_normalized" + argspec: "args=[\'cls\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "float_rescaled" + argspec: "args=[\'cls\', \'scale\', \'offset\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.0\', \'None\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_build_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_encoded_features" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_inputs" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "integer_categorical" + argspec: "args=[\'cls\', \'max_tokens\', \'num_oov_indices\', \'output_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'one_hot\', \'None\'], " + } + member_method { + name: "integer_hashed" + argspec: "args=[\'cls\', \'num_bins\', \'output_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'one_hot\', \'None\'], " + } + member_method { + name: "load_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save" + argspec: "args=[\'self\', \'filepath\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "save_own_variables" + argspec: "args=[\'self\', \'store\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { 
+ name: "string_categorical" + argspec: "args=[\'cls\', \'max_tokens\', \'num_oov_indices\', \'output_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'one_hot\', \'None\'], " + } + member_method { + name: "string_hashed" + argspec: "args=[\'cls\', \'num_bins\', \'output_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'one_hot\', \'None\'], " + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.utils.-sidecar-evaluator.pbtxt b/keras/api/golden/v2/tensorflow.keras.utils.-sidecar-evaluator.pbtxt index 4161e90e916b..1d3a83fa52eb 100644 --- a/keras/api/golden/v2/tensorflow.keras.utils.-sidecar-evaluator.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.utils.-sidecar-evaluator.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.utils.SidecarEvaluator" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.utils.-steps-per-execution-tuner.pbtxt b/keras/api/golden/v2/tensorflow.keras.utils.-steps-per-execution-tuner.pbtxt new file mode 100644 index 000000000000..1363d2190e1e --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.utils.-steps-per-execution-tuner.pbtxt @@ -0,0 +1,21 @@ +path: "tensorflow.keras.utils.StepsPerExecutionTuner" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "steps_per_execution" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'optimizer\', \'spe_variable\', \'interval\', \'change_spe_interval\', \'change_threshold\'], varargs=None, keywords=None, defaults=[\'5\', \'10\', \'0.1\'], " + } + member_method { + name: "start" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.utils.-timed-thread.pbtxt b/keras/api/golden/v2/tensorflow.keras.utils.-timed-thread.pbtxt new file mode 100644 index 000000000000..bd3947c59a52 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.utils.-timed-thread.pbtxt @@ -0,0 +1,25 @@ +path: "tensorflow.keras.utils.TimedThread" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'interval\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "is_alive" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "on_interval" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "start" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "stop" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.utils.custom_object_scope.pbtxt b/keras/api/golden/v2/tensorflow.keras.utils.custom_object_scope.pbtxt index 4fa8c7af04e4..08f84e0f825f 100644 --- a/keras/api/golden/v2/tensorflow.keras.utils.custom_object_scope.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.utils.custom_object_scope.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.utils.custom_object_scope" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/keras/api/golden/v2/tensorflow.keras.utils.legacy.pbtxt 
b/keras/api/golden/v2/tensorflow.keras.utils.legacy.pbtxt new file mode 100644 index 000000000000..267629bf49c2 --- /dev/null +++ b/keras/api/golden/v2/tensorflow.keras.utils.legacy.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.keras.utils.legacy" +tf_module { + member_method { + name: "deserialize_keras_object" + argspec: "args=[\'identifier\', \'module_objects\', \'custom_objects\', \'printable_module_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'object\'], " + } + member_method { + name: "serialize_keras_object" + argspec: "args=[\'instance\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/keras/api/golden/v2/tensorflow.keras.utils.pbtxt b/keras/api/golden/v2/tensorflow.keras.utils.pbtxt index 18dc92498862..b084948598ba 100644 --- a/keras/api/golden/v2/tensorflow.keras.utils.pbtxt +++ b/keras/api/golden/v2/tensorflow.keras.utils.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "CustomObjectScope" mtype: "" } + member { + name: "FeatureSpace" + mtype: "" + } member { name: "GeneratorEnqueuer" mtype: "" @@ -28,6 +32,14 @@ tf_module { name: "SidecarEvaluator" mtype: "" } + member { + name: "StepsPerExecutionTuner" + mtype: "" + } + member { + name: "TimedThread" + mtype: "" + } member { name: "custom_object_scope" mtype: "" @@ -36,6 +48,10 @@ tf_module { name: "experimental" mtype: "" } + member { + name: "legacy" + mtype: "" + } member_method { name: "array_to_img" argspec: "args=[\'x\', \'data_format\', \'scale\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " @@ -46,7 +62,7 @@ tf_module { } member_method { name: "deserialize_keras_object" - argspec: "args=[\'identifier\', \'module_objects\', \'custom_objects\', \'printable_module_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'object\'], " + argspec: "args=[\'config\', \'custom_objects\', \'safe_mode\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\'], " } member_method { name: "disable_interactive_logging" @@ -94,7 +110,7 @@ tf_module { } member_method { name: "model_to_dot" - argspec: "args=[\'model\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', \'expand_nested\', \'dpi\', \'subgraph\', \'layer_range\', \'show_layer_activations\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'False\', \'None\', \'False\'], " + argspec: "args=[\'model\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', \'expand_nested\', \'dpi\', \'subgraph\', \'layer_range\', \'show_layer_activations\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'False\', \'None\', \'False\', \'False\'], " } member_method { name: "normalize" @@ -110,7 +126,7 @@ tf_module { } member_method { name: "plot_model" - argspec: "args=[\'model\', \'to_file\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', \'expand_nested\', \'dpi\', \'layer_range\', \'show_layer_activations\'], varargs=None, keywords=None, defaults=[\'model.png\', \'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'None\', \'False\'], " + argspec: "args=[\'model\', \'to_file\', \'show_shapes\', \'show_dtype\', \'show_layer_names\', \'rankdir\', \'expand_nested\', \'dpi\', \'layer_range\', \'show_layer_activations\', \'show_trainable\'], varargs=None, keywords=None, defaults=[\'model.png\', \'False\', \'False\', \'True\', \'TB\', \'False\', \'96\', \'None\', \'False\', \'False\'], " } member_method { name: 
"register_keras_serializable" @@ -122,7 +138,7 @@ tf_module { } member_method { name: "serialize_keras_object" - argspec: "args=[\'instance\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'obj\'], varargs=None, keywords=None, defaults=None" } member_method { name: "set_random_seed" @@ -144,8 +160,16 @@ tf_module { name: "to_categorical" argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], " } + member_method { + name: "to_ordinal" + argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], " + } member_method { name: "unpack_x_y_sample_weight" argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "warmstart_embedding_matrix" + argspec: "args=[\'base_vocabulary\', \'new_vocabulary\', \'base_embeddings\', \'new_embeddings_initializer\'], varargs=None, keywords=None, defaults=[\'uniform\'], " + } } diff --git a/keras/api/golden/v2/tensorflow.keras.wrappers.pbtxt b/keras/api/golden/v2/tensorflow.keras.wrappers.pbtxt deleted file mode 100644 index 0b2fac9b7d99..000000000000 --- a/keras/api/golden/v2/tensorflow.keras.wrappers.pbtxt +++ /dev/null @@ -1,7 +0,0 @@ -path: "tensorflow.keras.wrappers" -tf_module { - member { - name: "scikit_learn" - mtype: "" - } -} diff --git a/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt b/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt deleted file mode 100644 index 180e05527f31..000000000000 --- a/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt +++ /dev/null @@ -1,42 +0,0 @@ -path: "tensorflow.keras.wrappers.scikit_learn.KerasClassifier" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'build_fn\'], varargs=None, keywords=sk_params, defaults=[\'None\'], " - } - member_method { - name: "check_params" - argspec: "args=[\'self\', \'params\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "filter_sk_params" - argspec: "args=[\'self\', \'fn\', \'override\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "fit" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "get_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } - member_method { - name: "predict" - argspec: "args=[\'self\', \'x\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "predict_proba" - argspec: "args=[\'self\', \'x\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "score" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "set_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } -} diff --git a/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt b/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt deleted file mode 100644 index 0dfc03fb05e5..000000000000 --- a/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt +++ /dev/null @@ -1,38 +0,0 @@ -path: "tensorflow.keras.wrappers.scikit_learn.KerasRegressor" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: 
"args=[\'self\', \'build_fn\'], varargs=None, keywords=sk_params, defaults=[\'None\'], " - } - member_method { - name: "check_params" - argspec: "args=[\'self\', \'params\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "filter_sk_params" - argspec: "args=[\'self\', \'fn\', \'override\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "fit" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "get_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } - member_method { - name: "predict" - argspec: "args=[\'self\', \'x\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "score" - argspec: "args=[\'self\', \'x\', \'y\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "set_params" - argspec: "args=[\'self\'], varargs=None, keywords=params, defaults=None" - } -} diff --git a/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.pbtxt b/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.pbtxt deleted file mode 100644 index fbd4d13387a9..000000000000 --- a/keras/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.pbtxt +++ /dev/null @@ -1,11 +0,0 @@ -path: "tensorflow.keras.wrappers.scikit_learn" -tf_module { - member { - name: "KerasClassifier" - mtype: "" - } - member { - name: "KerasRegressor" - mtype: "" - } -} diff --git a/keras/api/tests/BUILD b/keras/api/tests/BUILD index 3077ff5e6443..951ec210e8b3 100644 --- a/keras/api/tests/BUILD +++ b/keras/api/tests/BUILD @@ -3,6 +3,7 @@ load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = ["//keras/api:__subpackages__"], licenses = ["notice"], # Apache 2.0 ) @@ -32,8 +33,7 @@ tf_py_test( deps = [ "//:expect_six_installed", "//third_party/py/tensorflow", - "//third_party/tensorflow/python:lib", - "//third_party/tensorflow/python:platform", + "//third_party/tensorflow/python/lib/io:lib", "//third_party/tensorflow/tools/api/lib:python_object_to_proto_visitor", "//third_party/tensorflow/tools/common:public_api", "//third_party/tensorflow/tools/common:traverse", diff --git a/keras/api/tests/api_compatibility_test.py b/keras/api/tests/api_compatibility_test.py index 2aa1e357a00a..10e31601abdb 100644 --- a/keras/api/tests/api_compatibility_test.py +++ b/keras/api/tests/api_compatibility_test.py @@ -27,26 +27,25 @@ from __future__ import division from __future__ import print_function -import tensorflow as tf - import argparse import os import re import sys import six +import tensorflow as tf +# isort: off from google.protobuf import message from google.protobuf import text_format - from tensorflow.python.lib.io import file_io -from tensorflow.python.platform import tf_logging as logging from tensorflow.tools.api.lib import api_objects_pb2 -from tensorflow.tools.api.lib import python_object_to_proto_visitor +from tensorflow.tools.api.lib import ( + python_object_to_proto_visitor, +) from tensorflow.tools.common import public_api from tensorflow.tools.common import traverse - # FLAGS defined at the bottom: FLAGS = None # DEFINE_boolean, update_goldens, default False: @@ -67,304 +66,354 @@ def _InitPathConstants(): - global _API_GOLDEN_FOLDER_V1 - global _API_GOLDEN_FOLDER_V2 - root_golden_path_v2 = os.path.join( - tf.compat.v1.resource_loader.get_data_files_path(), - '..', 'golden', 'v2', 'tensorflow.keras.pbtxt') - - if 
FLAGS.update_goldens: - root_golden_path_v2 = os.path.realpath(root_golden_path_v2) - # Get API directories based on the root golden file. This way - # we make sure to resolve symbolic links before creating new files. - _API_GOLDEN_FOLDER_V2 = os.path.dirname(root_golden_path_v2) - _API_GOLDEN_FOLDER_V1 = os.path.normpath( - os.path.join(_API_GOLDEN_FOLDER_V2, '..', 'v1')) + global _API_GOLDEN_FOLDER_V1 + global _API_GOLDEN_FOLDER_V2 + root_golden_path_v2 = os.path.join( + tf.compat.v1.resource_loader.get_data_files_path(), + "..", + "golden", + "v2", + "tensorflow.keras.pbtxt", + ) + + if FLAGS.update_goldens: + root_golden_path_v2 = os.path.realpath(root_golden_path_v2) + # Get API directories based on the root golden file. This way + # we make sure to resolve symbolic links before creating new files. + _API_GOLDEN_FOLDER_V2 = os.path.dirname(root_golden_path_v2) + _API_GOLDEN_FOLDER_V1 = os.path.normpath( + os.path.join(_API_GOLDEN_FOLDER_V2, "..", "v1") + ) _TEST_README_FILE = os.path.join( - tf.compat.v1.resource_loader.get_data_files_path(), 'README.txt') + tf.compat.v1.resource_loader.get_data_files_path(), "README.txt" +) _UPDATE_WARNING_FILE = os.path.join( - tf.compat.v1.resource_loader.get_data_files_path(), - 'API_UPDATE_WARNING.txt') + tf.compat.v1.resource_loader.get_data_files_path(), "API_UPDATE_WARNING.txt" +) def _KeyToFilePath(key, api_version): - """From a given key, construct a filepath. + """From a given key, construct a filepath. - Filepath will be inside golden folder for api_version. + Filepath will be inside golden folder for api_version. - Args: - key: a string used to determine the file path - api_version: a number indicating the tensorflow API version, e.g. 1 or 2. + Args: + key: a string used to determine the file path + api_version: a number indicating the tensorflow API version, e.g. 1 or 2. 
- Returns: - A string of file path to the pbtxt file which describes the public API - """ + Returns: + A string of file path to the pbtxt file which describes the public API + """ - def _ReplaceCapsWithDash(matchobj): - match = matchobj.group(0) - return '-%s' % (match.lower()) + def _ReplaceCapsWithDash(matchobj): + match = matchobj.group(0) + return f"-{match.lower()}" - case_insensitive_key = re.sub('([A-Z]{1})', _ReplaceCapsWithDash, - six.ensure_str(key)) - api_folder = ( - _API_GOLDEN_FOLDER_V2 if api_version == 2 else _API_GOLDEN_FOLDER_V1) - return os.path.join(api_folder, '%s.pbtxt' % case_insensitive_key) + case_insensitive_key = re.sub( + "([A-Z]{1})", _ReplaceCapsWithDash, six.ensure_str(key) + ) + api_folder = ( + _API_GOLDEN_FOLDER_V2 if api_version == 2 else _API_GOLDEN_FOLDER_V1 + ) + return os.path.join(api_folder, f"{case_insensitive_key}.pbtxt") def _FileNameToKey(filename): - """From a given filename, construct a key we use for api objects.""" + """From a given filename, construct a key we use for api objects.""" - def _ReplaceDashWithCaps(matchobj): - match = matchobj.group(0) - return match[1].upper() + def _ReplaceDashWithCaps(matchobj): + match = matchobj.group(0) + return match[1].upper() - base_filename = os.path.basename(filename) - base_filename_without_ext = os.path.splitext(base_filename)[0] - api_object_key = re.sub('((-[a-z]){1})', _ReplaceDashWithCaps, - six.ensure_str(base_filename_without_ext)) - return api_object_key + base_filename = os.path.basename(filename) + base_filename_without_ext = os.path.splitext(base_filename)[0] + api_object_key = re.sub( + "((-[a-z]){1})", + _ReplaceDashWithCaps, + six.ensure_str(base_filename_without_ext), + ) + return api_object_key def _VerifyNoSubclassOfMessageVisitor(path, parent, unused_children): - """A Visitor that crashes on subclasses of generated proto classes.""" - # If the traversed object is a proto Message class - if not (isinstance(parent, type) and issubclass(parent, message.Message)): - return - if parent is message.Message: - return - # Check that it is a direct subclass of Message. - if message.Message not in parent.__bases__: - raise NotImplementedError( - 'Object tf.%s is a subclass of a generated proto Message. ' - 'They are not yet supported by the API tools.' % path) + """A Visitor that crashes on subclasses of generated proto classes.""" + # If the traversed object is a proto Message class + if not (isinstance(parent, type) and issubclass(parent, message.Message)): + return + if parent is message.Message: + return + # Check that it is a direct subclass of Message. + if message.Message not in parent.__bases__: + raise NotImplementedError( + "Object tf.%s is a subclass of a generated proto Message. " + "They are not yet supported by the API tools." 
% path + ) def _FilterGoldenProtoDict(golden_proto_dict, omit_golden_symbols_map): - """Filter out golden proto dict symbols that should be omitted.""" - if not omit_golden_symbols_map: - return golden_proto_dict - filtered_proto_dict = dict(golden_proto_dict) - for key, symbol_list in six.iteritems(omit_golden_symbols_map): - api_object = api_objects_pb2.TFAPIObject() - api_object.CopyFrom(filtered_proto_dict[key]) - filtered_proto_dict[key] = api_object - module_or_class = None - if api_object.HasField('tf_module'): - module_or_class = api_object.tf_module - elif api_object.HasField('tf_class'): - module_or_class = api_object.tf_class - if module_or_class is not None: - for members in (module_or_class.member, module_or_class.member_method): - filtered_members = [m for m in members if m.name not in symbol_list] - # Two steps because protobuf repeated fields disallow slice assignment. - del members[:] - members.extend(filtered_members) - return filtered_proto_dict + """Filter out golden proto dict symbols that should be omitted.""" + if not omit_golden_symbols_map: + return golden_proto_dict + filtered_proto_dict = dict(golden_proto_dict) + for key, symbol_list in six.iteritems(omit_golden_symbols_map): + api_object = api_objects_pb2.TFAPIObject() + api_object.CopyFrom(filtered_proto_dict[key]) + filtered_proto_dict[key] = api_object + module_or_class = None + if api_object.HasField("tf_module"): + module_or_class = api_object.tf_module + elif api_object.HasField("tf_class"): + module_or_class = api_object.tf_class + if module_or_class is not None: + for members in ( + module_or_class.member, + module_or_class.member_method, + ): + filtered_members = [ + m for m in members if m.name not in symbol_list + ] + # Two steps because protobuf repeated fields disallow slice + # assignment. + del members[:] + members.extend(filtered_members) + return filtered_proto_dict class ApiCompatibilityTest(tf.test.TestCase): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self._update_golden_warning = file_io.read_file_to_string( - _UPDATE_WARNING_FILE) - - self._test_readme_message = file_io.read_file_to_string(_TEST_README_FILE) - - def _AssertProtoDictEquals(self, - expected_dict, - actual_dict, - verbose=False, - update_goldens=False, - additional_missing_object_message='', - api_version=2): - """Diff given dicts of protobufs and report differences a readable way. - - Args: - expected_dict: a dict of TFAPIObject protos constructed from golden files. - actual_dict: a ict of TFAPIObject protos constructed by reading from the - TF package linked to the test. - verbose: Whether to log the full diffs, or simply report which files were - different. - update_goldens: Whether to update goldens when there are diffs found. - additional_missing_object_message: Message to print when a symbol is - missing. - api_version: TensorFlow API version to test. - """ - diffs = [] - verbose_diffs = [] - - expected_keys = set(expected_dict.keys()) - actual_keys = set(actual_dict.keys()) - only_in_expected = expected_keys - actual_keys - only_in_actual = actual_keys - expected_keys - all_keys = expected_keys | actual_keys - - # This will be populated below. - updated_keys = [] - - for key in all_keys: - diff_message = '' - verbose_diff_message = '' - # First check if the key is not found in one or the other. - if key in only_in_expected: - diff_message = 'Object %s expected but not found (removed). 
%s' % ( - key, additional_missing_object_message) - verbose_diff_message = diff_message - elif key in only_in_actual: - diff_message = 'New object %s found (added).' % key - verbose_diff_message = diff_message - else: - # Do not truncate diff - self.maxDiff = None # pylint: disable=invalid-name - # Now we can run an actual proto diff. - try: - self.assertProtoEquals(expected_dict[key], actual_dict[key]) - except AssertionError as e: - updated_keys.append(key) - diff_message = 'Change detected in python object: %s.' % key - verbose_diff_message = str(e) - - # All difference cases covered above. If any difference found, add to the - # list. - if diff_message: - diffs.append(diff_message) - verbose_diffs.append(verbose_diff_message) - - # If diffs are found, handle them based on flags. - if diffs: - diff_count = len(diffs) - logging.error(self._test_readme_message) - logging.error('%d differences found between API and golden.', diff_count) - - if update_goldens: - # Write files if requested. - logging.warning(self._update_golden_warning) - - # If the keys are only in expected, some objects are deleted. - # Remove files. - for key in only_in_expected: - filepath = _KeyToFilePath(key, api_version) - tf.io.gfile.remove(filepath) - - # If the files are only in actual (current library), these are new - # modules. Write them to files. Also record all updates in files. - for key in only_in_actual | set(updated_keys): - filepath = _KeyToFilePath(key, api_version) - file_io.write_string_to_file( - filepath, text_format.MessageToString(actual_dict[key])) - else: - # Include the actual differences to help debugging. - for d, verbose_d in zip(diffs, verbose_diffs): - logging.error(' %s', d) - logging.error(' %s', verbose_d) - # Fail if we cannot fix the test by updating goldens. - self.fail('%d differences found between API and golden.' % diff_count) - - else: - logging.info('No differences found between API and golden.') - - def _checkBackwardsCompatibility(self, - root, - golden_file_patterns, - api_version, - additional_private_map=None, - omit_golden_symbols_map=None): - # Extract all API stuff. - visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor( - default_path='tensorflow.keras') - - public_api_visitor = public_api.PublicAPIVisitor(visitor) - if additional_private_map: - public_api_visitor.private_map.update(additional_private_map) - public_api_visitor.set_root_name('tf.keras') - - traverse.traverse(root, public_api_visitor) - proto_dict = visitor.GetProtos() - - # Read all golden files. - golden_file_list = tf.compat.v1.gfile.Glob(golden_file_patterns) - - def _ReadFileToProto(filename): - """Read a filename, create a protobuf from its contents.""" - ret_val = api_objects_pb2.TFAPIObject() - text_format.Merge(file_io.read_file_to_string(filename), ret_val) - return ret_val - - golden_proto_dict = { - _FileNameToKey(filename): _ReadFileToProto(filename) - for filename in golden_file_list - } - golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict, - omit_golden_symbols_map) - - # Diff them. Do not fail if called with update. - # If the test is run to update goldens, only report diffs but do not fail. 
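An aside on the golden changes at the top of this section: the updated argspecs give `serialize`/`deserialize` a `use_legacy_format` flag across optimizers, schedules, and regularizers, alongside the new `safe_mode` argument on `tf.keras.saving.load_model` and `deserialize_keras_object`. A minimal sketch of what those argspecs permit, assuming a TensorFlow build that already includes these changes; the choice of optimizer is illustrative only:

```python
import tensorflow as tf

opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Per the updated goldens, serialize() takes use_legacy_format (default
# False); passing True requests the pre-change config format.
new_cfg = tf.keras.optimizers.serialize(opt)
legacy_cfg = tf.keras.optimizers.serialize(opt, use_legacy_format=True)

# deserialize() accepts the matching flag, so either config round-trips.
restored = tf.keras.optimizers.deserialize(new_cfg)
legacy_restored = tf.keras.optimizers.deserialize(
    legacy_cfg, use_legacy_format=True
)
```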
- self._AssertProtoDictEquals( - golden_proto_dict, - proto_dict, - verbose=FLAGS.verbose_diffs, - update_goldens=FLAGS.update_goldens, - api_version=api_version) - - def testAPIBackwardsCompatibility(self): - api_version = 1 - if hasattr(tf, '_major_api_version') and tf._major_api_version == 2: - api_version = 2 - golden_file_patterns = [ - os.path.join( - tf.compat.v1.resource_loader.get_root_dir_with_all_resources(), - _KeyToFilePath('*', api_version))] - - self._checkBackwardsCompatibility( - tf.keras, + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._update_golden_warning = file_io.read_file_to_string( + _UPDATE_WARNING_FILE + ) + + self._test_readme_message = file_io.read_file_to_string( + _TEST_README_FILE + ) + + def _AssertProtoDictEquals( + self, + expected_dict, + actual_dict, + verbose=False, + update_goldens=False, + additional_missing_object_message="", + api_version=2, + ): + """Diff given dicts of protobufs and report differences a readable way. + + Args: + expected_dict: a dict of TFAPIObject protos constructed from golden + files. + actual_dict: a ict of TFAPIObject protos constructed by reading from + the TF package linked to the test. + verbose: Whether to log the full diffs, or simply report which files + were different. + update_goldens: Whether to update goldens when there are diffs found. + additional_missing_object_message: Message to print when a symbol is + missing. + api_version: TensorFlow API version to test. + """ + diffs = [] + verbose_diffs = [] + + expected_keys = set(expected_dict.keys()) + actual_keys = set(actual_dict.keys()) + only_in_expected = expected_keys - actual_keys + only_in_actual = actual_keys - expected_keys + all_keys = expected_keys | actual_keys + + # This will be populated below. + updated_keys = [] + + for key in all_keys: + diff_message = "" + verbose_diff_message = "" + # First check if the key is not found in one or the other. + if key in only_in_expected: + diff_message = ( + "Object %s expected but not found (removed). %s" + % (key, additional_missing_object_message) + ) + verbose_diff_message = diff_message + elif key in only_in_actual: + diff_message = f"New object {key} found (added)." + verbose_diff_message = diff_message + else: + # Do not truncate diff + self.maxDiff = None + # Now we can run an actual proto diff. + try: + self.assertProtoEquals(expected_dict[key], actual_dict[key]) + except AssertionError as e: + updated_keys.append(key) + diff_message = f"Change detected in python object: {key}." + verbose_diff_message = str(e) + + # All difference cases covered above. If any difference found, add + # to the list. + if diff_message: + diffs.append(diff_message) + verbose_diffs.append(verbose_diff_message) + + # If diffs are found, handle them based on flags. + if diffs: + diff_count = len(diffs) + tf.compat.v1.logging.error(self._test_readme_message) + tf.compat.v1.logging.error( + "%d differences found between API and golden.", diff_count + ) + + if update_goldens: + # Write files if requested. + tf.compat.v1.logging.warning(self._update_golden_warning) + + # If the keys are only in expected, some objects are deleted. + # Remove files. + for key in only_in_expected: + filepath = _KeyToFilePath(key, api_version) + tf.io.gfile.remove(filepath) + + # If the files are only in actual (current library), these are + # new modules. Write them to files. Also record all updates in + # files. 
+ for key in only_in_actual | set(updated_keys): + filepath = _KeyToFilePath(key, api_version) + file_io.write_string_to_file( + filepath, text_format.MessageToString(actual_dict[key]) + ) + else: + # Include the actual differences to help debugging. + for d, verbose_d in zip(diffs, verbose_diffs): + tf.compat.v1.logging.error(" %s", d) + tf.compat.v1.logging.error(" %s", verbose_d) + # Fail if we cannot fix the test by updating goldens. + self.fail( + "%d differences found between API and golden." % diff_count + ) + + else: + tf.compat.v1.logging.info( + "No differences found between API and golden." + ) + + def _checkBackwardsCompatibility( + self, + root, golden_file_patterns, api_version, - # Skip compat.v1 and compat.v2 since they are validated - # in separate tests. - additional_private_map={'tf.compat': ['v1', 'v2']}, - omit_golden_symbols_map={}) - - def testAPIBackwardsCompatibilityV1(self): - api_version = 1 - golden_file_patterns = os.path.join( - tf.compat.v1.resource_loader.get_root_dir_with_all_resources(), - _KeyToFilePath('*', api_version)) - self._checkBackwardsCompatibility( - tf.compat.v1.keras, - golden_file_patterns, - api_version, - additional_private_map={ - 'tf': ['pywrap_tensorflow'], - 'tf.compat': ['v1', 'v2'], - }, - omit_golden_symbols_map={}) - - def testAPIBackwardsCompatibilityV2(self): - api_version = 2 - golden_file_patterns = [os.path.join( - tf.compat.v1.resource_loader.get_root_dir_with_all_resources(), - _KeyToFilePath('*', api_version))] - self._checkBackwardsCompatibility( - tf.compat.v2.keras, - golden_file_patterns, - api_version, - additional_private_map={'tf.compat': ['v1', 'v2']}, - omit_golden_symbols_map={}) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--update_goldens', type=bool, default=False, help=_UPDATE_GOLDENS_HELP) - parser.add_argument( - '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP) - FLAGS, unparsed = parser.parse_known_args() - _InitPathConstants() - - # Now update argv, so that unittest library does not get confused. - sys.argv = [sys.argv[0]] + unparsed - tf.test.main() + additional_private_map=None, + omit_golden_symbols_map=None, + ): + # Extract all API stuff. + visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor( + default_path="tensorflow.keras" + ) + + public_api_visitor = public_api.PublicAPIVisitor(visitor) + if additional_private_map: + public_api_visitor.private_map.update(additional_private_map) + public_api_visitor.set_root_name("tf.keras") + + traverse.traverse(root, public_api_visitor) + proto_dict = visitor.GetProtos() + + # Read all golden files. + golden_file_list = tf.compat.v1.gfile.Glob(golden_file_patterns) + + def _ReadFileToProto(filename): + """Read a filename, create a protobuf from its contents.""" + ret_val = api_objects_pb2.TFAPIObject() + text_format.Merge(file_io.read_file_to_string(filename), ret_val) + return ret_val + + golden_proto_dict = { + _FileNameToKey(filename): _ReadFileToProto(filename) + for filename in golden_file_list + } + golden_proto_dict = _FilterGoldenProtoDict( + golden_proto_dict, omit_golden_symbols_map + ) + + # Diff them. Do not fail if called with update. + # If the test is run to update goldens, only report diffs but do not + # fail. 
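The `_KeyToFilePath`/`_FileNameToKey` helpers above are what produce the unusual golden filenames seen throughout this diff, e.g. `tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt` for `...legacy.SGD`. A standalone re-derivation of the two regex transforms; the function names here are illustrative and the test's golden-folder resolution is omitted:

```python
import re

def key_to_filename(key):
    # Mirrors _ReplaceCapsWithDash: every capital becomes "-<lowercase>".
    dashed = re.sub("([A-Z]{1})", lambda m: f"-{m.group(0).lower()}", key)
    return f"{dashed}.pbtxt"

def filename_to_key(base_filename):
    # Mirrors _ReplaceDashWithCaps: "-x" turns back into "X".
    base = base_filename.rsplit(".pbtxt", 1)[0]
    return re.sub("((-[a-z]){1})", lambda m: m.group(0)[1].upper(), base)

assert (
    key_to_filename("tensorflow.keras.optimizers.legacy.SGD")
    == "tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt"
)
assert (
    filename_to_key("tensorflow.keras.optimizers.legacy.-s-g-d.pbtxt")
    == "tensorflow.keras.optimizers.legacy.SGD"
)
```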
+ self._AssertProtoDictEquals( + golden_proto_dict, + proto_dict, + verbose=FLAGS.verbose_diffs, + update_goldens=FLAGS.update_goldens, + api_version=api_version, + ) + + def testAPIBackwardsCompatibility(self): + api_version = 1 + if hasattr(tf, "_major_api_version") and tf._major_api_version == 2: + api_version = 2 + golden_file_patterns = [ + os.path.join( + tf.compat.v1.resource_loader.get_root_dir_with_all_resources(), + _KeyToFilePath("*", api_version), + ) + ] + + self._checkBackwardsCompatibility( + tf.keras, + golden_file_patterns, + api_version, + # Skip compat.v1 and compat.v2 since they are validated + # in separate tests. + additional_private_map={"tf.compat": ["v1", "v2"]}, + omit_golden_symbols_map={}, + ) + + def testAPIBackwardsCompatibilityV1(self): + api_version = 1 + golden_file_patterns = os.path.join( + tf.compat.v1.resource_loader.get_root_dir_with_all_resources(), + _KeyToFilePath("*", api_version), + ) + self._checkBackwardsCompatibility( + tf.compat.v1.keras, + golden_file_patterns, + api_version, + additional_private_map={ + "tf": ["pywrap_tensorflow"], + "tf.compat": ["v1", "v2"], + }, + omit_golden_symbols_map={}, + ) + + def testAPIBackwardsCompatibilityV2(self): + api_version = 2 + golden_file_patterns = [ + os.path.join( + tf.compat.v1.resource_loader.get_root_dir_with_all_resources(), + _KeyToFilePath("*", api_version), + ) + ] + self._checkBackwardsCompatibility( + tf.compat.v2.keras, + golden_file_patterns, + api_version, + additional_private_map={"tf.compat": ["v1", "v2"]}, + omit_golden_symbols_map={}, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--update_goldens", type=bool, default=False, help=_UPDATE_GOLDENS_HELP + ) + parser.add_argument( + "--verbose_diffs", type=bool, default=True, help=_VERBOSE_DIFFS_HELP + ) + FLAGS, unparsed = parser.parse_known_args() + _InitPathConstants() + + # Now update argv, so that unittest library does not get confused. + sys.argv = [sys.argv[0]] + unparsed + tf.test.main() diff --git a/keras/applications/BUILD b/keras/applications/BUILD index b9960fb8bad4..90969468ef99 100644 --- a/keras/applications/BUILD +++ b/keras/applications/BUILD @@ -1,9 +1,11 @@ # Description: # Contains the Keras Application package (internal TensorFlow version). +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ # Remove this deps to integration test. "//keras:friends", @@ -53,7 +55,7 @@ tf_py_test( name = "applications_test", size = "medium", srcs = ["applications_test.py"], - shard_count = 40, + shard_count = 50, tags = [ "no_rocm", "notsan", # b/168814536 diff --git a/keras/applications/__init__.py b/keras/applications/__init__.py index ac88213e2c8c..c08ee2843fda 100644 --- a/keras/applications/__init__.py +++ b/keras/applications/__init__.py @@ -13,18 +13,16 @@ # limitations under the License. 
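Stripped of the `file_io` indirection, the golden I/O in `api_compatibility_test.py` above is a protobuf text-format round trip: `_ReadFileToProto` parses with `text_format.Merge`, and the `--update_goldens` branch writes with `text_format.MessageToString`. A sketch under the assumption of a source checkout where `tensorflow.tools` is importable; `read_golden`/`write_golden` are illustrative names, not part of the test:

```python
from google.protobuf import text_format
from tensorflow.tools.api.lib import api_objects_pb2

def read_golden(path):
    # Parse one golden .pbtxt into a TFAPIObject, like _ReadFileToProto.
    proto = api_objects_pb2.TFAPIObject()
    with open(path) as f:
        text_format.Merge(f.read(), proto)
    return proto

def write_golden(path, proto):
    # The --update_goldens branch rewrites goldens essentially this way.
    with open(path, "w") as f:
        f.write(text_format.MessageToString(proto))
```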
# ============================================================================== """Keras Applications are premade architectures with pre-trained weights.""" -# pylint: disable=g-bad-import-order -from keras.applications.convnext import ConvNeXtTiny -from keras.applications.convnext import ConvNeXtSmall + from keras.applications.convnext import ConvNeXtBase from keras.applications.convnext import ConvNeXtLarge +from keras.applications.convnext import ConvNeXtSmall +from keras.applications.convnext import ConvNeXtTiny from keras.applications.convnext import ConvNeXtXLarge - from keras.applications.densenet import DenseNet121 from keras.applications.densenet import DenseNet169 from keras.applications.densenet import DenseNet201 - from keras.applications.efficientnet import EfficientNetB0 from keras.applications.efficientnet import EfficientNetB1 from keras.applications.efficientnet import EfficientNetB2 @@ -33,7 +31,6 @@ from keras.applications.efficientnet import EfficientNetB5 from keras.applications.efficientnet import EfficientNetB6 from keras.applications.efficientnet import EfficientNetB7 - from keras.applications.efficientnet_v2 import EfficientNetV2B0 from keras.applications.efficientnet_v2 import EfficientNetV2B1 from keras.applications.efficientnet_v2 import EfficientNetV2B2 @@ -41,25 +38,17 @@ from keras.applications.efficientnet_v2 import EfficientNetV2L from keras.applications.efficientnet_v2 import EfficientNetV2M from keras.applications.efficientnet_v2 import EfficientNetV2S - from keras.applications.inception_resnet_v2 import InceptionResNetV2 from keras.applications.inception_v3 import InceptionV3 - from keras.applications.mobilenet import MobileNet from keras.applications.mobilenet_v2 import MobileNetV2 -from keras.applications.mobilenet_v3 import MobileNetV3Small from keras.applications.mobilenet_v3 import MobileNetV3Large - +from keras.applications.mobilenet_v3 import MobileNetV3Small from keras.applications.nasnet import NASNetLarge from keras.applications.nasnet import NASNetMobile - from keras.applications.resnet import ResNet50 from keras.applications.resnet import ResNet101 from keras.applications.resnet import ResNet152 -from keras.applications.resnet_v2 import ResNet50V2 -from keras.applications.resnet_v2 import ResNet101V2 -from keras.applications.resnet_v2 import ResNet152V2 - from keras.applications.resnet_rs import ResNetRS50 from keras.applications.resnet_rs import ResNetRS101 from keras.applications.resnet_rs import ResNetRS152 @@ -67,8 +56,9 @@ from keras.applications.resnet_rs import ResNetRS270 from keras.applications.resnet_rs import ResNetRS350 from keras.applications.resnet_rs import ResNetRS420 - +from keras.applications.resnet_v2 import ResNet50V2 +from keras.applications.resnet_v2 import ResNet101V2 +from keras.applications.resnet_v2 import ResNet152V2 from keras.applications.vgg16 import VGG16 from keras.applications.vgg19 import VGG19 - from keras.applications.xception import Xception diff --git a/keras/applications/applications_load_weight_test.py b/keras/applications/applications_load_weight_test.py index 42ff88fd1bc7..875f0e4cd3e2 100644 --- a/keras/applications/applications_load_weight_test.py +++ b/keras/applications/applications_load_weight_test.py @@ -14,11 +14,10 @@ # ============================================================================== """Integration tests for Keras applications.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl import flags from absl.testing import parameterized -import numpy as np from 
keras.applications import convnext from keras.applications import densenet @@ -40,62 +39,112 @@ from keras.utils import data_utils from keras.utils import image_utils - ARG_TO_MODEL = { - 'resnet': (resnet, [resnet.ResNet50, resnet.ResNet101, resnet.ResNet152]), - 'resnet_v2': - (resnet_v2, - [resnet_v2.ResNet50V2, resnet_v2.ResNet101V2, resnet_v2.ResNet152V2]), - 'vgg16': (vgg16, [vgg16.VGG16]), - 'vgg19': (vgg19, [vgg19.VGG19]), - 'xception': (xception, [xception.Xception]), - 'inception_v3': (inception_v3, [inception_v3.InceptionV3]), - 'inception_resnet_v2': - (inception_resnet_v2, [inception_resnet_v2.InceptionResNetV2]), - 'mobilenet': (mobilenet, [mobilenet.MobileNet]), - 'mobilenet_v2': (mobilenet_v2, [mobilenet_v2.MobileNetV2]), - 'mobilenet_v3_small': (mobilenet_v3, [mobilenet_v3.MobileNetV3Small]), - 'mobilenet_v3_large': (mobilenet_v3, [mobilenet_v3.MobileNetV3Large]), - 'convnext': - (convnext, - [convnext.ConvNeXtTiny, convnext.ConvNeXtSmall, convnext.ConvNeXtBase, - convnext.ConvNeXtLarge, convnext.ConvNeXtXLarge]), - 'densenet': - (densenet, - [densenet.DenseNet121, densenet.DenseNet169, densenet.DenseNet201]), - 'nasnet_mobile': (nasnet, [nasnet.NASNetMobile]), - 'nasnet_large': (nasnet, [nasnet.NASNetLarge]), - 'efficientnet': (efficientnet, [ - efficientnet.EfficientNetB0, efficientnet.EfficientNetB1, - efficientnet.EfficientNetB2, efficientnet.EfficientNetB3, - efficientnet.EfficientNetB4, efficientnet.EfficientNetB5, - efficientnet.EfficientNetB6, efficientnet.EfficientNetB7 - ]), - 'efficientnet_v2': (efficientnet_v2, [ - efficientnet_v2.EfficientNetV2B0, efficientnet_v2.EfficientNetV2B1, - efficientnet_v2.EfficientNetV2B2, efficientnet_v2.EfficientNetV2B3, - efficientnet_v2.EfficientNetV2S, efficientnet_v2.EfficientNetV2M, - efficientnet_v2.EfficientNetV2L - ]), - 'resnet_rs': (resnet_rs, [ - resnet_rs.ResNetRS50, resnet_rs.ResNetRS101, resnet_rs.ResNetRS152, - resnet_rs.ResNetRS200, resnet_rs.ResNetRS270, resnet_rs.ResNetRS350, - resnet_rs.ResNetRS420 - ]), - 'regnet': (regnet, [ - regnet.RegNetX002, regnet.RegNetX004, regnet.RegNetX006, - regnet.RegNetX008, regnet.RegNetX016, regnet.RegNetX032, - regnet.RegNetX040, regnet.RegNetX064, regnet.RegNetX080, - regnet.RegNetX120, regnet.RegNetX160, regnet.RegNetX320, - regnet.RegNetY002, regnet.RegNetY004, regnet.RegNetY006, - regnet.RegNetY008, regnet.RegNetY016, regnet.RegNetY032, - regnet.RegNetY040, regnet.RegNetY064, regnet.RegNetY080, - regnet.RegNetY120, regnet.RegNetY160, regnet.RegNetY320 - ]) + "resnet": (resnet, [resnet.ResNet50, resnet.ResNet101, resnet.ResNet152]), + "resnet_v2": ( + resnet_v2, + [resnet_v2.ResNet50V2, resnet_v2.ResNet101V2, resnet_v2.ResNet152V2], + ), + "vgg16": (vgg16, [vgg16.VGG16]), + "vgg19": (vgg19, [vgg19.VGG19]), + "xception": (xception, [xception.Xception]), + "inception_v3": (inception_v3, [inception_v3.InceptionV3]), + "inception_resnet_v2": ( + inception_resnet_v2, + [inception_resnet_v2.InceptionResNetV2], + ), + "mobilenet": (mobilenet, [mobilenet.MobileNet]), + "mobilenet_v2": (mobilenet_v2, [mobilenet_v2.MobileNetV2]), + "mobilenet_v3_small": (mobilenet_v3, [mobilenet_v3.MobileNetV3Small]), + "mobilenet_v3_large": (mobilenet_v3, [mobilenet_v3.MobileNetV3Large]), + "convnext": ( + convnext, + [ + convnext.ConvNeXtTiny, + convnext.ConvNeXtSmall, + convnext.ConvNeXtBase, + convnext.ConvNeXtLarge, + convnext.ConvNeXtXLarge, + ], + ), + "densenet": ( + densenet, + [densenet.DenseNet121, densenet.DenseNet169, densenet.DenseNet201], + ), + "nasnet_mobile": (nasnet, 
[nasnet.NASNetMobile]), + "nasnet_large": (nasnet, [nasnet.NASNetLarge]), + "efficientnet": ( + efficientnet, + [ + efficientnet.EfficientNetB0, + efficientnet.EfficientNetB1, + efficientnet.EfficientNetB2, + efficientnet.EfficientNetB3, + efficientnet.EfficientNetB4, + efficientnet.EfficientNetB5, + efficientnet.EfficientNetB6, + efficientnet.EfficientNetB7, + ], + ), + "efficientnet_v2": ( + efficientnet_v2, + [ + efficientnet_v2.EfficientNetV2B0, + efficientnet_v2.EfficientNetV2B1, + efficientnet_v2.EfficientNetV2B2, + efficientnet_v2.EfficientNetV2B3, + efficientnet_v2.EfficientNetV2S, + efficientnet_v2.EfficientNetV2M, + efficientnet_v2.EfficientNetV2L, + ], + ), + "resnet_rs": ( + resnet_rs, + [ + resnet_rs.ResNetRS50, + resnet_rs.ResNetRS101, + resnet_rs.ResNetRS152, + resnet_rs.ResNetRS200, + resnet_rs.ResNetRS270, + resnet_rs.ResNetRS350, + resnet_rs.ResNetRS420, + ], + ), + "regnet": ( + regnet, + [ + regnet.RegNetX002, + regnet.RegNetX004, + regnet.RegNetX006, + regnet.RegNetX008, + regnet.RegNetX016, + regnet.RegNetX032, + regnet.RegNetX040, + regnet.RegNetX064, + regnet.RegNetX080, + regnet.RegNetX120, + regnet.RegNetX160, + regnet.RegNetX320, + regnet.RegNetY002, + regnet.RegNetY004, + regnet.RegNetY006, + regnet.RegNetY008, + regnet.RegNetY016, + regnet.RegNetY032, + regnet.RegNetY040, + regnet.RegNetY064, + regnet.RegNetY080, + regnet.RegNetY120, + regnet.RegNetY160, + regnet.RegNetY320, + ], + ), } -TEST_IMAGE_PATH = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/tests/elephant.jpg') +TEST_IMAGE_PATH = ( + "https://storage.googleapis.com/tensorflow/" + "keras-applications/tests/elephant.jpg" +) _IMAGENET_CLASSES = 1000 # Add a flag to define which application module file is tested. @@ -103,48 +152,47 @@ # it only triggers the tests of the application models in the module # if that module file has been modified. FLAGS = flags.FLAGS -flags.DEFINE_string('module', None, - 'Application module used in this test.') +flags.DEFINE_string("module", None, "Application module used in this test.") def _get_elephant(target_size): - # For models that don't include a Flatten step, - # the default is to accept variable-size inputs - # even when loading ImageNet weights (since it is possible). - # In this case, default to 299x299. - if target_size[0] is None: - target_size = (299, 299) - test_image = data_utils.get_file('elephant.jpg', TEST_IMAGE_PATH) - img = image_utils.load_img(test_image, target_size=tuple(target_size)) - x = image_utils.img_to_array(img) - return np.expand_dims(x, axis=0) + # For models that don't include a Flatten step, + # the default is to accept variable-size inputs + # even when loading ImageNet weights (since it is possible). + # In this case, default to 299x299. 
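The 299x299 fallback in `_get_elephant` works because applications models without a `Flatten` step are fully convolutional: with `include_top=False`, the spatial dimensions stay unconstrained, which is also the shape `applications_test.py` later asserts. A quick sketch, with `weights=None` so nothing is downloaded:

```python
import tensorflow as tf

model = tf.keras.applications.ResNet50(weights=None, include_top=False)
print(model.input_shape)   # (None, None, None, 3): variable spatial dims
print(model.output_shape)  # (None, None, None, 2048)
```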
+ if target_size[0] is None: + target_size = (299, 299) + test_image = data_utils.get_file("elephant.jpg", TEST_IMAGE_PATH) + img = image_utils.load_img(test_image, target_size=tuple(target_size)) + x = image_utils.img_to_array(img) + return np.expand_dims(x, axis=0) class ApplicationsLoadWeightTest(tf.test.TestCase, parameterized.TestCase): + def assertShapeEqual(self, shape1, shape2): + if len(shape1) != len(shape2): + raise AssertionError( + f"Shapes are different rank: {shape1} vs {shape2}" + ) + if shape1 != shape2: + raise AssertionError(f"Shapes differ: {shape1} vs {shape2}") - def assertShapeEqual(self, shape1, shape2): - if len(shape1) != len(shape2): - raise AssertionError( - 'Shapes are different rank: %s vs %s' % (shape1, shape2)) - if shape1 != shape2: - raise AssertionError('Shapes differ: %s vs %s' % (shape1, shape2)) - - def test_application_pretrained_weights_loading(self): - app_module = ARG_TO_MODEL[FLAGS.module][0] - apps = ARG_TO_MODEL[FLAGS.module][1] - for app in apps: - try: - model = app(weights='imagenet') - except Exception: # pylint: disable=broad-except - self.skipTest('TODO(b/227700184): Re-enable.') - self.assertShapeEqual(model.output_shape, (None, _IMAGENET_CLASSES)) - x = _get_elephant(model.input_shape[1:3]) - x = app_module.preprocess_input(x) - preds = model.predict(x) - names = [p[1] for p in app_module.decode_predictions(preds)[0]] - # Test correct label is in top 3 (weak correctness test). - self.assertIn('African_elephant', names[:3]) + def test_application_pretrained_weights_loading(self): + app_module = ARG_TO_MODEL[FLAGS.module][0] + apps = ARG_TO_MODEL[FLAGS.module][1] + for app in apps: + try: + model = app(weights="imagenet") + except Exception: + self.skipTest("TODO(b/227700184): Re-enable.") + self.assertShapeEqual(model.output_shape, (None, _IMAGENET_CLASSES)) + x = _get_elephant(model.input_shape[1:3]) + x = app_module.preprocess_input(x) + preds = model.predict(x) + names = [p[1] for p in app_module.decode_predictions(preds)[0]] + # Test correct label is in top 3 (weak correctness test). 
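The top-3 assertion above relies on the `decode_predictions` contract: for each input sample it returns `(imagenet_id, class_name, score)` tuples sorted by descending score. A self-contained sketch; the random input makes the decoded names meaningless and only illustrates the shapes involved:

```python
import numpy as np
from keras.applications import mobilenet

model = mobilenet.MobileNet(weights="imagenet")
x = mobilenet.preprocess_input(
    np.random.uniform(0, 255, size=(1, 224, 224, 3))
)
preds = model.predict(x)  # shape (1, 1000)
top3 = mobilenet.decode_predictions(preds, top=3)[0]
names = [name for _, name, _ in top3]
print(names)
```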
+ self.assertIn("African_elephant", names[:3]) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/applications/applications_test.py b/keras/applications/applications_test.py index e11e2119e437..d74ae95ec33f 100644 --- a/keras/applications/applications_test.py +++ b/keras/applications/applications_test.py @@ -14,8 +14,13 @@ # ============================================================================== """Integration tests for Keras applications.""" +import os + +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import backend +from keras import utils from keras.applications import convnext from keras.applications import densenet from keras.applications import efficientnet @@ -33,62 +38,79 @@ from keras.applications import vgg16 from keras.applications import vgg19 from keras.applications import xception -from keras import utils -import tensorflow.compat.v2 as tf +from keras.testing_infra import test_utils -MODEL_LIST_NO_NASNET = [(resnet.ResNet50, 2048), (resnet.ResNet101, 2048), - (resnet.ResNet152, 2048), (resnet_v2.ResNet50V2, 2048), - (resnet_v2.ResNet101V2, 2048), - (resnet_v2.ResNet152V2, 2048), (vgg16.VGG16, 512), - (vgg19.VGG19, 512), (xception.Xception, 2048), - (inception_v3.InceptionV3, 2048), - (inception_resnet_v2.InceptionResNetV2, 1536), - (mobilenet.MobileNet, 1024), - (mobilenet_v2.MobileNetV2, 1280), - (mobilenet_v3.MobileNetV3Small, 576), - (mobilenet_v3.MobileNetV3Large, 960), - (convnext.ConvNeXtTiny, 768), - (convnext.ConvNeXtSmall, 768), - (convnext.ConvNeXtBase, 1024), - (convnext.ConvNeXtLarge, 1536), - (convnext.ConvNeXtXLarge, 2048), - (densenet.DenseNet121, 1024), - (densenet.DenseNet169, 1664), - (densenet.DenseNet201, 1920), - (efficientnet.EfficientNetB0, 1280), - (efficientnet.EfficientNetB1, 1280), - (efficientnet.EfficientNetB2, 1408), - (efficientnet.EfficientNetB3, 1536), - (efficientnet.EfficientNetB4, 1792), - (efficientnet.EfficientNetB5, 2048), - (efficientnet.EfficientNetB6, 2304), - (efficientnet.EfficientNetB7, 2560), - (efficientnet_v2.EfficientNetV2B0, 1280), - (efficientnet_v2.EfficientNetV2B1, 1280), - (efficientnet_v2.EfficientNetV2B2, 1408), - (efficientnet_v2.EfficientNetV2B3, 1536), - (efficientnet_v2.EfficientNetV2S, 1280), - (efficientnet_v2.EfficientNetV2M, 1280), - (efficientnet_v2.EfficientNetV2L, 1280), - (regnet.RegNetX002, 368), (regnet.RegNetX004, 384), - (regnet.RegNetX006, 528), (regnet.RegNetX008, 672), - (regnet.RegNetX016, 912), (regnet.RegNetX032, 1008), - (regnet.RegNetX040, 1360), (regnet.RegNetX064, 1624), - (regnet.RegNetX080, 1920), (regnet.RegNetX120, 2240), - (regnet.RegNetX160, 2048), (regnet.RegNetX320, 2520), - (regnet.RegNetY002, 368), (regnet.RegNetY004, 440), - (regnet.RegNetY006, 608), (regnet.RegNetY008, 768), - (regnet.RegNetY016, 888), (regnet.RegNetY032, 1512), - (regnet.RegNetY040, 1088), (regnet.RegNetY064, 1296), - (regnet.RegNetY080, 2016), (regnet.RegNetY120, 2240), - (regnet.RegNetY160, 3024), (regnet.RegNetY320, 3712), - (resnet_rs.ResNetRS50, 2048), - (resnet_rs.ResNetRS101, 2048), - (resnet_rs.ResNetRS152, 2048), - (resnet_rs.ResNetRS200, 2048), - (resnet_rs.ResNetRS270, 2048), - (resnet_rs.ResNetRS350, 2048), - (resnet_rs.ResNetRS420, 2048)] +MODEL_LIST_NO_NASNET = [ + (resnet.ResNet50, 2048), + (resnet.ResNet101, 2048), + (resnet.ResNet152, 2048), + (resnet_v2.ResNet50V2, 2048), + (resnet_v2.ResNet101V2, 2048), + (resnet_v2.ResNet152V2, 2048), + (vgg16.VGG16, 512), + (vgg19.VGG19, 512), + 
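+    # (Editor's note: each entry pairs a model constructor with the
+    # channel count of its final convolutional feature map; the no-top
+    # and pooling tests below assert the output shape ends in this value.)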
(xception.Xception, 2048), + (inception_v3.InceptionV3, 2048), + (inception_resnet_v2.InceptionResNetV2, 1536), + (mobilenet.MobileNet, 1024), + (mobilenet_v2.MobileNetV2, 1280), + (mobilenet_v3.MobileNetV3Small, 576), + (mobilenet_v3.MobileNetV3Large, 960), + (convnext.ConvNeXtTiny, 768), + (convnext.ConvNeXtSmall, 768), + (convnext.ConvNeXtBase, 1024), + (convnext.ConvNeXtLarge, 1536), + (convnext.ConvNeXtXLarge, 2048), + (densenet.DenseNet121, 1024), + (densenet.DenseNet169, 1664), + (densenet.DenseNet201, 1920), + (efficientnet.EfficientNetB0, 1280), + (efficientnet.EfficientNetB1, 1280), + (efficientnet.EfficientNetB2, 1408), + (efficientnet.EfficientNetB3, 1536), + (efficientnet.EfficientNetB4, 1792), + (efficientnet.EfficientNetB5, 2048), + (efficientnet.EfficientNetB6, 2304), + (efficientnet.EfficientNetB7, 2560), + (efficientnet_v2.EfficientNetV2B0, 1280), + (efficientnet_v2.EfficientNetV2B1, 1280), + (efficientnet_v2.EfficientNetV2B2, 1408), + (efficientnet_v2.EfficientNetV2B3, 1536), + (efficientnet_v2.EfficientNetV2S, 1280), + (efficientnet_v2.EfficientNetV2M, 1280), + (efficientnet_v2.EfficientNetV2L, 1280), + (regnet.RegNetX002, 368), + (regnet.RegNetX004, 384), + (regnet.RegNetX006, 528), + (regnet.RegNetX008, 672), + (regnet.RegNetX016, 912), + (regnet.RegNetX032, 1008), + (regnet.RegNetX040, 1360), + (regnet.RegNetX064, 1624), + (regnet.RegNetX080, 1920), + (regnet.RegNetX120, 2240), + (regnet.RegNetX160, 2048), + (regnet.RegNetX320, 2520), + (regnet.RegNetY002, 368), + (regnet.RegNetY004, 440), + (regnet.RegNetY006, 608), + (regnet.RegNetY008, 768), + (regnet.RegNetY016, 888), + (regnet.RegNetY032, 1512), + (regnet.RegNetY040, 1088), + (regnet.RegNetY064, 1296), + (regnet.RegNetY080, 2016), + (regnet.RegNetY120, 2240), + (regnet.RegNetY160, 3024), + (regnet.RegNetY320, 3712), + (resnet_rs.ResNetRS50, 2048), + (resnet_rs.ResNetRS101, 2048), + (resnet_rs.ResNetRS152, 2048), + (resnet_rs.ResNetRS200, 2048), + (resnet_rs.ResNetRS270, 2048), + (resnet_rs.ResNetRS350, 2048), + (resnet_rs.ResNetRS420, 2048), +] NASNET_LIST = [ (nasnet.NASNetMobile, 1056), @@ -116,89 +138,128 @@ class ApplicationsTest(tf.test.TestCase, parameterized.TestCase): + def assertShapeEqual(self, shape1, shape2): + if len(shape1) != len(shape2): + raise AssertionError( + f"Shapes are different rank: {shape1} vs {shape2}" + ) + for v1, v2 in zip(shape1, shape2): + if v1 != v2: + raise AssertionError(f"Shapes differ: {shape1} vs {shape2}") + + @parameterized.parameters(*MODEL_LIST) + def test_application_base(self, app, _): + # Can be instantiated with default arguments + model = app(weights=None) + # Can be serialized and deserialized + config = model.get_config() + if "ConvNeXt" in app.__name__: + custom_objects = {"LayerScale": convnext.LayerScale} + with utils.custom_object_scope(custom_objects): + reconstructed_model = model.__class__.from_config(config) + else: + reconstructed_model = model.__class__.from_config(config) + self.assertEqual(len(model.weights), len(reconstructed_model.weights)) + backend.clear_session() + + @parameterized.parameters(*MODEL_LIST) + def test_application_notop(self, app, last_dim): + if "NASNet" in app.__name__: + only_check_last_dim = True + else: + only_check_last_dim = False + output_shape = _get_output_shape( + lambda: app(weights=None, include_top=False) + ) + if only_check_last_dim: + self.assertEqual(output_shape[-1], last_dim) + else: + self.assertShapeEqual(output_shape, (None, None, None, last_dim)) + backend.clear_session() + + 
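[Editor's aside, not part of the patch: the ConvNeXt branch in `test_application_base` above exists because `LayerScale` is a custom layer that `from_config` cannot resolve by name on its own. A minimal sketch of the same round trip, with an illustrative model choice:

    from keras import utils
    from keras.applications import convnext

    model = convnext.ConvNeXtTiny(weights=None)
    config = model.get_config()
    with utils.custom_object_scope({"LayerScale": convnext.LayerScale}):
        clone = model.__class__.from_config(config)
    assert len(model.weights) == len(clone.weights)
]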
@parameterized.parameters(*MODEL_LIST) + def test_application_notop_custom_input_shape(self, app, last_dim): + output_shape = _get_output_shape( + lambda: app( + weights="imagenet", include_top=False, input_shape=(224, 224, 3) + ) + ) + + self.assertEqual(output_shape[-1], last_dim) + + @parameterized.parameters(MODEL_LIST) + def test_application_pooling(self, app, last_dim): + output_shape = _get_output_shape( + lambda: app(weights=None, include_top=False, pooling="avg") + ) + self.assertShapeEqual(output_shape, (None, last_dim)) + + @parameterized.parameters(MODEL_LIST) + def test_application_classifier_activation(self, app, _): + if "RegNet" in app.__name__: + self.skipTest("RegNet models do not support classifier activation") + model = app( + weights=None, include_top=True, classifier_activation="softmax" + ) + last_layer_act = model.layers[-1].activation.__name__ + self.assertEqual(last_layer_act, "softmax") + + @parameterized.parameters(*MODEL_LIST_NO_NASNET) + def test_application_variable_input_channels(self, app, last_dim): + if backend.image_data_format() == "channels_first": + input_shape = (1, None, None) + else: + input_shape = (None, None, 1) + output_shape = _get_output_shape( + lambda: app( + weights=None, include_top=False, input_shape=input_shape + ) + ) + self.assertShapeEqual(output_shape, (None, None, None, last_dim)) + backend.clear_session() + + if backend.image_data_format() == "channels_first": + input_shape = (4, None, None) + else: + input_shape = (None, None, 4) + output_shape = _get_output_shape( + lambda: app( + weights=None, include_top=False, input_shape=input_shape + ) + ) + self.assertShapeEqual(output_shape, (None, None, None, last_dim)) + backend.clear_session() + + @parameterized.parameters(*MOBILENET_V3_FOR_WEIGHTS) + def test_mobilenet_v3_load_weights( + self, mobilenet_class, alpha, minimalistic, include_top + ): + mobilenet_class( + input_shape=(224, 224, 3), + weights="imagenet", + alpha=alpha, + minimalistic=minimalistic, + include_top=include_top, + ) + + @parameterized.parameters(MODEL_LIST) + @test_utils.run_v2_only + def test_model_checkpoint(self, app, _): + model = app(weights=None) - def assertShapeEqual(self, shape1, shape2): - if len(shape1) != len(shape2): - raise AssertionError( - 'Shapes are different rank: %s vs %s' % (shape1, shape2)) - for v1, v2 in zip(shape1, shape2): - if v1 != v2: - raise AssertionError('Shapes differ: %s vs %s' % (shape1, shape2)) - - @parameterized.parameters(*MODEL_LIST) - def test_application_base(self, app, _): - # Can be instantiated with default arguments - model = app(weights=None) - # Can be serialized and deserialized - config = model.get_config() - if "ConvNeXt" in app.__name__: - custom_objects = {"LayerScale": convnext.LayerScale} - with utils.custom_object_scope(custom_objects): - reconstructed_model = model.__class__.from_config(config) - else: - reconstructed_model = model.__class__.from_config(config) - self.assertEqual(len(model.weights), len(reconstructed_model.weights)) - backend.clear_session() - - @parameterized.parameters(*MODEL_LIST) - def test_application_notop(self, app, last_dim): - if 'NASNet' in app.__name__: - only_check_last_dim = True - else: - only_check_last_dim = False - output_shape = _get_output_shape( - lambda: app(weights=None, include_top=False)) - if only_check_last_dim: - self.assertEqual(output_shape[-1], last_dim) - else: - self.assertShapeEqual(output_shape, (None, None, None, last_dim)) - backend.clear_session() - - @parameterized.parameters(MODEL_LIST) - def 
test_application_pooling(self, app, last_dim): - output_shape = _get_output_shape( - lambda: app(weights=None, include_top=False, pooling='avg')) - self.assertShapeEqual(output_shape, (None, last_dim)) - - @parameterized.parameters(*MODEL_LIST_NO_NASNET) - def test_application_variable_input_channels(self, app, last_dim): - if backend.image_data_format() == 'channels_first': - input_shape = (1, None, None) - else: - input_shape = (None, None, 1) - output_shape = _get_output_shape( - lambda: app(weights=None, include_top=False, input_shape=input_shape)) - self.assertShapeEqual(output_shape, (None, None, None, last_dim)) - backend.clear_session() - - if backend.image_data_format() == 'channels_first': - input_shape = (4, None, None) - else: - input_shape = (None, None, 4) - output_shape = _get_output_shape( - lambda: app(weights=None, include_top=False, input_shape=input_shape)) - self.assertShapeEqual(output_shape, (None, None, None, last_dim)) - backend.clear_session() - - @parameterized.parameters(*MOBILENET_V3_FOR_WEIGHTS) - def test_mobilenet_v3_load_weights( - self, - mobilenet_class, - alpha, - minimalistic, - include_top): - mobilenet_class( - input_shape=(224, 224, 3), - weights='imagenet', - alpha=alpha, - minimalistic=minimalistic, - include_top=include_top) + checkpoint = tf.train.Checkpoint(model=model) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=os.path.join(self.get_temp_dir(), model.name), + max_to_keep=1, + ) + checkpoint_manager.save(checkpoint_number=1) def _get_output_shape(model_fn): - model = model_fn() - return model.output_shape + model = model_fn() + return model.output_shape -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/applications/convnext.py b/keras/applications/convnext.py index 7efa1820b669..829466a6312b 100644 --- a/keras/applications/convnext.py +++ b/keras/applications/convnext.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=missing-docstring -# pylint: disable=g-classes-have-attributes -# pylint: disable=g-direct-tensorflow-import + + """ConvNeXt models for Keras. 
References: @@ -24,64 +22,74 @@ (CVPR 2022) """ +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend +from keras import initializers from keras import layers from keras import utils from keras.applications import imagenet_utils from keras.engine import sequential from keras.engine import training as training_lib -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHTS_PATH = "https://storage.googleapis.com/tensorflow/keras-applications/convnext/" +BASE_WEIGHTS_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/convnext/" +) WEIGHTS_HASHES = { - "tiny": - ("8ae6e78ce2933352b1ef4008e6dd2f17bc40771563877d156bc6426c7cf503ff", - "d547c096cabd03329d7be5562c5e14798aa39ed24b474157cef5e85ab9e49ef1"), - "small": - ("ce1277d8f1ee5a0ef0e171469089c18f5233860ceaf9b168049cb9263fd7483c", - "6fc8009faa2f00c1c1dfce59feea9b0745eb260a7dd11bee65c8e20843da6eab"), - "base": - ("52cbb006d3dadd03f6e095a8ca1aca47aecdd75acb4bc74bce1f5c695d0086e6", - "40a20c5548a5e9202f69735ecc06c990e6b7c9d2de39f0361e27baeb24cb7c45"), - "large": - ("070c5ed9ed289581e477741d3b34beffa920db8cf590899d6d2c67fba2a198a6", - "96f02b6f0753d4f543261bc9d09bed650f24dd6bc02ddde3066135b63d23a1cd"), - "xlarge": - ("c1f5ccab661354fc3a79a10fa99af82f0fbf10ec65cb894a3ae0815f17a889ee", - "de3f8a54174130e0cecdc71583354753d557fcf1f4487331558e2a16ba0cfe05"), + "convnext_tiny": ( + "8ae6e78ce2933352b1ef4008e6dd2f17bc40771563877d156bc6426c7cf503ff", + "d547c096cabd03329d7be5562c5e14798aa39ed24b474157cef5e85ab9e49ef1", + ), + "convnext_small": ( + "ce1277d8f1ee5a0ef0e171469089c18f5233860ceaf9b168049cb9263fd7483c", + "6fc8009faa2f00c1c1dfce59feea9b0745eb260a7dd11bee65c8e20843da6eab", + ), + "convnext_base": ( + "52cbb006d3dadd03f6e095a8ca1aca47aecdd75acb4bc74bce1f5c695d0086e6", + "40a20c5548a5e9202f69735ecc06c990e6b7c9d2de39f0361e27baeb24cb7c45", + ), + "convnext_large": ( + "070c5ed9ed289581e477741d3b34beffa920db8cf590899d6d2c67fba2a198a6", + "96f02b6f0753d4f543261bc9d09bed650f24dd6bc02ddde3066135b63d23a1cd", + ), + "convnext_xlarge": ( + "c1f5ccab661354fc3a79a10fa99af82f0fbf10ec65cb894a3ae0815f17a889ee", + "de3f8a54174130e0cecdc71583354753d557fcf1f4487331558e2a16ba0cfe05", + ), } MODEL_CONFIGS = { - "tiny": { - "depths": [3, 3, 9, 3], - "projection_dims": [96, 192, 384, 768], - "default_size": 224, - }, - "small": { - "depths": [3, 3, 27, 3], - "projection_dims": [96, 192, 384, 768], - "default_size": 224, - }, - "base": { - "depths": [3, 3, 27, 3], - "projection_dims": [128, 256, 512, 1024], - "default_size": 224, - }, - "large": { - "depths": [3, 3, 27, 3], - "projection_dims": [192, 384, 768, 1536], - "default_size": 224, - }, - "xlarge": { - "depths": [3, 3, 27, 3], - "projection_dims": [256, 512, 1024, 2048], - "default_size": 224, - }, + "tiny": { + "depths": [3, 3, 9, 3], + "projection_dims": [96, 192, 384, 768], + "default_size": 224, + }, + "small": { + "depths": [3, 3, 27, 3], + "projection_dims": [96, 192, 384, 768], + "default_size": 224, + }, + "base": { + "depths": [3, 3, 27, 3], + "projection_dims": [128, 256, 512, 1024], + "default_size": 224, + }, + "large": { + "depths": [3, 3, 27, 3], + "projection_dims": [192, 384, 768, 1536], + "default_size": 224, + }, + "xlarge": { + "depths": [3, 3, 27, 3], + "projection_dims": [256, 512, 1024, 2048], + "default_size": 224, + }, } BASE_DOCSTRING = """Instantiates the {name} architecture. 
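[Editor's aside: the WEIGHTS_HASHES keys above changed from bare size suffixes ("tiny") to full model names ("convnext_tiny"). This matters because the weight loader further down indexes the table directly by `model_name`, roughly:

    model_name, include_top = "convnext_tiny", True  # illustrative values
    file_suffix = ".h5" if include_top else "_notop.h5"
    file_hash = WEIGHTS_HASHES[model_name][0 if include_top else 1]
    weights_url = BASE_WEIGHTS_PATH + model_name + file_suffix

With the old keys, `WEIGHTS_HASHES[model_name]` would have raised a `KeyError` for every variant.]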
@@ -96,6 +104,7 @@ For transfer learning use cases, make sure to read the [guide to transfer learning & fine-tuning]( https://keras.io/guides/transfer_learning/). + The `base`, `large`, and `xlarge` models were first pre-trained on the ImageNet-21k dataset and then fine-tuned on the ImageNet-1k dataset. The pre-trained parameters of the models were assembled from the @@ -103,20 +112,22 @@ sense of how these parameters were converted to Keras compatible parameters, please refer to [this repository](https://github.com/sayakpaul/keras-convnext-conversion). + Note: Each Keras Application expects a specific kind of input preprocessing. For ConvNeXt, preprocessing is included in the model using a `Normalization` layer. ConvNeXt models expect their inputs to be float or uint8 tensors of pixels with values in the [0-255] range. + When calling the `summary()` method after instantiating a ConvNeXt model, prefer setting the `expand_nested` argument of `summary()` to `True` to better investigate the instantiated model. Args: include_top: Whether to include the fully-connected - layer at the top of the network. Defaults to True. + layer at the top of the network. Defaults to `True`. weights: One of `None` (random initialization), - `"imagenet"` (pre-training on ImageNet-1k), or the path to the weights file - to be loaded. Defaults to `"imagenet"`. + `"imagenet"` (pre-training on ImageNet-1k), or the path to the weights + file to be loaded. Defaults to `"imagenet"`. input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) to use as image input for the model. @@ -124,7 +135,7 @@ if `include_top` is False. It should have exactly 3 inputs channels. pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. Defaults to None. + when `include_top` is `False`. - `None` means that the output of the model will be the 4D tensor output of the last convolutional layer. - `avg` means that global average pooling @@ -133,517 +144,597 @@ the output of the model will be a 2D tensor. - `max` means that global max pooling will be applied. + Defaults to `None`. classes: Optional number of classes to classify images into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. Defaults to 1000 (number of - ImageNet classes). + if no `weights` argument is specified. There are 1000 + ImageNet classes. Defaults to `1000`. classifier_activation: A `str` or callable. The activation function to use on the "top" layer. Ignored unless `include_top=True`. Set `classifier_activation=None` to return the logits of the "top" layer. - Defaults to `"softmax"`. When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. + be `None` or `"softmax"`. Defaults to `"softmax"`. Returns: A `keras.Model` instance. """ + class StochasticDepth(layers.Layer): - """Stochastic Depth module. + """Stochastic Depth module. - It performs batch-wise dropping rather than sample-wise. In libraries like - `timm`, it's similar to `DropPath` layers that drops residual paths - sample-wise. + It performs batch-wise dropping rather than sample-wise. In libraries like + `timm`, it's similar to `DropPath` layers that drop residual paths + sample-wise. - References: - - https://github.com/rwightman/pytorch-image-models + References: + - https://github.com/rwightman/pytorch-image-models - Args: - drop_path_rate (float): Probability of dropping paths.
Should be within + [0, 1]. - Returns: - Tensor either with the residual path dropped or kept. - """ - def __init__(self, drop_path_rate, **kwargs): - super().__init__(**kwargs) - self.drop_path_rate = drop_path_rate - - def call(self, x, training=None): - if training: - keep_prob = 1 - self.drop_path_rate - shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) - random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) - random_tensor = tf.floor(random_tensor) - return (x / keep_prob) * random_tensor - return x + Returns: + Tensor either with the residual path dropped or kept. + """ - def get_config(self): - config = super().get_config() - config.update({"drop_path_rate": self.drop_path_rate}) - return config + def __init__(self, drop_path_rate, **kwargs): + super().__init__(**kwargs) + self.drop_path_rate = drop_path_rate + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_path_rate + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x -class LayerScale(layers.Layer): - """Layer scale module. + def get_config(self): + config = super().get_config() + config.update({"drop_path_rate": self.drop_path_rate}) + return config - References: - - https://arxiv.org/abs/2103.17239 - Args: - init_values (float): Initial value for layer scale. Should be within - [0, 1]. - projection_dim (int): Projection dimensionality. +class LayerScale(layers.Layer): + """Layer scale module. + + References: + - https://arxiv.org/abs/2103.17239 + + Args: + init_values (float): Initial value for layer scale. Should be within + [0, 1]. + projection_dim (int): Projection dimensionality. + + Returns: + Tensor multiplied to the scale. + """ + + def __init__(self, init_values, projection_dim, **kwargs): + super().__init__(**kwargs) + self.init_values = init_values + self.projection_dim = projection_dim + + def build(self, input_shape): + self.gamma = self.add_weight( + name="gamma", + shape=(self.projection_dim,), + initializer=initializers.Constant(self.init_values), + trainable=True, + ) + + def call(self, x): + return x * self.gamma + + def get_config(self): + config = super().get_config() + config.update( + { + "init_values": self.init_values, + "projection_dim": self.projection_dim, + } + ) + return config - Returns: - Tensor multiplied to the scale. - """ - def __init__(self, init_values, projection_dim, **kwargs): - super().__init__(**kwargs) - self.init_values = init_values - self.projection_dim = projection_dim - - def build(self, input_shape): - self.gamma = tf.Variable(self.init_values * tf.ones((self.projection_dim,))) - - def call(self, x): - return x * self.gamma - - def get_config(self): - config = super().get_config() - config.update( - {"init_values": self.init_values, "projection_dim": self.projection_dim} - ) - return config def ConvNeXtBlock( - projection_dim, - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - name=None - ): - """ConvNeXt block. + projection_dim, drop_path_rate=0.0, layer_scale_init_value=1e-6, name=None +): + """ConvNeXt block. - References: + References: - https://arxiv.org/abs/2201.03545 - https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py - Notes: - In the original ConvNeXt implementation (linked above), the authors use - `Dense` layers for pointwise convolutions for increased efficiency. - Following that, this implementation also uses the same. 
+ Notes: + In the original ConvNeXt implementation (linked above), the authors use + `Dense` layers for pointwise convolutions for increased efficiency. + Following that, this implementation also uses the same. + + Args: + projection_dim (int): Number of filters for convolution layers. In the + ConvNeXt paper, this is referred to as projection dimension. + drop_path_rate (float): Probability of dropping paths. Should be within + [0, 1]. + layer_scale_init_value (float): Layer scale value. Should be a small float + number. + name: name prefix for the keras layer. + + Returns: + A function representing a ConvNeXtBlock block. + """ + if name is None: + name = "prestem" + str(backend.get_uid("prestem")) + + def apply(inputs): + x = inputs + + x = layers.Conv2D( + filters=projection_dim, + kernel_size=7, + padding="same", + groups=projection_dim, + name=name + "_depthwise_conv", + )(x) + x = layers.LayerNormalization(epsilon=1e-6, name=name + "_layernorm")(x) + x = layers.Dense(4 * projection_dim, name=name + "_pointwise_conv_1")(x) + x = layers.Activation("gelu", name=name + "_gelu")(x) + x = layers.Dense(projection_dim, name=name + "_pointwise_conv_2")(x) + + if layer_scale_init_value is not None: + x = LayerScale( + layer_scale_init_value, + projection_dim, + name=name + "_layer_scale", + )(x) + if drop_path_rate: + layer = StochasticDepth( + drop_path_rate, name=name + "_stochastic_depth" + ) + else: + layer = layers.Activation("linear", name=name + "_identity") + + return inputs + layer(x) + + return apply - Args: - projection_dim (int): Number of filters for convolution layers. In the - ConvNeXt paper, this is referred to as projection dimension. - drop_path_rate (float): Probability of dropping paths. Should be within - [0, 1]. - layer_scale_init_value (float): Layer scale value. Should be a small float - number. - name: name to path to the keras layer. - - Returns: - A function representing a ConvNeXtBlock block. - """ - if name is None: - name = "prestem" + str(backend.get_uid("prestem")) - def apply(inputs): - x = inputs +def PreStem(name=None): + """Normalizes inputs with ImageNet-1k mean and std. + + Args: + name (str): Name prefix. + + Returns: + A prestem function. + """ + if name is None: + name = "prestem" + str(backend.get_uid("prestem")) + + def apply(x): + x = layers.Normalization( + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + variance=[ + (0.229 * 255) ** 2, + (0.224 * 255) ** 2, + (0.225 * 255) ** 2, + ], + name=name + "_prestem_normalization", + )(x) + return x + + return apply + + +def Head(num_classes=1000, classifier_activation=None, name=None): + """Implementation of classification head of ConvNeXt. + + Args: + num_classes: number of classes for Dense layer + classifier_activation: activation function for the Dense layer + name: name prefix + + Returns: + Classification head function.
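+
+    Editor's note (illustrative addition): with the defaults, `Head()`
+    maps a `(batch, h, w, channels)` feature map to `(batch, 1000)`
+    outputs via global average pooling, a layer normalization, and a
+    final dense layer; `classifier_activation` is applied by that dense
+    layer.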
+ """ + if name is None: + name = str(backend.get_uid("head")) + + def apply(x): + x = layers.GlobalAveragePooling2D(name=name + "_head_gap")(x) + x = layers.LayerNormalization( + epsilon=1e-6, name=name + "_head_layernorm" + )(x) + x = layers.Dense( + num_classes, + activation=classifier_activation, + name=name + "_head_dense", + )(x) + return x + + return apply + + +def ConvNeXt( + depths, + projection_dims, + drop_path_rate=0.0, + layer_scale_init_value=1e-6, + default_size=224, + model_name="convnext", + include_preprocessing=True, + include_top=True, + weights=None, + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + """Instantiates ConvNeXt architecture given specific configuration. + + Args: + depths: An iterable containing depths for each individual stages. + projection_dims: An iterable containing output number of channels of + each individual stages. + drop_path_rate: Stochastic depth probability. If 0.0, then stochastic + depth won't be used. + layer_scale_init_value: Layer scale coefficient. If 0.0, layer scaling + won't be used. + default_size: Default input image size. + model_name: An optional name for the model. + include_preprocessing: boolean denoting whther to include preprocessing in + the model. When `weights="imagenet"` this should be always set to True. + But for other models (e.g., randomly initialized) users should set it + to False and apply preprocessing to data accordingly. + include_top: Boolean denoting whether to include classification head to + the model. + weights: one of `None` (random initialization), `"imagenet"` (pre-training + on ImageNet-1k), or the path to the weights file to be loaded. + input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to + use as image input for the model. + input_shape: optional shape tuple, only to be specified if `include_top` + is False. It should have exactly 3 inputs channels. + pooling: optional pooling mode for feature extraction when `include_top` + is `False`. + - `None` means that the output of the model will be the 4D tensor output + of the last convolutional layer. + - `avg` means that global average pooling will be applied to the output + of the last convolutional layer, and thus the output of the model will + be a 2D tensor. + - `max` means that global max pooling will be applied. + classes: optional number of classes to classify images into, only to be + specified if `include_top` is True, and if no `weights` argument is + specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + + Returns: + A `keras.Model` instance. + + Raises: + ValueError: in case of invalid argument for `weights`, + or invalid input shape. + ValueError: if `classifier_activation` is not `softmax`, or `None` + when using a pretrained top layer. + ValueError: if `include_top` is True but `num_classes` is not 1000 + when using ImageNet. + """ + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." 
+ ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + "If using `weights` as `'imagenet'` with `include_top`" + " as true, `classes` should be 1000" + ) + + # Determine proper input shape. + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=default_size, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) - x = layers.Conv2D( - filters=projection_dim, kernel_size=7, padding="same", - groups=projection_dim, name=name + "_depthwise_conv")(x) - x = layers.LayerNormalization(epsilon=1e-6, name=name + "_layernorm")(x) - x = layers.Dense(4 * projection_dim, name=name + "_pointwise_conv_1")(x) - x = layers.Activation("gelu", name=name + "_gelu")(x) - x = layers.Dense(projection_dim, name=name + "_pointwise_conv_2")(x) - - if layer_scale_init_value is not None: - x = LayerScale(layer_scale_init_value, projection_dim, - name=name + "_layer_scale")(x) - if drop_path_rate: - layer = StochasticDepth(drop_path_rate, name=name + "_stochastic_depth") + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - layer = layers.Activation("linear", name=name + "_identity") - - return inputs + layer(x) - return apply + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + if input_tensor is not None: + inputs = utils.layer_utils.get_source_inputs(input_tensor)[0] + else: + inputs = img_input -def PreStem(name=None): - """Normalizes inputs with ImageNet-1k mean and std. + x = inputs + if include_preprocessing: + channel_axis = ( + 3 if backend.image_data_format() == "channels_last" else 1 + ) + num_channels = input_shape[channel_axis - 1] + if num_channels == 3: + x = PreStem(name=model_name)(x) + + # Stem block. + stem = sequential.Sequential( + [ + layers.Conv2D( + projection_dims[0], + kernel_size=4, + strides=4, + name=model_name + "_stem_conv", + ), + layers.LayerNormalization( + epsilon=1e-6, name=model_name + "_stem_layernorm" + ), + ], + name=model_name + "_stem", + ) - Args: - name (str): Name prefix. + # Downsampling blocks. + downsample_layers = [] + downsample_layers.append(stem) + + num_downsample_layers = 3 + for i in range(num_downsample_layers): + downsample_layer = sequential.Sequential( + [ + layers.LayerNormalization( + epsilon=1e-6, + name=model_name + "_downsampling_layernorm_" + str(i), + ), + layers.Conv2D( + projection_dims[i + 1], + kernel_size=2, + strides=2, + name=model_name + "_downsampling_conv_" + str(i), + ), + ], + name=model_name + "_downsampling_block_" + str(i), + ) + downsample_layers.append(downsample_layer) + + # Stochastic depth schedule. + # This is referred from the original ConvNeXt codebase: + # https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py#L86 + depth_drop_rates = [ + float(x) for x in np.linspace(0.0, drop_path_rate, sum(depths)) + ] + + # First apply downsampling blocks and then apply ConvNeXt stages. + cur = 0 + + num_convnext_blocks = 4 + for i in range(num_convnext_blocks): + x = downsample_layers[i](x) + for j in range(depths[i]): + x = ConvNeXtBlock( + projection_dim=projection_dims[i], + drop_path_rate=depth_drop_rates[cur + j], + layer_scale_init_value=layer_scale_init_value, + name=model_name + f"_stage_{i}_block_{j}", + )(x) + cur += depths[i] - Returns: - A presemt function. 
- """ - if name is None: - name = "prestem" + str(backend.get_uid("prestem")) - - def apply(x): - x = layers.Normalization( - mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], - variance=[(0.229 * 255) ** 2, (0.224 * 255) ** 2, (0.225 * 255) ** 2], - name=name + "_prestem_normalization" - )(x) - return x + if include_top: + imagenet_utils.validate_activation(classifier_activation, weights) + x = Head( + num_classes=classes, + classifier_activation=classifier_activation, + name=model_name, + )(x) - return apply + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + x = layers.LayerNormalization(epsilon=1e-6)(x) + + model = training_lib.Model(inputs=inputs, outputs=x, name=model_name) + + # Load weights. + if weights == "imagenet": + if include_top: + file_suffix = ".h5" + file_hash = WEIGHTS_HASHES[model_name][0] + else: + file_suffix = "_notop.h5" + file_hash = WEIGHTS_HASHES[model_name][1] + file_name = model_name + file_suffix + weights_path = utils.data_utils.get_file( + file_name, + BASE_WEIGHTS_PATH + file_name, + cache_subdir="models", + file_hash=file_hash, + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model -def Head(num_classes=1000, name=None): - """Implementation of classification head of RegNet. +## Instantiating variants ## - Args: - num_classes: number of classes for Dense layer - name: name prefix - Returns: - Classification head function. - """ - if name is None: - name = str(backend.get_uid("head")) - - def apply(x): - x = layers.GlobalAveragePooling2D(name=name + "_head_gap")(x) - x = layers.LayerNormalization( - epsilon=1e-6, name=name + "_head_layernorm")(x) - x = layers.Dense(num_classes, name=name + "_head_dense")(x) - return x +@keras_export( + "keras.applications.convnext.ConvNeXtTiny", + "keras.applications.ConvNeXtTiny", +) +def ConvNeXtTiny( + model_name="convnext_tiny", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return ConvNeXt( + depths=MODEL_CONFIGS["tiny"]["depths"], + projection_dims=MODEL_CONFIGS["tiny"]["projection_dims"], + drop_path_rate=0.0, + layer_scale_init_value=1e-6, + default_size=MODEL_CONFIGS["tiny"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) - return apply - - -def ConvNeXt(depths, - projection_dims, - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - default_size=224, - model_name="convnext", - include_preprocessing=True, - include_top=True, - weights=None, - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - """Instantiates ConvNeXt architecture given specific configuration. - Args: - depths: An iterable containing depths for each individual stages. - projection_dims: An iterable containing output number of channels of - each individual stages. - drop_path_rate: Stochastic depth probability. If 0.0, then stochastic depth - won't be used. - layer_scale_init_value: Layer scale coefficient. If 0.0, layer scaling won't - be used. - default_size: Default input image size. - model_name: An optional name for the model. 
- include_preprocessing: boolean denoting whther to include preprocessing in - the model. When `weights="imagenet"` this should be always set to True. - But for other models (e.g., randomly initialized) users should set it - to False and apply preprocessing to data accordingly. - include_top: Boolean denoting whether to include classification head to the - model. - weights: one of `None` (random initialization), `"imagenet"` (pre-training - on ImageNet-1k), or the path to the weights file to be loaded. - input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to use - as image input for the model. - input_shape: optional shape tuple, only to be specified if `include_top` is - False. It should have exactly 3 inputs channels. - pooling: optional pooling mode for feature extraction when `include_top` is - `False`. - `None` means that the output of the model will be the 4D tensor - output of the last convolutional layer. - `avg` means that global average - pooling will be applied to the output of the last convolutional layer, and - thus the output of the model will be a 2D tensor. - `max` means that - global max pooling will be applied. - classes: optional number of classes to classify images into, only to be - specified if `include_top` is True, and if no `weights` argument is - specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. +@keras_export( + "keras.applications.convnext.ConvNeXtSmall", + "keras.applications.ConvNeXtSmall", +) +def ConvNeXtSmall( + model_name="convnext_small", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return ConvNeXt( + depths=MODEL_CONFIGS["small"]["depths"], + projection_dims=MODEL_CONFIGS["small"]["projection_dims"], + drop_path_rate=0.0, + layer_scale_init_value=1e-6, + default_size=MODEL_CONFIGS["small"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) - Returns: - A `keras.Model` instance. - Raises: - ValueError: in case of invalid argument for `weights`, - or invalid input shape. - ValueError: if `classifier_activation` is not `softmax`, or `None` - when using a pretrained top layer. - ValueError: if `include_top` is True but `num_classes` is not 1000 - when using ImageNet. - """ - if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): - raise ValueError("The `weights` argument should be either " - "`None` (random initialization), `imagenet` " - "(pre-training on ImageNet), " - "or the path to the weights file to be loaded.") - - if weights == "imagenet" and include_top and classes != 1000: - raise ValueError("If using `weights` as `'imagenet'` with `include_top`" - " as true, `classes` should be 1000") - - # Determine proper input shape. 
- input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=default_size, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - if input_tensor is not None: - inputs = utils.layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - x = inputs - if include_preprocessing: - channel_axis = 3 if backend.image_data_format() == "channels_last" else 1 - num_channels = input_shape[channel_axis - 1] - if num_channels == 3: - x = PreStem(name=model_name)(x) - - # Stem block. - stem = sequential.Sequential( - [ - layers.Conv2D(projection_dims[0], kernel_size=4, strides=4, - name=model_name + "_stem_conv"), - layers.LayerNormalization( - epsilon=1e-6, - name=model_name + "_stem_layernorm" - ), - ], - name=model_name + "_stem", - ) - - # Downsampling blocks. - downsample_layers = [] - downsample_layers.append(stem) - - num_downsample_layers = 3 - for i in range(num_downsample_layers): - downsample_layer = sequential.Sequential( - [ - layers.LayerNormalization(epsilon=1e-6, - name=model_name + "_downsampling_layernorm_" + str(i)), - layers.Conv2D(projection_dims[i + 1], kernel_size=2, strides=2, - name=model_name + "_downsampling_conv_" + str(i)), - ], - name=model_name + "_downsampling_block_" + str(i), +@keras_export( + "keras.applications.convnext.ConvNeXtBase", + "keras.applications.ConvNeXtBase", +) +def ConvNeXtBase( + model_name="convnext_base", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return ConvNeXt( + depths=MODEL_CONFIGS["base"]["depths"], + projection_dims=MODEL_CONFIGS["base"]["projection_dims"], + drop_path_rate=0.0, + layer_scale_init_value=1e-6, + default_size=MODEL_CONFIGS["base"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, ) - downsample_layers.append(downsample_layer) - - # Stochastic depth schedule. - # This is referred from the original ConvNeXt codebase: - # https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py#L86 - depth_drop_rates = [ - float(x) for x in np.linspace(0.0, drop_path_rate, sum(depths)) - ] - - # First apply downsampling blocks and then apply ConvNeXt stages. - cur = 0 - - num_convnext_blocks = 4 - for i in range(num_convnext_blocks): - x = downsample_layers[i](x) - for j in range(depths[i]): - x = ConvNeXtBlock( - projection_dim=projection_dims[i], - drop_path_rate=depth_drop_rates[cur + j], - layer_scale_init_value=layer_scale_init_value, - name=model_name + f"_stage_{i}_block_{j}", - )(x) - cur += depths[i] - - if include_top: - x = Head(num_classes=classes, name=model_name)(x) - imagenet_utils.validate_activation(classifier_activation, weights) - - else: - if pooling == "avg": - x = layers.GlobalAveragePooling2D()(x) - elif pooling == "max": - x = layers.GlobalMaxPooling2D()(x) - x = layers.LayerNormalization(epsilon=1e-6)(x) - - model = training_lib.Model(inputs=inputs, outputs=x, name=model_name) - - # Load weights. 
- if weights == "imagenet": - if include_top: - file_suffix = ".h5" - file_hash = WEIGHTS_HASHES[model_name][0] - else: - file_suffix = "_notop.h5" - file_hash = WEIGHTS_HASHES[model_name][1] - file_name = model_name + file_suffix - weights_path = utils.data_utils.get_file( - file_name, - BASE_WEIGHTS_PATH + file_name, - cache_subdir="models", - file_hash=file_hash) - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - return model +@keras_export( + "keras.applications.convnext.ConvNeXtLarge", + "keras.applications.ConvNeXtLarge", +) +def ConvNeXtLarge( + model_name="convnext_large", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return ConvNeXt( + depths=MODEL_CONFIGS["large"]["depths"], + projection_dims=MODEL_CONFIGS["large"]["projection_dims"], + drop_path_rate=0.0, + layer_scale_init_value=1e-6, + default_size=MODEL_CONFIGS["large"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) -## Instantiating variants ## -@keras_export("keras.applications.convnext.ConvNeXtTiny", - "keras.applications.ConvNeXtTiny") -def ConvNeXtTiny(model_name="convnext_tiny", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return ConvNeXt( - depths=MODEL_CONFIGS["tiny"]["depths"], - projection_dims=MODEL_CONFIGS["tiny"]["projection_dims"], - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - default_size=MODEL_CONFIGS["tiny"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.convnext.ConvNeXtSmall", - "keras.applications.ConvNeXtSmall") -def ConvNeXtSmall(model_name="convnext_small", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return ConvNeXt( - depths=MODEL_CONFIGS["small"]["depths"], - projection_dims=MODEL_CONFIGS["small"]["projection_dims"], - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - default_size=MODEL_CONFIGS["small"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.convnext.ConvNeXtBase", - "keras.applications.ConvNeXtBase") -def ConvNeXtBase(model_name="convnext_base", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return ConvNeXt( - depths=MODEL_CONFIGS["base"]["depths"], - projection_dims=MODEL_CONFIGS["base"]["projection_dims"], - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - default_size=MODEL_CONFIGS["base"]["default_size"], - model_name=model_name, - 
include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.convnext.ConvNeXtLarge", - "keras.applications.ConvNeXtLarge") -def ConvNeXtLarge(model_name="convnext_large", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return ConvNeXt( - depths=MODEL_CONFIGS["large"]["depths"], - projection_dims=MODEL_CONFIGS["large"]["projection_dims"], - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - default_size=MODEL_CONFIGS["large"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.convnext.ConvNeXtXLarge", - "keras.applications.ConvNeXtXLarge") -def ConvNeXtXLarge(model_name="convnext_xlarge", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return ConvNeXt( - depths=MODEL_CONFIGS["xlarge"]["depths"], - projection_dims=MODEL_CONFIGS["xlarge"]["projection_dims"], - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - default_size=MODEL_CONFIGS["xlarge"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) +@keras_export( + "keras.applications.convnext.ConvNeXtXLarge", + "keras.applications.ConvNeXtXLarge", +) +def ConvNeXtXLarge( + model_name="convnext_xlarge", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return ConvNeXt( + depths=MODEL_CONFIGS["xlarge"]["depths"], + projection_dims=MODEL_CONFIGS["xlarge"]["projection_dims"], + drop_path_rate=0.0, + layer_scale_init_value=1e-6, + default_size=MODEL_CONFIGS["xlarge"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) ConvNeXtTiny.__doc__ = BASE_DOCSTRING.format(name="ConvNeXtTiny") @@ -654,30 +745,30 @@ def ConvNeXtXLarge(model_name="convnext_xlarge", @keras_export("keras.applications.convnext.preprocess_input") -def preprocess_input(x, data_format=None): # pylint: disable=unused-argument - """A placeholder method for backward compatibility. - - The preprocessing logic has been included in the efficientnet model - implementation. Users are no longer required to call this method to normalize - the input data. This method does nothing and only kept as a placeholder to - align the API surface between old and new version of model. - - Args: - x: A floating point `numpy.array` or a `tf.Tensor`. - data_format: Optional data format of the image tensor/array. 
Defaults to - None, in which case the global setting - `tf.keras.backend.image_data_format()` is used (unless you changed it, it - defaults to "channels_last").{mode} - - Returns: - Unchanged `numpy.array` or `tf.Tensor`. - """ - return x +def preprocess_input(x, data_format=None): + """A placeholder method for backward compatibility. + + The preprocessing logic has been included in the convnext model + implementation. Users are no longer required to call this method to + normalize the input data. This method does nothing and is only kept as a + placeholder to align the API surface between the old and new versions of + the model. + + Args: + x: A floating point `numpy.array` or a `tf.Tensor`. + data_format: Optional data format of the image tensor/array. `None` means + the global setting `tf.keras.backend.image_data_format()` is used + (unless you changed it, it uses "channels_last"). + Defaults to `None`. + + Returns: + Unchanged `numpy.array` or `tf.Tensor`. + """ + return x @keras_export("keras.applications.convnext.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/densenet.py b/keras/applications/densenet.py index e32066036487..57372d6a123e 100644 --- a/keras/applications/densenet.py +++ b/keras/applications/densenet.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """DenseNet models for Keras. Reference: @@ -28,356 +28,412 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHTS_PATH = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/densenet/') +BASE_WEIGHTS_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/densenet/" +) DENSENET121_WEIGHT_PATH = ( - BASE_WEIGHTS_PATH + 'densenet121_weights_tf_dim_ordering_tf_kernels.h5') + BASE_WEIGHTS_PATH + "densenet121_weights_tf_dim_ordering_tf_kernels.h5" +) DENSENET121_WEIGHT_PATH_NO_TOP = ( - BASE_WEIGHTS_PATH + - 'densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5') + BASE_WEIGHTS_PATH + + "densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5" +) DENSENET169_WEIGHT_PATH = ( - BASE_WEIGHTS_PATH + 'densenet169_weights_tf_dim_ordering_tf_kernels.h5') + BASE_WEIGHTS_PATH + "densenet169_weights_tf_dim_ordering_tf_kernels.h5" +) DENSENET169_WEIGHT_PATH_NO_TOP = ( - BASE_WEIGHTS_PATH + - 'densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5') + BASE_WEIGHTS_PATH + + "densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5" +) DENSENET201_WEIGHT_PATH = ( - BASE_WEIGHTS_PATH + 'densenet201_weights_tf_dim_ordering_tf_kernels.h5') + BASE_WEIGHTS_PATH + "densenet201_weights_tf_dim_ordering_tf_kernels.h5" +) DENSENET201_WEIGHT_PATH_NO_TOP = ( - BASE_WEIGHTS_PATH + - 'densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5') + BASE_WEIGHTS_PATH + + "densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5" +) layers = VersionAwareLayers() def dense_block(x, blocks, name): - """A dense block. + """A dense block. - Args: - x: input tensor. - blocks: integer, the number of building blocks.
- name: string, block label. + Args: + x: input tensor. + blocks: integer, the number of building blocks. + name: string, block label. - Returns: - Output tensor for the block. - """ - for i in range(blocks): - x = conv_block(x, 32, name=name + '_block' + str(i + 1)) - return x + Returns: + Output tensor for the block. + """ + for i in range(blocks): + x = conv_block(x, 32, name=name + "_block" + str(i + 1)) + return x def transition_block(x, reduction, name): - """A transition block. - - Args: - x: input tensor. - reduction: float, compression rate at transition layers. - name: string, block label. - - Returns: - output tensor for the block. - """ - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_bn')( - x) - x = layers.Activation('relu', name=name + '_relu')(x) - x = layers.Conv2D( - int(backend.int_shape(x)[bn_axis] * reduction), - 1, - use_bias=False, - name=name + '_conv')( - x) - x = layers.AveragePooling2D(2, strides=2, name=name + '_pool')(x) - return x + """A transition block. + + Args: + x: input tensor. + reduction: float, compression rate at transition layers. + name: string, block label. + + Returns: + output tensor for the block. + """ + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_bn" + )(x) + x = layers.Activation("relu", name=name + "_relu")(x) + x = layers.Conv2D( + int(backend.int_shape(x)[bn_axis] * reduction), + 1, + use_bias=False, + name=name + "_conv", + )(x) + x = layers.AveragePooling2D(2, strides=2, name=name + "_pool")(x) + return x def conv_block(x, growth_rate, name): - """A building block for a dense block. - - Args: - x: input tensor. - growth_rate: float, growth rate at dense layers. - name: string, block label. - - Returns: - Output tensor for the block. - """ - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - x1 = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_0_bn')( - x) - x1 = layers.Activation('relu', name=name + '_0_relu')(x1) - x1 = layers.Conv2D( - 4 * growth_rate, 1, use_bias=False, name=name + '_1_conv')( - x1) - x1 = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')( - x1) - x1 = layers.Activation('relu', name=name + '_1_relu')(x1) - x1 = layers.Conv2D( - growth_rate, 3, padding='same', use_bias=False, name=name + '_2_conv')( - x1) - x = layers.Concatenate(axis=bn_axis, name=name + '_concat')([x, x1]) - return x + """A building block for a dense block. + + Args: + x: input tensor. + growth_rate: float, growth rate at dense layers. + name: string, block label. + + Returns: + Output tensor for the block. 
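+
+    Editor's note (illustrative addition): with `growth_rate=32`, an input
+    of shape `(batch, h, w, c)` comes back as `(batch, h, w, c + 32)`;
+    every call concatenates 32 new feature maps onto the running tensor,
+    which is what makes the enclosing block "dense".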
+ """ + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + x1 = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_0_bn" + )(x) + x1 = layers.Activation("relu", name=name + "_0_relu")(x1) + x1 = layers.Conv2D( + 4 * growth_rate, 1, use_bias=False, name=name + "_1_conv" + )(x1) + x1 = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_1_bn" + )(x1) + x1 = layers.Activation("relu", name=name + "_1_relu")(x1) + x1 = layers.Conv2D( + growth_rate, 3, padding="same", use_bias=False, name=name + "_2_conv" + )(x1) + x = layers.Concatenate(axis=bn_axis, name=name + "_concat")([x, x1]) + return x def DenseNet( blocks, include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the DenseNet architecture. - - Reference: - - [Densely Connected Convolutional Networks]( - https://arxiv.org/abs/1608.06993) (CVPR 2017) - - This function returns a Keras image classification model, - optionally loaded with weights pre-trained on ImageNet. - - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - Note: each Keras Application expects a specific kind of input preprocessing. - For DenseNet, call `tf.keras.applications.densenet.preprocess_input` on your - inputs before passing them to the model. - `densenet.preprocess_input` will scale pixels between 0 and 1 and then - will normalize each channel with respect to the ImageNet dataset statistics. + classifier_activation="softmax", +): + """Instantiates the DenseNet architecture. + + Reference: + - [Densely Connected Convolutional Networks]( + https://arxiv.org/abs/1608.06993) (CVPR 2017) + + This function returns a Keras image classification model, + optionally loaded with weights pre-trained on ImageNet. + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + Note: each Keras Application expects a specific kind of input preprocessing. + For DenseNet, call `tf.keras.applications.densenet.preprocess_input` on your + inputs before passing them to the model. + `densenet.preprocess_input` will scale pixels between 0 and 1 and then + will normalize each channel with respect to the ImageNet dataset statistics. + + Args: + blocks: numbers of building blocks for the four dense layers. + include_top: whether to include the fully-connected + layer at the top of the network. + weights: one of `None` (random initialization), + 'imagenet' (pre-training on ImageNet), + or the path to the weights file to be loaded. + input_tensor: optional Keras tensor + (i.e. output of `layers.Input()`) + to use as image input for the model. + input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(224, 224, 3)` (with `'channels_last'` data format) + or `(3, 224, 224)` (with `'channels_first'` data format). + It should have exactly 3 inputs channels, + and width and height should be no smaller than 32. + E.g. 
`(200, 200, 3)` would be one valid value. + pooling: optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model will be + the 4D tensor output of the + last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will + be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + + Returns: + A `keras.Model` instance. + """ + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top`' + " as true, `classes` should be 1000" + ) + + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=224, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) + else: + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + x = layers.ZeroPadding2D(padding=((3, 3), (3, 3)))(img_input) + x = layers.Conv2D(64, 7, strides=2, use_bias=False, name="conv1/conv")(x) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name="conv1/bn" + )(x) + x = layers.Activation("relu", name="conv1/relu")(x) + x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)))(x) + x = layers.MaxPooling2D(3, strides=2, name="pool1")(x) + + x = dense_block(x, blocks[0], name="conv2") + x = transition_block(x, 0.5, name="pool2") + x = dense_block(x, blocks[1], name="conv3") + x = transition_block(x, 0.5, name="pool3") + x = dense_block(x, blocks[2], name="conv4") + x = transition_block(x, 0.5, name="pool4") + x = dense_block(x, blocks[3], name="conv5") + + x = layers.BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name="bn")(x) + x = layers.Activation("relu", name="relu")(x) - Args: - blocks: numbers of building blocks for the four dense layers. - include_top: whether to include the fully-connected - layer at the top of the network. - weights: one of `None` (random initialization), - 'imagenet' (pre-training on ImageNet), - or the path to the weights file to be loaded. - input_tensor: optional Keras tensor - (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(224, 224, 3)` (with `'channels_last'` data format) - or `(3, 224, 224)` (with `'channels_first'` data format). 
- It should have exactly 3 inputs channels, - and width and height should be no smaller than 32. - E.g. `(200, 200, 3)` would be one valid value. - pooling: optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model will be - the 4D tensor output of the - last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a 2D tensor. - - `max` means that global max pooling will - be applied. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. + if include_top: + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) - Returns: - A `keras.Model` instance. - """ - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded.') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top`' - ' as true, `classes` should be 1000') - - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=224, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) else: - img_input = input_tensor - - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - - x = layers.ZeroPadding2D(padding=((3, 3), (3, 3)))(img_input) - x = layers.Conv2D(64, 7, strides=2, use_bias=False, name='conv1/conv')(x) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name='conv1/bn')( - x) - x = layers.Activation('relu', name='conv1/relu')(x) - x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)))(x) - x = layers.MaxPooling2D(3, strides=2, name='pool1')(x) - - x = dense_block(x, blocks[0], name='conv2') - x = transition_block(x, 0.5, name='pool2') - x = dense_block(x, blocks[1], name='conv3') - x = transition_block(x, 0.5, name='pool3') - x = dense_block(x, blocks[2], name='conv4') - x = transition_block(x, 0.5, name='pool4') - x = dense_block(x, blocks[3], name='conv5') - - x = layers.BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='bn')(x) - x = layers.Activation('relu', name='relu')(x) - - if include_top: - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D(name='max_pool')(x) - - # Ensure 
that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - if blocks == [6, 12, 24, 16]: - model = training.Model(inputs, x, name='densenet121') - elif blocks == [6, 12, 32, 32]: - model = training.Model(inputs, x, name='densenet169') - elif blocks == [6, 12, 48, 32]: - model = training.Model(inputs, x, name='densenet201') - else: - model = training.Model(inputs, x, name='densenet') - - # Load weights. - if weights == 'imagenet': - if include_top: - if blocks == [6, 12, 24, 16]: - weights_path = data_utils.get_file( - 'densenet121_weights_tf_dim_ordering_tf_kernels.h5', - DENSENET121_WEIGHT_PATH, - cache_subdir='models', - file_hash='9d60b8095a5708f2dcce2bca79d332c7') - elif blocks == [6, 12, 32, 32]: - weights_path = data_utils.get_file( - 'densenet169_weights_tf_dim_ordering_tf_kernels.h5', - DENSENET169_WEIGHT_PATH, - cache_subdir='models', - file_hash='d699b8f76981ab1b30698df4c175e90b') - elif blocks == [6, 12, 48, 32]: - weights_path = data_utils.get_file( - 'densenet201_weights_tf_dim_ordering_tf_kernels.h5', - DENSENET201_WEIGHT_PATH, - cache_subdir='models', - file_hash='1ceb130c1ea1b78c3bf6114dbdfd8807') + if pooling == "avg": + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D(name="max_pool")(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) else: - if blocks == [6, 12, 24, 16]: - weights_path = data_utils.get_file( - 'densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5', - DENSENET121_WEIGHT_PATH_NO_TOP, - cache_subdir='models', - file_hash='30ee3e1110167f948a6b9946edeeb738') - elif blocks == [6, 12, 32, 32]: - weights_path = data_utils.get_file( - 'densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5', - DENSENET169_WEIGHT_PATH_NO_TOP, - cache_subdir='models', - file_hash='b8c4d4c20dd625c148057b9ff1c1176b') - elif blocks == [6, 12, 48, 32]: - weights_path = data_utils.get_file( - 'densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5', - DENSENET201_WEIGHT_PATH_NO_TOP, - cache_subdir='models', - file_hash='c13680b51ded0fb44dff2d8f86ac8bb1') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model - - -@keras_export('keras.applications.densenet.DenseNet121', - 'keras.applications.DenseNet121') -def DenseNet121(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax'): - """Instantiates the Densenet121 architecture.""" - return DenseNet([6, 12, 24, 16], include_top, weights, input_tensor, - input_shape, pooling, classes, classifier_activation) - - -@keras_export('keras.applications.densenet.DenseNet169', - 'keras.applications.DenseNet169') -def DenseNet169(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax'): - """Instantiates the Densenet169 architecture.""" - return DenseNet([6, 12, 32, 32], include_top, weights, input_tensor, - input_shape, pooling, classes, classifier_activation) - - -@keras_export('keras.applications.densenet.DenseNet201', - 'keras.applications.DenseNet201') -def DenseNet201(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - 
pooling=None, - classes=1000, - classifier_activation='softmax'): - """Instantiates the Densenet201 architecture.""" - return DenseNet([6, 12, 48, 32], include_top, weights, input_tensor, - input_shape, pooling, classes, classifier_activation) - - -@keras_export('keras.applications.densenet.preprocess_input') + inputs = img_input + + # Create model. + if blocks == [6, 12, 24, 16]: + model = training.Model(inputs, x, name="densenet121") + elif blocks == [6, 12, 32, 32]: + model = training.Model(inputs, x, name="densenet169") + elif blocks == [6, 12, 48, 32]: + model = training.Model(inputs, x, name="densenet201") + else: + model = training.Model(inputs, x, name="densenet") + + # Load weights. + if weights == "imagenet": + if include_top: + if blocks == [6, 12, 24, 16]: + weights_path = data_utils.get_file( + "densenet121_weights_tf_dim_ordering_tf_kernels.h5", + DENSENET121_WEIGHT_PATH, + cache_subdir="models", + file_hash="9d60b8095a5708f2dcce2bca79d332c7", + ) + elif blocks == [6, 12, 32, 32]: + weights_path = data_utils.get_file( + "densenet169_weights_tf_dim_ordering_tf_kernels.h5", + DENSENET169_WEIGHT_PATH, + cache_subdir="models", + file_hash="d699b8f76981ab1b30698df4c175e90b", + ) + elif blocks == [6, 12, 48, 32]: + weights_path = data_utils.get_file( + "densenet201_weights_tf_dim_ordering_tf_kernels.h5", + DENSENET201_WEIGHT_PATH, + cache_subdir="models", + file_hash="1ceb130c1ea1b78c3bf6114dbdfd8807", + ) + else: + if blocks == [6, 12, 24, 16]: + weights_path = data_utils.get_file( + "densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5", + DENSENET121_WEIGHT_PATH_NO_TOP, + cache_subdir="models", + file_hash="30ee3e1110167f948a6b9946edeeb738", + ) + elif blocks == [6, 12, 32, 32]: + weights_path = data_utils.get_file( + "densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5", + DENSENET169_WEIGHT_PATH_NO_TOP, + cache_subdir="models", + file_hash="b8c4d4c20dd625c148057b9ff1c1176b", + ) + elif blocks == [6, 12, 48, 32]: + weights_path = data_utils.get_file( + "densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5", + DENSENET201_WEIGHT_PATH_NO_TOP, + cache_subdir="models", + file_hash="c13680b51ded0fb44dff2d8f86ac8bb1", + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model + + +@keras_export( + "keras.applications.densenet.DenseNet121", "keras.applications.DenseNet121" +) +def DenseNet121( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + """Instantiates the Densenet121 architecture.""" + return DenseNet( + [6, 12, 24, 16], + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation, + ) + + +@keras_export( + "keras.applications.densenet.DenseNet169", "keras.applications.DenseNet169" +) +def DenseNet169( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + """Instantiates the Densenet169 architecture.""" + return DenseNet( + [6, 12, 32, 32], + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation, + ) + + +@keras_export( + "keras.applications.densenet.DenseNet201", "keras.applications.DenseNet201" +) +def DenseNet201( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + """Instantiates the Densenet201 architecture.""" + 
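The three wrappers around this hunk differ only in their `blocks` lists, and those lists account for the 121/169/201 suffixes. A quick standalone check of that arithmetic (one stem conv, two convs per conv_block, three transition convs, one classifier; the counting convention is the usual one and is not stated in this diff):

for blocks, name in [
    ([6, 12, 24, 16], "densenet121"),
    ([6, 12, 32, 32], "densenet169"),
    ([6, 12, 48, 32], "densenet201"),
]:
    # stem conv + 2 convs per conv_block + 3 transition convs + classifier
    depth = 1 + 2 * sum(blocks) + 3 + 1
    print(name, depth)  # densenet121 121, densenet169 169, densenet201 201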
return DenseNet( + [6, 12, 48, 32], + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation, + ) + + +@keras_export("keras.applications.densenet.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input( - x, data_format=data_format, mode='torch') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="torch" + ) -@keras_export('keras.applications.densenet.decode_predictions') +@keras_export("keras.applications.densenet.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TORCH, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ DOC = """ @@ -433,6 +489,6 @@ def decode_predictions(preds, top=5): A Keras model instance. """ -setattr(DenseNet121, '__doc__', DenseNet121.__doc__ + DOC) -setattr(DenseNet169, '__doc__', DenseNet169.__doc__ + DOC) -setattr(DenseNet201, '__doc__', DenseNet201.__doc__ + DOC) +setattr(DenseNet121, "__doc__", DenseNet121.__doc__ + DOC) +setattr(DenseNet169, "__doc__", DenseNet169.__doc__ + DOC) +setattr(DenseNet201, "__doc__", DenseNet201.__doc__ + DOC) diff --git a/keras/applications/efficientnet.py b/keras/applications/efficientnet.py index f615ff278761..a7d9639eb5f5 100644 --- a/keras/applications/efficientnet.py +++ b/keras/applications/efficientnet.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=missing-docstring + + """EfficientNet models for Keras. 
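Before the diff moves on to efficientnet.py: DenseNet's `preprocess_input` above delegates to `imagenet_utils.preprocess_input` with `mode="torch"`, i.e. scale to [0, 1] and then normalize each channel with ImageNet statistics. A rough NumPy equivalent, assuming channels-last input; the mean/std constants are the usual torch-mode values, quoted from memory rather than from this diff:

import numpy as np

def torch_mode_preprocess(x):
    # Scale to [0, 1], then normalize each channel with ImageNet mean/std.
    x = np.asarray(x, dtype="float32") / 255.0
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    return (x - mean) / std

batch = np.random.randint(0, 256, size=(1, 224, 224, 3))
print(torch_mode_preprocess(batch).shape)  # (1, 224, 224, 3)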
Reference: @@ -24,6 +24,8 @@ import copy import math +import tensorflow.compat.v2 as tf + from keras import backend from keras.applications import imagenet_utils from keras.engine import training @@ -31,113 +33,135 @@ from keras.utils import data_utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf - +# isort: off from tensorflow.python.util.tf_export import keras_export - -BASE_WEIGHTS_PATH = 'https://storage.googleapis.com/keras-applications/' +BASE_WEIGHTS_PATH = "https://storage.googleapis.com/keras-applications/" WEIGHTS_HASHES = { - 'b0': ('902e53a9f72be733fc0bcb005b3ebbac', - '50bc09e76180e00e4465e1a485ddc09d'), - 'b1': ('1d254153d4ab51201f1646940f018540', - '74c4e6b3e1f6a1eea24c589628592432'), - 'b2': ('b15cce36ff4dcbd00b6dd88e7857a6ad', - '111f8e2ac8aa800a7a99e3239f7bfb39'), - 'b3': ('ffd1fdc53d0ce67064dc6a9c7960ede0', - 'af6d107764bb5b1abb91932881670226'), - 'b4': ('18c95ad55216b8f92d7e70b3a046e2fc', - 'ebc24e6d6c33eaebbd558eafbeedf1ba'), - 'b5': ('ace28f2a6363774853a83a0b21b9421a', - '38879255a25d3c92d5e44e04ae6cec6f'), - 'b6': ('165f6e37dce68623721b423839de8be5', - '9ecce42647a20130c1f39a5d4cb75743'), - 'b7': ('8c03f828fec3ef71311cd463b6759d99', - 'cbcfe4450ddf6f3ad90b1b398090fe4a'), + "b0": ( + "902e53a9f72be733fc0bcb005b3ebbac", + "50bc09e76180e00e4465e1a485ddc09d", + ), + "b1": ( + "1d254153d4ab51201f1646940f018540", + "74c4e6b3e1f6a1eea24c589628592432", + ), + "b2": ( + "b15cce36ff4dcbd00b6dd88e7857a6ad", + "111f8e2ac8aa800a7a99e3239f7bfb39", + ), + "b3": ( + "ffd1fdc53d0ce67064dc6a9c7960ede0", + "af6d107764bb5b1abb91932881670226", + ), + "b4": ( + "18c95ad55216b8f92d7e70b3a046e2fc", + "ebc24e6d6c33eaebbd558eafbeedf1ba", + ), + "b5": ( + "ace28f2a6363774853a83a0b21b9421a", + "38879255a25d3c92d5e44e04ae6cec6f", + ), + "b6": ( + "165f6e37dce68623721b423839de8be5", + "9ecce42647a20130c1f39a5d4cb75743", + ), + "b7": ( + "8c03f828fec3ef71311cd463b6759d99", + "cbcfe4450ddf6f3ad90b1b398090fe4a", + ), } -DEFAULT_BLOCKS_ARGS = [{ - 'kernel_size': 3, - 'repeats': 1, - 'filters_in': 32, - 'filters_out': 16, - 'expand_ratio': 1, - 'id_skip': True, - 'strides': 1, - 'se_ratio': 0.25 -}, { - 'kernel_size': 3, - 'repeats': 2, - 'filters_in': 16, - 'filters_out': 24, - 'expand_ratio': 6, - 'id_skip': True, - 'strides': 2, - 'se_ratio': 0.25 -}, { - 'kernel_size': 5, - 'repeats': 2, - 'filters_in': 24, - 'filters_out': 40, - 'expand_ratio': 6, - 'id_skip': True, - 'strides': 2, - 'se_ratio': 0.25 -}, { - 'kernel_size': 3, - 'repeats': 3, - 'filters_in': 40, - 'filters_out': 80, - 'expand_ratio': 6, - 'id_skip': True, - 'strides': 2, - 'se_ratio': 0.25 -}, { - 'kernel_size': 5, - 'repeats': 3, - 'filters_in': 80, - 'filters_out': 112, - 'expand_ratio': 6, - 'id_skip': True, - 'strides': 1, - 'se_ratio': 0.25 -}, { - 'kernel_size': 5, - 'repeats': 4, - 'filters_in': 112, - 'filters_out': 192, - 'expand_ratio': 6, - 'id_skip': True, - 'strides': 2, - 'se_ratio': 0.25 -}, { - 'kernel_size': 3, - 'repeats': 1, - 'filters_in': 192, - 'filters_out': 320, - 'expand_ratio': 6, - 'id_skip': True, - 'strides': 1, - 'se_ratio': 0.25 -}] +DEFAULT_BLOCKS_ARGS = [ + { + "kernel_size": 3, + "repeats": 1, + "filters_in": 32, + "filters_out": 16, + "expand_ratio": 1, + "id_skip": True, + "strides": 1, + "se_ratio": 0.25, + }, + { + "kernel_size": 3, + "repeats": 2, + "filters_in": 16, + "filters_out": 24, + "expand_ratio": 6, + "id_skip": True, + "strides": 2, + "se_ratio": 0.25, + }, + { + "kernel_size": 5, + "repeats": 2, + "filters_in": 24, + "filters_out": 40, + "expand_ratio": 6, 
+ "id_skip": True, + "strides": 2, + "se_ratio": 0.25, + }, + { + "kernel_size": 3, + "repeats": 3, + "filters_in": 40, + "filters_out": 80, + "expand_ratio": 6, + "id_skip": True, + "strides": 2, + "se_ratio": 0.25, + }, + { + "kernel_size": 5, + "repeats": 3, + "filters_in": 80, + "filters_out": 112, + "expand_ratio": 6, + "id_skip": True, + "strides": 1, + "se_ratio": 0.25, + }, + { + "kernel_size": 5, + "repeats": 4, + "filters_in": 112, + "filters_out": 192, + "expand_ratio": 6, + "id_skip": True, + "strides": 2, + "se_ratio": 0.25, + }, + { + "kernel_size": 3, + "repeats": 1, + "filters_in": 192, + "filters_out": 320, + "expand_ratio": 6, + "id_skip": True, + "strides": 1, + "se_ratio": 0.25, + }, +] CONV_KERNEL_INITIALIZER = { - 'class_name': 'VarianceScaling', - 'config': { - 'scale': 2.0, - 'mode': 'fan_out', - 'distribution': 'truncated_normal' - } + "class_name": "VarianceScaling", + "config": { + "scale": 2.0, + "mode": "fan_out", + "distribution": "truncated_normal", + }, } DENSE_KERNEL_INITIALIZER = { - 'class_name': 'VarianceScaling', - 'config': { - 'scale': 1. / 3., - 'mode': 'fan_out', - 'distribution': 'uniform' - } + "class_name": "VarianceScaling", + "config": { + "scale": 1.0 / 3.0, + "mode": "fan_out", + "distribution": "uniform", + }, } layers = VersionAwareLayers() @@ -168,7 +192,7 @@ Args: include_top: Whether to include the fully-connected - layer at the top of the network. Defaults to True. + layer at the top of the network. Defaults to `True`. weights: One of `None` (random initialization), 'imagenet' (pre-training on ImageNet), or the path to the weights file to be loaded. Defaults to 'imagenet'. @@ -179,7 +203,7 @@ if `include_top` is False. It should have exactly 3 inputs channels. pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. Defaults to None. + when `include_top` is `False`. Defaults to `None`. - `None` means that the output of the model will be the 4D tensor output of the last convolutional layer. @@ -191,8 +215,8 @@ be applied. classes: Optional number of classes to classify images into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. Defaults to 1000 (number of - ImageNet classes). + if no `weights` argument is specified. 1000 is how many + ImageNet classes there are. Defaults to `1000`. classifier_activation: A `str` or callable. The activation function to use on the "top" layer. Ignored unless `include_top=True`. Set `classifier_activation=None` to return the logits of the "top" layer. @@ -215,569 +239,633 @@ def EfficientNet( dropout_rate=0.2, drop_connect_rate=0.2, depth_divisor=8, - activation='swish', - blocks_args='default', - model_name='efficientnet', + activation="swish", + blocks_args="default", + model_name="efficientnet", include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the EfficientNet architecture using given scaling coefficients. - - Args: - width_coefficient: float, scaling coefficient for network width. - depth_coefficient: float, scaling coefficient for network depth. - default_size: integer, default input image size. - dropout_rate: float, dropout rate before final classifier layer. - drop_connect_rate: float, dropout rate at skip connections. - depth_divisor: integer, a unit of network width. - activation: activation function. - blocks_args: list of dicts, parameters to construct block modules. 
- model_name: string, model name. - include_top: whether to include the fully-connected - layer at the top of the network. - weights: one of `None` (random initialization), - 'imagenet' (pre-training on ImageNet), - or the path to the weights file to be loaded. - input_tensor: optional Keras tensor - (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple, only to be specified - if `include_top` is False. - It should have exactly 3 inputs channels. - pooling: optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model will be - the 4D tensor output of the - last convolutional layer. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional layer, and thus - the output of the model will be a 2D tensor. - - `max` means that global max pooling will - be applied. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - - Returns: - A `keras.Model` instance. - - Raises: - ValueError: in case of invalid argument for `weights`, - or invalid input shape. - ValueError: if `classifier_activation` is not `softmax` or `None` when - using a pretrained top layer. - """ - if blocks_args == 'default': - blocks_args = DEFAULT_BLOCKS_ARGS - - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded.') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top`' - ' as true, `classes` should be 1000') - - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=default_size, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - - def round_filters(filters, divisor=depth_divisor): - """Round number of filters based on depth multiplier.""" - filters *= width_coefficient - new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_filters < 0.9 * filters: - new_filters += divisor - return int(new_filters) - - def round_repeats(repeats): - """Round number of repeats based on depth multiplier.""" - return int(math.ceil(depth_coefficient * repeats)) - - # Build stem - x = img_input - x = layers.Rescaling(1. 
/ 255.)(x) - x = layers.Normalization(axis=bn_axis)(x) - if weights == 'imagenet': - # Note that the normaliztion layer uses square value of STDDEV as the - # variance for the layer: result = (input - mean) / sqrt(var) - # However, the orginal implemenetation uses (input - mean) / var to - # normalize the input, we need to divide another sqrt(var) to match the - # original implementation. - # See https://github.com/tensorflow/tensorflow/issues/49930 for more details - x = layers.Rescaling(1. / tf.math.sqrt(IMAGENET_STDDEV_RGB))(x) - - x = layers.ZeroPadding2D( - padding=imagenet_utils.correct_pad(x, 3), - name='stem_conv_pad')(x) - x = layers.Conv2D( - round_filters(32), - 3, - strides=2, - padding='valid', - use_bias=False, - kernel_initializer=CONV_KERNEL_INITIALIZER, - name='stem_conv')(x) - x = layers.BatchNormalization(axis=bn_axis, name='stem_bn')(x) - x = layers.Activation(activation, name='stem_activation')(x) - - # Build blocks - blocks_args = copy.deepcopy(blocks_args) - - b = 0 - blocks = float(sum(round_repeats(args['repeats']) for args in blocks_args)) - for (i, args) in enumerate(blocks_args): - assert args['repeats'] > 0 - # Update block input and output filters based on depth multiplier. - args['filters_in'] = round_filters(args['filters_in']) - args['filters_out'] = round_filters(args['filters_out']) - - for j in range(round_repeats(args.pop('repeats'))): - # The first block needs to take care of stride and filter size increase. - if j > 0: - args['strides'] = 1 - args['filters_in'] = args['filters_out'] - x = block( - x, - activation, - drop_connect_rate * b / blocks, - name='block{}{}_'.format(i + 1, chr(j + 97)), - **args) - b += 1 - - # Build top - x = layers.Conv2D( - round_filters(1280), - 1, - padding='same', - use_bias=False, - kernel_initializer=CONV_KERNEL_INITIALIZER, - name='top_conv')(x) - x = layers.BatchNormalization(axis=bn_axis, name='top_bn')(x) - x = layers.Activation(activation, name='top_activation')(x) - if include_top: - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - if dropout_rate > 0: - x = layers.Dropout(dropout_rate, name='top_dropout')(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense( - classes, - activation=classifier_activation, - kernel_initializer=DENSE_KERNEL_INITIALIZER, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D(name='max_pool')(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = training.Model(inputs, x, name=model_name) - - # Load weights. - if weights == 'imagenet': - if include_top: - file_suffix = '.h5' - file_hash = WEIGHTS_HASHES[model_name[-2:]][0] - else: - file_suffix = '_notop.h5' - file_hash = WEIGHTS_HASHES[model_name[-2:]][1] - file_name = model_name + file_suffix - weights_path = data_utils.get_file( - file_name, - BASE_WEIGHTS_PATH + file_name, - cache_subdir='models', - file_hash=file_hash) - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - return model - - -def block(inputs, - activation='swish', - drop_rate=0., - name='', - filters_in=32, - filters_out=16, - kernel_size=3, - strides=1, - expand_ratio=1, - se_ratio=0., - id_skip=True): - """An inverted residual block. - - Args: - inputs: input tensor. 
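One behavior worth noting while this builder is rewritten: each block is handed `drop_connect_rate * b / blocks`, so the stochastic-depth drop rate ramps linearly from 0 at the first block toward `drop_connect_rate` at the last. A standalone sketch of that schedule, using B0's 16 blocks and the default rate of 0.2:

drop_connect_rate = 0.2
total_blocks = 16  # sum of B0's per-stage repeats
rates = [drop_connect_rate * b / total_blocks for b in range(total_blocks)]
print(rates[0], rates[-1])  # 0.0 and 0.1875; the last block never reaches 0.2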
+ classifier_activation="softmax", +): + """Instantiates the EfficientNet architecture. + + Args: + width_coefficient: float, scaling coefficient for network width. + depth_coefficient: float, scaling coefficient for network depth. + default_size: integer, default input image size. + dropout_rate: float, dropout rate before final classifier layer. + drop_connect_rate: float, dropout rate at skip connections. + depth_divisor: integer, a unit of network width. activation: activation function. - drop_rate: float between 0 and 1, fraction of the input units to drop. - name: string, block label. - filters_in: integer, the number of input filters. - filters_out: integer, the number of output filters. - kernel_size: integer, the dimension of the convolution window. - strides: integer, the stride of the convolution. - expand_ratio: integer, scaling coefficient for the input filters. - se_ratio: float between 0 and 1, fraction to squeeze the input filters. - id_skip: boolean. - - Returns: - output tensor for the block. - """ - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 + blocks_args: list of dicts, parameters to construct block modules. + model_name: string, model name. + include_top: whether to include the fully-connected + layer at the top of the network. + weights: one of `None` (random initialization), + 'imagenet' (pre-training on ImageNet), + or the path to the weights file to be loaded. + input_tensor: optional Keras tensor + (i.e. output of `layers.Input()`) + to use as image input for the model. + input_shape: optional shape tuple, only to be specified + if `include_top` is False. + It should have exactly 3 inputs channels. + pooling: optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model will be + the 4D tensor output of the + last convolutional layer. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional layer, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will + be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + + Returns: + A `keras.Model` instance. + + Raises: + ValueError: in case of invalid argument for `weights`, + or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. + """ + if blocks_args == "default": + blocks_args = DEFAULT_BLOCKS_ARGS + + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." 
+        )
+
+    if weights == "imagenet" and include_top and classes != 1000:
+        raise ValueError(
+            'If using `weights` as `"imagenet"` with `include_top`'
+            " as true, `classes` should be 1000"
+        )
+
+    # Determine proper input shape
+    input_shape = imagenet_utils.obtain_input_shape(
+        input_shape,
+        default_size=default_size,
+        min_size=32,
+        data_format=backend.image_data_format(),
+        require_flatten=include_top,
+        weights=weights,
+    )
+
+    if input_tensor is None:
+        img_input = layers.Input(shape=input_shape)
+    else:
+        if not backend.is_keras_tensor(input_tensor):
+            img_input = layers.Input(tensor=input_tensor, shape=input_shape)
+        else:
+            img_input = input_tensor
+
+    bn_axis = 3 if backend.image_data_format() == "channels_last" else 1
+
+    def round_filters(filters, divisor=depth_divisor):
+        """Round number of filters based on depth multiplier."""
+        filters *= width_coefficient
+        new_filters = max(
+            divisor, int(filters + divisor / 2) // divisor * divisor
+        )
+        # Make sure that round down does not go down by more than 10%.
+        if new_filters < 0.9 * filters:
+            new_filters += divisor
+        return int(new_filters)
+
+    def round_repeats(repeats):
+        """Round number of repeats based on depth multiplier."""
+        return int(math.ceil(depth_coefficient * repeats))
+
+    # Build stem
+    x = img_input
+    x = layers.Rescaling(1.0 / 255.0)(x)
+    x = layers.Normalization(axis=bn_axis)(x)
+    if weights == "imagenet":
+        # Note that the normalization layer uses square value of STDDEV as the
+        # variance for the layer: result = (input - mean) / sqrt(var)
+        # However, the original implementation uses (input - mean) / var to
+        # normalize the input, so we need to divide by another sqrt(var) to
+        # match the original implementation.
+        # See https://github.com/tensorflow/tensorflow/issues/49930 for more
+        # details
+        x = layers.Rescaling(
+            [1.0 / math.sqrt(stddev) for stddev in IMAGENET_STDDEV_RGB]
+        )(x)

-  # Expansion phase
-  filters = filters_in * expand_ratio
-  if expand_ratio != 1:
+    x = layers.ZeroPadding2D(
+        padding=imagenet_utils.correct_pad(x, 3), name="stem_conv_pad"
+    )(x)
     x = layers.Conv2D(
-        filters,
-        1,
-        padding='same',
+        round_filters(32),
+        3,
+        strides=2,
+        padding="valid",
         use_bias=False,
         kernel_initializer=CONV_KERNEL_INITIALIZER,
-        name=name + 'expand_conv')(
-            inputs)
-    x = layers.BatchNormalization(axis=bn_axis, name=name + 'expand_bn')(x)
-    x = layers.Activation(activation, name=name + 'expand_activation')(x)
-  else:
-    x = inputs
-
-  # Depthwise Convolution
-  if strides == 2:
-    x = layers.ZeroPadding2D(
-        padding=imagenet_utils.correct_pad(x, kernel_size),
-        name=name + 'dwconv_pad')(x)
-    conv_pad = 'valid'
-  else:
-    conv_pad = 'same'
-  x = layers.DepthwiseConv2D(
-      kernel_size,
-      strides=strides,
-      padding=conv_pad,
-      use_bias=False,
-      depthwise_initializer=CONV_KERNEL_INITIALIZER,
-      name=name + 'dwconv')(x)
-  x = layers.BatchNormalization(axis=bn_axis, name=name + 'bn')(x)
-  x = layers.Activation(activation, name=name + 'activation')(x)
-
-  # Squeeze and Excitation phase
-  if 0 < se_ratio <= 1:
-    filters_se = max(1, int(filters_in * se_ratio))
-    se = layers.GlobalAveragePooling2D(name=name + 'se_squeeze')(x)
-    if bn_axis == 1:
-      se_shape = (filters, 1, 1)
-    else:
-      se_shape = (1, 1, filters)
-    se = layers.Reshape(se_shape, name=name + 'se_reshape')(se)
-    se = layers.Conv2D(
-        filters_se,
+        name="stem_conv",
+    )(x)
+    x = layers.BatchNormalization(axis=bn_axis, name="stem_bn")(x)
+    x = layers.Activation(activation, name="stem_activation")(x)
+
+    # Build blocks
+    blocks_args = copy.deepcopy(blocks_args)
+
+    b = 
0 + blocks = float(sum(round_repeats(args["repeats"]) for args in blocks_args)) + for i, args in enumerate(blocks_args): + assert args["repeats"] > 0 + # Update block input and output filters based on depth multiplier. + args["filters_in"] = round_filters(args["filters_in"]) + args["filters_out"] = round_filters(args["filters_out"]) + + for j in range(round_repeats(args.pop("repeats"))): + # The first block needs to take care of stride and filter size + # increase. + if j > 0: + args["strides"] = 1 + args["filters_in"] = args["filters_out"] + x = block( + x, + activation, + drop_connect_rate * b / blocks, + name=f"block{i + 1}{chr(j + 97)}_", + **args, + ) + b += 1 + + # Build top + x = layers.Conv2D( + round_filters(1280), 1, - padding='same', - activation=activation, + padding="same", + use_bias=False, kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name + 'se_reduce')( - se) - se = layers.Conv2D( - filters, + name="top_conv", + )(x) + x = layers.BatchNormalization(axis=bn_axis, name="top_bn")(x) + x = layers.Activation(activation, name="top_activation")(x) + if include_top: + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + if dropout_rate > 0: + x = layers.Dropout(dropout_rate, name="top_dropout")(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, + activation=classifier_activation, + kernel_initializer=DENSE_KERNEL_INITIALIZER, + name="predictions", + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D(name="max_pool")(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + model = training.Model(inputs, x, name=model_name) + + # Load weights. + if weights == "imagenet": + if include_top: + file_suffix = ".h5" + file_hash = WEIGHTS_HASHES[model_name[-2:]][0] + else: + file_suffix = "_notop.h5" + file_hash = WEIGHTS_HASHES[model_name[-2:]][1] + file_name = model_name + file_suffix + weights_path = data_utils.get_file( + file_name, + BASE_WEIGHTS_PATH + file_name, + cache_subdir="models", + file_hash=file_hash, + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + return model + + +def block( + inputs, + activation="swish", + drop_rate=0.0, + name="", + filters_in=32, + filters_out=16, + kernel_size=3, + strides=1, + expand_ratio=1, + se_ratio=0.0, + id_skip=True, +): + """An inverted residual block. + + Args: + inputs: input tensor. + activation: activation function. + drop_rate: float between 0 and 1, fraction of the input units to drop. + name: string, block label. + filters_in: integer, the number of input filters. + filters_out: integer, the number of output filters. + kernel_size: integer, the dimension of the convolution window. + strides: integer, the stride of the convolution. + expand_ratio: integer, scaling coefficient for the input filters. + se_ratio: float between 0 and 1, fraction to squeeze the input filters. + id_skip: boolean. + + Returns: + output tensor for the block. 
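A small readability win in the block loop above: the old `'block{}{}_'.format(i + 1, chr(j + 97))` becomes an f-string. For reference, the layer-name prefixes that convention produces (standalone sketch):

for i in range(2):  # first two stages
    for j in range(3):  # first three repeats within a stage
        print(f"block{i + 1}{chr(j + 97)}_")  # 97 == ord("a")
# block1a_ block1b_ block1c_ block2a_ block2b_ block2c_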
+ """ + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + # Expansion phase + filters = filters_in * expand_ratio + if expand_ratio != 1: + x = layers.Conv2D( + filters, + 1, + padding="same", + use_bias=False, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + "expand_conv", + )(inputs) + x = layers.BatchNormalization(axis=bn_axis, name=name + "expand_bn")(x) + x = layers.Activation(activation, name=name + "expand_activation")(x) + else: + x = inputs + + # Depthwise Convolution + if strides == 2: + x = layers.ZeroPadding2D( + padding=imagenet_utils.correct_pad(x, kernel_size), + name=name + "dwconv_pad", + )(x) + conv_pad = "valid" + else: + conv_pad = "same" + x = layers.DepthwiseConv2D( + kernel_size, + strides=strides, + padding=conv_pad, + use_bias=False, + depthwise_initializer=CONV_KERNEL_INITIALIZER, + name=name + "dwconv", + )(x) + x = layers.BatchNormalization(axis=bn_axis, name=name + "bn")(x) + x = layers.Activation(activation, name=name + "activation")(x) + + # Squeeze and Excitation phase + if 0 < se_ratio <= 1: + filters_se = max(1, int(filters_in * se_ratio)) + se = layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x) + if bn_axis == 1: + se_shape = (filters, 1, 1) + else: + se_shape = (1, 1, filters) + se = layers.Reshape(se_shape, name=name + "se_reshape")(se) + se = layers.Conv2D( + filters_se, + 1, + padding="same", + activation=activation, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + "se_reduce", + )(se) + se = layers.Conv2D( + filters, + 1, + padding="same", + activation="sigmoid", + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + "se_expand", + )(se) + x = layers.multiply([x, se], name=name + "se_excite") + + # Output phase + x = layers.Conv2D( + filters_out, 1, - padding='same', - activation='sigmoid', + padding="same", + use_bias=False, kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name + 'se_expand')(se) - x = layers.multiply([x, se], name=name + 'se_excite') - - # Output phase - x = layers.Conv2D( - filters_out, - 1, - padding='same', - use_bias=False, - kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name + 'project_conv')(x) - x = layers.BatchNormalization(axis=bn_axis, name=name + 'project_bn')(x) - if id_skip and strides == 1 and filters_in == filters_out: - if drop_rate > 0: - x = layers.Dropout( - drop_rate, noise_shape=(None, 1, 1, 1), name=name + 'drop')(x) - x = layers.add([x, inputs], name=name + 'add') - return x - - -@keras_export('keras.applications.efficientnet.EfficientNetB0', - 'keras.applications.EfficientNetB0') -def EfficientNetB0(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 1.0, - 1.0, - 224, - 0.2, - model_name='efficientnetb0', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - **kwargs) - - -@keras_export('keras.applications.efficientnet.EfficientNetB1', - 'keras.applications.EfficientNetB1') -def EfficientNetB1(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 1.0, - 1.1, - 240, - 0.2, - model_name='efficientnetb1', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - 
classifier_activation=classifier_activation, - **kwargs) - - -@keras_export('keras.applications.efficientnet.EfficientNetB2', - 'keras.applications.EfficientNetB2') -def EfficientNetB2(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 1.1, - 1.2, - 260, - 0.3, - model_name='efficientnetb2', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - **kwargs) - - -@keras_export('keras.applications.efficientnet.EfficientNetB3', - 'keras.applications.EfficientNetB3') -def EfficientNetB3(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 1.2, - 1.4, - 300, - 0.3, - model_name='efficientnetb3', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - **kwargs) - - -@keras_export('keras.applications.efficientnet.EfficientNetB4', - 'keras.applications.EfficientNetB4') -def EfficientNetB4(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 1.4, - 1.8, - 380, - 0.4, - model_name='efficientnetb4', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - **kwargs) - - -@keras_export('keras.applications.efficientnet.EfficientNetB5', - 'keras.applications.EfficientNetB5') -def EfficientNetB5(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 1.6, - 2.2, - 456, - 0.4, - model_name='efficientnetb5', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - **kwargs) - - -@keras_export('keras.applications.efficientnet.EfficientNetB6', - 'keras.applications.EfficientNetB6') -def EfficientNetB6(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 1.8, - 2.6, - 528, - 0.5, - model_name='efficientnetb6', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - **kwargs) - - -@keras_export('keras.applications.efficientnet.EfficientNetB7', - 'keras.applications.EfficientNetB7') -def EfficientNetB7(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - return EfficientNet( - 2.0, - 3.1, - 600, - 0.5, - model_name='efficientnetb7', - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - **kwargs) - - -EfficientNetB0.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB0') 
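The eight B0-B7 wrappers being reflowed here differ only in four scalars each. Collected into one table (values copied from the wrapper calls in this diff), the compound-scaling progression is easier to scan:

# (width_coefficient, depth_coefficient, default_size, dropout_rate)
VARIANTS = {
    "b0": (1.0, 1.0, 224, 0.2),
    "b1": (1.0, 1.1, 240, 0.2),
    "b2": (1.1, 1.2, 260, 0.3),
    "b3": (1.2, 1.4, 300, 0.3),
    "b4": (1.4, 1.8, 380, 0.4),
    "b5": (1.6, 2.2, 456, 0.4),
    "b6": (1.8, 2.6, 528, 0.5),
    "b7": (2.0, 3.1, 600, 0.5),
}
for name, (w, d, size, drop) in VARIANTS.items():
    print(f"{name}: width x{w}, depth x{d}, {size}px, dropout {drop}")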
-EfficientNetB1.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB1') -EfficientNetB2.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB2') -EfficientNetB3.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB3') -EfficientNetB4.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB4') -EfficientNetB5.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB5') -EfficientNetB6.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB6') -EfficientNetB7.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB7') - - -@keras_export('keras.applications.efficientnet.preprocess_input') -def preprocess_input(x, data_format=None): # pylint: disable=unused-argument - """A placeholder method for backward compatibility. - - The preprocessing logic has been included in the efficientnet model - implementation. Users are no longer required to call this method to normalize - the input data. This method does nothing and only kept as a placeholder to - align the API surface between old and new version of model. - - Args: - x: A floating point `numpy.array` or a `tf.Tensor`. - data_format: Optional data format of the image tensor/array. Defaults to - None, in which case the global setting - `tf.keras.backend.image_data_format()` is used (unless you changed it, - it defaults to "channels_last").{mode} - - Returns: - Unchanged `numpy.array` or `tf.Tensor`. - """ - return x - - -@keras_export('keras.applications.efficientnet.decode_predictions') + name=name + "project_conv", + )(x) + x = layers.BatchNormalization(axis=bn_axis, name=name + "project_bn")(x) + if id_skip and strides == 1 and filters_in == filters_out: + if drop_rate > 0: + x = layers.Dropout( + drop_rate, noise_shape=(None, 1, 1, 1), name=name + "drop" + )(x) + x = layers.add([x, inputs], name=name + "add") + return x + + +@keras_export( + "keras.applications.efficientnet.EfficientNetB0", + "keras.applications.EfficientNetB0", +) +def EfficientNetB0( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 1.0, + 1.0, + 224, + 0.2, + model_name="efficientnetb0", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +@keras_export( + "keras.applications.efficientnet.EfficientNetB1", + "keras.applications.EfficientNetB1", +) +def EfficientNetB1( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 1.0, + 1.1, + 240, + 0.2, + model_name="efficientnetb1", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +@keras_export( + "keras.applications.efficientnet.EfficientNetB2", + "keras.applications.EfficientNetB2", +) +def EfficientNetB2( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 1.1, + 1.2, + 260, + 0.3, + model_name="efficientnetb2", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +@keras_export( + 
"keras.applications.efficientnet.EfficientNetB3", + "keras.applications.EfficientNetB3", +) +def EfficientNetB3( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 1.2, + 1.4, + 300, + 0.3, + model_name="efficientnetb3", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +@keras_export( + "keras.applications.efficientnet.EfficientNetB4", + "keras.applications.EfficientNetB4", +) +def EfficientNetB4( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 1.4, + 1.8, + 380, + 0.4, + model_name="efficientnetb4", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +@keras_export( + "keras.applications.efficientnet.EfficientNetB5", + "keras.applications.EfficientNetB5", +) +def EfficientNetB5( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 1.6, + 2.2, + 456, + 0.4, + model_name="efficientnetb5", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +@keras_export( + "keras.applications.efficientnet.EfficientNetB6", + "keras.applications.EfficientNetB6", +) +def EfficientNetB6( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 1.8, + 2.6, + 528, + 0.5, + model_name="efficientnetb6", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +@keras_export( + "keras.applications.efficientnet.EfficientNetB7", + "keras.applications.EfficientNetB7", +) +def EfficientNetB7( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + return EfficientNet( + 2.0, + 3.1, + 600, + 0.5, + model_name="efficientnetb7", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + **kwargs, + ) + + +EfficientNetB0.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB0") +EfficientNetB1.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB1") +EfficientNetB2.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB2") +EfficientNetB3.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB3") +EfficientNetB4.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB4") +EfficientNetB5.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB5") +EfficientNetB6.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB6") +EfficientNetB7.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB7") + + +@keras_export("keras.applications.efficientnet.preprocess_input") 
+def preprocess_input(x, data_format=None): + """A placeholder method for backward compatibility. + + The preprocessing logic has been included in the efficientnet model + implementation. Users are no longer required to call this method to + normalize the input data. This method does nothing and only kept as a + placeholder to align the API surface between old and new version of model. + + Args: + x: A floating point `numpy.array` or a `tf.Tensor`. + data_format: Optional data format of the image tensor/array. `None` means + the global setting `tf.keras.backend.image_data_format()` is used + (unless you changed it, it uses "channels_last"). + Defaults to `None`. + + Returns: + Unchanged `numpy.array` or `tf.Tensor`. + """ + return x + + +@keras_export("keras.applications.efficientnet.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/efficientnet_v2.py b/keras/applications/efficientnet_v2.py index 783d6a848b9f..2d309e757568 100644 --- a/keras/applications/efficientnet_v2.py +++ b/keras/applications/efficientnet_v2.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=missing-docstring + + """EfficientNet V2 models for Keras. Reference: @@ -24,91 +24,114 @@ import copy import math +import tensorflow.compat.v2 as tf + from keras import backend from keras import layers from keras.applications import imagenet_utils from keras.engine import training from keras.utils import data_utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHTS_PATH = "https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/" +BASE_WEIGHTS_PATH = "https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/" # noqa: E501 WEIGHTS_HASHES = { - "b0": ("21ecbf6da12460d5c40bb2f29ceb2188", - "893217f2bb855e2983157299931e43ff"), - "b1": ("069f0534ff22adf035c89e2d9547a9dc", - "0e80663031ca32d657f9caa404b6ec37"), - "b2": ("424e49f28180edbde1e94797771950a7", - "1dfe2e7a5d45b6632553a8961ea609eb"), - "b3": ("1f1fc43bd98a6e4fd8fdfd551e02c7a0", - "f6abf7b5849ac99a89b50dd3fd532856"), - "-s": ("e1d88a8495beba45748fedd0cecbe016", - "af0682fb74e8c54910f2d4393339c070"), - "-m": ("a3bf6aa3276309f4fc6a34aa114c95cd", - "1b8dc055df72dde80d614482840fe342"), - "-l": ("27e6d408b53c7ebc868fefa357689935", - "b0b66b5c863aef5b46e8608fe1711615"), + "b0": ( + "21ecbf6da12460d5c40bb2f29ceb2188", + "893217f2bb855e2983157299931e43ff", + ), + "b1": ( + "069f0534ff22adf035c89e2d9547a9dc", + "0e80663031ca32d657f9caa404b6ec37", + ), + "b2": ( + "424e49f28180edbde1e94797771950a7", + "1dfe2e7a5d45b6632553a8961ea609eb", + ), + "b3": ( + "1f1fc43bd98a6e4fd8fdfd551e02c7a0", + "f6abf7b5849ac99a89b50dd3fd532856", + ), + "-s": ( + "e1d88a8495beba45748fedd0cecbe016", + "af0682fb74e8c54910f2d4393339c070", + ), + "-m": ( + "a3bf6aa3276309f4fc6a34aa114c95cd", + "1b8dc055df72dde80d614482840fe342", + ), + "-l": ( + "27e6d408b53c7ebc868fefa357689935", + "b0b66b5c863aef5b46e8608fe1711615", + ), } DEFAULT_BLOCKS_ARGS = { - "efficientnetv2-s": [{ - "kernel_size": 3, - "num_repeat": 2, - "input_filters": 24, - 
"output_filters": 24, - "expand_ratio": 1, - "se_ratio": 0.0, - "strides": 1, - "conv_type": 1, - }, { - "kernel_size": 3, - "num_repeat": 4, - "input_filters": 24, - "output_filters": 48, - "expand_ratio": 4, - "se_ratio": 0.0, - "strides": 2, - "conv_type": 1, - }, { - "conv_type": 1, - "expand_ratio": 4, - "input_filters": 48, - "kernel_size": 3, - "num_repeat": 4, - "output_filters": 64, - "se_ratio": 0, - "strides": 2, - }, { - "conv_type": 0, - "expand_ratio": 4, - "input_filters": 64, - "kernel_size": 3, - "num_repeat": 6, - "output_filters": 128, - "se_ratio": 0.25, - "strides": 2, - }, { - "conv_type": 0, - "expand_ratio": 6, - "input_filters": 128, - "kernel_size": 3, - "num_repeat": 9, - "output_filters": 160, - "se_ratio": 0.25, - "strides": 1, - }, { - "conv_type": 0, - "expand_ratio": 6, - "input_filters": 160, - "kernel_size": 3, - "num_repeat": 15, - "output_filters": 256, - "se_ratio": 0.25, - "strides": 2, - }], + "efficientnetv2-s": [ + { + "kernel_size": 3, + "num_repeat": 2, + "input_filters": 24, + "output_filters": 24, + "expand_ratio": 1, + "se_ratio": 0.0, + "strides": 1, + "conv_type": 1, + }, + { + "kernel_size": 3, + "num_repeat": 4, + "input_filters": 24, + "output_filters": 48, + "expand_ratio": 4, + "se_ratio": 0.0, + "strides": 2, + "conv_type": 1, + }, + { + "conv_type": 1, + "expand_ratio": 4, + "input_filters": 48, + "kernel_size": 3, + "num_repeat": 4, + "output_filters": 64, + "se_ratio": 0, + "strides": 2, + }, + { + "conv_type": 0, + "expand_ratio": 4, + "input_filters": 64, + "kernel_size": 3, + "num_repeat": 6, + "output_filters": 128, + "se_ratio": 0.25, + "strides": 2, + }, + { + "conv_type": 0, + "expand_ratio": 6, + "input_filters": 128, + "kernel_size": 3, + "num_repeat": 9, + "output_filters": 160, + "se_ratio": 0.25, + "strides": 1, + }, + { + "conv_type": 0, + "expand_ratio": 6, + "input_filters": 160, + "kernel_size": 3, + "num_repeat": 15, + "output_filters": 256, + "se_ratio": 0.25, + "strides": 2, + }, + ], "efficientnetv2-m": [ { "kernel_size": 3, @@ -508,17 +531,17 @@ "config": { "scale": 2.0, "mode": "fan_out", - "distribution": "truncated_normal" - } + "distribution": "truncated_normal", + }, } DENSE_KERNEL_INITIALIZER = { "class_name": "VarianceScaling", "config": { - "scale": 1. / 3., + "scale": 1.0 / 3.0, "mode": "fan_out", - "distribution": "uniform" - } + "distribution": "uniform", + }, } BASE_DOCSTRING = """Instantiates the {name} architecture. @@ -539,19 +562,19 @@ https://keras.io/guides/transfer_learning/). Note: each Keras Application expects a specific kind of input preprocessing. - For EfficientNetV2, by default input preprocessing is included as a part of the - model (as a `Rescaling` layer), and thus + For EfficientNetV2, by default input preprocessing is included as a part of + the model (as a `Rescaling` layer), and thus `tf.keras.applications.efficientnet_v2.preprocess_input` is actually a - pass-through function. In this use case, EfficientNetV2 models expect their inputs - to be float tensors of pixels with values in the [0-255] range. + pass-through function. In this use case, EfficientNetV2 models expect their + inputs to be float tensors of pixels with values in the [0-255] range. At the same time, preprocessing as a part of the model (i.e. `Rescaling` layer) can be disabled by setting `include_preprocessing` argument to False. - With preprocessing disabled EfficientNetV2 models expect their inputs to be float - tensors of pixels with values in the [-1, 1] range. 
+ With preprocessing disabled EfficientNetV2 models expect their inputs to be + float tensors of pixels with values in the [-1, 1] range. Args: include_top: Boolean, whether to include the fully-connected - layer at the top of the network. Defaults to True. + layer at the top of the network. Defaults to `True`. weights: One of `None` (random initialization), `"imagenet"` (pre-training on ImageNet), or the path to the weights file to be loaded. Defaults to `"imagenet"`. @@ -562,7 +585,7 @@ if `include_top` is False. It should have exactly 3 input channels. pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. Defaults to None. + when `include_top` is `False`. - `None` means that the output of the model will be the 4D tensor output of the last convolutional layer. @@ -572,16 +595,17 @@ the output of the model will be a 2D tensor. - `"max"` means that global max pooling will be applied. + Defaults to `None`. classes: Optional number of classes to classify images into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. Defaults to 1000 (number of - ImageNet classes). + if no `weights` argument is specified. Defaults to `1000` + (the number of ImageNet classes). classifier_activation: A string or callable. The activation function to use on the `"top"` layer. Ignored unless `include_top=True`. Set `classifier_activation=None` to return the logits of the "top" layer. - Defaults to `"softmax"`. When loading pretrained weights, `classifier_activation` can only be `None` or `"softmax"`. + Defaults to `"softmax"`. Returns: A `keras.Model` instance. @@ -589,19 +613,19 @@ def round_filters(filters, width_coefficient, min_depth, depth_divisor): - """Round number of filters based on depth multiplier.""" - filters *= width_coefficient - minimum_depth = min_depth or depth_divisor - new_filters = max( - minimum_depth, - int(filters + depth_divisor / 2) // depth_divisor * depth_divisor, - ) - return int(new_filters) + """Round number of filters based on depth multiplier.""" + filters *= width_coefficient + minimum_depth = min_depth or depth_divisor + new_filters = max( + minimum_depth, + int(filters + depth_divisor / 2) // depth_divisor * depth_divisor, + ) + return int(new_filters) def round_repeats(repeats, depth_coefficient): - """Round number of repeats based on depth multiplier.""" - return int(math.ceil(depth_coefficient * repeats)) + """Round number of repeats based on depth multiplier.""" + return int(math.ceil(depth_coefficient * repeats)) def MBConvBlock( @@ -616,103 +640,108 @@ def MBConvBlock( survival_probability: float = 0.8, name=None, ): - """MBConv block: Mobile Inverted Residual Bottleneck.""" - bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - - if name is None: - name = backend.get_uid("block0") - - def apply(inputs): - # Expansion phase - filters = input_filters * expand_ratio - if expand_ratio != 1: - x = layers.Conv2D( - filters=filters, - kernel_size=1, - strides=1, - kernel_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - data_format="channels_last", - use_bias=False, - name=name + "expand_conv", - )(inputs) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - name=name + "expand_bn", - )(x) - x = layers.Activation(activation, name=name + "expand_activation")(x) - else: - x = inputs + """MBConv block: Mobile Inverted Residual Bottleneck.""" + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + if name is None: + name =
backend.get_uid("block0") + + def apply(inputs): + # Expansion phase + filters = input_filters * expand_ratio + if expand_ratio != 1: + x = layers.Conv2D( + filters=filters, + kernel_size=1, + strides=1, + kernel_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + data_format="channels_last", + use_bias=False, + name=name + "expand_conv", + )(inputs) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + name=name + "expand_bn", + )(x) + x = layers.Activation(activation, name=name + "expand_activation")( + x + ) + else: + x = inputs + + # Depthwise conv + x = layers.DepthwiseConv2D( + kernel_size=kernel_size, + strides=strides, + depthwise_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + data_format="channels_last", + use_bias=False, + name=name + "dwconv2", + )(x) + x = layers.BatchNormalization( + axis=bn_axis, momentum=bn_momentum, name=name + "bn" + )(x) + x = layers.Activation(activation, name=name + "activation")(x) + + # Squeeze and excite + if 0 < se_ratio <= 1: + filters_se = max(1, int(input_filters * se_ratio)) + se = layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x) + if bn_axis == 1: + se_shape = (filters, 1, 1) + else: + se_shape = (1, 1, filters) + se = layers.Reshape(se_shape, name=name + "se_reshape")(se) + + se = layers.Conv2D( + filters_se, + 1, + padding="same", + activation=activation, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + "se_reduce", + )(se) + se = layers.Conv2D( + filters, + 1, + padding="same", + activation="sigmoid", + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + "se_expand", + )(se) + + x = layers.multiply([x, se], name=name + "se_excite") + + # Output phase + x = layers.Conv2D( + filters=output_filters, + kernel_size=1, + strides=1, + kernel_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + data_format="channels_last", + use_bias=False, + name=name + "project_conv", + )(x) + x = layers.BatchNormalization( + axis=bn_axis, momentum=bn_momentum, name=name + "project_bn" + )(x) - # Depthwise conv - x = layers.DepthwiseConv2D( - kernel_size=kernel_size, - strides=strides, - depthwise_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - data_format="channels_last", - use_bias=False, - name=name + "dwconv2", - )(x) - x = layers.BatchNormalization( - axis=bn_axis, momentum=bn_momentum, name=name + "bn")(x) - x = layers.Activation(activation, name=name + "activation")(x) - - # Squeeze and excite - if 0 < se_ratio <= 1: - filters_se = max(1, int(input_filters * se_ratio)) - se = layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x) - if bn_axis == 1: - se_shape = (filters, 1, 1) - else: - se_shape = (1, 1, filters) - se = layers.Reshape(se_shape, name=name + "se_reshape")(se) - - se = layers.Conv2D( - filters_se, - 1, - padding="same", - activation=activation, - kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name + "se_reduce", - )(se) - se = layers.Conv2D( - filters, - 1, - padding="same", - activation="sigmoid", - kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name + "se_expand", - )(se) - - x = layers.multiply([x, se], name=name + "se_excite") - - # Output phase - x = layers.Conv2D( - filters=output_filters, - kernel_size=1, - strides=1, - kernel_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - data_format="channels_last", - use_bias=False, - name=name + "project_conv", - )(x) - x = layers.BatchNormalization( - axis=bn_axis, momentum=bn_momentum, name=name + "project_bn")(x) - - if strides == 1 and input_filters == output_filters: - if survival_probability: - x 
= layers.Dropout( - survival_probability, - noise_shape=(None, 1, 1, 1), - name=name + "drop", - )(x) - x = layers.add([x, inputs], name=name + "add") - return x + if strides == 1 and input_filters == output_filters: + if survival_probability: + x = layers.Dropout( + survival_probability, + noise_shape=(None, 1, 1, 1), + name=name + "drop", + )(x) + x = layers.add([x, inputs], name=name + "add") + + return x - return apply + return apply def FusedMBConvBlock( @@ -727,90 +756,95 @@ def FusedMBConvBlock( survival_probability: float = 0.8, name=None, ): - """Fused MBConv Block: Fusing the proj conv1x1 and depthwise_conv into a conv2d.""" - bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - - if name is None: - name = backend.get_uid("block0") - - def apply(inputs): - filters = input_filters * expand_ratio - if expand_ratio != 1: - x = layers.Conv2D( - filters, - kernel_size=kernel_size, - strides=strides, - kernel_initializer=CONV_KERNEL_INITIALIZER, - data_format="channels_last", - padding="same", - use_bias=False, - name=name + "expand_conv", - )(inputs) - x = layers.BatchNormalization( - axis=bn_axis, momentum=bn_momentum, name=name + "expand_bn")(x) - x = layers.Activation( - activation=activation, name=name + "expand_activation")(x) - else: - x = inputs - - # Squeeze and excite - if 0 < se_ratio <= 1: - filters_se = max(1, int(input_filters * se_ratio)) - se = layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x) - if bn_axis == 1: - se_shape = (filters, 1, 1) - else: - se_shape = (1, 1, filters) - - se = layers.Reshape(se_shape, name=name + "se_reshape")(se) - - se = layers.Conv2D( - filters_se, - 1, - padding="same", - activation=activation, - kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name + "se_reduce", - )(se) - se = layers.Conv2D( - filters, - 1, - padding="same", - activation="sigmoid", - kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name + "se_expand", - )(se) - - x = layers.multiply([x, se], name=name + "se_excite") - - # Output phase: - x = layers.Conv2D( - output_filters, - kernel_size=1 if expand_ratio != 1 else kernel_size, - strides=1 if expand_ratio != 1 else strides, - kernel_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - use_bias=False, - name=name + "project_conv", - )(x) - x = layers.BatchNormalization( - axis=bn_axis, momentum=bn_momentum, name=name + "project_bn")(x) - if expand_ratio == 1: - x = layers.Activation( - activation=activation, name=name + "project_activation")(x) - - # Residual: - if strides == 1 and input_filters == output_filters: - if survival_probability: - x = layers.Dropout( - survival_probability, - noise_shape=(None, 1, 1, 1), - name=name + "drop", + """Fused MBConv Block: Fusing the proj conv1x1 and depthwise_conv into a + conv2d.""" + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + if name is None: + name = backend.get_uid("block0") + + def apply(inputs): + filters = input_filters * expand_ratio + if expand_ratio != 1: + x = layers.Conv2D( + filters, + kernel_size=kernel_size, + strides=strides, + kernel_initializer=CONV_KERNEL_INITIALIZER, + data_format="channels_last", + padding="same", + use_bias=False, + name=name + "expand_conv", + )(inputs) + x = layers.BatchNormalization( + axis=bn_axis, momentum=bn_momentum, name=name + "expand_bn" + )(x) + x = layers.Activation( + activation=activation, name=name + "expand_activation" + )(x) + else: + x = inputs + + # Squeeze and excite + if 0 < se_ratio <= 1: + filters_se = max(1, int(input_filters * se_ratio)) + se = 
layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x) + if bn_axis == 1: + se_shape = (filters, 1, 1) + else: + se_shape = (1, 1, filters) + + se = layers.Reshape(se_shape, name=name + "se_reshape")(se) + + se = layers.Conv2D( + filters_se, + 1, + padding="same", + activation=activation, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + "se_reduce", + )(se) + se = layers.Conv2D( + filters, + 1, + padding="same", + activation="sigmoid", + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + "se_expand", + )(se) + + x = layers.multiply([x, se], name=name + "se_excite") + + # Output phase: + x = layers.Conv2D( + output_filters, + kernel_size=1 if expand_ratio != 1 else kernel_size, + strides=1 if expand_ratio != 1 else strides, + kernel_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + use_bias=False, + name=name + "project_conv", )(x) - x = layers.add([x, inputs], name=name + "add") - return x + x = layers.BatchNormalization( + axis=bn_axis, momentum=bn_momentum, name=name + "project_bn" + )(x) + if expand_ratio == 1: + x = layers.Activation( + activation=activation, name=name + "project_activation" + )(x) + + # Residual: + if strides == 1 and input_filters == output_filters: + if survival_probability: + x = layers.Dropout( + survival_probability, + noise_shape=(None, 1, 1, 1), + name=name + "drop", + )(x) + x = layers.add([x, inputs], name=name + "add") + return x - return apply + return apply def EfficientNetV2( @@ -834,238 +868,255 @@ def EfficientNetV2( classifier_activation="softmax", include_preprocessing=True, ): - """Instantiates the EfficientNetV2 architecture using given scaling coefficients. - - Args: - width_coefficient: float, scaling coefficient for network width. - depth_coefficient: float, scaling coefficient for network depth. - default_size: integer, default input image size. - dropout_rate: float, dropout rate before final classifier layer. - drop_connect_rate: float, dropout rate at skip connections. - depth_divisor: integer, a unit of network width. - min_depth: integer, minimum number of filters. - bn_momentum: float. Momentum parameter for Batch Normalization layers. - activation: activation function. - blocks_args: list of dicts, parameters to construct block modules. - model_name: string, model name. - include_top: whether to include the fully-connected layer at the top of the - network. - weights: one of `None` (random initialization), `"imagenet"` (pre-training - on ImageNet), or the path to the weights file to be loaded. - input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) or - numpy array to use as image input for the model. - input_shape: optional shape tuple, only to be specified if `include_top` is - False. It should have exactly 3 inputs channels. - pooling: optional pooling mode for feature extraction when `include_top` is - `False`. - `None` means that the output of the model will be the 4D tensor - output of the last convolutional layer. - "avg" means that global average - pooling will be applied to the output of the last convolutional layer, and - thus the output of the model will be a 2D tensor. - `"max"` means that - global max pooling will be applied. - classes: optional number of classes to classify images into, only to be - specified if `include_top` is True, and if no `weights` argument is - specified. - classifier_activation: A string or callable. The activation function to use - on the `"top"` layer. Ignored unless `include_top=True`. 
Set - `classifier_activation=None` to return the logits of the `"top"` layer. - include_preprocessing: Boolean, whether to include the preprocessing layer - (`Rescaling`) at the bottom of the network. Defaults to `True`. - - Returns: - A `keras.Model` instance. - - Raises: - ValueError: in case of invalid argument for `weights`, - or invalid input shape. - ValueError: if `classifier_activation` is not `"softmax"` or `None` when - using a pretrained top layer. - """ - - if blocks_args == "default": - blocks_args = DEFAULT_BLOCKS_ARGS[model_name] - - if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): - raise ValueError("The `weights` argument should be either " - "`None` (random initialization), `imagenet` " - "(pre-training on ImageNet), " - "or the path to the weights file to be loaded." - f"Received: weights={weights}") - - if weights == "imagenet" and include_top and classes != 1000: - raise ValueError("If using `weights` as `'imagenet'` with `include_top`" - " as true, `classes` should be 1000" - f"Received: classes={classes}") - - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=default_size, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - - x = img_input - - if include_preprocessing: - # Apply original V1 preprocessing for Bx variants - # if number of channels allows it - num_channels = input_shape[bn_axis - 1] - if model_name.split("-")[-1].startswith("b") and num_channels == 3: - x = layers.Rescaling(scale=1. / 255)(x) - x = layers.Normalization( - mean=[0.485, 0.456, 0.406], - variance=[0.229**2, 0.224**2, 0.225**2], - axis=bn_axis, - )(x) + """Instantiates the EfficientNetV2 architecture using given scaling + coefficients. + + Args: + width_coefficient: float, scaling coefficient for network width. + depth_coefficient: float, scaling coefficient for network depth. + default_size: integer, default input image size. + dropout_rate: float, dropout rate before final classifier layer. + drop_connect_rate: float, dropout rate at skip connections. + depth_divisor: integer, a unit of network width. + min_depth: integer, minimum number of filters. + bn_momentum: float. Momentum parameter for Batch Normalization layers. + activation: activation function. + blocks_args: list of dicts, parameters to construct block modules. + model_name: string, model name. + include_top: whether to include the fully-connected layer at the top of + the network. + weights: one of `None` (random initialization), `"imagenet"` (pre-training + on ImageNet), or the path to the weights file to be loaded. + input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) or + numpy array to use as image input for the model. + input_shape: optional shape tuple, only to be specified if `include_top` + is False. It should have exactly 3 input channels. + pooling: optional pooling mode for feature extraction when `include_top` + is `False`. + - `None` means that the output of the model will be the 4D tensor output + of the last convolutional layer.
+ - "avg" means that global average pooling will be applied to the output + of the last convolutional layer, and thus the output of the model will + be a 2D tensor. + - `"max"` means that global max pooling will be applied. + classes: optional number of classes to classify images into, only to be + specified if `include_top` is True, and if no `weights` argument is + specified. + classifier_activation: A string or callable. The activation function to + use on the `"top"` layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the `"top"` layer. + include_preprocessing: Boolean, whether to include the preprocessing layer + (`Rescaling`) at the bottom of the network. Defaults to `True`. + + Returns: + A `keras.Model` instance. + + Raises: + ValueError: in case of invalid argument for `weights`, + or invalid input shape. + ValueError: if `classifier_activation` is not `"softmax"` or `None` when + using a pretrained top layer. + """ + + if blocks_args == "default": + blocks_args = DEFAULT_BLOCKS_ARGS[model_name] + + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." + f"Received: weights={weights}" + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + "If using `weights` as `'imagenet'` with `include_top`" + " as true, `classes` should be 1000" + f"Received: classes={classes}" + ) + + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=default_size, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - x = layers.Rescaling(scale=1. / 128.0, offset=-1)(x) - - # Build stem - stem_filters = round_filters( - filters=blocks_args[0]["input_filters"], - width_coefficient=width_coefficient, - min_depth=min_depth, - depth_divisor=depth_divisor, - ) - x = layers.Conv2D( - filters=stem_filters, - kernel_size=3, - strides=2, - kernel_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - use_bias=False, - name="stem_conv", - )(x) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - name="stem_bn", - )(x) - x = layers.Activation(activation, name="stem_activation")(x) - - # Build blocks - blocks_args = copy.deepcopy(blocks_args) - b = 0 - blocks = float(sum(args["num_repeat"] for args in blocks_args)) - - for (i, args) in enumerate(blocks_args): - assert args["num_repeat"] > 0 - - # Update block input and output filters based on depth multiplier. 
- args["input_filters"] = round_filters( - filters=args["input_filters"], + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + x = img_input + + if include_preprocessing: + # Apply original V1 preprocessing for Bx variants + # if number of channels allows it + num_channels = input_shape[bn_axis - 1] + if model_name.split("-")[-1].startswith("b") and num_channels == 3: + x = layers.Rescaling(scale=1.0 / 255)(x) + x = layers.Normalization( + mean=[0.485, 0.456, 0.406], + variance=[0.229**2, 0.224**2, 0.225**2], + axis=bn_axis, + )(x) + else: + x = layers.Rescaling(scale=1.0 / 128.0, offset=-1)(x) + + # Build stem + stem_filters = round_filters( + filters=blocks_args[0]["input_filters"], width_coefficient=width_coefficient, min_depth=min_depth, - depth_divisor=depth_divisor) - args["output_filters"] = round_filters( - filters=args["output_filters"], + depth_divisor=depth_divisor, + ) + x = layers.Conv2D( + filters=stem_filters, + kernel_size=3, + strides=2, + kernel_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + use_bias=False, + name="stem_conv", + )(x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + name="stem_bn", + )(x) + x = layers.Activation(activation, name="stem_activation")(x) + + # Build blocks + blocks_args = copy.deepcopy(blocks_args) + b = 0 + blocks = float(sum(args["num_repeat"] for args in blocks_args)) + + for i, args in enumerate(blocks_args): + assert args["num_repeat"] > 0 + + # Update block input and output filters based on depth multiplier. + args["input_filters"] = round_filters( + filters=args["input_filters"], + width_coefficient=width_coefficient, + min_depth=min_depth, + depth_divisor=depth_divisor, + ) + args["output_filters"] = round_filters( + filters=args["output_filters"], + width_coefficient=width_coefficient, + min_depth=min_depth, + depth_divisor=depth_divisor, + ) + + # Determine which conv type to use: + block = {0: MBConvBlock, 1: FusedMBConvBlock}[args.pop("conv_type")] + repeats = round_repeats( + repeats=args.pop("num_repeat"), depth_coefficient=depth_coefficient + ) + for j in range(repeats): + # The first block needs to take care of stride and filter size + # increase. + if j > 0: + args["strides"] = 1 + args["input_filters"] = args["output_filters"] + + x = block( + activation=activation, + bn_momentum=bn_momentum, + survival_probability=drop_connect_rate * b / blocks, + name=f"block{i + 1}{chr(j + 97)}_", + **args, + )(x) + b += 1 + + # Build top + top_filters = round_filters( + filters=1280, width_coefficient=width_coefficient, min_depth=min_depth, - depth_divisor=depth_divisor) - - # Determine which conv type to use: - block = {0: MBConvBlock, 1: FusedMBConvBlock}[args.pop("conv_type")] - repeats = round_repeats( - repeats=args.pop("num_repeat"), depth_coefficient=depth_coefficient) - for j in range(repeats): - # The first block needs to take care of stride and filter size increase. 
- if j > 0: - args["strides"] = 1 - args["input_filters"] = args["output_filters"] - - x = block( - activation=activation, - bn_momentum=bn_momentum, - survival_probability=drop_connect_rate * b / blocks, - name="block{}{}_".format(i + 1, chr(j + 97)), - **args, - )(x) - b += 1 - - # Build top - top_filters = round_filters( - filters=1280, - width_coefficient=width_coefficient, - min_depth=min_depth, - depth_divisor=depth_divisor) - x = layers.Conv2D( - filters=top_filters, - kernel_size=1, - strides=1, - kernel_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - data_format="channels_last", - use_bias=False, - name="top_conv", - )(x) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - name="top_bn", - )(x) - x = layers.Activation(activation=activation, name="top_activation")(x) - - if include_top: - x = layers.GlobalAveragePooling2D(name="avg_pool")(x) - if dropout_rate > 0: - x = layers.Dropout(dropout_rate, name="top_dropout")(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense( - classes, - activation=classifier_activation, - kernel_initializer=DENSE_KERNEL_INITIALIZER, - bias_initializer=tf.constant_initializer(0), - name="predictions")(x) - else: - if pooling == "avg": - x = layers.GlobalAveragePooling2D(name="avg_pool")(x) - elif pooling == "max": - x = layers.GlobalMaxPooling2D(name="max_pool")(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = training.Model(inputs, x, name=model_name) - - # Load weights. - if weights == "imagenet": + depth_divisor=depth_divisor, + ) + x = layers.Conv2D( + filters=top_filters, + kernel_size=1, + strides=1, + kernel_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + data_format="channels_last", + use_bias=False, + name="top_conv", + )(x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + name="top_bn", + )(x) + x = layers.Activation(activation=activation, name="top_activation")(x) + if include_top: - file_suffix = ".h5" - file_hash = WEIGHTS_HASHES[model_name[-2:]][0] + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + if dropout_rate > 0: + x = layers.Dropout(dropout_rate, name="top_dropout")(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, + activation=classifier_activation, + kernel_initializer=DENSE_KERNEL_INITIALIZER, + bias_initializer=tf.constant_initializer(0), + name="predictions", + )(x) else: - file_suffix = "_notop.h5" - file_hash = WEIGHTS_HASHES[model_name[-2:]][1] - file_name = model_name + file_suffix - weights_path = data_utils.get_file( - file_name, - BASE_WEIGHTS_PATH + file_name, - cache_subdir="models", - file_hash=file_hash) - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model - - -@keras_export("keras.applications.efficientnet_v2.EfficientNetV2B0", - "keras.applications.EfficientNetV2B0") + if pooling == "avg": + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D(name="max_pool")(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. 
+ model = training.Model(inputs, x, name=model_name) + + # Load weights. + if weights == "imagenet": + if include_top: + file_suffix = ".h5" + file_hash = WEIGHTS_HASHES[model_name[-2:]][0] + else: + file_suffix = "_notop.h5" + file_hash = WEIGHTS_HASHES[model_name[-2:]][1] + file_name = model_name + file_suffix + weights_path = data_utils.get_file( + file_name, + BASE_WEIGHTS_PATH + file_name, + cache_subdir="models", + file_hash=file_hash, + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model + + +@keras_export( + "keras.applications.efficientnet_v2.EfficientNetV2B0", + "keras.applications.EfficientNetV2B0", +) def EfficientNetV2B0( include_top=True, weights="imagenet", @@ -1076,23 +1127,26 @@ def EfficientNetV2B0( classifier_activation="softmax", include_preprocessing=True, ): - return EfficientNetV2( - width_coefficient=1.0, - depth_coefficient=1.0, - default_size=224, - model_name="efficientnetv2-b0", - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - include_preprocessing=include_preprocessing) - - -@keras_export("keras.applications.efficientnet_v2.EfficientNetV2B1", - "keras.applications.EfficientNetV2B1") + return EfficientNetV2( + width_coefficient=1.0, + depth_coefficient=1.0, + default_size=224, + model_name="efficientnetv2-b0", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.efficientnet_v2.EfficientNetV2B1", + "keras.applications.EfficientNetV2B1", +) def EfficientNetV2B1( include_top=True, weights="imagenet", @@ -1103,24 +1157,26 @@ def EfficientNetV2B1( classifier_activation="softmax", include_preprocessing=True, ): - return EfficientNetV2( - width_coefficient=1.0, - depth_coefficient=1.1, - default_size=240, - model_name="efficientnetv2-b1", - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.efficientnet_v2.EfficientNetV2B2", - "keras.applications.EfficientNetV2B2") + return EfficientNetV2( + width_coefficient=1.0, + depth_coefficient=1.1, + default_size=240, + model_name="efficientnetv2-b1", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.efficientnet_v2.EfficientNetV2B2", + "keras.applications.EfficientNetV2B2", +) def EfficientNetV2B2( include_top=True, weights="imagenet", @@ -1131,24 +1187,26 @@ def EfficientNetV2B2( classifier_activation="softmax", include_preprocessing=True, ): - return EfficientNetV2( - width_coefficient=1.1, - depth_coefficient=1.2, - default_size=260, - model_name="efficientnetv2-b2", - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - include_preprocessing=include_preprocessing, - ) - - 
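The EfficientNetV2 B-variant wrappers here differ only in the scaling coefficients and default image size they forward to `EfficientNetV2()`; weights loading, pooling, and the built-in preprocessing are shared. A minimal, illustrative usage sketch (assuming a TensorFlow release that ships these models under `tf.keras.applications`):

import tensorflow as tf

# EfficientNetV2B2 as a frozen feature extractor; 260x260 is the
# default_size wired into the B2 wrapper.
backbone = tf.keras.applications.EfficientNetV2B2(
    include_top=False,  # drop the 1000-way ImageNet classifier head
    weights="imagenet",
    input_shape=(260, 260, 3),
    pooling="avg",  # global average pooling -> 2D feature tensor
)
backbone.trainable = False

# Raw [0, 255] pixels are fine: the Rescaling/Normalization preprocessing
# is part of the graph because include_preprocessing defaults to True.
images = tf.random.uniform((2, 260, 260, 3), maxval=255.0)
features = backbone(images, training=False)
print(features.shape)  # expected (2, 1408) for the B2 width multiplier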
-@keras_export("keras.applications.efficientnet_v2.EfficientNetV2B3", - "keras.applications.EfficientNetV2B3") + return EfficientNetV2( + width_coefficient=1.1, + depth_coefficient=1.2, + default_size=260, + model_name="efficientnetv2-b2", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.efficientnet_v2.EfficientNetV2B3", + "keras.applications.EfficientNetV2B3", +) def EfficientNetV2B3( include_top=True, weights="imagenet", @@ -1159,24 +1217,26 @@ def EfficientNetV2B3( classifier_activation="softmax", include_preprocessing=True, ): - return EfficientNetV2( - width_coefficient=1.2, - depth_coefficient=1.4, - default_size=300, - model_name="efficientnetv2-b3", - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.efficientnet_v2.EfficientNetV2S", - "keras.applications.EfficientNetV2S") + return EfficientNetV2( + width_coefficient=1.2, + depth_coefficient=1.4, + default_size=300, + model_name="efficientnetv2-b3", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.efficientnet_v2.EfficientNetV2S", + "keras.applications.EfficientNetV2S", +) def EfficientNetV2S( include_top=True, weights="imagenet", @@ -1187,24 +1247,26 @@ def EfficientNetV2S( classifier_activation="softmax", include_preprocessing=True, ): - return EfficientNetV2( - width_coefficient=1.0, - depth_coefficient=1.0, - default_size=384, - model_name="efficientnetv2-s", - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.efficientnet_v2.EfficientNetV2M", - "keras.applications.EfficientNetV2M") + return EfficientNetV2( + width_coefficient=1.0, + depth_coefficient=1.0, + default_size=384, + model_name="efficientnetv2-s", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.efficientnet_v2.EfficientNetV2M", + "keras.applications.EfficientNetV2M", +) def EfficientNetV2M( include_top=True, weights="imagenet", @@ -1215,24 +1277,26 @@ def EfficientNetV2M( classifier_activation="softmax", include_preprocessing=True, ): - return EfficientNetV2( - width_coefficient=1.0, - depth_coefficient=1.0, - default_size=480, - model_name="efficientnetv2-m", - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.efficientnet_v2.EfficientNetV2L", - "keras.applications.EfficientNetV2L") + return EfficientNetV2( + 
width_coefficient=1.0, + depth_coefficient=1.0, + default_size=480, + model_name="efficientnetv2-m", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.efficientnet_v2.EfficientNetV2L", + "keras.applications.EfficientNetV2L", +) def EfficientNetV2L( include_top=True, weights="imagenet", @@ -1243,20 +1307,20 @@ def EfficientNetV2L( classifier_activation="softmax", include_preprocessing=True, ): - return EfficientNetV2( - width_coefficient=1.0, - depth_coefficient=1.0, - default_size=480, - model_name="efficientnetv2-l", - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation, - include_preprocessing=include_preprocessing, - ) + return EfficientNetV2( + width_coefficient=1.0, + depth_coefficient=1.0, + default_size=480, + model_name="efficientnetv2-l", + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_preprocessing=include_preprocessing, + ) EfficientNetV2B0.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2B0") @@ -1269,30 +1333,30 @@ def EfficientNetV2L( @keras_export("keras.applications.efficientnet_v2.preprocess_input") -def preprocess_input(x, data_format=None): # pylint: disable=unused-argument - """A placeholder method for backward compatibility. - - The preprocessing logic has been included in the EfficientNetV2 model - implementation. Users are no longer required to call this method to normalize - the input data. This method does nothing and only kept as a placeholder to - align the API surface between old and new version of model. - - Args: - x: A floating point `numpy.array` or a `tf.Tensor`. - data_format: Optional data format of the image tensor/array. Defaults to - None, in which case the global setting - `tf.keras.backend.image_data_format()` is used (unless you changed it, it - defaults to "channels_last").{mode} - - Returns: - Unchanged `numpy.array` or `tf.Tensor`. - """ - return x +def preprocess_input(x, data_format=None): + """A placeholder method for backward compatibility. + + The preprocessing logic has been included in the EfficientNetV2 model + implementation. Users are no longer required to call this method to + normalize the input data. This method does nothing and is only kept as + a placeholder to align the API surface between the old and new versions + of the model. + + Args: + x: A floating point `numpy.array` or a `tf.Tensor`. + data_format: Optional data format of the image tensor/array. `None` means + the global setting `tf.keras.backend.image_data_format()` is used + (unless you changed it, it uses "channels_last"). + Defaults to `None`. + + Returns: + Unchanged `numpy.array` or `tf.Tensor`.
+ """ + return x @keras_export("keras.applications.efficientnet_v2.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/efficientnet_weight_update_util.py b/keras/applications/efficientnet_weight_update_util.py deleted file mode 100644 index cc86cb02bbd1..000000000000 --- a/keras/applications/efficientnet_weight_update_util.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -r"""Utils for EfficientNet models for Keras. - -Write weights from ckpt file as in original repo -(https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) -to h5 file for keras implementation of the models. - -Usage: - -# use checkpoint efficientnet-b0/model.ckpt (can be downloaded from -# https://storage.googleapis.com/cloud-tpu-checkpoints/ -# efficientnet/ckptsaug/efficientnet-b0.tar.gz) -# to update weight without top layers, saving to efficientnetb0_notop.h5 -python efficientnet_weight_update_util.py --model b0 --notop \ - --ckpt efficientnet-b0/model.ckpt --o efficientnetb0_notop.h5 - -# use checkpoint noisy_student_efficientnet-b3/model.ckpt (providing -# improved result for b3, can be downloaded from -# https://storage.googleapis.com/cloud-tpu-checkpoints/ -# efficientnet/noisystudent/noisy_student_efficientnet-b3.tar.gz) -# to update weight with top layers, saving to efficientnetb3_new.h5 -python efficientnet_weight_update_util.py --model b3 --notop \ - --ckpt noisy_student_efficientnet-b3/model.ckpt --o efficientnetb3_new.h5 -""" - -import argparse -import warnings - -from keras.utils import io_utils -import tensorflow.compat.v2 as tf -from tensorflow.keras.applications import efficientnet - - -def write_ckpt_to_h5(path_h5, path_ckpt, keras_model, use_ema=True): - """Map the weights in checkpoint file (tf) to h5 file (keras). - - Args: - path_h5: str, path to output hdf5 file to write weights loaded from ckpt - files. - path_ckpt: str, path to the ckpt files (e.g. 'efficientnet-b0/model.ckpt') - that records efficientnet weights from original repo - https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet - keras_model: keras model, built from keras.applications efficientnet - functions (e.g. 
EfficientNetB0) - use_ema: Bool, whether to use ExponentialMovingAverage result or not - """ - model_name_keras = keras_model.name - model_name_tf = model_name_keras.replace('efficientnet', 'efficientnet-') - - keras_weight_names = [w.name for w in keras_model.weights] - tf_weight_names = get_variable_names_from_ckpt(path_ckpt) - - keras_blocks = get_keras_blocks(keras_weight_names) - tf_blocks = get_tf_blocks(tf_weight_names) - - io_utils.print_msg('check variables match in each block') - for keras_block, tf_block in zip(keras_blocks, tf_blocks): - check_match(keras_block, tf_block, keras_weight_names, tf_weight_names, - model_name_tf) - io_utils.print_msg('{} and {} match.'.format(tf_block, keras_block)) - - block_mapping = {x[0]: x[1] for x in zip(keras_blocks, tf_blocks)} - - changed_weights = 0 - for w in keras_model.weights: - if 'block' in w.name: - # example: 'block1a_dwconv/depthwise_kernel:0' -> 'block1a' - keras_block = w.name.split('/')[0].split('_')[0] - tf_block = block_mapping[keras_block] - tf_name = keras_name_to_tf_name_block( - w.name, - keras_block=keras_block, - tf_block=tf_block, - use_ema=use_ema, - model_name_tf=model_name_tf) - elif any([x in w.name for x in ['stem', 'top', 'predictions', 'probs']]): - tf_name = keras_name_to_tf_name_stem_top( - w.name, use_ema=use_ema, model_name_tf=model_name_tf) - elif 'normalization' in w.name: - io_utils.print_msg( - f'Skipping variable {w.name}: normalization is a Keras ' - 'preprocessing layer, which does not exist in the TF ckpt.') - continue - else: - raise ValueError('{} failed to parse.'.format(w.name)) - - try: - w_tf = tf.train.load_variable(path_ckpt, tf_name) - if (w.value().numpy() != w_tf).any(): - w.assign(w_tf) - changed_weights += 1 - except ValueError as e: - if any([x in w.name for x in ['top', 'predictions', 'probs']]): - warnings.warn( - 'Fail to load top layer variable {}' - 'from {} because of {}.'.format(w.name, tf_name, e), - stacklevel=2) - else: - raise ValueError('Fail to load {} from {}'.format(w.name, tf_name)) - - total_weights = len(keras_model.weights) - io_utils.print_msg(f'{changed_weights}/{total_weights} weights updated') - keras_model.save_weights(path_h5) - - -def get_variable_names_from_ckpt(path_ckpt, use_ema=True): - """Get list of tensor names from checkpoint. - - Args: - path_ckpt: str, path to the ckpt files - use_ema: Bool, whether to use ExponentialMovingAverage result or not. - Returns: - List of variable names from checkpoint. 
- """ - v_all = tf.train.list_variables(path_ckpt) - - # keep name only - v_name_all = [x[0] for x in v_all] - - if use_ema: - v_name_all = [x for x in v_name_all if 'ExponentialMovingAverage' in x] - else: - v_name_all = [x for x in v_name_all if 'ExponentialMovingAverage' not in x] - - # remove util variables used for RMSprop - v_name_all = [x for x in v_name_all if 'RMS' not in x] - return v_name_all - - -def get_tf_blocks(tf_weight_names): - """Extract the block names from list of full weight names.""" - # Example: 'efficientnet-b0/blocks_0/conv2d/kernel' -> 'blocks_0' - tf_blocks = {x.split('/')[1] for x in tf_weight_names if 'block' in x} - # sort by number - tf_blocks = sorted(tf_blocks, key=lambda x: int(x.split('_')[1])) - return tf_blocks - - -def get_keras_blocks(keras_weight_names): - """Extract the block names from list of full weight names.""" - # example: 'block1a_dwconv/depthwise_kernel:0' -> 'block1a' - keras_blocks = {x.split('_')[0] for x in keras_weight_names if 'block' in x} - return sorted(keras_blocks) - - -def keras_name_to_tf_name_stem_top(keras_name, - use_ema=True, - model_name_tf='efficientnet-b0'): - """Mapping name in h5 to ckpt that is in stem or top (head). - - we map name keras_name that points to a weight in h5 file - to a name of weight in ckpt file. - - Args: - keras_name: str, the name of weight in the h5 file of keras implementation - use_ema: Bool, use the ExponentialMovingAverage resuolt in ckpt or not - model_name_tf: str, the name of model in ckpt. - - Returns: - String for the name of weight as in ckpt file. - - Raises: - KeyError: if we cannot parse the keras_name. - """ - if use_ema: - ema = '/ExponentialMovingAverage' - else: - ema = '' - - stem_top_dict = { - 'probs/bias:0': '{}/head/dense/bias{}', - 'probs/kernel:0': '{}/head/dense/kernel{}', - 'predictions/bias:0': '{}/head/dense/bias{}', - 'predictions/kernel:0': '{}/head/dense/kernel{}', - 'stem_conv/kernel:0': '{}/stem/conv2d/kernel{}', - 'top_conv/kernel:0': '{}/head/conv2d/kernel{}', - } - for x in stem_top_dict: - stem_top_dict[x] = stem_top_dict[x].format(model_name_tf, ema) - - # stem batch normalization - for bn_weights in ['beta', 'gamma', 'moving_mean', 'moving_variance']: - tf_name = '{}/stem/tpu_batch_normalization/{}{}'.format( - model_name_tf, bn_weights, ema) - stem_top_dict['stem_bn/{}:0'.format(bn_weights)] = tf_name - - # top / head batch normalization - for bn_weights in ['beta', 'gamma', 'moving_mean', 'moving_variance']: - tf_name = '{}/head/tpu_batch_normalization/{}{}'.format( - model_name_tf, bn_weights, ema) - stem_top_dict['top_bn/{}:0'.format(bn_weights)] = tf_name - - if keras_name in stem_top_dict: - return stem_top_dict[keras_name] - raise KeyError('{} from h5 file cannot be parsed'.format(keras_name)) - - -def keras_name_to_tf_name_block(keras_name, - keras_block='block1a', - tf_block='blocks_0', - use_ema=True, - model_name_tf='efficientnet-b0'): - """Mapping name in h5 to ckpt that belongs to a block. - - we map name keras_name that points to a weight in h5 file - to a name of weight in ckpt file. - - Args: - keras_name: str, the name of weight in the h5 file of keras implementation - keras_block: str, the block name for keras implementation (e.g. 'block1a') - tf_block: str, the block name for tf implementation (e.g. 'blocks_0') - use_ema: Bool, use the ExponentialMovingAverage resuolt in ckpt or not - model_name_tf: str, the name of model in ckpt. - - Returns: - String for the name of weight as in ckpt file. 
- - Raises: - ValueError if keras_block does not show up in keras_name - """ - - if keras_block not in keras_name: - raise ValueError('block name {} not found in {}'.format( - keras_block, keras_name)) - - # all blocks in the first group will not have expand conv and bn - is_first_blocks = (keras_block[5] == '1') - - tf_name = [model_name_tf, tf_block] - - # depthwide conv - if 'dwconv' in keras_name: - tf_name.append('depthwise_conv2d') - tf_name.append('depthwise_kernel') - - # conv layers - if is_first_blocks: - # first blocks only have one conv2d - if 'project_conv' in keras_name: - tf_name.append('conv2d') - tf_name.append('kernel') - else: - if 'project_conv' in keras_name: - tf_name.append('conv2d_1') - tf_name.append('kernel') - elif 'expand_conv' in keras_name: - tf_name.append('conv2d') - tf_name.append('kernel') - - # squeeze expansion layers - if '_se_' in keras_name: - if 'reduce' in keras_name: - tf_name.append('se/conv2d') - elif 'expand' in keras_name: - tf_name.append('se/conv2d_1') - - if 'kernel' in keras_name: - tf_name.append('kernel') - elif 'bias' in keras_name: - tf_name.append('bias') - - # batch normalization layers - if 'bn' in keras_name: - if is_first_blocks: - if 'project' in keras_name: - tf_name.append('tpu_batch_normalization_1') - else: - tf_name.append('tpu_batch_normalization') - else: - if 'project' in keras_name: - tf_name.append('tpu_batch_normalization_2') - elif 'expand' in keras_name: - tf_name.append('tpu_batch_normalization') - else: - tf_name.append('tpu_batch_normalization_1') - - for x in ['moving_mean', 'moving_variance', 'beta', 'gamma']: - if x in keras_name: - tf_name.append(x) - if use_ema: - tf_name.append('ExponentialMovingAverage') - return '/'.join(tf_name) - - -def check_match(keras_block, tf_block, keras_weight_names, tf_weight_names, - model_name_tf): - """Check if the weights in h5 and ckpt match. - - we match each name from keras_weight_names that is in keras_block - and check if there is 1-1 correspondence to names from tf_weight_names - that is in tf_block - - Args: - keras_block: str, the block name for keras implementation (e.g. 'block1a') - tf_block: str, the block name for tf implementation (e.g. 'blocks_0') - keras_weight_names: list of str, weight names in keras implementation - tf_weight_names: list of str, weight names in tf implementation - model_name_tf: str, the name of model in ckpt. 
- """ - names_from_keras = set() - for x in keras_weight_names: - if keras_block in x: - y = keras_name_to_tf_name_block( - x, - keras_block=keras_block, - tf_block=tf_block, - model_name_tf=model_name_tf) - names_from_keras.add(y) - - names_from_tf = set() - for x in tf_weight_names: - if tf_block in x and x.split('/')[1].endswith(tf_block): - names_from_tf.add(x) - - names_missing = names_from_keras - names_from_tf - if names_missing: - raise ValueError('{} variables not found in checkpoint file: {}'.format( - len(names_missing), names_missing)) - - names_unused = names_from_tf - names_from_keras - if names_unused: - warnings.warn( - '{} variables from checkpoint file are not used: {}'.format( - len(names_unused), names_unused), - stacklevel=2) - - -if __name__ == '__main__': - arg_to_model = { - 'b0': efficientnet.EfficientNetB0, - 'b1': efficientnet.EfficientNetB1, - 'b2': efficientnet.EfficientNetB2, - 'b3': efficientnet.EfficientNetB3, - 'b4': efficientnet.EfficientNetB4, - 'b5': efficientnet.EfficientNetB5, - 'b6': efficientnet.EfficientNetB6, - 'b7': efficientnet.EfficientNetB7 - } - - p = argparse.ArgumentParser(description='write weights from checkpoint to h5') - p.add_argument( - '--model', - required=True, - type=str, - help='name of efficient model', - choices=arg_to_model.keys()) - p.add_argument( - '--notop', - action='store_true', - help='do not include top layers', - default=False) - p.add_argument('--ckpt', required=True, type=str, help='checkpoint path') - p.add_argument( - '--output', '-o', required=True, type=str, help='output (h5) file path') - args = p.parse_args() - - include_top = not args.notop - - model = arg_to_model[args.model](include_top=include_top) - write_ckpt_to_h5(args.output, args.ckpt, keras_model=model) diff --git a/keras/applications/imagenet_utils.py b/keras/applications/imagenet_utils.py index acecccccdf68..3aafbad0a174 100644 --- a/keras/applications/imagenet_utils.py +++ b/keras/applications/imagenet_utils.py @@ -22,12 +22,15 @@ from keras import activations from keras import backend from keras.utils import data_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export CLASS_INDEX = None -CLASS_INDEX_PATH = ('https://storage.googleapis.com/download.tensorflow.org/' - 'data/imagenet_class_index.json') +CLASS_INDEX_PATH = ( + "https://storage.googleapis.com/download.tensorflow.org/" + "data/imagenet_class_index.json" +) PREPROCESS_INPUT_DOC = """ @@ -53,10 +56,10 @@ The preprocessed data are written over the input data if the data types are compatible. To avoid this behaviour, `numpy.copy(x)` can be used. - data_format: Optional data format of the image tensor/array. Defaults to - None, in which case the global setting - `tf.keras.backend.image_data_format()` is used (unless you changed it, - it defaults to "channels_last").{mode} + data_format: Optional data format of the image tensor/array. None, means + the global setting `tf.keras.backend.image_data_format()` is used + (unless you changed it, it uses "channels_last").{mode} + Defaults to `None`. Returns: Preprocessed `numpy.array` or a `tf.Tensor` with type `float32`. @@ -67,7 +70,7 @@ """ PREPROCESS_INPUT_MODE_DOC = """ - mode: One of "caffe", "tf" or "torch". Defaults to "caffe". + mode: One of "caffe", "tf" or "torch". 
- caffe: will convert the images from RGB to BGR, then will zero-center each color channel with respect to the ImageNet dataset, @@ -77,6 +80,7 @@ - torch: will scale pixels between 0 and 1 and then will normalize each channel with respect to the ImageNet dataset. + Defaults to "caffe". """ PREPROCESS_INPUT_DEFAULT_ERROR_DOC = """ @@ -97,344 +101,381 @@ zero-centered with respect to the ImageNet dataset, without scaling.""" -@keras_export('keras.applications.imagenet_utils.preprocess_input') -def preprocess_input(x, data_format=None, mode='caffe'): - """Preprocesses a tensor or Numpy array encoding a batch of images.""" - if mode not in {'caffe', 'tf', 'torch'}: - raise ValueError('Expected mode to be one of `caffe`, `tf` or `torch`. ' - f'Received: mode={mode}') - - if data_format is None: - data_format = backend.image_data_format() - elif data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Expected data_format to be one of `channels_first` or ' - f'`channels_last`. Received: data_format={data_format}') - - if isinstance(x, np.ndarray): - return _preprocess_numpy_input( - x, data_format=data_format, mode=mode) - else: - return _preprocess_symbolic_input( - x, data_format=data_format, mode=mode) +@keras_export("keras.applications.imagenet_utils.preprocess_input") +def preprocess_input(x, data_format=None, mode="caffe"): + """Preprocesses a tensor or Numpy array encoding a batch of images.""" + if mode not in {"caffe", "tf", "torch"}: + raise ValueError( + "Expected mode to be one of `caffe`, `tf` or `torch`. " + f"Received: mode={mode}" + ) + + if data_format is None: + data_format = backend.image_data_format() + elif data_format not in {"channels_first", "channels_last"}: + raise ValueError( + "Expected data_format to be one of `channels_first` or " + f"`channels_last`. Received: data_format={data_format}" + ) + + if isinstance(x, np.ndarray): + return _preprocess_numpy_input(x, data_format=data_format, mode=mode) + else: + return _preprocess_symbolic_input(x, data_format=data_format, mode=mode) preprocess_input.__doc__ = PREPROCESS_INPUT_DOC.format( mode=PREPROCESS_INPUT_MODE_DOC, - ret='', - error=PREPROCESS_INPUT_DEFAULT_ERROR_DOC) + ret="", + error=PREPROCESS_INPUT_DEFAULT_ERROR_DOC, +) -@keras_export('keras.applications.imagenet_utils.decode_predictions') +@keras_export("keras.applications.imagenet_utils.decode_predictions") def decode_predictions(preds, top=5): - """Decodes the prediction of an ImageNet model. - - Args: - preds: Numpy array encoding a batch of predictions. - top: Integer, how many top-guesses to return. Defaults to 5. - - Returns: - A list of lists of top class prediction tuples - `(class_name, class_description, score)`. - One list of tuples per sample in batch input. - - Raises: - ValueError: In case of invalid shape of the `pred` array - (must be 2D). - """ - global CLASS_INDEX - - if len(preds.shape) != 2 or preds.shape[1] != 1000: - raise ValueError('`decode_predictions` expects ' - 'a batch of predictions ' - '(i.e. a 2D array of shape (samples, 1000)). 
' - 'Found array with shape: ' + str(preds.shape)) - if CLASS_INDEX is None: - fpath = data_utils.get_file( - 'imagenet_class_index.json', - CLASS_INDEX_PATH, - cache_subdir='models', - file_hash='c2c37ea517e94d9795004a39431a14cb') - with open(fpath) as f: - CLASS_INDEX = json.load(f) - results = [] - for pred in preds: - top_indices = pred.argsort()[-top:][::-1] - result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices] - result.sort(key=lambda x: x[2], reverse=True) - results.append(result) - return results + """Decodes the prediction of an ImageNet model. + + Args: + preds: Numpy array encoding a batch of predictions. + top: Integer, how many top-guesses to return. Defaults to 5. + + Returns: + A list of lists of top class prediction tuples + `(class_name, class_description, score)`. + One list of tuples per sample in batch input. + + Raises: + ValueError: In case of invalid shape of the `pred` array + (must be 2D). + """ + global CLASS_INDEX + + if len(preds.shape) != 2 or preds.shape[1] != 1000: + raise ValueError( + "`decode_predictions` expects " + "a batch of predictions " + "(i.e. a 2D array of shape (samples, 1000)). " + "Found array with shape: " + str(preds.shape) + ) + if CLASS_INDEX is None: + fpath = data_utils.get_file( + "imagenet_class_index.json", + CLASS_INDEX_PATH, + cache_subdir="models", + file_hash="c2c37ea517e94d9795004a39431a14cb", + ) + with open(fpath) as f: + CLASS_INDEX = json.load(f) + results = [] + for pred in preds: + top_indices = pred.argsort()[-top:][::-1] + result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices] + result.sort(key=lambda x: x[2], reverse=True) + results.append(result) + return results def _preprocess_numpy_input(x, data_format, mode): - """Preprocesses a Numpy array encoding a batch of images. - - Args: - x: Input array, 3D or 4D. - data_format: Data format of the image array. - mode: One of "caffe", "tf" or "torch". - - caffe: will convert the images from RGB to BGR, - then will zero-center each color channel with - respect to the ImageNet dataset, - without scaling. - - tf: will scale pixels between -1 and 1, - sample-wise. - - torch: will scale pixels between 0 and 1 and then - will normalize each channel with respect to the - ImageNet dataset. - - Returns: - Preprocessed Numpy array. - """ - if not issubclass(x.dtype.type, np.floating): - x = x.astype(backend.floatx(), copy=False) - - if mode == 'tf': - x /= 127.5 - x -= 1. - return x - elif mode == 'torch': - x /= 255. - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - else: - if data_format == 'channels_first': - # 'RGB'->'BGR' - if x.ndim == 3: - x = x[::-1, ...] - else: - x = x[:, ::-1, ...] + """Preprocesses a Numpy array encoding a batch of images. + + Args: + x: Input array, 3D or 4D. + data_format: Data format of the image array. + mode: One of "caffe", "tf" or "torch". + - caffe: will convert the images from RGB to BGR, + then will zero-center each color channel with + respect to the ImageNet dataset, + without scaling. + - tf: will scale pixels between -1 and 1, + sample-wise. + - torch: will scale pixels between 0 and 1 and then + will normalize each channel with respect to the + ImageNet dataset. + + Returns: + Preprocessed Numpy array. 
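# ---------------------------------------------------------------------------
# [Editor's note, not part of the patch] A minimal sketch of what the three
# preprocessing modes documented above actually compute. Only NumPy is
# assumed; the constants mirror the ImageNet means/stds used in the code.
import numpy as np

x = np.array([[[0.0, 127.5, 255.0]]])  # a single RGB pixel, channels_last

tf_mode = x / 127.5 - 1.0              # "tf": scale sample-wise to [-1, 1]
torch_mode = (x / 255.0 - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
caffe_mode = x[..., ::-1] - [103.939, 116.779, 123.68]  # RGB->BGR, centered

print(tf_mode)  # [[[-1.  0.  1.]]]; "caffe" stays on the 0..255 scale
# ---------------------------------------------------------------------------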
+ """ + if not issubclass(x.dtype.type, np.floating): + x = x.astype(backend.floatx(), copy=False) + + if mode == "tf": + x /= 127.5 + x -= 1.0 + return x + elif mode == "torch": + x /= 255.0 + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] else: - # 'RGB'->'BGR' - x = x[..., ::-1] - mean = [103.939, 116.779, 123.68] - std = None - - # Zero-center by mean pixel - if data_format == 'channels_first': - if x.ndim == 3: - x[0, :, :] -= mean[0] - x[1, :, :] -= mean[1] - x[2, :, :] -= mean[2] - if std is not None: - x[0, :, :] /= std[0] - x[1, :, :] /= std[1] - x[2, :, :] /= std[2] + if data_format == "channels_first": + # 'RGB'->'BGR' + if x.ndim == 3: + x = x[::-1, ...] + else: + x = x[:, ::-1, ...] + else: + # 'RGB'->'BGR' + x = x[..., ::-1] + mean = [103.939, 116.779, 123.68] + std = None + + # Zero-center by mean pixel + if data_format == "channels_first": + if x.ndim == 3: + x[0, :, :] -= mean[0] + x[1, :, :] -= mean[1] + x[2, :, :] -= mean[2] + if std is not None: + x[0, :, :] /= std[0] + x[1, :, :] /= std[1] + x[2, :, :] /= std[2] + else: + x[:, 0, :, :] -= mean[0] + x[:, 1, :, :] -= mean[1] + x[:, 2, :, :] -= mean[2] + if std is not None: + x[:, 0, :, :] /= std[0] + x[:, 1, :, :] /= std[1] + x[:, 2, :, :] /= std[2] else: - x[:, 0, :, :] -= mean[0] - x[:, 1, :, :] -= mean[1] - x[:, 2, :, :] -= mean[2] - if std is not None: - x[:, 0, :, :] /= std[0] - x[:, 1, :, :] /= std[1] - x[:, 2, :, :] /= std[2] - else: - x[..., 0] -= mean[0] - x[..., 1] -= mean[1] - x[..., 2] -= mean[2] - if std is not None: - x[..., 0] /= std[0] - x[..., 1] /= std[1] - x[..., 2] /= std[2] - return x + x[..., 0] -= mean[0] + x[..., 1] -= mean[1] + x[..., 2] -= mean[2] + if std is not None: + x[..., 0] /= std[0] + x[..., 1] /= std[1] + x[..., 2] /= std[2] + return x def _preprocess_symbolic_input(x, data_format, mode): - """Preprocesses a tensor encoding a batch of images. - - Args: - x: Input tensor, 3D or 4D. - data_format: Data format of the image tensor. - mode: One of "caffe", "tf" or "torch". - - caffe: will convert the images from RGB to BGR, - then will zero-center each color channel with - respect to the ImageNet dataset, - without scaling. - - tf: will scale pixels between -1 and 1, - sample-wise. - - torch: will scale pixels between 0 and 1 and then - will normalize each channel with respect to the - ImageNet dataset. - - Returns: - Preprocessed tensor. - """ - if mode == 'tf': - x /= 127.5 - x -= 1. - return x - elif mode == 'torch': - x /= 255. - mean = [0.485, 0.456, 0.406] - std = [0.229, 0.224, 0.225] - else: - if data_format == 'channels_first': - # 'RGB'->'BGR' - if backend.ndim(x) == 3: - x = x[::-1, ...] - else: - x = x[:, ::-1, ...] + """Preprocesses a tensor encoding a batch of images. + + Args: + x: Input tensor, 3D or 4D. + data_format: Data format of the image tensor. + mode: One of "caffe", "tf" or "torch". + - caffe: will convert the images from RGB to BGR, + then will zero-center each color channel with + respect to the ImageNet dataset, + without scaling. + - tf: will scale pixels between -1 and 1, + sample-wise. + - torch: will scale pixels between 0 and 1 and then + will normalize each channel with respect to the + ImageNet dataset. + + Returns: + Preprocessed tensor. 
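# ---------------------------------------------------------------------------
# [Editor's note, not part of the patch] A hedged sketch of the symbolic
# "caffe" path below: bias_add with a negated mean performs the same
# zero-centering as elementwise subtraction. Plain TensorFlow ops only.
import numpy as np
import tensorflow as tf

mean = np.array([103.939, 116.779, 123.68], dtype="float32")
x = tf.random.uniform((1, 4, 4, 3), 0, 255)  # channels_last batch
x_bgr = x[..., ::-1]                         # 'RGB'->'BGR'
centered = tf.nn.bias_add(x_bgr, -mean)      # add the negative mean
assert np.allclose(centered.numpy(), (x_bgr - mean).numpy(), atol=1e-4)
# ---------------------------------------------------------------------------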
+ """ + if mode == "tf": + x /= 127.5 + x -= 1.0 + return x + elif mode == "torch": + x /= 255.0 + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] else: - # 'RGB'->'BGR' - x = x[..., ::-1] - mean = [103.939, 116.779, 123.68] - std = None - - mean_tensor = backend.constant(-np.array(mean)) - - # Zero-center by mean pixel - if backend.dtype(x) != backend.dtype(mean_tensor): - x = backend.bias_add( - x, backend.cast(mean_tensor, backend.dtype(x)), data_format=data_format) - else: - x = backend.bias_add(x, mean_tensor, data_format) - if std is not None: - std_tensor = backend.constant(np.array(std), dtype=backend.dtype(x)) - if data_format == 'channels_first': - std_tensor = backend.reshape(std_tensor, (-1, 1, 1)) - x /= std_tensor - return x - - -def obtain_input_shape(input_shape, - default_size, - min_size, - data_format, - require_flatten, - weights=None): - """Internal utility to compute/validate a model's input shape. - - Args: - input_shape: Either None (will return the default network input shape), - or a user-provided shape to be validated. - default_size: Default input width/height for the model. - min_size: Minimum input width/height accepted by the model. - data_format: Image data format to use. - require_flatten: Whether the model is expected to - be linked to a classifier via a Flatten layer. - weights: One of `None` (random initialization) - or 'imagenet' (pre-training on ImageNet). - If weights='imagenet' input channels must be equal to 3. + if data_format == "channels_first": + # 'RGB'->'BGR' + if backend.ndim(x) == 3: + x = x[::-1, ...] + else: + x = x[:, ::-1, ...] + else: + # 'RGB'->'BGR' + x = x[..., ::-1] + mean = [103.939, 116.779, 123.68] + std = None + + mean_tensor = backend.constant(-np.array(mean)) + + # Zero-center by mean pixel + if backend.dtype(x) != backend.dtype(mean_tensor): + x = backend.bias_add( + x, + backend.cast(mean_tensor, backend.dtype(x)), + data_format=data_format, + ) + else: + x = backend.bias_add(x, mean_tensor, data_format) + if std is not None: + std_tensor = backend.constant(np.array(std), dtype=backend.dtype(x)) + if data_format == "channels_first": + std_tensor = backend.reshape(std_tensor, (-1, 1, 1)) + x /= std_tensor + return x - Returns: - An integer shape tuple (may include None entries). - Raises: - ValueError: In case of invalid argument values. - """ - if weights != 'imagenet' and input_shape and len(input_shape) == 3: - if data_format == 'channels_first': - if input_shape[0] not in {1, 3}: - warnings.warn( - 'This model usually expects 1 or 3 input channels. ' - 'However, it was passed an input_shape with ' + - str(input_shape[0]) + ' input channels.', - stacklevel=2) - default_shape = (input_shape[0], default_size, default_size) - else: - if input_shape[-1] not in {1, 3}: - warnings.warn( - 'This model usually expects 1 or 3 input channels. ' - 'However, it was passed an input_shape with ' + - str(input_shape[-1]) + ' input channels.', - stacklevel=2) - default_shape = (default_size, default_size, input_shape[-1]) - else: - if data_format == 'channels_first': - default_shape = (3, default_size, default_size) +def obtain_input_shape( + input_shape, + default_size, + min_size, + data_format, + require_flatten, + weights=None, +): + """Internal utility to compute/validate a model's input shape. + + Args: + input_shape: Either None (will return the default network input shape), + or a user-provided shape to be validated. + default_size: Default input width/height for the model. 
+ min_size: Minimum input width/height accepted by the model. + data_format: Image data format to use. + require_flatten: Whether the model is expected to + be linked to a classifier via a Flatten layer. + weights: One of `None` (random initialization) + or 'imagenet' (pre-training on ImageNet). + If weights='imagenet' input channels must be equal to 3. + + Returns: + An integer shape tuple (may include None entries). + + Raises: + ValueError: In case of invalid argument values. + """ + if weights != "imagenet" and input_shape and len(input_shape) == 3: + if data_format == "channels_first": + if input_shape[0] not in {1, 3}: + warnings.warn( + "This model usually expects 1 or 3 input channels. " + "However, it was passed an input_shape with " + + str(input_shape[0]) + + " input channels.", + stacklevel=2, + ) + default_shape = (input_shape[0], default_size, default_size) + else: + if input_shape[-1] not in {1, 3}: + warnings.warn( + "This model usually expects 1 or 3 input channels. " + "However, it was passed an input_shape with " + + str(input_shape[-1]) + + " input channels.", + stacklevel=2, + ) + default_shape = (default_size, default_size, input_shape[-1]) else: - default_shape = (default_size, default_size, 3) - if weights == 'imagenet' and require_flatten: - if input_shape is not None: - if input_shape != default_shape: - raise ValueError('When setting `include_top=True` ' - 'and loading `imagenet` weights, ' - f'`input_shape` should be {default_shape}. ' - f'Received: input_shape={input_shape}') - return default_shape - if input_shape: - if data_format == 'channels_first': - if input_shape is not None: - if len(input_shape) != 3: - raise ValueError('`input_shape` must be a tuple of three integers.') - if input_shape[0] != 3 and weights == 'imagenet': - raise ValueError('The input must have 3 channels; Received ' - f'`input_shape={input_shape}`') - if ((input_shape[1] is not None and input_shape[1] < min_size) or - (input_shape[2] is not None and input_shape[2] < min_size)): - raise ValueError(f'Input size must be at least {min_size}' - f'x{min_size}; Received: ' - f'input_shape={input_shape}') + if data_format == "channels_first": + default_shape = (3, default_size, default_size) + else: + default_shape = (default_size, default_size, 3) + if weights == "imagenet" and require_flatten: + if input_shape is not None: + if input_shape != default_shape: + raise ValueError( + "When setting `include_top=True` " + "and loading `imagenet` weights, " + f"`input_shape` should be {default_shape}. " + f"Received: input_shape={input_shape}" + ) + return default_shape + if input_shape: + if data_format == "channels_first": + if input_shape is not None: + if len(input_shape) != 3: + raise ValueError( + "`input_shape` must be a tuple of three integers." + ) + if input_shape[0] != 3 and weights == "imagenet": + raise ValueError( + "The input must have 3 channels; Received " + f"`input_shape={input_shape}`" + ) + if ( + input_shape[1] is not None and input_shape[1] < min_size + ) or (input_shape[2] is not None and input_shape[2] < min_size): + raise ValueError( + f"Input size must be at least {min_size}" + f"x{min_size}; Received: " + f"input_shape={input_shape}" + ) + else: + if input_shape is not None: + if len(input_shape) != 3: + raise ValueError( + "`input_shape` must be a tuple of three integers." 
+ ) + if input_shape[-1] != 3 and weights == "imagenet": + raise ValueError( + "The input must have 3 channels; Received " + f"`input_shape={input_shape}`" + ) + if ( + input_shape[0] is not None and input_shape[0] < min_size + ) or (input_shape[1] is not None and input_shape[1] < min_size): + raise ValueError( + "Input size must be at least " + f"{min_size}x{min_size}; Received: " + f"input_shape={input_shape}" + ) else: - if input_shape is not None: - if len(input_shape) != 3: - raise ValueError('`input_shape` must be a tuple of three integers.') - if input_shape[-1] != 3 and weights == 'imagenet': - raise ValueError('The input must have 3 channels; Received ' - f'`input_shape={input_shape}`') - if ((input_shape[0] is not None and input_shape[0] < min_size) or - (input_shape[1] is not None and input_shape[1] < min_size)): - raise ValueError('Input size must be at least ' - f'{min_size}x{min_size}; Received: ' - f'input_shape={input_shape}') - else: + if require_flatten: + input_shape = default_shape + else: + if data_format == "channels_first": + input_shape = (3, None, None) + else: + input_shape = (None, None, 3) if require_flatten: - input_shape = default_shape - else: - if data_format == 'channels_first': - input_shape = (3, None, None) - else: - input_shape = (None, None, 3) - if require_flatten: - if None in input_shape: - raise ValueError('If `include_top` is True, ' - 'you should specify a static `input_shape`. ' - f'Received: input_shape={input_shape}') - return input_shape + if None in input_shape: + raise ValueError( + "If `include_top` is True, " + "you should specify a static `input_shape`. " + f"Received: input_shape={input_shape}" + ) + return input_shape def correct_pad(inputs, kernel_size): - """Returns a tuple for zero-padding for 2D convolution with downsampling. - - Args: - inputs: Input tensor. - kernel_size: An integer or tuple/list of 2 integers. - - Returns: - A tuple. - """ - img_dim = 2 if backend.image_data_format() == 'channels_first' else 1 - input_size = backend.int_shape(inputs)[img_dim:(img_dim + 2)] - if isinstance(kernel_size, int): - kernel_size = (kernel_size, kernel_size) - if input_size[0] is None: - adjust = (1, 1) - else: - adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2) - correct = (kernel_size[0] // 2, kernel_size[1] // 2) - return ((correct[0] - adjust[0], correct[0]), - (correct[1] - adjust[1], correct[1])) + """Returns a tuple for zero-padding for 2D convolution with downsampling. + + Args: + inputs: Input tensor. + kernel_size: An integer or tuple/list of 2 integers. + + Returns: + A tuple. + """ + img_dim = 2 if backend.image_data_format() == "channels_first" else 1 + input_size = backend.int_shape(inputs)[img_dim : (img_dim + 2)] + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + if input_size[0] is None: + adjust = (1, 1) + else: + adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2) + correct = (kernel_size[0] // 2, kernel_size[1] // 2) + return ( + (correct[0] - adjust[0], correct[0]), + (correct[1] - adjust[1], correct[1]), + ) def validate_activation(classifier_activation, weights): - """validates that the classifer_activation is compatible with the weights. - - Args: - classifier_activation: str or callable activation function - weights: The pretrained weights to load. - - Raises: - ValueError: if an activation other than `None` or `softmax` are used with - pretrained weights. 
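# ---------------------------------------------------------------------------
# [Editor's note, not part of the patch] correct_pad (reformatted above) in
# action: a hedged check of the asymmetric padding rule for stride-2
# convolutions, assuming channels_last and a static 224x224 input.
from keras import layers
from keras.applications import imagenet_utils

inputs = layers.Input(shape=(224, 224, 3))
pad = imagenet_utils.correct_pad(inputs, kernel_size=3)
print(pad)  # ((0, 1), (0, 1)): even input sizes pad bottom/right only
x = layers.ZeroPadding2D(padding=pad)(inputs)
x = layers.Conv2D(32, 3, strides=2, padding="valid")(x)
print(x.shape)  # (None, 112, 112, 32): exact halving, no pixels dropped
# ---------------------------------------------------------------------------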
- """ - if weights is None: - return - - classifier_activation = activations.get(classifier_activation) - if classifier_activation not in { - activations.get('softmax'), - activations.get(None) - }: - raise ValueError('Only `None` and `softmax` activations are allowed ' - 'for the `classifier_activation` argument when using ' - 'pretrained weights, with `include_top=True`; Received: ' - f'classifier_activation={classifier_activation}') + """validates that the classifer_activation is compatible with the weights. + + Args: + classifier_activation: str or callable activation function + weights: The pretrained weights to load. + + Raises: + ValueError: if an activation other than `None` or `softmax` are used with + pretrained weights. + """ + if weights is None: + return + + classifier_activation = activations.get(classifier_activation) + if classifier_activation not in { + activations.get("softmax"), + activations.get(None), + }: + raise ValueError( + "Only `None` and `softmax` activations are allowed " + "for the `classifier_activation` argument when using " + "pretrained weights, with `include_top=True`; Received: " + f"classifier_activation={classifier_activation}" + ) diff --git a/keras/applications/imagenet_utils_test.py b/keras/applications/imagenet_utils_test.py index 6ca7ee811e75..8369884ee6de 100644 --- a/keras/applications/imagenet_utils_test.py +++ b/keras/applications/imagenet_utils_test.py @@ -14,283 +14,312 @@ # ============================================================================== """Tests for imagenet_utils.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras -from keras.testing_infra import test_combinations from keras.applications import imagenet_utils as utils from keras.mixed_precision.policy import set_global_policy +from keras.testing_infra import test_combinations class TestImageNetUtils(test_combinations.TestCase): - - def test_preprocess_input(self): - # Test invalid mode check - x = np.random.uniform(0, 255, (10, 10, 3)) - with self.assertRaises(ValueError): - utils.preprocess_input(x, mode='some_unknown_mode') - - # Test image batch with float and int image input - x = np.random.uniform(0, 255, (2, 10, 10, 3)) - xint = x.astype('int32') - self.assertEqual(utils.preprocess_input(x).shape, x.shape) - self.assertEqual(utils.preprocess_input(xint).shape, xint.shape) - - out1 = utils.preprocess_input(x, 'channels_last') - out1int = utils.preprocess_input(xint, 'channels_last') - out2 = utils.preprocess_input( - np.transpose(x, (0, 3, 1, 2)), 'channels_first') - out2int = utils.preprocess_input( - np.transpose(xint, (0, 3, 1, 2)), 'channels_first') - self.assertAllClose(out1, out2.transpose(0, 2, 3, 1)) - self.assertAllClose(out1int, out2int.transpose(0, 2, 3, 1)) - - # Test single image - x = np.random.uniform(0, 255, (10, 10, 3)) - xint = x.astype('int32') - self.assertEqual(utils.preprocess_input(x).shape, x.shape) - self.assertEqual(utils.preprocess_input(xint).shape, xint.shape) - - out1 = utils.preprocess_input(x, 'channels_last') - out1int = utils.preprocess_input(xint, 'channels_last') - out2 = utils.preprocess_input(np.transpose(x, (2, 0, 1)), 'channels_first') - out2int = utils.preprocess_input( - np.transpose(xint, (2, 0, 1)), 'channels_first') - self.assertAllClose(out1, out2.transpose(1, 2, 0)) - self.assertAllClose(out1int, out2int.transpose(1, 2, 0)) - - # Test that writing over the input data works predictably - for mode in ['torch', 'tf']: - x = np.random.uniform(0, 
255, (2, 10, 10, 3)) - xint = x.astype('int') - x2 = utils.preprocess_input(x, mode=mode) - xint2 = utils.preprocess_input(xint) - self.assertAllClose(x, x2) - self.assertNotEqual(xint.astype('float').max(), xint2.max()) - - # Caffe mode works differently from the others - x = np.random.uniform(0, 255, (2, 10, 10, 3)) - xint = x.astype('int') - x2 = utils.preprocess_input(x, data_format='channels_last', mode='caffe') - xint2 = utils.preprocess_input(xint) - self.assertAllClose(x, x2[..., ::-1]) - self.assertNotEqual(xint.astype('float').max(), xint2.max()) - - @parameterized.named_parameters([ - { - 'testcase_name': 'mode_torch', - 'mode': 'torch' - }, - { - 'testcase_name': 'mode_tf', - 'mode': 'tf' - }, - { - 'testcase_name': 'mode_caffe', - 'mode': 'caffe' - }, - ]) - def test_preprocess_input_symbolic(self, mode): - # Test image batch - x = np.random.uniform(0, 255, (2, 10, 10, 3)) - inputs = keras.layers.Input(shape=x.shape[1:]) - outputs = keras.layers.Lambda( - lambda x: utils.preprocess_input(x, mode=mode), - output_shape=x.shape[1:])( - inputs) - model = keras.Model(inputs, outputs) - self.assertEqual(model.predict(x).shape, x.shape) - - outputs1 = keras.layers.Lambda( - lambda x: utils.preprocess_input(x, 'channels_last', mode=mode), - output_shape=x.shape[1:])( - inputs) - model1 = keras.Model(inputs, outputs1) - out1 = model1.predict(x) - x2 = np.transpose(x, (0, 3, 1, 2)) - inputs2 = keras.layers.Input(shape=x2.shape[1:]) - outputs2 = keras.layers.Lambda( - lambda x: utils.preprocess_input(x, 'channels_first', mode=mode), - output_shape=x2.shape[1:])( - inputs2) - model2 = keras.Model(inputs2, outputs2) - out2 = model2.predict(x2) - self.assertAllClose(out1, out2.transpose(0, 2, 3, 1)) - - # Test single image - x = np.random.uniform(0, 255, (10, 10, 3)) - inputs = keras.layers.Input(shape=x.shape) - outputs = keras.layers.Lambda( - lambda x: utils.preprocess_input(x, mode=mode), output_shape=x.shape)( - inputs) - model = keras.Model(inputs, outputs) - self.assertEqual(model.predict(x[np.newaxis])[0].shape, x.shape) - - outputs1 = keras.layers.Lambda( - lambda x: utils.preprocess_input(x, 'channels_last', mode=mode), - output_shape=x.shape)( - inputs) - model1 = keras.Model(inputs, outputs1) - out1 = model1.predict(x[np.newaxis])[0] - x2 = np.transpose(x, (2, 0, 1)) - inputs2 = keras.layers.Input(shape=x2.shape) - outputs2 = keras.layers.Lambda( - lambda x: utils.preprocess_input(x, 'channels_first', mode=mode), - output_shape=x2.shape)( - inputs2) - model2 = keras.Model(inputs2, outputs2) - out2 = model2.predict(x2[np.newaxis])[0] - self.assertAllClose(out1, out2.transpose(1, 2, 0)) - - @parameterized.named_parameters([ - { - 'testcase_name': 'mode_torch', - 'mode': 'torch' - }, - { - 'testcase_name': 'mode_tf', - 'mode': 'tf' - }, - { - 'testcase_name': 'mode_caffe', - 'mode': 'caffe' - }, - ]) - def test_preprocess_input_symbolic_mixed_precision(self, mode): - if not tf.__internal__.tf2.enabled(): - self.skipTest('The global policy can only be tested in TensorFlow 2') - set_global_policy('mixed_float16') - shape = (20, 20, 3) - inputs = keras.layers.Input(shape=shape) - try: - keras.layers.Lambda( - lambda x: utils.preprocess_input(x, mode=mode), output_shape=shape)( - inputs) - finally: - set_global_policy('float32') - - @parameterized.named_parameters([ - {'testcase_name': 'channels_last_format', - 'data_format': 'channels_last'}, - {'testcase_name': 'channels_first_format', - 'data_format': 'channels_first'}, - ]) - def test_obtain_input_shape(self, data_format): - # 
input_shape and default_size are not identical. - with self.assertRaises(ValueError): - utils.obtain_input_shape( - input_shape=(224, 224, 3), - default_size=299, - min_size=139, - data_format='channels_last', - require_flatten=True, - weights='imagenet') - - # Test invalid use cases - - shape = (139, 139) - if data_format == 'channels_last': - input_shape = shape + (99,) - else: - input_shape = (99,) + shape - - # input_shape is smaller than min_size. - shape = (100, 100) - if data_format == 'channels_last': - input_shape = shape + (3,) - else: - input_shape = (3,) + shape - with self.assertRaises(ValueError): - utils.obtain_input_shape( - input_shape=input_shape, - default_size=None, - min_size=139, - data_format=data_format, - require_flatten=False) - - # shape is 1D. - shape = (100,) - if data_format == 'channels_last': - input_shape = shape + (3,) - else: - input_shape = (3,) + shape - with self.assertRaises(ValueError): - utils.obtain_input_shape( - input_shape=input_shape, - default_size=None, - min_size=139, - data_format=data_format, - require_flatten=False) - - # the number of channels is 5 not 3. - shape = (100, 100) - if data_format == 'channels_last': - input_shape = shape + (5,) - else: - input_shape = (5,) + shape - with self.assertRaises(ValueError): - utils.obtain_input_shape( - input_shape=input_shape, - default_size=None, - min_size=139, - data_format=data_format, - require_flatten=False) - - # require_flatten=True with dynamic input shape. - with self.assertRaises(ValueError): - utils.obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=True) - - # test include top - self.assertEqual(utils.obtain_input_shape( - input_shape=(3, 200, 200), - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=True), (3, 200, 200)) - - self.assertEqual(utils.obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_last', - require_flatten=False), (None, None, 3)) - - self.assertEqual(utils.obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=False), (3, None, None)) - - self.assertEqual(utils.obtain_input_shape( - input_shape=None, - default_size=None, - min_size=139, - data_format='channels_last', - require_flatten=False), (None, None, 3)) - - self.assertEqual(utils.obtain_input_shape( - input_shape=(150, 150, 3), - default_size=None, - min_size=139, - data_format='channels_last', - require_flatten=False), (150, 150, 3)) - - self.assertEqual(utils.obtain_input_shape( - input_shape=(3, None, None), - default_size=None, - min_size=139, - data_format='channels_first', - require_flatten=False), (3, None, None)) - - -if __name__ == '__main__': - tf.test.main() + def test_preprocess_input(self): + # Test invalid mode check + x = np.random.uniform(0, 255, (10, 10, 3)) + with self.assertRaises(ValueError): + utils.preprocess_input(x, mode="some_unknown_mode") + + # Test image batch with float and int image input + x = np.random.uniform(0, 255, (2, 10, 10, 3)) + xint = x.astype("int32") + self.assertEqual(utils.preprocess_input(x).shape, x.shape) + self.assertEqual(utils.preprocess_input(xint).shape, xint.shape) + + out1 = utils.preprocess_input(x, "channels_last") + out1int = utils.preprocess_input(xint, "channels_last") + out2 = utils.preprocess_input( + np.transpose(x, (0, 3, 1, 2)), "channels_first" + ) + out2int = utils.preprocess_input( + np.transpose(xint, (0, 3, 1, 
2)), "channels_first" + ) + self.assertAllClose(out1, out2.transpose(0, 2, 3, 1)) + self.assertAllClose(out1int, out2int.transpose(0, 2, 3, 1)) + + # Test single image + x = np.random.uniform(0, 255, (10, 10, 3)) + xint = x.astype("int32") + self.assertEqual(utils.preprocess_input(x).shape, x.shape) + self.assertEqual(utils.preprocess_input(xint).shape, xint.shape) + + out1 = utils.preprocess_input(x, "channels_last") + out1int = utils.preprocess_input(xint, "channels_last") + out2 = utils.preprocess_input( + np.transpose(x, (2, 0, 1)), "channels_first" + ) + out2int = utils.preprocess_input( + np.transpose(xint, (2, 0, 1)), "channels_first" + ) + self.assertAllClose(out1, out2.transpose(1, 2, 0)) + self.assertAllClose(out1int, out2int.transpose(1, 2, 0)) + + # Test that writing over the input data works predictably + for mode in ["torch", "tf"]: + x = np.random.uniform(0, 255, (2, 10, 10, 3)) + xint = x.astype("int") + x2 = utils.preprocess_input(x, mode=mode) + xint2 = utils.preprocess_input(xint) + self.assertAllClose(x, x2) + self.assertNotEqual(xint.astype("float").max(), xint2.max()) + + # Caffe mode works differently from the others + x = np.random.uniform(0, 255, (2, 10, 10, 3)) + xint = x.astype("int") + x2 = utils.preprocess_input( + x, data_format="channels_last", mode="caffe" + ) + xint2 = utils.preprocess_input(xint) + self.assertAllClose(x, x2[..., ::-1]) + self.assertNotEqual(xint.astype("float").max(), xint2.max()) + + @parameterized.named_parameters( + [ + {"testcase_name": "mode_torch", "mode": "torch"}, + {"testcase_name": "mode_tf", "mode": "tf"}, + {"testcase_name": "mode_caffe", "mode": "caffe"}, + ] + ) + def test_preprocess_input_symbolic(self, mode): + # Test image batch + x = np.random.uniform(0, 255, (2, 10, 10, 3)) + inputs = keras.layers.Input(shape=x.shape[1:]) + outputs = keras.layers.Lambda( + lambda x: utils.preprocess_input(x, mode=mode), + output_shape=x.shape[1:], + )(inputs) + model = keras.Model(inputs, outputs) + self.assertEqual(model.predict(x).shape, x.shape) + + outputs1 = keras.layers.Lambda( + lambda x: utils.preprocess_input(x, "channels_last", mode=mode), + output_shape=x.shape[1:], + )(inputs) + model1 = keras.Model(inputs, outputs1) + out1 = model1.predict(x) + x2 = np.transpose(x, (0, 3, 1, 2)) + inputs2 = keras.layers.Input(shape=x2.shape[1:]) + outputs2 = keras.layers.Lambda( + lambda x: utils.preprocess_input(x, "channels_first", mode=mode), + output_shape=x2.shape[1:], + )(inputs2) + model2 = keras.Model(inputs2, outputs2) + out2 = model2.predict(x2) + self.assertAllClose(out1, out2.transpose(0, 2, 3, 1)) + + # Test single image + x = np.random.uniform(0, 255, (10, 10, 3)) + inputs = keras.layers.Input(shape=x.shape) + outputs = keras.layers.Lambda( + lambda x: utils.preprocess_input(x, mode=mode), output_shape=x.shape + )(inputs) + model = keras.Model(inputs, outputs) + self.assertEqual(model.predict(x[np.newaxis])[0].shape, x.shape) + + outputs1 = keras.layers.Lambda( + lambda x: utils.preprocess_input(x, "channels_last", mode=mode), + output_shape=x.shape, + )(inputs) + model1 = keras.Model(inputs, outputs1) + out1 = model1.predict(x[np.newaxis])[0] + x2 = np.transpose(x, (2, 0, 1)) + inputs2 = keras.layers.Input(shape=x2.shape) + outputs2 = keras.layers.Lambda( + lambda x: utils.preprocess_input(x, "channels_first", mode=mode), + output_shape=x2.shape, + )(inputs2) + model2 = keras.Model(inputs2, outputs2) + out2 = model2.predict(x2[np.newaxis])[0] + self.assertAllClose(out1, out2.transpose(1, 2, 0)) + + 
@parameterized.named_parameters( + [ + {"testcase_name": "mode_torch", "mode": "torch"}, + {"testcase_name": "mode_tf", "mode": "tf"}, + {"testcase_name": "mode_caffe", "mode": "caffe"}, + ] + ) + def test_preprocess_input_symbolic_mixed_precision(self, mode): + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "The global policy can only be tested in TensorFlow 2" + ) + set_global_policy("mixed_float16") + shape = (20, 20, 3) + inputs = keras.layers.Input(shape=shape) + try: + keras.layers.Lambda( + lambda x: utils.preprocess_input(x, mode=mode), + output_shape=shape, + )(inputs) + finally: + set_global_policy("float32") + + @parameterized.named_parameters( + [ + { + "testcase_name": "channels_last_format", + "data_format": "channels_last", + }, + { + "testcase_name": "channels_first_format", + "data_format": "channels_first", + }, + ] + ) + def test_obtain_input_shape(self, data_format): + # input_shape and default_size are not identical. + with self.assertRaises(ValueError): + utils.obtain_input_shape( + input_shape=(224, 224, 3), + default_size=299, + min_size=139, + data_format="channels_last", + require_flatten=True, + weights="imagenet", + ) + + # Test invalid use cases + + shape = (139, 139) + if data_format == "channels_last": + input_shape = shape + (99,) + else: + input_shape = (99,) + shape + + # input_shape is smaller than min_size. + shape = (100, 100) + if data_format == "channels_last": + input_shape = shape + (3,) + else: + input_shape = (3,) + shape + with self.assertRaises(ValueError): + utils.obtain_input_shape( + input_shape=input_shape, + default_size=None, + min_size=139, + data_format=data_format, + require_flatten=False, + ) + + # shape is 1D. + shape = (100,) + if data_format == "channels_last": + input_shape = shape + (3,) + else: + input_shape = (3,) + shape + with self.assertRaises(ValueError): + utils.obtain_input_shape( + input_shape=input_shape, + default_size=None, + min_size=139, + data_format=data_format, + require_flatten=False, + ) + + # the number of channels is 5 not 3. + shape = (100, 100) + if data_format == "channels_last": + input_shape = shape + (5,) + else: + input_shape = (5,) + shape + with self.assertRaises(ValueError): + utils.obtain_input_shape( + input_shape=input_shape, + default_size=None, + min_size=139, + data_format=data_format, + require_flatten=False, + ) + + # require_flatten=True with dynamic input shape. 
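# ---------------------------------------------------------------------------
# [Editor's note, not part of the patch] The validation rules exercised by
# this test, shown standalone; `utils` is keras.applications.imagenet_utils
# and the sizes below are illustrative assumptions.
from keras.applications import imagenet_utils as utils

# No input_shape and require_flatten=False -> fully dynamic spatial dims.
print(utils.obtain_input_shape(
    input_shape=None, default_size=224, min_size=32,
    data_format="channels_last", require_flatten=False))  # (None, None, 3)

# Spatial dims below min_size raise a ValueError.
try:
    utils.obtain_input_shape(
        input_shape=(16, 16, 3), default_size=224, min_size=32,
        data_format="channels_last", require_flatten=False)
except ValueError as e:
    print(e)  # Input size must be at least 32x32; Received: ...
# ---------------------------------------------------------------------------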
+ with self.assertRaises(ValueError): + utils.obtain_input_shape( + input_shape=None, + default_size=None, + min_size=139, + data_format="channels_first", + require_flatten=True, + ) + + # test include top + self.assertEqual( + utils.obtain_input_shape( + input_shape=(3, 200, 200), + default_size=None, + min_size=139, + data_format="channels_first", + require_flatten=True, + ), + (3, 200, 200), + ) + + self.assertEqual( + utils.obtain_input_shape( + input_shape=None, + default_size=None, + min_size=139, + data_format="channels_last", + require_flatten=False, + ), + (None, None, 3), + ) + + self.assertEqual( + utils.obtain_input_shape( + input_shape=None, + default_size=None, + min_size=139, + data_format="channels_first", + require_flatten=False, + ), + (3, None, None), + ) + + self.assertEqual( + utils.obtain_input_shape( + input_shape=None, + default_size=None, + min_size=139, + data_format="channels_last", + require_flatten=False, + ), + (None, None, 3), + ) + + self.assertEqual( + utils.obtain_input_shape( + input_shape=(150, 150, 3), + default_size=None, + min_size=139, + data_format="channels_last", + require_flatten=False, + ), + (150, 150, 3), + ) + + self.assertEqual( + utils.obtain_input_shape( + input_shape=(3, None, None), + default_size=None, + min_size=139, + data_format="channels_first", + require_flatten=False, + ), + (3, None, None), + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/applications/inception_resnet_v2.py b/keras/applications/inception_resnet_v2.py index b30a4799f10c..937139189898 100644 --- a/keras/applications/inception_resnet_v2.py +++ b/keras/applications/inception_resnet_v2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """Inception-ResNet V2 model for Keras. Reference: @@ -23,372 +23,416 @@ import tensorflow.compat.v2 as tf +import keras from keras import backend +from keras import layers as keras_layers from keras.applications import imagenet_utils from keras.engine import training from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHT_URL = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/inception_resnet_v2/') +BASE_WEIGHT_URL = ( + "https://storage.googleapis.com/tensorflow/" + "keras-applications/inception_resnet_v2/" +) layers = None -@keras_export('keras.applications.inception_resnet_v2.InceptionResNetV2', - 'keras.applications.InceptionResNetV2') -def InceptionResNetV2(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - """Instantiates the Inception-ResNet v2 architecture. - - Reference: - - [Inception-v4, Inception-ResNet and the Impact of - Residual Connections on Learning](https://arxiv.org/abs/1602.07261) - (AAAI 2017) - - This function returns a Keras image classification model, - optionally loaded with weights pre-trained on ImageNet. - - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). 
- - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - Note: each Keras Application expects a specific kind of input preprocessing. - For InceptionResNetV2, call - `tf.keras.applications.inception_resnet_v2.preprocess_input` - on your inputs before passing them to the model. - `inception_resnet_v2.preprocess_input` - will scale input pixels between -1 and 1. - - Args: - include_top: whether to include the fully-connected - layer at the top of the network. - weights: one of `None` (random initialization), - 'imagenet' (pre-training on ImageNet), - or the path to the weights file to be loaded. - input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple, only to be specified - if `include_top` is `False` (otherwise the input shape - has to be `(299, 299, 3)` (with `'channels_last'` data format) - or `(3, 299, 299)` (with `'channels_first'` data format). - It should have exactly 3 inputs channels, - and width and height should be no smaller than 75. - E.g. `(150, 150, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model will be - the 4D tensor output of the last convolutional block. - - `'avg'` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a 2D tensor. - - `'max'` means that global max pooling will be applied. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is `True`, and - if no `weights` argument is specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - **kwargs: For backwards compatibility only. - - Returns: - A `keras.Model` instance. 
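# ---------------------------------------------------------------------------
# [Editor's note, not part of the patch] Typical end-to-end use of the API
# documented above; a hedged sketch that assumes the pretrained weights can
# be downloaded and uses a random 299x299 input purely for shape checking.
import numpy as np
from keras.applications import inception_resnet_v2

model = inception_resnet_v2.InceptionResNetV2(weights="imagenet")
img = np.random.uniform(0, 255, (1, 299, 299, 3)).astype("float32")
preds = model.predict(inception_resnet_v2.preprocess_input(img))
print(inception_resnet_v2.decode_predictions(preds, top=3))
# ---------------------------------------------------------------------------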
- """ - global layers - if 'layers' in kwargs: - layers = kwargs.pop('layers') - else: - layers = VersionAwareLayers() - if kwargs: - raise ValueError('Unknown argument(s): %s' % (kwargs,)) - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded.') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top`' - ' as true, `classes` should be 1000') - - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=299, - min_size=75, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) +@keras_export( + "keras.applications.inception_resnet_v2.InceptionResNetV2", + "keras.applications.InceptionResNetV2", +) +def InceptionResNetV2( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + """Instantiates the Inception-ResNet v2 architecture. + + Reference: + - [Inception-v4, Inception-ResNet and the Impact of + Residual Connections on Learning](https://arxiv.org/abs/1602.07261) + (AAAI 2017) + + This function returns a Keras image classification model, + optionally loaded with weights pre-trained on ImageNet. + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + Note: each Keras Application expects a specific kind of input preprocessing. + For InceptionResNetV2, call + `tf.keras.applications.inception_resnet_v2.preprocess_input` + on your inputs before passing them to the model. + `inception_resnet_v2.preprocess_input` + will scale input pixels between -1 and 1. + + Args: + include_top: whether to include the fully-connected + layer at the top of the network. + weights: one of `None` (random initialization), + 'imagenet' (pre-training on ImageNet), + or the path to the weights file to be loaded. + input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) + to use as image input for the model. + input_shape: optional shape tuple, only to be specified + if `include_top` is `False` (otherwise the input shape + has to be `(299, 299, 3)` (with `'channels_last'` data format) + or `(3, 299, 299)` (with `'channels_first'` data format). + It should have exactly 3 inputs channels, + and width and height should be no smaller than 75. + E.g. `(150, 150, 3)` would be one valid value. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model will be + the 4D tensor output of the last convolutional block. + - `'avg'` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `'max'` means that global max pooling will be applied. 
+ classes: optional number of classes to classify images + into, only to be specified if `include_top` is `True`, and + if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + **kwargs: For backwards compatibility only. + + Returns: + A `keras.Model` instance. + """ + global layers + if "layers" in kwargs: + layers = kwargs.pop("layers") else: - img_input = input_tensor - - # Stem block: 35 x 35 x 192 - x = conv2d_bn(img_input, 32, 3, strides=2, padding='valid') - x = conv2d_bn(x, 32, 3, padding='valid') - x = conv2d_bn(x, 64, 3) - x = layers.MaxPooling2D(3, strides=2)(x) - x = conv2d_bn(x, 80, 1, padding='valid') - x = conv2d_bn(x, 192, 3, padding='valid') - x = layers.MaxPooling2D(3, strides=2)(x) - - # Mixed 5b (Inception-A block): 35 x 35 x 320 - branch_0 = conv2d_bn(x, 96, 1) - branch_1 = conv2d_bn(x, 48, 1) - branch_1 = conv2d_bn(branch_1, 64, 5) - branch_2 = conv2d_bn(x, 64, 1) - branch_2 = conv2d_bn(branch_2, 96, 3) - branch_2 = conv2d_bn(branch_2, 96, 3) - branch_pool = layers.AveragePooling2D(3, strides=1, padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1) - branches = [branch_0, branch_1, branch_2, branch_pool] - channel_axis = 1 if backend.image_data_format() == 'channels_first' else 3 - x = layers.Concatenate(axis=channel_axis, name='mixed_5b')(branches) - - # 10x block35 (Inception-ResNet-A block): 35 x 35 x 320 - for block_idx in range(1, 11): - x = inception_resnet_block( - x, scale=0.17, block_type='block35', block_idx=block_idx) - - # Mixed 6a (Reduction-A block): 17 x 17 x 1088 - branch_0 = conv2d_bn(x, 384, 3, strides=2, padding='valid') - branch_1 = conv2d_bn(x, 256, 1) - branch_1 = conv2d_bn(branch_1, 256, 3) - branch_1 = conv2d_bn(branch_1, 384, 3, strides=2, padding='valid') - branch_pool = layers.MaxPooling2D(3, strides=2, padding='valid')(x) - branches = [branch_0, branch_1, branch_pool] - x = layers.Concatenate(axis=channel_axis, name='mixed_6a')(branches) - - # 20x block17 (Inception-ResNet-B block): 17 x 17 x 1088 - for block_idx in range(1, 21): - x = inception_resnet_block( - x, scale=0.1, block_type='block17', block_idx=block_idx) - - # Mixed 7a (Reduction-B block): 8 x 8 x 2080 - branch_0 = conv2d_bn(x, 256, 1) - branch_0 = conv2d_bn(branch_0, 384, 3, strides=2, padding='valid') - branch_1 = conv2d_bn(x, 256, 1) - branch_1 = conv2d_bn(branch_1, 288, 3, strides=2, padding='valid') - branch_2 = conv2d_bn(x, 256, 1) - branch_2 = conv2d_bn(branch_2, 288, 3) - branch_2 = conv2d_bn(branch_2, 320, 3, strides=2, padding='valid') - branch_pool = layers.MaxPooling2D(3, strides=2, padding='valid')(x) - branches = [branch_0, branch_1, branch_2, branch_pool] - x = layers.Concatenate(axis=channel_axis, name='mixed_7a')(branches) - - # 10x block8 (Inception-ResNet-C block): 8 x 8 x 2080 - for block_idx in range(1, 10): + layers = VersionAwareLayers() + if kwargs: + raise ValueError(f"Unknown argument(s): {kwargs}") + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." 
+ ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top`' + " as true, `classes` should be 1000" + ) + + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=299, + min_size=75, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) + else: + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + # Stem block: 35 x 35 x 192 + x = conv2d_bn(img_input, 32, 3, strides=2, padding="valid") + x = conv2d_bn(x, 32, 3, padding="valid") + x = conv2d_bn(x, 64, 3) + x = layers.MaxPooling2D(3, strides=2)(x) + x = conv2d_bn(x, 80, 1, padding="valid") + x = conv2d_bn(x, 192, 3, padding="valid") + x = layers.MaxPooling2D(3, strides=2)(x) + + # Mixed 5b (Inception-A block): 35 x 35 x 320 + branch_0 = conv2d_bn(x, 96, 1) + branch_1 = conv2d_bn(x, 48, 1) + branch_1 = conv2d_bn(branch_1, 64, 5) + branch_2 = conv2d_bn(x, 64, 1) + branch_2 = conv2d_bn(branch_2, 96, 3) + branch_2 = conv2d_bn(branch_2, 96, 3) + branch_pool = layers.AveragePooling2D(3, strides=1, padding="same")(x) + branch_pool = conv2d_bn(branch_pool, 64, 1) + branches = [branch_0, branch_1, branch_2, branch_pool] + channel_axis = 1 if backend.image_data_format() == "channels_first" else 3 + x = layers.Concatenate(axis=channel_axis, name="mixed_5b")(branches) + + # 10x block35 (Inception-ResNet-A block): 35 x 35 x 320 + for block_idx in range(1, 11): + x = inception_resnet_block( + x, scale=0.17, block_type="block35", block_idx=block_idx + ) + + # Mixed 6a (Reduction-A block): 17 x 17 x 1088 + branch_0 = conv2d_bn(x, 384, 3, strides=2, padding="valid") + branch_1 = conv2d_bn(x, 256, 1) + branch_1 = conv2d_bn(branch_1, 256, 3) + branch_1 = conv2d_bn(branch_1, 384, 3, strides=2, padding="valid") + branch_pool = layers.MaxPooling2D(3, strides=2, padding="valid")(x) + branches = [branch_0, branch_1, branch_pool] + x = layers.Concatenate(axis=channel_axis, name="mixed_6a")(branches) + + # 20x block17 (Inception-ResNet-B block): 17 x 17 x 1088 + for block_idx in range(1, 21): + x = inception_resnet_block( + x, scale=0.1, block_type="block17", block_idx=block_idx + ) + + # Mixed 7a (Reduction-B block): 8 x 8 x 2080 + branch_0 = conv2d_bn(x, 256, 1) + branch_0 = conv2d_bn(branch_0, 384, 3, strides=2, padding="valid") + branch_1 = conv2d_bn(x, 256, 1) + branch_1 = conv2d_bn(branch_1, 288, 3, strides=2, padding="valid") + branch_2 = conv2d_bn(x, 256, 1) + branch_2 = conv2d_bn(branch_2, 288, 3) + branch_2 = conv2d_bn(branch_2, 320, 3, strides=2, padding="valid") + branch_pool = layers.MaxPooling2D(3, strides=2, padding="valid")(x) + branches = [branch_0, branch_1, branch_2, branch_pool] + x = layers.Concatenate(axis=channel_axis, name="mixed_7a")(branches) + + # 10x block8 (Inception-ResNet-C block): 8 x 8 x 2080 + for block_idx in range(1, 10): + x = inception_resnet_block( + x, scale=0.2, block_type="block8", block_idx=block_idx + ) x = inception_resnet_block( - x, scale=0.2, block_type='block8', block_idx=block_idx) - x = inception_resnet_block( - x, scale=1., activation=None, block_type='block8', block_idx=10) - - # Final convolution block: 8 x 8 x 1536 - x = conv2d_bn(x, 1536, 1, name='conv_7b') - - if include_top: - # Classification block - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) 
- imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = training.Model(inputs, x, name='inception_resnet_v2') - - # Load weights. - if weights == 'imagenet': + x, scale=1.0, activation=None, block_type="block8", block_idx=10 + ) + + # Final convolution block: 8 x 8 x 1536 + x = conv2d_bn(x, 1536, 1, name="conv_7b") + if include_top: - fname = 'inception_resnet_v2_weights_tf_dim_ordering_tf_kernels.h5' - weights_path = data_utils.get_file( - fname, - BASE_WEIGHT_URL + fname, - cache_subdir='models', - file_hash='e693bd0210a403b3192acc6073ad2e96') + # Classification block + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + model = training.Model(inputs, x, name="inception_resnet_v2") + + # Load weights. + if weights == "imagenet": + if include_top: + fname = "inception_resnet_v2_weights_tf_dim_ordering_tf_kernels.h5" + weights_path = data_utils.get_file( + fname, + BASE_WEIGHT_URL + fname, + cache_subdir="models", + file_hash="e693bd0210a403b3192acc6073ad2e96", + ) + else: + fname = ( + "inception_resnet_v2_weights_" + "tf_dim_ordering_tf_kernels_notop.h5" + ) + weights_path = data_utils.get_file( + fname, + BASE_WEIGHT_URL + fname, + cache_subdir="models", + file_hash="d19885ff4a710c122648d3b5c3b684e4", + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model + + +def conv2d_bn( + x, + filters, + kernel_size, + strides=1, + padding="same", + activation="relu", + use_bias=False, + name=None, +): + """Utility function to apply conv + BN. + + Args: + x: input tensor. + filters: filters in `Conv2D`. + kernel_size: kernel size as in `Conv2D`. + strides: strides in `Conv2D`. + padding: padding mode in `Conv2D`. + activation: activation in `Conv2D`. + use_bias: whether to use a bias in `Conv2D`. + name: name of the ops; will become `name + '_ac'` for the activation + and `name + '_bn'` for the batch norm layer. + + Returns: + Output tensor after applying `Conv2D` and `BatchNormalization`. 
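# ---------------------------------------------------------------------------
# [Editor's note, not part of the patch] Why conv2d_bn defaults to
# use_bias=False and scale=False: BatchNormalization's beta already supplies
# a per-channel shift, and with a ReLU following, the gamma scale can be
# folded into the next layer. A hedged standalone sketch of the pattern.
from keras import layers

x = layers.Input(shape=(32, 32, 3))
y = layers.Conv2D(16, 3, use_bias=False)(x)    # no bias: BN's beta shifts
y = layers.BatchNormalization(scale=False)(y)  # no gamma: ReLU is scale-
y = layers.Activation("relu")(y)               # invariant up to folding
# ---------------------------------------------------------------------------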
+ """ + x = layers.Conv2D( + filters, + kernel_size, + strides=strides, + padding=padding, + use_bias=use_bias, + name=name, + )(x) + if not use_bias: + bn_axis = 1 if backend.image_data_format() == "channels_first" else 3 + bn_name = None if name is None else name + "_bn" + x = layers.BatchNormalization(axis=bn_axis, scale=False, name=bn_name)( + x + ) + if activation is not None: + ac_name = None if name is None else name + "_ac" + x = layers.Activation(activation, name=ac_name)(x) + return x + + +@keras.utils.register_keras_serializable() +class CustomScaleLayer(keras_layers.Layer): + def __init__(self, scale, **kwargs): + super().__init__(**kwargs) + self.scale = scale + + def get_config(self): + config = super().get_config() + config.update({"scale": self.scale}) + return config + + def call(self, inputs): + return inputs[0] + inputs[1] * self.scale + + +def inception_resnet_block(x, scale, block_type, block_idx, activation="relu"): + """Adds an Inception-ResNet block. + + This function builds 3 types of Inception-ResNet blocks mentioned + in the paper, controlled by the `block_type` argument (which is the + block name used in the official TF-slim implementation): + - Inception-ResNet-A: `block_type='block35'` + - Inception-ResNet-B: `block_type='block17'` + - Inception-ResNet-C: `block_type='block8'` + + Args: + x: input tensor. + scale: scaling factor to scale the residuals (i.e., the output of passing + `x` through an inception module) before adding them to the shortcut + branch. Let `r` be the output from the residual branch, the output of + this block will be `x + scale * r`. + block_type: `'block35'`, `'block17'` or `'block8'`, determines the network + structure in the residual branch. + block_idx: an `int` used for generating layer names. The Inception-ResNet + blocks are repeated many times in this network. We use `block_idx` to + identify each of the repetitions. For example, the first + Inception-ResNet-A block will have `block_type='block35', block_idx=0`, + and the layer names will have a common prefix `'block35_0'`. + activation: activation function to use at the end of the block (see + [activations](../activations.md)). When `activation=None`, no activation + is applied + (i.e., "linear" activation: `a(x) = x`). + + Returns: + Output tensor for the block. + + Raises: + ValueError: if `block_type` is not one of `'block35'`, + `'block17'` or `'block8'`. 
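# ---------------------------------------------------------------------------
# [Editor's note, not part of the patch] A numeric check of the residual
# rule `x + scale * r` that CustomScaleLayer (added above) implements; the
# patch swaps it in for the old non-serializable Lambda.
import numpy as np
import tensorflow as tf

scale = 0.17
shortcut = tf.ones((1, 2, 2, 4))        # stands in for x
residual = tf.fill((1, 2, 2, 4), 2.0)   # stands in for the branch output r
out = shortcut + residual * scale       # == CustomScaleLayer(scale)([x, r])
assert np.allclose(out.numpy(), 1.0 + 2.0 * scale)
# ---------------------------------------------------------------------------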
+ """ + if block_type == "block35": + branch_0 = conv2d_bn(x, 32, 1) + branch_1 = conv2d_bn(x, 32, 1) + branch_1 = conv2d_bn(branch_1, 32, 3) + branch_2 = conv2d_bn(x, 32, 1) + branch_2 = conv2d_bn(branch_2, 48, 3) + branch_2 = conv2d_bn(branch_2, 64, 3) + branches = [branch_0, branch_1, branch_2] + elif block_type == "block17": + branch_0 = conv2d_bn(x, 192, 1) + branch_1 = conv2d_bn(x, 128, 1) + branch_1 = conv2d_bn(branch_1, 160, [1, 7]) + branch_1 = conv2d_bn(branch_1, 192, [7, 1]) + branches = [branch_0, branch_1] + elif block_type == "block8": + branch_0 = conv2d_bn(x, 192, 1) + branch_1 = conv2d_bn(x, 192, 1) + branch_1 = conv2d_bn(branch_1, 224, [1, 3]) + branch_1 = conv2d_bn(branch_1, 256, [3, 1]) + branches = [branch_0, branch_1] else: - fname = ('inception_resnet_v2_weights_' - 'tf_dim_ordering_tf_kernels_notop.h5') - weights_path = data_utils.get_file( - fname, - BASE_WEIGHT_URL + fname, - cache_subdir='models', - file_hash='d19885ff4a710c122648d3b5c3b684e4') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model - - -def conv2d_bn(x, - filters, - kernel_size, - strides=1, - padding='same', - activation='relu', - use_bias=False, - name=None): - """Utility function to apply conv + BN. - - Args: - x: input tensor. - filters: filters in `Conv2D`. - kernel_size: kernel size as in `Conv2D`. - strides: strides in `Conv2D`. - padding: padding mode in `Conv2D`. - activation: activation in `Conv2D`. - use_bias: whether to use a bias in `Conv2D`. - name: name of the ops; will become `name + '_ac'` for the activation - and `name + '_bn'` for the batch norm layer. - - Returns: - Output tensor after applying `Conv2D` and `BatchNormalization`. - """ - x = layers.Conv2D( - filters, - kernel_size, - strides=strides, - padding=padding, - use_bias=use_bias, - name=name)( - x) - if not use_bias: - bn_axis = 1 if backend.image_data_format() == 'channels_first' else 3 - bn_name = None if name is None else name + '_bn' - x = layers.BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x) - if activation is not None: - ac_name = None if name is None else name + '_ac' - x = layers.Activation(activation, name=ac_name)(x) - return x - - -def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'): - """Adds an Inception-ResNet block. - - This function builds 3 types of Inception-ResNet blocks mentioned - in the paper, controlled by the `block_type` argument (which is the - block name used in the official TF-slim implementation): - - Inception-ResNet-A: `block_type='block35'` - - Inception-ResNet-B: `block_type='block17'` - - Inception-ResNet-C: `block_type='block8'` - - Args: - x: input tensor. - scale: scaling factor to scale the residuals (i.e., the output of passing - `x` through an inception module) before adding them to the shortcut - branch. Let `r` be the output from the residual branch, the output of this - block will be `x + scale * r`. - block_type: `'block35'`, `'block17'` or `'block8'`, determines the network - structure in the residual branch. - block_idx: an `int` used for generating layer names. The Inception-ResNet - blocks are repeated many times in this network. We use `block_idx` to - identify each of the repetitions. For example, the first - Inception-ResNet-A block will have `block_type='block35', block_idx=0`, - and the layer names will have a common prefix `'block35_0'`. - activation: activation function to use at the end of the block (see - [activations](../activations.md)). 
When `activation=None`, no activation - is applied - (i.e., "linear" activation: `a(x) = x`). - - Returns: - Output tensor for the block. - - Raises: - ValueError: if `block_type` is not one of `'block35'`, - `'block17'` or `'block8'`. - """ - if block_type == 'block35': - branch_0 = conv2d_bn(x, 32, 1) - branch_1 = conv2d_bn(x, 32, 1) - branch_1 = conv2d_bn(branch_1, 32, 3) - branch_2 = conv2d_bn(x, 32, 1) - branch_2 = conv2d_bn(branch_2, 48, 3) - branch_2 = conv2d_bn(branch_2, 64, 3) - branches = [branch_0, branch_1, branch_2] - elif block_type == 'block17': - branch_0 = conv2d_bn(x, 192, 1) - branch_1 = conv2d_bn(x, 128, 1) - branch_1 = conv2d_bn(branch_1, 160, [1, 7]) - branch_1 = conv2d_bn(branch_1, 192, [7, 1]) - branches = [branch_0, branch_1] - elif block_type == 'block8': - branch_0 = conv2d_bn(x, 192, 1) - branch_1 = conv2d_bn(x, 192, 1) - branch_1 = conv2d_bn(branch_1, 224, [1, 3]) - branch_1 = conv2d_bn(branch_1, 256, [3, 1]) - branches = [branch_0, branch_1] - else: - raise ValueError('Unknown Inception-ResNet block type. ' - 'Expects "block35", "block17" or "block8", ' - 'but got: ' + str(block_type)) - - block_name = block_type + '_' + str(block_idx) - channel_axis = 1 if backend.image_data_format() == 'channels_first' else 3 - mixed = layers.Concatenate( - axis=channel_axis, name=block_name + '_mixed')( - branches) - up = conv2d_bn( - mixed, - backend.int_shape(x)[channel_axis], - 1, - activation=None, - use_bias=True, - name=block_name + '_conv') - - x = layers.Lambda( - lambda inputs, scale: inputs[0] + inputs[1] * scale, - output_shape=backend.int_shape(x)[1:], - arguments={'scale': scale}, - name=block_name)([x, up]) - if activation is not None: - x = layers.Activation(activation, name=block_name + '_ac')(x) - return x - - -@keras_export('keras.applications.inception_resnet_v2.preprocess_input') + raise ValueError( + "Unknown Inception-ResNet block type. 
" + 'Expects "block35", "block17" or "block8", ' + "but got: " + str(block_type) + ) + + block_name = block_type + "_" + str(block_idx) + channel_axis = 1 if backend.image_data_format() == "channels_first" else 3 + mixed = layers.Concatenate(axis=channel_axis, name=block_name + "_mixed")( + branches + ) + up = conv2d_bn( + mixed, + backend.int_shape(x)[channel_axis], + 1, + activation=None, + use_bias=True, + name=block_name + "_conv", + ) + + x = CustomScaleLayer(scale)([x, up]) + if activation is not None: + x = layers.Activation(activation, name=block_name + "_ac")(x) + return x + + +@keras_export("keras.applications.inception_resnet_v2.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="tf" + ) -@keras_export('keras.applications.inception_resnet_v2.decode_predictions') +@keras_export("keras.applications.inception_resnet_v2.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/inception_v3.py b/keras/applications/inception_v3.py index bd12b8f75fb6..d3ab844e16a9 100644 --- a/keras/applications/inception_v3.py +++ b/keras/applications/inception_v3.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """Inception V3 model for Keras. Reference: @@ -28,399 +28,436 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export WEIGHTS_PATH = ( - 'https://storage.googleapis.com/tensorflow/keras-applications/' - 'inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5') + "https://storage.googleapis.com/tensorflow/keras-applications/" + "inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5" +) WEIGHTS_PATH_NO_TOP = ( - 'https://storage.googleapis.com/tensorflow/keras-applications/' - 'inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5') + "https://storage.googleapis.com/tensorflow/keras-applications/" + "inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5" +) layers = VersionAwareLayers() -@keras_export('keras.applications.inception_v3.InceptionV3', - 'keras.applications.InceptionV3') +@keras_export( + "keras.applications.inception_v3.InceptionV3", + "keras.applications.InceptionV3", +) def InceptionV3( include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the Inception v3 architecture. - - Reference: - - [Rethinking the Inception Architecture for Computer Vision]( - http://arxiv.org/abs/1512.00567) (CVPR 2016) + classifier_activation="softmax", +): + """Instantiates the Inception v3 architecture. 
+ + Reference: + - [Rethinking the Inception Architecture for Computer Vision]( + http://arxiv.org/abs/1512.00567) (CVPR 2016) + + This function returns a Keras image classification model, + optionally loaded with weights pre-trained on ImageNet. + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + Note: each Keras Application expects a specific kind of input preprocessing. + For `InceptionV3`, call + `tf.keras.applications.inception_v3.preprocess_input` on your inputs before + passing them to the model. `inception_v3.preprocess_input` will scale input + pixels between -1 and 1. + + Args: + include_top: Boolean, whether to include the fully-connected + layer at the top, as the last layer of the network. Defaults to `True`. + weights: One of `None` (random initialization), + `imagenet` (pre-training on ImageNet), + or the path to the weights file to be loaded. Defaults to `imagenet`. + input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) + to use as image input for the model. `input_tensor` is useful for + sharing inputs between multiple different networks. Defaults to `None`. + input_shape: Optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(299, 299, 3)` (with `channels_last` data format) + or `(3, 299, 299)` (with `channels_first` data format). + It should have exactly 3 inputs channels, + and width and height should be no smaller than 75. + E.g. `(150, 150, 3)` would be one valid value. + `input_shape` will be ignored if the `input_tensor` is provided. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` (default) means that the output of the model will be + the 4D tensor output of the last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. Defaults to 1000. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + + Returns: + A `keras.Model` instance. 
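The reflowed docstring above leaves the documented contract unchanged: 299x299 default input, sides no smaller than 75 pixels, and inputs scaled to [-1, 1] by `preprocess_input` (`mode="tf"`, per the export near the end of this file). A minimal usage sketch against the public `tf.keras.applications` entry points:

import numpy as np
import tensorflow as tf

model = tf.keras.applications.InceptionV3(weights="imagenet")
image = np.random.uniform(0, 255, (1, 299, 299, 3)).astype("float32")
x = tf.keras.applications.inception_v3.preprocess_input(image)  # -> [-1, 1]
preds = model.predict(x)
print(tf.keras.applications.inception_v3.decode_predictions(preds, top=3))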
+ """ + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded; " + f"Received: weights={weights}" + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top` ' + "as true, `classes` should be 1000; " + f"Received classes={classes}" + ) + + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=299, + min_size=75, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) + else: + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor - This function returns a Keras image classification model, - optionally loaded with weights pre-trained on ImageNet. - - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - Note: each Keras Application expects a specific kind of input preprocessing. - For `InceptionV3`, call `tf.keras.applications.inception_v3.preprocess_input` - on your inputs before passing them to the model. - `inception_v3.preprocess_input` will scale input pixels between -1 and 1. - - Args: - include_top: Boolean, whether to include the fully-connected - layer at the top, as the last layer of the network. Default to `True`. - weights: One of `None` (random initialization), - `imagenet` (pre-training on ImageNet), - or the path to the weights file to be loaded. Default to `imagenet`. - input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) - to use as image input for the model. `input_tensor` is useful for sharing - inputs between multiple different networks. Default to None. - input_shape: Optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(299, 299, 3)` (with `channels_last` data format) - or `(3, 299, 299)` (with `channels_first` data format). - It should have exactly 3 inputs channels, - and width and height should be no smaller than 75. - E.g. `(150, 150, 3)` would be one valid value. - `input_shape` will be ignored if the `input_tensor` is provided. - pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` (default) means that the output of the model will be - the 4D tensor output of the last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a 2D tensor. - - `max` means that global max pooling will be applied. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. Default to 1000. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. 
- When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - - Returns: - A `keras.Model` instance. - """ - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded; ' - f'Received: weights={weights}') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' - 'as true, `classes` should be 1000; ' - f'Received classes={classes}') - - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=299, - min_size=75, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) + if backend.image_data_format() == "channels_first": + channel_axis = 1 else: - img_input = input_tensor - - if backend.image_data_format() == 'channels_first': - channel_axis = 1 - else: - channel_axis = 3 - - x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding='valid') - x = conv2d_bn(x, 32, 3, 3, padding='valid') - x = conv2d_bn(x, 64, 3, 3) - x = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) - - x = conv2d_bn(x, 80, 1, 1, padding='valid') - x = conv2d_bn(x, 192, 3, 3, padding='valid') - x = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) - - # mixed 0: 35 x 35 x 256 - branch1x1 = conv2d_bn(x, 64, 1, 1) - - branch5x5 = conv2d_bn(x, 48, 1, 1) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) - - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - - branch_pool = layers.AveragePooling2D( - (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 32, 1, 1) - x = layers.concatenate([branch1x1, branch5x5, branch3x3dbl, branch_pool], - axis=channel_axis, - name='mixed0') - - # mixed 1: 35 x 35 x 288 - branch1x1 = conv2d_bn(x, 64, 1, 1) - - branch5x5 = conv2d_bn(x, 48, 1, 1) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) - - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - - branch_pool = layers.AveragePooling2D( - (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1, 1) - x = layers.concatenate([branch1x1, branch5x5, branch3x3dbl, branch_pool], - axis=channel_axis, - name='mixed1') - - # mixed 2: 35 x 35 x 288 - branch1x1 = conv2d_bn(x, 64, 1, 1) - - branch5x5 = conv2d_bn(x, 48, 1, 1) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) - - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - - branch_pool = layers.AveragePooling2D( - (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1, 1) - x = layers.concatenate([branch1x1, branch5x5, branch3x3dbl, branch_pool], - axis=channel_axis, - name='mixed2') - - # mixed 3: 17 x 17 x 768 - branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding='valid') - - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - branch3x3dbl = conv2d_bn( - branch3x3dbl, 96, 3, 3, strides=(2, 2), 
padding='valid') - - branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) - x = layers.concatenate([branch3x3, branch3x3dbl, branch_pool], - axis=channel_axis, - name='mixed3') - - # mixed 4: 17 x 17 x 768 - branch1x1 = conv2d_bn(x, 192, 1, 1) - - branch7x7 = conv2d_bn(x, 128, 1, 1) - branch7x7 = conv2d_bn(branch7x7, 128, 1, 7) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) - - branch7x7dbl = conv2d_bn(x, 128, 1, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) - - branch_pool = layers.AveragePooling2D( - (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1) - x = layers.concatenate([branch1x1, branch7x7, branch7x7dbl, branch_pool], - axis=channel_axis, - name='mixed4') - - # mixed 5, 6: 17 x 17 x 768 - for i in range(2): + channel_axis = 3 + + x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding="valid") + x = conv2d_bn(x, 32, 3, 3, padding="valid") + x = conv2d_bn(x, 64, 3, 3) + x = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) + + x = conv2d_bn(x, 80, 1, 1, padding="valid") + x = conv2d_bn(x, 192, 3, 3, padding="valid") + x = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) + + # mixed 0: 35 x 35 x 256 + branch1x1 = conv2d_bn(x, 64, 1, 1) + + branch5x5 = conv2d_bn(x, 48, 1, 1) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) + + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + + branch_pool = layers.AveragePooling2D( + (3, 3), strides=(1, 1), padding="same" + )(x) + branch_pool = conv2d_bn(branch_pool, 32, 1, 1) + x = layers.concatenate( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], + axis=channel_axis, + name="mixed0", + ) + + # mixed 1: 35 x 35 x 288 + branch1x1 = conv2d_bn(x, 64, 1, 1) + + branch5x5 = conv2d_bn(x, 48, 1, 1) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) + + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + + branch_pool = layers.AveragePooling2D( + (3, 3), strides=(1, 1), padding="same" + )(x) + branch_pool = conv2d_bn(branch_pool, 64, 1, 1) + x = layers.concatenate( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], + axis=channel_axis, + name="mixed1", + ) + + # mixed 2: 35 x 35 x 288 + branch1x1 = conv2d_bn(x, 64, 1, 1) + + branch5x5 = conv2d_bn(x, 48, 1, 1) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) + + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + + branch_pool = layers.AveragePooling2D( + (3, 3), strides=(1, 1), padding="same" + )(x) + branch_pool = conv2d_bn(branch_pool, 64, 1, 1) + x = layers.concatenate( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], + axis=channel_axis, + name="mixed2", + ) + + # mixed 3: 17 x 17 x 768 + branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding="valid") + + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn( + branch3x3dbl, 96, 3, 3, strides=(2, 2), padding="valid" + ) + + branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) + x = layers.concatenate( + [branch3x3, branch3x3dbl, branch_pool], axis=channel_axis, name="mixed3" + ) + + # mixed 4: 17 x 17 x 768 branch1x1 = conv2d_bn(x, 192, 1, 1) - branch7x7 = conv2d_bn(x, 160, 1, 1) - branch7x7 = 
conv2d_bn(branch7x7, 160, 1, 7) + branch7x7 = conv2d_bn(x, 128, 1, 1) + branch7x7 = conv2d_bn(branch7x7, 128, 1, 7) branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) - branch7x7dbl = conv2d_bn(x, 160, 1, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) + branch7x7dbl = conv2d_bn(x, 128, 1, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) - branch_pool = layers.AveragePooling2D((3, 3), - strides=(1, 1), - padding='same')( - x) + branch_pool = layers.AveragePooling2D( + (3, 3), strides=(1, 1), padding="same" + )(x) branch_pool = conv2d_bn(branch_pool, 192, 1, 1) - x = layers.concatenate([branch1x1, branch7x7, branch7x7dbl, branch_pool], - axis=channel_axis, - name='mixed' + str(5 + i)) - - # mixed 7: 17 x 17 x 768 - branch1x1 = conv2d_bn(x, 192, 1, 1) - - branch7x7 = conv2d_bn(x, 192, 1, 1) - branch7x7 = conv2d_bn(branch7x7, 192, 1, 7) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) - - branch7x7dbl = conv2d_bn(x, 192, 1, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) - - branch_pool = layers.AveragePooling2D( - (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1) - x = layers.concatenate([branch1x1, branch7x7, branch7x7dbl, branch_pool], - axis=channel_axis, - name='mixed7') - - # mixed 8: 8 x 8 x 1280 - branch3x3 = conv2d_bn(x, 192, 1, 1) - branch3x3 = conv2d_bn(branch3x3, 320, 3, 3, strides=(2, 2), padding='valid') - - branch7x7x3 = conv2d_bn(x, 192, 1, 1) - branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7) - branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1) - branch7x7x3 = conv2d_bn( - branch7x7x3, 192, 3, 3, strides=(2, 2), padding='valid') - - branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) - x = layers.concatenate([branch3x3, branch7x7x3, branch_pool], - axis=channel_axis, - name='mixed8') - - # mixed 9: 8 x 8 x 2048 - for i in range(2): - branch1x1 = conv2d_bn(x, 320, 1, 1) - - branch3x3 = conv2d_bn(x, 384, 1, 1) - branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3) - branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1) - branch3x3 = layers.concatenate([branch3x3_1, branch3x3_2], - axis=channel_axis, - name='mixed9_' + str(i)) - - branch3x3dbl = conv2d_bn(x, 448, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3) - branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3) - branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1) - branch3x3dbl = layers.concatenate([branch3x3dbl_1, branch3x3dbl_2], - axis=channel_axis) - - branch_pool = layers.AveragePooling2D((3, 3), - strides=(1, 1), - padding='same')( - x) + x = layers.concatenate( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], + axis=channel_axis, + name="mixed4", + ) + + # mixed 5, 6: 17 x 17 x 768 + for i in range(2): + branch1x1 = conv2d_bn(x, 192, 1, 1) + + branch7x7 = conv2d_bn(x, 160, 1, 1) + branch7x7 = conv2d_bn(branch7x7, 160, 1, 7) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) + + branch7x7dbl = conv2d_bn(x, 160, 1, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) + + branch_pool = layers.AveragePooling2D( 
+ (3, 3), strides=(1, 1), padding="same" + )(x) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1) + x = layers.concatenate( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], + axis=channel_axis, + name="mixed" + str(5 + i), + ) + + # mixed 7: 17 x 17 x 768 + branch1x1 = conv2d_bn(x, 192, 1, 1) + + branch7x7 = conv2d_bn(x, 192, 1, 1) + branch7x7 = conv2d_bn(branch7x7, 192, 1, 7) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) + + branch7x7dbl = conv2d_bn(x, 192, 1, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) + + branch_pool = layers.AveragePooling2D( + (3, 3), strides=(1, 1), padding="same" + )(x) branch_pool = conv2d_bn(branch_pool, 192, 1, 1) - x = layers.concatenate([branch1x1, branch3x3, branch3x3dbl, branch_pool], - axis=channel_axis, - name='mixed' + str(9 + i)) - if include_top: - # Classification block - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - # Create model. - model = training.Model(inputs, x, name='inception_v3') - - # Load weights. - if weights == 'imagenet': + x = layers.concatenate( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], + axis=channel_axis, + name="mixed7", + ) + + # mixed 8: 8 x 8 x 1280 + branch3x3 = conv2d_bn(x, 192, 1, 1) + branch3x3 = conv2d_bn(branch3x3, 320, 3, 3, strides=(2, 2), padding="valid") + + branch7x7x3 = conv2d_bn(x, 192, 1, 1) + branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7) + branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1) + branch7x7x3 = conv2d_bn( + branch7x7x3, 192, 3, 3, strides=(2, 2), padding="valid" + ) + + branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2))(x) + x = layers.concatenate( + [branch3x3, branch7x7x3, branch_pool], axis=channel_axis, name="mixed8" + ) + + # mixed 9: 8 x 8 x 2048 + for i in range(2): + branch1x1 = conv2d_bn(x, 320, 1, 1) + + branch3x3 = conv2d_bn(x, 384, 1, 1) + branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3) + branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1) + branch3x3 = layers.concatenate( + [branch3x3_1, branch3x3_2], + axis=channel_axis, + name="mixed9_" + str(i), + ) + + branch3x3dbl = conv2d_bn(x, 448, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3) + branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3) + branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1) + branch3x3dbl = layers.concatenate( + [branch3x3dbl_1, branch3x3dbl_2], axis=channel_axis + ) + + branch_pool = layers.AveragePooling2D( + (3, 3), strides=(1, 1), padding="same" + )(x) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1) + x = layers.concatenate( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], + axis=channel_axis, + name="mixed" + str(9 + i), + ) if include_top: - weights_path = data_utils.get_file( - 'inception_v3_weights_tf_dim_ordering_tf_kernels.h5', - WEIGHTS_PATH, - cache_subdir='models', - file_hash='9a0d58056eeedaa3f26cb7ebd46da564') + # Classification block + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + 
imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + # Create model. + model = training.Model(inputs, x, name="inception_v3") + + # Load weights. + if weights == "imagenet": + if include_top: + weights_path = data_utils.get_file( + "inception_v3_weights_tf_dim_ordering_tf_kernels.h5", + WEIGHTS_PATH, + cache_subdir="models", + file_hash="9a0d58056eeedaa3f26cb7ebd46da564", + ) + else: + weights_path = data_utils.get_file( + "inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5", + WEIGHTS_PATH_NO_TOP, + cache_subdir="models", + file_hash="bcbd6486424b2319ff4ef7d526e38f63", + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model + + +def conv2d_bn( + x, filters, num_row, num_col, padding="same", strides=(1, 1), name=None +): + """Utility function to apply conv + BN. + + Args: + x: input tensor. + filters: filters in `Conv2D`. + num_row: height of the convolution kernel. + num_col: width of the convolution kernel. + padding: padding mode in `Conv2D`. + strides: strides in `Conv2D`. + name: name of the ops; will become `name + '_conv'` + for the convolution and `name + '_bn'` for the + batch norm layer. + + Returns: + Output tensor after applying `Conv2D` and `BatchNormalization`. + """ + if name is not None: + bn_name = name + "_bn" + conv_name = name + "_conv" + else: + bn_name = None + conv_name = None + if backend.image_data_format() == "channels_first": + bn_axis = 1 else: - weights_path = data_utils.get_file( - 'inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5', - WEIGHTS_PATH_NO_TOP, - cache_subdir='models', - file_hash='bcbd6486424b2319ff4ef7d526e38f63') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model - - -def conv2d_bn(x, - filters, - num_row, - num_col, - padding='same', - strides=(1, 1), - name=None): - """Utility function to apply conv + BN. - - Args: - x: input tensor. - filters: filters in `Conv2D`. - num_row: height of the convolution kernel. - num_col: width of the convolution kernel. - padding: padding mode in `Conv2D`. - strides: strides in `Conv2D`. - name: name of the ops; will become `name + '_conv'` - for the convolution and `name + '_bn'` for the - batch norm layer. - - Returns: - Output tensor after applying `Conv2D` and `BatchNormalization`. 
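The `conv2d_bn` helper being reformatted below keeps its naming scheme intact: `name` yields `name + "_conv"` for the convolution, `name + "_bn"` for the batch norm, and the bare `name` goes to the ReLU activation. A standalone restatement for illustration, assuming `channels_last` (so `bn_axis=3`):

import tensorflow as tf
from tensorflow.keras import layers

def conv2d_bn(x, filters, num_row, num_col, name=None):
    # mirrors the helper in this hunk; channels_last assumed
    conv_name = None if name is None else name + "_conv"
    bn_name = None if name is None else name + "_bn"
    x = layers.Conv2D(filters, (num_row, num_col), padding="same",
                      use_bias=False, name=conv_name)(x)
    x = layers.BatchNormalization(axis=3, scale=False, name=bn_name)(x)
    return layers.Activation("relu", name=name)(x)

inp = layers.Input((32, 32, 3))
model = tf.keras.Model(inp, conv2d_bn(inp, 8, 3, 3, name="demo"))
print([layer.name for layer in model.layers])
# [..., 'demo_conv', 'demo_bn', 'demo']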
- """ - if name is not None: - bn_name = name + '_bn' - conv_name = name + '_conv' - else: - bn_name = None - conv_name = None - if backend.image_data_format() == 'channels_first': - bn_axis = 1 - else: - bn_axis = 3 - x = layers.Conv2D( - filters, (num_row, num_col), - strides=strides, - padding=padding, - use_bias=False, - name=conv_name)( - x) - x = layers.BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x) - x = layers.Activation('relu', name=name)(x) - return x - - -@keras_export('keras.applications.inception_v3.preprocess_input') + bn_axis = 3 + x = layers.Conv2D( + filters, + (num_row, num_col), + strides=strides, + padding=padding, + use_bias=False, + name=conv_name, + )(x) + x = layers.BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x) + x = layers.Activation("relu", name=name)(x) + return x + + +@keras_export("keras.applications.inception_v3.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="tf" + ) -@keras_export('keras.applications.inception_v3.decode_predictions') +@keras_export("keras.applications.inception_v3.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/mobilenet.py b/keras/applications/mobilenet.py index beaf22b18531..e3a0cdd09e18 100644 --- a/keras/applications/mobilenet.py +++ b/keras/applications/mobilenet.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """MobileNet v1 models for Keras. MobileNet is a general architecture and can be used for multiple use cases. 
@@ -38,23 +38,22 @@ on size 224 x 224: ---------------------------------------------------------------------------- Width Multiplier (alpha) | ImageNet Acc | Multiply-Adds (M) | Params (M) ----------------------------------------------------------------------------- +-------------------------|---------------|-------------------|-------------- | 1.0 MobileNet-224 | 70.6 % | 529 | 4.2 | | 0.75 MobileNet-224 | 68.4 % | 325 | 2.6 | | 0.50 MobileNet-224 | 63.7 % | 149 | 1.3 | | 0.25 MobileNet-224 | 50.6 % | 41 | 0.5 | ----------------------------------------------------------------------------- The following table describes the performance of the 100 % MobileNet on various input sizes: ------------------------------------------------------------------------ - Resolution | ImageNet Acc | Multiply-Adds (M) | Params (M) ------------------------------------------------------------------------- +Resolution | ImageNet Acc | Multiply-Adds (M) | Params (M) +----------------------|---------------|-------------------|---------------- | 1.0 MobileNet-224 | 70.6 % | 569 | 4.2 | | 1.0 MobileNet-192 | 69.1 % | 418 | 4.2 | | 1.0 MobileNet-160 | 67.2 % | 290 | 4.2 | | 1.0 MobileNet-128 | 64.4 % | 186 | 4.2 | ------------------------------------------------------------------------- + Reference: - [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications]( @@ -69,388 +68,422 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/mobilenet/') +BASE_WEIGHT_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/" +) layers = None -@keras_export('keras.applications.mobilenet.MobileNet', - 'keras.applications.MobileNet') -def MobileNet(input_shape=None, - alpha=1.0, - depth_multiplier=1, - dropout=1e-3, - include_top=True, - weights='imagenet', - input_tensor=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - """Instantiates the MobileNet architecture. - - Reference: - - [MobileNets: Efficient Convolutional Neural Networks - for Mobile Vision Applications]( - https://arxiv.org/abs/1704.04861) - - This function returns a Keras image classification model, - optionally loaded with weights pre-trained on ImageNet. - - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - Note: each Keras Application expects a specific kind of input preprocessing. - For MobileNet, call `tf.keras.applications.mobilenet.preprocess_input` - on your inputs before passing them to the model. - `mobilenet.preprocess_input` will scale input pixels between -1 and 1. - - Args: - input_shape: Optional shape tuple, only to be specified if `include_top` - is False (otherwise the input shape has to be `(224, 224, 3)` (with - `channels_last` data format) or (3, 224, 224) (with `channels_first` - data format). It should have exactly 3 inputs channels, and width and - height should be no smaller than 32. E.g. `(200, 200, 3)` would be one - valid value. Default to `None`. 
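The width multiplier `alpha` from the tables above scales every layer's filter count; in the code it is applied as `int(filters * alpha)` inside `_conv_block` and `_depthwise_conv_block`. A quick illustration using the per-stage filter counts this file wires up:

base_filters = [32, 64, 128, 128, 256, 256, 512, 1024]
for alpha in (0.25, 0.50, 0.75, 1.0):
    print(alpha, [int(f * alpha) for f in base_filters])
# 0.25 -> [8, 16, 32, 32, 64, 64, 128, 256]; 1.0 leaves them unchanged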
- `input_shape` will be ignored if the `input_tensor` is provided. - alpha: Controls the width of the network. This is known as the width - multiplier in the MobileNet paper. - If `alpha` < 1.0, proportionally - decreases the number of filters in each layer. - If `alpha` > 1.0, - proportionally increases the number of filters in each layer. - If - `alpha` = 1, default number of filters from the paper are used at each - layer. Default to 1.0. - depth_multiplier: Depth multiplier for depthwise convolution. This is - called the resolution multiplier in the MobileNet paper. Default to 1.0. - dropout: Dropout rate. Default to 0.001. - include_top: Boolean, whether to include the fully-connected layer at the - top of the network. Default to `True`. - weights: One of `None` (random initialization), 'imagenet' (pre-training - on ImageNet), or the path to the weights file to be loaded. Default to - `imagenet`. - input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) to - use as image input for the model. `input_tensor` is useful for sharing - inputs between multiple different networks. Default to None. - pooling: Optional pooling mode for feature extraction when `include_top` - is `False`. - - `None` (default) means that the output of the model will be - the 4D tensor output of the last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a 2D tensor. - - `max` means that global max pooling will be applied. - classes: Optional number of classes to classify images into, only to be - specified if `include_top` is True, and if no `weights` argument is - specified. Defaults to 1000. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - **kwargs: For backwards compatibility only. - Returns: - A `keras.Model` instance. - """ - global layers - if 'layers' in kwargs: - layers = kwargs.pop('layers') - else: - layers = VersionAwareLayers() - if kwargs: - raise ValueError(f'Unknown argument(s): {(kwargs,)}') - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded. ' - f'Received weights={weights}') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' - 'as true, `classes` should be 1000. ' - f'Received classes={classes}') - - # Determine proper input shape and default size. - if input_shape is None: - default_size = 224 - else: - if backend.image_data_format() == 'channels_first': - rows = input_shape[1] - cols = input_shape[2] +@keras_export( + "keras.applications.mobilenet.MobileNet", "keras.applications.MobileNet" +) +def MobileNet( + input_shape=None, + alpha=1.0, + depth_multiplier=1, + dropout=1e-3, + include_top=True, + weights="imagenet", + input_tensor=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + """Instantiates the MobileNet architecture. 
+ + Reference: + - [MobileNets: Efficient Convolutional Neural Networks + for Mobile Vision Applications]( + https://arxiv.org/abs/1704.04861) + + This function returns a Keras image classification model, + optionally loaded with weights pre-trained on ImageNet. + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + Note: each Keras Application expects a specific kind of input preprocessing. + For MobileNet, call `tf.keras.applications.mobilenet.preprocess_input` + on your inputs before passing them to the model. + `mobilenet.preprocess_input` will scale input pixels between -1 and 1. + + Args: + input_shape: Optional shape tuple, only to be specified if `include_top` + is False (otherwise the input shape has to be `(224, 224, 3)` (with + `channels_last` data format) or (3, 224, 224) (with `channels_first` + data format). It should have exactly 3 inputs channels, and width and + height should be no smaller than 32. E.g. `(200, 200, 3)` would be one + valid value. Defaults to `None`. + `input_shape` will be ignored if the `input_tensor` is provided. + alpha: Controls the width of the network. This is known as the width + multiplier in the MobileNet paper. - If `alpha` < 1.0, proportionally + decreases the number of filters in each layer. - If `alpha` > 1.0, + proportionally increases the number of filters in each layer. - If + `alpha` = 1, default number of filters from the paper are used at each + layer. Defaults to `1.0`. + depth_multiplier: Depth multiplier for depthwise convolution. This is + called the resolution multiplier in the MobileNet paper. + Defaults to `1.0`. + dropout: Dropout rate. Defaults to `0.001`. + include_top: Boolean, whether to include the fully-connected layer at the + top of the network. Defaults to `True`. + weights: One of `None` (random initialization), 'imagenet' (pre-training + on ImageNet), or the path to the weights file to be loaded. Defaults to + `imagenet`. + input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) to + use as image input for the model. `input_tensor` is useful for sharing + inputs between multiple different networks. Defaults to `None`. + pooling: Optional pooling mode for feature extraction when `include_top` + is `False`. + - `None` (default) means that the output of the model will be + the 4D tensor output of the last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will be applied. + classes: Optional number of classes to classify images into, only to be + specified if `include_top` is True, and if no `weights` argument is + specified. Defaults to `1000`. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + **kwargs: For backwards compatibility only. + Returns: + A `keras.Model` instance. 
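As the checks further down enforce, pretrained ImageNet weights exist only for `alpha` in {0.25, 0.50, 0.75, 1.0} and square inputs of 128, 160, 192 or 224 pixels; anything else falls back to the 224x224 checkpoint with a warning. A minimal usage sketch:

import tensorflow as tf

# Half-width MobileNet as a feature extractor.
model = tf.keras.applications.MobileNet(
    input_shape=(160, 160, 3), alpha=0.5,
    include_top=False, pooling="avg", weights="imagenet")
print(model.output_shape)  # (None, 512): the final 1024 filters scaled by alpha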
+ """ + global layers + if "layers" in kwargs: + layers = kwargs.pop("layers") else: - rows = input_shape[0] - cols = input_shape[1] - - if rows == cols and rows in [128, 160, 192, 224]: - default_size = rows + layers = VersionAwareLayers() + if kwargs: + raise ValueError(f"Unknown argument(s): {(kwargs,)}") + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded. " + f"Received weights={weights}" + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top` ' + "as true, `classes` should be 1000. " + f"Received classes={classes}" + ) + + # Determine proper input shape and default size. + if input_shape is None: + default_size = 224 else: - default_size = 224 - - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=default_size, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if backend.image_data_format() == 'channels_last': - row_axis, col_axis = (0, 1) - else: - row_axis, col_axis = (1, 2) - rows = input_shape[row_axis] - cols = input_shape[col_axis] - - if weights == 'imagenet': - if depth_multiplier != 1: - raise ValueError('If imagenet weights are being loaded, ' - 'depth multiplier must be 1. ' - f'Received depth_multiplier={depth_multiplier}') - - if alpha not in [0.25, 0.50, 0.75, 1.0]: - raise ValueError('If imagenet weights are being loaded, ' - 'alpha can be one of' - '`0.25`, `0.50`, `0.75` or `1.0` only. ' - f'Received alpha={alpha}') - - if rows != cols or rows not in [128, 160, 192, 224]: - rows = 224 - logging.warning('`input_shape` is undefined or non-square, ' - 'or `rows` is not in [128, 160, 192, 224]. 
' - 'Weights for input shape (224, 224) will be ' - 'loaded as the default.') - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) + if backend.image_data_format() == "channels_first": + rows = input_shape[1] + cols = input_shape[2] + else: + rows = input_shape[0] + cols = input_shape[1] + + if rows == cols and rows in [128, 160, 192, 224]: + default_size = rows + else: + default_size = 224 + + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=default_size, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if backend.image_data_format() == "channels_last": + row_axis, col_axis = (0, 1) else: - img_input = input_tensor - - x = _conv_block(img_input, 32, alpha, strides=(2, 2)) - x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1) - - x = _depthwise_conv_block( - x, 128, alpha, depth_multiplier, strides=(2, 2), block_id=2) - x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3) - - x = _depthwise_conv_block( - x, 256, alpha, depth_multiplier, strides=(2, 2), block_id=4) - x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5) - - x = _depthwise_conv_block( - x, 512, alpha, depth_multiplier, strides=(2, 2), block_id=6) - x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7) - x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8) - x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9) - x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10) - x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11) - - x = _depthwise_conv_block( - x, 1024, alpha, depth_multiplier, strides=(2, 2), block_id=12) - x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13) - - if include_top: - x = layers.GlobalAveragePooling2D(keepdims=True)(x) - x = layers.Dropout(dropout, name='dropout')(x) - x = layers.Conv2D(classes, (1, 1), padding='same', name='conv_preds')(x) - x = layers.Reshape((classes,), name='reshape_2')(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Activation(activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = training.Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, rows)) - - # Load weights. - if weights == 'imagenet': - if alpha == 1.0: - alpha_text = '1_0' - elif alpha == 0.75: - alpha_text = '7_5' - elif alpha == 0.50: - alpha_text = '5_0' + row_axis, col_axis = (1, 2) + rows = input_shape[row_axis] + cols = input_shape[col_axis] + + if weights == "imagenet": + if depth_multiplier != 1: + raise ValueError( + "If imagenet weights are being loaded, " + "depth multiplier must be 1. " + f"Received depth_multiplier={depth_multiplier}" + ) + + if alpha not in [0.25, 0.50, 0.75, 1.0]: + raise ValueError( + "If imagenet weights are being loaded, " + "alpha can be one of" + "`0.25`, `0.50`, `0.75` or `1.0` only. 
" + f"Received alpha={alpha}" + ) + + if rows != cols or rows not in [128, 160, 192, 224]: + rows = 224 + logging.warning( + "`input_shape` is undefined or non-square, " + "or `rows` is not in [128, 160, 192, 224]. " + "Weights for input shape (224, 224) will be " + "loaded as the default." + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - alpha_text = '2_5' + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + x = _conv_block(img_input, 32, alpha, strides=(2, 2)) + x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1) + + x = _depthwise_conv_block( + x, 128, alpha, depth_multiplier, strides=(2, 2), block_id=2 + ) + x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3) + + x = _depthwise_conv_block( + x, 256, alpha, depth_multiplier, strides=(2, 2), block_id=4 + ) + x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5) + + x = _depthwise_conv_block( + x, 512, alpha, depth_multiplier, strides=(2, 2), block_id=6 + ) + x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7) + x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8) + x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9) + x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10) + x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11) + + x = _depthwise_conv_block( + x, 1024, alpha, depth_multiplier, strides=(2, 2), block_id=12 + ) + x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13) if include_top: - model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows) - weight_path = BASE_WEIGHT_PATH + model_name - weights_path = data_utils.get_file( - model_name, weight_path, cache_subdir='models') + x = layers.GlobalAveragePooling2D(keepdims=True)(x) + x = layers.Dropout(dropout, name="dropout")(x) + x = layers.Conv2D(classes, (1, 1), padding="same", name="conv_preds")(x) + x = layers.Reshape((classes,), name="reshape_2")(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Activation( + activation=classifier_activation, name="predictions" + )(x) else: - model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows) - weight_path = BASE_WEIGHT_PATH + model_name - weights_path = data_utils.get_file( - model_name, weight_path, cache_subdir='models') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + model = training.Model(inputs, x, name=f"mobilenet_{alpha:0.2f}_{rows}") + + # Load weights. 
+ if weights == "imagenet": + if alpha == 1.0: + alpha_text = "1_0" + elif alpha == 0.75: + alpha_text = "7_5" + elif alpha == 0.50: + alpha_text = "5_0" + else: + alpha_text = "2_5" + + if include_top: + model_name = "mobilenet_%s_%d_tf.h5" % (alpha_text, rows) + weight_path = BASE_WEIGHT_PATH + model_name + weights_path = data_utils.get_file( + model_name, weight_path, cache_subdir="models" + ) + else: + model_name = "mobilenet_%s_%d_tf_no_top.h5" % (alpha_text, rows) + weight_path = BASE_WEIGHT_PATH + model_name + weights_path = data_utils.get_file( + model_name, weight_path, cache_subdir="models" + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)): - """Adds an initial convolution layer (with batch normalization and relu6). - - Args: - inputs: Input tensor of shape `(rows, cols, 3)` (with `channels_last` - data format) or (3, rows, cols) (with `channels_first` data format). - It should have exactly 3 inputs channels, and width and height should - be no smaller than 32. E.g. `(224, 224, 3)` would be one valid value. - filters: Integer, the dimensionality of the output space (i.e. the - number of output filters in the convolution). - alpha: controls the width of the network. - If `alpha` < 1.0, - proportionally decreases the number of filters in each layer. - If - `alpha` > 1.0, proportionally increases the number of filters in each - layer. - If `alpha` = 1, default number of filters from the paper are - used at each layer. - kernel: An integer or tuple/list of 2 integers, specifying the width and - height of the 2D convolution window. Can be a single integer to - specify the same value for all spatial dimensions. - strides: An integer or tuple/list of 2 integers, specifying the strides - of the convolution along the width and height. Can be a single integer - to specify the same value for all spatial dimensions. Specifying any - stride value != 1 is incompatible with specifying any `dilation_rate` - value != 1. # Input shape - 4D tensor with shape: `(samples, channels, rows, cols)` if - data_format='channels_first' - or 4D tensor with shape: `(samples, rows, cols, channels)` if - data_format='channels_last'. # Output shape - 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if - data_format='channels_first' - or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if - data_format='channels_last'. `rows` and `cols` values might have - changed due to stride. - - Returns: - Output tensor of block. - """ - channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 - filters = int(filters * alpha) - x = layers.Conv2D( - filters, - kernel, - padding='same', - use_bias=False, - strides=strides, - name='conv1')(inputs) - x = layers.BatchNormalization(axis=channel_axis, name='conv1_bn')(x) - return layers.ReLU(6., name='conv1_relu')(x) - - -def _depthwise_conv_block(inputs, - pointwise_conv_filters, - alpha, - depth_multiplier=1, - strides=(1, 1), - block_id=1): - """Adds a depthwise convolution block. - - A depthwise convolution block consists of a depthwise conv, - batch normalization, relu6, pointwise convolution, - batch normalization and relu6 activation. - - Args: - inputs: Input tensor of shape `(rows, cols, channels)` (with - `channels_last` data format) or (channels, rows, cols) (with - `channels_first` data format). - pointwise_conv_filters: Integer, the dimensionality of the output space - (i.e. 
the number of output filters in the pointwise convolution). - alpha: controls the width of the network. - If `alpha` < 1.0, - proportionally decreases the number of filters in each layer. - If - `alpha` > 1.0, proportionally increases the number of filters in each - layer. - If `alpha` = 1, default number of filters from the paper are - used at each layer. - depth_multiplier: The number of depthwise convolution output channels - for each input channel. The total number of depthwise convolution - output channels will be equal to `filters_in * depth_multiplier`. - strides: An integer or tuple/list of 2 integers, specifying the strides - of the convolution along the width and height. Can be a single integer - to specify the same value for all spatial dimensions. Specifying any - stride value != 1 is incompatible with specifying any `dilation_rate` - value != 1. - block_id: Integer, a unique identification designating the block number. - # Input shape - 4D tensor with shape: `(batch, channels, rows, cols)` if - data_format='channels_first' - or 4D tensor with shape: `(batch, rows, cols, channels)` if - data_format='channels_last'. # Output shape - 4D tensor with shape: `(batch, filters, new_rows, new_cols)` if - data_format='channels_first' - or 4D tensor with shape: `(batch, new_rows, new_cols, filters)` if - data_format='channels_last'. `rows` and `cols` values might have - changed due to stride. - - Returns: - Output tensor of block. - """ - channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 - pointwise_conv_filters = int(pointwise_conv_filters * alpha) - - if strides == (1, 1): - x = inputs - else: - x = layers.ZeroPadding2D(((0, 1), (0, 1)), name='conv_pad_%d' % block_id)( - inputs) - x = layers.DepthwiseConv2D((3, 3), - padding='same' if strides == (1, 1) else 'valid', - depth_multiplier=depth_multiplier, - strides=strides, - use_bias=False, - name='conv_dw_%d' % block_id)( - x) - x = layers.BatchNormalization( - axis=channel_axis, name='conv_dw_%d_bn' % block_id)( - x) - x = layers.ReLU(6., name='conv_dw_%d_relu' % block_id)(x) - - x = layers.Conv2D( - pointwise_conv_filters, (1, 1), - padding='same', - use_bias=False, - strides=(1, 1), - name='conv_pw_%d' % block_id)( - x) - x = layers.BatchNormalization( - axis=channel_axis, name='conv_pw_%d_bn' % block_id)( - x) - return layers.ReLU(6., name='conv_pw_%d_relu' % block_id)(x) - - -@keras_export('keras.applications.mobilenet.preprocess_input') + """Adds an initial convolution layer (with batch normalization and relu6). + + Args: + inputs: Input tensor of shape `(rows, cols, 3)` (with `channels_last` + data format) or (3, rows, cols) (with `channels_first` data format). + It should have exactly 3 inputs channels, and width and height should + be no smaller than 32. E.g. `(224, 224, 3)` would be one valid value. + filters: Integer, the dimensionality of the output space (i.e. the + number of output filters in the convolution). + alpha: controls the width of the network. - If `alpha` < 1.0, + proportionally decreases the number of filters in each layer. - If + `alpha` > 1.0, proportionally increases the number of filters in each + layer. - If `alpha` = 1, default number of filters from the paper are + used at each layer. + kernel: An integer or tuple/list of 2 integers, specifying the width and + height of the 2D convolution window. Can be a single integer to + specify the same value for all spatial dimensions. 
+ strides: An integer or tuple/list of 2 integers, specifying the strides + of the convolution along the width and height. Can be a single integer + to specify the same value for all spatial dimensions. Specifying any + stride value != 1 is incompatible with specifying any `dilation_rate` + value != 1. # Input shape + 4D tensor with shape: `(samples, channels, rows, cols)` if + data_format='channels_first' + or 4D tensor with shape: `(samples, rows, cols, channels)` if + data_format='channels_last'. # Output shape + 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if + data_format='channels_first' + or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if + data_format='channels_last'. `rows` and `cols` values might have + changed due to stride. + + Returns: + Output tensor of block. + """ + channel_axis = 1 if backend.image_data_format() == "channels_first" else -1 + filters = int(filters * alpha) + x = layers.Conv2D( + filters, + kernel, + padding="same", + use_bias=False, + strides=strides, + name="conv1", + )(inputs) + x = layers.BatchNormalization(axis=channel_axis, name="conv1_bn")(x) + return layers.ReLU(6.0, name="conv1_relu")(x) + + +def _depthwise_conv_block( + inputs, + pointwise_conv_filters, + alpha, + depth_multiplier=1, + strides=(1, 1), + block_id=1, +): + """Adds a depthwise convolution block. + + A depthwise convolution block consists of a depthwise conv, + batch normalization, relu6, pointwise convolution, + batch normalization and relu6 activation. + + Args: + inputs: Input tensor of shape `(rows, cols, channels)` (with + `channels_last` data format) or (channels, rows, cols) (with + `channels_first` data format). + pointwise_conv_filters: Integer, the dimensionality of the output space + (i.e. the number of output filters in the pointwise convolution). + alpha: controls the width of the network. - If `alpha` < 1.0, + proportionally decreases the number of filters in each layer. - If + `alpha` > 1.0, proportionally increases the number of filters in each + layer. - If `alpha` = 1, default number of filters from the paper are + used at each layer. + depth_multiplier: The number of depthwise convolution output channels + for each input channel. The total number of depthwise convolution + output channels will be equal to `filters_in * depth_multiplier`. + strides: An integer or tuple/list of 2 integers, specifying the strides + of the convolution along the width and height. Can be a single integer + to specify the same value for all spatial dimensions. Specifying any + stride value != 1 is incompatible with specifying any `dilation_rate` + value != 1. + block_id: Integer, a unique identification designating the block number. + # Input shape + 4D tensor with shape: `(batch, channels, rows, cols)` if + data_format='channels_first' + or 4D tensor with shape: `(batch, rows, cols, channels)` if + data_format='channels_last'. # Output shape + 4D tensor with shape: `(batch, filters, new_rows, new_cols)` if + data_format='channels_first' + or 4D tensor with shape: `(batch, new_rows, new_cols, filters)` if + data_format='channels_last'. `rows` and `cols` values might have + changed due to stride. + + Returns: + Output tensor of block. 
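`_depthwise_conv_block`, documented above, is the depthwise-separable unit MobileNet is built from: a per-channel 3x3 `DepthwiseConv2D` followed by a 1x1 pointwise `Conv2D`, each with batch norm and ReLU6. The parameter saving over a dense 3x3 convolution is easy to check (ignoring biases and BN parameters):

c_in, c_out, k = 256, 256, 3
dense = k * k * c_in * c_out              # 589,824 weights
separable = k * k * c_in + c_in * c_out   # 2,304 + 65,536 = 67,840
print(round(dense / separable, 1))        # ~8.7x fewer weights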
+ """ + channel_axis = 1 if backend.image_data_format() == "channels_first" else -1 + pointwise_conv_filters = int(pointwise_conv_filters * alpha) + + if strides == (1, 1): + x = inputs + else: + x = layers.ZeroPadding2D( + ((0, 1), (0, 1)), name="conv_pad_%d" % block_id + )(inputs) + x = layers.DepthwiseConv2D( + (3, 3), + padding="same" if strides == (1, 1) else "valid", + depth_multiplier=depth_multiplier, + strides=strides, + use_bias=False, + name="conv_dw_%d" % block_id, + )(x) + x = layers.BatchNormalization( + axis=channel_axis, name="conv_dw_%d_bn" % block_id + )(x) + x = layers.ReLU(6.0, name="conv_dw_%d_relu" % block_id)(x) + + x = layers.Conv2D( + pointwise_conv_filters, + (1, 1), + padding="same", + use_bias=False, + strides=(1, 1), + name="conv_pw_%d" % block_id, + )(x) + x = layers.BatchNormalization( + axis=channel_axis, name="conv_pw_%d_bn" % block_id + )(x) + return layers.ReLU(6.0, name="conv_pw_%d_relu" % block_id)(x) + + +@keras_export("keras.applications.mobilenet.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="tf" + ) -@keras_export('keras.applications.mobilenet.decode_predictions') +@keras_export("keras.applications.mobilenet.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/mobilenet_v2.py b/keras/applications/mobilenet_v2.py index eeacdb0c2deb..cc09e0e1713b 100644 --- a/keras/applications/mobilenet_v2.py +++ b/keras/applications/mobilenet_v2.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """MobileNet v2 models for Keras. MobileNetV2 is a general architecture and can be used for multiple use cases. 
@@ -44,8 +44,8 @@ MobileNet on various input sizes: ------------------------------------------------------------------------ MACs stands for Multiply Adds - Classification Checkpoint|MACs (M)|Parameters (M)|Top 1 Accuracy|Top 5 Accuracy ---------------------------|------------|---------------|---------|----|--------- +Classification Checkpoint|MACs (M)|Parameters (M)|Top 1 Accuracy|Top 5 Accuracy +--------------------------|------------|---------------|---------|------------ | [mobilenet_v2_1.4_224] | 582 | 6.06 | 75.0 | 92.5 | | [mobilenet_v2_1.3_224] | 509 | 5.34 | 74.4 | 92.1 | | [mobilenet_v2_1.0_224] | 300 | 3.47 | 71.8 | 91.0 | @@ -74,456 +74,517 @@ https://arxiv.org/abs/1801.04381) (CVPR 2018) """ +import tensorflow.compat.v2 as tf + from keras import backend from keras.applications import imagenet_utils from keras.engine import training from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/mobilenet_v2/') +BASE_WEIGHT_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/" +) layers = None -@keras_export('keras.applications.mobilenet_v2.MobileNetV2', - 'keras.applications.MobileNetV2') -def MobileNetV2(input_shape=None, - alpha=1.0, - include_top=True, - weights='imagenet', - input_tensor=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - """Instantiates the MobileNetV2 architecture. - - MobileNetV2 is very similar to the original MobileNet, - except that it uses inverted residual blocks with - bottlenecking features. It has a drastically lower - parameter count than the original MobileNet. - MobileNets support any input size greater - than 32 x 32, with larger image sizes - offering better performance. - - Reference: - - [MobileNetV2: Inverted Residuals and Linear Bottlenecks]( - https://arxiv.org/abs/1801.04381) (CVPR 2018) +@keras_export( + "keras.applications.mobilenet_v2.MobileNetV2", + "keras.applications.MobileNetV2", +) +def MobileNetV2( + input_shape=None, + alpha=1.0, + include_top=True, + weights="imagenet", + input_tensor=None, + pooling=None, + classes=1000, + classifier_activation="softmax", + **kwargs, +): + """Instantiates the MobileNetV2 architecture. + + MobileNetV2 is very similar to the original MobileNet, + except that it uses inverted residual blocks with + bottlenecking features. It has a drastically lower + parameter count than the original MobileNet. + MobileNets support any input size greater + than 32 x 32, with larger image sizes + offering better performance. + + Reference: + - [MobileNetV2: Inverted Residuals and Linear Bottlenecks]( + https://arxiv.org/abs/1801.04381) (CVPR 2018) + + This function returns a Keras image classification model, + optionally loaded with weights pre-trained on ImageNet. + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + Note: each Keras Application expects a specific kind of input preprocessing. 
+ For MobileNetV2, call `tf.keras.applications.mobilenet_v2.preprocess_input` + on your inputs before passing them to the model. + `mobilenet_v2.preprocess_input` will scale input pixels between -1 and 1. + + Args: + input_shape: Optional shape tuple, to be specified if you would + like to use a model with an input image resolution that is not + (224, 224, 3). + It should have exactly 3 inputs channels (224, 224, 3). + You can also omit this option if you would like + to infer input_shape from an input_tensor. + If you choose to include both input_tensor and input_shape then + input_shape will be used if they match, if the shapes + do not match then we will throw an error. + E.g. `(160, 160, 3)` would be one valid value. + alpha: Float, larger than zero, controls the width of the network. This is + known as the width multiplier in the MobileNetV2 paper, but the name is + kept for consistency with `applications.MobileNetV1` model in Keras. + - If `alpha` < 1.0, proportionally decreases the number + of filters in each layer. + - If `alpha` > 1.0, proportionally increases the number + of filters in each layer. + - If `alpha` = 1.0, default number of filters from the paper + are used at each layer. + include_top: Boolean, whether to include the fully-connected layer at the + top of the network. Defaults to `True`. + weights: String, one of `None` (random initialization), 'imagenet' + (pre-training on ImageNet), or the path to the weights file to be + loaded. + input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) + to use as image input for the model. + pooling: String, optional pooling mode for feature extraction when + `include_top` is `False`. + - `None` means that the output of the model + will be the 4D tensor output of the + last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a + 2D tensor. + - `max` means that global max pooling will + be applied. + classes: Optional integer number of classes to classify images into, only + to be specified if `include_top` is True, and if no `weights` argument + is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + **kwargs: For backwards compatibility only. + + Returns: + A `keras.Model` instance. + """ + global layers + if "layers" in kwargs: + layers = kwargs.pop("layers") + else: + layers = VersionAwareLayers() + if kwargs: + raise ValueError(f"Unknown argument(s): {kwargs}") + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded. " + f"Received `weights={weights}`" + ) - This function returns a Keras image classification model, - optionally loaded with weights pre-trained on ImageNet. - - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). 
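As a hedged usage sketch of the constructor documented above (the argument values are illustrative; 160 is one of the input sizes the docstring lists as valid):

```python
# Minimal usage sketch of the MobileNetV2 constructor; the input size and
# pooling choice are illustrative assumptions, not defaults from this diff.
import tensorflow as tf

model = tf.keras.applications.MobileNetV2(
    input_shape=(160, 160, 3),  # one of the sizes with released weights
    alpha=1.0,
    include_top=False,
    weights="imagenet",
    pooling="avg",
)
print(model.output_shape)  # (None, 1280): last conv block, globally pooled
```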
- - Note: each Keras Application expects a specific kind of input preprocessing. - For MobileNetV2, call `tf.keras.applications.mobilenet_v2.preprocess_input` - on your inputs before passing them to the model. - `mobilenet_v2.preprocess_input` will scale input pixels between -1 and 1. - - Args: - input_shape: Optional shape tuple, to be specified if you would - like to use a model with an input image resolution that is not - (224, 224, 3). - It should have exactly 3 inputs channels (224, 224, 3). - You can also omit this option if you would like - to infer input_shape from an input_tensor. - If you choose to include both input_tensor and input_shape then - input_shape will be used if they match, if the shapes - do not match then we will throw an error. - E.g. `(160, 160, 3)` would be one valid value. - alpha: Float, larger than zero, controls the width of the network. This is - known as the width multiplier in the MobileNetV2 paper, but the name is - kept for consistency with `applications.MobileNetV1` model in Keras. - - If `alpha` < 1.0, proportionally decreases the number - of filters in each layer. - - If `alpha` > 1.0, proportionally increases the number - of filters in each layer. - - If `alpha` = 1.0, default number of filters from the paper - are used at each layer. - include_top: Boolean, whether to include the fully-connected layer at the - top of the network. Defaults to `True`. - weights: String, one of `None` (random initialization), 'imagenet' - (pre-training on ImageNet), or the path to the weights file to be loaded. - input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) - to use as image input for the model. - pooling: String, optional pooling mode for feature extraction when - `include_top` is `False`. - - `None` means that the output of the model - will be the 4D tensor output of the - last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a - 2D tensor. - - `max` means that global max pooling will - be applied. - classes: Optional integer number of classes to classify images into, only to - be specified if `include_top` is True, and if no `weights` argument is - specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - **kwargs: For backwards compatibility only. - - Returns: - A `keras.Model` instance. - """ - global layers - if 'layers' in kwargs: - layers = kwargs.pop('layers') - else: - layers = VersionAwareLayers() - if kwargs: - raise ValueError(f'Unknown argument(s): {kwargs}') - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded. ' - f'Received `weights={weights}`') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError( - 'If using `weights` as `"imagenet"` with `include_top` ' - f'as true, `classes` should be 1000. Received `classes={classes}`') - - # Determine proper input shape and default size. 
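The preprocessing note above says `mobilenet_v2.preprocess_input` scales pixels into [-1, 1]; it delegates to `imagenet_utils.preprocess_input(..., mode="tf")`, as shown at the end of this file's diff. A quick numeric sketch with assumed sample values:

```python
import numpy as np
from keras.applications import mobilenet_v2

# mode="tf" rescales [0, 255] to [-1, 1]; the pixel values are assumptions.
img = np.array([[[0.0, 127.5, 255.0]]], dtype="float32")
print(mobilenet_v2.preprocess_input(img))  # approx. [[[-1., 0., 1.]]]
```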
- # If both input_shape and input_tensor are used, they should match - if input_shape is not None and input_tensor is not None: - try: - is_input_t_tensor = backend.is_keras_tensor(input_tensor) - except ValueError: - try: - is_input_t_tensor = backend.is_keras_tensor( - layer_utils.get_source_inputs(input_tensor)) - except ValueError: + if weights == "imagenet" and include_top and classes != 1000: raise ValueError( - f'input_tensor: {input_tensor}' - 'is not type input_tensor. ' - f'Received `type(input_tensor)={type(input_tensor)}`' + 'If using `weights` as `"imagenet"` with `include_top` ' + f"as true, `classes` should be 1000. Received `classes={classes}`" ) - if is_input_t_tensor: - if backend.image_data_format() == 'channels_first': - if backend.int_shape(input_tensor)[1] != input_shape[1]: - raise ValueError('input_shape[1] must equal shape(input_tensor)[1] ' - 'when `image_data_format` is `channels_first`; ' - 'Received `input_tensor.shape=' - f'{input_tensor.shape}`' - f', `input_shape={input_shape}`') - else: - if backend.int_shape(input_tensor)[2] != input_shape[1]: - raise ValueError( - 'input_tensor.shape[2] must equal input_shape[1]; ' - 'Received `input_tensor.shape=' - f'{input_tensor.shape}`, ' - f'`input_shape={input_shape}`') - else: - raise ValueError('input_tensor is not a Keras tensor; ' - f'Received `input_tensor={input_tensor}`') - - # If input_shape is None, infer shape from input_tensor. - if input_shape is None and input_tensor is not None: - - try: - backend.is_keras_tensor(input_tensor) - except ValueError: - raise ValueError('input_tensor must be a valid Keras tensor type; ' - f'Received {input_tensor} of type {type(input_tensor)}') - - if input_shape is None and not backend.is_keras_tensor(input_tensor): - default_size = 224 - elif input_shape is None and backend.is_keras_tensor(input_tensor): - if backend.image_data_format() == 'channels_first': - rows = backend.int_shape(input_tensor)[2] - cols = backend.int_shape(input_tensor)[3] - else: - rows = backend.int_shape(input_tensor)[1] - cols = backend.int_shape(input_tensor)[2] - - if rows == cols and rows in [96, 128, 160, 192, 224]: - default_size = rows - else: - default_size = 224 - # If input_shape is None and no input_tensor - elif input_shape is None: - default_size = 224 + # Determine proper input shape and default size. + # If both input_shape and input_tensor are used, they should match + if input_shape is not None and input_tensor is not None: + try: + is_input_t_tensor = backend.is_keras_tensor(input_tensor) + except ValueError: + try: + is_input_t_tensor = backend.is_keras_tensor( + layer_utils.get_source_inputs(input_tensor) + ) + except ValueError: + raise ValueError( + f"input_tensor: {input_tensor}" + "is not type input_tensor. 
" + f"Received `type(input_tensor)={type(input_tensor)}`" + ) + if is_input_t_tensor: + if backend.image_data_format() == "channels_first": + if backend.int_shape(input_tensor)[1] != input_shape[1]: + raise ValueError( + "input_shape[1] must equal shape(input_tensor)[1] " + "when `image_data_format` is `channels_first`; " + "Received `input_tensor.shape=" + f"{input_tensor.shape}`" + f", `input_shape={input_shape}`" + ) + else: + if backend.int_shape(input_tensor)[2] != input_shape[1]: + raise ValueError( + "input_tensor.shape[2] must equal input_shape[1]; " + "Received `input_tensor.shape=" + f"{input_tensor.shape}`, " + f"`input_shape={input_shape}`" + ) + else: + raise ValueError( + "input_tensor is not a Keras tensor; " + f"Received `input_tensor={input_tensor}`" + ) + + # If input_shape is None, infer shape from input_tensor. + if input_shape is None and input_tensor is not None: + + try: + backend.is_keras_tensor(input_tensor) + except ValueError: + raise ValueError( + "input_tensor must be a valid Keras tensor type; " + f"Received {input_tensor} of type {type(input_tensor)}" + ) + + if input_shape is None and not backend.is_keras_tensor(input_tensor): + default_size = 224 + elif input_shape is None and backend.is_keras_tensor(input_tensor): + if backend.image_data_format() == "channels_first": + rows = backend.int_shape(input_tensor)[2] + cols = backend.int_shape(input_tensor)[3] + else: + rows = backend.int_shape(input_tensor)[1] + cols = backend.int_shape(input_tensor)[2] + + if rows == cols and rows in [96, 128, 160, 192, 224]: + default_size = rows + else: + default_size = 224 + + # If input_shape is None and no input_tensor + elif input_shape is None: + default_size = 224 - # If input_shape is not None, assume default size. - else: - if backend.image_data_format() == 'channels_first': - rows = input_shape[1] - cols = input_shape[2] + # If input_shape is not None, assume default size. else: - rows = input_shape[0] - cols = input_shape[1] - - if rows == cols and rows in [96, 128, 160, 192, 224]: - default_size = rows + if backend.image_data_format() == "channels_first": + rows = input_shape[1] + cols = input_shape[2] + else: + rows = input_shape[0] + cols = input_shape[1] + + if rows == cols and rows in [96, 128, 160, 192, 224]: + default_size = rows + else: + default_size = 224 + + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=default_size, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if backend.image_data_format() == "channels_last": + row_axis, col_axis = (0, 1) else: - default_size = 224 - - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=default_size, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if backend.image_data_format() == 'channels_last': - row_axis, col_axis = (0, 1) - else: - row_axis, col_axis = (1, 2) - rows = input_shape[row_axis] - cols = input_shape[col_axis] - - if weights == 'imagenet': - if alpha not in [0.35, 0.50, 0.75, 1.0, 1.3, 1.4]: - raise ValueError('If imagenet weights are being loaded, ' - 'alpha must be one of `0.35`, `0.50`, `0.75`, ' - '`1.0`, `1.3` or `1.4` only;' - f' Received `alpha={alpha}`') - - if rows != cols or rows not in [96, 128, 160, 192, 224]: - rows = 224 - logging.warning('`input_shape` is undefined or non-square, ' - 'or `rows` is not in [96, 128, 160, 192, 224]. 
' - 'Weights for input shape (224, 224) will be ' - 'loaded as the default.') - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) + row_axis, col_axis = (1, 2) + rows = input_shape[row_axis] + cols = input_shape[col_axis] + + if weights == "imagenet": + if alpha not in [0.35, 0.50, 0.75, 1.0, 1.3, 1.4]: + raise ValueError( + "If imagenet weights are being loaded, " + "alpha must be one of `0.35`, `0.50`, `0.75`, " + "`1.0`, `1.3` or `1.4` only;" + f" Received `alpha={alpha}`" + ) + + if rows != cols or rows not in [96, 128, 160, 192, 224]: + rows = 224 + logging.warning( + "`input_shape` is undefined or non-square, " + "or `rows` is not in [96, 128, 160, 192, 224]. " + "Weights for input shape (224, 224) will be " + "loaded as the default." + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - img_input = input_tensor - - channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 - - first_block_filters = _make_divisible(32 * alpha, 8) - x = layers.Conv2D( - first_block_filters, - kernel_size=3, - strides=(2, 2), - padding='same', - use_bias=False, - name='Conv1')(img_input) - x = layers.BatchNormalization( - axis=channel_axis, epsilon=1e-3, momentum=0.999, name='bn_Conv1')( - x) - x = layers.ReLU(6., name='Conv1_relu')(x) - - x = _inverted_res_block( - x, filters=16, alpha=alpha, stride=1, expansion=1, block_id=0) - - x = _inverted_res_block( - x, filters=24, alpha=alpha, stride=2, expansion=6, block_id=1) - x = _inverted_res_block( - x, filters=24, alpha=alpha, stride=1, expansion=6, block_id=2) - - x = _inverted_res_block( - x, filters=32, alpha=alpha, stride=2, expansion=6, block_id=3) - x = _inverted_res_block( - x, filters=32, alpha=alpha, stride=1, expansion=6, block_id=4) - x = _inverted_res_block( - x, filters=32, alpha=alpha, stride=1, expansion=6, block_id=5) - - x = _inverted_res_block( - x, filters=64, alpha=alpha, stride=2, expansion=6, block_id=6) - x = _inverted_res_block( - x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=7) - x = _inverted_res_block( - x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=8) - x = _inverted_res_block( - x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=9) - - x = _inverted_res_block( - x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=10) - x = _inverted_res_block( - x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=11) - x = _inverted_res_block( - x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=12) - - x = _inverted_res_block( - x, filters=160, alpha=alpha, stride=2, expansion=6, block_id=13) - x = _inverted_res_block( - x, filters=160, alpha=alpha, stride=1, expansion=6, block_id=14) - x = _inverted_res_block( - x, filters=160, alpha=alpha, stride=1, expansion=6, block_id=15) - - x = _inverted_res_block( - x, filters=320, alpha=alpha, stride=1, expansion=6, block_id=16) - - # no alpha applied to last conv as stated in the paper: - # if the width multiplier is greater than 1 we increase the number of output - # channels. 
- if alpha > 1.0: - last_block_filters = _make_divisible(1280 * alpha, 8) - else: - last_block_filters = 1280 - - x = layers.Conv2D( - last_block_filters, kernel_size=1, use_bias=False, name='Conv_1')( - x) - x = layers.BatchNormalization( - axis=channel_axis, epsilon=1e-3, momentum=0.999, name='Conv_1_bn')( - x) - x = layers.ReLU(6., name='out_relu')(x) - - if include_top: - x = layers.GlobalAveragePooling2D()(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account any potential predecessors of - # `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = training.Model(inputs, x, name='mobilenetv2_%0.2f_%s' % (alpha, rows)) - - # Load weights. - if weights == 'imagenet': - if include_top: - model_name = ('mobilenet_v2_weights_tf_dim_ordering_tf_kernels_' + - str(float(alpha)) + '_' + str(rows) + '.h5') - weight_path = BASE_WEIGHT_PATH + model_name - weights_path = data_utils.get_file( - model_name, weight_path, cache_subdir='models') + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + channel_axis = 1 if backend.image_data_format() == "channels_first" else -1 + + first_block_filters = _make_divisible(32 * alpha, 8) + x = layers.Conv2D( + first_block_filters, + kernel_size=3, + strides=(2, 2), + padding="same", + use_bias=False, + name="Conv1", + )(img_input) + x = layers.BatchNormalization( + axis=channel_axis, epsilon=1e-3, momentum=0.999, name="bn_Conv1" + )(x) + x = layers.ReLU(6.0, name="Conv1_relu")(x) + + x = _inverted_res_block( + x, filters=16, alpha=alpha, stride=1, expansion=1, block_id=0 + ) + + x = _inverted_res_block( + x, filters=24, alpha=alpha, stride=2, expansion=6, block_id=1 + ) + x = _inverted_res_block( + x, filters=24, alpha=alpha, stride=1, expansion=6, block_id=2 + ) + + x = _inverted_res_block( + x, filters=32, alpha=alpha, stride=2, expansion=6, block_id=3 + ) + x = _inverted_res_block( + x, filters=32, alpha=alpha, stride=1, expansion=6, block_id=4 + ) + x = _inverted_res_block( + x, filters=32, alpha=alpha, stride=1, expansion=6, block_id=5 + ) + + x = _inverted_res_block( + x, filters=64, alpha=alpha, stride=2, expansion=6, block_id=6 + ) + x = _inverted_res_block( + x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=7 + ) + x = _inverted_res_block( + x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=8 + ) + x = _inverted_res_block( + x, filters=64, alpha=alpha, stride=1, expansion=6, block_id=9 + ) + + x = _inverted_res_block( + x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=10 + ) + x = _inverted_res_block( + x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=11 + ) + x = _inverted_res_block( + x, filters=96, alpha=alpha, stride=1, expansion=6, block_id=12 + ) + + x = _inverted_res_block( + x, filters=160, alpha=alpha, stride=2, expansion=6, block_id=13 + ) + x = _inverted_res_block( + x, filters=160, alpha=alpha, stride=1, expansion=6, block_id=14 + ) + x = _inverted_res_block( + x, filters=160, alpha=alpha, stride=1, expansion=6, block_id=15 + ) + + x = _inverted_res_block( + x, filters=320, alpha=alpha, stride=1, expansion=6, 
block_id=16 + ) + + # no alpha applied to last conv as stated in the paper: + # if the width multiplier is greater than 1 we increase the number of output + # channels. + if alpha > 1.0: + last_block_filters = _make_divisible(1280 * alpha, 8) else: - model_name = ('mobilenet_v2_weights_tf_dim_ordering_tf_kernels_' + - str(float(alpha)) + '_' + str(rows) + '_no_top' + '.h5') - weight_path = BASE_WEIGHT_PATH + model_name - weights_path = data_utils.get_file( - model_name, weight_path, cache_subdir='models') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) + last_block_filters = 1280 + + x = layers.Conv2D( + last_block_filters, kernel_size=1, use_bias=False, name="Conv_1" + )(x) + x = layers.BatchNormalization( + axis=channel_axis, epsilon=1e-3, momentum=0.999, name="Conv_1_bn" + )(x) + x = layers.ReLU(6.0, name="out_relu")(x) - return model + if include_top: + x = layers.GlobalAveragePooling2D()(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) + + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account any potential predecessors of + # `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + model = training.Model(inputs, x, name=f"mobilenetv2_{alpha:0.2f}_{rows}") + + # Load weights. + if weights == "imagenet": + if include_top: + model_name = ( + "mobilenet_v2_weights_tf_dim_ordering_tf_kernels_" + + str(float(alpha)) + + "_" + + str(rows) + + ".h5" + ) + weight_path = BASE_WEIGHT_PATH + model_name + weights_path = data_utils.get_file( + model_name, weight_path, cache_subdir="models" + ) + else: + model_name = ( + "mobilenet_v2_weights_tf_dim_ordering_tf_kernels_" + + str(float(alpha)) + + "_" + + str(rows) + + "_no_top" + + ".h5" + ) + weight_path = BASE_WEIGHT_PATH + model_name + weights_path = data_utils.get_file( + model_name, weight_path, cache_subdir="models" + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model def _inverted_res_block(inputs, expansion, stride, alpha, filters, block_id): - """Inverted ResNet block.""" - channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 - - in_channels = backend.int_shape(inputs)[channel_axis] - pointwise_conv_filters = int(filters * alpha) - # Ensure the number of filters on the last 1x1 convolution is divisible by 8. - pointwise_filters = _make_divisible(pointwise_conv_filters, 8) - x = inputs - prefix = 'block_{}_'.format(block_id) - - if block_id: - # Expand with a pointwise 1x1 convolution. + """Inverted ResNet block.""" + channel_axis = 1 if backend.image_data_format() == "channels_first" else -1 + + in_channels = backend.int_shape(inputs)[channel_axis] + pointwise_conv_filters = int(filters * alpha) + # Ensure the number of filters on the last 1x1 convolution is divisible by + # 8. + pointwise_filters = _make_divisible(pointwise_conv_filters, 8) + x = inputs + prefix = f"block_{block_id}_" + + if block_id: + # Expand with a pointwise 1x1 convolution. 
+ x = layers.Conv2D( + expansion * in_channels, + kernel_size=1, + padding="same", + use_bias=False, + activation=None, + name=prefix + "expand", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, + epsilon=1e-3, + momentum=0.999, + name=prefix + "expand_BN", + )(x) + x = layers.ReLU(6.0, name=prefix + "expand_relu")(x) + else: + prefix = "expanded_conv_" + + # Depthwise 3x3 convolution. + if stride == 2: + x = layers.ZeroPadding2D( + padding=imagenet_utils.correct_pad(x, 3), name=prefix + "pad" + )(x) + x = layers.DepthwiseConv2D( + kernel_size=3, + strides=stride, + activation=None, + use_bias=False, + padding="same" if stride == 1 else "valid", + name=prefix + "depthwise", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, + epsilon=1e-3, + momentum=0.999, + name=prefix + "depthwise_BN", + )(x) + + x = layers.ReLU(6.0, name=prefix + "depthwise_relu")(x) + + # Project with a pointwise 1x1 convolution. x = layers.Conv2D( - expansion * in_channels, + pointwise_filters, kernel_size=1, - padding='same', + padding="same", use_bias=False, activation=None, - name=prefix + 'expand')( - x) + name=prefix + "project", + )(x) x = layers.BatchNormalization( axis=channel_axis, epsilon=1e-3, momentum=0.999, - name=prefix + 'expand_BN')( - x) - x = layers.ReLU(6., name=prefix + 'expand_relu')(x) - else: - prefix = 'expanded_conv_' - - # Depthwise 3x3 convolution. - if stride == 2: - x = layers.ZeroPadding2D( - padding=imagenet_utils.correct_pad(x, 3), - name=prefix + 'pad')(x) - x = layers.DepthwiseConv2D( - kernel_size=3, - strides=stride, - activation=None, - use_bias=False, - padding='same' if stride == 1 else 'valid', - name=prefix + 'depthwise')( - x) - x = layers.BatchNormalization( - axis=channel_axis, - epsilon=1e-3, - momentum=0.999, - name=prefix + 'depthwise_BN')( - x) - - x = layers.ReLU(6., name=prefix + 'depthwise_relu')(x) - - # Project with a pointwise 1x1 convolution. - x = layers.Conv2D( - pointwise_filters, - kernel_size=1, - padding='same', - use_bias=False, - activation=None, - name=prefix + 'project')( - x) - x = layers.BatchNormalization( - axis=channel_axis, - epsilon=1e-3, - momentum=0.999, - name=prefix + 'project_BN')( - x) - - if in_channels == pointwise_filters and stride == 1: - return layers.Add(name=prefix + 'add')([inputs, x]) - return x + name=prefix + "project_BN", + )(x) + + if in_channels == pointwise_filters and stride == 1: + return layers.Add(name=prefix + "add")([inputs, x]) + return x def _make_divisible(v, divisor, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v -@keras_export('keras.applications.mobilenet_v2.preprocess_input') +@keras_export("keras.applications.mobilenet_v2.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="tf" + ) -@keras_export('keras.applications.mobilenet_v2.decode_predictions') +@keras_export("keras.applications.mobilenet_v2.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/mobilenet_v3.py b/keras/applications/mobilenet_v3.py index d149797b4ded..b79c4a663678 100644 --- a/keras/applications/mobilenet_v3.py +++ b/keras/applications/mobilenet_v3.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=missing-function-docstring + + """MobileNet v3 models for Keras.""" import tensorflow.compat.v2 as tf @@ -24,26 +24,40 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export - # TODO(scottzhu): Change this to the GCS path. 
-BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/mobilenet_v3/') +BASE_WEIGHT_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v3/" +) WEIGHTS_HASHES = { - 'large_224_0.75_float': ('765b44a33ad4005b3ac83185abf1d0eb', - '40af19a13ebea4e2ee0c676887f69a2e'), - 'large_224_1.0_float': ('59e551e166be033d707958cf9e29a6a7', - '07fb09a5933dd0c8eaafa16978110389'), - 'large_minimalistic_224_1.0_float': ('675e7b876c45c57e9e63e6d90a36599c', - 'ec5221f64a2f6d1ef965a614bdae7973'), - 'small_224_0.75_float': ('cb65d4e5be93758266aa0a7f2c6708b7', - 'ebdb5cc8e0b497cd13a7c275d475c819'), - 'small_224_1.0_float': ('8768d4c2e7dee89b9d02b2d03d65d862', - 'd3e8ec802a04aa4fc771ee12a9a9b836'), - 'small_minimalistic_224_1.0_float': ('99cd97fb2fcdad2bf028eb838de69e37', - 'cde8136e733e811080d9fcd8a252f7e4'), + "large_224_0.75_float": ( + "765b44a33ad4005b3ac83185abf1d0eb", + "40af19a13ebea4e2ee0c676887f69a2e", + ), + "large_224_1.0_float": ( + "59e551e166be033d707958cf9e29a6a7", + "07fb09a5933dd0c8eaafa16978110389", + ), + "large_minimalistic_224_1.0_float": ( + "675e7b876c45c57e9e63e6d90a36599c", + "ec5221f64a2f6d1ef965a614bdae7973", + ), + "small_224_0.75_float": ( + "cb65d4e5be93758266aa0a7f2c6708b7", + "ebdb5cc8e0b497cd13a7c275d475c819", + ), + "small_224_1.0_float": ( + "8768d4c2e7dee89b9d02b2d03d65d862", + "d3e8ec802a04aa4fc771ee12a9a9b836", + ), + "small_minimalistic_224_1.0_float": ( + "99cd97fb2fcdad2bf028eb838de69e37", + "cde8136e733e811080d9fcd8a252f7e4", + ), } layers = VersionAwareLayers() @@ -80,8 +94,8 @@ For MobileNetV3, by default input preprocessing is included as a part of the model (as a `Rescaling` layer), and thus `tf.keras.applications.mobilenet_v3.preprocess_input` is actually a - pass-through function. In this use case, MobileNetV3 models expect their inputs - to be float tensors of pixels with values in the [0-255] range. + pass-through function. In this use case, MobileNetV3 models expect their + inputs to be float tensors of pixels with values in the [0-255] range. At the same time, preprocessing as a part of the model (i.e. `Rescaling` layer) can be disabled by setting `include_preprocessing` argument to False. With preprocessing disabled MobileNetV3 models expect their inputs to be float @@ -155,309 +169,397 @@ """ -def MobileNetV3(stack_fn, - last_point_ch, - input_shape=None, - alpha=1.0, - model_type='large', - minimalistic=False, - include_top=True, - weights='imagenet', - input_tensor=None, - classes=1000, - pooling=None, - dropout_rate=0.2, - classifier_activation='softmax', - include_preprocessing=True): - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded. ' - f'Received weights={weights}') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' - 'as true, `classes` should be 1000. ' - f'Received classes={classes}') - - # Determine proper input shape and default size. 
- # If both input_shape and input_tensor are used, they should match - if input_shape is not None and input_tensor is not None: - try: - is_input_t_tensor = backend.is_keras_tensor(input_tensor) - except ValueError: - try: - is_input_t_tensor = backend.is_keras_tensor( - layer_utils.get_source_inputs(input_tensor)) - except ValueError: - raise ValueError('input_tensor: ', input_tensor, - 'is not type input_tensor. ' - f'Received type(input_tensor)={type(input_tensor)}') - if is_input_t_tensor: - if backend.image_data_format() == 'channels_first': - if backend.int_shape(input_tensor)[1] != input_shape[1]: - raise ValueError('When backend.image_data_format()=channels_first, ' - 'input_shape[1] must equal ' - 'backend.int_shape(input_tensor)[1]. Received ' - f'input_shape={input_shape}, ' - 'backend.int_shape(input_tensor)=' - f'{backend.int_shape(input_tensor)}') - else: - if backend.int_shape(input_tensor)[2] != input_shape[1]: - raise ValueError('input_shape[1] must equal ' - 'backend.int_shape(input_tensor)[2]. Received ' - f'input_shape={input_shape}, ' - 'backend.int_shape(input_tensor)=' - f'{backend.int_shape(input_tensor)}') +def MobileNetV3( + stack_fn, + last_point_ch, + input_shape=None, + alpha=1.0, + model_type="large", + minimalistic=False, + include_top=True, + weights="imagenet", + input_tensor=None, + classes=1000, + pooling=None, + dropout_rate=0.2, + classifier_activation="softmax", + include_preprocessing=True, +): + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded. " + f"Received weights={weights}" + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top` ' + "as true, `classes` should be 1000. " + f"Received classes={classes}" + ) + + # Determine proper input shape and default size. + # If both input_shape and input_tensor are used, they should match + if input_shape is not None and input_tensor is not None: + try: + is_input_t_tensor = backend.is_keras_tensor(input_tensor) + except ValueError: + try: + is_input_t_tensor = backend.is_keras_tensor( + layer_utils.get_source_inputs(input_tensor) + ) + except ValueError: + raise ValueError( + "input_tensor: ", + input_tensor, + "is not type input_tensor. " + f"Received type(input_tensor)={type(input_tensor)}", + ) + if is_input_t_tensor: + if backend.image_data_format() == "channels_first": + if backend.int_shape(input_tensor)[1] != input_shape[1]: + raise ValueError( + "When backend.image_data_format()=channels_first, " + "input_shape[1] must equal " + "backend.int_shape(input_tensor)[1]. Received " + f"input_shape={input_shape}, " + "backend.int_shape(input_tensor)=" + f"{backend.int_shape(input_tensor)}" + ) + else: + if backend.int_shape(input_tensor)[2] != input_shape[1]: + raise ValueError( + "input_shape[1] must equal " + "backend.int_shape(input_tensor)[2]. 
Received " + f"input_shape={input_shape}, " + "backend.int_shape(input_tensor)=" + f"{backend.int_shape(input_tensor)}" + ) + else: + raise ValueError( + "input_tensor specified: ", + input_tensor, + "is not a keras tensor", + ) + + # If input_shape is None, infer shape from input_tensor + if input_shape is None and input_tensor is not None: + + try: + backend.is_keras_tensor(input_tensor) + except ValueError: + raise ValueError( + "input_tensor: ", + input_tensor, + "is type: ", + type(input_tensor), + "which is not a valid type", + ) + + if backend.is_keras_tensor(input_tensor): + if backend.image_data_format() == "channels_first": + rows = backend.int_shape(input_tensor)[2] + cols = backend.int_shape(input_tensor)[3] + input_shape = (3, cols, rows) + else: + rows = backend.int_shape(input_tensor)[1] + cols = backend.int_shape(input_tensor)[2] + input_shape = (cols, rows, 3) + # If input_shape is None and input_tensor is None using standard shape + if input_shape is None and input_tensor is None: + input_shape = (None, None, 3) + + if backend.image_data_format() == "channels_last": + row_axis, col_axis = (0, 1) else: - raise ValueError('input_tensor specified: ', input_tensor, - 'is not a keras tensor') - - # If input_shape is None, infer shape from input_tensor - if input_shape is None and input_tensor is not None: - - try: - backend.is_keras_tensor(input_tensor) - except ValueError: - raise ValueError('input_tensor: ', input_tensor, 'is type: ', - type(input_tensor), 'which is not a valid type') - - if backend.is_keras_tensor(input_tensor): - if backend.image_data_format() == 'channels_first': - rows = backend.int_shape(input_tensor)[2] - cols = backend.int_shape(input_tensor)[3] - input_shape = (3, cols, rows) - else: - rows = backend.int_shape(input_tensor)[1] - cols = backend.int_shape(input_tensor)[2] - input_shape = (cols, rows, 3) - # If input_shape is None and input_tensor is None using standard shape - if input_shape is None and input_tensor is None: - input_shape = (None, None, 3) - - if backend.image_data_format() == 'channels_last': - row_axis, col_axis = (0, 1) - else: - row_axis, col_axis = (1, 2) - rows = input_shape[row_axis] - cols = input_shape[col_axis] - if rows and cols and (rows < 32 or cols < 32): - raise ValueError('Input size must be at least 32x32; Received `input_shape=' - f'{input_shape}`') - if weights == 'imagenet': - if (not minimalistic and alpha not in [0.75, 1.0] - or minimalistic and alpha != 1.0): - raise ValueError('If imagenet weights are being loaded, ' - 'alpha can be one of `0.75`, `1.0` for non minimalistic ' - 'or `1.0` for minimalistic only.') - - if rows != cols or rows != 224: - logging.warning('`input_shape` is undefined or non-square, ' - 'or `rows` is not 224. 
' - 'Weights for input shape (224, 224) will be ' - 'loaded as the default.') - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) + row_axis, col_axis = (1, 2) + rows = input_shape[row_axis] + cols = input_shape[col_axis] + if rows and cols and (rows < 32 or cols < 32): + raise ValueError( + "Input size must be at least 32x32; Received `input_shape=" + f"{input_shape}`" + ) + if weights == "imagenet": + if ( + not minimalistic + and alpha not in [0.75, 1.0] + or minimalistic + and alpha != 1.0 + ): + raise ValueError( + "If imagenet weights are being loaded, " + "alpha can be one of `0.75`, `1.0` for non minimalistic " + "or `1.0` for minimalistic only." + ) + + if rows != cols or rows != 224: + logging.warning( + "`input_shape` is undefined or non-square, " + "or `rows` is not 224. " + "Weights for input shape (224, 224) will be " + "loaded as the default." + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - img_input = input_tensor - - channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 - - if minimalistic: - kernel = 3 - activation = relu - se_ratio = None - else: - kernel = 5 - activation = hard_swish - se_ratio = 0.25 - - x = img_input - if include_preprocessing: - x = layers.Rescaling(scale=1. / 127.5, offset=-1.)(x) - x = layers.Conv2D( - 16, - kernel_size=3, - strides=(2, 2), - padding='same', - use_bias=False, - name='Conv')(x) - x = layers.BatchNormalization( - axis=channel_axis, epsilon=1e-3, - momentum=0.999, name='Conv/BatchNorm')(x) - x = activation(x) - - x = stack_fn(x, kernel, activation, se_ratio) - - last_conv_ch = _depth(backend.int_shape(x)[channel_axis] * 6) - - # if the width multiplier is greater than 1 we - # increase the number of output channels - if alpha > 1.0: - last_point_ch = _depth(last_point_ch * alpha) - x = layers.Conv2D( - last_conv_ch, - kernel_size=1, - padding='same', - use_bias=False, - name='Conv_1')(x) - x = layers.BatchNormalization( - axis=channel_axis, epsilon=1e-3, - momentum=0.999, name='Conv_1/BatchNorm')(x) - x = activation(x) - if include_top: - x = layers.GlobalAveragePooling2D(keepdims=True)(x) - x = layers.Conv2D( - last_point_ch, - kernel_size=1, - padding='same', - use_bias=True, - name='Conv_2')(x) - x = activation(x) + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor - if dropout_rate > 0: - x = layers.Dropout(dropout_rate)(x) - x = layers.Conv2D(classes, kernel_size=1, padding='same', name='Logits')(x) - x = layers.Flatten()(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Activation(activation=classifier_activation, - name='Predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D(name='max_pool')(x) - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = models.Model(inputs, x, name='MobilenetV3' + model_type) - - # Load weights. 
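The `minimalistic` switch above trades the 5x5 kernels, squeeze-and-excite blocks, and hard-swish activation for cheaper ops. `hard_sigmoid` and `hard_swish`, defined near the end of this file's diff, reduce to `ReLU6(x + 3) / 6` and `x * hard_sigmoid(x)`; a scalar sketch with assumed inputs:

```python
# Plain-Python sketch of the activations defined later in this diff; the
# sample inputs are illustrative.
def relu6(x):
    return min(max(x, 0.0), 6.0)

def hard_sigmoid(x):
    return relu6(x + 3.0) / 6.0

def hard_swish(x):
    return x * hard_sigmoid(x)

for x in (-4.0, -1.0, 0.0, 1.0, 4.0):
    print(x, round(hard_swish(x), 4))
# -4.0 0.0 | -1.0 -0.3333 | 0.0 0.0 | 1.0 0.6667 | 4.0 4.0
```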
- if weights == 'imagenet': - model_name = '{}{}_224_{}_float'.format( - model_type, '_minimalistic' if minimalistic else '', str(alpha)) - if include_top: - file_name = 'weights_mobilenet_v3_' + model_name + '.h5' - file_hash = WEIGHTS_HASHES[model_name][0] + channel_axis = 1 if backend.image_data_format() == "channels_first" else -1 + + if minimalistic: + kernel = 3 + activation = relu + se_ratio = None else: - file_name = 'weights_mobilenet_v3_' + model_name + '_no_top_v2.h5' - file_hash = WEIGHTS_HASHES[model_name][1] - weights_path = data_utils.get_file( - file_name, - BASE_WEIGHT_PATH + file_name, - cache_subdir='models', - file_hash=file_hash) - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model - - -@keras_export('keras.applications.MobileNetV3Small') -def MobileNetV3Small(input_shape=None, - alpha=1.0, - minimalistic=False, - include_top=True, - weights='imagenet', - input_tensor=None, - classes=1000, - pooling=None, - dropout_rate=0.2, - classifier_activation='softmax', - include_preprocessing=True): - - def stack_fn(x, kernel, activation, se_ratio): - - def depth(d): - return _depth(d * alpha) - - x = _inverted_res_block(x, 1, depth(16), 3, 2, se_ratio, relu, 0) - x = _inverted_res_block(x, 72. / 16, depth(24), 3, 2, None, relu, 1) - x = _inverted_res_block(x, 88. / 24, depth(24), 3, 1, None, relu, 2) - x = _inverted_res_block(x, 4, depth(40), kernel, 2, se_ratio, activation, 3) - x = _inverted_res_block(x, 6, depth(40), kernel, 1, se_ratio, activation, 4) - x = _inverted_res_block(x, 6, depth(40), kernel, 1, se_ratio, activation, 5) - x = _inverted_res_block(x, 3, depth(48), kernel, 1, se_ratio, activation, 6) - x = _inverted_res_block(x, 3, depth(48), kernel, 1, se_ratio, activation, 7) - x = _inverted_res_block(x, 6, depth(96), kernel, 2, se_ratio, activation, 8) - x = _inverted_res_block(x, 6, depth(96), kernel, 1, se_ratio, activation, 9) - x = _inverted_res_block(x, 6, depth(96), kernel, 1, se_ratio, activation, - 10) - return x + kernel = 5 + activation = hard_swish + se_ratio = 0.25 - return MobileNetV3(stack_fn, 1024, input_shape, alpha, 'small', minimalistic, - include_top, weights, input_tensor, classes, pooling, - dropout_rate, classifier_activation, include_preprocessing) - - -@keras_export('keras.applications.MobileNetV3Large') -def MobileNetV3Large(input_shape=None, - alpha=1.0, - minimalistic=False, - include_top=True, - weights='imagenet', - input_tensor=None, - classes=1000, - pooling=None, - dropout_rate=0.2, - classifier_activation='softmax', - include_preprocessing=True): - - def stack_fn(x, kernel, activation, se_ratio): - - def depth(d): - return _depth(d * alpha) - - x = _inverted_res_block(x, 1, depth(16), 3, 1, None, relu, 0) - x = _inverted_res_block(x, 4, depth(24), 3, 2, None, relu, 1) - x = _inverted_res_block(x, 3, depth(24), 3, 1, None, relu, 2) - x = _inverted_res_block(x, 3, depth(40), kernel, 2, se_ratio, relu, 3) - x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 4) - x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 5) - x = _inverted_res_block(x, 6, depth(80), 3, 2, None, activation, 6) - x = _inverted_res_block(x, 2.5, depth(80), 3, 1, None, activation, 7) - x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 8) - x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 9) - x = _inverted_res_block(x, 6, depth(112), 3, 1, se_ratio, activation, 10) - x = _inverted_res_block(x, 6, depth(112), 3, 1, se_ratio, activation, 11) - 
x = _inverted_res_block(x, 6, depth(160), kernel, 2, se_ratio, activation, - 12) - x = _inverted_res_block(x, 6, depth(160), kernel, 1, se_ratio, activation, - 13) - x = _inverted_res_block(x, 6, depth(160), kernel, 1, se_ratio, activation, - 14) - return x + x = img_input + if include_preprocessing: + x = layers.Rescaling(scale=1.0 / 127.5, offset=-1.0)(x) + x = layers.Conv2D( + 16, + kernel_size=3, + strides=(2, 2), + padding="same", + use_bias=False, + name="Conv", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, epsilon=1e-3, momentum=0.999, name="Conv/BatchNorm" + )(x) + x = activation(x) - return MobileNetV3(stack_fn, 1280, input_shape, alpha, 'large', minimalistic, - include_top, weights, input_tensor, classes, pooling, - dropout_rate, classifier_activation, include_preprocessing) + x = stack_fn(x, kernel, activation, se_ratio) + last_conv_ch = _depth(backend.int_shape(x)[channel_axis] * 6) -MobileNetV3Small.__doc__ = BASE_DOCSTRING.format(name='MobileNetV3Small') -MobileNetV3Large.__doc__ = BASE_DOCSTRING.format(name='MobileNetV3Large') + # if the width multiplier is greater than 1 we + # increase the number of output channels + if alpha > 1.0: + last_point_ch = _depth(last_point_ch * alpha) + x = layers.Conv2D( + last_conv_ch, + kernel_size=1, + padding="same", + use_bias=False, + name="Conv_1", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, epsilon=1e-3, momentum=0.999, name="Conv_1/BatchNorm" + )(x) + x = activation(x) + if include_top: + x = layers.GlobalAveragePooling2D(keepdims=True)(x) + x = layers.Conv2D( + last_point_ch, + kernel_size=1, + padding="same", + use_bias=True, + name="Conv_2", + )(x) + x = activation(x) + + if dropout_rate > 0: + x = layers.Dropout(dropout_rate)(x) + x = layers.Conv2D( + classes, kernel_size=1, padding="same", name="Logits" + )(x) + x = layers.Flatten()(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Activation( + activation=classifier_activation, name="Predictions" + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D(name="max_pool")(x) + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + model = models.Model(inputs, x, name="MobilenetV3" + model_type) + + # Load weights. 
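The weight-loading branch that follows assembles the released file names from the model configuration; an illustrative trace for one assumed configuration (large, non-minimalistic, alpha=1.0):

```python
# Reconstructing the weight-file name the code below would request;
# the configuration values are assumptions.
model_type, minimalistic, alpha = "large", False, 1.0
model_name = "{}{}_224_{}_float".format(
    model_type, "_minimalistic" if minimalistic else "", str(alpha)
)
print("weights_mobilenet_v3_" + model_name + ".h5")
# weights_mobilenet_v3_large_224_1.0_float.h5
# model_name == "large_224_1.0_float" indexes WEIGHTS_HASHES above
```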
+ if weights == "imagenet": + model_name = "{}{}_224_{}_float".format( + model_type, "_minimalistic" if minimalistic else "", str(alpha) + ) + if include_top: + file_name = "weights_mobilenet_v3_" + model_name + ".h5" + file_hash = WEIGHTS_HASHES[model_name][0] + else: + file_name = "weights_mobilenet_v3_" + model_name + "_no_top_v2.h5" + file_hash = WEIGHTS_HASHES[model_name][1] + weights_path = data_utils.get_file( + file_name, + BASE_WEIGHT_PATH + file_name, + cache_subdir="models", + file_hash=file_hash, + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model + + +@keras_export("keras.applications.MobileNetV3Small") +def MobileNetV3Small( + input_shape=None, + alpha=1.0, + minimalistic=False, + include_top=True, + weights="imagenet", + input_tensor=None, + classes=1000, + pooling=None, + dropout_rate=0.2, + classifier_activation="softmax", + include_preprocessing=True, +): + def stack_fn(x, kernel, activation, se_ratio): + def depth(d): + return _depth(d * alpha) + + x = _inverted_res_block(x, 1, depth(16), 3, 2, se_ratio, relu, 0) + x = _inverted_res_block(x, 72.0 / 16, depth(24), 3, 2, None, relu, 1) + x = _inverted_res_block(x, 88.0 / 24, depth(24), 3, 1, None, relu, 2) + x = _inverted_res_block( + x, 4, depth(40), kernel, 2, se_ratio, activation, 3 + ) + x = _inverted_res_block( + x, 6, depth(40), kernel, 1, se_ratio, activation, 4 + ) + x = _inverted_res_block( + x, 6, depth(40), kernel, 1, se_ratio, activation, 5 + ) + x = _inverted_res_block( + x, 3, depth(48), kernel, 1, se_ratio, activation, 6 + ) + x = _inverted_res_block( + x, 3, depth(48), kernel, 1, se_ratio, activation, 7 + ) + x = _inverted_res_block( + x, 6, depth(96), kernel, 2, se_ratio, activation, 8 + ) + x = _inverted_res_block( + x, 6, depth(96), kernel, 1, se_ratio, activation, 9 + ) + x = _inverted_res_block( + x, 6, depth(96), kernel, 1, se_ratio, activation, 10 + ) + return x + + return MobileNetV3( + stack_fn, + 1024, + input_shape, + alpha, + "small", + minimalistic, + include_top, + weights, + input_tensor, + classes, + pooling, + dropout_rate, + classifier_activation, + include_preprocessing, + ) + + +@keras_export("keras.applications.MobileNetV3Large") +def MobileNetV3Large( + input_shape=None, + alpha=1.0, + minimalistic=False, + include_top=True, + weights="imagenet", + input_tensor=None, + classes=1000, + pooling=None, + dropout_rate=0.2, + classifier_activation="softmax", + include_preprocessing=True, +): + def stack_fn(x, kernel, activation, se_ratio): + def depth(d): + return _depth(d * alpha) + + x = _inverted_res_block(x, 1, depth(16), 3, 1, None, relu, 0) + x = _inverted_res_block(x, 4, depth(24), 3, 2, None, relu, 1) + x = _inverted_res_block(x, 3, depth(24), 3, 1, None, relu, 2) + x = _inverted_res_block(x, 3, depth(40), kernel, 2, se_ratio, relu, 3) + x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 4) + x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 5) + x = _inverted_res_block(x, 6, depth(80), 3, 2, None, activation, 6) + x = _inverted_res_block(x, 2.5, depth(80), 3, 1, None, activation, 7) + x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 8) + x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 9) + x = _inverted_res_block( + x, 6, depth(112), 3, 1, se_ratio, activation, 10 + ) + x = _inverted_res_block( + x, 6, depth(112), 3, 1, se_ratio, activation, 11 + ) + x = _inverted_res_block( + x, 6, depth(160), kernel, 2, se_ratio, activation, 12 + ) + x = 
_inverted_res_block( + x, 6, depth(160), kernel, 1, se_ratio, activation, 13 + ) + x = _inverted_res_block( + x, 6, depth(160), kernel, 1, se_ratio, activation, 14 + ) + return x + + return MobileNetV3( + stack_fn, + 1280, + input_shape, + alpha, + "large", + minimalistic, + include_top, + weights, + input_tensor, + classes, + pooling, + dropout_rate, + classifier_activation, + include_preprocessing, + ) + + +MobileNetV3Small.__doc__ = BASE_DOCSTRING.format(name="MobileNetV3Small") +MobileNetV3Large.__doc__ = BASE_DOCSTRING.format(name="MobileNetV3Large") def relu(x): - return layers.ReLU()(x) + return layers.ReLU()(x) def hard_sigmoid(x): - return layers.ReLU(6.)(x + 3.) * (1. / 6.) + return layers.ReLU(6.0)(x + 3.0) * (1.0 / 6.0) def hard_swish(x): - return layers.Multiply()([x, hard_sigmoid(x)]) + return layers.Multiply()([x, hard_sigmoid(x)]) # This function is taken from the original tf repo. @@ -468,128 +570,129 @@ def hard_swish(x): def _depth(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v def _se_block(inputs, filters, se_ratio, prefix): - x = layers.GlobalAveragePooling2D( - keepdims=True, name=prefix + 'squeeze_excite/AvgPool')( - inputs) - x = layers.Conv2D( - _depth(filters * se_ratio), - kernel_size=1, - padding='same', - name=prefix + 'squeeze_excite/Conv')( - x) - x = layers.ReLU(name=prefix + 'squeeze_excite/Relu')(x) - x = layers.Conv2D( - filters, - kernel_size=1, - padding='same', - name=prefix + 'squeeze_excite/Conv_1')( - x) - x = hard_sigmoid(x) - x = layers.Multiply(name=prefix + 'squeeze_excite/Mul')([inputs, x]) - return x - - -def _inverted_res_block(x, expansion, filters, kernel_size, stride, se_ratio, - activation, block_id): - channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 - shortcut = x - prefix = 'expanded_conv/' - infilters = backend.int_shape(x)[channel_axis] - if block_id: - # Expand - prefix = 'expanded_conv_{}/'.format(block_id) + x = layers.GlobalAveragePooling2D( + keepdims=True, name=prefix + "squeeze_excite/AvgPool" + )(inputs) + x = layers.Conv2D( + _depth(filters * se_ratio), + kernel_size=1, + padding="same", + name=prefix + "squeeze_excite/Conv", + )(x) + x = layers.ReLU(name=prefix + "squeeze_excite/Relu")(x) x = layers.Conv2D( - _depth(infilters * expansion), + filters, kernel_size=1, - padding='same', + padding="same", + name=prefix + "squeeze_excite/Conv_1", + )(x) + x = hard_sigmoid(x) + x = layers.Multiply(name=prefix + "squeeze_excite/Mul")([inputs, x]) + return x + + +def _inverted_res_block( + x, expansion, filters, kernel_size, stride, se_ratio, activation, block_id +): + channel_axis = 1 if backend.image_data_format() == "channels_first" else -1 + shortcut = x + prefix = "expanded_conv/" + infilters = backend.int_shape(x)[channel_axis] + if block_id: + # Expand + prefix = f"expanded_conv_{block_id}/" + x = layers.Conv2D( + _depth(infilters * expansion), + kernel_size=1, + padding="same", + use_bias=False, + name=prefix + "expand", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, + epsilon=1e-3, + momentum=0.999, + name=prefix + "expand/BatchNorm", + 
)(x)
+        x = activation(x)
+
+    if stride == 2:
+        x = layers.ZeroPadding2D(
+            padding=imagenet_utils.correct_pad(x, kernel_size),
+            name=prefix + "depthwise/pad",
+        )(x)
+    x = layers.DepthwiseConv2D(
+        kernel_size,
+        strides=stride,
+        padding="same" if stride == 1 else "valid",
         use_bias=False,
-        name=prefix + 'expand')(
-            x)
+        name=prefix + "depthwise",
+    )(x)
     x = layers.BatchNormalization(
         axis=channel_axis,
         epsilon=1e-3,
         momentum=0.999,
-        name=prefix + 'expand/BatchNorm')(
-            x)
+        name=prefix + "depthwise/BatchNorm",
+    )(x)
     x = activation(x)
 
-  if stride == 2:
-    x = layers.ZeroPadding2D(
-        padding=imagenet_utils.correct_pad(x, kernel_size),
-        name=prefix + 'depthwise/pad')(
-            x)
-  x = layers.DepthwiseConv2D(
-      kernel_size,
-      strides=stride,
-      padding='same' if stride == 1 else 'valid',
-      use_bias=False,
-      name=prefix + 'depthwise')(
-          x)
-  x = layers.BatchNormalization(
-      axis=channel_axis,
-      epsilon=1e-3,
-      momentum=0.999,
-      name=prefix + 'depthwise/BatchNorm')(
-          x)
-  x = activation(x)
-
-  if se_ratio:
-    x = _se_block(x, _depth(infilters * expansion), se_ratio, prefix)
-
-  x = layers.Conv2D(
-      filters,
-      kernel_size=1,
-      padding='same',
-      use_bias=False,
-      name=prefix + 'project')(
-          x)
-  x = layers.BatchNormalization(
-      axis=channel_axis,
-      epsilon=1e-3,
-      momentum=0.999,
-      name=prefix + 'project/BatchNorm')(
-          x)
-
-  if stride == 1 and infilters == filters:
-    x = layers.Add(name=prefix + 'Add')([shortcut, x])
-  return x
-
-
-@keras_export('keras.applications.mobilenet_v3.preprocess_input')
-def preprocess_input(x, data_format=None):  # pylint: disable=unused-argument
-  """A placeholder method for backward compatibility.
-
-  The preprocessing logic has been included in the mobilenet_v3 model
-  implementation. Users are no longer required to call this method to normalize
-  the input data. This method does nothing and only kept as a placeholder to
-  align the API surface between old and new version of model.
+    if se_ratio:
+        x = _se_block(x, _depth(infilters * expansion), se_ratio, prefix)
 
-  Args:
-    x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it,
-      it defaults to "channels_last").{mode}
+    x = layers.Conv2D(
+        filters,
+        kernel_size=1,
+        padding="same",
+        use_bias=False,
+        name=prefix + "project",
+    )(x)
+    x = layers.BatchNormalization(
+        axis=channel_axis,
+        epsilon=1e-3,
+        momentum=0.999,
+        name=prefix + "project/BatchNorm",
+    )(x)
 
-  Returns:
-    Unchanged `numpy.array` or `tf.Tensor`.
-  """
-  return x
+    if stride == 1 and infilters == filters:
+        x = layers.Add(name=prefix + "Add")([shortcut, x])
+    return x
+
+
+@keras_export("keras.applications.mobilenet_v3.preprocess_input")
+def preprocess_input(x, data_format=None):
+    """A placeholder method for backward compatibility.
+
+    The preprocessing logic has been included in the mobilenet_v3 model
+    implementation. Users are no longer required to call this method to
+    normalize the input data. This method does nothing and is only kept as
+    a placeholder to align the API surface between the old and new versions
+    of the model.
+
+    Args:
+      x: A floating point `numpy.array` or a `tf.Tensor`.
+      data_format: Optional data format of the image tensor/array. `None`
+        means the global setting `tf.keras.backend.image_data_format()` is
+        used (unless you changed it, it uses "channels_last").
+        Defaults to `None`.
+
+    Returns:
+      Unchanged `numpy.array` or `tf.Tensor`.
+ """ + return x -@keras_export('keras.applications.mobilenet_v3.decode_predictions') +@keras_export("keras.applications.mobilenet_v3.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/nasnet.py b/keras/applications/nasnet.py index 1635787846c2..7667d14d1b97 100644 --- a/keras/applications/nasnet.py +++ b/keras/applications/nasnet.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """NASNet-A models for Keras. NASNet refers to Neural Architecture Search Network, a family of models @@ -26,12 +26,11 @@ for ImageNet 2012 are provided. The below table describes the performance on ImageNet 2012: --------------------------------------------------------------------------------- - Architecture | Top-1 Acc | Top-5 Acc | Multiply-Adds | Params (M) --------------------------------------------------------------------------------- -| NASNet-A (4 @ 1056) | 74.0 % | 91.6 % | 564 M | 5.3 | -| NASNet-A (6 @ 4032) | 82.7 % | 96.2 % | 23.8 B | 88.9 | --------------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Architecture | Top-1 Acc | Top-5 Acc | Multiply-Adds | Params (M) +---------------------|-----------|-----------|----------------|------------ +NASNet-A (4 @ 1056) | 74.0 % | 91.6 % | 564 M | 5.3 +NASNet-A (6 @ 4032) | 82.7 % | 96.2 % | 23.8 B | 88.9 Reference: - [Learning Transferable Architectures for Scalable Image Recognition]( @@ -46,786 +45,866 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export - -BASE_WEIGHTS_PATH = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/nasnet/') -NASNET_MOBILE_WEIGHT_PATH = BASE_WEIGHTS_PATH + 'NASNet-mobile.h5' -NASNET_MOBILE_WEIGHT_PATH_NO_TOP = BASE_WEIGHTS_PATH + 'NASNet-mobile-no-top.h5' -NASNET_LARGE_WEIGHT_PATH = BASE_WEIGHTS_PATH + 'NASNet-large.h5' -NASNET_LARGE_WEIGHT_PATH_NO_TOP = BASE_WEIGHTS_PATH + 'NASNet-large-no-top.h5' +BASE_WEIGHTS_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/nasnet/" +) +NASNET_MOBILE_WEIGHT_PATH = BASE_WEIGHTS_PATH + "NASNet-mobile.h5" +NASNET_MOBILE_WEIGHT_PATH_NO_TOP = BASE_WEIGHTS_PATH + "NASNet-mobile-no-top.h5" +NASNET_LARGE_WEIGHT_PATH = BASE_WEIGHTS_PATH + "NASNet-large.h5" +NASNET_LARGE_WEIGHT_PATH_NO_TOP = BASE_WEIGHTS_PATH + "NASNet-large-no-top.h5" layers = VersionAwareLayers() -def NASNet(input_shape=None, - penultimate_filters=4032, - num_blocks=6, - stem_block_filters=96, - skip_reduction=True, - filter_multiplier=2, - include_top=True, - weights='imagenet', - input_tensor=None, - pooling=None, - classes=1000, - default_size=None, - classifier_activation='softmax'): - """Instantiates a NASNet model. 
- - Reference: - - [Learning Transferable Architectures for Scalable Image Recognition]( - https://arxiv.org/abs/1707.07012) (CVPR 2018) - - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - Note: each Keras Application expects a specific kind of input preprocessing. - For NasNet, call `tf.keras.applications.nasnet.preprocess_input` - on your inputs before passing them to the model. - `nasnet.preprocess_input` will scale input pixels between -1 and 1. - - Args: - input_shape: Optional shape tuple, the input shape - is by default `(331, 331, 3)` for NASNetLarge and - `(224, 224, 3)` for NASNetMobile. - It should have exactly 3 input channels, - and width and height should be no smaller than 32. - E.g. `(224, 224, 3)` would be one valid value. - penultimate_filters: Number of filters in the penultimate layer. - NASNet models use the notation `NASNet (N @ P)`, where: - - N is the number of blocks - - P is the number of penultimate filters - num_blocks: Number of repeated blocks of the NASNet model. - NASNet models use the notation `NASNet (N @ P)`, where: - - N is the number of blocks - - P is the number of penultimate filters - stem_block_filters: Number of filters in the initial stem block - skip_reduction: Whether to skip the reduction step at the tail - end of the network. - filter_multiplier: Controls the width of the network. - - If `filter_multiplier` < 1.0, proportionally decreases the number - of filters in each layer. - - If `filter_multiplier` > 1.0, proportionally increases the number - of filters in each layer. - - If `filter_multiplier` = 1, default number of filters from the - paper are used at each layer. - include_top: Whether to include the fully-connected - layer at the top of the network. - weights: `None` (random initialization) or - `imagenet` (ImageNet weights) - input_tensor: Optional Keras tensor (i.e. output of - `layers.Input()`) - to use as image input for the model. - pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model - will be the 4D tensor output of the - last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a - 2D tensor. - - `max` means that global max pooling will - be applied. - classes: Optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. - default_size: Specifies the default image size of the model - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - - Returns: - A `keras.Model` instance. 
- """ - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded.') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' - 'as true, `classes` should be 1000') - - if (isinstance(input_shape, tuple) and None in input_shape and - weights == 'imagenet'): - raise ValueError('When specifying the input shape of a NASNet' - ' and loading `ImageNet` weights, ' - 'the input_shape argument must be static ' - '(no None entries). Got: `input_shape=' + - str(input_shape) + '`.') - - if default_size is None: - default_size = 331 - - # Determine proper input shape and default size. - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=default_size, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=True, - weights=weights) - - if backend.image_data_format() != 'channels_last': - logging.warning('The NASNet family of models is only available ' - 'for the input data format "channels_last" ' - '(width, height, channels). ' - 'However your settings specify the default ' - 'data format "channels_first" (channels, width, height).' - ' You should set `image_data_format="channels_last"` ' - 'in your Keras config located at ~/.keras/keras.json. ' - 'The model being returned right now will expect inputs ' - 'to follow the "channels_last" data format.') - backend.set_image_data_format('channels_last') - old_data_format = 'channels_first' - else: - old_data_format = None - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - if penultimate_filters % (24 * (filter_multiplier**2)) != 0: - raise ValueError( - 'For NASNet-A models, the `penultimate_filters` must be a multiple ' - 'of 24 * (`filter_multiplier` ** 2). 
Current value: %d' % - penultimate_filters) - - channel_dim = 1 if backend.image_data_format() == 'channels_first' else -1 - filters = penultimate_filters // 24 - - x = layers.Conv2D( - stem_block_filters, (3, 3), - strides=(2, 2), - padding='valid', - use_bias=False, - name='stem_conv1', - kernel_initializer='he_normal')( - img_input) - - x = layers.BatchNormalization( - axis=channel_dim, momentum=0.9997, epsilon=1e-3, name='stem_bn1')( - x) - - p = None - x, p = _reduction_a_cell( - x, p, filters // (filter_multiplier**2), block_id='stem_1') - x, p = _reduction_a_cell( - x, p, filters // filter_multiplier, block_id='stem_2') - - for i in range(num_blocks): - x, p = _normal_a_cell(x, p, filters, block_id='%d' % (i)) - - x, p0 = _reduction_a_cell( - x, p, filters * filter_multiplier, block_id='reduce_%d' % (num_blocks)) - - p = p0 if not skip_reduction else p - - for i in range(num_blocks): - x, p = _normal_a_cell( - x, p, filters * filter_multiplier, block_id='%d' % (num_blocks + i + 1)) - - x, p0 = _reduction_a_cell( - x, - p, - filters * filter_multiplier**2, - block_id='reduce_%d' % (2 * num_blocks)) - - p = p0 if not skip_reduction else p - - for i in range(num_blocks): - x, p = _normal_a_cell( - x, - p, - filters * filter_multiplier**2, - block_id='%d' % (2 * num_blocks + i + 1)) - - x = layers.Activation('relu')(x) - - if include_top: - x = layers.GlobalAveragePooling2D()(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - model = training.Model(inputs, x, name='NASNet') - - # Load weights. - if weights == 'imagenet': - if default_size == 224: # mobile version - if include_top: - weights_path = data_utils.get_file( - 'nasnet_mobile.h5', - NASNET_MOBILE_WEIGHT_PATH, - cache_subdir='models', - file_hash='020fb642bf7360b370c678b08e0adf61') - else: - weights_path = data_utils.get_file( - 'nasnet_mobile_no_top.h5', - NASNET_MOBILE_WEIGHT_PATH_NO_TOP, - cache_subdir='models', - file_hash='1ed92395b5b598bdda52abe5c0dbfd63') - model.load_weights(weights_path) - elif default_size == 331: # large version - if include_top: - weights_path = data_utils.get_file( - 'nasnet_large.h5', - NASNET_LARGE_WEIGHT_PATH, - cache_subdir='models', - file_hash='11577c9a518f0070763c2b964a382f17') - else: - weights_path = data_utils.get_file( - 'nasnet_large_no_top.h5', - NASNET_LARGE_WEIGHT_PATH_NO_TOP, - cache_subdir='models', - file_hash='d81d89dc07e6e56530c4e77faddd61b5') - model.load_weights(weights_path) - else: - raise ValueError('ImageNet weights can only be loaded with NASNetLarge' - ' or NASNetMobile') - elif weights is not None: - model.load_weights(weights) - - if old_data_format: - backend.set_image_data_format(old_data_format) - - return model - - -@keras_export('keras.applications.nasnet.NASNetMobile', - 'keras.applications.NASNetMobile') -def NASNetMobile(input_shape=None, - include_top=True, - weights='imagenet', - input_tensor=None, - pooling=None, - classes=1000, - classifier_activation='softmax'): - """Instantiates a Mobile NASNet model in ImageNet mode. 
- - Reference: - - [Learning Transferable Architectures for Scalable Image Recognition]( - https://arxiv.org/abs/1707.07012) (CVPR 2018) - - Optionally loads weights pre-trained on ImageNet. - Note that the data format convention used by the model is - the one specified in your Keras config at `~/.keras/keras.json`. - - Note: each Keras Application expects a specific kind of input preprocessing. - For NASNet, call `tf.keras.applications.nasnet.preprocess_input` on your - inputs before passing them to the model. - - Args: - input_shape: Optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(224, 224, 3)` for NASNetMobile - It should have exactly 3 inputs channels, - and width and height should be no smaller than 32. - E.g. `(224, 224, 3)` would be one valid value. - include_top: Whether to include the fully-connected - layer at the top of the network. - weights: `None` (random initialization) or - `imagenet` (ImageNet weights) - For loading `imagenet` weights, `input_shape` should be (224, 224, 3) - input_tensor: Optional Keras tensor (i.e. output of - `layers.Input()`) - to use as image input for the model. - pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model - will be the 4D tensor output of the - last convolutional layer. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional layer, and thus - the output of the model will be a - 2D tensor. - - `max` means that global max pooling will - be applied. - classes: Optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - - Returns: - A Keras model instance. - - Raises: - ValueError: In case of invalid argument for `weights`, - or invalid input shape. - RuntimeError: If attempting to run this model with a - backend that does not support separable convolutions. - """ - return NASNet( - input_shape, - penultimate_filters=1056, - num_blocks=4, - stem_block_filters=32, - skip_reduction=False, - filter_multiplier=2, - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - pooling=pooling, - classes=classes, - default_size=224, - classifier_activation=classifier_activation) - - -@keras_export('keras.applications.nasnet.NASNetLarge', - 'keras.applications.NASNetLarge') -def NASNetLarge(input_shape=None, - include_top=True, - weights='imagenet', - input_tensor=None, - pooling=None, - classes=1000, - classifier_activation='softmax'): - """Instantiates a NASNet model in ImageNet mode. - - Reference: - - [Learning Transferable Architectures for Scalable Image Recognition]( - https://arxiv.org/abs/1707.07012) (CVPR 2018) - - Optionally loads weights pre-trained on ImageNet. - Note that the data format convention used by the model is - the one specified in your Keras config at `~/.keras/keras.json`. - - Note: each Keras Application expects a specific kind of input preprocessing. - For NASNet, call `tf.keras.applications.nasnet.preprocess_input` on your - inputs before passing them to the model. 
- - Args: - input_shape: Optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(331, 331, 3)` for NASNetLarge. - It should have exactly 3 inputs channels, - and width and height should be no smaller than 32. - E.g. `(224, 224, 3)` would be one valid value. +def NASNet( + input_shape=None, + penultimate_filters=4032, + num_blocks=6, + stem_block_filters=96, + skip_reduction=True, + filter_multiplier=2, + include_top=True, + weights="imagenet", + input_tensor=None, + pooling=None, + classes=1000, + default_size=None, + classifier_activation="softmax", +): + """Instantiates a NASNet model. + + Reference: + - [Learning Transferable Architectures for Scalable Image Recognition]( + https://arxiv.org/abs/1707.07012) (CVPR 2018) + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + Note: each Keras Application expects a specific kind of input preprocessing. + For NasNet, call `tf.keras.applications.nasnet.preprocess_input` + on your inputs before passing them to the model. + `nasnet.preprocess_input` will scale input pixels between -1 and 1. + + Args: + input_shape: Optional shape tuple, the input shape + is by default `(331, 331, 3)` for NASNetLarge and + `(224, 224, 3)` for NASNetMobile. + It should have exactly 3 input channels, + and width and height should be no smaller than 32. + E.g. `(224, 224, 3)` would be one valid value. + penultimate_filters: Number of filters in the penultimate layer. + NASNet models use the notation `NASNet (N @ P)`, where: + - N is the number of blocks + - P is the number of penultimate filters + num_blocks: Number of repeated blocks of the NASNet model. + NASNet models use the notation `NASNet (N @ P)`, where: + - N is the number of blocks + - P is the number of penultimate filters + stem_block_filters: Number of filters in the initial stem block + skip_reduction: Whether to skip the reduction step at the tail + end of the network. + filter_multiplier: Controls the width of the network. + - If `filter_multiplier` < 1.0, proportionally decreases the number + of filters in each layer. + - If `filter_multiplier` > 1.0, proportionally increases the number + of filters in each layer. + - If `filter_multiplier` = 1, default number of filters from the + paper are used at each layer. include_top: Whether to include the fully-connected - layer at the top of the network. + layer at the top of the network. weights: `None` (random initialization) or `imagenet` (ImageNet weights) - For loading `imagenet` weights, `input_shape` should be (331, 331, 3) input_tensor: Optional Keras tensor (i.e. output of - `layers.Input()`) - to use as image input for the model. + `layers.Input()`) + to use as image input for the model. pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model - will be the 4D tensor output of the - last convolutional layer. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional layer, and thus - the output of the model will be a - 2D tensor. - - `max` means that global max pooling will - be applied. + when `include_top` is `False`. 
+ - `None` means that the output of the model + will be the 4D tensor output of the + last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a + 2D tensor. + - `max` means that global max pooling will + be applied. classes: Optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + default_size: Specifies the default image size of the model classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - - Returns: - A Keras model instance. - - Raises: - ValueError: in case of invalid argument for `weights`, - or invalid input shape. - RuntimeError: If attempting to run this model with a - backend that does not support separable convolutions. - """ - return NASNet( - input_shape, - penultimate_filters=4032, - num_blocks=6, - stem_block_filters=96, - skip_reduction=True, - filter_multiplier=2, - include_top=include_top, - weights=weights, - input_tensor=input_tensor, - pooling=pooling, - classes=classes, - default_size=331, - classifier_activation=classifier_activation) - - -def _separable_conv_block(ip, - filters, - kernel_size=(3, 3), - strides=(1, 1), - block_id=None): - """Adds 2 blocks of [relu-separable conv-batchnorm]. - - Args: - ip: Input tensor - filters: Number of output filters per layer - kernel_size: Kernel size of separable convolutions - strides: Strided convolution for downsampling - block_id: String block_id + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + + Returns: + A `keras.Model` instance. + """ + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top` ' + "as true, `classes` should be 1000" + ) + + if ( + isinstance(input_shape, tuple) + and None in input_shape + and weights == "imagenet" + ): + raise ValueError( + "When specifying the input shape of a NASNet" + " and loading `ImageNet` weights, " + "the input_shape argument must be static " + "(no None entries). Got: `input_shape=" + str(input_shape) + "`." + ) + + if default_size is None: + default_size = 331 + + # Determine proper input shape and default size. + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=default_size, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if backend.image_data_format() != "channels_last": + logging.warning( + "The NASNet family of models is only available " + 'for the input data format "channels_last" ' + "(width, height, channels). 
" + "However your settings specify the default " + 'data format "channels_first" (channels, width, height).' + ' You should set `image_data_format="channels_last"` ' + "in your Keras config located at ~/.keras/keras.json. " + "The model being returned right now will expect inputs " + 'to follow the "channels_last" data format.' + ) + backend.set_image_data_format("channels_last") + old_data_format = "channels_first" + else: + old_data_format = None - Returns: - A Keras tensor - """ - channel_dim = 1 if backend.image_data_format() == 'channels_first' else -1 - - with backend.name_scope('separable_conv_block_%s' % block_id): - x = layers.Activation('relu')(ip) - if strides == (2, 2): - x = layers.ZeroPadding2D( - padding=imagenet_utils.correct_pad(x, kernel_size), - name='separable_conv_1_pad_%s' % block_id)(x) - conv_pad = 'valid' + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - conv_pad = 'same' - x = layers.SeparableConv2D( - filters, - kernel_size, - strides=strides, - name='separable_conv_1_%s' % block_id, - padding=conv_pad, + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + if penultimate_filters % (24 * (filter_multiplier**2)) != 0: + raise ValueError( + "For NASNet-A models, the `penultimate_filters` must be a multiple " + "of 24 * (`filter_multiplier` ** 2). Current value: %d" + % penultimate_filters + ) + + channel_dim = 1 if backend.image_data_format() == "channels_first" else -1 + filters = penultimate_filters // 24 + + x = layers.Conv2D( + stem_block_filters, + (3, 3), + strides=(2, 2), + padding="valid", use_bias=False, - kernel_initializer='he_normal')( - x) - x = layers.BatchNormalization( - axis=channel_dim, - momentum=0.9997, - epsilon=1e-3, - name='separable_conv_1_bn_%s' % (block_id))( - x) - x = layers.Activation('relu')(x) - x = layers.SeparableConv2D( - filters, - kernel_size, - name='separable_conv_2_%s' % block_id, - padding='same', - use_bias=False, - kernel_initializer='he_normal')( - x) - x = layers.BatchNormalization( - axis=channel_dim, - momentum=0.9997, - epsilon=1e-3, - name='separable_conv_2_bn_%s' % (block_id))( - x) - return x - + name="stem_conv1", + kernel_initializer="he_normal", + )(img_input) -def _adjust_block(p, ip, filters, block_id=None): - """Adjusts the input `previous path` to match the shape of the `input`. - - Used in situations where the output number of filters needs to be changed. 
- - Args: - p: Input tensor which needs to be modified - ip: Input tensor whose shape needs to be matched - filters: Number of output filters to be matched - block_id: String block_id - - Returns: - Adjusted Keras tensor - """ - channel_dim = 1 if backend.image_data_format() == 'channels_first' else -1 - img_dim = 2 if backend.image_data_format() == 'channels_first' else -2 - - ip_shape = backend.int_shape(ip) - - if p is not None: - p_shape = backend.int_shape(p) - - with backend.name_scope('adjust_block'): - if p is None: - p = ip - - elif p_shape[img_dim] != ip_shape[img_dim]: - with backend.name_scope('adjust_reduction_block_%s' % block_id): - p = layers.Activation('relu', name='adjust_relu_1_%s' % block_id)(p) - p1 = layers.AveragePooling2D((1, 1), - strides=(2, 2), - padding='valid', - name='adjust_avg_pool_1_%s' % block_id)( - p) - p1 = layers.Conv2D( - filters // 2, (1, 1), - padding='same', + x = layers.BatchNormalization( + axis=channel_dim, momentum=0.9997, epsilon=1e-3, name="stem_bn1" + )(x) + + p = None + x, p = _reduction_a_cell( + x, p, filters // (filter_multiplier**2), block_id="stem_1" + ) + x, p = _reduction_a_cell( + x, p, filters // filter_multiplier, block_id="stem_2" + ) + + for i in range(num_blocks): + x, p = _normal_a_cell(x, p, filters, block_id="%d" % (i)) + + x, p0 = _reduction_a_cell( + x, p, filters * filter_multiplier, block_id="reduce_%d" % (num_blocks) + ) + + p = p0 if not skip_reduction else p + + for i in range(num_blocks): + x, p = _normal_a_cell( + x, + p, + filters * filter_multiplier, + block_id="%d" % (num_blocks + i + 1), + ) + + x, p0 = _reduction_a_cell( + x, + p, + filters * filter_multiplier**2, + block_id="reduce_%d" % (2 * num_blocks), + ) + + p = p0 if not skip_reduction else p + + for i in range(num_blocks): + x, p = _normal_a_cell( + x, + p, + filters * filter_multiplier**2, + block_id="%d" % (2 * num_blocks + i + 1), + ) + + x = layers.Activation("relu")(x) + + if include_top: + x = layers.GlobalAveragePooling2D()(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + model = training.Model(inputs, x, name="NASNet") + + # Load weights. 
+    if weights == "imagenet":
+        if default_size == 224:  # mobile version
+            if include_top:
+                weights_path = data_utils.get_file(
+                    "nasnet_mobile.h5",
+                    NASNET_MOBILE_WEIGHT_PATH,
+                    cache_subdir="models",
+                    file_hash="020fb642bf7360b370c678b08e0adf61",
+                )
+            else:
+                weights_path = data_utils.get_file(
+                    "nasnet_mobile_no_top.h5",
+                    NASNET_MOBILE_WEIGHT_PATH_NO_TOP,
+                    cache_subdir="models",
+                    file_hash="1ed92395b5b598bdda52abe5c0dbfd63",
+                )
+            model.load_weights(weights_path)
+        elif default_size == 331:  # large version
+            if include_top:
+                weights_path = data_utils.get_file(
+                    "nasnet_large.h5",
+                    NASNET_LARGE_WEIGHT_PATH,
+                    cache_subdir="models",
+                    file_hash="11577c9a518f0070763c2b964a382f17",
+                )
+            else:
+                weights_path = data_utils.get_file(
+                    "nasnet_large_no_top.h5",
+                    NASNET_LARGE_WEIGHT_PATH_NO_TOP,
+                    cache_subdir="models",
+                    file_hash="d81d89dc07e6e56530c4e77faddd61b5",
+                )
+            model.load_weights(weights_path)
+        else:
+            raise ValueError(
+                "ImageNet weights can only be loaded with NASNetLarge"
+                " or NASNetMobile"
+            )
+    elif weights is not None:
+        model.load_weights(weights)
+
+    if old_data_format:
+        backend.set_image_data_format(old_data_format)
+
+    return model
+
+
+@keras_export(
+    "keras.applications.nasnet.NASNetMobile", "keras.applications.NASNetMobile"
+)
+def NASNetMobile(
+    input_shape=None,
+    include_top=True,
+    weights="imagenet",
+    input_tensor=None,
+    pooling=None,
+    classes=1000,
+    classifier_activation="softmax",
+):
+    """Instantiates a Mobile NASNet model in ImageNet mode.
+
+    Reference:
+    - [Learning Transferable Architectures for Scalable Image Recognition](
+        https://arxiv.org/abs/1707.07012) (CVPR 2018)
+
+    Optionally loads weights pre-trained on ImageNet.
+    Note that the data format convention used by the model is
+    the one specified in your Keras config at `~/.keras/keras.json`.
+
+    Note: each Keras Application expects a specific kind of input preprocessing.
+    For NASNet, call `tf.keras.applications.nasnet.preprocess_input` on your
+    inputs before passing them to the model.
+
+    Args:
+        input_shape: Optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` for NASNetMobile).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        include_top: Whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights). For loading `imagenet` weights,
+            `input_shape` should be (224, 224, 3).
+        input_tensor: Optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: Optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        classifier_activation: A `str` or callable. The activation function
+            to use on the "top" layer. Ignored unless `include_top=True`.
+            Set `classifier_activation=None` to return the logits of the
+            "top" layer.
+            When loading pretrained weights, `classifier_activation`
+            can only be `None` or `"softmax"`.
+
+    Returns:
+        A Keras model instance.
+
+    Raises:
+        ValueError: In case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    return NASNet(
+        input_shape,
+        penultimate_filters=1056,
+        num_blocks=4,
+        stem_block_filters=32,
+        skip_reduction=False,
+        filter_multiplier=2,
+        include_top=include_top,
+        weights=weights,
+        input_tensor=input_tensor,
+        pooling=pooling,
+        classes=classes,
+        default_size=224,
+        classifier_activation=classifier_activation,
+    )
+
+
+@keras_export(
+    "keras.applications.nasnet.NASNetLarge", "keras.applications.NASNetLarge"
+)
+def NASNetLarge(
+    input_shape=None,
+    include_top=True,
+    weights="imagenet",
+    input_tensor=None,
+    pooling=None,
+    classes=1000,
+    classifier_activation="softmax",
+):
+    """Instantiates a NASNet model in ImageNet mode.
+
+    Reference:
+    - [Learning Transferable Architectures for Scalable Image Recognition](
+        https://arxiv.org/abs/1707.07012) (CVPR 2018)
+
+    Optionally loads weights pre-trained on ImageNet.
+    Note that the data format convention used by the model is
+    the one specified in your Keras config at `~/.keras/keras.json`.
+
+    Note: each Keras Application expects a specific kind of input preprocessing.
+    For NASNet, call `tf.keras.applications.nasnet.preprocess_input` on your
+    inputs before passing them to the model.
+
+    Args:
+        input_shape: Optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(331, 331, 3)` for NASNetLarge).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        include_top: Whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights). For loading `imagenet` weights,
+            `input_shape` should be (331, 331, 3).
+        input_tensor: Optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: Optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        classifier_activation: A `str` or callable. The activation function
+            to use on the "top" layer. Ignored unless `include_top=True`.
+            Set `classifier_activation=None` to return the logits of the
+            "top" layer. When loading pretrained weights,
+            `classifier_activation` can only be `None` or `"softmax"`.
+
+    Returns:
+        A Keras model instance.
+
+    Raises:
+        ValueError: In case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+ """ + return NASNet( + input_shape, + penultimate_filters=4032, + num_blocks=6, + stem_block_filters=96, + skip_reduction=True, + filter_multiplier=2, + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + pooling=pooling, + classes=classes, + default_size=331, + classifier_activation=classifier_activation, + ) + + +def _separable_conv_block( + ip, filters, kernel_size=(3, 3), strides=(1, 1), block_id=None +): + """Adds 2 blocks of [relu-separable conv-batchnorm]. + + Args: + ip: Input tensor + filters: Number of output filters per layer + kernel_size: Kernel size of separable convolutions + strides: Strided convolution for downsampling + block_id: String block_id + + Returns: + A Keras tensor + """ + channel_dim = 1 if backend.image_data_format() == "channels_first" else -1 + + with backend.name_scope(f"separable_conv_block_{block_id}"): + x = layers.Activation("relu")(ip) + if strides == (2, 2): + x = layers.ZeroPadding2D( + padding=imagenet_utils.correct_pad(x, kernel_size), + name=f"separable_conv_1_pad_{block_id}", + )(x) + conv_pad = "valid" + else: + conv_pad = "same" + x = layers.SeparableConv2D( + filters, + kernel_size, + strides=strides, + name=f"separable_conv_1_{block_id}", + padding=conv_pad, use_bias=False, - name='adjust_conv_1_%s' % block_id, - kernel_initializer='he_normal')( - p1) - - p2 = layers.ZeroPadding2D(padding=((0, 1), (0, 1)))(p) - p2 = layers.Cropping2D(cropping=((1, 0), (1, 0)))(p2) - p2 = layers.AveragePooling2D((1, 1), - strides=(2, 2), - padding='valid', - name='adjust_avg_pool_2_%s' % block_id)( - p2) - p2 = layers.Conv2D( - filters // 2, (1, 1), - padding='same', + kernel_initializer="he_normal", + )(x) + x = layers.BatchNormalization( + axis=channel_dim, + momentum=0.9997, + epsilon=1e-3, + name=f"separable_conv_1_bn_{block_id}", + )(x) + x = layers.Activation("relu")(x) + x = layers.SeparableConv2D( + filters, + kernel_size, + name=f"separable_conv_2_{block_id}", + padding="same", use_bias=False, - name='adjust_conv_2_%s' % block_id, - kernel_initializer='he_normal')( - p2) - - p = layers.concatenate([p1, p2], axis=channel_dim) - p = layers.BatchNormalization( + kernel_initializer="he_normal", + )(x) + x = layers.BatchNormalization( axis=channel_dim, momentum=0.9997, epsilon=1e-3, - name='adjust_bn_%s' % block_id)( - p) - - elif p_shape[channel_dim] != filters: - with backend.name_scope('adjust_projection_block_%s' % block_id): - p = layers.Activation('relu')(p) - p = layers.Conv2D( - filters, (1, 1), + name=f"separable_conv_2_bn_{block_id}", + )(x) + return x + + +def _adjust_block(p, ip, filters, block_id=None): + """Adjusts the input `previous path` to match the shape of the `input`. + + Used in situations where the output number of filters needs to be changed. 
+ + Args: + p: Input tensor which needs to be modified + ip: Input tensor whose shape needs to be matched + filters: Number of output filters to be matched + block_id: String block_id + + Returns: + Adjusted Keras tensor + """ + channel_dim = 1 if backend.image_data_format() == "channels_first" else -1 + img_dim = 2 if backend.image_data_format() == "channels_first" else -2 + + ip_shape = backend.int_shape(ip) + + if p is not None: + p_shape = backend.int_shape(p) + + with backend.name_scope("adjust_block"): + if p is None: + p = ip + + elif p_shape[img_dim] != ip_shape[img_dim]: + with backend.name_scope(f"adjust_reduction_block_{block_id}"): + p = layers.Activation("relu", name=f"adjust_relu_1_{block_id}")( + p + ) + p1 = layers.AveragePooling2D( + (1, 1), + strides=(2, 2), + padding="valid", + name=f"adjust_avg_pool_1_{block_id}", + )(p) + p1 = layers.Conv2D( + filters // 2, + (1, 1), + padding="same", + use_bias=False, + name=f"adjust_conv_1_{block_id}", + kernel_initializer="he_normal", + )(p1) + + p2 = layers.ZeroPadding2D(padding=((0, 1), (0, 1)))(p) + p2 = layers.Cropping2D(cropping=((1, 0), (1, 0)))(p2) + p2 = layers.AveragePooling2D( + (1, 1), + strides=(2, 2), + padding="valid", + name=f"adjust_avg_pool_2_{block_id}", + )(p2) + p2 = layers.Conv2D( + filters // 2, + (1, 1), + padding="same", + use_bias=False, + name=f"adjust_conv_2_{block_id}", + kernel_initializer="he_normal", + )(p2) + + p = layers.concatenate([p1, p2], axis=channel_dim) + p = layers.BatchNormalization( + axis=channel_dim, + momentum=0.9997, + epsilon=1e-3, + name=f"adjust_bn_{block_id}", + )(p) + + elif p_shape[channel_dim] != filters: + with backend.name_scope(f"adjust_projection_block_{block_id}"): + p = layers.Activation("relu")(p) + p = layers.Conv2D( + filters, + (1, 1), + strides=(1, 1), + padding="same", + name=f"adjust_conv_projection_{block_id}", + use_bias=False, + kernel_initializer="he_normal", + )(p) + p = layers.BatchNormalization( + axis=channel_dim, + momentum=0.9997, + epsilon=1e-3, + name=f"adjust_bn_{block_id}", + )(p) + return p + + +def _normal_a_cell(ip, p, filters, block_id=None): + """Adds a Normal cell for NASNet-A (Fig. 4 in the paper). 
+ + Args: + ip: Input tensor `x` + p: Input tensor `p` + filters: Number of output filters + block_id: String block_id + + Returns: + A Keras tensor + """ + channel_dim = 1 if backend.image_data_format() == "channels_first" else -1 + + with backend.name_scope(f"normal_A_block_{block_id}"): + p = _adjust_block(p, ip, filters, block_id) + + h = layers.Activation("relu")(ip) + h = layers.Conv2D( + filters, + (1, 1), strides=(1, 1), - padding='same', - name='adjust_conv_projection_%s' % block_id, + padding="same", + name=f"normal_conv_1_{block_id}", use_bias=False, - kernel_initializer='he_normal')( - p) - p = layers.BatchNormalization( + kernel_initializer="he_normal", + )(h) + h = layers.BatchNormalization( axis=channel_dim, momentum=0.9997, epsilon=1e-3, - name='adjust_bn_%s' % block_id)( - p) - return p + name=f"normal_bn_1_{block_id}", + )(h) + + with backend.name_scope("block_1"): + x1_1 = _separable_conv_block( + h, + filters, + kernel_size=(5, 5), + block_id=f"normal_left1_{block_id}", + ) + x1_2 = _separable_conv_block( + p, filters, block_id=f"normal_right1_{block_id}" + ) + x1 = layers.add([x1_1, x1_2], name=f"normal_add_1_{block_id}") + + with backend.name_scope("block_2"): + x2_1 = _separable_conv_block( + p, filters, (5, 5), block_id=f"normal_left2_{block_id}" + ) + x2_2 = _separable_conv_block( + p, filters, (3, 3), block_id=f"normal_right2_{block_id}" + ) + x2 = layers.add([x2_1, x2_2], name=f"normal_add_2_{block_id}") + + with backend.name_scope("block_3"): + x3 = layers.AveragePooling2D( + (3, 3), + strides=(1, 1), + padding="same", + name=f"normal_left3_{block_id}", + )(h) + x3 = layers.add([x3, p], name=f"normal_add_3_{block_id}") + + with backend.name_scope("block_4"): + x4_1 = layers.AveragePooling2D( + (3, 3), + strides=(1, 1), + padding="same", + name=f"normal_left4_{block_id}", + )(p) + x4_2 = layers.AveragePooling2D( + (3, 3), + strides=(1, 1), + padding="same", + name=f"normal_right4_{block_id}", + )(p) + x4 = layers.add([x4_1, x4_2], name=f"normal_add_4_{block_id}") + + with backend.name_scope("block_5"): + x5 = _separable_conv_block( + h, filters, block_id=f"normal_left5_{block_id}" + ) + x5 = layers.add([x5, h], name=f"normal_add_5_{block_id}") + + x = layers.concatenate( + [p, x1, x2, x3, x4, x5], + axis=channel_dim, + name=f"normal_concat_{block_id}", + ) + return x, ip -def _normal_a_cell(ip, p, filters, block_id=None): - """Adds a Normal cell for NASNet-A (Fig. 4 in the paper). +def _reduction_a_cell(ip, p, filters, block_id=None): + """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper). 
- Args: + Args: ip: Input tensor `x` p: Input tensor `p` filters: Number of output filters block_id: String block_id - Returns: + Returns: A Keras tensor - """ - channel_dim = 1 if backend.image_data_format() == 'channels_first' else -1 - - with backend.name_scope('normal_A_block_%s' % block_id): - p = _adjust_block(p, ip, filters, block_id) - - h = layers.Activation('relu')(ip) - h = layers.Conv2D( - filters, (1, 1), - strides=(1, 1), - padding='same', - name='normal_conv_1_%s' % block_id, - use_bias=False, - kernel_initializer='he_normal')( - h) - h = layers.BatchNormalization( - axis=channel_dim, - momentum=0.9997, - epsilon=1e-3, - name='normal_bn_1_%s' % block_id)( - h) - - with backend.name_scope('block_1'): - x1_1 = _separable_conv_block( - h, filters, kernel_size=(5, 5), block_id='normal_left1_%s' % block_id) - x1_2 = _separable_conv_block( - p, filters, block_id='normal_right1_%s' % block_id) - x1 = layers.add([x1_1, x1_2], name='normal_add_1_%s' % block_id) - - with backend.name_scope('block_2'): - x2_1 = _separable_conv_block( - p, filters, (5, 5), block_id='normal_left2_%s' % block_id) - x2_2 = _separable_conv_block( - p, filters, (3, 3), block_id='normal_right2_%s' % block_id) - x2 = layers.add([x2_1, x2_2], name='normal_add_2_%s' % block_id) - - with backend.name_scope('block_3'): - x3 = layers.AveragePooling2D((3, 3), - strides=(1, 1), - padding='same', - name='normal_left3_%s' % (block_id))( - h) - x3 = layers.add([x3, p], name='normal_add_3_%s' % block_id) - - with backend.name_scope('block_4'): - x4_1 = layers.AveragePooling2D((3, 3), - strides=(1, 1), - padding='same', - name='normal_left4_%s' % (block_id))( - p) - x4_2 = layers.AveragePooling2D((3, 3), - strides=(1, 1), - padding='same', - name='normal_right4_%s' % (block_id))( - p) - x4 = layers.add([x4_1, x4_2], name='normal_add_4_%s' % block_id) - - with backend.name_scope('block_5'): - x5 = _separable_conv_block( - h, filters, block_id='normal_left5_%s' % block_id) - x5 = layers.add([x5, h], name='normal_add_5_%s' % block_id) - - x = layers.concatenate([p, x1, x2, x3, x4, x5], - axis=channel_dim, - name='normal_concat_%s' % block_id) - return x, ip + """ + channel_dim = 1 if backend.image_data_format() == "channels_first" else -1 + with backend.name_scope(f"reduction_A_block_{block_id}"): + p = _adjust_block(p, ip, filters, block_id) -def _reduction_a_cell(ip, p, filters, block_id=None): - """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper). 
- - Args: - ip: Input tensor `x` - p: Input tensor `p` - filters: Number of output filters - block_id: String block_id - - Returns: - A Keras tensor - """ - channel_dim = 1 if backend.image_data_format() == 'channels_first' else -1 - - with backend.name_scope('reduction_A_block_%s' % block_id): - p = _adjust_block(p, ip, filters, block_id) - - h = layers.Activation('relu')(ip) - h = layers.Conv2D( - filters, (1, 1), - strides=(1, 1), - padding='same', - name='reduction_conv_1_%s' % block_id, - use_bias=False, - kernel_initializer='he_normal')( - h) - h = layers.BatchNormalization( - axis=channel_dim, - momentum=0.9997, - epsilon=1e-3, - name='reduction_bn_1_%s' % block_id)( - h) - h3 = layers.ZeroPadding2D( - padding=imagenet_utils.correct_pad(h, 3), - name='reduction_pad_1_%s' % block_id)( - h) - - with backend.name_scope('block_1'): - x1_1 = _separable_conv_block( - h, - filters, (5, 5), - strides=(2, 2), - block_id='reduction_left1_%s' % block_id) - x1_2 = _separable_conv_block( - p, - filters, (7, 7), - strides=(2, 2), - block_id='reduction_right1_%s' % block_id) - x1 = layers.add([x1_1, x1_2], name='reduction_add_1_%s' % block_id) - - with backend.name_scope('block_2'): - x2_1 = layers.MaxPooling2D((3, 3), - strides=(2, 2), - padding='valid', - name='reduction_left2_%s' % block_id)( - h3) - x2_2 = _separable_conv_block( - p, - filters, (7, 7), - strides=(2, 2), - block_id='reduction_right2_%s' % block_id) - x2 = layers.add([x2_1, x2_2], name='reduction_add_2_%s' % block_id) - - with backend.name_scope('block_3'): - x3_1 = layers.AveragePooling2D((3, 3), - strides=(2, 2), - padding='valid', - name='reduction_left3_%s' % block_id)( - h3) - x3_2 = _separable_conv_block( - p, - filters, (5, 5), - strides=(2, 2), - block_id='reduction_right3_%s' % block_id) - x3 = layers.add([x3_1, x3_2], name='reduction_add3_%s' % block_id) - - with backend.name_scope('block_4'): - x4 = layers.AveragePooling2D((3, 3), - strides=(1, 1), - padding='same', - name='reduction_left4_%s' % block_id)( - x1) - x4 = layers.add([x2, x4]) - - with backend.name_scope('block_5'): - x5_1 = _separable_conv_block( - x1, filters, (3, 3), block_id='reduction_left4_%s' % block_id) - x5_2 = layers.MaxPooling2D((3, 3), - strides=(2, 2), - padding='valid', - name='reduction_right5_%s' % block_id)( - h3) - x5 = layers.add([x5_1, x5_2], name='reduction_add4_%s' % block_id) - - x = layers.concatenate([x2, x3, x4, x5], - axis=channel_dim, - name='reduction_concat_%s' % block_id) - return x, ip + h = layers.Activation("relu")(ip) + h = layers.Conv2D( + filters, + (1, 1), + strides=(1, 1), + padding="same", + name=f"reduction_conv_1_{block_id}", + use_bias=False, + kernel_initializer="he_normal", + )(h) + h = layers.BatchNormalization( + axis=channel_dim, + momentum=0.9997, + epsilon=1e-3, + name=f"reduction_bn_1_{block_id}", + )(h) + h3 = layers.ZeroPadding2D( + padding=imagenet_utils.correct_pad(h, 3), + name=f"reduction_pad_1_{block_id}", + )(h) + + with backend.name_scope("block_1"): + x1_1 = _separable_conv_block( + h, + filters, + (5, 5), + strides=(2, 2), + block_id=f"reduction_left1_{block_id}", + ) + x1_2 = _separable_conv_block( + p, + filters, + (7, 7), + strides=(2, 2), + block_id=f"reduction_right1_{block_id}", + ) + x1 = layers.add([x1_1, x1_2], name=f"reduction_add_1_{block_id}") + + with backend.name_scope("block_2"): + x2_1 = layers.MaxPooling2D( + (3, 3), + strides=(2, 2), + padding="valid", + name=f"reduction_left2_{block_id}", + )(h3) + x2_2 = _separable_conv_block( + p, + filters, + (7, 7), + strides=(2, 2), + 
block_id=f"reduction_right2_{block_id}", + ) + x2 = layers.add([x2_1, x2_2], name=f"reduction_add_2_{block_id}") + + with backend.name_scope("block_3"): + x3_1 = layers.AveragePooling2D( + (3, 3), + strides=(2, 2), + padding="valid", + name=f"reduction_left3_{block_id}", + )(h3) + x3_2 = _separable_conv_block( + p, + filters, + (5, 5), + strides=(2, 2), + block_id=f"reduction_right3_{block_id}", + ) + x3 = layers.add([x3_1, x3_2], name=f"reduction_add3_{block_id}") + + with backend.name_scope("block_4"): + x4 = layers.AveragePooling2D( + (3, 3), + strides=(1, 1), + padding="same", + name=f"reduction_left4_{block_id}", + )(x1) + x4 = layers.add([x2, x4]) + + with backend.name_scope("block_5"): + x5_1 = _separable_conv_block( + x1, filters, (3, 3), block_id=f"reduction_left4_{block_id}" + ) + x5_2 = layers.MaxPooling2D( + (3, 3), + strides=(2, 2), + padding="valid", + name=f"reduction_right5_{block_id}", + )(h3) + x5 = layers.add([x5_1, x5_2], name=f"reduction_add4_{block_id}") + + x = layers.concatenate( + [x2, x3, x4, x5], + axis=channel_dim, + name=f"reduction_concat_{block_id}", + ) + return x, ip -@keras_export('keras.applications.nasnet.preprocess_input') +@keras_export("keras.applications.nasnet.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="tf" + ) -@keras_export('keras.applications.nasnet.decode_predictions') +@keras_export("keras.applications.nasnet.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/regnet.py b/keras/applications/regnet.py index de035d8b9279..0c8ee7de0670 100644 --- a/keras/applications/regnet.py +++ b/keras/applications/regnet.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=missing-docstring -# pylint: disable=g-classes-have-attributes + """RegNet models for Keras. 
@@ -26,89 +24,119 @@ (CVPR 2021) """ +import tensorflow.compat.v2 as tf + from keras import backend from keras import layers from keras.applications import imagenet_utils from keras.engine import training from keras.utils import data_utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHTS_PATH = "https://storage.googleapis.com/tensorflow/keras-applications/regnet/" +BASE_WEIGHTS_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/regnet/" +) WEIGHTS_HASHES = { - "x002": - ("49fb46e56cde07fdaf57bffd851461a86548f6a3a4baef234dd37290b826c0b8", - "5445b66cd50445eb7ecab094c1e78d4d3d29375439d1a7798861c4af15ffff21"), - "x004": - ("3523c7f5ac0dbbcc2fd6d83b3570e7540f7449d3301cc22c29547302114e4088", - "de139bf07a66c9256f2277bf5c1b6dd2d5a3a891a5f8a925a10c8a0a113fd6f3"), - "x006": - ("340216ef334a7bae30daac9f414e693c136fac9ab868704bbfcc9ce6a5ec74bb", - "a43ec97ad62f86b2a96a783bfdc63a5a54de02eef54f26379ea05e1bf90a9505"), - "x008": - ("8f145d6a5fae6da62677bb8d26eb92d0b9dfe143ec1ebf68b24a57ae50a2763d", - "3c7e4b0917359304dc18e644475c5c1f5e88d795542b676439c4a3acd63b7207"), - "x016": - ("31c386f4c7bfef4c021a583099aa79c1b3928057ba1b7d182f174674c5ef3510", - "1b8e3d545d190271204a7b2165936a227d26b79bb7922bac5ee4d303091bf17a"), - "x032": - ("6c025df1409e5ea846375bc9dfa240956cca87ef57384d93fef7d6fa90ca8c7f", - "9cd4522806c0fcca01b37874188b2bd394d7c419956d77472a4e072b01d99041"), - "x040": - ("ba128046c588a26dbd3b3a011b26cb7fa3cf8f269c184c132372cb20b6eb54c1", - "b4ed0ca0b9a98e789e05000e830403a7ade4d8afa01c73491c44610195198afe"), - "x064": - ("0f4489c3cd3ad979bd6b0324213998bcb36dc861d178f977997ebfe53c3ba564", - "3e706fa416a18dfda14c713423eba8041ae2509db3e0a611d5f599b5268a46c4"), - "x080": - ("76320e43272719df648db37271a247c22eb6e810fe469c37a5db7e2cb696d162", - "7b1ce8e29ceefec10a6569640ee329dba7fbc98b5d0f6346aabade058b66cf29"), - "x120": - ("5cafc461b78897d5e4f24e68cb406d18e75f31105ef620e7682b611bb355eb3a", - "36174ddd0299db04a42631d028abcb1cc7afec2b705e42bd28fcd325e5d596bf"), - "x160": - ("8093f57a5824b181fb734ea21ae34b1f7ee42c5298e63cf6d587c290973195d2", - "9d1485050bdf19531ffa1ed7827c75850e0f2972118a996b91aa9264b088fd43"), - "x320": - ("91fb3e6f4e9e44b3687e80977f7f4412ee9937c0c704232664fc83e4322ea01e", - "9db7eacc37b85c98184070e1a172e6104c00846f44bcd4e727da9e50d9692398"), - "y002": - ("1e8091c674532b1a61c04f6393a9c570113e0197f22bd1b98cc4c4fe800c6465", - "f63221f63d625b8e201221499682587bfe29d33f50a4c4f4d53be00f66c0f12c"), - "y004": - ("752fdbad21c78911bf1dcb8c513e5a0e14697b068e5d9e73525dbaa416d18d8e", - "45e6ba8309a17a77e67afc05228454b2e0ee6be0dae65edc0f31f1da10cc066b"), - "y006": - ("98942e07b273da500ff9699a1f88aca78dfad4375faabb0bab784bb0dace80a9", - "b70261cba4e60013c99d130cc098d2fce629ff978a445663b6fa4f8fc099a2be"), - "y008": - ("1b099377cc9a4fb183159a6f9b24bc998e5659d25a449f40c90cbffcbcfdcae4", - "b11f5432a216ee640fe9be6e32939defa8d08b8d136349bf3690715a98752ca1"), - "y016": - ("b7ce1f5e223f0941c960602de922bcf846288ce7a4c33b2a4f2e4ac4b480045b", - "d7404f50205e82d793e219afb9eb2bfeb781b6b2d316a6128c6d7d7dacab7f57"), - "y032": - ("6a6a545cf3549973554c9b94f0cd40e25f229fffb1e7f7ac779a59dcbee612bd", - "eb3ac1c45ec60f4f031c3f5180573422b1cf7bebc26c004637517372f68f8937"), - "y040": - ("98d00118b335162bbffe8f1329e54e5c8e75ee09b2a5414f97b0ddfc56e796f6", - "b5be2a5e5f072ecdd9c0b8a437cd896df0efa1f6a1f77e41caa8719b7dfcb05d"), - "y064": - 
("65c948c7a18aaecaad2d1bd4fd978987425604ba6669ef55a1faa0069a2804b7", - "885c4b7ed7ea339daca7dafa1a62cb7d41b1068897ef90a5a3d71b4a2e2db31a"), - "y080": - ("7a2c62da2982e369a4984d3c7c3b32d6f8d3748a71cb37a31156c436c37f3e95", - "3d119577e1e3bf8d153b895e8ea9e4ec150ff2d92abdca711b6e949c3fd7115d"), - "y120": - ("a96ab0d27d3ae35a422ee7df0d789069b3e3217a99334e0ce861a96595bc5986", - "4a6fa387108380b730b71feea2ad80b5224b5ea9dc21dc156c93fe3c6186485c"), - "y160": - ("45067240ffbc7ca2591313fee2f80dbdda6d66ec1a7451446f9a6d00d8f7ac6e", - "ead1e6b568be8f34447ec8941299a9df4368736ba9a8205de5427fa20a1fb316"), - "y320": ("b05e173e4ae635cfa22d06392ee3741284d17dadfee68f2aa6fd8cb2b7561112", - "cad78f74a586e24c61d38be17f3ae53bb9674380174d2585da1a526b8c20e1fd") + "x002": ( + "49fb46e56cde07fdaf57bffd851461a86548f6a3a4baef234dd37290b826c0b8", + "5445b66cd50445eb7ecab094c1e78d4d3d29375439d1a7798861c4af15ffff21", + ), + "x004": ( + "3523c7f5ac0dbbcc2fd6d83b3570e7540f7449d3301cc22c29547302114e4088", + "de139bf07a66c9256f2277bf5c1b6dd2d5a3a891a5f8a925a10c8a0a113fd6f3", + ), + "x006": ( + "340216ef334a7bae30daac9f414e693c136fac9ab868704bbfcc9ce6a5ec74bb", + "a43ec97ad62f86b2a96a783bfdc63a5a54de02eef54f26379ea05e1bf90a9505", + ), + "x008": ( + "8f145d6a5fae6da62677bb8d26eb92d0b9dfe143ec1ebf68b24a57ae50a2763d", + "3c7e4b0917359304dc18e644475c5c1f5e88d795542b676439c4a3acd63b7207", + ), + "x016": ( + "31c386f4c7bfef4c021a583099aa79c1b3928057ba1b7d182f174674c5ef3510", + "1b8e3d545d190271204a7b2165936a227d26b79bb7922bac5ee4d303091bf17a", + ), + "x032": ( + "6c025df1409e5ea846375bc9dfa240956cca87ef57384d93fef7d6fa90ca8c7f", + "9cd4522806c0fcca01b37874188b2bd394d7c419956d77472a4e072b01d99041", + ), + "x040": ( + "ba128046c588a26dbd3b3a011b26cb7fa3cf8f269c184c132372cb20b6eb54c1", + "b4ed0ca0b9a98e789e05000e830403a7ade4d8afa01c73491c44610195198afe", + ), + "x064": ( + "0f4489c3cd3ad979bd6b0324213998bcb36dc861d178f977997ebfe53c3ba564", + "3e706fa416a18dfda14c713423eba8041ae2509db3e0a611d5f599b5268a46c4", + ), + "x080": ( + "76320e43272719df648db37271a247c22eb6e810fe469c37a5db7e2cb696d162", + "7b1ce8e29ceefec10a6569640ee329dba7fbc98b5d0f6346aabade058b66cf29", + ), + "x120": ( + "5cafc461b78897d5e4f24e68cb406d18e75f31105ef620e7682b611bb355eb3a", + "36174ddd0299db04a42631d028abcb1cc7afec2b705e42bd28fcd325e5d596bf", + ), + "x160": ( + "8093f57a5824b181fb734ea21ae34b1f7ee42c5298e63cf6d587c290973195d2", + "9d1485050bdf19531ffa1ed7827c75850e0f2972118a996b91aa9264b088fd43", + ), + "x320": ( + "91fb3e6f4e9e44b3687e80977f7f4412ee9937c0c704232664fc83e4322ea01e", + "9db7eacc37b85c98184070e1a172e6104c00846f44bcd4e727da9e50d9692398", + ), + "y002": ( + "1e8091c674532b1a61c04f6393a9c570113e0197f22bd1b98cc4c4fe800c6465", + "f63221f63d625b8e201221499682587bfe29d33f50a4c4f4d53be00f66c0f12c", + ), + "y004": ( + "752fdbad21c78911bf1dcb8c513e5a0e14697b068e5d9e73525dbaa416d18d8e", + "45e6ba8309a17a77e67afc05228454b2e0ee6be0dae65edc0f31f1da10cc066b", + ), + "y006": ( + "98942e07b273da500ff9699a1f88aca78dfad4375faabb0bab784bb0dace80a9", + "b70261cba4e60013c99d130cc098d2fce629ff978a445663b6fa4f8fc099a2be", + ), + "y008": ( + "1b099377cc9a4fb183159a6f9b24bc998e5659d25a449f40c90cbffcbcfdcae4", + "b11f5432a216ee640fe9be6e32939defa8d08b8d136349bf3690715a98752ca1", + ), + "y016": ( + "b7ce1f5e223f0941c960602de922bcf846288ce7a4c33b2a4f2e4ac4b480045b", + "d7404f50205e82d793e219afb9eb2bfeb781b6b2d316a6128c6d7d7dacab7f57", + ), + "y032": ( + "6a6a545cf3549973554c9b94f0cd40e25f229fffb1e7f7ac779a59dcbee612bd", + 
"eb3ac1c45ec60f4f031c3f5180573422b1cf7bebc26c004637517372f68f8937", + ), + "y040": ( + "98d00118b335162bbffe8f1329e54e5c8e75ee09b2a5414f97b0ddfc56e796f6", + "b5be2a5e5f072ecdd9c0b8a437cd896df0efa1f6a1f77e41caa8719b7dfcb05d", + ), + "y064": ( + "65c948c7a18aaecaad2d1bd4fd978987425604ba6669ef55a1faa0069a2804b7", + "885c4b7ed7ea339daca7dafa1a62cb7d41b1068897ef90a5a3d71b4a2e2db31a", + ), + "y080": ( + "7a2c62da2982e369a4984d3c7c3b32d6f8d3748a71cb37a31156c436c37f3e95", + "3d119577e1e3bf8d153b895e8ea9e4ec150ff2d92abdca711b6e949c3fd7115d", + ), + "y120": ( + "a96ab0d27d3ae35a422ee7df0d789069b3e3217a99334e0ce861a96595bc5986", + "4a6fa387108380b730b71feea2ad80b5224b5ea9dc21dc156c93fe3c6186485c", + ), + "y160": ( + "45067240ffbc7ca2591313fee2f80dbdda6d66ec1a7451446f9a6d00d8f7ac6e", + "ead1e6b568be8f34447ec8941299a9df4368736ba9a8205de5427fa20a1fb316", + ), + "y320": ( + "b05e173e4ae635cfa22d06392ee3741284d17dadfee68f2aa6fd8cb2b7561112", + "cad78f74a586e24c61d38be17f3ae53bb9674380174d2585da1a526b8c20e1fd", + ), } # The widths and depths are deduced from a quantized linear function. For @@ -123,168 +151,168 @@ "widths": [24, 56, 152, 368], "group_width": 8, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x004": { "depths": [1, 2, 7, 12], "widths": [32, 64, 160, 384], "group_width": 16, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x006": { "depths": [1, 3, 5, 7], "widths": [48, 96, 240, 528], "group_width": 24, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x008": { "depths": [1, 3, 7, 5], "widths": [64, 128, 288, 672], "group_width": 16, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x016": { "depths": [2, 4, 10, 2], "widths": [72, 168, 408, 912], "group_width": 24, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x032": { "depths": [2, 6, 15, 2], "widths": [96, 192, 432, 1008], "group_width": 48, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x040": { "depths": [2, 5, 14, 2], "widths": [80, 240, 560, 1360], "group_width": 40, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x064": { "depths": [2, 4, 10, 1], "widths": [168, 392, 784, 1624], "group_width": 56, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x080": { "depths": [2, 5, 15, 1], "widths": [80, 240, 720, 1920], "group_width": 120, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x120": { "depths": [2, 5, 11, 1], "widths": [224, 448, 896, 2240], "group_width": 112, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x160": { "depths": [2, 6, 13, 1], "widths": [256, 512, 896, 2048], "group_width": 128, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "x320": { "depths": [2, 7, 13, 1], "widths": [336, 672, 1344, 2520], "group_width": 168, "default_size": 224, - "block_type": "X" + "block_type": "X", }, "y002": { "depths": [1, 1, 4, 7], "widths": [24, 56, 152, 368], "group_width": 8, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y004": { "depths": [1, 3, 6, 6], "widths": [48, 104, 208, 440], "group_width": 8, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y006": { "depths": [1, 3, 7, 4], "widths": [48, 112, 256, 608], "group_width": 16, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y008": { "depths": [1, 3, 8, 2], "widths": [64, 128, 320, 768], "group_width": 16, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y016": { "depths": [2, 6, 17, 2], "widths": [48, 120, 336, 888], 
"group_width": 24, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y032": { "depths": [2, 5, 13, 1], "widths": [72, 216, 576, 1512], "group_width": 24, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y040": { "depths": [2, 6, 12, 2], "widths": [128, 192, 512, 1088], "group_width": 64, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y064": { "depths": [2, 7, 14, 2], "widths": [144, 288, 576, 1296], "group_width": 72, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y080": { "depths": [2, 4, 10, 1], "widths": [168, 448, 896, 2016], "group_width": 56, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y120": { "depths": [2, 5, 11, 1], "widths": [224, 448, 896, 2240], "group_width": 112, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y160": { "depths": [2, 4, 11, 1], "widths": [224, 448, 1232, 3024], "group_width": 112, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, "y320": { "depths": [2, 5, 12, 1], "widths": [232, 696, 1392, 3712], "group_width": 232, "default_size": 224, - "block_type": "Y" + "block_type": "Y", }, } @@ -314,7 +342,7 @@ Args: include_top: Whether to include the fully-connected - layer at the top of the network. Defaults to True. + layer at the top of the network. Defaults to `True`. weights: One of `None` (random initialization), `"imagenet"` (pre-training on ImageNet), or the path to the weights file to be loaded. Defaults to `"imagenet"`. @@ -325,7 +353,7 @@ if `include_top` is False. It should have exactly 3 inputs channels. pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. Defaults to None. + when `include_top` is `False`. - `None` means that the output of the model will be the 4D tensor output of the last convolutional layer. @@ -335,16 +363,16 @@ the output of the model will be a 2D tensor. - `max` means that global max pooling will be applied. + Defaults to `None`. classes: Optional number of classes to classify images into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. Defaults to 1000 (number of - ImageNet classes). + if no `weights` argument is specified. 1000 is how many + ImageNet classes there are. Defaults to `1000`. classifier_activation: A `str` or callable. The activation function to use on the "top" layer. Ignored unless `include_top=True`. Set `classifier_activation=None` to return the logits of the "top" layer. - Defaults to `"softmax"`. When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. + be `None` or `"softmax"`. Defaults to `"softmax"`. Returns: A `keras.Model` instance. @@ -352,1230 +380,1405 @@ def PreStem(name=None): - """Rescales and normalizes inputs to [0,1] and ImageNet mean and std. + """Rescales and normalizes inputs to [0,1] and ImageNet mean and std. - Args: - name: name prefix + Args: + name: name prefix - Returns: - Rescaled and normalized tensor - """ - if name is None: - name = "prestem" + str(backend.get_uid("prestem")) + Returns: + Rescaled and normalized tensor + """ + if name is None: + name = "prestem" + str(backend.get_uid("prestem")) - def apply(x): - x = layers.Rescaling(scale=1. / 255., name=name + "_prestem_rescaling")(x) - return x + def apply(x): + x = layers.Rescaling( + scale=1.0 / 255.0, name=name + "_prestem_rescaling" + )(x) + return x - return apply + return apply def Stem(name=None): - """Implementation of RegNet stem. 
- - (Common to all model variants) - Args: - name: name prefix - - Returns: - Output tensor of the Stem - """ - if name is None: - name = "stem" + str(backend.get_uid("stem")) - - def apply(x): - x = layers.Conv2D( - 32, (3, 3), - strides=2, - use_bias=False, - padding="same", - kernel_initializer="he_normal", - name=name + "_stem_conv")(x) - x = layers.BatchNormalization( - momentum=0.9, epsilon=1e-5, name=name + "_stem_bn")(x) - x = layers.ReLU(name=name + "_stem_relu")(x) - return x - - return apply + """Implementation of RegNet stem. + + (Common to all model variants) + Args: + name: name prefix + + Returns: + Output tensor of the Stem + """ + if name is None: + name = "stem" + str(backend.get_uid("stem")) + + def apply(x): + x = layers.Conv2D( + 32, + (3, 3), + strides=2, + use_bias=False, + padding="same", + kernel_initializer="he_normal", + name=name + "_stem_conv", + )(x) + x = layers.BatchNormalization( + momentum=0.9, epsilon=1e-5, name=name + "_stem_bn" + )(x) + x = layers.ReLU(name=name + "_stem_relu")(x) + return x + + return apply def SqueezeAndExciteBlock(filters_in, se_filters, name=None): - """Implements the Squeeze and excite block (https://arxiv.org/abs/1709.01507). - - Args: - filters_in: input filters to the block - se_filters: filters to squeeze to - name: name prefix - - Returns: - A function object - """ - if name is None: - name = str(backend.get_uid("squeeze_and_excite")) - - def apply(inputs): - x = layers.GlobalAveragePooling2D( - name=name + "_squeeze_and_excite_gap", keepdims=True)(inputs) - x = layers.Conv2D( - se_filters, (1, 1), - activation="relu", - kernel_initializer="he_normal", - name=name + "_squeeze_and_excite_squeeze")(x) - x = layers.Conv2D( - filters_in, (1, 1), - activation="sigmoid", - kernel_initializer="he_normal", - name=name + "_squeeze_and_excite_excite")(x) - x = tf.math.multiply(x, inputs) - return x - - return apply + """Implements the Squeeze & Excite block (https://arxiv.org/abs/1709.01507). + + Args: + filters_in: input filters to the block + se_filters: filters to squeeze to + name: name prefix + + Returns: + A function object + """ + if name is None: + name = str(backend.get_uid("squeeze_and_excite")) + + def apply(inputs): + x = layers.GlobalAveragePooling2D( + name=name + "_squeeze_and_excite_gap", keepdims=True + )(inputs) + x = layers.Conv2D( + se_filters, + (1, 1), + activation="relu", + kernel_initializer="he_normal", + name=name + "_squeeze_and_excite_squeeze", + )(x) + x = layers.Conv2D( + filters_in, + (1, 1), + activation="sigmoid", + kernel_initializer="he_normal", + name=name + "_squeeze_and_excite_excite", + )(x) + x = tf.math.multiply(x, inputs) + return x + + return apply def XBlock(filters_in, filters_out, group_width, stride=1, name=None): - """Implementation of X Block. + """Implementation of X Block. + + Reference: [Designing Network Design + Spaces](https://arxiv.org/abs/2003.13678) + Args: + filters_in: filters in the input tensor + filters_out: filters in the output tensor + group_width: group width + stride: stride + name: name prefix + Returns: + Output tensor of the block + """ + if name is None: + name = str(backend.get_uid("xblock")) + + def apply(inputs): + if filters_in != filters_out and stride == 1: + raise ValueError( + f"Input filters({filters_in}) and output " + f"filters({filters_out}) " + f"are not equal for stride {stride}. Input and output filters " + f"must be equal for stride={stride}." 
+            )
+
+        # Declare layers
+        groups = filters_out // group_width
+
+        if stride != 1:
+            skip = layers.Conv2D(
+                filters_out,
+                (1, 1),
+                strides=stride,
+                use_bias=False,
+                kernel_initializer="he_normal",
+                name=name + "_skip_1x1",
+            )(inputs)
+            skip = layers.BatchNormalization(
+                momentum=0.9, epsilon=1e-5, name=name + "_skip_bn"
+            )(skip)
+        else:
+            skip = inputs
+
+        # Build block
+        # conv_1x1_1
+        x = layers.Conv2D(
+            filters_out,
+            (1, 1),
+            use_bias=False,
+            kernel_initializer="he_normal",
+            name=name + "_conv_1x1_1",
+        )(inputs)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_1_bn"
+        )(x)
+        x = layers.ReLU(name=name + "_conv_1x1_1_relu")(x)
+
+        # conv_3x3
+        x = layers.Conv2D(
+            filters_out,
+            (3, 3),
+            use_bias=False,
+            strides=stride,
+            groups=groups,
+            padding="same",
+            kernel_initializer="he_normal",
+            name=name + "_conv_3x3",
+        )(x)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_3x3_bn"
+        )(x)
+        x = layers.ReLU(name=name + "_conv_3x3_relu")(x)
+
+        # conv_1x1_2
+        x = layers.Conv2D(
+            filters_out,
+            (1, 1),
+            use_bias=False,
+            kernel_initializer="he_normal",
+            name=name + "_conv_1x1_2",
+        )(x)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_2_bn"
+        )(x)
+
+        x = layers.ReLU(name=name + "_exit_relu")(x + skip)
+
+        return x
+
+    return apply
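`XBlock` (like the other block builders in this file) returns a closure over Keras layers rather than a `Layer` instance, so it composes directly in the functional API. A minimal sketch with illustrative shapes, not taken from this diff:

    inputs = layers.Input((28, 28, 56))
    outputs = XBlock(56, 56, group_width=8)(inputs)  # stride 1: identity skip
    block = training.Model(inputs, outputs)
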
+
+
+def YBlock(
+    filters_in,
+    filters_out,
+    group_width,
+    stride=1,
+    squeeze_excite_ratio=0.25,
+    name=None,
+):
+    """Implementation of Y Block.
+
+    Reference: [Designing Network Design
+    Spaces](https://arxiv.org/abs/2003.13678)
+    Args:
+      filters_in: filters in the input tensor
+      filters_out: filters in the output tensor
+      group_width: group width
+      stride: stride
+      squeeze_excite_ratio: expansion ratio for Squeeze and Excite block
+      name: name prefix
+    Returns:
+      Output tensor of the block
+    """
+    if name is None:
+        name = str(backend.get_uid("yblock"))
+
+    def apply(inputs):
+        if filters_in != filters_out and stride == 1:
+            raise ValueError(
+                f"Input filters({filters_in}) and output "
+                f"filters({filters_out}) "
+                f"are not equal for stride {stride}. Input and output filters "
+                f"must be equal for stride={stride}."
+            )
+
+        groups = filters_out // group_width
+        se_filters = int(filters_in * squeeze_excite_ratio)
+
+        if stride != 1:
+            skip = layers.Conv2D(
+                filters_out,
+                (1, 1),
+                strides=stride,
+                use_bias=False,
+                kernel_initializer="he_normal",
+                name=name + "_skip_1x1",
+            )(inputs)
+            skip = layers.BatchNormalization(
+                momentum=0.9, epsilon=1e-5, name=name + "_skip_bn"
+            )(skip)
+        else:
+            skip = inputs
+
+        # Build block
+        # conv_1x1_1
+        x = layers.Conv2D(
+            filters_out,
+            (1, 1),
+            use_bias=False,
+            kernel_initializer="he_normal",
+            name=name + "_conv_1x1_1",
+        )(inputs)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_1_bn"
+        )(x)
+        x = layers.ReLU(name=name + "_conv_1x1_1_relu")(x)
+
+        # conv_3x3
+        x = layers.Conv2D(
+            filters_out,
+            (3, 3),
+            use_bias=False,
+            strides=stride,
+            groups=groups,
+            padding="same",
+            kernel_initializer="he_normal",
+            name=name + "_conv_3x3",
+        )(x)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_3x3_bn"
+        )(x)
+        x = layers.ReLU(name=name + "_conv_3x3_relu")(x)
+
+        # Squeeze-Excitation block
+        x = SqueezeAndExciteBlock(filters_out, se_filters, name=name)(x)
+
+        # conv_1x1_2
+        x = layers.Conv2D(
+            filters_out,
+            (1, 1),
+            use_bias=False,
+            kernel_initializer="he_normal",
+            name=name + "_conv_1x1_2",
+        )(x)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_2_bn"
+        )(x)
+
+        x = layers.ReLU(name=name + "_exit_relu")(x + skip)
+
+        return x
+
+    return apply
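The Y block is the X block plus a Squeeze-and-Excite stage after `conv_3x3`; with the default `squeeze_excite_ratio` of 0.25, a block fed `filters_in=64` squeezes to `int(64 * 0.25) = 16` channels before re-expanding. Sketch with illustrative shapes:

    feature_map = layers.Input((28, 28, 64))
    y = YBlock(64, 128, group_width=16, stride=2)(feature_map)
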
+
+
+def ZBlock(
+    filters_in,
+    filters_out,
+    group_width,
+    stride=1,
+    squeeze_excite_ratio=0.25,
+    bottleneck_ratio=0.25,
+    name=None,
+):
+    """Implementation of Z block. Reference: [Fast and Accurate Model
+    Scaling](https://arxiv.org/abs/2103.06877).
+
+    Args:
+      filters_in: filters in the input tensor
+      filters_out: filters in the output tensor
+      group_width: group width
+      stride: stride
+      squeeze_excite_ratio: expansion ratio for Squeeze and Excite block
+      bottleneck_ratio: inverted bottleneck ratio
+      name: name prefix
+    Returns:
+      Output tensor of the block
+    """
+    if name is None:
+        name = str(backend.get_uid("zblock"))
+
+    def apply(inputs):
+        if filters_in != filters_out and stride == 1:
+            raise ValueError(
+                f"Input filters({filters_in}) and output "
+                f"filters({filters_out}) "
+                f"are not equal for stride {stride}. Input and output filters "
+                f"must be equal for stride={stride}."
+            )
+
+        groups = filters_out // group_width
+        se_filters = int(filters_in * squeeze_excite_ratio)
+
+        inv_btlneck_filters = int(filters_out / bottleneck_ratio)
+
+        # Build block
+        # conv_1x1_1
+        x = layers.Conv2D(
+            inv_btlneck_filters,
+            (1, 1),
+            use_bias=False,
+            kernel_initializer="he_normal",
+            name=name + "_conv_1x1_1",
+        )(inputs)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_1_bn"
+        )(x)
+        x = tf.nn.silu(x)
+
+        # conv_3x3
+        x = layers.Conv2D(
+            inv_btlneck_filters,
+            (3, 3),
+            use_bias=False,
+            strides=stride,
+            groups=groups,
+            padding="same",
+            kernel_initializer="he_normal",
+            name=name + "_conv_3x3",
+        )(x)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_3x3_bn"
+        )(x)
+        x = tf.nn.silu(x)
+
+        # Squeeze-Excitation block (applied to x; the block builder returns a
+        # closure, so it must be called on the tensor)
+        x = SqueezeAndExciteBlock(inv_btlneck_filters, se_filters, name=name)(
+            x
+        )
+
+        # conv_1x1_2
+        x = layers.Conv2D(
+            filters_out,
+            (1, 1),
+            use_bias=False,
+            kernel_initializer="he_normal",
+            name=name + "_conv_1x1_2",
+        )(x)
+        x = layers.BatchNormalization(
+            momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_2_bn"
+        )(x)
+
+        if stride != 1:
+            return x
+        else:
+            return x + inputs
+
+    return apply
 
 
-  Reference: [Designing Network Design
-    Spaces](https://arxiv.org/abs/2003.13678)
-  Args:
-    filters_in: filters in the input tensor
-    filters_out: filters in the output tensor
-    group_width: group width
-    stride: stride
-    name: name prefix
-  Returns:
-    Output tensor of the block
-  """
-  if name is None:
-    name = str(backend.get_uid("xblock"))
-
-  def apply(inputs):
-    if filters_in != filters_out and stride == 1:
-      raise ValueError(
-          f"Input filters({filters_in}) and output filters({filters_out}) "
-          f"are not equal for stride {stride}. Input and output filters must "
-          f"be equal for stride={stride}.")
-
-    # Declare layers
-    groups = filters_out // group_width
-
-    if stride != 1:
-      skip = layers.Conv2D(
-          filters_out, (1, 1),
-          strides=stride,
-          use_bias=False,
-          kernel_initializer="he_normal",
-          name=name + "_skip_1x1")(inputs)
-      skip = layers.BatchNormalization(
-          momentum=0.9, epsilon=1e-5, name=name + "_skip_bn")(skip)
-    else:
-      skip = inputs
-
-    # Build block
-    # conv_1x1_1
-    x = layers.Conv2D(
-        filters_out, (1, 1),
-        use_bias=False,
-        kernel_initializer="he_normal",
-        name=name + "_conv_1x1_1")(inputs)
-    x = layers.BatchNormalization(
-        momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_1_bn")(x)
-    x = layers.ReLU(name=name + "_conv_1x1_1_relu")(x)
-
-    # conv_3x3
-    x = layers.Conv2D(
-        filters_out, (3, 3),
-        use_bias=False,
-        strides=stride,
-        groups=groups,
-        padding="same",
-        kernel_initializer="he_normal",
-        name=name + "_conv_3x3")(x)
-    x = layers.BatchNormalization(
-        momentum=0.9, epsilon=1e-5, name=name + "_conv_3x3_bn")(x)
-    x = layers.ReLU(name=name + "_conv_3x3_relu")(x)
-
-    # conv_1x1_2
-    x = layers.Conv2D(
-        filters_out, (1, 1),
-        use_bias=False,
-        kernel_initializer="he_normal",
-        name=name + "_conv_1x1_2")(x)
-    x = layers.BatchNormalization(
-        momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_2_bn")(x)
-
-    x = layers.ReLU(name=name + "_exit_relu")(x + skip)
-    return x
-
-  return apply
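No entry in MODEL_CONFIGS sets `block_type` to `"Z"`, so `ZBlock` is reachable only through a direct `RegNet(..., block_type="Z")` call. A minimal sketch with illustrative shapes:

    x_in = layers.Input((14, 14, 64))
    z = ZBlock(64, 64, group_width=8)(x_in)  # stride 1, so the input is re-added
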
+def Stage(block_type, depth, group_width, filters_in, filters_out, name=None):
+    """Implementation of Stage in RegNet.
+
+    Args:
+      block_type: must be one of "X", "Y", "Z"
+      depth: depth of stage, number of blocks to use
+      group_width: group width of all blocks in this stage
+      filters_in: input filters to this stage
+      filters_out: output filters from this stage
+      name: name prefix
+
+    Returns:
+      Output tensor of Stage
+    """
+    if name is None:
+        name = str(backend.get_uid("stage"))
+
+    def apply(inputs):
+        x = inputs
+        if block_type == "X":
+            x = XBlock(
+                filters_in,
+                filters_out,
+                group_width,
+                stride=2,
+                name=f"{name}_XBlock_0",
+            )(x)
+            for i in range(1, depth):
+                x = XBlock(
+                    filters_out,
+                    filters_out,
+                    group_width,
+                    name=f"{name}_XBlock_{i}",
+                )(x)
+        elif block_type == "Y":
+            x = YBlock(
+                filters_in,
+                filters_out,
+                group_width,
+                stride=2,
+                name=name + "_YBlock_0",
+            )(x)
+            for i in range(1, depth):
+                x = YBlock(
+                    filters_out,
+                    filters_out,
+                    group_width,
+                    name=f"{name}_YBlock_{i}",
+                )(x)
+        elif block_type == "Z":
+            x = ZBlock(
+                filters_in,
+                filters_out,
+                group_width,
+                stride=2,
+                name=f"{name}_ZBlock_0",
+            )(x)
+            for i in range(1, depth):
+                x = ZBlock(
+                    filters_out,
+                    filters_out,
+                    group_width,
+                    name=f"{name}_ZBlock_{i}",
+                )(x)
+        else:
+            raise NotImplementedError(
+                f"Block type `{block_type}` not recognized. "
+                "block_type must be one of (`X`, `Y`, `Z`). "
+            )
+        return x
+
+    return apply
 
 
-def YBlock(filters_in,
-           filters_out,
-           group_width,
-           stride=1,
-           squeeze_excite_ratio=0.25,
-           name=None):
-  """Implementation of Y Block.
+def Head(num_classes=1000, name=None):
+    """Implementation of classification head of RegNet.
+
+    Args:
+      num_classes: number of classes for Dense layer
+      name: name prefix
+
+    Returns:
+      Classification head function.
+    """
+    if name is None:
+        name = str(backend.get_uid("head"))
+
+    def apply(x):
+        x = layers.GlobalAveragePooling2D(name=name + "_head_gap")(x)
+        x = layers.Dense(num_classes, name=name + "head_dense")(x)
+        return x
+
+    return apply
+
+
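Note that the `Dense` layer in `Head` carries no activation, so in the code shown here the top of the network emits logits; `classifier_activation` is validated against the weights further down but is not attached to this layer. A shape sketch with illustrative numbers:

    feats = layers.Input((7, 7, 368))
    logits = Head(num_classes=10)(feats)  # shape (None, 10), no softmax applied
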
+def RegNet(
+    depths,
+    widths,
+    group_width,
+    block_type,
+    default_size,
+    model_name="regnet",
+    include_preprocessing=True,
+    include_top=True,
+    weights="imagenet",
+    input_tensor=None,
+    input_shape=None,
+    pooling=None,
+    classes=1000,
+    classifier_activation="softmax",
+):
+    """Instantiates RegNet architecture given specific configuration.
+
+    Args:
+      depths: An iterable containing depths for each individual stage.
+      widths: An iterable containing output channel width of each individual
+        stage
+      group_width: Number of channels to be used in each group. See grouped
+        convolutions for more information.
+      block_type: Must be one of `{"X", "Y", "Z"}`. For more details see the
+        papers "Designing network design spaces" and "Fast and Accurate Model
+        Scaling"
+      default_size: Default input image size.
+      model_name: An optional name for the model.
+      include_preprocessing: boolean denoting whether to include preprocessing
+        in the model
+      include_top: Boolean denoting whether to include classification head in
+        the model.
+      weights: one of `None` (random initialization), "imagenet" (pre-training
+        on ImageNet), or the path to the weights file to be loaded.
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to
+        use as image input for the model.
+      input_shape: optional shape tuple, only to be specified if `include_top`
+        is False. It should have exactly 3 input channels.
+      pooling: optional pooling mode for feature extraction when `include_top`
+        is `False`.
+        - `None` means that the output of the model will be the 4D tensor
+          output of the last convolutional layer.
+        - `avg` means that global average pooling will be applied to the
+          output of the last convolutional layer, and thus the output of the
+          model will be a 2D tensor.
+        - `max` means that global max pooling will be applied.
+      classes: optional number of classes to classify images into, only to be
+        specified if `include_top` is True, and if no `weights` argument is
+        specified.
+      classifier_activation: A `str` or callable. The activation function to use
+        on the "top" layer. Ignored unless `include_top=True`. Set
+        `classifier_activation=None` to return the logits of the "top" layer.
 
-  Reference: [Designing Network Design
-    Spaces](https://arxiv.org/abs/2003.13678)
-  Args:
-    filters_in: filters in the input tensor
-    filters_out: filters in the output tensor
-    group_width: group width
-    stride: stride
-    squeeze_excite_ratio: expansion ration for Squeeze and Excite block
-    name: name prefix
-  Returns:
-    Output tensor of the block
-  """
-  if name is None:
-    name = str(backend.get_uid("yblock"))
-
-  def apply(inputs):
-    if filters_in != filters_out and stride == 1:
-      raise ValueError(
-          f"Input filters({filters_in}) and output filters({filters_out}) "
-          f"are not equal for stride {stride}. Input and output filters must "
-          f"be equal for stride={stride}.")
-
-    groups = filters_out // group_width
-    se_filters = int(filters_in * squeeze_excite_ratio)
-
-    if stride != 1:
-      skip = layers.Conv2D(
-          filters_out, (1, 1),
-          strides=stride,
-          use_bias=False,
-          kernel_initializer="he_normal",
-          name=name + "_skip_1x1")(inputs)
-      skip = layers.BatchNormalization(
-          momentum=0.9, epsilon=1e-5, name=name + "_skip_bn")(skip)
+    Returns:
+      A `keras.Model` instance.
+
+    Raises:
+      ValueError: in case of invalid argument for `weights`,
+        or invalid input shape.
+      ValueError: if `classifier_activation` is not `softmax` or `None` when
+        using a pretrained top layer.
+      ValueError: if `include_top` is True but `classes` is not 1000.
+      ValueError: if `block_type` is not one of `{"X", "Y", "Z"}`
+
+    """
+    if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)):
+        raise ValueError(
+            "The `weights` argument should be either "
+            "`None` (random initialization), `imagenet` "
+            "(pre-training on ImageNet), "
+            "or the path to the weights file to be loaded."
+ ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + "If using `weights` as `'imagenet'` with `include_top`" + " as true, `classes` should be 1000" + ) + + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=default_size, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - skip = inputs - - # Build block - # conv_1x1_1 - x = layers.Conv2D( - filters_out, (1, 1), - use_bias=False, - kernel_initializer="he_normal", - name=name + "_conv_1x1_1")(inputs) - x = layers.BatchNormalization( - momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_1_bn")(x) - x = layers.ReLU(name=name + "_conv_1x1_1_relu")(x) - - # conv_3x3 - x = layers.Conv2D( - filters_out, (3, 3), - use_bias=False, - strides=stride, - groups=groups, - padding="same", - kernel_initializer="he_normal", - name=name + "_conv_3x3")(x) - x = layers.BatchNormalization( - momentum=0.9, epsilon=1e-5, name=name + "_conv_3x3_bn")(x) - x = layers.ReLU(name=name + "_conv_3x3_relu")(x) - - # Squeeze-Excitation block - x = SqueezeAndExciteBlock(filters_out, se_filters, name=name)(x) - - # conv_1x1_2 - x = layers.Conv2D( - filters_out, (1, 1), - use_bias=False, - kernel_initializer="he_normal", - name=name + "_conv_1x1_2")(x) - x = layers.BatchNormalization( - momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_2_bn")(x) - - x = layers.ReLU(name=name + "_exit_relu")(x + skip) - - return x + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor - return apply - - -def ZBlock(filters_in, - filters_out, - group_width, - stride=1, - squeeze_excite_ratio=0.25, - bottleneck_ratio=0.25, - name=None): - """Implementation of Z block Reference: [Fast and Accurate Model Scaling](https://arxiv.org/abs/2103.06877). - - Args: - filters_in: filters in the input tensor - filters_out: filters in the output tensor - group_width: group width - stride: stride - squeeze_excite_ratio: expansion ration for Squeeze and Excite block - bottleneck_ratio: inverted bottleneck ratio - name: name prefix - Returns: - Output tensor of the block - """ - if name is None: - name = str(backend.get_uid("zblock")) - - def apply(inputs): - if filters_in != filters_out and stride == 1: - raise ValueError( - f"Input filters({filters_in}) and output filters({filters_out})" - f"are not equal for stride {stride}. 
Input and output filters must be" - f" equal for stride={stride}.") - - groups = filters_out // group_width - se_filters = int(filters_in * squeeze_excite_ratio) - - inv_btlneck_filters = int(filters_out / bottleneck_ratio) - - # Build block - # conv_1x1_1 - x = layers.Conv2D( - inv_btlneck_filters, (1, 1), - use_bias=False, - kernel_initializer="he_normal", - name=name + "_conv_1x1_1")(inputs) - x = layers.BatchNormalization( - momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_1_bn")(x) - x = tf.nn.silu(x) - - # conv_3x3 - x = layers.Conv2D( - inv_btlneck_filters, (3, 3), - use_bias=False, - strides=stride, - groups=groups, - padding="same", - kernel_initializer="he_normal", - name=name + "_conv_3x3")(x) - x = layers.BatchNormalization( - momentum=0.9, epsilon=1e-5, name=name + "_conv_3x3_bn")(x) - x = tf.nn.silu(x) - - # Squeeze-Excitation block - x = SqueezeAndExciteBlock(inv_btlneck_filters, se_filters, name=name) - - # conv_1x1_2 - x = layers.Conv2D( - filters_out, (1, 1), - use_bias=False, - kernel_initializer="he_normal", - name=name + "_conv_1x1_2")(x) - x = layers.BatchNormalization( - momentum=0.9, epsilon=1e-5, name=name + "_conv_1x1_2_bn")(x) - - if stride != 1: - return x + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor)[0] else: - return x + inputs - - return apply - - -def Stage(block_type, depth, group_width, filters_in, filters_out, name=None): - """Implementation of Stage in RegNet. - - Args: - block_type: must be one of "X", "Y", "Z" - depth: depth of stage, number of blocks to use - group_width: group width of all blocks in this stage - filters_in: input filters to this stage - filters_out: output filters from this stage - name: name prefix - - Returns: - Output tensor of Stage - """ - if name is None: - name = str(backend.get_uid("stage")) + inputs = img_input - def apply(inputs): x = inputs - if block_type == "X": - x = XBlock( - filters_in, - filters_out, - group_width, - stride=2, - name=f"{name}_XBlock_0")(x) - for i in range(1, depth): - x = XBlock( - filters_out, filters_out, group_width, name=f"{name}_XBlock_{i}")(x) - elif block_type == "Y": - x = YBlock( - filters_in, - filters_out, - group_width, - stride=2, - name=name + "_YBlock_0")(x) - for i in range(1, depth): - x = YBlock( - filters_out, filters_out, group_width, name=f"{name}_YBlock_{i}")(x) - elif block_type == "Z": - x = ZBlock( - filters_in, - filters_out, - group_width, - stride=2, - name=f"{name}_ZBlock_0")(x) - for i in range(1, depth): - x = ZBlock( - filters_out, filters_out, group_width, name=f"{name}_ZBlock_{i}")(x) - else: - raise NotImplementedError(f"Block type `{block_type}` not recognized." - f"block_type must be one of (`X`, `Y`, `Z`). ") - return x - - return apply - - -def Head(num_classes=1000, name=None): - """Implementation of classification head of RegNet. - - Args: - num_classes: number of classes for Dense layer - name: name prefix - - Returns: - Output logits tensor. - """ - if name is None: - name = str(backend.get_uid("head")) - - def apply(x): - x = layers.GlobalAveragePooling2D(name=name + "_head_gap")(x) - x = layers.Dense(num_classes, name=name + "head_dense")(x) - return x - - return apply - - -def RegNet(depths, - widths, - group_width, - block_type, - default_size, - model_name="regnet", - include_preprocessing=True, - include_top=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - """Instantiates RegNet architecture given specific configuration. 
- - Args: - depths: An iterable containing depths for each individual stages. - widths: An iterable containing output channel width of each individual - stages - group_width: Number of channels to be used in each group. See grouped - convolutions for more information. - block_type: Must be one of `{"X", "Y", "Z"}`. For more details see the - papers "Designing network design spaces" and "Fast and Accurate Model - Scaling" - default_size: Default input image size. - model_name: An optional name for the model. - include_preprocessing: boolean denoting whther to include preprocessing in - the model - include_top: Boolean denoting whether to include classification head to the - model. - weights: one of `None` (random initialization), "imagenet" (pre-training on - ImageNet), or the path to the weights file to be loaded. - input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to use - as image input for the model. - input_shape: optional shape tuple, only to be specified if `include_top` is - False. It should have exactly 3 inputs channels. - pooling: optional pooling mode for feature extraction when `include_top` is - `False`. - `None` means that the output of the model will be the 4D tensor - output of the last convolutional layer. - `avg` means that global average - pooling will be applied to the output of the last convolutional layer, and - thus the output of the model will be a 2D tensor. - `max` means that - global max pooling will be applied. - classes: optional number of classes to classify images into, only to be - specified if `include_top` is True, and if no `weights` argument is - specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - - Returns: - A `keras.Model` instance. + if include_preprocessing: + x = PreStem(name=model_name)(x) + x = Stem(name=model_name)(x) + + in_channels = 32 # Output from Stem + + for num_stage in range(4): + depth = depths[num_stage] + out_channels = widths[num_stage] + + x = Stage( + block_type, + depth, + group_width, + in_channels, + out_channels, + name=model_name + "_Stage_" + str(num_stage), + )(x) + in_channels = out_channels - Raises: - ValueError: in case of invalid argument for `weights`, - or invalid input shape. - ValueError: if `classifier_activation` is not `softmax` or `None` when - using a pretrained top layer. - ValueError: if `include_top` is True but `num_classes` is not 1000. 
- ValueError: if `block_type` is not one of `{"X", "Y", "Z"}` - - """ - if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): - raise ValueError("The `weights` argument should be either " - "`None` (random initialization), `imagenet` " - "(pre-training on ImageNet), " - "or the path to the weights file to be loaded.") - - if weights == "imagenet" and include_top and classes != 1000: - raise ValueError("If using `weights` as `'imagenet'` with `include_top`" - " as true, `classes` should be 1000") - - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=default_size, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - x = inputs - if include_preprocessing: - x = PreStem(name=model_name)(x) - x = Stem(name=model_name)(x) - - in_channels = 32 # Output from Stem - - for num_stage in range(4): - depth = depths[num_stage] - out_channels = widths[num_stage] - - x = Stage( - block_type, - depth, - group_width, - in_channels, - out_channels, - name=model_name + "_Stage_" + str(num_stage))(x) - in_channels = out_channels - - if include_top: - x = Head(num_classes=classes)(x) - imagenet_utils.validate_activation(classifier_activation, weights) - - else: - if pooling == "avg": - x = layers.GlobalAveragePooling2D()(x) - elif pooling == "max": - x = layers.GlobalMaxPooling2D()(x) - - model = training.Model(inputs=inputs, outputs=x, name=model_name) - - # Load weights. - if weights == "imagenet": if include_top: - file_suffix = ".h5" - file_hash = WEIGHTS_HASHES[model_name[-4:]][0] - else: - file_suffix = "_notop.h5" - file_hash = WEIGHTS_HASHES[model_name[-4:]][1] - file_name = model_name + file_suffix - weights_path = data_utils.get_file( - file_name, - BASE_WEIGHTS_PATH + file_name, - cache_subdir="models", - file_hash=file_hash) - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) + x = Head(num_classes=classes)(x) + imagenet_utils.validate_activation(classifier_activation, weights) - return model + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + model = training.Model(inputs=inputs, outputs=x, name=model_name) + + # Load weights. 
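+    # `model_name[-4:]` recovers the variant key (e.g. "x002" from
+    # "regnetx002"); WEIGHTS_HASHES maps each key to a (top, notop) hash
+    # pair, and `data_utils.get_file` verifies the cached download against
+    # the selected hash before the weights are loaded.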
+ if weights == "imagenet": + if include_top: + file_suffix = ".h5" + file_hash = WEIGHTS_HASHES[model_name[-4:]][0] + else: + file_suffix = "_notop.h5" + file_hash = WEIGHTS_HASHES[model_name[-4:]][1] + file_name = model_name + file_suffix + weights_path = data_utils.get_file( + file_name, + BASE_WEIGHTS_PATH + file_name, + cache_subdir="models", + file_hash=file_hash, + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model ## Instantiating variants ## -@keras_export("keras.applications.regnet.RegNetX002", - "keras.applications.RegNetX002") -def RegNetX002(model_name="regnetx002", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x002"]["depths"], - MODEL_CONFIGS["x002"]["widths"], - MODEL_CONFIGS["x002"]["group_width"], - MODEL_CONFIGS["x002"]["block_type"], - MODEL_CONFIGS["x002"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX004", - "keras.applications.RegNetX004") -def RegNetX004(model_name="regnetx004", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x004"]["depths"], - MODEL_CONFIGS["x004"]["widths"], - MODEL_CONFIGS["x004"]["group_width"], - MODEL_CONFIGS["x004"]["block_type"], - MODEL_CONFIGS["x004"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX006", - "keras.applications.RegNetX006") -def RegNetX006(model_name="regnetx006", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x006"]["depths"], - MODEL_CONFIGS["x006"]["widths"], - MODEL_CONFIGS["x006"]["group_width"], - MODEL_CONFIGS["x006"]["block_type"], - MODEL_CONFIGS["x006"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX008", - "keras.applications.RegNetX008") -def RegNetX008(model_name="regnetx008", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x008"]["depths"], - MODEL_CONFIGS["x008"]["widths"], - MODEL_CONFIGS["x008"]["group_width"], - MODEL_CONFIGS["x008"]["block_type"], - MODEL_CONFIGS["x008"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, 
- pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX016", - "keras.applications.RegNetX016") -def RegNetX016(model_name="regnetx016", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x016"]["depths"], - MODEL_CONFIGS["x016"]["widths"], - MODEL_CONFIGS["x016"]["group_width"], - MODEL_CONFIGS["x016"]["block_type"], - MODEL_CONFIGS["x016"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX032", - "keras.applications.RegNetX032") -def RegNetX032(model_name="regnetx032", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x032"]["depths"], - MODEL_CONFIGS["x032"]["widths"], - MODEL_CONFIGS["x032"]["group_width"], - MODEL_CONFIGS["x032"]["block_type"], - MODEL_CONFIGS["x032"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX040", - "keras.applications.RegNetX040") -def RegNetX040(model_name="regnetx040", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x040"]["depths"], - MODEL_CONFIGS["x040"]["widths"], - MODEL_CONFIGS["x040"]["group_width"], - MODEL_CONFIGS["x040"]["block_type"], - MODEL_CONFIGS["x040"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX064", - "keras.applications.RegNetX064") -def RegNetX064(model_name="regnetx064", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x064"]["depths"], - MODEL_CONFIGS["x064"]["widths"], - MODEL_CONFIGS["x064"]["group_width"], - MODEL_CONFIGS["x064"]["block_type"], - MODEL_CONFIGS["x064"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX080", - "keras.applications.RegNetX080") -def RegNetX080(model_name="regnetx080", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - 
MODEL_CONFIGS["x080"]["depths"], - MODEL_CONFIGS["x080"]["widths"], - MODEL_CONFIGS["x080"]["group_width"], - MODEL_CONFIGS["x080"]["block_type"], - MODEL_CONFIGS["x080"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX120", - "keras.applications.RegNetX120") -def RegNetX120(model_name="regnetx120", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x120"]["depths"], - MODEL_CONFIGS["x120"]["widths"], - MODEL_CONFIGS["x120"]["group_width"], - MODEL_CONFIGS["x120"]["block_type"], - MODEL_CONFIGS["x120"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX160", - "keras.applications.RegNetX160") -def RegNetX160(model_name="regnetx160", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x160"]["depths"], - MODEL_CONFIGS["x160"]["widths"], - MODEL_CONFIGS["x160"]["group_width"], - MODEL_CONFIGS["x160"]["block_type"], - MODEL_CONFIGS["x160"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetX320", - "keras.applications.RegNetX320") -def RegNetX320(model_name="regnetx320", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["x320"]["depths"], - MODEL_CONFIGS["x320"]["widths"], - MODEL_CONFIGS["x320"]["group_width"], - MODEL_CONFIGS["x320"]["block_type"], - MODEL_CONFIGS["x320"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY002", - "keras.applications.RegNetY002") -def RegNetY002(model_name="regnety002", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y002"]["depths"], - MODEL_CONFIGS["y002"]["widths"], - MODEL_CONFIGS["y002"]["group_width"], - MODEL_CONFIGS["y002"]["block_type"], - MODEL_CONFIGS["y002"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - 
classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY004", - "keras.applications.RegNetY004") -def RegNetY004(model_name="regnety004", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y004"]["depths"], - MODEL_CONFIGS["y004"]["widths"], - MODEL_CONFIGS["y004"]["group_width"], - MODEL_CONFIGS["y004"]["block_type"], - MODEL_CONFIGS["y004"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY006", - "keras.applications.RegNetY006") -def RegNetY006(model_name="regnety006", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y006"]["depths"], - MODEL_CONFIGS["y006"]["widths"], - MODEL_CONFIGS["y006"]["group_width"], - MODEL_CONFIGS["y006"]["block_type"], - MODEL_CONFIGS["y006"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY008", - "keras.applications.RegNetY008") -def RegNetY008(model_name="regnety008", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y008"]["depths"], - MODEL_CONFIGS["y008"]["widths"], - MODEL_CONFIGS["y008"]["group_width"], - MODEL_CONFIGS["y008"]["block_type"], - MODEL_CONFIGS["y008"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY016", - "keras.applications.RegNetY016") -def RegNetY016(model_name="regnety016", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y016"]["depths"], - MODEL_CONFIGS["y016"]["widths"], - MODEL_CONFIGS["y016"]["group_width"], - MODEL_CONFIGS["y016"]["block_type"], - MODEL_CONFIGS["y016"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY032", - "keras.applications.RegNetY032") -def RegNetY032(model_name="regnety032", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y032"]["depths"], - 
MODEL_CONFIGS["y032"]["widths"], - MODEL_CONFIGS["y032"]["group_width"], - MODEL_CONFIGS["y032"]["block_type"], - MODEL_CONFIGS["y032"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY040", - "keras.applications.RegNetY040") -def RegNetY040(model_name="regnety040", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y040"]["depths"], - MODEL_CONFIGS["y040"]["widths"], - MODEL_CONFIGS["y040"]["group_width"], - MODEL_CONFIGS["y040"]["block_type"], - MODEL_CONFIGS["y040"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY064", - "keras.applications.RegNetY064") -def RegNetY064(model_name="regnety064", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y064"]["depths"], - MODEL_CONFIGS["y064"]["widths"], - MODEL_CONFIGS["y064"]["group_width"], - MODEL_CONFIGS["y064"]["block_type"], - MODEL_CONFIGS["y064"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY080", - "keras.applications.RegNetY080") -def RegNetY080(model_name="regnety080", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y080"]["depths"], - MODEL_CONFIGS["y080"]["widths"], - MODEL_CONFIGS["y080"]["group_width"], - MODEL_CONFIGS["y080"]["block_type"], - MODEL_CONFIGS["y080"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY120", - "keras.applications.RegNetY120") -def RegNetY120(model_name="regnety120", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y120"]["depths"], - MODEL_CONFIGS["y120"]["widths"], - MODEL_CONFIGS["y120"]["group_width"], - MODEL_CONFIGS["y120"]["block_type"], - MODEL_CONFIGS["y120"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - 
-@keras_export("keras.applications.regnet.RegNetY160", - "keras.applications.RegNetY160") -def RegNetY160(model_name="regnety160", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y160"]["depths"], - MODEL_CONFIGS["y160"]["widths"], - MODEL_CONFIGS["y160"]["group_width"], - MODEL_CONFIGS["y160"]["block_type"], - MODEL_CONFIGS["y160"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) - - -@keras_export("keras.applications.regnet.RegNetY320", - "keras.applications.RegNetY320") -def RegNetY320(model_name="regnety320", - include_top=True, - include_preprocessing=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax"): - return RegNet( - MODEL_CONFIGS["y320"]["depths"], - MODEL_CONFIGS["y320"]["widths"], - MODEL_CONFIGS["y320"]["group_width"], - MODEL_CONFIGS["y320"]["block_type"], - MODEL_CONFIGS["y320"]["default_size"], - model_name=model_name, - include_top=include_top, - include_preprocessing=include_preprocessing, - weights=weights, - input_tensor=input_tensor, - input_shape=input_shape, - pooling=pooling, - classes=classes, - classifier_activation=classifier_activation) +@keras_export( + "keras.applications.regnet.RegNetX002", "keras.applications.RegNetX002" +) +def RegNetX002( + model_name="regnetx002", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x002"]["depths"], + MODEL_CONFIGS["x002"]["widths"], + MODEL_CONFIGS["x002"]["group_width"], + MODEL_CONFIGS["x002"]["block_type"], + MODEL_CONFIGS["x002"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX004", "keras.applications.RegNetX004" +) +def RegNetX004( + model_name="regnetx004", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x004"]["depths"], + MODEL_CONFIGS["x004"]["widths"], + MODEL_CONFIGS["x004"]["group_width"], + MODEL_CONFIGS["x004"]["block_type"], + MODEL_CONFIGS["x004"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX006", "keras.applications.RegNetX006" +) +def RegNetX006( + model_name="regnetx006", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x006"]["depths"], + 
MODEL_CONFIGS["x006"]["widths"], + MODEL_CONFIGS["x006"]["group_width"], + MODEL_CONFIGS["x006"]["block_type"], + MODEL_CONFIGS["x006"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX008", "keras.applications.RegNetX008" +) +def RegNetX008( + model_name="regnetx008", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x008"]["depths"], + MODEL_CONFIGS["x008"]["widths"], + MODEL_CONFIGS["x008"]["group_width"], + MODEL_CONFIGS["x008"]["block_type"], + MODEL_CONFIGS["x008"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX016", "keras.applications.RegNetX016" +) +def RegNetX016( + model_name="regnetx016", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x016"]["depths"], + MODEL_CONFIGS["x016"]["widths"], + MODEL_CONFIGS["x016"]["group_width"], + MODEL_CONFIGS["x016"]["block_type"], + MODEL_CONFIGS["x016"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX032", "keras.applications.RegNetX032" +) +def RegNetX032( + model_name="regnetx032", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x032"]["depths"], + MODEL_CONFIGS["x032"]["widths"], + MODEL_CONFIGS["x032"]["group_width"], + MODEL_CONFIGS["x032"]["block_type"], + MODEL_CONFIGS["x032"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX040", "keras.applications.RegNetX040" +) +def RegNetX040( + model_name="regnetx040", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x040"]["depths"], + MODEL_CONFIGS["x040"]["widths"], + MODEL_CONFIGS["x040"]["group_width"], + MODEL_CONFIGS["x040"]["block_type"], + MODEL_CONFIGS["x040"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + 
classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX064", "keras.applications.RegNetX064" +) +def RegNetX064( + model_name="regnetx064", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x064"]["depths"], + MODEL_CONFIGS["x064"]["widths"], + MODEL_CONFIGS["x064"]["group_width"], + MODEL_CONFIGS["x064"]["block_type"], + MODEL_CONFIGS["x064"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX080", "keras.applications.RegNetX080" +) +def RegNetX080( + model_name="regnetx080", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x080"]["depths"], + MODEL_CONFIGS["x080"]["widths"], + MODEL_CONFIGS["x080"]["group_width"], + MODEL_CONFIGS["x080"]["block_type"], + MODEL_CONFIGS["x080"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX120", "keras.applications.RegNetX120" +) +def RegNetX120( + model_name="regnetx120", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x120"]["depths"], + MODEL_CONFIGS["x120"]["widths"], + MODEL_CONFIGS["x120"]["group_width"], + MODEL_CONFIGS["x120"]["block_type"], + MODEL_CONFIGS["x120"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX160", "keras.applications.RegNetX160" +) +def RegNetX160( + model_name="regnetx160", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["x160"]["depths"], + MODEL_CONFIGS["x160"]["widths"], + MODEL_CONFIGS["x160"]["group_width"], + MODEL_CONFIGS["x160"]["block_type"], + MODEL_CONFIGS["x160"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetX320", "keras.applications.RegNetX320" +) +def RegNetX320( + model_name="regnetx320", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): 
+ return RegNet( + MODEL_CONFIGS["x320"]["depths"], + MODEL_CONFIGS["x320"]["widths"], + MODEL_CONFIGS["x320"]["group_width"], + MODEL_CONFIGS["x320"]["block_type"], + MODEL_CONFIGS["x320"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY002", "keras.applications.RegNetY002" +) +def RegNetY002( + model_name="regnety002", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y002"]["depths"], + MODEL_CONFIGS["y002"]["widths"], + MODEL_CONFIGS["y002"]["group_width"], + MODEL_CONFIGS["y002"]["block_type"], + MODEL_CONFIGS["y002"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY004", "keras.applications.RegNetY004" +) +def RegNetY004( + model_name="regnety004", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y004"]["depths"], + MODEL_CONFIGS["y004"]["widths"], + MODEL_CONFIGS["y004"]["group_width"], + MODEL_CONFIGS["y004"]["block_type"], + MODEL_CONFIGS["y004"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY006", "keras.applications.RegNetY006" +) +def RegNetY006( + model_name="regnety006", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y006"]["depths"], + MODEL_CONFIGS["y006"]["widths"], + MODEL_CONFIGS["y006"]["group_width"], + MODEL_CONFIGS["y006"]["block_type"], + MODEL_CONFIGS["y006"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY008", "keras.applications.RegNetY008" +) +def RegNetY008( + model_name="regnety008", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y008"]["depths"], + MODEL_CONFIGS["y008"]["widths"], + MODEL_CONFIGS["y008"]["group_width"], + MODEL_CONFIGS["y008"]["block_type"], + MODEL_CONFIGS["y008"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + 
input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY016", "keras.applications.RegNetY016" +) +def RegNetY016( + model_name="regnety016", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y016"]["depths"], + MODEL_CONFIGS["y016"]["widths"], + MODEL_CONFIGS["y016"]["group_width"], + MODEL_CONFIGS["y016"]["block_type"], + MODEL_CONFIGS["y016"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY032", "keras.applications.RegNetY032" +) +def RegNetY032( + model_name="regnety032", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y032"]["depths"], + MODEL_CONFIGS["y032"]["widths"], + MODEL_CONFIGS["y032"]["group_width"], + MODEL_CONFIGS["y032"]["block_type"], + MODEL_CONFIGS["y032"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY040", "keras.applications.RegNetY040" +) +def RegNetY040( + model_name="regnety040", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y040"]["depths"], + MODEL_CONFIGS["y040"]["widths"], + MODEL_CONFIGS["y040"]["group_width"], + MODEL_CONFIGS["y040"]["block_type"], + MODEL_CONFIGS["y040"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY064", "keras.applications.RegNetY064" +) +def RegNetY064( + model_name="regnety064", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y064"]["depths"], + MODEL_CONFIGS["y064"]["widths"], + MODEL_CONFIGS["y064"]["group_width"], + MODEL_CONFIGS["y064"]["block_type"], + MODEL_CONFIGS["y064"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY080", "keras.applications.RegNetY080" +) +def RegNetY080( + model_name="regnety080", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + 
pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y080"]["depths"], + MODEL_CONFIGS["y080"]["widths"], + MODEL_CONFIGS["y080"]["group_width"], + MODEL_CONFIGS["y080"]["block_type"], + MODEL_CONFIGS["y080"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY120", "keras.applications.RegNetY120" +) +def RegNetY120( + model_name="regnety120", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y120"]["depths"], + MODEL_CONFIGS["y120"]["widths"], + MODEL_CONFIGS["y120"]["group_width"], + MODEL_CONFIGS["y120"]["block_type"], + MODEL_CONFIGS["y120"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY160", "keras.applications.RegNetY160" +) +def RegNetY160( + model_name="regnety160", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y160"]["depths"], + MODEL_CONFIGS["y160"]["widths"], + MODEL_CONFIGS["y160"]["group_width"], + MODEL_CONFIGS["y160"]["block_type"], + MODEL_CONFIGS["y160"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.regnet.RegNetY320", "keras.applications.RegNetY320" +) +def RegNetY320( + model_name="regnety320", + include_top=True, + include_preprocessing=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation="softmax", +): + return RegNet( + MODEL_CONFIGS["y320"]["depths"], + MODEL_CONFIGS["y320"]["widths"], + MODEL_CONFIGS["y320"]["group_width"], + MODEL_CONFIGS["y320"]["block_type"], + MODEL_CONFIGS["y320"]["default_size"], + model_name=model_name, + include_top=include_top, + include_preprocessing=include_preprocessing, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + ) RegNetX002.__doc__ = BASE_DOCSTRING.format(name="RegNetX002") @@ -1606,30 +1809,30 @@ def RegNetY320(model_name="regnety320", @keras_export("keras.applications.regnet.preprocess_input") -def preprocess_input(x, data_format=None): # pylint: disable=unused-argument - """A placeholder method for backward compatibility. - - The preprocessing logic has been included in the efficientnet model - implementation. Users are no longer required to call this method to normalize - the input data. This method does nothing and only kept as a placeholder to - align the API surface between old and new version of model. 
-
-  Args:
-    x: A floating point `numpy.array` or a `tf.Tensor`.
-    data_format: Optional data format of the image tensor/array. Defaults to
-      None, in which case the global setting
-      `tf.keras.backend.image_data_format()` is used (unless you changed it, it
-      defaults to "channels_last").{mode}
-
-  Returns:
-    Unchanged `numpy.array` or `tf.Tensor`.
-  """
-  return x
+def preprocess_input(x, data_format=None):
+    """A placeholder method for backward compatibility.
+
+    The preprocessing logic has been included in the regnet model
+    implementation. Users are no longer required to call this method to
+    normalize the input data. This method does nothing and is only kept as
+    a placeholder to align the API surface between the old and new versions
+    of the model.
+
+    Args:
+      x: A floating point `numpy.array` or a `tf.Tensor`.
+      data_format: Optional data format of the image tensor/array. `None` means
+        the global setting `tf.keras.backend.image_data_format()` is used
+        (unless you changed it, it uses "channels_last").
+        Defaults to `None`.
+
+    Returns:
+      Unchanged `numpy.array` or `tf.Tensor`.
+    """
+    return x


 @keras_export("keras.applications.regnet.decode_predictions")
 def decode_predictions(preds, top=5):
-  return imagenet_utils.decode_predictions(preds, top=top)
+    return imagenet_utils.decode_predictions(preds, top=top)


 decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__
diff --git a/keras/applications/resnet.py b/keras/applications/resnet.py
index 46b4e81c8ad3..adcd2b746e08 100644
--- a/keras/applications/resnet.py
+++ b/keras/applications/resnet.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# pylint: disable=invalid-name
+
 """ResNet models for Keras.
Reference: @@ -28,496 +28,604 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export BASE_WEIGHTS_PATH = ( - 'https://storage.googleapis.com/tensorflow/keras-applications/resnet/') + "https://storage.googleapis.com/tensorflow/keras-applications/resnet/" +) WEIGHTS_HASHES = { - 'resnet50': ('2cb95161c43110f7111970584f804107', - '4d473c1dd8becc155b73f8504c6f6626'), - 'resnet101': ('f1aeb4b969a6efcfb50fad2f0c20cfc5', - '88cf7a10940856eca736dc7b7e228a21'), - 'resnet152': ('100835be76be38e30d865e96f2aaae62', - 'ee4c566cf9a93f14d82f913c2dc6dd0c'), - 'resnet50v2': ('3ef43a0b657b3be2300d5770ece849e0', - 'fac2f116257151a9d068a22e544a4917'), - 'resnet101v2': ('6343647c601c52e1368623803854d971', - 'c0ed64b8031c3730f411d2eb4eea35b5'), - 'resnet152v2': ('a49b44d1979771252814e80f8ec446f9', - 'ed17cf2e0169df9d443503ef94b23b33'), - 'resnext50': ('67a5b30d522ed92f75a1f16eef299d1a', - '62527c363bdd9ec598bed41947b379fc'), - 'resnext101': - ('34fb605428fcc7aa4d62f44404c11509', '0f678c91647380debd923963594981b3') + "resnet50": ( + "2cb95161c43110f7111970584f804107", + "4d473c1dd8becc155b73f8504c6f6626", + ), + "resnet101": ( + "f1aeb4b969a6efcfb50fad2f0c20cfc5", + "88cf7a10940856eca736dc7b7e228a21", + ), + "resnet152": ( + "100835be76be38e30d865e96f2aaae62", + "ee4c566cf9a93f14d82f913c2dc6dd0c", + ), + "resnet50v2": ( + "3ef43a0b657b3be2300d5770ece849e0", + "fac2f116257151a9d068a22e544a4917", + ), + "resnet101v2": ( + "6343647c601c52e1368623803854d971", + "c0ed64b8031c3730f411d2eb4eea35b5", + ), + "resnet152v2": ( + "a49b44d1979771252814e80f8ec446f9", + "ed17cf2e0169df9d443503ef94b23b33", + ), + "resnext50": ( + "67a5b30d522ed92f75a1f16eef299d1a", + "62527c363bdd9ec598bed41947b379fc", + ), + "resnext101": ( + "34fb605428fcc7aa4d62f44404c11509", + "0f678c91647380debd923963594981b3", + ), } layers = None -def ResNet(stack_fn, - preact, - use_bias, - model_name='resnet', - include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation='softmax', - **kwargs): - """Instantiates the ResNet, ResNetV2, and ResNeXt architecture. - - Args: - stack_fn: a function that returns output tensor for the - stacked residual blocks. - preact: whether to use pre-activation or not - (True for ResNetV2, False for ResNet and ResNeXt). - use_bias: whether to use biases for convolutional layers or not - (True for ResNet and ResNetV2, False for ResNeXt). - model_name: string, model name. - include_top: whether to include the fully-connected - layer at the top of the network. - weights: one of `None` (random initialization), - 'imagenet' (pre-training on ImageNet), - or the path to the weights file to be loaded. - input_tensor: optional Keras tensor - (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(224, 224, 3)` (with `channels_last` data format) - or `(3, 224, 224)` (with `channels_first` data format). - It should have exactly 3 inputs channels. - pooling: optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model will be - the 4D tensor output of the - last convolutional layer. 
-      - `avg` means that global average pooling
-          will be applied to the output of the
-          last convolutional layer, and thus
-          the output of the model will be a 2D tensor.
-      - `max` means that global max pooling will
-          be applied.
-    classes: optional number of classes to classify images
-      into, only to be specified if `include_top` is True, and
-      if no `weights` argument is specified.
-    classifier_activation: A `str` or callable. The activation function to use
-      on the "top" layer. Ignored unless `include_top=True`. Set
-      `classifier_activation=None` to return the logits of the "top" layer.
-      When loading pretrained weights, `classifier_activation` can only
-      be `None` or `"softmax"`.
-    **kwargs: For backwards compatibility only.
-
-  Returns:
-    A `keras.Model` instance.
-  """
-  global layers
-  if 'layers' in kwargs:
-    layers = kwargs.pop('layers')
-  else:
-    layers = VersionAwareLayers()
-  if kwargs:
-    raise ValueError('Unknown argument(s): %s' % (kwargs,))
-  if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as `"imagenet"` with `include_top`'
-                     ' as true, `classes` should be 1000')
-
-  # Determine proper input shape
-  input_shape = imagenet_utils.obtain_input_shape(
-      input_shape,
-      default_size=224,
-      min_size=32,
-      data_format=backend.image_data_format(),
-      require_flatten=include_top,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = layers.Input(shape=input_shape)
-  else:
-    if not backend.is_keras_tensor(input_tensor):
-      img_input = layers.Input(tensor=input_tensor, shape=input_shape)
+def ResNet(
+    stack_fn,
+    preact,
+    use_bias,
+    model_name="resnet",
+    include_top=True,
+    weights="imagenet",
+    input_tensor=None,
+    input_shape=None,
+    pooling=None,
+    classes=1000,
+    classifier_activation="softmax",
+    **kwargs,
+):
+    """Instantiates the ResNet, ResNetV2, and ResNeXt architecture.
+
+    Args:
+      stack_fn: a function that returns output tensor for the
+        stacked residual blocks.
+      preact: whether to use pre-activation or not
+        (True for ResNetV2, False for ResNet and ResNeXt).
+      use_bias: whether to use biases for convolutional layers or not
+        (True for ResNet and ResNetV2, False for ResNeXt).
+      model_name: string, model name.
+      include_top: whether to include the fully-connected
+        layer at the top of the network.
+      weights: one of `None` (random initialization),
+        'imagenet' (pre-training on ImageNet),
+        or the path to the weights file to be loaded.
+      input_tensor: optional Keras tensor
+        (i.e. output of `layers.Input()`)
+        to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+        if `include_top` is False (otherwise the input shape
+        has to be `(224, 224, 3)` (with `channels_last` data format)
+        or `(3, 224, 224)` (with `channels_first` data format)).
+        It should have exactly 3 input channels.
+      pooling: optional pooling mode for feature extraction
+        when `include_top` is `False`.
+        - `None` means that the output of the model will be
+            the 4D tensor output of the
+            last convolutional layer.
+        - `avg` means that global average pooling
+            will be applied to the output of the
+            last convolutional layer, and thus
+            the output of the model will be a 2D tensor.
+        - `max` means that global max pooling will
+            be applied.
+ classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + **kwargs: For backwards compatibility only. + + Returns: + A `keras.Model` instance. + """ + global layers + if "layers" in kwargs: + layers = kwargs.pop("layers") + else: + layers = VersionAwareLayers() + if kwargs: + raise ValueError(f"Unknown argument(s): {kwargs}") + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top`' + " as true, `classes` should be 1000" + ) + + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=224, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - img_input = input_tensor + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - x = layers.ZeroPadding2D( - padding=((3, 3), (3, 3)), name='conv1_pad')(img_input) - x = layers.Conv2D(64, 7, strides=2, use_bias=use_bias, name='conv1_conv')(x) + x = layers.ZeroPadding2D(padding=((3, 3), (3, 3)), name="conv1_pad")( + img_input + ) + x = layers.Conv2D(64, 7, strides=2, use_bias=use_bias, name="conv1_conv")(x) - if not preact: - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name='conv1_bn')(x) - x = layers.Activation('relu', name='conv1_relu')(x) + if not preact: + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name="conv1_bn" + )(x) + x = layers.Activation("relu", name="conv1_relu")(x) - x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name='pool1_pad')(x) - x = layers.MaxPooling2D(3, strides=2, name='pool1_pool')(x) + x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name="pool1_pad")(x) + x = layers.MaxPooling2D(3, strides=2, name="pool1_pool")(x) - x = stack_fn(x) + x = stack_fn(x) + + if preact: + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name="post_bn" + )(x) + x = layers.Activation("relu", name="post_relu")(x) - if preact: - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name='post_bn')(x) - x = layers.Activation('relu', name='post_relu')(x) - - if include_top: - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D(name='max_pool')(x) - - # Ensure that the model takes into account - # any 
potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = training.Model(inputs, x, name=model_name) - - # Load weights. - if (weights == 'imagenet') and (model_name in WEIGHTS_HASHES): if include_top: - file_name = model_name + '_weights_tf_dim_ordering_tf_kernels.h5' - file_hash = WEIGHTS_HASHES[model_name][0] + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) else: - file_name = model_name + '_weights_tf_dim_ordering_tf_kernels_notop.h5' - file_hash = WEIGHTS_HASHES[model_name][1] - weights_path = data_utils.get_file( - file_name, - BASE_WEIGHTS_PATH + file_name, - cache_subdir='models', - file_hash=file_hash) - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model + if pooling == "avg": + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D(name="max_pool")(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + model = training.Model(inputs, x, name=model_name) + + # Load weights. + if (weights == "imagenet") and (model_name in WEIGHTS_HASHES): + if include_top: + file_name = model_name + "_weights_tf_dim_ordering_tf_kernels.h5" + file_hash = WEIGHTS_HASHES[model_name][0] + else: + file_name = ( + model_name + "_weights_tf_dim_ordering_tf_kernels_notop.h5" + ) + file_hash = WEIGHTS_HASHES[model_name][1] + weights_path = data_utils.get_file( + file_name, + BASE_WEIGHTS_PATH + file_name, + cache_subdir="models", + file_hash=file_hash, + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model def block1(x, filters, kernel_size=3, stride=1, conv_shortcut=True, name=None): - """A residual block. + """A residual block. - Args: - x: input tensor. - filters: integer, filters of the bottleneck layer. - kernel_size: default 3, kernel size of the bottleneck layer. - stride: default 1, stride of the first layer. - conv_shortcut: default True, use convolution shortcut if True, - otherwise identity shortcut. - name: string, block label. + Args: + x: input tensor. + filters: integer, filters of the bottleneck layer. + kernel_size: default 3, kernel size of the bottleneck layer. + stride: default 1, stride of the first layer. + conv_shortcut: default True, use convolution shortcut if True, + otherwise identity shortcut. + name: string, block label. - Returns: - Output tensor for the residual block. 
- """ - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - - if conv_shortcut: - shortcut = layers.Conv2D( - 4 * filters, 1, strides=stride, name=name + '_0_conv')(x) - shortcut = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_0_bn')(shortcut) - else: - shortcut = x - - x = layers.Conv2D(filters, 1, strides=stride, name=name + '_1_conv')(x) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(x) - x = layers.Activation('relu', name=name + '_1_relu')(x) - - x = layers.Conv2D( - filters, kernel_size, padding='SAME', name=name + '_2_conv')(x) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_2_bn')(x) - x = layers.Activation('relu', name=name + '_2_relu')(x) - - x = layers.Conv2D(4 * filters, 1, name=name + '_3_conv')(x) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_3_bn')(x) - - x = layers.Add(name=name + '_add')([shortcut, x]) - x = layers.Activation('relu', name=name + '_out')(x) - return x + Returns: + Output tensor for the residual block. + """ + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + if conv_shortcut: + shortcut = layers.Conv2D( + 4 * filters, 1, strides=stride, name=name + "_0_conv" + )(x) + shortcut = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_0_bn" + )(shortcut) + else: + shortcut = x + + x = layers.Conv2D(filters, 1, strides=stride, name=name + "_1_conv")(x) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_1_bn" + )(x) + x = layers.Activation("relu", name=name + "_1_relu")(x) + + x = layers.Conv2D( + filters, kernel_size, padding="SAME", name=name + "_2_conv" + )(x) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_2_bn" + )(x) + x = layers.Activation("relu", name=name + "_2_relu")(x) + + x = layers.Conv2D(4 * filters, 1, name=name + "_3_conv")(x) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_3_bn" + )(x) + + x = layers.Add(name=name + "_add")([shortcut, x]) + x = layers.Activation("relu", name=name + "_out")(x) + return x def stack1(x, filters, blocks, stride1=2, name=None): - """A set of stacked residual blocks. + """A set of stacked residual blocks. - Args: - x: input tensor. - filters: integer, filters of the bottleneck layer in a block. - blocks: integer, blocks in the stacked blocks. - stride1: default 2, stride of the first layer in the first block. - name: string, stack label. + Args: + x: input tensor. + filters: integer, filters of the bottleneck layer in a block. + blocks: integer, blocks in the stacked blocks. + stride1: default 2, stride of the first layer in the first block. + name: string, stack label. - Returns: - Output tensor for the stacked blocks. - """ - x = block1(x, filters, stride=stride1, name=name + '_block1') - for i in range(2, blocks + 1): - x = block1(x, filters, conv_shortcut=False, name=name + '_block' + str(i)) - return x + Returns: + Output tensor for the stacked blocks. + """ + x = block1(x, filters, stride=stride1, name=name + "_block1") + for i in range(2, blocks + 1): + x = block1( + x, filters, conv_shortcut=False, name=name + "_block" + str(i) + ) + return x def block2(x, filters, kernel_size=3, stride=1, conv_shortcut=False, name=None): - """A residual block. + """A residual block. + + Args: + x: input tensor. + filters: integer, filters of the bottleneck layer. + kernel_size: default 3, kernel size of the bottleneck layer. 
+ stride: default 1, stride of the first layer. + conv_shortcut: default False, use convolution shortcut if True, + otherwise identity shortcut. + name: string, block label. + + Returns: + Output tensor for the residual block. + """ + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + preact = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_preact_bn" + )(x) + preact = layers.Activation("relu", name=name + "_preact_relu")(preact) + + if conv_shortcut: + shortcut = layers.Conv2D( + 4 * filters, 1, strides=stride, name=name + "_0_conv" + )(preact) + else: + shortcut = ( + layers.MaxPooling2D(1, strides=stride)(x) if stride > 1 else x + ) - Args: + x = layers.Conv2D( + filters, 1, strides=1, use_bias=False, name=name + "_1_conv" + )(preact) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_1_bn" + )(x) + x = layers.Activation("relu", name=name + "_1_relu")(x) + + x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name=name + "_2_pad")(x) + x = layers.Conv2D( + filters, + kernel_size, + strides=stride, + use_bias=False, + name=name + "_2_conv", + )(x) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_2_bn" + )(x) + x = layers.Activation("relu", name=name + "_2_relu")(x) + + x = layers.Conv2D(4 * filters, 1, name=name + "_3_conv")(x) + x = layers.Add(name=name + "_out")([shortcut, x]) + return x + + +def stack2(x, filters, blocks, stride1=2, name=None): + """A set of stacked residual blocks. + + Args: + x: input tensor. + filters: integer, filters of the bottleneck layer in a block. + blocks: integer, blocks in the stacked blocks. + stride1: default 2, stride of the first layer in the first block. + name: string, stack label. + + Returns: + Output tensor for the stacked blocks. + """ + x = block2(x, filters, conv_shortcut=True, name=name + "_block1") + for i in range(2, blocks): + x = block2(x, filters, name=name + "_block" + str(i)) + x = block2(x, filters, stride=stride1, name=name + "_block" + str(blocks)) + return x + + +def block3( + x, + filters, + kernel_size=3, + stride=1, + groups=32, + conv_shortcut=True, + name=None, +): + """A residual block. + + Args: x: input tensor. filters: integer, filters of the bottleneck layer. kernel_size: default 3, kernel size of the bottleneck layer. stride: default 1, stride of the first layer. - conv_shortcut: default False, use convolution shortcut if True, - otherwise identity shortcut. + groups: default 32, group size for grouped convolution. + conv_shortcut: default True, use convolution shortcut if True, + otherwise identity shortcut. name: string, block label. - Returns: - Output tensor for the residual block. 
- """ - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - - preact = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_preact_bn')(x) - preact = layers.Activation('relu', name=name + '_preact_relu')(preact) - - if conv_shortcut: - shortcut = layers.Conv2D( - 4 * filters, 1, strides=stride, name=name + '_0_conv')(preact) - else: - shortcut = layers.MaxPooling2D(1, strides=stride)(x) if stride > 1 else x - - x = layers.Conv2D( - filters, 1, strides=1, use_bias=False, name=name + '_1_conv')(preact) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(x) - x = layers.Activation('relu', name=name + '_1_relu')(x) - - x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name=name + '_2_pad')(x) - x = layers.Conv2D( - filters, - kernel_size, - strides=stride, - use_bias=False, - name=name + '_2_conv')(x) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_2_bn')(x) - x = layers.Activation('relu', name=name + '_2_relu')(x) - - x = layers.Conv2D(4 * filters, 1, name=name + '_3_conv')(x) - x = layers.Add(name=name + '_out')([shortcut, x]) - return x + Returns: + Output tensor for the residual block. + """ + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + if conv_shortcut: + shortcut = layers.Conv2D( + (64 // groups) * filters, + 1, + strides=stride, + use_bias=False, + name=name + "_0_conv", + )(x) + shortcut = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_0_bn" + )(shortcut) + else: + shortcut = x + + x = layers.Conv2D(filters, 1, use_bias=False, name=name + "_1_conv")(x) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_1_bn" + )(x) + x = layers.Activation("relu", name=name + "_1_relu")(x) + + c = filters // groups + x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name=name + "_2_pad")(x) + x = layers.DepthwiseConv2D( + kernel_size, + strides=stride, + depth_multiplier=c, + use_bias=False, + name=name + "_2_conv", + )(x) + x_shape = backend.shape(x)[:-1] + x = backend.reshape(x, backend.concatenate([x_shape, (groups, c, c)])) + x = layers.Lambda( + lambda x: sum(x[:, :, :, :, i] for i in range(c)), + name=name + "_2_reduce", + )(x) + x = backend.reshape(x, backend.concatenate([x_shape, (filters,)])) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_2_bn" + )(x) + x = layers.Activation("relu", name=name + "_2_relu")(x) + x = layers.Conv2D( + (64 // groups) * filters, 1, use_bias=False, name=name + "_3_conv" + )(x) + x = layers.BatchNormalization( + axis=bn_axis, epsilon=1.001e-5, name=name + "_3_bn" + )(x) -def stack2(x, filters, blocks, stride1=2, name=None): - """A set of stacked residual blocks. + x = layers.Add(name=name + "_add")([shortcut, x]) + x = layers.Activation("relu", name=name + "_out")(x) + return x - Args: + +def stack3(x, filters, blocks, stride1=2, groups=32, name=None): + """A set of stacked residual blocks. + + Args: x: input tensor. filters: integer, filters of the bottleneck layer in a block. blocks: integer, blocks in the stacked blocks. stride1: default 2, stride of the first layer in the first block. + groups: default 32, group size for grouped convolution. name: string, stack label. - Returns: + Returns: Output tensor for the stacked blocks. 
- """ - x = block2(x, filters, conv_shortcut=True, name=name + '_block1') - for i in range(2, blocks): - x = block2(x, filters, name=name + '_block' + str(i)) - x = block2(x, filters, stride=stride1, name=name + '_block' + str(blocks)) - return x - - -def block3(x, - filters, - kernel_size=3, - stride=1, - groups=32, - conv_shortcut=True, - name=None): - """A residual block. - - Args: - x: input tensor. - filters: integer, filters of the bottleneck layer. - kernel_size: default 3, kernel size of the bottleneck layer. - stride: default 1, stride of the first layer. - groups: default 32, group size for grouped convolution. - conv_shortcut: default True, use convolution shortcut if True, - otherwise identity shortcut. - name: string, block label. - - Returns: - Output tensor for the residual block. - """ - bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - - if conv_shortcut: - shortcut = layers.Conv2D( - (64 // groups) * filters, - 1, - strides=stride, - use_bias=False, - name=name + '_0_conv')(x) - shortcut = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_0_bn')(shortcut) - else: - shortcut = x - - x = layers.Conv2D(filters, 1, use_bias=False, name=name + '_1_conv')(x) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(x) - x = layers.Activation('relu', name=name + '_1_relu')(x) - - c = filters // groups - x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name=name + '_2_pad')(x) - x = layers.DepthwiseConv2D( - kernel_size, - strides=stride, - depth_multiplier=c, - use_bias=False, - name=name + '_2_conv')(x) - x_shape = backend.shape(x)[:-1] - x = backend.reshape(x, backend.concatenate([x_shape, (groups, c, c)])) - x = layers.Lambda( - lambda x: sum(x[:, :, :, :, i] for i in range(c)), - name=name + '_2_reduce')(x) - x = backend.reshape(x, backend.concatenate([x_shape, (filters,)])) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_2_bn')(x) - x = layers.Activation('relu', name=name + '_2_relu')(x) - - x = layers.Conv2D( - (64 // groups) * filters, 1, use_bias=False, name=name + '_3_conv')(x) - x = layers.BatchNormalization( - axis=bn_axis, epsilon=1.001e-5, name=name + '_3_bn')(x) - - x = layers.Add(name=name + '_add')([shortcut, x]) - x = layers.Activation('relu', name=name + '_out')(x) - return x - - -def stack3(x, filters, blocks, stride1=2, groups=32, name=None): - """A set of stacked residual blocks. - - Args: - x: input tensor. - filters: integer, filters of the bottleneck layer in a block. - blocks: integer, blocks in the stacked blocks. - stride1: default 2, stride of the first layer in the first block. - groups: default 32, group size for grouped convolution. - name: string, stack label. - - Returns: - Output tensor for the stacked blocks. 
- """ - x = block3(x, filters, stride=stride1, groups=groups, name=name + '_block1') - for i in range(2, blocks + 1): - x = block3( - x, - filters, - groups=groups, - conv_shortcut=False, - name=name + '_block' + str(i)) - return x - - -@keras_export('keras.applications.resnet50.ResNet50', - 'keras.applications.resnet.ResNet50', - 'keras.applications.ResNet50') -def ResNet50(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - **kwargs): - """Instantiates the ResNet50 architecture.""" - - def stack_fn(x): - x = stack1(x, 64, 3, stride1=1, name='conv2') - x = stack1(x, 128, 4, name='conv3') - x = stack1(x, 256, 6, name='conv4') - return stack1(x, 512, 3, name='conv5') - - return ResNet(stack_fn, False, True, 'resnet50', include_top, weights, - input_tensor, input_shape, pooling, classes, **kwargs) - - -@keras_export('keras.applications.resnet.ResNet101', - 'keras.applications.ResNet101') -def ResNet101(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - **kwargs): - """Instantiates the ResNet101 architecture.""" - - def stack_fn(x): - x = stack1(x, 64, 3, stride1=1, name='conv2') - x = stack1(x, 128, 4, name='conv3') - x = stack1(x, 256, 23, name='conv4') - return stack1(x, 512, 3, name='conv5') - - return ResNet(stack_fn, False, True, 'resnet101', include_top, weights, - input_tensor, input_shape, pooling, classes, **kwargs) - - -@keras_export('keras.applications.resnet.ResNet152', - 'keras.applications.ResNet152') -def ResNet152(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - **kwargs): - """Instantiates the ResNet152 architecture.""" - - def stack_fn(x): - x = stack1(x, 64, 3, stride1=1, name='conv2') - x = stack1(x, 128, 8, name='conv3') - x = stack1(x, 256, 36, name='conv4') - return stack1(x, 512, 3, name='conv5') - - return ResNet(stack_fn, False, True, 'resnet152', include_top, weights, - input_tensor, input_shape, pooling, classes, **kwargs) - - -@keras_export('keras.applications.resnet50.preprocess_input', - 'keras.applications.resnet.preprocess_input') + """ + x = block3(x, filters, stride=stride1, groups=groups, name=name + "_block1") + for i in range(2, blocks + 1): + x = block3( + x, + filters, + groups=groups, + conv_shortcut=False, + name=name + "_block" + str(i), + ) + return x + + +@keras_export( + "keras.applications.resnet50.ResNet50", + "keras.applications.resnet.ResNet50", + "keras.applications.ResNet50", +) +def ResNet50( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + **kwargs, +): + """Instantiates the ResNet50 architecture.""" + + def stack_fn(x): + x = stack1(x, 64, 3, stride1=1, name="conv2") + x = stack1(x, 128, 4, name="conv3") + x = stack1(x, 256, 6, name="conv4") + return stack1(x, 512, 3, name="conv5") + + return ResNet( + stack_fn, + False, + True, + "resnet50", + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + **kwargs, + ) + + +@keras_export( + "keras.applications.resnet.ResNet101", "keras.applications.ResNet101" +) +def ResNet101( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + **kwargs, +): + """Instantiates the ResNet101 architecture.""" + + def stack_fn(x): + x = stack1(x, 64, 3, stride1=1, name="conv2") + x = stack1(x, 128, 4, name="conv3") + x = stack1(x, 256, 23, name="conv4") + return 
stack1(x, 512, 3, name="conv5") + + return ResNet( + stack_fn, + False, + True, + "resnet101", + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + **kwargs, + ) + + +@keras_export( + "keras.applications.resnet.ResNet152", "keras.applications.ResNet152" +) +def ResNet152( + include_top=True, + weights="imagenet", + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + **kwargs, +): + """Instantiates the ResNet152 architecture.""" + + def stack_fn(x): + x = stack1(x, 64, 3, stride1=1, name="conv2") + x = stack1(x, 128, 8, name="conv3") + x = stack1(x, 256, 36, name="conv4") + return stack1(x, 512, 3, name="conv5") + + return ResNet( + stack_fn, + False, + True, + "resnet152", + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + **kwargs, + ) + + +@keras_export( + "keras.applications.resnet50.preprocess_input", + "keras.applications.resnet.preprocess_input", +) def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input( - x, data_format=data_format, mode='caffe') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="caffe" + ) -@keras_export('keras.applications.resnet50.decode_predictions', - 'keras.applications.resnet.decode_predictions') +@keras_export( + "keras.applications.resnet50.decode_predictions", + "keras.applications.resnet.decode_predictions", +) def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ DOC = """ @@ -580,6 +688,6 @@ def decode_predictions(preds, top=5): A Keras model instance. """ -setattr(ResNet50, '__doc__', ResNet50.__doc__ + DOC) -setattr(ResNet101, '__doc__', ResNet101.__doc__ + DOC) -setattr(ResNet152, '__doc__', ResNet152.__doc__ + DOC) +setattr(ResNet50, "__doc__", ResNet50.__doc__ + DOC) +setattr(ResNet101, "__doc__", ResNet101.__doc__ + DOC) +setattr(ResNet152, "__doc__", ResNet152.__doc__ + DOC) diff --git a/keras/applications/resnet_rs.py b/keras/applications/resnet_rs.py index 6b4baa117862..eafa79ec0c69 100644 --- a/keras/applications/resnet_rs.py +++ b/keras/applications/resnet_rs.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=missing-function-docstring + + """ResNet-RS models for Keras. 
Reference: @@ -21,7 +21,12 @@ https://arxiv.org/pdf/2103.07579.pdf) """ import sys -from typing import Callable, Dict, List, Union +from typing import Callable +from typing import Dict +from typing import List +from typing import Union + +import tensorflow.compat.v2 as tf from keras import backend from keras import layers @@ -29,13 +34,13 @@ from keras.engine import training from keras.utils import data_utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf - +# isort: off from tensorflow.python.util.tf_export import keras_export -BASE_WEIGHTS_URL = ("https://storage.googleapis.com/tensorflow/" - "keras-applications/resnet_rs/") +BASE_WEIGHTS_URL = ( + "https://storage.googleapis.com/tensorflow/keras-applications/resnet_rs/" +) WEIGHT_HASHES = { "resnet-rs-101-i160.h5": "544b3434d00efc199d66e9058c7f3379", @@ -73,130 +78,46 @@ } BLOCK_ARGS = { 50: [ - { - "input_filters": 64, - "num_repeats": 3 - }, - { - "input_filters": 128, - "num_repeats": 4 - }, - { - "input_filters": 256, - "num_repeats": 6 - }, - { - "input_filters": 512, - "num_repeats": 3 - }, + {"input_filters": 64, "num_repeats": 3}, + {"input_filters": 128, "num_repeats": 4}, + {"input_filters": 256, "num_repeats": 6}, + {"input_filters": 512, "num_repeats": 3}, ], 101: [ - { - "input_filters": 64, - "num_repeats": 3 - }, - { - "input_filters": 128, - "num_repeats": 4 - }, - { - "input_filters": 256, - "num_repeats": 23 - }, - { - "input_filters": 512, - "num_repeats": 3 - }, + {"input_filters": 64, "num_repeats": 3}, + {"input_filters": 128, "num_repeats": 4}, + {"input_filters": 256, "num_repeats": 23}, + {"input_filters": 512, "num_repeats": 3}, ], 152: [ - { - "input_filters": 64, - "num_repeats": 3 - }, - { - "input_filters": 128, - "num_repeats": 8 - }, - { - "input_filters": 256, - "num_repeats": 36 - }, - { - "input_filters": 512, - "num_repeats": 3 - }, + {"input_filters": 64, "num_repeats": 3}, + {"input_filters": 128, "num_repeats": 8}, + {"input_filters": 256, "num_repeats": 36}, + {"input_filters": 512, "num_repeats": 3}, ], 200: [ - { - "input_filters": 64, - "num_repeats": 3 - }, - { - "input_filters": 128, - "num_repeats": 24 - }, - { - "input_filters": 256, - "num_repeats": 36 - }, - { - "input_filters": 512, - "num_repeats": 3 - }, + {"input_filters": 64, "num_repeats": 3}, + {"input_filters": 128, "num_repeats": 24}, + {"input_filters": 256, "num_repeats": 36}, + {"input_filters": 512, "num_repeats": 3}, ], 270: [ - { - "input_filters": 64, - "num_repeats": 4 - }, - { - "input_filters": 128, - "num_repeats": 29 - }, - { - "input_filters": 256, - "num_repeats": 53 - }, - { - "input_filters": 512, - "num_repeats": 4 - }, + {"input_filters": 64, "num_repeats": 4}, + {"input_filters": 128, "num_repeats": 29}, + {"input_filters": 256, "num_repeats": 53}, + {"input_filters": 512, "num_repeats": 4}, ], 350: [ - { - "input_filters": 64, - "num_repeats": 4 - }, - { - "input_filters": 128, - "num_repeats": 36 - }, - { - "input_filters": 256, - "num_repeats": 72 - }, - { - "input_filters": 512, - "num_repeats": 4 - }, + {"input_filters": 64, "num_repeats": 4}, + {"input_filters": 128, "num_repeats": 36}, + {"input_filters": 256, "num_repeats": 72}, + {"input_filters": 512, "num_repeats": 4}, ], 420: [ - { - "input_filters": 64, - "num_repeats": 4 - }, - { - "input_filters": 128, - "num_repeats": 44 - }, - { - "input_filters": 256, - "num_repeats": 87 - }, - { - "input_filters": 512, - "num_repeats": 4 - }, + {"input_filters": 64, "num_repeats": 4}, + {"input_filters": 128, "num_repeats": 44}, + 
{"input_filters": 256, "num_repeats": 87}, + {"input_filters": 512, "num_repeats": 4}, ], } CONV_KERNEL_INITIALIZER = { @@ -204,7 +125,7 @@ "config": { "scale": 2.0, "mode": "fan_out", - "distribution": "truncated_normal" + "distribution": "truncated_normal", }, } @@ -272,10 +193,12 @@ specified. classifier_activation: A `str` or callable. The activation function to use on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - include_preprocessing: Boolean, whether to include the preprocessing layer - (`Rescaling`) at the bottom of the network. Defaults to `True`. - Note: Input image is normalized by ImageNet mean and standard deviation. + `classifier_activation=None` to return the logits of the "top" + layer. + include_preprocessing: Boolean, whether to include the preprocessing + layer (`Rescaling`) at the bottom of the network. Note: Input image + is normalized by ImageNet mean and standard deviation. + Defaults to `True`. Returns: A `keras.Model` instance. @@ -283,25 +206,25 @@ def Conv2DFixedPadding(filters, kernel_size, strides, name=None): - """Conv2D block with fixed padding.""" - if name is None: - counter = backend.get_uid("conv_") - name = f"conv_{counter}" - - def apply(inputs): - if strides > 1: - inputs = fixed_padding(inputs, kernel_size) - return layers.Conv2D( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding="same" if strides == 1 else "valid", - use_bias=False, - kernel_initializer=CONV_KERNEL_INITIALIZER, - name=name, - )(inputs) + """Conv2D block with fixed padding.""" + if name is None: + counter = backend.get_uid("conv_") + name = f"conv_{counter}" + + def apply(inputs): + if strides > 1: + inputs = fixed_padding(inputs, kernel_size) + return layers.Conv2D( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding="same" if strides == 1 else "valid", + use_bias=False, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name, + )(inputs) - return apply + return apply def STEM( @@ -310,111 +233,112 @@ def STEM( activation: str = "relu", name=None, ): - """ResNet-D type STEM block.""" - if name is None: - counter = backend.get_uid("stem_") - name = f"stem_{counter}" + """ResNet-D type STEM block.""" + if name is None: + counter = backend.get_uid("stem_") + name = f"stem_{counter}" - def apply(inputs): - bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + def apply(inputs): + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - # First stem block - x = Conv2DFixedPadding( - filters=32, - kernel_size=3, - strides=2, - name=name + "_stem_conv_1" - )(inputs) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "_stem_batch_norm_1", - )(x) - x = layers.Activation(activation, name=name + "_stem_act_1")(x) - - # Second stem block - x = Conv2DFixedPadding( - filters=32, kernel_size=3, strides=1, name=name + "_stem_conv_2")(x) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "_stem_batch_norm_2", - )(x) - x = layers.Activation(activation, name=name + "_stem_act_2")(x) - - # Final Stem block: - x = Conv2DFixedPadding( - filters=64, kernel_size=3, strides=1, name=name + "_stem_conv_3")(x) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "_stem_batch_norm_3", - )(x) - x = layers.Activation(activation, name=name + "_stem_act_3")(x) - - # Replace stem max 
pool: - x = Conv2DFixedPadding( - filters=64, kernel_size=3, strides=2, name=name + "_stem_conv_4")(x) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "_stem_batch_norm_4", - )(x) - x = layers.Activation(activation, name=name + "_stem_act_4")(x) - return x - - return apply - - -def SE(in_filters: int, - se_ratio: float = 0.25, - expand_ratio: int = 1, - name=None): - """Squeeze and Excitation block.""" - bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - if name is None: - counter = backend.get_uid("se_") - name = f"se_{counter}" - - def apply(inputs): - x = layers.GlobalAveragePooling2D(name=name + "_se_squeeze")(inputs) - if bn_axis == 1: - se_shape = (x.shape[-1], 1, 1) - else: - se_shape = (1, 1, x.shape[-1]) - x = layers.Reshape(se_shape, name=name + "_se_reshape")(x) - - num_reduced_filters = max(1, int(in_filters * 4 * se_ratio)) - - x = layers.Conv2D( - filters=num_reduced_filters, - kernel_size=[1, 1], - strides=[1, 1], - kernel_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - use_bias=True, - activation="relu", - name=name + "_se_reduce", - )(x) - - x = layers.Conv2D( - filters=4 * in_filters * expand_ratio, # Expand ratio is 1 by default - kernel_size=[1, 1], - strides=[1, 1], - kernel_initializer=CONV_KERNEL_INITIALIZER, - padding="same", - use_bias=True, - activation="sigmoid", - name=name + "_se_expand", - )(x) + # First stem block + x = Conv2DFixedPadding( + filters=32, kernel_size=3, strides=2, name=name + "_stem_conv_1" + )(inputs) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "_stem_batch_norm_1", + )(x) + x = layers.Activation(activation, name=name + "_stem_act_1")(x) + + # Second stem block + x = Conv2DFixedPadding( + filters=32, kernel_size=3, strides=1, name=name + "_stem_conv_2" + )(x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "_stem_batch_norm_2", + )(x) + x = layers.Activation(activation, name=name + "_stem_act_2")(x) + + # Final Stem block: + x = Conv2DFixedPadding( + filters=64, kernel_size=3, strides=1, name=name + "_stem_conv_3" + )(x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "_stem_batch_norm_3", + )(x) + x = layers.Activation(activation, name=name + "_stem_act_3")(x) + + # Replace stem max pool: + x = Conv2DFixedPadding( + filters=64, kernel_size=3, strides=2, name=name + "_stem_conv_4" + )(x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "_stem_batch_norm_4", + )(x) + x = layers.Activation(activation, name=name + "_stem_act_4")(x) + return x + + return apply + + +def SE( + in_filters: int, se_ratio: float = 0.25, expand_ratio: int = 1, name=None +): + """Squeeze and Excitation block.""" + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + if name is None: + counter = backend.get_uid("se_") + name = f"se_{counter}" + + def apply(inputs): + x = layers.GlobalAveragePooling2D(name=name + "_se_squeeze")(inputs) + if bn_axis == 1: + se_shape = (x.shape[-1], 1, 1) + else: + se_shape = (1, 1, x.shape[-1]) + x = layers.Reshape(se_shape, name=name + "_se_reshape")(x) + + num_reduced_filters = max(1, int(in_filters * 4 * se_ratio)) + + x = layers.Conv2D( + filters=num_reduced_filters, + kernel_size=[1, 1], + strides=[1, 1], + kernel_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + use_bias=True, + 
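For context on the pattern being reformatted here: `Conv2DFixedPadding`, `STEM`, `SE`, and the block builders below are layer factories that return an `apply` closure instead of subclassing `Layer`. A minimal self-contained sketch of that style, with illustrative names and shapes that are not part of this patch:

```python
import tensorflow as tf
from tensorflow.keras import layers


def ConvBNAct(filters, kernel_size, name):
    """Factory in the same style: returns an `apply` closure."""

    def apply(inputs):
        x = layers.Conv2D(
            filters,
            kernel_size,
            padding="same",
            use_bias=False,
            name=name + "_conv",
        )(inputs)
        x = layers.BatchNormalization(name=name + "_bn")(x)
        return layers.Activation("relu", name=name + "_act")(x)

    return apply


inputs = tf.keras.Input(shape=(224, 224, 3))
outputs = ConvBNAct(32, 3, name="demo")(inputs)
model = tf.keras.Model(inputs, outputs)  # composes like any functional model
```

Because each factory call only captures configuration, blocks chain with plain function application, which is what the `x = ...(x)` lines throughout this file do.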
activation="relu", + name=name + "_se_reduce", + )(x) + + x = layers.Conv2D( + filters=4 + * in_filters + * expand_ratio, # Expand ratio is 1 by default + kernel_size=[1, 1], + strides=[1, 1], + kernel_initializer=CONV_KERNEL_INITIALIZER, + padding="same", + use_bias=True, + activation="sigmoid", + name=name + "_se_expand", + )(x) - return layers.multiply([inputs, x], name=name + "_se_excite") + return layers.multiply([inputs, x], name=name + "_se_excite") - return apply + return apply def BottleneckBlock( @@ -428,98 +352,100 @@ def BottleneckBlock( survival_probability: float = 0.8, name=None, ): - """Bottleneck block variant for residual networks with BN.""" - if name is None: - counter = backend.get_uid("block_0_") - name = f"block_0_{counter}" - - def apply(inputs): - bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - - shortcut = inputs - - if use_projection: - filters_out = filters * 4 - if strides == 2: - shortcut = layers.AveragePooling2D( - pool_size=(2, 2), - strides=(2, 2), - padding="same", - name=name + "_projection_pooling", + """Bottleneck block variant for residual networks with BN.""" + if name is None: + counter = backend.get_uid("block_0_") + name = f"block_0_{counter}" + + def apply(inputs): + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + shortcut = inputs + + if use_projection: + filters_out = filters * 4 + if strides == 2: + shortcut = layers.AveragePooling2D( + pool_size=(2, 2), + strides=(2, 2), + padding="same", + name=name + "_projection_pooling", + )(inputs) + shortcut = Conv2DFixedPadding( + filters=filters_out, + kernel_size=1, + strides=1, + name=name + "_projection_conv", + )(shortcut) + else: + shortcut = Conv2DFixedPadding( + filters=filters_out, + kernel_size=1, + strides=strides, + name=name + "_projection_conv", + )(inputs) + + shortcut = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "_projection_batch_norm", + )(shortcut) + + # First conv layer: + x = Conv2DFixedPadding( + filters=filters, kernel_size=1, strides=1, name=name + "_conv_1" )(inputs) - shortcut = Conv2DFixedPadding( - filters=filters_out, - kernel_size=1, - strides=1, - name=name + "_projection_conv", - )(shortcut) - else: - shortcut = Conv2DFixedPadding( - filters=filters_out, - kernel_size=1, + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "batch_norm_1", + )(x) + x = layers.Activation(activation, name=name + "_act_1")(x) + + # Second conv layer: + x = Conv2DFixedPadding( + filters=filters, + kernel_size=3, strides=strides, - name=name + "_projection_conv", - )(inputs) - - shortcut = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "_projection_batch_norm", - )(shortcut) - - # First conv layer: - x = Conv2DFixedPadding( - filters=filters, - kernel_size=1, - strides=1, - name=name + "_conv_1" - )(inputs) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "batch_norm_1", - )(x) - x = layers.Activation(activation, name=name + "_act_1")(x) - - # Second conv layer: - x = Conv2DFixedPadding( - filters=filters, kernel_size=3, strides=strides, name=name + "_conv_2")( - x) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "_batch_norm_2", - )(x) - x = layers.Activation(activation, name=name + "_act_2")(x) - - # Third conv layer: - x = Conv2DFixedPadding( - 
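A dependency-free check of the Squeeze-and-Excitation filter arithmetic in the `SE` block above, using 64 input filters and the default ratios as example values:

```python
# Squeeze path: max(1, int(in_filters * 4 * se_ratio)) channels.
# Excite path: back to the bottleneck's 4 * in_filters output width.
in_filters, se_ratio, expand_ratio = 64, 0.25, 1

num_reduced = max(1, int(in_filters * 4 * se_ratio))  # 64
num_expanded = 4 * in_filters * expand_ratio  # 256

assert (num_reduced, num_expanded) == (64, 256)
```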
filters=filters * 4, kernel_size=1, strides=1, name=name + "_conv_3")(x) - x = layers.BatchNormalization( - axis=bn_axis, - momentum=bn_momentum, - epsilon=bn_epsilon, - name=name + "_batch_norm_3", - )(x) - - if 0 < se_ratio < 1: - x = SE(filters, se_ratio=se_ratio, name=name + "_se")(x) - - # Drop connect - if survival_probability: - x = layers.Dropout( - survival_probability, - noise_shape=(None, 1, 1, 1), - name=name + "_drop")(x) - - x = layers.Add()([x, shortcut]) - - return layers.Activation(activation, name=name + "_output_act")(x) - - return apply + name=name + "_conv_2", + )(x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "_batch_norm_2", + )(x) + x = layers.Activation(activation, name=name + "_act_2")(x) + + # Third conv layer: + x = Conv2DFixedPadding( + filters=filters * 4, kernel_size=1, strides=1, name=name + "_conv_3" + )(x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + "_batch_norm_3", + )(x) + + if 0 < se_ratio < 1: + x = SE(filters, se_ratio=se_ratio, name=name + "_se")(x) + + # Drop connect + if survival_probability: + x = layers.Dropout( + survival_probability, + noise_shape=(None, 1, 1, 1), + name=name + "_drop", + )(x) + + x = layers.Add()([x, shortcut]) + + return layers.Activation(activation, name=name + "_output_act")(x) + + return apply def BlockGroup( @@ -533,65 +459,68 @@ def BlockGroup( survival_probability: float = 0.8, name=None, ): - """Create one group of blocks for the ResNet model.""" - if name is None: - counter = backend.get_uid("block_group_") - name = f"block_group_{counter}" - - def apply(inputs): - # Only the first block per block_group uses projection shortcut and strides. - x = BottleneckBlock( - filters=filters, - strides=strides, - use_projection=True, - se_ratio=se_ratio, - bn_epsilon=bn_epsilon, - bn_momentum=bn_momentum, - activation=activation, - survival_probability=survival_probability, - name=name + "_block_0_", - )(inputs) + """Create one group of blocks for the ResNet model.""" + if name is None: + counter = backend.get_uid("block_group_") + name = f"block_group_{counter}" + + def apply(inputs): + # Only the first block per block_group uses projection shortcut and + # strides. 
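The `Dropout(..., noise_shape=(None, 1, 1, 1))` call above implements drop connect: the mask is broadcast over height, width, and channels, so each sample's residual branch is either kept whole or zeroed entirely. A hedged standalone demonstration; the shapes and the 0.5 rate are arbitrary example choices:

```python
import tensorflow as tf

x = tf.ones((8, 4, 4, 16))
branch_drop = tf.keras.layers.Dropout(0.5, noise_shape=(None, 1, 1, 1))
y = branch_drop(x, training=True)

# Per sample the branch is all zeros or uniformly rescaled by
# 1 / (1 - rate); it is never partially dropped.
per_sample = tf.reduce_sum(tf.abs(y), axis=[1, 2, 3])
print(per_sample.numpy())  # each entry is 0.0 or 512.0 (4 * 4 * 16 * 2)
```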
+ x = BottleneckBlock( + filters=filters, + strides=strides, + use_projection=True, + se_ratio=se_ratio, + bn_epsilon=bn_epsilon, + bn_momentum=bn_momentum, + activation=activation, + survival_probability=survival_probability, + name=name + "_block_0_", + )(inputs) - for i in range(1, num_repeats): - x = BottleneckBlock( - filters=filters, - strides=1, - use_projection=False, - se_ratio=se_ratio, - activation=activation, - bn_epsilon=bn_epsilon, - bn_momentum=bn_momentum, - survival_probability=survival_probability, - name=name + f"_block_{i}_", - )(x) - return x + for i in range(1, num_repeats): + x = BottleneckBlock( + filters=filters, + strides=1, + use_projection=False, + se_ratio=se_ratio, + activation=activation, + bn_epsilon=bn_epsilon, + bn_momentum=bn_momentum, + survival_probability=survival_probability, + name=name + f"_block_{i}_", + )(x) + return x - return apply + return apply def get_survival_probability(init_rate, block_num, total_blocks): - """Get survival probability based on block number and initial rate.""" - return init_rate * float(block_num) / total_blocks + """Get survival probability based on block number and initial rate.""" + return init_rate * float(block_num) / total_blocks def allow_bigger_recursion(target_limit: int): - """Increase default recursion limit to create larger models.""" - current_limit = sys.getrecursionlimit() - if current_limit < target_limit: - sys.setrecursionlimit(target_limit) + """Increase default recursion limit to create larger models.""" + current_limit = sys.getrecursionlimit() + if current_limit < target_limit: + sys.setrecursionlimit(target_limit) def fixed_padding(inputs, kernel_size): - """Pad the input along the spatial dimensions independently of input size.""" - pad_total = kernel_size - 1 - pad_beg = pad_total // 2 - pad_end = pad_total - pad_beg - - # Use ZeroPadding as to avoid TFOpLambda layer - padded_inputs = layers.ZeroPadding2D( - padding=((pad_beg, pad_end), (pad_beg, pad_end)))(inputs) + """Pad the input along the spatial dimensions independently of input + size.""" + pad_total = kernel_size - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + + # Use ZeroPadding as to avoid TFOpLambda layer + padded_inputs = layers.ZeroPadding2D( + padding=((pad_beg, pad_end), (pad_beg, pad_end)) + )(inputs) - return padded_inputs + return padded_inputs def ResNetRS( @@ -610,193 +539,204 @@ def ResNetRS( weights="imagenet", input_tensor=None, classes=1000, - # pylint: disable=g-bare-generic classifier_activation: Union[str, Callable] = "softmax", include_preprocessing=True, ): - """Build Resnet-RS model, given provided parameters. - - Args: - depth: Depth of ResNet network. - input_shape: optional shape tuple. It should have exactly 3 inputs - channels, and width and height should be no smaller than 32. E.g. (200, - 200, 3) would be one valid value. - bn_momentum: Momentum parameter for Batch Normalization layers. - bn_epsilon: Epsilon parameter for Batch Normalization layers. - activation: activation function. - se_ratio: Squeeze and Excitation layer ratio. - dropout_rate: dropout rate before final classifier layer. - drop_connect_rate: dropout rate at skip connections. - include_top: whether to include the fully-connected layer at the top of - the network. - block_args: list of dicts, parameters to construct block modules. - model_name: name of the model. - pooling: optional pooling mode for feature extraction when `include_top` - is `False`. 
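The `fixed_padding` helper above distributes `kernel_size - 1` pixels of padding across the two sides, so strided convolutions see the same effective padding regardless of input size. The arithmetic, checked standalone:

```python
for kernel_size in (3, 5, 7):
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    print(kernel_size, (pad_beg, pad_end))  # 3:(1, 1), 5:(2, 2), 7:(3, 3)
```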
- `None` means that the output of the model will be the 4D - tensor output of the last convolutional layer. - `avg` means that global - average pooling will be applied to the output of the last convolutional - layer, and thus the output of the model will be a 2D tensor. - `max` - means that global max pooling will be applied. - weights: one of `None` (random initialization), `'imagenet'` (pre-training - on ImageNet), or the path to the weights file to be loaded. Note- one - model can have multiple imagenet variants depending on input shape it - was trained with. For input_shape 224x224 pass `imagenet-i224` as - argument. By default, highest input shape weights are downloaded. - input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to - use as image input for the model. - classes: optional number of classes to classify images into, only to be - specified if `include_top` is True, and if no `weights` argument is - specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - include_preprocessing: Boolean, whether to include the preprocessing layer - (`Rescaling`) at the bottom of the network. Defaults to `True`. Note- - Input image is normalized by ImageNet mean and standard deviation. - - Returns: - A `tf.keras.Model` instance. - - Raises: - ValueError: in case of invalid argument for `weights`, or invalid input - shape. - ValueError: if `classifier_activation` is not `softmax` or `None` when - using a pretrained top layer. - """ - # Validate parameters - available_weight_variants = DEPTH_TO_WEIGHT_VARIANTS[depth] - if weights == "imagenet": - max_input_shape = max(available_weight_variants) - # `imagenet` argument without explicit weights input size. - # Picking weights trained with biggest available shape - weights = f"{weights}-i{max_input_shape}" - - weights_allow_list = [f"imagenet-i{x}" for x in available_weight_variants] - if not (weights in {*weights_allow_list, None} or - tf.io.gfile.exists(weights)): - raise ValueError( - "The `weights` argument should be either " - "`None` (random initialization), `'imagenet'` " - "(pre-training on ImageNet, with highest available input shape)," - " or the path to the weights file to be loaded. " - f"For ResNetRS{depth} the following weight variants are " - f"available {weights_allow_list} (default=highest)." - f" Received weights={weights}") - - if weights in weights_allow_list and include_top and classes != 1000: - raise ValueError( - f"If using `weights` as `'imagenet'` or any of {weights_allow_list} " - f"with `include_top` as true, `classes` should be 1000. 
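The `get_survival_probability` helper defined above turns `drop_connect_rate` into a linear stochastic-depth schedule, with deeper block groups dropped more often. Worked through for a four-group model with `drop_connect_rate=0.1`; the group count mirrors the depth-50 configuration:

```python
def get_survival_probability(init_rate, block_num, total_blocks):
    return init_rate * float(block_num) / total_blocks


num_groups = 4  # e.g. BLOCK_ARGS[50] defines four block groups
rates = [
    get_survival_probability(0.1, i + 2, num_groups + 1)
    for i in range(num_groups)
]
print(rates)  # approximately [0.04, 0.06, 0.08, 0.1]
```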
" - f"Received classes={classes}") - - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=224, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights, - ) - # Define input tensor - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 - - x = img_input - - if include_preprocessing: - num_channels = input_shape[bn_axis - 1] - x = layers.Rescaling(scale=1.0 / 255)(x) - if num_channels == 3: - x = layers.Normalization( - mean=[0.485, 0.456, 0.406], - variance=[0.229**2, 0.224**2, 0.225**2], - axis=bn_axis, - )(x) - - # Build stem - x = STEM( - bn_momentum=bn_momentum, bn_epsilon=bn_epsilon, activation=activation)(x) - - # Build blocks - if block_args is None: - block_args = BLOCK_ARGS[depth] - - for i, args in enumerate(block_args): - survival_probability = get_survival_probability( - init_rate=drop_connect_rate, - block_num=i + 2, - total_blocks=len(block_args) + 1, + """Build Resnet-RS model, given provided parameters. + + Args: + depth: Depth of ResNet network. + input_shape: optional shape tuple. It should have exactly 3 inputs + channels, and width and height should be no smaller than 32. E.g. + (200, 200, 3) would be one valid value. + bn_momentum: Momentum parameter for Batch Normalization layers. + bn_epsilon: Epsilon parameter for Batch Normalization layers. + activation: activation function. + se_ratio: Squeeze and Excitation layer ratio. + dropout_rate: dropout rate before final classifier layer. + drop_connect_rate: dropout rate at skip connections. + include_top: whether to include the fully-connected layer at the top of + the network. + block_args: list of dicts, parameters to construct block modules. + model_name: name of the model. + pooling: optional pooling mode for feature extraction when `include_top` + is `False`. + - `None` means that the output of the model will be the 4D tensor + output of the last convolutional layer. + - `avg` means that global average pooling will be applied to the + output of the last convolutional layer, and thus the output of the + model will be a 2D tensor. + - `max` means that global max pooling will be applied. + weights: one of `None` (random initialization), `'imagenet'` + (pre-training on ImageNet), or the path to the weights file to be + loaded. Note- one model can have multiple imagenet variants depending + on input shape it was trained with. For input_shape 224x224 pass + `imagenet-i224` as argument. By default, highest input shape weights + are downloaded. + input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to + use as image input for the model. + classes: optional number of classes to classify images into, only to be + specified if `include_top` is True, and if no `weights` argument is + specified. + classifier_activation: A `str` or callable. The activation function to + use on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + include_preprocessing: Boolean, whether to include the preprocessing + layer (`Rescaling`) at the bottom of the network. Note - Input image + is normalized by ImageNet mean and standard deviation. + Defaults to `True`. + + + Returns: + A `tf.keras.Model` instance. 
+ + Raises: + ValueError: in case of invalid argument for `weights`, or invalid input + shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. + """ + # Validate parameters + available_weight_variants = DEPTH_TO_WEIGHT_VARIANTS[depth] + if weights == "imagenet": + max_input_shape = max(available_weight_variants) + # `imagenet` argument without explicit weights input size. + # Picking weights trained with biggest available shape + weights = f"{weights}-i{max_input_shape}" + + weights_allow_list = [f"imagenet-i{x}" for x in available_weight_variants] + if not ( + weights in {*weights_allow_list, None} or tf.io.gfile.exists(weights) + ): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `'imagenet'` " + "(pre-training on ImageNet, with highest available input shape)," + " or the path to the weights file to be loaded. " + f"For ResNetRS{depth} the following weight variants are " + f"available {weights_allow_list} (default=highest)." + f" Received weights={weights}" + ) + + if weights in weights_allow_list and include_top and classes != 1000: + raise ValueError( + "If using `weights` as `'imagenet'` or any " + f"of {weights_allow_list} " + "with `include_top` as true, `classes` should be 1000. " + f"Received classes={classes}" + ) + + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=224, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, ) + # Define input tensor + if input_tensor is None: + img_input = layers.Input(shape=input_shape) + else: + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor - x = BlockGroup( - filters=args["input_filters"], - activation=activation, - strides=(1 if i == 0 else 2), - num_repeats=args["num_repeats"], - se_ratio=se_ratio, - bn_momentum=bn_momentum, - bn_epsilon=bn_epsilon, - survival_probability=survival_probability, - name=f"BlockGroup{i + 2}_", + bn_axis = 3 if backend.image_data_format() == "channels_last" else 1 + + x = img_input + + if include_preprocessing: + num_channels = input_shape[bn_axis - 1] + x = layers.Rescaling(scale=1.0 / 255)(x) + if num_channels == 3: + x = layers.Normalization( + mean=[0.485, 0.456, 0.406], + variance=[0.229**2, 0.224**2, 0.225**2], + axis=bn_axis, + )(x) + + # Build stem + x = STEM( + bn_momentum=bn_momentum, bn_epsilon=bn_epsilon, activation=activation )(x) - # Build head: - if include_top: - x = layers.GlobalAveragePooling2D(name="avg_pool")(x) - if dropout_rate > 0: - x = layers.Dropout(dropout_rate, name="top_dropout")(x) - - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense( - classes, activation=classifier_activation, name="predictions")(x) - else: - if pooling == "avg": - x = layers.GlobalAveragePooling2D(name="avg_pool")(x) - elif pooling == "max": - x = layers.GlobalMaxPooling2D(name="max_pool")(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - - # Create model. - model = training.Model(inputs, x, name=model_name) - - # Download weights - if weights in weights_allow_list: - weights_input_shape = weights.split("-")[-1] # e. g. 
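How the bare `weights="imagenet"` shorthand is resolved in the validation above: the variant trained at the largest input shape is selected, then checked against the allow list. A sketch with an assumed mapping; the real `DEPTH_TO_WEIGHT_VARIANTS` is defined earlier in the module and is not shown in this hunk:

```python
DEPTH_TO_WEIGHT_VARIANTS = {50: [160]}  # assumed entry, for illustration

depth, weights = 50, "imagenet"
available = DEPTH_TO_WEIGHT_VARIANTS[depth]
if weights == "imagenet":
    weights = f"imagenet-i{max(available)}"  # "imagenet-i160"

weights_allow_list = [f"imagenet-i{x}" for x in available]
assert weights in weights_allow_list
```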
"i160" - weights_name = f"{model_name}-{weights_input_shape}" - if not include_top: - weights_name += "_notop" - - filename = f"{weights_name}.h5" - download_url = BASE_WEIGHTS_URL + filename - weights_path = data_utils.get_file( - fname=filename, - origin=download_url, - cache_subdir="models", - file_hash=WEIGHT_HASHES[filename], - ) - model.load_weights(weights_path) + # Build blocks + if block_args is None: + block_args = BLOCK_ARGS[depth] + + for i, args in enumerate(block_args): + survival_probability = get_survival_probability( + init_rate=drop_connect_rate, + block_num=i + 2, + total_blocks=len(block_args) + 1, + ) + + x = BlockGroup( + filters=args["input_filters"], + activation=activation, + strides=(1 if i == 0 else 2), + num_repeats=args["num_repeats"], + se_ratio=se_ratio, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, + survival_probability=survival_probability, + name=f"BlockGroup{i + 2}_", + )(x) + + # Build head: + if include_top: + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + if dropout_rate > 0: + x = layers.Dropout(dropout_rate, name="top_dropout")(x) + + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D(name="max_pool")(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + model = training.Model(inputs, x, name=model_name) - elif weights is not None: - model.load_weights(weights) + # Download weights + if weights in weights_allow_list: + weights_input_shape = weights.split("-")[-1] # e. g. 
"i160" + weights_name = f"{model_name}-{weights_input_shape}" + if not include_top: + weights_name += "_notop" - return model + filename = f"{weights_name}.h5" + download_url = BASE_WEIGHTS_URL + filename + weights_path = data_utils.get_file( + fname=filename, + origin=download_url, + cache_subdir="models", + file_hash=WEIGHT_HASHES[filename], + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) -@keras_export("keras.applications.resnet_rs.ResNetRS50", - "keras.applications.ResNetRS50") + return model + + +@keras_export( + "keras.applications.resnet_rs.ResNetRS50", "keras.applications.ResNetRS50" +) def ResNetRS50( include_top=True, weights="imagenet", @@ -807,25 +747,26 @@ def ResNetRS50( classifier_activation="softmax", include_preprocessing=True, ): - """Build ResNet-RS50 model.""" - return ResNetRS( - depth=50, - include_top=include_top, - drop_connect_rate=0.0, - dropout_rate=0.25, - weights=weights, - classes=classes, - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - classifier_activation=classifier_activation, - model_name="resnet-rs-50", - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.resnet_rs.ResNetRS101", - "keras.applications.ResNetRS101") + """Build ResNet-RS50 model.""" + return ResNetRS( + depth=50, + include_top=include_top, + drop_connect_rate=0.0, + dropout_rate=0.25, + weights=weights, + classes=classes, + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + classifier_activation=classifier_activation, + model_name="resnet-rs-50", + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.resnet_rs.ResNetRS101", "keras.applications.ResNetRS101" +) def ResNetRS101( include_top=True, weights="imagenet", @@ -836,25 +777,26 @@ def ResNetRS101( classifier_activation="softmax", include_preprocessing=True, ): - """Build ResNet-RS101 model.""" - return ResNetRS( - depth=101, - include_top=include_top, - drop_connect_rate=0.0, - dropout_rate=0.25, - weights=weights, - classes=classes, - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - classifier_activation=classifier_activation, - model_name="resnet-rs-101", - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.resnet_rs.ResNetRS152", - "keras.applications.ResNetRS152") + """Build ResNet-RS101 model.""" + return ResNetRS( + depth=101, + include_top=include_top, + drop_connect_rate=0.0, + dropout_rate=0.25, + weights=weights, + classes=classes, + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + classifier_activation=classifier_activation, + model_name="resnet-rs-101", + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.resnet_rs.ResNetRS152", "keras.applications.ResNetRS152" +) def ResNetRS152( include_top=True, weights="imagenet", @@ -865,25 +807,26 @@ def ResNetRS152( classifier_activation="softmax", include_preprocessing=True, ): - """Build ResNet-RS152 model.""" - return ResNetRS( - depth=152, - include_top=include_top, - drop_connect_rate=0.0, - dropout_rate=0.25, - weights=weights, - classes=classes, - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - classifier_activation=classifier_activation, - model_name="resnet-rs-152", - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.resnet_rs.ResNetRS200", - "keras.applications.ResNetRS200") + """Build ResNet-RS152 model.""" + 
return ResNetRS( + depth=152, + include_top=include_top, + drop_connect_rate=0.0, + dropout_rate=0.25, + weights=weights, + classes=classes, + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + classifier_activation=classifier_activation, + model_name="resnet-rs-152", + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.resnet_rs.ResNetRS200", "keras.applications.ResNetRS200" +) def ResNetRS200( include_top=True, weights="imagenet", @@ -894,25 +837,26 @@ def ResNetRS200( classifier_activation="softmax", include_preprocessing=True, ): - """Build ResNet-RS200 model.""" - return ResNetRS( - depth=200, - include_top=include_top, - drop_connect_rate=0.1, - dropout_rate=0.25, - weights=weights, - classes=classes, - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - classifier_activation=classifier_activation, - model_name="resnet-rs-200", - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.resnet_rs.ResNetRS270", - "keras.applications.ResNetRS270") + """Build ResNet-RS200 model.""" + return ResNetRS( + depth=200, + include_top=include_top, + drop_connect_rate=0.1, + dropout_rate=0.25, + weights=weights, + classes=classes, + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + classifier_activation=classifier_activation, + model_name="resnet-rs-200", + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.resnet_rs.ResNetRS270", "keras.applications.ResNetRS270" +) def ResNetRS270( include_top=True, weights="imagenet", @@ -923,26 +867,27 @@ def ResNetRS270( classifier_activation="softmax", include_preprocessing=True, ): - """Build ResNet-RS-270 model.""" - allow_bigger_recursion(1300) - return ResNetRS( - depth=270, - include_top=include_top, - drop_connect_rate=0.1, - dropout_rate=0.25, - weights=weights, - classes=classes, - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - classifier_activation=classifier_activation, - model_name="resnet-rs-270", - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.resnet_rs.ResNetRS350", - "keras.applications.ResNetRS350") + """Build ResNet-RS-270 model.""" + allow_bigger_recursion(1300) + return ResNetRS( + depth=270, + include_top=include_top, + drop_connect_rate=0.1, + dropout_rate=0.25, + weights=weights, + classes=classes, + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + classifier_activation=classifier_activation, + model_name="resnet-rs-270", + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.resnet_rs.ResNetRS350", "keras.applications.ResNetRS350" +) def ResNetRS350( include_top=True, weights="imagenet", @@ -953,26 +898,27 @@ def ResNetRS350( classifier_activation="softmax", include_preprocessing=True, ): - """Build ResNet-RS350 model.""" - allow_bigger_recursion(1500) - return ResNetRS( - depth=350, - include_top=include_top, - drop_connect_rate=0.1, - dropout_rate=0.4, - weights=weights, - classes=classes, - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - classifier_activation=classifier_activation, - model_name="resnet-rs-350", - include_preprocessing=include_preprocessing, - ) - - -@keras_export("keras.applications.resnet_rs.ResNetRS420", - "keras.applications.ResNetRS420") + """Build ResNet-RS350 model.""" + allow_bigger_recursion(1500) + return ResNetRS( + depth=350, + include_top=include_top, + 
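A hedged usage sketch for the exported builders above, following the usual `tf.keras.applications` calling convention; `weights=None` skips the download, and the 2048-wide feature vector is 4 * 512 from the final block group:

```python
import tensorflow as tf

model = tf.keras.applications.ResNetRS50(
    include_top=False,
    weights=None,
    input_shape=(160, 160, 3),
    pooling="avg",
)
features = model(tf.random.uniform((1, 160, 160, 3)))
print(features.shape)  # (1, 2048)
```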
drop_connect_rate=0.1, + dropout_rate=0.4, + weights=weights, + classes=classes, + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + classifier_activation=classifier_activation, + model_name="resnet-rs-350", + include_preprocessing=include_preprocessing, + ) + + +@keras_export( + "keras.applications.resnet_rs.ResNetRS420", "keras.applications.ResNetRS420" +) def ResNetRS420( include_top=True, weights="imagenet", @@ -983,56 +929,56 @@ def ResNetRS420( classifier_activation="softmax", include_preprocessing=True, ): - """Build ResNet-RS420 model.""" - allow_bigger_recursion(1800) - return ResNetRS( - depth=420, - include_top=include_top, - dropout_rate=0.4, - drop_connect_rate=0.1, - weights=weights, - classes=classes, - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - classifier_activation=classifier_activation, - model_name="resnet-rs-420", - include_preprocessing=include_preprocessing, - ) - - -# pylint: disable=unused-argument + """Build ResNet-RS420 model.""" + allow_bigger_recursion(1800) + return ResNetRS( + depth=420, + include_top=include_top, + dropout_rate=0.4, + drop_connect_rate=0.1, + weights=weights, + classes=classes, + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + classifier_activation=classifier_activation, + model_name="resnet-rs-420", + include_preprocessing=include_preprocessing, + ) + + @keras_export("keras.applications.resnet_rs.preprocess_input") def preprocess_input(x, data_format=None): - """A placeholder method for backward compatibility. + """A placeholder method for backward compatibility. - The preprocessing logic has been included in the ResnetRS model - implementation. Users are no longer required to call this method to - normalize - the input data. This method does nothing and only kept as a placeholder to - align the API surface between old and new version of model. + The preprocessing logic has been included in the ResnetRS model + implementation. Users are no longer required to call this method to + normalize + the input data. This method does nothing and only kept as a placeholder to + align the API surface between old and new version of model. - Args: - x: A floating point `numpy.array` or a `tf.Tensor`. - data_format: Optional data format of the image tensor/array. Defaults to - None, in which case the global setting - `tf.keras.backend.image_data_format()` is used (unless you changed it, - it defaults to "channels_last").{mode} + Args: + x: A floating point `numpy.array` or a `tf.Tensor`. + data_format: Optional data format of the image tensor/array. `None` means + the global setting `tf.keras.backend.image_data_format()` is used + (unless you changed it, it uses "channels_last"). + Defaults to `None`. - Returns: - Unchanged `numpy.array` or `tf.Tensor`. - """ - return x + Returns: + Unchanged `numpy.array` or `tf.Tensor`. 
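Because ResNet-RS bakes `Rescaling` and ImageNet `Normalization` into the graph when `include_preprocessing=True`, the `preprocess_input` above is deliberately an identity. A quick check (array contents are arbitrary):

```python
import numpy as np

from tensorflow.keras.applications import resnet_rs

imgs = np.random.randint(0, 256, size=(2, 224, 224, 3)).astype("float32")
out = resnet_rs.preprocess_input(imgs)
assert out is imgs  # the function returns its input unchanged
```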
+ """ + return x @keras_export("keras.applications.resnet_rs.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ ResNetRS50.__doc__ = BASE_DOCSTRING.format(name="ResNetRS50") +ResNetRS101.__doc__ = BASE_DOCSTRING.format(name="ResNetRS101") ResNetRS152.__doc__ = BASE_DOCSTRING.format(name="ResNetRS152") ResNetRS200.__doc__ = BASE_DOCSTRING.format(name="ResNetRS200") ResNetRS270.__doc__ = BASE_DOCSTRING.format(name="ResNetRS270") diff --git a/keras/applications/resnet_v2.py b/keras/applications/resnet_v2.py index 01c327ae326c..98117d6acbd6 100644 --- a/keras/applications/resnet_v2.py +++ b/keras/applications/resnet_v2.py @@ -12,134 +12,150 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """ResNet v2 models for Keras. Reference: - - [Identity Mappings in Deep Residual Networks] - (https://arxiv.org/abs/1603.05027) (CVPR 2016) + - [Identity Mappings in Deep Residual Networks]( + https://arxiv.org/abs/1603.05027) (CVPR 2016) """ from keras.applications import imagenet_utils from keras.applications import resnet + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.applications.resnet_v2.ResNet50V2', - 'keras.applications.ResNet50V2') +@keras_export( + "keras.applications.resnet_v2.ResNet50V2", "keras.applications.ResNet50V2" +) def ResNet50V2( include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the ResNet50V2 architecture.""" - def stack_fn(x): - x = resnet.stack2(x, 64, 3, name='conv2') - x = resnet.stack2(x, 128, 4, name='conv3') - x = resnet.stack2(x, 256, 6, name='conv4') - return resnet.stack2(x, 512, 3, stride1=1, name='conv5') - - return resnet.ResNet( - stack_fn, - True, - True, - 'resnet50v2', - include_top, - weights, - input_tensor, - input_shape, - pooling, - classes, - classifier_activation=classifier_activation) - - -@keras_export('keras.applications.resnet_v2.ResNet101V2', - 'keras.applications.ResNet101V2') + classifier_activation="softmax", +): + """Instantiates the ResNet50V2 architecture.""" + + def stack_fn(x): + x = resnet.stack2(x, 64, 3, name="conv2") + x = resnet.stack2(x, 128, 4, name="conv3") + x = resnet.stack2(x, 256, 6, name="conv4") + return resnet.stack2(x, 512, 3, stride1=1, name="conv5") + + return resnet.ResNet( + stack_fn, + True, + True, + "resnet50v2", + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.resnet_v2.ResNet101V2", "keras.applications.ResNet101V2" +) def ResNet101V2( include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the ResNet101V2 architecture.""" - def stack_fn(x): - x = resnet.stack2(x, 64, 3, name='conv2') - x = resnet.stack2(x, 128, 4, name='conv3') - x = resnet.stack2(x, 256, 23, name='conv4') - return resnet.stack2(x, 512, 3, stride1=1, name='conv5') - - return resnet.ResNet( - stack_fn, - True, - True, - 'resnet101v2', - include_top, - weights, - input_tensor, - 
input_shape, - pooling, - classes, - classifier_activation=classifier_activation) - - -@keras_export('keras.applications.resnet_v2.ResNet152V2', - 'keras.applications.ResNet152V2') + classifier_activation="softmax", +): + """Instantiates the ResNet101V2 architecture.""" + + def stack_fn(x): + x = resnet.stack2(x, 64, 3, name="conv2") + x = resnet.stack2(x, 128, 4, name="conv3") + x = resnet.stack2(x, 256, 23, name="conv4") + return resnet.stack2(x, 512, 3, stride1=1, name="conv5") + + return resnet.ResNet( + stack_fn, + True, + True, + "resnet101v2", + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation=classifier_activation, + ) + + +@keras_export( + "keras.applications.resnet_v2.ResNet152V2", "keras.applications.ResNet152V2" +) def ResNet152V2( include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the ResNet152V2 architecture.""" - def stack_fn(x): - x = resnet.stack2(x, 64, 3, name='conv2') - x = resnet.stack2(x, 128, 8, name='conv3') - x = resnet.stack2(x, 256, 36, name='conv4') - return resnet.stack2(x, 512, 3, stride1=1, name='conv5') - - return resnet.ResNet( - stack_fn, - True, - True, - 'resnet152v2', - include_top, - weights, - input_tensor, - input_shape, - pooling, - classes, - classifier_activation=classifier_activation) - - -@keras_export('keras.applications.resnet_v2.preprocess_input') + classifier_activation="softmax", +): + """Instantiates the ResNet152V2 architecture.""" + + def stack_fn(x): + x = resnet.stack2(x, 64, 3, name="conv2") + x = resnet.stack2(x, 128, 8, name="conv3") + x = resnet.stack2(x, 256, 36, name="conv4") + return resnet.stack2(x, 512, 3, stride1=1, name="conv5") + + return resnet.ResNet( + stack_fn, + True, + True, + "resnet152v2", + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation=classifier_activation, + ) + + +@keras_export("keras.applications.resnet_v2.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input( - x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="tf" + ) -@keras_export('keras.applications.resnet_v2.decode_predictions') +@keras_export("keras.applications.resnet_v2.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ DOC = """ Reference: - - [Identity Mappings in Deep Residual Networks] - (https://arxiv.org/abs/1603.05027) (CVPR 2016) + - [Identity Mappings in Deep Residual Networks]( + https://arxiv.org/abs/1603.05027) (CVPR 2016) For image classification use cases, see [this page for detailed examples]( @@ -193,6 +209,6 @@ def decode_predictions(preds, top=5): A `keras.Model` instance. 
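The `stack_fn` definitions above encode each network's nominal depth: every `stack2` repetition is a three-convolution bottleneck, and the stem convolution plus the final dense layer account for the remaining two:

```python
repeats = {
    "resnet50v2": [3, 4, 6, 3],
    "resnet101v2": [3, 4, 23, 3],
    "resnet152v2": [3, 8, 36, 3],
}
for name, blocks in repeats.items():
    print(name, sum(blocks) * 3 + 2)  # 50, 101, 152
```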
""" -setattr(ResNet50V2, '__doc__', ResNet50V2.__doc__ + DOC) -setattr(ResNet101V2, '__doc__', ResNet101V2.__doc__ + DOC) -setattr(ResNet152V2, '__doc__', ResNet152V2.__doc__ + DOC) +setattr(ResNet50V2, "__doc__", ResNet50V2.__doc__ + DOC) +setattr(ResNet101V2, "__doc__", ResNet101V2.__doc__ + DOC) +setattr(ResNet152V2, "__doc__", ResNet152V2.__doc__ + DOC) diff --git a/keras/applications/vgg16.py b/keras/applications/vgg16.py index adf633a777f3..f7eebee3d96d 100644 --- a/keras/applications/vgg16.py +++ b/keras/applications/vgg16.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """VGG16 model for Keras. Reference: @@ -28,218 +28,245 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export -WEIGHTS_PATH = ('https://storage.googleapis.com/tensorflow/keras-applications/' - 'vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5') -WEIGHTS_PATH_NO_TOP = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/vgg16/' - 'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5') +WEIGHTS_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/" + "vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5" +) +WEIGHTS_PATH_NO_TOP = ( + "https://storage.googleapis.com/tensorflow/" + "keras-applications/vgg16/" + "vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5" +) layers = VersionAwareLayers() -@keras_export('keras.applications.vgg16.VGG16', 'keras.applications.VGG16') +@keras_export("keras.applications.vgg16.VGG16", "keras.applications.VGG16") def VGG16( include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the VGG16 model. - - Reference: - - [Very Deep Convolutional Networks for Large-Scale Image Recognition]( - https://arxiv.org/abs/1409.1556) (ICLR 2015) - - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - The default input size for this model is 224x224. - - Note: each Keras Application expects a specific kind of input preprocessing. - For VGG16, call `tf.keras.applications.vgg16.preprocess_input` on your - inputs before passing them to the model. - `vgg16.preprocess_input` will convert the input images from RGB to BGR, - then will zero-center each color channel with respect to the ImageNet dataset, - without scaling. - - Args: - include_top: whether to include the 3 fully-connected - layers at the top of the network. - weights: one of `None` (random initialization), - 'imagenet' (pre-training on ImageNet), - or the path to the weights file to be loaded. - input_tensor: optional Keras tensor - (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(224, 224, 3)` - (with `channels_last` data format) - or `(3, 224, 224)` (with `channels_first` data format). 
- It should have exactly 3 input channels, - and width and height should be no smaller than 32. - E.g. `(200, 200, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model will be - the 4D tensor output of the - last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a 2D tensor. - - `max` means that global max pooling will - be applied. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - - Returns: - A `keras.Model` instance. - """ - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError( - 'The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded. Received: ' - f'weights={weights}') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' - 'as true, `classes` should be 1000. ' - f'Received `classes={classes}`') - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=224, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) + classifier_activation="softmax", +): + """Instantiates the VGG16 model. + + Reference: + - [Very Deep Convolutional Networks for Large-Scale Image Recognition]( + https://arxiv.org/abs/1409.1556) (ICLR 2015) + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + The default input size for this model is 224x224. + + Note: each Keras Application expects a specific kind of input preprocessing. + For VGG16, call `tf.keras.applications.vgg16.preprocess_input` on your + inputs before passing them to the model. + `vgg16.preprocess_input` will convert the input images from RGB to BGR, + then will zero-center each color channel with respect to the ImageNet + dataset, without scaling. + + Args: + include_top: whether to include the 3 fully-connected + layers at the top of the network. + weights: one of `None` (random initialization), + 'imagenet' (pre-training on ImageNet), + or the path to the weights file to be loaded. + input_tensor: optional Keras tensor + (i.e. output of `layers.Input()`) + to use as image input for the model. 
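What the VGG16 docstring above describes, re-implemented as a hedged sketch rather than the library call: "caffe" mode flips RGB to BGR and subtracts per-channel ImageNet means without scaling. The mean constants are the widely published values, stated here as an assumption rather than quoted from this patch:

```python
import numpy as np


def caffe_preprocess(x):
    """Illustrative equivalent of mode="caffe" for channels_last input."""
    x = x[..., ::-1].astype("float32")  # RGB -> BGR
    return x - np.array([103.939, 116.779, 123.68], dtype="float32")


img = np.zeros((1, 224, 224, 3), dtype="uint8")
print(caffe_preprocess(img)[0, 0, 0])  # [-103.939 -116.779 -123.68]
```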
+ input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(224, 224, 3)` + (with `channels_last` data format) + or `(3, 224, 224)` (with `channels_first` data format). + It should have exactly 3 input channels, + and width and height should be no smaller than 32. + E.g. `(200, 200, 3)` would be one valid value. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model will be + the 4D tensor output of the + last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will + be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to + use on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" + layer. When loading pretrained weights, `classifier_activation` can + only be `None` or `"softmax"`. + + Returns: + A `keras.Model` instance. + """ + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded. Received: " + f"weights={weights}" + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top` ' + "as true, `classes` should be 1000. 
" + f"Received `classes={classes}`" + ) + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=224, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - img_input = input_tensor - # Block 1 - x = layers.Conv2D( - 64, (3, 3), activation='relu', padding='same', name='block1_conv1')( - img_input) - x = layers.Conv2D( - 64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x) - - # Block 2 - x = layers.Conv2D( - 128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x) - x = layers.Conv2D( - 128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x) - - # Block 3 - x = layers.Conv2D( - 256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x) - x = layers.Conv2D( - 256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x) - x = layers.Conv2D( - 256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x) - - # Block 4 - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x) - - # Block 5 - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x) - - if include_top: - # Classification block - x = layers.Flatten(name='flatten')(x) - x = layers.Dense(4096, activation='relu', name='fc1')(x) - x = layers.Dense(4096, activation='relu', name='fc2')(x) - - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - # Create model. - model = training.Model(inputs, x, name='vgg16') - - # Load weights. 
- if weights == 'imagenet': + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + # Block 1 + x = layers.Conv2D( + 64, (3, 3), activation="relu", padding="same", name="block1_conv1" + )(img_input) + x = layers.Conv2D( + 64, (3, 3), activation="relu", padding="same", name="block1_conv2" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block1_pool")(x) + + # Block 2 + x = layers.Conv2D( + 128, (3, 3), activation="relu", padding="same", name="block2_conv1" + )(x) + x = layers.Conv2D( + 128, (3, 3), activation="relu", padding="same", name="block2_conv2" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block2_pool")(x) + + # Block 3 + x = layers.Conv2D( + 256, (3, 3), activation="relu", padding="same", name="block3_conv1" + )(x) + x = layers.Conv2D( + 256, (3, 3), activation="relu", padding="same", name="block3_conv2" + )(x) + x = layers.Conv2D( + 256, (3, 3), activation="relu", padding="same", name="block3_conv3" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block3_pool")(x) + + # Block 4 + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block4_conv1" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block4_conv2" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block4_conv3" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block4_pool")(x) + + # Block 5 + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block5_conv1" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block5_conv2" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block5_conv3" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block5_pool")(x) + if include_top: - weights_path = data_utils.get_file( - 'vgg16_weights_tf_dim_ordering_tf_kernels.h5', - WEIGHTS_PATH, - cache_subdir='models', - file_hash='64373286793e3c8b2b4e3219cbf3544b') + # Classification block + x = layers.Flatten(name="flatten")(x) + x = layers.Dense(4096, activation="relu", name="fc1")(x) + x = layers.Dense(4096, activation="relu", name="fc2")(x) + + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) else: - weights_path = data_utils.get_file( - 'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5', - WEIGHTS_PATH_NO_TOP, - cache_subdir='models', - file_hash='6d6bbae143d832006294945121d1f1fc') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model - - -@keras_export('keras.applications.vgg16.preprocess_input') + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + # Create model. + model = training.Model(inputs, x, name="vgg16") + + # Load weights. 
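A usage sketch for the builder assembled above; `weights=None` avoids the large download, and the 13 convolutional plus 3 dense layers are the 16 weight layers that give VGG16 its name:

```python
import tensorflow as tf

vgg = tf.keras.applications.VGG16(weights=None, include_top=True)
weight_layers = [l for l in vgg.layers if l.count_params() > 0]
print(len(weight_layers))  # 16
vgg.summary(line_length=80)
```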
+ if weights == "imagenet": + if include_top: + weights_path = data_utils.get_file( + "vgg16_weights_tf_dim_ordering_tf_kernels.h5", + WEIGHTS_PATH, + cache_subdir="models", + file_hash="64373286793e3c8b2b4e3219cbf3544b", + ) + else: + weights_path = data_utils.get_file( + "vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5", + WEIGHTS_PATH_NO_TOP, + cache_subdir="models", + file_hash="6d6bbae143d832006294945121d1f1fc", + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model + + +@keras_export("keras.applications.vgg16.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input( - x, data_format=data_format, mode='caffe') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="caffe" + ) -@keras_export('keras.applications.vgg16.decode_predictions') +@keras_export("keras.applications.vgg16.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/vgg19.py b/keras/applications/vgg19.py index 8766003d8ab8..b763dff5f28e 100644 --- a/keras/applications/vgg19.py +++ b/keras/applications/vgg19.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """VGG19 model for Keras. Reference: @@ -28,222 +28,253 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export -WEIGHTS_PATH = ('https://storage.googleapis.com/tensorflow/keras-applications/' - 'vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5') -WEIGHTS_PATH_NO_TOP = ('https://storage.googleapis.com/tensorflow/' - 'keras-applications/vgg19/' - 'vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5') +WEIGHTS_PATH = ( + "https://storage.googleapis.com/tensorflow/keras-applications/" + "vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5" +) +WEIGHTS_PATH_NO_TOP = ( + "https://storage.googleapis.com/tensorflow/" + "keras-applications/vgg19/" + "vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5" +) layers = VersionAwareLayers() -@keras_export('keras.applications.vgg19.VGG19', 'keras.applications.VGG19') +@keras_export("keras.applications.vgg19.VGG19", "keras.applications.VGG19") def VGG19( include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the VGG19 architecture. + classifier_activation="softmax", +): + """Instantiates the VGG19 architecture. 
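The weight-loading branch above relies on `keras.utils.get_file`, which downloads a URL once, verifies the file hash, and caches the result under `~/.keras/models`. A standalone sketch using the notop URL and hash that appear in this hunk:

```python
from tensorflow.keras.utils import get_file

weights_path = get_file(
    "vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5",
    "https://storage.googleapis.com/tensorflow/"
    "keras-applications/vgg16/"
    "vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5",
    cache_subdir="models",
    file_hash="6d6bbae143d832006294945121d1f1fc",
)
print(weights_path)  # local cached path under ~/.keras/models
```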
- Reference: - - [Very Deep Convolutional Networks for Large-Scale Image Recognition]( - https://arxiv.org/abs/1409.1556) (ICLR 2015) + Reference: + - [Very Deep Convolutional Networks for Large-Scale Image Recognition]( + https://arxiv.org/abs/1409.1556) (ICLR 2015) + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + The default input size for this model is 224x224. + + Note: each Keras Application expects a specific kind of input preprocessing. + For VGG19, call `tf.keras.applications.vgg19.preprocess_input` on your + inputs before passing them to the model. + `vgg19.preprocess_input` will convert the input images from RGB to BGR, + then will zero-center each color channel with respect to the ImageNet + dataset, without scaling. + + Args: + include_top: whether to include the 3 fully-connected + layers at the top of the network. + weights: one of `None` (random initialization), + 'imagenet' (pre-training on ImageNet), + or the path to the weights file to be loaded. + input_tensor: optional Keras tensor + (i.e. output of `layers.Input()`) + to use as image input for the model. + input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(224, 224, 3)` + (with `channels_last` data format) + or `(3, 224, 224)` (with `channels_first` data format). + It should have exactly 3 inputs channels, + and width and height should be no smaller than 32. + E.g. `(200, 200, 3)` would be one valid value. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model will be + the 4D tensor output of the + last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will + be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - The default input size for this model is 224x224. - - Note: each Keras Application expects a specific kind of input preprocessing. - For VGG19, call `tf.keras.applications.vgg19.preprocess_input` on your - inputs before passing them to the model. - `vgg19.preprocess_input` will convert the input images from RGB to BGR, - then will zero-center each color channel with respect to the ImageNet dataset, - without scaling. - - Args: - include_top: whether to include the 3 fully-connected - layers at the top of the network. 
- weights: one of `None` (random initialization), - 'imagenet' (pre-training on ImageNet), - or the path to the weights file to be loaded. - input_tensor: optional Keras tensor - (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(224, 224, 3)` - (with `channels_last` data format) - or `(3, 224, 224)` (with `channels_first` data format). - It should have exactly 3 inputs channels, - and width and height should be no smaller than 32. - E.g. `(200, 200, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. - - `None` means that the output of the model will be - the 4D tensor output of the - last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a 2D tensor. - - `max` means that global max pooling will - be applied. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - - Returns: - A `keras.Model` instance. - """ - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded. ' - f'Received: `weights={weights}.`') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' - 'as true, `classes` should be 1000. ' - f'Received: `classes={classes}.`') - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=224, - min_size=32, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) + Returns: + A `keras.Model` instance. + """ + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded. " + f"Received: `weights={weights}.`" + ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top` ' + "as true, `classes` should be 1000. 
" + f"Received: `classes={classes}.`" + ) + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=224, + min_size=32, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) else: - img_input = input_tensor - # Block 1 - x = layers.Conv2D( - 64, (3, 3), activation='relu', padding='same', name='block1_conv1')( - img_input) - x = layers.Conv2D( - 64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x) - - # Block 2 - x = layers.Conv2D( - 128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x) - x = layers.Conv2D( - 128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x) - - # Block 3 - x = layers.Conv2D( - 256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x) - x = layers.Conv2D( - 256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x) - x = layers.Conv2D( - 256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x) - x = layers.Conv2D( - 256, (3, 3), activation='relu', padding='same', name='block3_conv4')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x) - - # Block 4 - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block4_conv4')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x) - - # Block 5 - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x) - x = layers.Conv2D( - 512, (3, 3), activation='relu', padding='same', name='block5_conv4')(x) - x = layers.MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x) - - if include_top: - # Classification block - x = layers.Flatten(name='flatten')(x) - x = layers.Dense(4096, activation='relu', name='fc1')(x) - x = layers.Dense(4096, activation='relu', name='fc2')(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - # Create model. - model = training.Model(inputs, x, name='vgg19') - - # Load weights. 
- if weights == 'imagenet': + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + # Block 1 + x = layers.Conv2D( + 64, (3, 3), activation="relu", padding="same", name="block1_conv1" + )(img_input) + x = layers.Conv2D( + 64, (3, 3), activation="relu", padding="same", name="block1_conv2" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block1_pool")(x) + + # Block 2 + x = layers.Conv2D( + 128, (3, 3), activation="relu", padding="same", name="block2_conv1" + )(x) + x = layers.Conv2D( + 128, (3, 3), activation="relu", padding="same", name="block2_conv2" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block2_pool")(x) + + # Block 3 + x = layers.Conv2D( + 256, (3, 3), activation="relu", padding="same", name="block3_conv1" + )(x) + x = layers.Conv2D( + 256, (3, 3), activation="relu", padding="same", name="block3_conv2" + )(x) + x = layers.Conv2D( + 256, (3, 3), activation="relu", padding="same", name="block3_conv3" + )(x) + x = layers.Conv2D( + 256, (3, 3), activation="relu", padding="same", name="block3_conv4" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block3_pool")(x) + + # Block 4 + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block4_conv1" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block4_conv2" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block4_conv3" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block4_conv4" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block4_pool")(x) + + # Block 5 + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block5_conv1" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block5_conv2" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block5_conv3" + )(x) + x = layers.Conv2D( + 512, (3, 3), activation="relu", padding="same", name="block5_conv4" + )(x) + x = layers.MaxPooling2D((2, 2), strides=(2, 2), name="block5_pool")(x) + if include_top: - weights_path = data_utils.get_file( - 'vgg19_weights_tf_dim_ordering_tf_kernels.h5', - WEIGHTS_PATH, - cache_subdir='models', - file_hash='cbe5617147190e668d6c5d5026f83318') + # Classification block + x = layers.Flatten(name="flatten")(x) + x = layers.Dense(4096, activation="relu", name="fc1")(x) + x = layers.Dense(4096, activation="relu", name="fc2")(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) else: - weights_path = data_utils.get_file( - 'vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5', - WEIGHTS_PATH_NO_TOP, - cache_subdir='models', - file_hash='253f8cb515780f3b799900260a226db6') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + # Create model. + model = training.Model(inputs, x, name="vgg19") + + # Load weights. 
+ if weights == "imagenet": + if include_top: + weights_path = data_utils.get_file( + "vgg19_weights_tf_dim_ordering_tf_kernels.h5", + WEIGHTS_PATH, + cache_subdir="models", + file_hash="cbe5617147190e668d6c5d5026f83318", + ) + else: + weights_path = data_utils.get_file( + "vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5", + WEIGHTS_PATH_NO_TOP, + cache_subdir="models", + file_hash="253f8cb515780f3b799900260a226db6", + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) - return model + return model -@keras_export('keras.applications.vgg19.preprocess_input') +@keras_export("keras.applications.vgg19.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input( - x, data_format=data_format, mode='caffe') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="caffe" + ) -@keras_export('keras.applications.vgg19.decode_predictions') +@keras_export("keras.applications.vgg19.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/applications/xception.py b/keras/applications/xception.py index 5e931ecaadf6..e7e4ff597c89 100644 --- a/keras/applications/xception.py +++ b/keras/applications/xception.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name + """Xception V1 model for Keras. On ImageNet, this model gets to a top-1 validation accuracy of 0.790 @@ -31,301 +31,350 @@ from keras.layers import VersionAwareLayers from keras.utils import data_utils from keras.utils import layer_utils -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export TF_WEIGHTS_PATH = ( - 'https://storage.googleapis.com/tensorflow/keras-applications/' - 'xception/xception_weights_tf_dim_ordering_tf_kernels.h5') + "https://storage.googleapis.com/tensorflow/keras-applications/" + "xception/xception_weights_tf_dim_ordering_tf_kernels.h5" +) TF_WEIGHTS_PATH_NO_TOP = ( - 'https://storage.googleapis.com/tensorflow/keras-applications/' - 'xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5') + "https://storage.googleapis.com/tensorflow/keras-applications/" + "xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5" +) layers = VersionAwareLayers() -@keras_export('keras.applications.xception.Xception', - 'keras.applications.Xception') +@keras_export( + "keras.applications.xception.Xception", "keras.applications.Xception" +) def Xception( include_top=True, - weights='imagenet', + weights="imagenet", input_tensor=None, input_shape=None, pooling=None, classes=1000, - classifier_activation='softmax'): - """Instantiates the Xception architecture. + classifier_activation="softmax", +): + """Instantiates the Xception architecture. 
+ + Reference: + - [Xception: Deep Learning with Depthwise Separable Convolutions]( + https://arxiv.org/abs/1610.02357) (CVPR 2017) + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + The default input image size for this model is 299x299. + + Note: each Keras Application expects a specific kind of input preprocessing. + For Xception, call `tf.keras.applications.xception.preprocess_input` on your + inputs before passing them to the model. + `xception.preprocess_input` will scale input pixels between -1 and 1. + + Args: + include_top: whether to include the fully-connected + layer at the top of the network. + weights: one of `None` (random initialization), + 'imagenet' (pre-training on ImageNet), + or the path to the weights file to be loaded. + input_tensor: optional Keras tensor + (i.e. output of `layers.Input()`) + to use as image input for the model. + input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(299, 299, 3)`. + It should have exactly 3 inputs channels, + and width and height should be no smaller than 71. + E.g. `(150, 150, 3)` would be one valid value. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model will be + the 4D tensor output of the + last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will + be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, + and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + + Returns: + A `keras.Model` instance. + """ + if not (weights in {"imagenet", None} or tf.io.gfile.exists(weights)): + raise ValueError( + "The `weights` argument should be either " + "`None` (random initialization), `imagenet` " + "(pre-training on ImageNet), " + "or the path to the weights file to be loaded." 
+ ) + + if weights == "imagenet" and include_top and classes != 1000: + raise ValueError( + 'If using `weights` as `"imagenet"` with `include_top`' + " as true, `classes` should be 1000" + ) + + # Determine proper input shape + input_shape = imagenet_utils.obtain_input_shape( + input_shape, + default_size=299, + min_size=71, + data_format=backend.image_data_format(), + require_flatten=include_top, + weights=weights, + ) + + if input_tensor is None: + img_input = layers.Input(shape=input_shape) + else: + if not backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + channel_axis = 1 if backend.image_data_format() == "channels_first" else -1 + + x = layers.Conv2D( + 32, (3, 3), strides=(2, 2), use_bias=False, name="block1_conv1" + )(img_input) + x = layers.BatchNormalization(axis=channel_axis, name="block1_conv1_bn")(x) + x = layers.Activation("relu", name="block1_conv1_act")(x) + x = layers.Conv2D(64, (3, 3), use_bias=False, name="block1_conv2")(x) + x = layers.BatchNormalization(axis=channel_axis, name="block1_conv2_bn")(x) + x = layers.Activation("relu", name="block1_conv2_act")(x) + + residual = layers.Conv2D( + 128, (1, 1), strides=(2, 2), padding="same", use_bias=False + )(x) + residual = layers.BatchNormalization(axis=channel_axis)(residual) - Reference: - - [Xception: Deep Learning with Depthwise Separable Convolutions]( - https://arxiv.org/abs/1610.02357) (CVPR 2017) + x = layers.SeparableConv2D( + 128, (3, 3), padding="same", use_bias=False, name="block2_sepconv1" + )(x) + x = layers.BatchNormalization(axis=channel_axis, name="block2_sepconv1_bn")( + x + ) + x = layers.Activation("relu", name="block2_sepconv2_act")(x) + x = layers.SeparableConv2D( + 128, (3, 3), padding="same", use_bias=False, name="block2_sepconv2" + )(x) + x = layers.BatchNormalization(axis=channel_axis, name="block2_sepconv2_bn")( + x + ) + + x = layers.MaxPooling2D( + (3, 3), strides=(2, 2), padding="same", name="block2_pool" + )(x) + x = layers.add([x, residual]) - For image classification use cases, see - [this page for detailed examples]( - https://keras.io/api/applications/#usage-examples-for-image-classification-models). - - For transfer learning use cases, make sure to read the - [guide to transfer learning & fine-tuning]( - https://keras.io/guides/transfer_learning/). - - The default input image size for this model is 299x299. - - Note: each Keras Application expects a specific kind of input preprocessing. - For Xception, call `tf.keras.applications.xception.preprocess_input` on your - inputs before passing them to the model. - `xception.preprocess_input` will scale input pixels between -1 and 1. - - Args: - include_top: whether to include the fully-connected - layer at the top of the network. - weights: one of `None` (random initialization), - 'imagenet' (pre-training on ImageNet), - or the path to the weights file to be loaded. - input_tensor: optional Keras tensor - (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(299, 299, 3)`. - It should have exactly 3 inputs channels, - and width and height should be no smaller than 71. - E.g. `(150, 150, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction - when `include_top` is `False`. 
- - `None` means that the output of the model will be - the 4D tensor output of the - last convolutional block. - - `avg` means that global average pooling - will be applied to the output of the - last convolutional block, and thus - the output of the model will be a 2D tensor. - - `max` means that global max pooling will - be applied. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is True, - and if no `weights` argument is specified. - classifier_activation: A `str` or callable. The activation function to use - on the "top" layer. Ignored unless `include_top=True`. Set - `classifier_activation=None` to return the logits of the "top" layer. - When loading pretrained weights, `classifier_activation` can only - be `None` or `"softmax"`. - - Returns: - A `keras.Model` instance. - """ - if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): - raise ValueError('The `weights` argument should be either ' - '`None` (random initialization), `imagenet` ' - '(pre-training on ImageNet), ' - 'or the path to the weights file to be loaded.') - - if weights == 'imagenet' and include_top and classes != 1000: - raise ValueError('If using `weights` as `"imagenet"` with `include_top`' - ' as true, `classes` should be 1000') - - # Determine proper input shape - input_shape = imagenet_utils.obtain_input_shape( - input_shape, - default_size=299, - min_size=71, - data_format=backend.image_data_format(), - require_flatten=include_top, - weights=weights) - - if input_tensor is None: - img_input = layers.Input(shape=input_shape) - else: - if not backend.is_keras_tensor(input_tensor): - img_input = layers.Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 - - x = layers.Conv2D( - 32, (3, 3), - strides=(2, 2), - use_bias=False, - name='block1_conv1')(img_input) - x = layers.BatchNormalization(axis=channel_axis, name='block1_conv1_bn')(x) - x = layers.Activation('relu', name='block1_conv1_act')(x) - x = layers.Conv2D(64, (3, 3), use_bias=False, name='block1_conv2')(x) - x = layers.BatchNormalization(axis=channel_axis, name='block1_conv2_bn')(x) - x = layers.Activation('relu', name='block1_conv2_act')(x) - - residual = layers.Conv2D( - 128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x) - residual = layers.BatchNormalization(axis=channel_axis)(residual) - - x = layers.SeparableConv2D( - 128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(x) - x = layers.BatchNormalization(axis=channel_axis, name='block2_sepconv1_bn')(x) - x = layers.Activation('relu', name='block2_sepconv2_act')(x) - x = layers.SeparableConv2D( - 128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(x) - x = layers.BatchNormalization(axis=channel_axis, name='block2_sepconv2_bn')(x) - - x = layers.MaxPooling2D((3, 3), - strides=(2, 2), - padding='same', - name='block2_pool')(x) - x = layers.add([x, residual]) - - residual = layers.Conv2D( - 256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x) - residual = layers.BatchNormalization(axis=channel_axis)(residual) - - x = layers.Activation('relu', name='block3_sepconv1_act')(x) - x = layers.SeparableConv2D( - 256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(x) - x = layers.BatchNormalization(axis=channel_axis, name='block3_sepconv1_bn')(x) - x = layers.Activation('relu', name='block3_sepconv2_act')(x) - x = layers.SeparableConv2D( - 256, (3, 3), 
padding='same', use_bias=False, name='block3_sepconv2')(x) - x = layers.BatchNormalization(axis=channel_axis, name='block3_sepconv2_bn')(x) - - x = layers.MaxPooling2D((3, 3), - strides=(2, 2), - padding='same', - name='block3_pool')(x) - x = layers.add([x, residual]) - - residual = layers.Conv2D( - 728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x) - residual = layers.BatchNormalization(axis=channel_axis)(residual) - - x = layers.Activation('relu', name='block4_sepconv1_act')(x) - x = layers.SeparableConv2D( - 728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(x) - x = layers.BatchNormalization(axis=channel_axis, name='block4_sepconv1_bn')(x) - x = layers.Activation('relu', name='block4_sepconv2_act')(x) - x = layers.SeparableConv2D( - 728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(x) - x = layers.BatchNormalization(axis=channel_axis, name='block4_sepconv2_bn')(x) - - x = layers.MaxPooling2D((3, 3), - strides=(2, 2), - padding='same', - name='block4_pool')(x) - x = layers.add([x, residual]) - - for i in range(8): - residual = x - prefix = 'block' + str(i + 5) - - x = layers.Activation('relu', name=prefix + '_sepconv1_act')(x) + residual = layers.Conv2D( + 256, (1, 1), strides=(2, 2), padding="same", use_bias=False + )(x) + residual = layers.BatchNormalization(axis=channel_axis)(residual) + + x = layers.Activation("relu", name="block3_sepconv1_act")(x) x = layers.SeparableConv2D( - 728, (3, 3), - padding='same', - use_bias=False, - name=prefix + '_sepconv1')(x) - x = layers.BatchNormalization( - axis=channel_axis, name=prefix + '_sepconv1_bn')(x) - x = layers.Activation('relu', name=prefix + '_sepconv2_act')(x) + 256, (3, 3), padding="same", use_bias=False, name="block3_sepconv1" + )(x) + x = layers.BatchNormalization(axis=channel_axis, name="block3_sepconv1_bn")( + x + ) + x = layers.Activation("relu", name="block3_sepconv2_act")(x) + x = layers.SeparableConv2D( + 256, (3, 3), padding="same", use_bias=False, name="block3_sepconv2" + )(x) + x = layers.BatchNormalization(axis=channel_axis, name="block3_sepconv2_bn")( + x + ) + + x = layers.MaxPooling2D( + (3, 3), strides=(2, 2), padding="same", name="block3_pool" + )(x) + x = layers.add([x, residual]) + + residual = layers.Conv2D( + 728, (1, 1), strides=(2, 2), padding="same", use_bias=False + )(x) + residual = layers.BatchNormalization(axis=channel_axis)(residual) + + x = layers.Activation("relu", name="block4_sepconv1_act")(x) + x = layers.SeparableConv2D( + 728, (3, 3), padding="same", use_bias=False, name="block4_sepconv1" + )(x) + x = layers.BatchNormalization(axis=channel_axis, name="block4_sepconv1_bn")( + x + ) + x = layers.Activation("relu", name="block4_sepconv2_act")(x) + x = layers.SeparableConv2D( + 728, (3, 3), padding="same", use_bias=False, name="block4_sepconv2" + )(x) + x = layers.BatchNormalization(axis=channel_axis, name="block4_sepconv2_bn")( + x + ) + + x = layers.MaxPooling2D( + (3, 3), strides=(2, 2), padding="same", name="block4_pool" + )(x) + x = layers.add([x, residual]) + + for i in range(8): + residual = x + prefix = "block" + str(i + 5) + + x = layers.Activation("relu", name=prefix + "_sepconv1_act")(x) + x = layers.SeparableConv2D( + 728, + (3, 3), + padding="same", + use_bias=False, + name=prefix + "_sepconv1", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, name=prefix + "_sepconv1_bn" + )(x) + x = layers.Activation("relu", name=prefix + "_sepconv2_act")(x) + x = layers.SeparableConv2D( + 728, + (3, 3), + padding="same", + use_bias=False, + 
name=prefix + "_sepconv2", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, name=prefix + "_sepconv2_bn" + )(x) + x = layers.Activation("relu", name=prefix + "_sepconv3_act")(x) + x = layers.SeparableConv2D( + 728, + (3, 3), + padding="same", + use_bias=False, + name=prefix + "_sepconv3", + )(x) + x = layers.BatchNormalization( + axis=channel_axis, name=prefix + "_sepconv3_bn" + )(x) + + x = layers.add([x, residual]) + + residual = layers.Conv2D( + 1024, (1, 1), strides=(2, 2), padding="same", use_bias=False + )(x) + residual = layers.BatchNormalization(axis=channel_axis)(residual) + + x = layers.Activation("relu", name="block13_sepconv1_act")(x) x = layers.SeparableConv2D( - 728, (3, 3), - padding='same', - use_bias=False, - name=prefix + '_sepconv2')(x) + 728, (3, 3), padding="same", use_bias=False, name="block13_sepconv1" + )(x) x = layers.BatchNormalization( - axis=channel_axis, name=prefix + '_sepconv2_bn')(x) - x = layers.Activation('relu', name=prefix + '_sepconv3_act')(x) + axis=channel_axis, name="block13_sepconv1_bn" + )(x) + x = layers.Activation("relu", name="block13_sepconv2_act")(x) x = layers.SeparableConv2D( - 728, (3, 3), - padding='same', - use_bias=False, - name=prefix + '_sepconv3')(x) + 1024, (3, 3), padding="same", use_bias=False, name="block13_sepconv2" + )(x) x = layers.BatchNormalization( - axis=channel_axis, name=prefix + '_sepconv3_bn')(x) + axis=channel_axis, name="block13_sepconv2_bn" + )(x) + x = layers.MaxPooling2D( + (3, 3), strides=(2, 2), padding="same", name="block13_pool" + )(x) x = layers.add([x, residual]) - residual = layers.Conv2D( - 1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x) - residual = layers.BatchNormalization(axis=channel_axis)(residual) - - x = layers.Activation('relu', name='block13_sepconv1_act')(x) - x = layers.SeparableConv2D( - 728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(x) - x = layers.BatchNormalization( - axis=channel_axis, name='block13_sepconv1_bn')(x) - x = layers.Activation('relu', name='block13_sepconv2_act')(x) - x = layers.SeparableConv2D( - 1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(x) - x = layers.BatchNormalization( - axis=channel_axis, name='block13_sepconv2_bn')(x) - - x = layers.MaxPooling2D((3, 3), - strides=(2, 2), - padding='same', - name='block13_pool')(x) - x = layers.add([x, residual]) - - x = layers.SeparableConv2D( - 1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(x) - x = layers.BatchNormalization( - axis=channel_axis, name='block14_sepconv1_bn')(x) - x = layers.Activation('relu', name='block14_sepconv1_act')(x) - - x = layers.SeparableConv2D( - 2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(x) - x = layers.BatchNormalization( - axis=channel_axis, name='block14_sepconv2_bn')(x) - x = layers.Activation('relu', name='block14_sepconv2_act')(x) - - if include_top: - x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - imagenet_utils.validate_activation(classifier_activation, weights) - x = layers.Dense(classes, activation=classifier_activation, - name='predictions')(x) - else: - if pooling == 'avg': - x = layers.GlobalAveragePooling2D()(x) - elif pooling == 'max': - x = layers.GlobalMaxPooling2D()(x) - - # Ensure that the model takes into account - # any potential predecessors of `input_tensor`. - if input_tensor is not None: - inputs = layer_utils.get_source_inputs(input_tensor) - else: - inputs = img_input - # Create model. 
- model = training.Model(inputs, x, name='xception') - - # Load weights. - if weights == 'imagenet': - if include_top: - weights_path = data_utils.get_file( - 'xception_weights_tf_dim_ordering_tf_kernels.h5', - TF_WEIGHTS_PATH, - cache_subdir='models', - file_hash='0a58e3b7378bc2990ea3b43d5981f1f6') - else: - weights_path = data_utils.get_file( - 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5', - TF_WEIGHTS_PATH_NO_TOP, - cache_subdir='models', - file_hash='b0042744bf5b25fce3cb969f33bebb97') - model.load_weights(weights_path) - elif weights is not None: - model.load_weights(weights) - - return model + x = layers.SeparableConv2D( + 1536, (3, 3), padding="same", use_bias=False, name="block14_sepconv1" + )(x) + x = layers.BatchNormalization( + axis=channel_axis, name="block14_sepconv1_bn" + )(x) + x = layers.Activation("relu", name="block14_sepconv1_act")(x) + x = layers.SeparableConv2D( + 2048, (3, 3), padding="same", use_bias=False, name="block14_sepconv2" + )(x) + x = layers.BatchNormalization( + axis=channel_axis, name="block14_sepconv2_bn" + )(x) + x = layers.Activation("relu", name="block14_sepconv2_act")(x) -@keras_export('keras.applications.xception.preprocess_input') + if include_top: + x = layers.GlobalAveragePooling2D(name="avg_pool")(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, activation=classifier_activation, name="predictions" + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = layer_utils.get_source_inputs(input_tensor) + else: + inputs = img_input + # Create model. + model = training.Model(inputs, x, name="xception") + + # Load weights. + if weights == "imagenet": + if include_top: + weights_path = data_utils.get_file( + "xception_weights_tf_dim_ordering_tf_kernels.h5", + TF_WEIGHTS_PATH, + cache_subdir="models", + file_hash="0a58e3b7378bc2990ea3b43d5981f1f6", + ) + else: + weights_path = data_utils.get_file( + "xception_weights_tf_dim_ordering_tf_kernels_notop.h5", + TF_WEIGHTS_PATH_NO_TOP, + cache_subdir="models", + file_hash="b0042744bf5b25fce3cb969f33bebb97", + ) + model.load_weights(weights_path) + elif weights is not None: + model.load_weights(weights) + + return model + + +@keras_export("keras.applications.xception.preprocess_input") def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode="tf" + ) -@keras_export('keras.applications.xception.decode_predictions') +@keras_export("keras.applications.xception.decode_predictions") def decode_predictions(preds, top=5): - return imagenet_utils.decode_predictions(preds, top=top) + return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode="", ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, - error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC, +) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/keras/backend.py b/keras/backend.py index cf69a175b794..7f5b6b1d4cc7 100644 --- a/keras/backend.py +++ b/keras/backend.py @@ -12,15 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
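The Xception middle flow reformatted above repeats a single residual block eight times (blocks 5 through 12): three relu → SeparableConv2D → BatchNormalization stages with an identity shortcut added back in. A self-contained sketch of one such block, assuming `channels_last` (at a 299x299 input the middle flow runs at 19x19x728):

```python
import tensorflow as tf
from tensorflow.keras import layers

def middle_flow_block(x, prefix):
    residual = x  # identity shortcut; no 1x1 projection in the middle flow
    for i in (1, 2, 3):
        x = layers.Activation("relu", name=f"{prefix}_sepconv{i}_act")(x)
        x = layers.SeparableConv2D(
            728, (3, 3), padding="same", use_bias=False,
            name=f"{prefix}_sepconv{i}",
        )(x)
        x = layers.BatchNormalization(name=f"{prefix}_sepconv{i}_bn")(x)
    return layers.add([x, residual])

inputs = tf.keras.Input(shape=(19, 19, 728))
outputs = middle_flow_block(inputs, "block5")
print(tf.keras.Model(inputs, outputs, name="middle_flow").count_params())
```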
# ============================================================================== -# pylint: disable=protected-access -# pylint: disable=redefined-outer-name -# pylint: disable=redefined-builtin -# pylint: disable=g-classes-have-attributes -# pylint: disable=g-bad-import-order -# pylint: disable=missing-function-docstring -"""Keras backend API.""" -import tensorflow.compat.v2 as tf + +"""Keras backend API.""" import collections import itertools @@ -33,18 +27,22 @@ import weakref import numpy as np +import tensorflow.compat.v2 as tf -from tensorflow.core.protobuf import config_pb2 -from tensorflow.python.eager import context -from tensorflow.python.eager.context import get_config -from tensorflow.python.framework import config from keras import backend_config from keras.distribute import distribute_coordinator_utils as dc +from keras.dtensor import dtensor_api as dtensor from keras.engine import keras_tensor from keras.utils import control_flow_util from keras.utils import object_identity from keras.utils import tf_contextlib from keras.utils import tf_inspect +from keras.utils import tf_utils + +# isort: off +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.eager import context +from tensorflow.python.eager.context import get_config from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls @@ -69,10 +67,9 @@ # This is a thread local object that will hold the default internal TF session # used by Keras. It can be set manually via `set_session(sess)`. class SessionLocal(threading.local): - - def __init__(self): - super().__init__() - self.session = None + def __init__(self): + super().__init__() + self.session = None _SESSION = SessionLocal() @@ -96,32 +93,34 @@ def __init__(self): # thread local. This is needed to make set_learning_phase affect only the # current thread during eager execution (see b/123096885 for more details). class _DummyEagerGraph(threading.local): - """_DummyEagerGraph provides a thread local `key` attribute. + """_DummyEagerGraph provides a thread local `key` attribute. - We can't use threading.local directly, i.e. without subclassing, because - gevent monkey patches threading.local and its version does not support - weak references. - """ + We can't use threading.local directly, i.e. without subclassing, because + gevent monkey patches threading.local and its version does not support + weak references. + """ - class _WeakReferencableClass: - """This dummy class is needed for two reasons. + class _WeakReferencableClass: + """This dummy class is needed for two reasons. - - We need something that supports weak references. Basic types like string - and ints don't. - - We need something whose hash and equality are based on object identity - to make sure they are treated as different keys to _GRAPH_LEARNING_PHASES. + - We need something that supports weak references. Basic types like + string and ints don't. + - We need something whose hash and equality are based on object identity + to make sure they are treated as different keys to + _GRAPH_LEARNING_PHASES. - An empty Python class satisfies both of these requirements. - """ - pass + An empty Python class satisfies both of these requirements. + """ - def __init__(self): - # Constructors for classes subclassing threading.local run once - # per thread accessing something in the class. Thus, each thread will - # get a different key. 
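The constructor comment carried through this hunk is worth unpacking: a `threading.local` subclass runs its `__init__` once per thread that first touches the instance, which is exactly what gives each thread its own `key`. A minimal demonstration (names here are illustrative):

```python
import threading

class PerThread(threading.local):
    def __init__(self):
        # Runs once in every thread that first touches the instance.
        super().__init__()
        self.key = object()  # identity-hashed, so keys never collide

store = PerThread()
keys = []

def grab():
    keys.append(store.key)

worker = threading.Thread(target=grab)
worker.start()
worker.join()
grab()  # main thread
print("per-thread keys differ:", keys[0] is not keys[1])  # True
```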
- super().__init__() - self.key = _DummyEagerGraph._WeakReferencableClass() - self.learning_phase_is_set = False + pass + + def __init__(self): + # Constructors for classes subclassing threading.local run once + # per thread accessing something in the class. Thus, each thread will + # get a different key. + super().__init__() + self.key = _DummyEagerGraph._WeakReferencableClass() + self.learning_phase_is_set = False _DUMMY_EAGER_GRAPH = _DummyEagerGraph() @@ -145,623 +144,648 @@ def __init__(self): set_image_data_format = backend_config.set_image_data_format -@keras_export('keras.backend.backend') +@keras_export("keras.backend.backend") @doc_controls.do_not_generate_docs def backend(): - """Publicly accessible method for determining the current backend. + """Publicly accessible method for determining the current backend. - Only exists for API compatibility with multi-backend Keras. + Only exists for API compatibility with multi-backend Keras. - Returns: - The string "tensorflow". - """ - return 'tensorflow' + Returns: + The string "tensorflow". + """ + return "tensorflow" -@keras_export('keras.backend.cast_to_floatx') +@keras_export("keras.backend.cast_to_floatx") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def cast_to_floatx(x): - """Cast a Numpy array to the default Keras float type. - - Args: - x: Numpy array or TensorFlow tensor. + """Cast a Numpy array to the default Keras float type. - Returns: - The same array (Numpy array if `x` was a Numpy array, or TensorFlow tensor - if `x` was a tensor), cast to its new type. - - Example: + Args: + x: Numpy array or TensorFlow tensor. - >>> tf.keras.backend.floatx() - 'float32' - >>> arr = np.array([1.0, 2.0], dtype='float64') - >>> arr.dtype - dtype('float64') - >>> new_arr = cast_to_floatx(arr) - >>> new_arr - array([1., 2.], dtype=float32) - >>> new_arr.dtype - dtype('float32') + Returns: + The same array (Numpy array if `x` was a Numpy array, or TensorFlow + tensor if `x` was a tensor), cast to its new type. + + Example: + + >>> tf.keras.backend.floatx() + 'float32' + >>> arr = np.array([1.0, 2.0], dtype='float64') + >>> arr.dtype + dtype('float64') + >>> new_arr = cast_to_floatx(arr) + >>> new_arr + array([1., 2.], dtype=float32) + >>> new_arr.dtype + dtype('float32') - """ - if isinstance(x, (tf.Tensor, - tf.Variable, - tf.SparseTensor)): - return tf.cast(x, dtype=floatx()) - return np.asarray(x, dtype=floatx()) + """ + if isinstance(x, (tf.Tensor, tf.Variable, tf.SparseTensor)): + return tf.cast(x, dtype=floatx()) + return np.asarray(x, dtype=floatx()) -@keras_export('keras.backend.get_uid') -def get_uid(prefix=''): - """Associates a string prefix with an integer counter in a TensorFlow graph. +@keras_export("keras.backend.get_uid") +def get_uid(prefix=""): + """Associates a string prefix with an integer counter in a TensorFlow graph. - Args: - prefix: String prefix to index. + Args: + prefix: String prefix to index. - Returns: - Unique integer ID. + Returns: + Unique integer ID. 
- Example: + Example: - >>> get_uid('dense') - 1 - >>> get_uid('dense') - 2 + >>> get_uid('dense') + 1 + >>> get_uid('dense') + 2 - """ - graph = get_graph() - if graph not in PER_GRAPH_OBJECT_NAME_UIDS: - PER_GRAPH_OBJECT_NAME_UIDS[graph] = collections.defaultdict(int) - layer_name_uids = PER_GRAPH_OBJECT_NAME_UIDS[graph] - layer_name_uids[prefix] += 1 - return layer_name_uids[prefix] + """ + graph = get_graph() + if graph not in PER_GRAPH_OBJECT_NAME_UIDS: + PER_GRAPH_OBJECT_NAME_UIDS[graph] = collections.defaultdict(int) + layer_name_uids = PER_GRAPH_OBJECT_NAME_UIDS[graph] + layer_name_uids[prefix] += 1 + return layer_name_uids[prefix] -@keras_export('keras.backend.reset_uids') +@keras_export("keras.backend.reset_uids") def reset_uids(): - """Resets graph identifiers. - """ + """Resets graph identifiers.""" - PER_GRAPH_OBJECT_NAME_UIDS.clear() - OBSERVED_NAMES.clear() + PER_GRAPH_OBJECT_NAME_UIDS.clear() + OBSERVED_NAMES.clear() -@keras_export('keras.backend.clear_session') +@keras_export("keras.backend.clear_session") def clear_session(): - """Resets all state generated by Keras. - - Keras manages a global state, which it uses to implement the Functional - model-building API and to uniquify autogenerated layer names. - - If you are creating many models in a loop, this global state will consume - an increasing amount of memory over time, and you may want to clear it. - Calling `clear_session()` releases the global state: this helps avoid clutter - from old models and layers, especially when memory is limited. - - Example 1: calling `clear_session()` when creating models in a loop - - ```python - for _ in range(100): - # Without `clear_session()`, each iteration of this loop will - # slightly increase the size of the global state managed by Keras - model = tf.keras.Sequential([tf.keras.layers.Dense(10) for _ in range(10)]) - - for _ in range(100): - # With `clear_session()` called at the beginning, - # Keras starts with a blank state at each iteration - # and memory consumption is constant over time. - tf.keras.backend.clear_session() - model = tf.keras.Sequential([tf.keras.layers.Dense(10) for _ in range(10)]) - ``` - - Example 2: resetting the layer name generation counter - - >>> import tensorflow as tf - >>> layers = [tf.keras.layers.Dense(10) for _ in range(10)] - >>> new_layer = tf.keras.layers.Dense(10) - >>> print(new_layer.name) - dense_10 - >>> tf.keras.backend.set_learning_phase(1) - >>> print(tf.keras.backend.learning_phase()) - 1 - >>> tf.keras.backend.clear_session() - >>> new_layer = tf.keras.layers.Dense(10) - >>> print(new_layer.name) - dense - """ - global _SESSION - global _GRAPH_LEARNING_PHASES # pylint: disable=global-variable-not-assigned - global _GRAPH_VARIABLES # pylint: disable=global-variable-not-assigned - global _GRAPH_TF_OPTIMIZERS # pylint: disable=global-variable-not-assigned - global _GRAPH - _GRAPH.graph = None - tf.compat.v1.reset_default_graph() - reset_uids() - if _SESSION.session is not None: - _SESSION.session.close() - _SESSION.session = None - graph = get_graph() - with graph.as_default(): - _DUMMY_EAGER_GRAPH.learning_phase_is_set = False - - _GRAPH_LEARNING_PHASES = {} - # Create the learning phase placeholder in graph using the default factory - phase = _default_learning_phase() - _internal_set_learning_phase(graph, phase) - - _GRAPH_VARIABLES.pop(graph, None) - _GRAPH_TF_OPTIMIZERS.pop(graph, None) - if tf.executing_eagerly(): - # Clear pending nodes in eager executors, kernel caches and step_containers. 
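Stripped of the graph bookkeeping, `get_uid` is a per-graph `defaultdict` of counters keyed by string prefix, which is what feeds auto-generated layer names (`dense`, `dense_1`, ...). An illustrative stand-alone version:

```python
import collections

# One counter table per graph, one counter per prefix within it.
_NAME_UIDS = collections.defaultdict(lambda: collections.defaultdict(int))

def get_uid_sketch(graph, prefix=""):
    counters = _NAME_UIDS[graph]
    counters[prefix] += 1
    return counters[prefix]

g = object()  # stands in for the current tf.Graph
assert get_uid_sketch(g, "dense") == 1
assert get_uid_sketch(g, "dense") == 2   # -> names like "dense", "dense_1"
assert get_uid_sketch(g, "conv2d") == 1  # counters are independent per prefix
```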
- context.context().clear_kernel_cache() + """Resets all state generated by Keras. + + Keras manages a global state, which it uses to implement the Functional + model-building API and to uniquify autogenerated layer names. + + If you are creating many models in a loop, this global state will consume + an increasing amount of memory over time, and you may want to clear it. + Calling `clear_session()` releases the global state: this helps avoid + clutter from old models and layers, especially when memory is limited. + + Example 1: calling `clear_session()` when creating models in a loop + + ```python + for _ in range(100): + # Without `clear_session()`, each iteration of this loop will + # slightly increase the size of the global state managed by Keras + model = tf.keras.Sequential([ + tf.keras.layers.Dense(10) for _ in range(10)]) + + for _ in range(100): + # With `clear_session()` called at the beginning, + # Keras starts with a blank state at each iteration + # and memory consumption is constant over time. + tf.keras.backend.clear_session() + model = tf.keras.Sequential([ + tf.keras.layers.Dense(10) for _ in range(10)]) + ``` + + Example 2: resetting the layer name generation counter + + >>> import tensorflow as tf + >>> layers = [tf.keras.layers.Dense(10) for _ in range(10)] + >>> new_layer = tf.keras.layers.Dense(10) + >>> print(new_layer.name) + dense_10 + >>> tf.keras.backend.set_learning_phase(1) + >>> print(tf.keras.backend.learning_phase()) + 1 + >>> tf.keras.backend.clear_session() + >>> new_layer = tf.keras.layers.Dense(10) + >>> print(new_layer.name) + dense + """ + global _SESSION + global _GRAPH_LEARNING_PHASES + global _GRAPH_VARIABLES + global _GRAPH_TF_OPTIMIZERS + global _GRAPH + _GRAPH.graph = None + tf.compat.v1.reset_default_graph() + reset_uids() + if _SESSION.session is not None: + _SESSION.session.close() + _SESSION.session = None + graph = get_graph() + with graph.as_default(): + _DUMMY_EAGER_GRAPH.learning_phase_is_set = False + + _GRAPH_LEARNING_PHASES = {} + # Create the learning phase placeholder in graph using the default + # factory + phase = _default_learning_phase() + _internal_set_learning_phase(graph, phase) + + _GRAPH_VARIABLES.pop(graph, None) + _GRAPH_TF_OPTIMIZERS.pop(graph, None) + if tf.executing_eagerly(): + # Clear pending nodes in eager executors, kernel caches and + # step_containers. + context.context().clear_kernel_cache() + # Inject the clear_session function to keras_deps to remove the dependency # from TFLite to Keras. tf.__internal__.register_clear_session_function(clear_session) -@keras_export('keras.backend.manual_variable_initialization') +@keras_export("keras.backend.manual_variable_initialization") @doc_controls.do_not_generate_docs def manual_variable_initialization(value): - """Sets the manual variable initialization flag. + """Sets the manual variable initialization flag. - This boolean flag determines whether - variables should be initialized - as they are instantiated (default), or if - the user should handle the initialization - (e.g. via `tf.compat.v1.initialize_all_variables()`). + This boolean flag determines whether + variables should be initialized + as they are instantiated (default), or if + the user should handle the initialization + (e.g. via `tf.compat.v1.initialize_all_variables()`). - Args: - value: Python boolean. - """ - global _MANUAL_VAR_INIT - _MANUAL_VAR_INIT = value + Args: + value: Python boolean. 
+ """ + global _MANUAL_VAR_INIT + _MANUAL_VAR_INIT = value -@keras_export('keras.backend.learning_phase') +@keras_export("keras.backend.learning_phase") @doc_controls.do_not_generate_docs def learning_phase(): - """Returns the learning phase flag. - - The learning phase flag is a bool tensor (0 = test, 1 = train) - to be passed as input to any Keras function - that uses a different behavior at train time and test time. - - Returns: - Learning phase (scalar integer tensor or Python integer). - """ - graph = tf.compat.v1.get_default_graph() - if graph is getattr(_GRAPH, 'graph', None): - # Don't enter an init_scope for the learning phase if eager execution - # is enabled but we're inside the Keras workspace graph. - learning_phase = symbolic_learning_phase() - else: - with tf.init_scope(): - # We always check & set the learning phase inside the init_scope, - # otherwise the wrong default_graph will be used to look up the learning - # phase inside of functions & defuns. - # - # This is because functions & defuns (both in graph & in eager mode) - # will always execute non-eagerly using a function-specific default - # subgraph. - if context.executing_eagerly(): - if _DUMMY_EAGER_GRAPH.key not in _GRAPH_LEARNING_PHASES: - phase = _default_learning_phase() - _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, phase) - _DUMMY_EAGER_GRAPH.learning_phase_is_set = True - return _internal_get_learning_phase(_DUMMY_EAGER_GRAPH.key) - else: + """Returns the learning phase flag. + + The learning phase flag is a bool tensor (0 = test, 1 = train) + to be passed as input to any Keras function + that uses a different behavior at train time and test time. + + Returns: + Learning phase (scalar integer tensor or Python integer). + """ + graph = tf.compat.v1.get_default_graph() + if graph is getattr(_GRAPH, "graph", None): + # Don't enter an init_scope for the learning phase if eager execution + # is enabled but we're inside the Keras workspace graph. learning_phase = symbolic_learning_phase() - _mark_func_graph_as_unsaveable(graph, learning_phase) - return learning_phase + else: + with tf.init_scope(): + # We always check & set the learning phase inside the init_scope, + # otherwise the wrong default_graph will be used to look up the + # learning phase inside of functions & defuns. + # + # This is because functions & defuns (both in graph & in eager mode) + # will always execute non-eagerly using a function-specific default + # subgraph. + if context.executing_eagerly(): + if _DUMMY_EAGER_GRAPH.key not in _GRAPH_LEARNING_PHASES: + return _default_learning_phase() + else: + return _internal_get_learning_phase(_DUMMY_EAGER_GRAPH.key) + else: + learning_phase = symbolic_learning_phase() + _mark_func_graph_as_unsaveable(graph, learning_phase) + return learning_phase def global_learning_phase_is_set(): - return _DUMMY_EAGER_GRAPH.learning_phase_is_set + return _DUMMY_EAGER_GRAPH.learning_phase_is_set def _mark_func_graph_as_unsaveable(graph, learning_phase): - """Mark func graph as unsaveable due to use of symbolic keras learning phase. + """Mark graph as unsaveable due to use of symbolic keras learning phase. - Functions that capture the symbolic learning phase cannot be exported to - SavedModel. Mark the funcgraph as unsaveable, so that an error will be raised - if it is exported. + Functions that capture the symbolic learning phase cannot be exported to + SavedModel. Mark the funcgraph as unsaveable, so that an error will be + raised if it is exported. - Args: - graph: Graph or FuncGraph object. 
- learning_phase: Learning phase placeholder or int defined in the graph. - """ - if graph.building_function and is_placeholder(learning_phase): - graph.mark_as_unsaveable( - 'The keras learning phase placeholder was used inside a function. ' - 'Exporting placeholders is not supported when saving out a SavedModel. ' - 'Please call `tf.keras.backend.set_learning_phase(0)` in the function ' - 'to set the learning phase to a constant value.') + Args: + graph: Graph or FuncGraph object. + learning_phase: Learning phase placeholder or int defined in the graph. + """ + if graph.building_function and is_placeholder(learning_phase): + graph.mark_as_unsaveable( + "The keras learning phase placeholder was used inside a function. " + "Exporting placeholders is not supported when saving out a " + "SavedModel. Please call `tf.keras.backend.set_learning_phase(0)` " + "in the function to set the learning phase to a constant value." + ) def symbolic_learning_phase(): - graph = get_graph() - with graph.as_default(): - if graph not in _GRAPH_LEARNING_PHASES: - phase = _default_learning_phase() - _internal_set_learning_phase(graph, phase) + graph = get_graph() + with graph.as_default(): + if graph not in _GRAPH_LEARNING_PHASES: + phase = _default_learning_phase() + _internal_set_learning_phase(graph, phase) - return _internal_get_learning_phase(graph) + return _internal_get_learning_phase(graph) def _internal_set_learning_phase(graph, value): - global _GRAPH_LEARNING_PHASES # pylint: disable=global-variable-not-assigned - - if isinstance(value, tf.Tensor): - # The 'value' here is a tf.Tensor with attribute 'graph'. - # There is a circular reference between key 'graph' and attribute 'graph'. - # So we need use a weakref.ref to refer to the 'value' tensor here. - # Otherwise, it would lead to memory leak. - value_ref = weakref.ref(value) - _GRAPH_LEARNING_PHASES[graph] = value_ref - else: - _GRAPH_LEARNING_PHASES[graph] = value + global _GRAPH_LEARNING_PHASES + + if isinstance(value, tf.Tensor): + # The 'value' here is a tf.Tensor with attribute 'graph'. + # There is a circular reference between key 'graph' and attribute + # 'graph'. So we need use a weakref.ref to refer to the 'value' tensor + # here. Otherwise, it would lead to memory leak. + value_ref = weakref.ref(value) + _GRAPH_LEARNING_PHASES[graph] = value_ref + else: + _GRAPH_LEARNING_PHASES[graph] = value def _internal_get_learning_phase(graph): - phase = _GRAPH_LEARNING_PHASES.get(graph, None) - if isinstance(phase, weakref.ref): - return phase() - else: - return phase + phase = _GRAPH_LEARNING_PHASES.get(graph, None) + if isinstance(phase, weakref.ref): + return phase() + else: + return phase def _default_learning_phase(): - if context.executing_eagerly(): - return 0 - else: - with name_scope(''): - return tf.compat.v1.placeholder_with_default( - False, shape=(), name='keras_learning_phase') + if context.executing_eagerly(): + return 0 + else: + with name_scope(""): + return tf.compat.v1.placeholder_with_default( + False, shape=(), name="keras_learning_phase" + ) -@keras_export('keras.backend.set_learning_phase') +@keras_export("keras.backend.set_learning_phase") @doc_controls.do_not_generate_docs def set_learning_phase(value): - """Sets the learning phase to a fixed value. + """Sets the learning phase to a fixed value. - The backend learning phase affects any code that calls - `backend.learning_phase()` - In particular, all Keras built-in layers use the learning phase as the default - for the `training` arg to `Layer.__call__`. 
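The weakref comment in `_internal_set_learning_phase` above deserves a concrete picture: the dictionary key is the tensor's own graph, so holding the tensor strongly would close a reference cycle (graph -> dict entry -> tensor -> graph) and leak both objects. A toy model of the pattern, with stand-in `Graph`/`Tensor` classes:

```python
import weakref

class Graph:  # stand-in for tf.Graph
    pass

class Tensor:  # stand-in for tf.Tensor, which carries a .graph attribute
    def __init__(self, graph):
        self.graph = graph

_phases = {}

def set_phase(graph, value):
    if isinstance(value, Tensor):
        _phases[graph] = weakref.ref(value)  # weak ref breaks the cycle
    else:
        _phases[graph] = value

def get_phase(graph):
    phase = _phases.get(graph)
    return phase() if isinstance(phase, weakref.ref) else phase

g = Graph()
t = Tensor(g)  # keep a strong reference so the weakref stays live
set_phase(g, t)
assert get_phase(g) is t
set_phase(g, 1)  # plain ints are stored directly
assert get_phase(g) == 1
```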
+ The backend learning phase affects any code that calls + `backend.learning_phase()` + In particular, all Keras built-in layers use the learning phase as the + default for the `training` arg to `Layer.__call__`. - User-written layers and models can achieve the same behavior with code that - looks like: + User-written layers and models can achieve the same behavior with code that + looks like: - ```python - def call(self, inputs, training=None): - if training is None: - training = backend.learning_phase() - ``` + ```python + def call(self, inputs, training=None): + if training is None: + training = backend.learning_phase() + ``` - Args: - value: Learning phase value, either 0 or 1 (integers). - 0 = test, 1 = train + Args: + value: Learning phase value, either 0 or 1 (integers). + 0 = test, 1 = train - Raises: - ValueError: if `value` is neither `0` nor `1`. - """ - warnings.warn('`tf.keras.backend.set_learning_phase` is deprecated and ' - 'will be removed after 2020-10-11. To update it, simply ' - 'pass a True/False value to the `training` argument of the ' - '`__call__` method of your layer or model.') - deprecated_internal_set_learning_phase(value) + Raises: + ValueError: if `value` is neither `0` nor `1`. + """ + warnings.warn( + "`tf.keras.backend.set_learning_phase` is deprecated and " + "will be removed after 2020-10-11. To update it, simply " + "pass a True/False value to the `training` argument of the " + "`__call__` method of your layer or model." + ) + deprecated_internal_set_learning_phase(value) def deprecated_internal_set_learning_phase(value): - """A deprecated internal implementation of set_learning_phase. + """A deprecated internal implementation of set_learning_phase. - This method is an internal-only version of `set_learning_phase` that - does not raise a deprecation error. It is required because - saved_model needs to keep working with user code that uses the deprecated - learning phase methods until those APIs are fully removed from the public API. + This method is an internal-only version of `set_learning_phase` that + does not raise a deprecation error. It is required because + saved_model needs to keep working with user code that uses the deprecated + learning phase methods until those APIs are fully removed from the public + API. - Specifically SavedModel saving needs to make sure the learning phase is 0 - during tracing even if users overwrote it to a different value. + Specifically SavedModel saving needs to make sure the learning phase is 0 + during tracing even if users overwrote it to a different value. - But, we don't want to raise deprecation warnings for users when savedmodel - sets learning phase just for compatibility with code that relied on - explicitly setting the learning phase for other values. + But, we don't want to raise deprecation warnings for users when savedmodel + sets learning phase just for compatibility with code that relied on + explicitly setting the learning phase for other values. - Args: - value: Learning phase value, either 0 or 1 (integers). 0 = test, 1 = train + Args: + value: Learning phase value, either 0 or 1 (integers). + 0 = test, 1 = train - Raises: - ValueError: if `value` is neither `0` nor `1`. - """ - if value not in {0, 1}: - raise ValueError('Expected learning phase to be 0 or 1.') - with tf.init_scope(): - if tf.executing_eagerly(): - # In an eager context, the learning phase values applies to both the eager - # context and the internal Keras graph. 
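For reference, the replacement pattern that this deprecation message points users to, as a minimal sketch (assumes TF 2.x):

```python
import tensorflow as tf

layer = tf.keras.layers.Dropout(0.5)
x = tf.ones((2, 4))

y_train = layer(x, training=True)   # dropout active
y_infer = layer(x, training=False)  # identity pass-through
```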
- _DUMMY_EAGER_GRAPH.learning_phase_is_set = True - _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, value) + Raises: + ValueError: if `value` is neither `0` nor `1`. + """ + if value not in {0, 1}: + raise ValueError("Expected learning phase to be 0 or 1.") + with tf.init_scope(): + if tf.executing_eagerly(): + # In an eager context, the learning phase value applies to both the + # eager context and the internal Keras graph. + _DUMMY_EAGER_GRAPH.learning_phase_is_set = True + _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, value) - _internal_set_learning_phase(get_graph(), value) + _internal_set_learning_phase(get_graph(), value) -@keras_export('keras.backend.learning_phase_scope') +@keras_export("keras.backend.learning_phase_scope") @tf_contextlib.contextmanager @doc_controls.do_not_generate_docs def learning_phase_scope(value): - """Provides a scope within which the learning phase is equal to `value`. + """Provides a scope within which the learning phase is equal to `value`. - The learning phase gets restored to its original value upon exiting the scope. + The learning phase gets restored to its original value upon exiting the + scope. - Args: - value: Learning phase value, either 0 or 1 (integers). - 0 = test, 1 = train + Args: + value: Learning phase value, either 0 or 1 (integers). + 0 = test, 1 = train - Yields: - None. - - Raises: - ValueError: if `value` is neither `0` nor `1`. - """ - warnings.warn( - '`tf.keras.backend.learning_phase_scope` is deprecated and ' - 'will be removed after 2020-10-11. To update it, simply ' - 'pass a True/False value to the `training` argument of the ' - '`__call__` method of your layer or model.', - stacklevel=2) - with deprecated_internal_learning_phase_scope(value): - try: - yield - finally: - pass + Yields: + None. + + Raises: + ValueError: if `value` is neither `0` nor `1`. + """ + warnings.warn( + "`tf.keras.backend.learning_phase_scope` is deprecated and " + "will be removed after 2020-10-11. To update it, simply " + "pass a True/False value to the `training` argument of the " + "`__call__` method of your layer or model.", + stacklevel=2, + ) + with deprecated_internal_learning_phase_scope(value): + try: + yield + finally: + pass @tf_contextlib.contextmanager def deprecated_internal_learning_phase_scope(value): - """An internal-only version of `learning_phase_scope`. + """An internal-only version of `learning_phase_scope`. - Unlike the public method, this method does not raise a deprecation warning. - This is needed because saved model saving needs to set learning phase - to maintain compatibility - with code that sets/gets the learning phase, but saved model - saving itself shouldn't raise a deprecation warning. + Unlike the public method, this method does not raise a deprecation warning. + This is needed because saved model saving needs to set learning phase + to maintain compatibility + with code that sets/gets the learning phase, but saved model + saving itself shouldn't raise a deprecation warning. - We can get rid of this method and its usages when the public API is - removed. + We can get rid of this method and its usages when the public API is + removed. - Args: - value: Learning phase value, either 0 or 1 (integers). 0 = test, 1 = train - - Yields: - None. + Args: + value: Learning phase value, either 0 or 1 (integers). + 0 = test, 1 = train - Raises: - ValueError: if `value` is neither `0` nor `1`.
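A quick usage sketch of the scope being reformatted here (it still works, but emits the deprecation warning above; assumes TF 2.x eager execution and the `keras` package from this repo):

```python
from keras import backend

with backend.learning_phase_scope(1):    # 1 = train
    assert backend.learning_phase() == 1
# On exit the previous phase is restored (the eager default is 0 = test).
assert backend.learning_phase() == 0
```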
- """ - global _GRAPH_LEARNING_PHASES # pylint: disable=global-variable-not-assigned - if value not in {0, 1}: - raise ValueError('Expected learning phase to be 0 or 1.') + Yields: + None. - with tf.init_scope(): - if tf.executing_eagerly(): - previous_eager_value = _internal_get_learning_phase( - _DUMMY_EAGER_GRAPH.key) - previous_graph_value = _internal_get_learning_phase(get_graph()) + Raises: + ValueError: if `value` is neither `0` nor `1`. + """ + global _GRAPH_LEARNING_PHASES + if value not in {0, 1}: + raise ValueError("Expected learning phase to be 0 or 1.") - learning_phase_previously_set = _DUMMY_EAGER_GRAPH.learning_phase_is_set - try: - deprecated_internal_set_learning_phase(value) - yield - finally: - # Restore learning phase to initial value. - if not learning_phase_previously_set: - _DUMMY_EAGER_GRAPH.learning_phase_is_set = False with tf.init_scope(): - if tf.executing_eagerly(): - if previous_eager_value is not None: - _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, - previous_eager_value) - elif _DUMMY_EAGER_GRAPH.key in _GRAPH_LEARNING_PHASES: - del _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH.key] + if tf.executing_eagerly(): + previous_eager_value = _internal_get_learning_phase( + _DUMMY_EAGER_GRAPH.key + ) + previous_graph_value = _internal_get_learning_phase(get_graph()) - graph = get_graph() - if previous_graph_value is not None: - _internal_set_learning_phase(graph, previous_graph_value) - elif graph in _GRAPH_LEARNING_PHASES: - del _GRAPH_LEARNING_PHASES[graph] + learning_phase_previously_set = _DUMMY_EAGER_GRAPH.learning_phase_is_set + try: + deprecated_internal_set_learning_phase(value) + yield + finally: + # Restore learning phase to initial value. + if not learning_phase_previously_set: + _DUMMY_EAGER_GRAPH.learning_phase_is_set = False + with tf.init_scope(): + if tf.executing_eagerly(): + if previous_eager_value is not None: + _internal_set_learning_phase( + _DUMMY_EAGER_GRAPH.key, previous_eager_value + ) + elif _DUMMY_EAGER_GRAPH.key in _GRAPH_LEARNING_PHASES: + del _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH.key] + + graph = get_graph() + if previous_graph_value is not None: + _internal_set_learning_phase(graph, previous_graph_value) + elif graph in _GRAPH_LEARNING_PHASES: + del _GRAPH_LEARNING_PHASES[graph] @tf_contextlib.contextmanager def eager_learning_phase_scope(value): - """Internal scope that sets the learning phase in eager / tf.function only. - - Args: - value: Learning phase value, either 0 or 1 (integers). - 0 = test, 1 = train - - Yields: - None. - - Raises: - ValueError: if `value` is neither `0` nor `1`. - """ - global _GRAPH_LEARNING_PHASES # pylint: disable=global-variable-not-assigned - assert value in {0, 1} - assert tf.compat.v1.executing_eagerly_outside_functions() - global_learning_phase_was_set = global_learning_phase_is_set() - if global_learning_phase_was_set: - previous_value = learning_phase() - try: - _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, value) - yield - finally: - # Restore learning phase to initial value or unset. + """Internal scope that sets the learning phase in eager / tf.function only. + + Args: + value: Learning phase value, either 0 or 1 (integers). + 0 = test, 1 = train + + Yields: + None. + + Raises: + ValueError: if `value` is neither `0` nor `1`. 
+ """ + global _GRAPH_LEARNING_PHASES + assert value in {0, 1} + assert tf.compat.v1.executing_eagerly_outside_functions() + global_learning_phase_was_set = global_learning_phase_is_set() if global_learning_phase_was_set: - _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, previous_value) - else: - del _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH.key] + previous_value = learning_phase() + try: + _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, value) + yield + finally: + # Restore learning phase to initial value or unset. + if global_learning_phase_was_set: + _internal_set_learning_phase(_DUMMY_EAGER_GRAPH.key, previous_value) + else: + del _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH.key] def _as_graph_element(obj): - """Convert `obj` to a graph element if possible, otherwise return `None`. + """Convert `obj` to a graph element if possible, otherwise return `None`. - Args: - obj: Object to convert. + Args: + obj: Object to convert. - Returns: - The result of `obj._as_graph_element()` if that method is available; - otherwise `None`. - """ - conv_fn = getattr(obj, '_as_graph_element', None) - if conv_fn and callable(conv_fn): - return conv_fn() - return None + Returns: + The result of `obj._as_graph_element()` if that method is available; + otherwise `None`. + """ + conv_fn = getattr(obj, "_as_graph_element", None) + if conv_fn and callable(conv_fn): + return conv_fn() + return None def _assert_same_graph(original_item, item): - """Fail if the 2 items are from different graphs. + """Fail if the 2 items are from different graphs. - Args: - original_item: Original item to check against. - item: Item to check. + Args: + original_item: Original item to check against. + item: Item to check. - Raises: - ValueError: if graphs do not match. - """ - original_graph = getattr(original_item, 'graph', None) - graph = getattr(item, 'graph', None) - if original_graph and graph and original_graph is not graph: - raise ValueError( - '%s must be from the same graph as %s (graphs are %s and %s).' % - (item, original_item, graph, original_graph)) + Raises: + ValueError: if graphs do not match. + """ + original_graph = getattr(original_item, "graph", None) + graph = getattr(item, "graph", None) + if original_graph and graph and original_graph is not graph: + raise ValueError( + "%s must be from the same graph as %s (graphs are %s and %s)." + % (item, original_item, graph, original_graph) + ) def _current_graph(op_input_list, graph=None): - """Returns the appropriate graph to use for the given inputs. - - This library method provides a consistent algorithm for choosing the graph - in which an Operation should be constructed: - - 1. If the default graph is being used to construct a function, we - use the default graph. - 2. If the "graph" is specified explicitly, we validate that all of the inputs - in "op_input_list" are compatible with that graph. - 3. Otherwise, we attempt to select a graph from the first Operation- - or Tensor-valued input in "op_input_list", and validate that all other - such inputs are in the same graph. - 4. If the graph was not specified and it could not be inferred from - "op_input_list", we attempt to use the default graph. - - Args: - op_input_list: A list of inputs to an operation, which may include `Tensor`, - `Operation`, and other objects that may be converted to a graph element. - graph: (Optional) The explicit graph to use. - - Raises: - TypeError: If op_input_list is not a list or tuple, or if graph is not a - Graph. 
- ValueError: If a graph is explicitly passed and not all inputs are from it, - or if the inputs are from multiple graphs, or we could not find a graph - and there was no default graph. - - Returns: - The appropriate graph to use for the given inputs. - - """ - current_default_graph = tf.compat.v1.get_default_graph() - if current_default_graph.building_function: - return current_default_graph - - op_input_list = tuple(op_input_list) # Handle generators correctly - if graph and not isinstance(graph, tf.Graph): - raise TypeError('Input graph needs to be a Graph: %s' % (graph,)) - - # 1. We validate that all of the inputs are from the same graph. This is - # either the supplied graph parameter, or the first one selected from one - # the graph-element-valued inputs. In the latter case, we hold onto - # that input in original_graph_element so we can provide a more - # informative error if a mismatch is found. - original_graph_element = None - for op_input in op_input_list: - # Determine if this is a valid graph_element. - # TODO(joshl): Note that we exclude subclasses of Tensor. Need to clean this - # up. - if (isinstance(op_input, ( - tf.Operation, tf.Tensor, tf.__internal__.CompositeTensor)) and - ((not isinstance(op_input, tf.Tensor)) - or type(op_input) == tf.Tensor)): # pylint: disable=unidiomatic-typecheck - graph_element = op_input - else: - graph_element = _as_graph_element(op_input) + """Returns the appropriate graph to use for the given inputs. + + This library method provides a consistent algorithm for choosing the graph + in which an Operation should be constructed: + + 1. If the default graph is being used to construct a function, we + use the default graph. + 2. If the "graph" is specified explicitly, we validate that all of the + inputs in "op_input_list" are compatible with that graph. + 3. Otherwise, we attempt to select a graph from the first Operation- + or Tensor-valued input in "op_input_list", and validate that all other + such inputs are in the same graph. + 4. If the graph was not specified and it could not be inferred from + "op_input_list", we attempt to use the default graph. + + Args: + op_input_list: A list of inputs to an operation, which may include + `Tensor`, `Operation`, and other objects that may be converted to a + graph element. + graph: (Optional) The explicit graph to use. + + Raises: + TypeError: If op_input_list is not a list or tuple, or if graph is not a + Graph. + ValueError: If a graph is explicitly passed and not all inputs are from + it, or if the inputs are from multiple graphs, or we could not find a + graph and there was no default graph. + + Returns: + The appropriate graph to use for the given inputs. + + """ + current_default_graph = tf.compat.v1.get_default_graph() + if current_default_graph.building_function: + return current_default_graph + + op_input_list = tuple(op_input_list) # Handle generators correctly + if graph and not isinstance(graph, tf.Graph): + raise TypeError(f"Input graph needs to be a Graph: {graph}") + + def _is_symbolic_tensor(tensor): + if hasattr(tf, "is_symbolic_tensor"): + return tf.is_symbolic_tensor(tensor) + return type(tensor) == tf.Tensor + + # 1. We validate that all of the inputs are from the same graph. This is + # either the supplied graph parameter, or the first one selected from one + # the graph-element-valued inputs. In the latter case, we hold onto + # that input in original_graph_element so we can provide a more + # informative error if a mismatch is found. 
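Beyond formatting, this hunk replaces the old inline exact-type check with a new `_is_symbolic_tensor` helper that prefers `tf.is_symbolic_tensor` when the running TF build provides it (a newer addition to TF's public API; the exact minimum version is not stated here) and otherwise falls back to the old behavior. The shim, extracted standalone:

```python
import tensorflow as tf

def is_symbolic(tensor):
    # Use the public predicate on TF builds that ship it ...
    if hasattr(tf, "is_symbolic_tensor"):
        return tf.is_symbolic_tensor(tensor)
    # ... else mimic the old exact-type check, which deliberately excludes
    # tf.Tensor subclasses (as the removed TODO(joshl) comment noted).
    return type(tensor) == tf.Tensor
```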
+ original_graph_element = None + for op_input in op_input_list: + if isinstance( + op_input, (tf.Operation, tf.__internal__.CompositeTensor) + ) or _is_symbolic_tensor(op_input): + graph_element = op_input + else: + graph_element = _as_graph_element(op_input) - if graph_element is not None: - if not graph: - original_graph_element = graph_element - graph = getattr(graph_element, 'graph', None) - elif original_graph_element is not None: - _assert_same_graph(original_graph_element, graph_element) - elif graph_element.graph is not graph: - raise ValueError('%s is not from the passed-in graph.' % graph_element) + if graph_element is not None: + if not graph: + original_graph_element = graph_element + graph = getattr(graph_element, "graph", None) + elif original_graph_element is not None: + _assert_same_graph(original_graph_element, graph_element) + elif graph_element.graph is not graph: + raise ValueError( + f"{graph_element} is not from the passed-in graph." + ) - # 2. If all else fails, we use the default graph, which is always there. - return graph or current_default_graph + # 2. If all else fails, we use the default graph, which is always there. + return graph or current_default_graph def _get_session(op_input_list=()): - """Returns the session object for the current thread.""" - global _SESSION - default_session = tf.compat.v1.get_default_session() - if default_session is not None: - session = default_session - else: - if tf.inside_function(): - raise RuntimeError('Cannot get session inside Tensorflow graph function.') - # If we don't have a session, or that session does not match the current - # graph, create and cache a new session. - if (getattr(_SESSION, 'session', None) is None or - _SESSION.session.graph is not _current_graph(op_input_list)): - # If we are creating the Session inside a tf.distribute.Strategy scope, - # we ask the strategy for the right session options to use. - if tf.distribute.has_strategy(): - configure_and_create_distributed_session( - tf.distribute.get_strategy()) - else: - _SESSION.session = tf.compat.v1.Session( - config=get_default_session_config()) - session = _SESSION.session - return session - - -@keras_export(v1=['keras.backend.get_session']) + """Returns the session object for the current thread.""" + global _SESSION + default_session = tf.compat.v1.get_default_session() + if default_session is not None: + session = default_session + else: + if tf.inside_function(): + raise RuntimeError( + "Cannot get session inside Tensorflow graph function." + ) + # If we don't have a session, or that session does not match the current + # graph, create and cache a new session. + if getattr( + _SESSION, "session", None + ) is None or _SESSION.session.graph is not _current_graph( + op_input_list + ): + # If we are creating the Session inside a tf.distribute.Strategy + # scope, we ask the strategy for the right session options to use. + if tf.distribute.has_strategy(): + configure_and_create_distributed_session( + tf.distribute.get_strategy() + ) + else: + _SESSION.session = tf.compat.v1.Session( + config=get_default_session_config() + ) + session = _SESSION.session + return session + + +@keras_export(v1=["keras.backend.get_session"]) def get_session(op_input_list=()): - """Returns the TF session to be used by the backend. + """Returns the TF session to be used by the backend. - If a default TensorFlow session is available, we will return it. + If a default TensorFlow session is available, we will return it. 
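`get_session` (continuing below) is only exported for the v1 API, so it needs graph mode. A minimal usage sketch, assuming TF 2.x with v1 compatibility behavior enabled:

```python
import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # session APIs are graph-mode only

# Returns the active default session if there is one, else a cached global
# session, initializing tracked-but-uninitialized Keras variables on the way.
sess = tf.compat.v1.keras.backend.get_session()
```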
- Else, we will return the global Keras session assuming it matches - the current graph. + Else, we will return the global Keras session assuming it matches + the current graph. - If no global Keras session exists at this point: - we will create a new global session. + If no global Keras session exists at this point: + we will create a new global session. - Note that you can manually set the global session - via `K.set_session(sess)`. + Note that you can manually set the global session + via `K.set_session(sess)`. + + Args: + op_input_list: An optional sequence of tensors or ops, which will be used + to determine the current graph. Otherwise the default graph will be + used. - Args: - op_input_list: An option sequence of tensors or ops, which will be used - to determine the current graph. Otherwise the default graph will be - used. + Returns: + A TensorFlow session. + """ + session = _get_session(op_input_list) + if not _MANUAL_VAR_INIT: + with session.graph.as_default(): + _initialize_variables(session) + return session - Returns: - A TensorFlow session. - """ - session = _get_session(op_input_list) - if not _MANUAL_VAR_INIT: - with session.graph.as_default(): - _initialize_variables(session) - return session # Inject the get_session function to keras_deps to remove the dependency # from TFLite to Keras. @@ -773,980 +797,1018 @@ def get_session(op_input_list=()): def get_graph(): - if tf.executing_eagerly(): - global _GRAPH - if not getattr(_GRAPH, 'graph', None): - _GRAPH.graph = tf.__internal__.FuncGraph('keras_graph') - return _GRAPH.graph - else: - return tf.compat.v1.get_default_graph() + if tf.executing_eagerly(): + global _GRAPH + if not getattr(_GRAPH, "graph", None): + _GRAPH.graph = tf.__internal__.FuncGraph("keras_graph") + return _GRAPH.graph + else: + return tf.compat.v1.get_default_graph() @tf_contextlib.contextmanager def _scratch_graph(graph=None): - """Retrieve a shared and temporary func graph. - - The eager execution path lifts a subgraph from the keras global graph into - a scratch graph in order to create a function. DistributionStrategies, in - turn, constructs multiple functions as well as a final combined function. In - order for that logic to work correctly, all of the functions need to be - created on the same scratch FuncGraph. - - Args: - graph: A graph to be used as the current scratch graph. If not set then - a scratch graph will either be retrieved or created: - - Yields: - The current scratch graph. - """ - global _CURRENT_SCRATCH_GRAPH - scratch_graph = getattr(_CURRENT_SCRATCH_GRAPH, 'graph', None) - # If scratch graph and `graph` are both configured, they must match. - if (scratch_graph is not None and graph is not None and - scratch_graph is not graph): - raise ValueError('Multiple scratch graphs specified.') - - if scratch_graph: - yield scratch_graph - return - - graph = graph or tf.__internal__.FuncGraph('keras_scratch_graph') - try: - _CURRENT_SCRATCH_GRAPH.graph = graph - yield graph - finally: - _CURRENT_SCRATCH_GRAPH.graph = None - - -@keras_export(v1=['keras.backend.set_session']) + """Retrieve a shared and temporary func graph. + + The eager execution path lifts a subgraph from the keras global graph into + a scratch graph in order to create a function. DistributionStrategies, in + turn, constructs multiple functions as well as a final combined function. In + order for that logic to work correctly, all of the functions need to be + created on the same scratch FuncGraph. + + Args: + graph: A graph to be used as the current scratch graph.
If not set then + a scratch graph will either be retrieved or created: + + Yields: + The current scratch graph. + """ + global _CURRENT_SCRATCH_GRAPH + scratch_graph = getattr(_CURRENT_SCRATCH_GRAPH, "graph", None) + # If scratch graph and `graph` are both configured, they must match. + if ( + scratch_graph is not None + and graph is not None + and scratch_graph is not graph + ): + raise ValueError("Multiple scratch graphs specified.") + + if scratch_graph: + yield scratch_graph + return + + graph = graph or tf.__internal__.FuncGraph("keras_scratch_graph") + try: + _CURRENT_SCRATCH_GRAPH.graph = graph + yield graph + finally: + _CURRENT_SCRATCH_GRAPH.graph = None + + +@keras_export(v1=["keras.backend.set_session"]) def set_session(session): - """Sets the global TensorFlow session. + """Sets the global TensorFlow session. - Args: - session: A TF Session. - """ - global _SESSION - _SESSION.session = session + Args: + session: A TF Session. + """ + global _SESSION + _SESSION.session = session def get_default_session_config(): - if os.environ.get('OMP_NUM_THREADS'): - logging.warning( - 'OMP_NUM_THREADS is no longer used by the default Keras config. ' - 'To configure the number of threads, use tf.config.threading APIs.') + if os.environ.get("OMP_NUM_THREADS"): + logging.warning( + "OMP_NUM_THREADS is no longer used by the default Keras config. " + "To configure the number of threads, use tf.config.threading APIs." + ) - config = get_config() - config.allow_soft_placement = True + config = get_config() + config.allow_soft_placement = True - return config + return config def get_default_graph_uid_map(): - graph = tf.compat.v1.get_default_graph() - name_uid_map = PER_GRAPH_OBJECT_NAME_UIDS.get(graph, None) - if name_uid_map is None: - name_uid_map = collections.defaultdict(int) - PER_GRAPH_OBJECT_NAME_UIDS[graph] = name_uid_map - return name_uid_map + graph = tf.compat.v1.get_default_graph() + name_uid_map = PER_GRAPH_OBJECT_NAME_UIDS.get(graph, None) + if name_uid_map is None: + name_uid_map = collections.defaultdict(int) + PER_GRAPH_OBJECT_NAME_UIDS[graph] = name_uid_map + return name_uid_map # DEVICE MANIPULATION class _TfDeviceCaptureOp: - """Class for capturing the TF device scope.""" + """Class for capturing the TF device scope.""" - def __init__(self): - self.device = None + def __init__(self): + self.device = None - def _set_device(self, device): - """This method captures TF's explicit device scope setting.""" - if isinstance(device, tf.DeviceSpec): - device = device.to_string() - self.device = device + def _set_device(self, device): + """This method captures TF's explicit device scope setting.""" + if isinstance(device, tf.DeviceSpec): + device = device.to_string() + self.device = device - def _set_device_from_string(self, device_str): - self.device = device_str + def _set_device_from_string(self, device_str): + self.device = device_str def _get_current_tf_device(): - """Return explicit device of current context, otherwise returns `None`. - - Returns: - If the current device scope is explicitly set, it returns a string with - the device (`CPU` or `GPU`). If the scope is not explicitly set, it will - return `None`. - """ - graph = get_graph() - op = _TfDeviceCaptureOp() - graph._apply_device_functions(op) - if tf.__internal__.tf2.enabled(): - return tf.DeviceSpec.from_string(op.device) - else: - return tf.compat.v1.DeviceSpec.from_string(op.device) + """Return explicit device of current context, otherwise returns `None`. 
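On the `OMP_NUM_THREADS` warning in `get_default_session_config` above: the suggested replacement is the `tf.config.threading` API. For reference (the thread counts below are arbitrary examples, and these calls must run before TensorFlow initializes its thread pools):

```python
import tensorflow as tf

tf.config.threading.set_intra_op_parallelism_threads(4)  # within a single op
tf.config.threading.set_inter_op_parallelism_threads(2)  # concurrent ops
```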
+ + Returns: + If the current device scope is explicitly set, it returns a string with + the device (`CPU` or `GPU`). If the scope is not explicitly set, it will + return `None`. + """ + graph = get_graph() + op = _TfDeviceCaptureOp() + graph._apply_device_functions(op) + if tf.__internal__.tf2.enabled(): + return tf.DeviceSpec.from_string(op.device) + else: + return tf.compat.v1.DeviceSpec.from_string(op.device) def _is_current_explicit_device(device_type): - """Check if the current device is explicitly set on the device type specified. + """Check if the current device is explicitly set to `device_type`. - Args: - device_type: A string containing `GPU` or `CPU` (case-insensitive). + Args: + device_type: A string containing `GPU` or `CPU` (case-insensitive). - Returns: - A boolean indicating if the current device scope is explicitly set on the - device type. + Returns: + A boolean indicating if the current device scope is explicitly set on + the device type. - Raises: - ValueError: If the `device_type` string indicates an unsupported device. - """ - device_type = device_type.upper() - if device_type not in ['CPU', 'GPU']: - raise ValueError('`device_type` should be either "CPU" or "GPU".') - device = _get_current_tf_device() - return device is not None and device.device_type == device_type.upper() + Raises: + ValueError: If the `device_type` string indicates an unsupported device. + """ + device_type = device_type.upper() + if device_type not in ["CPU", "GPU"]: + raise ValueError('`device_type` should be either "CPU" or "GPU".') + device = _get_current_tf_device() + return device is not None and device.device_type == device_type.upper() def _get_available_gpus(): - """Get a list of available GPU devices (formatted as strings). + """Get a list of available GPU devices (formatted as strings). - Returns: - A list of available GPU devices. - """ - if tf.compat.v1.executing_eagerly_outside_functions(): - # Returns names of devices directly. - return [d.name for d in tf.config.list_logical_devices('GPU')] + Returns: + A list of available GPU devices. + """ + if tf.compat.v1.executing_eagerly_outside_functions(): + # Returns names of devices directly. + return [d.name for d in tf.config.list_logical_devices("GPU")] - global _LOCAL_DEVICES - if _LOCAL_DEVICES is None: - _LOCAL_DEVICES = get_session().list_devices() - return [x.name for x in _LOCAL_DEVICES if x.device_type == 'GPU'] + global _LOCAL_DEVICES + if _LOCAL_DEVICES is None: + _LOCAL_DEVICES = get_session().list_devices() + return [x.name for x in _LOCAL_DEVICES if x.device_type == "GPU"] def _has_nchw_support(): - """Check whether the current scope supports NCHW ops. + """Check whether the current scope supports NCHW ops. - TensorFlow does not support NCHW on CPU. Therefore we check if we are not - explicitly put on - CPU, and have GPUs available. In this case there will be soft-placing on the - GPU device. + TensorFlow does not support NCHW on CPU. Therefore we check if we are not + explicitly put on + CPU, and have GPUs available. In this case there will be soft-placing on the + GPU device. 
- Returns: - bool: if the current scope device placement would support nchw - """ - explicitly_on_cpu = _is_current_explicit_device('CPU') - gpus_available = bool(_get_available_gpus()) - return not explicitly_on_cpu and gpus_available + Returns: + bool: if the current scope device placement would support nchw + """ + explicitly_on_cpu = _is_current_explicit_device("CPU") + gpus_available = bool(_get_available_gpus()) + return not explicitly_on_cpu and gpus_available # VARIABLE MANIPULATION def _constant_to_tensor(x, dtype): - """Convert the input `x` to a tensor of type `dtype`. + """Convert the input `x` to a tensor of type `dtype`. - This is slightly faster than the _to_tensor function, at the cost of - handling fewer cases. + This is slightly faster than the _to_tensor function, at the cost of + handling fewer cases. - Args: - x: An object to be converted (numpy arrays, floats, ints and lists of - them). - dtype: The destination type. + Args: + x: An object to be converted (numpy arrays, floats, ints and lists of + them). + dtype: The destination type. - Returns: - A tensor. - """ - return tf.constant(x, dtype=dtype) + Returns: + A tensor. + """ + return tf.constant(x, dtype=dtype) def _to_tensor(x, dtype): - """Convert the input `x` to a tensor of type `dtype`. + """Convert the input `x` to a tensor of type `dtype`. - Args: - x: An object to be converted (numpy array, list, tensors). - dtype: The destination type. + Args: + x: An object to be converted (numpy array, list, tensors). + dtype: The destination type. - Returns: - A tensor. - """ - return tf.convert_to_tensor(x, dtype=dtype) + Returns: + A tensor. + """ + return tf.convert_to_tensor(x, dtype=dtype) -@keras_export('keras.backend.is_sparse') +@keras_export("keras.backend.is_sparse") @doc_controls.do_not_generate_docs def is_sparse(tensor): - """Returns whether a tensor is a sparse tensor. + """Returns whether a tensor is a sparse tensor. - Args: - tensor: A tensor instance. + Args: + tensor: A tensor instance. - Returns: - A boolean. + Returns: + A boolean. - Example: + Example: - >>> a = tf.keras.backend.placeholder((2, 2), sparse=False) - >>> print(tf.keras.backend.is_sparse(a)) - False - >>> b = tf.keras.backend.placeholder((2, 2), sparse=True) - >>> print(tf.keras.backend.is_sparse(b)) - True + >>> a = tf.keras.backend.placeholder((2, 2), sparse=False) + >>> print(tf.keras.backend.is_sparse(a)) + False + >>> b = tf.keras.backend.placeholder((2, 2), sparse=True) + >>> print(tf.keras.backend.is_sparse(b)) + True - """ - spec = getattr(tensor, '_type_spec', None) - if spec is not None: - return isinstance(spec, tf.SparseTensorSpec) - return isinstance(tensor, tf.SparseTensor) + """ + spec = getattr(tensor, "_type_spec", None) + if spec is not None: + return isinstance(spec, tf.SparseTensorSpec) + return isinstance(tensor, tf.SparseTensor) -@keras_export('keras.backend.to_dense') +@keras_export("keras.backend.to_dense") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def to_dense(tensor): - """Converts a sparse tensor into a dense tensor and returns it. + """Converts a sparse tensor into a dense tensor and returns it. - Args: - tensor: A tensor instance (potentially sparse). + Args: + tensor: A tensor instance (potentially sparse). - Returns: - A dense tensor. + Returns: + A dense tensor. 
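The doctest examples around here use v1 placeholders; for completeness, the same `is_sparse`/`to_dense` pair works on concrete tensors in eager mode as well (a small sketch, assumes TF 2.x):

```python
import tensorflow as tf

st = tf.SparseTensor(indices=[[0, 0]], values=[1.0], dense_shape=[2, 2])
print(tf.keras.backend.is_sparse(st))     # True

dense = tf.keras.backend.to_dense(st)
print(tf.keras.backend.is_sparse(dense))  # False
```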
- Examples: + Examples: - >>> b = tf.keras.backend.placeholder((2, 2), sparse=True) - >>> print(tf.keras.backend.is_sparse(b)) - True - >>> c = tf.keras.backend.to_dense(b) - >>> print(tf.keras.backend.is_sparse(c)) - False + >>> b = tf.keras.backend.placeholder((2, 2), sparse=True) + >>> print(tf.keras.backend.is_sparse(b)) + True + >>> c = tf.keras.backend.to_dense(b) + >>> print(tf.keras.backend.is_sparse(c)) + False - """ - if is_sparse(tensor): - return tf.sparse.to_dense(tensor) - else: - return tensor + """ + if is_sparse(tensor): + return tf.sparse.to_dense(tensor) + else: + return tensor -@keras_export('keras.backend.name_scope', v1=[]) +@keras_export("keras.backend.name_scope", v1=[]) @doc_controls.do_not_generate_docs def name_scope(name): - """A context manager for use when defining a Python op. + """A context manager for use when defining a Python op. - This context manager pushes a name scope, which will make the name of all - operations added within it have a prefix. + This context manager pushes a name scope, which will make the name of all + operations added within it have a prefix. - For example, to define a new Python op called `my_op`: + For example, to define a new Python op called `my_op`: - def my_op(a): - with tf.name_scope("MyOp") as scope: - a = tf.convert_to_tensor(a, name="a") - # Define some computation that uses `a`. - return foo_op(..., name=scope) + def my_op(a): + with tf.name_scope("MyOp") as scope: + a = tf.convert_to_tensor(a, name="a") + # Define some computation that uses `a`. + return foo_op(..., name=scope) - When executed, the Tensor `a` will have the name `MyOp/a`. + When executed, the Tensor `a` will have the name `MyOp/a`. + + Args: + name: The prefix to use on all names created within the name scope. - Args: - name: The prefix to use on all names created within the name scope. + Returns: + Name scope context manager. + """ + return tf.name_scope(name) - Returns: - Name scope context manager. - """ - return tf.name_scope(name) # Export V1 version. _v1_name_scope = tf.compat.v1.name_scope -keras_export(v1=['keras.backend.name_scope'], allow_multiple_exports=True)(_v1_name_scope) +keras_export(v1=["keras.backend.name_scope"])(_v1_name_scope) -@keras_export('keras.backend.variable') +@keras_export("keras.backend.variable") @doc_controls.do_not_generate_docs def variable(value, dtype=None, name=None, constraint=None): - """Instantiates a variable and returns it. - - Args: - value: Numpy array, initial value of the tensor. - dtype: Tensor type. - name: Optional name string for the tensor. - constraint: Optional projection function to be - applied to the variable after an optimizer update. - - Returns: - A variable instance (with Keras metadata included). - - Examples: - - >>> val = np.array([[1, 2], [3, 4]]) - >>> kvar = tf.keras.backend.variable(value=val, dtype='float64', - ... name='example_var') - >>> tf.keras.backend.dtype(kvar) - 'float64' - >>> print(kvar) - - - """ - if dtype is None: - dtype = floatx() - if hasattr(value, 'tocoo'): - sparse_coo = value.tocoo() - indices = np.concatenate((np.expand_dims(sparse_coo.row, 1), np.expand_dims( - sparse_coo.col, 1)), 1) - v = tf.SparseTensor( - indices=indices, values=sparse_coo.data, dense_shape=sparse_coo.shape) - v._keras_shape = sparse_coo.shape + """Instantiates a variable and returns it. + + Args: + value: Numpy array, initial value of the tensor. + dtype: Tensor type. + name: Optional name string for the tensor. 
+ constraint: Optional projection function to be + applied to the variable after an optimizer update. + + Returns: + A variable instance (with Keras metadata included). + + Examples: + + >>> val = np.array([[1, 2], [3, 4]]) + >>> kvar = tf.keras.backend.variable(value=val, dtype='float64', + ... name='example_var') + >>> tf.keras.backend.dtype(kvar) + 'float64' + >>> print(kvar) + + + """ + if dtype is None: + dtype = floatx() + if hasattr(value, "tocoo"): + sparse_coo = value.tocoo() + indices = np.concatenate( + ( + np.expand_dims(sparse_coo.row, 1), + np.expand_dims(sparse_coo.col, 1), + ), + 1, + ) + v = tf.SparseTensor( + indices=indices, + values=sparse_coo.data, + dense_shape=sparse_coo.shape, + ) + v._keras_shape = sparse_coo.shape + return v + v = tf.Variable( + value, dtype=tf.as_dtype(dtype), name=name, constraint=constraint + ) + if isinstance(value, np.ndarray): + v._keras_shape = value.shape + elif hasattr(value, "shape"): + v._keras_shape = int_shape(value) + track_variable(v) return v - v = tf.Variable( - value, - dtype=tf.as_dtype(dtype), - name=name, - constraint=constraint) - if isinstance(value, np.ndarray): - v._keras_shape = value.shape - elif hasattr(value, 'shape'): - v._keras_shape = int_shape(value) - track_variable(v) - return v def track_tf_optimizer(tf_optimizer): - """Tracks the given TF optimizer for initialization of its variables.""" - if tf.executing_eagerly(): - return - optimizers = _GRAPH_TF_OPTIMIZERS[None] - optimizers.add(tf_optimizer) + """Tracks the given TF optimizer for initialization of its variables.""" + if tf.executing_eagerly(): + return + optimizers = _GRAPH_TF_OPTIMIZERS[None] + optimizers.add(tf_optimizer) -@keras_export('keras.__internal__.backend.track_variable', v1=[]) +@keras_export("keras.__internal__.backend.track_variable", v1=[]) def track_variable(v): - """Tracks the given variable for initialization.""" - if tf.executing_eagerly(): - return - graph = v.graph if hasattr(v, 'graph') else get_graph() - _GRAPH_VARIABLES[graph].add(v) + """Tracks the given variable for initialization.""" + if tf.executing_eagerly(): + return + graph = v.graph if hasattr(v, "graph") else get_graph() + _GRAPH_VARIABLES[graph].add(v) def observe_object_name(name): - """Observe a name and make sure it won't be used by `unique_object_name`.""" - OBSERVED_NAMES.add(name) - - -def unique_object_name(name, - name_uid_map=None, - avoid_names=None, - namespace='', - zero_based=False, - avoid_observed_names=False): - """Makes a object name (or arbitrary string) unique within a TensorFlow graph. - - Args: - name: String name to make unique. - name_uid_map: An optional defaultdict(int) to use when creating unique - names. If None (default), uses a per-Graph dictionary. - avoid_names: An optional set or dict with names which should not be used. If - None (default), don't avoid any names unless `avoid_observed_names` is - True. - namespace: Gets a name which is unique within the (graph, namespace). Layers - which are not Networks use a blank namespace and so get graph-global - names. - zero_based: If True, name sequences start with no suffix (e.g. "dense", - "dense_1"). If False, naming is one-based ("dense_1", "dense_2"). - avoid_observed_names: If True, avoid any names that have been observed by - `backend.observe_object_name`. - - Returns: - Unique string name. 
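Stepping back to `variable()` above: the `hasattr(value, "tocoo")` branch means SciPy sparse matrices become `tf.SparseTensor`s rather than dense variables. A small sketch (assumes SciPy is installed and TF 2.x):

```python
import numpy as np
import tensorflow as tf
from scipy import sparse

m = sparse.csr_matrix(np.eye(3))         # any matrix exposing .tocoo()
kvar = tf.keras.backend.variable(m)
print(tf.keras.backend.is_sparse(kvar))  # True: a tf.SparseTensor, not a tf.Variable
```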
- - Example: - - - unique_object_name('dense') # dense_1 - unique_object_name('dense') # dense_2 - - """ - if name_uid_map is None: - name_uid_map = get_default_graph_uid_map() - if avoid_names is None: - if avoid_observed_names: - avoid_names = OBSERVED_NAMES - else: - avoid_names = set() - proposed_name = None - while proposed_name is None or proposed_name in avoid_names: - name_key = (namespace, name) - if zero_based: - number = name_uid_map[name_key] - if number: - proposed_name = name + '_' + str(number) - else: - proposed_name = name - name_uid_map[name_key] += 1 - else: - name_uid_map[name_key] += 1 - proposed_name = name + '_' + str(name_uid_map[name_key]) - return proposed_name + """Observe a name and make sure it won't be used by `unique_object_name`.""" + OBSERVED_NAMES.add(name) + + +def unique_object_name( + name, + name_uid_map=None, + avoid_names=None, + namespace="", + zero_based=False, + avoid_observed_names=False, +): + """Makes an object name (or any string) unique within a Keras session. + + Args: + name: String name to make unique. + name_uid_map: An optional defaultdict(int) to use when creating unique + names. If None (default), uses a per-Graph dictionary. + avoid_names: An optional set or dict with names which should not be used. + If None (default), don't avoid any names unless `avoid_observed_names` + is True. + namespace: Gets a name which is unique within the (graph, namespace). + Layers which are not Networks use a blank namespace and so get + graph-global names. + zero_based: If True, name sequences start with no suffix (e.g. "dense", + "dense_1"). If False, naming is one-based ("dense_1", "dense_2"). + avoid_observed_names: If True, avoid any names that have been observed by + `backend.observe_object_name`. + + Returns: + Unique string name.
+ + Example: + + + unique_object_name('dense') # dense_1 + unique_object_name('dense') # dense_2 + + """ + if name_uid_map is None: + name_uid_map = get_default_graph_uid_map() + if avoid_names is None: + if avoid_observed_names: + avoid_names = OBSERVED_NAMES + else: + avoid_names = set() + proposed_name = None + while proposed_name is None or proposed_name in avoid_names: + name_key = (namespace, name) + if zero_based: + number = name_uid_map[name_key] + if number: + proposed_name = name + "_" + str(number) + else: + proposed_name = name + name_uid_map[name_key] += 1 + else: + name_uid_map[name_key] += 1 + proposed_name = name + "_" + str(name_uid_map[name_key]) + return proposed_name def _get_variables(graph=None): - """Returns variables corresponding to the given graph for initialization.""" - assert not tf.executing_eagerly() - variables = _GRAPH_VARIABLES[graph] - for opt in _GRAPH_TF_OPTIMIZERS[graph]: - variables.update(opt.optimizer.variables()) - return variables + """Returns variables corresponding to the given graph for initialization.""" + assert not tf.executing_eagerly() + variables = _GRAPH_VARIABLES[graph] + for opt in _GRAPH_TF_OPTIMIZERS[graph]: + variables.update(opt.optimizer.variables()) + return variables -@keras_export('keras.__internal__.backend.initialize_variables', v1=[]) +@keras_export("keras.__internal__.backend.initialize_variables", v1=[]) def _initialize_variables(session): - """Utility to initialize uninitialized variables on the fly.""" - variables = _get_variables(get_graph()) - candidate_vars = [] - for v in variables: - if not getattr(v, '_keras_initialized', False): - candidate_vars.append(v) - if candidate_vars: - # This step is expensive, so we only run it on variables not already - # marked as initialized. - is_initialized = session.run( - [tf.compat.v1.is_variable_initialized(v) for v in candidate_vars]) - # TODO(kathywu): Some metric variables loaded from SavedModel are never - # actually used, and do not have an initializer. - should_be_initialized = [ - (not is_initialized[n]) and v.initializer is not None - for n, v in enumerate(candidate_vars)] - uninitialized_vars = [] - for flag, v in zip(should_be_initialized, candidate_vars): - if flag: - uninitialized_vars.append(v) - v._keras_initialized = True - if uninitialized_vars: - session.run(tf.compat.v1.variables_initializer(uninitialized_vars)) - - -@keras_export('keras.backend.constant') + """Utility to initialize uninitialized variables on the fly.""" + variables = _get_variables(get_graph()) + candidate_vars = [] + for v in variables: + if not getattr(v, "_keras_initialized", False): + candidate_vars.append(v) + if candidate_vars: + # This step is expensive, so we only run it on variables not already + # marked as initialized. + is_initialized = session.run( + [tf.compat.v1.is_variable_initialized(v) for v in candidate_vars] + ) + # TODO(kathywu): Some metric variables loaded from SavedModel are never + # actually used, and do not have an initializer. 
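To make the `unique_object_name` doctest above concrete, including the `zero_based` flag (run from this repo; counters are kept per graph and namespace, so the results assume a fresh session):

```python
from keras import backend

backend.unique_object_name("dense")                  # 'dense_1' (one-based)
backend.unique_object_name("dense")                  # 'dense_2'
backend.unique_object_name("conv", zero_based=True)  # 'conv' (no suffix first)
backend.unique_object_name("conv", zero_based=True)  # 'conv_1'
```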
+ should_be_initialized = [ + (not is_initialized[n]) and v.initializer is not None + for n, v in enumerate(candidate_vars) + ] + uninitialized_vars = [] + for flag, v in zip(should_be_initialized, candidate_vars): + if flag: + uninitialized_vars.append(v) + v._keras_initialized = True + if uninitialized_vars: + session.run(tf.compat.v1.variables_initializer(uninitialized_vars)) + + +@keras_export("keras.backend.constant") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def constant(value, dtype=None, shape=None, name=None): - """Creates a constant tensor. + """Creates a constant tensor. - Args: - value: A constant value (or list) - dtype: The type of the elements of the resulting tensor. - shape: Optional dimensions of resulting tensor. - name: Optional name for the tensor. + Args: + value: A constant value (or list) + dtype: The type of the elements of the resulting tensor. + shape: Optional dimensions of resulting tensor. + name: Optional name for the tensor. - Returns: - A Constant Tensor. - """ - if dtype is None: - dtype = floatx() + Returns: + A Constant Tensor. + """ + if dtype is None: + dtype = floatx() - return tf.constant(value, dtype=dtype, shape=shape, name=name) + return tf.constant(value, dtype=dtype, shape=shape, name=name) -@keras_export('keras.backend.is_keras_tensor') +@keras_export("keras.backend.is_keras_tensor") def is_keras_tensor(x): - """Returns whether `x` is a Keras tensor. - - A "Keras tensor" is a tensor that was returned by a Keras layer, - (`Layer` class) or by `Input`. - - Args: - x: A candidate tensor. - - Returns: - A boolean: Whether the argument is a Keras tensor. - - Raises: - ValueError: In case `x` is not a symbolic tensor. - - Examples: - - >>> np_var = np.array([1, 2]) - >>> # A numpy array is not a symbolic tensor. - >>> tf.keras.backend.is_keras_tensor(np_var) - Traceback (most recent call last): - ... - ValueError: Unexpectedly found an instance of type ``. - Expected a symbolic tensor instance. - >>> keras_var = tf.keras.backend.variable(np_var) - >>> # A variable created with the keras backend is not a Keras tensor. - >>> tf.keras.backend.is_keras_tensor(keras_var) - False - >>> keras_placeholder = tf.keras.backend.placeholder(shape=(2, 4, 5)) - >>> # A placeholder is a Keras tensor. - >>> tf.keras.backend.is_keras_tensor(keras_placeholder) - True - >>> keras_input = tf.keras.layers.Input([10]) - >>> # An Input is a Keras tensor. - >>> tf.keras.backend.is_keras_tensor(keras_input) - True - >>> keras_layer_output = tf.keras.layers.Dense(10)(keras_input) - >>> # Any Keras layer output is a Keras tensor. - >>> tf.keras.backend.is_keras_tensor(keras_layer_output) - True - - """ - if not isinstance(x, - (tf.Tensor, tf.Variable, - tf.SparseTensor, tf.RaggedTensor, - keras_tensor.KerasTensor)): - raise ValueError('Unexpectedly found an instance of type `' + str(type(x)) + - '`. Expected a symbolic tensor instance.') - if tf.compat.v1.executing_eagerly_outside_functions(): - return isinstance(x, keras_tensor.KerasTensor) - return hasattr(x, '_keras_history') - - -@keras_export('keras.backend.placeholder') -@doc_controls.do_not_generate_docs -def placeholder(shape=None, - ndim=None, - dtype=None, - sparse=False, - name=None, - ragged=False): - """Instantiates a placeholder tensor and returns it. - - Args: - shape: Shape of the placeholder - (integer tuple, may include `None` entries). - ndim: Number of axes of the tensor. - At least one of {`shape`, `ndim`} must be specified. - If both are specified, `shape` is used. 
- dtype: Placeholder type. - sparse: Boolean, whether the placeholder should have a sparse type. - name: Optional name string for the placeholder. - ragged: Boolean, whether the placeholder should have a ragged type. - In this case, values of 'None' in the 'shape' argument represent - ragged dimensions. For more information about RaggedTensors, see this - [guide](https://www.tensorflow.org/guide/ragged_tensors). - - Raises: - ValueError: If called with sparse = True and ragged = True. - - Returns: - Tensor instance (with Keras metadata included). - - Examples: - - - >>> input_ph = tf.keras.backend.placeholder(shape=(2, 4, 5)) - >>> input_ph - - - """ - if sparse and ragged: - raise ValueError( - 'Cannot set both sparse and ragged to True when creating a placeholder.' - ) - if dtype is None: - dtype = floatx() - if not shape: - if ndim: - shape = (None,) * ndim - if tf.compat.v1.executing_eagerly_outside_functions(): - if sparse: - spec = tf.SparseTensorSpec( - shape=shape, dtype=dtype) - elif ragged: - ragged_rank = 0 - for i in range(1, len(shape)): - # Hacky because could be tensorshape or tuple maybe? - # Or just tensorshape? - if shape[i] is None or ( - hasattr(shape[i], 'value') and - shape[i].value is None): - ragged_rank = i - spec = tf.RaggedTensorSpec( - shape=shape, dtype=dtype, ragged_rank=ragged_rank) - else: - spec = tf.TensorSpec( - shape=shape, dtype=dtype, name=name) - x = keras_tensor.keras_tensor_from_type_spec(spec, name=name) - else: - with get_graph().as_default(): - if sparse: - x = tf.compat.v1.sparse_placeholder(dtype, shape=shape, name=name) - elif ragged: - ragged_rank = 0 - for i in range(1, len(shape)): - if shape[i] is None: - ragged_rank = i - type_spec = tf.RaggedTensorSpec( - shape=shape, dtype=dtype, ragged_rank=ragged_rank) - def tensor_spec_to_placeholder(tensorspec): - return tf.compat.v1.placeholder(tensorspec.dtype, tensorspec.shape) - x = tf.nest.map_structure(tensor_spec_to_placeholder, type_spec, - expand_composites=True) - else: - x = tf.compat.v1.placeholder(dtype, shape=shape, name=name) - - if tf.executing_eagerly(): - # Add keras_history connectivity information to the placeholder - # when the placeholder is built in a top-level eager context - # (intended to be used with keras.backend.function) - from keras.engine import input_layer # pylint: disable=g-import-not-at-top - x = input_layer.Input(tensor=x) - x._is_backend_placeholder = True - - return x + """Returns whether `x` is a Keras tensor. + A "Keras tensor" is a tensor that was returned by a Keras layer, + (`Layer` class) or by `Input`. -def is_placeholder(x): - """Returns whether `x` is a placeholder. + Args: + x: A candidate tensor. + + Returns: + A boolean: Whether the argument is a Keras tensor. + + Raises: + ValueError: In case `x` is not a symbolic tensor. + + Examples: + + >>> np_var = np.array([1, 2]) + >>> # A numpy array is not a symbolic tensor. + >>> tf.keras.backend.is_keras_tensor(np_var) + Traceback (most recent call last): + ... + ValueError: Unexpectedly found an instance of type + ``. + Expected a symbolic tensor instance. + >>> keras_var = tf.keras.backend.variable(np_var) + >>> # A variable created with the keras backend is not a Keras tensor. + >>> tf.keras.backend.is_keras_tensor(keras_var) + False + >>> keras_placeholder = tf.keras.backend.placeholder(shape=(2, 4, 5)) + >>> # A placeholder is a Keras tensor. + >>> tf.keras.backend.is_keras_tensor(keras_placeholder) + True + >>> keras_input = tf.keras.layers.Input([10]) + >>> # An Input is a Keras tensor. 
+ >>> tf.keras.backend.is_keras_tensor(keras_input) + True + >>> keras_layer_output = tf.keras.layers.Dense(10)(keras_input) + >>> # Any Keras layer output is a Keras tensor. + >>> tf.keras.backend.is_keras_tensor(keras_layer_output) + True + + """ + if not isinstance( + x, + ( + tf.Tensor, + tf.Variable, + tf.SparseTensor, + tf.RaggedTensor, + keras_tensor.KerasTensor, + ), + ): + raise ValueError( + "Unexpectedly found an instance of type `" + + str(type(x)) + + "`. Expected a symbolic tensor instance." + ) + if tf.compat.v1.executing_eagerly_outside_functions(): + return isinstance(x, keras_tensor.KerasTensor) + return hasattr(x, "_keras_history") + + +@keras_export("keras.backend.placeholder") +@doc_controls.do_not_generate_docs +def placeholder( + shape=None, ndim=None, dtype=None, sparse=False, name=None, ragged=False +): + """Instantiates a placeholder tensor and returns it. + + Args: + shape: Shape of the placeholder + (integer tuple, may include `None` entries). + ndim: Number of axes of the tensor. + At least one of {`shape`, `ndim`} must be specified. + If both are specified, `shape` is used. + dtype: Placeholder type. + sparse: Boolean, whether the placeholder should have a sparse type. + name: Optional name string for the placeholder. + ragged: Boolean, whether the placeholder should have a ragged type. + In this case, values of 'None' in the 'shape' argument represent + ragged dimensions. For more information about RaggedTensors, see + this [guide](https://www.tensorflow.org/guide/ragged_tensor). + + Raises: + ValueError: If called with sparse = True and ragged = True. + + Returns: + Tensor instance (with Keras metadata included). + + Examples: - Args: - x: A candidate placeholder. - Returns: - Boolean. - """ - try: + >>> input_ph = tf.keras.backend.placeholder(shape=(2, 4, 5)) + >>> input_ph + + + """ + if sparse and ragged: + raise ValueError( + "Cannot set both sparse and ragged to " + "True when creating a placeholder." + ) + if dtype is None: + dtype = floatx() + if not shape: + if ndim: + shape = (None,) * ndim if tf.compat.v1.executing_eagerly_outside_functions(): - return hasattr(x, '_is_backend_placeholder') - from keras.utils import tf_utils # pylint: disable=g-import-not-at-top - if tf_utils.is_extension_type(x): - flat_components = tf.nest.flatten(x, expand_composites=True) - return py_any(is_placeholder(c) for c in flat_components) + if sparse: + spec = tf.SparseTensorSpec(shape=shape, dtype=dtype) + elif ragged: + ragged_rank = 0 + for i in range(1, len(shape)): + # Hacky because could be tensorshape or tuple maybe? + # Or just tensorshape? 
+ if shape[i] is None or ( + hasattr(shape[i], "value") and shape[i].value is None + ): + ragged_rank = i + spec = tf.RaggedTensorSpec( + shape=shape, dtype=dtype, ragged_rank=ragged_rank + ) + else: + spec = tf.TensorSpec(shape=shape, dtype=dtype, name=name) + x = keras_tensor.keras_tensor_from_type_spec(spec, name=name) else: - return x.op.type == 'Placeholder' - except AttributeError: - return False + with get_graph().as_default(): + if sparse: + x = tf.compat.v1.sparse_placeholder( + dtype, shape=shape, name=name + ) + elif ragged: + ragged_rank = 0 + for i in range(1, len(shape)): + if shape[i] is None: + ragged_rank = i + type_spec = tf.RaggedTensorSpec( + shape=shape, dtype=dtype, ragged_rank=ragged_rank + ) + + def tensor_spec_to_placeholder(tensorspec): + return tf.compat.v1.placeholder( + tensorspec.dtype, tensorspec.shape + ) + + x = tf.nest.map_structure( + tensor_spec_to_placeholder, + type_spec, + expand_composites=True, + ) + else: + x = tf.compat.v1.placeholder(dtype, shape=shape, name=name) + + if tf.executing_eagerly(): + # Add keras_history connectivity information to the placeholder + # when the placeholder is built in a top-level eager context + # (intended to be used with keras.backend.function) + from keras.engine import ( + input_layer, + ) + + x = input_layer.Input(tensor=x) + x._is_backend_placeholder = True + + return x -@keras_export('keras.backend.shape') +def is_placeholder(x): + """Returns whether `x` is a placeholder. + + Args: + x: A candidate placeholder. + + Returns: + Boolean. + """ + try: + if tf.compat.v1.executing_eagerly_outside_functions(): + return hasattr(x, "_is_backend_placeholder") + + # TODO(b/246438937): Remove the special case for tf.Variable once + # tf.Variable becomes CompositeTensor and will be expanded into + # dt_resource tensors. + if tf_utils.is_extension_type(x) and not isinstance(x, tf.Variable): + flat_components = tf.nest.flatten(x, expand_composites=True) + return py_any(is_placeholder(c) for c in flat_components) + else: + return x.op.type == "Placeholder" + except AttributeError: + return False + + +@keras_export("keras.backend.shape") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def shape(x): - """Returns the symbolic shape of a tensor or variable. + """Returns the symbolic shape of a tensor or variable. - Args: - x: A tensor or variable. + Args: + x: A tensor or variable. - Returns: - A symbolic shape (which is itself a tensor). + Returns: + A symbolic shape (which is itself a tensor). - Examples: + Examples: - >>> val = np.array([[1, 2], [3, 4]]) - >>> kvar = tf.keras.backend.variable(value=val) - >>> tf.keras.backend.shape(kvar) - - >>> input = tf.keras.backend.placeholder(shape=(2, 4, 5)) - >>> tf.keras.backend.shape(input) - + >>> val = np.array([[1, 2], [3, 4]]) + >>> kvar = tf.keras.backend.variable(value=val) + >>> tf.keras.backend.shape(kvar) + + >>> input = tf.keras.backend.placeholder(shape=(2, 4, 5)) + >>> tf.keras.backend.shape(input) + - """ - return tf.shape(x) + """ + return tf.shape(x) -@keras_export('keras.backend.int_shape') +@keras_export("keras.backend.int_shape") @doc_controls.do_not_generate_docs def int_shape(x): - """Returns the shape of tensor or variable as a tuple of int or None entries. - - Args: - x: Tensor or variable. - - Returns: - A tuple of integers (or None entries). 
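Rounding out `placeholder` above: its docstring example is dense-only, so here is a small ragged sketch (assumes TF 2.x, where the ragged branch builds a `RaggedTensorSpec` and returns a `KerasTensor`); `None` entries after the first axis are what drive the `ragged_rank` scan in the code:

```python
import tensorflow as tf

rt_ph = tf.keras.backend.placeholder(shape=(2, None), ragged=True)
print(rt_ph.shape)  # (2, None)
```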
- - Examples: - - >>> input = tf.keras.backend.placeholder(shape=(2, 4, 5)) - >>> tf.keras.backend.int_shape(input) - (2, 4, 5) - >>> val = np.array([[1, 2], [3, 4]]) - >>> kvar = tf.keras.backend.variable(value=val) - >>> tf.keras.backend.int_shape(kvar) - (2, 2) - - """ - try: - shape = x.shape - if not isinstance(shape, tuple): - shape = tuple(shape.as_list()) - return shape - except ValueError: - return None + """Returns shape of tensor/variable as a tuple of int/None entries. + + Args: + x: Tensor or variable. + Returns: + A tuple of integers (or None entries). + + Examples: + + >>> input = tf.keras.backend.placeholder(shape=(2, 4, 5)) + >>> tf.keras.backend.int_shape(input) + (2, 4, 5) + >>> val = np.array([[1, 2], [3, 4]]) + >>> kvar = tf.keras.backend.variable(value=val) + >>> tf.keras.backend.int_shape(kvar) + (2, 2) + + """ + try: + shape = x.shape + if not isinstance(shape, tuple): + shape = tuple(shape.as_list()) + return shape + except ValueError: + return None -@keras_export('keras.backend.ndim') + +@keras_export("keras.backend.ndim") @doc_controls.do_not_generate_docs def ndim(x): - """Returns the number of axes in a tensor, as an integer. + """Returns the number of axes in a tensor, as an integer. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - Integer (scalar), number of axes. + Returns: + Integer (scalar), number of axes. - Examples: + Examples: - >>> input = tf.keras.backend.placeholder(shape=(2, 4, 5)) - >>> val = np.array([[1, 2], [3, 4]]) - >>> kvar = tf.keras.backend.variable(value=val) - >>> tf.keras.backend.ndim(input) - 3 - >>> tf.keras.backend.ndim(kvar) - 2 + >>> input = tf.keras.backend.placeholder(shape=(2, 4, 5)) + >>> val = np.array([[1, 2], [3, 4]]) + >>> kvar = tf.keras.backend.variable(value=val) + >>> tf.keras.backend.ndim(input) + 3 + >>> tf.keras.backend.ndim(kvar) + 2 - """ - return x.shape.rank + """ + return x.shape.rank -@keras_export('keras.backend.dtype') +@keras_export("keras.backend.dtype") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def dtype(x): - """Returns the dtype of a Keras tensor or variable, as a string. - - Args: - x: Tensor or variable. - - Returns: - String, dtype of `x`. + """Returns the dtype of a Keras tensor or variable, as a string. - Examples: + Args: + x: Tensor or variable. - >>> tf.keras.backend.dtype(tf.keras.backend.placeholder(shape=(2,4,5))) - 'float32' - >>> tf.keras.backend.dtype(tf.keras.backend.placeholder(shape=(2,4,5), - ... dtype='float32')) - 'float32' - >>> tf.keras.backend.dtype(tf.keras.backend.placeholder(shape=(2,4,5), - ... dtype='float64')) - 'float64' - >>> kvar = tf.keras.backend.variable(np.array([[1, 2], [3, 4]])) - >>> tf.keras.backend.dtype(kvar) - 'float32' - >>> kvar = tf.keras.backend.variable(np.array([[1, 2], [3, 4]]), - ... dtype='float32') - >>> tf.keras.backend.dtype(kvar) - 'float32' + Returns: + String, dtype of `x`. + + Examples: + + >>> tf.keras.backend.dtype(tf.keras.backend.placeholder(shape=(2,4,5))) + 'float32' + >>> tf.keras.backend.dtype(tf.keras.backend.placeholder(shape=(2,4,5), + ... dtype='float32')) + 'float32' + >>> tf.keras.backend.dtype(tf.keras.backend.placeholder(shape=(2,4,5), + ... dtype='float64')) + 'float64' + >>> kvar = tf.keras.backend.variable(np.array([[1, 2], [3, 4]])) + >>> tf.keras.backend.dtype(kvar) + 'float32' + >>> kvar = tf.keras.backend.variable(np.array([[1, 2], [3, 4]]), + ... 
dtype='float32') + >>> tf.keras.backend.dtype(kvar) + 'float32' - """ - return x.dtype.base_dtype.name + """ + return x.dtype.base_dtype.name @doc_controls.do_not_generate_docs def dtype_numpy(x): - """Returns the numpy dtype of a Keras tensor or variable. + """Returns the numpy dtype of a Keras tensor or variable. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - numpy.dtype, dtype of `x`. - """ - return tf.as_dtype(x.dtype).as_numpy_dtype + Returns: + numpy.dtype, dtype of `x`. + """ + return tf.as_dtype(x.dtype).as_numpy_dtype -@keras_export('keras.backend.eval') +@keras_export("keras.backend.eval") @doc_controls.do_not_generate_docs def eval(x): - """Evaluates the value of a variable. + """Evaluates the value of a variable. - Args: - x: A variable. + Args: + x: A variable. - Returns: - A Numpy array. + Returns: + A Numpy array. - Examples: + Examples: - >>> kvar = tf.keras.backend.variable(np.array([[1, 2], [3, 4]]), - ... dtype='float32') - >>> tf.keras.backend.eval(kvar) - array([[1., 2.], - [3., 4.]], dtype=float32) + >>> kvar = tf.keras.backend.variable(np.array([[1, 2], [3, 4]]), + ... dtype='float32') + >>> tf.keras.backend.eval(kvar) + array([[1., 2.], + [3., 4.]], dtype=float32) - """ - return get_value(to_dense(x)) + """ + return get_value(to_dense(x)) -@keras_export('keras.backend.zeros') +@keras_export("keras.backend.zeros") @doc_controls.do_not_generate_docs def zeros(shape, dtype=None, name=None): - """Instantiates an all-zeros variable and returns it. - - Args: - shape: Tuple or list of integers, shape of returned Keras variable - dtype: data type of returned Keras variable - name: name of returned Keras variable - - Returns: - A variable (including Keras metadata), filled with `0.0`. - Note that if `shape` was symbolic, we cannot return a variable, - and will return a dynamically-shaped tensor instead. - - Example: - - >>> kvar = tf.keras.backend.zeros((3,4)) - >>> tf.keras.backend.eval(kvar) - array([[0., 0., 0., 0.], - [0., 0., 0., 0.], - [0., 0., 0., 0.]], dtype=float32) - >>> A = tf.constant([1,2,3]) - >>> kvar2 = tf.keras.backend.zeros(A.shape) # [0., 0., 0.] - >>> tf.keras.backend.eval(kvar2) - array([0., 0., 0.], dtype=float32) - >>> kvar3 = tf.keras.backend.zeros(A.shape,dtype=tf.int32) - >>> tf.keras.backend.eval(kvar3) - array([0, 0, 0], dtype=int32) - >>> kvar4 = tf.keras.backend.zeros([2,3]) - >>> tf.keras.backend.eval(kvar4) - array([[0., 0., 0.], - [0., 0., 0.]], dtype=float32) - - """ - with tf.init_scope(): - if dtype is None: - dtype = floatx() - tf_dtype = tf.as_dtype(dtype) - v = tf.zeros(shape=shape, dtype=tf_dtype, name=name) - if py_all(v.shape.as_list()): - return variable(v, dtype=dtype, name=name) - return v + """Instantiates an all-zeros variable and returns it. + Args: + shape: Tuple or list of integers, shape of returned Keras variable + dtype: data type of returned Keras variable + name: name of returned Keras variable + + Returns: + A variable (including Keras metadata), filled with `0.0`. + Note that if `shape` was symbolic, we cannot return a variable, + and will return a dynamically-shaped tensor instead. + + Example: + + >>> kvar = tf.keras.backend.zeros((3,4)) + >>> tf.keras.backend.eval(kvar) + array([[0., 0., 0., 0.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]], dtype=float32) + >>> A = tf.constant([1,2,3]) + >>> kvar2 = tf.keras.backend.zeros(A.shape) # [0., 0., 0.] 
+ >>> tf.keras.backend.eval(kvar2) + array([0., 0., 0.], dtype=float32) + >>> kvar3 = tf.keras.backend.zeros(A.shape,dtype=tf.int32) + >>> tf.keras.backend.eval(kvar3) + array([0, 0, 0], dtype=int32) + >>> kvar4 = tf.keras.backend.zeros([2,3]) + >>> tf.keras.backend.eval(kvar4) + array([[0., 0., 0.], + [0., 0., 0.]], dtype=float32) + + """ + with tf.init_scope(): + if dtype is None: + dtype = floatx() + tf_dtype = tf.as_dtype(dtype) + v = tf.zeros(shape=shape, dtype=tf_dtype, name=name) + if py_all(v.shape.as_list()): + return variable(v, dtype=dtype, name=name) + return v -@keras_export('keras.backend.ones') + +@keras_export("keras.backend.ones") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def ones(shape, dtype=None, name=None): - """Instantiates an all-ones variable and returns it. + """Instantiates an all-ones variable and returns it. - Args: - shape: Tuple of integers, shape of returned Keras variable. - dtype: String, data type of returned Keras variable. - name: String, name of returned Keras variable. + Args: + shape: Tuple of integers, shape of returned Keras variable. + dtype: String, data type of returned Keras variable. + name: String, name of returned Keras variable. - Returns: - A Keras variable, filled with `1.0`. - Note that if `shape` was symbolic, we cannot return a variable, - and will return a dynamically-shaped tensor instead. + Returns: + A Keras variable, filled with `1.0`. + Note that if `shape` was symbolic, we cannot return a variable, + and will return a dynamically-shaped tensor instead. - Example: + Example: - >>> kvar = tf.keras.backend.ones((3,4)) - >>> tf.keras.backend.eval(kvar) - array([[1., 1., 1., 1.], - [1., 1., 1., 1.], - [1., 1., 1., 1.]], dtype=float32) + >>> kvar = tf.keras.backend.ones((3,4)) + >>> tf.keras.backend.eval(kvar) + array([[1., 1., 1., 1.], + [1., 1., 1., 1.], + [1., 1., 1., 1.]], dtype=float32) - """ - with tf.init_scope(): - if dtype is None: - dtype = floatx() - tf_dtype = tf.as_dtype(dtype) - v = tf.ones(shape=shape, dtype=tf_dtype, name=name) - if py_all(v.shape.as_list()): - return variable(v, dtype=dtype, name=name) - return v + """ + with tf.init_scope(): + if dtype is None: + dtype = floatx() + tf_dtype = tf.as_dtype(dtype) + v = tf.ones(shape=shape, dtype=tf_dtype, name=name) + if py_all(v.shape.as_list()): + return variable(v, dtype=dtype, name=name) + return v -@keras_export('keras.backend.eye') +@keras_export("keras.backend.eye") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def eye(size, dtype=None, name=None): - """Instantiate an identity matrix and returns it. + """Instantiate an identity matrix and returns it. - Args: - size: Integer, number of rows/columns. - dtype: String, data type of returned Keras variable. - name: String, name of returned Keras variable. + Args: + size: Integer, number of rows/columns. + dtype: String, data type of returned Keras variable. + name: String, name of returned Keras variable. - Returns: - A Keras variable, an identity matrix. + Returns: + A Keras variable, an identity matrix. 
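Worth noting while reviewing `zeros`/`ones`: the `py_all(v.shape.as_list())` guard means a fully static shape yields a `tf.Variable`, while any unknown dimension falls back to a plain tensor. A small sketch of the static case (values match the docstrings):

```python
import tensorflow as tf

K = tf.keras.backend

kvar = K.zeros((2, 3))                # every dim known -> tf.Variable
print(isinstance(kvar, tf.Variable))  # True
print(K.dtype(kvar))                  # 'float32', i.e. floatx() by default

kvar2 = K.ones(tf.constant([1, 2, 3]).shape, dtype="int32")
print(K.eval(kvar2))                  # [1 1 1] as a NumPy array
```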
- Example: + Example: - >>> kvar = tf.keras.backend.eye(3) - >>> tf.keras.backend.eval(kvar) - array([[1., 0., 0.], - [0., 1., 0.], - [0., 0., 1.]], dtype=float32) + >>> kvar = tf.keras.backend.eye(3) + >>> tf.keras.backend.eval(kvar) + array([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]], dtype=float32) - """ - if dtype is None: - dtype = floatx() - tf_dtype = tf.as_dtype(dtype) - return variable(tf.eye(size, dtype=tf_dtype), dtype, name) + """ + if dtype is None: + dtype = floatx() + tf_dtype = tf.as_dtype(dtype) + return variable(tf.eye(size, dtype=tf_dtype), dtype, name) -@keras_export('keras.backend.zeros_like') +@keras_export("keras.backend.zeros_like") @doc_controls.do_not_generate_docs def zeros_like(x, dtype=None, name=None): - """Instantiates an all-zeros variable of the same shape as another tensor. + """Instantiates an all-zeros variable of the same shape as another tensor. - Args: - x: Keras variable or Keras tensor. - dtype: dtype of returned Keras variable. - `None` uses the dtype of `x`. - name: name for the variable to create. + Args: + x: Keras variable or Keras tensor. + dtype: dtype of returned Keras variable. + `None` uses the dtype of `x`. + name: name for the variable to create. - Returns: - A Keras variable with the shape of `x` filled with zeros. + Returns: + A Keras variable with the shape of `x` filled with zeros. - Example: + Example: - ```python - kvar = tf.keras.backend.variable(np.random.random((2,3))) - kvar_zeros = tf.keras.backend.zeros_like(kvar) - K.eval(kvar_zeros) - # array([[ 0., 0., 0.], [ 0., 0., 0.]], dtype=float32) - ``` - """ - return tf.zeros_like(x, dtype=dtype, name=name) + ```python + kvar = tf.keras.backend.variable(np.random.random((2,3))) + kvar_zeros = tf.keras.backend.zeros_like(kvar) + K.eval(kvar_zeros) + # array([[ 0., 0., 0.], [ 0., 0., 0.]], dtype=float32) + ``` + """ + return tf.zeros_like(x, dtype=dtype, name=name) -@keras_export('keras.backend.ones_like') +@keras_export("keras.backend.ones_like") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def ones_like(x, dtype=None, name=None): - """Instantiates an all-ones variable of the same shape as another tensor. + """Instantiates an all-ones variable of the same shape as another tensor. - Args: - x: Keras variable or tensor. - dtype: String, dtype of returned Keras variable. - None uses the dtype of x. - name: String, name for the variable to create. + Args: + x: Keras variable or tensor. + dtype: String, dtype of returned Keras variable. + None uses the dtype of x. + name: String, name for the variable to create. - Returns: - A Keras variable with the shape of x filled with ones. + Returns: + A Keras variable with the shape of x filled with ones. - Example: + Example: - >>> kvar = tf.keras.backend.variable(np.random.random((2,3))) - >>> kvar_ones = tf.keras.backend.ones_like(kvar) - >>> tf.keras.backend.eval(kvar_ones) - array([[1., 1., 1.], - [1., 1., 1.]], dtype=float32) + >>> kvar = tf.keras.backend.variable(np.random.random((2,3))) + >>> kvar_ones = tf.keras.backend.ones_like(kvar) + >>> tf.keras.backend.eval(kvar_ones) + array([[1., 1., 1.], + [1., 1., 1.]], dtype=float32) - """ - return tf.ones_like(x, dtype=dtype, name=name) + """ + return tf.ones_like(x, dtype=dtype, name=name) def identity(x, name=None): - """Returns a tensor with the same content as the input tensor. + """Returns a tensor with the same content as the input tensor. - Args: - x: The input tensor. - name: String, name for the variable to create. + Args: + x: The input tensor. 
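Similarly for `eye`, `zeros_like`, and `ones_like`: the `*_like` helpers inherit shape and dtype from their argument unless overridden. A minimal sketch:

```python
import tensorflow as tf

K = tf.keras.backend

x = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.int32)
print(K.eval(K.zeros_like(x)))                  # int32 zeros, shape (2, 3)
print(K.eval(K.ones_like(x, dtype="float32")))  # dtype can be overridden
print(K.eval(K.eye(2)))                         # [[1., 0.], [0., 1.]]
```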
+ name: String, name for the variable to create. - Returns: - A tensor of the same shape, type and content. - """ - return tf.identity(x, name=name) + Returns: + A tensor of the same shape, type and content. + """ + return tf.identity(x, name=name) # Global flag to enforce tf.random.Generator for RandomGenerator. @@ -1762,2244 +1824,2342 @@ def identity(x, name=None): # way, so that each client of the program could start with same seed. This is # very important for certain use case that requires all the client to have their # state in sync. This instance will be set when user call -# `tf.keras.util.set_random_seed()` +# `tf.keras.utils.set_random_seed()` _SEED_GENERATOR = threading.local() -@keras_export('keras.backend.experimental.is_tf_random_generator_enabled', - v1=[]) +@keras_export( + "keras.backend.experimental.is_tf_random_generator_enabled", v1=[] +) def is_tf_random_generator_enabled(): - """Check whether `tf.random.Generator` is used for RNG in Keras. - - Compared to existing TF stateful random ops, `tf.random.Generator` uses - `tf.Variable` and stateless random ops to generate random numbers, - which leads to better reproducibility in distributed training. - Note enabling it might introduce some breakage to existing code, - by producing differently-seeded random number sequences - and breaking tests that rely on specific random numbers being generated. - To disable the - usage of `tf.random.Generator`, please use - `tf.keras.backend.experimental.disable_random_generator`. - - We expect the `tf.random.Generator` code path to become the default, and will - remove the legacy stateful random ops such as `tf.random.uniform` in the - future (see the - [TF RNG guide](https://www.tensorflow.org/guide/random_numbers)). - - This API will also be removed in a future release as well, together with - `tf.keras.backend.experimental.enable_tf_random_generator()` and - `tf.keras.backend.experimental.disable_tf_random_generator()` - - Returns: - boolean: whether `tf.random.Generator` is used for random number generation - in Keras. - """ - return _USE_GENERATOR_FOR_RNG - - -@keras_export('keras.backend.experimental.enable_tf_random_generator', v1=[]) + """Check whether `tf.random.Generator` is used for RNG in Keras. + + Compared to existing TF stateful random ops, `tf.random.Generator` uses + `tf.Variable` and stateless random ops to generate random numbers, + which leads to better reproducibility in distributed training. + Note enabling it might introduce some breakage to existing code, + by producing differently-seeded random number sequences + and breaking tests that rely on specific random numbers being generated. + To disable the + usage of `tf.random.Generator`, please use + `tf.keras.backend.experimental.disable_random_generator`. + + We expect the `tf.random.Generator` code path to become the default, and + will remove the legacy stateful random ops such as `tf.random.uniform` in + the future (see the [TF RNG guide]( + https://www.tensorflow.org/guide/random_numbers)). + + This API will also be removed in a future release as well, together with + `tf.keras.backend.experimental.enable_tf_random_generator()` and + `tf.keras.backend.experimental.disable_tf_random_generator()` + + Returns: + boolean: whether `tf.random.Generator` is used for random number + generation in Keras. + """ + return _USE_GENERATOR_FOR_RNG + + +@keras_export("keras.backend.experimental.enable_tf_random_generator", v1=[]) def enable_tf_random_generator(): - """Enable the `tf.random.Generator` as the RNG for Keras. 
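Since the following hunks keep referring to this global switch, a short sketch of the flag round-trip (whether the generator path is on by default depends on the release, hence the initial print rather than an assertion):

```python
import tensorflow as tf

K = tf.keras.backend

print(K.experimental.is_tf_random_generator_enabled())  # release-dependent
K.experimental.enable_tf_random_generator()
assert K.experimental.is_tf_random_generator_enabled()
K.experimental.disable_tf_random_generator()  # back to the legacy stateful ops
assert not K.experimental.is_tf_random_generator_enabled()
```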
+ """Enable the `tf.random.Generator` as the RNG for Keras. - See `tf.keras.backend.experimental.is_tf_random_generator_enabled` for more - details. - """ + See `tf.keras.backend.experimental.is_tf_random_generator_enabled` for more + details. + """ - global _USE_GENERATOR_FOR_RNG - _USE_GENERATOR_FOR_RNG = True + global _USE_GENERATOR_FOR_RNG + _USE_GENERATOR_FOR_RNG = True -@keras_export('keras.backend.experimental.disable_tf_random_generator', v1=[]) +@keras_export("keras.backend.experimental.disable_tf_random_generator", v1=[]) def disable_tf_random_generator(): - """Disable the `tf.random.Generator` as the RNG for Keras. + """Disable the `tf.random.Generator` as the RNG for Keras. - See `tf.keras.backend.experimental.is_tf_random_generator_enabled` for more - details. - """ - global _USE_GENERATOR_FOR_RNG - _USE_GENERATOR_FOR_RNG = False + See `tf.keras.backend.experimental.is_tf_random_generator_enabled` for more + details. + """ + global _USE_GENERATOR_FOR_RNG + _USE_GENERATOR_FOR_RNG = False class RandomGenerator(tf.__internal__.tracking.AutoTrackable): - """Random generator that selects appropriate random ops. - - This class contains the logic for legacy stateful random ops, as well as the - new stateless random ops with seeds and tf.random.Generator. Any class that - relies on RNG (eg initializer, shuffle, dropout) should use this class to - handle the transition from legacy RNGs to new RNGs. - - Args: - seed: Optional int seed. When `rng_type` is "stateful", the seed is used - to create `tf.random.Generator` to produce deterministic sequences. - When `rng_type` is "stateless", new seed will be created if it is not - provided by user, and it will be passed down to stateless random ops. - When `rng_type` is "legacy_stateful", the seed will be passed down to - stateful random ops. - rng_type: Type of RNG to use, one of "stateful", "stateless", - "legacy_stateful". It defaults to "stateful" if - `enable_tf_random_generator` has been activated, or to - "legacy_stateful" otherwise. - - When using "stateless", the random ops outputs are constant (the same - inputs result in the same outputs). - - When using "stateful" or "legacy_stateful", the random ops outputs are - non-constant, but deterministic: calling the same random op multiple - times with the same inputs results in a deterministic sequence of - different outputs. - - "legacy_stateful" is backed by TF1 stateful RNG ops - (e.g. `tf.random.uniform`), while "stateful" - is backed by TF2 APIs (e.g. `tf.random.Generator.uniform`). - """ - RNG_STATELESS = 'stateless' - RNG_STATEFUL = 'stateful' - RNG_LEGACY_STATEFUL = 'legacy_stateful' - - def __init__(self, seed=None, rng_type=None, **kwargs): - self._seed = seed - self._set_rng_type(rng_type, **kwargs) - self._built = False - - def _set_rng_type(self, rng_type, **kwargs): - # Only supported kwargs is "force_generator", which we will remove once we - # clean up all the caller. - # TODO(scottzhu): Remove the kwargs for force_generator. - if kwargs.get('force_generator', False): - rng_type = self.RNG_STATEFUL - if rng_type is None: - if is_tf_random_generator_enabled(): - self._rng_type = self.RNG_STATEFUL - else: - self._rng_type = self.RNG_LEGACY_STATEFUL - else: - if rng_type not in [self.RNG_STATEFUL, - self.RNG_LEGACY_STATEFUL, self.RNG_STATELESS]: - raise ValueError( - 'Invalid `rng_type` received. ' - 'Valid `rng_type` are ["stateless", "stateful", "legacy_stateful"].' 
- f' Got: {rng_type}') - self._rng_type = rng_type - - def _maybe_init(self): - """Lazily init the RandomGenerator. - - The TF API executing_eagerly_outside_functions() has some side effect, and - couldn't be used before API like tf.enable_eager_execution(). Some of the - client side code was creating the initializer at the code load time, which - triggers the creation of RandomGenerator. Lazy init this class to walkaround - this issue until it is resolved on TF side. - """ - # TODO(b/167482354): Change this back to normal init when the bug is fixed. - if self._built: - return - - if (self._rng_type == self.RNG_STATEFUL and - not tf.compat.v1.executing_eagerly_outside_functions()): - # Fall back to legacy stateful since the generator need to work in tf2. - self._rng_type = self.RNG_LEGACY_STATEFUL - - if self._rng_type == self.RNG_STATELESS: - self._seed = self._create_seed(self._seed) - self._generator = None - elif self._rng_type == self.RNG_STATEFUL: - from keras.utils import tf_utils # pylint: disable=g-import-not-at-top - with tf_utils.maybe_init_scope(self): - seed = self._create_seed(self._seed) - self._generator = tf.random.Generator.from_seed(seed) - else: - # In legacy stateful, we use stateful op, regardless whether user provide - # seed or not. Seeded stateful op will ensure generating same sequences. - self._generator = None - self._built = True + """Random generator that selects appropriate random ops. + + This class contains the logic for legacy stateful random ops, as well as the + new stateless random ops with seeds and tf.random.Generator. Any class that + relies on RNG (eg initializer, shuffle, dropout) should use this class to + handle the transition from legacy RNGs to new RNGs. + + Args: + seed: Optional int seed. When `rng_type` is "stateful", the seed is used + to create `tf.random.Generator` to produce deterministic sequences. + When `rng_type` is "stateless", new seed will be created if it is not + provided by user, and it will be passed down to stateless random ops. + When `rng_type` is "legacy_stateful", the seed will be passed down to + stateful random ops. + rng_type: Type of RNG to use, one of "stateful", "stateless", + "legacy_stateful". When `None` it uses "stateful" if + `enable_tf_random_generator` has been activated, or + "legacy_stateful" otherwise. + - When using "stateless", the random ops outputs are constant (the same + inputs result in the same outputs). + - When using "stateful" or "legacy_stateful", the random ops outputs are + non-constant, but deterministic: calling the same random op multiple + times with the same inputs results in a deterministic sequence of + different outputs. + - "legacy_stateful" is backed by TF1 stateful RNG ops + (e.g. `tf.random.uniform`), while "stateful" + is backed by TF2 APIs (e.g. `tf.random.Generator.uniform`). + Defaults to `None`. + """ + + RNG_STATELESS = "stateless" + RNG_STATEFUL = "stateful" + RNG_LEGACY_STATEFUL = "legacy_stateful" + + def __init__(self, seed=None, rng_type=None, **kwargs): + self._seed = seed + self._set_rng_type(rng_type, **kwargs) + self._built = False + + def _set_rng_type(self, rng_type, **kwargs): + # Only supported kwargs is "force_generator", which we will remove once + # we clean up all the caller. + # TODO(scottzhu): Remove the kwargs for force_generator. 
+ if kwargs.get("force_generator", False): + rng_type = self.RNG_STATEFUL + if rng_type is None: + if is_tf_random_generator_enabled(): + self._rng_type = self.RNG_STATEFUL + else: + self._rng_type = self.RNG_LEGACY_STATEFUL + else: + if rng_type not in [ + self.RNG_STATEFUL, + self.RNG_LEGACY_STATEFUL, + self.RNG_STATELESS, + ]: + raise ValueError( + "Invalid `rng_type` received. " + 'Valid `rng_type` are ["stateless", ' + '"stateful", "legacy_stateful"].' + f" Got: {rng_type}" + ) + self._rng_type = rng_type + + def _maybe_init(self): + """Lazily init the RandomGenerator. + + The TF API executing_eagerly_outside_functions() has some side effect, + and couldn't be used before API like tf.enable_eager_execution(). Some + of the client side code was creating the initializer at the code load + time, which triggers the creation of RandomGenerator. Lazy init this + class to walkaround this issue until it is resolved on TF side. + """ + # TODO(b/167482354): Change this back to normal init when the bug is + # fixed. + if self._built: + return + + if ( + self._rng_type == self.RNG_STATEFUL + and not tf.compat.v1.executing_eagerly_outside_functions() + ): + # Fall back to legacy stateful since the generator need to work in + # tf2. + self._rng_type = self.RNG_LEGACY_STATEFUL + + if self._rng_type == self.RNG_STATELESS: + self._seed = self._create_seed(self._seed) + self._generator = None + elif self._rng_type == self.RNG_STATEFUL: + with tf_utils.maybe_init_scope(self): + seed = self._create_seed(self._seed) + self._generator = tf.random.Generator.from_seed( + seed, alg=tf.random.Algorithm.AUTO_SELECT + ) + else: + # In legacy stateful, we use stateful op, regardless whether user + # provide seed or not. Seeded stateful op will ensure generating + # same sequences. + self._generator = None + self._built = True + + def make_seed_for_stateless_op(self): + """Generate a new seed based on the init config. + + Note that this will not return python ints which will be frozen in the + graph and cause stateless op to return the same value. It will only + return value when generator is used, otherwise it will return None. + + Returns: + A tensor with shape [2,]. + """ + self._maybe_init() + if self._rng_type == self.RNG_STATELESS: + return [self._seed, 0] + elif self._rng_type == self.RNG_STATEFUL: + return self._generator.make_seeds()[:, 0] + return None + + def make_legacy_seed(self): + """Create a new seed for the legacy stateful ops to use. + + When user didn't provide any original seed, this method will return + None. Otherwise it will increment the counter and return as the new + seed. + + Note that it is important to generate different seed for stateful ops in + the `tf.function`. The random ops will return same value when same seed + is provided in the `tf.function`. + + Returns: + int as new seed, or None. + """ + if self._seed is not None: + result = self._seed + self._seed += 1 + return result + return None + + def _create_seed(self, user_specified_seed): + if user_specified_seed is not None: + return user_specified_seed + elif getattr(_SEED_GENERATOR, "generator", None): + return _SEED_GENERATOR.generator.randint(1, 1e9) + else: + return random.randint(1, int(1e9)) + + def random_normal( + self, shape, mean=0.0, stddev=1.0, dtype=None, nonce=None + ): + """Produce random number based on the normal distribution. + + Args: + shape: The shape of the random values to generate. + mean: Floats, default to 0. Mean of the random values to generate. + stddev: Floats, default to 1. 
Standard deviation of the random values + to generate. + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise (via + `tf.keras.backend.set_floatx(float_dtype)`) + nonce: Optional integer scalar, that will be folded into the seed in + the stateless mode. + """ + self._maybe_init() + dtype = dtype or floatx() + if self._rng_type == self.RNG_STATEFUL: + return self._generator.normal( + shape=shape, mean=mean, stddev=stddev, dtype=dtype + ) + elif self._rng_type == self.RNG_STATELESS: + seed = self.make_seed_for_stateless_op() + if nonce: + seed = tf.random.experimental.stateless_fold_in(seed, nonce) + return tf.random.stateless_normal( + shape=shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed + ) + return tf.random.normal( + shape=shape, + mean=mean, + stddev=stddev, + dtype=dtype, + seed=self.make_legacy_seed(), + ) + + def random_uniform( + self, shape, minval=0.0, maxval=None, dtype=None, nonce=None + ): + """Produce random number based on the uniform distribution. + + Args: + shape: The shape of the random values to generate. + minval: Floats, default to 0. Lower bound of the range of + random values to generate (inclusive). + minval: Floats, default to None. Upper bound of the range of + random values to generate (exclusive). + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise (via + `tf.keras.backend.set_floatx(float_dtype)`) + nonce: Optional integer scalar, that will be folded into the seed in + the stateless mode. + """ + self._maybe_init() + dtype = dtype or floatx() + if self._rng_type == self.RNG_STATEFUL: + return self._generator.uniform( + shape=shape, minval=minval, maxval=maxval, dtype=dtype + ) + elif self._rng_type == self.RNG_STATELESS: + seed = self.make_seed_for_stateless_op() + if nonce: + seed = tf.random.experimental.stateless_fold_in(seed, nonce) + return tf.random.stateless_uniform( + shape=shape, + minval=minval, + maxval=maxval, + dtype=dtype, + seed=seed, + ) + return tf.random.uniform( + shape=shape, + minval=minval, + maxval=maxval, + dtype=dtype, + seed=self.make_legacy_seed(), + ) + + def truncated_normal( + self, shape, mean=0.0, stddev=1.0, dtype=None, nonce=None + ): + """Produce random number based on the truncated normal distribution. + + Args: + shape: The shape of the random values to generate. + mean: Floats, default to 0. Mean of the random values to generate. + stddev: Floats, default to 1. Standard deviation of the random values + to generate. + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise (via + `tf.keras.backend.set_floatx(float_dtype)`) + nonce: Optional integer scalar, that will be folded into the seed in + the stateless mode. 
+ """ + self._maybe_init() + dtype = dtype or floatx() + if self._rng_type == self.RNG_STATEFUL: + return self._generator.truncated_normal( + shape=shape, mean=mean, stddev=stddev, dtype=dtype + ) + elif self._rng_type == self.RNG_STATELESS: + seed = self.make_seed_for_stateless_op() + if nonce: + seed = tf.random.experimental.stateless_fold_in(seed, nonce) + return tf.random.stateless_truncated_normal( + shape=shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed + ) + return tf.random.truncated_normal( + shape=shape, + mean=mean, + stddev=stddev, + dtype=dtype, + seed=self.make_legacy_seed(), + ) + + def dropout(self, inputs, rate, noise_shape=None): + self._maybe_init() + if self._rng_type == self.RNG_STATEFUL: + return tf.nn.experimental.general_dropout( + inputs, + rate=rate, + noise_shape=noise_shape, + uniform_sampler=self._generator.uniform, + ) + elif self._rng_type == self.RNG_STATELESS: + return tf.nn.experimental.stateless_dropout( + inputs, + rate=rate, + noise_shape=noise_shape, + seed=self.make_seed_for_stateless_op(), + ) + else: + return tf.nn.dropout( + inputs, + rate=rate, + noise_shape=noise_shape, + seed=self.make_legacy_seed(), + ) + - def make_seed_for_stateless_op(self): - """Generate a new seed based on the init config. +@keras_export("keras.backend.random_uniform_variable") +@doc_controls.do_not_generate_docs +def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None): + """Instantiates a variable with values drawn from a uniform distribution. - Note that this will not return python ints which will be frozen in the graph - and cause stateless op to return the same value. It will only return value - when generator is used, otherwise it will return None. + Args: + shape: Tuple of integers, shape of returned Keras variable. + low: Float, lower boundary of the output interval. + high: Float, upper boundary of the output interval. + dtype: String, dtype of returned Keras variable. + name: String, name of returned Keras variable. + seed: Integer, random seed. Returns: - A tensor with shape [2,]. + A Keras variable, filled with drawn samples. + + Example: + + >>> kvar = tf.keras.backend.random_uniform_variable(shape=(2,3), + ... low=0.0, high=1.0) + >>> kvar + """ - self._maybe_init() - if self._rng_type == self.RNG_STATELESS: - return [self._seed, 0] - elif self._rng_type == self.RNG_STATEFUL: - return self._generator.make_seeds()[:, 0] - return None + if dtype is None: + dtype = floatx() + tf_dtype = tf.as_dtype(dtype) + if seed is None: + # ensure that randomness is conditioned by the Numpy RNG + seed = np.random.randint(10e8) + value = tf.compat.v1.random_uniform_initializer( + low, high, dtype=tf_dtype, seed=seed + )(shape) + return variable(value, dtype=dtype, name=name) - def make_legacy_seed(self): - """Create a new seed for the legacy stateful ops to use. - When user didn't provide any original seed, this method will return None. - Otherwise it will increment the counter and return as the new seed. +@keras_export("keras.backend.random_normal_variable") +@doc_controls.do_not_generate_docs +def random_normal_variable( + shape, mean, scale, dtype=None, name=None, seed=None +): + """Instantiates a variable with values drawn from a normal distribution. - Note that it is important to generate different seed for stateful ops in - the `tf.function`. The random ops will return same value when same seed is - provided in the `tf.function`. + Args: + shape: Tuple of integers, shape of returned Keras variable. 
+ mean: Float, mean of the normal distribution. + scale: Float, standard deviation of the normal distribution. + dtype: String, dtype of returned Keras variable. + name: String, name of returned Keras variable. + seed: Integer, random seed. Returns: - int as new seed, or None. - """ - if self._seed is not None: - result = self._seed - self._seed += 1 - return result - return None + A Keras variable, filled with drawn samples. - def _create_seed(self, user_specified_seed): - if user_specified_seed is not None: - return user_specified_seed - elif getattr(_SEED_GENERATOR, 'generator', None): - return _SEED_GENERATOR.generator.randint(1, 1e9) - else: - return random.randint(1, 1e9) - - def random_normal(self, shape, mean=0., stddev=1., dtype=None, nonce=None): - """Produce random number based on the normal distribution. - - Args: - shape: The shape of the random values to generate. - mean: Floats, default to 0. Mean of the random values to generate. - stddev: Floats, default to 1. Standard deviation of the random values to - generate. - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, which - default to `float32` unless you configured it otherwise (via - `tf.keras.backend.set_floatx(float_dtype)`) - nonce: Optional integer scalar, that will be folded into the seed in the - stateless mode. - """ - self._maybe_init() - dtype = dtype or floatx() - if self._rng_type == self.RNG_STATEFUL: - return self._generator.normal( - shape=shape, mean=mean, stddev=stddev, dtype=dtype) - elif self._rng_type == self.RNG_STATELESS: - seed = self.make_seed_for_stateless_op() - if nonce: - seed = tf.random.experimental.stateless_fold_in(seed, nonce) - return tf.random.stateless_normal( - shape=shape, mean=mean, stddev=stddev, dtype=dtype, - seed=seed) - return tf.random.normal( - shape=shape, mean=mean, stddev=stddev, dtype=dtype, - seed=self.make_legacy_seed()) - - def random_uniform(self, shape, minval=0., maxval=None, dtype=None, - nonce=None): - """Produce random number based on the uniform distribution. - - Args: - shape: The shape of the random values to generate. - minval: Floats, default to 0. Lower bound of the range of - random values to generate (inclusive). - minval: Floats, default to None. Upper bound of the range of - random values to generate (exclusive). - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, which - default to `float32` unless you configured it otherwise (via - `tf.keras.backend.set_floatx(float_dtype)`) - nonce: Optional integer scalar, that will be folded into the seed in the - stateless mode. - """ - self._maybe_init() - dtype = dtype or floatx() - if self._rng_type == self.RNG_STATEFUL: - return self._generator.uniform( - shape=shape, minval=minval, maxval=maxval, dtype=dtype) - elif self._rng_type == self.RNG_STATELESS: - seed = self.make_seed_for_stateless_op() - if nonce: - seed = tf.random.experimental.stateless_fold_in(seed, nonce) - return tf.random.stateless_uniform( - shape=shape, minval=minval, maxval=maxval, dtype=dtype, - seed=seed) - return tf.random.uniform( - shape=shape, minval=minval, maxval=maxval, dtype=dtype, - seed=self.make_legacy_seed()) - - def truncated_normal(self, shape, mean=0., stddev=1., dtype=None, nonce=None): - """Produce random number based on the truncated normal distribution. - - Args: - shape: The shape of the random values to generate. - mean: Floats, default to 0. 
Mean of the random values to generate. - stddev: Floats, default to 1. Standard deviation of the random values to - generate. - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, which - default to `float32` unless you configured it otherwise (via - `tf.keras.backend.set_floatx(float_dtype)`) - nonce: Optional integer scalar, that will be folded into the seed in the - stateless mode. - """ - self._maybe_init() - dtype = dtype or floatx() - if self._rng_type == self.RNG_STATEFUL: - return self._generator.truncated_normal( - shape=shape, mean=mean, stddev=stddev, dtype=dtype) - elif self._rng_type == self.RNG_STATELESS: - seed = self.make_seed_for_stateless_op() - if nonce: - seed = tf.random.experimental.stateless_fold_in(seed, nonce) - return tf.random.stateless_truncated_normal( - shape=shape, mean=mean, stddev=stddev, dtype=dtype, - seed=seed) - return tf.random.truncated_normal( - shape=shape, mean=mean, stddev=stddev, dtype=dtype, - seed=self.make_legacy_seed()) + Example: - def dropout(self, inputs, rate, noise_shape=None): - self._maybe_init() - if self._rng_type in [self.RNG_STATEFUL, self.RNG_STATELESS]: - return tf.nn.experimental.stateless_dropout( - inputs, rate=rate, noise_shape=noise_shape, - seed=self.make_seed_for_stateless_op()) - return tf.nn.dropout(inputs, rate=rate, noise_shape=noise_shape, - seed=self.make_legacy_seed()) + >>> kvar = tf.keras.backend.random_normal_variable(shape=(2,3), + ... mean=0.0, scale=1.0) + >>> kvar + + """ + if dtype is None: + dtype = floatx() + tf_dtype = tf.as_dtype(dtype) + if seed is None: + # ensure that randomness is conditioned by the Numpy RNG + seed = np.random.randint(10e8) + value = tf.compat.v1.random_normal_initializer( + mean, scale, dtype=tf_dtype, seed=seed + )(shape) + return variable(value, dtype=dtype, name=name) -@keras_export('keras.backend.random_uniform_variable') -@doc_controls.do_not_generate_docs -def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None): - """Instantiates a variable with values drawn from a uniform distribution. - - Args: - shape: Tuple of integers, shape of returned Keras variable. - low: Float, lower boundary of the output interval. - high: Float, upper boundary of the output interval. - dtype: String, dtype of returned Keras variable. - name: String, name of returned Keras variable. - seed: Integer, random seed. - - Returns: - A Keras variable, filled with drawn samples. - - Example: - - >>> kvar = tf.keras.backend.random_uniform_variable(shape=(2,3), - ... low=0.0, high=1.0) - >>> kvar - - """ - if dtype is None: - dtype = floatx() - tf_dtype = tf.as_dtype(dtype) - if seed is None: - # ensure that randomness is conditioned by the Numpy RNG - seed = np.random.randint(10e8) - value = tf.compat.v1.random_uniform_initializer( - low, high, dtype=tf_dtype, seed=seed)(shape) - return variable(value, dtype=dtype, name=name) - - -@keras_export('keras.backend.random_normal_variable') -@doc_controls.do_not_generate_docs -def random_normal_variable(shape, mean, scale, dtype=None, name=None, - seed=None): - """Instantiates a variable with values drawn from a normal distribution. - - Args: - shape: Tuple of integers, shape of returned Keras variable. - mean: Float, mean of the normal distribution. - scale: Float, standard deviation of the normal distribution. - dtype: String, dtype of returned Keras variable. - name: String, name of returned Keras variable. - seed: Integer, random seed. 
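Both variable factories in this hunk follow the same pattern: seed a `tf.compat.v1` initializer (conditioning on NumPy's RNG when no seed is given) and wrap the result in a Keras variable. A short usage sketch:

```python
import tensorflow as tf

K = tf.keras.backend

u = K.random_uniform_variable(shape=(2, 3), low=0.0, high=1.0, seed=7)
n = K.random_normal_variable(shape=(2, 3), mean=0.0, scale=1.0, seed=7)
print(K.eval(u).shape, K.eval(n).shape)  # (2, 3) (2, 3)
```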
- - Returns: - A Keras variable, filled with drawn samples. - - Example: - - >>> kvar = tf.keras.backend.random_normal_variable(shape=(2,3), - ... mean=0.0, scale=1.0) - >>> kvar - - """ - if dtype is None: - dtype = floatx() - tf_dtype = tf.as_dtype(dtype) - if seed is None: - # ensure that randomness is conditioned by the Numpy RNG - seed = np.random.randint(10e8) - value = tf.compat.v1.random_normal_initializer( - mean, scale, dtype=tf_dtype, seed=seed)(shape) - return variable(value, dtype=dtype, name=name) - - -@keras_export('keras.backend.count_params') +@keras_export("keras.backend.count_params") @doc_controls.do_not_generate_docs def count_params(x): - """Returns the static number of elements in a variable or tensor. + """Returns the static number of elements in a variable or tensor. - Args: - x: Variable or tensor. + Args: + x: Variable or tensor. - Returns: - Integer, the number of scalars in `x`. + Returns: + Integer, the number of scalars in `x`. - Example: + Example: - >>> kvar = tf.keras.backend.zeros((2,3)) - >>> tf.keras.backend.count_params(kvar) - 6 - >>> tf.keras.backend.eval(kvar) - array([[0., 0., 0.], - [0., 0., 0.]], dtype=float32) + >>> kvar = tf.keras.backend.zeros((2,3)) + >>> tf.keras.backend.count_params(kvar) + 6 + >>> tf.keras.backend.eval(kvar) + array([[0., 0., 0.], + [0., 0., 0.]], dtype=float32) - """ - return np.prod(x.shape.as_list()) + """ + return np.prod(x.shape.as_list()) -@keras_export('keras.backend.cast') +@keras_export("keras.backend.cast") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def cast(x, dtype): - """Casts a tensor to a different dtype and returns it. + """Casts a tensor to a different dtype and returns it. - You can cast a Keras variable but it still returns a Keras tensor. + You can cast a Keras variable but it still returns a Keras tensor. - Args: - x: Keras tensor (or variable). - dtype: String, either (`'float16'`, `'float32'`, or `'float64'`). + Args: + x: Keras tensor (or variable). + dtype: String, either (`'float16'`, `'float32'`, or `'float64'`). - Returns: - Keras tensor with dtype `dtype`. + Returns: + Keras tensor with dtype `dtype`. - Examples: - Cast a float32 variable to a float64 tensor + Examples: + Cast a float32 variable to a float64 tensor - >>> input = tf.keras.backend.ones(shape=(1,3)) - >>> print(input) - - >>> cast_input = tf.keras.backend.cast(input, dtype='float64') - >>> print(cast_input) - tf.Tensor([[1. 1. 1.]], shape=(1, 3), dtype=float64) + >>> input = tf.keras.backend.ones(shape=(1,3)) + >>> print(input) + + >>> cast_input = tf.keras.backend.cast(input, dtype='float64') + >>> print(cast_input) + tf.Tensor([[1. 1. 1.]], shape=(1, 3), dtype=float64) - """ - return tf.cast(x, dtype) + """ + return tf.cast(x, dtype) # UPDATES OPS -@keras_export('keras.backend.update') +@keras_export("keras.backend.update") @doc_controls.do_not_generate_docs def update(x, new_x): - return tf.compat.v1.assign(x, new_x) + return tf.compat.v1.assign(x, new_x) -@keras_export('keras.backend.update_add') +@keras_export("keras.backend.update_add") @doc_controls.do_not_generate_docs def update_add(x, increment): - """Update the value of `x` by adding `increment`. + """Update the value of `x` by adding `increment`. - Args: - x: A Variable. - increment: A tensor of same shape as `x`. + Args: + x: A Variable. + increment: A tensor of same shape as `x`. - Returns: - The variable `x` updated. - """ - return tf.compat.v1.assign_add(x, increment) + Returns: + The variable `x` updated. 
+ """ + return tf.compat.v1.assign_add(x, increment) -@keras_export('keras.backend.update_sub') +@keras_export("keras.backend.update_sub") @doc_controls.do_not_generate_docs def update_sub(x, decrement): - """Update the value of `x` by subtracting `decrement`. + """Update the value of `x` by subtracting `decrement`. - Args: - x: A Variable. - decrement: A tensor of same shape as `x`. + Args: + x: A Variable. + decrement: A tensor of same shape as `x`. - Returns: - The variable `x` updated. - """ - return tf.compat.v1.assign_sub(x, decrement) + Returns: + The variable `x` updated. + """ + return tf.compat.v1.assign_sub(x, decrement) -@keras_export('keras.backend.moving_average_update') +@keras_export("keras.backend.moving_average_update") @doc_controls.do_not_generate_docs def moving_average_update(x, value, momentum): - """Compute the exponential moving average of a value. + """Compute the exponential moving average of a value. - The moving average 'x' is updated with 'value' following: + The moving average 'x' is updated with 'value' following: - ``` - x = x * momentum + value * (1 - momentum) - ``` + ``` + x = x * momentum + value * (1 - momentum) + ``` - For example: + For example: - >>> x = tf.Variable(0.0) - >>> momentum=0.9 - >>> moving_average_update(x, value = 2.0, momentum=momentum).numpy() - >>> x.numpy() - 0.2 + >>> x = tf.Variable(0.0) + >>> momentum=0.9 + >>> moving_average_update(x, value = 2.0, momentum=momentum).numpy() + >>> x.numpy() + 0.2 - The result will be biased towards the initial value of the variable. + The result will be biased towards the initial value of the variable. - If the variable was initialized to zero, you can divide by - `1 - momentum ** num_updates` to debias it (Section 3 of - [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)): + If the variable was initialized to zero, you can divide by + `1 - momentum ** num_updates` to debias it (Section 3 of + [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)): - >>> num_updates = 1.0 - >>> x_zdb = x/(1 - momentum**num_updates) - >>> x_zdb.numpy() - 2.0 + >>> num_updates = 1.0 + >>> x_zdb = x/(1 - momentum**num_updates) + >>> x_zdb.numpy() + 2.0 - Args: - x: A Variable, the moving average. - value: A tensor with the same shape as `x`, the new value to be - averaged in. - momentum: The moving average momentum. + Args: + x: A Variable, the moving average. + value: A tensor with the same shape as `x`, the new value to be + averaged in. + momentum: The moving average momentum. - Returns: - The updated variable. - """ - if tf.__internal__.tf2.enabled(): - momentum = tf.cast(momentum, x.dtype) - value = tf.cast(value, x.dtype) - return x.assign_sub((x - value) * (1 - momentum)) - else: - return tf.__internal__.train.assign_moving_average( - x, value, momentum, zero_debias=True) + Returns: + The updated variable. + """ + if tf.__internal__.tf2.enabled(): + momentum = tf.cast(momentum, x.dtype) + value = tf.cast(value, x.dtype) + return x.assign_sub((x - value) * (1 - momentum)) + else: + return tf.__internal__.train.assign_moving_average( + x, value, momentum, zero_debias=True + ) # LINEAR ALGEBRA -@keras_export('keras.backend.dot') +@keras_export("keras.backend.dot") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def dot(x, y): - """Multiplies 2 tensors (and/or variables) and returns a tensor. - - This operation corresponds to `numpy.dot(a, b, out=None)`. - - Args: - x: Tensor or variable. - y: Tensor or variable. - - Returns: - A tensor, dot product of `x` and `y`. 
- - Examples: - - If inputs `x` and `y` are 2-D arrays, then it is equivalent to `tf.matmul`. - >>> x = tf.keras.backend.placeholder(shape=(2, 3)) - >>> y = tf.keras.backend.placeholder(shape=(3, 4)) - >>> xy = tf.keras.backend.dot(x, y) - >>> xy - - - >>> x = tf.keras.backend.placeholder(shape=(32, 28, 3)) - >>> y = tf.keras.backend.placeholder(shape=(3, 4)) - >>> xy = tf.keras.backend.dot(x, y) - >>> xy - - - If `x` is an N-D array and `y` is an M-D array (where M>=2), it is a sum - product over the last axis of `x` and the second-to-last axis of `y`. - >>> x = tf.keras.backend.random_uniform_variable( - ... shape=(2, 3), low=0., high=1.) - >>> y = tf.keras.backend.ones((4, 3, 5)) - >>> xy = tf.keras.backend.dot(x, y) - >>> tf.keras.backend.int_shape(xy) - (2, 4, 5) - """ - if ndim(x) is not None and (ndim(x) > 2 or ndim(y) > 2): - x_shape = [] - for i, s in zip(int_shape(x), tf.unstack(tf.shape(x))): - if i is not None: - x_shape.append(i) - else: - x_shape.append(s) - x_shape = tuple(x_shape) - y_shape = [] - for i, s in zip(int_shape(y), tf.unstack(tf.shape(y))): - if i is not None: - y_shape.append(i) - else: - y_shape.append(s) - y_shape = tuple(y_shape) - y_permute_dim = list(range(ndim(y))) - y_permute_dim = [y_permute_dim.pop(-2)] + y_permute_dim - xt = tf.reshape(x, [-1, x_shape[-1]]) - yt = tf.reshape( - tf.compat.v1.transpose(y, perm=y_permute_dim), [y_shape[-2], -1]) - return tf.reshape( - tf.matmul(xt, yt), x_shape[:-1] + y_shape[:-2] + y_shape[-1:]) - if is_sparse(x): - out = tf.sparse.sparse_dense_matmul(x, y) - else: - out = tf.matmul(x, y) - return out - - -@keras_export('keras.backend.batch_dot') + """Multiplies 2 tensors (and/or variables) and returns a tensor. + + This operation corresponds to `numpy.dot(a, b, out=None)`. + + Args: + x: Tensor or variable. + y: Tensor or variable. + + Returns: + A tensor, dot product of `x` and `y`. + + Examples: + + If inputs `x` and `y` are 2-D arrays, then it is equivalent to `tf.matmul`. + >>> x = tf.keras.backend.placeholder(shape=(2, 3)) + >>> y = tf.keras.backend.placeholder(shape=(3, 4)) + >>> xy = tf.keras.backend.dot(x, y) + >>> xy + + + >>> x = tf.keras.backend.placeholder(shape=(32, 28, 3)) + >>> y = tf.keras.backend.placeholder(shape=(3, 4)) + >>> xy = tf.keras.backend.dot(x, y) + >>> xy + + + If `x` is an N-D array and `y` is an M-D array (where M>=2), it is a sum + product over the last axis of `x` and the second-to-last axis of `y`. + >>> x = tf.keras.backend.random_uniform_variable( + ... shape=(2, 3), low=0., high=1.) 
+ >>> y = tf.keras.backend.ones((4, 3, 5)) + >>> xy = tf.keras.backend.dot(x, y) + >>> tf.keras.backend.int_shape(xy) + (2, 4, 5) + """ + if ndim(x) is not None and (ndim(x) > 2 or ndim(y) > 2): + x_shape = [] + for i, s in zip(int_shape(x), tf.unstack(tf.shape(x))): + if i is not None: + x_shape.append(i) + else: + x_shape.append(s) + x_shape = tuple(x_shape) + y_shape = [] + for i, s in zip(int_shape(y), tf.unstack(tf.shape(y))): + if i is not None: + y_shape.append(i) + else: + y_shape.append(s) + y_shape = tuple(y_shape) + y_permute_dim = list(range(ndim(y))) + y_permute_dim = [y_permute_dim.pop(-2)] + y_permute_dim + xt = tf.reshape(x, [-1, x_shape[-1]]) + yt = tf.reshape( + tf.compat.v1.transpose(y, perm=y_permute_dim), [y_shape[-2], -1] + ) + return tf.reshape( + tf.matmul(xt, yt), x_shape[:-1] + y_shape[:-2] + y_shape[-1:] + ) + if is_sparse(x): + out = tf.sparse.sparse_dense_matmul(x, y) + else: + out = tf.matmul(x, y) + return out + + +@keras_export("keras.backend.batch_dot") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def batch_dot(x, y, axes=None): - """Batchwise dot product. - - `batch_dot` is used to compute dot product of `x` and `y` when - `x` and `y` are data in batch, i.e. in a shape of - `(batch_size, :)`. - `batch_dot` results in a tensor or variable with less dimensions - than the input. If the number of dimensions is reduced to 1, - we use `expand_dims` to make sure that ndim is at least 2. - - Args: - x: Keras tensor or variable with `ndim >= 2`. - y: Keras tensor or variable with `ndim >= 2`. - axes: Tuple or list of integers with target dimensions, or single integer. - The sizes of `x.shape[axes[0]]` and `y.shape[axes[1]]` should be equal. - - Returns: - A tensor with shape equal to the concatenation of `x`'s shape - (less the dimension that was summed over) and `y`'s shape - (less the batch dimension and the dimension that was summed over). - If the final rank is 1, we reshape it to `(batch_size, 1)`. - - Examples: - - >>> x_batch = tf.keras.backend.ones(shape=(32, 20, 1)) - >>> y_batch = tf.keras.backend.ones(shape=(32, 30, 20)) - >>> xy_batch_dot = tf.keras.backend.batch_dot(x_batch, y_batch, axes=(1, 2)) - >>> tf.keras.backend.int_shape(xy_batch_dot) - (32, 1, 30) - - Shape inference: - Let `x`'s shape be `(100, 20)` and `y`'s shape be `(100, 30, 20)`. - If `axes` is (1, 2), to find the output shape of resultant tensor, - loop through each dimension in `x`'s shape and `y`'s shape: - * `x.shape[0]` : 100 : append to output shape - * `x.shape[1]` : 20 : do not append to output shape, - dimension 1 of `x` has been summed over. (`dot_axes[0]` = 1) - * `y.shape[0]` : 100 : do not append to output shape, - always ignore first dimension of `y` - * `y.shape[1]` : 30 : append to output shape - * `y.shape[2]` : 20 : do not append to output shape, - dimension 2 of `y` has been summed over. (`dot_axes[1]` = 2) - `output_shape` = `(100, 30)` - """ - x_shape = int_shape(x) - y_shape = int_shape(y) - - x_ndim = len(x_shape) - y_ndim = len(y_shape) - - if x_ndim < 2 or y_ndim < 2: - raise ValueError('Cannot do batch_dot on inputs ' - 'with rank < 2. ' - 'Received inputs with shapes ' + - str(x_shape) + ' and ' + - str(y_shape) + '.') - - x_batch_size = x_shape[0] - y_batch_size = y_shape[0] - - if x_batch_size is not None and y_batch_size is not None: - if x_batch_size != y_batch_size: - raise ValueError('Cannot do batch_dot on inputs ' - 'with different batch sizes. 
' - 'Received inputs with shapes ' + - str(x_shape) + ' and ' + - str(y_shape) + '.') - if isinstance(axes, int): - axes = [axes, axes] - - if axes is None: + """Batchwise dot product. + + `batch_dot` is used to compute dot product of `x` and `y` when + `x` and `y` are data in batch, i.e. in a shape of + `(batch_size, :)`. + `batch_dot` results in a tensor or variable with less dimensions + than the input. If the number of dimensions is reduced to 1, + we use `expand_dims` to make sure that ndim is at least 2. + + Args: + x: Keras tensor or variable with `ndim >= 2`. + y: Keras tensor or variable with `ndim >= 2`. + axes: Tuple or list of integers with target dimensions, or single integer. + The sizes of `x.shape[axes[0]]` and `y.shape[axes[1]]` should be equal. + + Returns: + A tensor with shape equal to the concatenation of `x`'s shape + (less the dimension that was summed over) and `y`'s shape + (less the batch dimension and the dimension that was summed over). + If the final rank is 1, we reshape it to `(batch_size, 1)`. + + Examples: + + >>> x_batch = tf.keras.backend.ones(shape=(32, 20, 1)) + >>> y_batch = tf.keras.backend.ones(shape=(32, 30, 20)) + >>> xy_batch_dot = tf.keras.backend.batch_dot(x_batch, y_batch, axes=(1, 2)) + >>> tf.keras.backend.int_shape(xy_batch_dot) + (32, 1, 30) + + Shape inference: + Let `x`'s shape be `(100, 20)` and `y`'s shape be `(100, 30, 20)`. + If `axes` is (1, 2), to find the output shape of resultant tensor, + loop through each dimension in `x`'s shape and `y`'s shape: + * `x.shape[0]` : 100 : append to output shape + * `x.shape[1]` : 20 : do not append to output shape, + dimension 1 of `x` has been summed over. (`dot_axes[0]` = 1) + * `y.shape[0]` : 100 : do not append to output shape, + always ignore first dimension of `y` + * `y.shape[1]` : 30 : append to output shape + * `y.shape[2]` : 20 : do not append to output shape, + dimension 2 of `y` has been summed over. (`dot_axes[1]` = 2) + `output_shape` = `(100, 30)` + """ + x_shape = int_shape(x) + y_shape = int_shape(y) + + x_ndim = len(x_shape) + y_ndim = len(y_shape) + + if x_ndim < 2 or y_ndim < 2: + raise ValueError( + "Cannot do batch_dot on inputs " + "with rank < 2. " + "Received inputs with shapes " + + str(x_shape) + + " and " + + str(y_shape) + + "." + ) + + x_batch_size = x_shape[0] + y_batch_size = y_shape[0] + + if x_batch_size is not None and y_batch_size is not None: + if x_batch_size != y_batch_size: + raise ValueError( + "Cannot do batch_dot on inputs " + "with different batch sizes. " + "Received inputs with shapes " + + str(x_shape) + + " and " + + str(y_shape) + + "." + ) + if isinstance(axes, int): + axes = [axes, axes] + + if axes is None: + if y_ndim == 2: + axes = [x_ndim - 1, y_ndim - 1] + else: + axes = [x_ndim - 1, y_ndim - 2] + + if py_any(isinstance(a, (list, tuple)) for a in axes): + raise ValueError( + "Multiple target dimensions are not supported. " + + "Expected: None, int, (int, int), " + + "Provided: " + + str(axes) + ) + + # if tuple, convert to list. + axes = list(axes) + + # convert negative indices. + if axes[0] < 0: + axes[0] += x_ndim + if axes[1] < 0: + axes[1] += y_ndim + + # sanity checks + if 0 in axes: + raise ValueError( + "Cannot perform batch_dot over axis 0. 
" + "If your inputs are not batched, " + "add a dummy batch dimension to your " + "inputs using K.expand_dims(x, 0)" + ) + a0, a1 = axes + d1 = x_shape[a0] + d2 = y_shape[a1] + + if d1 is not None and d2 is not None and d1 != d2: + raise ValueError( + "Cannot do batch_dot on inputs with shapes " + + str(x_shape) + + " and " + + str(y_shape) + + " with axes=" + + str(axes) + + ". x.shape[%d] != y.shape[%d] (%d != %d)." + % (axes[0], axes[1], d1, d2) + ) + + # backup ndims. Need them later. + orig_x_ndim = x_ndim + orig_y_ndim = y_ndim + + # if rank is 2, expand to 3. + if x_ndim == 2: + x = tf.expand_dims(x, 1) + a0 += 1 + x_ndim += 1 if y_ndim == 2: - axes = [x_ndim - 1, y_ndim - 1] + y = tf.expand_dims(y, 2) + y_ndim += 1 + + # bring x's dimension to be reduced to last axis. + if a0 != x_ndim - 1: + pattern = list(range(x_ndim)) + for i in range(a0, x_ndim - 1): + pattern[i] = pattern[i + 1] + pattern[-1] = a0 + x = tf.compat.v1.transpose(x, pattern) + + # bring y's dimension to be reduced to axis 1. + if a1 != 1: + pattern = list(range(y_ndim)) + for i in range(a1, 1, -1): + pattern[i] = pattern[i - 1] + pattern[1] = a1 + y = tf.compat.v1.transpose(y, pattern) + + # normalize both inputs to rank 3. + if x_ndim > 3: + # squash middle dimensions of x. + x_shape = shape(x) + x_mid_dims = x_shape[1:-1] + x_squashed_shape = tf.stack([x_shape[0], -1, x_shape[-1]]) + x = tf.reshape(x, x_squashed_shape) + x_squashed = True else: - axes = [x_ndim - 1, y_ndim - 2] - - if py_any(isinstance(a, (list, tuple)) for a in axes): - raise ValueError('Multiple target dimensions are not supported. ' + - 'Expected: None, int, (int, int), ' + - 'Provided: ' + str(axes)) - - # if tuple, convert to list. - axes = list(axes) - - # convert negative indices. - if axes[0] < 0: - axes[0] += x_ndim - if axes[1] < 0: - axes[1] += y_ndim - - # sanity checks - if 0 in axes: - raise ValueError('Cannot perform batch_dot over axis 0. ' - 'If your inputs are not batched, ' - 'add a dummy batch dimension to your ' - 'inputs using K.expand_dims(x, 0)') - a0, a1 = axes - d1 = x_shape[a0] - d2 = y_shape[a1] - - if d1 is not None and d2 is not None and d1 != d2: - raise ValueError('Cannot do batch_dot on inputs with shapes ' + - str(x_shape) + ' and ' + str(y_shape) + - ' with axes=' + str(axes) + '. x.shape[%d] != ' - 'y.shape[%d] (%d != %d).' % (axes[0], axes[1], d1, d2)) - - # backup ndims. Need them later. - orig_x_ndim = x_ndim - orig_y_ndim = y_ndim - - # if rank is 2, expand to 3. - if x_ndim == 2: - x = tf.expand_dims(x, 1) - a0 += 1 - x_ndim += 1 - if y_ndim == 2: - y = tf.expand_dims(y, 2) - y_ndim += 1 - - # bring x's dimension to be reduced to last axis. - if a0 != x_ndim - 1: - pattern = list(range(x_ndim)) - for i in range(a0, x_ndim - 1): - pattern[i] = pattern[i + 1] - pattern[-1] = a0 - x = tf.compat.v1.transpose(x, pattern) - - # bring y's dimension to be reduced to axis 1. - if a1 != 1: - pattern = list(range(y_ndim)) - for i in range(a1, 1, -1): - pattern[i] = pattern[i - 1] - pattern[1] = a1 - y = tf.compat.v1.transpose(y, pattern) - - # normalize both inputs to rank 3. - if x_ndim > 3: - # squash middle dimensions of x. - x_shape = shape(x) - x_mid_dims = x_shape[1:-1] - x_squashed_shape = tf.stack( - [x_shape[0], -1, x_shape[-1]]) - x = tf.reshape(x, x_squashed_shape) - x_squashed = True - else: - x_squashed = False - - if y_ndim > 3: - # squash trailing dimensions of y. 
- y_shape = shape(y) - y_trail_dims = y_shape[2:] - y_squashed_shape = tf.stack( - [y_shape[0], y_shape[1], -1]) - y = tf.reshape(y, y_squashed_shape) - y_squashed = True - else: - y_squashed = False - - result = tf.matmul(x, y) - - # if inputs were squashed, we have to reshape the matmul output. - output_shape = tf.shape(result) - do_reshape = False - - if x_squashed: - output_shape = tf.concat( - [output_shape[:1], - x_mid_dims, - output_shape[-1:]], 0) - do_reshape = True - - if y_squashed: - output_shape = tf.concat([output_shape[:-1], y_trail_dims], 0) - do_reshape = True - - if do_reshape: - result = tf.reshape(result, output_shape) - - # if the inputs were originally rank 2, we remove the added 1 dim. - if orig_x_ndim == 2: - result = tf.squeeze(result, 1) - elif orig_y_ndim == 2: - result = tf.squeeze(result, -1) - - return result - - -@keras_export('keras.backend.transpose') + x_squashed = False + + if y_ndim > 3: + # squash trailing dimensions of y. + y_shape = shape(y) + y_trail_dims = y_shape[2:] + y_squashed_shape = tf.stack([y_shape[0], y_shape[1], -1]) + y = tf.reshape(y, y_squashed_shape) + y_squashed = True + else: + y_squashed = False + + result = tf.matmul(x, y) + + # if inputs were squashed, we have to reshape the matmul output. + output_shape = tf.shape(result) + do_reshape = False + + if x_squashed: + output_shape = tf.concat( + [output_shape[:1], x_mid_dims, output_shape[-1:]], 0 + ) + do_reshape = True + + if y_squashed: + output_shape = tf.concat([output_shape[:-1], y_trail_dims], 0) + do_reshape = True + + if do_reshape: + result = tf.reshape(result, output_shape) + + # if the inputs were originally rank 2, we remove the added 1 dim. + if orig_x_ndim == 2: + result = tf.squeeze(result, 1) + elif orig_y_ndim == 2: + result = tf.squeeze(result, -1) + + return result + + +@keras_export("keras.backend.transpose") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def transpose(x): - """Transposes a tensor and returns it. - - Args: - x: Tensor or variable. - - Returns: - A tensor. - - Examples: - - >>> var = tf.keras.backend.variable([[1, 2, 3], [4, 5, 6]]) - >>> tf.keras.backend.eval(var) - array([[1., 2., 3.], - [4., 5., 6.]], dtype=float32) - >>> var_transposed = tf.keras.backend.transpose(var) - >>> tf.keras.backend.eval(var_transposed) - array([[1., 4.], - [2., 5.], - [3., 6.]], dtype=float32) - >>> input = tf.keras.backend.placeholder((2, 3)) - >>> input - - >>> input_transposed = tf.keras.backend.transpose(input) - >>> input_transposed - - """ - return tf.compat.v1.transpose(x) - - -@keras_export('keras.backend.gather') + """Transposes a tensor and returns it. + + Args: + x: Tensor or variable. + + Returns: + A tensor. + + Examples: + + >>> var = tf.keras.backend.variable([[1, 2, 3], [4, 5, 6]]) + >>> tf.keras.backend.eval(var) + array([[1., 2., 3.], + [4., 5., 6.]], dtype=float32) + >>> var_transposed = tf.keras.backend.transpose(var) + >>> tf.keras.backend.eval(var_transposed) + array([[1., 4.], + [2., 5.], + [3., 6.]], dtype=float32) + >>> input = tf.keras.backend.placeholder((2, 3)) + >>> input + + >>> input_transposed = tf.keras.backend.transpose(input) + >>> input_transposed + + """ + return tf.compat.v1.transpose(x) + + +@keras_export("keras.backend.gather") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def gather(reference, indices): - """Retrieves the elements of indices `indices` in the tensor `reference`. - - Args: - reference: A tensor. - indices: An integer tensor of indices. 
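# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# `gather` is plain row indexing along axis 0, equivalent to NumPy
# fancy indexing, duplicates and reordering included.
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

ref = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
out = K.gather(ref, [1, 0, 1])
np.testing.assert_array_equal(out.numpy(), ref.numpy()[[1, 0, 1]])
print(out.shape)  # (3, 3): one row per index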
- - Returns: - A tensor of same type as `reference`. - - Examples: - - >>> var = tf.keras.backend.variable([[1, 2, 3], [4, 5, 6]]) - >>> tf.keras.backend.eval(var) - array([[1., 2., 3.], - [4., 5., 6.]], dtype=float32) - >>> var_gathered = tf.keras.backend.gather(var, [0]) - >>> tf.keras.backend.eval(var_gathered) - array([[1., 2., 3.]], dtype=float32) - >>> var_gathered = tf.keras.backend.gather(var, [1]) - >>> tf.keras.backend.eval(var_gathered) - array([[4., 5., 6.]], dtype=float32) - >>> var_gathered = tf.keras.backend.gather(var, [0,1,0]) - >>> tf.keras.backend.eval(var_gathered) - array([[1., 2., 3.], - [4., 5., 6.], - [1., 2., 3.]], dtype=float32) - """ - return tf.compat.v1.gather(reference, indices) + """Retrieves the elements of indices `indices` in the tensor `reference`. + + Args: + reference: A tensor. + indices: An integer tensor of indices. + + Returns: + A tensor of same type as `reference`. + + Examples: + + >>> var = tf.keras.backend.variable([[1, 2, 3], [4, 5, 6]]) + >>> tf.keras.backend.eval(var) + array([[1., 2., 3.], + [4., 5., 6.]], dtype=float32) + >>> var_gathered = tf.keras.backend.gather(var, [0]) + >>> tf.keras.backend.eval(var_gathered) + array([[1., 2., 3.]], dtype=float32) + >>> var_gathered = tf.keras.backend.gather(var, [1]) + >>> tf.keras.backend.eval(var_gathered) + array([[4., 5., 6.]], dtype=float32) + >>> var_gathered = tf.keras.backend.gather(var, [0,1,0]) + >>> tf.keras.backend.eval(var_gathered) + array([[1., 2., 3.], + [4., 5., 6.], + [1., 2., 3.]], dtype=float32) + """ + return tf.compat.v1.gather(reference, indices) # ELEMENT-WISE OPERATIONS -@keras_export('keras.backend.max') +@keras_export("keras.backend.max") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def max(x, axis=None, keepdims=False): - """Maximum value in a tensor. + """Maximum value in a tensor. - Args: - x: A tensor or variable. - axis: An integer, the axis to find maximum values. - keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, - the reduced dimension is retained with length 1. + Args: + x: A tensor or variable. + axis: An integer, the axis to find maximum values. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1. If `keepdims` is `True`, + the reduced dimension is retained with length 1. - Returns: - A tensor with maximum values of `x`. - """ - return tf.reduce_max(x, axis, keepdims) + Returns: + A tensor with maximum values of `x`. + """ + return tf.reduce_max(x, axis, keepdims) -@keras_export('keras.backend.min') +@keras_export("keras.backend.min") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def min(x, axis=None, keepdims=False): - """Minimum value in a tensor. + """Minimum value in a tensor. - Args: - x: A tensor or variable. - axis: An integer, the axis to find minimum values. - keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, - the reduced dimension is retained with length 1. + Args: + x: A tensor or variable. + axis: An integer, the axis to find minimum values. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1. If `keepdims` is `True`, + the reduced dimension is retained with length 1. 
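# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# What the `keepdims` flag shared by these reductions does to the rank.
import tensorflow as tf
from tensorflow.keras import backend as K

x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
print(K.max(x, axis=1).shape)                 # (2,): reduced axis dropped
print(K.max(x, axis=1, keepdims=True).shape)  # (2, 1): kept with length 1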
- Returns: - A tensor with minimum values of `x`. - """ - return tf.reduce_min(x, axis, keepdims) + Returns: + A tensor with minimum values of `x`. + """ + return tf.reduce_min(x, axis, keepdims) -@keras_export('keras.backend.sum') +@keras_export("keras.backend.sum") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def sum(x, axis=None, keepdims=False): - """Sum of the values in a tensor, alongside the specified axis. + """Sum of the values in a tensor, alongside the specified axis. - Args: - x: A tensor or variable. - axis: An integer, the axis to sum over. - keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, - the reduced dimension is retained with length 1. + Args: + x: A tensor or variable. + axis: An integer, the axis to sum over. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1. If `keepdims` is `True`, + the reduced dimension is retained with length 1. - Returns: - A tensor with sum of `x`. - """ - return tf.reduce_sum(x, axis, keepdims) + Returns: + A tensor with sum of `x`. + """ + return tf.reduce_sum(x, axis, keepdims) -@keras_export('keras.backend.prod') +@keras_export("keras.backend.prod") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def prod(x, axis=None, keepdims=False): - """Multiplies the values in a tensor, alongside the specified axis. + """Multiplies the values in a tensor, alongside the specified axis. - Args: - x: A tensor or variable. - axis: An integer, the axis to compute the product. - keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, - the reduced dimension is retained with length 1. + Args: + x: A tensor or variable. + axis: An integer, the axis to compute the product. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1. If `keepdims` is `True`, + the reduced dimension is retained with length 1. - Returns: - A tensor with the product of elements of `x`. - """ - return tf.reduce_prod(x, axis, keepdims) + Returns: + A tensor with the product of elements of `x`. + """ + return tf.reduce_prod(x, axis, keepdims) -@keras_export('keras.backend.cumsum') +@keras_export("keras.backend.cumsum") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def cumsum(x, axis=0): - """Cumulative sum of the values in a tensor, alongside the specified axis. + """Cumulative sum of the values in a tensor, alongside the specified axis. - Args: - x: A tensor or variable. - axis: An integer, the axis to compute the sum. + Args: + x: A tensor or variable. + axis: An integer, the axis to compute the sum. - Returns: - A tensor of the cumulative sum of values of `x` along `axis`. - """ - return tf.cumsum(x, axis=axis) + Returns: + A tensor of the cumulative sum of values of `x` along `axis`. + """ + return tf.cumsum(x, axis=axis) -@keras_export('keras.backend.cumprod') +@keras_export("keras.backend.cumprod") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def cumprod(x, axis=0): - """Cumulative product of the values in a tensor, alongside the specified axis. + """Cumulative product of the values in a tensor alongside `axis`. - Args: - x: A tensor or variable. 
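# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# The cumulative reductions above behave like their NumPy counterparts.
import tensorflow as tf
from tensorflow.keras import backend as K

x = tf.constant([1, 2, 3, 4])
print(K.cumsum(x).numpy())   # [ 1  3  6 10]
print(K.cumprod(x).numpy())  # [ 1  2  6 24]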
- axis: An integer, the axis to compute the product. + Args: + x: A tensor or variable. + axis: An integer, the axis to compute the product. - Returns: - A tensor of the cumulative product of values of `x` along `axis`. - """ - return tf.math.cumprod(x, axis=axis) + Returns: + A tensor of the cumulative product of values of `x` along `axis`. + """ + return tf.math.cumprod(x, axis=axis) -@keras_export('keras.backend.var') +@keras_export("keras.backend.var") @doc_controls.do_not_generate_docs def var(x, axis=None, keepdims=False): - """Variance of a tensor, alongside the specified axis. + """Variance of a tensor, alongside the specified axis. - Args: - x: A tensor or variable. - axis: An integer, the axis to compute the variance. - keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, - the reduced dimension is retained with length 1. + Args: + x: A tensor or variable. + axis: An integer, the axis to compute the variance. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1. If `keepdims` is `True`, + the reduced dimension is retained with length 1. - Returns: - A tensor with the variance of elements of `x`. - """ - if x.dtype.base_dtype == tf.bool: - x = tf.cast(x, floatx()) - return tf.math.reduce_variance(x, axis=axis, keepdims=keepdims) + Returns: + A tensor with the variance of elements of `x`. + """ + if x.dtype.base_dtype == tf.bool: + x = tf.cast(x, floatx()) + return tf.math.reduce_variance(x, axis=axis, keepdims=keepdims) -@keras_export('keras.backend.std') +@keras_export("keras.backend.std") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def std(x, axis=None, keepdims=False): - """Standard deviation of a tensor, alongside the specified axis. - - It is an alias to `tf.math.reduce_std`. - - Args: - x: A tensor or variable. It should have numerical dtypes. Boolean type - inputs will be converted to float. - axis: An integer, the axis to compute the standard deviation. If `None` - (the default), reduces all dimensions. Must be in the range - `[-rank(x), rank(x))`. - keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, the reduced dimension is retained with - length 1. - - Returns: - A tensor with the standard deviation of elements of `x` with same dtype. - Boolean type input will be converted to float. - """ - if x.dtype.base_dtype == tf.bool: - x = tf.cast(x, floatx()) - return tf.math.reduce_std(x, axis=axis, keepdims=keepdims) - - -@keras_export('keras.backend.mean') + """Standard deviation of a tensor, alongside the specified axis. + + It is an alias to `tf.math.reduce_std`. + + Args: + x: A tensor or variable. It should have numerical dtypes. Boolean type + inputs will be converted to float. + axis: An integer, the axis to compute the standard deviation. If `None` + (the default), reduces all dimensions. Must be in the range + `[-rank(x), rank(x))`. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1. If `keepdims` is `True`, the reduced dimension is retained + with length 1. + + Returns: + A tensor with the standard deviation of elements of `x` with same dtype. + Boolean type input will be converted to float. 
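# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# `var` and `std` cast boolean inputs to floatx() before reducing,
# exactly as the docstring above notes.
import tensorflow as tf
from tensorflow.keras import backend as K

b = tf.constant([True, True, False, False])
s = K.std(b)              # input cast to float32, then reduced
print(s.dtype, float(s))  # <dtype: 'float32'> 0.5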
+ """ + if x.dtype.base_dtype == tf.bool: + x = tf.cast(x, floatx()) + return tf.math.reduce_std(x, axis=axis, keepdims=keepdims) + + +@keras_export("keras.backend.mean") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def mean(x, axis=None, keepdims=False): - """Mean of a tensor, alongside the specified axis. + """Mean of a tensor, alongside the specified axis. - Args: - x: A tensor or variable. - axis: A list of integer. Axes to compute the mean. - keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1 for each entry in `axis`. If `keepdims` is `True`, - the reduced dimensions are retained with length 1. + Args: + x: A tensor or variable. + axis: A list of integer. Axes to compute the mean. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1 for each entry in `axis`. If `keepdims` is `True`, + the reduced dimensions are retained with length 1. - Returns: - A tensor with the mean of elements of `x`. - """ - if x.dtype.base_dtype == tf.bool: - x = tf.cast(x, floatx()) - return tf.reduce_mean(x, axis, keepdims) + Returns: + A tensor with the mean of elements of `x`. + """ + if x.dtype.base_dtype == tf.bool: + x = tf.cast(x, floatx()) + return tf.reduce_mean(x, axis, keepdims) -@keras_export('keras.backend.any') +@keras_export("keras.backend.any") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def any(x, axis=None, keepdims=False): - """Bitwise reduction (logical OR). + """Bitwise reduction (logical OR). - Args: - x: Tensor or variable. - axis: axis along which to perform the reduction. - keepdims: whether the drop or broadcast the reduction axes. + Args: + x: Tensor or variable. + axis: axis along which to perform the reduction. + keepdims: whether the drop or broadcast the reduction axes. - Returns: - A uint8 tensor (0s and 1s). - """ - x = tf.cast(x, tf.bool) - return tf.reduce_any(x, axis, keepdims) + Returns: + A uint8 tensor (0s and 1s). + """ + x = tf.cast(x, tf.bool) + return tf.reduce_any(x, axis, keepdims) -@keras_export('keras.backend.all') +@keras_export("keras.backend.all") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def all(x, axis=None, keepdims=False): - """Bitwise reduction (logical AND). + """Bitwise reduction (logical AND). - Args: - x: Tensor or variable. - axis: axis along which to perform the reduction. - keepdims: whether the drop or broadcast the reduction axes. + Args: + x: Tensor or variable. + axis: axis along which to perform the reduction. + keepdims: whether the drop or broadcast the reduction axes. - Returns: - A uint8 tensor (0s and 1s). - """ - x = tf.cast(x, tf.bool) - return tf.reduce_all(x, axis, keepdims) + Returns: + A uint8 tensor (0s and 1s). + """ + x = tf.cast(x, tf.bool) + return tf.reduce_all(x, axis, keepdims) -@keras_export('keras.backend.argmax') +@keras_export("keras.backend.argmax") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def argmax(x, axis=-1): - """Returns the index of the maximum value along an axis. + """Returns the index of the maximum value along an axis. - Args: - x: Tensor or variable. - axis: axis along which to perform the reduction. + Args: + x: Tensor or variable. + axis: axis along which to perform the reduction. - Returns: - A tensor. - """ - return tf.argmax(x, axis) + Returns: + A tensor. 
+ """ + return tf.argmax(x, axis) -@keras_export('keras.backend.argmin') +@keras_export("keras.backend.argmin") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def argmin(x, axis=-1): - """Returns the index of the minimum value along an axis. + """Returns the index of the minimum value along an axis. - Args: - x: Tensor or variable. - axis: axis along which to perform the reduction. + Args: + x: Tensor or variable. + axis: axis along which to perform the reduction. - Returns: - A tensor. - """ - return tf.argmin(x, axis) + Returns: + A tensor. + """ + return tf.argmin(x, axis) -@keras_export('keras.backend.square') +@keras_export("keras.backend.square") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def square(x): - """Element-wise square. + """Element-wise square. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - return tf.square(x) + Returns: + A tensor. + """ + return tf.square(x) -@keras_export('keras.backend.abs') +@keras_export("keras.backend.abs") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def abs(x): - """Element-wise absolute value. + """Element-wise absolute value. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - return tf.abs(x) + Returns: + A tensor. + """ + return tf.abs(x) -@keras_export('keras.backend.sqrt') +@keras_export("keras.backend.sqrt") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def sqrt(x): - """Element-wise square root. + """Element-wise square root. - This function clips negative tensor values to 0 before computing the - square root. + This function clips negative tensor values to 0 before computing the + square root. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - zero = _constant_to_tensor(0., x.dtype.base_dtype) - x = tf.maximum(x, zero) - return tf.sqrt(x) + Returns: + A tensor. + """ + zero = _constant_to_tensor(0.0, x.dtype.base_dtype) + x = tf.maximum(x, zero) + return tf.sqrt(x) -@keras_export('keras.backend.exp') +@keras_export("keras.backend.exp") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def exp(x): - """Element-wise exponential. + """Element-wise exponential. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - return tf.exp(x) + Returns: + A tensor. + """ + return tf.exp(x) -@keras_export('keras.backend.log') +@keras_export("keras.backend.log") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def log(x): - """Element-wise log. + """Element-wise log. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - return tf.math.log(x) + Returns: + A tensor. + """ + return tf.math.log(x) def logsumexp(x, axis=None, keepdims=False): - """Computes log(sum(exp(elements across dimensions of a tensor))). + """Computes log(sum(exp(elements across dimensions of a tensor))). - This function is more numerically stable than log(sum(exp(x))). - It avoids overflows caused by taking the exp of large inputs and - underflows caused by taking the log of small inputs. + This function is more numerically stable than log(sum(exp(x))). + It avoids overflows caused by taking the exp of large inputs and + underflows caused by taking the log of small inputs. - Args: - x: A tensor or variable. - axis: An integer, the axis to reduce over. 
- keepdims: A boolean, whether to keep the dimensions or not. - If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, the reduced dimension is - retained with length 1. + Args: + x: A tensor or variable. + axis: An integer, the axis to reduce over. + keepdims: A boolean, whether to keep the dimensions or not. + If `keepdims` is `False`, the rank of the tensor is reduced + by 1. If `keepdims` is `True`, the reduced dimension is + retained with length 1. - Returns: - The reduced tensor. - """ - return tf.reduce_logsumexp(x, axis, keepdims) + Returns: + The reduced tensor. + """ + return tf.reduce_logsumexp(x, axis, keepdims) -@keras_export('keras.backend.round') +@keras_export("keras.backend.round") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def round(x): - """Element-wise rounding to the closest integer. + """Element-wise rounding to the closest integer. - In case of tie, the rounding mode used is "half to even". + In case of tie, the rounding mode used is "half to even". - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - return tf.round(x) + Returns: + A tensor. + """ + return tf.round(x) -@keras_export('keras.backend.sign') +@keras_export("keras.backend.sign") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def sign(x): - """Element-wise sign. + """Element-wise sign. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - return tf.sign(x) + Returns: + A tensor. + """ + return tf.sign(x) -@keras_export('keras.backend.pow') +@keras_export("keras.backend.pow") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def pow(x, a): - """Element-wise exponentiation. + """Element-wise exponentiation. - Args: - x: Tensor or variable. - a: Python integer. + Args: + x: Tensor or variable. + a: Python integer. - Returns: - A tensor. - """ - return tf.pow(x, a) + Returns: + A tensor. + """ + return tf.pow(x, a) -@keras_export('keras.backend.clip') +@keras_export("keras.backend.clip") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def clip(x, min_value, max_value): - """Element-wise value clipping. - - Args: - x: Tensor or variable. - min_value: Python float, integer, or tensor. - max_value: Python float, integer, or tensor. - - Returns: - A tensor. - """ - if (isinstance(min_value, (int, float)) and - isinstance(max_value, (int, float))): - if max_value < min_value: - max_value = min_value - if min_value is None: - min_value = -np.inf - if max_value is None: - max_value = np.inf - return tf.clip_by_value(x, min_value, max_value) - - -@keras_export('keras.backend.equal') + """Element-wise value clipping. + + Args: + x: Tensor or variable. + min_value: Python float, integer, or tensor. + max_value: Python float, integer, or tensor. + + Returns: + A tensor. + """ + if isinstance(min_value, (int, float)) and isinstance( + max_value, (int, float) + ): + if max_value < min_value: + max_value = min_value + if min_value is None: + min_value = -np.inf + if max_value is None: + max_value = np.inf + return tf.clip_by_value(x, min_value, max_value) + + +@keras_export("keras.backend.equal") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def equal(x, y): - """Element-wise equality between two tensors. + """Element-wise equality between two tensors. - Args: - x: Tensor or variable. - y: Tensor or variable. 
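# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# Per the branch in `clip` above: when both bounds are Python numbers and
# max_value < min_value, max_value is raised to min_value, so the output
# is constant rather than an error.
import tensorflow as tf
from tensorflow.keras import backend as K

x = tf.constant([-2.0, 0.0, 2.0])
print(K.clip(x, 1.0, 0.5).numpy())  # [1. 1. 1.]: behaves like clip(x, 1, 1)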
+ Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A bool tensor. - """ - return tf.equal(x, y) + Returns: + A bool tensor. + """ + return tf.equal(x, y) -@keras_export('keras.backend.not_equal') +@keras_export("keras.backend.not_equal") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def not_equal(x, y): - """Element-wise inequality between two tensors. + """Element-wise inequality between two tensors. - Args: - x: Tensor or variable. - y: Tensor or variable. + Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A bool tensor. - """ - return tf.not_equal(x, y) + Returns: + A bool tensor. + """ + return tf.not_equal(x, y) -@keras_export('keras.backend.greater') +@keras_export("keras.backend.greater") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def greater(x, y): - """Element-wise truth value of (x > y). + """Element-wise truth value of (x > y). - Args: - x: Tensor or variable. - y: Tensor or variable. + Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A bool tensor. - """ - return tf.greater(x, y) + Returns: + A bool tensor. + """ + return tf.greater(x, y) -@keras_export('keras.backend.greater_equal') +@keras_export("keras.backend.greater_equal") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def greater_equal(x, y): - """Element-wise truth value of (x >= y). + """Element-wise truth value of (x >= y). - Args: - x: Tensor or variable. - y: Tensor or variable. + Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A bool tensor. - """ - return tf.greater_equal(x, y) + Returns: + A bool tensor. + """ + return tf.greater_equal(x, y) -@keras_export('keras.backend.less') +@keras_export("keras.backend.less") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def less(x, y): - """Element-wise truth value of (x < y). + """Element-wise truth value of (x < y). - Args: - x: Tensor or variable. - y: Tensor or variable. + Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A bool tensor. - """ - return tf.less(x, y) + Returns: + A bool tensor. + """ + return tf.less(x, y) -@keras_export('keras.backend.less_equal') +@keras_export("keras.backend.less_equal") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def less_equal(x, y): - """Element-wise truth value of (x <= y). + """Element-wise truth value of (x <= y). - Args: - x: Tensor or variable. - y: Tensor or variable. + Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A bool tensor. - """ - return tf.less_equal(x, y) + Returns: + A bool tensor. + """ + return tf.less_equal(x, y) -@keras_export('keras.backend.maximum') +@keras_export("keras.backend.maximum") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def maximum(x, y): - """Element-wise maximum of two tensors. + """Element-wise maximum of two tensors. - Args: - x: Tensor or variable. - y: Tensor or variable. + Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A tensor with the element wise maximum value(s) of `x` and `y`. + Returns: + A tensor with the element wise maximum value(s) of `x` and `y`. 
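# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# The comparison ops above return elementwise bool tensors.
import tensorflow as tf
from tensorflow.keras import backend as K

x = tf.constant([1, 2, 3])
y = tf.constant([3, 2, 1])
print(K.equal(x, y).numpy())          # [False  True False]
print(K.greater_equal(x, y).numpy())  # [False  True  True]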
- Examples: + Examples: - >>> x = tf.Variable([[1, 2], [3, 4]]) - >>> y = tf.Variable([[2, 1], [0, -1]]) - >>> m = tf.keras.backend.maximum(x, y) - >>> m - - """ - return tf.maximum(x, y) + >>> x = tf.Variable([[1, 2], [3, 4]]) + >>> y = tf.Variable([[2, 1], [0, -1]]) + >>> m = tf.keras.backend.maximum(x, y) + >>> m + + """ + return tf.maximum(x, y) -@keras_export('keras.backend.minimum') +@keras_export("keras.backend.minimum") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def minimum(x, y): - """Element-wise minimum of two tensors. + """Element-wise minimum of two tensors. - Args: - x: Tensor or variable. - y: Tensor or variable. + Args: + x: Tensor or variable. + y: Tensor or variable. - Returns: - A tensor. - """ - return tf.minimum(x, y) + Returns: + A tensor. + """ + return tf.minimum(x, y) -@keras_export('keras.backend.sin') +@keras_export("keras.backend.sin") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def sin(x): - """Computes sin of x element-wise. + """Computes sin of x element-wise. - Args: - x: Tensor or variable. + Args: + x: Tensor or variable. - Returns: - A tensor. - """ - return tf.sin(x) + Returns: + A tensor. + """ + return tf.sin(x) -@keras_export('keras.backend.cos') +@keras_export("keras.backend.cos") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def cos(x): - """Computes cos of x element-wise. - - Args: - x: Tensor or variable. - - Returns: - A tensor. - """ - return tf.cos(x) - - -def _regular_normalize_batch_in_training(x, - gamma, - beta, - reduction_axes, - epsilon=1e-3): - """Non-fused version of `normalize_batch_in_training`. - - Args: - x: Input tensor or variable. - gamma: Tensor by which to scale the input. - beta: Tensor with which to center the input. - reduction_axes: iterable of integers, - axes over which to normalize. - epsilon: Fuzz factor. - - Returns: - A tuple length of 3, `(normalized_tensor, mean, variance)`. - """ - mean, var = tf.compat.v1.nn.moments(x, reduction_axes, None, None, False) - normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon) - return normed, mean, var - - -def _broadcast_normalize_batch_in_training(x, - gamma, - beta, - reduction_axes, - epsilon=1e-3): - """Non-fused, broadcast version of `normalize_batch_in_training`. - - Args: - x: Input tensor or variable. - gamma: Tensor by which to scale the input. - beta: Tensor with which to center the input. - reduction_axes: iterable of integers, - axes over which to normalize. - epsilon: Fuzz factor. - - Returns: - A tuple length of 3, `(normalized_tensor, mean, variance)`. - """ - mean, var = tf.compat.v1.nn.moments(x, reduction_axes, None, None, False) - target_shape = [] - for axis in range(ndim(x)): - if axis in reduction_axes: - target_shape.append(1) - else: - target_shape.append(tf.shape(x)[axis]) - target_shape = tf.stack(target_shape) - - broadcast_mean = tf.reshape(mean, target_shape) - broadcast_var = tf.reshape(var, target_shape) - if gamma is None: - broadcast_gamma = None - else: - broadcast_gamma = tf.reshape(gamma, target_shape) - if beta is None: - broadcast_beta = None - else: - broadcast_beta = tf.reshape(beta, target_shape) - - normed = tf.nn.batch_normalization(x, broadcast_mean, broadcast_var, - broadcast_beta, broadcast_gamma, epsilon) - return normed, mean, var - - -def _fused_normalize_batch_in_training(x, - gamma, - beta, - reduction_axes, - epsilon=1e-3): - """Fused version of `normalize_batch_in_training`. 
- - Args: - x: Input tensor or variable. - gamma: Tensor by which to scale the input. - beta: Tensor with which to center the input. - reduction_axes: iterable of integers, - axes over which to normalize. - epsilon: Fuzz factor. - - Returns: - A tuple length of 3, `(normalized_tensor, mean, variance)`. - """ - if list(reduction_axes) == [0, 1, 2]: - normalization_axis = 3 - tf_data_format = 'NHWC' - else: - normalization_axis = 1 - tf_data_format = 'NCHW' - - if gamma is None: - gamma = tf.constant( - 1.0, dtype=x.dtype, shape=[x.shape[normalization_axis]]) - if beta is None: - beta = tf.constant( - 0.0, dtype=x.dtype, shape=[x.shape[normalization_axis]]) - - return tf.compat.v1.nn.fused_batch_norm( - x, gamma, beta, epsilon=epsilon, data_format=tf_data_format) - - -@keras_export('keras.backend.normalize_batch_in_training') -@doc_controls.do_not_generate_docs -def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3): - """Computes mean and std for batch then apply batch_normalization on batch. - - Args: - x: Input tensor or variable. - gamma: Tensor by which to scale the input. - beta: Tensor with which to center the input. - reduction_axes: iterable of integers, - axes over which to normalize. - epsilon: Fuzz factor. - - Returns: - A tuple length of 3, `(normalized_tensor, mean, variance)`. - """ - if ndim(x) == 4 and list(reduction_axes) in [[0, 1, 2], [0, 2, 3]]: - if not _has_nchw_support() and list(reduction_axes) == [0, 2, 3]: - return _broadcast_normalize_batch_in_training( - x, gamma, beta, reduction_axes, epsilon=epsilon) - return _fused_normalize_batch_in_training( - x, gamma, beta, reduction_axes, epsilon=epsilon) - else: - if sorted(reduction_axes) == list(range(ndim(x)))[:-1]: - return _regular_normalize_batch_in_training( - x, gamma, beta, reduction_axes, epsilon=epsilon) - else: - return _broadcast_normalize_batch_in_training( - x, gamma, beta, reduction_axes, epsilon=epsilon) + """Computes cos of x element-wise. + Args: + x: Tensor or variable. -@keras_export('keras.backend.batch_normalization') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): - """Applies batch normalization on x given mean, var, beta and gamma. - - I.e. returns: - `output = (x - mean) / (sqrt(var) + epsilon) * gamma + beta` - - Args: - x: Input tensor or variable. - mean: Mean of batch. - var: Variance of batch. - beta: Tensor with which to center the input. - gamma: Tensor by which to scale the input. - axis: Integer, the axis that should be normalized. - (typically the features axis). - epsilon: Fuzz factor. - - Returns: - A tensor. - """ - if ndim(x) == 4: - # The CPU implementation of `fused_batch_norm` only supports NHWC - if axis == 1 or axis == -3: - tf_data_format = 'NCHW' - elif axis == 3 or axis == -1: - tf_data_format = 'NHWC' + Returns: + A tensor. + """ + return tf.cos(x) + + +def _regular_normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=1e-3 +): + """Non-fused version of `normalize_batch_in_training`. + + Args: + x: Input tensor or variable. + gamma: Tensor by which to scale the input. + beta: Tensor with which to center the input. + reduction_axes: iterable of integers, + axes over which to normalize. + epsilon: Fuzz factor. + + Returns: + A tuple length of 3, `(normalized_tensor, mean, variance)`. 
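# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# The non-fused path above composes moments() with batch_normalization();
# normalizing a 2D batch over axis 0 yields roughly zero mean per feature.
import tensorflow as tf
from tensorflow.keras import backend as K

x = tf.random.normal((8, 3))
normed, mean, var = K.normalize_batch_in_training(
    x, gamma=tf.ones((3,)), beta=tf.zeros((3,)), reduction_axes=[0]
)
print(tf.reduce_mean(normed, axis=0).numpy())  # ~[0. 0. 0.]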
+ """ + mean, var = tf.compat.v1.nn.moments(x, reduction_axes, None, None, False) + normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon) + return normed, mean, var + + +def _broadcast_normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=1e-3 +): + """Non-fused, broadcast version of `normalize_batch_in_training`. + + Args: + x: Input tensor or variable. + gamma: Tensor by which to scale the input. + beta: Tensor with which to center the input. + reduction_axes: iterable of integers, + axes over which to normalize. + epsilon: Fuzz factor. + + Returns: + A tuple length of 3, `(normalized_tensor, mean, variance)`. + """ + mean, var = tf.compat.v1.nn.moments(x, reduction_axes, None, None, False) + target_shape = [] + for axis in range(ndim(x)): + if axis in reduction_axes: + target_shape.append(1) + else: + target_shape.append(tf.shape(x)[axis]) + target_shape = tf.stack(target_shape) + + broadcast_mean = tf.reshape(mean, target_shape) + broadcast_var = tf.reshape(var, target_shape) + if gamma is None: + broadcast_gamma = None else: - tf_data_format = None - - if (tf_data_format == 'NHWC' or - tf_data_format == 'NCHW' and _has_nchw_support()): - # The mean / var / beta / gamma tensors may be broadcasted - # so they may have extra axes of size 1, which should be squeezed. - if ndim(mean) > 1: - mean = tf.reshape(mean, [-1]) - if ndim(var) > 1: - var = tf.reshape(var, [-1]) - if beta is None: - beta = zeros_like(mean) - elif ndim(beta) > 1: - beta = tf.reshape(beta, [-1]) - if gamma is None: - gamma = ones_like(mean) - elif ndim(gamma) > 1: - gamma = tf.reshape(gamma, [-1]) - y, _, _ = tf.compat.v1.nn.fused_batch_norm( + broadcast_gamma = tf.reshape(gamma, target_shape) + if beta is None: + broadcast_beta = None + else: + broadcast_beta = tf.reshape(beta, target_shape) + + normed = tf.nn.batch_normalization( x, - gamma, - beta, - epsilon=epsilon, - mean=mean, - variance=var, - data_format=tf_data_format, - is_training=False + broadcast_mean, + broadcast_var, + broadcast_beta, + broadcast_gamma, + epsilon, + ) + return normed, mean, var + + +def _fused_normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=1e-3 +): + """Fused version of `normalize_batch_in_training`. + + Args: + x: Input tensor or variable. + gamma: Tensor by which to scale the input. + beta: Tensor with which to center the input. + reduction_axes: iterable of integers, + axes over which to normalize. + epsilon: Fuzz factor. + + Returns: + A tuple length of 3, `(normalized_tensor, mean, variance)`. + """ + if list(reduction_axes) == [0, 1, 2]: + normalization_axis = 3 + tf_data_format = "NHWC" + else: + normalization_axis = 1 + tf_data_format = "NCHW" + + if gamma is None: + gamma = tf.constant( + 1.0, dtype=x.dtype, shape=[x.shape[normalization_axis]] + ) + if beta is None: + beta = tf.constant( + 0.0, dtype=x.dtype, shape=[x.shape[normalization_axis]] + ) + + return tf.compat.v1.nn.fused_batch_norm( + x, gamma, beta, epsilon=epsilon, data_format=tf_data_format ) - return y - return tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon) + + +@keras_export("keras.backend.normalize_batch_in_training") +@doc_controls.do_not_generate_docs +def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3): + """Computes mean and std for batch then apply batch_normalization on batch. + + Args: + x: Input tensor or variable. + gamma: Tensor by which to scale the input. + beta: Tensor with which to center the input. 
+ reduction_axes: iterable of integers, + axes over which to normalize. + epsilon: Fuzz factor. + + Returns: + A tuple length of 3, `(normalized_tensor, mean, variance)`. + """ + if ndim(x) == 4 and list(reduction_axes) in [[0, 1, 2], [0, 2, 3]]: + if not _has_nchw_support() and list(reduction_axes) == [0, 2, 3]: + return _broadcast_normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=epsilon + ) + return _fused_normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=epsilon + ) + else: + if sorted(reduction_axes) == list(range(ndim(x)))[:-1]: + return _regular_normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=epsilon + ) + else: + return _broadcast_normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=epsilon + ) + + +@keras_export("keras.backend.batch_normalization") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): + """Applies batch normalization on x given mean, var, beta and gamma. + + I.e. returns: + `output = (x - mean) / (sqrt(var) + epsilon) * gamma + beta` + + Args: + x: Input tensor or variable. + mean: Mean of batch. + var: Variance of batch. + beta: Tensor with which to center the input. + gamma: Tensor by which to scale the input. + axis: Integer, the axis that should be normalized. + (typically the features axis). + epsilon: Fuzz factor. + + Returns: + A tensor. + """ + if ndim(x) == 4: + # The CPU implementation of `fused_batch_norm` only supports NHWC + if axis == 1 or axis == -3: + tf_data_format = "NCHW" + elif axis == 3 or axis == -1: + tf_data_format = "NHWC" + else: + tf_data_format = None + + if ( + tf_data_format == "NHWC" + or tf_data_format == "NCHW" + and _has_nchw_support() + ): + # The mean / var / beta / gamma tensors may be broadcasted + # so they may have extra axes of size 1, which should be squeezed. + if ndim(mean) > 1: + mean = tf.reshape(mean, [-1]) + if ndim(var) > 1: + var = tf.reshape(var, [-1]) + if beta is None: + beta = zeros_like(mean) + elif ndim(beta) > 1: + beta = tf.reshape(beta, [-1]) + if gamma is None: + gamma = ones_like(mean) + elif ndim(gamma) > 1: + gamma = tf.reshape(gamma, [-1]) + y, _, _ = tf.compat.v1.nn.fused_batch_norm( + x, + gamma, + beta, + epsilon=epsilon, + mean=mean, + variance=var, + data_format=tf_data_format, + is_training=False, + ) + return y + return tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon) # SHAPE OPERATIONS -@keras_export('keras.backend.concatenate') +@keras_export("keras.backend.concatenate") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def concatenate(tensors, axis=-1): - """Concatenates a list of tensors alongside the specified axis. - - Args: - tensors: list of tensors to concatenate. - axis: concatenation axis. - - Returns: - A tensor. - - Example: - - >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - >>> b = tf.constant([[10, 20, 30], [40, 50, 60], [70, 80, 90]]) - >>> tf.keras.backend.concatenate((a, b), axis=-1) - - - """ - if axis < 0: - rank = ndim(tensors[0]) - if rank: - axis %= rank - else: - axis = 0 + """Concatenates a list of tensors alongside the specified axis. + + Args: + tensors: list of tensors to concatenate. + axis: concatenation axis. + + Returns: + A tensor. 
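# [Illustration, a minimal sketch outside this patch; assumes TF 2.x and
# NumPy.] Checking `batch_normalization` numerically on the non-fused,
# rank-2 path. Note the underlying op places epsilon inside the square
# root, (x - mean) / sqrt(var + epsilon), which the docstring formula
# above only approximates.
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

x = tf.random.normal((4, 3))
mean = tf.reduce_mean(x, axis=0)
var = tf.math.reduce_variance(x, axis=0)
y = K.batch_normalization(
    x, mean, var, tf.zeros((3,)), tf.ones((3,)), epsilon=1e-3
)
ref = (x - mean) / tf.sqrt(var + 1e-3)
np.testing.assert_allclose(y.numpy(), ref.numpy(), rtol=1e-5, atol=1e-6)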
+ + Example: - if py_all(is_sparse(x) for x in tensors): - return tf.compat.v1.sparse_concat(axis, tensors) - elif py_all(isinstance(x, tf.RaggedTensor) for x in tensors): - return tf.concat(tensors, axis) - else: - return tf.concat([to_dense(x) for x in tensors], axis) + >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> b = tf.constant([[10, 20, 30], [40, 50, 60], [70, 80, 90]]) + >>> tf.keras.backend.concatenate((a, b), axis=-1) + + """ + if axis < 0: + rank = ndim(tensors[0]) + if rank: + axis %= rank + else: + axis = 0 + + if py_all(is_sparse(x) for x in tensors): + return tf.compat.v1.sparse_concat(axis, tensors) + elif py_all(isinstance(x, tf.RaggedTensor) for x in tensors): + return tf.concat(tensors, axis) + else: + return tf.concat([to_dense(x) for x in tensors], axis) -@keras_export('keras.backend.reshape') + +@keras_export("keras.backend.reshape") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def reshape(x, shape): - """Reshapes a tensor to the specified shape. + """Reshapes a tensor to the specified shape. + + Args: + x: Tensor or variable. + shape: Target shape tuple. + + Returns: + A tensor. + + Example: + + >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + >>> a + + >>> tf.keras.backend.reshape(a, shape=(2, 6)) + + + """ + return tf.reshape(x, shape) - Args: - x: Tensor or variable. - shape: Target shape tuple. - Returns: - A tensor. +@keras_export("keras.backend.permute_dimensions") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def permute_dimensions(x, pattern): + """Permutes axes in a tensor. - Example: + Args: + x: Tensor or variable. + pattern: A tuple of + dimension indices, e.g. `(0, 2, 1)`. - >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) - >>> a - - >>> tf.keras.backend.reshape(a, shape=(2, 6)) - + Returns: + A tensor. + + Example: + + >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + >>> a + + >>> tf.keras.backend.permute_dimensions(a, pattern=(1, 0)) + - """ - return tf.reshape(x, shape) + """ + return tf.compat.v1.transpose(x, perm=pattern) -@keras_export('keras.backend.permute_dimensions') +@keras_export("keras.backend.resize_images") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def permute_dimensions(x, pattern): - """Permutes axes in a tensor. +def resize_images( + x, height_factor, width_factor, data_format, interpolation="nearest" +): + """Resizes the images contained in a 4D tensor. + + Args: + x: Tensor or variable to resize. + height_factor: Positive integer. + width_factor: Positive integer. + data_format: One of `"channels_first"`, `"channels_last"`. + interpolation: A string, one of `"area"`, `"bicubic"`, `"bilinear"`, + `"gaussian"`, `"lanczos3"`, `"lanczos5"`, `"mitchellcubic"`, + `"nearest"`. - Args: - x: Tensor or variable. - pattern: A tuple of - dimension indices, e.g. `(0, 2, 1)`. + Returns: + A tensor. - Returns: - A tensor. + Raises: + ValueError: in case of incorrect value for + `data_format` or `interpolation`. 
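# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# The dispatch in `concatenate` above keeps all-sparse inputs sparse;
# mixed or dense inputs fall through to a dense tf.concat.
import tensorflow as tf
from tensorflow.keras import backend as K

a = tf.sparse.from_dense([[1, 0], [0, 2]])
b = tf.sparse.from_dense([[0, 3], [4, 0]])
out = K.concatenate([a, b], axis=1)
print(isinstance(out, tf.SparseTensor))  # True: the sparse branch was taken
print(tf.sparse.to_dense(out).numpy())   # [[1 0 0 3] [0 2 4 0]]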
+ """ + if data_format == "channels_first": + rows, cols = 2, 3 + elif data_format == "channels_last": + rows, cols = 1, 2 + else: + raise ValueError(f"Invalid `data_format` argument: {data_format}") - Example: + new_shape = x.shape[rows : cols + 1] + if new_shape.is_fully_defined(): + new_shape = tf.constant(new_shape.as_list(), dtype="int32") + else: + new_shape = tf.shape(x)[rows : cols + 1] + new_shape *= tf.constant( + np.array([height_factor, width_factor], dtype="int32") + ) - >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) - >>> a - - >>> tf.keras.backend.permute_dimensions(a, pattern=(1, 0)) - + if data_format == "channels_first": + x = permute_dimensions(x, [0, 2, 3, 1]) + interpolations = { + "area": tf.image.ResizeMethod.AREA, + "bicubic": tf.image.ResizeMethod.BICUBIC, + "bilinear": tf.image.ResizeMethod.BILINEAR, + "gaussian": tf.image.ResizeMethod.GAUSSIAN, + "lanczos3": tf.image.ResizeMethod.LANCZOS3, + "lanczos5": tf.image.ResizeMethod.LANCZOS5, + "mitchellcubic": tf.image.ResizeMethod.MITCHELLCUBIC, + "nearest": tf.image.ResizeMethod.NEAREST_NEIGHBOR, + } + interploations_list = '"' + '", "'.join(interpolations.keys()) + '"' + if interpolation in interpolations: + x = tf.image.resize(x, new_shape, method=interpolations[interpolation]) + else: + raise ValueError( + "`interpolation` argument should be one of: " + f'{interploations_list}. Received: "{interpolation}".' + ) + if data_format == "channels_first": + x = permute_dimensions(x, [0, 3, 1, 2]) - """ - return tf.compat.v1.transpose(x, perm=pattern) + return x -@keras_export('keras.backend.resize_images') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def resize_images(x, height_factor, width_factor, data_format, - interpolation='nearest'): - """Resizes the images contained in a 4D tensor. - - Args: - x: Tensor or variable to resize. - height_factor: Positive integer. - width_factor: Positive integer. - data_format: One of `"channels_first"`, `"channels_last"`. - interpolation: A string, one of `"area"`, `"bicubic"`, `"bilinear"`, - `"gaussian"`, `"lanczos3"`, `"lanczos5"`, `"mitchellcubic"`, - `"nearest"`. - - Returns: - A tensor. - - Raises: - ValueError: in case of incorrect value for - `data_format` or `interpolation`. - """ - if data_format == 'channels_first': - rows, cols = 2, 3 - elif data_format == 'channels_last': - rows, cols = 1, 2 - else: - raise ValueError('Invalid `data_format` argument: %s' % (data_format,)) - - new_shape = x.shape[rows:cols + 1] - if new_shape.is_fully_defined(): - new_shape = tf.constant(new_shape.as_list(), dtype='int32') - else: - new_shape = tf.shape(x)[rows:cols + 1] - new_shape *= tf.constant( - np.array([height_factor, width_factor], dtype='int32')) - - if data_format == 'channels_first': - x = permute_dimensions(x, [0, 2, 3, 1]) - interpolations = { - 'area': tf.image.ResizeMethod.AREA, - 'bicubic': tf.image.ResizeMethod.BICUBIC, - 'bilinear': tf.image.ResizeMethod.BILINEAR, - 'gaussian': tf.image.ResizeMethod.GAUSSIAN, - 'lanczos3': tf.image.ResizeMethod.LANCZOS3, - 'lanczos5': tf.image.ResizeMethod.LANCZOS5, - 'mitchellcubic': tf.image.ResizeMethod.MITCHELLCUBIC, - 'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR, - } - interploations_list = '"' + '", "'.join(interpolations.keys()) + '"' - if interpolation in interpolations: - x = tf.image.resize(x, new_shape, method=interpolations[interpolation]) - else: - raise ValueError('`interpolation` argument should be one of: ' - f'{interploations_list}. 
Received: "{interpolation}".') - if data_format == 'channels_first': - x = permute_dimensions(x, [0, 3, 1, 2]) - - return x - - -@keras_export('keras.backend.resize_volumes') +@keras_export("keras.backend.resize_volumes") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): - """Resizes the volume contained in a 5D tensor. - - Args: - x: Tensor or variable to resize. - depth_factor: Positive integer. - height_factor: Positive integer. - width_factor: Positive integer. - data_format: One of `"channels_first"`, `"channels_last"`. - - Returns: - A tensor. - - Raises: - ValueError: if `data_format` is neither - `channels_last` or `channels_first`. - """ - if data_format == 'channels_first': - output = repeat_elements(x, depth_factor, axis=2) - output = repeat_elements(output, height_factor, axis=3) - output = repeat_elements(output, width_factor, axis=4) - return output - elif data_format == 'channels_last': - output = repeat_elements(x, depth_factor, axis=1) - output = repeat_elements(output, height_factor, axis=2) - output = repeat_elements(output, width_factor, axis=3) - return output - else: - raise ValueError('Invalid data_format: ' + str(data_format)) + """Resizes the volume contained in a 5D tensor. + + Args: + x: Tensor or variable to resize. + depth_factor: Positive integer. + height_factor: Positive integer. + width_factor: Positive integer. + data_format: One of `"channels_first"`, `"channels_last"`. + + Returns: + A tensor. + + Raises: + ValueError: if `data_format` is neither + `channels_last` or `channels_first`. + """ + if data_format == "channels_first": + output = repeat_elements(x, depth_factor, axis=2) + output = repeat_elements(output, height_factor, axis=3) + output = repeat_elements(output, width_factor, axis=4) + return output + elif data_format == "channels_last": + output = repeat_elements(x, depth_factor, axis=1) + output = repeat_elements(output, height_factor, axis=2) + output = repeat_elements(output, width_factor, axis=3) + return output + else: + raise ValueError("Invalid data_format: " + str(data_format)) -@keras_export('keras.backend.repeat_elements') +@keras_export("keras.backend.repeat_elements") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def repeat_elements(x, rep, axis): - """Repeats the elements of a tensor along an axis, like `np.repeat`. - - If `x` has shape `(s1, s2, s3)` and `axis` is `1`, the output - will have shape `(s1, s2 * rep, s3)`. - - Args: - x: Tensor or variable. - rep: Python integer, number of times to repeat. - axis: Axis along which to repeat. - - Returns: - A tensor. - - Example: - - >>> b = tf.constant([1, 2, 3]) - >>> tf.keras.backend.repeat_elements(b, rep=2, axis=0) - - - """ - x_shape = x.shape.as_list() - # For static axis - if x_shape[axis] is not None: - # slices along the repeat axis - splits = tf.split(value=x, - num_or_size_splits=x_shape[axis], - axis=axis) - # repeat each slice the given number of reps - x_rep = [s for s in splits for _ in range(rep)] - return concatenate(x_rep, axis) - - # Here we use tf.tile to mimic behavior of np.repeat so that - # we can handle dynamic shapes (that include None). - # To do that, we need an auxiliary axis to repeat elements along - # it and then merge them along the desired axis. 
- - # Repeating - auxiliary_axis = axis + 1 - x_shape = tf.shape(x) - x_rep = tf.expand_dims(x, axis=auxiliary_axis) - reps = np.ones(len(x.shape) + 1) - reps[auxiliary_axis] = rep - x_rep = tf.tile(x_rep, reps) - - # Merging - reps = np.delete(reps, auxiliary_axis) - reps[axis] = rep - reps = tf.constant(reps, dtype='int32') - x_shape *= reps - x_rep = tf.reshape(x_rep, x_shape) - - # Fix shape representation - x_shape = x.shape.as_list() - x_rep.set_shape(x_shape) - x_rep._keras_shape = tuple(x_shape) - return x_rep - - -@keras_export('keras.backend.repeat') + """Repeats the elements of a tensor along an axis, like `np.repeat`. + + If `x` has shape `(s1, s2, s3)` and `axis` is `1`, the output + will have shape `(s1, s2 * rep, s3)`. + + Args: + x: Tensor or variable. + rep: Python integer, number of times to repeat. + axis: Axis along which to repeat. + + Returns: + A tensor. + + Example: + + >>> b = tf.constant([1, 2, 3]) + >>> tf.keras.backend.repeat_elements(b, rep=2, axis=0) + + + """ + x_shape = x.shape.as_list() + # For static axis + if x_shape[axis] is not None: + # slices along the repeat axis + splits = tf.split(value=x, num_or_size_splits=x_shape[axis], axis=axis) + # repeat each slice the given number of reps + x_rep = [s for s in splits for _ in range(rep)] + return concatenate(x_rep, axis) + + # Here we use tf.tile to mimic behavior of np.repeat so that + # we can handle dynamic shapes (that include None). + # To do that, we need an auxiliary axis to repeat elements along + # it and then merge them along the desired axis. + + # Repeating + auxiliary_axis = axis + 1 + x_shape = tf.shape(x) + x_rep = tf.expand_dims(x, axis=auxiliary_axis) + reps = np.ones(len(x.shape) + 1) + reps[auxiliary_axis] = rep + x_rep = tf.tile(x_rep, reps) + + # Merging + reps = np.delete(reps, auxiliary_axis) + reps[axis] = rep + reps = tf.constant(reps, dtype="int32") + x_shape *= reps + x_rep = tf.reshape(x_rep, x_shape) + + # Fix shape representation + x_shape = x.shape.as_list() + x_rep.set_shape(x_shape) + x_rep._keras_shape = tuple(x_shape) + return x_rep + + +@keras_export("keras.backend.repeat") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def repeat(x, n): - """Repeats a 2D tensor. - - if `x` has shape (samples, dim) and `n` is `2`, - the output will have shape `(samples, 2, dim)`. - - Args: - x: Tensor or variable. - n: Python integer, number of times to repeat. + """Repeats a 2D tensor. - Returns: - A tensor. + if `x` has shape (samples, dim) and `n` is `2`, + the output will have shape `(samples, 2, dim)`. - Example: + Args: + x: Tensor or variable. + n: Python integer, number of times to repeat. - >>> b = tf.constant([[1, 2], [3, 4]]) - >>> b - - >>> tf.keras.backend.repeat(b, n=2) - + Returns: + A tensor. + + Example: + + >>> b = tf.constant([[1, 2], [3, 4]]) + >>> b + + >>> tf.keras.backend.repeat(b, n=2) + - """ - assert ndim(x) == 2 - x = tf.expand_dims(x, 1) - pattern = tf.stack([1, n, 1]) - return tf.tile(x, pattern) + """ + assert ndim(x) == 2 + x = tf.expand_dims(x, 1) + pattern = tf.stack([1, n, 1]) + return tf.tile(x, pattern) -@keras_export('keras.backend.arange') +@keras_export("keras.backend.arange") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def arange(start, stop=None, step=1, dtype='int32'): - """Creates a 1D tensor containing a sequence of integers. +def arange(start, stop=None, step=1, dtype="int32"): + """Creates a 1D tensor containing a sequence of integers. 
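# [Illustration, a minimal sketch outside this patch; assumes TF 2.x.]
# `repeat` above is expand_dims(x, 1) followed by tile, turning a 2D
# (samples, dim) tensor into a 3D (samples, n, dim) one.
import tensorflow as tf
from tensorflow.keras import backend as K

b = tf.constant([[1, 2], [3, 4]])
out = K.repeat(b, n=2)
print(out.shape)       # (2, 2, 2): (samples, n, dim)
print(out.numpy()[0])  # [[1 2] [1 2]]: row 0 repeated n times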
-@keras_export('keras.backend.arange')
+@keras_export("keras.backend.arange")
 @tf.__internal__.dispatch.add_dispatch_support
 @doc_controls.do_not_generate_docs
-def arange(start, stop=None, step=1, dtype='int32'):
-  """Creates a 1D tensor containing a sequence of integers.
-
-  The function arguments use the same convention as
-  Theano's arange: if only one argument is provided,
-  it is in fact the "stop" argument and "start" is 0.
-
-  The default type of the returned tensor is `'int32'` to
-  match TensorFlow's default.
-
-  Args:
-      start: Start value.
-      stop: Stop value.
-      step: Difference between two successive values.
-      dtype: Integer dtype to use.
-
-  Returns:
-      An integer tensor.
-
-  Example:
-
-  >>> tf.keras.backend.arange(start=0, stop=10, step=1.5)
-  <tf.Tensor: shape=(7,), dtype=float32,
-      numpy=array([0. , 1.5, 3. , 4.5, 6. , 7.5, 9. ], dtype=float32)>
-
-  """
-  # Match the behavior of numpy and Theano by returning an empty sequence.
-  if stop is None and start < 0:
-    start = 0
-  result = tf.range(start, limit=stop, delta=step, name='arange')
-  if dtype != 'int32':
-    result = cast(result, dtype)
-  return result
+def arange(start, stop=None, step=1, dtype="int32"):
+    """Creates a 1D tensor containing a sequence of integers.
+
+    The function arguments use the same convention as
+    Theano's arange: if only one argument is provided,
+    it is in fact the "stop" argument and "start" is 0.
+
+    The default type of the returned tensor is `'int32'` to
+    match TensorFlow's default.
+
+    Args:
+        start: Start value.
+        stop: Stop value.
+        step: Difference between two successive values.
+        dtype: Integer dtype to use.
+
+    Returns:
+        An integer tensor.
+
+    Example:
+
+    >>> tf.keras.backend.arange(start=0, stop=10, step=1.5)
+    <tf.Tensor: shape=(7,), dtype=float32,
+        numpy=array([0. , 1.5, 3. , 4.5, 6. , 7.5, 9. ], dtype=float32)>
+
+    """
+    # Match the behavior of numpy and Theano by returning an empty sequence.
+    if stop is None and start < 0:
+        start = 0
+    result = tf.range(start, limit=stop, delta=step, name="arange")
+    if dtype != "int32":
+        result = cast(result, dtype)
+    return result


-@keras_export('keras.backend.tile')
+@keras_export("keras.backend.tile")
 @tf.__internal__.dispatch.add_dispatch_support
 @doc_controls.do_not_generate_docs
 def tile(x, n):
-  """Creates a tensor by tiling `x` by `n`.
-
-  Args:
-      x: A tensor or variable
-      n: A list of integer. The length must be the same as the number of
-          dimensions in `x`.
-
-  Returns:
-      A tiled tensor.
-  """
-  if isinstance(n, int):
-    n = [n]
-  return tf.tile(x, n)
+    """Creates a tensor by tiling `x` by `n`.
+
+    Args:
+        x: A tensor or variable.
+        n: A list of integers. The length must be the same as the number of
+            dimensions in `x`.
+
+    Returns:
+        A tiled tensor.
+    """
+    if isinstance(n, int):
+        n = [n]
+    return tf.tile(x, n)


-@keras_export('keras.backend.flatten')
+@keras_export("keras.backend.flatten")
 @tf.__internal__.dispatch.add_dispatch_support
 @doc_controls.do_not_generate_docs
 def flatten(x):
-  """Flatten a tensor.
-
-  Args:
-      x: A tensor or variable.
-
-  Returns:
-      A tensor, reshaped into 1-D
-
-  Example:
-
-  >>> b = tf.constant([[1, 2], [3, 4]])
-  >>> b
-  <tf.Tensor: shape=(2, 2), dtype=int32, numpy=
-  array([[1, 2],
-         [3, 4]], dtype=int32)>
-  >>> tf.keras.backend.flatten(b)
-  <tf.Tensor: shape=(4,), dtype=int32,
-      numpy=array([1, 2, 3, 4], dtype=int32)>
-
-  """
-  return tf.reshape(x, [-1])
+    """Flatten a tensor.
+
+    Args:
+        x: A tensor or variable.
+
+    Returns:
+        A tensor, reshaped into 1-D.
+
+    Example:
+
+    >>> b = tf.constant([[1, 2], [3, 4]])
+    >>> b
+    <tf.Tensor: shape=(2, 2), dtype=int32, numpy=
+    array([[1, 2],
+           [3, 4]], dtype=int32)>
+    >>> tf.keras.backend.flatten(b)
+    <tf.Tensor: shape=(4,), dtype=int32,
+        numpy=array([1, 2, 3, 4], dtype=int32)>
+
+    """
+    return tf.reshape(x, [-1])


-@keras_export('keras.backend.batch_flatten')
+@keras_export("keras.backend.batch_flatten")
 @tf.__internal__.dispatch.add_dispatch_support
 @doc_controls.do_not_generate_docs
 def batch_flatten(x):
-  """Turn a nD tensor into a 2D tensor with same 0th dimension.
+    """Turn a nD tensor into a 2D tensor with same 0th dimension.
-  In other words, it flattens each data samples of a batch.
+    In other words, it flattens each data sample of a batch.
-  Args:
-      x: A tensor or variable.
+    Args:
+        x: A tensor or variable.
-  Returns:
-      A tensor.
+    Returns:
+        A tensor.
-  Examples:
-    Flattening a 3D tensor to 2D by collapsing the last dimension.
+    Examples:
+      Flattening a 3D tensor to 2D by collapsing the last dimension.
- >>> x_batch = tf.keras.backend.ones(shape=(2, 3, 4, 5)) - >>> x_batch_flatten = batch_flatten(x_batch) - >>> tf.keras.backend.int_shape(x_batch_flatten) - (2, 60) + >>> x_batch = tf.keras.backend.ones(shape=(2, 3, 4, 5)) + >>> x_batch_flatten = batch_flatten(x_batch) + >>> tf.keras.backend.int_shape(x_batch_flatten) + (2, 60) - """ - x = tf.reshape(x, tf.stack([-1, prod(shape(x)[1:])])) - return x + """ + x = tf.reshape(x, tf.stack([-1, prod(shape(x)[1:])])) + return x -@keras_export('keras.backend.expand_dims') +@keras_export("keras.backend.expand_dims") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def expand_dims(x, axis=-1): - """Adds a 1-sized dimension at index "axis". + """Adds a 1-sized dimension at index "axis". - Args: - x: A tensor or variable. - axis: Position where to add a new axis. + Args: + x: A tensor or variable. + axis: Position where to add a new axis. - Returns: - A tensor with expanded dimensions. - """ - return tf.expand_dims(x, axis) + Returns: + A tensor with expanded dimensions. + """ + return tf.expand_dims(x, axis) -@keras_export('keras.backend.squeeze') +@keras_export("keras.backend.squeeze") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def squeeze(x, axis): - """Removes a 1-dimension from the tensor at index "axis". + """Removes a 1-dimension from the tensor at index "axis". - Args: - x: A tensor or variable. - axis: Axis to drop. + Args: + x: A tensor or variable. + axis: Axis to drop. - Returns: - A tensor with the same data as `x` but reduced dimensions. - """ - return tf.squeeze(x, [axis]) + Returns: + A tensor with the same data as `x` but reduced dimensions. + """ + return tf.squeeze(x, [axis]) -@keras_export('keras.backend.temporal_padding') +@keras_export("keras.backend.temporal_padding") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def temporal_padding(x, padding=(1, 1)): - """Pads the middle dimension of a 3D tensor. + """Pads the middle dimension of a 3D tensor. - Args: - x: Tensor or variable. - padding: Tuple of 2 integers, how many zeros to - add at the start and end of dim 1. + Args: + x: Tensor or variable. + padding: Tuple of 2 integers, how many zeros to + add at the start and end of dim 1. - Returns: - A padded 3D tensor. - """ - assert len(padding) == 2 - pattern = [[0, 0], [padding[0], padding[1]], [0, 0]] - return tf.compat.v1.pad(x, pattern) + Returns: + A padded 3D tensor. + """ + assert len(padding) == 2 + pattern = [[0, 0], [padding[0], padding[1]], [0, 0]] + return tf.compat.v1.pad(x, pattern) -@keras_export('keras.backend.spatial_2d_padding') +@keras_export("keras.backend.spatial_2d_padding") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): - """Pads the 2nd and 3rd dimensions of a 4D tensor. - - Args: - x: Tensor or variable. - padding: Tuple of 2 tuples, padding pattern. - data_format: One of `channels_last` or `channels_first`. - - Returns: - A padded 4D tensor. - - Raises: - ValueError: if `data_format` is neither - `channels_last` or `channels_first`. 
- """ - assert len(padding) == 2 - assert len(padding[0]) == 2 - assert len(padding[1]) == 2 - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - if data_format == 'channels_first': - pattern = [[0, 0], [0, 0], list(padding[0]), list(padding[1])] - else: - pattern = [[0, 0], list(padding[0]), list(padding[1]), [0, 0]] - return tf.compat.v1.pad(x, pattern) - - -@keras_export('keras.backend.spatial_3d_padding') + """Pads the 2nd and 3rd dimensions of a 4D tensor. + + Args: + x: Tensor or variable. + padding: Tuple of 2 tuples, padding pattern. + data_format: One of `channels_last` or `channels_first`. + + Returns: + A padded 4D tensor. + + Raises: + ValueError: if `data_format` is neither + `channels_last` or `channels_first`. + """ + assert len(padding) == 2 + assert len(padding[0]) == 2 + assert len(padding[1]) == 2 + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + if data_format == "channels_first": + pattern = [[0, 0], [0, 0], list(padding[0]), list(padding[1])] + else: + pattern = [[0, 0], list(padding[0]), list(padding[1]), [0, 0]] + return tf.compat.v1.pad(x, pattern) + + +@keras_export("keras.backend.spatial_3d_padding") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): - """Pads 5D tensor with zeros along the depth, height, width dimensions. - - Pads these dimensions with respectively - "padding[0]", "padding[1]" and "padding[2]" zeros left and right. - - For 'channels_last' data_format, - the 2nd, 3rd and 4th dimension will be padded. - For 'channels_first' data_format, - the 3rd, 4th and 5th dimension will be padded. - - Args: - x: Tensor or variable. - padding: Tuple of 3 tuples, padding pattern. - data_format: One of `channels_last` or `channels_first`. - - Returns: - A padded 5D tensor. - - Raises: - ValueError: if `data_format` is neither - `channels_last` or `channels_first`. - - """ - assert len(padding) == 3 - assert len(padding[0]) == 2 - assert len(padding[1]) == 2 - assert len(padding[2]) == 2 - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - if data_format == 'channels_first': - pattern = [[0, 0], [0, 0], [padding[0][0], padding[0][1]], - [padding[1][0], padding[1][1]], [padding[2][0], padding[2][1]]] - else: - pattern = [[0, 0], [padding[0][0], padding[0][1]], - [padding[1][0], padding[1][1]], [padding[2][0], - padding[2][1]], [0, 0]] - return tf.compat.v1.pad(x, pattern) - - -@keras_export('keras.backend.stack') + """Pads 5D tensor with zeros along the depth, height, width dimensions. + + Pads these dimensions with respectively + "padding[0]", "padding[1]" and "padding[2]" zeros left and right. + + For 'channels_last' data_format, + the 2nd, 3rd and 4th dimension will be padded. + For 'channels_first' data_format, + the 3rd, 4th and 5th dimension will be padded. + + Args: + x: Tensor or variable. + padding: Tuple of 3 tuples, padding pattern. + data_format: One of `channels_last` or `channels_first`. + + Returns: + A padded 5D tensor. + + Raises: + ValueError: if `data_format` is neither + `channels_last` or `channels_first`. 
+ + """ + assert len(padding) == 3 + assert len(padding[0]) == 2 + assert len(padding[1]) == 2 + assert len(padding[2]) == 2 + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + if data_format == "channels_first": + pattern = [ + [0, 0], + [0, 0], + [padding[0][0], padding[0][1]], + [padding[1][0], padding[1][1]], + [padding[2][0], padding[2][1]], + ] + else: + pattern = [ + [0, 0], + [padding[0][0], padding[0][1]], + [padding[1][0], padding[1][1]], + [padding[2][0], padding[2][1]], + [0, 0], + ] + return tf.compat.v1.pad(x, pattern) + + +@keras_export("keras.backend.stack") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def stack(x, axis=0): - """Stacks a list of rank `R` tensors into a rank `R+1` tensor. + """Stacks a list of rank `R` tensors into a rank `R+1` tensor. - Args: - x: List of tensors. - axis: Axis along which to perform stacking. + Args: + x: List of tensors. + axis: Axis along which to perform stacking. - Returns: - A tensor. + Returns: + A tensor. - Example: + Example: - >>> a = tf.constant([[1, 2],[3, 4]]) - >>> b = tf.constant([[10, 20],[30, 40]]) - >>> tf.keras.backend.stack((a, b)) - + >>> a = tf.constant([[1, 2],[3, 4]]) + >>> b = tf.constant([[10, 20],[30, 40]]) + >>> tf.keras.backend.stack((a, b)) + - """ - return tf.stack(x, axis=axis) + """ + return tf.stack(x, axis=axis) -@keras_export('keras.backend.one_hot') +@keras_export("keras.backend.one_hot") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def one_hot(indices, num_classes): - """Computes the one-hot representation of an integer tensor. + """Computes the one-hot representation of an integer tensor. - Args: - indices: nD integer tensor of shape - `(batch_size, dim1, dim2, ... dim(n-1))` - num_classes: Integer, number of classes to consider. + Args: + indices: nD integer tensor of shape + `(batch_size, dim1, dim2, ... dim(n-1))` + num_classes: Integer, number of classes to consider. - Returns: - (n + 1)D one hot representation of the input - with shape `(batch_size, dim1, dim2, ... dim(n-1), num_classes)` + Returns: + (n + 1)D one hot representation of the input + with shape `(batch_size, dim1, dim2, ... dim(n-1), num_classes)` - Returns: - The one-hot tensor. - """ - return tf.one_hot(indices, depth=num_classes, axis=-1) + Returns: + The one-hot tensor. + """ + return tf.one_hot(indices, depth=num_classes, axis=-1) -@keras_export('keras.backend.reverse') +@keras_export("keras.backend.reverse") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def reverse(x, axes): - """Reverse a tensor along the specified axes. + """Reverse a tensor along the specified axes. - Args: - x: Tensor to reverse. - axes: Integer or iterable of integers. - Axes to reverse. + Args: + x: Tensor to reverse. + axes: Integer or iterable of integers. + Axes to reverse. - Returns: - A tensor. - """ - if isinstance(axes, int): - axes = [axes] - return tf.reverse(x, axes) + Returns: + A tensor. + """ + if isinstance(axes, int): + axes = [axes] + return tf.reverse(x, axes) # VALUE MANIPULATION @@ -4028,2630 +4188,2893 @@ def reverse(x, axes): >>> v.assign_add(1.) >>> print(v.numpy()) - 3.0"""[3:] # Prune first newline and indent to match the docstring template. + 3.0"""[ + 3: +] # Prune first newline and indent to match the docstring template. 
-@keras_export('keras.backend.get_value') +@keras_export("keras.backend.get_value") @doc_controls.do_not_generate_docs def get_value(x): - """Returns the value of a variable. + """Returns the value of a variable. - `backend.get_value` is the complement of `backend.set_value`, and provides - a generic interface for reading from variables while abstracting away the - differences between TensorFlow 1.x and 2.x semantics. + `backend.get_value` is the complement of `backend.set_value`, and provides + a generic interface for reading from variables while abstracting away the + differences between TensorFlow 1.x and 2.x semantics. - {snippet} + {snippet} - Args: - x: input variable. + Args: + x: input variable. - Returns: - A Numpy array. - """ - if not tf.is_tensor(x): - return x - if tf.executing_eagerly() or isinstance(x, tf.__internal__.EagerTensor): - return x.numpy() - if not getattr(x, '_in_graph_mode', True): - # This is a variable which was created in an eager context, but is being - # evaluated from a Graph. - with tf.__internal__.eager_context.eager_mode(): - return x.numpy() - - if tf.compat.v1.executing_eagerly_outside_functions(): - # This method of evaluating works inside the Keras FuncGraph. - with tf.init_scope(): - return x.numpy() + Returns: + A Numpy array. + """ + if not tf.is_tensor(x): + return x + if tf.executing_eagerly() or isinstance(x, tf.__internal__.EagerTensor): + return x.numpy() + if not getattr(x, "_in_graph_mode", True): + # This is a variable which was created in an eager context, but is being + # evaluated from a Graph. + with tf.__internal__.eager_context.eager_mode(): + return x.numpy() + + if tf.compat.v1.executing_eagerly_outside_functions(): + # This method of evaluating works inside the Keras FuncGraph. + with tf.init_scope(): + return x.numpy() - with x.graph.as_default(): - return x.eval(session=get_session((x,))) + with x.graph.as_default(): + return x.eval(session=get_session((x,))) -@keras_export('keras.backend.batch_get_value') +@keras_export("keras.backend.batch_get_value") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def batch_get_value(tensors): - """Returns the value of more than one tensor variable. + """Returns the value of more than one tensor variable. - Args: - tensors: list of ops to run. + Args: + tensors: list of ops to run. - Returns: - A list of Numpy arrays. + Returns: + A list of Numpy arrays. - Raises: - RuntimeError: If this method is called inside defun. - """ - if tf.executing_eagerly(): - return [x.numpy() for x in tensors] - elif tf.inside_function(): # pylint: disable=protected-access - raise RuntimeError('Cannot get value inside Tensorflow graph function.') - if tensors: - return get_session(tensors).run(tensors) - else: - return [] + Raises: + RuntimeError: If this method is called inside defun. + """ + if tf.executing_eagerly(): + return [x.numpy() for x in tensors] + elif tf.inside_function(): + raise RuntimeError("Cannot get value inside Tensorflow graph function.") + if tensors: + return get_session(tensors).run(tensors) + else: + return [] -@keras_export('keras.backend.set_value') +@keras_export("keras.backend.set_value") @doc_controls.do_not_generate_docs def set_value(x, value): - """Sets the value of a variable, from a Numpy array. - - `backend.set_value` is the complement of `backend.get_value`, and provides - a generic interface for assigning to variables while abstracting away the - differences between TensorFlow 1.x and 2.x semantics. 
- - {snippet} - - Args: - x: Variable to set to a new value. - value: Value to set the tensor to, as a Numpy array - (of the same shape). - """ - value = np.asarray(value, dtype=dtype_numpy(x)) - if tf.compat.v1.executing_eagerly_outside_functions(): - x.assign(value) - else: - with get_graph().as_default(): - tf_dtype = tf.as_dtype(x.dtype.name.split('_')[0]) - if hasattr(x, '_assign_placeholder'): - assign_placeholder = x._assign_placeholder - assign_op = x._assign_op - else: - # In order to support assigning weights to resizable variables in - # Keras, we make a placeholder with the correct number of dimensions - # but with None in each dimension. This way, we can assign weights - # of any size (as long as they have the correct dimensionality). - placeholder_shape = tf.TensorShape([None] * value.ndim) - assign_placeholder = tf.compat.v1.placeholder( - tf_dtype, shape=placeholder_shape) - assign_op = x.assign(assign_placeholder) - x._assign_placeholder = assign_placeholder - x._assign_op = assign_op - get_session().run(assign_op, feed_dict={assign_placeholder: value}) - - -@keras_export('keras.backend.batch_set_value') + """Sets the value of a variable, from a Numpy array. + + `backend.set_value` is the complement of `backend.get_value`, and provides + a generic interface for assigning to variables while abstracting away the + differences between TensorFlow 1.x and 2.x semantics. + + {snippet} + + Args: + x: Variable to set to a new value. + value: Value to set the tensor to, as a Numpy array + (of the same shape). + """ + value = np.asarray(value, dtype=dtype_numpy(x)) + if tf.compat.v1.executing_eagerly_outside_functions(): + _assign_value_to_variable(x, value) + else: + with get_graph().as_default(): + tf_dtype = tf.as_dtype(x.dtype.name.split("_")[0]) + if hasattr(x, "_assign_placeholder"): + assign_placeholder = x._assign_placeholder + assign_op = x._assign_op + else: + # In order to support assigning weights to resizable variables + # in Keras, we make a placeholder with the correct number of + # dimensions but with None in each dimension. This way, we can + # assign weights of any size (as long as they have the correct + # dimensionality). + placeholder_shape = tf.TensorShape([None] * value.ndim) + assign_placeholder = tf.compat.v1.placeholder( + tf_dtype, shape=placeholder_shape + ) + assign_op = x.assign(assign_placeholder) + x._assign_placeholder = assign_placeholder + x._assign_op = assign_op + get_session().run(assign_op, feed_dict={assign_placeholder: value}) + + +@keras_export("keras.backend.batch_set_value") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def batch_set_value(tuples): - """Sets the values of many tensor variables at once. - - Args: - tuples: a list of tuples `(tensor, value)`. - `value` should be a Numpy array. - """ - if tf.executing_eagerly() or tf.inside_function(): - for x, value in tuples: - x.assign(np.asarray(value, dtype=dtype_numpy(x))) - else: - with get_graph().as_default(): - if tuples: - assign_ops = [] - feed_dict = {} + """Sets the values of many tensor variables at once. + + Args: + tuples: a list of tuples `(tensor, value)`. + `value` should be a Numpy array. 
+ """ + if tf.executing_eagerly() or tf.inside_function(): for x, value in tuples: - value = np.asarray(value, dtype=dtype_numpy(x)) - tf_dtype = tf.as_dtype(x.dtype.name.split('_')[0]) - if hasattr(x, '_assign_placeholder'): - assign_placeholder = x._assign_placeholder - assign_op = x._assign_op - else: - # In order to support assigning weights to resizable variables in - # Keras, we make a placeholder with the correct number of dimensions - # but with None in each dimension. This way, we can assign weights - # of any size (as long as they have the correct dimensionality). - placeholder_shape = tf.TensorShape([None] * value.ndim) - assign_placeholder = tf.compat.v1.placeholder( - tf_dtype, shape=placeholder_shape) - assign_op = x.assign(assign_placeholder) - x._assign_placeholder = assign_placeholder - x._assign_op = assign_op - assign_ops.append(assign_op) - feed_dict[assign_placeholder] = value - get_session().run(assign_ops, feed_dict=feed_dict) + value = np.asarray(value, dtype=dtype_numpy(x)) + _assign_value_to_variable(x, value) + else: + with get_graph().as_default(): + if tuples: + assign_ops = [] + feed_dict = {} + for x, value in tuples: + value = np.asarray(value, dtype=dtype_numpy(x)) + tf_dtype = tf.as_dtype(x.dtype.name.split("_")[0]) + if hasattr(x, "_assign_placeholder"): + assign_placeholder = x._assign_placeholder + assign_op = x._assign_op + else: + # In order to support assigning weights to resizable + # variables in Keras, we make a placeholder with the + # correct number of dimensions but with None in each + # dimension. This way, we can assign weights of any size + # (as long as they have the correct dimensionality). + placeholder_shape = tf.TensorShape([None] * value.ndim) + assign_placeholder = tf.compat.v1.placeholder( + tf_dtype, shape=placeholder_shape + ) + assign_op = x.assign(assign_placeholder) + x._assign_placeholder = assign_placeholder + x._assign_op = assign_op + assign_ops.append(assign_op) + feed_dict[assign_placeholder] = value + get_session().run(assign_ops, feed_dict=feed_dict) get_value.__doc__ = get_value.__doc__.format(snippet=_VALUE_SET_CODE_STRING) set_value.__doc__ = set_value.__doc__.format(snippet=_VALUE_SET_CODE_STRING) -@keras_export('keras.backend.print_tensor') +def _assign_value_to_variable(variable, value): + # Helper function to assign value to variable. It handles normal tf.Variable + # as well as DTensor variable. + if isinstance(variable, dtensor.DVariable): + mesh = variable.layout.mesh + replicate_layout = dtensor.Layout.replicated( + rank=variable.shape.rank, mesh=mesh + ) + # TODO(b/262894693): Avoid the broadcast of tensor to all devices. + d_value = dtensor.copy_to_mesh(value, replicate_layout) + d_value = dtensor.relayout(d_value, variable.layout) + variable.assign(d_value) + else: + # For the normal tf.Variable assign + variable.assign(value) + + +@keras_export("keras.backend.print_tensor") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def print_tensor(x, message='', summarize=3): - """Prints `message` and the tensor value when evaluated. - - Note that `print_tensor` returns a new tensor identical to `x` - which should be used in the following code. Otherwise the - print operation is not taken into account during evaluation. - - Example: - - >>> x = tf.constant([[1.0, 2.0], [3.0, 4.0]]) - >>> tf.keras.backend.print_tensor(x) - - - Args: - x: Tensor to print. - message: Message to print jointly with the tensor. 
-      summarize: The first and last `summarize` elements within each dimension
-          are recursively printed per Tensor. If None, then the first 3 and last
-          3 elements of each dimension are printed for each tensor. If set to
-          -1, it will print all elements of every tensor.
-
-  Returns:
-      The same tensor `x`, unchanged.
-  """
-  if isinstance(x, tf.Tensor) and hasattr(x, 'graph'):
-    with get_graph().as_default():
-      op = tf.print(
-          message, x, output_stream=sys.stdout, summarize=summarize)
-      with tf.control_dependencies([op]):
-        return tf.identity(x)
-  else:
-    tf.print(
-        message, x, output_stream=sys.stdout, summarize=summarize)
-    return x
+def print_tensor(x, message="", summarize=3):
+    """Prints `message` and the tensor value when evaluated.
+
+    Note that `print_tensor` returns a new tensor identical to `x`
+    which should be used in the following code. Otherwise the
+    print operation is not taken into account during evaluation.
+
+    Example:
+
+    >>> x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+    >>> tf.keras.backend.print_tensor(x)
+    <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+      array([[1., 2.],
+             [3., 4.]], dtype=float32)>
+
+    Args:
+        x: Tensor to print.
+        message: Message to print jointly with the tensor.
+        summarize: The first and last `summarize` elements within each dimension
+            are recursively printed per Tensor. If None, then the first 3 and
+            last 3 elements of each dimension are printed for each tensor. If
+            set to -1, it will print all elements of every tensor.
+
+    Returns:
+        The same tensor `x`, unchanged.
+    """
+    if isinstance(x, tf.Tensor) and hasattr(x, "graph"):
+        with get_graph().as_default():
+            op = tf.print(
+                message, x, output_stream=sys.stdout, summarize=summarize
+            )
+            with tf.control_dependencies([op]):
+                return tf.identity(x)
+    else:
+        tf.print(message, x, output_stream=sys.stdout, summarize=summarize)
+        return x
+

 # GRAPH MANIPULATION


 class GraphExecutionFunction:
-  """Runs a computation graph.
-
-  It's possible to pass arguments to `tf.Session.run()` via `session_kwargs`.
-  In particular additional operations via `fetches` argument and additional
-  tensor substitutions via `feed_dict` arguments. Note that given
-  substitutions are merged with substitutions from `inputs`. Even though
-  `feed_dict` is passed once in the constructor (called in `model.compile()`)
-  we can modify the values in the dictionary. Through this feed_dict we can
-  provide additional substitutions besides Keras inputs.
-
-  Args:
-      inputs: Feed placeholders to the computation graph.
-      outputs: Output tensors to fetch.
-      updates: Additional update ops to be run at function call.
-      name: A name to help users identify what this function does.
-      session_kwargs: Arguments to `tf.Session.run()`:
-          `fetches`, `feed_dict`, `options`, `run_metadata`.
-  """
-
-  def __init__(self, inputs, outputs, updates=None, name=None,
-               **session_kwargs):
-    updates = updates or []
-    if not isinstance(updates, (list, tuple)):
-      raise TypeError('`updates` in a Keras backend function '
-                      'should be a list or tuple.')
-
-    self._inputs_structure = inputs
-    self.inputs = tf.nest.flatten(inputs, expand_composites=True)
-    self._outputs_structure = outputs
-    self.outputs = cast_variables_to_tensor(
-        tf.nest.flatten(outputs, expand_composites=True))
-    # TODO(b/127668432): Consider using autograph to generate these
-    # dependencies in call.
-    # Index 0 = total loss or model output for `predict`.
-    with tf.control_dependencies([self.outputs[0]]):
-      updates_ops = []
-      for update in updates:
-        if isinstance(update, tuple):
-          p, new_p = update
-          updates_ops.append(tf.compat.v1.assign(p, new_p))
+    """Runs a computation graph.
+ + It's possible to pass arguments to `tf.Session.run()` via `session_kwargs`. + In particular additional operations via `fetches` argument and additional + tensor substitutions via `feed_dict` arguments. Note that given + substitutions are merged with substitutions from `inputs`. Even though + `feed_dict` is passed once in the constructor (called in `model.compile()`) + we can modify the values in the dictionary. Through this feed_dict we can + provide additional substitutions besides Keras inputs. + + Args: + inputs: Feed placeholders to the computation graph. + outputs: Output tensors to fetch. + updates: Additional update ops to be run at function call. + name: A name to help users identify what this function does. + session_kwargs: Arguments to `tf.Session.run()`: + `fetches`, `feed_dict`, `options`, `run_metadata`. + """ + + def __init__( + self, inputs, outputs, updates=None, name=None, **session_kwargs + ): + updates = updates or [] + if not isinstance(updates, (list, tuple)): + raise TypeError( + "`updates` in a Keras backend function " + "should be a list or tuple." + ) + + self.inputs = tf.nest.flatten( + tf_utils.convert_variables_to_tensors(inputs), + expand_composites=True, + ) + self._outputs_structure = tf_utils.convert_variables_to_tensors(outputs) + self.outputs = tf.nest.flatten( + self._outputs_structure, expand_composites=True + ) + # TODO(b/127668432): Consider using autograph to generate these + # dependencies in call. + # Index 0 = total loss or model output for `predict`. + with tf.control_dependencies([self.outputs[0]]): + updates_ops = [] + for update in updates: + if isinstance(update, tuple): + p, new_p = update + updates_ops.append(tf.compat.v1.assign(p, new_p)) + else: + # assumed already an op + updates_ops.append(update) + self.updates_op = tf.group(*updates_ops) + self.name = name + # additional tensor substitutions + self.feed_dict = session_kwargs.pop("feed_dict", None) + # additional operations + self.fetches = session_kwargs.pop("fetches", []) + if not isinstance(self.fetches, list): + self.fetches = [self.fetches] + self.run_options = session_kwargs.pop("options", None) + self.run_metadata = session_kwargs.pop("run_metadata", None) + # The main use case of `fetches` being passed to a model is the ability + # to run custom updates + # This requires us to wrap fetches in `identity` ops. + self.fetches = [tf.identity(x) for x in self.fetches] + self.session_kwargs = session_kwargs + # This mapping keeps track of the function that should receive the + # output from a fetch in `fetches`: { fetch: function(fetch_output) } + # A Callback can use this to register a function with access to the + # output values for a fetch it added. + self.fetch_callbacks = {} + + if session_kwargs: + raise ValueError( + "Some keys in session_kwargs are not supported at this time: %s" + % (session_kwargs.keys(),) + ) + + self._callable_fn = None + self._feed_arrays = None + self._feed_symbols = None + self._symbol_vals = None + self._fetches = None + self._session = None + + def _make_callable(self, feed_arrays, feed_symbols, symbol_vals, session): + """Generates a callable that runs the graph. + + Args: + feed_arrays: List of input tensors to be fed Numpy arrays at runtime. + feed_symbols: List of input tensors to be fed symbolic tensors at + runtime. + symbol_vals: List of symbolic tensors to be fed to `feed_symbols`. + session: Session to use to generate the callable. + + Returns: + Function that runs the graph according to the above options. 
+ """ + # Prepare callable options. + callable_opts = config_pb2.CallableOptions() + # Handle external-data feed. + for x in feed_arrays: + callable_opts.feed.append(x.name) + if self.feed_dict: + for key in sorted(self.feed_dict.keys()): + callable_opts.feed.append(key.name) + # Handle symbolic feed. + for x, y in zip(feed_symbols, symbol_vals): + connection = callable_opts.tensor_connection.add() + if x.dtype != y.dtype: + y = tf.cast(y, dtype=x.dtype) + from_tensor = _as_graph_element(y) + if from_tensor is None: + from_tensor = y + connection.from_tensor = from_tensor.name # Data tensor + connection.to_tensor = x.name # Placeholder + # Handle fetches. + for x in self.outputs + self.fetches: + callable_opts.fetch.append(x.name) + # Handle updates. + callable_opts.target.append(self.updates_op.name) + # Handle run_options. + if self.run_options: + callable_opts.run_options.CopyFrom(self.run_options) + # Create callable. + callable_fn = session._make_callable_from_options(callable_opts) + # Cache parameters corresponding to the generated callable, so that + # we can detect future mismatches and refresh the callable. + self._callable_fn = callable_fn + self._feed_arrays = feed_arrays + self._feed_symbols = feed_symbols + self._symbol_vals = symbol_vals + self._fetches = list(self.fetches) + self._session = session + + def _call_fetch_callbacks(self, fetches_output): + for fetch, output in zip(self._fetches, fetches_output): + if fetch in self.fetch_callbacks: + self.fetch_callbacks[fetch](output) + + def _eval_if_composite(self, tensor): + """Helper method which evaluates any CompositeTensors passed to it.""" + # We need to evaluate any composite tensor objects that have been + # reconstructed in 'pack_sequence_as', since otherwise they'll be output + # as actual CompositeTensor objects instead of the value(s) contained in + # the CompositeTensors. E.g., if output_structure contains a + # SparseTensor, then this ensures that we return its value as a + # SparseTensorValue rather than a SparseTensor. + + if tf_utils.is_extension_type(tensor): + return self._session.run(tensor) else: - # assumed already an op - updates_ops.append(update) - self.updates_op = tf.group(*updates_ops) - self.name = name - # additional tensor substitutions - self.feed_dict = session_kwargs.pop('feed_dict', None) - # additional operations - self.fetches = session_kwargs.pop('fetches', []) - if not isinstance(self.fetches, list): - self.fetches = [self.fetches] - self.run_options = session_kwargs.pop('options', None) - self.run_metadata = session_kwargs.pop('run_metadata', None) - # The main use case of `fetches` being passed to a model is the ability - # to run custom updates - # This requires us to wrap fetches in `identity` ops. - self.fetches = [tf.identity(x) for x in self.fetches] - self.session_kwargs = session_kwargs - # This mapping keeps track of the function that should receive the - # output from a fetch in `fetches`: { fetch: function(fetch_output) } - # A Callback can use this to register a function with access to the - # output values for a fetch it added. - self.fetch_callbacks = {} - - if session_kwargs: - raise ValueError('Some keys in session_kwargs are not supported at this ' - 'time: %s' % (session_kwargs.keys(),)) - - self._callable_fn = None - self._feed_arrays = None - self._feed_symbols = None - self._symbol_vals = None - self._fetches = None - self._session = None - - def _make_callable(self, feed_arrays, feed_symbols, symbol_vals, session): - """Generates a callable that runs the graph. 
- - Args: - feed_arrays: List of input tensors to be fed Numpy arrays at runtime. - feed_symbols: List of input tensors to be fed symbolic tensors at runtime. - symbol_vals: List of symbolic tensors to be fed to `feed_symbols`. - session: Session to use to generate the callable. - - Returns: - Function that runs the graph according to the above options. - """ - # Prepare callable options. - callable_opts = config_pb2.CallableOptions() - # Handle external-data feed. - for x in feed_arrays: - callable_opts.feed.append(x.name) - if self.feed_dict: - for key in sorted(self.feed_dict.keys()): - callable_opts.feed.append(key.name) - # Handle symbolic feed. - for x, y in zip(feed_symbols, symbol_vals): - connection = callable_opts.tensor_connection.add() - if x.dtype != y.dtype: - y = tf.cast(y, dtype=x.dtype) - from_tensor = _as_graph_element(y) - if from_tensor is None: - from_tensor = y - connection.from_tensor = from_tensor.name # Data tensor - connection.to_tensor = x.name # Placeholder - # Handle fetches. - for x in self.outputs + self.fetches: - callable_opts.fetch.append(x.name) - # Handle updates. - callable_opts.target.append(self.updates_op.name) - # Handle run_options. - if self.run_options: - callable_opts.run_options.CopyFrom(self.run_options) - # Create callable. - callable_fn = session._make_callable_from_options(callable_opts) - # Cache parameters corresponding to the generated callable, so that - # we can detect future mismatches and refresh the callable. - self._callable_fn = callable_fn - self._feed_arrays = feed_arrays - self._feed_symbols = feed_symbols - self._symbol_vals = symbol_vals - self._fetches = list(self.fetches) - self._session = session - - def _call_fetch_callbacks(self, fetches_output): - for fetch, output in zip(self._fetches, fetches_output): - if fetch in self.fetch_callbacks: - self.fetch_callbacks[fetch](output) - - def _eval_if_composite(self, tensor): - """Helper method which evaluates any CompositeTensors passed to it.""" - # We need to evaluate any composite tensor objects that have been - # reconstructed in 'pack_sequence_as', since otherwise they'll be output as - # actual CompositeTensor objects instead of the value(s) contained in the - # CompositeTensors. E.g., if output_structure contains a SparseTensor, then - # this ensures that we return its value as a SparseTensorValue rather than - # a SparseTensor. - from keras.utils import tf_utils # pylint: disable=g-import-not-at-top - if tf_utils.is_extension_type(tensor): - return self._session.run(tensor) - else: - return tensor - - def __call__(self, inputs): - inputs = tf.nest.flatten(inputs, expand_composites=True) - - session = get_session(inputs) - feed_arrays = [] - array_vals = [] - feed_symbols = [] - symbol_vals = [] - for tensor, value in zip(self.inputs, inputs): - if value is None: - continue - - if tf.is_tensor(value): - # Case: feeding symbolic tensor. - feed_symbols.append(tensor) - symbol_vals.append(value) - else: - # Case: feeding Numpy array. - feed_arrays.append(tensor) - # We need to do array conversion and type casting at this level, since - # `callable_fn` only supports exact matches. - tensor_type = tf.as_dtype(tensor.dtype) - array_vals.append(np.asarray(value, - dtype=tensor_type.as_numpy_dtype)) - - if self.feed_dict: - for key in sorted(self.feed_dict.keys()): - array_vals.append( - np.asarray(self.feed_dict[key], dtype=key.dtype.as_numpy_dtype)) - - # Refresh callable if anything has changed. 
- if (self._callable_fn is None or feed_arrays != self._feed_arrays or - symbol_vals != self._symbol_vals or - feed_symbols != self._feed_symbols or self.fetches != self._fetches or - session != self._session): - self._make_callable(feed_arrays, feed_symbols, symbol_vals, session) - - fetched = self._callable_fn(*array_vals, - run_metadata=self.run_metadata) - self._call_fetch_callbacks(fetched[-len(self._fetches):]) - output_structure = tf.nest.pack_sequence_as( - self._outputs_structure, - fetched[:len(self.outputs)], - expand_composites=True) - # We need to evaluate any composite tensor objects that have been - # reconstructed in 'pack_sequence_as', since otherwise they'll be output as - # actual CompositeTensor objects instead of the value(s) contained in the - # CompositeTensors. E.g., if output_structure contains a SparseTensor, then - # this ensures that we return its value as a SparseTensorValue rather than - # a SparseTensor. - return tf.nest.map_structure(self._eval_if_composite, output_structure) - - -@keras_export('keras.backend.function') + return tensor + + def __call__(self, inputs): + inputs = tf.nest.flatten( + tf_utils.convert_variables_to_tensors(inputs), + expand_composites=True, + ) + + session = get_session(inputs) + feed_arrays = [] + array_vals = [] + feed_symbols = [] + symbol_vals = [] + for tensor, value in zip(self.inputs, inputs): + if value is None: + continue + + if tf.is_tensor(value): + # Case: feeding symbolic tensor. + feed_symbols.append(tensor) + symbol_vals.append(value) + else: + # Case: feeding Numpy array. + feed_arrays.append(tensor) + # We need to do array conversion and type casting at this level, + # since `callable_fn` only supports exact matches. + tensor_type = tf.as_dtype(tensor.dtype) + array_vals.append( + np.asarray(value, dtype=tensor_type.as_numpy_dtype) + ) + + if self.feed_dict: + for key in sorted(self.feed_dict.keys()): + array_vals.append( + np.asarray( + self.feed_dict[key], dtype=key.dtype.as_numpy_dtype + ) + ) + + # Refresh callable if anything has changed. + if ( + self._callable_fn is None + or feed_arrays != self._feed_arrays + or symbol_vals != self._symbol_vals + or feed_symbols != self._feed_symbols + or self.fetches != self._fetches + or session != self._session + ): + self._make_callable(feed_arrays, feed_symbols, symbol_vals, session) + + fetched = self._callable_fn(*array_vals, run_metadata=self.run_metadata) + self._call_fetch_callbacks(fetched[-len(self._fetches) :]) + output_structure = tf.nest.pack_sequence_as( + self._outputs_structure, + fetched[: len(self.outputs)], + expand_composites=True, + ) + # We need to evaluate any composite tensor objects that have been + # reconstructed in 'pack_sequence_as', since otherwise they'll be output + # as actual CompositeTensor objects instead of the value(s) contained in + # the CompositeTensors. E.g., if output_structure contains a + # SparseTensor, then this ensures that we return its value as a + # SparseTensorValue rather than a SparseTensor. + return tf.nest.map_structure(self._eval_if_composite, output_structure) + + +@keras_export("keras.backend.function") @doc_controls.do_not_generate_docs def function(inputs, outputs, updates=None, name=None, **kwargs): - """Instantiates a Keras function. - - Args: - inputs: List of placeholder tensors. - outputs: List of output tensors. - updates: List of update ops. - name: String, name of function. - **kwargs: Passed to `tf.Session.run`. - - Returns: - Output values as Numpy arrays. 
- - Raises: - ValueError: if invalid kwargs are passed in or if in eager execution. - """ - if tf.compat.v1.executing_eagerly_outside_functions(): + """Instantiates a Keras function. + + Args: + inputs: List of placeholder tensors. + outputs: List of output tensors. + updates: List of update ops. + name: String, name of function. + **kwargs: Passed to `tf.Session.run`. + + Returns: + Output values as Numpy arrays. + + Raises: + ValueError: if invalid kwargs are passed in or if in eager execution. + """ + if tf.compat.v1.executing_eagerly_outside_functions(): + if kwargs: + raise ValueError( + "Session keyword arguments are not supported during " + "eager execution. You passed: %s" % (kwargs,) + ) + if updates: + raise ValueError( + "`updates` argument is not supported during " + "eager execution. You passed: %s" % (updates,) + ) + from keras import models + + model = models.Model(inputs=inputs, outputs=outputs) + + wrap_outputs = isinstance(outputs, list) and len(outputs) == 1 + + def func(model_inputs): + outs = model(model_inputs) + if wrap_outputs: + outs = [outs] + return tf_utils.sync_to_numpy_or_python_type(outs) + + return func + if kwargs: - raise ValueError('Session keyword arguments are not supported during ' - 'eager execution. You passed: %s' % (kwargs,)) - if updates: - raise ValueError('`updates` argument is not supported during ' - 'eager execution. You passed: %s' % (updates,)) - from keras import models # pylint: disable=g-import-not-at-top - from keras.utils import tf_utils # pylint: disable=g-import-not-at-top - model = models.Model(inputs=inputs, outputs=outputs) - - wrap_outputs = isinstance(outputs, list) and len(outputs) == 1 - def func(model_inputs): - outs = model(model_inputs) - if wrap_outputs: - outs = [outs] - return tf_utils.sync_to_numpy_or_python_type(outs) - - return func - - if kwargs: - for key in kwargs: - if (key not in tf_inspect.getfullargspec(tf.compat.v1.Session.run)[0] - and key not in ['inputs', 'outputs', 'updates', 'name']): - msg = ('Invalid argument "%s" passed to K.function with TensorFlow ' - 'backend') % key - raise ValueError(msg) - return GraphExecutionFunction( - inputs, outputs, updates=updates, name=name, **kwargs) - - -@keras_export('keras.backend.gradients') + for key in kwargs: + if key not in tf_inspect.getfullargspec(tf.compat.v1.Session.run)[ + 0 + ] and key not in ["inputs", "outputs", "updates", "name"]: + msg = ( + 'Invalid argument "%s" passed to K.function with ' + "TensorFlow backend" % key + ) + raise ValueError(msg) + return GraphExecutionFunction( + inputs, outputs, updates=updates, name=name, **kwargs + ) + + +@keras_export("keras.backend.gradients") @doc_controls.do_not_generate_docs def gradients(loss, variables): - """Returns the gradients of `loss` w.r.t. `variables`. + """Returns the gradients of `loss` w.r.t. `variables`. - Args: - loss: Scalar tensor to minimize. - variables: List of variables. + Args: + loss: Scalar tensor to minimize. + variables: List of variables. - Returns: - A gradients tensor. - """ - return tf.compat.v1.gradients( - loss, variables, colocate_gradients_with_ops=True) + Returns: + A gradients tensor. + """ + return tf.compat.v1.gradients( + loss, variables, colocate_gradients_with_ops=True + ) -@keras_export('keras.backend.stop_gradient') +@keras_export("keras.backend.stop_gradient") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def stop_gradient(variables): - """Returns `variables` but with zero gradient w.r.t. every other variable. 
+ """Returns `variables` but with zero gradient w.r.t. every other variable. - Args: - variables: Tensor or list of tensors to consider constant with respect - to any other variable. + Args: + variables: Tensor or list of tensors to consider constant with respect + to any other variable. - Returns: - A single tensor or a list of tensors (depending on the passed argument) - that has no gradient with respect to any other variable. - """ - if isinstance(variables, (list, tuple)): - return map(tf.stop_gradient, variables) - return tf.stop_gradient(variables) + Returns: + A single tensor or a list of tensors (depending on the passed argument) + that has no gradient with respect to any other variable. + """ + if isinstance(variables, (list, tuple)): + return map(tf.stop_gradient, variables) + return tf.stop_gradient(variables) # CONTROL FLOW -@keras_export('keras.backend.rnn') +@keras_export("keras.backend.rnn") @tf.__internal__.dispatch.add_dispatch_support -def rnn(step_function, - inputs, - initial_states, - go_backwards=False, - mask=None, - constants=None, - unroll=False, - input_length=None, - time_major=False, - zero_output_for_mask=False, - return_all_outputs=True): - """Iterates over the time dimension of a tensor. - - Args: - step_function: RNN step function. - Args; - input; Tensor with shape `(samples, ...)` (no time dimension), - representing input for the batch of samples at a certain - time step. - states; List of tensors. - Returns; - output; Tensor with shape `(samples, output_dim)` - (no time dimension). - new_states; List of tensors, same length and shapes - as 'states'. The first state in the list must be the - output tensor at the previous timestep. - inputs: Tensor of temporal data of shape `(samples, time, ...)` - (at least 3D), or nested tensors, and each of which has shape - `(samples, time, ...)`. - initial_states: Tensor with shape `(samples, state_size)` - (no time dimension), containing the initial values for the states used - in the step function. In the case that state_size is in a nested - shape, the shape of initial_states will also follow the nested - structure. - go_backwards: Boolean. If True, do the iteration over the time - dimension in reverse order and return the reversed sequence. - mask: Binary tensor with shape `(samples, time, 1)`, - with a zero for every element that is masked. - constants: List of constant values passed at each step. - unroll: Whether to unroll the RNN or to use a symbolic `while_loop`. - input_length: An integer or a 1-D Tensor, depending on whether - the time dimension is fixed-length or not. In case of variable length - input, it is used for masking in case there's no mask specified. - time_major: Boolean. If true, the inputs and outputs will be in shape - `(timesteps, batch, ...)`, whereas in the False case, it will be - `(batch, timesteps, ...)`. Using `time_major = True` is a bit more - efficient because it avoids transposes at the beginning and end of the - RNN calculation. However, most TensorFlow data is batch-major, so by - default this function accepts input and emits output in batch-major - form. - zero_output_for_mask: Boolean. If True, the output for masked timestep - will be zeros, whereas in the False case, output from previous - timestep is returned. - return_all_outputs: Boolean. If True, return the recurrent outputs for all - timesteps in the sequence. If False, only return the output for the - last timestep (which consumes less memory). - - Returns: - A tuple, `(last_output, outputs, new_states)`. 
- last_output: the latest output of the rnn, of shape `(samples, ...)` - outputs: - - If `return_all_outputs=True`: a tensor with shape - `(samples, time, ...)` where each entry `outputs[s, t]` is the - output of the step function at time `t` for sample `s` - - Else, a tensor equal to `last_output` with shape - `(samples, 1, ...)` - new_states: list of tensors, latest states returned by - the step function, of shape `(samples, ...)`. - - Raises: - ValueError: if input dimension is less than 3. - ValueError: if `unroll` is `True` but input timestep is not a fixed - number. - ValueError: if `mask` is provided (not `None`) but states is not provided - (`len(states)` == 0). - """ - if not tf.__internal__.tf2.enabled(): - return_all_outputs = True # Not supported in TF1. - - def swap_batch_timestep(input_t): - # Swap the batch and timestep dim for the incoming tensor. - axes = list(range(len(input_t.shape))) - axes[0], axes[1] = 1, 0 - return tf.compat.v1.transpose(input_t, axes) - - if not time_major: - inputs = tf.nest.map_structure(swap_batch_timestep, inputs) - - flatted_inputs = tf.nest.flatten(inputs) - time_steps = flatted_inputs[0].shape[0] - batch = flatted_inputs[0].shape[1] - time_steps_t = tf.shape(flatted_inputs[0])[0] - - for input_ in flatted_inputs: - input_.shape.with_rank_at_least(3) - - if mask is not None: - if mask.dtype != tf.bool: - mask = tf.cast(mask, tf.bool) - if len(mask.shape) == 2: - mask = expand_dims(mask) +def rnn( + step_function, + inputs, + initial_states, + go_backwards=False, + mask=None, + constants=None, + unroll=False, + input_length=None, + time_major=False, + zero_output_for_mask=False, + return_all_outputs=True, +): + """Iterates over the time dimension of a tensor. + + Args: + step_function: RNN step function. + Args; + input; Tensor with shape `(samples, ...)` (no time dimension), + representing input for the batch of samples at a certain + time step. + states; List of tensors. + Returns; + output; Tensor with shape `(samples, output_dim)` + (no time dimension). + new_states; List of tensors, same length and shapes + as 'states'. The first state in the list must be the + output tensor at the previous timestep. + inputs: Tensor of temporal data of shape `(samples, time, ...)` + (at least 3D), or nested tensors, and each of which has shape + `(samples, time, ...)`. + initial_states: Tensor with shape `(samples, state_size)` + (no time dimension), containing the initial values for the states + used in the step function. In the case that state_size is in a + nested shape, the shape of initial_states will also follow the + nested structure. + go_backwards: Boolean. If True, do the iteration over the time + dimension in reverse order and return the reversed sequence. + mask: Binary tensor with shape `(samples, time, 1)`, + with a zero for every element that is masked. + constants: List of constant values passed at each step. + unroll: Whether to unroll the RNN or to use a symbolic `while_loop`. + input_length: An integer or a 1-D Tensor, depending on whether + the time dimension is fixed-length or not. In case of variable + length input, it is used for masking in case there's no mask + specified. + time_major: Boolean. If true, the inputs and outputs will be in shape + `(timesteps, batch, ...)`, whereas in the False case, it will be + `(batch, timesteps, ...)`. Using `time_major = True` is a bit more + efficient because it avoids transposes at the beginning and end of + the RNN calculation. 
However, most TensorFlow data is batch-major, + so by default this function accepts input and emits output in + batch-major form. + zero_output_for_mask: Boolean. If True, the output for masked timestep + will be zeros, whereas in the False case, output from previous + timestep is returned. + return_all_outputs: Boolean. If True, return the recurrent outputs for + all timesteps in the sequence. If False, only return the output for + the last timestep (which consumes less memory). + + Returns: + A tuple, `(last_output, outputs, new_states)`. + last_output: the latest output of the rnn, of shape `(samples, ...)` + outputs: + - If `return_all_outputs=True`: a tensor with shape + `(samples, time, ...)` where each entry `outputs[s, t]` is the + output of the step function at time `t` for sample `s` + - Else, a tensor equal to `last_output` with shape + `(samples, 1, ...)` + new_states: list of tensors, latest states returned by + the step function, of shape `(samples, ...)`. + + Raises: + ValueError: if input dimension is less than 3. + ValueError: if `unroll` is `True` but input timestep is not a fixed + number. + ValueError: if `mask` is provided (not `None`) but states is not + provided (`len(states)` == 0). + """ + if not tf.__internal__.tf2.enabled(): + return_all_outputs = True # Not supported in TF1. + + def swap_batch_timestep(input_t): + # Swap the batch and timestep dim for the incoming tensor. + axes = list(range(len(input_t.shape))) + axes[0], axes[1] = 1, 0 + return tf.compat.v1.transpose(input_t, axes) + if not time_major: - mask = swap_batch_timestep(mask) - - if constants is None: - constants = [] - - # tf.where needs its condition tensor to be the same shape as its two - # result tensors, but in our case the condition (mask) tensor is - # (nsamples, 1), and inputs are (nsamples, ndimensions) or even more. - # So we need to broadcast the mask to match the shape of inputs. - # That's what the tile call does, it just repeats the mask along its - # second dimension n times. - def _expand_mask(mask_t, input_t, fixed_dim=1): - if tf.nest.is_nested(mask_t): - raise ValueError('mask_t is expected to be tensor, but got %s' % mask_t) - if tf.nest.is_nested(input_t): - raise ValueError('input_t is expected to be tensor, but got %s' % input_t) - rank_diff = len(input_t.shape) - len(mask_t.shape) - for _ in range(rank_diff): - mask_t = tf.expand_dims(mask_t, -1) - multiples = [1] * fixed_dim + input_t.shape.as_list()[fixed_dim:] - return tf.tile(mask_t, multiples) - - if unroll: - if not time_steps: - raise ValueError('Unrolling requires a fixed number of timesteps.') - states = tuple(initial_states) - successive_states = [] - successive_outputs = [] - - # Process the input tensors. The input tensor need to be split on the - # time_step dim, and reverse if go_backwards is True. In the case of nested - # input, the input is flattened and then transformed individually. 
- # The result of this will be a tuple of lists, each of the item in tuple is - # list of the tensor with shape (batch, feature) - def _process_single_input_t(input_t): - input_t = tf.unstack(input_t) # unstack for time_step dim - if go_backwards: - input_t.reverse() - return input_t - - if tf.nest.is_nested(inputs): - processed_input = tf.nest.map_structure(_process_single_input_t, inputs) - else: - processed_input = (_process_single_input_t(inputs),) + inputs = tf.nest.map_structure(swap_batch_timestep, inputs) - def _get_input_tensor(time): - inp = [t_[time] for t_ in processed_input] - return tf.nest.pack_sequence_as(inputs, inp) + flatted_inputs = tf.nest.flatten(inputs) + time_steps = flatted_inputs[0].shape[0] + batch = flatted_inputs[0].shape[1] + time_steps_t = tf.shape(flatted_inputs[0])[0] + + for input_ in flatted_inputs: + input_.shape.with_rank_at_least(3) if mask is not None: - mask_list = tf.unstack(mask) - if go_backwards: - mask_list.reverse() - - for i in range(time_steps): - inp = _get_input_tensor(i) - mask_t = mask_list[i] - output, new_states = step_function(inp, - tuple(states) + tuple(constants)) - tiled_mask_t = _expand_mask(mask_t, output) - - if not successive_outputs: - prev_output = zeros_like(output) + if mask.dtype != tf.bool: + mask = tf.cast(mask, tf.bool) + if len(mask.shape) == 2: + mask = expand_dims(mask) + if not time_major: + mask = swap_batch_timestep(mask) + + if constants is None: + constants = [] + + # tf.where needs its condition tensor to be the same shape as its two + # result tensors, but in our case the condition (mask) tensor is + # (nsamples, 1), and inputs are (nsamples, ndimensions) or even more. + # So we need to broadcast the mask to match the shape of inputs. + # That's what the tile call does, it just repeats the mask along its + # second dimension n times. + def _expand_mask(mask_t, input_t, fixed_dim=1): + if tf.nest.is_nested(mask_t): + raise ValueError( + f"mask_t is expected to be tensor, but got {mask_t}" + ) + if tf.nest.is_nested(input_t): + raise ValueError( + f"input_t is expected to be tensor, but got {input_t}" + ) + rank_diff = len(input_t.shape) - len(mask_t.shape) + for _ in range(rank_diff): + mask_t = tf.expand_dims(mask_t, -1) + multiples = [1] * fixed_dim + input_t.shape.as_list()[fixed_dim:] + return tf.tile(mask_t, multiples) + + if unroll: + if not time_steps: + raise ValueError("Unrolling requires a fixed number of timesteps.") + states = tuple(initial_states) + successive_states = [] + successive_outputs = [] + + # Process the input tensors. The input tensor need to be split on the + # time_step dim, and reverse if go_backwards is True. In the case of + # nested input, the input is flattened and then transformed + # individually. 
The result of this will be a tuple of lists, each of + # the item in tuple is list of the tensor with shape (batch, feature) + def _process_single_input_t(input_t): + input_t = tf.unstack(input_t) # unstack for time_step dim + if go_backwards: + input_t.reverse() + return input_t + + if tf.nest.is_nested(inputs): + processed_input = tf.nest.map_structure( + _process_single_input_t, inputs + ) else: - prev_output = successive_outputs[-1] - - output = tf.where(tiled_mask_t, output, prev_output) - - flat_states = tf.nest.flatten(states) - flat_new_states = tf.nest.flatten(new_states) - tiled_mask_t = tuple(_expand_mask(mask_t, s) for s in flat_states) - flat_final_states = tuple( - tf.where(m, s, ps) - for m, s, ps in zip(tiled_mask_t, flat_new_states, flat_states)) - states = tf.nest.pack_sequence_as(states, flat_final_states) + processed_input = (_process_single_input_t(inputs),) + + def _get_input_tensor(time): + inp = [t_[time] for t_ in processed_input] + return tf.nest.pack_sequence_as(inputs, inp) + + if mask is not None: + mask_list = tf.unstack(mask) + if go_backwards: + mask_list.reverse() + + for i in range(time_steps): + inp = _get_input_tensor(i) + mask_t = mask_list[i] + output, new_states = step_function( + inp, tuple(states) + tuple(constants) + ) + tiled_mask_t = _expand_mask(mask_t, output) + + if not successive_outputs: + prev_output = zeros_like(output) + else: + prev_output = successive_outputs[-1] + + output = tf.where(tiled_mask_t, output, prev_output) + + flat_states = tf.nest.flatten(states) + flat_new_states = tf.nest.flatten(new_states) + tiled_mask_t = tuple( + _expand_mask(mask_t, s) for s in flat_states + ) + flat_final_states = tuple( + tf.where(m, s, ps) + for m, s, ps in zip( + tiled_mask_t, flat_new_states, flat_states + ) + ) + states = tf.nest.pack_sequence_as(states, flat_final_states) + + if return_all_outputs: + successive_outputs.append(output) + successive_states.append(states) + else: + successive_outputs = [output] + successive_states = [states] + last_output = successive_outputs[-1] + new_states = successive_states[-1] + outputs = tf.stack(successive_outputs) + + if zero_output_for_mask: + last_output = tf.where( + _expand_mask(mask_list[-1], last_output), + last_output, + zeros_like(last_output), + ) + outputs = tf.where( + _expand_mask(mask, outputs, fixed_dim=2), + outputs, + zeros_like(outputs), + ) + + else: # mask is None + for i in range(time_steps): + inp = _get_input_tensor(i) + output, states = step_function( + inp, tuple(states) + tuple(constants) + ) + if return_all_outputs: + successive_outputs.append(output) + successive_states.append(states) + else: + successive_outputs = [output] + successive_states = [states] + last_output = successive_outputs[-1] + new_states = successive_states[-1] + outputs = tf.stack(successive_outputs) + + else: # Unroll == False + states = tuple(initial_states) + + # Create input tensor array, if the inputs is nested tensors, then it + # will be flattened first, and tensor array will be created one per + # flattened tensor. + input_ta = tuple( + tf.TensorArray( + dtype=inp.dtype, + size=time_steps_t, + tensor_array_name=f"input_ta_{i}", + ) + for i, inp in enumerate(flatted_inputs) + ) + input_ta = tuple( + ta.unstack(input_) + if not go_backwards + else ta.unstack(reverse(input_, 0)) + for ta, input_ in zip(input_ta, flatted_inputs) + ) + + # Get the time(0) input and compute the output for that, the output will + # be used to determine the dtype of output tensor array. 
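The non-unrolled branch just above stages the time-major input in `tf.TensorArray`s and reads one timestep per `while_loop` iteration. A self-contained sketch of that pattern (a toy `tanh` stands in for the real step function; this is not code from the patch):

```python
import tensorflow as tf

inputs = tf.random.normal([5, 8, 3])  # time-major: (time, batch, features)
time_steps = tf.shape(inputs)[0]

# Stage the input once. clear_after_read defaults to True, which is why the
# patch takes the time-zero input from the original tensor, not the array.
input_ta = tf.TensorArray(tf.float32, size=time_steps).unstack(inputs)
output_ta = tf.TensorArray(tf.float32, size=time_steps)

def step(t, out_ta):
    x_t = input_ta.read(t)  # (batch, features) at timestep t
    return [t + 1, out_ta.write(t, tf.tanh(x_t))]  # toy stand-in for the cell

_, output_ta = tf.while_loop(
    lambda t, *_: t < time_steps, step, [tf.constant(0), output_ta]
)
print(output_ta.stack().shape)  # (5, 8, 3)
```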
Don't read from + # input_ta due to TensorArray clear_after_read default to True. + input_time_zero = tf.nest.pack_sequence_as( + inputs, [inp[0] for inp in flatted_inputs] + ) + # output_time_zero is used to determine the cell output shape and its + # dtype. the value is discarded. + output_time_zero, _ = step_function( + input_time_zero, tuple(initial_states) + tuple(constants) + ) + + output_ta_size = time_steps_t if return_all_outputs else 1 + output_ta = tuple( + tf.TensorArray( + dtype=out.dtype, + size=output_ta_size, + element_shape=out.shape, + tensor_array_name=f"output_ta_{i}", + ) + for i, out in enumerate(tf.nest.flatten(output_time_zero)) + ) + + time = tf.constant(0, dtype="int32", name="time") + + # We only specify the 'maximum_iterations' when building for XLA since + # that causes slowdowns on GPU in TF. + if ( + not tf.executing_eagerly() + and control_flow_util.GraphOrParentsInXlaContext( + tf.compat.v1.get_default_graph() + ) + ): + if input_length is None: + max_iterations = time_steps_t + else: + max_iterations = tf.reduce_max(input_length) + else: + max_iterations = None + + while_loop_kwargs = { + "cond": lambda time, *_: time < time_steps_t, + "maximum_iterations": max_iterations, + "parallel_iterations": 32, + "swap_memory": True, + } + if mask is not None: + if go_backwards: + mask = reverse(mask, 0) + + mask_ta = tf.TensorArray( + dtype=tf.bool, size=time_steps_t, tensor_array_name="mask_ta" + ) + mask_ta = mask_ta.unstack(mask) + + def masking_fn(time): + return mask_ta.read(time) + + def compute_masked_output(mask_t, flat_out, flat_mask): + tiled_mask_t = tuple( + _expand_mask(mask_t, o, fixed_dim=len(mask_t.shape)) + for o in flat_out + ) + return tuple( + tf.where(m, o, fm) + for m, o, fm in zip(tiled_mask_t, flat_out, flat_mask) + ) + + elif isinstance(input_length, tf.Tensor): + if go_backwards: + max_len = tf.reduce_max(input_length, axis=0) + rev_input_length = tf.subtract(max_len - 1, input_length) + + def masking_fn(time): + return tf.less(rev_input_length, time) + + else: + + def masking_fn(time): + return tf.greater(input_length, time) + + def compute_masked_output(mask_t, flat_out, flat_mask): + return tuple( + tf.compat.v1.where(mask_t, o, zo) + for (o, zo) in zip(flat_out, flat_mask) + ) - if return_all_outputs: - successive_outputs.append(output) - successive_states.append(states) else: - successive_outputs = [output] - successive_states = [states] - last_output = successive_outputs[-1] - new_states = successive_states[-1] - outputs = tf.stack(successive_outputs) - - if zero_output_for_mask: - last_output = tf.where( - _expand_mask(mask_list[-1], last_output), last_output, - zeros_like(last_output)) - outputs = tf.where( - _expand_mask(mask, outputs, fixed_dim=2), outputs, - zeros_like(outputs)) - - else: # mask is None - for i in range(time_steps): - inp = _get_input_tensor(i) - output, states = step_function(inp, tuple(states) + tuple(constants)) - if return_all_outputs: - successive_outputs.append(output) - successive_states.append(states) + masking_fn = None + + if masking_fn is not None: + # Mask for the T output will be base on the output of T - 1. In the + # case T = 0, a zero filled tensor will be used. + flat_zero_output = tuple( + tf.zeros_like(o) for o in tf.nest.flatten(output_time_zero) + ) + + def _step(time, output_ta_t, prev_output, *states): + """RNN step function. + + Args: + time: Current timestep value. + output_ta_t: TensorArray. + prev_output: tuple of outputs from time - 1. + *states: List of states. 
+ + Returns: + Tuple: `(time + 1, output_ta_t, output) + tuple(new_states)` + """ + current_input = tuple(ta.read(time) for ta in input_ta) + # maybe set shape. + current_input = tf.nest.pack_sequence_as(inputs, current_input) + mask_t = masking_fn(time) + output, new_states = step_function( + current_input, tuple(states) + tuple(constants) + ) + # mask output + flat_output = tf.nest.flatten(output) + flat_mask_output = ( + flat_zero_output + if zero_output_for_mask + else tf.nest.flatten(prev_output) + ) + flat_new_output = compute_masked_output( + mask_t, flat_output, flat_mask_output + ) + + # mask states + flat_state = tf.nest.flatten(states) + flat_new_state = tf.nest.flatten(new_states) + for state, new_state in zip(flat_state, flat_new_state): + if isinstance(new_state, tf.Tensor): + new_state.set_shape(state.shape) + flat_final_state = compute_masked_output( + mask_t, flat_new_state, flat_state + ) + new_states = tf.nest.pack_sequence_as( + new_states, flat_final_state + ) + + ta_index_to_write = time if return_all_outputs else 0 + output_ta_t = tuple( + ta.write(ta_index_to_write, out) + for ta, out in zip(output_ta_t, flat_new_output) + ) + + return (time + 1, output_ta_t, tuple(flat_new_output)) + tuple( + new_states + ) + + final_outputs = tf.compat.v1.while_loop( + body=_step, + loop_vars=(time, output_ta, flat_zero_output) + states, + **while_loop_kwargs, + ) + # Skip final_outputs[2] which is the output for final timestep. + new_states = final_outputs[3:] else: - successive_outputs = [output] - successive_states = [states] - last_output = successive_outputs[-1] - new_states = successive_states[-1] - outputs = tf.stack(successive_outputs) - - else: # Unroll == False - states = tuple(initial_states) - - # Create input tensor array, if the inputs is nested tensors, then it will - # be flattened first, and tensor array will be created one per flattened - # tensor. - input_ta = tuple( - tf.TensorArray( - dtype=inp.dtype, - size=time_steps_t, - tensor_array_name='input_ta_%s' % i) - for i, inp in enumerate(flatted_inputs)) - input_ta = tuple( - ta.unstack(input_) if not go_backwards else ta - .unstack(reverse(input_, 0)) - for ta, input_ in zip(input_ta, flatted_inputs)) - - # Get the time(0) input and compute the output for that, the output will be - # used to determine the dtype of output tensor array. Don't read from - # input_ta due to TensorArray clear_after_read default to True. - input_time_zero = tf.nest.pack_sequence_as(inputs, - [inp[0] for inp in flatted_inputs]) - # output_time_zero is used to determine the cell output shape and its dtype. - # the value is discarded. - output_time_zero, _ = step_function( - input_time_zero, tuple(initial_states) + tuple(constants)) - - output_ta_size = time_steps_t if return_all_outputs else 1 - output_ta = tuple( - tf.TensorArray( - dtype=out.dtype, - size=output_ta_size, - element_shape=out.shape, - tensor_array_name='output_ta_%s' % i) - for i, out in enumerate(tf.nest.flatten(output_time_zero))) - - time = tf.constant(0, dtype='int32', name='time') - - # We only specify the 'maximum_iterations' when building for XLA since that - # causes slowdowns on GPU in TF. 
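Both masked paths above rely on the same trick: broadcast the `(batch, 1)` boolean mask up to the rank of the value, then let `tf.where` keep the previous output/state wherever the step is masked out. A small illustration with hypothetical values (not from the patch):

```python
import tensorflow as tf

mask_t = tf.constant([[True], [False]])          # (batch, 1)
new_out = tf.constant([[1.0, 1.0], [2.0, 2.0]])  # fresh step output
prev_out = tf.zeros_like(new_out)                # carried-over value

# What _expand_mask does here: tile the mask across the feature axis.
tiled = tf.tile(mask_t, [1, new_out.shape[-1]])  # (batch, features)

print(tf.where(tiled, new_out, prev_out).numpy())  # [[1. 1.] [0. 0.]]
```

Sample 0 is unmasked and takes the fresh output; sample 1 is masked and keeps the previous value.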
- if (not tf.executing_eagerly() and - control_flow_util.GraphOrParentsInXlaContext(tf.compat.v1.get_default_graph())): - max_iterations = tf.reduce_max(input_length) - else: - max_iterations = None - while_loop_kwargs = { - 'cond': lambda time, *_: time < time_steps_t, - 'maximum_iterations': max_iterations, - 'parallel_iterations': 32, - 'swap_memory': True, - } - if mask is not None: - if go_backwards: - mask = reverse(mask, 0) - - mask_ta = tf.TensorArray( - dtype=tf.bool, - size=time_steps_t, - tensor_array_name='mask_ta') - mask_ta = mask_ta.unstack(mask) - - def masking_fn(time): - return mask_ta.read(time) - - def compute_masked_output(mask_t, flat_out, flat_mask): - tiled_mask_t = tuple( - _expand_mask(mask_t, o, fixed_dim=len(mask_t.shape)) - for o in flat_out) - return tuple( - tf.where(m, o, fm) - for m, o, fm in zip(tiled_mask_t, flat_out, flat_mask)) - elif isinstance(input_length, tf.Tensor): - if go_backwards: - max_len = tf.reduce_max(input_length, axis=0) - rev_input_length = tf.subtract(max_len - 1, input_length) - - def masking_fn(time): - return tf.less(rev_input_length, time) - else: - - def masking_fn(time): - return tf.greater(input_length, time) - - def compute_masked_output(mask_t, flat_out, flat_mask): - return tuple( - tf.compat.v1.where(mask_t, o, zo) - for (o, zo) in zip(flat_out, flat_mask)) - else: - masking_fn = None + def _step(time, output_ta_t, *states): + """RNN step function. + + Args: + time: Current timestep value. + output_ta_t: TensorArray. + *states: List of states. + + Returns: + Tuple: `(time + 1,output_ta_t) + tuple(new_states)` + """ + current_input = tuple(ta.read(time) for ta in input_ta) + current_input = tf.nest.pack_sequence_as(inputs, current_input) + output, new_states = step_function( + current_input, tuple(states) + tuple(constants) + ) + flat_state = tf.nest.flatten(states) + flat_new_state = tf.nest.flatten(new_states) + for state, new_state in zip(flat_state, flat_new_state): + if isinstance(new_state, tf.Tensor): + new_state.set_shape(state.shape) + + flat_output = tf.nest.flatten(output) + ta_index_to_write = time if return_all_outputs else 0 + output_ta_t = tuple( + ta.write(ta_index_to_write, out) + for ta, out in zip(output_ta_t, flat_output) + ) + + new_states = tf.nest.pack_sequence_as( + initial_states, flat_new_state + ) + return (time + 1, output_ta_t) + tuple(new_states) + + final_outputs = tf.compat.v1.while_loop( + body=_step, + loop_vars=(time, output_ta) + states, + **while_loop_kwargs, + ) + new_states = final_outputs[2:] + + output_ta = final_outputs[1] + + outputs = tuple(o.stack() for o in output_ta) + last_output = tuple(o[-1] for o in outputs) + + outputs = tf.nest.pack_sequence_as(output_time_zero, outputs) + last_output = tf.nest.pack_sequence_as(output_time_zero, last_output) + + # static shape inference + def set_shape(output_): + if isinstance(output_, tf.Tensor): + shape = output_.shape.as_list() + if return_all_outputs: + shape[0] = time_steps + else: + shape[0] = 1 + shape[1] = batch + output_.set_shape(shape) + return output_ + + outputs = tf.nest.map_structure(set_shape, outputs) - if masking_fn is not None: - # Mask for the T output will be base on the output of T - 1. In the case - # T = 0, a zero filled tensor will be used. - flat_zero_output = tuple(tf.zeros_like(o) - for o in tf.nest.flatten(output_time_zero)) - def _step(time, output_ta_t, prev_output, *states): - """RNN step function. 
+ if not time_major: + outputs = tf.nest.map_structure(swap_batch_timestep, outputs) - Args: - time: Current timestep value. - output_ta_t: TensorArray. - prev_output: tuple of outputs from time - 1. - *states: List of states. + return last_output, outputs, new_states - Returns: - Tuple: `(time + 1, output_ta_t, output) + tuple(new_states)` - """ - current_input = tuple(ta.read(time) for ta in input_ta) - # maybe set shape. - current_input = tf.nest.pack_sequence_as(inputs, current_input) - mask_t = masking_fn(time) - output, new_states = step_function(current_input, - tuple(states) + tuple(constants)) - # mask output - flat_output = tf.nest.flatten(output) - flat_mask_output = (flat_zero_output if zero_output_for_mask - else tf.nest.flatten(prev_output)) - flat_new_output = compute_masked_output(mask_t, flat_output, - flat_mask_output) - - # mask states - flat_state = tf.nest.flatten(states) - flat_new_state = tf.nest.flatten(new_states) - for state, new_state in zip(flat_state, flat_new_state): - if isinstance(new_state, tf.Tensor): - new_state.set_shape(state.shape) - flat_final_state = compute_masked_output(mask_t, flat_new_state, - flat_state) - new_states = tf.nest.pack_sequence_as(new_states, flat_final_state) - - ta_index_to_write = time if return_all_outputs else 0 - output_ta_t = tuple( - ta.write(ta_index_to_write, out) - for ta, out in zip(output_ta_t, flat_new_output)) - - return (time + 1, output_ta_t, - tuple(flat_new_output)) + tuple(new_states) - - final_outputs = tf.compat.v1.while_loop( - body=_step, - loop_vars=(time, output_ta, flat_zero_output) + states, - **while_loop_kwargs) - # Skip final_outputs[2] which is the output for final timestep. - new_states = final_outputs[3:] - else: - def _step(time, output_ta_t, *states): - """RNN step function. - - Args: - time: Current timestep value. - output_ta_t: TensorArray. - *states: List of states. 
- Returns: - Tuple: `(time + 1,output_ta_t) + tuple(new_states)` - """ - current_input = tuple(ta.read(time) for ta in input_ta) - current_input = tf.nest.pack_sequence_as(inputs, current_input) - output, new_states = step_function(current_input, - tuple(states) + tuple(constants)) - flat_state = tf.nest.flatten(states) - flat_new_state = tf.nest.flatten(new_states) - for state, new_state in zip(flat_state, flat_new_state): - if isinstance(new_state, tf.Tensor): - new_state.set_shape(state.shape) - - flat_output = tf.nest.flatten(output) - ta_index_to_write = time if return_all_outputs else 0 - output_ta_t = tuple( - ta.write(ta_index_to_write, out) - for ta, out in zip(output_ta_t, flat_output)) - - new_states = tf.nest.pack_sequence_as(initial_states, flat_new_state) - return (time + 1, output_ta_t) + tuple(new_states) - - final_outputs = tf.compat.v1.while_loop( - body=_step, - loop_vars=(time, output_ta) + states, - **while_loop_kwargs) - new_states = final_outputs[2:] - - output_ta = final_outputs[1] - - outputs = tuple(o.stack() for o in output_ta) - last_output = tuple(o[-1] for o in outputs) - - outputs = tf.nest.pack_sequence_as(output_time_zero, outputs) - last_output = tf.nest.pack_sequence_as(output_time_zero, last_output) - - # static shape inference - def set_shape(output_): - if isinstance(output_, tf.Tensor): - shape = output_.shape.as_list() - if return_all_outputs: - shape[0] = time_steps - else: - shape[0] = 1 - shape[1] = batch - output_.set_shape(shape) - return output_ - - outputs = tf.nest.map_structure(set_shape, outputs) - - if not time_major: - outputs = tf.nest.map_structure(swap_batch_timestep, outputs) - - return last_output, outputs, new_states - - -@keras_export('keras.backend.switch') +@keras_export("keras.backend.switch") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def switch(condition, then_expression, else_expression): - """Switches between two operations depending on a scalar value. - - Note that both `then_expression` and `else_expression` - should be symbolic tensors of the *same shape*. - - Args: - condition: tensor (`int` or `bool`). - then_expression: either a tensor, or a callable that returns a tensor. - else_expression: either a tensor, or a callable that returns a tensor. - - Returns: - The selected tensor. - - Raises: - ValueError: If rank of `condition` is greater than rank of expressions. - """ - if condition.dtype != tf.bool: - condition = tf.cast(condition, 'bool') - cond_ndim = ndim(condition) - if not cond_ndim: - if not callable(then_expression): - - def then_expression_fn(): - return then_expression - else: - then_expression_fn = then_expression - if not callable(else_expression): + """Switches between two operations depending on a scalar value. + + Note that both `then_expression` and `else_expression` + should be symbolic tensors of the *same shape*. + + Args: + condition: tensor (`int` or `bool`). + then_expression: either a tensor, or a callable that returns a tensor. + else_expression: either a tensor, or a callable that returns a tensor. + + Returns: + The selected tensor. + + Raises: + ValueError: If rank of `condition` is greater than rank of expressions. 
+ """ + if condition.dtype != tf.bool: + condition = tf.cast(condition, "bool") + cond_ndim = ndim(condition) + if not cond_ndim: + if not callable(then_expression): + + def then_expression_fn(): + return then_expression + + else: + then_expression_fn = then_expression + if not callable(else_expression): - def else_expression_fn(): - return else_expression + def else_expression_fn(): + return else_expression + + else: + else_expression_fn = else_expression + x = tf.compat.v1.cond(condition, then_expression_fn, else_expression_fn) else: - else_expression_fn = else_expression - x = tf.compat.v1.cond(condition, then_expression_fn, else_expression_fn) - else: - # tf.where needs its condition tensor - # to be the same shape as its two - # result tensors - if callable(then_expression): - then_expression = then_expression() - if callable(else_expression): - else_expression = else_expression() - expr_ndim = ndim(then_expression) - if cond_ndim > expr_ndim: - raise ValueError('Rank of `condition` should be less than or' - ' equal to rank of `then_expression` and ' - '`else_expression`. ndim(condition)=' + str(cond_ndim) + - ', ndim(then_expression)' - '=' + str(expr_ndim)) - if cond_ndim > 1: - ndim_diff = expr_ndim - cond_ndim - cond_shape = tf.concat( - [tf.shape(condition), [1] * ndim_diff], axis=0) - condition = tf.reshape(condition, cond_shape) - expr_shape = tf.shape(then_expression) - shape_diff = expr_shape - cond_shape - tile_shape = tf.where(shape_diff > 0, expr_shape, - tf.ones_like(expr_shape)) - condition = tf.tile(condition, tile_shape) - x = tf.where(condition, then_expression, else_expression) - return x - - -@keras_export('keras.backend.in_train_phase') + # tf.where needs its condition tensor + # to be the same shape as its two + # result tensors + if callable(then_expression): + then_expression = then_expression() + if callable(else_expression): + else_expression = else_expression() + expr_ndim = ndim(then_expression) + if cond_ndim > expr_ndim: + raise ValueError( + "Rank of `condition` should be less than or" + " equal to rank of `then_expression` and " + "`else_expression`. ndim(condition)=" + + str(cond_ndim) + + ", ndim(then_expression)=" + + str(expr_ndim) + ) + if cond_ndim > 1: + ndim_diff = expr_ndim - cond_ndim + cond_shape = tf.concat( + [tf.shape(condition), [1] * ndim_diff], axis=0 + ) + condition = tf.reshape(condition, cond_shape) + expr_shape = tf.shape(then_expression) + shape_diff = expr_shape - cond_shape + tile_shape = tf.where( + shape_diff > 0, expr_shape, tf.ones_like(expr_shape) + ) + condition = tf.tile(condition, tile_shape) + x = tf.where(condition, then_expression, else_expression) + return x + + +@keras_export("keras.backend.in_train_phase") @doc_controls.do_not_generate_docs def in_train_phase(x, alt, training=None): - """Selects `x` in train phase, and `alt` otherwise. - - Note that `alt` should have the *same shape* as `x`. - - Args: - x: What to return in train phase - (tensor or callable that returns a tensor). - alt: What to return otherwise - (tensor or callable that returns a tensor). - training: Optional scalar tensor - (or Python boolean, or Python integer) - specifying the learning phase. - - Returns: - Either `x` or `alt` based on the `training` flag. - the `training` flag defaults to `K.learning_phase()`. 
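A minimal usage sketch for the reformatted `switch` above, assuming only the public `keras.backend` API: with a scalar condition the branches may be callables (dispatched through `tf.cond`), while higher-rank conditions go through the broadcasting `tf.where` path.

```python
import tensorflow as tf
from keras import backend as K

training = tf.constant(True)  # scalar bool condition -> tf.cond path
x = K.switch(training, lambda: tf.ones([2]), lambda: tf.zeros([2]))
print(x.numpy())  # [1. 1.]

cond = tf.constant([True, False])  # rank-1 condition -> tf.where path
y = K.switch(cond, tf.ones([2]), tf.zeros([2]))
print(y.numpy())  # [1. 0.]
```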
- """ - from keras.engine import base_layer_utils # pylint: disable=g-import-not-at-top - if training is None: - training = base_layer_utils.call_context().training - - if training is None: - training = learning_phase() - - # TODO(b/138862903): Handle the case when training is tensor. - if not tf.is_tensor(training): - if training == 1 or training is True: - if callable(x): - return x() - else: - return x + """Selects `x` in train phase, and `alt` otherwise. - elif training == 0 or training is False: - if callable(alt): - return alt() - else: - return alt + Note that `alt` should have the *same shape* as `x`. - # else: assume learning phase is a placeholder tensor. - x = switch(training, x, alt) - return x + Args: + x: What to return in train phase + (tensor or callable that returns a tensor). + alt: What to return otherwise + (tensor or callable that returns a tensor). + training: Optional scalar tensor + (or Python boolean, or Python integer) + specifying the learning phase. + + Returns: + Either `x` or `alt` based on the `training` flag. + the `training` flag defaults to `K.learning_phase()`. + """ + from keras.engine import ( + base_layer_utils, + ) + if training is None: + training = base_layer_utils.call_context().training -@keras_export('keras.backend.in_test_phase') + if training is None: + training = learning_phase() + + # TODO(b/138862903): Handle the case when training is tensor. + if not tf.is_tensor(training): + if training == 1 or training is True: + if callable(x): + return x() + else: + return x + + elif training == 0 or training is False: + if callable(alt): + return alt() + else: + return alt + + # else: assume learning phase is a placeholder tensor. + x = switch(training, x, alt) + return x + + +@keras_export("keras.backend.in_test_phase") @doc_controls.do_not_generate_docs def in_test_phase(x, alt, training=None): - """Selects `x` in test phase, and `alt` otherwise. + """Selects `x` in test phase, and `alt` otherwise. - Note that `alt` should have the *same shape* as `x`. + Note that `alt` should have the *same shape* as `x`. - Args: - x: What to return in test phase - (tensor or callable that returns a tensor). - alt: What to return otherwise - (tensor or callable that returns a tensor). - training: Optional scalar tensor - (or Python boolean, or Python integer) - specifying the learning phase. + Args: + x: What to return in test phase + (tensor or callable that returns a tensor). + alt: What to return otherwise + (tensor or callable that returns a tensor). + training: Optional scalar tensor + (or Python boolean, or Python integer) + specifying the learning phase. - Returns: - Either `x` or `alt` based on `K.learning_phase`. - """ - return in_train_phase(alt, x, training=training) + Returns: + Either `x` or `alt` based on `K.learning_phase`. + """ + return in_train_phase(alt, x, training=training) # NN OPERATIONS -@keras_export('keras.backend.relu') +@keras_export("keras.backend.relu") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def relu(x, alpha=0., max_value=None, threshold=0.): - """Rectified linear unit. - - With default values, it returns element-wise `max(x, 0)`. - - Otherwise, it follows: - `f(x) = max_value` for `x >= max_value`, - `f(x) = x` for `threshold <= x < max_value`, - `f(x) = alpha * (x - threshold)` otherwise. - - Args: - x: A tensor or variable. - alpha: A scalar, slope of negative section (default=`0.`). - max_value: float. Saturation threshold. - threshold: float. Threshold value for thresholded activation. 
- - Returns: - A tensor. - """ - # While x can be a tensor or variable, we also see cases where - # numpy arrays, lists, tuples are passed as well. - # lists, tuples do not have 'dtype' attribute. - dtype = getattr(x, 'dtype', floatx()) - if alpha != 0.: - if max_value is None and threshold == 0: - return tf.nn.leaky_relu(x, alpha=alpha) +def relu(x, alpha=0.0, max_value=None, threshold=0.0): + """Rectified linear unit. - if threshold != 0: - negative_part = tf.nn.relu(-x + threshold) - else: - negative_part = tf.nn.relu(-x) + With default values, it returns element-wise `max(x, 0)`. + + Otherwise, it follows: + `f(x) = max_value` for `x >= max_value`, + `f(x) = x` for `threshold <= x < max_value`, + `f(x) = alpha * (x - threshold)` otherwise. - clip_max = max_value is not None + Args: + x: A tensor or variable. + alpha: A scalar, slope of negative section (default=`0.`). + max_value: float. Saturation threshold. + threshold: float. Threshold value for thresholded activation. - if threshold != 0: - # computes x for x > threshold else 0 - x = x * tf.cast(tf.greater(x, threshold), dtype=dtype) - elif max_value == 6: - # if no threshold, then can use nn.relu6 native TF op for performance - x = tf.nn.relu6(x) - clip_max = False - else: - x = tf.nn.relu(x) + Returns: + A tensor. + """ + # While x can be a tensor or variable, we also see cases where + # numpy arrays, lists, tuples are passed as well. + # lists, tuples do not have 'dtype' attribute. + dtype = getattr(x, "dtype", floatx()) + if alpha != 0.0: + if max_value is None and threshold == 0: + return tf.nn.leaky_relu(x, alpha=alpha) + + if threshold != 0: + negative_part = tf.nn.relu(-x + threshold) + else: + negative_part = tf.nn.relu(-x) - if clip_max: - max_value = _constant_to_tensor(max_value, x.dtype.base_dtype) - zero = _constant_to_tensor(0, x.dtype.base_dtype) - x = tf.clip_by_value(x, zero, max_value) + clip_max = max_value is not None - if alpha != 0.: - alpha = _to_tensor(alpha, x.dtype.base_dtype) - x -= alpha * negative_part - return x + if threshold != 0: + # computes x for x > threshold else 0 + x = x * tf.cast(tf.greater(x, threshold), dtype=dtype) + elif max_value == 6: + # if no threshold, then can use nn.relu6 native TF op for performance + x = tf.nn.relu6(x) + clip_max = False + else: + x = tf.nn.relu(x) + + if clip_max: + max_value = _constant_to_tensor(max_value, x.dtype.base_dtype) + zero = _constant_to_tensor(0, x.dtype.base_dtype) + x = tf.clip_by_value(x, zero, max_value) + + if alpha != 0.0: + alpha = _to_tensor(alpha, x.dtype.base_dtype) + x -= alpha * negative_part + return x -@keras_export('keras.backend.elu') +@keras_export("keras.backend.elu") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def elu(x, alpha=1.): - """Exponential linear unit. +def elu(x, alpha=1.0): + """Exponential linear unit. - Args: - x: A tensor or variable to compute the activation function for. - alpha: A scalar, slope of negative section. + Args: + x: A tensor or variable to compute the activation function for. + alpha: A scalar, slope of negative section. - Returns: - A tensor. - """ - res = tf.nn.elu(x) - if alpha == 1: - return res - else: - return tf.where(x > 0, res, alpha * res) + Returns: + A tensor. 
+ """ + res = tf.nn.elu(x) + if alpha == 1: + return res + else: + return tf.where(x > 0, res, alpha * res) -@keras_export('keras.backend.softmax') +@keras_export("keras.backend.softmax") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def softmax(x, axis=-1): - """Softmax of a tensor. + """Softmax of a tensor. + + Args: + x: A tensor or variable. + axis: The dimension softmax would be performed on. + The default is -1 which indicates the last dimension. + + Returns: + A tensor. + """ + if x.shape.rank <= 1: + raise ValueError( + f"Cannot apply softmax to a tensor that is 1D. Received input: {x}" + ) - Args: - x: A tensor or variable. - axis: The dimension softmax would be performed on. - The default is -1 which indicates the last dimension. + if isinstance(axis, int): + output = tf.nn.softmax(x, axis=axis) + else: + # nn.softmax does not support tuple axis. + numerator = tf.exp(x - tf.reduce_max(x, axis=axis, keepdims=True)) + denominator = tf.reduce_sum(numerator, axis=axis, keepdims=True) + output = numerator / denominator - Returns: - A tensor. - """ - return tf.nn.softmax(x, axis=axis) + # Cache the logits to use for crossentropy loss. + output._keras_logits = x + return output -@keras_export('keras.backend.softplus') +@keras_export("keras.backend.softplus") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def softplus(x): - """Softplus of a tensor. + """Softplus of a tensor. - Args: - x: A tensor or variable. + Args: + x: A tensor or variable. - Returns: - A tensor. - """ - return tf.math.softplus(x) + Returns: + A tensor. + """ + return tf.math.softplus(x) -@keras_export('keras.backend.softsign') +@keras_export("keras.backend.softsign") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def softsign(x): - """Softsign of a tensor. + """Softsign of a tensor. + + Args: + x: A tensor or variable. + + Returns: + A tensor. + """ + return tf.math.softsign(x) - Args: - x: A tensor or variable. - Returns: - A tensor. - """ - return tf.math.softsign(x) +def _get_logits(output, from_logits, op_type, fn_name): + output_ = output + from_logits_ = from_logits + has_keras_logits = hasattr(output, "_keras_logits") + if has_keras_logits: + output_ = output._keras_logits + from_logits_ = True -@keras_export('keras.backend.categorical_crossentropy') + from_expected_op_type = ( + not isinstance(output, (tf.__internal__.EagerTensor, tf.Variable)) + and output.op.type == op_type + ) and not has_keras_logits + + if from_expected_op_type: + # When softmax activation function is used for output operation, we + # use logits from the softmax function directly to compute loss in order + # to prevent collapsing zero when training. + # See b/117284466 + assert len(output.op.inputs) == 1 + output_ = output.op.inputs[0] + from_logits_ = True + + if from_logits and (has_keras_logits or from_expected_op_type): + warnings.warn( + f'"`{fn_name}` received `from_logits=True`, but ' + f"the `output` argument was produced by a {op_type} " + "activation and thus does not represent logits. " + "Was this intended?", + stacklevel=2, + ) + + return output_, from_logits_ + + +@keras_export("keras.backend.categorical_crossentropy") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def categorical_crossentropy(target, output, from_logits=False, axis=-1): - """Categorical crossentropy between an output tensor and a target tensor. - - Args: - target: A tensor of the same shape as `output`. 
- output: A tensor resulting from a softmax - (unless `from_logits` is True, in which - case `output` is expected to be the logits). - from_logits: Boolean, whether `output` is the - result of a softmax, or is a tensor of logits. - axis: Int specifying the channels axis. `axis=-1` corresponds to data - format `channels_last`, and `axis=1` corresponds to data format - `channels_first`. - - Returns: - Output tensor. - - Raises: - ValueError: if `axis` is neither -1 nor one of the axes of `output`. - - Example: - - >>> a = tf.constant([1., 0., 0., 0., 1., 0., 0., 0., 1.], shape=[3,3]) - >>> print(a) - tf.Tensor( - [[1. 0. 0.] - [0. 1. 0.] - [0. 0. 1.]], shape=(3, 3), dtype=float32) - >>> b = tf.constant([.9, .05, .05, .05, .89, .06, .05, .01, .94], shape=[3,3]) - >>> print(b) - tf.Tensor( - [[0.9 0.05 0.05] - [0.05 0.89 0.06] - [0.05 0.01 0.94]], shape=(3, 3), dtype=float32) - >>> loss = tf.keras.backend.categorical_crossentropy(a, b) - >>> print(np.around(loss, 5)) - [0.10536 0.11653 0.06188] - >>> loss = tf.keras.backend.categorical_crossentropy(a, a) - >>> print(np.around(loss, 5)) - [0. 0. 0.] - - """ - target = tf.convert_to_tensor(target) - output = tf.convert_to_tensor(output) - target.shape.assert_is_compatible_with(output.shape) - - # Use logits whenever they are available. `softmax` and `sigmoid` - # activations cache logits on the `output` Tensor. - if hasattr(output, '_keras_logits'): - output = output._keras_logits # pylint: disable=protected-access + """Categorical crossentropy between an output tensor and a target tensor. + + Args: + target: A tensor of the same shape as `output`. + output: A tensor resulting from a softmax + (unless `from_logits` is True, in which + case `output` is expected to be the logits). + from_logits: Boolean, whether `output` is the + result of a softmax, or is a tensor of logits. + axis: Int specifying the channels axis. `axis=-1` corresponds to data + format `channels_last`, and `axis=1` corresponds to data format + `channels_first`. + + Returns: + Output tensor. + + Raises: + ValueError: if `axis` is neither -1 nor one of the axes of `output`. + + Example: + + >>> a = tf.constant([1., 0., 0., 0., 1., 0., 0., 0., 1.], shape=[3,3]) + >>> print(a) + tf.Tensor( + [[1. 0. 0.] + [0. 1. 0.] + [0. 0. 1.]], shape=(3, 3), dtype=float32) + >>> b = tf.constant([.9, .05, .05, .05, .89, .06, .05, .01, .94], + ... shape=[3, 3]) + >>> print(b) + tf.Tensor( + [[0.9 0.05 0.05] + [0.05 0.89 0.06] + [0.05 0.01 0.94]], shape=(3, 3), dtype=float32) + >>> loss = tf.keras.backend.categorical_crossentropy(a, b) + >>> print(np.around(loss, 5)) + [0.10536 0.11653 0.06188] + >>> loss = tf.keras.backend.categorical_crossentropy(a, a) + >>> print(np.around(loss, 5)) + [0. 0. 0.] + + """ + target = tf.convert_to_tensor(target) + output = tf.convert_to_tensor(output) + target.shape.assert_is_compatible_with(output.shape) + + output, from_logits = _get_logits( + output, from_logits, "Softmax", "categorical_crossentropy" + ) if from_logits: - warnings.warn( - '"`categorical_crossentropy` received `from_logits=True`, but ' - 'the `output` argument was produced by a sigmoid or softmax ' - 'activation and thus does not represent logits. 
Was this intended?"', - stacklevel=2) - from_logits = True - - if from_logits: - return tf.nn.softmax_cross_entropy_with_logits( - labels=target, logits=output, axis=axis) - - if (not isinstance(output, (tf.__internal__.EagerTensor, tf.Variable)) and - output.op.type == 'Softmax') and not hasattr(output, '_keras_history'): - # When softmax activation function is used for output operation, we - # use logits from the softmax function directly to compute loss in order - # to prevent collapsing zero when training. - # See b/117284466 - assert len(output.op.inputs) == 1 - output = output.op.inputs[0] - return tf.nn.softmax_cross_entropy_with_logits( - labels=target, logits=output, axis=axis) - - # scale preds so that the class probas of each sample sum to 1 - output = output / tf.reduce_sum(output, axis, True) - # Compute cross entropy from probabilities. - epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) - output = tf.clip_by_value(output, epsilon_, 1. - epsilon_) - return -tf.reduce_sum(target * tf.math.log(output), axis) - - -@keras_export('keras.backend.sparse_categorical_crossentropy') + return tf.nn.softmax_cross_entropy_with_logits( + labels=target, logits=output, axis=axis + ) + + # Adjust the predictions so that the probability of + # each class for every sample adds up to 1 + # This is needed to ensure that the cross entropy is + # computed correctly. + output = output / tf.reduce_sum(output, axis, True) + + # Compute cross entropy from probabilities. + epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) + output = tf.clip_by_value(output, epsilon_, 1.0 - epsilon_) + return -tf.reduce_sum(target * tf.math.log(output), axis) + + +@keras_export("keras.backend.categorical_focal_crossentropy") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1): - """Categorical crossentropy with integer targets. - - Args: - target: An integer tensor. - output: A tensor resulting from a softmax - (unless `from_logits` is True, in which - case `output` is expected to be the logits). - from_logits: Boolean, whether `output` is the - result of a softmax, or is a tensor of logits. - axis: Int specifying the channels axis. `axis=-1` corresponds to data - format `channels_last`, and `axis=1` corresponds to data format - `channels_first`. - - Returns: - Output tensor. - - Raises: - ValueError: if `axis` is neither -1 nor one of the axes of `output`. - """ - target = tf.convert_to_tensor(target) - output = tf.convert_to_tensor(output) - - # Use logits whenever they are available. `softmax` and `sigmoid` - # activations cache logits on the `output` Tensor. - if hasattr(output, '_keras_logits'): - output = output._keras_logits # pylint: disable=protected-access +def categorical_focal_crossentropy( + target, + output, + alpha=0.25, + gamma=2.0, + from_logits=False, + axis=-1, +): + """Computes the alpha balanced focal crossentropy loss. + + According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it + helps to apply a focal factor to down-weight easy examples and focus more on + hard examples. The general formula for the focal loss (FL) + is as follows: + + `FL(p_t) = (1 − p_t)^gamma * log(p_t)` + + where `p_t` is defined as follows: + `p_t = output if y_true == 1, else 1 - output` + + `(1 − p_t)^gamma` is the `modulating_factor`, where `gamma` is a focusing + parameter. When `gamma` = 0, there is no focal effect on the cross entropy. 
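With the new `_get_logits` helper, `categorical_crossentropy` accepts either probabilities (the default) or raw logits. A quick check of the two modes, using only the public API:

```python
import tensorflow as tf
from keras import backend as K

target = tf.constant([[1.0, 0.0, 0.0]])
logits = tf.constant([[2.0, 1.0, 0.1]])

loss_logits = K.categorical_crossentropy(target, logits, from_logits=True)
loss_probs = K.categorical_crossentropy(target, tf.nn.softmax(logits))
print(loss_logits.numpy(), loss_probs.numpy())  # agree up to clipping/precision
```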
+ `gamma` reduces the importance given to simple examples in a smooth manner. + + The authors use alpha-balanced variant of focal loss (FL) in the paper: + `FL(p_t) = −alpha * (1 − p_t)^gamma * log(p_t)` + + where `alpha` is the weight factor for the classes. If `alpha` = 1, the + loss won't be able to handle class imbalance properly as all + classes will have the same weight. This can be a constant or a list of + constants. If alpha is a list, it must have the same length as the number + of classes. + + The formula above can be generalized to: + `FL(p_t) = alpha * (1 − p_t)^gamma * CrossEntropy(target, output)` + + where minus comes from `CrossEntropy(target, output)` (CE). + + Extending this to multi-class case is straightforward: + `FL(p_t) = alpha * (1 − p_t)^gamma * CategoricalCE(target, output)` + + Args: + target: Ground truth values from the dataset. + output: Predictions of the model. + alpha: A weight balancing factor for all classes, default is `0.25` as + mentioned in the reference. It can be a list of floats or a scalar. + In the multi-class case, alpha may be set by inverse class + frequency by using `compute_class_weight` from `sklearn.utils`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. It helps to gradually reduce the importance given to + simple examples in a smooth manner. + from_logits: Whether `output` is expected to be a logits tensor. By + default, we consider that `output` encodes a probability + distribution. + axis: Int specifying the channels axis. `axis=-1` corresponds to data + format `channels_last`, and `axis=1` corresponds to data format + `channels_first`. + + Returns: + A tensor. + """ + target = tf.convert_to_tensor(target) + output = tf.convert_to_tensor(output) + target.shape.assert_is_compatible_with(output.shape) + + output, from_logits = _get_logits( + output, from_logits, "Softmax", "categorical_focal_crossentropy" + ) + if from_logits: - warnings.warn( - '"`sparse_categorical_crossentropy` received `from_logits=True`, but ' - 'the `output` argument was produced by a sigmoid or softmax ' - 'activation and thus does not represent logits. Was this intended?"', - stacklevel=2) - from_logits = True - elif (not from_logits and - not isinstance(output, (tf.__internal__.EagerTensor, tf.Variable)) and - output.op.type == 'Softmax') and not hasattr(output, '_keras_history'): - # When softmax activation function is used for output operation, we - # use logits from the softmax function directly to compute loss in order - # to prevent collapsing zero when training. - # See b/117284466 - assert len(output.op.inputs) == 1 - output = output.op.inputs[0] - from_logits = True - elif not from_logits: + output = tf.nn.softmax(output, axis=axis) + + # Adjust the predictions so that the probability of + # each class for every sample adds up to 1 + # This is needed to ensure that the cross entropy is + # computed correctly. 
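The body below then normalizes, clips, and applies the focal weighting. The arithmetic it implements, traced by hand on a toy example (illustrative values only, not from the patch):

```python
import tensorflow as tf

target = tf.constant([[0.0, 1.0, 0.0]])
output = tf.constant([[0.1, 0.8, 0.1]])  # already sums to 1
alpha, gamma = 0.25, 2.0

cce = -target * tf.math.log(output)                # plain crossentropy terms
focal = alpha * tf.pow(1.0 - output, gamma) * cce  # alpha * (1 - p_t)^gamma
print(tf.reduce_sum(focal, axis=-1).numpy())       # 0.25 * 0.2**2 * 0.2231 ~ 0.0022
```

The confident prediction (`p_t = 0.8`) is down-weighted by `(1 - 0.8)**2 = 0.04` relative to plain crossentropy, which is the intended focal effect.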
+ output = output / tf.reduce_sum(output, axis=axis, keepdims=True) + epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) - output = tf.clip_by_value(output, epsilon_, 1 - epsilon_) - output = tf.math.log(output) - - if isinstance(output.shape, (tuple, list)): - output_rank = len(output.shape) - else: - output_rank = output.shape.ndims - if output_rank is not None: - axis %= output_rank - if axis != output_rank - 1: - permutation = list( - itertools.chain(range(axis), range(axis + 1, output_rank), [axis])) - output = tf.compat.v1.transpose(output, perm=permutation) - elif axis != -1: - raise ValueError( - 'Cannot compute sparse categorical crossentropy with `axis={}` on an ' - 'output tensor with unknown rank'.format(axis)) - - target = cast(target, 'int64') - - # Try to adjust the shape so that rank of labels = rank of logits - 1. - output_shape = tf.shape(output) - target_rank = target.shape.ndims - - update_shape = ( - target_rank is not None and output_rank is not None and - target_rank != output_rank - 1) - if update_shape: - target = flatten(target) - output = tf.reshape(output, [-1, output_shape[-1]]) - - if py_any(_is_symbolic_tensor(v) for v in [target, output]): - with get_graph().as_default(): - res = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=target, logits=output) - else: - res = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=target, logits=output) - - if update_shape and output_rank >= 3: - # If our output includes timesteps or spatial dimensions we need to reshape - return tf.reshape(res, output_shape[:-1]) - else: + output = tf.clip_by_value(output, epsilon_, 1.0 - epsilon_) + + # Calculate cross entropy + cce = -target * tf.math.log(output) + + # Calculate factors + modulating_factor = tf.pow(1.0 - output, gamma) + weighting_factor = tf.multiply(modulating_factor, alpha) + + # Apply weighting factor + focal_cce = tf.multiply(weighting_factor, cce) + focal_cce = tf.reduce_sum(focal_cce, axis=axis) + return focal_cce + + +@keras_export("keras.backend.sparse_categorical_crossentropy") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def sparse_categorical_crossentropy( + target, output, from_logits=False, axis=-1, ignore_class=None +): + """Categorical crossentropy with integer targets. + + Args: + target: An integer tensor. + output: A tensor resulting from a softmax + (unless `from_logits` is True, in which + case `output` is expected to be the logits). + from_logits: Boolean, whether `output` is the + result of a softmax, or is a tensor of logits. + axis: Int specifying the channels axis. `axis=-1` corresponds to data + format `channels_last`, and `axis=1` corresponds to data format + `channels_first`. + ignore_class: Optional integer. The ID of a class to be ignored + during loss computation. This is useful, for example, in + segmentation problems featuring a "void" class (commonly -1 + or 255) in segmentation maps. + By default (`ignore_class=None`), all classes are considered. + + Returns: + Output tensor. + + Raises: + ValueError: if `axis` is neither -1 nor one of the axes of `output`. 
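A usage sketch for the new `ignore_class` argument documented above: targets carrying the ignored ID (here -1, a common "void" label) contribute zero loss, and the validity mask is attached to the result as `_keras_mask`.

```python
import tensorflow as tf
from keras import backend as K

target = tf.constant([0, 2, -1], dtype=tf.int64)  # last entry is "void"
logits = tf.random.normal([3, 4])

loss = K.sparse_categorical_crossentropy(
    target, logits, from_logits=True, ignore_class=-1
)
print(loss.numpy())  # third entry is 0.0; loss._keras_mask marks valid rows
```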
+ """ + target = tf.convert_to_tensor(target) + output = tf.convert_to_tensor(output) + + target = cast(target, "int64") + + output, from_logits = _get_logits( + output, from_logits, "Softmax", "sparse_categorical_crossentropy" + ) + if not from_logits: + epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) + output = tf.clip_by_value(output, epsilon_, 1 - epsilon_) + output = tf.math.log(output) + + # Permute output so that the last axis contains the logits/probabilities. + if isinstance(output.shape, (tuple, list)): + output_rank = len(output.shape) + else: + output_rank = output.shape.ndims + if output_rank is not None: + axis %= output_rank + if axis != output_rank - 1: + permutation = list( + itertools.chain( + range(axis), range(axis + 1, output_rank), [axis] + ) + ) + output = tf.compat.v1.transpose(output, perm=permutation) + elif axis != -1: + raise ValueError( + "Cannot compute sparse categorical crossentropy with `axis={}` " + "on an output tensor with unknown rank".format(axis) + ) + + # Try to adjust the shape so that rank of labels = rank of logits - 1. + output_shape = tf.shape(output) + target_rank = target.shape.ndims + + update_shape = ( + target_rank is not None + and output_rank is not None + and target_rank != output_rank - 1 + ) + if update_shape: + target = flatten(target) + output = tf.reshape(output, [-1, output_shape[-1]]) + + if ignore_class is not None: + valid_mask = tf.not_equal(target, cast(ignore_class, target.dtype)) + target = target[valid_mask] + output = output[valid_mask] + + if py_any(_is_symbolic_tensor(v) for v in [target, output]): + with get_graph().as_default(): + res = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=target, logits=output + ) + else: + res = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=target, logits=output + ) + + if ignore_class is not None: + res_shape = cast(output_shape[:-1], "int64") + valid_mask = tf.reshape(valid_mask, res_shape) + res = tf.scatter_nd(tf.where(valid_mask), res, res_shape) + res._keras_mask = valid_mask + + return res + + if update_shape and output_rank >= 3: + # If our output includes timesteps or + # spatial dimensions we need to reshape + res = tf.reshape(res, output_shape[:-1]) + return res -@keras_export('keras.backend.binary_crossentropy') +@keras_export("keras.backend.binary_crossentropy") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def binary_crossentropy(target, output, from_logits=False): - """Binary crossentropy between an output tensor and a target tensor. - - Args: - target: A tensor with the same shape as `output`. - output: A tensor. - from_logits: Whether `output` is expected to be a logits tensor. - By default, we consider that `output` - encodes a probability distribution. - - Returns: - A tensor. - """ - target = tf.convert_to_tensor(target) - output = tf.convert_to_tensor(output) - - # Use logits whenever they are available. `softmax` and `sigmoid` - # activations cache logits on the `output` Tensor. - if hasattr(output, '_keras_logits'): - output = output._keras_logits # pylint: disable=protected-access + """Binary crossentropy between an output tensor and a target tensor. + + Args: + target: A tensor with the same shape as `output`. + output: A tensor. + from_logits: Whether `output` is expected to be a logits tensor. + By default, we consider that `output` + encodes a probability distribution. + + Returns: + A tensor. 
+ """ + target = tf.convert_to_tensor(target) + output = tf.convert_to_tensor(output) + + output, from_logits = _get_logits( + output, from_logits, "Sigmoid", "binary_crossentropy" + ) if from_logits: - warnings.warn( - '"`binary_crossentropy` received `from_logits=True`, but the `output`' - ' argument was produced by a sigmoid or softmax activation and thus ' - 'does not represent logits. Was this intended?"', - stacklevel=2) - from_logits = True - - if from_logits: - return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output) - - if (not isinstance(output, (tf.__internal__.EagerTensor, tf.Variable)) and - output.op.type == 'Sigmoid') and not hasattr(output, '_keras_history'): - # When sigmoid activation function is used for output operation, we - # use logits from the sigmoid function directly to compute loss in order - # to prevent collapsing zero when training. - assert len(output.op.inputs) == 1 - output = output.op.inputs[0] - return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output) - - epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) - output = tf.clip_by_value(output, epsilon_, 1. - epsilon_) - - # Compute cross entropy from probabilities. - bce = target * tf.math.log(output + epsilon()) - bce += (1 - target) * tf.math.log(1 - output + epsilon()) - return -bce - - -@keras_export('keras.backend.binary_focal_crossentropy') + return tf.nn.sigmoid_cross_entropy_with_logits( + labels=target, logits=output + ) + + epsilon_ = _constant_to_tensor(epsilon(), output.dtype.base_dtype) + output = tf.clip_by_value(output, epsilon_, 1.0 - epsilon_) + + # Compute cross entropy from probabilities. + bce = target * tf.math.log(output + epsilon()) + bce += (1 - target) * tf.math.log(1 - output + epsilon()) + return -bce + + +@keras_export("keras.backend.binary_focal_crossentropy") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def binary_focal_crossentropy( target, output, + apply_class_balancing=False, + alpha=0.25, gamma=2.0, from_logits=False, ): - """Binary focal crossentropy between an output tensor and a target tensor. - - According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it - helps to apply a focal factor to down-weight easy examples and focus more on - hard examples. By default, the focal tensor is computed as follows: - - `focal_factor = (1 - output)**gamma` for class 1 - `focal_factor = output**gamma` for class 0 - where `gamma` is a focusing parameter. When `gamma` = 0, this function is - equivalent to the binary crossentropy. - - Args: - target: A tensor with the same shape as `output`. - output: A tensor. - gamma: A focusing parameter used to compute the focal factor, default is 2.0 - as mentioned in reference. - from_logits: Whether `output` is expected to be a logits tensor. By default, - we consider that `output` encodes a probability distribution. - - Returns: - A tensor. - """ - sigmoidal = tf.__internal__.smart_cond.smart_cond( - from_logits, - lambda: sigmoid(output), - lambda: output, - ) - p_t = (target * sigmoidal) + ((1 - target) * (1 - sigmoidal)) - # Calculate focal factor - focal_factor = tf.pow(1.0 - p_t, gamma) - # Binary crossentropy - bce = binary_crossentropy( - target=target, - output=output, - from_logits=from_logits, - ) - return focal_factor * bce - - -@keras_export('keras.backend.binary_weighted_focal_crossentropy') + """Binary focal crossentropy between an output tensor and a target tensor. 
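The `Sigmoid` op-type lookup above pairs with the `_keras_logits` caching that this patch adds to `sigmoid` further down: probabilities produced by `K.sigmoid` still let `binary_crossentropy` recover the raw logits and take the numerically stable branch. A sketch against the public API:

```python
import tensorflow as tf
from keras import backend as K

logits = tf.constant([0.5, -1.2, 3.0])
probs = K.sigmoid(logits)  # caches `_keras_logits` on the output tensor

target = tf.constant([1.0, 0.0, 1.0])
bce = K.binary_crossentropy(target, probs)  # silently uses the cached logits
ref = tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=logits)
print(bce.numpy(), ref.numpy())  # match
```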
+ + According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it + helps to apply a focal factor to down-weight easy examples and focus more on + hard examples. By default, the focal tensor is computed as follows: + + `focal_factor = (1 - output) ** gamma` for class 1 + `focal_factor = output ** gamma` for class 0 + where `gamma` is a focusing parameter. When `gamma` = 0, there is no focal + effect on the binary crossentropy. + + If `apply_class_balancing == True`, this function also takes into account a + weight balancing factor for the binary classes 0 and 1 as follows: + + `weight = alpha` for class 1 (`target == 1`) + `weight = 1 - alpha` for class 0 + where `alpha` is a float in the range of `[0, 1]`. + + Args: + target: A tensor with the same shape as `output`. + output: A tensor. + apply_class_balancing: A bool, whether to apply weight balancing on the + binary classes 0 and 1. + alpha: A weight balancing factor for class 1, default is `0.25` as + mentioned in the reference. The weight for class 0 is `1.0 - alpha`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. + from_logits: Whether `output` is expected to be a logits tensor. By + default, we consider that `output` encodes a probability + distribution. + + Returns: + A tensor. + """ + + sigmoidal = sigmoid(output) if from_logits else output + + p_t = target * sigmoidal + (1 - target) * (1 - sigmoidal) + + # Calculate focal factor + focal_factor = tf.pow(1.0 - p_t, gamma) + + # Binary crossentropy + bce = binary_crossentropy( + target=target, + output=output, + from_logits=from_logits, + ) + focal_bce = focal_factor * bce + + if apply_class_balancing: + weight = target * alpha + (1 - target) * (1 - alpha) + focal_bce = weight * focal_bce + + return focal_bce + + +@keras_export("keras.backend.sigmoid") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def sigmoid(x): + """Element-wise sigmoid. + + Args: + x: A tensor or variable. + + Returns: + A tensor. + """ + output = tf.sigmoid(x) + # Cache the logits to use for crossentropy loss. + output._keras_logits = x + return output + + +@keras_export("keras.backend.hard_sigmoid") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def hard_sigmoid(x): + """Segment-wise linear approximation of sigmoid. + + Faster than sigmoid. + Returns `0.` if `x < -2.5`, `1.` if `x > 2.5`. + In `-2.5 <= x <= 2.5`, returns `0.2 * x + 0.5`. + + Args: + x: A tensor or variable. + + Returns: + A tensor. + """ + point_two = _constant_to_tensor(0.2, x.dtype.base_dtype) + point_five = _constant_to_tensor(0.5, x.dtype.base_dtype) + x = tf.multiply(x, point_two) + x = tf.add(x, point_five) + x = tf.clip_by_value(x, 0.0, 1.0) + return x + + +@keras_export("keras.backend.tanh") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def tanh(x): + """Element-wise tanh. + + Args: + x: A tensor or variable. + + Returns: + A tensor. + """ + return tf.tanh(x) + + +@keras_export("keras.backend.dropout") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def dropout(x, level, noise_shape=None, seed=None): + """Sets entries in `x` to zero at random, while scaling the entire tensor. + + Args: + x: tensor + level: fraction of the entries in the tensor + that will be set to 0. + noise_shape: shape for randomly generated keep/drop flags, + must be broadcastable to the shape of `x` + seed: random seed to ensure determinism. + + Returns: + A tensor. 
+ """ + if seed is None: + seed = np.random.randint(10e6) + return tf.nn.dropout(x, rate=level, noise_shape=noise_shape, seed=seed) + + +@keras_export("keras.backend.l2_normalize") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def l2_normalize(x, axis=None): + """Normalizes a tensor wrt the L2 norm alongside the specified axis. + + Args: + x: Tensor or variable. + axis: axis along which to perform normalization. + + Returns: + A tensor. + """ + return tf.linalg.l2_normalize(x, axis=axis) + + +@keras_export("keras.backend.in_top_k") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def in_top_k(predictions, targets, k): + """Returns whether the `targets` are in the top `k` `predictions`. + + Args: + predictions: A tensor of shape `(batch_size, classes)` and type + `float32`. + targets: A 1D tensor of length `batch_size` and type `int32` or `int64`. + k: An `int`, number of top elements to consider. + + Returns: + A 1D tensor of length `batch_size` and type `bool`. + `output[i]` is `True` if `predictions[i, targets[i]]` is within top-`k` + values of `predictions[i]`. + """ + return tf.compat.v1.math.in_top_k(predictions, targets, k) + + +# CONVOLUTIONS + + +def _preprocess_conv1d_input(x, data_format): + """Transpose and cast the input before the conv1d. + + Args: + x: input tensor. + data_format: string, `"channels_last"` or `"channels_first"`. + + Returns: + A tensor. + """ + tf_data_format = "NWC" # to pass TF Conv2dNative operations + if data_format == "channels_first": + if not _has_nchw_support(): + x = tf.compat.v1.transpose(x, (0, 2, 1)) # NCW -> NWC + else: + tf_data_format = "NCW" + return x, tf_data_format + + +def _preprocess_conv2d_input(x, data_format, force_transpose=False): + """Transpose and cast the input before the conv2d. + + Args: + x: input tensor. + data_format: string, `"channels_last"` or `"channels_first"`. + force_transpose: Boolean. If True, the input will always be transposed + from NCHW to NHWC if `data_format` is `"channels_first"`. + If False, the transposition only occurs on CPU (GPU ops are + assumed to support NCHW). + + Returns: + A tensor. + """ + tf_data_format = "NHWC" + if data_format == "channels_first": + if not _has_nchw_support() or force_transpose: + x = tf.compat.v1.transpose(x, (0, 2, 3, 1)) # NCHW -> NHWC + else: + tf_data_format = "NCHW" + return x, tf_data_format + + +def _preprocess_conv3d_input(x, data_format): + """Transpose and cast the input before the conv3d. + + Args: + x: input tensor. + data_format: string, `"channels_last"` or `"channels_first"`. + + Returns: + A tensor. + """ + tf_data_format = "NDHWC" + if data_format == "channels_first": + if not _has_nchw_support(): + x = tf.compat.v1.transpose(x, (0, 2, 3, 4, 1)) + else: + tf_data_format = "NCDHW" + return x, tf_data_format + + +def _preprocess_padding(padding): + """Convert keras' padding to TensorFlow's padding. + + Args: + padding: string, one of 'same' , 'valid' + + Returns: + a string, one of 'SAME', 'VALID'. + + Raises: + ValueError: if invalid `padding'` + """ + if padding == "same": + padding = "SAME" + elif padding == "valid": + padding = "VALID" + else: + raise ValueError("Invalid padding: " + str(padding)) + return padding + + +@keras_export("keras.backend.conv1d") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def conv1d( + x, kernel, strides=1, padding="valid", data_format=None, dilation_rate=1 +): + """1D convolution. + + Args: + x: Tensor or variable. 
+ kernel: kernel tensor. + strides: stride integer. + padding: string, `"same"`, `"causal"` or `"valid"`. + data_format: string, one of "channels_last", "channels_first". + dilation_rate: integer dilate rate. + + Returns: + A tensor, result of 1D convolution. + + Raises: + ValueError: if `data_format` is neither `channels_last` or + `channels_first`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + kernel_shape = kernel.shape.as_list() + if padding == "causal": + # causal (dilated) convolution: + left_pad = dilation_rate * (kernel_shape[0] - 1) + x = temporal_padding(x, (left_pad, 0)) + padding = "valid" + padding = _preprocess_padding(padding) + + x, tf_data_format = _preprocess_conv1d_input(x, data_format) + x = tf.compat.v1.nn.convolution( + input=x, + filter=kernel, + dilation_rate=dilation_rate, + strides=strides, + padding=padding, + data_format=tf_data_format, + ) + if data_format == "channels_first" and tf_data_format == "NWC": + x = tf.compat.v1.transpose(x, (0, 2, 1)) # NWC -> NCW + return x + + +@keras_export("keras.backend.conv2d") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def conv2d( + x, + kernel, + strides=(1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1), +): + """2D convolution. + + Args: + x: Tensor or variable. + kernel: kernel tensor. + strides: strides tuple. + padding: string, `"same"` or `"valid"`. + data_format: `"channels_last"` or `"channels_first"`. + dilation_rate: tuple of 2 integers. + + Returns: + A tensor, result of 2D convolution. + + Raises: + ValueError: if `data_format` is neither `channels_last` or + `channels_first`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + x, tf_data_format = _preprocess_conv2d_input(x, data_format) + padding = _preprocess_padding(padding) + x = tf.compat.v1.nn.convolution( + input=x, + filter=kernel, + dilation_rate=dilation_rate, + strides=strides, + padding=padding, + data_format=tf_data_format, + ) + if data_format == "channels_first" and tf_data_format == "NHWC": + x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW + return x + + +@keras_export("keras.backend.conv2d_transpose") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def conv2d_transpose( + x, + kernel, + output_shape, + strides=(1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1), +): + """2D deconvolution (i.e. + + transposed convolution). + + Args: + x: Tensor or variable. + kernel: kernel tensor. + output_shape: 1D int tensor for the output shape. + strides: strides tuple. + padding: string, `"same"` or `"valid"`. + data_format: string, `"channels_last"` or `"channels_first"`. + dilation_rate: Tuple of 2 integers. + + Returns: + A tensor, result of transposed 2D convolution. + + Raises: + ValueError: if `data_format` is neither `channels_last` or + `channels_first`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + # `atrous_conv2d_transpose` only supports NHWC format, even on GPU. 
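+ # (So when a dilated transposed convolution is requested under
+ # `channels_first`, the input must be moved to NHWC regardless of device
+ # support; the `force_transpose` flag below enforces exactly that.)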
+ if data_format == "channels_first" and dilation_rate != (1, 1): + force_transpose = True + else: + force_transpose = False + + x, tf_data_format = _preprocess_conv2d_input( + x, data_format, force_transpose + ) + + if data_format == "channels_first" and tf_data_format == "NHWC": + output_shape = ( + output_shape[0], + output_shape[2], + output_shape[3], + output_shape[1], + ) + if output_shape[0] is None: + output_shape = (shape(x)[0],) + tuple(output_shape[1:]) + + if isinstance(output_shape, (tuple, list)): + output_shape = tf.stack(list(output_shape)) + + padding = _preprocess_padding(padding) + if tf_data_format == "NHWC": + strides = (1,) + strides + (1,) + else: + strides = (1, 1) + strides + + if dilation_rate == (1, 1): + x = tf.compat.v1.nn.conv2d_transpose( + x, + kernel, + output_shape, + strides, + padding=padding, + data_format=tf_data_format, + ) + else: + if dilation_rate[0] != dilation_rate[1]: + raise ValueError( + "Expected the 2 dimensions of the `dilation_rate` argument " + "to be equal to each other. " + f"Received: dilation_rate={dilation_rate}" + ) + x = tf.nn.atrous_conv2d_transpose( + x, kernel, output_shape, rate=dilation_rate[0], padding=padding + ) + if data_format == "channels_first" and tf_data_format == "NHWC": + x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW + return x + + +def separable_conv1d( + x, + depthwise_kernel, + pointwise_kernel, + strides=1, + padding="valid", + data_format=None, + dilation_rate=1, +): + """1D convolution with separable filters. + + Args: + x: input tensor + depthwise_kernel: convolution kernel for the depthwise convolution. + pointwise_kernel: kernel for the 1x1 convolution. + strides: stride integer. + padding: string, `"same"` or `"valid"`. + data_format: string, `"channels_last"` or `"channels_first"`. + dilation_rate: integer dilation rate. + + Returns: + Output tensor. + + Raises: + ValueError: if `data_format` is neither `channels_last` or + `channels_first`. 
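+
+ Example (a shape sketch; `backend` is an assumed alias for this module):
+
+ >>> x = tf.ones((4, 10, 3))  # (batch, steps, channels)
+ >>> dw = tf.ones((5, 3, 1))  # (kernel_size, channels, depth_multiplier)
+ >>> pw = tf.ones((1, 3, 8))  # (1, channels * multiplier, filters)
+ >>> backend.separable_conv1d(x, dw, pw, padding="same").shape
+ TensorShape([4, 10, 8])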
+ """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + if isinstance(strides, int): + strides = (strides,) + if isinstance(dilation_rate, int): + dilation_rate = (dilation_rate,) + + x, tf_data_format = _preprocess_conv1d_input(x, data_format) + padding = _preprocess_padding(padding) + if not isinstance(strides, tuple): + strides = tuple(strides) + if tf_data_format == "NWC": + spatial_start_dim = 1 + strides = (1,) + strides * 2 + (1,) + else: + spatial_start_dim = 2 + strides = (1, 1) + strides * 2 + x = tf.expand_dims(x, spatial_start_dim) + depthwise_kernel = tf.expand_dims(depthwise_kernel, 0) + pointwise_kernel = tf.expand_dims(pointwise_kernel, 0) + dilation_rate = (1,) + dilation_rate + + x = tf.nn.separable_conv2d( + x, + depthwise_kernel, + pointwise_kernel, + strides=strides, + padding=padding, + dilations=dilation_rate, + data_format=tf_data_format, + ) + + x = tf.squeeze(x, [spatial_start_dim]) + + if data_format == "channels_first" and tf_data_format == "NWC": + x = tf.compat.v1.transpose(x, (0, 2, 1)) # NWC -> NCW + + return x + + +@keras_export("keras.backend.separable_conv2d") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def binary_weighted_focal_crossentropy( - target, - output, - alpha=0.25, - gamma=2.0, - from_logits=False, +def separable_conv2d( + x, + depthwise_kernel, + pointwise_kernel, + strides=(1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1), ): - """Binary weighted focal crossentropy between an output tensor and a target. - - According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it - helps to apply a focal factor to down-weight easy examples and focus more on - hard examples. By default, the focal tensor is computed as follows: - - `focal_factor = (1 - output)**gamma` for class 1 - `focal_factor = output**gamma` for class 0 - where `gamma` is a focusing parameter. When `gamma` = 0, there is no focal - effect on the binary crossentropy. - - This function also takes into account a weight balancing factor for the binary - classes 0 and 1 as follows: - - `weight = alpha` for class 1 (`target` = 1) - `weight = 1 - alpha` for class 0 - where `alpha` is a float in the range of [0, 1]. - - Args: - target: A tensor with the same shape as `output`. - output: A tensor. - alpha: A weight balancing factor for class 1, default is 0.25 as mentioned - in reference. The weight for class 0 is 1.0 - `alpha`. - gamma: A focusing parameter, default is 2.0 as mentioned in reference. - from_logits: Whether `output` is expected to be a logits tensor. By default, - we consider that `output` encodes a probability distribution. - - Returns: - A tensor. - """ - # Balancing weight for the binary classes - weight = target * alpha + (1 - target) * (1 - alpha) - - # Binary focal crossentropy - bfce = binary_focal_crossentropy( - target=target, - output=output, - gamma=gamma, - from_logits=from_logits, - ) - return weight * bfce - - -@keras_export('keras.backend.sigmoid') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def sigmoid(x): - """Element-wise sigmoid. + """2D convolution with separable filters. + + Args: + x: input tensor + depthwise_kernel: convolution kernel for the depthwise convolution. + pointwise_kernel: kernel for the 1x1 convolution. + strides: strides tuple (length 2). + padding: string, `"same"` or `"valid"`. 
+ data_format: string, `"channels_last"` or `"channels_first"`.
+ dilation_rate: tuple of integers,
+ dilation rates for the separable convolution.
- Args:
- x: A tensor or variable.
+ Returns:
+ Output tensor.
+
+ Raises:
+ ValueError: if `data_format` is neither `channels_last` nor
+ `channels_first`.
+ ValueError: if `strides` is not a tuple of 2 integers.
+ """
+ if data_format is None:
+ data_format = image_data_format()
+ if data_format not in {"channels_first", "channels_last"}:
+ raise ValueError("Unknown data_format: " + str(data_format))
+ if len(strides) != 2:
+ raise ValueError("`strides` must be a tuple of 2 integers.")
+
+ x, tf_data_format = _preprocess_conv2d_input(x, data_format)
+ padding = _preprocess_padding(padding)
+ if not isinstance(strides, tuple):
+ strides = tuple(strides)
+ if tf_data_format == "NHWC":
+ strides = (1,) + strides + (1,)
+ else:
+ strides = (1, 1) + strides
- Returns:
- A tensor.
- """
- return tf.sigmoid(x)
+ x = tf.nn.separable_conv2d(
+ x,
+ depthwise_kernel,
+ pointwise_kernel,
+ strides=strides,
+ padding=padding,
+ dilations=dilation_rate,
+ data_format=tf_data_format,
+ )
+ if data_format == "channels_first" and tf_data_format == "NHWC":
+ x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW
+ return x
-@keras_export('keras.backend.hard_sigmoid')
+@keras_export("keras.backend.depthwise_conv2d")
@tf.__internal__.dispatch.add_dispatch_support
@doc_controls.do_not_generate_docs
-def hard_sigmoid(x):
- """Segment-wise linear approximation of sigmoid.
+def depthwise_conv2d(
+ x,
+ depthwise_kernel,
+ strides=(1, 1),
+ padding="valid",
+ data_format=None,
+ dilation_rate=(1, 1),
+):
+ """Depthwise 2D convolution.
+
+ Args:
+ x: input tensor
+ depthwise_kernel: convolution kernel for the depthwise convolution.
+ strides: strides tuple (length 2).
+ padding: string, `"same"` or `"valid"`.
+ data_format: string, `"channels_last"` or `"channels_first"`.
+ dilation_rate: tuple of integers,
+ dilation rates for the depthwise convolution.
- Faster than sigmoid.
- Returns `0.` if `x < -2.5`, `1.` if `x > 2.5`.
- In `-2.5 <= x <= 2.5`, returns `0.2 * x + 0.5`.
+ Returns:
+ Output tensor.
- Args:
- x: A tensor or variable.
+ Raises:
+ ValueError: if `data_format` is neither `channels_last` nor
+ `channels_first`.
+ """
+ if data_format is None:
+ data_format = image_data_format()
+ if data_format not in {"channels_first", "channels_last"}:
+ raise ValueError("Unknown data_format: " + str(data_format))
+
+ x, tf_data_format = _preprocess_conv2d_input(x, data_format)
+ padding = _preprocess_padding(padding)
+ if tf_data_format == "NHWC":
+ strides = (1,) + strides + (1,)
+ else:
+ strides = (1, 1) + strides
- Returns:
- A tensor.
- """
- point_two = _constant_to_tensor(0.2, x.dtype.base_dtype)
- point_five = _constant_to_tensor(0.5, x.dtype.base_dtype)
- x = tf.multiply(x, point_two)
- x = tf.add(x, point_five)
- x = tf.clip_by_value(x, 0., 1.)
- return x
+ x = tf.nn.depthwise_conv2d(
+ x,
+ depthwise_kernel,
+ strides=strides,
+ padding=padding,
+ dilations=dilation_rate,
+ data_format=tf_data_format,
+ )
+ if data_format == "channels_first" and tf_data_format == "NHWC":
+ x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW
+ return x
-@keras_export('keras.backend.tanh')
+@keras_export("keras.backend.conv3d")
@tf.__internal__.dispatch.add_dispatch_support
@doc_controls.do_not_generate_docs
-def tanh(x):
- """Element-wise tanh.
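+# A minimal usage sketch for the `conv3d` below (values are illustrative):
+#
+#     x = tf.ones((2, 8, 8, 8, 3))   # NDHWC: batch, depth, height, width, channels
+#     k = tf.ones((3, 3, 3, 3, 16))  # (d, h, w, in_channels, out_channels)
+#     y = tf.keras.backend.conv3d(x, k, padding="same")
+#     # y.shape -> (2, 8, 8, 8, 16)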
+def conv3d( + x, + kernel, + strides=(1, 1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1, 1), +): + """3D convolution. - Args: - x: A tensor or variable. + Args: + x: Tensor or variable. + kernel: kernel tensor. + strides: strides tuple. + padding: string, `"same"` or `"valid"`. + data_format: string, `"channels_last"` or `"channels_first"`. + dilation_rate: tuple of 3 integers. - Returns: - A tensor. - """ - return tf.tanh(x) + Returns: + A tensor, result of 3D convolution. + Raises: + ValueError: if `data_format` is neither `channels_last` or + `channels_first`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + x, tf_data_format = _preprocess_conv3d_input(x, data_format) + padding = _preprocess_padding(padding) + x = tf.compat.v1.nn.convolution( + input=x, + filter=kernel, + dilation_rate=dilation_rate, + strides=strides, + padding=padding, + data_format=tf_data_format, + ) + if data_format == "channels_first" and tf_data_format == "NDHWC": + x = tf.compat.v1.transpose(x, (0, 4, 1, 2, 3)) + return x -@keras_export('keras.backend.dropout') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def dropout(x, level, noise_shape=None, seed=None): - """Sets entries in `x` to zero at random, while scaling the entire tensor. - Args: - x: tensor - level: fraction of the entries in the tensor - that will be set to 0. - noise_shape: shape for randomly generated keep/drop flags, - must be broadcastable to the shape of `x` - seed: random seed to ensure determinism. +def conv3d_transpose( + x, + kernel, + output_shape, + strides=(1, 1, 1), + padding="valid", + data_format=None, +): + """3D deconvolution (i.e. - Returns: - A tensor. - """ - if seed is None: - seed = np.random.randint(10e6) - return tf.nn.dropout(x, rate=level, noise_shape=noise_shape, seed=seed) + transposed convolution). + Args: + x: input tensor. + kernel: kernel tensor. + output_shape: 1D int tensor for the output shape. + strides: strides tuple. + padding: string, "same" or "valid". + data_format: string, `"channels_last"` or `"channels_first"`. -@keras_export('keras.backend.l2_normalize') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def l2_normalize(x, axis=None): - """Normalizes a tensor wrt the L2 norm alongside the specified axis. + Returns: + A tensor, result of transposed 3D convolution. - Args: - x: Tensor or variable. - axis: axis along which to perform normalization. + Raises: + ValueError: if `data_format` is neither `channels_last` or + `channels_first`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + if isinstance(output_shape, (tuple, list)): + output_shape = tf.stack(output_shape) + + x, tf_data_format = _preprocess_conv3d_input(x, data_format) + + if data_format == "channels_first" and tf_data_format == "NDHWC": + output_shape = ( + output_shape[0], + output_shape[2], + output_shape[3], + output_shape[4], + output_shape[1], + ) + if output_shape[0] is None: + output_shape = (tf.shape(x)[0],) + tuple(output_shape[1:]) + output_shape = tf.stack(list(output_shape)) + + padding = _preprocess_padding(padding) + if tf_data_format == "NDHWC": + strides = (1,) + strides + (1,) + else: + strides = (1, 1) + strides - Returns: - A tensor. 
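+ # (The `conv3d_transpose` call below expects `strides` with the same rank
+ # as the input, which is why 1s were just inserted at the batch and
+ # channel positions.)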
- """ - return tf.linalg.l2_normalize(x, axis=axis) + x = tf.compat.v1.nn.conv3d_transpose( + x, + kernel, + output_shape, + strides, + padding=padding, + data_format=tf_data_format, + ) + if data_format == "channels_first" and tf_data_format == "NDHWC": + x = tf.compat.v1.transpose(x, (0, 4, 1, 2, 3)) + return x -@keras_export('keras.backend.in_top_k') +@keras_export("keras.backend.pool2d") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs -def in_top_k(predictions, targets, k): - """Returns whether the `targets` are in the top `k` `predictions`. +def pool2d( + x, + pool_size, + strides=(1, 1), + padding="valid", + data_format=None, + pool_mode="max", +): + """2D Pooling. - Args: - predictions: A tensor of shape `(batch_size, classes)` and type `float32`. - targets: A 1D tensor of length `batch_size` and type `int32` or `int64`. - k: An `int`, number of top elements to consider. + Args: + x: Tensor or variable. + pool_size: tuple of 2 integers. + strides: tuple of 2 integers. + padding: string, `"same"` or `"valid"`. + data_format: string, `"channels_last"` or `"channels_first"`. + pool_mode: string, `"max"` or `"avg"`. - Returns: - A 1D tensor of length `batch_size` and type `bool`. - `output[i]` is `True` if `predictions[i, targets[i]]` is within top-`k` - values of `predictions[i]`. - """ - return tf.compat.v1.math.in_top_k(predictions, targets, k) + Returns: + A tensor, result of 2D pooling. + + Raises: + ValueError: if `data_format` is neither `"channels_last"` or + `"channels_first"`. + ValueError: if `pool_size` is not a tuple of 2 integers. + ValueError: if `strides` is not a tuple of 2 integers. + ValueError: if `pool_mode` is neither `"max"` or `"avg"`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + if len(pool_size) != 2: + raise ValueError("`pool_size` must be a tuple of 2 integers.") + if len(strides) != 2: + raise ValueError("`strides` must be a tuple of 2 integers.") + + x, tf_data_format = _preprocess_conv2d_input(x, data_format) + padding = _preprocess_padding(padding) + if tf_data_format == "NHWC": + strides = (1,) + strides + (1,) + pool_size = (1,) + pool_size + (1,) + else: + strides = (1, 1) + strides + pool_size = (1, 1) + pool_size + + if pool_mode == "max": + x = tf.compat.v1.nn.max_pool( + x, pool_size, strides, padding=padding, data_format=tf_data_format + ) + elif pool_mode == "avg": + x = tf.compat.v1.nn.avg_pool( + x, pool_size, strides, padding=padding, data_format=tf_data_format + ) + else: + raise ValueError("Invalid pooling mode: " + str(pool_mode)) + if data_format == "channels_first" and tf_data_format == "NHWC": + x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW + return x -# CONVOLUTIONS +@keras_export("keras.backend.pool3d") +@tf.__internal__.dispatch.add_dispatch_support +@doc_controls.do_not_generate_docs +def pool3d( + x, + pool_size, + strides=(1, 1, 1), + padding="valid", + data_format=None, + pool_mode="max", +): + """3D Pooling. -def _preprocess_conv1d_input(x, data_format): - """Transpose and cast the input before the conv1d. - - Args: - x: input tensor. - data_format: string, `"channels_last"` or `"channels_first"`. - - Returns: - A tensor. 
- """ - tf_data_format = 'NWC' # to pass TF Conv2dNative operations - if data_format == 'channels_first': - if not _has_nchw_support(): - x = tf.compat.v1.transpose(x, (0, 2, 1)) # NCW -> NWC - else: - tf_data_format = 'NCW' - return x, tf_data_format + Args: + x: Tensor or variable. + pool_size: tuple of 3 integers. + strides: tuple of 3 integers. + padding: string, `"same"` or `"valid"`. + data_format: string, `"channels_last"` or `"channels_first"`. + pool_mode: string, `"max"` or `"avg"`. + Returns: + A tensor, result of 3D pooling. -def _preprocess_conv2d_input(x, data_format, force_transpose=False): - """Transpose and cast the input before the conv2d. - - Args: - x: input tensor. - data_format: string, `"channels_last"` or `"channels_first"`. - force_transpose: Boolean. If True, the input will always be transposed - from NCHW to NHWC if `data_format` is `"channels_first"`. - If False, the transposition only occurs on CPU (GPU ops are - assumed to support NCHW). - - Returns: - A tensor. - """ - tf_data_format = 'NHWC' - if data_format == 'channels_first': - if not _has_nchw_support() or force_transpose: - x = tf.compat.v1.transpose(x, (0, 2, 3, 1)) # NCHW -> NHWC + Raises: + ValueError: if `data_format` is neither `"channels_last"` or + `"channels_first"`. + ValueError: if `pool_mode` is neither `"max"` or `"avg"`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + x, tf_data_format = _preprocess_conv3d_input(x, data_format) + padding = _preprocess_padding(padding) + if tf_data_format == "NDHWC": + strides = (1,) + strides + (1,) + pool_size = (1,) + pool_size + (1,) else: - tf_data_format = 'NCHW' - return x, tf_data_format - - -def _preprocess_conv3d_input(x, data_format): - """Transpose and cast the input before the conv3d. - - Args: - x: input tensor. - data_format: string, `"channels_last"` or `"channels_first"`. - - Returns: - A tensor. - """ - tf_data_format = 'NDHWC' - if data_format == 'channels_first': - if not _has_nchw_support(): - x = tf.compat.v1.transpose(x, (0, 2, 3, 4, 1)) + strides = (1, 1) + strides + pool_size = (1, 1) + pool_size + + if pool_mode == "max": + x = tf.nn.max_pool3d( + x, pool_size, strides, padding=padding, data_format=tf_data_format + ) + elif pool_mode == "avg": + x = tf.nn.avg_pool3d( + x, pool_size, strides, padding=padding, data_format=tf_data_format + ) else: - tf_data_format = 'NCDHW' - return x, tf_data_format + raise ValueError("Invalid pooling mode: " + str(pool_mode)) + if data_format == "channels_first" and tf_data_format == "NDHWC": + x = tf.compat.v1.transpose(x, (0, 4, 1, 2, 3)) + return x -def _preprocess_padding(padding): - """Convert keras' padding to TensorFlow's padding. - Args: - padding: string, one of 'same' , 'valid' +def local_conv( + inputs, kernel, kernel_size, strides, output_shape, data_format=None +): + """Apply N-D convolution with un-shared weights. - Returns: - a string, one of 'SAME', 'VALID'. + Args: + inputs: (N+2)-D tensor with shape + (batch_size, channels_in, d_in1, ..., d_inN) + if data_format='channels_first', or + (batch_size, d_in1, ..., d_inN, channels_in) + if data_format='channels_last'. + kernel: the unshared weight for N-D convolution, + with shape (output_items, feature_dim, channels_out), where + feature_dim = np.prod(kernel_size) * channels_in, + output_items = np.prod(output_shape). 
+ kernel_size: a tuple of N integers, specifying the + spatial dimensions of the N-D convolution window. + strides: a tuple of N integers, specifying the strides + of the convolution along the spatial dimensions. + output_shape: a tuple of (d_out1, ..., d_outN) specifying the spatial + dimensionality of the output. + data_format: string, "channels_first" or "channels_last". - Raises: - ValueError: if invalid `padding'` - """ - if padding == 'same': - padding = 'SAME' - elif padding == 'valid': - padding = 'VALID' - else: - raise ValueError('Invalid padding: ' + str(padding)) - return padding + Returns: + An (N+2)-D tensor with shape: + (batch_size, channels_out) + output_shape + if data_format='channels_first', or: + (batch_size,) + output_shape + (channels_out,) + if data_format='channels_last'. + + Raises: + ValueError: if `data_format` is neither + `channels_last` nor `channels_first`. + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + + kernel_shape = int_shape(kernel) + feature_dim = kernel_shape[1] + channels_out = kernel_shape[-1] + ndims = len(output_shape) + spatial_dimensions = list(range(ndims)) + + xs = [] + output_axes_ticks = [range(axis_max) for axis_max in output_shape] + for position in itertools.product(*output_axes_ticks): + slices = [slice(None)] + + if data_format == "channels_first": + slices.append(slice(None)) + + slices.extend( + slice( + position[d] * strides[d], + position[d] * strides[d] + kernel_size[d], + ) + for d in spatial_dimensions + ) + + if data_format == "channels_last": + slices.append(slice(None)) + + xs.append(reshape(inputs[slices], (1, -1, feature_dim))) + + x_aggregate = concatenate(xs, axis=0) + output = batch_dot(x_aggregate, kernel) + output = reshape(output, output_shape + (-1, channels_out)) + + if data_format == "channels_first": + permutation = [ndims, ndims + 1] + spatial_dimensions + else: + permutation = [ndims] + spatial_dimensions + [ndims + 1] + return permute_dimensions(output, permutation) -@keras_export('keras.backend.conv1d') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def conv1d(x, - kernel, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1): - """1D convolution. - - Args: - x: Tensor or variable. - kernel: kernel tensor. - strides: stride integer. - padding: string, `"same"`, `"causal"` or `"valid"`. - data_format: string, one of "channels_last", "channels_first". - dilation_rate: integer dilate rate. - - Returns: - A tensor, result of 1D convolution. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. 
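+# A shape sketch for `local_conv` above (illustrative; `backend` is an
+# assumed alias for this module). In 1D with channels_last, 4 output
+# positions, kernel_size (3,) and strides (1,), an input with 2 channels
+# needs an unshared kernel of shape (4, 3 * 2, 5) for 5 output channels:
+#
+#     inputs = tf.ones((8, 6, 2))  # (batch, d_in1, channels_in)
+#     kernel = tf.ones((4, 6, 5))  # (output_items, feature_dim, channels_out)
+#     out = backend.local_conv(inputs, kernel, (3,), (1,), (4,), "channels_last")
+#     # out.shape -> (8, 4, 5)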
- """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - kernel_shape = kernel.shape.as_list() - if padding == 'causal': - # causal (dilated) convolution: - left_pad = dilation_rate * (kernel_shape[0] - 1) - x = temporal_padding(x, (left_pad, 0)) - padding = 'valid' - padding = _preprocess_padding(padding) - - x, tf_data_format = _preprocess_conv1d_input(x, data_format) - x = tf.compat.v1.nn.convolution( - input=x, - filter=kernel, - dilation_rate=dilation_rate, - strides=strides, - padding=padding, - data_format=tf_data_format) - if data_format == 'channels_first' and tf_data_format == 'NWC': - x = tf.compat.v1.transpose(x, (0, 2, 1)) # NWC -> NCW - return x - - -@keras_export('keras.backend.conv2d') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def conv2d(x, - kernel, - strides=(1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1)): - """2D convolution. - - Args: - x: Tensor or variable. - kernel: kernel tensor. - strides: strides tuple. - padding: string, `"same"` or `"valid"`. - data_format: `"channels_last"` or `"channels_first"`. - dilation_rate: tuple of 2 integers. - - Returns: - A tensor, result of 2D convolution. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - x, tf_data_format = _preprocess_conv2d_input(x, data_format) - padding = _preprocess_padding(padding) - x = tf.compat.v1.nn.convolution( - input=x, - filter=kernel, - dilation_rate=dilation_rate, - strides=strides, - padding=padding, - data_format=tf_data_format) - if data_format == 'channels_first' and tf_data_format == 'NHWC': - x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW - return x - - -@keras_export('keras.backend.conv2d_transpose') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def conv2d_transpose(x, - kernel, - output_shape, - strides=(1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1)): - """2D deconvolution (i.e. - - transposed convolution). - - Args: - x: Tensor or variable. - kernel: kernel tensor. - output_shape: 1D int tensor for the output shape. - strides: strides tuple. - padding: string, `"same"` or `"valid"`. - data_format: string, `"channels_last"` or `"channels_first"`. - dilation_rate: Tuple of 2 integers. - - Returns: - A tensor, result of transposed 2D convolution. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - # `atrous_conv2d_transpose` only supports NHWC format, even on GPU. 
- if data_format == 'channels_first' and dilation_rate != (1, 1): - force_transpose = True - else: - force_transpose = False - - x, tf_data_format = _preprocess_conv2d_input(x, data_format, force_transpose) - - if data_format == 'channels_first' and tf_data_format == 'NHWC': - output_shape = (output_shape[0], output_shape[2], output_shape[3], - output_shape[1]) - if output_shape[0] is None: - output_shape = (shape(x)[0],) + tuple(output_shape[1:]) - - if isinstance(output_shape, (tuple, list)): - output_shape = tf.stack(list(output_shape)) - - padding = _preprocess_padding(padding) - if tf_data_format == 'NHWC': - strides = (1,) + strides + (1,) - else: - strides = (1, 1) + strides - - if dilation_rate == (1, 1): - x = tf.compat.v1.nn.conv2d_transpose(x, kernel, output_shape, strides, - padding=padding, - data_format=tf_data_format) - else: - if dilation_rate[0] != dilation_rate[1]: - raise ValueError( - 'Expected the 2 dimensions of the `dilation_rate` argument ' - 'to be equal to each other. ' - f'Received: dilation_rate={dilation_rate}' - ) - x = tf.nn.atrous_conv2d_transpose( - x, - kernel, - output_shape, - rate=dilation_rate[0], - padding=padding) - if data_format == 'channels_first' and tf_data_format == 'NHWC': - x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW - return x - - -def separable_conv1d(x, - depthwise_kernel, - pointwise_kernel, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1): - """1D convolution with separable filters. - - Args: - x: input tensor - depthwise_kernel: convolution kernel for the depthwise convolution. - pointwise_kernel: kernel for the 1x1 convolution. - strides: stride integer. - padding: string, `"same"` or `"valid"`. - data_format: string, `"channels_last"` or `"channels_first"`. - dilation_rate: integer dilation rate. - - Returns: - Output tensor. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - if isinstance(strides, int): - strides = (strides,) - if isinstance(dilation_rate, int): - dilation_rate = (dilation_rate,) - - x, tf_data_format = _preprocess_conv1d_input(x, data_format) - padding = _preprocess_padding(padding) - if not isinstance(strides, tuple): - strides = tuple(strides) - if tf_data_format == 'NWC': - spatial_start_dim = 1 - strides = (1,) + strides * 2 + (1,) - else: - spatial_start_dim = 2 - strides = (1, 1) + strides * 2 - x = tf.expand_dims(x, spatial_start_dim) - depthwise_kernel = tf.expand_dims(depthwise_kernel, 0) - pointwise_kernel = tf.expand_dims(pointwise_kernel, 0) - dilation_rate = (1,) + dilation_rate - - x = tf.compat.v1.nn.separable_conv2d( - x, - depthwise_kernel, - pointwise_kernel, - strides=strides, - padding=padding, - rate=dilation_rate, - data_format=tf_data_format) - - x = tf.squeeze(x, [spatial_start_dim]) - - if data_format == 'channels_first' and tf_data_format == 'NWC': - x = tf.compat.v1.transpose(x, (0, 2, 1)) # NWC -> NCW - - return x - - -@keras_export('keras.backend.separable_conv2d') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def separable_conv2d(x, - depthwise_kernel, - pointwise_kernel, - strides=(1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1)): - """2D convolution with separable filters. 
- - Args: - x: input tensor - depthwise_kernel: convolution kernel for the depthwise convolution. - pointwise_kernel: kernel for the 1x1 convolution. - strides: strides tuple (length 2). - padding: string, `"same"` or `"valid"`. - data_format: string, `"channels_last"` or `"channels_first"`. - dilation_rate: tuple of integers, - dilation rates for the separable convolution. - - Returns: - Output tensor. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. - ValueError: if `strides` is not a tuple of 2 integers. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - if len(strides) != 2: - raise ValueError('`strides` must be a tuple of 2 integers.') - - x, tf_data_format = _preprocess_conv2d_input(x, data_format) - padding = _preprocess_padding(padding) - if not isinstance(strides, tuple): - strides = tuple(strides) - if tf_data_format == 'NHWC': - strides = (1,) + strides + (1,) - else: - strides = (1, 1) + strides - - x = tf.compat.v1.nn.separable_conv2d( - x, - depthwise_kernel, - pointwise_kernel, - strides=strides, - padding=padding, - rate=dilation_rate, - data_format=tf_data_format) - if data_format == 'channels_first' and tf_data_format == 'NHWC': - x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW - return x - - -@keras_export('keras.backend.depthwise_conv2d') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def depthwise_conv2d(x, - depthwise_kernel, - strides=(1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1)): - """2D convolution with separable filters. - - Args: - x: input tensor - depthwise_kernel: convolution kernel for the depthwise convolution. - strides: strides tuple (length 2). - padding: string, `"same"` or `"valid"`. - data_format: string, `"channels_last"` or `"channels_first"`. - dilation_rate: tuple of integers, - dilation rates for the separable convolution. - - Returns: - Output tensor. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - x, tf_data_format = _preprocess_conv2d_input(x, data_format) - padding = _preprocess_padding(padding) - if tf_data_format == 'NHWC': - strides = (1,) + strides + (1,) - else: - strides = (1, 1) + strides - - x = tf.compat.v1.nn.depthwise_conv2d( - x, - depthwise_kernel, - strides=strides, - padding=padding, - rate=dilation_rate, - data_format=tf_data_format) - if data_format == 'channels_first' and tf_data_format == 'NHWC': - x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW - return x - - -@keras_export('keras.backend.conv3d') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def conv3d(x, - kernel, - strides=(1, 1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1, 1)): - """3D convolution. - - Args: - x: Tensor or variable. - kernel: kernel tensor. - strides: strides tuple. - padding: string, `"same"` or `"valid"`. - data_format: string, `"channels_last"` or `"channels_first"`. - dilation_rate: tuple of 3 integers. - - Returns: - A tensor, result of 3D convolution. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. 
- """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - x, tf_data_format = _preprocess_conv3d_input(x, data_format) - padding = _preprocess_padding(padding) - x = tf.compat.v1.nn.convolution( - input=x, - filter=kernel, - dilation_rate=dilation_rate, - strides=strides, - padding=padding, - data_format=tf_data_format) - if data_format == 'channels_first' and tf_data_format == 'NDHWC': - x = tf.compat.v1.transpose(x, (0, 4, 1, 2, 3)) - return x - - -def conv3d_transpose(x, - kernel, - output_shape, - strides=(1, 1, 1), - padding='valid', - data_format=None): - """3D deconvolution (i.e. - - transposed convolution). - - Args: - x: input tensor. - kernel: kernel tensor. - output_shape: 1D int tensor for the output shape. - strides: strides tuple. - padding: string, "same" or "valid". - data_format: string, `"channels_last"` or `"channels_first"`. - - Returns: - A tensor, result of transposed 3D convolution. - - Raises: - ValueError: if `data_format` is neither `channels_last` or - `channels_first`. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - if isinstance(output_shape, (tuple, list)): - output_shape = tf.stack(output_shape) - - x, tf_data_format = _preprocess_conv3d_input(x, data_format) - - if data_format == 'channels_first' and tf_data_format == 'NDHWC': - output_shape = (output_shape[0], output_shape[2], output_shape[3], - output_shape[4], output_shape[1]) - if output_shape[0] is None: - output_shape = (tf.shape(x)[0],) + tuple(output_shape[1:]) - output_shape = tf.stack(list(output_shape)) - - padding = _preprocess_padding(padding) - if tf_data_format == 'NDHWC': - strides = (1,) + strides + (1,) - else: - strides = (1, 1) + strides - - x = tf.compat.v1.nn.conv3d_transpose( - x, - kernel, - output_shape, - strides, - padding=padding, - data_format=tf_data_format) - if data_format == 'channels_first' and tf_data_format == 'NDHWC': - x = tf.compat.v1.transpose(x, (0, 4, 1, 2, 3)) - return x - - -@keras_export('keras.backend.pool2d') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def pool2d(x, - pool_size, - strides=(1, 1), - padding='valid', - data_format=None, - pool_mode='max'): - """2D Pooling. - - Args: - x: Tensor or variable. - pool_size: tuple of 2 integers. - strides: tuple of 2 integers. - padding: string, `"same"` or `"valid"`. - data_format: string, `"channels_last"` or `"channels_first"`. - pool_mode: string, `"max"` or `"avg"`. - - Returns: - A tensor, result of 2D pooling. - - Raises: - ValueError: if `data_format` is neither `"channels_last"` or - `"channels_first"`. - ValueError: if `pool_size` is not a tuple of 2 integers. - ValueError: if `strides` is not a tuple of 2 integers. - ValueError: if `pool_mode` is neither `"max"` or `"avg"`. 
- """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - if len(pool_size) != 2: - raise ValueError('`pool_size` must be a tuple of 2 integers.') - if len(strides) != 2: - raise ValueError('`strides` must be a tuple of 2 integers.') - - x, tf_data_format = _preprocess_conv2d_input(x, data_format) - padding = _preprocess_padding(padding) - if tf_data_format == 'NHWC': - strides = (1,) + strides + (1,) - pool_size = (1,) + pool_size + (1,) - else: - strides = (1, 1) + strides - pool_size = (1, 1) + pool_size - - if pool_mode == 'max': - x = tf.compat.v1.nn.max_pool( - x, pool_size, strides, padding=padding, data_format=tf_data_format) - elif pool_mode == 'avg': - x = tf.compat.v1.nn.avg_pool( - x, pool_size, strides, padding=padding, data_format=tf_data_format) - else: - raise ValueError('Invalid pooling mode: ' + str(pool_mode)) - - if data_format == 'channels_first' and tf_data_format == 'NHWC': - x = tf.compat.v1.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW - return x - - -@keras_export('keras.backend.pool3d') -@tf.__internal__.dispatch.add_dispatch_support -@doc_controls.do_not_generate_docs -def pool3d(x, - pool_size, - strides=(1, 1, 1), - padding='valid', - data_format=None, - pool_mode='max'): - """3D Pooling. - - Args: - x: Tensor or variable. - pool_size: tuple of 3 integers. - strides: tuple of 3 integers. - padding: string, `"same"` or `"valid"`. - data_format: string, `"channels_last"` or `"channels_first"`. - pool_mode: string, `"max"` or `"avg"`. - - Returns: - A tensor, result of 3D pooling. - - Raises: - ValueError: if `data_format` is neither `"channels_last"` or - `"channels_first"`. - ValueError: if `pool_mode` is neither `"max"` or `"avg"`. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - x, tf_data_format = _preprocess_conv3d_input(x, data_format) - padding = _preprocess_padding(padding) - if tf_data_format == 'NDHWC': - strides = (1,) + strides + (1,) - pool_size = (1,) + pool_size + (1,) - else: - strides = (1, 1) + strides - pool_size = (1, 1) + pool_size - - if pool_mode == 'max': - x = tf.nn.max_pool3d( - x, pool_size, strides, padding=padding, data_format=tf_data_format) - elif pool_mode == 'avg': - x = tf.nn.avg_pool3d( - x, pool_size, strides, padding=padding, data_format=tf_data_format) - else: - raise ValueError('Invalid pooling mode: ' + str(pool_mode)) - - if data_format == 'channels_first' and tf_data_format == 'NDHWC': - x = tf.compat.v1.transpose(x, (0, 4, 1, 2, 3)) - return x - - -def local_conv(inputs, - kernel, - kernel_size, - strides, - output_shape, - data_format=None): - """Apply N-D convolution with un-shared weights. - - Args: - inputs: (N+2)-D tensor with shape - (batch_size, channels_in, d_in1, ..., d_inN) - if data_format='channels_first', or - (batch_size, d_in1, ..., d_inN, channels_in) - if data_format='channels_last'. - kernel: the unshared weight for N-D convolution, - with shape (output_items, feature_dim, channels_out), where - feature_dim = np.prod(kernel_size) * channels_in, - output_items = np.prod(output_shape). - kernel_size: a tuple of N integers, specifying the - spatial dimensions of the N-D convolution window. - strides: a tuple of N integers, specifying the strides - of the convolution along the spatial dimensions. 
- output_shape: a tuple of (d_out1, ..., d_outN) specifying the spatial - dimensionality of the output. - data_format: string, "channels_first" or "channels_last". - - Returns: - An (N+2)-D tensor with shape: - (batch_size, channels_out) + output_shape - if data_format='channels_first', or: - (batch_size,) + output_shape + (channels_out,) - if data_format='channels_last'. - - Raises: - ValueError: if `data_format` is neither - `channels_last` nor `channels_first`. - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - kernel_shape = int_shape(kernel) - feature_dim = kernel_shape[1] - channels_out = kernel_shape[-1] - ndims = len(output_shape) - spatial_dimensions = list(range(ndims)) - - xs = [] - output_axes_ticks = [range(axis_max) for axis_max in output_shape] - for position in itertools.product(*output_axes_ticks): - slices = [slice(None)] - - if data_format == 'channels_first': - slices.append(slice(None)) - - slices.extend( - slice(position[d] * strides[d], position[d] * strides[d] + - kernel_size[d]) for d in spatial_dimensions) - - if data_format == 'channels_last': - slices.append(slice(None)) - - xs.append(reshape(inputs[slices], (1, -1, feature_dim))) - - x_aggregate = concatenate(xs, axis=0) - output = batch_dot(x_aggregate, kernel) - output = reshape(output, output_shape + (-1, channels_out)) - - if data_format == 'channels_first': - permutation = [ndims, ndims + 1] + spatial_dimensions - else: - permutation = [ndims] + spatial_dimensions + [ndims + 1] - - return permute_dimensions(output, permutation) - - -@keras_export('keras.backend.local_conv1d') + +@keras_export("keras.backend.local_conv1d") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): - """Apply 1D conv with un-shared weights. - - Args: - inputs: 3D tensor with shape: - (batch_size, steps, input_dim) - if data_format is "channels_last" or - (batch_size, input_dim, steps) - if data_format is "channels_first". - kernel: the unshared weight for convolution, - with shape (output_length, feature_dim, filters). - kernel_size: a tuple of a single integer, - specifying the length of the 1D convolution window. - strides: a tuple of a single integer, - specifying the stride length of the convolution. - data_format: the data format, channels_first or channels_last. - - Returns: - A 3d tensor with shape: - (batch_size, output_length, filters) - if data_format='channels_first' - or 3D tensor with shape: - (batch_size, filters, output_length) - if data_format='channels_last'. - """ - output_shape = (kernel.shape[0],) - return local_conv(inputs, - kernel, - kernel_size, - strides, - output_shape, - data_format) - - -@keras_export('keras.backend.local_conv2d') + """Apply 1D conv with un-shared weights. + + Args: + inputs: 3D tensor with shape: + (batch_size, steps, input_dim) + if data_format is "channels_last" or + (batch_size, input_dim, steps) + if data_format is "channels_first". + kernel: the unshared weight for convolution, + with shape (output_length, feature_dim, filters). + kernel_size: a tuple of a single integer, + specifying the length of the 1D convolution window. + strides: a tuple of a single integer, + specifying the stride length of the convolution. + data_format: the data format, channels_first or channels_last. 
+
+ Returns:
+ A 3D tensor with shape:
+ (batch_size, output_length, filters)
+ if data_format='channels_last'
+ or 3D tensor with shape:
+ (batch_size, filters, output_length)
+ if data_format='channels_first'.
+ """
+ output_shape = (kernel.shape[0],)
+ return local_conv(
+ inputs, kernel, kernel_size, strides, output_shape, data_format
+ )
+
+
+@keras_export("keras.backend.local_conv2d")
@tf.__internal__.dispatch.add_dispatch_support
@doc_controls.do_not_generate_docs
-def local_conv2d(inputs,
- kernel,
- kernel_size,
- strides,
- output_shape,
- data_format=None):
- """Apply 2D conv with un-shared weights.
-
- Args:
- inputs: 4D tensor with shape:
- (batch_size, filters, new_rows, new_cols)
- if data_format='channels_first'
- or 4D tensor with shape:
- (batch_size, new_rows, new_cols, filters)
- if data_format='channels_last'.
- kernel: the unshared weight for convolution,
- with shape (output_items, feature_dim, filters).
- kernel_size: a tuple of 2 integers, specifying the
- width and height of the 2D convolution window.
- strides: a tuple of 2 integers, specifying the strides
- of the convolution along the width and height.
- output_shape: a tuple with (output_row, output_col).
- data_format: the data format, channels_first or channels_last.
-
- Returns:
- A 4D tensor with shape:
- (batch_size, filters, new_rows, new_cols)
- if data_format='channels_first'
- or 4D tensor with shape:
- (batch_size, new_rows, new_cols, filters)
- if data_format='channels_last'.
- """
- return local_conv(inputs,
- kernel,
- kernel_size,
- strides,
- output_shape,
- data_format)
-
-
-@keras_export('keras.backend.bias_add')
+def local_conv2d(
+ inputs, kernel, kernel_size, strides, output_shape, data_format=None
+):
+ """Apply 2D conv with un-shared weights.
+
+ Args:
+ inputs: 4D tensor with shape:
+ (batch_size, filters, new_rows, new_cols)
+ if data_format='channels_first'
+ or 4D tensor with shape:
+ (batch_size, new_rows, new_cols, filters)
+ if data_format='channels_last'.
+ kernel: the unshared weight for convolution,
+ with shape (output_items, feature_dim, filters).
+ kernel_size: a tuple of 2 integers, specifying the
+ width and height of the 2D convolution window.
+ strides: a tuple of 2 integers, specifying the strides
+ of the convolution along the width and height.
+ output_shape: a tuple with (output_row, output_col).
+ data_format: the data format, channels_first or channels_last.
+
+ Returns:
+ A 4D tensor with shape:
+ (batch_size, filters, new_rows, new_cols)
+ if data_format='channels_first'
+ or 4D tensor with shape:
+ (batch_size, new_rows, new_cols, filters)
+ if data_format='channels_last'.
+ """
+ return local_conv(
+ inputs, kernel, kernel_size, strides, output_shape, data_format
+ )
+
+
+@keras_export("keras.backend.bias_add")
@tf.__internal__.dispatch.add_dispatch_support
@doc_controls.do_not_generate_docs
def bias_add(x, bias, data_format=None):
- """Adds a bias vector to a tensor.
-
- Args:
- x: Tensor or variable.
- bias: Bias tensor to add.
- data_format: string, `"channels_last"` or `"channels_first"`.
-
- Returns:
- Output tensor.
-
- Raises:
- ValueError: In one of the two cases below:
- 1. invalid `data_format` argument.
- 2. invalid bias shape.
- the bias should be either a vector or - a tensor with ndim(x) - 1 dimension - """ - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - bias_shape = int_shape(bias) - if len(bias_shape) != 1 and len(bias_shape) != ndim(x) - 1: - raise ValueError( - 'Unexpected bias dimensions %d, expect to be 1 or %d dimensions' % - (len(bias_shape), ndim(x) - 1)) - - if len(bias_shape) == 1: - if data_format == 'channels_first': - return tf.nn.bias_add(x, bias, data_format='NCHW') - return tf.nn.bias_add(x, bias, data_format='NHWC') - if ndim(x) in (3, 4, 5): - if data_format == 'channels_first': - bias_reshape_axis = (1, bias_shape[-1]) + bias_shape[:-1] - return x + reshape(bias, bias_reshape_axis) - return x + reshape(bias, (1,) + bias_shape) - return tf.nn.bias_add(x, bias) + """Adds a bias vector to a tensor. + + Args: + x: Tensor or variable. + bias: Bias tensor to add. + data_format: string, `"channels_last"` or `"channels_first"`. + + Returns: + Output tensor. + + Raises: + ValueError: In one of the two cases below: + 1. invalid `data_format` argument. + 2. invalid bias shape. + the bias should be either a vector or + a tensor with ndim(x) - 1 dimension + """ + if data_format is None: + data_format = image_data_format() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError("Unknown data_format: " + str(data_format)) + bias_shape = int_shape(bias) + if len(bias_shape) != 1 and len(bias_shape) != ndim(x) - 1: + raise ValueError( + "Unexpected bias dimensions %d, expect to be 1 or %d dimensions" + % (len(bias_shape), ndim(x) - 1) + ) + + if len(bias_shape) == 1: + if data_format == "channels_first": + return tf.nn.bias_add(x, bias, data_format="NCHW") + return tf.nn.bias_add(x, bias, data_format="NHWC") + if ndim(x) in (3, 4, 5): + if data_format == "channels_first": + bias_reshape_axis = (1, bias_shape[-1]) + bias_shape[:-1] + return x + reshape(bias, bias_reshape_axis) + return x + reshape(bias, (1,) + bias_shape) + return tf.nn.bias_add(x, bias) # RANDOMNESS -@keras_export('keras.backend.random_normal') +@keras_export("keras.backend.random_normal") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): - """Returns a tensor with normal distribution of values. - - It is an alias to `tf.random.normal`. - - Args: - shape: A tuple of integers, the shape of tensor to create. - mean: A float, the mean value of the normal distribution to draw samples. - Default to 0.0. - stddev: A float, the standard deviation of the normal distribution - to draw samples. Default to 1.0. - dtype: `tf.dtypes.DType`, dtype of returned tensor. Default to use Keras - backend dtype which is float32. - seed: Integer, random seed. Will use a random numpy integer when not - specified. - - Returns: - A tensor with normal distribution of values. - - Example: - - >>> random_normal_tensor = tf.keras.backend.random_normal(shape=(2,3), - ... mean=0.0, stddev=1.0) - >>> random_normal_tensor - - """ - if dtype is None: - dtype = floatx() - if seed is None: - seed = np.random.randint(10e6) - return tf.random.normal( - shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed) - - -@keras_export('keras.backend.random_uniform') + """Returns a tensor with normal distribution of values. + + It is an alias to `tf.random.normal`. 
+ + Args: + shape: A tuple of integers, the shape of tensor to create. + mean: A float, the mean value of the normal distribution to draw + samples. Defaults to `0.0`. + stddev: A float, the standard deviation of the normal distribution + to draw samples. Defaults to `1.0`. + dtype: `tf.dtypes.DType`, dtype of returned tensor. None uses Keras + backend dtype which is float32. Defaults to `None`. + seed: Integer, random seed. Will use a random numpy integer when not + specified. + + Returns: + A tensor with normal distribution of values. + + Example: + + >>> random_normal_tensor = tf.keras.backend.random_normal(shape=(2,3), + ... mean=0.0, stddev=1.0) + >>> random_normal_tensor + + """ + if dtype is None: + dtype = floatx() + if seed is None: + seed = np.random.randint(10e6) + return tf.random.normal( + shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed + ) + + +@keras_export("keras.backend.random_uniform") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None): - """Returns a tensor with uniform distribution of values. - - Args: - shape: A tuple of integers, the shape of tensor to create. - minval: A float, lower boundary of the uniform distribution - to draw samples. - maxval: A float, upper boundary of the uniform distribution - to draw samples. - dtype: String, dtype of returned tensor. - seed: Integer, random seed. - - Returns: - A tensor. - - Example: - - >>> random_uniform_tensor = tf.keras.backend.random_uniform(shape=(2,3), - ... minval=0.0, maxval=1.0) - >>> random_uniform_tensor - - """ - if dtype is None: - dtype = floatx() - if seed is None: - seed = np.random.randint(10e6) - return tf.random.uniform( - shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed) - - -@keras_export('keras.backend.random_binomial') + """Returns a tensor with uniform distribution of values. + + Args: + shape: A tuple of integers, the shape of tensor to create. + minval: A float, lower boundary of the uniform distribution + to draw samples. + maxval: A float, upper boundary of the uniform distribution + to draw samples. + dtype: String, dtype of returned tensor. + seed: Integer, random seed. + + Returns: + A tensor. + + Example: + + >>> random_uniform_tensor = tf.keras.backend.random_uniform(shape=(2,3), + ... minval=0.0, maxval=1.0) + >>> random_uniform_tensor + + """ + if dtype is None: + dtype = floatx() + if seed is None: + seed = np.random.randint(10e6) + return tf.random.uniform( + shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed + ) + + +@keras_export("keras.backend.random_binomial") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def random_binomial(shape, p=0.0, dtype=None, seed=None): - """Returns a tensor with random binomial distribution of values. + """Returns a tensor with random binomial distribution of values. - DEPRECATED, use `tf.keras.backend.random_bernoulli` instead. + DEPRECATED, use `tf.keras.backend.random_bernoulli` instead. - The binomial distribution with parameters `n` and `p` is the probability - distribution of the number of successful Bernoulli process. Only supports - `n` = 1 for now. + The binomial distribution with parameters `n` and `p` is the probability + distribution of the number of successful Bernoulli process. Only supports + `n` = 1 for now. - Args: - shape: A tuple of integers, the shape of tensor to create. - p: A float, `0. <= p <= 1`, probability of binomial distribution. 
- dtype: String, dtype of returned tensor. - seed: Integer, random seed. + Args: + shape: A tuple of integers, the shape of tensor to create. + p: A float, `0. <= p <= 1`, probability of binomial distribution. + dtype: String, dtype of returned tensor. + seed: Integer, random seed. - Returns: - A tensor. + Returns: + A tensor. - Example: + Example: - >>> random_binomial_tensor = tf.keras.backend.random_binomial(shape=(2,3), - ... p=0.5) - >>> random_binomial_tensor - - """ - warnings.warn( - '`tf.keras.backend.random_binomial` is deprecated, ' - 'and will be removed in a future version.' - 'Please use `tf.keras.backend.random_bernoulli` instead.', - stacklevel=2) - return random_bernoulli(shape, p, dtype, seed) + >>> random_binomial_tensor = tf.keras.backend.random_binomial(shape=(2,3), + ... p=0.5) + >>> random_binomial_tensor + + """ + warnings.warn( + "`tf.keras.backend.random_binomial` is deprecated, " + "and will be removed in a future version." + "Please use `tf.keras.backend.random_bernoulli` instead.", + stacklevel=2, + ) + return random_bernoulli(shape, p, dtype, seed) -@keras_export('keras.backend.random_bernoulli') +@keras_export("keras.backend.random_bernoulli") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def random_bernoulli(shape, p=0.0, dtype=None, seed=None): - """Returns a tensor with random bernoulli distribution of values. - - Args: - shape: A tuple of integers, the shape of tensor to create. - p: A float, `0. <= p <= 1`, probability of bernoulli distribution. - dtype: String, dtype of returned tensor. - seed: Integer, random seed. - - Returns: - A tensor. - """ - if dtype is None: - dtype = floatx() - if seed is None: - seed = np.random.randint(10e6) - return tf.where( - tf.random.uniform(shape, dtype=dtype, seed=seed) <= p, - tf.ones(shape, dtype=dtype), tf.zeros(shape, dtype=dtype)) - - -@keras_export('keras.backend.truncated_normal') + """Returns a tensor with random bernoulli distribution of values. + + Args: + shape: A tuple of integers, the shape of tensor to create. + p: A float, `0. <= p <= 1`, probability of bernoulli distribution. + dtype: String, dtype of returned tensor. + seed: Integer, random seed. + + Returns: + A tensor. + """ + if dtype is None: + dtype = floatx() + if seed is None: + seed = np.random.randint(10e6) + return tf.where( + tf.random.uniform(shape, dtype=dtype, seed=seed) <= p, + tf.ones(shape, dtype=dtype), + tf.zeros(shape, dtype=dtype), + ) + + +@keras_export("keras.backend.truncated_normal") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): - """Returns a tensor with truncated random normal distribution of values. - - The generated values follow a normal distribution - with specified mean and standard deviation, - except that values whose magnitude is more than - two standard deviations from the mean are dropped and re-picked. - - Args: - shape: A tuple of integers, the shape of tensor to create. - mean: Mean of the values. - stddev: Standard deviation of the values. - dtype: String, dtype of returned tensor. - seed: Integer, random seed. - - Returns: - A tensor. - """ - if dtype is None: - dtype = floatx() - if seed is None: - seed = np.random.randint(10e6) - return tf.random.truncated_normal( - shape, mean, stddev, dtype=dtype, seed=seed) + """Returns a tensor with truncated random normal distribution of values. 
+ + The generated values follow a normal distribution + with specified mean and standard deviation, + except that values whose magnitude is more than + two standard deviations from the mean are dropped and re-picked. + + Args: + shape: A tuple of integers, the shape of tensor to create. + mean: Mean of the values. + stddev: Standard deviation of the values. + dtype: String, dtype of returned tensor. + seed: Integer, random seed. + + Returns: + A tensor. + """ + if dtype is None: + dtype = floatx() + if seed is None: + seed = np.random.randint(10e6) + return tf.random.truncated_normal( + shape, mean, stddev, dtype=dtype, seed=seed + ) # CTC @@ -6661,472 +7084,484 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): # in TensorFlow's CTC implementation -@keras_export('keras.backend.ctc_label_dense_to_sparse') +@keras_export("keras.backend.ctc_label_dense_to_sparse") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def ctc_label_dense_to_sparse(labels, label_lengths): - """Converts CTC labels from dense to sparse. - - Args: - labels: dense CTC labels. - label_lengths: length of the labels. - - Returns: - A sparse tensor representation of the labels. - """ - label_shape = tf.shape(labels) - num_batches_tns = tf.stack([label_shape[0]]) - max_num_labels_tns = tf.stack([label_shape[1]]) + """Converts CTC labels from dense to sparse. - def range_less_than(old_input, current_input): - return tf.expand_dims( - tf.range(tf.shape(old_input)[1]), 0) < tf.fill( - max_num_labels_tns, current_input) + Args: + labels: dense CTC labels. + label_lengths: length of the labels. - init = tf.cast( - tf.fill([1, label_shape[1]], 0), tf.bool) - dense_mask = tf.compat.v1.scan( - range_less_than, label_lengths, initializer=init, parallel_iterations=1) - dense_mask = dense_mask[:, 0, :] + Returns: + A sparse tensor representation of the labels. 
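Before the implementation below, a concrete instance of this dense-to-sparse mapping: zero-padded rows become a `tf.SparseTensor` that keeps only the first `label_lengths[i]` entries of each row. The label values here are hypothetical; the sketch assumes eager TF 2.x:

```python
import tensorflow as tf
from tensorflow.keras import backend as K

labels = tf.constant([[1, 2, 0], [3, 0, 0]], dtype=tf.int32)  # zero-padded
label_lengths = tf.constant([2, 1], dtype=tf.int32)

sparse = K.ctc_label_dense_to_sparse(labels, label_lengths)
print(sparse.indices.numpy())  # [[0 0] [0 1] [1 0]]
print(sparse.values.numpy())   # [1 2 3]
```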
+ """ + label_shape = tf.shape(labels) + num_batches_tns = tf.stack([label_shape[0]]) + max_num_labels_tns = tf.stack([label_shape[1]]) + + def range_less_than(old_input, current_input): + return tf.expand_dims(tf.range(tf.shape(old_input)[1]), 0) < tf.fill( + max_num_labels_tns, current_input + ) + + init = tf.cast(tf.fill([1, label_shape[1]], 0), tf.bool) + dense_mask = tf.compat.v1.scan( + range_less_than, label_lengths, initializer=init, parallel_iterations=1 + ) + dense_mask = dense_mask[:, 0, :] - label_array = tf.reshape( - tf.tile(tf.range(0, label_shape[1]), num_batches_tns), - label_shape) - label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask) + label_array = tf.reshape( + tf.tile(tf.range(0, label_shape[1]), num_batches_tns), label_shape + ) + label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask) - batch_array = tf.compat.v1.transpose( - tf.reshape( - tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), - reverse(label_shape, 0))) - batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask) - indices = tf.compat.v1.transpose( - tf.reshape(concatenate([batch_ind, label_ind], axis=0), [2, -1])) + batch_array = tf.compat.v1.transpose( + tf.reshape( + tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), + reverse(label_shape, 0), + ) + ) + batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask) + indices = tf.compat.v1.transpose( + tf.reshape(concatenate([batch_ind, label_ind], axis=0), [2, -1]) + ) - vals_sparse = tf.compat.v1.gather_nd(labels, indices) + vals_sparse = tf.compat.v1.gather_nd(labels, indices) - return tf.SparseTensor( - tf.cast(indices, tf.int64), vals_sparse, - tf.cast(label_shape, tf.int64)) + return tf.SparseTensor( + tf.cast(indices, tf.int64), vals_sparse, tf.cast(label_shape, tf.int64) + ) -@keras_export('keras.backend.ctc_batch_cost') +@keras_export("keras.backend.ctc_batch_cost") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def ctc_batch_cost(y_true, y_pred, input_length, label_length): - """Runs CTC loss algorithm on each batch element. - - Args: - y_true: tensor `(samples, max_string_length)` - containing the truth labels. - y_pred: tensor `(samples, time_steps, num_categories)` - containing the prediction, or output of the softmax. - input_length: tensor `(samples, 1)` containing the sequence length for - each batch item in `y_pred`. - label_length: tensor `(samples, 1)` containing the sequence length for - each batch item in `y_true`. - - Returns: - Tensor with shape (samples,1) containing the - CTC loss of each element. - """ - label_length = tf.cast( - tf.squeeze(label_length, axis=-1), tf.int32) - input_length = tf.cast( - tf.squeeze(input_length, axis=-1), tf.int32) - sparse_labels = tf.cast( - ctc_label_dense_to_sparse(y_true, label_length), tf.int32) - - y_pred = tf.math.log(tf.compat.v1.transpose(y_pred, perm=[1, 0, 2]) + epsilon()) - - return tf.expand_dims( - tf.compat.v1.nn.ctc_loss( - inputs=y_pred, labels=sparse_labels, sequence_length=input_length), 1) - - -@keras_export('keras.backend.ctc_decode') + """Runs CTC loss algorithm on each batch element. + + Args: + y_true: tensor `(samples, max_string_length)` + containing the truth labels. + y_pred: tensor `(samples, time_steps, num_categories)` + containing the prediction, or output of the softmax. + input_length: tensor `(samples, 1)` containing the sequence length for + each batch item in `y_pred`. + label_length: tensor `(samples, 1)` containing the sequence length for + each batch item in `y_true`. 
+ + Returns: + Tensor with shape (samples,1) containing the + CTC loss of each element. + """ + label_length = tf.cast(tf.squeeze(label_length, axis=-1), tf.int32) + input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32) + sparse_labels = tf.cast( + ctc_label_dense_to_sparse(y_true, label_length), tf.int32 + ) + + y_pred = tf.math.log( + tf.compat.v1.transpose(y_pred, perm=[1, 0, 2]) + epsilon() + ) + + return tf.expand_dims( + tf.compat.v1.nn.ctc_loss( + inputs=y_pred, labels=sparse_labels, sequence_length=input_length + ), + 1, + ) + + +@keras_export("keras.backend.ctc_decode") @tf.__internal__.dispatch.add_dispatch_support @doc_controls.do_not_generate_docs def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): - """Decodes the output of a softmax. - - Can use either greedy search (also known as best path) - or a constrained dictionary search. - - Args: - y_pred: tensor `(samples, time_steps, num_categories)` - containing the prediction, or output of the softmax. - input_length: tensor `(samples, )` containing the sequence length for - each batch item in `y_pred`. - greedy: perform much faster best-path search if `true`. - This does not use a dictionary. - beam_width: if `greedy` is `false`: a beam search decoder will be used - with a beam of this width. - top_paths: if `greedy` is `false`, - how many of the most probable paths will be returned. - - Returns: - Tuple: - List: if `greedy` is `true`, returns a list of one element that - contains the decoded sequence. - If `false`, returns the `top_paths` most probable - decoded sequences. - Each decoded sequence has shape (samples, time_steps). - Important: blank labels are returned as `-1`. - Tensor `(top_paths, )` that contains - the log probability of each decoded sequence. - """ - input_shape = shape(y_pred) - num_samples, num_steps = input_shape[0], input_shape[1] - y_pred = tf.math.log(tf.compat.v1.transpose(y_pred, perm=[1, 0, 2]) + epsilon()) - input_length = tf.cast(input_length, tf.int32) - - if greedy: - (decoded, log_prob) = tf.nn.ctc_greedy_decoder( - inputs=y_pred, sequence_length=input_length) - else: - (decoded, log_prob) = tf.compat.v1.nn.ctc_beam_search_decoder( - inputs=y_pred, - sequence_length=input_length, - beam_width=beam_width, - top_paths=top_paths) - decoded_dense = [] - for st in decoded: - st = tf.SparseTensor( - st.indices, st.values, (num_samples, num_steps)) - decoded_dense.append( - tf.sparse.to_dense(sp_input=st, default_value=-1)) - return (decoded_dense, log_prob) + """Decodes the output of a softmax. + + Can use either greedy search (also known as best path) + or a constrained dictionary search. + + Args: + y_pred: tensor `(samples, time_steps, num_categories)` + containing the prediction, or output of the softmax. + input_length: tensor `(samples, )` containing the sequence length for + each batch item in `y_pred`. + greedy: perform much faster best-path search if `true`. + This does not use a dictionary. + beam_width: if `greedy` is `false`: a beam search decoder will be used + with a beam of this width. + top_paths: if `greedy` is `false`, + how many of the most probable paths will be returned. + + Returns: + Tuple: + List: if `greedy` is `true`, returns a list of one element that + contains the decoded sequence. + If `false`, returns the `top_paths` most probable + decoded sequences. + Each decoded sequence has shape (samples, time_steps). + Important: blank labels are returned as `-1`. 
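The `-1` padding convention just noted, shown end to end with the default greedy decoder (same hypothetical shapes as the loss sketch above):

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

y_pred = tf.nn.softmax(tf.random.uniform((4, 12, 10)))
input_length = np.full((4,), 12)

decoded, log_prob = K.ctc_decode(y_pred, input_length, greedy=True)
print(decoded[0].shape)  # (4, 12); unused steps are padded with -1
print(log_prob.shape)    # (4, 1)
```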
+ Tensor `(top_paths, )` that contains + the log probability of each decoded sequence. + """ + input_shape = shape(y_pred) + num_samples, num_steps = input_shape[0], input_shape[1] + y_pred = tf.math.log( + tf.compat.v1.transpose(y_pred, perm=[1, 0, 2]) + epsilon() + ) + input_length = tf.cast(input_length, tf.int32) + + if greedy: + (decoded, log_prob) = tf.nn.ctc_greedy_decoder( + inputs=y_pred, sequence_length=input_length + ) + else: + (decoded, log_prob) = tf.compat.v1.nn.ctc_beam_search_decoder( + inputs=y_pred, + sequence_length=input_length, + beam_width=beam_width, + top_paths=top_paths, + ) + decoded_dense = [] + for st in decoded: + st = tf.SparseTensor(st.indices, st.values, (num_samples, num_steps)) + decoded_dense.append(tf.sparse.to_dense(sp_input=st, default_value=-1)) + return (decoded_dense, log_prob) # HIGH ORDER FUNCTIONS -@keras_export('keras.backend.map_fn') +@keras_export("keras.backend.map_fn") @doc_controls.do_not_generate_docs def map_fn(fn, elems, name=None, dtype=None): - """Map the function fn over the elements elems and return the outputs. + """Map the function fn over the elements elems and return the outputs. - Args: - fn: Callable that will be called upon each element in elems - elems: tensor - name: A string name for the map node in the graph - dtype: Output data type. + Args: + fn: Callable that will be called upon each element in elems + elems: tensor + name: A string name for the map node in the graph + dtype: Output data type. - Returns: - Tensor with dtype `dtype`. - """ - return tf.compat.v1.map_fn(fn, elems, name=name, dtype=dtype) + Returns: + Tensor with dtype `dtype`. + """ + return tf.compat.v1.map_fn(fn, elems, name=name, dtype=dtype) -@keras_export('keras.backend.foldl') +@keras_export("keras.backend.foldl") @doc_controls.do_not_generate_docs def foldl(fn, elems, initializer=None, name=None): - """Reduce elems using fn to combine them from left to right. + """Reduce elems using fn to combine them from left to right. - Args: - fn: Callable that will be called upon each element in elems and an - accumulator, for instance `lambda acc, x: acc + x` - elems: tensor - initializer: The first value used (`elems[0]` in case of None) - name: A string name for the foldl node in the graph + Args: + fn: Callable that will be called upon each element in elems and an + accumulator, for instance `lambda acc, x: acc + x` + elems: tensor + initializer: The first value used (`elems[0]` in case of None) + name: A string name for the foldl node in the graph - Returns: - Tensor with same type and shape as `initializer`. - """ - return tf.compat.v1.foldl(fn, elems, initializer=initializer, name=name) + Returns: + Tensor with same type and shape as `initializer`. + """ + return tf.compat.v1.foldl(fn, elems, initializer=initializer, name=name) -@keras_export('keras.backend.foldr') +@keras_export("keras.backend.foldr") @doc_controls.do_not_generate_docs def foldr(fn, elems, initializer=None, name=None): - """Reduce elems using fn to combine them from right to left. + """Reduce elems using fn to combine them from right to left. 
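These wrappers delegate directly to their `tf.compat.v1` counterparts, so one line each makes the semantics concrete (`foldr`, reformatted next, mirrors `foldl` from the right; values are illustrative):

```python
import tensorflow as tf
from tensorflow.keras import backend as K

elems = tf.constant([1.0, 2.0, 3.0, 4.0])
print(K.map_fn(lambda x: x * x, elems).numpy())        # [ 1.  4.  9. 16.]
print(K.foldl(lambda acc, x: acc + x, elems).numpy())  # 10.0
```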
+ + Args: + fn: Callable that will be called upon each element in elems and an + accumulator, for instance `lambda acc, x: acc + x` + elems: tensor + initializer: The first value used (`elems[-1]` in case of None) + name: A string name for the foldr node in the graph - Args: - fn: Callable that will be called upon each element in elems and an - accumulator, for instance `lambda acc, x: acc + x` - elems: tensor - initializer: The first value used (`elems[-1]` in case of None) - name: A string name for the foldr node in the graph + Returns: + Same type and shape as initializer + """ + return tf.compat.v1.foldr(fn, elems, initializer=initializer, name=name) - Returns: - Same type and shape as initializer - """ - return tf.compat.v1.foldr(fn, elems, initializer=initializer, name=name) # Load Keras default configuration from config file if present. # Set Keras base dir path given KERAS_HOME env variable, if applicable. # Otherwise either ~/.keras or /tmp. -if 'KERAS_HOME' in os.environ: - _keras_dir = os.environ.get('KERAS_HOME') +if "KERAS_HOME" in os.environ: + _keras_dir = os.environ.get("KERAS_HOME") else: - _keras_base_dir = os.path.expanduser('~') - _keras_dir = os.path.join(_keras_base_dir, '.keras') -_config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json')) + _keras_base_dir = os.path.expanduser("~") + _keras_dir = os.path.join(_keras_base_dir, ".keras") +_config_path = os.path.expanduser(os.path.join(_keras_dir, "keras.json")) if os.path.exists(_config_path): - try: - with open(_config_path) as fh: - _config = json.load(fh) - except ValueError: - _config = {} - _floatx = _config.get('floatx', floatx()) - assert _floatx in {'float16', 'float32', 'float64'} - _epsilon = _config.get('epsilon', epsilon()) - assert isinstance(_epsilon, float) - _image_data_format = _config.get('image_data_format', image_data_format()) - assert _image_data_format in {'channels_last', 'channels_first'} - set_floatx(_floatx) - set_epsilon(_epsilon) - set_image_data_format(_image_data_format) + try: + with open(_config_path) as fh: + _config = json.load(fh) + except ValueError: + _config = {} + _floatx = _config.get("floatx", floatx()) + assert _floatx in {"float16", "float32", "float64"} + _epsilon = _config.get("epsilon", epsilon()) + assert isinstance(_epsilon, float) + _image_data_format = _config.get("image_data_format", image_data_format()) + assert _image_data_format in {"channels_last", "channels_first"} + set_floatx(_floatx) + set_epsilon(_epsilon) + set_image_data_format(_image_data_format) # Save config file. if not os.path.exists(_keras_dir): - try: - os.makedirs(_keras_dir) - except OSError: - # Except permission denied and potential race conditions - # in multi-threaded environments. - pass + try: + os.makedirs(_keras_dir) + except OSError: + # Except permission denied and potential race conditions + # in multi-threaded environments. + pass if not os.path.exists(_config_path): - _config = { - 'floatx': floatx(), - 'epsilon': epsilon(), - 'backend': 'tensorflow', - 'image_data_format': image_data_format() - } - try: - with open(_config_path, 'w') as f: - f.write(json.dumps(_config, indent=4)) - except IOError: - # Except permission denied. - pass + _config = { + "floatx": floatx(), + "epsilon": epsilon(), + "backend": "tensorflow", + "image_data_format": image_data_format(), + } + try: + with open(_config_path, "w") as f: + f.write(json.dumps(_config, indent=4)) + except IOError: + # Except permission denied. 
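The two blocks above persist exactly four defaults to `keras.json`; reading the file back looks like this (a sketch, assuming no `KERAS_HOME` override and that the file has already been written):

```python
import json
import os

path = os.path.join(os.path.expanduser("~"), ".keras", "keras.json")
with open(path) as fh:
    print(json.load(fh))
# e.g. {'floatx': 'float32', 'epsilon': 1e-07,
#       'backend': 'tensorflow', 'image_data_format': 'channels_last'}
```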
+ pass def configure_and_create_distributed_session(distribution_strategy): - """Configure session config and create a session with it.""" - - def _create_session(distribution_strategy): - """Create the Distributed Strategy session.""" - session_config = get_default_session_config() - - # If a session already exists, merge in its config; in the case there is a - # conflict, take values of the existing config. - global _SESSION - if getattr(_SESSION, 'session', None) and _SESSION.session._config: - session_config.MergeFrom(_SESSION.session._config) - - if is_tpu_strategy(distribution_strategy): - # TODO(priyag, yuefengz): Remove this workaround when Distribute - # Coordinator is integrated with keras and we can create a session from - # there. - distribution_strategy.configure(session_config) - master = distribution_strategy.extended._tpu_cluster_resolver.master() # pylint: disable=protected-access - session = tf.compat.v1.Session(config=session_config, target=master) + """Configure session config and create a session with it.""" + + def _create_session(distribution_strategy): + """Create the Distributed Strategy session.""" + session_config = get_default_session_config() + + # If a session already exists, merge in its config; in the case there is + # a conflict, take values of the existing config. + global _SESSION + if getattr(_SESSION, "session", None) and _SESSION.session._config: + session_config.MergeFrom(_SESSION.session._config) + + if is_tpu_strategy(distribution_strategy): + # TODO(priyag, yuefengz): Remove this workaround when Distribute + # Coordinator is integrated with keras and we can create a session + # from there. + distribution_strategy.configure(session_config) + master = ( + distribution_strategy.extended._tpu_cluster_resolver.master() + ) + session = tf.compat.v1.Session(config=session_config, target=master) + else: + worker_context = dc.get_current_worker_context() + if worker_context: + dc_session_config = worker_context.session_config + # Merge the default session config to the one from distribute + # coordinator, which is fine for now since they don't have + # conflicting configurations. + dc_session_config.MergeFrom(session_config) + session = tf.compat.v1.Session( + config=dc_session_config, + target=worker_context.master_target, + ) + else: + distribution_strategy.configure(session_config) + session = tf.compat.v1.Session(config=session_config) + + set_session(session) + + if distribution_strategy.extended._in_multi_worker_mode(): + dc.run_distribute_coordinator(_create_session, distribution_strategy) else: - worker_context = dc.get_current_worker_context() - if worker_context: - dc_session_config = worker_context.session_config - # Merge the default session config to the one from distribute - # coordinator, which is fine for now since they don't have - # conflicting configurations. 
- dc_session_config.MergeFrom(session_config) - session = tf.compat.v1.Session( - config=dc_session_config, target=worker_context.master_target) - else: - distribution_strategy.configure(session_config) - session = tf.compat.v1.Session(config=session_config) - - set_session(session) - - if distribution_strategy.extended._in_multi_worker_mode(): - dc.run_distribute_coordinator( - _create_session, - distribution_strategy) - else: - _create_session(distribution_strategy) + _create_session(distribution_strategy) def _is_tpu_strategy_class(clz): - is_tpu_strat = lambda k: k.__name__.startswith('TPUStrategy') - if is_tpu_strat(clz): - return True - return py_any(map(_is_tpu_strategy_class, clz.__bases__)) + is_tpu_strat = lambda k: k.__name__.startswith("TPUStrategy") + if is_tpu_strat(clz): + return True + return py_any(map(_is_tpu_strategy_class, clz.__bases__)) def is_tpu_strategy(strategy): - """Returns whether input is a TPUStrategy instance or subclass instance.""" - return _is_tpu_strategy_class(strategy.__class__) - - -def cast_variables_to_tensor(tensors): - - def _cast_variables_to_tensor(tensor): - if isinstance(tensor, tf.Variable): - return tf.identity(tensor) - return tensor - - return tf.nest.map_structure(_cast_variables_to_tensor, tensors) + """Returns whether input is a TPUStrategy instance or subclass instance.""" + return _is_tpu_strategy_class(strategy.__class__) def _is_symbolic_tensor(x): - return tf.is_tensor(x) and not isinstance(x, tf.__internal__.EagerTensor) + return tf.is_tensor(x) and not isinstance(x, tf.__internal__.EagerTensor) def convert_inputs_if_ragged(inputs): - """Converts any ragged tensors to dense.""" - - def _convert_ragged_input(inputs): - if isinstance(inputs, tf.RaggedTensor): - return inputs.to_tensor() - return inputs + """Converts any ragged tensors to dense.""" - flat_inputs = tf.nest.flatten(inputs) - contains_ragged = py_any( - isinstance(i, tf.RaggedTensor) for i in flat_inputs) + def _convert_ragged_input(inputs): + if isinstance(inputs, tf.RaggedTensor): + return inputs.to_tensor() + return inputs - if not contains_ragged: - return inputs, None + flat_inputs = tf.nest.flatten(inputs) + contains_ragged = py_any( + isinstance(i, tf.RaggedTensor) for i in flat_inputs + ) - inputs = tf.nest.map_structure(_convert_ragged_input, inputs) - # Multiple mask are not yet supported, so one mask is used on all inputs. - # We approach this similarly when using row lengths to ignore steps. - nested_row_lengths = tf.cast(flat_inputs[0].nested_row_lengths()[0], - 'int32') - return inputs, nested_row_lengths + if not contains_ragged: + return inputs, None + inputs = tf.nest.map_structure(_convert_ragged_input, inputs) + # Multiple mask are not yet supported, so one mask is used on all inputs. + # We approach this similarly when using row lengths to ignore steps. + nested_row_lengths = tf.cast( + flat_inputs[0].nested_row_lengths()[0], "int32" + ) + return inputs, nested_row_lengths -def maybe_convert_to_ragged(is_ragged_input, output, nested_row_lengths, - go_backwards=False): - """Converts any ragged input back to its initial structure.""" - if not is_ragged_input: - return output - if go_backwards: - # Reverse based on the timestep dim, so that nested_row_lengths will mask - # from the correct direction. Return the reverse ragged tensor. 
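A round trip through the two ragged helpers: `convert_inputs_if_ragged` densifies the input and records its row lengths, and `maybe_convert_to_ragged` (reformatted just below) restores the original structure. A sketch, assuming the internal `keras.backend` module is importable, since neither helper is exported:

```python
import tensorflow as tf
from keras import backend

ragged = tf.ragged.constant([[1.0, 2.0, 3.0], [4.0]])
dense, row_lengths = backend.convert_inputs_if_ragged(ragged)
print(dense.shape)          # (2, 3), zero-padded
print(row_lengths.numpy())  # [3 1]

restored = backend.maybe_convert_to_ragged(True, dense, row_lengths)
print(restored)  # <tf.RaggedTensor [[1.0, 2.0, 3.0], [4.0]]>
```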
- output = reverse(output, [1]) - ragged = tf.RaggedTensor.from_tensor(output, nested_row_lengths) - return reverse(ragged, [1]) - else: - return tf.RaggedTensor.from_tensor(output, nested_row_lengths) +def maybe_convert_to_ragged( + is_ragged_input, output, nested_row_lengths, go_backwards=False +): + """Converts any ragged input back to its initial structure.""" + if not is_ragged_input: + return output + + if go_backwards: + # Reverse based on the timestep dim, so that nested_row_lengths will + # mask from the correct direction. Return the reverse ragged tensor. + output = reverse(output, [1]) + ragged = tf.RaggedTensor.from_tensor(output, nested_row_lengths) + return reverse(ragged, [1]) + else: + return tf.RaggedTensor.from_tensor(output, nested_row_lengths) class ContextValueCache(weakref.WeakKeyDictionary): - """Container that caches (possibly tensor) values based on the context. - - This class is similar to defaultdict, where values may be produced by the - default factory specified during initialization. This class also has a default - value for the key (when key is `None`) -- the key is set to the current graph - or eager context. The default factories for key and value are only used in - `__getitem__` and `setdefault`. The `.get()` behavior remains the same. - - This object will return the value of the current graph or closest parent graph - if the current graph is a function. This is to reflect the fact that if a - tensor is created in eager/graph, child functions may capture that tensor. - - The default factory method may accept keyword arguments (unlike defaultdict, - which only accepts callables with 0 arguments). To pass keyword arguments to - `default_factory`, use the `setdefault` method instead of `__getitem__`. - - An example of how this class can be used in different contexts: - - ``` - cache = ContextValueCache(int) - - # Eager mode - cache[None] += 2 - cache[None] += 4 - assert cache[None] == 6 - - # Graph mode - with tf.Graph().as_default() as g: - cache[None] += 5 - cache[g] += 3 - assert cache[g] == 8 - ``` - - Example of a default factory with arguments: - - ``` - cache = ContextValueCache(lambda x: x + 1) - g = tf.get_default_graph() - - # Example with keyword argument. - value = cache.setdefault(key=g, kwargs={'x': 3}) - assert cache[g] == 4 - ``` - """ + """Container that caches (possibly tensor) values based on the context. + + This class is similar to defaultdict, where values may be produced by the + default factory specified during initialization. This class also has a + default value for the key (when key is `None`) -- the key is set to the + current graph or eager context. The default factories for key and value are + only used in `__getitem__` and `setdefault`. The `.get()` behavior remains + the same. + + This object will return the value of the current graph or closest parent + graph if the current graph is a function. This is to reflect the fact that + if a tensor is created in eager/graph, child functions may capture that + tensor. + + The default factory method may accept keyword arguments (unlike defaultdict, + which only accepts callables with 0 arguments). To pass keyword arguments to + `default_factory`, use the `setdefault` method instead of `__getitem__`. 
+ + An example of how this class can be used in different contexts: + + ``` + cache = ContextValueCache(int) + + # Eager mode + cache[None] += 2 + cache[None] += 4 + assert cache[None] == 6 + + # Graph mode + with tf.Graph().as_default() as g: + cache[None] += 5 + cache[g] += 3 + assert cache[g] == 8 + ``` + + Example of a default factory with arguments: + + ``` + cache = ContextValueCache(lambda x: x + 1) + g = tf.get_default_graph() + + # Example with keyword argument. + value = cache.setdefault(key=g, kwargs={'x': 3}) + assert cache[g] == 4 + ``` + """ - def __init__(self, default_factory): - self.default_factory = default_factory - weakref.WeakKeyDictionary.__init__(self) + def __init__(self, default_factory): + self.default_factory = default_factory + weakref.WeakKeyDictionary.__init__(self) - def _key(self): - if tf.executing_eagerly(): - return _DUMMY_EAGER_GRAPH.key - else: - return tf.compat.v1.get_default_graph() - - def _get_parent_graph(self, graph): - """Returns the parent graph or dummy eager object.""" - # TODO(b/149317164): Currently FuncGraphs use ops.get_default_graph() as the - # outer graph. This results in outer_graph always being a Graph, - # even in eager mode (get_default_graph will create a new Graph if there - # isn't a default graph). Because of this bug, we have to specially set the - # key when eager execution is enabled. - parent_graph = graph.outer_graph - if (not isinstance(parent_graph, tf.__internal__.FuncGraph) and - tf.compat.v1.executing_eagerly_outside_functions()): - return _DUMMY_EAGER_GRAPH.key - return parent_graph - - def _get_recursive(self, key): - """Gets the value at key or the closest parent graph.""" - value = self.get(key) - if value is not None: - return value - - # Since FuncGraphs are able to capture tensors and variables from their - # parent graphs, recursively search to see if there is a value stored for - # one of the parent graphs. - if isinstance(key, tf.__internal__.FuncGraph): - return self._get_recursive(self._get_parent_graph(key)) - return None + def _key(self): + if tf.executing_eagerly(): + return _DUMMY_EAGER_GRAPH.key + else: + return tf.compat.v1.get_default_graph() + + def _get_parent_graph(self, graph): + """Returns the parent graph or dummy eager object.""" + # TODO(b/149317164): Currently FuncGraphs use ops.get_default_graph() as + # the outer graph. This results in outer_graph always being a Graph, + # even in eager mode (get_default_graph will create a new Graph if there + # isn't a default graph). Because of this bug, we have to specially set + # the key when eager execution is enabled. + parent_graph = graph.outer_graph + if ( + not isinstance(parent_graph, tf.__internal__.FuncGraph) + and tf.compat.v1.executing_eagerly_outside_functions() + ): + return _DUMMY_EAGER_GRAPH.key + return parent_graph + + def _get_recursive(self, key): + """Gets the value at key or the closest parent graph.""" + value = self.get(key) + if value is not None: + return value + + # Since FuncGraphs are able to capture tensors and variables from their + # parent graphs, recursively search to see if there is a value stored + # for one of the parent graphs. + if isinstance(key, tf.__internal__.FuncGraph): + return self._get_recursive(self._get_parent_graph(key)) + return None + + def __getitem__(self, key): + """Gets the value at key (or current context), or sets default value. - def __getitem__(self, key): - """Gets the value at key (or current context), or sets default value. + Args: + key: May be `None` or `Graph`object. 
When `None`, the key is set to + the current context. - Args: - key: May be `None` or `Graph`object. When `None`, the key is set to the - current context. + Returns: + Either the cached or default value. + """ + if key is None: + key = self._key() - Returns: - Either the cached or default value. - """ - if key is None: - key = self._key() + value = self._get_recursive(key) + if value is None: + value = self[key] = self.default_factory() + return value - value = self._get_recursive(key) - if value is None: - value = self[key] = self.default_factory() # pylint:disable=not-callable - return value + def setdefault(self, key=None, default=None, kwargs=None): + """Sets the default value if key is not in dict, and returns the + value.""" + if key is None: + key = self._key() + kwargs = kwargs or {} - def setdefault(self, key=None, default=None, kwargs=None): - """Sets the default value if key is not in dict, and returns the value.""" - if key is None: - key = self._key() - kwargs = kwargs or {} + if default is None and key not in self: + default = self.default_factory(**kwargs) + return weakref.WeakKeyDictionary.setdefault(self, key, default) - if default is None and key not in self: - default = self.default_factory(**kwargs) - return weakref.WeakKeyDictionary.setdefault(self, key, default) # This dictionary holds a mapping {graph: learning_phase}. In eager mode, a # dummy object is used. # A learning phase is a bool tensor used to run Keras models in # either train mode (learning_phase == 1) or test mode (learning_phase == 0). _GRAPH_LEARNING_PHASES = ContextValueCache( - object_identity.ObjectIdentityWeakSet) + object_identity.ObjectIdentityWeakSet +) # This dictionary holds a mapping between a graph and variables to initialize # in the graph. diff --git a/keras/backend_config.py b/keras/backend_config.py index a1e64fac4b2d..948cec331849 100644 --- a/keras/backend_config.py +++ b/keras/backend_config.py @@ -15,138 +15,143 @@ """Keras backend config API.""" import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.util.tf_export import keras_export # The type of float to use throughout a session. -_FLOATX = 'float32' +_FLOATX = "float32" # Epsilon fuzz factor used throughout the codebase. _EPSILON = 1e-7 # Default image data format, one of "channels_last", "channels_first". -_IMAGE_DATA_FORMAT = 'channels_last' +_IMAGE_DATA_FORMAT = "channels_last" -@keras_export('keras.backend.epsilon') +@keras_export("keras.backend.epsilon") @tf.__internal__.dispatch.add_dispatch_support def epsilon(): - """Returns the value of the fuzz factor used in numeric expressions. + """Returns the value of the fuzz factor used in numeric expressions. - Returns: - A float. + Returns: + A float. - Example: - >>> tf.keras.backend.epsilon() - 1e-07 - """ - return _EPSILON + Example: + >>> tf.keras.backend.epsilon() + 1e-07 + """ + return _EPSILON -@keras_export('keras.backend.set_epsilon') +@keras_export("keras.backend.set_epsilon") def set_epsilon(value): - """Sets the value of the fuzz factor used in numeric expressions. + """Sets the value of the fuzz factor used in numeric expressions. - Args: - value: float. New value of epsilon. + Args: + value: float. New value of epsilon. 
- Example: - >>> tf.keras.backend.epsilon() - 1e-07 - >>> tf.keras.backend.set_epsilon(1e-5) - >>> tf.keras.backend.epsilon() - 1e-05 - >>> tf.keras.backend.set_epsilon(1e-7) - """ - global _EPSILON - _EPSILON = value + Example: + >>> tf.keras.backend.epsilon() + 1e-07 + >>> tf.keras.backend.set_epsilon(1e-5) + >>> tf.keras.backend.epsilon() + 1e-05 + >>> tf.keras.backend.set_epsilon(1e-7) + """ + global _EPSILON + _EPSILON = value -@keras_export('keras.backend.floatx') +@keras_export("keras.backend.floatx") def floatx(): - """Returns the default float type, as a string. + """Returns the default float type, as a string. - E.g. `'float16'`, `'float32'`, `'float64'`. + E.g. `'float16'`, `'float32'`, `'float64'`. - Returns: - String, the current default float type. + Returns: + String, the current default float type. - Example: - >>> tf.keras.backend.floatx() - 'float32' - """ - return _FLOATX + Example: + >>> tf.keras.backend.floatx() + 'float32' + """ + return _FLOATX -@keras_export('keras.backend.set_floatx') +@keras_export("keras.backend.set_floatx") def set_floatx(value): - """Sets the default float type. - - Note: It is not recommended to set this to float16 for training, as this will - likely cause numeric stability issues. Instead, mixed precision, which is - using a mix of float16 and float32, can be used by calling - `tf.keras.mixed_precision.set_global_policy('mixed_float16')`. See the - [mixed precision guide]( - https://www.tensorflow.org/guide/keras/mixed_precision) for details. - - Args: - value: String; `'float16'`, `'float32'`, or `'float64'`. - - Example: - >>> tf.keras.backend.floatx() - 'float32' - >>> tf.keras.backend.set_floatx('float64') - >>> tf.keras.backend.floatx() - 'float64' - >>> tf.keras.backend.set_floatx('float32') - - Raises: - ValueError: In case of invalid value. - """ - global _FLOATX - accepted_dtypes = {'float16', 'float32', 'float64'} - if value not in accepted_dtypes: - raise ValueError( - f'Unknown `floatx` value: {value}. Expected one of {accepted_dtypes}') - _FLOATX = str(value) - - -@keras_export('keras.backend.image_data_format') + """Sets the default float type. + + Note: It is not recommended to set this to float16 for training, as this + will likely cause numeric stability issues. Instead, mixed precision, which + is using a mix of float16 and float32, can be used by calling + `tf.keras.mixed_precision.set_global_policy('mixed_float16')`. See the + [mixed precision guide]( + https://www.tensorflow.org/guide/keras/mixed_precision) for details. + + Args: + value: String; `'float16'`, `'float32'`, or `'float64'`. + + Example: + >>> tf.keras.backend.floatx() + 'float32' + >>> tf.keras.backend.set_floatx('float64') + >>> tf.keras.backend.floatx() + 'float64' + >>> tf.keras.backend.set_floatx('float32') + + Raises: + ValueError: In case of invalid value. + """ + global _FLOATX + accepted_dtypes = {"float16", "float32", "float64"} + if value not in accepted_dtypes: + raise ValueError( + f"Unknown `floatx` value: {value}. " + f"Expected one of {accepted_dtypes}" + ) + _FLOATX = str(value) + + +@keras_export("keras.backend.image_data_format") @tf.__internal__.dispatch.add_dispatch_support def image_data_format(): - """Returns the default image data format convention. + """Returns the default image data format convention. 
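Backing up to the `set_floatx` note above: it steers float16 users toward mixed precision instead. A minimal sketch of that route, assuming the TF 2.4+ `tf.keras.mixed_precision` API:

```python
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("mixed_float16")
layer = tf.keras.layers.Dense(8)
print(layer.compute_dtype)  # float16: computations run in half precision
print(layer.dtype)          # float32: variables stay full precision
tf.keras.mixed_precision.set_global_policy("float32")  # restore the default
```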
- Returns: - A string, either `'channels_first'` or `'channels_last'` + Returns: + A string, either `'channels_first'` or `'channels_last'` - Example: - >>> tf.keras.backend.image_data_format() - 'channels_last' - """ - return _IMAGE_DATA_FORMAT + Example: + >>> tf.keras.backend.image_data_format() + 'channels_last' + """ + return _IMAGE_DATA_FORMAT -@keras_export('keras.backend.set_image_data_format') +@keras_export("keras.backend.set_image_data_format") def set_image_data_format(data_format): - """Sets the value of the image data format convention. - - Args: - data_format: string. `'channels_first'` or `'channels_last'`. - - Example: - >>> tf.keras.backend.image_data_format() - 'channels_last' - >>> tf.keras.backend.set_image_data_format('channels_first') - >>> tf.keras.backend.image_data_format() - 'channels_first' - >>> tf.keras.backend.set_image_data_format('channels_last') - - Raises: - ValueError: In case of invalid `data_format` value. - """ - global _IMAGE_DATA_FORMAT - accepted_formats = {'channels_last', 'channels_first'} - if data_format not in accepted_formats: - raise ValueError( - f'Unknown `data_format`: {data_format}. ' - f'Expected one of {accepted_formats}') - _IMAGE_DATA_FORMAT = str(data_format) + """Sets the value of the image data format convention. + + Args: + data_format: string. `'channels_first'` or `'channels_last'`. + + Example: + >>> tf.keras.backend.image_data_format() + 'channels_last' + >>> tf.keras.backend.set_image_data_format('channels_first') + >>> tf.keras.backend.image_data_format() + 'channels_first' + >>> tf.keras.backend.set_image_data_format('channels_last') + + Raises: + ValueError: In case of invalid `data_format` value. + """ + global _IMAGE_DATA_FORMAT + accepted_formats = {"channels_last", "channels_first"} + if data_format not in accepted_formats: + raise ValueError( + f"Unknown `data_format`: {data_format}. 
" + f"Expected one of {accepted_formats}" + ) + _IMAGE_DATA_FORMAT = str(data_format) diff --git a/keras/backend_config_test.py b/keras/backend_config_test.py index e7e9dfd5bf39..5e8e9e2c0359 100644 --- a/keras/backend_config_test.py +++ b/keras/backend_config_test.py @@ -21,33 +21,32 @@ from keras.testing_infra import test_combinations -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class BackendConfigTest(tf.test.TestCase): - - def test_backend(self): - self.assertEqual(backend.backend(), 'tensorflow') - - def test_epsilon(self): - epsilon = 1e-2 - backend_config.set_epsilon(epsilon) - self.assertEqual(backend_config.epsilon(), epsilon) - backend_config.set_epsilon(1e-7) - self.assertEqual(backend_config.epsilon(), 1e-7) - - def test_floatx(self): - floatx = 'float64' - backend_config.set_floatx(floatx) - self.assertEqual(backend_config.floatx(), floatx) - backend_config.set_floatx('float32') - self.assertEqual(backend_config.floatx(), 'float32') - - def test_image_data_format(self): - image_data_format = 'channels_first' - backend_config.set_image_data_format(image_data_format) - self.assertEqual(backend_config.image_data_format(), image_data_format) - backend_config.set_image_data_format('channels_last') - self.assertEqual(backend_config.image_data_format(), 'channels_last') - - -if __name__ == '__main__': - tf.test.main() + def test_backend(self): + self.assertEqual(backend.backend(), "tensorflow") + + def test_epsilon(self): + epsilon = 1e-2 + backend_config.set_epsilon(epsilon) + self.assertEqual(backend_config.epsilon(), epsilon) + backend_config.set_epsilon(1e-7) + self.assertEqual(backend_config.epsilon(), 1e-7) + + def test_floatx(self): + floatx = "float64" + backend_config.set_floatx(floatx) + self.assertEqual(backend_config.floatx(), floatx) + backend_config.set_floatx("float32") + self.assertEqual(backend_config.floatx(), "float32") + + def test_image_data_format(self): + image_data_format = "channels_first" + backend_config.set_image_data_format(image_data_format) + self.assertEqual(backend_config.image_data_format(), image_data_format) + backend_config.set_image_data_format("channels_last") + self.assertEqual(backend_config.image_data_format(), "channels_last") + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/backend_test.py b/keras/backend_test.py index cee51d964743..b47ca213d225 100644 --- a/keras/backend_test.py +++ b/keras/backend_test.py @@ -14,2504 +14,3159 @@ # ============================================================================== """Tests for Keras backend.""" -import tensorflow.compat.v2 as tf - import gc import warnings -from absl.testing import parameterized import numpy as np import scipy.sparse -from tensorflow.python.eager import context -from tensorflow.python.eager.context import get_config -from tensorflow.python.framework import test_util as tf_test_utils +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras import activations from keras import backend -from keras.testing_infra import test_combinations from keras.engine import input_layer from keras.layers import activation from keras.layers.normalization import batch_normalization_v1 +from keras.testing_infra import test_combinations +from keras.utils import losses_utils from keras.utils import tf_inspect from keras.utils import tf_utils - -def compare_single_input_op_to_numpy(keras_op, - np_op, - input_shape, - dtype='float32', - 
negative_values=True, - keras_args=None, - keras_kwargs=None, - np_args=None, - np_kwargs=None): - keras_args = keras_args or [] - keras_kwargs = keras_kwargs or {} - np_args = np_args or [] - np_kwargs = np_kwargs or {} - inputs = 2. * np.random.random(input_shape) - if negative_values: - inputs -= 1. - keras_output = keras_op( - backend.variable(inputs, dtype=dtype), *keras_args, **keras_kwargs) - keras_output = backend.eval(keras_output) - np_output = np_op(inputs.astype(dtype), *np_args, **np_kwargs) - try: - np.testing.assert_allclose(keras_output, np_output, atol=1e-4) - except AssertionError: - raise AssertionError('Test for op `' + str(keras_op.__name__) + '` failed; ' - 'Expected ' + str(np_output) + ' but got ' + - str(keras_output)) - - -def compare_two_inputs_op_to_numpy(keras_op, - np_op, - input_shape_a, - input_shape_b, - dtype='float32', - keras_args=None, - keras_kwargs=None, - np_args=None, - np_kwargs=None): - keras_args = keras_args or [] - keras_kwargs = keras_kwargs or {} - np_args = np_args or [] - np_kwargs = np_kwargs or {} - input_a = np.random.random(input_shape_a) - input_b = np.random.random(input_shape_b) - keras_output = keras_op( - backend.variable(input_a, dtype=dtype), - backend.variable(input_b, dtype=dtype), *keras_args, **keras_kwargs) - keras_output = backend.eval(keras_output) - np_output = np_op( - input_a.astype(dtype), input_b.astype(dtype), *np_args, **np_kwargs) - try: - np.testing.assert_allclose(keras_output, np_output, atol=1e-4) - except AssertionError: - raise AssertionError('Test for op `' + str(keras_op.__name__) + '` failed; ' - 'Expected ' + str(np_output) + ' but got ' + - str(keras_output)) +# isort: off +from tensorflow.python.eager import context +from tensorflow.python.eager.context import get_config +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + + +def compare_single_input_op_to_numpy( + keras_op, + np_op, + input_shape, + dtype="float32", + negative_values=True, + keras_args=None, + keras_kwargs=None, + np_args=None, + np_kwargs=None, +): + keras_args = keras_args or [] + keras_kwargs = keras_kwargs or {} + np_args = np_args or [] + np_kwargs = np_kwargs or {} + inputs = 2.0 * np.random.random(input_shape) + if negative_values: + inputs -= 1.0 + keras_output = keras_op( + backend.variable(inputs, dtype=dtype), *keras_args, **keras_kwargs + ) + keras_output = backend.eval(keras_output) + np_output = np_op(inputs.astype(dtype), *np_args, **np_kwargs) + try: + np.testing.assert_allclose(keras_output, np_output, atol=1e-4) + except AssertionError: + raise AssertionError( + "Test for op `" + + str(keras_op.__name__) + + "` failed; Expected " + + str(np_output) + + " but got " + + str(keras_output) + ) + + +def compare_two_inputs_op_to_numpy( + keras_op, + np_op, + input_shape_a, + input_shape_b, + dtype="float32", + keras_args=None, + keras_kwargs=None, + np_args=None, + np_kwargs=None, +): + keras_args = keras_args or [] + keras_kwargs = keras_kwargs or {} + np_args = np_args or [] + np_kwargs = np_kwargs or {} + input_a = np.random.random(input_shape_a) + input_b = np.random.random(input_shape_b) + keras_output = keras_op( + backend.variable(input_a, dtype=dtype), + backend.variable(input_b, dtype=dtype), + *keras_args, + **keras_kwargs, + ) + keras_output = backend.eval(keras_output) + np_output = np_op( + input_a.astype(dtype), input_b.astype(dtype), *np_args, **np_kwargs + ) + try: + np.testing.assert_allclose(keras_output, np_output, atol=1e-4) + except AssertionError: + raise AssertionError( + 
"Test for op `" + + str(keras_op.__name__) + + "` failed; Expected " + + str(np_output) + + " but got " + + str(keras_output) + ) class BackendResetTest(tf.test.TestCase, parameterized.TestCase): - - def test_new_config(self): - # User defined jit setting - tf.config.optimizer.set_jit(False) - sess = backend.get_session() - default_config = get_config() - self.assertEqual( - sess._config.graph_options.optimizer_options.global_jit_level, - default_config.graph_options.optimizer_options.global_jit_level) - backend.clear_session() - - # New session has the same jit setting - sess = backend.get_session() - default_config = get_config() - self.assertEqual( - sess._config.graph_options.optimizer_options.global_jit_level, - default_config.graph_options.optimizer_options.global_jit_level) - backend.clear_session() - - # Change respected - tf.config.optimizer.set_jit(True) - sess = backend.get_session() - default_config = get_config() - self.assertEqual( - sess._config.graph_options.optimizer_options.global_jit_level, - default_config.graph_options.optimizer_options.global_jit_level) - backend.clear_session() - - # We can't use the normal parameterized decorator because the test session - # will block graph clearing. - @parameterized.named_parameters(('_v1', context.graph_mode), - ('_v2', tf.__internal__.eager_context.eager_mode)) - def test_new_graph(self, test_context): - with test_context(): - g_old = backend.get_graph() - backend.clear_session() - g = backend.get_graph() - - assert g_old is not g - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_new_config(self): + # User defined jit setting + tf.config.optimizer.set_jit(False) + sess = backend.get_session() + default_config = get_config() + self.assertEqual( + sess._config.graph_options.optimizer_options.global_jit_level, + default_config.graph_options.optimizer_options.global_jit_level, + ) + backend.clear_session() + + # New session has the same jit setting + sess = backend.get_session() + default_config = get_config() + self.assertEqual( + sess._config.graph_options.optimizer_options.global_jit_level, + default_config.graph_options.optimizer_options.global_jit_level, + ) + backend.clear_session() + + # Change respected + tf.config.optimizer.set_jit(True) + sess = backend.get_session() + default_config = get_config() + self.assertEqual( + sess._config.graph_options.optimizer_options.global_jit_level, + default_config.graph_options.optimizer_options.global_jit_level, + ) + backend.clear_session() + + # We can't use the normal parameterized decorator because the test session + # will block graph clearing. 
+ @parameterized.named_parameters( + ("_v1", context.graph_mode), + ("_v2", tf.__internal__.eager_context.eager_mode), + ) + def test_new_graph(self, test_context): + with test_context(): + g_old = backend.get_graph() + backend.clear_session() + g = backend.get_graph() + + assert g_old is not g + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class BackendUtilsTest(tf.test.TestCase): + def test_backend(self): + self.assertEqual(backend.backend(), "tensorflow") + + def test_get_reset_uids(self): + self.assertEqual(backend.get_uid("foo"), 1) + self.assertEqual(backend.get_uid("foo"), 2) + + backend.reset_uids() + self.assertEqual(backend.get_uid("foo"), 1) + + def test_learning_phase(self): + with self.cached_session() as sess: + with self.assertRaises(ValueError): + backend.set_learning_phase(2) + + # Test running with a learning-phase-consuming layer + with backend.learning_phase_scope(0): + x = input_layer.Input((3,)) + y = batch_normalization_v1.BatchNormalization()(x) + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + sess.run(y, feed_dict={x: np.random.random((2, 3))}) + + def test_get_learning_phase_eager(self): + if not tf.executing_eagerly(): + self.skipTest("Check for eager only.") + # see b/251520266 for more details. + # By default the learning phase should be False + self.assertFalse(backend.learning_phase()) + # Also make sure retrieving the learning phase doesn't set the default + # value + self.assertFalse(backend.global_learning_phase_is_set()) + + with backend.learning_phase_scope(1): + self.assertTrue(backend.learning_phase()) + self.assertTrue(backend.global_learning_phase_is_set()) + + self.assertFalse(backend.global_learning_phase_is_set()) + + def test_learning_phase_name(self): + with backend.name_scope("test_scope"): + # Test that outer name scopes do not affect the learning phase's + # name. 
+ lp = backend.symbolic_learning_phase() + self.assertEqual(lp.name, "keras_learning_phase:0") + + def test_learning_phase_scope(self): + initial_learning_phase = backend.learning_phase() + with backend.learning_phase_scope(1): + self.assertEqual(backend.learning_phase(), 1) + self.assertEqual(backend.learning_phase(), initial_learning_phase) + with backend.learning_phase_scope(0): + self.assertEqual(backend.learning_phase(), 0) + self.assertEqual(backend.learning_phase(), initial_learning_phase) + with self.assertRaises(ValueError): + with backend.learning_phase_scope(None): + pass + self.assertEqual(backend.learning_phase(), initial_learning_phase) + + new_learning_phase = 0 + backend.set_learning_phase(new_learning_phase) + self.assertEqual(backend.learning_phase(), new_learning_phase) + with backend.learning_phase_scope(1): + self.assertEqual(backend.learning_phase(), 1) + self.assertEqual(backend.learning_phase(), new_learning_phase) + + def test_learning_phase_scope_in_graph(self): + initial_learning_phase_outside_graph = backend.learning_phase() + with backend.get_graph().as_default(): + initial_learning_phase_in_graph = backend.learning_phase() + + self.assertEqual( + backend.learning_phase(), initial_learning_phase_outside_graph + ) + with backend.learning_phase_scope(1): + self.assertEqual(backend.learning_phase(), 1) + self.assertEqual( + backend.learning_phase(), initial_learning_phase_outside_graph + ) + + with backend.get_graph().as_default(): + self.assertIs( + backend.learning_phase(), initial_learning_phase_in_graph + ) + + self.assertEqual( + backend.learning_phase(), initial_learning_phase_outside_graph + ) + + def test_int_shape(self): + x = backend.ones(shape=(3, 4)) + self.assertEqual(backend.int_shape(x), (3, 4)) - def test_backend(self): - self.assertEqual(backend.backend(), 'tensorflow') + if not tf.executing_eagerly(): + x = backend.placeholder(shape=(None, 4)) + self.assertEqual(backend.int_shape(x), (None, 4)) + + def test_in_train_phase(self): + y1 = backend.variable(1) + y2 = backend.variable(2) + if tf.executing_eagerly(): + with backend.learning_phase_scope(0): + y_val_test = backend.in_train_phase(y1, y2).numpy() + with backend.learning_phase_scope(1): + y_val_train = backend.in_train_phase(y1, y2).numpy() + else: + y = backend.in_train_phase(y1, y2) + f = backend.function([backend.learning_phase()], [y]) + y_val_test = f([0])[0] + y_val_train = f([1])[0] + self.assertAllClose(y_val_test, 2) + self.assertAllClose(y_val_train, 1) + + def test_is_keras_tensor(self): + x = backend.variable(1) + self.assertEqual(backend.is_keras_tensor(x), False) + x = input_layer.Input(shape=(1,)) + self.assertEqual(backend.is_keras_tensor(x), True) + x = input_layer.Input(shape=(None,), ragged=True) + self.assertEqual(backend.is_keras_tensor(x), True) + x = input_layer.Input(shape=(None, None), sparse=True) + self.assertEqual(backend.is_keras_tensor(x), True) + with self.assertRaises(ValueError): + backend.is_keras_tensor(0) + + def test_stop_gradient(self): + x = backend.variable(1) + y = backend.stop_gradient(x) + if not tf.executing_eagerly(): + self.assertEqual(y.op.name[:12], "StopGradient") - def test_get_reset_uids(self): - self.assertEqual(backend.get_uid('foo'), 1) - self.assertEqual(backend.get_uid('foo'), 2) + xs = [backend.variable(1) for _ in range(3)] + ys = backend.stop_gradient(xs) + if not tf.executing_eagerly(): + for y in ys: + self.assertEqual(y.op.name[:12], "StopGradient") + + def test_placeholder(self): + x = backend.placeholder(shape=(3, 4)) + 
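The `in_train_phase` switch exercised in `test_in_train_phase` above, outside the test harness (an eager-mode sketch):

```python
from keras import backend

one = backend.variable(1.0)
two = backend.variable(2.0)
with backend.learning_phase_scope(1):
    print(backend.in_train_phase(one, two).numpy())  # 1.0, train branch
with backend.learning_phase_scope(0):
    print(backend.in_train_phase(one, two).numpy())  # 2.0, test branch
```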
self.assertEqual(x.shape.as_list(), [3, 4]) + x = backend.placeholder(shape=(3, 4), sparse=True) + self.assertEqual(x.shape.as_list(), [3, 4]) + + def test_is_placeholder(self): + x = backend.placeholder(shape=(1,)) + self.assertEqual(backend.is_placeholder(x), True) + x = backend.variable(1) + self.assertEqual(backend.is_placeholder(x), False) + + def test_print_tensor(self): + # Unfortunately it seems impossible to use `mock` (or any other method) + # to capture stdout when used inside a graph or graph function, thus + # we cannot test correctness. + # The message gets correctly printed in practice. + x = backend.placeholder(shape=()) + y = backend.print_tensor(x, f"eager={tf.executing_eagerly()}") + f = backend.function(x, y) + f(0) + + def test_cast_to_floatx(self): + x = backend.variable(1, dtype="float64") + x = backend.cast_to_floatx(x) + self.assertEqual(x.dtype.name, "float32") + x = backend.cast_to_floatx(2) + self.assertEqual(x.dtype.name, "float32") + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class BackendVariableTest(tf.test.TestCase): + def test_zeros(self): + x = backend.zeros((3, 4)) + val = backend.eval(x) + self.assertAllClose(val, np.zeros((3, 4))) + + def test_ones(self): + x = backend.ones((3, 4)) + val = backend.eval(x) + self.assertAllClose(val, np.ones((3, 4))) + + def test_eye(self): + x = backend.eye(4) + val = backend.eval(x) + self.assertAllClose(val, np.eye(4)) + + def test_zeros_like(self): + x = backend.zeros((3, 4)) + y = backend.zeros_like(x) + val = backend.eval(y) + self.assertAllClose(val, np.zeros((3, 4))) + + def test_ones_like(self): + x = backend.zeros((3, 4)) + y = backend.ones_like(x) + val = backend.eval(y) + self.assertAllClose(val, np.ones((3, 4))) + + def test_random_uniform_variable(self): + x = backend.random_uniform_variable((30, 20), low=1.0, high=2.0, seed=0) + val = backend.eval(x) + self.assertAllClose(val.mean(), 1.5, atol=1e-1) + self.assertAllClose(val.max(), 2.0, atol=1e-1) + self.assertAllClose(val.min(), 1.0, atol=1e-1) + + def test_random_normal_variable(self): + x = backend.random_normal_variable((30, 20), 1.0, 0.5, seed=0) + val = backend.eval(x) + self.assertAllClose(val.mean(), 1.0, atol=1e-1) + self.assertAllClose(val.std(), 0.5, atol=1e-1) + + def test_count_params(self): + x = backend.zeros((4, 5)) + val = backend.count_params(x) + self.assertAllClose(val, 20) + + def test_constant(self): + ref_val = np.random.random((3, 4)).astype("float32") + x = backend.constant(ref_val) + val = backend.eval(x) + self.assertAllClose(val, ref_val) + + def test_sparse_variable(self): + val = scipy.sparse.eye(10) + x = backend.variable(val) + self.assertTrue(isinstance(x, tf.SparseTensor)) + + y = backend.to_dense(x) + self.assertFalse(backend.is_sparse(y)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class BackendLinearAlgebraTest(tf.test.TestCase, parameterized.TestCase): + def test_dot(self): + x = backend.ones(shape=(2, 3)) + y = backend.ones(shape=(3, 4)) + xy = backend.dot(x, y) + self.assertEqual(xy.shape.as_list(), [2, 4]) + + x = backend.ones(shape=(32, 28, 3)) + y = backend.ones(shape=(3, 4)) + xy = backend.dot(x, y) + self.assertEqual(xy.shape.as_list(), [32, 28, 4]) + + @parameterized.parameters( + [(2, 3, 4, 5), (2, 5, 6, 7), (2, 3, 4, 6, 7), (3, 1)], + [(2, 20, 1), (2, 30, 20), (2, 1, 30), (1, 2)], + [(4, 2, 3), (4, 5, 3), (4, 2, 5), (2, 2)], + [(4, 2), (4, 2, 3), (4, 3), (1, 1)], + [(4, 2), (4, 2, 3), (4, 3), 1], + [(4, 2, 3), (4, 3), (4, 2), 
(2, 1)], + ) + def test_batch_dot(self, x_shape, y_shape, output_shape, axes): + x_val = np.random.random(x_shape) + y_val = np.random.random(y_shape) + x = backend.variable(x_val) + y = backend.variable(y_val) + xy = backend.batch_dot(x, y, axes=axes) + self.assertEqual(tuple(xy.shape.as_list()), output_shape) + xy_val = backend.eval(xy) + ref_val = self._reference_batch_dot(x_val, y_val, axes) + self.assertAllClose(xy_val, ref_val, atol=1e-5) + + def _reference_batch_dot(self, x, y, axes): + if isinstance(axes, int): + axes = [axes, axes] + elif isinstance(axes, tuple): + axes = list(axes) + if axes is None: + if y.ndim == 2: + axes = [x.ndim - 1, y.ndim - 1] + else: + axes = [x.ndim - 1, y.ndim - 2] + if axes[0] < 0: + axes[0] += x.ndim + if axes[1] < 0: + axes[1] += y.ndim + result = [] + axes = [axes[0] - 1, axes[1] - 1] + for xi, yi in zip(x, y): + result.append(np.tensordot(xi, yi, axes)) + result = np.array(result) + if result.ndim == 1: + result = np.expand_dims(result, -1) + return result + + def test_reduction_ops(self): + ops_to_test = [ + (backend.max, np.max), + (backend.min, np.min), + (backend.sum, np.sum), + (backend.prod, np.prod), + (backend.var, np.var), + (backend.std, np.std), + (backend.mean, np.mean), + (backend.argmin, np.argmin), + (backend.argmax, np.argmax), + ] + for keras_op, np_op in ops_to_test: + compare_single_input_op_to_numpy( + keras_op, + np_op, + input_shape=(4, 7, 5), + keras_kwargs={"axis": 1}, + np_kwargs={"axis": 1}, + ) + compare_single_input_op_to_numpy( + keras_op, + np_op, + input_shape=(4, 7, 5), + keras_kwargs={"axis": -1}, + np_kwargs={"axis": -1}, + ) + if "keepdims" in tf_inspect.getargspec(keras_op).args: + compare_single_input_op_to_numpy( + keras_op, + np_op, + input_shape=(4, 7, 5), + keras_kwargs={"axis": 1, "keepdims": True}, + np_kwargs={"axis": 1, "keepdims": True}, + ) + + def test_elementwise_ops(self): + ops_to_test = [ + (backend.square, np.square), + (backend.abs, np.abs), + (backend.round, np.round), + (backend.sign, np.sign), + (backend.sin, np.sin), + (backend.cos, np.cos), + (backend.exp, np.exp), + ] + for keras_op, np_op in ops_to_test: + compare_single_input_op_to_numpy( + keras_op, np_op, input_shape=(4, 7) + ) + + ops_to_test = [ + (backend.sqrt, np.sqrt), + (backend.log, np.log), + ] + for keras_op, np_op in ops_to_test: + compare_single_input_op_to_numpy( + keras_op, np_op, input_shape=(4, 7), negative_values=False + ) - backend.reset_uids() - self.assertEqual(backend.get_uid('foo'), 1) + compare_single_input_op_to_numpy( + backend.clip, + np.clip, + input_shape=(6, 4), + keras_kwargs={"min_value": 0.1, "max_value": 2.4}, + np_kwargs={"a_min": 0.1, "a_max": 1.4}, + ) - def test_learning_phase(self): - with self.cached_session() as sess: - with self.assertRaises(ValueError): - backend.set_learning_phase(2) + compare_single_input_op_to_numpy( + backend.pow, + np.power, + input_shape=(6, 4), + keras_args=[3], + np_args=[3], + ) + + def test_two_tensor_ops(self): + ops_to_test = [ + (backend.equal, np.equal), + (backend.not_equal, np.not_equal), + (backend.greater, np.greater), + (backend.greater_equal, np.greater_equal), + (backend.less, np.less), + (backend.less_equal, np.less_equal), + (backend.maximum, np.maximum), + (backend.minimum, np.minimum), + ] + for keras_op, np_op in ops_to_test: + compare_two_inputs_op_to_numpy( + keras_op, np_op, input_shape_a=(4, 7), input_shape_b=(4, 7) + ) + + def test_relu(self): + x = tf.convert_to_tensor([[-4, 0], [2, 7]], "float32") + + # standard relu + relu_op = backend.relu(x) 
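One of the parameterized `batch_dot` cases above, written out concretely (random values; only the shapes matter here):

```python
import numpy as np
from keras import backend

x = backend.variable(np.random.random((2, 20, 1)))
y = backend.variable(np.random.random((2, 30, 20)))
# Contract axis 1 of x (size 20) against axis 2 of y (size 20).
xy = backend.batch_dot(x, y, axes=(1, 2))
print(xy.shape)  # (2, 1, 30), matching the second test case
```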
+ self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 7]]) + + # alpha (leaky relu used) + relu_op = backend.relu(x, alpha=0.5) + if not tf.executing_eagerly(): + self.assertTrue("LeakyRelu" in relu_op.name) + self.assertAllClose(backend.eval(relu_op), [[-2, 0], [2, 7]]) - # Test running with a learning-phase-consuming layer - with backend.learning_phase_scope(0): - x = input_layer.Input((3,)) - y = batch_normalization_v1.BatchNormalization()(x) + # max_value < some elements + relu_op = backend.relu(x, max_value=5.0) + self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 5]]) + + # nn.relu6 used + relu_op = backend.relu(x, max_value=6.0) if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - sess.run(y, feed_dict={x: np.random.random((2, 3))}) - - def test_learning_phase_name(self): - with backend.name_scope('test_scope'): - # Test that outer name scopes do not affect the learning phase's name. - lp = backend.symbolic_learning_phase() - self.assertEqual(lp.name, 'keras_learning_phase:0') - - def test_learning_phase_scope(self): - initial_learning_phase = backend.learning_phase() - with backend.learning_phase_scope(1): - self.assertEqual(backend.learning_phase(), 1) - self.assertEqual(backend.learning_phase(), initial_learning_phase) - with backend.learning_phase_scope(0): - self.assertEqual(backend.learning_phase(), 0) - self.assertEqual(backend.learning_phase(), initial_learning_phase) - with self.assertRaises(ValueError): - with backend.learning_phase_scope(None): - pass - self.assertEqual(backend.learning_phase(), initial_learning_phase) - - new_learning_phase = 0 - backend.set_learning_phase(new_learning_phase) - self.assertEqual(backend.learning_phase(), new_learning_phase) - with backend.learning_phase_scope(1): - self.assertEqual(backend.learning_phase(), 1) - self.assertEqual(backend.learning_phase(), new_learning_phase) - - def test_learning_phase_scope_in_graph(self): - initial_learning_phase_outside_graph = backend.learning_phase() - with backend.get_graph().as_default(): - initial_learning_phase_in_graph = backend.learning_phase() - - self.assertEqual(backend.learning_phase(), - initial_learning_phase_outside_graph) - with backend.learning_phase_scope(1): - self.assertEqual(backend.learning_phase(), 1) - self.assertEqual(backend.learning_phase(), - initial_learning_phase_outside_graph) - - with backend.get_graph().as_default(): - self.assertIs(backend.learning_phase(), initial_learning_phase_in_graph) - - self.assertEqual(backend.learning_phase(), - initial_learning_phase_outside_graph) - - def test_int_shape(self): - x = backend.ones(shape=(3, 4)) - self.assertEqual(backend.int_shape(x), (3, 4)) - - if not tf.executing_eagerly(): - x = backend.placeholder(shape=(None, 4)) - self.assertEqual(backend.int_shape(x), (None, 4)) - - def test_in_train_phase(self): - y1 = backend.variable(1) - y2 = backend.variable(2) - if tf.executing_eagerly(): - with backend.learning_phase_scope(0): - y_val_test = backend.in_train_phase(y1, y2).numpy() - with backend.learning_phase_scope(1): - y_val_train = backend.in_train_phase(y1, y2).numpy() - else: - y = backend.in_train_phase(y1, y2) - f = backend.function([backend.learning_phase()], [y]) - y_val_test = f([0])[0] - y_val_train = f([1])[0] - self.assertAllClose(y_val_test, 2) - self.assertAllClose(y_val_train, 1) - - def test_is_keras_tensor(self): - x = backend.variable(1) - self.assertEqual(backend.is_keras_tensor(x), False) - x = input_layer.Input(shape=(1,)) - 
self.assertEqual(backend.is_keras_tensor(x), True) - x = input_layer.Input(shape=(None,), ragged=True) - self.assertEqual(backend.is_keras_tensor(x), True) - x = input_layer.Input(shape=(None, None), sparse=True) - self.assertEqual(backend.is_keras_tensor(x), True) - with self.assertRaises(ValueError): - backend.is_keras_tensor(0) - - def test_stop_gradient(self): - x = backend.variable(1) - y = backend.stop_gradient(x) - if not tf.executing_eagerly(): - self.assertEqual(y.op.name[:12], 'StopGradient') - - xs = [backend.variable(1) for _ in range(3)] - ys = backend.stop_gradient(xs) - if not tf.executing_eagerly(): - for y in ys: - self.assertEqual(y.op.name[:12], 'StopGradient') - - def test_placeholder(self): - x = backend.placeholder(shape=(3, 4)) - self.assertEqual(x.shape.as_list(), [3, 4]) - x = backend.placeholder(shape=(3, 4), sparse=True) - self.assertEqual(x.shape.as_list(), [3, 4]) - - def test_is_placeholder(self): - x = backend.placeholder(shape=(1,)) - self.assertEqual(backend.is_placeholder(x), True) - x = backend.variable(1) - self.assertEqual(backend.is_placeholder(x), False) - - def test_print_tensor(self): - # Unfortunately it seems impossible to use `mock` (or any other method) - # to capture stdout when used inside a graph or graph function, thus - # we cannot test correctness. - # The message gets correctly printed in practice. - x = backend.placeholder(shape=()) - y = backend.print_tensor(x, 'eager=%s' % tf.executing_eagerly()) - f = backend.function(x, y) - f(0) - - def test_cast_to_floatx(self): - x = backend.variable(1, dtype='float64') - x = backend.cast_to_floatx(x) - self.assertEqual(x.dtype.name, 'float32') - x = backend.cast_to_floatx(2) - self.assertEqual(x.dtype.name, 'float32') - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class BackendVariableTest(tf.test.TestCase): + self.assertTrue("Relu6" in relu_op.name) # uses tf.nn.relu6 + self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 6]]) - def test_zeros(self): - x = backend.zeros((3, 4)) - val = backend.eval(x) - self.assertAllClose(val, np.zeros((3, 4))) - - def test_ones(self): - x = backend.ones((3, 4)) - val = backend.eval(x) - self.assertAllClose(val, np.ones((3, 4))) - - def test_eye(self): - x = backend.eye(4) - val = backend.eval(x) - self.assertAllClose(val, np.eye(4)) - - def test_zeros_like(self): - x = backend.zeros((3, 4)) - y = backend.zeros_like(x) - val = backend.eval(y) - self.assertAllClose(val, np.zeros((3, 4))) - - def test_ones_like(self): - x = backend.zeros((3, 4)) - y = backend.ones_like(x) - val = backend.eval(y) - self.assertAllClose(val, np.ones((3, 4))) - - def test_random_uniform_variable(self): - x = backend.random_uniform_variable((30, 20), low=1., high=2., seed=0) - val = backend.eval(x) - self.assertAllClose(val.mean(), 1.5, atol=1e-1) - self.assertAllClose(val.max(), 2., atol=1e-1) - self.assertAllClose(val.min(), 1., atol=1e-1) - - def test_random_normal_variable(self): - x = backend.random_normal_variable((30, 20), 1., 0.5, seed=0) - val = backend.eval(x) - self.assertAllClose(val.mean(), 1., atol=1e-1) - self.assertAllClose(val.std(), 0.5, atol=1e-1) - - def test_count_params(self): - x = backend.zeros((4, 5)) - val = backend.count_params(x) - self.assertAllClose(val, 20) - - def test_constant(self): - ref_val = np.random.random((3, 4)).astype('float32') - x = backend.constant(ref_val) - val = backend.eval(x) - self.assertAllClose(val, ref_val) - - def test_sparse_variable(self): - val = scipy.sparse.eye(10) - x = 
backend.variable(val) - self.assertTrue(isinstance(x, tf.SparseTensor)) - - y = backend.to_dense(x) - self.assertFalse(backend.is_sparse(y)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class BackendLinearAlgebraTest(tf.test.TestCase, parameterized.TestCase): + # max value > 6 + relu_op = backend.relu(x, max_value=10.0) + self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 7]]) - def test_dot(self): - x = backend.ones(shape=(2, 3)) - y = backend.ones(shape=(3, 4)) - xy = backend.dot(x, y) - self.assertEqual(xy.shape.as_list(), [2, 4]) - - x = backend.ones(shape=(32, 28, 3)) - y = backend.ones(shape=(3, 4)) - xy = backend.dot(x, y) - self.assertEqual(xy.shape.as_list(), [32, 28, 4]) - - @parameterized.parameters( - [(2, 3, 4, 5), (2, 5, 6, 7), (2, 3, 4, 6, 7), (3, 1)], - [(2, 20, 1), (2, 30, 20), (2, 1, 30), (1, 2)], - [(4, 2, 3), (4, 5, 3), (4, 2, 5), (2, 2)], - [(4, 2), (4, 2, 3), (4, 3), (1, 1)], - [(4, 2), (4, 2, 3), (4, 3), 1], - [(4, 2, 3), (4, 3), (4, 2), (2, 1)], - ) - def test_batch_dot(self, x_shape, y_shape, output_shape, axes): - x_val = np.random.random(x_shape) - y_val = np.random.random(y_shape) - x = backend.variable(x_val) - y = backend.variable(y_val) - xy = backend.batch_dot(x, y, axes=axes) - self.assertEqual(tuple(xy.shape.as_list()), output_shape) - xy_val = backend.eval(xy) - ref_val = self._reference_batch_dot(x_val, y_val, axes) - self.assertAllClose(xy_val, ref_val, atol=1e-5) - - def _reference_batch_dot(self, x, y, axes): - if isinstance(axes, int): - axes = [axes, axes] - elif isinstance(axes, tuple): - axes = list(axes) - if axes is None: - if y.ndim == 2: - axes = [x.ndim - 1, y.ndim - 1] - else: - axes = [x.ndim - 1, y.ndim - 2] - if axes[0] < 0: - axes[0] += x.ndim - if axes[1] < 0: - axes[1] += y.ndim - result = [] - axes = [axes[0] - 1, axes[1] - 1] - for xi, yi in zip(x, y): - result.append(np.tensordot(xi, yi, axes)) - result = np.array(result) - if result.ndim == 1: - result = np.expand_dims(result, -1) - return result - - def test_reduction_ops(self): - ops_to_test = [ - (backend.max, np.max), - (backend.min, np.min), - (backend.sum, np.sum), - (backend.prod, np.prod), - (backend.var, np.var), - (backend.std, np.std), - (backend.mean, np.mean), - (backend.argmin, np.argmin), - (backend.argmax, np.argmax), - ] - for keras_op, np_op in ops_to_test: - compare_single_input_op_to_numpy( - keras_op, - np_op, - input_shape=(4, 7, 5), - keras_kwargs={'axis': 1}, - np_kwargs={'axis': 1}) - compare_single_input_op_to_numpy( - keras_op, - np_op, - input_shape=(4, 7, 5), - keras_kwargs={'axis': -1}, - np_kwargs={'axis': -1}) - if 'keepdims' in tf_inspect.getargspec(keras_op).args: - compare_single_input_op_to_numpy( - keras_op, - np_op, - input_shape=(4, 7, 5), - keras_kwargs={ - 'axis': 1, - 'keepdims': True - }, - np_kwargs={ - 'axis': 1, - 'keepdims': True - }) - - def test_elementwise_ops(self): - ops_to_test = [ - (backend.square, np.square), - (backend.abs, np.abs), - (backend.round, np.round), - (backend.sign, np.sign), - (backend.sin, np.sin), - (backend.cos, np.cos), - (backend.exp, np.exp), - ] - for keras_op, np_op in ops_to_test: - compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7)) - - ops_to_test = [ - (backend.sqrt, np.sqrt), - (backend.log, np.log), - ] - for keras_op, np_op in ops_to_test: - compare_single_input_op_to_numpy( - keras_op, np_op, input_shape=(4, 7), negative_values=False) - - compare_single_input_op_to_numpy( - backend.clip, - np.clip, - input_shape=(6, 4), - 
keras_kwargs={ - 'min_value': 0.1, - 'max_value': 2.4 - }, - np_kwargs={ - 'a_min': 0.1, - 'a_max': 1.4 - }) - - compare_single_input_op_to_numpy( - backend.pow, np.power, input_shape=(6, 4), keras_args=[3], np_args=[3]) - - def test_two_tensor_ops(self): - ops_to_test = [ - (backend.equal, np.equal), - (backend.not_equal, np.not_equal), - (backend.greater, np.greater), - (backend.greater_equal, np.greater_equal), - (backend.less, np.less), - (backend.less_equal, np.less_equal), - (backend.maximum, np.maximum), - (backend.minimum, np.minimum), - ] - for keras_op, np_op in ops_to_test: - compare_two_inputs_op_to_numpy( - keras_op, np_op, input_shape_a=(4, 7), input_shape_b=(4, 7)) - - def test_relu(self): - x = tf.convert_to_tensor([[-4, 0], [2, 7]], 'float32') - - # standard relu - relu_op = backend.relu(x) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 7]]) - - # alpha (leaky relu used) - relu_op = backend.relu(x, alpha=0.5) - if not tf.executing_eagerly(): - self.assertTrue('LeakyRelu' in relu_op.name) - self.assertAllClose(backend.eval(relu_op), [[-2, 0], [2, 7]]) - - # max_value < some elements - relu_op = backend.relu(x, max_value=5.) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 5]]) - - # nn.relu6 used - relu_op = backend.relu(x, max_value=6.) - if not tf.executing_eagerly(): - self.assertTrue('Relu6' in relu_op.name) # uses tf.nn.relu6 - self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 6]]) - - # max value > 6 - relu_op = backend.relu(x, max_value=10.) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 7]]) - - # max value is float - relu_op = backend.relu(x, max_value=4.3) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 4.3]]) - - # max value == 0 - relu_op = backend.relu(x, max_value=0.) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [0, 0]]) - - # alpha and max_value - relu_op = backend.relu(x, alpha=0.25, max_value=3.) - self.assertAllClose(backend.eval(relu_op), [[-1, 0], [2, 3]]) - - # threshold - relu_op = backend.relu(x, threshold=3) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [0, 7]]) - - # threshold is float - relu_op = backend.relu(x, threshold=1.5) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 7]]) - - # threshold is negative - relu_op = backend.relu(x, threshold=-5) - self.assertAllClose(backend.eval(relu_op), [[-4, 0], [2, 7]]) - - # threshold and max_value - relu_op = backend.relu(x, threshold=3, max_value=5.) - self.assertAllClose(backend.eval(relu_op), [[0, 0], [0, 5]]) - - # threshold and alpha - relu_op = backend.relu(x, alpha=0.25, threshold=4.) - self.assertAllClose(backend.eval(relu_op), [[-2, -1], [-0.5, 7]]) - - # threshold, alpha, and max_value - relu_op = backend.relu(x, alpha=0.25, threshold=4., max_value=5.) 
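Worked by hand, this combined case follows from the same piecewise form sketched earlier: entries below the threshold scale as alpha * (x - threshold), entries at or above it are clipped at max_value:

# x = [[-4, 0], [2, 7]], alpha = 0.25, threshold = 4, max_value = 5
# -4 -> 0.25 * (-4 - 4) = -2.0
#  0 -> 0.25 * ( 0 - 4) = -1.0
#  2 -> 0.25 * ( 2 - 4) = -0.5
#  7 -> min(7, 5)       =  5.0
# which is the [[-2, -1], [-0.5, 5]] matrix asserted next.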
- self.assertAllClose(backend.eval(relu_op), [[-2, -1], [-0.5, 5]]) - - # Test case for GitHub issue 35430, with integer dtype - x = input_layer.Input(shape=(), name='x', dtype='int64') - _ = activation.ReLU(max_value=100., dtype='int64')(x) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class BackendShapeOpsTest(tf.test.TestCase): + # max value is float + relu_op = backend.relu(x, max_value=4.3) + self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 4.3]]) - def test_reshape(self): - compare_single_input_op_to_numpy( - backend.reshape, - np.reshape, - input_shape=(4, 7), - keras_args=[(2, 14)], - np_args=[(2, 14)]) - - def test_concatenate(self): - a = backend.variable(np.ones((1, 2, 3))) - b = backend.variable(np.ones((1, 2, 2))) - y = backend.concatenate([a, b], axis=-1) - self.assertEqual(y.shape.as_list(), [1, 2, 5]) - - def test_permute_dimensions(self): - compare_single_input_op_to_numpy( - backend.permute_dimensions, - np.transpose, - input_shape=(4, 7), - keras_args=[(1, 0)], - np_args=[(1, 0)]) - - def test_resize_images(self): - height_factor = 2 - width_factor = 2 - data_format = 'channels_last' - x = backend.variable(np.ones((1, 2, 2, 3))) - y = backend.resize_images(x, height_factor, width_factor, data_format) - self.assertEqual(y.shape.as_list(), [1, 4, 4, 3]) - - data_format = 'channels_first' - x = backend.variable(np.ones((1, 3, 2, 2))) - y = backend.resize_images(x, height_factor, width_factor, data_format) - self.assertEqual(y.shape.as_list(), [1, 3, 4, 4]) - - # Use with a dynamic axis: - if not tf.executing_eagerly(): - x = backend.placeholder(shape=(1, 3, None, None)) - y = backend.resize_images(x, height_factor, width_factor, data_format) - self.assertEqual(y.shape.as_list(), [1, 3, None, None]) - - # Invalid use: - with self.assertRaises(ValueError): - backend.resize_images( - x, height_factor, width_factor, data_format='unknown') - - def test_resize_volumes(self): - height_factor = 2 - width_factor = 2 - depth_factor = 2 - data_format = 'channels_last' - x = backend.variable(np.ones((1, 2, 2, 2, 3))) - y = backend.resize_volumes(x, depth_factor, height_factor, width_factor, - data_format) - self.assertEqual(y.shape.as_list(), [1, 4, 4, 4, 3]) - - data_format = 'channels_first' - x = backend.variable(np.ones((1, 3, 2, 2, 2))) - y = backend.resize_volumes(x, depth_factor, height_factor, width_factor, - data_format) - self.assertEqual(y.shape.as_list(), [1, 3, 4, 4, 4]) - - # Invalid use: - with self.assertRaises(ValueError): - backend.resize_volumes( - x, depth_factor, height_factor, width_factor, data_format='unknown') - - def test_repeat_elements(self): - x = backend.variable(np.ones((1, 3, 2))) - y = backend.repeat_elements(x, 3, axis=1) - self.assertEqual(y.shape.as_list(), [1, 9, 2]) - - # Use with a dynamic axis: - if not tf.executing_eagerly(): - x = backend.placeholder(shape=(2, None, 2)) - y = backend.repeat_elements(x, 3, axis=1) - self.assertEqual(y.shape.as_list(), [2, None, 2]) - - def test_repeat(self): - x = backend.variable(np.ones((1, 3))) - y = backend.repeat(x, 2) - self.assertEqual(y.shape.as_list(), [1, 2, 3]) - - def test_flatten(self): - compare_single_input_op_to_numpy( - backend.flatten, - np.reshape, - input_shape=(4, 7, 6), - np_args=[(4 * 7 * 6,)]) - - def test_batch_flatten(self): - compare_single_input_op_to_numpy( - backend.batch_flatten, - np.reshape, - input_shape=(4, 7, 6), - np_args=[(4, 7 * 6)]) - - def test_temporal_padding(self): - - def ref_op(x, padding): - shape = list(x.shape) - 
shape[1] += padding[0] + padding[1] - y = np.zeros(tuple(shape)) - y[:, padding[0]:-padding[1], :] = x - return y - - compare_single_input_op_to_numpy( - backend.temporal_padding, - ref_op, - input_shape=(4, 7, 6), - keras_args=[(2, 3)], - np_args=[(2, 3)]) - - def test_spatial_2d_padding(self): - - def ref_op(x, padding, data_format='channels_last'): - shape = list(x.shape) - if data_format == 'channels_last': - shape[1] += padding[0][0] + padding[0][1] - shape[2] += padding[1][0] + padding[1][1] - y = np.zeros(tuple(shape)) - y[:, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1], :] = x - else: - shape[2] += padding[0][0] + padding[0][1] - shape[3] += padding[1][0] + padding[1][1] - y = np.zeros(tuple(shape)) - y[:, :, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1]] = x - return y - - compare_single_input_op_to_numpy( - backend.spatial_2d_padding, - ref_op, - input_shape=(2, 3, 2, 3), - keras_args=[((2, 3), (1, 2))], - keras_kwargs={'data_format': 'channels_last'}, - np_args=[((2, 3), (1, 2))], - np_kwargs={'data_format': 'channels_last'}) - compare_single_input_op_to_numpy( - backend.spatial_2d_padding, - ref_op, - input_shape=(2, 3, 2, 3), - keras_args=[((2, 3), (1, 2))], - keras_kwargs={'data_format': 'channels_first'}, - np_args=[((2, 3), (1, 2))], - np_kwargs={'data_format': 'channels_first'}) - - def test_spatial_3d_padding(self): - - def ref_op(x, padding, data_format='channels_last'): - shape = list(x.shape) - if data_format == 'channels_last': - shape[1] += padding[0][0] + padding[0][1] - shape[2] += padding[1][0] + padding[1][1] - shape[3] += padding[2][0] + padding[2][1] - y = np.zeros(tuple(shape)) - y[:, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1], - padding[2][0]:-padding[2][1], :] = x - else: - shape[2] += padding[0][0] + padding[0][1] - shape[3] += padding[1][0] + padding[1][1] - shape[4] += padding[2][0] + padding[2][1] - y = np.zeros(tuple(shape)) - y[:, :, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1], - padding[2][0]:-padding[2][1]] = x - return y - - compare_single_input_op_to_numpy( - backend.spatial_3d_padding, - ref_op, - input_shape=(2, 3, 2, 3, 2), - keras_args=[((2, 3), (1, 2), (2, 3))], - keras_kwargs={'data_format': 'channels_last'}, - np_args=[((2, 3), (1, 2), (2, 3))], - np_kwargs={'data_format': 'channels_last'}) - compare_single_input_op_to_numpy( - backend.spatial_3d_padding, - ref_op, - input_shape=(2, 3, 2, 3, 2), - keras_args=[((2, 3), (1, 2), (2, 3))], - keras_kwargs={'data_format': 'channels_first'}, - np_args=[((2, 3), (1, 2), (2, 3))], - np_kwargs={'data_format': 'channels_first'}) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class BackendNNOpsTest(tf.test.TestCase, parameterized.TestCase): + # max value == 0 + relu_op = backend.relu(x, max_value=0.0) + self.assertAllClose(backend.eval(relu_op), [[0, 0], [0, 0]]) - def test_bias_add(self): - keras_op = backend.bias_add - np_op = np.add - compare_two_inputs_op_to_numpy( - keras_op, np_op, input_shape_a=(4, 7), input_shape_b=(7,)) - compare_two_inputs_op_to_numpy( - keras_op, np_op, input_shape_a=(4, 3, 7), input_shape_b=(7,)) - compare_two_inputs_op_to_numpy( - keras_op, np_op, input_shape_a=(4, 3, 5, 7), input_shape_b=(7,)) - compare_two_inputs_op_to_numpy( - keras_op, np_op, input_shape_a=(4, 3, 5, 2, 7), input_shape_b=(7,)) - - with self.assertRaises((ValueError, tf.errors.InvalidArgumentError)): - x = backend.variable((3, 4)) - b = backend.variable((3, 4)) - backend.bias_add(x, b) - with 
self.assertRaises(ValueError): - x = backend.variable((3, 4)) - b = backend.variable((4,)) - backend.bias_add(x, b, data_format='unknown') - - def test_bias_add_channels_first(self): - - def keras_op(x, b): - return backend.bias_add(x, b, data_format='channels_first') - - def np_op(x, b): - if x.ndim == 3: - b = b.reshape((1, b.shape[0], 1)) - if x.ndim == 4: - b = b.reshape((1, b.shape[0], 1, 1)) - return x + b - - compare_two_inputs_op_to_numpy( - keras_op, np_op, input_shape_a=(4, 3, 7), input_shape_b=(3,)) - compare_two_inputs_op_to_numpy( - keras_op, np_op, input_shape_a=(4, 3, 5, 7), input_shape_b=(3,)) - - def test_pool2d(self): - val = np.random.random((10, 3, 10, 10)) - x = backend.variable(val) - y = backend.pool2d( - x, (2, 2), - strides=(1, 1), - padding='valid', - data_format='channels_first', - pool_mode='max') - self.assertEqual(y.shape.as_list(), [10, 3, 9, 9]) - - y = backend.pool2d( - x, (2, 2), - strides=(1, 1), - padding='valid', - data_format='channels_first', - pool_mode='avg') - self.assertEqual(y.shape.as_list(), [10, 3, 9, 9]) - - val = np.random.random((10, 10, 10, 3)) - x = backend.variable(val) - y = backend.pool2d( - x, (2, 2), strides=(1, 1), padding='valid', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 9, 9, 3]) - - val = np.random.random((10, 10, 10, 3)) - x = backend.variable(val) - y = backend.pool2d( - x, (2, 2), strides=(1, 1), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 10, 10, 3]) - - val = np.random.random((10, 10, 10, 3)) - x = backend.variable(val) - y = backend.pool2d( - x, (2, 2), strides=(2, 2), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 5, 5, 3]) - - with self.assertRaises(ValueError): - y = backend.pool2d( - x, (2, 2), - strides=(2, 2), - padding='other', - data_format='channels_last') - with self.assertRaises(ValueError): - y = backend.pool2d(x, (2, 2), strides=(2, 2), data_format='other') - with self.assertRaises(ValueError): - y = backend.pool2d(x, (2, 2, 2), strides=(2, 2)) - with self.assertRaises(ValueError): - y = backend.pool2d(x, (2, 2), strides=(2, 2, 2)) - with self.assertRaises(ValueError): - y = backend.pool2d(x, (2, 2), strides=(2, 2), pool_mode='other') - - def test_pool3d(self): - val = np.random.random((10, 3, 10, 10, 10)) - x = backend.variable(val) - y = backend.pool3d( - x, (2, 2, 2), - strides=(1, 1, 1), - padding='valid', - data_format='channels_first', - pool_mode='max') - self.assertEqual(y.shape.as_list(), [10, 3, 9, 9, 9]) - - y = backend.pool3d( - x, (2, 2, 2), - strides=(1, 1, 1), - padding='valid', - data_format='channels_first', - pool_mode='avg') - self.assertEqual(y.shape.as_list(), [10, 3, 9, 9, 9]) - - val = np.random.random((10, 10, 10, 10, 3)) - x = backend.variable(val) - y = backend.pool3d( - x, (2, 2, 2), - strides=(1, 1, 1), - padding='valid', - data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 9, 9, 9, 3]) - - val = np.random.random((10, 10, 10, 10, 3)) - x = backend.variable(val) - y = backend.pool3d( - x, (2, 2, 2), - strides=(1, 1, 1), - padding='same', - data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 10, 10, 10, 3]) - - val = np.random.random((10, 10, 10, 10, 3)) - x = backend.variable(val) - y = backend.pool3d( - x, (2, 2, 2), - strides=(2, 2, 2), - padding='same', - data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 5, 5, 5, 3]) - - def test_conv1d(self): - val = np.random.random((10, 4, 10)) - x = 
backend.variable(val) - kernel_val = np.random.random((3, 4, 5)) - k = backend.variable(kernel_val) - y = backend.conv1d( - x, k, strides=(1,), padding='valid', data_format='channels_first') - self.assertEqual(y.shape.as_list(), [10, 5, 8]) - - val = np.random.random((10, 10, 4)) - x = backend.variable(val) - y = backend.conv1d( - x, k, strides=(1,), padding='valid', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 8, 5]) - - val = np.random.random((10, 10, 4)) - x = backend.variable(val) - y = backend.conv1d( - x, k, strides=(1,), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 10, 5]) - - val = np.random.random((10, 10, 4)) - x = backend.variable(val) - y = backend.conv1d( - x, k, strides=(2,), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 5, 5]) - - def test_local_conv_channels_dim(self): - filters = 3 - batch_size = 2 - - for input_shape in [(3, 5), (2, 3, 5), (2, 5, 3, 4)]: - channels_in = input_shape[0] - input_spatial_shape = input_shape[1:] - dim = len(input_spatial_shape) - - inputs = np.random.normal(0, 1, (batch_size,) + input_shape) - inputs_cf = backend.variable(inputs) - - for kernel_size in [1, 2]: - for stride in [1, 2]: - kernel_sizes = (kernel_size,) * dim - strides = (stride,) * dim - - output_shape = tuple([ - (i - kernel_size + stride) // stride for i in input_spatial_shape - ]) - - kernel_shape = (np.prod(output_shape), - np.prod(kernel_sizes) * channels_in, filters) - - kernel = np.random.normal( - 0, 1, - output_shape + (channels_in, np.prod(kernel_sizes), filters)) - - kernel_cf = np.reshape(kernel, kernel_shape) - kernel_cf = backend.variable(kernel_cf) - - conv_cf = backend.local_conv(inputs_cf, kernel_cf, kernel_sizes, - strides, output_shape, 'channels_first') - - inputs_cl = np.transpose(inputs, - [0, 2] + list(range(3, dim + 2)) + [1]) - inputs_cl = backend.variable(inputs_cl) - - kernel_cl = np.reshape( - np.transpose(kernel, - list(range(dim)) + [dim + 1, dim, dim + 2]), - kernel_shape) - kernel_cl = backend.variable(kernel_cl) - - conv_cl = backend.local_conv(inputs_cl, kernel_cl, kernel_sizes, - strides, output_shape, 'channels_last') - - conv_cf = backend.eval(conv_cf) - conv_cl = backend.eval(conv_cl) - - self.assertAllCloseAccordingToType( - conv_cf, - np.transpose(conv_cl, [0, dim + 1] + list(range(1, dim + 1))), - atol=1e-5) - - @parameterized.named_parameters( - ('local_conv1d', (5, 6), (3,), (1,), (3,)), - ('local_conv2d', (4, 5, 6), (3, 3), (1, 1), (2, 3))) - def test_local_conv_1d_and_2d(self, input_shape, kernel_sizes, strides, - output_shape): - filters = 3 - batch_size = 2 - - inputs = np.random.normal(0, 1, (batch_size,) + input_shape) - inputs = backend.variable(inputs) - - kernel = np.random.normal(0, 1, - (np.prod(output_shape), np.prod(kernel_sizes) * - input_shape[-1], filters)) - kernel = backend.variable(kernel) - - local_conv = backend.local_conv(inputs, kernel, kernel_sizes, strides, - output_shape, 'channels_last') - if len(output_shape) == 1: - local_conv_dim = backend.local_conv1d(inputs, kernel, kernel_sizes, - strides, 'channels_last') - else: - local_conv_dim = backend.local_conv2d(inputs, kernel, kernel_sizes, - strides, output_shape, - 'channels_last') - - local_conv = backend.eval(local_conv) - local_conv_dim = backend.eval(local_conv_dim) - - self.assertAllCloseAccordingToType(local_conv, local_conv_dim) - - def test_conv2d(self): - kernel_val = np.random.random((3, 3, 4, 5)) - k = backend.variable(kernel_val) - - # Test 
channels_first - val = np.random.random((10, 4, 10, 10)) - x = backend.variable(val) - y = backend.conv2d(x, k, padding='valid', data_format='channels_first') - self.assertEqual(y.shape.as_list(), [10, 5, 8, 8]) - - # Test channels_last - val = np.random.random((10, 10, 10, 4)) - x = backend.variable(val) - y = backend.conv2d( - x, k, strides=(1, 1), padding='valid', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 8, 8, 5]) - - # Test same padding - val = np.random.random((10, 10, 10, 4)) - x = backend.variable(val) - y = backend.conv2d(x, k, padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 10, 10, 5]) - - # Test dilation_rate - val = np.random.random((10, 10, 10, 4)) - x = backend.variable(val) - y = backend.conv2d( - x, k, dilation_rate=(2, 2), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 10, 10, 5]) - - # Test strides - val = np.random.random((10, 10, 10, 4)) - x = backend.variable(val) - y = backend.conv2d( - x, k, strides=(2, 2), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 5, 5, 5]) - - # Test invalid arguments - with self.assertRaises(ValueError): - y = backend.conv2d( - x, k, (2, 2), padding='other', data_format='channels_last') - with self.assertRaises(ValueError): - y = backend.conv2d(x, k, (2, 2), data_format='other') - with self.assertRaises(ValueError): - y = backend.conv2d(x, k, (2, 2, 2)) - - def test_conv2d_transpose(self): - input_size = (7, 8) - kernel_size = (3, 3) - input_depth = 6 - filters = 6 - batch_size = 2 - - kernel_val = np.random.random(kernel_size + (input_depth, filters)) - k = backend.variable(kernel_val) - - # Test channels_first - input_val = np.random.random((batch_size, input_depth) + input_size) - x = backend.variable(input_val) - y = backend.conv2d_transpose( - x, - k, (batch_size, filters) + input_size, - padding='same', - data_format='channels_first') - self.assertEqual( - tuple(y.shape.as_list()), (batch_size, filters) + input_size) - - # Test channels_last - input_val = np.random.random((batch_size,) + input_size + (input_depth,)) - x = backend.variable(input_val) - y = backend.conv2d_transpose( - x, - k, (batch_size,) + input_size + (filters,), - padding='same', - data_format='channels_last') - self.assertEqual( - tuple(y.shape.as_list()), (batch_size,) + input_size + (filters,)) - - # Test dilation_rate - y = backend.conv2d_transpose( - x, - k, (batch_size,) + input_size + (filters,), - padding='same', - data_format='channels_last', - dilation_rate=(2, 2)) - self.assertEqual( - tuple(y.shape.as_list()), (batch_size,) + input_size + (filters,)) - - # Test dilation_rate error - with self.assertRaisesRegex( - ValueError, - 'Expected the 2 dimensions'): - y = backend.conv2d_transpose( - x, - k, (batch_size,) + input_size + (filters,), - padding='same', - data_format='channels_last', - dilation_rate=(1, 2)) - - # Test batch size of None in output_shape - y = backend.conv2d_transpose( - x, - k, (None,) + input_size + (filters,), - padding='same', - data_format='channels_last') - self.assertEqual( - tuple(y.shape.as_list()), (batch_size,) + input_size + (filters,)) - - # Test invalid values - with self.assertRaises(ValueError): - y = backend.conv2d_transpose( - x, k, (2, 2, 8, 9), padding='other', data_format='channels_last') - with self.assertRaises(ValueError): - y = backend.conv2d_transpose(x, k, (2, 2, 8, 9), data_format='other') - - def test_separable_conv2d(self): - val = np.random.random((10, 4, 10, 
10)) - x = backend.variable(val) - depthwise_kernel_val = np.random.random((3, 3, 4, 1)) - pointwise_kernel_val = np.random.random((1, 1, 4, 5)) - dk = backend.variable(depthwise_kernel_val) - pk = backend.variable(pointwise_kernel_val) - y = backend.separable_conv2d( - x, dk, pk, padding='valid', data_format='channels_first') - self.assertEqual(y.shape.as_list(), [10, 5, 8, 8]) - - val = np.random.random((10, 10, 10, 4)) - x = backend.variable(val) - y = backend.separable_conv2d( - x, dk, pk, strides=(1, 1), padding='valid', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 8, 8, 5]) - - val = np.random.random((10, 10, 10, 4)) - x = backend.variable(val) - y = backend.separable_conv2d( - x, dk, pk, strides=(1, 1), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 10, 10, 5]) - - val = np.random.random((10, 10, 10, 4)) - x = backend.variable(val) - y = backend.separable_conv2d( - x, dk, pk, strides=(2, 2), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 5, 5, 5]) - with self.assertRaises(ValueError): - y = backend.separable_conv2d( - x, dk, pk, (2, 2), padding='other', data_format='channels_last') - with self.assertRaises(ValueError): - y = backend.separable_conv2d(x, dk, pk, (2, 2), data_format='other') - with self.assertRaises(ValueError): - y = backend.separable_conv2d(x, dk, pk, (2, 2, 2)) - - def test_conv3d(self): - val = np.random.random((10, 4, 10, 10, 10)) - x = backend.variable(val) - kernel_val = np.random.random((3, 3, 3, 4, 5)) - k = backend.variable(kernel_val) - y = backend.conv3d(x, k, padding='valid', data_format='channels_first') - self.assertEqual(y.shape.as_list(), [10, 5, 8, 8, 8]) - - val = np.random.random((10, 10, 10, 10, 4)) - x = backend.variable(val) - y = backend.conv3d( - x, k, strides=(1, 1, 1), padding='valid', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 8, 8, 8, 5]) - - val = np.random.random((10, 10, 10, 10, 4)) - x = backend.variable(val) - y = backend.conv3d( - x, k, strides=(1, 1, 1), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 10, 10, 10, 5]) - - val = np.random.random((10, 10, 10, 10, 4)) - x = backend.variable(val) - y = backend.conv3d( - x, k, strides=(2, 2, 2), padding='same', data_format='channels_last') - self.assertEqual(y.shape.as_list(), [10, 5, 5, 5, 5]) - with self.assertRaises(ValueError): - y = backend.conv3d( - x, k, (2, 2, 2), padding='other', data_format='channels_last') - with self.assertRaises(ValueError): - y = backend.conv3d(x, k, (2, 2, 2), data_format='other') - with self.assertRaises(ValueError): - y = backend.conv3d(x, k, (2, 2)) - - def test_rnn(self): - # implement a simple RNN - num_samples = 4 - input_dim = 5 - output_dim = 3 - timesteps = 6 - - input_val = np.random.random( - (num_samples, timesteps, input_dim)).astype(np.float32) - init_state_val = np.random.random( - (num_samples, output_dim)).astype(np.float32) - w_i_val = np.random.random((input_dim, output_dim)).astype(np.float32) - w_o_val = np.random.random((output_dim, output_dim)).astype(np.float32) - np_mask = np.random.randint(2, size=(num_samples, timesteps)) - - def rnn_step_fn(): - w_i = backend.variable(w_i_val) - w_o = backend.variable(w_o_val) - - def step_function(x, states): - assert len(states) == 1 - prev_output = states[0] - output = backend.dot(x, w_i) + backend.dot(prev_output, w_o) - return output, [output] - - return step_function - - # test default setup - last_output_list = [[], 
[], [], [], [], []] - outputs_list = [[], [], [], [], [], []] - state_list = [[], [], [], [], [], []] - - rnn_fn = rnn_step_fn() - inputs = backend.variable(input_val) - initial_states = [backend.variable(init_state_val)] - mask = backend.variable(np_mask) - - kwargs_list = [ - { - 'go_backwards': False, - 'mask': None - }, - { - 'go_backwards': False, - 'mask': None, - 'unroll': True - }, - { - 'go_backwards': True, - 'mask': None - }, - { - 'go_backwards': True, - 'mask': None, - 'unroll': True - }, - { - 'go_backwards': False, - 'mask': mask - }, - { - 'go_backwards': False, - 'mask': mask, - 'unroll': True - }, - ] - for i, kwargs in enumerate(kwargs_list): - last_output, outputs, new_states = backend.rnn(rnn_fn, inputs, - initial_states, **kwargs) - # check static shape inference - self.assertEqual(last_output.shape.as_list(), [num_samples, output_dim]) - self.assertEqual(outputs.shape.as_list(), - [num_samples, timesteps, output_dim]) - for state in new_states: - self.assertEqual(state.shape.as_list(), [num_samples, output_dim]) - - last_output_list[i].append(backend.eval(last_output)) - outputs_list[i].append(backend.eval(outputs)) - self.assertLen(new_states, 1) - state_list[i].append(backend.eval(new_states[0])) - - def assert_list_pairwise(z_list, atol=1e-05): - for (z1, z2) in zip(z_list[1:], z_list[:-1]): - self.assertAllClose(z1, z2, atol=atol) - - assert_list_pairwise(last_output_list[0], atol=1e-04) - assert_list_pairwise(outputs_list[0], atol=1e-04) - assert_list_pairwise(state_list[0], atol=1e-04) - assert_list_pairwise(last_output_list[2], atol=1e-04) - assert_list_pairwise(outputs_list[2], atol=1e-04) - assert_list_pairwise(state_list[2], atol=1e-04) - - for l, u_l in zip(last_output_list[0], last_output_list[1]): - self.assertAllClose(l, u_l, atol=1e-04) - - for o, u_o in zip(outputs_list[0], outputs_list[1]): - self.assertAllClose(o, u_o, atol=1e-04) - - for s, u_s in zip(state_list[0], state_list[1]): - self.assertAllClose(s, u_s, atol=1e-04) - - for b_l, b_u_l in zip(last_output_list[2], last_output_list[3]): - self.assertAllClose(b_l, b_u_l, atol=1e-04) - - for b_o, b_u_o in zip(outputs_list[2], outputs_list[3]): - self.assertAllClose(b_o, b_u_o, atol=1e-04) - - for b_s, b_u_s in zip(state_list[2], state_list[3]): - self.assertAllClose(b_s, b_u_s, atol=1e-04) - - def test_rnn_additional_states(self): - # implement a simple RNN - num_samples = 4 - input_dim = 5 - output_dim = 3 - timesteps = 6 - - input_val = np.random.random( - (num_samples, timesteps, input_dim)).astype(np.float32) - init_state_val = np.random.random( - (num_samples, output_dim)).astype(np.float32) - w_i_val = np.random.random((input_dim, output_dim)).astype(np.float32) - w_o_val = np.random.random((output_dim, output_dim)).astype(np.float32) - np_mask = np.random.randint(2, size=(num_samples, timesteps)) - - def rnn_step_fn(): - w_i = backend.variable(w_i_val) - w_o = backend.variable(w_o_val) - - def step_function(x, states): - assert len(states) == 2 - prev_output = states[0] - output = backend.dot(x, w_i) + backend.dot(prev_output, w_o) - return output, [output, backend.concatenate([output, output], axis=-1)] - - return step_function - - # test default setup - last_output_list = [[], [], [], [], [], []] - outputs_list = [[], [], [], [], [], []] - state_list = [[], [], [], [], [], []] - additional_state_list = [[], [], [], [], [], []] - - rnn_fn = rnn_step_fn() - inputs = backend.variable(input_val) - initial_states = [ - backend.variable(init_state_val), - tf.convert_to_tensor( - 
np.concatenate([init_state_val, init_state_val], axis=-1)) - ] - mask = backend.variable(np_mask) - - kwargs_list = [ - { - 'go_backwards': False, - 'mask': None - }, - { - 'go_backwards': False, - 'mask': None, - 'unroll': True - }, - { - 'go_backwards': True, - 'mask': None - }, - { - 'go_backwards': True, - 'mask': None, - 'unroll': True - }, - { - 'go_backwards': False, - 'mask': mask - }, - { - 'go_backwards': False, - 'mask': mask, - 'unroll': True - }, - ] - for i, kwargs in enumerate(kwargs_list): - last_output, outputs, new_states = backend.rnn(rnn_fn, inputs, - initial_states, **kwargs) - # check static shape inference - self.assertEqual(last_output.shape.as_list(), [num_samples, output_dim]) - self.assertEqual(outputs.shape.as_list(), - [num_samples, timesteps, output_dim]) - # for state in new_states: - # self.assertEqual(state.shape.as_list(), - # [num_samples, output_dim]) - self.assertEqual(new_states[0].shape.as_list(), [num_samples, output_dim]) - self.assertEqual(new_states[1].shape.as_list(), - [num_samples, 2 * output_dim]) - - last_output_list[i].append(backend.eval(last_output)) - outputs_list[i].append(backend.eval(outputs)) - self.assertLen(new_states, 2) - state_list[i].append(backend.eval(new_states[0])) - additional_state_list[i].append(backend.eval(new_states[1])) - - def assert_list_pairwise(z_list, atol=1e-05): - for (z1, z2) in zip(z_list[1:], z_list[:-1]): - self.assertAllClose(z1, z2, atol=atol) - - assert_list_pairwise(last_output_list[0], atol=1e-04) - assert_list_pairwise(outputs_list[0], atol=1e-04) - assert_list_pairwise(state_list[0], atol=1e-04) - assert_list_pairwise(additional_state_list[0], atol=1e-04) - assert_list_pairwise(last_output_list[2], atol=1e-04) - assert_list_pairwise(outputs_list[2], atol=1e-04) - assert_list_pairwise(state_list[2], atol=1e-04) - assert_list_pairwise(additional_state_list[2], atol=1e-04) - - for l, u_l in zip(last_output_list[0], last_output_list[1]): - self.assertAllClose(l, u_l, atol=1e-04) - - for o, u_o in zip(outputs_list[0], outputs_list[1]): - self.assertAllClose(o, u_o, atol=1e-04) - - for s, u_s in zip(state_list[0], state_list[1]): - self.assertAllClose(s, u_s, atol=1e-04) - - for s, u_s in zip(additional_state_list[0], additional_state_list[1]): - self.assertAllClose(s, u_s, atol=1e-04) - - for b_l, b_u_l in zip(last_output_list[2], last_output_list[3]): - self.assertAllClose(b_l, b_u_l, atol=1e-04) - - for b_o, b_u_o in zip(outputs_list[2], outputs_list[3]): - self.assertAllClose(b_o, b_u_o, atol=1e-04) - - for b_s, b_u_s in zip(state_list[2], state_list[3]): - self.assertAllClose(b_s, b_u_s, atol=1e-04) - - for s, u_s in zip(additional_state_list[2], additional_state_list[3]): - self.assertAllClose(s, u_s, atol=1e-04) - - def test_rnn_output_and_state_masking_independent(self): - num_samples = 2 - num_timesteps = 4 - state_and_io_size = 2 - mask_last_num_timesteps = 2 # for second sample only - - # a step function that just outputs inputs, - # but increments states +1 per timestep - def step_function(inputs, states): - return inputs, [s + 1 for s in states] - - inputs_vals = np.random.random( - (num_samples, num_timesteps, state_and_io_size)) - initial_state_vals = np.random.random((num_samples, state_and_io_size)) - # masking of two last timesteps for second sample only - mask_vals = np.ones((num_samples, num_timesteps)) - mask_vals[1, -mask_last_num_timesteps:] = 0 - - # outputs expected to be same as inputs for the first sample - expected_outputs = inputs_vals.copy() - # but for the second sample all 
outputs in masked region should be the same - # as last output before masked region - expected_outputs[1, -mask_last_num_timesteps:] = \ - expected_outputs[1, -(mask_last_num_timesteps + 1)] - - expected_last_state = initial_state_vals.copy() - # first state should be incremented for every timestep (no masking) - expected_last_state[0] += num_timesteps - # second state should not be incremented for last two timesteps - expected_last_state[1] += (num_timesteps - mask_last_num_timesteps) - - # verify same expected output for `unroll=true/false` - inputs = backend.variable(inputs_vals) - initial_states = [backend.variable(initial_state_vals)] - mask = backend.variable(mask_vals) - for unroll in [True, False]: - _, outputs, last_states = backend.rnn( - step_function, - inputs, - initial_states, - mask=mask, - unroll=unroll, - input_length=num_timesteps if unroll else None) - - self.assertAllClose(backend.eval(outputs), expected_outputs) - self.assertAllClose(backend.eval(last_states[0]), expected_last_state) - - def test_rnn_output_num_dim_larger_than_2_masking(self): - num_samples = 3 - num_timesteps = 4 - num_features = 5 - - def step_function(inputs, states): - outputs = backend.tile(backend.expand_dims(inputs), [1, 1, 2]) - return outputs, [backend.identity(s) for s in states] - # Note: cannot just return states (which can be a problem) -> - # tensorflow/python/ops/resource_variable_ops.py", line 824, in set_shape - # NotImplementedError: ResourceVariable does not implement set_shape() - - inputs_vals = np.random.random((num_samples, num_timesteps, num_features)) - initial_state_vals = np.random.random((num_samples, 6)) - mask_vals = np.ones((num_samples, num_timesteps)) - mask_vals[-1, -1] = 0 # final timestep masked for last sample - - expected_outputs = np.repeat(inputs_vals[..., None], repeats=2, axis=-1) - # for the last sample, the final timestep (in masked region) should be the - # same as the second to final output (before masked region) - expected_outputs[-1, -1] = expected_outputs[-1, -2] - - inputs = backend.variable(inputs_vals) - initial_states = [backend.variable(initial_state_vals)] - mask = backend.variable(mask_vals) - for unroll in [True, False]: - _, outputs, _ = backend.rnn( - step_function, - inputs, - initial_states, - mask=mask, - unroll=unroll, - input_length=num_timesteps if unroll else None) - - self.assertAllClose(backend.eval(outputs), expected_outputs) - - def test_rnn_state_num_dim_larger_than_2_masking(self): - num_samples = 3 - num_timesteps = 4 - - def step_function(inputs, states): - return inputs, [s + 1 for s in states] - - inputs_vals = np.random.random((num_samples, num_timesteps, 5)) - initial_state_vals = np.random.random((num_samples, 6, 7)) - mask_vals = np.ones((num_samples, num_timesteps)) - mask_vals[0, -2:] = 0 # final two timesteps masked for first sample - - expected_last_state = initial_state_vals.copy() - expected_last_state[0] += (num_timesteps - 2) - expected_last_state[1:] += num_timesteps - - inputs = backend.variable(inputs_vals) - initial_states = [backend.variable(initial_state_vals)] - mask = backend.variable(mask_vals) - for unroll in [True, False]: - _, _, last_states = backend.rnn( - step_function, - inputs, - initial_states, - mask=mask, - unroll=unroll, - input_length=num_timesteps if unroll else None) - - self.assertAllClose(backend.eval(last_states[0]), expected_last_state) - - def test_batch_normalization(self): - g_val = np.random.random((3,)) - b_val = np.random.random((3,)) - gamma = backend.variable(g_val) - beta = 
backend.variable(b_val) - - # 3D NHC case - val = np.random.random((10, 5, 3)) - x = backend.variable(val) - mean, var = tf.nn.moments(x, (0, 1), None, None, False) - normed = backend.batch_normalization( - x, mean, var, beta, gamma, axis=-1, epsilon=1e-3) - self.assertEqual(normed.shape.as_list(), [10, 5, 3]) - - # 4D NHWC case - val = np.random.random((10, 5, 5, 3)) - x = backend.variable(val) - mean, var = tf.nn.moments(x, (0, 1, 2), None, None, False) - normed = backend.batch_normalization( - x, mean, var, beta, gamma, axis=-1, epsilon=1e-3) - self.assertEqual(normed.shape.as_list(), [10, 5, 5, 3]) - - # 4D NCHW case - if not tf.executing_eagerly(): - # Eager CPU kernel for NCHW does not exist. - val = np.random.random((10, 3, 5, 5)) - x = backend.variable(val) - mean, var = tf.nn.moments(x, (0, 2, 3), None, None, False) - normed = backend.batch_normalization( - x, mean, var, beta, gamma, axis=1, epsilon=1e-3) - self.assertEqual(normed.shape.as_list(), [10, 3, 5, 5]) - - def test_normalize_batch_in_training(self): - val = np.random.random((10, 3, 10, 10)) - x = backend.variable(val) - reduction_axes = (0, 2, 3) - - g_val = np.random.random((3,)) - b_val = np.random.random((3,)) - gamma = backend.variable(g_val) - beta = backend.variable(b_val) - normed, mean, var = backend.normalize_batch_in_training( - x, gamma, beta, reduction_axes, epsilon=1e-3) - self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10]) - self.assertEqual(mean.shape.as_list(), [ - 3, - ]) - self.assertEqual(var.shape.as_list(), [ - 3, - ]) - - # case: gamma=None - gamma = None - normed, mean, var = backend.normalize_batch_in_training( - x, gamma, beta, reduction_axes, epsilon=1e-3) - self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10]) - self.assertEqual(mean.shape.as_list(), [ - 3, - ]) - self.assertEqual(var.shape.as_list(), [ - 3, - ]) - - # case: beta=None - beta = None - normed, mean, var = backend.normalize_batch_in_training( - x, gamma, beta, reduction_axes, epsilon=1e-3) - self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10]) - self.assertEqual(mean.shape.as_list(), [ - 3, - ]) - self.assertEqual(var.shape.as_list(), [ - 3, - ]) - - def test_dropout(self): - inputs = tf.ones((200, 200)) - outputs = backend.dropout(inputs, 0.2) - outputs_val = backend.eval(outputs) - self.assertEqual(np.min(outputs_val), 0) - self.assertAllClose(np.count_nonzero(outputs_val), 32000, atol=1000) - # Test noise shape - outputs = backend.dropout(inputs, 0.2, noise_shape=(200, 1)) - outputs_val = backend.eval(outputs) - # Make sure the whole column gets the same dropout - self.assertEqual(np.min(outputs_val[0, :]), np.max(outputs_val[0, :])) + # alpha and max_value + relu_op = backend.relu(x, alpha=0.25, max_value=3.0) + self.assertAllClose(backend.eval(relu_op), [[-1, 0], [2, 3]]) + # threshold + relu_op = backend.relu(x, threshold=3) + self.assertAllClose(backend.eval(relu_op), [[0, 0], [0, 7]]) -class BackendCrossEntropyLossesTest(tf.test.TestCase, parameterized.TestCase): + # threshold is float + relu_op = backend.relu(x, threshold=1.5) + self.assertAllClose(backend.eval(relu_op), [[0, 0], [2, 7]]) - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_crossentropy_with_sigmoid(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - p = backend.sigmoid(logits) - p = tf.identity(tf.identity(p)) - result = self.evaluate(backend.binary_crossentropy(t, p)) - self.assertArrayNear(result[0], [8., 0.313, 1.313], 1e-3) - - 
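The [8., 0.313, 1.313] expectation above can be reproduced directly from the definition of binary crossentropy; a short NumPy check (illustrative, not part of the test suite):

import numpy as np

t = np.array([0.0, 1.0, 0.0])
logits = np.array([8.0, 1.0, 1.0])
p = 1.0 / (1.0 + np.exp(-logits))                       # sigmoid
bce = -(t * np.log(p) + (1.0 - t) * np.log(1.0 - p))
print(np.round(bce, 3))                                 # [8.    0.313 1.313]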
@test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_categorical_crossentropy_loss(self): - t = backend.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - - p = backend.constant([[.9, .05, .05], [.05, .89, .06], [.05, .01, .94]]) - result = backend.categorical_crossentropy(t, p) - self.assertArrayNear(self.evaluate(result), [.105, .116, .062], 1e-3) - - p = backend.constant([[.9, .05, .05], [.05, .89, .01], [.05, .06, .94]]) - result = backend.categorical_crossentropy(t, p, axis=0) - self.assertArrayNear(self.evaluate(result), [.105, .116, .062], 1e-3) - - p = backend.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - result = backend.categorical_crossentropy(t, p, from_logits=True), - self.assertArrayNear(self.evaluate(result)[0], [.002, 0, .17], 1e-3) - - p = backend.constant([[8., 0., 2.], [1., 9., 3.], [1., 1., 5.]]) - result = backend.categorical_crossentropy(t, p, from_logits=True, axis=0), - self.assertArrayNear(self.evaluate(result)[0], [.002, 0, .17], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_categorical_crossentropy_loss_with_unknown_rank_tensor(self): - t = backend.placeholder() - p = backend.placeholder() - o = backend.categorical_crossentropy(t, p) - - t_val = tf.convert_to_tensor([[1., 0., 0.], [0., 1., 0.], - [0., 0., 1.]]) - p_val = tf.convert_to_tensor([[.9, .05, .05], - [.05, .89, .06], - [.05, .01, .94]]) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.105, .116, .062], 1e-3) - - # With axis set - o = backend.categorical_crossentropy(t, p, axis=0) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.105, .065, .111], 1e-3) - - # from logits - p_val = tf.convert_to_tensor([[8., 1., 1.], [0., 9., 1.], - [2., 3., 5.]]) - o = backend.categorical_crossentropy(t, p, from_logits=True) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.002, 0, .17], 1e-3) - - # from logits and axis set - o = backend.categorical_crossentropy(t, p, from_logits=True, axis=0) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.002, .003, .036], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_categorical_crossentropy_with_softmax(self): - t = backend.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - logits = backend.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - p = backend.softmax(logits) - p = tf.identity(tf.identity(p)) - result = self.evaluate(backend.categorical_crossentropy(t, p)) - self.assertArrayNear(result, [0.002, 0.0005, 0.17], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_sparse_categorical_crossentropy_loss(self): - t = backend.constant([0, 1, 2]) - - p = backend.constant([[.9, .05, .05], [.05, .89, .06], [.05, .01, .94]]) - result = backend.sparse_categorical_crossentropy(t, p) - self.assertArrayNear(self.evaluate(result), [.105, .116, .062], 1e-3) - - p = backend.constant([[.9, .05, .05], [.05, .89, .01], [.05, .06, .94]]) - result = backend.sparse_categorical_crossentropy(t, p, axis=0) - self.assertArrayNear(self.evaluate(result), [.105, .116, .062], 1e-3) - - p = backend.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - result = backend.sparse_categorical_crossentropy(t, p, from_logits=True), - self.assertArrayNear(self.evaluate(result)[0], [.002, 0, .17], 1e-3) - - p = 
backend.constant([[8., 0., 2.], [1., 9., 3.], [1., 1., 5.]]) - result = backend.sparse_categorical_crossentropy( - t, p, from_logits=True, axis=0), - self.assertArrayNear(self.evaluate(result)[0], [.002, 0, .17], 1e-3) - - @test_combinations.generate(test_combinations.combine(mode=['graph'])) - def test_sparse_categorical_crossentropy_loss_with_unknown_rank_tensor(self): - # This test only runs in graph because the TF op layer is not supported yet - # for sparse ops. - t = backend.placeholder() - p = backend.placeholder() - o = backend.sparse_categorical_crossentropy(t, p) - - t_val = tf.convert_to_tensor([0, 1, 2]) - p_val = tf.convert_to_tensor([[.9, .05, .05], - [.05, .89, .06], - [.05, .01, .94]]) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.105, .116, .062], 1e-3) - - # With axis set - with self.assertRaisesRegex( - ValueError, - 'Cannot compute sparse categorical crossentropy with `axis=0`'): - o = backend.sparse_categorical_crossentropy(t, p, axis=0) - f = backend.function([t, p], o) - - _ = f([t_val, p_val]) - - # from logits - p_val = tf.convert_to_tensor([[8., 1., 1.], [0., 9., 1.], - [2., 3., 5.]]) - o = backend.sparse_categorical_crossentropy(t, p, from_logits=True) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.002, 0, .17], 1e-3) - - # from logits and axis set - with self.assertRaisesRegex( - ValueError, - 'Cannot compute sparse categorical crossentropy with `axis=0`'): - o = backend.sparse_categorical_crossentropy( - t, p, from_logits=True, axis=0) - f = backend.function([t, p], o) - - _ = f([t_val, p_val]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_sparse_categorical_crossentropy_with_softmax(self): - t = backend.constant([0, 1, 2]) - logits = backend.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - p = backend.softmax(logits) - p = tf.identity(tf.identity(p)) - result = self.evaluate(backend.sparse_categorical_crossentropy(t, p)) - self.assertArrayNear(result, [0.002, 0.0005, 0.17], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_crossentropy_from_logits_no_warnings(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - with warnings.catch_warnings(record=True) as w: - self.evaluate(backend.binary_crossentropy(t, logits, from_logits=True)) - self.assertEmpty(w) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_crossentropy_from_logits_with_sigmoid(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - p = activations.sigmoid(logits) - with warnings.catch_warnings(record=True) as w: - self.evaluate(backend.binary_crossentropy(t, p, from_logits=True)) - self.assertLen(w, 1) - self.assertIn('received `from_logits=True`', str(w[0].message)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_categorical_crossentropy_from_logits_with_softmax(self): - t = backend.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - logits = backend.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - p = activations.softmax(logits) - with warnings.catch_warnings(record=True) as w: - self.evaluate(backend.categorical_crossentropy(t, p, from_logits=True)) - self.assertLen(w, 1) - self.assertIn('received `from_logits=True`', str(w[0].message)) - - @test_combinations.generate( - 
test_combinations.combine(mode=['graph', 'eager'])) - def test_sparse_categorical_crossentropy_from_logits_with_softmax(self): - t = backend.constant([0, 1, 2]) - logits = backend.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - p = activations.softmax(logits) - with warnings.catch_warnings(record=True) as w: - self.evaluate( - backend.sparse_categorical_crossentropy(t, p, from_logits=True)) - self.assertLen(w, 1) - self.assertIn('received `from_logits=True`', str(w[0].message)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_focal_crossentropy_with_sigmoid(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - p = backend.sigmoid(logits) - p = tf.identity(tf.identity(p)) - result = self.evaluate(backend.binary_focal_crossentropy(t, p, gamma=2.0)) - self.assertArrayNear(result[0], [7.995, 0.022, 0.701], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_focal_crossentropy_from_logits(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - result = self.evaluate( - backend.binary_focal_crossentropy( - target=t, - output=logits, - gamma=2.0, - from_logits=True, - )) - self.assertArrayNear(result[0], [7.995, 0.022, 0.701], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_focal_crossentropy_no_focal_effect_with_zero_gamma(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - p = backend.sigmoid(logits) - p = tf.identity(tf.identity(p)) - gamma = 0 - focal_result = self.evaluate( - backend.binary_focal_crossentropy( - target=t, - output=p, - gamma=gamma, - )) - non_focal_result = self.evaluate(backend.binary_crossentropy(t, p)) - self.assertArrayNear(focal_result[0], non_focal_result[0], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_weighted_focal_crossentropy_with_sigmoid(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - p = backend.sigmoid(logits) - p = tf.identity(tf.identity(p)) - result = self.evaluate(backend.binary_weighted_focal_crossentropy(t, p)) - self.assertArrayNear(result[0], [5.996, 0.006, 0.526], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_weighted_focal_crossentropy_from_logits(self): - t = backend.constant([[0, 1, 0]]) - logits = backend.constant([[8., 1., 1.]]) - result = self.evaluate( - backend.binary_weighted_focal_crossentropy( - target=t, - output=logits, - from_logits=True, - )) - self.assertArrayNear(result[0], [5.996, 0.006, 0.526], 1e-3) + # threshold is negative + relu_op = backend.relu(x, threshold=-5) + self.assertAllClose(backend.eval(relu_op), [[-4, 0], [2, 7]]) + # threshold and max_value + relu_op = backend.relu(x, threshold=3, max_value=5.0) + self.assertAllClose(backend.eval(relu_op), [[0, 0], [0, 5]]) -@tf_test_utils.with_control_flow_v2 -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class TestCTC(tf.test.TestCase): + # threshold and alpha + relu_op = backend.relu(x, alpha=0.25, threshold=4.0) + self.assertAllClose(backend.eval(relu_op), [[-2, -1], [-0.5, 7]]) - def test_ctc_decode(self): - depth = 6 - seq_len_0 = 5 - input_prob_matrix_0 = np.asarray( - [ - [0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908], - [0.215136, 0.439699, 0.0370931, 0.0393967, 
0.0381581, 0.230517], - [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763], - [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655], - [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878], - # Random entry added in at time=5 - [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671] - ], - dtype=np.float32) - - # len max_time_steps array of batch_size x depth matrices - inputs = ( - [input_prob_matrix_0[t, :][np.newaxis, :] for t in range(seq_len_0) - ] + # Pad to max_time_steps = 8 - 2 * [np.zeros((1, depth), dtype=np.float32)]) - - inputs = backend.variable(np.asarray(inputs).transpose((1, 0, 2))) - - # batch_size length vector of sequence_lengths - input_length = backend.variable(np.array([seq_len_0], dtype=np.int32)) - # batch_size length vector of negative log probabilities - log_prob_truth = np.array( - [ - -3.5821197, # output beam 0 - -3.777835 # output beam 1 - ], - np.float32)[np.newaxis, :] - - decode_truth = [ - np.array([1, 0, -1, -1, -1, -1, -1]), - np.array([0, 1, 0, -1, -1, -1, -1]) - ] - beam_width = 2 - top_paths = 2 - - decode_pred_tf, log_prob_pred_tf = backend.ctc_decode( - inputs, - input_length, - greedy=False, - beam_width=beam_width, - top_paths=top_paths) - - self.assertEqual(len(decode_pred_tf), top_paths) - log_prob_pred = backend.eval(log_prob_pred_tf) - for i in range(top_paths): - self.assertTrue( - np.alltrue(decode_truth[i] == backend.eval(decode_pred_tf[i]))) - self.assertAllClose(log_prob_truth, log_prob_pred) - - def test_ctc_batch_cost(self): - with self.cached_session(): - label_lens = np.expand_dims(np.asarray([5, 4]), 1) - input_lens = np.expand_dims(np.asarray([5, 5]), 1) # number of timesteps - loss_log_probs = [3.34211, 5.42262] - - # dimensions are batch x time x categories - labels = np.asarray([[0, 1, 2, 1, 0], [0, 1, 1, 0, -1]]) - inputs = np.asarray( - [[[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], - [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], - [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], - [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], - [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]], - [[0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], - [0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549], - [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456], - [0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345], - [0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]]], - dtype=np.float32) - - labels = backend.variable(labels, dtype='int32') - inputs = backend.variable(inputs, dtype='float32') - input_lens = backend.variable(input_lens, dtype='int32') - label_lens = backend.variable(label_lens, dtype='int32') - res = backend.eval( - backend.ctc_batch_cost(labels, inputs, input_lens, label_lens)) - self.assertAllClose(res[:, 0], loss_log_probs, atol=1e-05) - - # test when batch_size = 1, that is, one sample only - ref = [3.34211] - input_lens = np.expand_dims(np.asarray([5]), 1) - label_lens = np.expand_dims(np.asarray([5]), 1) - - labels = np.asarray([[0, 1, 2, 1, 0]]) - inputs = np.asarray( - [[[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], - [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], - [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], - [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], - [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]] - ], - dtype=np.float32) - 
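# ---------------------------------------------------------------------------
# [Editorial aside; illustrative sketch, not part of the diff.]
# backend.ctc_batch_cost consumes softmax probabilities of shape
# (samples, time, classes) plus per-sample input and label lengths, and
# returns a (samples, 1) tensor of CTC negative log-likelihoods, which is
# what the removed test above checks against precomputed references.
# A minimal self-contained call with hypothetical random probabilities:

import numpy as np

from keras import backend

labels = backend.variable(np.array([[0, 1, 2, 1, 0]]), dtype="int32")
# Probabilities over 6 classes (the last index is the CTC blank), 5 steps.
probs = np.random.uniform(size=(1, 5, 6)).astype("float32")
probs /= probs.sum(axis=-1, keepdims=True)
inputs = backend.variable(probs, dtype="float32")
input_lens = backend.variable(np.array([[5]]), dtype="int32")
label_lens = backend.variable(np.array([[5]]), dtype="int32")
loss = backend.eval(
    backend.ctc_batch_cost(labels, inputs, input_lens, label_lens)
)
assert loss.shape == (1, 1)  # one CTC loss value per batch entry
# ---------------------------------------------------------------------------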
- k_labels = backend.variable(labels, dtype='int32') - k_inputs = backend.variable(inputs, dtype='float32') - k_input_lens = backend.variable(input_lens, dtype='int32') - k_label_lens = backend.variable(label_lens, dtype='int32') - res = backend.eval( - backend.ctc_batch_cost(k_labels, k_inputs, k_input_lens, - k_label_lens)) - self.assertAllClose(res[:, 0], ref, atol=1e-05) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class TestRandomOps(tf.test.TestCase): + # threshold, alpha, and max_value + relu_op = backend.relu(x, alpha=0.25, threshold=4.0, max_value=5.0) + self.assertAllClose(backend.eval(relu_op), [[-2, -1], [-0.5, 5]]) - def test_random_normal(self): - np.random.seed(123) - x = backend.random_normal((500, 500)) - val = backend.eval(x) - self.assertAllClose(np.mean(val), 0., atol=0.01) - self.assertAllClose(np.std(val), 1., atol=0.01) - - def test_random_uniform(self): - np.random.seed(123) - x = backend.random_uniform((500, 500)) - val = backend.eval(x) - self.assertAllClose(np.mean(val), 0.5, atol=0.01) - self.assertAllClose(np.max(val), 1., atol=0.01) - self.assertAllClose(np.min(val), 0., atol=0.01) - - def test_random_binomial(self): - np.random.seed(123) - x = backend.random_binomial((500, 500), p=0.5) - self.assertAllClose(np.mean(backend.eval(x)), 0.5, atol=0.01) - - def test_truncated_normal(self): - np.random.seed(123) - x = backend.truncated_normal((500, 500), mean=0.0, stddev=1.0) - x = backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0) - y = backend.eval(x) - self.assertAllClose(np.mean(y), 0., atol=0.01) - self.assertAllClose(np.std(y), 0.88, atol=0.01) - self.assertAllClose(np.max(y), 2., atol=0.01) - self.assertAllClose(np.min(y), -2., atol=0.01) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class FunctionTest(tf.test.TestCase): + # Test case for GitHub issue 35430, with integer dtype + x = input_layer.Input(shape=(), name="x", dtype="int64") + _ = activation.ReLU(max_value=100.0, dtype="int64")(x) - def test_function_basics(self): - if tf.executing_eagerly(): - self.skipTest('eager backend.function does not support updates') - x1 = backend.placeholder(shape=(), dtype='float32') - x2 = backend.placeholder(shape=(), dtype='int32') - v = backend.variable(10.) - - y1 = x1 + backend.cast(x2, 'float32') + v - y2 = x1 * backend.cast(x2, 'float32') - - with tf.control_dependencies([y1]): - u = backend.update(v, x1) - - f = backend.function([x1, x2], [y1, y2], updates=[u]) - output_values = f([2, 3]) - self.assertEqual(output_values, [15., 6.]) - self.assertEqual(backend.eval(v), 2.) - - def test_function_dict_outputs(self): - x_ph = backend.placeholder(shape=(), name='x') - y_ph = backend.placeholder(shape=(), name='y') - outputs = {'x*y': y_ph * x_ph, 'x*x': x_ph * x_ph} - - f = backend.function(inputs=[x_ph, y_ph], outputs=outputs) - x, y = 2., 5. - results = f([x, y]) - - self.assertEqual(results['x*y'], 10.) - self.assertEqual(results['x*x'], 4) - - def test_function_dict_inputs(self): - placeholders = { - 'x': backend.placeholder(shape=()), - 'y': backend.placeholder(shape=()) - } - outputs = [placeholders['x'] * placeholders['y']] - - f = backend.function(inputs=placeholders, outputs=outputs) - results = f({'x': 2., 'y': 3.}) - self.assertEqual(results[0], 6.) - - def test_function_single_input_output(self): - x_ph = backend.placeholder(shape=(), name='x') - output = x_ph * x_ph - f = backend.function(x_ph, output) - result = f(2.) - self.assertEqual(result, 4.) 
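# ---------------------------------------------------------------------------
# [Editorial aside; illustrative sketch, not part of the diff.]
# The removed tests above exercise backend.function, which compiles symbolic
# inputs and outputs (plus optional state updates) into a plain callable.
# A minimal sketch, assuming TF1-style graph mode since placeholders are not
# available under eager execution:

import tensorflow as tf

from keras import backend

tf.compat.v1.disable_eager_execution()

x = backend.placeholder(shape=(), name="x")
y = backend.placeholder(shape=(), name="y")
f = backend.function([x, y], [x * y, x + y])
print(f([2.0, 3.0]))  # [6.0, 5.0]
# ---------------------------------------------------------------------------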
- - def test_tuple_updates(self): - if tf.executing_eagerly(): - self.skipTest('eager backend.function does not support updates') - - x_ph = backend.placeholder(ndim=2) - v = backend.variable(np.ones((4, 2))) - output = x_ph**2 + v - new_v = v + x_ph - f = backend.function(x_ph, output, updates=[(v, new_v)]) - input_val = np.random.random((4, 2)) - result = f(input_val) - self.assertAllClose(result, input_val**2 + 1) - self.assertAllClose(backend.get_value(v), np.ones((4, 2)) + input_val) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class BackendShapeOpsTest(tf.test.TestCase): + def test_reshape(self): + compare_single_input_op_to_numpy( + backend.reshape, + np.reshape, + input_shape=(4, 7), + keras_args=[(2, 14)], + np_args=[(2, 14)], + ) + + def test_concatenate(self): + a = backend.variable(np.ones((1, 2, 3))) + b = backend.variable(np.ones((1, 2, 2))) + y = backend.concatenate([a, b], axis=-1) + self.assertEqual(y.shape.as_list(), [1, 2, 5]) + + def test_permute_dimensions(self): + compare_single_input_op_to_numpy( + backend.permute_dimensions, + np.transpose, + input_shape=(4, 7), + keras_args=[(1, 0)], + np_args=[(1, 0)], + ) + + def test_resize_images(self): + height_factor = 2 + width_factor = 2 + data_format = "channels_last" + x = backend.variable(np.ones((1, 2, 2, 3))) + y = backend.resize_images(x, height_factor, width_factor, data_format) + self.assertEqual(y.shape.as_list(), [1, 4, 4, 3]) + + data_format = "channels_first" + x = backend.variable(np.ones((1, 3, 2, 2))) + y = backend.resize_images(x, height_factor, width_factor, data_format) + self.assertEqual(y.shape.as_list(), [1, 3, 4, 4]) + + # Use with a dynamic axis: + if not tf.executing_eagerly(): + x = backend.placeholder(shape=(1, 3, None, None)) + y = backend.resize_images( + x, height_factor, width_factor, data_format + ) + self.assertEqual(y.shape.as_list(), [1, 3, None, None]) + + # Invalid use: + with self.assertRaises(ValueError): + backend.resize_images( + x, height_factor, width_factor, data_format="unknown" + ) + + def test_resize_volumes(self): + height_factor = 2 + width_factor = 2 + depth_factor = 2 + data_format = "channels_last" + x = backend.variable(np.ones((1, 2, 2, 2, 3))) + y = backend.resize_volumes( + x, depth_factor, height_factor, width_factor, data_format + ) + self.assertEqual(y.shape.as_list(), [1, 4, 4, 4, 3]) + + data_format = "channels_first" + x = backend.variable(np.ones((1, 3, 2, 2, 2))) + y = backend.resize_volumes( + x, depth_factor, height_factor, width_factor, data_format + ) + self.assertEqual(y.shape.as_list(), [1, 3, 4, 4, 4]) + + # Invalid use: + with self.assertRaises(ValueError): + backend.resize_volumes( + x, + depth_factor, + height_factor, + width_factor, + data_format="unknown", + ) + + def test_repeat_elements(self): + x = backend.variable(np.ones((1, 3, 2))) + y = backend.repeat_elements(x, 3, axis=1) + self.assertEqual(y.shape.as_list(), [1, 9, 2]) + + # Use with a dynamic axis: + if not tf.executing_eagerly(): + x = backend.placeholder(shape=(2, None, 2)) + y = backend.repeat_elements(x, 3, axis=1) + self.assertEqual(y.shape.as_list(), [2, None, 2]) -class BackendGraphTests(tf.test.TestCase, parameterized.TestCase): + def test_repeat(self): + x = backend.variable(np.ones((1, 3))) + y = backend.repeat(x, 2) + self.assertEqual(y.shape.as_list(), [1, 2, 3]) - @test_combinations.generate(test_combinations.combine(mode=['graph'])) - def test_function_placeholder_with_default(self): - with backend.get_graph().as_default(): - x1 = 
tf.compat.v1.placeholder_with_default( - np.array(2., dtype='float32'), shape=()) - x2 = tf.compat.v1.placeholder_with_default( - np.array(3, dtype='int32'), shape=()) - y1 = x1 + backend.cast(x2, 'float32') - y2 = x1 * backend.cast(x2, 'float32') - f = backend.function([x1, x2], [y1, y2]) - output_values = f([4, 5]) - self.assertEqual(output_values, [9., 20.]) - output_values = f([None, None]) - self.assertEqual(output_values, [5., 6.]) - - def test_function_tf_feed_symbols(self): - # Test Keras backend functions with TF tensor inputs. - with tf.Graph().as_default(), self.cached_session(): - # Test feeding a resource variable to `function`. - x1 = backend.placeholder(shape=()) - x2 = backend.placeholder(shape=()) - lr = backend.learning_phase() # Include a placeholder_with_default. - - y1 = backend.variable(10.) - y2 = 3 - - f = backend.function( - inputs=[x1, x2, lr], - outputs=[x1 + 1, backend.in_train_phase(x2 + 2, x2 - 1)]) - outs = f([y1, y2, None]) # Use default learning_phase value. - self.assertEqual(outs, [11., 2.]) - outs = f([y1, y2, 1]) # Set learning phase value. - self.assertEqual(outs, [11., 5.]) - - # Test triggering a callable refresh by changing the input. - y3 = backend.constant(20.) # Test with tensor - outs = f([y3, y2, None]) - self.assertEqual(outs, [21., 2.]) - - y4 = 4 # Test with non-symbol - outs = f([y4, y2, None]) - self.assertEqual(outs, [5., 2.]) - - # Test with a different dtype - y5 = backend.constant(10., dtype='float64') - outs = f([y5, y2, None]) - self.assertEqual(outs, [11., 2.]) - - def test_function_tf_fetches(self): - # Additional operations can be passed to tf.compat.v1.Session().run() via - # its `fetches` arguments. In contrast to `updates` argument of - # backend.function() these do not have control dependency on `outputs` - # so they can run in parallel. Also they should not contribute to output of - # backend.function(). - with tf.Graph().as_default(), self.cached_session(): - x = backend.variable(0.) - y = backend.variable(0.) - x_placeholder = backend.placeholder(shape=()) - y_placeholder = backend.placeholder(shape=()) - - f = backend.function( - inputs=[x_placeholder, y_placeholder], - outputs=[x_placeholder + y_placeholder], - updates=[(x, x_placeholder + 1.)], - fetches=[backend.update(y, 5.)]) - output = f([10., 20.]) - self.assertEqual(output, [30.]) - self.assertEqual(backend.get_session().run(fetches=[x, y]), [11., 5.]) - - def test_function_tf_feed_dict(self): - # Additional substitutions can be passed to `tf.compat.v1.Session().run()` - # via its `feed_dict` arguments. Note that the feed_dict is passed once in - # the constructor but we can modify the values in the dictionary. Through - # this feed_dict we can provide additional substitutions besides Keras - # inputs. - with tf.Graph().as_default(), self.cached_session(): - x = backend.variable(0.) - y = backend.variable(0.) - x_placeholder = backend.placeholder(shape=()) - y_placeholder = backend.placeholder(shape=()) - - feed_dict = {y_placeholder: 3.} - fetches = [backend.update(y, y_placeholder * 10.)] - f = backend.function( - inputs=[x_placeholder], - outputs=[x_placeholder + 1.], - updates=[(x, x_placeholder + 10.)], - feed_dict=feed_dict, - fetches=fetches) - output = f([10.]) - self.assertEqual(output, [11.]) - self.assertEqual(backend.get_session().run(fetches=[x, y]), [20., 30.]) - - # updated value in feed_dict will be modified within the K.function() - feed_dict[y_placeholder] = 4. 
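# [Editorial note, not part of the diff: backend.function stores a reference
# to the feed_dict given at construction time and re-reads it on every call,
# so the in-place update above changes the substitution for the next call.
# Below, the output is 20. + 1. = 21., the `updates` argument writes
# x = 20. + 10. = 30., and the `fetches` op writes y = 4. * 10. = 40.]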
- output = f([20.]) - self.assertEqual(output, [21.]) - self.assertEqual(backend.get_session().run(fetches=[x, y]), [30., 40.]) - - def test_function_tf_run_options_with_run_metadata(self): - with tf.Graph().as_default(), self.cached_session(): - x_placeholder = backend.placeholder(shape=()) - y_placeholder = backend.placeholder(shape=()) - - run_options = tf.compat.v1.RunOptions(output_partition_graphs=True) - run_metadata = tf.compat.v1.RunMetadata() - # enable run_options. - f = backend.function( - inputs=[x_placeholder, y_placeholder], - outputs=[x_placeholder + y_placeholder], - options=run_options, - run_metadata=run_metadata) - output = f([10., 20.]) - self.assertEqual(output, [30.]) - self.assertNotEmpty(run_metadata.partition_graphs) - # disable run_options. - f1 = backend.function( - inputs=[x_placeholder, y_placeholder], - outputs=[x_placeholder + y_placeholder], - run_metadata=run_metadata) - output1 = f1([10., 20.]) - self.assertEqual(output1, [30.]) - self.assertEmpty(run_metadata.partition_graphs) - - def test_function_fetch_callbacks(self): - - class CallbackStub: - - def __init__(self): - self.times_called = 0 - self.callback_result = 0 - - def _fetch_callback(self, result): - self.times_called += 1 - self.callback_result = result - - with tf.Graph().as_default(), self.cached_session(): - callback = CallbackStub() - x_placeholder = backend.placeholder(shape=()) - y_placeholder = backend.placeholder(shape=()) - - callback_op = x_placeholder * y_placeholder - - f = backend.function( - inputs=[x_placeholder, y_placeholder], - outputs=[x_placeholder + y_placeholder]) - f.fetches.append(callback_op) - f.fetch_callbacks[callback_op] = callback._fetch_callback - - _ = f([10., 20.]) - - self.assertEqual(callback.times_called, 1) - self.assertEqual(callback.callback_result, 200) - - def test_get_session_different_graphs(self): - with tf.Graph().as_default(): - x = backend.constant(1) - session = backend.get_session() - self.assertIs(session, backend.get_session((x,))) - self.assertIs(session, backend.get_session()) - with tf.Graph().as_default(): - self.assertIs(session, backend.get_session((x,))) - self.assertIsNot(session, backend.get_session()) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class ControlOpsTests(tf.test.TestCase): + def test_flatten(self): + compare_single_input_op_to_numpy( + backend.flatten, + np.reshape, + input_shape=(4, 7, 6), + np_args=[(4 * 7 * 6,)], + ) + + def test_batch_flatten(self): + compare_single_input_op_to_numpy( + backend.batch_flatten, + np.reshape, + input_shape=(4, 7, 6), + np_args=[(4, 7 * 6)], + ) + + def test_temporal_padding(self): + def ref_op(x, padding): + shape = list(x.shape) + shape[1] += padding[0] + padding[1] + y = np.zeros(tuple(shape)) + y[:, padding[0] : -padding[1], :] = x + return y + + compare_single_input_op_to_numpy( + backend.temporal_padding, + ref_op, + input_shape=(4, 7, 6), + keras_args=[(2, 3)], + np_args=[(2, 3)], + ) + + def test_spatial_2d_padding(self): + def ref_op(x, padding, data_format="channels_last"): + shape = list(x.shape) + if data_format == "channels_last": + shape[1] += padding[0][0] + padding[0][1] + shape[2] += padding[1][0] + padding[1][1] + y = np.zeros(tuple(shape)) + y[ + :, + padding[0][0] : -padding[0][1], + padding[1][0] : -padding[1][1], + :, + ] = x + else: + shape[2] += padding[0][0] + padding[0][1] + shape[3] += padding[1][0] + padding[1][1] + y = np.zeros(tuple(shape)) + y[ + :, + :, + padding[0][0] : -padding[0][1], + padding[1][0] : 
-padding[1][1], + ] = x + return y + + compare_single_input_op_to_numpy( + backend.spatial_2d_padding, + ref_op, + input_shape=(2, 3, 2, 3), + keras_args=[((2, 3), (1, 2))], + keras_kwargs={"data_format": "channels_last"}, + np_args=[((2, 3), (1, 2))], + np_kwargs={"data_format": "channels_last"}, + ) + compare_single_input_op_to_numpy( + backend.spatial_2d_padding, + ref_op, + input_shape=(2, 3, 2, 3), + keras_args=[((2, 3), (1, 2))], + keras_kwargs={"data_format": "channels_first"}, + np_args=[((2, 3), (1, 2))], + np_kwargs={"data_format": "channels_first"}, + ) + + def test_spatial_3d_padding(self): + def ref_op(x, padding, data_format="channels_last"): + shape = list(x.shape) + if data_format == "channels_last": + shape[1] += padding[0][0] + padding[0][1] + shape[2] += padding[1][0] + padding[1][1] + shape[3] += padding[2][0] + padding[2][1] + y = np.zeros(tuple(shape)) + y[ + :, + padding[0][0] : -padding[0][1], + padding[1][0] : -padding[1][1], + padding[2][0] : -padding[2][1], + :, + ] = x + else: + shape[2] += padding[0][0] + padding[0][1] + shape[3] += padding[1][0] + padding[1][1] + shape[4] += padding[2][0] + padding[2][1] + y = np.zeros(tuple(shape)) + y[ + :, + :, + padding[0][0] : -padding[0][1], + padding[1][0] : -padding[1][1], + padding[2][0] : -padding[2][1], + ] = x + return y + + compare_single_input_op_to_numpy( + backend.spatial_3d_padding, + ref_op, + input_shape=(2, 3, 2, 3, 2), + keras_args=[((2, 3), (1, 2), (2, 3))], + keras_kwargs={"data_format": "channels_last"}, + np_args=[((2, 3), (1, 2), (2, 3))], + np_kwargs={"data_format": "channels_last"}, + ) + compare_single_input_op_to_numpy( + backend.spatial_3d_padding, + ref_op, + input_shape=(2, 3, 2, 3, 2), + keras_args=[((2, 3), (1, 2), (2, 3))], + keras_kwargs={"data_format": "channels_first"}, + np_args=[((2, 3), (1, 2), (2, 3))], + np_kwargs={"data_format": "channels_first"}, + ) - def test_function_switch_basics(self): - x = tf.constant(2.0) - y = tf.constant(3.0) - def xpowy(): - return backend.pow(x, y) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class BackendNNOpsTest(tf.test.TestCase, parameterized.TestCase): + def test_bias_add(self): + keras_op = backend.bias_add + np_op = np.add + compare_two_inputs_op_to_numpy( + keras_op, np_op, input_shape_a=(4, 7), input_shape_b=(7,) + ) + compare_two_inputs_op_to_numpy( + keras_op, np_op, input_shape_a=(4, 3, 7), input_shape_b=(7,) + ) + compare_two_inputs_op_to_numpy( + keras_op, np_op, input_shape_a=(4, 3, 5, 7), input_shape_b=(7,) + ) + compare_two_inputs_op_to_numpy( + keras_op, np_op, input_shape_a=(4, 3, 5, 2, 7), input_shape_b=(7,) + ) + + with self.assertRaises((ValueError, tf.errors.InvalidArgumentError)): + x = backend.variable((3, 4)) + b = backend.variable((3, 4)) + backend.bias_add(x, b) + with self.assertRaises(ValueError): + x = backend.variable((3, 4)) + b = backend.variable((4,)) + backend.bias_add(x, b, data_format="unknown") + + def test_bias_add_channels_first(self): + def keras_op(x, b): + return backend.bias_add(x, b, data_format="channels_first") + + def np_op(x, b): + if x.ndim == 3: + b = b.reshape((1, b.shape[0], 1)) + if x.ndim == 4: + b = b.reshape((1, b.shape[0], 1, 1)) + return x + b + + compare_two_inputs_op_to_numpy( + keras_op, np_op, input_shape_a=(4, 3, 7), input_shape_b=(3,) + ) + compare_two_inputs_op_to_numpy( + keras_op, np_op, input_shape_a=(4, 3, 5, 7), input_shape_b=(3,) + ) + + def test_pool2d(self): + val = np.random.random((10, 3, 10, 10)) + x = backend.variable(val) + y = 
backend.pool2d( + x, + (2, 2), + strides=(1, 1), + padding="valid", + data_format="channels_first", + pool_mode="max", + ) + self.assertEqual(y.shape.as_list(), [10, 3, 9, 9]) + + y = backend.pool2d( + x, + (2, 2), + strides=(1, 1), + padding="valid", + data_format="channels_first", + pool_mode="avg", + ) + self.assertEqual(y.shape.as_list(), [10, 3, 9, 9]) + + val = np.random.random((10, 10, 10, 3)) + x = backend.variable(val) + y = backend.pool2d( + x, + (2, 2), + strides=(1, 1), + padding="valid", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 9, 9, 3]) + + val = np.random.random((10, 10, 10, 3)) + x = backend.variable(val) + y = backend.pool2d( + x, + (2, 2), + strides=(1, 1), + padding="same", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 10, 10, 3]) + + val = np.random.random((10, 10, 10, 3)) + x = backend.variable(val) + y = backend.pool2d( + x, + (2, 2), + strides=(2, 2), + padding="same", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 5, 5, 3]) + + with self.assertRaises(ValueError): + y = backend.pool2d( + x, + (2, 2), + strides=(2, 2), + padding="other", + data_format="channels_last", + ) + with self.assertRaises(ValueError): + y = backend.pool2d(x, (2, 2), strides=(2, 2), data_format="other") + with self.assertRaises(ValueError): + y = backend.pool2d(x, (2, 2, 2), strides=(2, 2)) + with self.assertRaises(ValueError): + y = backend.pool2d(x, (2, 2), strides=(2, 2, 2)) + with self.assertRaises(ValueError): + y = backend.pool2d(x, (2, 2), strides=(2, 2), pool_mode="other") + + def test_pool3d(self): + val = np.random.random((10, 3, 10, 10, 10)) + x = backend.variable(val) + y = backend.pool3d( + x, + (2, 2, 2), + strides=(1, 1, 1), + padding="valid", + data_format="channels_first", + pool_mode="max", + ) + self.assertEqual(y.shape.as_list(), [10, 3, 9, 9, 9]) + + y = backend.pool3d( + x, + (2, 2, 2), + strides=(1, 1, 1), + padding="valid", + data_format="channels_first", + pool_mode="avg", + ) + self.assertEqual(y.shape.as_list(), [10, 3, 9, 9, 9]) + + val = np.random.random((10, 10, 10, 10, 3)) + x = backend.variable(val) + y = backend.pool3d( + x, + (2, 2, 2), + strides=(1, 1, 1), + padding="valid", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 9, 9, 9, 3]) + + val = np.random.random((10, 10, 10, 10, 3)) + x = backend.variable(val) + y = backend.pool3d( + x, + (2, 2, 2), + strides=(1, 1, 1), + padding="same", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 10, 10, 10, 3]) + + val = np.random.random((10, 10, 10, 10, 3)) + x = backend.variable(val) + y = backend.pool3d( + x, + (2, 2, 2), + strides=(2, 2, 2), + padding="same", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 5, 5, 5, 3]) + + def test_conv1d(self): + val = np.random.random((10, 4, 10)) + x = backend.variable(val) + kernel_val = np.random.random((3, 4, 5)) + k = backend.variable(kernel_val) + y = backend.conv1d( + x, k, strides=(1,), padding="valid", data_format="channels_first" + ) + self.assertEqual(y.shape.as_list(), [10, 5, 8]) + + val = np.random.random((10, 10, 4)) + x = backend.variable(val) + y = backend.conv1d( + x, k, strides=(1,), padding="valid", data_format="channels_last" + ) + self.assertEqual(y.shape.as_list(), [10, 8, 5]) + + val = np.random.random((10, 10, 4)) + x = backend.variable(val) + y = backend.conv1d( + x, k, strides=(1,), padding="same", data_format="channels_last" + ) + self.assertEqual(y.shape.as_list(), 
[10, 10, 5]) + + val = np.random.random((10, 10, 4)) + x = backend.variable(val) + y = backend.conv1d( + x, k, strides=(2,), padding="same", data_format="channels_last" + ) + self.assertEqual(y.shape.as_list(), [10, 5, 5]) + + def test_local_conv_channels_dim(self): + filters = 3 + batch_size = 2 + + for input_shape in [(3, 5), (2, 3, 5), (2, 5, 3, 4)]: + channels_in = input_shape[0] + input_spatial_shape = input_shape[1:] + dim = len(input_spatial_shape) + + inputs = np.random.normal(0, 1, (batch_size,) + input_shape) + inputs_cf = backend.variable(inputs) + + for kernel_size in [1, 2]: + for stride in [1, 2]: + kernel_sizes = (kernel_size,) * dim + strides = (stride,) * dim + + output_shape = tuple( + [ + (i - kernel_size + stride) // stride + for i in input_spatial_shape + ] + ) + + kernel_shape = ( + np.prod(output_shape), + np.prod(kernel_sizes) * channels_in, + filters, + ) + + kernel = np.random.normal( + 0, + 1, + output_shape + + (channels_in, np.prod(kernel_sizes), filters), + ) + + kernel_cf = np.reshape(kernel, kernel_shape) + kernel_cf = backend.variable(kernel_cf) + + conv_cf = backend.local_conv( + inputs_cf, + kernel_cf, + kernel_sizes, + strides, + output_shape, + "channels_first", + ) + + inputs_cl = np.transpose( + inputs, [0, 2] + list(range(3, dim + 2)) + [1] + ) + inputs_cl = backend.variable(inputs_cl) + + kernel_cl = np.reshape( + np.transpose( + kernel, list(range(dim)) + [dim + 1, dim, dim + 2] + ), + kernel_shape, + ) + kernel_cl = backend.variable(kernel_cl) + + conv_cl = backend.local_conv( + inputs_cl, + kernel_cl, + kernel_sizes, + strides, + output_shape, + "channels_last", + ) + + conv_cf = backend.eval(conv_cf) + conv_cl = backend.eval(conv_cl) + + self.assertAllCloseAccordingToType( + conv_cf, + np.transpose( + conv_cl, [0, dim + 1] + list(range(1, dim + 1)) + ), + atol=1e-5, + ) + + @parameterized.named_parameters( + ("local_conv1d", (5, 6), (3,), (1,), (3,)), + ("local_conv2d", (4, 5, 6), (3, 3), (1, 1), (2, 3)), + ) + def test_local_conv_1d_and_2d( + self, input_shape, kernel_sizes, strides, output_shape + ): + filters = 3 + batch_size = 2 + + inputs = np.random.normal(0, 1, (batch_size,) + input_shape) + inputs = backend.variable(inputs) + + kernel = np.random.normal( + 0, + 1, + ( + np.prod(output_shape), + np.prod(kernel_sizes) * input_shape[-1], + filters, + ), + ) + kernel = backend.variable(kernel) + + local_conv = backend.local_conv( + inputs, kernel, kernel_sizes, strides, output_shape, "channels_last" + ) + if len(output_shape) == 1: + local_conv_dim = backend.local_conv1d( + inputs, kernel, kernel_sizes, strides, "channels_last" + ) + else: + local_conv_dim = backend.local_conv2d( + inputs, + kernel, + kernel_sizes, + strides, + output_shape, + "channels_last", + ) + + local_conv = backend.eval(local_conv) + local_conv_dim = backend.eval(local_conv_dim) + + self.assertAllCloseAccordingToType(local_conv, local_conv_dim) + + def test_conv2d(self): + kernel_val = np.random.random((3, 3, 4, 5)) + k = backend.variable(kernel_val) + + # Test channels_first + val = np.random.random((10, 4, 10, 10)) + x = backend.variable(val) + y = backend.conv2d(x, k, padding="valid", data_format="channels_first") + self.assertEqual(y.shape.as_list(), [10, 5, 8, 8]) + + # Test channels_last + val = np.random.random((10, 10, 10, 4)) + x = backend.variable(val) + y = backend.conv2d( + x, k, strides=(1, 1), padding="valid", data_format="channels_last" + ) + self.assertEqual(y.shape.as_list(), [10, 8, 8, 5]) + + # Test same padding + val = np.random.random((10, 10, 10, 
4)) + x = backend.variable(val) + y = backend.conv2d(x, k, padding="same", data_format="channels_last") + self.assertEqual(y.shape.as_list(), [10, 10, 10, 5]) + + # Test dilation_rate + val = np.random.random((10, 10, 10, 4)) + x = backend.variable(val) + y = backend.conv2d( + x, + k, + dilation_rate=(2, 2), + padding="same", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 10, 10, 5]) + + # Test strides + val = np.random.random((10, 10, 10, 4)) + x = backend.variable(val) + y = backend.conv2d( + x, k, strides=(2, 2), padding="same", data_format="channels_last" + ) + self.assertEqual(y.shape.as_list(), [10, 5, 5, 5]) + + # Test invalid arguments + with self.assertRaises(ValueError): + y = backend.conv2d( + x, k, (2, 2), padding="other", data_format="channels_last" + ) + with self.assertRaises(ValueError): + y = backend.conv2d(x, k, (2, 2), data_format="other") + with self.assertRaises(ValueError): + y = backend.conv2d(x, k, (2, 2, 2)) + + def test_conv2d_transpose(self): + input_size = (7, 8) + kernel_size = (3, 3) + input_depth = 6 + filters = 6 + batch_size = 2 + + kernel_val = np.random.random(kernel_size + (input_depth, filters)) + k = backend.variable(kernel_val) + + # Test channels_first + input_val = np.random.random((batch_size, input_depth) + input_size) + x = backend.variable(input_val) + y = backend.conv2d_transpose( + x, + k, + (batch_size, filters) + input_size, + padding="same", + data_format="channels_first", + ) + self.assertEqual( + tuple(y.shape.as_list()), (batch_size, filters) + input_size + ) + + # Test channels_last + input_val = np.random.random( + (batch_size,) + input_size + (input_depth,) + ) + x = backend.variable(input_val) + y = backend.conv2d_transpose( + x, + k, + (batch_size,) + input_size + (filters,), + padding="same", + data_format="channels_last", + ) + self.assertEqual( + tuple(y.shape.as_list()), (batch_size,) + input_size + (filters,) + ) + + # Test dilation_rate + y = backend.conv2d_transpose( + x, + k, + (batch_size,) + input_size + (filters,), + padding="same", + data_format="channels_last", + dilation_rate=(2, 2), + ) + self.assertEqual( + tuple(y.shape.as_list()), (batch_size,) + input_size + (filters,) + ) + + # Test dilation_rate error + with self.assertRaisesRegex(ValueError, "Expected the 2 dimensions"): + y = backend.conv2d_transpose( + x, + k, + (batch_size,) + input_size + (filters,), + padding="same", + data_format="channels_last", + dilation_rate=(1, 2), + ) + + # Test batch size of None in output_shape + y = backend.conv2d_transpose( + x, + k, + (None,) + input_size + (filters,), + padding="same", + data_format="channels_last", + ) + self.assertEqual( + tuple(y.shape.as_list()), (batch_size,) + input_size + (filters,) + ) + + # Test invalid values + with self.assertRaises(ValueError): + y = backend.conv2d_transpose( + x, k, (2, 2, 8, 9), padding="other", data_format="channels_last" + ) + with self.assertRaises(ValueError): + y = backend.conv2d_transpose( + x, k, (2, 2, 8, 9), data_format="other" + ) + + def test_separable_conv2d(self): + val = np.random.random((10, 4, 10, 10)) + x = backend.variable(val) + depthwise_kernel_val = np.random.random((3, 3, 4, 1)) + pointwise_kernel_val = np.random.random((1, 1, 4, 5)) + dk = backend.variable(depthwise_kernel_val) + pk = backend.variable(pointwise_kernel_val) + y = backend.separable_conv2d( + x, dk, pk, padding="valid", data_format="channels_first" + ) + self.assertEqual(y.shape.as_list(), [10, 5, 8, 8]) + + val = np.random.random((10, 10, 10, 4)) + x = 
backend.variable(val) + y = backend.separable_conv2d( + x, + dk, + pk, + strides=(1, 1), + padding="valid", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 8, 8, 5]) + + val = np.random.random((10, 10, 10, 4)) + x = backend.variable(val) + y = backend.separable_conv2d( + x, + dk, + pk, + strides=(1, 1), + padding="same", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 10, 10, 5]) + + val = np.random.random((10, 10, 10, 4)) + x = backend.variable(val) + y = backend.separable_conv2d( + x, + dk, + pk, + strides=(2, 2), + padding="same", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 5, 5, 5]) + with self.assertRaises(ValueError): + y = backend.separable_conv2d( + x, dk, pk, (2, 2), padding="other", data_format="channels_last" + ) + with self.assertRaises(ValueError): + y = backend.separable_conv2d(x, dk, pk, (2, 2), data_format="other") + with self.assertRaises(ValueError): + y = backend.separable_conv2d(x, dk, pk, (2, 2, 2)) + + def test_conv3d(self): + val = np.random.random((10, 4, 10, 10, 10)) + x = backend.variable(val) + kernel_val = np.random.random((3, 3, 3, 4, 5)) + k = backend.variable(kernel_val) + y = backend.conv3d(x, k, padding="valid", data_format="channels_first") + self.assertEqual(y.shape.as_list(), [10, 5, 8, 8, 8]) + + val = np.random.random((10, 10, 10, 10, 4)) + x = backend.variable(val) + y = backend.conv3d( + x, + k, + strides=(1, 1, 1), + padding="valid", + data_format="channels_last", + ) + self.assertEqual(y.shape.as_list(), [10, 8, 8, 8, 5]) + + val = np.random.random((10, 10, 10, 10, 4)) + x = backend.variable(val) + y = backend.conv3d( + x, k, strides=(1, 1, 1), padding="same", data_format="channels_last" + ) + self.assertEqual(y.shape.as_list(), [10, 10, 10, 10, 5]) + + val = np.random.random((10, 10, 10, 10, 4)) + x = backend.variable(val) + y = backend.conv3d( + x, k, strides=(2, 2, 2), padding="same", data_format="channels_last" + ) + self.assertEqual(y.shape.as_list(), [10, 5, 5, 5, 5]) + with self.assertRaises(ValueError): + y = backend.conv3d( + x, k, (2, 2, 2), padding="other", data_format="channels_last" + ) + with self.assertRaises(ValueError): + y = backend.conv3d(x, k, (2, 2, 2), data_format="other") + with self.assertRaises(ValueError): + y = backend.conv3d(x, k, (2, 2)) + + def test_rnn(self): + # implement a simple RNN + num_samples = 4 + input_dim = 5 + output_dim = 3 + timesteps = 6 + + input_val = np.random.random( + (num_samples, timesteps, input_dim) + ).astype(np.float32) + init_state_val = np.random.random((num_samples, output_dim)).astype( + np.float32 + ) + w_i_val = np.random.random((input_dim, output_dim)).astype(np.float32) + w_o_val = np.random.random((output_dim, output_dim)).astype(np.float32) + np_mask = np.random.randint(2, size=(num_samples, timesteps)) + + def rnn_step_fn(): + w_i = backend.variable(w_i_val) + w_o = backend.variable(w_o_val) + + def step_function(x, states): + assert len(states) == 1 + prev_output = states[0] + output = backend.dot(x, w_i) + backend.dot(prev_output, w_o) + return output, [output] + + return step_function + + # test default setup + last_output_list = [[], [], [], [], [], []] + outputs_list = [[], [], [], [], [], []] + state_list = [[], [], [], [], [], []] + + rnn_fn = rnn_step_fn() + inputs = backend.variable(input_val) + initial_states = [backend.variable(init_state_val)] + mask = backend.variable(np_mask) + + kwargs_list = [ + {"go_backwards": False, "mask": None}, + {"go_backwards": False, "mask": None, 
"unroll": True}, + {"go_backwards": True, "mask": None}, + {"go_backwards": True, "mask": None, "unroll": True}, + {"go_backwards": False, "mask": mask}, + {"go_backwards": False, "mask": mask, "unroll": True}, + ] + for i, kwargs in enumerate(kwargs_list): + last_output, outputs, new_states = backend.rnn( + rnn_fn, inputs, initial_states, **kwargs + ) + # check static shape inference + self.assertEqual( + last_output.shape.as_list(), [num_samples, output_dim] + ) + self.assertEqual( + outputs.shape.as_list(), [num_samples, timesteps, output_dim] + ) + for state in new_states: + self.assertEqual( + state.shape.as_list(), [num_samples, output_dim] + ) + + last_output_list[i].append(backend.eval(last_output)) + outputs_list[i].append(backend.eval(outputs)) + self.assertLen(new_states, 1) + state_list[i].append(backend.eval(new_states[0])) + + def assert_list_pairwise(z_list, atol=1e-05): + for z1, z2 in zip(z_list[1:], z_list[:-1]): + self.assertAllClose(z1, z2, atol=atol) + + assert_list_pairwise(last_output_list[0], atol=1e-04) + assert_list_pairwise(outputs_list[0], atol=1e-04) + assert_list_pairwise(state_list[0], atol=1e-04) + assert_list_pairwise(last_output_list[2], atol=1e-04) + assert_list_pairwise(outputs_list[2], atol=1e-04) + assert_list_pairwise(state_list[2], atol=1e-04) + + for l, u_l in zip(last_output_list[0], last_output_list[1]): + self.assertAllClose(l, u_l, atol=1e-04) + + for o, u_o in zip(outputs_list[0], outputs_list[1]): + self.assertAllClose(o, u_o, atol=1e-04) + + for s, u_s in zip(state_list[0], state_list[1]): + self.assertAllClose(s, u_s, atol=1e-04) + + for b_l, b_u_l in zip(last_output_list[2], last_output_list[3]): + self.assertAllClose(b_l, b_u_l, atol=1e-04) + + for b_o, b_u_o in zip(outputs_list[2], outputs_list[3]): + self.assertAllClose(b_o, b_u_o, atol=1e-04) + + for b_s, b_u_s in zip(state_list[2], state_list[3]): + self.assertAllClose(b_s, b_u_s, atol=1e-04) + + def test_rnn_additional_states(self): + # implement a simple RNN + num_samples = 4 + input_dim = 5 + output_dim = 3 + timesteps = 6 + + input_val = np.random.random( + (num_samples, timesteps, input_dim) + ).astype(np.float32) + init_state_val = np.random.random((num_samples, output_dim)).astype( + np.float32 + ) + w_i_val = np.random.random((input_dim, output_dim)).astype(np.float32) + w_o_val = np.random.random((output_dim, output_dim)).astype(np.float32) + np_mask = np.random.randint(2, size=(num_samples, timesteps)) + + def rnn_step_fn(): + w_i = backend.variable(w_i_val) + w_o = backend.variable(w_o_val) + + def step_function(x, states): + assert len(states) == 2 + prev_output = states[0] + output = backend.dot(x, w_i) + backend.dot(prev_output, w_o) + return output, [ + output, + backend.concatenate([output, output], axis=-1), + ] + + return step_function + + # test default setup + last_output_list = [[], [], [], [], [], []] + outputs_list = [[], [], [], [], [], []] + state_list = [[], [], [], [], [], []] + additional_state_list = [[], [], [], [], [], []] + + rnn_fn = rnn_step_fn() + inputs = backend.variable(input_val) + initial_states = [ + backend.variable(init_state_val), + tf.convert_to_tensor( + np.concatenate([init_state_val, init_state_val], axis=-1) + ), + ] + mask = backend.variable(np_mask) + + kwargs_list = [ + {"go_backwards": False, "mask": None}, + {"go_backwards": False, "mask": None, "unroll": True}, + {"go_backwards": True, "mask": None}, + {"go_backwards": True, "mask": None, "unroll": True}, + {"go_backwards": False, "mask": mask}, + {"go_backwards": False, "mask": 
mask, "unroll": True}, + ] + for i, kwargs in enumerate(kwargs_list): + last_output, outputs, new_states = backend.rnn( + rnn_fn, inputs, initial_states, **kwargs + ) + # check static shape inference + self.assertEqual( + last_output.shape.as_list(), [num_samples, output_dim] + ) + self.assertEqual( + outputs.shape.as_list(), [num_samples, timesteps, output_dim] + ) + # for state in new_states: + # self.assertEqual(state.shape.as_list(), + # [num_samples, output_dim]) + self.assertEqual( + new_states[0].shape.as_list(), [num_samples, output_dim] + ) + self.assertEqual( + new_states[1].shape.as_list(), [num_samples, 2 * output_dim] + ) + + last_output_list[i].append(backend.eval(last_output)) + outputs_list[i].append(backend.eval(outputs)) + self.assertLen(new_states, 2) + state_list[i].append(backend.eval(new_states[0])) + additional_state_list[i].append(backend.eval(new_states[1])) + + def assert_list_pairwise(z_list, atol=1e-05): + for z1, z2 in zip(z_list[1:], z_list[:-1]): + self.assertAllClose(z1, z2, atol=atol) + + assert_list_pairwise(last_output_list[0], atol=1e-04) + assert_list_pairwise(outputs_list[0], atol=1e-04) + assert_list_pairwise(state_list[0], atol=1e-04) + assert_list_pairwise(additional_state_list[0], atol=1e-04) + assert_list_pairwise(last_output_list[2], atol=1e-04) + assert_list_pairwise(outputs_list[2], atol=1e-04) + assert_list_pairwise(state_list[2], atol=1e-04) + assert_list_pairwise(additional_state_list[2], atol=1e-04) + + for l, u_l in zip(last_output_list[0], last_output_list[1]): + self.assertAllClose(l, u_l, atol=1e-04) + + for o, u_o in zip(outputs_list[0], outputs_list[1]): + self.assertAllClose(o, u_o, atol=1e-04) + + for s, u_s in zip(state_list[0], state_list[1]): + self.assertAllClose(s, u_s, atol=1e-04) + + for s, u_s in zip( + additional_state_list[0], additional_state_list[1] + ): + self.assertAllClose(s, u_s, atol=1e-04) + + for b_l, b_u_l in zip(last_output_list[2], last_output_list[3]): + self.assertAllClose(b_l, b_u_l, atol=1e-04) + + for b_o, b_u_o in zip(outputs_list[2], outputs_list[3]): + self.assertAllClose(b_o, b_u_o, atol=1e-04) + + for b_s, b_u_s in zip(state_list[2], state_list[3]): + self.assertAllClose(b_s, b_u_s, atol=1e-04) + + for s, u_s in zip( + additional_state_list[2], additional_state_list[3] + ): + self.assertAllClose(s, u_s, atol=1e-04) + + def test_rnn_output_and_state_masking_independent(self): + num_samples = 2 + num_timesteps = 4 + state_and_io_size = 2 + mask_last_num_timesteps = 2 # for second sample only + + # a step function that just outputs inputs, + # but increments states +1 per timestep + def step_function(inputs, states): + return inputs, [s + 1 for s in states] + + inputs_vals = np.random.random( + (num_samples, num_timesteps, state_and_io_size) + ) + initial_state_vals = np.random.random((num_samples, state_and_io_size)) + # masking of two last timesteps for second sample only + mask_vals = np.ones((num_samples, num_timesteps)) + mask_vals[1, -mask_last_num_timesteps:] = 0 + + # outputs expected to be same as inputs for the first sample + expected_outputs = inputs_vals.copy() + # but for the second sample all outputs in masked region should be the + # same as last output before masked region + expected_outputs[1, -mask_last_num_timesteps:] = expected_outputs[ + 1, -(mask_last_num_timesteps + 1) + ] + + expected_last_state = initial_state_vals.copy() + # first state should be incremented for every timestep (no masking) + expected_last_state[0] += num_timesteps + # second state should not be incremented for 
last two timesteps + expected_last_state[1] += num_timesteps - mask_last_num_timesteps + + # verify same expected output for `unroll=true/false` + inputs = backend.variable(inputs_vals) + initial_states = [backend.variable(initial_state_vals)] + mask = backend.variable(mask_vals) + for unroll in [True, False]: + _, outputs, last_states = backend.rnn( + step_function, + inputs, + initial_states, + mask=mask, + unroll=unroll, + input_length=num_timesteps if unroll else None, + ) + + self.assertAllClose(backend.eval(outputs), expected_outputs) + self.assertAllClose( + backend.eval(last_states[0]), expected_last_state + ) + + def test_rnn_output_num_dim_larger_than_2_masking(self): + num_samples = 3 + num_timesteps = 4 + num_features = 5 + + def step_function(inputs, states): + outputs = backend.tile(backend.expand_dims(inputs), [1, 1, 2]) + return outputs, [backend.identity(s) for s in states] + # Note: cannot just return states (which can be a problem) -> + # tensorflow/python/ops/resource_variable_ops.py", line 824, in + # set_shape NotImplementedError: ResourceVariable does not implement + # set_shape() + + inputs_vals = np.random.random( + (num_samples, num_timesteps, num_features) + ) + initial_state_vals = np.random.random((num_samples, 6)) + mask_vals = np.ones((num_samples, num_timesteps)) + mask_vals[-1, -1] = 0 # final timestep masked for last sample + + expected_outputs = np.repeat(inputs_vals[..., None], repeats=2, axis=-1) + # for the last sample, the final timestep (in masked region) should be + # the same as the second to final output (before masked region) + expected_outputs[-1, -1] = expected_outputs[-1, -2] + + inputs = backend.variable(inputs_vals) + initial_states = [backend.variable(initial_state_vals)] + mask = backend.variable(mask_vals) + for unroll in [True, False]: + _, outputs, _ = backend.rnn( + step_function, + inputs, + initial_states, + mask=mask, + unroll=unroll, + input_length=num_timesteps if unroll else None, + ) + + self.assertAllClose(backend.eval(outputs), expected_outputs) + + def test_rnn_state_num_dim_larger_than_2_masking(self): + num_samples = 3 + num_timesteps = 4 + + def step_function(inputs, states): + return inputs, [s + 1 for s in states] + + inputs_vals = np.random.random((num_samples, num_timesteps, 5)) + initial_state_vals = np.random.random((num_samples, 6, 7)) + mask_vals = np.ones((num_samples, num_timesteps)) + mask_vals[0, -2:] = 0 # final two timesteps masked for first sample + + expected_last_state = initial_state_vals.copy() + expected_last_state[0] += num_timesteps - 2 + expected_last_state[1:] += num_timesteps + + inputs = backend.variable(inputs_vals) + initial_states = [backend.variable(initial_state_vals)] + mask = backend.variable(mask_vals) + for unroll in [True, False]: + _, _, last_states = backend.rnn( + step_function, + inputs, + initial_states, + mask=mask, + unroll=unroll, + input_length=num_timesteps if unroll else None, + ) + + self.assertAllClose( + backend.eval(last_states[0]), expected_last_state + ) + + def test_rnn_function_jit_compile_no_unroll_input_length_none(self): + num_samples = 3 + num_timesteps = 4 + + def step_function(inputs, states): + return inputs, [s + 1 for s in states] + + inputs_vals = np.random.random((num_samples, num_timesteps, 5)) + initial_state_vals = np.random.random((num_samples, 6, 7)) + mask_vals = np.ones((num_samples, num_timesteps)) + mask_vals[0, -2:] = 0 # final two timesteps masked for first sample + + expected_last_state = initial_state_vals.copy() + expected_last_state[0] += 
num_timesteps - 2 + expected_last_state[1:] += num_timesteps + + inputs = backend.variable(inputs_vals) + initial_states = [backend.variable(initial_state_vals)] + mask = backend.variable(mask_vals) + + @tf.function(jit_compile=True) + def fn(): + _, _, last_states = backend.rnn( + step_function, + inputs, + initial_states, + mask=mask, + unroll=False, + input_length=None, + ) + return last_states + + last_states = fn() + self.assertAllClose(backend.eval(last_states[0]), expected_last_state) + + def test_batch_normalization(self): + g_val = np.random.random((3,)) + b_val = np.random.random((3,)) + gamma = backend.variable(g_val) + beta = backend.variable(b_val) + + # 3D NHC case + val = np.random.random((10, 5, 3)) + x = backend.variable(val) + mean, var = tf.nn.moments(x, (0, 1), None, None, False) + normed = backend.batch_normalization( + x, mean, var, beta, gamma, axis=-1, epsilon=1e-3 + ) + self.assertEqual(normed.shape.as_list(), [10, 5, 3]) + + # 4D NHWC case + val = np.random.random((10, 5, 5, 3)) + x = backend.variable(val) + mean, var = tf.nn.moments(x, (0, 1, 2), None, None, False) + normed = backend.batch_normalization( + x, mean, var, beta, gamma, axis=-1, epsilon=1e-3 + ) + self.assertEqual(normed.shape.as_list(), [10, 5, 5, 3]) + + # 4D NCHW case + if not tf.executing_eagerly(): + # Eager CPU kernel for NCHW does not exist. + val = np.random.random((10, 3, 5, 5)) + x = backend.variable(val) + mean, var = tf.nn.moments(x, (0, 2, 3), None, None, False) + normed = backend.batch_normalization( + x, mean, var, beta, gamma, axis=1, epsilon=1e-3 + ) + self.assertEqual(normed.shape.as_list(), [10, 3, 5, 5]) + + def test_normalize_batch_in_training(self): + val = np.random.random((10, 3, 10, 10)) + x = backend.variable(val) + reduction_axes = (0, 2, 3) + + g_val = np.random.random((3,)) + b_val = np.random.random((3,)) + gamma = backend.variable(g_val) + beta = backend.variable(b_val) + normed, mean, var = backend.normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=1e-3 + ) + self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10]) + self.assertEqual( + mean.shape.as_list(), + [ + 3, + ], + ) + self.assertEqual( + var.shape.as_list(), + [ + 3, + ], + ) + + # case: gamma=None + gamma = None + normed, mean, var = backend.normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=1e-3 + ) + self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10]) + self.assertEqual( + mean.shape.as_list(), + [ + 3, + ], + ) + self.assertEqual( + var.shape.as_list(), + [ + 3, + ], + ) + + # case: beta=None + beta = None + normed, mean, var = backend.normalize_batch_in_training( + x, gamma, beta, reduction_axes, epsilon=1e-3 + ) + self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10]) + self.assertEqual( + mean.shape.as_list(), + [ + 3, + ], + ) + self.assertEqual( + var.shape.as_list(), + [ + 3, + ], + ) + + def test_dropout(self): + inputs = tf.ones((200, 200)) + outputs = backend.dropout(inputs, 0.2) + outputs_val = backend.eval(outputs) + self.assertEqual(np.min(outputs_val), 0) + self.assertAllClose(np.count_nonzero(outputs_val), 32000, atol=1000) + # Test noise shape + outputs = backend.dropout(inputs, 0.2, noise_shape=(200, 1)) + outputs_val = backend.eval(outputs) + # Make sure the whole column gets the same dropout + self.assertEqual(np.min(outputs_val[0, :]), np.max(outputs_val[0, :])) - def ypowx(): - return backend.pow(y, x) - tensor = backend.switch(backend.less(x, y), xpowy, ypowx) - self.assertEqual(backend.eval(tensor), [8.0]) +class 
BackendCrossEntropyLossesTest(tf.test.TestCase, parameterized.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_crossentropy_with_sigmoid(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + p = backend.sigmoid(logits) + p = tf.identity(tf.identity(p)) + result = self.evaluate(backend.binary_crossentropy(t, p)) + self.assertArrayNear(result[0], [8.0, 0.313, 1.313], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_crossentropy_loss(self): + t = backend.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + p = backend.constant( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.06], [0.05, 0.01, 0.94]] + ) + result = backend.categorical_crossentropy(t, p) + self.assertArrayNear(self.evaluate(result), [0.105, 0.116, 0.062], 1e-3) + + p = backend.constant( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.01], [0.05, 0.06, 0.94]] + ) + result = backend.categorical_crossentropy(t, p, axis=0) + self.assertArrayNear(self.evaluate(result), [0.105, 0.116, 0.062], 1e-3) + + p = backend.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + result = (backend.categorical_crossentropy(t, p, from_logits=True),) + self.assertArrayNear(self.evaluate(result)[0], [0.002, 0, 0.17], 1e-3) + + p = backend.constant( + [[8.0, 0.0, 2.0], [1.0, 9.0, 3.0], [1.0, 1.0, 5.0]] + ) + result = ( + backend.categorical_crossentropy(t, p, from_logits=True, axis=0), + ) + self.assertArrayNear(self.evaluate(result)[0], [0.002, 0, 0.17], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_crossentropy_loss_with_unknown_rank_tensor(self): + t = backend.placeholder() + p = backend.placeholder() + o = backend.categorical_crossentropy(t, p) + + t_val = tf.convert_to_tensor( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + ) + p_val = tf.convert_to_tensor( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.06], [0.05, 0.01, 0.94]] + ) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.105, 0.116, 0.062], 1e-3) + + # With axis set + o = backend.categorical_crossentropy(t, p, axis=0) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.105, 0.065, 0.111], 1e-3) + + # from logits + p_val = tf.convert_to_tensor( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + o = backend.categorical_crossentropy(t, p, from_logits=True) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.002, 0, 0.17], 1e-3) + + # from logits and axis set + o = backend.categorical_crossentropy(t, p, from_logits=True, axis=0) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.002, 0.003, 0.036], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_crossentropy_with_softmax(self): + t = backend.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + logits = backend.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + p = backend.softmax(logits) + p = tf.identity(tf.identity(p)) + result = self.evaluate(backend.categorical_crossentropy(t, p)) + self.assertArrayNear(result, [0.002, 0.0005, 0.17], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_categorical_crossentropy_loss(self): + t = backend.constant([0, 1, 2]) + + p = 
backend.constant( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.06], [0.05, 0.01, 0.94]] + ) + result = backend.sparse_categorical_crossentropy(t, p) + self.assertArrayNear(self.evaluate(result), [0.105, 0.116, 0.062], 1e-3) + + p = backend.constant( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.01], [0.05, 0.06, 0.94]] + ) + result = backend.sparse_categorical_crossentropy(t, p, axis=0) + self.assertArrayNear(self.evaluate(result), [0.105, 0.116, 0.062], 1e-3) + + p = backend.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + result = ( + backend.sparse_categorical_crossentropy(t, p, from_logits=True), + ) + self.assertArrayNear(self.evaluate(result)[0], [0.002, 0, 0.17], 1e-3) + + p = backend.constant( + [[8.0, 0.0, 2.0], [1.0, 9.0, 3.0], [1.0, 1.0, 5.0]] + ) + result = ( + backend.sparse_categorical_crossentropy( + t, p, from_logits=True, axis=0 + ), + ) + self.assertArrayNear(self.evaluate(result)[0], [0.002, 0, 0.17], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_categorical_crossentropy_loss_with_ignore_class(self): + tests = (([255, 1, 2, 2], 255), ([-1, 1, 2, 2], -1)) + p = backend.softmax( + backend.constant( + [ + [1.8, 1.2, 0.5], + [0.2, 3.8, 0.8], + [1.1, 0.4, 3.4], + [1.3, 0.7, 3.8], + ] + ) + ) + + for t, ignore_class in tests: + t = backend.constant(t) + result = backend.sparse_categorical_crossentropy( + t, p, ignore_class=ignore_class + ) + self.assertArrayNear( + self.evaluate(result), + [0.0, 0.07428224, 0.13980183, 0.11967831], + 1e-3, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_cce_loss_with_ignore_class_for_segmentation(self): + t = backend.constant( + [[[0, 2], [-1, -1]], [[0, 2], [-1, -1]], [[0, 0], [0, 0]]] + ) + p = backend.constant( + [ + [ + [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], + [[0.2, 0.5, 0.3], [0.0, 1.0, 0.0]], + ], + [ + [[1.0, 0.0, 0.0], [0.0, 0.5, 0.5]], + [[0.2, 0.5, 0.3], [0.0, 1.0, 0.0]], + ], + [ + [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]], + [[0.1, 0.9, 0.0], [0.2, 0.8, 0.0]], + ], + ] + ) + + expected_result = [ + [[0.0, 0.0], [0.0, 0.0]], + [[0.0, 0.693148], [0.0, 0.0]], + [[0.0, 0.0], [2.302585, 1.609438]], + ] + + # total_entries = 12 + # valid_entries = 8 + expected_mask = backend.constant( + [ + [[True, True], [False, False]], + [[True, True], [False, False]], + [[True, True], [True, True]], + ] + ) + + result = backend.sparse_categorical_crossentropy(t, p, ignore_class=-1) + mask = losses_utils.get_mask(result) + + self.assertIsNotNone( + mask, + "expected sparse_categorical_crossentropy to set the " + "`_keras_mask` attribute when `ignore_class is not None`, " + "which indicates which loss values are valid.", + ) + + result = self.evaluate(result) + mask = self.evaluate(mask) + self.assertAllEqual(mask, expected_mask) + self.assertAllClose(result, expected_result, atol=1e-6) + + @test_combinations.generate(test_combinations.combine(mode=["graph"])) + def test_sparse_categorical_crossentropy_loss_with_unknown_rank_tensor( + self, + ): + # This test only runs in graph because the TF op layer is not supported + # yet for sparse ops. 
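# [Editorial note, not part of the diff: backend.placeholder() with no shape
# produces a tensor of unknown rank. Without a known rank the backend cannot
# transpose the class axis to the end, so only the default axis=-1 is
# supported; the axis=0 calls below therefore raise ValueError.]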
+ t = backend.placeholder() + p = backend.placeholder() + o = backend.sparse_categorical_crossentropy(t, p) + + t_val = tf.convert_to_tensor([0, 1, 2]) + p_val = tf.convert_to_tensor( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.06], [0.05, 0.01, 0.94]] + ) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.105, 0.116, 0.062], 1e-3) + + # With axis set + with self.assertRaisesRegex( + ValueError, + "Cannot compute sparse categorical crossentropy with `axis=0`", + ): + o = backend.sparse_categorical_crossentropy(t, p, axis=0) + f = backend.function([t, p], o) + + _ = f([t_val, p_val]) + + # from logits + p_val = tf.convert_to_tensor( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + o = backend.sparse_categorical_crossentropy(t, p, from_logits=True) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.002, 0, 0.17], 1e-3) + + # from logits and axis set + with self.assertRaisesRegex( + ValueError, + "Cannot compute sparse categorical crossentropy with `axis=0`", + ): + o = backend.sparse_categorical_crossentropy( + t, p, from_logits=True, axis=0 + ) + f = backend.function([t, p], o) + + _ = f([t_val, p_val]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_categorical_crossentropy_with_softmax(self): + t = backend.constant([0, 1, 2]) + logits = backend.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + p = backend.softmax(logits) + p = tf.identity(tf.identity(p)) + result = self.evaluate(backend.sparse_categorical_crossentropy(t, p)) + self.assertArrayNear(result, [0.002, 0.0005, 0.17], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_crossentropy_from_logits_no_warnings(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + with warnings.catch_warnings(record=True) as w: + self.evaluate( + backend.binary_crossentropy(t, logits, from_logits=True) + ) + self.assertEmpty(w) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_crossentropy_from_logits_with_sigmoid(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + p = activations.sigmoid(logits) + with warnings.catch_warnings(record=True) as w: + self.evaluate(backend.binary_crossentropy(t, p, from_logits=True)) + self.assertLen(w, 1) + self.assertIn("received `from_logits=True`", str(w[0].message)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_crossentropy_from_logits_with_softmax(self): + t = backend.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + logits = backend.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + p = activations.softmax(logits) + with warnings.catch_warnings(record=True) as w: + self.evaluate( + backend.categorical_crossentropy(t, p, from_logits=True) + ) + self.assertLen(w, 1) + self.assertIn("received `from_logits=True`", str(w[0].message)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_categorical_crossentropy_from_logits_with_softmax(self): + t = backend.constant([0, 1, 2]) + logits = backend.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + p = activations.softmax(logits) + with warnings.catch_warnings(record=True) as w: + self.evaluate( + backend.sparse_categorical_crossentropy(t, p, 
from_logits=True) + ) + self.assertLen(w, 1) + self.assertIn("received `from_logits=True`", str(w[0].message)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_focal_crossentropy_with_sigmoid(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + p = backend.sigmoid(logits) + p = tf.identity(tf.identity(p)) + result = self.evaluate( + backend.binary_focal_crossentropy(t, p, gamma=2.0) + ) + self.assertArrayNear(result[0], [7.995, 0.022, 0.701], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_focal_crossentropy_with_softmax(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + p = backend.softmax(logits) + p = tf.identity(tf.identity(p)) + result = self.evaluate( + backend.categorical_focal_crossentropy(t, p, gamma=2.0) + ) + self.assertArrayNear(result, [1.747], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_focal_crossentropy_from_logits(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + result = self.evaluate( + backend.binary_focal_crossentropy( + target=t, + output=logits, + gamma=2.0, + from_logits=True, + ) + ) + self.assertArrayNear(result[0], [7.995, 0.022, 0.701], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_focal_crossentropy_from_logits(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + result = self.evaluate( + backend.categorical_focal_crossentropy( + target=t, + output=logits, + from_logits=True, + ) + ) + self.assertArrayNear(result, [1.7472], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_focal_crossentropy_no_focal_effect_with_zero_gamma(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + p = backend.sigmoid(logits) + p = tf.identity(tf.identity(p)) + gamma = 0 + focal_result = self.evaluate( + backend.binary_focal_crossentropy( + target=t, + output=p, + gamma=gamma, + ) + ) + non_focal_result = self.evaluate(backend.binary_crossentropy(t, p)) + self.assertArrayNear(focal_result[0], non_focal_result[0], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_focal_crossentropy_no_focal_effect(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + p = backend.softmax(logits) + p = tf.identity(tf.identity(p)) + focal_result = self.evaluate( + backend.categorical_focal_crossentropy( + target=t, + output=p, + gamma=0.0, + alpha=1.0, + ) + ) + non_focal_result = self.evaluate(backend.categorical_crossentropy(t, p)) + self.assertArrayNear(focal_result, non_focal_result, 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_weighted_focal_crossentropy_with_sigmoid(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + p = backend.sigmoid(logits) + p = tf.identity(tf.identity(p)) + result = self.evaluate( + backend.binary_focal_crossentropy( + target=t, + output=p, + apply_class_balancing=True, + ) + ) + self.assertArrayNear(result[0], [5.996, 0.006, 0.526], 1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def 
test_binary_weighted_focal_crossentropy_from_logits(self): + t = backend.constant([[0, 1, 0]]) + logits = backend.constant([[8.0, 1.0, 1.0]]) + result = self.evaluate( + backend.binary_focal_crossentropy( + target=t, + output=logits, + apply_class_balancing=True, + from_logits=True, + ) + ) + self.assertArrayNear(result[0], [5.996, 0.006, 0.526], 1e-3) - tensor = backend.switch(backend.greater(x, y), xpowy, ypowx) - self.assertEqual(backend.eval(tensor), [9.0]) - def test_unequal_rank(self): - x = tf.convert_to_tensor( - np.array([[1, 2, 3], [4, 5, 6]]), dtype='float32') - y = tf.convert_to_tensor( - np.array([1, 2, 3]), dtype='float32') +@tf_test_utils.with_control_flow_v2 +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class TestCTC(tf.test.TestCase): + def test_ctc_decode(self): + depth = 6 + seq_len_0 = 5 + input_prob_matrix_0 = np.asarray( + [ + [0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908], + [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517], + [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763], + [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655], + [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878], + # Random entry added in at time=5 + [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671], + ], + dtype=np.float32, + ) + + # len max_time_steps array of batch_size x depth matrices + inputs = [ + input_prob_matrix_0[t, :][np.newaxis, :] for t in range(seq_len_0) + ] + 2 * [ # Pad to max_time_steps = 8 + np.zeros((1, depth), dtype=np.float32) + ] + + inputs = backend.variable(np.asarray(inputs).transpose((1, 0, 2))) + + # batch_size length vector of sequence_lengths + input_length = backend.variable(np.array([seq_len_0], dtype=np.int32)) + # batch_size length vector of negative log probabilities + log_prob_truth = np.array( + [-3.5821197, -3.777835], # output beam 0 # output beam 1 + np.float32, + )[np.newaxis, :] + + decode_truth = [ + np.array([1, 0, -1, -1, -1, -1, -1]), + np.array([0, 1, 0, -1, -1, -1, -1]), + ] + beam_width = 2 + top_paths = 2 + + decode_pred_tf, log_prob_pred_tf = backend.ctc_decode( + inputs, + input_length, + greedy=False, + beam_width=beam_width, + top_paths=top_paths, + ) + + self.assertEqual(len(decode_pred_tf), top_paths) + log_prob_pred = backend.eval(log_prob_pred_tf) + for i in range(top_paths): + self.assertTrue( + np.all(decode_truth[i] == backend.eval(decode_pred_tf[i])) + ) + self.assertAllClose(log_prob_truth, log_prob_pred) + + def test_ctc_batch_cost(self): + with self.cached_session(): + label_lens = np.expand_dims(np.asarray([5, 4]), 1) + input_lens = np.expand_dims( + np.asarray([5, 5]), 1 + ) # number of timesteps + loss_log_probs = [3.34211, 5.42262] + + # dimensions are batch x time x categories + labels = np.asarray([[0, 1, 2, 1, 0], [0, 1, 1, 0, -1]]) + inputs = np.asarray( + [ + [ + [ + 0.633766, + 0.221185, + 0.0917319, + 0.0129757, + 0.0142857, + 0.0260553, + ], + [ + 0.111121, + 0.588392, + 0.278779, + 0.0055756, + 0.00569609, + 0.010436, + ], + [ + 0.0357786, + 0.633813, + 0.321418, + 0.00249248, + 0.00272882, + 0.0037688, + ], + [ + 0.0663296, + 0.643849, + 0.280111, + 0.00283995, + 0.0035545, + 0.00331533, + ], + [ + 0.458235, + 0.396634, + 0.123377, + 0.00648837, + 0.00903441, + 0.00623107, + ], + ], + [ + [ + 0.30176, + 0.28562, + 0.0831517, + 0.0862751, + 0.0816851, + 0.161508, + ], + [ + 0.24082, + 0.397533, + 0.0557226, + 0.0546814, + 0.0557528, + 0.19549, + ], + [ + 0.230246, + 0.450868, + 0.0389607, + 0.038309, + 
0.0391602, + 0.202456, + ], + [ + 0.280884, + 0.429522, + 0.0326593, + 0.0339046, + 0.0326856, + 0.190345, + ], + [ + 0.423286, + 0.315517, + 0.0338439, + 0.0393744, + 0.0339315, + 0.154046, + ], + ], + ], + dtype=np.float32, + ) + + labels = backend.variable(labels, dtype="int32") + inputs = backend.variable(inputs, dtype="float32") + input_lens = backend.variable(input_lens, dtype="int32") + label_lens = backend.variable(label_lens, dtype="int32") + res = backend.eval( + backend.ctc_batch_cost(labels, inputs, input_lens, label_lens) + ) + self.assertAllClose(res[:, 0], loss_log_probs, atol=1e-05) + + # test when batch_size = 1, that is, one sample only + ref = [3.34211] + input_lens = np.expand_dims(np.asarray([5]), 1) + label_lens = np.expand_dims(np.asarray([5]), 1) + + labels = np.asarray([[0, 1, 2, 1, 0]]) + inputs = np.asarray( + [ + [ + [ + 0.633766, + 0.221185, + 0.0917319, + 0.0129757, + 0.0142857, + 0.0260553, + ], + [ + 0.111121, + 0.588392, + 0.278779, + 0.0055756, + 0.00569609, + 0.010436, + ], + [ + 0.0357786, + 0.633813, + 0.321418, + 0.00249248, + 0.00272882, + 0.0037688, + ], + [ + 0.0663296, + 0.643849, + 0.280111, + 0.00283995, + 0.0035545, + 0.00331533, + ], + [ + 0.458235, + 0.396634, + 0.123377, + 0.00648837, + 0.00903441, + 0.00623107, + ], + ] + ], + dtype=np.float32, + ) + + k_labels = backend.variable(labels, dtype="int32") + k_inputs = backend.variable(inputs, dtype="float32") + k_input_lens = backend.variable(input_lens, dtype="int32") + k_label_lens = backend.variable(label_lens, dtype="int32") + res = backend.eval( + backend.ctc_batch_cost( + k_labels, k_inputs, k_input_lens, k_label_lens + ) + ) + self.assertAllClose(res[:, 0], ref, atol=1e-05) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class TestRandomOps(tf.test.TestCase): + def test_random_normal(self): + np.random.seed(123) + x = backend.random_normal((500, 500)) + val = backend.eval(x) + self.assertAllClose(np.mean(val), 0.0, atol=0.01) + self.assertAllClose(np.std(val), 1.0, atol=0.01) + + def test_random_uniform(self): + np.random.seed(123) + x = backend.random_uniform((500, 500)) + val = backend.eval(x) + self.assertAllClose(np.mean(val), 0.5, atol=0.01) + self.assertAllClose(np.max(val), 1.0, atol=0.01) + self.assertAllClose(np.min(val), 0.0, atol=0.01) + + def test_random_binomial(self): + np.random.seed(123) + x = backend.random_binomial((500, 500), p=0.5) + self.assertAllClose(np.mean(backend.eval(x)), 0.5, atol=0.01) + + def test_truncated_normal(self): + np.random.seed(123) + x = backend.truncated_normal((500, 500), mean=0.0, stddev=1.0) + x = backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0) + y = backend.eval(x) + self.assertAllClose(np.mean(y), 0.0, atol=0.01) + self.assertAllClose(np.std(y), 0.88, atol=0.01) + self.assertAllClose(np.max(y), 2.0, atol=0.01) + self.assertAllClose(np.min(y), -2.0, atol=0.01) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class FunctionTest(tf.test.TestCase): + def test_function_basics(self): + if tf.executing_eagerly(): + self.skipTest("eager backend.function does not support updates") + x1 = backend.placeholder(shape=(), dtype="float32") + x2 = backend.placeholder(shape=(), dtype="int32") + v = backend.variable(10.0) + + y1 = x1 + backend.cast(x2, "float32") + v + y2 = x1 * backend.cast(x2, "float32") + + with tf.control_dependencies([y1]): + u = backend.update(v, x1) + + f = backend.function([x1, x2], [y1, y2], updates=[u]) + output_values = f([2, 3]) + 
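        # With x1=2, x2=3, v=10: y1 = 2 + 3 + 10 = 15 and y2 = 2 * 3 = 6;
+        # the `updates` op then writes x1 (2.0) into `v`, checked below.
+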
self.assertEqual(output_values, [15.0, 6.0]) + self.assertEqual(backend.eval(v), 2.0) + + def test_function_dict_outputs(self): + x_ph = backend.placeholder(shape=(), name="x") + y_ph = backend.placeholder(shape=(), name="y") + outputs = {"x*y": y_ph * x_ph, "x*x": x_ph * x_ph} + + f = backend.function(inputs=[x_ph, y_ph], outputs=outputs) + x, y = 2.0, 5.0 + results = f([x, y]) + + self.assertEqual(results["x*y"], 10.0) + self.assertEqual(results["x*x"], 4) + + def test_function_dict_inputs(self): + placeholders = { + "x": backend.placeholder(shape=()), + "y": backend.placeholder(shape=()), + } + outputs = [placeholders["x"] * placeholders["y"]] + + f = backend.function(inputs=placeholders, outputs=outputs) + results = f({"x": 2.0, "y": 3.0}) + self.assertEqual(results[0], 6.0) + + def test_function_variable_inputs(self): + placeholders = { + "x": backend.placeholder(shape=()), + "y": backend.placeholder(shape=()), + } + outputs = [placeholders["x"] * placeholders["y"]] + + f = backend.function(inputs=placeholders, outputs=outputs) + results = f({"x": backend.variable(2.0), "y": 3.0}) + self.assertEqual(results[0], 6.0) + + def test_function_composite_variable_inputs(self): + if context.executing_eagerly(): + self.skipTest( + "Only graph mode flattens composite tensor inputs into flat " + "tensors." + ) + + class Spec(tf.TypeSpec): + value_type = property(lambda self: CompositeVariable) + + def _serialize(self): + pass + + def _component_specs(self): + pass + + def _to_components(self, value): + return value.variables + + def _from_components(self, variable_list): + return CompositeVariable(variable_list) + + class CompositeVariable(tf.__internal__.CompositeTensor): + def __init__(self, variable_list): + self.variables = variable_list + + @property + def _type_spec(self): + return Spec() + + def _convert_variables_to_tensors(self): + self.variables = tf.nest.map_structure( + tf_utils.convert_variables_to_tensors, self.variables + ) + return self + + placeholders = { + "x": backend.placeholder(shape=()), + "y": backend.placeholder(shape=()), + } + outputs = [placeholders["x"] * placeholders["y"]] + + f = backend.function(inputs=placeholders, outputs=outputs) + results = f({"x": CompositeVariable([backend.variable(2.0)]), "y": 3.0}) + self.assertEqual(results[0], 6.0) + + def test_function_single_input_output(self): + x_ph = backend.placeholder(shape=(), name="x") + output = x_ph * x_ph + f = backend.function(x_ph, output) + result = f(2.0) + self.assertEqual(result, 4.0) + + def test_tuple_updates(self): + if tf.executing_eagerly(): + self.skipTest("eager backend.function does not support updates") + + x_ph = backend.placeholder(ndim=2) + v = backend.variable(np.ones((4, 2))) + output = x_ph**2 + v + new_v = v + x_ph + f = backend.function(x_ph, output, updates=[(v, new_v)]) + input_val = np.random.random((4, 2)) + result = f(input_val) + self.assertAllClose(result, input_val**2 + 1) + self.assertAllClose(backend.get_value(v), np.ones((4, 2)) + input_val) - def true_func(): - return x - def false_func(): - return y +class BackendGraphTests(tf.test.TestCase, parameterized.TestCase): + @test_combinations.generate(test_combinations.combine(mode=["graph"])) + def test_function_placeholder_with_default(self): + with backend.get_graph().as_default(): + x1 = tf.compat.v1.placeholder_with_default( + np.array(2.0, dtype="float32"), shape=() + ) + x2 = tf.compat.v1.placeholder_with_default( + np.array(3, dtype="int32"), shape=() + ) + y1 = x1 + backend.cast(x2, "float32") + y2 = x1 * 
backend.cast(x2, "float32") + f = backend.function([x1, x2], [y1, y2]) + output_values = f([4, 5]) + self.assertEqual(output_values, [9.0, 20.0]) + output_values = f([None, None]) + self.assertEqual(output_values, [5.0, 6.0]) + + def test_function_tf_feed_symbols(self): + # Test Keras backend functions with TF tensor inputs. + with tf.Graph().as_default(), self.cached_session(): + # Test feeding a resource variable to `function`. + x1 = backend.placeholder(shape=()) + x2 = backend.placeholder(shape=()) + lr = backend.learning_phase() # Include a placeholder_with_default. + + y1 = backend.variable(10.0) + y2 = 3 + + f = backend.function( + inputs=[x1, x2, lr], + outputs=[x1 + 1, backend.in_train_phase(x2 + 2, x2 - 1)], + ) + outs = f([y1, y2, None]) # Use default learning_phase value. + self.assertEqual(outs, [11.0, 2.0]) + outs = f([y1, y2, 1]) # Set learning phase value. + self.assertEqual(outs, [11.0, 5.0]) + + # Test triggering a callable refresh by changing the input. + y3 = backend.constant(20.0) # Test with tensor + outs = f([y3, y2, None]) + self.assertEqual(outs, [21.0, 2.0]) + + y4 = 4 # Test with non-symbol + outs = f([y4, y2, None]) + self.assertEqual(outs, [5.0, 2.0]) + + # Test with a different dtype + y5 = backend.constant(10.0, dtype="float64") + outs = f([y5, y2, None]) + self.assertEqual(outs, [11.0, 2.0]) + + def test_function_tf_fetches(self): + # Additional operations can be passed to tf.compat.v1.Session().run() + # via its `fetches` arguments. In contrast to `updates` argument of + # backend.function() these do not have control dependency on `outputs` + # so they can run in parallel. Also they should not contribute to output + # of backend.function(). + with tf.Graph().as_default(), self.cached_session(): + x = backend.variable(0.0) + y = backend.variable(0.0) + x_placeholder = backend.placeholder(shape=()) + y_placeholder = backend.placeholder(shape=()) + + f = backend.function( + inputs=[x_placeholder, y_placeholder], + outputs=[x_placeholder + y_placeholder], + updates=[(x, x_placeholder + 1.0)], + fetches=[backend.update(y, 5.0)], + ) + output = f([10.0, 20.0]) + self.assertEqual(output, [30.0]) + self.assertEqual( + backend.get_session().run(fetches=[x, y]), [11.0, 5.0] + ) + + def test_function_tf_feed_dict(self): + # Additional substitutions can be passed to + # `tf.compat.v1.Session().run()` via its `feed_dict` arguments. Note + # that the feed_dict is passed once in the constructor but we can modify + # the values in the dictionary. Through this feed_dict we can provide + # additional substitutions besides Keras inputs. 
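+        # Below, `y_placeholder` never appears in `inputs`; it is fed solely
+        # through `feed_dict`, and mutating that dict between calls changes
+        # what the next invocation feeds.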
+ with tf.Graph().as_default(), self.cached_session(): + x = backend.variable(0.0) + y = backend.variable(0.0) + x_placeholder = backend.placeholder(shape=()) + y_placeholder = backend.placeholder(shape=()) + + feed_dict = {y_placeholder: 3.0} + fetches = [backend.update(y, y_placeholder * 10.0)] + f = backend.function( + inputs=[x_placeholder], + outputs=[x_placeholder + 1.0], + updates=[(x, x_placeholder + 10.0)], + feed_dict=feed_dict, + fetches=fetches, + ) + output = f([10.0]) + self.assertEqual(output, [11.0]) + self.assertEqual( + backend.get_session().run(fetches=[x, y]), [20.0, 30.0] + ) + + # updated value in feed_dict will be modified within the + # K.function() + feed_dict[y_placeholder] = 4.0 + output = f([20.0]) + self.assertEqual(output, [21.0]) + self.assertEqual( + backend.get_session().run(fetches=[x, y]), [30.0, 40.0] + ) + + def test_function_tf_run_options_with_run_metadata(self): + with tf.Graph().as_default(), self.cached_session(): + x_placeholder = backend.placeholder(shape=()) + y_placeholder = backend.placeholder(shape=()) + + run_options = tf.compat.v1.RunOptions(output_partition_graphs=True) + run_metadata = tf.compat.v1.RunMetadata() + # enable run_options. + f = backend.function( + inputs=[x_placeholder, y_placeholder], + outputs=[x_placeholder + y_placeholder], + options=run_options, + run_metadata=run_metadata, + ) + output = f([10.0, 20.0]) + self.assertEqual(output, [30.0]) + self.assertNotEmpty(run_metadata.partition_graphs) + # disable run_options. + f1 = backend.function( + inputs=[x_placeholder, y_placeholder], + outputs=[x_placeholder + y_placeholder], + run_metadata=run_metadata, + ) + output1 = f1([10.0, 20.0]) + self.assertEqual(output1, [30.0]) + self.assertEmpty(run_metadata.partition_graphs) + + def test_function_fetch_callbacks(self): + class CallbackStub: + def __init__(self): + self.times_called = 0 + self.callback_result = 0 + + def _fetch_callback(self, result): + self.times_called += 1 + self.callback_result = result + + with tf.Graph().as_default(), self.cached_session(): + callback = CallbackStub() + x_placeholder = backend.placeholder(shape=()) + y_placeholder = backend.placeholder(shape=()) + + callback_op = x_placeholder * y_placeholder + + f = backend.function( + inputs=[x_placeholder, y_placeholder], + outputs=[x_placeholder + y_placeholder], + ) + f.fetches.append(callback_op) + f.fetch_callbacks[callback_op] = callback._fetch_callback + + _ = f([10.0, 20.0]) + + self.assertEqual(callback.times_called, 1) + self.assertEqual(callback.callback_result, 200) + + def test_get_session_different_graphs(self): + with tf.Graph().as_default(): + x = backend.constant(1) + session = backend.get_session() + self.assertIs(session, backend.get_session((x,))) + self.assertIs(session, backend.get_session()) + with tf.Graph().as_default(): + self.assertIs(session, backend.get_session((x,))) + self.assertIsNot(session, backend.get_session()) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class ControlOpsTests(tf.test.TestCase): + def test_function_switch_basics(self): + x = tf.constant(2.0) + y = tf.constant(3.0) - with self.assertRaisesRegex(ValueError, - 'Rank of `condition` should be less than'): - backend.switch(backend.equal(x, x), false_func, true_func) + def xpowy(): + return backend.pow(x, y) + def ypowx(): + return backend.pow(y, x) -class ContextValueCacheTest(tf.test.TestCase): + tensor = backend.switch(backend.less(x, y), xpowy, ypowx) + self.assertEqual(backend.eval(tensor), [8.0]) + + tensor = 
backend.switch(backend.greater(x, y), xpowy, ypowx) + self.assertEqual(backend.eval(tensor), [9.0]) - def test_cache(self): - cache = backend.ContextValueCache(list) - graph1 = tf.Graph() - graph2 = tf.Graph() + def test_unequal_rank(self): + x = tf.convert_to_tensor( + np.array([[1, 2, 3], [4, 5, 6]]), dtype="float32" + ) + y = tf.convert_to_tensor(np.array([1, 2, 3]), dtype="float32") - cache[graph1].append(1) - with graph1.as_default(): - cache[None].append(2) + def true_func(): + return x - with graph2.as_default(): - cache[None].append(3) - cache[graph2].append(4) + def false_func(): + return y - self.assertAllEqual(cache[graph1], [1, 2]) - self.assertAllEqual(cache[graph2], [3, 4]) + with self.assertRaisesRegex( + ValueError, "Rank of `condition` should be less than" + ): + backend.switch(backend.equal(x, x), false_func, true_func) - with tf.__internal__.eager_context.eager_mode(): - cache[None].append(5) - cache[None].append(6) - self.assertAllEqual(cache[None], [5, 6]) - self.assertLen(cache, 3) +class ContextValueCacheTest(tf.test.TestCase): + def test_cache(self): + cache = backend.ContextValueCache(list) + graph1 = tf.Graph() + graph2 = tf.Graph() - del graph1 - gc.collect() - self.assertLen(cache, 2) + cache[graph1].append(1) + with graph1.as_default(): + cache[None].append(2) - def test_cache_in_parent_graph(self): - cache = backend.ContextValueCache(int) - cache.setdefault(None, backend.constant(5)) + with graph2.as_default(): + cache[None].append(3) + cache[graph2].append(4) - with tf.Graph().as_default() as g: - # g is not a child graph of the default test context, so the recursive - # lookup will create a new default value. - self.assertAllEqual(cache[g], 0) + self.assertAllEqual(cache[graph1], [1, 2]) + self.assertAllEqual(cache[graph2], [3, 4]) - @tf.function - def fn(): - # The function graph is a child of the default test context, so - # __getitem__ will return the previously saved value. - return cache[tf.compat.v1.get_default_graph()] + with tf.__internal__.eager_context.eager_mode(): + cache[None].append(5) + cache[None].append(6) + self.assertAllEqual(cache[None], [5, 6]) - self.assertEqual(self.evaluate(fn()), 5) + self.assertLen(cache, 3) + del graph1 + gc.collect() + self.assertLen(cache, 2) -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class RandomGeneratorTest(tf.test.TestCase, parameterized.TestCase): + def test_cache_in_parent_graph(self): + cache = backend.ContextValueCache(int) + cache.setdefault(None, backend.constant(5)) - def test_generator_reproducibility(self): - seed = 1337 - gen1 = backend.RandomGenerator(seed, rng_type='stateful') - output1 = gen1.random_normal(shape=[2, 3]) - output2 = gen1.random_normal(shape=[2, 3]) - - self.assertNotAllClose(output1, output2) - - gen2 = backend.RandomGenerator(seed, rng_type='stateful') - output3 = gen2.random_normal(shape=[2, 3]) - output4 = gen2.random_normal(shape=[2, 3]) - - if tf.compat.v1.executing_eagerly(): - # Make sure generator with same seed will produce same sequence. 
- self.assertAllEqual(output1, output3) - self.assertAllEqual(output2, output4) - - def test_unseeded(self): - seed = None - gen1 = backend.RandomGenerator(seed, rng_type='stateful') - output1 = gen1.random_normal(shape=[2, 3]) - - gen2 = backend.RandomGenerator(seed, rng_type='stateful') - output2 = gen2.random_normal(shape=[2, 3]) - - self.assertNotAllClose(output1, output2) - - def test_implementation(self): - seed = 1337 - seeded = backend.RandomGenerator(seed, rng_type='stateful') - seeded._maybe_init() - unseeded = backend.RandomGenerator(None, rng_type='stateful') - unseeded._maybe_init() - if tf.compat.v1.executing_eagerly(): - # Make sure we use tf.random.Generator in v2. - self.assertIsNotNone(seeded._generator) - self.assertIsNotNone(unseeded._generator) - else: - # In v1, we can't use tf.random.Generator since it is not compatible with - # graph mode. - self.assertIsNone(seeded._generator) - self.assertIsNone(unseeded._generator) - - def test_unseeded_with_utils_set_random_seed(self): - keras_seed = 1337 - tf_utils.set_random_seed(keras_seed) - gen1 = backend.RandomGenerator(seed=None, rng_type='stateful') - output1 = gen1.random_normal(shape=[2, 3]) - output2 = gen1.random_normal(shape=[2, 3]) - - self.assertNotAllClose(output1, output2) - - # Make sure even with unseeded backend generator, as long as we set the - # keras random seed, it will make the generator to produce the same - # sequence. This will ensure all the client are in sync in the multi-client - # setting, when they all set the keras seed. - tf_utils.set_random_seed(keras_seed) - gen2 = backend.RandomGenerator(seed=None, rng_type='stateful') - output3 = gen2.random_normal(shape=[2, 3]) - output4 = gen2.random_normal(shape=[2, 3]) - - gen3 = backend.RandomGenerator(seed=None, rng_type='stateful') - output5 = gen3.random_normal(shape=[2, 3]) - output6 = gen3.random_normal(shape=[2, 3]) - - if tf.compat.v1.executing_eagerly(): - # The generator is only used in the tf2 with eager. - self.assertAllEqual(output1, output3) - self.assertAllEqual(output2, output4) - - # Also make sure different generator instance are still producing - # different result - self.assertNotAllEqual(output3, output5) - self.assertNotAllEqual(output4, output6) - - def test_force_stateless(self): - gen = backend.RandomGenerator(seed=None, rng_type='stateless') - output1 = gen.random_normal(shape=[2, 3]) - seed1 = gen._seed - output2 = gen.random_normal(shape=[2, 3]) - seed2 = gen._seed - - self.assertAllClose(output1, output2) - # Make sure we always use the same seed, and it is not None - self.assertEqual(seed1, seed2) - self.assertIsNotNone(seed1) - - # Make sure a new seed is used when creating a new generator instance. - gen2 = backend.RandomGenerator(seed=None, rng_type='stateless') - output3 = gen2.random_normal(shape=[2, 3]) - seed3 = gen2._seed - output4 = gen2.random_normal(shape=[2, 3]) - seed4 = gen2._seed - - self.assertAllClose(output3, output4) - self.assertEqual(seed3, seed4) - self.assertNotEqual(seed1, seed3) - - def test_force_stateless_with_seed(self): - seed = 1337 - gen = backend.RandomGenerator(seed=seed, rng_type='stateless') - output1 = gen.random_normal(shape=[2, 3]) - seed1 = gen._seed - output2 = gen.random_normal(shape=[2, 3]) - seed2 = gen._seed - - self.assertAllClose(output1, output2) - # Make sure we always use the same seed, and it is not None - self.assertEqual(seed, seed1) - self.assertEqual(seed, seed2) - - # Make sure RandomGenerator always generate same value with same seed. 
- gen2 = backend.RandomGenerator(seed=seed, rng_type='stateless') - output3 = gen2.random_normal(shape=[2, 3]) - self.assertAllClose(output3, output1) - - @parameterized.named_parameters( - ('seeded', 1337), ('unseeded', None) - ) - def test_stateless_with_seed_delta(self, seed): - gen = backend.RandomGenerator(seed=seed, rng_type='stateless') - output1 = gen.random_normal(shape=[2, 3], nonce=hash((1, 1))) - seed1 = gen._seed - output2 = gen.random_normal(shape=[2, 3], nonce=hash((1, 1))) - seed2 = gen._seed - output3 = gen.random_normal(shape=[2, 3], nonce=hash((2, 1))) - seed3 = gen._seed - - self.assertAllClose(output1, output2) - # Different seed_delta will produce different value. - self.assertNotAllClose(output1, output3) - # Make sure the internal seed is not changed at all. - self.assertEqual(seed1, seed2) - self.assertEqual(seed1, seed3) - - def test_unknown_rng_type(self): - with self.assertRaisesRegex(ValueError, 'Got: unknown'): - backend.RandomGenerator(seed=None, rng_type='unknown') - - def test_prefer_stateless_over_global_generator(self): - try: - generator_enabled = backend.is_tf_random_generator_enabled() - if not generator_enabled: - backend.enable_tf_random_generator() + with tf.Graph().as_default() as g: + # g is not a child graph of the default test context, so the + # recursive lookup will create a new default value. + self.assertAllEqual(cache[g], 0) - seed = 1337 - gen = backend.RandomGenerator(seed=seed, rng_type='stateless') - output1 = gen.random_normal(shape=[2, 3]) - output2 = gen.random_normal(shape=[2, 3]) + @tf.function + def fn(): + # The function graph is a child of the default test context, so + # __getitem__ will return the previously saved value. + return cache[tf.compat.v1.get_default_graph()] - self.assertIsNone(gen._generator) - self.assertAllClose(output1, output2) - finally: - if not generator_enabled: - # Change the global flag back. - backend.disable_tf_random_generator() + self.assertEqual(self.evaluate(fn()), 5) -if __name__ == '__main__': - tf.test.main() +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class RandomGeneratorTest(tf.test.TestCase, parameterized.TestCase): + def test_generator_reproducibility(self): + seed = 1337 + gen1 = backend.RandomGenerator(seed, rng_type="stateful") + output1 = gen1.random_normal(shape=[2, 3]) + output2 = gen1.random_normal(shape=[2, 3]) + + self.assertNotAllClose(output1, output2) + + gen2 = backend.RandomGenerator(seed, rng_type="stateful") + output3 = gen2.random_normal(shape=[2, 3]) + output4 = gen2.random_normal(shape=[2, 3]) + + if tf.compat.v1.executing_eagerly(): + # Make sure generator with same seed will produce same sequence. + self.assertAllEqual(output1, output3) + self.assertAllEqual(output2, output4) + + def test_unseeded(self): + seed = None + gen1 = backend.RandomGenerator(seed, rng_type="stateful") + output1 = gen1.random_normal(shape=[2, 3]) + + gen2 = backend.RandomGenerator(seed, rng_type="stateful") + output2 = gen2.random_normal(shape=[2, 3]) + + self.assertNotAllClose(output1, output2) + + def test_implementation(self): + seed = 1337 + seeded = backend.RandomGenerator(seed, rng_type="stateful") + seeded._maybe_init() + unseeded = backend.RandomGenerator(None, rng_type="stateful") + unseeded._maybe_init() + if tf.compat.v1.executing_eagerly(): + # Make sure we use tf.random.Generator in v2. 
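+            # `_maybe_init()` above forces the lazily-created generator to
+            # exist, so `_generator` is populated under eager execution and
+            # stays None in v1 graph mode.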
+ self.assertIsNotNone(seeded._generator) + self.assertIsNotNone(unseeded._generator) + else: + # In v1, we can't use tf.random.Generator since it is not compatible + # with graph mode. + self.assertIsNone(seeded._generator) + self.assertIsNone(unseeded._generator) + + def test_unseeded_with_utils_set_random_seed(self): + keras_seed = 1337 + tf_utils.set_random_seed(keras_seed) + gen1 = backend.RandomGenerator(seed=None, rng_type="stateful") + output1 = gen1.random_normal(shape=[2, 3]) + output2 = gen1.random_normal(shape=[2, 3]) + + self.assertNotAllClose(output1, output2) + + # Make sure even with unseeded backend generator, as long as we set the + # keras random seed, it will make the generator to produce the same + # sequence. This will ensure all the client are in sync in the + # multi-client setting, when they all set the keras seed. + tf_utils.set_random_seed(keras_seed) + gen2 = backend.RandomGenerator(seed=None, rng_type="stateful") + output3 = gen2.random_normal(shape=[2, 3]) + output4 = gen2.random_normal(shape=[2, 3]) + + gen3 = backend.RandomGenerator(seed=None, rng_type="stateful") + output5 = gen3.random_normal(shape=[2, 3]) + output6 = gen3.random_normal(shape=[2, 3]) + + if tf.compat.v1.executing_eagerly(): + # The generator is only used in the tf2 with eager. + self.assertAllEqual(output1, output3) + self.assertAllEqual(output2, output4) + + # Also make sure different generator instance are still producing + # different result + self.assertNotAllEqual(output3, output5) + self.assertNotAllEqual(output4, output6) + + def test_force_stateless(self): + gen = backend.RandomGenerator(seed=None, rng_type="stateless") + output1 = gen.random_normal(shape=[2, 3]) + seed1 = gen._seed + output2 = gen.random_normal(shape=[2, 3]) + seed2 = gen._seed + + self.assertAllClose(output1, output2) + # Make sure we always use the same seed, and it is not None + self.assertEqual(seed1, seed2) + self.assertIsNotNone(seed1) + + # Make sure a new seed is used when creating a new generator instance. + gen2 = backend.RandomGenerator(seed=None, rng_type="stateless") + output3 = gen2.random_normal(shape=[2, 3]) + seed3 = gen2._seed + output4 = gen2.random_normal(shape=[2, 3]) + seed4 = gen2._seed + + self.assertAllClose(output3, output4) + self.assertEqual(seed3, seed4) + self.assertNotEqual(seed1, seed3) + + def test_force_stateless_with_seed(self): + seed = 1337 + gen = backend.RandomGenerator(seed=seed, rng_type="stateless") + output1 = gen.random_normal(shape=[2, 3]) + seed1 = gen._seed + output2 = gen.random_normal(shape=[2, 3]) + seed2 = gen._seed + + self.assertAllClose(output1, output2) + # Make sure we always use the same seed, and it is not None + self.assertEqual(seed, seed1) + self.assertEqual(seed, seed2) + + # Make sure RandomGenerator always generate same value with same seed. + gen2 = backend.RandomGenerator(seed=seed, rng_type="stateless") + output3 = gen2.random_normal(shape=[2, 3]) + self.assertAllClose(output3, output1) + + @parameterized.named_parameters(("seeded", 1337), ("unseeded", None)) + def test_stateless_with_seed_delta(self, seed): + gen = backend.RandomGenerator(seed=seed, rng_type="stateless") + output1 = gen.random_normal(shape=[2, 3], nonce=hash((1, 1))) + seed1 = gen._seed + output2 = gen.random_normal(shape=[2, 3], nonce=hash((1, 1))) + seed2 = gen._seed + output3 = gen.random_normal(shape=[2, 3], nonce=hash((2, 1))) + seed3 = gen._seed + + self.assertAllClose(output1, output2) + # Different seed_delta will produce different value. 
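+        # The nonce only perturbs the seed passed to the stateless op, so
+        # calls sharing a nonce reproduce the same values while `_seed`
+        # itself is never mutated.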
+ self.assertNotAllClose(output1, output3) + # Make sure the internal seed is not changed at all. + self.assertEqual(seed1, seed2) + self.assertEqual(seed1, seed3) + + def test_unknown_rng_type(self): + with self.assertRaisesRegex(ValueError, "Got: unknown"): + backend.RandomGenerator(seed=None, rng_type="unknown") + + def test_prefer_stateless_over_global_generator(self): + try: + generator_enabled = backend.is_tf_random_generator_enabled() + if not generator_enabled: + backend.enable_tf_random_generator() + + seed = 1337 + gen = backend.RandomGenerator(seed=seed, rng_type="stateless") + output1 = gen.random_normal(shape=[2, 3]) + output2 = gen.random_normal(shape=[2, 3]) + + self.assertIsNone(gen._generator) + self.assertAllClose(output1, output2) + finally: + if not generator_enabled: + # Change the global flag back. + backend.disable_tf_random_generator() + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/BUILD b/keras/benchmarks/BUILD index 37085c716478..eacb26a3a36c 100644 --- a/keras/benchmarks/BUILD +++ b/keras/benchmarks/BUILD @@ -1,9 +1,13 @@ # Description: # Implementation of Keras benchmarks. +# Placeholder: load unaliased py_library +# Placeholder: load unaliased py_test +# Placeholder: load unaliased py_binary load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = ["//visibility:public"], licenses = ["notice"], ) @@ -134,7 +138,7 @@ py_test( ":profiler_lib", "//:expect_tensorflow_installed", "//keras/api:keras_api", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", ], ) diff --git a/keras/benchmarks/benchmark_util.py b/keras/benchmarks/benchmark_util.py index 564fade27d79..a37b71ac0196 100644 --- a/keras/benchmarks/benchmark_util.py +++ b/keras/benchmarks/benchmark_util.py @@ -14,203 +14,211 @@ # ============================================================================== """Common utils for benchmarks.""" -import tensorflow.compat.v2 as tf - import timeit + import numpy as np +from keras import callbacks from keras.benchmarks import distribution_util def get_benchmark_name(name): - """Split the suffix of the benchmark name. + """Split the suffix of the benchmark name. - For example, for the name = 'benchmark_layer_call__Conv2D_small_shape', - the return value is ['Conv2D', 'small', 'shape']. + For example, for the name = 'benchmark_layer_call__Conv2D_small_shape', + the return value is ['Conv2D', 'small', 'shape']. - This is to generate the metadata of the benchmark test. + This is to generate the metadata of the benchmark test. - Args: - name: A string, the benchmark name. + Args: + name: A string, the benchmark name. - Returns: - A list of strings of the suffix in the benchmark name. - """ - if '__' not in name or '_' not in name: - raise ValueError('The format of the benchmark name is wrong.') - return name.split('__')[-1].split('_') + Returns: + A list of strings of the suffix in the benchmark name. + """ + if "__" not in name or "_" not in name: + raise ValueError("The format of the benchmark name is wrong.") + return name.split("__")[-1].split("_") def generate_benchmark_params_cpu_gpu(*params_list): - """Extend the benchmark names with CPU and GPU suffix. - - Args: - *params_list: A list of tuples represents the benchmark parameters. - - Returns: - A list of strings with the benchmark name extended with CPU and GPU suffix. 
- """ - benchmark_params = [] - for params in params_list: - benchmark_params.extend([ - ((param[0] + '_CPU',) + param[1:]) for param in params - ]) - benchmark_params.extend([ - ((param[0] + '_GPU',) + param[1:]) for param in params - ]) - return benchmark_params - - -def get_keras_examples_metadata(keras_model, - batch_size, - impl='.keras.cfit_graph'): - return { - 'model_name': 'keras_examples', - 'implementation': keras_model + impl, - 'parameters': 'bs_' + str(batch_size), - } - - -class TimerCallBack(tf.keras.callbacks.Callback): - """Callback for logging time in each epoch or batch.""" - - def __init__(self): - self.times = [] - self.timer = timeit.default_timer - self.startup_time = timeit.default_timer() - self.recorded_startup = False - - def on_epoch_begin(self, e, logs): - self.epoch_start_time = self.timer() - - def on_epoch_end(self, e, logs): - self.times.append(self.timer() - self.epoch_start_time) - - def on_batch_end(self, e, logs): - if not self.recorded_startup: - self.startup_time = self.timer() - self.startup_time - self.recorded_startup = True - - -def measure_performance(model_fn, - x=None, - y=None, - epochs=2, - batch_size=32, - run_iters=4, - optimizer=None, - loss=None, - metrics=None, - verbose=0, - num_gpus=0, - distribution_strategy='off'): - """Run models and measure the performance. - - Args: - model_fn: Model function to be benchmarked. - x: Input data. See `x` in the `fit()` method of `keras.Model`. - y: Target data. See `y` in the `fit()` method of `keras.Model`. - epochs: Integer. Number of epochs to train the model. - If unspecified, `epochs` will default to 2. - batch_size: Integer. Number of samples per gradient update. If unspecified, - `batch_size` will default to 32. - run_iters: Integer. Number of iterations to run the performance measurement. - If unspecified, `run_iters` will default to 4. - optimizer: String (name of optimizer) or optimizer instance. See - `tf.keras.optimizers`. - loss: String (name of objective function), objective function or - `tf.keras.losses.Loss` instance. See `tf.keras.losses`. - metrics: Lists of metrics to be evaluated by the model during training. See - `metrics` in the `compile()` method of `keras.Model`. - verbose: 0, 1, 2. Verbosity mode. See `verbose` in the `fit()` method of - `keras.Model`. If unspecified, `verbose` will default to 0. - num_gpus: Number of GPUs to run the model. - distribution_strategy: Distribution strategies. It could be - `multi_worker_mirrored`, `one_device`, `mirrored`. If unspecified, - `distribution_strategy` will default to 'off'. Note that, `TPU` - and `parameter_server` are not supported yet. - - Returns: - Performance summary, which contains build_time, compile_time, - startup_time, avg_epoch_time, wall_time, exp_per_sec, epochs, - distribution_strategy. - - Raise: - ValueError: If `x` is none or if `optimizer` is not provided or - if `loss` is not provided or if `num_gpus` is negative. - """ - if 'x' is None: - raise ValueError('Input data is required.') - if 'optimizer' is None: - raise ValueError('Optimizer is required.') - if 'loss' is None: - raise ValueError('Loss function is required.') - if num_gpus < 0: - raise ValueError('`num_gpus` cannot be negative') - - # TODO(xingyulong): we will add tfds support later and - # get the `num_examples` from info. 
- num_examples = x.shape[0] - - build_time_list, compile_time_list, startup_time_list = [], [], [] - avg_epoch_time_list, wall_time_list, exp_per_sec_list = [], [], [] - total_num_examples = epochs * num_examples - - strategy = distribution_util.get_distribution_strategy( - distribution_strategy=distribution_strategy, num_gpus=num_gpus) - - for _ in range(run_iters): - timer = timeit.default_timer - start_time = timer() - # Init the distribution strategy scope for each iteration. - strategy_scope = distribution_util.get_strategy_scope(strategy) - with strategy_scope: - t0 = timer() - model = model_fn() - build_time = timer() - t0 - - t1 = timer() - model.compile( - optimizer=optimizer, - loss=loss, - metrics=metrics, - ) - compile_time = timer() - t1 - # Run one warm up epoch. - model.fit(x=x, y=y, batch_size=batch_size, epochs=1) - cbk = TimerCallBack() - t2 = timer() - model.fit( - x=x, - y=y, - batch_size=batch_size, - epochs=epochs, - callbacks=[cbk], - verbose=verbose) - end_time = timer() - - build_time_list.append(build_time) - compile_time_list.append(compile_time) - startup_time_list.append(cbk.startup_time) - avg_epoch_time_list.append(np.mean(cbk.times)) - wall_time_list.append(end_time - start_time) - exp_per_sec_list.append(total_num_examples / (end_time - t2)) - - metrics = [] - metrics.append({'name': 'build_time', 'value': np.mean(build_time_list)}) - metrics.append({'name': 'compile_time', 'value': np.mean(compile_time_list)}) - metrics.append({'name': 'startup_time', 'value': np.mean(startup_time_list)}) - metrics.append({ - 'name': 'avg_epoch_time', - 'value': np.mean(avg_epoch_time_list) - }) - metrics.append({'name': 'exp_per_sec', 'value': np.mean(exp_per_sec_list)}) - metrics.append({'name': 'epochs', 'value': epochs}) - - wall_time = np.mean(wall_time_list) - extras = { - 'distribution_strategy': distribution_strategy, - 'num_gpus': num_gpus - } - - return metrics, wall_time, extras + """Extend the benchmark names with CPU and GPU suffix. + + Args: + *params_list: A list of tuples represents the benchmark parameters. + + Returns: + A list of strings with the benchmark name extended with CPU and GPU + suffix. + """ + benchmark_params = [] + for params in params_list: + benchmark_params.extend( + [((param[0] + "_CPU",) + param[1:]) for param in params] + ) + benchmark_params.extend( + [((param[0] + "_GPU",) + param[1:]) for param in params] + ) + return benchmark_params + + +def get_keras_examples_metadata( + keras_model, batch_size, impl=".keras.cfit_graph" +): + return { + "model_name": "keras_examples", + "implementation": keras_model + impl, + "parameters": "bs_" + str(batch_size), + } + + +class TimerCallBack(callbacks.Callback): + """Callback for logging time in each epoch or batch.""" + + def __init__(self): + self.times = [] + self.timer = timeit.default_timer + self.startup_time = timeit.default_timer() + self.recorded_startup = False + + def on_epoch_begin(self, e, logs): + self.epoch_start_time = self.timer() + + def on_epoch_end(self, e, logs): + self.times.append(self.timer() - self.epoch_start_time) + + def on_batch_end(self, e, logs): + if not self.recorded_startup: + self.startup_time = self.timer() - self.startup_time + self.recorded_startup = True + + +def measure_performance( + model_fn, + x=None, + y=None, + epochs=2, + batch_size=32, + run_iters=4, + optimizer=None, + loss=None, + metrics=None, + verbose=0, + num_gpus=0, + distribution_strategy="off", +): + """Run models and measure the performance. 
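+
+    Each iteration builds a fresh model under the requested distribution
+    strategy scope, times the build, compile and fit phases separately, and
+    the summary averages those timings over `run_iters` runs.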
+ + Args: + model_fn: Model function to be benchmarked. + x: Input data. See `x` in the `fit()` method of `keras.Model`. + y: Target data. See `y` in the `fit()` method of `keras.Model`. + epochs: Integer. Number of epochs to train the model. + If unspecified, `epochs` will default to 2. + batch_size: Integer. Number of samples per gradient update. If + unspecified, `batch_size` will default to 32. + run_iters: Integer. Number of iterations to run the performance + measurement. If unspecified, `run_iters` will default to 4. + optimizer: String (name of optimizer) or optimizer instance. See + `tf.keras.optimizers`. + loss: String (name of objective function), objective function or + `tf.keras.losses.Loss` instance. See `tf.keras.losses`. + metrics: Lists of metrics to be evaluated by the model during training. + See `metrics` in the `compile()` method of `keras.Model`. + verbose: 0, 1, 2. Verbosity mode. See `verbose` in the `fit()` method of + `keras.Model`. If unspecified, `verbose` will default to 0. + num_gpus: Number of GPUs to run the model. + distribution_strategy: Distribution strategies. It could be + `multi_worker_mirrored`, `one_device`, `mirrored`. If unspecified, + `distribution_strategy` will default to 'off'. Note that, `TPU` + and `parameter_server` are not supported yet. + + Returns: + Performance summary, which contains build_time, compile_time, + startup_time, avg_epoch_time, wall_time, exp_per_sec, epochs, + distribution_strategy. + + Raise: + ValueError: If `x` is none or if `optimizer` is not provided or + if `loss` is not provided or if `num_gpus` is negative. + """ + if x is None: + raise ValueError("Input data is required.") + elif optimizer is None: + raise ValueError("Optimizer is required.") + elif loss is None: + raise ValueError("Loss function is required.") + elif num_gpus < 0: + raise ValueError("`num_gpus` cannot be negative") + + # TODO(xingyulong): we will add tfds support later and + # get the `num_examples` from info. + num_examples = x.shape[0] + + build_time_list, compile_time_list, startup_time_list = [], [], [] + avg_epoch_time_list, wall_time_list, exp_per_sec_list = [], [], [] + total_num_examples = epochs * num_examples + + strategy = distribution_util.get_distribution_strategy( + distribution_strategy=distribution_strategy, num_gpus=num_gpus + ) + + for _ in range(run_iters): + timer = timeit.default_timer + start_time = timer() + # Init the distribution strategy scope for each iteration. + strategy_scope = distribution_util.get_strategy_scope(strategy) + with strategy_scope: + t0 = timer() + model = model_fn() + build_time = timer() - t0 + + t1 = timer() + model.compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + ) + compile_time = timer() - t1 + # Run one warm up epoch. 
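+            # The warm-up epoch keeps one-time startup cost (e.g. function
+            # tracing) out of the timed region; only the epochs run after
+            # `t2` below count toward `exp_per_sec`.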
+ model.fit(x=x, y=y, batch_size=batch_size, epochs=1) + cbk = TimerCallBack() + t2 = timer() + model.fit( + x=x, + y=y, + batch_size=batch_size, + epochs=epochs, + callbacks=[cbk], + verbose=verbose, + ) + end_time = timer() + + build_time_list.append(build_time) + compile_time_list.append(compile_time) + startup_time_list.append(cbk.startup_time) + avg_epoch_time_list.append(np.mean(cbk.times)) + wall_time_list.append(end_time - start_time) + exp_per_sec_list.append(total_num_examples / (end_time - t2)) + + metrics = [] + metrics.append({"name": "build_time", "value": np.mean(build_time_list)}) + metrics.append( + {"name": "compile_time", "value": np.mean(compile_time_list)} + ) + metrics.append( + {"name": "startup_time", "value": np.mean(startup_time_list)} + ) + metrics.append( + {"name": "avg_epoch_time", "value": np.mean(avg_epoch_time_list)} + ) + metrics.append({"name": "exp_per_sec", "value": np.mean(exp_per_sec_list)}) + metrics.append({"name": "epochs", "value": epochs}) + + wall_time = np.mean(wall_time_list) + extras = { + "distribution_strategy": distribution_strategy, + "num_gpus": num_gpus, + } + + return metrics, wall_time, extras diff --git a/keras/benchmarks/benchmark_util_test.py b/keras/benchmarks/benchmark_util_test.py index fb14d5ab63b7..a667f53c5fda 100644 --- a/keras/benchmarks/benchmark_util_test.py +++ b/keras/benchmarks/benchmark_util_test.py @@ -20,30 +20,29 @@ class BenchmarkUtilTest(tf.test.TestCase): - - def test_get_benchmark_name(self): - name = "benchmark_layer_call__Conv2D_small_shape" - expected = ["Conv2D", "small", "shape"] - out = benchmark_util.get_benchmark_name(name) - self.assertAllEqual(out, expected) - - def test_generate_benchmark_params_cpu_gpu(self): - adam_opt = tf.keras.optimizers.Adam() - sgd_opt = tf.keras.optimizers.SGD() - params = [ - ("Adam", adam_opt, 10), - ("SGD", sgd_opt, 10), - ] - expected = [ - ("Adam_CPU", adam_opt, 10), - ("SGD_CPU", sgd_opt, 10), - ("Adam_GPU", adam_opt, 10), - ("SGD_GPU", sgd_opt, 10), - ] - - out = benchmark_util.generate_benchmark_params_cpu_gpu(params) - self.assertAllEqual(out, expected) + def test_get_benchmark_name(self): + name = "benchmark_layer_call__Conv2D_small_shape" + expected = ["Conv2D", "small", "shape"] + out = benchmark_util.get_benchmark_name(name) + self.assertAllEqual(out, expected) + + def test_generate_benchmark_params_cpu_gpu(self): + adam_opt = tf.keras.optimizers.Adam() + sgd_opt = tf.keras.optimizers.SGD() + params = [ + ("Adam", adam_opt, 10), + ("SGD", sgd_opt, 10), + ] + expected = [ + ("Adam_CPU", adam_opt, 10), + ("SGD_CPU", sgd_opt, 10), + ("Adam_GPU", adam_opt, 10), + ("SGD_GPU", sgd_opt, 10), + ] + + out = benchmark_util.generate_benchmark_params_cpu_gpu(params) + self.assertAllEqual(out, expected) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/benchmarks/distribution_util.py b/keras/benchmarks/distribution_util.py index 4c180b6ad414..a4868749ed5c 100644 --- a/keras/benchmarks/distribution_util.py +++ b/keras/benchmarks/distribution_util.py @@ -18,168 +18,182 @@ https://github.com/tensorflow/models/blob/master/official/utils/misc/distribution_utils.py. """ -import tensorflow.compat.v2 as tf - import json import os +import tensorflow.compat.v2 as tf + def _collective_communication(all_reduce_alg): - """Return a CollectiveCommunication based on all_reduce_alg. - - Args: - all_reduce_alg: a string specifying which collective communication to pick, - or None. 
- - Returns: - tf.distribute.experimental.CollectiveCommunication object - - Raises: - ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"] - """ - collective_communication_options = { - None: tf.distribute.experimental.CollectiveCommunication.AUTO, - "ring": tf.distribute.experimental.CollectiveCommunication.RING, - "nccl": tf.distribute.experimental.CollectiveCommunication.NCCL - } - if all_reduce_alg not in collective_communication_options: - raise ValueError( - "When used with `multi_worker_mirrored`, valid values for " - "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format( - all_reduce_alg)) - return collective_communication_options[all_reduce_alg] + """Return a CollectiveCommunication based on all_reduce_alg. + + Args: + all_reduce_alg: a string specifying which collective communication to + pick, or None. + + Returns: + tf.distribute.experimental.CollectiveCommunication object + + Raises: + ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"] + """ + collective_communication_options = { + None: tf.distribute.experimental.CollectiveCommunication.AUTO, + "ring": tf.distribute.experimental.CollectiveCommunication.RING, + "nccl": tf.distribute.experimental.CollectiveCommunication.NCCL, + } + if all_reduce_alg not in collective_communication_options: + raise ValueError( + "When used with `multi_worker_mirrored`, valid values for " + "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format( + all_reduce_alg + ) + ) + return collective_communication_options[all_reduce_alg] def _mirrored_cross_device_ops(all_reduce_alg, num_packs): - """Return a CrossDeviceOps based on all_reduce_alg and num_packs. - - Args: - all_reduce_alg: a string specifying which cross device op to pick, or None. - num_packs: an integer specifying number of packs for the cross device op. - - Returns: - tf.distribute.CrossDeviceOps object or None. - - Raises: - ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"]. - """ - if all_reduce_alg is None: - return None - mirrored_all_reduce_options = { - "nccl": tf.distribute.NcclAllReduce, - "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce - } - if all_reduce_alg not in mirrored_all_reduce_options: - raise ValueError( - "When used with `mirrored`, valid values for all_reduce_alg are " - "[`nccl`, `hierarchical_copy`]. Supplied value: {}".format( - all_reduce_alg)) - cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg] - return cross_device_ops_class(num_packs=num_packs) - - -def get_distribution_strategy(distribution_strategy="mirrored", - num_gpus=0, - all_reduce_alg=None, - num_packs=1): - """Return a DistributionStrategy for running the model. - - Args: - distribution_strategy: a string specifying which distribution strategy to - use. Accepted values are "off", "one_device", "mirrored", and - "multi_worker_mirrored" -- case insensitive. "off" means not to use - Distribution Strategy. - num_gpus: Number of GPUs to run this model. - - Returns: - tf.distribute.DistibutionStrategy object. - Raises: - ValueError: if `distribution_strategy` is "off" or "one_device" and - `num_gpus` is larger than 1; or `num_gpus` is negative. 
- """ - if num_gpus < 0: - raise ValueError("`num_gpus` can not be negative.") - - distribution_strategy = distribution_strategy.lower() - - if distribution_strategy == "off": - if num_gpus > 1: - raise ValueError("When {} GPUs are specified, distribution_strategy " - "flag cannot be set to `off`.".format(num_gpus)) - return None - - if distribution_strategy == "multi_worker_mirrored": - return tf.distribute.experimental.MultiWorkerMirroredStrategy( - communication=_collective_communication(all_reduce_alg)) - - if distribution_strategy == "one_device": - if num_gpus == 0: - return tf.distribute.OneDeviceStrategy("device:CPU:0") - if num_gpus > 1: - raise ValueError("`OneDeviceStrategy` can not be used for more than " - "one device.") - return tf.distribute.OneDeviceStrategy("device:GPU:0") - - if distribution_strategy == "mirrored": - if num_gpus == 0: - devices = ["device:CPU:0"] - else: - devices = ["device:GPU:%d" % i for i in range(num_gpus)] - return tf.distribute.MirroredStrategy( - devices=devices, - cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs)) + """Return a CrossDeviceOps based on all_reduce_alg and num_packs. + + Args: + all_reduce_alg: a string specifying which cross device op to pick, or + None. + num_packs: an integer specifying number of packs for the cross device op. + + Returns: + tf.distribute.CrossDeviceOps object or None. + + Raises: + ValueError: if `all_reduce_alg` not in [None, "nccl", + "hierarchical_copy"]. + """ + if all_reduce_alg is None: + return None + mirrored_all_reduce_options = { + "nccl": tf.distribute.NcclAllReduce, + "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce, + } + if all_reduce_alg not in mirrored_all_reduce_options: + raise ValueError( + "When used with `mirrored`, valid values for all_reduce_alg are " + "[`nccl`, `hierarchical_copy`]. Supplied value: {}".format( + all_reduce_alg + ) + ) + cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg] + return cross_device_ops_class(num_packs=num_packs) + + +def get_distribution_strategy( + distribution_strategy="mirrored", + num_gpus=0, + all_reduce_alg=None, + num_packs=1, +): + """Return a DistributionStrategy for running the model. + + Args: + distribution_strategy: a string specifying which distribution strategy to + use. Accepted values are "off", "one_device", "mirrored", and + "multi_worker_mirrored" -- case insensitive. "off" means not to use + Distribution Strategy. + num_gpus: Number of GPUs to run this model. + + Returns: + tf.distribute.DistibutionStrategy object. + Raises: + ValueError: if `distribution_strategy` is "off" or "one_device" and + `num_gpus` is larger than 1; or `num_gpus` is negative. + """ + if num_gpus < 0: + raise ValueError("`num_gpus` can not be negative.") + + distribution_strategy = distribution_strategy.lower() + + if distribution_strategy == "off": + if num_gpus > 1: + raise ValueError( + "When {} GPUs are specified, distribution_strategy " + "flag cannot be set to `off`.".format(num_gpus) + ) + return None + + if distribution_strategy == "multi_worker_mirrored": + return tf.distribute.experimental.MultiWorkerMirroredStrategy( + communication=_collective_communication(all_reduce_alg) + ) + + if distribution_strategy == "one_device": + if num_gpus == 0: + return tf.distribute.OneDeviceStrategy("device:CPU:0") + if num_gpus > 1: + raise ValueError( + "`OneDeviceStrategy` can not be used for more than one device." 
+ ) + return tf.distribute.OneDeviceStrategy("device:GPU:0") + + if distribution_strategy == "mirrored": + if num_gpus == 0: + devices = ["device:CPU:0"] + else: + devices = ["device:GPU:%d" % i for i in range(num_gpus)] + return tf.distribute.MirroredStrategy( + devices=devices, + cross_device_ops=_mirrored_cross_device_ops( + all_reduce_alg, num_packs + ), + ) - raise ValueError("Unrecognized Distribution Strategy: %r" % - distribution_strategy) + raise ValueError( + f"Unrecognized Distribution Strategy: {distribution_strategy}" + ) def configure_cluster(worker_hosts=None, task_index=-1): - """Set multi-worker cluster spec in TF_CONFIG environment variable. - - Args: - worker_hosts: comma-separated list of worker ip:port pairs. - - Returns: - Number of workers in the cluster. - """ - tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) - if tf_config: - num_workers = ( - len(tf_config["cluster"].get("chief", [])) + - len(tf_config["cluster"].get("worker", []))) - elif worker_hosts: - workers = worker_hosts.split(",") - num_workers = len(workers) - if num_workers > 1 and task_index < 0: - raise ValueError("Must specify task_index when number of workers > 1") - task_index = 0 if num_workers == 1 else task_index - os.environ["TF_CONFIG"] = json.dumps({ - "cluster": { - "worker": workers - }, - "task": { - "type": "worker", - "index": task_index - } - }) - else: - num_workers = 1 - return num_workers + """Set multi-worker cluster spec in TF_CONFIG environment variable. + + Args: + worker_hosts: comma-separated list of worker ip:port pairs. + + Returns: + Number of workers in the cluster. + """ + tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) + if tf_config: + num_workers = len(tf_config["cluster"].get("chief", [])) + len( + tf_config["cluster"].get("worker", []) + ) + elif worker_hosts: + workers = worker_hosts.split(",") + num_workers = len(workers) + if num_workers > 1 and task_index < 0: + raise ValueError( + "Must specify task_index when number of workers > 1" + ) + task_index = 0 if num_workers == 1 else task_index + os.environ["TF_CONFIG"] = json.dumps( + { + "cluster": {"worker": workers}, + "task": {"type": "worker", "index": task_index}, + } + ) + else: + num_workers = 1 + return num_workers def get_strategy_scope(strategy): - if strategy: - strategy_scope = strategy.scope() - else: - strategy_scope = DummyContextManager() + if strategy: + strategy_scope = strategy.scope() + else: + strategy_scope = DummyContextManager() - return strategy_scope + return strategy_scope class DummyContextManager: + def __enter__(self): + pass - def __enter__(self): - pass - - def __exit__(self, *args): - pass + def __exit__(self, *args): + pass diff --git a/keras/benchmarks/eager_microbenchmarks_test.py b/keras/benchmarks/eager_microbenchmarks_test.py index dcfcdaadd88c..19b42f750dcd 100644 --- a/keras/benchmarks/eager_microbenchmarks_test.py +++ b/keras/benchmarks/eager_microbenchmarks_test.py @@ -14,193 +14,226 @@ # ============================================================================== """Microbenchmarks for Keras components in eager mode.""" +import time + import tensorflow.compat.v2 as tf -import time +from keras.utils import tf_inspect +# isort: off from tensorflow.python.eager import context from tensorflow.python.eager.context import get_executor -from keras.utils import tf_inspect def _run_benchmark(func, num_iters, execution_mode=None): - with context.execution_mode(execution_mode): - # call func to warm up - func() - if execution_mode == context.ASYNC: - 
get_executor().wait() - start = time.time() - for _ in range(num_iters): - func() - if execution_mode == context.ASYNC: - get_executor().wait() - end = time.time() + with context.execution_mode(execution_mode): + # call func to warm up + func() + if execution_mode == context.ASYNC: + get_executor().wait() + start = time.time() + for _ in range(num_iters): + func() + if execution_mode == context.ASYNC: + get_executor().wait() + end = time.time() - return end - start + return end - start class MicroBenchmarksBase(tf.test.Benchmark): - """Run and report benchmark results.""" - - def run_report(self, run_benchmark, func, num_iters, execution_mode=None): """Run and report benchmark results.""" - total_time = run_benchmark(func, num_iters, execution_mode) - mean_us = total_time * 1e6 / num_iters - metrics = [{ - "name": "exp_per_sec", - "value": float("{0:.3f}".format(num_iters / total_time)) - }, { - "name": "us_per_exp", - "value": float("{0:.3f}".format(total_time * 1e6 / num_iters)) - }] - benchmark_name = self._get_benchmark_name() - self.report_benchmark( - iters=num_iters, - wall_time=mean_us, - metrics=metrics, - name=benchmark_name) - - def _get_benchmark_name(self): - """Mostly copied from benchmark.py _get_name().""" - stack = tf_inspect.stack() - name = None - for frame in stack[::-1]: - f_locals = frame[0].f_locals - f_self = f_locals.get("self", None) - if isinstance(f_self, tf.test.Benchmark): - name = frame[3] # Get the method name - # This is a hack to get around the fact that some methods might have a - # disable_tfrt decorator around them. In that case a function called - # 'decorated' wraps the real called function underneath and so we - # peek one deeper into the stack to get the real name. - if name == "decorated": - continue - else: - break - if name is None: - raise ValueError("Unable to determine calling Benchmark function.") - if tf.__internal__.is_tfrt_enabled(): - name = name + "_tfrt" - return name - - def _run(self, func, num_iters, execution_mode=None): - self.run_report(_run_benchmark, func, num_iters, execution_mode) - - def benchmark_layers_call_overhead(self): - - class OnlyOverheadLayer(tf.keras.layers.Layer): - - def call(self, x): - return x - - layer = OnlyOverheadLayer() - x = tf.convert_to_tensor([[1.]]) - - def fn(): - layer(x) # pylint: disable=not-callable - - self._run(fn, 10000) - - def benchmark_op_layer_call_overhead(self): - model_input = tf.keras.Input(shape=(1,)) - model_output = model_input - x = tf.convert_to_tensor([[1.1]]) - - for _ in range(20): - model_output = tf.multiply(model_output, x) - model = tf.keras.Model(inputs=model_input, outputs=model_output) - - def fn(): - model(x) # pylint: disable=not-callable - - fn() - self._run(fn, 100) - - def benchmark_model_predict_tensorlike_overhead(self): - - class OnlyOverheadLayer(tf.keras.layers.Layer): - - def call(self, x): - return x - - model = tf.keras.Sequential([OnlyOverheadLayer()]) - x = tf.convert_to_tensor([[1.]]) - - def fn(): - model.predict(x) - - self._run(fn, 20) - - def benchmark_layers_embeddings_embedding_overhead(self): - - layer = tf.keras.layers.Embedding(1, 1) - x = tf.zeros((1, 1), dtype="int32") - - def fn(): - layer(x) - - self._run(fn, 10000) - - -class KerasLayerCallOverheadBenchmarks( # pylint: disable=undefined-variable - MicroBenchmarksBase, metaclass=tf.__internal__.test.ParameterizedBenchmark): - - # The set of layers for benchmarking. To add benchmarks for new layers, - # please add the parameter configs to "_benchmark_paramters". 
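The `_run_benchmark` helper reformatted above follows the standard eager micro-benchmark recipe: one untimed warm-up call, then a timed loop, draining the async executor so queued ops are counted. Reduced to its essentials, and leaving out the async branch, the pattern is just this sketch:

import time

def time_eager_call(func, num_iters):
    func()  # warm-up: excludes one-time tracing/allocation cost
    start = time.time()
    for _ in range(num_iters):
        func()
    # In ASYNC execution mode the harness additionally calls
    # get_executor().wait() here before reading the clock.
    return time.time() - start

# run_report() then derives the reported metrics from this total:
#   us_per_exp  = total_time * 1e6 / num_iters
#   exp_per_sec = num_iters / total_time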
- - # The parameter of each layer benchmark is a tuple contains: - # 1) The benchmark name with convention "{module_name}_{layer_name}"; - # 2) The layer instance; - # 3) The shape of the input to the layer; - # 4) The kwargs used in the benchmark. It can include the number of - # iterations to run the benchmarks, and kwargs used in the layer call. - # By default, # of iteration is 10000. - _benchmark_parameters = [ - ("advanced_activations_leaky_relu", tf.keras.layers.LeakyReLU(), - (1, 1)), - ("advanced_activations_prelu", tf.keras.layers.PReLU(), (1, 1)), - ("advanced_activations_elu", tf.keras.layers.ELU(), (1, 1)), - ("advanced_activations_thresholded_relu", - tf.keras.layers.ThresholdedReLU(), (1, 1)), - ("advanced_activations_softmax", tf.keras.layers.Softmax(), (1, 1)), - ("advanced_activations_relu", tf.keras.layers.ReLU(), (1, 1)), - ("core_masking", tf.keras.layers.Masking(), (1, 1)), - ("core_dropout", tf.keras.layers.Dropout(0.5), (1, 1), { - "training": True - }), - ("core_flatten", tf.keras.layers.Flatten(), (1, 1, 1)), - ("core_dense", tf.keras.layers.Dense(1), (1, 1)), - ("convolutional_conv1d", tf.keras.layers.Conv1D(1, (1,)), (1, 1, 1)), - ("convolutional_conv2d", tf.keras.layers.Conv2D(1, (1, 1)), (1, 1, 1, 1)), - ("convolutional_conv3d", tf.keras.layers.Conv3D( - 1, (1, 1, 1)), (1, 1, 1, 1, 1)), - ("batch_norm_fused_inf", tf.keras.layers.BatchNormalization(fused=True), - (1, 1, 1, 1)), - ("batch_norm_fused_train", tf.keras.layers.BatchNormalization(fused=True), - (1, 1, 1, 1), {"training": True}), - ("batch_norm_nonfused_inf", - tf.keras.layers.BatchNormalization(fused=False), (1, 1, 1, 1)), - ("batch_norm_nonfused_train", - tf.keras.layers.BatchNormalization(fused=False), (1, 1, 1, 1), - {"training": True}), - ("normalization_layer_normalization", - tf.keras.layers.LayerNormalization(), (1, 1), - {"iters": 100, "training": True}), - ] - - def benchmark_layer(self, layer, input_shape, kwargs=None): - - x = tf.ones(input_shape) - - def fn(): - layer(x, **(kwargs or {})) - default_iters = 10000 - iters = kwargs.pop("iters", default_iters) if kwargs else default_iters - self._run(fn, iters) + def run_report(self, run_benchmark, func, num_iters, execution_mode=None): + """Run and report benchmark results.""" + total_time = run_benchmark(func, num_iters, execution_mode) + mean_us = total_time * 1e6 / num_iters + metrics = [ + { + "name": "exp_per_sec", + "value": float(f"{num_iters / total_time:.3f}"), + }, + { + "name": "us_per_exp", + "value": float(f"{total_time * 1000000.0 / num_iters:.3f}"), + }, + ] + benchmark_name = self._get_benchmark_name() + self.report_benchmark( + iters=num_iters, + wall_time=mean_us, + metrics=metrics, + name=benchmark_name, + ) + + def _get_benchmark_name(self): + """Mostly copied from benchmark.py _get_name().""" + stack = tf_inspect.stack() + name = None + for frame in stack[::-1]: + f_locals = frame[0].f_locals + f_self = f_locals.get("self", None) + if isinstance(f_self, tf.test.Benchmark): + name = frame[3] # Get the method name + # This is a hack to get around the fact that some methods might + # have a disable_tfrt decorator around them. In that case a + # function called 'decorated' wraps the real called function + # underneath and so we peek one deeper into the stack to get the + # real name. 
+ if name == "decorated": + continue + else: + break + if name is None: + raise ValueError("Unable to determine calling Benchmark function.") + if tf.__internal__.is_tfrt_enabled(): + name = name + "_tfrt" + return name + + def _run(self, func, num_iters, execution_mode=None): + self.run_report(_run_benchmark, func, num_iters, execution_mode) + + def benchmark_layers_call_overhead(self): + class OnlyOverheadLayer(tf.keras.layers.Layer): + def call(self, x): + return x + + layer = OnlyOverheadLayer() + x = tf.convert_to_tensor([[1.0]]) + + def fn(): + layer(x) + + self._run(fn, 10000) + + def benchmark_op_layer_call_overhead(self): + model_input = tf.keras.Input(shape=(1,)) + model_output = model_input + x = tf.convert_to_tensor([[1.1]]) + + for _ in range(20): + model_output = tf.multiply(model_output, x) + model = tf.keras.Model(inputs=model_input, outputs=model_output) + + def fn(): + model(x) + + fn() + self._run(fn, 100) + + def benchmark_model_predict_tensorlike_overhead(self): + class OnlyOverheadLayer(tf.keras.layers.Layer): + def call(self, x): + return x + + model = tf.keras.Sequential([OnlyOverheadLayer()]) + x = tf.convert_to_tensor([[1.0]]) + + def fn(): + model.predict(x) + + self._run(fn, 20) + + def benchmark_layers_embeddings_embedding_overhead(self): + layer = tf.keras.layers.Embedding(1, 1) + x = tf.zeros((1, 1), dtype="int32") + + def fn(): + layer(x) + + self._run(fn, 10000) + + +class KerasLayerCallOverheadBenchmarks( + MicroBenchmarksBase, metaclass=tf.__internal__.test.ParameterizedBenchmark +): + # The set of layers for benchmarking. To add benchmarks for new layers, + # please add the parameter configs to "_benchmark_paramters". + + # The parameter of each layer benchmark is a tuple contains: + # 1) The benchmark name with convention "{module_name}_{layer_name}"; + # 2) The layer instance; + # 3) The shape of the input to the layer; + # 4) The kwargs used in the benchmark. It can include the number of + # iterations to run the benchmarks, and kwargs used in the layer call. + # By default, # of iteration is 10000. 
+ _benchmark_parameters = [ + ( + "advanced_activations_leaky_relu", + tf.keras.layers.LeakyReLU(), + (1, 1), + ), + ("advanced_activations_prelu", tf.keras.layers.PReLU(), (1, 1)), + ("advanced_activations_elu", tf.keras.layers.ELU(), (1, 1)), + ( + "advanced_activations_thresholded_relu", + tf.keras.layers.ThresholdedReLU(), + (1, 1), + ), + ("advanced_activations_softmax", tf.keras.layers.Softmax(), (1, 1)), + ("advanced_activations_relu", tf.keras.layers.ReLU(), (1, 1)), + ("core_masking", tf.keras.layers.Masking(), (1, 1)), + ( + "core_dropout", + tf.keras.layers.Dropout(0.5), + (1, 1), + {"training": True}, + ), + ("core_flatten", tf.keras.layers.Flatten(), (1, 1, 1)), + ("core_dense", tf.keras.layers.Dense(1), (1, 1)), + ("convolutional_conv1d", tf.keras.layers.Conv1D(1, (1,)), (1, 1, 1)), + ( + "convolutional_conv2d", + tf.keras.layers.Conv2D(1, (1, 1)), + (1, 1, 1, 1), + ), + ( + "convolutional_conv3d", + tf.keras.layers.Conv3D(1, (1, 1, 1)), + (1, 1, 1, 1, 1), + ), + ( + "batch_norm_fused_inf", + tf.keras.layers.BatchNormalization(fused=True), + (1, 1, 1, 1), + ), + ( + "batch_norm_fused_train", + tf.keras.layers.BatchNormalization(fused=True), + (1, 1, 1, 1), + {"training": True}, + ), + ( + "batch_norm_nonfused_inf", + tf.keras.layers.BatchNormalization(fused=False), + (1, 1, 1, 1), + ), + ( + "batch_norm_nonfused_train", + tf.keras.layers.BatchNormalization(fused=False), + (1, 1, 1, 1), + {"training": True}, + ), + ( + "normalization_layer_normalization", + tf.keras.layers.LayerNormalization(), + (1, 1), + {"iters": 100, "training": True}, + ), + ] + + def benchmark_layer(self, layer, input_shape, kwargs=None): + x = tf.ones(input_shape) + + def fn(): + layer(x, **(kwargs or {})) + + default_iters = 10000 + iters = kwargs.pop("iters", default_iters) if kwargs else default_iters + self._run(fn, iters) if __name__ == "__main__": - if tf.compat.v1.executing_eagerly(): - # Only run test when eager is enabled (skip test in v1). - tf.test.main() + if tf.compat.v1.executing_eagerly(): + # Only run test when eager is enabled (skip test in v1). + tf.test.main() diff --git a/keras/benchmarks/keras_cpu_benchmark_test.py b/keras/benchmarks/keras_cpu_benchmark_test.py index b2ba3604ab04..6ca5cb8c3870 100644 --- a/keras/benchmarks/keras_cpu_benchmark_test.py +++ b/keras/benchmarks/keras_cpu_benchmark_test.py @@ -14,123 +14,141 @@ # ============================================================================== """Benchmark tests for CPU performance of Keras models.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf from keras.benchmarks import benchmark_util # Loss function and optimizer. -_LOSS = 'binary_crossentropy' -_OPTIMIZER = 'rmsprop' - - -class KerasModelCPUBenchmark( # pylint: disable=undefined-variable - tf.test.Benchmark, metaclass=tf.__internal__.test.ParameterizedBenchmark): - """Required Arguments for measure_performance. - - x: Input data, it could be Numpy or load from tfds. - y: Target data. If `x` is a dataset, generator instance, - `y` should not be specified. - loss: Loss function for model. - optimizer: Optimizer for model. - Other details can see in `measure_performance()` method of - benchmark_util. - """ - # The parameters of each benchmark is a tuple: - - # (benchmark_name_suffix, batch_size, run_iters). - # benchmark_name_suffix: The suffix of the benchmark test name with - # convention `{bs}_{batch_size}`. - # batch_size: Integer. Number of samples per gradient update. - # run_iters: Integer. 
Number of iterations to run the
-  # performance measurement.
-
-  _benchmark_parameters = [
-      ('bs_32', 32, 3), ('bs_64', 64, 2), ('bs_128', 128, 2),
-      ('bs_256', 256, 1), ('bs_512', 512, 1)]
-
-  def _mnist_mlp(self):
-    """Simple MLP model."""
-    model = tf.keras.Sequential()
-    model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(784,)))
-    model.add(tf.keras.layers.Dropout(0.2))
-    model.add(tf.keras.layers.Dense(512, activation='relu'))
-    model.add(tf.keras.layers.Dropout(0.2))
-    model.add(tf.keras.layers.Dense(10, activation='softmax'))
-
-    return model
-
-  def _mnist_convnet(self):
-    """Simple Convnet model."""
-    model = tf.keras.Sequential()
-    model.add(
-        tf.keras.layers.Conv2D(
-            32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
-    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
-    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
-    model.add(tf.keras.layers.Dropout(0.25))
-    model.add(tf.keras.layers.Flatten())
-    model.add(tf.keras.layers.Dense(128, activation='relu'))
-    model.add(tf.keras.layers.Dropout(0.5))
-    model.add(tf.keras.layers.Dense(10, activation='softmax'))
-
-    return model
-
-  def _imdb_lstm(self):
-    """Simple LSTM model."""
-    model = tf.keras.Sequential()
-    model.add(tf.keras.layers.Embedding(20000, 128))
-    model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
-    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
-
-    return model
-
-  def benchmark_mnist_mlp(self, batch_size, run_iters):
-    """Benchmark for MLP model on synthetic mnist data."""
-    mlp_x = np.random.random((5000, 784))
-    mlp_y = np.random.random((5000, 10))
-    metrics, wall_time, extras = benchmark_util.measure_performance(
-        self._mnist_mlp,
-        x=mlp_x,
-        y=mlp_y,
-        batch_size=batch_size,
-        run_iters=run_iters,
-        optimizer=_OPTIMIZER,
-        loss=_LOSS)
-    self.report_benchmark(
-        iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras)
-
-  def benchmark_mnist_convnet(self, batch_size, run_iters):
-    """Benchmark for Convnet model on synthetic mnist data."""
-    convnet_x = np.random.random((5000, 28, 28, 1))
-    convnet_y = np.random.random((5000, 10))
-    metrics, wall_time, extras = benchmark_util.measure_performance(
-        self._mnist_convnet,
-        x=convnet_x,
-        y=convnet_y,
-        batch_size=batch_size,
-        run_iters=run_iters,
-        optimizer=_OPTIMIZER,
-        loss=_LOSS)
-    self.report_benchmark(
-        iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras)
-
-  def benchmark_imdb_lstm(self, batch_size, run_iters):
-    """Benchmark for LSTM model on synthetic imdb review dataset."""
-    lstm_x = np.random.randint(0, 1999, size=(2500, 100))
-    lstm_y = np.random.random((2500, 1))
-    metrics, wall_time, extras = benchmark_util.measure_performance(
-        self._imdb_lstm,
-        x=lstm_x,
-        y=lstm_y,
-        batch_size=batch_size,
-        run_iters=run_iters,
-        optimizer=_OPTIMIZER,
-        loss=_LOSS)
-    self.report_benchmark(
-        iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras)
-
-
-if __name__ == '__main__':
-  tf.test.main()
+_LOSS = "binary_crossentropy"
+_OPTIMIZER = "rmsprop"
+
+
+class KerasModelCPUBenchmark(
+    tf.test.Benchmark, metaclass=tf.__internal__.test.ParameterizedBenchmark
+):
+    """Required Arguments for measure_performance.
+
+    x: Input data, it could be Numpy data or data loaded from tfds.
+    y: Target data. If `x` is a dataset or generator instance,
+      `y` should not be specified.
+    loss: Loss function for model.
+    optimizer: Optimizer for model.
+    Other details can be found in the `measure_performance()` method of
+    benchmark_util.
+ """ + + # The parameters of each benchmark is a tuple: + + # (benchmark_name_suffix, batch_size, run_iters). + # benchmark_name_suffix: The suffix of the benchmark test name with + # convention `{bs}_{batch_size}`. + # batch_size: Integer. Number of samples per gradient update. + # run_iters: Integer. Number of iterations to run the + # performance measurement. + + _benchmark_parameters = [ + ("bs_32", 32, 3), + ("bs_64", 64, 2), + ("bs_128", 128, 2), + ("bs_256", 256, 1), + ("bs_512", 512, 1), + ] + + def _mnist_mlp(self): + """Simple MLP model.""" + model = tf.keras.Sequential() + model.add( + tf.keras.layers.Dense(512, activation="relu", input_shape=(784,)) + ) + model.add(tf.keras.layers.Dropout(0.2)) + model.add(tf.keras.layers.Dense(512, activation="relu")) + model.add(tf.keras.layers.Dropout(0.2)) + model.add(tf.keras.layers.Dense(10, activation="softmax")) + + return model + + def _mnist_convnet(self): + """Simple Convnet model.""" + model = tf.keras.Sequential() + model.add( + tf.keras.layers.Conv2D( + 32, + kernel_size=(3, 3), + activation="relu", + input_shape=(28, 28, 1), + ) + ) + model.add(tf.keras.layers.Conv2D(64, (3, 3), activation="relu")) + model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2))) + model.add(tf.keras.layers.Dropout(0.25)) + model.add(tf.keras.layers.Flatten()) + model.add(tf.keras.layers.Dense(128, activation="relu")) + model.add(tf.keras.layers.Dropout(0.5)) + model.add(tf.keras.layers.Dense(10, activation="softmax")) + + return model + + def _imdb_lstm(self): + """Simple LSTM model.""" + model = tf.keras.Sequential() + model.add(tf.keras.layers.Embedding(20000, 128)) + model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2)) + model.add(tf.keras.layers.Dense(1, activation="sigmoid")) + + return model + + def benchmark_mnist_mlp(self, batch_size, run_iters): + """Benchmark for MLP model on synthetic mnist data.""" + mlp_x = np.random.random((5000, 784)) + mlp_y = np.random.random((5000, 10)) + metrics, wall_time, extras = benchmark_util.measure_performance( + self._mnist_mlp, + x=mlp_x, + y=mlp_y, + batch_size=batch_size, + run_iters=run_iters, + optimizer=_OPTIMIZER, + loss=_LOSS, + ) + self.report_benchmark( + iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_mnist_convnet(self, batch_size, run_iters): + """Benchmark for Convnet model on synthetic mnist data.""" + convnet_x = np.random.random((5000, 28, 28, 1)) + convnet_y = np.random.random((5000, 10)) + metrics, wall_time, extras = benchmark_util.measure_performance( + self._mnist_convnet, + x=convnet_x, + y=convnet_y, + batch_size=batch_size, + run_iters=run_iters, + optimizer=_OPTIMIZER, + loss=_LOSS, + ) + self.report_benchmark( + iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_imdb_lstm(self, batch_size, run_iters): + """Benchmark for LSTM model on synthetic imdb review dataset.""" + lstm_x = np.random.randint(0, 1999, size=(2500, 100)) + lstm_y = np.random.random((2500, 1)) + metrics, wall_time, extras = benchmark_util.measure_performance( + self._imdb_lstm, + x=lstm_x, + y=lstm_y, + batch_size=batch_size, + run_iters=run_iters, + optimizer=_OPTIMIZER, + loss=_LOSS, + ) + self.report_benchmark( + iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/BUILD b/keras/benchmarks/keras_examples_benchmarks/BUILD index 4668cacaf1c5..932a7643a689 100644 --- 
a/keras/benchmarks/keras_examples_benchmarks/BUILD +++ b/keras/benchmarks/keras_examples_benchmarks/BUILD @@ -4,6 +4,7 @@ load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = ["//visibility:public"], licenses = ["notice"], ) diff --git a/keras/benchmarks/keras_examples_benchmarks/README.md b/keras/benchmarks/keras_examples_benchmarks/README.md index a2e460fb9421..42bae76a5e29 100644 --- a/keras/benchmarks/keras_examples_benchmarks/README.md +++ b/keras/benchmarks/keras_examples_benchmarks/README.md @@ -186,7 +186,7 @@ To run benchmarks in [keras/benchmarks](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/keras/benchmarks), please take the following steps: -1. Pull the latest tensorflow repo from github. +1. Pull the latest tensorflow repo from GitHub. 2. Install the Bazel tool which works with tensorflow, please take a look for the [Install bazel](#install-bazel) section. 3. To run benchmarks with Bazel, use the `--benchmarks=.` flags to specify the diff --git a/keras/benchmarks/keras_examples_benchmarks/antirectifier_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/antirectifier_benchmark_test.py index 43e2a405ae51..be16c0a2cb4f 100644 --- a/keras/benchmarks/keras_examples_benchmarks/antirectifier_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/antirectifier_benchmark_test.py @@ -23,140 +23,168 @@ class AntirectifierBenchmark(tf.test.Benchmark): - """Benchmarks for Antirectifier using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - (self.x_train, self.y_train), _ = tf.keras.datasets.mnist.load_data() - self.x_train = self.x_train.reshape(-1, 784) - self.x_train = self.x_train.astype("float32") / 255 - - def _build_model(self): - """Model from https://keras.io/examples/keras_recipes/antirectifier/.""" - model = tf.keras.Sequential([ - tf.keras.Input(shape=(784,)), - tf.keras.layers.Dense(256), - Antirectifier(), - tf.keras.layers.Dense(256), - Antirectifier(), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10), - ]) - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. 
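The comment block above describes the contract shared by every benchmark in these example files. A minimal sketch of the call shape, using a stand-in model function (the names below are illustrative, not from this diff):

import numpy as np
import tensorflow.compat.v2 as tf

from keras.benchmarks import benchmark_util

def build_model():  # stand-in for a _build_model method
    return tf.keras.Sequential([tf.keras.layers.Dense(10)])

x = np.random.random((256, 784))  # synthetic inputs
y = np.random.random((256, 10))   # synthetic targets

# measure_performance() returns (metrics, wall_time, extras); each test
# forwards these to tf.test.Benchmark.report_benchmark().
metrics, wall_time, extras = benchmark_util.measure_performance(
    build_model,
    x=x,
    y=y,
    batch_size=128,
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)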
- def benchmark_antirectifier_bs_128(self): - """Measure performance with batch_size=128.""" - batch_size = 128 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer="rmsprop", - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=["sparse_categorical_accuracy"]) - - metadata = benchmark_util.get_keras_examples_metadata( - "antirectifier", batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_antirectifier_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer="rmsprop", - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=["sparse_categorical_accuracy"]) - - metadata = benchmark_util.get_keras_examples_metadata( - "antirectifier", batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_antirectifier_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer="rmsprop", - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=["sparse_categorical_accuracy"]) - - metadata = benchmark_util.get_keras_examples_metadata( - "antirectifier", batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_antirectifier_bs_512_gpu_2(self): - """Measure performance with batch_size=512, gpu=2 and - - distribution_strategy=`mirrored`. - """ - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - num_gpus=2, - distribution_strategy="mirrored", - optimizer="rmsprop", - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=["sparse_categorical_accuracy"]) - - metadata = benchmark_util.get_keras_examples_metadata( - "antirectifier", batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) + """Benchmarks for Antirectifier using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + (self.x_train, self.y_train), _ = tf.keras.datasets.mnist.load_data() + self.x_train = self.x_train.reshape(-1, 784) + self.x_train = self.x_train.astype("float32") / 255 + + def _build_model(self): + """Model from https://keras.io/examples/keras_recipes/antirectifier/.""" + model = tf.keras.Sequential( + [ + tf.keras.Input(shape=(784,)), + tf.keras.layers.Dense(256), + Antirectifier(), + tf.keras.layers.Dense(256), + Antirectifier(), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10), + ] + ) + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. + # Check more details in `measure_performance()` method of + # benchmark_util. 
+ def benchmark_antirectifier_bs_128(self): + """Measure performance with batch_size=128.""" + batch_size = 128 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer="rmsprop", + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), + metrics=["sparse_categorical_accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "antirectifier", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_antirectifier_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer="rmsprop", + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), + metrics=["sparse_categorical_accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "antirectifier", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_antirectifier_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer="rmsprop", + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), + metrics=["sparse_categorical_accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "antirectifier", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_antirectifier_bs_512_gpu_2(self): + """Measure performance with batch_size=512, gpu=2 and + + distribution_strategy=`mirrored`. + """ + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + optimizer="rmsprop", + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), + metrics=["sparse_categorical_accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "antirectifier", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) class Antirectifier(tf.keras.layers.Layer): - """Build simple custom layer.""" - - def __init__(self, initializer="he_normal", **kwargs): - super().__init__(**kwargs) - self.initializer = tf.keras.initializers.get(initializer) - - def build(self, input_shape): - output_dim = input_shape[-1] - self.kernel = self.add_weight( - shape=(output_dim * 2, output_dim), - initializer=self.initializer, - name="kernel", - trainable=True, - ) - - def call(self, inputs): #pylint: disable=arguments-differ - inputs -= tf.reduce_mean(inputs, axis=-1, keepdims=True) - pos = tf.nn.relu(inputs) - neg = tf.nn.relu(-inputs) - concatenated = tf.concat([pos, neg], axis=-1) - mixed = tf.matmul(concatenated, self.kernel) - return mixed - - def get_config(self): - # Implement get_config to enable serialization. This is optional. 
- base_config = super().get_config() - config = {"initializer": tf.keras.initializers.serialize(self.initializer)} - return dict(list(base_config.items()) + list(config.items())) + """Build simple custom layer.""" + + def __init__(self, initializer="he_normal", **kwargs): + super().__init__(**kwargs) + self.initializer = tf.keras.initializers.get(initializer) + + def build(self, input_shape): + output_dim = input_shape[-1] + self.kernel = self.add_weight( + shape=(output_dim * 2, output_dim), + initializer=self.initializer, + name="kernel", + trainable=True, + ) + + def call(self, inputs): + inputs -= tf.reduce_mean(inputs, axis=-1, keepdims=True) + pos = tf.nn.relu(inputs) + neg = tf.nn.relu(-inputs) + concatenated = tf.concat([pos, neg], axis=-1) + mixed = tf.matmul(concatenated, self.kernel) + return mixed + + def get_config(self): + # Implement get_config to enable serialization. This is optional. + base_config = super().get_config() + config = { + "initializer": tf.keras.initializers.serialize(self.initializer) + } + return dict(list(base_config.items()) + list(config.items())) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/bidirectional_lstm_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/bidirectional_lstm_benchmark_test.py index 65ef5ea6e265..771612a31389 100644 --- a/keras/benchmarks/keras_examples_benchmarks/bidirectional_lstm_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/bidirectional_lstm_benchmark_test.py @@ -23,111 +23,129 @@ class BidirectionalLSTMBenchmark(tf.test.Benchmark): - """Benchmarks for Bidirectional LSTM using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - self.max_feature = 20000 - self.max_len = 200 - (self.imdb_x, self.imdb_y), _ = tf.keras.datasets.imdb.load_data( - num_words=self.max_feature) - self.imdb_x = tf.keras.preprocessing.sequence.pad_sequences( - self.imdb_x, maxlen=self.max_len) - - def _build_model(self): - """Model from https://keras.io/examples/nlp/bidirectional_lstm_imdb/.""" - inputs = tf.keras.Input(shape=(None,), dtype='int32') - x = tf.keras.layers.Embedding(self.max_feature, 128)(inputs) - x = tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM(64, return_sequences=True))( - x) - x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x) - outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x) - model = tf.keras.Model(inputs, outputs) - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. 
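On the `Antirectifier` layer finalized just above: instead of discarding negative activations the way ReLU does, it centers the input, keeps both half-waves via `concat([relu(x), relu(-x)])` (doubling the last axis), and projects back down with its `(2 * dim, dim)` kernel. A quick shape check, assuming the class defined above is in scope:

import tensorflow.compat.v2 as tf

layer = Antirectifier()  # the layer defined in this diff
x = tf.random.normal((4, 256))
y = layer(x)
# concat doubles 256 -> 512; the (512, 256) kernel maps back to 256.
print(y.shape)  # (4, 256)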
- def benchmark_bidirect_lstm_imdb_bs_128(self): - """Measure performance with batch_size=128.""" - batch_size = 128 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'bidirectional_lstm', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_bidirect_lstm_imdb_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'bidirectional_lstm', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_bidirect_lstm_imdb_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'bidirectional_lstm', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_bidirect_lstm_imdb_bs_512_gpu_2(self): - """Measure performance with batch_size=512, gpu=2 and - - distribution_strategy=`mirrored`. - """ - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - num_gpus=2, - distribution_strategy='mirrored', - optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'bidirectional_lstm', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - -if __name__ == '__main__': - tf.test.main() + """Benchmarks for Bidirectional LSTM using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + self.max_feature = 20000 + self.max_len = 200 + (self.imdb_x, self.imdb_y), _ = tf.keras.datasets.imdb.load_data( + num_words=self.max_feature + ) + self.imdb_x = tf.keras.preprocessing.sequence.pad_sequences( + self.imdb_x, maxlen=self.max_len + ) + + def _build_model(self): + """Model from https://keras.io/examples/nlp/bidirectional_lstm_imdb/.""" + inputs = tf.keras.Input(shape=(None,), dtype="int32") + x = tf.keras.layers.Embedding(self.max_feature, 128)(inputs) + x = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(64, return_sequences=True) + )(x) + x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x) + outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x) + model = tf.keras.Model(inputs, outputs) + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. 
+ # Check more details in `measure_performance()` method of + # benchmark_util. + def benchmark_bidirect_lstm_imdb_bs_128(self): + """Measure performance with batch_size=128.""" + batch_size = 128 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + optimizer="adam", + loss="binary_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "bidirectional_lstm", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_bidirect_lstm_imdb_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + optimizer="adam", + loss="binary_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "bidirectional_lstm", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_bidirect_lstm_imdb_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + optimizer="adam", + loss="binary_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "bidirectional_lstm", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_bidirect_lstm_imdb_bs_512_gpu_2(self): + """Measure performance with batch_size=512, gpu=2 and + + distribution_strategy=`mirrored`. 
+ """ + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + optimizer="adam", + loss="binary_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "bidirectional_lstm", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/cifar10_cnn_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/cifar10_cnn_benchmark_test.py index 10b1c1f0d743..cd8537cdd647 100644 --- a/keras/benchmarks/keras_examples_benchmarks/cifar10_cnn_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/cifar10_cnn_benchmark_test.py @@ -23,125 +23,151 @@ class Cifar10CNNBenchmark(tf.test.Benchmark): - """Benchmarks for CNN using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - self.num_classes = 10 - (self.x_train, self.y_train), _ = tf.keras.datasets.cifar10.load_data() - self.x_train = self.x_train.astype('float32') / 255 - self.y_train = tf.keras.utils.to_categorical(self.y_train, self.num_classes) - self.epochs = 5 - - def _build_model(self): - """Model from https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py.""" - model = tf.keras.Sequential() - model.add( - tf.keras.layers.Conv2D( - 32, (3, 3), padding='same', input_shape=self.x_train.shape[1:])) - model.add(tf.keras.layers.Activation('relu')) - model.add(tf.keras.layers.Conv2D(32, (3, 3))) - model.add(tf.keras.layers.Activation('relu')) - model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2))) - model.add(tf.keras.layers.Dropout(0.25)) - - model.add(tf.keras.layers.Conv2D(64, (3, 3), padding='same')) - model.add(tf.keras.layers.Activation('relu')) - model.add(tf.keras.layers.Conv2D(64, (3, 3))) - model.add(tf.keras.layers.Activation('relu')) - model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2))) - model.add(tf.keras.layers.Dropout(0.25)) - - model.add(tf.keras.layers.Flatten()) - model.add(tf.keras.layers.Dense(512)) - model.add(tf.keras.layers.Activation('relu')) - model.add(tf.keras.layers.Dropout(0.5)) - model.add(tf.keras.layers.Dense(self.num_classes)) - model.add(tf.keras.layers.Activation('softmax')) - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. 
- def benchmark_cnn_cifar10_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('cnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_cnn_cifar10_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('cnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_cnn_cifar10_bs_1024(self): - """Measure performance with batch_size=1024.""" - batch_size = 1024 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('cnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_cnn_cifar10_bs_1024_gpu_2(self): - """Measure performance with batch_size=1024, gpu=2 and - - distribution_strategy=`mirrored`. - """ - batch_size = 1024 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - num_gpus=2, - distribution_strategy='mirrored', - epochs=self.epochs, - optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('cnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - -if __name__ == '__main__': - tf.test.main() + """Benchmarks for CNN using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + self.num_classes = 10 + (self.x_train, self.y_train), _ = tf.keras.datasets.cifar10.load_data() + self.x_train = self.x_train.astype("float32") / 255 + self.y_train = tf.keras.utils.to_categorical( + self.y_train, self.num_classes + ) + self.epochs = 5 + + def _build_model(self): + """Model from + https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py. 
+ """ + model = tf.keras.Sequential() + model.add( + tf.keras.layers.Conv2D( + 32, (3, 3), padding="same", input_shape=self.x_train.shape[1:] + ) + ) + model.add(tf.keras.layers.Activation("relu")) + model.add(tf.keras.layers.Conv2D(32, (3, 3))) + model.add(tf.keras.layers.Activation("relu")) + model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2))) + model.add(tf.keras.layers.Dropout(0.25)) + + model.add(tf.keras.layers.Conv2D(64, (3, 3), padding="same")) + model.add(tf.keras.layers.Activation("relu")) + model.add(tf.keras.layers.Conv2D(64, (3, 3))) + model.add(tf.keras.layers.Activation("relu")) + model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2))) + model.add(tf.keras.layers.Dropout(0.25)) + + model.add(tf.keras.layers.Flatten()) + model.add(tf.keras.layers.Dense(512)) + model.add(tf.keras.layers.Activation("relu")) + model.add(tf.keras.layers.Dropout(0.5)) + model.add(tf.keras.layers.Dense(self.num_classes)) + model.add(tf.keras.layers.Activation("softmax")) + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. + # Check more details in `measure_performance()` method of + # benchmark_util. + def benchmark_cnn_cifar10_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=0.0001, decay=1e-6 + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("cnn", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_cnn_cifar10_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=0.0001, decay=1e-6 + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("cnn", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_cnn_cifar10_bs_1024(self): + """Measure performance with batch_size=1024.""" + batch_size = 1024 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=0.0001, decay=1e-6 + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("cnn", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_cnn_cifar10_bs_1024_gpu_2(self): + """Measure performance with batch_size=1024, gpu=2 and + + distribution_strategy=`mirrored`. 
+ """ + batch_size = 1024 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + epochs=self.epochs, + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=0.0001, decay=1e-6 + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("cnn", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/mnist_conv_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/mnist_conv_benchmark_test.py index 47b077373f26..fc5cedd27df2 100644 --- a/keras/benchmarks/keras_examples_benchmarks/mnist_conv_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/mnist_conv_benchmark_test.py @@ -17,122 +17,149 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf from keras.benchmarks import benchmark_util class ConvMnistBenchmark(tf.test.Benchmark): - """Benchmarks for Convnet using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - self.num_classes = 10 - self.input_shape = (28, 28, 1) - (self.x_train, self.y_train), _ = tf.keras.datasets.mnist.load_data() - self.x_train = self.x_train.astype('float32') / 255 - self.x_train = np.expand_dims(self.x_train, -1) - self.y_train = tf.keras.utils.to_categorical(self.y_train, self.num_classes) - self.epochs = 15 - - def _build_model(self): - """Model from https://keras.io/examples/vision/mnist_convnet/.""" - model = tf.keras.Sequential([ - tf.keras.Input(shape=self.input_shape), - tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Flatten(), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(self.num_classes, activation='softmax'), - ]) - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. 
- def benchmark_conv_mnist_bs_128(self): - """Measure performance with batch_size=128.""" - batch_size = 128 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('conv', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_conv_mnist_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('conv', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_conv_mnist_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('conv', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_conv_mnist_bs_512_gpu_2(self): - """Measure performance with batch_size=512, gpu=2 and - - distribution_strategy='mirrored' - """ - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - num_gpus=2, - distribution_strategy='mirrored', - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('conv', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - -if __name__ == '__main__': - tf.test.main() + """Benchmarks for Convnet using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + self.num_classes = 10 + self.input_shape = (28, 28, 1) + (self.x_train, self.y_train), _ = tf.keras.datasets.mnist.load_data() + self.x_train = self.x_train.astype("float32") / 255 + self.x_train = np.expand_dims(self.x_train, -1) + self.y_train = tf.keras.utils.to_categorical( + self.y_train, self.num_classes + ) + self.epochs = 15 + + def _build_model(self): + """Model from https://keras.io/examples/vision/mnist_convnet/.""" + model = tf.keras.Sequential( + [ + tf.keras.Input(shape=self.input_shape), + tf.keras.layers.Conv2D( + 32, kernel_size=(3, 3), activation="relu" + ), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Conv2D( + 64, kernel_size=(3, 3), activation="relu" + ), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Flatten(), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(self.num_classes, activation="softmax"), + ] + ) + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. 
If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. + # Check more details in `measure_performance()` method of + # benchmark_util. + def benchmark_conv_mnist_bs_128(self): + """Measure performance with batch_size=128.""" + batch_size = 128 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "conv", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_conv_mnist_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "conv", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_conv_mnist_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "conv", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_conv_mnist_bs_512_gpu_2(self): + """Measure performance with batch_size=512, gpu=2 and + + distribution_strategy='mirrored' + """ + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "conv", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/mnist_conv_custom_training_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/mnist_conv_custom_training_benchmark_test.py index 79d5c00af563..70762325ee74 100644 --- a/keras/benchmarks/keras_examples_benchmarks/mnist_conv_custom_training_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/mnist_conv_custom_training_benchmark_test.py @@ -17,357 +17,448 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import timeit + import numpy as np +import tensorflow.compat.v2 as tf from keras.benchmarks import benchmark_util from keras.benchmarks import distribution_util class CustomMnistBenchmark(tf.test.Benchmark): - """Benchmarks for custom training loop using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - self.num_classes = 10 - self.input_shape = (28, 28, 1) - self.epochs 
= 15 - (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() - x_train = x_train.astype('float32') / 255 - x_train = np.expand_dims(x_train, -1) - y_train = tf.keras.utils.to_categorical(y_train, self.num_classes) - self.num_examples = x_train.shape[0] - # Use `tf.data.Dataset` for custom training loop. - self.train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - - def _build_model(self): - """Model from https://keras.io/examples/vision/mnist_convnet/.""" - model = tf.keras.Sequential([ - tf.keras.Input(shape=self.input_shape), - tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Flatten(), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(self.num_classes, activation='softmax'), - ]) - - return model - - def compute_loss(self, targets, predictions, loss_fn, batch_size): - """Compute average loss.""" - per_example_loss = loss_fn(targets, predictions) - return tf.nn.compute_average_loss( - per_example_loss, global_batch_size=batch_size) - - @tf.function(reduce_retracing=True) - def train_step(self, inputs, model, loss_fn, optimizer, batch_size): - """Compute loss and optimize model by optimizer. - - Args: - inputs: `tf.data`. - model: See `model` in `train_function()` method. - loss_fn: See `loss_fn` in `train_function()` method. - optimizer: See `optimizer` in `train_function()` method. - batch_size: See `batch_size` in `train_function()` method. - - Returns: - Loss value. - """ - train_x, train_y = inputs - with tf.GradientTape() as tape: - predictions = model(train_x, training=True) - loss = self.compute_loss(train_y, predictions, loss_fn, batch_size) - grads = tape.gradient(loss, model.trainable_weights) - optimizer.apply_gradients(zip(grads, model.trainable_weights)) - return loss - - @tf.function(reduce_retracing=True) - def distributed_train_step(self, batch_dataset, model, loss_fn, optimizer, - batch_size, distribution_strategy): - """Train step in distribution strategy setting. - - Args: - batch_dataset: `tf.data`. - model: See `model` in `train_function()` method. - loss_fn: See `loss_fn` in `train_function()` method. - optimizer: See `optimizer` in `train_function()` method. - batch_size: See `batch_size` in `train_function()` method. - distribution_strategy: See `distribution_strategy` in `train_function()` - method. - - Returns: - Sum of per_replica_losses. - """ - per_replica_losses = distribution_strategy.run( - self.train_step, - args=( - batch_dataset, + """Benchmarks for custom training loop using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + self.num_classes = 10 + self.input_shape = (28, 28, 1) + self.epochs = 15 + (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() + x_train = x_train.astype("float32") / 255 + x_train = np.expand_dims(x_train, -1) + y_train = tf.keras.utils.to_categorical(y_train, self.num_classes) + self.num_examples = x_train.shape[0] + # Use `tf.data.Dataset` for custom training loop. 
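+ # `from_tensor_slices` keeps the full MNIST arrays in memory as a dataset + # of (image, label) pairs; shuffling and batching are applied per benchmark + # below, so each test controls its own batch size.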
+ self.train_dataset = tf.data.Dataset.from_tensor_slices( + (x_train, y_train) + ) + + def _build_model(self): + """Model from https://keras.io/examples/vision/mnist_convnet/.""" + model = tf.keras.Sequential( + [ + tf.keras.Input(shape=self.input_shape), + tf.keras.layers.Conv2D( + 32, kernel_size=(3, 3), activation="relu" + ), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Conv2D( + 64, kernel_size=(3, 3), activation="relu" + ), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Flatten(), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(self.num_classes, activation="softmax"), + ] + ) + + return model + + def compute_loss(self, targets, predictions, loss_fn, batch_size): + """Compute average loss.""" + per_example_loss = loss_fn(targets, predictions) + return tf.nn.compute_average_loss( + per_example_loss, global_batch_size=batch_size + ) + + @tf.function(reduce_retracing=True) + def train_step(self, inputs, model, loss_fn, optimizer, batch_size): + """Compute loss and optimize model by optimizer. + + Args: + inputs: `tf.data`. + model: See `model` in `train_function()` method. + loss_fn: See `loss_fn` in `train_function()` method. + optimizer: See `optimizer` in `train_function()` method. + batch_size: See `batch_size` in `train_function()` method. + + Returns: + Loss value. + """ + train_x, train_y = inputs + with tf.GradientTape() as tape: + predictions = model(train_x, training=True) + loss = self.compute_loss(train_y, predictions, loss_fn, batch_size) + grads = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) + return loss + + @tf.function(reduce_retracing=True) + def distributed_train_step( + self, + batch_dataset, + model, + loss_fn, + optimizer, + batch_size, + distribution_strategy, + ): + """Train step in distribution strategy setting. + + Args: + batch_dataset: `tf.data`. + model: See `model` in `train_function()` method. + loss_fn: See `loss_fn` in `train_function()` method. + optimizer: See `optimizer` in `train_function()` method. + batch_size: See `batch_size` in `train_function()` method. + distribution_strategy: See `distribution_strategy` in + `train_function()` method. + + Returns: + Sum of per_replica_losses. + """ + per_replica_losses = distribution_strategy.run( + self.train_step, + args=( + batch_dataset, + model, + loss_fn, + optimizer, + batch_size, + ), + ) + return distribution_strategy.reduce( + tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None + ) + + def train_function( + self, + model, + train_dataset, + loss_fn, + optimizer, + epochs=2, + distribution_strategy=None, + batch_size=256, + ): + """Train model in custom training loop and return average + + train_step_time. + + Args: + model: `tf.keras.Model` instance to be benchmarked. + train_dataset: `tf.data` dataset. Should return a tuple of either + (inputs, targets) or (inputs, targets, sample_weights). + loss_fn: `tf.keras.losses.Loss` instance. + optimizer: `tf.keras.optimizers` instance. + epochs: Integer. Number of epochs to train the model. If unspecified, + `epochs` will default to 2. + distribution_strategy: Distribution strategies. It could be + `multi_worker_mirrored`, `one_device`, `mirrored`. If unspecified, + `distribution_strategy` will default to `None`, i.e. no distribution. + Note that `TPU` and `parameter_server` are not supported yet. + batch_size: Integer. Number of samples per gradient update. If + unspecified, `batch_size` will default to 256. + + Returns: + Average train_step_time.
+ """ + train_step_time_list = [] + timer = timeit.default_timer + + total_loss = 0.0 + num_batches = 0 + for _ in range(epochs): + # Iterate over the batches of the dataset. + for batch_dataset in train_dataset: + + start_time = timer() + + if distribution_strategy is not None: + total_loss += self.distributed_train_step( + batch_dataset, + model, + loss_fn, + optimizer, + batch_size, + distribution_strategy, + ) + else: + total_loss += self.train_step( + batch_dataset, model, loss_fn, optimizer, batch_size + ) + num_batches += 1 + + end_time = timer() + train_step_time_list.append(end_time - start_time) + + return np.mean(train_step_time_list) + + def measure_performance( + self, + model, + dataset, + loss_fn, + optimizer, + batch_size=32, + run_iters=4, + epochs=10, + distribution_strategy=None, + ): + """Run models and measure the performance. + + Args: + model: `tf.keras.Model` instance to be benchmarked. + dataset: `tf.data` dataset. Should return a tuple of either (inputs, + targets) or (inputs, targets, sample_weights). + loss_fn: `tf.keras.losses.Loss` instance. + optimizer: `tf.keras.optimizers` instance. + batch_size: Integer. Number of samples per gradient update. If + unspecified, `batch_size` will default to 32. + run_iters: Integer. Number of iterations to run the performance + measurement. If unspecified, `run_iters` will default to 4. + epochs: Integer. Number of epochs to train the model. If unspecified, + `epochs` will default to 10. + distribution_strategy: Distribution strategies. It could be + `multi_worker_mirrored`, `one_device`, `mirrored`. If unspecified, + `distribution_strategy` will default to `None`, i.e. no distribution. + Note that `TPU` and `parameter_server` are not supported yet. + + Returns: + Performance summary, which contains avg_epoch_time, wall_time, + exp_per_sec, epochs, warmup_time, train_step_time. + + Raises: + ValueError: if `dataset` is None or if `optimizer` instance is + not provided or if `loss_fn` instance is not provided. + """ + if distribution_strategy is not None and not isinstance( + dataset, tf.distribute.DistributedDataset + ): + raise ValueError( + "tf.distribute.DistributedDataset" + " required in distribution strategy." + ) + + if distribution_strategy is None and not isinstance( + dataset, tf.data.Dataset + ): + raise ValueError("`tf.data` is required.") + + if not isinstance(loss_fn, tf.keras.losses.Loss): + raise ValueError( + "`tf.keras.losses.Loss` instance for loss_fn is required." + ) + + if not isinstance(optimizer, tf.keras.optimizers.Optimizer): + raise ValueError( + "`tf.keras.optimizers` instance for optimizer is required."
+ ) + + avg_epoch_time_list, train_step_time_list = [], [] + wall_time_list, exp_per_sec_list, warmup_time_list = [], [], [] + + total_num_examples = epochs * self.num_examples + + for _ in range(run_iters): + timer = timeit.default_timer + start_time = timer() + t1 = timer() + self.train_function( + model, + dataset, + loss_fn, + optimizer, + 1, + distribution_strategy, + batch_size, + ) + warmup_time = timer() - t1 + + t2 = timer() + train_step_time = self.train_function( + model, + dataset, + loss_fn, + optimizer, + epochs, + distribution_strategy, + batch_size, + ) + end_time = timer() + + train_step_time_list.append(train_step_time) + warmup_time_list.append(warmup_time) + wall_time_list.append(end_time - start_time) + exp_per_sec_list.append(total_num_examples / (end_time - t2)) + avg_epoch_time_list.append((end_time - t2) / epochs) + + metrics = [] + metrics.append( + {"name": "avg_epoch_time", "value": np.mean(avg_epoch_time_list)} + ) + metrics.append( + {"name": "exp_per_sec", "value": np.mean(exp_per_sec_list)} + ) + metrics.append( + {"name": "warmup_time", "value": np.mean(warmup_time_list)} + ) + metrics.append( + {"name": "train_step_time", "value": np.mean(train_step_time_list)} + ) + metrics.append({"name": "epochs", "value": epochs}) + + wall_time = np.mean(wall_time_list) + + return metrics, wall_time + + def benchmark_custom_training_mnist_bs_128(self): + """Measure performance with batch_size=128 and run_iters=5.""" + batch_size = 128 + run_iters = 5 + train_dataset = self.train_dataset.shuffle(buffer_size=1024).batch( + batch_size + ) + + # Instantiate a loss function. + loss_fn = tf.keras.losses.CategoricalCrossentropy( + reduction=tf.keras.losses.Reduction.NONE + ) + # Instantiate an optimizer to train the model. + optimizer = tf.keras.optimizers.Adam() + model = self._build_model() + + metrics, wall_time = self.measure_performance( + model, + train_dataset, + loss_fn, + optimizer, + batch_size, + run_iters, + self.epochs, + ) + extras = benchmark_util.get_keras_examples_metadata( + "conv", batch_size, ".keras.ctl_graph" + ) + self.report_benchmark( + iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_custom_training_mnist_bs_256(self): + """Measure performance with batch_size=256 and run_iters=5.""" + batch_size = 256 + run_iters = 5 + train_dataset = self.train_dataset.shuffle(buffer_size=1024).batch( + batch_size + ) + + # Instantiate a loss function. + loss_fn = tf.keras.losses.CategoricalCrossentropy( + reduction=tf.keras.losses.Reduction.NONE + ) + # Instantiate an optimizer to train the model. + optimizer = tf.keras.optimizers.Adam() + model = self._build_model() + + metrics, wall_time = self.measure_performance( + model, + train_dataset, + loss_fn, + optimizer, + batch_size, + run_iters, + self.epochs, + ) + extras = benchmark_util.get_keras_examples_metadata( + "conv", batch_size, ".keras.ctl_graph" + ) + self.report_benchmark( + iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_custom_training_mnist_bs_512(self): + """Measure performance with batch_size=512 and run_iters=5.""" + batch_size = 512 + run_iters = 5 + train_dataset = self.train_dataset.shuffle(buffer_size=1024).batch( + batch_size + ) + + # Instantiate a loss function. + loss_fn = tf.keras.losses.CategoricalCrossentropy( + reduction=tf.keras.losses.Reduction.NONE + ) + # Instantiate an optimizer to train the model.
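+ # Adam is used with its default hyperparameters. The loss above keeps + # per-example values (Reduction.NONE) so that `compute_loss()` can rescale + # by the global batch size via `tf.nn.compute_average_loss`, which keeps + # gradient scaling correct when the step runs under a mirrored strategy.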
+ optimizer = tf.keras.optimizers.Adam() + model = self._build_model() + + metrics, wall_time = self.measure_performance( + model, + train_dataset, + loss_fn, + optimizer, + batch_size, + run_iters, + self.epochs, + ) + extras = benchmark_util.get_keras_examples_metadata( + "conv", batch_size, ".keras.ctl_graph" + ) + self.report_benchmark( + iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_custom_training_mnist_bs_512_gpu_2(self): + """Measure performance with batch_size=512, run_iters=10, gpu=2 and + + distribution_strategy='mirrored'. + """ + batch_size = 512 + run_iters = 10 + train_dataset = self.train_dataset.shuffle(buffer_size=1024).batch( + batch_size + ) + + distribution_strategy = "mirrored" + + strategy = distribution_util.get_distribution_strategy( + distribution_strategy=distribution_strategy, num_gpus=2 + ) + + if distribution_strategy != "off": + train_dataset = strategy.experimental_distribute_dataset( + train_dataset + ) + + strategy_scope = distribution_util.get_strategy_scope(strategy) + + with strategy_scope: + # Instantiate a loss function. + loss_fn = tf.keras.losses.CategoricalCrossentropy( + reduction=tf.keras.losses.Reduction.NONE + ) + # Instantiate an optimizer to train the model. + optimizer = tf.keras.optimizers.Adam() + model = self._build_model() + + metrics, wall_time = self.measure_performance( model, + train_dataset, loss_fn, optimizer, batch_size, - )) - return distribution_strategy.reduce( - tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) - - def train_function(self, - model, - train_dataset, - loss_fn, - optimizer, - epochs=2, - distribution_strategy=None, - batch_size=256): - """Train model in custom training loop and return average - - train_step_time. - - Args: - model: Model function to be benchmarked. - train_dataset: `tf.data` dataset. Should return a tuple of either (inputs, - targets) or (inputs, targets, sample_weights). - loss_fn: `tf.keras.losses.Loss` instance. - optimizer: `tf.keras.optimizers` instance. - epochs: Integer. Number of epochs to train the model. If unspecified, - `epochs` will default to 2. - distribution_strategy: Distribution strategies. It could be - `multi_worker_mirrored`, `one_device`, `mirrored`. If unspecified, - `distribution_strategy` will default to 'off'. Note that, `TPU` and - `parameter_server` are not supported yet. - batch_size: Integer. Number of samples per gradient update. If - unspecified, `batch_size` will default to 32. - - Returns: - Average train_step_time. - """ - train_step_time_list = [] - timer = timeit.default_timer - - total_loss = 0.0 - num_batches = 0 - for _ in range(epochs): - # Iterate over the batches of the dataset. - for batch_dataset in train_dataset: - - start_time = timer() - - if distribution_strategy is not None: - total_loss += self.distributed_train_step(batch_dataset, model, - loss_fn, optimizer, - batch_size, - distribution_strategy) - else: - total_loss += self.train_step(batch_dataset, model, loss_fn, - optimizer, batch_size) - num_batches += 1 - - end_time = timer() - train_step_time_list.append(end_time - start_time) - - return np.mean(train_step_time_list) - - def measure_performance(self, - model, - dataset, - loss_fn, - optimizer, - batch_size=32, - run_iters=4, - epochs=10, - distribution_strategy=None): - """Run models and measure the performance. - - Args: - model_fn: Model function to be benchmarked. - dataset: `tf.data` dataset. 
Should return a tuple of either (inputs, - targets) or (inputs, targets, sample_weights). - loss_fn: `tf.keras.losses.Loss` instance. - optimizer: `tf.keras.optimizers` instance. - batch_size: Integer. Number of samples per gradient update. If - unspecified, `batch_size` will default to 32. - run_iters: Integer. Number of iterations to run the performance - measurement. If unspecified, `run_iters` will default to 4. - epochs: Integer. Number of epochs to train the model. If unspecified, - `epochs` will default to 10. - distribution_strategy: Distribution strategies. It could be - `multi_worker_mirrored`, `one_device`, `mirrored`. If unspecified, - `distribution_strategy` will default to 'off'. Note that, `TPU` and - `parameter_server` are not supported yet. - - Returns: - Performance summary, which contains build_time, avg_epoch_time, - wall_time, exp_per_sec, epochs, warmup_time, train_step_time. - - Raise: - ValueError: if `dataset` is None or if `optimizer` instance is - not provided or if `loss_fn` instance is not provided. - """ - if distribution_strategy is not None and \ - not isinstance(dataset, tf.distribute.DistributedDataset): - raise ValueError('tf.distribute.DistributedDataset' - ' required in distribution strategy.') - - if distribution_strategy is None and \ - not isinstance(dataset, tf.data.Dataset): - raise ValueError('`tf.data` is required.') - - if not isinstance(loss_fn, tf.keras.losses.Loss): - raise ValueError('`tf.keras.losses.Loss` instance ' - 'for loss_fn is required.') - - if not isinstance(optimizer, tf.keras.optimizers.Optimizer): - raise ValueError('`tf.keras.optimizers` instance ' - 'for optimizer is required.') - - avg_epoch_time_list, train_step_time_list = [], [] - wall_time_list, exp_per_sec_list, warmup_time_list = [], [], [] - - total_num_examples = epochs * self.num_examples - - for _ in range(run_iters): - timer = timeit.default_timer - start_time = timer() - t1 = timer() - self.train_function(model, dataset, loss_fn, optimizer, 1, - distribution_strategy, batch_size) - warmup_time = timer() - t1 - - t2 = timer() - train_step_time = self.train_function(model, dataset, loss_fn, optimizer, - epochs, distribution_strategy, - batch_size) - end_time = timer() - - train_step_time_list.append(train_step_time) - warmup_time_list.append(warmup_time) - wall_time_list.append(end_time - start_time) - exp_per_sec_list.append(total_num_examples / (end_time - t2)) - avg_epoch_time_list.append((end_time - t2) / epochs) - - metrics = [] - metrics.append({ - 'name': 'avg_epoch_time', - 'value': np.mean(avg_epoch_time_list) - }) - metrics.append({'name': 'exp_per_sec', 'value': np.mean(exp_per_sec_list)}) - metrics.append({'name': 'warmup_time', 'value': np.mean(warmup_time_list)}) - metrics.append({ - 'name': 'train_step_time', - 'value': np.mean(train_step_time_list) - }) - metrics.append({'name': 'epochs', 'value': epochs}) - - wall_time = np.mean(wall_time_list) - - return metrics, wall_time - - def benchmark_custom_training_mnist_bs_128(self): - """Measure performance with batch_size=128 and run_iters=5.""" - batch_size = 128 - run_iters = 5 - train_dataset = self.train_dataset.shuffle( - buffer_size=1024).batch(batch_size) - - # Instantiate a loss function. - loss_fn = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.NONE) - # Instantiate an optimizer to train the model. 
- optimizer = tf.keras.optimizers.Adam() - model = self._build_model() - - metrics, wall_time = self.measure_performance(model, train_dataset, loss_fn, - optimizer, batch_size, - run_iters, self.epochs) - extras = benchmark_util.get_keras_examples_metadata('conv', batch_size, - '.keras.ctl_graph') - self.report_benchmark( - iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_custom_training_mnist_bs_256(self): - """Measure performance with batch_size=256 and run_iters=5.""" - batch_size = 256 - run_iters = 5 - train_dataset = self.train_dataset.shuffle( - buffer_size=1024).batch(batch_size) - - # Instantiate a loss function. - loss_fn = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.NONE) - # Instantiate an optimizer to train the model. - optimizer = tf.keras.optimizers.Adam() - model = self._build_model() - - metrics, wall_time = self.measure_performance(model, train_dataset, loss_fn, - optimizer, batch_size, - run_iters, self.epochs) - extras = benchmark_util.get_keras_examples_metadata('conv', batch_size, - '.keras.ctl_graph') - self.report_benchmark( - iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_custom_training_mnist_bs_512(self): - """Measure performance with batch_size=512 and run_iters=10.""" - batch_size = 512 - run_iters = 5 - train_dataset = self.train_dataset.shuffle( - buffer_size=1024).batch(batch_size) - - # Instantiate a loss function. - loss_fn = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.NONE) - # Instantiate an optimizer to train the model. - optimizer = tf.keras.optimizers.Adam() - model = self._build_model() - - metrics, wall_time = self.measure_performance(model, train_dataset, loss_fn, - optimizer, batch_size, - run_iters, self.epochs) - extras = benchmark_util.get_keras_examples_metadata('conv', batch_size, - '.keras.ctl_graph') - self.report_benchmark( - iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_custom_training_mnist_bs_512_gpu_2(self): - """Measure performance with batch_size=512, run_iters=10, gpu=2 and - - distribution_strategy='mirrored'. - """ - batch_size = 512 - run_iters = 10 - train_dataset = self.train_dataset.shuffle( - buffer_size=1024).batch(batch_size) - - distribution_strategy = 'mirrored' - - strategy = distribution_util.get_distribution_strategy( - distribution_strategy=distribution_strategy, num_gpus=2) - - if distribution_strategy != 'off': - train_dataset = strategy.experimental_distribute_dataset(train_dataset) - - strategy_scope = distribution_util.get_strategy_scope(strategy) - - with strategy_scope: - # Instantiate a loss function. - loss_fn = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.NONE) - # Instantiate an optimizer to train the model. 
- optimizer = tf.keras.optimizers.Adam() - model = self._build_model() - - metrics, wall_time = self.measure_performance(model, train_dataset, loss_fn, - optimizer, batch_size, - run_iters, self.epochs, - strategy) - extras = benchmark_util.get_keras_examples_metadata('conv', batch_size, - '.keras.ctl_graph') - self.report_benchmark( - iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras) - - -if __name__ == '__main__': - tf.test.main() + run_iters, + self.epochs, + strategy, + ) + extras = benchmark_util.get_keras_examples_metadata( + "conv", batch_size, ".keras.ctl_graph" + ) + self.report_benchmark( + iters=run_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/mnist_hierarchical_rnn_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/mnist_hierarchical_rnn_benchmark_test.py index a58f2ec36dce..4103c3a3ee40 100644 --- a/keras/benchmarks/keras_examples_benchmarks/mnist_hierarchical_rnn_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/mnist_hierarchical_rnn_benchmark_test.py @@ -23,117 +23,135 @@ class HierarchicalRNNBenchmark(tf.test.Benchmark): - """Benchmarks for Hierarchical RNN using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - self.num_classes = 10 - self.row_hidden, self.col_hidden = 128, 128 - (self.x_train, self.y_train), _ = tf.keras.datasets.mnist.load_data() - self.x_train = self.x_train.reshape(self.x_train.shape[0], 28, 28, 1) - self.x_train = self.x_train.astype('float32') / 255 - self.y_train = tf.keras.utils.to_categorical(self.y_train, self.num_classes) - - def _build_model(self): - """Model from https://github.com/keras-team/keras/blob/master/examples - - /mnist_hierarchical_rnn.py. - """ - row, col, pixel = self.x_train.shape[1:] - inputs = tf.keras.layers.Input(shape=(row, col, pixel)) - encoded_rows = tf.keras.layers.TimeDistributed( - tf.keras.layers.LSTM(self.row_hidden))( - inputs) - encoded_cols = tf.keras.layers.LSTM(self.col_hidden)(encoded_rows) - outputs = tf.keras.layers.Dense( - self.num_classes, activation='softmax')( - encoded_cols) - model = tf.keras.Model(inputs, outputs) - - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. 
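The comment block above recurs across these example benchmark files; the shared call pattern it points at is, schematically, the following (a minimal sketch based on the calls in this diff; `model_fn`, `x_train`, and `y_train` are placeholder names, not part of the patch):

    # Inside a tf.test.Benchmark subclass: measure_performance builds the
    # model via model_fn, trains it, and returns (metrics, wall_time, extras).
    metrics, wall_time, extras = benchmark_util.measure_performance(
        model_fn,  # callable returning a tf.keras.Model
        x=x_train,
        y=y_train,
        batch_size=256,
        optimizer="rmsprop",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    extras.update(
        benchmark_util.get_keras_examples_metadata("hierarchical_rnn", 256)
    )
    self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras)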
- def benchmark_hrnn_mnist_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'hierarchical_rnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_hrnn_mnist_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'hierarchical_rnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_hrnn_mnist_bs_1024(self): - """Measure performance with batch_size=1024.""" - batch_size = 1024 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'hierarchical_rnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_hrnn_mnist_bs_1024_gpu_2(self): - """Measure performance with batch_size=1024, gpu=2 and - - distribution_strategy='mirrored' - """ - batch_size = 1024 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - num_gpus=2, - distribution_strategy='mirrored', - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'hierarchical_rnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - -if __name__ == '__main__': - tf.test.main() + """Benchmarks for Hierarchical RNN using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + self.num_classes = 10 + self.row_hidden, self.col_hidden = 128, 128 + (self.x_train, self.y_train), _ = tf.keras.datasets.mnist.load_data() + self.x_train = self.x_train.reshape(self.x_train.shape[0], 28, 28, 1) + self.x_train = self.x_train.astype("float32") / 255 + self.y_train = tf.keras.utils.to_categorical( + self.y_train, self.num_classes + ) + + def _build_model(self): + """Model from https://github.com/keras-team/keras/blob/master/examples + + /mnist_hierarchical_rnn.py. + """ + row, col, pixel = self.x_train.shape[1:] + inputs = tf.keras.layers.Input(shape=(row, col, pixel)) + encoded_rows = tf.keras.layers.TimeDistributed( + tf.keras.layers.LSTM(self.row_hidden) + )(inputs) + encoded_cols = tf.keras.layers.LSTM(self.col_hidden)(encoded_rows) + outputs = tf.keras.layers.Dense(self.num_classes, activation="softmax")( + encoded_cols + ) + model = tf.keras.Model(inputs, outputs) + + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. 
If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. + # Check more details in `measure_performance()` method of + # benchmark_util. + def benchmark_hrnn_mnist_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "hierarchical_rnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_hrnn_mnist_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "hierarchical_rnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_hrnn_mnist_bs_1024(self): + """Measure performance with batch_size=1024.""" + batch_size = 1024 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "hierarchical_rnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_hrnn_mnist_bs_1024_gpu_2(self): + """Measure performance with batch_size=1024, gpu=2 and + + distribution_strategy='mirrored' + """ + batch_size = 1024 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "hierarchical_rnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/mnist_irnn_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/mnist_irnn_benchmark_test.py index c996b2360132..42dbfede4a4d 100644 --- a/keras/benchmarks/keras_examples_benchmarks/mnist_irnn_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/mnist_irnn_benchmark_test.py @@ -23,113 +23,147 @@ class IRNNMnistBenchmark(tf.test.Benchmark): - """Benchmarks for IRNN using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - self.num_classes = 10 - self.hidden_units = 100 - self.learning_rate = 1e-6 - (self.x_train, self.y_train), _ = tf.keras.datasets.mnist.load_data() - self.x_train = self.x_train.reshape(self.x_train.shape[0], -1, 1) - self.x_train = self.x_train.astype('float32') / 255 - self.y_train = tf.keras.utils.to_categorical(self.y_train, self.num_classes) - - def _build_model(self): - """Model from https://github.com/keras-team/keras/ - - 
blob/master/examples/mnist_irnn.py. - """ - model = tf.keras.Sequential() - model.add( - tf.keras.layers.SimpleRNN( - self.hidden_units, - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001), - recurrent_initializer=tf.keras.initializers.Identity(gain=1.0), - activation='relu', - input_shape=self.x_train.shape[1:])) - model.add(tf.keras.layers.Dense(self.num_classes)) - model.add(tf.keras.layers.Activation('softmax')) - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. - def benchmark_irnn_mnist_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.learning_rate), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('irnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_irnn_mnist_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.learning_rate), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('irnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_irnn_mnist_bs_1024(self): - """Measure performance with batch_size=1024.""" - batch_size = 1024 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.learning_rate), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('irnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_irnn_mnist_bs_1024_gpu_2(self): - """Measure performance with batch_size=1024, gpu=2 and - - distribution_strategy='mirrored' - """ - batch_size = 1024 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - num_gpus=2, - distribution_strategy='mirrored', - optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.learning_rate), - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('irnn', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - -if __name__ == '__main__': - tf.test.main() + """Benchmarks for IRNN using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + self.num_classes = 10 + self.hidden_units = 100 + self.learning_rate = 1e-6 + (self.x_train, self.y_train), _ = 
tf.keras.datasets.mnist.load_data() + self.x_train = self.x_train.reshape(self.x_train.shape[0], -1, 1) + self.x_train = self.x_train.astype("float32") / 255 + self.y_train = tf.keras.utils.to_categorical( + self.y_train, self.num_classes + ) + + def _build_model(self): + """Model from https://github.com/keras-team/keras/ + + blob/master/examples/mnist_irnn.py. + """ + model = tf.keras.Sequential() + model.add( + tf.keras.layers.SimpleRNN( + self.hidden_units, + kernel_initializer=tf.keras.initializers.RandomNormal( + stddev=0.001 + ), + recurrent_initializer=tf.keras.initializers.Identity(gain=1.0), + activation="relu", + input_shape=self.x_train.shape[1:], + ) + ) + model.add(tf.keras.layers.Dense(self.num_classes)) + model.add(tf.keras.layers.Activation("softmax")) + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. + # Check more details in `measure_performance()` method of + # benchmark_util. + def benchmark_irnn_mnist_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=self.learning_rate + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "irnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_irnn_mnist_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=self.learning_rate + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "irnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_irnn_mnist_bs_1024(self): + """Measure performance with batch_size=1024.""" + batch_size = 1024 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=self.learning_rate + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "irnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_irnn_mnist_bs_1024_gpu_2(self): + """Measure performance with batch_size=1024, gpu=2 and + + distribution_strategy='mirrored' + """ + batch_size = 1024 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + optimizer=tf.keras.optimizers.RMSprop( + learning_rate=self.learning_rate + ), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = 
benchmark_util.get_keras_examples_metadata( + "irnn", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/reuters_mlp_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/reuters_mlp_benchmark_test.py index d446713e165b..39fc136c4618 100644 --- a/keras/benchmarks/keras_examples_benchmarks/reuters_mlp_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/reuters_mlp_benchmark_test.py @@ -17,122 +17,140 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf from keras.benchmarks import benchmark_util class MLPReutersBenchmark(tf.test.Benchmark): - """Benchmarks for MLP using `tf.test.Benchmark`.""" - - def __init__(self): - super().__init__() - self.max_words = 1000 - (self.x_train, self.y_train), _ = tf.keras.datasets.reuters.load_data( - num_words=self.max_words) - self.num_classes = np.max(self.y_train) + 1 - tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=self.max_words) - self.x_train = tokenizer.sequences_to_matrix(self.x_train, mode='binary') - self.y_train = tf.keras.utils.to_categorical(self.y_train, self.num_classes) - self.epochs = 5 - - def _build_model(self): - """Model from https://github.com/keras-team/keras/blob/master/ - - examples/reuters_mlp.py. - """ - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(512, input_shape=(self.max_words,))) - model.add(tf.keras.layers.Activation('relu')) - model.add(tf.keras.layers.Dropout(0.5)) - model.add(tf.keras.layers.Dense(self.num_classes)) - model.add(tf.keras.layers.Activation('softmax')) - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. 
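As the IRNN benchmarks above illustrate, `measure_performance` accepts the optimizer either as a Keras string alias or as a configured optimizer instance; both forms appear in this diff:

    optimizer = "adam"  # string alias, as in the conv/MLP benchmarks
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-6)  # instance, as in IRNN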
- def benchmark_mlp_reuters_bs_128(self): - """Measure performance with batch_size=128.""" - batch_size = 128 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('mlp', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_mlp_reuters_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('mlp', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_mlp_reuters_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('mlp', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_mlp_reuters_bs_512_gpu_2(self): - """Measure performance with batch_size=512, gpu=2 and - - distribution_strategy='mirrored' - """ - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.x_train, - y=self.y_train, - batch_size=batch_size, - num_gpus=2, - distribution_strategy='mirrored', - epochs=self.epochs, - optimizer='adam', - loss='categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata('mlp', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - -if __name__ == '__main__': - tf.test.main() + """Benchmarks for MLP using `tf.test.Benchmark`.""" + + def __init__(self): + super().__init__() + self.max_words = 1000 + (self.x_train, self.y_train), _ = tf.keras.datasets.reuters.load_data( + num_words=self.max_words + ) + self.num_classes = np.max(self.y_train) + 1 + tokenizer = tf.keras.preprocessing.text.Tokenizer( + num_words=self.max_words + ) + self.x_train = tokenizer.sequences_to_matrix( + self.x_train, mode="binary" + ) + self.y_train = tf.keras.utils.to_categorical( + self.y_train, self.num_classes + ) + self.epochs = 5 + + def _build_model(self): + """Model from https://github.com/keras-team/keras/blob/master/ + + examples/reuters_mlp.py. + """ + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(512, input_shape=(self.max_words,))) + model.add(tf.keras.layers.Activation("relu")) + model.add(tf.keras.layers.Dropout(0.5)) + model.add(tf.keras.layers.Dense(self.num_classes)) + model.add(tf.keras.layers.Activation("softmax")) + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. 
If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. + # Check more details in `measure_performance()` method of + # benchmark_util. + def benchmark_mlp_reuters_bs_128(self): + """Measure performance with batch_size=128.""" + batch_size = 128 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("mlp", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_mlp_reuters_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("mlp", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_mlp_reuters_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("mlp", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_mlp_reuters_bs_512_gpu_2(self): + """Measure performance with batch_size=512, gpu=2 and + + distribution_strategy='mirrored' + """ + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.x_train, + y=self.y_train, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + epochs=self.epochs, + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata("mlp", batch_size) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/keras_examples_benchmarks/text_classification_transformer_benchmark_test.py b/keras/benchmarks/keras_examples_benchmarks/text_classification_transformer_benchmark_test.py index 7f2af56afcc1..7277c955f215 100644 --- a/keras/benchmarks/keras_examples_benchmarks/text_classification_transformer_benchmark_test.py +++ b/keras/benchmarks/keras_examples_benchmarks/text_classification_transformer_benchmark_test.py @@ -23,215 +23,245 @@ class TextWithTransformerBenchmark(tf.test.Benchmark): - """Benchmarks for Text classification with Transformer - using `tf.test.Benchmark`. 
- """ - - def __init__(self): - super().__init__() - self.max_feature = 20000 - self.max_len = 200 - (self.imdb_x, self.imdb_y), _ = tf.keras.datasets.imdb.load_data( - num_words=self.max_feature) - self.imdb_x = tf.keras.preprocessing.sequence.pad_sequences( - self.imdb_x, maxlen=self.max_len) - - def _build_model(self): - """Model from https://keras.io/examples/nlp/text_classification_with_transformer/.""" - embed_dim = 32 - num_heads = 2 - ff_dim = 32 - inputs = tf.keras.layers.Input(shape=(self.max_len,)) - embedding_layer = TokenAndPositionEmbedding(self.max_len, self.max_feature, - embed_dim) - x = embedding_layer(inputs) #pylint: disable=not-callable - transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim) - x = transformer_block(x) #pylint: disable=not-callable - x = tf.keras.layers.GlobalAvgPool1D()(x) - x = tf.keras.layers.Dropout(0.1)(x) - x = tf.keras.layers.Dense(20, activation='relu')(x) - x = tf.keras.layers.Dropout(0.1)(x) - outputs = tf.keras.layers.Dense(2, activation='softmax')(x) - - model = tf.keras.Model(inputs=inputs, outputs=outputs) - return model - - # In each benchmark test, the required arguments for the - # method `measure_performance` include: - # x: Input data, it could be Numpy or loaded from tfds. - # y: Target data. If `x` is a dataset or generator instance, - # `y` should not be specified. - # loss: Loss function for model. - # optimizer: Optimizer for model. - # Check more details in `measure_performance()` method of - # benchmark_util. - def benchmark_text_classification_bs_128(self): - """Measure performance with batch_size=128.""" - batch_size = 128 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - optimizer='adam', - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'transformer', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_text_classification_bs_256(self): - """Measure performance with batch_size=256.""" - batch_size = 256 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - optimizer='adam', - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'transformer', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_text_classification_bs_512(self): - """Measure performance with batch_size=512.""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - optimizer='adam', - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'transformer', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) - - def benchmark_text_classification_bs_512_gpu_2(self): - """Measure performance with batch_size=512, gpu=1 and - - distribution_strategy='mirrored' + """Benchmarks for Text classification with Transformer + using `tf.test.Benchmark`. 
""" - batch_size = 512 - metrics, wall_time, extras = benchmark_util.measure_performance( - self._build_model, - x=self.imdb_x, - y=self.imdb_y, - batch_size=batch_size, - num_gpus=2, - distribution_strategy='mirrored', - optimizer='adam', - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - metadata = benchmark_util.get_keras_examples_metadata( - 'transformer', batch_size) - extras.update(metadata) - self.report_benchmark(wall_time=wall_time, metrics=metrics, extras=extras) + + def __init__(self): + super().__init__() + self.max_feature = 20000 + self.max_len = 200 + (self.imdb_x, self.imdb_y), _ = tf.keras.datasets.imdb.load_data( + num_words=self.max_feature + ) + self.imdb_x = tf.keras.preprocessing.sequence.pad_sequences( + self.imdb_x, maxlen=self.max_len + ) + + def _build_model(self): + """Model from + https://keras.io/examples/nlp/text_classification_with_transformer/.""" + embed_dim = 32 + num_heads = 2 + ff_dim = 32 + inputs = tf.keras.layers.Input(shape=(self.max_len,)) + embedding_layer = TokenAndPositionEmbedding( + self.max_len, self.max_feature, embed_dim + ) + x = embedding_layer(inputs) + transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim) + x = transformer_block(x) + x = tf.keras.layers.GlobalAvgPool1D()(x) + x = tf.keras.layers.Dropout(0.1)(x) + x = tf.keras.layers.Dense(20, activation="relu")(x) + x = tf.keras.layers.Dropout(0.1)(x) + outputs = tf.keras.layers.Dense(2, activation="softmax")(x) + + model = tf.keras.Model(inputs=inputs, outputs=outputs) + return model + + # In each benchmark test, the required arguments for the + # method `measure_performance` include: + # x: Input data, it could be Numpy or loaded from tfds. + # y: Target data. If `x` is a dataset or generator instance, + # `y` should not be specified. + # loss: Loss function for model. + # optimizer: Optimizer for model. + # Check more details in `measure_performance()` method of + # benchmark_util. 
+ def benchmark_text_classification_bs_128(self): + """Measure performance with batch_size=128.""" + batch_size = 128 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "transformer", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_text_classification_bs_256(self): + """Measure performance with batch_size=256.""" + batch_size = 256 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "transformer", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_text_classification_bs_512(self): + """Measure performance with batch_size=512.""" + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "transformer", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) + + def benchmark_text_classification_bs_512_gpu_2(self): + """Measure performance with batch_size=512, gpu=2 and + + distribution_strategy='mirrored' + """ + batch_size = 512 + metrics, wall_time, extras = benchmark_util.measure_performance( + self._build_model, + x=self.imdb_x, + y=self.imdb_y, + batch_size=batch_size, + num_gpus=2, + distribution_strategy="mirrored", + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + metadata = benchmark_util.get_keras_examples_metadata( + "transformer", batch_size + ) + extras.update(metadata) + self.report_benchmark( + wall_time=wall_time, metrics=metrics, extras=extras + ) class MultiHeadSelfAttention(tf.keras.layers.Layer): - """Implement multi head self attention as a Keras layer.""" - - def __init__(self, embed_dim, num_heads=8): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - if embed_dim % num_heads != 0: - raise ValueError(f'embedding dimension = {embed_dim} should be divisible' - f'by number of heads = {num_heads}') - self.projection_dim = embed_dim // num_heads - self.query_dense = tf.keras.layers.Dense(embed_dim) - self.key_dense = tf.keras.layers.Dense(embed_dim) - self.value_dense = tf.keras.layers.Dense(embed_dim) - self.combine_heads = tf.keras.layers.Dense(embed_dim) - - def attention(self, query, key, value): - score = tf.matmul(query, key, transpose_b=True) - dim_key = tf.cast(tf.shape(key)[-1], tf.float32) - scaled_score = score / tf.math.sqrt(dim_key) - weights = tf.nn.softmax(scaled_score, axis=-1) - output = tf.matmul(weights, value) - return output, weights - - def separate_heads(self, x, batch_size): - x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim)) - return tf.transpose(x, perm=[0, 2, 1, 3]) - - def call(self, inputs): #pylint: disable=arguments-differ - # x.shape = [batch_size, seq_len,
embedding_dim] - batch_size = tf.shape(inputs)[0] - query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim) - key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim) - value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim) - query = self.separate_heads( - query, batch_size) # (batch_size, num_heads, seq_len, projection_dim) - key = self.separate_heads( - key, batch_size) # (batch_size, num_heads, seq_len, projection_dim) - value = self.separate_heads( - value, batch_size) # (batch_size, num_heads, seq_len, projection_dim) - attention, _ = self.attention(query, key, value) - attention = tf.transpose( - attention, perm=[0, 2, 1, - 3]) # (batch_size, seq_len, num_heads, projection_dim) - concat_attention = tf.reshape( - attention, - (batch_size, -1, self.embed_dim)) # (batch_size, seq_len, embed_dim) - output = self.combine_heads( - concat_attention) # (batch_size, seq_len, embed_dim) - return output + """Implement multi head self attention as a Keras layer.""" + + def __init__(self, embed_dim, num_heads=8): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + if embed_dim % num_heads != 0: + raise ValueError( + f"embedding dimension = {embed_dim} should be divisible " + f"by number of heads = {num_heads}" + ) + self.projection_dim = embed_dim // num_heads + self.query_dense = tf.keras.layers.Dense(embed_dim) + self.key_dense = tf.keras.layers.Dense(embed_dim) + self.value_dense = tf.keras.layers.Dense(embed_dim) + self.combine_heads = tf.keras.layers.Dense(embed_dim) + + def attention(self, query, key, value): + score = tf.matmul(query, key, transpose_b=True) + dim_key = tf.cast(tf.shape(key)[-1], tf.float32) + scaled_score = score / tf.math.sqrt(dim_key) + weights = tf.nn.softmax(scaled_score, axis=-1) + output = tf.matmul(weights, value) + return output, weights + + def separate_heads(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, inputs): + # x.shape = [batch_size, seq_len, embedding_dim] + batch_size = tf.shape(inputs)[0] + query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim) + key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim) + value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim) + query = self.separate_heads( + query, batch_size + ) # (batch_size, num_heads, seq_len, projection_dim) + key = self.separate_heads( + key, batch_size + ) # (batch_size, num_heads, seq_len, projection_dim) + value = self.separate_heads( + value, batch_size + ) # (batch_size, num_heads, seq_len, projection_dim) + attention, _ = self.attention(query, key, value) + attention = tf.transpose( + attention, perm=[0, 2, 1, 3] + ) # (batch_size, seq_len, num_heads, projection_dim) + concat_attention = tf.reshape( + attention, (batch_size, -1, self.embed_dim) + ) # (batch_size, seq_len, embed_dim) + output = self.combine_heads( + concat_attention + ) # (batch_size, seq_len, embed_dim) + return output class TransformerBlock(tf.keras.layers.Layer): - """Implement a Transformer block as a layer.""" - - def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1): - super().__init__() - self.att = MultiHeadSelfAttention(embed_dim, num_heads) - self.ffn = tf.keras.Sequential([ - tf.keras.layers.Dense(ff_dim, activation='relu'), - tf.keras.layers.Dense(embed_dim) - ]) - self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) - self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) -
self.dropout1 = tf.keras.layers.Dropout(rate) - self.dropout2 = tf.keras.layers.Dropout(rate) - - def call(self, inputs, training): #pylint: disable=arguments-differ - attn_output = self.att(inputs) #pylint: disable=not-callable - attn_output = self.dropout1(attn_output, training=training) - out1 = self.layernorm1(inputs + attn_output) - ffn_output = self.ffn(out1) - ffn_output = self.dropout2(ffn_output, training=training) - return self.layernorm2(out1 + ffn_output) + """Implement a Transformer block as a layer.""" + + def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1): + super().__init__() + self.att = MultiHeadSelfAttention(embed_dim, num_heads) + self.ffn = tf.keras.Sequential( + [ + tf.keras.layers.Dense(ff_dim, activation="relu"), + tf.keras.layers.Dense(embed_dim), + ] + ) + self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) + self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) + self.dropout1 = tf.keras.layers.Dropout(rate) + self.dropout2 = tf.keras.layers.Dropout(rate) + + def call(self, inputs, training): + attn_output = self.att(inputs) + attn_output = self.dropout1(attn_output, training=training) + out1 = self.layernorm1(inputs + attn_output) + ffn_output = self.ffn(out1) + ffn_output = self.dropout2(ffn_output, training=training) + return self.layernorm2(out1 + ffn_output) class TokenAndPositionEmbedding(tf.keras.layers.Layer): - """Implement embedding layer.""" - - def __init__(self, maxlen, vocab_size, embed_dim): - super().__init__() - self.token_emb = tf.keras.layers.Embedding( - input_dim=vocab_size, output_dim=embed_dim) - self.pos_emb = tf.keras.layers.Embedding( - input_dim=maxlen, output_dim=embed_dim) - - def call(self, x): #pylint: disable=arguments-differ - maxlen = tf.shape(x)[-1] - positions = tf.range(start=0, limit=maxlen, delta=1) - positions = self.pos_emb(positions) - x = self.token_emb(x) - return x + positions - - -if __name__ == '__main__': - tf.test.main() + """Implement embedding layer.""" + + def __init__(self, maxlen, vocab_size, embed_dim): + super().__init__() + self.token_emb = tf.keras.layers.Embedding( + input_dim=vocab_size, output_dim=embed_dim + ) + self.pos_emb = tf.keras.layers.Embedding( + input_dim=maxlen, output_dim=embed_dim + ) + + def call(self, x): + maxlen = tf.shape(x)[-1] + positions = tf.range(start=0, limit=maxlen, delta=1) + positions = self.pos_emb(positions) + x = self.token_emb(x) + return x + positions + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/layer_benchmarks/BUILD b/keras/benchmarks/layer_benchmarks/BUILD index ef34aff6d7c5..809292c8c18f 100644 --- a/keras/benchmarks/layer_benchmarks/BUILD +++ b/keras/benchmarks/layer_benchmarks/BUILD @@ -1,9 +1,11 @@ # Description: # Implementation of benchmarks on Keras layers. 
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = ["//visibility:public"], licenses = ["notice"], ) diff --git a/keras/benchmarks/layer_benchmarks/layer_benchmarks_test.py b/keras/benchmarks/layer_benchmarks/layer_benchmarks_test.py index 5073bb9fed24..42c5d17fa069 100644 --- a/keras/benchmarks/layer_benchmarks/layer_benchmarks_test.py +++ b/keras/benchmarks/layer_benchmarks/layer_benchmarks_test.py @@ -18,284 +18,524 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import functools + import numpy as np +import tensorflow.compat.v2 as tf + from keras.benchmarks import benchmark_util from keras.benchmarks.layer_benchmarks import layer_benchmarks_test_base def _get_metadata(name): - return { - "model_name": "ideal_layers", - "parameters": name[1] + "_shape", - } + return { + "model_name": "ideal_layers", + "parameters": name[1] + "_shape", + } def _get_layer_args(layer_cls, layer_args): - # To make benchmark parameters compatible with GPU platform. - if layer_cls is tf.keras.layers.Bidirectional: - return {"layer": tf.keras.layers.LSTM(1)} - return layer_args + # To make benchmark parameters compatible with GPU platform. + if layer_cls is tf.keras.layers.Bidirectional: + return {"layer": tf.keras.layers.LSTM(1)} + return layer_args def _get_input_data(inputs): - if "input_shape" in inputs: - return tf.ones(inputs["input_shape"]) - elif "input" in inputs: - return inputs["input"] - else: - raise ValueError("Please specify either `input_shape` or `input`" - "for the benchmark test") + if "input_shape" in inputs: + return tf.ones(inputs["input_shape"]) + elif "input" in inputs: + return inputs["input"] + else: + raise ValueError( + "Please specify either `input_shape` or `input` " + "for the benchmark test" + ) def _layer_call_backward(layer, x): - with tf.GradientTape() as tape: - y = layer(x) - loss = tf.reduce_mean(y**2) + with tf.GradientTape() as tape: + y = layer(x) + loss = tf.reduce_mean(y**2) + + _ = tape.gradient(loss, layer.trainable_variables) - _ = tape.gradient(loss, layer.trainable_variables) CORE_LAYERS = [ - ("Dense_small_shape", tf.keras.layers.Dense, - {"units": 32, "activation": "relu"}, - {"input_shape": (1, 16)}, 100), - ("Activation_small_shape", tf.keras.layers.Activation, - {"activation": "relu"}, - {"input_shape": (1, 4)}, 100), - ("Embedding_small_shape", tf.keras.layers.Embedding, - {"input_dim": 1, "output_dim": 1, "input_length": 1}, - {"input": np.random.randint(1, size=(1, 1))}, 100), - ("Embedding_normal_shape", tf.keras.layers.Embedding, - {"input_dim": 1000, "output_dim": 64, "input_length": 10}, - {"input": np.random.randint(1000, size=(32, 10))}, 100), - ("Masking_small_shape", tf.keras.layers.Masking, - {"mask_value": 1}, {"input_shape": (1, 1)}, 100), - ("Lambda_small_shape", tf.keras.layers.Lambda, - {"function": lambda x: x ** 2}, {"input_shape": (1, 1)}, 100), - ("Flatten_small_shape", tf.keras.layers.Flatten, - {}, {"input_shape": (1, 1)}, 100), + ( + "Dense_small_shape", + tf.keras.layers.Dense, + {"units": 32, "activation": "relu"}, + {"input_shape": (1, 16)}, + 100, + ), + ( + "Activation_small_shape", + tf.keras.layers.Activation, + {"activation": "relu"}, + {"input_shape": (1, 4)}, + 100, + ), + ( + "Embedding_small_shape", + tf.keras.layers.Embedding, + {"input_dim": 1, "output_dim": 1, "input_length": 1}, + {"input": np.random.randint(1,
size=(1, 1))}, + 100, + ), + ( + "Embedding_normal_shape", + tf.keras.layers.Embedding, + {"input_dim": 1000, "output_dim": 64, "input_length": 10}, + {"input": np.random.randint(1000, size=(32, 10))}, + 100, + ), + ( + "Masking_small_shape", + tf.keras.layers.Masking, + {"mask_value": 1}, + {"input_shape": (1, 1)}, + 100, + ), + ( + "Lambda_small_shape", + tf.keras.layers.Lambda, + {"function": lambda x: x**2}, + {"input_shape": (1, 1)}, + 100, + ), + ( + "Flatten_small_shape", + tf.keras.layers.Flatten, + {}, + {"input_shape": (1, 1)}, + 100, + ), ] CONV_LAYERS = [ - ("Conv1D_small_shape", tf.keras.layers.Conv1D, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1)}, 100), - ("Conv2D_small_shape", tf.keras.layers.Conv2D, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1, 1)}, 100), - ("Conv2D_normal_shape", tf.keras.layers.Conv2D, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (64, 28, 28, 3)}, 100), - ("Conv3D_small_shape", tf.keras.layers.Conv3D, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("Conv1DTranspose_small_shape", tf.keras.layers.Conv1DTranspose, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1)}, 100), - ("Conv2DTranspose_small_shape", tf.keras.layers.Conv2DTranspose, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1, 1)}, 100), - ("Conv3DTranspose_small_shape", tf.keras.layers.Conv3DTranspose, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("SeparableConv1D_small_shape", tf.keras.layers.SeparableConv1D, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1)}, 100), - ("SeparableConv2D_small_shape", tf.keras.layers.SeparableConv2D, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1, 1)}, 100), - ("DepthwiseConv2D_small_shape", tf.keras.layers.DepthwiseConv2D, - {"kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1, 1)}, 100), + ( + "Conv1D_small_shape", + tf.keras.layers.Conv1D, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "Conv2D_small_shape", + tf.keras.layers.Conv2D, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "Conv2D_normal_shape", + tf.keras.layers.Conv2D, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (64, 28, 28, 3)}, + 100, + ), + ( + "Conv3D_small_shape", + tf.keras.layers.Conv3D, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "Conv1DTranspose_small_shape", + tf.keras.layers.Conv1DTranspose, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "Conv2DTranspose_small_shape", + tf.keras.layers.Conv2DTranspose, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "Conv3DTranspose_small_shape", + tf.keras.layers.Conv3DTranspose, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "SeparableConv1D_small_shape", + tf.keras.layers.SeparableConv1D, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "SeparableConv2D_small_shape", + tf.keras.layers.SeparableConv2D, + {"filters": 1, 
"kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "DepthwiseConv2D_small_shape", + tf.keras.layers.DepthwiseConv2D, + {"kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), ] RECURRENT_LAYERS = [ - ("LSTM_small_shape", tf.keras.layers.LSTM, - {"units": 1}, {"input_shape": (1, 1, 1)}, 100), - ("LSTM_normal_shape", tf.keras.layers.LSTM, - {"units": 4}, {"input_shape": (32, 10, 8)}, 100), - ("GRU_small_shape", tf.keras.layers.GRU, - {"units": 1}, {"input_shape": (1, 1, 1)}, 100), - ("SimpleRNN_small_shape", tf.keras.layers.SimpleRNN, - {"units": 1}, {"input_shape": (1, 1, 1)}, 100), - ("TimeDistributed_small_shape", tf.keras.layers.TimeDistributed, - {"layer": tf.keras.layers.Conv2D(1, 1)}, - {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("Bidirectional_small_shape", tf.keras.layers.Bidirectional, - {}, {"input_shape": (1, 1, 1)}, 100), - ("ConvLSTM2D_small_shape", tf.keras.layers.ConvLSTM2D, - {"filters": 1, "kernel_size": 1, "activation": "relu"}, - {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("RNN_small_shape", tf.keras.layers.RNN, - {"cell": tf.keras.layers.LSTMCell(1)}, {"input_shape": (1, 1, 1)}, 100), + ( + "LSTM_small_shape", + tf.keras.layers.LSTM, + {"units": 1}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "LSTM_normal_shape", + tf.keras.layers.LSTM, + {"units": 4}, + {"input_shape": (32, 10, 8)}, + 100, + ), + ( + "GRU_small_shape", + tf.keras.layers.GRU, + {"units": 1}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "SimpleRNN_small_shape", + tf.keras.layers.SimpleRNN, + {"units": 1}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "TimeDistributed_small_shape", + tf.keras.layers.TimeDistributed, + {"layer": tf.keras.layers.Conv2D(1, 1)}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "Bidirectional_small_shape", + tf.keras.layers.Bidirectional, + {}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "ConvLSTM2D_small_shape", + tf.keras.layers.ConvLSTM2D, + {"filters": 1, "kernel_size": 1, "activation": "relu"}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "RNN_small_shape", + tf.keras.layers.RNN, + {"cell": tf.keras.layers.LSTMCell(1)}, + {"input_shape": (1, 1, 1)}, + 100, + ), ] NORMALIZATION_LAYERS = [ - ("BatchNormalization_small_shape", tf.keras.layers.BatchNormalization, - {"axis": -1}, {"input_shape": (1, 1, 1)}, 100), - ("LayerNormalization_small_shape", tf.keras.layers.LayerNormalization, - {"axis": -1}, {"input_shape": (1, 1, 1)}, 100), + ( + "BatchNormalization_small_shape", + tf.keras.layers.BatchNormalization, + {"axis": -1}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "LayerNormalization_small_shape", + tf.keras.layers.LayerNormalization, + {"axis": -1}, + {"input_shape": (1, 1, 1)}, + 100, + ), ] REGULARIZATION_LAYERS = [ - ("Dropout_small_shape", tf.keras.layers.Dropout, - {"rate": 0.2}, {"input_shape": (1, 1, 1)}, 100), - ("SpatialDropout1D_small_shape", tf.keras.layers.SpatialDropout1D, - {"rate": 0.2}, {"input_shape": (1, 1, 1)}, 100), - ("SpatialDropout2D_small_shape", tf.keras.layers.SpatialDropout2D, - {"rate": 0.2}, {"input_shape": (1, 1, 1, 1)}, 100), - ("SpatialDropout3D_small_shape", tf.keras.layers.SpatialDropout3D, - {"rate": 0.2}, {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("GaussianDropout_small_shape", tf.keras.layers.GaussianDropout, - {"rate": 0.2}, {"input_shape": (1, 1, 1)}, 100), - ("GaussianNoise_small_shape", tf.keras.layers.GaussianNoise, - {"stddev": 0.1}, {"input_shape": (1, 1, 1)}, 100), - ("ActivityRegularization_small_shape", - 
tf.keras.layers.ActivityRegularization, - {"l1": 0.3}, {"input_shape": (1, 1, 1)}, 100), - ("AlphaDropout_small_shape", tf.keras.layers.AlphaDropout, - {"rate": 0.2}, {"input_shape": (1, 1, 1)}, 100), + ( + "Dropout_small_shape", + tf.keras.layers.Dropout, + {"rate": 0.2}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "SpatialDropout1D_small_shape", + tf.keras.layers.SpatialDropout1D, + {"rate": 0.2}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "SpatialDropout2D_small_shape", + tf.keras.layers.SpatialDropout2D, + {"rate": 0.2}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "SpatialDropout3D_small_shape", + tf.keras.layers.SpatialDropout3D, + {"rate": 0.2}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "GaussianDropout_small_shape", + tf.keras.layers.GaussianDropout, + {"rate": 0.2}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "GaussianNoise_small_shape", + tf.keras.layers.GaussianNoise, + {"stddev": 0.1}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "ActivityRegularization_small_shape", + tf.keras.layers.ActivityRegularization, + {"l1": 0.3}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "AlphaDropout_small_shape", + tf.keras.layers.AlphaDropout, + {"rate": 0.2}, + {"input_shape": (1, 1, 1)}, + 100, + ), ] ATTENSION_LAYERS = [ - ("Attention_small_shape", tf.keras.layers.Attention, - {"use_scale": False}, {"input": [np.ones((1, 1, 1)), np.ones((1, 1, 1))]}, - 100), - ("AdditiveAttention_small_shape", tf.keras.layers.AdditiveAttention, - {"use_scale": True}, {"input": [np.ones((1, 1, 1)), np.ones((1, 1, 1))]}, - 100), + ( + "Attention_small_shape", + tf.keras.layers.Attention, + {"use_scale": False}, + {"input": [np.ones((1, 1, 1)), np.ones((1, 1, 1))]}, + 100, + ), + ( + "AdditiveAttention_small_shape", + tf.keras.layers.AdditiveAttention, + {"use_scale": True}, + {"input": [np.ones((1, 1, 1)), np.ones((1, 1, 1))]}, + 100, + ), ] POOLING_LAYERS = [ - ("MaxPooling1D_small_shape", tf.keras.layers.MaxPooling1D, - {"pool_size": 1, "strides": 1}, {"input_shape": (1, 1, 1)}, 100), - ("MaxPooling2D_small_shape", tf.keras.layers.MaxPooling2D, - {"pool_size": 1, "strides": 1}, {"input_shape": (1, 1, 1, 1)}, 100), - ("MaxPooling3D_small_shape", tf.keras.layers.MaxPooling3D, - {"pool_size": 1, "strides": 1}, {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("AveragePooling1D_small_shape", tf.keras.layers.AveragePooling1D, - {"pool_size": 1, "strides": 1}, {"input_shape": (1, 1, 1)}, 100), - ("AveragePooling2D_small_shape", tf.keras.layers.AveragePooling2D, - {"pool_size": 1, "strides": 1}, {"input_shape": (1, 1, 1, 1)}, 100), - ("AveragePooling3D_small_shape", tf.keras.layers.AveragePooling3D, - {"pool_size": 1, "strides": 1}, {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("GlobalMaxPooling1D_small_shape", tf.keras.layers.GlobalMaxPooling1D, - {}, {"input_shape": (1, 1, 1)}, 100), - ("GlobalMaxPooling2D_small_shape", tf.keras.layers.GlobalMaxPooling2D, - {}, {"input_shape": (1, 1, 1, 1)}, 100), - ("GlobalMaxPooling3D_small_shape", tf.keras.layers.GlobalMaxPooling3D, - {}, {"input_shape": (1, 1, 1, 1, 1)}, 100), - ("GlobalAveragePooling1D_small_shape", - tf.keras.layers.GlobalAveragePooling1D, - {}, {"input_shape": (1, 1, 1)}, 100), - ("GlobalAveragePooling2D_small_shape", - tf.keras.layers.GlobalAveragePooling2D, - {}, {"input_shape": (1, 1, 1, 1)}, 100), - ("GlobalAveragePooling3D_small_shape", - tf.keras.layers.GlobalAveragePooling3D, - {}, {"input_shape": (1, 1, 1, 1, 1)}, 100), + ( + "MaxPooling1D_small_shape", + tf.keras.layers.MaxPooling1D, + {"pool_size": 1, "strides": 1}, + 
{"input_shape": (1, 1, 1)}, + 100, + ), + ( + "MaxPooling2D_small_shape", + tf.keras.layers.MaxPooling2D, + {"pool_size": 1, "strides": 1}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "MaxPooling3D_small_shape", + tf.keras.layers.MaxPooling3D, + {"pool_size": 1, "strides": 1}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "AveragePooling1D_small_shape", + tf.keras.layers.AveragePooling1D, + {"pool_size": 1, "strides": 1}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "AveragePooling2D_small_shape", + tf.keras.layers.AveragePooling2D, + {"pool_size": 1, "strides": 1}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "AveragePooling3D_small_shape", + tf.keras.layers.AveragePooling3D, + {"pool_size": 1, "strides": 1}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "GlobalMaxPooling1D_small_shape", + tf.keras.layers.GlobalMaxPooling1D, + {}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "GlobalMaxPooling2D_small_shape", + tf.keras.layers.GlobalMaxPooling2D, + {}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "GlobalMaxPooling3D_small_shape", + tf.keras.layers.GlobalMaxPooling3D, + {}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), + ( + "GlobalAveragePooling1D_small_shape", + tf.keras.layers.GlobalAveragePooling1D, + {}, + {"input_shape": (1, 1, 1)}, + 100, + ), + ( + "GlobalAveragePooling2D_small_shape", + tf.keras.layers.GlobalAveragePooling2D, + {}, + {"input_shape": (1, 1, 1, 1)}, + 100, + ), + ( + "GlobalAveragePooling3D_small_shape", + tf.keras.layers.GlobalAveragePooling3D, + {}, + {"input_shape": (1, 1, 1, 1, 1)}, + 100, + ), ] -class KerasLayerBenchmarks( # pylint: disable=undefined-variable +class KerasLayerBenchmarks( layer_benchmarks_test_base.LayerBenchmarksBase, - metaclass=tf.__internal__.test.ParameterizedBenchmark): - - # The parameter of each layer benchmark is a tuple, and the first one is - # the benchmark name. It must follow the convention of - # "{layer_name}_{small|normal|large}_shape" to make it compatible with - # `self.report_benchmark()` method. 
- _benchmark_parameters = benchmark_util.generate_benchmark_params_cpu_gpu( - CORE_LAYERS + CONV_LAYERS + RECURRENT_LAYERS + NORMALIZATION_LAYERS + - REGULARIZATION_LAYERS + ATTENSION_LAYERS + POOLING_LAYERS) - - def benchmark_layer_call(self, layer_cls, layer_args, inputs, num_iters): - layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) - x = _get_input_data(inputs) - - fn = functools.partial(layer, x) - name = benchmark_util.get_benchmark_name(self._get_name()) - metadata = {"implementation": name[0] + ".layer.call"} - metadata.update(_get_metadata(name)) - self.run_report(fn, num_iters, metadata) - - def benchmark_layer_call_with_function( - self, layer_cls, layer_args, inputs, num_iters): - layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) - x = _get_input_data(inputs) - layer.call = tf.function(layer.call) - - fn = functools.partial(layer, x) - name = benchmark_util.get_benchmark_name(self._get_name()) - metadata = {"implementation": name[0] + ".layer.call.function"} - metadata.update(_get_metadata(name)) - self.run_report(fn, num_iters, metadata) - - def benchmark_layer_call_with_xla( - self, layer_cls, layer_args, inputs, num_iters): - name = benchmark_util.get_benchmark_name(self._get_name()) - # TODO(b/173461426) - if layer_cls is tf.keras.layers.Embedding and name[-1] == "GPU": - return - layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) - x = _get_input_data(inputs) - layer.call = tf.function( - layer.call, jit_compile=True) - - fn = functools.partial(layer, x) - metadata = {"implementation": name[0] + ".layer.call.xla"} - metadata.update(_get_metadata(name)) - self.run_report(fn, num_iters, metadata) - - def benchmark_layer_call_backward( - self, layer_cls, layer_args, inputs, num_iters): - layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) - x = _get_input_data(inputs) - - fn = functools.partial(_layer_call_backward, layer, x) - name = benchmark_util.get_benchmark_name(self._get_name()) - metadata = {"implementation": name[0] + ".layer.call.backward"} - metadata.update(_get_metadata(name)) - self.run_report(fn, num_iters, metadata) - - def benchmark_layer_call_backward_with_function( - self, layer_cls, layer_args, inputs, num_iters): - layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) - x = _get_input_data(inputs) - layer.call = tf.function(layer.call) - - fn = functools.partial(_layer_call_backward, layer, x) - name = benchmark_util.get_benchmark_name(self._get_name()) - metadata = {"implementation": name[0] + ".layer.call.backward.function"} - metadata.update(_get_metadata(name)) - self.run_report(fn, num_iters, metadata) - - def benchmark_layer_call_backward_with_xla( - self, layer_cls, layer_args, inputs, num_iters): - name = benchmark_util.get_benchmark_name(self._get_name()) - # TODO(b/153480400) - if layer_cls in [ - tf.keras.layers.LSTM, tf.keras.layers.Bidirectional, - tf.keras.layers.ConvLSTM2D, tf.keras.layers.GRU, tf.keras.layers.RNN, - tf.keras.layers.SimpleRNN - ]: - return - # TODO(b/173461426) - if layer_cls is tf.keras.layers.Embedding and name[-1] == "GPU": - return - layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) - x = _get_input_data(inputs) - layer.call = tf.function( - layer.call, jit_compile=True) - - fn = functools.partial(_layer_call_backward, layer, x) - metadata = {"implementation": name[0] + ".layer.call.backward.xla"} - metadata.update(_get_metadata(name)) - self.run_report(fn, num_iters, metadata) + metaclass=tf.__internal__.test.ParameterizedBenchmark, +): + + # The parameter of each 
layer benchmark is a tuple, and the first one is + # the benchmark name. It must follow the convention of + # "{layer_name}_{small|normal|large}_shape" to make it compatible with + # `self.report_benchmark()` method. + _benchmark_parameters = benchmark_util.generate_benchmark_params_cpu_gpu( + CORE_LAYERS + + CONV_LAYERS + + RECURRENT_LAYERS + + NORMALIZATION_LAYERS + + REGULARIZATION_LAYERS + + ATTENSION_LAYERS + + POOLING_LAYERS + ) + + def benchmark_layer_call(self, layer_cls, layer_args, inputs, num_iters): + layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) + x = _get_input_data(inputs) + + fn = functools.partial(layer, x) + name = benchmark_util.get_benchmark_name(self._get_name()) + metadata = {"implementation": name[0] + ".layer.call"} + metadata.update(_get_metadata(name)) + self.run_report(fn, num_iters, metadata) + + def benchmark_layer_call_with_function( + self, layer_cls, layer_args, inputs, num_iters + ): + layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) + x = _get_input_data(inputs) + layer.call = tf.function(layer.call) + + fn = functools.partial(layer, x) + name = benchmark_util.get_benchmark_name(self._get_name()) + metadata = {"implementation": name[0] + ".layer.call.function"} + metadata.update(_get_metadata(name)) + self.run_report(fn, num_iters, metadata) + + def benchmark_layer_call_with_xla( + self, layer_cls, layer_args, inputs, num_iters + ): + name = benchmark_util.get_benchmark_name(self._get_name()) + # TODO(b/173461426) + if layer_cls is tf.keras.layers.Embedding and name[-1] == "GPU": + return + layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) + x = _get_input_data(inputs) + layer.call = tf.function(layer.call, jit_compile=True) + + fn = functools.partial(layer, x) + metadata = {"implementation": name[0] + ".layer.call.xla"} + metadata.update(_get_metadata(name)) + self.run_report(fn, num_iters, metadata) + + def benchmark_layer_call_backward( + self, layer_cls, layer_args, inputs, num_iters + ): + layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) + x = _get_input_data(inputs) + + fn = functools.partial(_layer_call_backward, layer, x) + name = benchmark_util.get_benchmark_name(self._get_name()) + metadata = {"implementation": name[0] + ".layer.call.backward"} + metadata.update(_get_metadata(name)) + self.run_report(fn, num_iters, metadata) + + def benchmark_layer_call_backward_with_function( + self, layer_cls, layer_args, inputs, num_iters + ): + layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) + x = _get_input_data(inputs) + layer.call = tf.function(layer.call) + + fn = functools.partial(_layer_call_backward, layer, x) + name = benchmark_util.get_benchmark_name(self._get_name()) + metadata = {"implementation": name[0] + ".layer.call.backward.function"} + metadata.update(_get_metadata(name)) + self.run_report(fn, num_iters, metadata) + + def benchmark_layer_call_backward_with_xla( + self, layer_cls, layer_args, inputs, num_iters + ): + name = benchmark_util.get_benchmark_name(self._get_name()) + # TODO(b/153480400) + if layer_cls in [ + tf.keras.layers.LSTM, + tf.keras.layers.Bidirectional, + tf.keras.layers.ConvLSTM2D, + tf.keras.layers.GRU, + tf.keras.layers.RNN, + tf.keras.layers.SimpleRNN, + ]: + return + # TODO(b/173461426) + if layer_cls is tf.keras.layers.Embedding and name[-1] == "GPU": + return + layer = layer_cls(**_get_layer_args(layer_cls, layer_args)) + x = _get_input_data(inputs) + layer.call = tf.function(layer.call, jit_compile=True) + + fn = functools.partial(_layer_call_backward, layer, 
x) + metadata = {"implementation": name[0] + ".layer.call.backward.xla"} + metadata.update(_get_metadata(name)) + self.run_report(fn, num_iters, metadata) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/benchmarks/layer_benchmarks/layer_benchmarks_test_base.py b/keras/benchmarks/layer_benchmarks/layer_benchmarks_test_base.py index 8331240e4d42..d64e95c241df 100644 --- a/keras/benchmarks/layer_benchmarks/layer_benchmarks_test_base.py +++ b/keras/benchmarks/layer_benchmarks/layer_benchmarks_test_base.py @@ -18,58 +18,69 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import time +import tensorflow.compat.v2 as tf + from keras.benchmarks.layer_benchmarks import run_xprof class LayerBenchmarksBase(tf.test.Benchmark): - """Run and report benchmark results. + """Run and report benchmark results. - The first run is without any profiling to purly measure running time. - Second run is with xprof but no python trace. - Third run is with xprof and python trace. - Note: xprof runs fewer iterations, and the maximum iterations is 100. - """ + The first run is without any profiling to purely measure running time. + Second run is with xprof but no python trace. + Third run is with xprof and python trace. + Note: xprof runs fewer iterations, and the maximum iterations is 100. + """ - def run_report(self, func, num_iters, metadata=None): - """Run and report benchmark results for different settings.""" + def run_report(self, func, num_iters, metadata=None): + """Run and report benchmark results for different settings.""" - # 0. Warm up. - func() + # 0. Warm up. + func() - # 1. Run without profiling. - start = time.time() - for _ in range(num_iters): - func() - total_time = time.time() - start - us_mean_time = total_time * 1e6 / num_iters + # 1. Run without profiling. + start = time.time() + for _ in range(num_iters): + func() + total_time = time.time() - start + us_mean_time = total_time * 1e6 / num_iters - metrics = [ - {"name": "examples_per_sec", - "value": float("{0:.3f}".format(num_iters / total_time))}, - {"name": "us_per_example", - "value": float("{0:.3f}".format(us_mean_time))}] + metrics = [ + { + "name": "examples_per_sec", + "value": float(f"{num_iters / total_time:.3f}"), + }, + { + "name": "us_per_example", + "value": float(f"{us_mean_time:.3f}"), + }, + ] - # 2. Run with xprof with no python trace. - num_iters_xprof = min(100, num_iters) - xprof_link, us_per_example = run_xprof.run_with_xprof( - func, num_iters_xprof, False) - # This xprof link will appear in the benchmark dashboard. - extras = { - "xprof_link": xprof_link, - "us_per_example_with_xprof": us_per_example - } + # 2. Run with xprof with no python trace. + num_iters_xprof = min(100, num_iters) + xprof_link, us_per_example = run_xprof.run_with_xprof( + func, num_iters_xprof, False + ) + # This xprof link will appear in the benchmark dashboard. + extras = { + "xprof_link": xprof_link, + "us_per_example_with_xprof": us_per_example, + } - # 3. Run with xprof and python trace. - xprof_link, us_per_example = run_xprof.run_with_xprof( - func, num_iters_xprof, True) - extras["python_trace_xprof_link"] = xprof_link - extras["us_per_example_with_xprof_and_python"] = us_per_example
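+ # 3. Run with xprof and python trace.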
+ xprof_link, us_per_example = run_xprof.run_with_xprof( + func, num_iters_xprof, True + ) + extras["python_trace_xprof_link"] = xprof_link + extras["us_per_example_with_xprof_and_python"] = us_per_example - if metadata: - extras.update(metadata) - self.report_benchmark( - iters=num_iters, wall_time=us_mean_time, extras=extras, metrics=metrics) + if metadata: + extras.update(metadata) + self.report_benchmark( + iters=num_iters, + wall_time=us_mean_time, + extras=extras, + metrics=metrics, + ) diff --git a/keras/benchmarks/layer_benchmarks/run_xprof.py b/keras/benchmarks/layer_benchmarks/run_xprof.py index aef4d7b98771..1eb65a367a4c 100644 --- a/keras/benchmarks/layer_benchmarks/run_xprof.py +++ b/keras/benchmarks/layer_benchmarks/run_xprof.py @@ -16,25 +16,31 @@ from __future__ import division as _division from __future__ import print_function as _print_function +import os import time import uuid from tensorflow.python.profiler import profiler_v2 as profiler -def run_with_xprof(self, func, num_iters_xprof=100, enable_python_trace=True, - logdir='/tmp/layer_benchmark_xprof/'): - suid = str(uuid.uuid4()) - if enable_python_trace: - options = profiler.ProfilerOptions(python_tracer_level=1) - logdir = os.path.join(logdir, str(uuid.uuid4()) + "_with_python") - else: - options = profiler.ProfilerOptions(python_tracer_level=0) - logdir = os.path.join(logdir, suid) - start = time.time() - with profiler.Profile(logdir, options): - for _ in range(num_iters_xprof): - func() - total_time = time.time() - start - us_per_example = float("{0:.3f}".format(total_time * 1e6 / num_iters_xprof)) - return logdir, us_per_example +def run_with_xprof( + func, + num_iters_xprof=100, + enable_python_trace=True, + logdir="/tmp/layer_benchmark_xprof/", +): + suid = str(uuid.uuid4()) + if enable_python_trace: + options = profiler.ProfilerOptions(python_tracer_level=1) + logdir = os.path.join(logdir, str(uuid.uuid4()) + "_with_python") + else: + options = profiler.ProfilerOptions(python_tracer_level=0) + logdir = os.path.join(logdir, suid) + + start = time.time() + with profiler.Profile(logdir, options): + for _ in range(num_iters_xprof): + func() + total_time = time.time() - start + us_per_example = float(f"{total_time * 1000000.0 / num_iters_xprof:.3f}") + return logdir, us_per_example diff --git a/keras/benchmarks/metrics_memory_benchmark_test.py b/keras/benchmarks/metrics_memory_benchmark_test.py index 07ab36e6cbc0..2bc58d85e3c6 100644 --- a/keras/benchmarks/metrics_memory_benchmark_test.py +++ b/keras/benchmarks/metrics_memory_benchmark_test.py @@ -14,59 +14,64 @@ # ============================================================================== """Benchmark tests for Keras metrics memory consumption.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf try: - import memory_profiler # pylint:disable=g-import-not-at-top + import memory_profiler except ImportError: - memory_profiler = None + memory_profiler = None class KerasMetricMemoryBenchmark(tf.test.Benchmark): - # This test is added to measure the memory footprint for - # metrics_utils._update_confusion_matrix_variables_optimized(). + # This test is added to measure the memory footprint for + # metrics_utils._update_confusion_matrix_variables_optimized().
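+ # It compares an AUC metric with evenly distributed thresholds + # against one with unevenly distributed thresholds; only the former + # is expected to take the optimized code path.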
- def benchmark_auc_memory_usage(self): - if memory_profiler is None: - self.skipTest('Skip test since memory_profiler is not available.') + def benchmark_auc_memory_usage(self): + if memory_profiler is None: + self.skipTest("Skip test since memory_profiler is not available.") - with tf.compat.forward_compatibility_horizon(2021, 6, 9): - self.y_true = np.random.randint(2, size=(1024, 1024)) - self.y_pred = np.random.rand(1024, 1024) + with tf.compat.forward_compatibility_horizon(2021, 6, 9): + self.y_true = np.random.randint(2, size=(1024, 1024)) + self.y_pred = np.random.rand(1024, 1024) - memory_usage_1 = memory_profiler.memory_usage((self.even_thresholds_auc)) - memory_usage_2 = memory_profiler.memory_usage( - (self.uneven_thresholds_auc)) - # memory usage is a list of number which sampled when running the function - # The pure memory consumption is approximately max(usage) - min(usage) - memory_usage_1 = max(memory_usage_1) - min(memory_usage_1) - memory_usage_2 = max(memory_usage_2) - min(memory_usage_2) + memory_usage_1 = memory_profiler.memory_usage( + (self.even_thresholds_auc) + ) + memory_usage_2 = memory_profiler.memory_usage( + (self.uneven_thresholds_auc) + ) + # Memory usage is a list of numbers sampled while running the + # function. The pure memory consumption is approximately + # max(usage) - min(usage). + memory_usage_1 = max(memory_usage_1) - min(memory_usage_1) + memory_usage_2 = max(memory_usage_2) - min(memory_usage_2) - metrics = {'even_threshold_memory_usage': memory_usage_1, - 'uneven_threshold_memory_usage': memory_usage_2} - self.report_benchmark(iters=1, metrics=metrics) + metrics = { + "even_threshold_memory_usage": memory_usage_1, + "uneven_threshold_memory_usage": memory_usage_2, + } + self.report_benchmark(iters=1, metrics=metrics) - def even_thresholds_auc(self): - auc = tf.keras.metrics.AUC(num_thresholds=200) - self.assertTrue(auc._thresholds_distributed_evenly) + def even_thresholds_auc(self): + auc = tf.keras.metrics.AUC(num_thresholds=200) + self.assertTrue(auc._thresholds_distributed_evenly) - auc(self.y_true, self.y_pred) + auc(self.y_true, self.y_pred) - def uneven_thresholds_auc(self): - num_thresholds = 200 - thresholds = [x / (num_thresholds - 1) for x in range(num_thresholds)] - thresholds[100] += 1 / 200 - thresholds = thresholds[1:-1] + def uneven_thresholds_auc(self): + num_thresholds = 200 + thresholds = [x / (num_thresholds - 1) for x in range(num_thresholds)] + thresholds[100] += 1 / 200 + thresholds = thresholds[1:-1] - auc = tf.keras.metrics.AUC(thresholds=thresholds) - self.assertFalse(auc._thresholds_distributed_evenly) - self.assertEqual(auc.num_thresholds, num_thresholds) + auc = tf.keras.metrics.AUC(thresholds=thresholds) + self.assertFalse(auc._thresholds_distributed_evenly) + self.assertEqual(auc.num_thresholds, num_thresholds) - auc(self.y_true, self.y_pred) + auc(self.y_true, self.y_pred) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/model_components_benchmarks_test.py b/keras/benchmarks/model_components_benchmarks_test.py index af637ad28a23..f10f07294b29 100644 --- a/keras/benchmarks/model_components_benchmarks_test.py +++ b/keras/benchmarks/model_components_benchmarks_test.py @@ -14,277 +14,300 @@ # ============================================================================== r"""Benchmarks on Keras components with different Keras model types.""" -import tensorflow.compat.v2 as tf - import time import numpy as np +import tensorflow.compat.v2 as tf +#
isort: off from tensorflow.python.eager import context from tensorflow.python.eager.context import get_executor class SubclassedKerasModel(tf.keras.Model): - - def __init__(self, initializer="ones"): - super().__init__() - self.layer_a = tf.keras.layers.Dense( - 64, kernel_initializer=initializer, bias_initializer="zeros") - self.layer_b = tf.keras.layers.Dense( - 128, kernel_initializer=initializer, bias_initializer="zeros") - self.layer_c = tf.keras.layers.Dense( - 256, kernel_initializer=initializer, bias_initializer="zeros") - self.layer_d = tf.keras.layers.Dense( - 256, kernel_initializer=initializer, bias_initializer="zeros") - self.layer_e = tf.keras.layers.Dense( - 10, kernel_initializer=initializer, bias_initializer="zeros") - - def call(self, x): - x = self.layer_a(x) - x = self.layer_b(x) - x = self.layer_c(x) - x = self.layer_d(x) - return self.layer_e(x) + def __init__(self, initializer="ones"): + super().__init__() + self.layer_a = tf.keras.layers.Dense( + 64, kernel_initializer=initializer, bias_initializer="zeros" + ) + self.layer_b = tf.keras.layers.Dense( + 128, kernel_initializer=initializer, bias_initializer="zeros" + ) + self.layer_c = tf.keras.layers.Dense( + 256, kernel_initializer=initializer, bias_initializer="zeros" + ) + self.layer_d = tf.keras.layers.Dense( + 256, kernel_initializer=initializer, bias_initializer="zeros" + ) + self.layer_e = tf.keras.layers.Dense( + 10, kernel_initializer=initializer, bias_initializer="zeros" + ) + + def call(self, x): + x = self.layer_a(x) + x = self.layer_b(x) + x = self.layer_c(x) + x = self.layer_d(x) + return self.layer_e(x) def make_keras_model(initializer="ones"): - model_input = tf.keras.Input(shape=(10,)) - x = tf.keras.layers.Dense( - 64, kernel_initializer=initializer, bias_initializer="zeros")(model_input) - x = tf.keras.layers.Dense( - 128, kernel_initializer=initializer, bias_initializer="zeros")(x) - x = tf.keras.layers.Dense( - 256, kernel_initializer=initializer, bias_initializer="zeros")(x) - x = tf.keras.layers.Dense( - 256, kernel_initializer=initializer, bias_initializer="zeros")(x) - x = tf.keras.layers.Dense( - 10, kernel_initializer=initializer, bias_initializer="zeros")(x) - return tf.keras.Model(inputs=model_input, outputs=x) + model_input = tf.keras.Input(shape=(10,)) + x = tf.keras.layers.Dense( + 64, kernel_initializer=initializer, bias_initializer="zeros" + )(model_input) + x = tf.keras.layers.Dense( + 128, kernel_initializer=initializer, bias_initializer="zeros" + )(x) + x = tf.keras.layers.Dense( + 256, kernel_initializer=initializer, bias_initializer="zeros" + )(x) + x = tf.keras.layers.Dense( + 256, kernel_initializer=initializer, bias_initializer="zeros" + )(x) + x = tf.keras.layers.Dense( + 10, kernel_initializer=initializer, bias_initializer="zeros" + )(x) + return tf.keras.Model(inputs=model_input, outputs=x) def make_sequential_keras_model(initializer="ones"): - model = tf.keras.models.Sequential() - model.add(tf.keras.layers.Dense( - 64, kernel_initializer=initializer, bias_initializer="zeros", - input_shape=(10,))) - model.add(tf.keras.layers.Dense( - 128, kernel_initializer=initializer, bias_initializer="zeros")) - model.add(tf.keras.layers.Dense( - 256, kernel_initializer=initializer, bias_initializer="zeros")) - model.add(tf.keras.layers.Dense( - 256, kernel_initializer=initializer, bias_initializer="zeros")) - model.add(tf.keras.layers.Dense( - 10, kernel_initializer=initializer, bias_initializer="zeros")) - return model + model = tf.keras.models.Sequential() + model.add( + 
tf.keras.layers.Dense( + 64, + kernel_initializer=initializer, + bias_initializer="zeros", + input_shape=(10,), + ) + ) + model.add( + tf.keras.layers.Dense( + 128, kernel_initializer=initializer, bias_initializer="zeros" + ) + ) + model.add( + tf.keras.layers.Dense( + 256, kernel_initializer=initializer, bias_initializer="zeros" + ) + ) + model.add( + tf.keras.layers.Dense( + 256, kernel_initializer=initializer, bias_initializer="zeros" + ) + ) + model.add( + tf.keras.layers.Dense( + 10, kernel_initializer=initializer, bias_initializer="zeros" + ) + ) + return model def run_benchmark(func, num_iters, execution_mode=None): - with context.execution_mode(execution_mode): - # call func to warm up - func() - if execution_mode == context.ASYNC: - get_executor().wait() - start = time.time() - for _ in range(num_iters): - func() - if execution_mode == context.ASYNC: - get_executor().wait() - end = time.time() + with context.execution_mode(execution_mode): + # call func to warm up + func() + if execution_mode == context.ASYNC: + get_executor().wait() + start = time.time() + for _ in range(num_iters): + func() + if execution_mode == context.ASYNC: + get_executor().wait() + end = time.time() - return end - start + return end - start class KerasComponentsBenchmarks(tf.test.Benchmark): + def _run(self, func, num_iters, execution_mode=None): + total_time = run_benchmark(func, num_iters, execution_mode) + mean_us = total_time * 1e6 / num_iters + self.report_benchmark( + iters=num_iters, + wall_time=mean_us, + metrics=[ + { + "name": "exp_per_sec", + "value": float(f"{num_iters / total_time:.3f}"), + }, + { + "name": "us_per_exp", + "value": float(f"{total_time * 1000000.0 / num_iters:.3f}"), + }, + ], + ) + + def benchmark_keras_model_subclassed(self): + model = SubclassedKerasModel() + data = tf.random.uniform((10, 10)) + + func = lambda: model(data) + # First call is more expensive (creates variables etc.), discount that. + func() + + # The whole point of this test is to contrast subclassing with + # the functional style of keras model building, so validate that + # the models are equivalent. + assert np.equal(func(), make_keras_model()(data)).all() + + self._run(func, 30000) + + def benchmark_keras_model_functional(self): + model = make_keras_model() + data = tf.random.uniform((10, 10)) + func = lambda: model(data) + # Symmetry with benchmark_keras_model_subclassed + func() + assert np.equal(func(), SubclassedKerasModel()(data)).all() + self._run(func, 30000) + + def benchmark_keras_model_sequential(self): + model = make_sequential_keras_model() + data = tf.random.uniform((10, 10)) + func = lambda: model(data) + # Symmetry with benchmark_keras_model_functional + func() + assert np.equal(func(), make_keras_model()(data)).all() + self._run(func, 30000) + + def _benchmark_keras_model_fit(self, model, run_eagerly=False): + data = tf.random.uniform((10, 10), minval=-1, maxval=1) + labels = tf.random.uniform((10, 10), minval=-1, maxval=1) + dataset = tf.data.Dataset.from_tensors((data, labels)).repeat() + model.compile("sgd", loss="mse", run_eagerly=run_eagerly) + func = lambda: model.fit( + dataset, epochs=1, steps_per_epoch=1000, verbose=0 + ) + # First call is more expensive (creates variables etc.), discount that. 
+ model.fit(dataset, epochs=1, steps_per_epoch=1, verbose=0) + + self._run(func, 1) + + def _benchmark_keras_model_evaluate(self, model, run_eagerly=False): + data = tf.random.uniform((10, 10), minval=-1, maxval=1) + labels = tf.random.uniform((10, 10), minval=-1, maxval=1) + dataset = tf.data.Dataset.from_tensors((data, labels)).repeat() + model.compile("sgd", loss="mse", run_eagerly=run_eagerly) + func = lambda: model.evaluate(dataset, steps=1000, verbose=0) + # First call is more expensive (creates variables etc.), discount that. + model.evaluate(dataset, steps=1, verbose=0) + + self._run(func, 1) + + def _benchmark_keras_model_predict(self, model, run_eagerly=False): + data = tf.random.uniform((10, 10), minval=-1, maxval=1) + dataset = tf.data.Dataset.from_tensors(data).repeat() + model.compile("sgd", loss="mse", run_eagerly=run_eagerly) + func = lambda: model.predict(dataset, steps=1000, verbose=0) + # First call is more expensive (creates variables etc.), discount that. + model.predict(dataset, steps=1, verbose=0) + + self._run(func, 1) + + def benchmark_keras_model_subclassed_fit(self): + model = SubclassedKerasModel(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model) + + def benchmark_keras_model_subclassed_fit_graph_mode(self): + with context.graph_mode(): + model = SubclassedKerasModel(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model) + + def benchmark_keras_model_subclassed_fit_run_model_eagerly(self): + model = SubclassedKerasModel(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model, run_eagerly=True) + + def benchmark_keras_model_functional_fit(self): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model) + + def benchmark_keras_model_functional_fit_graph_mode(self): + with context.graph_mode(): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model) + + def benchmark_keras_model_functional_fit_graph_mode_with_profiler(self): + tf.profiler.experimental.start("") + with context.graph_mode(): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model) + tf.profiler.experimental.stop(save=False) + + def benchmark_keras_model_functional_fit_run_model_eagerly(self): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model, run_eagerly=True) + + def benchmark_keras_model_functional_fit_run_model_eagerly_with_profiler( + self, + ): + tf.profiler.experimental.start("") + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model, run_eagerly=True) + tf.profiler.experimental.stop(save=False) + + def benchmark_keras_model_sequential_fit(self): + model = make_sequential_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model) + + def benchmark_keras_model_sequential_fit_graph_mode(self): + with context.graph_mode(): + model = make_sequential_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model) + + def benchmark_keras_model_sequential_fit_run_model_eagerly(self): + model = make_sequential_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_fit(model, run_eagerly=True) + + def benchmark_keras_model_subclassed_evaluate(self): + model = SubclassedKerasModel(initializer="glorot_uniform") + self._benchmark_keras_model_evaluate(model) + + def benchmark_keras_model_subclassed_evaluate_run_model_eagerly(self): + model = SubclassedKerasModel(initializer="glorot_uniform") 
+ self._benchmark_keras_model_evaluate(model, run_eagerly=True) + + def benchmark_keras_model_functional_evaluate(self): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_evaluate(model) + + def benchmark_keras_model_functional_evaluate_run_model_eagerly(self): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_evaluate(model, run_eagerly=True) + + def benchmark_keras_model_sequential_evaluate(self): + model = make_sequential_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_evaluate(model) + + def benchmark_keras_model_sequential_evaluate_run_model_eagerly(self): + model = make_sequential_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_evaluate(model, run_eagerly=True) + + def benchmark_keras_model_subclassed_predict(self): + model = SubclassedKerasModel(initializer="glorot_uniform") + self._benchmark_keras_model_predict(model) + + def benchmark_keras_model_subclassed_predict_run_model_eagerly(self): + model = SubclassedKerasModel(initializer="glorot_uniform") + self._benchmark_keras_model_predict(model, run_eagerly=True) + + def benchmark_keras_model_functional_predict(self): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_predict(model) + + def benchmark_keras_model_functional_predict_run_model_eagerly(self): + model = make_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_predict(model, run_eagerly=True) + + def benchmark_keras_model_sequential_predict(self): + model = make_sequential_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_predict(model) - def _run(self, func, num_iters, execution_mode=None): - total_time = run_benchmark(func, num_iters, execution_mode) - mean_us = total_time * 1e6 / num_iters - self.report_benchmark( - iters=num_iters, - wall_time=mean_us, - metrics=[ - { - "name": "exp_per_sec", - "value": float("{0:.3f}".format(num_iters / total_time)) - }, - { - "name": "us_per_exp", - "value": float("{0:.3f}".format(total_time * 1e6 / num_iters)) - }, - ]) - - def benchmark_keras_model_subclassed(self): - model = SubclassedKerasModel() - data = tf.random.uniform((10, 10)) - - func = lambda: model(data) # pylint: disable=not-callable - # First call is more expensive (creates variables etc.), discount that. - func() - - # The whole point of this test is to contrast subclassing with - # the functional style of keras model building, so validate that - # the models are equivalent. 
- assert np.equal(func(), make_keras_model()(data)).all() - - self._run(func, 30000) - - def benchmark_keras_model_functional(self): - model = make_keras_model() - data = tf.random.uniform((10, 10)) - func = lambda: model(data) # pylint: disable=not-callable - # Symmetry with benchmark_keras_model_subclassed - func() - assert np.equal(func(), SubclassedKerasModel()(data)).all() # pylint: disable=not-callable - self._run(func, 30000) - - def benchmark_keras_model_sequential(self): - model = make_sequential_keras_model() - data = tf.random.uniform((10, 10)) - func = lambda: model(data) - # Symmetry with benchmark_keras_model_functional - func() - assert np.equal(func(), make_keras_model()(data)).all() - self._run(func, 30000) - - def _benchmark_keras_model_fit(self, model, run_eagerly=False): - data = tf.random.uniform((10, 10), minval=-1, maxval=1) - labels = tf.random.uniform((10, 10), minval=-1, maxval=1) - dataset = tf.data.Dataset.from_tensors((data, labels)).repeat() - model.compile( - "sgd", - loss="mse", run_eagerly=run_eagerly) - func = lambda: model.fit(dataset, epochs=1, steps_per_epoch=1000, verbose=0) - # First call is more expensive (creates variables etc.), discount that. - model.fit(dataset, epochs=1, steps_per_epoch=1, verbose=0) - - self._run(func, 1) - - def _benchmark_keras_model_evaluate(self, model, run_eagerly=False): - data = tf.random.uniform((10, 10), minval=-1, maxval=1) - labels = tf.random.uniform((10, 10), minval=-1, maxval=1) - dataset = tf.data.Dataset.from_tensors((data, labels)).repeat() - model.compile( - "sgd", - loss="mse", run_eagerly=run_eagerly) - func = lambda: model.evaluate(dataset, steps=1000, verbose=0) - # First call is more expensive (creates variables etc.), discount that. - model.evaluate(dataset, steps=1, verbose=0) - - self._run(func, 1) - - def _benchmark_keras_model_predict(self, model, run_eagerly=False): - data = tf.random.uniform((10, 10), minval=-1, maxval=1) - dataset = tf.data.Dataset.from_tensors(data).repeat() - model.compile( - "sgd", - loss="mse", run_eagerly=run_eagerly) - func = lambda: model.predict(dataset, steps=1000, verbose=0) - # First call is more expensive (creates variables etc.), discount that. 
- model.predict(dataset, steps=1, verbose=0) - - self._run(func, 1) - - def benchmark_keras_model_subclassed_fit(self): - model = SubclassedKerasModel(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model) - - def benchmark_keras_model_subclassed_fit_graph_mode(self): - with context.graph_mode(): - model = SubclassedKerasModel(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model) - - def benchmark_keras_model_subclassed_fit_run_model_eagerly(self): - model = SubclassedKerasModel(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model, run_eagerly=True) - - def benchmark_keras_model_functional_fit(self): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model) - - def benchmark_keras_model_functional_fit_graph_mode(self): - with context.graph_mode(): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model) - - def benchmark_keras_model_functional_fit_graph_mode_with_profiler(self): - tf.profiler.experimental.start("") - with context.graph_mode(): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model) - tf.profiler.experimental.stop(save=False) - - def benchmark_keras_model_functional_fit_run_model_eagerly(self): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model, run_eagerly=True) - - def benchmark_keras_model_functional_fit_run_model_eagerly_with_profiler( - self): - tf.profiler.experimental.start("") - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model, run_eagerly=True) - tf.profiler.experimental.stop(save=False) - - def benchmark_keras_model_sequential_fit(self): - model = make_sequential_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model) - - def benchmark_keras_model_sequential_fit_graph_mode(self): - with context.graph_mode(): - model = make_sequential_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model) - - def benchmark_keras_model_sequential_fit_run_model_eagerly(self): - model = make_sequential_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_fit(model, run_eagerly=True) - - def benchmark_keras_model_subclassed_evaluate(self): - model = SubclassedKerasModel(initializer="glorot_uniform") - self._benchmark_keras_model_evaluate(model) - - def benchmark_keras_model_subclassed_evaluate_run_model_eagerly(self): - model = SubclassedKerasModel(initializer="glorot_uniform") - self._benchmark_keras_model_evaluate(model, run_eagerly=True) - - def benchmark_keras_model_functional_evaluate(self): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_evaluate(model) - - def benchmark_keras_model_functional_evaluate_run_model_eagerly(self): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_evaluate(model, run_eagerly=True) - - def benchmark_keras_model_sequential_evaluate(self): - model = make_sequential_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_evaluate(model) - - def benchmark_keras_model_sequential_evaluate_run_model_eagerly(self): - model = make_sequential_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_evaluate(model, run_eagerly=True) - - def benchmark_keras_model_subclassed_predict(self): - model = SubclassedKerasModel(initializer="glorot_uniform") - self._benchmark_keras_model_predict(model) - - def 
benchmark_keras_model_subclassed_predict_run_model_eagerly(self): - model = SubclassedKerasModel(initializer="glorot_uniform") - self._benchmark_keras_model_predict(model, run_eagerly=True) - - def benchmark_keras_model_functional_predict(self): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_predict(model) - - def benchmark_keras_model_functional_predict_run_model_eagerly(self): - model = make_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_predict(model, run_eagerly=True) - - def benchmark_keras_model_sequential_predict(self): - model = make_sequential_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_predict(model) - - def benchmark_keras_model_sequential_predict_run_model_eagerly(self): - model = make_sequential_keras_model(initializer="glorot_uniform") - self._benchmark_keras_model_predict(model, run_eagerly=True) + def benchmark_keras_model_sequential_predict_run_model_eagerly(self): + model = make_sequential_keras_model(initializer="glorot_uniform") + self._benchmark_keras_model_predict(model, run_eagerly=True) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/benchmarks/model_memory_profile.py b/keras/benchmarks/model_memory_profile.py index 04877e0d98f0..927c5fdb5943 100644 --- a/keras/benchmarks/model_memory_profile.py +++ b/keras/benchmarks/model_memory_profile.py @@ -20,58 +20,54 @@ 3. Add the model function to the dict `models`. """ -import tensorflow.compat.v2 as tf - +import numpy as np from absl import app from absl import flags - from absl import logging -import numpy as np + +import keras try: - import memory_profiler # pylint:disable=g-import-not-at-top + import memory_profiler except ImportError: - memory_profiler = None + memory_profiler = None FLAGS = flags.FLAGS -flags.DEFINE_string('model', None, - 'The model to run memory profiler.') - - -@memory_profiler.profile -def _imdb_lstm_model(): - """LSTM model.""" - x_train = np.random.randint(0, 1999, size=(2500, 100)) - y_train = np.random.random((2500, 1)) - - # IMDB LSTM model. - model = tf.keras.Sequential() - model.add(tf.keras.layers.Embedding(20000, 128)) - model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2)) - model.add(tf.keras.layers.Dense(1, activation='sigmoid')) - - model.compile('sgd', 'mse') - # Warm up the model with one epoch. - model.fit(x_train, y_train, batch_size=512, epochs=3) +flags.DEFINE_string("model", None, "The model to run memory profiler.") def main(_): - # Add the model for memory profile. - models = { - 'lstm': _imdb_lstm_model, - } - - if FLAGS.model in models: - logging.info('Run memory profile on %s.', FLAGS.model) - run_model = models[FLAGS.model] - run_model() - else: - logging.info('The model does not exist. Please verify the model name.') - - -if __name__ == '__main__': - flags.mark_flags_as_required(['model']) - if memory_profiler: - app.run(main) - + @memory_profiler.profile + def _imdb_lstm_model(): + """LSTM model.""" + x_train = np.random.randint(0, 1999, size=(2500, 100)) + y_train = np.random.random((2500, 1)) + + # IMDB LSTM model. + model = keras.Sequential() + model.add(keras.layers.Embedding(20000, 128)) + model.add(keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2)) + model.add(keras.layers.Dense(1, activation="sigmoid")) + + model.compile("sgd", "mse") + # Warm up the model with one epoch. + model.fit(x_train, y_train, batch_size=512, epochs=3) + + # Add the model for memory profile. 
+ models = { + "lstm": _imdb_lstm_model, + } + + if FLAGS.model in models: + logging.info("Run memory profile on %s.", FLAGS.model) + run_model = models[FLAGS.model] + run_model() + else: + logging.info("The model does not exist. Please verify the model name.") + + +if __name__ == "__main__": + flags.mark_flags_as_required(["model"]) + if memory_profiler: + app.run(main) diff --git a/keras/benchmarks/optimizer_benchmarks_test.py b/keras/benchmarks/optimizer_benchmarks_test.py index 2b50f8a54710..7156a1fa7137 100644 --- a/keras/benchmarks/optimizer_benchmarks_test.py +++ b/keras/benchmarks/optimizer_benchmarks_test.py @@ -17,67 +17,77 @@ import tensorflow.compat.v2 as tf from keras.benchmarks import benchmark_util -from keras.optimizers.optimizer_v2 import adam -from tensorflow.python.platform.benchmark import ParameterizedBenchmark +from keras.optimizers.legacy import adam + +# isort: off +from tensorflow.python.platform.benchmark import ( + ParameterizedBenchmark, +) def bidirect_imdb_lstm_config(): - """Bidirectional LSTM model and IMDB data.""" + """Bidirectional LSTM model and IMDB data.""" - def model_fn(): - inputs = tf.keras.Input(shape=(None,), dtype="int32") - x = tf.keras.layers.Embedding(20000, 128)(inputs) - x = tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM(64, return_sequences=True))( - x) - x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x) - outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x) - model = tf.keras.Model(inputs, outputs) - return model + def model_fn(): + inputs = tf.keras.Input(shape=(None,), dtype="int32") + x = tf.keras.layers.Embedding(20000, 128)(inputs) + x = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(64, return_sequences=True) + )(x) + x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x) + outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x) + model = tf.keras.Model(inputs, outputs) + return model - (x_train, y_train), _ = tf.keras.datasets.imdb.load_data(num_words=20000) - x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=200) + (x_train, y_train), _ = tf.keras.datasets.imdb.load_data(num_words=20000) + x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=200) - return model_fn, x_train, y_train + return model_fn, x_train, y_train class KerasOptimizerBenchmark( - tf.test.Benchmark, metaclass=ParameterizedBenchmark): - """Keras optimizer benchmarks.""" + tf.test.Benchmark, metaclass=ParameterizedBenchmark +): + """Keras optimizer benchmarks.""" - # The parameter of each benchmark test is a tuple, and the first one is - # the optimizer name. - _benchmark_parameters = benchmark_util.generate_benchmark_params_cpu_gpu([ - ("Adam", tf.keras.optimizers.Adam(), 10), - ("NonFusedAdam", adam.NonFusedAdam(), 10), - ]) + # The parameter of each benchmark test is a tuple, and the first one is + # the optimizer name. + _benchmark_parameters = benchmark_util.generate_benchmark_params_cpu_gpu( + [ + ("Adam", tf.keras.optimizers.Adam(), 10), + ("NonFusedAdam", adam.NonFusedAdam(), 10), + ] + ) - def benchmark_optimizer(self, optimizer, num_iters): - """Optimizer benchmark with Bidirectional LSTM model on IMDB data. + def benchmark_optimizer(self, optimizer, num_iters): + """Optimizer benchmark with Bidirectional LSTM model on IMDB data. - Args: - optimizer: The optimizer instance to be benchmarked. - num_iters: The number of iterations to run for performance measurement. 
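The `KerasOptimizerBenchmark` class above leans on the `ParameterizedBenchmark` metaclass: each tuple in `_benchmark_parameters` is expanded into its own generated benchmark method, with the first element used as a name suffix and the remaining elements forwarded as arguments. The real metaclass lives in `tensorflow.python.platform.benchmark`; the sketch below is a simplified stand-in for the mechanism, not its actual implementation:

```python
class ParameterizedMeta(type):
    """Toy version of ParameterizedBenchmark's method generation."""

    def __new__(mcs, name, bases, attrs):
        for entry in attrs.pop("_benchmark_parameters", []):
            suffix, *args = entry

            def make_method(bound_args):
                def benchmark(self):
                    # Delegate to the single parameterized body.
                    return self.benchmark_optimizer(*bound_args)

                return benchmark

            # One generated method per tuple (naming scheme illustrative).
            attrs[f"benchmark_optimizer_{suffix}"] = make_method(args)
        return super().__new__(mcs, name, bases, attrs)


class MyBenchmarks(metaclass=ParameterizedMeta):
    _benchmark_parameters = [("double", 2), ("triple", 3)]

    def benchmark_optimizer(self, factor):
        return factor * 21


print(MyBenchmarks().benchmark_optimizer_double())  # 42
print(MyBenchmarks().benchmark_optimizer_triple())  # 63
```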
- """ - model, train_x, train_y = bidirect_imdb_lstm_config() - metrics, wall_time, extras = benchmark_util.measure_performance( - model, - x=train_x, - y=train_y, - batch_size=512, - optimizer=optimizer, - loss="binary_crossentropy", - metrics=["accuracy"]) - name = benchmark_util.get_benchmark_name(self._get_name()) - metadata = { - "implementation": name[0], - "model_name": "optimizers", - "parameters": "lstm.512", - } - extras.update(metadata) - self.report_benchmark( - iters=num_iters, wall_time=wall_time, metrics=metrics, extras=extras) + Args: + optimizer: The optimizer instance to be benchmarked. + num_iters: The number of iterations to run for performance + measurement. + """ + model, train_x, train_y = bidirect_imdb_lstm_config() + metrics, wall_time, extras = benchmark_util.measure_performance( + model, + x=train_x, + y=train_y, + batch_size=512, + optimizer=optimizer, + loss="binary_crossentropy", + metrics=["accuracy"], + ) + name = benchmark_util.get_benchmark_name(self._get_name()) + metadata = { + "implementation": name[0], + "model_name": "optimizers", + "parameters": "lstm.512", + } + extras.update(metadata) + self.report_benchmark( + iters=num_iters, wall_time=wall_time, metrics=metrics, extras=extras + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/BUILD b/keras/benchmarks/saved_model_benchmarks/BUILD index 01b3df2d30ef..408dd37c96e3 100644 --- a/keras/benchmarks/saved_model_benchmarks/BUILD +++ b/keras/benchmarks/saved_model_benchmarks/BUILD @@ -1,9 +1,11 @@ # Description: # Implementation of Keras benchmarks. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = ["//visibility:public"], licenses = ["notice"], ) diff --git a/keras/benchmarks/saved_model_benchmarks/densenet_benchmark_test.py b/keras/benchmarks/saved_model_benchmarks/densenet_benchmark_test.py index 52c81e633cdc..bcc94015baf7 100644 --- a/keras/benchmarks/saved_model_benchmarks/densenet_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/densenet_benchmark_test.py @@ -19,25 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_densenet_201(self): - app = tf.keras.applications.DenseNet201 - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - wall_time=load_result['wall_time'], - name=load_result['name']) - -if __name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_densenet_201(self): + app = tf.keras.applications.DenseNet201 + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/efficientnet_benchmark_test.py 
b/keras/benchmarks/saved_model_benchmarks/efficientnet_benchmark_test.py index 5c0dabb6a1f6..62707cdcf776 100644 --- a/keras/benchmarks/saved_model_benchmarks/efficientnet_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/efficientnet_benchmark_test.py @@ -19,25 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_efficient_net_b7(self): - app = tf.keras.applications.EfficientNetB7 - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - wall_time=load_result['wall_time'], - name=load_result['name']) - -if __name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_efficient_net_b7(self): + app = tf.keras.applications.EfficientNetB7 + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/inception_resnet_v2_benchmark_test.py b/keras/benchmarks/saved_model_benchmarks/inception_resnet_v2_benchmark_test.py index 0b489dd855c6..fd53786d7cc0 100644 --- a/keras/benchmarks/saved_model_benchmarks/inception_resnet_v2_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/inception_resnet_v2_benchmark_test.py @@ -19,26 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_inception_resnet_v2(self): - app = tf.keras.applications.InceptionResNetV2 - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - wall_time=load_result['wall_time'], - name=load_result['name']) - - -if __name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_inception_resnet_v2(self): + app = tf.keras.applications.InceptionResNetV2 + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/mobilenet_benchmark_test.py b/keras/benchmarks/saved_model_benchmarks/mobilenet_benchmark_test.py index de8eadfa6fb0..bb00e7da03f3 100644 --- a/keras/benchmarks/saved_model_benchmarks/mobilenet_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/mobilenet_benchmark_test.py @@ -19,25 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks 
import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_mobilenet_v2(self): - app = tf.keras.applications.MobileNetV2 - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - wall_time=load_result['wall_time'], - name=load_result['name']) - -if __name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_mobilenet_v2(self): + app = tf.keras.applications.MobileNetV2 + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/nasnet_large_benchmark_test.py b/keras/benchmarks/saved_model_benchmarks/nasnet_large_benchmark_test.py index bd9e41c0bc60..cd97d1d53153 100644 --- a/keras/benchmarks/saved_model_benchmarks/nasnet_large_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/nasnet_large_benchmark_test.py @@ -19,25 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_nasnet_large(self): - app = tf.keras.applications.NASNetLarge - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - wall_time=load_result['wall_time'], - name=load_result['name']) - -if __name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_nasnet_large(self): + app = tf.keras.applications.NASNetLarge + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/resnet152_v2_benchmark_test.py b/keras/benchmarks/saved_model_benchmarks/resnet152_v2_benchmark_test.py index 5bada695c99e..bab2f5a60d35 100644 --- a/keras/benchmarks/saved_model_benchmarks/resnet152_v2_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/resnet152_v2_benchmark_test.py @@ -19,26 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_resnet152_v2(self): - app = tf.keras.applications.ResNet152V2 - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - 
wall_time=load_result['wall_time'], - name=load_result['name']) - - -if __name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_resnet152_v2(self): + app = tf.keras.applications.ResNet152V2 + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/saved_model_benchmark_util.py b/keras/benchmarks/saved_model_benchmarks/saved_model_benchmark_util.py index 692646749a6a..62271f0b7189 100644 --- a/keras/benchmarks/saved_model_benchmarks/saved_model_benchmark_util.py +++ b/keras/benchmarks/saved_model_benchmarks/saved_model_benchmark_util.py @@ -18,50 +18,51 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import tempfile import time +import tensorflow.compat.v2 as tf + +import keras -def save_and_load_benchmark(app): - """Util for saved model benchmarks.""" - trials = 3 - model = app(weights=None) - model_name = app.__name__ +def save_and_load_benchmark(app): + """Util for saved model benchmarks.""" + trials = 3 - tmp_dir = tf.compat.v1.test.get_temp_dir() - tf.io.gfile.makedirs(tmp_dir) - save_dir = tempfile.mkdtemp(dir=tmp_dir) + model = app(weights=None) + model_name = app.__name__ - total_save_time = 0 - total_load_time = 0 + tmp_dir = tf.compat.v1.test.get_temp_dir() + tf.io.gfile.makedirs(tmp_dir) + save_dir = tempfile.mkdtemp(dir=tmp_dir) - # Run one untimed iteration of saving/loading. - model.save(save_dir, save_format='tf') - tf.keras.models.load_model(save_dir) + total_save_time = 0 + total_load_time = 0 - for _ in range(trials): - start_time = time.time() - model.save(save_dir, save_format='tf') - total_save_time += time.time() - start_time + # Run one untimed iteration of saving/loading. 
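`save_and_load_benchmark` times the two operations the same way the eager benchmarks do: one untimed save/load round to absorb one-time costs, then wall time averaged over `trials` iterations. A standalone sketch under the same scheme (paths and trial count illustrative; `save_format="tf"` matches the utility):

```python
import tempfile
import time

import tensorflow as tf


def time_save_load(model, trials=3):
    save_dir = tempfile.mkdtemp()

    # Untimed warm-up: the first save/load pair pays tracing and
    # file-creation costs that would skew the average.
    model.save(save_dir, save_format="tf")
    tf.keras.models.load_model(save_dir)

    total_save = total_load = 0.0
    for _ in range(trials):
        start = time.time()
        model.save(save_dir, save_format="tf")
        total_save += time.time() - start

        start = time.time()
        tf.keras.models.load_model(save_dir)
        total_load += time.time() - start

    return total_save / trials, total_load / trials


model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(4,))])
save_s, load_s = time_save_load(model)
print(f"save: {save_s:.3f}s  load: {load_s:.3f}s")
```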
+ model.save(save_dir, save_format="tf") + keras.models.load_model(save_dir) - start_time = time.time() - tf.keras.models.load_model(save_dir) - total_load_time += time.time() - start_time + for _ in range(trials): + start_time = time.time() + model.save(save_dir, save_format="tf") + total_save_time += time.time() - start_time - save_result = { - 'iters': trials, - 'wall_time': total_save_time / trials, - 'name': '{}.save'.format(model_name) - } + start_time = time.time() + keras.models.load_model(save_dir) + total_load_time += time.time() - start_time - load_result = { - 'iters': trials, - 'wall_time': total_load_time / trials, - 'name': '{}.load'.format(model_name) - } - tf.compat.v1.gfile.DeleteRecursively(save_dir) - return save_result, load_result + save_result = { + "iters": trials, + "wall_time": total_save_time / trials, + "name": f"{model_name}.save", + } + load_result = { + "iters": trials, + "wall_time": total_load_time / trials, + "name": f"{model_name}.load", + } + tf.compat.v1.gfile.DeleteRecursively(save_dir) + return save_result, load_result diff --git a/keras/benchmarks/saved_model_benchmarks/vgg_benchmark_test.py b/keras/benchmarks/saved_model_benchmarks/vgg_benchmark_test.py index 246596dbecac..cdb044a1fcb0 100644 --- a/keras/benchmarks/saved_model_benchmarks/vgg_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/vgg_benchmark_test.py @@ -19,26 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_vgg19(self): - app = tf.keras.applications.VGG19 - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - wall_time=load_result['wall_time'], - name=load_result['name']) - - -if __name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_vgg19(self): + app = tf.keras.applications.VGG19 + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/benchmarks/saved_model_benchmarks/xception_benchmark_test.py b/keras/benchmarks/saved_model_benchmarks/xception_benchmark_test.py index 627ccc9cb3cf..ca9eb7c63060 100644 --- a/keras/benchmarks/saved_model_benchmarks/xception_benchmark_test.py +++ b/keras/benchmarks/saved_model_benchmarks/xception_benchmark_test.py @@ -19,26 +19,30 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.benchmarks.saved_model_benchmarks import saved_model_benchmark_util class BenchmarkSaveApplications(tf.test.Benchmark): - - def benchmark_save_and_load_xception(self): - app = tf.keras.applications.Xception - save_result, load_result = ( - saved_model_benchmark_util.save_and_load_benchmark(app)) - - self.report_benchmark( - iters=save_result['iters'], - wall_time=save_result['wall_time'], - name=save_result['name']) - - self.report_benchmark( - iters=load_result['iters'], - wall_time=load_result['wall_time'], - name=load_result['name']) - - -if 
__name__ == '__main__': - tf.test.main() + def benchmark_save_and_load_xception(self): + app = tf.keras.applications.Xception + ( + save_result, + load_result, + ) = saved_model_benchmark_util.save_and_load_benchmark(app) + + self.report_benchmark( + iters=save_result["iters"], + wall_time=save_result["wall_time"], + name=save_result["name"], + ) + + self.report_benchmark( + iters=load_result["iters"], + wall_time=load_result["wall_time"], + name=load_result["name"], + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/callbacks.py b/keras/callbacks.py index 47081d3d3c48..bc5a3080512a 100644 --- a/keras/callbacks.py +++ b/keras/callbacks.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-import-not-at-top -# pylint: disable=g-classes-have-attributes + + """Callbacks: utilities called at certain points during model training.""" import collections @@ -25,10 +25,13 @@ import sys import time +import numpy as np +import tensorflow.compat.v2 as tf from keras import backend from keras.distribute import distributed_file_utils from keras.distribute import worker_training_state +from keras.optimizers import optimizer from keras.optimizers.schedules import learning_rate_schedule from keras.utils import generic_utils from keras.utils import io_utils @@ -37,2893 +40,3269 @@ from keras.utils.data_utils import Sequence from keras.utils.generic_utils import Progbar from keras.utils.mode_keys import ModeKeys -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls try: - import requests + import requests except ImportError: - requests = None + requests = None # Note: `configure_callbacks` is only used in TF1. -def configure_callbacks(callbacks, - model, - do_validation=False, - batch_size=None, - epochs=None, - steps_per_epoch=None, - samples=None, - verbose=1, - count_mode='steps', - mode=ModeKeys.TRAIN): - """Configures callbacks for use in various training loops. - - Args: - callbacks: List of Callbacks. - model: Model being trained. - do_validation: Whether or not validation loop will be run. - batch_size: Number of samples per batch. - epochs: Number of epoch to train. - steps_per_epoch: Number of batches to run per training epoch. - samples: Number of training samples. - verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger. - count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count. - mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT. - Which loop mode to configure callbacks for. - - Returns: - Instance of CallbackList used to control all Callbacks. - """ - # Check if callbacks have already been configured. - if isinstance(callbacks, CallbackList): - return callbacks - - if not callbacks: - callbacks = [] - - # Add additional callbacks during training. 
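The `try`/`except ImportError` guard kept at the top of callbacks.py is the standard optional-dependency pattern: attempt the import once at module load, leave the name bound to `None` on failure, and let callers check before use. A minimal sketch (`post_logs` is a hypothetical example, not a Keras API):

```python
try:
    import requests
except ImportError:
    requests = None


def post_logs(url, logs):
    # Fail with a clear message only when the optional feature is used.
    if requests is None:
        raise ImportError("This feature requires the `requests` library.")
    requests.post(url, json=logs)
```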
- if mode == ModeKeys.TRAIN: - model.history = History() - callbacks = [BaseLogger()] + (callbacks or []) + [model.history] - if verbose: - callbacks.append(ProgbarLogger(count_mode)) - callback_list = CallbackList(callbacks) - - # Set callback model - callback_model = model._get_callback_model() # pylint: disable=protected-access - callback_list.set_model(callback_model) - - set_callback_parameters( - callback_list, - model, - do_validation=do_validation, - batch_size=batch_size, - epochs=epochs, - steps_per_epoch=steps_per_epoch, - samples=samples, - verbose=verbose, - mode=mode) - - callback_list.model.stop_training = False - return callback_list - - -def set_callback_parameters(callback_list, - model, - do_validation=False, - batch_size=None, - epochs=None, - steps_per_epoch=None, - samples=None, - verbose=1, - mode=ModeKeys.TRAIN): - """Sets callback parameters. - - Args: - callback_list: CallbackList instance. - model: Model being trained. - do_validation: Whether or not validation loop will be run. - batch_size: Number of samples per batch. - epochs: Number of epoch to train. - steps_per_epoch: Number of batches to run per training epoch. - samples: Number of training samples. - verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger. - mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT. - Which loop mode to configure callbacks for. - """ - metric_names = model.metrics_names - for cbk in callback_list: - if isinstance(cbk, (BaseLogger, ProgbarLogger)): - cbk.stateful_metrics = metric_names[1:] # Exclude `loss` - - # Set callback parameters - callback_metrics = [] - # When we have deferred build scenario with iterator input, we will compile - # when we standardize first batch of data. - if mode != ModeKeys.PREDICT: - callback_metrics = copy.copy(metric_names) - if do_validation: - callback_metrics += ['val_' + n for n in metric_names] - callback_params = { - 'batch_size': batch_size, - 'epochs': epochs, - 'steps': steps_per_epoch, - 'samples': samples, - 'verbose': verbose, - 'do_validation': do_validation, - 'metrics': callback_metrics, - } - callback_list.set_params(callback_params) - - -def _is_generator_like(data): - """Checks if data is a generator, Sequence, or Iterator.""" - return (hasattr(data, '__next__') or hasattr(data, 'next') or isinstance( - data, (Sequence, tf.compat.v1.data.Iterator, tf.data.Iterator))) - - -def make_logs(model, logs, outputs, mode, prefix=''): - """Computes logs for sending to `on_batch_end` methods.""" - metric_names = model.metrics_names - if mode in {ModeKeys.TRAIN, ModeKeys.TEST} and metric_names: - for label, output in zip(metric_names, outputs): - logs[prefix + label] = output - else: - logs['outputs'] = outputs - return logs - - -@keras_export('keras.callbacks.CallbackList') -class CallbackList: - """Container abstracting a list of callbacks.""" - - def __init__(self, - callbacks=None, - add_history=False, - add_progbar=False, - model=None, - **params): - """Container for `Callback` instances. - - This object wraps a list of `Callback` instances, making it possible - to call them all at once via a single endpoint - (e.g. `callback_list.on_epoch_end(...)`). +def configure_callbacks( + callbacks, + model, + do_validation=False, + batch_size=None, + epochs=None, + steps_per_epoch=None, + samples=None, + verbose=1, + count_mode="steps", + mode=ModeKeys.TRAIN, +): + """Configures callbacks for use in various training loops. Args: - callbacks: List of `Callback` instances. 
- add_history: Whether a `History` callback should be added, if one does not - already exist in the `callbacks` list. - add_progbar: Whether a `ProgbarLogger` callback should be added, if one - does not already exist in the `callbacks` list. - model: The `Model` these callbacks are used with. - **params: If provided, parameters will be passed to each `Callback` via - `Callback.set_params`. - """ - self.callbacks = tf.nest.flatten(callbacks) if callbacks else [] - self._add_default_callbacks(add_history, add_progbar) - - if model: - self.set_model(model) - if params: - self.set_params(params) - - # Performance optimization: determines if batch hooks need to be called. - # pylint: disable=protected-access - self._supports_tf_logs = all( - getattr(cb, '_supports_tf_logs', False) for cb in self.callbacks) - self._batch_hooks_support_tf_logs = all( - getattr(cb, '_supports_tf_logs', False) - for cb in self.callbacks - if cb._implements_train_batch_hooks() or cb - ._implements_test_batch_hooks() or cb._implements_predict_batch_hooks()) - - self._should_call_train_batch_hooks = any( - cb._implements_train_batch_hooks() for cb in self.callbacks) - self._should_call_test_batch_hooks = any( - cb._implements_test_batch_hooks() for cb in self.callbacks) - self._should_call_predict_batch_hooks = any( - cb._implements_predict_batch_hooks() for cb in self.callbacks) - # pylint: enable=protected-access - - self._disallow_batch_hooks_in_ps_strategy() - - # Performance check: Check batch hooks for slowness compared to batch time. - # Only run check for custom callbacks (i.e. not present in this file). - self._check_timing = any( - cbk.__class__.__name__ not in globals() for cbk in self.callbacks) - self._num_batches_for_timing_check = 5 - self._hook_times = {} - self._batch_start_time = None - self._batch_times = [] - - def _add_default_callbacks(self, add_history, add_progbar): - """Adds `Callback`s that are always present.""" - self._progbar = None - self._history = None - - for cb in self.callbacks: - if isinstance(cb, ProgbarLogger): - self._progbar = cb - elif isinstance(cb, History): - self._history = cb - - if self._history is None and add_history: - self._history = History() - self.callbacks.append(self._history) - - if self._progbar is None and add_progbar: - self._progbar = ProgbarLogger(count_mode='steps') - self.callbacks.append(self._progbar) - - def _process_logs(self, logs, is_batch_hook=False): - """Turns tensors into numpy arrays or Python scalars if necessary.""" - if logs is None: - return {} - if self._supports_tf_logs: - return logs - if is_batch_hook and self._batch_hooks_support_tf_logs: - return logs - return tf_utils.sync_to_numpy_or_python_type(logs) - - def append(self, callback): - self.callbacks.append(callback) - - def set_params(self, params): - self.params = params - for callback in self.callbacks: - callback.set_params(params) - - def set_model(self, model): - self.model = model - if self._history: - model.history = self._history - for callback in self.callbacks: - callback.set_model(model) - - def _call_batch_hook(self, mode, hook, batch, logs=None): - """Helper function for all batch_{begin | end} methods.""" - if not self.callbacks: - return - - if hook == 'begin': - self._call_batch_begin_hook(mode, batch, logs) - elif hook == 'end': - self._call_batch_end_hook(mode, batch, logs) - else: - raise ValueError( - f'Unrecognized hook: {hook}. 
Expected values are ["begin", "end"]') - - def _call_batch_begin_hook(self, mode, batch, logs): - """Helper function for `on_*_batch_begin` methods.""" - hook_name = 'on_{mode}_batch_begin'.format(mode=mode) - self._call_batch_hook_helper(hook_name, batch, logs) - - if self._check_timing: - self._batch_start_time = time.time() - - def _call_batch_end_hook(self, mode, batch, logs): - """Helper function for `on_*_batch_end` methods.""" - hook_name = 'on_{mode}_batch_end'.format(mode=mode) - - if self._check_timing and batch >= 1: - batch_time = time.time() - self._batch_start_time - self._batch_times.append(batch_time) - - self._call_batch_hook_helper(hook_name, batch, logs) - - if len(self._batch_times) >= self._num_batches_for_timing_check: - end_hook_name = hook_name - begin_hook_name = 'on_{mode}_batch_begin'.format(mode=mode) - avg_batch_time = sum(self._batch_times) / len(self._batch_times) - avg_end_hook_time = sum(self._hook_times[end_hook_name]) / len( - self._hook_times[end_hook_name]) - avg_begin_hook_time = sum(self._hook_times[begin_hook_name]) / len( - self._hook_times[begin_hook_name]) - - threshold_time = 1.0 * avg_batch_time - warning_msg = ('Callback method `{hook}` is slow compared to ' - 'the batch time (batch time: {batch_time:.4f}s vs ' - '`{hook}` time: {hook_time:.4f}s). Check your callbacks.') - if avg_begin_hook_time > threshold_time: - logging.warning(warning_msg.format( - hook=begin_hook_name, - batch_time=avg_batch_time, - hook_time=avg_begin_hook_time)) - if avg_end_hook_time > threshold_time: - logging.warning(warning_msg.format( - hook=end_hook_name, - batch_time=avg_batch_time, - hook_time=avg_end_hook_time)) - self._check_timing = False - self._batch_start_time = None - self._batch_times = [] - self._hook_times = {} - - def _call_batch_hook_helper(self, hook_name, batch, logs): - """Helper function for `on_*_batch_*` methods.""" - if self._check_timing: - start_time = time.time() - - logs = self._process_logs(logs, is_batch_hook=True) - for callback in self.callbacks: - hook = getattr(callback, hook_name) - hook(batch, logs) - - if self._check_timing: - if hook_name not in self._hook_times: - self._hook_times[hook_name] = [] - self._hook_times[hook_name].append(time.time() - start_time) - - def _call_begin_hook(self, mode): - """Helper function for on_{train|test|predict}_begin methods.""" - if mode == ModeKeys.TRAIN: - self.on_train_begin() - elif mode == ModeKeys.TEST: - self.on_test_begin() - else: - self.on_predict_begin() - - def _call_end_hook(self, mode): - """Helper function for on_{train|test|predict}_end methods.""" - if mode == ModeKeys.TRAIN: - self.on_train_end() - elif mode == ModeKeys.TEST: - self.on_test_end() - else: - self.on_predict_end() - - def on_batch_begin(self, batch, logs=None): - if self._should_call_train_batch_hooks: - self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs) + callbacks: List of Callbacks. + model: Model being trained. + do_validation: Whether or not validation loop will be run. + batch_size: Number of samples per batch. + epochs: Number of epoch to train. + steps_per_epoch: Number of batches to run per training epoch. + samples: Number of training samples. + verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger. + count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count. + mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT. + Which loop mode to configure callbacks for. 
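Both the old and new `_call_batch_hook_helper` resolve the hook dynamically: the method name is assembled from the mode string and fetched with `getattr`, so one helper serves every `on_{train,test,predict}_batch_{begin,end}` combination. A toy sketch of that dispatch (the callback stub here is a stand-in, not the real class):

```python
class CallbackStub:
    def on_train_batch_begin(self, batch, logs=None):
        print(f"train batch {batch} begin, logs={logs}")


def call_batch_hook(callbacks, mode, hook, batch, logs=None):
    hook_name = f"on_{mode}_batch_{hook}"  # e.g. "on_train_batch_begin"
    for callback in callbacks:
        getattr(callback, hook_name)(batch, logs)


call_batch_hook([CallbackStub()], "train", "begin", 0, logs={"loss": 0.2})
```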
- def on_batch_end(self, batch, logs=None): - if self._should_call_train_batch_hooks: - self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs) - - def on_epoch_begin(self, epoch, logs=None): - """Calls the `on_epoch_begin` methods of its callbacks. - - This function should only be called during TRAIN mode. - - Args: - epoch: Integer, index of epoch. - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. + Returns: + Instance of CallbackList used to control all Callbacks. """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_epoch_begin(epoch, logs) + # Check if callbacks have already been configured. + if isinstance(callbacks, CallbackList): + return callbacks - def on_epoch_end(self, epoch, logs=None): - """Calls the `on_epoch_end` methods of its callbacks. + if not callbacks: + callbacks = [] - This function should only be called during TRAIN mode. + # Add additional callbacks during training. + if mode == ModeKeys.TRAIN: + model.history = History() + callbacks = [BaseLogger()] + (callbacks or []) + [model.history] + if verbose: + callbacks.append(ProgbarLogger(count_mode)) + callback_list = CallbackList(callbacks) + + # Set callback model + callback_model = model._get_callback_model() + callback_list.set_model(callback_model) + + set_callback_parameters( + callback_list, + model, + do_validation=do_validation, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + samples=samples, + verbose=verbose, + mode=mode, + ) + + callback_list.model.stop_training = False + return callback_list + + +def set_callback_parameters( + callback_list, + model, + do_validation=False, + batch_size=None, + epochs=None, + steps_per_epoch=None, + samples=None, + verbose=1, + mode=ModeKeys.TRAIN, +): + """Sets callback parameters. Args: - epoch: Integer, index of epoch. - logs: Dict, metric results for this training epoch, and for the - validation epoch if validation is performed. Validation result keys - are prefixed with `val_`. + callback_list: CallbackList instance. + model: Model being trained. + do_validation: Whether or not validation loop will be run. + batch_size: Number of samples per batch. + epochs: Number of epoch to train. + steps_per_epoch: Number of batches to run per training epoch. + samples: Number of training samples. + verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger. + mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT. + Which loop mode to configure callbacks for. """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_epoch_end(epoch, logs) + metric_names = None + for cbk in callback_list: + if isinstance(cbk, (BaseLogger, ProgbarLogger)): + if not metric_names: + metric_names = model.metrics_names + cbk.stateful_metrics = metric_names[1:] # Exclude `loss` + + # Set callback parameters + callback_metrics = [] + # When we have deferred build scenario with iterator input, we will compile + # when we standardize first batch of data. 
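One behavioral change in the reworked `set_callback_parameters` is visible here: `model.metrics_names` is now looked up lazily, only when a `BaseLogger`/`ProgbarLogger` is present or when callback metrics are actually assembled (continued just below), and the result is reused. A toy sketch of that compute-once pattern (`FakeModel` is a hypothetical stand-in):

```python
def collect_callback_metrics(model, do_validation):
    metric_names = None

    def get_metric_names():
        nonlocal metric_names
        if not metric_names:
            metric_names = model.metrics_names  # queried at most once
        return metric_names

    callback_metrics = list(get_metric_names())
    if do_validation:
        callback_metrics += ["val_" + n for n in get_metric_names()]
    return callback_metrics


class FakeModel:
    metrics_names = ["loss", "accuracy"]


print(collect_callback_metrics(FakeModel(), do_validation=True))
# ['loss', 'accuracy', 'val_loss', 'val_accuracy']
```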
+ if mode != ModeKeys.PREDICT: + if not metric_names: + metric_names = model.metrics_names + callback_metrics = copy.copy(metric_names) + if do_validation: + callback_metrics += ["val_" + n for n in metric_names] + callback_params = { + "batch_size": batch_size, + "epochs": epochs, + "steps": steps_per_epoch, + "samples": samples, + "verbose": verbose, + "do_validation": do_validation, + "metrics": callback_metrics, + } + callback_list.set_params(callback_params) - def on_train_batch_begin(self, batch, logs=None): - """Calls the `on_train_batch_begin` methods of its callbacks. - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict, contains the return value of `model.train_step`. Typically, - the values of the `Model`'s metrics are returned. Example: - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - if self._should_call_train_batch_hooks: - self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs) +def _is_generator_like(data): + """Checks if data is a generator, Sequence, or Iterator.""" + return ( + hasattr(data, "__next__") + or hasattr(data, "next") + or isinstance( + data, (Sequence, tf.compat.v1.data.Iterator, tf.data.Iterator) + ) + ) + + +def make_logs(model, logs, outputs, mode, prefix=""): + """Computes logs for sending to `on_batch_end` methods.""" + metric_names = model.metrics_names + if mode in {ModeKeys.TRAIN, ModeKeys.TEST} and metric_names: + for label, output in zip(metric_names, outputs): + logs[prefix + label] = output + else: + logs["outputs"] = outputs + return logs - def on_train_batch_end(self, batch, logs=None): - """Calls the `on_train_batch_end` methods of its callbacks. - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Aggregated metric results up until this batch. - """ - if self._should_call_train_batch_hooks: - self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs) +@keras_export("keras.callbacks.CallbackList") +class CallbackList: + """Container abstracting a list of callbacks.""" + + def __init__( + self, + callbacks=None, + add_history=False, + add_progbar=False, + model=None, + **params, + ): + """Container for `Callback` instances. + + This object wraps a list of `Callback` instances, making it possible + to call them all at once via a single endpoint + (e.g. `callback_list.on_epoch_end(...)`). + + Args: + callbacks: List of `Callback` instances. + add_history: Whether a `History` callback should be added, if one does + not already exist in the `callbacks` list. + add_progbar: Whether a `ProgbarLogger` callback should be added, if + one does not already exist in the `callbacks` list. + model: The `Model` these callbacks are used with. + **params: If provided, parameters will be passed to each `Callback` + via `Callback.set_params`. + """ + self.callbacks = tf.nest.flatten(callbacks) if callbacks else [] + self._add_default_callbacks(add_history, add_progbar) + + if model: + self.set_model(model) + if params: + self.set_params(params) + + # Performance optimization: determines if batch hooks need to be called. 
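The flags this optimization computes (continued in the hunk just below) let `_process_logs` return tensor logs untouched whenever every relevant callback sets `_supports_tf_logs`, converting to numpy/Python scalars only when some callback needs it. A toy sketch of the short-circuit (the dict comprehension stands in for `tf_utils.sync_to_numpy_or_python_type`):

```python
class TensorAware:
    _supports_tf_logs = True


class Legacy:
    pass  # no _supports_tf_logs attribute: needs converted logs


def process_logs(callbacks, logs):
    if logs is None:
        return {}
    if all(getattr(cb, "_supports_tf_logs", False) for cb in callbacks):
        return logs  # every callback accepts tensors; skip conversion
    return {k: float(v) for k, v in logs.items()}  # conversion stand-in


print(process_logs([TensorAware()], {"loss": 0.2}))
print(process_logs([TensorAware(), Legacy()], {"loss": 0.2}))
```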
+ + self._supports_tf_logs = all( + getattr(cb, "_supports_tf_logs", False) for cb in self.callbacks + ) + self._batch_hooks_support_tf_logs = all( + getattr(cb, "_supports_tf_logs", False) + for cb in self.callbacks + if cb._implements_train_batch_hooks() + or cb._implements_test_batch_hooks() + or cb._implements_predict_batch_hooks() + ) + + self._should_call_train_batch_hooks = any( + cb._implements_train_batch_hooks() for cb in self.callbacks + ) + self._should_call_test_batch_hooks = any( + cb._implements_test_batch_hooks() for cb in self.callbacks + ) + self._should_call_predict_batch_hooks = any( + cb._implements_predict_batch_hooks() for cb in self.callbacks + ) + + self._disallow_batch_hooks_in_ps_strategy() + + # Performance check: Check batch hooks for slowness compared to batch + # time. Only run check for custom callbacks (i.e. not present in this + # file). + self._check_timing = any( + cbk.__class__.__name__ not in globals() for cbk in self.callbacks + ) + self._num_batches_for_timing_check = 5 + self._hook_times = {} + self._batch_start_time = None + self._batch_times = [] + + def _add_default_callbacks(self, add_history, add_progbar): + """Adds `Callback`s that are always present.""" + self._progbar = None + self._history = None + + for cb in self.callbacks: + if isinstance(cb, ProgbarLogger): + self._progbar = cb + elif isinstance(cb, History): + self._history = cb + + if self._history is None and add_history: + self._history = History() + self.callbacks.append(self._history) + + if self._progbar is None and add_progbar: + self._progbar = ProgbarLogger(count_mode="steps") + self.callbacks.append(self._progbar) + + def _process_logs(self, logs, is_batch_hook=False): + """Turns tensors into numpy arrays or Python scalars if necessary.""" + if logs is None: + return {} + if self._supports_tf_logs: + return logs + if is_batch_hook and self._batch_hooks_support_tf_logs: + return logs + return tf_utils.sync_to_numpy_or_python_type(logs) + + def append(self, callback): + self.callbacks.append(callback) + + def set_params(self, params): + self.params = params + for callback in self.callbacks: + callback.set_params(params) + + def set_model(self, model): + self.model = model + if self._history: + model.history = self._history + for callback in self.callbacks: + callback.set_model(model) + + def _call_batch_hook(self, mode, hook, batch, logs=None): + """Helper function for all batch_{begin | end} methods.""" + if not self.callbacks: + return + + if hook == "begin": + self._call_batch_begin_hook(mode, batch, logs) + elif hook == "end": + self._call_batch_end_hook(mode, batch, logs) + else: + raise ValueError( + f"Unrecognized hook: {hook}. 
" + 'Expected values are ["begin", "end"]' + ) + + def _call_batch_begin_hook(self, mode, batch, logs): + """Helper function for `on_*_batch_begin` methods.""" + hook_name = f"on_{mode}_batch_begin" + self._call_batch_hook_helper(hook_name, batch, logs) + + if self._check_timing: + self._batch_start_time = time.time() + + def _call_batch_end_hook(self, mode, batch, logs): + """Helper function for `on_*_batch_end` methods.""" + hook_name = f"on_{mode}_batch_end" + + if self._check_timing and batch >= 1: + batch_time = time.time() - self._batch_start_time + self._batch_times.append(batch_time) + + self._call_batch_hook_helper(hook_name, batch, logs) + + if len(self._batch_times) >= self._num_batches_for_timing_check: + end_hook_name = hook_name + begin_hook_name = f"on_{mode}_batch_begin" + avg_batch_time = sum(self._batch_times) / len(self._batch_times) + avg_end_hook_time = sum(self._hook_times[end_hook_name]) / len( + self._hook_times[end_hook_name] + ) + avg_begin_hook_time = sum(self._hook_times[begin_hook_name]) / len( + self._hook_times[begin_hook_name] + ) + + threshold_time = 1.0 * avg_batch_time + warning_msg = ( + "Callback method `{hook}` is slow compared to " + "the batch time (batch time: {batch_time:.4f}s vs " + "`{hook}` time: {hook_time:.4f}s). Check your callbacks." + ) + if avg_begin_hook_time > threshold_time: + logging.warning( + warning_msg.format( + hook=begin_hook_name, + batch_time=avg_batch_time, + hook_time=avg_begin_hook_time, + ) + ) + if avg_end_hook_time > threshold_time: + logging.warning( + warning_msg.format( + hook=end_hook_name, + batch_time=avg_batch_time, + hook_time=avg_end_hook_time, + ) + ) + self._check_timing = False + self._batch_start_time = None + self._batch_times = [] + self._hook_times = {} + + def _call_batch_hook_helper(self, hook_name, batch, logs): + """Helper function for `on_*_batch_*` methods.""" + if self._check_timing: + start_time = time.time() + + logs = self._process_logs(logs, is_batch_hook=True) + for callback in self.callbacks: + hook = getattr(callback, hook_name) + hook(batch, logs) + + if self._check_timing: + if hook_name not in self._hook_times: + self._hook_times[hook_name] = [] + self._hook_times[hook_name].append(time.time() - start_time) + + def _call_begin_hook(self, mode): + """Helper function for on_{train|test|predict}_begin methods.""" + if mode == ModeKeys.TRAIN: + self.on_train_begin() + elif mode == ModeKeys.TEST: + self.on_test_begin() + else: + self.on_predict_begin() + + def _call_end_hook(self, mode): + """Helper function for on_{train|test|predict}_end methods.""" + if mode == ModeKeys.TRAIN: + self.on_train_end() + elif mode == ModeKeys.TEST: + self.on_test_end() + else: + self.on_predict_end() + + def on_batch_begin(self, batch, logs=None): + if self._should_call_train_batch_hooks: + self._call_batch_hook(ModeKeys.TRAIN, "begin", batch, logs=logs) + + def on_batch_end(self, batch, logs=None): + if self._should_call_train_batch_hooks: + self._call_batch_hook(ModeKeys.TRAIN, "end", batch, logs=logs) + + def on_epoch_begin(self, epoch, logs=None): + """Calls the `on_epoch_begin` methods of its callbacks. + + This function should only be called during TRAIN mode. + + Args: + epoch: Integer, index of epoch. + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. 
+ """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_epoch_begin(epoch, logs) + + def on_epoch_end(self, epoch, logs=None): + """Calls the `on_epoch_end` methods of its callbacks. + + This function should only be called during TRAIN mode. + + Args: + epoch: Integer, index of epoch. + logs: Dict, metric results for this training epoch, and for the + validation epoch if validation is performed. Validation result + keys are prefixed with `val_`. + """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_epoch_end(epoch, logs) + + def on_train_batch_begin(self, batch, logs=None): + """Calls the `on_train_batch_begin` methods of its callbacks. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict, contains the return value of `model.train_step`. + Typically, the values of the `Model`'s metrics are returned. + Example: `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self._should_call_train_batch_hooks: + self._call_batch_hook(ModeKeys.TRAIN, "begin", batch, logs=logs) + + def on_train_batch_end(self, batch, logs=None): + """Calls the `on_train_batch_end` methods of its callbacks. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Aggregated metric results up until this batch. + """ + if self._should_call_train_batch_hooks: + self._call_batch_hook(ModeKeys.TRAIN, "end", batch, logs=logs) + + def on_test_batch_begin(self, batch, logs=None): + """Calls the `on_test_batch_begin` methods of its callbacks. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict, contains the return value of `model.test_step`. + Typically, the values of the `Model`'s metrics are returned. + Example: `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self._should_call_test_batch_hooks: + self._call_batch_hook(ModeKeys.TEST, "begin", batch, logs=logs) + + def on_test_batch_end(self, batch, logs=None): + """Calls the `on_test_batch_end` methods of its callbacks. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Aggregated metric results up until this batch. + """ + if self._should_call_test_batch_hooks: + self._call_batch_hook(ModeKeys.TEST, "end", batch, logs=logs) + + def on_predict_batch_begin(self, batch, logs=None): + """Calls the `on_predict_batch_begin` methods of its callbacks. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict, contains the return value of `model.predict_step`, + it typically returns a dict with a key 'outputs' containing + the model's outputs. + """ + if self._should_call_predict_batch_hooks: + self._call_batch_hook(ModeKeys.PREDICT, "begin", batch, logs=logs) + + def on_predict_batch_end(self, batch, logs=None): + """Calls the `on_predict_batch_end` methods of its callbacks. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Aggregated metric results up until this batch. + """ + if self._should_call_predict_batch_hooks: + self._call_batch_hook(ModeKeys.PREDICT, "end", batch, logs=logs) + + def on_train_begin(self, logs=None): + """Calls the `on_train_begin` methods of its callbacks. + + Args: + logs: Dict. Currently, no data is passed via this argument + for this method, but that may change in the future. + """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_train_begin(logs) + + def on_train_end(self, logs=None): + """Calls the `on_train_end` methods of its callbacks. + + Args: + logs: Dict. 
Currently, no data is passed via this argument + for this method, but that may change in the future. + """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_train_end(logs) + + def on_test_begin(self, logs=None): + """Calls the `on_test_begin` methods of its callbacks. + + Args: + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_test_begin(logs) + + def on_test_end(self, logs=None): + """Calls the `on_test_end` methods of its callbacks. + + Args: + logs: Dict. Currently, no data is passed via this argument + for this method, but that may change in the future. + """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_test_end(logs) + + def on_predict_begin(self, logs=None): + """Calls the 'on_predict_begin` methods of its callbacks. + + Args: + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_predict_begin(logs) + + def on_predict_end(self, logs=None): + """Calls the `on_predict_end` methods of its callbacks. + + Args: + logs: Dict. Currently, no data is passed via this argument + for this method, but that may change in the future. + """ + logs = self._process_logs(logs) + for callback in self.callbacks: + callback.on_predict_end(logs) + + def __iter__(self): + return iter(self.callbacks) + + def _disallow_batch_hooks_in_ps_strategy(self): + """Error out if batch-level callbacks are passed with PSStrategy.""" + + strategy = tf.distribute.get_strategy() + if strategy._should_use_with_coordinator: + unsupported_callbacks = [] + for cb in self.callbacks: + # These Callbacks can accept RemoteValues directly. + if getattr(cb, "_supports_tf_logs", False): + continue + if ( + cb._implements_train_batch_hooks() + or cb._implements_test_batch_hooks() + or cb._implements_predict_batch_hooks() + ): + unsupported_callbacks.append(cb) + if unsupported_callbacks: + raise ValueError( + "Batch-level `Callback`s are not supported with " + "`ParameterServerStrategy`. Found unsupported " + f"callbacks: {unsupported_callbacks}" + ) + + def make_logs(self, model, logs, outputs, mode, prefix=""): + """Computes logs for sending to `on_batch_end` methods.""" + if not self.callbacks: + return logs + + return make_logs(model, logs, outputs, mode, prefix=prefix) + + +@keras_export("keras.callbacks.Callback") +class Callback: + """Abstract base class used to build new callbacks. + + Callbacks can be passed to keras methods such as `fit`, `evaluate`, and + `predict` in order to hook into the various stages of the model training and + inference lifecycle. + + To create a custom callback, subclass `keras.callbacks.Callback` and + override the method associated with the stage of interest. See + https://www.tensorflow.org/guide/keras/custom_callback for more information. + + Example: + + >>> training_finished = False + >>> class MyCallback(tf.keras.callbacks.Callback): + ... def on_train_end(self, logs=None): + ... global training_finished + ... training_finished = True + >>> model = tf.keras.Sequential([ + ... tf.keras.layers.Dense(1, input_shape=(1,))]) + >>> model.compile(loss='mean_squared_error') + >>> model.fit(tf.constant([[1.0]]), tf.constant([[1.0]]), + ... 
callbacks=[MyCallback()]) + >>> assert training_finished == True + + If you want to use `Callback` objects in a custom training loop: + + 1. You should pack all your callbacks into a single `callbacks.CallbackList` + so they can all be called together. + 2. You will need to manually call all the `on_*` methods at the appropriate + locations in your loop. Like this: + + Example: + ```python + callbacks = tf.keras.callbacks.CallbackList([...]) + callbacks.append(...) + callbacks.on_train_begin(...) + for epoch in range(EPOCHS): + callbacks.on_epoch_begin(epoch) + for i, data in dataset.enumerate(): + callbacks.on_train_batch_begin(i) + batch_logs = model.train_step(data) + callbacks.on_train_batch_end(i, batch_logs) + epoch_logs = ... + callbacks.on_epoch_end(epoch, epoch_logs) + final_logs=... + callbacks.on_train_end(final_logs) + ``` - def on_test_batch_begin(self, batch, logs=None): - """Calls the `on_test_batch_begin` methods of its callbacks. + Attributes: + params: Dict. Training parameters + (eg. verbosity, batch size, number of epochs...). + model: Instance of `keras.models.Model`. + Reference of the model being trained. - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict, contains the return value of `model.test_step`. Typically, - the values of the `Model`'s metrics are returned. Example: - `{'loss': 0.2, 'accuracy': 0.7}`. + The `logs` dictionary that callback methods + take as argument will contain keys for quantities relevant to + the current batch or epoch (see method-specific docstrings). """ - if self._should_call_test_batch_hooks: - self._call_batch_hook(ModeKeys.TEST, 'begin', batch, logs=logs) - def on_test_batch_end(self, batch, logs=None): - """Calls the `on_test_batch_end` methods of its callbacks. + def __init__(self): + self.validation_data = None + self.model = None + # Whether this Callback should only run on the chief worker in a + # Multi-Worker setting. + # TODO(omalleyt): Make this attr public once solution is stable. + self._chief_worker_only = None + self._supports_tf_logs = False + + def set_params(self, params): + self.params = params + + def set_model(self, model): + self.model = model + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_batch_begin(self, batch, logs=None): + """A backwards compatibility alias for `on_train_batch_begin`.""" + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_batch_end(self, batch, logs=None): + """A backwards compatibility alias for `on_train_batch_end`.""" + + @doc_controls.for_subclass_implementers + def on_epoch_begin(self, epoch, logs=None): + """Called at the start of an epoch. + + Subclasses should override for any actions to run. This function should + only be called during TRAIN mode. + + Args: + epoch: Integer, index of epoch. + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + + @doc_controls.for_subclass_implementers + def on_epoch_end(self, epoch, logs=None): + """Called at the end of an epoch. + + Subclasses should override for any actions to run. This function should + only be called during TRAIN mode. + + Args: + epoch: Integer, index of epoch. + logs: Dict, metric results for this training epoch, and for the + validation epoch if validation is performed. Validation result + keys are prefixed with `val_`. For training epoch, the values of + the `Model`'s metrics are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. 
+ """ + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_train_batch_begin(self, batch, logs=None): + """Called at the beginning of a training batch in `fit` methods. + + Subclasses should override for any actions to run. + + Note that if the `steps_per_execution` argument to `compile` in + `tf.keras.Model` is set to `N`, this method will only be called every + `N` batches. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + # For backwards compatibility. + self.on_batch_begin(batch, logs=logs) + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_train_batch_end(self, batch, logs=None): + """Called at the end of a training batch in `fit` methods. + + Subclasses should override for any actions to run. + + Note that if the `steps_per_execution` argument to `compile` in + `tf.keras.Model` is set to `N`, this method will only be called every + `N` batches. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Aggregated metric results up until this batch. + """ + # For backwards compatibility. + self.on_batch_end(batch, logs=logs) + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_test_batch_begin(self, batch, logs=None): + """Called at the beginning of a batch in `evaluate` methods. + + Also called at the beginning of a validation batch in the `fit` + methods, if validation data is provided. + + Subclasses should override for any actions to run. + + Note that if the `steps_per_execution` argument to `compile` in + `tf.keras.Model` is set to `N`, this method will only be called every + `N` batches. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_test_batch_end(self, batch, logs=None): + """Called at the end of a batch in `evaluate` methods. + + Also called at the end of a validation batch in the `fit` + methods, if validation data is provided. + + Subclasses should override for any actions to run. + + Note that if the `steps_per_execution` argument to `compile` in + `tf.keras.Model` is set to `N`, this method will only be called every + `N` batches. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Aggregated metric results up until this batch. + """ + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_predict_batch_begin(self, batch, logs=None): + """Called at the beginning of a batch in `predict` methods. + + Subclasses should override for any actions to run. + + Note that if the `steps_per_execution` argument to `compile` in + `tf.keras.Model` is set to `N`, this method will only be called every + `N` batches. + + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + + @doc_controls.for_subclass_implementers + @generic_utils.default + def on_predict_batch_end(self, batch, logs=None): + """Called at the end of a batch in `predict` methods. + + Subclasses should override for any actions to run. + + Note that if the `steps_per_execution` argument to `compile` in + `tf.keras.Model` is set to `N`, this method will only be called every + `N` batches. 
+ + Args: + batch: Integer, index of batch within the current epoch. + logs: Dict. Aggregated metric results up until this batch. + """ + + @doc_controls.for_subclass_implementers + def on_train_begin(self, logs=None): + """Called at the beginning of training. + + Subclasses should override for any actions to run. + + Args: + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + + @doc_controls.for_subclass_implementers + def on_train_end(self, logs=None): + """Called at the end of training. - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Aggregated metric results up until this batch. - """ - if self._should_call_test_batch_hooks: - self._call_batch_hook(ModeKeys.TEST, 'end', batch, logs=logs) + Subclasses should override for any actions to run. + + Args: + logs: Dict. Currently the output of the last call to + `on_epoch_end()` is passed to this argument for this method but + that may change in the future. + """ + + @doc_controls.for_subclass_implementers + def on_test_begin(self, logs=None): + """Called at the beginning of evaluation or validation. - def on_predict_batch_begin(self, batch, logs=None): - """Calls the `on_predict_batch_begin` methods of its callbacks. + Subclasses should override for any actions to run. + + Args: + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict, contains the return value of `model.predict_step`, - it typically returns a dict with a key 'outputs' containing - the model's outputs. - """ - if self._should_call_predict_batch_hooks: - self._call_batch_hook(ModeKeys.PREDICT, 'begin', batch, logs=logs) - - def on_predict_batch_end(self, batch, logs=None): - """Calls the `on_predict_batch_end` methods of its callbacks. + @doc_controls.for_subclass_implementers + def on_test_end(self, logs=None): + """Called at the end of evaluation or validation. - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Aggregated metric results up until this batch. - """ - if self._should_call_predict_batch_hooks: - self._call_batch_hook(ModeKeys.PREDICT, 'end', batch, logs=logs) + Subclasses should override for any actions to run. + + Args: + logs: Dict. Currently the output of the last call to + `on_test_batch_end()` is passed to this argument for this method + but that may change in the future. + """ - def on_train_begin(self, logs=None): - """Calls the `on_train_begin` methods of its callbacks. + @doc_controls.for_subclass_implementers + def on_predict_begin(self, logs=None): + """Called at the beginning of prediction. - Args: - logs: Dict. Currently, no data is passed via this argument - for this method, but that may change in the future. - """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_train_begin(logs) + Subclasses should override for any actions to run. - def on_train_end(self, logs=None): - """Calls the `on_train_end` methods of its callbacks. + Args: + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. + """ + + @doc_controls.for_subclass_implementers + def on_predict_end(self, logs=None): + """Called at the end of prediction. + + Subclasses should override for any actions to run. + + Args: + logs: Dict. Currently no data is passed to this argument for this + method but that may change in the future. 
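The `_implements_*_batch_hooks` helpers defined just below use the `@generic_utils.default` markers on the base-class methods to detect whether a subclass really overrode a batch hook, so per-batch dispatch can be skipped entirely. A small illustration (these helpers are private API, shown only to explain the mechanism):

```python
import tensorflow as tf

class EpochOnly(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        pass

class PerBatch(tf.keras.callbacks.Callback):
    def on_train_batch_end(self, batch, logs=None):
        pass

# Overriding no batch hook means no per-batch callback overhead:
print(EpochOnly()._implements_train_batch_hooks())  # False
print(PerBatch()._implements_train_batch_hooks())   # True
```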
+ """ + + def _implements_train_batch_hooks(self): + """Determines if this Callback should be called for each train batch.""" + return ( + not generic_utils.is_default(self.on_batch_begin) + or not generic_utils.is_default(self.on_batch_end) + or not generic_utils.is_default(self.on_train_batch_begin) + or not generic_utils.is_default(self.on_train_batch_end) + ) + + def _implements_test_batch_hooks(self): + """Determines if this Callback should be called for each test batch.""" + return not generic_utils.is_default( + self.on_test_batch_begin + ) or not generic_utils.is_default(self.on_test_batch_end) - Args: - logs: Dict. Currently, no data is passed via this argument - for this method, but that may change in the future. - """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_train_end(logs) + def _implements_predict_batch_hooks(self): + """Determines if this Callback should be called for each predict + batch.""" + return not generic_utils.is_default( + self.on_predict_batch_begin + ) or not generic_utils.is_default(self.on_predict_batch_end) - def on_test_begin(self, logs=None): - """Calls the `on_test_begin` methods of its callbacks. - Args: - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. - """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_test_begin(logs) +@keras_export("keras.callbacks.BaseLogger") +class BaseLogger(Callback): + """Callback that accumulates epoch averages of metrics. - def on_test_end(self, logs=None): - """Calls the `on_test_end` methods of its callbacks. + This callback is automatically applied to every Keras model. Args: - logs: Dict. Currently, no data is passed via this argument - for this method, but that may change in the future. + stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over an epoch. + Metrics in this list will be logged as-is in `on_epoch_end`. + All others will be averaged in `on_epoch_end`. """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_test_end(logs) - def on_predict_begin(self, logs=None): - """Calls the 'on_predict_begin` methods of its callbacks. + def __init__(self, stateful_metrics=None): + super().__init__() + self.stateful_metrics = set(stateful_metrics or []) + + def on_epoch_begin(self, epoch, logs=None): + self.seen = 0 + self.totals = {} + + def on_batch_end(self, batch, logs=None): + logs = logs or {} + batch_size = logs.get("size", 0) + # In case of distribution strategy we can potentially run multiple steps + # at the same time, we should account for that in the `seen` + # calculation. + num_steps = logs.get("num_steps", 1) + self.seen += batch_size * num_steps + + for k, v in logs.items(): + if k in self.stateful_metrics: + self.totals[k] = v + else: + if k in self.totals: + self.totals[k] += v * batch_size + else: + self.totals[k] = v * batch_size + + def on_epoch_end(self, epoch, logs=None): + if logs is not None: + for k in self.params["metrics"]: + if k in self.totals: + # Make value available to next callbacks. 
+ if k in self.stateful_metrics: + logs[k] = self.totals[k] + else: + logs[k] = self.totals[k] / self.seen + + +@keras_export("keras.callbacks.TerminateOnNaN") +class TerminateOnNaN(Callback): + """Callback that terminates training when a NaN loss is encountered.""" + + def __init__(self): + super().__init__() + self._supports_tf_logs = True + + def on_batch_end(self, batch, logs=None): + logs = logs or {} + loss = logs.get("loss") + if loss is not None: + loss = tf_utils.sync_to_numpy_or_python_type(loss) + if np.isnan(loss) or np.isinf(loss): + io_utils.print_msg( + f"Batch {batch}: Invalid loss, terminating training" + ) + self.model.stop_training = True - Args: - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. - """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_predict_begin(logs) - def on_predict_end(self, logs=None): - """Calls the `on_predict_end` methods of its callbacks. +@keras_export("keras.callbacks.ProgbarLogger") +class ProgbarLogger(Callback): + """Callback that prints metrics to stdout. Args: - logs: Dict. Currently, no data is passed via this argument - for this method, but that may change in the future. - """ - logs = self._process_logs(logs) - for callback in self.callbacks: - callback.on_predict_end(logs) - - def __iter__(self): - return iter(self.callbacks) - - def _disallow_batch_hooks_in_ps_strategy(self): - """Error out if batch-level callbacks are passed with PSStrategy.""" - # pylint: disable=protected-access - strategy = tf.distribute.get_strategy() - if strategy._should_use_with_coordinator: - unsupported_callbacks = [] - for cb in self.callbacks: - # These Callbacks can accept RemoteValues directly. - if getattr(cb, '_supports_tf_logs', False): - continue - if (cb._implements_train_batch_hooks() or - cb._implements_test_batch_hooks() or - cb._implements_predict_batch_hooks()): - unsupported_callbacks.append(cb) - if unsupported_callbacks: - raise ValueError( - 'Batch-level `Callback`s are not supported with ' - '`ParameterServerStrategy`. Found unsupported ' - f'callbacks: {unsupported_callbacks}') - # pylint: enable=protected-access - - -@keras_export('keras.callbacks.Callback') -class Callback: - """Abstract base class used to build new callbacks. - - Callbacks can be passed to keras methods such as `fit`, `evaluate`, and - `predict` in order to hook into the various stages of the model training and - inference lifecycle. - - To create a custom callback, subclass `keras.callbacks.Callback` and override - the method associated with the stage of interest. See - https://www.tensorflow.org/guide/keras/custom_callback for more information. - - Example: - - >>> training_finished = False - >>> class MyCallback(tf.keras.callbacks.Callback): - ... def on_train_end(self, logs=None): - ... global training_finished - ... training_finished = True - >>> model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))]) - >>> model.compile(loss='mean_squared_error') - >>> model.fit(tf.constant([[1.0]]), tf.constant([[1.0]]), - ... callbacks=[MyCallback()]) - >>> assert training_finished == True - - If you want to use `Callback` objects in a custom training loop: - - 1. You should pack all your callbacks into a single `callbacks.CallbackList` - so they can all be called together. - 2. You will need to manually call all the `on_*` methods at the appropriate - locations in your loop. 
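A quick, hedged demonstration of `TerminateOnNaN` above: the oversized targets are chosen only so the float32 MSE overflows to `inf` on the first batch; the exact number of completed epochs may vary, but it should be fewer than requested.

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer="sgd", loss="mse")

# (1e30)**2 overflows float32 -> loss becomes inf -> training stops.
history = model.fit(
    np.array([[1.0], [2.0]]), np.array([[1e30], [-1e30]]),
    epochs=5, batch_size=1, verbose=0,
    callbacks=[tf.keras.callbacks.TerminateOnNaN()],
)
print(len(history.history["loss"]))  # fewer than the 5 requested epochs
```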
Like this: - - ``` - callbacks = tf.keras.callbacks.CallbackList([...]) - callbacks.append(...) - - callbacks.on_train_begin(...) - for epoch in range(EPOCHS): - callbacks.on_epoch_begin(epoch) - for i, data in dataset.enumerate(): - callbacks.on_train_batch_begin(i) - batch_logs = model.train_step(data) - callbacks.on_train_batch_end(i, batch_logs) - epoch_logs = ... - callbacks.on_epoch_end(epoch, epoch_logs) - final_logs=... - callbacks.on_train_end(final_logs) - ``` - - Attributes: - params: Dict. Training parameters - (eg. verbosity, batch size, number of epochs...). - model: Instance of `keras.models.Model`. - Reference of the model being trained. - - The `logs` dictionary that callback methods - take as argument will contain keys for quantities relevant to - the current batch or epoch (see method-specific docstrings). - """ - - def __init__(self): - self.validation_data = None # pylint: disable=g-missing-from-attributes - self.model = None - # Whether this Callback should only run on the chief worker in a - # Multi-Worker setting. - # TODO(omalleyt): Make this attr public once solution is stable. - self._chief_worker_only = None - self._supports_tf_logs = False - - def set_params(self, params): - self.params = params - - def set_model(self, model): - self.model = model - - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_batch_begin(self, batch, logs=None): - """A backwards compatibility alias for `on_train_batch_begin`.""" - - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_batch_end(self, batch, logs=None): - """A backwards compatibility alias for `on_train_batch_end`.""" - - @doc_controls.for_subclass_implementers - def on_epoch_begin(self, epoch, logs=None): - """Called at the start of an epoch. - - Subclasses should override for any actions to run. This function should only - be called during TRAIN mode. + count_mode: One of `"steps"` or `"samples"`. + Whether the progress bar should + count samples seen or steps (batches) seen. + stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over an epoch. + Metrics in this list will be logged as-is. + All others will be averaged over time (e.g. loss, etc). + If not provided, defaults to the `Model`'s metrics. - Args: - epoch: Integer, index of epoch. - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. + Raises: + ValueError: In case of invalid `count_mode`. """ - @doc_controls.for_subclass_implementers - def on_epoch_end(self, epoch, logs=None): - """Called at the end of an epoch. + def __init__(self, count_mode: str = "samples", stateful_metrics=None): + super().__init__() + self._supports_tf_logs = True + if count_mode == "samples": + self.use_steps = False + elif count_mode == "steps": + self.use_steps = True + else: + raise ValueError( + f"Unknown `count_mode`: {count_mode}. " + 'Expected values are ["samples", "steps"]' + ) + # Defaults to all Model's metrics except for loss. + self.stateful_metrics = ( + set(stateful_metrics) if stateful_metrics else set() + ) + + self.seen = 0 + self.progbar = None + self.target = None + self.verbose = 1 + self.epochs = 1 + + self._train_step, self._test_step, self._predict_step = None, None, None + self._call_batch_hooks = True - Subclasses should override for any actions to run. This function should only - be called during TRAIN mode. + self._called_in_fit = False - Args: - epoch: Integer, index of epoch. 
- logs: Dict, metric results for this training epoch, and for the - validation epoch if validation is performed. Validation result keys - are prefixed with `val_`. For training epoch, the values of the - `Model`'s metrics are returned. Example : `{'loss': 0.2, 'accuracy': - 0.7}`. - """ + def set_params(self, params): + self.verbose = params["verbose"] + self.epochs = params["epochs"] + if self.use_steps and "steps" in params: + self.target = params["steps"] + elif not self.use_steps and "samples" in params: + self.target = params["samples"] + else: + self.target = ( + None # Will be inferred at the end of the first epoch. + ) + + self._call_batch_hooks = self.verbose == 1 + if self.target is None: + try: + self._train_step = self.model._train_counter + self._test_step = self.model._test_counter + self._predict_step = self.model._predict_counter + except AttributeError: + self._call_batch_hooks = True + + def on_train_begin(self, logs=None): + # When this logger is called inside `fit`, validation is silent. + self._called_in_fit = True + + def on_test_begin(self, logs=None): + if not self._called_in_fit: + self._reset_progbar() + self._maybe_init_progbar() + + def on_predict_begin(self, logs=None): + self._reset_progbar() + self._maybe_init_progbar() + + def on_epoch_begin(self, epoch, logs=None): + self._reset_progbar() + self._maybe_init_progbar() + if self.verbose and self.epochs > 1: + io_utils.print_msg(f"Epoch {epoch + 1}/{self.epochs}") + + def on_train_batch_end(self, batch, logs=None): + self._batch_update_progbar(batch, logs) + + def on_test_batch_end(self, batch, logs=None): + if not self._called_in_fit: + self._batch_update_progbar(batch, logs) + + def on_predict_batch_end(self, batch, logs=None): + # Don't pass prediction results. + self._batch_update_progbar(batch, None) + + def on_epoch_end(self, epoch, logs=None): + self._finalize_progbar(logs, self._train_step) + + def on_test_end(self, logs=None): + if not self._called_in_fit: + self._finalize_progbar(logs, self._test_step) + + def on_predict_end(self, logs=None): + self._finalize_progbar(logs, self._predict_step) + + def _reset_progbar(self): + self.seen = 0 + self.progbar = None + + def _maybe_init_progbar(self): + """Instantiate a `Progbar` if not yet, and update the stateful + metrics.""" + # TODO(rchao): Legacy TF1 code path may use list for + # `self.stateful_metrics`. Remove "cast to set" when TF1 support is + # dropped. + self.stateful_metrics = set(self.stateful_metrics) + + if self.model: + # Update the existing stateful metrics as `self.model.metrics` may + # contain updated metrics after `MetricsContainer` is built in the + # first train step. + self.stateful_metrics = self.stateful_metrics.union( + set(m.name for m in self.model.metrics) + ) + + if self.progbar is None: + self.progbar = Progbar( + target=self.target, + verbose=self.verbose, + stateful_metrics=self.stateful_metrics, + unit_name="step" if self.use_steps else "sample", + ) + + self.progbar._update_stateful_metrics(self.stateful_metrics) + + def _implements_train_batch_hooks(self): + return self._call_batch_hooks + + def _implements_test_batch_hooks(self): + return self._call_batch_hooks + + def _implements_predict_batch_hooks(self): + return self._call_batch_hooks + + def _batch_update_progbar(self, batch, logs=None): + """Updates the progbar.""" + logs = logs or {} + self._maybe_init_progbar() + if self.use_steps: + self.seen = batch + 1 # One-indexed. + else: + # v1 path only. 
+ logs = copy.copy(logs) + batch_size = logs.pop("size", 0) + num_steps = logs.pop("num_steps", 1) + logs.pop("batch", None) + add_seen = num_steps * batch_size + self.seen += add_seen + + if self.verbose == 1: + # Only block async when verbose = 1. + logs = tf_utils.sync_to_numpy_or_python_type(logs) + self.progbar.update(self.seen, list(logs.items()), finalize=False) + + def _finalize_progbar(self, logs, counter): + logs = tf_utils.sync_to_numpy_or_python_type(logs or {}) + if self.target is None: + if counter is not None: + counter = counter.numpy() + if not self.use_steps: + counter *= logs.get("size", 1) + self.target = counter or self.seen + self.progbar.target = self.target + self.progbar.update(self.target, list(logs.items()), finalize=True) + + +@keras_export("keras.callbacks.History") +class History(Callback): + """Callback that records events into a `History` object. - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_train_batch_begin(self, batch, logs=None): - """Called at the beginning of a training batch in `fit` methods. + This callback is automatically applied to + every Keras model. The `History` object + gets returned by the `fit` method of models. - Subclasses should override for any actions to run. + Example: - Note that if the `steps_per_execution` argument to `compile` in - `tf.keras.Model` is set to `N`, this method will only be called every `N` - batches. + >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') + >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), + ... epochs=10, verbose=1) + >>> print(history.params) + {'verbose': 1, 'epochs': 10, 'steps': 1} + >>> # check the keys of history object + >>> print(history.history.keys()) + dict_keys(['loss']) - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. """ - # For backwards compatibility. - self.on_batch_begin(batch, logs=logs) - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_train_batch_end(self, batch, logs=None): - """Called at the end of a training batch in `fit` methods. + def __init__(self): + super().__init__() + self.history = {} - Subclasses should override for any actions to run. + def on_train_begin(self, logs=None): + self.epoch = [] - Note that if the `steps_per_execution` argument to `compile` in - `tf.keras.Model` is set to `N`, this method will only be called every `N` - batches. + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + self.epoch.append(epoch) + for k, v in logs.items(): + self.history.setdefault(k, []).append(v) - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Aggregated metric results up until this batch. - """ - # For backwards compatibility. - self.on_batch_end(batch, logs=logs) + # Set the history attribute on the model after the epoch ends. This will + # make sure that the state which is set is the latest one. + self.model.history = self - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_test_batch_begin(self, batch, logs=None): - """Called at the beginning of a batch in `evaluate` methods. - Also called at the beginning of a validation batch in the `fit` - methods, if validation data is provided. - - Subclasses should override for any actions to run. 
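Since `History` above is what `fit` returns, the recorded per-epoch series can be inspected directly after training. A minimal sketch with illustrative data:

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
model.compile(optimizer="sgd", loss="mse", metrics=["mae"])
history = model.fit(np.random.rand(16, 3), np.random.rand(16, 1),
                    epochs=3, verbose=0)

print(sorted(history.history.keys()))  # ['loss', 'mae'] for this compile
print(len(history.history["loss"]))    # 3: one entry per epoch
print(history.epoch)                   # [0, 1, 2]
```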
+@keras_export("keras.callbacks.ModelCheckpoint") +class ModelCheckpoint(Callback): + """Callback to save the Keras model or model weights at some frequency. - Note that if the `steps_per_execution` argument to `compile` in - `tf.keras.Model` is set to `N`, this method will only be called every `N` - batches. + `ModelCheckpoint` callback is used in conjunction with training using + `model.fit()` to save a model or weights (in a checkpoint file) at some + interval, so the model or weights can be loaded later to continue the + training from the state saved. - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. - """ + A few options this callback provides include: - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_test_batch_end(self, batch, logs=None): - """Called at the end of a batch in `evaluate` methods. + - Whether to only keep the model that has achieved the "best performance" so + far, or whether to save the model at the end of every epoch regardless of + performance. + - Definition of 'best'; which quantity to monitor and whether it should be + maximized or minimized. + - The frequency it should save at. Currently, the callback supports saving + at the end of every epoch, or after a fixed number of training batches. + - Whether only weights are saved, or the whole model is saved. - Also called at the end of a validation batch in the `fit` - methods, if validation data is provided. + Note: If you get `WARNING:tensorflow:Can save best model only with + available, skipping` see the description of the `monitor` argument for + details on how to get this right. - Subclasses should override for any actions to run. + Example: - Note that if the `steps_per_execution` argument to `compile` in - `tf.keras.Model` is set to `N`, this method will only be called every `N` - batches. + ```python + model.compile(loss=..., optimizer=..., + metrics=['accuracy']) + + EPOCHS = 10 + checkpoint_filepath = '/tmp/checkpoint' + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=checkpoint_filepath, + save_weights_only=True, + monitor='val_accuracy', + mode='max', + save_best_only=True) + + # Model weights are saved at the end of every epoch, if it's the best seen + # so far. + model.fit(epochs=EPOCHS, callbacks=[model_checkpoint_callback]) + + # The model weights (that are considered the best) are loaded into the + # model. + model.load_weights(checkpoint_filepath) + ``` Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Aggregated metric results up until this batch. + filepath: string or `PathLike`, path to save the model file. e.g. + filepath = os.path.join(working_dir, 'ckpt', file_name). `filepath` + can contain named formatting options, which will be filled the value + of `epoch` and keys in `logs` (passed in `on_epoch_end`). For example: + if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`, then the + model checkpoints will be saved with the epoch number and the + validation loss in the filename. The directory of the filepath should + not be reused by any other callbacks to avoid conflicts. + monitor: The metric name to monitor. Typically the metrics are set by + the `Model.compile` method. Note: + + * Prefix the name with `"val_`" to monitor validation metrics. + * Use `"loss"` or "`val_loss`" to monitor the model's total loss. 
+ * If you specify metrics as strings, like `"accuracy"`, pass the same + string (with or without the `"val_"` prefix). + * If you pass `metrics.Metric` objects, `monitor` should be set to + `metric.name`. + * If you're not sure about the metric names, you can check the contents + of the `history.history` dictionary returned by + `history = model.fit()` + * Multi-output models set additional prefixes on the metric names. + + verbose: Verbosity mode, 0 or 1. Mode 0 is silent, and mode 1 + displays messages when the callback takes an action. + save_best_only: if `save_best_only=True`, it only saves when the model + is considered the "best", and the latest best model according to the + quantity monitored will not be overwritten. If `filepath` doesn't + contain formatting options like `{epoch}` then `filepath` will be + overwritten by each new better model. + mode: one of {'auto', 'min', 'max'}. If `save_best_only=True`, the + decision to overwrite the current save file is made based on either + the maximization or the minimization of the monitored quantity. + For `val_acc`, this should be `max`, for `val_loss` this should be + `min`, etc. In `auto` mode, the mode is set to `max` if the monitored + quantity contains 'acc' or starts with 'fmeasure', and is set to `min` + for the rest of the quantities. + save_weights_only: if True, then only the model's weights will be saved + (`model.save_weights(filepath)`), else the full model is saved + (`model.save(filepath)`). + save_freq: `'epoch'` or integer. When using `'epoch'`, the callback + saves the model after each epoch. When using an integer, the callback + saves the model at the end of this many batches. If the `Model` is + compiled with `steps_per_execution=N`, then the saving criteria will + be checked every Nth batch. Note that if the saving isn't aligned to + epochs, the monitored metric may potentially be less reliable (it + could reflect as little as 1 batch, since the metrics get reset every + epoch). Defaults to `'epoch'`. + options: Optional `tf.train.CheckpointOptions` object if + `save_weights_only` is true or optional `tf.saved_model.SaveOptions` + object if `save_weights_only` is false. + initial_value_threshold: Floating point initial "best" value of the + metric to be monitored. Only applies if `save_best_only=True`. Only + overwrites the model weights already saved if the performance of the + current model is better than this value. + **kwargs: Additional arguments for backwards compatibility. Possible key + is `period`. """ - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_predict_batch_begin(self, batch, logs=None): - """Called at the beginning of a batch in `predict` methods. - - Subclasses should override for any actions to run. - - Note that if the `steps_per_execution` argument to `compile` in - `tf.keras.Model` is set to `N`, this method will only be called every `N` - batches. - - Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future.
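A configuration sketch for the arguments described above (paths are illustrative, and the commented `fit` call assumes a compiled `model` with validation data):

```python
import tensorflow as tf

# `{epoch}` and any `logs` key (here `val_loss`) are filled in at save time.
best_only_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="/tmp/ckpt/weights.{epoch:02d}-{val_loss:.2f}.hdf5",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# An integer `save_freq` saves every N batches instead; with
# `steps_per_execution=M`, the condition is checked every M batches.
every_100_batches_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="/tmp/ckpt/batch-{epoch:02d}.hdf5",
    save_freq=100,
    save_weights_only=True,
)

# model.fit(x, y, validation_data=(x_val, y_val),
#           callbacks=[best_only_cb])
```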
- """ + def __init__( + self, + filepath, + monitor: str = "val_loss", + verbose: int = 0, + save_best_only: bool = False, + save_weights_only: bool = False, + mode: str = "auto", + save_freq="epoch", + options=None, + initial_value_threshold=None, + **kwargs, + ): + super().__init__() + self._supports_tf_logs = True + self.monitor = monitor + self.verbose = verbose + self.filepath = io_utils.path_to_string(filepath) + self.save_best_only = save_best_only + self.save_weights_only = save_weights_only + self.save_freq = save_freq + self.epochs_since_last_save = 0 + self._batches_seen_since_last_saving = 0 + self._last_batch_seen = -1 + self.best = initial_value_threshold + + if save_weights_only: + if options is None or isinstance( + options, tf.train.CheckpointOptions + ): + self._options = options or tf.train.CheckpointOptions() + else: + raise TypeError( + "If save_weights_only is True, then `options` must be " + "either None or a tf.train.CheckpointOptions. " + f"Got {options}." + ) + else: + if filepath and filepath.endswith(".keras") and options is not None: + raise ValueError( + "The native Keras format does not support " + "the `options` argument. Please remove " + "the `options` argument, or use the SavedModel " + "format by removing the `.keras` extension from " + "the model filepath." + ) + if options is None or isinstance( + options, tf.saved_model.SaveOptions + ): + self._options = options or tf.saved_model.SaveOptions() + else: + raise TypeError( + "If save_weights_only is False, then `options` must be " + "either None or a tf.saved_model.SaveOptions. " + f"Got {options}." + ) + + # Deprecated field `load_weights_on_restart` is for loading the + # checkpoint file from `filepath` at the start of `model.fit()` + # TODO(rchao): Remove the arg during next breaking release. + if "load_weights_on_restart" in kwargs: + self.load_weights_on_restart = kwargs["load_weights_on_restart"] + logging.warning( + "`load_weights_on_restart` argument is deprecated. " + "Please use `model.load_weights()` for loading weights " + "before the start of `model.fit()`." + ) + else: + self.load_weights_on_restart = False + + # Deprecated field `period` is for the number of epochs between which + # the model is saved. + if "period" in kwargs: + self.period = kwargs["period"] + logging.warning( + "`period` argument is deprecated. Please use `save_freq` " + "to specify the frequency in number of batches seen." + ) + else: + self.period = 1 + + if mode not in ["auto", "min", "max"]: + logging.warning( + "ModelCheckpoint mode %s is unknown, fallback to auto mode.", + mode, + ) + mode = "auto" + + if mode == "min": + self.monitor_op = np.less + if self.best is None: + self.best = np.Inf + elif mode == "max": + self.monitor_op = np.greater + if self.best is None: + self.best = -np.Inf + else: + if "acc" in self.monitor or self.monitor.startswith("fmeasure"): + self.monitor_op = np.greater + if self.best is None: + self.best = -np.Inf + else: + self.monitor_op = np.less + if self.best is None: + self.best = np.Inf + + if self.save_freq != "epoch" and not isinstance(self.save_freq, int): + raise ValueError( + f"Unrecognized save_freq: {self.save_freq}. " + 'Expected save_freq are "epoch" or integer' + ) + + # Only the chief worker writes model checkpoints, but all workers + # restore checkpoint at on_train_begin(). 
+ self._chief_worker_only = False + + def on_train_begin(self, logs=None): + if self.load_weights_on_restart: + filepath_to_load = ( + self._get_most_recently_modified_file_matching_pattern( + self.filepath + ) + ) + if filepath_to_load is not None and self._checkpoint_exists( + filepath_to_load + ): + try: + # `filepath` may contain placeholders such as `{epoch:02d}`, + # and thus it attempts to load the most recently modified + # file with file name matching the pattern. + self.model.load_weights(filepath_to_load) + except (IOError, ValueError) as e: + raise ValueError( + f"Error loading file from {filepath_to_load}. " + f"Reason: {e}" + ) + + def _implements_train_batch_hooks(self): + # Only call batch hooks when saving on batch + return self.save_freq != "epoch" + + def on_train_batch_end(self, batch, logs=None): + if self._should_save_on_batch(batch): + self._save_model(epoch=self._current_epoch, batch=batch, logs=logs) + + def on_epoch_begin(self, epoch, logs=None): + self._current_epoch = epoch + + def on_epoch_end(self, epoch, logs=None): + self.epochs_since_last_save += 1 + + if self.save_freq == "epoch": + self._save_model(epoch=epoch, batch=None, logs=logs) + + def _should_save_on_batch(self, batch): + """Handles batch-level saving logic, supports steps_per_execution.""" + if self.save_freq == "epoch": + return False + + if batch <= self._last_batch_seen: # New epoch. + add_batches = batch + 1 # batches are zero-indexed. + else: + add_batches = batch - self._last_batch_seen + self._batches_seen_since_last_saving += add_batches + self._last_batch_seen = batch + + if self._batches_seen_since_last_saving >= self.save_freq: + self._batches_seen_since_last_saving = 0 + return True + return False + + def _save_model(self, epoch, batch, logs): + """Saves the model. + + Args: + epoch: the epoch this iteration is in. + batch: the batch this iteration is in. `None` if the `save_freq` + is set to `epoch`. + logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`. + """ + logs = logs or {} + + if ( + isinstance(self.save_freq, int) + or self.epochs_since_last_save >= self.period + ): + # Block only when saving interval is reached. 
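A plain-Python trace of the batch-counting logic in `_should_save_on_batch` above, under assumed values `save_freq=5` and `steps_per_execution=3` (so the hook only sees batch indices 2, 5, 8, ...):

```python
save_freq, last_batch_seen, batches_since_saving = 5, -1, 0

for batch in [2, 5, 8, 11, 14]:
    if batch <= last_batch_seen:       # index went backwards: new epoch
        add = batch + 1                # batch indices are zero-based
    else:
        add = batch - last_batch_seen  # covers batches run in between
    batches_since_saving += add
    last_batch_seen = batch
    if batches_since_saving >= save_freq:
        batches_since_saving = 0
        print(f"save at batch {batch}")  # fires at batches 5 and 11
```

Counting the gap between seen indices, rather than the number of hook calls, is what keeps an integer `save_freq` honest when several batches run per execution.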
+ logs = tf_utils.sync_to_numpy_or_python_type(logs) + self.epochs_since_last_save = 0 + filepath = self._get_file_path(epoch, batch, logs) + + dirname = os.path.dirname(filepath) + if ( + dirname + and not dirname.startswith("gs://") + and not tf.io.gfile.exists(dirname) + ): + tf.io.gfile.makedirs(dirname) + + try: + if self.save_best_only: + current = logs.get(self.monitor) + if current is None: + logging.warning( + "Can save best model only with %s available, " + "skipping.", + self.monitor, + ) + else: + if self.monitor_op(current, self.best): + if self.verbose > 0: + io_utils.print_msg( + f"\nEpoch {epoch + 1}: {self.monitor} " + "improved " + f"from {self.best:.5f} to {current:.5f}, " + f"saving model to {filepath}" + ) + self.best = current + if self.save_weights_only: + self.model.save_weights( + filepath, + overwrite=True, + options=self._options, + ) + else: + self.model.save( + filepath, + overwrite=True, + options=self._options, + ) + else: + if self.verbose > 0: + io_utils.print_msg( + f"\nEpoch {epoch + 1}: " + f"{self.monitor} did not improve " + f"from {self.best:.5f}" + ) + else: + if self.verbose > 0: + io_utils.print_msg( + f"\nEpoch {epoch + 1}: saving model to {filepath}" + ) + if self.save_weights_only: + self.model.save_weights( + filepath, overwrite=True, options=self._options + ) + elif filepath.endswith(".keras"): + self.model.save(filepath, overwrite=True) + else: + self.model.save( + filepath, overwrite=True, options=self._options + ) + + self._maybe_remove_file() + except IsADirectoryError: # h5py 3.x + raise IOError( + "Please specify a non-directory filepath for " + "ModelCheckpoint. Filepath used is an existing " + f"directory: {filepath}" + ) + except IOError as e: # h5py 2.x + # `e.errno` appears to be `None` so checking the content of + # `e.args[0]`. + if "is a directory" in str(e.args[0]).lower(): + raise IOError( + "Please specify a non-directory filepath for " + "ModelCheckpoint. Filepath used is an existing " + f"directory: {filepath}" + ) + # Re-throw the error for any other causes. + raise e + + def _get_file_path(self, epoch, batch, logs): + """Returns the file path for checkpoint.""" - @doc_controls.for_subclass_implementers - @generic_utils.default - def on_predict_batch_end(self, batch, logs=None): - """Called at the end of a batch in `predict` methods. + try: + # `filepath` may contain placeholders such as + # `{epoch:02d}`, `{batch:02d}` and `{mape:.2f}`. A mismatch between + # logged metrics and the path's placeholders can cause formatting to + # fail. + if batch is None or "batch" in logs: + file_path = self.filepath.format(epoch=epoch + 1, **logs) + else: + file_path = self.filepath.format( + epoch=epoch + 1, batch=batch + 1, **logs + ) + except KeyError as e: + raise KeyError( + f'Failed to format this callback filepath: "{self.filepath}". ' + f"Reason: {e}" + ) + self._write_filepath = distributed_file_utils.write_filepath( + file_path, self.model.distribute_strategy + ) + return self._write_filepath + + def _maybe_remove_file(self): + # Remove the checkpoint directory in multi-worker training where this + # worker should not checkpoint. It is a dummy directory previously saved + # for sync distributed training.
+ distributed_file_utils.remove_temp_dir_with_filepath( + self._write_filepath, self.model.distribute_strategy + ) + + def _checkpoint_exists(self, filepath): + """Returns whether the checkpoint that `filepath` refers to exists.""" + if filepath.endswith(".h5"): + return tf.io.gfile.exists(filepath) + tf_saved_model_exists = tf.io.gfile.exists(filepath) + tf_weights_only_checkpoint_exists = tf.io.gfile.exists( + filepath + ".index" + ) + return tf_saved_model_exists or tf_weights_only_checkpoint_exists + + def _get_most_recently_modified_file_matching_pattern(self, pattern): + """Returns the most recently modified filepath matching pattern. + + The pattern may contain python formatting placeholders. If + `tf.train.latest_checkpoint()` does not return None, use that; + otherwise, check for the most recently modified one that matches the + pattern. + + In the rare case where more than one pattern-matching file has + the same most recent modified time, return the + filepath that is largest (by `>` operator, lexicographically using the + numeric equivalents). This provides a tie-breaker when multiple files + are most recent. Note that a larger `filepath` can sometimes indicate a + later time of modification (for instance, when epoch/batch is used as a + formatting option), but not necessarily (when accuracy or loss is used). + The tie-breaker is a best effort to return the most + recent file, and to avoid nondeterministic results. + + The modified time of a file is obtained with `os.path.getmtime()`. + + This utility function is best demonstrated via an example: + + ```python + file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5' + test_dir = self.get_temp_dir() + path_pattern = os.path.join(test_dir, file_pattern) + file_paths = [ + os.path.join(test_dir, file_name) for file_name in + ['f.batch03epoch02.h5', + 'f.batch02epoch02.h5', 'f.batch01epoch01.h5'] + ] + for file_path in file_paths: + # Write something to each of the files + self.assertEqual( + _get_most_recently_modified_file_matching_pattern(path_pattern), + file_paths[-1]) + ``` + + Args: + pattern: The file pattern that may optionally contain python + placeholders such as `{epoch:02d}`. + + Returns: + The most recently modified file's full filepath matching `pattern`. + If `pattern` does not contain any placeholder, this returns the + filepath that exactly matches `pattern`. Returns `None` if no match + is found. + """ + dir_name = os.path.dirname(pattern) + base_name = os.path.basename(pattern) + base_name_regex = "^" + re.sub(r"{.*}", r".*", base_name) + "$" + + # If tf.train.latest_checkpoint tells us there exists a latest + # checkpoint, use that as it is more robust than `os.path.getmtime()`. + latest_tf_checkpoint = tf.train.latest_checkpoint(dir_name) + if latest_tf_checkpoint is not None and re.match( + base_name_regex, os.path.basename(latest_tf_checkpoint) + ): + return latest_tf_checkpoint + + latest_mod_time = 0 + file_path_with_latest_mod_time = None + n_file_with_latest_mod_time = 0 + file_path_with_largest_file_name = None + + if tf.io.gfile.exists(dir_name): + for file_name in os.listdir(dir_name): + # Only consider if `file_name` matches the pattern.
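The placeholder-to-regex conversion used above is easy to check standalone; the pattern and file names below are hypothetical:

```python
import os
import re

pattern = "/tmp/ckpt/f.batch{batch:02d}epoch{epoch:02d}.h5"
base_name = os.path.basename(pattern)
# Every `{...}` placeholder collapses to `.*` (greedy, as in the helper).
base_name_regex = "^" + re.sub(r"{.*}", r".*", base_name) + "$"
print(base_name_regex)  # ^f.batch.*.h5$

for candidate in ["f.batch03epoch02.h5", "other.h5"]:
    print(candidate, bool(re.match(base_name_regex, candidate)))
# f.batch03epoch02.h5 True
# other.h5 False
```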
+ if re.match(base_name_regex, file_name): + file_path = os.path.join(dir_name, file_name) + mod_time = os.path.getmtime(file_path) + if ( + file_path_with_largest_file_name is None + or file_path > file_path_with_largest_file_name + ): + file_path_with_largest_file_name = file_path + if mod_time > latest_mod_time: + latest_mod_time = mod_time + file_path_with_latest_mod_time = file_path + # In the case a file with later modified time is found, + # reset the counter for the number of files with latest + # modified time. + n_file_with_latest_mod_time = 1 + elif mod_time == latest_mod_time: + # In the case a file has modified time tied with the + # most recent, increment the counter for the number of + # files with latest modified time by 1. + n_file_with_latest_mod_time += 1 + + if n_file_with_latest_mod_time == 1: + # Return the sole file that has most recent modified time. + return file_path_with_latest_mod_time + else: + # If there are more than one file having latest modified time, + # return the file path with the largest file name. + return file_path_with_largest_file_name - Subclasses should override for any actions to run. - Note that if the `steps_per_execution` argument to `compile` in - `tf.keras.Model` is set to `N`, this method will only be called every `N` - batches. +@keras_export("keras.callbacks.BackupAndRestore", v1=[]) +class BackupAndRestore(Callback): + """Callback to back up and restore the training state. + + `BackupAndRestore` callback is intended to recover training from an + interruption that has happened in the middle of a `Model.fit` execution, by + backing up the training states in a temporary checkpoint file (with the help + of a `tf.train.CheckpointManager`), at the end of each epoch. Each backup + overwrites the previously written checkpoint file, so at any given time + there is at most one such checkpoint file for backup/restoring purpose. + + If training restarts before completion, the training state (which includes + the `Model` weights and epoch number) is restored to the most recently saved + state at the beginning of a new `Model.fit` run. At the completion of a + `Model.fit` run, the temporary checkpoint file is deleted. + + Note that the user is responsible to bring jobs back after the interruption. + This callback is important for the backup and restore mechanism for fault + tolerance purpose, and the model to be restored from a previous checkpoint + is expected to be the same as the one used to back up. If user changes + arguments passed to compile or fit, the checkpoint saved for fault tolerance + can become invalid. + + Note: + + 1. This callback is not compatible with eager execution disabled. + 2. A checkpoint is saved at the end of each epoch. After restoring, + `Model.fit` redoes any partial work during the unfinished epoch in which the + training got restarted (so the work done before the interruption doesn't + affect the final model state). + 3. This works for both single worker and multi-worker modes. When + `Model.fit` is used with `tf.distribute`, it supports + `tf.distribute.MirroredStrategy`, + `tf.distribute.MultiWorkerMirroredStrategy`, `tf.distribute.TPUStrategy`, + and `tf.distribute.experimental.ParameterServerStrategy`. + + Example: + + >>> class InterruptingCallback(tf.keras.callbacks.Callback): + ... def on_epoch_begin(self, epoch, logs=None): + ... if epoch == 4: + ... 
raise RuntimeError('Interrupting!') + >>> callback = tf.keras.callbacks.BackupAndRestore(backup_dir="/tmp/backup") + >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') + >>> try: + ... model.fit(np.arange(100).reshape(5, 20), np.zeros(5), epochs=10, + ... batch_size=1, callbacks=[callback, InterruptingCallback()], + ... verbose=0) + ... except: + ... pass + >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), + ... epochs=10, batch_size=1, callbacks=[callback], + ... verbose=0) + >>> # Only 6 more epochs are run, since first training got interrupted at + >>> # zero-indexed epoch 4, second training will continue from 4 to 9. + >>> len(history.history['loss']) + 6 + + Besides the option to save at the end of every epoch or every N steps, if + you are doing distributed training with + `tf.distribute.MultiWorkerMirroredStrategy` on Google Cloud Platform or + Google Borg, you can also use the `save_before_preemption` argument + to enable saving a checkpoint right before a worker gets preempted + by other jobs and training gets interrupted. See + `tf.distribute.experimental.PreemptionCheckpointHandler` for more details. Args: - batch: Integer, index of batch within the current epoch. - logs: Dict. Aggregated metric results up until this batch. + backup_dir: String, path to store the checkpoint. + e.g. `backup_dir = os.path.join(working_dir, 'backup')`. + This is the directory in which the system stores temporary files to + recover the model from jobs terminated unexpectedly. The directory + cannot be reused elsewhere to store other files, e.g. by the + `BackupAndRestore` callback of another training run, + or by another callback + (e.g. `ModelCheckpoint`) of the same training. + save_freq: `'epoch'`, integer, or `False`. When set to `'epoch'` + the callback saves the checkpoint at the end of each epoch. + When set to an integer, the callback saves the checkpoint every + `save_freq` batches. Set `save_freq` to `False` if only using + preemption checkpointing (with `save_before_preemption=True`). + delete_checkpoint: Boolean, default to True. This `BackupAndRestore` + callback works by saving a checkpoint to back up the training state. + If `delete_checkpoint=True`, the checkpoint will be deleted after + training is finished. Use `False` if you'd like to keep the checkpoint + for future usage. + save_before_preemption: A boolean value instructing whether to turn on + the automatic checkpoint saving for preemption/maintenance events. + This only supports + `tf.distribute.MultiWorkerMirroredStrategy` on Google Cloud Platform + or Google Borg for now. """ - @doc_controls.for_subclass_implementers - def on_train_begin(self, logs=None): - """Called at the beginning of training. - - Subclasses should override for any actions to run. 
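A small configuration sketch for the arguments above; the directory is illustrative and, per the docstring, must not be shared with any other callback or run:

```python
import tensorflow as tf

backup_cb = tf.keras.callbacks.BackupAndRestore(
    backup_dir="/tmp/train_backup",
    save_freq=100,            # checkpoint every 100 batches
    delete_checkpoint=False,  # keep the backup after training finishes
)
# model.fit(..., callbacks=[backup_cb])  # assuming a compiled `model`
```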
+ def __init__( + self, + backup_dir, + save_freq="epoch", + delete_checkpoint=True, + save_before_preemption=False, + ): + super().__init__() + self.backup_dir = backup_dir + self._supports_tf_logs = True + self._supported_strategies = ( + tf.distribute.MirroredStrategy, + tf.distribute.MultiWorkerMirroredStrategy, + tf.distribute.experimental.TPUStrategy, + tf.distribute.TPUStrategy, + tf.distribute.experimental.ParameterServerStrategy, + ) + self.save_freq = save_freq + self.delete_checkpoint = delete_checkpoint + self.save_before_preemption = save_before_preemption + self._batches_count = 0 + self._current_epoch = 0 + + if not tf.executing_eagerly(): + if tf.inside_function(): + raise ValueError( + "This Callback's method contains Python state and " + "should be called outside of `tf.function`s." + ) + else: # Legacy graph mode: + raise ValueError( + "BackupAndRestore only supports eager mode. In graph " + "mode, consider using ModelCheckpoint to manually save " + "and restore weights with `model.load_weights()` and by " + "providing `initial_epoch` in `model.fit()` for fault " + "tolerance." + ) + if (not save_freq) and (not save_before_preemption): + raise ValueError( + "Either `save_freq` or `save_before_preemption` " "must be set." + ) + + # Only the chief worker writes model checkpoints, but all workers + # restore checkpoint at on_train_begin(). + self._chief_worker_only = False + + def on_train_begin(self, logs=None): + # TrainingState is used to manage the training state needed for + # failure-recovery of a worker in training. + + if self.model._distribution_strategy and not isinstance( + self.model.distribute_strategy, self._supported_strategies + ): + raise NotImplementedError( + f"{type(self.model.distribute_strategy)} is not supported yet. " + "Currently BackupAndRestore callback " + "only supports empty strategy, " + "MirroredStrategy, MultiWorkerMirroredStrategy and TPUStrategy." + ) + self.model._training_state = worker_training_state.WorkerTrainingState( + self.model, + self.backup_dir, + self.save_freq, + self.save_before_preemption, + ) + self._training_state = self.model._training_state + self._training_state.restore() + + def on_train_batch_begin(self, batch, logs=None): + # Skip batch update for PSS Strategy + if isinstance( + self.model.distribute_strategy, + tf.distribute.ParameterServerStrategy, + ): + return + self._training_state._ckpt_saved_batch.assign(batch) + + def on_train_batch_end(self, batch, logs=None): + # Skip batch update for PSS Strategy + if isinstance( + self.model.distribute_strategy, + tf.distribute.ParameterServerStrategy, + ): + return + self._training_state.backup_if_preempted() + if self.save_freq and self.save_freq != "epoch": + self._batches_count += 1 + if self._batches_count >= self.save_freq: + self._batches_count = 0 + self._backup(epoch=self._current_epoch, batch=batch) + + def _implements_train_batch_hooks(self): + return self.save_freq != "epoch" + + def on_train_end(self, logs=None): + if self.delete_checkpoint: + # On exit of training, delete the training state backup file saved + # for the purpose of worker recovery unless the user opts out. + self._training_state.delete_backup() + # Clean up the training state. + del self._training_state + del self.model._training_state + + def on_epoch_begin(self, epoch, logs=None): + self._training_state._ckpt_saved_epoch.assign(epoch) + self._current_epoch = epoch + + def on_epoch_end(self, epoch, logs=None): + # Back up the model and current epoch for possible future recovery. 
+ if self.save_freq == "epoch": + self._backup(epoch=epoch) + + def _backup(self, epoch, batch=0): + self._training_state.back_up(epoch=epoch, batch=batch) + + +@keras_export("keras.callbacks.experimental.BackupAndRestore", v1=[]) +@deprecation.deprecated_endpoints( + "keras.callbacks.experimental.BackupAndRestore" +) +class BackupAndRestoreExperimental(BackupAndRestore): + """Deprecated. Please use `tf.keras.callbacks.BackupAndRestore` instead. - Args: - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. + Caution: `tf.keras.callbacks.experimental.BackupAndRestore` endpoint is + deprecated and will be removed in a future release. Please use + `tf.keras.callbacks.BackupAndRestore`. """ - @doc_controls.for_subclass_implementers - def on_train_end(self, logs=None): - """Called at the end of training. + def __init__(self, *args, **kwargs): + logging.warning( + "`tf.keras.callbacks.experimental.BackupAndRestore` endpoint is " + "deprecated and will be removed in a future release. Please use " + "`tf.keras.callbacks.BackupAndRestore`." + ) + super().__init__(*args, **kwargs) - Subclasses should override for any actions to run. - Args: - logs: Dict. Currently the output of the last call to `on_epoch_end()` - is passed to this argument for this method but that may change in - the future. - """ +@keras_export("keras.callbacks.EarlyStopping") +class EarlyStopping(Callback): + """Stop training when a monitored metric has stopped improving. - @doc_controls.for_subclass_implementers - def on_test_begin(self, logs=None): - """Called at the beginning of evaluation or validation. + Assuming the goal of a training is to minimize the loss. With this, the + metric to be monitored would be `'loss'`, and mode would be `'min'`. A + `model.fit()` training loop will check at end of every epoch whether + the loss is no longer decreasing, considering the `min_delta` and + `patience` if applicable. Once it's found no longer decreasing, + `model.stop_training` is marked True and the training terminates. - Subclasses should override for any actions to run. + The quantity to be monitored needs to be available in `logs` dict. + To make it so, pass the loss or metrics at `model.compile()`. Args: - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. + monitor: Quantity to be monitored. + min_delta: Minimum change in the monitored quantity + to qualify as an improvement, i.e. an absolute + change of less than min_delta, will count as no + improvement. + patience: Number of epochs with no improvement + after which training will be stopped. + verbose: Verbosity mode, 0 or 1. Mode 0 is silent, and mode 1 + displays messages when the callback takes an action. + mode: One of `{"auto", "min", "max"}`. In `min` mode, + training will stop when the quantity + monitored has stopped decreasing; in `"max"` + mode it will stop when the quantity + monitored has stopped increasing; in `"auto"` + mode, the direction is automatically inferred + from the name of the monitored quantity. + baseline: Baseline value for the monitored quantity. + Training will stop if the model doesn't show improvement over the + baseline. + restore_best_weights: Whether to restore model weights from + the epoch with the best value of the monitored quantity. + If False, the model weights obtained at the last step of + training are used. An epoch will be restored regardless + of the performance relative to the `baseline`. 
If no epoch + improves on `baseline`, training will run for `patience` + epochs and restore weights from the best epoch in that set. + start_from_epoch: Number of epochs to wait before starting + to monitor improvement. This allows for a warm-up period in which + no improvement is expected and thus training will not be stopped. + + + Example: + + >>> callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3) + >>> # This callback will stop the training when there is no improvement in + >>> # the loss for three consecutive epochs. + >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') + >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), + ... epochs=10, batch_size=1, callbacks=[callback], + ... verbose=0) + >>> len(history.history['loss']) # Only 4 epochs are run. + 4 """ - @doc_controls.for_subclass_implementers - def on_test_end(self, logs=None): - """Called at the end of evaluation or validation. + def __init__( + self, + monitor="val_loss", + min_delta=0, + patience=0, + verbose=0, + mode="auto", + baseline=None, + restore_best_weights=False, + start_from_epoch=0, + ): + super().__init__() + + self.monitor = monitor + self.patience = patience + self.verbose = verbose + self.baseline = baseline + self.min_delta = abs(min_delta) + self.wait = 0 + self.stopped_epoch = 0 + self.restore_best_weights = restore_best_weights + self.best_weights = None + self.start_from_epoch = start_from_epoch + + if mode not in ["auto", "min", "max"]: + logging.warning( + "EarlyStopping mode %s is unknown, fallback to auto mode.", + mode, + ) + mode = "auto" + + if mode == "min": + self.monitor_op = np.less + elif mode == "max": + self.monitor_op = np.greater + else: + if ( + self.monitor.endswith("acc") + or self.monitor.endswith("accuracy") + or self.monitor.endswith("auc") + ): + self.monitor_op = np.greater + else: + self.monitor_op = np.less - Subclasses should override for any actions to run. + if self.monitor_op == np.greater: + self.min_delta *= 1 + else: + self.min_delta *= -1 - Args: - logs: Dict. Currently the output of the last call to - `on_test_batch_end()` is passed to this argument for this method - but that may change in the future. - """ + def on_train_begin(self, logs=None): + # Allow instances to be re-used + self.wait = 0 + self.stopped_epoch = 0 + self.best = np.Inf if self.monitor_op == np.less else -np.Inf + self.best_weights = None + self.best_epoch = 0 + + def on_epoch_end(self, epoch, logs=None): + current = self.get_monitor_value(logs) + if current is None or epoch < self.start_from_epoch: + # If no monitor value exists or still in initial warm-up stage. + return + if self.restore_best_weights and self.best_weights is None: + # Restore the weights after first epoch if no progress is ever made. + self.best_weights = self.model.get_weights() - @doc_controls.for_subclass_implementers - def on_predict_begin(self, logs=None): - """Called at the beginning of prediction. + self.wait += 1 + if self._is_improvement(current, self.best): + self.best = current + self.best_epoch = epoch + if self.restore_best_weights: + self.best_weights = self.model.get_weights() + # Only restart wait if we beat both the baseline and our previous + # best. + if self.baseline is None or self._is_improvement( + current, self.baseline + ): + self.wait = 0 + return + + # Only check after the first epoch. 
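The sign flip applied to `min_delta` in `__init__` above reduces `_is_improvement` to a single comparison; a numeric check for `mode="min"` with illustrative values:

```python
import numpy as np

monitor_op = np.less   # mode="min"
min_delta = -0.05      # 0.05 after the sign flip in __init__
best = 0.80

for current in [0.78, 0.74]:
    print(current, bool(monitor_op(current - min_delta, best)))
# 0.78 False  (only 0.02 below best: within min_delta, no improvement)
# 0.74 True   (0.06 below best: clears min_delta)
```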
+ if self.wait >= self.patience and epoch > 0: + self.stopped_epoch = epoch + self.model.stop_training = True + if self.restore_best_weights and self.best_weights is not None: + if self.verbose > 0: + io_utils.print_msg( + "Restoring model weights from " + "the end of the best epoch: " + f"{self.best_epoch + 1}." + ) + self.model.set_weights(self.best_weights) + + def on_train_end(self, logs=None): + if self.stopped_epoch > 0 and self.verbose > 0: + io_utils.print_msg( + f"Epoch {self.stopped_epoch + 1}: early stopping" + ) + + def get_monitor_value(self, logs): + logs = logs or {} + monitor_value = logs.get(self.monitor) + if monitor_value is None: + logging.warning( + "Early stopping conditioned on metric `%s` " + "which is not available. Available metrics are: %s", + self.monitor, + ",".join(list(logs.keys())), + ) + return monitor_value + + def _is_improvement(self, monitor_value, reference_value): + return self.monitor_op(monitor_value - self.min_delta, reference_value) + + +@keras_export("keras.callbacks.RemoteMonitor") +class RemoteMonitor(Callback): + """Callback used to stream events to a server. - Subclasses should override for any actions to run. + Requires the `requests` library. + Events are sent to `root + '/publish/epoch/end/'` by default. Calls are + HTTP POST, with a `data` argument which is a + JSON-encoded dictionary of event data. + If `send_as_json=True`, the content type of the request will be + `"application/json"`. + Otherwise the serialized JSON will be sent within a form. Args: - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. + root: String; root url of the target server. + path: String; path relative to `root` to which the events will be sent. + field: String; JSON field under which the data will be stored. + The field is used only if the payload is sent within a form + (i.e. send_as_json is set to False). + headers: Dictionary; optional custom HTTP headers. + send_as_json: Boolean; whether the request should be + sent as `"application/json"`. """ - @doc_controls.for_subclass_implementers - def on_predict_end(self, logs=None): - """Called at the end of prediction. + def __init__( + self, + root="http://localhost:9000", + path="/publish/epoch/end/", + field="data", + headers=None, + send_as_json=False, + ): + super().__init__() + + self.root = root + self.path = path + self.field = field + self.headers = headers + self.send_as_json = send_as_json + + def on_epoch_end(self, epoch, logs=None): + if requests is None: + raise ImportError("RemoteMonitor requires the `requests` library.") + logs = logs or {} + send = {} + send["epoch"] = epoch + for k, v in logs.items(): + # np.ndarray and np.generic are not scalar types + # therefore we must unwrap their scalar values and + # pass to the json-serializable dict 'send' + if isinstance(v, (np.ndarray, np.generic)): + send[k] = v.item() + else: + send[k] = v + try: + if self.send_as_json: + requests.post( + self.root + self.path, json=send, headers=self.headers + ) + else: + requests.post( + self.root + self.path, + {self.field: json.dumps(send)}, + headers=self.headers, + ) + except requests.exceptions.RequestException: + logging.warning( + "Warning: could not reach RemoteMonitor root server at " + + str(self.root) + ) + + +@keras_export("keras.callbacks.LearningRateScheduler") +class LearningRateScheduler(Callback): + """Learning rate scheduler. - Subclasses should override for any actions to run. 
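The reformatted `EarlyStopping` above also gains a `start_from_epoch` argument (see the `Args` list and the warm-up check in `on_epoch_end`). Below is a minimal sketch of how the new argument combines with `restore_best_weights`; the model and random data are illustrative placeholders, not part of this diff:

```python
import numpy as np
import tensorflow as tf

# Warm-up epochs are ignored for early stopping, and the best weights
# seen after the warm-up are restored when training stops.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-3,         # smaller changes count as "no improvement"
    patience=5,             # stop after 5 epochs without improvement
    start_from_epoch=3,     # skip the first 3 warm-up epochs entirely
    restore_best_weights=True,
)

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer="sgd", loss="mse")
x, y = np.random.rand(64, 4), np.random.rand(64, 1)
model.fit(x, y, validation_split=0.25, epochs=50,
          callbacks=[early_stop], verbose=0)
```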
+ At the beginning of every epoch, this callback gets the updated learning + rate value from `schedule` function provided at `__init__`, with the current + epoch and current learning rate, and applies the updated learning rate on + the optimizer. Args: - logs: Dict. Currently no data is passed to this argument for this method - but that may change in the future. - """ - - def _implements_train_batch_hooks(self): - """Determines if this Callback should be called for each train batch.""" - return (not generic_utils.is_default(self.on_batch_begin) or - not generic_utils.is_default(self.on_batch_end) or - not generic_utils.is_default(self.on_train_batch_begin) or - not generic_utils.is_default(self.on_train_batch_end)) - - def _implements_test_batch_hooks(self): - """Determines if this Callback should be called for each test batch.""" - return (not generic_utils.is_default(self.on_test_batch_begin) or - not generic_utils.is_default(self.on_test_batch_end)) - - def _implements_predict_batch_hooks(self): - """Determines if this Callback should be called for each predict batch.""" - return (not generic_utils.is_default(self.on_predict_batch_begin) or - not generic_utils.is_default(self.on_predict_batch_end)) - - -@keras_export('keras.callbacks.BaseLogger') -class BaseLogger(Callback): - """Callback that accumulates epoch averages of metrics. - - This callback is automatically applied to every Keras model. - - Args: - stateful_metrics: Iterable of string names of metrics that - should *not* be averaged over an epoch. - Metrics in this list will be logged as-is in `on_epoch_end`. - All others will be averaged in `on_epoch_end`. - """ - - def __init__(self, stateful_metrics=None): - super().__init__() - self.stateful_metrics = set(stateful_metrics or []) - - def on_epoch_begin(self, epoch, logs=None): - self.seen = 0 - self.totals = {} - - def on_batch_end(self, batch, logs=None): - logs = logs or {} - batch_size = logs.get('size', 0) - # In case of distribution strategy we can potentially run multiple steps - # at the same time, we should account for that in the `seen` calculation. - num_steps = logs.get('num_steps', 1) - self.seen += batch_size * num_steps - - for k, v in logs.items(): - if k in self.stateful_metrics: - self.totals[k] = v - else: - if k in self.totals: - self.totals[k] += v * batch_size - else: - self.totals[k] = v * batch_size - - def on_epoch_end(self, epoch, logs=None): - if logs is not None: - for k in self.params['metrics']: - if k in self.totals: - # Make value available to next callbacks. - if k in self.stateful_metrics: - logs[k] = self.totals[k] - else: - logs[k] = self.totals[k] / self.seen + schedule: a function that takes an epoch index (integer, indexed from 0) + and current learning rate (float) as inputs and returns a new + learning rate as output (float). + verbose: int. 0: quiet, 1: update messages. + Example: + + >>> # This function keeps the initial learning rate for the first ten epochs + >>> # and decreases it exponentially after that. + >>> def scheduler(epoch, lr): + ... if epoch < 10: + ... return lr + ... else: + ... return lr * tf.math.exp(-0.1) + >>> + >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') + >>> round(model.optimizer.lr.numpy(), 5) + 0.01 + + >>> callback = tf.keras.callbacks.LearningRateScheduler(scheduler) + >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), + ... 
epochs=15, callbacks=[callback], verbose=0) + >>> round(model.optimizer.lr.numpy(), 5) + 0.00607 -@keras_export('keras.callbacks.TerminateOnNaN') -class TerminateOnNaN(Callback): - """Callback that terminates training when a NaN loss is encountered. - """ + """ - def __init__(self): - super().__init__() - self._supports_tf_logs = True + def __init__(self, schedule, verbose=0): + super().__init__() + self.schedule = schedule + self.verbose = verbose + + def on_epoch_begin(self, epoch, logs=None): + if not hasattr(self.model.optimizer, "lr"): + raise ValueError('Optimizer must have a "lr" attribute.') + try: # new API + lr = float(backend.get_value(self.model.optimizer.lr)) + lr = self.schedule(epoch, lr) + except TypeError: # Support for old API for backward compatibility + lr = self.schedule(epoch) + if not isinstance(lr, (tf.Tensor, float, np.float32, np.float64)): + raise ValueError( + 'The output of the "schedule" function ' + f"should be float. Got: {lr}" + ) + if isinstance(lr, tf.Tensor) and not lr.dtype.is_floating: + raise ValueError( + f"The dtype of `lr` Tensor should be float. Got: {lr.dtype}" + ) + backend.set_value(self.model.optimizer.lr, backend.get_value(lr)) + if self.verbose > 0: + io_utils.print_msg( + f"\nEpoch {epoch + 1}: LearningRateScheduler setting learning " + f"rate to {lr}." + ) - def on_batch_end(self, batch, logs=None): - logs = logs or {} - loss = logs.get('loss') - if loss is not None: - loss = tf_utils.sync_to_numpy_or_python_type(loss) - if np.isnan(loss) or np.isinf(loss): - io_utils.print_msg(f'Batch {batch}: Invalid loss, terminating training') - self.model.stop_training = True + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + logs["lr"] = backend.get_value(self.model.optimizer.lr) -@keras_export('keras.callbacks.ProgbarLogger') -class ProgbarLogger(Callback): - """Callback that prints metrics to stdout. - - Args: - count_mode: One of `"steps"` or `"samples"`. - Whether the progress bar should - count samples seen or steps (batches) seen. - stateful_metrics: Iterable of string names of metrics that - should *not* be averaged over an epoch. - Metrics in this list will be logged as-is. - All others will be averaged over time (e.g. loss, etc). - If not provided, defaults to the `Model`'s metrics. - - Raises: - ValueError: In case of invalid `count_mode`. - """ - - def __init__(self, count_mode='samples', stateful_metrics=None): - super().__init__() - self._supports_tf_logs = True - if count_mode == 'samples': - self.use_steps = False - elif count_mode == 'steps': - self.use_steps = True - else: - raise ValueError( - f'Unknown `count_mode`: {count_mode}. ' - 'Expected values are ["samples", "steps"]') - # Defaults to all Model's metrics except for loss. - self.stateful_metrics = set(stateful_metrics) if stateful_metrics else set() - - self.seen = 0 - self.progbar = None - self.target = None - self.verbose = 1 - self.epochs = 1 - - self._train_step, self._test_step, self._predict_step = None, None, None - self._call_batch_hooks = True - - self._called_in_fit = False - - def set_params(self, params): - self.verbose = params['verbose'] - self.epochs = params['epochs'] - if self.use_steps and 'steps' in params: - self.target = params['steps'] - elif not self.use_steps and 'samples' in params: - self.target = params['samples'] - else: - self.target = None # Will be inferred at the end of the first epoch. 
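As the `try`/`except TypeError` in the new `on_epoch_begin` above shows, `LearningRateScheduler` accepts both the current two-argument schedule and the legacy one-argument form. A small sketch of both signatures (the schedule functions themselves are made up for illustration):

```python
import tensorflow as tf

# Current API: the schedule receives (epoch, lr) and returns the new rate.
def two_arg_schedule(epoch, lr):
    return lr if epoch < 10 else lr * tf.math.exp(-0.1)

# Legacy API, still accepted via the TypeError fallback: epoch index only.
def one_arg_schedule(epoch):
    return 1e-2 * 0.9**epoch

callback = tf.keras.callbacks.LearningRateScheduler(two_arg_schedule,
                                                    verbose=1)
```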
-
-    self._call_batch_hooks = self.verbose == 1
-    if self.target is None:
-      try:
-        self._train_step = self.model._train_counter  # pylint: disable=protected-access
-        self._test_step = self.model._test_counter  # pylint: disable=protected-access
-        self._predict_step = self.model._predict_counter  # pylint: disable=protected-access
-      except AttributeError:
-        self._call_batch_hooks = True
+def keras_model_summary(name, data, step=None):
+    """Writes a Keras model as JSON to a Summary.
 
-  def on_train_begin(self, logs=None):
-    # When this logger is called inside `fit`, validation is silent.
-    self._called_in_fit = True
-
-  def on_test_begin(self, logs=None):
-    if not self._called_in_fit:
-      self._reset_progbar()
-      self._maybe_init_progbar()
-
-  def on_predict_begin(self, logs=None):
-    self._reset_progbar()
-    self._maybe_init_progbar()
-
-  def on_epoch_begin(self, epoch, logs=None):
-    self._reset_progbar()
-    self._maybe_init_progbar()
-    if self.verbose and self.epochs > 1:
-      io_utils.print_msg(f'Epoch {epoch + 1}/{self.epochs}')
-
-  def on_train_batch_end(self, batch, logs=None):
-    self._batch_update_progbar(batch, logs)
-
-  def on_test_batch_end(self, batch, logs=None):
-    if not self._called_in_fit:
-      self._batch_update_progbar(batch, logs)
-
-  def on_predict_batch_end(self, batch, logs=None):
-    # Don't pass prediction results.
-    self._batch_update_progbar(batch, None)
-
-  def on_epoch_end(self, epoch, logs=None):
-    self._finalize_progbar(logs, self._train_step)
-
-  def on_test_end(self, logs=None):
-    if not self._called_in_fit:
-      self._finalize_progbar(logs, self._test_step)
-
-  def on_predict_end(self, logs=None):
-    self._finalize_progbar(logs, self._predict_step)
-
-  def _reset_progbar(self):
-    self.seen = 0
-    self.progbar = None
-
-  def _maybe_init_progbar(self):
-    """Instantiate a `Progbar` if not yet, and update the stateful metrics."""
-    # TODO(rchao): Legacy TF1 code path may use list for
-    # `self.stateful_metrics`. Remove "cast to set" when TF1 support is dropped.
-    self.stateful_metrics = set(self.stateful_metrics)
-
-    if self.model:
-      # Update the existing stateful metrics as `self.model.metrics` may contain
-      # updated metrics after `MetricsContainer` is built in the first train
-      # step.
-      self.stateful_metrics = self.stateful_metrics.union(
-          set(m.name for m in self.model.metrics))
-
-    if self.progbar is None:
-      self.progbar = Progbar(
-          target=self.target,
-          verbose=self.verbose,
-          stateful_metrics=self.stateful_metrics,
-          unit_name='step' if self.use_steps else 'sample')
-
-    self.progbar._update_stateful_metrics(self.stateful_metrics)  # pylint: disable=protected-access
-
-  def _implements_train_batch_hooks(self):
-    return self._call_batch_hooks
-
-  def _implements_test_batch_hooks(self):
-    return self._call_batch_hooks
-
-  def _implements_predict_batch_hooks(self):
-    return self._call_batch_hooks
-
-  def _batch_update_progbar(self, batch, logs=None):
-    """Updates the progbar."""
-    logs = logs or {}
-    self._maybe_init_progbar()
-    if self.use_steps:
-      self.seen = batch + 1  # One-indexed.
-    else:
-      # v1 path only.
-      logs = copy.copy(logs)
-      batch_size = logs.pop('size', 0)
-      num_steps = logs.pop('num_steps', 1)
-      logs.pop('batch', None)
-      add_seen = num_steps * batch_size
-      self.seen += add_seen
-
-    if self.verbose == 1:
-      # Only block async when verbose = 1.
-      logs = tf_utils.sync_to_numpy_or_python_type(logs)
-      self.progbar.update(self.seen, list(logs.items()), finalize=False)
-
-  def _finalize_progbar(self, logs, counter):
-    logs = tf_utils.sync_to_numpy_or_python_type(logs or {})
-    if self.target is None:
-      if counter is not None:
-        counter = counter.numpy()
-        if not self.use_steps:
-          counter *= logs.get('size', 1)
-      self.target = counter or self.seen
-      self.progbar.target = self.target
-    self.progbar.update(self.target, list(logs.items()), finalize=True)
-
-
-@keras_export('keras.callbacks.History')
-class History(Callback):
-  """Callback that records events into a `History` object.
+    Writing the Keras model configuration allows the TensorBoard graph plugin
+    to render a conceptual graph, as opposed to a graph of ops. If the model
+    fails to serialize as JSON, the summary is skipped and False is returned.
 
-  This callback is automatically applied to
-  every Keras model. The `History` object
-  gets returned by the `fit` method of models.
+    Args:
+        name: A name for this summary. The summary tag used for TensorBoard will
+            be this name prefixed by any active name scopes.
+        data: A Keras Model to write.
+        step: Explicit `int64`-castable monotonic step value for this summary. If
+            omitted, this defaults to `tf.summary.experimental.get_step()`, which
+            must not be None.
 
-  Example:
+    Returns:
+        True on success, or False if no summary was written because no default
+        summary writer was available.
 
-  >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)])
-  >>> model.compile(tf.keras.optimizers.SGD(), loss='mse')
-  >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5),
-  ...                     epochs=10, verbose=1)
-  >>> print(history.params)
-  {'verbose': 1, 'epochs': 10, 'steps': 1}
-  >>> # check the keys of history object
-  >>> print(history.history.keys())
-  dict_keys(['loss'])
+    Raises:
+        ValueError: if a default writer exists, but no step was provided and
+            `tf.summary.experimental.get_step()` is None.
+    """
+    summary_metadata = tf.compat.v1.SummaryMetadata()
+    # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+    # the rationale.
+    summary_metadata.plugin_data.plugin_name = "graph_keras_model"
+    # version number = 1
+    summary_metadata.plugin_data.content = b"1"
 
-  """
+    try:
+        json_string = data.to_json()
+    except Exception as exc:
+        # An exception should not break the model code.
+        logging.warning(
+            "Model failed to serialize as JSON. Ignoring... %s", exc
+        )
+        return False
+
+    with tf.summary.experimental.summary_scope(
+        name, "graph_keras_model", [data, step]
+    ) as (tag, _):
+        with tf.device("cpu:0"):
+            tensor = tf.constant(json_string, dtype=tf.string)
+        return tf.summary.write(
+            tag=tag, tensor=tensor, step=step, metadata=summary_metadata
+        )
+
+
+@keras_export("keras.callbacks.TensorBoard", v1=[])
+class TensorBoard(Callback, version_utils.TensorBoardVersionSelector):
 
-  def __init__(self):
-    super().__init__()
-    self.history = {}
+    """Enable visualizations for TensorBoard.
 
-  def on_train_begin(self, logs=None):
-    self.epoch = []
+    TensorBoard is a visualization tool provided with TensorFlow.
 
-  def on_epoch_end(self, epoch, logs=None):
-    logs = logs or {}
-    self.epoch.append(epoch)
-    for k, v in logs.items():
-      self.history.setdefault(k, []).append(v)
+    This callback logs events for TensorBoard, including:
 
-    # Set the history attribute on the model after the epoch ends. This will
-    # make sure that the state which is set is the latest one.
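`keras_model_summary` needs an active default summary writer, and an explicit `step` when no default step has been set. A hedged sketch of calling it directly follows; the import path is an assumption, since the diff only shows the module-level definition:

```python
import tensorflow as tf
from keras import callbacks  # assumed import path for this module

model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
writer = tf.summary.create_file_writer("./logs/conceptual_graph")
with writer.as_default():
    # Returns False (with a warning) if the model cannot be serialized
    # as JSON; `step` is passed explicitly since no default step is set.
    wrote = callbacks.keras_model_summary("my_model", model, step=0)
```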
- self.model.history = self + * Metrics summary plots + * Training graph visualization + * Weight histograms + * Sampled profiling + When used in `Model.evaluate` or regular validation + ([on_test_end](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/Callback#on_test_end)), + in addition to epoch summaries, there will be a summary that records + evaluation metrics vs `Model.optimizer.iterations` written. The metric names + will be prepended with `evaluation`, with `Model.optimizer.iterations` being + the step in the visualized TensorBoard. -@keras_export('keras.callbacks.ModelCheckpoint') -class ModelCheckpoint(Callback): - """Callback to save the Keras model or model weights at some frequency. - - `ModelCheckpoint` callback is used in conjunction with training using - `model.fit()` to save a model or weights (in a checkpoint file) at some - interval, so the model or weights can be loaded later to continue the training - from the state saved. - - A few options this callback provides include: - - - Whether to only keep the model that has achieved the "best performance" so - far, or whether to save the model at the end of every epoch regardless of - performance. - - Definition of 'best'; which quantity to monitor and whether it should be - maximized or minimized. - - The frequency it should save at. Currently, the callback supports saving at - the end of every epoch, or after a fixed number of training batches. - - Whether only weights are saved, or the whole model is saved. - - Note: If you get `WARNING:tensorflow:Can save best model only with - available, skipping` see the description of the `monitor` argument for - details on how to get this right. - - Example: - - ```python - model.compile(loss=..., optimizer=..., - metrics=['accuracy']) - - EPOCHS = 10 - checkpoint_filepath = '/tmp/checkpoint' - model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( - filepath=checkpoint_filepath, - save_weights_only=True, - monitor='val_accuracy', - mode='max', - save_best_only=True) - - # Model weights are saved at the end of every epoch, if it's the best seen - # so far. - model.fit(epochs=EPOCHS, callbacks=[model_checkpoint_callback]) - - # The model weights (that are considered the best) are loaded into the model. - model.load_weights(checkpoint_filepath) - ``` - - Args: - filepath: string or `PathLike`, path to save the model file. e.g. - filepath = os.path.join(working_dir, 'ckpt', file_name). `filepath` - can contain named formatting options, which will be filled the value of - `epoch` and keys in `logs` (passed in `on_epoch_end`). For example: if - `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`, then the model - checkpoints will be saved with the epoch number and the validation loss - in the filename. The directory of the filepath should not be reused by - any other callbacks to avoid conflicts. - monitor: The metric name to monitor. Typically the metrics are set by the - `Model.compile` method. Note: - - * Prefix the name with `"val_`" to monitor validation metrics. - * Use `"loss"` or "`val_loss`" to monitor the model's total loss. - * If you specify metrics as strings, like `"accuracy"`, pass the same - string (with or without the `"val_"` prefix). - * If you pass `metrics.Metric` objects, `monitor` should be set to - `metric.name` - * If you're not sure about the metric names you can check the contents - of the `history.history` dictionary returned by - `history = model.fit()` - * Multi-output models set additional prefixes on the metric names. 
+ If you have installed TensorFlow with pip, you should be able + to launch TensorBoard from the command line: - verbose: Verbosity mode, 0 or 1. Mode 0 is silent, and mode 1 - displays messages when the callback takes an action. - save_best_only: if `save_best_only=True`, it only saves when the model - is considered the "best" and the latest best model according to the - quantity monitored will not be overwritten. If `filepath` doesn't - contain formatting options like `{epoch}` then `filepath` will be - overwritten by each new better model. - mode: one of {'auto', 'min', 'max'}. If `save_best_only=True`, the - decision to overwrite the current save file is made based on either - the maximization or the minimization of the monitored quantity. - For `val_acc`, this should be `max`, for `val_loss` this should be - `min`, etc. In `auto` mode, the mode is set to `max` if the quantities - monitored are 'acc' or start with 'fmeasure' and are set to `min` for - the rest of the quantities. - save_weights_only: if True, then only the model's weights will be saved - (`model.save_weights(filepath)`), else the full model is saved - (`model.save(filepath)`). - save_freq: `'epoch'` or integer. When using `'epoch'`, the callback saves - the model after each epoch. When using integer, the callback saves the - model at end of this many batches. If the `Model` is compiled with - `steps_per_execution=N`, then the saving criteria will be - checked every Nth batch. Note that if the saving isn't aligned to - epochs, the monitored metric may potentially be less reliable (it - could reflect as little as 1 batch, since the metrics get reset every - epoch). Defaults to `'epoch'`. - options: Optional `tf.train.CheckpointOptions` object if - `save_weights_only` is true or optional `tf.saved_model.SaveOptions` - object if `save_weights_only` is false. - initial_value_threshold: Floating point initial "best" value of the metric - to be monitored. Only applies if `save_best_value=True`. Only overwrites - the model weights already saved if the performance of current - model is better than this value. - **kwargs: Additional arguments for backwards compatibility. Possible key - is `period`. - """ - - def __init__(self, - filepath, - monitor='val_loss', - verbose=0, - save_best_only=False, - save_weights_only=False, - mode='auto', - save_freq='epoch', - options=None, - initial_value_threshold=None, - **kwargs): - super().__init__() - self._supports_tf_logs = True - self.monitor = monitor - self.verbose = verbose - self.filepath = io_utils.path_to_string(filepath) - self.save_best_only = save_best_only - self.save_weights_only = save_weights_only - self.save_freq = save_freq - self.epochs_since_last_save = 0 - self._batches_seen_since_last_saving = 0 - self._last_batch_seen = 0 - self.best = initial_value_threshold - - if save_weights_only: - if options is None or isinstance( - options, tf.train.CheckpointOptions): - self._options = options or tf.train.CheckpointOptions() - else: - raise TypeError( - 'If save_weights_only is True, then `options` must be ' - f'either None or a tf.train.CheckpointOptions. Got {options}.') - else: - if options is None or isinstance(options, tf.saved_model.SaveOptions): - self._options = options or tf.saved_model.SaveOptions() - else: - raise TypeError( - 'If save_weights_only is False, then `options` must be ' - f'either None or a tf.saved_model.SaveOptions. 
Got {options}.') - - # Deprecated field `load_weights_on_restart` is for loading the checkpoint - # file from `filepath` at the start of `model.fit()` - # TODO(rchao): Remove the arg during next breaking release. - if 'load_weights_on_restart' in kwargs: - self.load_weights_on_restart = kwargs['load_weights_on_restart'] - logging.warning('`load_weights_on_restart` argument is deprecated. ' - 'Please use `model.load_weights()` for loading weights ' - 'before the start of `model.fit()`.') - else: - self.load_weights_on_restart = False - - # Deprecated field `period` is for the number of epochs between which - # the model is saved. - if 'period' in kwargs: - self.period = kwargs['period'] - logging.warning('`period` argument is deprecated. Please use `save_freq` ' - 'to specify the frequency in number of batches seen.') - else: - self.period = 1 - - if mode not in ['auto', 'min', 'max']: - logging.warning('ModelCheckpoint mode %s is unknown, ' - 'fallback to auto mode.', mode) - mode = 'auto' - - if mode == 'min': - self.monitor_op = np.less - if self.best is None: - self.best = np.Inf - elif mode == 'max': - self.monitor_op = np.greater - if self.best is None: - self.best = -np.Inf - else: - if 'acc' in self.monitor or self.monitor.startswith('fmeasure'): - self.monitor_op = np.greater - if self.best is None: - self.best = -np.Inf - else: - self.monitor_op = np.less - if self.best is None: - self.best = np.Inf - - if self.save_freq != 'epoch' and not isinstance(self.save_freq, int): - raise ValueError( - f'Unrecognized save_freq: {self.save_freq}. ' - 'Expected save_freq are "epoch" or integer') - - # Only the chief worker writes model checkpoints, but all workers - # restore checkpoint at on_train_begin(). - self._chief_worker_only = False - - def on_train_begin(self, logs=None): - if self.load_weights_on_restart: - filepath_to_load = ( - self._get_most_recently_modified_file_matching_pattern(self.filepath)) - if (filepath_to_load is not None and - self._checkpoint_exists(filepath_to_load)): - try: - # `filepath` may contain placeholders such as `{epoch:02d}`, and - # thus it attempts to load the most recently modified file with file - # name matching the pattern. - self.model.load_weights(filepath_to_load) - except (IOError, ValueError) as e: - raise ValueError( - f'Error loading file from {filepath_to_load}. Reason: {e}') - - def _implements_train_batch_hooks(self): - # Only call batch hooks when saving on batch - return self.save_freq != 'epoch' - - def on_train_batch_end(self, batch, logs=None): - if self._should_save_on_batch(batch): - self._save_model(epoch=self._current_epoch, batch=batch, logs=logs) - - def on_epoch_begin(self, epoch, logs=None): - self._current_epoch = epoch - - def on_epoch_end(self, epoch, logs=None): - self.epochs_since_last_save += 1 - # pylint: disable=protected-access - if self.save_freq == 'epoch': - self._save_model(epoch=epoch, batch=None, logs=logs) - - def _should_save_on_batch(self, batch): - """Handles batch-level saving logic, supports steps_per_execution.""" - if self.save_freq == 'epoch': - return False - - if batch <= self._last_batch_seen: # New epoch. - add_batches = batch + 1 # batches are zero-indexed. 
- else: - add_batches = batch - self._last_batch_seen - self._batches_seen_since_last_saving += add_batches - self._last_batch_seen = batch - - if self._batches_seen_since_last_saving >= self.save_freq: - self._batches_seen_since_last_saving = 0 - return True - return False + ``` + tensorboard --logdir=path_to_your_logs + ``` - def _save_model(self, epoch, batch, logs): - """Saves the model. + You can find more information about TensorBoard + [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard). Args: - epoch: the epoch this iteration is in. - batch: the batch this iteration is in. `None` if the `save_freq` - is set to `epoch`. - logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`. - """ - logs = logs or {} - - if isinstance(self.save_freq, - int) or self.epochs_since_last_save >= self.period: - # Block only when saving interval is reached. - logs = tf_utils.sync_to_numpy_or_python_type(logs) - self.epochs_since_last_save = 0 - filepath = self._get_file_path(epoch, batch, logs) - - try: - if self.save_best_only: - current = logs.get(self.monitor) - if current is None: - logging.warning('Can save best model only with %s available, ' - 'skipping.', self.monitor) - else: - if self.monitor_op(current, self.best): - if self.verbose > 0: - io_utils.print_msg( - f'\nEpoch {epoch + 1}: {self.monitor} improved ' - f'from {self.best:.5f} to {current:.5f}, ' - f'saving model to {filepath}') - self.best = current - if self.save_weights_only: - self.model.save_weights( - filepath, overwrite=True, options=self._options) - else: - self.model.save(filepath, overwrite=True, options=self._options) - else: - if self.verbose > 0: - io_utils.print_msg( - f'\nEpoch {epoch + 1}: ' - f'{self.monitor} did not improve from {self.best:.5f}') - else: - if self.verbose > 0: - io_utils.print_msg( - f'\nEpoch {epoch + 1}: saving model to {filepath}') - if self.save_weights_only: - self.model.save_weights( - filepath, overwrite=True, options=self._options) - else: - self.model.save(filepath, overwrite=True, options=self._options) - - self._maybe_remove_file() - except IsADirectoryError as e: # h5py 3.x - raise IOError('Please specify a non-directory filepath for ' - 'ModelCheckpoint. Filepath used is an existing ' - f'directory: {filepath}') - except IOError as e: # h5py 2.x - # `e.errno` appears to be `None` so checking the content of `e.args[0]`. - if 'is a directory' in str(e.args[0]).lower(): - raise IOError('Please specify a non-directory filepath for ' - 'ModelCheckpoint. Filepath used is an existing ' - f'directory: f{filepath}') - # Re-throw the error for any other causes. - raise e - - def _get_file_path(self, epoch, batch, logs): - """Returns the file path for checkpoint.""" - # pylint: disable=protected-access - try: - # `filepath` may contain placeholders such as `{epoch:02d}`,`{batch:02d}` - # and `{mape:.2f}`. A mismatch between logged metrics and the path's - # placeholders can cause formatting to fail. - if batch is None or 'batch' in logs: - file_path = self.filepath.format(epoch=epoch + 1, **logs) - else: - file_path = self.filepath.format( - epoch=epoch + 1, batch=batch + 1, **logs) - except KeyError as e: - raise KeyError( - f'Failed to format this callback filepath: "{self.filepath}". 
' - f'Reason: {e}') - self._write_filepath = distributed_file_utils.write_filepath( - file_path, self.model.distribute_strategy) - return self._write_filepath - - def _maybe_remove_file(self): - # Remove the checkpoint directory in multi-worker training where this worker - # should not checkpoint. It is a dummy directory previously saved for sync - # distributed training. - distributed_file_utils.remove_temp_dir_with_filepath( - self._write_filepath, self.model.distribute_strategy) - - def _checkpoint_exists(self, filepath): - """Returns whether the checkpoint `filepath` refers to exists.""" - if filepath.endswith('.h5'): - return tf.io.gfile.exists(filepath) - tf_saved_model_exists = tf.io.gfile.exists(filepath) - tf_weights_only_checkpoint_exists = tf.io.gfile.exists( - filepath + '.index') - return tf_saved_model_exists or tf_weights_only_checkpoint_exists - - def _get_most_recently_modified_file_matching_pattern(self, pattern): - """Returns the most recently modified filepath matching pattern. - - Pattern may contain python formatting placeholder. If - `tf.train.latest_checkpoint()` does not return None, use that; otherwise, - check for most recently modified one that matches the pattern. - - In the rare case where there are more than one pattern-matching file having - the same modified time that is most recent among all, return the filepath - that is largest (by `>` operator, lexicographically using the numeric - equivalents). This provides a tie-breaker when multiple files are most - recent. Note that a larger `filepath` can sometimes indicate a later time of - modification (for instance, when epoch/batch is used as formatting option), - but not necessarily (when accuracy or loss is used). The tie-breaker is - put in the logic as best effort to return the most recent, and to avoid - undeterministic result. - - Modified time of a file is obtained with `os.path.getmtime()`. - - This utility function is best demonstrated via an example: + log_dir: the path of the directory where to save the log files to be + parsed by TensorBoard. e.g. log_dir = os.path.join(working_dir, + 'logs') This directory should not be reused by any other callbacks. + histogram_freq: frequency (in epochs) at which to compute + weight histograms for the layers of the model. If set to 0, histograms + won't be computed. Validation data (or split) must be specified for + histogram visualizations. + write_graph: whether to visualize the graph in TensorBoard. The log file + can become quite large when write_graph is set to True. + write_images: whether to write model weights to visualize as image in + TensorBoard. + write_steps_per_second: whether to log the training steps per second + into TensorBoard. This supports both epoch and batch frequency + logging. + update_freq: `'batch'` or `'epoch'` or integer. When using `'epoch'`, + writes the losses and metrics to TensorBoard after every epoch. + If using an integer, let's say `1000`, all metrics and losses + (including custom ones added by `Model.compile`) will be logged to + TensorBoard every 1000 batches. `'batch'` is a synonym for `1`, + meaning that they will be written every batch. + Note however that writing too frequently to TensorBoard can slow down + your training, especially when used with `tf.distribute.Strategy` as + it will incur additional synchronization overhead. + Use with `ParameterServerStrategy` is not supported. + Batch-level summary writing is also available via `train_step` + override. 
Please see + [TensorBoard Scalars tutorial](https://www.tensorflow.org/tensorboard/scalars_and_keras#batch-level_logging) # noqa: E501 + for more details. + profile_batch: Profile the batch(es) to sample compute characteristics. + profile_batch must be a non-negative integer or a tuple of integers. + A pair of positive integers signify a range of batches to profile. + By default, profiling is disabled. + embeddings_freq: frequency (in epochs) at which embedding layers will be + visualized. If set to 0, embeddings won't be visualized. + embeddings_metadata: Dictionary which maps embedding layer names to the + filename of a file in which to save metadata for the embedding layer. + In case the same metadata file is to be + used for all embedding layers, a single filename can be passed. + + Examples: + + Basic usage: ```python - file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5' - test_dir = self.get_temp_dir() - path_pattern = os.path.join(test_dir, file_pattern) - file_paths = [ - os.path.join(test_dir, file_name) for file_name in - ['f.batch03epoch02.h5', 'f.batch02epoch02.h5', 'f.batch01epoch01.h5'] - ] - for file_path in file_paths: - # Write something to each of the files - self.assertEqual( - _get_most_recently_modified_file_matching_pattern(path_pattern), - file_paths[-1]) + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs") + model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) + # Then run the tensorboard command to view the visualizations. ``` - Args: - pattern: The file pattern that may optionally contain python placeholder - such as `{epoch:02d}`. - - Returns: - The most recently modified file's full filepath matching `pattern`. If - `pattern` does not contain any placeholder, this returns the filepath - that - exactly matches `pattern`. Returns `None` if no match is found. - """ - dir_name = os.path.dirname(pattern) - base_name = os.path.basename(pattern) - base_name_regex = '^' + re.sub(r'{.*}', r'.*', base_name) + '$' - - # If tf.train.latest_checkpoint tells us there exists a latest checkpoint, - # use that as it is more robust than `os.path.getmtime()`. - latest_tf_checkpoint = tf.train.latest_checkpoint(dir_name) - if latest_tf_checkpoint is not None and re.match( - base_name_regex, os.path.basename(latest_tf_checkpoint)): - return latest_tf_checkpoint - - latest_mod_time = 0 - file_path_with_latest_mod_time = None - n_file_with_latest_mod_time = 0 - file_path_with_largest_file_name = None - - if tf.io.gfile.exists(dir_name): - for file_name in os.listdir(dir_name): - # Only consider if `file_name` matches the pattern. - if re.match(base_name_regex, file_name): - file_path = os.path.join(dir_name, file_name) - mod_time = os.path.getmtime(file_path) - if (file_path_with_largest_file_name is None or - file_path > file_path_with_largest_file_name): - file_path_with_largest_file_name = file_path - if mod_time > latest_mod_time: - latest_mod_time = mod_time - file_path_with_latest_mod_time = file_path - # In the case a file with later modified time is found, reset - # the counter for the number of files with latest modified time. - n_file_with_latest_mod_time = 1 - elif mod_time == latest_mod_time: - # In the case a file has modified time tied with the most recent, - # increment the counter for the number of files with latest modified - # time by 1. - n_file_with_latest_mod_time += 1 - - if n_file_with_latest_mod_time == 1: - # Return the sole file that has most recent modified time. 
- return file_path_with_latest_mod_time - else: - # If there are more than one file having latest modified time, return - # the file path with the largest file name. - return file_path_with_largest_file_name - - -@keras_export('keras.callbacks.BackupAndRestore', v1=[]) -class BackupAndRestore(Callback): - """Callback to back up and restore the training state. - - `BackupAndRestore` callback is intended to recover training from an - interruption that has happened in the middle of a `Model.fit` execution, by - backing up the training states in a temporary checkpoint file (with the help - of a `tf.train.CheckpointManager`), at the end of each epoch. Each backup - overwrites the previously written checkpoint file, so at any given time there - is at most one such checkpoint file for backup/restoring purpose. - - If training restarts before completion, the training state (which includes the - `Model` weights and epoch number) is restored to the most recently saved state - at the beginning of a new `Model.fit` run. At the completion of a `Model.fit` - run, the temporary checkpoint file is deleted. - - Note that the user is responsible to bring jobs back after the interruption. - This callback is important for the backup and restore mechanism for fault - tolerance purpose, and the model to be restored from an previous checkpoint is - expected to be the same as the one used to back up. If user changes arguments - passed to compile or fit, the checkpoint saved for fault tolerance can become - invalid. - - Note: - - 1. This callback is not compatible with eager execution disabled. - 2. A checkpoint is saved at the end of each epoch. After restoring, - `Model.fit` redoes any partial work during the unfinished epoch in which the - training got restarted (so the work done before the interruption doesn't - affect the final model state). - 3. This works for both single worker and multi-worker modes. When `Model.fit` - is used with `tf.distribute`, it supports `tf.distribute.MirroredStrategy`, - `tf.distribute.MultiWorkerMirroredStrategy`, `tf.distribute.TPUStrategy`, and - `tf.distribute.experimental.ParameterServerStrategy`. - - Example: - - >>> class InterruptingCallback(tf.keras.callbacks.Callback): - ... def on_epoch_begin(self, epoch, logs=None): - ... if epoch == 4: - ... raise RuntimeError('Interrupting!') - >>> callback = tf.keras.callbacks.BackupAndRestore(backup_dir="/tmp/backup") - >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) - >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') - >>> try: - ... model.fit(np.arange(100).reshape(5, 20), np.zeros(5), epochs=10, - ... batch_size=1, callbacks=[callback, InterruptingCallback()], - ... verbose=0) - ... except: - ... pass - >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), epochs=10, - ... batch_size=1, callbacks=[callback], verbose=0) - >>> # Only 6 more epochs are run, since first trainning got interrupted at - >>> # zero-indexed epoch 4, second training will continue from 4 to 9. - >>> len(history.history['loss']) - 6 - - Args: - backup_dir: String, path to store the checkpoint. - e.g. backup_dir = os.path.join(working_dir, 'backup') - This is the directory in which the system stores temporary files to - recover the model from jobs terminated unexpectedly. The directory - cannot be reused elsewhere to store other files, e.g. by - BackupAndRestore callback of another training, or by another callback - (ModelCheckpoint) of the same training. 
- """ - - def __init__(self, backup_dir): - super().__init__() - self.backup_dir = backup_dir - self._supports_tf_logs = True - self._supported_strategies = ( - tf.distribute.MirroredStrategy, - tf.distribute.MultiWorkerMirroredStrategy, - tf.distribute.experimental.TPUStrategy, tf.distribute.TPUStrategy, - tf.distribute.experimental.ParameterServerStrategy) - - if not tf.executing_eagerly(): - if tf.inside_function(): - raise ValueError('This Callback\'s method contains Python state and ' - 'should be called outside of `tf.function`s.') - else: # Legacy graph mode: - raise ValueError( - 'BackupAndRestore only supports eager mode. In graph ' - 'mode, consider using ModelCheckpoint to manually save ' - 'and restore weights with `model.load_weights()` and by ' - 'providing `initial_epoch` in `model.fit()` for fault tolerance.') - - # Only the chief worker writes model checkpoints, but all workers - # restore checkpoint at on_train_begin(). - self._chief_worker_only = False - - def on_train_begin(self, logs=None): - # TrainingState is used to manage the training state needed for - # failure-recovery of a worker in training. - # pylint: disable=protected-access - - if self.model._distribution_strategy and not isinstance( - self.model.distribute_strategy, self._supported_strategies): - raise NotImplementedError( - f'{type(self.model.distribute_strategy)} is not supported yet. ' - 'Currently BackupAndRestore callback only supports empty strategy, ' - 'MirroredStrategy, MultiWorkerMirroredStrategy and TPUStrategy.') - self.model._training_state = ( - worker_training_state.WorkerTrainingState(self.model, self.backup_dir)) - self._training_state = self.model._training_state - self._training_state.restore() - - def on_train_end(self, logs=None): - # pylint: disable=protected-access - # On exit of training, delete the training state backup file that was saved - # for the purpose of worker recovery. - self._training_state.delete_backup() - - # Clean up the training state. - del self._training_state - del self.model._training_state - - def on_epoch_end(self, epoch, logs=None): - # Back up the model and current epoch for possible future recovery. - self._training_state.back_up(epoch) - - -@keras_export('keras.callbacks.experimental.BackupAndRestore', v1=[]) -@deprecation.deprecated_endpoints( - 'keras.callbacks.experimental.BackupAndRestore') -class BackupAndRestoreExperimental(BackupAndRestore): - """Deprecated. Please use `tf.keras.callbacks.BackupAndRestore` instead. - - Caution: `tf.keras.callbacks.experimental.BackupAndRestore` endpoint is - deprecated and will be removed in a future release. Please use - `tf.keras.callbacks.BackupAndRestore`. - """ - - def __init__(self, *args, **kwargs): - logging.warning( - '`tf.keras.callbacks.experimental.BackupAndRestore` endpoint is ' - 'deprecated and will be removed in a future release. Please use ' - '`tf.keras.callbacks.BackupAndRestore`.') - super().__init__(*args, **kwargs) - - -@keras_export('keras.callbacks.EarlyStopping') -class EarlyStopping(Callback): - """Stop training when a monitored metric has stopped improving. - - Assuming the goal of a training is to minimize the loss. With this, the - metric to be monitored would be `'loss'`, and mode would be `'min'`. A - `model.fit()` training loop will check at end of every epoch whether - the loss is no longer decreasing, considering the `min_delta` and - `patience` if applicable. Once it's found no longer decreasing, - `model.stop_training` is marked True and the training terminates. 
- - The quantity to be monitored needs to be available in `logs` dict. - To make it so, pass the loss or metrics at `model.compile()`. - - Args: - monitor: Quantity to be monitored. - min_delta: Minimum change in the monitored quantity - to qualify as an improvement, i.e. an absolute - change of less than min_delta, will count as no - improvement. - patience: Number of epochs with no improvement - after which training will be stopped. - verbose: Verbosity mode, 0 or 1. Mode 0 is silent, and mode 1 - displays messages when the callback takes an action. - mode: One of `{"auto", "min", "max"}`. In `min` mode, - training will stop when the quantity - monitored has stopped decreasing; in `"max"` - mode it will stop when the quantity - monitored has stopped increasing; in `"auto"` - mode, the direction is automatically inferred - from the name of the monitored quantity. - baseline: Baseline value for the monitored quantity. - Training will stop if the model doesn't show improvement over the - baseline. - restore_best_weights: Whether to restore model weights from - the epoch with the best value of the monitored quantity. - If False, the model weights obtained at the last step of - training are used. An epoch will be restored regardless - of the performance relative to the `baseline`. If no epoch - improves on `baseline`, training will run for `patience` - epochs and restore weights from the best epoch in that set. - - Example: - - >>> callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3) - >>> # This callback will stop the training when there is no improvement in - >>> # the loss for three consecutive epochs. - >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) - >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') - >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), - ... epochs=10, batch_size=1, callbacks=[callback], - ... verbose=0) - >>> len(history.history['loss']) # Only 4 epochs are run. - 4 - """ - - def __init__(self, - monitor='val_loss', - min_delta=0, - patience=0, - verbose=0, - mode='auto', - baseline=None, - restore_best_weights=False): - super().__init__() - - self.monitor = monitor - self.patience = patience - self.verbose = verbose - self.baseline = baseline - self.min_delta = abs(min_delta) - self.wait = 0 - self.stopped_epoch = 0 - self.restore_best_weights = restore_best_weights - self.best_weights = None - - if mode not in ['auto', 'min', 'max']: - logging.warning('EarlyStopping mode %s is unknown, ' - 'fallback to auto mode.', mode) - mode = 'auto' - - if mode == 'min': - self.monitor_op = np.less - elif mode == 'max': - self.monitor_op = np.greater - else: - if (self.monitor.endswith('acc') or self.monitor.endswith('accuracy') or - self.monitor.endswith('auc')): - self.monitor_op = np.greater - else: - self.monitor_op = np.less - - if self.monitor_op == np.greater: - self.min_delta *= 1 - else: - self.min_delta *= -1 - - def on_train_begin(self, logs=None): - # Allow instances to be re-used - self.wait = 0 - self.stopped_epoch = 0 - self.best = np.Inf if self.monitor_op == np.less else -np.Inf - self.best_weights = None - self.best_epoch = 0 - - def on_epoch_end(self, epoch, logs=None): - current = self.get_monitor_value(logs) - if current is None: - return - if self.restore_best_weights and self.best_weights is None: - # Restore the weights after first epoch if no progress is ever made. 
- self.best_weights = self.model.get_weights() - - self.wait += 1 - if self._is_improvement(current, self.best): - self.best = current - self.best_epoch = epoch - if self.restore_best_weights: - self.best_weights = self.model.get_weights() - # Only restart wait if we beat both the baseline and our previous best. - if self.baseline is None or self._is_improvement(current, self.baseline): - self.wait = 0 + Custom batch-level summaries in a subclassed Model: - # Only check after the first epoch. - if self.wait >= self.patience and epoch > 0: - self.stopped_epoch = epoch - self.model.stop_training = True - if self.restore_best_weights and self.best_weights is not None: - if self.verbose > 0: - io_utils.print_msg( - 'Restoring model weights from the end of the best epoch: ' - f'{self.best_epoch + 1}.') - self.model.set_weights(self.best_weights) - - def on_train_end(self, logs=None): - if self.stopped_epoch > 0 and self.verbose > 0: - io_utils.print_msg( - f'Epoch {self.stopped_epoch + 1}: early stopping') - - def get_monitor_value(self, logs): - logs = logs or {} - monitor_value = logs.get(self.monitor) - if monitor_value is None: - logging.warning('Early stopping conditioned on metric `%s` ' - 'which is not available. Available metrics are: %s', - self.monitor, ','.join(list(logs.keys()))) - return monitor_value - - def _is_improvement(self, monitor_value, reference_value): - return self.monitor_op(monitor_value - self.min_delta, reference_value) - - -@keras_export('keras.callbacks.RemoteMonitor') -class RemoteMonitor(Callback): - """Callback used to stream events to a server. - - Requires the `requests` library. - Events are sent to `root + '/publish/epoch/end/'` by default. Calls are - HTTP POST, with a `data` argument which is a - JSON-encoded dictionary of event data. - If `send_as_json=True`, the content type of the request will be - `"application/json"`. - Otherwise the serialized JSON will be sent within a form. - - Args: - root: String; root url of the target server. - path: String; path relative to `root` to which the events will be sent. - field: String; JSON field under which the data will be stored. - The field is used only if the payload is sent within a form - (i.e. send_as_json is set to False). - headers: Dictionary; optional custom HTTP headers. - send_as_json: Boolean; whether the request should be - sent as `"application/json"`. 
- """ - - def __init__(self, - root='http://localhost:9000', - path='/publish/epoch/end/', - field='data', - headers=None, - send_as_json=False): - super().__init__() - - self.root = root - self.path = path - self.field = field - self.headers = headers - self.send_as_json = send_as_json - - def on_epoch_end(self, epoch, logs=None): - if requests is None: - raise ImportError('RemoteMonitor requires the `requests` library.') - logs = logs or {} - send = {} - send['epoch'] = epoch - for k, v in logs.items(): - # np.ndarray and np.generic are not scalar types - # therefore we must unwrap their scalar values and - # pass to the json-serializable dict 'send' - if isinstance(v, (np.ndarray, np.generic)): - send[k] = v.item() - else: - send[k] = v - try: - if self.send_as_json: - requests.post(self.root + self.path, json=send, headers=self.headers) - else: - requests.post( - self.root + self.path, {self.field: json.dumps(send)}, - headers=self.headers) - except requests.exceptions.RequestException: - logging.warning('Warning: could not reach RemoteMonitor ' - 'root server at ' + str(self.root)) - - -@keras_export('keras.callbacks.LearningRateScheduler') -class LearningRateScheduler(Callback): - """Learning rate scheduler. - - At the beginning of every epoch, this callback gets the updated learning rate - value from `schedule` function provided at `__init__`, with the current epoch - and current learning rate, and applies the updated learning rate - on the optimizer. - - Args: - schedule: a function that takes an epoch index (integer, indexed from 0) - and current learning rate (float) as inputs and returns a new - learning rate as output (float). - verbose: int. 0: quiet, 1: update messages. - - Example: - - >>> # This function keeps the initial learning rate for the first ten epochs - >>> # and decreases it exponentially after that. - >>> def scheduler(epoch, lr): - ... if epoch < 10: - ... return lr - ... else: - ... return lr * tf.math.exp(-0.1) - >>> - >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) - >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') - >>> round(model.optimizer.lr.numpy(), 5) - 0.01 - - >>> callback = tf.keras.callbacks.LearningRateScheduler(scheduler) - >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), - ... epochs=15, callbacks=[callback], verbose=0) - >>> round(model.optimizer.lr.numpy(), 5) - 0.00607 - - """ - - def __init__(self, schedule, verbose=0): - super().__init__() - self.schedule = schedule - self.verbose = verbose - - def on_epoch_begin(self, epoch, logs=None): - if not hasattr(self.model.optimizer, 'lr'): - raise ValueError('Optimizer must have a "lr" attribute.') - try: # new API - lr = float(backend.get_value(self.model.optimizer.lr)) - lr = self.schedule(epoch, lr) - except TypeError: # Support for old API for backward compatibility - lr = self.schedule(epoch) - if not isinstance(lr, (tf.Tensor, float, np.float32, np.float64)): - raise ValueError('The output of the "schedule" function ' - f'should be float. Got: {lr}') - if isinstance(lr, tf.Tensor) and not lr.dtype.is_floating: - raise ValueError( - f'The dtype of `lr` Tensor should be float. 
Got: {lr.dtype}') - backend.set_value(self.model.optimizer.lr, backend.get_value(lr)) - if self.verbose > 0: - io_utils.print_msg( - f'\nEpoch {epoch + 1}: LearningRateScheduler setting learning ' - f'rate to {lr}.') - - def on_epoch_end(self, epoch, logs=None): - logs = logs or {} - logs['lr'] = backend.get_value(self.model.optimizer.lr) + ```python + class MyModel(tf.keras.Model): + def build(self, _): + self.dense = tf.keras.layers.Dense(10) -def keras_model_summary(name, data, step=None): - """Writes a Keras model as JSON to as a Summary. - - Writing the Keras model configuration allows the TensorBoard graph plugin to - render a conceptual graph, as opposed to graph of ops. In case the model fails - to serialize as JSON, it ignores and returns False. - - Args: - name: A name for this summary. The summary tag used for TensorBoard will be - this name prefixed by any active name scopes. - data: A Keras Model to write. - step: Explicit `int64`-castable monotonic step value for this summary. If - omitted, this defaults to `tf.summary.experimental.get_step()`, which must - not be None. - - Returns: - True on success, or False if no summary was written because no default - summary writer was available. - - Raises: - ValueError: if a default writer exists, but no step was provided and - `tf.summary.experimental.get_step()` is None. - """ - summary_metadata = tf.compat.v1.SummaryMetadata() - # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for - # the rationale. - summary_metadata.plugin_data.plugin_name = 'graph_keras_model' - # version number = 1 - summary_metadata.plugin_data.content = b'1' - - try: - json_string = data.to_json() - except Exception as exc: # pylint: disable=broad-except - # An exception should not break a model code. - logging.warning('Model failed to serialize as JSON. Ignoring... %s', exc) - return False - - with tf.summary.experimental.summary_scope( - name, 'graph_keras_model', [data, step]) as (tag, _): - with tf.device('cpu:0'): - tensor = tf.constant(json_string, dtype=tf.string) - return tf.summary.write( - tag=tag, tensor=tensor, step=step, metadata=summary_metadata) - - -@keras_export('keras.callbacks.TensorBoard', v1=[]) -class TensorBoard(Callback, version_utils.TensorBoardVersionSelector): - # pylint: disable=line-too-long - """Enable visualizations for TensorBoard. - - TensorBoard is a visualization tool provided with TensorFlow. - - This callback logs events for TensorBoard, including: - - * Metrics summary plots - * Training graph visualization - * Weight histograms - * Sampled profiling - - When used in `Model.evaluate`, in addition to epoch summaries, there will be - a summary that records evaluation metrics vs `Model.optimizer.iterations` - written. The metric names will be prepended with `evaluation`, with - `Model.optimizer.iterations` being the step in the visualized TensorBoard. - - If you have installed TensorFlow with pip, you should be able - to launch TensorBoard from the command line: - - ``` - tensorboard --logdir=path_to_your_logs - ``` - - You can find more information about TensorBoard - [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard). - - Args: - log_dir: the path of the directory where to save the log files to be - parsed by TensorBoard. e.g. log_dir = os.path.join(working_dir, 'logs') - This directory should not be reused by any other callbacks. - histogram_freq: frequency (in epochs) at which to compute - weight histograms for the layers of the model. 
If set to 0, histograms - won't be computed. Validation data (or split) must be specified for - histogram visualizations. - write_graph: whether to visualize the graph in TensorBoard. The log file - can become quite large when write_graph is set to True. - write_images: whether to write model weights to visualize as image in - TensorBoard. - write_steps_per_second: whether to log the training steps per second into - Tensorboard. This supports both epoch and batch frequency logging. - update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`, - writes the losses and metrics to TensorBoard after each batch. The same - applies for `'epoch'`. If using an integer, let's say `1000`, the - callback will write the metrics and losses to TensorBoard every 1000 - batches. Note that writing too frequently to TensorBoard can slow down - your training. - profile_batch: Profile the batch(es) to sample compute characteristics. - profile_batch must be a non-negative integer or a tuple of integers. - A pair of positive integers signify a range of batches to profile. - By default, profiling is disabled. - embeddings_freq: frequency (in epochs) at which embedding layers will be - visualized. If set to 0, embeddings won't be visualized. - embeddings_metadata: Dictionary which maps embedding layer names to the - filename of a file in which to save metadata for the embedding layer. - In case the same metadata file is to be - used for all embedding layers, a single filename can be passed. - - Examples: - - Basic usage: - - ```python - tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs") - model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) - # Then run the tensorboard command to view the visualizations. - ``` - - Custom batch-level summaries in a subclassed Model: - - ```python - class MyModel(tf.keras.Model): - - def build(self, _): - self.dense = tf.keras.layers.Dense(10) - - def call(self, x): - outputs = self.dense(x) - tf.summary.histogram('outputs', outputs) - return outputs - - model = MyModel() - model.compile('sgd', 'mse') - - # Make sure to set `update_freq=N` to log a batch-level summary every N batches. - # In addition to any `tf.summary` contained in `Model.call`, metrics added in - # `Model.compile` will be logged every N batches. - tb_callback = tf.keras.callbacks.TensorBoard('./logs', update_freq=1) - model.fit(x_train, y_train, callbacks=[tb_callback]) - ``` - - Custom batch-level summaries in a Functional API Model: - - ```python - def my_summary(x): - tf.summary.histogram('x', x) - return x - - inputs = tf.keras.Input(10) - x = tf.keras.layers.Dense(10)(inputs) - outputs = tf.keras.layers.Lambda(my_summary)(x) - model = tf.keras.Model(inputs, outputs) - model.compile('sgd', 'mse') - - # Make sure to set `update_freq=N` to log a batch-level summary every N batches. - # In addition to any `tf.summary` contained in `Model.call`, metrics added in - # `Model.compile` will be logged every N batches. - tb_callback = tf.keras.callbacks.TensorBoard('./logs', update_freq=1) - model.fit(x_train, y_train, callbacks=[tb_callback]) - ``` - - Profiling: - - ```python - # Profile a single batch, e.g. the 5th batch. - tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir='./logs', profile_batch=5) - model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) - - # Profile a range of batches, e.g. from 10 to 20. 
- tensorboard_callback = tf.keras.callbacks.TensorBoard( - log_dir='./logs', profile_batch=(10,20)) - model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) - ``` - """ - - # pylint: enable=line-too-long - - def __init__(self, - log_dir='logs', - histogram_freq=0, - write_graph=True, - write_images=False, - write_steps_per_second=False, - update_freq='epoch', - profile_batch=0, - embeddings_freq=0, - embeddings_metadata=None, - **kwargs): - super().__init__() - self._supports_tf_logs = True - self._validate_kwargs(kwargs) - - self.log_dir = io_utils.path_to_string(log_dir) - self.histogram_freq = histogram_freq - self.write_graph = write_graph - self.write_images = write_images - self.write_steps_per_second = write_steps_per_second - self.update_freq = 1 if update_freq == 'batch' else update_freq - self.embeddings_freq = embeddings_freq - self.embeddings_metadata = embeddings_metadata - self._init_profile_batch(profile_batch) - self._global_train_batch = 0 - self._previous_epoch_iterations = 0 - self._train_accumulated_time = 0 - self._batch_start_time = 0 - - # Lazily initialized in order to avoid creating event files when - # not needed. - self._writers = {} - - # Used to restore any existing `SummaryWriter` after training ends. - self._prev_summary_state = [] - - def _validate_kwargs(self, kwargs): - """Handle arguments were supported in V1.""" - if kwargs.get('write_grads', False): - logging.warning('`write_grads` will be ignored in TensorFlow 2.0 ' - 'for the `TensorBoard` Callback.') - if kwargs.get('batch_size', False): - logging.warning('`batch_size` is no longer needed in the ' - '`TensorBoard` Callback and will be ignored ' - 'in TensorFlow 2.0.') - if kwargs.get('embeddings_layer_names', False): - logging.warning('`embeddings_layer_names` is not supported in ' - 'TensorFlow 2.0. Instead, all `Embedding` layers ' - 'will be visualized.') - if kwargs.get('embeddings_data', False): - logging.warning('`embeddings_data` is not supported in TensorFlow ' - '2.0. Instead, all `Embedding` variables will be ' - 'visualized.') - - supported_kwargs = {'write_grads', 'embeddings_layer_names', - 'embeddings_data', 'batch_size'} - unrecognized_kwargs = set(kwargs.keys()) - supported_kwargs - - # Only allow kwargs that were supported in V1. - if unrecognized_kwargs: - raise ValueError( - 'Unrecognized arguments in `TensorBoard` Callback: ' - f'{unrecognized_kwargs}. Supported kwargs are: {supported_kwargs}') - - def set_model(self, model): - """Sets Keras model and writes graph if specified.""" - self.model = model - self._log_write_dir = self._get_log_write_dir() - - self._train_dir = os.path.join(self._log_write_dir, 'train') - self._train_step = self.model._train_counter # pylint: disable=protected-access - - self._val_dir = os.path.join(self._log_write_dir, 'validation') - self._val_step = self.model._test_counter # pylint: disable=protected-access - - self._writers = {} # Resets writers. 
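The kwargs handling in `_validate_kwargs` above is easiest to see from the caller's side. A minimal sketch of the observable behavior, assuming the TF2 `tf.keras.callbacks.TensorBoard` shown in this diff (`./logs` is a placeholder path, and `wirte_grads` is a deliberately misspelled example argument):

```python
import tensorflow as tf

# A V1-only argument such as `write_grads` is accepted but ignored;
# the callback merely logs a warning.
tb = tf.keras.callbacks.TensorBoard(log_dir="./logs", write_grads=True)

# Anything outside the four legacy kwargs is rejected outright.
try:
    tf.keras.callbacks.TensorBoard(log_dir="./logs", wirte_grads=True)
except ValueError as e:
    print(e)  # "Unrecognized arguments in `TensorBoard` Callback: ..."
```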
- - self._should_write_train_graph = False - if self.write_graph: - self._write_keras_model_summary() - self._should_write_train_graph = True - if self.embeddings_freq: - self._configure_embeddings() - - @property - def _train_writer(self): - if 'train' not in self._writers: - self._writers['train'] = tf.summary.create_file_writer( - self._train_dir) - return self._writers['train'] - - @property - def _val_writer(self): - if 'val' not in self._writers: - self._writers['val'] = tf.summary.create_file_writer(self._val_dir) - return self._writers['val'] - - def _get_log_write_dir(self): - """For multi-worker, only chief should write, others write to '/tmp'.""" - return distributed_file_utils.write_dirpath(self.log_dir, - self.model.distribute_strategy) - - def _delete_tmp_write_dir(self): - """Deletes tmp write directories for multi-worker.""" - distributed_file_utils.remove_temp_dirpath(self.log_dir, - self.model.distribute_strategy) - - def _write_keras_model_train_graph(self): - """Writes Keras model train_function graph to TensorBoard.""" - with self._train_writer.as_default(): - with tf.summary.record_if(True): - train_fn = self.model.train_tf_function - # If the train_function is a `tf.function`, we can write out a graph - if hasattr(train_fn, 'function_spec'): - tf.summary.graph(train_fn._concrete_stateful_fn.graph) # pylint: disable=protected-access - - def _write_keras_model_summary(self): - """Writes Keras graph network summary to TensorBoard.""" - with self._train_writer.as_default(): - with tf.summary.record_if(True): - summary_writable = ( - self.model._is_graph_network or # pylint: disable=protected-access - self.model.__class__.__name__ == 'Sequential') # pylint: disable=protected-access - if summary_writable: - keras_model_summary('keras', self.model, step=0) - - def _configure_embeddings(self): - """Configure the Projector for embeddings.""" - # TODO(omalleyt): Add integration tests. - from google.protobuf import text_format - from keras.layers import core - from keras.protobuf import projector_config_pb2 - - config = projector_config_pb2.ProjectorConfig() - for layer in self.model.layers: - if isinstance(layer, core.Embedding): - embedding = config.embeddings.add() - # Embeddings are always the first layer, so this naming should be - # consistent in any keras models checkpoints. - name = 'layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE' - embedding.tensor_name = name - - if self.embeddings_metadata is not None: - if isinstance(self.embeddings_metadata, str): - embedding.metadata_path = self.embeddings_metadata - else: - if layer.name in self.embeddings_metadata.keys(): - embedding.metadata_path = self.embeddings_metadata.pop(layer.name) - - if self.embeddings_metadata and not isinstance(self.embeddings_metadata, - str): - raise ValueError('Unrecognized `Embedding` layer names passed to ' - '`keras.callbacks.TensorBoard` `embeddings_metadata` ' - f'argument: {self.embeddings_metadata.keys()}') - - config_pbtxt = text_format.MessageToString(config) - path = os.path.join(self._log_write_dir, 'projector_config.pbtxt') - with tf.io.gfile.GFile(path, 'w') as f: - f.write(config_pbtxt) - - def _push_writer(self, writer, step): - """Sets the default writer for custom batch-level summaries.""" - if self.update_freq == 'epoch': - return - - should_record = lambda: tf.equal(step % self.update_freq, 0) - # TODO(b/151339474): Fix deadlock when not using .value() here. 
- summary_context = (writer.as_default(step.value()), - tf.summary.record_if(should_record)) - self._prev_summary_state.append(summary_context) - summary_context[0].__enter__() - summary_context[1].__enter__() - - def _pop_writer(self): - """Pops the current writer.""" - if self.update_freq == 'epoch': - return - - # See _push_writer for the content of the previous_context, which is pair - # of context. - previous_context = self._prev_summary_state.pop() - previous_context[1].__exit__(*sys.exc_info()) - previous_context[0].__exit__(*sys.exc_info()) - - def _close_writers(self): - for writer in self._writers.values(): - writer.close() - - def _init_profile_batch(self, profile_batch): - """Validate profile_batch value and set the range of batches to profile. - - Sets values of _start_batch and _stop_batch attributes, - specifying the start and stop batch to profile. - Setting `profile_batch=0` disables profiling. + def call(self, x): + outputs = self.dense(x) + tf.summary.histogram('outputs', outputs) + return outputs - Args: - profile_batch: The range of batches to profile. Should be a non-negative - integer or a comma separated string of pair of positive integers. A pair - of positive integers signify a range of batches to profile. + model = MyModel() + model.compile('sgd', 'mse') - Raises: - ValueError: If profile_batch is not an integer or a comma separated pair - of positive integers. + # Make sure to set `update_freq=N` to log a batch-level summary every N + # batches. In addition to any `tf.summary` contained in `Model.call`, + # metrics added in `Model.compile` will be logged every N batches. + tb_callback = tf.keras.callbacks.TensorBoard('./logs', update_freq=1) + model.fit(x_train, y_train, callbacks=[tb_callback]) + ``` - """ - profile_batch_error_message = ( - 'profile_batch must be a non-negative integer or 2-tuple of positive ' - 'integers. A pair of positive integers signifies a range of batches ' - f'to profile. Found: {profile_batch}') - - # Support legacy way of specifying "start,stop" or "start" as str. - if isinstance(profile_batch, str): - profile_batch = str(profile_batch).split(',') - profile_batch = tf.nest.map_structure(int, profile_batch) - - if isinstance(profile_batch, int): - self._start_batch = profile_batch - self._stop_batch = profile_batch - elif isinstance(profile_batch, (tuple, list)) and len(profile_batch) == 2: - self._start_batch, self._stop_batch = profile_batch - else: - raise ValueError(profile_batch_error_message) - - if self._start_batch < 0 or self._stop_batch < self._start_batch: - raise ValueError(profile_batch_error_message) - - # True when the profiler was successfully started by this callback. - # We track the status here to make sure callbacks do not interfere with - # each other. The callback will only stop the profiler it started. - self._profiler_started = False - if self._start_batch > 0: - # Warm up and improve the profiling accuracy. - self._start_profiler(logdir='') - self._stop_profiler(save=False) - # True when a trace is running. - self._is_tracing = False - - # Setting `profile_batch=0` disables profiling. 
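To make the `record_if` gating used by `_push_writer` concrete, here is a small self-contained sketch that uses only the public `tf.summary` API; `/tmp/tb_demo` and the variable names are illustrative placeholders, not part of this diff:

```python
import tensorflow as tf

writer = tf.summary.create_file_writer("/tmp/tb_demo")
step = tf.Variable(0, dtype=tf.int64)
update_freq = 2  # stand-in for `TensorBoard(update_freq=2)`

with writer.as_default():
    # The predicate is re-evaluated on every summary call, so only every
    # `update_freq`-th step is actually written; the rest become no-ops.
    # (`_push_writer` additionally passes the step to `as_default`.)
    with tf.summary.record_if(lambda: tf.equal(step % update_freq, 0)):
        for _ in range(4):
            tf.summary.scalar("demo/loss", 0.1, step=step)
            step.assign_add(1)
```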
- self._should_trace = not (self._start_batch == 0 and self._stop_batch == 0) - - def on_train_begin(self, logs=None): - self._global_train_batch = 0 - self._previous_epoch_iterations = 0 - self._push_writer(self._train_writer, self._train_step) - - def on_train_end(self, logs=None): - self._pop_writer() - - if self._is_tracing: - self._stop_trace() - - self._close_writers() - self._delete_tmp_write_dir() - - def on_test_begin(self, logs=None): - self._push_writer(self._val_writer, self._val_step) - - def on_test_end(self, logs=None): - if self.model.optimizer and hasattr(self.model.optimizer, 'iterations'): - with tf.summary.record_if(True), self._val_writer.as_default(): - for name, value in logs.items(): - tf.summary.scalar( - 'evaluation_' + name + '_vs_iterations', - value, - step=self.model.optimizer.iterations.read_value()) - self._pop_writer() - - def _implements_train_batch_hooks(self): - # Only call batch hooks when tracing or write_steps_per_second are enabled - return self._should_trace or self.write_steps_per_second - - def on_train_batch_begin(self, batch, logs=None): - self._global_train_batch += 1 - if self.write_steps_per_second: - self._batch_start_time = time.time() - if not self._should_trace: - return - - if self._global_train_batch == self._start_batch: - self._start_trace() - - def on_train_batch_end(self, batch, logs=None): - if self._should_write_train_graph: - self._write_keras_model_train_graph() - self._should_write_train_graph = False - if self.write_steps_per_second: - batch_run_time = time.time() - self._batch_start_time - tf.summary.scalar( - 'batch_steps_per_second', 1. / batch_run_time, step=self._train_step) - if not self._should_trace: - return - - if self._is_tracing and self._global_train_batch >= self._stop_batch: - self._stop_trace() - - def on_epoch_begin(self, epoch, logs=None): - # Keeps track of epoch for profiling. - if self.write_steps_per_second: - self._previous_epoch_iterations = self.model.optimizer.iterations.numpy() - self._epoch_start_time = time.time() - - def on_epoch_end(self, epoch, logs=None): - """Runs metrics and histogram summaries at epoch end.""" - self._log_epoch_metrics(epoch, logs) - - if self.histogram_freq and epoch % self.histogram_freq == 0: - self._log_weights(epoch) - - if self.embeddings_freq and epoch % self.embeddings_freq == 0: - self._log_embeddings(epoch) - - def _start_trace(self): - tf.summary.trace_on(graph=True, profiler=False) - self._start_profiler(logdir=self.log_dir) - self._is_tracing = True - - def _stop_trace(self, batch=None): - """Logs the trace graph to TensorBoard.""" - if batch is None: - batch = self._stop_batch - with self._train_writer.as_default(): - with tf.summary.record_if(True): - # TODO(b/126388999): Remove step info in the summary name. 
- tf.summary.trace_export(name='batch_%d' % batch, step=batch) - self._stop_profiler() - self._is_tracing = False - - def _collect_learning_rate(self, logs): - lr_schedule = getattr(self.model.optimizer, 'lr', None) - if isinstance(lr_schedule, learning_rate_schedule.LearningRateSchedule): - logs['learning_rate'] = lr_schedule(self.model.optimizer.iterations) - return logs + Custom batch-level summaries in a Functional API Model: - def _compute_steps_per_second(self): - current_iteration = self.model.optimizer.iterations.numpy() - time_since_epoch_begin = time.time() - self._epoch_start_time - steps_per_second = ((current_iteration - self._previous_epoch_iterations) / - time_since_epoch_begin) - return steps_per_second + ```python + def my_summary(x): + tf.summary.histogram('x', x) + return x + + inputs = tf.keras.Input(10) + x = tf.keras.layers.Dense(10)(inputs) + outputs = tf.keras.layers.Lambda(my_summary)(x) + model = tf.keras.Model(inputs, outputs) + model.compile('sgd', 'mse') + + # Make sure to set `update_freq=N` to log a batch-level summary every N + # batches. In addition to any `tf.summary` contained in `Model.call`, + # metrics added in `Model.compile` will be logged every N batches. + tb_callback = tf.keras.callbacks.TensorBoard('./logs', update_freq=1) + model.fit(x_train, y_train, callbacks=[tb_callback]) + ``` - def _log_epoch_metrics(self, epoch, logs): - """Writes epoch metrics out as scalar summaries. + Profiling: - Args: - epoch: Int. The global step to use for TensorBoard. - logs: Dict. Keys are scalar summary names, values are scalars. + ```python + # Profile a single batch, e.g. the 5th batch. + tensorboard_callback = tf.keras.callbacks.TensorBoard( + log_dir='./logs', profile_batch=5) + model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) + + # Profile a range of batches, e.g. from 10 to 20. + tensorboard_callback = tf.keras.callbacks.TensorBoard( + log_dir='./logs', profile_batch=(10,20)) + model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) + ``` """ - if not logs: - return - train_logs = {k: v for k, v in logs.items() if not k.startswith('val_')} - val_logs = {k: v for k, v in logs.items() if k.startswith('val_')} - train_logs = self._collect_learning_rate(train_logs) - if self.write_steps_per_second: - train_logs['steps_per_second'] = self._compute_steps_per_second() - - with tf.summary.record_if(True): - if train_logs: + def __init__( + self, + log_dir="logs", + histogram_freq=0, + write_graph=True, + write_images=False, + write_steps_per_second=False, + update_freq="epoch", + profile_batch=0, + embeddings_freq=0, + embeddings_metadata=None, + **kwargs, + ): + super().__init__() + self._supports_tf_logs = True + self._validate_kwargs(kwargs) + + self.log_dir = io_utils.path_to_string(log_dir) + self.histogram_freq = histogram_freq + self.write_graph = write_graph + self.write_images = write_images + self.write_steps_per_second = write_steps_per_second + self.update_freq = 1 if update_freq == "batch" else update_freq + self.embeddings_freq = embeddings_freq + self.embeddings_metadata = embeddings_metadata + self._init_profile_batch(profile_batch) + self._global_train_batch = 0 + self._previous_epoch_iterations = 0 + self._train_accumulated_time = 0 + self._batch_start_time = 0 + + # Lazily initialized in order to avoid creating event files when + # not needed. + self._writers = {} + + # Used to restore any existing `SummaryWriter` after training ends. 
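A usage note on `_collect_learning_rate` above: a `learning_rate` series is only logged when the optimizer's learning rate is a `LearningRateSchedule` rather than a plain float. A hedged sketch of how that is typically set up (model and data omitted; the schedule values are arbitrary):

```python
import tensorflow as tf

# Any built-in schedule works; ExponentialDecay is just an example.
schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01, decay_steps=1000, decay_rate=0.9
)
optimizer = tf.keras.optimizers.SGD(learning_rate=schedule)
# model.compile(optimizer, "mse")
# model.fit(..., callbacks=[tf.keras.callbacks.TensorBoard("./logs")])
# TensorBoard then shows the decayed value each epoch as `learning_rate`.
```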
+        self._prev_summary_state = []
+
+    def _validate_kwargs(self, kwargs):
+        """Handle arguments that were supported in V1."""
+        if kwargs.get("write_grads", False):
+            logging.warning(
+                "`write_grads` will be ignored in TensorFlow 2.0 "
+                "for the `TensorBoard` Callback."
+            )
+        if kwargs.get("batch_size", False):
+            logging.warning(
+                "`batch_size` is no longer needed in the "
+                "`TensorBoard` Callback and will be ignored "
+                "in TensorFlow 2.0."
+            )
+        if kwargs.get("embeddings_layer_names", False):
+            logging.warning(
+                "`embeddings_layer_names` is not supported in "
+                "TensorFlow 2.0. Instead, all `Embedding` layers "
+                "will be visualized."
+            )
+        if kwargs.get("embeddings_data", False):
+            logging.warning(
+                "`embeddings_data` is not supported in TensorFlow "
+                "2.0. Instead, all `Embedding` variables will be "
+                "visualized."
+            )
+
+        supported_kwargs = {
+            "write_grads",
+            "embeddings_layer_names",
+            "embeddings_data",
+            "batch_size",
+        }
+        unrecognized_kwargs = set(kwargs.keys()) - supported_kwargs
+
+        # Only allow kwargs that were supported in V1.
+        if unrecognized_kwargs:
+            raise ValueError(
+                "Unrecognized arguments in `TensorBoard` Callback: "
+                f"{unrecognized_kwargs}. "
+                f"Supported kwargs are: {supported_kwargs}"
+            )
+
+    def set_model(self, model):
+        """Sets Keras model and writes graph if specified."""
+        self.model = model
+        self._log_write_dir = self._get_log_write_dir()
+
+        self._train_dir = os.path.join(self._log_write_dir, "train")
+        self._train_step = self.model._train_counter
+
+        self._val_dir = os.path.join(self._log_write_dir, "validation")
+        self._val_step = self.model._test_counter
+
+        self._writers = {}  # Resets writers.
+
+        self._should_write_train_graph = False
+        if self.write_graph:
+            self._write_keras_model_summary()
+            self._should_write_train_graph = True
+        if self.embeddings_freq:
+            self._configure_embeddings()
+
+    @property
+    def _train_writer(self):
+        if "train" not in self._writers:
+            self._writers["train"] = tf.summary.create_file_writer(
+                self._train_dir
+            )
+        return self._writers["train"]
+
+    @property
+    def _val_writer(self):
+        if "val" not in self._writers:
+            self._writers["val"] = tf.summary.create_file_writer(self._val_dir)
+        return self._writers["val"]
+
+    def _get_log_write_dir(self):
+        """For multi-worker, only chief should write, others write to '/tmp'."""
+        return distributed_file_utils.write_dirpath(
+            self.log_dir, self.model.distribute_strategy
+        )
+
+    def _delete_tmp_write_dir(self):
+        """Deletes tmp write directories for multi-worker."""
+        distributed_file_utils.remove_temp_dirpath(
+            self.log_dir, self.model.distribute_strategy
+        )
+
+    def _write_keras_model_train_graph(self):
+        """Writes Keras model train_function graph to TensorBoard."""
+        with self._train_writer.as_default():
+            with tf.summary.record_if(True):
+                train_fn = self.model.train_tf_function
+                # If the train_function is a `tf.function`, we can write out a
+                # graph
+                if hasattr(train_fn, "function_spec"):
+                    tf.summary.graph(
+                        train_fn._concrete_variable_creation_fn.graph
+                    )
+
+    def _write_keras_model_summary(self):
+        """Writes Keras graph network summary to TensorBoard."""
         with self._train_writer.as_default():
-          for name, value in train_logs.items():
-            tf.summary.scalar('epoch_' + name, value, step=epoch)
-      if val_logs:
-        with self._val_writer.as_default():
-          for name, value in val_logs.items():
-            name = name[4:]  # Remove 'val_' prefix.
- tf.summary.scalar('epoch_' + name, value, step=epoch) - - def _log_weights(self, epoch): - """Logs the weights of the Model to TensorBoard.""" - with self._train_writer.as_default(): - with tf.summary.record_if(True): + with tf.summary.record_if(True): + summary_writable = ( + self.model._is_graph_network + or self.model.__class__.__name__ == "Sequential" + ) + if summary_writable: + keras_model_summary("keras", self.model, step=0) + + def _configure_embeddings(self): + """Configure the Projector for embeddings.""" + # TODO(omalleyt): Add integration tests. + from keras.layers import core + from keras.protobuf import projector_config_pb2 + + # isort: off + from google.protobuf import text_format + + config = projector_config_pb2.ProjectorConfig() for layer in self.model.layers: - for weight in layer.weights: - weight_name = weight.name.replace(':', '_') - # Add a suffix to prevent summary tag name collision. - histogram_weight_name = weight_name + '/histogram' - tf.summary.histogram(histogram_weight_name, weight, step=epoch) - if self.write_images: - # Add a suffix to prevent summary tag name collision. - image_weight_name = weight_name + '/image' - self._log_weight_as_image(weight, image_weight_name, epoch) - self._train_writer.flush() - - def _log_weight_as_image(self, weight, weight_name, epoch): - """Logs a weight as a TensorBoard image.""" - w_img = tf.squeeze(weight) - shape = backend.int_shape(w_img) - if len(shape) == 1: # Bias case - w_img = tf.reshape(w_img, [1, shape[0], 1, 1]) - elif len(shape) == 2: # Dense layer kernel case - if shape[0] > shape[1]: - w_img = tf.transpose(w_img) + if isinstance(layer, core.Embedding): + embedding = config.embeddings.add() + # Embeddings are always the first layer, so this naming should + # be consistent in any keras models checkpoints. + name = ( + "layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE" + ) + embedding.tensor_name = name + + if self.embeddings_metadata is not None: + if isinstance(self.embeddings_metadata, str): + embedding.metadata_path = self.embeddings_metadata + else: + if layer.name in self.embeddings_metadata.keys(): + embedding.metadata_path = ( + self.embeddings_metadata.pop(layer.name) + ) + + if self.embeddings_metadata and not isinstance( + self.embeddings_metadata, str + ): + raise ValueError( + "Unrecognized `Embedding` layer names passed to " + "`keras.callbacks.TensorBoard` `embeddings_metadata` " + f"argument: {self.embeddings_metadata.keys()}" + ) + + config_pbtxt = text_format.MessageToString(config) + path = os.path.join(self._log_write_dir, "projector_config.pbtxt") + with tf.io.gfile.GFile(path, "w") as f: + f.write(config_pbtxt) + + def _push_writer(self, writer, step): + """Sets the default writer for custom batch-level summaries.""" + if self.update_freq == "epoch": + return + + should_record = lambda: tf.equal(step % self.update_freq, 0) + # TODO(b/151339474): Fix deadlock when not using .value() here. + summary_context = ( + writer.as_default(step.value()), + tf.summary.record_if(should_record), + ) + self._prev_summary_state.append(summary_context) + summary_context[0].__enter__() + summary_context[1].__enter__() + + def _pop_writer(self): + """Pops the current writer.""" + if self.update_freq == "epoch": + return + + # See _push_writer for the content of the previous_context, which is + # pair of context. 
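Given `_configure_embeddings` above, `embeddings_metadata` can be either a single shared filename or a per-layer mapping. A short illustration from the user's side; `meta.tsv` and `my_embedding` are hypothetical names:

```python
import tensorflow as tf

# A single metadata file shared by every `Embedding` layer:
tb = tf.keras.callbacks.TensorBoard(
    log_dir="./logs", embeddings_freq=1, embeddings_metadata="meta.tsv"
)

# Or a per-layer mapping; names that match no `Embedding` layer are left in
# the dict, which makes the ValueError at the end of `_configure_embeddings`
# fire when the projector config is written (at `set_model` time).
tb = tf.keras.callbacks.TensorBoard(
    log_dir="./logs",
    embeddings_freq=1,
    embeddings_metadata={"my_embedding": "meta.tsv"},
)
```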
+ previous_context = self._prev_summary_state.pop() + previous_context[1].__exit__(*sys.exc_info()) + previous_context[0].__exit__(*sys.exc_info()) + + def _close_writers(self): + for writer in self._writers.values(): + writer.close() + + def _init_profile_batch(self, profile_batch): + """Validate profile_batch value and set the range of batches to profile. + + Sets values of _start_batch and _stop_batch attributes, + specifying the start and stop batch to profile. + Setting `profile_batch=0` disables profiling. + + Args: + profile_batch: The range of batches to profile. Should be a + non-negative integer or a comma separated string of pair of positive + integers. A pair of positive integers signify a range of batches to + profile. + + Raises: + ValueError: If profile_batch is not an integer or a comma separated + pair of positive integers. + + """ + profile_batch_error_message = ( + "profile_batch must be a non-negative integer or " + "2-tuple of positive " + "integers. A pair of positive integers " + "signifies a range of batches " + f"to profile. Found: {profile_batch}" + ) + + # Support legacy way of specifying "start,stop" or "start" as str. + if isinstance(profile_batch, str): + profile_batch = str(profile_batch).split(",") + profile_batch = tf.nest.map_structure(int, profile_batch) + + if isinstance(profile_batch, int): + self._start_batch = profile_batch + self._stop_batch = profile_batch + elif ( + isinstance(profile_batch, (tuple, list)) and len(profile_batch) == 2 + ): + self._start_batch, self._stop_batch = profile_batch + else: + raise ValueError(profile_batch_error_message) + + if self._start_batch < 0 or self._stop_batch < self._start_batch: + raise ValueError(profile_batch_error_message) + + # True when the profiler was successfully started by this callback. + # We track the status here to make sure callbacks do not interfere with + # each other. The callback will only stop the profiler it started. + self._profiler_started = False + if self._start_batch > 0: + # Warm up and improve the profiling accuracy. + self._start_profiler(logdir="") + self._stop_profiler(save=False) + # True when a trace is running. + self._is_tracing = False + + # Setting `profile_batch=0` disables profiling. 
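`_init_profile_batch` above accepts three spellings of the same request and rejects anything else with the quoted `ValueError`. A short illustration (log paths are placeholders):

```python
import tensorflow as tf

# Equivalent ways to profile batches 10 through 20:
cb_a = tf.keras.callbacks.TensorBoard(log_dir="./logs", profile_batch=(10, 20))
cb_b = tf.keras.callbacks.TensorBoard(log_dir="./logs", profile_batch="10,20")

# A single integer profiles exactly that batch:
cb_c = tf.keras.callbacks.TensorBoard(log_dir="./logs", profile_batch=5)

# And 0 (the default in this diff) disables profiling entirely.
cb_d = tf.keras.callbacks.TensorBoard(log_dir="./logs", profile_batch=0)
```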
+ self._should_trace = not ( + self._start_batch == 0 and self._stop_batch == 0 + ) + + def on_train_begin(self, logs=None): + self._global_train_batch = 0 + self._previous_epoch_iterations = 0 + self._push_writer(self._train_writer, self._train_step) + + def on_train_end(self, logs=None): + self._pop_writer() + + if self._is_tracing: + self._stop_trace() + + self._close_writers() + self._delete_tmp_write_dir() + + def on_test_begin(self, logs=None): + self._push_writer(self._val_writer, self._val_step) + + def on_test_end(self, logs=None): + if self.model.optimizer and hasattr(self.model.optimizer, "iterations"): + with tf.summary.record_if(True), self._val_writer.as_default(): + for name, value in logs.items(): + tf.summary.scalar( + "evaluation_" + name + "_vs_iterations", + value, + step=self.model.optimizer.iterations.read_value(), + ) + self._pop_writer() + + def _implements_train_batch_hooks(self): + # Only call batch hooks when tracing or write_steps_per_second are + # enabled + return self._should_trace or self.write_steps_per_second + + def on_train_batch_begin(self, batch, logs=None): + self._global_train_batch += 1 + if self.write_steps_per_second: + self._batch_start_time = time.time() + if not self._should_trace: + return + + if self._global_train_batch == self._start_batch: + self._start_trace() + + def on_train_batch_end(self, batch, logs=None): + if self._should_write_train_graph: + self._write_keras_model_train_graph() + self._should_write_train_graph = False + if self.write_steps_per_second: + batch_run_time = time.time() - self._batch_start_time + tf.summary.scalar( + "batch_steps_per_second", + 1.0 / batch_run_time, + step=self._train_step, + ) + + # `logs` isn't necessarily always a dict. For example, when using + # `tf.distribute.experimental.ParameterServerStrategy`, a + # `tf.distribute.experimental.coordinator.RemoteValue` will be passed. + # For now, we just disable `update_freq` in those cases. + if isinstance(logs, dict): + for name, value in logs.items(): + tf.summary.scalar("batch_" + name, value, step=self._train_step) + + if not self._should_trace: + return + + if self._is_tracing and self._global_train_batch >= self._stop_batch: + self._stop_trace() + + def on_epoch_begin(self, epoch, logs=None): + # Keeps track of epoch for profiling. + if self.write_steps_per_second: + self._previous_epoch_iterations = ( + self.model.optimizer.iterations.numpy() + ) + self._epoch_start_time = time.time() + + def on_epoch_end(self, epoch, logs=None): + """Runs metrics and histogram summaries at epoch end.""" + self._log_epoch_metrics(epoch, logs) + + if self.histogram_freq and epoch % self.histogram_freq == 0: + self._log_weights(epoch) + + if self.embeddings_freq and epoch % self.embeddings_freq == 0: + self._log_embeddings(epoch) + + def _start_trace(self): + tf.summary.trace_on(graph=True, profiler=False) + self._start_profiler(logdir=self.log_dir) + self._is_tracing = True + + def _stop_trace(self, batch=None): + """Logs the trace graph to TensorBoard.""" + if batch is None: + batch = self._stop_batch + with self._train_writer.as_default(): + with tf.summary.record_if(True): + # TODO(b/126388999): Remove step info in the summary name. 
+ tf.summary.trace_export(name="batch_%d" % batch, step=batch) + self._stop_profiler() + self._is_tracing = False + + def _collect_learning_rate(self, logs): + if isinstance(self.model.optimizer, optimizer.Optimizer): + lr_schedule = getattr(self.model.optimizer, "_learning_rate", None) + else: + lr_schedule = getattr(self.model.optimizer, "lr", None) + if isinstance(lr_schedule, learning_rate_schedule.LearningRateSchedule): + logs["learning_rate"] = lr_schedule(self.model.optimizer.iterations) + return logs + + def _compute_steps_per_second(self): + current_iteration = self.model.optimizer.iterations.numpy() + time_since_epoch_begin = time.time() - self._epoch_start_time + steps_per_second = ( + current_iteration - self._previous_epoch_iterations + ) / time_since_epoch_begin + return steps_per_second + + def _log_epoch_metrics(self, epoch, logs): + """Writes epoch metrics out as scalar summaries. + + Args: + epoch: Int. The global step to use for TensorBoard. + logs: Dict. Keys are scalar summary names, values are scalars. + """ + if not logs: + return + + train_logs = dict() + val_logs = dict() + for k, v in logs.items(): + if k.startswith("val_"): + val_logs[k] = v + else: + train_logs[k] = v + + train_logs = self._collect_learning_rate(train_logs) + if self.write_steps_per_second: + train_logs["steps_per_second"] = self._compute_steps_per_second() + + with tf.summary.record_if(True): + if train_logs: + with self._train_writer.as_default(): + for name, value in train_logs.items(): + tf.summary.scalar("epoch_" + name, value, step=epoch) + if val_logs: + with self._val_writer.as_default(): + for name, value in val_logs.items(): + name = name[4:] # Remove 'val_' prefix. + tf.summary.scalar("epoch_" + name, value, step=epoch) + + def _log_weights(self, epoch): + """Logs the weights of the Model to TensorBoard.""" + with self._train_writer.as_default(): + with tf.summary.record_if(True): + for layer in self.model.layers: + for weight in layer.weights: + weight_name = weight.name.replace(":", "_") + # Add a suffix to prevent summary tag name collision. + histogram_weight_name = weight_name + "/histogram" + tf.summary.histogram( + histogram_weight_name, weight, step=epoch + ) + if self.write_images: + # Add a suffix to prevent summary tag name + # collision. + image_weight_name = weight_name + "/image" + self._log_weight_as_image( + weight, image_weight_name, epoch + ) + self._train_writer.flush() + + def _log_weight_as_image(self, weight, weight_name, epoch): + """Logs a weight as a TensorBoard image.""" + w_img = tf.squeeze(weight) shape = backend.int_shape(w_img) - w_img = tf.reshape(w_img, [1, shape[0], shape[1], 1]) - elif len(shape) == 3: # ConvNet case - if backend.image_data_format() == 'channels_last': - # Switch to channels_first to display every kernel as a separate - # image. - w_img = tf.transpose(w_img, perm=[2, 0, 1]) + if len(shape) == 1: # Bias case + w_img = tf.reshape(w_img, [1, shape[0], 1, 1]) + elif len(shape) == 2: # Dense layer kernel case + if shape[0] > shape[1]: + w_img = tf.transpose(w_img) + shape = backend.int_shape(w_img) + w_img = tf.reshape(w_img, [1, shape[0], shape[1], 1]) + elif len(shape) == 3: # ConvNet case + if backend.image_data_format() == "channels_last": + # Switch to channels_first to display every kernel as a separate + # image. 
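The key split performed by `_log_epoch_metrics` above is what makes training and validation curves overlay in TensorBoard. A tiny stand-alone sketch of that key handling, with made-up values:

```python
logs = {"loss": 0.31, "accuracy": 0.90, "val_loss": 0.42}

train_logs = {k: v for k, v in logs.items() if not k.startswith("val_")}
val_logs = {k[4:]: v for k, v in logs.items() if k.startswith("val_")}

print(train_logs)  # {'loss': 0.31, 'accuracy': 0.9} -> train writer, "epoch_*"
print(val_logs)    # {'loss': 0.42} -> val writer, also tagged "epoch_loss"
# Same tag, two writers: TensorBoard draws both series on one chart.
```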
+ w_img = tf.transpose(w_img, perm=[2, 0, 1]) + shape = backend.int_shape(w_img) + w_img = tf.reshape(w_img, [shape[0], shape[1], shape[2], 1]) + shape = backend.int_shape(w_img) - w_img = tf.reshape(w_img, [shape[0], shape[1], shape[2], 1]) + # Not possible to handle 3D convnets etc. + if len(shape) == 4 and shape[-1] in [1, 3, 4]: + tf.summary.image(weight_name, w_img, step=epoch) + + def _log_embeddings(self, epoch): + embeddings_ckpt = os.path.join( + self._log_write_dir, + "train", + f"keras_embedding.ckpt-{epoch}", + ) + self.model.save_weights(embeddings_ckpt) + + def _start_profiler(self, logdir): + """Starts the profiler if currently inactive. + + Args: + logdir: Directory where profiler results will be saved. + """ + if self._profiler_started: + return + try: + tf.profiler.experimental.start(logdir=logdir) + self._profiler_started = True + except tf.errors.AlreadyExistsError as e: + # Profiler errors should not be fatal. + logging.error("Failed to start profiler: %s", e.message) + + def _stop_profiler(self, save=True): + """Stops the profiler if currently active. + + Args: + save: Whether to save the profiler results to TensorBoard. + """ + if not self._profiler_started: + return + try: + tf.profiler.experimental.stop(save=save) + except tf.errors.UnavailableError as e: + # Profiler errors should not be fatal. + logging.error("Failed to stop profiler: %s", e.message) + finally: + self._profiler_started = False - shape = backend.int_shape(w_img) - # Not possible to handle 3D convnets etc. - if len(shape) == 4 and shape[-1] in [1, 3, 4]: - tf.summary.image(weight_name, w_img, step=epoch) - def _log_embeddings(self, epoch): - embeddings_ckpt = os.path.join(self._log_write_dir, 'train', - 'keras_embedding.ckpt-{}'.format(epoch)) - self.model.save_weights(embeddings_ckpt) +@keras_export("keras.callbacks.ReduceLROnPlateau") +class ReduceLROnPlateau(Callback): + """Reduce learning rate when a metric has stopped improving. - def _start_profiler(self, logdir): - """Starts the profiler if currently inactive. + Models often benefit from reducing the learning rate by a factor + of 2-10 once learning stagnates. This callback monitors a + quantity and if no improvement is seen for a 'patience' number + of epochs, the learning rate is reduced. - Args: - logdir: Directory where profiler results will be saved. - """ - if self._profiler_started: - return - try: - tf.profiler.experimental.start(logdir=logdir) - self._profiler_started = True - except tf.errors.AlreadyExistsError as e: - # Profiler errors should not be fatal. - logging.error('Failed to start profiler: %s', e.message) + Example: - def _stop_profiler(self, save=True): - """Stops the profiler if currently active. + ```python + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, + patience=5, min_lr=0.001) + model.fit(X_train, Y_train, callbacks=[reduce_lr]) + ``` Args: - save: Whether to save the profiler results to TensorBoard. + monitor: quantity to be monitored. + factor: factor by which the learning rate will be reduced. + `new_lr = lr * factor`. + patience: number of epochs with no improvement after which learning rate + will be reduced. + verbose: int. 0: quiet, 1: update messages. + mode: one of `{'auto', 'min', 'max'}`. In `'min'` mode, + the learning rate will be reduced when the + quantity monitored has stopped decreasing; in `'max'` mode it will be + reduced when the quantity monitored has stopped increasing; in + `'auto'` mode, the direction is automatically inferred from the name + of the monitored quantity. 
+ min_delta: threshold for measuring the new optimum, to only focus on + significant changes. + cooldown: number of epochs to wait before resuming normal operation + after lr has been reduced. + min_lr: lower bound on the learning rate. """ - if not self._profiler_started: - return - try: - tf.profiler.experimental.stop(save=save) - except tf.errors.UnavailableError as e: - # Profiler errors should not be fatal. - logging.error('Failed to stop profiler: %s', e.message) - finally: - self._profiler_started = False + def __init__( + self, + monitor="val_loss", + factor=0.1, + patience=10, + verbose=0, + mode="auto", + min_delta=1e-4, + cooldown=0, + min_lr=0, + **kwargs, + ): + super().__init__() + + self.monitor = monitor + if factor >= 1.0: + raise ValueError( + "ReduceLROnPlateau does not support " + f"a factor >= 1.0. Got {factor}" + ) + if "epsilon" in kwargs: + min_delta = kwargs.pop("epsilon") + logging.warning( + "`epsilon` argument is deprecated and " + "will be removed, use `min_delta` instead." + ) + self.factor = factor + self.min_lr = min_lr + self.min_delta = min_delta + self.patience = patience + self.verbose = verbose + self.cooldown = cooldown + self.cooldown_counter = 0 # Cooldown counter. + self.wait = 0 + self.best = 0 + self.mode = mode + self.monitor_op = None + self._reset() + + def _reset(self): + """Resets wait counter and cooldown counter.""" + if self.mode not in ["auto", "min", "max"]: + logging.warning( + "Learning rate reduction mode %s is unknown, " + "fallback to auto mode.", + self.mode, + ) + self.mode = "auto" + if self.mode == "min" or ( + self.mode == "auto" and "acc" not in self.monitor + ): + self.monitor_op = lambda a, b: np.less(a, b - self.min_delta) + self.best = np.Inf + else: + self.monitor_op = lambda a, b: np.greater(a, b + self.min_delta) + self.best = -np.Inf + self.cooldown_counter = 0 + self.wait = 0 -@keras_export('keras.callbacks.ReduceLROnPlateau') -class ReduceLROnPlateau(Callback): - """Reduce learning rate when a metric has stopped improving. - - Models often benefit from reducing the learning rate by a factor - of 2-10 once learning stagnates. This callback monitors a - quantity and if no improvement is seen for a 'patience' number - of epochs, the learning rate is reduced. - - Example: - - ```python - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, - patience=5, min_lr=0.001) - model.fit(X_train, Y_train, callbacks=[reduce_lr]) - ``` - - Args: - monitor: quantity to be monitored. - factor: factor by which the learning rate will be reduced. - `new_lr = lr * factor`. - patience: number of epochs with no improvement after which learning rate - will be reduced. - verbose: int. 0: quiet, 1: update messages. - mode: one of `{'auto', 'min', 'max'}`. In `'min'` mode, - the learning rate will be reduced when the - quantity monitored has stopped decreasing; in `'max'` mode it will be - reduced when the quantity monitored has stopped increasing; in `'auto'` - mode, the direction is automatically inferred from the name of the - monitored quantity. - min_delta: threshold for measuring the new optimum, to only focus on - significant changes. - cooldown: number of epochs to wait before resuming normal operation after - lr has been reduced. - min_lr: lower bound on the learning rate. 
- """ - - def __init__(self, - monitor='val_loss', - factor=0.1, - patience=10, - verbose=0, - mode='auto', - min_delta=1e-4, - cooldown=0, - min_lr=0, - **kwargs): - super().__init__() - - self.monitor = monitor - if factor >= 1.0: - raise ValueError( - f'ReduceLROnPlateau does not support a factor >= 1.0. Got {factor}') - if 'epsilon' in kwargs: - min_delta = kwargs.pop('epsilon') - logging.warning('`epsilon` argument is deprecated and ' - 'will be removed, use `min_delta` instead.') - self.factor = factor - self.min_lr = min_lr - self.min_delta = min_delta - self.patience = patience - self.verbose = verbose - self.cooldown = cooldown - self.cooldown_counter = 0 # Cooldown counter. - self.wait = 0 - self.best = 0 - self.mode = mode - self.monitor_op = None - self._reset() - - def _reset(self): - """Resets wait counter and cooldown counter. - """ - if self.mode not in ['auto', 'min', 'max']: - logging.warning('Learning rate reduction mode %s is unknown, ' - 'fallback to auto mode.', self.mode) - self.mode = 'auto' - if (self.mode == 'min' or - (self.mode == 'auto' and 'acc' not in self.monitor)): - self.monitor_op = lambda a, b: np.less(a, b - self.min_delta) - self.best = np.Inf - else: - self.monitor_op = lambda a, b: np.greater(a, b + self.min_delta) - self.best = -np.Inf - self.cooldown_counter = 0 - self.wait = 0 - - def on_train_begin(self, logs=None): - self._reset() - - def on_epoch_end(self, epoch, logs=None): - logs = logs or {} - logs['lr'] = backend.get_value(self.model.optimizer.lr) - current = logs.get(self.monitor) - if current is None: - logging.warning('Learning rate reduction is conditioned on metric `%s` ' - 'which is not available. Available metrics are: %s', - self.monitor, ','.join(list(logs.keys()))) + def on_train_begin(self, logs=None): + self._reset() + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + logs["lr"] = backend.get_value(self.model.optimizer.lr) + current = logs.get(self.monitor) + if current is None: + logging.warning( + "Learning rate reduction is conditioned on metric `%s` " + "which is not available. Available metrics are: %s", + self.monitor, + ",".join(list(logs.keys())), + ) - else: - if self.in_cooldown(): - self.cooldown_counter -= 1 - self.wait = 0 + else: + if self.in_cooldown(): + self.cooldown_counter -= 1 + self.wait = 0 - if self.monitor_op(current, self.best): - self.best = current - self.wait = 0 - elif not self.in_cooldown(): - self.wait += 1 - if self.wait >= self.patience: - old_lr = backend.get_value(self.model.optimizer.lr) - if old_lr > np.float32(self.min_lr): - new_lr = old_lr * self.factor - new_lr = max(new_lr, self.min_lr) - backend.set_value(self.model.optimizer.lr, new_lr) - if self.verbose > 0: - io_utils.print_msg( - f'\nEpoch {epoch +1}: ' - f'ReduceLROnPlateau reducing learning rate to {new_lr}.') - self.cooldown_counter = self.cooldown - self.wait = 0 - - def in_cooldown(self): - return self.cooldown_counter > 0 - - -@keras_export('keras.callbacks.CSVLogger') + if self.monitor_op(current, self.best): + self.best = current + self.wait = 0 + elif not self.in_cooldown(): + self.wait += 1 + if self.wait >= self.patience: + old_lr = backend.get_value(self.model.optimizer.lr) + if old_lr > np.float32(self.min_lr): + new_lr = old_lr * self.factor + new_lr = max(new_lr, self.min_lr) + backend.set_value(self.model.optimizer.lr, new_lr) + if self.verbose > 0: + io_utils.print_msg( + f"\nEpoch {epoch +1}: " + "ReduceLROnPlateau reducing " + f"learning rate to {new_lr}." 
+                            )
+                        self.cooldown_counter = self.cooldown
+                        self.wait = 0
+
+    def in_cooldown(self):
+        return self.cooldown_counter > 0
+
+
+@keras_export("keras.callbacks.CSVLogger")
 class CSVLogger(Callback):
-  """Callback that streams epoch results to a CSV file.
-
-  Supports all values that can be represented as a string,
-  including 1D iterables such as `np.ndarray`.
-
-  Example:
-
-  ```python
-  csv_logger = CSVLogger('training.log')
-  model.fit(X_train, Y_train, callbacks=[csv_logger])
-  ```
-
-  Args:
-    filename: Filename of the CSV file, e.g. `'run/log.csv'`.
-    separator: String used to separate elements in the CSV file.
-    append: Boolean. True: append if file exists (useful for continuing
-        training). False: overwrite existing file.
-  """
-
-  def __init__(self, filename, separator=',', append=False):
-    self.sep = separator
-    self.filename = io_utils.path_to_string(filename)
-    self.append = append
-    self.writer = None
-    self.keys = None
-    self.append_header = True
-    super().__init__()
-
-  def on_train_begin(self, logs=None):
-    if self.append:
-      if tf.io.gfile.exists(self.filename):
-        with tf.io.gfile.GFile(self.filename, 'r') as f:
-          self.append_header = not bool(len(f.readline()))
-      mode = 'a'
-    else:
-      mode = 'w'
-    self.csv_file = tf.io.gfile.GFile(self.filename, mode)
+    """Callback that streams epoch results to a CSV file.

-  def on_epoch_end(self, epoch, logs=None):
-    logs = logs or {}
+    Supports all values that can be represented as a string,
+    including 1D iterables such as `np.ndarray`.

-    def handle_value(k):
-      is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
-      if isinstance(k, str):
-        return k
-      elif isinstance(k, collections.abc.Iterable) and not is_zero_dim_ndarray:
-        return '"[%s]"' % (', '.join(map(str, k)))
-      else:
-        return k
+    Example:

-    if self.keys is None:
-      self.keys = sorted(logs.keys())
-
-    if self.model.stop_training:
-      # We set NA so that csv parsers do not fail for this last epoch.
-      logs = dict((k, logs[k]) if k in logs else (k, 'NA') for k in self.keys)
+    ```python
+    csv_logger = CSVLogger('training.log')
+    model.fit(X_train, Y_train, callbacks=[csv_logger])
+    ```

-    if not self.writer:
+    Args:
+        filename: Filename of the CSV file, e.g. `'run/log.csv'`.
+        separator: String used to separate elements in the CSV file.
+        append: Boolean. True: append if file exists (useful for continuing
+            training). False: overwrite existing file.
+    """

-      class CustomDialect(csv.excel):
-        delimiter = self.sep
+    def __init__(self, filename, separator=",", append=False):
+        self.sep = separator
+        self.filename = io_utils.path_to_string(filename)
+        self.append = append
+        self.writer = None
+        self.keys = None
+        self.append_header = True
+        super().__init__()
+
+    def on_train_begin(self, logs=None):
+        if self.append:
+            if tf.io.gfile.exists(self.filename):
+                with tf.io.gfile.GFile(self.filename, "r") as f:
+                    self.append_header = not bool(len(f.readline()))
+            mode = "a"
+        else:
+            mode = "w"
+        self.csv_file = tf.io.gfile.GFile(self.filename, mode)
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+
+        def handle_value(k):
+            is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
+            if isinstance(k, str):
+                return k
+            elif (
+                isinstance(k, collections.abc.Iterable)
+                and not is_zero_dim_ndarray
+            ):
+                return f"\"[{', '.join(map(str, k))}]\""
+            else:
+                return k
+
+        if self.keys is None:
+            self.keys = sorted(logs.keys())
+            # When validation_freq > 1, `val_` keys are not in the first
+            # epoch's logs. Add the `val_` keys so that they are part of the
+            # writer's fieldnames.
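Stepping back from the `ReduceLROnPlateau` code above: the update it applies is simply `new_lr = max(old_lr * factor, min_lr)`, once `patience` epochs pass without improvement. A worked example with illustrative numbers:

```python
lr, factor, min_lr = 0.01, 0.5, 0.004

for _ in range(3):
    lr = max(lr * factor, min_lr)
    print(lr)
# 0.005, then 0.004 (clipped by min_lr), then 0.004; in the callback the
# `old_lr > np.float32(self.min_lr)` guard stops further updates at that point.
```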
+            val_keys_found = False
+            for key in self.keys:
+                if key.startswith("val_"):
+                    val_keys_found = True
+                    break
+            if not val_keys_found:
+                self.keys.extend(["val_" + k for k in self.keys])
+
+        if not self.writer:
+
+            class CustomDialect(csv.excel):
+                delimiter = self.sep
+
+            fieldnames = ["epoch"] + self.keys
+
+            self.writer = csv.DictWriter(
+                self.csv_file, fieldnames=fieldnames, dialect=CustomDialect
+            )
+            if self.append_header:
+                self.writer.writeheader()
+
+        row_dict = collections.OrderedDict({"epoch": epoch})
+        row_dict.update(
+            (key, handle_value(logs.get(key, "NA"))) for key in self.keys
+        )
+        self.writer.writerow(row_dict)
+        self.csv_file.flush()
+
+    def on_train_end(self, logs=None):
+        self.csv_file.close()
+        self.writer = None
+
+
+@keras_export("keras.callbacks.LambdaCallback")
+class LambdaCallback(Callback):
+    r"""Callback for creating simple, custom callbacks on-the-fly.
-      fieldnames = ['epoch'] + self.keys
+    This callback is constructed with anonymous functions that will be called
+    at the appropriate time (during `Model.{fit | evaluate | predict}`).
+    Note that the callbacks expect positional arguments, as:
-      self.writer = csv.DictWriter(
-          self.csv_file,
-          fieldnames=fieldnames,
-          dialect=CustomDialect)
-      if self.append_header:
-        self.writer.writeheader()
+
+    - `on_epoch_begin` and `on_epoch_end` expect two positional arguments:
+      `epoch`, `logs`
+    - `on_batch_begin` and `on_batch_end` expect two positional arguments:
+      `batch`, `logs`
+    - `on_train_begin` and `on_train_end` expect one positional argument:
+      `logs`
-    row_dict = collections.OrderedDict({'epoch': epoch})
-    row_dict.update((key, handle_value(logs[key])) for key in self.keys)
-    self.writer.writerow(row_dict)
-    self.csv_file.flush()
+
+    Args:
+        on_epoch_begin: called at the beginning of every epoch.
+        on_epoch_end: called at the end of every epoch.
+        on_batch_begin: called at the beginning of every batch.
+        on_batch_end: called at the end of every batch.
+        on_train_begin: called at the beginning of model training.
+        on_train_end: called at the end of model training.
-  def on_train_end(self, logs=None):
-    self.csv_file.close()
-    self.writer = None
+
+    Example:
+
+    ```python
+    # Print the batch number at the beginning of every batch.
+    batch_print_callback = LambdaCallback(
+        on_batch_begin=lambda batch,logs: print(batch))
+
+    # Stream the epoch loss to a file in JSON format. The file content
+    # is not well-formed JSON but rather has a JSON object per line.
+    import json
+    json_log = open('loss_log.json', mode='wt', buffering=1)
+    json_logging_callback = LambdaCallback(
+        on_epoch_end=lambda epoch, logs: json_log.write(
+            json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
+        on_train_end=lambda logs: json_log.close()
+    )
+
+    # Terminate some processes after having finished model training.
+    processes = ...
+    cleanup_callback = LambdaCallback(
+        on_train_end=lambda logs: [
+            p.terminate() for p in processes if p.is_alive()])
+
+    model.fit(...,
+              callbacks=[batch_print_callback,
+                         json_logging_callback,
+                         cleanup_callback])
+    ```
+    """
-
-@keras_export('keras.callbacks.LambdaCallback')
-class LambdaCallback(Callback):
-  r"""Callback for creating simple, custom callbacks on-the-fly.
-
-  This callback is constructed with anonymous functions that will be called
-  at the appropriate time (during `Model.{fit | evaluate | predict}`).
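One consequence of the `CSVLogger` change above: with `validation_freq > 1`, the first epochs carry no `val_*` entries, so rows are padded with `"NA"` to stay aligned with the header. A minimal stand-alone sketch of that row construction:

```python
import collections

keys = ["loss", "val_loss"]  # fieldnames fixed at the first epoch
logs = {"loss": 0.3}         # an epoch where validation did not run

row = collections.OrderedDict({"epoch": 0})
row.update((k, logs.get(k, "NA")) for k in keys)
print(row)  # epoch 0, loss 0.3, val_loss 'NA'
```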
- Note that the callbacks expects positional arguments, as: - - - `on_epoch_begin` and `on_epoch_end` expect two positional arguments: - `epoch`, `logs` - - `on_batch_begin` and `on_batch_end` expect two positional arguments: - `batch`, `logs` - - `on_train_begin` and `on_train_end` expect one positional argument: - `logs` - - Args: - on_epoch_begin: called at the beginning of every epoch. - on_epoch_end: called at the end of every epoch. - on_batch_begin: called at the beginning of every batch. - on_batch_end: called at the end of every batch. - on_train_begin: called at the beginning of model training. - on_train_end: called at the end of model training. - - Example: - - ```python - # Print the batch number at the beginning of every batch. - batch_print_callback = LambdaCallback( - on_batch_begin=lambda batch,logs: print(batch)) - - # Stream the epoch loss to a file in JSON format. The file content - # is not well-formed JSON but rather has a JSON object per line. - import json - json_log = open('loss_log.json', mode='wt', buffering=1) - json_logging_callback = LambdaCallback( - on_epoch_end=lambda epoch, logs: json_log.write( - json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'), - on_train_end=lambda logs: json_log.close() - ) - - # Terminate some processes after having finished model training. - processes = ... - cleanup_callback = LambdaCallback( - on_train_end=lambda logs: [ - p.terminate() for p in processes if p.is_alive()]) - - model.fit(..., - callbacks=[batch_print_callback, - json_logging_callback, - cleanup_callback]) - ``` - """ - - def __init__(self, - on_epoch_begin=None, - on_epoch_end=None, - on_batch_begin=None, - on_batch_end=None, - on_train_begin=None, - on_train_end=None, - **kwargs): - super().__init__() - self.__dict__.update(kwargs) - if on_epoch_begin is not None: - self.on_epoch_begin = on_epoch_begin - if on_epoch_end is not None: - self.on_epoch_end = on_epoch_end - if on_batch_begin is not None: - self.on_batch_begin = on_batch_begin - if on_batch_end is not None: - self.on_batch_end = on_batch_end - if on_train_begin is not None: - self.on_train_begin = on_train_begin - if on_train_end is not None: - self.on_train_end = on_train_end + def __init__( + self, + on_epoch_begin=None, + on_epoch_end=None, + on_batch_begin=None, + on_batch_end=None, + on_train_begin=None, + on_train_end=None, + **kwargs, + ): + super().__init__() + self.__dict__.update(kwargs) + if on_epoch_begin is not None: + self.on_epoch_begin = on_epoch_begin + if on_epoch_end is not None: + self.on_epoch_end = on_epoch_end + if on_batch_begin is not None: + self.on_batch_begin = on_batch_begin + if on_batch_end is not None: + self.on_batch_end = on_batch_end + if on_train_begin is not None: + self.on_train_begin = on_train_begin + if on_train_end is not None: + self.on_train_end = on_train_end diff --git a/keras/callbacks_test.py b/keras/callbacks_test.py index b3d6cff1e8ce..f0e269141666 100644 --- a/keras/callbacks_test.py +++ b/keras/callbacks_test.py @@ -27,32 +27,38 @@ import unittest from unittest import mock +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.callbacks import BackupAndRestore from keras.callbacks import BackupAndRestoreExperimental +from keras.callbacks import Callback from keras.engine import sequential from keras.layers import Activation from keras.layers import Dense -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers import sgd +from keras.optimizers.legacy 
import gradient_descent from keras.optimizers.schedules import learning_rate_schedule from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import io_utils from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf +from keras.utils import tf_utils + +# isort: off from tensorflow.python.platform import tf_logging as logging try: - import h5py # pylint:disable=g-import-not-at-top + import h5py except ImportError: - h5py = None + h5py = None try: - import requests # pylint:disable=g-import-not-at-top + import requests except ImportError: - requests = None + requests = None TRAIN_SAMPLES = 10 @@ -63,3220 +69,4107 @@ BATCH_SIZE = 5 CALLBACK_HOOKS = [ - 'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end', - 'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin', - 'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end', - 'on_test_begin', 'on_test_end', 'on_train_batch_begin', - 'on_train_batch_end', 'on_train_begin', 'on_train_end' + "on_batch_begin", + "on_batch_end", + "on_epoch_begin", + "on_epoch_end", + "on_predict_batch_begin", + "on_predict_batch_end", + "on_predict_begin", + "on_predict_end", + "on_test_batch_begin", + "on_test_batch_end", + "on_test_begin", + "on_test_end", + "on_train_batch_begin", + "on_train_batch_end", + "on_train_begin", + "on_train_end", ] class Counter(keras.callbacks.Callback): - """Counts the number of times each callback method was run. - - Attributes: - method_counts: dict. Contains the counts of time each callback method was - run. - """ + """Counts the number of times each callback method was run. - def __init__(self): - self.method_counts = collections.defaultdict(int) - for method_name in CALLBACK_HOOKS: - setattr(self, method_name, - self.wrap_with_counts(method_name, getattr(self, method_name))) + Attributes: + method_counts: dict. Contains the counts of time each callback method was + run. 
+ """ - def wrap_with_counts(self, method_name, method): + def __init__(self): + self.method_counts = collections.defaultdict(int) + for method_name in CALLBACK_HOOKS: + setattr( + self, + method_name, + self.wrap_with_counts(method_name, getattr(self, method_name)), + ) - def _call_and_count(*args, **kwargs): - self.method_counts[method_name] += 1 - return method(*args, **kwargs) + def wrap_with_counts(self, method_name, method): + def _call_and_count(*args, **kwargs): + self.method_counts[method_name] += 1 + return method(*args, **kwargs) - return _call_and_count + return _call_and_count class CallAllHooks(keras.callbacks.Callback): - """A callback that calls self._run for all hooks""" + """A callback that calls self._run for all hooks""" - def __init__(self): - for method_name in CALLBACK_HOOKS: - setattr(self, method_name, self._run) + def __init__(self): + for method_name in CALLBACK_HOOKS: + setattr(self, method_name, self._run) - def _run(self, *args, logs=None): - raise NotImplementedError + def _run(self, *args, logs=None): + raise NotImplementedError def _get_numpy(): - return np.ones((10, 10)), np.ones((10, 1)) + return np.ones((10, 10)), np.ones((10, 1)) def _get_sequence(): + class MySequence(keras.utils.data_utils.Sequence): + def __getitem__(self, _): + return np.ones((2, 10)), np.ones((2, 1)) - class MySequence(keras.utils.data_utils.Sequence): + def __len__(self): + return 5 - def __getitem__(self, _): - return np.ones((2, 10)), np.ones((2, 1)) - - def __len__(self): - return 5 - - return MySequence(), None + return MySequence(), None @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes class CallbackCountsTest(test_combinations.TestCase): + def _check_counts(self, counter, expected_counts): + """Checks that the counts registered by `counter` are those expected.""" + for method_name, expected_count in expected_counts.items(): + self.assertEqual( + counter.method_counts[method_name], + expected_count, + msg="For method {}: expected {}, got: {}".format( + method_name, + expected_count, + counter.method_counts[method_name], + ), + ) + + def _get_model(self): + layers = [ + keras.layers.Dense(10, activation="relu"), + keras.layers.Dense(1, activation="sigmoid"), + ] + model = test_utils.get_model_from_layers(layers, input_shape=(10,)) + model.compile( + tf.compat.v1.train.AdamOptimizer(0.001), + "binary_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + return model - def _check_counts(self, counter, expected_counts): - """Checks that the counts registered by `counter` are those expected.""" - for method_name, expected_count in expected_counts.items(): - self.assertEqual( - counter.method_counts[method_name], - expected_count, - msg='For method {}: expected {}, got: {}'.format( - method_name, expected_count, counter.method_counts[method_name])) - - def _get_model(self): - layers = [ - keras.layers.Dense(10, activation='relu'), - keras.layers.Dense(1, activation='sigmoid') - ] - model = test_utils.get_model_from_layers(layers, input_shape=(10,)) - model.compile( - tf.compat.v1.train.AdamOptimizer(0.001), - 'binary_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - return model - - @parameterized.named_parameters(('with_numpy', _get_numpy()), - ('with_sequence', _get_sequence())) - def test_callback_hooks_are_called_in_fit(self, data): - if not tf.executing_eagerly(): - self.skipTest('Behavior changed in v2.') - x, y = data - val_x, val_y = np.ones((4, 10)), np.ones((4, 1)) - - model = self._get_model() - counter = 
Counter() - model.fit( - x, - y, - validation_data=(val_x, val_y), - batch_size=2, - steps_per_epoch=5, - epochs=5, - callbacks=[counter]) - - self._check_counts( - counter, { - 'on_batch_begin': 25, - 'on_batch_end': 25, - 'on_epoch_begin': 5, - 'on_epoch_end': 5, - 'on_predict_batch_begin': 0, - 'on_predict_batch_end': 0, - 'on_predict_begin': 0, - 'on_predict_end': 0, - 'on_test_batch_begin': 10, - 'on_test_batch_end': 10, - 'on_test_begin': 5, - 'on_test_end': 5, - 'on_train_batch_begin': 25, - 'on_train_batch_end': 25, - 'on_train_begin': 1, - 'on_train_end': 1 - }) - - @parameterized.named_parameters(('with_numpy', _get_numpy()), - ('with_sequence', _get_sequence())) - def test_callback_hooks_are_called_in_evaluate(self, data): - x, y = data - is_sequence = isinstance(x, keras.utils.data_utils.Sequence) - - model = self._get_model() - counter = Counter() - model.evaluate( - x, - y, - batch_size=2 if not is_sequence else None, - steps=5 if is_sequence else None, - callbacks=[counter]) - self._check_counts( - counter, { - 'on_test_batch_begin': 5, - 'on_test_batch_end': 5, - 'on_test_begin': 1, - 'on_test_end': 1 - }) - - @parameterized.named_parameters(('with_numpy', _get_numpy()), - ('with_sequence', _get_sequence())) - def test_callback_hooks_are_called_in_predict(self, data): - x = data[0] - is_sequence = isinstance(x, keras.utils.data_utils.Sequence) - - model = self._get_model() - counter = Counter() - model.predict( - x, - batch_size=2 if not is_sequence else None, - steps=5 if is_sequence else None, - callbacks=[counter]) - self._check_counts( - counter, { - 'on_predict_batch_begin': 5, - 'on_predict_batch_end': 5, - 'on_predict_begin': 1, - 'on_predict_end': 1 - }) - - def test_callback_list_methods(self): - counter = Counter() - callback_list = keras.callbacks.CallbackList([counter]) - - batch = 0 - callback_list.on_test_batch_begin(batch) - callback_list.on_test_batch_end(batch) - callback_list.on_predict_batch_begin(batch) - callback_list.on_predict_batch_end(batch) - - self._check_counts( - counter, { - 'on_test_batch_begin': 1, - 'on_test_batch_end': 1, - 'on_predict_batch_begin': 1, - 'on_predict_batch_end': 1 - }) + @parameterized.named_parameters( + ("with_numpy", _get_numpy()), ("with_sequence", _get_sequence()) + ) + def test_callback_hooks_are_called_in_fit(self, data): + if not tf.executing_eagerly(): + self.skipTest("Behavior changed in v2.") + x, y = data + val_x, val_y = np.ones((4, 10)), np.ones((4, 1)) + + model = self._get_model() + counter = Counter() + model.fit( + x, + y, + validation_data=(val_x, val_y), + batch_size=2, + steps_per_epoch=5, + epochs=5, + callbacks=[counter], + ) + + self._check_counts( + counter, + { + "on_batch_begin": 25, + "on_batch_end": 25, + "on_epoch_begin": 5, + "on_epoch_end": 5, + "on_predict_batch_begin": 0, + "on_predict_batch_end": 0, + "on_predict_begin": 0, + "on_predict_end": 0, + "on_test_batch_begin": 10, + "on_test_batch_end": 10, + "on_test_begin": 5, + "on_test_end": 5, + "on_train_batch_begin": 25, + "on_train_batch_end": 25, + "on_train_begin": 1, + "on_train_end": 1, + }, + ) + + @parameterized.named_parameters( + ("with_numpy", _get_numpy()), ("with_sequence", _get_sequence()) + ) + def test_callback_hooks_are_called_in_evaluate(self, data): + x, y = data + is_sequence = isinstance(x, keras.utils.data_utils.Sequence) + + model = self._get_model() + counter = Counter() + model.evaluate( + x, + y, + batch_size=2 if not is_sequence else None, + steps=5 if is_sequence else None, + callbacks=[counter], + ) + 
self._check_counts( + counter, + { + "on_test_batch_begin": 5, + "on_test_batch_end": 5, + "on_test_begin": 1, + "on_test_end": 1, + }, + ) + + @parameterized.named_parameters( + ("with_numpy", _get_numpy()), ("with_sequence", _get_sequence()) + ) + def test_callback_hooks_are_called_in_predict(self, data): + x = data[0] + is_sequence = isinstance(x, keras.utils.data_utils.Sequence) + + model = self._get_model() + counter = Counter() + model.predict( + x, + batch_size=2 if not is_sequence else None, + steps=5 if is_sequence else None, + callbacks=[counter], + ) + self._check_counts( + counter, + { + "on_predict_batch_begin": 5, + "on_predict_batch_end": 5, + "on_predict_begin": 1, + "on_predict_end": 1, + }, + ) + + def test_callback_list_methods(self): + counter = Counter() + callback_list = keras.callbacks.CallbackList([counter]) + + batch = 0 + callback_list.on_test_batch_begin(batch) + callback_list.on_test_batch_end(batch) + callback_list.on_predict_batch_begin(batch) + callback_list.on_predict_batch_end(batch) + + self._check_counts( + counter, + { + "on_test_batch_begin": 1, + "on_test_batch_end": 1, + "on_predict_batch_begin": 1, + "on_predict_batch_end": 1, + }, + ) class KerasCallbacksTest(test_combinations.TestCase): + def _get_model(self, input_shape=None, additional_metrics=None): + additional_metrics = additional_metrics or [] + layers = [ + keras.layers.Dense(3, activation="relu"), + keras.layers.Dense(2, activation="softmax"), + ] + model = test_utils.get_model_from_layers( + layers, input_shape=input_shape + ) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=[keras.metrics.CategoricalAccuracy(name="my_acc")] + + additional_metrics, + run_eagerly=test_utils.should_run_eagerly(), + ) + return model - def _get_model(self, input_shape=None, additional_metrics=None): - additional_metrics = additional_metrics or [] - layers = [ - keras.layers.Dense(3, activation='relu'), - keras.layers.Dense(2, activation='softmax') - ] - model = test_utils.get_model_from_layers(layers, input_shape=input_shape) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=[keras.metrics.CategoricalAccuracy(name='my_acc')] + - additional_metrics, - run_eagerly=test_utils.should_run_eagerly()) - return model - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_progbar_logging(self): - model = self._get_model(input_shape=(3,)) - - x = tf.ones((200, 3)) - y = tf.zeros((200, 2)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10) - expected_log = r'(.*- loss:.*- my_acc:.*)+' - - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - model.fit(dataset, epochs=2, steps_per_epoch=10) - self.assertRegex(printed.contents(), expected_log) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_progbar_logging_with_stateful_metrics(self): - - class AddAllOnes(keras.metrics.Metric): - """A simple metric that adds all the one's in `y_true`.""" - - def __init__(self, name='add_all_ones', **kwargs): - super().__init__(name=name, **kwargs) - self.total = self.add_weight(name='total', initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - self.total.assign_add( - tf.cast(tf.reduce_sum(y_true), dtype=tf.float32)) - - def result(self): - return self.total - - x_train = np.array([[0, 1, 0, 1, 0, 1, 0, 1]] * 8).astype(float) - y_train = np.array([[1, 0], [0, 0], [1, 1], [1, 0], [0, 1], [1, 0], [1, 0], - [0, 0]]) - # There 
are 7 ones in total in `y_train` after two batches. - expected_log = r'(.*- loss:.*- my_acc:.*- add_all_ones: 7.0000)+' - - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - model = self._get_model( - input_shape=(8,), additional_metrics=[AddAllOnes()]) - model.fit(x_train, y_train, verbose=1, batch_size=4, shuffle=False) - self.assertRegex(printed.contents(), expected_log) - - # When not executing eagerly, `model.evaluate` does not have the metrics - # results printed. - if tf.executing_eagerly(): - with self.captureWritesToStream(sys.stdout) as printed: - model = self._get_model( - input_shape=(8,), additional_metrics=[AddAllOnes()]) - model.evaluate(x_train, y_train, verbose=1, batch_size=4) - self.assertRegex(printed.contents(), expected_log) - - @test_combinations.run_all_keras_modes - def test_trivial_backup_restore(self): - if test_utils.should_run_eagerly(): - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - cbk = BackupAndRestore(self.get_temp_dir()) - model.fit(np.ones((10, 1)), np.ones((10, 1)), epochs=0, callbacks=[cbk]) - - def test_backup_restore_train_counter(self): - if not tf.compat.v1.executing_eagerly(): - self.skipTest('BackupAndRestore only available when execution is enabled') - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - cbk = BackupAndRestore(self.get_temp_dir()) - - class InterruptingCallback(keras.callbacks.Callback): - """A callback to intentionally introduce interruption to training.""" - - def on_epoch_end(self, epoch, log=None): - logging.info(f'counter: {model._train_counter}') - if epoch == 5 or epoch == 12: - raise RuntimeError('Interruption') - - log_dir = self.get_temp_dir() - - # The following asserts that the train counter is fault tolerant. - self.assertEqual(model._train_counter.numpy(), 0) - try: - model.fit(np.ones((10, 1)), np.ones((10, 1)), epochs=20, - callbacks=[cbk, InterruptingCallback()]) - except RuntimeError: - pass - self.assertEqual(model._train_counter.numpy(), 6) - try: - model.fit(np.ones((10, 1)), np.ones((10, 1)), epochs=20, - callbacks=[cbk, InterruptingCallback()]) - except RuntimeError: - pass - self.assertEqual(model._train_counter.numpy(), 13) - - def _test_backup_and_restore_callback_with(self, cls): - if not tf.compat.v1.executing_eagerly(): - self.skipTest('BackupAndRestore only available when execution is enabled') - - class InterruptingCallback(keras.callbacks.Callback): - """A callback to intentionally introduce interruption to training.""" - - def on_epoch_end(self, epoch, log=None): - if epoch == 15: - raise RuntimeError('Interruption') - - model = keras.Sequential([keras.layers.Dense(10)]) - optimizer = gradient_descent.SGD() - model.compile(optimizer, loss='mse') - - x = tf.random.uniform((24, 10)) - y = tf.random.uniform((24,)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(2) - - backup_callback = cls(backup_dir=self.get_temp_dir()) - try: - model.fit( - dataset, - epochs=20, - steps_per_epoch=5, - callbacks=[backup_callback, InterruptingCallback()]) - except RuntimeError: - logging.warning('***Handling interruption***') - # This continues at the epoch where it left off. 
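[Editor's note] The resume call just below is the crux of the pattern this helper exercises: a callback deliberately raises mid-training to simulate a preemption, the exception is swallowed, and a second `fit` with the same `backup_dir` continues from the checkpoint. A condensed standalone sketch of the same flow (layer sizes, paths, and the failing epoch are illustrative, not taken from the diff):

```python
import numpy as np
from tensorflow import keras


class Interrupter(keras.callbacks.Callback):
    """Simulates a crash by raising at the end of a chosen epoch."""

    def on_epoch_end(self, epoch, logs=None):
        if epoch == 3:
            raise RuntimeError("Interruption")


model = keras.Sequential([keras.layers.Dense(1)])
model.compile("sgd", loss="mse")
backup = keras.callbacks.BackupAndRestore(backup_dir="/tmp/backup")

x, y = np.ones((10, 1)), np.ones((10, 1))
try:
    model.fit(x, y, epochs=10, callbacks=[backup, Interrupter()])
except RuntimeError:
    pass  # the checkpoint BackupAndRestore wrote is still on disk

# Retrying with the same backup_dir resumes at epoch 4 instead of epoch 0.
model.fit(x, y, epochs=10, callbacks=[backup])
```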
- model.fit( - dataset, epochs=20, steps_per_epoch=5, callbacks=[backup_callback]) - - def test_experimental_backup_and_restore(self): - """Ensure the legacy endpoint of `BackupAndRestore` gives warning.""" - - warning_messages = [] - - def warning(msg): - warning_messages.append(msg) - - with tf.compat.v1.test.mock.patch.object(logging, 'warning', warning): - self._test_backup_and_restore_callback_with(BackupAndRestoreExperimental) - - warning_msg = ('`tf.keras.callbacks.experimental.BackupAndRestore` ' - 'endpoint is deprecated') - self.assertIn(warning_msg, '\n'.join(warning_messages)) - warning_msg = ('***Handling interruption***') - self.assertIn(warning_msg, '\n'.join(warning_messages)) - - def test_backup_and_restore(self): - """Ensure the public endpoint of `BackupAndRestore` is working.""" - - warning_messages = [] - - def warning(msg): - warning_messages.append(msg) - - with tf.compat.v1.test.mock.patch.object(logging, 'warning', warning): - self._test_backup_and_restore_callback_with(BackupAndRestore) - - warning_msg = ('`tf.keras.callbacks.experimental.BackupAndRestore` ' - 'endpoint is deprecated') - self.assertNotIn(warning_msg, '\n'.join(warning_messages)) - warning_msg = ('***Handling interruption***') - self.assertIn(warning_msg, '\n'.join(warning_messages)) - - @test_combinations.run_all_keras_modes - def test_callback_warning(self): - - class SleepCallback(keras.callbacks.Callback): - - def on_train_batch_end(self, batch, logs=None): - time.sleep(0.1) - - model = sequential.Sequential() - model.add(keras.layers.Dense(1)) - model.compile( - 'sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - warning_messages = [] - - def warning(msg): - warning_messages.append(msg) - - with tf.compat.v1.test.mock.patch.object(logging, 'warning', warning): - model.fit( - np.ones((16, 1), 'float32'), - np.ones((16, 1), 'float32'), - batch_size=3, - epochs=1, - callbacks=[SleepCallback()]) - warning_msg = ('Callback method `on_train_batch_end` is slow compared ' - 'to the batch time') - self.assertIn(warning_msg, '\n'.join(warning_messages)) - - @test_combinations.run_all_keras_modes - def test_default_callbacks_no_warning(self): - # Test that without the callback no warning is raised - model = sequential.Sequential() - model.add(keras.layers.Dense(1)) - model.compile( - 'sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - warning_messages = [] - - def warning(msg): - warning_messages.append(msg) - - with tf.compat.v1.test.mock.patch.object(logging, 'warning', warning): - model.fit( - np.ones((16, 1), 'float32'), - np.ones((16, 1), 'float32'), - batch_size=3, - epochs=1) - self.assertListEqual(warning_messages, []) - - @test_combinations.run_with_all_model_types(exclude_models='functional') - @test_combinations.run_all_keras_modes - def test_progbar_logging_deferred_model_build(self): - model = self._get_model() - self.assertFalse(model.built) - - x = tf.ones((200, 3)) - y = tf.zeros((200, 2)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10) - expected_log = r'(.*- loss:.*- my_acc:.*)+' - - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - model.fit(dataset, epochs=2, steps_per_epoch=10) - self.assertRegex(printed.contents(), expected_log) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_progbar_logging_validation_data(self): - model = self._get_model(input_shape=(3,)) - - x = tf.ones((50, 3)) - y = tf.zeros((50, 2)) - training_dataset = 
tf.data.Dataset.from_tensor_slices((x, y)).batch(10) - val_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10) - expected_log = r'(.*5/5.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*)+' - - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - model.fit(training_dataset, epochs=2, validation_data=val_dataset) - self.assertRegex(printed.contents(), expected_log) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_progbar_logging_validation_split(self): - model = self._get_model(input_shape=(3,)) - - x = np.ones((100, 3)) - y = np.zeros((100, 2)) - expected_log = ( - r'(?s).*1/2.*8/8.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:' - r'.*2/2.*8/8.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*') - - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - model.fit(x, y, batch_size=10, epochs=2, validation_split=0.2) - self.assertRegex(printed.contents(), expected_log) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_progbar_logging_training_validation(self): - model = self._get_model(input_shape=(2,)) - - def generator(): - for _ in range(100): - yield [1, 1], 1 - - training = tf.data.Dataset \ - .from_generator( + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_progbar_logging(self): + model = self._get_model(input_shape=(3,)) + + x = tf.ones((200, 3)) + y = tf.zeros((200, 2)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10) + expected_log = r"(.*- loss:.*- my_acc:.*)+" + + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model.fit(dataset, epochs=2, steps_per_epoch=10) + self.assertRegex(printed.contents(), expected_log) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_progbar_logging_with_stateful_metrics(self): + class AddAllOnes(keras.metrics.Metric): + """A simple metric that adds all the one's in `y_true`.""" + + def __init__(self, name="add_all_ones", **kwargs): + super().__init__(name=name, **kwargs) + self.total = self.add_weight(name="total", initializer="zeros") + + def update_state(self, y_true, y_pred, sample_weight=None): + self.total.assign_add( + tf.cast(tf.reduce_sum(y_true), dtype=tf.float32) + ) + + def result(self): + return self.total + + x_train = np.array([[0, 1, 0, 1, 0, 1, 0, 1]] * 8).astype(float) + y_train = np.array( + [[1, 0], [0, 0], [1, 1], [1, 0], [0, 1], [1, 0], [1, 0], [0, 0]] + ) + # There are 7 ones in total in `y_train` after two batches. + expected_log = r"(.*- loss:.*- my_acc:.*- add_all_ones: 7.0000)+" + + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model = self._get_model( + input_shape=(8,), additional_metrics=[AddAllOnes()] + ) + model.fit(x_train, y_train, verbose=1, batch_size=4, shuffle=False) + self.assertRegex(printed.contents(), expected_log) + + # When not executing eagerly, `model.evaluate` does not have the metrics + # results printed. 
+ if tf.executing_eagerly(): + with self.captureWritesToStream(sys.stdout) as printed: + model = self._get_model( + input_shape=(8,), additional_metrics=[AddAllOnes()] + ) + model.evaluate(x_train, y_train, verbose=1, batch_size=4) + self.assertRegex(printed.contents(), expected_log) + + @test_combinations.run_all_keras_modes + def test_trivial_backup_restore(self): + if test_utils.should_run_eagerly(): + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + cbk = BackupAndRestore(self.get_temp_dir()) + model.fit( + np.ones((10, 1)), np.ones((10, 1)), epochs=1, callbacks=[cbk] + ) + + def test_backup_restore_train_counter(self): + if not tf.compat.v1.executing_eagerly(): + self.skipTest( + "BackupAndRestore only available when eager execution is " + "enabled" + ) + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + cbk = BackupAndRestore(self.get_temp_dir()) + + class InterruptingCallback(keras.callbacks.Callback): + """A callback to intentionally introduce interruption to + training.""" + + def on_epoch_end(self, epoch, log=None): + logging.info(f"counter: {model._train_counter}") + if epoch == 5 or epoch == 12: + raise RuntimeError("Interruption") + + self.get_temp_dir() + + # The following asserts that the train counter is fault tolerant. + self.assertEqual(model._train_counter.numpy(), 0) + try: + model.fit( + np.ones((10, 1)), + np.ones((10, 1)), + epochs=20, + callbacks=[cbk, InterruptingCallback()], + ) + except RuntimeError: + pass + self.assertEqual(model._train_counter.numpy(), 6) + try: + model.fit( + np.ones((10, 1)), + np.ones((10, 1)), + epochs=20, + callbacks=[cbk, InterruptingCallback()], + ) + except RuntimeError: + pass + self.assertEqual(model._train_counter.numpy(), 13) + + def _test_backup_and_restore_callback_with(self, cls): + if not tf.compat.v1.executing_eagerly(): + self.skipTest( + "BackupAndRestore only available when execution is enabled" + ) + + class InterruptingCallback(keras.callbacks.Callback): + """A callback to intentionally introduce interruption to + training.""" + + def on_epoch_end(self, epoch, log=None): + if epoch == 15: + raise RuntimeError("Interruption") + + model = keras.Sequential([keras.layers.Dense(10)]) + optimizer = sgd.SGD() + model.compile(optimizer, loss="mse") + + x = tf.random.uniform((24, 10)) + y = tf.random.uniform((24,)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(2) + + backup_callback = cls(backup_dir=self.get_temp_dir()) + try: + model.fit( + dataset, + epochs=20, + steps_per_epoch=5, + callbacks=[backup_callback, InterruptingCallback()], + ) + except RuntimeError: + logging.warning("***Handling interruption***") + # This continues at the epoch where it left off. 
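[Editor's note] One detail of the `_train_counter` assertions in `test_backup_restore_train_counter` above is easy to misread: `on_epoch_end` receives a zero-based epoch index, so raising when `epoch == 5` means six epochs (0 through 5) ran to completion, and after the restored run is interrupted again at `epoch == 12`, thirteen epochs have completed in total. In short:

```python
# zero-based index of the epoch that raises -> epochs actually completed
for failing_epoch in (5, 12):
    print(failing_epoch, "->", failing_epoch + 1)  # 5 -> 6, 12 -> 13
```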
+ model.fit( + dataset, + epochs=20, + steps_per_epoch=5, + callbacks=[backup_callback], + ) + + def _test_backup_and_restore_callback_at_steps( + self, cls, epoch_int, steps_int, mode + ): + if not tf.compat.v1.executing_eagerly(): + self.skipTest( + "BackupAndRestore only available when eager execution is " + "enabled" + ) + + class InterruptingCallback(keras.callbacks.Callback): + """A callback to intentionally introduce interruption to + training.""" + + batch_count = 0 + + def on_epoch_end(self, epoch, log=None): + if epoch == epoch_int: + raise RuntimeError("EpochInterruption") + + def on_batch_end(self, batch, logs=None): + self.batch_count += 1 + if self.batch_count == steps_int: + raise RuntimeError("StepsInterruption") + + class VerifyRestore(Callback): + """Verify if the training restored to the correct epoch and step.""" + + def __init__(self, initial_epoch, initial_step): + super(VerifyRestore, self).__init__() + self.initial_epoch = initial_epoch + self.initial_step = initial_step + self._current_epoch = 0 + + def on_epoch_begin(self, epoch, logs=None): + self._current_epoch = epoch + if epoch < self.initial_epoch: + raise ValueError( + "Training did not restore at epoch (%d) and step (%d)" + % (self.initial_epoch, self.initial_step) + ) + + def on_batch_begin(self, batch, logs=None): + if ( + batch <= self.initial_step + and self._current_epoch < self.initial_epoch + ): + raise ValueError( + "Training did not restore at Epoch (%d) and step (%d)" + % (self.initial_epoch, self.initial_step) + ) + + model = keras.Sequential([keras.layers.Dense(10)]) + optimizer = sgd.SGD() + model.compile(optimizer, loss="mse") + + x = tf.random.uniform((24, 10)) + y = tf.random.uniform((24,)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(2) + save_freq_arg = "epoch" if mode == "epoch" else 7 + backup_callback = cls( + backup_dir=self.get_temp_dir(), save_freq=save_freq_arg + ) + # epoch where the restore should resume from + if save_freq_arg == "epoch": + init_epoch = epoch_int + init_step = 0 + elif save_freq_arg: + init_epoch = int(((steps_int // 7) * 7) // 5) + init_step = int((((steps_int // 7) * 7) % 5) - 1) + else: + init_epoch = 0 + init_step = 0 + + # callback to verify accurate training state restore + verify_restore_callback = VerifyRestore( + initial_epoch=init_epoch, initial_step=init_step + ) + try: + model.fit( + dataset, + epochs=20, + steps_per_epoch=5, + callbacks=[backup_callback, InterruptingCallback()], + ) + except RuntimeError as e: + if str(e) == "EpochInterruption": + logging.warning("***Handling interruption at epoch***") + elif str(e) == "StepsInterruption": + logging.warning("***Handling interruption at Nth step***") + # This continues at the epoch and step where it left off. 
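[Editor's note] The expected-restore arithmetic in the helper above is terse, so a worked example helps before the resume call below. With `save_freq=7` batches and 5 steps per epoch, the last checkpoint sits at the largest multiple of 7 not exceeding the interrupting step, and that global step is folded back into an (epoch, step) pair exactly as the helper's formulas do:

```python
steps_per_epoch = 5
save_freq = 7

for steps_int in (3, 8, 35):
    last_saved = (steps_int // save_freq) * save_freq
    init_epoch = last_saved // steps_per_epoch
    init_step = (last_saved % steps_per_epoch) - 1
    print(steps_int, "->", (init_epoch, init_step))

# 3  -> (0, -1): no checkpoint yet, training restarts from scratch
# 8  -> (1, 1):  resume partway through epoch 1
# 35 -> (7, -1): the save landed exactly on an epoch boundary
```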
+ model.fit( + dataset, + epochs=20, + steps_per_epoch=5, + callbacks=[backup_callback, verify_restore_callback], + ) + + def test_experimental_backup_and_restore(self): + """Ensure the legacy endpoint of `BackupAndRestore` gives warning.""" + + warning_messages = [] + + def warning(msg): + warning_messages.append(msg) + + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + self._test_backup_and_restore_callback_with( + BackupAndRestoreExperimental + ) + + warning_msg = ( + "`tf.keras.callbacks.experimental.BackupAndRestore` " + "endpoint is deprecated" + ) + self.assertIn(warning_msg, "\n".join(warning_messages)) + warning_msg = "***Handling interruption***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + def test_backup_and_restore(self): + """Ensure the public endpoint of `BackupAndRestore` is working.""" + + warning_messages = [] + + def warning(msg): + warning_messages.append(msg) + + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + self._test_backup_and_restore_callback_with(BackupAndRestore) + + warning_msg = ( + "`tf.keras.callbacks.experimental.BackupAndRestore` " + "endpoint is deprecated" + ) + self.assertNotIn(warning_msg, "\n".join(warning_messages)) + warning_msg = "***Handling interruption***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + def test_backup_and_restore_steps(self): + """Ensure the public endpoint of `BackupAndRestore` is working.""" + + warning_messages = [] + + def warning(msg): + warning_messages.append(msg) + + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + # interrupt at steps before 1 epoch + self._test_backup_and_restore_callback_at_steps( + BackupAndRestore, epoch_int=20, steps_int=3, mode="batch" + ) + warning_msg = ( + "`tf.keras.callbacks.experimental.BackupAndRestore` " + "endpoint is deprecated" + ) + self.assertNotIn(warning_msg, "\n".join(warning_messages)) + warning_msg = "***Handling interruption at Nth step***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + # interrupt at steps after 1 epoch + warning_messages = [] + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + self._test_backup_and_restore_callback_at_steps( + BackupAndRestore, epoch_int=20, steps_int=8, mode="batch" + ) + warning_msg = "***Handling interruption at Nth step***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + # interrupt at epoch before steps + warning_messages = [] + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + self._test_backup_and_restore_callback_at_steps( + BackupAndRestore, epoch_int=1, steps_int=12, mode="epoch" + ) + warning_msg = "***Handling interruption at epoch***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + def test_backup_and_restore_steps_last_batch(self): + """Ensure the public endpoint of `BackupAndRestore` is working.""" + + warning_messages = [] + + def warning(msg): + warning_messages.append(msg) + + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + # interrupt at last step in 7th epoch + self._test_backup_and_restore_callback_at_steps( + BackupAndRestore, epoch_int=20, steps_int=35, mode="batch" + ) + warning_msg = ( + "`tf.keras.callbacks.experimental.BackupAndRestore` " + "endpoint is deprecated" + ) + self.assertNotIn(warning_msg, "\n".join(warning_messages)) + warning_msg = "***Handling interruption at Nth step***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + def 
test_backup_and_restore_steps_false_save_freq(self): + """Ensure the public endpoint of `BackupAndRestore` is working.""" + warning_messages = [] + + def warning(msg): + warning_messages.append(msg) + + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + # interrupt at steps before 1 epoch + self._test_backup_and_restore_callback_at_steps( + BackupAndRestore, epoch_int=20, steps_int=3, mode=False + ) + warning_msg = ( + "`tf.keras.callbacks.experimental.BackupAndRestore` " + "endpoint is deprecated" + ) + self.assertNotIn(warning_msg, "\n".join(warning_messages)) + warning_msg = "***Handling interruption at Nth step***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + # interrupt at steps after 1 epoch + warning_messages = [] + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + self._test_backup_and_restore_callback_at_steps( + BackupAndRestore, epoch_int=20, steps_int=8, mode="batch" + ) + warning_msg = "***Handling interruption at Nth step***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + # interrupt at epoch before steps + warning_messages = [] + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + self._test_backup_and_restore_callback_at_steps( + BackupAndRestore, epoch_int=1, steps_int=12, mode="epoch" + ) + warning_msg = "***Handling interruption at epoch***" + self.assertIn(warning_msg, "\n".join(warning_messages)) + + def test_backup_and_restore_steps_clean_up(self): + if not tf.executing_eagerly(): + self.skipTest( + "BackupAndRestore only available when eager execution is " + "enabled." + ) + path = self.get_temp_dir() + callback = BackupAndRestore(path, delete_checkpoint=True) + model = keras.Sequential([keras.layers.Dense(10)]) + optimizer = gradient_descent.SGD() + model.compile(optimizer, loss="mse") + + x = tf.random.uniform((24, 10)) + y = tf.random.uniform((24,)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) + model.fit(dataset, epochs=1, callbacks=[callback]) + self.assertEmpty(os.listdir(path)) + + callback = BackupAndRestore(path, delete_checkpoint=False) + model.fit(dataset, epochs=1, callbacks=[callback]) + self.assertNotEmpty(os.listdir(path)) + + @test_combinations.run_all_keras_modes + def test_callback_warning(self): + class SleepCallback(keras.callbacks.Callback): + def on_train_batch_end(self, batch, logs=None): + time.sleep(0.1) + + model = sequential.Sequential() + model.add(keras.layers.Dense(1)) + model.compile( + "sgd", loss="mse", run_eagerly=test_utils.should_run_eagerly() + ) + + warning_messages = [] + + def warning(msg): + warning_messages.append(msg) + + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + model.fit( + np.ones((16, 1), "float32"), + np.ones((16, 1), "float32"), + batch_size=3, + epochs=1, + callbacks=[SleepCallback()], + ) + warning_msg = ( + "Callback method `on_train_batch_end` is slow compared " + "to the batch time" + ) + self.assertIn(warning_msg, "\n".join(warning_messages)) + + @test_combinations.run_all_keras_modes + def test_default_callbacks_no_warning(self): + # Test that without the callback no warning is raised + model = sequential.Sequential() + model.add(keras.layers.Dense(1)) + model.compile( + "sgd", loss="mse", run_eagerly=test_utils.should_run_eagerly() + ) + + warning_messages = [] + + def warning(msg): + warning_messages.append(msg) + + with tf.compat.v1.test.mock.patch.object(logging, "warning", warning): + model.fit( + np.ones((16, 1), "float32"), + np.ones((16, 1), "float32"), + 
batch_size=3, + epochs=1, + ) + self.assertListEqual(warning_messages, []) + + @test_combinations.run_with_all_model_types(exclude_models="functional") + @test_combinations.run_all_keras_modes + def test_progbar_logging_deferred_model_build(self): + model = self._get_model() + self.assertFalse(model.built) + + x = tf.ones((200, 3)) + y = tf.zeros((200, 2)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10) + expected_log = r"(.*- loss:.*- my_acc:.*)+" + + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model.fit(dataset, epochs=2, steps_per_epoch=10) + self.assertRegex(printed.contents(), expected_log) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_progbar_logging_validation_data(self): + model = self._get_model(input_shape=(3,)) + + x = tf.ones((50, 3)) + y = tf.zeros((50, 2)) + training_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10) + val_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10) + expected_log = ( + r"(.*5/5.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*)+" + ) + + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model.fit(training_dataset, epochs=2, validation_data=val_dataset) + self.assertRegex(printed.contents(), expected_log) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_progbar_logging_validation_split(self): + model = self._get_model(input_shape=(3,)) + + x = np.ones((100, 3)) + y = np.zeros((100, 2)) + expected_log = ( + r"(?s).*1/2.*8/8.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:" + r".*2/2.*8/8.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*" + ) + + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model.fit(x, y, batch_size=10, epochs=2, validation_split=0.2) + self.assertRegex(printed.contents(), expected_log) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_progbar_logging_training_validation(self): + model = self._get_model(input_shape=(2,)) + + def generator(): + for _ in range(100): + yield [1, 1], 1 + + training = ( + tf.data.Dataset.from_generator( + generator=generator, + output_types=("float64", "float64"), + output_shapes=([2], []), + ) + .batch(2) + .repeat() + ) + validation = tf.data.Dataset.from_generator( generator=generator, - output_types=('float64', 'float64'), - output_shapes=([2], [])) \ - .batch(2) \ - .repeat() - validation = tf.data.Dataset \ - .from_generator( + output_types=("float64", "float64"), + output_shapes=([2], []), + ).batch(2) + expected_log = ( + r"(?s).*1/2.*20/20.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:" + r".*2/2.*20/20.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*" + ) + + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model.fit( + x=training, + validation_data=validation, + epochs=2, + steps_per_epoch=20, + ) + self.assertRegex(printed.contents(), expected_log) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_progbar_logging_with_dataset_and_partial_batch(self): + model = self._get_model(input_shape=(2,)) + + def generator(): + # Have a partial batch at the end. 
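[Editor's note] These generator-backed pipelines use `output_types`/`output_shapes`, which still work but have since been superseded by `output_signature`. For reference, a sketch of the equivalent modern spelling of the partial-batch dataset built just below (not part of the diff):

```python
import numpy as np
import tensorflow as tf


def generator():
    for _ in range(9):  # odd count on purpose: the final batch is partial
        yield np.random.random(2), 1


dataset = tf.data.Dataset.from_generator(
    generator,
    output_signature=(
        tf.TensorSpec(shape=(2,), dtype=tf.float64),
        tf.TensorSpec(shape=(), dtype=tf.float64),
    ),
).batch(2)  # four batches of 2, then one trailing batch of 1
```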
+ for _ in range(9): + yield np.random.random(2), 1 + + training = tf.data.Dataset.from_generator( generator=generator, - output_types=('float64', 'float64'), - output_shapes=([2], [])) \ - .batch(2) - expected_log = ( - r'(?s).*1/2.*20/20.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:' - r'.*2/2.*20/20.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*') - - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - model.fit( - x=training, validation_data=validation, epochs=2, steps_per_epoch=20) - self.assertRegex(printed.contents(), expected_log) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_progbar_logging_with_dataset_and_partial_batch(self): - model = self._get_model(input_shape=(2,)) - - def generator(): - # Have a partial batch at the end. - for _ in range(9): - yield np.random.random(2), 1 - - training = tf.data.Dataset \ - .from_generator( - generator=generator, - output_types=('float64', 'float64'), - output_shapes=([2], [])) \ - .batch(2) - validation = tf.data.Dataset \ - .from_generator( - generator=generator, - output_types=('float64', 'float64'), - output_shapes=([2], [])) \ - .batch(2) - - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - model.fit(x=training, validation_data=validation) - - # Make sure the value of val_ metrics are not zeros. - log_content = printed.contents() - val_loss = re.findall(r'val_loss: (\d\.\d+)', log_content) - self.assertLen(val_loss, 1) - self.assertGreater(float(val_loss[0]), 0.0) - - @test_combinations.run_with_all_model_types - def test_ModelCheckpoint(self): - if h5py is None: - return # Skip test if models cannot be saved. - - model_type = test_utils.get_model_type() - if model_type == 'subclass': - return # Skip test since subclassed models cannot be saved in .h5 format. 
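[Editor's note] The log-scraping assertion just above deserves a note: the progress bar is captured as plain text and the validation loss is pulled out with a regex, which is what lets the test prove the partial final batch still updated the `val_` metrics. A self-contained illustration (the log line is fabricated for the example):

```python
import re

log = "5/5 - 0s - loss: 0.6931 - my_acc: 0.5000 - val_loss: 0.6930 - val_my_acc: 0.5000"
val_loss = re.findall(r"val_loss: (\d\.\d+)", log)
assert len(val_loss) == 1
assert float(val_loss[0]) > 0.0  # zero would mean the metric never updated
```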
- if not tf.__internal__.tf2.enabled(): - self.skipTest('Checkpoint callback only available in v2.') - - layers = [ - keras.layers.Dense(NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'), - keras.layers.Dense(NUM_CLASSES, activation='softmax') - ] - model = test_utils.get_model_from_layers(layers, input_shape=(3,)) - model.compile( - loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc']) - - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - - filepath = os.path.join(temp_dir, 'checkpoint.h5') - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - # Case 1 - monitor = 'val_loss' - save_best_only = False - mode = 'auto' - - cbks = [ - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert os.path.exists(filepath) - os.remove(filepath) - - # Case 2 - mode = 'min' - cbks = [ - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert os.path.exists(filepath) - os.remove(filepath) - - # Case 3 - mode = 'max' - monitor = 'val_acc' - cbks = [ - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert os.path.exists(filepath) - os.remove(filepath) - - # Case 4 - save_best_only = True - cbks = [ + output_types=("float64", "float64"), + output_shapes=([2], []), + ).batch(2) + validation = tf.data.Dataset.from_generator( + generator=generator, + output_types=("float64", "float64"), + output_shapes=([2], []), + ).batch(2) + + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model.fit(x=training, validation_data=validation) + + # Make sure the value of val_ metrics are not zeros. + log_content = printed.contents() + val_loss = re.findall(r"val_loss: (\d\.\d+)", log_content) + self.assertLen(val_loss, 1) + self.assertGreater(float(val_loss[0]), 0.0) + + @test_combinations.run_with_all_model_types + def test_ModelCheckpoint(self): + if h5py is None: + return # Skip test if models cannot be saved. + + model_type = test_utils.get_model_type() + if model_type == "subclass": + # Skip test since subclassed models cannot be saved in .h5 format. + return + if not tf.__internal__.tf2.enabled(): + self.skipTest("Checkpoint callback only available in v2.") + + layers = [ + keras.layers.Dense( + NUM_HIDDEN, input_dim=INPUT_DIM, activation="relu" + ), + keras.layers.Dense(NUM_CLASSES, activation="softmax"), + ] + model = test_utils.get_model_from_layers(layers, input_shape=(3,)) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + metrics=["acc"], + ) + + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + + # Save model to a subdir inside the temp_dir so we can test + # automatic directory creation. 
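[Editor's note] Cases 1 through 4 below permute `monitor`, `mode`, and `save_best_only`; for orientation, the minimal shape of the callback they construct looks like this (values illustrative):

```python
from tensorflow import keras

checkpoint = keras.callbacks.ModelCheckpoint(
    "subdir/checkpoint.h5",  # missing parent directories are created on save
    monitor="val_loss",      # quantity compared across epochs
    mode="min",              # "auto" infers min/max from the monitor's name
    save_best_only=False,    # False: write every epoch; True: only on improvement
)
# model.fit(..., validation_data=..., callbacks=[checkpoint])
```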
+ filepath = os.path.join(temp_dir, "subdir", "checkpoint.h5") + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + # Case 1 + monitor = "val_loss" + save_best_only = False + mode = "auto" + + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + mode=mode, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + assert os.path.exists(filepath) + os.remove(filepath) + + # Case 2 + mode = "min" + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + mode=mode, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + assert os.path.exists(filepath) + os.remove(filepath) + + # Case 3 + mode = "max" + monitor = "val_acc" + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + mode=mode, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + assert os.path.exists(filepath) + os.remove(filepath) + + # Case 4 + save_best_only = True + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + mode=mode, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + assert os.path.exists(filepath) + os.remove(filepath) + + # Case 5: metric not available. + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, monitor="unknown", save_best_only=True + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + # File won't be written. + assert not os.path.exists(filepath) + + # Case 6 + save_best_only = False + period = 2 + mode = "auto" + + filepath = os.path.join(temp_dir, "checkpoint.{epoch:02d}.h5") + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + mode=mode, + period=period, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=4, + verbose=1, + ) + assert os.path.exists(filepath.format(epoch=2)) + assert os.path.exists(filepath.format(epoch=4)) + os.remove(filepath.format(epoch=2)) + os.remove(filepath.format(epoch=4)) + assert not os.path.exists(filepath.format(epoch=1)) + assert not os.path.exists(filepath.format(epoch=3)) + + # Invalid use: this will raise a warning but not an Exception. keras.callbacks.ModelCheckpoint( filepath, monitor=monitor, save_best_only=save_best_only, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert os.path.exists(filepath) - os.remove(filepath) - - # Case 5: metric not available. 
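[Editor's note] For the schedule-driven cases further below (6 through 8), the arithmetic behind the asserted checkpoint epochs, worked out under the module's constants (10 training samples, `batch_size=2` in Cases 7 and 8):

```python
TRAIN_SAMPLES, batch_size = 10, 2
steps_per_epoch = TRAIN_SAMPLES // batch_size        # 5

# Case 6: period=2 over 4 epochs -> checkpoints at epochs 2 and 4.
# Case 7: save_freq="epoch" with period=5 over 10 epochs -> epochs 5 and 10.
print([e for e in range(1, 11) if e % 5 == 0])       # [5, 10]

# Case 8: an integer save_freq counts *batches* and overrides period;
# every 15th batch lands exactly at the end of epochs 3, 6 and 9.
print([s // steps_per_epoch for s in (15, 30, 45)])  # [3, 6, 9]
```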
- cbks = [ - keras.callbacks.ModelCheckpoint( - filepath, - monitor='unknown', - save_best_only=True) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - # File won't be written. - assert not os.path.exists(filepath) - - # Case 6 - save_best_only = False - period = 2 - mode = 'auto' - - filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5') - cbks = [ + mode="unknown", + ) + + # Case 7: `ModelCheckpoint` with a combination of `save_freq` and + # `period`. Though `period` is deprecated, we're testing it for + # backward-compatibility. + filepath = os.path.join(temp_dir, "checkpoint.epoch{epoch:02d}.h5") + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + mode=mode, + save_freq="epoch", + period=5, + ) + ] + assert not os.path.exists(filepath.format(epoch=0)) + assert not os.path.exists(filepath.format(epoch=5)) + model.fit( + x_train, + y_train, + batch_size=2, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=10, + verbose=1, + ) + assert not os.path.exists(filepath.format(epoch=1)) + assert not os.path.exists(filepath.format(epoch=2)) + assert not os.path.exists(filepath.format(epoch=3)) + assert not os.path.exists(filepath.format(epoch=4)) + assert os.path.exists(filepath.format(epoch=5)) + assert not os.path.exists(filepath.format(epoch=6)) + assert os.path.exists(filepath.format(epoch=10)) + os.remove(filepath.format(epoch=5)) + os.remove(filepath.format(epoch=10)) + + # Case 8: `ModelCheckpoint` with an integer `save_freq` + filepath = os.path.join(temp_dir, "checkpoint.epoch{epoch:02d}.h5") + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + mode=mode, + save_freq=15, + period=100, + ) # The period should be ignored (this test tests this). + ] + assert not os.path.exists(filepath.format(epoch=3)) + model.fit( + x_train, + y_train, + batch_size=2, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=10, + verbose=1, + ) + assert not os.path.exists(filepath.format(epoch=1)) + assert not os.path.exists(filepath.format(epoch=2)) + assert os.path.exists(filepath.format(epoch=3)) + assert not os.path.exists(filepath.format(epoch=4)) + assert not os.path.exists(filepath.format(epoch=5)) + assert os.path.exists(filepath.format(epoch=6)) + assert not os.path.exists(filepath.format(epoch=7)) + assert not os.path.exists(filepath.format(epoch=8)) + assert os.path.exists(filepath.format(epoch=9)) + os.remove(filepath.format(epoch=3)) + os.remove(filepath.format(epoch=6)) + os.remove(filepath.format(epoch=9)) + + # Case 9: `ModelCheckpoint` with valid and invalid save_freq argument. + with self.assertRaisesRegex(ValueError, "Unrecognized save_freq"): + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + mode=mode, + save_freq="invalid_save_freq", + ) + # The following should not raise ValueError. 
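[Editor's note] As Case 9 above spells out, `save_freq` accepts exactly two kinds of value, the string `"epoch"` or an integer number of batches; anything else is rejected when the callback is constructed. A quick standalone check (the bogus value is invented for illustration):

```python
from tensorflow import keras

for freq in ("epoch", 3):
    keras.callbacks.ModelCheckpoint("ckpt.h5", save_freq=freq)  # both accepted

try:
    keras.callbacks.ModelCheckpoint("ckpt.h5", save_freq="every_sunday")
except ValueError as err:
    print(err)  # message mentions "Unrecognized save_freq"
```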
keras.callbacks.ModelCheckpoint( filepath, monitor=monitor, save_best_only=save_best_only, mode=mode, - period=period) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=4, - verbose=1) - assert os.path.exists(filepath.format(epoch=2)) - assert os.path.exists(filepath.format(epoch=4)) - os.remove(filepath.format(epoch=2)) - os.remove(filepath.format(epoch=4)) - assert not os.path.exists(filepath.format(epoch=1)) - assert not os.path.exists(filepath.format(epoch=3)) - - # Invalid use: this will raise a warning but not an Exception. - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - mode='unknown') - - # Case 7: `ModelCheckpoint` with a combination of `save_freq` and `period`. - # Though `period` is deprecated, we're testing it for - # backward-compatibility. - filepath = os.path.join(temp_dir, 'checkpoint.epoch{epoch:02d}.h5') - cbks = [ - keras.callbacks.ModelCheckpoint( - filepath, monitor=monitor, mode=mode, save_freq='epoch', period=5) - ] - assert not os.path.exists(filepath.format(epoch=0)) - assert not os.path.exists(filepath.format(epoch=5)) - model.fit( - x_train, - y_train, - batch_size=2, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=10, - verbose=1) - assert not os.path.exists(filepath.format(epoch=1)) - assert not os.path.exists(filepath.format(epoch=2)) - assert not os.path.exists(filepath.format(epoch=3)) - assert not os.path.exists(filepath.format(epoch=4)) - assert os.path.exists(filepath.format(epoch=5)) - assert not os.path.exists(filepath.format(epoch=6)) - assert os.path.exists(filepath.format(epoch=10)) - os.remove(filepath.format(epoch=5)) - os.remove(filepath.format(epoch=10)) - - # Case 8: `ModelCheckpoint` with an integer `save_freq` - filepath = os.path.join(temp_dir, 'checkpoint.epoch{epoch:02d}.h5') - cbks = [ + save_freq="epoch", + ) keras.callbacks.ModelCheckpoint( filepath, monitor=monitor, save_best_only=save_best_only, mode=mode, - save_freq=15, - period=100) # The period should be ignored (this test tests this). - ] - assert not os.path.exists(filepath.format(epoch=3)) - model.fit( - x_train, - y_train, - batch_size=2, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=10, - verbose=1) - assert not os.path.exists(filepath.format(epoch=1)) - assert not os.path.exists(filepath.format(epoch=2)) - assert os.path.exists(filepath.format(epoch=3)) - assert not os.path.exists(filepath.format(epoch=4)) - assert not os.path.exists(filepath.format(epoch=5)) - assert os.path.exists(filepath.format(epoch=6)) - assert not os.path.exists(filepath.format(epoch=7)) - assert not os.path.exists(filepath.format(epoch=8)) - assert os.path.exists(filepath.format(epoch=9)) - os.remove(filepath.format(epoch=3)) - os.remove(filepath.format(epoch=6)) - os.remove(filepath.format(epoch=9)) - - # Case 9: `ModelCheckpoint` with valid and invalid save_freq argument. - with self.assertRaisesRegex(ValueError, 'Unrecognized save_freq'): - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - mode=mode, - save_freq='invalid_save_freq') - # The following should not raise ValueError. 
- keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - mode=mode, - save_freq='epoch') - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - mode=mode, - save_freq=3) - - # Case 10: `ModelCheckpoint` with valid and invalid `options` argument. - with self.assertRaisesRegex(TypeError, 'tf.train.CheckpointOptions'): - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - save_weights_only=True, - mode=mode, - options=tf.saved_model.SaveOptions()) - with self.assertRaisesRegex(TypeError, 'tf.saved_model.SaveOptions'): - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - save_weights_only=False, - mode=mode, - options=tf.train.CheckpointOptions()) - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - save_weights_only=True, - mode=mode, - options=tf.train.CheckpointOptions()) - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - save_weights_only=False, - mode=mode, - options=tf.saved_model.SaveOptions()) - - # Case 11: `ModelCheckpoint` save model with batch number in filename. - filepath = os.path.join(temp_dir, - 'checkpoint.epoch{epoch:02d}batch{batch:02d}.h5') - cbks = [ - keras.callbacks.ModelCheckpoint(filepath, monitor=monitor, save_freq=1) - ] - assert not os.path.exists(filepath.format(epoch=1, batch=1)) - assert not os.path.exists(filepath.format(epoch=1, batch=2)) - assert not os.path.exists(filepath.format(epoch=2, batch=1)) - assert not os.path.exists(filepath.format(epoch=2, batch=2)) - assert not os.path.exists(filepath.format(epoch=3, batch=1)) - assert not os.path.exists(filepath.format(epoch=3, batch=2)) - assert not os.path.exists(filepath.format(epoch=4, batch=1)) - assert not os.path.exists(filepath.format(epoch=4, batch=2)) - assert not os.path.exists(filepath.format(epoch=5, batch=1)) - assert not os.path.exists(filepath.format(epoch=5, batch=2)) - model.fit( - x_train, - y_train, - batch_size=5, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=5, - verbose=1) - - assert os.path.exists(filepath.format(epoch=1, batch=1)) - assert os.path.exists(filepath.format(epoch=1, batch=2)) - assert os.path.exists(filepath.format(epoch=2, batch=1)) - assert os.path.exists(filepath.format(epoch=2, batch=2)) - assert os.path.exists(filepath.format(epoch=3, batch=1)) - assert os.path.exists(filepath.format(epoch=3, batch=2)) - assert os.path.exists(filepath.format(epoch=4, batch=1)) - assert os.path.exists(filepath.format(epoch=4, batch=2)) - assert os.path.exists(filepath.format(epoch=5, batch=1)) - assert os.path.exists(filepath.format(epoch=5, batch=2)) - - os.remove(filepath.format(epoch=1, batch=1)) - os.remove(filepath.format(epoch=1, batch=2)) - os.remove(filepath.format(epoch=2, batch=1)) - os.remove(filepath.format(epoch=2, batch=2)) - os.remove(filepath.format(epoch=3, batch=1)) - os.remove(filepath.format(epoch=3, batch=2)) - os.remove(filepath.format(epoch=4, batch=1)) - os.remove(filepath.format(epoch=4, batch=2)) - os.remove(filepath.format(epoch=5, batch=1)) - os.remove(filepath.format(epoch=5, batch=2)) - - # Case 12: ModelCheckpoint saves model with initial_value_threshold param - mode = 'max' - monitor = 'val_acc' - initial_value_threshold = 0 - save_best_only = True - filepath = os.path.join(temp_dir, 'checkpoint.h5') - cbks = [ + save_freq=3, + ) + + # Case 10: `ModelCheckpoint` 
with valid and invalid `options` argument. + with self.assertRaisesRegex(TypeError, "tf.train.CheckpointOptions"): + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + save_weights_only=True, + mode=mode, + options=tf.saved_model.SaveOptions(), + ) + with self.assertRaisesRegex(TypeError, "tf.saved_model.SaveOptions"): + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + save_weights_only=False, + mode=mode, + options=tf.train.CheckpointOptions(), + ) keras.callbacks.ModelCheckpoint( filepath, monitor=monitor, save_best_only=save_best_only, - initial_value_threshold=initial_value_threshold, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert os.path.exists(filepath) - os.remove(filepath) - - # Case 13: ModelCheckpoint saves model with initial_value_threshold param - mode = 'auto' - monitor = 'val_loss' - initial_value_threshold = None - save_best_only = True - cbks = [ - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - initial_value_threshold=initial_value_threshold, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert os.path.exists(filepath) - os.remove(filepath) - - # Case 14: ModelCheckpoint doesnt save model if loss was minimum earlier - mode = 'min' - monitor = 'val_loss' - initial_value_threshold = 0 - save_best_only = True - cbks = [ - keras.callbacks.ModelCheckpoint( - filepath, - monitor=monitor, - save_best_only=save_best_only, - initial_value_threshold=initial_value_threshold, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert not os.path.exists(filepath) - - # Case 15: ModelCheckpoint doesnt save model if loss was min earlier in auto - # mode - mode = 'auto' - monitor = 'val_loss' - initial_value_threshold = 0 - save_best_only = True - cbks = [ + save_weights_only=True, + mode=mode, + options=tf.train.CheckpointOptions(), + ) keras.callbacks.ModelCheckpoint( filepath, monitor=monitor, save_best_only=save_best_only, - initial_value_threshold=initial_value_threshold, - mode=mode) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - assert not os.path.exists(filepath) - - @test_utils.run_v2_only - def test_ModelCheckpoint_subclass_save_weights_false(self): - model = test_utils.get_small_subclass_mlp(NUM_HIDDEN, NUM_CLASSES) - model.compile( - loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc']) - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - filepath = os.path.join(temp_dir, 'checkpoint') - cbks = [keras.callbacks.ModelCheckpoint( - filepath, save_weights_only=False)] - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_train = np_utils.to_categorical(y_train, num_classes=NUM_CLASSES) - - model.fit( - x_train, - y_train, - callbacks=cbks, - epochs=1, - verbose=0) - # Check that the filepath is a SavedModel directory. 
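[Editor's note] The assertion that follows hinges on a format rule worth stating: when `save_weights_only=False` and the filepath has no `.h5` extension, the model is exported in the TensorFlow SavedModel format, i.e. a directory containing `saved_model.pb` plus a `variables/` subfolder. A minimal sketch under that assumption (paths illustrative):

```python
import os

import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(1,))])
model.compile("sgd", loss="mse")

cb = keras.callbacks.ModelCheckpoint("ckpt_dir", save_weights_only=False)
model.fit(tf.ones((4, 1)), tf.ones((4, 1)), epochs=1, verbose=0, callbacks=[cb])
print("saved_model.pb" in os.listdir("ckpt_dir"))  # True: a SavedModel export
```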
- self.assertIn('saved_model.pb', os.listdir(filepath)) - - def _get_dummy_resource_for_model_checkpoint_testing(self): - - def get_input_datasets(): - # Simple training input. - train_input = [[1.]] * 16 - train_label = [[0.]] * 16 - ds = tf.data.Dataset.from_tensor_slices((train_input, train_label)) - return ds.batch(8, drop_remainder=True) - - # Very simple bias model to eliminate randomness. - optimizer = gradient_descent.SGD(0.1) - model = sequential.Sequential() - model.add(test_utils.Bias(input_shape=(1,))) - model.compile(loss='mae', optimizer=optimizer, metrics=['mae']) - train_ds = get_input_datasets() - - temp_dir = self.get_temp_dir() - filepath = os.path.join(temp_dir, 'checkpoint.epoch{epoch:02d}.h5') - - # The filepath shouldn't exist at the beginning. - self.assertFalse(os.path.exists(filepath)) - callback = keras.callbacks.ModelCheckpoint( - filepath=filepath, save_weights_only=True) - - return model, train_ds, callback, filepath - - def _run_load_weights_on_restart_test_common_iterations(self): - - (model, train_ds, callback, - filepath) = self._get_dummy_resource_for_model_checkpoint_testing() - initial_epochs = 3 - model.fit(train_ds, epochs=initial_epochs, callbacks=[callback]) - - # The files should exist after fitting with callback. - for epoch in range(initial_epochs): - self.assertTrue(os.path.exists(filepath.format(epoch=epoch + 1))) - self.assertFalse(os.path.exists(filepath.format(epoch=initial_epochs + 1))) - self.assertEqual( - callback._get_most_recently_modified_file_matching_pattern(filepath), - filepath.format(epoch=initial_epochs)) - - model.fit(train_ds, epochs=1) - weights_after_one_more_epoch = model.get_weights() - - # The filepath should continue to exist after fitting without callback. - for epoch in range(initial_epochs): - self.assertTrue(os.path.exists(filepath.format(epoch=epoch + 1))) - - return model, train_ds, filepath, weights_after_one_more_epoch - - @staticmethod - def get_ModelCheckpoint_load_weights_on_restart_true_test(save_weights_only): - - def func(self): - (model, train_ds, filepath, weights_after_one_more_epoch - ) = self._run_load_weights_on_restart_test_common_iterations() - - # Sleep for some short time period ensuring the files are created with - # a different time (in MacOS OSS the granularity is only 1 second). - time.sleep(2) - callback = keras.callbacks.ModelCheckpoint( - filepath=filepath, - save_weights_only=save_weights_only, - load_weights_on_restart=True) - model.fit(train_ds, epochs=1, callbacks=[callback]) - weights_after_model_restoring_and_one_more_epoch = model.get_weights() - - self.assertEqual( - callback._get_most_recently_modified_file_matching_pattern(filepath), - filepath.format(epoch=1)) - - model.fit( - train_ds, - epochs=1, - callbacks=[ - keras.callbacks.ModelCheckpoint( - filepath=filepath, - save_weights_only=save_weights_only, - load_weights_on_restart=True) - ]) - weights_with_one_final_extra_epoch = model.get_weights() - - # Asserting the weights one epoch after initial fitting and another epoch - # after that are closed, if a ModelCheckpoint with - # load_weights_on_restart=True is given (so the model is restored at the - # beginning of training). 
- self.assertAllClose(weights_after_one_more_epoch, - weights_after_model_restoring_and_one_more_epoch) - - self.assertNotAllClose(weights_after_one_more_epoch, - weights_with_one_final_extra_epoch) - - return func - - @staticmethod - def get_ModelCheckpoint_load_weights_on_restart_false_test(save_weights_only): - - def func(self): - (model, train_ds, filepath, weights_after_one_more_epoch - ) = self._run_load_weights_on_restart_test_common_iterations() - - model.fit( - train_ds, - epochs=1, - callbacks=[ - keras.callbacks.ModelCheckpoint( - filepath=filepath, save_weights_only=save_weights_only) - ]) - weights_after_model_restoring_and_one_more_epoch = model.get_weights() - - # Asserting the weights one epoch after initial fitting and another epoch - # after that are different, if a ModelCheckpoint with - # load_weights_on_restart=False is given (so the model is not restored at - # the beginning of training). - self.assertNotAllClose(weights_after_one_more_epoch, - weights_after_model_restoring_and_one_more_epoch) - - return func - - test_model_checkpoint_load_weights_on_restart_true_save_weights_only_true = \ - get_ModelCheckpoint_load_weights_on_restart_true_test.__func__(True) - - test_model_checkpoint_load_weights_on_restart_true_save_weights_only_false = \ - get_ModelCheckpoint_load_weights_on_restart_true_test.__func__(False) - - test_model_checkpoint_load_weights_on_restart_false_save_weights_only_true = \ - get_ModelCheckpoint_load_weights_on_restart_false_test.__func__(True) - - test_model_checkpoint_load_weights_on_restart_false_save_weights_only_false \ - = get_ModelCheckpoint_load_weights_on_restart_false_test.__func__(False) - - def test_ModelCheckpoint_override_if_file_exist(self): - (model, train_ds, filepath, - _) = self._run_load_weights_on_restart_test_common_iterations() - - # Sleep for some short time period to ensure the files are created with - # a different time (in MacOS OSS the granularity is only 1 second). - time.sleep(2) - callback = keras.callbacks.ModelCheckpoint( - filepath=filepath, save_weights_only=True) - model.load_weights( - callback._get_most_recently_modified_file_matching_pattern(filepath)) - weights_before_additional_fit = model.get_weights() - model.fit(train_ds, epochs=1, callbacks=[callback]) - model.load_weights( - callback._get_most_recently_modified_file_matching_pattern(filepath)) - weights_after_additional_fit = model.get_weights() - - self.assertNotAllClose(weights_before_additional_fit, - weights_after_additional_fit) - - def test_fit_with_ModelCheckpoint_with_tf_config(self): - (model, train_ds, callback, - _) = self._get_dummy_resource_for_model_checkpoint_testing() - - os.environ['TF_CONFIG'] = json.dumps({ - 'cluster': { - 'worker': ['localhost:23333'] - }, - 'task': { - 'type': 'worker', - 'index': 0 - } - }) - - # `model.fit()` should work regardless of the presence of `TF_CONFIG`. 
- model.fit(train_ds, epochs=1, callbacks=[callback]) - - def test_fit_with_ModelCheckpoint_with_dir_as_h5_filepath(self): - (model, train_ds, callback, - filepath) = self._get_dummy_resource_for_model_checkpoint_testing() - - temp_dir = self.get_temp_dir() - filepath = os.path.join(temp_dir, 'temp.h5') - - self.assertFalse(os.path.exists(filepath)) - os.mkdir(filepath) - self.assertTrue(os.path.exists(filepath)) - - callback = keras.callbacks.ModelCheckpoint(filepath=filepath) - - with self.assertRaisesRegex( - IOError, 'Please specify a non-directory ' - 'filepath for ModelCheckpoint.'): - model.fit(train_ds, epochs=1, callbacks=[callback]) - - def test_ModelCheckpoint_with_bad_path_placeholders(self): - (model, train_ds, callback, - filepath) = self._get_dummy_resource_for_model_checkpoint_testing() - - temp_dir = self.get_temp_dir() - filepath = os.path.join(temp_dir, 'chkpt_{epoch:02d}_{mape:.2f}.h5') - callback = keras.callbacks.ModelCheckpoint(filepath=filepath) - - with self.assertRaisesRegex(KeyError, 'Failed to format this callback ' - 'filepath.*'): - model.fit(train_ds, epochs=1, callbacks=[callback]) - - def test_ModelCheckpoint_nonblocking(self): - filepath = self.get_temp_dir() - # Should only cause a sync block when saving is actually performed. - callback = keras.callbacks.ModelCheckpoint(filepath=filepath, save_freq=100) - self.assertTrue(callback._supports_tf_logs) - - model = keras.Sequential([keras.layers.Dense(1)]) - cb_list = keras.callbacks.CallbackList([callback], - model=model, - epochs=1, - steps=10, - verbose=0) - - tensor = tf.convert_to_tensor(1.) - - def mock_numpy(): - raise RuntimeError( - 'If this error is seen, ModelCheckpoint is causing a blocking ' - 'NumPy conversion even when not checkpointing.') - - tensor.numpy = mock_numpy - - logs = {'metric': tensor} - - cb_list.on_train_begin(logs) - cb_list.on_epoch_begin(0, logs) - cb_list.on_train_batch_begin(0, logs) - cb_list.on_train_batch_end(0, logs) - cb_list.on_epoch_end(0, logs) - cb_list.on_train_end(logs) - - cb_list.on_test_begin(logs) - cb_list.on_test_batch_begin(0, logs) - cb_list.on_test_batch_end(0, logs) - cb_list.on_test_end(logs) - - cb_list.on_predict_begin(logs) - cb_list.on_predict_batch_begin(logs) - cb_list.on_predict_batch_end(logs) - cb_list.on_predict_end(logs) - - def test_verbose_2_logging(self): - data = np.random.random((100, 1)) - labels = np.where(data > 0.5, 1, 0) - model = keras.models.Sequential((keras.layers.Dense( - 1, input_dim=1, activation='relu'), keras.layers.Dense( - 1, activation='sigmoid'),)) - model.compile( - optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy']) - expected_log = r'(.*- loss:.*- acc.*:.*epoch)+' - with self.captureWritesToStream(sys.stdout) as printed: - model.fit(data, labels, verbose=2, epochs=20) - self.assertRegex(printed.contents(), expected_log) - - def test_ProgbarLogger_verbose_2_nonblocking(self): - # Should only cause a sync block on epoch end methods. - callback = keras.callbacks.ProgbarLogger(count_mode='steps') - self.assertTrue(callback._supports_tf_logs) - - model = keras.Sequential([keras.layers.Dense(1)]) - cb_list = keras.callbacks.CallbackList([callback], - model=model, - epochs=1, - steps=10, - verbose=2) - - tensor = tf.convert_to_tensor(1.) 
- - def mock_numpy(): - raise RuntimeError( - 'If this error is seen, ModelCheckpoint is causing a blocking ' - 'NumPy conversion even when not checkpointing.') - - tensor.numpy = mock_numpy - logs = {'metric': tensor} - - cb_list.on_train_begin(logs) - cb_list.on_epoch_begin(0, logs) - cb_list.on_train_batch_begin(0, logs) - cb_list.on_train_batch_end(0, logs) - - cb_list.on_test_begin(logs) - cb_list.on_test_batch_begin(0, logs) - cb_list.on_test_batch_end(0, logs) - cb_list.on_test_end(logs) - - with self.assertRaisesRegex(RuntimeError, 'NumPy conversion'): - # on_epoch_end should still block. - cb_list.on_epoch_end(0, logs) - cb_list.on_train_end(logs) - - def test_EarlyStopping(self): - with self.cached_session(): - np.random.seed(123) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - model = test_utils.get_small_sequential_mlp( - num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) - model.compile( - loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc']) - - cases = [ - ('max', 'val_acc'), - ('min', 'val_loss'), - ('auto', 'val_acc'), - ('auto', 'loss'), - ('unknown', 'unknown') - ] - for mode, monitor in cases: - patience = 0 + save_weights_only=False, + mode=mode, + options=tf.saved_model.SaveOptions(), + ) + + # Case 11: `ModelCheckpoint` save model with batch number in filename. + filepath = os.path.join( + temp_dir, "checkpoint.epoch{epoch:02d}batch{batch:02d}.h5" + ) cbks = [ - keras.callbacks.EarlyStopping( - patience=patience, monitor=monitor, mode=mode) + keras.callbacks.ModelCheckpoint( + filepath, monitor=monitor, save_freq=1 + ) ] + assert not os.path.exists(filepath.format(epoch=1, batch=1)) + assert not os.path.exists(filepath.format(epoch=1, batch=2)) + assert not os.path.exists(filepath.format(epoch=2, batch=1)) + assert not os.path.exists(filepath.format(epoch=2, batch=2)) + assert not os.path.exists(filepath.format(epoch=3, batch=1)) + assert not os.path.exists(filepath.format(epoch=3, batch=2)) + assert not os.path.exists(filepath.format(epoch=4, batch=1)) + assert not os.path.exists(filepath.format(epoch=4, batch=2)) + assert not os.path.exists(filepath.format(epoch=5, batch=1)) + assert not os.path.exists(filepath.format(epoch=5, batch=2)) model.fit( x_train, y_train, - batch_size=BATCH_SIZE, + batch_size=5, validation_data=(x_test, y_test), callbacks=cbks, epochs=5, - verbose=0) - - def test_EarlyStopping_reuse(self): - with self.cached_session(): - np.random.seed(1337) - patience = 3 - data = np.random.random((100, 1)) - labels = np.where(data > 0.5, 1, 0) - model = keras.models.Sequential((keras.layers.Dense( - 1, input_dim=1, activation='relu'), keras.layers.Dense( - 1, activation='sigmoid'),)) - model.compile( - optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy']) - weights = model.get_weights() - - # This should allow training to go for at least `patience` epochs - model.set_weights(weights) - - stopper = keras.callbacks.EarlyStopping(monitor='acc', patience=patience) - hist = model.fit(data, labels, callbacks=[stopper], verbose=0, epochs=20) - assert len(hist.epoch) >= patience - - def test_EarlyStopping_with_baseline(self): - with self.cached_session(): - np.random.seed(1337) - baseline = 0.6 - (data, labels), _ = test_utils.get_test_data( - train_samples=100, - test_samples=50, - 
input_shape=(1,), - num_classes=NUM_CLASSES) - model = test_utils.get_small_sequential_mlp( - num_hidden=1, num_classes=1, input_dim=1) - model.compile( - optimizer='sgd', loss='binary_crossentropy', metrics=['acc']) - - stopper = keras.callbacks.EarlyStopping(monitor='acc', - baseline=baseline) - hist = model.fit(data, labels, callbacks=[stopper], verbose=0, epochs=20) - assert len(hist.epoch) == 2 - - patience = 3 - stopper = keras.callbacks.EarlyStopping(monitor='acc', - patience=patience, - baseline=baseline) - hist = model.fit(data, labels, callbacks=[stopper], verbose=0, epochs=20) - assert len(hist.epoch) >= patience - - def test_EarlyStopping_final_weights_when_restoring_model_weights(self): - - class DummyModel: - - def __init__(self): - self.stop_training = False - self.weights = -1 - - def get_weights(self): - return self.weights - - def set_weights(self, weights): - self.weights = weights - - def set_weight_to_epoch(self, epoch): - self.weights = epoch - - early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', - patience=2, - restore_best_weights=True) - early_stop.model = DummyModel() - losses = [0.2, 0.15, 0.1, 0.11, 0.12] - # The best configuration is in the epoch 2 (loss = 0.1000). - epochs_trained = 0 - early_stop.on_train_begin() - for epoch in range(len(losses)): - epochs_trained += 1 - early_stop.model.set_weight_to_epoch(epoch=epoch) - early_stop.on_epoch_end(epoch, logs={'val_loss': losses[epoch]}) - if early_stop.model.stop_training: - break - # The best configuration is in epoch 2 (loss = 0.1000), - # and while patience = 2, we're restoring the best weights, - # so we end up at the epoch with the best weights, i.e. epoch 2 - self.assertEqual(early_stop.model.get_weights(), 2) - - # Check early stopping when no model beats the baseline. - early_stop = keras.callbacks.EarlyStopping( - monitor='val_loss', patience=5, baseline=0.5, restore_best_weights=True) - early_stop.model = DummyModel() - losses = [0.9, 0.8, 0.7, 0.71, 0.72, 0.73] - # The best configuration is in the epoch 2 (loss = 0.7000). - epochs_trained = 0 - early_stop.on_train_begin() - for epoch in range(len(losses)): - epochs_trained += 1 - early_stop.model.set_weight_to_epoch(epoch=epoch) - early_stop.on_epoch_end(epoch, logs={'val_loss': losses[epoch]}) - if early_stop.model.stop_training: - break - # No epoch improves on the baseline, so we should train for only 5 epochs, - # and restore the second model. - self.assertEqual(epochs_trained, 5) - self.assertEqual(early_stop.model.get_weights(), 2) - - def test_RemoteMonitor(self): - if requests is None: - self.skipTest('`requests` required to run this test') - return None - - monitor = keras.callbacks.RemoteMonitor() - # This will raise a warning since the default address in unreachable: - monitor.on_epoch_end(0, logs={'loss': 0.}) - - def test_LearningRateScheduler(self): - with self.cached_session(): - np.random.seed(1337) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - model = test_utils.get_small_sequential_mlp( - num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - - cbks = [ - keras.callbacks.LearningRateScheduler( - lambda x: 1. / (1. 
+ x), verbose=1) - ] - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: + verbose=1, + ) + + assert os.path.exists(filepath.format(epoch=1, batch=1)) + assert os.path.exists(filepath.format(epoch=1, batch=2)) + assert os.path.exists(filepath.format(epoch=2, batch=1)) + assert os.path.exists(filepath.format(epoch=2, batch=2)) + assert os.path.exists(filepath.format(epoch=3, batch=1)) + assert os.path.exists(filepath.format(epoch=3, batch=2)) + assert os.path.exists(filepath.format(epoch=4, batch=1)) + assert os.path.exists(filepath.format(epoch=4, batch=2)) + assert os.path.exists(filepath.format(epoch=5, batch=1)) + assert os.path.exists(filepath.format(epoch=5, batch=2)) + + os.remove(filepath.format(epoch=1, batch=1)) + os.remove(filepath.format(epoch=1, batch=2)) + os.remove(filepath.format(epoch=2, batch=1)) + os.remove(filepath.format(epoch=2, batch=2)) + os.remove(filepath.format(epoch=3, batch=1)) + os.remove(filepath.format(epoch=3, batch=2)) + os.remove(filepath.format(epoch=4, batch=1)) + os.remove(filepath.format(epoch=4, batch=2)) + os.remove(filepath.format(epoch=5, batch=1)) + os.remove(filepath.format(epoch=5, batch=2)) + + # Case 12: ModelCheckpoint saves model with initial_value_threshold + # param + mode = "max" + monitor = "val_acc" + initial_value_threshold = 0 + save_best_only = True + filepath = os.path.join(temp_dir, "checkpoint.h5") + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + initial_value_threshold=initial_value_threshold, + mode=mode, + ) + ] model.fit( x_train, y_train, batch_size=BATCH_SIZE, validation_data=(x_test, y_test), callbacks=cbks, - epochs=5) - self.assertIn('LearningRateScheduler setting learning rate to 1.0', - printed.contents()) - assert ( - float(keras.backend.get_value( - model.optimizer.lr)) - 0.2) < keras.backend.epsilon() - - cbks = [keras.callbacks.LearningRateScheduler(lambda x, lr: lr / 2)] - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=0) - assert ( - float(keras.backend.get_value( - model.optimizer.lr)) - 0.01 / 4) < keras.backend.epsilon() - - cbks = [ - keras.callbacks.LearningRateScheduler( - lambda epoch, _: learning_rate_schedule.CosineDecay(0.01, 2) - (epoch)) - ] - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=0) - - cosine_decay_np = 0.5 * (1 + np.cos(np.pi * (1 / 2))) - decayed_learning_rate = 0.01 * cosine_decay_np - - assert (float(keras.backend.get_value(model.optimizer.lr)) - - decayed_learning_rate) < keras.backend.epsilon() - - def test_ReduceLROnPlateau(self): - with self.cached_session(): - np.random.seed(1337) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - def make_model(): - tf.compat.v1.set_random_seed(1234) - np.random.seed(1337) - model = test_utils.get_small_sequential_mlp( - num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) - model.compile( - loss='categorical_crossentropy', - 
optimizer=gradient_descent.SGD(lr=0.1)) - return model - - # TODO(psv): Make sure the callback works correctly when min_delta is - # set as 0. Test fails when the order of this callback and assertion is - # interchanged. - model = make_model() - cbks = [ - keras.callbacks.ReduceLROnPlateau( - monitor='val_loss', - factor=0.1, - min_delta=0, - patience=1, - cooldown=5) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=0) - self.assertAllClose( - float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4) - - model = make_model() - # This should reduce the LR after the first epoch (due to high epsilon). - cbks = [ - keras.callbacks.ReduceLROnPlateau( - monitor='val_loss', - factor=0.1, - min_delta=10, - patience=1, - cooldown=5) - ] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=2) - self.assertAllClose( - float(keras.backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4) - - def test_ReduceLROnPlateau_patience(self): - - class DummyOptimizer: - - def __init__(self): - self.lr = keras.backend.variable(1.0) - - class DummyModel: - - def __init__(self): - self.optimizer = DummyOptimizer() - - reduce_on_plateau = keras.callbacks.ReduceLROnPlateau( - monitor='val_loss', patience=2) - reduce_on_plateau.model = DummyModel() - - losses = [0.0860, 0.1096, 0.1040] - lrs = [] - - for epoch in range(len(losses)): - reduce_on_plateau.on_epoch_end(epoch, logs={'val_loss': losses[epoch]}) - lrs.append(keras.backend.get_value(reduce_on_plateau.model.optimizer.lr)) - - # The learning rates should be 1.0 except the last one - for lr in lrs[:-1]: - self.assertEqual(lr, 1.0) - self.assertLess(lrs[-1], 1.0) - - def test_ReduceLROnPlateau_backwards_compatibility(self): - with tf.compat.v1.test.mock.patch.object(logging, 'warning') as mock_log: - reduce_on_plateau = keras.callbacks.ReduceLROnPlateau(epsilon=1e-13) - self.assertRegex( - str(mock_log.call_args), '`epsilon` argument is deprecated') - self.assertFalse(hasattr(reduce_on_plateau, 'epsilon')) - self.assertTrue(hasattr(reduce_on_plateau, 'min_delta')) - self.assertEqual(reduce_on_plateau.min_delta, 1e-13) - - def test_CSVLogger(self): - with self.cached_session(): - np.random.seed(1337) - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - filepath = os.path.join(temp_dir, 'log.tsv') - - sep = '\t' - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - def make_model(): - np.random.seed(1337) - model = test_utils.get_small_sequential_mlp( - num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) - model.compile( - loss='categorical_crossentropy', - optimizer=gradient_descent.SGD(lr=0.1), - metrics=['accuracy']) - return model - - # case 1, create new file with defined separator - model = make_model() - cbks = [keras.callbacks.CSVLogger(filepath, separator=sep)] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - - assert os.path.exists(filepath) - with open(filepath) as csvfile: - dialect = csv.Sniffer().sniff(csvfile.read()) - assert dialect.delimiter == sep - del model - del cbks - - # case 2, append data to 
existing file, skip header - model = make_model() - cbks = [keras.callbacks.CSVLogger(filepath, separator=sep, append=True)] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=1, - verbose=0) - - # case 3, reuse of CSVLogger object - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=0) - - with open(filepath) as csvfile: - list_lines = csvfile.readlines() - for line in list_lines: - assert line.count(sep) == 4 - assert len(list_lines) == 5 - output = ' '.join(list_lines) - assert len(re.findall('epoch', output)) == 1 - - os.remove(filepath) - - def test_stop_training_csv(self): - # Test that using the CSVLogger callback with the TerminateOnNaN callback - # does not result in invalid CSVs. - np.random.seed(1337) - tmpdir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) - - with self.cached_session(): - fp = os.path.join(tmpdir, 'test.csv') - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - cbks = [keras.callbacks.TerminateOnNaN(), keras.callbacks.CSVLogger(fp)] - model = keras.models.Sequential() - for _ in range(5): - model.add(keras.layers.Dense(2, input_dim=INPUT_DIM, activation='relu')) - model.add(keras.layers.Dense(NUM_CLASSES, activation='linear')) - model.compile(loss='mean_squared_error', - optimizer='rmsprop') - - def data_generator(): - i = 0 - max_batch_index = len(x_train) // BATCH_SIZE - tot = 0 - while 1: - if tot > 3 * len(x_train): - yield (np.ones([BATCH_SIZE, INPUT_DIM]) * np.nan, - np.ones([BATCH_SIZE, NUM_CLASSES]) * np.nan) - else: - yield (x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE], - y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]) - i += 1 - tot += 1 - i %= max_batch_index - - history = model.fit_generator(data_generator(), - len(x_train) // BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=20) - loss = history.history['loss'] - assert len(loss) > 1 - assert loss[-1] == np.inf or np.isnan(loss[-1]) - - values = [] - with open(fp) as f: - # On Windows, due to \r\n line ends, we may end up reading empty lines - # after each line. Skip empty lines. - values = [x for x in csv.reader(f) if x] - - assert 'nan' in values[-1], 'The last epoch was not logged.' 
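Editor's note on the `test_stop_training_csv` hunk above: its point is that `TerminateOnNaN` can stop training on the first NaN loss without leaving `CSVLogger`'s output malformed. A minimal sketch of that callback combination, not part of the patch; `model`, `x`, and `y` are hypothetical names assumed to exist.

import os
from tensorflow import keras

# Hypothetical names: `model`, `x`, and `y` are assumed to exist.
log_path = os.path.join("/tmp", "training_log.csv")  # illustrative path
cbks = [
    keras.callbacks.TerminateOnNaN(),     # stops training on a NaN or inf loss
    keras.callbacks.CSVLogger(log_path),  # appends one well-formed row per epoch
]
history = model.fit(x, y, epochs=20, callbacks=cbks, verbose=0)
# If a NaN loss appeared, len(history.history["loss"]) is smaller than 20.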
- - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_TerminateOnNaN(self): - np.random.seed(1337) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - cbks = [keras.callbacks.TerminateOnNaN()] - model = keras.models.Sequential() - initializer = keras.initializers.Constant(value=1e5) - for _ in range(5): - model.add( - keras.layers.Dense( - 2, - input_dim=INPUT_DIM, - activation='relu', - kernel_initializer=initializer)) - model.add(keras.layers.Dense(NUM_CLASSES)) - model.compile(loss='mean_squared_error', optimizer='rmsprop') - - history = model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=20) - loss = history.history['loss'] - self.assertEqual(len(loss), 1) - self.assertTrue(np.isnan(loss[0]) or np.isinf(loss[0])) - - @unittest.skipIf( - os.name == 'nt', - 'use_multiprocessing=True does not work on windows properly.') - def test_LambdaCallback(self): - with self.cached_session(): - np.random.seed(1337) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - model = keras.models.Sequential() - model.add( - keras.layers.Dense( - NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) - model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - - # Start an arbitrary process that should run during model - # training and be terminated after training has completed. 
- e = threading.Event() - - def target(): - e.wait() - - t = threading.Thread(target=target) - t.start() - cleanup_callback = keras.callbacks.LambdaCallback( - on_train_end=lambda logs: e.set()) - - cbks = [cleanup_callback] - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=5, - verbose=0) - t.join() - assert not t.is_alive() - - def test_RemoteMonitor_np_array(self): - if requests is None: - self.skipTest('`requests` required to run this test') - with tf.compat.v1.test.mock.patch.object(requests, 'post') as requests_post: - monitor = keras.callbacks.RemoteMonitor(send_as_json=True) - a = np.arange(1) # a 1 by 1 array - logs = {'loss': 0., 'val': a} - monitor.on_epoch_end(0, logs=logs) - send = {'loss': 0., 'epoch': 0, 'val': 0} - requests_post.assert_called_once_with( - monitor.root + monitor.path, json=send, headers=monitor.headers) - - def test_RemoteMonitor_np_float32(self): - if requests is None: - self.skipTest('`requests` required to run this test') - - with tf.compat.v1.test.mock.patch.object(requests, 'post') as requests_post: - monitor = keras.callbacks.RemoteMonitor(send_as_json=True) - a = np.float32(1.0) # a float32 generic type - logs = {'loss': 0., 'val': a} - monitor.on_epoch_end(0, logs=logs) - send = {'loss': 0., 'epoch': 0, 'val': 1.0} - requests_post.assert_called_once_with( - monitor.root + monitor.path, json=send, headers=monitor.headers) - - def test_RemoteMonitorWithJsonPayload(self): - if requests is None: - self.skipTest('`requests` required to run this test') - return None - with self.cached_session(): - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = keras.utils.np_utils.to_categorical(y_test) - y_train = keras.utils.np_utils.to_categorical(y_train) - model = keras.models.Sequential() - model.add( - keras.layers.Dense( - NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) - model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax')) - model.compile( - loss='categorical_crossentropy', - optimizer='rmsprop', - metrics=['accuracy']) - cbks = [keras.callbacks.RemoteMonitor(send_as_json=True)] - - with tf.compat.v1.test.mock.patch.object(requests, 'post'): + epochs=1, + verbose=0, + ) + assert os.path.exists(filepath) + os.remove(filepath) + + # Case 13: ModelCheckpoint saves model with initial_value_threshold + # param + mode = "auto" + monitor = "val_loss" + initial_value_threshold = None + save_best_only = True + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + initial_value_threshold=initial_value_threshold, + mode=mode, + ) + ] model.fit( x_train, y_train, batch_size=BATCH_SIZE, validation_data=(x_test, y_test), callbacks=cbks, - epochs=1) - - def test_progbar_infers_steps(self): - x, y = np.ones((10, 1)), np.ones((10, 1)) - data = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) - data = data.filter(lambda x, y: True) # Unknown cardinality. 
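Editor's note on the `test_progbar_infers_steps` hunk: `Dataset.filter()` erases the statically known element count, so `ProgbarLogger` starts with `target=None` and infers the step count from the first epoch. A self-contained sketch of why the cardinality becomes unknown (not part of the patch):

import numpy as np
import tensorflow as tf

x, y = np.ones((10, 1)), np.ones((10, 1))
data = tf.data.Dataset.from_tensor_slices((x, y)).batch(2)
print(tf.data.experimental.cardinality(data).numpy())  # 5 batches, statically known
data = data.filter(lambda x, y: True)
# filter() may drop arbitrarily many elements, so the count becomes
# tf.data.experimental.UNKNOWN_CARDINALITY (-2).
print(tf.data.experimental.cardinality(data).numpy())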
- - progbar = keras.callbacks.ProgbarLogger('steps') - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - self.assertIsNone(progbar.target) - model.fit(data, epochs=2, callbacks=[progbar]) - self.assertEqual(progbar.target, 5) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_callback_passed_floats(self): - - class MyCallback(keras.callbacks.Callback): - - def on_batch_end(self, batch, logs=None): - assert isinstance(batch, int) - assert isinstance(logs['loss'], float) - self.on_batch_end_called = True - - def on_epoch_end(self, batch, logs=None): - assert isinstance(batch, int) - assert isinstance(logs['loss'], float) - self.on_epoch_end_called = True - - x, y = np.ones((10, 1)), np.ones((10, 1)) - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - callback = MyCallback() - model.fit(x, y, epochs=2, callbacks=[callback]) - self.assertTrue(callback.on_batch_end_called) - self.assertTrue(callback.on_batch_end_called) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_implements_batch_hooks(self): - - class MyCallbackWithBatchHooks(keras.callbacks.Callback): - - def __init__(self): - self.train_batches = 0 - self.test_batches = 0 - self.predict_batches = 0 + epochs=1, + verbose=0, + ) + assert os.path.exists(filepath) + os.remove(filepath) + + # Case 14: ModelCheckpoint doesn't save model if loss was minimum earlier + mode = "min" + monitor = "val_loss" + initial_value_threshold = 0 + save_best_only = True + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + initial_value_threshold=initial_value_threshold, + mode=mode, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + assert not os.path.exists(filepath) + + # Case 15: ModelCheckpoint doesn't save model if loss was min earlier in + # auto mode + mode = "auto" + monitor = "val_loss" + initial_value_threshold = 0 + save_best_only = True + cbks = [ + keras.callbacks.ModelCheckpoint( + filepath, + monitor=monitor, + save_best_only=save_best_only, + initial_value_threshold=initial_value_threshold, + mode=mode, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + assert not os.path.exists(filepath) + + @test_utils.run_v2_only + def test_ModelCheckpoint_subclass_SavedModel_save_weights_false(self): + model = test_utils.get_small_subclass_mlp(NUM_HIDDEN, NUM_CLASSES) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + metrics=["acc"], + ) + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + filepath = os.path.join(temp_dir, "checkpoint") + cbks = [ + keras.callbacks.ModelCheckpoint(filepath, save_weights_only=False) + ] - def on_train_batch_end(self, batch, logs=None): - self.train_batches += 1 + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_train = np_utils.to_categorical(y_train, num_classes=NUM_CLASSES) + + model.fit(x_train, y_train, callbacks=cbks, epochs=1, verbose=0) + # Check that the filepath is a SavedModel directory.
+ self.assertIn("saved_model.pb", os.listdir(filepath)) + + @test_utils.run_v2_only + def test_ModelCheckpoint_subclass_KerasV3_save_weights_false(self): + model = test_utils.get_small_subclass_mlp(NUM_HIDDEN, NUM_CLASSES) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + metrics=["acc"], + ) + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + filepath = os.path.join(temp_dir, "checkpoint.keras") + cbks = [ + keras.callbacks.ModelCheckpoint(filepath, save_weights_only=False) + ] - def on_test_batch_end(self, batch, logs=None): - self.test_batches += 1 + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_train = np_utils.to_categorical(y_train, num_classes=NUM_CLASSES) + + model.fit(x_train, y_train, callbacks=cbks, epochs=1, verbose=0) + + assert os.path.exists(filepath) + + def _get_dummy_resource_for_model_checkpoint_testing(self): + def get_input_datasets(): + # Simple training input. + train_input = [[1.0]] * 16 + train_label = [[0.0]] * 16 + ds = tf.data.Dataset.from_tensor_slices((train_input, train_label)) + return ds.batch(8, drop_remainder=True) + + # Very simple bias model to eliminate randomness. + optimizer = gradient_descent.SGD(0.1) + model = sequential.Sequential() + model.add(test_utils.Bias(input_shape=(1,))) + model.compile(loss="mae", optimizer=optimizer, metrics=["mae"]) + train_ds = get_input_datasets() + + temp_dir = self.get_temp_dir() + filepath = os.path.join(temp_dir, "checkpoint.epoch{epoch:02d}.h5") + + # The filepath shouldn't exist at the beginning. + self.assertFalse(os.path.exists(filepath)) + callback = keras.callbacks.ModelCheckpoint( + filepath=filepath, save_weights_only=True + ) + + return model, train_ds, callback, filepath + + def _run_load_weights_on_restart_test_common_iterations(self): + ( + model, + train_ds, + callback, + filepath, + ) = self._get_dummy_resource_for_model_checkpoint_testing() + initial_epochs = 3 + model.fit(train_ds, epochs=initial_epochs, callbacks=[callback]) - def on_predict_batch_end(self, batch, logs=None): - self.predict_batches += 1 - - class MyCallbackWithTFBatchHooks(keras.callbacks.Callback): - - def __init__(self): - super().__init__() - self._supports_tf_logs = True + # The files should exist after fitting with callback. + for epoch in range(initial_epochs): + self.assertTrue(os.path.exists(filepath.format(epoch=epoch + 1))) + self.assertFalse( + os.path.exists(filepath.format(epoch=initial_epochs + 1)) + ) + self.assertEqual( + callback._get_most_recently_modified_file_matching_pattern( + filepath + ), + filepath.format(epoch=initial_epochs), + ) + + model.fit(train_ds, epochs=1) + weights_after_one_more_epoch = model.get_weights() + + # The filepath should continue to exist after fitting without callback. + for epoch in range(initial_epochs): + self.assertTrue(os.path.exists(filepath.format(epoch=epoch + 1))) + + return model, train_ds, filepath, weights_after_one_more_epoch + + @staticmethod + def get_ModelCheckpoint_load_weights_on_restart_true_test( + save_weights_only, + ): + def func(self): + ( + model, + train_ds, + filepath, + weights_after_one_more_epoch, + ) = self._run_load_weights_on_restart_test_common_iterations() + + # Sleep for some short time period ensuring the files are created + # with a different time (in MacOS OSS the granularity is only 1 + # second). 
+ time.sleep(2) + callback = keras.callbacks.ModelCheckpoint( + filepath=filepath, + save_weights_only=save_weights_only, + load_weights_on_restart=True, + ) + model.fit(train_ds, epochs=1, callbacks=[callback]) + weights_after_model_restoring_and_one_more_epoch = ( + model.get_weights() + ) + + self.assertEqual( + callback._get_most_recently_modified_file_matching_pattern( + filepath + ), + filepath.format(epoch=1), + ) + + model.fit( + train_ds, + epochs=1, + callbacks=[ + keras.callbacks.ModelCheckpoint( + filepath=filepath, + save_weights_only=save_weights_only, + load_weights_on_restart=True, + ) + ], + ) + weights_with_one_final_extra_epoch = model.get_weights() + + # Asserting the weights one epoch after initial fitting and another + # epoch after that are close, if a ModelCheckpoint with + # load_weights_on_restart=True is given (so the model is restored at + # the beginning of training). + self.assertAllClose( + weights_after_one_more_epoch, + weights_after_model_restoring_and_one_more_epoch, + ) + + self.assertNotAllClose( + weights_after_one_more_epoch, weights_with_one_final_extra_epoch + ) + + return func + + @staticmethod + def get_ModelCheckpoint_load_weights_on_restart_false_test( + save_weights_only, + ): + def func(self): + ( + model, + train_ds, + filepath, + weights_after_one_more_epoch, + ) = self._run_load_weights_on_restart_test_common_iterations() + + model.fit( + train_ds, + epochs=1, + callbacks=[ + keras.callbacks.ModelCheckpoint( + filepath=filepath, save_weights_only=save_weights_only + ) + ], + ) + weights_after_model_restoring_and_one_more_epoch = ( + model.get_weights() + ) + + # Asserting the weights one epoch after initial fitting and another + # epoch after that are different, if a ModelCheckpoint with + # load_weights_on_restart=False is given (so the model is not + # restored at the beginning of training).
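Editor's note: the two comments in this hunk state the contract of `load_weights_on_restart`: with `True`, `fit()` first reloads the most recently modified checkpoint matching `filepath`; with `False` (the default), nothing is reloaded. A minimal resume sketch, not part of the patch; `model` and `train_ds` are hypothetical names assumed to exist.

from tensorflow import keras

# Hypothetical names: `model` and `train_ds` are assumed to exist.
ckpt = keras.callbacks.ModelCheckpoint(
    filepath="/tmp/ckpt.epoch{epoch:02d}.h5",  # illustrative path
    save_weights_only=True,
    load_weights_on_restart=True,  # reload the newest matching file when fit() starts
)
model.fit(train_ds, epochs=3, callbacks=[ckpt])  # run that gets interrupted
model.fit(train_ds, epochs=1, callbacks=[ckpt])  # picks up the epoch-3 weights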
+ self.assertNotAllClose( + weights_after_one_more_epoch, + weights_after_model_restoring_and_one_more_epoch, + ) + + return func + + test_model_checkpoint_load_weights_on_restart_true_save_weights_only_true = get_ModelCheckpoint_load_weights_on_restart_true_test.__func__( # noqa: E501 + True + ) - class MyCallbackWithoutBatchHooks(keras.callbacks.Callback): + test_model_checkpoint_load_weights_on_restart_true_save_weights_only_false = get_ModelCheckpoint_load_weights_on_restart_true_test.__func__( # noqa: E501 + False + ) - def __init__(self): - self.epochs = 0 - - def on_epoch_end(self, epoch, logs=None): - self.epochs += 1 + test_model_checkpoint_load_weights_on_restart_false_save_weights_only_true = get_ModelCheckpoint_load_weights_on_restart_false_test.__func__( # noqa: E501 + True + ) - x, y = np.ones((10, 1)), np.ones((10, 1)) - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') + test_model_checkpoint_load_weights_on_restart_false_save_weights_only_false = get_ModelCheckpoint_load_weights_on_restart_false_test.__func__( # noqa: E501 + False + ) - my_cb = MyCallbackWithBatchHooks() - cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) - self.assertTrue(cb_list._should_call_train_batch_hooks) - self.assertTrue(cb_list._should_call_test_batch_hooks) - self.assertTrue(cb_list._should_call_predict_batch_hooks) - self.assertFalse(cb_list._batch_hooks_support_tf_logs) + def test_ModelCheckpoint_override_if_file_exist(self): + ( + model, + train_ds, + filepath, + _, + ) = self._run_load_weights_on_restart_test_common_iterations() - model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) - model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) - model.predict(x, batch_size=10, callbacks=[my_cb], verbose=0) + # Sleep for some short time period to ensure the files are created with + # a different time (in MacOS OSS the granularity is only 1 second). + time.sleep(2) + callback = keras.callbacks.ModelCheckpoint( + filepath=filepath, save_weights_only=True + ) + model.load_weights( + callback._get_most_recently_modified_file_matching_pattern(filepath) + ) + weights_before_additional_fit = model.get_weights() + model.fit(train_ds, epochs=1, callbacks=[callback]) + model.load_weights( + callback._get_most_recently_modified_file_matching_pattern(filepath) + ) + weights_after_additional_fit = model.get_weights() + + self.assertNotAllClose( + weights_before_additional_fit, weights_after_additional_fit + ) + + def test_fit_with_ModelCheckpoint_with_tf_config(self): + ( + model, + train_ds, + callback, + _, + ) = self._get_dummy_resource_for_model_checkpoint_testing() + + os.environ["TF_CONFIG"] = json.dumps( + { + "cluster": {"worker": ["localhost:23333"]}, + "task": {"type": "worker", "index": 0}, + } + ) + + # `model.fit()` should work regardless of the presence of `TF_CONFIG`. 
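Editor's note on the `TF_CONFIG` test above: the variable is the JSON cluster spec that TensorFlow's distribution strategies read at startup, and the test uses a single local worker. For reference, a hedged sketch of the general multi-worker shape; the hosts and ports below are made up.

import json
import os

# Hypothetical two-worker cluster; addresses are illustrative only.
os.environ["TF_CONFIG"] = json.dumps(
    {
        "cluster": {"worker": ["host1:23333", "host2:23333"]},
        "task": {"type": "worker", "index": 0},  # this process is worker 0
    }
)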
+ model.fit(train_ds, epochs=1, callbacks=[callback]) + + def test_fit_with_ModelCheckpoint_with_dir_as_h5_filepath(self): + ( + model, + train_ds, + callback, + filepath, + ) = self._get_dummy_resource_for_model_checkpoint_testing() - self.assertEqual(my_cb.train_batches, 2) - self.assertEqual(my_cb.test_batches, 1) - self.assertEqual(my_cb.predict_batches, 1) + temp_dir = self.get_temp_dir() + filepath = os.path.join(temp_dir, "temp.h5") - my_cb = MyCallbackWithTFBatchHooks() - cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) - self.assertTrue(cb_list._batch_hooks_support_tf_logs) + self.assertFalse(os.path.exists(filepath)) + os.mkdir(filepath) + self.assertTrue(os.path.exists(filepath)) - my_cb = MyCallbackWithoutBatchHooks() - cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) - self.assertLen(cb_list.callbacks, 1) - self.assertFalse(cb_list._should_call_train_batch_hooks) - self.assertFalse(cb_list._should_call_test_batch_hooks) - self.assertFalse(cb_list._should_call_predict_batch_hooks) + callback = keras.callbacks.ModelCheckpoint(filepath=filepath) - model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) - model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) - model.predict(x, batch_size=10, callbacks=[my_cb], verbose=0) + with self.assertRaisesRegex( + IOError, + "Please specify a non-directory filepath for ModelCheckpoint.", + ): + model.fit(train_ds, epochs=1, callbacks=[callback]) - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_logs_conversion(self): - assert_dict_equal = self.assertDictEqual + def test_ModelCheckpoint_KerasV3_save_options_error(self): + ( + model, + train_ds, + callback, + filepath, + ) = self._get_dummy_resource_for_model_checkpoint_testing() + + temp_dir = self.get_temp_dir() + filepath = os.path.join(temp_dir, "temp.keras") + + with self.assertRaisesRegex( + ValueError, "The native Keras format does not support" + ): + _ = keras.callbacks.ModelCheckpoint( + filepath=filepath, options=tf.saved_model.SaveOptions() + ) + + def test_ModelCheckpoint_with_bad_path_placeholders(self): + ( + model, + train_ds, + callback, + filepath, + ) = self._get_dummy_resource_for_model_checkpoint_testing() + + temp_dir = self.get_temp_dir() + filepath = os.path.join(temp_dir, "chkpt_{epoch:02d}_{mape:.2f}.h5") + callback = keras.callbacks.ModelCheckpoint(filepath=filepath) + + with self.assertRaisesRegex( + KeyError, "Failed to format this callback filepath.*" + ): + model.fit(train_ds, epochs=1, callbacks=[callback]) + + def test_ModelCheckpoint_nonblocking(self): + filepath = self.get_temp_dir() + # Should only cause a sync block when saving is actually performed. + callback = keras.callbacks.ModelCheckpoint( + filepath=filepath, save_freq=100 + ) + self.assertTrue(callback._supports_tf_logs) + + model = keras.Sequential([keras.layers.Dense(1)]) + cb_list = keras.callbacks.CallbackList( + [callback], model=model, epochs=1, steps=10, verbose=0 + ) + + tensor = tf.convert_to_tensor(1.0) + + def mock_numpy(): + raise RuntimeError( + "If this error is seen, ModelCheckpoint is causing a blocking " + "NumPy conversion even when not checkpointing." 
+ ) + + tensor.numpy = mock_numpy + + logs = {"metric": tensor} + + cb_list.on_train_begin(logs) + cb_list.on_epoch_begin(0, logs) + cb_list.on_train_batch_begin(0, logs) + cb_list.on_train_batch_end(0, logs) + cb_list.on_epoch_end(0, logs) + cb_list.on_train_end(logs) + + cb_list.on_test_begin(logs) + cb_list.on_test_batch_begin(0, logs) + cb_list.on_test_batch_end(0, logs) + cb_list.on_test_end(logs) + + cb_list.on_predict_begin(logs) + cb_list.on_predict_batch_begin(logs) + cb_list.on_predict_batch_end(logs) + cb_list.on_predict_end(logs) + + def _run_fit_with_ModelCheckpoint_with_steps_per_execution( + self, + model, + savepath, + save_freq, + train_samples, + steps_per_execution, + epochs, + check_ckpt_epochs, + check_ckpt_batchs, + ): + assert len(check_ckpt_epochs) == len(check_ckpt_batchs) + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=train_samples, + test_samples=0, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_train = np_utils.to_categorical(y_train) - class MutateNumpyLogs(CallAllHooks): + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + steps_per_execution=steps_per_execution, + ) - def _run(self, *args, logs=None): - logs = logs or args[-1] - logs['numpy'] = 1 + self.assertFalse(os.path.exists(savepath)) - class MutateTensorFlowLogs(CallAllHooks): + callback = keras.callbacks.ModelCheckpoint( + filepath=os.path.join(savepath, "ckpt_{epoch}_{batch}"), + save_freq=save_freq, + ) - def __init__(self): - super().__init__() - self._supports_tf_logs = True + model.fit( + x_train, + y_train, + batch_size=1, + epochs=epochs, + verbose=0, + callbacks=[callback], + ) + + self.assertTrue(os.path.exists(savepath)) + + for i in range(len(check_ckpt_epochs)): + epoch = check_ckpt_epochs[i] + batch = check_ckpt_batchs[i] + ckpt_name = "ckpt_" + str(epoch) + "_" + str(batch) + ckpt_path = os.path.join(savepath, ckpt_name) + self.assertTrue(os.path.exists(ckpt_path)) + self.assertIn("saved_model.pb", os.listdir(ckpt_path)) + + shutil.rmtree(savepath) + + @test_combinations.run_with_all_model_types + @test_utils.run_v2_only + def test_fit_with_ModelCheckpoint_with_steps_per_execution(self): + layers = [ + keras.layers.Dense( + NUM_HIDDEN, input_dim=INPUT_DIM, activation="relu" + ), + keras.layers.Dense(NUM_CLASSES, activation="softmax"), + ] + model = test_utils.get_model_from_layers( + layers, input_shape=(INPUT_DIM,) + ) + + temp_dir = self.get_temp_dir() + savepath = os.path.join(temp_dir, "checkpoint") + + for steps_per_execution in [None, 7]: + self._run_fit_with_ModelCheckpoint_with_steps_per_execution( + model, + savepath, + save_freq=7, + train_samples=7, + steps_per_execution=steps_per_execution, + epochs=1, + check_ckpt_epochs=[1], + check_ckpt_batchs=[7], + ) + + self._run_fit_with_ModelCheckpoint_with_steps_per_execution( + model, + savepath, + save_freq=7, + train_samples=7, + steps_per_execution=steps_per_execution, + epochs=2, + check_ckpt_epochs=[1, 2], + check_ckpt_batchs=[7, 7], + ) + + self._run_fit_with_ModelCheckpoint_with_steps_per_execution( + model, + savepath, + save_freq=14, + train_samples=7, + steps_per_execution=steps_per_execution, + epochs=2, + check_ckpt_epochs=[2], + check_ckpt_batchs=[7], + ) + + self._run_fit_with_ModelCheckpoint_with_steps_per_execution( + model, + savepath, + save_freq=7, + train_samples=14, + steps_per_execution=steps_per_execution, + epochs=2, + check_ckpt_epochs=[1, 1, 2, 2], + check_ckpt_batchs=[7, 14, 7, 14], + ) + + def test_verbose_2_logging(self): + data = 
np.random.random((100, 1)) + labels = np.where(data > 0.5, 1, 0) + model = keras.models.Sequential( + ( + keras.layers.Dense(1, input_dim=1, activation="relu"), + keras.layers.Dense(1, activation="sigmoid"), + ) + ) + model.compile( + optimizer="sgd", loss="binary_crossentropy", metrics=["accuracy"] + ) + expected_log = r"(.*- loss:.*- acc.*:.*epoch)+" + with self.captureWritesToStream(sys.stdout) as printed: + model.fit(data, labels, verbose=2, epochs=20) + self.assertRegex(printed.contents(), expected_log) + + def test_ProgbarLogger_verbose_2_nonblocking(self): + # Should only cause a sync block on epoch end methods. + callback = keras.callbacks.ProgbarLogger(count_mode="steps") + self.assertTrue(callback._supports_tf_logs) + + model = keras.Sequential([keras.layers.Dense(1)]) + cb_list = keras.callbacks.CallbackList( + [callback], model=model, epochs=1, steps=10, verbose=2 + ) + + tensor = tf.convert_to_tensor(1.0) + + def mock_numpy(): + raise RuntimeError( + "If this error is seen, ModelCheckpoint is causing a blocking " + "NumPy conversion even when not checkpointing." + ) + + tensor.numpy = mock_numpy + logs = {"metric": tensor} + + cb_list.on_train_begin(logs) + cb_list.on_epoch_begin(0, logs) + cb_list.on_train_batch_begin(0, logs) + cb_list.on_train_batch_end(0, logs) + + cb_list.on_test_begin(logs) + cb_list.on_test_batch_begin(0, logs) + cb_list.on_test_batch_end(0, logs) + cb_list.on_test_end(logs) + + with self.assertRaisesRegex(RuntimeError, "NumPy conversion"): + # on_epoch_end should still block. + cb_list.on_epoch_end(0, logs) + cb_list.on_train_end(logs) + + def test_EarlyStopping(self): + with self.cached_session(): + np.random.seed(123) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + model = test_utils.get_small_sequential_mlp( + num_hidden=NUM_HIDDEN, + num_classes=NUM_CLASSES, + input_dim=INPUT_DIM, + ) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + metrics=["acc"], + ) + + cases = [ + ("max", "val_acc"), + ("min", "val_loss"), + ("auto", "val_acc"), + ("auto", "loss"), + ("unknown", "unknown"), + ] + for mode, monitor in cases: + patience = 0 + cbks = [ + keras.callbacks.EarlyStopping( + patience=patience, monitor=monitor, mode=mode + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=5, + verbose=0, + ) + + def test_EarlyStopping_patience(self): + cases = [0, 1, 2, 3] + losses = [10.0, 9.0, 8.0, 9.0, 8.9, 8.8, 8.7, 8.6, 8.5] + + for patience in cases: + stopper = keras.callbacks.EarlyStopping( + monitor="loss", patience=patience + ) + stopper.model = keras.models.Sequential() + stopper.on_train_begin() + + for epoch, loss in enumerate(losses): + stopper.on_epoch_end(epoch=epoch, logs={"loss": loss}) + if stopper.model.stop_training: + break + + self.assertEqual(stopper.stopped_epoch, max(patience, 1) + 2) + + def test_EarlyStopping_reuse(self): + with self.cached_session(): + np.random.seed(1337) + patience = 3 + data = np.random.random((100, 1)) + labels = np.where(data > 0.5, 1, 0) + model = keras.models.Sequential( + ( + keras.layers.Dense(1, input_dim=1, activation="relu"), + keras.layers.Dense(1, activation="sigmoid"), + ) + ) + model.compile( + optimizer="sgd", + loss="binary_crossentropy", + metrics=["accuracy"], + ) + weights = 
model.get_weights() + + # This should allow training to go for at least `patience` epochs + model.set_weights(weights) + + stopper = keras.callbacks.EarlyStopping( + monitor="acc", patience=patience + ) + hist = model.fit( + data, labels, callbacks=[stopper], verbose=0, epochs=20 + ) + assert len(hist.epoch) >= patience + + def test_EarlyStopping_with_baseline(self): + with self.cached_session(): + np.random.seed(1337) + baseline = 0.6 + (data, labels), _ = test_utils.get_test_data( + train_samples=100, + test_samples=50, + input_shape=(1,), + num_classes=NUM_CLASSES, + ) + model = test_utils.get_small_sequential_mlp( + num_hidden=1, num_classes=1, input_dim=1 + ) + model.compile( + optimizer="sgd", loss="binary_crossentropy", metrics=["acc"] + ) + + stopper = keras.callbacks.EarlyStopping( + monitor="acc", baseline=baseline + ) + hist = model.fit( + data, labels, callbacks=[stopper], verbose=0, epochs=20 + ) + assert len(hist.epoch) == 2 + + patience = 3 + stopper = keras.callbacks.EarlyStopping( + monitor="acc", patience=patience, baseline=baseline + ) + hist = model.fit( + data, labels, callbacks=[stopper], verbose=0, epochs=20 + ) + assert len(hist.epoch) >= patience + + def test_EarlyStopping_final_weights_when_restoring_model_weights(self): + class DummyModel: + def __init__(self): + self.stop_training = False + self.weights = -1 + + def get_weights(self): + return self.weights + + def set_weights(self, weights): + self.weights = weights + + def set_weight_to_epoch(self, epoch): + self.weights = epoch + + early_stop = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=2, restore_best_weights=True + ) + early_stop.model = DummyModel() + losses = [0.2, 0.15, 0.1, 0.11, 0.12] + # The best configuration is in epoch 2 (loss = 0.1000). + epochs_trained = 0 + early_stop.on_train_begin() + for epoch in range(len(losses)): + epochs_trained += 1 + early_stop.model.set_weight_to_epoch(epoch=epoch) + early_stop.on_epoch_end(epoch, logs={"val_loss": losses[epoch]}) + if early_stop.model.stop_training: + break + # The best configuration is in epoch 2 (loss = 0.1000), + # and while patience = 2, we're restoring the best weights, + # so we end up at the epoch with the best weights, i.e. epoch 2 + self.assertEqual(early_stop.model.get_weights(), 2) + + # Check early stopping when no model beats the baseline. + early_stop = keras.callbacks.EarlyStopping( + monitor="val_loss", + patience=5, + baseline=0.5, + restore_best_weights=True, + ) + early_stop.model = DummyModel() + losses = [0.9, 0.8, 0.7, 0.71, 0.72, 0.73] + # The best configuration is in epoch 2 (loss = 0.7000). + epochs_trained = 0 + early_stop.on_train_begin() + for epoch in range(len(losses)): + epochs_trained += 1 + early_stop.model.set_weight_to_epoch(epoch=epoch) + early_stop.on_epoch_end(epoch, logs={"val_loss": losses[epoch]}) + if early_stop.model.stop_training: + break + # No epoch improves on the baseline, so we should train for only 5 + # epochs, and restore the second model.
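Editor's note on the two `DummyModel` scenarios above: they pin down that `restore_best_weights=True` rolls back to the best monitored epoch even when no epoch ever beat the `baseline`. Typical real usage, as a hedged sketch that is not part of the patch; `model`, `x`, `y`, `x_val`, and `y_val` are hypothetical names assumed to exist.

from tensorflow import keras

# Hypothetical names: `model`, `x`, `y`, `x_val`, `y_val` are assumed to exist.
stopper = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,                 # tolerate two non-improving epochs
    restore_best_weights=True,  # roll back to the best epoch when stopping
)
model.fit(x, y, validation_data=(x_val, y_val), epochs=50, callbacks=[stopper])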
+ self.assertEqual(epochs_trained, 5) + self.assertEqual(early_stop.model.get_weights(), 2) + + def test_EarlyStopping_with_start_from_epoch(self): + with self.cached_session(): + np.random.seed(1337) + (data, labels), _ = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + labels = np_utils.to_categorical(labels) + model = test_utils.get_small_sequential_mlp( + num_hidden=NUM_HIDDEN, + num_classes=NUM_CLASSES, + input_dim=INPUT_DIM, + ) + model.compile( + optimizer="sgd", loss="binary_crossentropy", metrics=["acc"] + ) + start_from_epoch = 2 + patience = 3 + stopper = keras.callbacks.EarlyStopping( + monitor="acc", + patience=patience, + start_from_epoch=start_from_epoch, + ) + history = model.fit( + data, labels, callbacks=[stopper], verbose=0, epochs=20 + ) + # Test 'patience' argument functions correctly when used + # in conjunction with 'start_from_epoch'. + self.assertGreaterEqual( + len(history.epoch), patience + start_from_epoch + ) + + start_from_epoch = 2 + patience = 0 + stopper = keras.callbacks.EarlyStopping( + monitor="acc", + patience=patience, + start_from_epoch=start_from_epoch, + ) + history = model.fit( + data, labels, callbacks=[stopper], verbose=0, epochs=20 + ) + # Test for boundary condition when 'patience' = 0. + self.assertGreaterEqual(len(history.epoch), start_from_epoch) + + def test_RemoteMonitor(self): + if requests is None: + self.skipTest("`requests` required to run this test") + return None + + monitor = keras.callbacks.RemoteMonitor() + # This will raise a warning since the default address is unreachable: + monitor.on_epoch_end(0, logs={"loss": 0.0}) + + def test_LearningRateScheduler(self): + with self.cached_session(): + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + model = test_utils.get_small_sequential_mlp( + num_hidden=NUM_HIDDEN, + num_classes=NUM_CLASSES, + input_dim=INPUT_DIM, + ) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + metrics=["accuracy"], + ) + + cbks = [ + keras.callbacks.LearningRateScheduler( + lambda x: 1.0 / (1.0 + x), verbose=1 + ) + ] + io_utils.enable_interactive_logging() + with self.captureWritesToStream(sys.stdout) as printed: + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=5, + ) + self.assertIn( + "LearningRateScheduler setting learning rate to 1.0", + printed.contents(), + ) + assert ( + float(keras.backend.get_value(model.optimizer.lr)) - 0.2 + ) < keras.backend.epsilon() + + cbks = [keras.callbacks.LearningRateScheduler(lambda x, lr: lr / 2)] + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + metrics=["accuracy"], + ) + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=2, + verbose=0, + ) + assert ( + float(keras.backend.get_value(model.optimizer.lr)) - 0.01 / 4 + ) < keras.backend.epsilon() + + cbks = [ + keras.callbacks.LearningRateScheduler( + lambda epoch, _: learning_rate_schedule.CosineDecay( + 0.01, 2 + )(epoch) + ) + ] + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + metrics=["accuracy"], + ) + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, +
validation_data=(x_test, y_test), + callbacks=cbks, + epochs=2, + verbose=0, + ) + + # CosineDecay(0.01, decay_steps=2) evaluated at the last epoch (1) + # is 0.01 * 0.5 * (1 + cos(pi * 1 / 2)). + cosine_decay_np = 0.5 * (1 + np.cos(np.pi * (1 / 2))) + decayed_learning_rate = 0.01 * cosine_decay_np + + assert ( + float(keras.backend.get_value(model.optimizer.lr)) + - decayed_learning_rate + ) < keras.backend.epsilon() + + def test_ReduceLROnPlateau(self): + with self.cached_session(): + tf_utils.set_random_seed(1337) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + def make_model(): + tf_utils.set_random_seed(1337) + model = test_utils.get_small_sequential_mlp( + num_hidden=NUM_HIDDEN, + num_classes=NUM_CLASSES, + input_dim=INPUT_DIM, + ) + model.compile( + loss="categorical_crossentropy", + optimizer=gradient_descent.SGD(lr=0.1), + ) + return model + + # TODO(psv): Make sure the callback works correctly when min_delta + # is set as 0. Test fails when the order of this callback and + # assertion is interchanged. + model = make_model() + cbks = [ + keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", + factor=0.1, + min_delta=0, + patience=1, + cooldown=5, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=2, + verbose=0, + ) + self.assertAllClose( + float(keras.backend.get_value(model.optimizer.lr)), + 0.1, + atol=1e-4, + ) + + model = make_model() + # This should reduce the LR after the first epoch (due to the high + # `min_delta`). + cbks = [ + keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", + factor=0.1, + min_delta=10, + patience=1, + cooldown=5, + ) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=2, + verbose=2, + ) + self.assertAllClose( + float(keras.backend.get_value(model.optimizer.lr)), + 0.01, + atol=1e-4, + ) + + def test_ReduceLROnPlateau_patience(self): + class DummyOptimizer: + def __init__(self): + self.lr = keras.backend.variable(1.0) + + class DummyModel: + def __init__(self): + self.optimizer = DummyOptimizer() + + reduce_on_plateau = keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", patience=2 + ) + reduce_on_plateau.model = DummyModel() + + losses = [0.0860, 0.1096, 0.1040] + lrs = [] + + for epoch in range(len(losses)): + reduce_on_plateau.on_epoch_end( + epoch, logs={"val_loss": losses[epoch]} + ) + lrs.append( + keras.backend.get_value(reduce_on_plateau.model.optimizer.lr) + ) + + # The learning rates should be 1.0 except the last one + for lr in lrs[:-1]: + self.assertEqual(lr, 1.0) + self.assertLess(lrs[-1], 1.0) + + def test_ReduceLROnPlateau_backwards_compatibility(self): + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_log: + reduce_on_plateau = keras.callbacks.ReduceLROnPlateau(epsilon=1e-13) + self.assertRegex( + str(mock_log.call_args), "`epsilon` argument is deprecated" + ) + self.assertFalse(hasattr(reduce_on_plateau, "epsilon")) + self.assertTrue(hasattr(reduce_on_plateau, "min_delta")) + self.assertEqual(reduce_on_plateau.min_delta, 1e-13) + + def test_CSVLogger(self): + with self.cached_session(): + np.random.seed(1337) + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + filepath = os.path.join(temp_dir, "log.tsv") + + sep = "\t" + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( +
train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + def make_model(): + np.random.seed(1337) + model = test_utils.get_small_sequential_mlp( + num_hidden=NUM_HIDDEN, + num_classes=NUM_CLASSES, + input_dim=INPUT_DIM, + ) + model.compile( + loss="categorical_crossentropy", + optimizer=gradient_descent.SGD(lr=0.1), + metrics=["accuracy"], + ) + return model + + # case 1, create new file with defined separator + model = make_model() + cbks = [keras.callbacks.CSVLogger(filepath, separator=sep)] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + + assert os.path.exists(filepath) + with open(filepath) as csvfile: + dialect = csv.Sniffer().sniff(csvfile.read()) + assert dialect.delimiter == sep + del model + del cbks + + # case 2, append data to existing file, skip header + model = make_model() + cbks = [ + keras.callbacks.CSVLogger(filepath, separator=sep, append=True) + ] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + verbose=0, + ) + + # case 3, reuse of CSVLogger object + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=2, + verbose=0, + ) + + with open(filepath) as csvfile: + list_lines = csvfile.readlines() + for line in list_lines: + assert line.count(sep) == 4 + assert len(list_lines) == 5 + output = " ".join(list_lines) + assert len(re.findall("epoch", output)) == 1 + + os.remove(filepath) + + # case 4, Verify Val. loss also registered when Validation Freq > 1 + model = make_model() + cbks = [keras.callbacks.CSVLogger(filepath, separator=sep)] + hist = model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + validation_freq=3, + callbacks=cbks, + epochs=5, + verbose=0, + ) + assert os.path.exists(filepath) + # Verify that validation loss is registered at val. freq + with open(filepath) as csvfile: + rows = csv.DictReader(csvfile, delimiter=sep) + for idx, row in enumerate(rows, 1): + self.assertIn("val_loss", row) + if idx == 3: + self.assertEqual( + row["val_loss"], str(hist.history["val_loss"][0]) + ) + else: + self.assertEqual(row["val_loss"], "NA") + + def test_stop_training_csv(self): + # Test that using the CSVLogger callback with the TerminateOnNaN + # callback does not result in invalid CSVs.
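+ # (TerminateOnNaN sets `model.stop_training` once a NaN loss appears; + # CSVLogger should still flush and close its file so the log stays + # parseable.)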
+ np.random.seed(1337) + tmpdir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) + + with self.cached_session(): + fp = os.path.join(tmpdir, "test.csv") + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + cbks = [ + keras.callbacks.TerminateOnNaN(), + keras.callbacks.CSVLogger(fp), + ] + model = keras.models.Sequential() + for _ in range(5): + model.add( + keras.layers.Dense( + 2, input_dim=INPUT_DIM, activation="relu" + ) + ) + model.add(keras.layers.Dense(NUM_CLASSES, activation="linear")) + model.compile(loss="mean_squared_error", optimizer="rmsprop") + + def data_generator(): + i = 0 + max_batch_index = len(x_train) // BATCH_SIZE + tot = 0 + while 1: + if tot > 3 * len(x_train): + yield ( + np.ones([BATCH_SIZE, INPUT_DIM]) * np.nan, + np.ones([BATCH_SIZE, NUM_CLASSES]) * np.nan, + ) + else: + yield ( + x_train[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], + y_train[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], + ) + i += 1 + tot += 1 + i %= max_batch_index + + history = model.fit_generator( + data_generator(), + len(x_train) // BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=20, + ) + loss = history.history["loss"] + assert len(loss) > 1 + assert loss[-1] == np.inf or np.isnan(loss[-1]) + + values = [] + with open(fp) as f: + # On Windows, due to \r\n line ends, we may end up reading empty + # lines after each line. Skip empty lines. + values = [x for x in csv.reader(f) if x] + + assert "nan" in values[-1], "The last epoch was not logged." + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_TerminateOnNaN(self): + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + cbks = [keras.callbacks.TerminateOnNaN()] + model = keras.models.Sequential() + initializer = keras.initializers.Constant(value=1e5) + for _ in range(5): + model.add( + keras.layers.Dense( + 2, + input_dim=INPUT_DIM, + activation="relu", + kernel_initializer=initializer, + ) + ) + model.add(keras.layers.Dense(NUM_CLASSES)) + model.compile(loss="mean_squared_error", optimizer="rmsprop") + + history = model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=20, + ) + loss = history.history["loss"] + self.assertEqual(len(loss), 1) + self.assertTrue(np.isnan(loss[0]) or np.isinf(loss[0])) + + @unittest.skipIf( + os.name == "nt", + "use_multiprocessing=True does not work on windows properly.", + ) + def test_LambdaCallback(self): + with self.cached_session(): + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + NUM_HIDDEN, input_dim=INPUT_DIM, activation="relu" + ) + ) + model.add(keras.layers.Dense(NUM_CLASSES, activation="softmax")) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", +
metrics=["accuracy"], + ) + + # Start an arbitrary thread that should run during model + # training and be terminated after training has completed. + e = threading.Event() + + def target(): + e.wait() + + t = threading.Thread(target=target) + t.start() + cleanup_callback = keras.callbacks.LambdaCallback( + on_train_end=lambda logs: e.set() + ) + + cbks = [cleanup_callback] + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=5, + verbose=0, + ) + t.join() + assert not t.is_alive() + + def test_RemoteMonitor_np_array(self): + if requests is None: + self.skipTest("`requests` required to run this test") + with tf.compat.v1.test.mock.patch.object( + requests, "post" + ) as requests_post: + monitor = keras.callbacks.RemoteMonitor(send_as_json=True) + a = np.arange(1) # a length-1 array + logs = {"loss": 0.0, "val": a} + monitor.on_epoch_end(0, logs=logs) + send = {"loss": 0.0, "epoch": 0, "val": 0} + requests_post.assert_called_once_with( + monitor.root + monitor.path, json=send, headers=monitor.headers + ) + + def test_RemoteMonitor_np_float32(self): + if requests is None: + self.skipTest("`requests` required to run this test") + + with tf.compat.v1.test.mock.patch.object( + requests, "post" + ) as requests_post: + monitor = keras.callbacks.RemoteMonitor(send_as_json=True) + a = np.float32(1.0) # a float32 generic type + logs = {"loss": 0.0, "val": a} + monitor.on_epoch_end(0, logs=logs) + send = {"loss": 0.0, "epoch": 0, "val": 1.0} + requests_post.assert_called_once_with( + monitor.root + monitor.path, json=send, headers=monitor.headers + ) + + def test_RemoteMonitorWithJsonPayload(self): + if requests is None: + self.skipTest("`requests` required to run this test") + return None + with self.cached_session(): + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = keras.utils.np_utils.to_categorical(y_test) + y_train = keras.utils.np_utils.to_categorical(y_train) + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + NUM_HIDDEN, input_dim=INPUT_DIM, activation="relu" + ) + ) + model.add(keras.layers.Dense(NUM_CLASSES, activation="softmax")) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + metrics=["accuracy"], + ) + cbks = [keras.callbacks.RemoteMonitor(send_as_json=True)] + + with tf.compat.v1.test.mock.patch.object(requests, "post"): + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=1, + ) + + def test_progbar_infers_steps(self): + x, y = np.ones((10, 1)), np.ones((10, 1)) + data = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) + data = data.filter(lambda x, y: True) # Unknown cardinality.
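+ # (`filter()` leaves the dataset with unknown cardinality, so the + # progress bar can only infer the step count by counting batches + # during the first epoch.)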
+ + progbar = keras.callbacks.ProgbarLogger("steps") + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + self.assertIsNone(progbar.target) + model.fit(data, epochs=2, callbacks=[progbar]) + self.assertEqual(progbar.target, 5) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_callback_passed_floats(self): + class MyCallback(keras.callbacks.Callback): + def on_batch_end(self, batch, logs=None): + assert isinstance(batch, int) + assert isinstance(logs["loss"], float) + self.on_batch_end_called = True + + def on_epoch_end(self, batch, logs=None): + assert isinstance(batch, int) + assert isinstance(logs["loss"], float) + self.on_epoch_end_called = True + + x, y = np.ones((10, 1)), np.ones((10, 1)) + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + + callback = MyCallback() + model.fit(x, y, epochs=2, callbacks=[callback]) + self.assertTrue(callback.on_batch_end_called) + self.assertTrue(callback.on_epoch_end_called) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_implements_batch_hooks(self): + class MyCallbackWithBatchHooks(keras.callbacks.Callback): + def __init__(self): + self.train_batches = 0 + self.test_batches = 0 + self.predict_batches = 0 + + def on_train_batch_end(self, batch, logs=None): + self.train_batches += 1 + + def on_test_batch_end(self, batch, logs=None): + self.test_batches += 1 + + def on_predict_batch_end(self, batch, logs=None): + self.predict_batches += 1 + + class MyCallbackWithTFBatchHooks(keras.callbacks.Callback): + def __init__(self): + super().__init__() + self._supports_tf_logs = True + + class MyCallbackWithoutBatchHooks(keras.callbacks.Callback): + def __init__(self): + self.epochs = 0 + + def on_epoch_end(self, epoch, logs=None): + self.epochs += 1 + + x, y = np.ones((10, 1)), np.ones((10, 1)) + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + + my_cb = MyCallbackWithBatchHooks() + cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) + self.assertTrue(cb_list._should_call_train_batch_hooks) + self.assertTrue(cb_list._should_call_test_batch_hooks) + self.assertTrue(cb_list._should_call_predict_batch_hooks) + self.assertFalse(cb_list._batch_hooks_support_tf_logs) + + model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) + model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) + model.predict(x, batch_size=10, callbacks=[my_cb], verbose=0) + + self.assertEqual(my_cb.train_batches, 2) + self.assertEqual(my_cb.test_batches, 1) + self.assertEqual(my_cb.predict_batches, 1) + + my_cb = MyCallbackWithTFBatchHooks() + cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) + self.assertTrue(cb_list._batch_hooks_support_tf_logs) + + my_cb = MyCallbackWithoutBatchHooks() + cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) + self.assertLen(cb_list.callbacks, 1) + self.assertFalse(cb_list._should_call_train_batch_hooks) + self.assertFalse(cb_list._should_call_test_batch_hooks) + self.assertFalse(cb_list._should_call_predict_batch_hooks) + + model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) + model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) + model.predict(x, batch_size=10, callbacks=[my_cb], verbose=0) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_logs_conversion(self): + assert_dict_equal = self.assertDictEqual + + class MutateNumpyLogs(CallAllHooks): + def _run(self,
*args, logs=None): + logs = logs or args[-1] + logs["numpy"] = 1 + + class MutateTensorFlowLogs(CallAllHooks): + def __init__(self): + super().__init__() + self._supports_tf_logs = True + + def _run(self, *args, logs=None): + logs = logs or args[-1] + logs["tf"] = 2 + + class AssertNumpyLogs(CallAllHooks): + def _run(self, *args, logs=None): + logs = logs or args[-1] + assert_dict_equal(logs, {"all": 0, "numpy": 1, "tf": 2}) + + class AssertTensorFlowLogs(AssertNumpyLogs): + def __init__(self): + super().__init__() + self._supports_tf_logs = True + + cb_list = keras.callbacks.CallbackList( + [ + MutateNumpyLogs(), + MutateTensorFlowLogs(), + AssertNumpyLogs(), + AssertTensorFlowLogs(), + ] + ) + + assert len(cb_list.callbacks) == 4 + cb_list.on_epoch_begin(0, logs={"all": 0}) + cb_list.on_epoch_end(0, logs={"all": 0}) + cb_list.on_predict_batch_begin(0, logs={"all": 0}) + cb_list.on_predict_batch_end(0, logs={"all": 0}) + cb_list.on_predict_begin(logs={"all": 0}) + cb_list.on_predict_end(logs={"all": 0}) + cb_list.on_test_batch_begin(0, logs={"all": 0}) + cb_list.on_test_batch_end(0, logs={"all": 0}) + cb_list.on_test_begin(logs={"all": 0}) + cb_list.on_test_end(logs={"all": 0}) + cb_list.on_train_batch_begin(0, logs={"all": 0}) + cb_list.on_train_batch_end(0, logs={"all": 0}) + cb_list.on_train_begin(logs={"all": 0}) + cb_list.on_train_end(logs={"all": 0}) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_implements_batch_hooks_override(self): + class MyCallback(keras.callbacks.Callback): + def __init__(self, should_run=True): + self.should_run = should_run + self.train_batches = 0 + self.test_batches = 0 + self.predict_batches = 0 + + def on_train_batch_end(self, batch, logs=None): + self.train_batches += 1 + + def on_test_batch_end(self, batch, logs=None): + self.test_batches += 1 + + def on_predict_batch_end(self, batch, logs=None): + self.predict_batches += 1 + + def _implements_train_batch_hooks(self): + return self.should_run + + def _implements_test_batch_hooks(self): + return self.should_run + + def _implements_predict_batch_hooks(self): + return self.should_run + + x, y = np.ones((10, 1)), np.ones((10, 1)) + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + + my_cb = MyCallback(should_run=True) + cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) + self.assertTrue(cb_list._should_call_train_batch_hooks) + self.assertTrue(cb_list._should_call_test_batch_hooks) + self.assertTrue(cb_list._should_call_predict_batch_hooks) + + model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) + model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) + model.predict(x, batch_size=10, callbacks=[my_cb], verbose=0) + + self.assertEqual(my_cb.train_batches, 2) + self.assertEqual(my_cb.test_batches, 1) + self.assertEqual(my_cb.predict_batches, 1) + + my_cb = MyCallback(should_run=False) + cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) + self.assertFalse(cb_list._should_call_train_batch_hooks) + self.assertFalse(cb_list._should_call_test_batch_hooks) + self.assertFalse(cb_list._should_call_predict_batch_hooks) + + model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) + model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) + model.predict(x, batch_size=10, callbacks=[my_cb], verbose=0) + + self.assertEqual(my_cb.train_batches, 0) + self.assertEqual(my_cb.test_batches, 0) + self.assertEqual(my_cb.predict_batches, 0) + + 
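+ # (CallbackList checks `_implements_*_batch_hooks` so that per-batch + # hook dispatch can be skipped entirely when no callback needs it; the + # built-in callbacks in the next test are expected to opt out.)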
@test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_default_callbacks_do_not_call_batch_hooks(self): + model = keras.Sequential([keras.layers.Dense(1)]) + log_dir = self.get_temp_dir() + cb_list = keras.callbacks.CallbackList( + [ + keras.callbacks.TensorBoard(log_dir, profile_batch=0), + keras.callbacks.ModelCheckpoint(log_dir), + ], + add_progbar=True, + model=model, + verbose=2, + epochs=3, + ) + self.assertLen(cb_list.callbacks, 3) + self.assertFalse(cb_list._should_call_train_batch_hooks) + self.assertFalse(cb_list._should_call_test_batch_hooks) + self.assertFalse(cb_list._should_call_predict_batch_hooks) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_change_tf_functions_during_fit(self): + class ChangeFunctions(keras.callbacks.Callback): + def on_epoch_end(self, epochs, logs=None): + def new_fn(iterator): + raise ValueError("New function substituted successfully.") + + self.model.train_function = new_fn + self.model.test_function = new_fn + self.model.predict_function = new_fn + + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + + x, y = np.ones((10, 10)), np.ones((10, 1)) + with self.assertRaisesRegex(ValueError, "New function "): + model.fit( + x, y, batch_size=2, epochs=2, callbacks=[ChangeFunctions()] + ) + with self.assertRaisesRegex(ValueError, "New function "): + model.evaluate(x, y, batch_size=2) + with self.assertRaisesRegex(ValueError, "New function "): + model.predict(x, batch_size=2) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_stop_training_batch_level(self): + class MyCallback(keras.callbacks.Callback): + def __init__(self): + super().__init__() + self.batch_counter = 0 + + def on_train_batch_end(self, batch, logs=None): + self.batch_counter += 1 + if batch == 2: + self.model.stop_training = True + + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + x, y = np.ones((10, 10)), np.ones((10, 1)) + my_cb = MyCallback() + # Will run 5 batches if `stop_training` doesn't work. 
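+ # (Stopping at batch index 2 means batches 0, 1 and 2 run, so the + # counter should read exactly 3.)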
+ model.fit(x, y, batch_size=2, callbacks=[my_cb]) + self.assertEqual(my_cb.batch_counter, 3) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_built_in_callback_order(self): + class CustomCallback(keras.callbacks.Callback): + pass + + class TestingCallbackList(keras.callbacks.CallbackList): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if ( + (not isinstance(self.callbacks[0], CustomCallback)) + or ( + not isinstance( + self.callbacks[1], keras.callbacks.History + ) + ) + or ( + not isinstance( + self.callbacks[2], keras.callbacks.ProgbarLogger + ) + ) + ): + raise AssertionError( + f"Callback order unexpected: {self.callbacks}" + ) + + with mock.patch.object( + keras.callbacks, "CallbackList", TestingCallbackList + ): + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + custom_callback = CustomCallback() + model.fit( + np.ones((10, 10)), + np.ones((10, 1)), + epochs=5, + callbacks=[custom_callback], + ) - def _run(self, *args, logs=None): - logs = logs or args[-1] - logs['tf'] = 2 - - class AssertNumpyLogs(CallAllHooks): - - def _run(self, *args, logs=None): - logs = logs or args[-1] - assert_dict_equal(logs, {'all': 0, 'numpy': 1, 'tf': 2}) - - class AssertTensorFlowLogs(AssertNumpyLogs): - - def __init__(self): - super().__init__() - self._supports_tf_logs = True - - cb_list = keras.callbacks.CallbackList([ - MutateNumpyLogs(), - MutateTensorFlowLogs(), - AssertNumpyLogs(), - AssertTensorFlowLogs() - ]) - - assert len(cb_list.callbacks) == 4 - cb_list.on_epoch_begin(0, logs={'all': 0}) - cb_list.on_epoch_end(0, logs={'all': 0}) - cb_list.on_predict_batch_begin(0, logs={'all': 0}) - cb_list.on_predict_batch_end(0, logs={'all': 0}) - cb_list.on_predict_begin(logs={'all': 0}) - cb_list.on_predict_end(logs={'all': 0}) - cb_list.on_test_batch_begin(0, logs={'all': 0}) - cb_list.on_test_batch_end(0, logs={'all': 0}) - cb_list.on_test_begin(logs={'all': 0}) - cb_list.on_test_end(logs={'all': 0}) - cb_list.on_train_batch_begin(0, logs={'all': 0}) - cb_list.on_train_batch_end(0, logs={'all': 0}) - cb_list.on_train_begin(logs={'all': 0}) - cb_list.on_train_end(logs={'all': 0}) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_implements_batch_hooks_override(self): - - class MyCallback(keras.callbacks.Callback): - - def __init__(self, should_run=True): - self.should_run = should_run - self.train_batches = 0 - self.test_batches = 0 - self.predict_batches = 0 - - def on_train_batch_end(self, batch, logs=None): - self.train_batches += 1 - - def on_test_batch_end(self, batch, logs=None): - self.test_batches += 1 - - def on_predict_batch_end(self, batch, logs=None): - self.predict_batches += 1 - - def _implements_train_batch_hooks(self): - return self.should_run - - def _implements_test_batch_hooks(self): - return self.should_run - - def _implements_predict_batch_hooks(self): - return self.should_run - - x, y = np.ones((10, 1)), np.ones((10, 1)) - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - - my_cb = MyCallback(should_run=True) - cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) - self.assertTrue(cb_list._should_call_train_batch_hooks) - self.assertTrue(cb_list._should_call_test_batch_hooks) - self.assertTrue(cb_list._should_call_predict_batch_hooks) - - model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) - model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) - model.predict(x, batch_size=10, callbacks=[my_cb], 
verbose=0) - - self.assertEqual(my_cb.train_batches, 2) - self.assertEqual(my_cb.test_batches, 1) - self.assertEqual(my_cb.predict_batches, 1) - - my_cb = MyCallback(should_run=False) - cb_list = keras.callbacks.CallbackList([my_cb], verbose=0) - self.assertFalse(cb_list._should_call_train_batch_hooks) - self.assertFalse(cb_list._should_call_test_batch_hooks) - self.assertFalse(cb_list._should_call_predict_batch_hooks) - - model.fit(x, y, epochs=2, batch_size=10, callbacks=[my_cb], verbose=0) - model.evaluate(x, y, batch_size=10, callbacks=[my_cb], verbose=0) - model.predict(x, batch_size=10, callbacks=[my_cb], verbose=0) - - self.assertEqual(my_cb.train_batches, 0) - self.assertEqual(my_cb.test_batches, 0) - self.assertEqual(my_cb.predict_batches, 0) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_default_callbacks_do_not_call_batch_hooks(self): - model = keras.Sequential([keras.layers.Dense(1)]) - log_dir = self.get_temp_dir() - cb_list = keras.callbacks.CallbackList([ - keras.callbacks.TensorBoard(log_dir, profile_batch=0), - keras.callbacks.ModelCheckpoint(log_dir), - ], - add_progbar=True, - model=model, - verbose=2, - epochs=3) - self.assertLen(cb_list.callbacks, 3) - self.assertFalse(cb_list._should_call_train_batch_hooks) - self.assertFalse(cb_list._should_call_test_batch_hooks) - self.assertFalse(cb_list._should_call_predict_batch_hooks) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_change_tf_functions_during_fit(self): - - class ChangeFunctions(keras.callbacks.Callback): - - def on_epoch_end(self, epochs, logs=None): - - def new_fn(iterator): - raise ValueError('New function substituted successfully.') - - self.model.train_function = new_fn - self.model.test_function = new_fn - self.model.predict_function = new_fn - - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - - x, y = np.ones((10, 10)), np.ones((10, 1)) - with self.assertRaisesRegexp(ValueError, 'New function '): - model.fit(x, y, batch_size=2, epochs=2, callbacks=[ChangeFunctions()]) - with self.assertRaisesRegexp(ValueError, 'New function '): - model.evaluate(x, y, batch_size=2) - with self.assertRaisesRegexp(ValueError, 'New function '): - model.predict(x, batch_size=2) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_stop_training_batch_level(self): - - class MyCallback(keras.callbacks.Callback): - - def __init__(self): - super().__init__() - self.batch_counter = 0 - - def on_train_batch_end(self, batch, logs=None): - self.batch_counter += 1 - if batch == 2: - self.model.stop_training = True - - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - x, y = np.ones((10, 10)), np.ones((10, 1)) - my_cb = MyCallback() - # Will run 5 batches if `stop_training` doesn't work. 
- model.fit(x, y, batch_size=2, callbacks=[my_cb]) - self.assertEqual(my_cb.batch_counter, 3) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_built_in_callback_order(self): - - class CustomCallback(keras.callbacks.Callback): - pass - - class TestingCallbackList(keras.callbacks.CallbackList): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if ((not isinstance(self.callbacks[0], CustomCallback)) or - (not isinstance(self.callbacks[1], keras.callbacks.History)) or - (not isinstance(self.callbacks[2], keras.callbacks.ProgbarLogger))): - raise AssertionError(f'Callback order unexpected: {self.callbacks}') - - with mock.patch.object( - keras.callbacks, 'CallbackList', TestingCallbackList): - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - custom_callback = CustomCallback() - model.fit(np.ones((10, 10)), np.ones((10, 1)), epochs=5, - callbacks=[custom_callback]) # A summary that was emitted during a test. Fields: # logdir: str. The logdir of the FileWriter to which the summary was # written. # tag: str. The name of the summary. -_ObservedSummary = collections.namedtuple('_ObservedSummary', ('logdir', 'tag')) +_ObservedSummary = collections.namedtuple("_ObservedSummary", ("logdir", "tag")) class _SummaryFile: - """A record of summary tags and the files to which they were written. + """A record of summary tags and the files to which they were written. - Fields `scalars`, `images`, `histograms`, and `tensors` are sets - containing `_ObservedSummary` values. - """ + Fields `scalars`, `images`, `histograms`, and `tensors` are sets + containing `_ObservedSummary` values. + """ - def __init__(self): - self.scalars = set() - self.images = set() - self.histograms = set() - self.tensors = set() - self.graph_defs = [] - self.convert_from_v2_summary_proto = False + def __init__(self): + self.scalars = set() + self.images = set() + self.histograms = set() + self.tensors = set() + self.graph_defs = [] + self.convert_from_v2_summary_proto = False def list_summaries(logdir): - """Read all summaries under the logdir into a `_SummaryFile`. - - Args: - logdir: A path to a directory that contains zero or more event - files, either as direct children or in transitive subdirectories. - Summaries in these events must only contain old-style scalars, - images, and histograms. Non-summary events, like `graph_def`s, are - ignored. - - Returns: - A `_SummaryFile` object reflecting all summaries written to any - event files in the logdir or any of its descendant directories. - - Raises: - ValueError: If an event file contains an summary of unexpected kind. - """ - result = _SummaryFile() - for (dirpath, _, filenames) in os.walk(logdir): - for filename in filenames: - if not filename.startswith('events.out.'): - continue - path = os.path.join(dirpath, filename) - for event in tf.compat.v1.train.summary_iterator(path): - if event.graph_def: - result.graph_defs.append(event.graph_def) - if not event.summary: # (e.g., it's a `graph_def` event) - continue - for value in event.summary.value: - tag = value.tag - # Case on the `value` rather than the summary metadata because - # the Keras callback uses `summary_ops_v2` to emit old-style - # summaries. See b/124535134. 
- kind = value.WhichOneof('value') - container = { - 'simple_value': result.scalars, - 'image': result.images, - 'histo': result.histograms, - 'tensor': result.tensors, - }.get(kind) - if container is None: - raise ValueError( - 'Unexpected summary kind %r in event file %s:\n%r' - % (kind, path, event)) - elif kind == 'tensor' and tag != 'keras': - # Convert the tf2 summary proto to old style for type checking. - plugin_name = value.metadata.plugin_data.plugin_name - container = { - 'images': result.images, - 'histograms': result.histograms, - 'scalars': result.scalars, - }.get(plugin_name) - if container is not None: - result.convert_from_v2_summary_proto = True - else: - container = result.tensors - container.add(_ObservedSummary(logdir=dirpath, tag=tag)) - return result + """Read all summaries under the logdir into a `_SummaryFile`. + + Args: + logdir: A path to a directory that contains zero or more event + files, either as direct children or in transitive subdirectories. + Summaries in these events must only contain old-style scalars, + images, and histograms. Non-summary events, like `graph_def`s, are + ignored. + + Returns: + A `_SummaryFile` object reflecting all summaries written to any + event files in the logdir or any of its descendant directories. + + Raises: + ValueError: If an event file contains a summary of an unexpected + kind. + """ + result = _SummaryFile() + for dirpath, _, filenames in os.walk(logdir): + for filename in filenames: + if not filename.startswith("events.out."): + continue + path = os.path.join(dirpath, filename) + for event in tf.compat.v1.train.summary_iterator(path): + if event.graph_def: + result.graph_defs.append(event.graph_def) + if not event.summary: # (e.g., it's a `graph_def` event) + continue + for value in event.summary.value: + tag = value.tag + # Case on the `value` rather than the summary metadata + # because the Keras callback uses `summary_ops_v2` to emit + # old-style summaries. See b/124535134. + kind = value.WhichOneof("value") + container = { + "simple_value": result.scalars, + "image": result.images, + "histo": result.histograms, + "tensor": result.tensors, + }.get(kind) + if container is None: + raise ValueError( + "Unexpected summary kind %r in event file %s:\n%r" + % (kind, path, event) + ) + elif kind == "tensor" and tag != "keras": + # Convert the tf2 summary proto to old style for type + # checking.
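+ # (V2 summaries are all serialized as generic `tensor` values; + # the originating plugin is recorded in + # `metadata.plugin_data.plugin_name`, which is mapped back to + # a scalar/image/histogram bucket here.)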
+ plugin_name = value.metadata.plugin_data.plugin_name + container = { + "images": result.images, + "histograms": result.histograms, + "scalars": result.scalars, + }.get(plugin_name) + if container is not None: + result.convert_from_v2_summary_proto = True + else: + container = result.tensors + container.add(_ObservedSummary(logdir=dirpath, tag=tag)) + return result @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes(always_skip_v1=True) class TestTensorBoardV2(test_combinations.TestCase): + def setUp(self): + super(TestTensorBoardV2, self).setUp() + self.logdir = os.path.join(self.get_temp_dir(), "tb") + self.train_dir = os.path.join(self.logdir, "train") + self.validation_dir = os.path.join(self.logdir, "validation") + + def _get_model(self, compile_model=True): + layers = [ + keras.layers.Conv2D(8, (3, 3)), + keras.layers.Flatten(), + keras.layers.Dense(1), + ] + model = test_utils.get_model_from_layers( + layers, input_shape=(10, 10, 1) + ) + if compile_model: + opt = gradient_descent.SGD(learning_rate=0.001) + model.compile( + opt, "mse", run_eagerly=test_utils.should_run_eagerly() + ) + return model - def setUp(self): - super(TestTensorBoardV2, self).setUp() - self.logdir = os.path.join(self.get_temp_dir(), 'tb') - self.train_dir = os.path.join(self.logdir, 'train') - self.validation_dir = os.path.join(self.logdir, 'validation') - - def _get_model(self, compile_model=True): - layers = [ - keras.layers.Conv2D(8, (3, 3)), - keras.layers.Flatten(), - keras.layers.Dense(1) - ] - model = test_utils.get_model_from_layers(layers, input_shape=(10, 10, 1)) - if compile_model: - opt = gradient_descent.SGD(learning_rate=0.001) - model.compile(opt, 'mse', run_eagerly=test_utils.should_run_eagerly()) - return model - - def test_TensorBoard_default_logdir(self): - """Regression test for cross-platform pathsep in default logdir.""" - os.chdir(self.get_temp_dir()) - - model = self._get_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard() # no logdir specified - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - - summary_file = list_summaries(logdir='.') - train_dir = os.path.join('.', 'logs', 'train') - validation_dir = os.path.join('.', 'logs', 'validation') - self.assertEqual( - summary_file.scalars, { - _ObservedSummary(logdir=train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=validation_dir, tag='epoch_loss'), - _ObservedSummary( - logdir=validation_dir, tag='evaluation_loss_vs_iterations'), - }) - - def test_TensorBoard_basic(self): - model = self._get_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard(self.logdir) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - - summary_file = list_summaries(self.logdir) - self.assertEqual( - summary_file.scalars, { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'), - _ObservedSummary( - logdir=self.validation_dir, - tag='evaluation_loss_vs_iterations'), - }) - - def test_TensorBoard_across_invocations(self): - """Regression test for summary writer resource use-after-free. 
- - See: - """ - model = self._get_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard(self.logdir) - - for _ in (1, 2): - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - - summary_file = list_summaries(self.logdir) - self.assertEqual( - summary_file.scalars, { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'), - _ObservedSummary( - logdir=self.validation_dir, - tag='evaluation_loss_vs_iterations'), - }) - - def test_TensorBoard_no_spurious_event_files(self): - model = self._get_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard(self.logdir) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - callbacks=[tb_cbk]) - - events_file_run_basenames = set() - for (dirpath, _, filenames) in os.walk(self.train_dir): - if any(fn.startswith('events.out.') for fn in filenames): - events_file_run_basenames.add(os.path.basename(dirpath)) - self.assertEqual(events_file_run_basenames, {'train'}) - - def test_TensorBoard_batch_metrics(self): - model = self._get_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard(self.logdir, update_freq=1) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - - summary_file = list_summaries(self.logdir) - self.assertEqual( - summary_file.scalars, - { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'), - _ObservedSummary( - logdir=self.validation_dir, - tag='evaluation_loss_vs_iterations'), - }, - ) + def test_TensorBoard_default_logdir(self): + """Regression test for cross-platform pathsep in default logdir.""" + os.chdir(self.get_temp_dir()) - def test_TensorBoard_learning_rate_schedules(self): - model = self._get_model(compile_model=False) - opt = gradient_descent.SGD(learning_rate_schedule.CosineDecay(0.01, 1)) - model.compile(opt, 'mse', run_eagerly=test_utils.should_run_eagerly()) - - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - callbacks=[keras.callbacks.TensorBoard(self.logdir)]) - - summary_file = list_summaries(self.logdir) - self.assertEqual( - summary_file.scalars, - { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.train_dir, tag='epoch_learning_rate'), - }, - ) + model = self._get_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard() # no logdir specified - def test_TensorBoard_global_step(self): - model = self._get_model(compile_model=False) - opt = gradient_descent.SGD(learning_rate_schedule.CosineDecay(0.01, 1)) - model.compile(opt, 'mse', run_eagerly=test_utils.should_run_eagerly()) + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + + summary_file = list_summaries(logdir=".") + train_dir = os.path.join(".", "logs", "train") + validation_dir = os.path.join(".", "logs", "validation") + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=train_dir, tag="epoch_loss"), + _ObservedSummary(logdir=validation_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=validation_dir, tag="evaluation_loss_vs_iterations" + ), + }, + ) + + def test_TensorBoard_basic(self): + model = self._get_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = 
keras.callbacks.TensorBoard(self.logdir) - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + + summary_file = list_summaries(self.logdir) + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary(logdir=self.validation_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.validation_dir, + tag="evaluation_loss_vs_iterations", + ), + }, + ) + + def test_TensorBoard_across_invocations(self): + """Regression test for summary writer resource use-after-free. + + See: + """ + model = self._get_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard(self.logdir) + + for _ in (1, 2): + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + + summary_file = list_summaries(self.logdir) + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary(logdir=self.validation_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.validation_dir, + tag="evaluation_loss_vs_iterations", + ), + }, + ) + + def test_TensorBoard_no_spurious_event_files(self): + model = self._get_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard(self.logdir) + + model.fit(x, y, batch_size=2, epochs=2, callbacks=[tb_cbk]) + + events_file_run_basenames = set() + for dirpath, _, filenames in os.walk(self.train_dir): + if any(fn.startswith("events.out.") for fn in filenames): + events_file_run_basenames.add(os.path.basename(dirpath)) + self.assertEqual(events_file_run_basenames, {"train"}) + + def test_TensorBoard_batch_metrics(self): + model = self._get_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard(self.logdir, update_freq=1) - model.fit( - x, - y, - batch_size=2, - epochs=2, - verbose=0, - callbacks=[ - keras.callbacks.TensorBoard( - self.logdir, - update_freq=1, - profile_batch=0, - write_steps_per_second=True) - ]) - - summary_file = list_summaries(self.logdir) - self.assertEqual( - summary_file.scalars, - { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.train_dir, tag='epoch_learning_rate'), - _ObservedSummary( - logdir=self.train_dir, tag='epoch_steps_per_second'), - _ObservedSummary( - logdir=self.train_dir, tag='batch_steps_per_second'), - }, - ) + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + + summary_file = list_summaries(self.logdir) + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="batch_loss"), + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary(logdir=self.validation_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.validation_dir, + tag="evaluation_loss_vs_iterations", + ), + }, + ) + + def test_TensorBoard_learning_rate_schedules(self): + model = self._get_model(compile_model=False) + opt = gradient_descent.SGD(learning_rate_schedule.CosineDecay(0.01, 1)) + model.compile(opt, "mse", run_eagerly=test_utils.should_run_eagerly()) + + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - def test_TensorBoard_weight_histograms(self): - model = self._get_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard(self.logdir, histogram_freq=1) - model_type = 
test_utils.get_model_type() - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - - self.assertEqual( - summary_file.scalars, - { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'), - _ObservedSummary( - logdir=self.validation_dir, - tag='evaluation_loss_vs_iterations'), - }, - ) - self.assertEqual( - self._strip_layer_names(summary_file.histograms, model_type), - { - _ObservedSummary(logdir=self.train_dir, tag='bias_0/histogram'), - _ObservedSummary(logdir=self.train_dir, tag='kernel_0/histogram'), - }, - ) + model.fit( + x, + y, + batch_size=2, + epochs=2, + callbacks=[keras.callbacks.TensorBoard(self.logdir)], + ) + + summary_file = list_summaries(self.logdir) + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.train_dir, tag="epoch_learning_rate" + ), + }, + ) + + def test_TensorBoard_global_step(self): + model = self._get_model(compile_model=False) + opt = gradient_descent.SGD(learning_rate_schedule.CosineDecay(0.01, 1)) + model.compile(opt, "mse", run_eagerly=test_utils.should_run_eagerly()) + + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - def test_TensorBoard_weight_images(self): - model = self._get_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, write_images=True) - model_type = test_utils.get_model_type() - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - - self.assertEqual( - summary_file.scalars, - { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'), - _ObservedSummary( - logdir=self.validation_dir, - tag='evaluation_loss_vs_iterations'), - }, - ) - self.assertEqual( - self._strip_layer_names(summary_file.histograms, model_type), - { - _ObservedSummary(logdir=self.train_dir, tag='bias_0/histogram'), - _ObservedSummary(logdir=self.train_dir, tag='kernel_0/histogram'), - }, - ) - if summary_file.convert_from_v2_summary_proto: - expected_image_summaries = { - _ObservedSummary(logdir=self.train_dir, tag='bias_0/image'), - _ObservedSummary(logdir=self.train_dir, tag='kernel_0/image'), - } - else: - expected_image_summaries = { - _ObservedSummary(logdir=self.train_dir, tag='bias_0/image/0'), - _ObservedSummary(logdir=self.train_dir, tag='kernel_0/image/0'), - _ObservedSummary(logdir=self.train_dir, tag='kernel_0/image/1'), - _ObservedSummary(logdir=self.train_dir, tag='kernel_0/image/2'), - } - self.assertEqual( - self._strip_layer_names(summary_file.images, model_type), - expected_image_summaries - ) + model.fit( + x, + y, + batch_size=2, + epochs=2, + verbose=0, + callbacks=[ + keras.callbacks.TensorBoard( + self.logdir, + update_freq=1, + profile_batch=0, + write_steps_per_second=True, + ) + ], + ) + + summary_file = list_summaries(self.logdir) + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="batch_loss"), + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.train_dir, tag="epoch_learning_rate" + ), + _ObservedSummary( + logdir=self.train_dir, tag="epoch_steps_per_second" + ), + _ObservedSummary( + logdir=self.train_dir, tag="batch_steps_per_second" + ), + }, + ) + + def 
test_TensorBoard_weight_histograms(self): + model = self._get_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard(self.logdir, histogram_freq=1) + model_type = test_utils.get_model_type() - def test_TensorBoard_projector_callback(self): - layers = [ - keras.layers.Embedding(10, 10, name='test_embedding'), - keras.layers.Dense(10, activation='relu'), - keras.layers.Dense(1, activation='sigmoid') - ] - model = test_utils.get_model_from_layers(layers, input_shape=(10,)) - model.compile( - optimizer='adam', - loss=keras.losses.BinaryCrossentropy(from_logits=True), - run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 10)), np.ones((10, 10)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, - embeddings_freq=1, - embeddings_metadata={'test_embedding': 'metadata.tsv'}) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - - with open(os.path.join(self.logdir, 'projector_config.pbtxt')) as f: - self.assertEqual(f.readlines(), [ - 'embeddings {\n', - (' tensor_name: ' - '"layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE"\n'), - ' metadata_path: "metadata.tsv"\n', '}\n' - ]) - - def test_custom_summary(self): - if not tf.executing_eagerly(): - self.skipTest('Custom summaries only supported in V2 code path.') - - def scalar_v2_mock(name, data, step=None): - """A reimplementation of the scalar plugin to avoid circular deps.""" - metadata = tf.compat.v1.SummaryMetadata() - # Should match value in tensorboard/plugins/scalar/metadata.py. - metadata.plugin_data.plugin_name = 'scalars' - with tf.summary.experimental.summary_scope( - name, 'scalar_summary', values=[data, step]) as (tag, _): - return tf.summary.write( - tag=tag, - tensor=tf.cast(data, 'float32'), - step=step, - metadata=metadata) - - class LayerWithSummary(keras.layers.Layer): - - def call(self, x): - scalar_v2_mock('custom_summary', tf.reduce_sum(x)) - return x - - model = test_utils.get_model_from_layers([LayerWithSummary()], - input_shape=(5,), - name='model') - - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - tb_cbk = keras.callbacks.TensorBoard(self.logdir, update_freq=1) - x, y = np.ones((10, 5)), np.ones((10, 5)) - model.fit(x, y, batch_size=2, validation_data=(x, y), callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - self.assertEqual( - summary_file.scalars, - { - _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'), - _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'), - _ObservedSummary( - logdir=self.validation_dir, - tag='evaluation_loss_vs_iterations'), - _ObservedSummary( - logdir=self.train_dir, - tag='model/layer_with_summary/custom_summary'), - _ObservedSummary( - logdir=self.validation_dir, - tag='model/layer_with_summary/custom_summary') - }, - ) + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary(logdir=self.validation_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.validation_dir, + tag="evaluation_loss_vs_iterations", + ), + }, + ) + self.assertEqual( + self._strip_layer_names(summary_file.histograms, model_type), + { + _ObservedSummary(logdir=self.train_dir, tag="bias_0/histogram"), + _ObservedSummary( + logdir=self.train_dir, tag="kernel_0/histogram" + ), + }, + ) + + def 
test_TensorBoard_weight_images(self): + model = self._get_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, write_images=True + ) + model_type = test_utils.get_model_type() - def _strip_layer_names(self, summaries, model_type): - """Deduplicate summary names modulo layer prefix. + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary(logdir=self.validation_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.validation_dir, + tag="evaluation_loss_vs_iterations", + ), + }, + ) + self.assertEqual( + self._strip_layer_names(summary_file.histograms, model_type), + { + _ObservedSummary(logdir=self.train_dir, tag="bias_0/histogram"), + _ObservedSummary( + logdir=self.train_dir, tag="kernel_0/histogram" + ), + }, + ) + if summary_file.convert_from_v2_summary_proto: + expected_image_summaries = { + _ObservedSummary(logdir=self.train_dir, tag="bias_0/image"), + _ObservedSummary(logdir=self.train_dir, tag="kernel_0/image"), + } + else: + expected_image_summaries = { + _ObservedSummary(logdir=self.train_dir, tag="bias_0/image/0"), + _ObservedSummary(logdir=self.train_dir, tag="kernel_0/image/0"), + _ObservedSummary(logdir=self.train_dir, tag="kernel_0/image/1"), + _ObservedSummary(logdir=self.train_dir, tag="kernel_0/image/2"), + } + self.assertEqual( + self._strip_layer_names(summary_file.images, model_type), + expected_image_summaries, + ) + + def test_TensorBoard_projector_callback(self): + layers = [ + keras.layers.Embedding(10, 10, name="test_embedding"), + keras.layers.Dense(10, activation="relu"), + keras.layers.Dense(1, activation="sigmoid"), + ] + model = test_utils.get_model_from_layers(layers, input_shape=(10,)) + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(from_logits=True), + run_eagerly=test_utils.should_run_eagerly(), + ) + x, y = np.ones((10, 10)), np.ones((10, 10)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, + embeddings_freq=1, + embeddings_metadata={"test_embedding": "metadata.tsv"}, + ) - This removes the first slash-component of each tag name: for - instance, "foo/bar/baz" becomes "bar/baz". + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + + with open(os.path.join(self.logdir, "projector_config.pbtxt")) as f: + self.assertEqual( + f.readlines(), + [ + "embeddings {\n", + " tensor_name: " + '"layer_with_weights-0/embeddings/.ATTRIBUTES/' + 'VARIABLE_VALUE"\n', + ' metadata_path: "metadata.tsv"\n', + "}\n", + ], + ) + + def test_custom_summary(self): + if not tf.executing_eagerly(): + self.skipTest("Custom summaries only supported in V2 code path.") + + def scalar_v2_mock(name, data, step=None): + """A reimplementation of the scalar plugin to avoid circular + deps.""" + metadata = tf.compat.v1.SummaryMetadata() + # Should match value in tensorboard/plugins/scalar/metadata.py. 
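+ # (TensorBoard uses this plugin name to route a generic tensor + # summary to the scalars dashboard.)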
+ metadata.plugin_data.plugin_name = "scalars" + with tf.summary.experimental.summary_scope( + name, "scalar_summary", values=[data, step] + ) as (tag, _): + return tf.summary.write( + tag=tag, + tensor=tf.cast(data, "float32"), + step=step, + metadata=metadata, + ) + + class LayerWithSummary(keras.layers.Layer): + def call(self, x): + scalar_v2_mock("custom_summary", tf.reduce_sum(x)) + return x + + model = test_utils.get_model_from_layers( + [LayerWithSummary()], input_shape=(5,), name="model" + ) + + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + tb_cbk = keras.callbacks.TensorBoard(self.logdir, update_freq=1) + x, y = np.ones((10, 5)), np.ones((10, 5)) + model.fit( + x, y, batch_size=2, validation_data=(x, y), callbacks=[tb_cbk] + ) + summary_file = list_summaries(self.logdir) + self.assertEqual( + summary_file.scalars, + { + _ObservedSummary(logdir=self.train_dir, tag="batch_loss"), + _ObservedSummary(logdir=self.train_dir, tag="epoch_loss"), + _ObservedSummary(logdir=self.validation_dir, tag="epoch_loss"), + _ObservedSummary( + logdir=self.validation_dir, + tag="evaluation_loss_vs_iterations", + ), + _ObservedSummary( + logdir=self.train_dir, + tag="model/layer_with_summary/custom_summary", + ), + _ObservedSummary( + logdir=self.validation_dir, + tag="model/layer_with_summary/custom_summary", + ), + }, + ) + + def _strip_layer_names(self, summaries, model_type): + """Deduplicate summary names modulo layer prefix. + + This removes the first slash-component of each tag name: for + instance, "foo/bar/baz" becomes "bar/baz". + + Args: + summaries: A `set` of `_ObservedSummary` values. + model_type: The model type currently being tested. + + Returns: + A new `set` of `_ObservedSummary` values with layer prefixes + removed. + """ + result = set() + for summary in summaries: + if "/" not in summary.tag: + raise ValueError(f"tag has no layer name: {summary.tag!r}") + start_from = 2 if "subclass" in model_type else 1 + new_tag = "/".join(summary.tag.split("/")[start_from:]) + result.add(summary._replace(tag=new_tag)) + return result + + def test_TensorBoard_invalid_argument(self): + with self.assertRaisesRegex(ValueError, "Unrecognized arguments"): + keras.callbacks.TensorBoard(wwrite_images=True) + + def test_TensorBoard_non_blocking(self): + model = keras.Sequential([keras.layers.Dense(1)]) + tb = keras.callbacks.TensorBoard(self.logdir) + self.assertTrue(tb._supports_tf_logs) + cb_list = keras.callbacks.CallbackList( + [tb], model=model, epochs=1, steps=100, verbose=0 + ) + + tensor = tf.convert_to_tensor(1.0) + + def mock_numpy(): + raise RuntimeError( + "If this error is seen, TensorBoard is causing a blocking " + "NumPy conversion." + ) + + with tf.compat.v1.test.mock.patch.object(tensor, "numpy", mock_numpy): + logs = {"metric": tensor} + + cb_list.on_train_begin(logs) + cb_list.on_epoch_begin(0, logs) + cb_list.on_train_batch_begin(0, logs) + cb_list.on_train_batch_end(0, logs) + cb_list.on_epoch_end(0, logs) + cb_list.on_train_end(logs) + + cb_list.on_test_begin(logs) + cb_list.on_test_batch_begin(0, logs) + cb_list.on_test_batch_end(0, logs) + cb_list.on_test_end(logs) + + cb_list.on_predict_begin(logs) + cb_list.on_predict_batch_begin(logs) + cb_list.on_predict_batch_end(logs) + cb_list.on_predict_end(logs) - Args: - summaries: A `set` of `_ObservedSummary` values. - model_type: The model type currently being tested. - Returns: - A new `set` of `_ObservedSummary` values with layer prefixes - removed. 
- """ - result = set() - for summary in summaries: - if '/' not in summary.tag: - raise ValueError('tag has no layer name: %r' % summary.tag) - start_from = 2 if 'subclass' in model_type else 1 - new_tag = '/'.join(summary.tag.split('/')[start_from:]) - result.add(summary._replace(tag=new_tag)) - return result +# Note that this test specifies model_type explicitly. +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class TestTensorBoardV2NonParameterizedTest(test_combinations.TestCase): + def setUp(self): + super(TestTensorBoardV2NonParameterizedTest, self).setUp() + self.logdir = os.path.join(self.get_temp_dir(), "tb") + self.train_dir = os.path.join(self.logdir, "train") + self.validation_dir = os.path.join(self.logdir, "validation") + + def _get_seq_model(self): + model = keras.models.Sequential( + [ + keras.layers.Conv2D(8, (3, 3), input_shape=(10, 10, 1)), + keras.layers.Flatten(), + keras.layers.Dense(1), + ] + ) + opt = gradient_descent.SGD(learning_rate=0.001) + model.compile(opt, "mse", run_eagerly=test_utils.should_run_eagerly()) + return model - def test_TensorBoard_invalid_argument(self): - with self.assertRaisesRegex(ValueError, 'Unrecognized arguments'): - keras.callbacks.TensorBoard(wwrite_images=True) + def _count_xplane_file(self, logdir): + profile_dir = os.path.join(logdir, "plugins", "profile") + count = 0 + for dirpath, dirnames, filenames in os.walk(profile_dir): + del dirpath # unused + del dirnames # unused + for filename in filenames: + if filename.endswith(".xplane.pb"): + count += 1 + return count + + def fitModelAndAssertKerasModelWritten(self, model): + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, write_graph=True, profile_batch=0 + ) + model.fit( + x, + y, + batch_size=2, + epochs=3, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + self.assertEqual( + summary_file.tensors, + { + _ObservedSummary(logdir=self.train_dir, tag="keras"), + }, + ) + if not model.run_eagerly: + # There should be one train graph + self.assertLen(summary_file.graph_defs, 1) + for graph_def in summary_file.graph_defs: + graph_def_str = str(graph_def) + + # All the model layers should appear in the graphs + for layer in model.layers: + if "input" not in layer.name: + self.assertIn(layer.name, graph_def_str) + + def test_TensorBoard_writeSequentialModel_noInputShape(self): + model = keras.models.Sequential( + [ + keras.layers.Conv2D(8, (3, 3)), + keras.layers.Flatten(), + keras.layers.Dense(1), + ] + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + self.fitModelAndAssertKerasModelWritten(model) + + def test_TensorBoard_writeSequentialModel_withInputShape(self): + model = keras.models.Sequential( + [ + keras.layers.Conv2D(8, (3, 3), input_shape=(10, 10, 1)), + keras.layers.Flatten(), + keras.layers.Dense(1), + ] + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + self.fitModelAndAssertKerasModelWritten(model) + + def test_TensorBoard_writeModel(self): + inputs = keras.layers.Input([10, 10, 1]) + x = keras.layers.Conv2D(8, (3, 3), activation="relu")(inputs) + x = keras.layers.Flatten()(x) + x = keras.layers.Dense(1)(x) + model = keras.models.Model(inputs=inputs, outputs=[x]) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + self.fitModelAndAssertKerasModelWritten(model) + + def test_TensorBoard_autoTrace(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), 
np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, profile_batch=1, write_graph=False + ) - def test_TensorBoard_non_blocking(self): - model = keras.Sequential([keras.layers.Dense(1)]) - tb = keras.callbacks.TensorBoard(self.logdir) - self.assertTrue(tb._supports_tf_logs) - cb_list = keras.callbacks.CallbackList([tb], - model=model, - epochs=1, - steps=100, - verbose=0) + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.tensors, + { + _ObservedSummary(logdir=self.train_dir, tag="batch_1"), + }, + ) + self.assertEqual(1, self._count_xplane_file(logdir=self.logdir)) + + def test_TensorBoard_autoTrace_outerProfiler(self): + """Runs a profiler session that interferes with the callback's one. + + The callback will not generate a profile but execution will proceed + without crashing due to unhandled exceptions. + """ + tf.profiler.experimental.start(logdir="") + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, profile_batch=1, write_graph=False + ) - tensor = tf.convert_to_tensor(1.) + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + tf.profiler.experimental.stop(save=False) + + self.assertEqual( + summary_file.tensors, + { + _ObservedSummary(logdir=self.train_dir, tag="batch_1"), + }, + ) + self.assertEqual(0, self._count_xplane_file(logdir=self.train_dir)) + + def test_TensorBoard_autoTrace_tagNameWithBatchNum(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, profile_batch=2, write_graph=False + ) - def mock_numpy(): - raise RuntimeError( - 'If this error is seen, TensorBoard is causing a blocking ' - 'NumPy conversion.') + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.tensors, + { + _ObservedSummary(logdir=self.train_dir, tag="batch_2"), + }, + ) + self.assertEqual(1, self._count_xplane_file(logdir=self.logdir)) + + def test_TensorBoard_autoTrace_profileBatchRangeSingle(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch="2,2", + write_graph=False, + ) - with tf.compat.v1.test.mock.patch.object(tensor, 'numpy', mock_numpy): - logs = {'metric': tensor} + model.fit( + x, + y, + batch_size=3, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.tensors, + { + # Trace will be logged once at the batch it stops profiling. 
+ _ObservedSummary(logdir=self.train_dir, tag="batch_2"), + }, + ) + self.assertEqual(1, self._count_xplane_file(logdir=self.logdir)) + + def test_TensorBoard_autoTrace_profileBatchRangeTwice(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch="10,10", + write_graph=False, + ) - cb_list.on_train_begin(logs) - cb_list.on_epoch_begin(0, logs) - cb_list.on_train_batch_begin(0, logs) - cb_list.on_train_batch_end(0, logs) - cb_list.on_epoch_end(0, logs) - cb_list.on_train_end(logs) + model.fit( + x, + y, + batch_size=3, + epochs=10, + validation_data=(x, y), + callbacks=[tb_cbk], + ) - cb_list.on_test_begin(logs) - cb_list.on_test_batch_begin(0, logs) - cb_list.on_test_batch_end(0, logs) - cb_list.on_test_end(logs) + time.sleep(1) # Avoids the second profile over-writing the first. - cb_list.on_predict_begin(logs) - cb_list.on_predict_batch_begin(logs) - cb_list.on_predict_batch_end(logs) - cb_list.on_predict_end(logs) + model.fit( + x, + y, + batch_size=3, + epochs=10, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + self.assertEqual(2, self._count_xplane_file(logdir=self.logdir)) + + # Test case that replicates a GitHub issue. + # https://github.com/tensorflow/tensorflow/issues/37543 + def test_TensorBoard_autoTrace_profileTwiceGraphMode(self): + tf.compat.v1.disable_eager_execution() + inp = keras.Input((1,)) + out = keras.layers.Dense(units=1)(inp) + model = keras.Model(inp, out) + + model.compile(gradient_descent.SGD(1), "mse") + + logdir = os.path.join(self.get_temp_dir(), "tb1") + model.fit( + np.zeros((64, 1)), + np.zeros((64, 1)), + batch_size=32, + callbacks=[keras.callbacks.TensorBoard(logdir, profile_batch=1)], + ) + # Verifies trace exists in the first logdir. + self.assertEqual(1, self._count_xplane_file(logdir=logdir)) + logdir = os.path.join(self.get_temp_dir(), "tb2") + model.fit( + np.zeros((64, 1)), + np.zeros((64, 1)), + batch_size=32, + callbacks=[keras.callbacks.TensorBoard(logdir, profile_batch=2)], + ) + # Verifies trace exists in the second logdir. + self.assertEqual(1, self._count_xplane_file(logdir=logdir)) + + def test_TensorBoard_autoTrace_profileBatchRange(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch="1,3", + write_graph=False, + ) + model.fit( + x, + y, + batch_size=4, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.tensors, + { + # Trace will be logged once at the batch it stops profiling. + _ObservedSummary(logdir=self.train_dir, tag="batch_3"), + }, + ) + self.assertEqual(1, self._count_xplane_file(logdir=self.logdir)) + + def test_TensorBoard_autoTrace_profileInvalidBatchRange(self): + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch="-1,3", + write_graph=False, + ) -# Note that this test specifies model_type explicitly. 
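The profiling tests above cover the three ways `profile_batch` can be supplied. A short sketch of the corresponding user-facing configuration, with a hypothetical logdir:

```python
import keras

# Profile only the second batch (the historical default)...
tb_single = keras.callbacks.TensorBoard(log_dir="/tmp/logs", profile_batch=2)

# ...or a closed batch range, written as "start,stop" as in the tests above.
tb_range = keras.callbacks.TensorBoard(log_dir="/tmp/logs", profile_batch="1,3")

# Setting profile_batch=0 disables profiling entirely.
tb_off = keras.callbacks.TensorBoard(log_dir="/tmp/logs", profile_batch=0)
```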
-@test_combinations.run_all_keras_modes(always_skip_v1=True) -class TestTensorBoardV2NonParameterizedTest(test_combinations.TestCase): + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch="1,None", + write_graph=False, + ) - def setUp(self): - super(TestTensorBoardV2NonParameterizedTest, self).setUp() - self.logdir = os.path.join(self.get_temp_dir(), 'tb') - self.train_dir = os.path.join(self.logdir, 'train') - self.validation_dir = os.path.join(self.logdir, 'validation') - - def _get_seq_model(self): - model = keras.models.Sequential([ - keras.layers.Conv2D(8, (3, 3), input_shape=(10, 10, 1)), - keras.layers.Flatten(), - keras.layers.Dense(1), - ]) - opt = gradient_descent.SGD(learning_rate=0.001) - model.compile( - opt, - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - return model - - def _count_trace_file(self, logdir): - profile_dir = os.path.join(logdir, 'plugins', 'profile') - count = 0 - for (dirpath, dirnames, filenames) in os.walk(profile_dir): - del dirpath # unused - del dirnames # unused - for filename in filenames: - if filename.endswith('.trace.json.gz'): - count += 1 - return count - - def fitModelAndAssertKerasModelWritten(self, model): - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard(self.logdir, - write_graph=True, - profile_batch=0) - model.fit( - x, - y, - batch_size=2, - epochs=3, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - self.assertEqual( - summary_file.tensors, - { - _ObservedSummary(logdir=self.train_dir, tag='keras'), - }, - ) - if not model.run_eagerly: - # There should be one train graph - self.assertLen(summary_file.graph_defs, 1) - for graph_def in summary_file.graph_defs: - graph_def_str = str(graph_def) - - # All the model layers should appear in the graphs - for layer in model.layers: - if 'input' not in layer.name: - self.assertIn(layer.name, graph_def_str) - - def test_TensorBoard_writeSequentialModel_noInputShape(self): - model = keras.models.Sequential([ - keras.layers.Conv2D(8, (3, 3)), - keras.layers.Flatten(), - keras.layers.Dense(1), - ]) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - self.fitModelAndAssertKerasModelWritten(model) - - def test_TensorBoard_writeSequentialModel_withInputShape(self): - model = keras.models.Sequential([ - keras.layers.Conv2D(8, (3, 3), input_shape=(10, 10, 1)), - keras.layers.Flatten(), - keras.layers.Dense(1), - ]) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - self.fitModelAndAssertKerasModelWritten(model) - - def test_TensorBoard_writeModel(self): - inputs = keras.layers.Input([10, 10, 1]) - x = keras.layers.Conv2D(8, (3, 3), activation='relu')(inputs) - x = keras.layers.Flatten()(x) - x = keras.layers.Dense(1)(x) - model = keras.models.Model(inputs=inputs, outputs=[x]) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - self.fitModelAndAssertKerasModelWritten(model) - - def test_TensorBoard_autoTrace(self): - model = self._get_seq_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch=1, write_graph=False) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - - self.assertEqual( - summary_file.tensors, - { - _ObservedSummary(logdir=self.train_dir, tag=u'batch_1'), - }, - ) - 
self.assertEqual(1, self._count_trace_file(logdir=self.logdir)) + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch="6,5", + write_graph=False, + ) - def test_TensorBoard_autoTrace_outerProfiler(self): - """Runs a profiler session that interferes with the one from the callback. + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch=-1, + write_graph=False, + ) + + def test_TensorBoard_autoTrace_profile_batch_largerThanBatchCount(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch=10000, + write_graph=False, + ) - The callback will not generate a profile but execution will proceed without - crashing due to unhandled exceptions. - """ - tf.profiler.experimental.start(logdir='') - model = self._get_seq_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch=1, write_graph=False) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - tf.profiler.experimental.stop(save=False) - - self.assertEqual( - summary_file.tensors, - { - _ObservedSummary(logdir=self.train_dir, tag=u'batch_1'), - }, - ) - self.assertEqual(0, self._count_trace_file(logdir=self.train_dir)) - - def test_TensorBoard_autoTrace_tagNameWithBatchNum(self): - model = self._get_seq_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch=2, write_graph=False) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - - self.assertEqual( - summary_file.tensors, - { - _ObservedSummary(logdir=self.train_dir, tag=u'batch_2'), - }, - ) - self.assertEqual(1, self._count_trace_file(logdir=self.logdir)) - - def test_TensorBoard_autoTrace_profileBatchRangeSingle(self): - model = self._get_seq_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch='2,2', write_graph=False) - - model.fit( - x, - y, - batch_size=3, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - - self.assertEqual( - summary_file.tensors, - { - # Trace will be logged once at the batch it stops profiling. - _ObservedSummary(logdir=self.train_dir, tag=u'batch_2'), - }, - ) - self.assertEqual(1, self._count_trace_file(logdir=self.logdir)) - - def test_TensorBoard_autoTrace_profileBatchRangeTwice(self): - model = self._get_seq_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch='10,10', write_graph=False) - - model.fit( - x, - y, - batch_size=3, - epochs=10, - validation_data=(x, y), - callbacks=[tb_cbk]) - - time.sleep(1) # Avoids the second profile over-writing the first. - - model.fit( - x, - y, - batch_size=3, - epochs=10, - validation_data=(x, y), - callbacks=[tb_cbk]) - self.assertEqual(2, self._count_trace_file(logdir=self.logdir)) - - # Test case that replicates a Github issue. 
- # https://github.com/tensorflow/tensorflow/issues/37543 - def test_TensorBoard_autoTrace_profileTwiceGraphMode(self): - tf.compat.v1.disable_eager_execution() - inp = keras.Input((1,)) - out = keras.layers.Dense(units=1)(inp) - model = keras.Model(inp, out) - - model.compile(gradient_descent.SGD(1), 'mse') - - logdir = os.path.join(self.get_temp_dir(), 'tb1') - model.fit( - np.zeros((64, 1)), - np.zeros((64, 1)), - batch_size=32, - callbacks=[keras.callbacks.TensorBoard(logdir, profile_batch=1)], - ) - # Verifies trace exists in the first logdir. - self.assertEqual(1, self._count_trace_file(logdir=logdir)) - logdir = os.path.join(self.get_temp_dir(), 'tb2') - model.fit( - np.zeros((64, 1)), - np.zeros((64, 1)), - batch_size=32, - callbacks=[keras.callbacks.TensorBoard(logdir, profile_batch=2)], - ) - # Verifies trace exists in the second logdir. - self.assertEqual(1, self._count_trace_file(logdir=logdir)) - - def test_TensorBoard_autoTrace_profileBatchRange(self): - model = self._get_seq_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch='1,3', write_graph=False) - - model.fit( - x, - y, - batch_size=4, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - - self.assertEqual( - summary_file.tensors, - { - # Trace will be logged once at the batch it stops profiling. - _ObservedSummary(logdir=self.train_dir, tag=u'batch_3'), - }, - ) - self.assertEqual(1, self._count_trace_file(logdir=self.logdir)) - - def test_TensorBoard_autoTrace_profileInvalidBatchRange(self): - with self.assertRaises(ValueError): - keras.callbacks.TensorBoard( - self.logdir, - histogram_freq=1, - profile_batch='-1,3', - write_graph=False) - - with self.assertRaises(ValueError): - keras.callbacks.TensorBoard( - self.logdir, - histogram_freq=1, - profile_batch='1,None', - write_graph=False) - - with self.assertRaises(ValueError): - keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch='6,5', write_graph=False) - - with self.assertRaises(ValueError): - keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch=-1, write_graph=False) - - def test_TensorBoard_autoTrace_profile_batch_largerThanBatchCount(self): - model = self._get_seq_model() - x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) - tb_cbk = keras.callbacks.TensorBoard( - self.logdir, histogram_freq=1, profile_batch=10000, write_graph=False) - - model.fit( - x, - y, - batch_size=2, - epochs=2, - validation_data=(x, y), - callbacks=[tb_cbk]) - summary_file = list_summaries(self.logdir) - - # Enabled trace only on the 10000th batch, thus it should be empty. - self.assertEmpty(summary_file.tensors) - self.assertEqual(0, self._count_trace_file(logdir=self.train_dir)) + model.fit( + x, + y, + batch_size=2, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk], + ) + summary_file = list_summaries(self.logdir) + # Enabled trace only on the 10000th batch, thus it should be empty. 
+ self.assertEmpty(summary_file.tensors) + self.assertEqual(0, self._count_xplane_file(logdir=self.train_dir)) -class MostRecentlyModifiedFileMatchingPatternTest(tf.test.TestCase): - def test_get_most_recently_modified_file_matching_pattern(self): - file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5' - test_dir = self.get_temp_dir() - path_pattern = os.path.join(test_dir, file_pattern) - file_paths = [ - os.path.join(test_dir, file_name) for file_name in - ['f.batch03epoch02.h5', 'f.batch02epoch02.h5', 'f.batch01epoch01.h5'] - ] - for file_path in file_paths: - with open(file_path, 'w') as f: - # Ensure there are some intervals between file creation. - time.sleep(2) - f.write('foo bar') - # Ensure the files have been actually written. - self.assertEqual( - set([ +class MostRecentlyModifiedFileMatchingPatternTest(tf.test.TestCase): + def test_get_most_recently_modified_file_matching_pattern(self): + file_pattern = "f.batch{batch:02d}epoch{epoch:02d}.h5" + test_dir = self.get_temp_dir() + path_pattern = os.path.join(test_dir, file_pattern) + file_paths = [ os.path.join(test_dir, file_name) - for file_name in os.listdir(test_dir) - ]), set(file_paths)) - self.assertEqual( - keras.callbacks.ModelCheckpoint(None) - ._get_most_recently_modified_file_matching_pattern(path_pattern), - file_paths[-1]) - - def test_some_file_not_matching_pattern(self): - file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5' - test_dir = self.get_temp_dir() - path_pattern = os.path.join(test_dir, file_pattern) - file_paths = [ - os.path.join(test_dir, file_name) for file_name in - ['f.batch03epoch02.h5', 'f.batch02epoch02.h5', 'f.baatch01epoch01.h5'] - ] - for file_path in file_paths: - with open(file_path, 'w') as f: - # Ensure there are some intervals between file creation. - time.sleep(2) - f.write('foo bar') - self.assertEqual( - keras.callbacks.ModelCheckpoint(None) - ._get_most_recently_modified_file_matching_pattern(path_pattern), - file_paths[-2]) - - def test_get_same_file_if_file_name_equals_pattern(self): - file_name = 'f.batch02.h5' - test_dir = self.get_temp_dir() - file_path = os.path.join(test_dir, file_name) - with open(file_path, 'w') as f: - f.write('foo bar') - self.assertEqual(os.path.join(test_dir, os.listdir(test_dir)[0]), file_path) - self.assertEqual( - keras.callbacks.ModelCheckpoint( - None)._get_most_recently_modified_file_matching_pattern(file_path), - file_path) - - def test_get_none_if_file_does_not_exist(self): - file_name = 'f.batch02.h5' - test_dir = self.get_temp_dir() - file_path = os.path.join(test_dir, file_name) - self.assertLen(os.listdir(test_dir), 0) - self.assertEqual( - keras.callbacks.ModelCheckpoint( - None)._get_most_recently_modified_file_matching_pattern(file_path), - None) - - def test_using_checkpoint_management_latest_checkpoint(self): - file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}' - ckpt_file_name = 'f.batchXepochY' - test_dir = self.get_temp_dir() - path_pattern = os.path.join(test_dir, file_pattern) - ckpt_file_path = os.path.join(test_dir, ckpt_file_name) - with open(ckpt_file_path, 'w') as f: - f.write('dummy ckpt') - tf.__internal__.train.update_checkpoint_state( - test_dir, ckpt_file_path) - - file_paths = [ - os.path.join(test_dir, file_name) - for file_name in ['f.batch03epoch02', 'f.batch02epoch02'] - ] - for file_path in file_paths: - with open(file_path, 'w') as f: - f.write('foo bar') - - # The result returned from checkpoint_management.latest_checkpoint takes - # priority, so even if it was written earlier, we should still return that. 
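These pattern-matching tests relate to how `ModelCheckpoint` resumes from the most recent checkpoint. A hedged sketch of the user-facing side; all paths are hypothetical:

```python
import keras
import tensorflow as tf

# `{epoch}` and logged metrics such as `{val_loss}` are formatted into the
# file name at save time; the mtime-based matcher tested here recovers the
# most recent such file when training restarts.
ckpt_cb = keras.callbacks.ModelCheckpoint(
    filepath="/tmp/ckpts/weights.{epoch:02d}-{val_loss:.2f}.h5",
    save_weights_only=True,
)

# For TF-format checkpoints recorded in a `checkpoint` state file,
# tf.train.latest_checkpoint reports the newest entry, which these tests
# give priority over mtime-based pattern matching.
latest = tf.train.latest_checkpoint("/tmp/ckpts")
```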
- self.assertEqual( - keras.callbacks.ModelCheckpoint(None) - ._get_most_recently_modified_file_matching_pattern(path_pattern), - ckpt_file_path) - + for file_name in [ + "f.batch03epoch02.h5", + "f.batch02epoch02.h5", + "f.batch01epoch01.h5", + ] + ] + for file_path in file_paths: + with open(file_path, "w") as f: + # Ensure there are some intervals between file creation. + time.sleep(2) + f.write("foo bar") + # Ensure the files have been actually written. + self.assertEqual( + set( + [ + os.path.join(test_dir, file_name) + for file_name in os.listdir(test_dir) + ] + ), + set(file_paths), + ) + self.assertEqual( + keras.callbacks.ModelCheckpoint( + None + )._get_most_recently_modified_file_matching_pattern(path_pattern), + file_paths[-1], + ) + + def test_some_file_not_matching_pattern(self): + file_pattern = "f.batch{batch:02d}epoch{epoch:02d}.h5" + test_dir = self.get_temp_dir() + path_pattern = os.path.join(test_dir, file_pattern) + file_paths = [ + os.path.join(test_dir, file_name) + for file_name in [ + "f.batch03epoch02.h5", + "f.batch02epoch02.h5", + "f.baatch01epoch01.h5", + ] + ] + for file_path in file_paths: + with open(file_path, "w") as f: + # Ensure there are some intervals between file creation. + time.sleep(2) + f.write("foo bar") + self.assertEqual( + keras.callbacks.ModelCheckpoint( + None + )._get_most_recently_modified_file_matching_pattern(path_pattern), + file_paths[-2], + ) + + def test_get_same_file_if_file_name_equals_pattern(self): + file_name = "f.batch02.h5" + test_dir = self.get_temp_dir() + file_path = os.path.join(test_dir, file_name) + with open(file_path, "w") as f: + f.write("foo bar") + self.assertEqual( + os.path.join(test_dir, os.listdir(test_dir)[0]), file_path + ) + self.assertEqual( + keras.callbacks.ModelCheckpoint( + None + )._get_most_recently_modified_file_matching_pattern(file_path), + file_path, + ) + + def test_get_none_if_file_does_not_exist(self): + file_name = "f.batch02.h5" + test_dir = self.get_temp_dir() + file_path = os.path.join(test_dir, file_name) + self.assertEmpty(os.listdir(test_dir)) + self.assertEqual( + keras.callbacks.ModelCheckpoint( + None + )._get_most_recently_modified_file_matching_pattern(file_path), + None, + ) + + def test_using_checkpoint_management_latest_checkpoint(self): + file_pattern = "f.batch{batch:02d}epoch{epoch:02d}" + ckpt_file_name = "f.batchXepochY" + test_dir = self.get_temp_dir() + path_pattern = os.path.join(test_dir, file_pattern) + ckpt_file_path = os.path.join(test_dir, ckpt_file_name) + with open(ckpt_file_path, "w") as f: + f.write("dummy ckpt") + tf.__internal__.train.update_checkpoint_state(test_dir, ckpt_file_path) + + file_paths = [ + os.path.join(test_dir, file_name) + for file_name in ["f.batch03epoch02", "f.batch02epoch02"] + ] + for file_path in file_paths: + with open(file_path, "w") as f: + f.write("foo bar") -class SummaryOpsTest(tf.test.TestCase): + # The result returned from checkpoint_management.latest_checkpoint takes + # priority, so even if it was written earlier, we should still return + # that. 
+ self.assertEqual( + keras.callbacks.ModelCheckpoint( + None + )._get_most_recently_modified_file_matching_pattern(path_pattern), + ckpt_file_path, + ) - def tearDown(self): - super(SummaryOpsTest, self).tearDown() - tf.summary.trace_off() - - def keras_model(self, *args, **kwargs): - logdir = self.get_temp_dir() - writer = tf.summary.create_file_writer(logdir) - with writer.as_default(): - keras.callbacks.keras_model_summary(*args, **kwargs) - writer.close() - events = events_from_logdir(logdir) - # The first event contains no summary values. The written content goes to - # the second event. - return events[1] - - @test_utils.run_v2_only - def testKerasModel(self): - model = keras.Sequential( - [Dense(10, input_shape=(100,)), - Activation('relu', name='my_relu')]) - event = self.keras_model(name='my_name', data=model, step=1) - first_val = event.summary.value[0] - self.assertEqual(model.to_json(), first_val.tensor.string_val[0].decode()) - - @test_utils.run_v2_only - def testKerasModel_usesDefaultStep(self): - model = keras.Sequential( - [Dense(10, input_shape=(100,)), - Activation('relu', name='my_relu')]) - try: - tf.summary.experimental.set_step(42) - event = self.keras_model(name='my_name', data=model) - self.assertEqual(42, event.step) - finally: - # Reset to default state for other tests. - tf.summary.experimental.set_step(None) - - @test_utils.run_v2_only - def testKerasModel_subclass(self): - - class SimpleSubclass(keras.Model): - - def __init__(self): - super().__init__(name='subclass') - self.dense = Dense(10, input_shape=(100,)) - self.activation = Activation('relu', name='my_relu') - - def call(self, inputs): - x = self.dense(inputs) - return self.activation(x) - - # Intentionally erroring out at json serialization to test the warning. - def get_config(self): - raise NotImplementedError - model = SimpleSubclass() - with tf.compat.v1.test.mock.patch.object(logging, 'warning') as mock_log: - self.assertFalse( - keras.callbacks.keras_model_summary( - name='my_name', data=model, step=1)) - self.assertRegex( - str(mock_log.call_args), 'Model failed to serialize as JSON.') - - @test_utils.run_v2_only - def testKerasModel_otherExceptions(self): - model = keras.Sequential() - - with tf.compat.v1.test.mock.patch.object(model, 'to_json') as mock_to_json: - with tf.compat.v1.test.mock.patch.object(logging, 'warning') as mock_log: - mock_to_json.side_effect = Exception('oops') - self.assertFalse( - keras.callbacks.keras_model_summary( - name='my_name', data=model, step=1)) - self.assertRegex( - str(mock_log.call_args), - 'Model failed to serialize as JSON. Ignoring') +class SummaryOpsTest(tf.test.TestCase): + def tearDown(self): + super(SummaryOpsTest, self).tearDown() + tf.summary.trace_off() + + def keras_model(self, *args, **kwargs): + logdir = self.get_temp_dir() + writer = tf.summary.create_file_writer(logdir) + with writer.as_default(): + keras.callbacks.keras_model_summary(*args, **kwargs) + writer.close() + events = events_from_logdir(logdir) + # The first event contains no summary values. The written content goes + # to the second event. 
+ return events[1] + + @test_utils.run_v2_only + def testKerasModel(self): + model = keras.Sequential( + [Dense(10, input_shape=(100,)), Activation("relu", name="my_relu")] + ) + event = self.keras_model(name="my_name", data=model, step=1) + first_val = event.summary.value[0] + self.assertEqual( + model.to_json(), first_val.tensor.string_val[0].decode() + ) + + @test_utils.run_v2_only + def testKerasModel_usesDefaultStep(self): + model = keras.Sequential( + [Dense(10, input_shape=(100,)), Activation("relu", name="my_relu")] + ) + try: + tf.summary.experimental.set_step(42) + event = self.keras_model(name="my_name", data=model) + self.assertEqual(42, event.step) + finally: + # Reset to default state for other tests. + tf.summary.experimental.set_step(None) + + @test_utils.run_v2_only + def testKerasModel_subclass(self): + class SimpleSubclass(keras.Model): + def __init__(self): + super().__init__(name="subclass") + self.dense = Dense(10, input_shape=(100,)) + self.activation = Activation("relu", name="my_relu") + + def call(self, inputs): + x = self.dense(inputs) + return self.activation(x) + + # Intentionally erroring out at json serialization to test the + # warning. + def get_config(self): + raise NotImplementedError + + model = SimpleSubclass() + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_log: + self.assertFalse( + keras.callbacks.keras_model_summary( + name="my_name", data=model, step=1 + ) + ) + self.assertRegex( + str(mock_log.call_args), "Model failed to serialize as JSON." + ) + + @test_utils.run_v2_only + def testKerasModel_otherExceptions(self): + model = keras.Sequential() + + with tf.compat.v1.test.mock.patch.object( + model, "to_json" + ) as mock_to_json: + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_log: + mock_to_json.side_effect = Exception("oops") + self.assertFalse( + keras.callbacks.keras_model_summary( + name="my_name", data=model, step=1 + ) + ) + self.assertRegex( + str(mock_log.call_args), + "Model failed to serialize as JSON. Ignoring", + ) def events_from_file(filepath): - """Returns all events in a single event file. + """Returns all events in a single event file. - Args: - filepath: Path to the event file. + Args: + filepath: Path to the event file. - Returns: - A list of all tf.Event protos in the event file. - """ - result = [] - raw_dataset = tf.data.TFRecordDataset([filepath]) - for raw_record in raw_dataset.take(10): - event = tf.compat.v1.Event() - event.ParseFromString(raw_record.numpy()) - result.append(event) - return result + Returns: + A list of all tf.Event protos in the event file. + """ + result = [] + raw_dataset = tf.data.TFRecordDataset([filepath]) + for raw_record in raw_dataset.take(10): + event = tf.compat.v1.Event() + event.ParseFromString(raw_record.numpy()) + result.append(event) + return result def events_from_logdir(logdir): - """Returns all events in the single eventfile in logdir. + """Returns all events in the single eventfile in logdir. - Args: - logdir: The directory in which the single event file is sought. + Args: + logdir: The directory in which the single event file is sought. - Returns: - A list of all tf.Event protos from the single event file. + Returns: + A list of all tf.Event protos from the single event file. - Raises: - AssertionError: If logdir does not contain exactly one file. 
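The `keras_model_summary` calls above write the model's JSON config into the event file so TensorBoard can render a conceptual graph. A minimal sketch of direct usage, mirroring the `keras_model` helper in these tests; the logdir is hypothetical:

```python
import tensorflow as tf
import keras
from keras import callbacks

model = keras.Sequential([keras.layers.Dense(10, input_shape=(100,))])
writer = tf.summary.create_file_writer("/tmp/logs")  # hypothetical logdir
with writer.as_default():
    # Per the tests above, a falsy return means the model could not be
    # serialized as JSON (e.g. a subclassed model without get_config).
    wrote = callbacks.keras_model_summary(name="my_name", data=model, step=1)
writer.close()
```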
- """ - assert tf.compat.v1.gfile.Exists(logdir) - files = tf.compat.v1.gfile.ListDirectory(logdir) - assert len(files) == 1, 'Found not exactly one file in logdir: %s' % files - return events_from_file(os.path.join(logdir, files[0])) + Raises: + AssertionError: If logdir does not contain exactly one file. + """ + assert tf.compat.v1.gfile.Exists(logdir) + files = tf.compat.v1.gfile.ListDirectory(logdir) + assert len(files) == 1, f"Found not exactly one file in logdir: {files}" + return events_from_file(os.path.join(logdir, files[0])) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/callbacks_v1.py b/keras/callbacks_v1.py index e09297fcd3ff..013b7bcadef9 100644 --- a/keras/callbacks_v1.py +++ b/keras/callbacks_v1.py @@ -12,463 +12,517 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-import-not-at-top -# pylint: disable=g-classes-have-attributes -"""Callbacks: utilities called at certain points during model training.""" -import tensorflow.compat.v2 as tf + +"""Callbacks: utilities called at certain points during model training.""" import os + import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import callbacks + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -@keras_export(v1=['keras.callbacks.TensorBoard']) +@keras_export(v1=["keras.callbacks.TensorBoard"]) class TensorBoard(callbacks.TensorBoard): - # pylint: disable=line-too-long - """Enable visualizations for TensorBoard. - - TensorBoard is a visualization tool provided with TensorFlow. - - This callback logs events for TensorBoard, including: - * Metrics summary plots - * Training graph visualization - * Activation histograms - * Sampled profiling - - If you have installed TensorFlow with pip, you should be able - to launch TensorBoard from the command line: - - ```sh - tensorboard --logdir=path_to_your_logs - ``` - - You can find more information about TensorBoard - [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard). - - Args: - log_dir: the path of the directory where to save the log files to be - parsed by TensorBoard. - histogram_freq: frequency (in epochs) at which to compute activation and - weight histograms for the layers of the model. If set to 0, histograms - won't be computed. Validation data (or split) must be specified for - histogram visualizations. - write_graph: whether to visualize the graph in TensorBoard. The log file - can become quite large when write_graph is set to True. - write_grads: whether to visualize gradient histograms in TensorBoard. - `histogram_freq` must be greater than 0. - batch_size: size of batch of inputs to feed to the network for histograms - computation. - write_images: whether to write model weights to visualize as image in - TensorBoard. - embeddings_freq: frequency (in epochs) at which selected embedding layers - will be saved. If set to 0, embeddings won't be computed. Data to be - visualized in TensorBoard's Embedding tab must be passed as - `embeddings_data`. - embeddings_layer_names: a list of names of layers to keep eye on. If None - or empty list all the embedding layer will be watched. - embeddings_metadata: a dictionary which maps layer name to a file name in - which metadata for this embedding layer is saved. 
- [Here are details]( - https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional) - about metadata files format. In case if the same metadata file is - used for all embedding layers, string can be passed. - embeddings_data: data to be embedded at layers specified in - `embeddings_layer_names`. Numpy array (if the model has a single input) - or list of Numpy arrays (if the model has multiple inputs). Learn more - about embeddings [in this guide]( - https://www.tensorflow.org/programmers_guide/embedding). - update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`, - writes the losses and metrics to TensorBoard after each batch. The same - applies for `'epoch'`. If using an integer, let's say `1000`, the - callback will write the metrics and losses to TensorBoard every 1000 - samples. Note that writing too frequently to TensorBoard can slow down - your training. - profile_batch: Profile the batch to sample compute characteristics. By - default, it will profile the second batch. Set profile_batch=0 to - disable profiling. - - Raises: - ValueError: If histogram_freq is set and no validation data is provided. - - @compatibility(eager) - Using the `TensorBoard` callback will work when eager execution is enabled, - with the restriction that outputting histogram summaries of weights and - gradients is not supported. Consequently, `histogram_freq` will be ignored. - @end_compatibility - """ - - # pylint: enable=line-too-long - - def __init__(self, - log_dir='./logs', - histogram_freq=0, - batch_size=32, - write_graph=True, - write_grads=False, - write_images=False, - embeddings_freq=0, - embeddings_layer_names=None, - embeddings_metadata=None, - embeddings_data=None, - update_freq='epoch', - profile_batch=2): - # Don't call super's init since it is an eager-only version. - callbacks.Callback.__init__(self) - self.log_dir = log_dir - self.histogram_freq = histogram_freq - if self.histogram_freq and tf.executing_eagerly(): - logging.warning( - UserWarning('Weight and gradient histograms not supported for eager' - 'execution, setting `histogram_freq` to `0`.')) - self.histogram_freq = 0 - self.merged = None - self.write_graph = write_graph - self.write_grads = write_grads - self.write_images = write_images - self.batch_size = batch_size - self._current_batch = 0 - self._total_batches_seen = 0 - self._total_val_batches_seen = 0 - self.embeddings_freq = embeddings_freq - self.embeddings_layer_names = embeddings_layer_names - self.embeddings_metadata = embeddings_metadata - self.embeddings_data = embeddings_data - if update_freq == 'batch': - self.update_freq = 1 - else: - self.update_freq = update_freq - self._samples_seen = 0 - self._samples_seen_at_last_write = 0 - # TODO(fishx): Add a link to the full profiler tutorial. - self._profile_batch = profile_batch - # True when the profiler was successfully started by this callback. - # We track the status here to make sure callbacks do not interfere with - # each other. The callback will only stop the profiler it started. - self._profiler_started = False - - # TensorBoard should only write summaries on the chief when in a - # Multi-Worker setting. 
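As the docstring being reflowed here explains, `update_freq` accepts `'batch'`, `'epoch'`, or a sample count. A brief sketch against the v1 export path declared above (`@keras_export(v1=["keras.callbacks.TensorBoard"])`); the logdir is hypothetical:

```python
import tensorflow.compat.v2 as tf

TensorBoard = tf.compat.v1.keras.callbacks.TensorBoard

tb_epoch = TensorBoard(log_dir="/tmp/logs", update_freq="epoch")  # per epoch
tb_batch = TensorBoard(log_dir="/tmp/logs", update_freq="batch")  # per batch
tb_n = TensorBoard(log_dir="/tmp/logs", update_freq=1000)  # every 1000 samples
```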
- self._chief_worker_only = True - - def _init_writer(self, model): - """Sets file writer.""" - if tf.executing_eagerly(): - self.writer = tf.summary.create_file_writer(self.log_dir) - if not model.run_eagerly and self.write_graph: - with self.writer.as_default(): - tf.summary.graph(backend.get_graph()) - elif self.write_graph: - self.writer = tf.compat.v1.summary.FileWriter( - self.log_dir, backend.get_graph()) - else: - self.writer = tf.compat.v1.summary.FileWriter(self.log_dir) - - def _make_histogram_ops(self, model): - """Defines histogram ops when histogram_freq > 0.""" - # only make histogram summary op if it hasn't already been made - if self.histogram_freq and self.merged is None: - for layer in self.model.layers: - for weight in layer.weights: - mapped_weight_name = weight.name.replace(':', '_') - tf.compat.v1.summary.histogram(mapped_weight_name, weight) - if self.write_images: - w_img = tf.compat.v1.squeeze(weight) - shape = tuple(w_img.shape) - if len(shape) == 2: # dense layer kernel case - if shape[0] > shape[1]: - w_img = tf.compat.v1.transpose(w_img) - shape = tuple(w_img.shape) - w_img = tf.reshape(w_img, [1, shape[0], shape[1], 1]) - elif len(shape) == 3: # convnet case - if backend.image_data_format() == 'channels_last': - # switch to channels_first to display - # every kernel as a separate image - w_img = tf.compat.v1.transpose(w_img, perm=[2, 0, 1]) - shape = tuple(w_img.shape) - w_img = tf.reshape(w_img, [shape[0], shape[1], shape[2], 1]) - elif len(shape) == 1: # bias case - w_img = tf.reshape(w_img, [1, shape[0], 1, 1]) - else: - # not possible to handle 3D convnets etc. - continue - - shape = tuple(w_img.shape) - assert len(shape) == 4 and shape[-1] in [1, 3, 4] - tf.compat.v1.summary.image(mapped_weight_name, w_img) - - if self.write_grads: - for weight in layer.trainable_weights: - mapped_weight_name = weight.name.replace(':', '_') - grads = model.optimizer.get_gradients(model.total_loss, weight) - - def is_indexed_slices(grad): - return type(grad).__name__ == 'IndexedSlices' - - grads = [ - grad.values if is_indexed_slices(grad) else grad - for grad in grads - ] - tf.compat.v1.summary.histogram('{}_grad'.format(mapped_weight_name), grads) - - if hasattr(layer, 'output'): - if isinstance(layer.output, list): - for i, output in enumerate(layer.output): - tf.compat.v1.summary.histogram('{}_out_{}'.format(layer.name, i), output) - else: - tf.compat.v1.summary.histogram('{}_out'.format(layer.name), layer.output) - - def set_model(self, model): - """Sets Keras model and creates summary ops.""" - - self.model = model - self._init_writer(model) - # histogram summaries only enabled in graph mode - if not tf.executing_eagerly(): - self._make_histogram_ops(model) - self.merged = tf.compat.v1.summary.merge_all() - - # If both embedding_freq and embeddings_data are available, we will - # visualize embeddings. - if self.embeddings_freq and self.embeddings_data is not None: - # Avoid circular dependency. - from keras.engine import training_utils_v1 # pylint: disable=g-import-not-at-top - self.embeddings_data = training_utils_v1.standardize_input_data( - self.embeddings_data, model.input_names) - - # If embedding_layer_names are not provided, get all of the embedding - # layers from the model. 
- embeddings_layer_names = self.embeddings_layer_names - if not embeddings_layer_names: - embeddings_layer_names = [ - layer.name - for layer in self.model.layers - if type(layer).__name__ == 'Embedding' - ] - - self.assign_embeddings = [] - embeddings_vars = {} - - self.batch_id = batch_id = tf.compat.v1.placeholder(tf.int32) - self.step = step = tf.compat.v1.placeholder(tf.int32) - - for layer in self.model.layers: - if layer.name in embeddings_layer_names: - embedding_input = self.model.get_layer(layer.name).output - embedding_size = np.prod(embedding_input.shape[1:]) - embedding_input = tf.reshape(embedding_input, - (step, int(embedding_size))) - shape = (self.embeddings_data[0].shape[0], int(embedding_size)) - embedding = tf.Variable( - tf.zeros(shape), name=layer.name + '_embedding') - embeddings_vars[layer.name] = embedding - batch = tf.compat.v1.assign(embedding[batch_id:batch_id + step], - embedding_input) - self.assign_embeddings.append(batch) - - self.saver = tf.compat.v1.train.Saver(list(embeddings_vars.values())) - - # Create embeddings_metadata dictionary - if isinstance(self.embeddings_metadata, str): - embeddings_metadata = { - layer_name: self.embeddings_metadata - for layer_name in embeddings_vars.keys() - } - else: - # If embedding_metadata is already a dictionary - embeddings_metadata = self.embeddings_metadata - try: - from tensorboard.plugins import projector - except ImportError: - raise ImportError('Failed to import TensorBoard. Please make sure that ' - 'TensorBoard integration is complete."') + """Enable visualizations for TensorBoard. - # TODO(psv): Add integration tests to test embedding visualization - # with TensorBoard callback. We are unable to write a unit test for this - # because TensorBoard dependency assumes TensorFlow package is installed. - config = projector.ProjectorConfig() - for layer_name, tensor in embeddings_vars.items(): - embedding = config.embeddings.add() - embedding.tensor_name = tensor.name + TensorBoard is a visualization tool provided with TensorFlow. - if (embeddings_metadata is not None and - layer_name in embeddings_metadata): - embedding.metadata_path = embeddings_metadata[layer_name] + This callback logs events for TensorBoard, including: + * Metrics summary plots + * Training graph visualization + * Activation histograms + * Sampled profiling - projector.visualize_embeddings(self.writer, config) + If you have installed TensorFlow with pip, you should be able + to launch TensorBoard from the command line: - def _fetch_callback(self, summary): - self.writer.add_summary(summary, self._total_val_batches_seen) - self._total_val_batches_seen += 1 + ```sh + tensorboard --logdir=path_to_your_logs + ``` - def _write_custom_summaries(self, step, logs=None): - """Writes metrics out as custom scalar summaries. + You can find more information about TensorBoard + [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard). Args: - step: the global step to use for TensorBoard. - logs: dict. Keys are scalar summary names, values are - NumPy scalars. 
- - """ - logs = logs or {} - if tf.executing_eagerly(): - # use v2 summary ops - with self.writer.as_default(), tf.summary.record_if(True): - for name, value in logs.items(): - if isinstance(value, np.ndarray): - value = value.item() - tf.summary.scalar(name, value, step=step) - else: - # use FileWriter from v1 summary - for name, value in logs.items(): - if isinstance(value, np.ndarray): - value = value.item() - summary = tf.compat.v1.Summary() - summary_value = summary.value.add() - summary_value.simple_value = value - summary_value.tag = name - self.writer.add_summary(summary, step) - self.writer.flush() - - def on_train_batch_begin(self, batch, logs=None): - if self._total_batches_seen == self._profile_batch - 1: - self._start_profiler() - - def on_train_batch_end(self, batch, logs=None): - return self.on_batch_end(batch, logs) - - def on_test_begin(self, logs=None): - pass - - def on_test_end(self, logs=None): - pass - - def on_batch_end(self, batch, logs=None): - """Writes scalar summaries for metrics on every training batch. - - Performs profiling if current batch is in profiler_batches. + log_dir: the path of the directory where to save the log files to be + parsed by TensorBoard. + histogram_freq: frequency (in epochs) at which to compute activation and + weight histograms for the layers of the model. If set to 0, histograms + won't be computed. Validation data (or split) must be specified for + histogram visualizations. + write_graph: whether to visualize the graph in TensorBoard. The log file + can become quite large when write_graph is set to True. + write_grads: whether to visualize gradient histograms in TensorBoard. + `histogram_freq` must be greater than 0. + batch_size: size of batch of inputs to feed to the network for + histograms computation. + write_images: whether to write model weights to visualize as image in + TensorBoard. + embeddings_freq: frequency (in epochs) at which selected embedding + layers will be saved. If set to 0, embeddings won't be computed. Data + to be visualized in TensorBoard's Embedding tab must be passed as + `embeddings_data`. + embeddings_layer_names: a list of names of layers to keep eye on. If + None or empty list all the embedding layer will be watched. + embeddings_metadata: a dictionary which maps layer name to a file name + in which metadata for this embedding layer is saved. + [Here are details]( + https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional) + about metadata files format. In case if the same metadata file is + used for all embedding layers, string can be passed. + embeddings_data: data to be embedded at layers specified in + `embeddings_layer_names`. Numpy array (if the model has a single + input) or list of Numpy arrays (if the model has multiple inputs). + Learn more about embeddings [in this guide]( + https://www.tensorflow.org/programmers_guide/embedding). + update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`, + writes the losses and metrics to TensorBoard after each batch. The + same applies for `'epoch'`. If using an integer, let's say `1000`, the + callback will write the metrics and losses to TensorBoard every 1000 + samples. Note that writing too frequently to TensorBoard can slow down + your training. + profile_batch: Profile the batch to sample compute characteristics. By + default, it will profile the second batch. Set profile_batch=0 to + disable profiling. + + Raises: + ValueError: If histogram_freq is set and no validation data is provided. 
+ + @compatibility(eager) + Using the `TensorBoard` callback will work when eager execution is enabled, + with the restriction that outputting histogram summaries of weights and + gradients is not supported. Consequently, `histogram_freq` will be ignored. + @end_compatibility """ - # Don't output batch_size and batch number as TensorBoard summaries - logs = logs or {} - self._samples_seen += logs.get('size', 1) - samples_seen_since = self._samples_seen - self._samples_seen_at_last_write - if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq: - batch_logs = {('batch_' + k): v - for k, v in logs.items() - if k not in ['batch', 'size', 'num_steps']} - self._write_custom_summaries(self._total_batches_seen, batch_logs) - self._samples_seen_at_last_write = self._samples_seen - self._total_batches_seen += 1 - self._stop_profiler() - - def on_train_begin(self, logs=None): - pass - - def on_epoch_begin(self, epoch, logs=None): - """Add histogram op to Model eval_function callbacks, reset batch count.""" - - # check if histogram summary should be run for this epoch - if self.histogram_freq and epoch % self.histogram_freq == 0: - # pylint: disable=protected-access - # add the histogram summary op if it should run this epoch - self.model._make_test_function() - if self.merged not in self.model.test_function.fetches: - self.model.test_function.fetches.append(self.merged) - self.model.test_function.fetch_callbacks[ - self.merged] = self._fetch_callback - # pylint: enable=protected-access - - def on_epoch_end(self, epoch, logs=None): - """Checks if summary ops should run next epoch, logs scalar summaries.""" - - # don't output batch_size and - # batch number as TensorBoard summaries - logs = {('epoch_' + k): v - for k, v in logs.items() - if k not in ['batch', 'size', 'num_steps']} - if self.update_freq == 'epoch': - step = epoch - else: - step = self._samples_seen - self._write_custom_summaries(step, logs) - - # pop the histogram summary op after each epoch - if self.histogram_freq: - # pylint: disable=protected-access - if self.merged in self.model.test_function.fetches: - self.model.test_function.fetches.remove(self.merged) - if self.merged in self.model.test_function.fetch_callbacks: - self.model.test_function.fetch_callbacks.pop(self.merged) - # pylint: enable=protected-access - - if self.embeddings_data is None and self.embeddings_freq: - raise ValueError('To visualize embeddings, embeddings_data must ' - 'be provided.') - - if self.embeddings_freq and self.embeddings_data is not None: - if epoch % self.embeddings_freq == 0: - # We need a second forward-pass here because we're passing - # the `embeddings_data` explicitly. This design allows to pass - # arbitrary data as `embeddings_data` and results from the fact - # that we need to know the size of the `tf.Variable`s which - # hold the embeddings in `set_model`. At this point, however, - # the `validation_data` is not yet set. 
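For the embedding options described in the docstring above, a hedged construction sketch; the layer name, metadata file, and data are all hypothetical:

```python
import numpy as np
import tensorflow.compat.v2 as tf

x_data = np.ones((100, 10))  # hypothetical inputs to embed

tb = tf.compat.v1.keras.callbacks.TensorBoard(
    log_dir="/tmp/logs",
    embeddings_freq=1,  # save the selected embeddings every epoch
    embeddings_layer_names=["test_embedding"],  # None watches all of them
    embeddings_metadata={"test_embedding": "metadata.tsv"},
    embeddings_data=x_data,
)
```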
- - embeddings_data = self.embeddings_data - n_samples = embeddings_data[0].shape[0] - i = 0 - sess = backend.get_session() - while i < n_samples: - step = min(self.batch_size, n_samples - i) - batch = slice(i, i + step) - - if isinstance(self.model.input, list): - feed_dict = { - model_input: embeddings_data[idx][batch] - for idx, model_input in enumerate(self.model.input) + + def __init__( + self, + log_dir="./logs", + histogram_freq=0, + batch_size=32, + write_graph=True, + write_grads=False, + write_images=False, + embeddings_freq=0, + embeddings_layer_names=None, + embeddings_metadata=None, + embeddings_data=None, + update_freq="epoch", + profile_batch=2, + ): + # Don't call super's init since it is an eager-only version. + callbacks.Callback.__init__(self) + self.log_dir = log_dir + self.histogram_freq = histogram_freq + if self.histogram_freq and tf.executing_eagerly(): + logging.warning( + UserWarning( + "Weight and gradient histograms not supported for eager" + "execution, setting `histogram_freq` to `0`." + ) + ) + self.histogram_freq = 0 + self.merged = None + self.write_graph = write_graph + self.write_grads = write_grads + self.write_images = write_images + self.batch_size = batch_size + self._current_batch = 0 + self._total_batches_seen = 0 + self._total_val_batches_seen = 0 + self.embeddings_freq = embeddings_freq + self.embeddings_layer_names = embeddings_layer_names + self.embeddings_metadata = embeddings_metadata + self.embeddings_data = embeddings_data + if update_freq == "batch": + self.update_freq = 1 + else: + self.update_freq = update_freq + self._samples_seen = 0 + self._samples_seen_at_last_write = 0 + # TODO(fishx): Add a link to the full profiler tutorial. + self._profile_batch = profile_batch + # True when the profiler was successfully started by this callback. + # We track the status here to make sure callbacks do not interfere with + # each other. The callback will only stop the profiler it started. + self._profiler_started = False + + # TensorBoard should only write summaries on the chief when in a + # Multi-Worker setting. 
+ self._chief_worker_only = True + + def _init_writer(self, model): + """Sets file writer.""" + if tf.executing_eagerly(): + self.writer = tf.summary.create_file_writer(self.log_dir) + if not model.run_eagerly and self.write_graph: + with self.writer.as_default(): + tf.summary.graph(backend.get_graph()) + elif self.write_graph: + self.writer = tf.compat.v1.summary.FileWriter( + self.log_dir, backend.get_graph() + ) + else: + self.writer = tf.compat.v1.summary.FileWriter(self.log_dir) + + def _make_histogram_ops(self, model): + """Defines histogram ops when histogram_freq > 0.""" + # only make histogram summary op if it hasn't already been made + if self.histogram_freq and self.merged is None: + for layer in self.model.layers: + for weight in layer.weights: + mapped_weight_name = weight.name.replace(":", "_") + tf.compat.v1.summary.histogram(mapped_weight_name, weight) + if self.write_images: + w_img = tf.compat.v1.squeeze(weight) + shape = tuple(w_img.shape) + if len(shape) == 2: # dense layer kernel case + if shape[0] > shape[1]: + w_img = tf.compat.v1.transpose(w_img) + shape = tuple(w_img.shape) + w_img = tf.reshape( + w_img, [1, shape[0], shape[1], 1] + ) + elif len(shape) == 3: # convnet case + if backend.image_data_format() == "channels_last": + # switch to channels_first to display + # every kernel as a separate image + w_img = tf.compat.v1.transpose( + w_img, perm=[2, 0, 1] + ) + shape = tuple(w_img.shape) + w_img = tf.reshape( + w_img, [shape[0], shape[1], shape[2], 1] + ) + elif len(shape) == 1: # bias case + w_img = tf.reshape(w_img, [1, shape[0], 1, 1]) + else: + # not possible to handle 3D convnets etc. + continue + + shape = tuple(w_img.shape) + assert len(shape) == 4 and shape[-1] in [1, 3, 4] + tf.compat.v1.summary.image(mapped_weight_name, w_img) + + if self.write_grads: + for weight in layer.trainable_weights: + mapped_weight_name = weight.name.replace(":", "_") + grads = model.optimizer.get_gradients( + model.total_loss, weight + ) + + def is_indexed_slices(grad): + return type(grad).__name__ == "IndexedSlices" + + grads = [ + grad.values if is_indexed_slices(grad) else grad + for grad in grads + ] + tf.compat.v1.summary.histogram( + f"{mapped_weight_name}_grad", grads + ) + + if hasattr(layer, "output"): + if isinstance(layer.output, list): + for i, output in enumerate(layer.output): + tf.compat.v1.summary.histogram( + f"{layer.name}_out_{i}", output + ) + else: + tf.compat.v1.summary.histogram( + f"{layer.name}_out", layer.output + ) + + def set_model(self, model): + """Sets Keras model and creates summary ops.""" + + self.model = model + self._init_writer(model) + # histogram summaries only enabled in graph mode + if not tf.executing_eagerly(): + self._make_histogram_ops(model) + self.merged = tf.compat.v1.summary.merge_all() + + # If both embedding_freq and embeddings_data are available, we will + # visualize embeddings. + if self.embeddings_freq and self.embeddings_data is not None: + # Avoid circular dependency. + from keras.engine import ( + training_utils_v1, + ) + + self.embeddings_data = training_utils_v1.standardize_input_data( + self.embeddings_data, model.input_names + ) + + # If embedding_layer_names are not provided, get all of the + # embedding layers from the model. 
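The `write_images` branch above massages each weight into the 4-D `[batch, height, width, channels]` layout that `tf.compat.v1.summary.image` expects. A standalone sketch of the dense-kernel case; the shapes are hypothetical:

```python
import tensorflow.compat.v2 as tf

kernel = tf.random.normal([64, 32])  # hypothetical dense kernel (in, out)
w_img = tf.squeeze(kernel)
if w_img.shape[0] > w_img.shape[1]:
    # Orient the longer axis as image height, as the callback does.
    w_img = tf.transpose(w_img)
# A batch of one single-channel image: [1, height, width, 1].
w_img = tf.reshape(w_img, [1, w_img.shape[0], w_img.shape[1], 1])
assert len(w_img.shape) == 4 and w_img.shape[-1] in (1, 3, 4)
```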
+            embeddings_layer_names = self.embeddings_layer_names
+            if not embeddings_layer_names:
+                embeddings_layer_names = [
+                    layer.name
+                    for layer in self.model.layers
+                    if type(layer).__name__ == "Embedding"
+                ]
+
+            self.assign_embeddings = []
+            embeddings_vars = {}
+
+            self.batch_id = batch_id = tf.compat.v1.placeholder(tf.int32)
+            self.step = step = tf.compat.v1.placeholder(tf.int32)
+
+            for layer in self.model.layers:
+                if layer.name in embeddings_layer_names:
+                    embedding_input = self.model.get_layer(layer.name).output
+                    embedding_size = np.prod(embedding_input.shape[1:])
+                    embedding_input = tf.reshape(
+                        embedding_input, (step, int(embedding_size))
+                    )
+                    shape = (
+                        self.embeddings_data[0].shape[0],
+                        int(embedding_size),
+                    )
+                    embedding = tf.Variable(
+                        tf.zeros(shape), name=layer.name + "_embedding"
+                    )
+                    embeddings_vars[layer.name] = embedding
+                    batch = tf.compat.v1.assign(
+                        embedding[batch_id : batch_id + step], embedding_input
+                    )
+                    self.assign_embeddings.append(batch)
+
+            self.saver = tf.compat.v1.train.Saver(
+                list(embeddings_vars.values())
+            )
+
+            # Create embeddings_metadata dictionary
+            if isinstance(self.embeddings_metadata, str):
+                embeddings_metadata = {
+                    layer_name: self.embeddings_metadata
+                    for layer_name in embeddings_vars.keys()
+                }
+            else:
+                # If embedding_metadata is already a dictionary
+                embeddings_metadata = self.embeddings_metadata
+
+            try:
+                # isort: off
+                from tensorboard.plugins import projector
+            except ImportError:
+                raise ImportError(
+                    "Failed to import TensorBoard. Please make sure that "
+                    "TensorBoard integration is complete."
+                )
+
+            # TODO(psv): Add integration tests to test embedding visualization
+            # with TensorBoard callback. We are unable to write a unit test for
+            # this because TensorBoard dependency assumes TensorFlow package is
+            # installed.
+            config = projector.ProjectorConfig()
+            for layer_name, tensor in embeddings_vars.items():
+                embedding = config.embeddings.add()
+                embedding.tensor_name = tensor.name
+
+                if (
+                    embeddings_metadata is not None
+                    and layer_name in embeddings_metadata
+                ):
+                    embedding.metadata_path = embeddings_metadata[layer_name]
+
+            projector.visualize_embeddings(self.writer, config)
+
+    def _fetch_callback(self, summary):
+        self.writer.add_summary(summary, self._total_val_batches_seen)
+        self._total_val_batches_seen += 1
+
+    def _write_custom_summaries(self, step, logs=None):
+        """Writes metrics out as custom scalar summaries.
+
+        Args:
+            step: the global step to use for TensorBoard.
+            logs: dict. Keys are scalar summary names, values are
+                NumPy scalars.
+
+        """
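Another aside (the `_write_custom_summaries` body continues below): the embedding-projector plumbing in `set_model` above is driven entirely by constructor arguments. A hedged sketch follows; the layer name "embedding", the metadata path, and `x_train` are assumptions, and a string `embeddings_metadata` fans out to every embedding layer, as the dictionary branch above shows:

```python
# Hedged sketch of the embedding-projector arguments handled above; the
# layer name "embedding", the metadata path, and x_train are assumptions.
from keras import callbacks_v1

tsb = callbacks_v1.TensorBoard(
    log_dir="/tmp/tb_logs",
    embeddings_freq=1,  # re-run the assign/save pass every epoch
    embeddings_layer_names=["embedding"],  # None -> every Embedding layer
    embeddings_metadata="metadata.tsv",  # a str fans out to all layers
    embeddings_data=x_train,  # sliced into batch_size chunks at epoch end
)
```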
+        logs = logs or {}
+        if tf.executing_eagerly():
+            # use v2 summary ops
+            with self.writer.as_default(), tf.summary.record_if(True):
+                for name, value in logs.items():
+                    if isinstance(value, np.ndarray):
+                        value = value.item()
+                    tf.summary.scalar(name, value, step=step)
+        else:
+            # use FileWriter from v1 summary
+            for name, value in logs.items():
+                if isinstance(value, np.ndarray):
+                    value = value.item()
+                summary = tf.compat.v1.Summary()
+                summary_value = summary.value.add()
+                summary_value.simple_value = value
+                summary_value.tag = name
+                self.writer.add_summary(summary, step)
+        self.writer.flush()
+
+    def on_train_batch_begin(self, batch, logs=None):
+        if self._total_batches_seen == self._profile_batch - 1:
+            self._start_profiler()
+
+    def on_train_batch_end(self, batch, logs=None):
+        return self.on_batch_end(batch, logs)
+
+    def on_test_begin(self, logs=None):
+        pass
+
+    def on_test_end(self, logs=None):
+        pass
+
+    def on_batch_end(self, batch, logs=None):
+        """Writes scalar summaries for metrics on every training batch.
+
+        Performs profiling if the current batch matches `profile_batch`.
+        """
+        # Don't output batch_size and batch number as TensorBoard summaries
+        logs = logs or {}
+        self._samples_seen += logs.get("size", 1)
+        samples_seen_since = (
+            self._samples_seen - self._samples_seen_at_last_write
+        )
+        if (
+            self.update_freq != "epoch"
+            and samples_seen_since >= self.update_freq
+        ):
+            batch_logs = {
+                ("batch_" + k): v
+                for k, v in logs.items()
+                if k not in ["batch", "size", "num_steps"]
+            }
-          else:
-            feed_dict = {self.model.input: embeddings_data[0][batch]}
-
-          feed_dict.update({self.batch_id: i, self.step: step})
-
-          if not isinstance(backend.learning_phase(), int):
-            feed_dict[backend.learning_phase()] = False
-
-          sess.run(self.assign_embeddings, feed_dict=feed_dict)
-          self.saver.save(sess,
-                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
-                          epoch)
-
-          i += self.batch_size
-
-  def on_train_end(self, logs=None):
-    self._stop_profiler()
-    self.writer.close()
-
-  def _start_profiler(self):
-    """Starts the profiler if currently inactive."""
-    if self._profiler_started:
-      return
-    try:
-      tf.profiler.experimental.start(logdir=self.log_dir)
-      self._profiler_started = True
-    except tf.errors.AlreadyExistsError as e:
-      # Profiler errors should not be fatal.
-      logging.error('Failed to start profiler: %s', e.message)
-
-  def _stop_profiler(self):
-    """Stops the profiler if currently active."""
-    if not self._profiler_started:
-      return
-    try:
-      tf.profiler.experimental.stop()
-    except tf.errors.UnavailableError as e:
-      # Profiler errors should not be fatal.
-      logging.error('Failed to stop profiler: %s', e.message)
-    finally:
-      self._profiler_started = False
+            self._write_custom_summaries(self._total_batches_seen, batch_logs)
+            self._samples_seen_at_last_write = self._samples_seen
+        self._total_batches_seen += 1
+        self._stop_profiler()
+
+    def on_train_begin(self, logs=None):
+        pass
+
+    def on_epoch_begin(self, epoch, logs=None):
+        """Add histogram op to Model eval_function callbacks, reset batch
+        count."""
+
+        # check if histogram summary should be run for this epoch
+        if self.histogram_freq and epoch % self.histogram_freq == 0:
+
+            # add the histogram summary op if it should run this epoch
+            self.model._make_test_function()
+            if self.merged not in self.model.test_function.fetches:
+                self.model.test_function.fetches.append(self.merged)
+            self.model.test_function.fetch_callbacks[
+                self.merged
+            ] = self._fetch_callback
+
+    def on_epoch_end(self, epoch, logs=None):
+        """Checks if summary ops should run next epoch, logs scalar
+        summaries."""
+
+        # don't output batch_size and
+        # batch number as TensorBoard summaries
+        logs = {
+            ("epoch_" + k): v
+            for k, v in logs.items()
+            if k not in ["batch", "size", "num_steps"]
+        }
+        if self.update_freq == "epoch":
+            step = epoch
+        else:
+            step = self._samples_seen
+        self._write_custom_summaries(step, logs)
+
+        # pop the histogram summary op after each epoch
+        if self.histogram_freq:
+
+            if self.merged in self.model.test_function.fetches:
+                self.model.test_function.fetches.remove(self.merged)
+            if self.merged in self.model.test_function.fetch_callbacks:
+                self.model.test_function.fetch_callbacks.pop(self.merged)
+
+        if self.embeddings_data is None and self.embeddings_freq:
+            raise ValueError(
+                "To visualize embeddings, embeddings_data must be provided."
+            )
+
+        if self.embeddings_freq and self.embeddings_data is not None:
+            if epoch % self.embeddings_freq == 0:
+                # We need a second forward-pass here because we're passing
+                # the `embeddings_data` explicitly. This design allows passing
+                # arbitrary data as `embeddings_data` and stems from the fact
+                # that we need to know the size of the `tf.Variable`s which
+                # hold the embeddings in `set_model`. At this point, however,
+                # the `validation_data` is not yet set.
+
+                embeddings_data = self.embeddings_data
+                n_samples = embeddings_data[0].shape[0]
+                i = 0
+                sess = backend.get_session()
+                while i < n_samples:
+                    step = min(self.batch_size, n_samples - i)
+                    batch = slice(i, i + step)
+
+                    if isinstance(self.model.input, list):
+                        feed_dict = {
+                            model_input: embeddings_data[idx][batch]
+                            for idx, model_input in enumerate(self.model.input)
+                        }
+                    else:
+                        feed_dict = {
+                            self.model.input: embeddings_data[0][batch]
+                        }
+
+                    feed_dict.update({self.batch_id: i, self.step: step})
+
+                    if not isinstance(backend.learning_phase(), int):
+                        feed_dict[backend.learning_phase()] = False
+
+                    sess.run(self.assign_embeddings, feed_dict=feed_dict)
+                    self.saver.save(
+                        sess,
+                        os.path.join(self.log_dir, "keras_embedding.ckpt"),
+                        epoch,
+                    )
+
+                    i += self.batch_size
+
+    def on_train_end(self, logs=None):
+        self._stop_profiler()
+        self.writer.close()
+
+    def _start_profiler(self):
+        """Starts the profiler if currently inactive."""
+        if self._profiler_started:
+            return
+        try:
+            tf.profiler.experimental.start(logdir=self.log_dir)
+            self._profiler_started = True
+        except tf.errors.AlreadyExistsError as e:
+            # Profiler errors should not be fatal.
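One more aside (the `except` handler concludes just below): the profiler guard above boils down to "only stop what you started". A sketch of the same idea with the public TF profiler API, outside the callback; `logdir` is an illustrative path:

```python
# Sketch of the guarded profiler lifecycle used by _start_profiler /
# _stop_profiler above: start() raises AlreadyExistsError if another
# profiler is already active, stop() raises UnavailableError if none is.
import tensorflow.compat.v2 as tf

logdir = "/tmp/tb_profile"  # illustrative
started = False
try:
    tf.profiler.experimental.start(logdir=logdir)
    started = True
except tf.errors.AlreadyExistsError:
    pass  # someone else owns the profiler; leave it alone

# ... run a few training steps ...

if started:  # only stop a profiler this code started
    try:
        tf.profiler.experimental.stop()
    finally:
        started = False
```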
+ logging.error("Failed to start profiler: %s", e.message) + + def _stop_profiler(self): + """Stops the profiler if currently active.""" + if not self._profiler_started: + return + try: + tf.profiler.experimental.stop() + except tf.errors.UnavailableError as e: + # Profiler errors should not be fatal. + logging.error("Failed to stop profiler: %s", e.message) + finally: + self._profiler_started = False diff --git a/keras/callbacks_v1_test.py b/keras/callbacks_v1_test.py index da0202e35881..b46c6e9f185e 100644 --- a/keras/callbacks_v1_test.py +++ b/keras/callbacks_v1_test.py @@ -14,25 +14,24 @@ # ============================================================================== """Tests for Keras callbacks.""" -import tensorflow.compat.v2 as tf - import os import shutil import tempfile -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras import callbacks from keras import callbacks_v1 -from keras.testing_infra import test_combinations from keras import layers -from keras.testing_infra import test_utils from keras.engine import input_layer from keras.engine import sequential from keras.engine import training +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils from keras.utils import np_utils - TRAIN_SAMPLES = 10 TEST_SAMPLES = 10 NUM_CLASSES = 2 @@ -42,523 +41,581 @@ class TestTensorBoardV1(tf.test.TestCase, parameterized.TestCase): - - def test_TensorBoard(self): - np.random.seed(1337) - - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - def data_generator(train): - if train: - max_batch_index = len(x_train) // BATCH_SIZE - else: - max_batch_index = len(x_test) // BATCH_SIZE - i = 0 - while 1: - if train: - yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE], - y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]) - else: - yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE], - y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]) - i += 1 - i %= max_batch_index - - # case: Sequential - with tf.Graph().as_default(), self.cached_session(): - model = sequential.Sequential() - model.add( - layers.Dense( - NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) - # non_trainable_weights: moving_variance, moving_mean - model.add(layers.BatchNormalization()) - model.add(layers.Dense(NUM_CLASSES, activation='softmax')) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - tsb = callbacks_v1.TensorBoard( - log_dir=temp_dir, - histogram_freq=1, - write_images=True, - write_grads=True, - batch_size=5) - cbks = [tsb] - - # fit with validation data - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=3, - verbose=0) - - # fit with validation data and accuracy - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=0) - - # fit generator with validation data - model.fit_generator( - data_generator(True), - len(x_train), - epochs=2, - validation_data=(x_test, y_test), - callbacks=cbks, - verbose=0) - - # fit generator without validation data - # histogram_freq must be zero - 
tsb.histogram_freq = 0 - model.fit_generator( - data_generator(True), - len(x_train), - epochs=2, - callbacks=cbks, - verbose=0) - - # fit generator with validation data and accuracy - tsb.histogram_freq = 1 - model.fit_generator( - data_generator(True), - len(x_train), - epochs=2, - validation_data=(x_test, y_test), - callbacks=cbks, - verbose=0) - - # fit generator without validation data and accuracy - tsb.histogram_freq = 0 - model.fit_generator( - data_generator(True), len(x_train), epochs=2, callbacks=cbks) - assert os.path.exists(temp_dir) - - def test_TensorBoard_multi_input_output(self): - np.random.seed(1337) - tmpdir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) - - with tf.Graph().as_default(), self.cached_session(): - filepath = os.path.join(tmpdir, 'logs') - - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - def data_generator(train): - if train: - max_batch_index = len(x_train) // BATCH_SIZE - else: - max_batch_index = len(x_test) // BATCH_SIZE - i = 0 - while 1: - if train: - # simulate multi-input/output models - yield ([x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2, - [y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2) - else: - yield ([x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2, - [y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2) - i += 1 - i %= max_batch_index - - inp1 = input_layer.Input((INPUT_DIM,)) - inp2 = input_layer.Input((INPUT_DIM,)) - inp = layers.add([inp1, inp2]) - hidden = layers.Dense(2, activation='relu')(inp) - hidden = layers.Dropout(0.1)(hidden) - output1 = layers.Dense(NUM_CLASSES, activation='softmax')(hidden) - output2 = layers.Dense(NUM_CLASSES, activation='softmax')(hidden) - model = training.Model([inp1, inp2], [output1, output2]) - model.compile(loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - - # we must generate new callbacks for each test, as they aren't stateless - def callbacks_factory(histogram_freq): - return [ - callbacks_v1.TensorBoard( - log_dir=filepath, - histogram_freq=histogram_freq, + def test_TensorBoard(self): + np.random.seed(1337) + + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + def data_generator(train): + if train: + max_batch_index = len(x_train) // BATCH_SIZE + else: + max_batch_index = len(x_test) // BATCH_SIZE + i = 0 + while 1: + if train: + yield ( + x_train[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], + y_train[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], + ) + else: + yield ( + x_test[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], + y_test[i * BATCH_SIZE : (i + 1) * BATCH_SIZE], + ) + i += 1 + i %= max_batch_index + + # case: Sequential + with tf.Graph().as_default(), self.cached_session(): + model = sequential.Sequential() + model.add( + layers.Dense(NUM_HIDDEN, input_dim=INPUT_DIM, activation="relu") + ) + # non_trainable_weights: moving_variance, moving_mean + model.add(layers.BatchNormalization()) + model.add(layers.Dense(NUM_CLASSES, activation="softmax")) + model.compile( + loss="categorical_crossentropy", 
+ optimizer="sgd", + metrics=["accuracy"], + ) + tsb = callbacks_v1.TensorBoard( + log_dir=temp_dir, + histogram_freq=1, write_images=True, write_grads=True, - batch_size=5) - ] - - # fit without validation data - model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE, - callbacks=callbacks_factory(histogram_freq=0), epochs=3) - - # fit with validation data and accuracy - model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE, + batch_size=5, + ) + cbks = [tsb] + + # fit with validation data + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=3, + verbose=0, + ) + + # fit with validation data and accuracy + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=2, + verbose=0, + ) + + # fit generator with validation data + model.fit_generator( + data_generator(True), + len(x_train), + epochs=2, + validation_data=(x_test, y_test), + callbacks=cbks, + verbose=0, + ) + + # fit generator without validation data + # histogram_freq must be zero + tsb.histogram_freq = 0 + model.fit_generator( + data_generator(True), + len(x_train), + epochs=2, + callbacks=cbks, + verbose=0, + ) + + # fit generator with validation data and accuracy + tsb.histogram_freq = 1 + model.fit_generator( + data_generator(True), + len(x_train), + epochs=2, + validation_data=(x_test, y_test), + callbacks=cbks, + verbose=0, + ) + + # fit generator without validation data and accuracy + tsb.histogram_freq = 0 + model.fit_generator( + data_generator(True), len(x_train), epochs=2, callbacks=cbks + ) + assert os.path.exists(temp_dir) + + def test_TensorBoard_multi_input_output(self): + np.random.seed(1337) + tmpdir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) + + with tf.Graph().as_default(), self.cached_session(): + filepath = os.path.join(tmpdir, "logs") + + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + def data_generator(train): + if train: + max_batch_index = len(x_train) // BATCH_SIZE + else: + max_batch_index = len(x_test) // BATCH_SIZE + i = 0 + while 1: + if train: + # simulate multi-input/output models + yield ( + [x_train[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]] + * 2, + [y_train[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]] + * 2, + ) + else: + yield ( + [x_test[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]] * 2, + [y_test[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]] * 2, + ) + i += 1 + i %= max_batch_index + + inp1 = input_layer.Input((INPUT_DIM,)) + inp2 = input_layer.Input((INPUT_DIM,)) + inp = layers.add([inp1, inp2]) + hidden = layers.Dense(2, activation="relu")(inp) + hidden = layers.Dropout(0.1)(hidden) + output1 = layers.Dense(NUM_CLASSES, activation="softmax")(hidden) + output2 = layers.Dense(NUM_CLASSES, activation="softmax")(hidden) + model = training.Model([inp1, inp2], [output1, output2]) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + metrics=["accuracy"], + ) + + # we must generate new callbacks for each test, as they aren't + # stateless + def callbacks_factory(histogram_freq): + return [ + callbacks_v1.TensorBoard( + log_dir=filepath, + histogram_freq=histogram_freq, + write_images=True, + write_grads=True, + batch_size=5, + ) + ] + + # fit without validation data + model.fit( + 
[x_train] * 2, + [y_train] * 2, + batch_size=BATCH_SIZE, + callbacks=callbacks_factory(histogram_freq=0), + epochs=3, + ) + + # fit with validation data and accuracy + model.fit( + [x_train] * 2, + [y_train] * 2, + batch_size=BATCH_SIZE, validation_data=([x_test] * 2, [y_test] * 2), - callbacks=callbacks_factory(histogram_freq=1), epochs=2) - - # fit generator without validation data - model.fit_generator(data_generator(True), len(x_train), epochs=2, - callbacks=callbacks_factory(histogram_freq=0)) - - # fit generator with validation data and accuracy - model.fit_generator(data_generator(True), len(x_train), epochs=2, - validation_data=([x_test] * 2, [y_test] * 2), - callbacks=callbacks_factory(histogram_freq=1)) - assert os.path.isdir(filepath) - - def test_Tensorboard_histogram_summaries_in_test_function(self): - - class FileWriterStub: - - def __init__(self, logdir, graph=None): - self.logdir = logdir - self.graph = graph - self.steps_seen = [] - - def add_summary(self, summary, global_step): - summary_obj = tf.compat.v1.Summary() - - # ensure a valid Summary proto is being sent - if isinstance(summary, bytes): - summary_obj.ParseFromString(summary) - else: - assert isinstance(summary, tf.compat.v1.Summary) - summary_obj = summary - - # keep track of steps seen for the merged_summary op, - # which contains the histogram summaries - if len(summary_obj.value) > 1: - self.steps_seen.append(global_step) - - def flush(self): - pass - - def close(self): - pass - - def _init_writer(obj, _): - obj.writer = FileWriterStub(obj.log_dir) - - np.random.seed(1337) - tmpdir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - with tf.Graph().as_default(), self.cached_session(): - model = sequential.Sequential() - model.add( - layers.Dense( - NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu')) - # non_trainable_weights: moving_variance, moving_mean - model.add(layers.BatchNormalization()) - model.add(layers.Dense(NUM_CLASSES, activation='softmax')) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - callbacks_v1.TensorBoard._init_writer = _init_writer - tsb = callbacks_v1.TensorBoard( - log_dir=tmpdir, - histogram_freq=1, - write_images=True, - write_grads=True, - batch_size=5) - cbks = [tsb] - - # fit with validation data - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=3, - verbose=0) - - self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5]) - - def test_Tensorboard_histogram_summaries_with_generator(self): - np.random.seed(1337) - tmpdir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) - - def generator(): - x = np.random.randn(10, 100).astype(np.float32) - y = np.random.randn(10, 10).astype(np.float32) - while True: - yield x, y - - with tf.Graph().as_default(), self.cached_session(): - model = test_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=10, input_dim=100) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - metrics=['accuracy']) - tsb = callbacks_v1.TensorBoard( - log_dir=tmpdir, - histogram_freq=1, - write_images=True, - write_grads=True, - batch_size=5) - cbks = [tsb] - - # fit with validation 
generator - model.fit_generator( - generator(), - steps_per_epoch=2, - epochs=2, - validation_data=generator(), - validation_steps=2, - callbacks=cbks, - verbose=0) - - with self.assertRaises(ValueError): - # fit with validation generator but no - # validation_steps - model.fit_generator( - generator(), - steps_per_epoch=2, - epochs=2, - validation_data=generator(), + callbacks=callbacks_factory(histogram_freq=1), + epochs=2, + ) + + # fit generator without validation data + model.fit_generator( + data_generator(True), + len(x_train), + epochs=2, + callbacks=callbacks_factory(histogram_freq=0), + ) + + # fit generator with validation data and accuracy + model.fit_generator( + data_generator(True), + len(x_train), + epochs=2, + validation_data=([x_test] * 2, [y_test] * 2), + callbacks=callbacks_factory(histogram_freq=1), + ) + assert os.path.isdir(filepath) + + def test_Tensorboard_histogram_summaries_in_test_function(self): + class FileWriterStub: + def __init__(self, logdir, graph=None): + self.logdir = logdir + self.graph = graph + self.steps_seen = [] + + def add_summary(self, summary, global_step): + summary_obj = tf.compat.v1.Summary() + + # ensure a valid Summary proto is being sent + if isinstance(summary, bytes): + summary_obj.ParseFromString(summary) + else: + assert isinstance(summary, tf.compat.v1.Summary) + summary_obj = summary + + # keep track of steps seen for the merged_summary op, + # which contains the histogram summaries + if len(summary_obj.value) > 1: + self.steps_seen.append(global_step) + + def flush(self): + pass + + def close(self): + pass + + def _init_writer(obj, _): + obj.writer = FileWriterStub(obj.log_dir) + + np.random.seed(1337) + tmpdir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + with tf.Graph().as_default(), self.cached_session(): + model = sequential.Sequential() + model.add( + layers.Dense(NUM_HIDDEN, input_dim=INPUT_DIM, activation="relu") + ) + # non_trainable_weights: moving_variance, moving_mean + model.add(layers.BatchNormalization()) + model.add(layers.Dense(NUM_CLASSES, activation="softmax")) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + metrics=["accuracy"], + ) + callbacks_v1.TensorBoard._init_writer = _init_writer + tsb = callbacks_v1.TensorBoard( + log_dir=tmpdir, + histogram_freq=1, + write_images=True, + write_grads=True, + batch_size=5, + ) + cbks = [tsb] + + # fit with validation data + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=3, + verbose=0, + ) + + self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5]) + + def test_Tensorboard_histogram_summaries_with_generator(self): + np.random.seed(1337) + tmpdir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True) + + def generator(): + x = np.random.randn(10, 100).astype(np.float32) + y = np.random.randn(10, 10).astype(np.float32) + while True: + yield x, y + + with tf.Graph().as_default(), self.cached_session(): + model = test_utils.get_small_sequential_mlp( + num_hidden=10, num_classes=10, input_dim=100 + ) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + metrics=["accuracy"], + ) + tsb = callbacks_v1.TensorBoard( + 
log_dir=tmpdir, + histogram_freq=1, + write_images=True, + write_grads=True, + batch_size=5, + ) + cbks = [tsb] + + # fit with validation generator + model.fit_generator( + generator(), + steps_per_epoch=2, + epochs=2, + validation_data=generator(), + validation_steps=2, + callbacks=cbks, + verbose=0, + ) + + with self.assertRaises(ValueError): + # fit with validation generator but no + # validation_steps + model.fit_generator( + generator(), + steps_per_epoch=2, + epochs=2, + validation_data=generator(), + callbacks=cbks, + verbose=0, + ) + + self.assertTrue(os.path.exists(tmpdir)) + + def test_TensorBoard_with_ReduceLROnPlateau(self): + with self.cached_session(): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + model = test_utils.get_small_sequential_mlp( + num_hidden=NUM_HIDDEN, + num_classes=NUM_CLASSES, + input_dim=INPUT_DIM, + ) + model.compile( + loss="binary_crossentropy", + optimizer="sgd", + metrics=["accuracy"], + ) + + cbks = [ + callbacks.ReduceLROnPlateau( + monitor="val_loss", factor=0.5, patience=4, verbose=1 + ), + callbacks_v1.TensorBoard(log_dir=temp_dir), + ] + + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=2, + verbose=0, + ) + + assert os.path.exists(temp_dir) + + def test_Tensorboard_batch_logging(self): + class FileWriterStub: + def __init__(self, logdir, graph=None): + self.logdir = logdir + self.graph = graph + self.batches_logged = [] + self.summary_values = [] + self.summary_tags = [] + + def add_summary(self, summary, step): + self.summary_values.append(summary.value[0].simple_value) + self.summary_tags.append(summary.value[0].tag) + self.batches_logged.append(step) + + def flush(self): + pass + + def close(self): + pass + + with tf.Graph().as_default(): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + + tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq="batch") + tb_cbk.writer = FileWriterStub(temp_dir) + + for batch in range(5): + tb_cbk.on_batch_end(batch, {"acc": batch}) + self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4]) + self.assertEqual( + tb_cbk.writer.summary_values, [0.0, 1.0, 2.0, 3.0, 4.0] + ) + self.assertEqual(tb_cbk.writer.summary_tags, ["batch_acc"] * 5) + + def test_Tensorboard_epoch_and_batch_logging(self): + class FileWriterStub: + def __init__(self, logdir, graph=None): + self.logdir = logdir + self.graph = graph + + def add_summary(self, summary, step): + if "batch_" in summary.value[0].tag: + self.batch_summary = (step, summary) + elif "epoch_" in summary.value[0].tag: + self.epoch_summary = (step, summary) + + def flush(self): + pass + + def close(self): + pass + + with tf.Graph().as_default(): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + + tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq="batch") + tb_cbk.writer = FileWriterStub(temp_dir) + + tb_cbk.on_batch_end(0, {"acc": 5.0}) + tb_cbk.on_train_end() + batch_step, batch_summary = tb_cbk.writer.batch_summary + self.assertEqual(batch_step, 0) + self.assertEqual(batch_summary.value[0].simple_value, 5.0) + + tb_cbk = callbacks_v1.TensorBoard(temp_dir, 
update_freq="epoch") + tb_cbk.writer = FileWriterStub(temp_dir) + tb_cbk.on_epoch_end(0, {"acc": 10.0}) + tb_cbk.on_train_end() + epoch_step, epoch_summary = tb_cbk.writer.epoch_summary + self.assertEqual(epoch_step, 0) + self.assertEqual(epoch_summary.value[0].simple_value, 10.0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_Tensorboard_eager(self): + temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES, + ) + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + + model = test_utils.get_small_sequential_mlp( + num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM + ) + model.compile( + loss="binary_crossentropy", + optimizer=tf.compat.v1.train.AdamOptimizer(0.01), + metrics=["accuracy"], + ) + + cbks = [callbacks_v1.TensorBoard(log_dir=temp_dir)] + + model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), callbacks=cbks, - verbose=0) - - self.assertTrue(os.path.exists(tmpdir)) - - def test_TensorBoard_with_ReduceLROnPlateau(self): - with self.cached_session(): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - model = test_utils.get_small_sequential_mlp( - num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) - model.compile( - loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy']) - - cbks = [ - callbacks.ReduceLROnPlateau( - monitor='val_loss', factor=0.5, patience=4, verbose=1), - callbacks_v1.TensorBoard(log_dir=temp_dir) - ] - - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=0) - - assert os.path.exists(temp_dir) - - def test_Tensorboard_batch_logging(self): - - class FileWriterStub: - - def __init__(self, logdir, graph=None): - self.logdir = logdir - self.graph = graph - self.batches_logged = [] - self.summary_values = [] - self.summary_tags = [] - - def add_summary(self, summary, step): - self.summary_values.append(summary.value[0].simple_value) - self.summary_tags.append(summary.value[0].tag) - self.batches_logged.append(step) - - def flush(self): - pass - - def close(self): - pass - - with tf.Graph().as_default(): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - - tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch') - tb_cbk.writer = FileWriterStub(temp_dir) - - for batch in range(5): - tb_cbk.on_batch_end(batch, {'acc': batch}) - self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4]) - self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.]) - self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5) - - def test_Tensorboard_epoch_and_batch_logging(self): - - class FileWriterStub: - - def __init__(self, logdir, graph=None): - self.logdir = logdir - self.graph = graph - - def add_summary(self, summary, step): - if 'batch_' in summary.value[0].tag: - self.batch_summary = (step, summary) 
- elif 'epoch_' in summary.value[0].tag: - self.epoch_summary = (step, summary) - - def flush(self): - pass - - def close(self): - pass - - with tf.Graph().as_default(): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - - tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch') - tb_cbk.writer = FileWriterStub(temp_dir) - - tb_cbk.on_batch_end(0, {'acc': 5.0}) - tb_cbk.on_train_end() - batch_step, batch_summary = tb_cbk.writer.batch_summary - self.assertEqual(batch_step, 0) - self.assertEqual(batch_summary.value[0].simple_value, 5.0) - - tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch') - tb_cbk.writer = FileWriterStub(temp_dir) - tb_cbk.on_epoch_end(0, {'acc': 10.0}) - tb_cbk.on_train_end() - epoch_step, epoch_summary = tb_cbk.writer.epoch_summary - self.assertEqual(epoch_step, 0) - self.assertEqual(epoch_summary.value[0].simple_value, 10.0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_Tensorboard_eager(self): - temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - - model = test_utils.get_small_sequential_mlp( - num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) - model.compile( - loss='binary_crossentropy', - optimizer=tf.compat.v1.train.AdamOptimizer(0.01), - metrics=['accuracy']) - - cbks = [callbacks_v1.TensorBoard(log_dir=temp_dir)] - - model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=2, - verbose=0) - - self.assertTrue(os.path.exists(temp_dir)) - - def test_TensorBoard_update_freq(self): - - class FileWriterStub: - - def __init__(self, logdir, graph=None): - self.logdir = logdir - self.graph = graph - self.batch_summaries = [] - self.epoch_summaries = [] - - def add_summary(self, summary, step): - if 'batch_' in summary.value[0].tag: - self.batch_summaries.append((step, summary)) - elif 'epoch_' in summary.value[0].tag: - self.epoch_summaries.append((step, summary)) - - def flush(self): - pass - - def close(self): - pass - - with tf.Graph().as_default(): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - - # Epoch mode - tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch') - tb_cbk.writer = FileWriterStub(temp_dir) - - tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1}) - self.assertEqual(tb_cbk.writer.batch_summaries, []) - tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1}) - self.assertLen(tb_cbk.writer.epoch_summaries, 1) - tb_cbk.on_train_end() - - # Batch mode - tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch') - tb_cbk.writer = FileWriterStub(temp_dir) - - tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1}) - self.assertLen(tb_cbk.writer.batch_summaries, 1) - tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1}) - self.assertLen(tb_cbk.writer.batch_summaries, 2) - self.assertFalse(tb_cbk.writer.epoch_summaries) - tb_cbk.on_train_end() - - # Integer mode - tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq=20) - tb_cbk.writer = FileWriterStub(temp_dir) - - tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10}) - self.assertFalse(tb_cbk.writer.batch_summaries) - 
tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10}) - self.assertLen(tb_cbk.writer.batch_summaries, 1) - tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10}) - self.assertLen(tb_cbk.writer.batch_summaries, 1) - tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10}) - self.assertLen(tb_cbk.writer.batch_summaries, 2) - tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10}) - self.assertLen(tb_cbk.writer.batch_summaries, 2) - self.assertFalse(tb_cbk.writer.epoch_summaries) - tb_cbk.on_train_end() - - -if __name__ == '__main__': - tf.test.main() + epochs=2, + verbose=0, + ) + + self.assertTrue(os.path.exists(temp_dir)) + + def test_TensorBoard_update_freq(self): + class FileWriterStub: + def __init__(self, logdir, graph=None): + self.logdir = logdir + self.graph = graph + self.batch_summaries = [] + self.epoch_summaries = [] + + def add_summary(self, summary, step): + if "batch_" in summary.value[0].tag: + self.batch_summaries.append((step, summary)) + elif "epoch_" in summary.value[0].tag: + self.epoch_summaries.append((step, summary)) + + def flush(self): + pass + + def close(self): + pass + + with tf.Graph().as_default(): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + + # Epoch mode + tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq="epoch") + tb_cbk.writer = FileWriterStub(temp_dir) + + tb_cbk.on_batch_end(0, {"acc": 5.0, "size": 1}) + self.assertEqual(tb_cbk.writer.batch_summaries, []) + tb_cbk.on_epoch_end(0, {"acc": 10.0, "size": 1}) + self.assertLen(tb_cbk.writer.epoch_summaries, 1) + tb_cbk.on_train_end() + + # Batch mode + tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq="batch") + tb_cbk.writer = FileWriterStub(temp_dir) + + tb_cbk.on_batch_end(0, {"acc": 5.0, "size": 1}) + self.assertLen(tb_cbk.writer.batch_summaries, 1) + tb_cbk.on_batch_end(0, {"acc": 5.0, "size": 1}) + self.assertLen(tb_cbk.writer.batch_summaries, 2) + self.assertFalse(tb_cbk.writer.epoch_summaries) + tb_cbk.on_train_end() + + # Integer mode + tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq=20) + tb_cbk.writer = FileWriterStub(temp_dir) + + tb_cbk.on_batch_end(0, {"acc": 5.0, "size": 10}) + self.assertFalse(tb_cbk.writer.batch_summaries) + tb_cbk.on_batch_end(0, {"acc": 5.0, "size": 10}) + self.assertLen(tb_cbk.writer.batch_summaries, 1) + tb_cbk.on_batch_end(0, {"acc": 5.0, "size": 10}) + self.assertLen(tb_cbk.writer.batch_summaries, 1) + tb_cbk.on_batch_end(0, {"acc": 5.0, "size": 10}) + self.assertLen(tb_cbk.writer.batch_summaries, 2) + tb_cbk.on_batch_end(0, {"acc": 10.0, "size": 10}) + self.assertLen(tb_cbk.writer.batch_summaries, 2) + self.assertFalse(tb_cbk.writer.epoch_summaries) + tb_cbk.on_train_end() + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/constraints.py b/keras/constraints.py index c3302ab195c5..4a25f5a3dbf2 100644 --- a/keras/constraints.py +++ b/keras/constraints.py @@ -12,295 +12,335 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 # ==============================================================================
-# pylint: disable=invalid-name
-# pylint: disable=g-classes-have-attributes
+
+
 """Constraints: functions that impose constraints on weight values."""
+import warnings
+
 import tensorflow.compat.v2 as tf
+
 from keras import backend
-from keras.utils.generic_utils import deserialize_keras_object
-from keras.utils.generic_utils import serialize_keras_object
+from keras.saving.legacy import serialization as legacy_serialization
+from keras.saving.serialization_lib import deserialize_keras_object
+from keras.saving.serialization_lib import serialize_keras_object
+
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
 from tensorflow.tools.docs import doc_controls
 
 
-@keras_export('keras.constraints.Constraint')
+@keras_export("keras.constraints.Constraint")
 class Constraint:
-  """Base class for weight constraints.
+    """Base class for weight constraints.
+
+    A `Constraint` instance works like a stateless function.
+    Users who subclass this
+    class should override the `__call__` method, which takes a single
+    weight parameter and returns a projected version of that parameter
+    (e.g. normalized or clipped). Constraints can be used with various Keras
+    layers via the `kernel_constraint` or `bias_constraint` arguments.
+
+    Here's a simple example of a non-negative weight constraint:
+
+    >>> class NonNegative(tf.keras.constraints.Constraint):
+    ...
+    ...  def __call__(self, w):
+    ...    return w * tf.cast(tf.math.greater_equal(w, 0.), w.dtype)
+
+    >>> weight = tf.constant((-1.0, 1.0))
+    >>> NonNegative()(weight)
+    <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.,  1.], dtype=float32)>
+
+    >>> tf.keras.layers.Dense(4, kernel_constraint=NonNegative())
+    """
+
+    def __call__(self, w):
+        """Applies the constraint to the input weight variable.
+
+        By default, the input weight variable is not modified.
+        Users should override this method to implement their own projection
+        function.
 
-  A `Constraint` instance works like a stateless function.
-  Users who subclass this
-  class should override the `__call__` method, which takes a single
-  weight parameter and return a projected version of that parameter
-  (e.g. normalized or clipped). Constraints can be used with various Keras
-  layers via the `kernel_constraint` or `bias_constraint` arguments.
+        Args:
+            w: Input weight variable.
 
-  Here's a simple example of a non-negative weight constraint:
+        Returns:
+            Projected variable (by default, returns unmodified inputs).
+        """
+        return w
 
-  >>> class NonNegative(tf.keras.constraints.Constraint):
-  ...
-  ...  def __call__(self, w):
-  ...    return w * tf.cast(tf.math.greater_equal(w, 0.), w.dtype)
+    def get_config(self):
+        """Returns a Python dict of the object config.
 
-  >>> weight = tf.constant((-1.0, 1.0))
-  >>> NonNegative()(weight)
-  <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.,  1.], dtype=float32)>
+        A constraint config is a Python dictionary (JSON-serializable) that can
+        be used to reinstantiate the same object.
 
-  >>> tf.keras.layers.Dense(4, kernel_constraint=NonNegative())
-  """
+        Returns:
+            Python dict containing the configuration of the constraint object.
+        """
+        return {}
 
-  def __call__(self, w):
-    """Applies the constraint to the input weight variable.
+    @classmethod
+    def from_config(cls, config):
+        """Instantiates a weight constraint from a configuration dictionary.
 
-    By default, the inputs weight variable is not modified.
-    Users should override this method to implement their own projection
-    function.
+ Example: + + ```python + constraint = UnitNorm() + config = constraint.get_config() + constraint = UnitNorm.from_config(config) + ``` + + Args: + config: A Python dictionary, the output of `get_config`. + + Returns: + A `tf.keras.constraints.Constraint` instance. + """ + return cls(**config) + + +@keras_export("keras.constraints.MaxNorm", "keras.constraints.max_norm") +class MaxNorm(Constraint): + """MaxNorm weight constraint. + + Constrains the weights incident to each hidden unit + to have a norm less than or equal to a desired value. + + Also available via the shortcut function `tf.keras.constraints.max_norm`. Args: - w: Input weight variable. + max_value: the maximum norm value for the incoming weights. + axis: integer, axis along which to calculate weight norms. + For instance, in a `Dense` layer the weight matrix + has shape `(input_dim, output_dim)`, + set `axis` to `0` to constrain each weight vector + of length `(input_dim,)`. + In a `Conv2D` layer with `data_format="channels_last"`, + the weight tensor has shape + `(rows, cols, input_depth, output_depth)`, + set `axis` to `[0, 1, 2]` + to constrain the weights of each filter tensor of size + `(rows, cols, input_depth)`. - Returns: - Projected variable (by default, returns unmodified inputs). """ - return w - def get_config(self): - """Returns a Python dict of the object config. + def __init__(self, max_value=2, axis=0): + self.max_value = max_value + self.axis = axis - A constraint config is a Python dictionary (JSON-serializable) that can - be used to reinstantiate the same object. + @doc_controls.do_not_generate_docs + def __call__(self, w): + norms = backend.sqrt( + tf.reduce_sum(tf.square(w), axis=self.axis, keepdims=True) + ) + desired = backend.clip(norms, 0, self.max_value) + return w * (desired / (backend.epsilon() + norms)) - Returns: - Python dict containing the configuration of the constraint object. - """ - return {} + @doc_controls.do_not_generate_docs + def get_config(self): + return {"max_value": self.max_value, "axis": self.axis} -@keras_export('keras.constraints.MaxNorm', 'keras.constraints.max_norm') -class MaxNorm(Constraint): - """MaxNorm weight constraint. - - Constrains the weights incident to each hidden unit - to have a norm less than or equal to a desired value. - - Also available via the shortcut function `tf.keras.constraints.max_norm`. - - Args: - max_value: the maximum norm value for the incoming weights. - axis: integer, axis along which to calculate weight norms. - For instance, in a `Dense` layer the weight matrix - has shape `(input_dim, output_dim)`, - set `axis` to `0` to constrain each weight vector - of length `(input_dim,)`. - In a `Conv2D` layer with `data_format="channels_last"`, - the weight tensor has shape - `(rows, cols, input_depth, output_depth)`, - set `axis` to `[0, 1, 2]` - to constrain the weights of each filter tensor of size - `(rows, cols, input_depth)`. 
- - """ - - def __init__(self, max_value=2, axis=0): - self.max_value = max_value - self.axis = axis - - @doc_controls.do_not_generate_docs - def __call__(self, w): - norms = backend.sqrt( - tf.reduce_sum(tf.square(w), axis=self.axis, keepdims=True)) - desired = backend.clip(norms, 0, self.max_value) - return w * (desired / (backend.epsilon() + norms)) - - @doc_controls.do_not_generate_docs - def get_config(self): - return {'max_value': self.max_value, 'axis': self.axis} - - -@keras_export('keras.constraints.NonNeg', 'keras.constraints.non_neg') +@keras_export("keras.constraints.NonNeg", "keras.constraints.non_neg") class NonNeg(Constraint): - """Constrains the weights to be non-negative. + """Constrains the weights to be non-negative. - Also available via the shortcut function `tf.keras.constraints.non_neg`. - """ + Also available via the shortcut function `tf.keras.constraints.non_neg`. + """ - def __call__(self, w): - return w * tf.cast(tf.greater_equal(w, 0.), backend.floatx()) + def __call__(self, w): + return w * tf.cast(tf.greater_equal(w, 0.0), backend.floatx()) -@keras_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm') +@keras_export("keras.constraints.UnitNorm", "keras.constraints.unit_norm") class UnitNorm(Constraint): - """Constrains the weights incident to each hidden unit to have unit norm. - - Also available via the shortcut function `tf.keras.constraints.unit_norm`. - - Args: - axis: integer, axis along which to calculate weight norms. - For instance, in a `Dense` layer the weight matrix - has shape `(input_dim, output_dim)`, - set `axis` to `0` to constrain each weight vector - of length `(input_dim,)`. - In a `Conv2D` layer with `data_format="channels_last"`, - the weight tensor has shape - `(rows, cols, input_depth, output_depth)`, - set `axis` to `[0, 1, 2]` - to constrain the weights of each filter tensor of size - `(rows, cols, input_depth)`. - """ - - def __init__(self, axis=0): - self.axis = axis - - @doc_controls.do_not_generate_docs - def __call__(self, w): - return w / ( - backend.epsilon() + backend.sqrt( - tf.reduce_sum( - tf.square(w), axis=self.axis, keepdims=True))) - - @doc_controls.do_not_generate_docs - def get_config(self): - return {'axis': self.axis} - - -@keras_export('keras.constraints.MinMaxNorm', 'keras.constraints.min_max_norm') + """Constrains the weights incident to each hidden unit to have unit norm. + + Also available via the shortcut function `tf.keras.constraints.unit_norm`. + + Args: + axis: integer, axis along which to calculate weight norms. + For instance, in a `Dense` layer the weight matrix + has shape `(input_dim, output_dim)`, + set `axis` to `0` to constrain each weight vector + of length `(input_dim,)`. + In a `Conv2D` layer with `data_format="channels_last"`, + the weight tensor has shape + `(rows, cols, input_depth, output_depth)`, + set `axis` to `[0, 1, 2]` + to constrain the weights of each filter tensor of size + `(rows, cols, input_depth)`. + """ + + def __init__(self, axis=0): + self.axis = axis + + @doc_controls.do_not_generate_docs + def __call__(self, w): + return w / ( + backend.epsilon() + + backend.sqrt( + tf.reduce_sum(tf.square(w), axis=self.axis, keepdims=True) + ) + ) + + @doc_controls.do_not_generate_docs + def get_config(self): + return {"axis": self.axis} + + +@keras_export("keras.constraints.MinMaxNorm", "keras.constraints.min_max_norm") class MinMaxNorm(Constraint): - """MinMaxNorm weight constraint. 
- - Constrains the weights incident to each hidden unit - to have the norm between a lower bound and an upper bound. - - Also available via the shortcut function `tf.keras.constraints.min_max_norm`. - - Args: - min_value: the minimum norm for the incoming weights. - max_value: the maximum norm for the incoming weights. - rate: rate for enforcing the constraint: weights will be - rescaled to yield - `(1 - rate) * norm + rate * norm.clip(min_value, max_value)`. - Effectively, this means that rate=1.0 stands for strict - enforcement of the constraint, while rate<1.0 means that - weights will be rescaled at each step to slowly move - towards a value inside the desired interval. - axis: integer, axis along which to calculate weight norms. - For instance, in a `Dense` layer the weight matrix - has shape `(input_dim, output_dim)`, - set `axis` to `0` to constrain each weight vector - of length `(input_dim,)`. - In a `Conv2D` layer with `data_format="channels_last"`, - the weight tensor has shape - `(rows, cols, input_depth, output_depth)`, - set `axis` to `[0, 1, 2]` - to constrain the weights of each filter tensor of size - `(rows, cols, input_depth)`. - """ - - def __init__(self, min_value=0.0, max_value=1.0, rate=1.0, axis=0): - self.min_value = min_value - self.max_value = max_value - self.rate = rate - self.axis = axis - - @doc_controls.do_not_generate_docs - def __call__(self, w): - norms = backend.sqrt( - tf.reduce_sum(tf.square(w), axis=self.axis, keepdims=True)) - desired = ( - self.rate * backend.clip(norms, self.min_value, self.max_value) + - (1 - self.rate) * norms) - return w * (desired / (backend.epsilon() + norms)) - - @doc_controls.do_not_generate_docs - def get_config(self): - return { - 'min_value': self.min_value, - 'max_value': self.max_value, - 'rate': self.rate, - 'axis': self.axis - } - - -@keras_export('keras.constraints.RadialConstraint', - 'keras.constraints.radial_constraint') + """MinMaxNorm weight constraint. + + Constrains the weights incident to each hidden unit + to have the norm between a lower bound and an upper bound. + + Also available via the shortcut function + `tf.keras.constraints.min_max_norm`. + + Args: + min_value: the minimum norm for the incoming weights. + max_value: the maximum norm for the incoming weights. + rate: rate for enforcing the constraint: weights will be + rescaled to yield + `(1 - rate) * norm + rate * norm.clip(min_value, max_value)`. + Effectively, this means that rate=1.0 stands for strict + enforcement of the constraint, while rate<1.0 means that + weights will be rescaled at each step to slowly move + towards a value inside the desired interval. + axis: integer, axis along which to calculate weight norms. + For instance, in a `Dense` layer the weight matrix + has shape `(input_dim, output_dim)`, + set `axis` to `0` to constrain each weight vector + of length `(input_dim,)`. + In a `Conv2D` layer with `data_format="channels_last"`, + the weight tensor has shape + `(rows, cols, input_depth, output_depth)`, + set `axis` to `[0, 1, 2]` + to constrain the weights of each filter tensor of size + `(rows, cols, input_depth)`. 
+ """ + + def __init__(self, min_value=0.0, max_value=1.0, rate=1.0, axis=0): + self.min_value = min_value + self.max_value = max_value + self.rate = rate + self.axis = axis + + @doc_controls.do_not_generate_docs + def __call__(self, w): + norms = backend.sqrt( + tf.reduce_sum(tf.square(w), axis=self.axis, keepdims=True) + ) + desired = ( + self.rate * backend.clip(norms, self.min_value, self.max_value) + + (1 - self.rate) * norms + ) + return w * (desired / (backend.epsilon() + norms)) + + @doc_controls.do_not_generate_docs + def get_config(self): + return { + "min_value": self.min_value, + "max_value": self.max_value, + "rate": self.rate, + "axis": self.axis, + } + + +@keras_export( + "keras.constraints.RadialConstraint", "keras.constraints.radial_constraint" +) class RadialConstraint(Constraint): - """Constrains `Conv2D` kernel weights to be the same for each radius. - - Also available via the shortcut function - `tf.keras.constraints.radial_constraint`. - - For example, the desired output for the following 4-by-4 kernel: - - ``` - kernel = [[v_00, v_01, v_02, v_03], - [v_10, v_11, v_12, v_13], - [v_20, v_21, v_22, v_23], - [v_30, v_31, v_32, v_33]] - ``` - - is this:: - - ``` - kernel = [[v_11, v_11, v_11, v_11], - [v_11, v_33, v_33, v_11], - [v_11, v_33, v_33, v_11], - [v_11, v_11, v_11, v_11]] - ``` - - This constraint can be applied to any `Conv2D` layer version, including - `Conv2DTranspose` and `SeparableConv2D`, and with either `"channels_last"` or - `"channels_first"` data format. The method assumes the weight tensor is of - shape `(rows, cols, input_depth, output_depth)`. - """ - - @doc_controls.do_not_generate_docs - def __call__(self, w): - w_shape = w.shape - if w_shape.rank is None or w_shape.rank != 4: - raise ValueError( - 'The weight tensor must have rank 4. ' - f'Received weight tensor with shape: {w_shape}') - - height, width, channels, kernels = w_shape - w = backend.reshape(w, (height, width, channels * kernels)) - # TODO(cpeter): Switch map_fn for a faster tf.vectorized_map once - # backend.switch is supported. - w = backend.map_fn( - self._kernel_constraint, - backend.stack(tf.unstack(w, axis=-1), axis=0)) - return backend.reshape(backend.stack(tf.unstack(w, axis=0), axis=-1), - (height, width, channels, kernels)) - - def _kernel_constraint(self, kernel): - """Radially constraints a kernel with shape (height, width, channels).""" - padding = backend.constant([[1, 1], [1, 1]], dtype='int32') - - kernel_shape = backend.shape(kernel)[0] - start = backend.cast(kernel_shape / 2, 'int32') - - kernel_new = backend.switch( - backend.cast(tf.math.floormod(kernel_shape, 2), 'bool'), - lambda: kernel[start - 1:start, start - 1:start], - lambda: kernel[start - 1:start, start - 1:start] + backend.zeros( # pylint: disable=g-long-lambda - (2, 2), dtype=kernel.dtype)) - index = backend.switch( - backend.cast(tf.math.floormod(kernel_shape, 2), 'bool'), - lambda: backend.constant(0, dtype='int32'), - lambda: backend.constant(1, dtype='int32')) - while_condition = lambda index, *args: backend.less(index, start) - - def body_fn(i, array): - return i + 1, tf.pad( - array, - padding, - constant_values=kernel[start + i, start + i]) - - _, kernel_new = tf.compat.v1.while_loop( - while_condition, - body_fn, - [index, kernel_new], - shape_invariants=[index.get_shape(), - tf.TensorShape([None, None])]) - return kernel_new + """Constrains `Conv2D` kernel weights to be the same for each radius. + + Also available via the shortcut function + `tf.keras.constraints.radial_constraint`. 
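A brief numeric aside on the `MinMaxNorm` projection implemented just above (the `RadialConstraint` docstring picks up again below with its 4-by-4 kernel example). The target norm interpolates between the raw norm and its clipped value; the numbers here are illustrative only:

```python
# Worked example of MinMaxNorm's target-norm formula (illustrative numbers):
# desired = rate * clip(norm, min_value, max_value) + (1 - rate) * norm
import numpy as np

min_value, max_value, rate = 0.5, 1.0, 0.5
norm = 2.0  # a weight column's L2 norm before projection
desired = rate * np.clip(norm, min_value, max_value) + (1 - rate) * norm
print(desired)  # 0.5 * 1.0 + 0.5 * 2.0 = 1.5, i.e. halfway toward the cap
# With rate=1.0, the column would be rescaled straight to norm 1.0.
```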
+
+    For example, the desired output for the following 4-by-4 kernel:
+
+    ```
+    kernel = [[v_00, v_01, v_02, v_03],
+              [v_10, v_11, v_12, v_13],
+              [v_20, v_21, v_22, v_23],
+              [v_30, v_31, v_32, v_33]]
+    ```
+
+    is this::
+
+    ```
+    kernel = [[v_11, v_11, v_11, v_11],
+              [v_11, v_33, v_33, v_11],
+              [v_11, v_33, v_33, v_11],
+              [v_11, v_11, v_11, v_11]]
+    ```
+
+    This constraint can be applied to any `Conv2D` layer version, including
+    `Conv2DTranspose` and `SeparableConv2D`, and with either `"channels_last"`
+    or `"channels_first"` data format. The method assumes the weight tensor is
+    of shape `(rows, cols, input_depth, output_depth)`.
+    """
+
+    @doc_controls.do_not_generate_docs
+    def __call__(self, w):
+        w_shape = w.shape
+        if w_shape.rank is None or w_shape.rank != 4:
+            raise ValueError(
+                "The weight tensor must have rank 4. "
+                f"Received weight tensor with shape: {w_shape}"
+            )
+
+        height, width, channels, kernels = w_shape
+        w = backend.reshape(w, (height, width, channels * kernels))
+        # TODO(cpeter): Switch map_fn for a faster tf.vectorized_map once
+        # backend.switch is supported.
+        w = backend.map_fn(
+            self._kernel_constraint,
+            backend.stack(tf.unstack(w, axis=-1), axis=0),
+        )
+        return backend.reshape(
+            backend.stack(tf.unstack(w, axis=0), axis=-1),
+            (height, width, channels, kernels),
+        )
+
+    def _kernel_constraint(self, kernel):
+        """Radially constrains a kernel with shape (height, width,
+        channels)."""
+        padding = backend.constant([[1, 1], [1, 1]], dtype="int32")
+
+        kernel_shape = backend.shape(kernel)[0]
+        start = backend.cast(kernel_shape / 2, "int32")
+
+        kernel_new = backend.switch(
+            backend.cast(tf.math.floormod(kernel_shape, 2), "bool"),
+            lambda: kernel[start - 1 : start, start - 1 : start],
+            lambda: kernel[start - 1 : start, start - 1 : start]
+            + backend.zeros((2, 2), dtype=kernel.dtype),
+        )
+        index = backend.switch(
+            backend.cast(tf.math.floormod(kernel_shape, 2), "bool"),
+            lambda: backend.constant(0, dtype="int32"),
+            lambda: backend.constant(1, dtype="int32"),
+        )
+        while_condition = lambda index, *args: backend.less(index, start)
+
+        def body_fn(i, array):
+            return i + 1, tf.pad(
+                array, padding, constant_values=kernel[start + i, start + i]
+            )
+
+        _, kernel_new = tf.compat.v1.while_loop(
+            while_condition,
+            body_fn,
+            [index, kernel_new],
+            shape_invariants=[index.get_shape(), tf.TensorShape([None, None])],
+        )
+        return kernel_new
 
 
 # Aliases.
@@ -317,32 +357,53 @@ def body_fn(i, array):
 unitnorm = unit_norm
 
 
-@keras_export('keras.constraints.serialize')
-def serialize(constraint):
-  return serialize_keras_object(constraint)
-
-
-@keras_export('keras.constraints.deserialize')
-def deserialize(config, custom_objects=None):
-  return deserialize_keras_object(
-      config,
-      module_objects=globals(),
-      custom_objects=custom_objects,
-      printable_module_name='constraint')
-
-
-@keras_export('keras.constraints.get')
+@keras_export("keras.constraints.serialize")
+def serialize(constraint, use_legacy_format=False):
+    if constraint is None:
+        return None
+    if not isinstance(constraint, Constraint):
+        warnings.warn(
+            "The `keras.constraints.serialize()` API should only be used for "
+            "objects of type `keras.constraints.Constraint`. Found an instance "
+            f"of type {type(constraint)}, which may lead to improper "
+            "serialization."
+ ) + if use_legacy_format: + return legacy_serialization.serialize_keras_object(constraint) + return serialize_keras_object(constraint) + + +@keras_export("keras.constraints.deserialize") +def deserialize(config, custom_objects=None, use_legacy_format=False): + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="constraint", + ) + return deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="constraint", + ) + + +@keras_export("keras.constraints.get") def get(identifier): - """Retrieves a Keras constraint function.""" - if identifier is None: - return None - if isinstance(identifier, dict): - return deserialize(identifier) - elif isinstance(identifier, str): - config = {'class_name': str(identifier), 'config': {}} - return deserialize(config) - elif callable(identifier): - return identifier - else: - raise ValueError( - f'Could not interpret constraint function identifier: {identifier}') + """Retrieves a Keras constraint function.""" + if identifier is None: + return None + if isinstance(identifier, dict): + use_legacy_format = "module" not in identifier + return deserialize(identifier, use_legacy_format=use_legacy_format) + elif isinstance(identifier, str): + config = {"class_name": str(identifier), "config": {}} + return get(config) + elif callable(identifier): + return identifier + else: + raise ValueError( + f"Could not interpret constraint function identifier: {identifier}" + ) diff --git a/keras/constraints_test.py b/keras/constraints_test.py index a7c0ba06608a..b0fdb95b4367 100644 --- a/keras/constraints_test.py +++ b/keras/constraints_test.py @@ -14,98 +14,106 @@ # ============================================================================== """Tests for Keras weights constraints.""" -import tensorflow.compat.v2 as tf - import math import numpy as np +import tensorflow.compat.v2 as tf from keras import backend -from keras.testing_infra import test_combinations from keras import constraints +from keras.testing_infra import test_combinations def get_test_values(): - return [0.1, 0.5, 3, 8, 1e-7] + return [0.1, 0.5, 3, 8, 1e-7] def get_example_array(): - np.random.seed(3537) - example_array = np.random.random((100, 100)) * 100. - 50. - example_array[0, 0] = 0. 
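For orientation, the reworked `serialize`/`deserialize`/`get` triple above is meant to support the following round trip, sketched here along the lines of `test_serialization` below (assumes a TF-Keras environment where `keras.constraints` imports):

```python
from keras import constraints

# `get` accepts a string, a config dict, or a callable identifier.
fn = constraints.get("unit_norm")
assert isinstance(fn, constraints.UnitNorm)

# Round-trip: instance -> config dict -> instance of the same class.
config = constraints.serialize(fn)
restored = constraints.deserialize(config)
assert restored.__class__ == fn.__class__
```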
# 0 could possibly cause trouble - return example_array + np.random.seed(3537) + example_array = np.random.random((100, 100)) * 100.0 - 50.0 + example_array[0, 0] = 0.0 # 0 could possibly cause trouble + return example_array def get_example_kernel(width): - np.random.seed(3537) - example_array = np.random.rand(width, width, 2, 2) - return example_array + np.random.seed(3537) + example_array = np.random.rand(width, width, 2, 2) + return example_array -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class KerasConstraintsTest(tf.test.TestCase): - - def test_serialization(self): - all_activations = ['max_norm', 'non_neg', - 'unit_norm', 'min_max_norm'] - for name in all_activations: - fn = constraints.get(name) - ref_fn = getattr(constraints, name)() - assert fn.__class__ == ref_fn.__class__ - config = constraints.serialize(fn) - fn = constraints.deserialize(config) - assert fn.__class__ == ref_fn.__class__ - - def test_max_norm(self): - array = get_example_array() - for m in get_test_values(): - norm_instance = constraints.max_norm(m) - normed = norm_instance(backend.variable(array)) - assert np.all(backend.eval(normed) < m) - - # a more explicit example - norm_instance = constraints.max_norm(2.0) - x = np.array([[0, 0, 0], [1.0, 0, 0], [3, 0, 0], [3, 3, 3]]).T - x_normed_target = np.array( - [[0, 0, 0], [1.0, 0, 0], [2.0, 0, 0], - [2. / np.sqrt(3), 2. / np.sqrt(3), 2. / np.sqrt(3)]]).T - x_normed_actual = backend.eval(norm_instance(backend.variable(x))) - self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05) - - def test_non_neg(self): - non_neg_instance = constraints.non_neg() - normed = non_neg_instance(backend.variable(get_example_array())) - assert np.all(np.min(backend.eval(normed), axis=1) == 0.) - - def test_unit_norm(self): - unit_norm_instance = constraints.unit_norm() - normalized = unit_norm_instance(backend.variable(get_example_array())) - norm_of_normalized = np.sqrt(np.sum(backend.eval(normalized)**2, axis=0)) - # In the unit norm constraint, it should be equal to 1. - difference = norm_of_normalized - 1. 
- largest_difference = np.max(np.abs(difference)) - assert np.abs(largest_difference) < 10e-5 - - def test_min_max_norm(self): - array = get_example_array() - for m in get_test_values(): - norm_instance = constraints.min_max_norm(min_value=m, max_value=m * 2) - normed = norm_instance(backend.variable(array)) - value = backend.eval(normed) - l2 = np.sqrt(np.sum(np.square(value), axis=0)) - assert not l2[l2 < m] - assert not l2[l2 > m * 2 + 1e-5] - - def test_conv2d_radial_constraint(self): - for width in (3, 4, 5, 6): - array = get_example_kernel(width) - norm_instance = constraints.radial_constraint() - normed = norm_instance(backend.variable(array)) - value = backend.eval(normed) - assert np.all(value.shape == array.shape) - assert np.all(value[0:, 0, 0, 0] == value[-1:, 0, 0, 0]) - assert len(set(value[..., 0, 0].flatten())) == math.ceil(float(width) / 2) - - -if __name__ == '__main__': - tf.test.main() + def test_serialization(self): + all_activations = ["max_norm", "non_neg", "unit_norm", "min_max_norm"] + for name in all_activations: + fn = constraints.get(name) + ref_fn = getattr(constraints, name)() + assert fn.__class__ == ref_fn.__class__ + config = constraints.serialize(fn) + fn = constraints.deserialize(config) + assert fn.__class__ == ref_fn.__class__ + + def test_max_norm(self): + array = get_example_array() + for m in get_test_values(): + norm_instance = constraints.max_norm(m) + normed = norm_instance(backend.variable(array)) + assert np.all(backend.eval(normed) < m) + + # a more explicit example + norm_instance = constraints.max_norm(2.0) + x = np.array([[0, 0, 0], [1.0, 0, 0], [3, 0, 0], [3, 3, 3]]).T + x_normed_target = np.array( + [ + [0, 0, 0], + [1.0, 0, 0], + [2.0, 0, 0], + [2.0 / np.sqrt(3), 2.0 / np.sqrt(3), 2.0 / np.sqrt(3)], + ] + ).T + x_normed_actual = backend.eval(norm_instance(backend.variable(x))) + self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05) + + def test_non_neg(self): + non_neg_instance = constraints.non_neg() + normed = non_neg_instance(backend.variable(get_example_array())) + assert np.all(np.min(backend.eval(normed), axis=1) == 0.0) + + def test_unit_norm(self): + unit_norm_instance = constraints.unit_norm() + normalized = unit_norm_instance(backend.variable(get_example_array())) + norm_of_normalized = np.sqrt( + np.sum(backend.eval(normalized) ** 2, axis=0) + ) + # In the unit norm constraint, it should be equal to 1. 
+ difference = norm_of_normalized - 1.0 + largest_difference = np.max(np.abs(difference)) + assert np.abs(largest_difference) < 10e-5 + + def test_min_max_norm(self): + array = get_example_array() + for m in get_test_values(): + norm_instance = constraints.min_max_norm( + min_value=m, max_value=m * 2 + ) + normed = norm_instance(backend.variable(array)) + value = backend.eval(normed) + l2 = np.sqrt(np.sum(np.square(value), axis=0)) + assert not l2[l2 < m] + assert not l2[l2 > m * 2 + 1e-5] + + def test_conv2d_radial_constraint(self): + for width in (3, 4, 5, 6): + array = get_example_kernel(width) + norm_instance = constraints.radial_constraint() + normed = norm_instance(backend.variable(array)) + value = backend.eval(normed) + assert np.all(value.shape == array.shape) + assert np.all(value[0:, 0, 0, 0] == value[-1:, 0, 0, 0]) + assert len(set(value[..., 0, 0].flatten())) == math.ceil( + float(width) / 2 + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/datasets/BUILD b/keras/datasets/BUILD index 06be216b3486..325aff5ed829 100644 --- a/keras/datasets/BUILD +++ b/keras/datasets/BUILD @@ -1,7 +1,10 @@ # Description: # Contains the Keras datasets package (internal TensorFlow version). +# Placeholder: load unaliased py_library + package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], diff --git a/keras/datasets/boston_housing.py b/keras/datasets/boston_housing.py index 64b6743ceb8f..08a31e34614b 100644 --- a/keras/datasets/boston_housing.py +++ b/keras/datasets/boston_housing.py @@ -17,60 +17,72 @@ import numpy as np from keras.utils.data_utils import get_file + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.datasets.boston_housing.load_data') -def load_data(path='boston_housing.npz', test_split=0.2, seed=113): - """Loads the Boston Housing dataset. +@keras_export("keras.datasets.boston_housing.load_data") +def load_data(path="boston_housing.npz", test_split=0.2, seed=113): + """Loads the Boston Housing dataset. + + This is a dataset taken from the StatLib library which is maintained at + Carnegie Mellon University. - This is a dataset taken from the StatLib library which is maintained at - Carnegie Mellon University. + **WARNING:** This dataset has an ethical problem: the authors of this + dataset included a variable, "B", that may appear to assume that racial + self-segregation influences house prices. As such, we strongly discourage + the use of this dataset, unless in the context of illustrating ethical + issues in data science and machine learning. - Samples contain 13 attributes of houses at different locations around the - Boston suburbs in the late 1970s. Targets are the median values of - the houses at a location (in k$). + Samples contain 13 attributes of houses at different locations around the + Boston suburbs in the late 1970s. Targets are the median values of + the houses at a location (in k$). - The attributes themselves are defined in the - [StatLib website](http://lib.stat.cmu.edu/datasets/boston). + The attributes themselves are defined in the + [StatLib website](http://lib.stat.cmu.edu/datasets/boston). - Args: - path: path where to cache the dataset locally - (relative to `~/.keras/datasets`). - test_split: fraction of the data to reserve as test set. - seed: Random seed for shuffling the data - before computing the test split. + Args: + path: path where to cache the dataset locally + (relative to `~/.keras/datasets`). 
+      test_split: fraction of the data to reserve as test set.
+      seed: Random seed for shuffling the data
+          before computing the test split.

-  Returns:
-    Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+    Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.

-  **x_train, x_test**: numpy arrays with shape `(num_samples, 13)`
-    containing either the training samples (for x_train),
-    or test samples (for y_train).
+    **x_train, x_test**: numpy arrays with shape `(num_samples, 13)`
+      containing either the training samples (for x_train),
+      or test samples (for x_test).

-  **y_train, y_test**: numpy arrays of shape `(num_samples,)` containing the
-    target scalars. The targets are float scalars typically between 10 and
-    50 that represent the home prices in k$.
-  """
-  assert 0 <= test_split < 1
-  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
-  path = get_file(
-      path,
-      origin=origin_folder + 'boston_housing.npz',
-      file_hash=
-      'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5')
-  with np.load(path, allow_pickle=True) as f:  # pylint: disable=unexpected-keyword-arg
-    x = f['x']
-    y = f['y']
+    **y_train, y_test**: numpy arrays of shape `(num_samples,)` containing the
+      target scalars. The targets are float scalars typically between 10 and
+      50 that represent the home prices in k$.
+    """
+    assert 0 <= test_split < 1
+    origin_folder = (
+        "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
+    )
+    path = get_file(
+        path,
+        origin=origin_folder + "boston_housing.npz",
+        file_hash=(  # noqa: E501
+            "f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5"
+        ),
+    )
+    with np.load(path, allow_pickle=True) as f:
+        x = f["x"]
+        y = f["y"]

-  rng = np.random.RandomState(seed)
-  indices = np.arange(len(x))
-  rng.shuffle(indices)
-  x = x[indices]
-  y = y[indices]
+    rng = np.random.RandomState(seed)
+    indices = np.arange(len(x))
+    rng.shuffle(indices)
+    x = x[indices]
+    y = y[indices]

-  x_train = np.array(x[:int(len(x) * (1 - test_split))])
-  y_train = np.array(y[:int(len(x) * (1 - test_split))])
-  x_test = np.array(x[int(len(x) * (1 - test_split)):])
-  y_test = np.array(y[int(len(x) * (1 - test_split)):])
-  return (x_train, y_train), (x_test, y_test)
+    x_train = np.array(x[: int(len(x) * (1 - test_split))])
+    y_train = np.array(y[: int(len(x) * (1 - test_split))])
+    x_test = np.array(x[int(len(x) * (1 - test_split)) :])
+    y_test = np.array(y[int(len(x) * (1 - test_split)) :])
+    return (x_train, y_train), (x_test, y_test)
diff --git a/keras/datasets/cifar.py b/keras/datasets/cifar.py
index af4f44bae89f..2d21d066a46d 100644
--- a/keras/datasets/cifar.py
+++ b/keras/datasets/cifar.py
@@ -17,26 +17,26 @@
 import _pickle as cPickle


-def load_batch(fpath, label_key='labels'):
-  """Internal utility for parsing CIFAR data.
+def load_batch(fpath, label_key="labels"):
+    """Internal utility for parsing CIFAR data.

-  Args:
-    fpath: path the file to parse.
-    label_key: key for label data in the retrieve
-      dictionary.
+    Args:
+      fpath: path to the file to parse.
+      label_key: key for label data in the retrieved
+        dictionary.

-  Returns:
-    A tuple `(data, labels)`.
-  """
-  with open(fpath, 'rb') as f:
-    d = cPickle.load(f, encoding='bytes')
-    # decode utf8
-    d_decoded = {}
-    for k, v in d.items():
-      d_decoded[k.decode('utf8')] = v
-    d = d_decoded
-  data = d['data']
-  labels = d[label_key]
+    Returns:
+      A tuple `(data, labels)`.
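To make the `load_batch` contract above concrete: each row of the unpickled `data` array is a flattened 3x32x32 image stored channel-planar (1024 red bytes, then 1024 green, then 1024 blue), which is why the reshape in the implementation that follows is channels-first. A toy stand-in, not the real batch file:

```python
import numpy as np

# Two fake CIFAR rows of 3072 bytes each, standing in for d["data"].
data = np.arange(2 * 3072).astype("uint8").reshape(2, 3072)
images = data.reshape(data.shape[0], 3, 32, 32)  # same reshape as load_batch
assert images.shape == (2, 3, 32, 32)
assert images[0, 0, 0, 0] == data[0, 0]  # the red plane comes first
```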
+ """ + with open(fpath, "rb") as f: + d = cPickle.load(f, encoding="bytes") + # decode utf8 + d_decoded = {} + for k, v in d.items(): + d_decoded[k.decode("utf8")] = v + d = d_decoded + data = d["data"] + labels = d[label_key] - data = data.reshape(data.shape[0], 3, 32, 32) - return data, labels + data = data.reshape(data.shape[0], 3, 32, 32) + return data, labels diff --git a/keras/datasets/cifar10.py b/keras/datasets/cifar10.py index 92919f80c89f..5131d2a69f54 100644 --- a/keras/datasets/cifar10.py +++ b/keras/datasets/cifar10.py @@ -21,89 +21,95 @@ from keras import backend from keras.datasets.cifar import load_batch from keras.utils.data_utils import get_file + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.datasets.cifar10.load_data') +@keras_export("keras.datasets.cifar10.load_data") def load_data(): - """Loads the CIFAR10 dataset. - - This is a dataset of 50,000 32x32 color training images and 10,000 test - images, labeled over 10 categories. See more info at the - [CIFAR homepage](https://www.cs.toronto.edu/~kriz/cifar.html). - - The classes are: - - | Label | Description | - |:-----:|-------------| - | 0 | airplane | - | 1 | automobile | - | 2 | bird | - | 3 | cat | - | 4 | deer | - | 5 | dog | - | 6 | frog | - | 7 | horse | - | 8 | ship | - | 9 | truck | - - Returns: - Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`. - - **x_train**: uint8 NumPy array of grayscale image data with shapes - `(50000, 32, 32, 3)`, containing the training data. Pixel values range - from 0 to 255. - - **y_train**: uint8 NumPy array of labels (integers in range 0-9) - with shape `(50000, 1)` for the training data. - - **x_test**: uint8 NumPy array of grayscale image data with shapes - `(10000, 32, 32, 3)`, containing the test data. Pixel values range - from 0 to 255. - - **y_test**: uint8 NumPy array of labels (integers in range 0-9) - with shape `(10000, 1)` for the test data. - - Example: - - ```python - (x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() - assert x_train.shape == (50000, 32, 32, 3) - assert x_test.shape == (10000, 32, 32, 3) - assert y_train.shape == (50000, 1) - assert y_test.shape == (10000, 1) - ``` - """ - dirname = 'cifar-10-batches-py' - origin = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' - path = get_file( - dirname, - origin=origin, - untar=True, - file_hash= - '6d958be074577803d12ecdefd02955f39262c83c16fe9348329d7fe0b5c001ce') - - num_train_samples = 50000 - - x_train = np.empty((num_train_samples, 3, 32, 32), dtype='uint8') - y_train = np.empty((num_train_samples,), dtype='uint8') - - for i in range(1, 6): - fpath = os.path.join(path, 'data_batch_' + str(i)) - (x_train[(i - 1) * 10000:i * 10000, :, :, :], - y_train[(i - 1) * 10000:i * 10000]) = load_batch(fpath) - - fpath = os.path.join(path, 'test_batch') - x_test, y_test = load_batch(fpath) - - y_train = np.reshape(y_train, (len(y_train), 1)) - y_test = np.reshape(y_test, (len(y_test), 1)) - - if backend.image_data_format() == 'channels_last': - x_train = x_train.transpose(0, 2, 3, 1) - x_test = x_test.transpose(0, 2, 3, 1) - - x_test = x_test.astype(x_train.dtype) - y_test = y_test.astype(y_train.dtype) - - return (x_train, y_train), (x_test, y_test) + """Loads the CIFAR10 dataset. + + This is a dataset of 50,000 32x32 color training images and 10,000 test + images, labeled over 10 categories. See more info at the + [CIFAR homepage](https://www.cs.toronto.edu/~kriz/cifar.html). 
+ + The classes are: + + | Label | Description | + |:-----:|-------------| + | 0 | airplane | + | 1 | automobile | + | 2 | bird | + | 3 | cat | + | 4 | deer | + | 5 | dog | + | 6 | frog | + | 7 | horse | + | 8 | ship | + | 9 | truck | + + Returns: + Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`. + + **x_train**: uint8 NumPy array of image data with shapes + `(50000, 32, 32, 3)`, containing the training data. Pixel values range + from 0 to 255. + + **y_train**: uint8 NumPy array of labels (integers in range 0-9) + with shape `(50000, 1)` for the training data. + + **x_test**: uint8 NumPy array of image data with shapes + `(10000, 32, 32, 3)`, containing the test data. Pixel values range + from 0 to 255. + + **y_test**: uint8 NumPy array of labels (integers in range 0-9) + with shape `(10000, 1)` for the test data. + + Example: + + ```python + (x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() + assert x_train.shape == (50000, 32, 32, 3) + assert x_test.shape == (10000, 32, 32, 3) + assert y_train.shape == (50000, 1) + assert y_test.shape == (10000, 1) + ``` + """ + dirname = "cifar-10-batches-py" + origin = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" + path = get_file( + dirname, + origin=origin, + untar=True, + file_hash=( # noqa: E501 + "6d958be074577803d12ecdefd02955f39262c83c16fe9348329d7fe0b5c001ce" + ), + ) + + num_train_samples = 50000 + + x_train = np.empty((num_train_samples, 3, 32, 32), dtype="uint8") + y_train = np.empty((num_train_samples,), dtype="uint8") + + for i in range(1, 6): + fpath = os.path.join(path, "data_batch_" + str(i)) + ( + x_train[(i - 1) * 10000 : i * 10000, :, :, :], + y_train[(i - 1) * 10000 : i * 10000], + ) = load_batch(fpath) + + fpath = os.path.join(path, "test_batch") + x_test, y_test = load_batch(fpath) + + y_train = np.reshape(y_train, (len(y_train), 1)) + y_test = np.reshape(y_test, (len(y_test), 1)) + + if backend.image_data_format() == "channels_last": + x_train = x_train.transpose(0, 2, 3, 1) + x_test = x_test.transpose(0, 2, 3, 1) + + x_test = x_test.astype(x_train.dtype) + y_test = y_test.astype(y_train.dtype) + + return (x_train, y_train), (x_test, y_test) diff --git a/keras/datasets/cifar100.py b/keras/datasets/cifar100.py index b7f24ebfda82..e910b0051884 100644 --- a/keras/datasets/cifar100.py +++ b/keras/datasets/cifar100.py @@ -21,74 +21,80 @@ from keras import backend from keras.datasets.cifar import load_batch from keras.utils.data_utils import get_file -from tensorflow.python.util.tf_export import keras_export - - -@keras_export('keras.datasets.cifar100.load_data') -def load_data(label_mode='fine'): - """Loads the CIFAR100 dataset. - - This is a dataset of 50,000 32x32 color training images and - 10,000 test images, labeled over 100 fine-grained classes that are - grouped into 20 coarse-grained classes. See more info at the - [CIFAR homepage](https://www.cs.toronto.edu/~kriz/cifar.html). - - Args: - label_mode: one of "fine", "coarse". If it is "fine" the category labels - are the fine-grained labels, if it is "coarse" the output labels are the - coarse-grained superclasses. - - Returns: - Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`. - - **x_train**: uint8 NumPy array of grayscale image data with shapes - `(50000, 32, 32, 3)`, containing the training data. Pixel values range - from 0 to 255. - - **y_train**: uint8 NumPy array of labels (integers in range 0-99) - with shape `(50000, 1)` for the training data. 
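The tail of `load_data` above converts the channels-first batches to match the active `image_data_format()`. That conversion is just an axis permutation; a minimal NumPy sketch:

```python
import numpy as np

x = np.zeros((2, 3, 32, 32), dtype="uint8")  # (N, C, H, W) as stored on disk
x_nhwc = x.transpose(0, 2, 3, 1)             # (N, H, W, C) for channels_last
assert x_nhwc.shape == (2, 32, 32, 3)
```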
-    **x_test**: uint8 NumPy array of grayscale image data with shapes
-      `(10000, 32, 32, 3)`, containing the test data. Pixel values range
-      from 0 to 255.
-
-    **y_test**: uint8 NumPy array of labels (integers in range 0-99)
-      with shape `(10000, 1)` for the test data.
-
-  Example:
-
-  ```python
-  (x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()
-  assert x_train.shape == (50000, 32, 32, 3)
-  assert x_test.shape == (10000, 32, 32, 3)
-  assert y_train.shape == (50000, 1)
-  assert y_test.shape == (10000, 1)
-  ```
-  """
-  if label_mode not in ['fine', 'coarse']:
-    raise ValueError('`label_mode` must be one of `"fine"`, `"coarse"`. '
-                     f'Received: label_mode={label_mode}.')
-
-  dirname = 'cifar-100-python'
-  origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
-  path = get_file(
-      dirname,
-      origin=origin,
-      untar=True,
-      file_hash=
-      '85cd44d02ba6437773c5bbd22e183051d648de2e7d6b014e1ef29b855ba677a7')
-
-  fpath = os.path.join(path, 'train')
-  x_train, y_train = load_batch(fpath, label_key=label_mode + '_labels')
-
-  fpath = os.path.join(path, 'test')
-  x_test, y_test = load_batch(fpath, label_key=label_mode + '_labels')
-
-  y_train = np.reshape(y_train, (len(y_train), 1))
-  y_test = np.reshape(y_test, (len(y_test), 1))
+# isort: off
+from tensorflow.python.util.tf_export import keras_export

-  if backend.image_data_format() == 'channels_last':
-    x_train = x_train.transpose(0, 2, 3, 1)
-    x_test = x_test.transpose(0, 2, 3, 1)

-  return (x_train, y_train), (x_test, y_test)

+@keras_export("keras.datasets.cifar100.load_data")
+def load_data(label_mode="fine"):
+    """Loads the CIFAR100 dataset.
+
+    This is a dataset of 50,000 32x32 color training images and
+    10,000 test images, labeled over 100 fine-grained classes that are
+    grouped into 20 coarse-grained classes. See more info at the
+    [CIFAR homepage](https://www.cs.toronto.edu/~kriz/cifar.html).
+
+    Args:
+      label_mode: one of "fine", "coarse". If it is "fine", the category labels
+        are the fine-grained labels; if it is "coarse", the output labels are
+        the coarse-grained superclasses.
+
+    Returns:
+      Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+    **x_train**: uint8 NumPy array of image data with shapes
+      `(50000, 32, 32, 3)`, containing the training data. Pixel values range
+      from 0 to 255.
+
+    **y_train**: uint8 NumPy array of labels (integers in range 0-99)
+      with shape `(50000, 1)` for the training data.
+
+    **x_test**: uint8 NumPy array of image data with shapes
+      `(10000, 32, 32, 3)`, containing the test data. Pixel values range
+      from 0 to 255.
+
+    **y_test**: uint8 NumPy array of labels (integers in range 0-99)
+      with shape `(10000, 1)` for the test data.
+
+    Example:
+
+    ```python
+    (x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()
+    assert x_train.shape == (50000, 32, 32, 3)
+    assert x_test.shape == (10000, 32, 32, 3)
+    assert y_train.shape == (50000, 1)
+    assert y_test.shape == (10000, 1)
+    ```
+    """
+    if label_mode not in ["fine", "coarse"]:
+        raise ValueError(
+            '`label_mode` must be one of `"fine"`, `"coarse"`. '
+            f"Received: label_mode={label_mode}."
+ ) + + dirname = "cifar-100-python" + origin = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" + path = get_file( + dirname, + origin=origin, + untar=True, + file_hash=( # noqa: E501 + "85cd44d02ba6437773c5bbd22e183051d648de2e7d6b014e1ef29b855ba677a7" + ), + ) + + fpath = os.path.join(path, "train") + x_train, y_train = load_batch(fpath, label_key=label_mode + "_labels") + + fpath = os.path.join(path, "test") + x_test, y_test = load_batch(fpath, label_key=label_mode + "_labels") + + y_train = np.reshape(y_train, (len(y_train), 1)) + y_test = np.reshape(y_test, (len(y_test), 1)) + + if backend.image_data_format() == "channels_last": + x_train = x_train.transpose(0, 2, 3, 1) + x_test = x_test.transpose(0, 2, 3, 1) + + return (x_train, y_train), (x_test, y_test) diff --git a/keras/datasets/fashion_mnist.py b/keras/datasets/fashion_mnist.py index adbba99cd7ec..e7d64ebef178 100644 --- a/keras/datasets/fashion_mnist.py +++ b/keras/datasets/fashion_mnist.py @@ -20,86 +20,92 @@ import numpy as np from keras.utils.data_utils import get_file + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.datasets.fashion_mnist.load_data') +@keras_export("keras.datasets.fashion_mnist.load_data") def load_data(): - """Loads the Fashion-MNIST dataset. - - This is a dataset of 60,000 28x28 grayscale images of 10 fashion categories, - along with a test set of 10,000 images. This dataset can be used as - a drop-in replacement for MNIST. - - The classes are: - - | Label | Description | - |:-----:|-------------| - | 0 | T-shirt/top | - | 1 | Trouser | - | 2 | Pullover | - | 3 | Dress | - | 4 | Coat | - | 5 | Sandal | - | 6 | Shirt | - | 7 | Sneaker | - | 8 | Bag | - | 9 | Ankle boot | - - Returns: - Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`. - - **x_train**: uint8 NumPy array of grayscale image data with shapes - `(60000, 28, 28)`, containing the training data. - - **y_train**: uint8 NumPy array of labels (integers in range 0-9) - with shape `(60000,)` for the training data. - - **x_test**: uint8 NumPy array of grayscale image data with shapes - (10000, 28, 28), containing the test data. - - **y_test**: uint8 NumPy array of labels (integers in range 0-9) - with shape `(10000,)` for the test data. - - Example: - - ```python - (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data() - assert x_train.shape == (60000, 28, 28) - assert x_test.shape == (10000, 28, 28) - assert y_train.shape == (60000,) - assert y_test.shape == (10000,) - ``` - - License: - The copyright for Fashion-MNIST is held by Zalando SE. - Fashion-MNIST is licensed under the [MIT license]( - https://github.com/zalandoresearch/fashion-mnist/blob/master/LICENSE). 
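A usage sketch of the `label_mode` switch handled in `cifar100.load_data` above; this fetches the dataset over the network on first call, so it is illustrative rather than a unit test:

```python
from keras.datasets import cifar100

# "fine" yields the 100 class labels, "coarse" the 20 superclass labels.
(_, y_fine), _ = cifar100.load_data(label_mode="fine")
(_, y_coarse), _ = cifar100.load_data(label_mode="coarse")
assert y_fine.max() == 99 and y_coarse.max() == 19
```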
- - """ - dirname = os.path.join('datasets', 'fashion-mnist') - base = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/' - files = [ - 'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz', - 't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz' - ] - - paths = [] - for fname in files: - paths.append(get_file(fname, origin=base + fname, cache_subdir=dirname)) - - with gzip.open(paths[0], 'rb') as lbpath: - y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8) - - with gzip.open(paths[1], 'rb') as imgpath: - x_train = np.frombuffer( - imgpath.read(), np.uint8, offset=16).reshape(len(y_train), 28, 28) - - with gzip.open(paths[2], 'rb') as lbpath: - y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8) - - with gzip.open(paths[3], 'rb') as imgpath: - x_test = np.frombuffer( - imgpath.read(), np.uint8, offset=16).reshape(len(y_test), 28, 28) - - return (x_train, y_train), (x_test, y_test) + """Loads the Fashion-MNIST dataset. + + This is a dataset of 60,000 28x28 grayscale images of 10 fashion categories, + along with a test set of 10,000 images. This dataset can be used as + a drop-in replacement for MNIST. + + The classes are: + + | Label | Description | + |:-----:|-------------| + | 0 | T-shirt/top | + | 1 | Trouser | + | 2 | Pullover | + | 3 | Dress | + | 4 | Coat | + | 5 | Sandal | + | 6 | Shirt | + | 7 | Sneaker | + | 8 | Bag | + | 9 | Ankle boot | + + Returns: + Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`. + + **x_train**: uint8 NumPy array of grayscale image data with shapes + `(60000, 28, 28)`, containing the training data. + + **y_train**: uint8 NumPy array of labels (integers in range 0-9) + with shape `(60000,)` for the training data. + + **x_test**: uint8 NumPy array of grayscale image data with shapes + (10000, 28, 28), containing the test data. + + **y_test**: uint8 NumPy array of labels (integers in range 0-9) + with shape `(10000,)` for the test data. + + Example: + + ```python + (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data() + assert x_train.shape == (60000, 28, 28) + assert x_test.shape == (10000, 28, 28) + assert y_train.shape == (60000,) + assert y_test.shape == (10000,) + ``` + + License: + The copyright for Fashion-MNIST is held by Zalando SE. + Fashion-MNIST is licensed under the [MIT license]( + https://github.com/zalandoresearch/fashion-mnist/blob/master/LICENSE). 
+ + """ + dirname = os.path.join("datasets", "fashion-mnist") + base = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/" + files = [ + "train-labels-idx1-ubyte.gz", + "train-images-idx3-ubyte.gz", + "t10k-labels-idx1-ubyte.gz", + "t10k-images-idx3-ubyte.gz", + ] + + paths = [] + for fname in files: + paths.append(get_file(fname, origin=base + fname, cache_subdir=dirname)) + + with gzip.open(paths[0], "rb") as lbpath: + y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8) + + with gzip.open(paths[1], "rb") as imgpath: + x_train = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape( + len(y_train), 28, 28 + ) + + with gzip.open(paths[2], "rb") as lbpath: + y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8) + + with gzip.open(paths[3], "rb") as imgpath: + x_test = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape( + len(y_test), 28, 28 + ) + + return (x_train, y_train), (x_test, y_test) diff --git a/keras/datasets/imdb.py b/keras/datasets/imdb.py index a90764bf8507..1e61771ad79b 100644 --- a/keras/datasets/imdb.py +++ b/keras/datasets/imdb.py @@ -20,169 +20,198 @@ from keras.preprocessing.sequence import _remove_long_seq from keras.utils.data_utils import get_file + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.datasets.imdb.load_data') -def load_data(path='imdb.npz', - num_words=None, - skip_top=0, - maxlen=None, - seed=113, - start_char=1, - oov_char=2, - index_from=3, - **kwargs): - """Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). - - This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment - (positive/negative). Reviews have been preprocessed, and each review is - encoded as a list of word indexes (integers). - For convenience, words are indexed by overall frequency in the dataset, - so that for instance the integer "3" encodes the 3rd most frequent word in - the data. This allows for quick filtering operations such as: - "only consider the top 10,000 most - common words, but eliminate the top 20 most common words". - - As a convention, "0" does not stand for a specific word, but instead is used - to encode any unknown word. - - Args: - path: where to cache the data (relative to `~/.keras/dataset`). - num_words: integer or None. Words are - ranked by how often they occur (in the training set) and only - the `num_words` most frequent words are kept. Any less frequent word - will appear as `oov_char` value in the sequence data. If None, - all words are kept. Defaults to None, so all words are kept. - skip_top: skip the top N most frequently occurring words - (which may not be informative). These words will appear as - `oov_char` value in the dataset. Defaults to 0, so no words are - skipped. - maxlen: int or None. Maximum sequence length. - Any longer sequence will be truncated. Defaults to None, which - means no truncation. - seed: int. Seed for reproducible data shuffling. - start_char: int. The start of a sequence will be marked with this - character. Defaults to 1 because 0 is usually the padding character. - oov_char: int. The out-of-vocabulary character. - Words that were cut out because of the `num_words` or - `skip_top` limits will be replaced with this character. - index_from: int. Index actual words with this index and higher. - **kwargs: Used for backwards compatibility. - - Returns: - Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. 
- - **x_train, x_test**: lists of sequences, which are lists of indexes - (integers). If the num_words argument was specific, the maximum - possible index value is `num_words - 1`. If the `maxlen` argument was - specified, the largest possible sequence length is `maxlen`. - - **y_train, y_test**: lists of integer labels (1 or 0). - - Raises: - ValueError: in case `maxlen` is so low - that no input sequence could be kept. - - Note that the 'out of vocabulary' character is only used for - words that were present in the training set but are not included - because they're not making the `num_words` cut here. - Words that were not seen in the training set but are in the test set - have simply been skipped. - """ - # Legacy support - if 'nb_words' in kwargs: - logging.warning('The `nb_words` argument in `load_data` ' - 'has been renamed `num_words`.') - num_words = kwargs.pop('nb_words') - if kwargs: - raise TypeError(f'Unrecognized keyword arguments: {str(kwargs)}.') - - origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/' - path = get_file( - path, - origin=origin_folder + 'imdb.npz', - file_hash= - '69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f') - with np.load(path, allow_pickle=True) as f: # pylint: disable=unexpected-keyword-arg - x_train, labels_train = f['x_train'], f['y_train'] - x_test, labels_test = f['x_test'], f['y_test'] - - rng = np.random.RandomState(seed) - indices = np.arange(len(x_train)) - rng.shuffle(indices) - x_train = x_train[indices] - labels_train = labels_train[indices] - - indices = np.arange(len(x_test)) - rng.shuffle(indices) - x_test = x_test[indices] - labels_test = labels_test[indices] - - if start_char is not None: - x_train = [[start_char] + [w + index_from for w in x] for x in x_train] - x_test = [[start_char] + [w + index_from for w in x] for x in x_test] - elif index_from: - x_train = [[w + index_from for w in x] for x in x_train] - x_test = [[w + index_from for w in x] for x in x_test] - - if maxlen: - x_train, labels_train = _remove_long_seq(maxlen, x_train, labels_train) - x_test, labels_test = _remove_long_seq(maxlen, x_test, labels_test) - if not x_train or not x_test: - raise ValueError('After filtering for sequences shorter than maxlen=' - f'{str(maxlen)}, no sequence was kept. Increase maxlen.') - - xs = x_train + x_test - labels = np.concatenate([labels_train, labels_test]) - - if not num_words: - num_words = max(max(x) for x in xs) - - # by convention, use 2 as OOV word - # reserve 'index_from' (=3 by default) characters: - # 0 (padding), 1 (start), 2 (OOV) - if oov_char is not None: - xs = [ - [w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs - ] - else: - xs = [[w for w in x if skip_top <= w < num_words] for x in xs] - - idx = len(x_train) - x_train, y_train = np.array(xs[:idx], dtype='object'), labels[:idx] - x_test, y_test = np.array(xs[idx:], dtype='object'), labels[idx:] - return (x_train, y_train), (x_test, y_test) - - -@keras_export('keras.datasets.imdb.get_word_index') -def get_word_index(path='imdb_word_index.json'): - """Retrieves a dict mapping words to their index in the IMDB dataset. - - Args: +@keras_export("keras.datasets.imdb.load_data") +def load_data( + path="imdb.npz", + num_words=None, + skip_top=0, + maxlen=None, + seed=113, + start_char=1, + oov_char=2, + index_from=3, + **kwargs, +): + """Loads the [IMDB dataset](https://ai.stanford.edu/~amaas/data/sentiment/). 
+
+    This is a dataset of 25,000 movie reviews from IMDB, labeled by sentiment
+    (positive/negative). Reviews have been preprocessed, and each review is
+    encoded as a list of word indexes (integers).
+    For convenience, words are indexed by overall frequency in the dataset,
+    so that for instance the integer "3" encodes the 3rd most frequent word in
+    the data. This allows for quick filtering operations such as:
+    "only consider the top 10,000 most
+    common words, but eliminate the top 20 most common words".
+
+    As a convention, "0" does not stand for a specific word, but instead is used
+    to encode the pad token.
+
+    Args:
+      path: where to cache the data (relative to `~/.keras/dataset`).
+      num_words: integer or None. Words are
+        ranked by how often they occur (in the training set) and only
+        the `num_words` most frequent words are kept. Any less frequent word
+        will appear as `oov_char` value in the sequence data. If None,
+        all words are kept. Defaults to `None`.
+      skip_top: skip the top N most frequently occurring words
+        (which may not be informative). These words will appear as
+        `oov_char` value in the dataset. When 0, no words are
+        skipped. Defaults to `0`.
+      maxlen: int or None. Maximum sequence length.
+        Any longer sequence will be truncated. None means no truncation.
+        Defaults to `None`.
+      seed: int. Seed for reproducible data shuffling.
+      start_char: int. The start of a sequence will be marked with this
+        character. 0 is usually the padding character. Defaults to `1`.
+      oov_char: int. The out-of-vocabulary character.
+        Words that were cut out because of the `num_words` or
+        `skip_top` limits will be replaced with this character.
+      index_from: int. Index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
+
+    Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+      **x_train, x_test**: lists of sequences, which are lists of indexes
+      (integers). If the num_words argument was specified, the maximum
+      possible index value is `num_words - 1`. If the `maxlen` argument was
+      specified, the largest possible sequence length is `maxlen`.
+
+      **y_train, y_test**: lists of integer labels (1 or 0).
+
+    Raises:
+      ValueError: in case `maxlen` is so low
+        that no input sequence could be kept.
+
+    Note that the 'out of vocabulary' character is only used for
+    words that were present in the training set but are not included
+    because they did not make the `num_words` cut here.
+    Words that were not seen in the training set but are in the test set
+    have simply been skipped.
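The "top 10,000 but not the top 20" filtering described above reduces to a single comprehension over frequency-rank indices. A plain-Python sketch of the rule the loader applies further down (ranks are the already shifted indices):

```python
oov_char, skip_top, num_words = 2, 20, 10000

# Keep ranks in [skip_top, num_words); everything else becomes oov_char.
sequence = [1, 5, 19, 20, 9999, 10000, 54321]
filtered = [w if skip_top <= w < num_words else oov_char for w in sequence]
assert filtered == [2, 2, 2, 20, 9999, 2, 2]
```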
+ """ + # Legacy support + if "nb_words" in kwargs: + logging.warning( + "The `nb_words` argument in `load_data` " + "has been renamed `num_words`." + ) + num_words = kwargs.pop("nb_words") + if kwargs: + raise TypeError(f"Unrecognized keyword arguments: {str(kwargs)}.") + + origin_folder = ( + "https://storage.googleapis.com/tensorflow/tf-keras-datasets/" + ) + path = get_file( + path, + origin=origin_folder + "imdb.npz", + file_hash=( # noqa: E501 + "69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f" + ), + ) + with np.load(path, allow_pickle=True) as f: + x_train, labels_train = f["x_train"], f["y_train"] + x_test, labels_test = f["x_test"], f["y_test"] + + rng = np.random.RandomState(seed) + indices = np.arange(len(x_train)) + rng.shuffle(indices) + x_train = x_train[indices] + labels_train = labels_train[indices] + + indices = np.arange(len(x_test)) + rng.shuffle(indices) + x_test = x_test[indices] + labels_test = labels_test[indices] + + if start_char is not None: + x_train = [[start_char] + [w + index_from for w in x] for x in x_train] + x_test = [[start_char] + [w + index_from for w in x] for x in x_test] + elif index_from: + x_train = [[w + index_from for w in x] for x in x_train] + x_test = [[w + index_from for w in x] for x in x_test] + + if maxlen: + x_train, labels_train = _remove_long_seq(maxlen, x_train, labels_train) + x_test, labels_test = _remove_long_seq(maxlen, x_test, labels_test) + if not x_train or not x_test: + raise ValueError( + "After filtering for sequences shorter than maxlen=" + f"{str(maxlen)}, no sequence was kept. Increase maxlen." + ) + + xs = x_train + x_test + labels = np.concatenate([labels_train, labels_test]) + + if not num_words: + num_words = max(max(x) for x in xs) + + # by convention, use 2 as OOV word + # reserve 'index_from' (=3 by default) characters: + # 0 (padding), 1 (start), 2 (OOV) + if oov_char is not None: + xs = [ + [w if (skip_top <= w < num_words) else oov_char for w in x] + for x in xs + ] + else: + xs = [[w for w in x if skip_top <= w < num_words] for x in xs] + + idx = len(x_train) + x_train, y_train = np.array(xs[:idx], dtype="object"), labels[:idx] + x_test, y_test = np.array(xs[idx:], dtype="object"), labels[idx:] + return (x_train, y_train), (x_test, y_test) + + +@keras_export("keras.datasets.imdb.get_word_index") +def get_word_index(path="imdb_word_index.json"): + """Retrieves a dict mapping words to their index in the IMDB dataset. + + Args: + path: where to cache the data (relative to `~/.keras/dataset`). + + Returns: + The word index dictionary. Keys are word strings, values are their + index. + + Example: + + ```python + # Use the default parameters to keras.datasets.imdb.load_data + start_char = 1 + oov_char = 2 + index_from = 3 + # Retrieve the training sequences. 
+ (x_train, _), _ = keras.datasets.imdb.load_data( + start_char=start_char, oov_char=oov_char, index_from=index_from + ) + # Retrieve the word index file mapping words to indices + word_index = keras.datasets.imdb.get_word_index() + # Reverse the word index to obtain a dict mapping indices to words + # And add `index_from` to indices to sync with `x_train` + inverted_word_index = dict( + (i + index_from, word) for (word, i) in word_index.items() + ) + # Update `inverted_word_index` to include `start_char` and `oov_char` + inverted_word_index[start_char] = "[START]" + inverted_word_index[oov_char] = "[OOV]" + # Decode the first sequence in the dataset + decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0]) + ``` + """ + origin_folder = ( + "https://storage.googleapis.com/tensorflow/tf-keras-datasets/" + ) + path = get_file( + path, + origin=origin_folder + "imdb_word_index.json", + file_hash="bfafd718b763782e994055a2d397834f", + ) + with open(path) as f: + return json.load(f) diff --git a/keras/datasets/mnist.py b/keras/datasets/mnist.py index 1bd4349fdf1b..a145d167affa 100644 --- a/keras/datasets/mnist.py +++ b/keras/datasets/mnist.py @@ -17,64 +17,70 @@ import numpy as np from keras.utils.data_utils import get_file + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.datasets.mnist.load_data') -def load_data(path='mnist.npz'): - """Loads the MNIST dataset. - - This is a dataset of 60,000 28x28 grayscale images of the 10 digits, - along with a test set of 10,000 images. - More info can be found at the - [MNIST homepage](http://yann.lecun.com/exdb/mnist/). - - Args: - path: path where to cache the dataset locally - (relative to `~/.keras/datasets`). - - Returns: - Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`. - - **x_train**: uint8 NumPy array of grayscale image data with shapes - `(60000, 28, 28)`, containing the training data. Pixel values range - from 0 to 255. - - **y_train**: uint8 NumPy array of digit labels (integers in range 0-9) - with shape `(60000,)` for the training data. - - **x_test**: uint8 NumPy array of grayscale image data with shapes - (10000, 28, 28), containing the test data. Pixel values range - from 0 to 255. - - **y_test**: uint8 NumPy array of digit labels (integers in range 0-9) - with shape `(10000,)` for the test data. - - Example: - - ```python - (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() - assert x_train.shape == (60000, 28, 28) - assert x_test.shape == (10000, 28, 28) - assert y_train.shape == (60000,) - assert y_test.shape == (10000,) - ``` - - License: - Yann LeCun and Corinna Cortes hold the copyright of MNIST dataset, - which is a derivative work from original NIST datasets. - MNIST dataset is made available under the terms of the - [Creative Commons Attribution-Share Alike 3.0 license.]( - https://creativecommons.org/licenses/by-sa/3.0/) - """ - origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/' - path = get_file( - path, - origin=origin_folder + 'mnist.npz', - file_hash= - '731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1') - with np.load(path, allow_pickle=True) as f: # pylint: disable=unexpected-keyword-arg - x_train, y_train = f['x_train'], f['y_train'] - x_test, y_test = f['x_test'], f['y_test'] - - return (x_train, y_train), (x_test, y_test) +@keras_export("keras.datasets.mnist.load_data") +def load_data(path="mnist.npz"): + """Loads the MNIST dataset. 
+ + This is a dataset of 60,000 28x28 grayscale images of the 10 digits, + along with a test set of 10,000 images. + More info can be found at the + [MNIST homepage](http://yann.lecun.com/exdb/mnist/). + + Args: + path: path where to cache the dataset locally + (relative to `~/.keras/datasets`). + + Returns: + Tuple of NumPy arrays: `(x_train, y_train), (x_test, y_test)`. + + **x_train**: uint8 NumPy array of grayscale image data with shapes + `(60000, 28, 28)`, containing the training data. Pixel values range + from 0 to 255. + + **y_train**: uint8 NumPy array of digit labels (integers in range 0-9) + with shape `(60000,)` for the training data. + + **x_test**: uint8 NumPy array of grayscale image data with shapes + (10000, 28, 28), containing the test data. Pixel values range + from 0 to 255. + + **y_test**: uint8 NumPy array of digit labels (integers in range 0-9) + with shape `(10000,)` for the test data. + + Example: + + ```python + (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() + assert x_train.shape == (60000, 28, 28) + assert x_test.shape == (10000, 28, 28) + assert y_train.shape == (60000,) + assert y_test.shape == (10000,) + ``` + + License: + Yann LeCun and Corinna Cortes hold the copyright of MNIST dataset, + which is a derivative work from original NIST datasets. + MNIST dataset is made available under the terms of the + [Creative Commons Attribution-Share Alike 3.0 license.]( + https://creativecommons.org/licenses/by-sa/3.0/) + """ + origin_folder = ( + "https://storage.googleapis.com/tensorflow/tf-keras-datasets/" + ) + path = get_file( + path, + origin=origin_folder + "mnist.npz", + file_hash=( # noqa: E501 + "731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1" + ), + ) + with np.load(path, allow_pickle=True) as f: + x_train, y_train = f["x_train"], f["y_train"] + x_test, y_test = f["x_test"], f["y_test"] + + return (x_train, y_train), (x_test, y_test) diff --git a/keras/datasets/reuters.py b/keras/datasets/reuters.py index 8aec4906c532..38cc15e33d98 100644 --- a/keras/datasets/reuters.py +++ b/keras/datasets/reuters.py @@ -20,146 +20,230 @@ from keras.preprocessing.sequence import _remove_long_seq from keras.utils.data_utils import get_file + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.datasets.reuters.load_data') -def load_data(path='reuters.npz', - num_words=None, - skip_top=0, - maxlen=None, - test_split=0.2, - seed=113, - start_char=1, - oov_char=2, - index_from=3, - **kwargs): - """Loads the Reuters newswire classification dataset. - - This is a dataset of 11,228 newswires from Reuters, labeled over 46 topics. - - This was originally generated by parsing and preprocessing the classic - Reuters-21578 dataset, but the preprocessing code is no longer packaged - with Keras. See this - [github discussion](https://github.com/keras-team/keras/issues/12072) - for more info. - - Each newswire is encoded as a list of word indexes (integers). - For convenience, words are indexed by overall frequency in the dataset, - so that for instance the integer "3" encodes the 3rd most frequent word in - the data. This allows for quick filtering operations such as: - "only consider the top 10,000 most - common words, but eliminate the top 20 most common words". - - As a convention, "0" does not stand for a specific word, but instead is used - to encode any unknown word. 
- - Args: - path: where to cache the data (relative to `~/.keras/dataset`). - num_words: integer or None. Words are - ranked by how often they occur (in the training set) and only - the `num_words` most frequent words are kept. Any less frequent word - will appear as `oov_char` value in the sequence data. If None, - all words are kept. Defaults to None, so all words are kept. - skip_top: skip the top N most frequently occurring words - (which may not be informative). These words will appear as - `oov_char` value in the dataset. Defaults to 0, so no words are - skipped. - maxlen: int or None. Maximum sequence length. - Any longer sequence will be truncated. Defaults to None, which - means no truncation. - test_split: Float between 0 and 1. Fraction of the dataset to be used - as test data. Defaults to 0.2, meaning 20% of the dataset is used as - test data. - seed: int. Seed for reproducible data shuffling. - start_char: int. The start of a sequence will be marked with this - character. Defaults to 1 because 0 is usually the padding character. - oov_char: int. The out-of-vocabulary character. - Words that were cut out because of the `num_words` or - `skip_top` limits will be replaced with this character. - index_from: int. Index actual words with this index and higher. - **kwargs: Used for backwards compatibility. - - Returns: - Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. - - **x_train, x_test**: lists of sequences, which are lists of indexes - (integers). If the num_words argument was specific, the maximum - possible index value is `num_words - 1`. If the `maxlen` argument was - specified, the largest possible sequence length is `maxlen`. - - **y_train, y_test**: lists of integer labels (1 or 0). - - Note: The 'out of vocabulary' character is only used for - words that were present in the training set but are not included - because they're not making the `num_words` cut here. - Words that were not seen in the training set but are in the test set - have simply been skipped. 
- """ - # Legacy support - if 'nb_words' in kwargs: - logging.warning('The `nb_words` argument in `load_data` ' - 'has been renamed `num_words`.') - num_words = kwargs.pop('nb_words') - if kwargs: - raise TypeError(f'Unrecognized keyword arguments: {str(kwargs)}') - - origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/' - path = get_file( - path, - origin=origin_folder + 'reuters.npz', - file_hash= - 'd6586e694ee56d7a4e65172e12b3e987c03096cb01eab99753921ef915959916') - with np.load(path, allow_pickle=True) as f: # pylint: disable=unexpected-keyword-arg - xs, labels = f['x'], f['y'] - - rng = np.random.RandomState(seed) - indices = np.arange(len(xs)) - rng.shuffle(indices) - xs = xs[indices] - labels = labels[indices] - - if start_char is not None: - xs = [[start_char] + [w + index_from for w in x] for x in xs] - elif index_from: - xs = [[w + index_from for w in x] for x in xs] - - if maxlen: - xs, labels = _remove_long_seq(maxlen, xs, labels) - - if not num_words: - num_words = max(max(x) for x in xs) - - # by convention, use 2 as OOV word - # reserve 'index_from' (=3 by default) characters: - # 0 (padding), 1 (start), 2 (OOV) - if oov_char is not None: - xs = [[w if skip_top <= w < num_words else oov_char for w in x] for x in xs] - else: - xs = [[w for w in x if skip_top <= w < num_words] for x in xs] - - idx = int(len(xs) * (1 - test_split)) - x_train, y_train = np.array(xs[:idx], dtype='object'), np.array(labels[:idx]) - x_test, y_test = np.array(xs[idx:], dtype='object'), np.array(labels[idx:]) - - return (x_train, y_train), (x_test, y_test) - - -@keras_export('keras.datasets.reuters.get_word_index') -def get_word_index(path='reuters_word_index.json'): - """Retrieves a dict mapping words to their index in the Reuters dataset. - - Args: +@keras_export("keras.datasets.reuters.load_data") +def load_data( + path="reuters.npz", + num_words=None, + skip_top=0, + maxlen=None, + test_split=0.2, + seed=113, + start_char=1, + oov_char=2, + index_from=3, + **kwargs, +): + """Loads the Reuters newswire classification dataset. + + This is a dataset of 11,228 newswires from Reuters, labeled over 46 topics. + + This was originally generated by parsing and preprocessing the classic + Reuters-21578 dataset, but the preprocessing code is no longer packaged + with Keras. See this + [GitHub discussion](https://github.com/keras-team/keras/issues/12072) + for more info. + + Each newswire is encoded as a list of word indexes (integers). + For convenience, words are indexed by overall frequency in the dataset, + so that for instance the integer "3" encodes the 3rd most frequent word in + the data. This allows for quick filtering operations such as: + "only consider the top 10,000 most + common words, but eliminate the top 20 most common words". + + As a convention, "0" does not stand for a specific word, but instead is used + to encode any unknown word. + + Args: path: where to cache the data (relative to `~/.keras/dataset`). - - Returns: - The word index dictionary. Keys are word strings, values are their index. - """ - origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/' - path = get_file( - path, - origin=origin_folder + 'reuters_word_index.json', - file_hash='4d44cc38712099c9e383dc6e5f11a921') - with open(path) as f: - return json.load(f) + num_words: integer or None. Words are + ranked by how often they occur (in the training set) and only + the `num_words` most frequent words are kept. 
Any less frequent word
+        will appear as `oov_char` value in the sequence data. If None,
+        all words are kept. Defaults to `None`.
+      skip_top: skip the top N most frequently occurring words
+        (which may not be informative). These words will appear as
+        `oov_char` value in the dataset. 0 means no words are
+        skipped. Defaults to `0`.
+      maxlen: int or None. Maximum sequence length.
+        Any longer sequence will be truncated. None means no truncation.
+        Defaults to `None`.
+      test_split: Float between `0.` and `1.`. Fraction of the dataset to be
+        used as test data. `0.2` means that 20% of the dataset is used as
+        test data. Defaults to `0.2`.
+      seed: int. Seed for reproducible data shuffling.
+      start_char: int. The start of a sequence will be marked with this
+        character. 0 is usually the padding character. Defaults to `1`.
+      oov_char: int. The out-of-vocabulary character.
+        Words that were cut out because of the `num_words` or
+        `skip_top` limits will be replaced with this character.
+      index_from: int. Index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
+
+    Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+      **x_train, x_test**: lists of sequences, which are lists of indexes
+      (integers). If the num_words argument was specified, the maximum
+      possible index value is `num_words - 1`. If the `maxlen` argument was
+      specified, the largest possible sequence length is `maxlen`.
+
+      **y_train, y_test**: lists of integer labels (1 or 0).
+
+    Note: The 'out of vocabulary' character is only used for
+    words that were present in the training set but are not included
+    because they did not make the `num_words` cut here.
+    Words that were not seen in the training set but are in the test set
+    have simply been skipped.
+    """
+    # Legacy support
+    if "nb_words" in kwargs:
+        logging.warning(
+            "The `nb_words` argument in `load_data` "
+            "has been renamed `num_words`."
+        )
+        num_words = kwargs.pop("nb_words")
+    if kwargs:
+        raise TypeError(f"Unrecognized keyword arguments: {str(kwargs)}")
+
+    origin_folder = (
+        "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
+    )
+    path = get_file(
+        path,
+        origin=origin_folder + "reuters.npz",
+        file_hash=(  # noqa: E501
+            "d6586e694ee56d7a4e65172e12b3e987c03096cb01eab99753921ef915959916"
+        ),
+    )
+    with np.load(path, allow_pickle=True) as f:
+        xs, labels = f["x"], f["y"]
+
+    rng = np.random.RandomState(seed)
+    indices = np.arange(len(xs))
+    rng.shuffle(indices)
+    xs = xs[indices]
+    labels = labels[indices]
+
+    if start_char is not None:
+        xs = [[start_char] + [w + index_from for w in x] for x in xs]
+    elif index_from:
+        xs = [[w + index_from for w in x] for x in xs]
+
+    if maxlen:
+        xs, labels = _remove_long_seq(maxlen, xs, labels)
+
+    if not num_words:
+        num_words = max(max(x) for x in xs)
+
+    # by convention, use 2 as OOV word
+    # reserve 'index_from' (=3 by default) characters:
+    # 0 (padding), 1 (start), 2 (OOV)
+    if oov_char is not None:
+        xs = [
+            [w if skip_top <= w < num_words else oov_char for w in x]
+            for x in xs
+        ]
+    else:
+        xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
+
+    idx = int(len(xs) * (1 - test_split))
+    x_train, y_train = np.array(xs[:idx], dtype="object"), np.array(
+        labels[:idx]
+    )
+    x_test, y_test = np.array(xs[idx:], dtype="object"), np.array(labels[idx:])
+
+    return (x_train, y_train), (x_test, y_test)
+
+
+@keras_export("keras.datasets.reuters.get_word_index")
+def get_word_index(path="reuters_word_index.json"):
+    """Retrieves a dict mapping words to their index in the Reuters dataset.
+
+    Actual word indices start from 3, with 3 indices reserved for:
+    0 (padding), 1 (start), 2 (oov).
+
+    E.g. the word index of 'the' is 1, but in the actual training data, the
+    index of 'the' will be 1 + 3 = 4. Conversely, to translate word indices in
+    training data back to words using this mapping, subtract 3 from the
+    indices.
+
+    Args:
+        path: where to cache the data (relative to `~/.keras/dataset`).
+
+    Returns:
+        The word index dictionary. Keys are word strings, values are their
+        index.
+    """
+    origin_folder = (
+        "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
+    )
+    path = get_file(
+        path,
+        origin=origin_folder + "reuters_word_index.json",
+        file_hash="4d44cc38712099c9e383dc6e5f11a921",
+    )
+    with open(path) as f:
+        return json.load(f)
+
+
+@keras_export("keras.datasets.reuters.get_label_names")
+def get_label_names():
+    """Returns labels as a list of strings with indices matching training data.
+
+    Reference:
+
+    - [Reuters Dataset](https://martin-thoma.com/nlp-reuters/)
+    """
+    return (
+        "cocoa",
+        "grain",
+        "veg-oil",
+        "earn",
+        "acq",
+        "wheat",
+        "copper",
+        "housing",
+        "money-supply",
+        "coffee",
+        "sugar",
+        "trade",
+        "reserves",
+        "ship",
+        "cotton",
+        "carcass",
+        "crude",
+        "nat-gas",
+        "cpi",
+        "money-fx",
+        "interest",
+        "gnp",
+        "meal-feed",
+        "alum",
+        "oilseed",
+        "gold",
+        "tin",
+        "strategic-metal",
+        "livestock",
+        "retail",
+        "ipi",
+        "iron-steel",
+        "rubber",
+        "heat",
+        "jobs",
+        "lei",
+        "bop",
+        "zinc",
+        "orange",
+        "pet-chem",
+        "dlr",
+        "gas",
+        "silver",
+        "wpi",
+        "hog",
+        "lead",
+    )
diff --git a/keras/distribute/BUILD b/keras/distribute/BUILD
index 63b7fd485342..7c5d1c04714d 100644
--- a/keras/distribute/BUILD
+++ b/keras/distribute/BUILD
@@ -2,11 +2,13 @@
 # keras/distribute package is intended to serve as the centralized place for things
 # related to dist-strat used by Keras..
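Putting `get_word_index`, the `index_from` offset, and `get_label_names` above together, decoding a newswire looks roughly like this (a sketch that fetches the dataset over the network; the `[PAD]`/`[START]`/`[OOV]` marker names are illustrative, not part of the API):

```python
from keras.datasets import reuters

(x_train, y_train), _ = reuters.load_data()
word_index = reuters.get_word_index()

# Training indices are shifted by index_from (3), with 0/1/2 reserved.
inverted = {i + 3: word for word, i in word_index.items()}
inverted.update({0: "[PAD]", 1: "[START]", 2: "[OOV]"})

decoded = " ".join(inverted.get(i, "[OOV]") for i in x_train[0])
print(reuters.get_label_names()[y_train[0]], "->", decoded[:60])
```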
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "distribute_py_test") load("@org_keras//keras:keras.bzl", "cuda_py_test") load("@org_keras//keras:keras.bzl", "tf_py_test") # buildifier: disable=same-origin-load package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], # TODO(scottzhu): Remove this deps when distribute test are converted to integration test. default_visibility = [ "//keras:friends", @@ -26,7 +28,6 @@ py_library( srcs_version = "PY3", deps = [ ":distribute_coordinator_utils", - ":sidecar_evaluator", "//:expect_tensorflow_installed", "//keras:backend", "//keras:callbacks", @@ -63,8 +64,8 @@ py_library( srcs_version = "PY3", deps = [ "//:expect_tensorflow_installed", - "//keras/optimizers/optimizer_experimental:optimizer", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers", + "//keras/optimizers/legacy:optimizers", ], ) @@ -121,6 +122,19 @@ py_library( ], ) +cuda_py_test( + name = "model_checkpoint_test", + srcs = ["model_checkpoint_test.py"], + python_version = "PY3", + shard_count = 4, + deps = [ + ":multi_worker_testing_utils", + ":worker_training_state", + "//:expect_tensorflow_installed", + "//keras", + ], +) + cuda_py_test( name = "worker_training_state_test", srcs = ["worker_training_state_test.py"], @@ -144,7 +158,7 @@ distribute_py_test( ], deps = [ "//:expect_tensorflow_installed", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", ], ) @@ -174,12 +188,14 @@ cuda_py_test( distribute_py_test( name = "ctl_correctness_test", srcs = ["ctl_correctness_test.py"], + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "ctl_correctness_test.py", shard_count = 10, tags = [ "multi_and_single_gpu", "no_cuda_asan", # times out - "no_oss", # TODO(b/226938240): Timeout "nomultivm", # TODO(b/170502145) ], deps = [ @@ -246,7 +262,7 @@ distribute_py_test( ":strategy_combinations", "//:expect_absl_installed", "//:expect_tensorflow_installed", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", ], ) @@ -273,13 +289,16 @@ distribute_py_test( size = "medium", srcs = ["keras_premade_models_test.py"], disable_mlir_bridge = False, + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, full_precision = True, main = "keras_premade_models_test.py", shard_count = 8, tags = [ "multi_and_single_gpu", - "no_oss", # TODO(b/226938240): Reenable "nomultivm", # TODO(b/170502145) + "requires-mem:28g", # spawns multiple processes. ], deps = [ ":distribute_strategy_test_lib", @@ -415,7 +434,6 @@ distribute_py_test( shard_count = 16, tags = [ "multi_and_single_gpu", - "no_oss", # TODO(b/226938240): Reenable "no_rocm", # times out on ROCm "no_windows_gpu", "noasan", # TODO(b/337374867) fails with -fsanitize=null @@ -434,6 +452,9 @@ distribute_py_test( name = "keras_metrics_test", srcs = ["keras_metrics_test.py"], disable_mlir_bridge = False, + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "keras_metrics_test.py", shard_count = 8, tags = [ @@ -475,6 +496,7 @@ distribute_py_test( shard_count = 31, tags = [ "multi_and_single_gpu", + "no_oss", # TODO(b/277925387) "no_rocm", # Would require size large, but that effectively disables the test for presubmits. "no_windows_gpu", "noasan", # TODO(b/337374867) fails with -fsanitize=null @@ -536,6 +558,7 @@ distribute_py_test( tags = [ "multi_and_single_gpu", "no_cuda_asan", # times out + "no_pip", # The test imports distribute_strategy_test which is not in the pip package. 
"no_windows_gpu", "nomultivm", # TODO(b/170502145) "notsan", @@ -645,7 +668,7 @@ cuda_py_test( "//keras:callbacks", "//keras:engine", "//keras/optimizers", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/utils:kpl_test_utils", ], ) @@ -675,7 +698,7 @@ py_library( deps = [ "//:expect_tensorflow_installed", "//keras", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", ], ) @@ -766,6 +789,27 @@ distribute_py_test( ], ) +distribute_py_test( + name = "parameter_server_exact_evaluation_test", + srcs = ["parameter_server_exact_evaluation_test.py"], + python_version = "PY3", + shard_count = 29, + tags = [ + "multi_and_single_gpu", + "no_cuda_asan", # TODO(b/186361027) + "no_oss", # TODO(b/186248973) + "no_tfrt", + "nomultivm", # TODO(b/170502145) + "notpu", + ], + deps = [ + "//:expect_tensorflow_installed", + "//keras", + "//keras/testing_infra:test_utils", + "//keras/utils:dataset_creator", + ], +) + distribute_py_test( name = "dataset_creator_model_fit_test", srcs = ["dataset_creator_model_fit_test.py"], @@ -838,30 +882,6 @@ tf_py_test( ], ) -py_library( - name = "sidecar_evaluator", - srcs = ["sidecar_evaluator.py"], - srcs_version = "PY3", - deps = [ - "//:expect_tensorboard_installed", - "//:expect_tensorflow_installed", - ], -) - -tf_py_test( - name = "sidecar_evaluator_test", - size = "medium", - srcs = ["sidecar_evaluator_test.py"], - python_version = "PY3", - deps = [ - ":sidecar_evaluator", - "//:expect_absl_installed", - "//:expect_tensorflow_installed", - "//keras", - "//keras/testing_infra:test_utils", - ], -) - py_library( name = "strategy_combinations", srcs = ["strategy_combinations.py"], @@ -905,7 +925,7 @@ py_library( "//keras/engine", "//keras/layers/core", "//keras/layers/preprocessing:string_lookup", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/utils:dataset_creator", ], ) diff --git a/keras/distribute/__init__.py b/keras/distribute/__init__.py index 9348b6fe3d01..808055096522 100644 --- a/keras/distribute/__init__.py +++ b/keras/distribute/__init__.py @@ -13,6 +13,3 @@ # limitations under the License. 
# ============================================================================== """Keras' Distribution Strategy library.""" - -# pylint: disable=unused-import -from keras.distribute import sidecar_evaluator diff --git a/keras/distribute/checkpointing_test.py b/keras/distribute/checkpointing_test.py index b03ce0703e02..a3d586fbc749 100644 --- a/keras/distribute/checkpointing_test.py +++ b/keras/distribute/checkpointing_test.py @@ -16,113 +16,117 @@ import os import tensorflow.compat.v2 as tf - from absl.testing import parameterized -from keras.optimizers.optimizer_v2 import adam +from keras.optimizers.legacy import adam -class TrainingCheckpointTests(tf.test.TestCase, parameterized.TestCase): - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_one_cpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.tpu_strategy, - tf.__internal__.distribute.combinations.tpu_strategy_packed_var, - tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, - ], - mode=["eager"])) - def testCheckpointRestoreOptimizerSlots(self, distribution): - def state(): - with distribution.scope(): - v = tf.Variable(tf.random.normal([])) - opt = adam.Adam(0.001) - - @tf.function - def step(): - def f(): - with tf.GradientTape() as tape: - loss = v + v - gradients = tape.gradient(loss, [v]) - opt.apply_gradients(zip(gradients, [v])) - - distribution.run(f) - - return v, opt, step - - def checkpoint(): - v, opt, step = state() - step() - - # Save random weights into checkpoint. - checkpoint = tf.train.Checkpoint(v=v, opt=opt) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - with self.test_session(): - save_path = checkpoint.save(prefix) - return save_path - - save_path = checkpoint() - - v, opt, step = state() - checkpoint = tf.train.Checkpoint(v=v, opt=opt) - # Restore from the checkpoint inside a distribution.scope(). - with self.test_session(): - with distribution.scope(): - checkpoint.restore(save_path) - step() - slot = opt.get_slot(v, "m") - self.assertEqual(v._distribute_strategy, slot._distribute_strategy) - - v, opt, step = state() - checkpoint = tf.train.Checkpoint(v=v, opt=opt) - # Restore from the checkpoint outside a distribution.scope(). - with self.test_session(): - with self.assertRaisesRegex( - ValueError, "optimizer slot variable under the scope"): - checkpoint.restore(save_path) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_one_cpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.cloud_tpu_strategy, - tf.__internal__.distribute.combinations.tpu_strategy, - tf.__internal__.distribute.combinations.tpu_strategy_packed_var, - tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, - ], - mode=["eager"])) - def testCheckpointSaveRestoreIoDevice(self, distribution): - - def state(): - with distribution.scope(): - v = tf.Variable(tf.random.normal([])) - return v - - ckpt_options = tf.train.CheckpointOptions( - experimental_io_device="/job:localhost") - - def checkpoint(): - v = state() - # Save random weights into checkpoint. 
- checkpoint = tf.train.Checkpoint(v=v) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - with self.test_session(): - save_path = checkpoint.save(prefix, options=ckpt_options) - return save_path - - save_path = checkpoint() - - v = state() - checkpoint = tf.train.Checkpoint(v=v) - # Restore from the checkpoint inside a distribution.scope(). - # Check that restore works without error. - with self.test_session(): - with distribution.scope(): - checkpoint.restore(save_path, options=ckpt_options) +class TrainingCheckpointTests(tf.test.TestCase, parameterized.TestCase): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_one_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.tpu_strategy, # noqa: E501 + tf.__internal__.distribute.combinations.tpu_strategy_packed_var, # noqa: E501 + tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, # noqa: E501 + ], + mode=["eager"], + ) + ) + def testCheckpointRestoreOptimizerSlots(self, distribution): + def state(): + with distribution.scope(): + v = tf.Variable(tf.random.normal([])) + opt = adam.Adam(0.001) + + @tf.function + def step(): + def f(): + with tf.GradientTape() as tape: + loss = v + v + gradients = tape.gradient(loss, [v]) + opt.apply_gradients(zip(gradients, [v])) + + distribution.run(f) + + return v, opt, step + + def checkpoint(): + v, opt, step = state() + step() + + # Save random weights into checkpoint. + checkpoint = tf.train.Checkpoint(v=v, opt=opt) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + with self.test_session(): + save_path = checkpoint.save(prefix) + return save_path + + save_path = checkpoint() + + v, opt, step = state() + checkpoint = tf.train.Checkpoint(v=v, opt=opt) + # Restore from the checkpoint inside a distribution.scope(). + with self.test_session(): + with distribution.scope(): + checkpoint.restore(save_path) + step() + slot = opt.get_slot(v, "m") + self.assertEqual(v._distribute_strategy, slot._distribute_strategy) + + v, opt, step = state() + checkpoint = tf.train.Checkpoint(v=v, opt=opt) + # Restore from the checkpoint outside a distribution.scope(). + with self.test_session(): + with self.assertRaisesRegex( + ValueError, "optimizer slot variable under the scope" + ): + checkpoint.restore(save_path) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_one_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.cloud_tpu_strategy, # noqa: E501 + tf.__internal__.distribute.combinations.tpu_strategy, # noqa: E501 + tf.__internal__.distribute.combinations.tpu_strategy_packed_var, # noqa: E501 + tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, # noqa: E501 + ], + mode=["eager"], + ) + ) + def testCheckpointSaveRestoreIoDevice(self, distribution): + def state(): + with distribution.scope(): + v = tf.Variable(tf.random.normal([])) + return v + + ckpt_options = tf.train.CheckpointOptions( + experimental_io_device="/job:localhost" + ) + + def checkpoint(): + v = state() + # Save random weights into checkpoint. 
+ checkpoint = tf.train.Checkpoint(v=v) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + with self.test_session(): + save_path = checkpoint.save(prefix, options=ckpt_options) + return save_path + + save_path = checkpoint() + + v = state() + checkpoint = tf.train.Checkpoint(v=v) + # Restore from the checkpoint inside a distribution.scope(). + # Check that restore works without error. + with self.test_session(): + with distribution.scope(): + checkpoint.restore(save_path, options=ckpt_options) if __name__ == "__main__": - tf.compat.v1.enable_eager_execution() - tf.test.main() + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/distribute/collective_all_reduce_strategy_test.py b/keras/distribute/collective_all_reduce_strategy_test.py index da485d062f2b..42992cef34b9 100644 --- a/keras/distribute/collective_all_reduce_strategy_test.py +++ b/keras/distribute/collective_all_reduce_strategy_test.py @@ -15,58 +15,56 @@ """Tests for CollectiveAllReduceStrategy.""" import tensorflow.compat.v2 as tf - from absl.testing import parameterized + from keras import layers -from keras.testing_infra import test_utils from keras.engine import training -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_keras +from keras.optimizers.legacy import gradient_descent as gradient_descent_keras +from keras.testing_infra import test_utils @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( strategy=[ - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, # noqa: E501 ], - mode=['eager'])) + mode=["eager"], + ) +) class MultiWorkerMirroredStrategyTest(tf.test.TestCase, parameterized.TestCase): + def testFitWithoutStepsPerEpochPartialBatch(self, strategy): + def _model_fn(): + x = layers.Input(shape=(1,), name="input") + y = layers.Dense(1, name="dense")(x) + model = training.Model(x, y) + return model - def testFitWithoutStepsPerEpochPartialBatch(self, strategy): - - def _model_fn(): - x = layers.Input(shape=(1,), name='input') - y = layers.Dense(1, name='dense')(x) - model = training.Model(x, y) - return model - - def _get_dataset(): - inputs = tf.expand_dims( - tf.constant(range(10)), axis=1) - targets = tf.expand_dims( - tf.constant(range(10)), axis=1) - # Make global batch size 12 for 2 replicas and a non-repeated dataset - # with 10 elements so that we have partial batch - dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch( - 12, drop_remainder=False) - return dataset + def _get_dataset(): + inputs = tf.expand_dims(tf.constant(range(10)), axis=1) + targets = tf.expand_dims(tf.constant(range(10)), axis=1) + # Make global batch size 12 for 2 replicas and a non-repeated + # dataset with 10 elements so that we have partial batch + dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets) + ).batch(12, drop_remainder=False) + return dataset - with strategy.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = _model_fn() - loss = 'mse' - metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics) - dataset = _get_dataset() - kernel_before = model.get_weights()[0][0] - model.fit(dataset, epochs=10) - kernel_after = model.get_weights()[0][0] - 
self.assertNotEqual(kernel_before, kernel_after) - self.assertGreater(abs(kernel_before - 1), abs(kernel_after - 1)) + with strategy.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = _model_fn() + loss = "mse" + metrics = ["mae"] + model.compile(optimizer, loss, metrics=metrics) + dataset = _get_dataset() + kernel_before = model.get_weights()[0][0] + model.fit(dataset, epochs=10) + kernel_after = model.get_weights()[0][0] + self.assertNotEqual(kernel_before, kernel_after) + self.assertGreater(abs(kernel_before - 1), abs(kernel_after - 1)) -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/ctl_correctness_test.py b/keras/distribute/ctl_correctness_test.py index d5be37b534a5..48b15e8fb245 100644 --- a/keras/distribute/ctl_correctness_test.py +++ b/keras/distribute/ctl_correctness_test.py @@ -14,7 +14,10 @@ # ============================================================================== """Custom Training Loop correctness test.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras import optimizers from keras.applications import resnet_v2 @@ -22,9 +25,8 @@ from keras.distribute import optimizer_combinations from keras.distribute import strategy_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.ops.losses import losses_impl _NUM_SAMPLES = 66 @@ -35,382 +37,445 @@ class MaybeStrategyScope: - """Provides a context allowing no distribution strategy.""" + """Provides a context allowing no distribution strategy.""" - def __init__(self, strategy): - self._strategy = strategy - self._scope = None + def __init__(self, strategy): + self._strategy = strategy + self._scope = None - def __enter__(self): - if self._strategy: - self._scope = self._strategy.scope() - self._scope.__enter__() + def __enter__(self): + if self._strategy: + self._scope = self._strategy.scope() + self._scope.__enter__() - def __exit__(self, exc_type, value, traceback): - if self._strategy: - self._scope.__exit__(exc_type, value, traceback) - self._scope = None + def __exit__(self, exc_type, value, traceback): + if self._strategy: + self._scope.__exit__(exc_type, value, traceback) + self._scope = None def get_model(sync_batchnorm=False): - model = keras.Sequential() - model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,))) - model.add(keras.layers.Dense( - 10, activation='relu', - kernel_regularizer=keras.regularizers.l2(1e-4))) - if sync_batchnorm: - model.add(keras.layers.SyncBatchNormalization()) - else: - model.add(keras.layers.BatchNormalization()) - model.add(keras.layers.Dense(10, activation='relu')) - model.add(keras.layers.Dense(1)) - return model + model = keras.Sequential() + model.add(keras.layers.Dense(10, activation="relu", input_shape=(1,))) + model.add( + keras.layers.Dense( + 10, + activation="relu", + kernel_regularizer=keras.regularizers.l2(1e-4), + ) + ) + if sync_batchnorm: + model.add(keras.layers.BatchNormalization(synchronized=True)) + else: + model.add(keras.layers.BatchNormalization()) + model.add(keras.layers.Dense(10, activation="relu")) + model.add(keras.layers.Dense(1)) + return model def get_data(): - x_train = np.random.rand(_NUM_SAMPLES, 1) - y_train = 3 * x_train - x_train = x_train.astype('float32') - y_train = 
y_train.astype('float32') - train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - train_dataset = train_dataset.batch(_BATCH_SIZE) - return train_dataset + x_train = np.random.rand(_NUM_SAMPLES, 1) + y_train = 3 * x_train + x_train = x_train.astype("float32") + y_train = y_train.astype("float32") + train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + train_dataset = train_dataset.batch(_BATCH_SIZE) + return train_dataset def compute_loss(labels, logits, reg_losses): - pred_loss = keras.losses.mean_squared_error(labels, logits) - scaled_loss = tf.nn.compute_average_loss( - pred_loss, global_batch_size=_BATCH_SIZE) - l2_loss = tf.nn.scale_regularization_loss(reg_losses) - return scaled_loss + l2_loss - - -def iteration_inside_func(initial_weights, - dataset, - optimizer_fn, - iteration_type, - strategy=None, - sync_batchnorm=None, - jit_compile=False): - """Helper function to test iterating over data inside a tf.function.""" - with MaybeStrategyScope(strategy): - if strategy and sync_batchnorm: - model = get_model(sync_batchnorm) - else: - model = get_model() - model.set_weights(initial_weights) - optimizer = optimizer_fn() - - training_accuracy = keras.metrics.CategoricalAccuracy( - 'training_accuracy', dtype=tf.float32) - - @tf.function - def train_epoch(dist_input): - """Training StepFn.""" - - @tf.function(jit_compile=jit_compile) - def step_fn(inputs): - samples, labels = inputs - with tf.GradientTape() as tape: - logits = model(samples) - loss = compute_loss(labels, logits, model.losses) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(grads, model.trainable_variables)) - training_accuracy.update_state(labels, logits) - return loss - - total_loss = 0.0 - num_batches = 0 - if iteration_type == 'dataset': - for x in dist_input: - if strategy: - per_replica_losses = strategy.run(step_fn, args=(x,)) - total_loss += strategy.reduce(tf.distribute.ReduceOp.SUM, - per_replica_losses, - axis=None) - else: - total_loss += step_fn(x) - num_batches += 1 - else: - iterator = iter(dist_input) - for _ in range(_STEPS_PER_EPOCH): - if strategy: - per_replica_losses = strategy.run(step_fn, args=(next(iterator),)) - total_loss += strategy.reduce(tf.distribute.ReduceOp.SUM, - per_replica_losses, - axis=None) - else: - total_loss += step_fn(next(iterator)) - num_batches += 1 - - return total_loss / tf.cast(num_batches, dtype=tf.float32) - - if strategy: - dataset = strategy.experimental_distribute_dataset(dataset) - - for _ in range(_NUM_EPOCHS): - loss = train_epoch(dataset) - - return (model.get_weights(), - loss, - training_accuracy.result()) - - -def iteration_outside_func(initial_weights, - dataset, - optimizer_fn, - iteration_type, - strategy=None, - sync_batchnorm=None, - jit_compile=False): - """Helper function to test iterating over data outside a tf.function.""" - with MaybeStrategyScope(strategy): - model = get_model(sync_batchnorm=sync_batchnorm) - model.set_weights(initial_weights) - optimizer = optimizer_fn() - - training_accuracy = keras.metrics.CategoricalAccuracy( - 'training_accuracy', dtype=tf.float32) - - @tf.function - def train_step(dist_inputs): - """Training StepFn.""" - - @tf.function(jit_compile=jit_compile) - def step_fn(inputs): - samples, labels = inputs - with tf.GradientTape() as tape: - logits = model(samples) - loss = compute_loss(labels, logits, model.losses) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(grads, model.trainable_variables)) - 
training_accuracy.update_state(labels, logits) - return loss - - if strategy: - per_replica_losses = strategy.run(step_fn, args=(dist_inputs,)) - return strategy.reduce(tf.distribute.ReduceOp.SUM, - per_replica_losses, - axis=None) - else: - return step_fn(dist_inputs) - - if strategy: - dataset = strategy.experimental_distribute_dataset(dataset) - - total_loss = 0.0 - num_batches = 0 - if iteration_type == 'dataset': - for _ in range(_NUM_EPOCHS): - for x in dataset: - total_loss += train_step(x) - num_batches += 1 - else: - for _ in range(_NUM_EPOCHS): - iterator = iter(dataset) - for _ in range(_STEPS_PER_EPOCH): - total_loss += train_step(next(iterator)) - num_batches += 1 - - return (model.get_weights(), + pred_loss = keras.losses.mean_squared_error(labels, logits) + scaled_loss = tf.nn.compute_average_loss( + pred_loss, global_batch_size=_BATCH_SIZE + ) + l2_loss = tf.nn.scale_regularization_loss(reg_losses) + return scaled_loss + l2_loss + + +def iteration_inside_func( + initial_weights, + dataset, + optimizer_fn, + iteration_type, + strategy=None, + sync_batchnorm=None, + jit_compile=False, +): + """Helper function to test iterating over data inside a tf.function.""" + with MaybeStrategyScope(strategy): + if strategy and sync_batchnorm: + model = get_model(sync_batchnorm) + else: + model = get_model() + model.set_weights(initial_weights) + optimizer = optimizer_fn() + + training_accuracy = keras.metrics.CategoricalAccuracy( + "training_accuracy", dtype=tf.float32 + ) + + @tf.function + def train_epoch(dist_input): + """Training StepFn.""" + + @tf.function(jit_compile=jit_compile) + def step_fn(inputs): + samples, labels = inputs + with tf.GradientTape() as tape: + logits = model(samples) + loss = compute_loss(labels, logits, model.losses) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(grads, model.trainable_variables)) + training_accuracy.update_state(labels, logits) + return loss + + total_loss = 0.0 + num_batches = 0 + if iteration_type == "dataset": + for x in dist_input: + if strategy: + per_replica_losses = strategy.run(step_fn, args=(x,)) + total_loss += strategy.reduce( + tf.distribute.ReduceOp.SUM, + per_replica_losses, + axis=None, + ) + else: + total_loss += step_fn(x) + num_batches += 1 + else: + iterator = iter(dist_input) + for _ in range(_STEPS_PER_EPOCH): + if strategy: + per_replica_losses = strategy.run( + step_fn, args=(next(iterator),) + ) + total_loss += strategy.reduce( + tf.distribute.ReduceOp.SUM, + per_replica_losses, + axis=None, + ) + else: + total_loss += step_fn(next(iterator)) + num_batches += 1 + + return total_loss / tf.cast(num_batches, dtype=tf.float32) + + if strategy: + dataset = strategy.experimental_distribute_dataset(dataset) + + for _ in range(_NUM_EPOCHS): + loss = train_epoch(dataset) + + return (model.get_weights(), loss, training_accuracy.result()) + + +def iteration_outside_func( + initial_weights, + dataset, + optimizer_fn, + iteration_type, + strategy=None, + sync_batchnorm=None, + jit_compile=False, +): + """Helper function to test iterating over data outside a tf.function.""" + with MaybeStrategyScope(strategy): + model = get_model(sync_batchnorm=sync_batchnorm) + model.set_weights(initial_weights) + optimizer = optimizer_fn() + + training_accuracy = keras.metrics.CategoricalAccuracy( + "training_accuracy", dtype=tf.float32 + ) + + @tf.function + def train_step(dist_inputs): + """Training StepFn.""" + + @tf.function(jit_compile=jit_compile) + def step_fn(inputs): + samples, labels = inputs + 
with tf.GradientTape() as tape: + logits = model(samples) + loss = compute_loss(labels, logits, model.losses) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(grads, model.trainable_variables)) + training_accuracy.update_state(labels, logits) + return loss + + if strategy: + per_replica_losses = strategy.run(step_fn, args=(dist_inputs,)) + return strategy.reduce( + tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None + ) + else: + return step_fn(dist_inputs) + + if strategy: + dataset = strategy.experimental_distribute_dataset(dataset) + + total_loss = 0.0 + num_batches = 0 + if iteration_type == "dataset": + for _ in range(_NUM_EPOCHS): + for x in dataset: + total_loss += train_step(x) + num_batches += 1 + else: + for _ in range(_NUM_EPOCHS): + iterator = iter(dataset) + for _ in range(_STEPS_PER_EPOCH): + total_loss += train_step(next(iterator)) + num_batches += 1 + + return ( + model.get_weights(), total_loss / tf.cast(num_batches, dtype=tf.float32), - training_accuracy.result()) + training_accuracy.result(), + ) @test_utils.run_v2_only -class TestDistributionStrategyDnnCorrectness(tf.test.TestCase, - parameterized.TestCase): - """Test custom training loop correctness with a simple DNN model.""" - - def setUp(self): - super().setUp() - np.random.seed(_RANDOM_SEED) - tf.compat.v1.set_random_seed(_RANDOM_SEED) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.all_strategies, - optimizer_fn=optimizer_combinations.optimizers_v2, - mode=['eager'], - iteration_type=['iterator', 'dataset'], - inside_func=[False, True], - sync_batchnorm=[True, False], - jit_compile=[False]) + tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.multiworker_strategies, - optimizer_fn=[ - optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, - optimizer_combinations.adagrad_optimizer_keras_v2_fn, - optimizer_combinations.adam_experimental_fn, - ], - mode=['eager'], - iteration_type=['iterator', 'dataset'], - inside_func=[False, True], - sync_batchnorm=[True, False], - jit_compile=[False]) + - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.one_device_strategy_gpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, - ], - optimizer_fn=[ - optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, - optimizer_combinations.adagrad_optimizer_keras_v2_fn - ], - mode=['eager'], - iteration_type=['iterator', 'dataset'], - inside_func=[False, True], - sync_batchnorm=[True, False], - jit_compile=[True])) - def test_dnn_correctness_minus_tpus(self, distribution, optimizer_fn, - iteration_type, inside_func, - sync_batchnorm, jit_compile): - # TODO(anjs): Identify why this particular V1 optimizer needs a higher tol. 
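[Aside: stripped of the test plumbing, `iteration_inside_func` and `iteration_outside_func` above both follow the canonical tf.distribute custom-training-loop shape: create state under `strategy.scope()`, run a per-replica step with `strategy.run`, and combine per-replica losses with `strategy.reduce`. A minimal sketch of that pattern using only the public TF API; the model, data, and batch size here are illustrative, not from the patch:]

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
    optimizer = tf.keras.optimizers.SGD(0.01)

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal([64, 1]), tf.random.normal([64, 1]))
).batch(8)
dist_dataset = strategy.experimental_distribute_dataset(dataset)

@tf.function
def train_step(dist_inputs):
    def step_fn(inputs):
        x, y = inputs
        with tf.GradientTape() as tape:
            # Scale the per-example loss by the global batch size so the
            # cross-replica SUM below yields the true mean loss.
            per_example = tf.keras.losses.mean_squared_error(y, model(x))
            loss = tf.nn.compute_average_loss(per_example, global_batch_size=8)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss

    per_replica_losses = strategy.run(step_fn, args=(dist_inputs,))
    return strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None
    )

for batch in dist_dataset:
    train_step(batch)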
- if 'FtrlV1' in optimizer_fn._name and 'TPU' in type(distribution).__name__: - self.skipTest('Reduced tolerance of the order of 1e-1 required.') - self.dnn_correctness(distribution, optimizer_fn, iteration_type, - inside_func, sync_batchnorm, jit_compile) - - def dnn_correctness(self, - distribution, - optimizer_fn, - iteration_type, - inside_func, - sync_batchnorm=None, - jit_compile=False): - model = get_model(sync_batchnorm) - initial_weights = model.get_weights() - dataset = get_data() - if inside_func: - iteration_func = iteration_inside_func - else: - iteration_func = iteration_outside_func - - wts_with_ds, loss_with_ds, acc_with_ds = iteration_func( - initial_weights, - dataset, +class TestDistributionStrategyDnnCorrectness( + tf.test.TestCase, parameterized.TestCase +): + """Test custom training loop correctness with a simple DNN model.""" + + def setUp(self): + super().setUp() + np.random.seed(_RANDOM_SEED) + tf.compat.v1.set_random_seed(_RANDOM_SEED) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.all_strategies, + optimizer_fn=optimizer_combinations.optimizers_v2, + mode=["eager"], + iteration_type=["iterator", "dataset"], + inside_func=[False, True], + sync_batchnorm=[True, False], + jit_compile=[False], + ) + + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.multiworker_strategies, + optimizer_fn=[ + optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, + optimizer_combinations.adagrad_optimizer_keras_v2_fn, + optimizer_combinations.adam_experimental_fn, + ], + mode=["eager"], + iteration_type=["iterator", "dataset"], + inside_func=[False, True], + sync_batchnorm=[True, False], + jit_compile=[False], + ) + + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.one_device_strategy_gpu, + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + ], + optimizer_fn=[ + optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, + optimizer_combinations.adagrad_optimizer_keras_v2_fn, + ], + mode=["eager"], + iteration_type=["iterator", "dataset"], + inside_func=[False, True], + sync_batchnorm=[True, False], + jit_compile=[True], + ) + ) + def test_dnn_correctness_minus_tpus( + self, + distribution, optimizer_fn, iteration_type, - strategy=distribution, - sync_batchnorm=sync_batchnorm, - jit_compile=jit_compile) - wts, loss, acc = iteration_func( - initial_weights, - dataset, + inside_func, + sync_batchnorm, + jit_compile, + ): + # TODO(anjs): Identify why this particular V1 optimizer needs a higher + # tol. 
+ if ( + "FtrlV1" in optimizer_fn._name + and "TPU" in type(distribution).__name__ + ): + self.skipTest("Reduced tolerance of the order of 1e-1 required.") + self.dnn_correctness( + distribution, + optimizer_fn, + iteration_type, + inside_func, + sync_batchnorm, + jit_compile, + ) + + def dnn_correctness( + self, + distribution, optimizer_fn, iteration_type, - sync_batchnorm=sync_batchnorm, - jit_compile=False) - - self.assertAllClose(wts, wts_with_ds, atol=1e-3, rtol=1e-3) - self.assertAllClose(loss, loss_with_ds, atol=1e-3, rtol=1e-3) - self.assertAllClose(acc, acc_with_ds, atol=1e-3, rtol=1e-3) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations - .mirrored_strategy_with_two_gpus, - ], - mode=['eager'], - )) - def test_fused_batch_norm_uneven_batch(self, distribution): - """Test that fused batch norm works when the last device may get empty data. - - Adapted from https://www.tensorflow.org/tutorials/distribute/custom_training - but using ResNet, which uses fused batchnorm, as the model. - - Arguments: - distribution: distribute test configuration - """ - (train_images, train_labels), _ = fashion_mnist.load_data() - # add channel dimension to make 2D data into 3D, since some ops of the model - # require it. - train_images = train_images[..., None] - train_images = train_images / np.float32(255) - - # Padding images because ResNet requires a minimal shape of (32, 32) - padded_train_images = np.concatenate([ - np.zeros((len(train_images), 2, 28, 1)), - train_images, - np.zeros((len(train_images), 2, 28, 1)) - ], axis=1) - padded_train_images = np.concatenate([ - np.zeros((len(train_images), 32, 2, 1)), - padded_train_images, - np.zeros((len(train_images), 32, 2, 1)) - ], axis=2) - - buffer_size = len(train_images) - global_batch_size = distribution.num_replicas_in_sync - num_samples = global_batch_size - 1 - - epochs = 2 - - # Keep only the first images, so that the last GPU receives an empty batch - padded_train_images = padded_train_images[:num_samples] - train_labels = train_labels[:num_samples] - - train_dataset = tf.data.Dataset.from_tensor_slices( - (padded_train_images, - train_labels)).shuffle(buffer_size).batch(global_batch_size) - train_dist_dataset = distribution.experimental_distribute_dataset( - train_dataset) - - def create_model(): - inputs = keras.Input((32, 32, 1)) - preprocessed = keras.layers.Conv2D(3, (1, 1))( - inputs) # ResNet requires 3 channels - features = resnet_v2.ResNet50V2( - include_top=False, - input_tensor=preprocessed, - pooling='avg', - weights=None).output - return keras.Model(inputs, features) - - with distribution.scope(): - # Set reduction to `none` so we can do the reduction afterwards and divide - # by global batch size. 
- loss_object = keras.losses.SparseCategoricalCrossentropy( - from_logits=True, - reduction=losses_impl.Reduction.NONE) - - def compute_resnet_loss(labels, predictions): - per_example_loss = loss_object(labels, predictions) - return tf.nn.compute_average_loss( - per_example_loss, global_batch_size=global_batch_size) - - model = create_model() - - optimizer = optimizers.adam_v2.Adam() - - def train_step(inputs): - images, labels = inputs - - with tf.GradientTape() as tape: - predictions = model(images, training=True) - loss = compute_resnet_loss(labels, predictions) - - gradients = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - return loss - - @tf.function - def distributed_train_step(dataset_inputs): - per_replica_losses = distribution.run(train_step, args=(dataset_inputs,)) - return distribution.reduce( - tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) - - for epoch in range(epochs): - # Train loop - total_loss = 0.0 - num_batches = 0 - for x in train_dist_dataset: - total_loss += distributed_train_step(x) - num_batches += 1 - train_loss = total_loss / num_batches - - print(f'Epoch {epoch+1}, Loss: {train_loss}') - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + inside_func, + sync_batchnorm=None, + jit_compile=False, + ): + model = get_model(sync_batchnorm) + initial_weights = model.get_weights() + dataset = get_data() + if inside_func: + iteration_func = iteration_inside_func + else: + iteration_func = iteration_outside_func + + wts_with_ds, loss_with_ds, acc_with_ds = iteration_func( + initial_weights, + dataset, + optimizer_fn, + iteration_type, + strategy=distribution, + sync_batchnorm=sync_batchnorm, + jit_compile=jit_compile, + ) + wts, loss, acc = iteration_func( + initial_weights, + dataset, + optimizer_fn, + iteration_type, + sync_batchnorm=sync_batchnorm, + jit_compile=False, + ) + + self.assertAllClose(wts, wts_with_ds, atol=1e-3, rtol=1e-3) + self.assertAllClose(loss, loss_with_ds, atol=1e-3, rtol=1e-3) + self.assertAllClose(acc, acc_with_ds, atol=1e-3, rtol=1e-3) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + ], + mode=["eager"], + ) + ) + def test_fused_batch_norm_uneven_batch(self, distribution): + """Test that fused BN works when the last device gets empty data. + + Adapted from + https://www.tensorflow.org/tutorials/distribute/custom_training + but using ResNet, which uses fused batchnorm, as the model. + + Arguments: + distribution: distribute test configuration + """ + self.skipTest("TODO(b/234354008): Requires fetching data from network.") + (train_images, train_labels), _ = fashion_mnist.load_data() + # add channel dimension to make 2D data into 3D, since some ops of the + # model require it. 
+ train_images = train_images[..., None] + train_images = train_images / np.float32(255) + + # Padding images because ResNet requires a minimal shape of (32, 32) + padded_train_images = np.concatenate( + [ + np.zeros((len(train_images), 2, 28, 1)), + train_images, + np.zeros((len(train_images), 2, 28, 1)), + ], + axis=1, + ) + padded_train_images = np.concatenate( + [ + np.zeros((len(train_images), 32, 2, 1)), + padded_train_images, + np.zeros((len(train_images), 32, 2, 1)), + ], + axis=2, + ) + + buffer_size = len(train_images) + global_batch_size = distribution.num_replicas_in_sync + num_samples = global_batch_size - 1 + + epochs = 2 + + # Keep only the first images, so that the last GPU receives an empty + # batch + padded_train_images = padded_train_images[:num_samples] + train_labels = train_labels[:num_samples] + + train_dataset = ( + tf.data.Dataset.from_tensor_slices( + (padded_train_images, train_labels) + ) + .shuffle(buffer_size) + .batch(global_batch_size) + ) + train_dist_dataset = distribution.experimental_distribute_dataset( + train_dataset + ) + + def create_model(): + inputs = keras.Input((32, 32, 1)) + preprocessed = keras.layers.Conv2D(3, (1, 1))( + inputs + ) # ResNet requires 3 channels + features = resnet_v2.ResNet50V2( + include_top=False, + input_tensor=preprocessed, + pooling="avg", + weights=None, + ).output + return keras.Model(inputs, features) + + with distribution.scope(): + # Set reduction to `none` so we can do the reduction afterwards and + # divide by global batch size. + loss_object = keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=losses_impl.Reduction.NONE + ) + + def compute_resnet_loss(labels, predictions): + per_example_loss = loss_object(labels, predictions) + return tf.nn.compute_average_loss( + per_example_loss, global_batch_size=global_batch_size + ) + + model = create_model() + + optimizer = optimizers.adam_legacy.Adam() + + def train_step(inputs): + images, labels = inputs + + with tf.GradientTape() as tape: + predictions = model(images, training=True) + loss = compute_resnet_loss(labels, predictions) + + gradients = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(gradients, model.trainable_variables)) + return loss + + @tf.function + def distributed_train_step(dataset_inputs): + per_replica_losses = distribution.run( + train_step, args=(dataset_inputs,) + ) + return distribution.reduce( + tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None + ) + + for epoch in range(epochs): + # Train loop + total_loss = 0.0 + num_batches = 0 + for x in train_dist_dataset: + total_loss += distributed_train_step(x) + num_batches += 1 + train_loss = total_loss / num_batches + + print(f"Epoch {epoch+1}, Loss: {train_loss}") + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/custom_training_loop_metrics_test.py b/keras/distribute/custom_training_loop_metrics_test.py index c7957dd87c02..a48a7d6b1b8f 100644 --- a/keras/distribute/custom_training_loop_metrics_test.py +++ b/keras/distribute/custom_training_loop_metrics_test.py @@ -14,110 +14,120 @@ # ============================================================================== """Tests for custom training loops.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np -from tensorflow.python.framework import test_util as tf_test_utils + from keras import metrics from keras.distribute import strategy_combinations +# isort: 
off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) -class KerasMetricsTest(tf.test.TestCase, parameterized.TestCase): - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.all_strategies + - strategy_combinations.multiworker_strategies, - mode=["eager"] - )) - def test_multiple_keras_metrics_experimental_run(self, distribution): - with distribution.scope(): - loss_metric = metrics.Mean("loss", dtype=np.float32) - loss_metric_2 = metrics.Mean("loss_2", dtype=np.float32) - - @tf.function - def train_step(): - def step_fn(): - loss = tf.constant(5.0, dtype=np.float32) - loss_metric.update_state(loss) - loss_metric_2.update_state(loss) - - distribution.run(step_fn) - - train_step() - self.assertEqual(loss_metric.result().numpy(), - loss_metric_2.result().numpy()) - self.assertEqual(loss_metric.result().numpy(), 5.0) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.all_strategies+ - strategy_combinations.multiworker_strategies, - mode=["eager"] - )) - def test_update_keras_metric_declared_in_strategy_scope(self, distribution): - with distribution.scope(): - metric = metrics.Mean("test_metric", dtype=np.float32) - - dataset = tf.data.Dataset.range(10).batch(2) - dataset = distribution.experimental_distribute_dataset(dataset) - - @tf.function - def step_fn(i): - metric.update_state(i) - - for i in dataset: - distribution.run(step_fn, args=(i,)) - - # This should be the mean of integers 0-9 which has a sum of 45 and a count - # of 10 resulting in mean of 4.5. - self.assertEqual(metric.result().numpy(), 4.5) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.all_strategies, - mode=["eager"] - )) - def test_update_keras_metric_outside_strategy_scope_cross_replica( - self, distribution): - metric = metrics.Mean("test_metric", dtype=np.float32) - - with distribution.scope(): - for i in range(10): - metric.update_state(i) - - # This should be the mean of integers 0-9 which has a sum of 45 and a count - # of 10 resulting in mean of 4.5. - self.assertEqual(metric.result().numpy(), 4.5) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.all_strategies, mode=["eager"])) - @tf_test_utils.disable_mlir_bridge( - "TODO(b/168036682): Support dynamic padder") - def test_update_keras_metrics_dynamic_shape(self, distribution): - with distribution.scope(): - metric = metrics.Mean("test_metric", dtype=np.float32) - - dataset = tf.data.Dataset.range(10).batch(2, drop_remainder=False) - - @tf.function - def train_fn(dataset): - weights = tf.constant([0.1, 0.1]) - - def step_fn(i): - metric.update_state(i, weights) - - for i in dataset: - distribution.run(step_fn, args=(i,)) - - train_fn(dataset) - - # This should be the mean of integers 0-9 which has a sum of 45 and a count - # of 10 resulting in mean of 4.5. 
- self.assertEqual(metric.result().numpy(), 4.5) +class KerasMetricsTest(tf.test.TestCase, parameterized.TestCase): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.all_strategies + + strategy_combinations.multiworker_strategies, + mode=["eager"], + ) + ) + def test_multiple_keras_metrics_experimental_run(self, distribution): + with distribution.scope(): + loss_metric = metrics.Mean("loss", dtype=np.float32) + loss_metric_2 = metrics.Mean("loss_2", dtype=np.float32) + + @tf.function + def train_step(): + def step_fn(): + loss = tf.constant(5.0, dtype=np.float32) + loss_metric.update_state(loss) + loss_metric_2.update_state(loss) + + distribution.run(step_fn) + + train_step() + self.assertEqual( + loss_metric.result().numpy(), loss_metric_2.result().numpy() + ) + self.assertEqual(loss_metric.result().numpy(), 5.0) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.all_strategies + + strategy_combinations.multiworker_strategies, + mode=["eager"], + ) + ) + def test_update_keras_metric_declared_in_strategy_scope(self, distribution): + with distribution.scope(): + metric = metrics.Mean("test_metric", dtype=np.float32) + + dataset = tf.data.Dataset.range(10).batch(2) + dataset = distribution.experimental_distribute_dataset(dataset) + + @tf.function + def step_fn(i): + metric.update_state(i) + + for i in dataset: + distribution.run(step_fn, args=(i,)) + + # This should be the mean of integers 0-9 which has a sum of 45 and a + # count of 10 resulting in mean of 4.5. + self.assertEqual(metric.result().numpy(), 4.5) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.all_strategies, mode=["eager"] + ) + ) + def test_update_keras_metric_outside_strategy_scope_cross_replica( + self, distribution + ): + metric = metrics.Mean("test_metric", dtype=np.float32) + + with distribution.scope(): + for i in range(10): + metric.update_state(i) + + # This should be the mean of integers 0-9 which has a sum of 45 and a + # count of 10 resulting in mean of 4.5. + self.assertEqual(metric.result().numpy(), 4.5) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.all_strategies, mode=["eager"] + ) + ) + @tf_test_utils.disable_mlir_bridge( + "TODO(b/168036682): Support dynamic padder" + ) + def test_update_keras_metrics_dynamic_shape(self, distribution): + with distribution.scope(): + metric = metrics.Mean("test_metric", dtype=np.float32) + + dataset = tf.data.Dataset.range(10).batch(2, drop_remainder=False) + + @tf.function + def train_fn(dataset): + weights = tf.constant([0.1, 0.1]) + + def step_fn(i): + metric.update_state(i, weights) + + for i in dataset: + distribution.run(step_fn, args=(i,)) + + train_fn(dataset) + + # This should be the mean of integers 0-9 which has a sum of 45 and a + # count of 10 resulting in mean of 4.5. 
+ self.assertEqual(metric.result().numpy(), 4.5) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/custom_training_loop_models_test.py b/keras/distribute/custom_training_loop_models_test.py index 7e6990608eb7..cdcd869b9fab 100644 --- a/keras/distribute/custom_training_loop_models_test.py +++ b/keras/distribute/custom_training_loop_models_test.py @@ -14,526 +14,558 @@ # ============================================================================== """Tests for custom training loops.""" -import tensorflow.compat.v2 as tf - import os -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras from keras.distribute import strategy_combinations from keras.layers import core -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent class CustomModel(tf.Module): + def __init__(self, name=None): + super().__init__(name=name) + with self.name_scope: + self._layers = [ + keras.layers.Dense(4, name="dense"), + ] - def __init__(self, name=None): - super().__init__(name=name) - with self.name_scope: - self._layers = [ - keras.layers.Dense(4, name="dense"), - ] - - @tf.Module.with_name_scope - def __call__(self, x): - for layer in self._layers: - x = layer(x) - return x + @tf.Module.with_name_scope + def __call__(self, x): + for layer in self._layers: + x = layer(x) + return x @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - distribution=(strategy_combinations.all_strategies + - strategy_combinations.multiworker_strategies), - mode=["eager"] - ) + distribution=( + strategy_combinations.all_strategies + + strategy_combinations.multiworker_strategies + ), + mode=["eager"], ) +) class KerasModelsTest(tf.test.TestCase, parameterized.TestCase): + def test_single_keras_layer_run(self, distribution): + dataset = _get_dataset() + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - def test_single_keras_layer_run(self, distribution): - dataset = _get_dataset() - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - model = keras.layers.Dense(4, name="dense") - - @tf.function - def train_step(iterator): - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - return grads - - outputs = distribution.run( - step_fn, args=(next(iterator),)) - return tf.nest.map_structure(distribution.experimental_local_results, - outputs) - - train_step(input_iterator) - - def test_keras_model_optimizer_run(self, distribution): - dataset = _get_dataset() - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - model = _get_model() - optimizer = keras.optimizers.optimizer_v2.rmsprop.RMSprop() - - @tf.function - def train_step(replicated_inputs): - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - return loss - - outputs = distribution.run(step_fn, args=(replicated_inputs,)) - return 
tf.nest.map_structure(distribution.experimental_local_results, - outputs) - - for x in input_iterator: - train_step(x) - - def test_keras_subclass_model_optimizer_run(self, distribution): - def get_subclass_model(): - - class KerasSubclassModel(keras.Model): - - def __init__(self): - super().__init__() - self.l = keras.layers.Dense(4, name="dense") - - def call(self, x): - return self.l(x) - - return KerasSubclassModel() - dataset = _get_dataset() - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - model = get_subclass_model() - optimizer = keras.optimizers.optimizer_v2.rmsprop.RMSprop() - - @tf.function - def train_step(iterator): - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - return loss - - outputs = distribution.run(step_fn, args=(next(iterator),)) - return tf.nest.map_structure(distribution.experimental_local_results, - outputs) - - train_step(input_iterator) - - def test_keras_model_optimizer_run_loop(self, distribution): - dataset = _get_dataset() - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - model = _get_model() - optimizer = keras.optimizers.optimizer_v2.rmsprop.RMSprop() - - @tf.function - def train_step(iterator): - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - return loss - - for _ in tf.range(4): - distribution.run(step_fn, args=(next(iterator),)) - - train_step(input_iterator) - - def test_batch_norm_with_dynamic_batch(self, distribution): - inputs = np.zeros((10, 3, 3, 3), dtype=np.float32) - targets = np.zeros((10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat() - dataset = dataset.batch(10) - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - x = keras.layers.Input(shape=(3, 3, 3), name="input") - y = keras.layers.BatchNormalization(fused=True, name="bn")(x) - y = keras.layers.Flatten()(y) - y = keras.layers.Dense(4, name="dense")(y) - model = keras.Model(x, y) - optimizer = keras.optimizers.optimizer_v2.rmsprop.RMSprop() - - @tf.function - def train_step(iterator): - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images, training=True) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - return loss - - distribution.run(step_fn, args=(next(iterator),)) - - train_step(input_iterator) - - def test_lstm(self, distribution): - - batch_size = 32 - - def create_lstm_model(): - model = keras.models.Sequential() - # We only have LSTM variables so we can detect no gradient issues more - # easily. 
- model.add( - keras.layers.LSTM(1, return_sequences=False, input_shape=(10, 1))) - return model - - def create_lstm_data(): - seq_length = 10 - - x_train = np.random.rand(batch_size, seq_length, 1).astype("float32") - y_train = np.random.rand(batch_size, 1).astype("float32") - return x_train, y_train - - x, y = create_lstm_data() - dataset = tf.data.Dataset.from_tensor_slices((x, y)) - dataset = dataset.batch(batch_size) - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - model = create_lstm_model() - optimizer = keras.optimizers.optimizer_v2.gradient_descent.SGD() - - @tf.function - def train_step(input_iterator): - - def step_fn(inputs): - inps, targ = inputs - with tf.GradientTape() as tape: - output = model(inps) - loss = tf.reduce_mean( - keras.losses.binary_crossentropy( - y_true=targ, y_pred=output, from_logits=False)) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - return loss - - outputs = distribution.run( - step_fn, args=(next(input_iterator),)) - return distribution.experimental_local_results(outputs) - - train_step(input_iterator) - - def test_nested_tf_functions(self, distribution): - # The test builds two computations with keras layers, one with nested - # tf.function, and the other without nested tf.function. We run these - # computations independently on the model with same weights, and make sure - # the variables are still the same after one training step. - - inputs = np.random.random((10, 3)).astype(np.float32) - targets = np.ones((10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).repeat() - dataset = dataset.batch(10) - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - def get_model(): - x = keras.layers.Input(shape=(3,), name="input") - y = keras.layers.Dense(4, name="dense")(x) - model = keras.Model(x, y) - return model - - with distribution.scope(): - model = get_model() - optimizer = keras.optimizers.optimizer_v2.gradient_descent.SGD( - 0.1, momentum=0.01) - weights_file = os.path.join(self.get_temp_dir(), ".h5") - model.save_weights(weights_file) - model2 = get_model() - model2.load_weights(weights_file) - - # Make sure model and model2 variables are in sync when initialized. 
- for model_v, model2_v in zip(model.variables, model2.variables): - self.assertAllClose(model_v.numpy(), model2_v.numpy()) - - def compute_loss(images, targets): - outputs = model(images) - return keras.losses.mean_squared_error(targets, outputs) - - @tf.function - def train_step_without_nested_tf_function(inputs): - - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - loss = compute_loss(images, targets) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - - distribution.run(step_fn, args=(inputs,)) - - @tf.function - def compute_loss2(images, targets): - outputs = model2(images) - return keras.losses.mean_squared_error(targets, outputs) - - @tf.function - def train_step_with_nested_tf_function(inputs): - - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - loss = compute_loss2(images, targets) - grads = tape.gradient(loss, model2.variables) - optimizer.apply_gradients(zip(grads, model2.variables)) - - distribution.run(step_fn, args=(inputs,)) - - inputs = next(input_iterator) - - train_step_without_nested_tf_function(inputs) - train_step_with_nested_tf_function(inputs) - - # Make sure model and model2 variables are still in sync. - for model_v, model2_v in zip(model.variables, model2.variables): - self.assertAllClose(model_v.numpy(), model2_v.numpy()) - - def test_nested_tf_functions_with_control_flow(self, distribution): - inputs = np.random.random((10, 3)).astype(np.float32) - targets = np.ones((10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).repeat() - dataset = dataset.batch(10) - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - def get_model(): - x = keras.layers.Input(shape=(3,), name="input") - y = keras.layers.Dense(4, name="dense")(x) - model = keras.Model(x, y) - return model - - with distribution.scope(): - model = get_model() - optimizer = keras.optimizers.optimizer_v2.gradient_descent.SGD( - 0.1, momentum=0.01) - - @tf.function - def train_step(iterator): - - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - - distribution.run(step_fn, args=(next(iterator),)) - - @tf.function - def train_steps(iterator): - for _ in tf.range(10): - train_step(iterator) - - train_steps(input_iterator) - - def test_nested_tf_functions_with_tf_function_passing_to_strategy_run( - self, distribution): - self.skipTest("b/190608193") - - inputs = np.random.random((10, 3)).astype(np.float32) - targets = np.ones((10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).repeat() - dataset = dataset.batch(10) - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - def get_model(): - x = keras.layers.Input(shape=(3,), name="input") - y = keras.layers.Dense(4, name="dense")(x) - model = keras.Model(x, y) - return model - - with distribution.scope(): - model = get_model() - optimizer = keras.optimizers.optimizer_v2.gradient_descent.SGD( - 0.1, momentum=0.01) - - @tf.function - def compute_loss(images, targets): - outputs = model(images) - return keras.losses.mean_squared_error(targets, outputs) - - @tf.function - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - loss = compute_loss(images, targets) - 
grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - - inputs = next(input_iterator) - distribution.run(step_fn, args=(inputs,)) - - def test_customized_tf_module_run(self, distribution): - dataset = _get_dataset() - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - model = CustomModel() - - @tf.function - def train_step(iterator): - - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - return grads - - outputs = distribution.run( - step_fn, args=(next(iterator),)) - return tf.nest.map_structure(distribution.experimental_local_results, - outputs) - - train_step(input_iterator) - - def test_reduce_loss(self, distribution): - inputs = np.zeros((10, 4), dtype=np.float32) - targets = np.zeros((10, 1), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.batch(10) - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - with distribution.scope(): - x = keras.layers.Input(shape=(4), name="input") - y = keras.layers.Dense(3, name="dense")(x) - model = keras.Model(x, y) + with distribution.scope(): + model = keras.layers.Dense(4, name="dense") + + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + outputs = model(images) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + return grads + + outputs = distribution.run(step_fn, args=(next(iterator),)) + return tf.nest.map_structure( + distribution.experimental_local_results, outputs + ) + + train_step(input_iterator) + + def test_keras_model_optimizer_run(self, distribution): + dataset = _get_dataset() + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - @tf.function - def train_step(iterator): + with distribution.scope(): + model = _get_model() + optimizer = keras.optimizers.legacy.rmsprop.RMSprop() + + @tf.function + def train_step(replicated_inputs): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + outputs = model(images) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + return loss + + outputs = distribution.run(step_fn, args=(replicated_inputs,)) + return tf.nest.map_structure( + distribution.experimental_local_results, outputs + ) + + for x in input_iterator: + train_step(x) + + def test_keras_subclass_model_optimizer_run(self, distribution): + def get_subclass_model(): + class KerasSubclassModel(keras.Model): + def __init__(self): + super().__init__() + self.l = keras.layers.Dense(4, name="dense") + + def call(self, x): + return self.l(x) + + return KerasSubclassModel() + + dataset = _get_dataset() + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - def step_fn(inputs): - images, targets = inputs - outputs = model(images) - loss = keras.losses.sparse_categorical_crossentropy(targets, outputs) - return loss + with distribution.scope(): + model = get_subclass_model() + optimizer = keras.optimizers.legacy.rmsprop.RMSprop() + + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + 
outputs = model(images) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + return loss + + outputs = distribution.run(step_fn, args=(next(iterator),)) + return tf.nest.map_structure( + distribution.experimental_local_results, outputs + ) + + train_step(input_iterator) + + def test_keras_model_optimizer_run_loop(self, distribution): + dataset = _get_dataset() + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - return distribution.run(step_fn, args=(next(iterator),)) + with distribution.scope(): + model = _get_model() + optimizer = keras.optimizers.legacy.rmsprop.RMSprop() + + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + outputs = model(images) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + return loss + + for _ in tf.range(4): + distribution.run(step_fn, args=(next(iterator),)) + + train_step(input_iterator) + + def test_batch_norm_with_dynamic_batch(self, distribution): + inputs = np.zeros((10, 3, 3, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat() + dataset = dataset.batch(10) + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - loss = train_step(input_iterator) - loss = distribution.reduce(tf.distribute.ReduceOp.MEAN, loss, axis=0) + with distribution.scope(): + x = keras.layers.Input(shape=(3, 3, 3), name="input") + y = keras.layers.BatchNormalization(fused=True, name="bn")(x) + y = keras.layers.Flatten()(y) + y = keras.layers.Dense(4, name="dense")(y) + model = keras.Model(x, y) + optimizer = keras.optimizers.legacy.rmsprop.RMSprop() + + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + outputs = model(images, training=True) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + return loss + + distribution.run(step_fn, args=(next(iterator),)) + + train_step(input_iterator) + + def test_lstm(self, distribution): + + batch_size = 32 + + def create_lstm_model(): + model = keras.models.Sequential() + # We only have LSTM variables so we can detect no gradient issues + # more easily. + model.add( + keras.layers.LSTM( + 1, return_sequences=False, input_shape=(10, 1) + ) + ) + return model + + def create_lstm_data(): + seq_length = 10 + + x_train = np.random.rand(batch_size, seq_length, 1).astype( + "float32" + ) + y_train = np.random.rand(batch_size, 1).astype("float32") + return x_train, y_train + + x, y = create_lstm_data() + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.batch(batch_size) + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - def test_variable_run_argument(self, distribution): - # Test that variables passed to run() remain variables. Previous behavior - # in TPUStrategy was to cast to Tensor. 
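+ # Creating the model and optimizer inside `distribution.scope()` makes + # their variables distributed variables; `distribution.run` then calls + # `step_fn` once per replica, and `experimental_local_results` unpacks + # the per-replica losses into a tuple the test can inspect eagerly.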
+ with distribution.scope(): + model = create_lstm_model() + optimizer = keras.optimizers.legacy.gradient_descent.SGD() + + @tf.function + def train_step(input_iterator): + def step_fn(inputs): + inps, targ = inputs + with tf.GradientTape() as tape: + output = model(inps) + loss = tf.reduce_mean( + keras.losses.binary_crossentropy( + y_true=targ, y_pred=output, from_logits=False + ) + ) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + return loss + + outputs = distribution.run(step_fn, args=(next(input_iterator),)) + return distribution.experimental_local_results(outputs) + + train_step(input_iterator) + + def test_nested_tf_functions(self, distribution): + # The test builds two computations with keras layers, one with nested + # tf.function, and the other without nested tf.function. We run these + # computations independently on the model with the same weights, and + # make sure the variables are still the same after one training step. + + inputs = np.random.random((10, 3)).astype(np.float32) + targets = np.ones((10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).repeat() + dataset = dataset.batch(10) + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - with distribution.scope(): - optimizer = gradient_descent.SGD(0.1) - net = core.Dense(1, trainable=True) - dataset = tf.data.Dataset.from_tensors([[1.]]) - dataset = dataset.repeat() - dataset = dataset.batch(2, drop_remainder=True) + def get_model(): + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + return model + + with distribution.scope(): + model = get_model() + optimizer = keras.optimizers.legacy.gradient_descent.SGD( + 0.1, momentum=0.01 + ) + weights_file = os.path.join(self.get_temp_dir(), ".h5") + model.save_weights(weights_file) + model2 = get_model() + model2.load_weights(weights_file) + + # Make sure model and model2 variables are in sync when initialized. + for model_v, model2_v in zip(model.variables, model2.variables): + self.assertAllClose(model_v.numpy(), model2_v.numpy()) + + def compute_loss(images, targets): + outputs = model(images) + return keras.losses.mean_squared_error(targets, outputs) + + @tf.function + def train_step_without_nested_tf_function(inputs): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + loss = compute_loss(images, targets) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + + distribution.run(step_fn, args=(inputs,)) + + @tf.function + def compute_loss2(images, targets): + outputs = model2(images) + return keras.losses.mean_squared_error(targets, outputs) + + @tf.function + def train_step_with_nested_tf_function(inputs): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + loss = compute_loss2(images, targets) + grads = tape.gradient(loss, model2.variables) + optimizer.apply_gradients(zip(grads, model2.variables)) + + distribution.run(step_fn, args=(inputs,)) + + inputs = next(input_iterator) + + train_step_without_nested_tf_function(inputs) + train_step_with_nested_tf_function(inputs) + + # Make sure model and model2 variables are still in sync.
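+ # A drift between the two models here would mean one of the training + # paths captured stale variable state when its tf.function was traced.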
+ for model_v, model2_v in zip(model.variables, model2.variables): + self.assertAllClose(model_v.numpy(), model2_v.numpy()) + + def test_nested_tf_functions_with_control_flow(self, distribution): + inputs = np.random.random((10, 3)).astype(np.float32) + targets = np.ones((10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).repeat() + dataset = dataset.batch(10) + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - def replica_step(trainable_variables, features): + def get_model(): + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + return model + + with distribution.scope(): + model = get_model() + optimizer = keras.optimizers.legacy.gradient_descent.SGD( + 0.1, momentum=0.01 + ) + + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + outputs = model(images) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + + distribution.run(step_fn, args=(next(iterator),)) + + @tf.function + def train_steps(iterator): + for _ in tf.range(10): + train_step(iterator) + + train_steps(input_iterator) + + def test_nested_tf_functions_with_tf_function_passing_to_strategy_run( + self, distribution + ): + self.skipTest("b/190608193") + + inputs = np.random.random((10, 3)).astype(np.float32) + targets = np.ones((10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).repeat() + dataset = dataset.batch(10) + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - with tf.GradientTape() as tape: - net_out = net(features[0], training=True) - loss = (net_out - 1.0) * (net_out - 1.0) - gradients = tape.gradient(loss, trainable_variables) - optimizer.apply_gradients(zip(gradients, trainable_variables)) - return loss + def get_model(): + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + return model + + with distribution.scope(): + model = get_model() + optimizer = keras.optimizers.legacy.gradient_descent.SGD( + 0.1, momentum=0.01 + ) + + @tf.function + def compute_loss(images, targets): + outputs = model(images) + return keras.losses.mean_squared_error(targets, outputs) + + @tf.function + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + loss = compute_loss(images, targets) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + + inputs = next(input_iterator) + distribution.run(step_fn, args=(inputs,)) + + def test_customized_tf_module_run(self, distribution): + dataset = _get_dataset() + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - @tf.function - def step(features): - per_replica_losses = distribution.run( - replica_step, - (net.trainable_variables, features), - ) - loss = distribution.reduce( - tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) - return loss + with distribution.scope(): + model = CustomModel() + + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + outputs = model(images) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + return grads + + outputs = distribution.run(step_fn, 
args=(next(iterator),)) + return tf.nest.map_structure( + distribution.experimental_local_results, outputs + ) + + train_step(input_iterator) + + def test_reduce_loss(self, distribution): + inputs = np.zeros((10, 4), dtype=np.float32) + targets = np.zeros((10, 1), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.batch(10) + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - step(next(iter(dataset))) + with distribution.scope(): + x = keras.layers.Input(shape=(4), name="input") + y = keras.layers.Dense(3, name="dense")(x) + model = keras.Model(x, y) + + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + outputs = model(images) + loss = keras.losses.sparse_categorical_crossentropy( + targets, outputs + ) + return loss + + return distribution.run(step_fn, args=(next(iterator),)) + + loss = train_step(input_iterator) + loss = distribution.reduce(tf.distribute.ReduceOp.MEAN, loss, axis=0) + + def test_variable_run_argument(self, distribution): + # Test that variables passed to run() remain variables. Previous + # behavior in TPUStrategy was to cast to Tensor. + + with distribution.scope(): + optimizer = gradient_descent.SGD(0.1) + net = core.Dense(1, trainable=True) + dataset = tf.data.Dataset.from_tensors([[1.0]]) + dataset = dataset.repeat() + dataset = dataset.batch(2, drop_remainder=True) + + def replica_step(trainable_variables, features): + + with tf.GradientTape() as tape: + net_out = net(features[0], training=True) + loss = (net_out - 1.0) * (net_out - 1.0) + gradients = tape.gradient(loss, trainable_variables) + optimizer.apply_gradients(zip(gradients, trainable_variables)) + return loss + + @tf.function + def step(features): + per_replica_losses = distribution.run( + replica_step, + (net.trainable_variables, features), + ) + loss = distribution.reduce( + tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None + ) + return loss + + step(next(iter(dataset))) class KerasModelsXLATest(tf.test.TestCase, parameterized.TestCase): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.tpu_strategies, mode=["eager"] + ) + ) + def test_tf_function_jit_compile(self, distribution): + dataset = _get_dataset() + input_iterator = iter( + distribution.experimental_distribute_dataset(dataset) + ) - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.tpu_strategies, mode=["eager"])) - def test_tf_function_jit_compile(self, distribution): - dataset = _get_dataset() - input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) - - class CustomDense(keras.layers.Layer): - - def __init__(self, num_outputs): - super().__init__() - self.num_outputs = num_outputs + class CustomDense(keras.layers.Layer): + def __init__(self, num_outputs): + super().__init__() + self.num_outputs = num_outputs - def build(self, input_shape): - self.kernel = self.add_weight( - "kernel", shape=[int(input_shape[-1]), self.num_outputs]) + def build(self, input_shape): + self.kernel = self.add_weight( + "kernel", shape=[int(input_shape[-1]), self.num_outputs] + ) - @tf.function(jit_compile=True) - def call(self, inputs): - return tf.matmul(inputs, self.kernel) + @tf.function(jit_compile=True) + def call(self, inputs): + return tf.matmul(inputs, self.kernel) - with distribution.scope(): - x = keras.layers.Input(shape=(3,)) - y = 
CustomDense(4)(x) - model = keras.Model(x, y) + with distribution.scope(): + x = keras.layers.Input(shape=(3,)) + y = CustomDense(4)(x) + model = keras.Model(x, y) - @tf.function - def train_step(iterator): - def step_fn(inputs): - images, targets = inputs - with tf.GradientTape() as tape: - outputs = model(images) - loss = keras.losses.mean_squared_error(targets, outputs) - grads = tape.gradient(loss, model.variables) - return grads + @tf.function + def train_step(iterator): + def step_fn(inputs): + images, targets = inputs + with tf.GradientTape() as tape: + outputs = model(images) + loss = keras.losses.mean_squared_error(targets, outputs) + grads = tape.gradient(loss, model.variables) + return grads - outputs = distribution.run( - step_fn, args=(next(iterator),)) - return tf.nest.map_structure(distribution.experimental_local_results, - outputs) + outputs = distribution.run(step_fn, args=(next(iterator),)) + return tf.nest.map_structure( + distribution.experimental_local_results, outputs + ) - train_step(input_iterator) + train_step(input_iterator) def _get_dataset(): - inputs = np.zeros((31, 3), dtype=np.float32) - targets = np.zeros((31, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.batch(10) - return dataset + inputs = np.zeros((31, 3), dtype=np.float32) + targets = np.zeros((31, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.batch(10) + return dataset def _get_model(): - x = keras.layers.Input(shape=(3,), name="input") - y = keras.layers.Dense(4, name="dense")(x) - model = keras.Model(x, y) - return model + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + return model if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/custom_training_loop_optimizer_test.py b/keras/distribute/custom_training_loop_optimizer_test.py index 511a28e0894d..c972b96a2e56 100644 --- a/keras/distribute/custom_training_loop_optimizer_test.py +++ b/keras/distribute/custom_training_loop_optimizer_test.py @@ -15,106 +15,125 @@ """Tests for custom training loops that involves advanced optimizer usage.""" import tensorflow.compat.v2 as tf - from absl.testing import parameterized + +from keras.distribute import ( + strategy_combinations as keras_strategy_combinations, +) +from keras.optimizers.legacy import gradient_descent + +# isort: off from tensorflow.python.distribute import values -from keras.distribute import strategy_combinations as keras_strategy_combinations -from keras.optimizers.optimizer_v2 import gradient_descent class OptimizerTest(tf.test.TestCase, parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine( - distribution=keras_strategy_combinations.multidevice_strategies, - mode=["eager"], - ), - tf.__internal__.test.combinations.combine( - experimental_aggregate_gradients=True, - expected=[[[-0.3, -0.3], [-0.3, -0.3]]]) + - tf.__internal__.test.combinations.combine( - experimental_aggregate_gradients=False, - expected=[[[-0.1, -0.1], [-0.2, -0.2]]]) - )) - def test_custom_aggregation(self, distribution, - experimental_aggregate_gradients, expected): - - with distribution.scope(): - v = tf.Variable([0., 0.]) - optimizer = gradient_descent.SGD(0.1) - - class 
PerReplica(values.DistributedValues): - """Holds a map from replica to unsynchronized values.""" - - @property - def values(self): - """Returns the per replica values.""" - return self._values - - @tf.function - def optimize(): - with tf.device(distribution.extended.worker_devices[0]): - v1 = tf.convert_to_tensor([1., 1.]) - with tf.device(distribution.extended.worker_devices[1]): - v2 = tf.convert_to_tensor([2., 2.]) - grads = PerReplica([v1, v2]) - def step_fn(grads): - optimizer.apply_gradients( - [(grads, v)], - experimental_aggregate_gradients=experimental_aggregate_gradients) - return v.read_value() - - return distribution.experimental_local_results( - distribution.run(step_fn, args=(grads,))) - - self.assertAllClose(optimize(), expected) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=tf.__internal__.distribute.combinations.one_device_strategy, - mode=["eager"], - experimental_aggregate_gradients=[True, False])) - def test_custom_aggregation_one_device(self, distribution, - experimental_aggregate_gradients): - - with distribution.scope(): - v = tf.Variable([0., 0.]) - optimizer = gradient_descent.SGD(0.1) - - @tf.function - def optimize(): - grads = tf.convert_to_tensor([1., 1.]) - - def step_fn(grads): - optimizer.apply_gradients( - [(grads, v)], - experimental_aggregate_gradients=experimental_aggregate_gradients) - return v.read_value() - - return distribution.experimental_local_results( - distribution.run(step_fn, args=(grads,))) - - self.assertAllClose(optimize(), [[-0.1, -0.1]]) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=[ - tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu - ])) - def test_custom_aggregation_central_storage(self, distribution): - with distribution.scope(): - v = tf.Variable([0., 0.]) - optimizer = gradient_descent.SGD(0.1) - - grads = tf.convert_to_tensor([1., 1.]) - - def step_fn(grads): - with self.assertRaises(NotImplementedError): - optimizer.apply_gradients([(grads, v)], - experimental_aggregate_gradients=False) - - return distribution.run(step_fn, args=(grads,)) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + distribution=keras_strategy_combinations.multidevice_strategies, + mode=["eager"], + ), + tf.__internal__.test.combinations.combine( + experimental_aggregate_gradients=True, + expected=[[[-0.3, -0.3], [-0.3, -0.3]]], + ) + + tf.__internal__.test.combinations.combine( + experimental_aggregate_gradients=False, + expected=[[[-0.1, -0.1], [-0.2, -0.2]]], + ), + ) + ) + def test_custom_aggregation( + self, distribution, experimental_aggregate_gradients, expected + ): + + with distribution.scope(): + v = tf.Variable([0.0, 0.0]) + optimizer = gradient_descent.SGD(0.1) + + class PerReplica(values.DistributedValues): + """Holds a map from replica to unsynchronized values.""" + + @property + def values(self): + """Returns the per replica values.""" + return self._values + + @tf.function + def optimize(): + with tf.device(distribution.extended.worker_devices[0]): + v1 = tf.convert_to_tensor([1.0, 1.0]) + with tf.device(distribution.extended.worker_devices[1]): + v2 = tf.convert_to_tensor([2.0, 2.0]) + grads = PerReplica([v1, v2]) + + def step_fn(grads): + optimizer.apply_gradients( + [(grads, v)], + experimental_aggregate_gradients=experimental_aggregate_gradients, # noqa: E501 + ) + return 
v.read_value() + + return distribution.experimental_local_results( + distribution.run(step_fn, args=(grads,)) + ) + + self.assertAllClose(optimize(), expected) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=tf.__internal__.distribute.combinations.one_device_strategy, # noqa: E501 + mode=["eager"], + experimental_aggregate_gradients=[True, False], + ) + ) + def test_custom_aggregation_one_device( + self, distribution, experimental_aggregate_gradients + ): + + with distribution.scope(): + v = tf.Variable([0.0, 0.0]) + optimizer = gradient_descent.SGD(0.1) + + @tf.function + def optimize(): + grads = tf.convert_to_tensor([1.0, 1.0]) + + def step_fn(grads): + optimizer.apply_gradients( + [(grads, v)], + experimental_aggregate_gradients=experimental_aggregate_gradients, # noqa: E501 + ) + return v.read_value() + + return distribution.experimental_local_results( + distribution.run(step_fn, args=(grads,)) + ) + + self.assertAllClose(optimize(), [[-0.1, -0.1]]) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu # noqa: E501 + ] + ) + ) + def test_custom_aggregation_central_storage(self, distribution): + with distribution.scope(): + v = tf.Variable([0.0, 0.0]) + optimizer = gradient_descent.SGD(0.1) + + grads = tf.convert_to_tensor([1.0, 1.0]) + + def step_fn(grads): + with self.assertRaises(NotImplementedError): + optimizer.apply_gradients( + [(grads, v)], experimental_aggregate_gradients=False + ) + + return distribution.run(step_fn, args=(grads,)) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/distribute/dataset_creator_model_fit_ps_only_test.py b/keras/distribute/dataset_creator_model_fit_ps_only_test.py index edc515aa327e..077ff151008e 100644 --- a/keras/distribute/dataset_creator_model_fit_ps_only_test.py +++ b/keras/distribute/dataset_creator_model_fit_ps_only_test.py @@ -14,11 +14,12 @@ # ============================================================================== """Tests for `DatasetCreator` with `Model.fit` across usages and strategies.""" +import tensorflow.compat.v2 as tf + from keras import callbacks as callbacks_lib from keras.distribute import dataset_creator_model_fit_test_base as test_base from keras.distribute import strategy_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_utils.run_v2_only @@ -26,121 +27,152 @@ tf.__internal__.test.combinations.combine( strategy=strategy_combinations.parameter_server_strategies_multi_worker, use_dataset_creator=[True, False], - mode="eager")) + mode="eager", + ) +) class DatasetCreatorModelFitParameterServerStrategyOnlyTest( - test_base.DatasetCreatorModelFitTestBase): - - def testModelFitWithRunEagerly(self, strategy, use_dataset_creator): - with self.assertRaisesRegex( - ValueError, "When using `Model` with `ParameterServerStrategy`, " - "`run_eagerly` is not supported."): - self._model_fit( - strategy, run_eagerly=True, use_dataset_creator=use_dataset_creator) - - def testModelPredict(self, strategy, use_dataset_creator): - if use_dataset_creator: - self.skipTest("Unused option.") - model, _ = self._model_compile(strategy) - test_data = tf.data.Dataset.from_tensor_slices( - [[1.], [2.], [3.], [1.], [5.], [1.]]).repeat().batch(2) - model.predict(x=test_data, steps=3) - - def testClusterCoordinatorSingleInstance(self, strategy, 
use_dataset_creator): - model = self._model_fit(strategy, use_dataset_creator=use_dataset_creator) - strategy = model.distribute_strategy - self.assertIs( - strategy._cluster_coordinator, - tf.distribute.experimental.coordinator.ClusterCoordinator(strategy)) - - def testModelFitErrorOnBatchLevelCallbacks(self, strategy, - use_dataset_creator): - - class BatchLevelCallback(callbacks_lib.Callback): - - def on_train_batch_end(self, batch, logs=None): - pass - - with self.assertRaisesRegex(ValueError, - "Batch-level `Callback`s are not supported"): - callbacks = [BatchLevelCallback()] - self._model_fit( - strategy, - callbacks=callbacks, - use_dataset_creator=use_dataset_creator) - - def testModelFitCallbackSupportsTFLogs(self, strategy, use_dataset_creator): - - class MyCallback(callbacks_lib.Callback): - - def __init__(self): - super().__init__() - # Fetches the RemoteValues if necessary. - self._supports_tf_logs = True - - def on_train_batch_end(self, batch, logs=None): - assert isinstance(logs, tf.distribute.experimental.coordinator.RemoteValue) - - my_callback = MyCallback() - callbacks = [my_callback] - self._model_fit( - strategy, callbacks=callbacks, use_dataset_creator=use_dataset_creator) - - def testModelFitVerbosity(self, strategy, use_dataset_creator): - - class MyCallback(callbacks_lib.Callback): - pass - - my_callback = MyCallback() - callbacks = [my_callback] - self._model_fit( - strategy, callbacks=callbacks, use_dataset_creator=use_dataset_creator) - # PSStrategy should default to epoch-level logging. - self.assertEqual(my_callback.params["verbose"], 2) - - def testModelFitTensorBoardEpochLevel(self, strategy, use_dataset_creator): - log_dir = self.get_temp_dir() - callbacks = [callbacks_lib.TensorBoard(log_dir)] - self._model_fit( - strategy, callbacks=callbacks, use_dataset_creator=use_dataset_creator) - self.assertTrue(tf.compat.v1.gfile.Exists(log_dir)) - files = tf.compat.v1.gfile.ListDirectory(log_dir) - self.assertGreaterEqual(len(files), 1) - - def testModelFitVerbose1(self, strategy, use_dataset_creator): - with self.assertRaisesRegex(ValueError, - "`verbose=1` is not allowed with " - "`ParameterServerStrategy` for performance " - "reasons. 
Received: verbose=1"): - self._model_fit( - strategy, use_dataset_creator=use_dataset_creator, - verbose=1) - - def testModelEvaluateErrorOnBatchLevelCallbacks(self, strategy, - use_dataset_creator): - - class BatchLevelCallback(callbacks_lib.Callback): - - def on_train_batch_end(self, batch, logs=None): - pass - - with self.assertRaisesRegex(ValueError, - "Batch-level `Callback`s are not supported"): - callbacks = [BatchLevelCallback()] - self._model_evaluate( - strategy, - callbacks=callbacks, - use_dataset_creator=use_dataset_creator) - - def testClusterCoordinatorSingleInstanceWithJitCompileTrue( - self, strategy, use_dataset_creator): - model = self._model_fit(strategy, - use_dataset_creator=use_dataset_creator, - jit_compile=True) - strategy = model.distribute_strategy - self.assertIs( - strategy._cluster_coordinator, - tf.distribute.experimental.coordinator.ClusterCoordinator(strategy)) + test_base.DatasetCreatorModelFitTestBase +): + def testModelFitWithRunEagerly(self, strategy, use_dataset_creator): + with self.assertRaisesRegex( + ValueError, + "When using `Model` with `ParameterServerStrategy`, " + "`run_eagerly` is not supported.", + ): + self._model_fit( + strategy, + run_eagerly=True, + use_dataset_creator=use_dataset_creator, + ) + + def testModelPredict(self, strategy, use_dataset_creator): + if use_dataset_creator: + self.skipTest("Unused option.") + model, _ = self._model_compile(strategy) + test_data = ( + tf.data.Dataset.from_tensor_slices( + [[1.0], [2.0], [3.0], [1.0], [5.0], [1.0]] + ) + .repeat() + .batch(2) + ) + model.predict(x=test_data, steps=3) + + def testClusterCoordinatorSingleInstance( + self, strategy, use_dataset_creator + ): + model = self._model_fit( + strategy, use_dataset_creator=use_dataset_creator + ) + strategy = model.distribute_strategy + self.assertIs( + strategy._cluster_coordinator, + tf.distribute.experimental.coordinator.ClusterCoordinator(strategy), + ) + + def testModelFitErrorOnBatchLevelCallbacks( + self, strategy, use_dataset_creator + ): + class BatchLevelCallback(callbacks_lib.Callback): + def on_train_batch_end(self, batch, logs=None): + pass + + with self.assertRaisesRegex( + ValueError, "Batch-level `Callback`s are not supported" + ): + callbacks = [BatchLevelCallback()] + self._model_fit( + strategy, + callbacks=callbacks, + use_dataset_creator=use_dataset_creator, + ) + + def testModelFitCallbackSupportsTFLogs(self, strategy, use_dataset_creator): + class MyCallback(callbacks_lib.Callback): + def __init__(self): + super().__init__() + # Fetches the RemoteValues if necessary. + self._supports_tf_logs = True + + def on_train_batch_end(self, batch, logs=None): + assert isinstance( + logs, tf.distribute.experimental.coordinator.RemoteValue + ) + + my_callback = MyCallback() + callbacks = [my_callback] + self._model_fit( + strategy, + callbacks=callbacks, + use_dataset_creator=use_dataset_creator, + ) + + def testModelFitVerbosity(self, strategy, use_dataset_creator): + class MyCallback(callbacks_lib.Callback): + pass + + my_callback = MyCallback() + callbacks = [my_callback] + self._model_fit( + strategy, + callbacks=callbacks, + use_dataset_creator=use_dataset_creator, + ) + # PSStrategy should default to epoch-level logging. 
+ self.assertEqual(my_callback.params["verbose"], 2) + + def testModelFitTensorBoardEpochLevel(self, strategy, use_dataset_creator): + log_dir = self.get_temp_dir() + callbacks = [callbacks_lib.TensorBoard(log_dir)] + self._model_fit( + strategy, + callbacks=callbacks, + use_dataset_creator=use_dataset_creator, + ) + self.assertTrue(tf.compat.v1.gfile.Exists(log_dir)) + files = tf.compat.v1.gfile.ListDirectory(log_dir) + self.assertGreaterEqual(len(files), 1) + + def testModelFitVerbose1(self, strategy, use_dataset_creator): + with self.assertRaisesRegex( + ValueError, + "`verbose=1` is not allowed with " + "`ParameterServerStrategy` for performance " + "reasons. Received: verbose=1", + ): + self._model_fit( + strategy, use_dataset_creator=use_dataset_creator, verbose=1 + ) + + def testModelEvaluateErrorOnBatchLevelCallbacks( + self, strategy, use_dataset_creator + ): + class BatchLevelCallback(callbacks_lib.Callback): + def on_train_batch_end(self, batch, logs=None): + pass + + with self.assertRaisesRegex( + ValueError, "Batch-level `Callback`s are not supported" + ): + callbacks = [BatchLevelCallback()] + self._model_evaluate( + strategy, + callbacks=callbacks, + use_dataset_creator=use_dataset_creator, + ) + + def testClusterCoordinatorSingleInstanceWithJitCompileTrue( + self, strategy, use_dataset_creator + ): + model = self._model_fit( + strategy, use_dataset_creator=use_dataset_creator, jit_compile=True + ) + strategy = model.distribute_strategy + self.assertIs( + strategy._cluster_coordinator, + tf.distribute.experimental.coordinator.ClusterCoordinator(strategy), + ) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/dataset_creator_model_fit_test.py b/keras/distribute/dataset_creator_model_fit_test.py index 518bd3c54289..c6b36be62c46 100644 --- a/keras/distribute/dataset_creator_model_fit_test.py +++ b/keras/distribute/dataset_creator_model_fit_test.py @@ -14,247 +14,287 @@ # ============================================================================== """Tests for `DatasetCreator` with `Model.fit` across usages and strategies.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np -from tensorflow.python.framework import test_util as tf_test_utils -from keras.testing_infra import test_utils from keras.distribute import dataset_creator_model_fit_test_base as test_base from keras.distribute import strategy_combinations +from keras.testing_infra import test_utils from keras.utils import dataset_creator +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + # TODO(rchao): Investigate why there cannot be single worker and multi worker # PS strategies running in the same shard. 
@test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies + - strategy_combinations.parameter_server_strategies_multi_worker, - mode="eager")) + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies + + strategy_combinations.parameter_server_strategies_multi_worker, + mode="eager", + ) +) class DatasetCreatorModelFitTest(test_base.DatasetCreatorModelFitTestBase): + def setUp(self): + super().setUp() + if tf_test_utils.is_xla_enabled(): + self.skipTest( + "model.optimizer.iterations values is not as expected " + "with XLA: b/184384487" + ) + + def testModelFit(self, strategy): + model = self._model_fit(strategy) + self.assertEqual(model.optimizer.iterations, 100) + + def testModelFitwithStepsPerEpochNegativeOne(self, strategy): + def dataset_fn(input_context): + del input_context + x = tf.random.uniform((10, 10)) + y = tf.random.uniform((10,)) + return ( + tf.data.Dataset.from_tensor_slices((x, y)).shuffle(10).batch(2) + ) + + if strategy._should_use_with_coordinator: + with self.assertRaises( + (tf.errors.OutOfRangeError, tf.errors.CancelledError) + ): + self._model_fit( + strategy, + steps_per_epoch=-1, + x=dataset_creator.DatasetCreator(dataset_fn), + validation_data=dataset_creator.DatasetCreator(dataset_fn), + ) + else: + self._model_fit( + strategy, + steps_per_epoch=-1, + x=dataset_creator.DatasetCreator(dataset_fn), + validation_data=dataset_creator.DatasetCreator(dataset_fn), + ) + + def testModelFitWithNumpyData(self, strategy): + x = np.random.rand(100, 10) + y = np.random.rand(100, 1) + model = self._model_fit( + strategy, + x=x, + y=y, + batch_size=1, + validation_data=(x, y), + ) + self.assertEqual(model.optimizer.iterations, 100) - def setUp(self): - super().setUp() - if tf_test_utils.is_xla_enabled(): - self.skipTest("model.optimizer.iterations values is not as expected " - "with XLA: b/184384487") - - def testModelFit(self, strategy): - model = self._model_fit(strategy) - self.assertEqual(model.optimizer.iterations, 100) - - def testModelFitwithStepsPerEpochNegativeOne(self, strategy): - def dataset_fn(input_context): - del input_context - x = tf.random.uniform((10, 10)) - y = tf.random.uniform((10,)) - return tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(10).batch(2) - - if strategy._should_use_with_coordinator: - with self.assertRaises((tf.errors.OutOfRangeError, - tf.errors.CancelledError)): - self._model_fit( + def testModelFitWithTensorData(self, strategy): + x = tf.random.uniform((100, 10)) + y = tf.random.uniform((100,)) + model = self._model_fit( strategy, - steps_per_epoch=-1, - x=dataset_creator.DatasetCreator(dataset_fn), - validation_data=dataset_creator.DatasetCreator(dataset_fn), + x=x, + y=y, + batch_size=1, + validation_data=(x, y), ) - else: - self._model_fit( - strategy, - steps_per_epoch=-1, - x=dataset_creator.DatasetCreator(dataset_fn), - validation_data=dataset_creator.DatasetCreator(dataset_fn), - ) - - def testModelFitWithNumpyData(self, strategy): - x = np.random.rand(100, 10) - y = np.random.rand(100, 1) - model = self._model_fit( - strategy, - x=x, - y=y, - batch_size=1, - validation_data=(x, y), - ) - self.assertEqual(model.optimizer.iterations, 100) - - def testModelFitWithTensorData(self, strategy): - x = tf.random.uniform((100, 10)) - y = tf.random.uniform((100,)) - model = self._model_fit( - strategy, - x=x, - y=y, - 
batch_size=1, - validation_data=(x, y), - ) - self.assertEqual(model.optimizer.iterations, 100) - - def testModelFitWithLookupLayer(self, strategy): - model = self._model_fit(strategy, use_lookup_layer=True) - self.assertEqual(model.optimizer.iterations, 100) - - def testModelFitWithNormalizationLayer(self, strategy): - model = self._model_fit(strategy, with_normalization_layer=True) - self.assertEqual(model.optimizer.iterations, 100) - - def testModelFitWithStepsPerExecution(self, strategy): - model = self._model_fit(strategy, steps_per_execution=10) - self.assertEqual(model.optimizer.iterations, 100) - - def testModelFitWithNoStepsPerEpoch(self, strategy): - with self.assertRaisesRegex( - ValueError, - "When using a `tf.keras.utils.experimental.DatasetCreator`, " - "`steps_per_epoch`, `validation_steps` or `steps` argument must be " - "provided in `Model.fit`, `Model.evaluate`, or `Model.predict`."): - self._model_fit(strategy, steps_per_epoch=None) - - def testModelEvaluate(self, strategy): - self._model_evaluate(strategy) - self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) - - def testModelEvaluateWithNumpyData(self, strategy): - x = np.random.rand(100, 10) - y = np.random.rand(100, 1) - self._model_evaluate( - strategy, - x=x, - y=y, - batch_size=1, - ) - self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) - - def testModelEvaluateWithTensorData(self, strategy): - x = tf.random.uniform((100, 10)) - y = tf.random.uniform((100,)) - self._model_evaluate( - strategy, - x=x, - y=y, - batch_size=1, - ) - self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) - - def testModelEvaluateWithNormalizationLayer(self, strategy): - self._model_evaluate(strategy, with_normalization_layer=True) - self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) - - def testModelEvaluateWithStepsPerExecution(self, strategy): - self._model_evaluate(strategy, steps_per_execution=10) - self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) - - def testModelEvaluateWithNoStepsPerEpoch(self, strategy): - with self.assertRaisesRegex( - ValueError, - "When using a `tf.keras.utils.experimental.DatasetCreator`, " - "`steps_per_epoch`, `validation_steps` or `steps` argument must be " - "provided in `Model.fit`, `Model.evaluate`, or `Model.predict`."): - self._model_evaluate(strategy, steps=None) - - def testModelPredict(self, strategy): - _, predictions = self._model_predict(strategy, steps=3) - # Check the first (0th index), fourth (3rd index) and the last predictions - # because the first, fourth and the last input are the same in - # `model.predict` so there predictions should match. 
- self.assertTrue(all(predictions[0] == predictions[i] for i in [0, 3, 5])) - - self.assertFalse( - all(predictions[0] == predictions[i] for i in [0, 1, 2, 4])) - - def testModelPredictWithNumpyData(self, strategy): - x = np.array([[1.], [2.], [3.], [1.], [5.], [1.]]) - _, predictions = self._model_predict(strategy, test_data=x) - - self.assertTrue(all(predictions[0] == predictions[i] for i in [0, 3, 5])) - self.assertFalse( - all(predictions[0] == predictions[i] for i in [0, 1, 2, 4])) - - def testModelPredictWithTensorData(self, strategy): - x = tf.constant([[1.], [2.], [3.], [1.], [5.], [1.]]) - _, predictions = self._model_predict(strategy, test_data=x) - self.assertTrue(all(predictions[0] == predictions[i] for i in [0, 3, 5])) - self.assertFalse( - all(predictions[0] == predictions[i] for i in [0, 1, 2, 4])) - - def testModelPredictWithNormalizationLayer(self, strategy): - _, predictions = self._model_predict( - strategy, with_normalization_layer=True, steps=3) - # Check the first (0th index), fourth (3rd index) and the last predictions - # because the first, fourth and the last input is the same in - # `model.predict` so there predictions should match. - self.assertTrue(all(predictions[0] == predictions[i] for i in [0, 3, 5])) - - self.assertFalse( - all(predictions[0] == predictions[i] for i in [0, 1, 2, 4])) - - def testModelPredictWithStepsPerExecution(self, strategy): - _, predictions = self._model_predict( - strategy, steps_per_execution=3, steps=3) - - # Check the first (0th index), fourth (3rd index) and the last predictions - # because the first, fourth and the last input is the same in - # `model.predict` so there predictions should match. - self.assertTrue(all(predictions[0] == predictions[i] for i in [0, 3, 5])) - - self.assertFalse( - all(predictions[0] == predictions[i] for i in [0, 1, 2, 4])) - - def testModelFitAndPredict(self, strategy): - def fit_dataset_fn(input_context): - del input_context - x = tf.random.uniform((10, 1)) - y = tf.random.uniform((10,)) - return tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(10).repeat().batch(2) - - x = dataset_creator.DatasetCreator(fit_dataset_fn) - validation_data = dataset_creator.DatasetCreator(fit_dataset_fn) - - model = self._model_fit(strategy, x=x, validation_data=validation_data) - _, predictions = self._model_predict(strategy, model, steps=3) - - # Check the first (0th index), fourth (3rd index) and the last predictions - # because the first, fourth and the last input is the same in - # `model.predict` so there predictions should match. 
- self.assertTrue(all(predictions[0] == predictions[i] for i in [0, 3, 5])) - - self.assertFalse( - all(predictions[0] == predictions[i] for i in [0, 1, 2, 4])) - - def testModelPredictWithDatasetCreator(self, strategy): - if isinstance(strategy, - tf.distribute.MultiWorkerMirroredStrategy): - self.skipTest("b/189223991") - - def _dataset_fn(input_context): - del input_context - x = tf.constant([[1.], [2.], [3.], [1.], [5.], [1.]]) - return tf.data.Dataset.from_tensor_slices(x).repeat().batch(2) - - _, predictions = self._model_predict( - strategy, - steps=3, - test_data=dataset_creator.DatasetCreator(_dataset_fn), - ) + self.assertEqual(model.optimizer.iterations, 100) + + def testModelFitWithLookupLayer(self, strategy): + model = self._model_fit(strategy, use_lookup_layer=True) + self.assertEqual(model.optimizer.iterations, 100) + + def testModelFitWithNormalizationLayer(self, strategy): + model = self._model_fit(strategy, with_normalization_layer=True) + self.assertEqual(model.optimizer.iterations, 100) + + def testModelFitWithStepsPerExecution(self, strategy): + model = self._model_fit(strategy, steps_per_execution=10) + self.assertEqual(model.optimizer.iterations, 100) + + def testModelFitWithNoStepsPerEpoch(self, strategy): + with self.assertRaisesRegex( + ValueError, + "When using a `tf.keras.utils.experimental.DatasetCreator`, " + "`steps_per_epoch`, `validation_steps`, `steps`, or " + "`pss_evaluation_shards` argument must be provided in " + "`Model.fit`, `Model.evaluate`, or `Model.predict`.", + ): + self._model_fit(strategy, steps_per_epoch=None) + + def testModelEvaluate(self, strategy): + self._model_evaluate(strategy) + self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) + + def testModelEvaluateWithNumpyData(self, strategy): + x = np.random.rand(100, 10) + y = np.random.rand(100, 1) + self._model_evaluate( + strategy, + x=x, + y=y, + batch_size=1, + ) + self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) - # Check the first (0th index), fourth (3rd index) and the last predictions - # because the first, fourth and the last input is the same in - # `model.predict` so there predictions should match. 
- self.assertTrue(all(predictions[0] == predictions[i] for i in [0, 3, 5])) + def testModelEvaluateWithTensorData(self, strategy): + x = tf.random.uniform((100, 10)) + y = tf.random.uniform((100,)) + self._model_evaluate( + strategy, + x=x, + y=y, + batch_size=1, + ) + self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) + + def testModelEvaluateWithNormalizationLayer(self, strategy): + self._model_evaluate(strategy, with_normalization_layer=True) + self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) + + def testModelEvaluateWithStepsPerExecution(self, strategy): + self._model_evaluate(strategy, steps_per_execution=10) + self.assertGreaterEqual(self._accuracy_metric.result(), 0.0) + + def testModelEvaluateWithNoStepsPerEpoch(self, strategy): + with self.assertRaisesRegex( + ValueError, + "When using a `tf.keras.utils.experimental.DatasetCreator`, " + "`steps_per_epoch`, `validation_steps`, `steps`, or " + "`pss_evaluation_shards` argument must be provided in " + "`Model.fit`, `Model.evaluate`, or `Model.predict`.", + ): + self._model_evaluate(strategy, steps=None) + + def testModelPredict(self, strategy): + _, predictions = self._model_predict(strategy, steps=3) + # Check the first (0th index), fourth (3rd index) and the last + # predictions because the first, fourth and the last input are the same + # in `model.predict` so there predictions should match. + self.assertTrue( + all(predictions[0] == predictions[i] for i in [0, 3, 5]) + ) + + self.assertFalse( + all(predictions[0] == predictions[i] for i in [0, 1, 2, 4]) + ) - self.assertFalse( - all(predictions[0] == predictions[i] for i in [0, 1, 2, 4])) + def testModelPredictWithNumpyData(self, strategy): + x = np.array([[1.0], [2.0], [3.0], [1.0], [5.0], [1.0]]) + _, predictions = self._model_predict(strategy, test_data=x) + + self.assertTrue( + all(predictions[0] == predictions[i] for i in [0, 3, 5]) + ) + self.assertFalse( + all(predictions[0] == predictions[i] for i in [0, 1, 2, 4]) + ) + + def testModelPredictWithTensorData(self, strategy): + x = tf.constant([[1.0], [2.0], [3.0], [1.0], [5.0], [1.0]]) + _, predictions = self._model_predict(strategy, test_data=x) + self.assertTrue( + all(predictions[0] == predictions[i] for i in [0, 3, 5]) + ) + self.assertFalse( + all(predictions[0] == predictions[i] for i in [0, 1, 2, 4]) + ) - def testModelTrainTFFunction(self, strategy): - model = self._model_fit(strategy) - self.assertIsInstance(model.train_tf_function, - tf.__internal__.function.Function) + def testModelPredictWithNormalizationLayer(self, strategy): + _, predictions = self._model_predict( + strategy, with_normalization_layer=True, steps=3 + ) + # Check the first (0th index), fourth (3rd index) and the last + # predictions because the first, fourth and the last input is the same + # in `model.predict` so there predictions should match. + self.assertTrue( + all(predictions[0] == predictions[i] for i in [0, 3, 5]) + ) + + self.assertFalse( + all(predictions[0] == predictions[i] for i in [0, 1, 2, 4]) + ) + + def testModelPredictWithStepsPerExecution(self, strategy): + _, predictions = self._model_predict( + strategy, steps_per_execution=3, steps=3 + ) + + # Check the first (0th index), fourth (3rd index) and the last + # predictions because the first, fourth and the last input is the same + # in `model.predict` so there predictions should match. 
+ self.assertTrue( + all(predictions[0] == predictions[i] for i in [0, 3, 5]) + ) + + self.assertFalse( + all(predictions[0] == predictions[i] for i in [0, 1, 2, 4]) + ) + + def testModelFitAndPredict(self, strategy): + def fit_dataset_fn(input_context): + del input_context + x = tf.random.uniform((10, 1)) + y = tf.random.uniform((10,)) + return ( + tf.data.Dataset.from_tensor_slices((x, y)) + .shuffle(10) + .repeat() + .batch(2) + ) + + x = dataset_creator.DatasetCreator(fit_dataset_fn) + validation_data = dataset_creator.DatasetCreator(fit_dataset_fn) + + model = self._model_fit(strategy, x=x, validation_data=validation_data) + _, predictions = self._model_predict(strategy, model, steps=3) + + # Check the first (0th index), fourth (3rd index) and the last + # predictions because the first, fourth and the last input is the same + # in `model.predict` so there predictions should match. + self.assertTrue( + all(predictions[0] == predictions[i] for i in [0, 3, 5]) + ) + + self.assertFalse( + all(predictions[0] == predictions[i] for i in [0, 1, 2, 4]) + ) + + def testModelPredictWithDatasetCreator(self, strategy): + if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy): + self.skipTest("b/189223991") + + def _dataset_fn(input_context): + del input_context + x = tf.constant([[1.0], [2.0], [3.0], [1.0], [5.0], [1.0]]) + return tf.data.Dataset.from_tensor_slices(x).repeat().batch(2) + + _, predictions = self._model_predict( + strategy, + steps=3, + test_data=dataset_creator.DatasetCreator(_dataset_fn), + ) + + # Check the first (0th index), fourth (3rd index) and the last + # predictions because the first, fourth and the last input is the same + # in `model.predict` so there predictions should match. + self.assertTrue( + all(predictions[0] == predictions[i] for i in [0, 3, 5]) + ) + + self.assertFalse( + all(predictions[0] == predictions[i] for i in [0, 1, 2, 4]) + ) + + def testModelTrainTFFunction(self, strategy): + model = self._model_fit(strategy) + self.assertIsInstance( + model.train_tf_function, tf.__internal__.function.Function + ) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/dataset_creator_model_fit_test_base.py b/keras/distribute/dataset_creator_model_fit_test_base.py index b2369cf123da..e7318fdf3b3b 100644 --- a/keras/distribute/dataset_creator_model_fit_test_base.py +++ b/keras/distribute/dataset_creator_model_fit_test_base.py @@ -14,216 +14,254 @@ # ============================================================================== """Tests for `DatasetCreator` with `Model.fit` across usages and strategies.""" -import tensorflow.compat.v2 as tf - import os -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras from keras import callbacks as callbacks_lib from keras.engine import sequential from keras.layers import core as core_layers from keras.layers.preprocessing import string_lookup -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent from keras.utils import dataset_creator + +# isort: off from tensorflow.python.platform import tf_logging as logging class DatasetCreatorModelFitTestBase(tf.test.TestCase, parameterized.TestCase): - """The base class for DatasetCreator with Model.fit tests.""" - - def _get_dataset_fn(self, use_lookup_layer): - - if use_lookup_layer: - - filepath = 
os.path.join(self.get_temp_dir(), "vocab") - with open(filepath, "w") as f: - f.write("\n".join(["earth", "wind", "and", "fire"])) - - def dataset_fn(input_context): - del input_context - lookup_layer = string_lookup.StringLookup( - num_oov_indices=1, vocabulary=filepath) - x = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - y = np.array([0, 1]) - map_fn = lambda x, y: (lookup_layer(x), y) - return tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(10).repeat().batch(2).map(map_fn) - - else: - - def dataset_fn(input_context): - del input_context - x = tf.random.uniform((10, 10)) - y = tf.random.uniform((10,)) - return tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(10).repeat().batch(2) - - return dataset_fn - - def _model_compile(self, - strategy, - steps_per_execution=1, - run_eagerly=False, - with_normalization_layer=False, - jit_compile=None): - - class ResultAssertingCallback(callbacks_lib.Callback): - """A callback that asserts the result of the tests.""" - - def __init__(self): - self._prev_epoch = -1 - - def on_epoch_end(self, epoch, logs=None): - logging.info("testModelFit: epoch=%r, logs=%r", epoch, logs) - if epoch <= self._prev_epoch: - raise RuntimeError("Epoch is supposed to be larger than previous.") - self._prev_epoch = epoch - is_loss_float = ( - logs.get("loss", None) is not None and - isinstance(logs["loss"], (float, np.floating))) - if not is_loss_float: - raise RuntimeError("loss is supposed to be in the logs and float.") - - with strategy.scope(): - model = sequential.Sequential([core_layers.Dense(10)]) - if with_normalization_layer: - norm = keras.layers.BatchNormalization( - axis=-1, input_shape=(4, 4, 3), momentum=0.8) - model.add(norm) - model.add(core_layers.Dense(1, activation="sigmoid")) - self._accuracy_metric = keras.metrics.Accuracy() - - model.compile( - gradient_descent.SGD(), - loss="binary_crossentropy", - metrics=[self._accuracy_metric], - steps_per_execution=steps_per_execution, - run_eagerly=run_eagerly, - jit_compile=jit_compile) - return model, [ResultAssertingCallback()] - - def _model_fit(self, - strategy, - steps_per_execution=1, - validation_data=None, - x=None, - y=None, - shuffle=True, - batch_size=None, - steps_per_epoch=10, - run_eagerly=False, - with_normalization_layer=False, - callbacks=None, - use_lookup_layer=False, - use_dataset_creator=True, - verbose="auto", - jit_compile=None): - if callbacks is None: - callbacks = [] - - model, default_callbacks = self._model_compile(strategy, - steps_per_execution, - run_eagerly, - with_normalization_layer, - jit_compile) - callbacks += default_callbacks - - if x is None: - if use_dataset_creator: - x = dataset_creator.DatasetCreator( - self._get_dataset_fn(use_lookup_layer)) - else: - x = self._get_dataset_fn(use_lookup_layer)(None) - - if validation_data is None: - if use_dataset_creator: - validation_data = dataset_creator.DatasetCreator( - self._get_dataset_fn(use_lookup_layer)) - else: - validation_data = self._get_dataset_fn(use_lookup_layer)(None) - - model.fit( - x, - y, - shuffle=shuffle, - batch_size=batch_size, - epochs=10, - steps_per_epoch=steps_per_epoch, - callbacks=callbacks, - validation_data=validation_data, - validation_steps=steps_per_epoch, - verbose=verbose) - return model - - def _model_evaluate(self, - strategy, - steps_per_execution=1, - x=None, - y=None, - batch_size=None, - steps=10, - run_eagerly=False, - with_normalization_layer=False, - callbacks=None, - use_dataset_creator=True): - if callbacks is None: - callbacks = [] 
- - model, default_callbacks = self._model_compile( + """The base class for DatasetCreator with Model.fit tests.""" + + def _get_dataset_fn(self, use_lookup_layer): + + if use_lookup_layer: + + filepath = os.path.join(self.get_temp_dir(), "vocab") + with open(filepath, "w") as f: + f.write("\n".join(["earth", "wind", "and", "fire"])) + + def dataset_fn(input_context): + del input_context + lookup_layer = string_lookup.StringLookup( + num_oov_indices=1, vocabulary=filepath + ) + x = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + y = np.array([0, 1]) + map_fn = lambda x, y: (lookup_layer(x), y) + return ( + tf.data.Dataset.from_tensor_slices((x, y)) + .shuffle(10) + .repeat() + .batch(2) + .map(map_fn) + ) + + else: + + def dataset_fn(input_context): + del input_context + x = tf.random.uniform((10, 10)) + y = tf.random.uniform((10,)) + return ( + tf.data.Dataset.from_tensor_slices((x, y)) + .shuffle(10) + .repeat() + .batch(2) + ) + + return dataset_fn + + def _model_compile( + self, + strategy, + steps_per_execution=1, + run_eagerly=False, + with_normalization_layer=False, + jit_compile=None, + ): + class ResultAssertingCallback(callbacks_lib.Callback): + """A callback that asserts the result of the tests.""" + + def __init__(self): + self._prev_epoch = -1 + + def on_epoch_end(self, epoch, logs=None): + logging.info("testModelFit: epoch=%r, logs=%r", epoch, logs) + if epoch <= self._prev_epoch: + raise RuntimeError( + "Epoch is supposed to be larger than previous." + ) + self._prev_epoch = epoch + is_loss_float = logs.get( + "loss", None + ) is not None and isinstance(logs["loss"], (float, np.floating)) + if not is_loss_float: + raise RuntimeError( + "loss is supposed to be in the logs and float." + ) + + with strategy.scope(): + model = sequential.Sequential([core_layers.Dense(10)]) + if with_normalization_layer: + norm = keras.layers.BatchNormalization( + axis=-1, input_shape=(4, 4, 3), momentum=0.8 + ) + model.add(norm) + model.add(core_layers.Dense(1, activation="sigmoid")) + self._accuracy_metric = keras.metrics.Accuracy() + + model.compile( + gradient_descent.SGD(), + loss="binary_crossentropy", + metrics=[self._accuracy_metric], + steps_per_execution=steps_per_execution, + run_eagerly=run_eagerly, + jit_compile=jit_compile, + ) + return model, [ResultAssertingCallback()] + + def _model_fit( + self, + strategy, + steps_per_execution=1, + validation_data=None, + x=None, + y=None, + shuffle=True, + batch_size=None, + steps_per_epoch=10, + run_eagerly=False, + with_normalization_layer=False, + callbacks=None, + use_lookup_layer=False, + use_dataset_creator=True, + verbose="auto", + jit_compile=None, + ): + if callbacks is None: + callbacks = [] + + model, default_callbacks = self._model_compile( + strategy, + steps_per_execution, + run_eagerly, + with_normalization_layer, + jit_compile, + ) + callbacks += default_callbacks + + if x is None: + if use_dataset_creator: + x = dataset_creator.DatasetCreator( + self._get_dataset_fn(use_lookup_layer) + ) + else: + x = self._get_dataset_fn(use_lookup_layer)(None) + + if validation_data is None: + if use_dataset_creator: + validation_data = dataset_creator.DatasetCreator( + self._get_dataset_fn(use_lookup_layer) + ) + else: + validation_data = self._get_dataset_fn(use_lookup_layer)(None) + + model.fit( + x, + y, + shuffle=shuffle, + batch_size=batch_size, + epochs=10, + steps_per_epoch=steps_per_epoch, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=steps_per_epoch, 
+ verbose=verbose, + ) + return model + + def _model_evaluate( + self, + strategy, + steps_per_execution=1, + x=None, + y=None, + batch_size=None, + steps=10, + run_eagerly=False, + with_normalization_layer=False, + callbacks=None, + use_dataset_creator=True, + ): + if callbacks is None: + callbacks = [] + + model, default_callbacks = self._model_compile( + strategy, + steps_per_execution, + run_eagerly, + with_normalization_layer, + ) + callbacks += default_callbacks + + def dataset_fn(input_context): + del input_context + x = tf.random.uniform((10, 10)) + y = tf.random.uniform((10, 1)) + return ( + tf.data.Dataset.from_tensor_slices((x, y)) + .shuffle(10) + .repeat() + .batch(8) + ) + + if x is None: + if use_dataset_creator: + x = dataset_creator.DatasetCreator(dataset_fn) + else: + x = dataset_fn(None) + + model.evaluate( + x=x, y=y, steps=steps, callbacks=callbacks, batch_size=batch_size + ) + return model + + def _model_predict( + self, strategy, - steps_per_execution, - run_eagerly, - with_normalization_layer, - ) - callbacks += default_callbacks - - def dataset_fn(input_context): - del input_context - x = tf.random.uniform((10, 10)) - y = tf.random.uniform((10, 1)) - return tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(10).repeat().batch(8) - - if x is None: - if use_dataset_creator: - x = dataset_creator.DatasetCreator(dataset_fn) - else: - x = dataset_fn(None) - - model.evaluate( - x=x, y=y, steps=steps, callbacks=callbacks, batch_size=batch_size) - return model - - def _model_predict( - self, - strategy, - model=None, - steps_per_execution=1, - test_data=None, - steps=10, - with_normalization_layer=False, - ): - callbacks = [] - - if model is None: - model, default_callbacks = self._model_compile( - strategy, - steps_per_execution, - with_normalization_layer=with_normalization_layer, - ) - callbacks += default_callbacks - - def create_test_data(): - x = tf.constant([[1.], [2.], [3.], [1.], [5.], [1.]]) - return tf.data.Dataset.from_tensor_slices(x).repeat().batch(2) - - if test_data is None: - test_data = create_test_data() - - predictions = model.predict(x=test_data, steps=steps, callbacks=callbacks) - predictions = np.around(predictions, 4) - return model, predictions + model=None, + steps_per_execution=1, + test_data=None, + steps=10, + with_normalization_layer=False, + ): + callbacks = [] + + if model is None: + model, default_callbacks = self._model_compile( + strategy, + steps_per_execution, + with_normalization_layer=with_normalization_layer, + ) + callbacks += default_callbacks + + def create_test_data(): + x = tf.constant([[1.0], [2.0], [3.0], [1.0], [5.0], [1.0]]) + return tf.data.Dataset.from_tensor_slices(x).repeat().batch(2) + + if test_data is None: + test_data = create_test_data() + + predictions = model.predict( + x=test_data, steps=steps, callbacks=callbacks + ) + predictions = np.around(predictions, 4) + return model, predictions diff --git a/keras/distribute/distribute_coordinator_utils.py b/keras/distribute/distribute_coordinator_utils.py index fe3f625d36c8..9aa95008b3f5 100644 --- a/keras/distribute/distribute_coordinator_utils.py +++ b/keras/distribute/distribute_coordinator_utils.py @@ -25,13 +25,15 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import copy import json import os import threading import time + +import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.core.protobuf import cluster_pb2 from tensorflow.python.platform import tf_logging as logging @@ -40,638 +42,742 
@@ def get_current_worker_context(): - """Returns the current task context.""" - try: - return _worker_context.current - except AttributeError: - return None + """Returns the current task context.""" + try: + return _worker_context.current + except AttributeError: + return None class _TaskType: - PS = "ps" - WORKER = "worker" - CHIEF = "chief" - EVALUATOR = "evaluator" - CLIENT = "client" + PS = "ps" + WORKER = "worker" + CHIEF = "chief" + EVALUATOR = "evaluator" + CLIENT = "client" def _get_num_workers(cluster_spec): - """Gets number of workers including chief.""" - if not cluster_spec: - return 0 - return len(cluster_spec.as_dict().get(_TaskType.WORKER, [])) + len( - cluster_spec.as_dict().get(_TaskType.CHIEF, [])) + """Gets number of workers including chief.""" + if not cluster_spec: + return 0 + return len(cluster_spec.as_dict().get(_TaskType.WORKER, [])) + len( + cluster_spec.as_dict().get(_TaskType.CHIEF, []) + ) class _WorkerContext: - """The worker context class. - - This context object provides configuration information for each task. One - context manager with a worker context object will be created per - invocation to the `worker_fn` where `get_current_worker_context` can be called - to access the worker context object. - """ - - def __init__(self, - strategy, - cluster_spec, - task_type, - task_id, - session_config=None, - rpc_layer="grpc", - worker_barrier=None): - """Initialize the worker context object. + """The worker context class. - Args: - strategy: a `DistributionStrategy` object. - cluster_spec: a ClusterSpec object. It can be empty or None in the local - training case. - task_type: a string indicating the role of the corresponding task, such as - "worker" or "ps". It can be None if it is local training or in-graph - replicated training. - task_id: an integer indicating id of the corresponding task. It can be - None if it is local training or in-graph replicated training. - session_config: an optional `tf.compat.v1.ConfigProto` object. - rpc_layer: optional string specifying the RPC protocol for communication - with worker masters. If None or empty, hosts in the `cluster_spec` will - be used directly. - worker_barrier: optional, the barrier object for worker synchronization. + This context object provides configuration information for each task. One + context manager with a worker context object will be created per invocation + to the `worker_fn` where `get_current_worker_context` can be called to + access the worker context object. 
""" - self._strategy = strategy - self._cluster_spec = cluster_spec - self._task_type = task_type - self._task_id = task_id - self._session_config = session_config - self._worker_barrier = worker_barrier - self._rpc_layer = rpc_layer - self._master_target = self._get_master_target() - self._num_workers = _get_num_workers(cluster_spec) - self._is_chief_node = self._is_chief() - - def _debug_message(self): - if self._cluster_spec: - return "[cluster_spec: %r, task_type: %r, task_id: %r]" % ( - self._cluster_spec, self.task_type, self.task_id) - else: - return "[local]" - - def __enter__(self): - old_context = get_current_worker_context() - if old_context: - raise ValueError( - "You cannot run distribute coordinator in a `worker_fn`.\t" + - self._debug_message()) - # pylint: disable=protected-access - _worker_context.current = self - - def __exit__(self, unused_exception_type, unused_exception_value, - unused_traceback): - # pylint: disable=protected-access - _worker_context.current = None - - def _get_master_target(self): - """Return the master target for a task.""" - # If cluster_spec is None or empty, we use local master. - if not self._cluster_spec or self._task_type == _TaskType.EVALUATOR: - return "" - - # If task_type is None, then it is in-graph replicated training. In this - # case we use the chief or first worker's master target. - if not self._task_type: - if _TaskType.CHIEF in self._cluster_spec.jobs: - task_type = _TaskType.CHIEF - task_id = 0 - else: - assert _TaskType.WORKER in self._cluster_spec.jobs - task_type = _TaskType.WORKER - task_id = 0 + + def __init__( + self, + strategy, + cluster_spec, + task_type, + task_id, + session_config=None, + rpc_layer="grpc", + worker_barrier=None, + ): + """Initialize the worker context object. + + Args: + strategy: a `DistributionStrategy` object. + cluster_spec: a ClusterSpec object. It can be empty or None in the + local training case. + task_type: a string indicating the role of the corresponding task, + such as "worker" or "ps". It can be None if it is local training or + in-graph replicated training. + task_id: an integer indicating id of the corresponding task. It can be + None if it is local training or in-graph replicated training. + session_config: an optional `tf.compat.v1.ConfigProto` object. + rpc_layer: optional string specifying the RPC protocol for + communication with worker masters. If None or empty, hosts in the + `cluster_spec` will be used directly. + worker_barrier: optional, the barrier object for worker + synchronization. 
+ """ + self._strategy = strategy + self._cluster_spec = cluster_spec + self._task_type = task_type + self._task_id = task_id + self._session_config = session_config + self._worker_barrier = worker_barrier + self._rpc_layer = rpc_layer + self._master_target = self._get_master_target() + self._num_workers = _get_num_workers(cluster_spec) + self._is_chief_node = self._is_chief() + + def _debug_message(self): + if self._cluster_spec: + return "[cluster_spec: %r, task_type: %r, task_id: %r]" % ( + self._cluster_spec, + self.task_type, + self.task_id, + ) + else: + return "[local]" + + def __enter__(self): + old_context = get_current_worker_context() + if old_context: + raise ValueError( + "You cannot run distribute coordinator in a `worker_fn`.\t" + + self._debug_message() + ) + + _worker_context.current = self + + def __exit__( + self, unused_exception_type, unused_exception_value, unused_traceback + ): + + _worker_context.current = None + + def _get_master_target(self): + """Return the master target for a task.""" + # If cluster_spec is None or empty, we use local master. + if not self._cluster_spec or self._task_type == _TaskType.EVALUATOR: + return "" + + # If task_type is None, then it is in-graph replicated training. In this + # case we use the chief or first worker's master target. + if not self._task_type: + if _TaskType.CHIEF in self._cluster_spec.jobs: + task_type = _TaskType.CHIEF + task_id = 0 + else: + assert _TaskType.WORKER in self._cluster_spec.jobs + task_type = _TaskType.WORKER + task_id = 0 + else: + task_type = self._task_type + task_id = self._task_id + + prefix = "" + if self._rpc_layer: + prefix = self._rpc_layer + "://" + return prefix + self._cluster_spec.job_tasks(task_type)[task_id or 0] + + def _is_chief(self): + """Return whether the task is the chief worker.""" + if not self._cluster_spec or self._task_type in [ + _TaskType.CHIEF, + _TaskType.EVALUATOR, + None, + ]: + return True + + # If not local and chief not in the cluster_spec, use the first worker + # as chief. + if ( + _TaskType.CHIEF not in self._cluster_spec.jobs + and self._task_type == _TaskType.WORKER + and self._task_id == 0 + ): + return True + return False + + def wait_for_other_workers(self): + """Waits for other workers to reach the same call to this method. + + Raises: + ValueError: if `worker_barrier` is not passed to the __init__ method. + """ + if not self._worker_barrier: + # TODO(yuefengz): we should throw an error in independent worker + # mode. + return + self._worker_barrier.wait() + + def session_creator( + self, + scaffold=None, + config=None, + checkpoint_dir=None, + checkpoint_filename_with_path=None, + max_wait_secs=7200, + ): + """Returns a session creator. + + The returned session creator will be configured with the correct master + target and session configs. It will also run either init ops or ready + ops by querying the `strategy` object when `create_session` is called on + it. + + Args: + scaffold: A `Scaffold` used for gathering or building supportive ops. + If not specified a default one is created. It's used to finalize the + graph. + config: `ConfigProto` proto used to configure the session. + checkpoint_dir: A string. Optional path to a directory where to + restore variables. + checkpoint_filename_with_path: Full file name path to the checkpoint + file. Only one of `checkpoint_dir` or + `checkpoint_filename_with_path` can be specified. + max_wait_secs: Maximum time to wait for the session to become + available. + + Returns: + a descendant of SessionCreator. 
+ """ + if config: + session_config = copy.deepcopy(config) + session_config.MergeFrom(self._session_config) + else: + session_config = self._session_config + + if ( + not self._strategy + or self._strategy.extended.experimental_should_init + ): + logging.info( + "Creating chief session creator with config: %r", config + ) + return tf.compat.v1.train.ChiefSessionCreator( + scaffold, + master=self.master_target, + config=session_config, + checkpoint_dir=checkpoint_dir, + checkpoint_filename_with_path=checkpoint_filename_with_path, + ) + else: + logging.info( + "Creating worker session creator with config: %r", config + ) + return tf.compat.v1.train.WorkerSessionCreator( + scaffold, + master=self.master_target, + config=session_config, + max_wait_secs=max_wait_secs, + ) + + @property + def session_config(self): + return copy.deepcopy(self._session_config) + + @property + def has_barrier(self): + """Whether the barrier is set or not.""" + return self._worker_barrier is not None + + @property + def distributed_mode(self): + """Whether it is distributed training or not.""" + return ( + bool(self._cluster_spec) and self._task_type != _TaskType.EVALUATOR + ) + + @property + def cluster_spec(self): + """Returns a copy of the cluster_spec object.""" + return copy.deepcopy(self._cluster_spec) + + @property + def task_type(self): + """Returns the role of the corresponding task.""" + return self._task_type + + @property + def task_id(self): + """Returns the id or index of the corresponding task.""" + return self._task_id + + @property + def master_target(self): + """Returns the session master for the corresponding task to connect + to.""" + return self._master_target + + @property + def is_chief(self): + """Returns whether the task is a chief node.""" + return self._is_chief_node + + @property + def num_workers(self): + """Returns number of workers in the cluster, including chief.""" + return self._num_workers + + @property + def experimental_should_init(self): + """Whether to run init ops.""" + return self._strategy.extended.experimental_should_init + + @property + def should_checkpoint(self): + """Whether to save checkpoint.""" + return self._strategy.extended.should_checkpoint + + @property + def should_save_summary(self): + """Whether to save summaries.""" + return self._strategy.extended.should_save_summary + + +def _run_single_worker( + worker_fn, + strategy, + cluster_spec, + task_type, + task_id, + session_config, + rpc_layer="", + worker_barrier=None, + coord=None, +): + """Runs a single worker by calling `worker_fn` under context.""" + session_config = copy.deepcopy(session_config) + strategy = copy.deepcopy(strategy) + # If there is an EVALUATOR task, we run single-machine eval on that task. + if task_type == _TaskType.EVALUATOR: + # It is possible to not have a strategy object for EVALUATOR task. + if strategy: + strategy.configure(session_config) else: - task_type = self._task_type - task_id = self._task_id - - prefix = "" - if self._rpc_layer: - prefix = self._rpc_layer + "://" - return prefix + self._cluster_spec.job_tasks(task_type)[task_id or 0] - - def _is_chief(self): - """Return whether the task is the chief worker.""" - if (not self._cluster_spec or - self._task_type in [_TaskType.CHIEF, _TaskType.EVALUATOR, None]): - return True - - # If not local and chief not in the cluster_spec, use the first worker as - # chief. 
- if (_TaskType.CHIEF not in self._cluster_spec.jobs and
- self._task_type == _TaskType.WORKER and self._task_id == 0):
- return True
- return False
-
- def wait_for_other_workers(self):
- """Waits for other workers to reach the same call to this method.
+ assert strategy
+ strategy.configure(session_config, cluster_spec, task_type, task_id)
- Raises:
- ValueError: if `worker_barrier` is not passed to the __init__ method.
- """
- if not self._worker_barrier:
- # TODO(yuefengz): we should throw an error in independent worker mode.
- return
- self._worker_barrier.wait()
-
- def session_creator(self,
- scaffold=None,
- config=None,
- checkpoint_dir=None,
- checkpoint_filename_with_path=None,
- max_wait_secs=7200):
- """Returns a session creator.
-
- The returned session creator will be configured with the correct master
- target and session configs. It will also run either init ops or ready ops
- by querying the `strategy` object when `create_session` is called on it.
+ context = _WorkerContext(
+ strategy,
+ cluster_spec,
+ task_type,
+ task_id,
+ session_config=session_config,
+ rpc_layer=rpc_layer,
+ worker_barrier=worker_barrier,
+ )
+ with context:
+ if coord:
+ with coord.stop_on_exception():
+ return worker_fn(strategy)
+ else:
+ return worker_fn(strategy)
- Args:
- scaffold: A `Scaffold` used for gathering or building supportive ops. If
- not specified a default one is created. It's used to finalize the graph.
- config: `ConfigProto` proto used to configure the session.
- checkpoint_dir: A string. Optional path to a directory where to restore
- variables.
- checkpoint_filename_with_path: Full file name path to the checkpoint file.
- Only one of `checkpoint_dir` or `checkpoint_filename_with_path` can be
- specified.
- max_wait_secs: Maximum time to wait for the session to become available.
- Returns:
- a descendant of SessionCreator.
- """
- if config:
- session_config = copy.deepcopy(config)
- session_config.MergeFrom(self._session_config)
+def _split_cluster_for_evaluator(cluster_spec, task_type):
+ """Split the cluster for evaluator since it needn't talk to other tasks."""
+ # Splitting the cluster is important to prevent the evaluator from talking
+ # to other tasks in the cluster. Because we allow the evaluator not to use a
+ # distribution strategy, ops in the evaluator task may have unspecified
+ # devices, and those ops may end up on other tasks if we don't split the
+ # cluster.
+ # Note: if you bypass distribute coordinator and bring the cluster yourself,
+ # you can equivalently set device filters to split clusters. This is already
+ # done by distribution strategy's `update_config_proto` method.
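The split described in the comment above can be pictured with plain dicts standing in for a real `ClusterSpec` (an illustrative sketch only, with hypothetical host names; the actual implementation below round-trips through `normalize_cluster_spec`):

```
# Hypothetical three-task cluster, with plain dicts in place of ClusterSpec.
cluster = {
    "chief": ["host0:2222"],
    "worker": ["host1:2222"],
    "evaluator": ["host2:2222"],
}

# The evaluator keeps only itself; every other task drops the evaluator.
evaluator_view = {"evaluator": cluster["evaluator"]}
trainer_view = {k: v for k, v in cluster.items() if k != "evaluator"}

assert "worker" not in evaluator_view
assert "evaluator" not in trainer_view
```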
+ new_cluster_spec = normalize_cluster_spec(cluster_spec).as_dict() + if task_type == _TaskType.EVALUATOR: + assert _TaskType.EVALUATOR in new_cluster_spec + new_cluster_spec = { + _TaskType.EVALUATOR: new_cluster_spec[_TaskType.EVALUATOR] + } else: - session_config = self._session_config - - if not self._strategy or self._strategy.extended.experimental_should_init: - logging.info("Creating chief session creator with config: %r", config) - return tf.compat.v1.train.ChiefSessionCreator( - scaffold, - master=self.master_target, - config=session_config, - checkpoint_dir=checkpoint_dir, - checkpoint_filename_with_path=checkpoint_filename_with_path) + new_cluster_spec.pop(_TaskType.EVALUATOR, None) + return normalize_cluster_spec(new_cluster_spec) + + +def _run_std_server( + cluster_spec=None, + task_type=None, + task_id=None, + session_config=None, + rpc_layer=None, + environment=None, +): + """Runs a standard server.""" + # Check if the Server is already running. If so, assert that no + # configuration options have changed, and return the existing Server. This + # allows us to call `run_distribute_coordinator` multiple times. + if getattr(_thread_local, "server", None) is not None: + assert _thread_local.cluster_spec == cluster_spec + assert _thread_local.task_type == task_type + assert _thread_local.task_id == task_id + assert _thread_local.session_config_str == repr(session_config) + assert _thread_local.rpc_layer == rpc_layer + assert _thread_local.environment == environment + return _thread_local.server else: - logging.info("Creating worker session creator with config: %r", config) - return tf.compat.v1.train.WorkerSessionCreator( - scaffold, - master=self.master_target, - config=session_config, - max_wait_secs=max_wait_secs) - - @property - def session_config(self): - return copy.deepcopy(self._session_config) - - @property - def has_barrier(self): - """Whether the barrier is set or not.""" - return self._worker_barrier is not None - - @property - def distributed_mode(self): - """Whether it is distributed training or not.""" - return bool(self._cluster_spec) and self._task_type != _TaskType.EVALUATOR - - @property - def cluster_spec(self): - """Returns a copy of the cluster_spec object.""" - return copy.deepcopy(self._cluster_spec) - - @property - def task_type(self): - """Returns the role of the corresponding task.""" - return self._task_type - - @property - def task_id(self): - """Returns the id or index of the corresponding task.""" - return self._task_id - - @property - def master_target(self): - """Returns the session master for the corresponding task to connect to.""" - return self._master_target - - @property - def is_chief(self): - """Returns whether the task is a chief node.""" - return self._is_chief_node - - @property - def num_workers(self): - """Returns number of workers in the cluster, including chief.""" - return self._num_workers - - @property - def experimental_should_init(self): - """Whether to run init ops.""" - return self._strategy.extended.experimental_should_init - - @property - def should_checkpoint(self): - """Whether to save checkpoint.""" - return self._strategy.extended.should_checkpoint - - @property - def should_save_summary(self): - """Whether to save summaries.""" - return self._strategy.extended.should_save_summary - - -def _run_single_worker(worker_fn, - strategy, - cluster_spec, - task_type, - task_id, - session_config, - rpc_layer="", - worker_barrier=None, - coord=None): - """Runs a single worker by calling `worker_fn` under context.""" - 
session_config = copy.deepcopy(session_config) - strategy = copy.deepcopy(strategy) - # If there is an EVALUATOR task, we run single-machine eval on that task. - if task_type == _TaskType.EVALUATOR: - # It is possible to not have a strategy object for EVALUATOR task. - if strategy: - strategy.configure(session_config) - else: - assert strategy - strategy.configure(session_config, cluster_spec, task_type, task_id) - - context = _WorkerContext( - strategy, - cluster_spec, - task_type, - task_id, - session_config=session_config, - rpc_layer=rpc_layer, - worker_barrier=worker_barrier) - with context: - if coord: - with coord.stop_on_exception(): - return worker_fn(strategy) + # This method is not thread-safe. + _thread_local.server_started = True + _thread_local.cluster_spec = cluster_spec + _thread_local.task_type = task_type + _thread_local.task_id = task_id + _thread_local.session_config_str = repr(session_config) + _thread_local.rpc_layer = rpc_layer + _thread_local.environment = environment + + assert cluster_spec + target = cluster_spec.task_address(task_type, task_id) + if rpc_layer: + target = rpc_layer + "://" + target + + class _FakeServer: + """A fake server that runs a master session.""" + + def start(self): + # A tensorflow server starts when a remote session is created. + logging.info( + "Creating a remote session to start a TensorFlow server, " + "target = %r, session_config=%r", + target, + session_config, + ) + tf.compat.v1.Session(target=target, config=session_config) + + def join(self): + while True: + time.sleep(5) + + if environment == "google": + server = _FakeServer() else: - return worker_fn(strategy) - - -def _split_cluster_for_evaluator(cluster_spec, task_type): - """Split the cluster for evaluator since it needn't talk to other tasks.""" - # Splitting the cluster is important to prevent the evaluator from talking to - # other tasks in the cluster. Since we allow evaluator not to use - # distribution strategies and as a result ops in the evaluator task may have - # unspecified devices. Those ops may end up on other tasks if we don't split - # the cluster. - # Note: if you bypass distribute coordinator and bring the cluster yourself, - # you can equivalently set device filters to split clusters. This is already - # done by distribution strategy's `update_config_proto` method. - new_cluster_spec = normalize_cluster_spec(cluster_spec).as_dict() - if task_type == _TaskType.EVALUATOR: - assert _TaskType.EVALUATOR in new_cluster_spec - new_cluster_spec = { - _TaskType.EVALUATOR: new_cluster_spec[_TaskType.EVALUATOR] - } - else: - new_cluster_spec.pop(_TaskType.EVALUATOR, None) - return normalize_cluster_spec(new_cluster_spec) - - -def _run_std_server(cluster_spec=None, - task_type=None, - task_id=None, - session_config=None, - rpc_layer=None, - environment=None): - """Runs a standard server.""" - # Check if the Server is already running. If so, assert that no configuration - # options have changed, and return the existing Server. This allows us to - # call `run_distribute_coordinator` multiple times. - if getattr(_thread_local, "server", None) is not None: - assert _thread_local.cluster_spec == cluster_spec - assert _thread_local.task_type == task_type - assert _thread_local.task_id == task_id - assert _thread_local.session_config_str == repr(session_config) - assert _thread_local.rpc_layer == rpc_layer - assert _thread_local.environment == environment - return _thread_local.server - else: - # This method is not thread-safe. 
- _thread_local.server_started = True - _thread_local.cluster_spec = cluster_spec - _thread_local.task_type = task_type - _thread_local.task_id = task_id - _thread_local.session_config_str = repr(session_config) - _thread_local.rpc_layer = rpc_layer - _thread_local.environment = environment - - assert cluster_spec - target = cluster_spec.task_address(task_type, task_id) - if rpc_layer: - target = rpc_layer + "://" + target - - class _FakeServer: - """A fake server that runs a master session.""" - - def start(self): - # A tensorflow server starts when a remote session is created. - logging.info( - "Creating a remote session to start a TensorFlow server, " - "target = %r, session_config=%r", target, session_config) - tf.compat.v1.Session(target=target, config=session_config) - - def join(self): - while True: - time.sleep(5) - - if environment == "google": - server = _FakeServer() - else: - if session_config: - logging.info( - "Starting standard TensorFlow server, target = %r, session_config= " - "%r", target, session_config) + if session_config: + logging.info( + "Starting standard TensorFlow server, target = %r, " + "session_config = %r", + target, + session_config, + ) + else: + logging.info( + "Starting standard TensorFlow server, target = %r", target + ) + cluster_spec = _split_cluster_for_evaluator(cluster_spec, task_type) + server = tf.distribute.Server( + cluster_spec, + job_name=task_type, + task_index=task_id, + config=session_config, + protocol=rpc_layer, + ) + + server.start() + _thread_local.server = server + return server + + +def _configure_session_config_for_std_servers( + strategy, eval_strategy, session_config, cluster_spec, task_type, task_id +): + + """Call strategy's `configure` to mutate the session_config. + + The session_config is currently needed as default config for a TensorFlow + server. In the future, we should be able to remove this method and only pass + the session config to a client session. + """ + if task_type == _TaskType.EVALUATOR: + if eval_strategy: + eval_strategy.configure(session_config=session_config) else: - logging.info("Starting standard TensorFlow server, target = %r", target) - cluster_spec = _split_cluster_for_evaluator(cluster_spec, task_type) - server = tf.distribute.Server( - cluster_spec, - job_name=task_type, - task_index=task_id, - config=session_config, - protocol=rpc_layer) - - server.start() - _thread_local.server = server - return server - - -def _configure_session_config_for_std_servers(strategy, eval_strategy, - session_config, cluster_spec, - task_type, task_id): - # pylint: disable=g-doc-args - """Call strategy's `configure` to mutate the session_config. - - The session_config is currently needed as default config for a TensorFlow - server. In the future, we should be able to remove this method and only pass - the session config to a client session. - """ - if task_type == _TaskType.EVALUATOR: - if eval_strategy: - eval_strategy.configure(session_config=session_config) - else: - # The strategy may be shared in standalone client mode. - strategy = copy.deepcopy(strategy) - strategy.configure( - session_config=session_config, - cluster_spec=cluster_spec, - task_type=task_type, - task_id=task_id) - # Remove the device filters specific to the strategy, so that the - # TensorFlow server brought up with one strategy can be used by other - # strategies. The device filters can be set in the client side as well. - del session_config.device_filters[:] + # The strategy may be shared in standalone client mode. 
+ strategy = copy.deepcopy(strategy) + strategy.configure( + session_config=session_config, + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id, + ) + # Remove the device filters specific to the strategy, so that the + # TensorFlow server brought up with one strategy can be used by other + # strategies. The device filters can be set in the client side as well. + del session_config.device_filters[:] # TODO(yuefengz): propagate cluster_spec in the STANDALONE_CLIENT mode. # TODO(yuefengz): we may need a smart way to figure out whether the current task # is the special task when we support cluster_spec propagation. -def run_distribute_coordinator(worker_fn, - strategy, - eval_fn=None, - eval_strategy=None, - cluster_spec=None, - task_type=None, - task_id=None, - session_config=None, - rpc_layer="grpc"): - """Runs the coordinator for distributed TensorFlow. - - This function runs a split coordinator for distributed TensorFlow in its - default mode, i.e the STANDALONE_CLIENT mode. Given a `cluster_spec` - specifying server addresses and their roles in a cluster, this coordinator - will figure out how to set them up, give the underlying function the right - targets for master sessions via a scope object and coordinate their training. - The cluster consisting of standard servers needs to be brought up either with - the standard server binary or with a binary running distribute coordinator - with `task_type` set to non-client type which will then turn into standard - servers. - - In addition to be the distribute coordinator, this is also the source of - configurations for each job in the distributed training. As there are multiple - ways to configure a distributed TensorFlow cluster, its context object - provides these configurations so that users or higher-level APIs don't have to - figure out the configuration for each job by themselves. - - In the between-graph replicated training, this coordinator will create - multiple threads and each calls the `worker_fn` which is supposed to create - its own graph and connect to one worker master given by its context object. In - the in-graph replicated training, it has only one thread calling this - `worker_fn`. - - Another mode is the INDEPENDENT_WORKER mode where each server runs a - distribute coordinator which will start a standard server and optionally runs - `worker_fn` depending whether it is between-graph training or in-graph - replicated training. - - The `strategy` object is expected to be a DistributionStrategy object which - has implemented methods needed by distributed coordinator such as - `configure(session_config, cluster_spec, task_type, task_id)` which configures - the strategy object for a specific task and `experimental_should_init` - property which instructs the distribute coordinator whether to run init ops - for a task. The distribute coordinator will make a copy of the `strategy` - object, call its `configure` method and pass it to `worker_fn` as an argument. - - The `worker_fn` defines the training logic and is called under its own - worker context which can be accessed to via `get_current_worker_context`. A - worker context provides access to configurations for each task, e.g. the - task_type, task_id, master target and so on. Since `worker_fn` will be called - in a thread and possibly multiple times, caller should be careful when it - accesses global data. For example, it is unsafe to define flags in a - `worker_fn` or to define different environment variables for different - `worker_fn`s. 
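As a rough sketch of the shape such a `worker_fn` takes (the body is hypothetical; only the `strategy` argument and `get_current_worker_context` come from this module):

```
from keras.distribute import distribute_coordinator_utils as dc


def worker_fn(strategy):
    # Called per task under the coordinator's worker context; the context
    # object exposes this task's configuration.
    ctx = dc.get_current_worker_context()
    if ctx.is_chief:
        # A real worker_fn might checkpoint or write summaries only here.
        print("chief, master target:", ctx.master_target)
    return ctx.task_type
```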
- - The `worker_fn` for the between-graph replication is defined as if there is - only one worker corresponding to the `worker_fn` and possibly ps jobs. For - example, when training with parameter servers, it assigns variables to - parameter servers and all other operations to that worker. In the in-graph - replication case, the `worker_fn` has to define operations for all worker - jobs. Using a distribution strategy can simplify the `worker_fn` by not having - to worry about the replication and device assignment of variables and - operations. - - This method is intended to be invoked by high-level APIs so that users don't - have to explicitly call it to run this coordinator. For those who don't use - high-level APIs, to change a program to use this coordinator, wrap everything - in a the program after global data definitions such as commandline flag - definition into the `worker_fn` and get task-specific configurations from - the worker context. - - The `cluster_spec` can be either passed by the argument or parsed from the - "TF_CONFIG" environment variable. Example of a TF_CONFIG: - ``` - cluster = {'chief': ['host0:2222'], - 'ps': ['host1:2222', 'host2:2222'], - 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} - os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster}) - ``` - - If `cluster_spec` is not given in any format, it becomes local training and - this coordinator will connect to a local session. - - For evaluation, if "evaluator" exists in the cluster_spec, a separate thread - will be created to call `eval_fn` with its `task_type` set to "evaluator". If - `eval_fn` is not defined, fall back to `worker_fn`. This implies that - evaluation will be done on a single machine if there is an "evaluator" task. - If "evaluator" doesn't exist in the cluster_spec, it entirely depends on the - `worker_fn` for how to do evaluation. - - Args: - worker_fn: the function to be called. The function should accept a - `strategy` object and will be given access to a context object via a - context manager scope. - strategy: a DistributionStrategy object specifying whether it should run - between-graph replicated training or not, whether to run init ops, etc. - This object will also be configured given `session_config`, - `cluster_spec`, `task_type` and `task_id`. - eval_fn: optional function for "evaluator" task. If `eval_fn` is not passed - in but a "evaluator" task is found in the `cluster_spec`, the `worker_fn` - will be used for this task. - eval_strategy: optional DistributionStrategy object for "evaluator" task. - cluster_spec: a dict, ClusterDef or ClusterSpec specifying servers and roles - in a cluster. If not set or empty, fall back to local training. - task_type: the current task type, optional if this is a client. - task_id: the current task id, optional if this is a client. - session_config: an optional `tf.compat.v1.ConfigProto` object which will be - passed to `strategy`'s `configure` method and used to create a session. - rpc_layer: optional string, the protocol for RPC, e.g. "grpc". - - Raises: - ValueError: if `cluster_spec` is supplied but not a dict or a ClusterDef or - a ClusterSpec. - - Returns: - In the client job, return the value returned by `worker_fn` if - it is in-graph replication or INDEPENDENT_WORKER mode; return None - otherwise. 
- """ - tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) - rpc_layer = tf_config.get("rpc_layer", rpc_layer) - environment = tf_config.get("environment", None) - - if not cluster_spec: - cluster_spec = tf_config.get("cluster", {}) - task_env = tf_config.get("task", {}) - if task_env: - task_type = task_env.get("type", task_type) - task_id = int(task_env.get("index", task_id)) - - if cluster_spec: - # TODO(yuefengz): validate cluster_spec. - cluster_spec = normalize_cluster_spec(cluster_spec) - elif hasattr(strategy.extended, "_cluster_resolver"): - cluster_resolver = strategy.extended._cluster_resolver # pylint: disable=protected-access - task_type = cluster_resolver.task_type - task_id = cluster_resolver.task_id - rpc_layer = cluster_resolver.rpc_layer or rpc_layer - environment = cluster_resolver.environment - cluster_spec = cluster_resolver.cluster_spec() - - # Setting the session config is necessary for some strategies such as - # CollectiveAllReduceStrategy. - session_config = session_config or tf.compat.v1.ConfigProto( - allow_soft_placement=True) - - if cluster_spec: - logging.info( - "Running Distribute Coordinator with cluster_spec = %r, " - "task_type = %r, task_id = %r, environment = %r, rpc_layer = %r", - cluster_spec.as_dict(), task_type, task_id, environment, rpc_layer) - - if not cluster_spec: - # `mode` is ignored in the local case. - logging.info("Running local Distribute Coordinator.") - _run_single_worker(worker_fn, strategy, None, None, None, session_config, - rpc_layer) - if eval_fn: - _run_single_worker(eval_fn, eval_strategy, None, None, None, - session_config, rpc_layer) - else: - logging.warning("Skipped evaluation since `eval_fn` is not passed in.") - else: - if not eval_fn: - logging.warning("`eval_fn` is not passed in. The `worker_fn` will be " - "used if an \"evaluator\" task exists in the cluster.") - eval_fn = eval_fn or worker_fn - if not eval_strategy: - logging.warning("`eval_strategy` is not passed in. No distribution " - "strategy will be used for evaluation.") - - # Every one starts a standard server, get session config from `configure` - # method. - _configure_session_config_for_std_servers(strategy, eval_strategy, - session_config, cluster_spec, - task_type, task_id) - - if (task_type != _TaskType.EVALUATOR and - not getattr(strategy.extended, "_std_server_started", False)): - # Right now, with eager mode, context is configured with a std server at - # the very beginning while with graph mode the std server is started when - # distribute coordinator is called. We should consolidate these two paths. - server = _run_std_server( - cluster_spec=cluster_spec, - task_type=task_type, - task_id=task_id, - session_config=session_config, - rpc_layer=rpc_layer, - environment=environment) - if task_type in [_TaskType.CHIEF, _TaskType.WORKER]: - if strategy.extended.experimental_between_graph: - # All jobs run `worker_fn` if between-graph. - return _run_single_worker(worker_fn, strategy, cluster_spec, task_type, - task_id, session_config, rpc_layer) - else: - # Only one node runs `worker_fn` if in-graph. - context = _WorkerContext(strategy, cluster_spec, task_type, task_id) - if context.is_chief: - return _run_single_worker(worker_fn, strategy, cluster_spec, None, - None, session_config, rpc_layer) +def run_distribute_coordinator( + worker_fn, + strategy, + eval_fn=None, + eval_strategy=None, + cluster_spec=None, + task_type=None, + task_id=None, + session_config=None, + rpc_layer="grpc", +): + """Runs the coordinator for distributed TensorFlow. 
+
+ This function runs a split coordinator for distributed TensorFlow in its
+ default mode, i.e. the STANDALONE_CLIENT mode. Given a `cluster_spec`
+ specifying server addresses and their roles in a cluster, this coordinator
+ will figure out how to set them up, give the underlying function the right
+ targets for master sessions via a scope object and coordinate their
+ training. The cluster consisting of standard servers needs to be brought up
+ either with the standard server binary or with a binary running distribute
+ coordinator with `task_type` set to a non-client type, which will then turn
+ into standard servers.
+
+ In addition to being the distribute coordinator, this is also the source of
+ configurations for each job in the distributed training. As there are
+ multiple ways to configure a distributed TensorFlow cluster, its context
+ object provides these configurations so that users or higher-level APIs
+ don't have to figure out the configuration for each job by themselves.
+
+ In between-graph replicated training, this coordinator will create
+ multiple threads, each of which calls the `worker_fn`, which is supposed to
+ create its own graph and connect to one worker master given by its context
+ object. In in-graph replicated training, it has only one thread calling
+ this `worker_fn`.
+
+ Another mode is the INDEPENDENT_WORKER mode, where each server runs a
+ distribute coordinator which will start a standard server and optionally
+ runs `worker_fn` depending on whether it is between-graph training or
+ in-graph replicated training.
+
+ The `strategy` object is expected to be a DistributionStrategy object which
+ has implemented methods needed by the distribute coordinator, such as
+ `configure(session_config, cluster_spec, task_type, task_id)`, which
+ configures the strategy object for a specific task, and the
+ `experimental_should_init` property, which instructs the distribute
+ coordinator whether to run init ops for a task. The distribute coordinator
+ will make a copy of the `strategy` object, call its `configure` method and
+ pass it to `worker_fn` as an argument.
+
+ The `worker_fn` defines the training logic and is called under its own
+ worker context, which can be accessed via `get_current_worker_context`. A
+ worker context provides access to configurations for each task, e.g. the
+ task_type, task_id, master target and so on. Since `worker_fn` will be
+ called in a thread and possibly multiple times, the caller should be
+ careful when it accesses global data. For example, it is unsafe to define
+ flags in a `worker_fn` or to define different environment variables for
+ different `worker_fn`s.
+
+ The `worker_fn` for between-graph replication is defined as if there is
+ only one worker corresponding to the `worker_fn` and possibly ps jobs. For
+ example, when training with parameter servers, it assigns variables to
+ parameter servers and all other operations to that worker. In the in-graph
+ replication case, the `worker_fn` has to define operations for all worker
+ jobs. Using a distribution strategy can simplify the `worker_fn` by not
+ having to worry about the replication and device assignment of variables
+ and operations.
+
+ This method is intended to be invoked by high-level APIs so that users
+ don't have to explicitly call it to run this coordinator. For those who
+ don't use high-level APIs, to change a program to use this coordinator,
+ wrap everything in the program after global data definitions, such as
+ command-line flag definitions, into the `worker_fn`, and get task-specific
+ configurations from the worker context.
+
+ The `cluster_spec` can either be passed as an argument or parsed from the
+ "TF_CONFIG" environment variable. Example of a TF_CONFIG:
+ ```
+ cluster = {'chief': ['host0:2222'],
+ 'ps': ['host1:2222', 'host2:2222'],
+ 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
+ os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster})
+ ```
+
+ If `cluster_spec` is not given in any format, it becomes local training and
+ this coordinator will connect to a local session.
+
+ For evaluation, if "evaluator" exists in the cluster_spec, a separate thread
+ will be created to call `eval_fn` with its `task_type` set to "evaluator".
+ If `eval_fn` is not defined, fall back to `worker_fn`. This implies that
+ evaluation will be done on a single machine if there is an "evaluator" task.
+ If "evaluator" doesn't exist in the cluster_spec, it entirely depends on the
+ `worker_fn` for how to do evaluation.
+
+ Args:
+ worker_fn: the function to be called. The function should accept a
+ `strategy` object and will be given access to a context object via a
+ context manager scope.
+ strategy: a DistributionStrategy object specifying whether it should run
+ between-graph replicated training or not, whether to run init ops, etc.
+ This object will also be configured given `session_config`,
+ `cluster_spec`, `task_type` and `task_id`.
+ eval_fn: optional function for the "evaluator" task. If `eval_fn` is not
+ passed in but an "evaluator" task is found in the `cluster_spec`, the
+ `worker_fn` will be used for this task.
+ eval_strategy: optional DistributionStrategy object for the "evaluator"
+ task.
+ cluster_spec: a dict, ClusterDef or ClusterSpec specifying servers and
+ roles in a cluster. If not set or empty, fall back to local training.
+ task_type: the current task type, optional if this is a client.
+ task_id: the current task id, optional if this is a client.
+ session_config: an optional `tf.compat.v1.ConfigProto` object which will
+ be passed to `strategy`'s `configure` method and used to create a
+ session.
+ rpc_layer: optional string, the protocol for RPC, e.g. "grpc".
+
+ Raises:
+ ValueError: if `cluster_spec` is supplied but not a dict or a ClusterDef
+ or a ClusterSpec.
+
+ Returns:
+ In the client job, return the value returned by `worker_fn` if
+ it is in-graph replication or INDEPENDENT_WORKER mode; return None
+ otherwise.
+ """
+ tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
+ rpc_layer = tf_config.get("rpc_layer", rpc_layer)
+ environment = tf_config.get("environment", None)
+
+ if not cluster_spec:
+ cluster_spec = tf_config.get("cluster", {})
+ task_env = tf_config.get("task", {})
+ if task_env:
+ task_type = task_env.get("type", task_type)
+ task_id = int(task_env.get("index", task_id))
+
+ if cluster_spec:
+ # TODO(yuefengz): validate cluster_spec.
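To make the TF_CONFIG path concrete, here is a minimal, self-contained sketch of the environment a single worker task would see; the host names and task index are hypothetical, mirroring the docstring's own example with a `task` entry added:

```
import json
import os

# Hypothetical cluster with one chief and two workers.
cluster = {
    "chief": ["host0:2222"],
    "worker": ["host1:2222", "host2:2222"],
}
os.environ["TF_CONFIG"] = json.dumps(
    {"cluster": cluster, "task": {"type": "worker", "index": 1}}
)

# The same parse the coordinator performs when cluster_spec is not passed in.
tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
assert tf_config.get("cluster", {}) == cluster
assert tf_config["task"]["type"] == "worker"
assert int(tf_config["task"]["index"]) == 1
```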
+ cluster_spec = normalize_cluster_spec(cluster_spec) + elif hasattr(strategy.extended, "_cluster_resolver"): + cluster_resolver = strategy.extended._cluster_resolver + task_type = cluster_resolver.task_type + task_id = cluster_resolver.task_id + rpc_layer = cluster_resolver.rpc_layer or rpc_layer + environment = cluster_resolver.environment + cluster_spec = cluster_resolver.cluster_spec() + + # Setting the session config is necessary for some strategies such as + # CollectiveAllReduceStrategy. + session_config = session_config or tf.compat.v1.ConfigProto( + allow_soft_placement=True + ) + + if cluster_spec: + logging.info( + "Running Distribute Coordinator with cluster_spec = %r, " + "task_type = %r, task_id = %r, environment = %r, rpc_layer = %r", + cluster_spec.as_dict(), + task_type, + task_id, + environment, + rpc_layer, + ) + + if not cluster_spec: + # `mode` is ignored in the local case. + logging.info("Running local Distribute Coordinator.") + _run_single_worker( + worker_fn, strategy, None, None, None, session_config, rpc_layer + ) + if eval_fn: + _run_single_worker( + eval_fn, + eval_strategy, + None, + None, + None, + session_config, + rpc_layer, + ) else: - server.join() - elif task_type == _TaskType.EVALUATOR: - return _run_single_worker(eval_fn, eval_strategy, cluster_spec, task_type, - task_id, session_config, rpc_layer) + logging.warning( + "Skipped evaluation since `eval_fn` is not passed in." + ) else: - if task_type != _TaskType.PS: - raise ValueError("Unexpected task_type: %r" % task_type) - server.join() + if not eval_fn: + logging.warning( + "`eval_fn` is not passed in. The `worker_fn` will be " + 'used if an "evaluator" task exists in the cluster.' + ) + eval_fn = eval_fn or worker_fn + if not eval_strategy: + logging.warning( + "`eval_strategy` is not passed in. No distribution " + "strategy will be used for evaluation." + ) + + # Every one starts a standard server, get session config from + # `configure` method. + _configure_session_config_for_std_servers( + strategy, + eval_strategy, + session_config, + cluster_spec, + task_type, + task_id, + ) + + if task_type != _TaskType.EVALUATOR and not getattr( + strategy.extended, "_std_server_started", False + ): + # Right now, with eager mode, context is configured with a std + # server at the very beginning while with graph mode the std server + # is started when distribute coordinator is called. We should + # consolidate these two paths. + server = _run_std_server( + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id, + session_config=session_config, + rpc_layer=rpc_layer, + environment=environment, + ) + if task_type in [_TaskType.CHIEF, _TaskType.WORKER]: + if strategy.extended.experimental_between_graph: + # All jobs run `worker_fn` if between-graph. + return _run_single_worker( + worker_fn, + strategy, + cluster_spec, + task_type, + task_id, + session_config, + rpc_layer, + ) + else: + # Only one node runs `worker_fn` if in-graph. 
+ context = _WorkerContext(
+ strategy, cluster_spec, task_type, task_id
+ )
+ if context.is_chief:
+ return _run_single_worker(
+ worker_fn,
+ strategy,
+ cluster_spec,
+ None,
+ None,
+ session_config,
+ rpc_layer,
+ )
+ else:
+ server.join()
+ elif task_type == _TaskType.EVALUATOR:
+ return _run_single_worker(
+ eval_fn,
+ eval_strategy,
+ cluster_spec,
+ task_type,
+ task_id,
+ session_config,
+ rpc_layer,
+ )
+ else:
+ if task_type != _TaskType.PS:
+ raise ValueError(f"Unexpected task_type: {task_type!r}")
+ server.join()


def normalize_cluster_spec(cluster_spec):
- """Makes `cluster_spec` into a `ClusterSpec` object.
-
- Args:
- cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
- cluster configurations.
-
- Returns:
- a `ClusterSpec` object.
-
- Raises:
- ValueError: if `cluster_spec` is not a dict or a `ClusterSpec` or a
- `ClusterDef`.
- """
- if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
- return tf.train.ClusterSpec(cluster_spec)
- elif not isinstance(cluster_spec, tf.train.ClusterSpec):
- raise ValueError(
- "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
- "`tf.train.ClusterDef` object")
- return cluster_spec
+ """Makes `cluster_spec` into a `ClusterSpec` object.
+
+ Args:
+ cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+ cluster configurations.
+
+ Returns:
+ a `ClusterSpec` object.
+
+ Raises:
+ ValueError: if `cluster_spec` is not a dict or a `ClusterSpec` or a
+ `ClusterDef`.
+ """
+ if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+ return tf.train.ClusterSpec(cluster_spec)
+ elif not isinstance(cluster_spec, tf.train.ClusterSpec):
+ raise ValueError(
+ "`cluster_spec` should be a dict or a `tf.train.ClusterSpec` or a "
+ "`tf.train.ClusterDef` object"
+ )
+ return cluster_spec
diff --git a/keras/distribute/distribute_strategy_test.py b/keras/distribute/distribute_strategy_test.py
index fba7cfbbd12e..5931f4cc7636 100644
--- a/keras/distribute/distribute_strategy_test.py
+++ b/keras/distribute/distribute_strategy_test.py
@@ -14,32 +14,40 @@
# ==============================================================================
"""Tests for tf.keras models using tf.distribute.Strategy."""
-import tensorflow.compat.v2 as tf
-
import os
-from absl.testing import parameterized
import numpy as np
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
import keras
-from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
from keras import backend
-from keras.testing_infra import test_utils
from keras.distribute import distributed_training_utils
from keras.distribute import distributed_training_utils_v1
from keras.distribute import multi_worker_testing_utils
from keras.distribute import optimizer_combinations
from keras.distribute.strategy_combinations import all_strategies
-from keras.distribute.strategy_combinations import multi_worker_mirrored_strategies
-from keras.distribute.strategy_combinations import strategies_minus_default_minus_tpu
+from keras.distribute.strategy_combinations import (
+ multi_worker_mirrored_strategies,
+)
+from keras.distribute.strategy_combinations import (
+ strategies_minus_default_minus_tpu,
+)
from keras.distribute.strategy_combinations import strategies_minus_tpu
from keras.distribute.strategy_combinations import tpu_strategies
from keras.engine import base_layer_utils
from keras.mixed_precision import policy
-from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_keras
+from keras.optimizers import
optimizer as optimizer_base +from keras.optimizers.legacy import gradient_descent as gradient_descent_keras +from keras.testing_infra import test_utils from keras.utils import losses_utils from keras.utils import np_utils +# isort: off +from tensorflow.python.distribute.cluster_resolver import ( + SimpleClusterResolver, +) + _RANDOM_SEED = 1337 _TRAIN_SIZE = 200 _INPUT_SIZE = (10,) @@ -53,2631 +61,3008 @@ def simple_sequential_model(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(16, activation='relu', input_shape=_INPUT_SIZE)) - model.add(keras.layers.Dropout(0.1)) - model.add(keras.layers.Dense(_NUM_CLASS, activation='softmax')) - return model + model = keras.models.Sequential() + model.add( + keras.layers.Dense(16, activation="relu", input_shape=_INPUT_SIZE) + ) + model.add(keras.layers.Dropout(0.1)) + model.add(keras.layers.Dense(_NUM_CLASS, activation="softmax")) + return model def simple_subclassed_model(num_labels=_NUM_CLASS): + class _SimpleMLP(keras.Model): + def __init__(self, num_labels): + super().__init__() + self.dense = keras.layers.Dense(num_labels) - class _SimpleMLP(keras.Model): - - def __init__(self, num_labels): - super().__init__() - self.dense = keras.layers.Dense(num_labels) + def call(self, inputs): + return self.dense(inputs) - def call(self, inputs): - return self.dense(inputs) - - return _SimpleMLP(num_labels) + return _SimpleMLP(num_labels) def simple_multi_inputs_multi_outputs_model(): - input_a = keras.layers.Input(shape=(16,), name='input_a') - input_b = keras.layers.Input(shape=(16,), name='input_b') - - merged = keras.layers.concatenate([input_a, input_b], name='merge') - output_c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) - output_d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged) - model = keras.models.Model( - inputs=[input_a, input_b], outputs=[output_c, output_d]) - return model + input_a = keras.layers.Input(shape=(16,), name="input_a") + input_b = keras.layers.Input(shape=(16,), name="input_b") + + merged = keras.layers.concatenate([input_a, input_b], name="merge") + output_c = keras.layers.Dense(3, activation="softmax", name="dense_2")( + merged + ) + output_d = keras.layers.Dense(2, activation="softmax", name="dense_3")( + merged + ) + model = keras.models.Model( + inputs=[input_a, input_b], outputs=[output_c, output_d] + ) + return model def get_multi_inputs_multi_outputs_data(): - (a_train, c_train), (a_test, c_test) = test_utils.get_test_data( - train_samples=_TRAIN_SIZE, - test_samples=50, - input_shape=(16,), - num_classes=3, - random_seed=_RANDOM_SEED) - (b_train, d_train), (b_test, d_test) = test_utils.get_test_data( - train_samples=_TRAIN_SIZE, - test_samples=50, - input_shape=(16,), - num_classes=2, - random_seed=_RANDOM_SEED) - (m_train, _), (m_test, _) = test_utils.get_test_data( - train_samples=_TRAIN_SIZE, - test_samples=50, - input_shape=(8,), - num_classes=2, - random_seed=_RANDOM_SEED) - - c_train = np_utils.to_categorical(c_train) - c_test = np_utils.to_categorical(c_test) - d_train = np_utils.to_categorical(d_train) - d_test = np_utils.to_categorical(d_test) - - train_data = { - 'input_a': a_train, - 'input_b': b_train, - 'input_m': m_train, - 'output_c': c_train, - 'output_d': d_train - } - test_data = { - 'input_a': a_test, - 'input_b': b_test, - 'input_m': m_test, - 'output_c': c_test, - 'output_d': d_test - } - - return (train_data, test_data) + (a_train, c_train), (a_test, c_test) = test_utils.get_test_data( + train_samples=_TRAIN_SIZE, + test_samples=50, 
+ input_shape=(16,), + num_classes=3, + random_seed=_RANDOM_SEED, + ) + (b_train, d_train), (b_test, d_test) = test_utils.get_test_data( + train_samples=_TRAIN_SIZE, + test_samples=50, + input_shape=(16,), + num_classes=2, + random_seed=_RANDOM_SEED, + ) + (m_train, _), (m_test, _) = test_utils.get_test_data( + train_samples=_TRAIN_SIZE, + test_samples=50, + input_shape=(8,), + num_classes=2, + random_seed=_RANDOM_SEED, + ) + + c_train = np_utils.to_categorical(c_train) + c_test = np_utils.to_categorical(c_test) + d_train = np_utils.to_categorical(d_train) + d_test = np_utils.to_categorical(d_test) + + train_data = { + "input_a": a_train, + "input_b": b_train, + "input_m": m_train, + "output_c": c_train, + "output_d": d_train, + } + test_data = { + "input_a": a_test, + "input_b": b_test, + "input_m": m_test, + "output_c": c_test, + "output_d": d_test, + } + + return (train_data, test_data) def batch_wrapper(dataset, batch_size, distribution, repeat=None): - if repeat: - dataset = dataset.repeat(repeat) - # TPUs currently require fully defined input shapes, drop_remainder ensures - # the input will have fully defined shapes. - if backend.is_tpu_strategy(distribution): - return dataset.batch(batch_size, drop_remainder=True) - else: - return dataset.batch(batch_size) + if repeat: + dataset = dataset.repeat(repeat) + # TPUs currently require fully defined input shapes, drop_remainder ensures + # the input will have fully defined shapes. + if backend.is_tpu_strategy(distribution): + return dataset.batch(batch_size, drop_remainder=True) + else: + return dataset.batch(batch_size) def get_model(): - x = keras.layers.Input(shape=(3,), name='input') - y = keras.layers.Dense(4, name='dense')(x) - model = keras.Model(x, y) - return model + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + return model def get_sample_weights_model(): - x = keras.layers.Input(shape=(1,), name='input') - y = keras.layers.Dense( - 1, kernel_initializer='ones', bias_initializer='zeros', name='dense')( - x) - model = keras.Model(x, y) - return model + x = keras.layers.Input(shape=(1,), name="input") + y = keras.layers.Dense( + 1, kernel_initializer="ones", bias_initializer="zeros", name="dense" + )(x) + model = keras.Model(x, y) + return model def get_dataset(distribution): - inputs = np.zeros((10, 3), dtype=np.float32) - targets = np.zeros((10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = batch_wrapper(dataset, 10, distribution) - return dataset + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = batch_wrapper(dataset, 10, distribution) + return dataset def get_predict_dataset(distribution): - inputs = np.zeros((10, 3), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices(inputs) - dataset = dataset.repeat(100) - dataset = batch_wrapper(dataset, 10, distribution) - return dataset + inputs = np.zeros((10, 3), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices(inputs) + dataset = dataset.repeat(100) + dataset = batch_wrapper(dataset, 10, distribution) + return dataset def convert_numpy_to_dataset_with_unknown_cardinality(inputs, targets=None): - if targets is not None: - input_slices = (inputs, targets) - dummy_op = (lambda inp, target: True) - else: - input_slices = inputs - dummy_op = 
(lambda inp: True) + if targets is not None: + input_slices = (inputs, targets) + dummy_op = lambda inp, target: True + else: + input_slices = inputs + dummy_op = lambda inp: True - original_dataset = (tf.data.Dataset.from_tensor_slices(input_slices)) - ds_with_unknown_cardinality = ( - original_dataset.filter(dummy_op).batch(10, drop_remainder=True)) - return ds_with_unknown_cardinality + original_dataset = tf.data.Dataset.from_tensor_slices(input_slices) + ds_with_unknown_cardinality = original_dataset.filter(dummy_op).batch( + 10, drop_remainder=True + ) + return ds_with_unknown_cardinality def multi_input_output_model(): - a = keras.layers.Input(shape=(3,), name='input_a') - b = keras.layers.Input(shape=(5,), name='input_b') - # TODO(anjalisridhar): Change the output dimension of the second Dense layer - # once the iterator output validation issue has been fixed. - dense_1 = keras.layers.Dense(7, name='dense_1') - dense_2 = keras.layers.Dense(7, name='dense_2') - c = dense_1(a) - d = dense_2(b) - e = keras.layers.Dropout(0.5, name='dropout')(c) - model = keras.models.Model([a, b], [d, e]) - return model + a = keras.layers.Input(shape=(3,), name="input_a") + b = keras.layers.Input(shape=(5,), name="input_b") + # TODO(anjalisridhar): Change the output dimension of the second Dense layer + # once the iterator output validation issue has been fixed. + dense_1 = keras.layers.Dense(7, name="dense_1") + dense_2 = keras.layers.Dense(7, name="dense_2") + c = dense_1(a) + d = dense_2(b) + e = keras.layers.Dropout(0.5, name="dropout")(c) + model = keras.models.Model([a, b], [d, e]) + return model def strategy_minus_tpu_combinations(): - return tf.__internal__.test.combinations.combine( - distribution=strategies_minus_tpu, mode=['graph', 'eager']) + return tf.__internal__.test.combinations.combine( + distribution=strategies_minus_tpu, mode=["graph", "eager"] + ) def tpu_strategy_combinations(): - return tf.__internal__.test.combinations.combine( - distribution=tpu_strategies, mode=['graph', 'eager']) + return tf.__internal__.test.combinations.combine( + distribution=tpu_strategies, mode=["graph", "eager"] + ) def tpu_strategy_combinations_graph_only(): - return tf.__internal__.test.combinations.combine(distribution=tpu_strategies, mode=['graph']) + return tf.__internal__.test.combinations.combine( + distribution=tpu_strategies, mode=["graph"] + ) def multi_worker_strategy_combinations_eager_only(): - return tf.__internal__.test.combinations.combine( - distribution=multi_worker_mirrored_strategies, mode=['eager']) + return tf.__internal__.test.combinations.combine( + distribution=multi_worker_mirrored_strategies, mode=["eager"] + ) def all_strategy_combinations(): - return strategy_minus_tpu_combinations() + tpu_strategy_combinations( - ) + multi_worker_strategy_combinations_eager_only() + return ( + strategy_minus_tpu_combinations() + + tpu_strategy_combinations() + + multi_worker_strategy_combinations_eager_only() + ) def all_strategy_minus_default_and_tpu_combinations(): - return tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.one_device_strategy, - tf.__internal__.distribute.combinations.one_device_strategy_gpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, - ], - mode=['graph', 'eager']) + return tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.one_device_strategy, + 
tf.__internal__.distribute.combinations.one_device_strategy_gpu, + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + ], + mode=["graph", "eager"], + ) def all_strategy_combinations_minus_default(): - return (all_strategy_minus_default_and_tpu_combinations() + - tpu_strategy_combinations() + - multi_worker_strategy_combinations_eager_only()) + return ( + all_strategy_minus_default_and_tpu_combinations() + + tpu_strategy_combinations() + + multi_worker_strategy_combinations_eager_only() + ) def strategy_and_optimizer_combinations(): - non_tpu_strategies = tf.__internal__.test.combinations.times( - strategy_minus_tpu_combinations(), - tf.__internal__.test.combinations.combine( - optimizer=[ - optimizer_combinations.adagrad_optimizer_v1_fn, - optimizer_combinations.adam_optimizer_v1_fn, - optimizer_combinations.gradient_descent_optimizer_v1_fn, - optimizer_combinations.rmsprop_optimizer_v1_fn, - optimizer_combinations.adadelta_optimizer_keras_v2_fn, - optimizer_combinations.adagrad_optimizer_keras_v2_fn, - optimizer_combinations.adam_optimizer_keras_v2_fn, - optimizer_combinations.adamax_optimizer_keras_v2_fn, - optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, - optimizer_combinations.nadam_optimizer_keras_v2_fn, - optimizer_combinations.rmsprop_optimizer_keras_v2_fn, - optimizer_combinations.ftrl_optimizer_keras_v2_fn - ])) - tpu_strategies_graph = tf.__internal__.test.combinations.combine( - distribution=tpu_strategies, - mode=['graph'], - optimizer=[ - optimizer_combinations.adagrad_optimizer_v1_fn, - optimizer_combinations.adam_optimizer_v1_fn, - optimizer_combinations.gradient_descent_optimizer_v1_fn, - optimizer_combinations.rmsprop_optimizer_v1_fn, - optimizer_combinations.adagrad_optimizer_keras_v2_fn, - optimizer_combinations.adam_optimizer_keras_v2_fn, - optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, - optimizer_combinations.rmsprop_optimizer_keras_v2_fn - ]) - tpu_strategies_eager = tf.__internal__.test.combinations.combine( - distribution=tpu_strategies, - mode=['eager'], - optimizer=[ - optimizer_combinations.adagrad_optimizer_keras_v2_fn, - optimizer_combinations.adam_optimizer_keras_v2_fn, - optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, - optimizer_combinations.rmsprop_optimizer_keras_v2_fn - ]) - multi_worker_eager = tf.__internal__.test.combinations.combine( - distribution=multi_worker_mirrored_strategies, - mode=['eager'], - optimizer=[ - optimizer_combinations.adadelta_optimizer_keras_v2_fn, - optimizer_combinations.adagrad_optimizer_keras_v2_fn, - optimizer_combinations.adam_optimizer_keras_v2_fn, - optimizer_combinations.adamax_optimizer_keras_v2_fn, - optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, - optimizer_combinations.nadam_optimizer_keras_v2_fn, - optimizer_combinations.rmsprop_optimizer_keras_v2_fn, - optimizer_combinations.ftrl_optimizer_keras_v2_fn - ]) - return (non_tpu_strategies + tpu_strategies_eager + tpu_strategies_graph + - multi_worker_eager) + non_tpu_strategies = tf.__internal__.test.combinations.times( + strategy_minus_tpu_combinations(), + tf.__internal__.test.combinations.combine( + optimizer=[ + optimizer_combinations.adagrad_optimizer_v1_fn, + optimizer_combinations.adam_optimizer_v1_fn, + optimizer_combinations.gradient_descent_optimizer_v1_fn, + optimizer_combinations.rmsprop_optimizer_v1_fn, + optimizer_combinations.adadelta_optimizer_keras_v2_fn, + 
optimizer_combinations.adagrad_optimizer_keras_v2_fn, + optimizer_combinations.adam_optimizer_keras_v2_fn, + optimizer_combinations.adamax_optimizer_keras_v2_fn, + optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, + optimizer_combinations.nadam_optimizer_keras_v2_fn, + optimizer_combinations.rmsprop_optimizer_keras_v2_fn, + optimizer_combinations.ftrl_optimizer_keras_v2_fn, + ] + ), + ) + tpu_strategies_graph = tf.__internal__.test.combinations.combine( + distribution=tpu_strategies, + mode=["graph"], + optimizer=[ + optimizer_combinations.adagrad_optimizer_v1_fn, + optimizer_combinations.adam_optimizer_v1_fn, + optimizer_combinations.gradient_descent_optimizer_v1_fn, + optimizer_combinations.rmsprop_optimizer_v1_fn, + optimizer_combinations.adagrad_optimizer_keras_v2_fn, + optimizer_combinations.adam_optimizer_keras_v2_fn, + optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, + optimizer_combinations.rmsprop_optimizer_keras_v2_fn, + ], + ) + tpu_strategies_eager = tf.__internal__.test.combinations.combine( + distribution=tpu_strategies, + mode=["eager"], + optimizer=[ + optimizer_combinations.adagrad_optimizer_keras_v2_fn, + optimizer_combinations.adam_optimizer_keras_v2_fn, + optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, + optimizer_combinations.rmsprop_optimizer_keras_v2_fn, + ], + ) + multi_worker_eager = tf.__internal__.test.combinations.combine( + distribution=multi_worker_mirrored_strategies, + mode=["eager"], + optimizer=[ + optimizer_combinations.adadelta_optimizer_keras_v2_fn, + optimizer_combinations.adagrad_optimizer_keras_v2_fn, + optimizer_combinations.adam_optimizer_keras_v2_fn, + optimizer_combinations.adamax_optimizer_keras_v2_fn, + optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, + optimizer_combinations.nadam_optimizer_keras_v2_fn, + optimizer_combinations.rmsprop_optimizer_keras_v2_fn, + optimizer_combinations.ftrl_optimizer_keras_v2_fn, + ], + ) + return ( + non_tpu_strategies + + tpu_strategies_eager + + tpu_strategies_graph + + multi_worker_eager + ) class BatchCountingCB(keras.callbacks.Callback): + def __init__(self): + super().__init__() + self.train_begin_batches = [] + self.train_end_batches = [] + self.test_begin_batches = [] + self.test_end_batches = [] + self.predict_begin_batches = [] + self.predict_end_batches = [] + + def on_train_batch_begin(self, batch, logs=None): + self.train_begin_batches.append(batch) + + def on_train_batch_end(self, batch, logs=None): + self.train_end_batches.append(batch) + + def on_test_batch_begin(self, batch, logs=None): + self.test_begin_batches.append(batch) + + def on_test_batch_end(self, batch, logs=None): + self.test_end_batches.append(batch) + + def on_predict_batch_begin(self, batch, logs=None): + self.predict_begin_batches.append(batch) + + def on_predict_batch_end(self, batch, logs=None): + self.predict_end_batches.append(batch) + + +class TestDistributionStrategyWithNumpyArrays( + tf.test.TestCase, parameterized.TestCase +): + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calculating_input_params_no_steps_no_batch_size( + self, distribution + ): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported( + distribution + ): + replica_scale_factor = distribution.num_replicas_in_sync + + with self.cached_session(): + # Default global batch size 32 for input with 64 samples run in 2 
+ # steps + steps, batch_size = distributed_training_utils_v1.get_input_params( + distribution, 64, steps=None, batch_size=None + ) + self.assertEqual(batch_size, 32 // replica_scale_factor) + self.assertEqual(steps, 2) + + # Computed global batch size 20 is lower than 32 if we pass fewer + # samples. + steps, batch_size = distributed_training_utils_v1.get_input_params( + distribution, 20, steps=None, batch_size=None + ) + self.assertEqual(batch_size, 20 // replica_scale_factor) + self.assertEqual(steps, 1) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calculating_input_params_with_steps_no_batch_size( + self, distribution + ): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported( + distribution + ): + replica_scale_factor = distribution.num_replicas_in_sync + + with self.cached_session(): + # Computed global batch size is correct when 1 step is + # specified + steps, batch_size = distributed_training_utils_v1.get_input_params( + distribution, 64, steps=1, batch_size=None + ) + self.assertEqual(batch_size, 64 // replica_scale_factor) + self.assertEqual(steps, 1) + + # Computed global batch size is correct when 2 steps are + # specified + steps, batch_size = distributed_training_utils_v1.get_input_params( + distribution, 64, steps=2, batch_size=None + ) + self.assertEqual(batch_size, 32 // replica_scale_factor) + self.assertEqual(steps, 2) + + # All samples cannot be consumed in the specified number of steps + with self.assertRaisesRegex(ValueError, "not divisible by steps"): + distributed_training_utils_v1.get_input_params( + distribution, 63, steps=2, batch_size=None + ) + + # This case differs between strategies due to the + # difference in supported batch size being global or per-replica.
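The inference contract these assertions pin down is compact enough to sketch. The helper below is a simplified illustration of the behavior the tests expect, not the actual implementation in distributed_training_utils_v1; the function name and signature are hypothetical. (The strategy-dependent branch that the comment above announces follows next in the test.)

    # Hypothetical sketch of the steps/batch-size inference exercised above.
    def infer_input_params(num_samples, steps, batch_size, num_replicas,
                           global_batch_supported):
        # Strategies that accept a global batch size report it unscaled;
        # per-replica strategies divide the global batch by replica count.
        scale = 1 if global_batch_supported else num_replicas
        if steps is None and batch_size is None:
            global_batch = min(num_samples, 32)  # default global batch size
            return num_samples // global_batch, global_batch // scale
        if batch_size is None:
            if num_samples % steps:
                raise ValueError("not divisible by steps")
            global_batch = num_samples // steps
            if global_batch % scale:
                raise ValueError(
                    "could not be sharded evenly across the sync replicas")
            return steps, global_batch // scale
        if steps is None:
            return num_samples // (batch_size * scale), batch_size
        return steps, batch_size

    assert infer_input_params(64, None, None, 1, True) == (2, 32)
    assert infer_input_params(63, 3, None, 1, True) == (3, 21)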
+ if replica_scale_factor == 1: + # Computed global batch size is correct even if not shardable + ( + steps, + batch_size, + ) = distributed_training_utils_v1.get_input_params( + distribution, 63, steps=3, batch_size=None + ) + self.assertEqual(batch_size, 21) + self.assertEqual(steps, 3) + else: + # Computed global batch size cannot be sharded across replicas + with self.assertRaisesRegex( + ValueError, + "could not be sharded evenly across the sync replicas", + ): + distributed_training_utils_v1.get_input_params( + distribution, 63, steps=1, batch_size=None + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calculating_input_params_no_steps_with_batch_size( + self, distribution + ): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported( + distribution + ): + replica_scale_factor = distribution.num_replicas_in_sync + + with self.cached_session(): + # Computed steps is correct for specified batch size + steps, batch_size = distributed_training_utils_v1.get_input_params( + distribution, 64, steps=None, batch_size=16 + ) + self.assertEqual(batch_size, 16) + self.assertEqual(steps, 4 // replica_scale_factor) + + # Computed steps is correct for specified batch size + steps, batch_size = distributed_training_utils_v1.get_input_params( + distribution, 64, steps=None, batch_size=32 + ) + self.assertEqual(batch_size, 32) + self.assertEqual(steps, 2 // replica_scale_factor) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calculating_input_params_with_steps_with_batch_size( + self, distribution + ): + with self.cached_session(): + # No change to steps and batch size if both specified and feasible + steps, batch_size = distributed_training_utils_v1.get_input_params( + distribution, 64, steps=5, batch_size=3 + ) + self.assertEqual(batch_size, 3) + self.assertEqual(steps, 5) + + # Number of samples is less than global batch size * steps + with self.assertRaisesRegex( + ValueError, "less than samples required" + ): + distributed_training_utils_v1.get_input_params( + distribution, 64, steps=10, batch_size=13 + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calling_model_with_numpy_arrays(self, distribution): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = get_model() + loss = "mse" + metrics = ["mae"] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((64, 3), dtype=np.float32) + targets = np.zeros((64, 4), dtype=np.float32) + + # Call fit with validation data + model.fit( + inputs, + targets, + epochs=1, + batch_size=2, + verbose=0, + validation_data=(inputs, targets), + ) + + # TODO(anjalisridhar): We need tests for when the batch size and + # steps are smaller and result in a 0 batch_size and steps + # value.
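Stripped of the combinations harness, the pattern test_calling_model_with_numpy_arrays exercises is simply: build and compile under the strategy scope, then hand fit/evaluate/predict plain numpy arrays. A minimal sketch of that pattern (the MirroredStrategy choice here is illustrative, not part of the test; the evaluate and predict calls the test issues next follow the same shape):

    import numpy as np
    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        model = tf.keras.Sequential(
            [tf.keras.layers.Dense(4, input_shape=(3,))])
        model.compile(tf.keras.optimizers.SGD(0.001), "mse", metrics=["mae"])

    inputs = np.zeros((64, 3), np.float32)
    targets = np.zeros((64, 4), np.float32)
    # Keras splits the arrays into global batches and distributes each
    # batch across the strategy's replicas.
    model.fit(inputs, targets, epochs=1, batch_size=2,
              validation_data=(inputs, targets), verbose=0)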
+ model.evaluate(inputs, targets) + model.evaluate(inputs, targets, batch_size=8) + + model.predict(inputs) + model.predict(inputs, batch_size=8) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calling_model_with_mixed_precision(self, distribution): + if isinstance( + distribution, + ( + tf.compat.v1.distribute.experimental.ParameterServerStrategy, + tf.distribute.experimental.ParameterServerStrategy, + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + self.skipTest("b/152097775") + if backend.is_tpu_strategy(distribution): + policy_name = "mixed_bfloat16" + else: + policy_name = "mixed_float16" + with self.cached_session(), distribution.scope(), policy.policy_scope( + policy_name + ): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + y = keras.layers.Activation("softmax", dtype="float32")(y) + model = keras.Model(x, y) + loss = "mse" + metrics = ["mae"] + model.compile(optimizer, loss, metrics=metrics) + + # We need to pass float32 since TPUs do not support float64, even + # though these arrays will immediately be casted to bfloat16 on + # TPUs. We also cannot pass bfloat16, as Numpy does not support it. + inputs = np.zeros((64, 3), dtype="float32") + targets = np.zeros((64, 4), dtype="float32") + + model.fit( + inputs, + targets, + epochs=1, + batch_size=2, + verbose=0, + validation_data=(inputs, targets), + ) + + model.evaluate(inputs, targets) + model.evaluate(inputs, targets, batch_size=8) + + model.predict(inputs) + model.predict(inputs, batch_size=8) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_operator_overload_mixed_precision(self, distribution): + # Regression test that tests a fixed bug does not reoccur. Adding an + # AutoCastVariable to a tensor on a TPU, where the variable was the LHS + # of the '+' operator, used to cause the gradient w.r.t. the variable to + # be None. 
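The two mixed-precision tests above pick the policy per strategy ("mixed_bfloat16" on TPU, "mixed_float16" elsewhere) and pin the model's final activation to float32. Outside a policy_scope, the equivalent setup with the public API looks roughly like this (a sketch; the operator-overload regression test described in the comment above continues below):

    import tensorflow as tf
    from tensorflow.keras import mixed_precision

    mixed_precision.set_global_policy("mixed_float16")  # "mixed_bfloat16" on TPU

    inputs = tf.keras.Input(shape=(3,))
    x = tf.keras.layers.Dense(4)(inputs)  # computes in float16; variables stay float32
    # Keep the output activation in float32 so the softmax and loss are
    # numerically stable.
    outputs = tf.keras.layers.Activation("softmax", dtype="float32")(x)
    model = tf.keras.Model(inputs, outputs)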
+ if isinstance( + distribution, + ( + tf.compat.v1.distribute.experimental.ParameterServerStrategy, + tf.distribute.experimental.ParameterServerStrategy, + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + self.skipTest("b/152097775") + + if backend.is_tpu_strategy(distribution): + policy_name = "mixed_bfloat16" + else: + policy_name = "mixed_float16" + + class MyLayer(keras.layers.Layer): + def build(self, _): + self.v1 = self.add_weight("v", ()) + self.v2 = self.add_weight("v", ()) + + def call(self, inp): + inp += self.v1 + return self.v2 + inp + + with self.cached_session(), distribution.scope(): + layer = MyLayer(dtype=policy_name) + + def run_fn(): + x = np.array([1.0]) + with tf.GradientTape() as tape: + y = layer(x) + grad_v1, grad_v2 = tape.gradient(y, [layer.v1, layer.v2]) + return grad_v1, grad_v2 + + if tf.executing_eagerly(): + run_fn = tf.function(run_fn) + + grad_v1, grad_v2 = distribution.run(run_fn) + self.assertIsNotNone(grad_v1) + self.assertIsNotNone(grad_v2) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.one_device_strategy + ], + mode=["graph", "eager"], + ) + ) + def test_optimizer_in_cross_replica_context_raises_error( + self, distribution + ): + + with self.cached_session(), distribution.scope(): + model = keras.models.Sequential([keras.layers.Dense(1)]) + x = np.array([[1.0]]) + with tf.GradientTape() as tape: + y = model(x) + gradients = tape.gradient(y, model.trainable_variables) + optimizer = gradient_descent_keras.SGD() + + with self.assertRaisesRegex( + RuntimeError, "cannot be called in cross-replica context" + ): + optimizer.apply_gradients( + zip(gradients, model.trainable_variables) + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calling_model_with_nested_numpy_arrays(self, distribution): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) + model = multi_input_output_model() + loss = "mse" + model.compile(optimizer, loss) + + input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32) + input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32) + inputs = [input_a_np, input_b_np] + + output_d_np = np.asarray( + np.random.random((64, 7)), dtype=np.float32 + ) + output_e_np = np.asarray( + np.random.random((64, 7)), dtype=np.float32 + ) + targets = [output_d_np, output_e_np] + + # Call fit with validation data + model.fit(inputs, targets, epochs=1, batch_size=8, verbose=0) + + # TODO(anjalisridhar): We need tests for when the batch size and + # steps are smaller and results in a 0 batch_size and steps value. 
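test_optimizer_in_cross_replica_context_raises_error above shows the unsupported direction; the supported one is to apply gradients from inside strategy.run, where code executes in replica context. A minimal sketch of the correct form:

    import numpy as np
    import tensorflow as tf

    strategy = tf.distribute.OneDeviceStrategy("/cpu:0")
    with strategy.scope():
        model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
        optimizer = tf.keras.optimizers.SGD()

    def step_fn(x):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(model(x))
        grads = tape.gradient(loss, model.trainable_variables)
        # Inside strategy.run we are in replica context, so this is legal.
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    strategy.run(step_fn, args=(np.array([[1.0]], np.float32),))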
+ model.evaluate(inputs, targets) + model.evaluate(inputs, targets, batch_size=8) + + model.predict(inputs) + model.predict(inputs, batch_size=8) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategies_minus_tpu, mode=["graph", "eager"] + ) + + tf.__internal__.test.combinations.combine( + distribution=multi_worker_mirrored_strategies, mode=["eager"] + ) + ) + def test_numpy_with_sample_weights(self, distribution): + with self.cached_session(), distribution.scope(): + model = get_sample_weights_model() + optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.001) + loss = "mse" + model.compile(optimizer, loss) + + inputs = np.array([[0], [1], [2], [3]], np.float32) + targets = np.array([[2], [4], [6], [8]], np.float32) + sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) + + result = model.evaluate( + inputs, + targets, + batch_size=2, + sample_weight=sample_weights, + verbose=1, + ) + + # The per sample loss is multiplied by the corresponding sample + # weight. The average of these weighted losses is the return value + # of the `evaluate` call. For example, in the test above the average + # weighted loss is calculated in the following manner: + + # batch_1 = (((2-0)^2) * 0.25 + ((4-1)^2) * 0.5) / 2 = 5.5 / 2 = + # 2.75 + # batch_2 = (((6-2)^2 * 0.75) + ((8-3)^2 * 1)) / 2 = 37 / 2 = 18.5 + # final result = (batch_1 + batch_2) / 2 = 10.625. + # The first time we divide by number of input samples and the second + # time we divide by number of steps/batches that the loss is + # aggregated over. + self.assertAllClose(result, 10.625) + + # We now test without passing sample_weights: + # batch_1 = (((2-0)^2) + ((4-1)^2)) / 2 = 13 / 2 = 6.5 + # batch_2 = (((6-2)^2) + ((8-3)^2)) / 2 = 41 / 2 = 20.5 + # final result = (batch_1 + batch_2) / 2 = 27 / 2 = 13.5 + result = model.evaluate(inputs, targets, batch_size=2, verbose=1) + self.assertAllClose(result, 13.5) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_flatten_predict_outputs(self, distribution): + with self.cached_session(): + with distribution.scope(): + model = multi_input_output_model() + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) + loss = "mse" + model.compile(optimizer, loss) + + # We take 6 input samples with each input having a dimension of 3 or + # 5. + input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32) + input_b_np = np.asarray(np.random.random((6, 5)), dtype=np.float32) + inputs = [input_a_np, input_b_np] + + outs = model.predict(inputs) + # `predict` returns a list that is equal in length to the number of + # model outputs. In this test our model has two outputs and each + # element of `outs` corresponds to all the samples of one of the + # model outputs. + self.assertLen(outs, 2) + # Each of the output samples has a dimension of 7. We should + # process all the available input samples (6).
+ self.assertAllEqual([6, 7], outs[0].shape) + self.assertAllEqual([6, 7], outs[1].shape) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tpu_strategy_combinations_graph_only(), + tf.__internal__.test.combinations.combine(batch_size=[4, 6]), + ) + ) + def test_evaluate_with_partial_batch(self, distribution, batch_size): + with self.cached_session(): + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + + with distribution.scope(): + model_with_ds_strategy = get_model() + model_with_ds_strategy.compile(optimizer, loss, metrics=metrics) + + cpu_model = get_model() + cpu_model.compile(optimizer, loss, metrics=metrics) + + x = np.random.random((10, 3)).astype("float32") + y = np.random.random((10, 4)).astype("float32") + + # As sample size is 10, we batch by 4 so that the last batch is a + # partial batch. Also `evaluate()` using numpy array as inputs + # without distribution strategy uses entire sample as a single + # batch. As so, we remove parameters `batch_size` and `steps`. + cpu_model.set_weights(model_with_ds_strategy.get_weights()) + evaluate_ground_truth = cpu_model.evaluate(x, y) + + # We don't compare the loss as loss is currently not computed as + # metric in Keras, the loss value is inaccurate for last partial + # batch due to more weights for the last batch samples. + steps = np.ceil(10.0 / batch_size) + self.assertAllClose( + model_with_ds_strategy.evaluate( + x, y, batch_size=batch_size, steps=steps + )[1:], + evaluate_ground_truth[1:], + atol=1e-5, + rtol=1e-5, + ) + # Test that `steps` is inferred correctly when final partial batch + # exists. + self.assertAllClose( + model_with_ds_strategy.evaluate(x, y, batch_size=batch_size)[ + 1: + ], + evaluate_ground_truth[1:], + atol=1e-5, + rtol=1e-5, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tpu_strategy_combinations_graph_only() + ) + ) + def test_predict_with_partial_batch(self, distribution): + with self.cached_session(): + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + loss = "mse" + + with distribution.scope(): + model_with_ds_strategy = get_model() + model_with_ds_strategy.compile(optimizer, loss) + + cpu_model = get_model() + cpu_model.compile(optimizer, loss) + + inputs = np.random.random((10, 3)).astype(np.float32) + + # As sample size is 10, we batch by 4 so that the last batch is + # a partial batch. Also `predict()` using numpy array as inputs + # without distribution strategy uses entire sample as a single + # batch. As so, we remove parameters `batch_size` and `steps`. + cpu_model.set_weights(model_with_ds_strategy.get_weights()) + predict_ground_truth = cpu_model.predict(inputs) + self.assertAllClose( + model_with_ds_strategy.predict(inputs, batch_size=4, steps=3), + predict_ground_truth, + atol=1e-5, + rtol=1e-5, + ) + # Test that `steps` is inferred correctly when final partial batch + # exists. 
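The explicit `steps` values in these partial-batch tests are just the ceiling division that Keras performs itself when `steps` is omitted:

    import math

    num_samples, batch_size = 10, 4
    steps = math.ceil(num_samples / batch_size)  # 3: batches of 4, 4, and a final 2
    # Passing steps=3 explicitly and omitting it must therefore agree,
    # which is what the paired assertAllClose checks verify.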
+ self.assertAllClose( + model_with_ds_strategy.predict(inputs, batch_size=4), + predict_ground_truth, + atol=1e-5, + rtol=1e-5, + ) + + @tf.__internal__.distribute.combinations.generate( + tpu_strategy_combinations_graph_only() + ) + def test_no_target_model(self, distribution): + with self.cached_session(): + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + + class MyLayer(keras.layers.Layer): + def call(self, inputs, training=None): + self.add_loss(tf.reduce_sum(inputs), inputs=True) + return inputs + + with distribution.scope(): + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + 16, activation="relu", input_shape=_INPUT_SIZE + ) + ) + model.add(MyLayer()) + model.add(keras.layers.Dense(_NUM_CLASS, activation="softmax")) + + model.compile(optimizer) + inputs = np.zeros((20, 10), np.float32) + + model.fit(inputs, epochs=1, steps_per_epoch=2) + model.predict(inputs, steps=1) + model.evaluate(inputs, steps=1) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tpu_strategy_combinations_graph_only() + ) + ) + def test_predict_multi_output_model_with_partial_batch(self, distribution): + with self.cached_session(): + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + loss = "mse" + + with distribution.scope(): + model_with_ds_strategy = ( + simple_multi_inputs_multi_outputs_model() + ) + model_with_ds_strategy.compile(optimizer, loss) + + cpu_model = simple_multi_inputs_multi_outputs_model() + cpu_model.compile(optimizer, loss) + + input_data, _ = get_multi_inputs_multi_outputs_data() + input_dict = { + "input_a": input_data["input_a"], + "input_b": input_data["input_b"], + } + + # As sample size is 200, we batch by 18 so that the last batch is + # a partial batch. Also `fit()` using numpy array as inputs without + # distribution strategy uses entire sample as a single batch. As so, + # we remove parameters `batch_size` and `steps`. + cpu_model.set_weights(model_with_ds_strategy.get_weights()) + self.assertAllClose( + model_with_ds_strategy.predict( + input_dict, batch_size=18, steps=12 + ), + cpu_model.predict(input_dict), + atol=1e-4, + rtol=1e-4, + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_gradients_are_none(self, distribution): + + if not tf.executing_eagerly(): + self.skipTest("None gradients are not supported in graph mode") + + class DenseWithExtraWeight(keras.layers.Dense): + def build(self, input_shape): + # Gradients w.r.t. 
extra_weights are None + self.extra_weight_1 = self.add_weight( + "extra_weight_1", shape=(), initializer="ones" + ) + super().build(input_shape) + self.extra_weight_2 = self.add_weight( + "extra_weight_2", shape=(), initializer="ones" + ) + + with distribution.scope(): + model = keras.Sequential( + [DenseWithExtraWeight(4, input_shape=(4,))] + ) + model.compile("adam", "mse") + + inputs = np.random.normal(size=(64, 4)) + targets = np.random.normal(size=(64, 4)) + old_kernel = model.get_weights()[1] + model.fit(inputs, targets) + new_kernel = model.get_weights()[1] + self.assertNotAllEqual(old_kernel, new_kernel) + + +class TestDistributionStrategyWithDatasets( + tf.test.TestCase, parameterized.TestCase +): + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_calling_model_on_same_dataset(self, distribution): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = get_model() + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + model.compile(optimizer, loss, metrics=metrics) + + dataset = get_dataset(distribution) + + # Call fit with validation data + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=dataset, + validation_steps=2, + ) + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=dataset, + validation_steps=2, + ) + model.predict(get_predict_dataset(distribution), steps=2) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_model_interleaved_eval_same_as_direct_eval(self, distribution): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + user_controlled_model = get_model() + user_controlled_model.compile( + optimizer_fn(0.001), + loss="mse", + metrics=["mae", keras.metrics.CategoricalAccuracy()], + ) + + interleaved_model = get_model() + interleaved_model.set_weights( + user_controlled_model.get_weights() + ) + interleaved_model.compile( + optimizer_fn(0.001), + loss="mse", + metrics=["mae", keras.metrics.CategoricalAccuracy()], + ) + + dataset = get_dataset(distribution) + + # Call fit with validation interleaved + interleaved_output = interleaved_model.fit( + dataset, + epochs=2, + steps_per_epoch=2, + verbose=1, + validation_data=dataset, + validation_steps=2, + shuffle=False, + ) + + # Manually control the validation running after each epoch. 
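Before the manual-validation loop announced by the comment above, one aside on test_gradients_are_none further up: it relies on Keras optimizers skipping variables whose gradient is None (with a logged warning) instead of raising. A reduced sketch of just that behavior, using a hypothetical ExtraWeightDense layer:

    import numpy as np
    import tensorflow as tf

    class ExtraWeightDense(tf.keras.layers.Dense):
        def build(self, input_shape):
            super().build(input_shape)
            # Never used in call(), so its gradient is always None.
            self.extra = self.add_weight("extra", shape=(), initializer="ones")

    model = tf.keras.Sequential([ExtraWeightDense(4, input_shape=(4,))])
    model.compile("adam", "mse")
    # Training proceeds; kernel and bias update while `extra` is skipped.
    model.fit(np.zeros((8, 4)), np.zeros((8, 4)), verbose=0)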
+ user_controlled_output = [] + for _ in range(2): + user_controlled_model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=1, + shuffle=False, + ) + user_controlled_output.append( + user_controlled_model.evaluate(dataset, steps=2) + ) + + self.assertEqual( + interleaved_output.history["val_loss"], + [x[0] for x in user_controlled_output], + ) + val_mean_absolute_error = interleaved_output.history.get( + "val_mean_absolute_error" + ) + if not val_mean_absolute_error: + # The name of the metric changed in TF2.0 + val_mean_absolute_error = interleaved_output.history["val_mae"] + self.assertEqual( + val_mean_absolute_error, [x[1] for x in user_controlled_output] + ) + self.assertEqual( + interleaved_output.history["val_categorical_accuracy"], + [x[2] for x in user_controlled_output], + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) + model = multi_input_output_model() + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + model.compile(optimizer, loss, metrics=metrics) + + input_a_np = np.random.random((10, 3)).astype("float32") + input_b_np = np.random.random((10, 5)).astype("float32") + output_d_np = np.random.random((10, 7)).astype("float32") + output_e_np = np.random.random((10, 7)).astype("float32") + + # Test with tuples + dataset_tuple = tf.data.Dataset.from_tensor_slices( + ((input_a_np, input_b_np), (output_d_np, output_e_np)) + ) + dataset_tuple = dataset_tuple.repeat(100) + dataset_tuple = dataset_tuple.batch(10) + + model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1) + + # Test with dict + dataset_dict = tf.data.Dataset.from_tensor_slices( + ( + {"input_a": input_a_np, "input_b": input_b_np}, + (output_d_np, output_e_np), + ) + ) + dataset_dict = dataset_dict.repeat(100) + dataset_dict = dataset_dict.batch(10) + + model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_fit_with_dictionary_in_the_dataset_b135161171(self, distribution): + + if backend.is_tpu_strategy(distribution): + self.skipTest("b/142805125") + + def custom_loss(predict, label, weight): + bce = keras.losses.binary_crossentropy(label, predict) + return tf.reduce_mean(bce * weight) + + with self.cached_session(): + with distribution.scope(): + input_img = keras.layers.Input([64, 64, 3], name="img") + input_lbl = keras.layers.Input([64, 64, 1], name="lbl") + input_weight = keras.layers.Input([64, 64], name="weight") + predict = keras.layers.Conv2D(2, [1, 1], padding="same")( + input_img + ) + loss_lambda = keras.layers.Lambda( + lambda x: custom_loss(*x), name="my_loss" + ) + my_loss = loss_lambda([predict, input_lbl, input_weight]) + model = keras.models.Model( + inputs=[input_img, input_lbl, input_weight], + outputs=[predict, my_loss], + ) + model.add_loss(model.get_layer("my_loss").output) + model.compile(optimizer="adam") + + if tf.executing_eagerly(): + + def map_fn(img, lbl, weight): + inputs = {"img": img, "lbl": lbl, "weight": weight} + return (inputs,) + + else: + + def map_fn(img, lbl, weight): + inputs = {"img": img, "lbl": lbl, "weight": weight} + return inputs, {} + + fake_imgs = np.ones([50, 64, 64, 3], dtype=np.float32) + fake_lbls = np.ones([50, 64, 64, 1], dtype=np.float32) + fake_weights = 
np.ones([50, 64, 64], dtype=np.float32) + + data = ( + tf.data.Dataset.from_tensor_slices( + (fake_imgs, fake_lbls, fake_weights) + ) + .map(map_fn) + .batch(10) + ) + + model.fit(data) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_fit_eval_and_predict_methods_on_dataset_without_steps( + self, distribution + ): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = get_model() + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((1000, 3), dtype=np.float32) + targets = np.zeros((1000, 4), dtype=np.float32) + # steps/steps_per_epoch are calculated when using numpy arrays as + # input data. + fit_with_numpy = model.fit( + inputs, targets, epochs=1, batch_size=10 + ).history + eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) + predict_with_numpy = model.predict(inputs, batch_size=10) + + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.batch(10, drop_remainder=True) + fit_with_ds = model.fit(dataset, epochs=1).history + eval_with_ds = model.evaluate(dataset) + predict_dataset = tf.data.Dataset.from_tensor_slices(inputs) + predict_dataset = predict_dataset.batch(10, drop_remainder=True) + predict_with_ds = model.predict(predict_dataset) + self.assertAllClose( + fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4 + ) + self.assertAllClose( + eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4 + ) + self.assertAllClose( + predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_predict_on_dataset_with_unknown_cardinality_without_steps( + self, distribution, mode + ): + + if mode == "graph" and backend.is_tpu_strategy(distribution): + self.skipTest("partial batch not supported with TPU in graph mode.") + + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = get_model() + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((20, 3), dtype=np.float32) + # steps/steps_per_epoch are calculated when using numpy arrays as + # input data. + predict_with_numpy = model.predict(inputs, batch_size=10) + + predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( + inputs + ) + + self.assertEqual( + keras.backend.get_value( + tf.data.experimental.cardinality(predict_dataset) + ), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + predict_with_ds = model.predict(predict_dataset) + self.assertAllClose( + predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_on_dataset_with_unknown_cardinality_without_steps( + self, distribution, mode + ): + # TODO(b/155867206): Investigate why this test occasionally segfaults on + # TPU in eager mode. 
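test_fit_with_dictionary_in_the_dataset_b135161171 above leans on the add_loss pattern: the loss is computed inside the graph from the model's own tensors, so compile() takes only an optimizer and fit() needs no targets. Reduced to its essentials (a sketch, not the exact test model):

    import numpy as np
    import tensorflow as tf

    inputs = tf.keras.Input(shape=(4,), name="img")
    pred = tf.keras.layers.Dense(1)(inputs)
    model = tf.keras.Model(inputs, pred)
    # The loss is a tensor built from the outputs themselves.
    model.add_loss(tf.reduce_mean(tf.square(pred)))
    model.compile("adam")  # no loss argument
    model.fit(np.ones((10, 4), np.float32), verbose=0)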
+ if mode == "eager" and backend.is_tpu_strategy(distribution): + self.skipTest("caused segfault with TPU in eager mode.") + + if mode == "graph" and backend.is_tpu_strategy(distribution): + self.skipTest("partial batch not supported with TPU in graph mode.") + + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = get_model() + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((100, 3), dtype=np.float32) + targets = np.zeros((100, 4), dtype=np.float32) + # steps/steps_per_epoch are calculated when using numpy arrays as + # input data. + fit_with_numpy = model.fit( + inputs, targets, epochs=1, batch_size=10 + ).history + fit_with_numpy_multiple_epochs = model.fit( + inputs, targets, epochs=2, batch_size=10 + ).history + eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) + predict_with_numpy = model.predict(inputs, batch_size=10) + + dataset = convert_numpy_to_dataset_with_unknown_cardinality( + inputs, targets + ) + predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( + inputs + ) + + self.assertEqual( + keras.backend.get_value( + tf.data.experimental.cardinality(dataset) + ), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + self.assertEqual( + keras.backend.get_value( + tf.data.experimental.cardinality(predict_dataset) + ), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + eval_with_ds = model.evaluate(dataset) + predict_with_ds = model.predict(predict_dataset) + self.assertAllClose( + eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4 + ) + self.assertAllClose( + predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 + ) + + fit_with_ds = model.fit(dataset, epochs=1).history + fit_with_ds_multiple_epochs = model.fit(dataset, epochs=2).history + self.assertAllClose( + fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4 + ) + self.assertAllClose( + fit_with_numpy_multiple_epochs, + fit_with_ds_multiple_epochs, + atol=1e-4, + rtol=1e-4, + ) + + @tf.__internal__.distribute.combinations.generate( + tpu_strategy_combinations_graph_only() + ) + def test_on_dataset_with_unknown_cardinality(self, distribution): + with self.cached_session(): + with distribution.scope(): + model = get_model() + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + model.compile( + tf.compat.v1.train.GradientDescentOptimizer(0.001), + loss, + metrics=metrics, + ) + + inputs = np.zeros((1000, 3), dtype=np.float32) + targets = np.zeros((1000, 4), dtype=np.float32) + # steps/steps_per_epoch are calculated when using numpy arrays as + # input data. 
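convert_numpy_to_dataset_with_unknown_cardinality, defined near the top of the file, hides the dataset size behind a no-op filter, since tf.data cannot statically count filtered elements. The mechanics in isolation:

    import numpy as np
    import tensorflow as tf

    ds = tf.data.Dataset.from_tensor_slices(np.zeros((100, 3), np.float32))
    ds = ds.filter(lambda x: True).batch(10)  # filter defeats static counting
    assert (tf.data.experimental.cardinality(ds)
            == tf.data.experimental.UNKNOWN_CARDINALITY)
    # evaluate/predict can simply iterate to exhaustion, but fit() needs an
    # epoch boundary, hence the "Number of steps could not be inferred"
    # error asserted below unless steps or steps_per_epoch is supplied.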
+ eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) + predict_with_numpy = model.predict(inputs, batch_size=10) + + dataset = convert_numpy_to_dataset_with_unknown_cardinality( + inputs, targets + ) + predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( + inputs + ) + + self.assertEqual( + keras.backend.get_value( + tf.data.experimental.cardinality(dataset) + ), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + self.assertEqual( + keras.backend.get_value( + tf.data.experimental.cardinality(predict_dataset) + ), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + eval_with_ds = model.evaluate(dataset, steps=100) + predict_with_ds = model.predict(predict_dataset, steps=100) + self.assertAllClose( + eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4 + ) + self.assertAllClose( + predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4 + ) + + with self.assertRaisesRegex( + ValueError, "Number of steps could not be inferred" + ): + model.fit(dataset, epochs=1) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_fit_eval_and_predict_methods_on_dataset(self, distribution): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = get_model() + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + model.compile(optimizer, loss, metrics=metrics) + + dataset = get_dataset(distribution) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + model.evaluate(dataset, steps=2, verbose=1) + model.predict(get_predict_dataset(distribution), steps=2) + + @tf.__internal__.distribute.combinations.generate( + strategy_and_optimizer_combinations() + ) + def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer): + with self.cached_session(): + + with distribution.scope(): + + model = get_model() + loss = "mse" + model.compile(optimizer(), loss) + + dataset = get_dataset(distribution) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + model.evaluate(dataset, steps=2, verbose=1) + model.predict(get_predict_dataset(distribution), steps=2) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.one_device_strategy, + ], + mode=["graph", "eager"], + ) + ) + def test_dataset_wrong_input_shape(self, distribution, mode): + if mode == "graph": + self.skipTest( + "TODO(b/120943676, b/120957836): Re-enable for graph once the " + "validation code is restored." 
+ ) + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) + model = get_model() + loss = "mse" + model.compile(optimizer, loss) + + # Wrong input shape + inputs = np.zeros((10, 5), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + with self.assertRaisesRegex(ValueError, "is incompatible with"): + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu # noqa: E501 + ], + mode=["graph", "eager"], + ) + ) + def test_dataset_external_batch_input_validation(self, distribution): + with self.cached_session(): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) + model = get_model() + loss = "mse" + model.compile(optimizer, loss) + + # Batching is done outside tf.data's `batch` + inputs = np.zeros((100, 10, 3), dtype=np.float32) + targets = np.zeros((100, 10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + ], + mode=["graph", "eager"], + ) + ) + def test_learning_phase_value(self, distribution): + # TODO(anjalisridhar): Modify this test to use Lambdas since we can + # compare meaningful values. Currently we don't pass the learning phase + # if the Lambda layer uses the learning phase. + with self.cached_session(): + with distribution.scope(): + x = keras.layers.Input(shape=(1,), name="input") + y = keras.layers.Dense(1, kernel_initializer="ones")(x) + z = keras.layers.Dropout(0.9999)(y) + model = keras.Model(x, z) + initial_weights = model.get_weights() + + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.005) + loss = "mse" + metrics = ["acc"] + model.compile(optimizer, loss, metrics=metrics) + + batch_size = 8 + if isinstance( + distribution, + ( + tf.distribute.MirroredStrategy, + tf.compat.v1.distribute.MirroredStrategy, + ), + ): + # MirroredStrategy uses global batch size. + batch_size = 8 * distribution.num_replicas_in_sync + + inputs = np.ones((10, 1), dtype=np.float32) + targets = np.ones((10, 1), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat().batch(batch_size) + hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1) + self.assertAlmostEqual(hist.history["acc"][0], 0, 0) + + with distribution.scope(): + model.set_weights(initial_weights) + # TODO(psv/anjalisridhar): Enable these lines after we fix + # b/117431185. 
evaluate_output = model.evaluate(dataset, steps=20) + # self.assertAlmostEqual(evaluate_output[1], 1, 0) + + inputs = np.ones((10, 1), dtype=np.float32) + predict_dataset = tf.data.Dataset.from_tensor_slices(inputs) + + predict_dataset = predict_dataset.repeat().batch(batch_size) + output = model.predict(predict_dataset, steps=10) + # `predict` runs for 10 steps + ref_output = np.ones((160, 1), dtype=np.float32) + self.assertArrayNear(output, ref_output, 1e-1) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def testOptimizerWithCallbacks(self, distribution): + with self.cached_session(): + with distribution.scope(): + model = get_model() + optimizer = gradient_descent_keras.SGD(0.01) + loss = "mse" + model.compile(optimizer, loss) + + dataset = get_dataset(distribution) + + def schedule(_): + return 0.001 + + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + callbacks=[keras.callbacks.LearningRateScheduler(schedule)], + ) + self.assertAllClose( + 0.001, keras.backend.get_value(model.optimizer.lr) + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tpu_strategy_combinations_graph_only(), + tf.__internal__.test.combinations.combine(batch_size=[4, 6]), + ) + ) + def test_evaluate_with_dataset_with_partial_batch( + self, distribution, batch_size + ): + with self.cached_session(): + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + loss = "mse" + metrics = ["mae", keras.metrics.CategoricalAccuracy()] + + with distribution.scope(): + model_with_ds_strategy = get_model() + model_with_ds_strategy.compile(optimizer, loss, metrics=metrics) + + cpu_model = get_model() + cpu_model.compile(optimizer, loss, metrics=metrics) + + x = np.random.random((10, 3)).astype("float32") + y = np.random.random((10, 4)).astype("float32") + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + + # As sample size is 10, we make the last batch a partial batch. + cpu_model.set_weights(model_with_ds_strategy.get_weights()) + dataset_with_partial_batch = dataset.batch(batch_size) + + # We don't compare the loss as loss is currently not computed as + # metric in Keras, the loss value is inaccurate for last partial + # batch due to more weights for the last batch samples. + steps = np.ceil(10.0 / batch_size) + self.assertAllClose( + model_with_ds_strategy.evaluate( + dataset_with_partial_batch, steps=steps + )[1:], + cpu_model.evaluate(dataset_with_partial_batch, steps=steps)[1:], + atol=1e-5, + rtol=1e-5, + ) + self.assertAllClose( + model_with_ds_strategy.evaluate(dataset_with_partial_batch)[1:], + cpu_model.evaluate(dataset_with_partial_batch)[1:], + atol=1e-5, + rtol=1e-5, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tpu_strategy_combinations_graph_only() + ) + ) + def test_predict_with_dataset_with_partial_batch(self, distribution): + with self.cached_session(): + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + loss = "mse" + + with distribution.scope(): + model_with_ds_strategy = get_model() + model_with_ds_strategy.compile(optimizer, loss) + + cpu_model = get_model() + cpu_model.compile(optimizer, loss) + + inputs = np.random.random((10, 3)).astype(np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs)) + + # As sample size is 10, we batch by 4 so that the last batch is + # a partial batch. 
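test_learning_phase_value above hinges on the learning phase: Dropout is active during fit but an identity during evaluate/predict, which is why a Dropout(0.9999) model still predicts all ones. In isolation (the partial-batch predict test resumes right below):

    import tensorflow as tf

    layer = tf.keras.layers.Dropout(0.9999)
    x = tf.ones((4, 1))
    print(layer(x, training=True))   # almost surely all zeros (survivors rescaled)
    print(layer(x, training=False))  # identity: all ones, as predict() sees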
+ dataset_with_partial_batch = dataset.batch(4) + cpu_model.set_weights(model_with_ds_strategy.get_weights()) + + self.assertAllClose( + model_with_ds_strategy.predict( + dataset_with_partial_batch, steps=3 + ), + cpu_model.predict(dataset_with_partial_batch, steps=3), + atol=1e-5, + rtol=1e-5, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tpu_strategy_combinations_graph_only() + ) + ) + def test_predict_multi_output_model_with_dataset_with_partial_batch( + self, distribution + ): + with self.cached_session(): + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + loss = "mse" + + with distribution.scope(): + model_with_ds_strategy = ( + simple_multi_inputs_multi_outputs_model() + ) + model_with_ds_strategy.compile(optimizer, loss) + + cpu_model = simple_multi_inputs_multi_outputs_model() + cpu_model.compile(optimizer, loss) + + input_data, _ = get_multi_inputs_multi_outputs_data() + input_dict = { + "input_a": input_data["input_a"], + "input_b": input_data["input_b"], + } + + dataset = tf.data.Dataset.from_tensor_slices(input_dict) + + # As sample size is 200, we batch by 18 using 12 steps per epoch so + # that the last batch is a partial batch. + dataset_with_partial_batch = dataset.batch(18) + cpu_model.set_weights(model_with_ds_strategy.get_weights()) + + self.assertAllClose( + model_with_ds_strategy.predict( + dataset_with_partial_batch, steps=12 + ), + cpu_model.predict(dataset_with_partial_batch, steps=12), + atol=1e-4, + rtol=1e-4, + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations_minus_default() + ) + def test_match_model_input_matches_with_dataset_tensors(self, distribution): + def _create_model_input_output_tensors(): + input_a = keras.layers.Input( + shape=(16,), name="z_input_sorted_last" + ) + input_b = keras.layers.Input( + shape=(32,), name="a_input_sorted_first" + ) + intermediate_a = keras.layers.Dense(10)(input_a) + intermediate_b = keras.layers.Dense(10)(input_b) + merged = keras.layers.Add()([intermediate_a, intermediate_b]) + output = keras.layers.Dense(2)(merged) + return input_a, input_b, output + + input_dict = { + "z_input_sorted_last": np.random.rand(32, 16).astype(np.float32), + "a_input_sorted_first": np.random.rand(32, 32).astype(np.float32), + } + target = np.ones((32, 2), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((input_dict, target)) + dataset = dataset.batch(4, drop_remainder=True) + + with self.cached_session(): + with distribution.scope(): + input_a, input_b, output = _create_model_input_output_tensors() + # `input_a`, whose input name comes last in + # alphanumeric order, is the first input of the model's input + # layers. If tensors from `input_dict` were blindly flattened and + # passed to model inputs in that order, the `input_a` input + # layer would be matched with the tensor + # `a_input_sorted_first`, resulting in a shape mismatch.
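The mismatch scenario described in the comment above comes from how nested structures flatten: tf.nest emits dict values in sorted-key order, not in model-input order, so blind flattening would pair the wrong tensors (the test's two model constructions follow below):

    import tensorflow as tf

    # Keys sort alphabetically, so "a_input_sorted_first" flattens ahead of
    # "z_input_sorted_last" regardless of the order the model declares.
    print(tf.nest.flatten(
        {"z_input_sorted_last": 1, "a_input_sorted_first": 2}))  # [2, 1]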
+ model_with_array_input = keras.models.Model( + inputs=[input_a, input_b], outputs=output + ) + model_with_array_input.compile("sgd", "mse") + model_weights = model_with_array_input.get_weights() + model_with_array_input_fit = model_with_array_input.fit( + dataset, steps_per_epoch=1, epochs=1 + ).history + + input_a, input_b, output = _create_model_input_output_tensors() + model_with_dict_input = keras.models.Model( + inputs={ + "z_input_sorted_last": input_a, + "a_input_sorted_first": input_b, + }, + outputs=output, + ) + model_with_dict_input.compile("sgd", "mse") + model_with_dict_input.set_weights(model_weights) + model_with_dict_input_fit = model_with_dict_input.fit( + dataset, steps_per_epoch=1, epochs=1 + ).history + self.assertAllClose( + model_with_dict_input_fit, + model_with_array_input_fit, + atol=1e-4, + rtol=1e-4, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategies_minus_tpu, mode=["graph", "eager"] + ) + + tf.__internal__.test.combinations.combine( + distribution=multi_worker_mirrored_strategies, mode=["eager"] + ) + ) + def test_dataset_with_sample_weights(self, distribution): + with self.cached_session(), distribution.scope(): + model = get_sample_weights_model() + optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.001) + loss = "mse" + model.compile(optimizer, loss) + + inputs = np.array([[0], [1], [2], [3]], np.float32) + targets = np.array([[2], [4], [6], [8]], np.float32) + sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) + ds = tf.data.Dataset.from_tensor_slices( + (inputs, targets, sample_weights) + ).batch(2) + result = model.evaluate(ds, verbose=1) + + # The per sample loss is multiplied by the corresponding sample + # weight. The average of these weighted losses is the return value + # of the `evaluate` call. For example, in the test above the average + # weighted loss is calculated in the following manner: + # batch_1 = (((2-0)^2) * 0.25 + ((4-1)^2) * 0.5) / 2 = 5.5 / 2 = + # 2.75 + # batch_2 = (((6-2)^2 * 0.75) + ((8-3)^2 * 1)) / 2 = 37 / 2 = 18.5 + # final result = (batch_1 + batch_2) / 2 = 10.625. + # The first time we divide by number of input samples and the second + # time we divide by number of steps/batches that the loss is + # aggregated over. 
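The weighted-loss arithmetic in the comment above checks out numerically; a quick NumPy verification (assuming, as the comment implies, that the model's prediction for each input equals the input itself):

import numpy as np

preds = np.array([0.0, 1.0, 2.0, 3.0])          # model outputs
targets = np.array([2.0, 4.0, 6.0, 8.0])
weights = np.array([0.25, 0.5, 0.75, 1.0])

losses = (targets - preds) ** 2 * weights        # weighted per-sample MSE
batch_means = [losses[:2].mean(), losses[2:].mean()]   # [2.75, 18.5]
assert np.isclose(np.mean(batch_means), 10.625)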
+ self.assertAllClose(result, 10.625) + + # We now test without passing sample_weights: + # batch_1 = (((2-0)^2) + ((4-1)^2)) / 2 = 13 / 2 = 6.5 + # batch_2 = (((6-2)^2) + ((8-3)^2)) / 2 = 41 / 2 = 20.5 + # final result = (batch_1 + batch_2) / 2 = 27 / 2 = 13.5 + ds = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(2) + result = model.evaluate(ds, verbose=1) + self.assertAllClose(result, 13.5) + + +class TestDistributionStrategyWithDatasetsFile( + tf.test.TestCase, parameterized.TestCase +): + def setUp(self): + super().setUp() + self.input_file_name = os.path.join( + self.get_temp_dir(), "input.tfrecord" + ) + inputs = np.zeros((20, 3), dtype=np.float32) + input_dataset = tf.data.Dataset.from_tensor_slices(inputs) + input_dataset = input_dataset.map(tf.io.serialize_tensor) + writer = tf.data.experimental.TFRecordWriter(self.input_file_name) + writer.write(input_dataset) + + # TODO(wxinyi): add a multi-worker test for TPU + @tf.__internal__.distribute.combinations.generate( + multi_worker_strategy_combinations_eager_only() + ) + def test_predict_on_dataset_shard_options_file_multi_worker_mirrored( + self, distribution, mode + ): + # This test verifies that we successfully switch the auto_shard_policy + # of an input dataset inside model.predict with + # MultiWorkerMirroredStrategy to AutoShardPolicy.DATA. Since there is + # only one input file for multiple workers, AutoShardPolicy.AUTO or + # AutoShardPolicy.FILE will lead to an error. However, since we switch + # to AutoShardPolicy.DATA in model.predict, no error is raised. + del mode + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(0.001) + model = get_model() + loss = "mse" + model.compile(optimizer, loss) + + dataset = tf.data.TFRecordDataset(self.input_file_name) + dataset = dataset.map(lambda x: tf.io.parse_tensor(x, tf.float32)) + + dummy_op = lambda inp: True + + dataset = dataset.filter(dummy_op).batch(8, drop_remainder=True) + + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.FILE + ) + dataset = dataset.with_options(options) + + model.predict(dataset, steps=1) - def __init__(self): - super().__init__() - self.train_begin_batches = [] - self.train_end_batches = [] - self.test_begin_batches = [] - self.test_end_batches = [] - self.predict_begin_batches = [] - self.predict_end_batches = [] - - def on_train_batch_begin(self, batch, logs=None): - self.train_begin_batches.append(batch) - - def on_train_batch_end(self, batch, logs=None): - self.train_end_batches.append(batch) - - def on_test_batch_begin(self, batch, logs=None): - self.test_begin_batches.append(batch) - - def on_test_batch_end(self, batch, logs=None): - self.test_end_batches.append(batch) - - def on_predict_batch_begin(self, batch, logs=None): - self.predict_begin_batches.append(batch) - - def on_predict_batch_end(self, batch, logs=None): - self.predict_end_batches.append(batch) - - -class TestDistributionStrategyWithNumpyArrays(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calculating_input_params_no_steps_no_batch_size(self, distribution): - # Calculate the per_replica_batch_size scaling factor for strategies - # that use per_core_batch_size - replica_scale_factor = 1.0 - if not distributed_training_utils.global_batch_size_supported(distribution): - replica_scale_factor = distribution.num_replicas_in_sync - - with self.cached_session(): - # Default
global batch size 32 for input with 64 samples run in 2 steps - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 64, steps=None, batch_size=None) - self.assertEqual(batch_size, 32 // replica_scale_factor) - self.assertEqual(steps, 2) - - # Computed global batch size 20 is lower than 32 if we pass less samples. - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 20, steps=None, batch_size=None) - self.assertEqual(batch_size, 20 // replica_scale_factor) - self.assertEqual(steps, 1) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calculating_input_params_with_steps_no_batch_size( - self, distribution): - # Calculate the per_replica_batch_size scaling factor for strategies - # that use per_core_batch_size - replica_scale_factor = 1.0 - if not distributed_training_utils.global_batch_size_supported(distribution): - replica_scale_factor = distribution.num_replicas_in_sync - - with self.cached_session(): - # Computed global batch size is correct for number of specified 1 step - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 64, steps=1, batch_size=None) - self.assertEqual(batch_size, 64 // replica_scale_factor) - self.assertEqual(steps, 1) - - # Computed global batch size is correct for number of specified 2 steps - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 64, steps=2, batch_size=None) - self.assertEqual(batch_size, 32 // replica_scale_factor) - self.assertEqual(steps, 2) - - # All samples can not be consumed in specified number of steps - with self.assertRaisesRegex(ValueError, 'not divisible by steps'): - distributed_training_utils_v1.get_input_params( - distribution, 63, steps=2, batch_size=None) - - # This cases is different for different strategies due to the - # difference in supported batch size being global or per-replica. 
- if replica_scale_factor == 1: - # Computed global batch size is correct even if not sharadable - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 63, steps=3, batch_size=None) - self.assertEqual(batch_size, 21) - self.assertEqual(steps, 3) - else: - # Computed global batch size can not be sharded across replicas - with self.assertRaisesRegex( - ValueError, 'could not be sharded evenly ' - 'across the sync replicas'): - distributed_training_utils_v1.get_input_params( - distribution, 63, steps=1, batch_size=None) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calculating_input_params_no_steps_with_batch_size( - self, distribution): - # Calculate the per_replica_batch_size scaling factor for strategies - # that use per_core_batch_size - replica_scale_factor = 1.0 - if not distributed_training_utils.global_batch_size_supported(distribution): - replica_scale_factor = distribution.num_replicas_in_sync - - with self.cached_session(): - # Computed steps is correct for specified batch size - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 64, steps=None, batch_size=16) - self.assertEqual(batch_size, 16) - self.assertEqual(steps, 4 // replica_scale_factor) - - # Computed steps is correct for specified batch size - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 64, steps=None, batch_size=32) - self.assertEqual(batch_size, 32) - self.assertEqual(steps, 2 // replica_scale_factor) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calculating_input_params_with_steps_with_batch_size( - self, distribution): - with self.cached_session(): - # No change to steps and batch size if both specified and feasible - steps, batch_size = distributed_training_utils_v1.get_input_params( - distribution, 64, steps=5, batch_size=3) - self.assertEqual(batch_size, 3) - self.assertEqual(steps, 5) - - # Number of samples is less than global batch size * steps - with self.assertRaisesRegex(ValueError, 'less than samples required'): - distributed_training_utils_v1.get_input_params( - distribution, 64, steps=10, batch_size=13) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calling_model_with_numpy_arrays(self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = get_model() - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, - loss, - metrics=metrics) - - inputs = np.zeros((64, 3), dtype=np.float32) - targets = np.zeros((64, 4), dtype=np.float32) - - # Call fit with validation data - model.fit( - inputs, - targets, - epochs=1, - batch_size=2, - verbose=0, - validation_data=(inputs, targets)) - - # TODO(anjalisridhar): We need tests for when the batch size and steps - # are smaller and results in a 0 batch_size and steps value. 
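The inference rules these (now removed) tests assert can be condensed into a small sketch. This is an illustrative re-implementation of the documented single-replica behavior, ignoring the per-replica scale factor; it is not the real distributed_training_utils_v1.get_input_params:

import numpy as np

def infer_input_params(num_samples, steps=None, batch_size=None):
    # Illustrative only; default global batch size of 32, capped at the
    # number of samples, mirroring the cases asserted above.
    if steps is None and batch_size is None:
        batch_size = min(32, num_samples)
        steps = int(np.ceil(num_samples / batch_size))
    elif batch_size is None:
        if num_samples % steps:
            raise ValueError("not divisible by steps")
        batch_size = num_samples // steps
    elif steps is None:
        steps = int(np.ceil(num_samples / batch_size))
    return steps, batch_size

assert infer_input_params(64) == (2, 32)
assert infer_input_params(20) == (1, 20)
assert infer_input_params(64, steps=2) == (2, 32)
assert infer_input_params(64, batch_size=16) == (4, 16)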
- model.evaluate(inputs, targets) - model.evaluate(inputs, targets, batch_size=8) - - model.predict(inputs) - model.predict(inputs, batch_size=8) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calling_model_with_mixed_precision(self, distribution): - if isinstance(distribution, - (tf.compat.v1.distribute.experimental.ParameterServerStrategy, - tf.distribute.experimental.ParameterServerStrategy, - tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - self.skipTest('b/152097775') - if backend.is_tpu_strategy(distribution): - policy_name = 'mixed_bfloat16' - else: - policy_name = 'mixed_float16' - with self.cached_session(), \ - distribution.scope(), \ - policy.policy_scope(policy_name): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - x = keras.layers.Input(shape=(3,), name='input') - y = keras.layers.Dense(4, name='dense')(x) - y = keras.layers.Activation('softmax', dtype='float32')(y) - model = keras.Model(x, y) - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, - loss, - metrics=metrics) - - # We need to pass float32 since TPUs do not support float64, even though - # these arrays will immediately be casted to bfloat16 on TPUs. We also - # cannot pass bfloat16, as Numpy does not support it. - inputs = np.zeros((64, 3), dtype='float32') - targets = np.zeros((64, 4), dtype='float32') - - model.fit( - inputs, - targets, - epochs=1, - batch_size=2, - verbose=0, - validation_data=(inputs, targets)) - - model.evaluate(inputs, targets) - model.evaluate(inputs, targets, batch_size=8) - - model.predict(inputs) - model.predict(inputs, batch_size=8) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_operator_overload_mixed_precision(self, distribution): - # Regression test that tests a fixed bug does not reoccur. Adding an - # AutoCastVariable to a tensor on a TPU, where the variable was the LHS of - # the '+' operator, used to cause the gradient w.r.t. the variable to be - # None. 
- if isinstance(distribution, - (tf.compat.v1.distribute.experimental.ParameterServerStrategy, - tf.distribute.experimental.ParameterServerStrategy, - tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - self.skipTest('b/152097775') - - if backend.is_tpu_strategy(distribution): - policy_name = 'mixed_bfloat16' - else: - policy_name = 'mixed_float16' - - class MyLayer(keras.layers.Layer): - - def build(self, _): - self.v1 = self.add_weight('v', ()) - self.v2 = self.add_weight('v', ()) - - def call(self, inp): - inp += self.v1 - return self.v2 + inp - - with self.cached_session(), distribution.scope(): - layer = MyLayer(dtype=policy_name) - def run_fn(): - x = np.array([1.]) - with tf.GradientTape() as tape: - y = layer(x) - grad_v1, grad_v2 = tape.gradient(y, [layer.v1, layer.v2]) - return grad_v1, grad_v2 - if tf.executing_eagerly(): - run_fn = tf.function(run_fn) - - grad_v1, grad_v2 = distribution.run(run_fn) - self.assertIsNotNone(grad_v1) - self.assertIsNotNone(grad_v2) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.one_device_strategy], - mode=['graph', 'eager'])) - def test_optimizer_in_cross_replica_context_raises_error(self, distribution): - - with self.cached_session(), distribution.scope(): - model = keras.models.Sequential([keras.layers.Dense(1)]) - x = np.array([[1.]]) - with tf.GradientTape() as tape: - y = model(x) - gradients = tape.gradient(y, model.trainable_variables) - optimizer = gradient_descent_keras.SGD() - - with self.assertRaisesRegex(RuntimeError, - 'cannot be called in cross-replica context'): - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calling_model_with_nested_numpy_arrays(self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(learning_rate=0.001) - model = multi_input_output_model() - loss = 'mse' - model.compile( - optimizer, - loss) - - input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32) - input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32) - inputs = [input_a_np, input_b_np] - - output_d_np = np.asarray(np.random.random((64, 7)), dtype=np.float32) - output_e_np = np.asarray(np.random.random((64, 7)), dtype=np.float32) - targets = [output_d_np, output_e_np] - - # Call fit with validation data - model.fit(inputs, targets, epochs=1, batch_size=8, verbose=0) - - # TODO(anjalisridhar): We need tests for when the batch size and steps are - # smaller and results in a 0 batch_size and steps value. 
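The cross-replica error asserted above exists because gradient application must happen once per replica; a minimal sketch of the legal pattern, wrapping the update in strategy.run:

import numpy as np
import tensorflow as tf
from tensorflow import keras

strategy = tf.distribute.OneDeviceStrategy("/cpu:0")
with strategy.scope():
    model = keras.Sequential([keras.layers.Dense(1)])
    optimizer = keras.optimizers.SGD()

def step_fn(x):
    with tf.GradientTape() as tape:
        y = model(x)
    grads = tape.gradient(y, model.trainable_variables)
    # Inside `strategy.run` we are in replica context, so this is allowed.
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

strategy.run(step_fn, args=(np.array([[1.0]], np.float32),))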
- model.evaluate(inputs, targets) - model.evaluate(inputs, targets, batch_size=8) - - model.predict(inputs) - model.predict(inputs, batch_size=8) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategies_minus_tpu, mode=['graph', 'eager']) + - tf.__internal__.test.combinations.combine( - distribution=multi_worker_mirrored_strategies, mode=['eager'])) - def test_numpy_with_sample_weights(self, distribution): - with self.cached_session(), distribution.scope(): - model = get_sample_weights_model() - optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - model.compile( - optimizer, - loss) - - inputs = np.array([[0], [1], [2], [3]], np.float32) - targets = np.array([[2], [4], [6], [8]], np.float32) - sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) - - result = model.evaluate( - inputs, - targets, - batch_size=2, - sample_weight=sample_weights, - verbose=1) - - # The per sample loss is multiplied by the corresponding sample weight. - # The average of these weighted losses is the return value of the - # `evaluate` call. For example, in the test above the average weighted - # loss is calculated in the following manner: - - # batch_1 = (((2-0)^2) * 0.25 + ((4-1)^2) * 0.5) / 2 = 5.5 / 2 = 2.75 - # batch_2 = (((6-2)^2 * 0.75) + ((8-3)^2 * 1)) / 2 = 37 / 2 = 18.5 - # final result = (batch_1 + batch_2) / 2 = 10.625. - # The first time we divide by number of input samples and the second time - # we divide by number of steps/batches that the loss is aggregated over. - self.assertAllClose(result, 10.625) - - # We now test without passing sample_weights: - # batch_1 = ((2-0)^2) + ((4-1)^2) / 2 = 13 / 2 = 6.5 - # batch_2 = ((6-2)^2) + ((8-3)^2) / 2 = 41 / 2 = 20.5 - # final result = (batch_1 + batch_2) / 2 = 27 / 2 = 13.5 - result = model.evaluate(inputs, targets, batch_size=2, verbose=1) - self.assertAllClose(result, 13.5) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_flatten_predict_outputs(self, distribution): - with self.cached_session(): - with distribution.scope(): - model = multi_input_output_model() - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(learning_rate=0.001) - loss = 'mse' - model.compile( - optimizer, - loss) - - # We take 6 input samples with each input having a dimension of 3 or 5. - input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32) - input_b_np = np.asarray(np.random.random((6, 5)), dtype=np.float32) - inputs = [input_a_np, input_b_np] - - outs = model.predict(inputs) - # `predict` a list that is equal in length to the number of model outputs. - # In this test our model has two outputs and each element of `outs` - # corresponds to all the samples of one of the model outputs. - self.assertLen(outs, 2) - # Each of the output samples have a dimension of 7. We should process all - # the available input samples(6). 
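As the removed comment notes, `predict` on a multi-output model returns one array per output, each covering all input samples; a tiny functional-API sketch:

import numpy as np
from tensorflow import keras

inp = keras.Input(shape=(3,))
model = keras.Model(
    inp, [keras.layers.Dense(7)(inp), keras.layers.Dense(7)(inp)]
)

outs = model.predict(np.zeros((6, 3), np.float32))
assert len(outs) == 2            # one entry per model output
assert outs[0].shape == (6, 7)   # all 6 samples are processed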
- self.assertAllEqual([6, 7], outs[0].shape) - self.assertAllEqual([6, 7], outs[1].shape) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(tpu_strategy_combinations_graph_only(), - tf.__internal__.test.combinations.combine(batch_size=[4, 6]))) - def test_evaluate_with_partial_batch(self, distribution, batch_size): - with self.cached_session(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - - with distribution.scope(): - model_with_ds_strategy = get_model() - model_with_ds_strategy.compile(optimizer, loss, metrics=metrics) - - cpu_model = get_model() - cpu_model.compile(optimizer, loss, metrics=metrics) - - x = np.random.random((10, 3)).astype('float32') - y = np.random.random((10, 4)).astype('float32') - - # As sample size is 10, we batch by 4 so that the last batch is - # a partial batch. Also `evaluate()` using numpy array as inputs without - # distribution strategy uses entire sample as a single batch. As so, - # we remove parameters `batch_size` and `steps`. - cpu_model.set_weights(model_with_ds_strategy.get_weights()) - evaluate_ground_truth = cpu_model.evaluate(x, y) - - # We don't compare the loss as loss is currently not computed as metric - # in Keras, the loss value is inaccurate for last partial batch due to - # more weights for the last batch samples. - steps = np.ceil(10.0 / batch_size) - self.assertAllClose( - model_with_ds_strategy.evaluate( - x, y, batch_size=batch_size, steps=steps)[1:], - evaluate_ground_truth[1:], - atol=1e-5, - rtol=1e-5) - # Test that `steps` is inferred correctly when final partial batch exists. - self.assertAllClose( - model_with_ds_strategy.evaluate(x, y, batch_size=batch_size)[1:], - evaluate_ground_truth[1:], - atol=1e-5, - rtol=1e-5) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - tpu_strategy_combinations_graph_only())) - def test_predict_with_partial_batch(self, distribution): - with self.cached_session(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - - with distribution.scope(): - model_with_ds_strategy = get_model() - model_with_ds_strategy.compile( - optimizer, - loss) - - cpu_model = get_model() - cpu_model.compile(optimizer, loss) - - inputs = np.random.random((10, 3)).astype(np.float32) - - # As sample size is 10, we batch by 4 so that the last batch is - # a partial batch. Also `predict()` using numpy array as inputs without - # distribution strategy uses entire sample as a single batch. As so, - # we remove parameters `batch_size` and `steps`. - cpu_model.set_weights(model_with_ds_strategy.get_weights()) - predict_ground_truth = cpu_model.predict(inputs) - self.assertAllClose( - model_with_ds_strategy.predict(inputs, batch_size=4, steps=3), - predict_ground_truth, - atol=1e-5, - rtol=1e-5) - # Test that `steps` is inferred correctly when final partial batch exists. 
- self.assertAllClose( - model_with_ds_strategy.predict(inputs, batch_size=4), - predict_ground_truth, - atol=1e-5, - rtol=1e-5) - - @tf.__internal__.distribute.combinations.generate(tpu_strategy_combinations_graph_only()) - def test_no_target_model(self, distribution): - with self.cached_session(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - - class MyLayer(keras.layers.Layer): - - def call(self, inputs, training=None): - self.add_loss(tf.reduce_sum(inputs), inputs=True) - return inputs - - with distribution.scope(): - model = keras.models.Sequential() - model.add( - keras.layers.Dense(16, activation='relu', input_shape=_INPUT_SIZE)) - model.add(MyLayer()) - model.add(keras.layers.Dense(_NUM_CLASS, activation='softmax')) - - model.compile(optimizer) - inputs = np.zeros((20, 10), np.float32) - - model.fit(inputs, epochs=1, steps_per_epoch=2) - model.predict(inputs, steps=1) - model.evaluate(inputs, steps=1) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - tpu_strategy_combinations_graph_only())) - def test_predict_multi_output_model_with_partial_batch( - self, distribution): - with self.cached_session(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - - with distribution.scope(): - model_with_ds_strategy = simple_multi_inputs_multi_outputs_model() - model_with_ds_strategy.compile( - optimizer, - loss) - - cpu_model = simple_multi_inputs_multi_outputs_model() - cpu_model.compile(optimizer, loss) - - input_data, _ = get_multi_inputs_multi_outputs_data() - input_dict = { - 'input_a': input_data['input_a'], - 'input_b': input_data['input_b'], - } - - # As sample size is 200, we batch by 18 so that the last batch is - # a partial batch. Also `fit()` using numpy array as inputs without - # distribution strategy uses entire sample as a single batch. As so, - # we remove parameters `batch_size` and `steps`. - cpu_model.set_weights(model_with_ds_strategy.get_weights()) - self.assertAllClose( - model_with_ds_strategy.predict(input_dict, batch_size=18, steps=12), - cpu_model.predict(input_dict), - atol=1e-4, - rtol=1e-4) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_gradients_are_none(self, distribution): - - if not tf.executing_eagerly(): - self.skipTest('None gradients are not supported in graph mode') - - class DenseWithExtraWeight(keras.layers.Dense): - - def build(self, input_shape): - # Gradients w.r.t. 
extra_weights are None - self.extra_weight_1 = self.add_weight('extra_weight_1', shape=(), - initializer='ones') - super().build(input_shape) - self.extra_weight_2 = self.add_weight('extra_weight_2', shape=(), - initializer='ones') - - with distribution.scope(): - model = keras.Sequential([DenseWithExtraWeight(4, input_shape=(4,))]) - model.compile('adam', 'mse') - - inputs = np.random.normal(size=(64, 4)) - targets = np.random.normal(size=(64, 4)) - old_kernel = model.get_weights()[1] - model.fit(inputs, targets) - new_kernel = model.get_weights()[1] - self.assertNotAllEqual(old_kernel, new_kernel) - - -class TestDistributionStrategyWithDatasets(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_calling_model_on_same_dataset(self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = get_model() - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics) - - dataset = get_dataset(distribution) - - # Call fit with validation data - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_data=dataset, - validation_steps=2) - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_data=dataset, - validation_steps=2) - model.predict(get_predict_dataset(distribution), steps=2) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_model_interleaved_eval_same_as_direct_eval( - self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - user_controlled_model = get_model() - user_controlled_model.compile( - optimizer_fn(0.001), - loss='mse', - metrics=['mae', keras.metrics.CategoricalAccuracy()]) - - interleaved_model = get_model() - interleaved_model.set_weights(user_controlled_model.get_weights()) - interleaved_model.compile( - optimizer_fn(0.001), - loss='mse', - metrics=['mae', keras.metrics.CategoricalAccuracy()]) - - dataset = get_dataset(distribution) - - # Call fit with validation interleaved - interleaved_output = interleaved_model.fit( - dataset, - epochs=2, - steps_per_epoch=2, - verbose=1, - validation_data=dataset, - validation_steps=2, - shuffle=False) - - # Manually control the validation running after each epoch. 
- user_controlled_output = [] - for _ in range(2): - user_controlled_model.fit( - dataset, epochs=1, steps_per_epoch=2, verbose=1, shuffle=False) - user_controlled_output.append( - user_controlled_model.evaluate(dataset, steps=2)) - - self.assertEqual(interleaved_output.history['val_loss'], - [x[0] for x in user_controlled_output]) - val_mean_absolute_error = interleaved_output.history.get( - 'val_mean_absolute_error') - if not val_mean_absolute_error: - # The name of the metric changed in TF2.0 - val_mean_absolute_error = interleaved_output.history['val_mae'] - self.assertEqual(val_mean_absolute_error, - [x[1] for x in user_controlled_output]) - self.assertEqual(interleaved_output.history['val_categorical_accuracy'], - [x[2] for x in user_controlled_output]) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(learning_rate=0.001) - model = multi_input_output_model() - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics) - - input_a_np = np.random.random((10, 3)).astype('float32') - input_b_np = np.random.random((10, 5)).astype('float32') - output_d_np = np.random.random((10, 7)).astype('float32') - output_e_np = np.random.random((10, 7)).astype('float32') - - # Test with tuples - dataset_tuple = tf.data.Dataset.from_tensor_slices( - ((input_a_np, input_b_np), (output_d_np, output_e_np))) - dataset_tuple = dataset_tuple.repeat(100) - dataset_tuple = dataset_tuple.batch(10) - - model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1) - # Test with dict - dataset_dict = tf.data.Dataset.from_tensor_slices(({ - 'input_a': input_a_np, - 'input_b': input_b_np - }, (output_d_np, output_e_np))) - dataset_dict = dataset_dict.repeat(100) - dataset_dict = dataset_dict.batch(10) - - model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_fit_with_dictionary_in_the_dataset_b135161171( - self, distribution): +class TestRegularizerLoss(tf.test.TestCase, parameterized.TestCase): + class IdentityRegularizer(keras.regularizers.Regularizer): + def __call__(self, x): + return tf.identity(x) + + class AddLayer(keras.layers.Layer): + def build(self, _): + self.v = self.add_weight( + "v", + (), + initializer="ones", + regularizer=TestRegularizerLoss.IdentityRegularizer(), + ) + + def call(self, inputs): + return inputs + self.v + + @staticmethod + def loss_fn(_, y_pred): + return tf.reduce_mean(y_pred) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + all_strategy_combinations_minus_default() + ) + ) + def test_regularizer_loss(self, distribution): + batch_size = 2 + if not distributed_training_utils.global_batch_size_supported( + distribution + ): + batch_size //= distribution.num_replicas_in_sync + + # Given an input x, which is always 1, and variable v, this model + # computes Loss=x+v+regularizer_loss, where regularizer_loss=v and + # the variable is initialized to 1. Therefore, this model computes + # Loss=1+2v, and so the gradient dLoss/dv = 2. This gradient of 2 is + # averaged over all examples in a batch and then multiplied by the + # learning rate of 1. 
As a result, the model update for one batch + should subtract 2 from v, resulting in v being -1. If the + regularizer loss is not scaled correctly by the number of replicas, + the variable value will be incorrect when the number of replicas + # is greater than 1. For example, it will be -2 if there are 2 replicas. + with distribution.scope(): + x = keras.layers.Input(shape=(1,), batch_size=batch_size) + y = TestRegularizerLoss.AddLayer()(x) + model = keras.models.Model(inputs=x, outputs=y) + opt = gradient_descent_keras.SGD(1.0) + model.compile(opt, loss=TestRegularizerLoss.loss_fn) + model.fit( + x=np.array([[1.0], [1.0]], dtype=np.float32), + y=np.array([[1.0], [1.0]], dtype=np.float32), + batch_size=batch_size, + ) + v = model.get_weights()[0] + self.assertEqual(-1.0, v) - if backend.is_tpu_strategy(distribution): - self.skipTest('b/142805125') - - def custom_loss(predict, label, weight): - bce = keras.losses.binary_crossentropy(label, predict) - return tf.reduce_mean(bce * weight) - - with self.cached_session(): - with distribution.scope(): - input_img = keras.layers.Input([64, 64, 3], name='img') - input_lbl = keras.layers.Input([64, 64, 1], name='lbl') - input_weight = keras.layers.Input([64, 64], name='weight') - predict = keras.layers.Conv2D(2, [1, 1], padding='same')(input_img) - loss_lambda = keras.layers.Lambda( - lambda x: custom_loss(*x), name='my_loss') - my_loss = loss_lambda([predict, input_lbl, input_weight]) - model = keras.models.Model( - inputs=[input_img, input_lbl, input_weight], - outputs=[predict, my_loss]) - model.add_loss(model.get_layer('my_loss').output) - model.compile( - optimizer='adam') - - if tf.executing_eagerly(): - - def map_fn(img, lbl, weight): - inputs = {'img': img, 'lbl': lbl, 'weight': weight} - return (inputs,) - else: - - def map_fn(img, lbl, weight): - inputs = {'img': img, 'lbl': lbl, 'weight': weight} - return inputs, {} - - fake_imgs = np.ones([50, 64, 64, 3], dtype=np.float32) - fake_lbls = np.ones([50, 64, 64, 1], dtype=np.float32) - fake_weights = np.ones([50, 64, 64], dtype=np.float32) - - data = tf.data.Dataset.from_tensor_slices( - (fake_imgs, fake_lbls, fake_weights)).map(map_fn).batch(10) - - model.fit(data) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_fit_eval_and_predict_methods_on_dataset_without_steps( - self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = get_model() - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics) - - inputs = np.zeros((1000, 3), dtype=np.float32) - targets = np.zeros((1000, 4), dtype=np.float32) - # steps/steps_per_epoch are calculated when using numpy arrays as - # input data.
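The regularizer-scaling arithmetic described above (Loss = 1 + 2v, so dLoss/dv = 2, and one SGD step with learning rate 1 moves v from 1 to -1) can be verified without any strategy; a single-replica sketch:

import tensorflow as tf

v = tf.Variable(1.0)
with tf.GradientTape() as tape:
    loss = 1.0 + v + v         # data loss (x + v, with x = 1) plus regularizer (v)
grad = tape.gradient(loss, v)  # dLoss/dv = 2
v.assign_sub(1.0 * grad)       # SGD, learning rate 1
assert v.numpy() == -1.0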
- fit_with_numpy = model.fit( - inputs, targets, epochs=1, batch_size=10).history - eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) - predict_with_numpy = model.predict(inputs, batch_size=10) - - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.batch(10, drop_remainder=True) - fit_with_ds = model.fit(dataset, epochs=1).history - eval_with_ds = model.evaluate(dataset) - predict_dataset = tf.data.Dataset.from_tensor_slices(inputs) - predict_dataset = predict_dataset.batch(10, drop_remainder=True) - predict_with_ds = model.predict(predict_dataset) - self.assertAllClose(fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4) - self.assertAllClose(eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4) - self.assertAllClose( - predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_predict_on_dataset_with_unknown_cardinality_without_steps( - self, distribution, mode): - - if mode == 'graph' and backend.is_tpu_strategy(distribution): - self.skipTest('partial batch not supported with TPU in graph mode.') - - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = get_model() - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics) - - inputs = np.zeros((20, 3), dtype=np.float32) - # steps/steps_per_epoch are calculated when using numpy arrays as - # input data. - predict_with_numpy = model.predict(inputs, batch_size=10) - - predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( - inputs) - - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(predict_dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - - predict_with_ds = model.predict(predict_dataset) - self.assertAllClose( - predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_on_dataset_with_unknown_cardinality_without_steps( - self, distribution, mode): - # TODO(b/155867206): Investigate why this test occasionally segfaults on TPU - # in eager mode. - if mode == 'eager' and backend.is_tpu_strategy(distribution): - self.skipTest('caused segfault with TPU in eager mode.') - - if mode == 'graph' and backend.is_tpu_strategy(distribution): - self.skipTest('partial batch not supported with TPU in graph mode.') - - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = get_model() - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics) - - inputs = np.zeros((100, 3), dtype=np.float32) - targets = np.zeros((100, 4), dtype=np.float32) - # steps/steps_per_epoch are calculated when using numpy arrays as - # input data. 
- fit_with_numpy = model.fit( - inputs, targets, epochs=1, batch_size=10).history - fit_with_numpy_multiple_epochs = model.fit( - inputs, targets, epochs=2, batch_size=10).history - eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) - predict_with_numpy = model.predict(inputs, batch_size=10) - - dataset = convert_numpy_to_dataset_with_unknown_cardinality( - inputs, targets) - predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( - inputs) - - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(predict_dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - - eval_with_ds = model.evaluate(dataset) - predict_with_ds = model.predict(predict_dataset) - self.assertAllClose(eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4) - self.assertAllClose( - predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4) - - fit_with_ds = model.fit(dataset, epochs=1).history - fit_with_ds_multiple_epochs = model.fit(dataset, epochs=2).history - self.assertAllClose(fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4) - self.assertAllClose( - fit_with_numpy_multiple_epochs, - fit_with_ds_multiple_epochs, - atol=1e-4, - rtol=1e-4) - - @tf.__internal__.distribute.combinations.generate(tpu_strategy_combinations_graph_only()) - def test_on_dataset_with_unknown_cardinality(self, distribution): - with self.cached_session(): - with distribution.scope(): - model = get_model() - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - tf.compat.v1.train.GradientDescentOptimizer(0.001), - loss, - metrics=metrics) - - inputs = np.zeros((1000, 3), dtype=np.float32) - targets = np.zeros((1000, 4), dtype=np.float32) - # steps/steps_per_epoch are calculated when using numpy arrays as - # input data. 
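The UNKNOWN_CARDINALITY condition these tests construct comes from transformations such as filter, whose output size tf.data cannot know statically; a minimal sketch:

import numpy as np
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(np.zeros((100, 3), np.float32))
ds = ds.batch(10)
assert tf.data.experimental.cardinality(ds).numpy() == 10

# `filter` hides the size even when the predicate keeps every element.
ds = ds.filter(lambda x: True)
card = tf.data.experimental.cardinality(ds)
assert card == tf.data.experimental.UNKNOWN_CARDINALITY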
- eval_with_numpy = model.evaluate(inputs, targets, batch_size=10) - predict_with_numpy = model.predict(inputs, batch_size=10) - - dataset = convert_numpy_to_dataset_with_unknown_cardinality( - inputs, targets) - predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality( - inputs) - - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(predict_dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - - eval_with_ds = model.evaluate(dataset, steps=100) - predict_with_ds = model.predict(predict_dataset, steps=100) - self.assertAllClose(eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4) - self.assertAllClose( - predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4) - - with self.assertRaisesRegex(ValueError, - 'Number of steps could not be inferred'): - model.fit(dataset, epochs=1) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_fit_eval_and_predict_methods_on_dataset( - self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = get_model() - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics) - dataset = get_dataset(distribution) +@test_utils.run_all_without_tensor_float_32( + "Uses Dense layers, which call matmul" +) +class TestDistributionStrategyWithKerasModels( + tf.test.TestCase, parameterized.TestCase +): + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_distribution_strategy_on_sequential_model(self, distribution): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) + model = simple_sequential_model() + loss = "mse" + model.compile(optimizer, loss) + + inputs = np.zeros((20, 10), np.float32) + targets = np.zeros((20, 2), np.float32) + + model.fit(inputs, targets, epochs=1, batch_size=10) + model.predict(inputs, batch_size=10) + model.evaluate(inputs, targets, batch_size=10) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations() + ) + def test_distribution_strategy_on_functional_model(self, distribution): + with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) + model = get_model() + loss = "mse" + model.compile(optimizer, loss) + + inputs = np.zeros((64, 3), dtype=np.float32) + targets = np.zeros((64, 4), dtype=np.float32) + + model.fit(inputs, targets, epochs=1) + model.predict(inputs) + model.evaluate(inputs, targets) - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - model.evaluate(dataset, steps=2, verbose=1) - model.predict(get_predict_dataset(distribution), steps=2) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_distributed_dataset(self, distribution): + with distribution.scope(): + + class CBCounter(keras.callbacks.Callback): + def __init__(self): + self.epochs = 0 + self.train_batches = 0 + self.test_batches = 0 + + def on_epoch_end(self, batch, logs=None): + self.epochs += 1 + + def on_train_batch_end(self, batch, logs=None): + self.train_batches += 1 + + def on_test_batch_end(self, batch, logs=None): + self.test_batches += 1 + + model = 
keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + cb_counter = CBCounter() + + x, y = np.ones((100, 10)), np.ones((100, 1)) + ds = tf.data.Dataset.from_tensor_slices((x, y)) + ds = ds.batch(10).repeat(2) + ds = distribution.experimental_distribute_dataset(ds) + + val_ds = tf.data.Dataset.from_tensor_slices((x, y)) + val_ds = val_ds.batch(20) + val_ds = distribution.experimental_distribute_dataset(val_ds) + + model.fit( + ds, + steps_per_epoch=10, + validation_data=val_ds, + validation_steps=5, + epochs=2, + callbacks=[cb_counter], + ) + + self.assertEqual(cb_counter.train_batches, 20) + self.assertEqual(cb_counter.test_batches, 10) + self.assertEqual(cb_counter.epochs, 2) + + # Check for `steps_per_epoch`. + if distribution.num_replicas_in_sync > 1: + with self.assertRaisesRegex( + ValueError, "distributed dataset, you must specify" + ): + model.fit(ds, epochs=2) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_distributed_datasets_from_function(self, distribution): + with distribution.scope(): + + class CBCounter(keras.callbacks.Callback): + def __init__(self): + self.epochs = 0 + self.train_batches = 0 + self.test_batches = 0 + + def on_epoch_end(self, batch, logs=None): + self.epochs += 1 + + def on_train_batch_end(self, batch, logs=None): + self.train_batches += 1 + + def on_test_batch_end(self, batch, logs=None): + self.test_batches += 1 + + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + cb_counter = CBCounter() + + def make_dataset(_): + x, y = np.ones((100, 10)), np.ones((100, 1)) + ds = tf.data.Dataset.from_tensor_slices((x, y)) + ds = ds.batch(5).repeat() + return ds + + ds = distribution.distribute_datasets_from_function(make_dataset) + val_ds = distribution.distribute_datasets_from_function( + make_dataset + ) + + model.fit( + ds, + steps_per_epoch=10, + validation_data=val_ds, + validation_steps=5, + epochs=2, + callbacks=[cb_counter], + ) + + self.assertEqual(cb_counter.train_batches, 20) + self.assertEqual(cb_counter.test_batches, 10) + self.assertEqual(cb_counter.epochs, 2) + + # Check for `steps_per_epoch`. 
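Background for the `steps_per_epoch` check below: once a dataset has gone through experimental_distribute_dataset, Keras can no longer infer a per-epoch length, so fit() needs an explicit steps_per_epoch whenever there is more than one replica; a minimal sketch (assuming a machine where MirroredStrategy finds at least one device):

import numpy as np
import tensorflow as tf
from tensorflow import keras

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = keras.Sequential([keras.layers.Dense(1)])
    model.compile("sgd", "mse")

ds = tf.data.Dataset.from_tensor_slices(
    (np.ones((100, 10)), np.ones((100, 1)))
).batch(10)
dist_ds = strategy.experimental_distribute_dataset(ds)

# Omitting steps_per_epoch here raises a ValueError with >1 replica.
model.fit(dist_ds, steps_per_epoch=10, epochs=1, verbose=0)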
+ if distribution.num_replicas_in_sync > 1: + with self.assertRaisesRegex( + ValueError, "distributed dataset, you must specify" + ): + model.fit(ds, epochs=2) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_host_training_loop(self, distribution): + if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): + self.skipTest("b/172032817") + with distribution.scope(): + inputs = keras.Input((10, 10, 3)) + x = keras.layers.Conv2D(3, kernel_size=3)(inputs) + x = keras.layers.Flatten()(x) + outputs = keras.layers.Dense(1)(x) + model = keras.Model(inputs, outputs) + + model.compile("sgd", "mse", steps_per_execution=10) + + bc = BatchCountingCB() + x, y = np.ones((100, 10, 10, 3)), np.ones((100, 1)) + model.fit(x, y, batch_size=2, epochs=1, callbacks=[bc]) + self.assertEqual(bc.train_begin_batches, [0, 10, 20, 30, 40]) + self.assertEqual(bc.train_end_batches, [9, 19, 29, 39, 49]) + + model.evaluate(x, y, batch_size=2, callbacks=[bc]) + self.assertEqual(bc.test_begin_batches, [0, 10, 20, 30, 40]) + self.assertEqual(bc.test_end_batches, [9, 19, 29, 39, 49]) + + model.predict(x, batch_size=2, callbacks=[bc]) + self.assertEqual(bc.predict_begin_batches, [0, 10, 20, 30, 40]) + self.assertEqual(bc.predict_end_batches, [9, 19, 29, 39, 49]) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_host_training_loop_last_partial_execution(self, distribution): + if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): + self.skipTest("b/172032817") + with distribution.scope(): + inputs = keras.Input(10) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + + model.compile("sgd", "mse", steps_per_execution=20) + + bc = BatchCountingCB() + x, y = np.ones((100, 10)), np.ones((100, 1)) + model.fit(x, y, batch_size=2, epochs=1, callbacks=[bc]) + self.assertEqual(bc.train_begin_batches, [0, 20, 40]) + self.assertEqual(bc.train_end_batches, [19, 39, 49]) + + model.evaluate(x, y, batch_size=2, callbacks=[bc]) + self.assertEqual(bc.test_begin_batches, [0, 20, 40]) + self.assertEqual(bc.test_end_batches, [19, 39, 49]) + + model.predict(x, batch_size=2, callbacks=[bc]) + self.assertEqual(bc.predict_begin_batches, [0, 20, 40]) + self.assertEqual(bc.predict_end_batches, [19, 39, 49]) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_host_training_loop_dataset_unknown_size(self, distribution): + if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): + self.skipTest("b/172032817") + with distribution.scope(): + inputs = keras.Input(10) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + + model.compile("sgd", "mse", steps_per_execution=20) - @tf.__internal__.distribute.combinations.generate(strategy_and_optimizer_combinations()) - def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer): - with self.cached_session(): + x, y = np.ones((100, 10)), np.ones((100, 1)) + ds = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) + ds = ds.filter(lambda *args, **kwargs: True) # Makes the size UNKNOWN. 
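These host-training-loop tests hinge on compile(..., steps_per_execution=N), which packs N steps into a single tf.function call, so per-batch callbacks fire only at execution boundaries; a minimal sketch of the cadence asserted above:

import numpy as np
from tensorflow import keras

class Recorder(keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.begins = []

    def on_train_batch_begin(self, batch, logs=None):
        self.begins.append(batch)

model = keras.Sequential([keras.layers.Dense(1, input_shape=(10,))])
model.compile("sgd", "mse", steps_per_execution=10)

rec = Recorder()
x, y = np.ones((100, 10)), np.ones((100, 1))
model.fit(x, y, batch_size=2, epochs=1, verbose=0, callbacks=[rec])
# 50 batches run 10 at a time -> begin fires at 0, 10, 20, 30, 40.
assert rec.begins == [0, 10, 20, 30, 40]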
+ bc = BatchCountingCB() + + with self.assertRaisesRegex(ValueError, "steps_per_execution"): + model.fit(ds, epochs=2, callbacks=[bc]) + + train_ds = ds.repeat(2) + model.fit(train_ds, steps_per_epoch=50, epochs=2, callbacks=[bc]) + self.assertEqual(bc.train_begin_batches, [0, 20, 40, 0, 20, 40]) + self.assertEqual(bc.train_end_batches, [19, 39, 49, 19, 39, 49]) + + with self.assertRaisesRegex(ValueError, "steps_per_execution"): + model.evaluate(ds, callbacks=[bc]) + + test_ds = ds.repeat(2) + model.evaluate(test_ds, steps=50, callbacks=[bc]) + self.assertEqual(bc.test_begin_batches, [0, 20, 40]) + self.assertEqual(bc.test_end_batches, [19, 39, 49]) + + predict_ds = ds.repeat(2) + model.predict(predict_ds, steps=50, callbacks=[bc]) + self.assertEqual(bc.predict_begin_batches, [0, 20, 40]) + self.assertEqual(bc.predict_end_batches, [19, 39, 49]) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_host_training_loop_truncate_to_epoch(self, distribution): + if isinstance(distribution, tf.distribute.MultiWorkerMirroredStrategy): + self.skipTest("b/172032817") + with distribution.scope(): + inputs = keras.Input(10) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + + model.compile("sgd", "mse", steps_per_execution=500) - with distribution.scope(): + x, y = np.ones((100, 10)), np.ones((100, 1)) + bc = BatchCountingCB() + model.fit(x, y, batch_size=2, epochs=2, callbacks=[bc]) + self.assertEqual(bc.train_begin_batches, [0, 0]) + self.assertEqual(bc.train_end_batches, [49, 49]) + + x, y = np.ones((50, 10)), np.ones((50, 1)) + model.evaluate(x, y, batch_size=2, callbacks=[bc]) + self.assertEqual(bc.test_begin_batches, [0]) + self.assertEqual(bc.test_end_batches, [24]) + + x = np.ones((50, 10)) + model.predict(x, batch_size=2, callbacks=[bc]) + self.assertEqual(bc.predict_begin_batches, [0]) + self.assertEqual(bc.predict_end_batches, [24]) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_gradient_clipping(self, distribution): + class MyLayer(keras.layers.Layer): + def build(self, _): + self.v1 = tf.Variable(1.0) + self.v2 = tf.Variable(1.0) + + def call(self, x): + return 3 * self.v1 - 3 * self.v2 + + x, y = np.ones((10, 1)), np.ones((10, 1)) + + with distribution.scope(): + layer = MyLayer() + model = keras.Sequential([layer]) + optimizer = gradient_descent_keras.SGD( + 1.0, clipnorm=2.0, clipvalue=2.0 + ) + model.compile(optimizer, "mae") + + if isinstance( + distribution, + ( + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + with self.assertRaisesRegex(ValueError, "not supported"): + model.fit(x, y, batch_size=10, epochs=1) + else: + model.fit(x, y, batch_size=10, epochs=1) + self.assertAllClose(self.evaluate(layer.v1), 3.0) + self.assertAllClose(self.evaluate(layer.v2), -1.0) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_custom_gradient_transformation(self, distribution): + if isinstance( + distribution, + ( + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + self.skipTest("Not supported with `CentralStorageStrategy`") + + class MyLayer(keras.layers.Layer): + def 
build(self, _): + self.v1 = tf.Variable(1.0) + self.v2 = tf.Variable(-1.0) + + def call(self, x): + return x + self.v1 + self.v2 + + def custom_transform(grads_and_vars): + # Always set gradients to 1. + return [(tf.ones_like(g), v) for g, v in grads_and_vars] + + x, y = np.ones((10, 1)), np.ones((10, 1)) + + with distribution.scope(): + layer = MyLayer() + model = keras.Sequential([layer]) + optimizer = gradient_descent_keras.SGD( + 1.0, gradient_transformers=[custom_transform] + ) + model.compile(optimizer, "mae") - model = get_model() - loss = 'mse' - model.compile( - optimizer(), - loss) - - dataset = get_dataset(distribution) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - model.evaluate(dataset, steps=2, verbose=1) - model.predict(get_predict_dataset(distribution), steps=2) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.one_device_strategy - ], - mode=['graph', 'eager'])) - def test_dataset_wrong_input_shape(self, distribution, mode): - if mode == 'graph': - self.skipTest( - 'TODO(b/120943676, b/120957836): Re-enable for graph once the ' - 'validation code is restored.') - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(learning_rate=0.001) - model = get_model() - loss = 'mse' - model.compile( - optimizer, - loss) - - # Wrong input shape - inputs = np.zeros((10, 5), dtype=np.float32) - targets = np.zeros((10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - with self.assertRaisesRegex(ValueError, 'is incompatible with'): - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu - ], - mode=['graph', 'eager'])) - def test_dataset_external_batch_input_validation( - self, distribution): - with self.cached_session(): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(learning_rate=0.001) - model = get_model() - loss = 'mse' - model.compile( - optimizer, - loss) - - # Batching is done outside tf.data's `batch` - inputs = np.zeros((100, 10, 3), dtype=np.float32) - targets = np.zeros((100, 10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus - ], - mode=['graph', 'eager'])) - def test_learning_phase_value(self, distribution): - # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare - # meaningful values. Currently we don't pass the learning phase if the - # Lambda layer uses the learning phase. 
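The expected values in test_gradient_clipping follow from clipping a magnitude-3 gradient down to 2: the layer outputs 3*v1 - 3*v2 = 0, the MAE target is 1, so dL/dv1 = -3 and dL/dv2 = 3; clipnorm=2.0/clipvalue=2.0 cap both at magnitude 2, and one SGD step at learning rate 1 then moves v1 to 3.0 and v2 to -1.0. A standalone check of that arithmetic:

import tensorflow as tf

v1, v2 = tf.Variable(1.0), tf.Variable(1.0)
opt = tf.keras.optimizers.SGD(1.0, clipnorm=2.0, clipvalue=2.0)

with tf.GradientTape() as tape:
    y_pred = 3.0 * v1 - 3.0 * v2           # = 0
    loss = tf.abs(y_pred - 1.0)            # MAE against target 1
grads = tape.gradient(loss, [v1, v2])      # [-3.0, 3.0]
opt.apply_gradients(zip(grads, [v1, v2]))  # clipped to [-2.0, 2.0]
assert (v1.numpy(), v2.numpy()) == (3.0, -1.0)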
- with self.cached_session(): - with distribution.scope(): - x = keras.layers.Input(shape=(1,), name='input') - y = keras.layers.Dense(1, kernel_initializer='ones')(x) - z = keras.layers.Dropout(0.9999)(y) - model = keras.Model(x, z) - initial_weights = model.get_weights() - - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.005) - loss = 'mse' - metrics = ['acc'] - model.compile( - optimizer, - loss, - metrics=metrics) - - batch_size = 8 - if isinstance(distribution, (tf.distribute.MirroredStrategy, - tf.compat.v1.distribute.MirroredStrategy)): - # MirroredStrategy uses global batch size. - batch_size = 8 * distribution.num_replicas_in_sync - - inputs = np.ones((10, 1), dtype=np.float32) - targets = np.ones((10, 1), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat().batch(batch_size) - hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1) - self.assertAlmostEqual(hist.history['acc'][0], 0, 0) - - with distribution.scope(): - model.set_weights(initial_weights) - # TODO(psv/anjalisridhar): Enable these lines after we fix b/117431185. - # evaluate_output = model.evaluate(dataset, steps=20) - # self.assertAlmostEqual(evaluate_output[1], 1, 0) - - inputs = np.ones((10, 1), dtype=np.float32) - predict_dataset = tf.data.Dataset.from_tensor_slices(inputs) - - predict_dataset = predict_dataset.repeat().batch(batch_size) - output = model.predict(predict_dataset, steps=10) - # `predict` runs for 10 steps - ref_output = np.ones((160, 1), dtype=np.float32) - self.assertArrayNear(output, ref_output, 1e-1) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def testOptimizerWithCallbacks(self, distribution): - with self.cached_session(): - with distribution.scope(): - model = get_model() - optimizer = gradient_descent_keras.SGD(0.01) - loss = 'mse' + model.fit(x, y, batch_size=10, epochs=1) + self.assertAllClose(self.evaluate(layer.v1), 0.0) + self.assertAllClose(self.evaluate(layer.v2), -2.0) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + all_strategy_combinations_minus_default() + ) + ) + def test_distribution_strategy_one_dimensional(self, distribution): + with distribution.scope(): + inp = keras.layers.Input(shape=(10,)) + out = keras.layers.Dense(3, activation="softmax")(inp) + model = keras.Model(inputs=[inp], outputs=[out]) + model.compile( + optimizer="rmsprop", + loss="sparse_categorical_crossentropy", + metrics=["sparse_categorical_accuracy"], + ) + + x = np.random.random((64, 10)).astype("float32") + y = np.random.randint(3, size=64) + + model.fit(x, y, epochs=1, steps_per_epoch=2) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + ], + mode=["graph", "eager"], + reduction=[ + losses_utils.ReductionV2.AUTO, + losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE, + losses_utils.ReductionV2.SUM, + ], + ) + ) + def test_distribution_strategy_with_loss_reduction_types( + self, distribution, reduction + ): + np.random.seed(_RANDOM_SEED) + + def _get_model(): + inputs = keras.Input((10,)) + x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) + x2 = keras.layers.Dense(10, kernel_initializer="zeros")(x1) + outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) + 
model = keras.Model(inputs, outputs) + return model + + x = np.random.random((64, 10)) + y = np.random.random((64, 1)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.batch(32) + + model = _get_model() model.compile( - optimizer, - loss) - - dataset = get_dataset(distribution) - - def schedule(_): - return 0.001 - - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - callbacks=[keras.callbacks.LearningRateScheduler(schedule)]) - self.assertAllClose(0.001, keras.backend.get_value(model.optimizer.lr)) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(tpu_strategy_combinations_graph_only(), - tf.__internal__.test.combinations.combine(batch_size=[4, 6]))) - def test_evaluate_with_dataset_with_partial_batch(self, distribution, - batch_size): - with self.cached_session(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - - with distribution.scope(): - model_with_ds_strategy = get_model() - model_with_ds_strategy.compile(optimizer, loss, metrics=metrics) - - cpu_model = get_model() - cpu_model.compile(optimizer, loss, metrics=metrics) - - x = np.random.random((10, 3)).astype('float32') - y = np.random.random((10, 4)).astype('float32') - dataset = tf.data.Dataset.from_tensor_slices((x, y)) - - # As sample size is 10, we make the last batch a partial batch. - cpu_model.set_weights(model_with_ds_strategy.get_weights()) - dataset_with_partial_batch = dataset.batch(batch_size) - - # We don't compare the loss, as loss is currently not computed as a - # metric in Keras; the loss value is inaccurate for the last partial - # batch because its samples carry more weight. - steps = np.ceil(10.0 / batch_size) - self.assertAllClose( - model_with_ds_strategy.evaluate( - dataset_with_partial_batch, steps=steps)[1:], - cpu_model.evaluate(dataset_with_partial_batch, steps=steps)[1:], - atol=1e-5, - rtol=1e-5) - self.assertAllClose( - model_with_ds_strategy.evaluate(dataset_with_partial_batch)[1:], - cpu_model.evaluate(dataset_with_partial_batch)[1:], - atol=1e-5, - rtol=1e-5) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - tpu_strategy_combinations_graph_only())) - def test_predict_with_dataset_with_partial_batch( - self, distribution): - with self.cached_session(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - - with distribution.scope(): - model_with_ds_strategy = get_model() - model_with_ds_strategy.compile( - optimizer, - loss) - - cpu_model = get_model() - cpu_model.compile(optimizer, loss) - - inputs = np.random.random((10, 3)).astype(np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs)) - - # As sample size is 10, we batch by 4 so that the last batch is - # a partial batch.
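# Sketch of the partial-batch arithmetic these tests use (illustrative,
# not from the diff): with 10 samples, `batch(4)` yields batches of
# 4, 4 and 2, so evaluation/prediction needs ceil(10 / 4) = 3 steps.
import tensorflow as tf

sizes = [int(b.shape[0]) for b in tf.data.Dataset.range(10).batch(4)]
assert sizes == [4, 4, 2]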
- dataset_with_partial_batch = dataset.batch(4) - cpu_model.set_weights(model_with_ds_strategy.get_weights()) - - self.assertAllClose( - model_with_ds_strategy.predict(dataset_with_partial_batch, steps=3), - cpu_model.predict(dataset_with_partial_batch, steps=3), - atol=1e-5, - rtol=1e-5) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - tpu_strategy_combinations_graph_only())) - def test_predict_multi_output_model_with_dataset_with_partial_batch( - self, distribution): - with self.cached_session(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - - with distribution.scope(): - model_with_ds_strategy = simple_multi_inputs_multi_outputs_model() - model_with_ds_strategy.compile( - optimizer, - loss) - - cpu_model = simple_multi_inputs_multi_outputs_model() - cpu_model.compile(optimizer, loss) - - input_data, _ = get_multi_inputs_multi_outputs_data() - input_dict = { - 'input_a': input_data['input_a'], - 'input_b': input_data['input_b'], - } - - dataset = tf.data.Dataset.from_tensor_slices(input_dict) - - # As sample size is 200, we batch by 18 using 12 steps per epoch so - # that the last batch is a partial batch. - dataset_with_partial_batch = dataset.batch(18) - cpu_model.set_weights(model_with_ds_strategy.get_weights()) - - self.assertAllClose( - model_with_ds_strategy.predict(dataset_with_partial_batch, steps=12), - cpu_model.predict(dataset_with_partial_batch, steps=12), - atol=1e-4, - rtol=1e-4) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations_minus_default()) - def test_match_model_input_matches_with_dataset_tensors(self, distribution): - - def _create_model_input_output_tensors(): - input_a = keras.layers.Input(shape=(16,), name='z_input_sorted_last') - input_b = keras.layers.Input(shape=(32,), name='a_input_sorted_first') - intermediate_a = keras.layers.Dense(10)(input_a) - intermediate_b = keras.layers.Dense(10)(input_b) - merged = keras.layers.Add()([intermediate_a, intermediate_b]) - output = keras.layers.Dense(2)(merged) - return input_a, input_b, output - - input_dict = { - 'z_input_sorted_last': np.random.rand(32, 16).astype(np.float32), - 'a_input_sorted_first': np.random.rand(32, 32).astype(np.float32) - } - target = np.ones((32, 2), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((input_dict, target)) - dataset = dataset.batch(4, drop_remainder=True) - - with self.cached_session(): - with distribution.scope(): - input_a, input_b, output = _create_model_input_output_tensors() - # `input_a`, whose input name comes last in alphanumeric - # order, is the first of the model's input layers. If tensors - # from `input_dict` are blindly flattened and passed to the model - # inputs incorrectly, the `input_a` input layer - # would match tensor `a_input_sorted_first`, causing a - # shape mismatch.
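# Sketch of the ordering pitfall described above (illustrative only):
# tf.nest flattens dicts in sorted-key order, so blindly zipping the
# flattened tensors with model inputs that are in construction order
# would pair `z_input_sorted_last` data with the wrong input layer.
import tensorflow as tf

flat = tf.nest.flatten({"z_input_sorted_last": 16, "a_input_sorted_first": 32})
assert flat == [32, 16]  # sorted-key order, not insertion order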
- model_with_array_input = keras.models.Model( - inputs=[input_a, input_b], outputs=output) - model_with_array_input.compile('sgd', 'mse') - model_weights = model_with_array_input.get_weights() - model_with_array_input_fit = model_with_array_input.fit( - dataset, steps_per_epoch=1, epochs=1).history - - input_a, input_b, output = _create_model_input_output_tensors() - model_with_dict_input = keras.models.Model( - inputs={ - 'z_input_sorted_last': input_a, - 'a_input_sorted_first': input_b, - }, - outputs=output) - model_with_dict_input.compile('sgd', 'mse') - model_with_dict_input.set_weights(model_weights) - model_with_dict_input_fit = model_with_dict_input.fit( - dataset, steps_per_epoch=1, epochs=1).history - self.assertAllClose( - model_with_dict_input_fit, - model_with_array_input_fit, - atol=1e-4, - rtol=1e-4) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategies_minus_tpu, mode=['graph', 'eager']) + - tf.__internal__.test.combinations.combine( - distribution=multi_worker_mirrored_strategies, mode=['eager'])) - def test_dataset_with_sample_weights(self, distribution): - with self.cached_session(), distribution.scope(): - model = get_sample_weights_model() - optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - model.compile( - optimizer, - loss) - - inputs = np.array([[0], [1], [2], [3]], np.float32) - targets = np.array([[2], [4], [6], [8]], np.float32) - sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) - ds = tf.data.Dataset.from_tensor_slices( - (inputs, targets, sample_weights)).batch(2) - result = model.evaluate(ds, verbose=1) - - # The per sample loss is multiplied by the corresponding sample weight. - # The average of these weighted losses is the return value of the - # `evaluate` call. For example, in the test above the average weighted - # loss is calculated in the following manner: - # batch_1 = (((2-0)^2) * 0.25 + ((4-1)^2) * 0.5) / 2 = 5.5 / 2 = 2.75 - # batch_2 = (((6-2)^2 * 0.75) + ((8-3)^2 * 1)) / 2 = 37 / 2 = 18.5 - # final result = (batch_1 + batch_2) / 2 = 10.625. - # The first time we divide by number of input samples and the second time - # we divide by number of steps/batches that the loss is aggregated over. 
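# The weighted-loss arithmetic spelled out above, reproduced with plain
# NumPy (a sketch, independent of the test code):
import numpy as np

inputs = np.array([0.0, 1.0, 2.0, 3.0])
targets = np.array([2.0, 4.0, 6.0, 8.0])
weights = np.array([0.25, 0.5, 0.75, 1.0])
weighted_sq_err = weights * (targets - inputs) ** 2         # per-sample terms
per_batch = weighted_sq_err.reshape(2, 2).sum(axis=1) / 2   # [2.75, 18.5]
assert np.isclose(per_batch.mean(), 10.625)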
- self.assertAllClose(result, 10.625) - - # We now test without passing sample_weights: - # batch_1 = (((2-0)^2) + ((4-1)^2)) / 2 = 13 / 2 = 6.5 - # batch_2 = (((6-2)^2) + ((8-3)^2)) / 2 = 41 / 2 = 20.5 - # final result = (batch_1 + batch_2) / 2 = 27 / 2 = 13.5 - ds = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(2) - result = model.evaluate(ds, verbose=1) - self.assertAllClose(result, 13.5) - - -class TestDistributionStrategyWithDatasetsFile(tf.test.TestCase, - parameterized.TestCase): - - def setUp(self): - super().setUp() - self.input_file_name = os.path.join(self.get_temp_dir(), 'input.tfrecord') - inputs = np.zeros((20, 3), dtype=np.float32) - input_dataset = tf.data.Dataset.from_tensor_slices(inputs) - input_dataset = input_dataset.map(tf.io.serialize_tensor) - writer = tf.data.experimental.TFRecordWriter(self.input_file_name) - writer.write(input_dataset) - - # TODO(wxinyi): add a multi-worker test for TPU - @tf.__internal__.distribute.combinations.generate(multi_worker_strategy_combinations_eager_only()) - def test_predict_on_dataset_shard_options_file_multi_worker_mirrored( - self, distribution, mode): - # This test verifies that we successfully switch the auto_shard_policy of an - # input dataset inside model.predict with MultiWorkerMirroredStrategy to - # AutoShardPolicy.DATA. Since there is only one input file for multiple - # workers, AutoShardPolicy.AUTO or AutoShardPolicy.FILE will lead to an - # error. However, since we switch to AutoShardPolicy.DATA in model.predict, - # no error is raised. - del mode - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(0.001) - model = get_model() - loss = 'mse' - model.compile(optimizer, loss) - - dataset = tf.data.TFRecordDataset(self.input_file_name) - dataset = dataset.map(lambda x: tf.io.parse_tensor(x, tf.float32)) - - dummy_op = lambda inp: True - - dataset = dataset.filter(dummy_op).batch(8, drop_remainder=True) - - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = \ - tf.data.experimental.AutoShardPolicy.FILE - dataset = dataset.with_options(options) - - model.predict(dataset, steps=1) - - -class TestRegularizerLoss(tf.test.TestCase, parameterized.TestCase): - - class IdentityRegularizer(keras.regularizers.Regularizer): - - def __call__(self, x): - return tf.identity(x) - - class AddLayer(keras.layers.Layer): - - def build(self, _): - self.v = self.add_weight( - 'v', (), - initializer='ones', - regularizer=TestRegularizerLoss.IdentityRegularizer()) - - def call(self, inputs): - return inputs + self.v - - @staticmethod - def loss_fn(_, y_pred): - return tf.reduce_mean(y_pred) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(all_strategy_combinations_minus_default())) - def test_regularizer_loss(self, distribution): - batch_size = 2 - if not distributed_training_utils.global_batch_size_supported(distribution): - batch_size //= distribution.num_replicas_in_sync - - # Given an input x, which is always 1, and variable v, this model computes - # Loss=x+v+regularizer_loss, where regularizer_loss=v and the variable is - # initialized to 1. Therefore, this model computes Loss=1+2v, and so the - # gradient dLoss/dv = 2. This gradient of 2 is averaged over all examples - # in a batch and then multiplied by the learning rate of 1. As a result, - # the model update for one batch should subtract 2 from v, resulting in v - # being -1.
If the regularizer loss is not scaled correctly by number of - # replicas, the variable value will be incorrect when number of replicas - # >1. For example, it will be -2 if num replicas = 2. - with distribution.scope(): - x = keras.layers.Input(shape=(1,), batch_size=batch_size) - y = TestRegularizerLoss.AddLayer()(x) - model = keras.models.Model(inputs=x, outputs=y) - opt = gradient_descent_keras.SGD(1.) - model.compile( - opt, - loss=TestRegularizerLoss.loss_fn) - model.fit( - x=np.array([[1.], [1.]], dtype=np.float32), - y=np.array([[1.], [1.]], dtype=np.float32), - batch_size=batch_size) - v = model.get_weights()[0] - self.assertEqual(-1.0, v) + "sgd", loss=keras.losses.MeanSquaredError(reduction=reduction) + ) + history = model.fit(dataset, steps_per_epoch=2, epochs=1, shuffle=False) + + with distribution.scope(): + ds_model = _get_model() + ds_model.compile( + "sgd", loss=keras.losses.MeanSquaredError(reduction=reduction) + ) + ds_history = ds_model.fit( + dataset, steps_per_epoch=2, epochs=1, shuffle=False + ) + self.assertArrayNear( + history.history["loss"], ds_history.history["loss"], 1e-5 + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + all_strategy_combinations_minus_default() + ) + ) + def test_distribution_strategy_with_symbolic_add_loss( + self, mode, distribution + ): + def _make_model_with_add_loss(): + inputs = keras.Input((10,)) + x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) + x2 = keras.layers.Dense(10, kernel_initializer="zeros")(x1) + outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) + model = keras.Model(inputs, outputs) + model.add_loss(tf.reduce_mean(x1)) + model.add_loss(tf.reduce_mean(outputs)) + return model + + x = np.ones((64, 10)).astype("float32") + + model = _make_model_with_add_loss() + model.compile("sgd") + history = model.fit(x, epochs=1) + + with distribution.scope(): + ds_model = _make_model_with_add_loss() + ds_model.compile("sgd") + ds_history = ds_model.fit(x, epochs=1) + + self.assertAllClose(history.history, ds_history.history) + + # TODO(omalleyt): Investigate flakiness and re-enable.
+ @tf.__internal__.distribute.combinations.generate( + all_strategy_minus_default_and_tpu_combinations() + ) + def DISABLED_test_distribution_strategy_with_callable_add_loss( + self, distribution + ): + def _make_model(): + inputs = keras.Input((10,)) + x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) + x2 = keras.layers.Dense(10, kernel_initializer="zeros")(x1) + d = keras.layers.Dense(1, kernel_initializer="zeros") + outputs = d(x2) + model = keras.Model(inputs, outputs) + model.add_loss(lambda: 100.0 * tf.reduce_mean(d.kernel)) + return model + + x = np.ones((64, 10)).astype("float32") + y = np.ones((64, 1)).astype("float32") + + model = _make_model() + self.assertLen(model.losses, 1) + + model.compile("sgd", "mse") + history = model.fit(x, y, steps_per_epoch=2, epochs=1) + + with distribution.scope(): + ds_model = _make_model() + self.assertLen(ds_model.losses, 1) + ds_model.compile("sgd", "mse") + ds_history = ds_model.fit(x, y, steps_per_epoch=2, epochs=1) + + self.assertAllClose(history.history, ds_history.history) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + all_strategy_minus_default_and_tpu_combinations() + ) + ) + def test_distribution_strategy_with_add_metric_in_call(self, distribution): + class Bias(keras.layers.Layer): + def build(self, input_shape): + self.bias = self.add_weight( + name="bias", initializer="zeros", shape=() + ) + + def call(self, inputs): + self.add_metric( + tf.reduce_mean(inputs), name="bias", aggregation="mean" + ) + return inputs + self.bias + + def _make_model_with_add_metric(): + inputs = keras.Input((10,)) + x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) + x2 = Bias()(x1) + outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) + model = keras.Model(inputs, outputs) + return model + + x = np.ones((64, 10)).astype("float32") + y = np.ones((64, 1)).astype("float32") + + model = _make_model_with_add_metric() + self.assertLen(model.metrics, 1) + + model.compile("sgd", "mse") + history = model.fit( + x, y, validation_data=(x, y), validation_steps=2, epochs=2 + ) + + with distribution.scope(): + ds_model = _make_model_with_add_metric() + self.assertLen(ds_model.metrics, 1) + ds_model.compile("sgd", "mse") + ds_history = ds_model.fit( + x, y, validation_data=(x, y), validation_steps=2, epochs=2 + ) + # includes stateful loss metric in eager. 
+ metrics_len = 2 if tf.executing_eagerly() else 1 + self.assertLen(ds_model.metrics, metrics_len) + + self.assertAllClose(history.history, ds_history.history) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.one_device_strategy, + tf.__internal__.distribute.combinations.one_device_strategy_gpu, + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + ], + mode=["eager"], + ) + ) + def test_distribution_strategy_with_add_metric_object(self, distribution): + class Bias(keras.layers.Layer): + def build(self, input_shape): + self.bias = self.add_weight( + name="bias", initializer="zeros", shape=() + ) + self.mean = keras.metrics.Mean(name="mean") + + def call(self, inputs): + self.add_metric(self.mean(inputs)) + return inputs + self.bias + + def _make_model_with_add_metric_object(): + inputs = keras.Input((10,)) + x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) + x2 = Bias()(x1) + outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x2) + model = keras.Model(inputs, outputs) + return model + + x = np.ones((64, 10)).astype("float32") + y = np.ones((64, 1)).astype("float32") + + model = _make_model_with_add_metric_object() + self.assertLen(model.metrics, 1) + + model.compile("sgd", "mse") + history = model.fit( + x, y, validation_data=(x, y), validation_steps=2, epochs=2 + ) + + with distribution.scope(): + ds_model = _make_model_with_add_metric_object() + self.assertLen(ds_model.metrics, 1) + ds_model.compile("sgd", "mse") + ds_history = ds_model.fit( + x, y, validation_data=(x, y), validation_steps=2, epochs=2 + ) + # includes stateful loss metric in eager. + metrics_len = 2 if tf.executing_eagerly() else 1 + self.assertLen(ds_model.metrics, metrics_len) + + self.assertAllClose(history.history, ds_history.history) + + @tf.__internal__.distribute.combinations.generate( + # TODO(phillypham): Why does validation_steps > 1 not work on TPUs? + tf.__internal__.test.combinations.times( + all_strategy_minus_default_and_tpu_combinations() + ) + ) + def test_distribution_strategy_with_add_metric_outside_call( + self, distribution + ): + def _make_model_with_add_metric(): + inputs = keras.Input((10,)) + x1 = keras.layers.Dense(10, kernel_initializer="zeros")(inputs) + outputs = keras.layers.Dense(1, kernel_initializer="zeros")(x1) + model = keras.Model(inputs, outputs) + model.add_metric( + tf.reduce_mean(x1), name="mid_mean", aggregation="mean" + ) + return model + + x = np.ones((64, 10)).astype("float32") + y = np.ones((64, 1)).astype("float32") + + model = _make_model_with_add_metric() + self.assertLen(model.metrics, 1) + + model.compile("sgd", "mse") + history = model.fit( + x, y, validation_data=(x, y), validation_steps=2, epochs=2 + ) + + with distribution.scope(): + ds_model = _make_model_with_add_metric() + self.assertLen(ds_model.metrics, 1) + ds_model.compile("sgd", "mse") + ds_history = ds_model.fit( + x, y, validation_data=(x, y), validation_steps=2, epochs=2 + ) + # includes stateful loss metric in eager. 
+ metrics_len = 2 if tf.executing_eagerly() else 1 + self.assertLen(ds_model.metrics, metrics_len) + + self.assertAllClose(history.history, ds_history.history) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategies_minus_tpu + + multi_worker_mirrored_strategies, + mode=["eager"], + ) + ) + def test_sparse_tensor_outputs(self, distribution): + class ToSparse(keras.layers.Layer): + """Create a sparse tensor based on a given dense tensor.""" + + def call(self, inputs): + indices = tf.where(tf.not_equal(inputs, 0)) + values = tf.gather_nd(inputs, indices) + shape = tf.shape(inputs, out_type="int64") + return tf.SparseTensor(indices, values, dense_shape=shape) + + model = keras.Sequential([ToSparse()]) + + # Define some input data with additional padding. + input_data = np.array([[1, 0, 0], [2, 3, 0]]) + output = model.predict(input_data, batch_size=2) + + expected_indices = np.array([[0, 0], [1, 0], [1, 1]]) + expected_values = np.array([1, 2, 3]) + expected_dense_shape = np.array([2, 3]) + + self.assertAllEqual(output.indices, expected_indices) + self.assertAllEqual(output.values, expected_values) + self.assertAllEqual(output.dense_shape, expected_dense_shape) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategies_minus_tpu + + multi_worker_mirrored_strategies, + mode=["eager"], + ) + ) + def test_ragged_tensor_outputs(self, distribution): + class ToRagged(keras.layers.Layer): + """Create a ragged tensor based on a given dense tensor.""" + + def __init__(self, padding, ragged_rank=1, **kwargs): + super().__init__(**kwargs) + self._padding = padding + self._ragged_rank = ragged_rank + + def call(self, inputs): + return tf.RaggedTensor.from_tensor( + inputs, padding=self._padding, ragged_rank=self._ragged_rank + ) + + model = keras.Sequential([ToRagged(padding=0)]) + + # Define some input data with additional padding. + input_data = np.array([[1, 0, 0], [2, 3, 0]]) + output = model.predict(input_data, batch_size=2) + + expected_values = [[1], [2, 3]] + self.assertAllEqual(expected_values, output) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategies_minus_default_minus_tpu + + tpu_strategies + + multi_worker_mirrored_strategies, + mode=["eager"], + ) + ) + def test_correctness_of_add_loss_with_merge_call(self, distribution): + batch_size = 32 + + def _get_model(): + inputs = keras.layers.Input(shape=(1,)) + labels = keras.layers.Input(shape=(1,)) + x = keras.layers.Dense(10, activation="relu")(inputs) + y = keras.layers.Dense(1)(x) + model = keras.models.Model([inputs, labels], y) + model.add_loss(keras.losses.mean_squared_error(labels, y)) + return model + + def _get_data(): + x_train = np.random.rand(64, 1) + y_train = 3 * x_train + x_train = x_train.astype("float32") + y_train = y_train.astype("float32") + dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + dataset = dataset.batch(batch_size) + return dataset + + with distribution.scope(): + model = _get_model() + optimizer = gradient_descent_keras.SGD(0.2) + + @tf.function + def train_step(dist_inputs): + def step_fn(inputs): + with tf.GradientTape() as tape: + logits = model(inputs) + + # Invoke a merge_call() + tf.distribute.get_replica_context().merge_call( + lambda d: None + ) + + # Verify that there is only one loss on the model. 
+ assert len(model.losses) == 1 + loss_from_model = ( + tf.reduce_sum(model.losses) * 1.0 / batch_size + ) + + # Compute loss in this loop. + loss = keras.losses.mean_squared_error( + inputs[1], logits + ) + loss = tf.nn.compute_average_loss( + loss, global_batch_size=batch_size + ) + + # Verify that the loss computed in this loop is + # equivalent to the loss from the model that was added + # via add_loss. + tf.compat.v1.assert_equal(loss, loss_from_model) + + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + zip(grads, model.trainable_variables) + ) + return loss + + per_replica_losses = distribution.run( + step_fn, args=(dist_inputs,) + ) + return distribution.reduce( + tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None + ) + + dataset = distribution.experimental_distribute_dataset(_get_data()) + for _ in range(2): + for x in dataset: + train_step(x) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["graph", "eager"]) + ) + def test_unimplemented_parameter_server_strategy(self): + cluster_spec = multi_worker_testing_utils.create_in_process_cluster( + num_workers=3, num_ps=2 + ) + cluster_resolver = SimpleClusterResolver( + cluster_spec=tf.train.ClusterSpec(cluster_spec), + task_type="worker", + task_id=1, + num_accelerators={"GPU": 0}, + ) + distribution = ( + tf.compat.v1.distribute.experimental.ParameterServerStrategy( + cluster_resolver + ) + ) + + self.assertIsInstance( + distribution, + tf.compat.v1.distribute.experimental.ParameterServerStrategy, + ) - -@test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') -class TestDistributionStrategyWithKerasModels(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_distribution_strategy_on_sequential_model( - self, distribution): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(learning_rate=0.001) - model = simple_sequential_model() - loss = 'mse' - model.compile( - optimizer, - loss) - - inputs = np.zeros((20, 10), np.float32) - targets = np.zeros((20, 2), np.float32) - - model.fit(inputs, targets, epochs=1, batch_size=10) - model.predict(inputs, batch_size=10) - model.evaluate(inputs, targets, batch_size=10) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations()) - def test_distribution_strategy_on_functional_model( - self, distribution): - with distribution.scope(): - optimizer_fn = gradient_descent_keras.SGD - optimizer = optimizer_fn(learning_rate=0.001) - model = get_model() - loss = 'mse' - model.compile( - optimizer, - loss) - - inputs = np.zeros((64, 3), dtype=np.float32) - targets = np.zeros((64, 4), dtype=np.float32) - - model.fit(inputs, targets, epochs=1) - model.predict(inputs) - model.evaluate(inputs, targets) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_distributed_dataset(self, distribution): - with distribution.scope(): - - class CBCounter(keras.callbacks.Callback): - - def __init__(self): - self.epochs = 0 - self.train_batches = 0 - self.test_batches = 0 - - def on_epoch_end(self, batch, logs=None): - self.epochs += 1 - - def on_train_batch_end(self, batch, logs=None): - self.train_batches += 1 - - def on_test_batch_end(self, batch, logs=None): - self.test_batches += 1 - - model = keras.Sequential([keras.layers.Dense(1)]) - 
model.compile('sgd', 'mse') - cb_counter = CBCounter() - - x, y = np.ones((100, 10)), np.ones((100, 1)) - ds = tf.data.Dataset.from_tensor_slices((x, y)) - ds = ds.batch(10).repeat(2) - ds = distribution.experimental_distribute_dataset(ds) - - val_ds = tf.data.Dataset.from_tensor_slices((x, y)) - val_ds = val_ds.batch(20) - val_ds = distribution.experimental_distribute_dataset(val_ds) - - model.fit( - ds, - steps_per_epoch=10, - validation_data=val_ds, - validation_steps=5, - epochs=2, - callbacks=[cb_counter]) - - self.assertEqual(cb_counter.train_batches, 20) - self.assertEqual(cb_counter.test_batches, 10) - self.assertEqual(cb_counter.epochs, 2) - - # Check for `steps_per_epoch`. - if distribution.num_replicas_in_sync > 1: - with self.assertRaisesRegex(ValueError, - 'distributed dataset, you must specify'): - model.fit(ds, epochs=2) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_distributed_datasets_from_function(self, distribution): - with distribution.scope(): - - class CBCounter(keras.callbacks.Callback): - - def __init__(self): - self.epochs = 0 - self.train_batches = 0 - self.test_batches = 0 - - def on_epoch_end(self, batch, logs=None): - self.epochs += 1 - - def on_train_batch_end(self, batch, logs=None): - self.train_batches += 1 - - def on_test_batch_end(self, batch, logs=None): - self.test_batches += 1 - - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - cb_counter = CBCounter() - - def make_dataset(_): - x, y = np.ones((100, 10)), np.ones((100, 1)) - ds = tf.data.Dataset.from_tensor_slices((x, y)) - ds = ds.batch(5).repeat() - return ds - - ds = distribution.distribute_datasets_from_function(make_dataset) - val_ds = distribution.distribute_datasets_from_function(make_dataset) - - model.fit( - ds, - steps_per_epoch=10, - validation_data=val_ds, - validation_steps=5, - epochs=2, - callbacks=[cb_counter]) - - self.assertEqual(cb_counter.train_batches, 20) - self.assertEqual(cb_counter.test_batches, 10) - self.assertEqual(cb_counter.epochs, 2) - - # Check for `steps_per_epoch`. 
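# Why the explicit-steps check below exists (a sketch, independent of the
# test harness): once a dataset has been wrapped by
# `experimental_distribute_dataset`, it is no longer a `tf.data.Dataset`,
# so Keras cannot infer its length and `fit` must be told how many steps
# make up an epoch.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy(["/cpu:0"])
dist_ds = strategy.experimental_distribute_dataset(
    tf.data.Dataset.range(8).batch(4)
)
assert not isinstance(dist_ds, tf.data.Dataset)  # hence explicit steps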
- if distribution.num_replicas_in_sync > 1: - with self.assertRaisesRegex(ValueError, - 'distributed dataset, you must specify'): - model.fit(ds, epochs=2) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_host_training_loop(self, distribution): - if isinstance(distribution, - tf.distribute.MultiWorkerMirroredStrategy): - self.skipTest('b/172032817') - with distribution.scope(): - inputs = keras.Input((10, 10, 3)) - x = keras.layers.Conv2D(3, kernel_size=3)(inputs) - x = keras.layers.Flatten()(x) - outputs = keras.layers.Dense(1)(x) - model = keras.Model(inputs, outputs) - - model.compile('sgd', 'mse', steps_per_execution=10) - - bc = BatchCountingCB() - x, y = np.ones((100, 10, 10, 3)), np.ones((100, 1)) - model.fit(x, y, batch_size=2, epochs=1, callbacks=[bc]) - self.assertEqual(bc.train_begin_batches, [0, 10, 20, 30, 40]) - self.assertEqual(bc.train_end_batches, [9, 19, 29, 39, 49]) - - model.evaluate(x, y, batch_size=2, callbacks=[bc]) - self.assertEqual(bc.test_begin_batches, [0, 10, 20, 30, 40]) - self.assertEqual(bc.test_end_batches, [9, 19, 29, 39, 49]) - - model.predict(x, batch_size=2, callbacks=[bc]) - self.assertEqual(bc.predict_begin_batches, [0, 10, 20, 30, 40]) - self.assertEqual(bc.predict_end_batches, [9, 19, 29, 39, 49]) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_host_training_loop_last_partial_execution(self, distribution): - if isinstance(distribution, - tf.distribute.MultiWorkerMirroredStrategy): - self.skipTest('b/172032817') - with distribution.scope(): - inputs = keras.Input(10) - outputs = keras.layers.Dense(1)(inputs) - model = keras.Model(inputs, outputs) - - model.compile('sgd', 'mse', steps_per_execution=20) - - bc = BatchCountingCB() - x, y = np.ones((100, 10)), np.ones((100, 1)) - model.fit(x, y, batch_size=2, epochs=1, callbacks=[bc]) - self.assertEqual(bc.train_begin_batches, [0, 20, 40]) - self.assertEqual(bc.train_end_batches, [19, 39, 49]) - - model.evaluate(x, y, batch_size=2, callbacks=[bc]) - self.assertEqual(bc.test_begin_batches, [0, 20, 40]) - self.assertEqual(bc.test_end_batches, [19, 39, 49]) - - model.predict(x, batch_size=2, callbacks=[bc]) - self.assertEqual(bc.predict_begin_batches, [0, 20, 40]) - self.assertEqual(bc.predict_end_batches, [19, 39, 49]) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_host_training_loop_dataset_unknown_size(self, distribution): - if isinstance(distribution, - tf.distribute.MultiWorkerMirroredStrategy): - self.skipTest('b/172032817') - with distribution.scope(): - inputs = keras.Input(10) - outputs = keras.layers.Dense(1)(inputs) - model = keras.Model(inputs, outputs) - - model.compile('sgd', 'mse', steps_per_execution=20) - - x, y = np.ones((100, 10)), np.ones((100, 1)) - ds = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) - ds = ds.filter(lambda *args, **kwargs: True) # Makes the size UNKNOWN. 
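# Sketch of why the `filter` above hides the dataset size: tf.data cannot
# know how many elements survive an arbitrary predicate, so the cardinality
# becomes UNKNOWN and Keras can no longer size the epoch on its own.
import tensorflow as tf

ds = tf.data.Dataset.range(100).batch(2)
assert int(ds.cardinality()) == 50
assert ds.filter(lambda x: True).cardinality() == tf.data.UNKNOWN_CARDINALITY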
- bc = BatchCountingCB() - - with self.assertRaisesRegex(ValueError, 'steps_per_execution'): - model.fit(ds, epochs=2, callbacks=[bc]) - - train_ds = ds.repeat(2) - model.fit(train_ds, steps_per_epoch=50, epochs=2, callbacks=[bc]) - self.assertEqual(bc.train_begin_batches, [0, 20, 40, 0, 20, 40]) - self.assertEqual(bc.train_end_batches, [19, 39, 49, 19, 39, 49]) - - with self.assertRaisesRegex(ValueError, 'steps_per_execution'): - model.evaluate(ds, callbacks=[bc]) - - test_ds = ds.repeat(2) - model.evaluate(test_ds, steps=50, callbacks=[bc]) - self.assertEqual(bc.test_begin_batches, [0, 20, 40]) - self.assertEqual(bc.test_end_batches, [19, 39, 49]) - - predict_ds = ds.repeat(2) - model.predict(predict_ds, steps=50, callbacks=[bc]) - self.assertEqual(bc.predict_begin_batches, [0, 20, 40]) - self.assertEqual(bc.predict_end_batches, [19, 39, 49]) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_host_training_loop_truncate_to_epoch(self, distribution): - if isinstance(distribution, - tf.distribute.MultiWorkerMirroredStrategy): - self.skipTest('b/172032817') - with distribution.scope(): - inputs = keras.Input(10) - outputs = keras.layers.Dense(1)(inputs) - model = keras.Model(inputs, outputs) - - model.compile('sgd', 'mse', steps_per_execution=500) - - x, y = np.ones((100, 10)), np.ones((100, 1)) - bc = BatchCountingCB() - model.fit(x, y, batch_size=2, epochs=2, callbacks=[bc]) - self.assertEqual(bc.train_begin_batches, [0, 0]) - self.assertEqual(bc.train_end_batches, [49, 49]) - - x, y = np.ones((50, 10)), np.ones((50, 1)) - model.evaluate(x, y, batch_size=2, callbacks=[bc]) - self.assertEqual(bc.test_begin_batches, [0]) - self.assertEqual(bc.test_end_batches, [24]) - - x = np.ones((50, 10)) - model.predict(x, batch_size=2, callbacks=[bc]) - self.assertEqual(bc.predict_begin_batches, [0]) - self.assertEqual(bc.predict_end_batches, [24]) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_gradient_clipping(self, distribution): - - class MyLayer(keras.layers.Layer): - - def build(self, _): - self.v1 = tf.Variable(1.) - self.v2 = tf.Variable(1.) - - def call(self, x): - return 3 * self.v1 - 3 * self.v2 - - x, y = np.ones((10, 1)), np.ones((10, 1)) - - with distribution.scope(): - layer = MyLayer() - model = keras.Sequential([layer]) - optimizer = gradient_descent_keras.SGD(1., clipnorm=2., clipvalue=2.) - model.compile(optimizer, 'mae') - - if isinstance(distribution, - (tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - with self.assertRaisesRegex(ValueError, 'not supported'): - model.fit(x, y, batch_size=10, epochs=1) - else: - model.fit(x, y, batch_size=10, epochs=1) - self.assertAllClose(self.evaluate(layer.v1), 3.) - self.assertAllClose(self.evaluate(layer.v2), -1.) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_custom_gradient_transformation(self, distribution): - if isinstance(distribution, - (tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - self.skipTest('Not supported with `CentralStorageStrategy`') - - class MyLayer(keras.layers.Layer): - - def build(self, _): - self.v1 = tf.Variable(1.) - self.v2 = tf.Variable(-1.) 
- - def call(self, x): - return x + self.v1 + self.v2 - - def custom_transform(grads_and_vars): - # Always set gradients to 1. - return [(tf.ones_like(g), v) for g, v in grads_and_vars] - - x, y = np.ones((10, 1)), np.ones((10, 1)) - - with distribution.scope(): - layer = MyLayer() - model = keras.Sequential([layer]) - optimizer = gradient_descent_keras.SGD( - 1., gradient_transformers=[custom_transform]) - model.compile(optimizer, 'mae') - - model.fit(x, y, batch_size=10, epochs=1) - self.assertAllClose(self.evaluate(layer.v1), 0.) - self.assertAllClose(self.evaluate(layer.v2), -2.) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - all_strategy_combinations_minus_default())) - def test_distribution_strategy_one_dimensional(self, distribution): - with distribution.scope(): - inp = keras.layers.Input(shape=(10,)) - out = keras.layers.Dense(3, activation='softmax')(inp) - model = keras.Model(inputs=[inp], outputs=[out]) - model.compile( - optimizer='rmsprop', - loss='sparse_categorical_crossentropy', - metrics=['sparse_categorical_accuracy']) - - x = np.random.random((64, 10)).astype('float32') - y = np.random.randint(3, size=64) - - model.fit(x, y, epochs=1, steps_per_epoch=2) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus - ], - mode=['graph', 'eager'], - reduction=[ - losses_utils.ReductionV2.AUTO, - losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE, - losses_utils.ReductionV2.SUM - ])) - def test_distribution_strategy_with_loss_reduction_types( - self, distribution, reduction): - np.random.seed(_RANDOM_SEED) - - def _get_model(): - inputs = keras.Input((10,)) - x1 = keras.layers.Dense(10, kernel_initializer='zeros')(inputs) - x2 = keras.layers.Dense(10, kernel_initializer='zeros')(x1) - outputs = keras.layers.Dense(1, kernel_initializer='zeros')(x2) - model = keras.Model(inputs, outputs) - return model - - x = np.random.random((64, 10)) - y = np.random.random((64, 1)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)) - dataset = dataset.batch(32) - - model = _get_model() - model.compile( - 'sgd', loss=keras.losses.MeanSquaredError(reduction=reduction)) - history = model.fit(dataset, steps_per_epoch=2, epochs=1, shuffle=False) - - with distribution.scope(): - ds_model = _get_model() - ds_model.compile( - 'sgd', - loss=keras.losses.MeanSquaredError(reduction=reduction)) - ds_history = ds_model.fit( - dataset, steps_per_epoch=2, epochs=1, shuffle=False) - self.assertArrayNear(history.history['loss'], ds_history.history['loss'], - 1e-5) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - all_strategy_combinations_minus_default())) - def test_distribution_strategy_with_symbolic_add_loss( - self, mode, distribution): - - def _make_model_with_add_loss(): - inputs = keras.Input((10,)) - x1 = keras.layers.Dense(10, kernel_initializer='zeros')(inputs) - x2 = keras.layers.Dense(10, kernel_initializer='zeros')(x1) - outputs = keras.layers.Dense(1, kernel_initializer='zeros')(x2) - model = keras.Model(inputs, outputs) - model.add_loss(tf.reduce_mean(x1)) - model.add_loss(tf.reduce_mean(outputs)) - return model - - x = np.ones((64, 10)).astype('float32') - - model = _make_model_with_add_loss() - model.compile('sgd') - history = model.fit(x, epochs=1) - - with distribution.scope(): - 
ds_model = _make_model_with_add_loss() - ds_model.compile( - 'sgd') - ds_history = ds_model.fit(x, epochs=1) - - self.assertAllClose(history.history, ds_history.history) - - # TODO(omalleyt): Investigate flakiness and re-enable. - @tf.__internal__.distribute.combinations.generate(all_strategy_minus_default_and_tpu_combinations()) - def DISABLED_test_distribution_strategy_with_callable_add_loss( - self, distribution): - - def _make_model(): - inputs = keras.Input((10,)) - x1 = keras.layers.Dense(10, kernel_initializer='zeros')(inputs) - x2 = keras.layers.Dense(10, kernel_initializer='zeros')(x1) - d = keras.layers.Dense(1, kernel_initializer='zeros') - outputs = d(x2) - model = keras.Model(inputs, outputs) - model.add_loss(lambda: 100. * tf.reduce_mean(d.kernel)) - return model - - x = np.ones((64, 10)).astype('float32') - y = np.ones((64, 1)).astype('float32') - - model = _make_model() - self.assertLen(model.losses, 1) - - model.compile('sgd', 'mse') - history = model.fit(x, y, steps_per_epoch=2, epochs=1) - - with distribution.scope(): - ds_model = _make_model() - self.assertLen(ds_model.losses, 1) - ds_model.compile('sgd', 'mse') - ds_history = ds_model.fit(x, y, steps_per_epoch=2, epochs=1) - - self.assertAllClose(history.history, ds_history.history) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - all_strategy_minus_default_and_tpu_combinations())) - def test_distribution_strategy_with_add_metric_in_call( - self, distribution): - - class Bias(keras.layers.Layer): - - def build(self, input_shape): - self.bias = self.add_weight(name='bias', initializer='zeros', shape=()) - - def call(self, inputs): - self.add_metric( - tf.reduce_mean(inputs), name='bias', aggregation='mean') - return inputs + self.bias - - def _make_model_with_add_metric(): - inputs = keras.Input((10,)) - x1 = keras.layers.Dense(10, kernel_initializer='zeros')(inputs) - x2 = Bias()(x1) - outputs = keras.layers.Dense(1, kernel_initializer='zeros')(x2) - model = keras.Model(inputs, outputs) - return model - - x = np.ones((64, 10)).astype('float32') - y = np.ones((64, 1)).astype('float32') - - model = _make_model_with_add_metric() - self.assertLen(model.metrics, 1) - - model.compile('sgd', 'mse') - history = model.fit( - x, y, validation_data=(x, y), validation_steps=2, epochs=2) - - with distribution.scope(): - ds_model = _make_model_with_add_metric() - self.assertLen(ds_model.metrics, 1) - ds_model.compile( - 'sgd', - 'mse') - ds_history = ds_model.fit( - x, y, validation_data=(x, y), validation_steps=2, epochs=2) - # includes stateful loss metric in eager. 
- metrics_len = 2 if tf.executing_eagerly() else 1 - self.assertLen(ds_model.metrics, metrics_len) - - self.assertAllClose(history.history, ds_history.history) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.one_device_strategy, - tf.__internal__.distribute.combinations.one_device_strategy_gpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus - ], - mode=['eager'])) - def test_distribution_strategy_with_add_metric_object( - self, distribution): - - class Bias(keras.layers.Layer): - - def build(self, input_shape): - self.bias = self.add_weight(name='bias', initializer='zeros', shape=()) - self.mean = keras.metrics.Mean(name='mean') - - def call(self, inputs): - self.add_metric(self.mean(inputs)) - return inputs + self.bias - - def _make_model_with_add_metric_object(): - inputs = keras.Input((10,)) - x1 = keras.layers.Dense(10, kernel_initializer='zeros')(inputs) - x2 = Bias()(x1) - outputs = keras.layers.Dense(1, kernel_initializer='zeros')(x2) - model = keras.Model(inputs, outputs) - return model - - x = np.ones((64, 10)).astype('float32') - y = np.ones((64, 1)).astype('float32') - - model = _make_model_with_add_metric_object() - self.assertLen(model.metrics, 1) - - model.compile('sgd', 'mse') - history = model.fit( - x, y, validation_data=(x, y), validation_steps=2, epochs=2) - - with distribution.scope(): - ds_model = _make_model_with_add_metric_object() - self.assertLen(ds_model.metrics, 1) - ds_model.compile( - 'sgd', - 'mse') - ds_history = ds_model.fit( - x, y, validation_data=(x, y), validation_steps=2, epochs=2) - # includes stateful loss metric in eager. - metrics_len = 2 if tf.executing_eagerly() else 1 - self.assertLen(ds_model.metrics, metrics_len) - - self.assertAllClose(history.history, ds_history.history) - - @tf.__internal__.distribute.combinations.generate( - # TODO(phillypham): Why does validation_steps > 1 not work on TPUs? - tf.__internal__.test.combinations.times( - all_strategy_minus_default_and_tpu_combinations())) - def test_distribution_strategy_with_add_metric_outside_call( - self, distribution): - - def _make_model_with_add_metric(): - inputs = keras.Input((10,)) - x1 = keras.layers.Dense(10, kernel_initializer='zeros')(inputs) - outputs = keras.layers.Dense(1, kernel_initializer='zeros')(x1) - model = keras.Model(inputs, outputs) - model.add_metric( - tf.reduce_mean(x1), name='mid_mean', aggregation='mean') - return model - - x = np.ones((64, 10)).astype('float32') - y = np.ones((64, 1)).astype('float32') - - model = _make_model_with_add_metric() - self.assertLen(model.metrics, 1) - - model.compile('sgd', 'mse') - history = model.fit( - x, y, validation_data=(x, y), validation_steps=2, epochs=2) - - with distribution.scope(): - ds_model = _make_model_with_add_metric() - self.assertLen(ds_model.metrics, 1) - ds_model.compile( - 'sgd', - 'mse') - ds_history = ds_model.fit( - x, y, validation_data=(x, y), validation_steps=2, epochs=2) - # includes stateful loss metric in eager. 
- metrics_len = 2 if tf.executing_eagerly() else 1 - self.assertLen(ds_model.metrics, metrics_len) - - self.assertAllClose(history.history, ds_history.history) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategies_minus_tpu + multi_worker_mirrored_strategies, - mode=['eager'])) - def test_sparse_tensor_outputs(self, distribution): - - class ToSparse(keras.layers.Layer): - """Create a sparse tensor based on a given dense tensor.""" - - def call(self, inputs): - indices = tf.where(tf.not_equal(inputs, 0)) - values = tf.gather_nd(inputs, indices) - shape = tf.shape(inputs, out_type='int64') - return tf.SparseTensor(indices, values, dense_shape=shape) - - model = keras.Sequential([ToSparse()]) - - # Define some input data with additional padding. - input_data = np.array([[1, 0, 0], [2, 3, 0]]) - output = model.predict(input_data, batch_size=2) - - expected_indices = np.array([[0, 0], [1, 0], [1, 1]]) - expected_values = np.array([1, 2, 3]) - expected_dense_shape = np.array([2, 3]) - - self.assertAllEqual(output.indices, expected_indices) - self.assertAllEqual(output.values, expected_values) - self.assertAllEqual(output.dense_shape, expected_dense_shape) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategies_minus_tpu + multi_worker_mirrored_strategies, - mode=['eager'])) - def test_ragged_tensor_outputs(self, distribution): - - class ToRagged(keras.layers.Layer): - """Create a ragged tensor based on a given dense tensor.""" - - def __init__(self, padding, ragged_rank=1, **kwargs): - super().__init__(**kwargs) - self._padding = padding - self._ragged_rank = ragged_rank - - def call(self, inputs): - return tf.RaggedTensor.from_tensor( - inputs, padding=self._padding, ragged_rank=self._ragged_rank) - - model = keras.Sequential([ToRagged(padding=0)]) - - # Define some input data with additional padding. - input_data = np.array([[1, 0, 0], [2, 3, 0]]) - output = model.predict(input_data, batch_size=2) - - expected_values = [[1], [2, 3]] - self.assertAllEqual(expected_values, output) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategies_minus_default_minus_tpu + tpu_strategies + - multi_worker_mirrored_strategies, - mode=['eager'])) - def test_correctness_of_add_loss_with_merge_call(self, distribution): - batch_size = 32 - - def _get_model(): - inputs = keras.layers.Input(shape=(1,)) - labels = keras.layers.Input(shape=(1,)) - x = keras.layers.Dense(10, activation='relu')(inputs) - y = keras.layers.Dense(1)(x) - model = keras.models.Model([inputs, labels], y) - model.add_loss(keras.losses.mean_squared_error(labels, y)) - return model - - def _get_data(): - x_train = np.random.rand(64, 1) - y_train = 3 * x_train - x_train = x_train.astype('float32') - y_train = y_train.astype('float32') - dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - dataset = dataset.batch(batch_size) - return dataset - - with distribution.scope(): - model = _get_model() - optimizer = gradient_descent_keras.SGD(0.2) - - @tf.function - def train_step(dist_inputs): - - def step_fn(inputs): - with tf.GradientTape() as tape: - logits = model(inputs) - - # Invoke a merge_call() - tf.distribute.get_replica_context().merge_call( - lambda d: None) - - # Verify that there is only one loss on the model. 
- assert len(model.losses) == 1 - loss_from_model = tf.reduce_sum( - model.losses) * 1.0 / batch_size - - # Compute loss in this loop. - loss = keras.losses.mean_squared_error(inputs[1], logits) - loss = tf.nn.compute_average_loss(loss, global_batch_size=batch_size) - - # Verify that the loss computed in this loop is equivalent to the - # loss from the model that was added via add_loss. - tf.compat.v1.assert_equal(loss, loss_from_model) - - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(grads, model.trainable_variables)) - return loss - - per_replica_losses = distribution.run(step_fn, args=(dist_inputs,)) - return distribution.reduce( - tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) - - dataset = distribution.experimental_distribute_dataset(_get_data()) - for _ in range(2): - for x in dataset: - train_step(x) - - @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['graph', 'eager'])) - def test_unimplemented_parameter_server_strategy(self): - cluster_spec = multi_worker_testing_utils.create_in_process_cluster( - num_workers=3, num_ps=2) - cluster_resolver = SimpleClusterResolver( - cluster_spec=tf.train.ClusterSpec(cluster_spec), - task_type='worker', - task_id=1, - num_accelerators={'GPU': 0}) - distribution = tf.compat.v1.distribute.experimental.ParameterServerStrategy( - cluster_resolver) - - self.assertIsInstance(distribution, - tf.compat.v1.distribute.experimental.ParameterServerStrategy) - - with self.assertRaisesRegex(NotImplementedError, - 'ParameterServerStrategy*'): - with distribution.scope(): - model = simple_sequential_model() - optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - model.compile(optimizer, loss) + with self.assertRaisesRegex( + NotImplementedError, "ParameterServerStrategy*" + ): + with distribution.scope(): + model = simple_sequential_model() + optimizer = tf.compat.v1.train.RMSPropOptimizer( + learning_rate=0.001 + ) + loss = "mse" + model.compile(optimizer, loss) # Models to exercise inserting ancillary layers with add_loss and add_metric. def _functional_with_add_loss_and_metric(input_shape, num_classes, l1, l2): - inputs = keras.Input(input_shape, name='images') - x = keras.layers.Conv2D(32, kernel_size=5, activation='relu')(inputs) - x = keras.layers.MaxPooling2D(pool_size=2)(x) - x = keras.layers.Conv2D(64, kernel_size=5, activation='relu')(x) - x = keras.layers.MaxPooling2D(pool_size=2)(x) - # Apply L2 regularization to embedding. Use a mix of TensorFlow ops and layers - # to exercise all code paths. - x = keras.layers.Flatten(name='embedding')(x) - l2_loss = tf.reduce_mean(tf.reduce_sum(tf.square(x), -1)) - # Apply L1 regularization to next layer. - x = keras.layers.Dense(1024, activation='relu', name='sparse_embedding')(x) - l1_loss = keras.layers.Lambda( - lambda x: tf.reduce_mean(tf.reduce_sum(x, -1)), - name='l1_loss')( - x) - outputs = keras.layers.Dense(num_classes, name='logits')(x) - model = keras.Model(inputs=inputs, outputs=outputs) - # Weight regularization terms. 
- model.add_loss(keras.layers.Lambda(lambda x: x * l2)(l2_loss)) - model.add_metric(l2_loss, aggregation='mean', name='l2_loss') - model.add_loss(l1_loss * l1) - model.add_metric(l1_loss, aggregation='mean', name='l1_loss') - return model + inputs = keras.Input(input_shape, name="images") + x = keras.layers.Conv2D(32, kernel_size=5, activation="relu")(inputs) + x = keras.layers.MaxPooling2D(pool_size=2)(x) + x = keras.layers.Conv2D(64, kernel_size=5, activation="relu")(x) + x = keras.layers.MaxPooling2D(pool_size=2)(x) + # Apply L2 regularization to embedding. Use a mix of TensorFlow ops and + # layers to exercise all code paths. + x = keras.layers.Flatten(name="embedding")(x) + l2_loss = tf.reduce_mean(tf.reduce_sum(tf.square(x), -1)) + # Apply L1 regularization to next layer. + x = keras.layers.Dense(1024, activation="relu", name="sparse_embedding")(x) + l1_loss = keras.layers.Lambda( + lambda x: tf.reduce_mean(tf.reduce_sum(x, -1)), name="l1_loss" + )(x) + outputs = keras.layers.Dense(num_classes, name="logits")(x) + model = keras.Model(inputs=inputs, outputs=outputs) + # Weight regularization terms. + model.add_loss(keras.layers.Lambda(lambda x: x * l2)(l2_loss)) + model.add_metric(l2_loss, aggregation="mean", name="l2_loss") + model.add_loss(l1_loss * l1) + model.add_metric(l1_loss, aggregation="mean", name="l1_loss") + return model def _sequential_with_add_loss_and_metric(input_shape, num_classes, l1, l2): - model = keras.Sequential([ - keras.layers.Conv2D( - 32, kernel_size=5, activation='relu', input_shape=input_shape), - keras.layers.MaxPooling2D(pool_size=2), - keras.layers.Conv2D(64, kernel_size=5, activation='relu'), - keras.layers.MaxPooling2D(pool_size=2), - keras.layers.Flatten(name='embedding'), - keras.layers.Dense(1024, activation='relu', name='sparse_embedding'), - keras.layers.Dense(num_classes, name='logits'), - ]) - # Extract layer outputs, add regularization terms, and rescale the metric. - # Use a mix of TensorFlow ops and layers to exercise all code paths. - x = model.get_layer('sparse_embedding').get_output_at(-1) - l1_loss = l1 * tf.reduce_mean(tf.reduce_sum(x, -1)) - model.add_loss(l1_loss) - model.add_metric( - keras.layers.Lambda(lambda x: tf.divide(x, l1))(l1_loss), - aggregation='mean', - name='l1_loss') - x = model.get_layer('embedding').get_output_at(-1) - l2_loss = keras.layers.Lambda( - lambda x: l2 * tf.reduce_mean(tf.reduce_sum(x * x, -1)), - name='l2_loss')( - x) - model.add_loss(l2_loss) - model.add_metric(l2_loss / l2, aggregation='mean', name='l2_loss') - return model + model = keras.Sequential( + [ + keras.layers.Conv2D( + 32, kernel_size=5, activation="relu", input_shape=input_shape + ), + keras.layers.MaxPooling2D(pool_size=2), + keras.layers.Conv2D(64, kernel_size=5, activation="relu"), + keras.layers.MaxPooling2D(pool_size=2), + keras.layers.Flatten(name="embedding"), + keras.layers.Dense( + 1024, activation="relu", name="sparse_embedding" + ), + keras.layers.Dense(num_classes, name="logits"), + ] + ) + # Extract layer outputs, add regularization terms, and rescale the metric. + # Use a mix of TensorFlow ops and layers to exercise all code paths. 
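# A minimal sketch (names illustrative, not from this diff) of the
# `get_output_at(-1)` pattern used below: it retrieves the symbolic output
# of a layer's most recent invocation, so regularization terms can tap
# intermediate activations after the model has been built.
import tensorflow as tf
from tensorflow import keras

m = keras.Sequential([
    keras.layers.Dense(4, name="hidden", input_shape=(3,)),
    keras.layers.Dense(1, name="out"),
])
hidden = m.get_layer("hidden").get_output_at(-1)
m.add_loss(0.01 * tf.reduce_mean(tf.reduce_sum(tf.square(hidden), -1)))
assert len(m.losses) == 1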
+ x = model.get_layer("sparse_embedding").get_output_at(-1) + l1_loss = l1 * tf.reduce_mean(tf.reduce_sum(x, -1)) + model.add_loss(l1_loss) + model.add_metric( + keras.layers.Lambda(lambda x: tf.divide(x, l1))(l1_loss), + aggregation="mean", + name="l1_loss", + ) + x = model.get_layer("embedding").get_output_at(-1) + l2_loss = keras.layers.Lambda( + lambda x: l2 * tf.reduce_mean(tf.reduce_sum(x * x, -1)), name="l2_loss" + )(x) + model.add_loss(l2_loss) + model.add_metric(l2_loss / l2, aggregation="mean", name="l2_loss") + return model def _functional_with_layer_reuse(input_shape, num_classes, l1, l2): - base_model = keras.Sequential([ - keras.layers.Conv2D( - 32, kernel_size=5, activation='relu', input_shape=input_shape), - keras.layers.MaxPooling2D(pool_size=2), - keras.layers.Conv2D(64, kernel_size=5, activation='relu'), - keras.layers.MaxPooling2D(pool_size=2), - keras.layers.Flatten(), - keras.layers.Dense(1024, activation='relu'), - keras.layers.Dense(num_classes, name='logits'), - ]) - inputs = keras.Input(input_shape, name='images') - logits = base_model(inputs) - model = keras.Model(inputs=inputs, outputs=logits) - # Reuse sequential layer and create new nodes. - zero_logits = base_model(tf.zeros_like(inputs)) - one_logits = base_model(tf.ones_like(inputs)) - # L2 loss. - l2_loss = tf.reduce_mean( - tf.reduce_sum(tf.square(logits - zero_logits), -1)) - model.add_loss(l2_loss * l2) - model.add_metric(l2_loss, aggregation='mean', name='l2_loss') - # L1 loss. - l1_loss = tf.reduce_mean( - tf.reduce_sum(tf.abs(logits - one_logits), -1)) - model.add_loss(l1_loss * l1) - model.add_metric(l1_loss, aggregation='mean', name='l1_loss') - return model + base_model = keras.Sequential( + [ + keras.layers.Conv2D( + 32, kernel_size=5, activation="relu", input_shape=input_shape + ), + keras.layers.MaxPooling2D(pool_size=2), + keras.layers.Conv2D(64, kernel_size=5, activation="relu"), + keras.layers.MaxPooling2D(pool_size=2), + keras.layers.Flatten(), + keras.layers.Dense(1024, activation="relu"), + keras.layers.Dense(num_classes, name="logits"), + ] + ) + inputs = keras.Input(input_shape, name="images") + logits = base_model(inputs) + model = keras.Model(inputs=inputs, outputs=logits) + # Reuse sequential layer and create new nodes. + zero_logits = base_model(tf.zeros_like(inputs)) + one_logits = base_model(tf.ones_like(inputs)) + # L2 loss. + l2_loss = tf.reduce_mean(tf.reduce_sum(tf.square(logits - zero_logits), -1)) + model.add_loss(l2_loss * l2) + model.add_metric(l2_loss, aggregation="mean", name="l2_loss") + # L1 loss. + l1_loss = tf.reduce_mean(tf.reduce_sum(tf.abs(logits - one_logits), -1)) + model.add_loss(l1_loss * l1) + model.add_metric(l1_loss, aggregation="mean", name="l1_loss") + return model class TestDistributionStrategyWithMultipleAddLossAndMetricCalls( - tf.test.TestCase, parameterized.TestCase): - """Tests complex models with multiple add loss and metric calls.""" - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - all_strategy_combinations_minus_default(), - tf.__internal__.test.combinations.combine( - model_fn=[ - _functional_with_add_loss_and_metric, - _sequential_with_add_loss_and_metric, - _functional_with_layer_reuse, - ], - l1=[0.01], - l2=[0.1]))) - def test_fit_and_evaluate(self, distribution, model_fn, l1, l2): - # Make fake MNIST-like image data. 
- np.random.seed(_RANDOM_SEED) - dataset = tf.data.Dataset.from_tensor_slices( - (np.random.uniform(size=(64, 28, 28, 1)).astype(np.float32), - np.random.randint(0, 10, size=(64,)))) - dataset = dataset.shuffle(64).batch( - 8 * distribution.num_replicas_in_sync, drop_remainder=True) - # Make model with distribution strategy and initialize with dataset shape. - input_shape = tf.data.experimental.get_structure(dataset)[0].shape[1:] - with distribution.scope(): - model = model_fn(input_shape, 10, l1, l2) - model.compile( - optimizer=keras.optimizers.adam_v2.Adam(1e-4), - loss=keras.losses.SparseCategoricalCrossentropy( - from_logits=True, - reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE), - metrics=[ - keras.metrics.SparseCategoricalAccuracy(), - keras.metrics.SparseCategoricalCrossentropy(from_logits=True), - ]) - # Non-eager training doesn't support steps_per_epoch=None. - for unused_epoch in range(2): - model.fit(dataset) - results = dict(zip(model.metrics_names, model.evaluate(dataset))) - # Sanity checks. - self.assertBetween(results['sparse_categorical_accuracy'], 0.02, 1.) - self.assertGreater(results['l2_loss'], 0.) - self.assertGreater(results['l1_loss'], 0.) - # Assert correctness of the loss calculation and updating of metrics. - self.assertNear( - results['l1_loss'] * l1 + results['l2_loss'] * l2 + - results['sparse_categorical_crossentropy'], results['loss'], 1e-6) + tf.test.TestCase, parameterized.TestCase +): + """Tests complex models with multiple add loss and metric calls.""" + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + all_strategy_combinations_minus_default(), + tf.__internal__.test.combinations.combine( + model_fn=[ + _functional_with_add_loss_and_metric, + _sequential_with_add_loss_and_metric, + _functional_with_layer_reuse, + ], + l1=[0.01], + l2=[0.1], + ), + ) + ) + def test_fit_and_evaluate(self, distribution, model_fn, l1, l2): + # Make fake MNIST-like image data. + np.random.seed(_RANDOM_SEED) + dataset = tf.data.Dataset.from_tensor_slices( + ( + np.random.uniform(size=(64, 28, 28, 1)).astype(np.float32), + np.random.randint(0, 10, size=(64,)), + ) + ) + dataset = dataset.shuffle(64).batch( + 8 * distribution.num_replicas_in_sync, drop_remainder=True + ) + # Make model with distribution strategy and initialize with dataset + # shape. + input_shape = tf.data.experimental.get_structure(dataset)[0].shape[1:] + with distribution.scope(): + model = model_fn(input_shape, 10, l1, l2) + model.compile( + optimizer=keras.optimizers.adam_legacy.Adam(1e-4), + loss=keras.losses.SparseCategoricalCrossentropy( + from_logits=True, + reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE, + ), + metrics=[ + keras.metrics.SparseCategoricalAccuracy(), + keras.metrics.SparseCategoricalCrossentropy( + from_logits=True + ), + ], + ) + # Non-eager training doesn't support steps_per_epoch=None. + for unused_epoch in range(2): + model.fit(dataset) + results = dict(zip(model.metrics_names, model.evaluate(dataset))) + # Sanity checks. + self.assertBetween(results["sparse_categorical_accuracy"], 0.02, 1.0) + self.assertGreater(results["l2_loss"], 0.0) + self.assertGreater(results["l1_loss"], 0.0) + # Assert correctness of the loss calculation and updating of metrics. + self.assertNear( + results["l1_loss"] * l1 + + results["l2_loss"] * l2 + + results["sparse_categorical_crossentropy"], + results["loss"], + 1e-6, + ) class DeterministicModel(keras.Model): - """Deterministic Model that always outputs the same initial result. 
+ """Deterministic Model that always outputs the same initial result. - It verifies the `call` method is run inside the same distribution - strategy that the model was initially passed. - """ + It verifies the `call` method is run inside the same distribution + strategy that the model was initially passed. + """ - def __init__(self, strategy): - super().__init__() - self.x = None - self.strategy = strategy + def __init__(self, strategy): + super().__init__() + self.x = None + self.strategy = strategy - def build(self, input_shape): - self.x = tf.Variable(tf.ones(shape=())) + def build(self, input_shape): + self.x = tf.Variable(tf.ones(shape=())) - def call(self, inputs, training=None, mask=None): - active_strategy = tf.distribute.get_strategy() - if active_strategy is not self.strategy: - raise ValueError('Model must execute call w/ the original strategy') - return self.x * inputs + def call(self, inputs, training=None, mask=None): + active_strategy = tf.distribute.get_strategy() + if active_strategy is not self.strategy: + raise ValueError("Model must execute call w/ the original strategy") + return self.x * inputs class TestModelCapturesStrategy(tf.test.TestCase, parameterized.TestCase): - """Tests that model creation captures the strategy.""" - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(distribution=all_strategies, mode=['eager'])) - def test_fit_and_evaluate(self, distribution): - dataset = tf.data.Dataset.from_tensor_slices( - (tf.ones(shape=(64,)), tf.ones(shape=(64,)))) - dataset = dataset.batch(8 * distribution.num_replicas_in_sync) - # Make model with distribution strategy - with distribution.scope(): - model = DeterministicModel(distribution) - optimizer = keras.optimizers.adam_v2.Adam(1e-4) - - # Compile & evaluate the model outside of the distribution strategy scope - model.compile( - optimizer=optimizer, - loss=keras.losses.MeanSquaredError(), - metrics=['binary_accuracy']) - - # Call `optimizer.iterations` out of strategy scope. - self.assertEqual(model.optimizer.iterations.numpy(), 0) - - # Non-eager training doesn't support steps_per_epoch=None. 
- for unused_epoch in range(2): - model.fit(dataset) - - results = model.evaluate(dataset) - results = dict(zip(model.metrics_names, results)) - - # Check that the metrics have a result we expect - self.assertEqual(results['binary_accuracy'], 1.0) - self.assertAllClose(results['loss'], 0.0) - - # Assert that all metric/optimizer/model variables were made in the - # distribution strategy (Test that compile uses the captured - # distribution strategy) - metric_vars = tf.nest.flatten( - [metric.variables for metric in model.metrics]) - for var in metric_vars: - self.assertTrue(distribution.extended.variable_created_in_scope(var)) - for var in model.optimizer._weights: - self.assertTrue(distribution.extended.variable_created_in_scope(var)) - for var in model.variables: - self.assertTrue(distribution.extended.variable_created_in_scope(var)) - - # Make sure the metric must be created in the same scope as the model: - # This shouldn't raise any validation errors - with distribution.scope(): - metric = keras.metrics.BinaryAccuracy() - model.compile( - optimizer=optimizer, - loss=keras.losses.MeanSquaredError(), - metrics=[metric]) - - # This should raise an error because the metric is constructed - # outside of the scope, and not by compile - if tf.distribute.has_strategy(): - with self.assertRaisesRegex(ValueError, 'All metrics must be created in'): + """Tests that model creation captures the strategy.""" + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_fit_and_evaluate(self, distribution): + dataset = tf.data.Dataset.from_tensor_slices( + (tf.ones(shape=(64,)), tf.ones(shape=(64,))) + ) + dataset = dataset.batch(8 * distribution.num_replicas_in_sync) + # Make model with distribution strategy + with distribution.scope(): + model = DeterministicModel(distribution) + optimizer = keras.optimizers.adam_legacy.Adam(1e-4) + + # Compile & evaluate the model outside of the distribution strategy + # scope + model.compile( + optimizer=optimizer, + loss=keras.losses.MeanSquaredError(), + metrics=["binary_accuracy"], + ) + + # Call `optimizer.iterations` out of strategy scope. + self.assertEqual(model.optimizer.iterations.numpy(), 0) + + # Non-eager training doesn't support steps_per_epoch=None. 
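# The scope assertions further down rely on
# tf.distribute.StrategyExtended.variable_created_in_scope. A minimal
# illustration of its behavior (the strategy choice is again illustrative):
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    inside = tf.Variable(1.0)  # created under the strategy
outside = tf.Variable(1.0)  # created under the default strategy
assert strategy.extended.variable_created_in_scope(inside)
assert not strategy.extended.variable_created_in_scope(outside)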
+ for unused_epoch in range(2): + model.fit(dataset) + + results = model.evaluate(dataset) + results = dict(zip(model.metrics_names, results)) + + # Check that the metrics have a result we expect + self.assertEqual(results["binary_accuracy"], 1.0) + self.assertAllClose(results["loss"], 0.0) + + # Assert that all metric/optimizer/model variables were made in the + # distribution strategy (Test that compile uses the captured + # distribution strategy) + metric_vars = tf.nest.flatten( + [metric.variables for metric in model.metrics] + ) + for var in metric_vars: + self.assertTrue( + distribution.extended.variable_created_in_scope(var) + ) + for var in model.optimizer._weights: + self.assertTrue( + distribution.extended.variable_created_in_scope(var) + ) + for var in model.variables: + self.assertTrue( + distribution.extended.variable_created_in_scope(var) + ) + + # Make sure the metric must be created in the same scope as the model: + # This shouldn't raise any validation errors + with distribution.scope(): + metric = keras.metrics.BinaryAccuracy() model.compile( - optimizer=keras.optimizers.adam_v2.Adam(1e-4), + optimizer=optimizer, loss=keras.losses.MeanSquaredError(), - metrics=[keras.metrics.BinaryAccuracy()]) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=tf.__internal__.distribute.combinations.mirrored_strategy_with_one_cpu, - mode=['eager'])) - def test_optimizer(self, distribution): - temp_dir = os.path.join(self.get_temp_dir(), 'ckpt') - - def create_model(): - model = keras.models.Sequential([ - keras.layers.Dense(1), - ]) - model.compile(optimizer='adam', loss='mse') - model.build([None, 1]) # create weights. - self.assertEmpty(model.optimizer.weights) - return model - - model = create_model() - x = y = tf.ones(shape=(1, 1)) - model.fit(x=x, y=y, batch_size=1) - model.save_weights(temp_dir) - - with distribution.scope(): - model = create_model() - model.load_weights(temp_dir) - self.assertNotEmpty(model.optimizer.weights) - self.assertTrue( - distributed_training_utils.is_distributed_variable( - model.optimizer.weights[0])) - - with distribution.scope(): - model = create_model() - # create/restore slot variables outside of scope is fine. - model.load_weights(temp_dir) - self.assertNotEmpty(model.optimizer.weights) - self.assertTrue( - distributed_training_utils.is_distributed_variable( - model.optimizer.weights[0])) - - -if __name__ == '__main__': - base_layer_utils.enable_v2_dtype_behavior() - tf.__internal__.distribute.multi_process_runner.test_main() + metrics=[metric], + ) + + # This should raise an error because the metric is constructed + # outside of the scope, and not by compile + if tf.distribute.has_strategy(): + with self.assertRaisesRegex( + ValueError, "All metrics must be created in" + ): + model.compile( + optimizer=keras.optimizers.adam_v2.Adam(1e-4), + loss=keras.losses.MeanSquaredError(), + metrics=[keras.metrics.BinaryAccuracy()], + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=tf.__internal__.distribute.combinations.mirrored_strategy_with_one_cpu, # noqa: E501 + mode=["eager"], + ) + ) + def test_optimizer(self, distribution): + temp_dir = os.path.join(self.get_temp_dir(), "ckpt") + + def create_model(): + model = keras.models.Sequential( + [ + keras.layers.Dense(1), + ] + ) + model.compile(optimizer=keras.optimizers.Adam(), loss="mse") + model.build([None, 1]) # create weights. 
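# Note: at this point in create_model() the layer weights exist, but the
# optimizer's slot variables do not; they are created by the first training
# step, or restored from the checkpoint by load_weights() further below.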
+ return model + + model = create_model() + x = y = tf.ones(shape=(1, 1)) + model.fit(x=x, y=y, batch_size=1) + model.save_weights(temp_dir) + + with distribution.scope(): + model = create_model() + model.load_weights(temp_dir) + if isinstance(model.optimizer, optimizer_base.Optimizer): + model.optimizer.build(model.trainable_variables) + variables = model.optimizer.variables + else: + variables = model.optimizer.variables() + self.assertNotEmpty(variables) + self.assertTrue( + distributed_training_utils.is_distributed_variable(variables[0]) + ) + + with distribution.scope(): + model = create_model() + # create/restore slot variables outside of scope is fine. + model.load_weights(temp_dir) + if isinstance(model.optimizer, optimizer_base.Optimizer): + # V3 optimizer has to restore variables in scope. + return + # From this point on, the optimizer must be a V2 optimizer. + self.assertNotEmpty(model.optimizer.variables()) + self.assertTrue( + distributed_training_utils.is_distributed_variable( + model.optimizer.variables()[0] + ) + ) + + +if __name__ == "__main__": + base_layer_utils.enable_v2_dtype_behavior() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/distributed_file_utils.py b/keras/distribute/distributed_file_utils.py index 52de006e8b5b..fec668cfaa59 100644 --- a/keras/distribute/distributed_file_utils.py +++ b/keras/distribute/distributed_file_utils.py @@ -44,102 +44,137 @@ Experimental. API is subject to change. """ +import os + +import requests import tensorflow.compat.v2 as tf -import os +GCP_METADATA_HEADER = {"Metadata-Flavor": "Google"} +_GCE_METADATA_URL_ENV_VARIABLE = "GCE_METADATA_IP" def _get_base_dirpath(strategy): - task_id = strategy.extended._task_id # pylint: disable=protected-access - return 'workertemp_' + str(task_id) + task_id = strategy.extended._task_id + return "workertemp_" + str(task_id) def _is_temp_dir(dirpath, strategy): - return dirpath.endswith(_get_base_dirpath(strategy)) + return dirpath.endswith(_get_base_dirpath(strategy)) def _get_temp_dir(dirpath, strategy): - if _is_temp_dir(dirpath, strategy): - temp_dir = dirpath - else: - temp_dir = os.path.join(dirpath, _get_base_dirpath(strategy)) - tf.io.gfile.makedirs(temp_dir) - return temp_dir + if _is_temp_dir(dirpath, strategy): + temp_dir = dirpath + else: + temp_dir = os.path.join(dirpath, _get_base_dirpath(strategy)) + tf.io.gfile.makedirs(temp_dir) + return temp_dir def write_dirpath(dirpath, strategy): - """Returns the writing dir that should be used to save file distributedly. - - `dirpath` would be created if it doesn't exist. - - Args: - dirpath: Original dirpath that would be used without distribution. - strategy: The tf.distribute strategy object currently used. - - Returns: - The writing dir path that should be used to save with distribution. - """ - if strategy is None: - # Infer strategy from `distribution_strategy_context` if not given. - strategy = tf.distribute.get_strategy() - if strategy is None: - # If strategy is still not available, this is not in distributed training. - # Fallback to original dirpath. - return dirpath - if not strategy.extended._in_multi_worker_mode(): # pylint: disable=protected-access - return dirpath - if strategy.extended.should_checkpoint: - return dirpath - # If this worker is not chief and hence should not save file, save it to a - # temporary directory to be removed later. - return _get_temp_dir(dirpath, strategy) + """Returns the writing dir that should be used to save file distributedly. 
+ + `dirpath` would be created if it doesn't exist. + + Args: + dirpath: Original dirpath that would be used without distribution. + strategy: The tf.distribute strategy object currently used. + + Returns: + The writing dir path that should be used to save with distribution. + """ + if strategy is None: + # Infer strategy from `tf.distribute` if not given. + strategy = tf.distribute.get_strategy() + if strategy is None: + # If strategy is still not available, this is not in distributed + # training. Fallback to original dirpath. + return dirpath + if not strategy.extended._in_multi_worker_mode(): + return dirpath + if strategy.extended.should_checkpoint: + return dirpath + # If this worker is not chief and hence should not save file, save it to a + # temporary directory to be removed later. + return _get_temp_dir(dirpath, strategy) def remove_temp_dirpath(dirpath, strategy): - """Removes the temp path after writing is finished. - - Args: - dirpath: Original dirpath that would be used without distribution. - strategy: The tf.distribute strategy object currently used. - """ - if strategy is None: - # Infer strategy from `distribution_strategy_context` if not given. - strategy = tf.distribute.get_strategy() - if strategy is None: - # If strategy is still not available, this is not in distributed training. - # Fallback to no-op. - return - # TODO(anjalisridhar): Consider removing the check for multi worker mode since - # it is redundant when used with the should_checkpoint property. - if (strategy.extended._in_multi_worker_mode() and # pylint: disable=protected-access - not strategy.extended.should_checkpoint): - # If this worker is not chief and hence should not save file, remove - # the temporary directory. - tf.compat.v1.gfile.DeleteRecursively(_get_temp_dir(dirpath, strategy)) + """Removes the temp path after writing is finished. + + Args: + dirpath: Original dirpath that would be used without distribution. + strategy: The tf.distribute strategy object currently used. + """ + if strategy is None: + # Infer strategy from `tf.distribute` if not given. + strategy = tf.distribute.get_strategy() + if strategy is None: + # If strategy is still not available, this is not in distributed + # training. Fallback to no-op. + return + # TODO(anjalisridhar): Consider removing the check for multi worker mode + # since it is redundant when used with the should_checkpoint property. + if ( + strategy.extended._in_multi_worker_mode() + and not strategy.extended.should_checkpoint + ): + # If this worker is not chief and hence should not save file, remove + # the temporary directory. + tf.compat.v1.gfile.DeleteRecursively(_get_temp_dir(dirpath, strategy)) def write_filepath(filepath, strategy): - """Returns the writing file path to be used to save file distributedly. + """Returns the writing file path to be used to save file distributedly. - Directory to contain `filepath` would be created if it doesn't exist. + Directory to contain `filepath` would be created if it doesn't exist. - Args: - filepath: Original filepath that would be used without distribution. - strategy: The tf.distribute strategy object currently used. + Args: + filepath: Original filepath that would be used without distribution. + strategy: The tf.distribute strategy object currently used. - Returns: - The writing filepath that should be used to save file with distribution. 
- """ - dirpath = os.path.dirname(filepath) - base = os.path.basename(filepath) - return os.path.join(write_dirpath(dirpath, strategy), base) + Returns: + The writing filepath that should be used to save file with distribution. + """ + dirpath = os.path.dirname(filepath) + base = os.path.basename(filepath) + return os.path.join(write_dirpath(dirpath, strategy), base) def remove_temp_dir_with_filepath(filepath, strategy): - """Removes the temp path for file after writing is finished. - - Args: - filepath: Original filepath that would be used without distribution. - strategy: The tf.distribute strategy object currently used. - """ - remove_temp_dirpath(os.path.dirname(filepath), strategy) + """Removes the temp path for file after writing is finished. + + Args: + filepath: Original filepath that would be used without distribution. + strategy: The tf.distribute strategy object currently used. + """ + remove_temp_dirpath(os.path.dirname(filepath), strategy) + + +def _on_gcp(): + """Detect whether the current running environment is on GCP.""" + gce_metadata_endpoint = "http://" + os.environ.get( + _GCE_METADATA_URL_ENV_VARIABLE, "metadata.google.internal" + ) + + try: + # Timeout in 5 seconds, in case the test environment has connectivity + # issue. There is not default timeout, which means it might block + # forever. + response = requests.get( + f"{gce_metadata_endpoint}/computeMetadata/v1/{'instance/hostname'}", + headers=GCP_METADATA_HEADER, + timeout=5, + ) + return response.status_code + except requests.exceptions.RequestException: + return False + + +def support_on_demand_checkpoint_callback(strategy): + if _on_gcp() and isinstance( + strategy, tf.distribute.MultiWorkerMirroredStrategy + ): + return True + + return False diff --git a/keras/distribute/distributed_file_utils_test.py b/keras/distribute/distributed_file_utils_test.py index ddd7f0485bd0..0260b45c13c5 100644 --- a/keras/distribute/distributed_file_utils_test.py +++ b/keras/distribute/distributed_file_utils_test.py @@ -14,119 +14,121 @@ # ============================================================================== """Tests for distributed_file_utils.""" -import tensorflow.compat.v2 as tf - import os +import tensorflow.compat.v2 as tf + from keras.distribute import distributed_file_utils class DistributedFileUtilsTest(tf.test.TestCase): - - class MockedExtended: - pass - - class MockedChiefStrategy: - - def __init__(self): - self.extended = DistributedFileUtilsTest.MockedExtended() - self.extended._in_multi_worker_mode = lambda: True - self.extended.should_checkpoint = True - - class MockedWorkerStrategy: - - def __init__(self): - self.extended = DistributedFileUtilsTest.MockedExtended() - self.extended._in_multi_worker_mode = lambda: True - self.extended.should_checkpoint = False - self.extended._task_id = 3 - - class MockedSingleWorkerStrategy: - - def __init__(self): - self.extended = DistributedFileUtilsTest.MockedExtended() - self.extended._in_multi_worker_mode = lambda: False - - def _write_dummy_file(self, file_to_write): - with open(file_to_write, 'w') as f: - f.write('foo bar') - - def testChiefWriteDirAndFilePath(self): - dirpath = self.get_temp_dir() - filepath = os.path.join(dirpath, 'foo.bar') - strategy = DistributedFileUtilsTest.MockedChiefStrategy() - self.assertEqual( - distributed_file_utils.write_filepath(filepath, strategy), filepath) - self.assertEqual( - distributed_file_utils.write_dirpath(dirpath, strategy), dirpath) - - def testWorkerWriteDirAndFilePath(self): - dirpath = self.get_temp_dir() - filepath = 
os.path.join(dirpath, 'foo.bar') - strategy = DistributedFileUtilsTest.MockedWorkerStrategy() - self.assertEqual( - distributed_file_utils.write_filepath(filepath, strategy), - os.path.join(dirpath, 'workertemp_3', 'foo.bar')) - self.assertEqual( - distributed_file_utils.write_dirpath(dirpath, strategy), - os.path.join(dirpath, 'workertemp_3')) - - def testChiefDoesNotRemoveDirAndFilePath(self): - temp_dir = self.get_temp_dir() - strategy = DistributedFileUtilsTest.MockedChiefStrategy() - dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) - file_to_write = os.path.join(dir_to_write, 'tmp') - self.assertFalse(os.path.exists(file_to_write)) - self._write_dummy_file(file_to_write) - self.assertTrue(os.path.exists(file_to_write)) - distributed_file_utils.remove_temp_dir_with_filepath( - file_to_write, strategy) - self.assertTrue(os.path.exists(file_to_write)) - - def testWorkerDoesRemoveFilePath(self): - temp_dir = self.get_temp_dir() - strategy = DistributedFileUtilsTest.MockedWorkerStrategy() - dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) - file_to_write = os.path.join(dir_to_write, 'tmp') - self.assertFalse(os.path.exists(file_to_write)) - self._write_dummy_file(file_to_write) - self.assertTrue(os.path.exists(file_to_write)) - distributed_file_utils.remove_temp_dir_with_filepath( - file_to_write, strategy) - self.assertFalse(os.path.exists(file_to_write)) - - def testWorkerDoesRemoveDirPath(self): - temp_dir = self.get_temp_dir() - strategy = DistributedFileUtilsTest.MockedWorkerStrategy() - dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) - file_to_write = os.path.join(dir_to_write, 'tmp') - self.assertFalse(os.path.exists(file_to_write)) - self._write_dummy_file(file_to_write) - self.assertTrue(os.path.exists(file_to_write)) - distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) - self.assertFalse(os.path.exists(file_to_write)) - self.assertFalse(os.path.exists(os.path.dirname(file_to_write))) - - def testMultipleRemoveOrigDirPathIsFine(self): - temp_dir = self.get_temp_dir() - strategy = DistributedFileUtilsTest.MockedWorkerStrategy() - dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) - file_to_write = os.path.join(dir_to_write, 'tmp') - self._write_dummy_file(file_to_write) - distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) - distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) - distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) - - def testMultipleRemoveDirToWritePathIsFine(self): - temp_dir = self.get_temp_dir() - strategy = DistributedFileUtilsTest.MockedWorkerStrategy() - dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) - file_to_write = os.path.join(dir_to_write, 'tmp') - self._write_dummy_file(file_to_write) - distributed_file_utils.remove_temp_dirpath(dir_to_write, strategy) - distributed_file_utils.remove_temp_dirpath(dir_to_write, strategy) - distributed_file_utils.remove_temp_dirpath(dir_to_write, strategy) - - -if __name__ == '__main__': - tf.test.main() + class MockedExtended: + pass + + class MockedChiefStrategy: + def __init__(self): + self.extended = DistributedFileUtilsTest.MockedExtended() + self.extended._in_multi_worker_mode = lambda: True + self.extended.should_checkpoint = True + + class MockedWorkerStrategy: + def __init__(self): + self.extended = DistributedFileUtilsTest.MockedExtended() + self.extended._in_multi_worker_mode = lambda: True + self.extended.should_checkpoint = False + 
self.extended._task_id = 3 + + class MockedSingleWorkerStrategy: + def __init__(self): + self.extended = DistributedFileUtilsTest.MockedExtended() + self.extended._in_multi_worker_mode = lambda: False + + def _write_dummy_file(self, file_to_write): + with open(file_to_write, "w") as f: + f.write("foo bar") + + def testChiefWriteDirAndFilePath(self): + dirpath = self.get_temp_dir() + filepath = os.path.join(dirpath, "foo.bar") + strategy = DistributedFileUtilsTest.MockedChiefStrategy() + self.assertEqual( + distributed_file_utils.write_filepath(filepath, strategy), filepath + ) + self.assertEqual( + distributed_file_utils.write_dirpath(dirpath, strategy), dirpath + ) + + def testWorkerWriteDirAndFilePath(self): + dirpath = self.get_temp_dir() + filepath = os.path.join(dirpath, "foo.bar") + strategy = DistributedFileUtilsTest.MockedWorkerStrategy() + self.assertEqual( + distributed_file_utils.write_filepath(filepath, strategy), + os.path.join(dirpath, "workertemp_3", "foo.bar"), + ) + self.assertEqual( + distributed_file_utils.write_dirpath(dirpath, strategy), + os.path.join(dirpath, "workertemp_3"), + ) + + def testChiefDoesNotRemoveDirAndFilePath(self): + temp_dir = self.get_temp_dir() + strategy = DistributedFileUtilsTest.MockedChiefStrategy() + dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) + file_to_write = os.path.join(dir_to_write, "tmp") + self.assertFalse(os.path.exists(file_to_write)) + self._write_dummy_file(file_to_write) + self.assertTrue(os.path.exists(file_to_write)) + distributed_file_utils.remove_temp_dir_with_filepath( + file_to_write, strategy + ) + self.assertTrue(os.path.exists(file_to_write)) + + def testWorkerDoesRemoveFilePath(self): + temp_dir = self.get_temp_dir() + strategy = DistributedFileUtilsTest.MockedWorkerStrategy() + dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) + file_to_write = os.path.join(dir_to_write, "tmp") + self.assertFalse(os.path.exists(file_to_write)) + self._write_dummy_file(file_to_write) + self.assertTrue(os.path.exists(file_to_write)) + distributed_file_utils.remove_temp_dir_with_filepath( + file_to_write, strategy + ) + self.assertFalse(os.path.exists(file_to_write)) + + def testWorkerDoesRemoveDirPath(self): + temp_dir = self.get_temp_dir() + strategy = DistributedFileUtilsTest.MockedWorkerStrategy() + dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) + file_to_write = os.path.join(dir_to_write, "tmp") + self.assertFalse(os.path.exists(file_to_write)) + self._write_dummy_file(file_to_write) + self.assertTrue(os.path.exists(file_to_write)) + distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) + self.assertFalse(os.path.exists(file_to_write)) + self.assertFalse(os.path.exists(os.path.dirname(file_to_write))) + + def testMultipleRemoveOrigDirPathIsFine(self): + temp_dir = self.get_temp_dir() + strategy = DistributedFileUtilsTest.MockedWorkerStrategy() + dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) + file_to_write = os.path.join(dir_to_write, "tmp") + self._write_dummy_file(file_to_write) + distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) + distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) + distributed_file_utils.remove_temp_dirpath(temp_dir, strategy) + + def testMultipleRemoveDirToWritePathIsFine(self): + temp_dir = self.get_temp_dir() + strategy = DistributedFileUtilsTest.MockedWorkerStrategy() + dir_to_write = distributed_file_utils.write_dirpath(temp_dir, strategy) + file_to_write = 
os.path.join(dir_to_write, "tmp") + self._write_dummy_file(file_to_write) + distributed_file_utils.remove_temp_dirpath(dir_to_write, strategy) + distributed_file_utils.remove_temp_dirpath(dir_to_write, strategy) + distributed_file_utils.remove_temp_dirpath(dir_to_write, strategy) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/distribute/distributed_training_utils.py b/keras/distribute/distributed_training_utils.py index 876f83c7142b..61edf4f5193d 100644 --- a/keras/distribute/distributed_training_utils.py +++ b/keras/distribute/distributed_training_utils.py @@ -14,10 +14,12 @@ # ============================================================================== """Utilities related to distributed training.""" -from absl import flags -from keras import backend +import contextlib import tensorflow.compat.v2 as tf +from absl import flags + +from keras import backend FLAGS = flags.FLAGS @@ -26,92 +28,115 @@ # core MirroredStrategy only. Remove this check when contrib MirroredStrategy is # no longer needed. def global_batch_size_supported(distribution_strategy): - return distribution_strategy.extended._global_batch_size # pylint: disable=protected-access + return distribution_strategy.extended._global_batch_size def call_replica_local_fn(fn, *args, **kwargs): - """Call a function that uses replica-local variables. - - This function correctly handles calling `fn` in a cross-replica - context. - - Args: - fn: The function to call. - *args: Positional arguments to the `fn`. - **kwargs: Keyword argument to `fn`. - - Returns: - The result of calling `fn`. - """ - # TODO(b/132666209): Remove this function when we support assign_* - # for replica-local variables. - strategy = None - if 'strategy' in kwargs: - strategy = kwargs.pop('strategy') - else: - if tf.distribute.has_strategy(): - strategy = tf.distribute.get_strategy() - - # TODO(b/120571621): TPUStrategy does not implement replica-local variables. - is_tpu = backend.is_tpu_strategy(strategy) - if ((not is_tpu) and strategy and tf.distribute.in_cross_replica_context()): - with strategy.scope(): - return strategy.extended.call_for_each_replica(fn, args, kwargs) - return fn(*args, **kwargs) + """Call a function that uses replica-local variables. + + This function correctly handles calling `fn` in a cross-replica + context. + + Args: + fn: The function to call. + *args: Positional arguments to the `fn`. + **kwargs: Keyword argument to `fn`. + + Returns: + The result of calling `fn`. + """ + # TODO(b/132666209): Remove this function when we support assign_* + # for replica-local variables. + strategy = None + if "strategy" in kwargs: + strategy = kwargs.pop("strategy") + else: + if tf.distribute.has_strategy(): + strategy = tf.distribute.get_strategy() + + # TODO(b/120571621): TPUStrategy does not implement replica-local variables. + is_tpu = backend.is_tpu_strategy(strategy) + if (not is_tpu) and strategy and tf.distribute.in_cross_replica_context(): + with strategy.scope(): + return strategy.extended.call_for_each_replica(fn, args, kwargs) + return fn(*args, **kwargs) def is_distributed_variable(v): - """Returns whether `v` is a distributed variable.""" - return (isinstance(v, tf.distribute.DistributedValues) and - isinstance(v, tf.Variable)) + """Returns whether `v` is a distributed variable.""" + return isinstance(v, tf.distribute.DistributedValues) and isinstance( + v, tf.Variable + ) def get_strategy(): - """Creates a `tf.distribute.Strategy` object from flags. 
- - Example usage: - - ```python - strategy = utils.get_strategy() - with strategy.scope(): - model = tf.keras.Sequential([tf.keras.layers.Dense(10)]) - - model.compile(...) - train_ds, test_ds = ... - model.fit(train_ds, validation_data=test_ds, epochs=10) - ``` - - Returns: - `tf.distribute.Strategy` instance. - """ - cls = FLAGS.keras_distribute_strategy_class - accepted_strats = { - 'tpu', 'multi_worker_mirrored', 'mirrored', - 'parameter_server', 'one_device'} - if cls == 'tpu': - tpu_addr = FLAGS.keras_distribute_strategy_tpu_addr - if not tpu_addr: - raise ValueError( - 'When using a TPU strategy, you must set the flag ' - '`keras_distribute_strategy_tpu_addr` (TPU address).') - cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( - tpu=tpu_addr) - tf.config.experimental_connect_to_cluster(cluster_resolver) - tf.tpu.experimental.initialize_tpu_system(cluster_resolver) - strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver) - elif cls == 'multi_worker_mirrored': - strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() - elif cls == 'mirrored': - strategy = tf.distribute.MirroredStrategy() - elif cls == 'parameter_server': - cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() - strategy = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver) - elif cls == 'one_device': - strategy = tf.distribute.OneDeviceStrategy('/gpu:0') - else: - raise ValueError( - 'Unknown distribution strategy flag. Received: ' - f'keras_distribute_strategy_class={cls}. ' - f'It should be one of {accepted_strats}') - return strategy + """Creates a `tf.distribute.Strategy` object from flags. + + Example usage: + + ```python + strategy = utils.get_strategy() + with strategy.scope(): + model = tf.keras.Sequential([tf.keras.layers.Dense(10)]) + + model.compile(...) + train_ds, test_ds = ... + model.fit(train_ds, validation_data=test_ds, epochs=10) + ``` + + Returns: + `tf.distribute.Strategy` instance. + """ + cls = FLAGS.keras_distribute_strategy_class + accepted_strats = { + "tpu", + "multi_worker_mirrored", + "mirrored", + "parameter_server", + "one_device", + } + if cls == "tpu": + tpu_addr = FLAGS.keras_distribute_strategy_tpu_addr + if not tpu_addr: + raise ValueError( + "When using a TPU strategy, you must set the flag " + "`keras_distribute_strategy_tpu_addr` (TPU address)." + ) + cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=tpu_addr + ) + tf.config.experimental_connect_to_cluster(cluster_resolver) + tf.tpu.experimental.initialize_tpu_system(cluster_resolver) + strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver) + elif cls == "multi_worker_mirrored": + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + elif cls == "mirrored": + strategy = tf.distribute.MirroredStrategy() + elif cls == "parameter_server": + cluster_resolver = ( + tf.distribute.cluster_resolver.TFConfigClusterResolver() + ) + strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver + ) + elif cls == "one_device": + strategy = tf.distribute.OneDeviceStrategy("/gpu:0") + else: + raise ValueError( + "Unknown distribution strategy flag. Received: " + f"keras_distribute_strategy_class={cls}. 
" + f"It should be one of {accepted_strats}" + ) + return strategy + + +def maybe_preemption_handler_scope(model): + + if getattr(model, "_preemption_handler", None): + preemption_checkpoint_scope = ( + model._preemption_handler.watch_preemption_scope() + ) + else: + preemption_checkpoint_scope = contextlib.nullcontext() + + return preemption_checkpoint_scope diff --git a/keras/distribute/distributed_training_utils_test.py b/keras/distribute/distributed_training_utils_test.py index 54e5124be30f..690cade75923 100644 --- a/keras/distribute/distributed_training_utils_test.py +++ b/keras/distribute/distributed_training_utils_test.py @@ -18,37 +18,39 @@ from keras import callbacks from keras.distribute import distributed_training_utils_v1 -from keras.optimizers.optimizer_v2 import adam +from keras.optimizers.legacy import adam class DistributedTrainingUtilsTest(tf.test.TestCase): + def test_validate_callbacks_predefined_callbacks(self): + supported_predefined_callbacks = [ + callbacks.TensorBoard(), + callbacks.CSVLogger(filename="./log.csv"), + callbacks.EarlyStopping(), + callbacks.ModelCheckpoint(filepath="./checkpoint"), + callbacks.TerminateOnNaN(), + callbacks.ProgbarLogger(), + callbacks.History(), + callbacks.RemoteMonitor(), + ] - def test_validate_callbacks_predefined_callbacks(self): - supported_predefined_callbacks = [ - callbacks.TensorBoard(), - callbacks.CSVLogger(filename='./log.csv'), - callbacks.EarlyStopping(), - callbacks.ModelCheckpoint(filepath='./checkpoint'), - callbacks.TerminateOnNaN(), - callbacks.ProgbarLogger(), - callbacks.History(), - callbacks.RemoteMonitor() - ] - - distributed_training_utils_v1.validate_callbacks( - supported_predefined_callbacks, adam.Adam()) - - unsupported_predefined_callbacks = [ - callbacks.ReduceLROnPlateau(), - callbacks.LearningRateScheduler(schedule=lambda epoch: 0.001) - ] - - for callback in unsupported_predefined_callbacks: - with self.assertRaisesRegex(ValueError, - 'You must specify a Keras Optimizer V2'): distributed_training_utils_v1.validate_callbacks( - [callback], tf.compat.v1.train.AdamOptimizer()) + supported_predefined_callbacks, adam.Adam() + ) + unsupported_predefined_callbacks = [ + callbacks.ReduceLROnPlateau(), + callbacks.LearningRateScheduler(schedule=lambda epoch: 0.001), + ] -if __name__ == '__main__': - tf.test.main() + for callback in unsupported_predefined_callbacks: + with self.assertRaisesRegex( + ValueError, "You must specify a Keras Optimizer V2" + ): + distributed_training_utils_v1.validate_callbacks( + [callback], tf.compat.v1.train.AdamOptimizer() + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/distribute/distributed_training_utils_v1.py b/keras/distribute/distributed_training_utils_v1.py index 1155e3d14398..8b19235f41ff 100644 --- a/keras/distribute/distributed_training_utils_v1.py +++ b/keras/distribute/distributed_training_utils_v1.py @@ -14,12 +14,11 @@ # ============================================================================== """Utilities related to distributed training.""" -import tensorflow.compat.v2 as tf -# pylint:disable=protected-access - import functools import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import callbacks from keras import metrics as metrics_module @@ -27,1107 +26,1240 @@ from keras.distribute import distribute_coordinator_utils as dc from keras.distribute import distributed_training_utils as dist_utils from keras.engine import training_utils_v1 -from keras.optimizers.optimizer_v2 import optimizer_v2 +from 
keras.optimizers.legacy import optimizer_v2 from keras.utils import tf_contextlib from keras.utils.mode_keys import ModeKeys + +# isort: off from tensorflow.python.platform import tf_logging as logging def set_weights(distribution_strategy, dist_model, weights): - """Sets the weights of the replicated models. + """Sets the weights of the replicated models. + + The weights of the replicated models are set to the weights of the original + model. The weights of the replicated model are Mirrored variables and hence + we need to use the `update` call within a DistributionStrategy scope. + + Args: + distribution_strategy: DistributionStrategy used to distribute training + and validation. + dist_model: The replicated models on the different devices. + weights: The weights of the original model. + """ + assign_ops = [] + for layer in dist_model.layers: + num_param = len(layer.weights) + layer_weights = weights[:num_param] + for sw, w in zip(layer.weights, layer_weights): + if tf.compat.v1.executing_eagerly_outside_functions(): + sw.assign(w) + else: + assign_ops.append(distribution_strategy.unwrap(sw.assign(w))) + weights = weights[num_param:] + + if not tf.compat.v1.executing_eagerly_outside_functions(): + backend.get_session(assign_ops).run(assign_ops) + + +def unwrap_values( + distribution_strategy, + grouped_inputs, + grouped_outputs, + grouped_updates=None, + grouped_session_args=None, + with_loss_tensor=False, +): + """Unwrap the list of values contained in the PerReplica parameters. + + This function calls `flatten_per_replica_values` to parse each of the input + parameters into a list of values on the different devices. If we set + `with_loss_tensor` to be True, we also call `reduce` on the list of losses + on the different devices to give us one loss tensor. + + Args: + distribution_strategy: DistributionStrategy used to distribute training + and validation. + grouped_inputs: PerReplica inputs returned from the train or test function + that we ran on each device. + grouped_outputs: PerReplica outputs returned from the train or test + function that we ran on each device. + grouped_updates: PerReplica updates returned from the train or test + function that we ran on each device. + grouped_session_args: PerReplica session args returned from the train or + test function that we ran on each device. + with_loss_tensor: Boolean that indicates if we need to add the reduced + loss tensor as one of the outputs. + + Returns: + Values of each of the PerReplica parameters. + + """ + # Unwrap per device values returned from each model's train function. + # This will be used to construct the main train function. + all_inputs = flatten_per_replica_values( + distribution_strategy, grouped_inputs + ) + all_outputs = unwrap_outputs( + distribution_strategy, grouped_outputs, with_loss_tensor + ) + + if grouped_updates: + all_updates = flatten_per_replica_values( + distribution_strategy, grouped_updates + ) + else: + all_updates = None - The weights of the replicated models are set to the weights of the original - model. The weights of the replicated model are Mirrored variables and hence - we need to use the `update` call within a DistributionStrategy scope. + all_session_args = {} + if grouped_session_args: + grouped_feed_dict = grouped_session_args.get("feed_dict") + if grouped_feed_dict: + all_session_args["feed_dict"] = flatten_per_replica_values( + distribution_strategy, grouped_feed_dict + ) - Args: - distribution_strategy: DistributionStrategy used to distribute training - and validation. 
- dist_model: The replicated models on the different devices. - weights: The weights of the original model. - """ - assign_ops = [] - for layer in dist_model.layers: - num_param = len(layer.weights) - layer_weights = weights[:num_param] - for sw, w in zip(layer.weights, layer_weights): - if tf.compat.v1.executing_eagerly_outside_functions(): - sw.assign(w) - else: - assign_ops.append(distribution_strategy.unwrap(sw.assign(w))) - weights = weights[num_param:] - - if not tf.compat.v1.executing_eagerly_outside_functions(): - backend.get_session(assign_ops).run(assign_ops) - - -def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs, - grouped_updates=None, grouped_session_args=None, - with_loss_tensor=False): - """Unwrap the list of values contained in the PerReplica parameters. - - This function calls `flatten_per_replica_values` to parse each of the input - parameters into a list of values on the different devices. If we set - `with_loss_tensor` to be True, we also call `reduce` on the list of losses on - the different devices to give us one loss tensor. - - Args: - distribution_strategy: DistributionStrategy used to distribute training and - validation. - grouped_inputs: PerReplica inputs returned from the train or test function - that we ran on each device. - grouped_outputs: PerReplica outputs returned from the train or test function - that we ran on each device. - grouped_updates: PerReplica updates returned from the train or test function - that we ran on each device. - grouped_session_args: PerReplica session args returned from the train or - test function that we ran on each device. - with_loss_tensor: Boolean that indicates if we need to add the reduced loss - tensor as one of the outputs. - - Returns: - Values of each of the PerReplica parameters. - - """ - # Unwrap per device values returned from each model's train function. - # This will be used to construct the main train function. - all_inputs = flatten_per_replica_values(distribution_strategy, - grouped_inputs) - all_outputs = unwrap_outputs(distribution_strategy, grouped_outputs, - with_loss_tensor) - - if grouped_updates: - all_updates = flatten_per_replica_values(distribution_strategy, - grouped_updates) - else: - all_updates = None - - all_session_args = {} - if grouped_session_args: - grouped_feed_dict = grouped_session_args.get('feed_dict') - if grouped_feed_dict: - all_session_args['feed_dict'] = flatten_per_replica_values( - distribution_strategy, grouped_feed_dict) - - grouped_fetches = grouped_session_args.get('fetches') - if grouped_fetches: - all_session_args['fetches'] = flatten_per_replica_values( - distribution_strategy, grouped_fetches) - - # TODO(priyag): Return only non empty/None values - return all_inputs, all_outputs, all_updates, all_session_args + grouped_fetches = grouped_session_args.get("fetches") + if grouped_fetches: + all_session_args["fetches"] = flatten_per_replica_values( + distribution_strategy, grouped_fetches + ) + + # TODO(priyag): Return only non empty/None values + return all_inputs, all_outputs, all_updates, all_session_args def unwrap_output_dict(strategy, grouped_outputs, mode): - """Unwrap the list of outputs contained in the PerReplica parameters.""" - if mode == ModeKeys.PREDICT: - return flatten_per_replica_values(strategy, grouped_outputs) - - # In the case of fit/eval, the grouped_outputs is a dict, whereas in predict, - # the output is as same structure as model output. 
They need to be treated - # differently - total_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, - grouped_outputs['total_loss'][0], axis=None) - output_losses = flatten_per_replica_values(strategy, - grouped_outputs['output_losses']) - metrics = flatten_per_replica_values(strategy, - grouped_outputs['metrics']) - batch_size = strategy.reduce(tf.distribute.ReduceOp.SUM, - grouped_outputs['batch_size'], axis=None) - if (backend.is_tpu_strategy(strategy) and - tf.compat.v1.executing_eagerly_outside_functions()): - # Choose 1 value per replica in the TPU case since all replicas produce the - # same output. - # We only do this in eager mode for now since this function is used in - # both graph and eager mode and in the graph case we currently don't use - # experimental_run so would need to be removed when we converge the graph - # code path as well. - output_losses = output_losses[::strategy.num_replicas_in_sync] - metrics = metrics[::strategy.num_replicas_in_sync] - return {'total_loss': [total_loss], - 'output_losses': output_losses, - 'metrics': metrics, - 'batch_size': batch_size} - - -def unwrap_outputs(distribution_strategy, grouped_outputs, - with_loss_tensor=False): - """Unwrap the list of outputs contained in the PerReplica parameters. - - This function calls `flatten_per_replica_values` to parse each of the input - parameters into a list of outputs on the different devices. If we set - `with_loss_tensor` to be True, we also call `reduce` on the list of losses on - the different devices to give us one loss tensor. - - Args: - distribution_strategy: DistributionStrategy used to distribute training and - validation. - grouped_outputs: PerReplica outputs returned from the train or test function - that we ran on each device. - with_loss_tensor: Boolean that indicates if we need to add the reduced loss - tensor as one of the outputs. - - Returns: - Values of each of the PerReplica outputs. - - """ - if not with_loss_tensor: - return flatten_per_replica_values(distribution_strategy, - grouped_outputs) - - if not isinstance(grouped_outputs, list): - grouped_outputs = [grouped_outputs] - # reduce loss tensor before adding it to the list of fetches - loss = distribution_strategy.reduce(tf.distribute.ReduceOp.SUM, - grouped_outputs[0], axis=None) - all_outputs = flatten_per_replica_values(distribution_strategy, - grouped_outputs[1:]) - if (backend.is_tpu_strategy(distribution_strategy) and - tf.compat.v1.executing_eagerly_outside_functions()): - # Choose 1 value per replica in the TPU case since all replicas produce the - # same output. - # We only do this in eager mode for now since this function is used in - # both graph and eager mode and in the graph case we currently don't use - # experimental_run so would need to be removed when we converge the graph - # code path as well. - all_outputs = all_outputs[::distribution_strategy.num_replicas_in_sync] - return [loss] + all_outputs + """Unwrap the list of outputs contained in the PerReplica parameters.""" + if mode == ModeKeys.PREDICT: + return flatten_per_replica_values(strategy, grouped_outputs) + + # In the case of fit/eval, the grouped_outputs is a dict, whereas in + # predict, the output is as same structure as model output. 
They need to be + # treated differently + total_loss = strategy.reduce( + tf.distribute.ReduceOp.SUM, grouped_outputs["total_loss"][0], axis=None + ) + output_losses = flatten_per_replica_values( + strategy, grouped_outputs["output_losses"] + ) + metrics = flatten_per_replica_values(strategy, grouped_outputs["metrics"]) + batch_size = strategy.reduce( + tf.distribute.ReduceOp.SUM, grouped_outputs["batch_size"], axis=None + ) + if ( + backend.is_tpu_strategy(strategy) + and tf.compat.v1.executing_eagerly_outside_functions() + ): + # Choose 1 value per replica in the TPU case since all replicas produce + # the same output. + # We only do this in eager mode for now since this function is used in + # both graph and eager mode and in the graph case we currently don't use + # experimental_run so would need to be removed when we converge the + # graph code path as well. + output_losses = output_losses[:: strategy.num_replicas_in_sync] + metrics = metrics[:: strategy.num_replicas_in_sync] + return { + "total_loss": [total_loss], + "output_losses": output_losses, + "metrics": metrics, + "batch_size": batch_size, + } + + +def unwrap_outputs( + distribution_strategy, grouped_outputs, with_loss_tensor=False +): + """Unwrap the list of outputs contained in the PerReplica parameters. + + This function calls `flatten_per_replica_values` to parse each of the input + parameters into a list of outputs on the different devices. If we set + `with_loss_tensor` to be True, we also call `reduce` on the list of losses + on the different devices to give us one loss tensor. + + Args: + distribution_strategy: DistributionStrategy used to distribute training + and validation. + grouped_outputs: PerReplica outputs returned from the train or test + function that we ran on each device. + with_loss_tensor: Boolean that indicates if we need to add the reduced + loss tensor as one of the outputs. + + Returns: + Values of each of the PerReplica outputs. + + """ + if not with_loss_tensor: + return flatten_per_replica_values( + distribution_strategy, grouped_outputs + ) + + if not isinstance(grouped_outputs, list): + grouped_outputs = [grouped_outputs] + # reduce loss tensor before adding it to the list of fetches + loss = distribution_strategy.reduce( + tf.distribute.ReduceOp.SUM, grouped_outputs[0], axis=None + ) + all_outputs = flatten_per_replica_values( + distribution_strategy, grouped_outputs[1:] + ) + if ( + backend.is_tpu_strategy(distribution_strategy) + and tf.compat.v1.executing_eagerly_outside_functions() + ): + # Choose 1 value per replica in the TPU case since all replicas produce + # the same output. + # We only do this in eager mode for now since this function is used in + # both graph and eager mode and in the graph case we currently don't use + # experimental_run so would need to be removed when we converge the + # graph code path as well. + all_outputs = all_outputs[:: distribution_strategy.num_replicas_in_sync] + return [loss] + all_outputs def flatten_per_replica_values(distribution_strategy, per_replica_values): - """Unwraps and flattens a nest of PerReplica parameters. + """Unwraps and flattens a nest of PerReplica parameters. + + PerReplica values have one value associated with each device. Each entry in + the PerReplica dict has a device `key` and the corresponding value on the + device as the `value`. In this function we take a PerReplica value or a list + of PerReplica values and return all the values in the PerReplica dict. - PerReplica values have one value associated with each device. 
Each entry in - the PerReplica dict has a device `key` and the corresponding value on the - device as the `value`. In this function we take a PerReplica value or a list - of PerReplica values and return all the values in the PerReplica dict. + Args: + distribution_strategy: DistributionStrategy used to distribute training + and validation. + per_replica_values: List of PerReplica object or a single PerReplica + object. - Args: - distribution_strategy: DistributionStrategy used to distribute training and - validation. - per_replica_values: List of PerReplica object or a single PerReplica object. + Returns: + List of values of all the PerReplica objects. - Returns: - List of values of all the PerReplica objects. + """ - """ - # pylint: disable=g-complex-comprehension - # This function takes a PerReplica object or a list of PerReplica objects and - # returns all the values associated with it. - return [e for flattened in tf.nest.flatten(per_replica_values) - for e in distribution_strategy.unwrap(flattened)] + # This function takes a PerReplica object or a list of PerReplica objects + # and returns all the values associated with it. + return [ + e + for flattened in tf.nest.flatten(per_replica_values) + for e in distribution_strategy.unwrap(flattened) + ] def validate_callbacks(input_callbacks, optimizer): - """Validate whether given callbacks are supported by DistributionStrategy. - - Args: - input_callbacks: List of callbacks passed by the user to fit. - optimizer: Optimizer instance used to train the model. - - Raises: - ValueError: If `LearningRateScheduler` or `ReduceLROnPlateau` is one of the - callbacks passed. - ValueError: If `write_grads` is one of the parameters passed as part of the - TensorBoard callback. - """ - if input_callbacks: - for callback in input_callbacks: - if isinstance(callback, (callbacks.LearningRateScheduler, - callbacks.ReduceLROnPlateau)): - - if not isinstance(optimizer, optimizer_v2.OptimizerV2): - raise ValueError('You must specify a Keras Optimizer V2 when using ' - '%s callback with DistributionStrategy.' % callback) - - # If users want to use the TensorBoard callback they cannot use certain - # features of the callback that involve accessing model attributes and - # running ops. - if isinstance(callback, callbacks.TensorBoard): - if getattr(callback, 'write_grads', False): - logging.warning( - UserWarning( - '`write_grads` in the TensorBoard callback is not supported ' - 'when using DistributionStrategy. Setting `write_grads` ' - 'to `False`.')) - callback.write_grads = False - - -def validate_distributed_dataset_inputs(distribution_strategy, x, y, - sample_weights=None): - """Validate all the components of a DistributedValue Dataset input. - - Args: - distribution_strategy: The current DistributionStrategy used to call - `fit`/`evaluate`. - x: Input Dataset DistributedValue object. For example, when we use - `MirroredStrategy` this is a PerReplica object with a tensor for each - device set in the dict. x can also be a tuple or dict. The keys of the - dict should match the names of the input layers of the model. - y: Target Dataset DistributedValue object. For example, when we use - `MirroredStrategy` this is a PerReplica object with a tensor for each - device set in the dict. y can also be a tuple or dict. The keys of the - dict should match the names of the output layers of the model. - sample_weights: Sample weights Dataset DistributedValue object. 
For example, - when we use `MirroredStrategy` this is a PerReplica object with a tensor - for each device set in the dict. - - Returns: - The unwrapped values list of the x and y DistributedValues inputs. - - Raises: - ValueError: If x and y do not have support for being evaluated as tensors. - or if x and y contain elements that are not tensors or if x and y - contain elements that have a shape or dtype mismatch. - """ - # If the input and target used to call the model are not dataset tensors, - # we need to raise an error. When using a DistributionStrategy, the input - # and targets to a model should be from a `tf.data.Dataset`. - - # If each element of x and y are not tensors, we cannot standardize and - # validate the input and targets. - x_values_list = validate_per_replica_inputs(distribution_strategy, x) - - if y is not None: - y_values_list = validate_per_replica_inputs(distribution_strategy, y) - else: - y_values_list = None - - if sample_weights is not None: - sample_weights_list = validate_per_replica_inputs(distribution_strategy, - sample_weights) - else: - sample_weights_list = None - - # Return the unwrapped values to avoid calling `unwrap` a second time. - return x_values_list, y_values_list, sample_weights_list + """Validate whether given callbacks are supported by DistributionStrategy. + + Args: + input_callbacks: List of callbacks passed by the user to fit. + optimizer: Optimizer instance used to train the model. + + Raises: + ValueError: If `LearningRateScheduler` or `ReduceLROnPlateau` is one of + the callbacks passed. + ValueError: If `write_grads` is one of the parameters passed as part of + the TensorBoard callback. + """ + if input_callbacks: + for callback in input_callbacks: + if isinstance( + callback, + (callbacks.LearningRateScheduler, callbacks.ReduceLROnPlateau), + ): + + if not isinstance(optimizer, optimizer_v2.OptimizerV2): + raise ValueError( + "You must specify a Keras Optimizer V2 when using " + "%s callback with DistributionStrategy." % callback + ) + + # If users want to use the TensorBoard callback they cannot use + # certain features of the callback that involve accessing model + # attributes and running ops. + if isinstance(callback, callbacks.TensorBoard): + if getattr(callback, "write_grads", False): + logging.warning( + UserWarning( + "`write_grads` in the TensorBoard callback is not " + "supported when using DistributionStrategy. " + "Setting `write_grads` to `False`." + ) + ) + callback.write_grads = False + + +def validate_distributed_dataset_inputs( + distribution_strategy, x, y, sample_weights=None +): + """Validate all the components of a DistributedValue Dataset input. + + Args: + distribution_strategy: The current DistributionStrategy used to call + `fit`/`evaluate`. + x: Input Dataset DistributedValue object. For example, when we use + `MirroredStrategy` this is a PerReplica object with a tensor for each + device set in the dict. x can also be a tuple or dict. The keys of the + dict should match the names of the input layers of the model. + y: Target Dataset DistributedValue object. For example, when we use + `MirroredStrategy` this is a PerReplica object with a tensor for each + device set in the dict. y can also be a tuple or dict. The keys of the + dict should match the names of the output layers of the model. + sample_weights: Sample weights Dataset DistributedValue object. For + example, when we use `MirroredStrategy` this is a PerReplica object + with a tensor for each device set in the dict. 
+
+
+def validate_distributed_dataset_inputs(
+    distribution_strategy, x, y, sample_weights=None
+):
+    """Validate all the components of a DistributedValue Dataset input.
+
+    Args:
+        distribution_strategy: The current DistributionStrategy used to call
+            `fit`/`evaluate`.
+        x: Input Dataset DistributedValue object. For example, when we use
+            `MirroredStrategy` this is a PerReplica object with a tensor for each
+            device set in the dict. x can also be a tuple or dict. The keys of the
+            dict should match the names of the input layers of the model.
+        y: Target Dataset DistributedValue object. For example, when we use
+            `MirroredStrategy` this is a PerReplica object with a tensor for each
+            device set in the dict. y can also be a tuple or dict. The keys of the
+            dict should match the names of the output layers of the model.
+        sample_weights: Sample weights Dataset DistributedValue object. For
+            example, when we use `MirroredStrategy` this is a PerReplica object
+            with a tensor for each device set in the dict.
+
+    Returns:
+        The unwrapped values list of the x and y DistributedValues inputs.
+
+    Raises:
+        ValueError: If x and y do not have support for being evaluated as
+            tensors, or if x and y contain elements that are not tensors, or if
+            x and y contain elements that have a shape or dtype mismatch.
+    """
+    # If the input and target used to call the model are not dataset tensors,
+    # we need to raise an error. When using a DistributionStrategy, the input
+    # and targets to a model should be from a `tf.data.Dataset`.
+
+    # If each element of x and y are not tensors, we cannot standardize and
+    # validate the input and targets.
+    x_values_list = validate_per_replica_inputs(distribution_strategy, x)
+
+    if y is not None:
+        y_values_list = validate_per_replica_inputs(distribution_strategy, y)
+    else:
+        y_values_list = None
+
+    if sample_weights is not None:
+        sample_weights_list = validate_per_replica_inputs(
+            distribution_strategy, sample_weights
+        )
+    else:
+        sample_weights_list = None
+
+    # Return the unwrapped values to avoid calling `unwrap` a second time.
+    return x_values_list, y_values_list, sample_weights_list


 def validate_per_replica_inputs(distribution_strategy, x):
-  """Validates PerReplica dataset input list.
-
-  Args:
-    distribution_strategy: The current DistributionStrategy used to call
-      `fit`, `evaluate` and `predict`.
-    x: A list of PerReplica objects that represent the input or
-      target values.
-
-  Returns:
-    List containing the first element of each of the PerReplica objects in
-    the input list.
-
-  Raises:
-    ValueError: If any of the objects in the `per_replica_list` is not a tensor.
-
-  """
-  # Convert the inputs and targets into a list of PerReplica objects.
-  per_replica_list = tf.nest.flatten(x)
-  x_values_list = []
-  for x in per_replica_list:
-    # At this point x should contain only tensors.
-    x_values = distribution_strategy.unwrap(x)
-    for value in x_values:
-      if not tf.is_tensor(value):
-        raise ValueError('Dataset input to the model should be tensors instead '
-                         'they are of type {}'.format(type(value)))
-
-    if not tf.executing_eagerly():
-      # Validate that the shape and dtype of all the elements in x are the same.
-      validate_all_tensor_shapes(x, x_values)
-      validate_all_tensor_types(x, x_values)
-
-    x_values_list.append(x_values[0])
-  return x_values_list
+    """Validates PerReplica dataset input list.
+
+    Args:
+        distribution_strategy: The current DistributionStrategy used to call
+            `fit`, `evaluate` and `predict`.
+        x: A list of PerReplica objects that represent the input or
+            target values.
+
+    Returns:
+        List containing the first element of each of the PerReplica objects in
+        the input list.
+
+    Raises:
+        ValueError: If any of the objects in the `per_replica_list` is not a
+            tensor.
+
+    """
+    # Convert the inputs and targets into a list of PerReplica objects.
+    per_replica_list = tf.nest.flatten(x)
+    x_values_list = []
+    for x in per_replica_list:
+        # At this point x should contain only tensors.
+        x_values = distribution_strategy.unwrap(x)
+        for value in x_values:
+            if not tf.is_tensor(value):
+                raise ValueError(
+                    "Dataset input to the model should be tensors; instead "
+                    "they are of type {}".format(type(value))
+                )
+
+        if not tf.executing_eagerly():
+            # Validate that the shape and dtype of all the elements in x are
+            # the same.
+            validate_all_tensor_shapes(x, x_values)
+            validate_all_tensor_types(x, x_values)
+
+        x_values_list.append(x_values[0])
+    return x_values_list
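A standalone sketch of the consistency checks these helpers perform, assuming plain `tf.Tensor` values standing in for the unwrapped per-replica components:

import tensorflow as tf

def check_replica_values(values):
    # Mirrors validate_all_tensor_types/validate_all_tensor_shapes: every
    # replica must see the same dtype and the same static shape.
    dtype, shape = values[0].dtype, values[0].shape.as_list()
    for v in values[1:]:
        if v.dtype != dtype:
            raise ValueError(f"dtype mismatch: {v.dtype} vs {dtype}")
        if v.shape.as_list() != shape:
            raise ValueError(f"shape mismatch: {v.shape} vs {shape}")

check_replica_values([tf.zeros([4, 2]), tf.zeros([4, 2])])  # passes
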


 def validate_all_tensor_types(x, x_values):
-  x_dtype = x_values[0].dtype
-  for i in range(1, len(x_values)):
-    if x_dtype != x_values[i].dtype:
-      raise ValueError('Input tensor dtypes do not match for distributed tensor'
-                       ' inputs {}'.format(x))
+    x_dtype = x_values[0].dtype
+    for i in range(1, len(x_values)):
+        if x_dtype != x_values[i].dtype:
+            raise ValueError(
+                "Input tensor dtypes do not match for distributed tensor"
+                " inputs {}".format(x)
+            )


 def validate_all_tensor_shapes(x, x_values):
-  # Validate that the shape of all the elements in x have the same shape
-  x_shape = x_values[0].shape.as_list()
-  for i in range(1, len(x_values)):
-    if x_shape != x_values[i].shape.as_list():
-      raise ValueError('Input tensor shapes do not match for distributed tensor'
-                       ' inputs {}'.format(x))
+    # Validate that the shape of all the elements in x have the same shape
+    x_shape = x_values[0].shape.as_list()
+    for i in range(1, len(x_values)):
+        if x_shape != x_values[i].shape.as_list():
+            raise ValueError(
+                "Input tensor shapes do not match for distributed tensor"
+                " inputs {}".format(x)
+            )


 def _wait_for_variable_initialization(session):
-  """Utility to wait for variables to be initialized."""
-  all_variables = backend._get_variables(backend.get_graph())  # pylint: disable=protected-access
-  candidate_vars = []
-  for v in all_variables:
-    if not getattr(v, '_keras_initialized', False):
-      candidate_vars.append(v)
-
-  if not candidate_vars:
-    return
-
-  while True:
-    is_initialized = session.run(
-        [tf.compat.v1.is_variable_initialized(v) for v in candidate_vars])
-    uninitialized_vars = []
-    for flag, v in zip(is_initialized, candidate_vars):
-      if not flag:
-        uninitialized_vars.append(v)
-      v._keras_initialized = True  # pylint: disable=protected-access
-    if not uninitialized_vars:
-      break
+    """Utility to wait for variables to be initialized."""
+    all_variables = backend._get_variables(backend.get_graph())
+    candidate_vars = []
+    for v in all_variables:
+        if not getattr(v, "_keras_initialized", False):
+            candidate_vars.append(v)
+
+    if not candidate_vars:
+        return
+
+    while True:
+        is_initialized = session.run(
+            [tf.compat.v1.is_variable_initialized(v) for v in candidate_vars]
+        )
+        uninitialized_vars = []
+        for flag, v in zip(is_initialized, candidate_vars):
+            if not flag:
+                uninitialized_vars.append(v)
+            v._keras_initialized = True
+        if not uninitialized_vars:
+            break


 def init_restore_or_wait_for_variables():
-  """Initialize or restore variables or wait for variables to be initialized."""
-  backend._initialize_variables(backend._get_session())  # pylint: disable=protected-access
+    """Initialize or restore variables or wait for variables to be
+    initialized."""
+    backend._initialize_variables(backend._get_session())


 def validate_inputs(x, y):
-  """Validate inputs when using DistributionStrategy.
-
-  Args:
-    x: Model Inputs.
-    y: Model Targets.
-
-  Raises:
-    ValueError: if input is not a Dataset or a numpy array(when we use
-      MirroredStrategy).
-  """
-  if (isinstance(x, tf.compat.v1.data.Iterator) or
-      isinstance(y, tf.compat.v1.data.Iterator)):
-    raise ValueError('`DistributionStrategy` does not support inputs of type '
                     'Iterator. You must pass a `tf.data.Dataset` object or a '
-                     'numpy array as input.')
+    """Validate inputs when using DistributionStrategy.
+
+    Args:
+        x: Model Inputs.
+        y: Model Targets.
+
+    Raises:
+        ValueError: if input is not a Dataset or a numpy array (when we use
+            MirroredStrategy).
+    """
+    if isinstance(x, tf.compat.v1.data.Iterator) or isinstance(
+        y, tf.compat.v1.data.Iterator
+    ):
+        raise ValueError(
+            "`DistributionStrategy` does not support inputs of type "
+            "Iterator. You must pass a `tf.data.Dataset` object or a "
+            "numpy array as input."
+        )
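For context on the guard above, a minimal sketch of the accepted and rejected input kinds (the toy data is an illustrative assumption):

import numpy as np
import tensorflow as tf

x = np.random.random((32, 4)).astype("float32")
y = np.random.random((32, 1)).astype("float32")

# Accepted by the check above: numpy arrays, or a tf.data.Dataset built from
# them. A v1-style Iterator would be rejected with the ValueError shown.
dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(8)

def validate(x, y):
    for v in (x, y):
        if isinstance(v, tf.compat.v1.data.Iterator):
            raise ValueError("pass a tf.data.Dataset or numpy array instead")

validate(dataset, None)  # passes
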


 def is_dataset_shape_fully_defined(dataset):
-  """Returns whether a dataset contains a final partial batch."""
-  shapes = tf.nest.flatten(tf.compat.v1.data.get_output_shapes(dataset))
-  unknown_shapes = [s for s in shapes if not s.is_fully_defined()]
-  return not unknown_shapes
-
-
-def process_batch_and_step_size(strategy,
-                                inputs,
-                                batch_size,
-                                steps_per_epoch,
-                                mode,
-                                validation_split=0.):
-  """Process the batch size and step size based on input and dist strategy."""
-  first_x_value = tf.nest.flatten(inputs)[0]
-  if isinstance(first_x_value, np.ndarray):
-    num_samples = first_x_value.shape[0]
-    if validation_split and 0. < validation_split < 1.:
-      num_samples = int(num_samples * (1 - validation_split))
-    # Until support for partial batch is implemented across all
-    # functions and distribution strategy, we pass `mode` to selectively
-    # relax the constraint to consume all the training samples.
-    steps_per_epoch, batch_size = get_input_params(
-        strategy, num_samples, steps_per_epoch, batch_size, mode=mode)
-  return batch_size, steps_per_epoch
-
-
-def get_input_params(distribution_strategy,
-                     num_samples,
-                     steps,
-                     batch_size,
-                     mode=None):
-  """Calculate the number of batches and steps/steps_per_epoch.
-
-  Args:
-    distribution_strategy: The DistributionStrategy used to compile the model.
-    num_samples: The number of samples from which we determine the batch size
-      and steps.
-    steps: The specified number of steps.
-    batch_size: The specified batch_size.
-    mode: ModeKey representing whether input will be used for training,
-      evaluation, or prediction. This is used to relax the constraints on
-      consuming all the training samples to keep compatibility till we support
-      partial batches. If none, then partial batches are not allowed.
-
-  Returns:
-    steps: The steps or steps_per_epoch argument depending on if a user is
-      calling `fit`, `evaluate` or `predict`. If the is_training flag is set
-      we don't require the number of samples to be used completely.
-    batch_size: The batch size to be used in model iterations.
-
-  Raises:
-    ValueError: If the number of batches or steps evaluates to 0.
-
-  """
-  # TODO(b/118776054): Use global batch size for Keras/DS support.
-  # Currently this is only supported in TPUStrategy and CoreMirroredStrategy.
-  use_per_replica_batch = not dist_utils.global_batch_size_supported(
-      distribution_strategy)
-
-  # TODO(b/128995245): In eager mode, uneven batch sizes are allowed except for
-  # `fit()` on TPUStrategy.
-  # In graph mode, the zero batch case in batch norm is not handled due to
-  # XLA-GPU regression. Uneven batch sizes are not allowed except
-  # for `test()` and `predict()` on TPUStrategy.
-  if tf.executing_eagerly():
-    allow_partial_batch = (
-        mode != ModeKeys.TRAIN or
-        not backend.is_tpu_strategy(distribution_strategy))
-  else:
-    allow_partial_batch = (
-        mode == ModeKeys.TRAIN or
-        ((mode == ModeKeys.PREDICT or mode == ModeKeys.TEST) and
-         backend.is_tpu_strategy(distribution_strategy)))
-
-  if steps is None:
-    if batch_size is None:
-      # If neither the batch size or number of steps are set. We choose the
-      # global batch size as the minimum of number of samples and 32. 32 is
-      # chosen to provide backward compatibility.
-      global_batch_size = min(num_samples, 32)
+    """Returns whether a dataset contains a final partial batch."""
+    shapes = tf.nest.flatten(tf.compat.v1.data.get_output_shapes(dataset))
+    unknown_shapes = [s for s in shapes if not s.is_fully_defined()]
+    return not unknown_shapes
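A quick illustration of when a dataset's shapes are fully defined (the toy dataset is an assumption for illustration):

import tensorflow as tf

dataset = tf.data.Dataset.range(10).batch(3)
# 10 is not divisible by 3, so the final batch is partial and the static
# batch dimension is unknown.
print(tf.compat.v1.data.get_output_shapes(dataset))  # TensorShape([None])

# drop_remainder=True makes every batch shape fully defined.
dataset = tf.data.Dataset.range(10).batch(3, drop_remainder=True)
print(tf.compat.v1.data.get_output_shapes(dataset))  # TensorShape([3])
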
+
+
+def process_batch_and_step_size(
+    strategy, inputs, batch_size, steps_per_epoch, mode, validation_split=0.0
+):
+    """Process the batch size and step size based on input and dist strategy."""
+    first_x_value = tf.nest.flatten(inputs)[0]
+    if isinstance(first_x_value, np.ndarray):
+        num_samples = first_x_value.shape[0]
+        if validation_split and 0.0 < validation_split < 1.0:
+            num_samples = int(num_samples * (1 - validation_split))
+        # Until support for partial batch is implemented across all
+        # functions and distribution strategy, we pass `mode` to selectively
+        # relax the constraint to consume all the training samples.
+        steps_per_epoch, batch_size = get_input_params(
+            strategy, num_samples, steps_per_epoch, batch_size, mode=mode
+        )
+    return batch_size, steps_per_epoch
+
+
+def get_input_params(
+    distribution_strategy, num_samples, steps, batch_size, mode=None
+):
+    """Calculate the number of batches and steps/steps_per_epoch.
+
+    Args:
+        distribution_strategy: The DistributionStrategy used to compile the model.
+        num_samples: The number of samples from which we determine the batch size
+            and steps.
+        steps: The specified number of steps.
+        batch_size: The specified batch_size.
+        mode: ModeKey representing whether input will be used for training,
+            evaluation, or prediction. This is used to relax the constraints on
+            consuming all the training samples to keep compatibility till we
+            support partial batches. If None, then partial batches are not
+            allowed.
+
+    Returns:
+        steps: The steps or steps_per_epoch argument depending on whether a user
+            is calling `fit`, `evaluate` or `predict`. If the is_training flag
+            is set we don't require the number of samples to be used completely.
+        batch_size: The batch size to be used in model iterations.
+
+    Raises:
+        ValueError: If the number of batches or steps evaluates to 0.
+
+    """
+    # TODO(b/118776054): Use global batch size for Keras/DS support.
+    # Currently this is only supported in TPUStrategy and CoreMirroredStrategy.
+    use_per_replica_batch = not dist_utils.global_batch_size_supported(
+        distribution_strategy
+    )
+
+    # TODO(b/128995245): In eager mode, uneven batch sizes are allowed except
+    # for `fit()` on TPUStrategy.
+    # In graph mode, the zero batch case in batch norm is not handled due to
+    # XLA-GPU regression. Uneven batch sizes are not allowed except
+    # for `test()` and `predict()` on TPUStrategy.
+    if tf.executing_eagerly():
+        allow_partial_batch = (
+            mode != ModeKeys.TRAIN
+            or not backend.is_tpu_strategy(distribution_strategy)
+        )
     else:
-      # If the user provided the batch size we need to handle the case
-      # between different strategies that use the global/per-replica batch size
-      global_batch_size = batch_size
-      if use_per_replica_batch:
-        global_batch_size *= distribution_strategy.num_replicas_in_sync
-    if allow_partial_batch:
-      steps = np.ceil(num_samples / global_batch_size).astype(int)
+        allow_partial_batch = mode == ModeKeys.TRAIN or (
+            (mode == ModeKeys.PREDICT or mode == ModeKeys.TEST)
+            and backend.is_tpu_strategy(distribution_strategy)
+        )
+
+    if steps is None:
+        if batch_size is None:
+            # If neither the batch size nor the number of steps is set, we
+            # choose the global batch size as the minimum of the number of
+            # samples and 32. 32 is chosen to provide backward compatibility.
+            global_batch_size = min(num_samples, 32)
+        else:
+            # If the user provided the batch size we need to handle the case
+            # between different strategies that use the global/per-replica
+            # batch size
+            global_batch_size = batch_size
+            if use_per_replica_batch:
+                global_batch_size *= distribution_strategy.num_replicas_in_sync
+        if allow_partial_batch:
+            steps = np.ceil(num_samples / global_batch_size).astype(int)
+        else:
+            if num_samples % global_batch_size:
+                raise ValueError(
+                    "The number of samples %s is not divisible by "
+                    "batch size %s." % (num_samples, global_batch_size)
+                )
+            steps = num_samples // global_batch_size
     else:
-    if batch_size is None:
-      # We calculate the batch size based on the number of steps specified
-      if num_samples % steps:
-        raise ValueError('The number of samples %s is not divisible by '
-                         'steps %s. Please change the number of steps to a '
-                         'value that can consume all the samples' % (
-                             num_samples, steps))
-      global_batch_size = num_samples // steps
+        if batch_size is None:
+            # We calculate the batch size based on the number of steps specified
+            if num_samples % steps:
+                raise ValueError(
+                    "The number of samples %s is not divisible by "
+                    "steps %s. Please change the number of steps to a "
+                    "value that can consume all the samples"
+                    % (num_samples, steps)
+                )
+            global_batch_size = num_samples // steps
+        else:
+            # If the user provided the batch size we need to handle the case
+            # between different strategies that use the global/per-replica
+            # batch size
+            global_batch_size = batch_size
+            if use_per_replica_batch:
+                global_batch_size *= distribution_strategy.num_replicas_in_sync
+
+        min_num_samples = global_batch_size * steps
+        if allow_partial_batch:
+            min_num_samples = (
+                global_batch_size * (steps - 1) + 1 if steps > 1 else 0
+            )
+
+        if num_samples < min_num_samples:
+            raise ValueError(
+                "Number of samples %s is less than samples required "
+                "for specified batch_size %s and steps %s"
+                % (num_samples, global_batch_size, steps)
+            )
+
+    # We need to return the per replica or global batch size based on the
+    # strategy
+    if use_per_replica_batch:
+        if global_batch_size % distribution_strategy.num_replicas_in_sync:
+            raise ValueError(
+                "The batch size (%s) could not be sharded evenly across the "
+                "sync replicas (%s) in the distribution strategy."
+                % (
+                    global_batch_size,
+                    distribution_strategy.num_replicas_in_sync,
+                )
+            )
+        batch_size = (
+            global_batch_size // distribution_strategy.num_replicas_in_sync
+        )
     else:
-      # If the user provided the batch size we need to handle the case
-      # between different strategies that use the global/per-replica batch size
-      global_batch_size = batch_size
-      if use_per_replica_batch:
-        global_batch_size *= distribution_strategy.num_replicas_in_sync
-
-    min_num_samples = global_batch_size * steps
-    if allow_partial_batch:
-      min_num_samples = global_batch_size * (steps-1) + 1 if steps > 1 else 0
-
-    if num_samples < min_num_samples:
-      raise ValueError('Number of samples %s is less than samples required '
-                       'for specified batch_size %s and steps %s' % (
-                           num_samples, global_batch_size, steps))
-
-  # We need to return the per replica or global batch size based on the strategy
-  if use_per_replica_batch:
-    if global_batch_size % distribution_strategy.num_replicas_in_sync:
-      raise ValueError(
-          'The batch size (%s) could not be sharded evenly across the sync '
-          'replicas (%s) in the distribution strategy.' % (
-              global_batch_size, distribution_strategy.num_replicas_in_sync))
-    batch_size = global_batch_size // distribution_strategy.num_replicas_in_sync
-  else:
-    batch_size = global_batch_size
-
-  return steps, batch_size
+        batch_size = global_batch_size
+
+    return steps, batch_size
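A worked example of the arithmetic above, assuming a strategy that uses per-replica batch sizes with four replicas in sync (the numbers are illustrative):

import numpy as np

num_samples, batch_size, num_replicas = 1000, 8, 4

# Per-replica batch sizes are scaled up to a global batch size first.
global_batch_size = batch_size * num_replicas  # 32

# With partial batches allowed, steps rounds up; otherwise 1000 % 32 != 0
# would trigger the divisibility ValueError above.
steps = int(np.ceil(num_samples / global_batch_size))  # 32
print(global_batch_size, steps)
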


 def get_batch_dimension(iterator):
-  shapes = tf.nest.flatten(tf.compat.v1.data.get_output_shapes(iterator))
-  # Take the batch size from the first element, as it should be the same for
-  # all.
-  dims = shapes[0].dims
-  return dims[0] if dims else None
+    shapes = tf.nest.flatten(tf.compat.v1.data.get_output_shapes(iterator))
+    # Take the batch size from the first element, as it should be the same for
+    # all.
+    dims = shapes[0].dims
+    return dims[0] if dims else None


 def get_iterator(dataset, distribution_strategy):
-  with distribution_strategy.scope():
-    iterator = distribution_strategy.make_dataset_iterator(dataset)
-    initialize_iterator(iterator, distribution_strategy)
-  return iterator
+    with distribution_strategy.scope():
+        iterator = distribution_strategy.make_dataset_iterator(dataset)
+        initialize_iterator(iterator, distribution_strategy)
+    return iterator


 def initialize_iterator(iterator, distribution_strategy):
-  with distribution_strategy.scope():
-    init_op = tf.group(iterator.initializer)
-    if not tf.executing_eagerly():
-      backend.get_session((init_op,)).run(init_op)
+    with distribution_strategy.scope():
+        init_op = tf.group(iterator.initializer)
+        if not tf.executing_eagerly():
+            backend.get_session((init_op,)).run(init_op)


 def _get_input_from_iterator(iterator, model):
-  """Get elements from the iterator and verify the input shape and type."""
-  next_element = iterator.get_next()
-
-  # `len(nest.flatten(x))` is going to not count empty elements such as {}.
-  # len(nest.flatten([[0,1,2], {}])) is 3 and not 4. The `next_element` is
-  # going to get flattened in `_prepare_feed_values` to work around that. Empty
-  # elements are going to get filtered out as part of the flattening.
-  if len(tf.nest.flatten(next_element)) == len(model.inputs):
-    x = next_element
-    y = None
-    sample_weights = None
-  elif len(tf.nest.flatten(next_element)) == (len(model.inputs) +
-                                              len(model.outputs)):
-    x, y = next_element
-    sample_weights = None
-  else:
-    x, y, sample_weights = next_element
-
-  # Validate that all the elements in x and y are of the same type and shape.
-  validate_distributed_dataset_inputs(
-      model._distribution_strategy, x, y, sample_weights)
-  return x, y, sample_weights
+    """Get elements from the iterator and verify the input shape and type."""
+    next_element = iterator.get_next()
+
+    # `len(nest.flatten(x))` is going to not count empty elements such as {}.
+    # len(nest.flatten([[0,1,2], {}])) is 3 and not 4. The `next_element` is
+    # going to get flattened in `_prepare_feed_values` to work around that.
+    # Empty elements are going to get filtered out as part of the flattening.
+    if len(tf.nest.flatten(next_element)) == len(model.inputs):
+        x = next_element
+        y = None
+        sample_weights = None
+    elif len(tf.nest.flatten(next_element)) == (
+        len(model.inputs) + len(model.outputs)
+    ):
+        x, y = next_element
+        sample_weights = None
+    else:
+        x, y, sample_weights = next_element
+
+    # Validate that all the elements in x and y are of the same type and shape.
+    validate_distributed_dataset_inputs(
+        model._distribution_strategy, x, y, sample_weights
+    )
+    return x, y, sample_weights
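The flattening quirk mentioned in the comment above is easy to verify directly:

import tensorflow as tf

# Empty structures such as {} contribute nothing to the flattened element
# count, which is exactly what the comment in the code relies on.
print(len(tf.nest.flatten([[0, 1, 2], {}])))  # 3, not 4

# _get_input_from_iterator compares this count against len(model.inputs) and
# len(model.inputs) + len(model.outputs) to decide whether the dataset
# element is x, (x, y), or (x, y, sample_weights).
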
+ """ + strategy = model._distribution_strategy + inputs, targets, sample_weights = _get_input_from_iterator(inputs, model) + if backend.is_tpu_strategy(strategy): + if sample_weights is not None: + raise ValueError("TPUStrategy does not support sample weights.") + + # When the inputs are dict, then we want to flatten it in the same order as + # the input layers, such that the data are fed into the input layers in the + # correct order. + if isinstance(inputs, dict): + inputs = [inputs[key] for key in model._feed_input_names] + if is_distributing_by_cloning(model): + inputs = flatten_per_replica_values(strategy, inputs) + targets = flatten_per_replica_values(strategy, targets) + # Expand 1-dimensional inputs. + # TODO(b/124535720): Remove once this standarize data logic is shared + # with main flow. + inputs, targets = tf.nest.map_structure( + training_utils_v1.standardize_single_array, (inputs, targets) + ) + else: + inputs = training_utils_v1.ModelInputs(inputs).as_list() + + if mode == ModeKeys.PREDICT: + sample_weights = [] + targets = [] + elif sample_weights is not None and is_distributing_by_cloning(model): + if tf.executing_eagerly() and not model._compile_distribution: + raise NotImplementedError( + "`sample_weight` is not supported when using " + "tf.distribute.Strategy in eager mode and " + "cloning=True." + ) + sample_weights = flatten_per_replica_values(strategy, sample_weights) + + ins = [inputs, targets, sample_weights] + return tuple(ins) def is_distributing_by_cloning(model): - """Decide whether this model is going to be distributed via cloning. + """Decide whether this model is going to be distributed via cloning. - We are going to distribute the model by cloning in graph mode. + We are going to distribute the model by cloning in graph mode. - Args: - model: Keras model to distribute. + Args: + model: Keras model to distribute. - Returns: - True if the `model` is going to be distributed using cloning and False - otherwise. - """ - if (backend.is_tpu_strategy(model._distribution_strategy) and - tf.executing_eagerly): # b/137580852 - return False - elif tf.compat.v1.executing_eagerly_outside_functions(): - return bool(model._compile_distribution) - return True + Returns: + True if the `model` is going to be distributed using cloning and False + otherwise. + """ + if ( + backend.is_tpu_strategy(model._distribution_strategy) + and tf.executing_eagerly + ): # b/137580852 + return False + elif tf.compat.v1.executing_eagerly_outside_functions(): + return bool(model._compile_distribution) + return True def _custom_compile_for_predict(model): - """Custom compile for TPU predict mode.""" - if not model.built: - # Model is not compilable because it does not know its number of inputs - # and outputs, nor their shapes and names. We will compile after the first - # time the model gets called on training data. - return - model._is_compiled = True - model.total_loss = None - model.train_function = None - model.test_function = None - model.predict_function = None + """Custom compile for TPU predict mode.""" + if not model.built: + # Model is not compilable because it does not know its number of inputs + # and outputs, nor their shapes and names. We will compile after the + # first time the model gets called on training data. + return + model._is_compiled = True + model.total_loss = None + model.train_function = None + model.test_function = None + model.predict_function = None def _build_network_on_replica(model, mode, inputs=None, targets=None): - """Build an updated model on replicas. 


 def is_distributing_by_cloning(model):
-  """Decide whether this model is going to be distributed via cloning.
+    """Decide whether this model is going to be distributed via cloning.

-  We are going to distribute the model by cloning in graph mode.
+    We are going to distribute the model by cloning in graph mode.

-  Args:
-    model: Keras model to distribute.
+    Args:
+        model: Keras model to distribute.

-  Returns:
-    True if the `model` is going to be distributed using cloning and False
-    otherwise.
-  """
-  if (backend.is_tpu_strategy(model._distribution_strategy) and
-      tf.executing_eagerly):  # b/137580852
-    return False
-  elif tf.compat.v1.executing_eagerly_outside_functions():
-    return bool(model._compile_distribution)
-  return True
+    Returns:
+        True if the `model` is going to be distributed using cloning and False
+        otherwise.
+    """
+    if (
+        backend.is_tpu_strategy(model._distribution_strategy)
+        and tf.executing_eagerly
+    ):  # b/137580852
+        return False
+    elif tf.compat.v1.executing_eagerly_outside_functions():
+        return bool(model._compile_distribution)
+    return True


 def _custom_compile_for_predict(model):
-  """Custom compile for TPU predict mode."""
-  if not model.built:
-    # Model is not compilable because it does not know its number of inputs
-    # and outputs, nor their shapes and names. We will compile after the first
-    # time the model gets called on training data.
-    return
-  model._is_compiled = True
-  model.total_loss = None
-  model.train_function = None
-  model.test_function = None
-  model.predict_function = None
+    """Custom compile for TPU predict mode."""
+    if not model.built:
+        # Model is not compilable because it does not know its number of inputs
+        # and outputs, nor their shapes and names. We will compile after the
+        # first time the model gets called on training data.
+        return
+    model._is_compiled = True
+    model.total_loss = None
+    model.train_function = None
+    model.test_function = None
+    model.predict_function = None


 def _build_network_on_replica(model, mode, inputs=None, targets=None):
-  """Build an updated model on replicas.
-
-  We create a new Keras model while sharing the variables from the old graph.
-  Building a new sub-graph is required since the original keras model creates
-  placeholders for the input and the output that are not accessible till we
-  call iterator.get_next() inside the step_fn for `fit`/`evaluate`/`predict`.
-
-  The sharing of weights and layers between the old and the new model guarantee
-  that we're using Strategy variables and any updates on either model are
-  reflected correctly in callbacks and loop iterations.
-
-  We need to make sure we share the optimizers between the old and the new model
-  as well so that optimizer state is not lost if the user is running fit
-  multiple times.
-
-  Args:
-    model: Model to be replicated across Replicas
-    mode: Which of fit/eval/predict is building the distributed network
-    inputs: Input variables to be passed to the model
-    targets: Target tensor to be passed to model.compile
-
-  Returns:
-    A new model with shared layers with the old model.
-  """
-  # Need to do imports here since we run into a circular dependency error.
-  from keras import models  # pylint: disable=g-import-not-at-top
-  from keras.engine import sequential  # pylint: disable=g-import-not-at-top
-
-  # We rely on the internal methods to avoid having share_weights weights in the
-  # public API.
-  if isinstance(model, sequential.Sequential):
-    updated_model = models._clone_sequential_model(
-        model, input_tensors=inputs, layer_fn=models.share_weights)
-  else:
-    updated_model = models._clone_functional_model(
-        model, input_tensors=inputs, layer_fn=models.share_weights)
-    # Callable losses added directly to a functional Model need to be added
-    # here.
-    updated_model._callable_losses = model._callable_losses
-
-  # Recast all low precision outputs back to float32 since we only casted
-  # the inputs to bfloat16 and not targets. This is done so that we can preserve
-  # precision when calculating the loss value.
-  def _upcast_low_precision_outputs(output):
-    if output.dtype == tf.bfloat16:
-      return tf.cast(output, tf.float32)
+    """Build an updated model on replicas.
+
+    We create a new Keras model while sharing the variables from the old graph.
+    Building a new sub-graph is required since the original keras model creates
+    placeholders for the input and the output that are not accessible till we
+    call iterator.get_next() inside the step_fn for `fit`/`evaluate`/`predict`.
+
+    The sharing of weights and layers between the old and the new model
+    guarantees that we're using Strategy variables and any updates on either
+    model are reflected correctly in callbacks and loop iterations.
+
+    We need to make sure we share the optimizers between the old and the new
+    model as well so that optimizer state is not lost if the user is running fit
+    multiple times.
+
+    Args:
+        model: Model to be replicated across Replicas
+        mode: Which of fit/eval/predict is building the distributed network
+        inputs: Input variables to be passed to the model
+        targets: Target tensor to be passed to model.compile
+
+    Returns:
+        A new model with shared layers with the old model.
+    """
+    # Need to do imports here since we run into a circular dependency error.
+    from keras import models
+    from keras.engine import sequential
+
+    # We rely on the internal methods to avoid having share_weights weights in
+    # the public API.
+    if isinstance(model, sequential.Sequential):
+        updated_model = models._clone_sequential_model(
+            model, input_tensors=inputs, layer_fn=models.share_weights
+        )
+    else:
+        updated_model = models._clone_functional_model(
+            model, input_tensors=inputs, layer_fn=models.share_weights
+        )
+        # Callable losses added directly to a functional Model need to be added
+        # here.
+        updated_model._callable_losses = model._callable_losses
+
+    # Recast all low precision outputs back to float32 since we only cast the
+    # inputs to bfloat16 and not targets. This is done so that we can preserve
+    # precision when calculating the loss value.
+    def _upcast_low_precision_outputs(output):
+        if output.dtype == tf.bfloat16:
+            return tf.cast(output, tf.float32)
+        else:
+            return output
+
+    updated_model.outputs = [
+        _upcast_low_precision_outputs(o) for o in updated_model.outputs
+    ]
+
+    if isinstance(targets, tuple):
+        targets = tf.nest.flatten(targets)
+
+    if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
+        _custom_compile_for_predict(updated_model)
     else:
-      return output
-  updated_model.outputs = [_upcast_low_precision_outputs(o)
-                           for o in updated_model.outputs]
-
-  if isinstance(targets, tuple):
-    targets = tf.nest.flatten(targets)
-
-  if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
-    _custom_compile_for_predict(updated_model)
-  else:
-    updated_model.compile(
-        model.optimizer,
-        model.loss,
-        metrics=metrics_module.clone_metrics(model._compile_metrics),
-        loss_weights=model.loss_weights,
-        sample_weight_mode=model.sample_weight_mode,
-        weighted_metrics=metrics_module.clone_metrics(
-            model._compile_weighted_metrics),
-        target_tensors=targets)
-  return updated_model
-
-
-def _build_distributed_network(model, strategy, mode, inputs=None,
-                               targets=None):
-  """Create a cloned model on each replica."""
-  with backend.get_graph().as_default(), strategy.scope():
-    distributed_model = strategy.extended.call_for_each_replica(
-        _build_network_on_replica,
-        args=(model, mode, inputs, targets))
-    set_distributed_model(model, mode, distributed_model)
+        updated_model.compile(
+            model.optimizer,
+            model.loss,
+            metrics=metrics_module.clone_metrics(model._compile_metrics),
+            loss_weights=model.loss_weights,
+            sample_weight_mode=model.sample_weight_mode,
+            weighted_metrics=metrics_module.clone_metrics(
+                model._compile_weighted_metrics
+            ),
+            target_tensors=targets,
+        )
+    return updated_model


+def _build_distributed_network(
+    model, strategy, mode, inputs=None, targets=None
+):
+    """Create a cloned model on each replica."""
+    with backend.get_graph().as_default(), strategy.scope():
+        distributed_model = strategy.extended.call_for_each_replica(
+            _build_network_on_replica, args=(model, mode, inputs, targets)
+        )
+        set_distributed_model(model, mode, distributed_model)
This is done so that we can preserve - # precision when calculating the loss value. - def _upcast_low_precision_outputs(output): - if output.dtype == tf.bfloat16: - return tf.cast(output, tf.float32) + """Clone and build the given keras_model.""" + # We need to set the import here since we run into a circular dependency + # error. + from keras import models + + cloned_model = models.clone_model(model, input_tensors=inputs) + + # Compile and build model. + if isinstance(model.optimizer, optimizers.TFOptimizer): + optimizer = model.optimizer else: - return output - cloned_model.outputs = [_upcast_low_precision_outputs(o) - for o in cloned_model.outputs] - - if isinstance(targets, tuple): - targets = tf.nest.flatten(targets) - if mode == ModeKeys.PREDICT and inputs is not None: # TPU predict case - _custom_compile_for_predict(cloned_model) - else: - cloned_model.compile( - optimizer, - model.loss, - metrics=metrics_module.clone_metrics(model._compile_metrics), - loss_weights=model.loss_weights, - sample_weight_mode=model.sample_weight_mode, - weighted_metrics=metrics_module.clone_metrics( - model._compile_weighted_metrics), - target_tensors=targets) - return cloned_model + optimizer_config = model.optimizer.get_config() + optimizer = model.optimizer.__class__.from_config(optimizer_config) + + # Recast all low precision outputs back to float32 since we only casted + # the inputs to bfloat16 and not targets. This is done so that we can + # preserve precision when calculating the loss value. + def _upcast_low_precision_outputs(output): + if output.dtype == tf.bfloat16: + return tf.cast(output, tf.float32) + else: + return output + + cloned_model.outputs = [ + _upcast_low_precision_outputs(o) for o in cloned_model.outputs + ] + + if isinstance(targets, tuple): + targets = tf.nest.flatten(targets) + if mode == ModeKeys.PREDICT and inputs is not None: # TPU predict case + _custom_compile_for_predict(cloned_model) + else: + cloned_model.compile( + optimizer, + model.loss, + metrics=metrics_module.clone_metrics(model._compile_metrics), + loss_weights=model.loss_weights, + sample_weight_mode=model.sample_weight_mode, + weighted_metrics=metrics_module.clone_metrics( + model._compile_weighted_metrics + ), + target_tensors=targets, + ) + return cloned_model def clone_model_on_replicas(model, strategy, mode, inputs=None, targets=None): - """Create a cloned model on each replica.""" - with backend.get_graph().as_default(), strategy.scope(): - distributed_model = strategy.extended.call_for_each_replica( - _clone_and_build_model, args=(model, mode, inputs, targets)) - set_distributed_model(model, mode, distributed_model) - if mode == ModeKeys.TRAIN: - model._make_callback_model(distributed_model) + """Create a cloned model on each replica.""" + with backend.get_graph().as_default(), strategy.scope(): + distributed_model = strategy.extended.call_for_each_replica( + _clone_and_build_model, args=(model, mode, inputs, targets) + ) + set_distributed_model(model, mode, distributed_model) + if mode == ModeKeys.TRAIN: + model._make_callback_model(distributed_model) def _make_execution_function(model, mode): - """Makes or reuses function to run one step of distributed model execution.""" - if is_distributing_by_cloning(model): - return _make_execution_function_with_cloning(model, mode) + """Makes or reuses function to run one step of distributed model + execution.""" + if is_distributing_by_cloning(model): + return _make_execution_function_with_cloning(model, mode) - distributed_function = 


 def clone_model_on_replicas(model, strategy, mode, inputs=None, targets=None):
-  """Create a cloned model on each replica."""
-  with backend.get_graph().as_default(), strategy.scope():
-    distributed_model = strategy.extended.call_for_each_replica(
-        _clone_and_build_model, args=(model, mode, inputs, targets))
-    set_distributed_model(model, mode, distributed_model)
-  if mode == ModeKeys.TRAIN:
-    model._make_callback_model(distributed_model)
+    """Create a cloned model on each replica."""
+    with backend.get_graph().as_default(), strategy.scope():
+        distributed_model = strategy.extended.call_for_each_replica(
+            _clone_and_build_model, args=(model, mode, inputs, targets)
+        )
+        set_distributed_model(model, mode, distributed_model)
+    if mode == ModeKeys.TRAIN:
+        model._make_callback_model(distributed_model)


 def _make_execution_function(model, mode):
-  """Makes or reuses function to run one step of distributed model execution."""
-  if is_distributing_by_cloning(model):
-    return _make_execution_function_with_cloning(model, mode)
+    """Makes or reuses function to run one step of distributed model
+    execution."""
+    if is_distributing_by_cloning(model):
+        return _make_execution_function_with_cloning(model, mode)

-  distributed_function = get_distributed_function(model, mode)
-  if distributed_function:
-    return distributed_function
+    distributed_function = get_distributed_function(model, mode)
+    if distributed_function:
+        return distributed_function

-  distribution_function = _make_execution_function_without_cloning(model, mode)
-  set_distributed_function(model, mode, distribution_function)
-  return distribution_function
+    distribution_function = _make_execution_function_without_cloning(
+        model, mode
+    )
+    set_distributed_function(model, mode, distribution_function)
+    return distribution_function


 def _make_execution_function_without_cloning(model, mode):
-  """Creates a function to run one step of distributed model execution."""
-  strategy = model._distribution_strategy
-
-  with strategy.scope():
-    per_replica_function = _make_replica_execution_function(model, mode)
-
-    def distributed_function(input_fn):
-      """A single step of the distributed execution across replicas."""
-      x, y, sample_weights = input_fn()
-      # Call `Model.{train,test,predict}_on_batch` on every replica passing
-      # PerReplicas as arguments. On every replica inside this call, each
-      # PerReplica object will return the value for that replica. The outputs
-      # are PerReplicas too.
-      outputs = strategy.run(per_replica_function, args=(x, y, sample_weights))
-      # Out of PerReplica outputs reduce or pick values to return.
-      all_outputs = unwrap_outputs(
-          strategy, outputs, with_loss_tensor=(mode != ModeKeys.PREDICT))
-      return all_outputs
-
-    if not model.run_eagerly:
-      distributed_function = tf.function(distributed_function)
-      def execution_function(input_fn):
-        # `numpy` translates Tensors to values in Eager mode.
-        return [out.numpy() for out in distributed_function(input_fn)]
-    else:
-      execution_function = distributed_function
-
-    return execution_function
+    """Creates a function to run one step of distributed model execution."""
+    strategy = model._distribution_strategy
+
+    with strategy.scope():
+        per_replica_function = _make_replica_execution_function(model, mode)
+
+        def distributed_function(input_fn):
+            """A single step of the distributed execution across replicas."""
+            x, y, sample_weights = input_fn()
+            # Call `Model.{train,test,predict}_on_batch` on every replica
+            # passing PerReplicas as arguments. On every replica inside this
+            # call, each PerReplica object will return the value for that
+            # replica. The outputs are PerReplicas too.
+            outputs = strategy.run(
+                per_replica_function, args=(x, y, sample_weights)
+            )
+            # Out of PerReplica outputs reduce or pick values to return.
+            all_outputs = unwrap_outputs(
+                strategy, outputs, with_loss_tensor=(mode != ModeKeys.PREDICT)
+            )
+            return all_outputs

+        if not model.run_eagerly:
+            distributed_function = tf.function(distributed_function)
+
+            def execution_function(input_fn):
+                # `numpy` translates Tensors to values in Eager mode.
+                return [out.numpy() for out in distributed_function(input_fn)]
+
+        else:
+            execution_function = distributed_function
+
+        return execution_function
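The step function above follows the standard `tf.function` + `strategy.run` pattern. A self-contained sketch of that pattern with public APIs (the single-CPU strategy and toy step are illustrative assumptions):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy(["CPU:0"])

@tf.function  # graph-compile the distributed step, as the code above does
def distributed_step(batch):
    def step_fn(x):
        return tf.reduce_mean(x)

    per_replica = strategy.run(step_fn, args=(batch,))
    # Reduce or pick per-replica values to return, mirroring unwrap_outputs.
    return strategy.reduce(
        tf.distribute.ReduceOp.MEAN, per_replica, axis=None
    )

print(distributed_step(tf.ones([8, 2])).numpy())  # 1.0
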


 def _make_replica_execution_function(model, mode):
-  """A single step of the distributed execution on a replica."""
-  if mode == ModeKeys.TRAIN:
-    func = model.train_on_batch
-  elif mode == ModeKeys.TEST:
-    func = model.test_on_batch
-  else:
+    """A single step of the distributed execution on a replica."""
+    if mode == ModeKeys.TRAIN:
+        func = model.train_on_batch
+    elif mode == ModeKeys.TEST:
+        func = model.test_on_batch
+    else:

-    def predict_on_batch(x, y=None, sample_weights=None):
-      del y, sample_weights
-      return model.predict_on_batch(x)
+        def predict_on_batch(x, y=None, sample_weights=None):
+            del y, sample_weights
+            return model.predict_on_batch(x)

-    func = predict_on_batch
+        func = predict_on_batch

-  if mode != ModeKeys.PREDICT:
-    # `reset_metrics` is set to False to maintain stateful metrics across
-    # batch-level calls.
-    func = functools.partial(func, reset_metrics=False)
+    if mode != ModeKeys.PREDICT:
+        # `reset_metrics` is set to False to maintain stateful metrics across
+        # batch-level calls.
+        func = functools.partial(func, reset_metrics=False)

-  return func
+    return func


 def _make_replicated_models_with_cloning(model, mode):
-  """Build models on each replica."""
-  strategy = model._distribution_strategy
+    """Build models on each replica."""
+    strategy = model._distribution_strategy

-  # If distributed_model is not built, create one for `mode`.
-  if model._compile_distribution:
-    clone_model_on_replicas(model, strategy, mode)
-  else:
-    _build_distributed_network(model, strategy, mode)
+    # If distributed_model is not built, create one for `mode`.
+    if model._compile_distribution:
+        clone_model_on_replicas(model, strategy, mode)
+    else:
+        _build_distributed_network(model, strategy, mode)


 def _make_execution_function_with_cloning(model, mode):
-  """Clones or re-uses models to run one step of distributed model execution."""
-  # TODO(b/134069401): Create a cache for the distributed model and exec
-  # function that incorporates additional attributes to be part of the cache key
-  # than just the mode.
-  # If distributed model for a particular `mode` is already built, use the
-  # `_distribution_function` on that distributed model.
-  # If you have updated the sample_weight_mode on the model, then you will need
-  # to recompile metrics and recreate the execution function. This is indicated
-  # by the `_recompile_exec_function` property.
-  if (distributed_model and hasattr(distributed_model, '_distribution_function')
-      and not (hasattr(distributed_model, '_recompile_exec_function') and
-               distributed_model._recompile_exec_function)):
-    return distributed_model._distributed_function
-
-  if not distributed_model:
-    _make_replicated_models_with_cloning(model, mode)
+    """Clones or re-uses models to run one step of distributed model
+    execution."""
     distributed_model = get_distributed_model(model, mode)
-  assert distributed_model
+    # TODO(b/134069401): Create a cache for the distributed model and exec
+    # function that incorporates additional attributes to be part of the cache
+    # key than just the mode.
+    # If distributed model for a particular `mode` is already built, use the
+    # `_distribution_function` on that distributed model.
+    # If you have updated the sample_weight_mode on the model, then you will
+    # need to recompile metrics and recreate the execution function. This is
+    # indicated by the `_recompile_exec_function` property.
+    if (
+        distributed_model
+        and hasattr(distributed_model, "_distribution_function")
+        and not (
+            hasattr(distributed_model, "_recompile_exec_function")
+            and distributed_model._recompile_exec_function
+        )
+    ):
+        return distributed_model._distributed_function

-  # Also create an execution function on that distributed model.
-  if tf.executing_eagerly():
-    distributed_function = _make_eager_execution_function(model, mode)
-  else:
-    distributed_function = _make_graph_execution_function(model, mode)
+    if not distributed_model:
+        _make_replicated_models_with_cloning(model, mode)
+        distributed_model = get_distributed_model(model, mode)
+    assert distributed_model

-  # We cache the distributed execution function on the model since creating
-  # distributed models and execution functions are expensive.
-  distributed_model._distributed_function = distributed_function
-  distributed_model._recompile_exec_function = False
-  return distributed_function
+    # Also create an execution function on that distributed model.
+    if tf.executing_eagerly():
+        distributed_function = _make_eager_execution_function(model, mode)
+    else:
+        distributed_function = _make_graph_execution_function(model, mode)
+
+    # We cache the distributed execution function on the model since creating
+    # distributed models and execution functions are expensive.
+    distributed_model._distributed_function = distributed_function
+    distributed_model._recompile_exec_function = False
+    return distributed_function


 def _make_graph_execution_function(model, mode):
-  """Makes function to run one step of distributed model in graph mode."""
-
-  def _per_replica_function(model):
-    f = model._make_execution_function(mode)
-    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
-
-  strategy = model._distribution_strategy
-  with strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_replica_fit_function`.
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = strategy.extended.call_for_each_replica(
-         _per_replica_function, args=(get_distributed_model(model, mode),))
-
-    # Initialize the variables in the replicated model. This is necessary for
-    # multi-worker training because on some workers, initialization is not
-    # needed. This method does initialization or waiting for initialization
-    # according to the context object of distribute coordinator.
-    init_restore_or_wait_for_variables()
-
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of update ops on
-    # all the devices over which the model is distributed.
-    (all_inputs, all_outputs, all_updates, all_session_args) = unwrap_values(
-        strategy,
-        grouped_inputs,
-        grouped_outputs,
-        grouped_updates,
-        grouped_session_args,
-        with_loss_tensor=(mode != ModeKeys.PREDICT))
-
-    return backend.function(
-        all_inputs,
-        all_outputs,
-        updates=all_updates,
-        name='distributed_{}_function'.format(mode),
-        **all_session_args)
+    """Makes function to run one step of distributed model in graph mode."""
+
+    def _per_replica_function(model):
+        f = model._make_execution_function(mode)
+        return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
+
+    strategy = model._distribution_strategy
+    with strategy.scope():
+        # Create train ops on each of the devices when we call
+        # `_per_replica_fit_function`.
+        (
+            grouped_inputs,
+            grouped_outputs,
+            grouped_updates,
+            grouped_session_args,
+        ) = strategy.extended.call_for_each_replica(
+            _per_replica_function, args=(get_distributed_model(model, mode),)
+        )
+
+        # Initialize the variables in the replicated model. This is necessary
+        # for multi-worker training because on some workers, initialization is
+        # not needed. This method does initialization or waiting for
+        # initialization according to the context object of distribute
+        # coordinator.
+        init_restore_or_wait_for_variables()
+
+        # Unwrap all the per device values returned from
+        # `call_for_each_replica`. Unwrapping per device values gives you a
+        # list of values that can be used to construct a new train function that
+        # is composed of update ops on all the devices over which the model is
+        # distributed.
+        (
+            all_inputs,
+            all_outputs,
+            all_updates,
+            all_session_args,
+        ) = unwrap_values(
+            strategy,
+            grouped_inputs,
+            grouped_outputs,
+            grouped_updates,
+            grouped_session_args,
+            with_loss_tensor=(mode != ModeKeys.PREDICT),
+        )
+
+        return backend.function(
+            all_inputs,
+            all_outputs,
+            updates=all_updates,
+            name=f"distributed_{mode}_function",
+            **all_session_args,
+        )


 def _make_eager_execution_function(model, mode):
-  """Makes function to run one step of distributed model eager execution."""
-  def _per_replica_function(model):
-    f = model._make_execution_function(mode)
-    return (f.inputs, f.outputs)
-
-  # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
-  # the global one.
-  strategy = model._distribution_strategy
-  global_graph = backend.get_graph()
-
-  with global_graph.as_default(), strategy.scope():
-    # First we gather the relevant portions of the model across all replicas.
-    # `backend._scratch_graph(global_graph)` signals to Keras that it should not
-    # lift to a separate graph when creating the per-replica functions.
+    """Makes function to run one step of distributed model eager execution."""
+
+    def _per_replica_function(model):
+        f = model._make_execution_function(mode)
+        return (f.inputs, f.outputs)
+
+    # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of
+    # using the global one.
+    strategy = model._distribution_strategy
+    global_graph = backend.get_graph()
+
+    with global_graph.as_default(), strategy.scope():
+        # First we gather the relevant portions of the model across all
+        # replicas. `backend._scratch_graph(global_graph)` signals to Keras
+        # that it should not lift to a separate graph when creating the
+        # per-replica functions.
+        with backend._scratch_graph(global_graph):
+            # Create train ops on each of the devices when we call
+            # `_per_replica_fit_function`.
+            grouped = strategy.extended.call_for_each_replica(
+                _per_replica_function,
+                args=(get_distributed_model(model, mode),),
+            )
+            grouped_inputs, grouped_outputs = grouped
+
+            # Unwrap all the per device values returned from
+            # `call_for_each_replica`. Unwrapping per device values gives you a
+            # list of values that can be used to construct a new train function
+            # that is composed of inputs/outputs on all the devices over which
+            # the model is distributed.
+            (all_inputs, all_outputs, _, _) = unwrap_values(
+                strategy,
+                grouped_inputs,
+                grouped_outputs,
+                with_loss_tensor=(mode != ModeKeys.PREDICT),
+            )
+
+        # Finally, a joint Keras function is created; this one will be created
+        # in a separate FuncGraph.
+        return backend.function(
+            all_inputs,
+            all_outputs,
+            name=f"eager_distributed_{mode}_function",
+        )


 def _copy_weights_to_distributed_model(original_model, mode):
-  """Copies weights from original model to distributed models."""
-  strategy = original_model._distribution_strategy
-  distributed_model = get_distributed_model(original_model, mode)
-  if strategy:
-    # Copy the weights from the original model to each of the replicated
-    # models.
-    orig_model_weights = original_model.get_weights()
-    first_model = strategy.unwrap(distributed_model)[0]
-    set_weights(strategy, first_model, orig_model_weights)
+    """Copies weights from original model to distributed models."""
+    strategy = original_model._distribution_strategy
+    distributed_model = get_distributed_model(original_model, mode)
+    if strategy:
+        # Copy the weights from the original model to each of the replicated
+        # models.
+        orig_model_weights = original_model.get_weights()
+        first_model = strategy.unwrap(distributed_model)[0]
+        set_weights(strategy, first_model, orig_model_weights)


 def _copy_weights_to_original_model(model, mode):
-  """Copies weights from first distributed model back to original model."""
-  if model._distribution_strategy and mode == ModeKeys.TRAIN:
-    distributed_model = get_distributed_model(model, mode)
-    updated_weights = model._distribution_strategy.unwrap(
-        distributed_model)[0].get_weights()
-    model.set_weights(updated_weights)
+    """Copies weights from first distributed model back to original model."""
+    if model._distribution_strategy and mode == ModeKeys.TRAIN:
+        distributed_model = get_distributed_model(model, mode)
+        updated_weights = model._distribution_strategy.unwrap(
+            distributed_model
+        )[0].get_weights()
+        model.set_weights(updated_weights)


 def _per_replica_aggregate_batch(strategy, batch_outs, model, mode):
-  """Aggregates the per-replica batch-level outputs from a distributed step."""
-  if strategy is not None and mode == ModeKeys.PREDICT:
-    total_batch_outs = []
-    for i in range(len(model.outputs)):
-      num_replicas = strategy.num_replicas_in_sync
-      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
-      total_batch_outs.append(
-          concat_along_batch_dimension(tf.nest.flatten(nested_outs)))
-    return total_batch_outs
-  return batch_outs
+    """Aggregates the per-replica batch-level outputs from a distributed
+    step."""
+    if strategy is not None and mode == ModeKeys.PREDICT:
+        total_batch_outs = []
+        for i in range(len(model.outputs)):
+            num_replicas = strategy.num_replicas_in_sync
+            nested_outs = batch_outs[
+                i * num_replicas : i * num_replicas + num_replicas
+            ]
+            total_batch_outs.append(
+                concat_along_batch_dimension(tf.nest.flatten(nested_outs))
+            )
+        return total_batch_outs
+    return batch_outs
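The indexing above assumes per-replica outputs arrive interleaved per model output. A numpy-only sketch of that layout (two replicas and two outputs are illustrative assumptions):

import numpy as np

num_replicas, model_outputs = 2, ["out0", "out1"]
# Layout assumed by the slicing above:
# [out0_replica0, out0_replica1, out1_replica0, out1_replica1]
batch_outs = [np.ones([4, 1]) * i for i in range(4)]

total = []
for i in range(len(model_outputs)):
    nested = batch_outs[i * num_replicas : (i + 1) * num_replicas]
    total.append(np.concatenate(nested))  # stitch the batch dimension back
print([t.shape for t in total])  # [(8, 1), (8, 1)]
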


 def _reset_metrics(model):
-  if model._distribution_strategy:
-    for mode in [ModeKeys.TRAIN, ModeKeys.TEST, ModeKeys.PREDICT]:
-      distributed_model = get_distributed_model(model, mode)
-      if distributed_model:
-        first_model = model._distribution_strategy.unwrap(distributed_model)[0]
-        first_model.reset_metrics()
+    if model._distribution_strategy:
+        for mode in [ModeKeys.TRAIN, ModeKeys.TEST, ModeKeys.PREDICT]:
+            distributed_model = get_distributed_model(model, mode)
+            if distributed_model:
+                first_model = model._distribution_strategy.unwrap(
+                    distributed_model
+                )[0]
+                first_model.reset_metrics()


 def get_distributed_model(model, mode):
-  key = _generate_cache_key(mode)
-  return model._distributed_model_cache.get(key, None)
+    key = _generate_cache_key(mode)
+    return model._distributed_model_cache.get(key, None)


 def set_distributed_model(model, mode, distributed_model):
-  key = _generate_cache_key(mode)
-  model._distributed_model_cache[key] = distributed_model
+    key = _generate_cache_key(mode)
+    model._distributed_model_cache[key] = distributed_model


 def get_distributed_function(model, mode):
-  key = _generate_cache_key(mode)
-  return model._distributed_function_cache.get(key, None)
+    key = _generate_cache_key(mode)
+    return model._distributed_function_cache.get(key, None)


 def set_distributed_function(model, mode, distributed_function):
-  key = _generate_cache_key(mode)
-  model._distributed_function_cache[key] = distributed_function
+    key = _generate_cache_key(mode)
+    model._distributed_function_cache[key] = distributed_function


 def _generate_cache_key(mode):
-  key = hash(mode)
-  return key
+    key = hash(mode)
+    return key


 @tf_contextlib.contextmanager
 def distributed_scope(strategy, learning_phase):
-  with strategy.scope(), backend.learning_phase_scope(learning_phase):
-    yield
+    with strategy.scope(), backend.learning_phase_scope(learning_phase):
+        yield


 def is_current_worker_chief():
-  return dc.get_current_worker_context().is_chief
+    return dc.get_current_worker_context().is_chief


 def filter_distributed_callbacks(callbacks_list, model):
-  """Filter Callbacks based on the worker context when running multi-worker.
-
-  Args:
-    callbacks_list: A list of `Callback` instances.
-    model: Keras model instance.
-
-  Returns:
-    The list of `Callback` instances that should be run on this worker.
-  """
-
-  if not model._in_multi_worker_mode():
-    raise ValueError(
-        'filter_distributed_callbacks() should only be called when Keras '
-        'is in multi worker mode.')
-
-  callbacks_list = callbacks_list or []
-  if not [
-      c for c in callbacks_list if isinstance(c, callbacks.ModelCheckpoint)
-  ]:
-    # TODO(rchao): Consider providing a ModelCheckpoint here if the user
-    # fails to (possibly with tempfile directory).
-    logging.warning('ModelCheckpoint callback is not provided. '
-                    'Workers will need to restart training if any fails.')
-
-  if callbacks_list is None or is_current_worker_chief():
-    return callbacks_list
-
-  # Some Callbacks should only run on the chief worker.
-  return [
-      callback for callback in callbacks_list if not callback._chief_worker_only
-  ]  # pylint: disable=protected-access
+    """Filter Callbacks based on the worker context when running multi-worker.
+
+    Args:
+        callbacks_list: A list of `Callback` instances.
+        model: Keras model instance.
+
+    Returns:
+        The list of `Callback` instances that should be run on this worker.
+    """
+
+    if not model._in_multi_worker_mode():
+        raise ValueError(
+            "filter_distributed_callbacks() should only be called when Keras "
+            "is in multi worker mode."
+        )
+
+    callbacks_list = callbacks_list or []
+    if not [
+        c for c in callbacks_list if isinstance(c, callbacks.ModelCheckpoint)
+    ]:
+        # TODO(rchao): Consider providing a ModelCheckpoint here if the user
+        # fails to (possibly with tempfile directory).
+        logging.warning(
+            "ModelCheckpoint callback is not provided. "
+            "Workers will need to restart training if any fails."
+        )
+
+    if callbacks_list is None or is_current_worker_chief():
+        return callbacks_list
+
+    # Some Callbacks should only run on the chief worker.
+    return [
+        callback
+        for callback in callbacks_list
+        if not callback._chief_worker_only
+    ]
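A pure-Python sketch of the chief-worker filtering above (the toy classes and the `_chief_worker_only` flag values are assumptions for illustration; in Keras the flag is a private attribute on `Callback` subclasses):

class Callback:
    _chief_worker_only = False

class Checkpoint(Callback):
    _chief_worker_only = True  # e.g. only the chief should write checkpoints

def filter_for_worker(callbacks_list, is_chief):
    # Mirrors the filtering above: non-chief workers drop chief-only callbacks.
    if is_chief:
        return callbacks_list
    return [c for c in callbacks_list if not c._chief_worker_only]

print(len(filter_for_worker([Callback(), Checkpoint()], is_chief=False)))  # 1
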
+ """ + + if not model._in_multi_worker_mode(): + raise ValueError( + "filter_distributed_callbacks() should only be called when Keras " + "is in multi worker mode." + ) + + callbacks_list = callbacks_list or [] + if not [ + c for c in callbacks_list if isinstance(c, callbacks.ModelCheckpoint) + ]: + # TODO(rchao): Consider providing a ModelCheckpoint here if the user + # fails to (possibly with tempfile directory). + logging.warning( + "ModelCheckpoint callback is not provided. " + "Workers will need to restart training if any fails." + ) + + if callbacks_list is None or is_current_worker_chief(): + return callbacks_list + + # Some Callbacks should only run on the chief worker. + return [ + callback + for callback in callbacks_list + if not callback._chief_worker_only + ] def _update_sample_weight_modes(model, mode, sample_weights): - """Update sample_weight_mode of the distributed model.""" - if is_distributing_by_cloning(model): - distributed_model = get_distributed_model(model, mode) - if not distributed_model: - _make_replicated_models_with_cloning(model, mode) - distributed_model = get_distributed_model(model, mode) - distributed_model._recompile_exec_function = any( - [e.sample_weights_mismatch() for e in model._training_endpoints]) - - if sample_weights: - distributed_models = flatten_per_replica_values( - model._distribution_strategy, distributed_model) - # sample_weights is a tuple of 1 list where the number of elements in the - # list is equal to the number of replicas in sync. - sample_weights = sample_weights[0] - if sample_weights and None not in sample_weights: - for m, sw in zip(distributed_models, sample_weights): - m._update_sample_weight_modes(sample_weights=[sw]) + """Update sample_weight_mode of the distributed model.""" + if is_distributing_by_cloning(model): + distributed_model = get_distributed_model(model, mode) + if not distributed_model: + _make_replicated_models_with_cloning(model, mode) + distributed_model = get_distributed_model(model, mode) + distributed_model._recompile_exec_function = any( + [e.sample_weights_mismatch() for e in model._training_endpoints] + ) + + if sample_weights: + distributed_models = flatten_per_replica_values( + model._distribution_strategy, distributed_model + ) + # sample_weights is a tuple of 1 list where the number of elements + # in the list is equal to the number of replicas in sync. 
+ sample_weights = sample_weights[0] + if sample_weights and None not in sample_weights: + for m, sw in zip(distributed_models, sample_weights): + m._update_sample_weight_modes(sample_weights=[sw]) def concat_along_batch_dimension(outputs): - """Concats prediction outputs along the batch dimension.""" - if isinstance(outputs[0], tf.SparseTensor): - return tf.sparse.concat(axis=0, sp_inputs=outputs) - if isinstance(outputs[0], tf.RaggedTensor): - return tf.concat(outputs, axis=0) - return np.concatenate(outputs) + """Concats prediction outputs along the batch dimension.""" + if isinstance(outputs[0], tf.SparseTensor): + return tf.sparse.concat(axis=0, sp_inputs=outputs) + if isinstance(outputs[0], tf.RaggedTensor): + return tf.concat(outputs, axis=0) + return np.concatenate(outputs) diff --git a/keras/distribute/keras_correctness_test_base.py b/keras/distribute/keras_correctness_test_base.py index 1f131128a234..1e5501654ecd 100644 --- a/keras/distribute/keras_correctness_test_base.py +++ b/keras/distribute/keras_correctness_test_base.py @@ -14,16 +14,18 @@ # ============================================================================== """Correctness tests for tf.keras using DistributionStrategy.""" -import tensorflow.compat.v2 as tf - import functools -from absl.testing import parameterized + import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras from keras.distribute import distributed_training_utils from keras.distribute.strategy_combinations import all_strategies -from keras.distribute.strategy_combinations import multi_worker_mirrored_strategies +from keras.distribute.strategy_combinations import ( + multi_worker_mirrored_strategies, +) from keras.distribute.strategy_combinations import strategies_minus_tpu from keras.mixed_precision import policy from keras.utils import data_utils @@ -37,583 +39,672 @@ def eager_mode_test_configuration(): - return tf.__internal__.test.combinations.combine( - mode='eager', use_numpy=[True, False], use_validation_data=[True, False]) + return tf.__internal__.test.combinations.combine( + mode="eager", use_numpy=[True, False], use_validation_data=[True, False] + ) def graph_mode_test_configuration(): - return tf.__internal__.test.combinations.combine( - mode='graph', use_numpy=[True, False], use_validation_data=[True, False]) + return tf.__internal__.test.combinations.combine( + mode="graph", use_numpy=[True, False], use_validation_data=[True, False] + ) def all_strategy_and_input_config_combinations(): - return (tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(distribution=all_strategies), - eager_mode_test_configuration() + graph_mode_test_configuration())) + return tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine(distribution=all_strategies), + eager_mode_test_configuration() + graph_mode_test_configuration(), + ) def all_strategy_and_input_config_combinations_eager(): - return (tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(distribution=all_strategies), - eager_mode_test_configuration())) + return tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine(distribution=all_strategies), + eager_mode_test_configuration(), + ) def strategy_minus_tpu_and_input_config_combinations_eager(): - return (tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(distribution=strategies_minus_tpu), - eager_mode_test_configuration())) + return 
tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + distribution=strategies_minus_tpu + ), + eager_mode_test_configuration(), + ) def strategies_for_embedding_models(): - """Returns distribution strategies to test for embedding models. + """Returns distribution strategies to test for embedding models. - Since embedding models take longer to train, we disregard DefaultStrategy - in order to prevent testing timeouts. - """ + Since embedding models take longer to train, we disregard DefaultStrategy + in order to prevent testing timeouts. + """ - return [ - s for s in all_strategies if s.required_tpu or s.required_gpus or - s is tf.__internal__.distribute.combinations.one_device_strategy - ] + return [ + s + for s in all_strategies + if s.required_tpu + or s.required_gpus + or s is tf.__internal__.distribute.combinations.one_device_strategy + ] def test_combinations_for_embedding_model(): - # TODO(sourabhbajaj): Enable tests for eager mode - eager_mode_strategies = [ - s for s in strategies_for_embedding_models() if not s.required_tpu - ] - - return (tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine( - distribution=strategies_for_embedding_models()), - (graph_mode_test_configuration())) + tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine( - distribution=eager_mode_strategies), - (eager_mode_test_configuration()))) + # TODO(sourabhbajaj): Enable tests for eager mode + eager_mode_strategies = [ + s for s in strategies_for_embedding_models() if not s.required_tpu + ] + + return tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + distribution=strategies_for_embedding_models() + ), + (graph_mode_test_configuration()), + ) + tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + distribution=eager_mode_strategies + ), + (eager_mode_test_configuration()), + ) def test_combinations_with_tpu_strategies_graph(): - tpu_strategies = [ - tf.__internal__.distribute.combinations.tpu_strategy, - ] + tpu_strategies = [ + tf.__internal__.distribute.combinations.tpu_strategy, + ] - return (tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(distribution=tpu_strategies), - graph_mode_test_configuration())) + return tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine(distribution=tpu_strategies), + graph_mode_test_configuration(), + ) def multi_worker_mirrored_eager(): - return tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(distribution=multi_worker_mirrored_strategies), - eager_mode_test_configuration()) + return tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + distribution=multi_worker_mirrored_strategies + ), + eager_mode_test_configuration(), + ) def multi_worker_mirrored_eager_and_graph(): - return tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(distribution=multi_worker_mirrored_strategies), - eager_mode_test_configuration() + graph_mode_test_configuration()) + return tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + distribution=multi_worker_mirrored_strategies + ), + eager_mode_test_configuration() + graph_mode_test_configuration(), + ) class MaybeDistributionScope: - """Provides a context allowing no distribution strategy.""" + """Provides a context allowing no distribution strategy.""" - def __init__(self, 
distribution): - self._distribution = distribution - self._scope = None + def __init__(self, distribution): + self._distribution = distribution + self._scope = None - def __enter__(self): - if self._distribution: - self._scope = self._distribution.scope() - self._scope.__enter__() + def __enter__(self): + if self._distribution: + self._scope = self._distribution.scope() + self._scope.__enter__() - def __exit__(self, exc_type, value, traceback): - if self._distribution: - self._scope.__exit__(exc_type, value, traceback) - self._scope = None + def __exit__(self, exc_type, value, traceback): + if self._distribution: + self._scope.__exit__(exc_type, value, traceback) + self._scope = None def batch_wrapper(dataset, batch_size, repeat=None): - if repeat: - dataset = dataset.repeat(repeat) - return dataset.batch(batch_size) + if repeat: + dataset = dataset.repeat(repeat) + return dataset.batch(batch_size) def get_batch_size(global_batch_size, distribution): - batch_size = global_batch_size - # TODO(b/118776054): Use global batch size for Keras/DS support. - use_per_core_batch_size = ( - distribution and - not distributed_training_utils.global_batch_size_supported(distribution)) - if use_per_core_batch_size: - batch_size //= distribution.num_replicas_in_sync - return batch_size + batch_size = global_batch_size + # TODO(b/118776054): Use global batch size for Keras/DS support. + use_per_core_batch_size = ( + distribution + and not distributed_training_utils.global_batch_size_supported( + distribution + ) + ) + if use_per_core_batch_size: + batch_size //= distribution.num_replicas_in_sync + return batch_size def get_data_size(data): - """Gets the size of data in list, tuple, dict, or a numpy array.""" - assert isinstance(data, (np.ndarray, list, dict, tuple)) + """Gets the size of data in list, tuple, dict, or a numpy array.""" + assert isinstance(data, (np.ndarray, list, dict, tuple)) - if isinstance(data, np.ndarray): - return len(data) + if isinstance(data, np.ndarray): + return len(data) - if isinstance(data, (list, tuple)): - return len(data[0]) + if isinstance(data, (list, tuple)): + return len(data[0]) - return len(data.values()) + return len(data.values()) def get_shapes(data): - shapes = None - if all(hasattr(x, 'shape') for x in tf.nest.flatten(data)): - shapes = tf.nest.map_structure(lambda x: x.shape, data) - return shapes - - -def get_correctness_test_inputs(use_numpy, use_validation_data, - with_distribution, x_train, y_train, x_eval, - y_eval, x_predict, training_epochs): - """Generates the inputs for correctness check when enable Keras with DS.""" - global_batch_size = _GLOBAL_BATCH_SIZE - batch_size = get_batch_size(global_batch_size, with_distribution) - - if use_numpy: - training_inputs = { - 'batch_size': batch_size, - 'x': x_train, - 'y': y_train, - 'epochs': training_epochs, - 'shuffle': False, - } - - if use_validation_data: - eval_inputs = None - training_inputs['validation_data'] = (x_eval, y_eval) + shapes = None + if all(hasattr(x, "shape") for x in tf.nest.flatten(data)): + shapes = tf.nest.map_structure(lambda x: x.shape, data) + return shapes + + +def get_correctness_test_inputs( + use_numpy, + use_validation_data, + with_distribution, + x_train, + y_train, + x_eval, + y_eval, + x_predict, + training_epochs, +): + """Generates the inputs for a correctness check of Keras with DS.""" + global_batch_size = _GLOBAL_BATCH_SIZE + batch_size = get_batch_size(global_batch_size, with_distribution) + + if use_numpy: + training_inputs = { + "batch_size": batch_size, + "x":
x_train, + "y": y_train, + "epochs": training_epochs, + "shuffle": False, + } + + if use_validation_data: + eval_inputs = None + training_inputs["validation_data"] = (x_eval, y_eval) + else: + eval_inputs = { + "batch_size": batch_size, + "x": x_eval, + "y": y_eval, + } + predict_inputs = {"x": x_predict} else: - eval_inputs = { - 'batch_size': batch_size, - 'x': x_eval, - 'y': y_eval, - } - predict_inputs = {'x': x_predict} - else: - training_data_size = get_data_size(x_train) - # For dataset inputs, we do not pass batch_size to - # keras.fit/evaluate/predict. The batch size is part of the dataset. - train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - x = batch_wrapper(train_dataset, batch_size, repeat=training_epochs) - - steps_per_epoch = int(np.ceil(1.0 * training_data_size / global_batch_size)) - training_inputs = { - 'batch_size': None, - 'x': x, - 'y': None, - 'epochs': training_epochs, - 'shuffle': False, - 'steps_per_epoch': steps_per_epoch - } - if use_validation_data: - eval_inputs = None # Remove the eval_inputs - eval_dataset = tf.data.Dataset.from_tensor_slices((x_eval, y_eval)) - x = batch_wrapper(eval_dataset, batch_size) - training_inputs['validation_data'] = x - training_inputs['validation_steps'] = 5 + training_data_size = get_data_size(x_train) + # For dataset inputs, we do not pass batch_size to + # keras.fit/evaluate/predict. The batch size is part of the dataset. + train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + x = batch_wrapper(train_dataset, batch_size, repeat=training_epochs) + + steps_per_epoch = int( + np.ceil(1.0 * training_data_size / global_batch_size) + ) + training_inputs = { + "batch_size": None, + "x": x, + "y": None, + "epochs": training_epochs, + "shuffle": False, + "steps_per_epoch": steps_per_epoch, + } + if use_validation_data: + eval_inputs = None # Remove the eval_inputs + eval_dataset = tf.data.Dataset.from_tensor_slices((x_eval, y_eval)) + x = batch_wrapper(eval_dataset, batch_size) + training_inputs["validation_data"] = x + training_inputs["validation_steps"] = 5 + else: + eval_dataset = tf.data.Dataset.from_tensor_slices((x_eval, y_eval)) + x = batch_wrapper(eval_dataset, batch_size) + eval_steps = int( + np.ceil(1.0 * get_data_size(x_eval) / global_batch_size) + ) + eval_inputs = { + "batch_size": None, + "x": x, + "y": None, + "steps": eval_steps, + } + + predict_batch_size = get_batch_size( + get_data_size(x_predict), with_distribution + ) + predict_dataset = tf.data.Dataset.from_tensor_slices(x_predict) + predict_dataset = batch_wrapper(predict_dataset, predict_batch_size) + predict_inputs = { + "steps": 1, + "x": predict_dataset, + } + + return training_inputs, eval_inputs, predict_inputs + + +def fit_eval_and_predict( + initial_weights, + input_fn, + model_fn, + distribution=None, + is_stateful_model=False, +): + """Generates results for fit/predict/evaluate for given model.""" + training_inputs, eval_inputs, predict_inputs = input_fn() + model = model_fn( + initial_weights=initial_weights, + distribution=distribution, + input_shapes=get_shapes(training_inputs["x"]), + ) + + result = {} + result["training_history_1"] = model.fit(**training_inputs).history + + if eval_inputs is not None: + result["eval_result_1"] = model.evaluate(**eval_inputs) + + result["weights_1"] = model.get_weights() + + if predict_inputs is not None: + # Check correctness of the result of predict() invoked + # multiple times -- as for stateful models, result of + # predict may differ for each batch. 
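+ # A stateless model returns the same output on every call; stateful + # ones (e.g. stateful RNNs) carry state across batches, hence the + # three consecutive predict() calls recorded for stateful models.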
+ predict_length = 1 + if is_stateful_model: + predict_length = 3 + for i in range(predict_length): + result_key = f"predict_result_{i}" + result[result_key] = model.predict(**predict_inputs) + + # Train and eval again to mimic user's flow. + + result["training_history_2"] = model.fit(**training_inputs).history + + if eval_inputs is not None: + result["eval_result_2"] = model.evaluate(**eval_inputs) + + result["weights_2"] = model.get_weights() + + return result + + +def compare_results( + results_with_ds, + results_without_ds, + distribution, + testcase, + partial_last_batch=None, +): + """Compares results of model compiled with/without distribution strategy.""" + if policy.global_policy().compute_dtype in ("float16", "bfloat16"): + default_tolerance = 1e-2 + relaxed_tolerance = 1e-2 + elif partial_last_batch == "train_and_eval": + # We relax the tolerance a lot in the partial last batch case as + # 1. the examples in uneven batches may have different weights when + # applying the gradients in the distributed case. + # 2. TF Keras and TF Keras DS have different ways to handle the case + # when training with epochs > 1 with numpy inputs. In TF Keras, + # every epoch may have a partial batch. While in TF Keras DS, as we + # convert numpy inputs into a dataset, it will do a repeat() first + # and calculate steps_per_epoch, so it will at most have one + # partial batch. This makes even the 1-CPU result differ. + default_tolerance = 1e-3 + relaxed_tolerance = 1e-3 else:
- - result['training_history_2'] = model.fit(**training_inputs).history - - if eval_inputs is not None: - result['eval_result_2'] = model.evaluate(**eval_inputs) - - result['weights_2'] = model.get_weights() - - return result - - -def compare_results(results_with_ds, - results_without_ds, - distribution, - testcase, - partial_last_batch=None): - """Compares results of model compiled with/without distribution strategy.""" - if policy.global_policy().compute_dtype in ('float16', 'bfloat16'): - default_tolerance = 1e-2 - relaxed_tolerance = 1e-2 - elif partial_last_batch == 'train_and_eval': - # We relax the tolerance a lot in the partial last batch case as - # 1. the examples in uneven batches may have different weights when - # applying the gradients in the distributed case. - # 2. TF Keras and TF Keras DS have different ways to handle the case when - # training with epochs > 1 with numpy inputs. In TF Keras, every epoch - # may have a partial batch. While in TF Keras DS, as we convert - # numpy inputs into dataset, it will do a repeat() first and calculate - # steps_per_epoch, so it will at most have one partial batch. This - # makes the 1-CPU result even different. - default_tolerance = 1e-3 - relaxed_tolerance = 1e-3 - else: - default_tolerance = 4e-5 - relaxed_tolerance = 1e-4 - - def _get_compare_result_tolerance(key): - """Returns tolerance to compare results.""" - # See b/119257215 for more details. DS test run on GPU could have larger - # variance then test on CPU. - if (tf.test.is_gpu_available() and - key.startswith(('weights_1', 'weights_2', 'predict_result'))): - return relaxed_tolerance - - return default_tolerance - - for key in sorted(results_with_ds.keys()): - if (key.startswith('training_history') and - isinstance(distribution, - (tf.distribute.experimental.TPUStrategy, tf.compat.v1.distribute.experimental.TPUStrategy)) and - distribution.extended.steps_per_run > 1): - # TODO(b/119894254): Enable this test for all cases once the - # underlying bug is fixed. - continue - - tolerance = _get_compare_result_tolerance(key) - - # We don't compare the loss as loss is currently not computed as metric - # in Keras, the loss value is inaccurate for last partial batch due to - # more weights for the last batch samples. - if partial_last_batch is not None: - if key.startswith('eval_result'): - results_with_ds[key] = results_with_ds[key][1:] - results_without_ds[key] = results_without_ds[key][1:] - if key.startswith('training_history'): - results_with_ds[key]['val_loss'] = 0 - results_without_ds[key]['val_loss'] = 0 - - testcase.assertAllClose( - results_with_ds[key], - results_without_ds[key], - atol=tolerance, - rtol=tolerance, - msg='Fail to assert {}.'.format(key)) + default_tolerance = 4e-5 + relaxed_tolerance = 1e-4 + + def _get_compare_result_tolerance(key): + """Returns tolerance to compare results.""" + # See b/119257215 for more details. DS tests run on GPU could have + # larger variance than tests on CPU. + if tf.test.is_gpu_available() and key.startswith( + ("weights_1", "weights_2", "predict_result") + ): + return relaxed_tolerance + + return default_tolerance + + for key in sorted(results_with_ds.keys()): + if ( + key.startswith("training_history") + and isinstance( + distribution, + ( + tf.distribute.experimental.TPUStrategy, + tf.compat.v1.distribute.experimental.TPUStrategy, + ), + ) + and distribution.extended.steps_per_run > 1 + ): + # TODO(b/119894254): Enable this test for all cases once the + # underlying bug is fixed.
+ continue + + tolerance = _get_compare_result_tolerance(key) + + # We don't compare the loss, as loss is currently not computed as a + # metric in Keras; the loss value is inaccurate for the last partial + # batch due to more weights for the last batch samples. + if partial_last_batch is not None: + if key.startswith("eval_result"): + results_with_ds[key] = results_with_ds[key][1:] + results_without_ds[key] = results_without_ds[key][1:] + if key.startswith("training_history"): + results_with_ds[key]["val_loss"] = 0 + results_without_ds[key]["val_loss"] = 0 + + testcase.assertAllClose( + results_with_ds[key], + results_without_ds[key], + atol=tolerance, + rtol=tolerance, + msg=f"Failed to assert {key}.", + ) def should_skip_tpu_with_eager(distribution): - return (tf.executing_eagerly() and - isinstance(distribution, - (tf.distribute.experimental.TPUStrategy, tf.compat.v1.distribute.experimental.TPUStrategy))) + return tf.executing_eagerly() and isinstance( + distribution, + ( + tf.distribute.experimental.TPUStrategy, + tf.compat.v1.distribute.experimental.TPUStrategy, + ), + ) class LearningRateBatchScheduler(keras.callbacks.Callback): - """Scheduler that dynamically sets the learning rate of model.""" - - def __init__(self, update_freq=None): - self._update_freq = update_freq - - def on_batch_begin(self, batch, logs=None): - if self._update_freq and batch % self._update_freq != 0: - return - - # To avoid divergence, limit the value range. - lr = 0.001 * (batch % 10) - keras.backend.set_value(self.model.optimizer.lr, lr) - - -class TestDistributionStrategyCorrectnessBase(tf.test.TestCase, - parameterized.TestCase): - """Model agnostic testing infra to test correctness of Keras models.""" - - def set_up_test_config(self, - use_numpy=False, - use_validation_data=False, - with_batch_norm=None): - self.use_numpy = use_numpy - self.use_validation_data = use_validation_data - self.with_batch_norm = with_batch_norm - - keras.backend.set_image_data_format('channels_last') - np.random.seed(_RANDOM_SEED) - tf.compat.v1.set_random_seed(_RANDOM_SEED) - - def get_data(self): - num_samples = 10000 - x_train = np.random.randint(0, 2, num_samples) - x_train = np.reshape(x_train, (num_samples, 1)) - y_train = x_train - return (x_train.astype('float32'), y_train.astype('float32'), None) + """Scheduler that dynamically sets the learning rate of the model.""" + + def __init__(self, update_freq=None): + self._update_freq = update_freq + + def on_batch_begin(self, batch, logs=None): + if self._update_freq and batch % self._update_freq != 0: + return + + # To avoid divergence, limit the value range.
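+ # (The learning rate cycles through 0.000-0.009, keyed off the + # batch index.)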
+ lr = 0.001 * (batch % 10) + keras.backend.set_value(self.model.optimizer.lr, lr) + + +class TestDistributionStrategyCorrectnessBase( + tf.test.TestCase, parameterized.TestCase +): + """Model-agnostic testing infra to test correctness of Keras models.""" + + def set_up_test_config( + self, use_numpy=False, use_validation_data=False, with_batch_norm=None + ): + self.use_numpy = use_numpy + self.use_validation_data = use_validation_data + self.with_batch_norm = with_batch_norm + + keras.backend.set_image_data_format("channels_last") + np.random.seed(_RANDOM_SEED) + tf.compat.v1.set_random_seed(_RANDOM_SEED) + + def get_data(self): + num_samples = 10000 + x_train = np.random.randint(0, 2, num_samples) + x_train = np.reshape(x_train, (num_samples, 1)) + y_train = x_train + return (x_train.astype("float32"), y_train.astype("float32"), None) + + def get_data_with_partial_last_batch(self): + raise NotImplementedError + + def get_data_with_partial_last_batch_eval(self): + raise NotImplementedError + + def get_input_for_correctness_test(self, **kwargs): + """Generates inputs that are dictionaries. + + We only provide a default implementation of this method here. If you + need a more customized way of providing input to your model, overwrite + this method. + + Args: + **kwargs: keyword arguments about how to create the input + dictionaries + + Returns: + Three dictionaries representing the input for fit(), evaluate() and + predict() + """ + + return get_correctness_test_inputs(**kwargs) + + def get_model(self, distribution=None, input_shapes=None): + raise NotImplementedError + + def run_correctness_test( + self, + distribution, + use_numpy, + use_validation_data, + with_batch_norm=None, + is_stateful_model=False, + partial_last_batch=None, + training_epochs=2, + ): + with self.cached_session(): + self.set_up_test_config( + use_numpy, use_validation_data, with_batch_norm + ) + + if partial_last_batch == "eval": + ( + x_train, + y_train, + x_eval, + y_eval, + x_predict, + ) = self.get_data_with_partial_last_batch_eval() + elif partial_last_batch == "train_and_eval": + ( + x_train, + y_train, + x_eval, + y_eval, + x_predict, + ) = self.get_data_with_partial_last_batch() + else: + x_train, y_train, x_predict = self.get_data() + x_eval = x_train + y_eval = y_train + + # The model is built once and the initial weights are saved. + # This is used to initialize the model for both the distribution and + # non-distribution runs.
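+ # Both runs therefore start from identical parameters, so any + # divergence between them is attributable to the distribution + # strategy rather than to random initialization.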
+ model = self.get_model(input_shapes=get_shapes(x_train)) + initial_weights = model.get_weights() + + ds_input_fn = functools.partial( + self.get_input_for_correctness_test, + use_numpy=use_numpy, + use_validation_data=use_validation_data, + with_distribution=distribution, + x_train=x_train, + y_train=y_train, + x_eval=x_eval, + y_eval=y_eval, + x_predict=x_predict, + training_epochs=training_epochs, + ) + + nods_input_fn = functools.partial( + self.get_input_for_correctness_test, + use_numpy=use_numpy, + use_validation_data=use_validation_data, + with_distribution=None, + x_train=x_train, + y_train=y_train, + x_eval=x_eval, + y_eval=y_eval, + x_predict=x_predict, + training_epochs=training_epochs, + ) + + results_with_ds = fit_eval_and_predict( + initial_weights, + input_fn=ds_input_fn, + model_fn=self.get_model, + distribution=distribution, + is_stateful_model=is_stateful_model, + ) + results_without_ds = fit_eval_and_predict( + initial_weights, + input_fn=nods_input_fn, + model_fn=self.get_model, + distribution=None, + is_stateful_model=is_stateful_model, + ) + + # First, a special case: for multi-replica distributed training, + # batch norm is not aggregated globally, so it is expected to have + # different weights. + if ( + self.with_batch_norm == "regular" + and distribution.num_replicas_in_sync > 1 + ): + with self.assertRaises(AssertionError): + compare_results( + results_with_ds, + results_without_ds, + distribution, + testcase=self, + partial_last_batch=partial_last_batch, + ) + else: + compare_results( + results_with_ds, + results_without_ds, + distribution, + testcase=self, + partial_last_batch=partial_last_batch, + ) - def get_data_with_partial_last_batch(self): - raise NotImplementedError + def get_input_for_dynamic_lr_test(self, **kwargs): + """Generates inputs that are dictionaries. - def get_data_with_partial_last_batch_eval(self): - raise NotImplementedError + We only provide a default implementation of this method here. If you + need a more customized way of providing input to your model, overwrite + this method. - def get_input_for_correctness_test(self, **kwargs): - """Generates inputs that are dictionaries. + Args: + **kwargs: keyword arguments about how to create the input + dictionaries - We only provide a default implementation of this method here. If you need - more customized way of providing input to your model, overwrite this method. + Returns: + Three dictionaries representing the input for fit(), evaluate() and + predict() + """ - Args: - **kwargs: key word arguments about how to create the input dictionaries + training_input = kwargs + return training_input, None, None - Returns: - Three dictionaries representing the input for fit(), evaluate() and - predict() - """ + def run_dynamic_lr_test(self, distribution): + with self.cached_session(): + self.set_up_test_config() - return get_correctness_test_inputs(**kwargs) + x_train, y_train, _ = self.get_data() + model = self.get_model(input_shapes=get_shapes(x_train)) + initial_weights = model.get_weights() + update_freq = None - def get_model(self, + if ( + isinstance( + distribution, + tf.compat.v1.distribute.experimental.TPUStrategy, + ) + and distribution.extended.steps_per_run > 1 + ): + # For TPUStrategy with steps_per_run > 1, the callback is not + # invoked every step. So, to compare the CPU/TPU, we let the CPU + # behave the same as the TPU.
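+ # That is, the learning rate only changes once every + # `steps_per_run` batches on both sides.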
+ update_freq = distribution.extended.steps_per_run + + training_epochs = 2 + global_batch_size = 64 + + ds_batch_size = get_batch_size(global_batch_size, distribution) + nods_batch_size = get_batch_size(global_batch_size, None) + + ds_input_fn = functools.partial( + self.get_input_for_dynamic_lr_test, + x=x_train, + y=y_train, + batch_size=ds_batch_size, + shuffle=False, + epochs=training_epochs, + callbacks=[LearningRateBatchScheduler(update_freq)], + validation_data=(x_train, y_train), + ) + + nods_input_fn = functools.partial( + self.get_input_for_dynamic_lr_test, + x=x_train, + y=y_train, + batch_size=nods_batch_size, + shuffle=False, + epochs=training_epochs, + callbacks=[LearningRateBatchScheduler(update_freq)], + validation_data=(x_train, y_train), + ) + + results_with_ds = fit_eval_and_predict( + initial_weights, + input_fn=ds_input_fn, + model_fn=self.get_model, + distribution=distribution, + ) + results_without_ds = fit_eval_and_predict( + initial_weights, + input_fn=nods_input_fn, + model_fn=self.get_model, distribution=None, - input_shapes=None): - raise NotImplementedError - - def run_correctness_test(self, - distribution, - use_numpy, - use_validation_data, - with_batch_norm=None, - is_stateful_model=False, - partial_last_batch=None, - training_epochs=2): - with self.cached_session(): - self.set_up_test_config(use_numpy, use_validation_data, with_batch_norm) - - if partial_last_batch == 'eval': - x_train, y_train, x_eval, y_eval, x_predict = ( - self.get_data_with_partial_last_batch_eval()) - elif partial_last_batch == 'train_and_eval': - x_train, y_train, x_eval, y_eval, x_predict = ( - self.get_data_with_partial_last_batch()) - else: - x_train, y_train, x_predict = self.get_data() - x_eval = x_train - y_eval = y_train - - # The model is built once and the initial weights are saved. - # This is used to initialize the model for both the distribution and - # non-distribution run. - model = self.get_model( - input_shapes=get_shapes(x_train)) - initial_weights = model.get_weights() - - ds_input_fn = functools.partial( - self.get_input_for_correctness_test, - use_numpy=use_numpy, - use_validation_data=use_validation_data, - with_distribution=distribution, - x_train=x_train, - y_train=y_train, - x_eval=x_eval, - y_eval=y_eval, - x_predict=x_predict, - training_epochs=training_epochs) - - nods_input_fn = functools.partial( - self.get_input_for_correctness_test, - use_numpy=use_numpy, - use_validation_data=use_validation_data, - with_distribution=None, - x_train=x_train, - y_train=y_train, - x_eval=x_eval, - y_eval=y_eval, - x_predict=x_predict, - training_epochs=training_epochs) - - results_with_ds = fit_eval_and_predict( - initial_weights, - input_fn=ds_input_fn, - model_fn=self.get_model, - distribution=distribution, - is_stateful_model=is_stateful_model) - results_without_ds = fit_eval_and_predict( - initial_weights, - input_fn=nods_input_fn, - model_fn=self.get_model, - distribution=None, - is_stateful_model=is_stateful_model) - - # First, special case, for multi-replica distributed training, batch - # norm is not aggregated globally. So it is expected to have different - # weights. 
- if (self.with_batch_norm == 'regular' and - distribution.num_replicas_in_sync > 1): - with self.assertRaises(AssertionError): - compare_results( - results_with_ds, - results_without_ds, - distribution, - testcase=self, - partial_last_batch=partial_last_batch) - else: - compare_results( - results_with_ds, - results_without_ds, - distribution, - testcase=self, - partial_last_batch=partial_last_batch) - - def get_input_for_dynamic_lr_test(self, **kwargs): - """Generates inputs that are dictionaries. - - We only provide a default implementation of this method here. If you need - more customized way of providing input to your model, overwrite this method. - - Args: - **kwargs: key word arguments about how to create the input dictionaries - - Returns: - Three dictionaries representing the input for fit(), evaluate() and - predict() - """ - - training_input = kwargs - return training_input, None, None - - def run_dynamic_lr_test(self, - distribution): - with self.cached_session(): - self.set_up_test_config() - - x_train, y_train, _ = self.get_data() - model = self.get_model( - input_shapes=get_shapes(x_train)) - initial_weights = model.get_weights() - update_freq = None - - if (isinstance(distribution, tf.compat.v1.distribute.experimental.TPUStrategy) and - distribution.extended.steps_per_run > 1): - # For TPUStrategy with steps_per_run > 1, the callback is not invoked - # every step. So, to compare the CPU/TPU, we let the CPU to behave the - # same as TPU. - update_freq = distribution.extended.steps_per_run - - training_epochs = 2 - global_batch_size = 64 - - ds_batch_size = get_batch_size(global_batch_size, distribution) - nods_batch_size = get_batch_size(global_batch_size, None) - - ds_input_fn = functools.partial( - self.get_input_for_dynamic_lr_test, - x=x_train, - y=y_train, - batch_size=ds_batch_size, - shuffle=False, - epochs=training_epochs, - callbacks=[LearningRateBatchScheduler(update_freq)], - validation_data=(x_train, y_train)) - - nods_input_fn = functools.partial( - self.get_input_for_dynamic_lr_test, - x=x_train, - y=y_train, - batch_size=nods_batch_size, - shuffle=False, - epochs=training_epochs, - callbacks=[LearningRateBatchScheduler(update_freq)], - validation_data=(x_train, y_train)) - - results_with_ds = fit_eval_and_predict( - initial_weights, - input_fn=ds_input_fn, - model_fn=self.get_model, - distribution=distribution) - results_without_ds = fit_eval_and_predict( - initial_weights, - input_fn=nods_input_fn, - model_fn=self.get_model, - distribution=None) - compare_results( - results_with_ds, results_without_ds, distribution, testcase=self) + ) + compare_results( + results_with_ds, results_without_ds, distribution, testcase=self + ) class TestDistributionStrategyEmbeddingModelCorrectnessBase( - TestDistributionStrategyCorrectnessBase): - """Base class to test correctness of Keras models with embedding layers.""" - - def get_data(self, - count=(_GLOBAL_BATCH_SIZE * _EVAL_STEPS), - min_words=5, - max_words=10, - max_word_id=19, - num_classes=2): - distribution = [] - for _ in range(num_classes): - dist = np.abs(np.random.randn(max_word_id)) - dist /= np.sum(dist) - distribution.append(dist) - - features = [] - labels = [] - for _ in range(count): - label = np.random.randint(0, num_classes, size=1)[0] - num_words = np.random.randint(min_words, max_words, size=1)[0] - word_ids = np.random.choice( - max_word_id, size=num_words, replace=True, p=distribution[label]) - word_ids = word_ids - labels.append(label) - features.append(word_ids) - - features = 
data_utils.pad_sequences( - features, maxlen=max_words) - x_train = np.asarray(features, dtype=np.float32) - y_train = np.asarray(labels, dtype=np.int32).reshape((count, 1)) - x_predict = x_train[:_GLOBAL_BATCH_SIZE] - return x_train, y_train, x_predict - - -if __name__ == '__main__': - tf.test.main() + TestDistributionStrategyCorrectnessBase +): + """Base class to test correctness of Keras models with embedding layers.""" + + def get_data( + self, + count=(_GLOBAL_BATCH_SIZE * _EVAL_STEPS), + min_words=5, + max_words=10, + max_word_id=19, + num_classes=2, + ): + distribution = [] + for _ in range(num_classes): + dist = np.abs(np.random.randn(max_word_id)) + dist /= np.sum(dist) + distribution.append(dist) + + features = [] + labels = [] + for _ in range(count): + label = np.random.randint(0, num_classes, size=1)[0] + num_words = np.random.randint(min_words, max_words, size=1)[0] + word_ids = np.random.choice( + max_word_id, size=num_words, replace=True, p=distribution[label] + ) + word_ids = word_ids + labels.append(label) + features.append(word_ids) + + features = data_utils.pad_sequences(features, maxlen=max_words) + x_train = np.asarray(features, dtype=np.float32) + y_train = np.asarray(labels, dtype=np.int32).reshape((count, 1)) + x_predict = x_train[:_GLOBAL_BATCH_SIZE] + return x_train, y_train, x_predict + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/distribute/keras_dnn_correctness_test.py b/keras/distribute/keras_dnn_correctness_test.py index d4d1602cfc56..9577957a236c 100644 --- a/keras/distribute/keras_dnn_correctness_test.py +++ b/keras/distribute/keras_dnn_correctness_test.py @@ -14,312 +14,361 @@ # ============================================================================== """Correctness tests for tf.keras DNN model using DistributionStrategy.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras from keras import backend -from keras.testing_infra import test_utils from keras.distribute import keras_correctness_test_base from keras.distribute import strategy_combinations -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_keras +from keras.optimizers.legacy import gradient_descent as gradient_descent_keras +from keras.testing_infra import test_utils def all_strategy_combinations_with_eager_and_graph_modes(): - return (tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.all_strategies, - mode=['graph', 'eager']) + tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.multi_worker_mirrored_strategies, - mode='eager')) + return tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.all_strategies, + mode=["graph", "eager"], + ) + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.multi_worker_mirrored_strategies, + mode="eager", + ) def all_strategy_combinations_with_graph_mode(): - return (tf.__internal__.test.combinations.combine( - distribution=keras_correctness_test_base.all_strategies, - mode=['graph'])) + return tf.__internal__.test.combinations.combine( + distribution=keras_correctness_test_base.all_strategies, mode=["graph"] + ) def is_default_strategy(strategy): - with strategy.scope(): - return not tf.distribute.has_strategy() + with strategy.scope(): + return not tf.distribute.has_strategy() @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') + "Uses Dense layers, which call matmul" +) class 
TestDistributionStrategyDnnCorrectness( - keras_correctness_test_base.TestDistributionStrategyCorrectnessBase): - - def get_model(self, - initial_weights=None, - distribution=None, - input_shapes=None): - with keras_correctness_test_base.MaybeDistributionScope(distribution): - # We add few non-linear layers to make it non-trivial. - model = keras.Sequential() - model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,))) - model.add( - keras.layers.Dense( - 10, - activation='relu', - kernel_regularizer=keras.regularizers.l2(1e-4))) - model.add(keras.layers.Dense(10, activation='relu')) - model.add(keras.layers.Dense(1)) - - if initial_weights: - model.set_weights(initial_weights) - - model.compile( - loss=keras.losses.mean_squared_error, - optimizer=gradient_descent_keras.SGD(0.05), - metrics=['mse']) - return model - - def get_data(self): - x_train = np.random.rand(9984, 1).astype('float32') - y_train = 3 * x_train - x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32) - return x_train, y_train, x_predict - - def get_data_with_partial_last_batch(self): - x_train = np.random.rand(10000, 1).astype('float32') - y_train = 3 * x_train - x_eval = np.random.rand(10000, 1).astype('float32') - y_eval = 3 * x_eval - x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32) - return x_train, y_train, x_eval, y_eval, x_predict - - def get_data_with_partial_last_batch_eval(self): - x_train = np.random.rand(9984, 1).astype('float32') - y_train = 3 * x_train - x_eval = np.random.rand(10000, 1).astype('float32') - y_eval = 3 * x_eval - x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32) - return x_train, y_train, x_eval, y_eval, x_predict - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.all_strategy_and_input_config_combinations() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_dnn_correctness(self, distribution, use_numpy, use_validation_data): - self.run_correctness_test(distribution, use_numpy, use_validation_data) - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base - .test_combinations_with_tpu_strategies_graph() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_dnn_correctness_with_partial_last_batch_eval(self, distribution, - use_numpy, - use_validation_data): - self.run_correctness_test( - distribution, use_numpy, use_validation_data, partial_last_batch='eval') - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base - .strategy_minus_tpu_and_input_config_combinations_eager() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_dnn_correctness_with_partial_last_batch(self, distribution, - use_numpy, - use_validation_data): - distribution.extended.experimental_enable_get_next_as_optional = True - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - partial_last_batch='train_and_eval', - training_epochs=1) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations_with_graph_mode()) - def test_dnn_with_dynamic_learning_rate(self, distribution): - self.run_dynamic_lr_test(distribution) + keras_correctness_test_base.TestDistributionStrategyCorrectnessBase +): + def get_model( + self, initial_weights=None, distribution=None, input_shapes=None + ): + with keras_correctness_test_base.MaybeDistributionScope(distribution): + # We add a few non-linear layers to make it non-trivial.
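+ # (The target data below is linear, y = 3x, so a purely linear + # stack would make the fit trivial.)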
+ model = keras.Sequential() + model.add( + keras.layers.Dense(10, activation="relu", input_shape=(1,)) + ) + model.add( + keras.layers.Dense( + 10, + activation="relu", + kernel_regularizer=keras.regularizers.l2(1e-4), + ) + ) + model.add(keras.layers.Dense(10, activation="relu")) + model.add(keras.layers.Dense(1)) + + if initial_weights: + model.set_weights(initial_weights) + + model.compile( + loss=keras.losses.mean_squared_error, + optimizer=gradient_descent_keras.SGD(0.05), + metrics=["mse"], + ) + return model + + def get_data(self): + x_train = np.random.rand(9984, 1).astype("float32") + y_train = 3 * x_train + x_predict = np.array([[1.0], [2.0], [3.0], [4.0]], dtype=np.float32) + return x_train, y_train, x_predict + + def get_data_with_partial_last_batch(self): + x_train = np.random.rand(10000, 1).astype("float32") + y_train = 3 * x_train + x_eval = np.random.rand(10000, 1).astype("float32") + y_eval = 3 * x_eval + x_predict = np.array([[1.0], [2.0], [3.0], [4.0]], dtype=np.float32) + return x_train, y_train, x_eval, y_eval, x_predict + + def get_data_with_partial_last_batch_eval(self): + x_train = np.random.rand(9984, 1).astype("float32") + y_train = 3 * x_train + x_eval = np.random.rand(10000, 1).astype("float32") + y_eval = 3 * x_eval + x_predict = np.array([[1.0], [2.0], [3.0], [4.0]], dtype=np.float32) + return x_train, y_train, x_eval, y_eval, x_predict + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.all_strategy_and_input_config_combinations() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_dnn_correctness( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test(distribution, use_numpy, use_validation_data) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_with_tpu_strategies_graph() # noqa: E501 + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_dnn_correctness_with_partial_last_batch_eval( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test( + distribution, + use_numpy, + use_validation_data, + partial_last_batch="eval", + ) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.strategy_minus_tpu_and_input_config_combinations_eager() # noqa: E501 + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_dnn_correctness_with_partial_last_batch( + self, distribution, use_numpy, use_validation_data + ): + distribution.extended.experimental_enable_get_next_as_optional = True + self.run_correctness_test( + distribution, + use_numpy, + use_validation_data, + partial_last_batch="train_and_eval", + training_epochs=1, + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations_with_graph_mode() + ) + def test_dnn_with_dynamic_learning_rate(self, distribution): + self.run_dynamic_lr_test(distribution) class TestDistributionStrategyDnnMetricCorrectness( - keras_correctness_test_base.TestDistributionStrategyCorrectnessBase): - - def get_model(self, - distribution=None, - input_shapes=None): - with distribution.scope(): - model = keras.Sequential() - model.add( - keras.layers.Dense(1, input_shape=(1,), kernel_initializer='ones')) - model.compile( - loss=keras.losses.mean_squared_error, - optimizer=gradient_descent_keras.SGD(0.05), - metrics=[keras.metrics.BinaryAccuracy()]) - return model - - def run_metric_correctness_test(self, distribution): - with self.cached_session(): - self.set_up_test_config() - - 
x_train, y_train, _ = self.get_data() - model = self.get_model( - distribution=distribution) - - batch_size = 64 - batch_size = ( - keras_correctness_test_base.get_batch_size(batch_size, distribution)) - train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - train_dataset = ( - keras_correctness_test_base.batch_wrapper(train_dataset, batch_size)) - - history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10) - self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0]) - - @tf.__internal__.distribute.combinations.generate( - all_strategy_combinations_with_eager_and_graph_modes()) - def test_simple_dnn_metric_correctness(self, distribution): - self.run_metric_correctness_test(distribution) + keras_correctness_test_base.TestDistributionStrategyCorrectnessBase +): + def get_model(self, distribution=None, input_shapes=None): + with distribution.scope(): + model = keras.Sequential() + model.add( + keras.layers.Dense( + 1, input_shape=(1,), kernel_initializer="ones" + ) + ) + model.compile( + loss=keras.losses.mean_squared_error, + optimizer=gradient_descent_keras.SGD(0.05), + metrics=[keras.metrics.BinaryAccuracy()], + ) + return model + + def run_metric_correctness_test(self, distribution): + with self.cached_session(): + self.set_up_test_config() + + x_train, y_train, _ = self.get_data() + model = self.get_model(distribution=distribution) + + batch_size = 64 + batch_size = keras_correctness_test_base.get_batch_size( + batch_size, distribution + ) + train_dataset = tf.data.Dataset.from_tensor_slices( + (x_train, y_train) + ) + train_dataset = keras_correctness_test_base.batch_wrapper( + train_dataset, batch_size + ) + + history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10) + self.assertEqual(history.history["binary_accuracy"], [1.0, 1.0]) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations_with_eager_and_graph_modes() + ) + def test_simple_dnn_metric_correctness(self, distribution): + self.run_metric_correctness_test(distribution) class TestDistributionStrategyDnnMetricEvalCorrectness( - keras_correctness_test_base.TestDistributionStrategyCorrectnessBase): - - def get_model(self, - distribution=None, - input_shapes=None): - with distribution.scope(): - model = keras.Sequential() - model.add( - keras.layers.Dense( - 3, activation='relu', input_dim=4, kernel_initializer='ones')) - model.add( - keras.layers.Dense( - 1, activation='sigmoid', kernel_initializer='ones')) - model.compile( - loss='mae', - metrics=['accuracy', keras.metrics.BinaryAccuracy()], - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.001)) - return model - - def run_eval_metrics_correctness_test(self, distribution): - with self.cached_session(): - self.set_up_test_config() - - model = self.get_model( - distribution=distribution) - - # verify correctness of stateful and stateless metrics. - x = np.ones((100, 4)).astype('float32') - y = np.ones((100, 1)).astype('float32') - dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat() - dataset = keras_correctness_test_base.batch_wrapper(dataset, 4) - outs = model.evaluate(dataset, steps=10) - self.assertEqual(outs[1], 1.) - self.assertEqual(outs[2], 1.) - - y = np.zeros((100, 1)).astype('float32') - dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat() - dataset = keras_correctness_test_base.batch_wrapper(dataset, 4) - outs = model.evaluate(dataset, steps=10) - self.assertEqual(outs[1], 0.) - self.assertEqual(outs[2], 0.) 
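As an aside (not part of the diff): a minimal standalone sketch of the pattern the eval-metrics test above exercises, with the DistributionStrategy harness stripped away. The layer sizes, initializers, loss, and data mirror the test; the optimizer and everything else here are illustrative only.

import numpy as np

import keras

# With all-ones kernels, an all-ones input yields relu(4) = 4 on each of
# the three hidden units, then sigmoid(12) ~= 1.0, so every prediction
# rounds to 1.
model = keras.Sequential(
    [
        keras.layers.Dense(
            3, activation="relu", input_dim=4, kernel_initializer="ones"
        ),
        keras.layers.Dense(
            1, activation="sigmoid", kernel_initializer="ones"
        ),
    ]
)
model.compile(
    loss="mae",
    metrics=["accuracy", keras.metrics.BinaryAccuracy()],
    optimizer="sgd",
)

x = np.ones((100, 4), dtype="float32")
# evaluate() returns [loss, accuracy, binary_accuracy]; the stateless
# string metric and the stateful metric object should agree: 1.0 against
# all-ones labels and 0.0 against all-zeros labels.
print(model.evaluate(x, np.ones((100, 1), dtype="float32"), verbose=0))
print(model.evaluate(x, np.zeros((100, 1), dtype="float32"), verbose=0))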
- - @tf.__internal__.distribute.combinations.generate( - all_strategy_combinations_with_eager_and_graph_modes()) - def test_identity_model_metric_eval_correctness(self, distribution): - self.run_eval_metrics_correctness_test(distribution) + keras_correctness_test_base.TestDistributionStrategyCorrectnessBase +): + def get_model(self, distribution=None, input_shapes=None): + with distribution.scope(): + model = keras.Sequential() + model.add( + keras.layers.Dense( + 3, activation="relu", input_dim=4, kernel_initializer="ones" + ) + ) + model.add( + keras.layers.Dense( + 1, activation="sigmoid", kernel_initializer="ones" + ) + ) + model.compile( + loss="mae", + metrics=["accuracy", keras.metrics.BinaryAccuracy()], + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.001), + ) + return model + + def run_eval_metrics_correctness_test(self, distribution): + with self.cached_session(): + self.set_up_test_config() + + model = self.get_model(distribution=distribution) + + # verify correctness of stateful and stateless metrics. + x = np.ones((100, 4)).astype("float32") + y = np.ones((100, 1)).astype("float32") + dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat() + dataset = keras_correctness_test_base.batch_wrapper(dataset, 4) + outs = model.evaluate(dataset, steps=10) + self.assertEqual(outs[1], 1.0) + self.assertEqual(outs[2], 1.0) + + y = np.zeros((100, 1)).astype("float32") + dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat() + dataset = keras_correctness_test_base.batch_wrapper(dataset, 4) + outs = model.evaluate(dataset, steps=10) + self.assertEqual(outs[1], 0.0) + self.assertEqual(outs[2], 0.0) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations_with_eager_and_graph_modes() + ) + def test_identity_model_metric_eval_correctness(self, distribution): + self.run_eval_metrics_correctness_test(distribution) class SubclassedModel(keras.Model): - - def __init__(self, initial_weights, input_shapes): - super().__init__() - self.dense1 = keras.layers.Dense(10, activation='relu', input_shape=(1,)) - self.dense2 = keras.layers.Dense( - 10, activation='relu', kernel_regularizer=keras.regularizers.l2(1e-4)) - self.dense3 = keras.layers.Dense(10, activation='relu') - self.dense4 = keras.layers.Dense(1) - if input_shapes: - self.build(input_shapes) - else: - # This covers cases when the input is DatasetV1Adapter. - self.build((None, 1)) - if initial_weights: - self.set_weights(initial_weights) - - def call(self, inputs): - x = self.dense1(inputs) - x = self.dense2(x) - x = self.dense3(x) - return self.dense4(x) + def __init__(self, initial_weights, input_shapes): + super().__init__() + self.dense1 = keras.layers.Dense( + 10, activation="relu", input_shape=(1,) + ) + self.dense2 = keras.layers.Dense( + 10, + activation="relu", + kernel_regularizer=keras.regularizers.l2(1e-4), + ) + self.dense3 = keras.layers.Dense(10, activation="relu") + self.dense4 = keras.layers.Dense(1) + if input_shapes: + self.build(input_shapes) + else: + # This covers cases when the input is DatasetV1Adapter. 
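+ # whose element shape may not be known at build time; (None, 1) + # matches the single-feature inputs used throughout these tests.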
+ self.build((None, 1)) + if initial_weights: + self.set_weights(initial_weights) + + def call(self, inputs): + x = self.dense1(inputs) + x = self.dense2(x) + x = self.dense3(x) + return self.dense4(x) @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') + "Uses Dense layers, which call matmul" +) class TestDistributionStrategyDnnCorrectnessWithSubclassedModel( - TestDistributionStrategyDnnCorrectness): - - def get_model(self, - initial_weights=None, - distribution=None, - input_shapes=None): - with keras_correctness_test_base.MaybeDistributionScope(distribution): - model = SubclassedModel(initial_weights, input_shapes) - - model.compile( - loss=keras.losses.mean_squared_error, - optimizer=gradient_descent_keras.SGD(0.05), - metrics=['mse']) - return model - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.all_strategy_and_input_config_combinations() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_dnn_correctness(self, distribution, use_numpy, use_validation_data): - if (tf.executing_eagerly()) or is_default_strategy(distribution): - self.run_correctness_test(distribution, use_numpy, use_validation_data) - elif (backend.is_tpu_strategy(distribution) - and not tf.executing_eagerly()): - with self.assertRaisesRegex( - ValueError, - 'Expected `model` argument to be a functional `Model` instance, ' - 'but got a subclassed model instead.'): - self.run_correctness_test(distribution, use_numpy, use_validation_data) - else: - with self.assertRaisesRegex( - ValueError, - 'We currently do not support distribution strategy with a ' - '`Sequential` model that is created without `input_shape`/' - '`input_dim` set in its first layer or a subclassed model.'): - self.run_correctness_test(distribution, use_numpy, use_validation_data) - - @tf.__internal__.distribute.combinations.generate(all_strategy_combinations_with_graph_mode()) - def test_dnn_with_dynamic_learning_rate(self, distribution): - if ((tf.executing_eagerly() - and not backend.is_tpu_strategy(distribution)) - or is_default_strategy(distribution)): - self.run_dynamic_lr_test(distribution) - elif backend.is_tpu_strategy(distribution): - with self.assertRaisesRegex( - ValueError, - 'Expected `model` argument to be a functional `Model` instance, ' - 'but got a subclassed model instead.'): - self.run_dynamic_lr_test(distribution) - else: - with self.assertRaisesRegex( - ValueError, - 'We currently do not support distribution strategy with a ' - '`Sequential` model that is created without `input_shape`/' - '`input_dim` set in its first layer or a subclassed model.'): - self.run_dynamic_lr_test(distribution) - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.test_combinations_with_tpu_strategies_graph()) - def test_dnn_correctness_with_partial_last_batch_eval(self, distribution, - use_numpy, - use_validation_data): - with self.assertRaisesRegex( - ValueError, - 'Expected `model` argument to be a functional `Model` instance, ' - 'but got a subclassed model instead.'): - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - partial_last_batch='eval') - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + TestDistributionStrategyDnnCorrectness +): + def get_model( + self, initial_weights=None, distribution=None, input_shapes=None + ): + with keras_correctness_test_base.MaybeDistributionScope(distribution): + model = SubclassedModel(initial_weights, 
input_shapes) + + model.compile( + loss=keras.losses.mean_squared_error, + optimizer=gradient_descent_keras.SGD(0.05), + metrics=["mse"], + ) + return model + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.all_strategy_and_input_config_combinations() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_dnn_correctness( + self, distribution, use_numpy, use_validation_data + ): + if (tf.executing_eagerly()) or is_default_strategy(distribution): + self.run_correctness_test( + distribution, use_numpy, use_validation_data + ) + elif ( + backend.is_tpu_strategy(distribution) and not tf.executing_eagerly() + ): + with self.assertRaisesRegex( + ValueError, + "Expected `model` argument to be a functional `Model` " + "instance, but got a subclassed model instead.", + ): + self.run_correctness_test( + distribution, use_numpy, use_validation_data + ) + else: + with self.assertRaisesRegex( + ValueError, + "We currently do not support distribution strategy with a " + "`Sequential` model that is created without `input_shape`/" + "`input_dim` set in its first layer or a subclassed model.", + ): + self.run_correctness_test( + distribution, use_numpy, use_validation_data + ) + + @tf.__internal__.distribute.combinations.generate( + all_strategy_combinations_with_graph_mode() + ) + def test_dnn_with_dynamic_learning_rate(self, distribution): + if ( + tf.executing_eagerly() and not backend.is_tpu_strategy(distribution) + ) or is_default_strategy(distribution): + self.run_dynamic_lr_test(distribution) + elif backend.is_tpu_strategy(distribution): + with self.assertRaisesRegex( + ValueError, + "Expected `model` argument to be a functional `Model` " + "instance, but got a subclassed model instead.", + ): + self.run_dynamic_lr_test(distribution) + else: + with self.assertRaisesRegex( + ValueError, + "We currently do not support distribution strategy with a " + "`Sequential` model that is created without `input_shape`/" + "`input_dim` set in its first layer or a subclassed model.", + ): + self.run_dynamic_lr_test(distribution) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_with_tpu_strategies_graph() # noqa: E501 + ) + def test_dnn_correctness_with_partial_last_batch_eval( + self, distribution, use_numpy, use_validation_data + ): + with self.assertRaisesRegex( + ValueError, + "Expected `model` argument to be a functional `Model` instance, " + "but got a subclassed model instead.", + ): + self.run_correctness_test( + distribution, + use_numpy, + use_validation_data, + partial_last_batch="eval", + ) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/keras_embedding_model_correctness_test.py b/keras/distribute/keras_embedding_model_correctness_test.py index a5c041e75429..f126c41609a1 100644 --- a/keras/distribute/keras_embedding_model_correctness_test.py +++ b/keras/distribute/keras_embedding_model_correctness_test.py @@ -14,142 +14,162 @@ # ============================================================================== """Correctness test for tf.keras Embedding models using DistributionStrategy.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.distribute import keras_correctness_test_base -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_keras +from keras.optimizers.legacy import gradient_descent as gradient_descent_keras class 
DistributionStrategyEmbeddingModelCorrectnessTest( - keras_correctness_test_base - .TestDistributionStrategyEmbeddingModelCorrectnessBase): - - def get_model(self, - max_words=10, - initial_weights=None, - distribution=None, - input_shapes=None): - del input_shapes - with keras_correctness_test_base.MaybeDistributionScope(distribution): - word_ids = keras.layers.Input( - shape=(max_words,), dtype=np.int32, name='words') - word_embed = keras.layers.Embedding(input_dim=20, output_dim=10)(word_ids) - if self.use_distributed_dense: - word_embed = keras.layers.TimeDistributed(keras.layers.Dense(4))( - word_embed) - avg = keras.layers.GlobalAveragePooling1D()(word_embed) - preds = keras.layers.Dense(2, activation='softmax')(avg) - model = keras.Model(inputs=[word_ids], outputs=[preds]) - - if initial_weights: - model.set_weights(initial_weights) - - model.compile( - optimizer=gradient_descent_keras.SGD(learning_rate=0.1), - loss='sparse_categorical_crossentropy', - metrics=['sparse_categorical_accuracy']) - return model - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.test_combinations_for_embedding_model() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_embedding_model_correctness(self, distribution, use_numpy, - use_validation_data): - - self.use_distributed_dense = False - self.run_correctness_test(distribution, use_numpy, use_validation_data) - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.test_combinations_for_embedding_model() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_embedding_time_distributed_model_correctness( - self, distribution, use_numpy, use_validation_data): - self.use_distributed_dense = True - self.run_correctness_test(distribution, use_numpy, use_validation_data) + keras_correctness_test_base.TestDistributionStrategyEmbeddingModelCorrectnessBase # noqa: E501 +): + def get_model( + self, + max_words=10, + initial_weights=None, + distribution=None, + input_shapes=None, + ): + del input_shapes + with keras_correctness_test_base.MaybeDistributionScope(distribution): + word_ids = keras.layers.Input( + shape=(max_words,), dtype=np.int32, name="words" + ) + word_embed = keras.layers.Embedding(input_dim=20, output_dim=10)( + word_ids + ) + if self.use_distributed_dense: + word_embed = keras.layers.TimeDistributed( + keras.layers.Dense(4) + )(word_embed) + avg = keras.layers.GlobalAveragePooling1D()(word_embed) + preds = keras.layers.Dense(2, activation="softmax")(avg) + model = keras.Model(inputs=[word_ids], outputs=[preds]) + + if initial_weights: + model.set_weights(initial_weights) + + model.compile( + optimizer=gradient_descent_keras.SGD(learning_rate=0.1), + loss="sparse_categorical_crossentropy", + metrics=["sparse_categorical_accuracy"], + ) + return model + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_for_embedding_model() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_embedding_model_correctness( + self, distribution, use_numpy, use_validation_data + ): + + self.use_distributed_dense = False + self.run_correctness_test(distribution, use_numpy, use_validation_data) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_for_embedding_model() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_embedding_time_distributed_model_correctness( + self, distribution, use_numpy, use_validation_data + ): 
+ self.use_distributed_dense = True + self.run_correctness_test(distribution, use_numpy, use_validation_data) class DistributionStrategySiameseEmbeddingModelCorrectnessTest( - keras_correctness_test_base - .TestDistributionStrategyEmbeddingModelCorrectnessBase): - - def get_model(self, - max_words=10, - initial_weights=None, - distribution=None, - input_shapes=None): - del input_shapes - with keras_correctness_test_base.MaybeDistributionScope(distribution): - word_ids_a = keras.layers.Input( - shape=(max_words,), dtype=np.int32, name='words_a') - word_ids_b = keras.layers.Input( - shape=(max_words,), dtype=np.int32, name='words_b') - - def submodel(embedding, word_ids): - word_embed = embedding(word_ids) - rep = keras.layers.GlobalAveragePooling1D()(word_embed) - return keras.Model(inputs=[word_ids], outputs=[rep]) - - word_embed = keras.layers.Embedding( - input_dim=20, - output_dim=10, - input_length=max_words, - embeddings_initializer=keras.initializers.RandomUniform(0, 1)) - - a_rep = submodel(word_embed, word_ids_a).outputs[0] - b_rep = submodel(word_embed, word_ids_b).outputs[0] - sim = keras.layers.Dot(axes=1, normalize=True)([a_rep, b_rep]) - - model = keras.Model(inputs=[word_ids_a, word_ids_b], outputs=[sim]) - - if initial_weights: - model.set_weights(initial_weights) - - # TODO(b/130808953): Switch back to the V1 optimizer after global_step - # is made mirrored. - model.compile( - optimizer=gradient_descent_keras.SGD(learning_rate=0.1), - loss='mse', - metrics=['mse']) - return model - - def get_data(self, - count=(keras_correctness_test_base._GLOBAL_BATCH_SIZE * - keras_correctness_test_base._EVAL_STEPS), - min_words=5, - max_words=10, - max_word_id=19, - num_classes=2): - features_a, labels_a, _ = ( - super().get_data(count, min_words, max_words, max_word_id, - num_classes)) - - features_b, labels_b, _ = ( - super().get_data(count, min_words, max_words, max_word_id, - num_classes)) - - y_train = np.zeros((count, 1), dtype=np.float32) - y_train[labels_a == labels_b] = 1.0 - y_train[labels_a != labels_b] = -1.0 - # TODO(b/123360757): Add tests for using list as inputs for multi-input - # models. 
- x_train = { - 'words_a': features_a, - 'words_b': features_b, - } - x_predict = x_train - - return x_train, y_train, x_predict - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.test_combinations_for_embedding_model() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_siamese_embedding_model_correctness(self, distribution, use_numpy, - use_validation_data): - self.run_correctness_test(distribution, use_numpy, use_validation_data) - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + keras_correctness_test_base.TestDistributionStrategyEmbeddingModelCorrectnessBase # noqa: E501 +): + def get_model( + self, + max_words=10, + initial_weights=None, + distribution=None, + input_shapes=None, + ): + del input_shapes + with keras_correctness_test_base.MaybeDistributionScope(distribution): + word_ids_a = keras.layers.Input( + shape=(max_words,), dtype=np.int32, name="words_a" + ) + word_ids_b = keras.layers.Input( + shape=(max_words,), dtype=np.int32, name="words_b" + ) + + def submodel(embedding, word_ids): + word_embed = embedding(word_ids) + rep = keras.layers.GlobalAveragePooling1D()(word_embed) + return keras.Model(inputs=[word_ids], outputs=[rep]) + + word_embed = keras.layers.Embedding( + input_dim=20, + output_dim=10, + input_length=max_words, + embeddings_initializer=keras.initializers.RandomUniform(0, 1), + ) + + a_rep = submodel(word_embed, word_ids_a).outputs[0] + b_rep = submodel(word_embed, word_ids_b).outputs[0] + sim = keras.layers.Dot(axes=1, normalize=True)([a_rep, b_rep]) + + model = keras.Model(inputs=[word_ids_a, word_ids_b], outputs=[sim]) + + if initial_weights: + model.set_weights(initial_weights) + + # TODO(b/130808953): Switch back to the V1 optimizer after + # global_step is made mirrored. + model.compile( + optimizer=gradient_descent_keras.SGD(learning_rate=0.1), + loss="mse", + metrics=["mse"], + ) + return model + + def get_data( + self, + count=( + keras_correctness_test_base._GLOBAL_BATCH_SIZE + * keras_correctness_test_base._EVAL_STEPS + ), + min_words=5, + max_words=10, + max_word_id=19, + num_classes=2, + ): + features_a, labels_a, _ = super().get_data( + count, min_words, max_words, max_word_id, num_classes + ) + + features_b, labels_b, _ = super().get_data( + count, min_words, max_words, max_word_id, num_classes + ) + + y_train = np.zeros((count, 1), dtype=np.float32) + y_train[labels_a == labels_b] = 1.0 + y_train[labels_a != labels_b] = -1.0 + # TODO(b/123360757): Add tests for using list as inputs for multi-input + # models. 
+ x_train = { + "words_a": features_a, + "words_b": features_b, + } + x_predict = x_train + + return x_train, y_train, x_predict + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_for_embedding_model() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_siamese_embedding_model_correctness( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test(distribution, use_numpy, use_validation_data) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/keras_image_model_correctness_test.py b/keras/distribute/keras_image_model_correctness_test.py index dee432912102..687c180aa3f5 100644 --- a/keras/distribute/keras_image_model_correctness_test.py +++ b/keras/distribute/keras_image_model_correctness_test.py @@ -14,150 +14,169 @@ # ============================================================================== """Correctness tests for tf.keras CNN models using DistributionStrategy.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np import keras -from keras.testing_infra import test_utils from keras.distribute import keras_correctness_test_base -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent +from keras.testing_infra import test_utils @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul. Even if Dense layers run in ' - 'float64, the test sometimes fails with TensorFloat-32 enabled for unknown ' - 'reasons') + "Uses Dense layers, which call matmul. Even if Dense layers run in " + "float64, the test sometimes fails with TensorFloat-32 enabled for unknown " + "reasons" +) +@test_utils.run_v2_only() class DistributionStrategyCnnCorrectnessTest( - keras_correctness_test_base.TestDistributionStrategyCorrectnessBase): - - def get_model(self, - initial_weights=None, - distribution=None, - input_shapes=None): - del input_shapes - with keras_correctness_test_base.MaybeDistributionScope(distribution): - image = keras.layers.Input(shape=(28, 28, 3), name='image') - c1 = keras.layers.Conv2D( - name='conv1', - filters=16, - kernel_size=(3, 3), - strides=(4, 4), - kernel_regularizer=keras.regularizers.l2(1e-4))( - image) - if self.with_batch_norm == 'regular': - c1 = keras.layers.BatchNormalization(name='bn1')(c1) - elif self.with_batch_norm == 'sync': - # Test with parallel batch norms to verify all-reduce works OK. 
- bn1 = keras.layers.SyncBatchNormalization(name='bn1')(c1) - bn2 = keras.layers.SyncBatchNormalization(name='bn2')(c1) - c1 = keras.layers.Add()([bn1, bn2]) - c1 = keras.layers.MaxPooling2D(pool_size=(2, 2))(c1) - logits = keras.layers.Dense( - 10, activation='softmax', name='pred')( - keras.layers.Flatten()(c1)) - model = keras.Model(inputs=[image], outputs=[logits]) - - if initial_weights: - model.set_weights(initial_weights) - - model.compile( - optimizer=gradient_descent.SGD(learning_rate=0.1), - loss='sparse_categorical_crossentropy', - metrics=['sparse_categorical_accuracy']) - - return model - - def _get_data(self, count, shape=(28, 28, 3), num_classes=10): - centers = np.random.randn(num_classes, *shape) - - features = [] - labels = [] - for _ in range(count): - label = np.random.randint(0, num_classes, size=1)[0] - offset = np.random.normal(loc=0, scale=0.1, size=np.prod(shape)) - offset = offset.reshape(shape) - labels.append(label) - features.append(centers[label] + offset) - - x = np.asarray(features, dtype=np.float32) - y = np.asarray(labels, dtype=np.float32).reshape((count, 1)) - return x, y - - def get_data(self): - x_train, y_train = self._get_data( - count=keras_correctness_test_base._GLOBAL_BATCH_SIZE * - keras_correctness_test_base._EVAL_STEPS) - x_predict = x_train - return x_train, y_train, x_predict - - def get_data_with_partial_last_batch_eval(self): - x_train, y_train = self._get_data(count=1280) - x_eval, y_eval = self._get_data(count=1000) - return x_train, y_train, x_eval, y_eval, x_eval - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.all_strategy_and_input_config_combinations() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_cnn_correctness(self, distribution, use_numpy, use_validation_data): - if (distribution == - tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu): - self.skipTest('b/183958183') - self.run_correctness_test(distribution, use_numpy, use_validation_data) - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.all_strategy_and_input_config_combinations() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_cnn_with_batch_norm_correctness(self, distribution, use_numpy, - use_validation_data): - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - with_batch_norm='regular') - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.all_strategy_and_input_config_combinations() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_cnn_with_sync_batch_norm_correctness(self, distribution, use_numpy, - use_validation_data): - if not tf.executing_eagerly(): - self.skipTest('SyncBatchNorm is not enabled in graph mode.') - - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - with_batch_norm='sync') - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base - .all_strategy_and_input_config_combinations_eager() + - keras_correctness_test_base.multi_worker_mirrored_eager() + - keras_correctness_test_base.test_combinations_with_tpu_strategies_graph()) - def test_cnn_correctness_with_partial_last_batch_eval(self, distribution, - use_numpy, - use_validation_data): - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - partial_last_batch=True, - training_epochs=1) - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base. 
- all_strategy_and_input_config_combinations_eager() + - keras_correctness_test_base.multi_worker_mirrored_eager() + - keras_correctness_test_base.test_combinations_with_tpu_strategies_graph()) - def test_cnn_with_batch_norm_correctness_and_partial_last_batch_eval( - self, distribution, use_numpy, use_validation_data): - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - with_batch_norm='regular', - partial_last_batch=True) - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + keras_correctness_test_base.TestDistributionStrategyCorrectnessBase +): + def get_model( + self, initial_weights=None, distribution=None, input_shapes=None + ): + del input_shapes + with keras_correctness_test_base.MaybeDistributionScope(distribution): + image = keras.layers.Input(shape=(28, 28, 3), name="image") + c1 = keras.layers.Conv2D( + name="conv1", + filters=16, + kernel_size=(3, 3), + strides=(4, 4), + kernel_regularizer=keras.regularizers.l2(1e-4), + )(image) + if self.with_batch_norm == "regular": + c1 = keras.layers.BatchNormalization(name="bn1")(c1) + elif self.with_batch_norm == "sync": + # Test with parallel batch norms to verify all-reduce works OK. + bn1 = keras.layers.BatchNormalization( + name="bn1", synchronized=True + )(c1) + bn2 = keras.layers.BatchNormalization( + name="bn2", synchronized=True + )(c1) + c1 = keras.layers.Add()([bn1, bn2]) + c1 = keras.layers.MaxPooling2D(pool_size=(2, 2))(c1) + logits = keras.layers.Dense(10, activation="softmax", name="pred")( + keras.layers.Flatten()(c1) + ) + model = keras.Model(inputs=[image], outputs=[logits]) + + if initial_weights: + model.set_weights(initial_weights) + + model.compile( + optimizer=gradient_descent.SGD(learning_rate=0.1), + loss="sparse_categorical_crossentropy", + metrics=["sparse_categorical_accuracy"], + ) + + return model + + def _get_data(self, count, shape=(28, 28, 3), num_classes=10): + centers = np.random.randn(num_classes, *shape) + + features = [] + labels = [] + for _ in range(count): + label = np.random.randint(0, num_classes, size=1)[0] + offset = np.random.normal(loc=0, scale=0.1, size=np.prod(shape)) + offset = offset.reshape(shape) + labels.append(label) + features.append(centers[label] + offset) + + x = np.asarray(features, dtype=np.float32) + y = np.asarray(labels, dtype=np.float32).reshape((count, 1)) + return x, y + + def get_data(self): + x_train, y_train = self._get_data( + count=keras_correctness_test_base._GLOBAL_BATCH_SIZE + * keras_correctness_test_base._EVAL_STEPS + ) + x_predict = x_train + return x_train, y_train, x_predict + + def get_data_with_partial_last_batch_eval(self): + x_train, y_train = self._get_data(count=1280) + x_eval, y_eval = self._get_data(count=1000) + return x_train, y_train, x_eval, y_eval, x_eval + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.all_strategy_and_input_config_combinations() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_cnn_correctness( + self, distribution, use_numpy, use_validation_data + ): + if ( + distribution + == tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu # noqa: E501 + ): + self.skipTest("b/183958183") + self.run_correctness_test(distribution, use_numpy, use_validation_data) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.all_strategy_and_input_config_combinations() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def 
test_cnn_with_batch_norm_correctness( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test( + distribution, + use_numpy, + use_validation_data, + with_batch_norm="regular", + ) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.all_strategy_and_input_config_combinations() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_cnn_with_sync_batch_norm_correctness( + self, distribution, use_numpy, use_validation_data + ): + if not tf.executing_eagerly(): + self.skipTest( + "BatchNorm with `synchronized` is not enabled in graph mode." + ) + self.run_correctness_test( + distribution, use_numpy, use_validation_data, with_batch_norm="sync" + ) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.all_strategy_and_input_config_combinations_eager() # noqa: E501 + + keras_correctness_test_base.multi_worker_mirrored_eager() + + keras_correctness_test_base.test_combinations_with_tpu_strategies_graph() # noqa: E501 + ) + def test_cnn_correctness_with_partial_last_batch_eval( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test( + distribution, + use_numpy, + use_validation_data, + partial_last_batch=True, + training_epochs=1, + ) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.all_strategy_and_input_config_combinations_eager() # noqa: E501 + + keras_correctness_test_base.multi_worker_mirrored_eager() + + keras_correctness_test_base.test_combinations_with_tpu_strategies_graph() # noqa: E501 + ) + def test_cnn_with_batch_norm_correctness_and_partial_last_batch_eval( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test( + distribution, + use_numpy, + use_validation_data, + with_batch_norm="regular", + partial_last_batch=True, + ) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/keras_metrics_test.py b/keras/distribute/keras_metrics_test.py index adf45640571c..a0f79e4181ef 100644 --- a/keras/distribute/keras_metrics_test.py +++ b/keras/distribute/keras_metrics_test.py @@ -14,251 +14,294 @@ # ============================================================================== """Tests for Keras metrics.""" +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import metrics from keras.engine import base_layer -import tensorflow.compat.v2 as tf combinations = tf.__internal__.distribute.combinations def _labeled_dataset_fn(): - # First four batches of x: labels, predictions -> (labels == predictions) - # 0: 0, 0 -> True; 1: 1, 1 -> True; 2: 2, 2 -> True; 3: 3, 0 -> False - # 4: 4, 1 -> False; 5: 0, 2 -> False; 6: 1, 0 -> False; 7: 2, 1 -> False - # 8: 3, 2 -> False; 9: 4, 0 -> False; 10: 0, 1 -> False; 11: 1, 2 -> False - # 12: 2, 0 -> False; 13: 3, 1 -> False; 14: 4, 2 -> False; 15: 0, 0 -> True - return tf.data.Dataset.range(1000).map( - lambda x: {"labels": x % 5, "predictions": x % 3}).batch( - 4, drop_remainder=True) + # First four batches of x: labels, predictions -> (labels == predictions) + # 0: 0, 0 -> True; 1: 1, 1 -> True; 2: 2, 2 -> True; 3: 3, 0 -> False + # 4: 4, 1 -> False; 5: 0, 2 -> False; 6: 1, 0 -> False; 7: 2, 1 -> False + # 8: 3, 2 -> False; 9: 4, 0 -> False; 10: 0, 1 -> False; 11: 1, 2 -> False + # 12: 2, 0 -> False; 13: 3, 1 -> False; 14: 4, 2 -> False; 15: 0, 0 -> True + return ( + tf.data.Dataset.range(1000) + .map(lambda x: {"labels": x % 5, "predictions": x % 3}) 
+ .batch(4, drop_remainder=True) + ) def _boolean_dataset_fn(): - # First four batches of labels, predictions: {TP, FP, TN, FN} - # with a threshold of 0.5: - # T, T -> TP; F, T -> FP; T, F -> FN - # F, F -> TN; T, T -> TP; F, T -> FP - # T, F -> FN; F, F -> TN; T, T -> TP - # F, T -> FP; T, F -> FN; F, F -> TN - return tf.data.Dataset.from_tensor_slices({ - "labels": [True, False, True, False], - "predictions": [True, True, False, False]}).repeat().batch( - 3, drop_remainder=True) + # First four batches of labels, predictions: {TP, FP, TN, FN} + # with a threshold of 0.5: + # T, T -> TP; F, T -> FP; T, F -> FN + # F, F -> TN; T, T -> TP; F, T -> FP + # T, F -> FN; F, F -> TN; T, T -> TP + # F, T -> FP; T, F -> FN; F, F -> TN + return ( + tf.data.Dataset.from_tensor_slices( + { + "labels": [True, False, True, False], + "predictions": [True, True, False, False], + } + ) + .repeat() + .batch(3, drop_remainder=True) + ) def _threshold_dataset_fn(): - # First four batches of labels, predictions: {TP, FP, TN, FN} - # with a threshold of 0.5: - # True, 1.0 -> TP; False, .75 -> FP; True, .25 -> FN - # False, 0.0 -> TN; True, 1.0 -> TP; False, .75 -> FP - # True, .25 -> FN; False, 0.0 -> TN; True, 1.0 -> TP - # False, .75 -> FP; True, .25 -> FN; False, 0.0 -> TN - return tf.data.Dataset.from_tensor_slices({ - "labels": [True, False, True, False], - "predictions": [1.0, 0.75, 0.25, 0.]}).repeat().batch( - 3, drop_remainder=True) + # First four batches of labels, predictions: {TP, FP, TN, FN} + # with a threshold of 0.5: + # True, 1.0 -> TP; False, .75 -> FP; True, .25 -> FN + # False, 0.0 -> TN; True, 1.0 -> TP; False, .75 -> FP + # True, .25 -> FN; False, 0.0 -> TN; True, 1.0 -> TP + # False, .75 -> FP; True, .25 -> FN; False, 0.0 -> TN + return ( + tf.data.Dataset.from_tensor_slices( + { + "labels": [True, False, True, False], + "predictions": [1.0, 0.75, 0.25, 0.0], + } + ) + .repeat() + .batch(3, drop_remainder=True) + ) def _regression_dataset_fn(): - return tf.data.Dataset.from_tensor_slices({ - "labels": [1., .5, 1., 0.], - "predictions": [1., .75, .25, 0.]}).repeat() + return tf.data.Dataset.from_tensor_slices( + {"labels": [1.0, 0.5, 1.0, 0.0], "predictions": [1.0, 0.75, 0.25, 0.0]} + ).repeat() def all_combinations(): - return tf.__internal__.test.combinations.combine( - distribution=[ - combinations.default_strategy, combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus - ], - mode=["graph", "eager"]) + return tf.__internal__.test.combinations.combine( + distribution=[ + combinations.default_strategy, + combinations.one_device_strategy, + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus, + ], + mode=["graph", "eager"], + ) def tpu_combinations(): - return tf.__internal__.test.combinations.combine( - distribution=[ - combinations.tpu_strategy, - ], mode=["graph"]) + return tf.__internal__.test.combinations.combine( + distribution=[ + combinations.tpu_strategy, + ], + mode=["graph"], + ) class KerasMetricsTest(tf.test.TestCase, parameterized.TestCase): - - def _test_metric(self, distribution, dataset_fn, metric_init_fn, expected_fn): - with tf.Graph().as_default(), distribution.scope(): - metric = metric_init_fn() - - iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn()) - updates = distribution.experimental_local_results( - distribution.run(metric, args=(iterator.get_next(),))) - batches_per_update = distribution.num_replicas_in_sync - - 
self.evaluate(iterator.initializer) - self.evaluate([v.initializer for v in metric.variables]) - - batches_consumed = 0 - for i in range(4): - batches_consumed += batches_per_update - self.evaluate(updates) - self.assertAllClose(expected_fn(batches_consumed), - self.evaluate(metric.result()), - 0.001, - msg="After update #" + str(i+1)) - if batches_consumed >= 4: # Consume 4 input batches in total. - break - - @combinations.generate(all_combinations() + tpu_combinations()) - def testMean(self, distribution): - def _dataset_fn(): - return tf.data.Dataset.range(1000).map(tf.compat.v1.to_float).batch( - 4, drop_remainder=True) - - def _expected_fn(num_batches): - # Mean(0..3) = 1.5, Mean(0..7) = 3.5, Mean(0..11) = 5.5, etc. - return num_batches * 2 - 0.5 - - self._test_metric(distribution, _dataset_fn, metrics.Mean, _expected_fn) - - @combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - combinations.mirrored_strategy_with_one_cpu, - combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus, - combinations.tpu_strategy_packed_var, - combinations.parameter_server_strategy_1worker_2ps_cpu, - combinations.parameter_server_strategy_1worker_2ps_1gpu, - ], - mode=["eager"], - jit_compile=[False]) + tf.__internal__.test.combinations.combine( - distribution=[combinations.mirrored_strategy_with_two_gpus], - mode=["eager"], - jit_compile=[True])) - def testAddMetric(self, distribution, jit_compile): - if not tf.__internal__.tf2.enabled(): - self.skipTest("Skip test since tf2 is not enabled. Pass " - " --test_env=TF2_BEHAVIOR=1 to enable tf2 behavior.") - - class MetricLayer(base_layer.Layer): - - def __init__(self): - super().__init__(name="metric_layer") - self.sum = metrics.Sum(name="sum") - # Using aggregation for jit_compile results in failure. Thus only set - # aggregation for PS Strategy for multi-gpu tests. - if isinstance(distribution, - tf.distribute.experimental.ParameterServerStrategy): - self.sum_var = tf.Variable( - 1.0, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) + def _test_metric( + self, distribution, dataset_fn, metric_init_fn, expected_fn + ): + with tf.Graph().as_default(), distribution.scope(): + metric = metric_init_fn() + + iterator = distribution.make_input_fn_iterator( + lambda _: dataset_fn() + ) + updates = distribution.experimental_local_results( + distribution.run(metric, args=(iterator.get_next(),)) + ) + batches_per_update = distribution.num_replicas_in_sync + + self.evaluate(iterator.initializer) + self.evaluate([v.initializer for v in metric.variables]) + + batches_consumed = 0 + for i in range(4): + batches_consumed += batches_per_update + self.evaluate(updates) + self.assertAllClose( + expected_fn(batches_consumed), + self.evaluate(metric.result()), + 0.001, + msg="After update #" + str(i + 1), + ) + if batches_consumed >= 4: # Consume 4 input batches in total. + break + + @combinations.generate(all_combinations() + tpu_combinations()) + def testMean(self, distribution): + def _dataset_fn(): + return ( + tf.data.Dataset.range(1000) + .map(tf.compat.v1.to_float) + .batch(4, drop_remainder=True) + ) + + def _expected_fn(num_batches): + # Mean(0..3) = 1.5, Mean(0..7) = 3.5, Mean(0..11) = 5.5, etc. 
+ return num_batches * 2 - 0.5 + + self._test_metric(distribution, _dataset_fn, metrics.Mean, _expected_fn) + + @combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_one_cpu, + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus, + combinations.tpu_strategy_packed_var, + combinations.parameter_server_strategy_1worker_2ps_cpu, + combinations.parameter_server_strategy_1worker_2ps_1gpu, + ], + mode=["eager"], + jit_compile=[False], + ) + + tf.__internal__.test.combinations.combine( + distribution=[combinations.mirrored_strategy_with_two_gpus], + mode=["eager"], + jit_compile=[True], + ) + ) + def testAddMetric(self, distribution, jit_compile): + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "Skip test since tf2 is not enabled. Pass " + " --test_env=TF2_BEHAVIOR=1 to enable tf2 behavior." + ) + + class MetricLayer(base_layer.Layer): + def __init__(self): + super().__init__(name="metric_layer") + self.sum = metrics.Sum(name="sum") + # Using aggregation for jit_compile results in failure. Thus + # only set aggregation for PS Strategy for multi-gpu tests. + if isinstance( + distribution, + tf.distribute.experimental.ParameterServerStrategy, + ): + self.sum_var = tf.Variable( + 1.0, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + ) + else: + self.sum_var = tf.Variable(1.0) + + def call(self, inputs): + self.add_metric(self.sum(inputs)) + self.add_metric( + tf.reduce_mean(inputs), name="mean", aggregation="mean" + ) + self.sum_var.assign(self.sum.result()) + return inputs + + with distribution.scope(): + layer = MetricLayer() + + def func(): + return layer(tf.ones(())) + + if jit_compile: + func = tf.function(jit_compile=True)(func) + + @tf.function + def run(): + return distribution.run(func) + + if distribution._should_use_with_coordinator: + coord = tf.distribute.experimental.coordinator.ClusterCoordinator( + distribution + ) + coord.schedule(run) + coord.join() else: - self.sum_var = tf.Variable(1.0) - - def call(self, inputs): - self.add_metric(self.sum(inputs)) - self.add_metric( - tf.reduce_mean(inputs), name="mean", aggregation="mean") - self.sum_var.assign(self.sum.result()) - return inputs - - with distribution.scope(): - layer = MetricLayer() - - def func(): - return layer(tf.ones(())) - - if jit_compile: - func = tf.function(jit_compile=True)(func) - - @tf.function - def run(): - return distribution.run(func) - - if distribution._should_use_with_coordinator: - coord = tf.distribute.experimental.coordinator.ClusterCoordinator( - distribution) - coord.schedule(run) - coord.join() - else: - run() - - self.assertEqual(layer.metrics[0].result().numpy(), - 1.0 * distribution.num_replicas_in_sync) - self.assertEqual(layer.metrics[1].result().numpy(), 1.0) - self.assertEqual(layer.sum_var.read_value().numpy(), - 1.0 * distribution.num_replicas_in_sync) - - @combinations.generate(all_combinations()) - def test_precision(self, distribution): - # True positive is 2, false positive 1, precision is 2/3 = 0.6666667 - label_prediction = ([0, 1, 1, 1], [1, 0, 1, 1]) - with distribution.scope(): - precision = metrics.Precision() - self.evaluate([v.initializer for v in precision.variables]) - updates = distribution.run(precision, args=label_prediction) - self.evaluate(updates) - self.assertAllClose(precision.result(), 0.6666667) - - @combinations.generate(all_combinations()) - def test_recall(self, distribution): - # True positive is 2, false negative 1, precision is 2/3 
= 0.6666667 - label_prediction = ([0, 1, 1, 1], [1, 0, 1, 1]) - with distribution.scope(): - recall = metrics.Recall() - self.evaluate([v.initializer for v in recall.variables]) - updates = distribution.run(recall, args=label_prediction) - self.evaluate(updates) - self.assertAllClose(recall.result(), 0.6666667) - - @combinations.generate(all_combinations()) - def test_SensitivityAtSpecificity(self, distribution): - label_prediction = ([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) - with distribution.scope(): - metric = metrics.SensitivityAtSpecificity(0.5) - self.evaluate([v.initializer for v in metric.variables]) - updates = distribution.run(metric, args=label_prediction) - self.evaluate(updates) - self.assertAllClose(metric.result(), 0.5) - - @combinations.generate(all_combinations()) - def test_SpecificityAtSensitivity(self, distribution): - label_prediction = ([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) - with distribution.scope(): - metric = metrics.SpecificityAtSensitivity(0.5) - self.evaluate([v.initializer for v in metric.variables]) - updates = distribution.run(metric, args=label_prediction) - self.evaluate(updates) - self.assertAllClose(metric.result(), 0.66666667) - - @combinations.generate(all_combinations()) - def test_PrecisionAtRecall(self, distribution): - label_prediction = ([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) - with distribution.scope(): - metric = metrics.PrecisionAtRecall(0.5) - self.evaluate([v.initializer for v in metric.variables]) - updates = distribution.run(metric, args=label_prediction) - self.evaluate(updates) - self.assertAllClose(metric.result(), 0.5) - - @combinations.generate(all_combinations()) - def test_RecallAtPrecision(self, distribution): - label_prediction = ([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - with distribution.scope(): - metric = metrics.RecallAtPrecision(0.8) - self.evaluate([v.initializer for v in metric.variables]) - updates = distribution.run(metric, args=label_prediction) - self.evaluate(updates) - self.assertAllClose(metric.result(), 0.5) - - @combinations.generate(all_combinations()) - def test_auc(self, distribution): - label_prediction = ([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - with distribution.scope(): - metric = metrics.AUC(num_thresholds=3) - self.evaluate([v.initializer for v in metric.variables]) - updates = distribution.run(metric, args=label_prediction) - self.evaluate(updates) - self.assertAllClose(metric.result(), 0.75) + run() + + self.assertEqual( + layer.metrics[0].result().numpy(), + 1.0 * distribution.num_replicas_in_sync, + ) + self.assertEqual(layer.metrics[1].result().numpy(), 1.0) + self.assertEqual( + layer.sum_var.read_value().numpy(), + 1.0 * distribution.num_replicas_in_sync, + ) + + @combinations.generate(all_combinations()) + def test_precision(self, distribution): + # True positive is 2, false positive 1, precision is 2/3 = 0.6666667 + label_prediction = ([0, 1, 1, 1], [1, 0, 1, 1]) + with distribution.scope(): + precision = metrics.Precision() + self.evaluate([v.initializer for v in precision.variables]) + updates = distribution.run(precision, args=label_prediction) + self.evaluate(updates) + self.assertAllClose(precision.result(), 0.6666667) + + @combinations.generate(all_combinations()) + def test_recall(self, distribution): + # True positive is 2, false negative 1, recall is 2/3 = 0.6666667 + label_prediction = ([0, 1, 1, 1], [1, 0, 1, 1]) + with distribution.scope(): + recall = metrics.Recall() + self.evaluate([v.initializer for v in recall.variables]) + updates = distribution.run(recall, args=label_prediction)
+ self.evaluate(updates) + self.assertAllClose(recall.result(), 0.6666667) + + @combinations.generate(all_combinations()) + def test_SensitivityAtSpecificity(self, distribution): + label_prediction = ([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) + with distribution.scope(): + metric = metrics.SensitivityAtSpecificity(0.5) + self.evaluate([v.initializer for v in metric.variables]) + updates = distribution.run(metric, args=label_prediction) + self.evaluate(updates) + self.assertAllClose(metric.result(), 0.5) + + @combinations.generate(all_combinations()) + def test_SpecificityAtSensitivity(self, distribution): + label_prediction = ([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) + with distribution.scope(): + metric = metrics.SpecificityAtSensitivity(0.5) + self.evaluate([v.initializer for v in metric.variables]) + updates = distribution.run(metric, args=label_prediction) + self.evaluate(updates) + self.assertAllClose(metric.result(), 0.66666667) + + @combinations.generate(all_combinations()) + def test_PrecisionAtRecall(self, distribution): + label_prediction = ([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) + with distribution.scope(): + metric = metrics.PrecisionAtRecall(0.5) + self.evaluate([v.initializer for v in metric.variables]) + updates = distribution.run(metric, args=label_prediction) + self.evaluate(updates) + self.assertAllClose(metric.result(), 0.5) + + @combinations.generate(all_combinations()) + def test_RecallAtPrecision(self, distribution): + label_prediction = ([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + with distribution.scope(): + metric = metrics.RecallAtPrecision(0.8) + self.evaluate([v.initializer for v in metric.variables]) + updates = distribution.run(metric, args=label_prediction) + self.evaluate(updates) + self.assertAllClose(metric.result(), 0.5) + + @combinations.generate(all_combinations()) + def test_auc(self, distribution): + label_prediction = ([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + with distribution.scope(): + metric = metrics.AUC(num_thresholds=3) + self.evaluate([v.initializer for v in metric.variables]) + updates = distribution.run(metric, args=label_prediction) + self.evaluate(updates) + self.assertAllClose(metric.result(), 0.75) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/keras_models_test.py b/keras/distribute/keras_models_test.py index c4a9683954b7..4cc9e9c35c1a 100644 --- a/keras/distribute/keras_models_test.py +++ b/keras/distribute/keras_models_test.py @@ -14,43 +14,45 @@ # ============================================================================== """Tests for Keras high level APIs, e.g. 
fit, evaluate and predict.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras from keras.distribute.strategy_combinations import all_strategies class KerasModelsTest(tf.test.TestCase, parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=all_strategies, mode=["eager"])) - def test_lstm_model_with_dynamic_batch(self, distribution): - input_data = np.random.random([1, 32, 64, 64, 3]) - input_shape = tuple(input_data.shape[1:]) - - def build_model(): - model = keras.models.Sequential() - model.add( - keras.layers.ConvLSTM2D( - 4, - kernel_size=(4, 4), - activation="sigmoid", - padding="same", - input_shape=input_shape)) - model.add(keras.layers.GlobalMaxPooling2D()) - model.add(keras.layers.Dense(2, activation="sigmoid")) - return model - - with distribution.scope(): - model = build_model() - model.compile(loss="binary_crossentropy", optimizer="adam") - result = model.predict(input_data) - self.assertEqual(result.shape, (1, 2)) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=all_strategies, mode=["eager"] + ) + ) + def test_lstm_model_with_dynamic_batch(self, distribution): + input_data = np.random.random([1, 32, 64, 64, 3]) + input_shape = tuple(input_data.shape[1:]) + + def build_model(): + model = keras.models.Sequential() + model.add( + keras.layers.ConvLSTM2D( + 4, + kernel_size=(4, 4), + activation="sigmoid", + padding="same", + input_shape=input_shape, + ) + ) + model.add(keras.layers.GlobalMaxPooling2D()) + model.add(keras.layers.Dense(2, activation="sigmoid")) + return model + + with distribution.scope(): + model = build_model() + model.compile(loss="binary_crossentropy", optimizer="adam") + result = model.predict(input_data) + self.assertEqual(result.shape, (1, 2)) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/distribute/keras_optimizer_v2_test.py b/keras/distribute/keras_optimizer_v2_test.py index b7dc18c66139..1b4c6150af2c 100644 --- a/keras/distribute/keras_optimizer_v2_test.py +++ b/keras/distribute/keras_optimizer_v2_test.py @@ -14,119 +14,123 @@ # ============================================================================== """Tests that show that DistributionStrategy works with optimizer v2.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras -from keras.optimizers.optimizer_v2 import adam -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import adam +from keras.optimizers.legacy import gradient_descent def get_model(): - x = keras.layers.Input(shape=(3,), name='input') - y = keras.layers.Dense(4, name='dense')(x) - model = keras.Model(x, y) - return model + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + return model class MirroredStrategyOptimizerV2Test(tf.test.TestCase, parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, - ], - mode=['graph', 'eager'])) - def testKerasOptimizerWithUnequalInput(self, distribution): - with distribution.scope(): - var = tf.Variable( - 2.0, name='var', aggregation=tf.VariableAggregation.SUM) - optimizer = 
adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2) - all_vars = [] - - def model_fn(): - - def loss_fn(): - replica_id = _replica_id() - return tf.cast(replica_id + 1, dtype=tf.float32) * 0.5 * var - - train_op = optimizer.minimize(loss_fn, var_list=[var]) - - return train_op, optimizer - - def train_fn(): - train_op, optimizer = distribution.extended.call_for_each_replica( - model_fn) - if not all_vars: - all_vars.append(var) - all_vars.append(optimizer.get_slot(var, 'm')) - all_vars.append(optimizer.get_slot(var, 'v')) - return distribution.group(train_op) - - if not tf.executing_eagerly(): - with self.cached_session() as sess: - train_fn = sess.make_callable(train_fn()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # first step. - train_fn() - # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1) - # = 2.0 - 0.01 * 1.2 * sqrt(0.8) / sqrt(1.8) / 0.8 - self.assertAllClose(1.99, self.evaluate(all_vars[0])) - # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2 - self.assertAllClose(1.2, self.evaluate(all_vars[1])) - # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25 - self.assertAllClose(1.8, self.evaluate(all_vars[2])) - - # second step. - train_fn() - # var(1) = var(0) - lr * 2 = 1.98 - self.assertAllClose(1.98, self.evaluate(all_vars[0])) - # m(2) = beta1 * m(1) + (1-beta1) * grad = 0.2 * 1.2 + 0.8 * 1.5 - self.assertAllClose(1.44, self.evaluate(all_vars[1])) - # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25 - self.assertAllClose(2.16, self.evaluate(all_vars[2])) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, - ], - mode=['graph', 'eager'])) - def testOptimizerWithKerasModelAndNumpyArrays(self, distribution): - with self.cached_session(): - with distribution.scope(): - model = get_model() - optimizer = gradient_descent.SGD(0.001) - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, - loss, - metrics=metrics) - - inputs = np.zeros((64, 3), dtype=np.float32) - targets = np.zeros((64, 4), dtype=np.float32) - - model.fit( - inputs, - targets, - epochs=1, - batch_size=2, - verbose=0, - validation_data=(inputs, targets)) - model.evaluate(inputs, targets) - model.predict(inputs) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, # noqa: E501 + ], + mode=["graph", "eager"], + ) + ) + def testKerasOptimizerWithUnequalInput(self, distribution): + with distribution.scope(): + var = tf.Variable( + 2.0, name="var", aggregation=tf.VariableAggregation.SUM + ) + optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2) + all_vars = [] + + def model_fn(): + def loss_fn(): + replica_id = _replica_id() + return tf.cast(replica_id + 1, dtype=tf.float32) * 0.5 * var + + train_op = optimizer.minimize(loss_fn, var_list=[var]) + + return train_op, optimizer + + def train_fn(): + ( + train_op, + optimizer, + ) = distribution.extended.call_for_each_replica(model_fn) + if not all_vars: + all_vars.append(var) + all_vars.append(optimizer.get_slot(var, "m")) + all_vars.append(optimizer.get_slot(var, "v")) + return distribution.group(train_op) + + if not tf.executing_eagerly(): + with self.cached_session() as sess: + train_fn = sess.make_callable(train_fn()) + 
self.evaluate(tf.compat.v1.global_variables_initializer()) + + # first step. + train_fn() + # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - + # beta1) + # = 2.0 - 0.01 * 1.2 * sqrt(0.8) / sqrt(1.8) / 0.8 + self.assertAllClose(1.99, self.evaluate(all_vars[0])) + # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / + # 2 + self.assertAllClose(1.2, self.evaluate(all_vars[1])) + # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25 + self.assertAllClose(1.8, self.evaluate(all_vars[2])) + + # second step. + train_fn() + # var(2) = var(0) - lr * 2 = 1.98 + self.assertAllClose(1.98, self.evaluate(all_vars[0])) + # m(2) = beta1 * m(1) + (1-beta1) * grad = 0.2 * 1.2 + 0.8 * 1.5 + self.assertAllClose(1.44, self.evaluate(all_vars[1])) + # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25 + self.assertAllClose(2.16, self.evaluate(all_vars[2])) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, # noqa: E501 + ], + mode=["graph", "eager"], + ) + ) + def testOptimizerWithKerasModelAndNumpyArrays(self, distribution): + with self.cached_session(): + with distribution.scope(): + model = get_model() + optimizer = gradient_descent.SGD(0.001) + loss = "mse" + metrics = ["mae"] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((64, 3), dtype=np.float32) + targets = np.zeros((64, 4), dtype=np.float32) + + model.fit( + inputs, + targets, + epochs=1, + batch_size=2, + verbose=0, + validation_data=(inputs, targets), + ) + model.evaluate(inputs, targets) + model.predict(inputs) def _replica_id(): - replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group - if not isinstance(replica_id, tf.Tensor): - replica_id = tf.constant(replica_id) - return replica_id + replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group + if not isinstance(replica_id, tf.Tensor): + replica_id = tf.constant(replica_id) + return replica_id -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/distribute/keras_premade_models_test.py b/keras/distribute/keras_premade_models_test.py index ace71a5ac697..e4badc570524 100644 --- a/keras/distribute/keras_premade_models_test.py +++ b/keras/distribute/keras_premade_models_test.py @@ -14,44 +14,40 @@ # ============================================================================== """Tests for keras premade models using tf.distribute.Strategy.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized from keras.engine import sequential from keras.layers import core -from keras.optimizers.optimizer_v2 import adagrad -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import adagrad +from keras.optimizers.legacy import gradient_descent from keras.premade_models import linear from keras.premade_models import wide_deep from keras.utils import dataset_creator -import numpy as np -import tensorflow.compat.v2 as tf def strategy_combinations_eager_data_fn(): - return tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.default_strategy, - tf.__internal__.distribute.combinations.one_device_strategy, - tf.__internal__.distribute.combinations.one_device_strategy_gpu, - tf.__internal__.distribute.combinations - .mirrored_strategy_with_gpu_and_cpu, -
tf.__internal__.distribute.combinations - .mirrored_strategy_with_two_gpus, - tf.__internal__.distribute.combinations - .mirrored_strategy_with_two_gpus_no_merge_call, - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x2_gpu, - tf.__internal__.distribute.combinations - .parameter_server_strategy_1worker_2ps_cpu, - tf.__internal__.distribute.combinations - .parameter_server_strategy_1worker_2ps_1gpu, - # NOTE: TPUStrategy not tested because the models in this test are - # sparse and do not work with TPUs. - ], - use_dataset_creator=[True, False], - mode=['eager'], - data_fn=['numpy', 'dataset']) + return tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.default_strategy, + tf.__internal__.distribute.combinations.one_device_strategy, + tf.__internal__.distribute.combinations.one_device_strategy_gpu, + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus_no_merge_call, # noqa: E501 + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, # noqa: E501 + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x2_gpu, # noqa: E501 + tf.__internal__.distribute.combinations.parameter_server_strategy_1worker_2ps_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.parameter_server_strategy_1worker_2ps_1gpu, # noqa: E501 + # NOTE: TPUStrategy not tested because the models in this test are + # sparse and do not work with TPUs. + ], + use_dataset_creator=[True, False], + mode=["eager"], + data_fn=["numpy", "dataset"], + ) INPUT_SIZE = 64 @@ -59,96 +55,116 @@ def strategy_combinations_eager_data_fn(): def get_numpy(): - inputs = np.random.uniform( - low=-5., high=5., size=(INPUT_SIZE, 2)).astype(np.float32) - output = .3 * inputs[:, 0] + .2 * inputs[:, 1] - return inputs, output + inputs = np.random.uniform(low=-5.0, high=5.0, size=(INPUT_SIZE, 2)).astype( + np.float32 + ) + output = 0.3 * inputs[:, 0] + 0.2 * inputs[:, 1] + return inputs, output def get_dataset(input_context=None, batch_size=None): - inputs, output = get_numpy() - dataset = tf.data.Dataset.from_tensor_slices((inputs, output)) - if input_context: - dataset = dataset.shard(input_context.num_input_pipelines, - input_context.input_pipeline_id) - if batch_size is None: - batch_size = BATCH_SIZE + inputs, output = get_numpy() + dataset = tf.data.Dataset.from_tensor_slices((inputs, output)) + if input_context: + dataset = dataset.shard( + input_context.num_input_pipelines, input_context.input_pipeline_id + ) + if batch_size is None: + batch_size = BATCH_SIZE - dataset = dataset.batch(batch_size).repeat(200) - return dataset + dataset = dataset.batch(batch_size).repeat(200) + return dataset # A `dataset_fn` is required for `Model.fit` to work across all strategies. 
def dataset_fn(input_context): - batch_size = input_context.get_per_replica_batch_size( - global_batch_size=BATCH_SIZE) - return get_dataset(input_context, batch_size) + batch_size = input_context.get_per_replica_batch_size( + global_batch_size=BATCH_SIZE + ) + return get_dataset(input_context, batch_size) class KerasPremadeModelsTest(tf.test.TestCase, parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - strategy_combinations_eager_data_fn()) - def test_linear_model(self, distribution, use_dataset_creator, data_fn): - if ((not use_dataset_creator) and isinstance( - distribution, tf.distribute.experimental.ParameterServerStrategy)): - self.skipTest( - 'Parameter Server strategy requires dataset creator to be used in ' - 'model.fit.') - if (not tf.__internal__.tf2.enabled() and use_dataset_creator - and isinstance(distribution, - tf.distribute.experimental.ParameterServerStrategy)): - self.skipTest( - 'Parameter Server strategy with dataset creator needs to be run when ' - 'eager execution is enabled.') - with distribution.scope(): - model = linear.LinearModel() - opt = gradient_descent.SGD(learning_rate=0.1) - model.compile(opt, 'mse') - if use_dataset_creator: - x = dataset_creator.DatasetCreator(dataset_fn) - hist = model.fit(x, epochs=3, steps_per_epoch=INPUT_SIZE) - else: - if data_fn == 'numpy': - inputs, output = get_numpy() - hist = model.fit(inputs, output, epochs=3) - else: - hist = model.fit(get_dataset(), epochs=3) - self.assertLess(hist.history['loss'][2], 0.2) - - @tf.__internal__.distribute.combinations.generate( - strategy_combinations_eager_data_fn()) - def test_wide_deep_model(self, distribution, use_dataset_creator, data_fn): - if ((not use_dataset_creator) and isinstance( - distribution, tf.distribute.experimental.ParameterServerStrategy)): - self.skipTest( - 'Parameter Server strategy requires dataset creator to be used in ' - 'model.fit.') - if (not tf.__internal__.tf2.enabled() and use_dataset_creator - and isinstance(distribution, - tf.distribute.experimental.ParameterServerStrategy)): - self.skipTest( - 'Parameter Server strategy with dataset creator needs to be run when ' - 'eager execution is enabled.') - with distribution.scope(): - linear_model = linear.LinearModel(units=1) - dnn_model = sequential.Sequential([core.Dense(units=1)]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - linear_opt = gradient_descent.SGD(learning_rate=0.05) - dnn_opt = adagrad.Adagrad(learning_rate=0.1) - wide_deep_model.compile(optimizer=[linear_opt, dnn_opt], loss='mse') - - if use_dataset_creator: - x = dataset_creator.DatasetCreator(dataset_fn) - hist = wide_deep_model.fit(x, epochs=3, steps_per_epoch=INPUT_SIZE) - else: - if data_fn == 'numpy': - inputs, output = get_numpy() - hist = wide_deep_model.fit(inputs, output, epochs=3) - else: - hist = wide_deep_model.fit(get_dataset(), epochs=3) - self.assertLess(hist.history['loss'][2], 0.2) - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + @tf.__internal__.distribute.combinations.generate( + strategy_combinations_eager_data_fn() + ) + def test_linear_model(self, distribution, use_dataset_creator, data_fn): + if (not use_dataset_creator) and isinstance( + distribution, tf.distribute.experimental.ParameterServerStrategy + ): + self.skipTest( + "Parameter Server strategy requires dataset creator to be used " + "in model.fit." 
+ ) + if ( + not tf.__internal__.tf2.enabled() + and use_dataset_creator + and isinstance( + distribution, tf.distribute.experimental.ParameterServerStrategy + ) + ): + self.skipTest( + "Parameter Server strategy with dataset creator needs to be " + "run when eager execution is enabled." + ) + with distribution.scope(): + model = linear.LinearModel() + opt = gradient_descent.SGD(learning_rate=0.1) + model.compile(opt, "mse") + if use_dataset_creator: + x = dataset_creator.DatasetCreator(dataset_fn) + hist = model.fit(x, epochs=3, steps_per_epoch=INPUT_SIZE) + else: + if data_fn == "numpy": + inputs, output = get_numpy() + hist = model.fit(inputs, output, epochs=3) + else: + hist = model.fit(get_dataset(), epochs=3) + self.assertLess(hist.history["loss"][2], 0.2) + + @tf.__internal__.distribute.combinations.generate( + strategy_combinations_eager_data_fn() + ) + def test_wide_deep_model(self, distribution, use_dataset_creator, data_fn): + if (not use_dataset_creator) and isinstance( + distribution, tf.distribute.experimental.ParameterServerStrategy + ): + self.skipTest( + "Parameter Server strategy requires dataset creator to be used " + "in model.fit." + ) + if ( + not tf.__internal__.tf2.enabled() + and use_dataset_creator + and isinstance( + distribution, tf.distribute.experimental.ParameterServerStrategy + ) + ): + self.skipTest( + "Parameter Server strategy with dataset creator needs to be " + "run when eager execution is enabled." + ) + with distribution.scope(): + linear_model = linear.LinearModel(units=1) + dnn_model = sequential.Sequential([core.Dense(units=1)]) + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + linear_opt = gradient_descent.SGD(learning_rate=0.05) + dnn_opt = adagrad.Adagrad(learning_rate=0.1) + wide_deep_model.compile(optimizer=[linear_opt, dnn_opt], loss="mse") + + if use_dataset_creator: + x = dataset_creator.DatasetCreator(dataset_fn) + hist = wide_deep_model.fit( + x, epochs=3, steps_per_epoch=INPUT_SIZE + ) + else: + if data_fn == "numpy": + inputs, output = get_numpy() + hist = wide_deep_model.fit(inputs, output, epochs=3) + else: + hist = wide_deep_model.fit(get_dataset(), epochs=3) + self.assertLess(hist.history["loss"][2], 0.2) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/keras_rnn_model_correctness_test.py b/keras/distribute/keras_rnn_model_correctness_test.py index 18c468b7039d..74bf17077d36 100644 --- a/keras/distribute/keras_rnn_model_correctness_test.py +++ b/keras/distribute/keras_rnn_model_correctness_test.py @@ -14,119 +14,147 @@ # ============================================================================== """Correctness tests for tf.keras RNN models using DistributionStrategy.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras -from keras.testing_infra import test_utils from keras.distribute import keras_correctness_test_base from keras.layers.rnn import gru from keras.layers.rnn import gru_v1 from keras.layers.rnn import lstm from keras.layers.rnn import lstm_v1 from keras.mixed_precision import policy -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_keras +from keras.optimizers.legacy import gradient_descent as gradient_descent_keras +from keras.testing_infra import test_utils class _DistributionStrategyRnnModelCorrectnessTest( - keras_correctness_test_base - .TestDistributionStrategyEmbeddingModelCorrectnessBase): - - def _get_layer_class(self): - raise 
NotImplementedError - - def get_model(self, - max_words=10, - initial_weights=None, - distribution=None, - input_shapes=None): - del input_shapes - rnn_cls = self._get_layer_class() - - with keras_correctness_test_base.MaybeDistributionScope(distribution): - word_ids = keras.layers.Input( - shape=(max_words,), dtype=np.int32, name='words') - word_embed = keras.layers.Embedding(input_dim=20, output_dim=10)(word_ids) - rnn_embed = rnn_cls(units=4, return_sequences=False)(word_embed) - - dense_output = keras.layers.Dense(2)(rnn_embed) - preds = keras.layers.Softmax(dtype='float32')(dense_output) - model = keras.Model(inputs=[word_ids], outputs=[preds]) - - if initial_weights: - model.set_weights(initial_weights) - - optimizer_fn = gradient_descent_keras.SGD - - model.compile( - optimizer=optimizer_fn(learning_rate=0.1), - loss='sparse_categorical_crossentropy', - metrics=['sparse_categorical_accuracy']) - return model + keras_correctness_test_base.TestDistributionStrategyEmbeddingModelCorrectnessBase # noqa: E501 +): + def _get_layer_class(self): + raise NotImplementedError + + def get_model( + self, + max_words=10, + initial_weights=None, + distribution=None, + input_shapes=None, + ): + del input_shapes + rnn_cls = self._get_layer_class() + + with keras_correctness_test_base.MaybeDistributionScope(distribution): + word_ids = keras.layers.Input( + shape=(max_words,), dtype=np.int32, name="words" + ) + word_embed = keras.layers.Embedding(input_dim=20, output_dim=10)( + word_ids + ) + rnn_embed = rnn_cls(units=4, return_sequences=False)(word_embed) + + dense_output = keras.layers.Dense(2)(rnn_embed) + preds = keras.layers.Softmax(dtype="float32")(dense_output) + model = keras.Model(inputs=[word_ids], outputs=[preds]) + + if initial_weights: + model.set_weights(initial_weights) + + optimizer_fn = gradient_descent_keras.SGD + + model.compile( + optimizer=optimizer_fn(learning_rate=0.1), + loss="sparse_categorical_crossentropy", + metrics=["sparse_categorical_accuracy"], + ) + return model @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') + "Uses Dense layers, which call matmul" +) class DistributionStrategyGruModelCorrectnessTest( - _DistributionStrategyRnnModelCorrectnessTest): - - def _get_layer_class(self): - if tf.__internal__.tf2.enabled(): - if not tf.executing_eagerly(): - self.skipTest("GRU v2 and legacy graph mode don't work together.") - return gru.GRU - else: - return gru_v1.GRU - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.test_combinations_for_embedding_model() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_gru_model_correctness(self, distribution, use_numpy, - use_validation_data): - self.run_correctness_test(distribution, use_numpy, use_validation_data) + _DistributionStrategyRnnModelCorrectnessTest +): + def _get_layer_class(self): + if tf.__internal__.tf2.enabled(): + if not tf.executing_eagerly(): + self.skipTest( + "GRU v2 and legacy graph mode don't work together." 
+ ) + return gru.GRU + else: + return gru_v1.GRU + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_for_embedding_model() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_gru_model_correctness( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test(distribution, use_numpy, use_validation_data) @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') + "Uses Dense layers, which call matmul" +) class DistributionStrategyLstmModelCorrectnessTest( - _DistributionStrategyRnnModelCorrectnessTest): - - def _get_layer_class(self): - if tf.__internal__.tf2.enabled(): - if not tf.executing_eagerly(): - self.skipTest("LSTM v2 and legacy graph mode don't work together.") - return lstm.LSTM - else: - return lstm_v1.LSTM - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.test_combinations_for_embedding_model() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - def test_lstm_model_correctness(self, distribution, use_numpy, - use_validation_data): - self.run_correctness_test(distribution, use_numpy, use_validation_data) - - @tf.__internal__.distribute.combinations.generate( - keras_correctness_test_base.test_combinations_for_embedding_model() + - keras_correctness_test_base.multi_worker_mirrored_eager()) - @test_utils.enable_v2_dtype_behavior - def test_lstm_model_correctness_mixed_precision(self, distribution, use_numpy, - use_validation_data): - if isinstance(distribution, - (tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - self.skipTest('CentralStorageStrategy is not supported by ' - 'mixed precision.') - if isinstance(distribution, - (tf.distribute.experimental.TPUStrategy, tf.compat.v1.distribute.experimental.TPUStrategy)): - policy_name = 'mixed_bfloat16' - else: - policy_name = 'mixed_float16' - - with policy.policy_scope(policy_name): - self.run_correctness_test(distribution, use_numpy, use_validation_data) - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + _DistributionStrategyRnnModelCorrectnessTest +): + def _get_layer_class(self): + if tf.__internal__.tf2.enabled(): + if not tf.executing_eagerly(): + self.skipTest( + "LSTM v2 and legacy graph mode don't work together." + ) + return lstm.LSTM + else: + return lstm_v1.LSTM + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_for_embedding_model() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + def test_lstm_model_correctness( + self, distribution, use_numpy, use_validation_data + ): + self.run_correctness_test(distribution, use_numpy, use_validation_data) + + @tf.__internal__.distribute.combinations.generate( + keras_correctness_test_base.test_combinations_for_embedding_model() + + keras_correctness_test_base.multi_worker_mirrored_eager() + ) + @test_utils.enable_v2_dtype_behavior + def test_lstm_model_correctness_mixed_precision( + self, distribution, use_numpy, use_validation_data + ): + if isinstance( + distribution, + ( + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + self.skipTest( + "CentralStorageStrategy is not supported by mixed precision." 
+ ) + if isinstance( + distribution, + ( + tf.distribute.experimental.TPUStrategy, + tf.compat.v1.distribute.experimental.TPUStrategy, + ), + ): + policy_name = "mixed_bfloat16" + else: + policy_name = "mixed_float16" + + with policy.policy_scope(policy_name): + self.run_correctness_test( + distribution, use_numpy, use_validation_data + ) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/keras_save_load_test.py b/keras/distribute/keras_save_load_test.py index 7b35bd613cc8..b72be7171d8f 100644 --- a/keras/distribute/keras_save_load_test.py +++ b/keras/distribute/keras_save_load_test.py @@ -14,59 +14,80 @@ # ============================================================================== """Tests for saving and loading using keras save/load APIs with DS.""" +import tensorflow.compat.v2 as tf + from keras.distribute import saved_model_test_base as test_base -from keras.saving import save +from keras.saving.legacy import save from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') + "Uses Dense layers, which call matmul" +) class KerasSaveLoadTest(test_base.TestSavedModelBase): + def setUp(self): + self._root_dir = "keras_save_load" + super().setUp() - def setUp(self): - self._root_dir = 'keras_save_load' - super().setUp() - - def _save_model(self, model, saved_dir): - model.save(saved_dir, save_format='tf') + def _save_model(self, model, saved_dir): + model.save(saved_dir, save_format="tf") - def _load_and_run_model(self, - distribution, - saved_dir, - predict_dataset, - output_name='output_1'): - restored_keras_model = save.load_model(saved_dir) - return restored_keras_model.predict( - predict_dataset, steps=test_base.PREDICT_STEPS) + def _load_and_run_model( + self, distribution, saved_dir, predict_dataset, output_name="output_1" + ): + restored_keras_model = save.load_model(saved_dir) + return restored_keras_model.predict( + predict_dataset, steps=test_base.PREDICT_STEPS + ) - @tf.__internal__.distribute.combinations.generate(test_base.simple_models_with_strategies()) - def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): - self.run_test_save_no_strategy_restore_strategy( - model_and_input, distribution) + @tf.__internal__.distribute.combinations.generate( + test_base.simple_models_with_strategies() + ) + def test_save_no_strategy_restore_strategy( + self, model_and_input, distribution + ): + self.run_test_save_no_strategy_restore_strategy( + model_and_input, distribution + ) - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.simple_models_with_strategies(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): - self.run_test_save_strategy_restore_no_strategy( - model_and_input, distribution, save_in_scope) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.simple_models_with_strategies(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_no_strategy( + self, model_and_input, distribution, save_in_scope + ): + self.run_test_save_strategy_restore_no_strategy( + model_and_input, distribution, save_in_scope + ) - @tf.__internal__.distribute.combinations.generate( - 
tf.__internal__.test.combinations.times(test_base.simple_models_with_strategy_pairs(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_strategy(self, model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope): - self.run_test_save_strategy_restore_strategy(model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.simple_models_with_strategy_pairs(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_strategy( + self, + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ): + self.run_test_save_strategy_restore_strategy( + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ) -if __name__ == '__main__': - tf.compat.v1.enable_eager_execution() - tf.test.main() +if __name__ == "__main__": + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/distribute/keras_stateful_lstm_model_correctness_test.py b/keras/distribute/keras_stateful_lstm_model_correctness_test.py index c0e28d41c70f..631643c645c9 100644 --- a/keras/distribute/keras_stateful_lstm_model_correctness_test.py +++ b/keras/distribute/keras_stateful_lstm_model_correctness_test.py @@ -14,93 +14,103 @@ # ============================================================================== """Tests for stateful tf.keras LSTM models using DistributionStrategy.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.distribute import keras_correctness_test_base -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_keras +from keras.optimizers.legacy import gradient_descent as gradient_descent_keras def strategies_for_stateful_embedding_model(): - """Returns TPUStrategy with single core device assignment.""" + """Returns TPUStrategy with single core device assignment.""" - return [ - tf.__internal__.distribute.combinations.tpu_strategy_one_core, - ] + return [ + tf.__internal__.distribute.combinations.tpu_strategy_one_core, + ] def test_combinations_for_stateful_embedding_model(): - return (tf.__internal__.test.combinations.combine( - distribution=strategies_for_stateful_embedding_model(), - mode='graph', - use_numpy=False, - use_validation_data=False)) + return tf.__internal__.test.combinations.combine( + distribution=strategies_for_stateful_embedding_model(), + mode="graph", + use_numpy=False, + use_validation_data=False, + ) class DistributionStrategyStatefulLstmModelCorrectnessTest( - keras_correctness_test_base - .TestDistributionStrategyEmbeddingModelCorrectnessBase): - - def get_model(self, - max_words=10, - initial_weights=None, - distribution=None, - input_shapes=None): - del input_shapes - batch_size = keras_correctness_test_base._GLOBAL_BATCH_SIZE - - with keras_correctness_test_base.MaybeDistributionScope(distribution): - word_ids = keras.layers.Input( - shape=(max_words,), - batch_size=batch_size, - dtype=np.int32, - name='words') - word_embed = keras.layers.Embedding(input_dim=20, output_dim=10)(word_ids) - lstm_embed = keras.layers.LSTM( - units=4, return_sequences=False, stateful=True)( - word_embed) - - preds = keras.layers.Dense(2, activation='softmax')(lstm_embed) - model = keras.Model(inputs=[word_ids], outputs=[preds]) - - if initial_weights: 
- model.set_weights(initial_weights) - - optimizer_fn = gradient_descent_keras.SGD - - model.compile( - optimizer=optimizer_fn(learning_rate=0.1), - loss='sparse_categorical_crossentropy', - metrics=['sparse_categorical_accuracy']) - return model - - # TODO(jhseu): Disabled to fix b/130808953. Need to investigate why it - # doesn't work and enable for DistributionStrategy more generally. - @tf.__internal__.distribute.combinations.generate(test_combinations_for_stateful_embedding_model()) - def disabled_test_stateful_lstm_model_correctness( - self, distribution, use_numpy, use_validation_data): - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - is_stateful_model=True) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_correctness_test_base - .test_combinations_with_tpu_strategies_graph())) - def test_incorrectly_use_multiple_cores_for_stateful_lstm_model( - self, distribution, use_numpy, use_validation_data): - with self.assertRaisesRegex( - ValueError, 'not yet supported with tf.distribute.Strategy'): - self.run_correctness_test( - distribution, - use_numpy, - use_validation_data, - is_stateful_model=True) - - -if __name__ == '__main__': - tf.test.main() + keras_correctness_test_base.TestDistributionStrategyEmbeddingModelCorrectnessBase # noqa: E501 +): + def get_model( + self, + max_words=10, + initial_weights=None, + distribution=None, + input_shapes=None, + ): + del input_shapes + batch_size = keras_correctness_test_base._GLOBAL_BATCH_SIZE + + with keras_correctness_test_base.MaybeDistributionScope(distribution): + word_ids = keras.layers.Input( + shape=(max_words,), + batch_size=batch_size, + dtype=np.int32, + name="words", + ) + word_embed = keras.layers.Embedding(input_dim=20, output_dim=10)( + word_ids + ) + lstm_embed = keras.layers.LSTM( + units=4, return_sequences=False, stateful=True + )(word_embed) + + preds = keras.layers.Dense(2, activation="softmax")(lstm_embed) + model = keras.Model(inputs=[word_ids], outputs=[preds]) + + if initial_weights: + model.set_weights(initial_weights) + + optimizer_fn = gradient_descent_keras.SGD + + model.compile( + optimizer=optimizer_fn(learning_rate=0.1), + loss="sparse_categorical_crossentropy", + metrics=["sparse_categorical_accuracy"], + ) + return model + + # TODO(jhseu): Disabled to fix b/130808953. Need to investigate why it + # doesn't work and enable for DistributionStrategy more generally. 
+    @tf.__internal__.distribute.combinations.generate(
+        test_combinations_for_stateful_embedding_model()
+    )
+    def disabled_test_stateful_lstm_model_correctness(
+        self, distribution, use_numpy, use_validation_data
+    ):
+        self.run_correctness_test(
+            distribution, use_numpy, use_validation_data, is_stateful_model=True
+        )
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.times(
+            keras_correctness_test_base.test_combinations_with_tpu_strategies_graph()  # noqa: E501
+        )
+    )
+    def test_incorrectly_use_multiple_cores_for_stateful_lstm_model(
+        self, distribution, use_numpy, use_validation_data
+    ):
+        with self.assertRaisesRegex(
+            ValueError, "not yet supported with tf.distribute.Strategy"
+        ):
+            self.run_correctness_test(
+                distribution,
+                use_numpy,
+                use_validation_data,
+                is_stateful_model=True,
+            )
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/distribute/keras_utils_test.py b/keras/distribute/keras_utils_test.py
index d33299f0bd9e..8925801ea4dc 100644
--- a/keras/distribute/keras_utils_test.py
+++ b/keras/distribute/keras_utils_test.py
@@ -12,15 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.keras models with callbacks, checkpointing with dist strategy."""
-
-import tensorflow.compat.v2 as tf
+"""Tests for tf.keras models with callbacks, checkpointing with a
+distribution strategy."""
 
 import collections
 import tempfile
 
-from absl.testing import parameterized
 import numpy as np
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
 
 import keras
 from keras import losses
@@ -30,594 +30,668 @@
 
 
 class Counter(keras.callbacks.Callback):
-  """Counts the number of times each callback method was run.
-
-  Attributes:
-    method_counts: dict. Contains the counts of time each callback method was
-      run.
- """ - - def __init__(self): - self.method_counts = collections.defaultdict(int) - methods_to_count = [ - 'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end', - 'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin', - 'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end', - 'on_test_begin', 'on_test_end', 'on_train_batch_begin', - 'on_train_batch_end', 'on_train_begin', 'on_train_end' - ] - for method_name in methods_to_count: - setattr(self, method_name, - self.wrap_with_counts(method_name, getattr(self, method_name))) - - def wrap_with_counts(self, method_name, method): - - def _call_and_count(*args, **kwargs): - self.method_counts[method_name] += 1 - return method(*args, **kwargs) - - return _call_and_count - - -class TestDistributionStrategyWithCallbacks(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_test_lib.all_strategy_combinations())) - def test_callbacks_in_fit(self, distribution): - with distribution.scope(): - model = keras_test_lib.get_model() - model.compile( - optimizer='sgd', - loss='mse', - metrics=['mae']) - - dataset = keras_test_lib.get_dataset(distribution) - counter = Counter() - - epochs = 2 - steps_per_epoch = 5 - validation_steps = 3 - - model.fit( - dataset, - epochs=epochs, - steps_per_epoch=steps_per_epoch, - verbose=0, - validation_data=dataset, - validation_steps=validation_steps, - callbacks=[counter]) - - if (isinstance(distribution, tf.compat.v1.distribute.experimental.TPUStrategy) and - not tf.executing_eagerly()): - # TPU Strategy can have multi step training, from extended.steps_per_run - # if steps_per_run = 1, then num_batch_call_per_epoch = steps_per_epoch - steps_per_run = distribution.extended.steps_per_run - num_batch_call_per_epoch = steps_per_epoch // steps_per_run - if steps_per_epoch % steps_per_run: - num_batch_call_per_epoch += 1 - else: - num_batch_call_per_epoch = steps_per_epoch - - self.assertDictEqual( - counter.method_counts, { - 'on_batch_begin': epochs * num_batch_call_per_epoch, - 'on_batch_end': epochs * num_batch_call_per_epoch, - 'on_epoch_begin': epochs, - 'on_epoch_end': epochs, - 'on_test_batch_begin': epochs * validation_steps, - 'on_test_batch_end': epochs * validation_steps, - 'on_test_begin': epochs, - 'on_test_end': epochs, - 'on_train_batch_begin': epochs * num_batch_call_per_epoch, - 'on_train_batch_end': epochs * num_batch_call_per_epoch, - 'on_train_begin': 1, - 'on_train_end': 1 - }) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_test_lib.all_strategy_combinations())) - def test_callbacks_in_eval(self, distribution): - with distribution.scope(): - model = keras_test_lib.get_model() - model.compile( - optimizer='sgd', - loss='mse', - metrics=['mae']) - - dataset = keras_test_lib.get_dataset(distribution) - counter = Counter() - - model.evaluate(dataset, steps=5, callbacks=[counter]) - - self.assertDictEqual( - counter.method_counts, { - 'on_test_batch_begin': 5, - 'on_test_batch_end': 5, - 'on_test_begin': 1, - 'on_test_end': 1 - }) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_test_lib.all_strategy_combinations())) - def test_callbacks_in_predict(self, distribution): - with distribution.scope(): - model = keras_test_lib.get_model() - model.compile( - optimizer='sgd', - loss='mse', - metrics=['mae']) - - dataset = keras_test_lib.get_dataset(distribution) - counter 
= Counter() - - model.predict( - keras_test_lib.get_predict_dataset(dataset), - steps=5, - callbacks=[counter]) - - self.assertDictEqual( - counter.method_counts, { - 'on_predict_batch_begin': 5, - 'on_predict_batch_end': 5, - 'on_predict_begin': 1, - 'on_predict_end': 1 - }) - - -class TestDistributionStrategyErrorCases(tf.test.TestCase, parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations. - mirrored_strategy_with_gpu_and_cpu, - ], - mode=['graph'])) - def test_validating_dataset_input_tensors_with_shape_mismatch( - self, distribution): - with self.cached_session(): - @tf.function - def run(): - ctx = tf.distribute.get_replica_context() - if ctx.replica_id_in_sync_group.device.endswith('GPU:0'): - return tf.constant([[1, 2]]) - else: - return tf.constant([[1, 2], [1, 2]]) - - x = distribution.run(run) - - # Removed device and input tensor shape details from the error message - # since the order of the device and the corresponding input tensor shape - # is not deterministic over different runs. - with self.assertRaisesRegex( - ValueError, 'Input tensor shapes do not match for ' - 'distributed tensor inputs ' - 'PerReplica:.+'): + """Counts the number of times each callback method was run. + + Attributes: + method_counts: dict. Contains the counts of time each callback method was + run. + """ + + def __init__(self): + self.method_counts = collections.defaultdict(int) + methods_to_count = [ + "on_batch_begin", + "on_batch_end", + "on_epoch_begin", + "on_epoch_end", + "on_predict_batch_begin", + "on_predict_batch_end", + "on_predict_begin", + "on_predict_end", + "on_test_batch_begin", + "on_test_batch_end", + "on_test_begin", + "on_test_end", + "on_train_batch_begin", + "on_train_batch_end", + "on_train_begin", + "on_train_end", + ] + for method_name in methods_to_count: + setattr( + self, + method_name, + self.wrap_with_counts(method_name, getattr(self, method_name)), + ) + + def wrap_with_counts(self, method_name, method): + def _call_and_count(*args, **kwargs): + self.method_counts[method_name] += 1 + return method(*args, **kwargs) + + return _call_and_count + + +class TestDistributionStrategyWithCallbacks( + tf.test.TestCase, parameterized.TestCase +): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + keras_test_lib.all_strategy_combinations() + ) + ) + def test_callbacks_in_fit(self, distribution): with distribution.scope(): - distributed_training_utils_v1.validate_distributed_dataset_inputs( - distribution, x, None) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations - .mirrored_strategy_with_gpu_and_cpu, - ], - mode=['graph', 'eager'])) - def test_validating_dataset_input_tensors_with_dtype_mismatch( - self, distribution): - with self.cached_session(): - - @tf.function - def run(): - ctx = tf.distribute.get_replica_context() - if ctx.replica_id_in_sync_group.device.endswith('GPU:0'): - return tf.constant([[1, 2]], dtype=tf.int32) - else: - return tf.constant([[1, 2]], dtype=tf.float64) + model = keras_test_lib.get_model() + model.compile(optimizer="sgd", loss="mse", metrics=["mae"]) - x = distribution.run(run) + dataset = keras_test_lib.get_dataset(distribution) + counter = Counter() - # Removed device and input tensor dtype details from the error message - # since the order of the device and the 
corresponding input tensor dtype - # is not deterministic over different runs. - with self.assertRaisesRegex( - ValueError, 'Input tensor dtypes do not match for ' - 'distributed tensor inputs ' - 'PerReplica:.+'): - with distribution.scope(): - distributed_training_utils_v1.validate_distributed_dataset_inputs( - distribution, x, None) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - ], - mode=['graph', 'eager'])) - def test_unsupported_features(self, distribution, mode): - with self.cached_session(): - with distribution.scope(): - model = keras_test_lib.get_model() - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, - loss, - metrics=metrics) - - dataset = keras_test_lib.get_dataset(distribution) - # Test with validation split - with self.assertRaises(ValueError): - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_split=0.5, - validation_steps=2) + epochs = 2 + steps_per_epoch = 5 + validation_steps = 3 - # Test with sample weight. - sample_weight = np.random.random((10,)) - with self.assertRaises(ValueError): model.fit( dataset, - epochs=1, - steps_per_epoch=2, + epochs=epochs, + steps_per_epoch=steps_per_epoch, verbose=0, - sample_weight=sample_weight) - - # Test with not specifying the `steps` argument for dataset with infinite - # cardinality. - dataset = dataset.repeat() - with self.assertRaises(ValueError): - model.fit(dataset, epochs=1, verbose=0) - with self.assertRaises(ValueError): - model.evaluate(dataset, verbose=0) - - with self.assertRaises(ValueError): - model.predict(dataset, verbose=0) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.one_device_strategy, - ], - mode=['graph', 'eager'])) - def test_distribution_strategy_on_subclassed_model( - self, distribution): - with distribution.scope(): - - class _SimpleMLP(keras.Model): - - def __init__(self, num_labels): - super().__init__() - self.dense = keras.layers.Dense(num_labels) - - def call(self, inputs): - return self.dense(inputs) - - model = _SimpleMLP(3) - - if not tf.executing_eagerly(): - with self.assertRaisesRegex( - ValueError, - 'We currently do not support distribution strategy with a ' - '`Sequential` model that is created without `input_shape`/' - '`input_dim` set in its first layer or a subclassed model.'): - model.compile( - 'sgd') - else: - model.compile( - 'sgd') - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.one_device_strategy, - ], - mode=['graph', 'eager'])) - def test_distribution_strategy_on_deferred_sequential_model( - self, distribution): - with distribution.scope(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(16, activation='relu')) - model.add(keras.layers.Dense(3, activation='softmax')) - - if tf.executing_eagerly(): - model.compile( - 'sgd') - else: - with self.assertRaisesRegex( - ValueError, - 'We currently do not support distribution strategy with a ' - '`Sequential` model that is created without ' - '`input_shape`/`input_dim` set in its first 
layer or ' - 'a subclassed model.'): - model.compile( - 'sgd') - - @tf.__internal__.distribute.combinations.generate( - keras_test_lib.all_strategy_combinations_minus_default()) - def test_standalone_loss_without_loss_reduction(self, distribution): - with distribution.scope(): - loss_object = losses.MeanSquaredError() - - with self.assertRaisesRegex( - ValueError, 'Please use `tf.keras.losses.Reduction.SUM` or ' - '`tf.keras.losses.Reduction.NONE`'): - y = np.asarray([1, 0]) - loss_object(y, y) - - -class TestDistributionStrategyWithLossMasking(tf.test.TestCase, - parameterized.TestCase): - - # TODO(priyag): Enable all strategies for this test. Currently it does not - # work for TPU due to some invalid datatype. - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - ], - mode=['graph', 'eager'], - optimizer=optimizer_combinations - .gradient_descent_optimizer_keras_v2_fn - )) - def test_masking(self, distribution, optimizer): - with self.cached_session(): - np.random.seed(1337) - x = np.array([[[1], [1]], [[0], [0]]]) - with distribution.scope(): - model = keras.models.Sequential() - model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1))) - model.add( - keras.layers.TimeDistributed( - keras.layers.Dense(1, kernel_initializer='one'))) - model.compile( - loss='mse', - optimizer=optimizer()) - y = np.array([[[1], [1]], [[1], [1]]]) - dataset = tf.data.Dataset.from_tensor_slices((x, y)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2) - self.assertEqual(hist.history['loss'][0], 0) - - -class TestDistributionStrategyWithNormalizationLayer(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_test_lib.all_strategy_combinations(), - tf.__internal__.test.combinations.combine( - fused=[True, False], - optimizer=optimizer_combinations - .gradient_descent_optimizer_keras_v2_fn))) - def test_batchnorm_correctness(self, distribution, fused, optimizer): - with self.cached_session(): - with distribution.scope(): - model = keras.models.Sequential() - norm = keras.layers.BatchNormalization( - input_shape=( - 10, - 20, - 30, - ), momentum=0.8, fused=fused) - model.add(norm) - model.compile( - loss='mse', - optimizer=optimizer()) - - # centered on 5.0, variance 10.0 - x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10, 20, 30)) - x = x.astype('float32') - dataset = tf.data.Dataset.from_tensor_slices((x, x)) - dataset = dataset.repeat(100) - dataset = keras_test_lib.batch_wrapper(dataset, 32, distribution) - - predict_dataset = tf.data.Dataset.from_tensor_slices(x) - predict_dataset = predict_dataset.repeat(100) - predict_dataset = keras_test_lib.batch_wrapper(predict_dataset, 32, - distribution) - - model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10) - out = model.predict(predict_dataset, steps=2) - out -= keras.backend.eval(norm.beta) - out /= keras.backend.eval(norm.gamma) - np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) - np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) - -# TODO(b/146181571): Enable this for all distribution strategies once -# DistributedVariable.assign() returns a variable for MirroredStrategy. 
- @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_test_lib.tpu_strategy_combinations(), - tf.__internal__.test.combinations.combine( - optimizer=optimizer_combinations - .gradient_descent_optimizer_keras_v2_fn))) - def test_batchnorm_correctness_with_renorm(self, distribution, optimizer): - with self.cached_session(): - with distribution.scope(): - model = keras.models.Sequential() - norm = keras.layers.BatchNormalization( - input_shape=( - 10, - 20, - 30, - ), momentum=0.8, fused=False, renorm=True) - model.add(norm) - model.compile( - loss='mse', - optimizer=optimizer()) - - # centered on 5.0, variance 10.0 - x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10, 20, 30)) - x = x.astype('float32') - dataset = tf.data.Dataset.from_tensor_slices((x, x)) - dataset = dataset.repeat(100) - dataset = keras_test_lib.batch_wrapper(dataset, 32, distribution) - - predict_dataset = tf.data.Dataset.from_tensor_slices(x) - predict_dataset = predict_dataset.repeat(100) - predict_dataset = keras_test_lib.batch_wrapper(predict_dataset, 32, - distribution) - - model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10) - out = model.predict(predict_dataset, steps=2) - out -= keras.backend.eval(norm.beta) - out /= keras.backend.eval(norm.gamma) - np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) - np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) - - -class TestDistributionStrategySaveLoadWeights(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_test_lib.all_strategy_combinations_minus_default(), - tf.__internal__.test.combinations.combine( - optimizer=optimizer_combinations.rmsprop_optimizer_keras_v2_fn))) - def test_save_load_h5(self, distribution, optimizer): - with self.cached_session(): - dataset = keras_test_lib.get_dataset(distribution) - with distribution.scope(): - model = keras_test_lib.get_model() - model.compile( - optimizer(), - 'mse') - model.fit(dataset, epochs=1, steps_per_epoch=1) - - weights_file = tempfile.mktemp('.h5') - model.save_weights(weights_file) - - model_2 = keras_test_lib.get_model() - model_2.compile( - optimizer(), - 'mse') - model_2.load_weights(weights_file) - model_2.predict( - keras_test_lib.get_predict_dataset(distribution), steps=2) - model_2.fit(dataset, epochs=1, steps_per_epoch=1) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - keras_test_lib.all_strategy_combinations_minus_default(), - tf.__internal__.test.combinations.combine( - optimizer=optimizer_combinations.rmsprop_optimizer_keras_v2_fn))) - def test_save_load_trackable(self, distribution, optimizer): - # TODO(b/123533246): Enable the test for TPU once bug is fixed - if (isinstance(distribution, - (tf.distribute.experimental.TPUStrategy, tf.compat.v1.distribute.experimental.TPUStrategy)) and - distribution.extended.steps_per_run > 1): - self.skipTest('MultiStep TPU Strategy deadlocks with optimizer restore.') - with self.cached_session(): - dataset = keras_test_lib.get_dataset(distribution) - with distribution.scope(): - model = keras_test_lib.get_model() - model.compile( - optimizer(), - 'mse') - model.fit(dataset, epochs=1, steps_per_epoch=1) - - weights_file = tempfile.mktemp() - model.save_weights(weights_file) - - model_2 = keras_test_lib.get_model() - model_2.compile( - optimizer(), - 'mse') - model_2.load_weights(weights_file) - model_2.predict( - 
keras_test_lib.get_predict_dataset(distribution), steps=2)
-      model_2.fit(dataset, epochs=1, steps_per_epoch=1)
-
-
-class TestDistributionStrategyValidation(tf.test.TestCase, parameterized.TestCase):
-
-  @tf.__internal__.distribute.combinations.generate(
-      tf.__internal__.test.combinations.times(
-          keras_test_lib.all_strategy_combinations_minus_default()))
-  def test_layer_outside_scope(self, distribution):
-    with self.cached_session():
-      with self.assertRaisesRegex(
-          ValueError, 'was not created in the distribution strategy'):
-        x = keras.layers.Input(shape=(3,), name='input')
-        y = keras.layers.Dense(4, name='dense')(x)
+    """Counts the number of times each callback method was run.
+
+    Attributes:
+        method_counts: dict. Contains the counts of times each callback
+            method was run.
+    """
+
+    def __init__(self):
+        self.method_counts = collections.defaultdict(int)
+        methods_to_count = [
+            "on_batch_begin",
+            "on_batch_end",
+            "on_epoch_begin",
+            "on_epoch_end",
+            "on_predict_batch_begin",
+            "on_predict_batch_end",
+            "on_predict_begin",
+            "on_predict_end",
+            "on_test_batch_begin",
+            "on_test_batch_end",
+            "on_test_begin",
+            "on_test_end",
+            "on_train_batch_begin",
+            "on_train_batch_end",
+            "on_train_begin",
+            "on_train_end",
+        ]
+        for method_name in methods_to_count:
+            setattr(
+                self,
+                method_name,
+                self.wrap_with_counts(method_name, getattr(self, method_name)),
+            )
+
+    def wrap_with_counts(self, method_name, method):
+        def _call_and_count(*args, **kwargs):
+            self.method_counts[method_name] += 1
+            return method(*args, **kwargs)
+
+        return _call_and_count
+
+
+class TestDistributionStrategyWithCallbacks(
+    tf.test.TestCase, parameterized.TestCase
+):
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.times(
+            keras_test_lib.all_strategy_combinations()
+        )
+    )
+    def test_callbacks_in_fit(self, distribution):
         with distribution.scope():
-      model = keras.Model(x, y)
-      optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      model.compile(
-          optimizer,
-          loss,
-          metrics=metrics)
+            model = keras_test_lib.get_model()
+            model.compile(optimizer="sgd", loss="mse", metrics=["mae"])
+
+        dataset = keras_test_lib.get_dataset(distribution)
+        counter = Counter()
+
+        epochs = 2
+        steps_per_epoch = 5
+        validation_steps = 3
+
+        model.fit(
+            dataset,
+            epochs=epochs,
+            steps_per_epoch=steps_per_epoch,
+            verbose=0,
+            validation_data=dataset,
+            validation_steps=validation_steps,
+            callbacks=[counter],
+        )
+
+        if (
+            isinstance(
+                distribution, tf.compat.v1.distribute.experimental.TPUStrategy
+            )
+            and not tf.executing_eagerly()
+        ):
+            # TPU Strategy can have multi-step training from
+            # extended.steps_per_run. If steps_per_run = 1, then
+            # num_batch_call_per_epoch = steps_per_epoch.
+            steps_per_run = distribution.extended.steps_per_run
+            num_batch_call_per_epoch = steps_per_epoch // steps_per_run
+            if steps_per_epoch % steps_per_run:
+                num_batch_call_per_epoch += 1
+        else:
+            num_batch_call_per_epoch = steps_per_epoch
+
+        self.assertDictEqual(
+            counter.method_counts,
+            {
+                "on_batch_begin": epochs * num_batch_call_per_epoch,
+                "on_batch_end": epochs * num_batch_call_per_epoch,
+                "on_epoch_begin": epochs,
+                "on_epoch_end": epochs,
+                "on_test_batch_begin": epochs * validation_steps,
+                "on_test_batch_end": epochs * validation_steps,
+                "on_test_begin": epochs,
+                "on_test_end": epochs,
+                "on_train_batch_begin": epochs * num_batch_call_per_epoch,
+                "on_train_batch_end": epochs * num_batch_call_per_epoch,
+                "on_train_begin": 1,
+                "on_train_end": 1,
+            },
+        )
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.times(
+            keras_test_lib.all_strategy_combinations()
+        )
+    )
+    def test_callbacks_in_eval(self, distribution):
+        with distribution.scope():
+            model = keras_test_lib.get_model()
+            model.compile(optimizer="sgd", loss="mse", metrics=["mae"])
+
+        dataset = keras_test_lib.get_dataset(distribution)
+        counter = Counter()
+
+        model.evaluate(dataset, steps=5, callbacks=[counter])
+
+        self.assertDictEqual(
+            counter.method_counts,
+            {
+                "on_test_batch_begin": 5,
+                "on_test_batch_end": 5,
+                "on_test_begin": 1,
+                "on_test_end": 1,
+            },
+        )
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.times(
+            keras_test_lib.all_strategy_combinations()
+        )
+    )
+    def test_callbacks_in_predict(self, distribution):
+        with distribution.scope():
+            model = keras_test_lib.get_model()
+            model.compile(optimizer="sgd", loss="mse", metrics=["mae"])
+
+        dataset = keras_test_lib.get_dataset(distribution)
+        counter = Counter()
+
+        model.predict(
+            keras_test_lib.get_predict_dataset(dataset),
+            steps=5,
+            callbacks=[counter],
+        )
+
+        self.assertDictEqual(
+            counter.method_counts,
+            {
+                "on_predict_batch_begin": 5,
+                "on_predict_batch_end": 5,
+                "on_predict_begin": 1,
+                "on_predict_end": 1,
+            },
+        )
+
+
+class TestDistributionStrategyErrorCases(
+    tf.test.TestCase, parameterized.TestCase
+):
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            distribution=[
+                tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,  # noqa: E501
+            ],
+            mode=["graph"],
+        )
+    )
+    def test_validating_dataset_input_tensors_with_shape_mismatch(
+        self, distribution
+    ):
+        with self.cached_session():
+
+
@tf.function + def run(): + ctx = tf.distribute.get_replica_context() + if ctx.replica_id_in_sync_group.device.endswith("GPU:0"): + return tf.constant([[1, 2]]) + else: + return tf.constant([[1, 2], [1, 2]]) + + x = distribution.run(run) + + # Removed device and input tensor shape details from the error + # message since the order of the device and the corresponding input + # tensor shape is not deterministic over different runs. + with self.assertRaisesRegex( + ValueError, + "Input tensor shapes do not match for " + "distributed tensor inputs " + "PerReplica:.+", + ): + with distribution.scope(): + distributed_training_utils_v1.validate_distributed_dataset_inputs( # noqa: E501 + distribution, x, None + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + ], + mode=["graph", "eager"], + ) + ) + def test_validating_dataset_input_tensors_with_dtype_mismatch( + self, distribution + ): + with self.cached_session(): + + @tf.function + def run(): + ctx = tf.distribute.get_replica_context() + if ctx.replica_id_in_sync_group.device.endswith("GPU:0"): + return tf.constant([[1, 2]], dtype=tf.int32) + else: + return tf.constant([[1, 2]], dtype=tf.float64) + + x = distribution.run(run) + + # Removed device and input tensor dtype details from the error + # message since the order of the device and the corresponding input + # tensor dtype is not deterministic over different runs. + with self.assertRaisesRegex( + ValueError, + "Input tensor dtypes do not match for " + "distributed tensor inputs " + "PerReplica:.+", + ): + with distribution.scope(): + distributed_training_utils_v1.validate_distributed_dataset_inputs( # noqa: E501 + distribution, x, None + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + ], + mode=["graph", "eager"], + ) + ) + def test_unsupported_features(self, distribution, mode): + with self.cached_session(): + with distribution.scope(): + model = keras_test_lib.get_model() + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) + loss = "mse" + metrics = ["mae"] + model.compile(optimizer, loss, metrics=metrics) + + dataset = keras_test_lib.get_dataset(distribution) + # Test with validation split + with self.assertRaises(ValueError): + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_split=0.5, + validation_steps=2, + ) + + # Test with sample weight. + sample_weight = np.random.random((10,)) + with self.assertRaises(ValueError): + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + sample_weight=sample_weight, + ) + + # Test with not specifying the `steps` argument for dataset with + # infinite cardinality. 
+ dataset = dataset.repeat() + with self.assertRaises(ValueError): + model.fit(dataset, epochs=1, verbose=0) + with self.assertRaises(ValueError): + model.evaluate(dataset, verbose=0) + + with self.assertRaises(ValueError): + model.predict(dataset, verbose=0) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.one_device_strategy, + ], + mode=["graph", "eager"], + ) + ) + def test_distribution_strategy_on_subclassed_model(self, distribution): + with distribution.scope(): + + class _SimpleMLP(keras.Model): + def __init__(self, num_labels): + super().__init__() + self.dense = keras.layers.Dense(num_labels) + + def call(self, inputs): + return self.dense(inputs) + + model = _SimpleMLP(3) + + if not tf.executing_eagerly(): + with self.assertRaisesRegex( + ValueError, + "We currently do not support distribution strategy with a " + "`Sequential` model that is created without `input_shape`/" + "`input_dim` set in its first layer or a subclassed model.", + ): + model.compile("sgd") + else: + model.compile("sgd") + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.one_device_strategy, + ], + mode=["graph", "eager"], + ) + ) + def test_distribution_strategy_on_deferred_sequential_model( + self, distribution + ): with distribution.scope(): - model = keras.Model(x, y) - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics) - - @tf.__internal__.distribute.combinations.generate( - keras_test_lib.all_strategy_combinations_minus_default()) - def test_model_outside_scope(self, distribution): - with self.cached_session(): - with self.assertRaisesRegex( - ValueError, 'was not created in the distribution strategy'): - x = keras.layers.Input(shape=(3,), name='input') - y = keras.layers.Dense(4, name='dense')(x) - model = keras.Model(x, y) + model = keras.models.Sequential() + model.add(keras.layers.Dense(16, activation="relu")) + model.add(keras.layers.Dense(3, activation="softmax")) + + if tf.executing_eagerly(): + model.compile("sgd") + else: + with self.assertRaisesRegex( + ValueError, + "We currently do not support distribution strategy with a " + "`Sequential` model that is created without " + "`input_shape`/`input_dim` set in its first layer or " + "a subclassed model.", + ): + model.compile("sgd") + + @tf.__internal__.distribute.combinations.generate( + keras_test_lib.all_strategy_combinations_minus_default() + ) + def test_standalone_loss_without_loss_reduction(self, distribution): with distribution.scope(): - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.001) - loss = 'mse' - metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics) - - -class TestDistributionStrategyWithStaticShapes(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - ], - mode=['graph', 'eager'])) - def test_input_batch_size_not_divisible_by_num_replicas(self, 
distribution): - with distribution.scope(): - with self.assertRaisesRegex( - ValueError, r'The `batch_size` argument \(5\) must be divisible by ' - r'the number of replicas \(2\)'): - keras.layers.Input(shape=(3,), batch_size=5, name='input') - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - ], - mode=['graph', 'eager'])) - def test_static_input_batch_size(self, distribution): - inputs = np.zeros((10, 3), dtype=np.float32) - targets = np.zeros((10, 4), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10, drop_remainder=True) - - with distribution.scope(): - x = keras.layers.Input(shape=(3,), batch_size=10, name='input') - y = keras.layers.Dense(4, name='dense')(x) - model = keras.Model(x, y) - model.compile(optimizer='sgd', loss='mse', metrics=['mae']) - - model.fit(dataset, epochs=1, steps_per_epoch=5) - model.evaluate(dataset, steps=5) - model.predict(dataset) - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + loss_object = losses.MeanSquaredError() + + with self.assertRaisesRegex( + ValueError, + "Please use `tf.keras.losses.Reduction.SUM` or " + "`tf.keras.losses.Reduction.NONE`", + ): + y = np.asarray([1, 0]) + loss_object(y, y) + + +class TestDistributionStrategyWithLossMasking( + tf.test.TestCase, parameterized.TestCase +): + + # TODO(priyag): Enable all strategies for this test. Currently it does not + # work for TPU due to some invalid datatype. + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + ], + mode=["graph", "eager"], + optimizer=optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, # noqa: E501 + ) + ) + def test_masking(self, distribution, optimizer): + with self.cached_session(): + np.random.seed(1337) + x = np.array([[[1], [1]], [[0], [0]]]) + with distribution.scope(): + model = keras.models.Sequential() + model.add( + keras.layers.Masking(mask_value=0, input_shape=(2, 1)) + ) + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(1, kernel_initializer="one") + ) + ) + model.compile(loss="mse", optimizer=optimizer()) + y = np.array([[[1], [1]], [[1], [1]]]) + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2) + self.assertEqual(hist.history["loss"][0], 0) + + +class TestDistributionStrategyWithNormalizationLayer( + tf.test.TestCase, parameterized.TestCase +): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + keras_test_lib.all_strategy_combinations(), + tf.__internal__.test.combinations.combine( + fused=[True, False], + optimizer=optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, # noqa: E501 + ), + ) + ) + def test_batchnorm_correctness(self, distribution, fused, optimizer): + with self.cached_session(): + with distribution.scope(): + model = keras.models.Sequential() + norm = keras.layers.BatchNormalization( + input_shape=( + 10, + 20, + 30, + ), + momentum=0.8, + fused=fused, + ) + model.add(norm) + model.compile(loss="mse", optimizer=optimizer()) + + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, 
size=(1000, 10, 20, 30)) + x = x.astype("float32") + dataset = tf.data.Dataset.from_tensor_slices((x, x)) + dataset = dataset.repeat(100) + dataset = keras_test_lib.batch_wrapper(dataset, 32, distribution) + + predict_dataset = tf.data.Dataset.from_tensor_slices(x) + predict_dataset = predict_dataset.repeat(100) + predict_dataset = keras_test_lib.batch_wrapper( + predict_dataset, 32, distribution + ) + + model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10) + out = model.predict(predict_dataset, steps=2) + out -= keras.backend.eval(norm.beta) + out /= keras.backend.eval(norm.gamma) + np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) + np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) + + # TODO(b/146181571): Enable this for all distribution strategies once + # DistributedVariable.assign() returns a variable for MirroredStrategy. + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + keras_test_lib.tpu_strategy_combinations(), + tf.__internal__.test.combinations.combine( + optimizer=optimizer_combinations.gradient_descent_optimizer_keras_v2_fn # noqa: E501 + ), + ) + ) + def test_batchnorm_correctness_with_renorm(self, distribution, optimizer): + with self.cached_session(): + with distribution.scope(): + model = keras.models.Sequential() + norm = keras.layers.BatchNormalization( + input_shape=( + 10, + 20, + 30, + ), + momentum=0.8, + fused=False, + renorm=True, + ) + model.add(norm) + model.compile(loss="mse", optimizer=optimizer()) + + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10, 20, 30)) + x = x.astype("float32") + dataset = tf.data.Dataset.from_tensor_slices((x, x)) + dataset = dataset.repeat(100) + dataset = keras_test_lib.batch_wrapper(dataset, 32, distribution) + + predict_dataset = tf.data.Dataset.from_tensor_slices(x) + predict_dataset = predict_dataset.repeat(100) + predict_dataset = keras_test_lib.batch_wrapper( + predict_dataset, 32, distribution + ) + + model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10) + out = model.predict(predict_dataset, steps=2) + out -= keras.backend.eval(norm.beta) + out /= keras.backend.eval(norm.gamma) + np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) + np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) + + +class TestDistributionStrategySaveLoadWeights( + tf.test.TestCase, parameterized.TestCase +): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + keras_test_lib.all_strategy_combinations_minus_default(), + tf.__internal__.test.combinations.combine( + optimizer=optimizer_combinations.rmsprop_optimizer_keras_v2_fn + ), + ) + ) + def test_save_load_h5(self, distribution, optimizer): + with self.cached_session(): + dataset = keras_test_lib.get_dataset(distribution) + with distribution.scope(): + model = keras_test_lib.get_model() + model.compile(optimizer(), "mse") + model.fit(dataset, epochs=1, steps_per_epoch=1) + + weights_file = tempfile.mktemp(".h5") + model.save_weights(weights_file) + + model_2 = keras_test_lib.get_model() + model_2.compile(optimizer(), "mse") + model_2.load_weights(weights_file) + model_2.predict( + keras_test_lib.get_predict_dataset(distribution), steps=2 + ) + model_2.fit(dataset, epochs=1, steps_per_epoch=1) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + keras_test_lib.all_strategy_combinations_minus_default(), + tf.__internal__.test.combinations.combine( + 
optimizer=optimizer_combinations.rmsprop_optimizer_keras_v2_fn
+            ),
+        )
+    )
+    def test_save_load_trackable(self, distribution, optimizer):
+        # TODO(b/123533246): Enable the test for TPU once bug is fixed
+        if (
+            isinstance(
+                distribution,
+                (
+                    tf.distribute.experimental.TPUStrategy,
+                    tf.compat.v1.distribute.experimental.TPUStrategy,
+                ),
+            )
+            and distribution.extended.steps_per_run > 1
+        ):
+            self.skipTest(
+                "MultiStep TPU Strategy deadlocks with optimizer restore."
+            )
+        with self.cached_session():
+            dataset = keras_test_lib.get_dataset(distribution)
+            with distribution.scope():
+                model = keras_test_lib.get_model()
+                model.compile(optimizer(), "mse")
+                model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+                weights_file = tempfile.mktemp()
+                model.save_weights(weights_file)
+
+                model_2 = keras_test_lib.get_model()
+                model_2.compile(optimizer(), "mse")
+                model_2.load_weights(weights_file)
+                model_2.predict(
+                    keras_test_lib.get_predict_dataset(distribution), steps=2
+                )
+                model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+
+class TestDistributionStrategyValidation(
+    tf.test.TestCase, parameterized.TestCase
+):
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.times(
+            keras_test_lib.all_strategy_combinations_minus_default()
+        )
+    )
+    def test_layer_outside_scope(self, distribution):
+        with self.cached_session():
+            with self.assertRaisesRegex(
+                ValueError, "was not created in the distribution strategy"
+            ):
+                x = keras.layers.Input(shape=(3,), name="input")
+                y = keras.layers.Dense(4, name="dense")(x)
+                with distribution.scope():
+                    model = keras.Model(x, y)
+                    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+                        0.001
+                    )
+                    loss = "mse"
+                    metrics = ["mae", keras.metrics.CategoricalAccuracy()]
+                    model.compile(optimizer, loss, metrics=metrics)
+
+    @tf.__internal__.distribute.combinations.generate(
+        keras_test_lib.all_strategy_combinations_minus_default()
+    )
+    def test_model_outside_scope(self, distribution):
+        with self.cached_session():
+            with self.assertRaisesRegex(
+                ValueError, "was not created in the distribution strategy"
+            ):
+                x = keras.layers.Input(shape=(3,), name="input")
+                y = keras.layers.Dense(4, name="dense")(x)
+                model = keras.Model(x, y)
+                with distribution.scope():
+                    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
+                        0.001
+                    )
+                    loss = "mse"
+                    metrics = ["mae", keras.metrics.CategoricalAccuracy()]
+                    model.compile(optimizer, loss, metrics=metrics)
+
+
+class TestDistributionStrategyWithStaticShapes(
+    tf.test.TestCase, parameterized.TestCase
+):
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            distribution=[
+                tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,  # noqa: E501
+            ],
+            mode=["graph", "eager"],
+        )
+    )
+    def test_input_batch_size_not_divisible_by_num_replicas(
+        self, distribution
+    ):
+        with distribution.scope():
+            with self.assertRaisesRegex(
+                ValueError,
+                r"The `batch_size` argument \(5\) must be divisible by "
+                r"the number of replicas \(2\)",
+            ):
+                keras.layers.Input(shape=(3,), batch_size=5, name="input")
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            distribution=[
+                tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,  # noqa: E501
+            ],
+            mode=["graph", "eager"],
+        )
+    )
+    def test_static_input_batch_size(self, distribution):
+        inputs = np.zeros((10, 3), dtype=np.float32)
+        targets = np.zeros((10, 4), dtype=np.float32)
+        dataset =
tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10, drop_remainder=True) + + with distribution.scope(): + x = keras.layers.Input(shape=(3,), batch_size=10, name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + model.compile(optimizer="sgd", loss="mse", metrics=["mae"]) + + model.fit(dataset, epochs=1, steps_per_epoch=5) + model.evaluate(dataset, steps=5) + model.predict(dataset) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/minimize_loss_test.py b/keras/distribute/minimize_loss_test.py index 414fe8ae4d59..14168b003fdc 100644 --- a/keras/distribute/minimize_loss_test.py +++ b/keras/distribute/minimize_loss_test.py @@ -15,522 +15,685 @@ """Tests for running legacy optimizer code with DistributionStrategy.""" +import numpy +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras.distribute import optimizer_combinations from keras.distribute.test_example import batchnorm_example from keras.distribute.test_example import minimize_loss_example from keras.layers import core -from keras.optimizers.optimizer_v2 import optimizer_v2 -import numpy -import tensorflow.compat.v2 as tf - +from keras.optimizers.legacy import optimizer_v2 VAR_MAP_V1 = { "GradientDescent": ("dense/kernel", "dense/bias"), - "Adagrad": ("dense/kernel/Adagrad", "dense/kernel", "dense/bias/Adagrad", - "dense/bias"), - "Ftrl": ("dense/kernel/Ftrl", "dense/kernel", "dense/bias/Ftrl", - "dense/bias", "dense/kernel/Ftrl_1", "dense/bias/Ftrl_1"), - "RMSProp": ("dense/kernel", "dense/bias/RMSProp", "dense/bias/RMSProp_1", - "dense/bias", "dense/kernel/RMSProp_1", "dense/kernel/RMSProp") + "Adagrad": ( + "dense/kernel/Adagrad", + "dense/kernel", + "dense/bias/Adagrad", + "dense/bias", + ), + "Ftrl": ( + "dense/kernel/Ftrl", + "dense/kernel", + "dense/bias/Ftrl", + "dense/bias", + "dense/kernel/Ftrl_1", + "dense/bias/Ftrl_1", + ), + "RMSProp": ( + "dense/kernel", + "dense/bias/RMSProp", + "dense/bias/RMSProp_1", + "dense/bias", + "dense/kernel/RMSProp_1", + "dense/kernel/RMSProp", + ), } VAR_MAP_V2 = { - "SGD": ("dense/bias", "SGD/learning_rate", "SGD/decay", "SGD/iter", - "dense/kernel", "SGD/momentum"), - "Adagrad": - ("Adagrad/iter", "dense/bias", "dense/kernel", "Adagrad/learning_rate", - "Adagrad/decay", "Adagrad/dense/kernel/accumulator", - "Adagrad/dense/bias/accumulator") + "SGD": ( + "dense/bias", + "SGD/learning_rate", + "SGD/decay", + "SGD/iter", + "dense/kernel", + "SGD/momentum", + ), + "Adagrad": ( + "Adagrad/iter", + "dense/bias", + "dense/kernel", + "Adagrad/learning_rate", + "Adagrad/decay", + "Adagrad/dense/kernel/accumulator", + "Adagrad/dense/bias/accumulator", + ), } class MinimizeLossStepTest(tf.test.TestCase, parameterized.TestCase): - - def _get_iterator(self, strategy, input_fn): - iterator = strategy.make_input_fn_iterator(lambda _: input_fn()) - self.evaluate(iterator.initializer) - return iterator - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - optimizer_combinations.distributions_and_v1_optimizers(), - tf.__internal__.test.combinations.combine(mode=["graph"], use_callable_loss=[True, False]) - + tf.__internal__.test.combinations.combine(mode=["eager"], use_callable_loss=[True])) + - tf.__internal__.test.combinations.times( - optimizer_combinations.distributions_and_v2_optimizers(), - tf.__internal__.test.combinations.combine( - mode=["graph", "eager"], 
use_callable_loss=[True])) + - tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.tpu_strategy], - optimizer_fn=optimizer_combinations.optimizers_v2, - mode=["graph"], - use_callable_loss=[True]) + tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.tpu_strategy], - optimizer_fn=optimizer_combinations.optimizers_v1, - mode=["graph"], - use_callable_loss=[True, False])) - def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss): - with distribution.scope(): - optimizer = optimizer_fn() - model_fn, dataset_fn, layer = minimize_loss_example( - optimizer, use_bias=True, use_callable_loss=use_callable_loss) - - def step_fn(ctx, inputs): - del ctx # Unused - return distribution.group( - distribution.extended.call_for_each_replica( - model_fn, args=(inputs,))) - - iterator = self._get_iterator(distribution, dataset_fn) - - def run_step(): - return distribution.extended.experimental_run_steps_on_iterator( - step_fn, iterator, iterations=2).run_op - - if not tf.executing_eagerly(): - with self.cached_session() as sess: - run_step = sess.make_callable(run_step()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - weights, biases = [], [] - for _ in range(5): - run_step() - weights.append(self.evaluate(layer.kernel)) - biases.append(self.evaluate(layer.bias)) - - error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1) - is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) - self.assertTrue(is_not_increasing) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - optimizer_combinations.distributions_and_v1_optimizers(), - tf.__internal__.test.combinations.combine(mode=["graph"], use_callable_loss=[True, False]) - + tf.__internal__.test.combinations.combine(mode=["eager"], use_callable_loss=[True])) + - tf.__internal__.test.combinations.times( - optimizer_combinations.distributions_and_v2_optimizers(), - tf.__internal__.test.combinations.combine( - mode=["graph", "eager"], use_callable_loss=[True]))) - def testTrainNetworkByCallForEachReplica(self, distribution, optimizer_fn, - use_callable_loss): - with distribution.scope(): - optimizer = optimizer_fn() - model_fn, dataset_fn, layer = minimize_loss_example( - optimizer, use_bias=True, use_callable_loss=use_callable_loss) - - iterator = self._get_iterator(distribution, dataset_fn) - - def run_step(): - return distribution.group( - distribution.extended.call_for_each_replica( - model_fn, args=(iterator.get_next(),))) - - if not tf.executing_eagerly(): - with self.cached_session() as sess: - run_step = sess.make_callable(run_step()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - weights, biases = [], [] - for _ in range(10): - run_step() - - weights.append(self.evaluate(layer.kernel)) - biases.append(self.evaluate(layer.bias)) - - error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1) - is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) - self.assertTrue(is_not_increasing) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - optimizer_combinations.distributions_and_v1_and_v2_optimizers(), - tf.__internal__.test.combinations.combine(mode=["graph", "eager"])) + tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.tpu_strategy], - optimizer_fn=optimizer_combinations.optimizers_v1_and_v2, - mode=["graph"])) - def 
testOptimizerInsideModelFn(self, distribution, optimizer_fn): - if (not tf.executing_eagerly() and - tf.compat.v1.control_flow_v2_enabled()): - self.skipTest("b/138751864") - created_variables = [] - trainable_variables = [] - - def appending_creator(next_creator, **kwargs): - v = next_creator(**kwargs) - # Skip the StateVar created in the tf.random.Generator, which is used by - # keras initializers. - if "StateVar" in v.name: - return v - created_variables.append(v.name) - if "trainable" in kwargs and kwargs["trainable"]: - trainable_variables.append(v.name) - return v - - # Creator scope needs to be set before it's used inside - # `distribution.scope`. - with tf.variable_creator_scope( - appending_creator), distribution.scope(): - optimizer = optimizer_fn() - model_fn, dataset_fn, _ = minimize_loss_example( - optimizer, use_bias=True, use_callable_loss=True) - - def step_fn(ctx, inputs): - del ctx # Unused - return distribution.group( - distribution.extended.call_for_each_replica( - model_fn, args=(inputs,))) - - iterator = self._get_iterator(distribution, dataset_fn) - - def run_step(): - return distribution.extended.experimental_run_steps_on_iterator( - step_fn, iterator, iterations=1).run_op - - if not tf.executing_eagerly(): - with self.cached_session() as sess: - run_step = sess.make_callable(run_step()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - run_step() - - def get_expected_variables(num_parameter_devices): - name = optimizer._name - - if isinstance(optimizer, optimizer_v2.OptimizerV2): - variables = VAR_MAP_V2[name] - else: - variables = VAR_MAP_V1[name] - - extended_variables = [ - v + "/replica_{}".format(replica) - for v in variables - for replica in range(1, num_parameter_devices) - ] - variables = list(variables) + extended_variables - return set(v + ":0" for v in variables) - - self.assertEqual( - get_expected_variables(len(distribution.extended.parameter_devices)), - set(created_variables)) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(momentum=[0.8, 0.9, 0.99], renorm=[False, True]), - tf.__internal__.test.combinations.times( - optimizer_combinations.distributions_and_v1_and_v2_optimizers(), - tf.__internal__.test.combinations.combine( - mode=["graph", "eager"], - # TODO(isaprykin): Allow False here. Currently subsequent - # replicas will re-execute UPDATE_OPS of previous replicas. 
- update_ops_in_cross_replica_mode=[True])) + - tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.tpu_strategy], - optimizer_fn=optimizer_combinations.optimizers_v1_and_v2, - mode=["graph"], - update_ops_in_cross_replica_mode=[False]))) - def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum, - renorm, update_ops_in_cross_replica_mode): - """Verifies that moving mean updates are reduced across replicas.""" - with distribution.scope(): - num_replicas = distribution.num_replicas_in_sync - model_fn, dataset_fn, batchnorm = batchnorm_example( - optimizer_fn, - batch_per_epoch=num_replicas, - momentum=momentum, - renorm=renorm, - update_ops_in_replica_mode=not update_ops_in_cross_replica_mode) - - def step_fn(ctx, inputs): - del ctx # Unused - fetches = distribution.experimental_local_results( - distribution.extended.call_for_each_replica( - model_fn, args=(inputs,))) - if update_ops_in_cross_replica_mode: - fetches += tuple(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)) - return tf.group(fetches) - - iterator = self._get_iterator(distribution, dataset_fn) - - def run_step(): - return distribution.extended.experimental_run_steps_on_iterator( - step_fn, iterator, iterations=1).run_op - - if not tf.executing_eagerly(): - with self.cached_session() as sess: - run_step = sess.make_callable(run_step()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - expected_moving_means = [0.] * 8 - - def averaged_batch_mean(i): - # Each batch has shape [16, 8] where the ith element in jth list is - # (8 * j + i + replica_id * 100). So the batch mean in each replica is - # (60 + i + replica_id * 100). So here comes its batch mean over all - # replicas: - return 60. + i + (num_replicas - 1.) / 2. * 100. - - for _ in range(10): - run_step() - moving_means = self.evaluate(batchnorm.moving_mean) - - # We make sure that the moving_mean is updated as if the sample mean is - # calculated over all replicas. 
- for i, expected_moving_mean in enumerate(expected_moving_means): - expected_moving_means[i] -= (( - expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum)) - self.assertNear(expected_moving_means[i], moving_means[i], 0.0001) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(loss_reduction=[ - tf.compat.v1.losses.Reduction.SUM, tf.compat.v1.losses.Reduction.MEAN, - tf.compat.v1.losses.Reduction.SUM_OVER_BATCH_SIZE, - tf.compat.v1.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS - ]), - tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(distribution=[ - tf.__internal__.distribute.combinations.one_device_strategy, - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, - tf.__internal__.distribute.combinations - .mirrored_strategy_with_two_gpus_no_merge_call, - ]), - tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(optimizer_fn=optimizer_combinations - .gradient_descent_optimizer_v1_fn), - tf.__internal__.test.combinations.combine( - mode=["graph"], use_callable_loss=[True, False]) + - tf.__internal__.test.combinations.combine( - mode=["eager"], use_callable_loss=[True])) + - tf.__internal__.test.combinations.times( - tf.__internal__.test.combinations.combine(optimizer_fn=optimizer_combinations - .gradient_descent_optimizer_keras_v2_fn), - tf.__internal__.test.combinations.combine( - mode=["graph", "eager"], use_callable_loss=[True]))) + - tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.tpu_strategy], - optimizer_fn=optimizer_combinations - .gradient_descent_optimizer_v1_fn, - mode=["graph"], - use_callable_loss=[True, False]) + tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.tpu_strategy], - optimizer_fn=optimizer_combinations - .gradient_descent_optimizer_keras_v2_fn, - mode=["graph"], - use_callable_loss=[True]))) - def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction, - use_callable_loss): - with distribution.scope(): - all_vars = [] - - def model_fn(inputs): - x, y = inputs - w = tf.compat.v1.get_variable("w", initializer=[[2.]]) - all_vars.append(w) - - def loss_fn(): - # Use fixed initialization to make the steps deterministic. 
- predict = tf.matmul(x, w) - loss = tf.compat.v1.losses.mean_squared_error( - y, predict, reduction=loss_reduction) - if loss_reduction == tf.compat.v1.losses.Reduction.SUM: - return loss - return loss / distribution.num_replicas_in_sync - - optimizer = optimizer_fn() # GradientDescent with 0.2 learning rate - - if isinstance(optimizer, optimizer_v2.OptimizerV2): - return optimizer.minimize(loss_fn, [w]) - else: - if use_callable_loss: - return optimizer.minimize(loss_fn) - else: - return optimizer.minimize(loss_fn()) - - def dataset_fn(): - features = tf.data.Dataset.from_tensors([[2.], [7.]]) - labels = tf.data.Dataset.from_tensors([[6.], [21.]]) - return tf.data.Dataset.zip((features, labels)).repeat() - - def step_fn(ctx, inputs): - del ctx # Unused - return distribution.group( - distribution.extended.call_for_each_replica( - model_fn, args=(inputs,))) - - iterator = self._get_iterator(distribution, dataset_fn) - - def run_step(): - return distribution.extended.experimental_run_steps_on_iterator( - step_fn, iterator, iterations=1).run_op - - if not tf.executing_eagerly(): - with self.cached_session() as sess: - run_step = sess.make_callable(run_step()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - run_step() - - v = all_vars[0] - self.assertTrue(all(v is vi for vi in all_vars[1:])) - weight = numpy.squeeze(self.evaluate(v)) - # Our model is: - # predict = x * w - # loss = (predict - y)^2 - # dloss/dpredict = 2*(predict - y) - # dloss/dw = 2 * x^T @ (predict - y) - # For our batch size of 2, assuming sum loss reduction: - # x = [2, 7] - # y = [6, 21] - # w_initial = 2 - # predict = [4, 14] - # predict - y = [-2, -7] - # dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106 - # So unreplicated the update to w with lr=0.001 is -0.2 * -106 = 0.106 - # with sum loss reduction, or 0.053 with mean. - if loss_reduction == tf.compat.v1.losses.Reduction.SUM: - # Note that the "distribution.num_replicas_in_sync" factor will go away - # once we split the input across replicas, instead of pulling a complete - # batch of input per replica. - self.assertNear(weight, 2 + 0.106 * distribution.num_replicas_in_sync, - 0.0001) - else: - # One of the mean loss reductions. - self.assertNear(weight, 2 + 0.053, 0.0001) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times( - optimizer_combinations.distributions_and_v1_and_v2_optimizers(), - tf.__internal__.test.combinations.combine(mode=["graph", "eager"]), - tf.__internal__.test.combinations.combine(is_tpu=[False])) + tf.__internal__.test.combinations.combine( - distribution=[tf.__internal__.distribute.combinations.tpu_strategy], - optimizer_fn=optimizer_combinations.optimizers_v1_and_v2, - mode=["graph"], - is_tpu=[True])) - def testRunStepsWithOutputContext(self, distribution, optimizer_fn, is_tpu): - with distribution.scope(): - def dataset_fn(): - dataset = tf.data.Dataset.from_tensors([[1.]]).repeat() - # TODO(priyag): batch with drop_remainder=True causes shapes to be - # fully defined for TPU. Remove this when XLA supports dynamic shapes. - return dataset.batch(batch_size=1, drop_remainder=True) - - optimizer = optimizer_fn() - layer = core.Dense(1, use_bias=True) - - key1 = "foo" - value1 = "bar" - - def model_fn(output_context, x): - """A very simple model written by the user.""" - def loss_fn(): - y = tf.reshape(layer(x), []) - tf.constant(1.) 
- return y * y - - if isinstance(optimizer, optimizer_v2.OptimizerV2): - train_op = optimizer.minimize( - loss_fn, lambda: layer.trainable_variables) + def _get_iterator(self, strategy, input_fn): + iterator = strategy.make_input_fn_iterator(lambda _: input_fn()) + self.evaluate(iterator.initializer) + return iterator + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + optimizer_combinations.distributions_and_v1_optimizers(), + tf.__internal__.test.combinations.combine( + mode=["graph"], use_callable_loss=[True, False] + ) + + tf.__internal__.test.combinations.combine( + mode=["eager"], use_callable_loss=[True] + ), + ) + + tf.__internal__.test.combinations.times( + optimizer_combinations.distributions_and_v2_optimizers(), + tf.__internal__.test.combinations.combine( + mode=["graph", "eager"], use_callable_loss=[True] + ), + ) + + tf.__internal__.test.combinations.combine( + distribution=[tf.__internal__.distribute.combinations.tpu_strategy], + optimizer_fn=optimizer_combinations.optimizers_v2, + mode=["graph"], + use_callable_loss=[True], + ) + + tf.__internal__.test.combinations.combine( + distribution=[tf.__internal__.distribute.combinations.tpu_strategy], + optimizer_fn=optimizer_combinations.optimizers_v1, + mode=["graph"], + use_callable_loss=[True, False], + ) + ) + def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss): + with distribution.scope(): + optimizer = optimizer_fn() + model_fn, dataset_fn, layer = minimize_loss_example( + optimizer, use_bias=True, use_callable_loss=use_callable_loss + ) + + def step_fn(ctx, inputs): + del ctx # Unused + return distribution.group( + distribution.extended.call_for_each_replica( + model_fn, args=(inputs,) + ) + ) + + iterator = self._get_iterator(distribution, dataset_fn) + + def run_step(): + return distribution.extended.experimental_run_steps_on_iterator( + step_fn, iterator, iterations=2 + ).run_op + + if not tf.executing_eagerly(): + with self.cached_session() as sess: + run_step = sess.make_callable(run_step()) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + weights, biases = [], [] + for _ in range(5): + run_step() + weights.append(self.evaluate(layer.kernel)) + biases.append(self.evaluate(layer.bias)) + + error = abs( + numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1 + ) + is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) + self.assertTrue(is_not_increasing) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + optimizer_combinations.distributions_and_v1_optimizers(), + tf.__internal__.test.combinations.combine( + mode=["graph"], use_callable_loss=[True, False] + ) + + tf.__internal__.test.combinations.combine( + mode=["eager"], use_callable_loss=[True] + ), + ) + + tf.__internal__.test.combinations.times( + optimizer_combinations.distributions_and_v2_optimizers(), + tf.__internal__.test.combinations.combine( + mode=["graph", "eager"], use_callable_loss=[True] + ), + ) + ) + def testTrainNetworkByCallForEachReplica( + self, distribution, optimizer_fn, use_callable_loss + ): + with distribution.scope(): + optimizer = optimizer_fn() + model_fn, dataset_fn, layer = minimize_loss_example( + optimizer, use_bias=True, use_callable_loss=use_callable_loss + ) + + iterator = self._get_iterator(distribution, dataset_fn) + + def run_step(): + return distribution.group( + distribution.extended.call_for_each_replica( + model_fn, args=(iterator.get_next(),) + ) + ) + + if not 
tf.executing_eagerly(): + with self.cached_session() as sess: + run_step = sess.make_callable(run_step()) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + weights, biases = [], [] + for _ in range(10): + run_step() + + weights.append(self.evaluate(layer.kernel)) + biases.append(self.evaluate(layer.bias)) + + error = abs( + numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1 + ) + is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) + self.assertTrue(is_not_increasing) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + optimizer_combinations.distributions_and_v1_and_v2_optimizers(), + tf.__internal__.test.combinations.combine(mode=["graph", "eager"]), + ) + + tf.__internal__.test.combinations.combine( + distribution=[tf.__internal__.distribute.combinations.tpu_strategy], + optimizer_fn=optimizer_combinations.optimizers_v1_and_v2, + mode=["graph"], + ) + ) + def testOptimizerInsideModelFn(self, distribution, optimizer_fn): + if ( + not tf.executing_eagerly() + and tf.compat.v1.control_flow_v2_enabled() + ): + self.skipTest("b/138751864") + created_variables = [] + trainable_variables = [] + + def appending_creator(next_creator, **kwargs): + v = next_creator(**kwargs) + # Skip the StateVar created in the tf.random.Generator, which is + # used by keras initializers. + if "StateVar" in v.name: + return v + created_variables.append(v.name) + if "trainable" in kwargs and kwargs["trainable"]: + trainable_variables.append(v.name) + return v + + # Creator scope needs to be set before it's used inside + # `distribution.scope`. + with tf.variable_creator_scope(appending_creator), distribution.scope(): + optimizer = optimizer_fn() + model_fn, dataset_fn, _ = minimize_loss_example( + optimizer, use_bias=True, use_callable_loss=True + ) + + def step_fn(ctx, inputs): + del ctx # Unused + return distribution.group( + distribution.extended.call_for_each_replica( + model_fn, args=(inputs,) + ) + ) + + iterator = self._get_iterator(distribution, dataset_fn) + + def run_step(): + return distribution.extended.experimental_run_steps_on_iterator( + step_fn, iterator, iterations=1 + ).run_op + + if not tf.executing_eagerly(): + with self.cached_session() as sess: + run_step = sess.make_callable(run_step()) + self.evaluate(tf.compat.v1.global_variables_initializer()) + run_step() + + def get_expected_variables(num_parameter_devices): + name = optimizer._name + + if isinstance(optimizer, optimizer_v2.OptimizerV2): + variables = VAR_MAP_V2[name] + else: + variables = VAR_MAP_V1[name] + + extended_variables = [ + v + f"/replica_{replica}" + for v in variables + for replica in range(1, num_parameter_devices) + ] + variables = list(variables) + extended_variables + return set(v + ":0" for v in variables) + + self.assertEqual( + get_expected_variables( + len(distribution.extended.parameter_devices) + ), + set(created_variables), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + momentum=[0.8, 0.9, 0.99], renorm=[False, True] + ), + tf.__internal__.test.combinations.times( + optimizer_combinations.distributions_and_v1_and_v2_optimizers(), + tf.__internal__.test.combinations.combine( + mode=["graph", "eager"], + # TODO(isaprykin): Allow False here. Currently subsequent + # replicas will re-execute UPDATE_OPS of previous replicas. 
+ update_ops_in_cross_replica_mode=[True], + ), + ) + + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.tpu_strategy + ], + optimizer_fn=optimizer_combinations.optimizers_v1_and_v2, + mode=["graph"], + update_ops_in_cross_replica_mode=[False], + ), + ) + ) + def testTrainNetworkWithBatchNorm( + self, + distribution, + optimizer_fn, + momentum, + renorm, + update_ops_in_cross_replica_mode, + ): + """Verifies that moving mean updates are reduced across replicas.""" + with distribution.scope(): + num_replicas = distribution.num_replicas_in_sync + model_fn, dataset_fn, batchnorm = batchnorm_example( + optimizer_fn, + batch_per_epoch=num_replicas, + momentum=momentum, + renorm=renorm, + update_ops_in_replica_mode=not update_ops_in_cross_replica_mode, + ) + + def step_fn(ctx, inputs): + del ctx # Unused + fetches = distribution.experimental_local_results( + distribution.extended.call_for_each_replica( + model_fn, args=(inputs,) + ) + ) + if update_ops_in_cross_replica_mode: + fetches += tuple( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.UPDATE_OPS + ) + ) + return tf.group(fetches) + + iterator = self._get_iterator(distribution, dataset_fn) + + def run_step(): + return distribution.extended.experimental_run_steps_on_iterator( + step_fn, iterator, iterations=1 + ).run_op + + if not tf.executing_eagerly(): + with self.cached_session() as sess: + run_step = sess.make_callable(run_step()) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + expected_moving_means = [0.0] * 8 + + def averaged_batch_mean(i): + # Each batch has shape [16, 8] where the ith element in jth list + # is (8 * j + i + replica_id * 100). So the batch mean in each + # replica is (60 + i + replica_id * 100). So here comes its + # batch mean over all replicas: + return 60.0 + i + (num_replicas - 1.0) / 2.0 * 100.0 + + for _ in range(10): + run_step() + moving_means = self.evaluate(batchnorm.moving_mean) + + # We make sure that the moving_mean is updated as if the sample + # mean is calculated over all replicas. 
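To make the batch-mean arithmetic in the comments above concrete before the verification loop that follows, here is a minimal standalone numpy sketch; the replica count, momentum, and feature index are illustrative values, not taken from the test:

import numpy as np

num_replicas = 2
momentum = 0.9
i = 3  # feature index in [0, 8)

# Per-replica batch of shape [16, 8]: element (j, col) is
# 8 * j + col + 100 * replica_id, as in the comment above.
batches = [
    np.array([[8 * j + col + 100 * r for col in range(8)] for j in range(16)])
    for r in range(num_replicas)
]

# Batch mean of feature i in replica r: mean_j(8 * j) = 60, so 60 + i + 100*r.
per_replica_means = [b[:, i].mean() for b in batches]
assert per_replica_means == [60.0 + i, 60.0 + i + 100.0]

# Averaged over all replicas: 60 + i + (num_replicas - 1) / 2 * 100.
avg = float(np.mean(per_replica_means))
assert avg == 60.0 + i + (num_replicas - 1.0) / 2.0 * 100.0

# One moving-mean update, matching the loop below:
moving_mean = 0.0
moving_mean -= (moving_mean - avg) * (1.0 - momentum)
print(moving_mean)  # 0.1 * avg = 11.3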
+ for i, expected_moving_mean in enumerate(expected_moving_means): + expected_moving_means[i] -= ( + expected_moving_mean - averaged_batch_mean(i) + ) * (1.0 - momentum) + self.assertNear( + expected_moving_means[i], moving_means[i], 0.0001 + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + loss_reduction=[ + tf.compat.v1.losses.Reduction.SUM, + tf.compat.v1.losses.Reduction.MEAN, + tf.compat.v1.losses.Reduction.SUM_OVER_BATCH_SIZE, + tf.compat.v1.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS, + ] + ), + tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.one_device_strategy, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, # noqa: E501 + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus_no_merge_call, # noqa: E501 + ] + ), + tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + optimizer_fn=optimizer_combinations.gradient_descent_optimizer_v1_fn # noqa: E501 + ), + tf.__internal__.test.combinations.combine( + mode=["graph"], use_callable_loss=[True, False] + ) + + tf.__internal__.test.combinations.combine( + mode=["eager"], use_callable_loss=[True] + ), + ) + + tf.__internal__.test.combinations.times( + tf.__internal__.test.combinations.combine( + optimizer_fn=optimizer_combinations.gradient_descent_optimizer_keras_v2_fn # noqa: E501 + ), + tf.__internal__.test.combinations.combine( + mode=["graph", "eager"], use_callable_loss=[True] + ), + ), + ) + + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.tpu_strategy + ], + optimizer_fn=optimizer_combinations.gradient_descent_optimizer_v1_fn, # noqa: E501 + mode=["graph"], + use_callable_loss=[True, False], + ) + + tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.tpu_strategy + ], + optimizer_fn=optimizer_combinations.gradient_descent_optimizer_keras_v2_fn, # noqa: E501 + mode=["graph"], + use_callable_loss=[True], + ), + ) + ) + def testMeanVsSum( + self, distribution, optimizer_fn, loss_reduction, use_callable_loss + ): + with distribution.scope(): + all_vars = [] + + def model_fn(inputs): + x, y = inputs + w = tf.compat.v1.get_variable("w", initializer=[[2.0]]) + all_vars.append(w) + + def loss_fn(): + # Use fixed initialization to make the steps deterministic. 
+ predict = tf.matmul(x, w) + loss = tf.compat.v1.losses.mean_squared_error( + y, predict, reduction=loss_reduction + ) + if loss_reduction == tf.compat.v1.losses.Reduction.SUM: + return loss + return loss / distribution.num_replicas_in_sync + + optimizer = ( + optimizer_fn() + ) # GradientDescent with 0.001 learning rate + + if isinstance(optimizer, optimizer_v2.OptimizerV2): + return optimizer.minimize(loss_fn, [w]) + else: + if use_callable_loss: + return optimizer.minimize(loss_fn) + else: + return optimizer.minimize(loss_fn()) + + def dataset_fn(): + features = tf.data.Dataset.from_tensors([[2.0], [7.0]]) + labels = tf.data.Dataset.from_tensors([[6.0], [21.0]]) + return tf.data.Dataset.zip((features, labels)).repeat() + + def step_fn(ctx, inputs): + del ctx # Unused + return distribution.group( + distribution.extended.call_for_each_replica( + model_fn, args=(inputs,) + ) + ) + + iterator = self._get_iterator(distribution, dataset_fn) + + def run_step(): + return distribution.extended.experimental_run_steps_on_iterator( + step_fn, iterator, iterations=1 + ).run_op + + if not tf.executing_eagerly(): + with self.cached_session() as sess: + run_step = sess.make_callable(run_step()) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + run_step() + + v = all_vars[0] + self.assertTrue(all(v is vi for vi in all_vars[1:])) + weight = numpy.squeeze(self.evaluate(v)) + # Our model is: + # predict = x * w + # loss = (predict - y)^2 + # dloss/dpredict = 2*(predict - y) + # dloss/dw = 2 * x^T @ (predict - y) + # For our batch size of 2, assuming sum loss reduction: + # x = [2, 7] + # y = [6, 21] + # w_initial = 2 + # predict = [4, 14] + # predict - y = [-2, -7] + # dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106 + # So unreplicated the update to w with lr=0.001 is -0.001 * -106 = + # 0.106 with sum loss reduction, or 0.053 with mean. + if loss_reduction == tf.compat.v1.losses.Reduction.SUM: + # Note that the "distribution.num_replicas_in_sync" factor will + # go away once we split the input across replicas, instead of + # pulling a complete batch of input per replica. + self.assertNear( + weight, + 2 + 0.106 * distribution.num_replicas_in_sync, + 0.0001, + ) + else: + # One of the mean loss reductions. + self.assertNear(weight, 2 + 0.053, 0.0001) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + optimizer_combinations.distributions_and_v1_and_v2_optimizers(), + tf.__internal__.test.combinations.combine(mode=["graph", "eager"]), + tf.__internal__.test.combinations.combine(is_tpu=[False]), + ) + + tf.__internal__.test.combinations.combine( + distribution=[tf.__internal__.distribute.combinations.tpu_strategy], + optimizer_fn=optimizer_combinations.optimizers_v1_and_v2, + mode=["graph"], + is_tpu=[True], + ) + ) + def testRunStepsWithOutputContext(self, distribution, optimizer_fn, is_tpu): + with distribution.scope(): + + def dataset_fn(): + dataset = tf.data.Dataset.from_tensors([[1.0]]).repeat() + # TODO(priyag): batch with drop_remainder=True causes shapes to + # be fully defined for TPU. Remove this when XLA supports + # dynamic shapes. 
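Stepping back to testMeanVsSum for a moment: the hand derivation in its comments can be checked numerically. A standalone sketch in plain eager TensorFlow, with no distribution strategy involved (illustrative only, not part of the patch):

import tensorflow.compat.v2 as tf

tf.compat.v1.enable_v2_behavior()

x = tf.constant([[2.0], [7.0]])
y = tf.constant([[6.0], [21.0]])
w = tf.Variable([[2.0]])
with tf.GradientTape() as tape:
    predict = tf.matmul(x, w)  # [[4.], [14.]]
    loss = tf.reduce_sum((predict - y) ** 2)  # sum reduction: 4 + 49 = 53
grad = tape.gradient(loss, w)
print(grad.numpy())  # [[-106.]] == 2 * (2 * -2 + 7 * -7)
w.assign_sub(0.001 * grad)  # one gradient-descent step with lr=0.001
print(w.numpy())  # [[2.106]]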
+ return dataset.batch(batch_size=1, drop_remainder=True) + + optimizer = optimizer_fn() + layer = core.Dense(1, use_bias=True) + + key1 = "foo" + value1 = "bar" + + def model_fn(output_context, x): + """A very simple model written by the user.""" + + def loss_fn(): + y = tf.reshape(layer(x), []) - tf.constant(1.0) + return y * y + + if isinstance(optimizer, optimizer_v2.OptimizerV2): + train_op = optimizer.minimize( + loss_fn, lambda: layer.trainable_variables + ) + else: + train_op = optimizer.minimize(loss_fn) + loss = loss_fn() + output_context.set_last_step_output( + name="replica_loss_reduced", + output=loss, + reduce_op=tf.distribute.ReduceOp.MEAN, + ) + output_context.set_non_tensor_output(key1, value1) + return (train_op, loss) + + def step_fn(output_context, inputs): + (train_op, loss) = distribution.extended.call_for_each_replica( + model_fn, args=(output_context, inputs) + ) + output_context.set_last_step_output( + name="cross_replica_loss_reduced", + output=loss, + reduce_op=tf.distribute.ReduceOp.MEAN, + ) + output_context.set_last_step_output( + name="cross_replica_loss_not_reduced", output=loss + ) + return distribution.group(train_op) + + iterator = self._get_iterator(distribution, dataset_fn) + + def run_step(): + initial_loss = lambda: tf.constant(1e7) + # Initial values corresponding to reduced losses are just single + # tensors. But for non reduced losses, we need to have initial + # values that are of the same structure as non reduced losses. + # In MirroredStrategy, this will be a list of losses, in + # TPUStrategy it will be single tensor. Using + # `call_for_each_replica` followed by + # `experimental_local_results` gives us the desired initial + # value structure. + not_reduced = distribution.experimental_local_results( + distribution.extended.call_for_each_replica(initial_loss) + ) + initial_loop_values = { + "replica_loss_reduced": initial_loss(), + "cross_replica_loss_reduced": initial_loss(), + "cross_replica_loss_not_reduced": not_reduced, + } + ctx = distribution.extended.experimental_run_steps_on_iterator( + step_fn, + iterator, + iterations=2, + initial_loop_values=initial_loop_values, + ) + + self.assertEqual({key1: (value1,)}, ctx.non_tensor_outputs) + self._verify_loss_output( + initial_loss(), + loss_output=ctx.last_step_outputs["replica_loss_reduced"], + reduced=True, + distribution=distribution, + ) + self._verify_loss_output( + initial_loss(), + loss_output=ctx.last_step_outputs[ + "cross_replica_loss_reduced" + ], + reduced=True, + distribution=distribution, + ) + self._verify_loss_output( + initial_loss(), + loss_output=ctx.last_step_outputs[ + "cross_replica_loss_not_reduced" + ], + reduced=False, + distribution=distribution, + ) + return ( + ctx.run_op, + ctx.last_step_outputs["replica_loss_reduced"], + ) + + if not tf.executing_eagerly(): + with self.cached_session() as sess: + run_step = sess.make_callable(run_step()) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + weights, biases = [], [] + for _ in range(5): + run_step() + weights.append(self.evaluate(layer.kernel)) + biases.append(self.evaluate(layer.bias)) + + error = abs( + numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1 + ) + error_is_not_increasing = all( + y <= x for x, y in zip(error, error[1:]) + ) + self.assertTrue(error_is_not_increasing) + + def _verify_loss_output( + self, initial_loss, loss_output, reduced, distribution + ): + if not reduced: + self.assertLen( + distribution.experimental_local_results(loss_output), + 
distribution.num_replicas_in_sync, + ) + loss_tensor = distribution.reduce( + tf.distribute.ReduceOp.MEAN, loss_output, axis=None + ) else: - train_op = optimizer.minimize(loss_fn) - loss = loss_fn() - output_context.set_last_step_output( - name="replica_loss_reduced", - output=loss, - reduce_op=tf.distribute.ReduceOp.MEAN) - output_context.set_non_tensor_output(key1, value1) - return (train_op, loss) - - def step_fn(output_context, inputs): - (train_op, loss) = distribution.extended.call_for_each_replica( - model_fn, args=(output_context, inputs)) - output_context.set_last_step_output( - name="cross_replica_loss_reduced", - output=loss, - reduce_op=tf.distribute.ReduceOp.MEAN) - output_context.set_last_step_output( - name="cross_replica_loss_not_reduced", - output=loss) - return distribution.group(train_op) - - iterator = self._get_iterator(distribution, dataset_fn) - - def run_step(): - initial_loss = lambda: tf.constant(1e7) - # Initial values corresponding to reduced losses are just single - # tensors. But for non reduced losses, we need to have initial - # values that are of the same structure as non reduced losses. In - # MirroredStrategy, this will be a list of losses, in TPUStrategy - # it will be single tensor. Using `call_for_each_replica` followed - # by `experimental_local_results` gives us the desired initial - # value structure. - not_reduced = distribution.experimental_local_results( - distribution.extended.call_for_each_replica(initial_loss)) - initial_loop_values = { - "replica_loss_reduced": initial_loss(), - "cross_replica_loss_reduced": initial_loss(), - "cross_replica_loss_not_reduced": not_reduced, - } - ctx = distribution.extended.experimental_run_steps_on_iterator( - step_fn, iterator, iterations=2, - initial_loop_values=initial_loop_values) - - self.assertEqual({key1: (value1,)}, ctx.non_tensor_outputs) - self._verify_loss_output( - initial_loss(), - loss_output=ctx.last_step_outputs["replica_loss_reduced"], - reduced=True, distribution=distribution) - self._verify_loss_output( - initial_loss(), - loss_output=ctx.last_step_outputs["cross_replica_loss_reduced"], - reduced=True, distribution=distribution) - self._verify_loss_output( - initial_loss(), - loss_output=ctx.last_step_outputs["cross_replica_loss_not_reduced"], - reduced=False, distribution=distribution) - return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"]) - - if not tf.executing_eagerly(): - with self.cached_session() as sess: - run_step = sess.make_callable(run_step()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - weights, biases = [], [] - for _ in range(5): - run_step() - weights.append(self.evaluate(layer.kernel)) - biases.append(self.evaluate(layer.bias)) - - error = abs( - numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1) - error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) - self.assertTrue(error_is_not_increasing) - - def _verify_loss_output(self, initial_loss, loss_output, reduced, - distribution): - if not reduced: - self.assertLen(distribution.experimental_local_results(loss_output), - distribution.num_replicas_in_sync) - loss_tensor = distribution.reduce(tf.distribute.ReduceOp.MEAN, loss_output, - axis=None) - else: - unwrapped_output = distribution.experimental_local_results(loss_output) - self.assertLen(unwrapped_output, 1) - loss_tensor = unwrapped_output[0] - self.assertEqual(initial_loss.dtype, loss_tensor.dtype) - self.assertEqual(initial_loss.shape, loss_tensor.shape) - - @tf.__internal__.distribute.combinations.generate( 
- optimizer_combinations.distributions_and_v2_optimizers()) - def test_empty_var_list(self, distribution, optimizer_fn): - opt = optimizer_fn() - with distribution.scope(): - - def run_fn(): - opt.minimize(lambda: tf.constant(1.), []) - opt.apply_gradients([]) - - distribution.run(run_fn) + unwrapped_output = distribution.experimental_local_results( + loss_output + ) + self.assertLen(unwrapped_output, 1) + loss_tensor = unwrapped_output[0] + self.assertEqual(initial_loss.dtype, loss_tensor.dtype) + self.assertEqual(initial_loss.shape, loss_tensor.shape) + + @tf.__internal__.distribute.combinations.generate( + optimizer_combinations.distributions_and_v2_optimizers() + ) + def test_empty_var_list(self, distribution, optimizer_fn): + opt = optimizer_fn() + with distribution.scope(): + + def run_fn(): + opt.minimize(lambda: tf.constant(1.0), []) + opt.apply_gradients([]) + + distribution.run(run_fn) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/distribute/mirrored_strategy_test.py b/keras/distribute/mirrored_strategy_test.py index 47e4105e5c87..2f482f5ccbed 100644 --- a/keras/distribute/mirrored_strategy_test.py +++ b/keras/distribute/mirrored_strategy_test.py @@ -14,119 +14,135 @@ # ============================================================================== """Tests for MirroredStrategy.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras -from tensorflow.python.eager import backprop from keras.engine import training as keras_training from keras.layers import core as keras_core -from keras.optimizers.optimizer_v2 import rmsprop +from keras.optimizers.legacy import rmsprop from keras.utils import kpl_test_utils -from tensorflow.python.training import optimizer as optimizer_lib + +# isort: off +from tensorflow.python.eager import backprop +from tensorflow.python.training import ( + optimizer as optimizer_lib, +) class MiniModel(keras_training.Model): - """Minimal model for mnist. + """Minimal model for mnist. - Useful for testing and debugging on slow TPU simulators. - """ + Useful for testing and debugging on slow TPU simulators. 
+ """ - def __init__(self): - super().__init__(name="") - self.fc = keras_core.Dense(1, name="fc", kernel_initializer="ones", - bias_initializer="ones") + def __init__(self): + super().__init__(name="") + self.fc = keras_core.Dense( + 1, name="fc", kernel_initializer="ones", bias_initializer="ones" + ) - def call(self, inputs, training=True): - inputs = tf.ones([1, 10]) - return self.fc(inputs) + def call(self, inputs, training=True): + inputs = tf.ones([1, 10]) + return self.fc(inputs) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=[ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 ], - mode=["eager"])) + mode=["eager"], + ) +) class MirroredStrategyDefunTest(tf.test.TestCase, parameterized.TestCase): - - def testTrain(self, distribution): - with distribution.scope(): - mock_model = MiniModel() - mock_model.call = tf.function(mock_model.call) - - def loss_fn(ctx): - del ctx - return mock_model(tf.ones([1, 10])) - - gradients_fn = backprop.implicit_grad(loss_fn) - gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) - grads_and_vars = distribution.extended.call_for_each_replica( - gradients_fn, args=(None,)) - - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.25) - update_ops = optimizer._distributed_apply(distribution, grads_and_vars) # pylint: disable=protected-access - - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(update_ops) - - updated_var_values = self.evaluate(mock_model.variables) - # All variables start at 1.0 and get two updates of 0.25. - self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0]) - self.assertAllEqual([0.5], updated_var_values[1]) - - def testTrainAndServeWithKPL(self, distribution): - use_adapt = False - test_utils_obj = kpl_test_utils.DistributeKplTestUtils() - with distribution.scope(): - feature_mapper, label_mapper = test_utils_obj.define_kpls_for_training( - use_adapt) - model = test_utils_obj.define_model() - optimizer = rmsprop.RMSprop(learning_rate=0.1) - accuracy = keras.metrics.Accuracy() - - def dataset_fn(_): - return test_utils_obj.dataset_fn(feature_mapper, label_mapper) - - @tf.function - def train_step(iterator): - """The step function for one training step.""" - - def step_fn(inputs): - """The computation to run on each replica(GPU).""" - features, labels = inputs - with tf.GradientTape() as tape: - pred = model(features, training=True) - loss = keras.losses.binary_crossentropy(labels, pred) - loss = tf.nn.compute_average_loss(loss) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(list(zip(grads, model.trainable_variables))) - - actual_pred = tf.cast(tf.greater(pred, 0.5), tf.int64) - accuracy.update_state(labels, actual_pred) - - distribution.run(step_fn, args=(next(iterator),)) - - distributed_dataset = distribution.distribute_datasets_from_function( - dataset_fn) - distributed_iterator = iter(distributed_dataset) - num_epochs = 4 - num_steps = 7 - for _ in range(num_epochs): - accuracy.reset_state() - for _ in range(num_steps): - train_step(distributed_iterator) - - self.assertGreater(accuracy.result().numpy(), 0.5) - self.assertEqual(optimizer.iterations.numpy(), num_epochs * num_steps) - - # Test save/load/serving the trained model. 
- test_utils_obj.test_save_load_serving_model( - model, feature_mapper, test_utils_obj.define_reverse_lookup_layer()) + def testTrain(self, distribution): + with distribution.scope(): + mock_model = MiniModel() + mock_model.call = tf.function(mock_model.call) + + def loss_fn(ctx): + del ctx + return mock_model(tf.ones([1, 10])) + + gradients_fn = backprop.implicit_grad(loss_fn) + gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) + grads_and_vars = distribution.extended.call_for_each_replica( + gradients_fn, args=(None,) + ) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.25) + update_ops = optimizer._distributed_apply( + distribution, grads_and_vars + ) + + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(update_ops) + + updated_var_values = self.evaluate(mock_model.variables) + # All variables start at 1.0 and get two updates of 0.25. + self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0]) + self.assertAllEqual([0.5], updated_var_values[1]) + + def testTrainAndServeWithKPL(self, distribution): + use_adapt = False + test_utils_obj = kpl_test_utils.DistributeKplTestUtils() + with distribution.scope(): + ( + feature_mapper, + label_mapper, + ) = test_utils_obj.define_kpls_for_training(use_adapt) + model = test_utils_obj.define_model() + optimizer = rmsprop.RMSprop(learning_rate=0.1) + accuracy = keras.metrics.Accuracy() + + def dataset_fn(_): + return test_utils_obj.dataset_fn(feature_mapper, label_mapper) + + @tf.function + def train_step(iterator): + """The step function for one training step.""" + + def step_fn(inputs): + """The computation to run on each replica(GPU).""" + features, labels = inputs + with tf.GradientTape() as tape: + pred = model(features, training=True) + loss = keras.losses.binary_crossentropy(labels, pred) + loss = tf.nn.compute_average_loss(loss) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + list(zip(grads, model.trainable_variables)) + ) + + actual_pred = tf.cast(tf.greater(pred, 0.5), tf.int64) + accuracy.update_state(labels, actual_pred) + + distribution.run(step_fn, args=(next(iterator),)) + + distributed_dataset = ( + distribution.distribute_datasets_from_function(dataset_fn) + ) + distributed_iterator = iter(distributed_dataset) + num_epochs = 4 + num_steps = 7 + for _ in range(num_epochs): + accuracy.reset_state() + for _ in range(num_steps): + train_step(distributed_iterator) + + self.assertGreater(accuracy.result().numpy(), 0.5) + self.assertEqual( + optimizer.iterations.numpy(), num_epochs * num_steps + ) + + # Test save/load/serving the trained model. + test_utils_obj.test_save_load_serving_model( + model, feature_mapper, test_utils_obj.define_reverse_lookup_layer() + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/distribute/mirrored_variable_test.py b/keras/distribute/mirrored_variable_test.py index 9f247031d209..fc7cdb566f61 100644 --- a/keras/distribute/mirrored_variable_test.py +++ b/keras/distribute/mirrored_variable_test.py @@ -15,96 +15,115 @@ """Test MirroredVariable in MirroredStrategy and MultiWorkerMirroredStrategy.""" import tensorflow.compat.v2 as tf + from keras.distribute import distributed_training_utils from keras.layers import core def _mimic_two_cpus(): - try: - cpus = tf.config.list_physical_devices("CPU") - except tf.errors.NotFoundError: - # Testing device not available. Skip the test. 
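The _mimic_two_cpus helper being reformatted here splits a single physical CPU into two logical devices so a two-replica strategy can run on one host. A standalone sketch of the same idea using the public MirroredStrategy constructor (device names and the variable are illustrative):

import tensorflow as tf

cpus = tf.config.list_physical_devices("CPU")
tf.config.set_logical_device_configuration(
    cpus[0],
    [
        tf.config.LogicalDeviceConfiguration(),
        tf.config.LogicalDeviceConfiguration(),
    ],
)
# MirroredStrategy also accepts an explicit device list, unlike the private
# MultiWorkerMirroredStrategy._from_local_devices used by the test.
strategy = tf.distribute.MirroredStrategy(["/device:CPU:0", "/device:CPU:1"])
with strategy.scope():
    v = tf.Variable(1.0)  # a MirroredVariable with one copy per logical CPU
print(strategy.num_replicas_in_sync)  # 2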
- return False - - tf.config.set_logical_device_configuration(cpus[0], [ - tf.config.LogicalDeviceConfiguration(), - tf.config.LogicalDeviceConfiguration(), - ]) - return True + try: + cpus = tf.config.list_physical_devices("CPU") + except tf.errors.NotFoundError: + # Testing device not available. Skip the test. + return False + + tf.config.set_logical_device_configuration( + cpus[0], + [ + tf.config.LogicalDeviceConfiguration(), + tf.config.LogicalDeviceConfiguration(), + ], + ) + return True def get_strategy_with_mimicing_cpus(): - if not _mimic_two_cpus(): - return None - return (tf.distribute.MultiWorkerMirroredStrategy - ._from_local_devices(("/device:CPU:0", "/device:CPU:1"))) + if not _mimic_two_cpus(): + return None + return tf.distribute.MultiWorkerMirroredStrategy._from_local_devices( + ("/device:CPU:0", "/device:CPU:1") + ) @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( distribution=list( - filter(None.__ne__, [ - tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, - get_strategy_with_mimicing_cpus() - ])), - mode=["graph", "eager"])) + filter( + None.__ne__, + [ + tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, # noqa: E501 + get_strategy_with_mimicing_cpus(), + ], + ) + ), + mode=["graph", "eager"], + ) +) class MirroredVariableCreationTest(tf.test.TestCase): - """Base class that tests mirrored variable creator. - - Currently it assumes all strategy objects have two replicas. - """ - - @classmethod - def setUpClass(cls): - _mimic_two_cpus() - - def assertAllDifferent(self, objs): - for i in range(len(objs)): - for j in range(len(objs)): - if i == j: - continue - self.assertIsNot(objs[i], objs[j]) - - def _is_mirrored(self, val): - if distributed_training_utils.is_distributed_variable(val): - if val._policy: # pylint: disable=protected-access - return val._policy._is_mirrored() # pylint: disable=protected-access - # Since `Mirrored` is a private symbol in tf.distribute, we're checking - # with `DistributedValues` as an approximation. - return isinstance(val, tf.distribute.DistributedValues) - - def testWithLayers(self, distribution): - - def model_fn(features): - - layer1 = core.Dense(1) - layer1(features) - layer2 = core.Dense(1) - layer2(features) - # We rely on names and orders to make sure replica references the same - # MirroredVariable. Uniquifying names may involve global states, - # merge_call switches threads so we need to test things work after - # merge_call. - tf.distribute.get_replica_context().merge_call(lambda _: _) - layer3 = core.Dense(1) - layer3(features) - return [(layer1.kernel, layer1.bias), (layer2.kernel, layer2.bias), - (layer3.kernel, layer3.bias)] - - iterator = distribution.make_input_fn_iterator( - lambda _: tf.data.Dataset.from_tensors([[1.]]).repeat(10)) - self.evaluate(iterator.initializer) - features = iterator.get_next() - - with distribution.scope(): - result = distribution.extended.call_for_each_replica( - model_fn, args=(features,)) - for kernel, bias in result: - self.assertTrue(self._is_mirrored(kernel)) - self.assertAllDifferent(distribution.experimental_local_results(kernel)) - self.assertTrue(self._is_mirrored(bias)) - self.assertAllDifferent(distribution.experimental_local_results(kernel)) + """Base class that tests mirrored variable creator. + + Currently it assumes all strategy objects have two replicas. 
+ """ + + @classmethod + def setUpClass(cls): + _mimic_two_cpus() + + def assertAllDifferent(self, objs): + for i in range(len(objs)): + for j in range(len(objs)): + if i == j: + continue + self.assertIsNot(objs[i], objs[j]) + + def _is_mirrored(self, val): + if distributed_training_utils.is_distributed_variable(val): + if val._policy: + return val._policy._is_mirrored() + # Since `Mirrored` is a private symbol in tf.distribute, we're checking + # with `DistributedValues` as an approximation. + return isinstance(val, tf.distribute.DistributedValues) + + def testWithLayers(self, distribution): + def model_fn(features): + + layer1 = core.Dense(1) + layer1(features) + layer2 = core.Dense(1) + layer2(features) + # We rely on names and orders to make sure replica references the + # same MirroredVariable. Uniquifying names may involve global + # states, merge_call switches threads so we need to test things work + # after merge_call. + tf.distribute.get_replica_context().merge_call(lambda _: _) + layer3 = core.Dense(1) + layer3(features) + return [ + (layer1.kernel, layer1.bias), + (layer2.kernel, layer2.bias), + (layer3.kernel, layer3.bias), + ] + + iterator = distribution.make_input_fn_iterator( + lambda _: tf.data.Dataset.from_tensors([[1.0]]).repeat(10) + ) + self.evaluate(iterator.initializer) + features = iterator.get_next() + + with distribution.scope(): + result = distribution.extended.call_for_each_replica( + model_fn, args=(features,) + ) + for kernel, bias in result: + self.assertTrue(self._is_mirrored(kernel)) + self.assertAllDifferent( + distribution.experimental_local_results(kernel) + ) + self.assertTrue(self._is_mirrored(bias)) + self.assertAllDifferent( + distribution.experimental_local_results(kernel) + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/distribute/model_checkpoint_test.py b/keras/distribute/model_checkpoint_test.py new file mode 100644 index 000000000000..a2d75cc5d0ab --- /dev/null +++ b/keras/distribute/model_checkpoint_test.py @@ -0,0 +1,60 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests of ModelCheckpoint callback.""" + +import os +import sys + +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras import callbacks +from keras.distribute import multi_worker_testing_utils + + +class ModelCheckpointTest(tf.test.TestCase, parameterized.TestCase): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + file_format=["h5", "tf"], + save_weights_only=[True, False], + ) + ) + def testCheckpointExists(self, file_format, save_weights_only): + train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(64, 2) + model = multi_worker_testing_utils.get_mnist_model((28, 28, 1)) + saving_dir = self.get_temp_dir() + saving_filepath = os.path.join(saving_dir, "checkpoint." 
+ file_format) + callbacks_list = [ + callbacks.ModelCheckpoint( + filepath=saving_filepath, save_weights_only=save_weights_only + ) + ] + self.assertFalse(tf.io.gfile.exists(saving_filepath)) + model.fit( + x=train_ds, epochs=2, steps_per_epoch=2, callbacks=callbacks_list + ) + tf_saved_model_exists = tf.io.gfile.exists(saving_filepath) + tf_weights_only_checkpoint_exists = tf.io.gfile.exists( + saving_filepath + ".index" + ) + self.assertTrue( + tf_saved_model_exists or tf_weights_only_checkpoint_exists + ) + + +if __name__ == "__main__": + with tf.compat.v1.test.mock.patch.object(sys, "exit", os._exit): + tf.test.main() diff --git a/keras/distribute/model_collection_base.py b/keras/distribute/model_collection_base.py index 75e0d4ccdf1d..16dea694b528 100644 --- a/keras/distribute/model_collection_base.py +++ b/keras/distribute/model_collection_base.py @@ -16,27 +16,27 @@ class ModelAndInput: - """Base class to provide model and its corresponding inputs.""" + """Base class to provide model and its corresponding inputs.""" - def get_model(self): - """Returns a compiled keras model object, together with output name. + def get_model(self): + """Returns a compiled keras model object, together with output name. - Returns: - model: a keras model object - output_name: a string for the name of the output layer - """ - raise NotImplementedError("must be implemented in descendants") + Returns: + model: a keras model object + output_name: a string for the name of the output layer + """ + raise NotImplementedError("must be implemented in descendants") - def get_data(self): - """Returns data for training and predicting. + def get_data(self): + """Returns data for training and predicting. - Returns: - x_train: data used for training - y_train: label used for training - x_predict: data used for predicting - """ - raise NotImplementedError("must be implemented in descendants") + Returns: + x_train: data used for training + y_train: label used for training + x_predict: data used for predicting + """ + raise NotImplementedError("must be implemented in descendants") - def get_batch_size(self): - """Returns the batch_size used by the model.""" - raise NotImplementedError("must be implemented in descendants") + def get_batch_size(self): + """Returns the batch_size used by the model.""" + raise NotImplementedError("must be implemented in descendants") diff --git a/keras/distribute/model_combinations.py b/keras/distribute/model_combinations.py index f4f5602b2719..0349cad552eb 100644 --- a/keras/distribute/model_combinations.py +++ b/keras/distribute/model_combinations.py @@ -15,16 +15,21 @@ """Strategy and optimizer combinations for combinations.combine().""" import tensorflow.compat.v2 as tf + from keras.distribute import simple_models simple_functional_model = tf.__internal__.test.combinations.NamedObject( - "SimpleFunctionalModel", simple_models.SimpleFunctionalModel()) + "SimpleFunctionalModel", simple_models.SimpleFunctionalModel() +) simple_sequential_model = tf.__internal__.test.combinations.NamedObject( - "SimpleSequentialModel", simple_models.SimpleSequentialModel()) + "SimpleSequentialModel", simple_models.SimpleSequentialModel() +) simple_subclass_model = tf.__internal__.test.combinations.NamedObject( - "SimpleSubclassModel", simple_models.SimpleSubclassModel()) + "SimpleSubclassModel", simple_models.SimpleSubclassModel() +) simple_tfmodule_model = tf.__internal__.test.combinations.NamedObject( - "SimpleTFModuleModel", simple_models.SimpleTFModuleModel()) + "SimpleTFModuleModel", 
simple_models.SimpleTFModuleModel()
+)
diff --git a/keras/distribute/multi_worker_callback_tf2_test.py b/keras/distribute/multi_worker_callback_tf2_test.py
index 24cc90076b5e..69043d6bd824 100644
--- a/keras/distribute/multi_worker_callback_tf2_test.py
+++ b/keras/distribute/multi_worker_callback_tf2_test.py
@@ -14,390 +14,464 @@
 # ==============================================================================
 """Tests for Keras callbacks in multi-worker training with TF2."""

-import tensorflow.compat.v2 as tf
-
 import json
 import os

+import tensorflow.compat.v2 as tf
 from absl.testing import parameterized
+
 from keras import callbacks
 from keras.distribute import distributed_file_utils
 from keras.distribute import multi_worker_testing_utils


 def checkpoint_exists(filepath):
-  """Returns whether the checkpoint `filepath` refers to exists."""
-  if filepath.endswith('.h5'):
-    return tf.io.gfile.exists(filepath)
-  tf_saved_model_exists = tf.io.gfile.exists(filepath)
-  tf_weights_only_checkpoint_exists = tf.io.gfile.exists(
-      filepath + '.index')
-  return tf_saved_model_exists or tf_weights_only_checkpoint_exists
+    """Returns whether the checkpoint that `filepath` refers to exists."""
+    if filepath.endswith(".h5"):
+        return tf.io.gfile.exists(filepath)
+    tf_saved_model_exists = tf.io.gfile.exists(filepath)
+    tf_weights_only_checkpoint_exists = tf.io.gfile.exists(filepath + ".index")
+    return tf_saved_model_exists or tf_weights_only_checkpoint_exists


 def _model_setup(test_obj, file_format):
-  """Set up a MNIST Keras model for testing purposes.
-
-  This function builds a MNIST Keras model and returns relevant information
-  for testing.
-
-  Args:
-    test_obj: The `TestCase` testing object.
-    file_format: File format for checkpoints. 'tf' or 'h5'.
-
-  Returns:
-    A tuple of (model, saving_filepath, train_ds, steps) where train_ds is
-    the training dataset.
-  """
-  batch_size = 64
-  steps = 2
-  with tf.distribute.MultiWorkerMirroredStrategy().scope():
-    # TODO(b/142509827): In rare cases this errors out at C++ level with the
-    # "Connect failed" error message.
-    train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(
-        batch_size, steps)
-    model = multi_worker_testing_utils.get_mnist_model((28, 28, 1))
-  # Pass saving_filepath from the parent thread to ensure every worker has the
-  # same filepath to save.
-  saving_filepath = os.path.join(test_obj.get_temp_dir(),
-                                 'checkpoint.' + file_format)
-  return model, saving_filepath, train_ds, steps
+    """Set up an MNIST Keras model for testing purposes.
+
+    This function builds an MNIST Keras model and returns relevant information
+    for testing.
+
+    Args:
+      test_obj: The `TestCase` testing object.
+      file_format: File format for checkpoints. 'tf' or 'h5'.
+
+    Returns:
+      A tuple of (model, saving_filepath, train_ds, steps) where train_ds is
+      the training dataset.
+    """
+    batch_size = 64
+    steps = 2
+    with tf.distribute.MultiWorkerMirroredStrategy().scope():
+        # TODO(b/142509827): In rare cases this errors out at C++ level with the
+        # "Connect failed" error message.
+        train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(
+            batch_size, steps
+        )
+        model = multi_worker_testing_utils.get_mnist_model((28, 28, 1))
+    # Pass saving_filepath from the parent thread to ensure every worker has
+    # the same filepath to save to.
+    saving_filepath = os.path.join(
+        test_obj.get_temp_dir(), "checkpoint."
+ file_format + ) + return model, saving_filepath, train_ds, steps def get_tf_config_task(): - return json.loads(os.environ['TF_CONFIG'])['task'] + return json.loads(os.environ["TF_CONFIG"])["task"] def get_tf_config_cluster_spec(): - return json.loads(os.environ['TF_CONFIG'])['cluster'] + return json.loads(os.environ["TF_CONFIG"])["cluster"] def get_task_type(): - return get_tf_config_task()['type'] + return get_tf_config_task()["type"] def get_task_index(): - return get_tf_config_task()['index'] + return get_tf_config_task()["index"] def is_chief(): - return ('chief' not in get_tf_config_cluster_spec() and - get_task_type() == 'worker' and get_task_index() == 0) + return ( + "chief" not in get_tf_config_cluster_spec() + and get_task_type() == "worker" + and get_task_index() == 0 + ) class KerasCallbackMultiProcessTest(parameterized.TestCase, tf.test.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], - file_format=['h5', 'tf'], - save_weights_only=[True, False])) - def test_model_checkpoint_saves_on_chief_but_not_otherwise( - self, file_format, mode, save_weights_only): - - def proc_model_checkpoint_saves_on_chief_but_not_otherwise( - test_obj, file_format): - - model, saving_filepath, train_ds, steps = _model_setup( - test_obj, file_format) - num_epoch = 2 - extension = os.path.splitext(saving_filepath)[1] - - # Incorporate type/index information and thread id in saving_filepath to - # ensure every worker has a unique path. Note that in normal use case the - # saving_filepath will be the same for all workers, but we use different - # ones here just to test out chief saves checkpoint but non-chief doesn't. - task_config = get_tf_config_task() - saving_filepath = os.path.join( - test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' % - (task_config['type'], task_config['index'], extension)) - - # The saving_filepath shouldn't exist at the beginning (as it's unique). - test_obj.assertFalse(checkpoint_exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - validation_data=train_ds, - validation_steps=steps, - callbacks=[ - callbacks.ModelCheckpoint( - filepath=saving_filepath, save_weights_only=save_weights_only) - ]) - - # If it's chief, the model should be saved; if not, the model shouldn't. - test_obj.assertEqual(checkpoint_exists(saving_filepath), is_chief()) - - # If it's chief, the model should be saved (`write_filepath` should - # simply return `saving_filepath`); if not, i.e. for non-chief workers, - # the temporary path generated by `write_filepath` should no longer - # contain the checkpoint that has been deleted. - test_obj.assertEqual( - checkpoint_exists( - distributed_file_utils.write_filepath( - saving_filepath, model._distribution_strategy)), is_chief()) - - tf.__internal__.distribute.multi_process_runner.run( - proc_model_checkpoint_saves_on_chief_but_not_otherwise, - cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(num_workers=2), - args=(self, file_format)) - - @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['eager'])) - def test_model_checkpoint_works_with_same_file_path(self, mode): - - def proc_model_checkpoint_works_with_same_file_path( - test_obj, saving_filepath): - model, _, train_ds, steps = _model_setup(test_obj, file_format='') - num_epoch = 2 - - # The saving_filepath shouldn't exist at the beginning (as it's unique). 
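# For reference, a minimal sketch of the TF_CONFIG payload that
# `get_tf_config_task()` and `is_chief()` above parse (the host:port values
# are placeholders, not part of this patch):
#   {"cluster": {"worker": ["localhost:12345", "localhost:23456"]},
#    "task": {"type": "worker", "index": 0}}
# When the cluster spec has no dedicated "chief" job, worker 0 acts as the
# chief, which is exactly the condition `is_chief()` checks.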
- test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)]) - - test_obj.assertTrue(tf.io.gfile.exists(saving_filepath)) - - saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint') - - tf.__internal__.distribute.multi_process_runner.run( - proc_model_checkpoint_works_with_same_file_path, - cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(num_workers=2), - args=(self, saving_filepath)) - - @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['eager'])) - def test_backupandrestore_checkpoint_works_with_interruption(self, mode): - - class InterruptingCallback(callbacks.Callback): - - def on_epoch_begin(self, epoch, logs=None): - if epoch == 2: - raise RuntimeError('Interrupting!') - - class AssertCallback(callbacks.Callback): - - def on_epoch_begin(self, epoch, logs=None): - # the interruption happened on epoch 2 as specified in - # InterruptingCallback, so the initial epoch after restart will begin - # at 2. - assert epoch > 1 - - def proc_model_checkpoint_works_with_same_file_path(test_obj, - saving_filepath): - model, _, train_ds, steps = _model_setup(test_obj, file_format='') - num_epoch = 4 - - # The saving_filepath shouldn't exist at the beginning (as it's unique). - test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) - bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup') - - try: - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[ - callbacks.ModelCheckpoint(filepath=saving_filepath), - callbacks.BackupAndRestore(backup_dir=bar_dir), - InterruptingCallback() - ]) - except RuntimeError as e: - if 'Interrupting!' not in str(e): - raise - - tf.__internal__.distribute.multi_process_runner.get_barrier().wait() - backup_filepath = os.path.join(bar_dir, 'chief', 'checkpoint') - test_obj.assertTrue(tf.io.gfile.exists(backup_filepath)) - test_obj.assertTrue(tf.io.gfile.exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[ - callbacks.ModelCheckpoint(filepath=saving_filepath), - callbacks.BackupAndRestore(backup_dir=bar_dir), - AssertCallback() - ]) - tf.__internal__.distribute.multi_process_runner.get_barrier().wait() - test_obj.assertFalse(tf.io.gfile.exists(backup_filepath)) - test_obj.assertTrue(tf.io.gfile.exists(saving_filepath)) - - saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint') - - tf.__internal__.distribute.multi_process_runner.run( - proc_model_checkpoint_works_with_same_file_path, - cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(num_workers=2), - args=(self, saving_filepath)) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine(mode=['eager'])) - def test_profiler_saves_on_both_chief_and_non_chief(self, mode): - - def proc_profiler_saves_on_both_chief_and_non_chief(test_obj): - model, _, train_ds, steps = _model_setup(test_obj, file_format='') - num_epoch = 2 - - task_config = get_tf_config_task() - saving_filepath = os.path.join( - test_obj.get_temp_dir(), - 'logfile_%s_%d' % (task_config['type'], task_config['index'])) - - # The saving_filepath shouldn't exist at the beginning (as it's unique). 
- test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[ - callbacks.TensorBoard( - log_dir=saving_filepath, profile_batch=[2, 4]) - ]) - - # Profiler dir should be created on both chief and non-chief node - profiler_dir_path = os.path.join(saving_filepath, 'plugins', 'profile') - test_obj.assertTrue(tf.io.gfile.exists(profiler_dir_path)) - - tf.__internal__.distribute.multi_process_runner.run( - proc_profiler_saves_on_both_chief_and_non_chief, - cluster_spec= - tf.__internal__.distribute.multi_process_runner.create_cluster_spec( - num_workers=2), - args=(self,)) - - @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['eager'])) - def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode): - - def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj): - model, _, train_ds, steps = _model_setup(test_obj, file_format='') - num_epoch = 2 - - # Incorporate type/index information and thread id in saving_filepath to - # ensure every worker has a unique path. Note that in normal use case the - # saving_filepath will be the same for all workers, but we use different - # ones here just to test out chief saves summaries but non-chief doesn't. - task_config = get_tf_config_task() - saving_filepath = os.path.join( - test_obj.get_temp_dir(), - 'logfile_%s_%d' % (task_config['type'], task_config['index'])) - - # The saving_filepath shouldn't exist at the beginning (as it's unique). - test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - # disabling profiler by setting profile_batch to zero - callbacks=[ - callbacks.TensorBoard(log_dir=saving_filepath, profile_batch=0) - ]) - - # If it's chief, the summaries should be saved in the filepath; if not, - # the directory should be empty (although created). Using - # `file_io.list_directory()` since the directory may be created at this - # point. - test_obj.assertEqual( - bool(tf.io.gfile.listdir(saving_filepath)), is_chief()) - - tf.__internal__.distribute.multi_process_runner.run( - proc_tensorboard_saves_on_chief_but_not_otherwise, - cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(num_workers=2), - args=(self,)) - - @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['eager'])) - def test_tensorboard_can_still_save_to_temp_even_if_it_exists(self, mode): - - def proc_tensorboard_can_still_save_to_temp_even_if_it_exists(test_obj): - model, _, train_ds, steps = _model_setup(test_obj, file_format='') - num_epoch = 2 - - saving_filepath = os.path.join( - test_obj.get_temp_dir(), - 'logfile_%s' % (get_tf_config_task()['type'])) - - saving_filepath_for_temp = os.path.join(saving_filepath, 'workertemp_1') - os.mkdir(saving_filepath) - os.mkdir(saving_filepath_for_temp) - - # Verifies that even if `saving_filepath_for_temp` exists, tensorboard - # can still save to temporary directory. 
- test_obj.assertTrue(tf.io.gfile.exists(saving_filepath_for_temp)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) - - tf.__internal__.distribute.multi_process_runner.run( - proc_tensorboard_can_still_save_to_temp_even_if_it_exists, - cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(num_workers=2), - args=(self,)) - - @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['eager'])) - def test_tensorboard_works_with_same_file_path(self, mode): - - def proc_tensorboard_works_with_same_file_path(test_obj, saving_filepath): - model, _, train_ds, steps = _model_setup(test_obj, file_format='') - num_epoch = 2 - - # The saving_filepath shouldn't exist at the beginning (as it's unique). - test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) - - tf.__internal__.distribute.multi_process_runner.get_barrier().wait() - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) - - tf.__internal__.distribute.multi_process_runner.get_barrier().wait() - - test_obj.assertTrue(tf.io.gfile.listdir(saving_filepath)) - - saving_filepath = os.path.join(self.get_temp_dir(), 'logfile') - - tf.__internal__.distribute.multi_process_runner.run( - proc_tensorboard_works_with_same_file_path, - cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(num_workers=2), - args=(self, saving_filepath)) - - @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['eager'])) - def test_early_stopping(self, mode): - - def proc_early_stopping(test_obj): - - class EpochCounterCallback(callbacks.Callback): - - def on_epoch_begin(self, epoch, logs): - self.last_epoch = epoch - - model, _, train_ds, steps = _model_setup(test_obj, file_format='') - epoch_counter_cbk = EpochCounterCallback() - cbks = [ - callbacks.EarlyStopping( - monitor='loss', min_delta=0.05, patience=1, verbose=1), - epoch_counter_cbk - ] - - # Empirically, it is expected that `model.fit()` terminates around the - # 22th epoch. Asserting that it should have been stopped before the 50th - # epoch to avoid flakiness and be more predictable. - model.fit(x=train_ds, epochs=100, steps_per_epoch=steps, callbacks=cbks) - test_obj.assertLess(epoch_counter_cbk.last_epoch, 50) - - tf.__internal__.distribute.multi_process_runner.run( - proc_early_stopping, - cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(num_workers=2), - args=(self,)) - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + file_format=["h5", "tf"], + save_weights_only=[True, False], + ) + ) + def test_model_checkpoint_saves_on_chief_but_not_otherwise( + self, file_format, mode, save_weights_only + ): + def proc_model_checkpoint_saves_on_chief_but_not_otherwise( + test_obj, file_format + ): + + model, saving_filepath, train_ds, steps = _model_setup( + test_obj, file_format + ) + num_epoch = 2 + extension = os.path.splitext(saving_filepath)[1] + + # Incorporate type/index information and thread id in + # saving_filepath to ensure every worker has a unique path. 
Note + # that in normal use case the saving_filepath will be the same for + # all workers, but we use different ones here just to test out chief + # saves checkpoint but non-chief doesn't. + task_config = get_tf_config_task() + saving_filepath = os.path.join( + test_obj.get_temp_dir(), + "checkpoint_%s_%d%s" + % (task_config["type"], task_config["index"], extension), + ) + + # The saving_filepath shouldn't exist at the beginning (as it's + # unique). + test_obj.assertFalse(checkpoint_exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + validation_data=train_ds, + validation_steps=steps, + callbacks=[ + callbacks.ModelCheckpoint( + filepath=saving_filepath, + save_weights_only=save_weights_only, + ) + ], + ) + + # If it's chief, the model should be saved; if not, the model + # shouldn't. + test_obj.assertEqual(checkpoint_exists(saving_filepath), is_chief()) + + # If it's chief, the model should be saved (`write_filepath` should + # simply return `saving_filepath`); if not, i.e. for non-chief + # workers, the temporary path generated by `write_filepath` should + # no longer contain the checkpoint that has been deleted. + test_obj.assertEqual( + checkpoint_exists( + distributed_file_utils.write_filepath( + saving_filepath, model._distribution_strategy + ) + ), + is_chief(), + ) + + tf.__internal__.distribute.multi_process_runner.run( + proc_model_checkpoint_saves_on_chief_but_not_otherwise, + cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec( # noqa: E501 + num_workers=2 + ), + args=(self, file_format), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["eager"]) + ) + def test_model_checkpoint_works_with_same_file_path(self, mode): + def proc_model_checkpoint_works_with_same_file_path( + test_obj, saving_filepath + ): + model, _, train_ds, steps = _model_setup(test_obj, file_format="") + num_epoch = 2 + + # The saving_filepath shouldn't exist at the beginning (as it's + # unique). + test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)], + ) + + test_obj.assertTrue(tf.io.gfile.exists(saving_filepath)) + + saving_filepath = os.path.join(self.get_temp_dir(), "checkpoint") + + tf.__internal__.distribute.multi_process_runner.run( + proc_model_checkpoint_works_with_same_file_path, + cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec( # noqa: E501 + num_workers=2 + ), + args=(self, saving_filepath), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["eager"]) + ) + def test_backupandrestore_checkpoint_works_with_interruption(self, mode): + class InterruptingCallback(callbacks.Callback): + def on_epoch_begin(self, epoch, logs=None): + if epoch == 2: + raise RuntimeError("Interrupting!") + + class AssertCallback(callbacks.Callback): + def on_epoch_begin(self, epoch, logs=None): + # the interruption happened on epoch 2 as specified in + # InterruptingCallback, so the initial epoch after restart will + # begin at 2. + assert epoch > 1 + + def proc_model_checkpoint_works_with_same_file_path( + test_obj, saving_filepath + ): + model, _, train_ds, steps = _model_setup(test_obj, file_format="") + num_epoch = 4 + + # The saving_filepath shouldn't exist at the beginning (as it's + # unique). 
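# The fault-tolerance pattern exercised below, as a sketch (not part of
# this patch): `BackupAndRestore` writes training state under `backup_dir`
# at each epoch end, so re-issuing the same `fit()` call after an
# interruption resumes from the last completed epoch, e.g.
#   model.fit(train_ds, epochs=num_epoch,
#             callbacks=[callbacks.BackupAndRestore(backup_dir=bar_dir)])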
+ test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) + bar_dir = os.path.join(os.path.dirname(saving_filepath), "backup") + + try: + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[ + callbacks.ModelCheckpoint(filepath=saving_filepath), + callbacks.BackupAndRestore(backup_dir=bar_dir), + InterruptingCallback(), + ], + ) + except RuntimeError as e: + if "Interrupting!" not in str(e): + raise + + tf.__internal__.distribute.multi_process_runner.get_barrier().wait() + backup_filepath = os.path.join(bar_dir, "chief", "checkpoint") + test_obj.assertTrue(tf.io.gfile.exists(backup_filepath)) + test_obj.assertTrue(tf.io.gfile.exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[ + callbacks.ModelCheckpoint(filepath=saving_filepath), + callbacks.BackupAndRestore(backup_dir=bar_dir), + AssertCallback(), + ], + ) + tf.__internal__.distribute.multi_process_runner.get_barrier().wait() + test_obj.assertFalse(tf.io.gfile.exists(backup_filepath)) + test_obj.assertTrue(tf.io.gfile.exists(saving_filepath)) + + saving_filepath = os.path.join(self.get_temp_dir(), "checkpoint") + + tf.__internal__.distribute.multi_process_runner.run( + proc_model_checkpoint_works_with_same_file_path, + cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec( # noqa: E501 + num_workers=2 + ), + args=(self, saving_filepath), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["eager"]) + ) + def test_profiler_saves_on_both_chief_and_non_chief(self, mode): + def proc_profiler_saves_on_both_chief_and_non_chief(test_obj): + model, _, train_ds, steps = _model_setup(test_obj, file_format="") + num_epoch = 2 + + task_config = get_tf_config_task() + saving_filepath = os.path.join( + test_obj.get_temp_dir(), + "logfile_%s_%d" % (task_config["type"], task_config["index"]), + ) + + # The saving_filepath shouldn't exist at the beginning (as it's + # unique). + test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[ + callbacks.TensorBoard( + log_dir=saving_filepath, profile_batch=[2, 4] + ) + ], + ) + + # Profiler dir should be created on both chief and non-chief node + profiler_dir_path = os.path.join( + saving_filepath, "plugins", "profile" + ) + test_obj.assertTrue(tf.io.gfile.exists(profiler_dir_path)) + + tf.__internal__.distribute.multi_process_runner.run( + proc_profiler_saves_on_both_chief_and_non_chief, + cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec( # noqa: E501 + num_workers=2 + ), + args=(self,), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["eager"]) + ) + def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode): + def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj): + model, _, train_ds, steps = _model_setup(test_obj, file_format="") + num_epoch = 2 + + # Incorporate type/index information and thread id in + # saving_filepath to ensure every worker has a unique path. Note + # that in normal use case the saving_filepath will be the same for + # all workers, but we use different ones here just to test out chief + # saves summaries but non-chief doesn't. 
+ task_config = get_tf_config_task() + saving_filepath = os.path.join( + test_obj.get_temp_dir(), + "logfile_%s_%d" % (task_config["type"], task_config["index"]), + ) + + # The saving_filepath shouldn't exist at the beginning (as it's + # unique). + test_obj.assertFalse(tf.io.gfile.exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + # disabling profiler by setting profile_batch to zero + callbacks=[ + callbacks.TensorBoard( + log_dir=saving_filepath, profile_batch=0 + ) + ], + ) + + # If it's chief, the summaries should be saved in the filepath; if + # not, the directory should be empty (although created). Using + # `file_io.list_directory()` since the directory may be created at + # this point. + test_obj.assertEqual( + bool(tf.io.gfile.listdir(saving_filepath)), is_chief() + ) + + tf.__internal__.distribute.multi_process_runner.run( + proc_tensorboard_saves_on_chief_but_not_otherwise, + cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec( # noqa: E501 + num_workers=2 + ), + args=(self,), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["eager"]) + ) + def test_tensorboard_can_still_save_to_temp_even_if_it_exists(self, mode): + def proc_tensorboard_can_still_save_to_temp_even_if_it_exists(test_obj): + model, _, train_ds, steps = _model_setup(test_obj, file_format="") + num_epoch = 2 + + saving_filepath = os.path.join( + test_obj.get_temp_dir(), + f"logfile_{get_tf_config_task()['type']}", + ) + + saving_filepath_for_temp = os.path.join( + saving_filepath, "workertemp_1" + ) + os.mkdir(saving_filepath) + os.mkdir(saving_filepath_for_temp) + + # Verifies that even if `saving_filepath_for_temp` exists, + # tensorboard can still save to temporary directory. + test_obj.assertTrue(tf.io.gfile.exists(saving_filepath_for_temp)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)], + ) + + tf.__internal__.distribute.multi_process_runner.run( + proc_tensorboard_can_still_save_to_temp_even_if_it_exists, + cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec( # noqa: E501 + num_workers=2 + ), + args=(self,), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["eager"]) + ) + def test_tensorboard_works_with_same_file_path(self, mode): + def proc_tensorboard_works_with_same_file_path( + test_obj, saving_filepath + ): + model, _, train_ds, steps = _model_setup(test_obj, file_format="") + num_epoch = 2 + + # The saving_filepath shouldn't exist at the beginning (as it's + # unique). 
+            test_obj.assertFalse(tf.io.gfile.exists(saving_filepath))
+
+            tf.__internal__.distribute.multi_process_runner.get_barrier().wait()
+
+            model.fit(
+                x=train_ds,
+                epochs=num_epoch,
+                steps_per_epoch=steps,
+                callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)],
+            )
+
+            tf.__internal__.distribute.multi_process_runner.get_barrier().wait()
+
+            test_obj.assertTrue(tf.io.gfile.listdir(saving_filepath))
+
+        saving_filepath = os.path.join(self.get_temp_dir(), "logfile")
+
+        tf.__internal__.distribute.multi_process_runner.run(
+            proc_tensorboard_works_with_same_file_path,
+            cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(  # noqa: E501
+                num_workers=2
+            ),
+            args=(self, saving_filepath),
+        )
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(mode=["eager"])
+    )
+    def test_early_stopping(self, mode):
+        def proc_early_stopping(test_obj):
+            class EpochCounterCallback(callbacks.Callback):
+                def on_epoch_begin(self, epoch, logs):
+                    self.last_epoch = epoch
+
+            model, _, train_ds, steps = _model_setup(test_obj, file_format="")
+            epoch_counter_cbk = EpochCounterCallback()
+            cbks = [
+                callbacks.EarlyStopping(
+                    monitor="loss", min_delta=0.05, patience=1, verbose=1
+                ),
+                epoch_counter_cbk,
+            ]
+
+            # Empirically, it is expected that `model.fit()` terminates around
+            # the 22nd epoch. Asserting that it stops before the 50th epoch
+            # avoids flakiness and makes the test more predictable.
+            model.fit(
+                x=train_ds, epochs=100, steps_per_epoch=steps, callbacks=cbks
+            )
+            test_obj.assertLess(epoch_counter_cbk.last_epoch, 50)
+
+        tf.__internal__.distribute.multi_process_runner.run(
+            proc_early_stopping,
+            cluster_spec=tf.__internal__.distribute.multi_process_runner.create_cluster_spec(  # noqa: E501
+                num_workers=2
+            ),
+            args=(self,),
+        )
+
+
+if __name__ == "__main__":
+    tf.__internal__.distribute.multi_process_runner.test_main()
diff --git a/keras/distribute/multi_worker_test.py b/keras/distribute/multi_worker_test.py
index ae74ba22af5f..243b6b54737c 100644
--- a/keras/distribute/multi_worker_test.py
+++ b/keras/distribute/multi_worker_test.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """Test multi-worker Keras."""

-import tensorflow.compat.v2 as tf
-
 import collections
 import copy
 import functools
@@ -24,261 +22,409 @@
 import sys
 import threading

+import tensorflow.compat.v2 as tf
 from absl.testing import parameterized
-
 import keras
 from keras import backend
 from keras import callbacks
 from keras import metrics as metrics_module
 from keras import models
-from keras.optimizers import optimizer_v1
 from keras.distribute import multi_worker_testing_utils
-from keras.optimizers.optimizer_v2 import rmsprop
+from keras.optimizers import optimizer_v1
+from keras.optimizers.legacy import rmsprop
 from keras.utils import kpl_test_utils
-
-
 def _clone_and_build_model(model, strategy):
-  # The new "original" model in worker 0.
-  with strategy.scope():
-    cloned_model = models.clone_model(model)
-
-  # Compile and build model.
-  if isinstance(model.optimizer, optimizer_v1.TFOptimizer):
-    optimizer = model.optimizer
-    # TODO(yuefengz): figure out why the optimizer here is still a
-    # TFOptimizer.
- while isinstance(optimizer, optimizer_v1.TFOptimizer): - optimizer = optimizer.optimizer - optimizer = copy.deepcopy(optimizer) - else: - optimizer_config = model.optimizer.get_config() - optimizer = type(model.optimizer).from_config(optimizer_config) - - cloned_model.compile( - optimizer, - model.loss, - metrics=metrics_module.clone_metrics(model._compile_metrics), - loss_weights=model.loss_weights, - sample_weight_mode=model.sample_weight_mode, - weighted_metrics=metrics_module.clone_metrics( - model._compile_weighted_metrics)) - return cloned_model + # The new "original" model in worker 0. + with strategy.scope(): + cloned_model = models.clone_model(model) + + # Compile and build model. + if isinstance(model.optimizer, optimizer_v1.TFOptimizer): + optimizer = model.optimizer + # TODO(yuefengz): figure out why the optimizer here is still a + # TFOptimizer. + while isinstance(optimizer, optimizer_v1.TFOptimizer): + optimizer = optimizer.optimizer + optimizer = copy.deepcopy(optimizer) + else: + optimizer_config = model.optimizer.get_config() + optimizer = type(model.optimizer).from_config(optimizer_config) + + cloned_model.compile( + optimizer, + model.loss, + metrics=metrics_module.clone_metrics(model._compile_metrics), + loss_weights=model.loss_weights, + sample_weight_mode=model.sample_weight_mode, + weighted_metrics=metrics_module.clone_metrics( + model._compile_weighted_metrics + ), + ) + return cloned_model # TODO(b/123918215): Possibly merge this Callback with keras_test.Counter. class MultiWorkerVerificationCallback(callbacks.Callback): - """MultiWorkerVerificationCallback verifies the callbacks in multi-worker scheme. - - This Callback is intended to be used for verifying the callback is indeed - called the correct number of times in various task types. - - Attributes: - _task_dict: A nested dictionary storing the number of times a callback has - been called in specific task type, task index, and method name. - Look up structure is - task_name -> task_id -> tracking_method_name -> invoke_count - For example, a _task_dict of - { - 'ps': { - 0: { - 'on_epoch_begin': 2 - }, - 1: { - 'on_epoch_begin': 2 - } - }, - 'worker': { - 0: { - 'on_epoch_begin': 2 - }, - 1: { - 'on_epoch_begin': 2 - } - } - } - indicates the ps task has 'on_epoch_begin' called twice on each - of the two indices, and likewise for worker task. - """ - - # TODO(rchao): Add other method calls to verify. - METHODS_TO_VERIFY = ['on_epoch_begin'] - - def __init__(self, num_epoch, num_worker): - """Initialize a MultiWorkerVerificationCallback. - - Args: - num_epoch: Number of epochs this Callback is expected to be called for. - num_worker: Number of workers this Callback is expected to be called from. + """MultiWorkerVerificationCallback verifies the callbacks in multi-worker + scheme. + + This Callback is intended to be used for verifying the callback is indeed + called the correct number of times in various task types. + + Attributes: + _task_dict: A nested dictionary storing the number of times a callback has + been called in specific task type, task index, and method + name. Look up structure is + task_name -> task_id -> tracking_method_name -> invoke_count + For example, a _task_dict of + { + 'ps': { + 0: { + 'on_epoch_begin': 2 + }, + 1: { + 'on_epoch_begin': 2 + } + }, + 'worker': { + 0: { + 'on_epoch_begin': 2 + }, + 1: { + 'on_epoch_begin': 2 + } + } + } + indicates the ps task has 'on_epoch_begin' called twice on + each of the two indices, and likewise for worker task. 
""" - super().__init__() - self._num_epoch = num_epoch - self._num_worker = num_worker - self._task_dict = { - key: collections.defaultdict(lambda: collections.defaultdict(int)) - for key in ['ps', 'worker', 'chief'] - } - self._lock = threading.Lock() - self._is_between_graph = None - self.wrap_methods(self.METHODS_TO_VERIFY) - - @property - def is_between_graph(self): - return self._is_between_graph - - @is_between_graph.setter - def is_between_graph(self, is_between_graph): - self._is_between_graph = is_between_graph - - def wrap_methods(self, method_names): - """Wrap methods so that the counts of calls are tracked. - - Args: - method_names: A list of names of methods to track calls. - """ - for method_name in method_names: - method = getattr(self, method_name) - - def wrapped_method(method_to_wrap, name, *arg, **kwargs): - # Use lock to ensure += operation is thread-safe. - with self._lock: - task_config = json.loads(os.environ['TF_CONFIG'])['task'] - self._task_dict[task_config['type']][task_config['index']][name] += 1 - method_to_wrap(*arg, **kwargs) - - setattr(self, method_name, - functools.partial(wrapped_method, method, method_name)) - - def verify(self, test_case): - method_count_dict = { - method_name: self._num_epoch for method_name in self.METHODS_TO_VERIFY - } - assert self._is_between_graph is not None - if self._is_between_graph: - # TODO(b/124171024): In between-graph replication, by default only the - # chief calls callback. Fix this test to cover that, as well as the rare - # cases where all workers call. - worker_call_count = { - i: method_count_dict for i in range(0, self._num_worker) - } - else: - # If in-graph, only the first worker calls callback methods. - worker_call_count = {0: method_count_dict} - chief_call_count = {0: method_count_dict} - task_config = json.loads(os.environ['TF_CONFIG'])['task']['type'] - test_case.assertDictEqual( - self._task_dict, - { - # PS' callback is not supposed to be called. - 'ps': {}, - # Worker or chief should only be called on worker/chief. 
- 'worker': worker_call_count if task_config == 'worker' else {}, - 'chief': chief_call_count if task_config == 'chief' else {} - }) - - -class KerasMultiWorkerTestIndependentWorker(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], - strategy=[ - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, - ])) - def testSimpleModelIndependentWorkerSync(self, strategy): - verification_callback = MultiWorkerVerificationCallback( - num_epoch=2, - num_worker=len( - json.loads(os.environ['TF_CONFIG'])['cluster']['worker'])) - verification_callback.is_between_graph = \ - strategy.extended.experimental_between_graph - batch_size = 64 - steps = 2 - train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset( - batch_size, steps) - with strategy.scope(): - model = multi_worker_testing_utils.get_mnist_model((28, 28, 1)) - orig_loss, _ = model.evaluate(train_ds, steps=steps) - history = model.fit( - x=train_ds, - epochs=2, - steps_per_epoch=steps, - callbacks=[verification_callback]) - self.assertIsInstance(history, keras.callbacks.History) - trained_loss, _ = model.evaluate(train_ds, steps=steps) - self.assertLess(trained_loss, orig_loss) - - verification_callback.verify(self) - - -class KPLMultiWorkerTest(tf.test.TestCase, - parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], - use_adapt=[False], # TODO(b/180742437): Add tests for using adapt. - strategy=[ - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, - # TODO(b/183956672): Re-enable - # strategy_combinations.multi_worker_mirrored_2x2_gpu, - ])) - def testTrainAndServeWithKPL(self, use_adapt, strategy): - test_utils_obj = kpl_test_utils.DistributeKplTestUtils() - with strategy.scope(): - feature_mapper, label_mapper = test_utils_obj.define_kpls_for_training( - use_adapt) - model = test_utils_obj.define_model() - optimizer = rmsprop.RMSprop(learning_rate=0.1) - accuracy = keras.metrics.Accuracy() - - def dataset_fn(_): - return test_utils_obj.dataset_fn(feature_mapper, label_mapper) - - @tf.function - def train_step(iterator): - """The step function for one training step.""" - - def step_fn(inputs): - """The computation to run on each worker.""" - features, labels = inputs - with tf.GradientTape() as tape: - pred = model(features, training=True) - loss = keras.losses.binary_crossentropy(labels, pred) - loss = tf.nn.compute_average_loss(loss) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(list(zip(grads, model.trainable_variables))) - - actual_pred = tf.cast(tf.greater(pred, 0.5), tf.int64) - accuracy.update_state(labels, actual_pred) - - strategy.run(step_fn, args=(next(iterator),)) - - distributed_dataset = strategy.distribute_datasets_from_function( - dataset_fn) - distributed_iterator = iter(distributed_dataset) - num_epochs = 4 - num_steps = 7 - for _ in range(num_epochs): - accuracy.reset_state() - for _ in range(num_steps): - train_step(distributed_iterator) - - self.assertGreater(accuracy.result().numpy(), 0.5) - self.assertEqual(optimizer.iterations.numpy(), num_epochs * num_steps) - - # Test save/load/serving the trained model. 
- test_utils_obj.test_save_load_serving_model( - model, feature_mapper, test_utils_obj.define_reverse_lookup_layer()) - - -if __name__ == '__main__': - # Enable manual variable initialization to make sure variables are initialized - # by `init_restore_or_wait_for_variables`. - backend.manual_variable_initialization(True) - with tf.compat.v1.test.mock.patch.object(sys, 'exit', os._exit): - tf.__internal__.distribute.multi_process_runner.test_main() + + # TODO(rchao): Add other method calls to verify. + METHODS_TO_VERIFY = ["on_epoch_begin"] + + def __init__(self, num_epoch, num_worker): + """Initialize a MultiWorkerVerificationCallback. + + Args: + num_epoch: Number of epochs this Callback is expected to be called + for. + num_worker: Number of workers this Callback is expected to be called + from. + """ + super().__init__() + self._num_epoch = num_epoch + self._num_worker = num_worker + self._task_dict = { + key: collections.defaultdict(lambda: collections.defaultdict(int)) + for key in ["ps", "worker", "chief"] + } + self._lock = threading.Lock() + self._is_between_graph = None + self.wrap_methods(self.METHODS_TO_VERIFY) + + @property + def is_between_graph(self): + return self._is_between_graph + + @is_between_graph.setter + def is_between_graph(self, is_between_graph): + self._is_between_graph = is_between_graph + + def wrap_methods(self, method_names): + """Wrap methods so that the counts of calls are tracked. + + Args: + method_names: A list of names of methods to track calls. + """ + for method_name in method_names: + method = getattr(self, method_name) + + def wrapped_method(method_to_wrap, name, *arg, **kwargs): + # Use lock to ensure += operation is thread-safe. + with self._lock: + task_config = json.loads(os.environ["TF_CONFIG"])["task"] + self._task_dict[task_config["type"]][task_config["index"]][ + name + ] += 1 + method_to_wrap(*arg, **kwargs) + + setattr( + self, + method_name, + functools.partial(wrapped_method, method, method_name), + ) + + def verify(self, test_case): + method_count_dict = { + method_name: self._num_epoch + for method_name in self.METHODS_TO_VERIFY + } + assert self._is_between_graph is not None + if self._is_between_graph: + # TODO(b/124171024): In between-graph replication, by default only + # the chief calls callback. Fix this test to cover that, as well as + # the rare cases where all workers call. + worker_call_count = { + i: method_count_dict for i in range(0, self._num_worker) + } + else: + # If in-graph, only the first worker calls callback methods. + worker_call_count = {0: method_count_dict} + chief_call_count = {0: method_count_dict} + task_config = json.loads(os.environ["TF_CONFIG"])["task"]["type"] + test_case.assertDictEqual( + self._task_dict, + { + # PS' callback is not supposed to be called. + "ps": {}, + # Worker or chief should only be called on worker/chief. 
+ "worker": worker_call_count if task_config == "worker" else {}, + "chief": chief_call_count if task_config == "chief" else {}, + }, + ) + + +class KerasMultiWorkerTestIndependentWorker( + tf.test.TestCase, parameterized.TestCase +): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + strategy=[ + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, # noqa: E501 + ], + ) + ) + def testSimpleModelIndependentWorkerSync(self, strategy): + verification_callback = MultiWorkerVerificationCallback( + num_epoch=2, + num_worker=len( + json.loads(os.environ["TF_CONFIG"])["cluster"]["worker"] + ), + ) + verification_callback.is_between_graph = ( + strategy.extended.experimental_between_graph + ) + batch_size = 64 + steps = 2 + train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset( + batch_size, steps + ) + with strategy.scope(): + model = multi_worker_testing_utils.get_mnist_model((28, 28, 1)) + orig_loss, _ = model.evaluate(train_ds, steps=steps) + history = model.fit( + x=train_ds, + epochs=2, + steps_per_epoch=steps, + callbacks=[verification_callback], + ) + self.assertIsInstance(history, keras.callbacks.History) + trained_loss, _ = model.evaluate(train_ds, steps=steps) + self.assertLess(trained_loss, orig_loss) + + verification_callback.verify(self) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + strategy=[ + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, # noqa: E501 + ], + ) + ) + def test_distribution_reduction_method_auto_default_train_step( + self, strategy + ): + BATCH = 4 + EPOCHS = 1 + STEPS = 2 + + # Dataset's targets are [0, 1, 2, 3, 4, 5, 6, 7]: + train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset( + BATCH, STEPS, target_values="increasing" + ) + + # A model that always outputs `sum(inputs*0) + 1 = 1` + with strategy.scope(): + inputs = keras.Input(shape=(28, 28, 1)) + x = keras.layers.Flatten()(inputs) + x = keras.layers.Dense( + 1, kernel_initializer="zeros", bias_initializer="ones" + )(x) + model = keras.Model(inputs=inputs, outputs=x) + model.trainable = False + # model.distribute_reduction_method = 'auto' + + model.compile( + loss=keras.losses.MeanAbsoluteError( + reduction=keras.losses.losses_utils.ReductionV2.NONE + ), + optimizer=multi_worker_testing_utils.gradient_descent.SGD( + learning_rate=0.001 + ), + metrics=["mse"], + ) + + # For every output x_i = 1, and increasing target values in [0, 8): + # loss_i = |i-1| + # loss = (|0-1| + |1-1| + |2-1| + ... 
|7-1|) / (BATCH*STEPS)
+        #      = (1+0+1+2+3+4+5+6) / 8 = 2.75
+        orig_loss, _ = model.evaluate(train_ds, steps=STEPS)
+        self.assertEqual(2.75, orig_loss)
+
+        history = model.fit(train_ds, epochs=EPOCHS, steps_per_epoch=STEPS)
+        self.assertAllClose(history.history["loss"], [2.75] * EPOCHS)
+
+        trained_loss, _ = model.evaluate(train_ds, steps=STEPS)
+        self.assertEqual(2.75, trained_loss)
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            mode=["eager"],
+            strategy=[
+                tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu,  # noqa: E501
+                tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu,  # noqa: E501
+            ],
+        )
+    )
+    def test_distribution_reduction_method_auto_custom_train_step(
+        self, strategy
+    ):
+        BATCH = 4
+        EPOCHS = 1
+        STEPS = 2
+
+        # Dataset's targets are [0, 1, 2, 3, 4, 5, 6, 7]:
+        train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(
+            BATCH, STEPS, target_values="increasing"
+        )
+
+        # A model that has loss=sum(targets) / BATCH:
+        class MyModel(keras.Model):
+            def train_step(self, data):
+                _, y = data
+                loss_value = tf.cast(y, tf.float32)
+                loss_value = tf.nn.compute_average_loss(
+                    loss_value, global_batch_size=BATCH
+                )
+                return {"loss": loss_value}
+
+            def test_step(self, data):
+                _, y = data
+                loss_value = tf.cast(y, tf.float32)
+                loss_value = tf.nn.compute_average_loss(
+                    loss_value, global_batch_size=BATCH
+                )
+                return {"loss": loss_value}
+
+        with strategy.scope():
+            inputs = keras.Input(shape=(28, 28, 1))
+            x = keras.layers.Flatten()(inputs)
+            x = keras.layers.Dense(
+                1, kernel_initializer="ones", bias_initializer="ones"
+            )(x)
+            model = MyModel(inputs=inputs, outputs=x)
+            # model.distribute_reduction_method = 'auto'
+
+        model.compile(
+            optimizer=multi_worker_testing_utils.gradient_descent.SGD(
+                learning_rate=0.001
+            ),
+        )
+
+        # For epochs=1 steps=2 replicas=2 batch=4, and increasing target vals,
+        # loss_e0_s0_r0 = [0+1]/BATCH = 1/4
+        # loss_e0_s0_r1 = [2+3]/BATCH = 5/4
+        # loss_e0_s0 = 1/4 + 5/4 = 1.5
+        # loss_e0_s1_r0 = [4+5]/BATCH = 9/4
+        # loss_e0_s1_r1 = [6+7]/BATCH = 13/4
+        # loss_e0_s1 = 9/4 + 13/4 = 5.5
+        # loss_e0 = last([1.5, 5.5])
+        history = model.fit(train_ds, epochs=EPOCHS, steps_per_epoch=STEPS)
+        self.assertAllClose([5.5], history.history["loss"])
+
+        eval_output = model.evaluate(train_ds, steps=STEPS)
+        self.assertAllClose(5.5, eval_output)
+
+
+class KPLMultiWorkerTest(tf.test.TestCase, parameterized.TestCase):
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            mode=["eager"],
+            use_adapt=[False],  # TODO(b/180742437): Add tests for using adapt.
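# A quick sanity check of the loss value asserted in the custom train_step
# test above (comment-only sketch, not part of this patch): with targets
# 0..7, BATCH=4, and the two per-replica losses summed at each step,
#   step 0: (0+1)/4 + (2+3)/4 = 1.5
#   step 1: (4+5)/4 + (6+7)/4 = 5.5
# and `fit()` reports the last step's value, hence history == [5.5].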
+ strategy=[ + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, # noqa: E501 + # TODO(b/183956672): Re-enable + # strategy_combinations.multi_worker_mirrored_2x2_gpu, + ], + ) + ) + def testTrainAndServeWithKPL(self, use_adapt, strategy): + test_utils_obj = kpl_test_utils.DistributeKplTestUtils() + with strategy.scope(): + ( + feature_mapper, + label_mapper, + ) = test_utils_obj.define_kpls_for_training(use_adapt) + model = test_utils_obj.define_model() + optimizer = rmsprop.RMSprop(learning_rate=0.1) + accuracy = keras.metrics.Accuracy() + + def dataset_fn(_): + return test_utils_obj.dataset_fn(feature_mapper, label_mapper) + + @tf.function + def train_step(iterator): + """The step function for one training step.""" + + def step_fn(inputs): + """The computation to run on each worker.""" + features, labels = inputs + with tf.GradientTape() as tape: + pred = model(features, training=True) + loss = keras.losses.binary_crossentropy(labels, pred) + loss = tf.nn.compute_average_loss(loss) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + list(zip(grads, model.trainable_variables)) + ) + + actual_pred = tf.cast(tf.greater(pred, 0.5), tf.int64) + accuracy.update_state(labels, actual_pred) + + strategy.run(step_fn, args=(next(iterator),)) + + distributed_dataset = strategy.distribute_datasets_from_function( + dataset_fn + ) + distributed_iterator = iter(distributed_dataset) + num_epochs = 4 + num_steps = 7 + for _ in range(num_epochs): + accuracy.reset_state() + for _ in range(num_steps): + train_step(distributed_iterator) + + self.assertGreater(accuracy.result().numpy(), 0.5) + self.assertEqual( + optimizer.iterations.numpy(), num_epochs * num_steps + ) + + # Test save/load/serving the trained model. + test_utils_obj.test_save_load_serving_model( + model, feature_mapper, test_utils_obj.define_reverse_lookup_layer() + ) + + +if __name__ == "__main__": + # Enable manual variable initialization to make sure variables are + # initialized by `init_restore_or_wait_for_variables`. 
+ backend.manual_variable_initialization(True) + with tf.compat.v1.test.mock.patch.object(sys, "exit", os._exit): + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/multi_worker_testing_utils.py b/keras/distribute/multi_worker_testing_utils.py index e9b4e319a509..c0fd9d19d969 100644 --- a/keras/distribute/multi_worker_testing_utils.py +++ b/keras/distribute/multi_worker_testing_utils.py @@ -14,224 +14,259 @@ # ============================================================================== """Utilities for testing multi-worker distribution strategies with Keras.""" -import tensorflow.compat.v2 as tf - import threading import unittest + +import tensorflow.compat.v2 as tf + import keras -from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver -from keras.optimizers.optimizer_v2 import gradient_descent -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training.server_lib import ClusterSpec +from keras.optimizers.legacy import gradient_descent +# isort: off +from tensorflow.python.distribute.cluster_resolver import ( + SimpleClusterResolver, +) +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training.server_lib import ( + ClusterSpec, +) _portpicker_import_error = None try: - import portpicker # pylint: disable=g-import-not-at-top -except (ImportError, ModuleNotFoundError) as _error: # pylint: disable=invalid-name - _portpicker_import_error = _error - portpicker = None + import portpicker +except ( + ImportError, + ModuleNotFoundError, +) as _error: + _portpicker_import_error = _error + portpicker = None ASSIGNED_PORTS = set() lock = threading.Lock() -def mnist_synthetic_dataset(batch_size, steps_per_epoch): - """Generate synthetic MNIST dataset for testing.""" - # train dataset - x_train = tf.ones([batch_size * steps_per_epoch, 28, 28, 1], - dtype=tf.float32) - y_train = tf.ones([batch_size * steps_per_epoch, 1], - dtype=tf.int32) - train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - train_ds = train_ds.repeat() - # train_ds = train_ds.shuffle(100) - train_ds = train_ds.batch(64, drop_remainder=True) +def mnist_synthetic_dataset( + batch_size, steps_per_epoch, target_values="constant" +): + """Generate synthetic MNIST dataset for testing.""" + # train dataset + x_train = tf.ones( + [batch_size * steps_per_epoch, 28, 28, 1], dtype=tf.float32 + ) + if target_values == "constant": + y_train = tf.ones([batch_size * steps_per_epoch, 1], dtype=tf.int32) + elif target_values == "increasing": + y_train = tf.reshape( + tf.range(batch_size * steps_per_epoch, dtype=tf.int32), (-1, 1) + ) + else: + raise ValueError( + 'Unknown value for `target_values` "' + + str(target_values) + + '". Valid options are "constant" and "increasing".' 
+ ) + + train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + train_ds = train_ds.repeat() + # train_ds = train_ds.shuffle(100) + train_ds = train_ds.batch(batch_size, drop_remainder=True) - # eval dataset - x_test = tf.random.uniform([10000, 28, 28, 1], dtype=tf.float32) - y_test = tf.random.uniform([10000, 1], - minval=0, - maxval=9, - dtype=tf.int32) - eval_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)) - eval_ds = eval_ds.batch(64, drop_remainder=True) + # eval dataset + x_test = tf.random.uniform([10000, 28, 28, 1], dtype=tf.float32) + y_test = tf.random.uniform([10000, 1], minval=0, maxval=9, dtype=tf.int32) + eval_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + eval_ds = eval_ds.batch(batch_size, drop_remainder=True) - return train_ds, eval_ds + return train_ds, eval_ds def get_mnist_model(input_shape): - """Define a deterministically-initialized CNN model for MNIST testing.""" - inputs = keras.Input(shape=input_shape) - x = keras.layers.Conv2D( - 32, - kernel_size=(3, 3), - activation="relu", - kernel_initializer=keras.initializers.TruncatedNormal(seed=99))(inputs) - x = keras.layers.BatchNormalization()(x) - x = keras.layers.Flatten()(x) + keras.layers.Flatten()(x) - x = keras.layers.Dense( - 10, - activation="softmax", - kernel_initializer=keras.initializers.TruncatedNormal(seed=99))(x) - model = keras.Model(inputs=inputs, outputs=x) - - # TODO(yuefengz): optimizer with slot variables doesn't work because of - # optimizer's bug. - # TODO(yuefengz): we should not allow non-v2 optimizer. - model.compile( - loss=keras.losses.sparse_categorical_crossentropy, - optimizer=gradient_descent.SGD(learning_rate=0.001), - metrics=["accuracy"]) - return model + """Define a deterministically-initialized CNN model for MNIST testing.""" + inputs = keras.Input(shape=input_shape) + x = keras.layers.Conv2D( + 32, + kernel_size=(3, 3), + activation="relu", + kernel_initializer=keras.initializers.TruncatedNormal(seed=99), + )(inputs) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Flatten()(x) + keras.layers.Flatten()(x) + x = keras.layers.Dense( + 10, + activation="softmax", + kernel_initializer=keras.initializers.TruncatedNormal(seed=99), + )(x) + model = keras.Model(inputs=inputs, outputs=x) + + # TODO(yuefengz): optimizer with slot variables doesn't work because of + # optimizer's bug. + # TODO(yuefengz): we should not allow non-v2 optimizer. 
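# Usage sketch for the `target_values` switch added to
# `mnist_synthetic_dataset` above (not part of this patch):
#   train_ds, _ = mnist_synthetic_dataset(4, 2)  # all-ones labels
#   train_ds, _ = mnist_synthetic_dataset(
#       4, 2, target_values="increasing")  # labels 0..7, one per example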
+ model.compile( + loss=keras.losses.sparse_categorical_crossentropy, + optimizer=gradient_descent.SGD(learning_rate=0.001), + metrics=["accuracy"], + ) + return model def make_parameter_server_cluster(num_workers, num_ps): - cluster_def = create_in_process_cluster( - num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc") - return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc") + cluster_def = create_in_process_cluster( + num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc" + ) + return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc") def pick_unused_port(): - """Returns an unused and unassigned local port.""" - if _portpicker_import_error: - raise _portpicker_import_error # pylint: disable=raising-bad-type - - global ASSIGNED_PORTS - with lock: - while True: - try: - port = portpicker.pick_unused_port() - except portpicker.NoFreePortFoundError: - raise unittest.SkipTest("Flakes in portpicker library do not represent " - "TensorFlow errors.") - if port > 10000 and port not in ASSIGNED_PORTS: - ASSIGNED_PORTS.add(port) - logging.info("Using local port %r", port) - return port - - -def _create_cluster(num_workers, - num_ps, - has_chief=False, - has_eval=False, - protocol="grpc", - worker_config=None, - ps_config=None, - eval_config=None, - worker_name="worker", - ps_name="ps", - chief_name="chief"): - """Creates and starts local servers and returns the cluster_spec dict.""" - if _portpicker_import_error: - raise _portpicker_import_error # pylint: disable=raising-bad-type - worker_ports = [pick_unused_port() for _ in range(num_workers)] - ps_ports = [pick_unused_port() for _ in range(num_ps)] - - cluster_dict = {} - if num_workers > 0: - cluster_dict[worker_name] = ["localhost:%s" % port for port in worker_ports] - if num_ps > 0: - cluster_dict[ps_name] = ["localhost:%s" % port for port in ps_ports] - if has_eval: - cluster_dict["evaluator"] = ["localhost:%s" % pick_unused_port()] - if has_chief: - cluster_dict[chief_name] = ["localhost:%s" % pick_unused_port()] - - cs = tf.train.ClusterSpec(cluster_dict) - - for i in range(num_workers): - tf.distribute.Server( - cs, - job_name=worker_name, - protocol=protocol, - task_index=i, - config=worker_config, - start=True) - - for i in range(num_ps): - tf.distribute.Server( - cs, - job_name=ps_name, - protocol=protocol, - task_index=i, - config=ps_config, - start=True) - - if has_chief: - tf.distribute.Server( - cs, - job_name=chief_name, - protocol=protocol, - task_index=0, - config=worker_config, - start=True) - - if has_eval: - tf.distribute.Server( - cs, - job_name="evaluator", - protocol=protocol, - task_index=0, - config=eval_config, - start=True) - - return cluster_dict - - -def create_in_process_cluster(num_workers, - num_ps, - has_chief=False, - has_eval=False, - rpc_layer="grpc"): - """Create an in-process cluster that consists of only standard server.""" - # Leave some memory for cuda runtime. - gpu_mem_frac = 0.7 / (num_workers + int(has_chief) + int(has_eval)) - worker_config = tf.compat.v1.ConfigProto() - worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac - - # The cluster may hang if workers don't have enough inter_op threads. See - # b/172296720 for more details. - if worker_config.inter_op_parallelism_threads < num_workers + 1: - worker_config.inter_op_parallelism_threads = num_workers + 1 - - # Enable collective ops which has no impact on non-collective ops. 
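# Usage sketch (not part of this patch): the resolver returned by
# `make_parameter_server_cluster` above is meant to seed a parameter server
# strategy in tests, along the lines of
#   resolver = make_parameter_server_cluster(num_workers=2, num_ps=1)
#   strategy = tf.distribute.experimental.ParameterServerStrategy(resolver)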
- if has_chief: - worker_config.experimental.collective_group_leader = ( - "/job:chief/replica:0/task:0") - else: - worker_config.experimental.collective_group_leader = ( - "/job:worker/replica:0/task:0") - - ps_config = tf.compat.v1.ConfigProto() - ps_config.device_count["GPU"] = 0 - - eval_config = tf.compat.v1.ConfigProto() - eval_config.experimental.collective_group_leader = "" - - # Create in-process servers. Once an in-process tensorflow server is created, - # there is no way to terminate it. So we create one cluster per test process. - # We could've started the server in another process, we could then kill that - # process to terminate the server. The reasons why we don"t want multiple - # processes are - # 1) it is more difficult to manage these processes; - # 2) there is something global in CUDA such that if we initialize CUDA in the - # parent process, the child process cannot initialize it again and thus cannot - # use GPUs (https://stackoverflow.com/questions/22950047). - cluster = None - try: - cluster = _create_cluster( - num_workers, - num_ps=num_ps, - has_chief=has_chief, - has_eval=has_eval, - worker_config=worker_config, - ps_config=ps_config, - eval_config=eval_config, - protocol=rpc_layer) - except tf.errors.UnknownError as e: - if "Could not start gRPC server" in e.message: - raise unittest.SkipTest("Cannot start std servers.") + """Returns an unused and unassigned local port.""" + if _portpicker_import_error: + raise _portpicker_import_error + + global ASSIGNED_PORTS + with lock: + while True: + try: + port = portpicker.pick_unused_port() + except portpicker.NoFreePortFoundError: + raise unittest.SkipTest( + "Flakes in portpicker library do not represent " + "TensorFlow errors." + ) + if port > 10000 and port not in ASSIGNED_PORTS: + ASSIGNED_PORTS.add(port) + logging.info("Using local port %r", port) + return port + + +def _create_cluster( + num_workers, + num_ps, + has_chief=False, + has_eval=False, + protocol="grpc", + worker_config=None, + ps_config=None, + eval_config=None, + worker_name="worker", + ps_name="ps", + chief_name="chief", +): + """Creates and starts local servers and returns the cluster_spec dict.""" + if _portpicker_import_error: + raise _portpicker_import_error + worker_ports = [pick_unused_port() for _ in range(num_workers)] + ps_ports = [pick_unused_port() for _ in range(num_ps)] + + cluster_dict = {} + if num_workers > 0: + cluster_dict[worker_name] = [ + f"localhost:{port}" for port in worker_ports + ] + if num_ps > 0: + cluster_dict[ps_name] = [f"localhost:{port}" for port in ps_ports] + if has_eval: + cluster_dict["evaluator"] = [f"localhost:{pick_unused_port()}"] + if has_chief: + cluster_dict[chief_name] = [f"localhost:{pick_unused_port()}"] + + cs = tf.train.ClusterSpec(cluster_dict) + + for i in range(num_workers): + tf.distribute.Server( + cs, + job_name=worker_name, + protocol=protocol, + task_index=i, + config=worker_config, + start=True, + ) + + for i in range(num_ps): + tf.distribute.Server( + cs, + job_name=ps_name, + protocol=protocol, + task_index=i, + config=ps_config, + start=True, + ) + + if has_chief: + tf.distribute.Server( + cs, + job_name=chief_name, + protocol=protocol, + task_index=0, + config=worker_config, + start=True, + ) + + if has_eval: + tf.distribute.Server( + cs, + job_name="evaluator", + protocol=protocol, + task_index=0, + config=eval_config, + start=True, + ) + + return cluster_dict + + +def create_in_process_cluster( + num_workers, num_ps, has_chief=False, has_eval=False, rpc_layer="grpc" +): + """Create an 
in-process cluster that consists of only standard servers."""
+    # Leave some memory for the CUDA runtime.
+    gpu_mem_frac = 0.7 / (num_workers + int(has_chief) + int(has_eval))
+    worker_config = tf.compat.v1.ConfigProto()
+    worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
+
+    # The cluster may hang if workers don't have enough inter_op threads. See
+    # b/172296720 for more details.
+    if worker_config.inter_op_parallelism_threads < num_workers + 1:
+        worker_config.inter_op_parallelism_threads = num_workers + 1
+
+    # Enable collective ops which has no impact on non-collective ops.
+    if has_chief:
+        worker_config.experimental.collective_group_leader = (
+            "/job:chief/replica:0/task:0"
+        )
     else:
-        raise
-    return cluster
+        worker_config.experimental.collective_group_leader = (
+            "/job:worker/replica:0/task:0"
+        )
+
+    ps_config = tf.compat.v1.ConfigProto()
+    ps_config.device_count["GPU"] = 0
+
+    eval_config = tf.compat.v1.ConfigProto()
+    eval_config.experimental.collective_group_leader = ""
+
+    # Create in-process servers. Once an in-process TensorFlow server is
+    # created, there is no way to terminate it. So we create one cluster per
+    # test process. We could have started the server in another process and
+    # then killed that process to terminate the server. The reasons why we
+    # don't want multiple processes are
+    # 1) it is more difficult to manage these processes;
+    # 2) there is something global in CUDA such that if we initialize CUDA in
+    # the parent process, the child process cannot initialize it again and thus
+    # cannot use GPUs (https://stackoverflow.com/questions/22950047).
+    cluster = None
+    try:
+        cluster = _create_cluster(
+            num_workers,
+            num_ps=num_ps,
+            has_chief=has_chief,
+            has_eval=has_eval,
+            worker_config=worker_config,
+            ps_config=ps_config,
+            eval_config=eval_config,
+            protocol=rpc_layer,
+        )
+    except tf.errors.UnknownError as e:
+        if "Could not start gRPC server" in e.message:
+            raise unittest.SkipTest("Cannot start std servers.")
+        else:
+            raise
+    return cluster
diff --git a/keras/distribute/optimizer_combinations.py b/keras/distribute/optimizer_combinations.py
index 8a585a00dea4..9df667080acd 100644
--- a/keras/distribute/optimizer_combinations.py
+++ b/keras/distribute/optimizer_combinations.py
@@ -14,97 +14,123 @@
 # ==============================================================================
 """Strategy and optimizer combinations for combinations.combine()."""
-from keras.optimizers.optimizer_experimental import adam as adam_experimental
-from keras.optimizers.optimizer_v2 import adadelta as adadelta_keras_v2
-from keras.optimizers.optimizer_v2 import adagrad as adagrad_keras_v2
-from keras.optimizers.optimizer_v2 import adam as adam_keras_v2
-from keras.optimizers.optimizer_v2 import adamax as adamax_keras_v2
-from keras.optimizers.optimizer_v2 import ftrl as ftrl_keras_v2
-from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_keras_v2
-from keras.optimizers.optimizer_v2 import nadam as nadam_keras_v2
-from keras.optimizers.optimizer_v2 import rmsprop as rmsprop_keras_v2
 import tensorflow.compat.v2 as tf
+from keras.optimizers import adam as adam_experimental
+from keras.optimizers.legacy import adadelta as adadelta_keras_v2
+from keras.optimizers.legacy import adagrad as adagrad_keras_v2
+from keras.optimizers.legacy import adam as adam_keras_v2
+from keras.optimizers.legacy import adamax as adamax_keras_v2
+from keras.optimizers.legacy import ftrl as ftrl_keras_v2
+from keras.optimizers.legacy import (
+
gradient_descent as gradient_descent_keras_v2, +) +from keras.optimizers.legacy import nadam as nadam_keras_v2 +from keras.optimizers.legacy import rmsprop as rmsprop_keras_v2 -gradient_descent_optimizer_v1_fn = tf.__internal__.test.combinations.NamedObject( - "GradientDescentV1", - lambda: tf.compat.v1.train.GradientDescentOptimizer(0.001)) +gradient_descent_optimizer_v1_fn = ( + tf.__internal__.test.combinations.NamedObject( + "GradientDescentV1", + lambda: tf.compat.v1.train.GradientDescentOptimizer(0.001), + ) +) adagrad_optimizer_v1_fn = tf.__internal__.test.combinations.NamedObject( - "AdagradV1", lambda: tf.compat.v1.train.AdagradOptimizer(0.001)) + "AdagradV1", lambda: tf.compat.v1.train.AdagradOptimizer(0.001) +) adam_optimizer_v1_fn = tf.__internal__.test.combinations.NamedObject( - "AdamV1", lambda: tf.compat.v1.train.AdamOptimizer(0.001, epsilon=1)) + "AdamV1", lambda: tf.compat.v1.train.AdamOptimizer(0.001, epsilon=1) +) ftrl_optimizer_v1_fn = tf.__internal__.test.combinations.NamedObject( - "FtrlV1", lambda: tf.compat.v1.train.FtrlOptimizer(0.001)) + "FtrlV1", lambda: tf.compat.v1.train.FtrlOptimizer(0.001) +) rmsprop_optimizer_v1_fn = tf.__internal__.test.combinations.NamedObject( - "RmsPropV1", lambda: tf.compat.v1.train.RMSPropOptimizer(0.001)) + "RmsPropV1", lambda: tf.compat.v1.train.RMSPropOptimizer(0.001) +) # TODO(shiningsun): consider adding the other v1 optimizers optimizers_v1 = [ - gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn, - ftrl_optimizer_v1_fn, rmsprop_optimizer_v1_fn + gradient_descent_optimizer_v1_fn, + adagrad_optimizer_v1_fn, + ftrl_optimizer_v1_fn, + rmsprop_optimizer_v1_fn, ] adadelta_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001)) + "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001) +) adagrad_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001)) + "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001) +) adam_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "AdamKerasV2", lambda: adam_keras_v2.Adam(0.001, epsilon=1.0)) + "AdamKerasV2", lambda: adam_keras_v2.Adam(0.001, epsilon=1.0) +) adam_experimental_fn = tf.__internal__.test.combinations.NamedObject( - "AdamExperimental", lambda: adam_experimental.Adam(0.001)) + "AdamExperimental", lambda: adam_experimental.Adam(0.001) +) adamax_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "AdamaxKerasV2", lambda: adamax_keras_v2.Adamax(0.001, epsilon=1.0)) + "AdamaxKerasV2", lambda: adamax_keras_v2.Adamax(0.001, epsilon=1.0) +) nadam_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "NadamKerasV2", lambda: nadam_keras_v2.Nadam(0.001, epsilon=1.0)) + "NadamKerasV2", lambda: nadam_keras_v2.Nadam(0.001, epsilon=1.0) +) ftrl_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "FtrlKerasV2", lambda: ftrl_keras_v2.Ftrl(0.001)) -gradient_descent_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "GradientDescentKerasV2", lambda: gradient_descent_keras_v2.SGD(0.001)) + "FtrlKerasV2", lambda: ftrl_keras_v2.Ftrl(0.001) +) +gradient_descent_optimizer_keras_v2_fn = ( + tf.__internal__.test.combinations.NamedObject( + "GradientDescentKerasV2", lambda: gradient_descent_keras_v2.SGD(0.001) + ) +) rmsprop_optimizer_keras_v2_fn = tf.__internal__.test.combinations.NamedObject( - "RmsPropKerasV2", lambda: 
rmsprop_keras_v2.RMSprop(0.001))
+    "RmsPropKerasV2", lambda: rmsprop_keras_v2.RMSprop(0.001)
+)
 
 # TODO(shiningsun): consider adding the other v2 optimizers
 optimizers_v2 = [
-    gradient_descent_optimizer_keras_v2_fn, adagrad_optimizer_keras_v2_fn
+    gradient_descent_optimizer_keras_v2_fn,
+    adagrad_optimizer_keras_v2_fn,
 ]
 
 optimizers_v1_and_v2 = optimizers_v1 + optimizers_v2
 
 
 def distributions_and_v1_optimizers():
-  """A common set of combination with DistributionStrategies and Optimizers."""
-  return tf.__internal__.test.combinations.combine(
-      distribution=[
-          tf.__internal__.distribute.combinations.one_device_strategy,
-          tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,
-          tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus,
-          tf.__internal__.distribute.combinations
-          .mirrored_strategy_with_two_gpus_no_merge_call,
-      ],
-      optimizer_fn=optimizers_v1)
+    """A common set of combinations with DistributionStrategies and
+    Optimizers."""
+    return tf.__internal__.test.combinations.combine(
+        distribution=[
+            tf.__internal__.distribute.combinations.one_device_strategy,
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,  # noqa: E501
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus,  # noqa: E501
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus_no_merge_call,  # noqa: E501
+        ],
+        optimizer_fn=optimizers_v1,
+    )
 
 
 def distributions_and_v2_optimizers():
-  """A common set of combination with DistributionStrategies and Optimizers."""
-  return tf.__internal__.test.combinations.combine(
-      distribution=[
-          tf.__internal__.distribute.combinations.one_device_strategy,
-          tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,
-          tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus,
-          tf.__internal__.distribute.combinations
-          .mirrored_strategy_with_two_gpus_no_merge_call,
-      ],
-      optimizer_fn=optimizers_v2)
+    """A common set of combinations with DistributionStrategies and
+    Optimizers."""
+    return tf.__internal__.test.combinations.combine(
+        distribution=[
+            tf.__internal__.distribute.combinations.one_device_strategy,
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,  # noqa: E501
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus,  # noqa: E501
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus_no_merge_call,  # noqa: E501
+        ],
+        optimizer_fn=optimizers_v2,
+    )
 
 
 def distributions_and_v1_and_v2_optimizers():
-  """A common set of combination with DistributionStrategies and Optimizers."""
-  return tf.__internal__.test.combinations.combine(
-      distribution=[
-          tf.__internal__.distribute.combinations.one_device_strategy,
-          tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,
-          tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus,
-          tf.__internal__.distribute.combinations
-          .mirrored_strategy_with_two_gpus_no_merge_call,
-      ],
-      optimizer_fn=optimizers_v1_and_v2)
+    """A common set of combinations with DistributionStrategies and
+    Optimizers."""
+    return tf.__internal__.test.combinations.combine(
+        distribution=[
+            tf.__internal__.distribute.combinations.one_device_strategy,
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu,  # noqa: E501
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus,  # noqa: E501
+            tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus_no_merge_call,  # noqa: E501
+        ],
+
optimizer_fn=optimizers_v1_and_v2, + ) diff --git a/keras/distribute/parameter_server_evaluation_test.py b/keras/distribute/parameter_server_evaluation_test.py index d1e67ea01705..647d35d85a2a 100644 --- a/keras/distribute/parameter_server_evaluation_test.py +++ b/keras/distribute/parameter_server_evaluation_test.py @@ -16,161 +16,180 @@ import time +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf -from tensorflow.python.distribute import multi_worker_test_base -from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver +# isort: off +from tensorflow.python.distribute import ( + multi_worker_test_base, +) +from tensorflow.python.distribute.cluster_resolver import ( + SimpleClusterResolver, +) from tensorflow.python.ops import resource_variable_ops # TODO(yuefengz): move the following implementation to Keras core. class MeanMetricSpec(tf.TypeSpec): + def __init__(self, config, weights): + self._config = config + self._weights = weights - def __init__(self, config, weights): - self._config = config - self._weights = weights - - def _serialize(self): - return (self._config, self._weights) + def _serialize(self): + return (self._config, self._weights) - @property - def value_type(self): - return MeanMetricAsCompositeTensor + @property + def value_type(self): + return MeanMetricAsCompositeTensor - @property - def _component_specs(self): - return self._weights + @property + def _component_specs(self): + return self._weights - def _to_components(self, value): - return value.weights + def _to_components(self, value): + return value.weights - def _from_components(self, weights): - counter = [0] + def _from_components(self, weights): + counter = [0] - def fetch_variable(next_creator, **kwargs): - del next_creator, kwargs - # TODO(yuefengz): verify the var creation order matches the weights - # property - var = weights[counter[0]] - counter[0] += 1 - return var + def fetch_variable(next_creator, **kwargs): + del next_creator, kwargs + # TODO(yuefengz): verify the var creation order matches the weights + # property + var = weights[counter[0]] + counter[0] += 1 + return var - with tf.variable_creator_scope(fetch_variable): - ret = MeanMetricAsCompositeTensor.from_config(self._config) - assert len(weights) == len(ret.weights) - return ret + with tf.variable_creator_scope(fetch_variable): + ret = MeanMetricAsCompositeTensor.from_config(self._config) + assert len(weights) == len(ret.weights) + return ret -class MeanMetricAsCompositeTensor(keras.metrics.Mean, - tf.__internal__.CompositeTensor): +class MeanMetricAsCompositeTensor( + keras.metrics.Mean, tf.__internal__.CompositeTensor +): + def element_spec(self): + raise NotImplementedError("element_spec not implemented") - def element_spec(self): - raise NotImplementedError("element_spec not implemented") - - @property - def _type_spec(self): - weight_specs = [ - resource_variable_ops.VariableSpec.from_value(w) for w in self.weights] - return MeanMetricSpec(self.get_config(), weight_specs) + @property + def _type_spec(self): + weight_specs = [ + resource_variable_ops.VariableSpec.from_value(w) + for w in self.weights + ] + return MeanMetricSpec(self.get_config(), weight_specs) @test_utils.run_v2_only class EvaluationTest(tf.test.TestCase): - - @classmethod - def setUpClass(cls): - super(EvaluationTest, cls).setUpClass() - cls._cluster = multi_worker_test_base.create_multi_process_cluster( - num_workers=3, num_ps=2, rpc_layer="grpc") - cls._cluster_def = 
cls._cluster.cluster_resolver.cluster_spec().as_dict() - cluster_resolver = SimpleClusterResolver( - tf.train.ClusterSpec(cls._cluster_def), rpc_layer="grpc") - - cls.strategy = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver) - cls.cluster_coord = tf.distribute.experimental.coordinator.ClusterCoordinator(cls.strategy) - - @classmethod - def tearDownClass(cls): - cls._cluster.stop() - cls._cluster = None - super(EvaluationTest, cls).tearDownClass() - - def testPassMetricToTfFunction(self): - metric1 = MeanMetricAsCompositeTensor() - metric2 = MeanMetricAsCompositeTensor() - - self.assertEqual(metric1.result(), 0.0) - self.assertEqual(metric2.result(), 0.0) - - tf.nest.assert_same_structure( - metric1, metric2._type_spec, expand_composites=True) - tf.nest.assert_same_structure( - metric1._type_spec, metric2, expand_composites=True) - - @tf.function - def func(m): - m.update_state([1.0, 2.0]) - - func(metric1) - self.assertEqual(metric1.result(), 1.5) - self.assertEqual(metric2.result(), 0.0) - - concrete_f = func.get_concrete_function(metric1._type_spec) - concrete_f(metric2) - self.assertEqual(metric1.result(), 1.5) - self.assertEqual(metric2.result(), 1.5) - - def testModelEvaluatePrototype(self): - - def metric_fn(): - return MeanMetricAsCompositeTensor() - - # TODO(yuefengz): make _create_per_worker_resources public and get rid of - # the type_spec hack. - per_worker_metric = self.cluster_coord._create_per_worker_resources( - metric_fn) - - metric_on_coordinator = metric_fn() - - for metric_remote_value in per_worker_metric._values: - metric_remote_value._type_spec = metric_on_coordinator._type_spec - - def dataset_fn(): - return tf.data.Dataset.range(1024) - - # TODO(yuefengz): integrate it into model.evaluate. - - @tf.function - def eval_fn(total_shard, shard_id, metric): - metric.reset_states() - dataset_shard = dataset_fn().shard(total_shard, shard_id) - for i in dataset_shard: - metric.update_state(i) - - # TODO(yuefengz): we should return the internal state of the metric and - # then use the combiner API. 
- return metric.result() - - total_shards = 128 - result_remote_values = [] - for i in range(total_shards): - result_remote_values.append( - self.cluster_coord.schedule( - eval_fn, args=(total_shards, i, per_worker_metric))) - - self._cluster.kill_task("worker", 0) - self._cluster.kill_task("worker", 1) - time.sleep(1) - self._cluster.start_task("worker", 0) - self._cluster.start_task("worker", 1) - - results = [r.fetch() for r in result_remote_values] - result = sum(results) / len(results) - self.assertEqual(result, 511.5) + @classmethod + def setUpClass(cls): + super(EvaluationTest, cls).setUpClass() + cls._cluster = multi_worker_test_base.create_multi_process_cluster( + num_workers=3, num_ps=2, rpc_layer="grpc" + ) + cls._cluster_def = ( + cls._cluster.cluster_resolver.cluster_spec().as_dict() + ) + cluster_resolver = SimpleClusterResolver( + tf.train.ClusterSpec(cls._cluster_def), rpc_layer="grpc" + ) + + cls.strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver + ) + cls.cluster_coord = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + cls.strategy + ) + ) + + @classmethod + def tearDownClass(cls): + cls._cluster.stop() + cls._cluster = None + super(EvaluationTest, cls).tearDownClass() + + def testPassMetricToTfFunction(self): + metric1 = MeanMetricAsCompositeTensor() + metric2 = MeanMetricAsCompositeTensor() + + self.assertEqual(metric1.result(), 0.0) + self.assertEqual(metric2.result(), 0.0) + + tf.nest.assert_same_structure( + metric1, metric2._type_spec, expand_composites=True + ) + tf.nest.assert_same_structure( + metric1._type_spec, metric2, expand_composites=True + ) + + @tf.function + def func(m): + m.update_state([1.0, 2.0]) + + func(metric1) + self.assertEqual(metric1.result(), 1.5) + self.assertEqual(metric2.result(), 0.0) + + concrete_f = func.get_concrete_function(metric1._type_spec) + concrete_f(metric2) + self.assertEqual(metric1.result(), 1.5) + self.assertEqual(metric2.result(), 1.5) + + def testModelEvaluatePrototype(self): + def metric_fn(): + return MeanMetricAsCompositeTensor() + + # TODO(yuefengz): make _create_per_worker_resources public and get rid + # of the type_spec hack. + per_worker_metric = self.cluster_coord._create_per_worker_resources( + metric_fn + ) + + metric_on_coordinator = metric_fn() + + for metric_remote_value in per_worker_metric._values: + metric_remote_value._type_spec = metric_on_coordinator._type_spec + + def dataset_fn(): + return tf.data.Dataset.range(1024) + + # TODO(yuefengz): integrate it into model.evaluate. + + @tf.function + def eval_fn(total_shard, shard_id, metric): + metric.reset_states() + dataset_shard = dataset_fn().shard(total_shard, shard_id) + for i in dataset_shard: + metric.update_state(i) + + # TODO(yuefengz): we should return the internal state of the metric + # and then use the combiner API. 
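+            # Returning the scalar result is exact in this test only because
+            # range(1024) splits into 128 equal shards of 8 elements each, so
+            # the mean of the per-shard means equals the global mean.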
+ return metric.result() + + total_shards = 128 + result_remote_values = [] + for i in range(total_shards): + result_remote_values.append( + self.cluster_coord.schedule( + eval_fn, args=(total_shards, i, per_worker_metric) + ) + ) + + self._cluster.kill_task("worker", 0) + self._cluster.kill_task("worker", 1) + time.sleep(1) + self._cluster.start_task("worker", 0) + self._cluster.start_task("worker", 1) + + results = [r.fetch() for r in result_remote_values] + result = sum(results) / len(results) + self.assertEqual(result, 511.5) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/parameter_server_exact_evaluation_test.py b/keras/distribute/parameter_server_exact_evaluation_test.py new file mode 100644 index 000000000000..097fbdffdba3 --- /dev/null +++ b/keras/distribute/parameter_server_exact_evaluation_test.py @@ -0,0 +1,484 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for evaluation using Keras model and ParameterServerStrategy.""" +import threading +import time + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized +from tensorflow.python.platform import tf_logging as logging + +import keras +from keras.metrics import base_metric +from keras.testing_infra import test_utils +from keras.utils import dataset_creator +from keras.utils import tf_utils + +# isort: off +from tensorflow.python.distribute import ( + multi_worker_test_base, +) +from tensorflow.python.distribute.cluster_resolver import ( + SimpleClusterResolver, +) + + +def _aggregate_results(coordinator_metrics, results): + for result in results: + for metric in coordinator_metrics: + if metric.name == "loss": + continue + assert metric.name in result.keys() + metric_result = result[metric.name] + assert len(metric_result) == len(metric.weights) + for weight, val in zip(metric.weights, metric_result): + weight.assign_add(val) + return coordinator_metrics + + +def make_binary_dataset_fn(num_examples, num_data_shards, batch_size): + def dataset_fn(input_context=None): + del input_context + x = np.arange(num_examples) + + def make_batch_with_n_true(n): + return np.concatenate((np.ones(n), np.zeros(batch_size - n))) + + y = np.zeros(num_examples) + batch_idxs = np.arange(num_examples // batch_size) + for shard_idx in range(num_data_shards): + num_correct = shard_idx + # Dataset.shard uses mod sharding, so each shard consists of the + # batches whose index mod (num_data_shards) = shard_idx + batch_idxs_for_shard = np.where( + np.mod(batch_idxs, num_data_shards) == shard_idx + )[0] + for batch_idx in batch_idxs_for_shard: + # Select the individual data elements for this batch + batch_range = range( + batch_idx * batch_size, (batch_idx + 1) * batch_size + ) + num_for_batch = min(num_correct, batch_size) + 
y[batch_range] = make_batch_with_n_true(num_for_batch) + num_correct -= num_for_batch + + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + + dataset = dataset.batch(batch_size) + return dataset + + return dataset_fn + + +def make_multiclass_dataset_fn( + num_examples, num_data_shards, batch_size, n_classes +): + def dataset_fn(input_context=None): + del input_context + x = np.arange(num_examples) + y = np.mod(np.arange(num_examples), n_classes) + y[y == 0] = 1 + y = tf.convert_to_tensor(y, dtype=tf.int64) + weights = np.random.uniform(size=num_examples) + dataset = tf.data.Dataset.from_tensor_slices((x, y, weights)).batch( + batch_size + ) + return dataset + + return dataset_fn + + +@test_utils.run_v2_only +class ExactEvaluationTest(tf.test.TestCase, parameterized.TestCase): + def setUp(self): + super(ExactEvaluationTest, self).setUp() + self._cluster = multi_worker_test_base.create_multi_process_cluster( + num_workers=5, num_ps=1, rpc_layer="grpc" + ) + self._cluster_def = ( + self._cluster.cluster_resolver.cluster_spec().as_dict() + ) + cluster_resolver = SimpleClusterResolver( + tf.train.ClusterSpec(self._cluster_def), rpc_layer="grpc" + ) + + self.strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver + ) + self.cluster_coord = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + self.strategy + ) + ) + + def tearDown(self): + super(ExactEvaluationTest, self).tearDown() + self._cluster.stop() + self._cluster = None + + def testDistributedMetrics(self): + coordinator_metrics = [ + keras.metrics.AUC(), + keras.metrics.MeanAbsoluteError(), + ] + + def dataset_fn(): + y_true = np.concatenate((np.zeros(512), np.ones(512))) + y_pred = np.concatenate( + (np.linspace(0, 1, 512), np.linspace(0, 1, 512)) + ) + return tf.data.Dataset.from_tensor_slices((y_true, y_pred)).batch(1) + + @tf.function + def eval_shard_fn(total_shard, shard_id, worker_dataset): + with tf_utils.with_metric_local_vars_scope(): + worker_metrics = [] + for coord_metric in coordinator_metrics: + worker_metrics.append( + base_metric.clone_metric(coord_metric) + ) + + dataset_shard = worker_dataset.shard(total_shard, shard_id) + + for value in dataset_shard: + for worker_metric in worker_metrics: + worker_metric.update_state(*value) + + return { + metric.name: metric.weights for metric in worker_metrics + } + + per_worker_dataset = self.cluster_coord.create_per_worker_dataset( + dataset_fn() + ) + # Trigger dataset creation on workers without creating an iterator + built_dataset = per_worker_dataset.build() + + # needs to be a tf.constant so it doesn't get re-traced each time + # needs to be int64 because that's what Dataset.shard expects + total_shards = tf.constant(100, dtype=tf.int64) + + result_remote_values = [] + logging.info("Scheduling eval closures") + for i in tf.range(total_shards): + result_remote_values.append( + self.cluster_coord.schedule( + eval_shard_fn, + args=(total_shards, i, built_dataset), + ) + ) + + logging.info("Killing 2 workers") + self._cluster.kill_task("worker", 0) + self._cluster.kill_task("worker", 1) + time.sleep(1) + self._cluster.start_task("worker", 0) + self._cluster.start_task("worker", 1) + + self.cluster_coord.join() + results = [r.fetch() for r in result_remote_values] + coordinator_metrics = _aggregate_results(coordinator_metrics, results) + + expected_results = {"auc": 0.5, "mean_absolute_error": 0.5} + for metric in coordinator_metrics: + self.assertAlmostEqual( + metric.result().numpy(), expected_results[metric.name], places=5 + ) + + def 
testModelAddMetricErrors(self):
+        class MyModel(keras.Model):
+            def call(self, x):
+                self.add_metric(
+                    tf.cast(x >= 0, tf.float32),
+                    aggregation="sum",
+                    name="num_positive",
+                )
+                return tf.cast(tf.add(x, 1), tf.float32)
+
+        dataset = tf.data.Dataset.zip(
+            (tf.data.Dataset.range(-5, 5), tf.data.Dataset.range(-4, 6))
+        ).batch(1)
+        with self.strategy.scope():
+            model = MyModel()
+            model.compile(
+                metrics=[keras.metrics.Accuracy()],
+                loss="binary_crossentropy",
+                pss_evaluation_shards="auto",
+            )
+
+        # run a single train step to compile metrics
+        model.fit(dataset, steps_per_epoch=1)
+        with self.assertRaises(ValueError):
+            model.evaluate(dataset, return_dict=True)
+
+    def testModelInfiniteDatasetErrors(self):
+        dataset = tf.data.Dataset.range(10).repeat()
+        with self.strategy.scope():
+            model = keras.Model()
+            model.compile(pss_evaluation_shards="auto")
+        with self.assertRaisesRegex(
+            ValueError,
+            "When performing exact evaluation, the dataset must "
+            r"be finite. Make sure not to call `repeat\(\)` on your "
+            "dataset.",
+        ):
+            model.evaluate(dataset)
+
+    def testTrainingWithVariablesCreatedInFunction(self):
+        # When metrics are specified via string, they are instantiated in a
+        # tf.function in the first pass of the model when update_state is
+        # called. This use case should not be affected by exact visitation
+        # guarantee support.
+
+        class MyModel(keras.Model):
+            @tf.function
+            def worker_fn(self, y_true, y_pred):
+                self.compiled_metrics.update_state(y_true, y_pred)
+
+        with self.strategy.scope():
+            model = MyModel()
+            model.compile(metrics=["accuracy"])
+
+        y_true_0 = tf.convert_to_tensor([[0.0], [0.0], [0.0]])
+        y_pred_0 = tf.convert_to_tensor([[0.0], [0.0], [1.0]])
+        self.cluster_coord.schedule(model.worker_fn, args=(y_true_0, y_pred_0))
+
+        y_true_1 = tf.convert_to_tensor([[0.0], [0.0], [0.0]])
+        y_pred_1 = tf.convert_to_tensor([[0.0], [1.0], [1.0]])
+        self.cluster_coord.schedule(model.worker_fn, args=(y_true_1, y_pred_1))
+
+        self.cluster_coord.join()
+        for metric in model.compiled_metrics.metrics:
+            self.assertAlmostEqual(metric.result().numpy(), 0.5)
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            input_type=["dataset", "dataset_creator", "distributed_dataset"],
+            eval_in_model_fit=[True, False],
+            use_auto=[True, False],
+            custom_metric=[True, False],
+        )
+    )
+    def testDistributedModelEvaluation(
+        self, input_type, eval_in_model_fit, use_auto, custom_metric
+    ):
+        # Define dataset by batch size, number of shards, and batches per shard
+        batch_size = 16
+        num_data_shards = 32
+        batches_per_shard = 4
+        num_examples = batch_size * num_data_shards * batches_per_shard
+
+        # Input dataset x: just the sequence of numbers up to the dataset size
+        # Input dataset y: defined such that each shard has index equal to the
+        # number of y_i's == True in that shard
+        expected_acc = sum(range(num_data_shards)) / num_examples
+
+        # The predictions y_pred from this dummy model are fixed to True. This
+        # way we can control the expected accuracy by just modifying y.
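+        # Worked example with smaller, hypothetical numbers: with
+        # num_data_shards=4, batches_per_shard=1, and batch_size=16, shard k
+        # holds exactly k positive labels, so an always-True predictor gets
+        # accuracy (0 + 1 + 2 + 3) / 64 = 0.09375.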
+ class BinaryModel(keras.Model): + def __call__(self, x, training=False): + return tf.cast(x >= 0, tf.float32) + + class CustomAccuracy(keras.metrics.Metric): + def __init__(self, name="custom_acc", dtype=None): + super().__init__(name, dtype) + self.total = self.add_weight("total", initializer="zeros") + self.count = self.add_weight("count", initializer="zeros") + + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.cast(y_true, tf.float32) + y_pred = tf.cast(y_pred, tf.float32) + matches = tf.cast(tf.equal(y_true, y_pred), tf.float32) + count = tf.reduce_sum(matches) + self.count.assign_add(count) + total = tf.cast(tf.size(y_true), tf.float32) + self.total.assign_add(total) + + def result(self): + return self.count / self.total + + def reset_state(self): + self.total.assign(0) + self.count.assign(0) + + def build_metric(): + metric = ( + CustomAccuracy() if custom_metric else keras.metrics.Accuracy() + ) + return metric + + dataset_fn = make_binary_dataset_fn( + num_examples, num_data_shards, batch_size + ) + + loss = "mae" + + logging.info("Local evaluation (exact)") + model = BinaryModel() + model.compile(metrics=[build_metric()], loss=loss) + ground_truth_evaluation = model.evaluate(dataset_fn()) + logging.info( + "Result local evaluation (exact): %s", ground_truth_evaluation + ) + self.assertAlmostEqual(ground_truth_evaluation[1], expected_acc) + # Since outputs are always 0 or 1, MAE loss should == 1 - accuracy + self.assertAlmostEqual(ground_truth_evaluation[0], 1 - expected_acc) + + logging.info("Distributed evaluation (exact)") + if use_auto: + num_shards = "auto" + else: + num_shards = 5 * self.strategy._extended._num_workers + + with self.strategy.scope(): + model = BinaryModel() + model.compile( + metrics=[build_metric()], + loss=loss, + pss_evaluation_shards=num_shards, + ) + + if input_type == "dataset": + train_dataset = dataset_fn() + val_dataset = dataset_fn() + elif input_type == "dataset_creator": + train_dataset = dataset_creator.DatasetCreator(dataset_fn) + val_dataset = dataset_creator.DatasetCreator(dataset_fn) + elif input_type == "distributed_dataset": + train_dataset = self.strategy.experimental_distribute_dataset( + dataset_fn() + ) + val_dataset = self.strategy.experimental_distribute_dataset( + dataset_fn() + ) + + metric_name = "custom_acc" if custom_metric else "accuracy" + expected_results = {metric_name: expected_acc, "loss": 1 - expected_acc} + + def kill_and_revive_in_thread(wait_secs=0.1): + def _kill_and_revive_fn(): + time.sleep(wait_secs) + logging.info("Killing 2 workers") + self._cluster.kill_task("worker", 0) + self._cluster.kill_task("worker", 1) + time.sleep(1) + self._cluster.start_task("worker", 0) + self._cluster.start_task("worker", 1) + + restart_thread = threading.Thread(target=_kill_and_revive_fn) + restart_thread.start() + return restart_thread + + eval_results = {} + if eval_in_model_fit: + kill_and_revive_in_thread() + history = model.fit( + train_dataset, + steps_per_epoch=1, + validation_data=val_dataset, + ) + logging.info( + "History: params (%r), history (%r)", + history.params, + history.history, + ) + eval_results = { + metric.split("val_")[1]: val[-1] + for metric, val in history.history.items() + if metric.startswith("val_") + } + else: + # run a single train step to compile metrics + model.fit(train_dataset, steps_per_epoch=1) + kill_and_revive_in_thread() + eval_results = model.evaluate(val_dataset, return_dict=True) + eval_results = { + metric: val.numpy() for metric, val in eval_results.items() + } 
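+        # Whether evaluation ran inside fit() or standalone, the sharded
+        # results should match the exact single-process numbers computed
+        # above, even though two workers are killed and revived mid-run.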
+ for metric, val in eval_results.items(): + self.assertIn(metric, expected_results) + self.assertAlmostEqual(val, expected_results[metric], places=5) + + def testDistributedMulticlassWeightedEvaluation(self): + n_classes = 5 + + # Define dataset by batch size, number of shards, and batches per shard + batch_size = n_classes * 2 + num_data_shards = 32 + batches_per_shard = 4 + num_examples = batch_size * num_data_shards * batches_per_shard + expected_acc = 4 / 5 + + class MulticlassModel(keras.Model): + def __call__(self, x, training=False): + # e.g. x = 6 -> y_pred = [0, 1, 0, 0, 0] + return tf.squeeze( + tf.one_hot( + indices=[tf.math.floormod(x, n_classes)], + depth=n_classes, + ) + ) + + dataset_fn = make_multiclass_dataset_fn( + num_examples, num_data_shards, batch_size, n_classes + ) + + model = MulticlassModel() + model.compile( + metrics=[ + keras.metrics.SparseCategoricalAccuracy(), + keras.metrics.SparseCategoricalCrossentropy(), + ], + weighted_metrics=[keras.metrics.SparseCategoricalCrossentropy()], + loss="sparse_categorical_crossentropy", + ) + eval_dataset = dataset_fn() + ground_truth_evaluation = model.evaluate(eval_dataset, return_dict=True) + self.assertAlmostEqual( + ground_truth_evaluation["sparse_categorical_accuracy"], expected_acc + ) + + with self.strategy.scope(): + model = MulticlassModel() + model.compile( + metrics=[ + keras.metrics.SparseCategoricalAccuracy(), + keras.metrics.SparseCategoricalCrossentropy(), + ], + weighted_metrics=[ + keras.metrics.SparseCategoricalCrossentropy() + ], + loss="sparse_categorical_crossentropy", + pss_evaluation_shards=num_data_shards, + ) + + # run a single train step to compile metrics + train_dataset = dataset_fn() + model.fit(train_dataset, steps_per_epoch=1) + + eval_results = model.evaluate(eval_dataset, return_dict=True) + eval_results = { + metric: val.numpy() for metric, val in eval_results.items() + } + for metric, val in eval_results.items(): + self.assertIn(metric, ground_truth_evaluation) + self.assertAlmostEqual( + val, ground_truth_evaluation[metric], places=4 + ) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/distribute/saved_model_mixed_api_test.py b/keras/distribute/saved_model_mixed_api_test.py index fb901ca3a9ca..0aaeed7c1143 100644 --- a/keras/distribute/saved_model_mixed_api_test.py +++ b/keras/distribute/saved_model_mixed_api_test.py @@ -20,61 +20,81 @@ tf.saved_model.save(). 
""" +import tensorflow.compat.v2 as tf + from keras.distribute import saved_model_test_base as test_base -from keras.saving import save +from keras.saving.legacy import save from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf -_DEFAULT_FUNCTION_KEY = 'serving_default' +_DEFAULT_FUNCTION_KEY = "serving_default" @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') + "Uses Dense layers, which call matmul" +) class SavedModelSaveAndLoadTest(test_base.TestSavedModelBase): + def setUp(self): + self._root_dir = "saved_model_save_load" + super().setUp() - def setUp(self): - self._root_dir = 'saved_model_save_load' - super().setUp() - - def _save_model(self, model, saved_dir): - save.save_model(model, saved_dir, save_format='tf') + def _save_model(self, model, saved_dir): + save.save_model(model, saved_dir, save_format="tf") - def _load_and_run_model(self, - distribution, - saved_dir, - predict_dataset, - output_name='output_1'): - return test_base.load_and_run_with_saved_model_api(distribution, saved_dir, - predict_dataset, - output_name) + def _load_and_run_model( + self, distribution, saved_dir, predict_dataset, output_name="output_1" + ): + return test_base.load_and_run_with_saved_model_api( + distribution, saved_dir, predict_dataset, output_name + ) - @tf.__internal__.distribute.combinations.generate(test_base.simple_models_with_strategies()) - def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): - self.run_test_save_no_strategy_restore_strategy( - model_and_input, distribution) + @tf.__internal__.distribute.combinations.generate( + test_base.simple_models_with_strategies() + ) + def test_save_no_strategy_restore_strategy( + self, model_and_input, distribution + ): + self.run_test_save_no_strategy_restore_strategy( + model_and_input, distribution + ) - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.simple_models_with_strategies(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): - self.run_test_save_strategy_restore_no_strategy( - model_and_input, distribution, save_in_scope) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.simple_models_with_strategies(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_no_strategy( + self, model_and_input, distribution, save_in_scope + ): + self.run_test_save_strategy_restore_no_strategy( + model_and_input, distribution, save_in_scope + ) - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.simple_models_with_strategy_pairs(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_strategy(self, model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope): - self.run_test_save_strategy_restore_strategy(model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope) + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.simple_models_with_strategy_pairs(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_strategy( + self, + model_and_input, + distribution_for_saving, + 
distribution_for_restoring, + save_in_scope, + ): + self.run_test_save_strategy_restore_strategy( + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ) -if __name__ == '__main__': - tf.compat.v1.enable_eager_execution() - tf.test.main() +if __name__ == "__main__": + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/distribute/saved_model_save_load_test.py b/keras/distribute/saved_model_save_load_test.py index da91996aa17c..2ca75d238a83 100644 --- a/keras/distribute/saved_model_save_load_test.py +++ b/keras/distribute/saved_model_save_load_test.py @@ -14,161 +14,214 @@ # ============================================================================== """Tests for saving and loading using tf's saved_model APIs with DS.""" +import os + import tensorflow.compat.v2 as tf -import os -from keras.testing_infra import test_utils from keras.distribute import model_combinations from keras.distribute import saved_model_test_base as test_base +from keras.testing_infra import test_utils @test_utils.run_v2_only @test_utils.run_all_without_tensor_float_32( - 'Uses Dense layers, which call matmul') + "Uses Dense layers, which call matmul" +) class SavedModelKerasModelTest(test_base.TestSavedModelBase): - - def setUp(self): - self._root_dir = 'saved_model_save_load' - super().setUp() - - def _save_model(self, model, saved_dir): - tf.saved_model.save(model, saved_dir) - - def _load_and_run_model(self, - distribution, - saved_dir, - predict_dataset, - output_name='output_1'): - return test_base.load_and_run_with_saved_model_api(distribution, saved_dir, - predict_dataset, - output_name) - - @tf.__internal__.distribute.combinations.generate(test_base.simple_models_with_strategies()) - def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): - self.run_test_save_no_strategy_restore_strategy( - model_and_input, distribution) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.simple_models_with_strategies(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): - self.run_test_save_strategy_restore_no_strategy( - model_and_input, distribution, save_in_scope) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.simple_models_with_strategy_pairs(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_strategy(self, model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope): - self.run_test_save_strategy_restore_strategy(model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.simple_models_with_strategies(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_no_variable_device_placement(self, model_and_input, distribution, - save_in_scope): - saved_dir = self.run_test_save_strategy(model_and_input, distribution, - save_in_scope) - func = tf.saved_model.load(saved_dir) - concrete_function = func.signatures[test_base._DEFAULT_FUNCTION_KEY] - for f in concrete_function.graph.as_graph_def().library.function: - for n in f.node_def: - if n.op == 'ReadVariableOp': - self.assertEmpty(n.device) + def setUp(self): + self._root_dir = 
"saved_model_save_load" + super().setUp() + + def _save_model(self, model, saved_dir): + tf.saved_model.save(model, saved_dir) + + def _load_and_run_model( + self, distribution, saved_dir, predict_dataset, output_name="output_1" + ): + return test_base.load_and_run_with_saved_model_api( + distribution, saved_dir, predict_dataset, output_name + ) + + @tf.__internal__.distribute.combinations.generate( + test_base.simple_models_with_strategies() + ) + def test_save_no_strategy_restore_strategy( + self, model_and_input, distribution + ): + self.run_test_save_no_strategy_restore_strategy( + model_and_input, distribution + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.simple_models_with_strategies(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_no_strategy( + self, model_and_input, distribution, save_in_scope + ): + self.run_test_save_strategy_restore_no_strategy( + model_and_input, distribution, save_in_scope + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.simple_models_with_strategy_pairs(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_strategy( + self, + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ): + self.run_test_save_strategy_restore_strategy( + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.simple_models_with_strategies(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_no_variable_device_placement( + self, model_and_input, distribution, save_in_scope + ): + saved_dir = self.run_test_save_strategy( + model_and_input, distribution, save_in_scope + ) + func = tf.saved_model.load(saved_dir) + concrete_function = func.signatures[test_base._DEFAULT_FUNCTION_KEY] + for f in concrete_function.graph.as_graph_def().library.function: + for n in f.node_def: + if n.op == "ReadVariableOp": + self.assertEmpty(n.device) @test_utils.run_v2_only class SavedModelTFModuleTest(test_base.TestSavedModelBase): - - def setUp(self): - self._root_dir = 'saved_model_save_load' - super().setUp() - - def _train_model(self, model, x_train, y_train, batch_size): - pass - - def _predict_with_model(self, distribution, model, predict_dataset): - if distribution: - dist_predict_dataset = distribution.experimental_distribute_dataset( - predict_dataset) - per_replica_predict_data = next(iter(dist_predict_dataset)) - result = distribution.run(model, args=(per_replica_predict_data,)) - # Convert the per_replica value to a list, then concatenate them - reduced = distribution.experimental_local_results(result) - concat = tf.concat(reduced, 0) - return concat - else: - return model(next(iter(predict_dataset))) - - def _save_model(self, model, saved_dir): - call = model.__call__.get_concrete_function(tf.TensorSpec(None)) - tf.saved_model.save(model, saved_dir, signatures=call) - - def _load_and_run_model(self, - distribution, - saved_dir, - predict_dataset, - output_name='output_1'): - del output_name - model = tf.saved_model.load(saved_dir) - return self._predict_with_model(distribution, model, predict_dataset) - - 
@tf.__internal__.distribute.combinations.generate(test_base.tfmodule_models_with_strategies()) - def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): - self.run_test_save_no_strategy_restore_strategy( - model_and_input, distribution) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.tfmodule_models_with_strategies(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_no_strategy( - self, model_and_input, distribution, save_in_scope): - self.run_test_save_strategy_restore_no_strategy( - model_and_input, distribution, save_in_scope) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.times(test_base.tfmodule_models_with_strategy_pairs(), - tf.__internal__.test.combinations.combine(save_in_scope=[True, False]))) - def test_save_strategy_restore_strategy(self, model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope): - self.run_test_save_strategy_restore_strategy(model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - model_and_input=[model_combinations.simple_tfmodule_model], - distribution=test_base.strategies + - [tf.__internal__.distribute.combinations.cloud_tpu_strategy])) - def test_save_load_io_device(self, model_and_input, distribution): - saved_dir = os.path.join(self.get_temp_dir(), 'io_device') - with distribution.scope(): - model = model_and_input.get_model() - x_train, y_train, _ = model_and_input.get_data() - batch_size = model_and_input.get_batch_size() - self._train_model(model, x_train, y_train, batch_size) - call = model.__call__.get_concrete_function(tf.TensorSpec(None)) - save_options = tf.saved_model.SaveOptions( - experimental_io_device='/job:localhost') - tf.saved_model.save(model, saved_dir, signatures=call, options=save_options) - load_options = tf.saved_model.LoadOptions( - experimental_io_device='/job:localhost') - # Check that the model can be loaded and training continued without error. 
- with distribution.scope(): - loaded_model = tf.saved_model.load(saved_dir, options=load_options) - self._train_model(loaded_model, x_train, y_train, batch_size) - - -if __name__ == '__main__': - tf.test.main() + def setUp(self): + self._root_dir = "saved_model_save_load" + super().setUp() + + def _train_model(self, model, x_train, y_train, batch_size): + pass + + def _predict_with_model(self, distribution, model, predict_dataset): + if distribution: + dist_predict_dataset = distribution.experimental_distribute_dataset( + predict_dataset + ) + per_replica_predict_data = next(iter(dist_predict_dataset)) + result = distribution.run(model, args=(per_replica_predict_data,)) + # Convert the per_replica value to a list, then concatenate them + reduced = distribution.experimental_local_results(result) + concat = tf.concat(reduced, 0) + return concat + else: + return model(next(iter(predict_dataset))) + + def _save_model(self, model, saved_dir): + call = model.__call__.get_concrete_function(tf.TensorSpec(None)) + tf.saved_model.save(model, saved_dir, signatures=call) + + def _load_and_run_model( + self, distribution, saved_dir, predict_dataset, output_name="output_1" + ): + del output_name + model = tf.saved_model.load(saved_dir) + return self._predict_with_model(distribution, model, predict_dataset) + + @tf.__internal__.distribute.combinations.generate( + test_base.tfmodule_models_with_strategies() + ) + def test_save_no_strategy_restore_strategy( + self, model_and_input, distribution + ): + self.run_test_save_no_strategy_restore_strategy( + model_and_input, distribution + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.tfmodule_models_with_strategies(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_no_strategy( + self, model_and_input, distribution, save_in_scope + ): + self.run_test_save_strategy_restore_no_strategy( + model_and_input, distribution, save_in_scope + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.times( + test_base.tfmodule_models_with_strategy_pairs(), + tf.__internal__.test.combinations.combine( + save_in_scope=[True, False] + ), + ) + ) + def test_save_strategy_restore_strategy( + self, + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ): + self.run_test_save_strategy_restore_strategy( + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + model_and_input=[model_combinations.simple_tfmodule_model], + distribution=test_base.strategies + + [tf.__internal__.distribute.combinations.cloud_tpu_strategy], + ) + ) + def test_save_load_io_device(self, model_and_input, distribution): + saved_dir = os.path.join(self.get_temp_dir(), "io_device") + with distribution.scope(): + model = model_and_input.get_model() + x_train, y_train, _ = model_and_input.get_data() + batch_size = model_and_input.get_batch_size() + self._train_model(model, x_train, y_train, batch_size) + call = model.__call__.get_concrete_function(tf.TensorSpec(None)) + save_options = tf.saved_model.SaveOptions( + experimental_io_device="/job:localhost" + ) + tf.saved_model.save( + model, saved_dir, signatures=call, options=save_options + ) + load_options = tf.saved_model.LoadOptions( + experimental_io_device="/job:localhost" + ) + # Check that the model can 
be loaded and training continued without + # error. + with distribution.scope(): + loaded_model = tf.saved_model.load(saved_dir, options=load_options) + self._train_model(loaded_model, x_train, y_train, batch_size) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/distribute/saved_model_test_base.py b/keras/distribute/saved_model_test_base.py index 576a6d836021..09e8e5aff184 100644 --- a/keras/distribute/saved_model_test_base.py +++ b/keras/distribute/saved_model_test_base.py @@ -16,14 +16,14 @@ import os -from absl.testing import parameterized -from keras.distribute import model_combinations import numpy as np - import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.distribute import model_combinations _RANDOM_SEED = 1337 -_DEFAULT_FUNCTION_KEY = 'serving_default' +_DEFAULT_FUNCTION_KEY = "serving_default" _TOLERANCE = 1e-30 # TPU uses bfloat16 for computation in hardware underlying, so it has less @@ -49,219 +49,239 @@ tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, tf.__internal__.distribute.combinations.tpu_strategy, tf.__internal__.distribute.combinations.tpu_strategy_packed_var, - tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, + tf.__internal__.distribute.combinations.central_storage_strategy_with_two_gpus, # noqa: E501 ] def simple_models_with_strategies(): - return tf.__internal__.test.combinations.combine( - model_and_input=simple_models, - distribution=strategies, - mode=['eager']) + return tf.__internal__.test.combinations.combine( + model_and_input=simple_models, distribution=strategies, mode=["eager"] + ) def simple_models_with_strategy_pairs(): - return tf.__internal__.test.combinations.combine( - model_and_input=simple_models, - distribution_for_saving=strategies, - distribution_for_restoring=strategies, - mode=['eager']) + return tf.__internal__.test.combinations.combine( + model_and_input=simple_models, + distribution_for_saving=strategies, + distribution_for_restoring=strategies, + mode=["eager"], + ) def tfmodule_models_with_strategies(): - return tf.__internal__.test.combinations.combine( - model_and_input=[model_combinations.simple_tfmodule_model], - distribution=strategies, - mode=['eager']) + return tf.__internal__.test.combinations.combine( + model_and_input=[model_combinations.simple_tfmodule_model], + distribution=strategies, + mode=["eager"], + ) def tfmodule_models_with_strategy_pairs(): - return tf.__internal__.test.combinations.combine( - model_and_input=[model_combinations.simple_tfmodule_model], - distribution_for_saving=strategies, - distribution_for_restoring=strategies, - mode=['eager']) - - -def load_and_run_with_saved_model_api(distribution, saved_dir, predict_dataset, - output_name): - """Loads a saved_model using tf.saved_model API, and runs it.""" - func = tf.saved_model.load(saved_dir) - if distribution: - dist_predict_dataset = distribution.experimental_distribute_dataset( - predict_dataset) - per_replica_predict_data = next(iter(dist_predict_dataset)) - result = distribution.run( - func.signatures[_DEFAULT_FUNCTION_KEY], - args=(per_replica_predict_data,)) - result = result[output_name] - - # Convert the per_replica value to a list, then concatenate them - reduced = distribution.experimental_local_results(result) - concat = tf.concat(reduced, 0) - return concat - else: - result = func.signatures[_DEFAULT_FUNCTION_KEY](next(iter(predict_dataset))) - return result[output_name] - - -class TestSavedModelBase(tf.test.TestCase, 
parameterized.TestCase): - """Base class for testing saving/loading with DS.""" - - def setUp(self): - np.random.seed(_RANDOM_SEED) - tf.compat.v1.set_random_seed(_RANDOM_SEED) - self._root_dir = 'base' - super().setUp() - - def _save_model(self, model, saved_dir): - """Save the given model to the given saved_dir. - - This method needs to be implemented by the subclasses. - - Args: - model: a keras model object to save. - saved_dir: a string representing the path to save the keras model - """ - raise NotImplementedError('must be implemented in descendants') - - def _load_and_run_model(self, - distribution, - saved_dir, - predict_dataset, - output_name='output_1'): - """Load the model and run 1 step of predict with it. - - This method must be implemented by the subclasses. - - Args: - distribution: the distribution strategy used to load the model. None if no - distribution strategy is used - saved_dir: the string representing the path where the model is saved. - predict_dataset: the data used to do the predict on the model for - cross_replica context. - output_name: the string representing the name of the output layer of the - model. - """ - - raise NotImplementedError('must be implemented in descendants') - - def _train_model(self, model, x_train, y_train, batch_size): - training_dataset = tf.data.Dataset.from_tensor_slices( - (x_train, y_train)) - training_dataset = training_dataset.repeat() - training_dataset = training_dataset.batch(batch_size) - - # Train the model for 1 epoch - model.fit(x=training_dataset, epochs=1, steps_per_epoch=100) - - def _predict_with_model(self, distribution, model, predict_dataset): - return model.predict(predict_dataset, steps=PREDICT_STEPS) - - def _get_predict_dataset(self, x_predict, batch_size): - predict_dataset = tf.data.Dataset.from_tensor_slices(x_predict) - predict_dataset = predict_dataset.repeat() - predict_dataset = predict_dataset.batch(batch_size) - return predict_dataset - - def run_test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): - """Save a model without DS, and restore it with DS.""" - - saved_dir = os.path.join(self.get_temp_dir(), '0') - - model = model_and_input.get_model() - x_train, y_train, x_predict = model_and_input.get_data() - batch_size = model_and_input.get_batch_size() - predict_dataset = self._get_predict_dataset(x_predict, batch_size) - - self._train_model(model, x_train, y_train, batch_size) - result_before_save = self._predict_with_model(None, model, predict_dataset) - - self._save_model(model, saved_dir) - - with distribution.scope(): - result_after_save = self._load_and_run_model( - distribution=distribution, - saved_dir=saved_dir, - predict_dataset=predict_dataset) - - self.assertAllClose(result_before_save, result_after_save) - - def run_test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): - """Save a model with DS, and restore it without DS.""" - - saved_dir = os.path.join(self.get_temp_dir(), '1') - - with distribution.scope(): - model = model_and_input.get_model() - x_train, y_train, x_predict = model_and_input.get_data() - batch_size = model_and_input.get_batch_size() - - self._train_model(model, x_train, y_train, batch_size) - predict_dataset = self._get_predict_dataset(x_predict, batch_size) - result_before_save = self._predict_with_model( - distribution, model, predict_dataset) - - if save_in_scope: - with distribution.scope(): - self._save_model(model, saved_dir) - else: - self._save_model(model, saved_dir) - - load_result = 
self._load_and_run_model( - distribution=None, - saved_dir=saved_dir, - predict_dataset=predict_dataset) - - self.assertAllClose(result_before_save, load_result) - - def run_test_save_strategy_restore_strategy(self, model_and_input, - distribution_for_saving, - distribution_for_restoring, - save_in_scope): - """Save a model with DS, and restore it with potentially different DS.""" - saved_dir = os.path.join(self.get_temp_dir(), '2') - - with distribution_for_saving.scope(): - model = model_and_input.get_model() - x_train, y_train, x_predict = model_and_input.get_data() - batch_size = model_and_input.get_batch_size() - - self._train_model(model, x_train, y_train, batch_size) - predict_dataset = self._get_predict_dataset(x_predict, batch_size) - result_before_save = self._predict_with_model( - distribution_for_saving, model, predict_dataset) - - if save_in_scope: - with distribution_for_saving.scope(): - self._save_model(model, saved_dir) + return tf.__internal__.test.combinations.combine( + model_and_input=[model_combinations.simple_tfmodule_model], + distribution_for_saving=strategies, + distribution_for_restoring=strategies, + mode=["eager"], + ) + + +def load_and_run_with_saved_model_api( + distribution, saved_dir, predict_dataset, output_name +): + """Loads a saved_model using tf.saved_model API, and runs it.""" + func = tf.saved_model.load(saved_dir) + if distribution: + dist_predict_dataset = distribution.experimental_distribute_dataset( + predict_dataset + ) + per_replica_predict_data = next(iter(dist_predict_dataset)) + result = distribution.run( + func.signatures[_DEFAULT_FUNCTION_KEY], + args=(per_replica_predict_data,), + ) + result = result[output_name] + + # Convert the per_replica value to a list, then concatenate them + reduced = distribution.experimental_local_results(result) + concat = tf.concat(reduced, 0) + return concat else: - self._save_model(model, saved_dir) - - with distribution_for_restoring.scope(): + result = func.signatures[_DEFAULT_FUNCTION_KEY]( + next(iter(predict_dataset)) + ) + return result[output_name] - load_result = self._load_and_run_model( - distribution=distribution_for_restoring, - saved_dir=saved_dir, - predict_dataset=predict_dataset) - self.assertAllClose(result_before_save, load_result) - - def run_test_save_strategy(self, model_and_input, - distribution, save_in_scope): - """Save a model with DS.""" - saved_dir = os.path.join(self.get_temp_dir(), '3') - with distribution.scope(): - model = model_and_input.get_model() - x_train, y_train, _ = model_and_input.get_data() - batch_size = model_and_input.get_batch_size() - self._train_model(model, x_train, y_train, batch_size) +class TestSavedModelBase(tf.test.TestCase, parameterized.TestCase): + """Base class for testing saving/loading with DS.""" + + def setUp(self): + np.random.seed(_RANDOM_SEED) + tf.compat.v1.set_random_seed(_RANDOM_SEED) + self._root_dir = "base" + super().setUp() + + def _save_model(self, model, saved_dir): + """Save the given model to the given saved_dir. + + This method needs to be implemented by the subclasses. + + Args: + model: a keras model object to save. + saved_dir: a string representing the path to save the keras model + """ + raise NotImplementedError("must be implemented in descendants") + + def _load_and_run_model( + self, distribution, saved_dir, predict_dataset, output_name="output_1" + ): + """Load the model and run 1 step of predict with it. + + This method must be implemented by the subclasses. 
+ + Args: + distribution: the distribution strategy used to load the model. None + if no distribution strategy is used + saved_dir: the string representing the path where the model is saved. + predict_dataset: the data used to do the predict on the model for + cross_replica context. + output_name: the string representing the name of the output layer of + the model. + """ + + raise NotImplementedError("must be implemented in descendants") + + def _train_model(self, model, x_train, y_train, batch_size): + training_dataset = tf.data.Dataset.from_tensor_slices( + (x_train, y_train) + ) + training_dataset = training_dataset.repeat() + training_dataset = training_dataset.batch(batch_size) + + # Train the model for 1 epoch + model.fit(x=training_dataset, epochs=1, steps_per_epoch=100) + + def _predict_with_model(self, distribution, model, predict_dataset): + return model.predict(predict_dataset, steps=PREDICT_STEPS) + + def _get_predict_dataset(self, x_predict, batch_size): + predict_dataset = tf.data.Dataset.from_tensor_slices(x_predict) + predict_dataset = predict_dataset.repeat() + predict_dataset = predict_dataset.batch(batch_size) + return predict_dataset + + def run_test_save_no_strategy_restore_strategy( + self, model_and_input, distribution + ): + """Save a model without DS, and restore it with DS.""" + + saved_dir = os.path.join(self.get_temp_dir(), "0") + + model = model_and_input.get_model() + x_train, y_train, x_predict = model_and_input.get_data() + batch_size = model_and_input.get_batch_size() + predict_dataset = self._get_predict_dataset(x_predict, batch_size) + + self._train_model(model, x_train, y_train, batch_size) + result_before_save = self._predict_with_model( + None, model, predict_dataset + ) - if save_in_scope: - with distribution.scope(): self._save_model(model, saved_dir) - else: - self._save_model(model, saved_dir) - return saved_dir + + with distribution.scope(): + result_after_save = self._load_and_run_model( + distribution=distribution, + saved_dir=saved_dir, + predict_dataset=predict_dataset, + ) + + self.assertAllClose(result_before_save, result_after_save) + + def run_test_save_strategy_restore_no_strategy( + self, model_and_input, distribution, save_in_scope + ): + """Save a model with DS, and restore it without DS.""" + + saved_dir = os.path.join(self.get_temp_dir(), "1") + + with distribution.scope(): + model = model_and_input.get_model() + x_train, y_train, x_predict = model_and_input.get_data() + batch_size = model_and_input.get_batch_size() + + self._train_model(model, x_train, y_train, batch_size) + predict_dataset = self._get_predict_dataset(x_predict, batch_size) + result_before_save = self._predict_with_model( + distribution, model, predict_dataset + ) + + if save_in_scope: + with distribution.scope(): + self._save_model(model, saved_dir) + else: + self._save_model(model, saved_dir) + + load_result = self._load_and_run_model( + distribution=None, + saved_dir=saved_dir, + predict_dataset=predict_dataset, + ) + + self.assertAllClose(result_before_save, load_result) + + def run_test_save_strategy_restore_strategy( + self, + model_and_input, + distribution_for_saving, + distribution_for_restoring, + save_in_scope, + ): + """Save a model with DS, and restore it with potentially different + DS.""" + saved_dir = os.path.join(self.get_temp_dir(), "2") + + with distribution_for_saving.scope(): + model = model_and_input.get_model() + x_train, y_train, x_predict = model_and_input.get_data() + batch_size = model_and_input.get_batch_size() + + 
self._train_model(model, x_train, y_train, batch_size) + predict_dataset = self._get_predict_dataset(x_predict, batch_size) + result_before_save = self._predict_with_model( + distribution_for_saving, model, predict_dataset + ) + + if save_in_scope: + with distribution_for_saving.scope(): + self._save_model(model, saved_dir) + else: + self._save_model(model, saved_dir) + + with distribution_for_restoring.scope(): + + load_result = self._load_and_run_model( + distribution=distribution_for_restoring, + saved_dir=saved_dir, + predict_dataset=predict_dataset, + ) + + self.assertAllClose(result_before_save, load_result) + + def run_test_save_strategy( + self, model_and_input, distribution, save_in_scope + ): + """Save a model with DS.""" + saved_dir = os.path.join(self.get_temp_dir(), "3") + with distribution.scope(): + model = model_and_input.get_model() + x_train, y_train, _ = model_and_input.get_data() + batch_size = model_and_input.get_batch_size() + self._train_model(model, x_train, y_train, batch_size) + + if save_in_scope: + with distribution.scope(): + self._save_model(model, saved_dir) + else: + self._save_model(model, saved_dir) + return saved_dir diff --git a/keras/distribute/sharded_variable_test.py b/keras/distribute/sharded_variable_test.py index 7b9b8eda6cd1..acd1e6fd3bf6 100644 --- a/keras/distribute/sharded_variable_test.py +++ b/keras/distribute/sharded_variable_test.py @@ -14,406 +14,458 @@ # ============================================================================== """Tests for ClusterCoordinator and Keras models.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized import keras from keras.distribute import multi_worker_testing_utils from keras.distribute import strategy_combinations from keras.engine import base_layer -import numpy as np -import tensorflow.compat.v2 as tf class ShardedVariableTest(tf.test.TestCase, parameterized.TestCase): - - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.strategy = tf.distribute.experimental.ParameterServerStrategy( - multi_worker_testing_utils.make_parameter_server_cluster(3, 2), - variable_partitioner=tf.distribute.experimental.partitioners - .FixedShardsPartitioner(2)) - - def assert_list_all_equal(self, list1, list2): - """Used in lieu of `assertAllEqual`. - - This is used to replace standard `assertAllEqual` for the cases where - `list1` and `list2` contain `AggregatingVariable`. Lists with - `AggregatingVariable` are not convertible to numpy array via `np.array` - calls as numpy would raise `ValueError: setting an array element with a - sequence.` - - Args: - list1: The first list to compare equality. - list2: The second list to compare equality. 
- """ - for lhs, rhs in zip(list1, list2): - self.assertEqual(lhs, rhs) - - def test_keras_layer_setattr(self): - - class Layer(base_layer.Layer): - - def __init__(self): - super().__init__() - self.w = tf.Variable([0, 1]) - self.b = tf.Variable([2, 3], trainable=False) - - with self.strategy.scope(): - layer = Layer() - - self.assertLen(layer.trainable_weights, 2) - self.assertEqual(layer.trainable_weights[0], [0]) - self.assertEqual(layer.trainable_weights[1], [1]) - self.assertLen(layer.non_trainable_weights, 2) - self.assertEqual(layer.non_trainable_weights[0], [2]) - self.assertEqual(layer.non_trainable_weights[1], [3]) - self.assert_list_all_equal( - layer.weights, layer.trainable_weights + layer.non_trainable_weights) - self.assert_list_all_equal(layer.trainable_weights, - layer.trainable_variables) - self.assert_list_all_equal(layer.weights, layer.variables) - - checkpoint_deps = set(layer._trackable_children().values()) - self.assertEqual(checkpoint_deps, set([layer.w, layer.b])) - - def test_keras_layer_add_weight(self): - - class Layer(base_layer.Layer): - - def __init__(self): - super().__init__() - self.w = self.add_weight( - shape=(2,), - initializer=lambda shape, dtype: tf.constant([0., 1.],), - trainable=True) - self.b = self.add_weight( - shape=(2,), - initializer=lambda shape, dtype: tf.constant([2., 3.]), - trainable=False) - - with self.strategy.scope(): - layer = Layer() - - self.assertLen(layer.trainable_weights, 2) - self.assertEqual(layer.trainable_weights[0], [0.]) - self.assertEqual(layer.trainable_weights[1], [1.]) - self.assertLen(layer.non_trainable_weights, 2) - self.assertEqual(layer.non_trainable_weights[0], [2.]) - self.assertEqual(layer.non_trainable_weights[1], [3.]) - self.assert_list_all_equal( - layer.weights, layer.trainable_weights + layer.non_trainable_weights) - self.assert_list_all_equal(layer.trainable_weights, - layer.trainable_variables) - self.assert_list_all_equal(layer.weights, layer.variables) - - checkpoint_deps = set(layer._trackable_children().values()) - self.assertEqual(checkpoint_deps, set([layer.w, layer.b])) - - def test_keras_metrics(self): - with self.strategy.scope(): - fp = keras.metrics.FalsePositives(thresholds=[0.2, 0.5, 0.7, 0.8]) - auc = keras.metrics.AUC(num_thresholds=10) - - @tf.function - def update(): - fp.update_state([0., 1., 0., 0.], [0., 0., 0.3, 0.9]) - auc.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - - @tf.function - def reset(): - fp.reset_state() - auc.reset_state() - - update() - self.assertEqual(auc.result(), 0.75) - self.assertAllEqual(fp.result(), [2., 1., 1., 1.]) - reset() - self.assertEqual(auc.result(), 0.0) - self.assertAllEqual(fp.result(), [0., 0., 0., 0.]) - - self.assertTrue(hasattr(auc.true_positives, 'variables')) - self.assertTrue(hasattr(fp.accumulator, 'variables')) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - shard_config=[[2, 2], [2, 3], [3, 2], [2, 1], [1, 1], [1, 2], [1, 3]], - model_type=['dense', 'embedding'], - )) - def test_saved_model_combined(self, shard_config, model_type): - """Test saving and loading models with various fixed numbers of shards. - - Args: - shard_config: The number of shards to use per variable before and after - loading. For example, [1, 3] means to create and save the model with 1 - shard (i.e., no variable partitioning), and load it into 3 shards per - variable. - model_type: Either 'dense' or 'embedding', which simple model to test. 
- """ - - def create_embedding_model(): - inputs = keras.layers.Input(shape=(6,)) - embedding = keras.layers.Embedding(output_dim=2, input_dim=6) - outputs = embedding(inputs) - model = keras.Model(inputs, outputs) - model.compile(optimizer='adam', loss='mean_squared_error') - return model - - def create_dense_model(): - inputs = keras.layers.Input(shape=(6,)) - outputs = keras.layers.Dense(6)(inputs) - model = keras.Model(inputs, outputs) - model.compile(optimizer='adam', loss='mean_squared_error') - return model - - # Maybe create new strategy with different number of shards - if shard_config[0] > 2: - strategy = tf.distribute.experimental.ParameterServerStrategy( - multi_worker_testing_utils.make_parameter_server_cluster(3, 3), - variable_partitioner=tf.distribute.experimental.partitioners - .FixedShardsPartitioner(shard_config[0])) - elif shard_config[0] == 2: - strategy = self.strategy - else: - # Just one shard, so use default strategy - strategy = tf.distribute.get_strategy() - - x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32) - with strategy.scope(): - model = ( - create_dense_model() - if model_type == 'dense' else create_embedding_model()) - expect = model(x) - - # Dense layers have two variables (kernel and bias), embedding layers have 1 - n_expected_variables = shard_config[0] * (2 if model_type == 'dense' else 1) - self.assertLen(model.variables, n_expected_variables) - model_weights = [v.numpy() for v in model.variables] - - saved_dir = self.get_temp_dir() - model.save(saved_dir) - - if shard_config[1] > 2: - strategy2 = tf.distribute.experimental.ParameterServerStrategy( - multi_worker_testing_utils.make_parameter_server_cluster(3, 3), - variable_partitioner=tf.distribute.experimental.partitioners - .FixedShardsPartitioner(shard_config[1])) - elif shard_config[1] == 2: - strategy2 = self.strategy - else: - # Just one shard, so use default strategy - strategy2 = tf.distribute.get_strategy() - - with strategy2.scope(): - loaded_model = keras.models.load_model(saved_dir) - got = loaded_model(x) - - self.assertAllClose(got, expect) - n_expected_variables = shard_config[1] * (2 - if model_type == 'dense' else 1) - self.assertLen(loaded_model.variables, n_expected_variables) - loaded_model_weights = [v.numpy() for v in loaded_model.variables] - self.assertAllClose( - np.concatenate([w.flatten() for w in model_weights]), - np.concatenate([w.flatten() for w in loaded_model_weights])) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.strategies_minus_tpu, - model_type=['dense', 'embedding'], - )) - def test_saved_model_load_non_pss(self, model_type, distribution): - - def create_embedding_model(): - inputs = keras.layers.Input(shape=(6,)) - embedding = keras.layers.Embedding(output_dim=2, input_dim=6) - outputs = embedding(inputs) - model = keras.Model(inputs, outputs) - model.compile(optimizer='adam', loss='mean_squared_error') - return model - - def create_dense_model(): - inputs = keras.layers.Input(shape=(6,)) - outputs = keras.layers.Dense(6)(inputs) - model = keras.Model(inputs, outputs) - model.compile(optimizer='adam', loss='mean_squared_error') - return model - - x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32) - with self.strategy.scope(): - model = ( - create_dense_model() - if model_type == 'dense' else create_embedding_model()) - expect = model(x) - - model_weights = [v.numpy() for v in model.variables] - - saved_dir = self.get_temp_dir() - model.save(saved_dir) - - with 
distribution.scope(): - loaded_model = keras.models.load_model(saved_dir) - got = loaded_model(x) - - self.assertAllClose(got, expect) - n_expected_variables = 2 if model_type == 'dense' else 1 - self.assertLen(loaded_model.variables, n_expected_variables) - loaded_model_weights = [v.numpy() for v in loaded_model.variables] - self.assertAllClose( - np.concatenate([w.flatten() for w in model_weights]), - np.concatenate([w.flatten() for w in loaded_model_weights])) - - def test_slot_variable_checkpointing(self): - - with self.strategy.scope(): - # Set a name so the ShardedVariable is well-named for slot var keying - var = tf.Variable([1., 2., 3., 4., 5., 6.], name='test') - - opt = keras.optimizers.optimizer_v2.adam.Adam() - - # Run once to trigger apply_gradients to populate optimizer slot variables. - def train_step(): - with tf.GradientTape() as tape: - loss = sum(var) - opt.minimize(loss, var.variables, tape=tape) - - self.strategy.run(train_step) - - # Check that we can call get_slot using each slot, before and after - # Checkpointing, and get the same results - pre_ckpt_slots = [] - for slot in opt.get_slot_names(): - pre_ckpt_slots.extend([v.numpy() for v in opt.get_slot(var, slot)]) - - ckpt = tf.train.Checkpoint(var=var, opt=opt) - - # Assert that checkpoint has slots for each shard and the ShardedVariable - self.assertLen(ckpt.opt._slots, 3) - for var_name in ckpt.opt._slots.keys(): - self.assertLen(ckpt.opt._slots[var_name], 2) - self.assertEqual(ckpt.opt._slots[var_name].keys(), {'m', 'v'}) - if hasattr(ckpt.opt._slots[var_name]['m'], 'variables'): - self.assertLen(ckpt.opt._slots[var_name]['m'].variables, 2) - self.assertLen(ckpt.opt._slots[var_name]['v'].variables, 2) - - saved_dir = self.get_temp_dir() - ckpt_prefix = f'{saved_dir}/ckpt' - ckpt.save(ckpt_prefix) - - # Run once more to alter slot variables and ensure checkpoint restores - # the earlier values. - self.strategy.run(train_step) - - changed_ckpt_slots = [] - for slot in opt.get_slot_names(): - changed_ckpt_slots.extend([v.numpy() for v in opt.get_slot(var, slot)]) - self.assertNotAllClose(pre_ckpt_slots, changed_ckpt_slots) - - ckpt.restore(tf.train.latest_checkpoint(saved_dir)) - - post_ckpt_slots = [] - for slot in opt.get_slot_names(): - post_ckpt_slots.extend([v.numpy() for v in opt.get_slot(var, slot)]) - - self.assertAllClose(pre_ckpt_slots, post_ckpt_slots) - - def test_slot_variable_checkpoint_load_with_diff_shards(self): - - with self.strategy.scope(): - # Set a name so the ShardedVariable is well-named for slot var keying - var = tf.Variable([1., 2., 3., 4., 5., 6.], name='test') - - opt = keras.optimizers.optimizer_v2.adam.Adam() - - # Run once to trigger apply_gradients to populate optimizer slot variables. 
- def train_step(): - with tf.GradientTape() as tape: - loss = sum(var) - opt.minimize(loss, var.variables, tape=tape) - - self.strategy.run(train_step) - - # Check that we can call get_slot using each slot, before and after - # Checkpointing, and get the same results - pre_ckpt_slots = [] - for slot in opt.get_slot_names(): - pre_ckpt_slots.extend( - tf.concat(list(opt.get_slot(var, slot)), axis=0).numpy()) - - ckpt = tf.train.Checkpoint(var=var, opt=opt) - saved_dir = self.get_temp_dir() - ckpt_prefix = f'{saved_dir}/ckpt' - ckpt.save(ckpt_prefix) - - # Create new strategy with different number of shards - strategy2 = tf.distribute.experimental.ParameterServerStrategy( - multi_worker_testing_utils.make_parameter_server_cluster(3, 2), - variable_partitioner=tf.distribute.experimental.partitioners - .FixedShardsPartitioner(3)) - - # Create new variable with different values, to be overwritten by ckpt. - with strategy2.scope(): - var = tf.Variable([0., 1., 2., 3., 4., 5.], name='test') - - opt = keras.optimizers.optimizer_v2.adam.Adam() - # Run once to trigger apply_gradients to populate optimizer slot variables. - strategy2.run(train_step) - - new_ckpt = tf.train.Checkpoint(var=var, opt=opt) - new_ckpt.restore(tf.train.latest_checkpoint(saved_dir)) - post_ckpt_slots = [] - for slot in new_ckpt.opt.get_slot_names(): - post_ckpt_slots.extend( - tf.concat(list(new_ckpt.opt.get_slot(var, slot)), axis=0).numpy()) - self.assertAllClose(pre_ckpt_slots, post_ckpt_slots) + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.strategy = tf.distribute.experimental.ParameterServerStrategy( + multi_worker_testing_utils.make_parameter_server_cluster(3, 2), + variable_partitioner=tf.distribute.experimental.partitioners.FixedShardsPartitioner( # noqa: E501 + 2 + ), + ) + + def assert_list_all_equal(self, list1, list2): + """Used in lieu of `assertAllEqual`. + + This is used to replace standard `assertAllEqual` for the cases where + `list1` and `list2` contain `AggregatingVariable`. Lists with + `AggregatingVariable` are not convertible to numpy array via `np.array` + calls as numpy would raise `ValueError: setting an array element with a + sequence.` + + Args: + list1: The first list to compare equality. + list2: The second list to compare equality. 
+ """ + for lhs, rhs in zip(list1, list2): + self.assertEqual(lhs, rhs) + + def test_keras_layer_setattr(self): + class Layer(base_layer.Layer): + def __init__(self): + super().__init__() + self.w = tf.Variable([0, 1]) + self.b = tf.Variable([2, 3], trainable=False) + + with self.strategy.scope(): + layer = Layer() + + self.assertLen(layer.trainable_weights, 2) + self.assertEqual(layer.trainable_weights[0], [0]) + self.assertEqual(layer.trainable_weights[1], [1]) + self.assertLen(layer.non_trainable_weights, 2) + self.assertEqual(layer.non_trainable_weights[0], [2]) + self.assertEqual(layer.non_trainable_weights[1], [3]) + self.assert_list_all_equal( + layer.weights, layer.trainable_weights + layer.non_trainable_weights + ) + self.assert_list_all_equal( + layer.trainable_weights, layer.trainable_variables + ) + self.assert_list_all_equal(layer.weights, layer.variables) + + checkpoint_deps = set(layer._trackable_children().values()) + self.assertEqual(checkpoint_deps, set([layer.w, layer.b])) + + def test_keras_layer_add_weight(self): + class Layer(base_layer.Layer): + def __init__(self): + super().__init__() + self.w = self.add_weight( + shape=(2,), + initializer=lambda shape, dtype: tf.constant( + [0.0, 1.0], + ), + trainable=True, + ) + self.b = self.add_weight( + shape=(2,), + initializer=lambda shape, dtype: tf.constant([2.0, 3.0]), + trainable=False, + ) + + with self.strategy.scope(): + layer = Layer() + + self.assertLen(layer.trainable_weights, 2) + self.assertEqual(layer.trainable_weights[0], [0.0]) + self.assertEqual(layer.trainable_weights[1], [1.0]) + self.assertLen(layer.non_trainable_weights, 2) + self.assertEqual(layer.non_trainable_weights[0], [2.0]) + self.assertEqual(layer.non_trainable_weights[1], [3.0]) + self.assert_list_all_equal( + layer.weights, layer.trainable_weights + layer.non_trainable_weights + ) + self.assert_list_all_equal( + layer.trainable_weights, layer.trainable_variables + ) + self.assert_list_all_equal(layer.weights, layer.variables) + + checkpoint_deps = set(layer._trackable_children().values()) + self.assertEqual(checkpoint_deps, set([layer.w, layer.b])) + + def test_keras_metrics(self): + with self.strategy.scope(): + fp = keras.metrics.FalsePositives(thresholds=[0.2, 0.5, 0.7, 0.8]) + auc = keras.metrics.AUC(num_thresholds=10) + + @tf.function + def update(): + fp.update_state([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.3, 0.9]) + auc.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + + @tf.function + def reset(): + fp.reset_state() + auc.reset_state() + + update() + self.assertEqual(auc.result(), 0.75) + self.assertAllEqual(fp.result(), [2.0, 1.0, 1.0, 1.0]) + reset() + self.assertEqual(auc.result(), 0.0) + self.assertAllEqual(fp.result(), [0.0, 0.0, 0.0, 0.0]) + + self.assertTrue(hasattr(auc.true_positives, "variables")) + self.assertTrue(hasattr(fp.accumulator, "variables")) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + shard_config=[ + [2, 2], + [2, 3], + [3, 2], + [2, 1], + [1, 1], + [1, 2], + [1, 3], + ], + model_type=["dense", "embedding"], + ) + ) + def test_saved_model_combined(self, shard_config, model_type): + """Test saving and loading models with various fixed numbers of shards. + + Args: + shard_config: The number of shards to use per variable before and + after loading. For example, [1, 3] means to create and save the + model with 1 shard (i.e., no variable partitioning), and load it + into 3 shards per variable. + model_type: Either 'dense' or 'embedding', which simple model to test. 
+ """ + + def create_embedding_model(): + inputs = keras.layers.Input(shape=(6,)) + embedding = keras.layers.Embedding(output_dim=2, input_dim=6) + outputs = embedding(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer="adam", loss="mean_squared_error") + return model + + def create_dense_model(): + inputs = keras.layers.Input(shape=(6,)) + outputs = keras.layers.Dense(6)(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer="adam", loss="mean_squared_error") + return model + + # Maybe create new strategy with different number of shards + if shard_config[0] > 2: + strategy = tf.distribute.experimental.ParameterServerStrategy( + multi_worker_testing_utils.make_parameter_server_cluster(3, 3), + variable_partitioner=tf.distribute.experimental.partitioners.FixedShardsPartitioner( # noqa: E501 + shard_config[0] + ), + ) + elif shard_config[0] == 2: + strategy = self.strategy + else: + # Just one shard, so use default strategy + strategy = tf.distribute.get_strategy() + + x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32) + with strategy.scope(): + model = ( + create_dense_model() + if model_type == "dense" + else create_embedding_model() + ) + expect = model(x) + + # Dense layers have two variables (kernel and bias), embedding layers + # have 1 + n_expected_variables = shard_config[0] * ( + 2 if model_type == "dense" else 1 + ) + self.assertLen(model.variables, n_expected_variables) + model_weights = [v.numpy() for v in model.variables] + + saved_dir = self.get_temp_dir() + model.save(saved_dir) + + if shard_config[1] > 2: + strategy2 = tf.distribute.experimental.ParameterServerStrategy( + multi_worker_testing_utils.make_parameter_server_cluster(3, 3), + variable_partitioner=tf.distribute.experimental.partitioners.FixedShardsPartitioner( # noqa: E501 + shard_config[1] + ), + ) + elif shard_config[1] == 2: + strategy2 = self.strategy + else: + # Just one shard, so use default strategy + strategy2 = tf.distribute.get_strategy() + + with strategy2.scope(): + loaded_model = keras.models.load_model(saved_dir) + got = loaded_model(x) + + self.assertAllClose(got, expect) + n_expected_variables = shard_config[1] * ( + 2 if model_type == "dense" else 1 + ) + self.assertLen(loaded_model.variables, n_expected_variables) + loaded_model_weights = [v.numpy() for v in loaded_model.variables] + self.assertAllClose( + np.concatenate([w.flatten() for w in model_weights]), + np.concatenate([w.flatten() for w in loaded_model_weights]), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=strategy_combinations.strategies_minus_tpu, + model_type=["dense", "embedding"], + ) + ) + def test_saved_model_load_non_pss(self, model_type, distribution): + def create_embedding_model(): + inputs = keras.layers.Input(shape=(6,)) + embedding = keras.layers.Embedding(output_dim=2, input_dim=6) + outputs = embedding(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer="adam", loss="mean_squared_error") + return model + + def create_dense_model(): + inputs = keras.layers.Input(shape=(6,)) + outputs = keras.layers.Dense(6)(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer="adam", loss="mean_squared_error") + return model + + x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32) + with self.strategy.scope(): + model = ( + create_dense_model() + if model_type == "dense" + else create_embedding_model() + ) + expect = model(x) + + model_weights = [v.numpy() for v in model.variables] 
+ + saved_dir = self.get_temp_dir() + model.save(saved_dir) + + with distribution.scope(): + loaded_model = keras.models.load_model(saved_dir) + got = loaded_model(x) + + self.assertAllClose(got, expect) + n_expected_variables = 2 if model_type == "dense" else 1 + self.assertLen(loaded_model.variables, n_expected_variables) + loaded_model_weights = [v.numpy() for v in loaded_model.variables] + self.assertAllClose( + np.concatenate([w.flatten() for w in model_weights]), + np.concatenate([w.flatten() for w in loaded_model_weights]), + ) + + def test_slot_variable_checkpointing(self): + + with self.strategy.scope(): + # Set a name so the ShardedVariable is well-named for slot var + # keying + var = tf.Variable([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="test") + + opt = keras.optimizers.legacy.adam.Adam() + + # Run once to trigger apply_gradients to populate optimizer slot + # variables. + def train_step(): + with tf.GradientTape() as tape: + loss = sum(var) + opt.minimize(loss, var.variables, tape=tape) + + self.strategy.run(train_step) + + # Check that we can call get_slot using each slot, before and after + # Checkpointing, and get the same results + pre_ckpt_slots = [] + for slot in opt.get_slot_names(): + pre_ckpt_slots.extend([v.numpy() for v in opt.get_slot(var, slot)]) + + ckpt = tf.train.Checkpoint(var=var, opt=opt) + + # Assert that checkpoint has slots for each shard and the + # ShardedVariable + self.assertLen(ckpt.opt._slots, 3) + for var_name in ckpt.opt._slots.keys(): + self.assertLen(ckpt.opt._slots[var_name], 2) + self.assertEqual(ckpt.opt._slots[var_name].keys(), {"m", "v"}) + if hasattr(ckpt.opt._slots[var_name]["m"], "variables"): + self.assertLen(ckpt.opt._slots[var_name]["m"].variables, 2) + self.assertLen(ckpt.opt._slots[var_name]["v"].variables, 2) + + saved_dir = self.get_temp_dir() + ckpt_prefix = f"{saved_dir}/ckpt" + ckpt.save(ckpt_prefix) + + # Run once more to alter slot variables and ensure checkpoint restores + # the earlier values. + self.strategy.run(train_step) + + changed_ckpt_slots = [] + for slot in opt.get_slot_names(): + changed_ckpt_slots.extend( + [v.numpy() for v in opt.get_slot(var, slot)] + ) + self.assertNotAllClose(pre_ckpt_slots, changed_ckpt_slots) + + ckpt.restore(tf.train.latest_checkpoint(saved_dir)) + + post_ckpt_slots = [] + for slot in opt.get_slot_names(): + post_ckpt_slots.extend([v.numpy() for v in opt.get_slot(var, slot)]) + + self.assertAllClose(pre_ckpt_slots, post_ckpt_slots) + + def test_slot_variable_checkpoint_load_with_diff_shards(self): + + with self.strategy.scope(): + # Set a name so the ShardedVariable is well-named for slot var + # keying + var = tf.Variable([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="test") + + opt = keras.optimizers.legacy.adam.Adam() + + # Run once to trigger apply_gradients to populate optimizer slot + # variables. 
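For background on what the next step populates: the legacy Adam optimizer lazily creates two slot accumulators ("m" and "v") per trainable variable on the first `apply_gradients`/`minimize` call, which is why the test must run one training step before it can inspect slots. A standalone sketch in plain TensorFlow, without any distribution strategy (assumes the `tf.keras.optimizers.legacy` endpoint is available):

```python
import tensorflow as tf

v = tf.Variable([1.0, 2.0, 3.0], name="w")
opt = tf.keras.optimizers.legacy.Adam()

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(v)
opt.minimize(loss, [v], tape=tape)  # first step creates the slots

print(opt.get_slot_names())        # ['m', 'v']
print(opt.get_slot(v, "m").shape)  # (3,): one accumulator entry per element
```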
+ def train_step(): + with tf.GradientTape() as tape: + loss = sum(var) + opt.minimize(loss, var.variables, tape=tape) + + self.strategy.run(train_step) + + # Check that we can call get_slot using each slot, before and after + # Checkpointing, and get the same results + pre_ckpt_slots = [] + for slot in opt.get_slot_names(): + pre_ckpt_slots.extend( + tf.concat(list(opt.get_slot(var, slot)), axis=0).numpy() + ) + + ckpt = tf.train.Checkpoint(var=var, opt=opt) + saved_dir = self.get_temp_dir() + ckpt_prefix = f"{saved_dir}/ckpt" + ckpt.save(ckpt_prefix) + + # Create new strategy with different number of shards + strategy2 = tf.distribute.experimental.ParameterServerStrategy( + multi_worker_testing_utils.make_parameter_server_cluster(3, 2), + variable_partitioner=tf.distribute.experimental.partitioners.FixedShardsPartitioner( # noqa: E501 + 3 + ), + ) + + # Create new variable with different values, to be overwritten by ckpt. + with strategy2.scope(): + var = tf.Variable([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], name="test") + + opt = keras.optimizers.legacy.adam.Adam() + # Run once to trigger apply_gradients to populate optimizer slot + # variables. + strategy2.run(train_step) + + new_ckpt = tf.train.Checkpoint(var=var, opt=opt) + new_ckpt.restore(tf.train.latest_checkpoint(saved_dir)) + post_ckpt_slots = [] + for slot in new_ckpt.opt.get_slot_names(): + post_ckpt_slots.extend( + tf.concat( + list(new_ckpt.opt.get_slot(var, slot)), axis=0 + ).numpy() + ) + self.assertAllClose(pre_ckpt_slots, post_ckpt_slots) class ShardedVariableMixedPartitioningTest(tf.test.TestCase): - - def test_saved_model_min_size_partitioner(self): - - # set min_shard_bytes such that Dense kernel is split into 2 and bias into 1 - partitioner = tf.distribute.experimental.partitioners.MinSizePartitioner( - min_shard_bytes=(6 * 6 * 4) // 2, max_shards=2) - - cluster_resolver = multi_worker_testing_utils.make_parameter_server_cluster( - 3, 2) - strategy = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver, variable_partitioner=partitioner) - - def create_dense_model(): - inputs = keras.layers.Input(shape=(6,)) - outputs = keras.layers.Dense(6)(inputs) - model = keras.Model(inputs, outputs) - model.compile(optimizer='adam', loss='mean_squared_error') - return model - - x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32) - with strategy.scope(): - model = create_dense_model() - expect = model(x) - - # 2 kernel variables, 1 bias - self.assertLen(model.variables, 3) - - saved_dir = self.get_temp_dir() - model.save(saved_dir) - - # set min_shard_bytes such that Dense kernel is split into 3 and bias into 1 - partitioner2 = tf.distribute.experimental.partitioners.MinSizePartitioner( - min_shard_bytes=(6 * 6 * 4) // 3, max_shards=3) - strategy2 = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver, variable_partitioner=partitioner2) - - with strategy2.scope(): - loaded_model = keras.models.load_model(saved_dir) - got = loaded_model(x) - - self.assertAllClose(got, expect) - # 3 kernel variables, 1 bias - self.assertLen(loaded_model.variables, 4) - - -if __name__ == '__main__': - tf.compat.v1.enable_v2_behavior() - tf.test.main() + def test_saved_model_min_size_partitioner(self): + + # set min_shard_bytes such that Dense kernel is split into 2 and bias + # into 1 + partitioner = ( + tf.distribute.experimental.partitioners.MinSizePartitioner( + min_shard_bytes=(6 * 6 * 4) // 2, max_shards=2 + ) + ) + + cluster_resolver = ( + multi_worker_testing_utils.make_parameter_server_cluster(3, 2) + ) + 
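To make the `min_shard_bytes` comment above concrete: a `Dense(6)` layer built on 6-feature input holds a 6x6 float32 kernel (144 bytes) and a 6-element bias (24 bytes). With `min_shard_bytes = 144 // 2 = 72`, the kernel can split into two 72-byte shards while the bias is too small to split at all. A rough arithmetic sketch (the `num_shards` helper is an illustrative approximation of the partitioner's sizing rule, not its real implementation):

```python
kernel_bytes = 6 * 6 * 4  # 36 float32 weights -> 144 bytes
bias_bytes = 6 * 4        # 6 float32 biases   -> 24 bytes
min_shard_bytes = (6 * 6 * 4) // 2  # 72, as in the test above
max_shards = 2

def num_shards(size_bytes):
    # Largest shard count that keeps every shard >= min_shard_bytes,
    # capped at max_shards and never below 1.
    return max(1, min(max_shards, size_bytes // min_shard_bytes))

assert num_shards(kernel_bytes) == 2  # kernel -> 2 shards
assert num_shards(bias_bytes) == 1    # bias   -> 1 shard
# Hence the assertion below that the model exposes 3 variables in total.
```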
strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver, variable_partitioner=partitioner + ) + + def create_dense_model(): + inputs = keras.layers.Input(shape=(6,)) + outputs = keras.layers.Dense(6)(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer="adam", loss="mean_squared_error") + return model + + x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32) + with strategy.scope(): + model = create_dense_model() + expect = model(x) + + # 2 kernel variables, 1 bias + self.assertLen(model.variables, 3) + + saved_dir = self.get_temp_dir() + model.save(saved_dir) + + # set min_shard_bytes such that Dense kernel is split into 3 and bias + # into 1 + partitioner2 = ( + tf.distribute.experimental.partitioners.MinSizePartitioner( + min_shard_bytes=(6 * 6 * 4) // 3, max_shards=3 + ) + ) + strategy2 = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver, variable_partitioner=partitioner2 + ) + + with strategy2.scope(): + loaded_model = keras.models.load_model(saved_dir) + got = loaded_model(x) + + self.assertAllClose(got, expect) + # 3 kernel variables, 1 bias + self.assertLen(loaded_model.variables, 4) + + +if __name__ == "__main__": + tf.compat.v1.enable_v2_behavior() + tf.test.main() diff --git a/keras/distribute/sidecar_evaluator.py b/keras/distribute/sidecar_evaluator.py deleted file mode 100644 index 0e9cfe56c21f..000000000000 --- a/keras/distribute/sidecar_evaluator.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Python module for evaluation loop.""" - -import tensorflow.compat.v2 as tf -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.util import deprecation -from tensorflow.python.util.tf_export import keras_export - -_PRINT_EVAL_STEP_EVERY_SEC = 60.0 -_ITERATIONS_UNINITIALIZED = -1 -_CHECKPOINT_TIMEOUT_SEC = 30 - - -def list_checkpoint_attributes(ckpt_dir_or_file): - """Lists all the attributes in a checkpoint. - - Checkpoint keys are paths in a checkpoint graph, and attribute is the first - element in the path. e.g. with a checkpoint key - "optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE", optimizer is the attribute. The - attribute is also used to save/restore a variable in a checkpoint, - e.g. tf.train.Checkpoint(optimizer=optimizer, model=model). - - Args: - ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint. - - Returns: - Set of attributes in a checkpoint. - """ - reader = tf.train.load_checkpoint(ckpt_dir_or_file) - variable_map = reader.get_variable_to_shape_map() - return {name.split('/')[0] for name in variable_map.keys()} - - -@keras_export('keras.utils.SidecarEvaluator', v1=[]) -class SidecarEvaluator: - """A class designed for a dedicated evaluator task. - - `SidecarEvaluator` is expected to be run in a process on a separate machine - from the training cluster. 
It is meant for the purpose of a dedicated - evaluator, evaluating the metric results of a training cluster which has one - or more workers performing the training, and saving checkpoints. - - The `SidecarEvaluator` API is compatible with both Custom Training Loop (CTL), - and Keras `Model.fit` to be used in the training cluster. Using the model - (with compiled metrics) provided at `__init__`, `SidecarEvaluator` repeatedly - performs evaluation "epochs" when it finds a checkpoint that has not yet been - used. Depending on the `steps` argument, an eval epoch is evaluation over all - eval data, or up to certain number of steps (batches). See examples below for - how the training program should save the checkpoints in order to be recognized - by `SidecarEvaluator`. - - Since under the hood, `SidecarEvaluator` uses `model.evaluate` for evaluation, - it also supports arbitrary Keras callbacks. That is, if one or more callbacks - are provided, their `on_test_batch_begin` and `on_test_batch_end` methods are - called at the start and end of a batch, and their `on_test_begin` and - `on_test_end` are called at the start and end of an evaluation epoch. Note - that `SidecarEvaluator` may skip some checkpoints because it always picks up - the latest checkpoint available, and during an evaluation epoch, multiple - checkpoints can be produced from the training side. - - Example: - ```python - model = tf.keras.models.Sequential(...) - model.compile(metrics=tf.keras.metrics.SparseCategoricalAccuracy( - name="eval_metrics")) - data = tf.data.Dataset.from_tensor_slices(...) - - tf.keras.SidecarEvaluator( - model=model, - data=data, - checkpoint_dir='/tmp/checkpoint_dir', # dir for training-saved checkpoint - steps=None, # Eval until dataset is exhausted - max_evaluations=None, # The evaluation needs to be stopped manually - callbacks=[tf.keras.callbacks.TensorBoard(log_dir='/tmp/log_dir')] - ).start() - ``` - - `SidecarEvaluator.start` writes a series of summary - files which can be visualized by tensorboard (which provides a webpage link): - - ```bash - $ tensorboard --logdir=/tmp/log_dir/validation - ... - TensorBoard 2.4.0a0 at http://host:port (Press CTRL+C to quit) - ``` - - If the training cluster uses a CTL, the `checkpoint_dir` should contain - checkpoints that track both `model` and `optimizer`, to fulfill - `SidecarEvaluator`'s expectation. This can be done by a - `tf.train.Checkpoint` and a `tf.train.CheckpointManager`: - - ```python - checkpoint_dir = ... # Same `checkpoint_dir` supplied to `SidecarEvaluator`. - checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, checkpoint_dir=..., max_to_keep=...) - checkpoint_manager.save() - ``` - - If the training cluster uses Keras `Model.fit` API, a - `tf.keras.callbacks.ModelCheckpoint` should be used, with - `save_weights_only=True`, and the `filepath` should have 'ckpt-{epoch}' - appended: - - ```python - checkpoint_dir = ... # Same `checkpoint_dir` supplied to `SidecarEvaluator`. - model_checkpoint = tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(checkpoint_dir, 'ckpt-{epoch}'), - save_weights_only=True) - model.fit(dataset, epochs, callbacks=[model_checkpoint]) - ``` - """ - - def __init__(self, - model, - data, - checkpoint_dir, - steps=None, - max_evaluations=None, - callbacks=None): - """Initializes an `SidecarEvaluator` object. - - Args: - model: Model to use for evaluation. 
The model object used here should be a - `tf.keras.Model`, and should be the same as the one that is used in - training, where `tf.keras.Model`s are checkpointed. The model should - have one or more metrics compiled before using `SidecarEvaluator`. - data: The input data for evaluation. `SidecarEvaluator` supports all data - types that Keras `model.evaluate` supports as the input data `x`, such - as a `tf.data.Dataset`. - checkpoint_dir: Directory where checkpoint files are saved. - steps: Number of steps to perform evaluation for, when evaluating a single - checkpoint file. If `None`, evaluation continues until the dataset is - exhausted. For repeated evaluation dataset, user must specify `steps` to - avoid infinite evaluation loop. - max_evaluations: Maximum number of the checkpoint file to be evaluated, - for `SidecarEvaluator` to know when to stop. The evaluator will stop - after it evaluates a checkpoint filepath ending with - '-'. If using - `tf.train.CheckpointManager.save` for saving checkpoints, the kth saved - checkpoint has the filepath suffix '-' (k=1 for the first - saved), and if checkpoints are saved every epoch after training, the - filepath saved at the kth epoch would end with '-. Thus, - if training runs for n epochs, and the evaluator should end after the - training finishes, use n for this parameter. Note that this is not - necessarily equal to the number of total evaluations, since some - checkpoints may be skipped if evaluation is slower than checkpoint - creation. If `None`, `SidecarEvaluator` will evaluate indefinitely, and - the user must terminate evaluator program themselves. - callbacks: List of `keras.callbacks.Callback` instances to apply during - evaluation. See [callbacks](/api_docs/python/tf/keras/callbacks). - """ - self.model = model - self.data = data - self.checkpoint_dir = checkpoint_dir - self._iterations = tf.Variable( - name='iterations', - initial_value=_ITERATIONS_UNINITIALIZED, - dtype=tf.int64) - self.max_evaluations = max_evaluations - self.steps = steps - self.callbacks = callbacks or [] - - def _timeout_fn(self): - logging.info( - f'No checkpoints appear to be found after {_CHECKPOINT_TIMEOUT_SEC} ' - 'seconds. Please check if you are properly using a ' - '`tf.train.Checkpoint/CheckpointManager` or ' - '`tf.keras.callbacks.ModelCheckpoint(save_weights_only=True)` to save ' - 'checkpoints by the training. See ' - '`tf.keras.SidecarEvaluator` doc for recommended flows ' - 'of saving checkpoints.') - return False - - def start(self): - """Starts the evaluation loop.""" - optimizer_checkpoint = tf.train.Checkpoint(iter=self._iterations) - checkpoint = tf.train.Checkpoint( - model=self.model, optimizer=optimizer_checkpoint) - - for latest_checkpoint in tf.train.checkpoints_iterator( - self.checkpoint_dir, - timeout=_CHECKPOINT_TIMEOUT_SEC, - timeout_fn=self._timeout_fn): - try: - # `expect_partial` because the checkpoint can have other `Trackable`s - # such as `optimizer`. - checkpoint.restore(latest_checkpoint).expect_partial() - checkpoint_attributes = list_checkpoint_attributes(latest_checkpoint) - # The checkpoint should contain model and optimizer for SidecarEvaluator - # to work. But the model weights saved by ModelCheckpoint callback does - # not contain model as an attribute. To make SidecarEvaluator compatibly - # work in this case, use model.load_weights to load the model's weights, - # while self._iterations is still restored by checkpoint variable. 
- if 'model' not in checkpoint_attributes: - self.model.load_weights(latest_checkpoint) - # The model checkpoint might not include optimizer in cases, e.g. - # using a custom training loop. Directly assign the iterations - # property to be used in callbacks. - if self.model.optimizer: - self.model.optimizer.iterations.assign(self._iterations) - except (tf.errors.OpError,) as e: - # A couple errors can happen here with the coordinator racing to write - # checkpoint: - # 1) OpError: open failed for <file path>: No such file or directory - # 2) NotFoundError (subclass of OpError): Unsuccessful - # TensorSliceReader constructor. - # TODO(rchao): Remove this except block once b/150954027 is resolved. - logging.info( - 'SidecarEvaluator encountered an error when loading the checkpoint ' - f'at {latest_checkpoint}. Retrying. ' - f'Error: {e.__class__.__name__}: {e}') - continue - - if self._iterations.numpy() == _ITERATIONS_UNINITIALIZED: - raise RuntimeError( - 'Variable `iterations` cannot be loaded from the ' - f'checkpoint file at {self.checkpoint_dir}. ' - 'Please ensure `iterations` is ' - 'included in the checkpoint saved during training.') - - logging.info( - 'Evaluation starts: Model weights loaded from latest ' - f'checkpoint file {latest_checkpoint}') - - self.model.evaluate( - self.data, steps=self.steps, callbacks=self.callbacks, verbose=2) - - return_metrics = {} - for metric in self.model.metrics: - result = metric.result() - if isinstance(result, dict): - return_metrics.update(result) - else: - return_metrics[metric.name] = result - - logging.info( - 'End of evaluation. Metrics: %s', ' '.join([ - '{}={}'.format(name, value.numpy()) - for name, value in return_metrics.items() - ])) - - if (self.max_evaluations and - (self.max_evaluations <= int(latest_checkpoint.split('-')[-1]))): - # Exit the loop because we have evaluated the final checkpoint file. - logging.info('Last checkpoint evaluated. SidecarEvaluator stops.') - return - - -@keras_export('keras.experimental.SidecarEvaluator', v1=[]) -@deprecation.deprecated_endpoints('keras.experimental.SidecarEvaluator') -class SidecarEvaluatorExperimental(SidecarEvaluator): - """Deprecated. Please use `tf.keras.utils.SidecarEvaluator` instead. - - Caution: `tf.keras.experimental.SidecarEvaluator` endpoint is - deprecated and will be removed in a future release. Please use - `tf.keras.utils.SidecarEvaluator`. - """ - - def __init__(self, *args, **kwargs): - logging.warning( - '`tf.keras.experimental.SidecarEvaluator` endpoint is ' - 'deprecated and will be removed in a future release. Please use ' - '`tf.keras.utils.SidecarEvaluator`.') - super().__init__(*args, **kwargs) diff --git a/keras/distribute/sidecar_evaluator_test.py b/keras/distribute/sidecar_evaluator_test.py deleted file mode 100644 index 0d5b54dbd419..000000000000 --- a/keras/distribute/sidecar_evaluator_test.py +++ /dev/null @@ -1,305 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# ============================================================================== -"""Test covering sidecar_evaluator.py.""" - -import enum -import os -import threading -import time - -from absl.testing import parameterized -import keras -from keras.distribute import sidecar_evaluator as sidecar_evaluator_lib -from keras.optimizers.optimizer_v2 import gradient_descent -from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.platform import tf_logging as logging - -_BATCH_SIZE = 32 - - -class TestModel(keras.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense = keras.layers.Dense(10) - - def call(self, inputs): - return self.dense(inputs) - - -class DictMetric(keras.metrics.MeanSquaredError): - - def result(self): - res = super().result() - return {'mean_squared_error_1': res, 'mean_squared_error_2': res} - - -class ModelType(enum.Enum): - SEQUENTIAL = 'sequential' - SUBCLASS = 'subclass' - - -def _test_model_builder(model_type: ModelType, compile_model, build_model): - if model_type == ModelType.SEQUENTIAL: - model = keras.Sequential([keras.layers.Dense(10)]) - elif model_type == ModelType.SUBCLASS: - model = TestModel() - - if compile_model: - model.compile( - gradient_descent.SGD(), - loss='mse', - metrics=[keras.metrics.CategoricalAccuracy(), - DictMetric()]) - if build_model: - model.build((None, 32)) - - return model - - -@test_utils.run_v2_only -class SidecarEvaluatorTest(tf.test.TestCase, parameterized.TestCase): - - def assertSummaryEventsWritten(self, log_dir): - # Asserts summary files do get written when log_dir is provided. - summary_files = tf.io.gfile.listdir(log_dir) - self.assertNotEmpty( - summary_files, 'Summary should have been written and ' - 'log_dir should not be empty.') - - # Asserts the content of the summary file. - event_pb_written = False - event_tags = [] - for summary_file in summary_files: - for event_pb in tf.compat.v1.train.summary_iterator( - os.path.join(log_dir, summary_file)): - if event_pb.step > 0: - self.assertEqual(event_pb.step, 32) - event_tags.append(event_pb.summary.value[0].tag) - event_pb_written = True - self.assertCountEqual(event_tags, [ - 'evaluation_categorical_accuracy_vs_iterations', - 'evaluation_loss_vs_iterations', - 'evaluation_mean_squared_error_1_vs_iterations', - 'evaluation_mean_squared_error_2_vs_iterations', - ]) - - # Verifying at least one non-zeroth step is written to summary. - self.assertTrue(event_pb_written) - - def assertModelsSameVariables(self, model_a, model_b): - # Check both have the same number of variables. - self.assertEqual(len(model_a.variables), len(model_b.variables)) - - # Check variable values to be equal. 
- for var_a, var_b in zip(model_a.variables, model_b.variables): - self.assertAllEqual(var_a.numpy(), var_b.numpy()) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], model_type=[ModelType.SEQUENTIAL, - ModelType.SUBCLASS])) - def testIterationsNotSavedWillRaiseError(self, model_type): - model = _test_model_builder( - model_type=model_type, compile_model=False, build_model=True) - - checkpoint_dir = self.get_temp_dir() - checkpoint = tf.train.Checkpoint(model=model) - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, checkpoint_dir, max_to_keep=2) - checkpoint_manager.save() - - sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( - model, data=None, checkpoint_dir=checkpoint_dir) - with self.assertRaisesRegex( - RuntimeError, '`iterations` cannot be loaded ' - 'from the checkpoint file.'): - sidecar_evaluator.start() - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], model_type=[ModelType.SEQUENTIAL, - ModelType.SUBCLASS])) - def testModelNotBuiltRaiseError(self, model_type): - model = _test_model_builder( - model_type=model_type, compile_model=False, build_model=False) - - checkpoint_dir = self.get_temp_dir() - checkpoint = tf.train.Checkpoint(model=model) - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, checkpoint_dir, max_to_keep=2) - checkpoint_manager.save() - - sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( - model, data=None, checkpoint_dir=checkpoint_dir) - with self.assertRaisesRegex(AssertionError, 'Nothing to load.'): - sidecar_evaluator.start() - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], - model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS], - build_model=[True, False])) - def testSidecarEvaluatorOutputsSummary(self, model_type, build_model): - # Create a model with synthetic data, and fit for one epoch. - model = _test_model_builder( - model_type=model_type, compile_model=True, build_model=False) - data = np.random.random((1000, 32)) - labels = np.random.random((1000, 10)) - dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - dataset = dataset.batch(32) - model.fit(dataset, epochs=1) - - # Save a checkpoint. - checkpoint_dir = os.path.join(self.get_temp_dir(), 'ckpt') - log_dir = os.path.join(self.get_temp_dir(), 'summary') - logging.info('checkpoint_dir = %s, log_dir = %s', checkpoint_dir, log_dir) - checkpoint = tf.train.Checkpoint( - model=model, optimizer=model.optimizer) - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, checkpoint_dir, max_to_keep=2) - logging.info('Checkpoint manager saved to: %s', checkpoint_manager.save()) - self.assertNotEmpty( - tf.io.gfile.listdir(checkpoint_dir), - 'Checkpoint should have been written and ' - 'checkpoint_dir should not be empty.') - - # Create a new model used for evaluation. - eval_model = _test_model_builder( - model_type=model_type, compile_model=True, build_model=build_model) - # Have a sidecar_evaluator evaluate once. - sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( - eval_model, - data=dataset, - checkpoint_dir=checkpoint_dir, - max_evaluations=1, - callbacks=[keras.callbacks.TensorBoard(log_dir=log_dir)]) - sidecar_evaluator.start() - # Eval model has been restored to the same state as the original model, so - # their weights should match. If not, restoration of the model didn't - # work. 
- self.assertModelsSameVariables(model, eval_model) - - self.assertSummaryEventsWritten(os.path.join(log_dir, 'validation')) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], - model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS], - build_model=[True, False])) - def testSidecarEvaluatorOutputsSummarySavedWithCallback( - self, model_type, build_model): - checkpoint_dir = os.path.join(self.get_temp_dir(), 'checkpoints') - log_dir = os.path.join(self.get_temp_dir(), 'summary') - # Create a model with synthetic data, and fit for one epoch. - model = _test_model_builder( - model_type=model_type, compile_model=True, build_model=False) - data = np.random.random((1000, 32)) - labels = np.random.random((1000, 10)) - dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - dataset = dataset.batch(_BATCH_SIZE) - save_callback = keras.callbacks.ModelCheckpoint( - filepath=os.path.join(checkpoint_dir, 'ckpt-{epoch}'), - save_weights_only=True) - model.fit(dataset, epochs=1, callbacks=[save_callback]) - self.assertNotEmpty( - tf.io.gfile.listdir(checkpoint_dir), - 'Checkpoint should have been written and ' - 'checkpoint_dir should not be empty.') - - # Create a new model used for evaluation. - eval_model = _test_model_builder( - model_type=model_type, compile_model=True, build_model=build_model) - # Have an sidecar_evaluator evaluate once. - sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( - eval_model, - data=dataset, - checkpoint_dir=checkpoint_dir, - max_evaluations=1, - callbacks=[keras.callbacks.TensorBoard(log_dir=log_dir)]) - with self.assertLogs() as cm: - sidecar_evaluator.start() - - metrics_logging = [ - line for line in cm.output if 'End of evaluation' in line - ] - self.assertLen(metrics_logging, 1) - expected_logged_metrics = [ - 'loss', 'categorical_accuracy', 'mean_squared_error_1', - 'mean_squared_error_2' - ] - for metric_name in expected_logged_metrics: - self.assertRegex(metrics_logging[0], f'{metric_name}=') - - # Eval model has been restored to the same state as the original model, so - # their weights should match. If not, restoration of the model didn't - # work. - self.assertModelsSameVariables(model, eval_model) - - # check the iterations is restored. - self.assertEqual(sidecar_evaluator._iterations.numpy(), _BATCH_SIZE) - - self.assertSummaryEventsWritten(os.path.join(log_dir, 'validation')) - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], - model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS], - build_model=[True, False])) - def testTimeoutFunction(self, model_type, build_model): - checkpoint_dir = os.path.join(self.get_temp_dir(), 'checkpoints') - # Create a model with synthetic data, and fit for one epoch. - data = np.random.random((1000, 32)) - labels = np.random.random((1000, 10)) - dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - dataset = dataset.batch(_BATCH_SIZE) - - # Create a new model used for evaluation. - eval_model = _test_model_builder( - model_type=model_type, compile_model=True, build_model=build_model) - # Have an sidecar_evaluator evaluate once. 
- sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( - eval_model, - data=dataset, - checkpoint_dir=checkpoint_dir, - max_evaluations=1) - with self.assertLogs() as cm: - threading.Thread(target=sidecar_evaluator.start, daemon=True).start() - time.sleep(50) - - metrics_logging = [ - l for l in cm.output if 'No checkpoints appear to be found' in l - ] - self.assertGreaterEqual(len(metrics_logging), 1) - - def testExperimentalDeprecatedMessage(self): - - warning_messages = [] - - def warning(msg): - warning_messages.append(msg) - - with tf.compat.v1.test.mock.patch.object(logging, 'warning', warning): - sidecar_evaluator_lib.SidecarEvaluatorExperimental(None, None, None) - - warning_msg = ('`tf.keras.experimental.SidecarEvaluator` ' - 'endpoint is deprecated') - self.assertIn(warning_msg, '\n'.join(warning_messages)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/distribute/simple_models.py b/keras/distribute/simple_models.py index e9f751fc87d7..0b5384e12f85 100644 --- a/keras/distribute/simple_models.py +++ b/keras/distribute/simple_models.py @@ -14,126 +14,115 @@ # ============================================================================== """A simple functional keras model with one layer.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.distribute import model_collection_base -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent _BATCH_SIZE = 10 def _get_data_for_simple_models(): - x_train = tf.constant(np.random.rand(1000, 3), dtype=tf.float32) - y_train = tf.constant(np.random.rand(1000, 5), dtype=tf.float32) - x_predict = tf.constant( - np.random.rand(1000, 3), dtype=tf.float32) + x_train = tf.constant(np.random.rand(1000, 3), dtype=tf.float32) + y_train = tf.constant(np.random.rand(1000, 5), dtype=tf.float32) + x_predict = tf.constant(np.random.rand(1000, 3), dtype=tf.float32) - return x_train, y_train, x_predict + return x_train, y_train, x_predict class SimpleFunctionalModel(model_collection_base.ModelAndInput): - """A simple functional model and its inputs.""" + """A simple functional model and its inputs.""" - def get_model(self, **kwargs): - output_name = 'output_1' + def get_model(self, **kwargs): + output_name = "output_1" - x = keras.layers.Input(shape=(3,), dtype=tf.float32) - y = keras.layers.Dense(5, dtype=tf.float32, name=output_name)(x) + x = keras.layers.Input(shape=(3,), dtype=tf.float32) + y = keras.layers.Dense(5, dtype=tf.float32, name=output_name)(x) - model = keras.Model(inputs=x, outputs=y) - optimizer = gradient_descent.SGD(learning_rate=0.001) - model.compile( - loss='mse', - metrics=['mae'], - optimizer=optimizer) + model = keras.Model(inputs=x, outputs=y) + optimizer = gradient_descent.SGD(learning_rate=0.001) + model.compile(loss="mse", metrics=["mae"], optimizer=optimizer) - return model + return model - def get_data(self): - return _get_data_for_simple_models() + def get_data(self): + return _get_data_for_simple_models() - def get_batch_size(self): - return _BATCH_SIZE + def get_batch_size(self): + return _BATCH_SIZE class SimpleSequentialModel(model_collection_base.ModelAndInput): - """A simple sequential model and its inputs.""" + """A simple sequential model and its inputs.""" - def get_model(self, **kwargs): - output_name = 'output_1' + def get_model(self, **kwargs): + output_name = "output_1" - model = keras.Sequential() - y = keras.layers.Dense( - 5, dtype=tf.float32, name=output_name, 
input_dim=3) - model.add(y) - optimizer = gradient_descent.SGD(learning_rate=0.001) - model.compile( - loss='mse', - metrics=['mae'], - optimizer=optimizer) + model = keras.Sequential() + y = keras.layers.Dense( + 5, dtype=tf.float32, name=output_name, input_dim=3 + ) + model.add(y) + optimizer = gradient_descent.SGD(learning_rate=0.001) + model.compile(loss="mse", metrics=["mae"], optimizer=optimizer) - return model + return model - def get_data(self): - return _get_data_for_simple_models() + def get_data(self): + return _get_data_for_simple_models() - def get_batch_size(self): - return _BATCH_SIZE + def get_batch_size(self): + return _BATCH_SIZE class _SimpleModel(keras.Model): + def __init__(self): + super().__init__() + self._dense_layer = keras.layers.Dense(5, dtype=tf.float32) - def __init__(self): - super().__init__() - self._dense_layer = keras.layers.Dense(5, dtype=tf.float32) - - def call(self, inputs): - return self._dense_layer(inputs) + def call(self, inputs): + return self._dense_layer(inputs) class SimpleSubclassModel(model_collection_base.ModelAndInput): - """A simple subclass model and its data.""" + """A simple subclass model and its data.""" - def get_model(self, **kwargs): - model = _SimpleModel() - optimizer = gradient_descent.SGD(learning_rate=0.001) - model.compile( - loss='mse', - metrics=['mae'], - cloning=False, - optimizer=optimizer) + def get_model(self, **kwargs): + model = _SimpleModel() + optimizer = gradient_descent.SGD(learning_rate=0.001) + model.compile( + loss="mse", metrics=["mae"], cloning=False, optimizer=optimizer + ) - return model + return model - def get_data(self): - return _get_data_for_simple_models() + def get_data(self): + return _get_data_for_simple_models() - def get_batch_size(self): - return _BATCH_SIZE + def get_batch_size(self): + return _BATCH_SIZE class _SimpleModule(tf.Module): + def __init__(self): + self.v = tf.Variable(3.0) - def __init__(self): - self.v = tf.Variable(3.0) - - @tf.function - def __call__(self, x): - return self.v * x + @tf.function + def __call__(self, x): + return self.v * x class SimpleTFModuleModel(model_collection_base.ModelAndInput): - """A simple model based on tf.Module and its data.""" + """A simple model based on tf.Module and its data.""" - def get_model(self, **kwargs): - model = _SimpleModule() - return model + def get_model(self, **kwargs): + model = _SimpleModule() + return model - def get_data(self): - return _get_data_for_simple_models() + def get_data(self): + return _get_data_for_simple_models() - def get_batch_size(self): - return _BATCH_SIZE + def get_batch_size(self): + return _BATCH_SIZE diff --git a/keras/distribute/strategy_combinations.py b/keras/distribute/strategy_combinations.py index 5b38b9a24aa0..8261e2386ce7 100644 --- a/keras/distribute/strategy_combinations.py +++ b/keras/distribute/strategy_combinations.py @@ -16,7 +16,6 @@ import tensorflow.compat.v2 as tf - multidevice_strategies = [ tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, @@ -26,7 +25,7 @@ multiworker_strategies = [ tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x2_gpu + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x2_gpu, ] strategies_minus_default_minus_tpu = [ @@ -34,7 +33,7 @@ tf.__internal__.distribute.combinations.one_device_strategy_gpu, 
tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, - tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu + tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu, # noqa: E501 ] strategies_minus_tpu = [ @@ -43,13 +42,13 @@ tf.__internal__.distribute.combinations.one_device_strategy_gpu, tf.__internal__.distribute.combinations.mirrored_strategy_with_gpu_and_cpu, tf.__internal__.distribute.combinations.mirrored_strategy_with_two_gpus, - tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu + tf.__internal__.distribute.combinations.central_storage_strategy_with_gpu_and_cpu, # noqa: E501 ] multi_worker_mirrored_strategies = [ tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_cpu, tf.__internal__.distribute.combinations.multi_worker_mirrored_2x1_gpu, - tf.__internal__.distribute.combinations.multi_worker_mirrored_2x2_gpu + tf.__internal__.distribute.combinations.multi_worker_mirrored_2x2_gpu, ] tpu_strategies = [ @@ -57,13 +56,13 @@ ] parameter_server_strategies_single_worker = [ - tf.__internal__.distribute.combinations.parameter_server_strategy_1worker_2ps_cpu, - tf.__internal__.distribute.combinations.parameter_server_strategy_1worker_2ps_1gpu, + tf.__internal__.distribute.combinations.parameter_server_strategy_1worker_2ps_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.parameter_server_strategy_1worker_2ps_1gpu, # noqa: E501 ] parameter_server_strategies_multi_worker = [ - tf.__internal__.distribute.combinations.parameter_server_strategy_3worker_2ps_cpu, - tf.__internal__.distribute.combinations.parameter_server_strategy_3worker_2ps_1gpu, + tf.__internal__.distribute.combinations.parameter_server_strategy_3worker_2ps_cpu, # noqa: E501 + tf.__internal__.distribute.combinations.parameter_server_strategy_3worker_2ps_1gpu, # noqa: E501 ] all_strategies = strategies_minus_tpu + tpu_strategies diff --git a/keras/distribute/test_example.py b/keras/distribute/test_example.py index 5d6e5981d2ef..aa216592b781 100644 --- a/keras/distribute/test_example.py +++ b/keras/distribute/test_example.py @@ -14,78 +14,95 @@ # ============================================================================== """A simple network to use in tests and examples.""" +import tensorflow.compat.v2 as tf + from keras.legacy_tf_layers import core from keras.legacy_tf_layers import normalization -from keras.optimizers.optimizer_v2 import optimizer_v2 - -import tensorflow.compat.v2 as tf +from keras.optimizers.legacy import optimizer_v2 def minimize_loss_example(optimizer, use_bias=False, use_callable_loss=True): - """Example of non-distribution-aware legacy code.""" - - def dataset_fn(): - dataset = tf.data.Dataset.from_tensors([[1.]]).repeat() - # TODO(isaprykin): batch with drop_remainder causes shapes to be - # fully defined for TPU. Remove this when XLA supports dynamic shapes. - return dataset.batch(1, drop_remainder=True) - - layer = core.Dense(1, use_bias=use_bias) - - def model_fn(x): - """A very simple model written by the user.""" - - def loss_fn(): - y = tf.reshape(layer(x), []) - tf.constant(1.) 
- return y * y - - if isinstance(optimizer, optimizer_v2.OptimizerV2): - return optimizer.minimize(loss_fn, lambda: layer.trainable_variables) - elif use_callable_loss: - return optimizer.minimize(loss_fn) - else: - return optimizer.minimize(loss_fn()) - - return model_fn, dataset_fn, layer - - -def batchnorm_example(optimizer_fn, - batch_per_epoch=1, - momentum=0.9, - renorm=False, - update_ops_in_replica_mode=False): - """Example of non-distribution-aware legacy code with batch normalization.""" - - def dataset_fn(): - # input shape is [16, 8], input values are increasing in both dimensions. - return tf.data.Dataset.from_tensor_slices( - [[[float(x * 8 + y + z * 100) - for y in range(8)] - for x in range(16)] - for z in range(batch_per_epoch)]).repeat() - - optimizer = optimizer_fn() - batchnorm = normalization.BatchNormalization( - renorm=renorm, momentum=momentum, fused=False) - layer = core.Dense(1, use_bias=False) - - def model_fn(x): - """A model that uses batchnorm.""" - - def loss_fn(): - y = batchnorm(x, training=True) - with tf.control_dependencies( - tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) - if update_ops_in_replica_mode else []): - loss = tf.reduce_mean( - tf.reduce_sum(layer(y)) - tf.constant(1.)) - # `x` and `y` will be fetched by the gradient computation, but not `loss`. - return loss - - if isinstance(optimizer, optimizer_v2.OptimizerV2): - return optimizer.minimize(loss_fn, lambda: layer.trainable_variables) - - # Callable loss. - return optimizer.minimize(loss_fn) - - return model_fn, dataset_fn, batchnorm + """Example of non-distribution-aware legacy code.""" + + def dataset_fn(): + dataset = tf.data.Dataset.from_tensors([[1.0]]).repeat() + # TODO(isaprykin): batch with drop_remainder causes shapes to be + # fully defined for TPU. Remove this when XLA supports dynamic shapes. + return dataset.batch(1, drop_remainder=True) + + layer = core.Dense(1, use_bias=use_bias) + + def model_fn(x): + """A very simple model written by the user.""" + + def loss_fn(): + y = tf.reshape(layer(x), []) - tf.constant(1.0) + return y * y + + if isinstance(optimizer, optimizer_v2.OptimizerV2): + return optimizer.minimize( + loss_fn, lambda: layer.trainable_variables + ) + elif use_callable_loss: + return optimizer.minimize(loss_fn) + else: + return optimizer.minimize(loss_fn()) + + return model_fn, dataset_fn, layer + + +def batchnorm_example( + optimizer_fn, + batch_per_epoch=1, + momentum=0.9, + renorm=False, + update_ops_in_replica_mode=False, +): + """Example of non-distribution-aware legacy code with batch + normalization.""" + + def dataset_fn(): + # input shape is [16, 8], input values are increasing in both + # dimensions. + return tf.data.Dataset.from_tensor_slices( + [ + [ + [float(x * 8 + y + z * 100) for y in range(8)] + for x in range(16) + ] + for z in range(batch_per_epoch) + ] + ).repeat() + + optimizer = optimizer_fn() + batchnorm = normalization.BatchNormalization( + renorm=renorm, momentum=momentum, fused=False + ) + layer = core.Dense(1, use_bias=False) + + def model_fn(x): + """A model that uses batchnorm.""" + + def loss_fn(): + y = batchnorm(x, training=True) + with tf.control_dependencies( + tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) + if update_ops_in_replica_mode + else [] + ): + loss = tf.reduce_mean( + tf.reduce_sum(layer(y)) - tf.constant(1.0) + ) + # `x` and `y` will be fetched by the gradient computation, but not + # `loss`. 
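+            # The `control_dependencies` wrapper above makes the legacy
+            # UPDATE_OPS collection (batchnorm moving-statistics updates) run
+            # before the loss is computed when `update_ops_in_replica_mode`
+            # is set.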
+ return loss + + if isinstance(optimizer, optimizer_v2.OptimizerV2): + return optimizer.minimize( + loss_fn, lambda: layer.trainable_variables + ) + + # Callable loss. + return optimizer.minimize(loss_fn) + + return model_fn, dataset_fn, batchnorm diff --git a/keras/distribute/tpu_strategy_test_utils.py b/keras/distribute/tpu_strategy_test_utils.py index 8a167fbb40bb..f94c3d3cf2ea 100644 --- a/keras/distribute/tpu_strategy_test_utils.py +++ b/keras/distribute/tpu_strategy_test_utils.py @@ -15,7 +15,6 @@ """Utility functions for tests using TPUStrategy.""" import tensorflow.compat.v2 as tf - from absl import flags FLAGS = flags.FLAGS @@ -25,16 +24,16 @@ def get_tpu_cluster_resolver(): - resolver = tf.distribute.cluster_resolver.TPUClusterResolver( - tpu=FLAGS.tpu, - zone=FLAGS.zone, - project=FLAGS.project, - ) - return resolver + resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, + zone=FLAGS.zone, + project=FLAGS.project, + ) + return resolver def get_tpu_strategy(): - resolver = get_tpu_cluster_resolver() - tf.config.experimental_connect_to_cluster(resolver) - tf.tpu.experimental.initialize_tpu_system(resolver) - return tf.distribute.experimental.TPUStrategy(resolver) + resolver = get_tpu_cluster_resolver() + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + return tf.distribute.experimental.TPUStrategy(resolver) diff --git a/keras/distribute/worker_training_state.py b/keras/distribute/worker_training_state.py index ff550dae11a1..335feedc8174 100644 --- a/keras/distribute/worker_training_state.py +++ b/keras/distribute/worker_training_state.py @@ -14,126 +14,212 @@ # ============================================================================== """Training state management.""" +import os + import tensorflow.compat.v2 as tf -import os from keras import backend from keras.distribute import distributed_file_utils from keras.utils import mode_keys -# Constant for `tf.keras.Model` attribute to store the epoch at which the most -# recently saved checkpoint was saved. -CKPT_SAVED_EPOCH = '_ckpt_saved_epoch' +# isort: off +from keras.distribute.distributed_file_utils import ( + support_on_demand_checkpoint_callback, +) # noqa: E501 -CKPT_SAVED_EPOCH_UNUSED_VALUE = -1 +MAX_CHECKPOINT_TO_KEEP = 1 -class WorkerTrainingState: - """Training state management class. - - This class provides apis for backing up and restoring the training state. - This allows model and epoch information to be saved periodically and restore - for fault-tolerance, also known as preemption-recovery purpose. - """ - - def __init__(self, model, checkpoint_dir): - self._model = model - - # The epoch at which the checkpoint is saved. Used for fault-tolerance. - # GPU device only has int64 dtype registered VarHandleOp. - self._ckpt_saved_epoch = tf.Variable( - initial_value=tf.constant( - CKPT_SAVED_EPOCH_UNUSED_VALUE, dtype=tf.int64), - name='ckpt_saved_epoch') - - # Variable initialization. - backend.set_value(self._ckpt_saved_epoch, CKPT_SAVED_EPOCH_UNUSED_VALUE) - - # _ckpt_saved_epoch gets tracked and is included in the checkpoint file - # when backing up. - checkpoint = tf.train.Checkpoint( - model=self._model, ckpt_saved_epoch=self._ckpt_saved_epoch, - train_counter=self._model._train_counter) - - # If this is single-worker training, checkpoint_dir are the same for - # write_checkpoint_manager and read_checkpoint_manager. 
-    #
-    # If this is multi-worker training, and this worker should not
-    # save checkpoint, we replace the write_checkpoint_manager's checkpoint_dir
-    # with a temp filepath, so it writes to a file that will be removed at the
-    # end of back_up() call. This is necessary because the SyncOnReadVariable
-    # needs to be synced across all the workers in order to be read, and all
-    # workers need to perform `save()`.
-    # But all workers should restore from the same checkpoint_dir as passed in
-    # read_checkpoint_manager.
-    self.read_checkpoint_manager = tf.train.CheckpointManager(
-        checkpoint,
-        directory=os.path.join(checkpoint_dir, 'chief'),
-        max_to_keep=1)
-    write_checkpoint_dir = distributed_file_utils.write_dirpath(
-        checkpoint_dir, self._model.distribute_strategy)
-    if self._model.distribute_strategy.extended.should_checkpoint:
-      self.write_checkpoint_manager = self.read_checkpoint_manager
-    else:
-      self.write_checkpoint_manager = tf.train.CheckpointManager(
-          checkpoint, directory=write_checkpoint_dir, max_to_keep=1)
-
-  def back_up(self, epoch):
-    """Back up the current state of training into a checkpoint file.
-
-    Args:
-      epoch: The current epoch information to be saved.
-    """
-    backend.set_value(self._ckpt_saved_epoch, epoch)
-    # Save the model plus CKPT_SAVED_EPOCH variable.
-    if self.write_checkpoint_manager.save():
-      distributed_file_utils.remove_temp_dirpath(
-          self.write_checkpoint_manager.directory,
-          self._model.distribute_strategy)
-
-  def restore(self):
-    """Restore the training state from the backed up checkpoint file.
-
-    Returns:
-      True if the training state is successfully restored. False if the training
-      state doesn't need to be restored, or error occurred so it can't.
-    """
-    self.read_checkpoint_manager.restore_or_initialize()

-  def delete_backup(self):
-    """Delete the backup directories.
+class WorkerTrainingState:
+    """Training state management class.

-    Delete the backup directories which should not exist after `fit()`
-    successfully finishes.
-    """
-    if self.write_checkpoint_manager is self.read_checkpoint_manager:
-      try:
-        tf.io.gfile.rmtree(self.write_checkpoint_manager.directory)
-      except tf.errors.NotFoundError:
-        pass
-
-  def maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode):
-    """Maybe load initial epoch from ckpt considering possible worker recovery.
-
-    When `_ckpt_saved_epoch` attribute exists and is not
-    `CKPT_SAVED_EPOCH_UNUSED_VALUE`, this is under multi-worker training setting
-    and indicates the worker is recovering from previous failure. In this case,
-    infer `initial_epoch` from `self._ckpt_saved_epoch` to continue previous
-    unfinished training from certain epoch.
-
-    Args:
-      initial_epoch: The original initial_epoch user passes in in `fit()`.
-      mode: The mode for running `model.fit()`.
-
-    Returns:
-      If the training is recovering from previous failure under multi-worker
-      training setting, return the epoch the training is supposed to continue
-      at. Otherwise, return the `initial_epoch` the user passes in.
+    This class provides APIs for backing up and restoring the training state.
+    It allows model, epoch, and batch information to be saved periodically
+    and restored for fault tolerance (also known as preemption recovery).
     """
-    epoch = backend.eval(self._ckpt_saved_epoch)
-    if mode == mode_keys.ModeKeys.TRAIN and epoch >= 0:
-      # The most recently saved epoch is one epoch prior to the epoch it
-      # failed at, so return the value of 'self._ckpt_saved_epoch' plus one.
-      return epoch + 1
-    return initial_epoch
+    # Constant for `tf.keras.Model` attribute to store the epoch and batch
+    # at which the most recently saved checkpoint was saved.
+    CKPT_SAVED_EPOCH_UNUSED_VALUE = -1
+
+    CKPT_SAVED_BATCH_UNUSED_VALUE = -1
+
+    def __init__(
+        self,
+        model,
+        checkpoint_dir,
+        save_freq="epoch",
+        save_before_preemption_arg=None,
+    ):
+        self._enable_save_before_preemption = save_before_preemption_arg and (
+            support_on_demand_checkpoint_callback(model.distribute_strategy)
+        )
+        self._model = model
+
+        self._save_freq = save_freq
+        # The batch and epoch at which the checkpoint is saved. Used for
+        # fault tolerance. GPU devices only have an int64-dtype VarHandleOp
+        # registered.
+        self._ckpt_saved_epoch = tf.Variable(
+            initial_value=tf.constant(
+                self.CKPT_SAVED_EPOCH_UNUSED_VALUE, dtype=tf.int64
+            ),
+            name="ckpt_saved_epoch",
+        )
+        self._ckpt_saved_batch = tf.Variable(
+            initial_value=tf.constant(
+                self.CKPT_SAVED_BATCH_UNUSED_VALUE, dtype=tf.int64
+            ),
+            name="ckpt_saved_batch",
+        )
+        # Variable initialization.
+        backend.set_value(
+            self._ckpt_saved_epoch, self.CKPT_SAVED_EPOCH_UNUSED_VALUE
+        )
+        backend.set_value(
+            self._ckpt_saved_batch, self.CKPT_SAVED_BATCH_UNUSED_VALUE
+        )
+        # _ckpt_saved_epoch and _ckpt_saved_batch get tracked and are included
+        # in the checkpoint file when backing up.
+        checkpoint = tf.train.Checkpoint(
+            model=self._model,
+            ckpt_saved_epoch=self._ckpt_saved_epoch,
+            ckpt_saved_batch=self._ckpt_saved_batch,
+            train_counter=self._model._train_counter,
+        )
+
+        # If this is single-worker training, the checkpoint_dir is the same
+        # for write_checkpoint_manager and read_checkpoint_manager.
+        #
+        # If this is multi-worker training, and this worker should not save
+        # checkpoint, we replace the write_checkpoint_manager's checkpoint_dir
+        # with a temp filepath, so it writes to a file that will be removed at
+        # the end of back_up() call. This is necessary because the
+        # SyncOnReadVariable needs to be synced across all the workers in order
+        # to be read, and all workers need to perform `save()`. But all workers
+        # should restore from the same checkpoint_dir as passed in
+        # read_checkpoint_manager.
+        self.read_checkpoint_manager = tf.train.CheckpointManager(
+            checkpoint,
+            directory=os.path.join(checkpoint_dir, "chief"),
+            max_to_keep=MAX_CHECKPOINT_TO_KEEP,
+        )
+        write_checkpoint_dir = distributed_file_utils.write_dirpath(
+            checkpoint_dir, self._model.distribute_strategy
+        )
+        if self._model.distribute_strategy.extended.should_checkpoint:
+            self.write_checkpoint_manager = self.read_checkpoint_manager
+        else:
+            self.write_checkpoint_manager = tf.train.CheckpointManager(
+                checkpoint,
+                directory=write_checkpoint_dir,
+                max_to_keep=MAX_CHECKPOINT_TO_KEEP,
+            )
+
+        if self._enable_save_before_preemption:
+            self.preemption_handler = (
+                tf.distribute.experimental.PreemptionCheckpointHandler(
+                    self._model.distribute_strategy.cluster_resolver,
+                    self.write_checkpoint_manager,
+                )
+            )
+            self.preemption_handler._read_checkpoint_manager = (
+                self.read_checkpoint_manager
+            )
+            self._model._preemption_handler = self.preemption_handler
+
+    def back_up(self, epoch, batch=0):
+        """Back up the current state of training into a checkpoint file.
+
+        Args:
+            epoch: The current epoch information to be saved.
+            batch: The current batch (step) information to be saved.
+        """
+        # Save the model plus CKPT_SAVED_EPOCH and CKPT_SAVED_BATCH variables.
+        if self.write_checkpoint_manager.save():
+            distributed_file_utils.remove_temp_dirpath(
+                self.write_checkpoint_manager.directory,
+                self._model.distribute_strategy,
+            )
+
+    def backup_if_preempted(self):
+        if self._enable_save_before_preemption:
+            self.preemption_handler._run_counter += 1
+            self.preemption_handler._check_preemption_and_maybe_checkpoint()
+
+    def restore(self):
+        """Restore the training state from the backed up checkpoint file.
+
+        Returns:
+            True if the training state is successfully restored. False if the
+            training state doesn't need to be restored, or error occurred so it
+            can't.
+        """
+        # When creating the PreemptionCheckpointHandler object, we have already
+        # restored the checkpoint.
+        if not self._enable_save_before_preemption:
+            self.read_checkpoint_manager.restore_or_initialize()
+
+    def delete_backup(self):
+        """Delete the backup directories.
+
+        Delete the backup directories which should not exist after `fit()`
+        successfully finishes.
+        """
+        if self.write_checkpoint_manager is self.read_checkpoint_manager:
+            try:
+                tf.io.gfile.rmtree(self.write_checkpoint_manager.directory)
+            except tf.errors.NotFoundError:
+                pass
+
+    def maybe_load_initial_counters_from_ckpt(
+        self, steps_per_epoch, initial_epoch, mode
+    ):
+        """Maybe load initial epoch and step from checkpoint for recovery.
+
+        When the `_ckpt_saved_epoch` attribute exists and is not
+        `CKPT_SAVED_EPOCH_UNUSED_VALUE`, this is under a multi-worker training
+        setting and indicates the worker is recovering from a previous
+        failure. In this case, infer `initial_epoch` from
+        `self._ckpt_saved_epoch` to continue the previous unfinished training
+        from a certain epoch.
+
+        Args:
+            steps_per_epoch: The number of steps per epoch.
+            initial_epoch: The original initial_epoch the user passed in to
+                `fit()`.
+            mode: The mode for running `model.fit()`.
+
+        Returns:
+            If the training is recovering from a previous failure under a
+            multi-worker training setting, return the `(epoch, step)` pair at
+            which training should continue. Otherwise, return the
+            `(initial_epoch, initial_step)` the user passed in.
+        """
+
+        initial_step = 0
+        epoch = backend.eval(self._ckpt_saved_epoch)
+        batch = backend.eval(self._ckpt_saved_batch)
+        if mode == mode_keys.ModeKeys.TRAIN:
+            # For batch-level saving.
+            if self._enable_save_before_preemption or isinstance(
+                self._save_freq, int
+            ):
+                if batch >= 0:
+                    # If the checkpoint was last saved at the last batch of
+                    # the epoch, return the next epoch number and batch=0.
+                    if batch == steps_per_epoch - 1:
+                        initial_epoch = epoch + 1
+                        initial_step = 0
+                    else:
+                        # If the checkpoint was not saved at the last batch
+                        # of the epoch, return the same epoch and the next
+                        # batch number.
+                        initial_epoch = epoch
+                        initial_step = batch + 1
+            else:
+                if epoch >= 0:
+                    # The most recently saved epoch is one epoch prior to the
+                    # epoch it failed at, so return the value of
+                    # 'self._ckpt_saved_epoch' plus one.
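+                    # For example, a checkpoint holding epoch == 2 resumes
+                    # training from initial_epoch == 3.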
+ initial_epoch = epoch + 1 + + return (initial_epoch, initial_step) diff --git a/keras/distribute/worker_training_state_test.py b/keras/distribute/worker_training_state_test.py index b63f0525f043..c2d3cde468d2 100644 --- a/keras/distribute/worker_training_state_test.py +++ b/keras/distribute/worker_training_state_test.py @@ -14,40 +14,40 @@ # ============================================================================== """Tests of `worker_training_state.py` utilities.""" -import tensorflow.compat.v2 as tf - import os import sys +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import callbacks from keras.distribute import multi_worker_testing_utils -class ModelCheckpointTest(tf.test.TestCase, parameterized.TestCase): - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], - file_format=['h5', 'tf'], - save_weights_only=[True, False])) - def testCheckpointExists(self, file_format, save_weights_only): - train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(64, 2) - model = multi_worker_testing_utils.get_mnist_model((28, 28, 1)) - saving_dir = self.get_temp_dir() - saving_filepath = os.path.join(saving_dir, 'checkpoint.' + file_format) - callbacks_list = [ - callbacks.ModelCheckpoint( - filepath=saving_filepath, save_weights_only=save_weights_only) - ] - self.assertFalse(tf.io.gfile.exists(saving_filepath)) - model.fit(x=train_ds, epochs=2, steps_per_epoch=2, callbacks=callbacks_list) - tf_saved_model_exists = tf.io.gfile.exists(saving_filepath) - tf_weights_only_checkpoint_exists = tf.io.gfile.exists(saving_filepath + - '.index') - self.assertTrue(tf_saved_model_exists or tf_weights_only_checkpoint_exists) - - -if __name__ == '__main__': - with tf.compat.v1.test.mock.patch.object(sys, 'exit', os._exit): - tf.test.main() +class WorkerTrainingStateTest(tf.test.TestCase, parameterized.TestCase): + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["eager"]) + ) + def testCheckpointExists(self): + train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(64, 2) + model = multi_worker_testing_utils.get_mnist_model((28, 28, 1)) + saving_dir = self.get_temp_dir() + callbacks_list = [ + callbacks.BackupAndRestore( + backup_dir=saving_dir, delete_checkpoint=False + ) + ] + self.assertLen(tf.io.gfile.glob(os.path.join(saving_dir, "*")), 0) + model.fit( + x=train_ds, epochs=2, steps_per_epoch=2, callbacks=callbacks_list + ) + # By default worker_training_state only keeps the results from one + # checkpoint. Even though the test is expected to checkpoint twice, it + # only keeps the checkpoint files from the second checkpoint. + checkpoint_path = os.path.join(saving_dir, "chief", "ckpt-2.index") + self.assertLen(tf.io.gfile.glob(checkpoint_path), 1) + + +if __name__ == "__main__": + with tf.compat.v1.test.mock.patch.object(sys, "exit", os._exit): + tf.test.main() diff --git a/keras/dtensor/BUILD b/keras/dtensor/BUILD index 65b9d509b295..79716c1a3c4a 100644 --- a/keras/dtensor/BUILD +++ b/keras/dtensor/BUILD @@ -2,11 +2,22 @@ # Since DTensor is not a public API yet, all the DTensor related change # can't be exposed to public yet. 
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") +# copybara:uncomment_begin(google-only) +# load( +# "//third_party/tensorflow/dtensor:build_defs.bzl", +# "dtensor_test", +# ) +# copybara:uncomment_end + package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", + "//learning/brain/distribute/experimental/auto_distribute:__pkg__", + "//learning/brain/distribute/python:__subpackages__", "//learning/brain/experimental/dtensor/models:__subpackages__", ], licenses = ["notice"], @@ -15,6 +26,9 @@ package( py_library( name = "dtensor", srcs = ["__init__.py"], + deps = [ + "//:expect_tensorflow_installed", + ], ) tf_py_test( @@ -65,12 +79,13 @@ tf_py_test( deps = [ ":dtensor", ":layout_map", + ":test_util", "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras:backend", "//keras/layers", + "//keras/models", "//keras/utils:tf_utils", - "//learning/brain/experimental/dtensor/tests:test_util", ], ) @@ -105,42 +120,45 @@ tf_py_test( ], ) -tf_py_test( - name = "mnist_model_test", - srcs = ["mnist_model_test.py"], - tags = [ - "requires-net:external", - ], - deps = [ - ":integration_test_utils", - ":optimizers", - ":test_util", - "//:expect_numpy_installed", - "//:expect_tensorflow_installed", - "//keras/utils:tf_utils", - ], -) - -py_library( - name = "optimizers", - srcs = ["optimizers.py"], - deps = [ - ":dtensor", - "//:expect_tensorflow_installed", - "//keras/optimizers/optimizer_experimental:optimizer", - "//keras/optimizers/schedules:learning_rate_schedule", - ], -) +# copybara:uncomment_begin(google-only) +# dtensor_test( +# name = "mnist_model_test", +# srcs = ["mnist_model_test.py"], +# env = { +# "CUDA_MODULE_LOADING": "LAZY", +# "TF_GPU_ALLOCATOR": "cuda_malloc_async", +# }, +# tags = [ +# "no_oss", +# "requires-net:external", +# ], +# deps = [ +# ":dtensor", +# ":integration_test_utils", +# ":layout_map", +# ":test_util", +# "//keras:backend", +# "//keras/optimizers", +# "//keras/utils:tf_utils", +# "//:expect_numpy_installed", +# "//:expect_tensorflow_installed", +# ], +# ) +# copybara:uncomment_end tf_py_test( name = "optimizers_test", srcs = ["optimizers_test.py"], deps = [ ":dtensor", - ":optimizers", + ":layout_map", ":test_util", "//:expect_numpy_installed", "//:expect_tensorflow_installed", + "//keras:losses", + "//keras/layers", + "//keras/models", + "//keras/optimizers", ], ) @@ -184,3 +202,44 @@ py_library( "//:expect_tensorflow_installed", ], ) + +tf_py_test( + name = "save_load_test", + srcs = ["save_load_test.py"], + deps = [ + ":dtensor", + ":layout_map", + ":test_util", + "//keras", + "//keras:backend", + "//keras/layers", + "//keras/models", + "//keras/utils:tf_utils", + ], +) + +# copybara:uncomment_begin(google-only) +# dtensor_test( +# name = "strategy_integration_test", +# srcs = ["strategy_integration_test.py"], +# shard_count = { +# "CPU": 2, +# "GPU": 4, +# "TPU": 2, +# }, +# tags = ["no_oss"], +# deps = [ +# ":integration_test_utils", +# ":test_util", +# "//:expect_absl_installed", +# "//keras:backend", +# "//keras/mixed_precision:mixed_precision_experimental", +# "//keras/optimizers", +# "//keras/utils:tf_utils", +# "//:expect_numpy_installed", +# "//:expect_tensorflow_installed", +# "//third_party/tensorflow/dtensor/python/tests:test_util", +# "//third_party/tensorflow/python/distribute/experimental:mirrored_strategy", +# ], +# ) +# copybara:uncomment_end diff --git a/keras/dtensor/__init__.py b/keras/dtensor/__init__.py index 
ec4357740cc4..59a004592af3 100644 --- a/keras/dtensor/__init__.py +++ b/keras/dtensor/__init__.py @@ -14,13 +14,5 @@ # ============================================================================== """Keras' DTensor library.""" -_DTENSOR_API_ENABLED = True - -# Conditional import the dtensor API, since it is currently broken in OSS. -if _DTENSOR_API_ENABLED: - from tensorflow.compat.v2.experimental import dtensor as dtensor_api # pylint: disable=g-import-not-at-top -else: - # Leave it with a placeholder, so that the import line from other python file - # will not break. - dtensor_api = None +from tensorflow.compat.v2.experimental import dtensor as dtensor_api diff --git a/keras/dtensor/initializers_test.py b/keras/dtensor/initializers_test.py index d2c47f8bca81..11d97fca2895 100644 --- a/keras/dtensor/initializers_test.py +++ b/keras/dtensor/initializers_test.py @@ -14,142 +14,149 @@ # ============================================================================== """Tests for initializers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import backend from keras import initializers from keras.dtensor import dtensor_api as dtensor from keras.dtensor import test_util from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf class InitializersTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) - def setUp(self): - super().setUp() - global_ids = test_util.create_device_ids_array((2, 2)) - local_device_ids = np.ravel(global_ids).tolist() - mesh_dict = { - 'CPU': - dtensor.Mesh(['X', 'Y'], global_ids, local_device_ids, - test_util.create_device_list((2, 2), 'CPU')) - } - self.mesh = self.configTestMesh(mesh_dict) - - @parameterized.named_parameters( - ('Zeros', initializers.Zeros, {}), - ('Ones', initializers.Ones, {}), - ('Constant', initializers.Constant, {'value': 3.}), - # TODO(b/222160686): Add Identity after after we have SPMD support for - # tf.MatrixDiagV3 - # ('Identity', initializers.Identity, {}), - ) - def test_static_value_initializer(self, initializer_cls, init_args): - layout = dtensor.Layout([dtensor.UNSHARDED, dtensor.UNSHARDED], self.mesh) - shape = (4, 4) - initializer = initializer_cls(**init_args) - value = initializer(shape=shape, layout=layout) - normal_tensor_value = initializer(shape=shape) + @parameterized.named_parameters( + ("Zeros", initializers.Zeros, {}), + ("Ones", initializers.Ones, {}), + ("Constant", initializers.Constant, {"value": 3.0}), + # TODO(b/222160686): Add Identity after after we have SPMD support for + # tf.MatrixDiagV3 + # ('Identity', initializers.Identity, {}), + ) + def test_static_value_initializer(self, initializer_cls, init_args): + layout = dtensor.Layout( + [dtensor.UNSHARDED, dtensor.UNSHARDED], self.mesh + ) + shape = (4, 4) + initializer = initializer_cls(**init_args) + value = initializer(shape=shape, layout=layout) + normal_tensor_value = initializer(shape=shape) - self.assertEqual(value.shape, shape) - fetched_layout = dtensor.fetch_layout(value) - self.assertEqual(layout, fetched_layout) + self.assertEqual(value.shape, shape) + fetched_layout = dtensor.fetch_layout(value) + self.assertEqual(layout, fetched_layout) - 
self.assertAllClose(value, normal_tensor_value) + self.assertAllClose(value, normal_tensor_value) - @parameterized.named_parameters( - ('RandomUniform', initializers.RandomUniform, {}), - ('RandomUniform_seeded', initializers.RandomUniform, {'seed': 1}), - ('RandomNormal', initializers.RandomNormal, {}), - ('RandomNormal_seeded', initializers.RandomNormal, {'seed': 1}), - ('TruncatedNormal', initializers.TruncatedNormal, {}), - ('TruncatedNormal_seeded', initializers.TruncatedNormal, {'seed': 1}), - ('Orthogonal', initializers.Orthogonal, {}), - ('Orthogonal_seeded', initializers.Orthogonal, {'seed': 1}), - ('VarianceScaling', initializers.VarianceScaling, {}), - ('VarianceScaling_seeded', initializers.VarianceScaling, {'seed': 1}), - ('GlorotUniform', initializers.GlorotUniform, {}), - ('GlorotUniform_seeded', initializers.GlorotUniform, {'seed': 1}), - ('GlorotNormal', initializers.GlorotNormal, {}), - ('GlorotNormal_seeded', initializers.GlorotNormal, {'seed': 1}), - ('LecunNormal', initializers.LecunNormal, {}), - ('LecunNormal_seeded', initializers.LecunNormal, {'seed': 1}), - ('LecunUniform', initializers.LecunUniform, {}), - ('LecunUniform_seeded', initializers.LecunUniform, {'seed': 1}), - ('HeNormal', initializers.HeNormal, {}), - ('HeNormal_seeded', initializers.HeNormal, {'seed': 1}), - ('HeUniform', initializers.HeUniform, {}), - ('HeUniform_seeded', initializers.HeUniform, {'seed': 1}), - ) - def test_random_value_initializer(self, initializer_cls, init_args): - layout = dtensor.Layout([dtensor.UNSHARDED, dtensor.UNSHARDED], self.mesh) - shape = (4, 4) - initializer = initializer_cls(**init_args) - # Make sure to raise error when keras global seed is not set. - with self.assertRaisesRegex(ValueError, 'set the global seed'): - initializer(shape=shape, layout=layout) + @parameterized.named_parameters( + ("RandomUniform", initializers.RandomUniform, {}), + ("RandomUniform_seeded", initializers.RandomUniform, {"seed": 1}), + ("RandomNormal", initializers.RandomNormal, {}), + ("RandomNormal_seeded", initializers.RandomNormal, {"seed": 1}), + ("TruncatedNormal", initializers.TruncatedNormal, {}), + ("TruncatedNormal_seeded", initializers.TruncatedNormal, {"seed": 1}), + ("Orthogonal", initializers.Orthogonal, {}), + ("Orthogonal_seeded", initializers.Orthogonal, {"seed": 1}), + ("VarianceScaling", initializers.VarianceScaling, {}), + ("VarianceScaling_seeded", initializers.VarianceScaling, {"seed": 1}), + ("GlorotUniform", initializers.GlorotUniform, {}), + ("GlorotUniform_seeded", initializers.GlorotUniform, {"seed": 1}), + ("GlorotNormal", initializers.GlorotNormal, {}), + ("GlorotNormal_seeded", initializers.GlorotNormal, {"seed": 1}), + ("LecunNormal", initializers.LecunNormal, {}), + ("LecunNormal_seeded", initializers.LecunNormal, {"seed": 1}), + ("LecunUniform", initializers.LecunUniform, {}), + ("LecunUniform_seeded", initializers.LecunUniform, {"seed": 1}), + ("HeNormal", initializers.HeNormal, {}), + ("HeNormal_seeded", initializers.HeNormal, {"seed": 1}), + ("HeUniform", initializers.HeUniform, {}), + ("HeUniform_seeded", initializers.HeUniform, {"seed": 1}), + ) + def test_random_value_initializer(self, initializer_cls, init_args): + layout = dtensor.Layout( + [dtensor.UNSHARDED, dtensor.UNSHARDED], self.mesh + ) + shape = (4, 4) + initializer = initializer_cls(**init_args) + # Make sure to raise error when keras global seed is not set. 
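+        # (DTensor initializers draw from a seeded, deterministic generator
+        # so that every device in the mesh produces identical replicated
+        # values; without a global seed this cannot be guaranteed.)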
+ with self.assertRaisesRegex(ValueError, "set the global seed"): + initializer(shape=shape, layout=layout) - try: - tf_utils.set_random_seed(1337) - value = initializer(shape=shape, layout=layout) - self.assertEqual(value.shape, shape) - fetched_layout = dtensor.fetch_layout(value) - self.assertEqual(layout, fetched_layout) + try: + tf_utils.set_random_seed(1337) + value = initializer(shape=shape, layout=layout) + self.assertEqual(value.shape, shape) + fetched_layout = dtensor.fetch_layout(value) + self.assertEqual(layout, fetched_layout) - # Make sure when same seed is set again, the new initializer should - # generate same result - tf_utils.set_random_seed(1337) - initializer = initializer_cls(**init_args) - new_value = initializer(shape=shape, layout=layout) - self.assertAllClose(value, new_value) - finally: - # Unset the keras global generator so that it doesn't affect other tests - # that need to verify the existence of global generator. - backend._SEED_GENERATOR.generator = None + # Make sure when same seed is set again, the new initializer should + # generate same result + tf_utils.set_random_seed(1337) + initializer = initializer_cls(**init_args) + new_value = initializer(shape=shape, layout=layout) + self.assertAllClose(value, new_value) + finally: + # Unset the keras global generator so that it doesn't affect other + # tests that need to verify the existence of global generator. + backend._SEED_GENERATOR.generator = None - @parameterized.named_parameters( - ('zeros', 'zeros', initializers.Zeros), - ('Zeros', 'Zeros', initializers.Zeros), - ('ones', 'ones', initializers.Ones), - ('Ones', 'Ones', initializers.Ones), - ('constant', 'constant', initializers.Constant), - ('Constant', 'Constant', initializers.Constant), - ('random_uniform', 'random_uniform', initializers.RandomUniform), - ('RandomUniform', 'RandomUniform', initializers.RandomUniform), - ('random_normal', 'random_normal', initializers.RandomNormal), - ('RandomNormal', 'RandomNormal', initializers.RandomNormal), - ('truncated_normal', 'truncated_normal', initializers.TruncatedNormal), - ('TruncatedNormal', 'TruncatedNormal', initializers.TruncatedNormal), - ('Identity', 'Identity', initializers.Identity), - ('identity', 'identity', initializers.Identity), - ('Orthogonal', 'Orthogonal', initializers.Orthogonal), - ('orthogonal', 'orthogonal', initializers.Orthogonal), - ('variance_scaling', 'variance_scaling', initializers.VarianceScaling), - ('VarianceScaling', 'VarianceScaling', initializers.VarianceScaling), - ('glorot_uniform', 'glorot_uniform', initializers.GlorotUniform), - ('GlorotUniform', 'GlorotUniform', initializers.GlorotUniform), - ('glorot_normal', 'glorot_normal', initializers.GlorotNormal), - ('GlorotNormal', 'GlorotNormal', initializers.GlorotNormal), - ('lecun_normal', 'lecun_normal', initializers.LecunNormal), - ('LecunNormal', 'LecunNormal', initializers.LecunNormal), - ('lecun_uniform', 'lecun_uniform', initializers.LecunUniform), - ('LecunUniform', 'LecunUniform', initializers.LecunUniform), - ('he_normal', 'he_normal', initializers.HeNormal), - ('HeNormal', 'HeNormal', initializers.HeNormal), - ('he_uniform', 'he_uniform', initializers.HeUniform), - ('HeUniform', 'HeUniform', initializers.HeUniform), - ) - def test_serialization_deserialization(self, cls_name, expected_cls): - initializer = initializers.get(cls_name) - self.assertIsInstance(initializer, expected_cls) + @parameterized.named_parameters( + ("zeros", "zeros", initializers.Zeros), + ("Zeros", "Zeros", initializers.Zeros), + ("ones", 
"ones", initializers.Ones), + ("Ones", "Ones", initializers.Ones), + ("constant", "constant", initializers.Constant), + ("Constant", "Constant", initializers.Constant), + ("random_uniform", "random_uniform", initializers.RandomUniform), + ("RandomUniform", "RandomUniform", initializers.RandomUniform), + ("random_normal", "random_normal", initializers.RandomNormal), + ("RandomNormal", "RandomNormal", initializers.RandomNormal), + ("truncated_normal", "truncated_normal", initializers.TruncatedNormal), + ("TruncatedNormal", "TruncatedNormal", initializers.TruncatedNormal), + ("Identity", "Identity", initializers.Identity), + ("identity", "identity", initializers.Identity), + ("Orthogonal", "Orthogonal", initializers.Orthogonal), + ("orthogonal", "orthogonal", initializers.Orthogonal), + ("variance_scaling", "variance_scaling", initializers.VarianceScaling), + ("VarianceScaling", "VarianceScaling", initializers.VarianceScaling), + ("glorot_uniform", "glorot_uniform", initializers.GlorotUniform), + ("GlorotUniform", "GlorotUniform", initializers.GlorotUniform), + ("glorot_normal", "glorot_normal", initializers.GlorotNormal), + ("GlorotNormal", "GlorotNormal", initializers.GlorotNormal), + ("lecun_normal", "lecun_normal", initializers.LecunNormal), + ("LecunNormal", "LecunNormal", initializers.LecunNormal), + ("lecun_uniform", "lecun_uniform", initializers.LecunUniform), + ("LecunUniform", "LecunUniform", initializers.LecunUniform), + ("he_normal", "he_normal", initializers.HeNormal), + ("HeNormal", "HeNormal", initializers.HeNormal), + ("he_uniform", "he_uniform", initializers.HeUniform), + ("HeUniform", "HeUniform", initializers.HeUniform), + ) + def test_serialization_deserialization(self, cls_name, expected_cls): + initializer = initializers.get(cls_name) + self.assertIsInstance(initializer, expected_cls) - config = initializers.serialize(initializer) - recreated = initializers.deserialize(config) + config = initializers.serialize(initializer) + recreated = initializers.deserialize(config) - self.assertIsInstance(recreated, expected_cls) - self.assertEqual(config, initializers.serialize(recreated)) + self.assertIsInstance(recreated, expected_cls) + self.assertEqual(config, initializers.serialize(recreated)) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/integration_test_utils.py b/keras/dtensor/integration_test_utils.py index e8a69e8d8df4..3db7cc00d428 100644 --- a/keras/dtensor/integration_test_utils.py +++ b/keras/dtensor/integration_test_utils.py @@ -20,7 +20,10 @@ """ +import numpy as np +import tensorflow.compat.v2 as tf from absl import logging + from keras import layers from keras import losses from keras import models @@ -28,119 +31,136 @@ from keras.dtensor import dtensor_api as dtensor from keras.dtensor import layout_map as layout_map_lib from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf - -# pylint: disable=missing-function-docstring NUM_CLASS = 10 # MNIST has 10 digits def get_model_with_layout_map(layout_map): - """Builds a Sequential CNN model to recognize MNIST digits. + """Builds a Sequential CNN model to recognize MNIST digits. + + Args: + layout_map: dict of string name -> Layout, for weights creation. + + Returns: + a CNN Keras model used for MNIST + """ - Args: - layout_map: dict of string name -> Layout, for weights creation. + with layout_map_lib.layout_map_scope(layout_map): + # Define a CNN model to recognize MNIST digits. 
+ return get_model() - Returns: - a CNN Keras model used for MNIST - """ - with layout_map_lib.layout_map_scope(layout_map): - # Define a CNN model to recognize MNIST digits. +def get_model(): + """Builds a Sequential CNN model to recognize MNIST digits.""" model = models.Sequential() model.add( layers.Conv2D( 32, - name='conv2d_1', + name="conv2d_1", kernel_size=(3, 3), - activation='relu', - input_shape=(28, 28, 1), # channel last gray scale input - )) - model.add(layers.Conv2D( - 64, - name='conv2d_2', - kernel_size=(3, 3), - activation='relu', - )) + activation="relu", + input_shape=(28, 28, 1), # channel last gray scale input + ) + ) + model.add( + layers.Conv2D( + 64, + name="conv2d_2", + kernel_size=(3, 3), + activation="relu", + ) + ) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Dropout(0.25)) model.add(layers.Flatten()) - model.add(layers.Dense( - 128, - name='dense_1', - activation='relu', - )) + model.add( + layers.Dense( + 128, + name="dense_1", + activation="relu", + ) + ) model.add(layers.Dropout(0.5)) - model.add(layers.Dense( - NUM_CLASS, - name='dense_2', - activation='softmax', - )) + model.add( + layers.Dense( + NUM_CLASS, + name="dense_2", + activation="softmax", + ) + ) return model def get_all_replicated_layout_map(mesh): - layout_map = layout_map_lib.LayoutMap(mesh=mesh) + layout_map = layout_map_lib.LayoutMap(mesh=mesh) - layout_4d = dtensor.Layout.replicated(mesh, rank=4) - layout_2d = dtensor.Layout.replicated(mesh, rank=2) - layout_1d = dtensor.Layout.replicated(mesh, rank=1) + layout_4d = dtensor.Layout.replicated(mesh, rank=4) + layout_2d = dtensor.Layout.replicated(mesh, rank=2) + layout_1d = dtensor.Layout.replicated(mesh, rank=1) - layout_map['conv2d.*kernel'] = layout_4d - layout_map['conv2d.*bias'] = layout_1d - layout_map['dense.*kernel'] = layout_2d - layout_map['dense.*bias'] = layout_1d + layout_map["conv2d.*kernel"] = layout_4d + layout_map["conv2d.*bias"] = layout_1d + layout_map["dense.*kernel"] = layout_2d + layout_map["dense.*bias"] = layout_1d - return layout_map + return layout_map def get_mnist_datasets(num_class, batch_size): - (x_train, y_train), (x_test, y_test) = mnist.load_data() + (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = np.expand_dims(x_train, axis=-1).astype('float32') - x_test = np.expand_dims(x_test, axis=-1).astype('float32') - x_train /= 255 # normalize to 0~1 - x_test /= 255 + x_train = np.expand_dims(x_train, axis=-1).astype("float32") + x_test = np.expand_dims(x_test, axis=-1).astype("float32") + x_train /= 255 # normalize to 0~1 + x_test /= 255 - y_train = np_utils.to_categorical(y_train, num_class) - y_test = np_utils.to_categorical(y_test, num_class) + y_train = np_utils.to_categorical(y_train, num_class) + y_test = np_utils.to_categorical(y_test, num_class) - train_ds = tf.data.Dataset.from_tensor_slices( - (x_train, y_train)).repeat().batch(batch_size, drop_remainder=True) - eval_ds = tf.data.Dataset.from_tensor_slices( - (x_test, y_test)).repeat().batch(batch_size, drop_remainder=True) + train_ds = ( + tf.data.Dataset.from_tensor_slices((x_train, y_train)) + .repeat() + .batch(batch_size, drop_remainder=True) + ) + eval_ds = ( + tf.data.Dataset.from_tensor_slices((x_test, y_test)) + .repeat() + .batch(batch_size, drop_remainder=True) + ) - return train_ds, eval_ds + return train_ds, eval_ds def train_mnist_model_batch_sharded( - model, optimizer, mesh, num_epochs, steps_per_epoch, global_batch_size): + model, optimizer, mesh, num_epochs, steps_per_epoch, global_batch_size +): 
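+    # This helper trains `model` by manually sharding each global batch:
+    # images and labels are split across the mesh's local devices and packed
+    # into DTensors with batch-sharded layouts via `dtensor.pack`.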
- dataset, _ = get_mnist_datasets(NUM_CLASS, global_batch_size) + dataset, _ = get_mnist_datasets(NUM_CLASS, global_batch_size) - input_image_layout = dtensor.Layout.batch_sharded(mesh, 'batch', rank=4) - input_label_layout = dtensor.Layout.batch_sharded(mesh, 'batch', rank=2) - loss_obj = losses.CategoricalCrossentropy() + input_image_layout = dtensor.Layout.batch_sharded(mesh, "batch", rank=4) + input_label_layout = dtensor.Layout.batch_sharded(mesh, "batch", rank=2) + loss_obj = losses.CategoricalCrossentropy() - num_local_devices = mesh.num_local_devices() - iterator = iter(dataset) - train_losses = [] - for epoch in range(num_epochs): - total_loss = 0.00 - for _ in range(steps_per_epoch): - images, labels = next(iterator) - images = tf.split(images, num_local_devices) - labels = tf.split(labels, num_local_devices) - d_images = dtensor.pack(images, input_image_layout) - d_labels = dtensor.pack(labels, input_label_layout) - total_loss += train_step(model, d_images, d_labels, loss_obj, optimizer) + num_local_devices = mesh.num_local_devices() + iterator = iter(dataset) + train_losses = [] + for epoch in range(num_epochs): + total_loss = 0.00 + for _ in range(steps_per_epoch): + images, labels = next(iterator) + images = tf.split(images, num_local_devices) + labels = tf.split(labels, num_local_devices) + d_images = dtensor.pack(images, input_image_layout) + d_labels = dtensor.pack(labels, input_label_layout) + total_loss += train_step( + model, d_images, d_labels, loss_obj, optimizer + ) - train_loss = tf.reduce_mean(total_loss / steps_per_epoch) + train_loss = tf.reduce_mean(total_loss / steps_per_epoch) - logging.info('Epoch %d, Loss: %f', epoch, train_loss) - train_losses.append(train_loss) - return train_losses + logging.info("Epoch %d, Loss: %f", epoch, train_loss) + train_losses.append(train_loss) + return train_losses # Change to use model.fit when dataset has the correct layout info populated @@ -148,12 +168,10 @@ def train_mnist_model_batch_sharded( @tf.function def train_step(model, feature, label, loss_obj, optimizer): - with tf.GradientTape() as tape: - predict = model(feature, training=True) - loss = loss_obj(label, predict) - - gradients = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - return loss - + with tf.GradientTape() as tape: + predict = model(feature, training=True) + loss = loss_obj(label, predict) + gradients = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(gradients, model.trainable_variables)) + return loss diff --git a/keras/dtensor/layers_test.py b/keras/dtensor/layers_test.py index 11b83f6a557c..5efc2b7a8f26 100644 --- a/keras/dtensor/layers_test.py +++ b/keras/dtensor/layers_test.py @@ -14,87 +14,142 @@ # ============================================================================== """Tests for layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import backend from keras import layers from keras.dtensor import dtensor_api as dtensor from keras.dtensor import test_util from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf class LayersTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + backend.enable_tf_random_generator() + tf_utils.set_random_seed(1337) + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + 
local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) - def setUp(self): - super().setUp() - backend.enable_tf_random_generator() - tf_utils.set_random_seed(1337) - global_ids = test_util.create_device_ids_array((2, 2)) - local_device_ids = np.ravel(global_ids).tolist() - mesh_dict = { - 'CPU': - dtensor.Mesh(['X', 'Y'], global_ids, - local_device_ids, - test_util.create_device_list((2, 2), 'CPU')) - } - self.mesh = self.configTestMesh(mesh_dict) + @parameterized.named_parameters( + ( + "dense", + layers.Dense, + {"units": 4}, + {"kernel": 2, "bias": 1}, + [10, 8], + ), + # TODO(b/224861663): Enable this test. + # ('embedding', layers.Embedding, {'input_dim': 100, 'output_dim': 32}, + # {'embeddings': 2}, [10,], np.int32), + ( + "conv1d", + layers.Conv1D, + {"filters": 4, "kernel_size": 3}, + {"kernel": 3, "bias": 1}, + [10, 28, 3], + ), + ( + "conv1d_transpose", + layers.Conv1DTranspose, + {"filters": 4, "kernel_size": 3}, + {"kernel": 3, "bias": 1}, + [10, 28, 3], + ), + ( + "conv2d", + layers.Conv2D, + {"filters": 4, "kernel_size": (3, 3)}, + {"kernel": 4, "bias": 1}, + [10, 28, 28, 3], + ), + ( + "conv2d_transpose", + layers.Conv2DTranspose, + {"filters": 4, "kernel_size": (3, 3)}, + {"kernel": 4, "bias": 1}, + [10, 28, 28, 3], + ), + ( + "conv3d", + layers.Conv3D, + {"filters": 4, "kernel_size": (3, 3, 3)}, + {"kernel": 5, "bias": 1}, + [10, 28, 28, 28, 3], + ), + # TODO(b/224862394): Add support for tf.Conv3DBackpropInputV2 + # ('conv3dtranspose', layers.Conv3DTranspose, + # {'filters': 4, 'kernel_size': (3, 3, 3)}, + # {'kernel': 5, 'bias': 1}, [10, 28, 28, 28, 3]), + ( + "batch_norm", + layers.BatchNormalization, + {"fused": False}, + {"beta": 1, "gamma": 1, "moving_mean": 1, "moving_variance": 1}, + [10, 28, 28, 3], + ), + ( + "layer_norm", + layers.LayerNormalization, + {"dtype": tf.float64}, + {"beta": 1, "gamma": 1}, + [10, 28, 28, 3], + ), + ) + def test_layer( + self, + layer_cls, + init_args, + variable_settings, + input_shape, + input_dtype=np.float32, + ): + args_with_layout = init_args.copy() + for variable_name, variable_rank in variable_settings.items(): + args_with_layout[ + variable_name + "_layout" + ] = dtensor.Layout.replicated(self.mesh, variable_rank) - @parameterized.named_parameters( - ('dense', layers.Dense, {'units': 4}, {'kernel': 2, 'bias': 1}, [10, 8]), - # TODO(b/224861663): Enable this test. 
- # ('embedding', layers.Embedding, {'input_dim': 100, 'output_dim': 32}, - # {'embeddings': 2}, [10,], np.int32), - ('conv1d', layers.Conv1D, {'filters': 4, 'kernel_size': 3}, - {'kernel': 3, 'bias': 1}, [10, 28, 3]), - ('conv1d_transpose', layers.Conv1DTranspose, - {'filters': 4, 'kernel_size': 3}, {'kernel': 3, 'bias': 1}, [10, 28, 3]), - ('conv2d', layers.Conv2D, {'filters': 4, 'kernel_size': (3, 3)}, - {'kernel': 4, 'bias': 1}, [10, 28, 28, 3]), - ('conv2d_transpose', layers.Conv2DTranspose, - {'filters': 4, 'kernel_size': (3, 3)}, - {'kernel': 4, 'bias': 1}, [10, 28, 28, 3]), - ('conv3d', layers.Conv3D, {'filters': 4, 'kernel_size': (3, 3, 3)}, - {'kernel': 5, 'bias': 1}, [10, 28, 28, 28, 3]), - # TODO(b/224862394): Add support for tf.Conv3DBackpropInputV2 - # ('conv3dtranspose', layers.Conv3DTranspose, - # {'filters': 4, 'kernel_size': (3, 3, 3)}, - # {'kernel': 5, 'bias': 1}, [10, 28, 28, 28, 3]), - ('batch_norm', layers.BatchNormalization, {'fused': False}, - {'beta': 1, 'gamma': 1, 'moving_mean': 1, 'moving_variance': 1}, - [10, 28, 28, 3]), - ('layer_norm', layers.LayerNormalization, {'dtype': tf.float64}, - {'beta': 1, 'gamma': 1}, [10, 28, 28, 3]) - ) - def test_layer(self, layer_cls, init_args, variable_settings, input_shape, - input_dtype=np.float32): - args_with_layout = init_args.copy() - for variable_name, variable_rank in variable_settings.items(): - args_with_layout[variable_name + '_layout'] = dtensor.Layout.replicated( - self.mesh, variable_rank) + layer = layer_cls(**args_with_layout) + # inputs = np.random.random(input_shape) + inputs = np.random.randn(*input_shape).astype(input_dtype) + d_inputs = dtensor.copy_to_mesh( + inputs, dtensor.Layout.replicated(self.mesh, len(input_shape)) + ) + d_output = layer(d_inputs) - layer = layer_cls(**args_with_layout) - # inputs = np.random.random(input_shape) - inputs = np.random.randn(*input_shape).astype(input_dtype) - d_inputs = dtensor.copy_to_mesh( - inputs, dtensor.Layout.replicated(self.mesh, len(input_shape))) - d_output = layer(d_inputs) + for variable_name, variable_rank in variable_settings.items(): + self.assertIsInstance( + getattr(layer, variable_name), dtensor.DVariable + ) - for variable_name, variable_rank in variable_settings.items(): - self.assertIsInstance(getattr(layer, variable_name), dtensor.DVariable) + expected_layout = dtensor.Layout.replicated( + self.mesh, d_output.shape.rank + ) + self.assertEqual(dtensor.fetch_layout(d_output), expected_layout) - expected_layout = dtensor.Layout.replicated(self.mesh, d_output.shape.rank) - self.assertEqual(dtensor.fetch_layout(d_output), expected_layout) + # Make sure to produce same output when layout is not used + tf_utils.set_random_seed(1337) + layer_2 = layer_cls(**init_args) + output = layer_2(inputs) + self.assertAllClose(d_output, output) - # Make sure to produce same output when layout is not used - tf_utils.set_random_seed(1337) - layer_2 = layer_cls(**init_args) - output = layer_2(inputs) - self.assertAllClose(d_output, output) + for variable_name, variable_rank in variable_settings.items(): + self.assertNotIsInstance( + getattr(layer_2, variable_name), dtensor.DVariable + ) - for variable_name, variable_rank in variable_settings.items(): - self.assertNotIsInstance(getattr(layer_2, variable_name), - dtensor.DVariable) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/layout_map.py b/keras/dtensor/layout_map.py index 8b23ab79ac49..c7fd3407d533 100644 --- a/keras/dtensor/layout_map.py +++ 
b/keras/dtensor/layout_map.py @@ -19,433 +19,579 @@ import re import threading +import tensorflow.compat.v2 as tf + from keras.dtensor import dtensor_api as dtensor from keras.dtensor import lazy_variable from keras.dtensor import utils from keras.engine import base_layer +# isort: off from tensorflow.python.util.tf_export import keras_export -# pylint: disable=missing-class-docstring # We will skip the path for certain attributes when mapping the layout, e.g. # model._self_tracked_trackables, or layer._trainable_weights/ # _non_trainable_weights, etc. Those attributes are usually served as a cache, # and the actual variable should be in somewhere else. -_KERAS_ATTRIBUTES_TO_SKIP = ['_self_tracked_trackables', '_trainable_weights', - '_non_trainable_weights', - '_captured_weight_regularizer'] +_KERAS_ATTRIBUTES_TO_SKIP = [ + "_self_tracked_trackables", + "_trainable_weights", + "_non_trainable_weights", + "_captured_weight_regularizer", +] _LAYOUT_MAP = threading.local() def get_current_layout_map(): - return getattr(_LAYOUT_MAP, 'layout_map', None) + return getattr(_LAYOUT_MAP, "layout_map", None) -@keras_export('keras.dtensor.experimental.LayoutMap', v1=[]) +@keras_export("keras.dtensor.experimental.LayoutMap", v1=[]) class LayoutMap(collections.abc.MutableMapping): - """A dict-like object that maps string to `Layout` instances. - - `LayoutMap` uses a string as key and a `Layout` as value. There is a behavior - difference between a normal Python dict and this class. The string key will be - treated as a regex when retrieving the value. See the docstring of - `get` for more details. - - See below for a usage example. You can define the naming schema - of the `Layout`, and then retrieve the corresponding `Layout` instance. - - To use the `LayoutMap` with a `Model`, please see the docstring of - `tf.keras.dtensor.experimental.layout_map_scope`. - - ```python - map = LayoutMap(mesh=None) - map['.*dense.*kernel'] = layout_2d - map['.*dense.*bias'] = layout_1d - map['.*conv2d.*kernel'] = layout_4d - map['.*conv2d.*bias'] = layout_1d - - layout_1 = map['dense_1.kernel'] # layout_1 == layout_2d - layout_2 = map['dense_1.bias'] # layout_2 == layout_1d - layout_3 = map['dense_2.kernel'] # layout_3 == layout_2d - layout_4 = map['dense_2.bias'] # layout_4 == layout_1d - layout_5 = map['my_model/conv2d_123/kernel'] # layout_5 == layout_4d - layout_6 = map['my_model/conv2d_123/bias'] # layout_6 == layout_1d - ``` - - Args: - mesh: An optional `Mesh` that can be used to create all replicated - layout as default when there isn't a layout found based on the input - string query. - """ - - def __init__(self, mesh=None): - self._layout_map = collections.OrderedDict() - self._default_mesh = mesh - - def __getitem__(self, key): - """Retrieve the corresponding layout by the string key. - - When there isn't an exact match, all the existing keys in the layout map - will be treated as a regex and map against the input key again. The first - match will be returned, based on the key insertion order. Return None if - there isn't any match found. + """A dict-like object that maps string to `Layout` instances. + + `LayoutMap` uses a string as key and a `Layout` as value. There is a + behavior difference between a normal Python dict and this class. The string + key will be treated as a regex when retrieving the value. See the docstring + of `get` for more details. + + See below for a usage example. You can define the naming schema + of the `Layout`, and then retrieve the corresponding `Layout` instance. 
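The regex fallback in this lookup is worth seeing in isolation. Below is a minimal stand-alone sketch of the rule (assuming only the Python standard library; `SimpleLayoutMap` and the plain strings standing in for `Layout` values are hypothetical), mirroring the exact-match-then-regex behavior of `__getitem__` further down in this file:

```python
import collections
import re


class SimpleLayoutMap:
    """Hypothetical stand-in that mirrors LayoutMap's lookup order."""

    def __init__(self):
        # Insertion order matters: on a regex lookup, the first
        # inserted key that matches wins.
        self._layout_map = collections.OrderedDict()

    def __setitem__(self, key, layout):
        if key in self._layout_map:
            raise ValueError(f"{key} already exists in the map.")
        self._layout_map[key] = layout

    def __getitem__(self, key):
        # An exact match takes priority over regex matching.
        if key in self._layout_map:
            return self._layout_map[key]
        for k, layout in self._layout_map.items():
            if re.match(k, key):
                return layout
        return None


layout_map = SimpleLayoutMap()
layout_map["d1.kernel"] = "layout_2d"  # exact key
layout_map[".*bias"] = "layout_1d"  # regex fallback

assert layout_map["d1.kernel"] == "layout_2d"  # exact hit
assert layout_map["d2.bias"] == "layout_1d"  # regex hit
assert layout_map["d2.kernel"] is None  # no rule matches
```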
+
+    To use the `LayoutMap` with a `Model`, please see the docstring of
+    `tf.keras.dtensor.experimental.layout_map_scope`.
+
+    ```python
+    map = LayoutMap(mesh=None)
+    map['.*dense.*kernel'] = layout_2d
+    map['.*dense.*bias'] = layout_1d
+    map['.*conv2d.*kernel'] = layout_4d
+    map['.*conv2d.*bias'] = layout_1d
+
+    layout_1 = map['dense_1.kernel']  # layout_1 == layout_2d
+    layout_2 = map['dense_1.bias']  # layout_2 == layout_1d
+    layout_3 = map['dense_2.kernel']  # layout_3 == layout_2d
+    layout_4 = map['dense_2.bias']  # layout_4 == layout_1d
+    layout_5 = map['my_model/conv2d_123/kernel']  # layout_5 == layout_4d
+    layout_6 = map['my_model/conv2d_123/bias']  # layout_6 == layout_1d
+    ```

     Args:
-      key: the string key as the query for the layout.
-
-    Returns:
-      Corresponding layout based on the query.
+        mesh: An optional `Mesh` that can be used to create all replicated
+            layout as default when there isn't a layout found based on the
+            input string query.
     """
-    if key in self._layout_map:
-      return self._layout_map[key]
-
-    for k in self._layout_map:
-      if re.match(k, key):
-        return self._layout_map[k]
-    return None
-
-  def __setitem__(self, key, layout):
-    if key in self._layout_map:
-      raise ValueError(f'{key} already exist in the LayoutMap with '
-                       f'value {self._layout_map[key]}. Please make sure to '
-                       'not use duplicated keys.')
-    if not isinstance(layout, dtensor.Layout):
-      raise ValueError(f'{layout} should be a dtensor.Layout type, '
-                       f'got {type(layout)}')
-
-    self._layout_map[key] = layout
-
-  def __delitem__(self, key):
-    # let the dict to handle the key missing error
-    return self._layout_map.pop(key)
-
-  def __len__(self):
-    return len(self._layout_map)
-
-  def __iter__(self):
-    return iter(self._layout_map)
-
-  def get_default_mesh(self):
-    """Return the default `Mesh` set at instance creation.
-
-    The `Mesh` can be used to create default replicated `Layout` when there
-    isn't a match of the input string query.
-    """
-    return self._default_mesh
+    def __init__(self, mesh=None):
+        self._layout_map = collections.OrderedDict()
+        self._default_mesh = mesh
+
+    def __getitem__(self, key):
+        """Retrieve the corresponding layout by the string key.
+
+        When there isn't an exact match, all the existing keys in the layout
+        map are each treated as a regex and matched against the input key.
+        The first match is returned, based on the key insertion order.
+        Returns None if no match is found.
+
+        Args:
+            key: the string key as the query for the layout.
+
+        Returns:
+            Corresponding layout based on the query.
+        """
+        if key in self._layout_map:
+            return self._layout_map[key]
+
+        for k in self._layout_map:
+            if re.match(k, key):
+                return self._layout_map[k]
+        return None
+
+    def __setitem__(self, key, layout):
+        if key in self._layout_map:
+            raise ValueError(
+                f"{key} already exists in the LayoutMap with "
+                f"value {self._layout_map[key]}. Please make sure to "
+                "not use duplicated keys."
+            )
+        if not isinstance(layout, dtensor.Layout):
+            raise ValueError(
+                f"{layout} should be a dtensor.Layout type, got {type(layout)}"
+            )
+
+        self._layout_map[key] = layout
+
+    def __delitem__(self, key):
+        # let the dict handle the missing key error
+        return self._layout_map.pop(key)
+
+    def __len__(self):
+        return len(self._layout_map)
+
+    def __iter__(self):
+        return iter(self._layout_map)
+
+    def get_default_mesh(self):
+        """Return the default `Mesh` set at instance creation.
+
+        The `Mesh` can be used to create default replicated `Layout` when there
+        isn't a match of the input string query.
+        """
+        return self._default_mesh
+
+    def scope(self):
+        """Apply layout to all `tf.Variable` instances created under the scope.
+
+        All `tf.Variable` instances created under this scope
+        will be lazily initialized first. Once they are attached as the model
+        or layer attributes, and there is a stable layout mapping for them,
+        the variables will be reinitialized into a
+        `tf.experimental.dtensor.DVariable` with the corresponding layout.
+
+        Note that the layout mapping will use object/attribute names as the
+        keys to map the variable to the layout.
+
+        For subclassed models, the full object/attribute name is used as the
+        key. For Functional/Sequential models, we use `layer.name` as
+        the key for the layer, followed by the attribute name. Keras ensures
+        name uniqueness among the layers within a Functional/Sequential model.
+
+        See the following examples that show variable object names
+        for different Keras model types:
+
+        ```python
+        layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+        layout_map['d1.kernel'] = layout_1
+        layout_map['d1.bias'] = layout_2
+        layout_map['d2.kernel'] = layout_3
+        layout_map['d2.bias'] = layout_4
+
+        ## Subclassed model
+        class SubclassModel(tf.keras.Model):
+
+            def __init__(self, name=None):
+                super().__init__(name=name)
+                self.d1 = tf.keras.layers.Dense(1000)
+                self.d2 = tf.keras.layers.Dense(1000)
+
+            def call(self, inputs):
+                x = self.d1(inputs)
+                return self.d2(x)
+
+        with layout_map.scope():
+            model = SubclassModel()
+        inputs = tf.zeros((10, 10))
+        results = model(inputs)
+
+        model.d1.kernel.layout == layout_1
+        model.d1.bias.layout == layout_2
+        model.d2.kernel.layout == layout_3
+        model.d2.bias.layout == layout_4
+
+        ## Functional model
+        with layout_map.scope():
+            inputs = tf.keras.Input((10,), batch_size=10)
+            x = tf.keras.layers.Dense(20, name='d1')(inputs)
+            output = tf.keras.layers.Dense(30, name='d2')(x)
+
+            model = tf.keras.Model(inputs, output)
+
+        d1 = model.layers[1]
+        d2 = model.layers[2]
+
+        d1.kernel.layout == layout_1
+        d1.bias.layout == layout_2
+        d2.kernel.layout == layout_3
+        d2.bias.layout == layout_4
+
+        ## Sequential model
+        with layout_map.scope():
+            model = tf.keras.Sequential([
+                tf.keras.layers.Dense(20, name='d1', input_shape=(10,)),
+                tf.keras.layers.Dense(30, name='d2')
+            ])
+
+        d1 = model.layers[0]
+        d2 = model.layers[1]
+
+        d1.kernel.layout == layout_1
+        d1.bias.layout == layout_2
+        d2.kernel.layout == layout_3
+        d2.bias.layout == layout_4
+        ```
+
+        Returns:
+            A context that will lazily initialize all `tf.Variable` objects
+            within the model, with their attributed layouts.
+        """
+        return layout_map_scope(self)


 LayoutMap.get.__doc__ = LayoutMap.__getitem__.__doc__


-@keras_export('keras.dtensor.experimental.layout_map_scope', v1=[])
 @contextlib.contextmanager
 def layout_map_scope(layout_map):
-  """Apply the layout to all the tf.Variables created under the scope.
-
-  Create a scope that all the tf.Variable created under this scope
-  will be lazily inited, and initialized later on with proper layout when the
-  object path in the model is stable/finalized.
-
-  Note that the layout mapping will use the object/attribute names as the key
-  to map the variable against the layout.
-
-  For subclassed models, the full object/attribute name is used as the key.
-  For Functional/Sequential models, since the layers within the model do not get
-  assigned to a meaningful attribute, we use `layer.name` as the key
-  for the layer, followed by the attribute name. 
Keras ensures - name uniqueness among the layers in all Functional/Sequential models. - - See the following examples that show the variable object names - for different Keras model types: - - ```python - layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) - layout_map['d1.kernel'] = layout_1 - layout_map['d1.bias'] = layout_2 - layout_map['d2.kernel'] = layout_3 - layout_map['d2.bias'] = layout_4 - - ## Subclassed model - class SubclassModel(tf.keras.Model): - - def __init__(self, name=None): - super().__init__(name=name) - self.d1 = tf.keras.layers.Dense(1000) - self.d2 = tf.keras.layers.Dense(1000) - - def call(self, inputs): - x = self.d1(inputs) - return self.d2(x) - - with layout_map_scope(layout_map): - model = SubclassModel() - # Triggering the creation of weights within or outside of the scope works - inputs = tf.zeros((10, 10)) - results = model(inputs) - - model.d1.kernel.layout == layout_1 - model.d1.bias.layout == layout_2 - model.d2.kernel.layout == layout_3 - model.d2.bias.layout == layout_4 - - ## Functional model - with layout_map_scope(layout_map): - inputs = tf.keras.Input((10,), batch_size=10) - x = tf.keras.layers.Dense(20, name='d1')(inputs) - output = tf.keras.layers.Dense(30, name='d2')(x) - - model = tf.keras.Model(inputs, output) - - d1 = model.layers[1] - d2 = model.layers[2] - - d1.kernel.layout == layout_1 - d1.bias.layout == layout_2 - d1.kernel.layout == layout_3 - d1.bias.layout == layout_4 - - ## Sequential model - with layout_map_scope(layout_map): - model = tf.keras.Sequential([ - tf.keras.layers.Dense(20, name='d1', input_shape=(10,)), - tf.keras.layers.Dense(30, name='d2') - ]) - - d1 = model.layers[0] - d2 = model.layers[1] - - d1.kernel.layout == layout_1 - d1.bias.layout == layout_2 - d1.kernel.layout == layout_3 - d1.bias.layout == layout_4 - ``` - - Args: - layout_map: a LayoutMap which contains the variable_object_path (string) -> - Layout. When a layout is not found for the variable, a default all - replicated layout will be created for the variable. - - Yields: - A context that will lazily initialize all `tf.Variable` objects - within the model, with their attributed layouts. - """ - previous_layout_map = get_current_layout_map() - global _LAYOUT_MAP - _LAYOUT_MAP.layout_map = layout_map - - with lazy_variable.lazy_init_scope(): - try: - yield - finally: - _LAYOUT_MAP.layout_map = previous_layout_map + """Apply the layout to all the tf.Variables created under the scope. + + Create a scope that all the tf.Variable created under this scope + will be lazily inited, and initialized later on with proper layout when the + object path in the model is stable/finalized. + + Note that the layout mapping will use the object/attribute names as the key + to map the variable against the layout. + + For subclassed models, the full object/attribute name is used as the key. + For Functional/Sequential models, since the layers within the model do not + get assigned to a meaningful attribute, we use `layer.name` as the key for + the layer, followed by the attribute name. Keras ensures name uniqueness + among the layers in all Functional/Sequential models. 
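The key construction described above is plain string joining over the flattened attribute path, with the layer name prepended for Functional/Sequential models. A minimal stand-alone sketch (the `object_path_for` helper and the example `path` tuples are hypothetical; the skip list is copied from `_KERAS_ATTRIBUTES_TO_SKIP` earlier in this file):

```python
# Paths are tuples of attribute names and list indices; for example, the
# variable cached at model.d1._trainable_weights[0] flattens to
# ('d1', '_trainable_weights', 0).
_KERAS_ATTRIBUTES_TO_SKIP = [
    "_self_tracked_trackables",
    "_trainable_weights",
    "_non_trainable_weights",
    "_captured_weight_regularizer",
]


def object_path_for(path, layer_name=None):
    """Build the layout query key for a flattened attribute path."""
    if any(attr in path for attr in _KERAS_ATTRIBUTES_TO_SKIP):
        return None  # cached references are skipped, not mapped
    key = ".".join(str(item) for item in path)
    # Functional/Sequential models prepend the (unique) layer name.
    return f"{layer_name}.{key}" if layer_name else key


# Subclassed model: the full attribute path is the key.
assert object_path_for(("d1", "kernel")) == "d1.kernel"
# Functional/Sequential model: layer.name plus the attribute name.
assert object_path_for(("kernel",), layer_name="d1") == "d1.kernel"
# Cache attributes like _trainable_weights are never mapped directly.
assert object_path_for(("d1", "_trainable_weights", 0)) is None
```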
+
+    See the following examples that show the variable object names
+    for different Keras model types:
+
+    ```python
+    layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+    layout_map['d1.kernel'] = layout_1
+    layout_map['d1.bias'] = layout_2
+    layout_map['d2.kernel'] = layout_3
+    layout_map['d2.bias'] = layout_4
+
+    ## Subclassed model
+    class SubclassModel(tf.keras.Model):
+
+        def __init__(self, name=None):
+            super().__init__(name=name)
+            self.d1 = tf.keras.layers.Dense(1000)
+            self.d2 = tf.keras.layers.Dense(1000)
+
+        def call(self, inputs):
+            x = self.d1(inputs)
+            return self.d2(x)
+
+    with layout_map_scope(layout_map):
+        model = SubclassModel()
+        # Triggering the creation of weights within or outside of the scope works
+    inputs = tf.zeros((10, 10))
+    results = model(inputs)
+
+    model.d1.kernel.layout == layout_1
+    model.d1.bias.layout == layout_2
+    model.d2.kernel.layout == layout_3
+    model.d2.bias.layout == layout_4
+
+    ## Functional model
+    with layout_map_scope(layout_map):
+        inputs = tf.keras.Input((10,), batch_size=10)
+        x = tf.keras.layers.Dense(20, name='d1')(inputs)
+        output = tf.keras.layers.Dense(30, name='d2')(x)
+
+        model = tf.keras.Model(inputs, output)
+
+    d1 = model.layers[1]
+    d2 = model.layers[2]
+
+    d1.kernel.layout == layout_1
+    d1.bias.layout == layout_2
+    d2.kernel.layout == layout_3
+    d2.bias.layout == layout_4
+
+    ## Sequential model
+    with layout_map_scope(layout_map):
+        model = tf.keras.Sequential([
+            tf.keras.layers.Dense(20, name='d1', input_shape=(10,)),
+            tf.keras.layers.Dense(30, name='d2')
+        ])
+
+    d1 = model.layers[0]
+    d2 = model.layers[1]
+
+    d1.kernel.layout == layout_1
+    d1.bias.layout == layout_2
+    d2.kernel.layout == layout_3
+    d2.bias.layout == layout_4
+    ```
+
+    Args:
+        layout_map: a LayoutMap which contains the variable_object_path
+            (string) -> Layout. When a layout is not found for the variable,
+            a default all-replicated layout will be created for the variable.

-def _map_subclass_model_variable(model, layout_map):
-  """Map/Replace LazyInitVariable for subclass model."""
-  lazy_init_variable_to_tf_variable_map = {}
-
-  # Note that the model._flatten is a method from tf.Module, and it returns
-  # duplicated items (since some of the items have different paths).
-  for path, variable in model._flatten(predicate=_is_lazy_init_variable,  # pylint: disable=protected-access
-                                       with_path=True):
-    # Note that path is a tuple that contains string and ints, eg:
-    # ('d1', '_trainable_weights', 0) maps to model.d1._trainable_weights[0]
-    if [a for a in _KERAS_ATTRIBUTES_TO_SKIP if a in path]:
-      continue
-    # Convert all the ints to string and join with .
-    object_path = '.'.join([str(item) for item in path])
-
-    new_variable = _create_dvariable(layout_map, object_path, variable)
-    _set_object_by_path(model, path, new_variable)
-    lazy_init_variable_to_tf_variable_map[id(variable)] = new_variable
-
-  for layer in model._flatten(  # pylint: disable=protected-access
-      predicate=lambda o: isinstance(o, base_layer.Layer)):
-    _config_dvariable_regularization(
-        layer, lazy_init_variable_to_tf_variable_map)
-  # After we replaced all the variables, we want to make sure all the cached
-  # attributes are having the new variable, rather than old LazyInitVariable.
- for path, variable in model._flatten(predicate=_is_lazy_init_variable, # pylint: disable=protected-access - with_path=True): - tf_variable = lazy_init_variable_to_tf_variable_map[id(variable)] - _set_object_by_path(model, path, tf_variable) - - _init_state_variable_for_rng(model, layout_map) - return model + Yields: + A context that will lazily initialize all `tf.Variable` objects + within the model, with their attributed layouts. + """ + previous_layout_map = get_current_layout_map() + global _LAYOUT_MAP + _LAYOUT_MAP.layout_map = layout_map + with lazy_variable.lazy_init_scope(): + try: + yield + finally: + _LAYOUT_MAP.layout_map = previous_layout_map -def _map_functional_model_variable(model, layout_map): - """Map/Replace LazyInitVariable for functional/sequential model.""" - lazy_init_variable_to_tf_variable_map = {} - - for layer in model.layers: - # Note that layer name is unique among the functional/sequential model - # when the layer name is not provided, Keras will auto generate a layer - # name based on the class name. - layer_name = layer.name - for path, variable in layer._flatten(predicate=_is_lazy_init_variable, # pylint: disable=protected-access - with_path=True): - # Note that path is a tuple that contains string and ints, eg: - # ('d1', '_trainable_weights', 0) maps to model.d1._trainable_weights[0] - if [a for a in _KERAS_ATTRIBUTES_TO_SKIP if a in path]: - continue - # Convert all the ints to string and join with . - object_path = '.'.join([str(item) for item in path]) - # Also attach the layer name - object_path = layer_name + '.' + object_path - - new_variable = _create_dvariable(layout_map, object_path, variable) - _set_object_by_path(layer, path, new_variable) - lazy_init_variable_to_tf_variable_map[id(variable)] = new_variable - - _config_dvariable_regularization( - layer, lazy_init_variable_to_tf_variable_map) +def _map_subclass_model_variable(model, layout_map): + """Map/Replace LazyInitVariable for subclass model.""" + lazy_init_variable_to_tf_variable_map = {} + + # Note that the model._flatten is a method from tf.Module, and it returns + # duplicated items (since some of the items have different paths). + for path, variable in model._flatten( + predicate=_is_lazy_init_variable, + with_path=True, + ): + # Note that path is a tuple that contains string and ints, eg: + # ('d1', '_trainable_weights', 0) maps to model.d1._trainable_weights[0] + if [a for a in _KERAS_ATTRIBUTES_TO_SKIP if a in path]: + continue + # Convert all the ints to string and join with . + object_path = ".".join([str(item) for item in path]) + + new_variable = _create_dvariable(layout_map, object_path, variable) + _set_object_by_path(model, path, new_variable) + lazy_init_variable_to_tf_variable_map[id(variable)] = new_variable + + for layer in model._flatten( + predicate=lambda o: isinstance(o, base_layer.Layer) + ): + _config_dvariable_regularization( + layer, lazy_init_variable_to_tf_variable_map + ) # After we replaced all the variables, we want to make sure all the cached # attributes are having the new variable, rather than old LazyInitVariable. 
- for path, variable in layer._flatten(predicate=_is_lazy_init_variable, # pylint: disable=protected-access - with_path=True): - tf_variable = lazy_init_variable_to_tf_variable_map[id(variable)] - _set_object_by_path(layer, path, tf_variable) + for path, variable in model._flatten( + predicate=_is_lazy_init_variable, + with_path=True, + ): + tf_variable = lazy_init_variable_to_tf_variable_map[id(variable)] + _set_object_by_path(model, path, tf_variable) + + _init_state_variable_for_rng(model, layout_map) + _update_trackable_reference(model, lazy_init_variable_to_tf_variable_map) + return model - _init_state_variable_for_rng(model, layout_map) - return model + +def _map_functional_model_variable(model, layout_map): + """Map/Replace LazyInitVariable for functional/sequential model.""" + lazy_init_variable_to_tf_variable_map = {} + + for layer in model.layers: + # Note that layer name is unique among the functional/sequential model + # when the layer name is not provided, Keras will auto generate a layer + # name based on the class name. + layer_name = layer.name + for path, variable in layer._flatten( + predicate=_is_lazy_init_variable, + with_path=True, + ): + # Note that path is a tuple that contains string and ints, eg: + # ('d1', '_trainable_weights', 0) maps to + # model.d1._trainable_weights[0] + if [a for a in _KERAS_ATTRIBUTES_TO_SKIP if a in path]: + continue + # Convert all the ints to string and join with . + object_path = ".".join([str(item) for item in path]) + # Also attach the layer name + object_path = layer_name + "." + object_path + + new_variable = _create_dvariable(layout_map, object_path, variable) + _set_object_by_path(layer, path, new_variable) + lazy_init_variable_to_tf_variable_map[id(variable)] = new_variable + + _config_dvariable_regularization( + layer, lazy_init_variable_to_tf_variable_map + ) + + # After we replaced all the variables, we want to make sure all the + # cached attributes are having the new variable, rather than old + # LazyInitVariable. + for path, variable in layer._flatten( + predicate=_is_lazy_init_variable, + with_path=True, + ): + tf_variable = lazy_init_variable_to_tf_variable_map[id(variable)] + _set_object_by_path(layer, path, tf_variable) + + _init_state_variable_for_rng(model, layout_map) + _update_trackable_reference(model, lazy_init_variable_to_tf_variable_map) + return model def _init_state_variable_for_rng(model, layout_map): - """Init the state variable in tf.ranodm.Generator. - - Since the BaseRandomLayer in keras explicitly untrack the tf.random.Generator, - the variable in it will stay as LazyInitVariable, which cause runtime error if - we don't replace them with proper DVariable. Since user usually are not - aware the existance of those variable, we will just give them replicated - layout since they are tiny. - - Args: - model: the model whose layers will be checked to find the BaseRandomLayers. - layout_map: used to get the default mesh information to create DVariable. - """ - # pylint: disable=protected-access - for l in model._flatten( - predicate=lambda o: isinstance(o, base_layer.BaseRandomLayer)): - keras_generator = l._random_generator - if keras_generator._built and keras_generator._generator is None: - raise ValueError( - 'Keras is expected to use tf.random.Generator when using DTensor API.' 
-          'Please call '
-          '`tf.keras.backend.experimental.enable_tf_random_generator` at the '
-          'beginning of your program.')
-    if hasattr(keras_generator, '_generator') and _is_lazy_init_variable(
-        keras_generator._generator._state_var):
-      # Replace it with DVariable
-      keras_generator._generator._state_var = _create_dvariable(
-          layout_map, '', keras_generator._generator._state_var)
-    else:
-      # When the keras_generator is not built yet. Call the init function with
-      # DTensor device to init all the variable with default replicated layout.
-      with dtensor.run_on(layout_map.get_default_mesh()):
-        keras_generator._maybe_init()
+    """Init the state variable in tf.random.Generator.
+
+    Since the BaseRandomLayer in keras explicitly untracks the
+    tf.random.Generator, the variable in it will stay as a LazyInitVariable,
+    which causes a runtime error if we don't replace it with a proper
+    DVariable. Since users are usually not aware of the existence of those
+    variables, we will just give them a replicated layout since they are tiny.
+
+    Args:
+        model: the model whose layers will be checked to find the
+            BaseRandomLayers.
+        layout_map: used to get the default mesh information to create
+            DVariable.
+    """
+
+    for l in model._flatten(
+        predicate=lambda o: isinstance(o, base_layer.BaseRandomLayer)
+    ):
+        keras_generator = l._random_generator
+        if keras_generator._built and keras_generator._generator is None:
+            raise ValueError(
+                "Keras is expected to use tf.random.Generator when using "
+                "DTensor API. Please call "
+                "`tf.keras.backend.experimental.enable_tf_random_generator` at "
+                "the beginning of your program."
+            )
+        if hasattr(keras_generator, "_generator") and _is_lazy_init_variable(
+            keras_generator._generator._state_var
+        ):
+            # Replace it with DVariable
+            keras_generator._generator._state_var = _create_dvariable(
+                layout_map, "", keras_generator._generator._state_var
+            )
+        else:
+            # When the keras_generator is not built yet, call the init
+            # function with the DTensor device to init all the variables
+            # with the default replicated layout.
+            with dtensor.default_mesh(layout_map.get_default_mesh()):
+                keras_generator._maybe_init()


 def _config_dvariable_regularization(
-    layer, lazy_init_variable_to_tf_variable_map):
-  """Update the weights regularizer for newly created `DVariable`.
-
-  The weight regularization usually happens when `layer.add_weight()` is called,
-  at which point the library will first create a `LazyInitVariable`, and then
-  replace it with a `DVariable`. We will defer the creation of those losses,
-  until the DVariable is created.
-
-  See `layer._captured_weight_regularizer` for more details.
-
-  Args:
-    layer: the layer instance for DVariable regularization config.
-    lazy_init_variable_to_tf_variable_map: the dict between LazyInitVariable ID
-      and newly created DVariable.
-  """
-  # pylint: disable=protected-access
-  for (name, variable, regualarizer) in layer._captured_weight_regularizer:
-    if not _is_lazy_init_variable(variable):
-      raise ValueError('Expect the regularization loss are created from '
-                       f'LazyInitVariable, got {variable}')
-    d_variable = lazy_init_variable_to_tf_variable_map[id(variable)]
-    layer._handle_weight_regularization(name, d_variable, regualarizer)
-    # After that, we should cleanup `layer._captured_weight_regularizer`
-    layer._captured_weight_regularizer = []
+    layer, lazy_init_variable_to_tf_variable_map
+):
+    """Update the weights regularizer for newly created `DVariable`.
+
+    The weight regularization usually happens when `layer.add_weight()` is
+    called, at which point the library will first create a `LazyInitVariable`,
+    and then replace it with a `DVariable`. We will defer the creation of
+    those losses until the DVariable is created.
+
+    See `layer._captured_weight_regularizer` for more details.
+
+    Args:
+        layer: the layer instance for DVariable regularization config.
+        lazy_init_variable_to_tf_variable_map: the dict between
+            LazyInitVariable ID and newly created DVariable.
+    """
+
+    for name, variable, regularizer in layer._captured_weight_regularizer:
+        if not _is_lazy_init_variable(variable):
+            raise ValueError(
+                "Expected the regularization loss to be created from "
+                f"LazyInitVariable, got {variable}"
+            )
+        d_variable = lazy_init_variable_to_tf_variable_map[id(variable)]
+        layer._handle_weight_regularization(name, d_variable, regularizer)
+    # After that, we should clean up `layer._captured_weight_regularizer`
+    layer._captured_weight_regularizer = []


 def _create_dvariable(layout_map, object_path, variable):
-  """Create a new variable instead of using the LazyInitVariable.
-
-  We choose to do this since even the LazyInitVariable might behavior like
-  a normal tf.Variable/DVariable, it is not future proof for any new changes
-  to variable class. It will also fail the instance type check in python,
-  which could affect user's code when they do any filtering based on type to
-  find any variables.
-
-  Args:
-    layout_map: a LayoutMap which contains the variable_object_path (string) ->
-      Layout.
-    object_path: string, the object attribute path for the variable.
-    variable: LazyInitVariable which will be replaced by the newly created
-      tf.Variable.
-  Returns:
-    A new tf.Variable with correct layout information.
-  """
-  # TODO(b/228209108): Revisit this in future and see if we can just reuse the
-  # LazyInitVariable rather than creating a new tf.Variable instance.
-  layout = layout_map[object_path]
-  if layout is None:
-    variable_rank = variable.shape.rank
-    layout = dtensor.Layout.replicated(
-        mesh=layout_map.get_default_mesh(),
-        rank=variable_rank)
-  init_val = variable._initial_value  # pylint: disable=protected-access
-  if callable(init_val):
-    with lazy_variable.disable_init_variable_creator():
-      init_val = utils.call_with_layout(init_val, layout)
-  else:
-    # The init value is probably already created as a tensor, we will just copy
-    # it to mesh and give it a proper layout.
-    init_val = dtensor.copy_to_mesh(init_val, layout)
-  # Use the original variable name for new DVariable creation. TF was adding
-  # ":0" suffix to it.
-  variable_name = variable.name
-  if variable_name.endswith(':0'):
-    variable_name = variable_name[:-2]
-  new_variable = dtensor.DVariable(init_val,
-                                   trainable=variable.trainable,
-                                   name=variable_name)
-  return new_variable
+    """Create a new variable instead of using the LazyInitVariable.

+    We choose to do this because, even though the LazyInitVariable might
+    behave like a normal tf.Variable/DVariable, it is not future-proof for
+    any new changes to the variable class. It will also fail the instance
+    type check in python, which could affect user's code when they do any
+    filtering based on type to find any variables.

-def _set_object_by_path(object_to_set, path, value):
-  """Set the attribute of instance to the object.
-
-  Args:
-    object_to_set: the instance whose attribute should be set.
-    path: the tuple/list of string and ints, representing the attribute names.
-      Int means that the attribute to set is a item a list.
-    value: the value of the attribute.
-  """
-
-  for i, attr_name in enumerate(path):
-    if i == len(path) - 1:
-      # We found the actual attribute to set
-      if isinstance(attr_name, int):
-        # This means we are trying to set an element in the array, make sure the
-        # instance is array like object.
-        object_to_set[attr_name] = value
-      else:
-        setattr(object_to_set, attr_name, value)
+    Args:
+        layout_map: a LayoutMap which contains the variable_object_path
+            (string) -> Layout.
+        object_path: string, the object attribute path for the variable.
+        variable: LazyInitVariable which will be replaced by the newly created
+            tf.Variable.
+    Returns:
+        A new tf.Variable with correct layout information.
+    """
+    # TODO(b/228209108): Revisit this in the future and see if we can just
+    # reuse the LazyInitVariable rather than creating a new tf.Variable
+    # instance.
+    layout = layout_map[object_path]
+    if layout is None:
+        variable_rank = variable.shape.rank
+        layout = dtensor.Layout.replicated(
+            mesh=layout_map.get_default_mesh(), rank=variable_rank
+        )
+    init_val = variable._initial_value
+    if callable(init_val):
+        with lazy_variable.disable_init_variable_creator():
+            init_val = utils.call_with_layout(init_val, layout)
     else:
-      if isinstance(attr_name, int):
-        object_to_set = object_to_set[attr_name]
-      else:
-        object_to_set = getattr(object_to_set, attr_name)
+        # The init value is probably already created as a tensor, we will just
+        # copy it to mesh and give it a proper layout.
+        init_val = dtensor.copy_to_mesh(init_val, layout)
+    # Use the original variable name for new DVariable creation. TF was adding
+    # ":0" suffix to it.
+    variable_name = variable.name
+    if variable_name.endswith(":0"):
+        variable_name = variable_name[:-2]
+    new_variable = dtensor.DVariable(
+        init_val, trainable=variable.trainable, name=variable_name
+    )
+    return new_variable
+
+
+def _set_object_by_path(object_to_set, path, value):
+    """Set the attribute of the instance at the given path to the value.
+
+    Args:
+        object_to_set: the instance whose attribute should be set.
+        path: the tuple/list of strings and ints, representing the attribute
+            names. An int means that the attribute to set is an item in a
+            list.
+        value: the value of the attribute.
+    """
+
+    for i, attr_name in enumerate(path):
+        if i == len(path) - 1:
+            # We found the actual attribute to set
+            if isinstance(attr_name, int):
+                # This means we are trying to set an element in the array;
+                # make sure the instance is an array-like object.
+                object_to_set[attr_name] = value
+            else:
+                setattr(object_to_set, attr_name, value)
+        else:
+            if isinstance(attr_name, int):
+                object_to_set = object_to_set[attr_name]
+            else:
+                object_to_set = getattr(object_to_set, attr_name)
+
+
+# TODO(b/228209108): Revisit this after we can reinit LazyInitVariable.
+def _update_trackable_reference(model, lazy_init_variable_to_tf_variable_map):
+    """Update the trackable object references for the model.
+
+    Note that this method is only needed because of a corner case for model
+    checkpointing, where it could accidentally catch a LazyInitVariable in a
+    checkpoint dependency that is not visible to the model attribute graph
+    itself.
+
+    Args:
+        model: the keras model instance whose checkpoint dependencies will be
+            examined.
+        lazy_init_variable_to_tf_variable_map: the dict between
+            LazyInitVariable ID and newly created DVariable.
+    """
+    # See b/234621758 for more details.
+ object_graph = tf.__internal__.tracking.ObjectGraphView(model) + trackables, _ = object_graph.breadth_first_traversal() + for trackable in trackables: + for ref_name, ref in trackable._trackable_children().items(): + if _is_lazy_init_variable(ref): + # Replacing the LazyVariable with DVariable. + trackable._track_trackable( + lazy_init_variable_to_tf_variable_map[id(ref)], + ref_name, + overwrite=True, + ) def _is_lazy_init_variable(obj): - return isinstance(obj, lazy_variable.LazyInitVariable) + return isinstance(obj, lazy_variable.LazyInitVariable) diff --git a/keras/dtensor/layout_map_test.py b/keras/dtensor/layout_map_test.py index 5d0860c5cb9e..7df61a78d475 100644 --- a/keras/dtensor/layout_map_test.py +++ b/keras/dtensor/layout_map_test.py @@ -14,329 +14,399 @@ # ============================================================================== """Tests for layout_map.""" +import os +import shutil + +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import layers +from keras import models from keras.dtensor import dtensor_api as dtensor from keras.dtensor import layout_map as layout_map_lib +from keras.dtensor import test_util from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf - -# TODO(scottzhu): Fix the layout map test with keras/dtensor/test_util -from keras.dtensor.tests import test_util class LayoutMapTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + backend.enable_tf_random_generator() + tf_utils.set_random_seed(1337) + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + self.layout_2d = dtensor.Layout.replicated(self.mesh, rank=2) + self.layout_1d = dtensor.Layout.replicated(self.mesh, rank=1) - def setUp(self): - super().setUp() - backend.enable_tf_random_generator() - tf_utils.set_random_seed(1337) - global_ids = test_util.create_device_ids_array((2, 2)) - local_device_ids = np.ravel(global_ids).tolist() - mesh_dict = { - 'CPU': - dtensor.Mesh(['X', 'Y'], global_ids, - local_device_ids, - test_util.create_device_list((2, 2), 'CPU')) - } - self.mesh = self.configTestMesh(mesh_dict) - self.layout_2d = dtensor.Layout.replicated(self.mesh, rank=2) - self.layout_1d = dtensor.Layout.replicated(self.mesh, rank=1) - - self.sharded_2d = dtensor.Layout.batch_sharded(self.mesh, 'X', rank=2) - self.sharded_1d = dtensor.Layout.batch_sharded(self.mesh, 'X', rank=1) + self.sharded_2d = dtensor.Layout.batch_sharded(self.mesh, "X", rank=2) + self.sharded_1d = dtensor.Layout.batch_sharded(self.mesh, "X", rank=1) - def test_add(self): - layout_map = layout_map_lib.LayoutMap() + def test_add(self): + layout_map = layout_map_lib.LayoutMap() - layout_map['dense/kernel'] = self.layout_2d - layout_map['dense/bias'] = self.layout_1d + layout_map["dense/kernel"] = self.layout_2d + layout_map["dense/bias"] = self.layout_1d - # Make there are two items in the map, and we access them via the - # underlying container at layout_map._layout_map - self.assertLen(layout_map._layout_map, 2) - self.assertEqual(layout_map._layout_map['dense/kernel'], self.layout_2d) - self.assertEqual(layout_map._layout_map['dense/bias'], self.layout_1d) + # Make there are two items in the map, and we access them via the + # underlying container at layout_map._layout_map + 
self.assertLen(layout_map._layout_map, 2) + self.assertEqual(layout_map._layout_map["dense/kernel"], self.layout_2d) + self.assertEqual(layout_map._layout_map["dense/bias"], self.layout_1d) - with self.assertRaisesRegex(ValueError, 'dense/kernel already exist'): - layout_map['dense/kernel'] = self.layout_1d + with self.assertRaisesRegex(ValueError, "dense/kernel already exist"): + layout_map["dense/kernel"] = self.layout_1d - with self.assertRaisesRegex(ValueError, 'should be a dtensor.Layout'): - layout_map['conv.kernel'] = [1, 2, 3] + with self.assertRaisesRegex(ValueError, "should be a dtensor.Layout"): + layout_map["conv.kernel"] = [1, 2, 3] - def test_get(self): - layout_map = layout_map_lib.LayoutMap() + def test_get(self): + layout_map = layout_map_lib.LayoutMap() - layout_map['dense/kernel'] = self.sharded_2d - layout_map['dense/bias'] = self.sharded_1d + layout_map["dense/kernel"] = self.sharded_2d + layout_map["dense/bias"] = self.sharded_1d - layout_map['dense.*kernel'] = self.layout_2d - layout_map['dense.*bias'] = self.layout_1d + layout_map["dense.*kernel"] = self.layout_2d + layout_map["dense.*bias"] = self.layout_1d - layout_map['.*bias'] = self.sharded_1d + layout_map[".*bias"] = self.sharded_1d - self.assertEqual(layout_map['dense/kernel'], self.sharded_2d) - self.assertEqual(layout_map['dense/bias'], self.sharded_1d) + self.assertEqual(layout_map["dense/kernel"], self.sharded_2d) + self.assertEqual(layout_map["dense/bias"], self.sharded_1d) - # Map against the wildcard bias rule for dense, and based on the order of - # insertion, it will not use .*bias. - self.assertEqual(layout_map['dense_2/kernel'], self.layout_2d) - self.assertEqual(layout_map['dense_2/bias'], self.layout_1d) + # Map against the wildcard bias rule for dense, and based on the order + # of insertion, it will not use .*bias. 
+ self.assertEqual(layout_map["dense_2/kernel"], self.layout_2d) + self.assertEqual(layout_map["dense_2/bias"], self.layout_1d) - self.assertIsNone(layout_map['conv2d/kernel']) - self.assertEqual(layout_map['conv2d/bias'], self.sharded_1d) + self.assertIsNone(layout_map["conv2d/kernel"]) + self.assertEqual(layout_map["conv2d/bias"], self.sharded_1d) - def test_delete(self): - layout_map = layout_map_lib.LayoutMap() + def test_delete(self): + layout_map = layout_map_lib.LayoutMap() - layout_map['dense/kernel'] = self.layout_2d - layout_map['dense/bias'] = self.layout_1d + layout_map["dense/kernel"] = self.layout_2d + layout_map["dense/bias"] = self.layout_1d - self.assertEqual(layout_map.pop('dense/kernel'), self.layout_2d) - # Make sure to match against the exact string, not the regex - with self.assertRaises(KeyError): - layout_map.pop('.*bias') + self.assertEqual(layout_map.pop("dense/kernel"), self.layout_2d) + # Make sure to match against the exact string, not the regex + with self.assertRaises(KeyError): + layout_map.pop(".*bias") - # Make sure del also works - del layout_map['dense/bias'] + # Make sure del also works + del layout_map["dense/bias"] - self.assertEmpty(layout_map._layout_map) + self.assertEmpty(layout_map._layout_map) - def test_len(self): - layout_map = layout_map_lib.LayoutMap() - self.assertEmpty(layout_map) + def test_len(self): + layout_map = layout_map_lib.LayoutMap() + self.assertEmpty(layout_map) - layout_map['dense/kernel'] = self.layout_2d - layout_map['dense/bias'] = self.layout_1d + layout_map["dense/kernel"] = self.layout_2d + layout_map["dense/bias"] = self.layout_1d - self.assertLen(layout_map, 2) + self.assertLen(layout_map, 2) - def test_iter(self): - layout_map = layout_map_lib.LayoutMap() + def test_iter(self): + layout_map = layout_map_lib.LayoutMap() - layout_map['dense/kernel'] = self.layout_2d - layout_map['dense/bias'] = self.layout_1d + layout_map["dense/kernel"] = self.layout_2d + layout_map["dense/bias"] = self.layout_1d - # Make sure the items are ordered based on the insertion order. - self.assertEqual(list(layout_map.keys()), ['dense/kernel', 'dense/bias']) + # Make sure the items are ordered based on the insertion order. + self.assertEqual( + list(layout_map.keys()), ["dense/kernel", "dense/bias"] + ) - keys = [] - values = [] - for k, v in layout_map.items(): - keys.append(k) - values.append(v) + keys = [] + values = [] + for k, v in layout_map.items(): + keys.append(k) + values.append(v) - self.assertEqual(keys, ['dense/kernel', 'dense/bias']) - self.assertEqual(values, [self.layout_2d, self.layout_1d]) + self.assertEqual(keys, ["dense/kernel", "dense/bias"]) + self.assertEqual(values, [self.layout_2d, self.layout_1d]) # Class used for testing. 
-class SubclassModel(tf.keras.Model): +class SubclassModel(models.Model): + def __init__(self, name=None): + super().__init__(name=name) + self.d1 = layers.Dense(1000) + self.d2 = layers.Dense(1000) + self.dropout = layers.Dropout(0.1) - def __init__(self, name=None): - super().__init__(name=name) - self.d1 = layers.Dense(1000) - self.d2 = layers.Dense(1000) - self.dropout = layers.Dropout(0.1) + def call(self, inputs, training=None): + x = self.d1(inputs) + x = self.dropout(x, training=training) + return self.d2(x) - def call(self, inputs, training=None): - x = self.d1(inputs) - x = self.dropout(x, training=training) - return self.d2(x) +class SubclassLayer(layers.Layer): + def __init__(self, unit): + super().__init__() + self.unit = unit -class ObjectPathMappingTest(test_util.DTensorBaseTest): + def build(self, input_shape): + weight_shape = (input_shape[-1], self.unit) + # Note that the variable name is "kernel", but assigned to "_weight" + # This will cause the checkpoint to record 2 dependencies. + self._weight = self.add_weight(shape=weight_shape, name="kernel") - def setUp(self): - super().setUp() - backend.enable_tf_random_generator() - tf_utils.set_random_seed(1337) - global_ids = test_util.create_device_ids_array((2, 2)) - local_device_ids = np.ravel(global_ids).tolist() - mesh_dict = { - 'CPU': - dtensor.Mesh(['X', 'Y'], global_ids, - local_device_ids, - test_util.create_device_list((2, 2), 'CPU')) - } - self.mesh = self.configTestMesh(mesh_dict) - self.layout_2d = dtensor.Layout.replicated(self.mesh, rank=2) - self.layout_1d = dtensor.Layout.replicated(self.mesh, rank=1) - - self.sharded_2d = dtensor.Layout.batch_sharded(self.mesh, 'X', rank=2) - self.sharded_1d = dtensor.Layout.batch_sharded(self.mesh, 'X', rank=1) - - def test_init_subclass_model_variable_with_layout(self): - layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) - layout_map['d1.kernel'] = self.layout_2d - layout_map['d1.bias'] = self.layout_1d - layout_map['d2.kernel'] = self.layout_2d - layout_map['d2.bias'] = self.layout_1d - - with layout_map_lib.layout_map_scope(layout_map): - model = SubclassModel(name='model') - - # Init the model with eager tensor, make sure the model weights have correct - # layout, as well as produce correct result. - inputs = tf.zeros((10, 10)) - inputs = dtensor.copy_to_mesh(inputs, layout=self.layout_2d) - result = model(inputs) - self.assertAllClose(result, tf.zeros((10, 1000))) - d1 = model.d1 - d2 = model.d2 - self.assertEqual(d1.kernel.layout, self.layout_2d) - self.assertEqual(d1.bias.layout, self.layout_1d) - self.assertEqual(d2.kernel.layout, self.layout_2d) - self.assertEqual(d2.bias.layout, self.layout_1d) - - # Also make sure we repopulate the cached attributes like - # layer._trainable_weights - self.assertIs(d1.kernel, d1._trainable_weights[0]) - self.assertIs(d1.bias, d1._trainable_weights[1]) - self.assertIs(d2.kernel, d2._trainable_weights[0]) - self.assertIs(d2.bias, d2._trainable_weights[1]) - - result = model(inputs, training=True) - self.assertAllClose(result, tf.zeros((10, 1000), layout=self.layout_2d)) - - def test_init_functional_model_variable_with_layout(self): - # Note that the functional model is using layers name + attribute name - # the layer name are unique among the functional model, and when the layer - # doesn't have a name, keras will give it a unique name based on the layer - # class. 
- layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) - layout_map['d1.kernel'] = self.layout_2d - layout_map['d1.bias'] = self.layout_1d - layout_map['d2.kernel'] = self.layout_2d - layout_map['d2.bias'] = self.layout_1d - - with layout_map_lib.layout_map_scope(layout_map): - inputs = tf.keras.Input((10,), batch_size=10) - x = layers.Dense(20, name='d1')(inputs) - x = layers.Dropout(0.1)(x) - output = layers.Dense(30, name='d2')(x) - - model = tf.keras.Model(inputs, output) - - # It includes input layer as well. - self.assertLen(model.layers, 4) - d1 = model.layers[1] - d2 = model.layers[3] - - self.assertEqual(d1.kernel.layout, self.layout_2d) - self.assertEqual(d1.bias.layout, self.layout_1d) - self.assertEqual(d2.kernel.layout, self.layout_2d) - self.assertEqual(d2.bias.layout, self.layout_1d) - - # Also make sure we repopulate the cached attributes like - # layer._trainable_weights - self.assertIs(d1.kernel, d1._trainable_weights[0]) - self.assertIs(d1.bias, d1._trainable_weights[1]) - self.assertIs(d2.kernel, d2._trainable_weights[0]) - self.assertIs(d2.bias, d2._trainable_weights[1]) - - inputs = tf.zeros((10, 10)) - inputs = dtensor.copy_to_mesh(inputs, layout=self.layout_2d) - result = model(inputs, training=True) - expected_result = tf.zeros((10, 30)) - expected_result = dtensor.copy_to_mesh( - expected_result, layout=self.layout_2d) - self.assertAllClose(result, expected_result) - - def test_init_sequential_model_variable_with_layout(self): - # Note that the sequential model is using layers name + attribute name - # the layer name are unique among the functional model, and when the layer - # doesn't have a name, keras will give it a unique name based on the layer - # class. - layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) - layout_map['d1.kernel'] = self.layout_2d - layout_map['d1.bias'] = self.layout_1d - layout_map['d2.kernel'] = self.layout_2d - layout_map['d2.bias'] = self.layout_1d - - with layout_map_lib.layout_map_scope(layout_map): - model = tf.keras.Sequential([ - layers.Dense(20, name='d1', input_shape=(10,)), - layers.Dropout(0.1), - layers.Dense(30, name='d2') - ]) - - self.assertLen(model.layers, 3) - d1 = model.layers[0] - d2 = model.layers[2] - - self.assertEqual(d1.kernel.layout, self.layout_2d) - self.assertEqual(d1.bias.layout, self.layout_1d) - self.assertEqual(d2.kernel.layout, self.layout_2d) - self.assertEqual(d2.bias.layout, self.layout_1d) - - # Also make sure we repopulate the cached attributes like - # layer._trainable_weights - self.assertIs(d1.kernel, d1._trainable_weights[0]) - self.assertIs(d1.bias, d1._trainable_weights[1]) - self.assertIs(d2.kernel, d2._trainable_weights[0]) - self.assertIs(d2.bias, d2._trainable_weights[1]) - - inputs = tf.zeros((10, 10)) - inputs = dtensor.copy_to_mesh(inputs, layout=self.layout_2d) - result = model(inputs, training=True) - expected_result = tf.zeros((10, 30)) - expected_result = dtensor.copy_to_mesh( - expected_result, layout=self.layout_2d) - self.assertAllClose(result, expected_result) - - def test_init_model_with_empty_layout_map(self): - # Create empty layout map, which means all the weights just default to - # all replicated. 
- layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) - with layout_map_lib.layout_map_scope(layout_map): - model = tf.keras.Sequential([ - layers.Dense(20, name='d1', input_shape=(10,)), - layers.Dropout(0.1), - layers.Dense(30, name='d2') - ]) - - self.assertLen(model.layers, 3) - d1 = model.layers[0] - d2 = model.layers[2] - - self.assertEqual(d1.kernel.layout, self.layout_2d) - self.assertEqual(d1.bias.layout, self.layout_1d) - self.assertEqual(d2.kernel.layout, self.layout_2d) - self.assertEqual(d2.bias.layout, self.layout_1d) - - def test_weight_regularization(self): - layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) - with layout_map_lib.layout_map_scope(layout_map): - model = tf.keras.Sequential([ - layers.Dense(20, name='d1', input_shape=(10,), - kernel_initializer='ones', - kernel_regularizer='l2'), - layers.Dropout(0.1), - layers.Dense(30, name='d2', kernel_initializer='ones', - kernel_regularizer='l2') - ]) - - self.assertLen(model.losses, 2) - # kernel shape [10, 20] with all "1", timed by 0.01 from l2 - self.assertAllClose(model.losses[0], 2.0) - # kernel shape [20, 30] with all "1", timed by 0.01 from l2 - self.assertAllClose(model.losses[1], 6.0) - - def test_dvariable_name(self): - layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) - with layout_map_lib.layout_map_scope(layout_map): - model = tf.keras.Sequential([ - layers.Dense(20, name='d1', input_shape=(10,)), - layers.Dropout(0.1), - layers.Dense(30, name='d2') - ]) - - self.assertLen(model.layers, 3) - self.assertEqual(model.layers[0].kernel.name, 'd1/kernel:0') - self.assertEqual(model.layers[0].bias.name, 'd1/bias:0') - - -if __name__ == '__main__': - tf.test.main() + def call(self, inputs): + return tf.matmul(inputs, self._weight) + + +class ObjectPathMappingTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + backend.enable_tf_random_generator() + tf_utils.set_random_seed(1337) + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + self.layout_2d = dtensor.Layout.replicated(self.mesh, rank=2) + self.layout_1d = dtensor.Layout.replicated(self.mesh, rank=1) + + self.sharded_2d = dtensor.Layout.batch_sharded(self.mesh, "X", rank=2) + self.sharded_1d = dtensor.Layout.batch_sharded(self.mesh, "X", rank=1) + + def test_init_subclass_model_variable_with_layout(self): + layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) + layout_map["d1.kernel"] = self.layout_2d + layout_map["d1.bias"] = self.layout_1d + layout_map["d2.kernel"] = self.layout_2d + layout_map["d2.bias"] = self.layout_1d + + with layout_map.scope(): + model = SubclassModel(name="model") + + # Init the model with eager tensor, make sure the model weights have + # correct layout, as well as produce correct result. 
+        inputs = tf.zeros((10, 10))
+        inputs = dtensor.copy_to_mesh(inputs, layout=self.layout_2d)
+        result = model(inputs)
+        self.assertAllClose(result, tf.zeros((10, 1000)))
+        d1 = model.d1
+        d2 = model.d2
+        self.assertEqual(d1.kernel.layout, self.layout_2d)
+        self.assertEqual(d1.bias.layout, self.layout_1d)
+        self.assertEqual(d2.kernel.layout, self.layout_2d)
+        self.assertEqual(d2.bias.layout, self.layout_1d)
+
+        # Also make sure we repopulate the cached attributes like
+        # layer._trainable_weights
+        self.assertIs(d1.kernel, d1._trainable_weights[0])
+        self.assertIs(d1.bias, d1._trainable_weights[1])
+        self.assertIs(d2.kernel, d2._trainable_weights[0])
+        self.assertIs(d2.bias, d2._trainable_weights[1])
+
+        result = model(inputs, training=True)
+        self.assertAllClose(
+            result,
+            tf.experimental.dtensor.copy_to_mesh(
+                tf.zeros((10, 1000)), self.layout_2d
+            ),
+        )
+
+    def test_init_functional_model_variable_with_layout(self):
+        # Note that the functional model uses "layer_name.attribute_name" as
+        # the layout map key. Layer names are unique within a functional
+        # model, and when a layer doesn't have a name, Keras will give it a
+        # unique name based on the layer class.
+        layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+        layout_map["d1.kernel"] = self.layout_2d
+        layout_map["d1.bias"] = self.layout_1d
+        layout_map["d2.kernel"] = self.layout_2d
+        layout_map["d2.bias"] = self.layout_1d
+
+        with layout_map.scope():
+            inputs = layers.Input((10,), batch_size=10)
+            x = layers.Dense(20, name="d1")(inputs)
+            x = layers.Dropout(0.1)(x)
+            output = layers.Dense(30, name="d2")(x)
+
+            model = models.Model(inputs, output)
+
+        # It includes the input layer as well.
+        self.assertLen(model.layers, 4)
+        d1 = model.layers[1]
+        d2 = model.layers[3]
+
+        self.assertEqual(d1.kernel.layout, self.layout_2d)
+        self.assertEqual(d1.bias.layout, self.layout_1d)
+        self.assertEqual(d2.kernel.layout, self.layout_2d)
+        self.assertEqual(d2.bias.layout, self.layout_1d)
+
+        # Also make sure we repopulate the cached attributes like
+        # layer._trainable_weights
+        self.assertIs(d1.kernel, d1._trainable_weights[0])
+        self.assertIs(d1.bias, d1._trainable_weights[1])
+        self.assertIs(d2.kernel, d2._trainable_weights[0])
+        self.assertIs(d2.bias, d2._trainable_weights[1])
+
+        inputs = tf.zeros((10, 10))
+        inputs = dtensor.copy_to_mesh(inputs, layout=self.layout_2d)
+        result = model(inputs, training=True)
+        expected_result = tf.zeros((10, 30))
+        expected_result = dtensor.copy_to_mesh(
+            expected_result, layout=self.layout_2d
+        )
+        self.assertAllClose(result, expected_result)
+
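# --------------------------------------------------------------------------
# Editor's aside, not part of the patch: a minimal runnable sketch of the
# key matching that the layout-map tests in this file rely on. ToyLayoutMap
# is a hypothetical stand-in for layout_map_lib.LayoutMap and only shows how
# string keys such as "d1.kernel" are treated as regexes and matched against
# variable paths such as "model/d1/kernel".
import re


class ToyLayoutMap:
    def __init__(self):
        self._layout_map = {}  # insertion-ordered {pattern: layout}

    def __setitem__(self, pattern, layout):
        self._layout_map[pattern] = layout

    def __getitem__(self, path):
        # Exact match first, then fall back to a regex search; a miss means
        # the variable keeps the default (fully replicated) layout.
        if path in self._layout_map:
            return self._layout_map[path]
        for pattern, layout in self._layout_map.items():
            if re.search(pattern, path):
                return layout
        return None


toy_map = ToyLayoutMap()
toy_map["d1.kernel"] = "layout_2d"
toy_map["d1.bias"] = "layout_1d"
assert toy_map["model/d1/kernel"] == "layout_2d"  # "." also matches "/"
assert toy_map["model/d3/kernel"] is None  # unmapped -> stays replicated
# --------------------------------------------------------------------------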
+    def test_init_sequential_model_variable_with_layout(self):
+        # Note that the sequential model also uses
+        # "layer_name.attribute_name" as the layout map key. Layer names are
+        # unique within the model, and when a layer doesn't have a name,
+        # Keras will give it a unique name based on the layer class.
+        layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+        layout_map["d1.kernel"] = self.layout_2d
+        layout_map["d1.bias"] = self.layout_1d
+        layout_map["d2.kernel"] = self.layout_2d
+        layout_map["d2.bias"] = self.layout_1d
+
+        with layout_map.scope():
+            model = models.Sequential(
+                [
+                    layers.Dense(20, name="d1", input_shape=(10,)),
+                    layers.Dropout(0.1),
+                    layers.Dense(30, name="d2"),
+                ]
+            )
+
+        self.assertLen(model.layers, 3)
+        d1 = model.layers[0]
+        d2 = model.layers[2]
+
+        self.assertEqual(d1.kernel.layout, self.layout_2d)
+        self.assertEqual(d1.bias.layout, self.layout_1d)
+        self.assertEqual(d2.kernel.layout, self.layout_2d)
+        self.assertEqual(d2.bias.layout, self.layout_1d)
+
+        # Also make sure we repopulate the cached attributes like
+        # layer._trainable_weights
+        self.assertIs(d1.kernel, d1._trainable_weights[0])
+        self.assertIs(d1.bias, d1._trainable_weights[1])
+        self.assertIs(d2.kernel, d2._trainable_weights[0])
+        self.assertIs(d2.bias, d2._trainable_weights[1])
+
+        inputs = tf.zeros((10, 10))
+        inputs = dtensor.copy_to_mesh(inputs, layout=self.layout_2d)
+        result = model(inputs, training=True)
+        expected_result = tf.zeros((10, 30))
+        expected_result = dtensor.copy_to_mesh(
+            expected_result, layout=self.layout_2d
+        )
+        self.assertAllClose(result, expected_result)
+
+    def test_init_model_with_empty_layout_map(self):
+        # Create an empty layout map, which means all the weights just
+        # default to fully replicated.
+        layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+        with layout_map.scope():
+            model = models.Sequential(
+                [
+                    layers.Dense(20, name="d1", input_shape=(10,)),
+                    layers.Dropout(0.1),
+                    layers.Dense(30, name="d2"),
+                ]
+            )
+
+        self.assertLen(model.layers, 3)
+        d1 = model.layers[0]
+        d2 = model.layers[2]
+
+        self.assertEqual(d1.kernel.layout, self.layout_2d)
+        self.assertEqual(d1.bias.layout, self.layout_1d)
+        self.assertEqual(d2.kernel.layout, self.layout_2d)
+        self.assertEqual(d2.bias.layout, self.layout_1d)
+
+    def test_weight_regularization(self):
+        layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+        with layout_map.scope():
+            model = models.Sequential(
+                [
+                    layers.Dense(
+                        20,
+                        name="d1",
+                        input_shape=(10,),
+                        kernel_initializer="ones",
+                        kernel_regularizer="l2",
+                    ),
+                    layers.Dropout(0.1),
+                    layers.Dense(
+                        30,
+                        name="d2",
+                        kernel_initializer="ones",
+                        kernel_regularizer="l2",
+                    ),
+                ]
+            )
+
+        self.assertLen(model.losses, 2)
+        # kernel shape [10, 20] with all "1", multiplied by 0.01 from l2
+        self.assertAllClose(model.losses[0], 2.0)
+        # kernel shape [20, 30] with all "1", multiplied by 0.01 from l2
+        self.assertAllClose(model.losses[1], 6.0)
+
+    def test_dvariable_name(self):
+        layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+        with layout_map.scope():
+            model = models.Sequential(
+                [
+                    layers.Dense(20, name="d1", input_shape=(10,)),
+                    layers.Dropout(0.1),
+                    layers.Dense(30, name="d2"),
+                ]
+            )
+
+        self.assertLen(model.layers, 3)
+        self.assertEqual(model.layers[0].kernel.name, "d1/kernel:0")
+        self.assertEqual(model.layers[0].bias.name, "d1/bias:0")
+
+    @tf.compat.v1.test.mock.patch.dict(
+        "os.environ", {"DTENSOR_ENABLE_CHECKPOINT_V2": "True"}
+    )
+    def test_checkpoint(self):
+        layout_map = layout_map_lib.LayoutMap(mesh=self.mesh)
+        with layout_map.scope():
+            model = models.Sequential(
+                [
+                    layers.Dense(20, name="d1", input_shape=(10,)),
+                    SubclassLayer(10),
+                ]
+            )
+        cpt = tf.train.Checkpoint(root=model)
+        options = tf.train.CheckpointOptions(
+            experimental_io_device=dtensor.device_name()
+        )
+        tmpdir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree,
tmpdir, ignore_errors=True) + + saved_path = cpt.save( + os.path.join(tmpdir, "checkpoint"), + options=options, + ) + + cpt.restore(saved_path, options=options) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/lazy_variable.py b/keras/dtensor/lazy_variable.py index b69e96dd2bff..1bf9887137e4 100644 --- a/keras/dtensor/lazy_variable.py +++ b/keras/dtensor/lazy_variable.py @@ -16,204 +16,244 @@ import threading - +# isort: off from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.eager import context from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor from tensorflow.python.ops import gen_resource_variable_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope -from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.trackable import base as trackable from tensorflow.python.util import compat from tensorflow.python.util import tf_contextlib - _DISABLE_LAZY_VARIABLE_INIT = threading.local() def _infer_shape_dtype_and_create_handle(initial_value, shape, dtype, name): - """Infer shape and dtype from initial_value and create a variable handle.""" - with ops.name_scope(name, "Variable", skip_on_eager=False) as name: - handle_name = ops.name_from_scope_name(name) - unique_id = "%s_%d" % (handle_name, ops.uid()) - - # Use attr_scope and device(None) to simulate the behavior of - # colocate_with when the variable we want to colocate with doesn't - # yet exist. - device_context_manager = ops.NullContextmanager - attr = attr_value_pb2.AttrValue( - list=attr_value_pb2.AttrValue.ListValue( - s=[compat.as_bytes("loc:@%s" % handle_name)])) - with ops.get_default_graph()._attr_scope({"_class": attr}): # pylint: disable=protected-access - with ops.name_scope("Initializer"), device_context_manager(None): - if not callable(initial_value): - if isinstance(initial_value, trackable.CheckpointInitialValue): - raise NotImplementedError( - "CheckpointInitialValue is not supported to be the initial " - "value of a lazy variable.") - initial_value = ops.convert_to_tensor( - initial_value, name="initial_value", dtype=dtype) - assert not callable(initial_value) - - assert initial_value.shape.is_compatible_with(shape) - dtype = dtype or initial_value.dtype.base_dtype - shape = shape or initial_value.shape - - assert dtype - assert shape - handle = resource_variable_ops._variable_handle_from_shape_and_dtype( # pylint: disable=protected-access - shape=shape, - dtype=dtype, - shared_name=None, # Never shared - name=name, - graph_mode=False, - initial_value=None) - # initial_value=initial_value if not callable(initial_value) else None) - return initial_value, shape, dtype, handle, handle_name, unique_id + """Infer shape and dtype from initial_value and create a variable handle.""" + with ops.name_scope(name, "Variable", skip_on_eager=False) as name: + handle_name = ops.name_from_scope_name(name) + unique_id = "%s_%d" % (handle_name, ops.uid()) + + # Use attr_scope and device(None) to simulate the behavior of + # colocate_with when the variable we want to colocate with doesn't + # yet exist. 
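# --------------------------------------------------------------------------
# Editor's aside, not part of the patch: what this helper is building
# toward, sketched with the same generated ops the module imports. A
# resource handle can exist before any value is assigned to it; reading it
# then fails, which is exactly the "lazy" property LazyInitVariable relies
# on. The exact error class (FailedPrecondition vs. NotFound) can vary by
# TF version, so the sketch catches the common base class.
import tensorflow as tf
from tensorflow.python.ops import gen_resource_variable_ops

handle = gen_resource_variable_ops.var_handle_op(
    dtype=tf.float32, shape=[2, 2], shared_name="editor_demo"
)
try:
    gen_resource_variable_ops.read_variable_op(handle, dtype=tf.float32)
except tf.errors.OpError:
    pass  # reading an unassigned handle raises
gen_resource_variable_ops.assign_variable_op(handle, tf.zeros([2, 2]))
value = gen_resource_variable_ops.read_variable_op(handle, dtype=tf.float32)
# --------------------------------------------------------------------------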
+ device_context_manager = ops.NullContextmanager + attr = attr_value_pb2.AttrValue( + list=attr_value_pb2.AttrValue.ListValue( + s=[compat.as_bytes(f"loc:@{handle_name}")] + ) + ) + with ops.get_default_graph()._attr_scope({"_class": attr}): + with ops.name_scope("Initializer"), device_context_manager(None): + if not callable(initial_value): + if isinstance( + initial_value, trackable.CheckpointInitialValue + ): + raise NotImplementedError( + "CheckpointInitialValue is not supported to be the " + "initial value of a lazy variable." + ) + initial_value = ops.convert_to_tensor( + initial_value, name="initial_value", dtype=dtype + ) + assert not callable(initial_value) + + assert initial_value.shape.is_compatible_with(shape) + dtype = dtype or initial_value.dtype.base_dtype + shape = shape or initial_value.shape + + assert dtype + assert shape + handle = ( + resource_variable_ops._variable_handle_from_shape_and_dtype( + shape=shape, + dtype=dtype, + shared_name=None, # Never shared + name=name, + graph_mode=False, + initial_value=None, + ) + ) + # initial_value=initial_value if not callable(initial_value) else + # None) + return initial_value, shape, dtype, handle, handle_name, unique_id class LazyInitVariable(resource_variable_ops.BaseResourceVariable): - """Lazily initialized variables. - - The major use case for this class is to serve as a memory efficient - alternative for tf.Variable. The resource handle of this class is point to - nothing, which mean it will raise error when its value is fetched in a eager - context. Having said that, it will perform like a normal tf.Variable when - using with graph tensor, like KerasTensor produced from tf.keras.Input. - """ - - def __init__( - self, - initial_value=None, - trainable=None, - collections=None, - validate_shape=True, # pylint: disable=unused-argument - caching_device=None, - name=None, - dtype=None, - variable_def=None, - import_scope=None, - constraint=None, - distribute_strategy=None, - synchronization=None, - aggregation=None, - shape=None, - **kwargs): - assert context.executing_eagerly() # To simplify the logic - assert variable_def is None # Not supported yet. - assert caching_device is None # Not supported yet - - if initial_value is None: - raise ValueError("The `initial_value` arg to `tf.Variable` must " - "be specified except when you are not providing a " - "`variable_def`. You provided neither.") - - if isinstance(initial_value, ops.Tensor) and hasattr( - initial_value, "graph") and initial_value.graph.building_function: - raise ValueError(f"Argument `initial_value` ({initial_value}) could not " - "be lifted out of a `tf.function`. " - f"(Tried to create variable with name='{name}'). " - "To avoid this error, when constructing `tf.Variable`s " - "inside of `tf.function` you can create the " - "`initial_value` tensor in a " - "`tf.init_scope` or pass a callable `initial_value` " - "(e.g., `tf.Variable(lambda : " - "tf.truncated_normal([10, 40]))`). " - "Please file a feature request if this " - "restriction inconveniences you.") - - if constraint is not None and not callable(constraint): - raise ValueError(f"Argument `constraint` must be None or a callable. " - f"a callable. 
Got a {type(constraint)}: {constraint}")
-
-    self._name = name
-    (initial_value, shape, dtype, handle, handle_name,
-     unique_id) = _infer_shape_dtype_and_create_handle(initial_value, shape,
-                                                       dtype, name)
-
-    super().__init__(
-        distribute_strategy=distribute_strategy,
-        initial_value=initial_value,
-        shape=shape,
-        dtype=dtype,
-        name=name,
-        unique_id=unique_id,
-        handle_name=handle_name,
-        constraint=constraint,
-        handle=handle,
-        graph_element=None,
-        trainable=trainable,
-        synchronization=synchronization,
-        aggregation=aggregation,
-        in_graph_mode=False)
-
-  # TODO(scottzhu): This method and create_and_initialize might be removed if
-  # we decide to just use the tf.Variable to replace this class.
-  def initialize(self):
-    with ops.name_scope(self._name, "Variable", skip_on_eager=False) as name:
-      with ops.colocate_with(self._handle), ops.name_scope("Initializer"):
+    """Lazily initialized variables.
+
+    The major use case for this class is to serve as a memory-efficient
+    alternative to tf.Variable. The resource handle of this class points to
+    nothing, which means it will raise an error when its value is fetched in
+    an eager context. Having said that, it will behave like a normal
+    tf.Variable when used with graph tensors, like the KerasTensor produced
+    from tf.keras.Input.
+    """
+
+    def __init__(
+        self,
+        initial_value=None,
+        trainable=None,
+        collections=None,
+        validate_shape=True,
+        caching_device=None,
+        name=None,
+        dtype=None,
+        variable_def=None,
+        import_scope=None,
+        constraint=None,
+        distribute_strategy=None,
+        synchronization=None,
+        aggregation=None,
+        shape=None,
+        **kwargs,
+    ):
+        assert context.executing_eagerly()  # To simplify the logic
+        assert variable_def is None  # Not supported yet.
+        assert caching_device is None  # Not supported yet
+
+        if initial_value is None:
+            raise ValueError(
+                "The `initial_value` arg to `tf.Variable` must "
+                "be specified except when you are not providing a "
+                "`variable_def`. You provided neither."
+            )
+
+        if (
+            isinstance(initial_value, tensor.Tensor)
+            and hasattr(initial_value, "graph")
+            and initial_value.graph.building_function
+        ):
+            raise ValueError(
+                f"Argument `initial_value` ({initial_value}) could not "
+                "be lifted out of a `tf.function`. "
+                f"(Tried to create variable with name='{name}'). "
+                "To avoid this error, when constructing `tf.Variable`s "
+                "inside of `tf.function` you can create the "
+                "`initial_value` tensor in a "
+                "`tf.init_scope` or pass a callable `initial_value` "
+                "(e.g., `tf.Variable(lambda : "
+                "tf.truncated_normal([10, 40]))`). "
+                "Please file a feature request if this "
+                "restriction inconveniences you."
+            )
+
+        if constraint is not None and not callable(constraint):
+            raise ValueError(
+                "Argument `constraint` must be None or a callable. "
+                f"Got a {type(constraint)}: {constraint}"
+            )
+
+        self._name = name
+        (
+            initial_value,
+            shape,
+            dtype,
+            handle,
+            handle_name,
+            unique_id,
+        ) = _infer_shape_dtype_and_create_handle(
+            initial_value, shape, dtype, name
+        )
+
+        super().__init__(
+            distribute_strategy=distribute_strategy,
+            initial_value=initial_value,
+            shape=shape,
+            dtype=dtype,
+            name=name,
+            unique_id=unique_id,
+            handle_name=handle_name,
+            constraint=constraint,
+            handle=handle,
+            graph_element=None,
+            trainable=trainable,
+            synchronization=synchronization,
+            aggregation=aggregation,
+            in_graph_mode=False,
+        )
+
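# --------------------------------------------------------------------------
# Editor's aside, not part of the patch: the creator-scope hook this module
# builds on, shown with the public API. A variable creator intercepts
# tf.Variable construction; _lazy_init_variable_creator at the bottom of
# this file uses the same hook to substitute LazyInitVariable.
import tensorflow as tf


def logging_creator(next_creator, **kwargs):
    # Inspect or rewrite kwargs here, then delegate to the default creator.
    print("creating variable:", kwargs.get("name"))
    return next_creator(**kwargs)


with tf.variable_creator_scope(logging_creator):
    v = tf.Variable(tf.zeros([2, 2]), name="demo")
# --------------------------------------------------------------------------
+    # TODO(scottzhu): This method and create_and_initialize might be removed if
+    # we decide to just use the tf.Variable to replace this class.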
+ def initialize(self): + with ops.name_scope(self._name, "Variable", skip_on_eager=False): + with ops.colocate_with(self._handle), ops.name_scope("Initializer"): + if callable(self._initial_value): + initial_value = self._initial_value() + else: + initial_value = self._initial_value + + if not initial_value.shape.is_compatible_with(self._shape): + raise ValueError( + "In this `tf.Variable` creation, the initial value's " + f"shape ({initial_value.shape}) is not compatible with " + "the explicitly supplied `shape` " + f"argument ({self._shape})." + ) + assert self._dtype is initial_value.dtype.base_dtype + gen_resource_variable_ops.assign_variable_op( + self._handle, initial_value + ) + + def create_and_initialize(self): if callable(self._initial_value): - initial_value = self._initial_value() - else: - initial_value = self._initial_value - - if not initial_value.shape.is_compatible_with(self._shape): - raise ValueError( - f"In this `tf.Variable` creation, the initial value's shape " - f"({initial_value.shape}) is not compatible with " - f"the explicitly supplied `shape` argument ({self._shape}).") - assert self._dtype is initial_value.dtype.base_dtype - gen_resource_variable_ops.assign_variable_op(self._handle, initial_value) - - def create_and_initialize(self): - if callable(self._initial_value): - initial_value = self._initial_value() - - with ops.device(initial_value.device): - (initial_value, shape, dtype, handle, handle_name, - unique_id) = _infer_shape_dtype_and_create_handle( - initial_value, self._shape, self._dtype, self._name) - self.initialize() - - super().__init__( - trainable=self._trainable, - shape=shape, - dtype=dtype, - handle=handle, - synchronization=self._synchronization, - constraint=self._constraint, - aggregation=self._aggregation, - distribute_strategy=self._distribute_strategy, - name=self._name, - unique_id=unique_id, - handle_name=handle_name, - graph_element=None, - initial_value=initial_value, - initializer_op=None, - is_initialized_op=None, - cached_value=None, - caching_device=None) + initial_value = self._initial_value() + + with ops.device(initial_value.device): + ( + initial_value, + shape, + dtype, + handle, + handle_name, + unique_id, + ) = _infer_shape_dtype_and_create_handle( + initial_value, self._shape, self._dtype, self._name + ) + self.initialize() + + super().__init__( + trainable=self._trainable, + shape=shape, + dtype=dtype, + handle=handle, + synchronization=self._synchronization, + constraint=self._constraint, + aggregation=self._aggregation, + distribute_strategy=self._distribute_strategy, + name=self._name, + unique_id=unique_id, + handle_name=handle_name, + graph_element=None, + initial_value=initial_value, + initializer_op=None, + is_initialized_op=None, + cached_value=None, + caching_device=None, + ) def _lazy_init_variable_creator(next_creator, **kwargs): - if getattr(_DISABLE_LAZY_VARIABLE_INIT, "disabled", False): - return next_creator(**kwargs) - else: - return LazyInitVariable(**kwargs) + if getattr(_DISABLE_LAZY_VARIABLE_INIT, "disabled", False): + return next_creator(**kwargs) + else: + return LazyInitVariable(**kwargs) @tf_contextlib.contextmanager def lazy_init_scope(): - with variable_scope.variable_creator_scope(_lazy_init_variable_creator): - yield + with variable_scope.variable_creator_scope(_lazy_init_variable_creator): + yield @tf_contextlib.contextmanager def disable_init_variable_creator(): - try: - global _DISABLE_LAZY_VARIABLE_INIT - existing_value = getattr(_DISABLE_LAZY_VARIABLE_INIT, "disabled", False) - 
_DISABLE_LAZY_VARIABLE_INIT.disabled = True - yield - finally: - _DISABLE_LAZY_VARIABLE_INIT.disabled = existing_value + try: + global _DISABLE_LAZY_VARIABLE_INIT + existing_value = getattr(_DISABLE_LAZY_VARIABLE_INIT, "disabled", False) + _DISABLE_LAZY_VARIABLE_INIT.disabled = True + yield + finally: + _DISABLE_LAZY_VARIABLE_INIT.disabled = existing_value diff --git a/keras/dtensor/metrics_test.py b/keras/dtensor/metrics_test.py index 4be1afcd92db..ddad4077ef95 100644 --- a/keras/dtensor/metrics_test.py +++ b/keras/dtensor/metrics_test.py @@ -14,78 +14,81 @@ # ============================================================================== """Tests for metrics.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import metrics from keras.dtensor import dtensor_api as dtensor from keras.dtensor import test_util from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf class MetricsTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + tf_utils.set_random_seed(1337) - def setUp(self): - super().setUp() - global_ids = test_util.create_device_ids_array((2, 2)) - local_device_ids = np.ravel(global_ids).tolist() - mesh_dict = { - 'CPU': - dtensor.Mesh(['X', 'Y'], global_ids, local_device_ids, - test_util.create_device_list((2, 2), 'CPU')) - } - self.mesh = self.configTestMesh(mesh_dict) - tf_utils.set_random_seed(1337) - - @parameterized.parameters( - (metrics.Accuracy, {}), - (metrics.AUC, {}), - (metrics.BinaryAccuracy, {}), - (metrics.BinaryCrossentropy, {}), - (metrics.BinaryIoU, {}), - (metrics.CategoricalAccuracy, {}), - (metrics.CategoricalCrossentropy, {}), - (metrics.CategoricalHinge, {}), - (metrics.CosineSimilarity, {}), - (metrics.FalseNegatives, {}), - (metrics.FalsePositives, {}), - (metrics.Hinge, {}), - (metrics.IoU, {'num_classes': 3, 'target_class_ids': [1]}), - (metrics.KLDivergence, {}), - (metrics.LogCoshError, {}), - (metrics.Mean, {}), - (metrics.MeanAbsoluteError, {}), - (metrics.MeanAbsolutePercentageError, {}), - (metrics.MeanIoU, {'num_classes': 3}), - (metrics.MeanRelativeError, {'normalizer': [1, 3, 2, 3]}), - (metrics.MeanSquaredError, {}), - (metrics.MeanSquaredLogarithmicError, {}), - (metrics.OneHotIoU, {'num_classes': 3, 'target_class_ids': [1]}), - (metrics.OneHotMeanIoU, {'num_classes': 3}), - (metrics.Poisson, {}), - (metrics.Precision, {}), - (metrics.PrecisionAtRecall, {'recall': 0.5}), - (metrics.Recall, {}), - (metrics.RecallAtPrecision, {'precision': 0.5}), - (metrics.RootMeanSquaredError, {}), - (metrics.SensitivityAtSpecificity, {'specificity': 0.5}), - (metrics.SparseCategoricalAccuracy, {}), - (metrics.SparseCategoricalCrossentropy, {}), - (metrics.SparseTopKCategoricalAccuracy, {}), - (metrics.SpecificityAtSensitivity, {'sensitivity': 0.5}), - (metrics.SquaredHinge, {}), - (metrics.Sum, {}), - (metrics.TopKCategoricalAccuracy, {}), - (metrics.TrueNegatives, {}), - (metrics.TruePositives, {}), - ) - def test_metric_layout(self, metric_cls, init_args): - metric = metric_cls(**init_args, mesh=self.mesh) + @parameterized.parameters( + (metrics.Accuracy, {}), + (metrics.AUC, {}), + (metrics.BinaryAccuracy, {}), + (metrics.BinaryCrossentropy, {}), + 
(metrics.BinaryIoU, {}), + (metrics.CategoricalAccuracy, {}), + (metrics.CategoricalCrossentropy, {}), + (metrics.CategoricalHinge, {}), + (metrics.CosineSimilarity, {}), + (metrics.FalseNegatives, {}), + (metrics.FalsePositives, {}), + (metrics.Hinge, {}), + (metrics.IoU, {"num_classes": 3, "target_class_ids": [1]}), + (metrics.KLDivergence, {}), + (metrics.LogCoshError, {}), + (metrics.Mean, {}), + (metrics.MeanAbsoluteError, {}), + (metrics.MeanAbsolutePercentageError, {}), + (metrics.MeanIoU, {"num_classes": 3}), + (metrics.MeanRelativeError, {"normalizer": [1, 3, 2, 3]}), + (metrics.MeanSquaredError, {}), + (metrics.MeanSquaredLogarithmicError, {}), + (metrics.OneHotIoU, {"num_classes": 3, "target_class_ids": [1]}), + (metrics.OneHotMeanIoU, {"num_classes": 3}), + (metrics.Poisson, {}), + (metrics.Precision, {}), + (metrics.PrecisionAtRecall, {"recall": 0.5}), + (metrics.Recall, {}), + (metrics.RecallAtPrecision, {"precision": 0.5}), + (metrics.RootMeanSquaredError, {}), + (metrics.SensitivityAtSpecificity, {"specificity": 0.5}), + (metrics.SparseCategoricalAccuracy, {}), + (metrics.SparseCategoricalCrossentropy, {}), + (metrics.SparseTopKCategoricalAccuracy, {}), + (metrics.SpecificityAtSensitivity, {"sensitivity": 0.5}), + (metrics.SquaredHinge, {}), + (metrics.Sum, {}), + (metrics.TopKCategoricalAccuracy, {}), + (metrics.TrueNegatives, {}), + (metrics.TruePositives, {}), + ) + def test_metric_layout(self, metric_cls, init_args): + metric = metric_cls(**init_args, mesh=self.mesh) - for weight in metric.non_trainable_weights: - self.assertIsInstance(weight, dtensor.DVariable) - self.assertTrue(weight.layout.is_fully_replicated()) + for weight in metric.non_trainable_weights: + self.assertIsInstance(weight, dtensor.DVariable) + self.assertTrue(weight.layout.is_fully_replicated()) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/mnist_model_test.py b/keras/dtensor/mnist_model_test.py index 800dcdfae76d..13cd15d5a4ae 100644 --- a/keras/dtensor/mnist_model_test.py +++ b/keras/dtensor/mnist_model_test.py @@ -14,70 +14,99 @@ # ============================================================================== """E2E Tests for mnist_model.""" +import numpy as np +import tensorflow.compat.v2 as tf +from tensorflow.compat.v2.experimental import dtensor + from keras import backend -from keras.dtensor import dtensor_api as dtensor from keras.dtensor import integration_test_utils -from keras.dtensor import optimizers as optimizer_lib +from keras.dtensor import layout_map as layout_map_lib from keras.dtensor import test_util +from keras.optimizers import adam from keras.utils import tf_utils -import tensorflow.compat.v2 as tf - - -from tensorflow.dtensor.python import mesh_util -from tensorflow.dtensor.python import tpu_util - - class MnistTest(test_util.DTensorBaseTest): - - def test_mnist_training_cpu(self): - devices = tf.config.list_physical_devices('CPU') - tf.config.set_logical_device_configuration( - devices[0], [tf.config.LogicalDeviceConfiguration(),] * 8) - - mesh = mesh_util.create_mesh( - devices=['CPU:%d' % i for i in range(8)], mesh_dims=[('batch', 8)]) - - backend.enable_tf_random_generator() - # Needed by keras initializers. 
- tf_utils.set_random_seed(1337) - - model = integration_test_utils.get_model_with_layout_map( - integration_test_utils.get_all_replicated_layout_map(mesh)) - - optimizer = optimizer_lib.Adam(learning_rate=0.001, mesh=mesh) - optimizer.build(model.trainable_variables) - - train_losses = integration_test_utils.train_mnist_model_batch_sharded( - model, optimizer, mesh, num_epochs=3, steps_per_epoch=100, - global_batch_size=64) - # Make sure the losses are decreasing - self.assertEqual(train_losses, sorted(train_losses, reverse=True)) - - def DISABLED_test_mnist_training_tpu(self): - # TODO(scottzhu): Enable TPU test once the dtensor_test rule is migrated out - # of learning/brain - tpu_util.dtensor_initialize_tpu_system() - total_tpu_device_count = dtensor.num_global_devices('TPU') - mesh_shape = [total_tpu_device_count] - mesh = tpu_util.create_tpu_mesh(['batch'], mesh_shape, 'tpu_mesh') - - # Needed by keras initializers. - tf_utils.set_random_seed(1337) - - model = integration_test_utils.get_model_with_layout_map( - integration_test_utils.get_all_replicated_layout_map(mesh)) - - optimizer = optimizer_lib.Adam(learning_rate=0.001, mesh=mesh) - optimizer.build(model.trainable_variables) - - train_losses = integration_test_utils.train_mnist_model_batch_sharded( - model, optimizer, mesh, num_epochs=3, steps_per_epoch=100, - global_batch_size=64) - # Make sure the losses are decreasing - self.assertEqual(train_losses, sorted(train_losses, reverse=True)) - - -if __name__ == '__main__': - tf.test.main() + def setUp(self): + super().setUp() + backend.enable_tf_random_generator() + tf_utils.set_random_seed(1337) + global_ids = test_util.create_device_ids_array((2,)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + device: tf.experimental.dtensor.Mesh( + ["batch"], + global_ids, + local_device_ids, + test_util.create_device_list((2,), device), + ) + for device in ("CPU", "GPU", "TPU") + } + self.mesh = self.configTestMesh(mesh_dict) + + def test_mnist_training(self): + layout_map = layout_map_lib.LayoutMap(self.mesh) + with layout_map.scope(): + model = integration_test_utils.get_model() + + optimizer = adam.Adam(learning_rate=0.001, mesh=self.mesh) + optimizer.build(model.trainable_variables) + + train_losses = integration_test_utils.train_mnist_model_batch_sharded( + model, + optimizer, + self.mesh, + num_epochs=3, + steps_per_epoch=20, + global_batch_size=64, + ) + # Make sure the losses are decreasing + self.assertEqual(train_losses, sorted(train_losses, reverse=True)) + + def test_model_fit(self): + layout_map = layout_map_lib.LayoutMap(self.mesh) + with layout_map.scope(): + model = integration_test_utils.get_model() + + optimizer = adam.Adam(learning_rate=0.001, mesh=self.mesh) + + global_batch_size = 64 + model.compile( + loss="CategoricalCrossentropy", optimizer=optimizer, metrics="acc" + ) + train_ds, eval_ds = integration_test_utils.get_mnist_datasets( + integration_test_utils.NUM_CLASS, global_batch_size + ) + + def distribute_ds(dataset): + dataset = dataset.unbatch() + + def _create_batch_layout(tensor_spec): + rank = len(tensor_spec.shape) + 1 + return dtensor.Layout.batch_sharded( + self.mesh, batch_dim="batch", rank=rank + ) + + layouts = tf.nest.map_structure( + _create_batch_layout, dataset.element_spec + ) + + return dtensor.DTensorDataset( + dataset=dataset, + mesh=self.mesh, + layouts=layouts, + global_batch_size=global_batch_size, + dataset_already_batched=False, + batch_dim="batch", + prefetch=None, + tf_data_service_config=None, + ) + + train_ds = 
distribute_ds(train_ds) + eval_ds = distribute_ds(eval_ds) + model.fit(train_ds, steps_per_epoch=10) + model.evaluate(eval_ds, steps=10) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/optimizers.py b/keras/dtensor/optimizers.py deleted file mode 100644 index d94d243dd4d6..000000000000 --- a/keras/dtensor/optimizers.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""DTensor specific Keras optimizers.""" - -from keras.dtensor import dtensor_api as dtensor -from keras.optimizers.optimizer_experimental import adadelta -from keras.optimizers.optimizer_experimental import adagrad -from keras.optimizers.optimizer_experimental import adam -from keras.optimizers.optimizer_experimental import optimizer as optimizer_lib -from keras.optimizers.optimizer_experimental import rmsprop -from keras.optimizers.optimizer_experimental import sgd -from keras.optimizers.schedules import learning_rate_schedule - -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export -from tensorflow.tools.docs import doc_controls - - -# pylint: disable=protected-access,missing-class-docstring -class Optimizer(optimizer_lib._BaseOptimizer): - """DTensor specific optimizers. - - The major changes for this class is that all the variable init logic will be - mesh/layout aware. - - """ - # Note that we didn't subclass optimizer_lib.Optimizer since it contains the - # extra logic of handling distribution strategy, which we don't need for - # DTensor - - def __init__(self, name, mesh=None): - """Create a new Optimizer. - - Args: - name: String. The name of the optimizer, which will appear in all the - state variables created by this optimizer. - mesh: dtensor.Mesh. The optional Mesh which will be used to create - the states. Note that usually the state variable will use the layout - from the corresponding model variables. This mesh only used for global - variables like globle steps, learning rate, etc. - """ - # TODO(scottzhu): Skip the gradients_clip_option and ema_option for now, and - # will cover them in future if really needed. - # TODO(scottzhu): We might want to make mesh to be required in future. - self._mesh = mesh - super().__init__(name=name) - - def _create_iteration_variable(self): - init_val = tf.constant(0, dtype=tf.int64) - if self._mesh: - init_val = dtensor.copy_to_mesh( - init_val, dtensor.Layout.replicated(self._mesh, rank=0)) - with tf.init_scope(): - # Lift the variable creation to init scope to avoid environment issue. - self._iterations = dtensor.DVariable(init_val, name='iteration') - - ################## Override methods from keras.Optimizer ################ - def add_variable_from_reference(self, - model_variable, - variable_name, - initial_value=None): - """Create an optimizer variable from model variable. 
- - Create an optimizer variable based on the information of model variable. - For example, in SGD optimizer momemtum, for each model variable, a - corresponding momemtum variable is created of the same shape and dtype. - - Args: - model_variable: The corresponding model variable to the optimizer variable - to be created. - variable_name: The name prefix of the optimizer variable to be created. - The create variables name will follow the pattern - `{variable_name}/{model_variable.name}`, e.g., `momemtum/dense_1`. - initial_value: The initial value of the optimizer variable, if None, the - value will be default to 0. - - Returns: - An optimizer variable. - """ - if initial_value is None: - # Use tf.zeros_like which will propagate the layout information from the - # model weights if any. - initial_value = tf.zeros_like(model_variable) - elif isinstance(initial_value, tf.Tensor): - initial_value = dtensor.copy_to_mesh( - initial_value, - dtensor.Layout.replicated(self._mesh, rank=initial_value.shape.rank)) - return dtensor.DVariable( - initial_value=initial_value, - name=f'{variable_name}/{model_variable._shared_name}', - dtype=model_variable.dtype, - trainable=False) - - @doc_controls.do_not_generate_docs - def aggregate_gradients(self, grads_and_vars): - # Hide the aggregate_gradients from Optimizer.aggregate_gradients - raise NotImplementedError( - 'Dtensor doesn\'t need to manually aggregate gradients') - - def _var_key(self, variable): - """Get a unique identifier of the given variable.""" - return optimizer_lib._BaseOptimizer._var_key(self, variable) - - def apply_gradients(self, grads_and_vars): - """Apply gradients to variables. - - Args: - grads_and_vars: List of (gradient, variable) pairs. - - Returns: - None - - Raises: - TypeError: If `grads_and_vars` is malformed. - """ - # Explicitly call the _BaseOptimizer to avoid any chance of using - # Optimizers.apply_gradients which contains distribution strategy logic. - optimizer_lib._BaseOptimizer.apply_gradients(self, grads_and_vars) - - def _internal_apply_gradients(self, grads_and_vars): - """Helper function of apply gradients. - - This is required for separating out distributed training logic. - - Args: - grads_and_vars: List of (gradient, variable) pairs. - """ - # Explicitly call the _BaseOptimizer to avoid any chance of using - # Optimizers.apply_gradients which contains distribution strategy logic. - optimizer_lib._BaseOptimizer._internal_apply_gradients(self, grads_and_vars) - - def _overwrite_model_variables_with_average_value_helper(self, var_list): - """Helper function to _overwrite_model_variables_with_average_value.""" - (optimizer_lib._BaseOptimizer. - _overwrite_model_variables_with_average_value_helper(self, var_list)) - - def _build_learning_rate(self, learning_rate): - if isinstance(learning_rate, learning_rate_schedule.LearningRateSchedule): - # Create a variable to hold the current learning rate. - # Note that the init value `learning_rate(self.iterations)` should have - # the correct layout information from self.iterations. 
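# --------------------------------------------------------------------------
# Editor's aside, not part of the removed file: the layout propagation
# described in the docstrings above, sketched with the public DTensor API.
# Assumes a `mesh` built as in the tests in this diff; make_slot and
# make_scalar_state are hypothetical helpers, not Keras APIs.
import tensorflow as tf
from tensorflow.experimental import dtensor


def make_slot(model_variable):
    # tf.zeros_like on a DTensor-backed variable yields a DTensor with the
    # same layout, so optimizer state (e.g. Adam's `m`) is sharded exactly
    # like the weight it mirrors.
    return dtensor.DVariable(tf.zeros_like(model_variable), trainable=False)


def make_scalar_state(value, mesh):
    # Scalar state (iterations, learning rate) is replicated on the mesh.
    init = dtensor.copy_to_mesh(
        tf.constant(value), dtensor.Layout.replicated(mesh, rank=0)
    )
    return dtensor.DVariable(init)
# --------------------------------------------------------------------------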
- self._current_learning_rate = dtensor.DVariable( - learning_rate(self.iterations), - name='learning_rate', - dtype=tf.float32) - return learning_rate - init_val = tf.constant(learning_rate, dtype=tf.float32) - if self._mesh: - init_val = dtensor.copy_to_mesh( - init_val, dtensor.Layout.replicated(self._mesh, rank=0)) - return dtensor.DVariable(init_val, name='learning_rate') - - -@keras_export('keras.dtensor.experimental.optimizers.Adadelta', v1=[]) -class Adadelta(Optimizer, adadelta.Adadelta): - - def __init__(self, - learning_rate=0.001, - rho=0.95, - epsilon=1e-7, - gradients_clip_option=None, - ema_option=None, - name='Adadelta', - mesh=None): - # Skip the adam.Adadelta.__init__ and only call the Optimizer.__init__ - # this is to skip the keras.Optimizer.__init__, which contains the logic - # of distribution strategy. Same for all the optimizers subclasses. - Optimizer.__init__(self, name=name, mesh=mesh) - self._learning_rate = self._build_learning_rate(learning_rate) - self.rho = rho - self.epsilon = epsilon - - -@keras_export('keras.dtensor.experimental.optimizers.Adagrad', v1=[]) -class Adagrad(Optimizer, adagrad.Adagrad): - - def __init__(self, - learning_rate=0.001, - initial_accumulator_value=0.1, - epsilon=1e-7, - gradients_clip_option=None, - ema_option=None, - name='Adagrad', - mesh=None): - Optimizer.__init__(self, name=name, mesh=mesh) - self._learning_rate = self._build_learning_rate(learning_rate) - self.initial_accumulator_value = initial_accumulator_value - self.epsilon = epsilon - - -@keras_export('keras.dtensor.experimental.optimizers.Adam', v1=[]) -class Adam(Optimizer, adam.Adam): - - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - amsgrad=False, - gradients_clip_option=None, - ema_option=None, - name='Adam', - mesh=None): - Optimizer.__init__(self, name=name, mesh=mesh) - self._learning_rate = self._build_learning_rate(learning_rate) - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.amsgrad = amsgrad - - -@keras_export('keras.dtensor.experimental.optimizers.RMSprop', v1=[]) -class RMSprop(Optimizer, rmsprop.RMSprop): - - def __init__(self, - learning_rate=0.001, - rho=0.9, - momentum=0.0, - epsilon=1e-7, - centered=False, - gradients_clip_option=None, - ema_option=None, - jit_compile=False, - name='RMSprop', - mesh=None): - Optimizer.__init__(self, name=name, mesh=mesh) - self._learning_rate = self._build_learning_rate(learning_rate) - self.rho = rho - self.momentum = momentum - self.epsilon = epsilon - self.centered = centered - - -@keras_export('keras.dtensor.experimental.optimizers.SGD', v1=[]) -class SGD(Optimizer, sgd.SGD): - - def __init__(self, - learning_rate=0.01, - momentum=0.0, - nesterov=False, - amsgrad=False, - gradients_clip_option=None, - ema_option=None, - jit_compile=False, - name='SGD', - mesh=None): - Optimizer.__init__(self, name=name, mesh=mesh) - self._learning_rate = self._build_learning_rate(learning_rate) - self.momentum = momentum - self.nesterov = nesterov - if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1): - raise ValueError('`momentum` must be between [0, 1].') - - -Adadelta.__doc__ = Optimizer.__doc__ + adadelta.Adadelta.__doc__ -Adagrad.__doc__ = Optimizer.__doc__ + adagrad.Adagrad.__doc__ -Adam.__doc__ = Optimizer.__doc__ + adam.Adam.__doc__ -RMSprop.__doc__ = Optimizer.__doc__ + rmsprop.RMSprop.__doc__ -SGD.__doc__ = Optimizer.__doc__ + sgd.SGD.__doc__ diff --git a/keras/dtensor/optimizers_test.py b/keras/dtensor/optimizers_test.py 
index bfaf076225d2..356d2d2965e7 100644 --- a/keras/dtensor/optimizers_test.py +++ b/keras/dtensor/optimizers_test.py @@ -14,92 +14,240 @@ # ============================================================================== """Tests for initializers.""" +import os + +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + +from keras import backend +from keras import layers +from keras import losses +from keras import models from keras.dtensor import dtensor_api as dtensor -from keras.dtensor import optimizers +from keras.dtensor import layout_map from keras.dtensor import test_util -import numpy as np -import tensorflow.compat.v2 as tf +from keras.optimizers import adadelta +from keras.optimizers import adagrad +from keras.optimizers import adam +from keras.optimizers import adamw +from keras.optimizers import rmsprop +from keras.optimizers import sgd class OptimizersTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + + def test_add_variable_from_reference(self): + optimizer = adam.Adam(mesh=self.mesh) + variable_init_value = tf.ones([4, 4], dtype=tf.float32) + variable_init_value = dtensor.copy_to_mesh( + variable_init_value, + layout=dtensor.Layout.replicated(self.mesh, rank=2), + ) + model_variable = dtensor.DVariable( + variable_init_value, trainable=True, name="tmp" + ) + state_variable = optimizer.add_variable_from_reference( + model_variable, "test" + ) + self.assertEqual(state_variable._shared_name, "test/tmp") + self.assertAllClose(self.evaluate(state_variable), tf.zeros([4, 4])) + # Make sure the variable contains the correct layout info + self.assertEqual(state_variable.layout, model_variable.layout) + + def test_build_index_dict(self): + optimizer = adam.Adam(mesh=self.mesh) + variable_init_value = tf.ones(shape=(), dtype=tf.float32) + variable_init_value = dtensor.copy_to_mesh( + variable_init_value, + layout=dtensor.Layout.replicated(self.mesh, rank=0), + ) + var_list = [ + dtensor.DVariable(variable_init_value, name=f"var{i}") + for i in range(10) + ] + optimizer._build_index_dict(var_list) + self.assertEqual( + optimizer._index_dict[optimizer._var_key(var_list[7])], 7 + ) + + def test_aggregate_gradients_noop(self): + optimizer = adam.Adam(mesh=self.mesh) + + variable_init_value = tf.ones(shape=(), dtype=tf.float32) + model_variable = dtensor.DVariable( + variable_init_value, + trainable=True, + layout=dtensor.Layout.replicated(self.mesh, rank=0), + ) + grads = tf.ones_like(variable_init_value) + + grad_and_var = zip([grads], [model_variable]) + + result = optimizer.aggregate_gradients(grad_and_var) + self.assertEqual(result, grad_and_var) + + @parameterized.named_parameters( + ( + "Adadelta", + adadelta.Adadelta, + {}, + [ + "Adadelta/accumulated_grad/Variable", + "Adadelta/accumulated_delta_var/Variable", + "iteration", + ], + ), + ( + "Adam", + adam.Adam, + {"amsgrad": True}, + [ + "Adam/m/Variable", + "Adam/v/Variable", + "Adam/vhat/Variable", + "iteration", + ], + ), + ( + "AdamW", + adamw.AdamW, + {"amsgrad": True}, + [ + "AdamW/m/Variable", + "AdamW/v/Variable", + "AdamW/vhat/Variable", + "iteration", + ], + ), + ( + "Adagrad", + adagrad.Adagrad, + {}, + ["Adagrad/accumulator/Variable", "iteration"], + ), + ( + 
"RMSprop", + rmsprop.RMSprop, + {"momentum": 0.1, "centered": True}, + [ + "RMSprop/velocity/Variable", + "RMSprop/momentum/Variable", + "RMSprop/average_gradient/Variable", + "iteration", + ], + ), + ( + "SGD", + sgd.SGD, + {"momentum": 0.1}, + ["SGD/m/Variable", "iteration"], + ), + ) + def test_apply_gradients( + self, optimizer_cls, init_args, expect_variable_names + ): + optimizer = optimizer_cls(mesh=self.mesh, **init_args) + + self.assertEqual(self.evaluate(optimizer.iterations), 0) + self.assertEqual( + optimizer.iterations.layout, + dtensor.Layout.replicated(self.mesh, rank=0), + ) + + variable_init_value = tf.ones([4, 4], dtype=tf.float32) + variable_init_value = dtensor.copy_to_mesh( + variable_init_value, + layout=dtensor.Layout.replicated(self.mesh, rank=2), + ) + model_variable = dtensor.DVariable(variable_init_value, trainable=True) + + grads = tf.ones_like(variable_init_value) + optimizer.apply_gradients(zip([grads], [model_variable])) + optimizer_variables = optimizer.variables + + self.assertEqual(self.evaluate(optimizer.iterations), 1) + + all_names = [var._shared_name for var in optimizer_variables] + self.assertCountEqual(all_names, expect_variable_names) + + def test_embedding_lookup_backward_path(self): + # See b/265441685 for more context. + backend.enable_tf_random_generator() + os.environ[ + "DTENSOR_ENABLE_REPLICATED_SPMD_AS_DEFAULT_TF.RESOURCESCATTERADD" + ] = "1" + # Build a small functional model with embedding layer, it contains + # tf.gather ops which will trigger the _deduplicate_sparse_grad() code + # path. tf.unique op will have a shape mismatch issue for dtensor. + batch_size = 16 + seq_length = 10 + vocab_size = 100 + output_size = 8 + + def produce_data(): + inputs = tf.random.uniform( + maxval=vocab_size, + shape=(batch_size, seq_length), + dtype=tf.int32, + ) + label = tf.random.uniform( + maxval=output_size, shape=(batch_size,), dtype=tf.int32 + ) + inputs = dtensor.copy_to_mesh( + inputs, layout=dtensor.Layout.replicated(self.mesh, rank=2) + ) + inputs = dtensor.relayout( + inputs, dtensor.Layout.batch_sharded(self.mesh, "X", 2) + ) + label = dtensor.copy_to_mesh( + label, layout=dtensor.Layout.replicated(self.mesh, rank=1) + ) + label = dtensor.relayout( + label, dtensor.Layout.batch_sharded(self.mesh, "X", 1) + ) + return inputs, label + + with layout_map.LayoutMap(self.mesh).scope(): + inputs = layers.Input(shape=(seq_length,)) + x = layers.Embedding(vocab_size, 64)(inputs) + x = layers.GlobalAveragePooling1D()(x) + preds = layers.Dense(output_size, activation="softmax")(x) + model = models.Model(inputs, preds) + + optimizer = adam.Adam(mesh=self.mesh) + + @tf.function + def train_func(model, inputs, label, optimizer): + with tf.GradientTape() as tape: + output = model(inputs) + loss = losses.sparse_categorical_crossentropy(label, output) + optimizer.minimize(loss, model.variables, tape) + return loss + + # The error only happens across the batch, where the value of + # tf.unique are different. + input1, label1 = produce_data() + train_func(model, input1, label1, optimizer) + input2, label2 = produce_data() + train_func(model, input2, label2, optimizer) + # Assert nothing here, and expect the train_func can run properly with + # different inputs. 
+ - def setUp(self): - super().setUp() - global_ids = test_util.create_device_ids_array((2, 2)) - local_device_ids = np.ravel(global_ids).tolist() - mesh_dict = { - 'CPU': - dtensor.Mesh(['X', 'Y'], global_ids, - local_device_ids, - test_util.create_device_list((2, 2), 'CPU')) - } - self.mesh = self.configTestMesh(mesh_dict) - - def test_add_variable_from_reference(self): - optimizer = optimizers.Adam(mesh=self.mesh) - variable_init_value = tf.ones([4, 4], dtype=tf.float32) - variable_init_value = dtensor.copy_to_mesh( - variable_init_value, - layout=dtensor.Layout.replicated(self.mesh, rank=2)) - model_variable = dtensor.DVariable(variable_init_value, - trainable=True, - name='tmp') - state_variable = optimizer.add_variable_from_reference( - model_variable, 'test') - self.assertEqual(state_variable._shared_name, 'test/tmp') - self.assertAllClose(self.evaluate(state_variable), tf.zeros([4, 4])) - # Make sure the variable contains the correct layout info - self.assertEqual(state_variable.layout, model_variable.layout) - - def test_build_index_dict(self): - optimizer = optimizers.Adam(mesh=self.mesh) - variable_init_value = tf.ones(shape=(), dtype=tf.float32) - variable_init_value = dtensor.copy_to_mesh( - variable_init_value, - layout=dtensor.Layout.replicated(self.mesh, rank=0)) - var_list = [dtensor.DVariable(variable_init_value, name=f'var{i}') - for i in range(10)] - optimizer._build_index_dict(var_list) - self.assertEqual(optimizer._index_dict[optimizer._var_key(var_list[7])], 7) - - @parameterized.named_parameters( - ('Adadelta', optimizers.Adadelta, {}, - ['Adadelta/accumulated_grad/Variable', - 'Adadelta/accumulated_delta_var/Variable']), - ('Adam', optimizers.Adam, {'amsgrad': True}, - ['Adam/m/Variable', 'Adam/v/Variable', 'Adam/vhat/Variable']), - ('Adagrad', optimizers.Adagrad, {}, ['Adagrad/accumulator/Variable']), - ('RMSprop', optimizers.RMSprop, {'momentum': 0.1, 'centered': True}, - ['RMSprop/velocity/Variable', 'RMSprop/momentum/Variable', - 'RMSprop/average_gradient/Variable']), - ('SGD', optimizers.SGD, {'momentum': 0.1}, ['SGD/m/Variable']) - ) - def test_apply_gradients(self, optimizer_cls, init_args, - expect_variable_names): - optimizer = optimizer_cls(mesh=self.mesh, **init_args) - - self.assertEqual(self.evaluate(optimizer.iterations), 0) - self.assertEqual(optimizer.iterations.layout, - dtensor.Layout.replicated(self.mesh, rank=0)) - - variable_init_value = tf.ones([4, 4], dtype=tf.float32) - variable_init_value = dtensor.copy_to_mesh( - variable_init_value, - layout=dtensor.Layout.replicated(self.mesh, rank=2)) - model_variable = dtensor.DVariable(variable_init_value, - trainable=True) - - grads = tf.ones_like(variable_init_value) - optimizer.apply_gradients(zip([grads], [model_variable])) - optimizer_variables = optimizer.variables - - self.assertEqual(self.evaluate(optimizer.iterations), 1) - - all_names = [var._shared_name for var in optimizer_variables] - expect_variable_names.extend(['iteration', 'learning_rate']) - self.assertCountEqual(all_names, expect_variable_names) - - -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/save_load_test.py b/keras/dtensor/save_load_test.py new file mode 100644 index 000000000000..e188c9ee4761 --- /dev/null +++ b/keras/dtensor/save_load_test.py @@ -0,0 +1,116 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for keras model save/load.""" + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras import backend +from keras import layers +from keras import models +from keras.dtensor import dtensor_api as dtensor +from keras.dtensor import layout_map as layout_map_lib +from keras.dtensor import test_util +from keras.utils import tf_utils + + +def _create_test_model(): + model = models.Sequential() + model.add( + layers.Conv2D( + 32, + name="conv2d_1", + kernel_size=(3, 3), + activation="relu", + input_shape=(28, 28, 1), # channel last gray scale input + ) + ) + model.add( + layers.Conv2D( + 64, + name="conv2d_2", + kernel_size=(3, 3), + activation="relu", + ) + ) + return model + + +class SaveLoadTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + backend.enable_tf_random_generator() + tf_utils.set_random_seed(1337) + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + + def test_save_h5_weights_for_dtensor_model(self): + layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) + with layout_map_lib.layout_map_scope(layout_map): + dtensor_model = _create_test_model() + + self.assertNotEmpty(dtensor_model.weights) + for w in dtensor_model.weights: + # Make sure the weights are DVariable + self.assertIsNotNone(w.layout) + + save_file = self.create_tempfile("dtensor_model.h5") + dtensor_model.save_weights(save_file) + + # Make sure the weights can be load back to a normal keras model. + normal_model = _create_test_model() + normal_model.load_weights(save_file) + + for ( + w1, + w2, + ) in zip(normal_model.weights, dtensor_model.weights): + self.assertAllClose(w1.numpy(), w2.numpy()) + self.assertIsNone(getattr(w1, "layout", None)) + + def test_load_h5_weights_for_dtensor_model(self): + normal_model = _create_test_model() + + save_file = self.create_tempfile("normal_model.h5") + normal_model.save_weights(save_file) + + layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) + with layout_map_lib.layout_map_scope(layout_map): + dtensor_model = _create_test_model() + + self.assertNotEmpty(dtensor_model.weights) + for w in dtensor_model.weights: + self.assertIsNotNone(w.layout) + + dtensor_model.load_weights(save_file) + + for ( + w1, + w2, + ) in zip(normal_model.weights, dtensor_model.weights): + self.assertAllClose(w1.numpy(), w2.numpy()) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/strategy_integration_test.py b/keras/dtensor/strategy_integration_test.py new file mode 100644 index 000000000000..0f5d660b4cd2 --- /dev/null +++ b/keras/dtensor/strategy_integration_test.py @@ -0,0 +1,118 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for DTensor based strategy training.""" + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras import backend +from keras import mixed_precision +from keras.dtensor import integration_test_utils +from keras.optimizers import adam +from keras.utils import tf_utils + +# isort: off +# Import the MirroredStrategy that is backed by DTensor +# It is not a public API yet, so we do a private symbol import for now. +from tensorflow.python.distribute.experimental import ( + mirrored_strategy as dtensor_mirrored_strategy, +) +from tensorflow.dtensor.python.tests import test_util + + +class TrainingTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + backend.enable_tf_random_generator() + tf_utils.set_random_seed(1337) + global_ids = test_util.create_device_ids_array((2,)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + device: tf.experimental.dtensor.Mesh( + ["batch"], + global_ids, + local_device_ids, + test_util.create_device_list((2,), device), + ) + for device in ("CPU", "GPU", "TPU") + } + self.mesh = self.configTestMesh(mesh_dict) + + def tearDown(self): + super().tearDown() + # clean up the mixed precision setting if any. + mixed_precision.set_global_policy("float32") + + @parameterized.product( + run_eagerly=[True, False], + jit_compile=[True, False], + optimizer_creator=[lambda: adam.Adam(), lambda: "adam"], + enable_mixed_precision=[True, False], + ) + def test_model_fit( + self, + run_eagerly, + jit_compile, + optimizer_creator, + enable_mixed_precision, + ): + if run_eagerly and jit_compile: + self.skipTest("run_eagerly can't run with jit_compile") + if enable_mixed_precision and self.mesh.device_type() != "GPU": + self.skipTest("Only run mixed_precision on GPU for performance") + + if enable_mixed_precision: + mixed_precision.set_global_policy("mixed_float16") + dtensor_strategy = dtensor_mirrored_strategy.MirroredStrategy( + mesh=self.mesh + ) + # Make fake MNIST-like image data. 
+ batch_size = 64 + dataset = tf.data.Dataset.from_tensor_slices( + ( + np.random.uniform(size=(batch_size, 28, 28, 1)).astype( + np.float32 + ), + np.random.randint(0, 10, size=(batch_size,)), + ) + ) + dataset = dataset.shuffle(64).repeat().batch(64, drop_remainder=True) + + with dtensor_strategy.scope(): + model = integration_test_utils.get_model() + optimizer = optimizer_creator() + + model.compile( + loss="SparseCategoricalCrossentropy", + optimizer=optimizer, + metrics="acc", + run_eagerly=run_eagerly, + jit_compile=jit_compile, + ) + model.fit(dataset, steps_per_epoch=10) + + prediction = model.predict( + np.random.uniform(size=(batch_size, 28, 28, 1)).astype(np.float32) + ) + self.assertEqual(prediction.shape, (batch_size, 10)) + if enable_mixed_precision: + self.assertEqual(prediction.dtype, tf.float16) + else: + self.assertEqual(prediction.dtype, tf.float32) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/dtensor/test_util.py b/keras/dtensor/test_util.py index 7d2019df670b..44e2b7f709ce 100644 --- a/keras/dtensor/test_util.py +++ b/keras/dtensor/test_util.py @@ -14,120 +14,137 @@ # ============================================================================== """Keras utilities for DTensor unit test.""" -from absl.testing import parameterized import numpy as np - import tensorflow.compat.v2 as tf +from absl.testing import parameterized - +# isort: off from tensorflow.dtensor.python import api as dtensor_api from tensorflow.python.eager import context - _DEFAULT_GPU_MEMORY_LIMIT = 200 # MB class DTensorBaseTest(tf.test.TestCase, parameterized.TestCase): - """Provides comparison helper for dtensor vs local results.""" - - @classmethod - def setUpClass(cls): - super(DTensorBaseTest, cls).setUpClass() - - def tearDown(self): - super().tearDown() - # Make sure all async ops finish. - context.async_wait() - - # TODO(hthu): Remove the reset once we fixed the CopyToMesh with - # DefaultMesh placement issue. - reset_dtensor() - - @staticmethod - def configTestMesh(device_type_mesh_map): # pylint: disable=invalid-name - """Configs corresponding mesh given test context. - - If runs on a CPU mesh, set virtual device on CPU. - If runs on a GPU mesh, sets virtual device on GPU with proper memory limits. - if runs on a TPU mesh, initializes TPU system. - - Args: - device_type_mesh_map: A dictionary containing device_type -> mesh mapping. - - Returns: - A properly configured mesh for use in test. - """ - reset_context() - - def get_mesh(device_type): - mesh = device_type_mesh_map.get(device_type, None) - if mesh is None: - raise ValueError('Requires a %s mesh to run test on %s.' % - (device_type, device_type)) - return mesh - - mesh = None - if tf.config.list_physical_devices('GPU'): - mesh = get_mesh('GPU') - reset_logical_devices('GPU', np.prod(mesh.shape())) - else: - mesh = get_mesh('CPU') - reset_logical_devices('CPU', np.prod(mesh.shape())) - - context.ensure_initialized() - return mesh + """Provides comparison helper for dtensor vs local results.""" + + @classmethod + def setUpClass(cls): + super(DTensorBaseTest, cls).setUpClass() + + def tearDown(self): + super().tearDown() + # Make sure all async ops finish. + context.async_wait() + + # TODO(hthu): Remove the reset once we fixed the CopyToMesh with + # DefaultMesh placement issue. + reset_dtensor() + + @staticmethod + def configTestMesh(device_type_mesh_map): + """Configs corresponding mesh given test context. + + If runs on a CPU mesh, set virtual device on CPU. 
+ If runs on a GPU mesh, sets virtual device on GPU with proper memory + limits. + If runs on a TPU mesh, initializes TPU system. + + Args: + device_type_mesh_map: A dictionary containing device_type -> mesh + mapping. + + Returns: + A properly configured mesh for use in test. + """ + reset_context() + + def get_mesh(device_type): + mesh = device_type_mesh_map.get(device_type, None) + if mesh is None: + dt = device_type + raise ValueError(f"Requires a {dt} mesh to run test on {dt}.") + return mesh + + mesh = None + if tf.config.list_physical_devices("GPU"): + mesh = get_mesh("GPU") + reset_logical_devices("GPU", np.prod(mesh.shape())) + else: + mesh = get_mesh("CPU") + reset_logical_devices("CPU", np.prod(mesh.shape())) + + context.ensure_initialized() + return mesh def create_device_array(shape, device_type): - device_count = np.prod(shape) - return np.asarray([ - tf.DeviceSpec( # pylint: disable=g-complex-comprehension - job='localhost/replica:0/task:0', - device_type=device_type, - device_index=i) for i in range(device_count) - ]).reshape(shape) + device_count = np.prod(shape) + return np.asarray( + [ + tf.DeviceSpec( + job="localhost/replica:0/task:0", + device_type=device_type, + device_index=i, + ) + for i in range(device_count) + ] + ).reshape(shape) def create_device_list(shape, device_type): - devices = create_device_array(shape, device_type) - return np.ravel(devices).tolist() + devices = create_device_array(shape, device_type) + return np.ravel(devices).tolist() def create_device_ids_array(shape): - device_count = np.prod(shape) - return np.arange(device_count).reshape(shape) + device_count = np.prod(shape) + return np.arange(device_count).reshape(shape) def reset_context(): - context._reset_context() # pylint: disable=protected-access + context._reset_context() def reset_logical_devices(device_type, count): - """Resets logical devices for CPU/GPU. - - Logical devices can only be instantiated once on a particular context. For - now, context re-use is triggering some function duplication errors, so we - reset the context on each call. - - Args: - device_type: The device_type to reset. - count: numbers of virtual device to reset to. - """ - reset_context() - devices = tf.config.list_physical_devices(device_type) - if device_type.upper() == 'CPU': - tf.config.set_logical_device_configuration(devices[0], [ - tf.config.LogicalDeviceConfiguration(), - ] * count) - elif device_type.upper() == 'GPU': - tf.config.set_logical_device_configuration(devices[0], [ - tf.config.LogicalDeviceConfiguration( - memory_limit=_DEFAULT_GPU_MEMORY_LIMIT), - ] * count) - else: - raise ValueError('resetting logical device for non-supported device type : ' - '%s' % device_type) + """Resets logical devices for CPU/GPU. + + Logical devices can only be instantiated once on a particular context. For + now, context re-use is triggering some function duplication errors, so we + reset the context on each call. + + Args: + device_type: The device_type to reset. + count: number of virtual devices to reset to.
+ """ + if device_type.upper() not in ["CPU", "GPU"]: + raise ValueError( + "resetting logical device for non-supported device type: " + f"{device_type}" + ) + reset_context() + + cpus = tf.config.list_physical_devices("CPU") + if device_type.upper() == "GPU": + gpus = tf.config.list_physical_devices(device_type) + tf.config.set_logical_device_configuration( + gpus[0], + [ + tf.config.LogicalDeviceConfiguration( + memory_limit=_DEFAULT_GPU_MEMORY_LIMIT + ), + ] + * count, + ) + # Always config CPU mesh as the host mesh for DTensor + tf.config.set_logical_device_configuration( + cpus[0], + [ + tf.config.LogicalDeviceConfiguration(), + ] + * count, + ) def reset_dtensor(): - dtensor_api._reset() # pylint: disable=protected-access + dtensor_api._reset() diff --git a/keras/dtensor/utils.py b/keras/dtensor/utils.py index 378560af8cec..234ffe13cbf6 100644 --- a/keras/dtensor/utils.py +++ b/keras/dtensor/utils.py @@ -16,9 +16,9 @@ import inspect -from keras.dtensor import dtensor_api as dtensor import tensorflow.compat.v2 as tf +from keras.dtensor import dtensor_api as dtensor # All the variable names in the default keras layers. We will use those to map # against the args in the __init__ method to find corresponding layout args. @@ -39,125 +39,148 @@ def allow_initializer_layout(init_method): - """A decorator for injecting layout information to layer.__init__. - - Layout will be a new param for any of the weights for all the keras layers. - Adding the param to all the __init__ method will be a big/duplicated work. - - This decorator is design to reduce and code duplication and make it easy to - add/remove the dtensor feature if needed. - - Sample usage: - ```python - class Dense(tf.keras.layer.Layer): - - @allow_initializer_layout - def __init__(self, units, - kernel_initializer='zeros', - bias_initializer='zeros', - **kwargs): - super().__init__(**kwargs) - - d = Dense(units=8, kernel_layout=layout1, bias_layout=layout2) - d.kernel_layout == layout1 - d.bias_layout == layout2 - ``` - - By adding this annotation, it will: - - 1. Filter out the kwargs based on some keywords, eg if the 'kernel_initialzer' - appears in method signature, then it will try to pop the 'kernel_layout' if - it presents. Same for "bias" and "recurrent_kernel", etc. This will make - sure the layout related param is not passed to `BaseLayer.__init__`, which - will raise error about unexpect keyword args. - 2. Set the self.kernel/bias_layout attribute after the `__init__` method is - called. Keras framework will use those fields to create weights down the - stream. - - Args: - init_method: the `__init__` method of the Keras layer to annotate. - - Returns: - the annotated __init__ method. - """ - - def _wrap_function(layer_instance, *args, **kwargs): - signature = inspect.signature(init_method) - layout_args = {} - # Check args like 'kernel_initializer' and pop the 'kernel_layout' if it - # presents. 
- for variable_name in KERAS_VARIABLE_NAMES: - if variable_name + "_initializer" in signature.parameters: - layout = kwargs.pop(variable_name + "_layout", None) - if layout: - layout_args[variable_name + "_layout"] = layout - - init_method(layer_instance, *args, **kwargs) - - # Inject the layout parameter after the invocation of __init__() - for layout_param_name, layout in layout_args.items(): - setattr(layer_instance, layout_param_name, layout) - - # return decorated - return tf.__internal__.decorator.make_decorator( - target=init_method, decorator_func=_wrap_function) + """A decorator for injecting layout information to layer.__init__. + + Layout will be a new param for any of the weights for all the keras layers. + Adding the param to every __init__ method would be a lot of duplicated work. + + This decorator is designed to reduce code duplication and make it easy to + add/remove the dtensor feature if needed. + + Sample usage: + ```python + class Dense(tf.keras.layers.Layer): + + @allow_initializer_layout + def __init__(self, units, + kernel_initializer='zeros', + bias_initializer='zeros', + **kwargs): + super().__init__(**kwargs) + + d = Dense(units=8, kernel_layout=layout1, bias_layout=layout2) + d.kernel_layout == layout1 + d.bias_layout == layout2 + ``` + + By adding this annotation, it will: + + 1. Filter out the kwargs based on some keywords, e.g. if + 'kernel_initializer' appears in the method signature, then it will try to + pop the 'kernel_layout' if it is present. Same for "bias" and + "recurrent_kernel", etc. This will make sure the layout-related param is + not passed to `BaseLayer.__init__`, which would raise an error about + unexpected keyword args. + 2. Set the self.kernel/bias_layout attribute after the `__init__` method is + called. The Keras framework will use those fields to create weights + downstream. + + Args: + init_method: the `__init__` method of the Keras layer to annotate. + + Returns: + the annotated __init__ method. + """ + + def _wrap_function(layer_instance, *args, **kwargs): + signature = inspect.signature(init_method) + layout_args = {} + # Check args like 'kernel_initializer' and pop the 'kernel_layout' if + # it is present. + for variable_name in KERAS_VARIABLE_NAMES: + if variable_name + "_initializer" in signature.parameters: + layout = kwargs.pop(variable_name + "_layout", None) + if layout: + layout_args[variable_name + "_layout"] = layout + + init_method(layer_instance, *args, **kwargs) + + # Inject the layout parameter after the invocation of __init__() + for layout_param_name, layout in layout_args.items(): + setattr(layer_instance, layout_param_name, layout) + + # return decorated + return tf.__internal__.decorator.make_decorator( + target=init_method, decorator_func=_wrap_function + ) def inject_mesh(init_method): - """Inject DTensor mesh information to an object. + """Inject DTensor mesh information to an object. + + This is useful for Keras objects like `Metric` and `Optimizer` which need + a DTensor mesh to create the weights, but don't want to change the current + public API interface. - This is useful for keras object like `Metric` and `Optimizer` which need - DTensor mesh to create the weights, but doesn't want to change the current - public API interface. + This is for temporary usage and eventually the mesh/layout information will + be public arguments in the `__init__` method.
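Both `allow_initializer_layout` above and `inject_mesh` below follow the same wrap-and-pop shape. A condensed, self-contained sketch of that pattern (the `pop_and_inject` helper and `MyDense` class here are hypothetical; the real decorators additionally inspect the wrapped signature, go through `tf.__internal__.decorator.make_decorator`, and `inject_mesh` sets `_mesh` *before* calling `__init__`):

```python
import functools


def pop_and_inject(*keys):
    """Sketch only: pop special kwargs, run __init__, then attach them."""

    def decorator(init_method):
        @functools.wraps(init_method)
        def _wrap_function(instance, *args, **kwargs):
            # Pop the special kwargs so they never reach the wrapped
            # __init__, which would reject them as unexpected arguments.
            popped = {k: kwargs.pop(k) for k in keys if k in kwargs}
            init_method(instance, *args, **kwargs)
            # Attach the popped values as attributes after __init__ runs.
            for key, value in popped.items():
                setattr(instance, key, value)

        return _wrap_function

    return decorator


class MyDense:
    @pop_and_inject("kernel_layout", "bias_layout")
    def __init__(self, units):
        self.units = units


d = MyDense(8, kernel_layout="layout1")
assert d.kernel_layout == "layout1"
```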
- This is for temporary usage and eventually the mesh/layout information will be - public arguments in the `__init__` method + Sample usage: + ```python + class Accuracy(tf.keras.metrics.Metric): - Sample usage: - ```python - class Accuracy(tf.keras.metrics.Metric): + @inject_mesh + def __init__(self, name='accuracy', dtype=None): + super().__init__(name=name, dtype=dtype) - @inject_mesh - def __init__(self, name='accuracy', dtype=None): - super().__init__(**kwargs) + acc = Accuracy(mesh=mesh) + assert acc._mesh == mesh + ``` - acc = Accuracy(mesh=mesh) - assert acc._mesh == mesh - ``` + Args: + init_method: the `__init__` method of the Keras class to annotate. - Args: - init_method: the `__init__` method of the Keras class to annotate. + Returns: + the annotated __init__ method. + """ - Returns: - the annotated __init__ method. - """ - def _wrap_function(instance, *args, **kwargs): - mesh = kwargs.pop("mesh", None) - # Note that the injection of _mesh need to happen before the invocation of - # __init__, since the class might need the mesh to create weights in the - # __init__. - if mesh is not None: - instance._mesh = mesh # pylint: disable=protected-access - init_method(instance, *args, **kwargs) + def _wrap_function(instance, *args, **kwargs): + mesh = kwargs.pop("mesh", None) + # Note that the injection of _mesh needs to happen before the invocation + # of __init__, since the class might need the mesh to create weights in + # the __init__. + if mesh is not None: + instance._mesh = mesh + init_method(instance, *args, **kwargs) - return tf.__internal__.decorator.make_decorator( - target=init_method, decorator_func=_wrap_function) + return tf.__internal__.decorator.make_decorator( + target=init_method, decorator_func=_wrap_function + ) def call_with_layout(fn, layout, *args, **kwargs): - """Invoke the function with inputs and relayout the result. - - Args: - fn: the function to invoke. - layout: if not None, the output of the fn will be relayout with this. - *args: positional arguments to be called with fn. - **kwargs: keyword arguments to be called with fn. - - Returns: - The output of fn, with potential relayout with the layout specified. - """ - if layout: - with dtensor.run_on(layout.mesh): - result = fn(*args, **kwargs) - return dtensor.relayout(result, layout) - return fn(*args, **kwargs) + """Invoke the function with inputs and relayout the result. + + Args: + fn: the function to invoke. + layout: if not None, the output of fn will be relaid out with this. + *args: positional arguments to be called with fn. + **kwargs: keyword arguments to be called with fn. + + Returns: + The output of fn, potentially relaid out with the layout specified. + """ + if layout: + with dtensor.default_mesh(layout.mesh): + result = fn(*args, **kwargs) + return dtensor.relayout(result, layout) + return fn(*args, **kwargs) + + +def running_with_dtensor_strategy(): + """Check whether running with a `Strategy` that is backed by DTensor. + + In DTensor-based training, all the tensors are in the global context, which + is different from the local context. Some Keras components need to + behave differently, e.g. BatchNormalization and SyncBatchNormalization, as + well as optimizers. + + This check will help those layers branch the logic and keep the correct + behavior across the different contexts. + """ + if not tf.distribute.has_strategy(): + return False + strategy = tf.distribute.get_strategy() + # TODO(scottzhu): Finalize the strategy API to check if a strategy is backed + # by DTensor.
+ return getattr(strategy, "_mesh", None) is not None diff --git a/keras/dtensor/utils_test.py b/keras/dtensor/utils_test.py index 98851163a72a..407ecf149abc 100644 --- a/keras/dtensor/utils_test.py +++ b/keras/dtensor/utils_test.py @@ -14,63 +14,83 @@ # ============================================================================== """Tests for utils.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import layers from keras.dtensor import dtensor_api as dtensor from keras.dtensor import test_util from keras.dtensor import utils -import numpy as np -import tensorflow.compat.v2 as tf - class UtilsTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["X", "Y"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + self.layout = dtensor.Layout.replicated(self.mesh, rank=1) - def setUp(self): - super().setUp() - global_ids = test_util.create_device_ids_array((2, 2)) - local_device_ids = np.ravel(global_ids).tolist() - mesh_dict = { - 'CPU': - dtensor.Mesh(['X', 'Y'], global_ids, - local_device_ids, - test_util.create_device_list((2, 2), 'CPU')) - } - self.mesh = self.configTestMesh(mesh_dict) - self.layout = dtensor.Layout.replicated(self.mesh, rank=1) - - @parameterized.named_parameters( - ('Dense', layers.Dense, {'units': 4}, ['kernel_layout', 'bias_layout']), - ('Conv2D', layers.Conv2D, {'filters': 2, 'kernel_size': 3}, - ['kernel_layout', 'bias_layout']), - ('BatchNorm', layers.BatchNormalization, {}, - ['beta_layout', 'gamma_layout', 'moving_mean_layout', - 'moving_variance_layout']), - ('Embedding', layers.Embedding, {'input_dim': 100, 'output_dim': 20}, - ['embeddings_layout']), - (' PReLU', layers. PReLU, {}, ['alpha_layout']), - ('SeparableConv2D', layers.SeparableConv2D, - {'filters': 2, 'kernel_size': 3}, - ['depthwise_layout', 'pointwise_layout', 'bias_layout']), - # TODO(scottzhu): Probably add more coverage for all the layers. - ) - def test_all_layout_decorator(self, layer_cls, init_args, layout_args): + @parameterized.named_parameters( + ("Dense", layers.Dense, {"units": 4}, ["kernel_layout", "bias_layout"]), + ( + "Conv2D", + layers.Conv2D, + {"filters": 2, "kernel_size": 3}, + ["kernel_layout", "bias_layout"], + ), + ( + "BatchNorm", + layers.BatchNormalization, + {}, + [ + "beta_layout", + "gamma_layout", + "moving_mean_layout", + "moving_variance_layout", + ], + ), + ( + "Embedding", + layers.Embedding, + {"input_dim": 100, "output_dim": 20}, + ["embeddings_layout"], + ), + (" PReLU", layers.PReLU, {}, ["alpha_layout"]), + ( + "SeparableConv2D", + layers.SeparableConv2D, + {"filters": 2, "kernel_size": 3}, + ["depthwise_layout", "pointwise_layout", "bias_layout"], + ), + # TODO(scottzhu): Probably add more coverage for all the layers. + ) + def test_all_layout_decorator(self, layer_cls, init_args, layout_args): - layer_cls.__init__ = utils.allow_initializer_layout(layer_cls.__init__) + layer_cls.__init__ = utils.allow_initializer_layout(layer_cls.__init__) - # Make sure we don't set the layout attribute if the init kwargs is not - # provided. - layer = layer_cls(**init_args) - for layout_arg in layout_args: - self.assertFalse(hasattr(layer, layout_arg)) + # Make sure we don't set the layout attribute if the init kwargs is not + # provided. 
+ layer = layer_cls(**init_args) + for layout_arg in layout_args: + self.assertFalse(hasattr(layer, layout_arg)) - layout_kwargs = {k: self.layout for k in layout_args} - init_args.update(layout_kwargs) - layer = layer_cls(**init_args) + layout_kwargs = {k: self.layout for k in layout_args} + init_args.update(layout_kwargs) + layer = layer_cls(**init_args) - for layout_arg in layout_args: - self.assertEqual(getattr(layer, layout_arg), self.layout) + for layout_arg in layout_args: + self.assertEqual(getattr(layer, layout_arg), self.layout) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/BUILD b/keras/engine/BUILD index 90d067461a7d..a2c40e878106 100644 --- a/keras/engine/BUILD +++ b/keras/engine/BUILD @@ -1,6 +1,8 @@ # Description: # Contains the Keras engine API (internal TensorFlow version). +# Placeholder: load unaliased py_library + # buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "tf_py_test") @@ -8,12 +10,9 @@ load("@org_keras//keras:keras.bzl", "tf_py_test") load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], # TODO(scottzhu): Remove non-keras deps from TF. - default_visibility = [ - "//keras:friends", - "//third_party/tensorflow/python:__pkg__", - "//third_party/tensorflow/python/feature_column:__pkg__", - ], + default_visibility = ["//keras:friends"], licenses = ["notice"], ) @@ -59,6 +58,7 @@ py_library( "//keras/distribute", "//keras/distribute:distribute_coordinator_utils", "//keras/dtensor:layout_map", + "//keras/export:export_lib", "//keras/initializers", "//keras/metrics", "//keras/mixed_precision:autocast_variable", @@ -66,10 +66,10 @@ py_library( "//keras/mixed_precision:policy", "//keras/optimizers", "//keras/saving", - "//keras/saving/experimental", "//keras/utils:engine_utils", "//keras/utils:metrics_utils", "//keras/utils:mode_keys", + "//keras/utils:steps_per_execution_tuning", "//keras/utils:tf_utils", "//keras/utils:version_utils", ], @@ -153,6 +153,7 @@ py_library( srcs_version = "PY3", deps = [ "//:expect_tensorflow_installed", + "//keras/distribute", "//keras/utils:dataset_creator", "//keras/utils:engine_utils", "//keras/utils:tf_utils", @@ -382,6 +383,7 @@ tf_py_test( "//keras:losses", "//keras/layers", "//keras/metrics", + "//keras/mixed_precision:policy", "//keras/testing_infra:test_combinations", "//keras/testing_infra:test_utils", "//keras/utils:data_utils", @@ -461,7 +463,7 @@ tf_py_test( "//keras:losses", "//keras/layers", "//keras/metrics", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", "//keras/testing_infra:test_utils", "//keras/utils:data_utils", @@ -627,7 +629,7 @@ tf_py_test( "//keras/layers", "//keras/legacy_tf_layers:core", "//keras/mixed_precision:policy", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", "//keras/testing_infra:test_utils", "//keras/utils:tf_utils", @@ -657,6 +659,7 @@ tf_py_test( size = "medium", srcs = ["sequential_test.py"], python_version = "PY3", + shard_count = 4, tags = [ "nomac", # TODO(mihaimaruseac): b/127695564 ], diff --git a/keras/engine/base_layer.py b/keras/engine/base_layer.py index 647e8cd9cf51..4e4039631ba5 100644 --- a/keras/engine/base_layer.py +++ b/keras/engine/base_layer.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the 
License. # ============================================================================== -# pylint: disable=protected-access -# pylint: disable=g-classes-have-attributes -# pylint: disable=g-bad-import-order -"""Contains the base Layer class, from which all layers inherit.""" -import tensorflow.compat.v2 as tf + +"""Contains the base Layer class, from which all layers inherit.""" import collections import contextlib @@ -29,8 +26,8 @@ import weakref import numpy as np +import tensorflow.compat.v2 as tf -from google.protobuf import json_format from keras import backend from keras import constraints from keras import initializers @@ -41,9 +38,9 @@ from keras.engine import keras_tensor from keras.engine import node as node_module from keras.mixed_precision import autocast_variable -from keras.mixed_precision import loss_scale_optimizer from keras.mixed_precision import policy -from keras.saving.saved_model import layer_serialization +from keras.saving import serialization_lib +from keras.saving.legacy.saved_model import layer_serialization from keras.utils import generic_utils from keras.utils import layer_utils from keras.utils import object_identity @@ -51,36 +48,45 @@ from keras.utils import tf_utils from keras.utils import traceback_utils from keras.utils import version_utils + # A module that only depends on `keras.layers` import these from here. -from keras.utils.generic_utils import to_snake_case # pylint: disable=unused-import -from keras.utils.tf_utils import is_tensor_or_tensor_list # pylint: disable=unused-import +from keras.utils.generic_utils import to_snake_case # noqa: F401 +from keras.utils.tf_utils import is_tensor_or_tensor_list # noqa: F401 + +# isort: off +from google.protobuf import json_format from tensorflow.python.platform import tf_logging -from tensorflow.python.util.tf_export import get_canonical_name_for_symbol +from tensorflow.python.util.tf_export import ( + get_canonical_name_for_symbol, +) from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls -# pylint: disable=g-inconsistent-quotes + metrics_mod = generic_utils.LazyLoader( - "metrics_mod", globals(), - "keras.metrics") -# pylint: enable=g-inconsistent-quotes + "metrics_mod", globals(), "keras.metrics" +) + # Prefix that is added to the TF op layer names. -_TF_OP_LAYER_NAME_PREFIX = 'tf_op_layer_' +_TF_OP_LAYER_NAME_PREFIX = "tf_op_layer_" # TODO(mdan): Should we have a single generic type for types that can be passed # to tf.cast? 
-_AUTOCAST_TYPES = (tf.Tensor, tf.SparseTensor, - tf.RaggedTensor) +_AUTOCAST_TYPES = (tf.Tensor, tf.SparseTensor, tf.RaggedTensor) keras_layers_gauge = tf.__internal__.monitoring.BoolGauge( - '/tensorflow/api/keras/layers', 'keras layers usage', 'method') + "/tensorflow/api/keras/layers", "keras layers usage", "method" +) keras_models_gauge = tf.__internal__.monitoring.BoolGauge( - '/tensorflow/api/keras/models', 'keras model usage', 'method') + "/tensorflow/api/keras/models", "keras model usage", "method" +) keras_api_gauge = tf.__internal__.monitoring.BoolGauge( - '/tensorflow/api/keras', 'keras api usage', 'method') + "/tensorflow/api/keras", "keras api usage", "method" +) keras_premade_model_gauge = tf.__internal__.monitoring.BoolGauge( - '/tensorflow/api/keras/premade_models', 'premade keras model usage', 'type') + "/tensorflow/api/keras/premade_models", "premade keras model usage", "type" +) _is_name_scope_on_model_declaration_enabled = False @@ -89,3262 +95,3753 @@ @contextlib.contextmanager def _name_scope_unnester(full_name_scope): - """Helper to get relative name scope from fully specified nested name scopes. - - Args: - full_name_scope: full(absolute) name scope path. - - Yields: - Relative name scope path from the parent `_name_scope_unnester` context - manager. - - Example: - ``` - with _name_scope_unnester('a') as name1: # name1 == 'a' - with _name_scope_unnester('a/b') as name2: # name2 == 'b' - with _name_scope_unnester('a/b/c') as name3: # name3 == 'c' - pass - ``` - """ - if not getattr(_name_scope_unnester_stack, 'value', None): - _name_scope_unnester_stack.value = [''] - - _name_scope_unnester_stack.value.append(full_name_scope) - - try: - full_name_scope = _name_scope_unnester_stack.value[-1] - outer_name_scope = _name_scope_unnester_stack.value[-2] - relative_name_scope = full_name_scope.lstrip(outer_name_scope) - relative_name_scope = relative_name_scope.lstrip('/') - yield relative_name_scope - finally: - _name_scope_unnester_stack.value.pop() - - -@keras_export('keras.layers.Layer') -class Layer(tf.Module, version_utils.LayerVersionSelector): - """This is the class from which all layers inherit. - - A layer is a callable object that takes as input one or more tensors and - that outputs one or more tensors. It involves *computation*, defined - in the `call()` method, and a *state* (weight variables). State can be - created in various places, at the convenience of the subclass implementer: - - * in `__init__()`; - * in the optional `build()` method, which is invoked by the first - `__call__()` to the layer, and supplies the shape(s) of the input(s), - which may not have been known at initialization time; - * in the first invocation of `call()`, with some caveats discussed - below. - - Users will just instantiate a layer and then treat it as a callable. - - Args: - trainable: Boolean, whether the layer's variables should be trainable. - name: String name of the layer. - dtype: The dtype of the layer's computations and weights. Can also be a - `tf.keras.mixed_precision.Policy`, which allows the computation and weight - dtype to differ. Default of `None` means to use - `tf.keras.mixed_precision.global_policy()`, which is a float32 policy - unless set to different value. - dynamic: Set this to `True` if your layer should only be run eagerly, and - should not be used to generate a static computation graph. - This would be the case for a Tree-RNN or a recursive network, - for example, or generally for any layer that manipulates tensors - using Python control flow. 
If `False`, we assume that the layer can - safely be used to generate a static computation graph. - - Attributes: - name: The name of the layer (string). - dtype: The dtype of the layer's weights. - variable_dtype: Alias of `dtype`. - compute_dtype: The dtype of the layer's computations. Layers automatically - cast inputs to this dtype which causes the computations and output to also - be in this dtype. When mixed precision is used with a - `tf.keras.mixed_precision.Policy`, this will be different than - `variable_dtype`. - dtype_policy: The layer's dtype policy. See the - `tf.keras.mixed_precision.Policy` documentation for details. - trainable_weights: List of variables to be included in backprop. - non_trainable_weights: List of variables that should not be - included in backprop. - weights: The concatenation of the lists trainable_weights and - non_trainable_weights (in this order). - trainable: Whether the layer should be trained (boolean), i.e. whether - its potentially-trainable weights should be returned as part of - `layer.trainable_weights`. - input_spec: Optional (list of) `InputSpec` object(s) specifying the - constraints on inputs that can be accepted by the layer. - - We recommend that descendants of `Layer` implement the following methods: - - * `__init__()`: Defines custom layer attributes, and creates layer weights - that do not depend on input shapes, using `add_weight()`, or other state. - * `build(self, input_shape)`: This method can be used to create weights that - depend on the shape(s) of the input(s), using `add_weight()`, or other - state. `__call__()` will automatically build the layer (if it has not been - built yet) by calling `build()`. - * `call(self, inputs, *args, **kwargs)`: Called in `__call__` after making - sure `build()` has been called. `call()` performs the logic of applying the - layer to the `inputs`. The first invocation may additionally create state - that could not be conveniently created in `build()`; see its docstring - for details. - Two reserved keyword arguments you can optionally use in `call()` are: - - `training` (boolean, whether the call is in inference mode or training - mode). See more details in [the layer/model subclassing guide]( - https://www.tensorflow.org/guide/keras/custom_layers_and_models#privileged_training_argument_in_the_call_method) - - `mask` (boolean tensor encoding masked timesteps in the input, used - in RNN layers). See more details in [the layer/model subclassing guide]( - https://www.tensorflow.org/guide/keras/custom_layers_and_models#privileged_mask_argument_in_the_call_method) - A typical signature for this method is `call(self, inputs)`, and user could - optionally add `training` and `mask` if the layer need them. `*args` and - `**kwargs` is only useful for future extension when more input parameters - are planned to be added. - * `get_config(self)`: Returns a dictionary containing the configuration used - to initialize this layer. If the keys differ from the arguments - in `__init__`, then override `from_config(self)` as well. - This method is used when saving - the layer or a model that contains this layer. - - Examples: - - Here's a basic example: a layer with two variables, `w` and `b`, - that returns `y = w . x + b`. - It shows how to implement `build()` and `call()`. - Variables set as attributes of a layer are tracked as weights - of the layers (in `layer.weights`). 
- - ```python - class SimpleDense(Layer): - - def __init__(self, units=32): - super(SimpleDense, self).__init__() - self.units = units - - def build(self, input_shape): # Create the state of the layer (weights) - w_init = tf.random_normal_initializer() - self.w = tf.Variable( - initial_value=w_init(shape=(input_shape[-1], self.units), - dtype='float32'), - trainable=True) - b_init = tf.zeros_initializer() - self.b = tf.Variable( - initial_value=b_init(shape=(self.units,), dtype='float32'), - trainable=True) - - def call(self, inputs): # Defines the computation from inputs to outputs - return tf.matmul(inputs, self.w) + self.b - - # Instantiates the layer. - linear_layer = SimpleDense(4) - - # This will also call `build(input_shape)` and create the weights. - y = linear_layer(tf.ones((2, 2))) - assert len(linear_layer.weights) == 2 - - # These weights are trainable, so they're listed in `trainable_weights`: - assert len(linear_layer.trainable_weights) == 2 - ``` - - Note that the method `add_weight()` offers a shortcut to create weights: - - ```python - class SimpleDense(Layer): - - def __init__(self, units=32): - super(SimpleDense, self).__init__() - self.units = units - - def build(self, input_shape): - self.w = self.add_weight(shape=(input_shape[-1], self.units), - initializer='random_normal', - trainable=True) - self.b = self.add_weight(shape=(self.units,), - initializer='random_normal', - trainable=True) - - def call(self, inputs): - return tf.matmul(inputs, self.w) + self.b - ``` - - Besides trainable weights, updated via backpropagation during training, - layers can also have non-trainable weights. These weights are meant to - be updated manually during `call()`. Here's a example layer that computes - the running sum of its inputs: - - ```python - class ComputeSum(Layer): - - def __init__(self, input_dim): - super(ComputeSum, self).__init__() - # Create a non-trainable weight. - self.total = tf.Variable(initial_value=tf.zeros((input_dim,)), - trainable=False) - - def call(self, inputs): - self.total.assign_add(tf.reduce_sum(inputs, axis=0)) - return self.total - - my_sum = ComputeSum(2) - x = tf.ones((2, 2)) - - y = my_sum(x) - print(y.numpy()) # [2. 2.] - - y = my_sum(x) - print(y.numpy()) # [4. 4.] - - assert my_sum.weights == [my_sum.total] - assert my_sum.non_trainable_weights == [my_sum.total] - assert my_sum.trainable_weights == [] - ``` - - For more information about creating layers, see the guide - [Making new Layers and Models via subclassing]( - https://www.tensorflow.org/guide/keras/custom_layers_and_models) - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, - trainable=True, - name=None, - dtype=None, - dynamic=False, - **kwargs): - self._instrument_layer_creation() - - # These properties should be set by the user via keyword arguments. - # note that 'dtype', 'input_shape' and 'batch_input_shape' - # are only applicable to input layers: do not pass these keywords - # to non-input layers. - allowed_kwargs = { - 'input_dim', - 'input_shape', - 'batch_input_shape', - 'batch_size', - 'weights', - 'activity_regularizer', - 'autocast', - 'implementation', - } - # Validate optional keyword arguments. - generic_utils.validate_kwargs(kwargs, allowed_kwargs) - - # Mutable properties - # Indicates whether the layer's weights are updated during training - # and whether the layer's updates are run during training. 
- if not (isinstance(trainable, bool) or - (isinstance(trainable, (tf.Tensor, tf.Variable)) and - trainable.dtype is tf.bool)): - raise TypeError( - 'Expected `trainable` argument to be a boolean, ' - f'but got: {trainable}') - self._trainable = trainable - # A stateful layer is a layer whose updates are run during inference too, - # for instance stateful RNNs. - self._stateful = False - # Indicates whether `build` needs to be called upon layer call, to create - # the layer's weights. (Note that the first call() may also create weights, - # independent of build().) - self.built = False - # Provides information about which inputs are compatible with the layer. - self._input_spec = None - - # SavedModel-related attributes. - # Record the build input shape for loading purposes. - # TODO(kathywu): Move this to Layer._set_save_spec once cl/290121460 is - # submitted. - self._build_input_shape = None - self._saved_model_inputs_spec = None - self._saved_model_arg_spec = None - - # `Layer.compute_mask` will be called at the end of `Layer.__call__` if - # `Layer.compute_mask` is overridden, or if the `Layer` subclass sets - # `self.supports_masking=True`. - self._supports_masking = not generic_utils.is_default(self.compute_mask) - - self._init_set_name(name) - self._activity_regularizer = regularizers.get( - kwargs.pop('activity_regularizer', None)) - self._maybe_create_attribute('_trainable_weights', []) - self._maybe_create_attribute('_non_trainable_weights', []) - self._updates = [] - # Object to store all thread local layer properties. - self._thread_local = threading.local() - # A list of zero-argument lambdas which return Tensors, used for variable - # regularizers. - self._callable_losses = [] - # A list of symbolic Tensors containing activity regularizers and losses - # manually added through `add_loss` in graph-building mode. - self._losses = [] - # A list of metric instances corresponding to the symbolic metric tensors - # added using the `add_metric` API. - self._metrics = [] - # Ensures the same metric is not added multiple times in `MirroredStrategy`. - self._metrics_lock = threading.Lock() - - # Note that models also have a dtype policy, as they are layers. For - # functional models, the policy is only used in Model.compile, which wraps - # the optimizer with a LossScaleOptimizer if the policy name is - # "mixed_float16". Subclassed models additionally use the policy's compute - # and variable dtypes, as like any ordinary layer. - self._set_dtype_policy(dtype) - # Boolean indicating whether the layer automatically casts its inputs to the - # layer's compute_dtype. - self._autocast = kwargs.get('autocast', - base_layer_utils.v2_dtype_behavior_enabled()) - - # Tracks `TrackableDataStructure`s, `Module`s, and `Layer`s. - # Ordered by when the object was assigned as an attr. - # Entries are unique. - self._maybe_create_attribute('_self_tracked_trackables', []) - - # These lists will be filled via successive calls - # to self._add_inbound_node(). - # Used in symbolic mode only, only in conjunction with graph-networks - self._inbound_nodes_value = [] - self._outbound_nodes_value = [] - - self._init_call_fn_args() - - # Whether the `call` method can be used to build a TF graph without issues. - # This attribute has no effect if the model is created using the Functional - # API. Instead, `model.dynamic` is determined based on the internal layers. 
- if not isinstance(dynamic, bool): - raise TypeError( - f'Expected `dynamic` argument to be a boolean, but got: {dynamic}') - self._dynamic = dynamic - - # Manage input shape information if passed. - if 'input_dim' in kwargs and 'input_shape' not in kwargs: - # Backwards compatibility: alias 'input_dim' to 'input_shape'. - kwargs['input_shape'] = (kwargs['input_dim'],) - if 'input_shape' in kwargs or 'batch_input_shape' in kwargs: - # In this case we will later create an input layer - # to insert before the current layer - if 'batch_input_shape' in kwargs: - batch_input_shape = tuple(kwargs['batch_input_shape']) - elif 'input_shape' in kwargs: - if 'batch_size' in kwargs: - batch_size = kwargs['batch_size'] - else: - batch_size = None - batch_input_shape = (batch_size,) + tuple(kwargs['input_shape']) - self._batch_input_shape = batch_input_shape - - # Manage initial weight values if passed. - self._initial_weights = kwargs.get('weights', None) - - # Whether the layer will track any layers that is set as attribute on itself - # as sub-layers, the weights from the sub-layers will be included in the - # parent layer's variables() as well. - # Default to True, which means auto tracking is turned on. Certain subclass - # might want to turn it off, like Sequential model. - self._auto_track_sub_layers = True - - # For backwards compat reasons, most built-in layers do not guarantee - # That they will 100% preserve the structure of input args when saving - # / loading configs. E.g. they may un-nest an arg that is - # a list with one element. - self._preserve_input_structure_in_config = False - - # Save outer name scope at layer declaration so that it is preserved at - # the actual layer construction. - self._name_scope_on_declaration = tf.get_current_name_scope() - - # Save the temp regularization losses created in the DTensor use case. - # When DTensor is enable, we will first create LazyInitVariable and then - # DVariable with proper layout afterward. For the weights regularization - # loss, we have to create against the DVariable as well. - self._captured_weight_regularizer = [] - - @tf.__internal__.tracking.no_automatic_dependency_tracking - @generic_utils.default - def build(self, input_shape): - """Creates the variables of the layer (optional, for subclass implementers). - - This is a method that implementers of subclasses of `Layer` or `Model` - can override if they need a state-creation step in-between - layer instantiation and layer call. It is invoked automatically before - the first execution of `call()`. - - This is typically used to create the weights of `Layer` subclasses - (at the discretion of the subclass implementer). - - Args: - input_shape: Instance of `TensorShape`, or list of instances of - `TensorShape` if the layer expects a list of inputs - (one instance per input). - """ - self._build_input_shape = input_shape - self.built = True - - @doc_controls.for_subclass_implementers - def call(self, inputs, *args, **kwargs): # pylint: disable=unused-argument - """This is where the layer's logic lives. - - The `call()` method may not create state (except in its first invocation, - wrapping the creation of variables or other resources in `tf.init_scope()`). - It is recommended to create state in `__init__()`, or the `build()` method - that is called automatically before `call()` executes the first time. - - Args: - inputs: Input tensor, or dict/list/tuple of input tensors. - The first positional `inputs` argument is subject to special rules: - - `inputs` must be explicitly passed. 
A layer cannot have zero - arguments, and `inputs` cannot be provided via the default value - of a keyword argument. - - NumPy array or Python scalar values in `inputs` get cast as tensors. - - Keras mask metadata is only collected from `inputs`. - - Layers are built (`build(input_shape)` method) - using shape info from `inputs` only. - - `input_spec` compatibility is only checked against `inputs`. - - Mixed precision input casting is only applied to `inputs`. - If a layer has tensor arguments in `*args` or `**kwargs`, their - casting behavior in mixed precision should be handled manually. - - The SavedModel input specification is generated using `inputs` only. - - Integration with various ecosystem packages like TFMOT, TFLite, - TF.js, etc is only supported for `inputs` and not for tensors in - positional and keyword arguments. - *args: Additional positional arguments. May contain tensors, although - this is not recommended, for the reasons above. - **kwargs: Additional keyword arguments. May contain tensors, although - this is not recommended, for the reasons above. - The following optional keyword arguments are reserved: - - `training`: Boolean scalar tensor of Python boolean indicating - whether the `call` is meant for training or inference. - - `mask`: Boolean input mask. If the layer's `call()` method takes a - `mask` argument, its default value will be set to the mask generated - for `inputs` by the previous layer (if `input` did come from a layer - that generated a corresponding mask, i.e. if it came from a Keras - layer with masking support). - - Returns: - A tensor or list/tuple of tensors. - """ - return inputs - - @doc_controls.for_subclass_implementers - def add_weight(self, - name=None, - shape=None, - dtype=None, - initializer=None, - regularizer=None, - trainable=None, - constraint=None, - use_resource=None, - synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.VariableAggregation.NONE, - **kwargs): - """Adds a new variable to the layer. - - Args: - name: Variable name. - shape: Variable shape. Defaults to scalar if unspecified. - dtype: The type of the variable. Defaults to `self.dtype`. - initializer: Initializer instance (callable). - regularizer: Regularizer instance (callable). - trainable: Boolean, whether the variable should be part of the layer's - "trainable_variables" (e.g. variables, biases) - or "non_trainable_variables" (e.g. BatchNorm mean and variance). - Note that `trainable` cannot be `True` if `synchronization` - is set to `ON_READ`. - constraint: Constraint instance (callable). - use_resource: Whether to use a `ResourceVariable` or not. - See [this guide](https://www.tensorflow.org/guide/migrate/tf1_vs_tf2#resourcevariables_instead_of_referencevariables) # pylint: disable=line-too-long - for more information. - synchronization: Indicates when a distributed a variable will be - aggregated. Accepted values are constants defined in the class - `tf.VariableSynchronization`. By default the synchronization is set to - `AUTO` and the current `DistributionStrategy` chooses - when to synchronize. If `synchronization` is set to `ON_READ`, - `trainable` must not be set to `True`. - aggregation: Indicates how a distributed variable will be aggregated. - Accepted values are constants defined in the class - `tf.VariableAggregation`. - **kwargs: Additional keyword arguments. Accepted values are `getter`, - `collections`, `experimental_autocast` and `caching_device`. - - Returns: - The variable created. 
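For reference, a typical call into this API from a custom layer's `build()` might look like the following sketch (the layer name, weight names, and shapes are illustrative; note that, per the docstring above, `trainable=True` cannot be combined with `synchronization=ON_READ`):

```python
import tensorflow as tf


class ScaledShift(tf.keras.layers.Layer):
    def build(self, input_shape):
        # Trainable per-feature scale, float32 by default.
        self.scale = self.add_weight(
            name="scale",
            shape=(input_shape[-1],),
            initializer="ones",
            trainable=True,
        )
        # Non-trainable call counter, updated manually inside call().
        self.calls = self.add_weight(
            name="calls",
            shape=(),
            dtype=tf.int64,
            initializer="zeros",
            trainable=False,
        )

    def call(self, inputs):
        self.calls.assign_add(1)
        return inputs * self.scale


layer = ScaledShift()
out = layer(tf.ones((2, 3)))
assert len(layer.trainable_weights) == 1
assert len(layer.non_trainable_weights) == 1
```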
- - Raises: - ValueError: When giving unsupported dtype and no initializer or when - trainable has been set to True with synchronization set as `ON_READ`. - """ - if shape is None: - shape = () - kwargs.pop('partitioner', None) # Ignored. - # Validate optional keyword arguments. - for kwarg in kwargs: - if kwarg not in ['collections', 'experimental_autocast', - 'caching_device', 'getter', 'layout']: - raise TypeError('Unknown keyword argument:', kwarg) - collections_arg = kwargs.pop('collections', None) - # 'experimental_autocast' can be set to False by the caller to indicate an - # AutoCastVariable should never be created. - autocast = kwargs.pop('experimental_autocast', True) - # See the docstring for tf.Variable about the details for caching_device. - caching_device = kwargs.pop('caching_device', None) - - layout = kwargs.pop('layout', None) - # Specially handling of auto layout fetch, based on the variable name and - # attribute name. For built-in keras layers, usually the variable name, eg - # 'kernel', will match with a 'kernel_layout' attribute name on the - # instance. We will try to do this auto fetch if layout is not explicitly - # specified. This is mainly a quick workaround for not applying too many - # interface change to built-in layers, until DTensor is a public API. - # Also see dtensor.utils.allow_initializer_layout for more details. - # TODO(scottzhu): Remove this once dtensor is public to end user. - if not layout and name: - layout = getattr(self, name + '_layout', None) - - if dtype is None: - dtype = self.dtype or backend.floatx() - dtype = tf.as_dtype(dtype) - if self._dtype_policy.variable_dtype is None: - # The policy is "_infer", so we infer the policy from the variable dtype. - self._set_dtype_policy(policy.Policy(dtype.base_dtype.name)) - initializer = initializers.get(initializer) - regularizer = regularizers.get(regularizer) - constraint = constraints.get(constraint) - - if synchronization == tf.VariableSynchronization.ON_READ: - if trainable: - raise ValueError( - 'Synchronization value can be set to ' - 'VariableSynchronization.ON_READ only for non-trainable variables. ' - 'You have specified trainable=True and ' - 'synchronization=VariableSynchronization.ON_READ.') - else: - # Set trainable to be false when variable is to be synced on read. - trainable = False - elif trainable is None: - trainable = True - - # Initialize variable when no initializer provided - if initializer is None: - # If dtype is DT_FLOAT, provide a uniform unit scaling initializer - if dtype.is_floating: - initializer = initializers.get('glorot_uniform') - # If dtype is DT_INT/DT_UINT, provide a default value `zero` - # If dtype is DT_BOOL, provide a default value `FALSE` - elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool: - initializer = initializers.get('zeros') - # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here? - elif 'getter' not in kwargs: - # When `getter` is specified, it's possibly fine for `initializer` to be - # None since it's up to the custom `getter` to raise error in case it - # indeed needs `initializer`. - raise ValueError(f'An initializer for variable {name} of type ' - f'{dtype.base_dtype} is required for layer ' - f'{self.name}. Received: {initializer}.') - - getter = kwargs.pop('getter', base_layer_utils.make_variable) - if (autocast and - self._dtype_policy.compute_dtype != self._dtype_policy.variable_dtype - and dtype.is_floating): - old_getter = getter - # Wrap variable constructor to return an AutoCastVariable. 
- def getter(*args, **kwargs): # pylint: disable=function-redefined - variable = old_getter(*args, **kwargs) - return autocast_variable.create_autocast_variable(variable) - # Also the caching_device does not work with the mixed precision API, - # disable it if it is specified. - # TODO(b/142020079): Re-enable it once the bug is fixed. - if caching_device is not None: - tf_logging.warning( - '`caching_device` does not work with mixed precision API. Ignoring ' - 'user specified `caching_device`.') - caching_device = None - if layout: - getter = functools.partial(getter, layout=layout) - - variable = self._add_variable_with_custom_getter( - name=name, - shape=shape, - # TODO(allenl): a `make_variable` equivalent should be added as a - # `Trackable` method. - getter=getter, - # Manage errors in Layer rather than Trackable. - overwrite=True, - initializer=initializer, - dtype=dtype, - constraint=constraint, - trainable=trainable, - use_resource=use_resource, - collections=collections_arg, - synchronization=synchronization, - aggregation=aggregation, - caching_device=caching_device) - if regularizer is not None: - # TODO(fchollet): in the future, this should be handled at the - # level of variable creation, and weight regularization losses - # should be variable attributes. - name_in_scope = variable.name[:variable.name.find(':')] - self._handle_weight_regularization(name_in_scope, - variable, - regularizer) - if base_layer_utils.is_split_variable(variable): - for v in variable: - backend.track_variable(v) - if trainable: - self._trainable_weights.append(v) - else: - self._non_trainable_weights.append(v) - else: - backend.track_variable(variable) - if trainable: - self._trainable_weights.append(variable) - else: - self._non_trainable_weights.append(variable) - return variable - - @generic_utils.default - def get_config(self): - """Returns the config of the layer. - - A layer config is a Python dictionary (serializable) - containing the configuration of a layer. - The same layer can be reinstantiated later - (without its trained weights) from this configuration. - - The config of a layer does not include connectivity - information, nor the layer class name. These are handled - by `Network` (one layer of abstraction above). - - Note that `get_config()` does not guarantee to return a fresh copy of dict - every time it is called. The callers should make a copy of the returned dict - if they want to modify it. - - Returns: - Python dictionary. - """ - all_args = tf_inspect.getfullargspec(self.__init__).args - config = { - 'name': self.name, - 'trainable': self.trainable, - } - if hasattr(self, '_batch_input_shape'): - config['batch_input_shape'] = self._batch_input_shape - config['dtype'] = policy.serialize(self._dtype_policy) - if hasattr(self, 'dynamic'): - # Only include `dynamic` in the `config` if it is `True` - if self.dynamic: - config['dynamic'] = self.dynamic - elif 'dynamic' in all_args: - all_args.remove('dynamic') - expected_args = config.keys() - # Finds all arguments in the `__init__` that are not in the config: - extra_args = [arg for arg in all_args if arg not in expected_args] - # Check that either the only argument in the `__init__` is `self`, - # or that `get_config` has been overridden: - if len(extra_args) > 1 and hasattr(self.get_config, '_is_default'): - raise NotImplementedError(textwrap.dedent(f""" - Layer {self.__class__.__name__} has arguments {extra_args} - in `__init__` and therefore must override `get_config()`. 
- - Example: - - class CustomLayer(keras.layers.Layer): - def __init__(self, arg1, arg2): - super().__init__() - self.arg1 = arg1 - self.arg2 = arg2 - - def get_config(self): - config = super().get_config() - config.update({{ - "arg1": self.arg1, - "arg2": self.arg2, - }}) - return config""")) - - return config - - @classmethod - def from_config(cls, config): - """Creates a layer from its config. - - This method is the reverse of `get_config`, - capable of instantiating the same layer from the config - dictionary. It does not handle layer connectivity - (handled by Network), nor weights (handled by `set_weights`). + """Helper to get relative name scope from fully-speced nested name scopes. Args: - config: A Python dictionary, typically the - output of get_config. - - Returns: - A layer instance. - """ - return cls(**config) - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer. - - This method will cause the layer's state to be built, if that has not - happened before. This requires that the layer will later be used with - inputs that match the input shape provided here. - - Args: - input_shape: Shape tuple (tuple of integers) - or list of shape tuples (one per output tensor of the layer). - Shape tuples can include None for free dimensions, - instead of an integer. - - Returns: - An input shape tuple. - """ - if tf.executing_eagerly(): - # In this case we build the model first in order to do shape inference. - # This is acceptable because the framework only calls - # `compute_output_shape` on shape values that the layer would later be - # built for. It would however cause issues in case a user attempts to - # use `compute_output_shape` manually with shapes that are incompatible - # with the shape the Layer will be called on (these users will have to - # implement `compute_output_shape` themselves). - self._maybe_build(input_shape) - graph_name = str(self.name) + '_scratch_graph' - with tf.__internal__.FuncGraph(graph_name).as_default(): - input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) - def _make_placeholder_like(shape): - ph = backend.placeholder(shape=shape, dtype=self.dtype) - ph._keras_mask = None - return ph - inputs = tf.nest.map_structure(_make_placeholder_like, input_shape) - try: - outputs = self(inputs, training=False) - except TypeError as e: - raise NotImplementedError( - 'We could not automatically infer the static shape of the ' - 'layer\'s output. Please implement the ' - '`compute_output_shape` method on your layer (%s).' % - self.__class__.__name__) from e - return tf.nest.map_structure(lambda t: t.shape, outputs) - raise NotImplementedError( - 'Please run in eager mode or implement the `compute_output_shape` ' - 'method on your layer (%s).' % self.__class__.__name__) - - @doc_controls.for_subclass_implementers - def compute_output_signature(self, input_signature): - """Compute the output tensor signature of the layer based on the inputs. - - Unlike a TensorShape object, a TensorSpec object contains both shape - and dtype information for a tensor. This method allows layers to provide - output dtype information if it is different from the input dtype. - For any layer that doesn't implement this function, - the framework will fall back to use `compute_output_shape`, and will - assume that the output dtype matches the input dtype. - - Args: - input_signature: Single TensorSpec or nested structure of TensorSpec - objects, describing a candidate input for the layer. 
- - Returns: - Single TensorSpec or nested structure of TensorSpec objects, describing - how the layer would transform the provided input. - - Raises: - TypeError: If input_signature contains a non-TensorSpec object. - """ - def check_type_return_shape(s): - if not isinstance(s, tf.TensorSpec): - raise TypeError('Only TensorSpec signature types are supported. ' - f'Received: {s}.') - return s.shape - input_shape = tf.nest.map_structure( - check_type_return_shape, input_signature) - output_shape = self.compute_output_shape(input_shape) - dtype = self._compute_dtype - if dtype is None: - input_dtypes = [s.dtype for s in tf.nest.flatten(input_signature)] - # Default behavior when self.dtype is None, is to use the first input's - # dtype. - dtype = input_dtypes[0] - return tf.nest.map_structure( - lambda s: tf.TensorSpec(dtype=dtype, shape=s), - output_shape) - - @generic_utils.default - def compute_mask(self, inputs, mask=None): # pylint: disable=unused-argument - """Computes an output mask tensor. - - Args: - inputs: Tensor or list of tensors. - mask: Tensor or list of tensors. - - Returns: - None or a tensor (or list of tensors, - one per output tensor of the layer). - """ - if not self._supports_masking: - if any(m is not None for m in tf.nest.flatten(mask)): - raise TypeError('Layer ' + self.name + ' does not support masking, ' - 'but was passed an input_mask: ' + str(mask)) - # masking not explicitly supported: return None as mask. - return None - # if masking is explicitly supported, by default - # carry over the input mask - return mask - - @traceback_utils.filter_traceback - def __call__(self, *args, **kwargs): - """Wraps `call`, applying pre- and post-processing steps. - - Args: - *args: Positional arguments to be passed to `self.call`. - **kwargs: Keyword arguments to be passed to `self.call`. - - Returns: - Output tensor(s). - - Note: - - The following optional keyword arguments are reserved for specific uses: - * `training`: Boolean scalar tensor of Python boolean indicating - whether the `call` is meant for training or inference. - * `mask`: Boolean input mask. - - If the layer's `call` method takes a `mask` argument (as some Keras - layers do), its default value will be set to the mask generated - for `inputs` by the previous layer (if `input` did come from - a layer that generated a corresponding mask, i.e. if it came from - a Keras layer with masking support. - - If the layer is not built, the method will call `build`. - - Raises: - ValueError: if the layer's `call` method returns None (an invalid value). - RuntimeError: if `super().__init__()` was not called in the constructor. - """ - if not hasattr(self, '_thread_local'): - raise RuntimeError( - 'You must call `super().__init__()` in the layer constructor.') - - # `inputs` (the first arg in the method spec) is special cased in - # layer call due to historical reasons. - # This special casing currently takes the form of: - # - 'inputs' must be explicitly passed. A layer cannot have zero arguments, - # and inputs cannot have been provided via the default value of a kwarg. - # - numpy/scalar values in `inputs` get converted to tensors - # - implicit masks / mask metadata are only collected from 'inputs` - # - Layers are built using shape info from 'inputs' only - # - input_spec compatibility is only checked against `inputs` - # - mixed precision casting (autocast) is only applied to `inputs`, - # not to any other argument. 
-    inputs, args, kwargs = self._call_spec.split_out_first_arg(args, kwargs)
-    input_list = tf.nest.flatten(inputs)
-
-    # Functional Model construction mode is invoked when `Layer`s are called
-    # on symbolic `KerasTensor`s, i.e.:
-    # >> inputs = tf.keras.Input(10)
-    # >> outputs = MyLayer()(inputs)  # Functional construction mode.
-    # >> model = tf.keras.Model(inputs, outputs)
-    if _in_functional_construction_mode(self, inputs, args, kwargs, input_list):
-      return self._functional_construction_call(inputs, args, kwargs,
-                                                input_list)
-
-    # Maintains info about the `Layer.call` stack.
-    call_context = base_layer_utils.call_context()
-
-    # Accept NumPy and scalar inputs by converting to Tensors.
-    if any(isinstance(x, (
-        tf.Tensor, np.ndarray, float, int)) for x in input_list):
-      inputs = tf.nest.map_structure(_convert_numpy_or_python_types, inputs)
-      input_list = tf.nest.flatten(inputs)
-
-    # Handle `mask` propagation from previous layer to current layer. Masks
-    # can be propagated explicitly via the `mask` argument, or implicitly by
-    # setting the `_keras_mask` attribute on the inputs to a Layer. Masks
-    # passed explicitly take priority.
-    input_masks, mask_is_implicit = self._get_input_masks(
-        inputs, input_list, args, kwargs)
-    if self._expects_mask_arg and mask_is_implicit:
-      kwargs['mask'] = input_masks
-
-    # Training mode for `Layer.call` is set via (in order of priority):
-    # (1) The `training` argument passed to this `Layer.call`, if it is
-    #     not None
-    # (2) The training mode of an outer `Layer.call`.
-    # (3) The default mode set by `tf.keras.backend.set_learning_phase`
-    #     (if set)
-    # (4) Any non-None default value for `training` specified in the call
-    #     signature
-    # (5) False (treating the layer as if it's in inference)
-    args, kwargs, training_mode = self._set_training_mode(
-        args, kwargs, call_context)
-
-    # Losses are cleared for all sublayers on the outermost `Layer.call`.
-    # Losses are not cleared on inner `Layer.call`s, because sublayers can
-    # be called multiple times.
-    if not call_context.in_call:
-      self._clear_losses()
-
-    eager = tf.executing_eagerly()
-    with call_context.enter(
-        layer=self,
-        inputs=inputs,
-        build_graph=not eager,
-        training=training_mode):
-
-      input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
-
-      if eager:
-        call_fn = self.call
-        name_scope = self._name
-      else:
-        name_scope = self._get_unnested_name_scope()
-        call_fn = self._autographed_call()
-      call_fn = traceback_utils.inject_argument_info_in_traceback(
-          call_fn,
-          object_name=f'layer "{self.name}" (type {self.__class__.__name__})')
-      with contextlib.ExitStack() as namescope_stack:
-        if _is_name_scope_on_model_declaration_enabled:
-          namescope_stack.enter_context(_name_scope_unnester(
-              self._name_scope_on_declaration))
-        namescope_stack.enter_context(tf.name_scope(name_scope))
-
-        if not self.built:
-          self._maybe_build(inputs)
-
-        if self._autocast:
-          inputs = self._maybe_cast_inputs(inputs, input_list)
-
-        with autocast_variable.enable_auto_cast_variables(
-            self._compute_dtype_object):
-          outputs = call_fn(inputs, *args, **kwargs)
-
-        if self._activity_regularizer:
-          self._handle_activity_regularization(inputs, outputs)
-        if self._supports_masking:
-          self._set_mask_metadata(inputs, outputs, input_masks, not eager)
-        if self._saved_model_inputs_spec is None:
-          self._set_save_spec(inputs, args, kwargs)
+      full_name_scope: full (absolute) name scope path.

-        return outputs
+
+    Yields:
+      Relative name scope path from the parent `_name_scope_unnester` context
+      manager.
-
-  def _get_unnested_name_scope(self):
-    if _is_name_scope_on_model_declaration_enabled:
-      with _name_scope_unnester(self._name_scope_on_declaration
-                               ) as relative_name_scope_on_declaration:
-        # To avoid `tf.name_scope` autoincrement, use absolute path.
-        relative_name_scope = filter(
-            None,
-            [tf.get_current_name_scope(), relative_name_scope_on_declaration])
-        current_name_scope = '/'.join(relative_name_scope) + '/'
-        if current_name_scope == '/':
-          current_name_scope = self._name_scope_on_declaration
-        with tf.name_scope(current_name_scope):
-          name_scope = self._name_scope()  # Avoid autoincrementing.  # pylint: disable=not-callable
-    else:
-      name_scope = self._name_scope()
-
-    return name_scope
-
-  @property
-  def dtype(self):
-    """The dtype of the layer weights.
-
-    This is equivalent to `Layer.dtype_policy.variable_dtype`. Unless
-    mixed precision is used, this is the same as `Layer.compute_dtype`, the
-    dtype of the layer's computations.
+    Example:
+    ```
+    with _name_scope_unnester('a') as name1:  # name1 == 'a'
+        with _name_scope_unnester('a/b') as name2:  # name2 == 'b'
+            with _name_scope_unnester('a/b/c') as name3:  # name3 == 'c'
+                pass
+    ```
     """
-    return self._dtype_policy.variable_dtype
-
-  @property
-  def name(self):
-    """Name of the layer (string), set in the constructor."""
-    return self._name
+    if not getattr(_name_scope_unnester_stack, "value", None):
+        _name_scope_unnester_stack.value = [""]
-
-  @property
-  def supports_masking(self):
-    """Whether this layer supports computing a mask using `compute_mask`."""
-    return self._supports_masking
+    _name_scope_unnester_stack.value.append(full_name_scope)
-
-  @supports_masking.setter
-  def supports_masking(self, value):
-    self._supports_masking = value
+    try:
+        full_name_scope = _name_scope_unnester_stack.value[-1]
+        outer_name_scope = _name_scope_unnester_stack.value[-2]
+        # `outer_name_scope` is always a prefix of `full_name_scope`, so strip
+        # it by length. (`str.lstrip` would treat its argument as a set of
+        # characters rather than a prefix and could over-strip, e.g. for
+        # `_name_scope_unnester('a/b/ba')` nested inside
+        # `_name_scope_unnester('a/b')`.)
+        relative_name_scope = full_name_scope[len(outer_name_scope):]
+        relative_name_scope = relative_name_scope.lstrip("/")
+        yield relative_name_scope
+    finally:
+        _name_scope_unnester_stack.value.pop()
-
-  @property
-  def dynamic(self):
-    """Whether the layer is dynamic (eager-only); set in the constructor."""
-    return any(layer._dynamic for layer in self._flatten_layers())
-
-  @property
-  @doc_controls.do_not_doc_inheritable
-  def stateful(self):
-    return any(layer._stateful for layer in self._flatten_layers())
+
+@keras_export("keras.layers.Layer")
+class Layer(tf.Module, version_utils.LayerVersionSelector):
+    """This is the class from which all layers inherit.
-
-  @stateful.setter
-  def stateful(self, value):
-    self._stateful = value
+
+    A layer is a callable object that takes as input one or more tensors and
+    that outputs one or more tensors. It involves *computation*, defined
+    in the `call()` method, and a *state* (weight variables). State can be
+    created in various places, at the convenience of the subclass implementer:
-
-  @property
-  def trainable(self):
-    return self._trainable
+
+    * in `__init__()`;
+    * in the optional `build()` method, which is invoked by the first
+      `__call__()` to the layer, and supplies the shape(s) of the input(s),
+      which may not have been known at initialization time;
+    * in the first invocation of `call()`, with some caveats discussed
+      below.
-
-  @trainable.setter
-  def trainable(self, value):
-    """Sets trainable attribute for the layer and its sublayers.
+
+    Layers are recursively composable: If you assign a Layer instance as an
+    attribute of another Layer, the outer layer will start tracking the weights
+    created by the inner layer.
Nested layers should be instantiated in the + `__init__()` method. - When this value is changed during training (e.g. with a - `tf.keras.callbacks.Callback`) you need to call the parent - `tf.keras.Model.make_train_function` with `force=True` in order to recompile - the training graph. + Users will just instantiate a layer and then treat it as a callable. Args: - value: Boolean with the desired state for the layer's trainable attribute. - """ - for layer in self._flatten_layers(): - layer._trainable = value - - @property - def activity_regularizer(self): - """Optional regularizer function for the output of this layer.""" - return self._activity_regularizer - - @activity_regularizer.setter - def activity_regularizer(self, regularizer): - """Optional regularizer function for the output of this layer.""" - self._activity_regularizer = regularizer - - @property - def input_spec(self): - """`InputSpec` instance(s) describing the input format for this layer. - - When you create a layer subclass, you can set `self.input_spec` to enable - the layer to run input compatibility checks when it is called. - Consider a `Conv2D` layer: it can only be called on a single input tensor - of rank 4. As such, you can set, in `__init__()`: - - ```python - self.input_spec = tf.keras.layers.InputSpec(ndim=4) - ``` - - Now, if you try to call the layer on an input that isn't rank 4 - (for instance, an input of shape `(2,)`, it will raise a nicely-formatted - error: - - ``` - ValueError: Input 0 of layer conv2d is incompatible with the layer: - expected ndim=4, found ndim=1. Full shape received: [2] - ``` - - Input checks that can be specified via `input_spec` include: - - Structure (e.g. a single input, a list of 2 inputs, etc) - - Shape - - Rank (ndim) - - Dtype - - For more information, see `tf.keras.layers.InputSpec`. - - Returns: - A `tf.keras.layers.InputSpec` instance, or nested structure thereof. - """ - return self._input_spec - - @input_spec.setter - # Must be decorated to prevent tracking, since the input_spec can be nested - # InputSpec objects. - @tf.__internal__.tracking.no_automatic_dependency_tracking - def input_spec(self, value): - for v in tf.nest.flatten(value): - if v is not None and not isinstance(v, input_spec.InputSpec): - raise TypeError('Layer input_spec must be an instance of InputSpec. ' - 'Got: {}'.format(v)) - self._input_spec = value - - @property - def trainable_weights(self): - """List of all trainable weights tracked by this layer. - - Trainable weights are updated via gradient descent during training. - - Returns: - A list of trainable variables. - """ - if self.trainable: - children_weights = self._gather_children_attribute('trainable_variables') - return self._dedup_weights(self._trainable_weights + children_weights) - else: - return [] - - @property - def non_trainable_weights(self): - """List of all non-trainable weights tracked by this layer. - - Non-trainable weights are *not* updated during training. They are expected - to be updated manually in `call()`. - - Returns: - A list of non-trainable variables. 
- """ - if self.trainable: - children_weights = self._gather_children_attribute( - 'non_trainable_variables') - non_trainable_weights = self._non_trainable_weights + children_weights - else: - children_weights = self._gather_children_attribute('variables') - non_trainable_weights = ( - self._trainable_weights + self._non_trainable_weights + - children_weights) - return self._dedup_weights(non_trainable_weights) - - @property - def weights(self): - """Returns the list of all layer variables/weights. - - Returns: - A list of variables. - """ - return self.trainable_weights + self.non_trainable_weights - - @property - @doc_controls.do_not_generate_docs - def updates(self): - warnings.warn( - '`layer.updates` will be removed in a future version. ' - 'This property should not be used in TensorFlow 2.0, ' - 'as `updates` are applied automatically.', - stacklevel=2) - return [] - - @property - def losses(self): - """List of losses added using the `add_loss()` API. - - Variable regularization tensors are created when this property is accessed, - so it is eager safe: accessing `losses` under a `tf.GradientTape` will - propagate gradients back to the corresponding variables. + trainable: Boolean, whether the layer's variables should be trainable. + name: String name of the layer. + dtype: The dtype of the layer's computations and weights. Can also be a + `tf.keras.mixed_precision.Policy`, which allows the computation and + weight dtype to differ. Default of `None` means to use + `tf.keras.mixed_precision.global_policy()`, which is a float32 policy + unless set to different value. + dynamic: Set this to `True` if your layer should only be run eagerly, and + should not be used to generate a static computation graph. + This would be the case for a Tree-RNN or a recursive network, + for example, or generally for any layer that manipulates tensors + using Python control flow. If `False`, we assume that the layer can + safely be used to generate a static computation graph. + + Attributes: + name: The name of the layer (string). + dtype: The dtype of the layer's weights. + variable_dtype: Alias of `dtype`. + compute_dtype: The dtype of the layer's computations. Layers automatically + cast inputs to this dtype which causes the computations and output to + also be in this dtype. When mixed precision is used with a + `tf.keras.mixed_precision.Policy`, this will be different than + `variable_dtype`. + dtype_policy: The layer's dtype policy. See the + `tf.keras.mixed_precision.Policy` documentation for details. + trainable_weights: List of variables to be included in backprop. + non_trainable_weights: List of variables that should not be + included in backprop. + weights: The concatenation of the lists trainable_weights and + non_trainable_weights (in this order). + trainable: Whether the layer should be trained (boolean), i.e. whether + its potentially-trainable weights should be returned as part of + `layer.trainable_weights`. + input_spec: Optional (list of) `InputSpec` object(s) specifying the + constraints on inputs that can be accepted by the layer. + + We recommend that descendants of `Layer` implement the following methods: + + * `__init__()`: Defines custom layer attributes, and creates layer weights + that do not depend on input shapes, using `add_weight()`, or other state. + * `build(self, input_shape)`: This method can be used to create weights that + depend on the shape(s) of the input(s), using `add_weight()`, or other + state. 
`__call__()` will automatically build the layer (if it has not been
+      built yet) by calling `build()`.
+    * `call(self, inputs, *args, **kwargs)`: Called in `__call__` after making
+      sure `build()` has been called. `call()` performs the logic of applying
+      the layer to the `inputs`. The first invocation may additionally create
+      state that could not be conveniently created in `build()`; see its
+      docstring for details.
+      Two reserved keyword arguments you can optionally use in `call()` are:
+      - `training` (boolean, whether the call is in inference mode or training
+        mode). See more details in [the layer/model subclassing guide](
+        https://www.tensorflow.org/guide/keras/custom_layers_and_models#privileged_training_argument_in_the_call_method)
+      - `mask` (boolean tensor encoding masked timesteps in the input, used
+        in RNN layers). See more details in
+        [the layer/model subclassing guide](
+        https://www.tensorflow.org/guide/keras/custom_layers_and_models#privileged_mask_argument_in_the_call_method)
+      A typical signature for this method is `call(self, inputs)`, and users
+      can optionally add `training` and `mask` if the layer needs them.
+      `*args` and `**kwargs` are only useful for future extension, when more
+      input parameters are planned to be added.
+    * `get_config(self)`: Returns a dictionary containing the configuration
+      used to initialize this layer. If the keys differ from the arguments
+      in `__init__()`, then override `from_config()` as well.
+      This method is used when saving
+      the layer or a model that contains this layer.

     Examples:

-    >>> class MyLayer(tf.keras.layers.Layer):
-    ...   def call(self, inputs):
-    ...     self.add_loss(tf.abs(tf.reduce_mean(inputs)))
-    ...     return inputs
-    >>> l = MyLayer()
-    >>> l(np.ones((10, 1)))
-    >>> l.losses
-    [1.0]
-
-    >>> inputs = tf.keras.Input(shape=(10,))
-    >>> x = tf.keras.layers.Dense(10)(inputs)
-    >>> outputs = tf.keras.layers.Dense(1)(x)
-    >>> model = tf.keras.Model(inputs, outputs)
-    >>> # Activity regularization.
-    >>> len(model.losses)
-    0
-    >>> model.add_loss(tf.abs(tf.reduce_mean(x)))
-    >>> len(model.losses)
-    1
-
-    >>> inputs = tf.keras.Input(shape=(10,))
-    >>> d = tf.keras.layers.Dense(10, kernel_initializer='ones')
-    >>> x = d(inputs)
-    >>> outputs = tf.keras.layers.Dense(1)(x)
-    >>> model = tf.keras.Model(inputs, outputs)
-    >>> # Weight regularization.
-    >>> model.add_loss(lambda: tf.reduce_mean(d.kernel))
-    >>> model.losses
-    []
-
-    Returns:
-      A list of tensors.
-    """
-    collected_losses = []
-    for layer in self._flatten_layers():
-      # If any eager losses are present, we assume the model to be part of an
-      # eager training loop (either a custom one or the one used when
-      # `run_eagerly=True`) and so we always return just the eager losses.
-      if layer._eager_losses:
-        # Filter placeholder losses that may have been added by revived
-        # layers (see base_layer_utils for details).
-        if (layer._eager_losses[0] is
-            not base_layer_utils.REVIVED_LOSS_PLACEHOLDER):
-          collected_losses.extend(layer._eager_losses)
-      else:
-        collected_losses.extend(layer._losses)
-      for regularizer in layer._callable_losses:
-        loss_tensor = regularizer()
-        if loss_tensor is not None:
-          collected_losses.append(loss_tensor)
-    return collected_losses
-
-  def add_loss(self, losses, **kwargs):
-    """Add loss tensor(s), potentially dependent on layer inputs.
-
-    Some losses (for instance, activity regularization losses) may be dependent
-    on the inputs passed when calling a layer.
Hence, when reusing the same - layer on different inputs `a` and `b`, some entries in `layer.losses` may - be dependent on `a` and some on `b`. This method automatically keeps track - of dependencies. - - This method can be used inside a subclassed layer or model's `call` - function, in which case `losses` should be a Tensor or list of Tensors. - - Example: - - ```python - class MyLayer(tf.keras.layers.Layer): - def call(self, inputs): - self.add_loss(tf.abs(tf.reduce_mean(inputs))) - return inputs - ``` - - This method can also be called directly on a Functional Model during - construction. In this case, any loss Tensors passed to this Model must - be symbolic and be able to be traced back to the model's `Input`s. These - losses become part of the model's topology and are tracked in `get_config`. - - Example: + Here's a basic example: a layer with two variables, `w` and `b`, + that returns `y = w . x + b`. + It shows how to implement `build()` and `call()`. + Variables set as attributes of a layer are tracked as weights + of the layers (in `layer.weights`). ```python - inputs = tf.keras.Input(shape=(10,)) - x = tf.keras.layers.Dense(10)(inputs) - outputs = tf.keras.layers.Dense(1)(x) - model = tf.keras.Model(inputs, outputs) - # Activity regularization. - model.add_loss(tf.abs(tf.reduce_mean(x))) + class SimpleDense(Layer): + + def __init__(self, units=32): + super(SimpleDense, self).__init__() + self.units = units + + def build(self, input_shape): # Create the state of the layer (weights) + w_init = tf.random_normal_initializer() + self.w = tf.Variable( + initial_value=w_init(shape=(input_shape[-1], self.units), + dtype='float32'), + trainable=True) + b_init = tf.zeros_initializer() + self.b = tf.Variable( + initial_value=b_init(shape=(self.units,), dtype='float32'), + trainable=True) + + def call(self, inputs): # Defines the computation from inputs to outputs + return tf.matmul(inputs, self.w) + self.b + + # Instantiates the layer. + linear_layer = SimpleDense(4) + + # This will also call `build(input_shape)` and create the weights. + y = linear_layer(tf.ones((2, 2))) + assert len(linear_layer.weights) == 2 + + # These weights are trainable, so they're listed in `trainable_weights`: + assert len(linear_layer.trainable_weights) == 2 ``` - If this is not the case for your loss (if, for example, your loss references - a `Variable` of one of the model's layers), you can wrap your loss in a - zero-argument lambda. These losses are not tracked as part of the model's - topology since they can't be serialized. - - Example: + Note that the method `add_weight()` offers a shortcut to create weights: ```python - inputs = tf.keras.Input(shape=(10,)) - d = tf.keras.layers.Dense(10) - x = d(inputs) - outputs = tf.keras.layers.Dense(1)(x) - model = tf.keras.Model(inputs, outputs) - # Weight regularization. - model.add_loss(lambda: tf.reduce_mean(d.kernel)) - ``` + class SimpleDense(Layer): - Args: - losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses - may also be zero-argument callables which create a loss tensor. - **kwargs: Used for backwards compatibility only. - """ - kwargs.pop('inputs', None) - if kwargs: - raise TypeError('Unknown keyword arguments: %s' % (kwargs.keys(),)) - - def _tag_callable(loss): - """Tags callable loss tensor as `_unconditional_loss`.""" - if callable(loss): - # We run the loss without autocasting, as regularizers are often - # numerically unstable in float16. 
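A compact sketch of the accepted forms the `add_loss()` docstring above describes, i.e. an input-dependent loss created inside `call()` versus an input-independent loss wrapped in a zero-argument callable (not part of the patch; the layer name and coefficients are invented):

```python
import tensorflow as tf

class ActivityRegularized(tf.keras.layers.Layer):
    def call(self, inputs):
        # Input-dependent loss: created inside `call()`, re-created on
        # every forward pass.
        self.add_loss(0.01 * tf.reduce_sum(tf.square(inputs)))
        return inputs

inputs = tf.keras.Input(shape=(4,))
dense = tf.keras.layers.Dense(2)
outputs = dense(ActivityRegularized()(inputs))
model = tf.keras.Model(inputs, outputs)

# Input-independent loss referencing a variable: wrap it in a
# zero-argument lambda so it is evaluated lazily, as the docstring advises.
model.add_loss(lambda: 0.01 * tf.reduce_sum(tf.square(dense.kernel)))
print(len(model.losses))  # 2
```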
- with autocast_variable.enable_auto_cast_variables(None): - loss = loss() - if loss is None: - return None # Will be filtered out when computing the .losses property - if not tf.is_tensor(loss): - loss = tf.convert_to_tensor( - loss, dtype=backend.floatx()) - loss._unconditional_loss = True # pylint: disable=protected-access - return loss - - losses = tf.nest.flatten(losses) - - callable_losses = [] - eager_losses = [] - symbolic_losses = [] - for loss in losses: - if callable(loss): - callable_losses.append(functools.partial(_tag_callable, loss)) - continue - if loss is None: - continue - if not tf.is_tensor(loss) and not isinstance( - loss, keras_tensor.KerasTensor): - loss = tf.convert_to_tensor( - loss, dtype=backend.floatx()) - # TF Functions should take the eager path. - if ((tf_utils.is_symbolic_tensor(loss) or - isinstance(loss, keras_tensor.KerasTensor)) and - not base_layer_utils.is_in_tf_function()): - symbolic_losses.append(loss) - elif tf.is_tensor(loss): - eager_losses.append(loss) - - self._callable_losses.extend(callable_losses) - - in_call_context = base_layer_utils.call_context().in_call - if eager_losses and not in_call_context: - raise ValueError( - 'Expected a symbolic Tensors or a callable for the loss value. ' - 'Please wrap your loss computation in a zero argument `lambda`.') - - self._eager_losses.extend(eager_losses) - - for symbolic_loss in symbolic_losses: - if getattr(self, '_is_graph_network', False): - self._graph_network_add_loss(symbolic_loss) - else: - # Possible a loss was added in a Layer's `build`. - self._losses.append(symbolic_loss) - - @property - def metrics(self): - """List of metrics added using the `add_metric()` API. + def __init__(self, units=32): + super(SimpleDense, self).__init__() + self.units = units - Example: - - >>> input = tf.keras.layers.Input(shape=(3,)) - >>> d = tf.keras.layers.Dense(2) - >>> output = d(input) - >>> d.add_metric(tf.reduce_max(output), name='max') - >>> d.add_metric(tf.reduce_min(output), name='min') - >>> [m.name for m in d.metrics] - ['max', 'min'] - - Returns: - A list of `Metric` objects. - """ - collected_metrics = [] - for layer in self._flatten_layers(): - if not hasattr(layer, '_metrics_lock'): - continue - with layer._metrics_lock: - collected_metrics.extend(layer._metrics) - return collected_metrics - - def add_metric(self, value, name=None, **kwargs): - """Adds metric tensor to the layer. - - This method can be used inside the `call()` method of a subclassed layer - or model. - - ```python - class MyMetricLayer(tf.keras.layers.Layer): - def __init__(self): - super(MyMetricLayer, self).__init__(name='my_metric_layer') - self.mean = tf.keras.metrics.Mean(name='metric_1') + def build(self, input_shape): + self.w = self.add_weight(shape=(input_shape[-1], self.units), + initializer='random_normal', + trainable=True) + self.b = self.add_weight(shape=(self.units,), + initializer='random_normal', + trainable=True) def call(self, inputs): - self.add_metric(self.mean(inputs)) - self.add_metric(tf.reduce_sum(inputs), name='metric_2') - return inputs + return tf.matmul(inputs, self.w) + self.b ``` - This method can also be called directly on a Functional Model during - construction. In this case, any tensor passed to this Model must - be symbolic and be able to be traced back to the model's `Input`s. These - metrics become part of the model's topology and are tracked when you - save the model via `save()`. 
+    Besides trainable weights, updated via backpropagation during training,
+    layers can also have non-trainable weights. These weights are meant to
+    be updated manually during `call()`. Here's an example layer that computes
+    the running sum of its inputs:

     ```python
-    inputs = tf.keras.Input(shape=(10,))
-    x = tf.keras.layers.Dense(10)(inputs)
-    outputs = tf.keras.layers.Dense(1)(x)
-    model = tf.keras.Model(inputs, outputs)
-    model.add_metric(math_ops.reduce_sum(x), name='metric_1')
-    ```
-
-    Note: Calling `add_metric()` with the result of a metric object on a
-    Functional Model, as shown in the example below, is not supported. This is
-    because we cannot trace the metric result tensor back to the model's
-    inputs.
-
-    ```python
-    inputs = tf.keras.Input(shape=(10,))
-    x = tf.keras.layers.Dense(10)(inputs)
-    outputs = tf.keras.layers.Dense(1)(x)
-    model = tf.keras.Model(inputs, outputs)
-    model.add_metric(tf.keras.metrics.Mean()(x), name='metric_1')
-    ```
+    class ComputeSum(Layer):

-    Args:
-      value: Metric tensor.
-      name: String metric name.
-      **kwargs: Additional keyword arguments for backward compatibility.
-        Accepted values:
-        `aggregation` - When the `value` tensor provided is not the result of
-          calling a `keras.Metric` instance, it will be aggregated by default
-          using a `keras.Metric.Mean`.
-    """
-    kwargs_keys = list(kwargs.keys())
-    if (len(kwargs_keys) > 1 or
-        (len(kwargs_keys) == 1 and kwargs_keys[0] != 'aggregation')):
-      raise TypeError(f'Unknown keyword arguments: {kwargs.keys()}. '
-                      'Expected `aggregation`.')
-
-    from_metric_obj = hasattr(value, '_metric_obj')
-    is_symbolic = isinstance(value, keras_tensor.KerasTensor)
-    in_call_context = base_layer_utils.call_context().in_call
-
-    if name is None and not from_metric_obj:
-      # Eg. `self.add_metric(math_ops.reduce_sum(x))`
-      # In eager mode, we use the metric name to look up a metric. Without a
-      # name, a new Mean metric wrapper will be created on every model/layer
-      # call. So, we raise an error when no name is provided.
-      # We will do the same for symbolic mode for consistency, although a
-      # name will be generated if no name is provided.
-
-      # We will not raise this error in the following use case for the sake
-      # of consistency, as the name is provided in the metric constructor.
-      # mean = metrics.Mean(name='my_metric')
-      # model.add_metric(mean(outputs))
-      raise ValueError('Please provide a name for your metric like '
-                       '`self.add_metric(tf.reduce_sum(inputs), '
-                       'name=\'mean_activation\')`')
-    elif from_metric_obj:
-      name = value._metric_obj.name
-
-    if not in_call_context and not is_symbolic:
-      raise ValueError('Expected a symbolic Tensor for the metric value, '
-                       'received: ' + str(value))
-
-    # If a metric was added in a Layer's `call` or `build`.
-    if in_call_context or not getattr(self, '_is_graph_network', False):
-      # TF Function path should take the eager path.
-
-      # If the given metric is available in the `metrics` list we just update
-      # state on it, otherwise we create a new metric instance and
-      # add it to the `metrics` list.
-      metric_obj = getattr(value, '_metric_obj', None)
-      # Tensors that come from a Metric object already updated the Metric
-      # state.
- should_update_state = not metric_obj - name = metric_obj.name if metric_obj else name - - with self._metrics_lock: - match = self._get_existing_metric(name) - if match: - metric_obj = match - elif metric_obj: - self._metrics.append(metric_obj) - else: - # Build the metric object with the value's dtype if it defines one - metric_obj = metrics_mod.Mean( - name=name, dtype=getattr(value, 'dtype', None)) - self._metrics.append(metric_obj) - - if should_update_state: - metric_obj(value) - else: - if from_metric_obj: - raise ValueError('Using the result of calling a `Metric` object ' - 'when calling `add_metric` on a Functional ' - 'Model is not supported. Please pass the ' - 'Tensor to monitor directly.') - - # Insert layers into the Keras Graph Network. - aggregation = None if from_metric_obj else 'mean' - self._graph_network_add_metric(value, aggregation, name) - - @doc_controls.do_not_doc_inheritable - def add_update(self, updates): - """Add update op(s), potentially dependent on layer inputs. - - Weight updates (for instance, the updates of the moving mean and variance - in a BatchNormalization layer) may be dependent on the inputs passed - when calling a layer. Hence, when reusing the same layer on - different inputs `a` and `b`, some entries in `layer.updates` may be - dependent on `a` and some on `b`. This method automatically keeps track - of dependencies. - - This call is ignored when eager execution is enabled (in that case, variable - updates are run on the fly and thus do not need to be tracked for later - execution). + def __init__(self, input_dim): + super(ComputeSum, self).__init__() + # Create a non-trainable weight. + self.total = tf.Variable(initial_value=tf.zeros((input_dim,)), + trainable=False) - Args: - updates: Update op, or list/tuple of update ops, or zero-arg callable - that returns an update op. A zero-arg callable should be passed in - order to disable running the updates by setting `trainable=False` - on this Layer, when executing in Eager mode. - """ - call_context = base_layer_utils.call_context() - # No need to run updates during Functional API construction. - if call_context.in_keras_graph: - return - - # Callable updates are disabled by setting `trainable=False`. - if not call_context.frozen: - for update in tf.nest.flatten(updates): - if callable(update): - update() # pylint: disable=not-callable - - def set_weights(self, weights): - """Sets the weights of the layer, from NumPy arrays. - - The weights of a layer represent the state of the layer. This function - sets the weight values from numpy arrays. The weight values should be - passed in the order they are created by the layer. Note that the layer's - weights must be instantiated before calling this function, by calling - the layer. - - For example, a `Dense` layer returns a list of two values: the kernel matrix - and the bias vector. These can be used to set the weights of another - `Dense` layer: - - >>> layer_a = tf.keras.layers.Dense(1, - ... kernel_initializer=tf.constant_initializer(1.)) - >>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]])) - >>> layer_a.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - >>> layer_b = tf.keras.layers.Dense(1, - ... 
kernel_initializer=tf.constant_initializer(2.)) - >>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]])) - >>> layer_b.get_weights() - [array([[2.], - [2.], - [2.]], dtype=float32), array([0.], dtype=float32)] - >>> layer_b.set_weights(layer_a.get_weights()) - >>> layer_b.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - - Args: - weights: a list of NumPy arrays. The number - of arrays and their shape must match - number of the dimensions of the weights - of the layer (i.e. it should match the - output of `get_weights`). - - Raises: - ValueError: If the provided weights list does not match the - layer's specifications. - """ - params = self.weights - - expected_num_weights = 0 - for param in params: - if isinstance(param, base_layer_utils.TrackableWeightHandler): - expected_num_weights += param.num_tensors - else: - expected_num_weights += 1 - - if expected_num_weights != len(weights): - raise ValueError( - 'You called `set_weights(weights)` on layer "%s" ' - 'with a weight list of length %s, but the layer was ' - 'expecting %s weights. Provided weights: %s...' % - (self.name, len(weights), expected_num_weights, str(weights)[:50])) - - weight_index = 0 - weight_value_tuples = [] - for param in params: - if isinstance(param, base_layer_utils.TrackableWeightHandler): - num_tensors = param.num_tensors - tensors = weights[weight_index:weight_index + num_tensors] - param.set_weights(tensors) - weight_index += num_tensors - else: - weight = weights[weight_index] - weight_shape = weight.shape if hasattr(weight, 'shape') else () - ref_shape = param.shape - if not ref_shape.is_compatible_with(weight_shape): - raise ValueError( - f'Layer {self.name} weight shape {ref_shape} ' - 'is not compatible with provided weight ' - f'shape {weight_shape}.') - weight_value_tuples.append((param, weight)) - weight_index += 1 - - backend.batch_set_value(weight_value_tuples) - - # Perform any layer defined finalization of the layer state. - for layer in self._flatten_layers(): - layer.finalize_state() - - def get_weights(self): - """Returns the current weights of the layer, as NumPy arrays. - - The weights of a layer represent the state of the layer. This function - returns both trainable and non-trainable weight values associated with this - layer as a list of NumPy arrays, which can in turn be used to load state - into similarly parameterized layers. - - For example, a `Dense` layer returns a list of two values: the kernel matrix - and the bias vector. These can be used to set the weights of another - `Dense` layer: - - >>> layer_a = tf.keras.layers.Dense(1, - ... kernel_initializer=tf.constant_initializer(1.)) - >>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]])) - >>> layer_a.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - >>> layer_b = tf.keras.layers.Dense(1, - ... kernel_initializer=tf.constant_initializer(2.)) - >>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]])) - >>> layer_b.get_weights() - [array([[2.], - [2.], - [2.]], dtype=float32), array([0.], dtype=float32)] - >>> layer_b.set_weights(layer_a.get_weights()) - >>> layer_b.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - - Returns: - Weights values as a list of NumPy arrays. 
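The zero-argument-callable pattern recommended by the `add_update` docstring a few hunks above (so the update can be skipped when the layer is frozen) is easiest to see in a sketch. This is illustrative only and not part of the patch; `RunningMax` is an invented layer:

```python
import tensorflow as tf

class RunningMax(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.maximum = self.add_weight(
            name="maximum", shape=(), trainable=False,
            initializer=tf.keras.initializers.Constant(float("-inf")))

    def call(self, inputs):
        # A zero-argument callable: executed during the call when the layer
        # is not frozen, skipped entirely when `trainable=False`.
        self.add_update(
            lambda: self.maximum.assign(
                tf.maximum(self.maximum, tf.reduce_max(inputs))))
        return inputs

layer = RunningMax()
_ = layer(tf.constant([1.0, 5.0, 3.0]))
print(float(layer.maximum))  # 5.0
```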
- """ - weights = self.weights - output_weights = [] - for weight in weights: - if isinstance(weight, base_layer_utils.TrackableWeightHandler): - output_weights.extend(weight.get_tensors()) - else: - output_weights.append(weight) - return backend.batch_get_value(output_weights) - - @doc_controls.do_not_generate_docs - def finalize_state(self): - """Finalizes the layers state after updating layer weights. - - This function can be subclassed in a layer and will be called after updating - a layer weights. It can be overridden to finalize any additional layer state - after a weight update. - - This function will be called after weights of a layer have been restored - from a loaded model. - """ - pass - - @doc_controls.do_not_doc_inheritable - def get_input_mask_at(self, node_index): - """Retrieves the input mask tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. - - Returns: - A mask tensor - (or list of tensors if the layer has multiple inputs). - """ - inputs = self.get_input_at(node_index) - if isinstance(inputs, list): - return [getattr(x, '_keras_mask', None) for x in inputs] - else: - return getattr(inputs, '_keras_mask', None) - - @doc_controls.do_not_doc_inheritable - def get_output_mask_at(self, node_index): - """Retrieves the output mask tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. - - Returns: - A mask tensor - (or list of tensors if the layer has multiple outputs). - """ - output = self.get_output_at(node_index) - if isinstance(output, list): - return [getattr(x, '_keras_mask', None) for x in output] - else: - return getattr(output, '_keras_mask', None) - - @property - @doc_controls.do_not_doc_inheritable - def input_mask(self): - """Retrieves the input mask tensor(s) of a layer. - - Only applicable if the layer has exactly one inbound node, - i.e. if it is connected to one incoming layer. - - Returns: - Input mask tensor (potentially None) or list of input - mask tensors. - - Raises: - AttributeError: if the layer is connected to - more than one incoming layers. - """ - inputs = self.input - if isinstance(inputs, list): - return [getattr(x, '_keras_mask', None) for x in inputs] - else: - return getattr(inputs, '_keras_mask', None) - - @property - @doc_controls.do_not_doc_inheritable - def output_mask(self): - """Retrieves the output mask tensor(s) of a layer. - - Only applicable if the layer has exactly one inbound node, - i.e. if it is connected to one incoming layer. - - Returns: - Output mask tensor (potentially None) or list of output - mask tensors. - - Raises: - AttributeError: if the layer is connected to - more than one incoming layers. - """ - output = self.output - if isinstance(output, list): - return [getattr(x, '_keras_mask', None) for x in output] - else: - return getattr(output, '_keras_mask', None) - - @doc_controls.do_not_doc_inheritable - def get_input_shape_at(self, node_index): - """Retrieves the input shape(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. - - Returns: - A shape tuple - (or list of shape tuples if the layer has multiple inputs). 
- - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'input_shapes', - 'input shape') - - @doc_controls.do_not_doc_inheritable - def get_output_shape_at(self, node_index): - """Retrieves the output shape(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. - - Returns: - A shape tuple - (or list of shape tuples if the layer has multiple outputs). - - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'output_shapes', - 'output shape') - - @doc_controls.do_not_doc_inheritable - def get_input_at(self, node_index): - """Retrieves the input tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first input node of the layer. - - Returns: - A tensor (or list of tensors if the layer has multiple inputs). - - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'input_tensors', - 'input') - - @doc_controls.do_not_doc_inheritable - def get_output_at(self, node_index): - """Retrieves the output tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first output node of the layer. - - Returns: - A tensor (or list of tensors if the layer has multiple outputs). - - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'output_tensors', - 'output') - - @property - def input(self): - """Retrieves the input tensor(s) of a layer. - - Only applicable if the layer has exactly one input, - i.e. if it is connected to one incoming layer. - - Returns: - Input tensor or list of input tensors. - - Raises: - RuntimeError: If called in Eager mode. - AttributeError: If no inbound nodes are found. - """ - if not self._inbound_nodes: - raise AttributeError('Layer ' + self.name + - ' is not connected, no input to return.') - return self._get_node_attribute_at_index(0, 'input_tensors', 'input') - - @property - def output(self): - """Retrieves the output tensor(s) of a layer. - - Only applicable if the layer has exactly one output, - i.e. if it is connected to one incoming layer. - - Returns: - Output tensor or list of output tensors. - - Raises: - AttributeError: if the layer is connected to more than one incoming - layers. - RuntimeError: if called in Eager mode. - """ - if not self._inbound_nodes: - raise AttributeError('Layer ' + self.name + ' has no inbound nodes.') - return self._get_node_attribute_at_index(0, 'output_tensors', 'output') - - @property - @doc_controls.do_not_doc_inheritable - def input_shape(self): - """Retrieves the input shape(s) of a layer. - - Only applicable if the layer has exactly one input, - i.e. if it is connected to one incoming layer, or if all inputs - have the same shape. - - Returns: - Input shape, as an integer shape tuple - (or list of shape tuples, one tuple per input tensor). - - Raises: - AttributeError: if the layer has no defined input_shape. - RuntimeError: if called in Eager mode. - """ - if not self._inbound_nodes: - raise AttributeError(f'The layer "{self.name}" has never been called ' - 'and thus has no defined input shape. 
Note that the ' - '`input_shape` property is only available for ' - 'Functional and Sequential models.') - all_input_shapes = set( - [str(node.input_shapes) for node in self._inbound_nodes]) - if len(all_input_shapes) == 1: - return self._inbound_nodes[0].input_shapes - else: - raise AttributeError('The layer "' + str(self.name) + - '" has multiple inbound nodes, ' - 'with different input shapes. Hence ' - 'the notion of "input shape" is ' - 'ill-defined for the layer. ' - 'Use `get_input_shape_at(node_index)` ' - 'instead.') - - def count_params(self): - """Count the total number of scalars composing the weights. - - Returns: - An integer count. - - Raises: - ValueError: if the layer isn't yet built - (in which case its weights aren't yet defined). - """ - if not self.built: - if getattr(self, '_is_graph_network', False): - with tf_utils.maybe_init_scope(self): - self._maybe_build(self.inputs) - else: - raise ValueError('You tried to call `count_params` ' - f'on layer {self.name}' - ', but the layer isn\'t built. ' - 'You can build it manually via: ' - f'`{self.name}.build(batch_input_shape)`.') - return layer_utils.count_params(self.weights) - - @property - @doc_controls.do_not_doc_inheritable - def output_shape(self): - """Retrieves the output shape(s) of a layer. - - Only applicable if the layer has one output, - or if all outputs have the same shape. - - Returns: - Output shape, as an integer shape tuple - (or list of shape tuples, one tuple per output tensor). - - Raises: - AttributeError: if the layer has no defined output shape. - RuntimeError: if called in Eager mode. - """ - if not self._inbound_nodes: - raise AttributeError(f'The layer "{self.name}" has never been called ' - 'and thus has no defined output shape.') - all_output_shapes = set( - [str(node.output_shapes) for node in self._inbound_nodes]) - if len(all_output_shapes) == 1: - return self._inbound_nodes[0].output_shapes - else: - raise AttributeError('The layer "%s"' - ' has multiple inbound nodes, ' - 'with different output shapes. Hence ' - 'the notion of "output shape" is ' - 'ill-defined for the layer. ' - 'Use `get_output_shape_at(node_index)` ' - 'instead.' % self.name) - - @property - def dtype_policy(self): - """The dtype policy associated with this layer. - - This is an instance of a `tf.keras.mixed_precision.Policy`. - """ - return self._dtype_policy + def call(self, inputs): + self.total.assign_add(tf.reduce_sum(inputs, axis=0)) + return self.total - @property - def compute_dtype(self): - """The dtype of the layer's computations. + my_sum = ComputeSum(2) + x = tf.ones((2, 2)) - This is equivalent to `Layer.dtype_policy.compute_dtype`. Unless - mixed precision is used, this is the same as `Layer.dtype`, the dtype of - the weights. + y = my_sum(x) + print(y.numpy()) # [2. 2.] - Layers automatically cast their inputs to the compute dtype, which causes - computations and the output to be in the compute dtype as well. This is done - by the base Layer class in `Layer.__call__`, so you do not have to insert - these casts if implementing your own layer. + y = my_sum(x) + print(y.numpy()) # [4. 4.] - Layers often perform certain internal computations in higher precision when - `compute_dtype` is float16 or bfloat16 for numeric stability. The output - will still typically be float16 or bfloat16 in such cases. + assert my_sum.weights == [my_sum.total] + assert my_sum.non_trainable_weights == [my_sum.total] + assert my_sum.trainable_weights == [] + ``` - Returns: - The layer's compute dtype. 
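A small sketch of the variable-dtype versus compute-dtype split documented above, assuming the standard `mixed_float16` policy (illustrative, not part of the patch):

```python
import tensorflow as tf

# With a mixed policy, variables stay in float32 while computations
# (and therefore outputs) run in float16.
layer = tf.keras.layers.Dense(
    2, dtype=tf.keras.mixed_precision.Policy("mixed_float16"))
y = layer(tf.ones((1, 4)))
print(layer.kernel.dtype)  # float32  (variable_dtype)
print(y.dtype)             # float16  (compute_dtype)
```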
+ For more information about creating layers, see the guide + [Making new Layers and Models via subclassing]( + https://www.tensorflow.org/guide/keras/custom_layers_and_models) """ - return self._dtype_policy.compute_dtype - - @property - def variable_dtype(self): - """Alias of `Layer.dtype`, the dtype of the weights.""" - return self.dtype - @property - @doc_controls.do_not_doc_inheritable - def inbound_nodes(self): - """Return Functional API nodes upstream of this layer.""" - return self._inbound_nodes - - @property - @doc_controls.do_not_doc_inheritable - def outbound_nodes(self): - """Return Functional API nodes downstream of this layer.""" - return self._outbound_nodes - - ############################################################################## - # Methods & attributes below are public aliases of other methods. # - ############################################################################## + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__( + self, trainable=True, name=None, dtype=None, dynamic=False, **kwargs + ): + self._instrument_layer_creation() + + # These properties should be set by the user via keyword arguments. + # note that 'dtype', 'input_shape' and 'batch_input_shape' + # are only applicable to input layers: do not pass these keywords + # to non-input layers. + allowed_kwargs = { + "input_dim", + "input_shape", + "batch_input_shape", + "batch_size", + "weights", + "activity_regularizer", + "autocast", + "implementation", + } + # Validate optional keyword arguments. + generic_utils.validate_kwargs(kwargs, allowed_kwargs) + + # Mutable properties + # Indicates whether the layer's weights are updated during training + # and whether the layer's updates are run during training. + if not ( + isinstance(trainable, bool) + or ( + isinstance(trainable, (tf.Tensor, tf.Variable)) + and trainable.dtype is tf.bool + ) + ): + raise TypeError( + "Expected `trainable` argument to be a boolean, " + f"but got: {trainable}" + ) + self._trainable = trainable + # A stateful layer is a layer whose updates are run during inference + # too, for instance stateful RNNs. + self._stateful = False + # Indicates whether `build` needs to be called upon layer call, to + # create the layer's weights. (Note that the first call() may also + # create weights, independent of build().) + self.built = False + # Provides information about which inputs are compatible with the layer. + self._input_spec = None + + # SavedModel-related attributes. + # Record the build input shape for loading purposes. + # TODO(kathywu): Move this to Layer._set_save_spec once cl/290121460 is + # submitted. + self._build_input_shape = None + self._saved_model_inputs_spec = None + self._saved_model_arg_spec = None + + # `Layer.compute_mask` will be called at the end of `Layer.__call__` if + # `Layer.compute_mask` is overridden, or if the `Layer` subclass sets + # `self.supports_masking=True`. + self._supports_masking = not generic_utils.is_default(self.compute_mask) + + self._init_set_name(name) + self._activity_regularizer = regularizers.get( + kwargs.pop("activity_regularizer", None) + ) + self._maybe_create_attribute("_trainable_weights", []) + self._maybe_create_attribute("_non_trainable_weights", []) + self._updates = [] + # Object to store all thread local layer properties. + self._thread_local = threading.local() + # A list of zero-argument lambdas which return Tensors, used for + # variable regularizers. 
+        self._callable_losses = []
+        # A list of symbolic Tensors containing activity regularizers and
+        # losses manually added through `add_loss` in graph-building mode.
+        self._losses = []
+        # A list of metric instances corresponding to the symbolic metric
+        # tensors added using the `add_metric` API.
+        self._metrics = []
+        # Ensures the same metric is not added multiple times in
+        # `MirroredStrategy`.
+        self._metrics_lock = threading.Lock()
+
+        # Note that models also have a dtype policy, as they are layers. For
+        # functional models, the policy is only used in Model.compile, which
+        # wraps the optimizer with a LossScaleOptimizer if the policy name is
+        # "mixed_float16". Subclassed models additionally use the policy's
+        # compute and variable dtypes, like any ordinary layer.
+        self._set_dtype_policy(dtype)
+        # Boolean indicating whether the layer automatically casts its inputs
+        # to the layer's compute_dtype.
+        self._autocast = kwargs.get(
+            "autocast", base_layer_utils.v2_dtype_behavior_enabled()
+        )
+
+        # Tracks `TrackableDataStructure`s, `Module`s, and `Layer`s.
+        # Ordered by when the object was assigned as an attr.
+        # Entries are unique.
+        self._maybe_create_attribute("_self_tracked_trackables", [])
+
+        # These lists will be filled via successive calls
+        # to self._add_inbound_node().
+        # Used in symbolic mode only, only in conjunction with graph-networks.
+        self._inbound_nodes_value = []
+        self._outbound_nodes_value = []
+
+        self._init_call_fn_args()
+
+        # Whether the `call` method can be used to build a TF graph without
+        # issues. This attribute has no effect if the model is created using
+        # the Functional API. Instead, `model.dynamic` is determined based on
+        # the internal layers.
+        if not isinstance(dynamic, bool):
+            raise TypeError(
+                "Expected `dynamic` argument to be a boolean, "
+                f"but got: {dynamic}"
+            )
+        self._dynamic = dynamic
+
+        # Manage input shape information if passed.
+        if "input_dim" in kwargs and "input_shape" not in kwargs:
+            # Backwards compatibility: alias 'input_dim' to 'input_shape'.
+            kwargs["input_shape"] = (kwargs["input_dim"],)
+        if "input_shape" in kwargs or "batch_input_shape" in kwargs:
+            # In this case we will later create an input layer
+            # to insert before the current layer.
+            if "batch_input_shape" in kwargs:
+                batch_input_shape = tuple(kwargs["batch_input_shape"])
+            elif "input_shape" in kwargs:
+                if "batch_size" in kwargs:
+                    batch_size = kwargs["batch_size"]
+                else:
+                    batch_size = None
+                batch_input_shape = (batch_size,) + tuple(kwargs["input_shape"])
+            self._batch_input_shape = batch_input_shape
+
+        # Manage initial weight values if passed.
+        self._initial_weights = kwargs.get("weights", None)
+
+        # Whether the layer will track any layers that are set as attributes
+        # on itself as sub-layers; the weights from the sub-layers will be
+        # included in the parent layer's variables() as well. Defaults to
+        # `True`, which means auto tracking is turned on. Certain subclasses
+        # might want to turn it off, like the Sequential model.
+        self._auto_track_sub_layers = True
+
+        # For backwards compat reasons, most built-in layers do not guarantee
+        # that they will 100% preserve the structure of input args when saving
+        # / loading configs. E.g. they may un-nest an arg that is
+        # a list with one element.
+        self._preserve_input_structure_in_config = False
+
+        # Save outer name scope at layer declaration so that it is preserved
+        # at the actual layer construction.
+        self._name_scope_on_declaration = tf.get_current_name_scope()
+
+        # Save the temp regularization losses created in the DTensor use
+        # case. When DTensor is enabled, we will first create a
+        # LazyInitVariable and then a DVariable with the proper layout
+        # afterward. The weight regularization loss has to be created
+        # against the DVariable as well.
+        self._captured_weight_regularizer = []
+
+    @tf.__internal__.tracking.no_automatic_dependency_tracking
+    @generic_utils.default
+    def build(self, input_shape):
+        """Creates the variables of the layer (for subclass implementers).
+
+        This is a method that implementers of subclasses of `Layer` or `Model`
+        can override if they need a state-creation step in-between
+        layer instantiation and layer call. It is invoked automatically before
+        the first execution of `call()`.
+
+        This is typically used to create the weights of `Layer` subclasses
+        (at the discretion of the subclass implementer).
+
+        Args:
+          input_shape: Instance of `TensorShape`, or list of instances of
+            `TensorShape` if the layer expects a list of inputs
+            (one instance per input).
+        """
+        self._build_input_shape = input_shape
+        self.built = True
+
+    @doc_controls.for_subclass_implementers
+    def call(self, inputs, *args, **kwargs):
+        """This is where the layer's logic lives.
+
+        The `call()` method may not create state (except in its first
+        invocation, wrapping the creation of variables or other resources in
+        `tf.init_scope()`). It is recommended to create state, including
+        `tf.Variable` instances and nested `Layer` instances,
+        in `__init__()`, or in the `build()` method that is
+        called automatically before `call()` executes for the first time.
+
+        Args:
+          inputs: Input tensor, or dict/list/tuple of input tensors.
+            The first positional `inputs` argument is subject to special
+            rules:
+            - `inputs` must be explicitly passed. A layer cannot have zero
+              arguments, and `inputs` cannot be provided via the default value
+              of a keyword argument.
+            - NumPy array or Python scalar values in `inputs` get cast as
+              tensors.
+            - Keras mask metadata is only collected from `inputs`.
+            - Layers are built (`build(input_shape)` method)
+              using shape info from `inputs` only.
+            - `input_spec` compatibility is only checked against `inputs`.
+            - Mixed precision input casting is only applied to `inputs`.
+              If a layer has tensor arguments in `*args` or `**kwargs`, their
+              casting behavior in mixed precision should be handled manually.
+            - The SavedModel input specification is generated using `inputs`
+              only.
+            - Integration with various ecosystem packages like TFMOT, TFLite,
+              TF.js, etc. is only supported for `inputs` and not for tensors
+              in positional and keyword arguments.
+          *args: Additional positional arguments. May contain tensors,
+            although this is not recommended, for the reasons above.
+          **kwargs: Additional keyword arguments. May contain tensors,
+            although this is not recommended, for the reasons above.
+            The following optional keyword arguments are reserved:
+            - `training`: Boolean scalar tensor or Python boolean indicating
+              whether the `call` is meant for training or inference.
+            - `mask`: Boolean input mask. If the layer's `call()` method takes
+              a `mask` argument, its default value will be set to the mask
+              generated for `inputs` by the previous layer (if `inputs` did
+              come from a layer that generated a corresponding mask, i.e. if
+              it came from a Keras layer with masking support).
+
+        Returns:
+          A tensor or list/tuple of tensors.
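A minimal sketch of the reserved `training` keyword described in the `call()` docstring above (not part of the patch; the layer name and noise level are invented):

```python
import tensorflow as tf

class NoiseWhenTraining(tf.keras.layers.Layer):
    # Uses the reserved `training` keyword: noise is added only in
    # training mode, and the argument is resolved by `__call__` when
    # the caller does not pass it explicitly.
    def call(self, inputs, training=None):
        if training:
            return inputs + tf.random.normal(tf.shape(inputs), stddev=0.1)
        return inputs

layer = NoiseWhenTraining()
x = tf.zeros((2, 3))
assert bool(tf.reduce_all(layer(x, training=False) == 0.0))
```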
+ """ + return inputs - @property - @doc_controls.do_not_generate_docs - def variables(self): - """Returns the list of all layer variables/weights. + @doc_controls.for_subclass_implementers + def add_weight( + self, + name=None, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=None, + constraint=None, + use_resource=None, + synchronization=tf.VariableSynchronization.AUTO, + aggregation=tf.VariableAggregation.NONE, + **kwargs, + ): + """Adds a new variable to the layer. + + Args: + name: Variable name. + shape: Variable shape. Defaults to scalar if unspecified. + dtype: The type of the variable. Defaults to `self.dtype`. + initializer: Initializer instance (callable). + regularizer: Regularizer instance (callable). + trainable: Boolean, whether the variable should be part of the layer's + "trainable_variables" (e.g. variables, biases) + or "non_trainable_variables" (e.g. BatchNorm mean and variance). + Note that `trainable` cannot be `True` if `synchronization` + is set to `ON_READ`. + constraint: Constraint instance (callable). + use_resource: Whether to use a `ResourceVariable` or not. + See [this guide]( + https://www.tensorflow.org/guide/migrate/tf1_vs_tf2#resourcevariables_instead_of_referencevariables) + for more information. + synchronization: Indicates when a distributed a variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. By default the synchronization is set + to `AUTO` and the current `DistributionStrategy` chooses when to + synchronize. If `synchronization` is set to `ON_READ`, `trainable` + must not be set to `True`. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + **kwargs: Additional keyword arguments. Accepted values are `getter`, + `collections`, `experimental_autocast` and `caching_device`. + + Returns: + The variable created. + + Raises: + ValueError: When giving unsupported dtype and no initializer or when + trainable has been set to True with synchronization set as + `ON_READ`. + """ + if shape is None: + shape = () + kwargs.pop("partitioner", None) # Ignored. + # Validate optional keyword arguments. + for kwarg in kwargs: + if kwarg not in [ + "collections", + "experimental_autocast", + "caching_device", + "getter", + "layout", + "experimental_enable_variable_lifting", + ]: + raise TypeError("Unknown keyword argument:", kwarg) + collections_arg = kwargs.pop("collections", None) + # 'experimental_autocast' can be set to False by the caller to indicate + # an AutoCastVariable should never be created. + autocast = kwargs.pop("experimental_autocast", True) + # See the docstring for tf.Variable about the details for + # caching_device. + caching_device = kwargs.pop("caching_device", None) + + layout = kwargs.pop("layout", None) + # Specially handling of auto layout fetch, based on the variable name + # and attribute name. For built-in keras layers, usually the variable + # name, eg 'kernel', will match with a 'kernel_layout' attribute name on + # the instance. We will try to do this auto fetch if layout is not + # explicitly specified. This is mainly a quick workaround for not + # applying too many interface change to built-in layers, until DTensor + # is a public API. Also see dtensor.utils.allow_initializer_layout for + # more details. + # TODO(scottzhu): Remove this once dtensor is public to end user. 
+ if not layout and name: + layout = getattr(self, name + "_layout", None) + + if dtype is None: + dtype = self.dtype or backend.floatx() + dtype = tf.as_dtype(dtype) + if self._dtype_policy.variable_dtype is None: + # The policy is "_infer", so we infer the policy from the variable + # dtype. + self._set_dtype_policy(policy.Policy(dtype.base_dtype.name)) + initializer = initializers.get(initializer) + regularizer = regularizers.get(regularizer) + constraint = constraints.get(constraint) + + if synchronization == tf.VariableSynchronization.ON_READ: + if trainable: + raise ValueError( + "Synchronization value can be set to " + "VariableSynchronization.ON_READ only for non-trainable " + "variables. You have specified trainable=True and " + "synchronization=VariableSynchronization.ON_READ." + ) + else: + # Set trainable to False when the variable is to be synced on + # read. + trainable = False + elif trainable is None: + trainable = True + + # Initialize the variable when no initializer is provided. + if initializer is None: + # If dtype is DT_FLOAT, provide a uniform unit scaling initializer + if dtype.is_floating: + initializer = initializers.get("glorot_uniform") + # If dtype is DT_INT/DT_UINT, provide a default value `zero` + # If dtype is DT_BOOL, provide a default value `FALSE` + elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool: + initializer = initializers.get("zeros") + # NOTE: Do we need to support handling DT_STRING and DT_COMPLEX + # here? + elif "getter" not in kwargs: + # When `getter` is specified, it's possibly fine for + # `initializer` to be None since it's up to the custom `getter` + # to raise an error in case it indeed needs `initializer`. + raise ValueError( + f"An initializer for variable {name} of type " + f"{dtype.base_dtype} is required for layer " + f"{self.name}. Received: {initializer}." + ) + + getter = kwargs.pop("getter", base_layer_utils.make_variable) + if ( + autocast + and self._dtype_policy.compute_dtype + != self._dtype_policy.variable_dtype + and dtype.is_floating + ): + old_getter = getter + + # Wrap the variable constructor to return an AutoCastVariable. + def getter(*args, **kwargs): + variable = old_getter(*args, **kwargs) + return autocast_variable.create_autocast_variable(variable) + + # Also, the caching_device does not work with the mixed precision + # API; disable it if it is specified. + # TODO(b/142020079): Re-enable it once the bug is fixed. + if caching_device is not None: + tf_logging.warning( + "`caching_device` does not work with mixed precision API. " + "Ignoring user specified `caching_device`." + ) + caching_device = None + if layout: + getter = functools.partial(getter, layout=layout) + + variable = self._add_variable_with_custom_getter( + name=name, + shape=shape, + # TODO(allenl): a `make_variable` equivalent should be added as a + # `Trackable` method. + getter=getter, + # Manage errors in Layer rather than Trackable. + overwrite=True, + initializer=initializer, + dtype=dtype, + constraint=constraint, + trainable=trainable, + use_resource=use_resource, + collections=collections_arg, + synchronization=synchronization, + aggregation=aggregation, + caching_device=caching_device, + ) + if regularizer is not None: + # TODO(fchollet): in the future, this should be handled at the + # level of variable creation, and weight regularization losses + # should be variable attributes.
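The `add_weight()` defaults traced through above (glorot_uniform for floats, zeros for integer/bool dtypes, `trainable` forced to `False` under `ON_READ`) can be seen in a short sketch; the `Scaler` layer and its weight names are illustrative, not part of the diff:

```python
import tensorflow as tf

class Scaler(tf.keras.layers.Layer):
    def build(self, input_shape):
        # Trainable by default; the l2 penalty is created lazily and
        # surfaces through `self.losses`.
        self.scale = self.add_weight(
            name="scale", shape=(), initializer="ones",
            regularizer=tf.keras.regularizers.l2(0.01),
        )
        # ON_READ variables must be non-trainable; passing trainable=True
        # here would raise the ValueError shown above.
        self.calls = self.add_weight(
            name="calls", shape=(), dtype=tf.int64, initializer="zeros",
            synchronization=tf.VariableSynchronization.ON_READ,
            aggregation=tf.VariableAggregation.SUM,
        )

    def call(self, inputs):
        self.calls.assign_add(1)
        return inputs * self.scale

layer = Scaler()
_ = layer(tf.ones((2, 3)))
print(len(layer.losses), int(layer.calls))  # 1 1
```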
+ name_in_scope = variable.name[: variable.name.find(":")] + self._handle_weight_regularization( + name_in_scope, variable, regularizer + ) + if base_layer_utils.is_split_variable(variable): + for v in variable: + backend.track_variable(v) + if trainable: + self._trainable_weights.append(v) + else: + self._non_trainable_weights.append(v) + else: + backend.track_variable(variable) + if trainable: + self._trainable_weights.append(variable) + else: + self._non_trainable_weights.append(variable) + return variable + + def __new__(cls, *args, **kwargs): + # Generate a config to be returned by default by `get_config()`. + arg_names = tf_inspect.getfullargspec(cls.__init__).args + kwargs.update(dict(zip(arg_names[1 : len(args) + 1], args))) + instance = super(Layer, cls).__new__(cls, *args, **kwargs) + # For safety, we only rely on auto-configs for a small set of + # serializable types. + supported_types = (str, int, float, bool, type(None)) + try: + flat_arg_values = tf.nest.flatten(kwargs) + auto_get_config = True + for value in flat_arg_values: + if not isinstance(value, supported_types): + auto_get_config = False + break + except TypeError: + auto_get_config = False + try: + instance._auto_get_config = auto_get_config + if auto_get_config: + instance._auto_config = serialization_lib.Config(**kwargs) + except RecursionError: + # Setting an instance attribute in __new__ has the potential + # to trigger an infinite recursion if a subclass overrides + # `__setattr__` in an unsafe way. + pass + return instance + + @generic_utils.default + def get_config(self): + """Returns the config of the layer. + + A layer config is a Python dictionary (serializable) + containing the configuration of a layer. + The same layer can be reinstantiated later + (without its trained weights) from this configuration. + + The config of a layer does not include connectivity + information, nor the layer class name. These are handled + by `Network` (one layer of abstraction above). + + Note that `get_config()` does not guarantee to return a fresh copy of + the dict every time it is called. Callers should make a copy of the + returned dict if they want to modify it. + + Returns: + Python dictionary. + """ + config = { + "name": self.name, + "trainable": self.trainable, + } + config["dtype"] = policy.serialize(self._dtype_policy) + if hasattr(self, "_batch_input_shape"): + config["batch_input_shape"] = self._batch_input_shape + + if not generic_utils.is_default(self.get_config): + # In this case the subclass implements get_config() + return config + + # In this case the subclass doesn't implement get_config(): + # Let's see if we can autogenerate it. + if getattr(self, "_auto_get_config", False): + xtra_args = set(config.keys()) + config.update(self._auto_config.config) + # Remove args not explicitly supported + argspec = tf_inspect.getfullargspec(self.__init__) + if argspec.varkw != "kwargs": + for key in xtra_args - xtra_args.intersection(argspec.args[1:]): + config.pop(key, None) + return config + else: + raise NotImplementedError( + textwrap.dedent( + f""" + Layer {self.__class__.__name__} was created by passing + non-serializable argument values in `__init__()`, + and therefore the layer must override `get_config()` in + order to be serializable. Please implement `get_config()`.
+ + Example: + + class CustomLayer(keras.layers.Layer): + def __init__(self, arg1, arg2, **kwargs): + super().__init__(**kwargs) + self.arg1 = arg1 + self.arg2 = arg2 + + def get_config(self): + config = super().get_config() + config.update({{ + "arg1": self.arg1, + "arg2": self.arg2, + }}) + return config""" + ) + ) + + @classmethod + def from_config(cls, config): + """Creates a layer from its config. + + This method is the reverse of `get_config`, + capable of instantiating the same layer from the config + dictionary. It does not handle layer connectivity + (handled by Network), nor weights (handled by `set_weights`). + + Args: + config: A Python dictionary, typically the + output of get_config. + + Returns: + A layer instance. + """ + try: + return cls(**config) + except Exception as e: + raise TypeError( + f"Error when deserializing class '{cls.__name__}' using " + f"config={config}.\n\nException encountered: {e}" + ) + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer. + + This method will cause the layer's state to be built, if that has not + happened before. This requires that the layer will later be used with + inputs that match the input shape provided here. + + Args: + input_shape: Shape tuple (tuple of integers) or `tf.TensorShape`, + or structure of shape tuples / `tf.TensorShape` instances + (one per output tensor of the layer). + Shape tuples can include None for free dimensions, + instead of an integer. + + Returns: + A `tf.TensorShape` instance + or structure of `tf.TensorShape` instances. + """ + if tf.executing_eagerly(): + # In this case we build the model first in order to do shape + # inference. This is acceptable because the framework only calls + # `compute_output_shape` on shape values that the layer would later + # be built for. It would however cause issues in case a user + # attempts to use `compute_output_shape` manually with shapes that + # are incompatible with the shape the Layer will be called on (these + # users will have to implement `compute_output_shape` themselves). + self._maybe_build(input_shape) + graph_name = str(self.name) + "_scratch_graph" + with tf.__internal__.FuncGraph(graph_name).as_default(): + input_shape = tf_utils.convert_shapes( + input_shape, to_tuples=False + ) + + def _make_placeholder_like(shape): + ph = backend.placeholder(shape=shape, dtype=self.dtype) + ph._keras_mask = None + return ph + + inputs = tf.nest.map_structure( + _make_placeholder_like, input_shape + ) + try: + outputs = self(inputs, training=False) + except TypeError as e: + raise NotImplementedError( + "We could not automatically infer the static shape of " + "the layer's output. Please implement the " + "`compute_output_shape` method on your layer (%s)." + % self.__class__.__name__ + ) from e + return tf.nest.map_structure(lambda t: t.shape, outputs) + raise NotImplementedError( + "Please run in eager mode or implement the `compute_output_shape` " + "method on your layer (%s)." % self.__class__.__name__ + ) + + @doc_controls.for_subclass_implementers + def compute_output_signature(self, input_signature): + """Compute the output tensor signature of the layer based on the inputs. + + Unlike a TensorShape object, a TensorSpec object contains both shape + and dtype information for a tensor. This method allows layers to provide + output dtype information if it is different from the input dtype. 
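A quick sketch of the two paths `compute_output_shape` can take: the automatic eager path (scratch `FuncGraph` plus placeholders, as implemented above) for a layer that does not override it, and an explicit override for a layer whose output shape cannot be inferred statically. The `TimesTwo` and `Halve` layers are hypothetical, not part of the diff:

```python
import tensorflow as tf

class TimesTwo(tf.keras.layers.Layer):
    def call(self, inputs):
        return inputs * 2.0

# No override: the base implementation builds the layer in a scratch
# FuncGraph, calls it on placeholders, and reads off the static shape.
print(TimesTwo().compute_output_shape((None, 4)))  # (None, 4)

class Halve(tf.keras.layers.Layer):
    def call(self, inputs):
        return inputs[:, : tf.shape(inputs)[1] // 2]

    def compute_output_shape(self, input_shape):
        # Explicit override: static inference cannot recover this
        # data-dependent slice, so we spell out the shape math.
        batch, features = tf.TensorShape(input_shape).as_list()
        return tf.TensorShape(
            [batch, None if features is None else features // 2]
        )

print(Halve().compute_output_shape((None, 10)))  # (None, 5)
```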
+ For any layer that doesn't implement this function, + the framework will fall back to using `compute_output_shape`, and will + assume that the output dtype matches the input dtype. + + Args: + input_signature: Single TensorSpec or nested structure of TensorSpec + objects, describing a candidate input for the layer. + + Returns: + Single TensorSpec or nested structure of TensorSpec objects, + describing how the layer would transform the provided input. + + Raises: + TypeError: If input_signature contains a non-TensorSpec object. + """ + + def check_type_return_shape(s): + if not isinstance(s, tf.TensorSpec): + raise TypeError( + "Only TensorSpec signature types are supported. " + f"Received: {s}." + ) + return s.shape + + input_shape = tf.nest.map_structure( + check_type_return_shape, input_signature + ) + output_shape = self.compute_output_shape(input_shape) - Alias of `self.weights`. + try: + dtype = self.output.dtype + except AttributeError: + dtype = self._compute_dtype + + if dtype is None: + input_dtypes = [s.dtype for s in tf.nest.flatten(input_signature)] + # Default behavior when self.dtype is None, is to use the first + # input's dtype. + dtype = input_dtypes[0] + return tf.nest.map_structure( + lambda s: tf.TensorSpec(dtype=dtype, shape=s), output_shape + ) + + @generic_utils.default + def compute_mask(self, inputs, mask=None): + """Computes an output mask tensor. + + Args: + inputs: Tensor or list of tensors. + mask: Tensor or list of tensors. + + Returns: + None or a tensor (or list of tensors, + one per output tensor of the layer). + """ + if not self._supports_masking: + if any(m is not None for m in tf.nest.flatten(mask)): + raise TypeError( + "Layer " + self.name + " does not support masking, " + "but was passed an input_mask: " + str(mask) + ) + # masking not explicitly supported: return None as mask. + return None + # if masking is explicitly supported, by default + # carry over the input mask + return mask + + @traceback_utils.filter_traceback + def __call__(self, *args, **kwargs): + """Wraps `call`, applying pre- and post-processing steps. + + Args: + *args: Positional arguments to be passed to `self.call`. + **kwargs: Keyword arguments to be passed to `self.call`. + + Returns: + Output tensor(s). + + Note: + - The following optional keyword arguments are reserved for specific + uses: + * `training`: Boolean scalar tensor or Python boolean indicating + whether the `call` is meant for training or inference. + * `mask`: Boolean input mask. + - If the layer's `call` method takes a `mask` argument (as some Keras + layers do), its default value will be set to the mask generated + for `inputs` by the previous layer (if `inputs` did come from + a layer that generated a corresponding mask, i.e. if it came from + a Keras layer with masking support). + - If the layer is not built, the method will call `build`. + + Raises: + ValueError: if the layer's `call` method returns None (an invalid + value). + RuntimeError: if `super().__init__()` was not called in the + constructor. + """ + if not hasattr(self, "_thread_local"): + raise RuntimeError( + "You must call `super().__init__()` in the layer constructor." + ) + + # `inputs` (the first arg in the method spec) is special-cased in + # layer call due to historical reasons. + # This special casing currently takes the form of: + # - `inputs` must be explicitly passed. A layer cannot have zero + # arguments, and `inputs` cannot have been provided via the default + # value of a kwarg.
# - numpy/scalar values in `inputs` get converted to tensors + # - implicit masks / mask metadata are only collected from `inputs` + # - Layers are built using shape info from `inputs` only + # - input_spec compatibility is only checked against `inputs` + # - mixed precision casting (autocast) is only applied to `inputs`, + # not to any other argument. + inputs, args, kwargs = self._call_spec.split_out_first_arg(args, kwargs) + input_list = tf.nest.flatten(inputs) + + # Functional Model construction mode is invoked when `Layer`s are called + # on symbolic `KerasTensor`s, i.e.: + # >> inputs = tf.keras.Input(10) + # >> outputs = MyLayer()(inputs) # Functional construction mode. + # >> model = tf.keras.Model(inputs, outputs) + if _in_functional_construction_mode( + self, inputs, args, kwargs, input_list + ): + return self._functional_construction_call( + inputs, args, kwargs, input_list + ) + + # Maintains info about the `Layer.call` stack. + call_context = base_layer_utils.call_context() + + # Accept NumPy and scalar inputs by converting to Tensors. + if any( + isinstance(x, (tf.Tensor, np.ndarray, float, int)) + for x in input_list + ): + inputs = tf.nest.map_structure( + _convert_numpy_or_python_types, inputs + ) + input_list = tf.nest.flatten(inputs) + + # Handle `mask` propagation from previous layer to current layer. Masks + # can be propagated explicitly via the `mask` argument, or implicitly + # via setting the `_keras_mask` attribute on the inputs to a Layer. + # Masks passed explicitly take priority. + input_masks, mask_is_implicit = self._get_input_masks( + inputs, input_list, args, kwargs + ) + if self._expects_mask_arg and mask_is_implicit: + kwargs["mask"] = input_masks + + # Training mode for `Layer.call` is set via (in order of priority): + # (1) The `training` argument passed to this `Layer.call`, if it is not + # None + # (2) The training mode of an outer `Layer.call`. + # (3) The default mode set by `tf.keras.backend.set_learning_phase` (if + # set) + # (4) Any non-None default value for `training` specified in the call + # signature + # (5) False (treating the layer as if it's in inference) + args, kwargs, training_mode = self._set_training_mode( + args, kwargs, call_context + ) + + # Losses are cleared for all sublayers on the outermost `Layer.call`. + # Losses are not cleared on inner `Layer.call`s, because sublayers can + # be called multiple times.
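The five-level training-mode resolution listed above can be observed directly; a small sketch (not part of the diff) using the built-in `Dropout` layer:

```python
import tensorflow as tf

drop = tf.keras.layers.Dropout(0.5)
x = tf.ones((4, 10))

# (1) An explicit `training` argument has the highest priority.
y = drop(x, training=True)   # Dropout is active; some entries are zeroed.

# (5) With no explicit argument, no outer call, and no learning phase
# set, the layer falls back to inference mode (training=False).
y = drop(x)
print(bool(tf.reduce_all(y == x)))  # True: dropout was a no-op
```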
+ if not call_context.in_call: + self._clear_losses() + + eager = tf.executing_eagerly() + with call_context.enter( + layer=self, + inputs=inputs, + build_graph=not eager, + training=training_mode, + ): + input_spec.assert_input_compatibility( + self.input_spec, inputs, self.name + ) + + if eager: + call_fn = self.call + name_scope = self._name + else: + name_scope = self._get_unnested_name_scope() + call_fn = self._autographed_call() + + call_fn = traceback_utils.inject_argument_info_in_traceback( + call_fn, + object_name=( + f"layer '{self.name}' (type {self.__class__.__name__})" + ), + ) + with contextlib.ExitStack() as namescope_stack: + if _is_name_scope_on_model_declaration_enabled: + namescope_stack.enter_context( + _name_scope_unnester(self._name_scope_on_declaration) + ) + namescope_stack.enter_context(tf.name_scope(name_scope)) + + if not self.built: + self._maybe_build(inputs) + + if self._autocast: + inputs = self._maybe_cast_inputs(inputs, input_list) + + with autocast_variable.enable_auto_cast_variables( + self._compute_dtype_object + ): + outputs = call_fn(inputs, *args, **kwargs) + + if self._activity_regularizer: + self._handle_activity_regularization(inputs, outputs) + if self._supports_masking: + self._set_mask_metadata( + inputs, outputs, input_masks, not eager + ) + if self._saved_model_inputs_spec is None: + self._set_save_spec(inputs, args, kwargs) + + return outputs + + def _get_unnested_name_scope(self): + if _is_name_scope_on_model_declaration_enabled: + with _name_scope_unnester( + self._name_scope_on_declaration + ) as relative_name_scope_on_declaration: + # To avoid `tf.name_scope` autoincrement, use absolute path. + relative_name_scope = filter( + None, + [ + tf.get_current_name_scope(), + relative_name_scope_on_declaration, + ], + ) + current_name_scope = "/".join(relative_name_scope) + "/" + if current_name_scope == "/": + current_name_scope = self._name_scope_on_declaration + with tf.name_scope(current_name_scope): + name_scope = self._name_scope() # Avoid autoincrementing. + else: + name_scope = self._name_scope() + + return name_scope + + @property + def dtype(self): + """The dtype of the layer weights. + + This is equivalent to `Layer.dtype_policy.variable_dtype`. Unless + mixed precision is used, this is the same as `Layer.compute_dtype`, the + dtype of the layer's computations. + """ + return self._dtype_policy.variable_dtype + + @property + def name(self): + """Name of the layer (string), set in the constructor.""" + return self._name + + @property + def supports_masking(self): + """Whether this layer supports computing a mask using `compute_mask`.""" + return self._supports_masking + + @supports_masking.setter + def supports_masking(self, value): + self._supports_masking = value + + @property + def dynamic(self): + """Whether the layer is dynamic (eager-only); set in the constructor.""" + return any(layer._dynamic for layer in self._flatten_layers()) + + @property + @doc_controls.do_not_doc_inheritable + def stateful(self): + return any(layer._stateful for layer in self._flatten_layers()) + + @stateful.setter + def stateful(self, value): + self._stateful = value + + @property + def trainable(self): + return self._trainable + + @trainable.setter + def trainable(self, value): + """Sets trainable attribute for the layer and its sublayers. + + When this value is changed during training (e.g. 
with a + `tf.keras.callbacks.Callback`) you need to call the parent + `tf.keras.Model.make_train_function` with `force=True` in order to + recompile the training graph. + + Args: + value: Boolean with the desired state for the layer's trainable + attribute. + """ + for layer in self._flatten_layers(): + layer._trainable = value + + @property + def activity_regularizer(self): + """Optional regularizer function for the output of this layer.""" + return self._activity_regularizer + + @activity_regularizer.setter + def activity_regularizer(self, regularizer): + """Optional regularizer function for the output of this layer.""" + self._activity_regularizer = regularizer + + @property + def input_spec(self): + """`InputSpec` instance(s) describing the input format for this layer. + + When you create a layer subclass, you can set `self.input_spec` to + enable the layer to run input compatibility checks when it is called. + Consider a `Conv2D` layer: it can only be called on a single input + tensor of rank 4. As such, you can set, in `__init__()`: + + ```python + self.input_spec = tf.keras.layers.InputSpec(ndim=4) + ``` + + Now, if you try to call the layer on an input that isn't rank 4 + (for instance, an input of shape `(2,)`), it will raise a + nicely-formatted error: + + ``` + ValueError: Input 0 of layer conv2d is incompatible with the layer: + expected ndim=4, found ndim=1. Full shape received: [2] + ``` + + Input checks that can be specified via `input_spec` include: + - Structure (e.g. a single input, a list of 2 inputs, etc.) + - Shape + - Rank (ndim) + - Dtype + + For more information, see `tf.keras.layers.InputSpec`. + + Returns: + A `tf.keras.layers.InputSpec` instance, or nested structure thereof. + """ + return self._input_spec + + @input_spec.setter + # Must be decorated to prevent tracking, since the input_spec can be nested + # InputSpec objects. + @tf.__internal__.tracking.no_automatic_dependency_tracking + def input_spec(self, value): + for v in tf.nest.flatten(value): + if v is not None and not isinstance(v, input_spec.InputSpec): + raise TypeError( + "Layer input_spec must be an instance of InputSpec. " + "Got: {}".format(v) + ) + self._input_spec = value + + @property + def trainable_weights(self): + """List of all trainable weights tracked by this layer. + + Trainable weights are updated via gradient descent during training. + + Returns: + A list of trainable variables. + """ + self._update_trackables() + if self.trainable: + children_weights = self._gather_children_attribute( + "trainable_variables" + ) + return self._dedup_weights( + self._trainable_weights + children_weights + ) + else: + return [] + + @property + def non_trainable_weights(self): + """List of all non-trainable weights tracked by this layer. + + Non-trainable weights are *not* updated during training. They are + expected to be updated manually in `call()`. + + Returns: + A list of non-trainable variables. + """ + self._update_trackables() + if self.trainable: + children_weights = self._gather_children_attribute( + "non_trainable_variables" + ) + non_trainable_weights = ( + self._non_trainable_weights + children_weights + ) + else: + children_weights = self._gather_children_attribute("variables") + non_trainable_weights = ( + self._trainable_weights + + self._non_trainable_weights + + children_weights + ) + return self._dedup_weights(non_trainable_weights) + + @property + def weights(self): + """Returns the list of all layer variables/weights. + + Returns: + A list of variables.
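Note in the `non_trainable_weights` logic above that freezing a layer moves *all* of its weights into the non-trainable bucket; a minimal sketch (not part of the diff):

```python
import tensorflow as tf

layer = tf.keras.layers.Dense(2)
layer.build((None, 4))  # Creates kernel and bias.
print(len(layer.trainable_weights), len(layer.non_trainable_weights))  # 2 0

layer.trainable = False  # Freeze: applies to the layer and its sublayers.
print(len(layer.trainable_weights), len(layer.non_trainable_weights))  # 0 2
```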
+ """ + return self.trainable_weights + self.non_trainable_weights + + @property + @doc_controls.do_not_generate_docs + def updates(self): + warnings.warn( + "`layer.updates` will be removed in a future version. " + "This property should not be used in TensorFlow 2.0, " + "as `updates` are applied automatically.", + stacklevel=2, + ) + return [] + + @property + def losses(self): + """List of losses added using the `add_loss()` API. + + Variable regularization tensors are created when this property is + accessed, so it is eager safe: accessing `losses` under a + `tf.GradientTape` will propagate gradients back to the corresponding + variables. + + Examples: + + >>> class MyLayer(tf.keras.layers.Layer): + ... def call(self, inputs): + ... self.add_loss(tf.abs(tf.reduce_mean(inputs))) + ... return inputs + >>> l = MyLayer() + >>> l(np.ones((10, 1))) + >>> l.losses + [1.0] + + >>> inputs = tf.keras.Input(shape=(10,)) + >>> x = tf.keras.layers.Dense(10)(inputs) + >>> outputs = tf.keras.layers.Dense(1)(x) + >>> model = tf.keras.Model(inputs, outputs) + >>> # Activity regularization. + >>> len(model.losses) + 0 + >>> model.add_loss(tf.abs(tf.reduce_mean(x))) + >>> len(model.losses) + 1 + + >>> inputs = tf.keras.Input(shape=(10,)) + >>> d = tf.keras.layers.Dense(10, kernel_initializer='ones') + >>> x = d(inputs) + >>> outputs = tf.keras.layers.Dense(1)(x) + >>> model = tf.keras.Model(inputs, outputs) + >>> # Weight regularization. + >>> model.add_loss(lambda: tf.reduce_mean(d.kernel)) + >>> model.losses + [] + + Returns: + A list of tensors. + """ + collected_losses = [] + for layer in self._flatten_layers(): + # If any eager losses are present, we assume the model to be part of + # an eager training loop (either a custom one or the one used when + # `run_eagerly=True`) and so we always return just the eager losses. + if layer._eager_losses: + # Filter placeholder losses that may have been added by revived + # layers. (see base_layer_utils for details). + if ( + layer._eager_losses[0] + is not base_layer_utils.REVIVED_LOSS_PLACEHOLDER + ): + collected_losses.extend(layer._eager_losses) + else: + collected_losses.extend(layer._losses) + for regularizer in layer._callable_losses: + loss_tensor = regularizer() + if loss_tensor is not None: + collected_losses.append(loss_tensor) + return collected_losses + + def add_loss(self, losses, **kwargs): + """Add loss tensor(s), potentially dependent on layer inputs. + + Some losses (for instance, activity regularization losses) may be + dependent on the inputs passed when calling a layer. Hence, when reusing + the same layer on different inputs `a` and `b`, some entries in + `layer.losses` may be dependent on `a` and some on `b`. This method + automatically keeps track of dependencies. + + This method can be used inside a subclassed layer or model's `call` + function, in which case `losses` should be a Tensor or list of Tensors. + + Example: + + ```python + class MyLayer(tf.keras.layers.Layer): + def call(self, inputs): + self.add_loss(tf.abs(tf.reduce_mean(inputs))) + return inputs + ``` + + The same code works in distributed training: the input to `add_loss()` + is treated like a regularization loss and averaged across replicas + by the training loop (both built-in `Model.fit()` and compliant custom + training loops). + + The `add_loss` method can also be called directly on a Functional Model + during construction. In this case, any loss Tensors passed to this Model + must be symbolic and be able to be traced back to the model's `Input`s. 
+ These losses become part of the model's topology and are tracked in + `get_config`. + + Example: + + ```python + inputs = tf.keras.Input(shape=(10,)) + x = tf.keras.layers.Dense(10)(inputs) + outputs = tf.keras.layers.Dense(1)(x) + model = tf.keras.Model(inputs, outputs) + # Activity regularization. + model.add_loss(tf.abs(tf.reduce_mean(x))) + ``` + + If this is not the case for your loss (if, for example, your loss + references a `Variable` of one of the model's layers), you can wrap your + loss in a zero-argument lambda. These losses are not tracked as part of + the model's topology since they can't be serialized. + + Example: + + ```python + inputs = tf.keras.Input(shape=(10,)) + d = tf.keras.layers.Dense(10) + x = d(inputs) + outputs = tf.keras.layers.Dense(1)(x) + model = tf.keras.Model(inputs, outputs) + # Weight regularization. + model.add_loss(lambda: tf.reduce_mean(d.kernel)) + ``` + + Args: + losses: Loss tensor, or list/tuple of tensors. Rather than tensors, + losses may also be zero-argument callables which create a loss + tensor. + **kwargs: Used for backwards compatibility only. + """ + kwargs.pop("inputs", None) + if kwargs: + raise TypeError(f"Unknown keyword arguments: {kwargs.keys()}") + + def _tag_callable(loss): + """Tags callable loss tensor as `_unconditional_loss`.""" + if callable(loss): + # We run the loss without autocasting, as regularizers are often + # numerically unstable in float16. + with autocast_variable.enable_auto_cast_variables(None): + loss = loss() + if loss is None: + # Will be filtered out when computing the .losses property + return None + if not tf.is_tensor(loss): + loss = tf.convert_to_tensor(loss, dtype=backend.floatx()) + loss._unconditional_loss = True + return loss + + losses = tf.nest.flatten(losses) + + callable_losses = [] + eager_losses = [] + symbolic_losses = [] + for loss in losses: + if callable(loss): + callable_losses.append(functools.partial(_tag_callable, loss)) + continue + if loss is None: + continue + if not tf.is_tensor(loss) and not isinstance( + loss, keras_tensor.KerasTensor + ): + loss = tf.convert_to_tensor(loss, dtype=backend.floatx()) + # TF Functions should take the eager path. + if ( + tf_utils.is_symbolic_tensor(loss) + or isinstance(loss, keras_tensor.KerasTensor) + ) and not base_layer_utils.is_in_tf_function(): + symbolic_losses.append(loss) + elif tf.is_tensor(loss): + eager_losses.append(loss) + + self._callable_losses.extend(callable_losses) + + in_call_context = base_layer_utils.call_context().in_call + if eager_losses and not in_call_context: + raise ValueError( + "Expected a symbolic Tensor or a callable for the loss value. " + "Please wrap your loss computation in a zero-argument `lambda`." + ) + + self._eager_losses.extend(eager_losses) + + for symbolic_loss in symbolic_losses: + if getattr(self, "_is_graph_network", False): + self._graph_network_add_loss(symbolic_loss) + else: + # Possibly a loss was added in a Layer's `build`. + self._losses.append(symbolic_loss) + + @property + def metrics(self): + """List of metrics attached to the layer. + + Returns: + A list of `Metric` objects. + """ + collected_metrics = [] + for layer in self._flatten_layers(): + if not hasattr(layer, "_metrics_lock"): + continue + with layer._metrics_lock: + collected_metrics.extend(layer._metrics) + return collected_metrics + + @doc_controls.do_not_generate_docs + def add_metric(self, value, name=None, **kwargs): + """Adds a metric tensor to the layer.
+ + This method can be used inside the `call()` method of a subclassed layer + or model. + + ```python + class MyMetricLayer(tf.keras.layers.Layer): + def __init__(self): + super(MyMetricLayer, self).__init__(name='my_metric_layer') + self.mean = tf.keras.metrics.Mean(name='metric_1') + + def call(self, inputs): + self.add_metric(self.mean(inputs)) + self.add_metric(tf.reduce_sum(inputs), name='metric_2') + return inputs + ``` + + This method can also be called directly on a Functional Model during + construction. In this case, any tensor passed to this Model must + be symbolic and be able to be traced back to the model's `Input`s. These + metrics become part of the model's topology and are tracked when you + save the model via `save()`. + + ```python + inputs = tf.keras.Input(shape=(10,)) + x = tf.keras.layers.Dense(10)(inputs) + outputs = tf.keras.layers.Dense(1)(x) + model = tf.keras.Model(inputs, outputs) + model.add_metric(math_ops.reduce_sum(x), name='metric_1') + ``` + + Note: Calling `add_metric()` with the result of a metric object on a + Functional Model, as shown in the example below, is not supported. This + is because we cannot trace the metric result tensor back to the model's + inputs. + + ```python + inputs = tf.keras.Input(shape=(10,)) + x = tf.keras.layers.Dense(10)(inputs) + outputs = tf.keras.layers.Dense(1)(x) + model = tf.keras.Model(inputs, outputs) + model.add_metric(tf.keras.metrics.Mean()(x), name='metric_1') + ``` + + Args: + value: Metric tensor. + name: String metric name. + **kwargs: Additional keyword arguments for backward compatibility. + Accepted values: + `aggregation` - When the `value` tensor provided is not the result + of calling a `keras.Metric` instance, it will be aggregated by + default using a `keras.metrics.Mean`. + """ + kwargs_keys = list(kwargs.keys()) + if len(kwargs_keys) > 1 or ( + len(kwargs_keys) == 1 and kwargs_keys[0] != "aggregation" + ): + raise TypeError( + f"Unknown keyword arguments: {kwargs.keys()}. " + "Expected `aggregation`." + ) + + from_metric_obj = hasattr(value, "_metric_obj") + is_symbolic = isinstance(value, keras_tensor.KerasTensor) + in_call_context = base_layer_utils.call_context().in_call + + if name is None and not from_metric_obj: + # E.g. `self.add_metric(math_ops.reduce_sum(x))`. In eager mode, we + # use the metric name to look up a metric. Without a name, a new + # Mean metric wrapper will be created on every model/layer call. So, + # we raise an error when no name is provided. We will do the same + # for symbolic mode for consistency although a name will be + # generated if no name is provided. + + # We will not raise this error in the following use case for the + # sake of consistency, as the name is provided in the metric + # constructor. + # mean = metrics.Mean(name='my_metric') + # model.add_metric(mean(outputs)) + raise ValueError( + "Please provide a name for your metric like " + "`self.add_metric(tf.reduce_sum(inputs), " + "name='mean_activation')`" + ) + elif from_metric_obj: + name = value._metric_obj.name + + if not in_call_context and not is_symbolic: + raise ValueError( + "Expected a symbolic Tensor for the metric value, received: " + + str(value) + ) + + # If a metric was added in a Layer's `call` or `build`. + if in_call_context or not getattr(self, "_is_graph_network", False): + # TF Function path should take the eager path. + + # If the given metric is available in the `metrics` list, we just + # update state on it, otherwise we create a new metric instance and + # add it to the `metrics` list.
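The lookup-by-name behavior described in that comment is why the name requirement exists: on each call, the named `Mean` wrapper is found and updated rather than recreated. A brief sketch (not part of the diff; `ActivationStats` is illustrative):

```python
import tensorflow as tf

class ActivationStats(tf.keras.layers.Layer):
    def call(self, inputs):
        # An unnamed, non-Metric tensor would raise ValueError here; the
        # name lets the implicit Mean wrapper be looked up on later calls.
        self.add_metric(tf.reduce_mean(inputs), name="mean_activation")
        return inputs

layer = ActivationStats()
_ = layer(tf.ones((2, 3)))
print([m.name for m in layer.metrics])    # ['mean_activation']
print(layer.metrics[0].result().numpy())  # 1.0
```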
+ metric_obj = getattr(value, "_metric_obj", None) + # Tensors that come from a Metric object already updated the Metric + # state. + should_update_state = not metric_obj + name = metric_obj.name if metric_obj else name + + with self._metrics_lock: + match = self._get_existing_metric(name) + if match: + metric_obj = match + elif metric_obj: + self._metrics.append(metric_obj) + else: + # Build the metric object with the value's dtype if it + # defines one + metric_obj = metrics_mod.Mean( + name=name, dtype=getattr(value, "dtype", None) + ) + self._metrics.append(metric_obj) + + if should_update_state: + metric_obj(value) + else: + if from_metric_obj: + raise ValueError( + "Using the result of calling a `Metric` object " + "when calling `add_metric` on a Functional " + "Model is not supported. Please pass the " + "Tensor to monitor directly." + ) + + # Insert layers into the Keras Graph Network. + aggregation = None if from_metric_obj else "mean" + self._graph_network_add_metric(value, aggregation, name) + + @doc_controls.do_not_doc_inheritable + def add_update(self, updates): + """Add update op(s), potentially dependent on layer inputs. + + Weight updates (for instance, the updates of the moving mean and + variance in a BatchNormalization layer) may be dependent on the inputs + passed when calling a layer. Hence, when reusing the same layer on + different inputs `a` and `b`, some entries in `layer.updates` may be + dependent on `a` and some on `b`. This method automatically keeps track + of dependencies. + + This call is ignored when eager execution is enabled (in that case, + variable updates are run on the fly and thus do not need to be tracked + for later execution). + + Args: + updates: Update op, or list/tuple of update ops, or zero-arg callable + that returns an update op. A zero-arg callable should be passed in + order to disable running the updates by setting `trainable=False` + on this Layer, when executing in Eager mode. + """ + call_context = base_layer_utils.call_context() + # No need to run updates during Functional API construction. + if call_context.in_keras_graph: + return + + # Callable updates are disabled by setting `trainable=False`. + if not call_context.frozen: + for update in tf.nest.flatten(updates): + if callable(update): + update() + + def set_weights(self, weights): + """Sets the weights of the layer, from NumPy arrays. + + The weights of a layer represent the state of the layer. This function + sets the weight values from numpy arrays. The weight values should be + passed in the order they are created by the layer. Note that the layer's + weights must be instantiated before calling this function, by calling + the layer. + + For example, a `Dense` layer returns a list of two values: the kernel + matrix and the bias vector. These can be used to set the weights of + another `Dense` layer: + + >>> layer_a = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(1.)) + >>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> layer_a.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + >>> layer_b = tf.keras.layers.Dense(1, + ... 
kernel_initializer=tf.constant_initializer(2.)) + >>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> layer_b.get_weights() + [array([[2.], + [2.], + [2.]], dtype=float32), array([0.], dtype=float32)] + >>> layer_b.set_weights(layer_a.get_weights()) + >>> layer_b.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + + Args: + weights: a list of NumPy arrays. The number + of arrays and their shape must match + number of the dimensions of the weights + of the layer (i.e. it should match the + output of `get_weights`). + + Raises: + ValueError: If the provided weights list does not match the + layer's specifications. + """ + params = self.weights + + expected_num_weights = 0 + for param in params: + if isinstance(param, base_layer_utils.TrackableWeightHandler): + expected_num_weights += param.num_tensors + else: + expected_num_weights += 1 + + if expected_num_weights != len(weights): + raise ValueError( + 'You called `set_weights(weights)` on layer "%s" ' + "with a weight list of length %s, but the layer was " + "expecting %s weights. Provided weights: %s..." + % ( + self.name, + len(weights), + expected_num_weights, + str(weights)[:50], + ) + ) + + weight_index = 0 + weight_value_tuples = [] + for param in params: + if isinstance(param, base_layer_utils.TrackableWeightHandler): + num_tensors = param.num_tensors + tensors = weights[weight_index : weight_index + num_tensors] + param.set_weights(tensors) + weight_index += num_tensors + else: + weight = weights[weight_index] + weight_shape = weight.shape if hasattr(weight, "shape") else () + ref_shape = param.shape + if not ref_shape.is_compatible_with(weight_shape): + raise ValueError( + f"Layer {self.name} weight shape {ref_shape} " + "is not compatible with provided weight " + f"shape {weight_shape}." + ) + weight_value_tuples.append((param, weight)) + weight_index += 1 + + backend.batch_set_value(weight_value_tuples) + + # Perform any layer defined finalization of the layer state. + for layer in self._flatten_layers(): + layer.finalize_state() + + def get_weights(self): + """Returns the current weights of the layer, as NumPy arrays. + + The weights of a layer represent the state of the layer. This function + returns both trainable and non-trainable weight values associated with + this layer as a list of NumPy arrays, which can in turn be used to load + state into similarly parameterized layers. + + For example, a `Dense` layer returns a list of two values: the kernel + matrix and the bias vector. These can be used to set the weights of + another `Dense` layer: + + >>> layer_a = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(1.)) + >>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> layer_a.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + >>> layer_b = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(2.)) + >>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> layer_b.get_weights() + [array([[2.], + [2.], + [2.]], dtype=float32), array([0.], dtype=float32)] + >>> layer_b.set_weights(layer_a.get_weights()) + >>> layer_b.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + + Returns: + Weights values as a list of NumPy arrays. 
+ """ + weights = self.weights + output_weights = [] + for weight in weights: + if isinstance(weight, base_layer_utils.TrackableWeightHandler): + output_weights.extend(weight.get_tensors()) + else: + output_weights.append(weight) + return backend.batch_get_value(output_weights) + + @doc_controls.do_not_generate_docs + def finalize_state(self): + """Finalizes the layers state after updating layer weights. + + This function can be subclassed in a layer and will be called after + updating a layer weights. It can be overridden to finalize any + additional layer state after a weight update. + + This function will be called after weights of a layer have been restored + from a loaded model. + """ + pass - Note: This will not track the weights of nested `tf.Modules` that are not - themselves Keras layers. + @doc_controls.do_not_doc_inheritable + def get_input_mask_at(self, node_index): + """Retrieves the input mask tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. + + Returns: + A mask tensor + (or list of tensors if the layer has multiple inputs). + """ + inputs = self.get_input_at(node_index) + if isinstance(inputs, list): + return [getattr(x, "_keras_mask", None) for x in inputs] + else: + return getattr(inputs, "_keras_mask", None) + + @doc_controls.do_not_doc_inheritable + def get_output_mask_at(self, node_index): + """Retrieves the output mask tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. + + Returns: + A mask tensor + (or list of tensors if the layer has multiple outputs). + """ + output = self.get_output_at(node_index) + if isinstance(output, list): + return [getattr(x, "_keras_mask", None) for x in output] + else: + return getattr(output, "_keras_mask", None) + + @property + @doc_controls.do_not_doc_inheritable + def input_mask(self): + """Retrieves the input mask tensor(s) of a layer. + + Only applicable if the layer has exactly one inbound node, + i.e. if it is connected to one incoming layer. + + Returns: + Input mask tensor (potentially None) or list of input + mask tensors. + + Raises: + AttributeError: if the layer is connected to + more than one incoming layers. + """ + inputs = self.input + if isinstance(inputs, list): + return [getattr(x, "_keras_mask", None) for x in inputs] + else: + return getattr(inputs, "_keras_mask", None) + + @property + @doc_controls.do_not_doc_inheritable + def output_mask(self): + """Retrieves the output mask tensor(s) of a layer. + + Only applicable if the layer has exactly one inbound node, + i.e. if it is connected to one incoming layer. + + Returns: + Output mask tensor (potentially None) or list of output + mask tensors. + + Raises: + AttributeError: if the layer is connected to + more than one incoming layers. + """ + output = self.output + if isinstance(output, list): + return [getattr(x, "_keras_mask", None) for x in output] + else: + return getattr(output, "_keras_mask", None) + + @doc_controls.do_not_doc_inheritable + def get_input_shape_at(self, node_index): + """Retrieves the input shape(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. 
+ + Returns: + A shape tuple + (or list of shape tuples if the layer has multiple inputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "input_shapes", "input shape" + ) + + @doc_controls.do_not_doc_inheritable + def get_output_shape_at(self, node_index): + """Retrieves the output shape(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. + + Returns: + A shape tuple + (or list of shape tuples if the layer has multiple outputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "output_shapes", "output shape" + ) + + @doc_controls.do_not_doc_inheritable + def get_input_at(self, node_index): + """Retrieves the input tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first input node of the layer. + + Returns: + A tensor (or list of tensors if the layer has multiple inputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "input_tensors", "input" + ) + + @doc_controls.do_not_doc_inheritable + def get_output_at(self, node_index): + """Retrieves the output tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first output node of the layer. + + Returns: + A tensor (or list of tensors if the layer has multiple outputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "output_tensors", "output" + ) + + @property + def input(self): + """Retrieves the input tensor(s) of a layer. + + Only applicable if the layer has exactly one input, + i.e. if it is connected to one incoming layer. + + Returns: + Input tensor or list of input tensors. + + Raises: + RuntimeError: If called in Eager mode. + AttributeError: If no inbound nodes are found. + """ + if not self._inbound_nodes: + raise AttributeError( + "Layer " + self.name + " is not connected, no input to return." + ) + return self._get_node_attribute_at_index(0, "input_tensors", "input") + + @property + def output(self): + """Retrieves the output tensor(s) of a layer. + + Only applicable if the layer has exactly one output, + i.e. if it is connected to one incoming layer. + + Returns: + Output tensor or list of output tensors. + + Raises: + AttributeError: if the layer is connected to more than one incoming + layers. + RuntimeError: if called in Eager mode. + """ + if not self._inbound_nodes: + raise AttributeError( + "Layer " + self.name + " has no inbound nodes." + ) + return self._get_node_attribute_at_index(0, "output_tensors", "output") + + @property + @doc_controls.do_not_doc_inheritable + def input_shape(self): + """Retrieves the input shape(s) of a layer. + + Only applicable if the layer has exactly one input, + i.e. if it is connected to one incoming layer, or if all inputs + have the same shape. + + Returns: + Input shape, as an integer shape tuple + (or list of shape tuples, one tuple per input tensor). + + Raises: + AttributeError: if the layer has no defined input_shape. + RuntimeError: if called in Eager mode. 
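The single-node properties (`input`, `output`, `input_shape`) versus the node-indexed accessors (`get_input_at`, `get_input_shape_at`, ...) documented above are easiest to contrast on a small functional model; a sketch, not part of the diff:

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(10,))
x = tf.keras.layers.Dense(5, name="hidden")(inputs)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)

hidden = model.get_layer("hidden")
print(hidden.input.shape)   # (None, 10)
print(hidden.output.shape)  # (None, 5)
print(hidden.input_shape)   # (None, 10): one inbound node, so well-defined

# A layer reused on inputs of different shapes has multiple inbound
# nodes; `input_shape` raises there, so use the node-indexed accessors.
shared = tf.keras.layers.Activation("relu")
shared(tf.keras.Input(shape=(4,)))
shared(tf.keras.Input(shape=(6,)))
print(shared.get_input_shape_at(0))  # (None, 4)
print(shared.get_input_shape_at(1))  # (None, 6)
```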
+ """ + if not self._inbound_nodes: + raise AttributeError( + f'The layer "{self.name}" has never been called ' + "and thus has no defined input shape. Note that the " + "`input_shape` property is only available for " + "Functional and Sequential models." + ) + all_input_shapes = set( + [str(node.input_shapes) for node in self._inbound_nodes] + ) + if len(all_input_shapes) == 1: + return self._inbound_nodes[0].input_shapes + else: + raise AttributeError( + 'The layer "' + + str(self.name) + + '" has multiple inbound nodes, ' + "with different input shapes. Hence " + 'the notion of "input shape" is ' + "ill-defined for the layer. " + "Use `get_input_shape_at(node_index)` " + "instead." + ) + + def count_params(self): + """Count the total number of scalars composing the weights. + + Returns: + An integer count. + + Raises: + ValueError: if the layer isn't yet built + (in which case its weights aren't yet defined). + """ + if not self.built: + if getattr(self, "_is_graph_network", False): + with tf_utils.maybe_init_scope(self): + self._maybe_build(self.inputs) + else: + raise ValueError( + "You tried to call `count_params` " + f"on layer {self.name}" + ", but the layer isn't built. " + "You can build it manually via: " + f"`{self.name}.build(batch_input_shape)`." + ) + return layer_utils.count_params(self.weights) + + @property + @doc_controls.do_not_doc_inheritable + def output_shape(self): + """Retrieves the output shape(s) of a layer. + + Only applicable if the layer has one output, + or if all outputs have the same shape. + + Returns: + Output shape, as an integer shape tuple + (or list of shape tuples, one tuple per output tensor). + + Raises: + AttributeError: if the layer has no defined output shape. + RuntimeError: if called in Eager mode. + """ + if not self._inbound_nodes: + raise AttributeError( + f'The layer "{self.name}" has never been called ' + "and thus has no defined output shape." + ) + all_output_shapes = set( + [str(node.output_shapes) for node in self._inbound_nodes] + ) + if len(all_output_shapes) == 1: + return self._inbound_nodes[0].output_shapes + else: + raise AttributeError( + 'The layer "%s"' + " has multiple inbound nodes, " + "with different output shapes. Hence " + 'the notion of "output shape" is ' + "ill-defined for the layer. " + "Use `get_output_shape_at(node_index)` " + "instead." % self.name + ) + + @property + def dtype_policy(self): + """The dtype policy associated with this layer. + + This is an instance of a `tf.keras.mixed_precision.Policy`. + """ + return self._dtype_policy + + @property + def compute_dtype(self): + """The dtype of the layer's computations. + + This is equivalent to `Layer.dtype_policy.compute_dtype`. Unless + mixed precision is used, this is the same as `Layer.dtype`, the dtype of + the weights. + + Layers automatically cast their inputs to the compute dtype, which + causes computations and the output to be in the compute dtype as well. + This is done by the base Layer class in `Layer.__call__`, so you do not + have to insert these casts if implementing your own layer. + + Layers often perform certain internal computations in higher precision + when `compute_dtype` is float16 or bfloat16 for numeric stability. The + output will still typically be float16 or bfloat16 in such cases. + + Returns: + The layer's compute dtype. 
+ """ + return self._dtype_policy.compute_dtype + + @property + def variable_dtype(self): + """Alias of `Layer.dtype`, the dtype of the weights.""" + return self.dtype + + @property + @doc_controls.do_not_doc_inheritable + def inbound_nodes(self): + """Return Functional API nodes upstream of this layer.""" + return self._inbound_nodes + + @property + @doc_controls.do_not_doc_inheritable + def outbound_nodes(self): + """Return Functional API nodes downstream of this layer.""" + return self._outbound_nodes + + ############################################################################ + # Methods & attributes below are public aliases of other methods. # + ############################################################################ + + @property + @doc_controls.do_not_generate_docs + def variables(self): + """Returns the list of all layer variables/weights. + + Alias of `self.weights`. + + Note: This will not track the weights of nested `tf.Modules` that are + not themselves Keras layers. + + Returns: + A list of variables. + """ + return self.weights + + @property + @doc_controls.do_not_generate_docs + def trainable_variables(self): + return self.trainable_weights + + @property + @doc_controls.do_not_generate_docs + def non_trainable_variables(self): + return self.non_trainable_weights + + @doc_controls.do_not_doc_inheritable + def add_variable(self, *args, **kwargs): + """Deprecated, do NOT use! Alias for `add_weight`.""" + warnings.warn( + "`layer.add_variable` is deprecated and " + "will be removed in a future version. " + "Please use the `layer.add_weight()` method instead.", + stacklevel=2, + ) + return self.add_weight(*args, **kwargs) + + def get_build_config(self): + """Returns a dictionary with the layer's input shape. + + This method returns a config dict that can be used by + `build_from_config(config)` to create all states (e.g. Variables and + Lookup tables) needed by the layer. + + By default, the config only contains the input shape that the layer + was built with. If you're writing a custom layer that creates state in + an unusual way, you should override this method to make sure this state + is already created when Keras attempts to load its value upon model + loading. + + Returns: + A dict containing the input shape associated with the layer. + """ + if self._build_input_shape is not None: + + def convert_tensorshapes(x): + if isinstance(x, tf.TensorShape) and x._dims: + return tuple(x.as_list()) + return x + + return { + "input_shape": tf.nest.map_structure( + convert_tensorshapes, self._build_input_shape + ) + } + + def build_from_config(self, config): + """Builds the layer's states with the supplied config dict. + + By default, this method calls the `build(config["input_shape"])` method, + which creates weights based on the layer's input shape in the supplied + config. If your config contains other information needed to load the + layer's state, you should override this method. + + Args: + config: Dict containing the input shape associated with this layer. + """ + input_shape = config["input_shape"] + if input_shape is not None: + self.build(input_shape) + + ############################################################################ + # Methods & attributes below are all private and only used by the framework. + ############################################################################ + + # See tf.Module for the usage of this property. + # The key for _obj_reference_counts_dict is a Trackable, which could be a + # variable or layer etc. 
tf.Module._flatten will fail to flatten the key + # since it is trying to convert Trackable to a string. This attribute can be + # ignored even after the fix of nest lib, since the trackable object should + # already been available as individual attributes. + # _obj_reference_counts_dict just contains a copy of them. + _TF_MODULE_IGNORED_PROPERTIES = frozenset( + itertools.chain( + ("_obj_reference_counts_dict",), + tf.Module._TF_MODULE_IGNORED_PROPERTIES, + ) + ) + + # When loading from a SavedModel, Layers typically can be revived into a + # generic Layer wrapper. Sometimes, however, layers may implement methods + # that go beyond this wrapper, as in the case of PreprocessingLayers' + # `adapt` method. When this is the case, layer implementers can override + # must_restore_from_config to return True; layers with this property must + # be restored into their actual objects (and will fail if the object is + # not available to the restoration code). + _must_restore_from_config = False + + def _get_cell_name(self): + canonical_name = get_canonical_name_for_symbol( + self.__class__, api_name="keras", add_prefix_to_v1_names=True + ) + if canonical_name is not None: + return f"tf.{canonical_name}" + return self.__class__.__module__ + "." + self.__class__.__name__ + + def _instrument_layer_creation(self): + self._instrumented_keras_api = False + self._instrumented_keras_layer_class = False + self._instrumented_keras_model_class = False + if not getattr(self, "_disable_keras_instrumentation", False): + keras_api_gauge.get_cell("layer").set(True) + self._instrumented_keras_api = True + if getattr(self, "_is_model_for_instrumentation", False): + keras_models_gauge.get_cell(self._get_cell_name()).set(True) + self._instrumented_keras_model_class = True + else: + keras_layers_gauge.get_cell(self._get_cell_name()).set(True) + self._instrumented_keras_layer_class = True + else: + # This is a legacy layer that has disabled instrumentation + # as a native keras object. We still instrument this as + # legacy usage. + keras_api_gauge.get_cell("legacy_layer").set(True) + + @doc_controls.for_subclass_implementers + def _add_trackable(self, trackable_object, trainable): + """Adds a Trackable object to this layer's state. + + Args: + trackable_object: The tf.tracking.Trackable object to add. + trainable: Boolean, whether the variable should be part of the layer's + "trainable_variables" (e.g. variables, biases) or + "non_trainable_variables" (e.g. BatchNorm mean and variance). + + Returns: + The TrackableWeightHandler used to track this object. + """ + if isinstance( + trackable_object, base_layer_utils.TrackableWeightHandler + ): + handler = trackable_object + else: + handler = base_layer_utils.TrackableWeightHandler(trackable_object) + if trainable: + self._trainable_weights.append(handler) + else: + self._non_trainable_weights.append(handler) + return handler + + def _clear_losses(self): + """Used every step in eager to reset losses.""" + # Set to thread local directly to avoid Layer.__setattr__ overhead. + if not getattr( + self, "_self_tracked_trackables", None + ): # Fast path for single Layer. + self._thread_local._eager_losses = [] + else: + for layer in self._flatten_layers(): + layer._thread_local._eager_losses = [] + + def _keras_tensor_symbolic_call(self, inputs, input_masks, args, kwargs): + if self.dynamic: + # We will use static shape inference to return symbolic tensors + # matching the specifications of the layer outputs. 
+ # Since `self.dynamic` is True, we will never attempt to + # run the underlying TF graph (which is disconnected). + # TODO(fchollet): consider py_func as an alternative, which + # would enable us to run the underlying graph if needed. + input_signature = tf.nest.map_structure( + lambda x: tf.TensorSpec(shape=x.shape, dtype=x.dtype), inputs + ) + output_signature = self.compute_output_signature(input_signature) + return tf.nest.map_structure( + keras_tensor.KerasTensor, output_signature + ) + else: + return self._infer_output_signature( + inputs, args, kwargs, input_masks + ) - Returns: - A list of variables. - """ - return self.weights - - @property - @doc_controls.do_not_generate_docs - def trainable_variables(self): - return self.trainable_weights - - @property - @doc_controls.do_not_generate_docs - def non_trainable_variables(self): - return self.non_trainable_weights - - @doc_controls.do_not_doc_inheritable - def add_variable(self, *args, **kwargs): - """Deprecated, do NOT use! Alias for `add_weight`.""" - warnings.warn( - '`layer.add_variable` is deprecated and ' - 'will be removed in a future version. ' - 'Please use the `layer.add_weight()` method instead.', - stacklevel=2) - return self.add_weight(*args, **kwargs) - - ############################################################################## - # Methods & attributes below are all private and only used by the framework. # - ############################################################################## - - # See tf.Module for the usage of this property. - # The key for _obj_reference_counts_dict is a Trackable, which could be a - # variable or layer etc. tf.Module._flatten will fail to flatten the key - # since it is trying to convert Trackable to a string. This attribute can be - # ignored even after the fix of nest lib, since the trackable object should - # already been available as individual attributes. _obj_reference_counts_dict - # just contains a copy of them. - _TF_MODULE_IGNORED_PROPERTIES = frozenset(itertools.chain( - ('_obj_reference_counts_dict',), - tf.Module._TF_MODULE_IGNORED_PROPERTIES - )) - - # When loading from a SavedModel, Layers typically can be revived into a - # generic Layer wrapper. Sometimes, however, layers may implement methods - # that go beyond this wrapper, as in the case of PreprocessingLayers' - # `adapt` method. When this is the case, layer implementers can override - # must_restore_from_config to return True; layers with this property must - # be restored into their actual objects (and will fail if the object is - # not available to the restoration code). - _must_restore_from_config = False - - def _get_cell_name(self): - canonical_name = get_canonical_name_for_symbol( - self.__class__, api_name='keras', add_prefix_to_v1_names=True) - if canonical_name is not None: - return 'tf.{}'.format(canonical_name) - return self.__class__.__module__ + '.' 
+ self.__class__.__name__ - - def _instrument_layer_creation(self): - self._instrumented_keras_api = False - self._instrumented_keras_layer_class = False - self._instrumented_keras_model_class = False - if not getattr(self, '_disable_keras_instrumentation', False): - keras_api_gauge.get_cell('layer').set(True) - self._instrumented_keras_api = True - if getattr(self, '_is_model_for_instrumentation', False): - keras_models_gauge.get_cell(self._get_cell_name()).set(True) - self._instrumented_keras_model_class = True - else: - keras_layers_gauge.get_cell(self._get_cell_name()).set(True) - self._instrumented_keras_layer_class = True - else: - # This is a legacy layer that has disabled instrumentation - # as a native keras object. We still instrument this as - # legacy usage. - keras_api_gauge.get_cell('legacy_layer').set(True) - - @doc_controls.for_subclass_implementers - def _add_trackable(self, trackable_object, trainable): - """Adds a Trackable object to this layer's state. + def _infer_output_signature(self, inputs, args, kwargs, input_masks): + """Call the layer on input KerasTensors, returns output KerasTensors.""" - Args: - trackable_object: The tf.tracking.Trackable object to add. - trainable: Boolean, whether the variable should be part of the layer's - "trainable_variables" (e.g. variables, biases) or - "non_trainable_variables" (e.g. BatchNorm mean and variance). + keras_tensor_inputs = inputs + call_fn = self.call + # Wrapping `call` function in autograph to allow for dynamic control + # flow and control dependencies in call. We are limiting this to + # subclassed layers as autograph is strictly needed only for + # subclassed layers and models. + # tf_convert will respect the value of autograph setting in the + # enclosing tf.function, if any. + if base_layer_utils.is_subclassed( + self + ) and not base_layer_utils.from_saved_model(self): + call_fn = tf.__internal__.autograph.tf_convert( + self.call, tf.__internal__.autograph.control_status_ctx() + ) + + call_fn = traceback_utils.inject_argument_info_in_traceback( + call_fn, + object_name=f'layer "{self.name}" (type {self.__class__.__name__})', + ) + + # We enter a scratch graph and build placeholder inputs inside of it + # that match the input args. + # We then call the layer inside of the scratch graph to identify the + # output signatures, then we build KerasTensors corresponding to those + # outputs. + scratch_graph = tf.__internal__.FuncGraph( + str(self.name) + "_scratch_graph" + ) + with scratch_graph.as_default(): + inputs = tf.nest.map_structure( + keras_tensor.keras_tensor_to_placeholder, inputs + ) + args = tf.nest.map_structure( + keras_tensor.keras_tensor_to_placeholder, args + ) + kwargs = tf.nest.map_structure( + keras_tensor.keras_tensor_to_placeholder, kwargs + ) + input_masks = tf.nest.map_structure( + keras_tensor.keras_tensor_to_placeholder, input_masks + ) + + with backend.name_scope(self._name_scope()): + with autocast_variable.enable_auto_cast_variables( + self._compute_dtype_object + ): + # Build layer if applicable (if the `build` method has been + # overridden). + # TODO(kaftan): do we maybe_build here, or have we already + # done it? 
+ self._maybe_build(inputs) + inputs = self._maybe_cast_inputs(inputs) + outputs = call_fn(inputs, *args, **kwargs) + + self._handle_activity_regularization(inputs, outputs) + self._set_mask_metadata( + inputs, outputs, input_masks, build_graph=False + ) + outputs = tf.nest.map_structure( + keras_tensor.keras_tensor_from_tensor, outputs + ) + + self._set_save_spec(keras_tensor_inputs, args, kwargs) + if hasattr(self, "_set_inputs") and not self.inputs: + # TODO(kaftan): figure out if we need to do this at all + # Subclassed network: explicitly set metadata normally set by + # a call to self._set_inputs(). + self._set_inputs(inputs, outputs) + del scratch_graph + return outputs - Returns: - The TrackableWeightHandler used to track this object. - """ - if isinstance(trackable_object, base_layer_utils.TrackableWeightHandler): - handler = trackable_object - else: - handler = base_layer_utils.TrackableWeightHandler(trackable_object) - if trainable: - self._trainable_weights.append(handler) - else: - self._non_trainable_weights.append(handler) - return handler - - def _clear_losses(self): - """Used every step in eager to reset losses.""" - # Set to thread local directly to avoid Layer.__setattr__ overhead. - if not getattr(self, '_self_tracked_trackables', - None): # Fast path for single Layer. - self._thread_local._eager_losses = [] - else: - for layer in self._flatten_layers(): - layer._thread_local._eager_losses = [] - - def _keras_tensor_symbolic_call(self, inputs, input_masks, args, kwargs): - if self.dynamic: - # We will use static shape inference to return symbolic tensors - # matching the specifications of the layer outputs. - # Since `self.dynamic` is True, we will never attempt to - # run the underlying TF graph (which is disconnected). - # TODO(fchollet): consider py_func as an alternative, which - # would enable us to run the underlying graph if needed. - input_signature = tf.nest.map_structure( - lambda x: tf.TensorSpec(shape=x.shape, dtype=x.dtype), - inputs) - output_signature = self.compute_output_signature(input_signature) - return tf.nest.map_structure(keras_tensor.KerasTensor, output_signature) - else: - return self._infer_output_signature(inputs, args, kwargs, input_masks) - - def _infer_output_signature(self, inputs, args, kwargs, input_masks): - """Call the layer on input KerasTensors and returns output KerasTensors.""" - - keras_tensor_inputs = inputs - call_fn = self.call - # Wrapping `call` function in autograph to allow for dynamic control - # flow and control dependencies in call. We are limiting this to - # subclassed layers as autograph is strictly needed only for - # subclassed layers and models. - # tf_convert will respect the value of autograph setting in the - # enclosing tf.function, if any. - if (base_layer_utils.is_subclassed(self) and - not base_layer_utils.from_saved_model(self)): - call_fn = tf.__internal__.autograph.tf_convert( - self.call, tf.__internal__.autograph.control_status_ctx()) - - call_fn = traceback_utils.inject_argument_info_in_traceback( - call_fn, - object_name=f'layer "{self.name}" (type {self.__class__.__name__})') - - # We enter a scratch graph and build placeholder inputs inside of it that - # match the input args. - # We then call the layer inside of the scratch graph to identify the - # output signatures, then we build KerasTensors corresponding to those - # outputs. 
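A usage-level view of the tracing implemented in `_infer_output_signature` above, with public `tf.keras` calls only:

```python
import tensorflow as tf

# Calling a layer on a symbolic KerasTensor routes through
# `_infer_output_signature`: the layer is built, `call` is traced once
# on placeholders in a scratch FuncGraph, and new KerasTensors are
# returned to continue Functional API construction.
x = tf.keras.Input(shape=(16,))      # a KerasTensor, not a tf.Tensor
y = tf.keras.layers.Dense(4)(x)      # traced inside a scratch graph
model = tf.keras.Model(x, y)
print(type(y).__name__)              # KerasTensor
```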
- scratch_graph = tf.__internal__.FuncGraph(str(self.name) + '_scratch_graph') - with scratch_graph.as_default(): - inputs = tf.nest.map_structure( - keras_tensor.keras_tensor_to_placeholder, inputs) - args = tf.nest.map_structure( - keras_tensor.keras_tensor_to_placeholder, args) - kwargs = tf.nest.map_structure( - keras_tensor.keras_tensor_to_placeholder, kwargs) - input_masks = tf.nest.map_structure( - keras_tensor.keras_tensor_to_placeholder, input_masks) - - with backend.name_scope(self._name_scope()): # pylint: disable=not-callable - with autocast_variable.enable_auto_cast_variables( - self._compute_dtype_object): - # Build layer if applicable (if the `build` method has been - # overridden). - # TODO(kaftan): do we maybe_build here, or have we already done it? - self._maybe_build(inputs) - inputs = self._maybe_cast_inputs(inputs) - outputs = call_fn(inputs, *args, **kwargs) - - self._handle_activity_regularization(inputs, outputs) - self._set_mask_metadata(inputs, outputs, input_masks, - build_graph=False) - outputs = tf.nest.map_structure( - keras_tensor.keras_tensor_from_tensor, outputs) - - self._set_save_spec(keras_tensor_inputs, args, kwargs) - if hasattr(self, '_set_inputs') and not self.inputs: - # TODO(kaftan): figure out if we need to do this at all - # Subclassed network: explicitly set metadata normally set by - # a call to self._set_inputs(). - self._set_inputs(inputs, outputs) - del scratch_graph - return outputs - - def _functional_construction_call(self, inputs, args, kwargs, input_list): - call_context = base_layer_utils.call_context() - - # Accept NumPy and scalar inputs by converting to Tensors. - if any(isinstance(x, ( - tf.Tensor, np.ndarray, float, int)) for x in input_list): - - def _convert_non_tensor(x): - # Don't call `ops.convert_to_tensor` on all `inputs` because - # `SparseTensors` can't be converted to `Tensor`. - if isinstance(x, (tf.Tensor, np.ndarray, float, int)): - return tf.convert_to_tensor(x) - return x - - inputs = tf.nest.map_structure(_convert_non_tensor, inputs) - input_list = tf.nest.flatten(inputs) - - # Handle `mask` propagation from previous layer to current layer. Masks can - # be propagated explicitly via the `mask` argument, or implicitly via - # setting the `_keras_mask` attribute on the inputs to a Layer. Masks passed - # explicitly take priority. - mask_arg_passed_by_framework = False - input_masks, mask_is_implicit = self._get_input_masks( - inputs, input_list, args, kwargs) - if self._expects_mask_arg and mask_is_implicit: - kwargs['mask'] = input_masks - mask_arg_passed_by_framework = True - - # If `training` argument is None or not explicitly passed, - # propagate `training` value from this layer's calling layer. - training_value = None - training_arg_passed_by_framework = False - # Priority 1: `training` was explicitly passed a non-None value. - if self._call_spec.arg_was_passed('training', args, kwargs): - training_value = self._call_spec.get_arg_value('training', args, kwargs) - if not self._expects_training_arg: - kwargs.pop('training') - - if training_value is None: - # Priority 2: `training` was passed to a parent layer. - if call_context.training is not None: - training_value = call_context.training - # Priority 3: `learning_phase()` has been set. - elif backend.global_learning_phase_is_set(): - training_value = backend.learning_phase() - # Force the training_value to be bool type which matches to the contract - # for layer/model call args. 
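The training-mode priorities spelled out above can be observed from user code. A sketch; the `Reporter` layer is hypothetical:

```python
import tensorflow as tf

class Reporter(tf.keras.layers.Layer):
    """Hypothetical layer that prints the `training` value it resolves."""

    def call(self, inputs, training=None):
        # Priority 1: `training=...` passed directly to this call.
        # Priority 2: `training` from an enclosing layer/model call.
        # Priority 3: the global learning phase, if set.
        # Priority 4: the default in this `call` signature.
        print("resolved training:", training)  # also fires once at trace time
        return inputs

inp = tf.keras.Input(shape=(2,))
model = tf.keras.Model(inp, Reporter()(inp))
model(tf.zeros((1, 2)), training=True)  # inner layer sees True (case 2)
model(tf.zeros((1, 2)))                 # resolved from the default (case 4)
```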
- if tf.is_tensor(training_value): - training_value = tf.cast(training_value, tf.bool) + def _functional_construction_call(self, inputs, args, kwargs, input_list): + call_context = base_layer_utils.call_context() + + # Accept NumPy and scalar inputs by converting to Tensors. + if any( + isinstance(x, (tf.Tensor, np.ndarray, float, int)) + for x in input_list + ): + + def _convert_non_tensor(x): + # Don't call `ops.convert_to_tensor` on all `inputs` because + # `SparseTensors` can't be converted to `Tensor`. + if isinstance(x, (tf.Tensor, np.ndarray, float, int)): + return tf.convert_to_tensor(x) + return x + + inputs = tf.nest.map_structure(_convert_non_tensor, inputs) + input_list = tf.nest.flatten(inputs) + + # Handle `mask` propagation from previous layer to current layer. Masks + # can be propagated explicitly via the `mask` argument, or implicitly + # via setting the `_keras_mask` attribute on the inputs to a Layer. + # Masks passed explicitly take priority. + mask_arg_passed_by_framework = False + input_masks, mask_is_implicit = self._get_input_masks( + inputs, input_list, args, kwargs + ) + if self._expects_mask_arg and mask_is_implicit: + kwargs["mask"] = input_masks + mask_arg_passed_by_framework = True + + # If `training` argument is None or not explicitly passed, + # propagate `training` value from this layer's calling layer. + training_value = None + training_arg_passed_by_framework = False + # Priority 1: `training` was explicitly passed a non-None value. + if self._call_spec.arg_was_passed("training", args, kwargs): + training_value = self._call_spec.get_arg_value( + "training", args, kwargs + ) + if not self._expects_training_arg: + kwargs.pop("training") + + if training_value is None: + # Priority 2: `training` was passed to a parent layer. + if call_context.training is not None: + training_value = call_context.training + # Priority 3: `learning_phase()` has been set. + elif backend.global_learning_phase_is_set(): + training_value = backend.learning_phase() + # Force the training_value to be bool type which matches to the + # contract for layer/model call args. + if tf.is_tensor(training_value): + training_value = tf.cast(training_value, tf.bool) + else: + training_value = bool(training_value) + # Priority 4: trace layer with the default training argument + # specified in the `call` signature (or in inference mode if the + # `call` signature specifies no non-None default). + else: + training_value = self._call_spec.default_training_arg + # In cases (2), (3), (4) the training argument is passed + # automatically by the framework, and will not be hard-coded into + # the model. + if self._expects_training_arg: + args, kwargs = self._call_spec.set_arg_value( + "training", training_value, args, kwargs + ) + training_arg_passed_by_framework = True + + with call_context.enter( + layer=self, inputs=inputs, build_graph=True, training=training_value + ): + # Check input assumptions set after layer building, e.g. input + # shape. + try: + outputs = self._keras_tensor_symbolic_call( + inputs, input_masks, args, kwargs + ) + except TypeError as e: + if "DictWrapper" in str(e): + raise TypeError( + f"{self} could not be deserialized properly. Please" + " ensure that components that are Python object" + " instances (layers, models, etc.) returned by" + " `get_config()` are explicitly deserialized in the" + " model's `from_config()` method." 
+ ) from e + else: + raise e + + if outputs is None: + raise ValueError( + "A layer's `call` method should return a " + "Tensor or a list of Tensors, not None " + "(layer: " + self.name + ")." + ) + if training_arg_passed_by_framework: + args, kwargs = self._call_spec.set_arg_value( + "training", None, args, kwargs, pop_kwarg_if_none=True + ) + if mask_arg_passed_by_framework: + kwargs.pop("mask") + # Node connectivity does not special-case the first argument. + outputs = self._set_connectivity_metadata( + (inputs,) + args, kwargs, outputs + ) + return outputs + + def _set_training_mode(self, args, kwargs, call_context): + training_mode = None + if self._expects_training_arg: + # (1) `training` was passed to this `Layer.call`. + if self._call_spec.arg_was_passed("training", args, kwargs): + training_mode = self._call_spec.get_arg_value( + "training", args, kwargs + ) + # If no `training` arg was passed, or `None` was explicitly passed, + # the framework will decide what the training mode is. + if training_mode is None: + call_ctx_training = call_context.training + # (2) `training` mode is inferred from an outer `Layer.call`. + if call_ctx_training is not None: + training_mode = call_ctx_training + # (3) User set `tf.keras.backend.set_learning_phase`. + elif backend.global_learning_phase_is_set(): + training_mode = backend.learning_phase() + # Ensure value is a `bool` or `tf.bool`. + if isinstance(training_mode, bool): + pass + elif tf.is_tensor(training_mode): + training_mode = tf.cast(training_mode, tf.bool) + else: + training_mode = bool(training_mode) + # (4) We default to using `call`'s default value for `training`, + # or treating the layer as if it is in inference if no non-None + # default is specified in the `call` signature. + else: + training_mode = self._call_spec.default_training_arg + + # For cases (2), (3), (4) the `training` arg is passed by framework. + args, kwargs = self._call_spec.set_arg_value( + "training", training_mode, args, kwargs + ) else: - training_value = bool(training_value) - # Priority 4: trace layer with the default training argument specified - # in the `call` signature (or in inference mode if the `call` signature - # specifies no non-None default). - else: - training_value = self._call_spec.default_training_arg - # In cases (2), (3), (4) the training argument is passed automatically - # by the framework, and will not be hard-coded into the model. - if self._expects_training_arg: - args, kwargs = self._call_spec.set_arg_value('training', training_value, - args, kwargs) - training_arg_passed_by_framework = True - - with call_context.enter( - layer=self, inputs=inputs, build_graph=True, training=training_value): - # Check input assumptions set after layer building, e.g. input shape. - outputs = self._keras_tensor_symbolic_call( - inputs, input_masks, args, kwargs) - - if outputs is None: - raise ValueError('A layer\'s `call` method should return a ' - 'Tensor or a list of Tensors, not None ' - '(layer: ' + self.name + ').') - if training_arg_passed_by_framework: - args, kwargs = self._call_spec.set_arg_value( - 'training', None, args, kwargs, pop_kwarg_if_none=True) - if mask_arg_passed_by_framework: - kwargs.pop('mask') - # Node connectivity does not special-case the first argument. - outputs = self._set_connectivity_metadata((inputs,) + args, kwargs, - outputs) - return outputs - - def _set_training_mode(self, args, kwargs, call_context): - training_mode = None - if self._expects_training_arg: - # (1) `training` was passed to this `Layer.call`.
- if self._call_spec.arg_was_passed('training', args, kwargs): - training_mode = self._call_spec.get_arg_value('training', args, kwargs) - # If no `training` arg was passed, or `None` was explicitly passed, - # the framework will make a decision about the training mode is. - if training_mode is None: - call_ctx_training = call_context.training - # (2) `training` mode is inferred from an outer `Layer.call`. - if call_ctx_training is not None: - training_mode = call_ctx_training - # (3) User set `tf.keras.backend.set_learning_phase`. - elif backend.global_learning_phase_is_set(): - training_mode = backend.learning_phase() - # Ensure value is a `bool` or `tf.bool`. - if isinstance(training_mode, bool): - pass - elif tf.is_tensor(training_mode): - training_mode = tf.cast(training_mode, tf.bool) - else: - training_mode = bool(training_mode) - # (4) We default to using `call`'s default value for `training`, - # or treating the layer as if it is in inference if no non-None default - # is specified in the `call` signature. + if "training" in kwargs: + # `training` was passed to this `Layer` but is not needed for + # `Layer.call`. It will set the default mode for inner + # `Layer.call`s. + training_mode = kwargs.pop("training") + else: + # Grab the current `training` mode from any outer `Layer.call`. + training_mode = call_context.training + + return args, kwargs, training_mode + + def _autographed_call(self): + # Wrapping `call` function in autograph to allow for dynamic control + # flow and control dependencies in call. We are limiting this to + # subclassed layers as autograph is strictly needed only for + # subclassed layers and models. + # tf_convert will respect the value of autograph setting in the + # enclosing tf.function, if any. + if base_layer_utils.is_subclassed( + self + ) and not base_layer_utils.from_saved_model(self): + return tf.__internal__.autograph.tf_convert( + self.call, tf.__internal__.autograph.control_status_ctx() + ) + else: + return self.call + + @property + def _inbound_nodes(self): + return self._inbound_nodes_value + + @_inbound_nodes.setter + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _inbound_nodes(self, value): + self._inbound_nodes_value = value + + @property + def _outbound_nodes(self): + return self._outbound_nodes_value + + @_outbound_nodes.setter + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _outbound_nodes(self, value): + self._outbound_nodes_value = value + + def _set_dtype_policy(self, dtype): + """Sets self._dtype_policy.""" + self._dtype_policy = policy.get_policy(dtype) + + # Performance optimization: cache the compute dtype as a Dtype object or + # None, so that str to Dtype conversion doesn't happen in + # Layer.__call__. + # TODO(b/157486353): Investigate returning DTypes in Policy. + if self._dtype_policy.compute_dtype: + self._compute_dtype_object = tf.as_dtype( + self._dtype_policy.compute_dtype + ) + else: + self._compute_dtype_object = None + + @property + def _compute_dtype(self): + """Deprecated alias of `compute_dtype`.""" + return self._dtype_policy.compute_dtype + + def _maybe_cast_inputs(self, inputs, input_list=None): + """Maybe casts the inputs to the compute dtype. + + If self._compute_dtype is floating-point, and self_autocast is True, + floating-point inputs are casted to self._compute_dtype. + + Args: + inputs: Input tensor, or structure of input tensors. + input_list: Flat list of input tensors. 
+ + Returns: + `inputs`, but tensors may have been casted to self._compute_dtype + """ + if not input_list: + input_list = tf.nest.flatten(inputs) + + compute_dtype_object = self._compute_dtype_object + should_autocast = ( + self._autocast + and compute_dtype_object + and compute_dtype_object.is_floating + ) + + if should_autocast and any( + map(self._should_cast_single_input, input_list) + ): + # Only perform expensive `nest` operation when needed. + return tf.nest.map_structure(self._cast_single_input, inputs) else: - training_mode = self._call_spec.default_training_arg - - # For case (2), (3), (4) `training` arg is passed by framework. - args, kwargs = self._call_spec.set_arg_value('training', training_mode, - args, kwargs) - else: - if 'training' in kwargs: - # `training` was passed to this `Layer` but is not needed for - # `Layer.call`. It will set the default mode for inner `Layer.call`s. - training_mode = kwargs.pop('training') - else: - # Grab the current `training` mode from any outer `Layer.call`. - training_mode = call_context.training - - return args, kwargs, training_mode - - def _autographed_call(self): - # Wrapping `call` function in autograph to allow for dynamic control - # flow and control dependencies in call. We are limiting this to - # subclassed layers as autograph is strictly needed only for - # subclassed layers and models. - # tf_convert will respect the value of autograph setting in the - # enclosing tf.function, if any. - if (base_layer_utils.is_subclassed(self) and - not base_layer_utils.from_saved_model(self)): - return tf.__internal__.autograph.tf_convert( - self.call, tf.__internal__.autograph.control_status_ctx()) - else: - return self.call - - @property - def _inbound_nodes(self): - return self._inbound_nodes_value - - @_inbound_nodes.setter - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _inbound_nodes(self, value): - self._inbound_nodes_value = value - - @property - def _outbound_nodes(self): - return self._outbound_nodes_value - - @_outbound_nodes.setter - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _outbound_nodes(self, value): - self._outbound_nodes_value = value - - def _set_dtype_policy(self, dtype): - """Sets self._dtype_policy.""" - if isinstance(dtype, policy.Policy): - self._dtype_policy = dtype - elif isinstance(dtype, dict): - self._dtype_policy = policy.deserialize(dtype) - elif isinstance(dtype, str) and dtype in ('mixed_float16', - 'mixed_bfloat16'): - # The isinstance check is required since np.dtype raises an error if - # compared to a non-dtype string. - self._dtype_policy = policy.Policy(dtype) - elif dtype: - self._dtype_policy = policy.Policy(tf.as_dtype(dtype).name) - else: - self._dtype_policy = policy.global_policy() - if (self._dtype_policy.name == 'mixed_float16' and - not loss_scale_optimizer.strategy_supports_loss_scaling()): - # Although only loss scaling doesn't support certain strategies, to avoid - # confusion, we disallow the 'mixed_float16' policy with unsupported - # strategies. This is because 'mixed_float16' requires loss scaling for - # numeric stability. - strategy = tf.distribute.get_strategy() - raise ValueError('Mixed precision is not supported with the ' - 'tf.distribute.Strategy: %s. Either stop using mixed ' - 'precision by removing the use of the "%s" policy or ' - 'use a different Strategy, e.g. a MirroredStrategy.' 
% - (strategy.__class__.__name__, self._dtype_policy.name)) - - # Performance optimization: cache the compute dtype as a Dtype object or - # None, so that str to Dtype conversion doesn't happen in Layer.__call__. - # TODO(b/157486353): Investigate returning DTypes in Policy. - if self._dtype_policy.compute_dtype: - self._compute_dtype_object = tf.as_dtype( - self._dtype_policy.compute_dtype) - else: - self._compute_dtype_object = None - - @property - def _compute_dtype(self): - """Deprecated alias of `compute_dtype`.""" - return self._dtype_policy.compute_dtype - - def _maybe_cast_inputs(self, inputs, input_list=None): - """Maybe casts the inputs to the compute dtype. - - If self._compute_dtype is floating-point, and self_autocast is True, - floating-point inputs are casted to self._compute_dtype. + return inputs + + def _should_cast_single_input(self, x): + if isinstance(x, _AUTOCAST_TYPES): + return ( + self._compute_dtype_object + and x.dtype != self._compute_dtype_object + and x.dtype.is_floating + ) + return False + + def _cast_single_input(self, x): + """Cast a single Tensor or TensorSpec to the compute dtype.""" + if self._should_cast_single_input(x): + return tf.cast(x, self._compute_dtype_object) + else: + return x + + # _dtype used to be an attribute set in the constructor. We still expose it + # because some clients still use it. + # TODO(reedwm): Deprecate, then remove the _dtype property. + @property + def _dtype(self): + # This is equivalent to returning self.dtype . We do not return + # self.dtype as it would cause infinite recursion in a few subclasses, + # which override "dtype" to return self._dtype. + return self._dtype_policy.variable_dtype + + @_dtype.setter + def _dtype(self, value): + value = tf.as_dtype(value).name + self._set_dtype_policy(policy.Policy(value)) + + def _name_scope(self): + if not tf.__internal__.tf2.enabled(): + return self.name + name_scope = self.name + current_name_scope = tf.__internal__.get_name_scope() + if current_name_scope: + name_scope = current_name_scope + "/" + name_scope + if name_scope: + # Note that the trailing `/` prevents autogenerated + # numerical suffixes to get appended. It will also fully reset + # nested name scope (i.e. the outer name scope has no effect). + name_scope += "/" + return name_scope + + def _init_set_name(self, name, zero_based=True): + if name is None: + self._name = backend.unique_object_name( + generic_utils.to_snake_case(self.__class__.__name__), + zero_based=zero_based, + ) + elif isinstance(name, str): + backend.observe_object_name(name) + self._name = name + else: + raise TypeError( + f"Expected `name` argument to be a string, but got: {name}" + ) + + def _get_existing_metric(self, name=None): + match = [m for m in self._metrics if m.name == name] + if not match: + return + if len(match) > 1: + raise ValueError( + "Please provide different names for the metrics you have " + 'added. 
We found {} metrics with the name: "{}"'.format( + len(match), name + ) + ) + return match[0] + + def _handle_weight_regularization(self, name, variable, regularizer): + """Create lambdas which compute regularization losses.""" + + def _loss_for_variable(v): + """Creates a regularization loss `Tensor` for variable `v`.""" + with backend.name_scope(name + "/Regularizer"): + regularization = regularizer(v) + return regularization + + if base_layer_utils.is_split_variable(variable): + for v in variable: + self.add_loss(functools.partial(_loss_for_variable, v)) + elif isinstance(variable, lazy_variable.LazyInitVariable): + self._captured_weight_regularizer.append( + (name, variable, regularizer) + ) + else: + self.add_loss(functools.partial(_loss_for_variable, variable)) - Args: - inputs: Input tensor, or structure of input tensors. - input_list: Flat list of input tensors. + def _handle_activity_regularization(self, inputs, outputs): + # Apply activity regularization. + # Note that it should be applied every time the layer creates a new + # output, since it is output-specific. + if self._activity_regularizer: + output_list = tf.nest.flatten(outputs) + with backend.name_scope("ActivityRegularizer"): + for output in output_list: + activity_loss = tf.convert_to_tensor( + self._activity_regularizer(output) + ) + batch_size = tf.cast( + tf.shape(output)[0], activity_loss.dtype + ) + # Make activity regularization strength batch-agnostic. + mean_activity_loss = tf.math.divide_no_nan( + activity_loss, batch_size + ) + self.add_loss(mean_activity_loss) + + def _set_mask_metadata(self, inputs, outputs, previous_mask, build_graph): + # Many `Layer`s don't need to call `compute_mask`. + # This method is optimized to do as little work as needed for the common + # case. + if not self._supports_masking: + return + + flat_outputs = tf.nest.flatten(outputs) + + mask_already_computed = getattr( + self, "_compute_output_and_mask_jointly", False + ) or all( + getattr(x, "_keras_mask", None) is not None for x in flat_outputs + ) + if mask_already_computed: + if build_graph: + self._set_mask_keras_history_checked(flat_outputs) + return + + output_masks = self.compute_mask(inputs, previous_mask) + if output_masks is None: + return + + flat_masks = tf.nest.flatten(output_masks) + for tensor, mask in zip(flat_outputs, flat_masks): + try: + tensor._keras_mask = mask + except AttributeError: + # C Type such as np.ndarray. + pass + + if build_graph: + self._set_mask_keras_history_checked(flat_outputs) + + def _set_mask_keras_history_checked(self, flat_outputs): + for output in flat_outputs: + if getattr(output, "_keras_mask", None) is not None: + # Do not track masks for `TensorFlowOpLayer` construction. + output._keras_mask._keras_history_checked = True + + def _get_input_masks(self, inputs, input_list, args, kwargs): + if not self._supports_masking and not self._expects_mask_arg: + # Input masks only need to be retrieved if they are needed for + # `call` or `compute_mask`. + input_masks = None + implicit_mask = False + elif self._call_spec.arg_was_passed("mask", args, kwargs): + input_masks = self._call_spec.get_arg_value("mask", args, kwargs) + implicit_mask = False + else: + input_masks = [getattr(t, "_keras_mask", None) for t in input_list] + if all(mask is None for mask in input_masks): + input_masks = None + implicit_mask = False + else: + # Only do expensive `nest` op when masking is actually being + # used. 
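The implicit-mask plumbing handled by `_get_input_masks` pairs with `compute_mask` on the producing layer. A sketch with public APIs; `MaskedSum` is illustrative:

```python
import tensorflow as tf

class MaskedSum(tf.keras.layers.Layer):
    """Illustrative layer: consumes an implicit mask, stops its propagation."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True  # opt in to receiving masks

    def call(self, inputs, mask=None):
        if mask is not None:
            inputs = inputs * tf.cast(mask[..., None], inputs.dtype)
        return tf.reduce_sum(inputs, axis=1)

    def compute_mask(self, inputs, mask=None):
        return None  # the time axis is reduced away, so no mask remains

x = tf.keras.Input(shape=(None,), dtype=tf.int64)
h = tf.keras.layers.Embedding(100, 8, mask_zero=True)(x)  # sets `_keras_mask`
y = MaskedSum()(h)  # the mask arrives implicitly, no explicit `mask=` needed
```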
+ input_masks = tf.nest.pack_sequence_as(inputs, input_masks) + implicit_mask = True + return input_masks, implicit_mask + + def _set_connectivity_metadata(self, args, kwargs, outputs): + # If the layer returns tensors from its inputs unmodified, + # we copy them to avoid loss of KerasHistory metadata. + flat_outputs = tf.nest.flatten(outputs) + flat_inputs = tf.nest.flatten((args, kwargs)) + input_ids_set = {id(i) for i in flat_inputs} + outputs_copy = [] + for x in flat_outputs: + if id(x) in input_ids_set: + with backend.name_scope(self.name): + x = tf.identity(x) + outputs_copy.append(x) + outputs = tf.nest.pack_sequence_as(outputs, outputs_copy) + + # Create node, Node wires itself to inbound and outbound layers. The + # Node constructor actually updates this layer's self._inbound_nodes, + # sets _keras_history on the outputs, and adds itself to the + # `_outbound_nodes` of the layers that produced the inputs to this layer + # call. + node_module.Node( + self, call_args=args, call_kwargs=kwargs, outputs=outputs + ) + return outputs - Returns: - `inputs`, but tensors may have been casted to self._compute_dtype - """ - if not input_list: - input_list = tf.nest.flatten(inputs) - - compute_dtype_object = self._compute_dtype_object - should_autocast = ( - self._autocast and compute_dtype_object and - compute_dtype_object.is_floating) - - if (should_autocast and - any(map(self._should_cast_single_input, input_list))): - # Only perform expensive `nest` operation when needed. - return tf.nest.map_structure(self._cast_single_input, inputs) - else: - return inputs - - def _should_cast_single_input(self, x): - if isinstance(x, _AUTOCAST_TYPES): - return (self._compute_dtype_object and - x.dtype != self._compute_dtype_object and x.dtype.is_floating) - return False - - def _cast_single_input(self, x): - """Cast a single Tensor or TensorSpec to the compute dtype.""" - if self._should_cast_single_input(x): - return tf.cast(x, self._compute_dtype_object) - else: - return x - - # _dtype used to be an attribute set in the constructor. We still expose it - # because some clients still use it. - # TODO(reedwm): Deprecate, then remove the _dtype property. - @property - def _dtype(self): - # This is equivalent to returning self.dtype . We do not return self.dtype - # as it would cause infinite recursion in a few subclasses, which override - # "dtype" to return self._dtype. - return self._dtype_policy.variable_dtype - - @_dtype.setter - def _dtype(self, value): - value = tf.as_dtype(value).name - self._set_dtype_policy(policy.Policy(value)) - - def _name_scope(self): # pylint: disable=method-hidden - if not tf.__internal__.tf2.enabled(): - return self.name - name_scope = self.name - current_name_scope = tf.__internal__.get_name_scope() - if current_name_scope: - name_scope = current_name_scope + '/' + name_scope - if name_scope: - # Note that the trailing `/` prevents autogenerated - # numerical suffixes to get appended. It will also fully reset - # nested name scope (i.e. the outer name scope has no effect). 
- name_scope += '/' - return name_scope - - def _init_set_name(self, name, zero_based=True): - if name is None: - self._name = backend.unique_object_name( - generic_utils.to_snake_case(self.__class__.__name__), - zero_based=zero_based) - elif isinstance(name, str): - backend.observe_object_name(name) - self._name = name - else: - raise TypeError( - f'Expected `name` argument to be a string, but got: {name}') - - def _get_existing_metric(self, name=None): - match = [m for m in self._metrics if m.name == name] - if not match: - return - if len(match) > 1: - raise ValueError( - 'Please provide different names for the metrics you have added. ' - 'We found {} metrics with the name: "{}"'.format(len(match), name)) - return match[0] - - def _handle_weight_regularization(self, name, variable, regularizer): - """Create lambdas which compute regularization losses.""" - - def _loss_for_variable(v): - """Creates a regularization loss `Tensor` for variable `v`.""" - with backend.name_scope(name + '/Regularizer'): - regularization = regularizer(v) - return regularization - - if base_layer_utils.is_split_variable(variable): - for v in variable: - self.add_loss(functools.partial(_loss_for_variable, v)) - elif isinstance(variable, lazy_variable.LazyInitVariable): - self._captured_weight_regularizer.append((name, variable, regularizer)) - else: - self.add_loss(functools.partial(_loss_for_variable, variable)) - - def _handle_activity_regularization(self, inputs, outputs): - # Apply activity regularization. - # Note that it should be applied every time the layer creates a new - # output, since it is output-specific. - if self._activity_regularizer: - output_list = tf.nest.flatten(outputs) - with backend.name_scope('ActivityRegularizer'): - for output in output_list: - activity_loss = tf.convert_to_tensor( - self._activity_regularizer(output)) - batch_size = tf.cast( - tf.shape(output)[0], activity_loss.dtype) - # Make activity regularization strength batch-agnostic. - mean_activity_loss = activity_loss / batch_size - self.add_loss(mean_activity_loss) - - def _set_mask_metadata(self, inputs, outputs, previous_mask, build_graph): - # Many `Layer`s don't need to call `compute_mask`. - # This method is optimized to do as little work as needed for the common - # case. - if not self._supports_masking: - return - - flat_outputs = tf.nest.flatten(outputs) - - mask_already_computed = ( - getattr(self, '_compute_output_and_mask_jointly', False) or - all(getattr(x, '_keras_mask', None) is not None for x in flat_outputs)) - if mask_already_computed: - if build_graph: - self._set_mask_keras_history_checked(flat_outputs) - return - - output_masks = self.compute_mask(inputs, previous_mask) - if output_masks is None: - return - - flat_masks = tf.nest.flatten(output_masks) - for tensor, mask in zip(flat_outputs, flat_masks): - try: - tensor._keras_mask = mask - except AttributeError: - # C Type such as np.ndarray. - pass + def _get_node_attribute_at_index(self, node_index, attr, attr_name): + """Private utility to retrieves an attribute (e.g. inputs) from a node. + + This is used to implement the methods: + - get_input_shape_at + - get_output_shape_at + - get_input_at + etc... + + Args: + node_index: Integer index of the node from which + to retrieve the attribute. + attr: Exact node attribute name. + attr_name: Human-readable attribute name, for error messages. + + Returns: + The layer's attribute `attr` at the node of index `node_index`. 
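The trailing-slash behavior of `_name_scope` noted above is visible in the names of the weights a layer creates; a short sketch:

```python
import tensorflow as tf

# The trailing "/" returned by `_name_scope` makes the scope absolute and
# suffix-free, so weights land under the exact layer name:
d = tf.keras.layers.Dense(1, name="my_dense")
d(tf.zeros((1, 3)))  # builds the weights inside the layer's name scope
print([v.name for v in d.weights])
# ['my_dense/kernel:0', 'my_dense/bias:0']
```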
+ + Raises: + RuntimeError: If the layer has no inbound nodes, or if called in + Eager mode. + ValueError: If the index provided does not match any node. + """ + if not self._inbound_nodes: + raise RuntimeError( + f"The layer {self.name} has never been called " + f"and thus has no defined {attr_name}." + ) + if not len(self._inbound_nodes) > node_index: + raise ValueError( + f"Asked to get {attr_name} at node " + f"{node_index}, but the layer has only " + f"{len(self._inbound_nodes)} inbound nodes." + ) + values = getattr(self._inbound_nodes[node_index], attr) + if isinstance(values, list) and len(values) == 1: + return values[0] + else: + return values - if build_graph: - self._set_mask_keras_history_checked(flat_outputs) - - def _set_mask_keras_history_checked(self, flat_outputs): - for output in flat_outputs: - if getattr(output, '_keras_mask', None) is not None: - # Do not track masks for `TensorFlowOpLayer` construction. - output._keras_mask._keras_history_checked = True - - def _get_input_masks(self, inputs, input_list, args, kwargs): - if not self._supports_masking and not self._expects_mask_arg: - # Input masks only need to be retrieved if they are needed for `call` - # or `compute_mask`. - input_masks = None - implicit_mask = False - elif self._call_spec.arg_was_passed('mask', args, kwargs): - input_masks = self._call_spec.get_arg_value('mask', args, kwargs) - implicit_mask = False - else: - input_masks = [getattr(t, '_keras_mask', None) for t in input_list] - if all(mask is None for mask in input_masks): - input_masks = None - implicit_mask = False - else: - # Only do expensive `nest` op when masking is actually being used. - input_masks = tf.nest.pack_sequence_as(inputs, input_masks) - implicit_mask = True - return input_masks, implicit_mask - - def _set_connectivity_metadata(self, args, kwargs, outputs): - # If the layer returns tensors from its inputs unmodified, - # we copy them to avoid loss of KerasHistory metadata. - flat_outputs = tf.nest.flatten(outputs) - flat_inputs = tf.nest.flatten((args, kwargs)) - input_ids_set = {id(i) for i in flat_inputs} - outputs_copy = [] - for x in flat_outputs: - if id(x) in input_ids_set: - with backend.name_scope(self.name): - x = tf.identity(x) - outputs_copy.append(x) - outputs = tf.nest.pack_sequence_as(outputs, outputs_copy) - - # Create node, Node wires itself to inbound and outbound layers. - # The Node constructor actually updates this layer's self._inbound_nodes, - # sets _keras_history on the outputs, and adds itself to the - # `_outbound_nodes` of the layers that produced the inputs to this - # layer call. - node_module.Node(self, call_args=args, call_kwargs=kwargs, outputs=outputs) - return outputs - - def _get_node_attribute_at_index(self, node_index, attr, attr_name): - """Private utility to retrieves an attribute (e.g. inputs) from a node. - - This is used to implement the methods: - - get_input_shape_at - - get_output_shape_at - - get_input_at - etc... + def _maybe_build(self, inputs): + # Check input assumptions set before layer building, e.g. input rank. + if not self.built: + input_spec.assert_input_compatibility( + self.input_spec, inputs, self.name + ) + input_list = tf.nest.flatten(inputs) + if input_list and self._dtype_policy.compute_dtype is None: + try: + dtype = input_list[0].dtype.base_dtype.name + except AttributeError: + pass + else: + self._set_dtype_policy(policy.Policy(dtype)) + input_shapes = None + # Converts Tensors / CompositeTensors to TensorShapes. 
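Before `build` runs, `_maybe_build` checks the inputs against `self.input_spec`; a sketch of a layer relying on that check (`Rank2Only` is hypothetical):

```python
import tensorflow as tf

class Rank2Only(tf.keras.layers.Layer):
    """Sketch: `input_spec` is enforced before `build` is called."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = tf.keras.layers.InputSpec(ndim=2)

    def build(self, input_shape):
        self.scale = self.add_weight("scale", shape=(input_shape[-1],))
        super().build(input_shape)

    def call(self, inputs):
        return inputs * self.scale

Rank2Only()(tf.zeros((2, 3)))        # ok: rank-2 input
# Rank2Only()(tf.zeros((2, 3, 4)))   # raises: InputSpec expects ndim=2
```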
+ if any(hasattr(x, "shape") for x in input_list): + input_shapes = tf_utils.get_shapes(inputs) + else: + # Converts input shape to TensorShapes. + try: + input_shapes = tf_utils.convert_shapes( + inputs, to_tuples=False + ) + except ValueError: + pass + # Only call `build` if the user has manually overridden the build + # method. + if not hasattr(self.build, "_is_default"): + # Any setup work performed only once should happen in an + # `init_scope` to avoid creating symbolic Tensors that will + # later pollute any eager operations. + with tf_utils.maybe_init_scope(self): + self.build(input_shapes) + # We must set also ensure that the layer is marked as built, and the + # build shape is stored since user defined build functions may not + # be calling `super.build()` + Layer.build(self, input_shapes) + + # Optionally load weight values specified at layer instantiation. + if self._initial_weights is not None: + with tf.init_scope(): + # Using `init_scope` since we want variable assignment in + # `set_weights` to be treated like variable initialization. + self.set_weights(self._initial_weights) + self._initial_weights = None + + def _get_trainable_state(self): + """Get the `trainable` state of each sublayer. + + Returns: + A dict mapping all sublayers to their `trainable` value. + """ + trainable_state = weakref.WeakKeyDictionary() + for layer in self._flatten_layers(): + trainable_state[layer] = layer.trainable + return trainable_state + + def _set_trainable_state(self, trainable_state): + """Set `trainable` state for each sublayer.""" + for layer in self._flatten_layers(): + if layer in trainable_state: + layer.trainable = trainable_state[layer] + + @property + def _obj_reference_counts(self): + """A dict counting the number of attributes referencing an object.""" + self._maybe_create_attribute( + "_obj_reference_counts_dict", + object_identity.ObjectIdentityDictionary(), + ) + return self._obj_reference_counts_dict + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _maybe_create_attribute(self, name, default_value): + """Create attribute (with the default value) if it hasn't been created. + + This is useful for fields that is used for tracking purpose, + _trainable_weights, or _layers. Note that user could create a layer + subclass and assign an internal field before invoking the + Layer.__init__(), the __setattr__() need to create the tracking fields + and __init__() need to not override them. + + Args: + name: String, the name of the attribute. + default_value: Object, the default value of the attribute. + """ + if not hasattr(self, name): + self.__setattr__(name, default_value) + + def __delattr__(self, name): + # For any super.__delattr__() call, we will directly use the + # implementation in Trackable and skip the behavior in AutoTrackable. + # The Layer was originally use Trackable as base class, the change of + # using Module as base class forced us to have AutoTrackable in the + # class hierarchy. + # + # TODO(b/180760306) Keeping the status quo of skipping _delattr__ and + # __setattr__ in AutoTrackable may be unsustainable. + existing_value = getattr(self, name, None) + + # If this value is replacing an existing object assigned to an + # attribute, we should clean it out to avoid leaking memory. First we + # check if there are other attributes referencing it. 
+ reference_counts = self._obj_reference_counts + if existing_value not in reference_counts: + super(tf.__internal__.tracking.AutoTrackable, self).__delattr__( + name + ) + return + + reference_count = reference_counts[existing_value] + if reference_count > 1: + # There are other remaining references. We can't remove this object + # from _layers etc. + reference_counts[existing_value] = reference_count - 1 + super(tf.__internal__.tracking.AutoTrackable, self).__delattr__( + name + ) + return + else: + # This is the last remaining reference. + del reference_counts[existing_value] + + super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) + + if isinstance(existing_value, Layer) or base_layer_utils.has_weights( + existing_value + ): + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + "_self_tracked_trackables", + [ + l + for l in self._self_tracked_trackables + if l is not existing_value + ], + ) + if isinstance(existing_value, tf.Variable): + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + "_trainable_weights", + [w for w in self._trainable_weights if w is not existing_value], + ) + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + "_non_trainable_weights", + [ + w + for w in self._non_trainable_weights + if w is not existing_value + ], + ) + + def __setattr__(self, name, value): + if ( + name == "_self_setattr_tracking" + or not getattr(self, "_self_setattr_tracking", True) + # Exclude @property.setters from tracking + or hasattr(self.__class__, name) + ): + try: + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + name, value + ) + except AttributeError: + raise AttributeError( + ( + 'Can\'t set the attribute "{}", likely because it ' + "conflicts with an existing read-only @property of the " + "object. Please choose a different name." + ).format(name) + ) + return + + # Wraps data structures in `Trackable`, unwraps `NoDependency` objects. + value = tf.__internal__.tracking.sticky_attribute_assignment( + trackable=self, value=value, name=name + ) + + reference_counts = self._obj_reference_counts + reference_counts[value] = reference_counts.get(value, 0) + 1 + + # When replacing an existing tf.Variable with a new one, we want to + # check its existing position in the + # self._trainable/non_trainable_variable, so that we can put it back to + # the original position. + if isinstance(value, tf.Variable) and isinstance( + getattr(self, name, None), tf.Variable + ): + existing_variable = getattr(self, name) + + def _get_variable_from_list(var_list, var): + # helper function to get the tf.variable from the list + # the default list.index() use == for comparison, which will + # cause issue for eager tensor. + for i in range(len(var_list)): + if var_list[i] is var: + return i + return None + + if existing_variable.trainable: + self._maybe_create_attribute("_trainable_weights", []) + position = _get_variable_from_list( + self._trainable_weights, existing_variable + ) + else: + self._maybe_create_attribute("_non_trainable_variable", []) + position = _get_variable_from_list( + self._non_trainable_variable, existing_variable + ) + else: + position = None - Args: - node_index: Integer index of the node from which - to retrieve the attribute. - attr: Exact node attribute name. - attr_name: Human-readable attribute name, for error messages. - - Returns: - The layer's attribute `attr` at the node of index `node_index`. - - Raises: - RuntimeError: If the layer has no inbound nodes, or if called in Eager - mode. 
- ValueError: If the index provided does not match any node. - """ - if not self._inbound_nodes: - raise RuntimeError(f'The layer {self.name} has never been called ' - f'and thus has no defined {attr_name}.') - if not len(self._inbound_nodes) > node_index: - raise ValueError(f'Asked to get {attr_name} at node ' - f'{node_index}, but the layer has only ' - f'{len(self._inbound_nodes)} inbound nodes.') - values = getattr(self._inbound_nodes[node_index], attr) - if isinstance(values, list) and len(values) == 1: - return values[0] - else: - return values - - def _maybe_build(self, inputs): - # Check input assumptions set before layer building, e.g. input rank. - if not self.built: - input_spec.assert_input_compatibility( - self.input_spec, inputs, self.name) - input_list = tf.nest.flatten(inputs) - if input_list and self._dtype_policy.compute_dtype is None: + # Clean out the old attribute, which clears _layers and + # _trainable_weights if necessary. try: - dtype = input_list[0].dtype.base_dtype.name + self.__delattr__(name) except AttributeError: - pass - else: - self._set_dtype_policy(policy.Policy(dtype)) - input_shapes = None - # Converts Tensors / CompositeTensors to TensorShapes. - if any(hasattr(x, 'shape') for x in input_list): - input_shapes = tf_utils.get_shapes(inputs) - else: - # Converts input shape to TensorShapes. - try: - input_shapes = tf_utils.convert_shapes(inputs, to_tuples=False) - except ValueError: - pass - # Only call `build` if the user has manually overridden the build method. - if not hasattr(self.build, '_is_default'): - # Any setup work performed only once should happen in an `init_scope` - # to avoid creating symbolic Tensors that will later pollute any eager - # operations. - with tf_utils.maybe_init_scope(self): - self.build(input_shapes) # pylint:disable=not-callable - # We must set also ensure that the layer is marked as built, and the build - # shape is stored since user defined build functions may not be calling - # `super.build()` - Layer.build(self, input_shapes) - - # Optionally load weight values specified at layer instantiation. - if self._initial_weights is not None: - with tf.init_scope(): - # Using `init_scope` since we want variable assignment in - # `set_weights` to be treated like variable initialization. - self.set_weights(self._initial_weights) - self._initial_weights = None - - def _get_trainable_state(self): - """Get the `trainable` state of each sublayer. - - Returns: - A dict mapping all sublayers to their `trainable` value. - """ - trainable_state = weakref.WeakKeyDictionary() - for layer in self._flatten_layers(): - trainable_state[layer] = layer.trainable - return trainable_state - - def _set_trainable_state(self, trainable_state): - """Set `trainable` state for each sublayer.""" - for layer in self._flatten_layers(): - if layer in trainable_state: - layer.trainable = trainable_state[layer] - - @property - def _obj_reference_counts(self): - """A dictionary counting the number of attributes referencing an object.""" - self._maybe_create_attribute('_obj_reference_counts_dict', - object_identity.ObjectIdentityDictionary()) - return self._obj_reference_counts_dict - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _maybe_create_attribute(self, name, default_value): - """Create the attribute with the default value if it hasn't been created. - - This is useful for fields that is used for tracking purpose, - _trainable_weights, or _layers. 
Note that user could create a layer subclass - and assign an internal field before invoking the Layer.__init__(), the - __setattr__() need to create the tracking fields and __init__() need to not - override them. + pass - Args: - name: String, the name of the attribute. - default_value: Object, the default value of the attribute. - """ - if not hasattr(self, name): - self.__setattr__(name, default_value) - - def __delattr__(self, name): - # For any super.__delattr__() call, we will directly use the implementation - # in Trackable and skip the behavior in AutoTrackable. The Layer was - # originally use Trackable as base class, the change of using Module as base - # class forced us to have AutoTrackable in the class hierarchy. - # - # TODO(b/180760306) Keeping the status quo of skipping _delattr__ and - # __setattr__ in AutoTrackable may be unsustainable. - existing_value = getattr(self, name, None) - - # If this value is replacing an existing object assigned to an attribute, we - # should clean it out to avoid leaking memory. First we check if there are - # other attributes referencing it. - reference_counts = self._obj_reference_counts - if existing_value not in reference_counts: - super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) # pylint: disable=bad-super-call - return - - reference_count = reference_counts[existing_value] - if reference_count > 1: - # There are other remaining references. We can't remove this object from - # _layers etc. - reference_counts[existing_value] = reference_count - 1 - super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) # pylint: disable=bad-super-call - return - else: - # This is the last remaining reference. - del reference_counts[existing_value] - - super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) # pylint: disable=bad-super-call - - if (isinstance(existing_value, Layer) - or base_layer_utils.has_weights(existing_value)): - super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( # pylint: disable=bad-super-call - '_self_tracked_trackables', - [l for l in self._self_tracked_trackables if l is not existing_value]) - if isinstance(existing_value, tf.Variable): - super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( # pylint: disable=bad-super-call - '_trainable_weights', - [w for w in self._trainable_weights if w is not existing_value]) - super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( # pylint: disable=bad-super-call - '_non_trainable_weights', - [w for w in self._non_trainable_weights if w is not existing_value]) - - def __setattr__(self, name, value): - if (name == '_self_setattr_tracking' or - not getattr(self, '_self_setattr_tracking', True) or - # Exclude @property.setters from tracking - hasattr(self.__class__, name)): - try: + # Keep track of metric instance created in subclassed layer. + for val in tf.nest.flatten(value): + if isinstance(val, metrics_mod.Metric) and hasattr( + self, "_metrics" + ): + self._metrics.append(val) + + # Append value to self._self_tracked_trackables if relevant + if getattr(self, "_auto_track_sub_layers", True) and ( + isinstance(value, tf.Module) or base_layer_utils.has_weights(value) + ): + self._maybe_create_attribute("_self_tracked_trackables", []) + # We need to check object identity to avoid de-duplicating empty + # container types which compare equal. 
+ if not any( + (layer is value for layer in self._self_tracked_trackables) + ): + self._self_tracked_trackables.append(value) + if hasattr(value, "_use_resource_variables"): + # Legacy layers (V1 tf.layers) must always use + # resource variables. + value._use_resource_variables = True + + # Append value to list of trainable / non-trainable weights if relevant + # TODO(b/125122625): This won't pick up on any variables added to a + # list/dict after creation. + self._track_variables(value, position=position) + + # TODO(b/180760306) Skip the auto trackable from tf.Module to keep + # status quo. See the comment at __delattr__. super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( - name, value) # pylint: disable=bad-super-call - except AttributeError: - raise AttributeError( - ('Can\'t set the attribute "{}", likely because it conflicts with ' - 'an existing read-only @property of the object. Please choose a ' - 'different name.').format(name)) - return - - # Wraps data structures in `Trackable`, unwraps `NoDependency` objects. - value = tf.__internal__.tracking.sticky_attribute_assignment( - trackable=self, value=value, name=name) - - reference_counts = self._obj_reference_counts - reference_counts[value] = reference_counts.get(value, 0) + 1 - - # Clean out the old attribute, which clears _layers and _trainable_weights - # if necessary. - try: - self.__delattr__(name) - except AttributeError: - pass - - # Keep track of metric instance created in subclassed layer. - for val in tf.nest.flatten(value): - if isinstance(val, metrics_mod.Metric) and hasattr(self, '_metrics'): - self._metrics.append(val) - - # Append value to self._self_tracked_trackables if relevant - if (getattr(self, '_auto_track_sub_layers', True) and - (isinstance(value, tf.Module) or - base_layer_utils.has_weights(value))): - self._maybe_create_attribute('_self_tracked_trackables', []) - # We need to check object identity to avoid de-duplicating empty - # container types which compare equal. - if not any((layer is value for layer in self._self_tracked_trackables)): - self._self_tracked_trackables.append(value) - if hasattr(value, '_use_resource_variables'): - # Legacy layers (V1 tf.layers) must always use - # resource variables. - value._use_resource_variables = True - - # Append value to list of trainable / non-trainable weights if relevant - # TODO(b/125122625): This won't pick up on any variables added to a - # list/dict after creation. - for val in tf.nest.flatten(value, expand_composites=True): - if not isinstance(val, tf.Variable): - continue - - # Users may add extra weights/variables - # simply by assigning them to attributes (invalid for graph networks) - self._maybe_create_attribute('_trainable_weights', []) - self._maybe_create_attribute('_non_trainable_weights', []) - if val.trainable: - if any(val is w for w in self._trainable_weights): - continue - self._trainable_weights.append(val) - else: - if any(val is w for w in self._non_trainable_weights): - continue - self._non_trainable_weights.append(val) - - backend.track_variable(val) - - # TODO(b/180760306) Skip the auto trackable from tf.Module to keep status - # quo. See the comment at __delattr__. 
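Attribute assignment is also how sublayers and bare `tf.Variable`s get picked up into `_self_tracked_trackables` and the weight lists; a sketch (`Tracked` is illustrative):

```python
import tensorflow as tf

class Tracked(tf.keras.layers.Layer):
    """Sketch: `__setattr__` auto-tracks sublayers and variables."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.inner = tf.keras.layers.Dense(2)  # tracked as a sublayer
        self.v = tf.Variable(3.0)              # tracked as a trainable weight
        self.note = "plain Python value, not tracked"

    def call(self, inputs):
        return self.inner(inputs) + self.v

t = Tracked()
t(tf.zeros((1, 4)))    # builds `inner`
print(len(t.weights))  # 3: Dense kernel, Dense bias, and `v`
```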
- super(tf.__internal__.tracking.AutoTrackable, self).__setattr__(name, value) # pylint: disable=bad-super-call - - def _gather_children_attribute(self, attribute): - assert attribute in { - 'variables', 'trainable_variables', 'non_trainable_variables' - } - if hasattr(self, '_self_tracked_trackables'): - nested_layers = self._flatten_modules(include_self=False, recursive=False) - return list( - itertools.chain.from_iterable( - getattr(layer, attribute) for layer in nested_layers)) - return [] - - def _flatten_layers(self, recursive=True, include_self=True): - for m in self._flatten_modules( - recursive=recursive, include_self=include_self): - if isinstance(m, Layer): - yield m - - def _flatten_modules(self, recursive=True, include_self=True): - """Flattens `tf.Module` instances (excluding `Metrics`). - - Args: - recursive: Whether to recursively flatten through submodules. - include_self: Whether to include this `Layer` instance. + name, value + ) + + def _update_trackables(self): + """Track variables added to lists/dicts after creation""" + for trackable_obj in self._self_tracked_trackables: + if isinstance( + trackable_obj, tf.__internal__.tracking.TrackableDataStructure + ): + self._track_variables(trackable_obj) + + def _track_variables(self, value, position=None): + """Tracks `Variable`s including `Variable`s in `CompositeTensor`s.""" + for val in tf.nest.flatten(value): + if isinstance(val, tf.Variable): + self._track_variable(val, position=position) + elif tf_utils.is_extension_type(val): + # Manually expand extension types to track resource variables. + nested_vals = tf_utils.type_spec_from_value(val)._to_components( + val + ) + self._track_variables(nested_vals, position=position) + + def _track_variable(self, val, position=None): + """Tracks the given `tf.Variable`.""" + # Users may add extra weights/variables simply by assigning them to + # attributes (invalid for graph networks) + self._maybe_create_attribute("_trainable_weights", []) + self._maybe_create_attribute("_non_trainable_weights", []) + if val.trainable: + if any(val is w for w in self._trainable_weights): + return + if position is None: + self._trainable_weights.append(val) + else: + self._trainable_weights.insert(position, val) + else: + if any(val is w for w in self._non_trainable_weights): + return + if position is None: + self._non_trainable_weights.append(val) + else: + self._non_trainable_weights.insert(position, val) + backend.track_variable(val) + + def _gather_children_attribute(self, attribute): + assert attribute in { + "variables", + "trainable_variables", + "non_trainable_variables", + } + if hasattr(self, "_self_tracked_trackables"): + nested_layers = self._flatten_modules( + include_self=False, recursive=False + ) + return list( + itertools.chain.from_iterable( + getattr(layer, attribute) for layer in nested_layers + ) + ) + return [] + + def _flatten_layers(self, recursive=True, include_self=True): + for m in self._flatten_modules( + recursive=recursive, include_self=include_self + ): + if isinstance(m, Layer): + yield m + + def _flatten_modules(self, recursive=True, include_self=True): + """Flattens `tf.Module` instances (excluding `Metrics`). + + Args: + recursive: Whether to recursively flatten through submodules. + include_self: Whether to include this `Layer` instance. + + Yields: + `tf.Module` instance tracked by this `Layer`. + """ + if include_self: + yield self + + # Only instantiate set and deque if needed. 
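The `_track_variables` helper above leans on `tf.nest.flatten` to discover variables inside nested containers before handing each one to `_track_variable`. A standalone sketch of that flatten-and-filter idea (the `find_variables` helper is hypothetical, named here only for illustration):

```python
import tensorflow as tf


def find_variables(value):
    """Collect `tf.Variable`s from an arbitrarily nested structure,
    split by trainability -- the flatten-and-filter idea that
    `_track_variables` applies to newly assigned attributes."""
    trainable, non_trainable = [], []
    for val in tf.nest.flatten(value):
        if isinstance(val, tf.Variable):
            (trainable if val.trainable else non_trainable).append(val)
    return trainable, non_trainable


value = {"w": tf.Variable(1.0), "stats": [tf.Variable(0.0, trainable=False), 3]}
train_vars, frozen_vars = find_variables(value)
assert len(train_vars) == 1 and len(frozen_vars) == 1
```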
+ trackables = getattr(self, "_self_tracked_trackables", None) + if trackables: + seen_object_ids = set() + deque = collections.deque(trackables) + while deque: + trackable_obj = deque.popleft() + trackable_id = id(trackable_obj) + if trackable_id in seen_object_ids: + continue + seen_object_ids.add(trackable_id) + + # Metrics are not considered part of the Layer's topology. + if isinstance(trackable_obj, tf.Module) and not isinstance( + trackable_obj, metrics_mod.Metric + ): + yield trackable_obj + # Introspect recursively through sublayers. + if recursive: + subtrackables = getattr( + trackable_obj, "_self_tracked_trackables", None + ) + if subtrackables: + deque.extendleft(reversed(subtrackables)) + elif isinstance( + trackable_obj, + tf.__internal__.tracking.TrackableDataStructure, + ): + # Data structures are introspected even with + # `recursive=False`. + tracked_values = trackable_obj._values + if tracked_values: + deque.extendleft(reversed(tracked_values)) + + # This is a hack so that the is_layer (within + # training/trackable/layer_utils.py) check doesn't get the weights attr. + # TODO(b/110718070): Remove when fixed. + def _is_layer(self): + return True + + def _init_call_fn_args(self, expects_training_arg=None): + self._call_spec = layer_utils.CallFunctionSpec( + tf_inspect.getfullargspec(self.call) + ) + if expects_training_arg is not None: + self._call_spec.expects_training_arg = expects_training_arg + + @property + def _expects_training_arg(self): + """Whether the call function uses 'training' as a parameter.""" + return self._call_spec.expects_training_arg + + @property + def _expects_mask_arg(self): + return self._call_spec.expects_mask_arg + + @property + def _eager_losses(self): + # A list of loss values containing activity regularizers and losses + # manually added through `add_loss` during eager execution. It is + # cleared after every batch. Because we plan on eventually allowing the + # same model instance to be trained alternately in eager mode and + # graph mode, we need to keep track of eager losses and symbolic + # losses via separate attributes. + if not hasattr(self._thread_local, "_eager_losses"): + self._thread_local._eager_losses = [] + return self._thread_local._eager_losses + + @_eager_losses.setter + def _eager_losses(self, losses): + self._thread_local._eager_losses = losses + + def _dedup_weights(self, weights): + """Dedupe weights while maintaining order as much as possible.""" + output, seen_ids = [], set() + for w in weights: + if id(w) not in seen_ids: + output.append(w) + # Track the Variable's identity to avoid __eq__ issues. + seen_ids.add(id(w)) + return output + + # SavedModel properties. Please see keras/saving/saved_model for details. + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _set_save_spec(self, inputs, args=None, kwargs=None): + """Defines the save spec so that serialization can trace layer calls. + + The TensorSpecs of the call function `inputs`, `args`, and `kwargs` are + saved into a tuple of `([inputs] + args, kwargs)`. + + Args: + inputs: possibly nested inputs passed into the call function. + args: a list of positional arguments passed into call. + kwargs: a dictionary of keyword arguments passed into call. + """ + if self._saved_model_inputs_spec is not None: + return # Already set. + + inputs_spec = tf.nest.map_structure(tf_utils.get_tensor_spec, inputs) + args_spec = tf.nest.map_structure(tf_utils.get_tensor_spec, args or []) + kwargs_spec = {} + # Filter out non-tensor arguments from kwargs.
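The `_flatten_modules` generator above pairs a deque with a set of `id()`s, the same identity-based bookkeeping `_dedup_weights` uses, since `tf.Variable` overloads `__eq__` elementwise and is unusable for set membership. A generic sketch of the pattern (the `flatten_tree` helper is ours, not from this patch):

```python
import collections


def flatten_tree(root, children_fn, recursive=True):
    """Pre-order traversal with identity-based cycle protection,
    mirroring the deque/seen-ids pattern of `_flatten_modules`."""
    yield root
    to_visit = collections.deque(children_fn(root))
    seen_object_ids = set()
    while to_visit:
        node = to_visit.popleft()
        if id(node) in seen_object_ids:
            continue
        seen_object_ids.add(id(node))
        yield node
        if recursive:
            # extendleft reverses its argument, so reverse first to keep
            # children in their original order at the front of the queue.
            to_visit.extendleft(reversed(list(children_fn(node))))
```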
+ for key, kwarg in kwargs.items(): + flat_kwarg = tf.nest.flatten(kwarg) + flat_specs = [tf_utils.get_tensor_spec(x) for x in flat_kwarg] + if any(s is None for s in flat_specs): + continue + kwargs_spec[key] = tf.nest.pack_sequence_as(kwarg, flat_specs) + + self._saved_model_inputs_spec = inputs_spec + self._saved_model_arg_spec = ( + [inputs_spec] + list(args_spec), + kwargs_spec, + ) + + def _get_save_spec(self, dynamic_batch=True, inputs_only=True): + if self._saved_model_inputs_spec is None: + return None + + spec = tf.nest.map_structure( + lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), + self._saved_model_arg_spec, + ) + return spec[0][0] if inputs_only else spec + + @property + def _trackable_saved_model_saver(self): + return layer_serialization.LayerSavedModelSaver(self) + + @property + def _object_identifier(self): + return self._trackable_saved_model_saver.object_identifier + + @property + def _tracking_metadata(self): + """Info about this layer to be saved into the SavedModel.""" + return self._trackable_saved_model_saver.tracking_metadata + + def _trackable_children(self, save_type="checkpoint", **kwargs): + if save_type == "savedmodel": + cache = kwargs["cache"] + # TODO(b/213628533): This must be called before super() to ensure + # that any input shape changes are applied before getting the config + # of the model. + children = self._trackable_saved_model_saver.trackable_children( + cache + ) + else: + children = {} + children.update(super()._trackable_children(save_type, **kwargs)) + return children + + @property + def _use_input_spec_as_call_signature(self): + # Whether input spec can be used as the call signature when tracing the + # Layer for SavedModel. By default, this is set to `True` for layers + # exported from the Keras library, because the layers more rigidly + # define the `input_specs` property (many custom layers only set the + # `ndims`) + return ( + get_canonical_name_for_symbol(type(self), api_name="keras") + is not None + ) + + def __getstate__(self): + # Override to support `copy.deepcopy` and pickling. + # Thread-local objects cannot be copied in Python 3, so pop these. + # Thread-local objects are used to cache losses in MirroredStrategy, and + # so shouldn't be copied. + state = self.__dict__.copy() + state.pop("_thread_local", None) + state.pop("_metrics_lock", None) + return state + + def __setstate__(self, state): + state["_thread_local"] = threading.local() + state["_metrics_lock"] = threading.Lock() + # Bypass Trackable logic as `__dict__` already contains this info. + object.__setattr__(self, "__dict__", state) + + def save_own_variables(self, store): + """Saves the state of the layer. + + You can override this method to take full control of how the state of + the layer is saved upon calling `model.save()`. + + Args: + store: Dict where the state of the model will be saved. + """ + all_vars = self._trainable_weights + self._non_trainable_weights + for i, v in enumerate(all_vars): + store[f"{i}"] = v.numpy() + + def load_own_variables(self, store): + """Loads the state of the layer. + + You can override this method to take full control of how the state of + the layer is loaded upon calling `keras.models.load_model()`. + + Args: + store: Dict from which the state of the model will be loaded. 
+ """ + self._update_trackables() + all_vars = self._trainable_weights + self._non_trainable_weights + if len(store.keys()) != len(all_vars): + raise ValueError( + f"Layer '{self.name}' expected {len(all_vars)} variables, " + "but received " + f"{len(store.keys())} variables during loading. " + f"Expected: {[v.name for v in all_vars]}" + ) + for i, v in enumerate(all_vars): + # TODO(rchao): check shapes and raise errors. + v.assign(store[f"{i}"]) - Yields: - `tf.Module` instance tracked by this `Layer`. - """ - if include_self: - yield self - - # Only instantiate set and deque if needed. - trackables = getattr(self, '_self_tracked_trackables', None) - if trackables: - seen_object_ids = set() - deque = collections.deque(trackables) - while deque: - trackable_obj = deque.popleft() - trackable_id = id(trackable_obj) - if trackable_id in seen_object_ids: - continue - seen_object_ids.add(trackable_id) - - # Metrics are not considered part of the Layer's topology. - if (isinstance(trackable_obj, tf.Module) and - not isinstance(trackable_obj, metrics_mod.Metric)): - yield trackable_obj - # Introspect recursively through sublayers. - if recursive: - subtrackables = getattr(trackable_obj, '_self_tracked_trackables', - None) - if subtrackables: - deque.extendleft(reversed(subtrackables)) - elif isinstance(trackable_obj, - tf.__internal__.tracking.TrackableDataStructure): - # Data structures are introspected even with `recursive=False`. - tracked_values = trackable_obj._values - if tracked_values: - deque.extendleft(reversed(tracked_values)) - - # This is a hack so that the is_layer (within - # training/trackable/layer_utils.py) check doesn't get the weights attr. - # TODO(b/110718070): Remove when fixed. - def _is_layer(self): - return True - - def _init_call_fn_args(self, expects_training_arg=None): - self._call_spec = layer_utils.CallFunctionSpec( - tf_inspect.getfullargspec(self.call)) - if expects_training_arg is not None: - self._call_spec.expects_training_arg = expects_training_arg - - @property - def _expects_training_arg(self): - """Whether the call function uses 'training' as a parameter.""" - return self._call_spec.expects_training_arg - - @property - def _expects_mask_arg(self): - return self._call_spec.expects_mask_arg - - @property - def _eager_losses(self): - # A list of loss values containing activity regularizers and losses - # manually added through `add_loss` during eager execution. It is cleared - # after every batch. - # Because we plan on eventually allowing a same model instance to be trained - # in eager mode or graph mode alternatively, we need to keep track of - # eager losses and symbolic losses via separate attributes. - if not hasattr(self._thread_local, '_eager_losses'): - self._thread_local._eager_losses = [] - return self._thread_local._eager_losses - - @_eager_losses.setter - def _eager_losses(self, losses): - self._thread_local._eager_losses = losses - - def _dedup_weights(self, weights): - """Dedupe weights while maintaining order as much as possible.""" - output, seen_ids = [], set() - for w in weights: - if id(w) not in seen_ids: - output.append(w) - # Track the Variable's identity to avoid __eq__ issues. - seen_ids.add(id(w)) - - return output - - # SavedModel properties. Please see keras/saving/saved_model for details. - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _set_save_spec(self, inputs, args=None, kwargs=None): - """Defines the save spec so that serialization is able to trace layer call. 
- - The TensorSpecs of the call function `inputs`, `args`, and `kwargs` are - saved into a tuple of `([inputs] + args, kwargs)`. - Args: - inputs: possibly nested inputs passed into the call function. - args: a list of positional arguments passed into call. - kwargs: a dictionary of keyword arguments passed into call. +class TensorFlowOpLayer(Layer): + """Wraps a TensorFlow Operation in a Layer. + + This class is used internally by the Functional API. When a user + uses a raw TensorFlow Operation on symbolic tensors originating + from an `Input` Layer, the resultant operation will be wrapped + with this Layer object in order to make the operation compatible + with the Keras API. + + This Layer will create a new, identical operation (except for inputs + and outputs) every time it is called. If `run_eagerly` is `True`, + the op creation and calculation will happen inside an Eager function. + + Instances of this Layer are created when `autolambda` is called, which + is whenever a Layer's `__call__` encounters symbolic inputs that do + not have Keras metadata, or when a Network's `__init__` encounters + outputs that do not have Keras metadata. + + Attributes: + node_def: String, the serialized NodeDef of the Op this layer will wrap. + name: String, the name of the Layer. + constants: Dict of NumPy arrays, the values of any Tensors needed for this + Operation that do not originate from a Keras `Input` Layer. Since all + placeholders must come from Keras `Input` Layers, these Tensors must be + treated as constant in the Functional API. + trainable: Bool, whether this Layer is trainable. Currently Variables are + not supported, and so this parameter has no effect. + dtype: The default dtype of this Layer. Inherited from `Layer` and has no + effect on this class, however is used in `get_config`. """ - if self._saved_model_inputs_spec is not None: - return # Already set. - - inputs_spec = tf.nest.map_structure(tf_utils.get_tensor_spec, inputs) - args_spec = tf.nest.map_structure(tf_utils.get_tensor_spec, args or []) - kwargs_spec = {} - # Filter out non-tensor arguments from kwargs. - for key, kwarg in kwargs.items(): - flat_kwarg = tf.nest.flatten(kwarg) - flat_specs = [tf_utils.get_tensor_spec(x) for x in flat_kwarg] - if any(s is None for s in flat_specs): - continue - kwargs_spec[key] = tf.nest.pack_sequence_as(kwarg, flat_specs) - - self._saved_model_inputs_spec = inputs_spec - self._saved_model_arg_spec = ([inputs_spec] + list(args_spec), kwargs_spec) - - def _get_save_spec(self, dynamic_batch=True, inputs_only=True): - if self._saved_model_inputs_spec is None: - return None - - spec = tf.nest.map_structure( - lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), - self._saved_model_arg_spec) - return spec[0][0] if inputs_only else spec - - @property - def _trackable_saved_model_saver(self): - return layer_serialization.LayerSavedModelSaver(self) - - @property - def _object_identifier(self): - return self._trackable_saved_model_saver.object_identifier - - @property - def _tracking_metadata(self): - """Info about this layer to be saved into the SavedModel.""" - return self._trackable_saved_model_saver.tracking_metadata - - def _trackable_children(self, save_type='checkpoint', **kwargs): - if save_type == 'savedmodel': - cache = kwargs['cache'] - # TODO(b/213628533): This must be called before super() to ensure - # that any input shape changes are applied before getting the config of - # the model. 
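For context on the autolambda path described in the `TensorFlowOpLayer` docstring, here is roughly what triggers it from user code; depending on the Keras version the wrapper class is `TensorFlowOpLayer` or its successor `TFOpLambda`, so treat the printed layer names as indicative:

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(3,))
x = tf.abs(inputs)  # raw TF op applied to a symbolic KerasTensor
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)

# The abs op was auto-wrapped into an op layer so the functional graph
# can replay it; it shows up alongside regular layers.
print([layer.name for layer in model.layers])
```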
- children = self._trackable_saved_model_saver.trackable_children(cache) - else: - children = {} - children.update(super()._trackable_children(save_type, **kwargs)) - return children - - @property - def _use_input_spec_as_call_signature(self): - # Whether input spec can be used as the call signature when tracing the - # Layer for SavedModel. By default, this is set to `True` for layers - # exported from the Keras library, because the layers more rigidly define - # the `input_specs` property (many custom layers only set the `ndims`) - return get_canonical_name_for_symbol(type(self), - api_name='keras') is not None - - def __getstate__(self): - # Override to support `copy.deepcopy` and pickling. - # Thread-local objects cannot be copied in Python 3, so pop these. - # Thread-local objects are used to cache losses in MirroredStrategy, and - # so shouldn't be copied. - state = self.__dict__.copy() - state.pop('_thread_local', None) - state.pop('_metrics_lock', None) - return state - - def __setstate__(self, state): - state['_thread_local'] = threading.local() - state['_metrics_lock'] = threading.Lock() - # Bypass Trackable logic as `__dict__` already contains this info. - object.__setattr__(self, '__dict__', state) + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__( + self, node_def, name, constants=None, trainable=True, dtype=None + ): + # Pass autocast=False, as if inputs are cast, input types might not + # match Operation type. + super(TensorFlowOpLayer, self).__init__( + name=_TF_OP_LAYER_NAME_PREFIX + name, + trainable=trainable, + dtype=dtype, + autocast=False, + ) + if isinstance(node_def, dict): + self.node_def = json_format.ParseDict( + node_def, tf.compat.v1.NodeDef() + ) + else: + if not isinstance(node_def, bytes): + node_def = node_def.encode("utf-8") + self.node_def = tf.compat.v1.NodeDef.FromString(node_def) + # JSON serialization stringifies keys which are integer input indices. + self.constants = ( + {int(index): constant for index, constant in constants.items()} + if constants is not None + else {} + ) + # Layer uses original op unless it is called on new inputs. + # This means `built` is not set in `__call__`. + self.built = True + + # Do not individually trace TensorflowOpLayers in the SavedModel. + self._must_restore_from_config = True -class TensorFlowOpLayer(Layer): - """Wraps a TensorFlow Operation in a Layer. - - This class is used internally by the Functional API. When a user - uses a raw TensorFlow Operation on symbolic tensors originating - from an `Input` Layer, the resultant operation will be wrapped - with this Layer object in order to make the operation compatible - with the Keras API. - - This Layer will create a new, identical operation (except for inputs - and outputs) every time it is called. If `run_eagerly` is `True`, - the op creation and calculation will happen inside an Eager function. - - Instances of this Layer are created when `autolambda` is called, which - is whenever a Layer's `__call__` encounters symbolic inputs that do - not have Keras metadata, or when a Network's `__init__` encounters - outputs that do not have Keras metadata. - - Attributes: - node_def: String, the serialized NodeDef of the Op this layer will wrap. - name: String, the name of the Layer. - constants: Dict of NumPy arrays, the values of any Tensors needed for this - Operation that do not originate from a Keras `Input` Layer. Since all - placeholders must come from Keras `Input` Layers, these Tensors must be - treated as constant in the Functional API. 
- trainable: Bool, whether this Layer is trainable. Currently Variables are - not supported, and so this parameter has no effect. - dtype: The default dtype of this Layer. Inherited from `Layer` and has no - effect on this class, however is used in `get_config`. - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, - node_def, - name, - constants=None, - trainable=True, - dtype=None): - # Pass autocast=False, as if inputs are cast, input types might not match - # Operation type. - super(TensorFlowOpLayer, self).__init__( - name=_TF_OP_LAYER_NAME_PREFIX + name, trainable=trainable, dtype=dtype, - autocast=False) - if isinstance(node_def, dict): - self.node_def = json_format.ParseDict(node_def, tf.compat.v1.NodeDef()) - else: - if not isinstance(node_def, bytes): - node_def = node_def.encode('utf-8') - self.node_def = tf.compat.v1.NodeDef.FromString(node_def) - # JSON serialization stringifies keys which are integer input indices. - self.constants = ({ - int(index): constant for index, constant in constants.items() - } if constants is not None else {}) - # Layer uses original op unless it is called on new inputs. - # This means `built` is not set in `__call__`. - self.built = True - - # Do not individually trace TensorflowOpLayers in the SavedModel. - self._must_restore_from_config = True - - def call(self, inputs): - if tf.executing_eagerly(): - return self._defun_call(inputs) - return self._make_op(inputs) - - def _make_node_def(self, graph): - node_def = tf.compat.v1.NodeDef() - node_def.CopyFrom(self.node_def) - # Used in TPUReplicateContext to indicate whether this node has been cloned - # and to not add TPU attributes. - node_def.attr['_cloned'].b = True - node_def.name = graph.unique_name(node_def.name) - return node_def - - def _make_op(self, inputs): - inputs = tf.nest.flatten(inputs) - graph = inputs[0].graph - node_def = self._make_node_def(graph) - with graph.as_default(): - for index, constant in self.constants.items(): - # Recreate constant in graph to add distribution context. - value = tf.get_static_value(constant) - if value is not None: - constant = tf.constant(value, name=node_def.input[index]) - inputs.insert(index, constant) - # TODO(b/183990973): We should drop or consolidate these private api calls - # for adding an op to the graph and recording its gradient. - c_op = tf.__internal__.create_c_op(graph, node_def, inputs, control_inputs=[]) - op = graph._create_op_from_tf_operation(c_op) - op._control_flow_post_processing() - - # Record the gradient because custom-made ops don't go through the - # code-gen'd eager call path - op_type = tf.compat.as_str(op.op_def.name) - attr_names = [tf.compat.as_str(attr.name) for attr in op.op_def.attr] - attrs = [] - for attr_name in attr_names: - attrs.append(attr_name) - attrs.append(op.get_attr(attr_name)) - attrs = tuple(attrs) - tf.__internal__.record_gradient(op_type, op.inputs, attrs, op.outputs) - - if len(op.outputs) == 1: - return op.outputs[0] - return op.outputs - - @tf.function - def _defun_call(self, inputs): - """Wraps the op creation method in an Eager function for `run_eagerly`.""" - return self._make_op(inputs) - - def get_config(self): - config = super(TensorFlowOpLayer, self).get_config() - config.update({ - # `__init__` prefixes the name. Revert to the constructor argument. 
- 'name': config['name'][len(_TF_OP_LAYER_NAME_PREFIX):], - 'node_def': json_format.MessageToDict(self.node_def), - 'constants': { - i: backend.get_value(c) for i, c in self.constants.items() - } - }) - return config + def call(self, inputs): + if tf.executing_eagerly(): + return self._defun_call(inputs) + return self._make_op(inputs) + + def _make_node_def(self, graph): + node_def = tf.compat.v1.NodeDef() + node_def.CopyFrom(self.node_def) + # Used in TPUReplicateContext to indicate whether this node has been + # cloned and to not add TPU attributes. + node_def.attr["_cloned"].b = True + node_def.name = graph.unique_name(node_def.name) + return node_def + + def _make_op(self, inputs): + inputs = tf.nest.flatten(inputs) + graph = inputs[0].graph + node_def = self._make_node_def(graph) + with graph.as_default(): + for index, constant in self.constants.items(): + # Recreate constant in graph to add distribution context. + value = tf.get_static_value(constant) + if value is not None: + if isinstance(value, dict): + value = serialization_lib.deserialize_keras_object( + value + ) + constant = tf.constant(value, name=node_def.input[index]) + inputs.insert(index, constant) + # TODO(b/183990973): We should drop or consolidate these private api + # calls for adding an op to the graph and recording its gradient. + c_op = tf.__internal__.create_c_op( + graph, node_def, inputs, control_inputs=[] + ) + op = graph._create_op_from_tf_operation(c_op) + op._control_flow_post_processing() + + # Record the gradient because custom-made ops don't go through the + # code-gen'd eager call path + op_type = tf.compat.as_str(op.op_def.name) + attr_names = [ + tf.compat.as_str(attr.name) for attr in op.op_def.attr + ] + attrs = [] + for attr_name in attr_names: + attrs.append(attr_name) + attrs.append(op.get_attr(attr_name)) + attrs = tuple(attrs) + tf.__internal__.record_gradient( + op_type, op.inputs, attrs, op.outputs + ) + + if len(op.outputs) == 1: + return op.outputs[0] + return op.outputs + + @tf.function + def _defun_call(self, inputs): + """Wraps op creation method in an Eager function for `run_eagerly`.""" + return self._make_op(inputs) + + def get_config(self): + config = super(TensorFlowOpLayer, self).get_config() + config.update( + { + # `__init__` prefixes the name. Revert to the constructor + # argument. + "name": config["name"][len(_TF_OP_LAYER_NAME_PREFIX) :], + "node_def": json_format.MessageToDict(self.node_def), + "constants": { + i: backend.get_value(c) for i, c in self.constants.items() + }, + } + ) + return config class AddLoss(Layer): - """Adds its inputs as a loss. + """Adds its inputs as a loss. - Attributes: - unconditional: Whether or not the loss should be conditioned on the inputs. - """ + Attributes: + unconditional: Whether or not the loss should be conditioned on the + inputs. + """ - def __init__(self, unconditional, **kwargs): - # Pass autocast=False, as there is no reason to cast loss to a different - # dtype. - kwargs['autocast'] = False - super(AddLoss, self).__init__(**kwargs) - self.unconditional = unconditional + def __init__(self, unconditional, **kwargs): + # Pass autocast=False, as there is no reason to cast loss to a different + # dtype. 
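`AddLoss` simply funnels symbolic inputs into `Layer.add_loss`, so the end-user equivalent is a custom layer calling `add_loss` directly. A brief sketch (the `ActivityPenalty` layer is hypothetical):

```python
import tensorflow as tf


class ActivityPenalty(tf.keras.layers.Layer):
    """Hypothetical layer adding an L2 activity penalty via add_loss."""

    def __init__(self, rate=1e-4, **kwargs):
        super().__init__(**kwargs)
        self.rate = rate

    def call(self, inputs):
        # A loss that depends on `inputs` is conditional, matching
        # AddLoss with unconditional=False.
        self.add_loss(self.rate * tf.reduce_sum(tf.square(inputs)))
        return inputs


layer = ActivityPenalty()
layer(tf.ones((2, 4)))
print(layer.losses)  # [<tf.Tensor ...>]
```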
+ kwargs["autocast"] = False + super(AddLoss, self).__init__(**kwargs) + self.unconditional = unconditional - def call(self, inputs): - self.add_loss(inputs, inputs=(not self.unconditional)) - return inputs + def call(self, inputs): + self.add_loss(inputs, inputs=(not self.unconditional)) + return inputs - def get_config(self): - config = super(AddLoss, self).get_config() - config.update({'unconditional': self.unconditional}) - return config + def get_config(self): + config = super(AddLoss, self).get_config() + config.update({"unconditional": self.unconditional}) + return config class AddMetric(Layer): - """Adds its inputs as a metric. + """Adds its inputs as a metric. - Attributes: - aggregation: 'mean' or None. How the inputs should be aggregated. - metric_name: The name to use for this metric. - """ + Attributes: + aggregation: 'mean' or None. How the inputs should be aggregated. + metric_name: The name to use for this metric. + """ - def __init__(self, aggregation=None, metric_name=None, **kwargs): - super(AddMetric, self).__init__(**kwargs) - self.aggregation = aggregation - self.metric_name = metric_name + def __init__(self, aggregation=None, metric_name=None, **kwargs): + super(AddMetric, self).__init__(**kwargs) + self.aggregation = aggregation + self.metric_name = metric_name - def call(self, inputs): - self.add_metric(inputs, aggregation=self.aggregation, name=self.metric_name) - return inputs + def call(self, inputs): + self.add_metric( + inputs, aggregation=self.aggregation, name=self.metric_name + ) + return inputs - def get_config(self): - config = super(AddMetric, self).get_config() - config.update({ - 'aggregation': self.aggregation, - 'metric_name': self.metric_name - }) - return config + def get_config(self): + config = super(AddMetric, self).get_config() + config.update( + {"aggregation": self.aggregation, "metric_name": self.metric_name} + ) + return config -def _in_functional_construction_mode(layer, inputs, args, kwargs, input_list): # pylint: disable=unused-argument - """Check the arguments to see if we are constructing a functional model.""" - # We are constructing a functional model if any of the inputs - # are KerasTensors - return any( - isinstance(tensor, keras_tensor.KerasTensor) - for tensor in tf.nest.flatten([inputs, args, kwargs])) +def _in_functional_construction_mode(layer, inputs, args, kwargs, input_list): + """Check the arguments to see if we are constructing a functional model.""" + # We are constructing a functional model if any of the inputs + # are KerasTensors + return any( + isinstance(tensor, keras_tensor.KerasTensor) + for tensor in tf.nest.flatten([inputs, args, kwargs]) + ) def _convert_numpy_or_python_types(x): - if isinstance(x, (tf.Tensor, np.ndarray, float, int)): - return tf.convert_to_tensor(x) - return x + if isinstance(x, (tf.Tensor, np.ndarray, float, int)): + return tf.convert_to_tensor(x) + return x -@keras_export( - 'keras.__internal__.apply_name_scope_on_model_declaration', v1=[]) +@keras_export("keras.__internal__.apply_name_scope_on_model_declaration", v1=[]) def _apply_name_scope_on_model_declaration(enable): - """Apply `with tf.name_scope(...)` on model declaration. + """Apply `with tf.name_scope(...)` on model declaration. 
- ```python - tf.keras.__internal__.apply_name_scope_on_model_declaration(True) + ```python + tf.keras.__internal__.apply_name_scope_on_model_declaration(True) - inputs = input_layer.Input((3,)) - with tf.name_scope('MyScope'): - outputs = layers.Dense(10, name='MyDense')(inputs) - model = tf.keras.Model(inputs, outputs) + inputs = input_layer.Input((3,)) + with tf.name_scope('MyScope'): + outputs = layers.Dense(10, name='MyDense')(inputs) + model = tf.keras.Model(inputs, outputs) - # with `tf.keras.__internal__.apply_name_scope_on_model_declaration(True)`, - # The name of the dense layer is "model/MyScope/MyDense/*", and without, - # "model/MyDense/*" - ``` + # with `tf.keras.__internal__.apply_name_scope_on_model_declaration(True)`, + # the name of the dense layer is "model/MyScope/MyDense/*", and without, + # "model/MyDense/*" + ``` - Args: - enable: Enables if `True`, disables if `False`. - """ - if not isinstance(enable, bool): - raise TypeError( - '`enable` argument must be `True` or `False`, got {}'.format(enable)) + Args: + enable: Enables if `True`, disables if `False`. + """ + if not isinstance(enable, bool): + raise TypeError( + f"`enable` argument must be `True` or `False`, got {enable}" + ) - global _is_name_scope_on_model_declaration_enabled - _is_name_scope_on_model_declaration_enabled = enable + global _is_name_scope_on_model_declaration_enabled + _is_name_scope_on_model_declaration_enabled = enable -@keras_export('keras.__internal__.layers.BaseRandomLayer') +@keras_export("keras.__internal__.layers.BaseRandomLayer") class BaseRandomLayer(Layer): - """A layer handle the random number creation and savemodel behavior.""" - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, seed=None, force_generator=False, **kwargs): - """Initialize the BaseRandomLayer. - - Note that the constructor is annotated with - @no_automatic_dependency_tracking. This is to skip the auto - tracking of self._random_generator instance, which is an AutoTrackable. - The backend.RandomGenerator could contain a tf.random.Generator instance - which will have tf.Variable as the internal state. We want to avoid saving - that state into model.weights and checkpoints for backward compatibility - reason. In the meantime, we still need to make them visible to SavedModel - when it is tracing the tf.function for the `call()`. - See _list_extra_dependencies_for_serialization below for more details. + """A layer that handles random number creation and SavedModel behavior.""" + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__( + self, seed=None, force_generator=False, rng_type=None, **kwargs + ): + """Initialize the BaseRandomLayer. + + Note that the constructor is annotated with + @no_automatic_dependency_tracking. This is to skip the auto + tracking of the self._random_generator instance, which is an AutoTrackable. + The backend.RandomGenerator could contain a tf.random.Generator instance + which will have tf.Variable as the internal state. We want to avoid + saving that state into model.weights and checkpoints for backward + compatibility reasons. In the meantime, we still need to make them + visible to SavedModel when it is tracing the tf.function for the + `call()`. + See _list_extra_dependencies_for_serialization below for more details. + + Args: + seed: optional integer, used to create RandomGenerator. + force_generator: boolean, default to False, whether to force the + RandomGenerator to use the code branch of tf.random.Generator.
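The `rng_type` choices introduced below map onto TensorFlow's two RNG families. A quick illustration with the underlying public APIs (this sketches the stateful/stateless distinction, not `backend.RandomGenerator` internals):

```python
import tensorflow as tf

# "stateful": a tf.random.Generator keeps its state in a tf.Variable,
# so consecutive draws differ -- this is the state BaseRandomLayer
# deliberately keeps out of model.weights and checkpoints.
gen = tf.random.Generator.from_seed(42)
a = gen.normal(shape=(2,))
b = gen.normal(shape=(2,))  # differs from `a`; the state advanced

# "stateless": the output is a pure function of the seed, so repeated
# calls with the same seed are identical.
c = tf.random.stateless_normal(shape=(2,), seed=[4, 2])
d = tf.random.stateless_normal(shape=(2,), seed=[4, 2])  # equals `c`
```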
+ rng_type: string, the rng type that will be passed to backend + RandomGenerator. `None` will allow RandomGenerator to choose + types by itself. Valid values are "stateful", "stateless", + "legacy_stateful". Defaults to `None`. + **kwargs: other keyword arguments that will be passed to the parent + class. + """ + super().__init__(**kwargs) + self._random_generator = backend.RandomGenerator( + seed, force_generator=force_generator, rng_type=rng_type + ) - Args: - seed: optional integer, used to create RandomGenerator. - force_generator: boolean, default to False, whether to force the - RandomGenerator to use the code branch of tf.random.Generator. - **kwargs: other keyword arguments that will be passed to the parent class - """ - super().__init__(**kwargs) - self._random_generator = backend.RandomGenerator( - seed, force_generator=force_generator) - # Eagerly init the generator to avoid any issue like b/206821407 - self._random_generator._maybe_init() - - def _trackable_children(self, save_type='checkpoint', **kwargs): - if save_type == 'savedmodel': - cache = kwargs['cache'] - # TODO(b/213628533): This must be called before super() to ensure - # that any input shape changes are applied before getting the config of - # the model. - children = self._trackable_saved_model_saver.trackable_children(cache) - # This method exposes the self._random_generator to SavedModel only - # (not layer.weights and checkpoint). - children['_random_generator'] = self._random_generator - else: - children = {} - children.update(super()._trackable_children(save_type, **kwargs)) - return children + def build(self, input_shape): + super().build(input_shape) + self._random_generator._maybe_init() + + def _trackable_children(self, save_type="checkpoint", **kwargs): + if save_type == "savedmodel": + cache = kwargs["cache"] + # TODO(b/213628533): This must be called before super() to ensure + # that any input shape changes are applied before getting the config + # of the model. + children = self._trackable_saved_model_saver.trackable_children( + cache + ) + # This method exposes the self._random_generator to SavedModel only + # (not layer.weights and checkpoint). + children["_random_generator"] = self._random_generator + else: + children = {} + children.update(super()._trackable_children(save_type, **kwargs)) + return children + + def _lookup_dependency(self, name, cached_dependencies=None): + # When loading from a Keras SavedModel, make sure that the loader + # can find the random generator, otherwise the loader will assume that + # it does not exist, and will try to create a new generator. + if name == "_random_generator": + return self._random_generator + elif cached_dependencies is not None: + return cached_dependencies.get(name) + else: + return super()._lookup_dependency(name) diff --git a/keras/engine/base_layer_test.py b/keras/engine/base_layer_test.py index 7182da8fa36a..0389ea5126c1 100644 --- a/keras/engine/base_layer_test.py +++ b/keras/engine/base_layer_test.py @@ -13,1950 +13,2071 @@
# ============================================================================== """Tests for TensorFlow 2.0 layer behavior.""" -# pylint: disable=g-bad-import-order -import tensorflow.compat.v2 as tf - import copy import os import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend -from keras.testing_infra import test_combinations from keras import layers from keras import regularizers -from keras.testing_infra import test_utils from keras.engine import base_layer from keras.engine import input_layer from keras.engine import sequential from keras.engine import training as training_lib from keras.legacy_tf_layers import core as legacy_core -from keras.optimizers.optimizer_v2 import rmsprop +from keras.optimizers.legacy import rmsprop +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils from keras.utils import control_flow_util class DynamicLayer(base_layer.Layer): + def __init__(self, dynamic=False, **kwargs): + super().__init__(dynamic=dynamic, **kwargs) - def __init__(self, dynamic=False, **kwargs): - super().__init__(dynamic=dynamic, **kwargs) - - def call(self, inputs): - samples = tf.TensorArray( - dtype=tf.float32, size=tf.shape(inputs)[0]) - for idx, sample in enumerate(inputs): - samples = samples.write(idx, tf.square(sample)) - return samples.stack() + def call(self, inputs): + samples = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) + for idx, sample in enumerate(inputs): + samples = samples.write(idx, tf.square(sample)) + return samples.stack() - def compute_output_shape(self, input_shape): - return input_shape + def compute_output_shape(self, input_shape): + return input_shape class InvalidLayer(base_layer.Layer): - - def call(self, inputs): - raise ValueError('You did something wrong!') + def call(self, inputs): + raise ValueError("You did something wrong!") @test_utils.run_v2_only class BaseLayerTest(test_combinations.TestCase): - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_layer_instrumentation(self): - layer = layers.Add() - self.assertTrue(layer._instrumented_keras_api) - self.assertTrue(layer._instrumented_keras_layer_class) - self.assertFalse(layer._instrumented_keras_model_class) - self.assertTrue(base_layer.keras_api_gauge.get_cell('tf.keras.layers.Add')) - - # Verify this was not instrumented as a legacy layer - self.assertFalse( - base_layer.keras_api_gauge.get_cell('legacy_layer').value()) - base_layer.keras_api_gauge.get_cell('tf.keras.layers.Add').set(False) - - @test_combinations.generate(test_combinations.keras_model_type_combinations()) - def test_dynamic_layer(self): - model = test_utils.get_model_from_layers([DynamicLayer(dynamic=True)], - input_shape=(3,)) - self.assertEqual(model.dynamic, True) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - self.assertEqual(model.run_eagerly, True) - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - - @test_combinations.generate(test_combinations.keras_model_type_combinations()) - def test_dynamic_layer_error(self): - # Functional Models hit the `dyanamic=True` error during construction. - # Subclass Models should just throw the original autograph error during - # execution. 
- raised_error = False - try: - model = test_utils.get_model_from_layers([DynamicLayer()], - input_shape=(3,)) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - except tf.errors.OperatorNotAllowedInGraphError as e: - if 'iterating over `tf.Tensor`' in str(e): - raised_error = True - elif 'Iterating over a symbolic `tf.Tensor`' in str(e): - raised_error = True - except TypeError as e: - if 'attempting to use Python control flow' in str(e): - raised_error = True - elif 'Attempting to use Python control flow' in str(e): - raised_error = True - self.assertTrue(raised_error) - - @test_combinations.generate(test_combinations.keras_model_type_combinations()) - def test_dynamic_layer_error_running_in_graph_mode(self): - with tf.compat.v1.get_default_graph().as_default(): - model = test_utils.get_model_from_layers([DynamicLayer(dynamic=True)], - input_shape=(3,)) - self.assertEqual(model.dynamic, True) - # But then you cannot run the model since you're in a graph scope. - with self.assertRaisesRegex(ValueError, - 'You must enable eager execution'): - model.compile(rmsprop.RMSprop(0.001), loss='mse') - - def test_manual_compute_output_shape(self): - - class BuildCounter(base_layer.Layer): - - def __init__(self, *args, **kwargs): # pylint: disable=redefined-outer-name - super().__init__(*args, **kwargs) - self.build_counter = 0 - - def build(self, input_shape): - self.build_counter += 1 - self.build_shape = input_shape - - def call(self, inputs): - return inputs - - layer = BuildCounter(dtype=tf.float64) - output_shape = layer.compute_output_shape((None, 10)) - self.assertEqual(layer.build_counter, 1) - self.assertEqual(layer.build_shape.as_list(), [None, 10]) - self.assertEqual(output_shape.as_list(), [None, 10]) - output_signature = layer.compute_output_signature( - tf.TensorSpec(dtype=tf.float64, shape=[None, 10])) - self.assertEqual(layer.build_counter, 1) - self.assertEqual(layer.build_shape.as_list(), [None, 10]) - self.assertEqual(output_signature.dtype, tf.float64) - self.assertEqual(output_signature.shape.as_list(), [None, 10]) - layer(np.ones((5, 10))) - self.assertEqual(layer.build_counter, 1) - self.assertEqual(layer.build_shape.as_list(), [None, 10]) - - def test_dynamic_layer_with_deferred_sequential_model(self): - model = sequential.Sequential([DynamicLayer(dynamic=True), layers.Dense(3)]) - self.assertEqual(model.dynamic, True) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - self.assertEqual(model.run_eagerly, True) - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - - def test_nested_dynamic_layers_in_eager_mode(self): - inputs = input_layer.Input((3,)) - outputs = DynamicLayer(dynamic=True)(inputs) - inner_model = training_lib.Model(inputs, outputs) - self.assertEqual(inner_model.dynamic, True) - - inputs = input_layer.Input((3,)) - x = DynamicLayer(dynamic=True)(inputs) - outputs = inner_model(x) - - model = training_lib.Model(inputs, outputs) - self.assertEqual(model.dynamic, True) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - self.assertEqual(model.run_eagerly, True) - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - - def test_dynamic_subclassed_model_no_shape_inference(self): - - class MyModel(training_lib.Model): - - def __init__(self): - super().__init__(dynamic=True) - self.layer1 = layers.Dense(3) - self.layer2 = layers.Dense(3) - - def call(self, inputs): - if tf.reduce_sum(inputs) > 0: - return self.layer1(inputs) - else: - return 
self.layer2(inputs) - - model = MyModel() - self.assertEqual(model.dynamic, True) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - self.assertEqual(model.run_eagerly, True) - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - self.assertEqual(model.outputs, None) - - def test_dynamic_subclassed_model_with_shape_inference(self): - - class MyModel(training_lib.Model): - - def __init__(self): - super().__init__(dynamic=True) - self.layer1 = layers.Dense(3) - self.layer2 = layers.Dense(3) - - def call(self, inputs): - if tf.reduce_sum(inputs) > 0: - return self.layer1(inputs) - else: - return self.layer2(inputs) - - def compute_output_shape(self, input_shape): - return tuple(input_shape[:-1].as_list()) + (3,) - - model = MyModel() - self.assertEqual(model.dynamic, True) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - x, y = np.random.random((2, 3)), np.random.random((2, 3)) - model.train_on_batch(x, y) - outputs = model(x) - self.assertEqual(outputs.shape.as_list(), [2, 3]) - - def test_deepcopy(self): - bias_reg = lambda x: 1e-3 * tf.reduce_sum(x) - layer = layers.Conv2D(32, (3, 3), bias_regularizer=bias_reg) - # Call the Layer on data to generate regularize losses. - layer(tf.ones((1, 10, 10, 3))) - self.assertLen(layer.losses, 1) - new_layer = copy.deepcopy(layer) - self.assertEqual(new_layer.bias_regularizer, bias_reg) - self.assertEqual(layer.get_config(), new_layer.get_config()) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_invalid_forward_pass(self): - inputs = input_layer.Input((3,)) - with self.assertRaisesRegex(ValueError, 'You did something wrong!'): - _ = InvalidLayer()(inputs) - - def test_no_legacy_model(self): - inputs = input_layer.Input((1,)) - legacy_dense_0 = legacy_core.Dense(1, name='legacy_dense_0') - legacy_dense_1 = legacy_core.Dense(1, name='legacy_dense_1') - - layer = legacy_dense_0(inputs) - layer = layers.Dense(1)(layer) - layer = legacy_dense_1(layer) - - expected_regex = (r'The following are legacy tf\.layers\.Layers:\n ' - '{}\n {}'.format(legacy_dense_0, legacy_dense_1)) - - with self.assertRaisesRegex(TypeError, expected_regex): - _ = training_lib.Model(inputs=[inputs], outputs=[layer]) - - model = training_lib.Model(inputs=[inputs], outputs=[inputs]) - with self.assertRaisesRegex(TypeError, expected_regex): - model._insert_layers([legacy_dense_0, legacy_dense_1]) - - def test_no_legacy_sequential(self): - layer = [layers.Dense(1), legacy_core.Dense(1, name='legacy_dense_0')] - - expected_regex = r'legacy tf\.layers\.Layers:\n {}'.format(layer[1]) - with self.assertRaisesRegex(TypeError, expected_regex): - _ = sequential.Sequential(layer) - - with self.assertRaisesRegex(TypeError, expected_regex): - _ = sequential.Sequential([input_layer.Input(shape=(4,))] + layer) - - model = sequential.Sequential() - with self.assertRaisesRegex(TypeError, expected_regex): - for l in layer: - model.add(l) - - @test_combinations.generate( - test_combinations.times( - test_combinations.keras_model_type_combinations(), - test_combinations.combine(mode=['graph', 'eager']))) - def test_build_with_numpy_data(self): - model_layers = [ - layers.Dense(3, activation='relu', kernel_initializer='ones'), - layers.Dense(1, activation='sigmoid', kernel_initializer='ones') - ] - model = test_utils.get_model_from_layers(model_layers, input_shape=(4,)) - model(np.zeros((2, 4), dtype='float32')) - self.assertTrue(model.built) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 
'eager'])) - def test_default_add_weight(self): - - class TestLayer(base_layer.Layer): - - def __init__(self): - super().__init__() - self.default_weight = self.add_weight() - self.weight_without_name = self.add_weight(shape=(3, 4)) - self.regularized_weight_without_name = self.add_weight( - shape=(3, 4), regularizer='l2') - - layer = TestLayer() - self.assertEqual(layer.default_weight.shape.as_list(), []) - self.assertEqual(layer.weight_without_name.shape.as_list(), [3, 4]) - self.assertEqual(layer.default_weight.dtype.name, 'float32') - self.assertEqual(layer.weight_without_name.dtype.name, 'float32') - self.assertEqual(len(layer.losses), 1) - if not tf.executing_eagerly(): - # Cannot access tensor.name in eager execution. - self.assertIn('Variable_2/Regularizer', layer.losses[0].name) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_add_weight_by_getter(self): - layer = base_layer.Layer() - variable = tf.Variable('abc') - added = layer.add_weight( - dtype=tf.string, getter=lambda *_, **__: variable) - self.assertIs(variable, added) - - @test_combinations.generate( - test_combinations.keras_mode_combinations(mode=['eager'])) - def test_learning_phase_freezing_for_layers(self): - - class LearningPhaseLayer(base_layer.Layer): - - def call(self, inputs): - return backend.in_train_phase(lambda: tf.ones_like(inputs), - lambda: tf.zeros_like(inputs)) - - def get_learning_phase_value(): - model = sequential.Sequential([LearningPhaseLayer(input_shape=(1,))]) - model._run_eagerly = test_utils.should_run_eagerly() - return np.sum(model(np.ones((1, 1)))) - - self.assertEqual(get_learning_phase_value(), 0) - - # Test scope. - with backend.learning_phase_scope(1): - self.assertEqual(get_learning_phase_value(), 1) - - # The effects of the scope end after exiting it. - self.assertEqual(get_learning_phase_value(), 0) - - # Test setting. 
- backend.set_learning_phase(1) - self.assertEqual(get_learning_phase_value(), 1) - backend.set_learning_phase(0) - self.assertEqual(get_learning_phase_value(), 0) - - # Cannot be enabled with `run_eagerly=True`, see b/123904578 - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_layer_can_return_variable(self): - - class ComputeSum(base_layer.Layer): - - def __init__(self): - super().__init__() - self.total = tf.Variable( - initial_value=tf.zeros((1, 1)), trainable=False) + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_layer_instrumentation(self): + layer = layers.Add() + self.assertTrue(layer._instrumented_keras_api) + self.assertTrue(layer._instrumented_keras_layer_class) + self.assertFalse(layer._instrumented_keras_model_class) + self.assertTrue( + base_layer.keras_api_gauge.get_cell("tf.keras.layers.Add") + ) + + # Verify this was not instrumented as a legacy layer + self.assertFalse( + base_layer.keras_api_gauge.get_cell("legacy_layer").value() + ) + base_layer.keras_api_gauge.get_cell("tf.keras.layers.Add").set(False) + + @test_combinations.generate( + test_combinations.keras_model_type_combinations() + ) + def test_dynamic_layer(self): + model = test_utils.get_model_from_layers( + [DynamicLayer(dynamic=True)], input_shape=(3,) + ) + self.assertEqual(model.dynamic, True) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + self.assertEqual(model.run_eagerly, True) + model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) + + @test_combinations.generate( + test_combinations.keras_model_type_combinations() + ) + def test_dynamic_layer_error(self): + # Functional Models hit the `dynamic=True` error during construction. + # Subclass Models should just throw the original autograph error during + # execution. + raised_error = False + try: + model = test_utils.get_model_from_layers( + [DynamicLayer()], input_shape=(3,) + ) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + model.train_on_batch( + np.random.random((2, 3)), np.random.random((2, 3)) + ) + except tf.errors.OperatorNotAllowedInGraphError as e: + if "iterating over `tf.Tensor`" in str(e): + raised_error = True + elif "Iterating over a symbolic `tf.Tensor`" in str(e): + raised_error = True + except TypeError as e: + if "attempting to use Python control flow" in str(e): + raised_error = True + elif "Attempting to use Python control flow" in str(e): + raised_error = True + self.assertTrue(raised_error) + + @test_combinations.generate( + test_combinations.keras_model_type_combinations() + ) + def test_dynamic_layer_error_running_in_graph_mode(self): + with tf.compat.v1.get_default_graph().as_default(): + model = test_utils.get_model_from_layers( + [DynamicLayer(dynamic=True)], input_shape=(3,) + ) + self.assertEqual(model.dynamic, True) + # But then you cannot run the model since you're in a graph scope.
+ with self.assertRaisesRegex( + ValueError, "You must enable eager execution" + ): + model.compile(rmsprop.RMSprop(0.001), loss="mse") + + def test_manual_compute_output_shape(self): + class BuildCounter(base_layer.Layer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.build_counter = 0 + + def build(self, input_shape): + self.build_counter += 1 + self.build_shape = input_shape + + def call(self, inputs): + return inputs + + layer = BuildCounter(dtype=tf.float64) + output_shape = layer.compute_output_shape((None, 10)) + self.assertEqual(layer.build_counter, 1) + self.assertEqual(layer.build_shape.as_list(), [None, 10]) + self.assertEqual(output_shape.as_list(), [None, 10]) + output_signature = layer.compute_output_signature( + tf.TensorSpec(dtype=tf.float64, shape=[None, 10]) + ) + self.assertEqual(layer.build_counter, 1) + self.assertEqual(layer.build_shape.as_list(), [None, 10]) + self.assertEqual(output_signature.dtype, tf.float64) + self.assertEqual(output_signature.shape.as_list(), [None, 10]) + layer(np.ones((5, 10))) + self.assertEqual(layer.build_counter, 1) + self.assertEqual(layer.build_shape.as_list(), [None, 10]) + + def test_dynamic_layer_with_deferred_sequential_model(self): + model = sequential.Sequential( + [DynamicLayer(dynamic=True), layers.Dense(3)] + ) + self.assertEqual(model.dynamic, True) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + self.assertEqual(model.run_eagerly, True) + model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) + + def test_nested_dynamic_layers_in_eager_mode(self): + inputs = input_layer.Input((3,)) + outputs = DynamicLayer(dynamic=True)(inputs) + inner_model = training_lib.Model(inputs, outputs) + self.assertEqual(inner_model.dynamic, True) + + inputs = input_layer.Input((3,)) + x = DynamicLayer(dynamic=True)(inputs) + outputs = inner_model(x) + + model = training_lib.Model(inputs, outputs) + self.assertEqual(model.dynamic, True) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + self.assertEqual(model.run_eagerly, True) + model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) + + def test_dynamic_subclassed_model_no_shape_inference(self): + class MyModel(training_lib.Model): + def __init__(self): + super().__init__(dynamic=True) + self.layer1 = layers.Dense(3) + self.layer2 = layers.Dense(3) + + def call(self, inputs): + if tf.reduce_sum(inputs) > 0: + return self.layer1(inputs) + else: + return self.layer2(inputs) + + model = MyModel() + self.assertEqual(model.dynamic, True) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + self.assertEqual(model.run_eagerly, True) + model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) + self.assertEqual(model.outputs, None) + + def test_dynamic_subclassed_model_with_shape_inference(self): + class MyModel(training_lib.Model): + def __init__(self): + super().__init__(dynamic=True) + self.layer1 = layers.Dense(3) + self.layer2 = layers.Dense(3) + + def call(self, inputs): + if tf.reduce_sum(inputs) > 0: + return self.layer1(inputs) + else: + return self.layer2(inputs) + + def compute_output_shape(self, input_shape): + return tuple(input_shape[:-1].as_list()) + (3,) + + model = MyModel() + self.assertEqual(model.dynamic, True) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + x, y = np.random.random((2, 3)), np.random.random((2, 3)) + model.train_on_batch(x, y) + outputs = model(x) + self.assertEqual(outputs.shape.as_list(), [2, 3]) + + def test_deepcopy(self): + bias_reg = lambda x: 1e-3 * 
tf.reduce_sum(x) + layer = layers.Conv2D(32, (3, 3), bias_regularizer=bias_reg) + # Call the Layer on data to generate regularizer losses. + layer(tf.ones((1, 10, 10, 3))) + self.assertLen(layer.losses, 1) + new_layer = copy.deepcopy(layer) + self.assertEqual(new_layer.bias_regularizer, bias_reg) + self.assertEqual(layer.get_config(), new_layer.get_config()) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_invalid_forward_pass(self): + inputs = input_layer.Input((3,)) + with self.assertRaisesRegex(ValueError, "You did something wrong!"): + _ = InvalidLayer()(inputs) + + def test_no_legacy_model(self): + inputs = input_layer.Input((1,)) + legacy_dense_0 = legacy_core.Dense(1, name="legacy_dense_0") + legacy_dense_1 = legacy_core.Dense(1, name="legacy_dense_1") + + layer = legacy_dense_0(inputs) + layer = layers.Dense(1)(layer) + layer = legacy_dense_1(layer) + + expected_regex = ( + r"The following are legacy tf\.layers\.Layers:\n " + "{}\n {}".format(legacy_dense_0, legacy_dense_1) + ) + + with self.assertRaisesRegex(TypeError, expected_regex): + _ = training_lib.Model(inputs=[inputs], outputs=[layer]) + + model = training_lib.Model(inputs=[inputs], outputs=[inputs]) + with self.assertRaisesRegex(TypeError, expected_regex): + model._insert_layers([legacy_dense_0, legacy_dense_1]) + + def test_no_legacy_sequential(self): + layer = [layers.Dense(1), legacy_core.Dense(1, name="legacy_dense_0")] + + expected_regex = r"legacy tf\.layers\.Layers:\n {}".format(layer[1]) + with self.assertRaisesRegex(TypeError, expected_regex): + _ = sequential.Sequential(layer) + + with self.assertRaisesRegex(TypeError, expected_regex): + _ = sequential.Sequential([input_layer.Input(shape=(4,))] + layer) + + model = sequential.Sequential() + with self.assertRaisesRegex(TypeError, expected_regex): + for l in layer: + model.add(l) + + @test_combinations.generate( + test_combinations.times( + test_combinations.keras_model_type_combinations(), + test_combinations.combine(mode=["graph", "eager"]), + ) + ) + def test_build_with_numpy_data(self): + model_layers = [ + layers.Dense(3, activation="relu", kernel_initializer="ones"), + layers.Dense(1, activation="sigmoid", kernel_initializer="ones"), + ] + model = test_utils.get_model_from_layers(model_layers, input_shape=(4,)) + model(np.zeros((2, 4), dtype="float32")) + self.assertTrue(model.built) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_default_add_weight(self): + class TestLayer(base_layer.Layer): + def __init__(self): + super().__init__() + self.default_weight = self.add_weight() + self.weight_without_name = self.add_weight(shape=(3, 4)) + self.regularized_weight_without_name = self.add_weight( + shape=(3, 4), regularizer="l2" + ) + + layer = TestLayer() + self.assertEqual(layer.default_weight.shape.as_list(), []) + self.assertEqual(layer.weight_without_name.shape.as_list(), [3, 4]) + self.assertEqual(layer.default_weight.dtype.name, "float32") + self.assertEqual(layer.weight_without_name.dtype.name, "float32") + self.assertEqual(len(layer.losses), 1) if not tf.executing_eagerly():
argument in a defuned `call`.""" - - @tf.function - def call(self, inputs, training=None): - if training is None: - training = backend.learning_phase() - return control_flow_util.smart_cond( - training, lambda: tf.ones_like(inputs), - lambda: tf.zeros_like(inputs)) - - return TrainingLayer() - - # b/124459427: can't test with `run_eagerly=True` for now. - @test_combinations.generate( - test_combinations.times( - test_combinations.keras_mode_combinations(), - test_combinations.keras_model_type_combinations())) - def test_training_arg_in_defun(self): - layer = self._get_layer_with_training_arg() - model = test_utils.get_model_from_layers([layer], input_shape=(1,)) - model.compile(rmsprop.RMSprop(0.), - loss='mae') - history = model.fit(np.zeros((1, 1)), np.zeros((1, 1))) - self.assertEqual(history.history['loss'][0], 1.) - loss = model.evaluate(np.zeros((1, 1)), np.zeros((1, 1))) - self.assertEqual(loss, 0.) - - # Test that the argument injection performed in `call` is not active - # when the argument is passed explicitly. - layer = self._get_layer_with_training_arg() - inputs = input_layer.Input(shape=(1,)) - # Pass `training` by name - outputs = layer(inputs, training=False) - model = training_lib.Model(inputs, outputs) - model.compile(rmsprop.RMSprop(0.), - loss='mae') - history = model.fit(np.zeros((1, 1)), np.zeros((1, 1))) - self.assertEqual(history.history['loss'][0], 0.) - - @test_combinations.generate( - test_combinations.times( - test_combinations.keras_mode_combinations(), - test_combinations.keras_model_type_combinations())) - def test_raw_variable_assignment(self): - - class RawVariableLayer(base_layer.Layer): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - # Test variables in nested structure. - self.var_list = [tf.Variable(1.), {'a': tf.Variable(2.)}] - - def call(self, inputs): - return inputs * self.var_list[0] * self.var_list[1]['a'] - - model = test_utils.get_model_from_layers([RawVariableLayer()], - input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 10)), np.ones((10, 10)) - # Checks that variables get initialized. - model.fit(x, y, batch_size=2, epochs=2) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_composite_variable_assignment(self): - - class Spec(tf.TypeSpec): - - value_type = property(lambda self: CompositeVariable) - - def _component_specs(self): - pass - - def _serialize(self): - pass - - def _to_components(self, value): - return value._variables - - def _from_components(self, variable_list): - return CompositeVariable(variable_list) - - class CompositeVariable(tf.__internal__.CompositeTensor): - - def __init__(self, variable_list): - self._variables = variable_list - - @property - def _type_spec(self): - return Spec() - - class CompositeVariableLayer(base_layer.Layer): - - def __init__(self): - super().__init__() - self.composite_var = CompositeVariable( - [tf.Variable(1.), - tf.Variable(2.)]) - - layer = CompositeVariableLayer() - self.assertLen(layer.weights, 2) - self.assertIsInstance(layer.weights[0], tf.Variable) - self.assertIsInstance(layer.weights[1], tf.Variable) - self.assertEqual(self.evaluate(layer.weights[0]), 1.) - self.assertEqual(self.evaluate(layer.weights[1]), 2.) 
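# A minimal standalone sketch of the container-based variable tracking that
# test_raw_variable_assignment and test_composite_variable_assignment
# exercise: tf.Variables nested in plain Python lists/dicts assigned to a
# layer attribute are auto-tracked and surface in `weights`. Assumes the
# public tf.keras.layers.Layer behaves like the internal base_layer.Layer
# used in these tests.
import tensorflow as tf


class ScaleLayer(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        # Both nested variables are discovered by attribute tracking.
        self.scales = [tf.Variable(2.0), {"a": tf.Variable(3.0)}]

    def call(self, inputs):
        return inputs * self.scales[0] * self.scales[1]["a"]


layer = ScaleLayer()
assert len(layer.trainable_weights) == 2
assert float(layer(tf.ones((1, 1)))) == 6.0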
- - def test_exception_if_trainable_not_boolean(self): - base_layer.Layer(trainable=True) - base_layer.Layer(trainable=tf.constant(True)) - base_layer.Layer(trainable=tf.Variable(tf.constant(True))) - with self.assertRaisesRegex( - TypeError, 'Expected `trainable` argument to be a boolean'): - base_layer.Layer(trainable=0) - - def test_exception_if_dynamic_not_boolean(self): - base_layer.Layer(dynamic=True) - with self.assertRaisesRegex(TypeError, - 'Expected `dynamic` argument to be a boolean'): - base_layer.Layer(dynamic=0) - - def test_exception_if_name_not_string_or_none(self): - base_layer.Layer(name=None) - base_layer.Layer(name='layer_name') - with self.assertRaisesRegex(TypeError, - 'Expected `name` argument to be a string'): - base_layer.Layer(name=0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_layer_names(self): - inputs = input_layer.Input(shape=[2]) - add1 = inputs + inputs - add2 = layers.Add()([inputs, inputs]) - add3 = inputs + inputs - add4 = layers.Add()([inputs, inputs]) - model = training_lib.Model(inputs=[inputs], - outputs=[add1, add2, add3, add4]) - actual_names = [l.name for l in model.layers] - graph_names = [ - 'input_1', 'tf_op_layer_add', 'add', 'tf_op_layer_add_2', 'add_1' - ] - eager_names = [ - 'input_1', 'tf.__operators__.add', 'add', 'tf.__operators__.add_1', - 'add_1' - ] - for actual, eager, graph in zip(actual_names, graph_names, eager_names): - self.assertIn(actual, {eager, graph}) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_layer_names_after_loading(self): - backend.clear_session() - # Mimic loading a model that already contained add layers with - # name = 'add_1' and 'tf.__operators__.add' - layers.Add(name='add_1') - layers.Add(name='tf.__operators__.add') - - inputs = input_layer.Input(shape=[2]) - add1 = inputs + inputs - add2 = layers.Add()([inputs, inputs]) - add3 = inputs + inputs - add4 = layers.Add()([inputs, inputs]) - model = training_lib.Model( - inputs=[inputs], outputs=[add1, add2, add3, add4]) - actual_names = [l.name for l in model.layers] - # The generated op layer names should have avoided layer names seen in - # the loaded model. 
(This avoiance should not apply to non-op-layers) - expected_names = [ - 'input_1', 'tf.__operators__.add_1', - 'add', 'tf.__operators__.add_2', 'add_1' - ] - self.assertAllEqual(actual_names, expected_names) - - def test_add_trainable_weight_on_frozen_layer(self): - - class TestLayer(base_layer.Layer): - - def build(self, input_shape): - self.w = self.add_weight(shape=(), trainable=True) - - def call(self, inputs): - return self.w * inputs - - layer = TestLayer() - layer.trainable = False - layer.build(None) - layer.trainable = True - self.assertListEqual(layer.trainable_weights, [layer.w]) - - @test_combinations.generate( - test_combinations.times( - test_combinations.keras_mode_combinations(), - test_combinations.keras_model_type_combinations())) - def test_passing_initial_weights_values(self): - kernel_value = np.random.random((10, 2)) - layer_with_weights = layers.Dense(2, use_bias=False, weights=[kernel_value]) - - model = test_utils.get_model_from_layers([layer_with_weights], - input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - inputs = np.random.random((3, 10)) - out = model.predict(inputs) - self.assertAllClose(model.layers[-1].get_weights()[0], kernel_value) - self.assertAllClose(out, np.dot(inputs, kernel_value)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_set_weights_and_get_weights(self): - layer = layers.Dense(2) - layer.build((None, 10)) - kernel = np.random.random((10, 2)) - bias = np.random.random((2,)) - layer.set_weights([kernel, bias]) - weights = layer.get_weights() - self.assertEqual(len(weights), 2) - self.assertAllClose(weights[0], kernel) - self.assertAllClose(weights[1], bias) - with self.assertRaisesRegex(ValueError, - 'but the layer was expecting 2 weights'): - layer.set_weights([1, 2, 3]) - with self.assertRaisesRegex(ValueError, - 'not compatible with provided weight shape'): - layer.set_weights([kernel.T, bias]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_set_weights_accepts_output_of_get_weights(self): - layer = layers.Layer() - layer.add_weight(name='scalar_float', shape=(), dtype=tf.float32) - layer.add_weight(name='scalar_string', shape=(), dtype=tf.string, - initializer=lambda *a, **k: 'abc') - layer.add_weight(name='vector_float', shape=(3,), dtype=tf.float32) - layer.add_weight(name='vector_string', shape=(2,), dtype=tf.string, - initializer=lambda *a, **k: 2 * ['abc']) - layer.set_weights(layer.get_weights()) - - def test_get_config_error(self): - - class MyLayer(base_layer.Layer): - - def __init__(self, my_kwarg='default', **kwargs): - super().__init__(**kwargs) - self.my_kwarg = my_kwarg - - # `__init__` includes kwargs but `get_config` is not overridden, so - # an error should be thrown: - with self.assertRaisesRegex(NotImplementedError, 'Layer MyLayer has'): - MyLayer('custom').get_config() - - class MyLayerNew(base_layer.Layer): - - def __init__(self, my_kwarg='default', **kwargs): - super().__init__(**kwargs) - self.my_kwarg = my_kwarg - - def get_config(self): - config = super().get_config() - config['my_kwarg'] = self.my_kwarg - return config - - # Test to make sure that error is not raised if the method call is - # from an overridden `get_config`: - self.assertEqual(MyLayerNew('custom').get_config()['my_kwarg'], 'custom') - - class MyLayerNew2(base_layer.Layer): - - def __init__(self, name='MyLayerName', dtype=None, **kwargs): # pylint:disable=redefined-outer-name - 
super().__init__(name=name, dtype=dtype, **kwargs) - - # Check that if the kwargs in `__init__` are base layer constructor - # arguments, no error is thrown: - self.assertEqual(MyLayerNew2(name='New').get_config()['name'], 'New') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_count_params(self): - dense = layers.Dense(16) - dense.build((None, 4)) - self.assertEqual(dense.count_params(), 16 * 4 + 16) - - dense = layers.Dense(16) - with self.assertRaisesRegex(ValueError, 'call `count_params`'): - dense.count_params() - - model = sequential.Sequential(layers.Dense(16)) - with self.assertRaisesRegex(ValueError, 'call `count_params`'): - model.count_params() - - dense = layers.Dense(16, input_dim=4) - model = sequential.Sequential(dense) - self.assertEqual(model.count_params(), 16 * 4 + 16) - - def test_super_not_called(self): - - class CustomLayerNotCallingSuper(base_layer.Layer): - - def __init__(self): - pass - - layer = CustomLayerNotCallingSuper() - with self.assertRaisesRegex(RuntimeError, 'You must call `super()'): - layer(np.random.random((10, 2))) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_first_arg_not_called_inputs(self): - x, y = tf.ones((10, 1)), tf.ones((10, 1)) - - class ArgLayer(base_layer.Layer): - - def call(self, x, y): - return x + y - - layer = ArgLayer() - out = self.evaluate(layer(x=x, y=y)) - self.assertAllClose(out, 2 * np.ones((10, 1))) - - class KwargLayer(base_layer.Layer): - - def call(self, x=None, y=None): - return x + y - - layer = KwargLayer() - out = self.evaluate(layer(x=x, y=y)) - self.assertAllClose(out, 2 * np.ones((10, 1))) - - with self.assertRaisesRegex(ValueError, 'must always be passed'): - layer(y=y) - - class TFFunctionLayer(base_layer.Layer): - - @tf.function - def call(self, x, y=None): - if y is None: - return x - return x + y - - layer = TFFunctionLayer() - out = self.evaluate(layer(x=x, y=y)) - self.assertAllClose(out, 2 * np.ones((10, 1))) - - def test_build_input_shape(self): - - class CustomLayer(base_layer.Layer): - - def build(self, input_shape): - self.add_weight('w', shape=input_shape[1:]) - super().build(input_shape) - - layer = CustomLayer() - self.assertFalse(layer.built) - - layer.build([None, 1, 2, 3]) - self.assertTrue(layer.built) - self.assertEqual([None, 1, 2, 3], layer._build_input_shape) - - layer = CustomLayer() - layer(input_layer.Input((3,))) - self.assertTrue(layer.built) - self.assertEqual([None, 3], layer._build_input_shape.as_list()) - - def test_build_input_shape_list_with_none(self): - - class CustomLayer(base_layer.Layer): - - def build(self, input_shape): - super().build(input_shape) - self.build_shape = input_shape - - def call(self, inputs): - return inputs[0] - - layer = CustomLayer() - layer([tf.constant([1.0]), None, tf.constant([2.0])]) - self.assertEqual(layer.build_shape, [[1], None, [1]]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_layer_input_shape_raises_error(self): - layer = layers.Dense(3) - with self.assertRaisesRegex(AttributeError, 'no defined input shape'): - _ = layer.input_shape - - layer(tf.ones((10, 1))) - with self.assertRaisesRegex(AttributeError, 'no defined input shape'): - _ = layer.input_shape - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_custom_layer_training_arg(self): - class CustomLayerNoTrainingArg(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - 
self._nested_layer = nested_layer or tf.identity - - def call(self, inputs): - return self._nested_layer(inputs) - - class CustomLayerDefaultTrainingMissing(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, training): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - class CustomLayerDefaultTrainingNone(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, training=None): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - class CustomLayerDefaultTrainingFalse(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, training=False): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - class CustomLayerDefaultTrainingTrue(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, training=True): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - self._test_custom_layer_training_arg( - CustomLayerNoTrainingArg=CustomLayerNoTrainingArg, - CustomLayerDefaultTrainingMissing=CustomLayerDefaultTrainingMissing, - CustomLayerDefaultTrainingNone=CustomLayerDefaultTrainingNone, - CustomLayerDefaultTrainingFalse=CustomLayerDefaultTrainingFalse, - CustomLayerDefaultTrainingTrue=CustomLayerDefaultTrainingTrue) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_custom_layer_training_arg_kwargonly(self): - class CustomLayerNoTrainingArg(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs): - return self._nested_layer(inputs) - - class CustomLayerDefaultTrainingMissing(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, *, training): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - class CustomLayerDefaultTrainingNone(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, *, training=None): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - class CustomLayerDefaultTrainingFalse(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, *, training=False): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - class CustomLayerDefaultTrainingTrue(base_layer.Layer): - - def __init__(self, nested_layer=None): - super().__init__() - self._nested_layer = nested_layer or tf.identity - - def call(self, inputs, *, training=True): - if training: - return self._nested_layer(inputs) - else: - return self._nested_layer(inputs) * 0.5 - - self._test_custom_layer_training_arg( - CustomLayerNoTrainingArg=CustomLayerNoTrainingArg, - 
CustomLayerDefaultTrainingMissing=CustomLayerDefaultTrainingMissing, - CustomLayerDefaultTrainingNone=CustomLayerDefaultTrainingNone, - CustomLayerDefaultTrainingFalse=CustomLayerDefaultTrainingFalse, - CustomLayerDefaultTrainingTrue=CustomLayerDefaultTrainingTrue) - - def _test_custom_layer_training_arg(self, - # pylint: disable=invalid-name - CustomLayerNoTrainingArg, - CustomLayerDefaultTrainingMissing, - CustomLayerDefaultTrainingNone, - CustomLayerDefaultTrainingFalse, - CustomLayerDefaultTrainingTrue, - # pylint: enable=invalid-name - ): - x = tf.ones(shape=(1, 1)) - - # If the layer signature doesn't specify a default training arg, - # run it in inference mode when to training arg is passed - # to __call__ - layer = CustomLayerDefaultTrainingMissing() - self.assertAllEqual(layer(x), x * 0.5) - self.assertAllEqual(layer(x, training=False), x * 0.5) - self.assertAllEqual(layer(x, training=True), x) - - # If the layer signature specifies `False` as the default training arg, - # run it in inference mode when no training arg is passed - # to __call__ - layer = CustomLayerDefaultTrainingFalse() - self.assertAllEqual(layer(x), x * 0.5) - self.assertAllEqual(layer(x, training=False), x * 0.5) - self.assertAllEqual(layer(x, training=True), x) - - # If the layer signature specifies `True` as the default training arg, - # explicitly run it in training mode when no training arg is passed - # to __call__ - layer = CustomLayerDefaultTrainingTrue() - self.assertAllEqual(layer(x), x) - self.assertAllEqual(layer(x, training=False), x * 0.5) - self.assertAllEqual(layer(x, training=True), x) - - # Outer layers/models should set the training context implicitly for all - # nested layers, respecting whatever mode the outer layer was run with. - layer = CustomLayerDefaultTrainingTrue(CustomLayerDefaultTrainingFalse()) - # No outer value passed: use local defaults - self.assertAllEqual(layer(x), x) # Use outer default True - # Outer value passed: override local defaults - self.assertAllEqual(layer(x, training=False), x * 0.25) - self.assertAllEqual(layer(x, training=True), x) - - layer = CustomLayerDefaultTrainingFalse(CustomLayerDefaultTrainingTrue()) - # No outer value passed: use local defaults - self.assertAllEqual(layer(x), x * 0.25) # Use outer default False - # Outer value passed: override local defaults - self.assertAllEqual(layer(x, training=False), x * 0.25) - self.assertAllEqual(layer(x, training=True), x) - - # If the outer layer `call` doesn't take a training argument at all, - # it'll set the nested scope as None when no training arg is passed in. - # If a training arg is passed in it won't use it directly in `call`, but - # it will set the nested training mode. 
- layer = CustomLayerNoTrainingArg(CustomLayerDefaultTrainingTrue()) - self.assertAllEqual(layer(x), x) # Use local default True - self.assertAllEqual(layer(x, training=False), x * 0.5) - self.assertAllEqual(layer(x, training=True), x) - - layer = CustomLayerDefaultTrainingNone(CustomLayerDefaultTrainingTrue()) - self.assertAllEqual(layer(x), x * 0.5) # Nested use local default True - self.assertAllEqual(layer(x, training=False), x * 0.25) - self.assertAllEqual(layer(x, training=True), x) - - def test_activity_regularizer_string(self): - - class MyLayer(base_layer.Layer): - pass - - layer = MyLayer(activity_regularizer='l2') - self.assertIsInstance(layer.activity_regularizer, regularizers.L2) - - def test_tf_module_tracking(self): - - class MyModule(tf.Module): - - def __init__(self): - super().__init__() - self.v1 = tf.Variable(1., trainable=True, name='v1') - self.v2 = tf.Variable(2., trainable=False, name='v2') - - def __call__(self, x): - return x * self.v1 * self.v2 - - class MyLayer(base_layer.Layer): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.my_modules = {} - self.my_modules['a'] = MyModule() - - def call(self, x): - return self.my_modules['a'](x) - - layer = MyLayer() - self.assertLen(layer.variables, 2) - self.assertLen(layer.trainable_variables, 1) - self.assertLen(layer.non_trainable_variables, 1) - - layer.trainable = False - self.assertLen(layer.variables, 2) - self.assertLen(layer.trainable_variables, 0) - self.assertLen(layer.non_trainable_variables, 2) - - class MyModel(training_lib.Model): - - def __init__(self): - super().__init__() - self.my_modules = [] - self.my_modules.append(MyModule()) - - def call(self, x): - return self.my_modules[0](x) - - model = MyModel() - self.assertLen(model.variables, 2) - self.assertLen(model.trainable_variables, 1) - self.assertLen(model.non_trainable_variables, 1) - - model.trainable = False - self.assertLen(model.variables, 2) - self.assertLen(model.trainable_variables, 0) - self.assertLen(model.non_trainable_variables, 2) + # Cannot access tensor.name in eager execution. + self.assertIn("Variable_2/Regularizer", layer.losses[0].name) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_add_weight_by_getter(self): + layer = base_layer.Layer() + variable = tf.Variable("abc") + added = layer.add_weight( + dtype=tf.string, getter=lambda *_, **__: variable + ) + self.assertIs(variable, added) + + def test_variable_resetting(self): + dense = layers.Dense(1) + dense.build([8, 2]) + + self.assertIs(dense.trainable_variables[0], dense.kernel) + self.assertIs(dense.trainable_variables[1], dense.bias) + + # When we reset a variable to another instance, make sure the ordering + # of the variables in `trainable_variables` doesn't change. + # This is important for h5 saving/loading.
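# A sketch of why the ordering guarantee above matters: get_weights() and
# set_weights() (and the HDF5 checkpoints built on them) pair arrays to
# variables purely by position, so the [kernel, bias] order must survive
# attribute reassignment. Uses the public tf.keras API, assumed equivalent
# to the internal layers.Dense in the test.
import numpy as np
import tensorflow as tf

dense = tf.keras.layers.Dense(1)
dense.build((None, 2))
# Position 0 must be the kernel and position 1 the bias; if the order
# changed, the arrays below would be matched to the wrong variables.
dense.set_weights([np.ones((2, 1), "float32"), np.zeros((1,), "float32")])
assert dense.trainable_variables[0] is dense.kernel
assert dense.trainable_variables[1] is dense.bias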
+ dense.bias = tf.Variable(initial_value=tf.zeros(shape=(1,))) + dense.kernel = tf.Variable(initial_value=tf.zeros(shape=(2, 1))) + + self.assertIs(dense.trainable_variables[0], dense.kernel) + self.assertIs(dense.trainable_variables[1], dense.bias) + + @test_combinations.generate( + test_combinations.keras_mode_combinations(mode=["eager"]) + ) + def test_learning_phase_freezing_for_layers(self): + class LearningPhaseLayer(base_layer.Layer): + def call(self, inputs): + return backend.in_train_phase( + lambda: tf.ones_like(inputs), lambda: tf.zeros_like(inputs) + ) + + def get_learning_phase_value(): + model = sequential.Sequential( + [LearningPhaseLayer(input_shape=(1,))] + ) + model._run_eagerly = test_utils.should_run_eagerly() + return np.sum(model(np.ones((1, 1)))) + + self.assertEqual(get_learning_phase_value(), 0) + + # Test scope. + with backend.learning_phase_scope(1): + self.assertEqual(get_learning_phase_value(), 1) + + # The effects of the scope end after exiting it. + self.assertEqual(get_learning_phase_value(), 0) + + # Test setting. + backend.set_learning_phase(1) + self.assertEqual(get_learning_phase_value(), 1) + backend.set_learning_phase(0) + self.assertEqual(get_learning_phase_value(), 0) + + # Cannot be enabled with `run_eagerly=True`, see b/123904578 + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_layer_can_return_variable(self): + class ComputeSum(base_layer.Layer): + def __init__(self): + super().__init__() + self.total = tf.Variable( + initial_value=tf.zeros((1, 1)), trainable=False + ) + if not tf.executing_eagerly(): + backend.get_session().run(self.total.initializer) + + def call(self, inputs): + self.total.assign_add(inputs) + return self.total + + inputs = input_layer.Input(shape=(1,)) + model = training_lib.Model(inputs, ComputeSum()(inputs)) + model.predict(np.ones((1, 1))) + + def _get_layer_with_training_arg(self): + class TrainingLayer(base_layer.Layer): + """A layer with a `training` argument in a defuned `call`.""" + + @tf.function + def call(self, inputs, training=None): + if training is None: + training = backend.learning_phase() + return control_flow_util.smart_cond( + training, + lambda: tf.ones_like(inputs), + lambda: tf.zeros_like(inputs), + ) + + return TrainingLayer() + + # b/124459427: can't test with `run_eagerly=True` for now. + @test_combinations.generate( + test_combinations.times( + test_combinations.keras_mode_combinations(), + test_combinations.keras_model_type_combinations(), + ) + ) + def test_training_arg_in_defun(self): + layer = self._get_layer_with_training_arg() + model = test_utils.get_model_from_layers([layer], input_shape=(1,)) + model.compile(rmsprop.RMSprop(0.0), loss="mae") + history = model.fit(np.zeros((1, 1)), np.zeros((1, 1))) + self.assertEqual(history.history["loss"][0], 1.0) + loss = model.evaluate(np.zeros((1, 1)), np.zeros((1, 1))) + self.assertEqual(loss, 0.0) + + # Test that the argument injection performed in `call` is not active + # when the argument is passed explicitly. 
+ layer = self._get_layer_with_training_arg() + inputs = input_layer.Input(shape=(1,)) + # Pass `training` by name + outputs = layer(inputs, training=False) + model = training_lib.Model(inputs, outputs) + model.compile(rmsprop.RMSprop(0.0), loss="mae") + history = model.fit(np.zeros((1, 1)), np.zeros((1, 1))) + self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate( + test_combinations.times( + test_combinations.keras_mode_combinations(), + test_combinations.keras_model_type_combinations(), + ) + ) + def test_raw_variable_assignment(self): + class RawVariableLayer(base_layer.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + # Test variables in nested structure. + self.var_list = [tf.Variable(1.0), {"a": tf.Variable(2.0)}] + + def call(self, inputs): + return inputs * self.var_list[0] * self.var_list[1]["a"] + + model = test_utils.get_model_from_layers( + [RawVariableLayer()], input_shape=(10,) + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + x, y = np.ones((10, 10)), np.ones((10, 10)) + # Checks that variables get initialized. + model.fit(x, y, batch_size=2, epochs=2) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_composite_variable_assignment(self): + class Spec(tf.TypeSpec): + + value_type = property(lambda self: CompositeVariable) + + def _component_specs(self): + pass + + def _serialize(self): + pass + + def _to_components(self, value): + return value._variables + + def _from_components(self, variable_list): + return CompositeVariable(variable_list) + + class CompositeVariable(tf.__internal__.CompositeTensor): + def __init__(self, variable_list): + self._variables = variable_list + + @property + def _type_spec(self): + return Spec() + + class CompositeVariableLayer(base_layer.Layer): + def __init__(self): + super().__init__() + self.composite_var = CompositeVariable( + [tf.Variable(1.0), tf.Variable(2.0)] + ) + + layer = CompositeVariableLayer() + self.assertLen(layer.weights, 2) + self.assertIsInstance(layer.weights[0], tf.Variable) + self.assertIsInstance(layer.weights[1], tf.Variable) + self.assertEqual(self.evaluate(layer.weights[0]), 1.0) + self.assertEqual(self.evaluate(layer.weights[1]), 2.0) + + def test_exception_if_trainable_not_boolean(self): + base_layer.Layer(trainable=True) + base_layer.Layer(trainable=tf.constant(True)) + base_layer.Layer(trainable=tf.Variable(tf.constant(True))) + with self.assertRaisesRegex( + TypeError, "Expected `trainable` argument to be a boolean" + ): + base_layer.Layer(trainable=0) + + def test_exception_if_dynamic_not_boolean(self): + base_layer.Layer(dynamic=True) + with self.assertRaisesRegex( + TypeError, "Expected `dynamic` argument to be a boolean" + ): + base_layer.Layer(dynamic=0) + + def test_exception_if_name_not_string_or_none(self): + base_layer.Layer(name=None) + base_layer.Layer(name="layer_name") + with self.assertRaisesRegex( + TypeError, "Expected `name` argument to be a string" + ): + base_layer.Layer(name=0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_layer_names(self): + inputs = input_layer.Input(shape=[2]) + add1 = inputs + inputs + add2 = layers.Add()([inputs, inputs]) + add3 = inputs + inputs + add4 = layers.Add()([inputs, inputs]) + model = training_lib.Model( + inputs=[inputs], outputs=[add1, add2, add3, add4] + ) + actual_names = [l.name for l in model.layers] + graph_names = [ + "input_1", + "tf_op_layer_add", + "add", + "tf_op_layer_add_2", + 
"add_1", + ] + eager_names = [ + "input_1", + "tf.__operators__.add", + "add", + "tf.__operators__.add_1", + "add_1", + ] + for actual, eager, graph in zip(actual_names, graph_names, eager_names): + self.assertIn(actual, {eager, graph}) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_layer_names_after_loading(self): + backend.clear_session() + # Mimic loading a model that already contained add layers with + # name = 'add_1' and 'tf.__operators__.add' + layers.Add(name="add_1") + layers.Add(name="tf.__operators__.add") + + inputs = input_layer.Input(shape=[2]) + add1 = inputs + inputs + add2 = layers.Add()([inputs, inputs]) + add3 = inputs + inputs + add4 = layers.Add()([inputs, inputs]) + model = training_lib.Model( + inputs=[inputs], outputs=[add1, add2, add3, add4] + ) + actual_names = [l.name for l in model.layers] + # The generated op layer names should have avoided layer names seen in + # the loaded model. (This avoiance should not apply to non-op-layers) + expected_names = [ + "input_1", + "tf.__operators__.add_1", + "add", + "tf.__operators__.add_2", + "add_1", + ] + self.assertAllEqual(actual_names, expected_names) + + def test_add_trainable_weight_on_frozen_layer(self): + class TestLayer(base_layer.Layer): + def build(self, input_shape): + self.w = self.add_weight(shape=(), trainable=True) + + def call(self, inputs): + return self.w * inputs + + layer = TestLayer() + layer.trainable = False + layer.build(None) + layer.trainable = True + self.assertListEqual(layer.trainable_weights, [layer.w]) + + @test_combinations.generate( + test_combinations.times( + test_combinations.keras_mode_combinations(), + test_combinations.keras_model_type_combinations(), + ) + ) + def test_passing_initial_weights_values(self): + kernel_value = np.random.random((10, 2)) + layer_with_weights = layers.Dense( + 2, use_bias=False, weights=[kernel_value] + ) + + model = test_utils.get_model_from_layers( + [layer_with_weights], input_shape=(10,) + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + inputs = np.random.random((3, 10)) + out = model.predict(inputs) + self.assertAllClose(model.layers[-1].get_weights()[0], kernel_value) + self.assertAllClose(out, np.dot(inputs, kernel_value)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_set_weights_and_get_weights(self): + layer = layers.Dense(2) + layer.build((None, 10)) + kernel = np.random.random((10, 2)) + bias = np.random.random((2,)) + layer.set_weights([kernel, bias]) + weights = layer.get_weights() + self.assertEqual(len(weights), 2) + self.assertAllClose(weights[0], kernel) + self.assertAllClose(weights[1], bias) + with self.assertRaisesRegex( + ValueError, "but the layer was expecting 2 weights" + ): + layer.set_weights([1, 2, 3]) + with self.assertRaisesRegex( + ValueError, "not compatible with provided weight shape" + ): + layer.set_weights([kernel.T, bias]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_set_weights_accepts_output_of_get_weights(self): + layer = layers.Layer() + layer.add_weight(name="scalar_float", shape=(), dtype=tf.float32) + layer.add_weight( + name="scalar_string", + shape=(), + dtype=tf.string, + initializer=lambda *a, **k: "abc", + ) + layer.add_weight(name="vector_float", shape=(3,), dtype=tf.float32) + layer.add_weight( + name="vector_string", + shape=(2,), + dtype=tf.string, + initializer=lambda *a, **k: 2 * ["abc"], + ) + 
layer.set_weights(layer.get_weights()) + + def test_get_config_error(self): + class MyLayer(base_layer.Layer): + def __init__(self, my_kwarg="default", **kwargs): + super().__init__(**kwargs) + self.my_kwarg = my_kwarg + + # `__init__` includes kwargs but `get_config` is not overridden, so + # an error should be thrown: + with self.assertRaisesRegex( + NotImplementedError, "Layer MyLayer was created by" + ): + # We pass bytes because it's non-serializable and thus + # will not be handled by the auto-get_config + MyLayer(b"custom").get_config() + + class MyLayerNew(base_layer.Layer): + def __init__(self, my_kwarg="default", **kwargs): + super().__init__(**kwargs) + self.my_kwarg = my_kwarg + + def get_config(self): + config = super().get_config() + config["my_kwarg"] = self.my_kwarg + return config + + # Test to make sure that error is not raised if the method call is + # from an overridden `get_config`: + self.assertEqual( + MyLayerNew("custom").get_config()["my_kwarg"], "custom" + ) + + class MyLayerNew2(base_layer.Layer): + def __init__(self, name="MyLayerName", dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype, **kwargs) + + # Check that if the kwargs in `__init__` are base layer constructor + # arguments, no error is thrown: + self.assertEqual(MyLayerNew2(name="New").get_config()["name"], "New") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_count_params(self): + dense = layers.Dense(16) + dense.build((None, 4)) + self.assertEqual(dense.count_params(), 16 * 4 + 16) + + dense = layers.Dense(16) + with self.assertRaisesRegex(ValueError, "call `count_params`"): + dense.count_params() + + model = sequential.Sequential(layers.Dense(16)) + with self.assertRaisesRegex(ValueError, "call `count_params`"): + model.count_params() + + dense = layers.Dense(16, input_dim=4) + model = sequential.Sequential(dense) + self.assertEqual(model.count_params(), 16 * 4 + 16) + + def test_super_not_called(self): + class CustomLayerNotCallingSuper(base_layer.Layer): + def __init__(self): + pass + + layer = CustomLayerNotCallingSuper() + with self.assertRaisesRegex(RuntimeError, "You must call `super()"): + layer(np.random.random((10, 2))) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_first_arg_not_called_inputs(self): + x, y = tf.ones((10, 1)), tf.ones((10, 1)) + + class ArgLayer(base_layer.Layer): + def call(self, x, y): + return x + y + + layer = ArgLayer() + out = self.evaluate(layer(x=x, y=y)) + self.assertAllClose(out, 2 * np.ones((10, 1))) + + class KwargLayer(base_layer.Layer): + def call(self, x=None, y=None): + return x + y + + layer = KwargLayer() + out = self.evaluate(layer(x=x, y=y)) + self.assertAllClose(out, 2 * np.ones((10, 1))) + + with self.assertRaisesRegex(ValueError, "must always be passed"): + layer(y=y) + + class TFFunctionLayer(base_layer.Layer): + @tf.function + def call(self, x, y=None): + if y is None: + return x + return x + y + + layer = TFFunctionLayer() + out = self.evaluate(layer(x=x, y=y)) + self.assertAllClose(out, 2 * np.ones((10, 1))) + + def test_build_input_shape(self): + class CustomLayer(base_layer.Layer): + def build(self, input_shape): + self.add_weight("w", shape=input_shape[1:]) + super().build(input_shape) + + layer = CustomLayer() + self.assertFalse(layer.built) + + layer.build([None, 1, 2, 3]) + self.assertTrue(layer.built) + self.assertEqual([None, 1, 2, 3], layer._build_input_shape) + + layer = CustomLayer() + layer(input_layer.Input((3,))) + 
self.assertTrue(layer.built) + self.assertEqual([None, 3], layer._build_input_shape.as_list()) + + def test_build_input_shape_list_with_none(self): + class CustomLayer(base_layer.Layer): + def build(self, input_shape): + super().build(input_shape) + self.build_shape = input_shape + + def call(self, inputs): + return inputs[0] + + layer = CustomLayer() + layer([tf.constant([1.0]), None, tf.constant([2.0])]) + self.assertEqual(layer.build_shape, [[1], None, [1]]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_layer_input_shape_raises_error(self): + layer = layers.Dense(3) + with self.assertRaisesRegex(AttributeError, "no defined input shape"): + _ = layer.input_shape + + layer(tf.ones((10, 1))) + with self.assertRaisesRegex(AttributeError, "no defined input shape"): + _ = layer.input_shape + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_custom_layer_training_arg(self): + class CustomLayerNoTrainingArg(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs): + return self._nested_layer(inputs) + + class CustomLayerDefaultTrainingMissing(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, training): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + class CustomLayerDefaultTrainingNone(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, training=None): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + class CustomLayerDefaultTrainingFalse(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, training=False): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + class CustomLayerDefaultTrainingTrue(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, training=True): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + self._test_custom_layer_training_arg( + CustomLayerNoTrainingArg=CustomLayerNoTrainingArg, + CustomLayerDefaultTrainingMissing=CustomLayerDefaultTrainingMissing, + CustomLayerDefaultTrainingNone=CustomLayerDefaultTrainingNone, + CustomLayerDefaultTrainingFalse=CustomLayerDefaultTrainingFalse, + CustomLayerDefaultTrainingTrue=CustomLayerDefaultTrainingTrue, + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_custom_layer_training_arg_kwargonly(self): + class CustomLayerNoTrainingArg(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs): + return self._nested_layer(inputs) + + class CustomLayerDefaultTrainingMissing(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, *, training): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + class 
CustomLayerDefaultTrainingNone(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, *, training=None): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + class CustomLayerDefaultTrainingFalse(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, *, training=False): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + class CustomLayerDefaultTrainingTrue(base_layer.Layer): + def __init__(self, nested_layer=None): + super().__init__() + self._nested_layer = nested_layer or tf.identity + + def call(self, inputs, *, training=True): + if training: + return self._nested_layer(inputs) + else: + return self._nested_layer(inputs) * 0.5 + + self._test_custom_layer_training_arg( + CustomLayerNoTrainingArg=CustomLayerNoTrainingArg, + CustomLayerDefaultTrainingMissing=CustomLayerDefaultTrainingMissing, + CustomLayerDefaultTrainingNone=CustomLayerDefaultTrainingNone, + CustomLayerDefaultTrainingFalse=CustomLayerDefaultTrainingFalse, + CustomLayerDefaultTrainingTrue=CustomLayerDefaultTrainingTrue, + ) + + def _test_custom_layer_training_arg( + self, + CustomLayerNoTrainingArg, + CustomLayerDefaultTrainingMissing, + CustomLayerDefaultTrainingNone, + CustomLayerDefaultTrainingFalse, + CustomLayerDefaultTrainingTrue, + ): + x = tf.ones(shape=(1, 1)) + + # If the layer signature doesn't specify a default training arg, + # run it in inference mode when no training arg is passed + # to __call__ + layer = CustomLayerDefaultTrainingMissing() + self.assertAllEqual(layer(x), x * 0.5) + self.assertAllEqual(layer(x, training=False), x * 0.5) + self.assertAllEqual(layer(x, training=True), x) + + # If the layer signature specifies `False` as the default training arg, + # run it in inference mode when no training arg is passed + # to __call__ + layer = CustomLayerDefaultTrainingFalse() + self.assertAllEqual(layer(x), x * 0.5) + self.assertAllEqual(layer(x, training=False), x * 0.5) + self.assertAllEqual(layer(x, training=True), x) + + # If the layer signature specifies `True` as the default training arg, + # explicitly run it in training mode when no training arg is passed + # to __call__ + layer = CustomLayerDefaultTrainingTrue() + self.assertAllEqual(layer(x), x) + self.assertAllEqual(layer(x, training=False), x * 0.5) + self.assertAllEqual(layer(x, training=True), x) + + # Outer layers/models should set the training context implicitly for all + # nested layers, respecting whatever mode the outer layer was run with.
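# A standalone sketch of the implicit propagation asserted below: the outer
# layer's resolved `training` value becomes the default for nested calls,
# overriding the inner layer's own default, so two 0.5 scalings compose to
# the 0.25 the assertions expect. Assumes the public tf.keras.layers.Layer
# matches the internal base_layer.Layer here.
import tensorflow as tf


class Halver(tf.keras.layers.Layer):
    def call(self, inputs, training=False):
        # Halve the input in inference mode, pass through in training mode.
        return inputs if training else inputs * 0.5


class Outer(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.inner = Halver()

    def call(self, inputs, training=True):
        x = inputs if training else inputs * 0.5
        # No explicit `training` here: the nested layer inherits the mode
        # the outer layer was called with, not its own default of False.
        return self.inner(x)


x = tf.ones((1, 1))
outer = Outer()
assert float(outer(x)) == 1.0  # outer default True propagates inward
assert float(outer(x, training=False)) == 0.25  # 0.5 * 0.5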
+ layer = CustomLayerDefaultTrainingTrue( + CustomLayerDefaultTrainingFalse() + ) + # No outer value passed: use local defaults + self.assertAllEqual(layer(x), x) # Use outer default True + # Outer value passed: override local defaults + self.assertAllEqual(layer(x, training=False), x * 0.25) + self.assertAllEqual(layer(x, training=True), x) + + layer = CustomLayerDefaultTrainingFalse( + CustomLayerDefaultTrainingTrue() + ) + # No outer value passed: use local defaults + self.assertAllEqual(layer(x), x * 0.25) # Use outer default False + # Outer value passed: override local defaults + self.assertAllEqual(layer(x, training=False), x * 0.25) + self.assertAllEqual(layer(x, training=True), x) + + # If the outer layer `call` doesn't take a training argument at all, + # it'll set the nested scope as None when no training arg is passed in. + # If a training arg is passed in it won't use it directly in `call`, but + # it will set the nested training mode. + layer = CustomLayerNoTrainingArg(CustomLayerDefaultTrainingTrue()) + self.assertAllEqual(layer(x), x) # Use local default True + self.assertAllEqual(layer(x, training=False), x * 0.5) + self.assertAllEqual(layer(x, training=True), x) + + layer = CustomLayerDefaultTrainingNone(CustomLayerDefaultTrainingTrue()) + self.assertAllEqual(layer(x), x * 0.5) # Nested use local default True + self.assertAllEqual(layer(x, training=False), x * 0.25) + self.assertAllEqual(layer(x, training=True), x) + + def test_activity_regularizer_string(self): + class MyLayer(base_layer.Layer): + pass + + layer = MyLayer(activity_regularizer="l2") + self.assertIsInstance(layer.activity_regularizer, regularizers.L2) + + def test_tf_module_tracking(self): + class MyModule(tf.Module): + def __init__(self): + super().__init__() + self.v1 = tf.Variable(1.0, trainable=True, name="v1") + self.v2 = tf.Variable(2.0, trainable=False, name="v2") + + def __call__(self, x): + return x * self.v1 * self.v2 + + class MyLayer(base_layer.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.my_modules = {} + self.my_modules["a"] = MyModule() + + def call(self, x): + return self.my_modules["a"](x) + + layer = MyLayer() + self.assertLen(layer.variables, 2) + self.assertLen(layer.trainable_variables, 1) + self.assertLen(layer.non_trainable_variables, 1) + + layer.trainable = False + self.assertLen(layer.variables, 2) + self.assertLen(layer.trainable_variables, 0) + self.assertLen(layer.non_trainable_variables, 2) + + class MyModel(training_lib.Model): + def __init__(self): + super().__init__() + self.my_modules = [] + self.my_modules.append(MyModule()) + + def call(self, x): + return self.my_modules[0](x) + + model = MyModel() + self.assertLen(model.variables, 2) + self.assertLen(model.trainable_variables, 1) + self.assertLen(model.non_trainable_variables, 1) + + model.trainable = False + self.assertLen(model.variables, 2) + self.assertLen(model.trainable_variables, 0) + self.assertLen(model.non_trainable_variables, 2) + + def test_tf_tracking_lists(self): + class MyLayer(base_layer.Layer): + def __init__(self, num_weights): + super().__init__() + self.num_weights = num_weights + + def build(self, input_shape): + super().build(input_shape) + self.my_weights = [] + w_init = tf.random_normal_initializer() + for i in range(self.num_weights): + self.my_weights.append( + tf.Variable( + name=f"w_{i}", + initial_value=w_init( + shape=(input_shape[1], input_shape[1]), + dtype="float32", + ), + trainable=True, + ) + ) + + def call(self, x): + for w in self.my_weights: + x = 
tf.matmul(x, w) + return x + + layer = MyLayer(3) + layer(tf.constant([[1.0, 1.0, 1.0, 1.0]])) + self.assertLen(layer.variables, 3) + self.assertLen(layer.trainable_variables, 3) + self.assertLen(layer.non_trainable_variables, 0) + + layer.trainable = False + self.assertLen(layer.variables, 3) + self.assertLen(layer.trainable_variables, 0) + self.assertLen(layer.non_trainable_variables, 3) + + def test_auto_get_config(self): + class MyLayer(base_layer.Layer): + def __init__(self, var1, var2, var3=None, **kwargs): + super().__init__(**kwargs) + + layer = MyLayer("a", 2, var3=True, name="mylayer") + config = layer.get_config() + self.assertLen(config, 6) + self.assertEqual(config["var1"], "a") + self.assertEqual(config["var2"], 2) + self.assertEqual(config["var3"], True) + self.assertEqual(config["name"], "mylayer") + self.assertEqual(config["trainable"], True) + self.assertEqual(config["dtype"], "float32") + layer = MyLayer.from_config(config) + self.assertDictEqual(layer.get_config(), config) + + layer = MyLayer("a", 2, var3=tf.nn.relu) + with self.assertRaises(NotImplementedError): + config = layer.get_config() @test_utils.run_v2_only class SymbolicSupportTest(test_combinations.TestCase): - - def test_using_symbolic_tensors_with_tf_ops(self): - # Single-input. - x = input_layer.Input((3,)) - tf.square(x) - - # Multi-inputs. - x1, x2 = input_layer.Input((3,)), input_layer.Input((3,)) - tf.concat([x1, x2], axis=1) - - # Mixing Keras symbolic tensors and graph tensors from the same graph works. - with backend.get_graph().as_default(): - x1 = input_layer.Input((3,)) - x2 = input_layer.Input((3,)) - tf.matmul(x1, x2) - - # Creating same op type (matmul) multiple times in the Keras graph works. - x1 = input_layer.Input((3,)) - x2 = input_layer.Input((3,)) - tf.matmul(x1, x2) - - def test_mixing_eager_and_graph_tensors(self): - with tf.Graph().as_default(): - x1 = tf.ones((3, 3)) - x2 = tf.ones((3, 3)) - with self.assertRaises(TypeError): - tf.matmul(x1, x2) - - def test_mixing_numpy_arrays_and_graph_tensors(self): - with tf.Graph().as_default(): - x1 = tf.ones((3, 3)) - x2 = np.ones((3, 3), dtype='float32') - with self.assertRaises(TypeError): - tf.matmul(x1, x2) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_mixing_keras_symbolic_tensors_and_eager_tensors(self): - x1 = input_layer.Input((3,)) - x2 = tf.ones((3, 3)) - y = tf.matmul(x1, x2) - - fn = backend.function(inputs=[x1], outputs=[y]) - x_val = np.random.random((3, 3)) - y_val = np.ones((3, 3)) - self.assertAllClose(fn([x_val])[0], - np.matmul(x_val, y_val), - atol=1e-5) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_mixing_keras_symbolic_tensors_and_numpy_arrays(self): - x1 = input_layer.Input((3,)) - x2 = np.ones((3, 3), dtype='float32') - y = tf.matmul(x1, x2) - - fn = backend.function(inputs=[x1], outputs=[y]) - x_val = np.random.random((3, 3)) - y_val = np.ones((3, 3)) - self.assertAllClose(fn([x_val])[0], - np.matmul(x_val, y_val), - atol=1e-5) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_reraising_exception(self): - # When layer is not dynamic, we have some pattern matching during exception - # handling to detect when the user is trying to use python control flow. - # When an exception is thrown but the pattern doesn't match, we want to - # preserve the originating stack trace. An early implementation of this - # logic lost the stack trace. We test the correct behavior here. 
- - class TypeErrorLayer(base_layer.Layer): - - def call(self, inputs): - def easily_identifiable_name(): - raise TypeError('Non-matching TypeError message.') - easily_identifiable_name() - - inputs = input_layer.Input((3,)) - - try: - _ = TypeErrorLayer()(inputs) - except TypeError as e: - self.assertIn('easily_identifiable_name', str(e)) # pylint: disable=g-assert-in-except - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_summaries_in_tf_function(self): - if not tf.executing_eagerly(): - return - - class MyLayer(base_layer.Layer): - - def call(self, inputs): - tf.summary.scalar('mean', tf.reduce_mean(inputs)) - return inputs - - tmp_dir = self.get_temp_dir() - writer = tf.summary.create_file_writer(tmp_dir) - with writer.as_default(step=1), tf.summary.record_if(True): - my_layer = MyLayer() - x = tf.ones((10, 10)) - - def my_fn(x): - return my_layer(x) - - _ = my_fn(x) - - event_file = tf.compat.v1.gfile.Glob(os.path.join(tmp_dir, 'events*')) - self.assertLen(event_file, 1) - event_file = event_file[0] - tags = set() - for e in tf.compat.v1.train.summary_iterator(event_file): - for val in e.summary.value: - tags.add(val.tag) - self.assertEqual(set(['my_layer/mean']), tags) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_error_when_passing_non_tensor(self): - # layers that have an `input_spec` will raise an error when called on - # non-tensors. This covers all built-in layers. - layer = layers.Dense(3) - x = object() - with self.assertRaisesRegex(TypeError, r'should be tensors'): - layer(x) + def test_using_symbolic_tensors_with_tf_ops(self): + # Single-input. + x = input_layer.Input((3,)) + tf.square(x) + + # Multi-inputs. + x1, x2 = input_layer.Input((3,)), input_layer.Input((3,)) + tf.concat([x1, x2], axis=1) + + # Mixing Keras symbolic tensors and graph tensors from the same graph + # works. + with backend.get_graph().as_default(): + x1 = input_layer.Input((3,)) + x2 = input_layer.Input((3,)) + tf.matmul(x1, x2) + + # Creating same op type (matmul) multiple times in the Keras graph + # works. 
+ x1 = input_layer.Input((3,)) + x2 = input_layer.Input((3,)) + tf.matmul(x1, x2) + + def test_mixing_eager_and_graph_tensors(self): + with tf.Graph().as_default(): + x1 = tf.ones((3, 3)) + x2 = tf.ones((3, 3)) + with self.assertRaises(TypeError): + tf.matmul(x1, x2) + + def test_mixing_numpy_arrays_and_graph_tensors(self): + with tf.Graph().as_default(): + x1 = tf.ones((3, 3)) + x2 = np.ones((3, 3), dtype="float32") + with self.assertRaises(TypeError): + tf.matmul(x1, x2) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_mixing_keras_symbolic_tensors_and_eager_tensors(self): + x1 = input_layer.Input((3,)) + x2 = tf.ones((3, 3)) + y = tf.matmul(x1, x2) + + fn = backend.function(inputs=[x1], outputs=[y]) + x_val = np.random.random((3, 3)) + y_val = np.ones((3, 3)) + self.assertAllClose(fn([x_val])[0], np.matmul(x_val, y_val), atol=1e-5) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_mixing_keras_symbolic_tensors_and_numpy_arrays(self): + x1 = input_layer.Input((3,)) + x2 = np.ones((3, 3), dtype="float32") + y = tf.matmul(x1, x2) + + fn = backend.function(inputs=[x1], outputs=[y]) + x_val = np.random.random((3, 3)) + y_val = np.ones((3, 3)) + self.assertAllClose(fn([x_val])[0], np.matmul(x_val, y_val), atol=1e-5) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_reraising_exception(self): + # When layer is not dynamic, we have some pattern matching during + # exception handling to detect when the user is trying to use python + # control flow. When an exception is thrown but the pattern doesn't + # match, we want to preserve the originating stack trace. An early + # implementation of this logic lost the stack trace. We test the correct + # behavior here. + + class TypeErrorLayer(base_layer.Layer): + def call(self, inputs): + def easily_identifiable_name(): + raise TypeError("Non-matching TypeError message.") + + easily_identifiable_name() + + inputs = input_layer.Input((3,)) + + try: + _ = TypeErrorLayer()(inputs) + except TypeError as e: + self.assertIn("easily_identifiable_name", str(e)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_summaries_in_tf_function(self): + if not tf.executing_eagerly(): + return + + class MyLayer(base_layer.Layer): + def call(self, inputs): + tf.summary.scalar("mean", tf.reduce_mean(inputs)) + return inputs + + tmp_dir = self.get_temp_dir() + writer = tf.summary.create_file_writer(tmp_dir) + with writer.as_default(step=1), tf.summary.record_if(True): + my_layer = MyLayer() + x = tf.ones((10, 10)) + + def my_fn(x): + return my_layer(x) + + _ = my_fn(x) + + event_file = tf.compat.v1.gfile.Glob(os.path.join(tmp_dir, "events*")) + self.assertLen(event_file, 1) + event_file = event_file[0] + tags = set() + for e in tf.compat.v1.train.summary_iterator(event_file): + for val in e.summary.value: + tags.add(val.tag) + self.assertEqual(set(["my_layer/mean"]), tags) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_error_when_passing_non_tensor(self): + # layers that have an `input_spec` will raise an error when called on + # non-tensors. This covers all built-in layers. 
+ layer = layers.Dense(3) + x = object() + with self.assertRaisesRegex(TypeError, r"should be tensors"): + layer(x) @test_utils.run_v2_only -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class NestedTrackingTest(tf.test.TestCase): - - def test_nested_layer_variable_tracking(self): - # Test that variables from nested sublayers are - # being tracked by subclassed layers. - - class MyLayer(base_layer.Layer): - - def __init__(self): - super().__init__() - self.dense1 = layers.Dense(1) - self.dense2 = layers.BatchNormalization() - - def build(self, input_shape): - self.v1 = self.add_weight('v1', shape=input_shape[1:].as_list()) - self.v2 = tf.Variable( - name='v2', - initial_value=np.zeros(input_shape[1:].as_list(), dtype='float32'), - trainable=False) - - def call(self, inputs): - x = self.dense1(inputs) + self.dense2(inputs) - return x + self.v1 + self.v2 - - layer = MyLayer() - inputs = input_layer.Input((1,)) - _ = layer(inputs) - - self.assertEqual(len(layer.weights), 8) - self.assertEqual(len(layer.trainable_weights), 5) - self.assertEqual(len(layer.non_trainable_weights), 3) - - layer.dense1.trainable = False - self.assertEqual(len(layer.weights), 8) - self.assertEqual(len(layer.trainable_weights), 3) - self.assertEqual(len(layer.non_trainable_weights), 5) - - layer.trainable = False - self.assertEqual(len(layer.weights), 8) - self.assertEqual(len(layer.trainable_weights), 0) - self.assertEqual(len(layer.non_trainable_weights), 8) - self.assertEqual( - {id(v) for v in [layer.dense1, layer.dense2, layer.v1, layer.v2]}, - {id(v) for v in layer._trackable_children().values()}) - - def test_nested_layer_updates_losses_tracking(self): - # Test that updates and losses from nested sublayers are - # being tracked by subclassed layers. - - class UpdateAndLossLayer(base_layer.Layer): - - def build(self, _): - self.v1 = self.add_weight('v1', shape=()) - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - self.add_update(tf.compat.v1.assign_add(self.v1, 1)) - return inputs + 1 - - class MyLayer(base_layer.Layer): - - def build(self, _): - self.v1 = self.add_weight('v1', shape=()) - - def __init__(self): - super().__init__() - self.ul1 = UpdateAndLossLayer() - self.ul2 = UpdateAndLossLayer() - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - self.add_update(tf.compat.v1.assign_add(self.v1, 1)) - x = self.ul1(inputs) - return self.ul2(x) - - layer = MyLayer() - - if tf.executing_eagerly(): - inputs = tf.ones((3, 1)) - _ = layer(inputs) - self.assertEqual(len(layer.losses), 3) - else: - inputs = input_layer.Input((1,)) - _ = layer(inputs) - self.assertEqual(len(layer.losses), 3) - self.assertEqual(len(layer.updates), 3) - - def test_attribute_reassignment(self): - l = base_layer.Layer() - l.a = base_layer.Layer() - l.a = [] - l.a = tf.Variable(1.) - l.a = base_layer.Layer() - last_assignment = base_layer.Layer() - l.a = last_assignment - l.b = tf.Variable(1.) 
- del l.b - l.c = base_layer.Layer() - del l.c - l.d = last_assignment - del l.d - sublayers = list(l._flatten_layers(include_self=False, recursive=False)) - self.assertEqual([last_assignment], sublayers) - self.assertEqual([], l.trainable_weights) - self.assertEqual([], l.non_trainable_weights) - self.assertEqual([], l.weights) - del l.a - self.assertEqual([], l._self_tracked_trackables) - - def test_layer_class_not_tracked_as_sublayer(self): - # See https://github.com/tensorflow/tensorflow/issues/27431 for details. - - class LayerWithClassAttribute(base_layer.Layer): - - def __init__(self): - super().__init__() - self.layer_fn = layers.Dense - - layer = LayerWithClassAttribute() - self.assertEmpty(layer.variables) - self.assertEmpty(layer.submodules) - - def test_layer_call_fn_args(self): - - class NonDefunLayer(base_layer.Layer): - - def call(self, inputs, a, mask, b=None, training=None): - return inputs - - class DefunLayer(base_layer.Layer): - - @tf.function - def call(self, x, mask, a, training=None, b=None): - return x - - nondefun_layer = NonDefunLayer() - self.assertEqual(nondefun_layer._call_spec.arg_names, - ['inputs', 'a', 'mask', 'b', 'training']) - defun_layer = DefunLayer() - self.assertEqual(defun_layer._call_spec.arg_names, - ['x', 'mask', 'a', 'training', 'b']) - - def test_sequential_model(self): - model = sequential.Sequential( - [layers.Dense(10, input_shape=(10,)), - layers.Dense(5)]) - self.assertLen(model.layers, 2) - self.assertLen(model.weights, 4) - - # Make sure a subclass model also works when it is called 'Sequential'. - class Sequential(training_lib.Model): - - def __init__(self): - super().__init__() - self.dense_layers = [layers.Dense(10), layers.Dense(5)] - - def call(self, inputs): - x = inputs - for d in self.dense_layers: - x = d(x) - return x - - s = Sequential() - self.assertLen(s.layers, 2) - self.assertLen(s.weights, 0) - - s(input_layer.Input((10,))) - self.assertLen(s.weights, 4) + def test_nested_layer_variable_tracking(self): + # Test that variables from nested sublayers are + # being tracked by subclassed layers. + + class MyLayer(base_layer.Layer): + def __init__(self): + super().__init__() + self.dense1 = layers.Dense(1) + self.dense2 = layers.BatchNormalization() + + def build(self, input_shape): + self.v1 = self.add_weight("v1", shape=input_shape[1:].as_list()) + self.v2 = tf.Variable( + name="v2", + initial_value=np.zeros( + input_shape[1:].as_list(), dtype="float32" + ), + trainable=False, + ) + + def call(self, inputs): + x = self.dense1(inputs) + self.dense2(inputs) + return x + self.v1 + self.v2 + + layer = MyLayer() + inputs = input_layer.Input((1,)) + _ = layer(inputs) + + self.assertEqual(len(layer.weights), 8) + self.assertEqual(len(layer.trainable_weights), 5) + self.assertEqual(len(layer.non_trainable_weights), 3) + + layer.dense1.trainable = False + self.assertEqual(len(layer.weights), 8) + self.assertEqual(len(layer.trainable_weights), 3) + self.assertEqual(len(layer.non_trainable_weights), 5) + + layer.trainable = False + self.assertEqual(len(layer.weights), 8) + self.assertEqual(len(layer.trainable_weights), 0) + self.assertEqual(len(layer.non_trainable_weights), 8) + self.assertEqual( + {id(v) for v in [layer.dense1, layer.dense2, layer.v1, layer.v2]}, + {id(v) for v in layer._trackable_children().values()}, + ) + + def test_nested_layer_updates_losses_tracking(self): + # Test that updates and losses from nested sublayers are + # being tracked by subclassed layers. 
+ + class UpdateAndLossLayer(base_layer.Layer): + def build(self, _): + self.v1 = self.add_weight("v1", shape=()) + + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + self.add_update(tf.compat.v1.assign_add(self.v1, 1)) + return inputs + 1 + + class MyLayer(base_layer.Layer): + def build(self, _): + self.v1 = self.add_weight("v1", shape=()) + + def __init__(self): + super().__init__() + self.ul1 = UpdateAndLossLayer() + self.ul2 = UpdateAndLossLayer() + + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + self.add_update(tf.compat.v1.assign_add(self.v1, 1)) + x = self.ul1(inputs) + return self.ul2(x) + + layer = MyLayer() + + if tf.executing_eagerly(): + inputs = tf.ones((3, 1)) + _ = layer(inputs) + self.assertEqual(len(layer.losses), 3) + else: + inputs = input_layer.Input((1,)) + _ = layer(inputs) + self.assertEqual(len(layer.losses), 3) + self.assertEqual(len(layer.updates), 3) + + def test_attribute_reassignment(self): + l = base_layer.Layer() + l.a = base_layer.Layer() + l.a = [] + l.a = tf.Variable(1.0) + l.a = base_layer.Layer() + last_assignment = base_layer.Layer() + l.a = last_assignment + l.b = tf.Variable(1.0) + del l.b + l.c = base_layer.Layer() + del l.c + l.d = last_assignment + del l.d + sublayers = list(l._flatten_layers(include_self=False, recursive=False)) + self.assertEqual([last_assignment], sublayers) + self.assertEqual([], l.trainable_weights) + self.assertEqual([], l.non_trainable_weights) + self.assertEqual([], l.weights) + del l.a + self.assertEqual([], l._self_tracked_trackables) + + def test_layer_class_not_tracked_as_sublayer(self): + # See https://github.com/tensorflow/tensorflow/issues/27431 for details. + + class LayerWithClassAttribute(base_layer.Layer): + def __init__(self): + super().__init__() + self.layer_fn = layers.Dense + + layer = LayerWithClassAttribute() + self.assertEmpty(layer.variables) + self.assertEmpty(layer.submodules) + + def test_layer_call_fn_args(self): + class NonDefunLayer(base_layer.Layer): + def call(self, inputs, a, mask, b=None, training=None): + return inputs + + class DefunLayer(base_layer.Layer): + @tf.function + def call(self, x, mask, a, training=None, b=None): + return x + + nondefun_layer = NonDefunLayer() + self.assertEqual( + nondefun_layer._call_spec.arg_names, + ["inputs", "a", "mask", "b", "training"], + ) + defun_layer = DefunLayer() + self.assertEqual( + defun_layer._call_spec.arg_names, + ["x", "mask", "a", "training", "b"], + ) + + def test_sequential_model(self): + model = sequential.Sequential( + [layers.Dense(10, input_shape=(10,)), layers.Dense(5)] + ) + self.assertLen(model.layers, 2) + self.assertLen(model.weights, 4) + + # Make sure a subclass model also works when it is called 'Sequential'. 
+ class Sequential(training_lib.Model): + def __init__(self): + super().__init__() + self.dense_layers = [layers.Dense(10), layers.Dense(5)] + + def call(self, inputs): + x = inputs + for d in self.dense_layers: + x = d(x) + return x + + s = Sequential() + self.assertLen(s.layers, 2) + self.assertLen(s.weights, 0) + + s(input_layer.Input((10,))) + self.assertLen(s.weights, 4) @test_utils.run_v2_only -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class NameScopingTest(test_combinations.TestCase): - - def test_name_scope_layer(self): - x = backend.placeholder(shape=(10, 10)) - layer = layers.Dense(10, name='MyName') - layer(x) - self.assertEqual(layer.bias.name, 'MyName/bias:0') - self.assertEqual(layer.kernel.name, 'MyName/kernel:0') - - def test_name_scope_functional_api(self): - inputs = input_layer.Input((3,)) - layer = layers.Dense(10, name='MyName') - _ = layer(inputs) - self.assertEqual(layer.bias.name, 'MyName/bias:0') - self.assertEqual(layer.kernel.name, 'MyName/kernel:0') - - def test_name_scope_functional_api_nested(self): - - class NestedLayer(base_layer.Layer): - - def __init__(self, name='OuterName'): - super().__init__(name=name) - self.dense = layers.Dense(10, name='InnerName') - - def call(self, inputs): - return self.dense(inputs) - - inputs = input_layer.Input((3,)) - layer = NestedLayer() - _ = layer(inputs) - self.assertEqual(layer.dense.bias.name, 'OuterName/InnerName/bias:0') - self.assertEqual(layer.dense.kernel.name, 'OuterName/InnerName/kernel:0') - - def test_name_scope_sublayer(self): - - class NameScopeTracker(base_layer.Layer): - - def call(self, inputs): - self.active_name_scope = tf.__internal__.get_name_scope() - return inputs - - x = backend.placeholder(shape=(10, 10)) - sublayer = NameScopeTracker(name='Sublayer') - layer = layers.Dense(10, activation=sublayer, name='MyName2') - layer(x) - self.assertEqual(layer.bias.name, 'MyName2/bias:0') - self.assertEqual(layer.kernel.name, 'MyName2/kernel:0') - self.assertEqual(sublayer.active_name_scope, 'MyName2/Sublayer') - - def test_name_scope_tf_tensor(self): - x = tf.convert_to_tensor(np.ones((10, 10))) - layer = layers.Dense( - 10, activation=layers.ReLU(name='MyAct'), name='MyName3') - layer(x) - self.assertEqual(layer.bias.name, 'MyName3/bias:0') - self.assertEqual(layer.kernel.name, 'MyName3/kernel:0') - - @test_utils.run_v2_only - def test_apply_name_scope_on_model_declaration(self): - if not tf.executing_eagerly(): - self.skipTest('`apply_name_scope_on_model_declaration` API is supported' - ' only for V2 eager') - - base_layer._apply_name_scope_on_model_declaration(True) - - inputs = input_layer.Input((3,)) - x = layers.Dense(10, name='Dense1')(inputs) - with tf.name_scope('outer'): - x = layers.Dense(10, name='Dense2')(x) - with tf.name_scope('inner'): - x = layers.Dense(10, name='Dense3')(x) - x = layers.Dense(10, name='Dense4')(x) - outputs = layers.Dense(10, name='Dense5')(x) - - model = training_lib.Model(inputs, outputs) - node_names = self._get_model_node_names(model, np.random.random((1, 3)), - 'call_scope') - self.assertListEqual(node_names, [ - 'call_scope/Const', - 'call_scope/model/Cast', - 'call_scope/model/Dense1/MatMul/ReadVariableOp/resource', - 'call_scope/model/Dense1/MatMul/ReadVariableOp', - 'call_scope/model/Dense1/MatMul', - 'call_scope/model/Dense1/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/Dense1/BiasAdd/ReadVariableOp', - 'call_scope/model/Dense1/BiasAdd', 
- 'call_scope/model/outer/Dense2/MatMul/ReadVariableOp/resource', - 'call_scope/model/outer/Dense2/MatMul/ReadVariableOp', - 'call_scope/model/outer/Dense2/MatMul', - 'call_scope/model/outer/Dense2/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/outer/Dense2/BiasAdd/ReadVariableOp', - 'call_scope/model/outer/Dense2/BiasAdd', - 'call_scope/model/outer/inner/Dense3/MatMul/ReadVariableOp/resource', - 'call_scope/model/outer/inner/Dense3/MatMul/ReadVariableOp', - 'call_scope/model/outer/inner/Dense3/MatMul', - 'call_scope/model/outer/inner/Dense3/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/outer/inner/Dense3/BiasAdd/ReadVariableOp', - 'call_scope/model/outer/inner/Dense3/BiasAdd', - 'call_scope/model/outer/Dense4/MatMul/ReadVariableOp/resource', - 'call_scope/model/outer/Dense4/MatMul/ReadVariableOp', - 'call_scope/model/outer/Dense4/MatMul', - 'call_scope/model/outer/Dense4/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/outer/Dense4/BiasAdd/ReadVariableOp', - 'call_scope/model/outer/Dense4/BiasAdd', - 'call_scope/model/Dense5/MatMul/ReadVariableOp/resource', - 'call_scope/model/Dense5/MatMul/ReadVariableOp', - 'call_scope/model/Dense5/MatMul', - 'call_scope/model/Dense5/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/Dense5/BiasAdd/ReadVariableOp', - 'call_scope/model/Dense5/BiasAdd', - 'Identity', - 'NoOp' - ]) - base_layer._apply_name_scope_on_model_declaration(False) - - @test_utils.run_v2_only - def test_apply_name_scope_on_nested_layer_model_declaration(self): - if not tf.executing_eagerly(): - self.skipTest('`apply_name_scope_on_model_declaration` API is supported' - ' only for V2 eager') - - base_layer._apply_name_scope_on_model_declaration(True) - - class ThreeDenses(layers.Layer): - - def __init__(self, name='ThreeDenses', **kwargs): - super().__init__(name=name, **kwargs) - self.inner_dense_1 = layers.Dense(10, name='NestedDense1') - with tf.name_scope('inner1/inner2'): - self.inner_dense_2 = layers.Dense(20, name='NestedDense2') - self.inner_dense_3 = layers.Dense(30, name='NestedDense3') - - def call(self, x): - x = self.inner_dense_1(x) - x = self.inner_dense_2(x) - x = self.inner_dense_3(x) - return x - - inputs = input_layer.Input((3,)) - with tf.name_scope('outer'): - x = ThreeDenses()(inputs) - outputs = layers.Dense(10, name='OuterDense')(x) - - model = training_lib.Model(inputs, outputs) - node_names = self._get_model_node_names(model, np.random.random((1, 3)), - 'call_scope') - - self.assertListEqual(node_names, [ - 'call_scope/Const', 'call_scope/model/Cast', - 'call_scope/model/outer/ThreeDenses/NestedDense1/MatMul/ReadVariableOp/resource', - 'call_scope/model/outer/ThreeDenses/NestedDense1/MatMul/ReadVariableOp', - 'call_scope/model/outer/ThreeDenses/NestedDense1/MatMul', - 'call_scope/model/outer/ThreeDenses/NestedDense1/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/outer/ThreeDenses/NestedDense1/BiasAdd/ReadVariableOp', - 'call_scope/model/outer/ThreeDenses/NestedDense1/BiasAdd', - 'call_scope/model/outer/ThreeDenses/inner1/inner2/NestedDense2/MatMul/ReadVariableOp/resource', - 'call_scope/model/outer/ThreeDenses/inner1/inner2/NestedDense2/MatMul/ReadVariableOp', - 'call_scope/model/outer/ThreeDenses/inner1/inner2/NestedDense2/MatMul', - 'call_scope/model/outer/ThreeDenses/inner1/inner2/NestedDense2/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/outer/ThreeDenses/inner1/inner2/NestedDense2/BiasAdd/ReadVariableOp', - 'call_scope/model/outer/ThreeDenses/inner1/inner2/NestedDense2/BiasAdd', - 
'call_scope/model/outer/ThreeDenses/NestedDense3/MatMul/ReadVariableOp/resource', - 'call_scope/model/outer/ThreeDenses/NestedDense3/MatMul/ReadVariableOp', - 'call_scope/model/outer/ThreeDenses/NestedDense3/MatMul', - 'call_scope/model/outer/ThreeDenses/NestedDense3/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/outer/ThreeDenses/NestedDense3/BiasAdd/ReadVariableOp', - 'call_scope/model/outer/ThreeDenses/NestedDense3/BiasAdd', - 'call_scope/model/OuterDense/MatMul/ReadVariableOp/resource', - 'call_scope/model/OuterDense/MatMul/ReadVariableOp', - 'call_scope/model/OuterDense/MatMul', - 'call_scope/model/OuterDense/BiasAdd/ReadVariableOp/resource', - 'call_scope/model/OuterDense/BiasAdd/ReadVariableOp', - 'call_scope/model/OuterDense/BiasAdd', 'Identity', 'NoOp' - ]) - base_layer._apply_name_scope_on_model_declaration(False) - - def _get_model_node_names(self, model, inputs, call_name_scope): - """Returns a list of model's node names.""" - - @tf.function() - def wrapper(): - with tf.name_scope(call_name_scope): - return model(inputs) - - return [ - node.name - for node in wrapper.get_concrete_function().graph.as_graph_def().node - ] + def test_name_scope_layer(self): + x = backend.placeholder(shape=(10, 10)) + layer = layers.Dense(10, name="MyName") + layer(x) + self.assertEqual(layer.bias.name, "MyName/bias:0") + self.assertEqual(layer.kernel.name, "MyName/kernel:0") + + def test_name_scope_functional_api(self): + inputs = input_layer.Input((3,)) + layer = layers.Dense(10, name="MyName") + _ = layer(inputs) + self.assertEqual(layer.bias.name, "MyName/bias:0") + self.assertEqual(layer.kernel.name, "MyName/kernel:0") + + def test_name_scope_functional_api_nested(self): + class NestedLayer(base_layer.Layer): + def __init__(self, name="OuterName"): + super().__init__(name=name) + self.dense = layers.Dense(10, name="InnerName") + + def call(self, inputs): + return self.dense(inputs) + + inputs = input_layer.Input((3,)) + layer = NestedLayer() + _ = layer(inputs) + self.assertEqual(layer.dense.bias.name, "OuterName/InnerName/bias:0") + self.assertEqual( + layer.dense.kernel.name, "OuterName/InnerName/kernel:0" + ) + + def test_name_scope_sublayer(self): + class NameScopeTracker(base_layer.Layer): + def call(self, inputs): + self.active_name_scope = tf.__internal__.get_name_scope() + return inputs + + x = backend.placeholder(shape=(10, 10)) + sublayer = NameScopeTracker(name="Sublayer") + layer = layers.Dense(10, activation=sublayer, name="MyName2") + layer(x) + self.assertEqual(layer.bias.name, "MyName2/bias:0") + self.assertEqual(layer.kernel.name, "MyName2/kernel:0") + self.assertEqual(sublayer.active_name_scope, "MyName2/Sublayer") + + def test_name_scope_tf_tensor(self): + x = tf.convert_to_tensor(np.ones((10, 10))) + layer = layers.Dense( + 10, activation=layers.ReLU(name="MyAct"), name="MyName3" + ) + layer(x) + self.assertEqual(layer.bias.name, "MyName3/bias:0") + self.assertEqual(layer.kernel.name, "MyName3/kernel:0") + + @test_utils.run_v2_only + def test_apply_name_scope_on_model_declaration(self): + if not tf.executing_eagerly(): + self.skipTest( + "`apply_name_scope_on_model_declaration` API is supported" + " only for V2 eager" + ) + + base_layer._apply_name_scope_on_model_declaration(True) + + inputs = input_layer.Input((3,)) + x = layers.Dense(10, name="Dense1")(inputs) + with tf.name_scope("outer"): + x = layers.Dense(10, name="Dense2")(x) + with tf.name_scope("inner"): + x = layers.Dense(10, name="Dense3")(x) + x = layers.Dense(10, name="Dense4")(x) + outputs = 
layers.Dense(10, name="Dense5")(x) + + model = training_lib.Model(inputs, outputs) + node_names = self._get_model_node_names( + model, np.random.random((1, 3)), "call_scope" + ) + self.assertListEqual( + node_names, + [ + "call_scope/Const", + "call_scope/model/Cast", + "call_scope/model/Dense1/MatMul/ReadVariableOp/resource", + "call_scope/model/Dense1/MatMul/ReadVariableOp", + "call_scope/model/Dense1/MatMul", + "call_scope/model/Dense1/BiasAdd/ReadVariableOp/resource", + "call_scope/model/Dense1/BiasAdd/ReadVariableOp", + "call_scope/model/Dense1/BiasAdd", + "call_scope/model/outer/Dense2/MatMul/ReadVariableOp/resource", + "call_scope/model/outer/Dense2/MatMul/ReadVariableOp", + "call_scope/model/outer/Dense2/MatMul", + "call_scope/model/outer/Dense2/BiasAdd/ReadVariableOp/resource", + "call_scope/model/outer/Dense2/BiasAdd/ReadVariableOp", + "call_scope/model/outer/Dense2/BiasAdd", + "call_scope/model/outer/inner/Dense3/MatMul/ReadVariableOp/" + "resource", + "call_scope/model/outer/inner/Dense3/MatMul/ReadVariableOp", + "call_scope/model/outer/inner/Dense3/MatMul", + "call_scope/model/outer/inner/Dense3/BiasAdd/ReadVariableOp/" + "resource", + "call_scope/model/outer/inner/Dense3/BiasAdd/ReadVariableOp", + "call_scope/model/outer/inner/Dense3/BiasAdd", + "call_scope/model/outer/Dense4/MatMul/ReadVariableOp/resource", + "call_scope/model/outer/Dense4/MatMul/ReadVariableOp", + "call_scope/model/outer/Dense4/MatMul", + "call_scope/model/outer/Dense4/BiasAdd/ReadVariableOp/resource", + "call_scope/model/outer/Dense4/BiasAdd/ReadVariableOp", + "call_scope/model/outer/Dense4/BiasAdd", + "call_scope/model/Dense5/MatMul/ReadVariableOp/resource", + "call_scope/model/Dense5/MatMul/ReadVariableOp", + "call_scope/model/Dense5/MatMul", + "call_scope/model/Dense5/BiasAdd/ReadVariableOp/resource", + "call_scope/model/Dense5/BiasAdd/ReadVariableOp", + "call_scope/model/Dense5/BiasAdd", + "Identity", + "NoOp", + ], + ) + base_layer._apply_name_scope_on_model_declaration(False) + + @test_utils.run_v2_only + def test_apply_name_scope_on_nested_layer_model_declaration(self): + if not tf.executing_eagerly(): + self.skipTest( + "`apply_name_scope_on_model_declaration` API is supported" + " only for V2 eager" + ) + + base_layer._apply_name_scope_on_model_declaration(True) + + class ThreeDenses(layers.Layer): + def __init__(self, name="ThreeDenses", **kwargs): + super().__init__(name=name, **kwargs) + self.inner_dense_1 = layers.Dense(10, name="NestedDense1") + with tf.name_scope("inner1/inner2"): + self.inner_dense_2 = layers.Dense(20, name="NestedDense2") + self.inner_dense_3 = layers.Dense(30, name="NestedDense3") + + def call(self, x): + x = self.inner_dense_1(x) + x = self.inner_dense_2(x) + x = self.inner_dense_3(x) + return x + + inputs = input_layer.Input((3,)) + with tf.name_scope("outer"): + x = ThreeDenses()(inputs) + outputs = layers.Dense(10, name="OuterDense")(x) + + model = training_lib.Model(inputs, outputs) + node_names = self._get_model_node_names( + model, np.random.random((1, 3)), "call_scope" + ) + + self.assertListEqual( + node_names, + [ + "call_scope/Const", + "call_scope/model/Cast", + "call_scope/model/outer/ThreeDenses/NestedDense1/MatMul/" + "ReadVariableOp/resource", + "call_scope/model/outer/ThreeDenses/NestedDense1/MatMul/" + "ReadVariableOp", + "call_scope/model/outer/ThreeDenses/NestedDense1/MatMul", + "call_scope/model/outer/ThreeDenses/NestedDense1/BiasAdd/" + "ReadVariableOp/resource", + "call_scope/model/outer/ThreeDenses/NestedDense1/BiasAdd/" + "ReadVariableOp", + 
"call_scope/model/outer/ThreeDenses/NestedDense1/BiasAdd", + "call_scope/model/outer/ThreeDenses/inner1/inner2/" + "NestedDense2/MatMul/ReadVariableOp/resource", + "call_scope/model/outer/ThreeDenses/inner1/inner2/" + "NestedDense2/MatMul/ReadVariableOp", + "call_scope/model/outer/ThreeDenses/inner1/inner2/" + "NestedDense2/MatMul", + "call_scope/model/outer/ThreeDenses/inner1/inner2/" + "NestedDense2/BiasAdd/ReadVariableOp/resource", + "call_scope/model/outer/ThreeDenses/inner1/inner2/" + "NestedDense2/BiasAdd/ReadVariableOp", + "call_scope/model/outer/ThreeDenses/inner1/inner2/" + "NestedDense2/BiasAdd", + "call_scope/model/outer/ThreeDenses/NestedDense3/" + "MatMul/ReadVariableOp/resource", + "call_scope/model/outer/ThreeDenses/NestedDense3/" + "MatMul/ReadVariableOp", + "call_scope/model/outer/ThreeDenses/NestedDense3/MatMul", + "call_scope/model/outer/ThreeDenses/NestedDense3/" + "BiasAdd/ReadVariableOp/resource", + "call_scope/model/outer/ThreeDenses/NestedDense3/" + "BiasAdd/ReadVariableOp", + "call_scope/model/outer/ThreeDenses/NestedDense3/BiasAdd", + "call_scope/model/OuterDense/MatMul/ReadVariableOp/resource", + "call_scope/model/OuterDense/MatMul/ReadVariableOp", + "call_scope/model/OuterDense/MatMul", + "call_scope/model/OuterDense/BiasAdd/ReadVariableOp/resource", + "call_scope/model/OuterDense/BiasAdd/ReadVariableOp", + "call_scope/model/OuterDense/BiasAdd", + "Identity", + "NoOp", + ], + ) + base_layer._apply_name_scope_on_model_declaration(False) + + def _get_model_node_names(self, model, inputs, call_name_scope): + """Returns a list of model's node names.""" + + @tf.function() + def wrapper(): + with tf.name_scope(call_name_scope): + return model(inputs) + + return [ + node.name + for node in wrapper.get_concrete_function() + .graph.as_graph_def() + .node + ] @test_utils.run_v2_only @test_combinations.generate( - test_combinations.keras_mode_combinations(mode=['eager'])) + test_combinations.keras_mode_combinations(mode=["eager"]) +) class AutographControlFlowTest(test_combinations.TestCase): - - def test_disabling_in_context_is_matched(self): - - test_obj = self - - class MyLayer(base_layer.Layer): - - def call(self, inputs, training=None): - with test_obj.assertRaisesRegex(TypeError, 'Tensor.*as.*bool'): - if tf.constant(False): - return inputs * 1. - return inputs * 0. - - @tf.function(autograph=False) - def test_fn(): - return MyLayer()(tf.constant([[1., 2., 3.]])) - - test_fn() - - def test_if_training_pattern_output(self): - - class MyLayer(base_layer.Layer): - - def call(self, inputs, training=None): - if training: - return inputs * 1. - return inputs * 0. - - inputs = input_layer.Input((3,)) - outputs = MyLayer()(inputs) - model = training_lib.Model(inputs, outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - train_loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(train_loss, 0.) - test_loss = model.test_on_batch(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(test_loss, 1.) - - def test_if_training_pattern_loss(self): - - class MyLayer(base_layer.Layer): - - def call(self, inputs, training=None): - if training: - loss = tf.reduce_sum(inputs) - else: - loss = 0. 
- self.add_loss(loss) - return inputs - - inputs = input_layer.Input((3,)) - outputs = MyLayer()(inputs) - model = training_lib.Model(inputs, outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - train_loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(train_loss, 2 * 3) - test_loss = model.test_on_batch(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(test_loss, 0) - - def test_if_training_pattern_metric(self): - - class MyLayer(base_layer.Layer): - - def call(self, inputs, training=None): - if training: - metric = tf.reduce_sum(inputs) - else: - metric = 0. - self.add_metric(metric, name='my_metric', aggregation='mean') - return inputs - - inputs = input_layer.Input((3,)) - outputs = MyLayer()(inputs) - model = training_lib.Model(inputs, outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - for _ in range(3): - _, train_metric = model.train_on_batch(np.ones((2, 3)), - np.ones((2, 3))) - - self.assertEqual(train_metric, 2 * 3) - _, test_metric = model.test_on_batch(np.ones((2, 3)), - np.ones((2, 3))) - self.assertEqual(test_metric, 0) - - def test_if_training_pattern_update(self): - - class MyLayer(base_layer.Layer): - - def build(self, input_shape): - self.counter = self.add_weight( - shape=(), trainable=False, initializer='zeros') - - def call(self, inputs, training=None): - if training: - increment = 1. - else: - increment = 0. - self.counter.assign_add(increment) - return inputs - - inputs = input_layer.Input((3,)) - layer = MyLayer() - outputs = layer(inputs) - model = training_lib.Model(inputs, outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(backend.get_value(layer.counter), 1.) 
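These autograph tests exercise the `training`-conditional pattern in `Layer.call`: Keras runs `call` through AutoGraph, so a plain Python `if training:` also works when `training` arrives as a symbolic tensor. A minimal standalone sketch of the pattern (illustrative only, not part of this patch; the layer name and values are assumptions):

import numpy as np
import tensorflow as tf

class ZeroInTraining(tf.keras.layers.Layer):
    """Passes inputs through at inference time, zeroes them in training."""

    def call(self, inputs, training=None):
        # AutoGraph converts this Python conditional, so it also works
        # when `training` is a symbolic tensor inside `fit()`.
        if training:
            return inputs * 0.0
        return inputs

inputs = tf.keras.Input((3,))
outputs = ZeroInTraining()(inputs)
model = tf.keras.Model(inputs, outputs)
model.compile("sgd", "mse")
# Targets equal the inputs, so zeroing in training gives MSE 1.0, while
# evaluation (training=False) passes inputs through and gives MSE 0.0.
train_loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3)))
test_loss = model.test_on_batch(np.ones((2, 3)), np.ones((2, 3)))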
- - def test_conditional_losses_in_call(self): - - class MyLayer(base_layer.Layer): - - def __init__(self): - super().__init__(dynamic=test_utils.should_run_eagerly()) - - def call(self, inputs, training=None): - if training: - self.add_loss(tf.reduce_sum(inputs)) - return inputs - - def compute_output_shape(self, input_shape): - return input_shape - - inputs = input_layer.Input((3,)) - layer = MyLayer() - outputs = layer(inputs) - model = training_lib.Model(inputs, outputs) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(loss, 2 * 3) - - def test_conditional_callable_losses(self): - model = sequential.Sequential([ - layers.Dense( - 1, kernel_regularizer=regularizers.l2(1e-4), input_shape=(1,)) - ]) - model._run_eagerly = test_utils.should_run_eagerly() - - def assert_graph(t): - if not tf.executing_eagerly(): - self.assertEqual(t.graph, tf.compat.v1.get_default_graph()) - - @tf.function - def get_losses(t): - if t < 0: - return tf.reduce_sum(model.losses) * t - else: - return tf.reduce_sum(model.losses) - - assert_graph(get_losses(tf.constant(2.))) - assert_graph(get_losses(tf.constant(0.5))) - - def test_conditional_metrics_in_call(self): - - class MyLayer(base_layer.Layer): - - def __init__(self): - super().__init__(dynamic=test_utils.should_run_eagerly()) - - def call(self, inputs, training=None): - if training: - self.add_metric(tf.reduce_sum(inputs), - name='sum', - aggregation='mean') - return inputs - - def compute_output_shape(self, input_shape): - return input_shape - - inputs = input_layer.Input((3,)) - layer = MyLayer() - outputs = layer(inputs) - model = training_lib.Model(inputs, outputs) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(history.history['sum'][-1], 2 * 3) - - def test_conditional_activity_regularizer_in_call(self): - - class TestModel(training_lib.Model): - - def __init__(self): - super().__init__( - name='test_model', dynamic=test_utils.should_run_eagerly()) - self.layer = layers.Dense(2, activity_regularizer='l2') - - def call(self, x, training=None): - if tf.greater(tf.reduce_sum(x), 0.0): - return self.layer(x) + def test_disabling_in_context_is_matched(self): + + test_obj = self + + class MyLayer(base_layer.Layer): + def call(self, inputs, training=None): + with test_obj.assertRaisesRegex(TypeError, "Tensor.*as.*bool"): + if tf.constant(False): + return inputs * 1.0 + return inputs * 0.0 + + @tf.function(autograph=False) + def test_fn(): + return MyLayer()(tf.constant([[1.0, 2.0, 3.0]])) + + test_fn() + + def test_if_training_pattern_output(self): + class MyLayer(base_layer.Layer): + def call(self, inputs, training=None): + if training: + return inputs * 1.0 + return inputs * 0.0 + + inputs = input_layer.Input((3,)) + outputs = MyLayer()(inputs) + model = training_lib.Model(inputs, outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + train_loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(train_loss, 0.0) + test_loss = model.test_on_batch(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(test_loss, 1.0) + + def test_if_training_pattern_loss(self): + class MyLayer(base_layer.Layer): + def call(self, inputs, training=None): + if training: + loss = tf.reduce_sum(inputs) + else: + loss = 0.0 + self.add_loss(loss) + return inputs + + inputs = input_layer.Input((3,)) + outputs = 
MyLayer()(inputs) + model = training_lib.Model(inputs, outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + train_loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(train_loss, 2 * 3) + test_loss = model.test_on_batch(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(test_loss, 0) + + def test_if_training_pattern_metric(self): + class MyLayer(base_layer.Layer): + def call(self, inputs, training=None): + if training: + metric = tf.reduce_sum(inputs) + else: + metric = 0.0 + self.add_metric(metric, name="my_metric", aggregation="mean") + return inputs + + inputs = input_layer.Input((3,)) + outputs = MyLayer()(inputs) + model = training_lib.Model(inputs, outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + for _ in range(3): + _, train_metric = model.train_on_batch( + np.ones((2, 3)), np.ones((2, 3)) + ) + + self.assertEqual(train_metric, 2 * 3) + _, test_metric = model.test_on_batch( + np.ones((2, 3)), np.ones((2, 3)) + ) + self.assertEqual(test_metric, 0) + + def test_if_training_pattern_update(self): + class MyLayer(base_layer.Layer): + def build(self, input_shape): + self.counter = self.add_weight( + shape=(), trainable=False, initializer="zeros" + ) + + def call(self, inputs, training=None): + if training: + increment = 1.0 + else: + increment = 0.0 + self.counter.assign_add(increment) + return inputs + + inputs = input_layer.Input((3,)) + layer = MyLayer() + outputs = layer(inputs) + model = training_lib.Model(inputs, outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(backend.get_value(layer.counter), 1.0) + + def test_conditional_losses_in_call(self): + class MyLayer(base_layer.Layer): + def __init__(self): + super().__init__(dynamic=test_utils.should_run_eagerly()) + + def call(self, inputs, training=None): + if training: + self.add_loss(tf.reduce_sum(inputs)) + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + inputs = input_layer.Input((3,)) + layer = MyLayer() + outputs = layer(inputs) + model = training_lib.Model(inputs, outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(loss, 2 * 3) + + def test_conditional_callable_losses(self): + model = sequential.Sequential( + [ + layers.Dense( + 1, + kernel_regularizer=regularizers.l2(1e-4), + input_shape=(1,), + ) + ] + ) + model._run_eagerly = test_utils.should_run_eagerly() + + def assert_graph(t): + if not tf.executing_eagerly(): + self.assertEqual(t.graph, tf.compat.v1.get_default_graph()) + + @tf.function + def get_losses(t): + if t < 0: + return tf.reduce_sum(model.losses) * t + else: + return tf.reduce_sum(model.losses) + + assert_graph(get_losses(tf.constant(2.0))) + assert_graph(get_losses(tf.constant(0.5))) + + def test_conditional_metrics_in_call(self): + class MyLayer(base_layer.Layer): + def __init__(self): + super().__init__(dynamic=test_utils.should_run_eagerly()) + + def call(self, inputs, training=None): + if training: + self.add_metric( + tf.reduce_sum(inputs), name="sum", aggregation="mean" + ) + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + inputs = input_layer.Input((3,)) + layer = MyLayer() + outputs = layer(inputs) + model = training_lib.Model(inputs, outputs) + model.compile("sgd", "mse", 
run_eagerly=test_utils.should_run_eagerly()) + history = model.fit(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(history.history["sum"][-1], 2 * 3) + + def test_conditional_activity_regularizer_in_call(self): + class TestModel(training_lib.Model): + def __init__(self): + super().__init__( + name="test_model", dynamic=test_utils.should_run_eagerly() + ) + self.layer = layers.Dense(2, activity_regularizer="l2") + + def call(self, x, training=None): + if tf.greater(tf.reduce_sum(x), 0.0): + return self.layer(x) + else: + return self.layer(x) + + model = TestModel() + model.compile( + loss="mse", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + + if test_utils.should_run_eagerly(): + model.fit(x, y, epochs=2, batch_size=5) else: - return self.layer(x) - - model = TestModel() - model.compile( - loss='mse', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - - if test_utils.should_run_eagerly(): - model.fit(x, y, epochs=2, batch_size=5) - else: - with self.assertRaisesRegex(ValueError, 'ActivityRegularizer'): - model.fit(x, y, epochs=2, batch_size=5) - - def test_conditional_activity_regularizer_with_wrappers_in_call(self): - - class TestModel(training_lib.Model): - - def __init__(self): - super().__init__( - name='test_model', dynamic=test_utils.should_run_eagerly()) - self.layer = layers.TimeDistributed( - layers.Dense(2, activity_regularizer='l2'), input_shape=(3, 4)) - - def call(self, x, training=None): - if tf.greater(tf.reduce_sum(x), 0.0): - return self.layer(x) + with self.assertRaisesRegex(ValueError, "ActivityRegularizer"): + model.fit(x, y, epochs=2, batch_size=5) + + def test_conditional_activity_regularizer_with_wrappers_in_call(self): + class TestModel(training_lib.Model): + def __init__(self): + super().__init__( + name="test_model", dynamic=test_utils.should_run_eagerly() + ) + self.layer = layers.TimeDistributed( + layers.Dense(2, activity_regularizer="l2"), + input_shape=(3, 4), + ) + + def call(self, x, training=None): + if tf.greater(tf.reduce_sum(x), 0.0): + return self.layer(x) + else: + return self.layer(x) + + model = TestModel() + model.compile( + loss="mse", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones(shape=(10, 3, 4)) + y = np.ones(shape=(10, 3, 2)) + + if test_utils.should_run_eagerly(): + model.fit(x, y, epochs=2, batch_size=5) else: - return self.layer(x) - - model = TestModel() - model.compile( - loss='mse', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones(shape=(10, 3, 4)) - y = np.ones(shape=(10, 3, 2)) - - if test_utils.should_run_eagerly(): - model.fit(x, y, epochs=2, batch_size=5) - else: - with self.assertRaisesRegex(ValueError, 'ActivityRegularizer'): - model.fit(x, y, epochs=2, batch_size=5) + with self.assertRaisesRegex(ValueError, "ActivityRegularizer"): + model.fit(x, y, epochs=2, batch_size=5) class AddLayer(base_layer.Layer): - """A layer which adds its input to a variable. + """A layer which adds its input to a variable. 
- Useful for testing a layer with a variable - """ + Useful for testing a layer with a variable + """ - def build(self, _): - self.v = self.add_weight('v', (), initializer='ones') - self.built = True + def build(self, _): + self.v = self.add_weight("v", (), initializer="ones") + self.built = True - def call(self, inputs): - return inputs + self.v + def call(self, inputs): + return inputs + self.v class IdentityLayer(base_layer.Layer): - """A layer that returns its input. + """A layer that returns its input. - Useful for testing a layer without a variable. - """ + Useful for testing a layer without a variable. + """ - def call(self, inputs): - return inputs + def call(self, inputs): + return inputs @test_utils.run_v2_only -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class DTypeTest(test_combinations.TestCase): - - def _const(self, dtype): - return tf.constant(1, dtype=dtype) - - @test_utils.enable_v2_dtype_behavior - def test_dtype_defaults_to_floatx(self): - layer = AddLayer() - self.assertEqual(layer.dtype, 'float32') - layer(self._const('float64')) - self.assertEqual(layer.dtype, 'float32') # dtype should not change - - try: - backend.set_floatx('float64') - layer = AddLayer() - self.assertEqual(layer.dtype, 'float64') - finally: - backend.set_floatx('float32') - - @test_utils.enable_v2_dtype_behavior - def test_passing_dtype_to_constructor(self): - layer = IdentityLayer(dtype='float64') - layer(self._const('float32')) - self.assertEqual(layer.dtype, 'float64') - - layer = IdentityLayer(dtype='int32') - layer(self._const('float32')) - self.assertEqual(layer.dtype, 'int32') - - layer = IdentityLayer(dtype=tf.float64) - layer(self._const('float32')) - self.assertEqual(layer.dtype, 'float64') - - @test_utils.enable_v2_dtype_behavior - def input_cast_to_dtype(self): - layer = AddLayer() - - # Input should be cast to layer.dtype, so output should also be layer.dtype - self.assertEqual(layer(self._const('float64')).dtype, 'float32') - - layer = AddLayer(dtype='float64') - self.assertEqual(layer(self._const('float32')).dtype, 'float64') - - # Test inputs are not casted if layer.dtype is not floating-point - layer = IdentityLayer(dtype='int32') - self.assertEqual(layer(self._const('float64')).dtype, 'float64') - - # Test inputs are not casted if the inputs are not floating-point - layer = IdentityLayer(dtype='float32') - self.assertEqual(layer(self._const('int32')).dtype, 'int32') - - # Test Numpy arrays are casted - layer = IdentityLayer(dtype='float64') - self.assertEqual(layer(np.array(1, dtype='float32')).dtype, 'float64') - - # Test Python floats are casted - layer = IdentityLayer(dtype='float64') - self.assertEqual(layer(1.).dtype, 'float64') - - @test_utils.enable_v2_dtype_behavior - def multiple_inputs_cast_to_dtype(self): - - class MultiIdentityLayer(base_layer.Layer): - - def call(self, inputs): - return [tf.identity(x) for x in inputs] - - # Testing layer with default dtype of float32 - layer = MultiIdentityLayer() - x, y = layer([self._const('float16'), self._const('float32')]) - self.assertEqual(x.dtype, 'float32') - self.assertEqual(y.dtype, 'float32') - - # Test passing dtype to the constructor - layer = MultiIdentityLayer(dtype='float64') - x, y = layer([self._const('float16'), self._const('float32')]) - self.assertEqual(x.dtype, 'float64') - self.assertEqual(y.dtype, 'float64') - - # Test several non-floating point types - layer = 
MultiIdentityLayer(dtype='float64') - x, y, z, w = layer([self._const('float16'), self._const('bool'), - self._const('float64'), self._constant('complex64')]) - self.assertEqual(x.dtype, 'float64') - self.assertEqual(y.dtype, 'bool') - self.assertEqual(z.dtype, 'float64') - self.assertEqual(w.dtype, 'complex64') - - @test_utils.enable_v2_dtype_behavior - def test_extra_args_and_kwargs_not_casted(self): - - class IdentityLayerWithArgs(base_layer.Layer): - - def call(self, inputs, *args, **kwargs): - kwargs.pop('training', None) - return tf.nest.flatten([inputs, args, kwargs]) - - layer = IdentityLayerWithArgs(dtype='float64') - x, y, z = layer(self._const('float16'), self._const('float16'), - kwarg=self._const('float16')) - self.assertEqual(x.dtype, 'float64') - self.assertEqual(y.dtype, 'float16') - self.assertEqual(z.dtype, 'float16') - - @test_utils.enable_v2_dtype_behavior - def test_layer_without_autocast(self): - - class IdentityLayerWithoutAutocast(IdentityLayer): - - def __init__(self, *args, **kwargs): - kwargs['autocast'] = False - super().__init__(*args, **kwargs) - - layer = IdentityLayerWithoutAutocast(dtype='float64') - self.assertEqual(layer(self._const('float32')).dtype, 'float32') - - @test_utils.enable_v2_dtype_behavior - def test_compute_output_signature(self): - - class IdentityLayerWithOutputShape(IdentityLayer): - - def compute_output_shape(self, input_shape): - return input_shape - - layer = IdentityLayerWithOutputShape(dtype='float64') - output_signature = layer.compute_output_signature( - tf.TensorSpec(shape=(), dtype='float32')) - self.assertEqual(output_signature.shape, ()) - self.assertEqual(output_signature.dtype, 'float64') - - @test_utils.enable_v2_dtype_behavior - def test_composite_tensors_input_casting(self): - sparse = tf.SparseTensor( - indices=tf.constant([[0, 1], [2, 3]], dtype='int64'), - values=tf.constant([0., 1.], dtype='float32'), - dense_shape=tf.constant([4, 4], dtype='int64')) - ragged = tf.RaggedTensor.from_row_splits( - values=tf.constant([1., 2., 3.], dtype='float32'), - row_splits=tf.constant([0, 2, 2, 3], dtype='int64')) - - layer = IdentityLayer(dtype='float16') - - for x in sparse, ragged: - self.assertEqual(x.dtype, 'float32') - y = layer(x) - self.assertEqual(y.dtype, 'float16') - self.assertEqual(type(x), type(y)) - - @test_utils.enable_v2_dtype_behavior - def test_passing_non_tensor(self): - layer = IdentityLayer() - x = object() - y = layer(x) # Layer should not cast 'x', as it's not a tensor - self.assertIs(x, y) - - @test_utils.disable_v2_dtype_behavior - def test_v1_behavior(self): - # Test dtype defaults to None and inferred from input - layer = IdentityLayer() - self.assertIsNone(layer.dtype) - layer(self._const('float64')) - self.assertEqual(layer.dtype, 'float64') - - # Test layer does not cast to dtype - self.assertEqual(layer(self._const('float32')).dtype, 'float32') - - -if __name__ == '__main__': - tf.test.main() + def _const(self, dtype): + return tf.constant(1, dtype=dtype) + + @test_utils.enable_v2_dtype_behavior + def test_dtype_defaults_to_floatx(self): + layer = AddLayer() + self.assertEqual(layer.dtype, "float32") + layer(self._const("float64")) + self.assertEqual(layer.dtype, "float32") # dtype should not change + + try: + backend.set_floatx("float64") + layer = AddLayer() + self.assertEqual(layer.dtype, "float64") + finally: + backend.set_floatx("float32") + + @test_utils.enable_v2_dtype_behavior + def test_passing_dtype_to_constructor(self): + layer = IdentityLayer(dtype="float64") + layer(self._const("float32")) 
+ self.assertEqual(layer.dtype, "float64") + + layer = IdentityLayer(dtype="int32") + layer(self._const("float32")) + self.assertEqual(layer.dtype, "int32") + + layer = IdentityLayer(dtype=tf.float64) + layer(self._const("float32")) + self.assertEqual(layer.dtype, "float64") + + @test_utils.enable_v2_dtype_behavior + def input_cast_to_dtype(self): + layer = AddLayer() + + # Input should be cast to layer.dtype, so output should also be + # layer.dtype + self.assertEqual(layer(self._const("float64")).dtype, "float32") + + layer = AddLayer(dtype="float64") + self.assertEqual(layer(self._const("float32")).dtype, "float64") + + # Test inputs are not casted if layer.dtype is not floating-point + layer = IdentityLayer(dtype="int32") + self.assertEqual(layer(self._const("float64")).dtype, "float64") + + # Test inputs are not casted if the inputs are not floating-point + layer = IdentityLayer(dtype="float32") + self.assertEqual(layer(self._const("int32")).dtype, "int32") + + # Test Numpy arrays are casted + layer = IdentityLayer(dtype="float64") + self.assertEqual(layer(np.array(1, dtype="float32")).dtype, "float64") + + # Test Python floats are casted + layer = IdentityLayer(dtype="float64") + self.assertEqual(layer(1.0).dtype, "float64") + + @test_utils.enable_v2_dtype_behavior + def multiple_inputs_cast_to_dtype(self): + class MultiIdentityLayer(base_layer.Layer): + def call(self, inputs): + return [tf.identity(x) for x in inputs] + + # Testing layer with default dtype of float32 + layer = MultiIdentityLayer() + x, y = layer([self._const("float16"), self._const("float32")]) + self.assertEqual(x.dtype, "float32") + self.assertEqual(y.dtype, "float32") + + # Test passing dtype to the constructor + layer = MultiIdentityLayer(dtype="float64") + x, y = layer([self._const("float16"), self._const("float32")]) + self.assertEqual(x.dtype, "float64") + self.assertEqual(y.dtype, "float64") + + # Test several non-floating point types + layer = MultiIdentityLayer(dtype="float64") + x, y, z, w = layer( + [ + self._const("float16"), + self._const("bool"), + self._const("float64"), + self._const("complex64"), + ] + ) + self.assertEqual(x.dtype, "float64") + self.assertEqual(y.dtype, "bool") + self.assertEqual(z.dtype, "float64") + self.assertEqual(w.dtype, "complex64") + + @test_utils.enable_v2_dtype_behavior + def test_extra_args_and_kwargs_not_casted(self): + class IdentityLayerWithArgs(base_layer.Layer): + def call(self, inputs, *args, **kwargs): + kwargs.pop("training", None) + return tf.nest.flatten([inputs, args, kwargs]) + + layer = IdentityLayerWithArgs(dtype="float64") + x, y, z = layer( + self._const("float16"), + self._const("float16"), + kwarg=self._const("float16"), + ) + self.assertEqual(x.dtype, "float64") + self.assertEqual(y.dtype, "float16") + self.assertEqual(z.dtype, "float16") + + @test_utils.enable_v2_dtype_behavior + def test_layer_without_autocast(self): + class IdentityLayerWithoutAutocast(IdentityLayer): + def __init__(self, *args, **kwargs): + kwargs["autocast"] = False + super().__init__(*args, **kwargs) + + layer = IdentityLayerWithoutAutocast(dtype="float64") + self.assertEqual(layer(self._const("float32")).dtype, "float32") + + @test_utils.enable_v2_dtype_behavior + def test_compute_output_signature(self): + class IdentityLayerWithOutputShape(IdentityLayer): + def compute_output_shape(self, input_shape): + return input_shape + + layer = IdentityLayerWithOutputShape(dtype="float64") + output_signature = layer.compute_output_signature( + tf.TensorSpec(shape=(), dtype="float32") +
) + self.assertEqual(output_signature.shape, ()) + self.assertEqual(output_signature.dtype, "float64") + + @test_utils.enable_v2_dtype_behavior + def test_composite_tensors_input_casting(self): + sparse = tf.SparseTensor( + indices=tf.constant([[0, 1], [2, 3]], dtype="int64"), + values=tf.constant([0.0, 1.0], dtype="float32"), + dense_shape=tf.constant([4, 4], dtype="int64"), + ) + ragged = tf.RaggedTensor.from_row_splits( + values=tf.constant([1.0, 2.0, 3.0], dtype="float32"), + row_splits=tf.constant([0, 2, 2, 3], dtype="int64"), + ) + + layer = IdentityLayer(dtype="float16") + + for x in sparse, ragged: + self.assertEqual(x.dtype, "float32") + y = layer(x) + self.assertEqual(y.dtype, "float16") + self.assertEqual(type(x), type(y)) + + @test_utils.enable_v2_dtype_behavior + def test_passing_non_tensor(self): + layer = IdentityLayer() + x = object() + y = layer(x) # Layer should not cast 'x', as it's not a tensor + self.assertIs(x, y) + + @test_utils.disable_v2_dtype_behavior + def test_v1_behavior(self): + # Test dtype defaults to None and inferred from input + layer = IdentityLayer() + self.assertIsNone(layer.dtype) + layer(self._const("float64")) + self.assertEqual(layer.dtype, "float64") + + # Test layer does not cast to dtype + self.assertEqual(layer(self._const("float32")).dtype, "float32") + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/base_layer_utils.py b/keras/engine/base_layer_utils.py index 8234e105bfc8..8e3de3d4df2e 100644 --- a/keras/engine/base_layer_utils.py +++ b/keras/engine/base_layer_utils.py @@ -14,188 +14,202 @@ # ============================================================================== """Contains private utilities used mainly by the base Layer class.""" -import tensorflow.compat.v2 as tf - import functools import threading + +import tensorflow.compat.v1 as tf1 +import tensorflow.compat.v2 as tf + from keras import backend from keras.dtensor import dtensor_api as dtensor from keras.utils import control_flow_util from keras.utils import tf_inspect from keras.utils import tf_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export _call_context = threading.local() def create_mean_metric(value, name=None): - # import keras will import base_layer and then this module, and metric relies - # on base_layer, which result into a cyclic dependency. - from keras import metrics as metrics_module # pylint: disable=g-import-not-at-top - metric_obj = metrics_module.Mean(name=name, dtype=value.dtype) - return metric_obj, metric_obj(value) - - -def make_variable(name, - shape=None, - dtype=tf.float32, - initializer=None, - trainable=None, - caching_device=None, - validate_shape=True, - constraint=None, - use_resource=None, - collections=None, - synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.VariableAggregation.NONE, - partitioner=None, # pylint: disable=unused-argument - layout=None): - """Temporary util to create a variable (relies on `variable_scope.variable`). - - Some reuse-related technicalities prevent us from using - `variable_scope.get_variable()` directly, so we use a subcomponent - that has fewer constraints (`variable_scope.variable()`). - - In the longer term, it seems like a similar "default variable creator" method - should exist in `Trackable` instead. When this happens, we can get - rid of this temporary solution. - - TODO(fchollet): remove this method when no longer needed. - - Args: - name: Variable name. - shape: Variable shape. - dtype: The type of the variable. 
Defaults to `self.dtype` or `float32`. - initializer: Initializer instance (callable). - trainable: Whether the variable should be part of the layer's - "trainable_variables" (e.g. variables, biases) - or "non_trainable_variables" (e.g. BatchNorm mean, stddev). - Note, if the current variable scope is marked as non-trainable - then this parameter is ignored and any added variables are also - marked as non-trainable. `trainable` defaults to `True` unless - `synchronization` is set to `ON_READ`. - caching_device: Passed to `tf.Variable`. - validate_shape: Passed to `tf.Variable`. - constraint: Constraint instance (callable). - use_resource: Whether to use a `ResourceVariable`. - collections: List of graph collections keys. The new variable is added to - these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. - synchronization: Indicates when a distributed a variable will be - aggregated. Accepted values are constants defined in the class - `tf.VariableSynchronization`. By default the synchronization is set to - `AUTO` and the current `DistributionStrategy` chooses - when to synchronize. If `synchronization` is set to `ON_READ`, - `trainable` must not be set to `True`. - aggregation: Indicates how a distributed variable will be aggregated. - Accepted values are constants defined in the class - `tf.VariableAggregation`. - partitioner: Not handled at this time. - layout: the optional DTensor layout, used for creating DVariable. - - Returns: - Variable instance. - """ - initializing_from_value = False - if initializer is not None and not callable(initializer): - initializing_from_value = True - - if initializing_from_value: - init_val = initializer - variable_dtype = None - else: - # Instantiate initializer if provided initializer is a type object. - if tf_inspect.isclass(initializer): - initializer = initializer() - if layout: - init_val = functools.partial(initializer, shape, dtype=dtype, - layout=layout) + # import keras will import base_layer and then this module, and metric + # relies on base_layer, which results in a cyclic dependency. + from keras import metrics as metrics_module + + metric_obj = metrics_module.Mean(name=name, dtype=value.dtype) + return metric_obj, metric_obj(value) + + +def infer_init_val_and_dtype(initializer, dtype, shape, layout=None): + if initializer is not None and not callable(initializer): + init_val = initializer + variable_dtype = None else: - init_val = functools.partial(initializer, shape, dtype=dtype) - variable_dtype = dtype.base_dtype - - variable_shape = tf.TensorShape(shape) - - if use_resource is None: - use_resource = True - - if layout is None: - # In theory, in `use_resource` is True and `collections` is empty - # (that is to say, in TF2), we can use tf.Variable. - # However, this breaks legacy (Estimator) checkpoints because - # it changes variable names. Remove this when V1 is fully deprecated.
- return tf.compat.v1.Variable( - initial_value=init_val, - name=name, - trainable=trainable, - caching_device=caching_device, - dtype=variable_dtype, - validate_shape=validate_shape, - constraint=constraint, - use_resource=use_resource, - collections=collections, - synchronization=synchronization, - aggregation=aggregation, - shape=variable_shape if variable_shape else None) - else: - return dtensor.DVariable( - initial_value=init_val, - name=name, - trainable=trainable, - caching_device=caching_device, - dtype=variable_dtype, - validate_shape=validate_shape, - constraint=constraint, - collections=collections, - synchronization=synchronization, - aggregation=aggregation, - shape=variable_shape if variable_shape else None) + # Instantiate initializer if provided initializer is a type object. + if tf_inspect.isclass(initializer): + initializer = initializer() + if layout: + init_val = functools.partial( + initializer, shape, dtype=dtype, layout=layout + ) + else: + init_val = functools.partial(initializer, shape, dtype=dtype) + variable_dtype = dtype.base_dtype + return init_val, variable_dtype + + +def make_variable( + name, + shape=None, + dtype=tf.float32, + initializer=None, + trainable=None, + caching_device=None, + validate_shape=True, + constraint=None, + use_resource=None, + collections=None, + synchronization=tf.VariableSynchronization.AUTO, + aggregation=tf.VariableAggregation.NONE, + partitioner=None, + layout=None, + experimental_enable_variable_lifting=True, +): + """Util to create a variable (relies on `variable_scope.variable`). + + Some reuse-related technicalities prevent us from using + `variable_scope.get_variable()` directly, so we use a subcomponent + that has fewer constraints (`variable_scope.variable()`). + + In the longer term, it seems like a similar "default variable creator" + method should exist in `Trackable` instead. When this happens, we can get + rid of this temporary solution. + + TODO(fchollet): remove this method when no longer needed. + + Args: + name: Variable name. + shape: Variable shape. + dtype: The type of the variable. Defaults to `self.dtype` or `float32`. + initializer: Initializer instance (callable). + trainable: Whether the variable should be part of the layer's + "trainable_variables" (e.g. variables, biases) + or "non_trainable_variables" (e.g. BatchNorm mean, stddev). + Note, if the current variable scope is marked as non-trainable + then this parameter is ignored and any added variables are also + marked as non-trainable. `trainable` becomes `True` unless + `synchronization` is set to `ON_READ`. Defaults to `None`. + caching_device: Passed to `tf.Variable`. + validate_shape: Passed to `tf.Variable`. + constraint: Constraint instance (callable). + use_resource: Whether to use a `ResourceVariable`. + collections: List of graph collections keys. The new variable is added to + these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. + synchronization: Indicates when a distributed variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. By default the synchronization is set to + `AUTO` and the current `DistributionStrategy` chooses + when to synchronize. If `synchronization` is set to `ON_READ`, + `trainable` must not be set to `True`. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + partitioner: Not handled at this time.
+ layout: the optional DTensor layout, used for creating DVariable. + + Returns: + Variable instance. + """ + init_val, variable_dtype = infer_init_val_and_dtype( + initializer, dtype, shape, layout + ) + variable_shape = tf.TensorShape(shape) + + if use_resource is None: + use_resource = True + + if layout is None: + # In theory, if `use_resource` is True and `collections` is empty + # (that is to say, in TF2), we can use tf.Variable. + # However, this breaks legacy (Estimator) checkpoints because + # it changes variable names. Remove this when V1 is fully deprecated. + return tf1.Variable( + initial_value=init_val, + name=name, + trainable=trainable, + caching_device=caching_device, + dtype=variable_dtype, + validate_shape=validate_shape, + constraint=constraint, + use_resource=use_resource, + collections=collections, + synchronization=synchronization, + aggregation=aggregation, + shape=variable_shape if variable_shape else None, + experimental_enable_variable_lifting=experimental_enable_variable_lifting, # noqa: E501 + ) + else: + return dtensor.DVariable( + initial_value=init_val, + name=name, + trainable=trainable, + caching_device=caching_device, + dtype=variable_dtype, + validate_shape=validate_shape, + constraint=constraint, + collections=collections, + synchronization=synchronization, + aggregation=aggregation, + shape=variable_shape if variable_shape else None, + ) def collect_previous_mask(input_tensors): - """Retrieves the output mask(s) of the previous node. + """Retrieves the output mask(s) of the previous node. - Args: - input_tensors: An arbitrary structure of Tensors. + Args: + input_tensors: An arbitrary structure of Tensors. - Returns: - A mask tensor or list of mask tensors. - """ + Returns: + A mask tensor or list of mask tensors. + """ - def _collect_previous_mask(x): - return getattr(x, '_keras_mask', None) + def _collect_previous_mask(x): + return getattr(x, "_keras_mask", None) - return tf.nest.map_structure(_collect_previous_mask, input_tensors) + return tf.nest.map_structure(_collect_previous_mask, input_tensors) def have_all_keras_metadata(tensors): - return all(hasattr(x, '_keras_history') for x in tf.nest.flatten(tensors)) + return all(hasattr(x, "_keras_history") for x in tf.nest.flatten(tensors)) def generate_placeholders_from_shape(shape): - return tf.compat.v1.placeholder(shape=shape, dtype=backend.floatx()) + return tf1.placeholder(shape=shape, dtype=backend.floatx()) def create_keras_history(tensors): - """Wraps TensorFlow Operations for compatibility with the Functional API. + """Wraps TensorFlow Operations for compatibility with the Functional API. - This method checks to see if a Tensor in `tensors` is missing Keras metadata - and has its origin in a Keras `Input` Layer. If so, this method will replace - the raw TensorFlow Operations that created this tensor with - `TensorFlowOpLayer` instances that create identical operations. + This method checks to see if a Tensor in `tensors` is missing Keras metadata + and has its origin in a Keras `Input` Layer. If so, this method will replace + the raw TensorFlow Operations that created this tensor with + `TensorFlowOpLayer` instances that create identical operations. - Any Tensors not originating from a Keras `Input` Layer will be treated as - constants when constructing `TensorFlowOpLayer` instances. + Any Tensors not originating from a Keras `Input` Layer will be treated as + constants when constructing `TensorFlowOpLayer` instances.
- Args: - tensors: A structure of Tensors, some of which come from raw TensorFlow - operations and need to have Keras metadata assigned to them. + Args: + tensors: A structure of Tensors, some of which come from raw TensorFlow + operations and need to have Keras metadata assigned to them. - Returns: - created_layers: List. The `TensorFlowOpLayer` instances created to wrap - the raw Tensorflow operations. - """ - _, created_layers = _create_keras_history_helper(tensors, set(), []) - return created_layers + Returns: + created_layers: List. The `TensorFlowOpLayer` instances created to wrap + the raw Tensorflow operations. + """ + _, created_layers = _create_keras_history_helper(tensors, set(), []) + return created_layers # Unsafe Internal attribute. @@ -212,232 +226,245 @@ def create_keras_history(tensors): def _create_keras_history_helper(tensors, processed_ops, created_layers): - """Helper method for `create_keras_history`. - - Args: - tensors: A structure of Tensors for which to create Keras metadata. - processed_ops: Set. TensorFlow operations that have already been wrapped in - `TensorFlowOpLayer` instances. - created_layers: List. The `TensorFlowOpLayer` instances created. - - Returns: - Tuple. First element is the updated set of TensorFlow Operations that - have been wrapped in `TensorFlowOpLayer` instances. Second element is - a list of the `TensorFlowOpLayer` instances created. - """ - if tf.compat.v1.executing_eagerly_outside_functions(): - raise ValueError( - '`create_keras_history` should only be called if eager is disabled!') - # Import of `base_layer` needed in order to create `TensorFlowOpLayer`. - # Cannot be imported at top because of circular dependencies. - # TODO(omalleyt): Resolve circular dependency. - from keras.engine import base_layer # pylint: disable=g-import-not-at-top - tensor_list = tf.nest.flatten(tensors) - sparse_ops = [] - ragged_tensors = [] - for tensor in tensor_list: - if getattr(tensor, '_keras_history', None) is not None: - continue - if isinstance( - tensor, (tf.SparseTensor, tf.compat.v1.SparseTensorValue)): - sparse_ops.append(tensor.op) - continue - if tf_utils.is_ragged(tensor): - # Ragged tensors don't have an op property - ragged_tensors.append(tensor) - continue - op = tensor.op # The Op that created this Tensor. - if op not in processed_ops: - # Recursively set `_keras_history`. - op_inputs = list(op.inputs) - constants = {} - layer_inputs = [] - for i, op_input in enumerate(op_inputs): - if uses_keras_history(op_input): - layer_inputs.append(op_input) - else: - # Treat any value not originating from a `keras.Input` as - # a constant. Variables cannot be supported. - ds_with_session = ( - tf.distribute.in_cross_replica_context() and - not tf.compat.v1.executing_eagerly_outside_functions()) - using_xla = control_flow_util.GraphOrParentsInXlaContext( - tf.compat.v1.get_default_graph()) - if ds_with_session or using_xla or _UNSAFE_GRAPH_OP_LAYER_CREATION: - # In Legacy Graph mode, evaluating here makes Session be - # configured improperly. The downside of this is that saving - # via `get_config` breaks, but SavedModel still works. 
- constants[i] = op_input - else: - with tf.init_scope(): - constants[i] = backend.function([], op_input)([]) - layer_inputs = unnest_if_single_tensor(layer_inputs) - processed_ops, created_layers = _create_keras_history_helper( - layer_inputs, processed_ops, created_layers) - name = op.name - node_def = op.node_def.SerializeToString() - op_layer = base_layer.TensorFlowOpLayer( - node_def, constants=constants, name=name) - created_layers.append(op_layer) - op_layer._set_connectivity_metadata( # pylint: disable=protected-access - args=(layer_inputs,), - kwargs={}, - outputs=op.outputs) - processed_ops.update([op]) - if sparse_ops or ragged_tensors: - lambda_example = """ + """Helper method for `create_keras_history`. + + Args: + tensors: A structure of Tensors for which to create Keras metadata. + processed_ops: Set. TensorFlow operations that have already been wrapped + in `TensorFlowOpLayer` instances. + created_layers: List. The `TensorFlowOpLayer` instances created. + + Returns: + Tuple. First element is the updated set of TensorFlow Operations that + have been wrapped in `TensorFlowOpLayer` instances. Second element is + a list of the `TensorFlowOpLayer` instances created. + """ + if tf1.executing_eagerly_outside_functions(): + raise ValueError( + "`create_keras_history` should only be called if eager is disabled!" + ) + # Import of `base_layer` needed in order to create `TensorFlowOpLayer`. + # Cannot be imported at top because of circular dependencies. + # TODO(omalleyt): Resolve circular dependency. + from keras.engine import base_layer + + tensor_list = tf.nest.flatten(tensors) + sparse_ops = [] + ragged_tensors = [] + for tensor in tensor_list: + if getattr(tensor, "_keras_history", None) is not None: + continue + if isinstance(tensor, (tf.SparseTensor, tf1.SparseTensorValue)): + sparse_ops.append(tensor.op) + continue + if tf_utils.is_ragged(tensor): + # Ragged tensors don't have an op property + ragged_tensors.append(tensor) + continue + op = tensor.op # The Op that created this Tensor. + if op not in processed_ops: + # Recursively set `_keras_history`. + op_inputs = list(op.inputs) + constants = {} + layer_inputs = [] + for i, op_input in enumerate(op_inputs): + if uses_keras_history(op_input): + layer_inputs.append(op_input) + else: + # Treat any value not originating from a `keras.Input` as + # a constant. Variables cannot be supported. + ds_with_session = ( + tf.distribute.in_cross_replica_context() + and not tf1.executing_eagerly_outside_functions() + ) + using_xla = control_flow_util.GraphOrParentsInXlaContext( + tf1.get_default_graph() + ) + if ( + ds_with_session + or using_xla + or _UNSAFE_GRAPH_OP_LAYER_CREATION + ): + # In Legacy Graph mode, evaluating here makes Session be + # configured improperly. The downside of this is that + # saving via `get_config` breaks, but SavedModel still + # works. 
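+ # In these cases the symbolic tensor itself is therefore + # recorded as the "constant", instead of being eagerly + # evaluated via `backend.function` in the `tf.init_scope` + # branch below.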
+ constants[i] = op_input + else: + with tf.init_scope(): + constants[i] = backend.function([], op_input)([]) + layer_inputs = unnest_if_single_tensor(layer_inputs) + processed_ops, created_layers = _create_keras_history_helper( + layer_inputs, processed_ops, created_layers + ) + name = op.name + node_def = op.node_def.SerializeToString() + op_layer = base_layer.TensorFlowOpLayer( + node_def, constants=constants, name=name + ) + created_layers.append(op_layer) + op_layer._set_connectivity_metadata( + args=(layer_inputs,), kwargs={}, outputs=op.outputs + ) + processed_ops.update([op]) + if sparse_ops or ragged_tensors: + lambda_example = """ weights_mult = lambda x: tf.sparse.sparse_dense_matmul(x, weights) output = tf.keras.layers.Lambda(weights_mult)(input) """ - raise ValueError( - 'Tensorflow ops that generate ragged or sparse tensor ' - 'outputs are currently not supported by Keras automatic ' - 'op wrapping. Please wrap these ops in a Lambda layer: ' - '\n\n```\n{example}\n```\n' - 'Sparse ops encountered: {sparse_ops}\n' - 'Ragged tensors encountered: {ragged_tensors}\n'.format( - example=lambda_example, - sparse_ops=str(sparse_ops), - ragged_tensors=str(ragged_tensors))) - return processed_ops, created_layers + raise ValueError( + "Tensorflow ops that generate ragged or sparse tensor " + "outputs are currently not supported by Keras automatic " + "op wrapping. Please wrap these ops in a Lambda layer: " + "\n\n```\n{example}\n```\n" + "Sparse ops encountered: {sparse_ops}\n" + "Ragged tensors encountered: {ragged_tensors}\n".format( + example=lambda_example, + sparse_ops=str(sparse_ops), + ragged_tensors=str(ragged_tensors), + ) + ) + return processed_ops, created_layers def unnest_if_single_tensor(input_tensors): - # Preserve compatibility with older configs - flat_input_tensors = tf.nest.flatten(input_tensors) - # If this is a single element but not a dict, unwrap. If this is a dict, - # assume the first layer expects a dict (as is the case with a - # DenseFeatures layer); pass through. - if not isinstance(input_tensors, dict) and len(flat_input_tensors) == 1: - input_tensors = flat_input_tensors[0] - return input_tensors + # Preserve compatibility with older configs + flat_input_tensors = tf.nest.flatten(input_tensors) + # If this is a single element but not a dict, unwrap. If this is a dict, + # assume the first layer expects a dict (as is the case with a + # DenseFeatures layer); pass through. + if not isinstance(input_tensors, dict) and len(flat_input_tensors) == 1: + input_tensors = flat_input_tensors[0] + return input_tensors def needs_keras_history(tensors, ignore_call_context=False): - """Check if any Tensors need to be wrapped in TensorFlowOpLayers. - - This will never return True inside a sublayer, because sublayers - do not need to create Keras History. Otherwise, this returns True - if one or more of `tensors` originates from a `keras.Input` and - does not have `_keras_history` set. - - Args: - tensors: An arbitrary nested structure of Tensors. - ignore_call_context: Whether to ignore the check of if currently - outside of a `call` context. This is `True` when creating - KerasHistory inside `Node`, where we always know that Tensors - are being used with the Functional API. - - Returns: - Bool, whether at least one Tensor needs to be wrapped. 
- """ - input_tensors = tf.nest.flatten(tensors) - if call_context().in_call and not ignore_call_context: - return False - if all( - getattr(tensor, '_keras_history', None) is not None - for tensor in input_tensors): - # KerasHistory already set. - return False - return uses_keras_history(tensors) + """Check if any Tensors need to be wrapped in TensorFlowOpLayers. + + This will never return True inside a sublayer, because sublayers + do not need to create Keras History. Otherwise, this returns True + if one or more of `tensors` originates from a `keras.Input` and + does not have `_keras_history` set. + + Args: + tensors: An arbitrary nested structure of Tensors. + ignore_call_context: Whether to ignore the check of if currently + outside of a `call` context. This is `True` when creating + KerasHistory inside `Node`, where we always know that Tensors + are being used with the Functional API. + + Returns: + Bool, whether at least one Tensor needs to be wrapped. + """ + input_tensors = tf.nest.flatten(tensors) + if call_context().in_call and not ignore_call_context: + return False + if all( + getattr(tensor, "_keras_history", None) is not None + for tensor in input_tensors + ): + # KerasHistory already set. + return False + return uses_keras_history(tensors) def is_in_keras_graph(): - """Returns if currently executing inside of a Keras graph.""" - return call_context().in_keras_graph + """Returns if currently executing inside of a Keras graph.""" + return call_context().in_keras_graph def is_in_eager_or_tf_function(): - """Returns if in eager mode or inside of a tf.function.""" - return tf.executing_eagerly() or is_in_tf_function() + """Returns if in eager mode or inside of a tf.function.""" + return tf.executing_eagerly() or is_in_tf_function() def is_in_tf_function(): - """Returns if inside of a tf.function.""" - # Check if running in V1 graph mode. - if not tf.compat.v1.executing_eagerly_outside_functions(): - return False - if not tf.inside_function(): - return False - # Check if inside Keras FuncGraph. - if is_in_keras_graph(): - return False - # Check for a v1 `wrap_function` FuncGraph. - graph = tf.compat.v1.get_default_graph() - if (getattr(graph, 'name', False) and - graph.name.startswith('wrapped_function')): - return False - return True + """Returns if inside of a tf.function.""" + # Check if running in V1 graph mode. + if not tf1.executing_eagerly_outside_functions(): + return False + if not tf.inside_function(): + return False + # Check if inside Keras FuncGraph. + if is_in_keras_graph(): + return False + # Check for a v1 `wrap_function` FuncGraph. + graph = tf1.get_default_graph() + if getattr(graph, "name", False) and graph.name.startswith( + "wrapped_function" + ): + return False + return True def uses_keras_history(tensors): - """Check if at least one Tensor originates from a `keras.Input`. + """Check if at least one Tensor originates from a `keras.Input`. - This is `True` if at least one Tensor has its origin in a `keras.Input`. - Any Tensor that originates from a `keras.Input` will have a dependency - Tensor with a `_keras_history` attribute attached. Tensors that have - already been checked to not originate from a `keras.Input` - are marked as `_keras_history_checked`. + This is `True` if at least one Tensor has its origin in a `keras.Input`. + Any Tensor that originates from a `keras.Input` will have a dependency + Tensor with a `_keras_history` attribute attached. 
Tensors that have + already been checked to not originate from a `keras.Input` + are marked as `_keras_history_checked`. - Args: - tensors: An arbitrary nested structure of Tensors. + Args: + tensors: An arbitrary nested structure of Tensors. - Returns: - Bool, whether at least one Tensor originates from a `keras.Input`. - """ - checked_tensors = set() - tensors_to_check = tf.nest.flatten(tensors) + Returns: + Bool, whether at least one Tensor originates from a `keras.Input`. + """ + checked_tensors = set() + tensors_to_check = tf.nest.flatten(tensors) - while tensors_to_check: - new_tensors_to_check = [] - for tensor in tensors_to_check: - if id(tensor) in checked_tensors: - continue + while tensors_to_check: + new_tensors_to_check = [] + for tensor in tensors_to_check: + if id(tensor) in checked_tensors: + continue - checked_tensors.add(id(tensor)) + checked_tensors.add(id(tensor)) - if getattr(tensor, '_keras_history_checked', None) is not None: - continue - if getattr(tensor, '_keras_history', None) is not None: - return True + if getattr(tensor, "_keras_history_checked", None) is not None: + continue + if getattr(tensor, "_keras_history", None) is not None: + return True - try: - new_tensors_to_check.extend(tensor.op.inputs) - except AttributeError: - # In case `tensor` is a Variable created in an Eager context. - pass + try: + new_tensors_to_check.extend(tensor.op.inputs) + except AttributeError: + # In case `tensor` is a Variable created in an Eager context. + pass - tensors_to_check = new_tensors_to_check + tensors_to_check = new_tensors_to_check - # Mark that these Tensors have been checked once for `_keras_history`, - # and should not be checked again for performance reasons. - mark_checked(tensors) - return False + # Mark that these Tensors have been checked once for `_keras_history`, + # and should not be checked again for performance reasons. + mark_checked(tensors) + return False def mark_checked(tensors): - """Marks that these Tensors should not be tracked. + """Marks that these Tensors should not be tracked. - This prevents Layers from attempting to create TensorFlowOpLayers - for these Tensors. + This prevents Layers from attempting to create TensorFlowOpLayers + for these Tensors. - Args: - tensors: An arbitrary structure of Tensors. - """ + Args: + tensors: An arbitrary structure of Tensors. + """ - def _mark_checked(tensor): - tensor._keras_history_checked = True # pylint: disable=protected-access + def _mark_checked(tensor): + tensor._keras_history_checked = True - tf.nest.map_structure(_mark_checked, tensors) + tf.nest.map_structure(_mark_checked, tensors) def call_context(): - """Returns currently active `CallContext`.""" - call_ctx = getattr(_call_context, 'call_context', None) - if call_ctx is None: - call_ctx = CallContext() - _call_context.call_context = call_ctx - return call_ctx + """Returns currently active `CallContext`.""" + call_ctx = getattr(_call_context, "call_context", None) + if call_ctx is None: + call_ctx = CallContext() + _call_context.call_context = call_ctx + return call_ctx # Inject the call_context function to keras_deps to remove the dependency @@ -446,167 +473,174 @@ def call_context(): class CallContext: - """Keeps track of properties currently inside a Layer/Model's `call`. - - Attributes: - in_call: Whether currently inside the `call` of a Layer. - layer: The `Layer` whose `call` is currently active. - inputs: The inputs to the currently active `Layer`. - build_graph: Whether currently inside a Graph or FuncGraph. 
- training: Whether currently executing in training or inference mode. - saving: Whether currently saving to SavedModel. - frozen: Whether currently executing inside a `Layer` with `trainable` set to - `False`. - in_keras_graph: Whether executing inside the Keras Graph. - """ - - def __init__(self): - # Handle `in_call` separately as it is the most-read attr and reading it is - # on the hot path. - self.in_call = False - self._state = { - 'layer': None, - 'inputs': None, - 'build_graph': False, - 'training': None, - 'saving': None - } - # TODO(b/150169018): This logic can be replaced after the Functional API - # refactor. - self._in_keras_graph = False - - def enter(self, layer, inputs, build_graph, training, saving=None): - """Push a Layer and its inputs and state onto the current call context. + """Keeps track of properties currently inside a Layer/Model's `call`. - Args: + Attributes: + in_call: Whether currently inside the `call` of a Layer. layer: The `Layer` whose `call` is currently active. inputs: The inputs to the currently active `Layer`. build_graph: Whether currently inside a Graph or FuncGraph. training: Whether currently executing in training or inference mode. saving: Whether currently saving to SavedModel. - - Returns: - Context manager. + frozen: Whether currently executing inside a `Layer` with `trainable` set + to `False`. + in_keras_graph: Whether executing inside the Keras Graph. """ - state = { - 'layer': layer, - 'inputs': inputs, - 'build_graph': build_graph, - 'training': training, - 'saving': saving - } - return CallContextManager(self, state) - - @property - def layer(self): - return self._state['layer'] - - @property - def inputs(self): - return self._state['inputs'] - - @property - def build_graph(self): - return self._state['build_graph'] - - @property - def training(self): - return self._state['training'] - - @property - def saving(self): - return self._state['saving'] - - @property - def frozen(self): - layer = self._state['layer'] - if not layer: - return False - return not layer.trainable - - @property - def in_keras_graph(self): - # Returns True even if in a subgraph of the Keras graph, such as those - # created by control flow ops. - if tf.executing_eagerly(): - return False - return (self._in_keras_graph or - getattr(backend.get_graph(), 'name', None) == 'keras_graph') + + def __init__(self): + # Handle `in_call` separately as it is the most-read attr and reading it + # is on the hot path. + self.in_call = False + self._state = { + "layer": None, + "inputs": None, + "build_graph": False, + "training": None, + "saving": None, + } + # TODO(b/150169018): This logic can be replaced after the Functional API + # refactor. + self._in_keras_graph = False + + def enter(self, layer, inputs, build_graph, training, saving=None): + """Push a Layer and its inputs and state onto the current call context. + + Args: + layer: The `Layer` whose `call` is currently active. + inputs: The inputs to the currently active `Layer`. + build_graph: Whether currently inside a Graph or FuncGraph. + training: Whether currently executing in training or inference mode. + saving: Whether currently saving to SavedModel. + + Returns: + Context manager. 
+ """ + state = { + "layer": layer, + "inputs": inputs, + "build_graph": build_graph, + "training": training, + "saving": saving, + } + return CallContextManager(self, state) + + @property + def layer(self): + return self._state["layer"] + + @property + def inputs(self): + return self._state["inputs"] + + @property + def build_graph(self): + return self._state["build_graph"] + + @property + def training(self): + return self._state["training"] + + @property + def saving(self): + return self._state["saving"] + + @property + def frozen(self): + layer = self._state["layer"] + if not layer: + return False + return not layer.trainable + + @property + def in_keras_graph(self): + # Returns True even if in a subgraph of the Keras graph, such as those + # created by control flow ops. + if tf.executing_eagerly(): + return False + return ( + self._in_keras_graph + or getattr(backend.get_graph(), "name", None) == "keras_graph" + ) class CallContextManager: - """Context manager for `CallContext`.""" + """Context manager for `CallContext`.""" - def __init__(self, call_ctx, state): - self._call_ctx = call_ctx - self._state = state - self._build_graph = state['build_graph'] + def __init__(self, call_ctx, state): + self._call_ctx = call_ctx + self._state = state + self._build_graph = state["build_graph"] - def __enter__(self): - call_ctx = self._call_ctx - self._prev_in_call = call_ctx.in_call - self._prev_state = call_ctx._state + def __enter__(self): + call_ctx = self._call_ctx + self._prev_in_call = call_ctx.in_call + self._prev_state = call_ctx._state - call_ctx.in_call = True - call_ctx._state = self._state + call_ctx.in_call = True + call_ctx._state = self._state - # TODO(b/150169018): This logic can be removed after the Functional API - # refactor. - if self._build_graph: - self._prev_in_keras_graph = call_ctx._in_keras_graph - call_ctx._in_keras_graph = ( - call_ctx._in_keras_graph or - getattr(backend.get_graph(), 'name', None) == 'keras_graph') + # TODO(b/150169018): This logic can be removed after the Functional API + # refactor. 
+ if self._build_graph: + self._prev_in_keras_graph = call_ctx._in_keras_graph + call_ctx._in_keras_graph = ( + call_ctx._in_keras_graph + or getattr(backend.get_graph(), "name", None) == "keras_graph" + ) - def __exit__(self, *exc_info): - call_ctx = self._call_ctx - call_ctx.in_call = self._prev_in_call - call_ctx._state = self._prev_state + def __exit__(self, *exc_info): + call_ctx = self._call_ctx + call_ctx.in_call = self._prev_in_call + call_ctx._state = self._prev_state - if self._build_graph: - call_ctx._in_keras_graph = self._prev_in_keras_graph + if self._build_graph: + call_ctx._in_keras_graph = self._prev_in_keras_graph def training_arg_passed_to_call(argspec, args, kwargs): - """Returns whether a user passed the `training` argument in `__call__`.""" - # `argspec.args` starts with ['self', 'inputs'] - full_args = dict(zip(argspec.args[2:], args)) - full_args.update(kwargs) - return 'training' in full_args and full_args['training'] is not None + """Returns whether a user passed the `training` argument in `__call__`.""" + # `argspec.args` starts with ['self', 'inputs'] + full_args = dict(zip(argspec.args[2:], args)) + full_args.update(kwargs) + return "training" in full_args and full_args["training"] is not None def is_subclassed(layer): - """Returns True if the object is a subclassed layer or subclassed model.""" - return (layer.__module__.find('keras.engine') == -1 and - layer.__module__.find('keras.layers') == -1) + """Returns True if the object is a subclassed layer or subclassed model.""" + return ( + layer.__module__.find("keras.engine") == -1 + and layer.__module__.find("keras.layers") == -1 + ) def from_saved_model(layer): - """Returns whether the layer is loaded from a SavedModel.""" - return layer.__module__.find('keras.saving.saved_model') != -1 - - -def check_graph_consistency(tensor=None, method='add_loss', force_raise=False): - """Checks that tensors passed to `add_*` method match the Keras graph. - - When one of the `add_*` method is called inside a V2 conditional branch, - the underlying tensor gets created in a FuncGraph managed by control_flow_v2. - We need to raise clear error messages in such cases. - - Args: - tensor: Tensor to check, or `False` if it is known that an error - should be raised. - method: Caller method, one of {'add_metric', 'add_loss', 'add_update'}. - force_raise: If an error should be raised regardless of `tensor`. - - Raises: - RuntimeError: In case of an out-of-graph tensor. - """ - if (force_raise or - (tf.compat.v1.executing_eagerly_outside_functions() and - hasattr(tensor, 'graph') and tensor.graph.is_control_flow_graph)): - if method == 'activity_regularizer': - bad_example = """ + """Returns whether the layer is loaded from a SavedModel.""" + return layer.__module__.find("keras.saving.legacy.saved_model") != -1 + + +def check_graph_consistency(tensor=None, method="add_loss", force_raise=False): + """Checks that tensors passed to an `add_*` method match the Keras graph. + + When one of the `add_*` methods is called inside a V2 conditional branch, the + underlying tensor gets created in a FuncGraph managed by control_flow_v2. + We need to raise clear error messages in such cases. + + Args: + tensor: Tensor to check, or `False` if it is known that an error + should be raised. + method: Caller method, one of {'add_metric', 'add_loss', 'add_update'}. + force_raise: If an error should be raised regardless of `tensor`. + + Raises: + RuntimeError: In case of an out-of-graph tensor.
+ """ + if force_raise or ( + tf1.executing_eagerly_outside_functions() + and hasattr(tensor, "graph") + and tensor.graph.is_control_flow_graph + ): + if method == "activity_regularizer": + bad_example = """ class TestModel(tf.keras.Model): def __init__(self): @@ -619,7 +653,7 @@ def call(self, x, training=None): else: return self.dense(x) """ - correct_example = """ + correct_example = """ class TestModel(tf.keras.Model): def __init__(self): @@ -629,28 +663,29 @@ def __init__(self): def call(self, x, training=None): return self.dense(x) """ - raise RuntimeError( - 'You are using a layer with `activity_regularizer` in a control flow ' - 'branch, e.g.:\n{bad_example}\nThis is currently not supported. ' - 'Please move your call to the layer with `activity_regularizer` out ' - 'of the control flow branch, e.g.:\n{correct_example}\n' - 'You can also resolve this by marking your outer model/layer dynamic' - ' (eager-only) by passing `dynamic=True` to the layer constructor. ' - 'Any kind of control flow is supported with dynamic layers. ' - 'Note that using `dynamic=True` requires you to implement static ' - 'shape inference in the `compute_output_shape(input_shape)` ' - 'method.'.format( - bad_example=bad_example, correct_example=correct_example)) - - if method == 'add_metric': - bad_example = """ + raise RuntimeError( + "You are using a layer with `activity_regularizer` in a " + f"control flow branch, e.g.:\n{bad_example}\nThis is currently " + "not supported. Please move your call to the layer with " + "`activity_regularizer` out of the control flow branch, " + f"e.g.:\n{correct_example}\nYou can also resolve this by " + "marking your outer model/layer dynamic (eager-only) by " + "passing `dynamic=True` to the layer constructor. Any kind of " + "control flow is supported with dynamic layers. Note that " + "using `dynamic=True` requires you to implement static shape " + "inference in the `compute_output_shape(input_shape)` " + "method." + ) + + if method == "add_metric": + bad_example = """ def call(self, inputs, training=None): if training: metric = compute_metric(inputs) self.add_metric(metric, name='my_metric', aggregation='mean') return inputs """ - correct_example = """ + correct_example = """ def call(self, inputs, training=None): if training: metric = compute_metric(inputs) @@ -659,15 +694,15 @@ def call(self, inputs, training=None): self.add_metric(metric, name='my_metric', aggregation='mean') return inputs """ - elif method == 'add_loss': - bad_example = """ + elif method == "add_loss": + bad_example = """ def call(self, inputs, training=None): if training: loss = compute_loss(inputs) self.add_loss(loss) return inputs """ - correct_example = """ + correct_example = """ def call(self, inputs, training=None): if training: loss = compute_loss(inputs) @@ -676,14 +711,14 @@ def call(self, inputs, training=None): self.add_loss(loss) return inputs """ - else: - bad_example = """ + else: + bad_example = """ def call(self, inputs, training=None): if training: self.add_update(self.w.assign_add(1)) return inputs """ - correct_example = """ + correct_example = """ def call(self, inputs, training=None): if training: increment = 1 @@ -692,207 +727,225 @@ def call(self, inputs, training=None): self.add_update(self.w.assign_add(increment)) return inputs """ - raise RuntimeError( - 'You are using the method `{method}` in a control flow branch ' - 'in your layer, e.g.:\n{bad_example}\n' - 'This is not currently supported. 
' - 'Please move your call to {method} out of the control flow branch, ' - 'e.g.:\n{correct_example}\n' - 'You can also resolve this by marking your layer ' - 'as dynamic (eager-only) by passing ' - '`dynamic=True` to the layer constructor. ' - 'Any kind of control flow is supported with dynamic layers. ' - 'Note that using `dynamic=True` requires you ' - 'to implement static shape inference ' - 'in the `compute_output_shape(input_shape)` method.'.format( - method=method, - bad_example=bad_example, - correct_example=correct_example)) + raise RuntimeError( + "You are using the method `{method}` in a control flow branch " + "in your layer, e.g.:\n{bad_example}\n" + "This is not currently supported. " + "Please move your call to {method} out of the control flow branch, " + "e.g.:\n{correct_example}\n" + "You can also resolve this by marking your layer " + "as dynamic (eager-only) by passing " + "`dynamic=True` to the layer constructor. " + "Any kind of control flow is supported with dynamic layers. " + "Note that using `dynamic=True` requires you " + "to implement static shape inference " + "in the `compute_output_shape(input_shape)` method.".format( + method=method, + bad_example=bad_example, + correct_example=correct_example, + ) + ) def mark_as_return(outputs, acd): - """Marks `outputs` as the return values for automatic control deps.""" + """Marks `outputs` as the return values for automatic control deps.""" - def _mark_as_return(tensor): - """Marks `tensor` as the return value for automatic control deps.""" - if not tf.is_tensor(tensor): - return tensor + def _mark_as_return(tensor): + """Marks `tensor` as the return value for automatic control deps.""" + if not tf.is_tensor(tensor): + return tensor - # pylint: disable=protected-access - return_tensor = acd.mark_as_return(tensor) - if getattr(tensor, '_keras_mask', None) is not None: - return_tensor._keras_mask = acd.mark_as_return(tensor._keras_mask) - else: - return_tensor._keras_mask = None + return_tensor = acd.mark_as_return(tensor) + if getattr(tensor, "_keras_mask", None) is not None: + return_tensor._keras_mask = acd.mark_as_return(tensor._keras_mask) + else: + return_tensor._keras_mask = None - # Handle TensorFlow Probability attached metadata. - # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`. - if getattr(tensor, '_tfp_distribution', None) is not None: - return_tensor._tfp_distribution = tensor._tfp_distribution + # Handle TensorFlow Probability attached metadata. + # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`. + if getattr(tensor, "_tfp_distribution", None) is not None: + return_tensor._tfp_distribution = tensor._tfp_distribution - return return_tensor - # pylint: enable=protected-access + return return_tensor - return tf.nest.map_structure(_mark_as_return, outputs) + return tf.nest.map_structure(_mark_as_return, outputs) V2_DTYPE_BEHAVIOR = None -@keras_export(v1=['keras.layers.enable_v2_dtype_behavior']) +@keras_export(v1=["keras.layers.enable_v2_dtype_behavior"]) def enable_v2_dtype_behavior(): - """Enable the V2 dtype behavior for Keras layers. - - By default, the V2 dtype behavior is enabled in TensorFlow 2, so this function - is only useful if `tf.compat.v1.disable_v2_behavior` has been called. Since - mixed precision requires V2 dtype behavior to be enabled, this function allows - you to use mixed precision in Keras layers if `disable_v2_behavior` has been - called. - - When enabled, the dtype of Keras layers defaults to floatx (which is typically - float32) instead of None. 
In addition, layers will automatically cast - floating-point inputs to the layer's dtype. - - >>> x = tf.ones((4, 4, 4, 4), dtype='float64') - >>> layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2) - >>> print(layer.dtype) # float32 since V2 dtype behavior is enabled - float32 - >>> y = layer(x) # Layer casts inputs since V2 dtype behavior is enabled - >>> print(y.dtype.name) - float32 - - A layer author can opt-out their layer from the automatic input casting by - passing `autocast=False` to the base Layer's constructor. This disables the - autocasting part of the V2 behavior for that layer, but not the defaulting to - floatx part of the V2 behavior. - - When a global `tf.keras.mixed_precision.Policy` is set, a Keras layer's dtype - will default to the global policy instead of floatx. Layers will automatically - cast inputs to the policy's compute_dtype. - """ - global V2_DTYPE_BEHAVIOR - V2_DTYPE_BEHAVIOR = True - - -@keras_export(v1=['keras.layers.disable_v2_dtype_behavior']) + """Enable the V2 dtype behavior for Keras layers. + + By default, the V2 dtype behavior is enabled in TensorFlow 2, so this + function is only useful if `tf.compat.v1.disable_v2_behavior` has been + called. Since mixed precision requires V2 dtype behavior to be enabled, this + function allows you to use mixed precision in Keras layers if + `disable_v2_behavior` has been called. + + When enabled, the dtype of Keras layers defaults to floatx (which is + typically float32) instead of None. In addition, layers will automatically + cast floating-point inputs to the layer's dtype. + + >>> x = tf.ones((4, 4, 4, 4), dtype='float64') + >>> layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2) + >>> print(layer.dtype) # float32 since V2 dtype behavior is enabled + float32 + >>> y = layer(x) # Layer casts inputs since V2 dtype behavior is enabled + >>> print(y.dtype.name) + float32 + + A layer author can opt their layer out of the automatic input casting by + passing `autocast=False` to the base Layer's constructor. This disables the + autocasting part of the V2 behavior for that layer, but not the defaulting + to floatx part of the V2 behavior. + + When a global `tf.keras.mixed_precision.Policy` is set, a Keras layer's + dtype will default to the global policy instead of floatx. Layers will + automatically cast inputs to the policy's compute_dtype. + """ + global V2_DTYPE_BEHAVIOR + V2_DTYPE_BEHAVIOR = True + + +@keras_export(v1=["keras.layers.disable_v2_dtype_behavior"]) def disable_v2_dtype_behavior(): - """Disables the V2 dtype behavior for Keras layers. + """Disables the V2 dtype behavior for Keras layers. - See `tf.compat.v1.keras.layers.enable_v2_dtype_behavior`. - """ - global V2_DTYPE_BEHAVIOR - V2_DTYPE_BEHAVIOR = False + See `tf.compat.v1.keras.layers.enable_v2_dtype_behavior`. + """ + global V2_DTYPE_BEHAVIOR + V2_DTYPE_BEHAVIOR = False def v2_dtype_behavior_enabled(): - """Returns True if the V2 dtype behavior is enabled.""" - if V2_DTYPE_BEHAVIOR is None: - return tf.__internal__.tf2.enabled() - return V2_DTYPE_BEHAVIOR + """Returns True if the V2 dtype behavior is enabled.""" + if V2_DTYPE_BEHAVIOR is None: + return tf.__internal__.tf2.enabled() + return V2_DTYPE_BEHAVIOR class TrackableWeightHandler: - """Keras wrapper for handling tracking.Trackable object saving and restoring. - - This class handles Trackables in both V1 and V2 modes, ensuring that they can - be saved and restored with the correct data and without adding additional ops - on every save.
- - Attributes: - trackable: The trackable to wrap. - num_tensors: The number of tensors that this trackable requires for saving. - """ - - def __init__(self, trackable): - if not isinstance(trackable, tf.__internal__.tracking.Trackable): - raise ValueError(f'{trackable} is not a Trackable object.') - self._trackable = trackable - self._distribute_strategy = tf.distribute.get_strategy() - - saveables = tf.__internal__.tracking.saveable_objects_from_trackable( - trackable).values() - # 'Saveables' won't exist when we're passed a legacy TF1 table like - # a StaticHashTable. - if not saveables: - self._num_tensors = 0 - self._setter = lambda weights: None - self._getter = lambda: [] - - elif len(saveables) == 1: - saveable = list(saveables)[0] - - if tf.compat.v1.executing_eagerly_outside_functions(): - # If we're in eager mode, we need to defer calling the Trackable's - # saveable() callable until data export time. - # However, it is safe to call the saveable as many times as we want, so - # we will call it now to figure out how many tensors this Trackable will - # produce. - self._saveable = saveable - self._num_tensors = len(self._saveable().specs) - self._setter = lambda weights: self._saveable().restore(weights, None) - self._getter = lambda: [spec.tensor for spec in self._saveable().specs] - else: - # If we're in Graph mode, we need to evaluate the Saveable only once and - # cache the resulting restore graph. Failing to do this will result in - # new assignment ops being added to the graph each time set_weights() is - # called. - self._placeholder_tensors = [] - self._saveable = saveable() - self._num_tensors = len(self._saveable.specs) - for spec in self._saveable.specs: - tensor = spec.tensor - self._placeholder_tensors.append( - tf.compat.v1.placeholder(tensor.dtype, tensor.shape)) - self._assign_op = self._saveable.restore(self._placeholder_tensors, - None) - self._setter = self._set_weights_v1 - self._getter = lambda: [spec.tensor for spec in self._saveable.specs] - else: - raise ValueError( - 'Only Trackables with one Saveable are supported. The Trackable ' - f'{trackable} has {len(saveables)} Saveables.') - - @property - def num_tensors(self): - return self._num_tensors + """Keras wrapper for handling Trackable object saving and restoring. - def set_weights(self, weights): - if len(weights) != self._num_tensors: - raise ValueError( - f'Weight handler for trackable {self._trackable} received ' - 'an incorrect number of weights: ' - f'expected {self._num_tensors} weights, got {len(weights)} weights.') - self._setter(weights) + This class handles Trackables in both V1 and V2 modes, ensuring that they + can be saved and restored with the correct data and without adding + additional ops on every save. - def get_tensors(self): - return self._getter() + Attributes: + trackable: The trackable to wrap. + num_tensors: The number of tensors that this trackable requires for + saving. 
+ """ - def _set_weights_v1(self, weights): - feed_dict = {} - for idx, tensor in enumerate(weights): - feed_dict[self._placeholder_tensors[idx]] = tensor - backend.get_session().run(self._assign_op, feed_dict) + def __init__(self, trackable): + if not isinstance(trackable, tf.__internal__.tracking.Trackable): + raise ValueError(f"{trackable} is not a Trackable object.") + self._trackable = trackable + self._distribute_strategy = tf.distribute.get_strategy() + + saveables = tf.__internal__.tracking.saveable_objects_from_trackable( + trackable + ).values() + # 'Saveables' won't exist when we're passed a legacy TF1 table like + # a StaticHashTable. + if not saveables: + self._num_tensors = 0 + self._setter = lambda weights: None + self._getter = lambda: [] + + elif len(saveables) == 1: + saveable = list(saveables)[0] + + if tf1.executing_eagerly_outside_functions(): + # If we're in eager mode, we need to defer calling the + # Trackable's saveable() callable until data export time. + # However, it is safe to call the saveable as many times as we + # want, so we will call it now to figure out how many tensors + # this Trackable will produce. + self._saveable = saveable + self._num_tensors = len(self._saveable().specs) + self._setter = lambda weights: self._saveable().restore( + weights, None + ) + self._getter = lambda: [ + spec.tensor for spec in self._saveable().specs + ] + else: + # If we're in Graph mode, we need to evaluate the Saveable only + # once and cache the resulting restore graph. Failing to do this + # will result in new assignment ops being added to the graph + # each time set_weights() is called. + self._placeholder_tensors = [] + self._saveable = saveable() + self._num_tensors = len(self._saveable.specs) + for spec in self._saveable.specs: + tensor = spec.tensor + self._placeholder_tensors.append( + tf1.placeholder(tensor.dtype, tensor.shape) + ) + self._assign_op = self._saveable.restore( + self._placeholder_tensors, None + ) + self._setter = self._set_weights_v1 + self._getter = lambda: [ + spec.tensor for spec in self._saveable.specs + ] + else: + raise ValueError( + "Only Trackables with one Saveable are supported. " + f"The Trackable {trackable} has {len(saveables)} Saveables." + ) + + @property + def num_tensors(self): + return self._num_tensors + + def set_weights(self, weights): + if len(weights) != self._num_tensors: + raise ValueError( + f"Weight handler for trackable {self._trackable} received " + "an incorrect number of weights: " + f"expected {self._num_tensors} weights, " + f"got {len(weights)} weights." + ) + self._setter(weights) + + def get_tensors(self): + return self._getter() + + def _set_weights_v1(self, weights): + feed_dict = {} + for idx, tensor in enumerate(weights): + feed_dict[self._placeholder_tensors[idx]] = tensor + backend.get_session().run(self._assign_op, feed_dict) def no_ragged_support(inputs, layer_name): - input_list = tf.nest.flatten(inputs) - if any(isinstance(x, tf.RaggedTensor) for x in input_list): - raise ValueError( - f'Layer {layer_name} does not support RaggedTensors as input. ' - f'Inputs received: {inputs}. You can try converting your ' - 'input to a dense (uniform) tensor.') + input_list = tf.nest.flatten(inputs) + if any(isinstance(x, tf.RaggedTensor) for x in input_list): + raise ValueError( + f"Layer {layer_name} does not support RaggedTensors as input. " + f"Inputs received: {inputs}. You can try converting your " + "input to a dense (uniform) tensor." 
+ ) def is_split_variable(v): - """Returns True if `v` is either a PartionedVariable or a ShardedVariable.""" - return hasattr(v, '_variable_list') or hasattr(v, '_variables') + """Returns True if `v` is a PartitionedVariable or a ShardedVariable.""" + return not {clz.__name__ for clz in v.__class__.__mro__}.isdisjoint( + {"PartitionedVariable", "ShardedVariable"} + ) def has_weights(obj): - obj_type = type(obj) - return (hasattr(obj_type, 'trainable_weights') and - hasattr(obj_type, 'non_trainable_weights') and - not isinstance(obj, type)) + obj_type = type(obj) + return ( + hasattr(obj_type, "trainable_weights") + and hasattr(obj_type, "non_trainable_weights") + and not isinstance(obj, type) + ) # TODO(kathywu): This is a temporary hack. When a network of layers is revived @@ -902,4 +955,5 @@ def has_weights(obj): # whenever eager losses are added to one layer, add eager losses to all # child layers. This causes `.losses` to only return eager losses. REVIVED_LOSS_PLACEHOLDER = ( - 'This layer\'s losses have been added to the parent layer.') + "This layer's losses have been added to the parent layer." +) diff --git a/keras/engine/base_layer_utils_test.py b/keras/engine/base_layer_utils_test.py index ed3c73a6c8ce..67a4d2d5db22 100644 --- a/keras/engine/base_layer_utils_test.py +++ b/keras/engine/base_layer_utils_test.py @@ -14,97 +14,95 @@ # ============================================================================== import numpy as np - import tensorflow.compat.v2 as tf import keras from keras import backend -from keras.testing_infra import test_combinations from keras.engine import base_layer_utils +from keras.testing_infra import test_combinations -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class TrackableWeightHandlerTest(test_combinations.TestCase): - - def get_table_handler(self): - # Note: There is some repetition in these tests' setup. However, Tensorflow - # does not play nicely with a separate setUp() call (causing errors related - # to graph building), so we have to use a called setup instead of a setUp() - # call. - table = tf.lookup.experimental.MutableHashTable( - key_dtype=tf.string, value_dtype=tf.int32, default_value=0) - return base_layer_utils.TrackableWeightHandler(table) - - def test_get_num_tensors(self): - table_handler = self.get_table_handler() - self.assertEqual(2, table_handler.num_tensors) - - def test_get_and_set_weights(self): - table_handler = self.get_table_handler() - - table_data = {b'a': 1, b'b': 2, b'c': 3} - table_handler.set_weights( - [list(table_data.keys()), - list(table_data.values())]) - weights = backend.batch_get_value(table_handler.get_tensors()) - weight_data = {key: value for key, value in zip(weights[0], weights[1])} - self.assertDictEqual(table_data, weight_data) - - def test_get_and_set_weights_does_not_add_ops(self): - table_handler = self.get_table_handler() - table_data = {b'a': 1, b'b': 2, b'c': 3} - table_handler.set_weights( - [list(table_data.keys()), - list(table_data.values())]) - _ = backend.batch_get_value(table_handler.get_tensors()) - backend.get_session().graph.finalize() - table_handler.set_weights( - [list(table_data.keys()), - list(table_data.values())]) - _ = backend.batch_get_value(table_handler.get_tensors()) - - -@test_combinations.generate(test_combinations.combine(mode=['eager'])) + def get_table_handler(self): + # Note: There is some repetition in these tests' setup. 
However, + # Tensorflow does not play nicely with a separate setUp() call (causing + # errors related to graph building), so we have to use a called setup + # instead of a setUp() call. + table = tf.lookup.experimental.MutableHashTable( + key_dtype=tf.string, value_dtype=tf.int32, default_value=0 + ) + return base_layer_utils.TrackableWeightHandler(table) + + def test_get_num_tensors(self): + table_handler = self.get_table_handler() + self.assertEqual(2, table_handler.num_tensors) + + def test_get_and_set_weights(self): + table_handler = self.get_table_handler() + + table_data = {b"a": 1, b"b": 2, b"c": 3} + table_handler.set_weights( + [list(table_data.keys()), list(table_data.values())] + ) + weights = backend.batch_get_value(table_handler.get_tensors()) + weight_data = {key: value for key, value in zip(weights[0], weights[1])} + self.assertDictEqual(table_data, weight_data) + + def test_get_and_set_weights_does_not_add_ops(self): + table_handler = self.get_table_handler() + table_data = {b"a": 1, b"b": 2, b"c": 3} + table_handler.set_weights( + [list(table_data.keys()), list(table_data.values())] + ) + _ = backend.batch_get_value(table_handler.get_tensors()) + backend.get_session().graph.finalize() + table_handler.set_weights( + [list(table_data.keys()), list(table_data.values())] + ) + _ = backend.batch_get_value(table_handler.get_tensors()) + + +@test_combinations.generate(test_combinations.combine(mode=["eager"])) class OpLayerTest(test_combinations.TestCase): - - def test_tensor_op_layer(self): - int_values = keras.Input(shape=(2,), dtype=tf.int32) - float_values = tf.cast(int_values, tf.float32) - model = keras.Model(int_values, float_values) - model.compile(loss='mse') - - input_data = np.array([[1, 2], [3, 4]], dtype=np.int32) - expected = [[1.0, 2.0], [3.0, 4.0]] - output = model.predict(input_data) - self.assertAllClose(expected, output) - - def test_ragged_op_layer_keras_tensors(self): - int_values = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) - float_values = tf.cast(int_values, tf.float32) - model = keras.Model(int_values, float_values) - model.compile(loss='mse') - - input_data = tf.ragged.constant( - [[1, 2], [3, 4]], dtype=np.int32) - expected = [[1.0, 2.0], [3.0, 4.0]] - output = model.predict(input_data) - self.assertIsInstance(output, tf.RaggedTensor) - self.assertAllClose(expected, output) - - def test_sparse_op_layer_keras_tensors(self): - int_values = keras.Input(shape=(None,), dtype=tf.int32, sparse=True) - float_values = tf.cast(int_values, tf.float32) - _ = keras.Model(int_values, float_values) - model = keras.Model(int_values, float_values) - model.compile(loss='mse') - - input_data = tf.sparse.from_dense( - np.array([[1, 2], [3, 4]], dtype=np.int32)) - expected = [[1.0, 2.0], [3.0, 4.0]] - output = model.predict(input_data) - self.assertIsInstance(output, tf.SparseTensor) - self.assertAllClose(expected, tf.sparse.to_dense(output)) - - -if __name__ == '__main__': - tf.test.main() + def test_tensor_op_layer(self): + int_values = keras.Input(shape=(2,), dtype=tf.int32) + float_values = tf.cast(int_values, tf.float32) + model = keras.Model(int_values, float_values) + model.compile(loss="mse") + + input_data = np.array([[1, 2], [3, 4]], dtype=np.int32) + expected = [[1.0, 2.0], [3.0, 4.0]] + output = model.predict(input_data) + self.assertAllClose(expected, output) + + def test_ragged_op_layer_keras_tensors(self): + int_values = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) + float_values = tf.cast(int_values, tf.float32) + model = 
keras.Model(int_values, float_values) + model.compile(loss="mse") + + input_data = tf.ragged.constant([[1, 2], [3, 4]], dtype=np.int32) + expected = [[1.0, 2.0], [3.0, 4.0]] + output = model.predict(input_data) + self.assertIsInstance(output, tf.RaggedTensor) + self.assertAllClose(expected, output) + + def test_sparse_op_layer_keras_tensors(self): + int_values = keras.Input(shape=(None,), dtype=tf.int32, sparse=True) + float_values = tf.cast(int_values, tf.float32) + _ = keras.Model(int_values, float_values) + model = keras.Model(int_values, float_values) + model.compile(loss="mse") + + input_data = tf.sparse.from_dense( + np.array([[1, 2], [3, 4]], dtype=np.int32) + ) + expected = [[1.0, 2.0], [3.0, 4.0]] + output = model.predict(input_data) + self.assertIsInstance(output, tf.SparseTensor) + self.assertAllClose(expected, tf.sparse.to_dense(output)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/base_layer_v1.py b/keras/engine/base_layer_v1.py index 1e2d281d2e92..e54211473268 100644 --- a/keras/engine/base_layer_v1.py +++ b/keras/engine/base_layer_v1.py @@ -12,17 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access -# pylint: disable=g-bad-import-order -"""Contains the base Layer class, from which all layers inherit.""" -import tensorflow.compat.v2 as tf + +"""Contains the base Layer class, from which all layers inherit.""" import functools import itertools import threading import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import constraints from keras import initializers @@ -33,2211 +33,2437 @@ from keras.mixed_precision import autocast_variable from keras.mixed_precision import loss_scale_optimizer from keras.mixed_precision import policy -from keras.saving.saved_model import layer_serialization +from keras.saving.legacy.saved_model import layer_serialization from keras.utils import generic_utils from keras.utils import layer_utils from keras.utils import object_identity from keras.utils import tf_inspect from keras.utils import tf_utils + # A module that only depends on `keras.layers` import these from here. -from keras.utils.generic_utils import to_snake_case # pylint: disable=unused-import -from keras.utils.tf_utils import is_tensor_or_tensor_list # pylint: disable=unused-import +from keras.utils.generic_utils import to_snake_case # noqa: F401 +from keras.utils.tf_utils import is_tensor_or_tensor_list # noqa: F401 + +# isort: off from tensorflow.python.platform import tf_logging from tensorflow.tools.docs import doc_controls -# pylint: disable=g-classes-have-attributes class Layer(base_layer.Layer): - """Base layer class. - - This is the class from which all layers inherit. - - A layer is a class implementing common neural networks operations, such - as convolution, batch norm, etc. These operations require managing weights, - losses, updates, and inter-layer connectivity. - - Users will just instantiate a layer and then treat it as a callable. - - We recommend that descendants of `Layer` implement the following methods: - - * `__init__()`: Save configuration in member variables - * `build()`: Called once from `__call__`, when we know the shapes of inputs - and `dtype`. 
Should have the calls to `add_weight()`, and then - call the super's `build()` (which sets `self.built = True`, which is - nice in case the user wants to call `build()` manually before the - first `__call__`). - * `call()`: Called in `__call__` after making sure `build()` has been called - once. Should actually perform the logic of applying the layer to the - input tensors (which should be passed in as the first argument). - - Args: - trainable: Boolean, whether the layer's variables should be trainable. - name: String name of the layer. - dtype: The dtype of the layer's computations and weights (default of - `None` means use `tf.keras.backend.floatx` in TensorFlow 2, or the type - of the first input in TensorFlow 1). - dynamic: Set this to `True` if your layer should only be run eagerly, and - should not be used to generate a static computation graph. - This would be the case for a Tree-RNN or a recursive network, - for example, or generally for any layer that manipulates tensors - using Python control flow. If `False`, we assume that the layer can - safely be used to generate a static computation graph. - - Attributes: - name: The name of the layer (string). - dtype: The dtype of the layer's computations and weights. If mixed - precision is used with a `tf.keras.mixed_precision.Policy`, this is - instead just the dtype of the layer's weights, as the computations are - done in a different dtype. - updates: List of update ops of this layer. - losses: List of losses added by this layer. - trainable_weights: List of variables to be included in backprop. - non_trainable_weights: List of variables that should not be - included in backprop. - weights: The concatenation of the lists trainable_weights and - non_trainable_weights (in this order). - trainable: Whether the layer should be trained (boolean). - input_spec: Optional (list of) `InputSpec` object(s) specifying the - constraints on inputs that can be accepted by the layer. - - Each layer has a dtype, which is typically the dtype of the layer's - computations and variables. A layer's dtype can be queried via the - `Layer.dtype` property. The dtype is specified with the `dtype` constructor - argument. In TensorFlow 2, the dtype defaults to `tf.keras.backend.floatx()` - if no dtype is passed. `floatx()` itself defaults to "float32". Additionally, - layers will cast their inputs to the layer's dtype in TensorFlow 2. When mixed - precision is used, layers may have different computation and variable dtypes. - See `tf.keras.mixed_precision.Policy` for details on layer dtypes. - """ - - # See tf.Module for the usage of this property. - # The key for _obj_reference_counts_dict is a Trackable, which could be a - # variable or layer etc. tf.Module._flatten will fail to flatten the key - # since it is trying to convert Trackable to a string. This attribute can be - # ignored even after the fix of nest lib, since the trackable object should - # already been available as individual attributes. _obj_reference_counts_dict - # just contains a copy of them. - _TF_MODULE_IGNORED_PROPERTIES = frozenset(itertools.chain( - ('_obj_reference_counts_dict',), - tf.Module._TF_MODULE_IGNORED_PROPERTIES - )) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, trainable=True, name=None, dtype=None, dynamic=False, - **kwargs): - self._instrument_layer_creation() - - # These properties should be set by the user via keyword arguments. 
- # note that 'dtype', 'input_shape' and 'batch_input_shape' - # are only applicable to input layers: do not pass these keywords - # to non-input layers. - allowed_kwargs = { - 'input_dim', 'input_shape', 'batch_input_shape', 'batch_size', - 'weights', 'activity_regularizer', 'autocast', 'implementation' - } - # Validate optional keyword arguments. - generic_utils.validate_kwargs(kwargs, allowed_kwargs) - - # Mutable properties - # Indicates whether the layer's weights are updated during training - # and whether the layer's updates are run during training. - self._trainable = trainable - # A stateful layer is a layer whose updates are run during inference too, - # for instance stateful RNNs. - self._stateful = False - # Indicates whether `build` needs to be called upon layer call, to create - # the layer's weights. - self.built = False - self._build_input_shape = None - # Provides information about which inputs are compatible with the layer. - self._input_spec = None - self.supports_masking = False - - self._init_set_name(name) - self._activity_regularizer = regularizers.get( - kwargs.pop('activity_regularizer', None)) - self._maybe_create_attribute('_trainable_weights', []) - self._maybe_create_attribute('_non_trainable_weights', []) - self._updates = [] - # Object to store all thread local layer properties. - self._thread_local = threading.local() - # A list of zero-argument lambdas which return Tensors, used for variable - # regularizers. - self._callable_losses = [] - # A list of symbolic Tensors containing activity regularizers and losses - # manually added through `add_loss` in graph-building mode. - self._losses = [] - # A list of metric instances corresponding to the symbolic metric tensors - # added using the `add_metric` API. - self._metrics = [] - - # Note that models also have a dtype policy, as they are layers. For - # functional models, the policy is only used in Model.compile, which wraps - # the optimizer with a LossScaleOptimizer if the policy name is - # "mixed_float16". Subclassed models additionally use the policy's compute - # and variable dtypes, as like any ordinary layer. - self._set_dtype_policy(dtype) - # Boolean indicating whether the layer automatically casts its inputs to the - # layer's compute_dtype. - self._autocast = kwargs.get('autocast', - base_layer_utils.v2_dtype_behavior_enabled()) - - # Dependencies tracked via attribute assignment. - # All layers in order of horizontal graph traversal. - # Entries are unique. For models includes input and output layers. - self._maybe_create_attribute('_self_tracked_trackables', []) - - # These lists will be filled via successive calls - # to self._add_inbound_node(). - # Used in symbolic mode only, only in conjunction with graph-networks - self._inbound_nodes_value = [] - self._outbound_nodes_value = [] - - self._init_call_fn_args() - - # Whether the `call` method can be used to build a TF graph without issues. - # This attribute has no effect if the model is created using the Functional - # API. Instead, `model.dynamic` is determined based on the internal layers. - self._dynamic = dynamic - - # Manage input shape information if passed. - if 'input_dim' in kwargs and 'input_shape' not in kwargs: - # Backwards compatibility: alias 'input_dim' to 'input_shape'. 
-      kwargs['input_shape'] = (kwargs['input_dim'],)
-    if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
-      # In this case we will later create an input layer
-      # to insert before the current layer
-      if 'batch_input_shape' in kwargs:
-        batch_input_shape = tuple(kwargs['batch_input_shape'])
-      elif 'input_shape' in kwargs:
-        if 'batch_size' in kwargs:
-          batch_size = kwargs['batch_size']
-        else:
-          batch_size = None
-        batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
-      self._batch_input_shape = batch_input_shape
-
-    # Manage initial weight values if passed.
-    self._initial_weights = kwargs.get('weights', None)
-
-    # Whether the layer will track any layers that is set as attribute on itself
-    # as sub-layers, the weights from the sub-layers will be included in the
-    # parent layer's variables() as well.
-    # Default to True, which means auto tracking is turned on. Certain subclass
-    # might want to turn it off, like Sequential model.
-    self._auto_track_sub_layers = True
+    """Base layer class.
 
-    # Mark this layer as having been originally built as a tf1 layer/model
-    self._originally_built_as_v1 = True
+    This is the class from which all layers inherit.
 
-    # For backwards compat reasons, most built-in layers do not guarantee
-    # That they will 100% preserve the structure of input args when saving
-    # / loading configs. E.g. they may un-nest an arg that is
-    # a list with one element.
-    self._preserve_input_structure_in_config = False
+    A layer is a class implementing common neural network operations, such
+    as convolution, batch norm, etc. These operations require managing weights,
+    losses, updates, and inter-layer connectivity.
 
-  @tf.__internal__.tracking.no_automatic_dependency_tracking
-  @generic_utils.default
-  def build(self, input_shape):
-    """Creates the variables of the layer (optional, for subclass implementers).
+    Users will just instantiate a layer and then treat it as a callable.
 
-    This is a method that implementers of subclasses of `Layer` or `Model`
-    can override if they need a state-creation step in-between
-    layer instantiation and layer call.
+    We recommend that descendants of `Layer` implement the following methods:
 
-    This is typically used to create the weights of `Layer` subclasses.
+    * `__init__()`: Save configuration in member variables
+    * `build()`: Called once from `__call__`, when we know the shapes of inputs
+      and `dtype`. Should have the calls to `add_weight()`, and then
+      call the super's `build()` (which sets `self.built = True`, which is
+      nice in case the user wants to call `build()` manually before the
+      first `__call__`).
+    * `call()`: Called in `__call__` after making sure `build()` has been called
+      once. Should actually perform the logic of applying the layer to the
+      input tensors (which should be passed in as the first argument).
 
     Args:
-      input_shape: Instance of `TensorShape`, or list of instances of
-        `TensorShape` if the layer expects a list of inputs
-        (one instance per input).
+      trainable: Boolean, whether the layer's variables should be trainable.
+      name: String name of the layer.
+      dtype: The dtype of the layer's computations and weights (default of
+        `None` means use `tf.keras.backend.floatx` in TensorFlow 2, or the type
+        of the first input in TensorFlow 1).
+      dynamic: Set this to `True` if your layer should only be run eagerly, and
+        should not be used to generate a static computation graph.
+ This would be the case for a Tree-RNN or a recursive network, + for example, or generally for any layer that manipulates tensors + using Python control flow. If `False`, we assume that the layer can + safely be used to generate a static computation graph. + + Attributes: + name: The name of the layer (string). + dtype: The dtype of the layer's computations and weights. If mixed + precision is used with a `tf.keras.mixed_precision.Policy`, this is + instead just the dtype of the layer's weights, as the computations are + done in a different dtype. + updates: List of update ops of this layer. + losses: List of losses added by this layer. + trainable_weights: List of variables to be included in backprop. + non_trainable_weights: List of variables that should not be + included in backprop. + weights: The concatenation of the lists trainable_weights and + non_trainable_weights (in this order). + trainable: Whether the layer should be trained (boolean). + input_spec: Optional (list of) `InputSpec` object(s) specifying the + constraints on inputs that can be accepted by the layer. + + Each layer has a dtype, which is typically the dtype of the layer's + computations and variables. A layer's dtype can be queried via the + `Layer.dtype` property. The dtype is specified with the `dtype` constructor + argument. In TensorFlow 2, the dtype defaults to `tf.keras.backend.floatx()` + if no dtype is passed. `floatx()` itself defaults to "float32". + Additionally, layers will cast their inputs to the layer's dtype in + TensorFlow 2. When mixed precision is used, layers may have different + computation and variable dtypes. See `tf.keras.mixed_precision.Policy` for + details on layer dtypes. """ - if not hasattr(self.build, '_is_default'): - self._build_input_shape = input_shape - self.built = True - @doc_controls.for_subclass_implementers - def call(self, inputs, **kwargs): # pylint: disable=unused-argument - """This is where the layer's logic lives. - - Args: - inputs: Input tensor, or list/tuple of input tensors. - **kwargs: Additional keyword arguments. - - Returns: - A tensor or list/tuple of tensors. - """ - return inputs - - @doc_controls.for_subclass_implementers - def _add_trackable(self, trackable_object, trainable): - """Adds a Trackable object to this layer's state. - - Args: - trackable_object: The tf.tracking.Trackable object to add. - trainable: Boolean, whether the variable should be part of the layer's - "trainable_variables" (e.g. variables, biases) or - "non_trainable_variables" (e.g. BatchNorm mean and variance). - - Returns: - The TrackableWeightHandler used to track this object. - """ - if isinstance(trackable_object, base_layer_utils.TrackableWeightHandler): - handler = trackable_object - else: - handler = base_layer_utils.TrackableWeightHandler(trackable_object) - if trainable: - self._trainable_weights.append(handler) - else: - self._non_trainable_weights.append(handler) - return handler - - @doc_controls.for_subclass_implementers - def add_weight(self, - name=None, - shape=None, - dtype=None, - initializer=None, - regularizer=None, - trainable=None, - constraint=None, - partitioner=None, - use_resource=None, - synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.compat.v1.VariableAggregation.NONE, - **kwargs): - """Adds a new variable to the layer. + # See tf.Module for the usage of this property. The key for + # _obj_reference_counts_dict is a Trackable, which could be a variable or + # layer etc. 
tf.Module._flatten will fail to flatten the key since it is
+    # trying to convert Trackable to a string. This attribute can be ignored
+    # even after the fix of nest lib, since the trackable object should already
+    # be available as individual attributes. _obj_reference_counts_dict just
+    # contains a copy of them.
+    _TF_MODULE_IGNORED_PROPERTIES = frozenset(
+        itertools.chain(
+            ("_obj_reference_counts_dict",),
+            tf.Module._TF_MODULE_IGNORED_PROPERTIES,
+        )
+    )
+
+    @tf.__internal__.tracking.no_automatic_dependency_tracking
+    def __init__(
+        self, trainable=True, name=None, dtype=None, dynamic=False, **kwargs
+    ):
+        self._instrument_layer_creation()
+
+        # These properties should be set by the user via keyword arguments.
+        # Note that 'dtype', 'input_shape' and 'batch_input_shape'
+        # are only applicable to input layers: do not pass these keywords
+        # to non-input layers.
+        allowed_kwargs = {
+            "input_dim",
+            "input_shape",
+            "batch_input_shape",
+            "batch_size",
+            "weights",
+            "activity_regularizer",
+            "autocast",
+            "implementation",
+        }
+        # Validate optional keyword arguments.
+        generic_utils.validate_kwargs(kwargs, allowed_kwargs)
+
+        # Mutable properties
+        # Indicates whether the layer's weights are updated during training
+        # and whether the layer's updates are run during training.
+        self._trainable = trainable
+        # A stateful layer is a layer whose updates are run during inference
+        # too, for instance stateful RNNs.
+        self._stateful = False
+        # Indicates whether `build` needs to be called upon layer call, to
+        # create the layer's weights.
+        self.built = False
+        self._build_input_shape = None
+        # Provides information about which inputs are compatible with the layer.
+        self._input_spec = None
+        self.supports_masking = False
+
+        self._init_set_name(name)
+        self._activity_regularizer = regularizers.get(
+            kwargs.pop("activity_regularizer", None)
+        )
+        self._maybe_create_attribute("_trainable_weights", [])
+        self._maybe_create_attribute("_non_trainable_weights", [])
+        self._updates = []
+        # Object to store all thread local layer properties.
+        self._thread_local = threading.local()
+        # A list of zero-argument lambdas which return Tensors, used for
+        # variable regularizers.
+        self._callable_losses = []
+        # A list of symbolic Tensors containing activity regularizers and losses
+        # manually added through `add_loss` in graph-building mode.
+        self._losses = []
+        # A list of metric instances corresponding to the symbolic metric
+        # tensors added using the `add_metric` API.
+        self._metrics = []
+
+        # Note that models also have a dtype policy, as they are layers. For
+        # functional models, the policy is only used in Model.compile, which
+        # wraps the optimizer with a LossScaleOptimizer if the policy name is
+        # "mixed_float16". Subclassed models additionally use the policy's
+        # compute and variable dtypes, like any ordinary layer.
+        self._set_dtype_policy(dtype)
+        # Boolean indicating whether the layer automatically casts its inputs to
+        # the layer's compute_dtype.
+        self._autocast = kwargs.get(
+            "autocast", base_layer_utils.v2_dtype_behavior_enabled()
+        )
+
+        # Dependencies tracked via attribute assignment.
+        # All layers in order of horizontal graph traversal.
+        # Entries are unique. For models, this includes input and output layers.
+        self._maybe_create_attribute("_self_tracked_trackables", [])
+
+        # These lists will be filled via successive calls
+        # to self._add_inbound_node().
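The `generic_utils.validate_kwargs` call earlier in this `__init__` is a plain allow-list check; it is what surfaces as a `TypeError` when a misspelled keyword reaches a layer constructor. A standalone sketch of the same pattern (illustrative only — the real Keras helper may differ in details):

```python
def validate_kwargs(
    kwargs, allowed_kwargs, error_message="Keyword argument not understood:"
):
    """Reject any keyword argument outside the allow-list."""
    for kwarg in kwargs:
        if kwarg not in allowed_kwargs:
            raise TypeError(error_message, kwarg)

validate_kwargs({"input_shape": (4,)}, {"input_shape", "batch_size"})  # ok
# validate_kwargs({"input_shpae": (4,)}, {"input_shape"})  # TypeError (typo)
```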
+        # Used in symbolic mode only, only in conjunction with graph-networks
+        self._inbound_nodes_value = []
+        self._outbound_nodes_value = []
+
+        self._init_call_fn_args()
+
+        # Whether the `call` method can be used to build a TF graph without
+        # issues. This attribute has no effect if the model is created using
+        # the Functional API. Instead, `model.dynamic` is determined based on
+        # the internal layers.
+        self._dynamic = dynamic
+
+        # Manage input shape information if passed.
+        if "input_dim" in kwargs and "input_shape" not in kwargs:
+            # Backwards compatibility: alias 'input_dim' to 'input_shape'.
+            kwargs["input_shape"] = (kwargs["input_dim"],)
+        if "input_shape" in kwargs or "batch_input_shape" in kwargs:
+            # In this case we will later create an input layer
+            # to insert before the current layer
+            if "batch_input_shape" in kwargs:
+                batch_input_shape = tuple(kwargs["batch_input_shape"])
+            elif "input_shape" in kwargs:
+                if "batch_size" in kwargs:
+                    batch_size = kwargs["batch_size"]
+                else:
+                    batch_size = None
+                batch_input_shape = (batch_size,) + tuple(kwargs["input_shape"])
+            self._batch_input_shape = batch_input_shape
+
+        # Manage initial weight values if passed.
+        self._initial_weights = kwargs.get("weights", None)
+
+        # Whether the layer will track any layers that are set as attributes on
+        # itself as sub-layers; the weights from the sub-layers will be included
+        # in the parent layer's variables() as well. Defaults to `True`, which
+        # means auto tracking is turned on. Certain subclasses might want to turn
+        # it off, like the Sequential model.
+        self._auto_track_sub_layers = True
+
+        # Mark this layer as having been originally built as a tf1 layer/model
+        self._originally_built_as_v1 = True
+
+        # For backward compat reasons, most built-in layers do not guarantee
+        # that they will 100% preserve the structure of input args when saving
+        # / loading configs. E.g. they may un-nest an arg that is
+        # a list with one element.
+        self._preserve_input_structure_in_config = False
+
+    @tf.__internal__.tracking.no_automatic_dependency_tracking
+    @generic_utils.default
+    def build(self, input_shape):
+        """Creates the variables of the layer (for subclass implementers).
+
+        This is a method that implementers of subclasses of `Layer` or `Model`
+        can override if they need a state-creation step in-between
+        layer instantiation and layer call.
+
+        This is typically used to create the weights of `Layer` subclasses.
+
+        Args:
+            input_shape: Instance of `TensorShape`, or list of instances of
+                `TensorShape` if the layer expects a list of inputs
+                (one instance per input).
+        """
+        if not hasattr(self.build, "_is_default"):
+            self._build_input_shape = input_shape
+        self.built = True
+
+    @doc_controls.for_subclass_implementers
+    def call(self, inputs, **kwargs):
+        """This is where the layer's logic lives.
+
+        Args:
+            inputs: Input tensor, or list/tuple of input tensors.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            A tensor or list/tuple of tensors.
+        """
+        return inputs
-    Args:
-      name: Variable name.
-      shape: Variable shape. Defaults to scalar if unspecified.
-      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
-      initializer: Initializer instance (callable).
-      regularizer: Regularizer instance (callable).
-      trainable: Boolean, whether the variable should be part of the layer's
-        "trainable_variables" (e.g. variables, biases)
-        or "non_trainable_variables" (e.g. BatchNorm mean and variance).
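The shape bookkeeping in the `__init__` tail above ('input_dim' aliasing plus `_batch_input_shape`) reduces to a few lines. A standalone sketch with a hypothetical helper name, mirroring the logic above:

```python
def resolve_batch_input_shape(kwargs):
    # Mirrors the Layer.__init__ bookkeeping above (sketch, not the real API).
    if "input_dim" in kwargs and "input_shape" not in kwargs:
        # Backwards compatibility: 'input_dim' is an alias for 'input_shape'.
        kwargs["input_shape"] = (kwargs["input_dim"],)
    if "batch_input_shape" in kwargs:
        return tuple(kwargs["batch_input_shape"])
    if "input_shape" in kwargs:
        batch_size = kwargs.get("batch_size")  # None when not given
        return (batch_size,) + tuple(kwargs["input_shape"])
    return None

print(resolve_batch_input_shape({"input_dim": 16}))                  # (None, 16)
print(resolve_batch_input_shape({"input_shape": (16,), "batch_size": 32}))  # (32, 16)
```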
- Note that `trainable` cannot be `True` if `synchronization` - is set to `ON_READ`. - constraint: Constraint instance (callable). - partitioner: Partitioner to be passed to the `Trackable` API. - use_resource: Whether to use `ResourceVariable`. - synchronization: Indicates when a distributed a variable will be - aggregated. Accepted values are constants defined in the class - `tf.VariableSynchronization`. By default the synchronization is set to - `AUTO` and the current `DistributionStrategy` chooses - when to synchronize. If `synchronization` is set to `ON_READ`, - `trainable` must not be set to `True`. - aggregation: Indicates how a distributed variable will be aggregated. - Accepted values are constants defined in the class - `tf.VariableAggregation`. - **kwargs: Additional keyword arguments. Accepted values are `getter`, - `collections`, `experimental_autocast` and `caching_device`. - - Returns: - The created variable. Usually either a `Variable` or `ResourceVariable` - instance. If `partitioner` is not `None`, a `PartitionedVariable` - instance is returned. - - Raises: - RuntimeError: If called with partitioned variable regularization and - eager execution is enabled. - ValueError: When giving unsupported dtype and no initializer or when - trainable has been set to True with synchronization set as `ON_READ`. - """ - if shape is None: - shape = () - # Validate optional keyword arguments. - for kwarg in kwargs: - if kwarg not in ['getter', 'collections', 'experimental_autocast', - 'caching_device']: - raise TypeError('Unknown keyword argument:', kwarg) - has_custom_getter = 'getter' in kwargs - getter = kwargs.pop('getter', base_layer_utils.make_variable) - collections_arg = kwargs.pop('collections', None) - # 'experimental_autocast' can be set to False by the caller to indicate an - # AutoCastVariable should never be created. - autocast = kwargs.pop('experimental_autocast', True) - # See the docstring for tf.Variable about the details for caching_device. - caching_device = kwargs.pop('caching_device', None) - - if dtype is None: - dtype = self.dtype or backend.floatx() - dtype = tf.as_dtype(dtype) - if self._dtype_policy.variable_dtype is None: - # The policy is "_infer", so we infer the policy from the variable dtype. - self._set_dtype_policy(policy.Policy(dtype.base_dtype.name)) - initializer = initializers.get(initializer) - regularizer = regularizers.get(regularizer) - constraint = constraints.get(constraint) - - if synchronization == tf.VariableSynchronization.ON_READ: - if trainable: - raise ValueError( - 'Synchronization value can be set to ' - 'VariableSynchronization.ON_READ only for non-trainable variables. ' - 'You have specified trainable=True and ' - 'synchronization=VariableSynchronization.ON_READ.') - else: - # Set trainable to be false when variable is to be synced on read. - trainable = False - elif trainable is None: - trainable = True - - # Initialize variable when no initializer provided - if initializer is None: - # If dtype is DT_FLOAT, provide a uniform unit scaling initializer - if dtype.is_floating: - initializer = initializers.get('glorot_uniform') - # If dtype is DT_INT/DT_UINT, provide a default value `zero` - # If dtype is DT_BOOL, provide a default value `FALSE` - elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool: - initializer = tf.compat.v1.zeros_initializer() - # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here? 
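The `add_weight` contract spelled out in the docstring above (unchanged by the reformatting below) includes the rule that `synchronization=ON_READ` forces `trainable=False`. A hedged sketch of the typical ON_READ use case — a per-replica accumulator — with a hypothetical `SumTracker` layer, assuming eager TF 2.x:

```python
import tensorflow as tf

class SumTracker(tf.keras.layers.Layer):
    def build(self, input_shape):
        # ON_READ variables must be non-trainable; under tf.distribute each
        # replica keeps its own copy, aggregated (here: summed) when read.
        self.total = self.add_weight(
            name="total",
            shape=(),
            initializer="zeros",
            trainable=False,
            synchronization=tf.VariableSynchronization.ON_READ,
            aggregation=tf.VariableAggregation.SUM,
        )
        super().build(input_shape)

    def call(self, inputs):
        self.total.assign_add(tf.reduce_sum(inputs))
        return inputs

layer = SumTracker()
layer(tf.ones((2, 3)))
print(layer.total.numpy())  # 6.0
```

Passing `trainable=True` together with `ON_READ` would raise the `ValueError` shown in the code below.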
- elif not has_custom_getter: - # When `getter` is specified, it's possibly fine for `initializer` to be - # None since it's up to the custom `getter` to raise error in case it - # indeed needs `initializer`. - raise ValueError('An initializer for variable %s of type %s is required' - ' for layer %s' % (name, dtype.base_dtype, self.name)) - - if (autocast and - self._dtype_policy.compute_dtype != self._dtype_policy.variable_dtype - and dtype.is_floating): - # Wrap 'getter' with a version that returns an AutoCastVariable. - old_getter = getter - def getter(*args, **kwargs): # pylint: disable=function-redefined - variable = old_getter(*args, **kwargs) - return autocast_variable.create_autocast_variable(variable) - # Also the caching_device does not work with the mixed precision API, - # disable it if it is specified. - # TODO(b/142020079): Re-enable it once the bug is fixed. - if caching_device is not None: - tf_logging.warning( - '`caching_device` does not work with mixed precision API. Ignoring ' - 'user specified `caching_device`.') - caching_device = None - - variable = self._add_variable_with_custom_getter( - name=name, - shape=shape, - # TODO(allenl): a `make_variable` equivalent should be added as a - # `Trackable` method. - getter=getter, - # Manage errors in Layer rather than Trackable. - overwrite=True, - initializer=initializer, - dtype=dtype, - constraint=constraint, - trainable=trainable, - partitioner=partitioner, - use_resource=use_resource, - collections=collections_arg, - synchronization=synchronization, - aggregation=aggregation, - caching_device=caching_device) - if regularizer is not None: - # TODO(fchollet): in the future, this should be handled at the - # level of variable creation, and weight regularization losses - # should be variable attributes. - name_in_scope = variable.name[:variable.name.find(':')] - self._handle_weight_regularization(name_in_scope, - variable, - regularizer) - if base_layer_utils.is_split_variable(variable): - for v in variable: - backend.track_variable(v) + @doc_controls.for_subclass_implementers + def _add_trackable(self, trackable_object, trainable): + """Adds a Trackable object to this layer's state. + + Args: + trackable_object: The tf.tracking.Trackable object to add. + trainable: Boolean, whether the variable should be part of the layer's + "trainable_variables" (e.g. variables, biases) or + "non_trainable_variables" (e.g. BatchNorm mean and variance). + + Returns: + The TrackableWeightHandler used to track this object. + """ + if isinstance( + trackable_object, base_layer_utils.TrackableWeightHandler + ): + handler = trackable_object + else: + handler = base_layer_utils.TrackableWeightHandler(trackable_object) if trainable: - self._trainable_weights.append(v) + self._trainable_weights.append(handler) else: - self._non_trainable_weights.append(v) - else: - backend.track_variable(variable) - if trainable: - self._trainable_weights.append(variable) - else: - self._non_trainable_weights.append(variable) - return variable - - @generic_utils.default - def get_config(self): - """Returns the config of the layer. - - A layer config is a Python dictionary (serializable) - containing the configuration of a layer. - The same layer can be reinstantiated later - (without its trained weights) from this configuration. - - The config of a layer does not include connectivity - information, nor the layer class name. These are handled - by `Network` (one layer of abstraction above). - - Returns: - Python dictionary. 
- """ - all_args = tf_inspect.getfullargspec(self.__init__).args - config = {'name': self.name, 'trainable': self.trainable} - if hasattr(self, '_batch_input_shape'): - config['batch_input_shape'] = self._batch_input_shape - config['dtype'] = policy.serialize(self._dtype_policy) - if hasattr(self, 'dynamic'): - # Only include `dynamic` in the `config` if it is `True` - if self.dynamic: - config['dynamic'] = self.dynamic - elif 'dynamic' in all_args: - all_args.remove('dynamic') - expected_args = config.keys() - # Finds all arguments in the `__init__` that are not in the config: - extra_args = [arg for arg in all_args if arg not in expected_args] - # Check that either the only argument in the `__init__` is `self`, - # or that `get_config` has been overridden: - if len(extra_args) > 1 and hasattr(self.get_config, '_is_default'): - raise NotImplementedError('Layers with arguments in `__init__` must ' - 'override `get_config`.') - return config - - @classmethod - def from_config(cls, config): - """Creates a layer from its config. - - This method is the reverse of `get_config`, - capable of instantiating the same layer from the config - dictionary. It does not handle layer connectivity - (handled by Network), nor weights (handled by `set_weights`). - - Args: - config: A Python dictionary, typically the - output of get_config. - - Returns: - A layer instance. - """ - return cls(**config) - - def compute_output_shape(self, input_shape): - """Computes the output shape of the layer. - - If the layer has not been built, this method will call `build` on the - layer. This assumes that the layer will later be used with inputs that - match the input shape provided here. - - Args: - input_shape: Shape tuple (tuple of integers) - or list of shape tuples (one per output tensor of the layer). - Shape tuples can include None for free dimensions, - instead of an integer. - - Returns: - An input shape tuple. - """ - if tf.executing_eagerly(): - # In this case we build the model first in order to do shape inference. - # This is acceptable because the framework only calls - # `compute_output_shape` on shape values that the layer would later be - # built for. It would however cause issues in case a user attempts to - # use `compute_output_shape` manually with shapes that are incompatible - # with the shape the Layer will be called on (these users will have to - # implement `compute_output_shape` themselves). - self._maybe_build(input_shape) - with tf.compat.v1.get_default_graph().as_default(): - graph = tf.__internal__.FuncGraph('graph') - with graph.as_default(): - input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) - inputs = tf.nest.map_structure( - base_layer_utils.generate_placeholders_from_shape, input_shape) - try: - outputs = self(inputs, training=False) - except TypeError as e: + self._non_trainable_weights.append(handler) + return handler + + @doc_controls.for_subclass_implementers + def add_weight( + self, + name=None, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=None, + constraint=None, + partitioner=None, + use_resource=None, + synchronization=tf.VariableSynchronization.AUTO, + aggregation=tf.compat.v1.VariableAggregation.NONE, + **kwargs, + ): + """Adds a new variable to the layer. + + Args: + name: Variable name. + shape: Variable shape. Defaults to scalar if unspecified. + dtype: The type of the variable. Defaults to `self.dtype` or + `float32`. + initializer: Initializer instance (callable). + regularizer: Regularizer instance (callable). 
+ trainable: Boolean, whether the variable should be part of the layer's + "trainable_variables" (e.g. variables, biases) + or "non_trainable_variables" (e.g. BatchNorm mean and variance). + Note that `trainable` cannot be `True` if `synchronization` + is set to `ON_READ`. + constraint: Constraint instance (callable). + partitioner: Partitioner to be passed to the `Trackable` API. + use_resource: Whether to use `ResourceVariable`. + synchronization: Indicates when a distributed variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. By default the synchronization is set + to `AUTO` and the current `DistributionStrategy` chooses when to + synchronize. If `synchronization` is set to `ON_READ`, `trainable` + must not be set to `True`. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + **kwargs: Additional keyword arguments. Accepted values are `getter`, + `collections`, `experimental_autocast` and `caching_device`. + + Returns: + The created variable. Usually either a `Variable` or + `ResourceVariable` instance. If `partitioner` is not `None`, a + `PartitionedVariable` instance is returned. + + Raises: + RuntimeError: If called with partitioned variable regularization and + eager execution is enabled. + ValueError: When giving unsupported dtype and no initializer or when + trainable has been set to True with synchronization set as + `ON_READ`. + """ + if shape is None: + shape = () + # Validate optional keyword arguments. + for kwarg in kwargs: + if kwarg not in [ + "getter", + "collections", + "experimental_autocast", + "caching_device", + ]: + raise TypeError("Unknown keyword argument:", kwarg) + has_custom_getter = "getter" in kwargs + getter = kwargs.pop("getter", base_layer_utils.make_variable) + collections_arg = kwargs.pop("collections", None) + # 'experimental_autocast' can be set to False by the caller to indicate + # an AutoCastVariable should never be created. + autocast = kwargs.pop("experimental_autocast", True) + # See the docstring for tf.Variable about the details for + # caching_device. + caching_device = kwargs.pop("caching_device", None) + + if dtype is None: + dtype = self.dtype or backend.floatx() + dtype = tf.as_dtype(dtype) + if self._dtype_policy.variable_dtype is None: + # The policy is "_infer", so we infer the policy from the variable + # dtype. + self._set_dtype_policy(policy.Policy(dtype.base_dtype.name)) + initializer = initializers.get(initializer) + regularizer = regularizers.get(regularizer) + constraint = constraints.get(constraint) + + if synchronization == tf.VariableSynchronization.ON_READ: + if trainable: + raise ValueError( + "Synchronization value can be set to " + "VariableSynchronization.ON_READ only for non-trainable " + "variables. You have specified trainable=True and " + "synchronization=VariableSynchronization.ON_READ." + ) + else: + # Set trainable to be false when the variable is to be synced on + # read. 
+                trainable = False
+        elif trainable is None:
+            trainable = True
+
+        # Initialize variable when no initializer provided
+        if initializer is None:
+            # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
+            if dtype.is_floating:
+                initializer = initializers.get("glorot_uniform")
+            # If dtype is DT_INT/DT_UINT, provide a default value `zero`
+            # If dtype is DT_BOOL, provide a default value `FALSE`
+            elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
+                initializer = tf.compat.v1.zeros_initializer()
+            # NOTE: Do we need to support handling DT_STRING and DT_COMPLEX
+            # here?
+            elif not has_custom_getter:
+                # When `getter` is specified, it's possibly fine for
+                # `initializer` to be None since it's up to the custom `getter`
+                # to raise an error in case it indeed needs `initializer`.
+                raise ValueError(
+                    "An initializer for variable %s of type %s is required"
+                    " for layer %s" % (name, dtype.base_dtype, self.name)
+                )
+
+        if (
+            autocast
+            and self._dtype_policy.compute_dtype
+            != self._dtype_policy.variable_dtype
+            and dtype.is_floating
+        ):
+            # Wrap 'getter' with a version that returns an AutoCastVariable.
+            old_getter = getter
+
+            def getter(*args, **kwargs):
+                variable = old_getter(*args, **kwargs)
+                return autocast_variable.create_autocast_variable(variable)
+
+            # Also, the caching_device does not work with the mixed precision
+            # API; disable it if it is specified.
+            # TODO(b/142020079): Re-enable it once the bug is fixed.
+            if caching_device is not None:
+                tf_logging.warning(
+                    "`caching_device` does not work with mixed precision API. "
+                    "Ignoring user specified `caching_device`."
+                )
+                caching_device = None
+
+        variable = self._add_variable_with_custom_getter(
+            name=name,
+            shape=shape,
+            # TODO(allenl): a `make_variable` equivalent should be added as a
+            # `Trackable` method.
+            getter=getter,
+            # Manage errors in Layer rather than Trackable.
+            overwrite=True,
+            initializer=initializer,
+            dtype=dtype,
+            constraint=constraint,
+            trainable=trainable,
+            partitioner=partitioner,
+            use_resource=use_resource,
+            collections=collections_arg,
+            synchronization=synchronization,
+            aggregation=aggregation,
+            caching_device=caching_device,
+        )
+        if regularizer is not None:
+            # TODO(fchollet): in the future, this should be handled at the
+            # level of variable creation, and weight regularization losses
+            # should be variable attributes.
+            name_in_scope = variable.name[: variable.name.find(":")]
+            self._handle_weight_regularization(
+                name_in_scope, variable, regularizer
+            )
+        if base_layer_utils.is_split_variable(variable):
+            for v in variable:
+                backend.track_variable(v)
+                if trainable:
+                    self._trainable_weights.append(v)
+                else:
+                    self._non_trainable_weights.append(v)
+        else:
+            backend.track_variable(variable)
+            if trainable:
+                self._trainable_weights.append(variable)
+            else:
+                self._non_trainable_weights.append(variable)
+        return variable
+
+    @generic_utils.default
+    def get_config(self):
+        """Returns the config of the layer.
+
+        A layer config is a Python dictionary (serializable)
+        containing the configuration of a layer.
+        The same layer can be reinstantiated later
+        (without its trained weights) from this configuration.
+
+        The config of a layer does not include connectivity
+        information, nor the layer class name. These are handled
+        by `Network` (one layer of abstraction above).
+
+        Returns:
+            Python dictionary.
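The `AutoCastVariable` wrapping above is what makes mixed precision work at the variable level: weights stay in the variable dtype, and reads inside `call` are cast to the compute dtype. A small sketch of the observable behavior (assumes TF ≥ 2.4, where `tf.keras.mixed_precision.Policy` is public API):

```python
import tensorflow as tf

# Layers built under this policy keep float32 variables but compute in
# float16; add_weight wraps the kernel in an AutoCastVariable.
policy = tf.keras.mixed_precision.Policy("mixed_float16")
dense = tf.keras.layers.Dense(8, dtype=policy)
dense.build((None, 4))

print(dense.dtype)                   # "float32" (the variable dtype)
print(dense.compute_dtype)           # "float16" (dtype used inside call)
print(dense(tf.ones((2, 4))).dtype)  # tf.float16
```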
+ """ + all_args = tf_inspect.getfullargspec(self.__init__).args + config = {"name": self.name, "trainable": self.trainable} + if hasattr(self, "_batch_input_shape"): + config["batch_input_shape"] = self._batch_input_shape + config["dtype"] = policy.serialize(self._dtype_policy) + if hasattr(self, "dynamic"): + # Only include `dynamic` in the `config` if it is `True` + if self.dynamic: + config["dynamic"] = self.dynamic + elif "dynamic" in all_args: + all_args.remove("dynamic") + expected_args = config.keys() + # Finds all arguments in the `__init__` that are not in the config: + extra_args = [arg for arg in all_args if arg not in expected_args] + # Check that either the only argument in the `__init__` is `self`, + # or that `get_config` has been overridden: + if len(extra_args) > 1 and hasattr(self.get_config, "_is_default"): raise NotImplementedError( - 'We could not automatically infer the static shape of the ' - 'layer\'s output. Please implement the ' - '`compute_output_shape` method on your layer (%s).' % - self.__class__.__name__) from e - return tf.nest.map_structure(lambda t: t.shape, outputs) - raise NotImplementedError - - @doc_controls.for_subclass_implementers - def compute_output_signature(self, input_signature): - """Compute the output tensor signature of the layer based on the inputs. - - Unlike a TensorShape object, a TensorSpec object contains both shape - and dtype information for a tensor. This method allows layers to provide - output dtype information if it is different from the input dtype. - For any layer that doesn't implement this function, - the framework will fall back to use `compute_output_shape`, and will - assume that the output dtype matches the input dtype. - - Args: - input_signature: Single TensorSpec or nested structure of TensorSpec - objects, describing a candidate input for the layer. - - Returns: - Single TensorSpec or nested structure of TensorSpec objects, describing - how the layer would transform the provided input. - - Raises: - TypeError: If input_signature contains a non-TensorSpec object. - """ - def check_type_return_shape(s): - if not isinstance(s, tf.TensorSpec): - raise TypeError('Only TensorSpec signature types are supported, ' - 'but saw signature entry: {}.'.format(s)) - return s.shape - input_shape = tf.nest.map_structure(check_type_return_shape, input_signature) - output_shape = self.compute_output_shape(input_shape) - dtype = self._compute_dtype - if dtype is None: - input_dtypes = [s.dtype for s in tf.nest.flatten(input_signature)] - # Default behavior when self.dtype is None, is to use the first input's - # dtype. - dtype = input_dtypes[0] - return tf.nest.map_structure( - lambda s: tf.TensorSpec(dtype=dtype, shape=s), - output_shape) - - @generic_utils.default - def compute_mask(self, inputs, mask=None): # pylint: disable=unused-argument - """Computes an output mask tensor. + "Layers with arguments in `__init__` must " + "override `get_config`." + ) + return config + + @classmethod + def from_config(cls, config): + """Creates a layer from its config. + + This method is the reverse of `get_config`, + capable of instantiating the same layer from the config + dictionary. It does not handle layer connectivity + (handled by Network), nor weights (handled by `set_weights`). + + Args: + config: A Python dictionary, typically the + output of get_config. + + Returns: + A layer instance. + """ + return cls(**config) + + def compute_output_shape(self, input_shape): + """Computes the output shape of the layer. 
+ + If the layer has not been built, this method will call `build` on the + layer. This assumes that the layer will later be used with inputs that + match the input shape provided here. + + Args: + input_shape: Shape tuple (tuple of integers) + or list of shape tuples (one per output tensor of the layer). + Shape tuples can include None for free dimensions, + instead of an integer. + + Returns: + An input shape tuple. + """ + if tf.executing_eagerly(): + # In this case we build the model first in order to do shape + # inference. This is acceptable because the framework only calls + # `compute_output_shape` on shape values that the layer would later + # be built for. It would however cause issues in case a user + # attempts to use `compute_output_shape` manually with shapes that + # are incompatible with the shape the Layer will be called on (these + # users will have to implement `compute_output_shape` themselves). + self._maybe_build(input_shape) + with tf.compat.v1.get_default_graph().as_default(): + graph = tf.__internal__.FuncGraph("graph") + with graph.as_default(): + input_shape = tf_utils.convert_shapes( + input_shape, to_tuples=False + ) + inputs = tf.nest.map_structure( + base_layer_utils.generate_placeholders_from_shape, + input_shape, + ) + try: + outputs = self(inputs, training=False) + except TypeError as e: + raise NotImplementedError( + "We could not automatically infer the static " + "shape of the layer's output. Please implement the " + "`compute_output_shape` method on your layer (%s)." + % self.__class__.__name__ + ) from e + return tf.nest.map_structure(lambda t: t.shape, outputs) + raise NotImplementedError + + @doc_controls.for_subclass_implementers + def compute_output_signature(self, input_signature): + """Compute the output tensor signature of the layer based on the inputs. + + Unlike a TensorShape object, a TensorSpec object contains both shape + and dtype information for a tensor. This method allows layers to provide + output dtype information if it is different from the input dtype. + For any layer that doesn't implement this function, + the framework will fall back to use `compute_output_shape`, and will + assume that the output dtype matches the input dtype. + + Args: + input_signature: Single TensorSpec or nested structure of TensorSpec + objects, describing a candidate input for the layer. + + Returns: + Single TensorSpec or nested structure of TensorSpec objects, + describing how the layer would transform the provided input. + + Raises: + TypeError: If input_signature contains a non-TensorSpec object. + """ + + def check_type_return_shape(s): + if not isinstance(s, tf.TensorSpec): + raise TypeError( + "Only TensorSpec signature types are supported, " + "but saw signature entry: {}.".format(s) + ) + return s.shape + + input_shape = tf.nest.map_structure( + check_type_return_shape, input_signature + ) + output_shape = self.compute_output_shape(input_shape) + dtype = self._compute_dtype + if dtype is None: + input_dtypes = [s.dtype for s in tf.nest.flatten(input_signature)] + # Default behavior when self.dtype is None, is to use the first + # input's dtype. + dtype = input_dtypes[0] + return tf.nest.map_structure( + lambda s: tf.TensorSpec(dtype=dtype, shape=s), output_shape + ) + + @generic_utils.default + def compute_mask(self, inputs, mask=None): + """Computes an output mask tensor. + + Args: + inputs: Tensor or list of tensors. + mask: Tensor or list of tensors. + + Returns: + None or a tensor (or list of tensors, + one per output tensor of the layer). 
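The eager branch of `compute_output_shape` above infers shapes by actually building the layer and tracing a call; as the error message notes, layers for which that tracing fails should implement the method themselves. A sketch with a hypothetical `Concat2x` layer:

```python
import tensorflow as tf

class Concat2x(tf.keras.layers.Layer):
    """Concatenates the input with itself along the last axis."""

    def call(self, inputs):
        return tf.concat([inputs, inputs], axis=-1)

    def compute_output_shape(self, input_shape):
        # Static shape inference without running the layer.
        input_shape = tf.TensorShape(input_shape).as_list()
        return tf.TensorShape(input_shape[:-1] + [input_shape[-1] * 2])

print(Concat2x().compute_output_shape((None, 3)))  # (None, 6)
```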
+ """ + if not self.supports_masking: + if any(m is not None for m in tf.nest.flatten(mask)): + raise TypeError( + "Layer " + self.name + " does not support masking, " + "but was passed an input_mask: " + str(mask) + ) + # masking not explicitly supported: return None as mask. + return None + # if masking is explicitly supported, by default + # carry over the input mask + return mask + + def __call__(self, *args, **kwargs): + """Wraps `call`, applying pre- and post-processing steps. + + Args: + *args: Positional arguments to be passed to `self.call`. + **kwargs: Keyword arguments to be passed to `self.call`. + + Returns: + Output tensor(s). + + Note: + - The following optional keyword arguments are reserved for specific + uses: + * `training`: Boolean scalar tensor of Python boolean indicating + whether the `call` is meant for training or inference. + * `mask`: Boolean input mask. + - If the layer's `call` method takes a `mask` argument (as some Keras + layers do), its default value will be set to the mask generated + for `inputs` by the previous layer (if `input` did come from + a layer that generated a corresponding mask, i.e. if it came from + a Keras layer with masking support. + + Raises: + ValueError: if the layer's `call` method returns None (an invalid + value). + RuntimeError: if `super().__init__()` was not called in the + constructor. + """ + self._assert_built_as_v1() + + if not hasattr(self, "_thread_local"): + raise RuntimeError( + "You must call `super().__init__()` in the layer constructor." + ) + + # Grab the first positional or keyword argument. + if args: + inputs = args[0] + args = args[1:] + elif self._call_spec.arg_names[0] in kwargs: + inputs = kwargs.pop(self._call_spec.arg_names[0]) + else: + raise ValueError( + "The first argument to `Layer.call` must always be passed." + ) + + call_context = base_layer_utils.call_context() + input_list = tf.nest.flatten(inputs) + + # We will attempt to build a TF graph if & only if all inputs are + # symbolic. This is always the case in graph mode. It can also be the + # case in eager mode when all inputs can be traced back to + # `keras.Input()` (when building models using the functional API). + build_graph = tf_utils.are_all_symbolic_tensors(input_list) + + # Accept NumPy and scalar inputs by converting to Tensors. + if any(isinstance(x, (np.ndarray, float, int)) for x in input_list): + + def _convert_non_tensor(x): + # Don't call `ops.convert_to_tensor` on all `inputs` because + # `SparseTensors` can't be converted to `Tensor`. + if isinstance(x, (np.ndarray, float, int)): + return tf.convert_to_tensor(x) + return x + + inputs = tf.nest.map_structure(_convert_non_tensor, inputs) + input_list = tf.nest.flatten(inputs) + + # Handle `mask` propagation from previous layer to current layer. Masks + # can be propagated explicitly via the `mask` argument, or implicitly + # via setting the `_keras_mask` attribute on the inputs to a Layer. + # Masks passed explicitly take priority. + mask_arg_passed_by_framework = False + input_masks = self._collect_input_masks(inputs, args, kwargs) + if ( + self._expects_mask_arg + and input_masks is not None + and not self._call_spec.arg_was_passed("mask", args, kwargs) + ): + mask_arg_passed_by_framework = True + kwargs["mask"] = input_masks + + # If `training` argument is None or not explicitly passed, + # propagate `training` value from this layer's calling layer. + training_value = None + training_arg_passed_by_framework = False + # Priority 1: `training` was explicitly passed. 
+ if self._call_spec.arg_was_passed("training", args, kwargs): + training_value = self._call_spec.get_arg_value( + "training", args, kwargs + ) + if not self._expects_training_arg: + kwargs.pop("training") + + if training_value is None: + # Priority 2: `training` was passed to a parent layer. + if call_context.training is not None: + training_value = call_context.training + # Priority 3a: `learning_phase()` has been set. + elif backend.global_learning_phase_is_set(): + training_value = backend.learning_phase() + # Priority 3b: Pass the `learning_phase()` if in the Keras + # FuncGraph. + elif build_graph: + with backend.get_graph().as_default(): + if base_layer_utils.is_in_keras_graph(): + training_value = backend.learning_phase() + + if self._expects_training_arg and training_value is not None: + # Force the training_value to be bool type which matches to the + # contract for layer/model call args. + if tf.is_tensor(training_value): + training_value = tf.cast(training_value, tf.bool) + else: + training_value = bool(training_value) + args, kwargs = self._call_spec.set_arg_value( + "training", training_value, args, kwargs + ) + training_arg_passed_by_framework = True + + # Only create Keras history if at least one tensor originates from a + # `keras.Input`. Otherwise this Layer may be being used outside the + # Keras framework. + if build_graph and base_layer_utils.needs_keras_history(inputs): + base_layer_utils.create_keras_history(inputs) + + with call_context.enter(self, inputs, build_graph, training_value): + # Check input assumptions set after layer building, e.g. input + # shape. + if build_graph: + # Symbolic execution on symbolic tensors. We will attempt to + # build the corresponding TF subgraph inside + # `backend.get_graph()` + input_spec.assert_input_compatibility( + self.input_spec, inputs, self.name + ) + graph = backend.get_graph() + with graph.as_default(), backend.name_scope(self._name_scope()): + # Build layer if applicable (if the `build` method has been + # overridden). + self._maybe_build(inputs) + cast_inputs = self._maybe_cast_inputs(inputs) + + # Wrapping `call` function in autograph to allow for dynamic + # control flow and control dependencies in call. We are + # limiting this to subclassed layers as autograph is + # strictly needed only for subclassed layers and models. + # tf_convert will respect the value of autograph setting in + # the enclosing tf.function, if any. + if base_layer_utils.is_subclassed( + self + ) and not base_layer_utils.from_saved_model(self): + call_fn = tf.__internal__.autograph.tf_convert( + self.call, + tf.__internal__.autograph.control_status_ctx(), + ) + else: + call_fn = self.call + + if not self.dynamic: + try: + with autocast_variable.enable_auto_cast_variables( + self._compute_dtype_object + ): + outputs = call_fn(cast_inputs, *args, **kwargs) + + except tf.errors.OperatorNotAllowedInGraphError as e: + raise TypeError( + "You are attempting to use Python control " + "flow in a layer that was not declared to be " + "dynamic. Pass `dynamic=True` to the class " + 'constructor.\nEncountered error:\n"""\n' + + str(e) + + '\n"""' + ) + else: + # We will use static shape inference to return symbolic + # tensors matching the specifications of the layer + # outputs. Since `self.dynamic` is True, we will never + # attempt to run the underlying TF graph (which is + # disconnected). + # TODO(fchollet): consider py_func as an alternative, + # which would enable us to run the underlying graph if + # needed. 
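Given the `training` priority chain above, a user-defined layer normally just declares `training=None` and lets the framework fill it in from the calling context. A sketch with a hypothetical `NoisyLayer`, assuming eager TF 2.x:

```python
import tensorflow as tf

class NoisyLayer(tf.keras.layers.Layer):
    """Adds noise only in training mode."""

    def call(self, inputs, training=None):
        # When `training` isn't passed explicitly, the framework resolves it
        # (explicit arg, then parent layer, then the learning phase).
        if training:
            return inputs + tf.random.normal(tf.shape(inputs), stddev=0.1)
        return inputs

layer = NoisyLayer()
x = tf.zeros((2, 3))
print(tf.reduce_all(layer(x, training=False) == 0).numpy())  # True
print(tf.reduce_all(layer(x, training=True) == 0).numpy())   # False
```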
+                        outputs = self._symbolic_call(inputs)
+
+                    if outputs is None:
+                        raise ValueError(
+                            "A layer's `call` method should return a "
+                            "Tensor or a list of Tensors, not None "
+                            "(layer: " + self.name + ")."
+                        )
+                    if base_layer_utils.have_all_keras_metadata(inputs):
+                        if training_arg_passed_by_framework:
+                            args, kwargs = self._call_spec.set_arg_value(
+                                "training",
+                                None,
+                                args,
+                                kwargs,
+                                pop_kwarg_if_none=True,
+                            )
+                        if mask_arg_passed_by_framework:
+                            kwargs.pop("mask")
+                        outputs = self._set_connectivity_metadata(
+                            (inputs,) + args, kwargs, outputs
+                        )
+                    self._handle_activity_regularization(inputs, outputs)
+                    self._set_mask_metadata(inputs, outputs, input_masks)
+                    if hasattr(self, "_set_inputs") and not self.inputs:
+                        # Subclassed network: explicitly set metadata normally
+                        # set by a call to self._set_inputs().
+                        # TODO(b/120997007): This should be done in Eager as
+                        # well, but causes garbage collection issues because of
+                        # the placeholders created on the default Keras graph.
+                        self._set_save_spec(inputs, args, kwargs)
+                        self._set_inputs(inputs, outputs)
+            else:
+                # Eager execution on data tensors.
+                with backend.name_scope(self._name_scope()):
+                    self._maybe_build(inputs)
+                    cast_inputs = self._maybe_cast_inputs(inputs)
+                    with autocast_variable.enable_auto_cast_variables(
+                        self._compute_dtype_object
+                    ):
+                        outputs = self.call(cast_inputs, *args, **kwargs)
+                    self._handle_activity_regularization(inputs, outputs)
+                    self._set_mask_metadata(inputs, outputs, input_masks)
+
+        return outputs
+
+    def _assert_built_as_v1(self):
+        if not hasattr(self, "_originally_built_as_v1"):
+            raise ValueError(
+                "Your Layer or Model is in an invalid state. "
+                "This can happen for the following cases:\n "
+                "1. You might be interleaving estimator/non-estimator models "
+                "or interleaving models/layers made in "
+                "tf.compat.v1.Graph.as_default() with models/layers created "
+                "outside of it. "
+                "Converting a model to an estimator (via model_to_estimator) "
+                "invalidates all models/layers made before the conversion "
+                "(even if they were not the model converted to an estimator). "
+                "Similarly, making a layer or a model inside "
+                "a tf.compat.v1.Graph invalidates all layers/models you "
+                "previously made outside of the graph.\n"
+                "2. You might be using a custom keras layer implementation "
+                "with custom __init__ which didn't call super().__init__. "
+                "Please check the implementation of %s and its bases."
+ % (type(self),) + ) + + @property + def dtype(self): + return self._dtype_policy.variable_dtype + + @property + def name(self): + return self._name + + @property + def dynamic(self): + return any(layer._dynamic for layer in self._flatten_layers()) + + @property + @doc_controls.do_not_generate_docs + def stateful(self): + return any(layer._stateful for layer in self._flatten_layers()) + + @stateful.setter + def stateful(self, value): + self._stateful = value + + @property + def trainable(self): + return self._trainable + + @trainable.setter + def trainable(self, value): + self._trainable = value + for layer in getattr(self, "_self_tracked_trackables", []): + layer.trainable = value + + @property + def activity_regularizer(self): + """Optional regularizer function for the output of this layer.""" + return self._activity_regularizer + + @activity_regularizer.setter + def activity_regularizer(self, regularizer): + """Optional regularizer function for the output of this layer.""" + self._activity_regularizer = regularizer + + @property + def input_spec(self): + return self._input_spec + + @input_spec.setter + # Must be decorated to prevent tracking, since the input_spec can be nested + # InputSpec objects. + @tf.__internal__.tracking.no_automatic_dependency_tracking + def input_spec(self, value): + for v in tf.nest.flatten(value): + if v is not None and "InputSpec" not in v.__class__.__name__: + raise TypeError( + "Layer input_spec must be an instance of InputSpec. " + "Got: {}".format(v) + ) + self._input_spec = value + + @property + def updates(self): + collected_updates = [] + all_layers = self._flatten_layers() + with backend.get_graph().as_default(): + for layer in all_layers: + if not layer.trainable and not layer.stateful: + continue + for u in layer._updates: + if callable(u): + try: + u = u() + except ValueError as e: + if "InaccessibleTensorError" in type(e).__name__: + # For one specific case of error we try to raise + # a more meaningful error message about the + # graph if we can. This error is an internal TF + # symbol that is not publicly exposed, so we + # check the name directly rather than using a + # direct import. + base_layer_utils.check_graph_consistency( + method="add_update", force_raise=True + ) + # check_graph_consistency may not always raise. + raise + base_layer_utils.check_graph_consistency( + u, method="add_update" + ) + collected_updates.append(u) + return collected_updates + + @property + def losses(self): + """Losses which are associated with this `Layer`. + + Variable regularization tensors are created when this property is + accessed, so it is eager safe: accessing `losses` under a + `tf.GradientTape` will propagate gradients back to the corresponding + variables. + + Returns: + A list of tensors. + """ + collected_losses = [] + all_layers = self._flatten_layers() + for layer in all_layers: + # If any eager losses are present, we assume the model to be part of + # an eager training loop (either a custom one or the one used when + # `run_eagerly=True`) and so we always return just the eager losses. + collected_losses.extend(layer._losses) + for regularizer in layer._callable_losses: + loss_tensor = regularizer() + if loss_tensor is not None: + collected_losses.append(loss_tensor) + return collected_losses + + @doc_controls.for_subclass_implementers + def add_loss(self, losses, inputs=None): + """Add loss tensor(s), potentially dependent on layer inputs. 
+
+        Some losses (for instance, activity regularization losses) may be
+        dependent on the inputs passed when calling a layer. Hence, when reusing
+        the same layer on different inputs `a` and `b`, some entries in
+        `layer.losses` may be dependent on `a` and some on `b`. This method
+        automatically keeps track of dependencies.
+
+        This method can be used inside a subclassed layer or model's `call`
+        function, in which case `losses` should be a Tensor or list of Tensors.
+
+        Example:
+
+        ```python
+        class MyLayer(tf.keras.layers.Layer):
+            def call(self, inputs):
+                self.add_loss(tf.abs(tf.reduce_mean(inputs)), inputs=True)
+                return inputs
+        ```
+
+        This method can also be called directly on a Functional Model during
+        construction. In this case, any loss Tensors passed to this Model must
+        be symbolic and be able to be traced back to the model's `Input`s. These
+        losses become part of the model's topology and are tracked in
+        `get_config`.
+
+        Example:
+
+        ```python
+        inputs = tf.keras.Input(shape=(10,))
+        x = tf.keras.layers.Dense(10)(inputs)
+        outputs = tf.keras.layers.Dense(1)(x)
+        model = tf.keras.Model(inputs, outputs)
+        # Activity regularization.
+        model.add_loss(tf.abs(tf.reduce_mean(x)))
+        ```
+
+        If this is not the case for your loss (if, for example, your loss
+        references a `Variable` of one of the model's layers), you can wrap your
+        loss in a zero-argument lambda. These losses are not tracked as part of
+        the model's topology since they can't be serialized.
+
+        Example:
+
+        ```python
+        inputs = tf.keras.Input(shape=(10,))
+        d = tf.keras.layers.Dense(10)
+        x = d(inputs)
+        outputs = tf.keras.layers.Dense(1)(x)
+        model = tf.keras.Model(inputs, outputs)
+        # Weight regularization.
+        model.add_loss(lambda: tf.reduce_mean(d.kernel))
+        ```
+
+        Args:
+            losses: Loss tensor, or list/tuple of tensors. Rather than tensors,
+                losses may also be zero-argument callables which create a loss
+                tensor.
+            inputs: Ignored when executing eagerly. If anything other than None is
+                passed, it signals the losses are conditional on some of the layer's
+                inputs, and thus they should only be run where these inputs are
+                available. This is the case for activity regularization losses, for
+                instance. If `None` is passed, the losses are assumed
+                to be unconditional, and will apply across all dataflows of the
+                layer (e.g. weight regularization losses).
+        """
+
+        def _tag_unconditional(loss):
+            """Process the loss and tag it by setting ._unconditional_loss."""
+            if callable(loss):
+                # We run the loss without autocasting, as regularizers are often
+                # numerically unstable in float16.
+                with autocast_variable.enable_auto_cast_variables(None):
+                    loss = loss()
+            if loss is None:
+                # Will be filtered out when computing the .losses property
+                return None
+            if not tf.is_tensor(loss):
+                loss = tf.convert_to_tensor(loss, dtype=backend.floatx())
+            loss._unconditional_loss = inputs is None
+            return loss
+
+        losses = tf.nest.flatten(losses)
+
+        callable_losses = []
+        symbolic_losses = []
+        for loss in losses:
+            if callable(loss):
+                callable_losses.append(
+                    functools.partial(_tag_unconditional, loss)
+                )
+                continue
+            if loss is None:
+                continue
+            if not tf.is_tensor(loss):
+                loss = tf.convert_to_tensor(loss, dtype=backend.floatx())
+            # TF Functions should take the eager path.
+ if ( + tf_utils.is_symbolic_tensor(loss) + and not base_layer_utils.is_in_tf_function() + ): + symbolic_losses.append(_tag_unconditional(loss)) + base_layer_utils.check_graph_consistency( + loss, method="add_loss" + ) - Args: - inputs: Tensor or list of tensors. - mask: Tensor or list of tensors. + self._callable_losses.extend(callable_losses) - Returns: - None or a tensor (or list of tensors, - one per output tensor of the layer). - """ - if not self.supports_masking: - if any(m is not None for m in tf.nest.flatten(mask)): - raise TypeError('Layer ' + self.name + ' does not support masking, ' - 'but was passed an input_mask: ' + str(mask)) - # masking not explicitly supported: return None as mask. - return None - # if masking is explicitly supported, by default - # carry over the input mask - return mask - - def __call__(self, *args, **kwargs): - """Wraps `call`, applying pre- and post-processing steps. + in_call_context = base_layer_utils.call_context().in_call - Args: - *args: Positional arguments to be passed to `self.call`. - **kwargs: Keyword arguments to be passed to `self.call`. - - Returns: - Output tensor(s). - - Note: - - The following optional keyword arguments are reserved for specific uses: - * `training`: Boolean scalar tensor of Python boolean indicating - whether the `call` is meant for training or inference. - * `mask`: Boolean input mask. - - If the layer's `call` method takes a `mask` argument (as some Keras - layers do), its default value will be set to the mask generated - for `inputs` by the previous layer (if `input` did come from - a layer that generated a corresponding mask, i.e. if it came from - a Keras layer with masking support. - - Raises: - ValueError: if the layer's `call` method returns None (an invalid value). - RuntimeError: if `super().__init__()` was not called in the constructor. - """ - self._assert_built_as_v1() - - if not hasattr(self, '_thread_local'): - raise RuntimeError( - 'You must call `super().__init__()` in the layer constructor.') - - # Grab the first positional or keyword argument. - if args: - inputs = args[0] - args = args[1:] - elif self._call_spec.arg_names[0] in kwargs: - inputs = kwargs.pop(self._call_spec.arg_names[0]) - else: - raise ValueError( - 'The first argument to `Layer.call` must always be passed.') - - call_context = base_layer_utils.call_context() - input_list = tf.nest.flatten(inputs) - - # We will attempt to build a TF graph if & only if all inputs are symbolic. - # This is always the case in graph mode. It can also be the case in eager - # mode when all inputs can be traced back to `keras.Input()` (when building - # models using the functional API). - build_graph = tf_utils.are_all_symbolic_tensors(input_list) - - # Accept NumPy and scalar inputs by converting to Tensors. - if any(isinstance(x, (np.ndarray, float, int)) for x in input_list): - def _convert_non_tensor(x): - # Don't call `ops.convert_to_tensor` on all `inputs` because - # `SparseTensors` can't be converted to `Tensor`. - if isinstance(x, (np.ndarray, float, int)): - return tf.convert_to_tensor(x) - return x - inputs = tf.nest.map_structure(_convert_non_tensor, inputs) - input_list = tf.nest.flatten(inputs) - - # Handle `mask` propagation from previous layer to current layer. Masks can - # be propagated explicitly via the `mask` argument, or implicitly via - # setting the `_keras_mask` attribute on the inputs to a Layer. Masks passed - # explicitly take priority. 
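The mask plumbing described above — an explicit `mask` kwarg that the framework fills in when a layer's `call` declares one — looks like this from the user's side. A sketch with a hypothetical `MaskedMean` layer, assuming the TF 2.x functional API:

```python
import tensorflow as tf

class MaskedMean(tf.keras.layers.Layer):
    """Mean over the time axis, ignoring masked steps."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True  # opt in to mask propagation

    def call(self, inputs, mask=None):
        # `mask` is filled in by the framework from the previous layer's
        # compute_mask output (here: Embedding with mask_zero=True).
        if mask is None:
            return tf.reduce_mean(inputs, axis=1)
        mask = tf.cast(mask, inputs.dtype)[:, :, tf.newaxis]
        return tf.reduce_sum(inputs * mask, axis=1) / tf.reduce_sum(mask, axis=1)

tokens = tf.keras.Input(shape=(None,), dtype="int32")
embedded = tf.keras.layers.Embedding(100, 8, mask_zero=True)(tokens)
pooled = MaskedMean()(embedded)   # mask arrives without being passed by hand
model = tf.keras.Model(tokens, pooled)
```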
- mask_arg_passed_by_framework = False - input_masks = self._collect_input_masks(inputs, args, kwargs) - if (self._expects_mask_arg and input_masks is not None and - not self._call_spec.arg_was_passed('mask', args, kwargs)): - mask_arg_passed_by_framework = True - kwargs['mask'] = input_masks - - # If `training` argument is None or not explicitly passed, - # propagate `training` value from this layer's calling layer. - training_value = None - training_arg_passed_by_framework = False - # Priority 1: `training` was explicitly passed. - if self._call_spec.arg_was_passed('training', args, kwargs): - training_value = self._call_spec.get_arg_value('training', args, kwargs) - if not self._expects_training_arg: - kwargs.pop('training') - - if training_value is None: - # Priority 2: `training` was passed to a parent layer. - if call_context.training is not None: - training_value = call_context.training - # Priority 3a: `learning_phase()` has been set. - elif backend.global_learning_phase_is_set(): - training_value = backend.learning_phase() - # Priority 3b: Pass the `learning_phase()` if in the Keras FuncGraph. - elif build_graph: - with backend.get_graph().as_default(): - if base_layer_utils.is_in_keras_graph(): - training_value = backend.learning_phase() - - if self._expects_training_arg and training_value is not None: - # Force the training_value to be bool type which matches to the contract - # for layer/model call args. - if tf.is_tensor(training_value): - training_value = tf.cast(training_value, tf.bool) + if in_call_context: + for symbolic_loss in symbolic_losses: + self._losses.append(symbolic_loss) else: - training_value = bool(training_value) - args, kwargs = self._call_spec.set_arg_value('training', training_value, - args, kwargs) - training_arg_passed_by_framework = True - - # Only create Keras history if at least one tensor originates from a - # `keras.Input`. Otherwise this Layer may be being used outside the Keras - # framework. - if build_graph and base_layer_utils.needs_keras_history(inputs): - base_layer_utils.create_keras_history(inputs) - - with call_context.enter(self, inputs, build_graph, training_value): - # Check input assumptions set after layer building, e.g. input shape. - if build_graph: - # Symbolic execution on symbolic tensors. We will attempt to build - # the corresponding TF subgraph inside `backend.get_graph()` - input_spec.assert_input_compatibility(self.input_spec, inputs, - self.name) - graph = backend.get_graph() - with graph.as_default(), backend.name_scope(self._name_scope()): # pylint: disable=not-callable - # Build layer if applicable (if the `build` method has been - # overridden). - self._maybe_build(inputs) - cast_inputs = self._maybe_cast_inputs(inputs) - - # Wrapping `call` function in autograph to allow for dynamic control - # flow and control dependencies in call. We are limiting this to - # subclassed layers as autograph is strictly needed only for - # subclassed layers and models. - # tf_convert will respect the value of autograph setting in the - # enclosing tf.function, if any. 
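The removed v1 `__call__` logic above resolves `training` by priority: an explicitly passed argument wins, then the value propagated from an enclosing layer's call, then the global learning phase. In user-facing terms, a sketch (the `Wrapper` layer is a hypothetical illustration, not from this diff):

```python
import tensorflow as tf

class Wrapper(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.drop = tf.keras.layers.Dropout(0.5)

    def call(self, inputs, training=None):
        # No explicit `training` is passed to the sublayer here, so it
        # inherits the value from this call's context (priority 2).
        return self.drop(inputs)

wrapper = Wrapper()
x = tf.ones((4, 8))

y = wrapper(x, training=True)   # priority 1: explicit argument
z = wrapper(x, training=False)  # dropout disabled throughout
# Priority 3 (legacy): tf.keras.backend.set_learning_phase(1) supplies
# a default when neither of the above is set.
```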
- if (base_layer_utils.is_subclassed(self) and - not base_layer_utils.from_saved_model(self)): - call_fn = tf.__internal__.autograph.tf_convert( - self.call, tf.__internal__.autograph.control_status_ctx()) - else: - call_fn = self.call - - if not self.dynamic: - try: - with autocast_variable.enable_auto_cast_variables( - self._compute_dtype_object): - outputs = call_fn(cast_inputs, *args, **kwargs) - - except tf.errors.OperatorNotAllowedInGraphError as e: - raise TypeError('You are attempting to use Python control ' - 'flow in a layer that was not declared to be ' - 'dynamic. Pass `dynamic=True` to the class ' - 'constructor.\nEncountered error:\n"""\n' + - str(e) + '\n"""') - else: - # We will use static shape inference to return symbolic tensors - # matching the specifications of the layer outputs. - # Since `self.dynamic` is True, we will never attempt to - # run the underlying TF graph (which is disconnected). - # TODO(fchollet): consider py_func as an alternative, which - # would enable us to run the underlying graph if needed. - outputs = self._symbolic_call(inputs) - - if outputs is None: - raise ValueError('A layer\'s `call` method should return a ' - 'Tensor or a list of Tensors, not None ' - '(layer: ' + self.name + ').') - if base_layer_utils.have_all_keras_metadata(inputs): - if training_arg_passed_by_framework: - args, kwargs = self._call_spec.set_arg_value( - 'training', None, args, kwargs, pop_kwarg_if_none=True) - if mask_arg_passed_by_framework: - kwargs.pop('mask') - outputs = self._set_connectivity_metadata((inputs,) + args, kwargs, - outputs) - self._handle_activity_regularization(inputs, outputs) - self._set_mask_metadata(inputs, outputs, input_masks) - if hasattr(self, '_set_inputs') and not self.inputs: - # Subclassed network: explicitly set metadata normally set by - # a call to self._set_inputs(). - # TODO(b/120997007): This should be done in Eager as well, but - # causes garbage collection issues because of the placeholders - # created on the default Keras graph. - self._set_save_spec(inputs, args, kwargs) - self._set_inputs(inputs, outputs) - else: - # Eager execution on data tensors. - with backend.name_scope(self._name_scope()): # pylint: disable=not-callable - self._maybe_build(inputs) - cast_inputs = self._maybe_cast_inputs(inputs) - with autocast_variable.enable_auto_cast_variables( - self._compute_dtype_object): - outputs = self.call(cast_inputs, *args, **kwargs) - self._handle_activity_regularization(inputs, outputs) - self._set_mask_metadata(inputs, outputs, input_masks) - - return outputs - - def _assert_built_as_v1(self): - if not hasattr(self, '_originally_built_as_v1'): - raise ValueError( - 'Your Layer or Model is in an invalid state. ' - 'This can happen for the following cases:\n ' - '1. You might be interleaving estimator/non-estimator models or ' - 'interleaving models/layers made in tf.compat.v1.Graph.as_default() ' - 'with models/layers created outside of it. ' - 'Converting a model to an estimator (via model_to_estimator) ' - 'invalidates all models/layers made before the conversion (even ' - 'if they were not the model converted to an estimator). ' - 'Similarly, making a layer or a model inside a ' - 'a tf.compat.v1.Graph invalidates all layers/models you previously ' - 'made outside of the graph.\n' - '2. You might be using a custom keras layer implementation with ' - ' custom __init__ which didn\'t call super().__init__. ' - ' Please check the implementation of %s and its bases.' 
% - (type(self),)) - - @property - def dtype(self): - return self._dtype_policy.variable_dtype - - @property - def name(self): - return self._name - - @property - def dynamic(self): - return any(layer._dynamic for layer in self._flatten_layers()) - - @property - @doc_controls.do_not_generate_docs - def stateful(self): - return any(layer._stateful for layer in self._flatten_layers()) - - @stateful.setter - def stateful(self, value): - self._stateful = value - - @property - def trainable(self): - return self._trainable - - @trainable.setter - def trainable(self, value): - self._trainable = value - for layer in getattr(self, '_self_tracked_trackables', []): - layer.trainable = value - - @property - def activity_regularizer(self): - """Optional regularizer function for the output of this layer.""" - return self._activity_regularizer - - @activity_regularizer.setter - def activity_regularizer(self, regularizer): - """Optional regularizer function for the output of this layer.""" - self._activity_regularizer = regularizer - - @property - def input_spec(self): - return self._input_spec - - @input_spec.setter - # Must be decorated to prevent tracking, since the input_spec can be nested - # InputSpec objects. - @tf.__internal__.tracking.no_automatic_dependency_tracking - def input_spec(self, value): - for v in tf.nest.flatten(value): - if v is not None and not isinstance(v, input_spec.InputSpec): - raise TypeError('Layer input_spec must be an instance of InputSpec. ' - 'Got: {}'.format(v)) - self._input_spec = value - - @property - def updates(self): - collected_updates = [] - all_layers = self._flatten_layers() - with backend.get_graph().as_default(): - for layer in all_layers: - if not layer.trainable and not layer.stateful: - continue - for u in layer._updates: - if callable(u): + for symbolic_loss in symbolic_losses: + if getattr(self, "_is_graph_network", False): + self._graph_network_add_loss(symbolic_loss) + else: + # Possible a loss was added in a Layer's `build`. + self._losses.append(symbolic_loss) + + @property + def metrics(self): + collected_metrics = [] + for layer in self._flatten_layers(): + collected_metrics.extend(layer._metrics) + return collected_metrics + + @doc_controls.for_subclass_implementers + def add_metric(self, value, aggregation=None, name=None): + """Adds metric tensor to the layer. + + Args: + value: Metric tensor. + aggregation: Sample-wise metric reduction function. If + `aggregation=None`, it indicates that the metric tensor provided has + been aggregated already. eg, `bin_acc = BinaryAccuracy(name='acc')` + followed by `model.add_metric(bin_acc(y_true, y_pred))`. If + aggregation='mean', the given metric tensor will be sample-wise + reduced using `mean` function. eg, + `model.add_metric(tf.reduce_sum(outputs), name='output_mean', + aggregation='mean')`. + name: String metric name. + + Raises: + ValueError: If `aggregation` is anything other than None or `mean`. + """ + if aggregation is not None and aggregation != "mean": + raise ValueError( + "We currently support only `mean` sample-wise metric " + "aggregation. You provided aggregation=`%s`" % aggregation + ) + + from_metric_obj = hasattr(value, "_metric_obj") + is_symbolic = tf_utils.is_symbolic_tensor(value) + in_call_context = base_layer_utils.call_context().in_call + + if name is None and not from_metric_obj: + # Eg. `self.add_metric(math_ops.reduce_sum(x), aggregation='mean')` + # In eager mode, we use metric name to lookup a metric. 
Without a
+            # name, a new Mean metric wrapper will be created on every
+            # model/layer call. So, we raise an error when no name is
+            # provided. We will do the same for symbolic mode for consistency
+            # although a name will be generated if no name is provided.
+
+            # We will not raise this error in the following use case for the
+            # sake of consistency, as the name is provided in the metric
+            # constructor.
+            # mean = metrics.Mean(name='my_metric')
+            # model.add_metric(mean(outputs))
+            raise ValueError(
+                "Please provide a name for your metric like "
+                "`self.add_metric(tf.reduce_sum(inputs), "
+                "name='mean_activation', aggregation='mean')`"
+            )
+        elif from_metric_obj:
+            name = value._metric_obj.name
+
+        if in_call_context:
+            # TF Function path should take the eager path.
+            self._symbolic_add_metric(value, aggregation, name)
+        else:
+            if not is_symbolic:
+                raise ValueError(
+                    "Expected a symbolic Tensor for the metric value, "
+                    "received: " + str(value)
+                )
+
+            # Possibly a metric was added in a Layer's `build`.
+            if not getattr(self, "_is_graph_network", False):
+                with backend.get_graph().as_default():
+                    self._symbolic_add_metric(value, aggregation, name)
+                return
+
+            if from_metric_obj:
+                raise ValueError(
+                    "Using the result of calling a `Metric` object "
+                    "when calling `add_metric` on a Functional "
+                    "Model is not supported. Please pass the "
+                    "Tensor to monitor directly."
+                )
+
+            # Insert layers into the Keras Graph Network.
+            self._graph_network_add_metric(value, aggregation, name)
+
+    @doc_controls.for_subclass_implementers
+    def add_update(self, updates):
+        """Add update op(s), potentially dependent on layer inputs.
+
+        Weight updates (for instance, the updates of the moving mean and
+        variance in a BatchNormalization layer) may be dependent on the inputs
+        passed when calling a layer. Hence, when reusing the same layer on
+        different inputs `a` and `b`, some entries in `layer.updates` may be
+        dependent on `a` and some on `b`. This method automatically keeps
+        track of dependencies.
+
+        The `get_updates_for` method allows retrieving the updates relevant
+        to a specific set of inputs.
+
+        This call is ignored when eager execution is enabled (in that case,
+        variable updates are run on the fly and thus do not need to be tracked
+        for later execution).
+
+        Args:
+          updates: Update op, or list/tuple of update ops, or zero-arg
+            callable that returns an update op. A zero-arg callable should be
+            passed in order to disable running the updates by setting
+            `trainable=False` on this Layer, when executing in Eager mode.
+        """
+        call_context = base_layer_utils.call_context()
+
+        if (
+            tf.distribute.has_strategy()
+            and tf.distribute.in_cross_replica_context()
+            # When saving the model, the distribution strategy context should
+            # be ignored, following the default path for adding updates.
+            and not call_context.saving
+        ):
+            # Updates don't need to be run in a cross-replica context.
+            return
+
+        updates = generic_utils.to_list(updates)
+
+        if call_context.in_call:
+            relevant_inputs = call_context.inputs
+        else:
+            inbound_nodes = getattr(self, "_inbound_nodes", [])
+            relevant_inputs = [node.input_tensors for node in inbound_nodes]
+
+        def process_update(x):
+            """Standardize update ops.
+
+            Args:
+              x: Tensor, op, or callable.
+
+            Returns:
+              An update op.
+ """ + if callable(x): + update = lambda: process_update(x()) + return update() + elif isinstance(x, tf.Operation): + update = x + elif hasattr(x, "op"): + update = x.op + else: + update = tf.convert_to_tensor(x) + + reachable = tf_utils.get_reachable_from_inputs( + relevant_inputs, [update] + ) + update._unconditional_update = update not in reachable + return update + + updates = [process_update(x) for x in updates] + self._updates.extend(updates) + + def set_weights(self, weights): + """Sets the weights of the layer, from Numpy arrays. + + The weights of a layer represent the state of the layer. This function + sets the weight values from numpy arrays. The weight values should be + passed in the order they are created by the layer. Note that the layer's + weights must be instantiated before calling this function by calling + the layer. + + For example, a Dense layer returns a list of two values-- per-output + weights and the bias value. These can be used to set the weights of + another Dense layer: + + >>> a = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(1.)) + >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> a.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + >>> b = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(2.)) + >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> b.get_weights() + [array([[2.], + [2.], + [2.]], dtype=float32), array([0.], dtype=float32)] + >>> b.set_weights(a.get_weights()) + >>> b.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + + Args: + weights: a list of Numpy arrays. The number + of arrays and their shape must match + number of the dimensions of the weights + of the layer (i.e. it should match the + output of `get_weights`). + + Raises: + ValueError: If the provided weights list does not match the + layer's specifications. + """ + params = self.weights + + expected_num_weights = 0 + for param in params: + if isinstance(param, base_layer_utils.TrackableWeightHandler): + expected_num_weights += param.num_tensors + else: + expected_num_weights += 1 + + if expected_num_weights != len(weights): + raise ValueError( + 'You called `set_weights(weights)` on layer "%s" ' + "with a weight list of length %s, but the layer was " + "expecting %s weights. Provided weights: %s..." + % ( + self.name, + len(weights), + expected_num_weights, + str(weights)[:50], + ) + ) + + weight_index = 0 + weight_value_tuples = [] + for param in params: + if isinstance(param, base_layer_utils.TrackableWeightHandler): + num_tensors = param.num_tensors + tensors = weights[weight_index : weight_index + num_tensors] + param.set_weights(tensors) + weight_index += num_tensors + else: + weight = weights[weight_index] + weight_shape = weight.shape if hasattr(weight, "shape") else () + ref_shape = param.shape + if not ref_shape.is_compatible_with(weight_shape): + raise ValueError( + "Layer weight shape %s not compatible with provided " + "weight shape %s" % (ref_shape, weight_shape) + ) + weight_value_tuples.append((param, weight)) + weight_index += 1 + + backend.batch_set_value(weight_value_tuples) + + def get_weights(self): + """Returns the current weights of the layer. + + The weights of a layer represent the state of the layer. This function + returns both trainable and non-trainable weight values associated with + this layer as a list of Numpy arrays, which can in turn be used to load + state into similarly parameterized layers. 
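Read together, `add_metric` and `add_update` above are typically exercised from a subclassed layer's `call`. A sketch under the v1 contract documented above (raw tensors need `aggregation='mean'`, and eager execution ignores `add_update`); the `Probe` layer is a hypothetical illustration, not part of this patch:

```python
import tensorflow as tf

class Probe(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.calls = self.add_weight(
            "calls", shape=(), trainable=False, initializer="zeros"
        )

    def call(self, inputs):
        # Raw tensor + aggregation='mean': Keras wraps it in a Mean metric.
        self.add_metric(
            tf.reduce_sum(inputs), name="input_sum", aggregation="mean"
        )
        # Zero-arg callable, so the framework can skip the update when
        # trainable=False; in eager mode the call is ignored entirely.
        self.add_update(lambda: self.calls.assign_add(1.0))
        return inputs

probe = Probe()
_ = probe(tf.ones((2, 3)))
print([m.name for m in probe.metrics])  # ['input_sum']
```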
+ + For example, a Dense layer returns a list of two values-- per-output + weights and the bias value. These can be used to set the weights of + another Dense layer: + + >>> a = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(1.)) + >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> a.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + >>> b = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(2.)) + >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> b.get_weights() + [array([[2.], + [2.], + [2.]], dtype=float32), array([0.], dtype=float32)] + >>> b.set_weights(a.get_weights()) + >>> b.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + + Returns: + Weights values as a list of numpy arrays. + """ + weights = self.weights + output_weights = [] + for weight in weights: + if isinstance(weight, base_layer_utils.TrackableWeightHandler): + output_weights.extend(weight.get_tensors()) + else: + output_weights.append(weight) + return backend.batch_get_value(output_weights) + + def get_updates_for(self, inputs): + """Retrieves updates relevant to a specific set of inputs. + + Args: + inputs: Input tensor or list/tuple of input tensors. + + Returns: + List of update ops of the layer that depend on `inputs`. + """ + if inputs is None: + # Requesting unconditional updates. + return [u for u in self.updates if u._unconditional_update] + + # Requesting input-conditional updates. + updates = [u for u in self.updates if not u._unconditional_update] + inputs = tf.nest.flatten(inputs) + reachable = tf_utils.get_reachable_from_inputs(inputs, updates) + return [u for u in updates if u in reachable] + + def get_losses_for(self, inputs): + """Retrieves losses relevant to a specific set of inputs. + + Args: + inputs: Input tensor or list/tuple of input tensors. + + Returns: + List of loss tensors of the layer that depend on `inputs`. + """ + if inputs is None: + # Requesting unconditional losses. + return [l for l in self.losses if l._unconditional_loss] + + # Requesting input-conditional losses. + losses = [l for l in self.losses if not l._unconditional_loss] + inputs = tf.nest.flatten(inputs) + reachable = tf_utils.get_reachable_from_inputs(inputs, losses) + return [l for l in losses if l in reachable] + + def get_input_mask_at(self, node_index): + """Retrieves the input mask tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. + + Returns: + A mask tensor + (or list of tensors if the layer has multiple inputs). + """ + inputs = self.get_input_at(node_index) + if isinstance(inputs, list): + return [getattr(x, "_keras_mask", None) for x in inputs] + else: + return getattr(inputs, "_keras_mask", None) + + def get_output_mask_at(self, node_index): + """Retrieves the output mask tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. + + Returns: + A mask tensor + (or list of tensors if the layer has multiple outputs). 
+ """ + output = self.get_output_at(node_index) + if isinstance(output, list): + return [getattr(x, "_keras_mask", None) for x in output] + else: + return getattr(output, "_keras_mask", None) + + @property + def input_mask(self): + """Retrieves the input mask tensor(s) of a layer. + + Only applicable if the layer has exactly one inbound node, + i.e. if it is connected to one incoming layer. + + Returns: + Input mask tensor (potentially None) or list of input + mask tensors. + + Raises: + AttributeError: if the layer is connected to + more than one incoming layers. + """ + inputs = self.input + if isinstance(inputs, list): + return [getattr(x, "_keras_mask", None) for x in inputs] + else: + return getattr(inputs, "_keras_mask", None) + + @property + def output_mask(self): + """Retrieves the output mask tensor(s) of a layer. + + Only applicable if the layer has exactly one inbound node, + i.e. if it is connected to one incoming layer. + + Returns: + Output mask tensor (potentially None) or list of output + mask tensors. + + Raises: + AttributeError: if the layer is connected to + more than one incoming layers. + """ + output = self.output + if isinstance(output, list): + return [getattr(x, "_keras_mask", None) for x in output] + else: + return getattr(output, "_keras_mask", None) + + def get_input_shape_at(self, node_index): + """Retrieves the input shape(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. + + Returns: + A shape tuple + (or list of shape tuples if the layer has multiple inputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "input_shapes", "input shape" + ) + + def get_output_shape_at(self, node_index): + """Retrieves the output shape(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first time the layer was called. + + Returns: + A shape tuple + (or list of shape tuples if the layer has multiple outputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "output_shapes", "output shape" + ) + + def get_input_at(self, node_index): + """Retrieves the input tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first input node of the layer. + + Returns: + A tensor (or list of tensors if the layer has multiple inputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "input_tensors", "input" + ) + + def get_output_at(self, node_index): + """Retrieves the output tensor(s) of a layer at a given node. + + Args: + node_index: Integer, index of the node + from which to retrieve the attribute. + E.g. `node_index=0` will correspond to the + first output node of the layer. + + Returns: + A tensor (or list of tensors if the layer has multiple outputs). + + Raises: + RuntimeError: If called in Eager mode. + """ + return self._get_node_attribute_at_index( + node_index, "output_tensors", "output" + ) + + @property + def input(self): + """Retrieves the input tensor(s) of a layer. + + Only applicable if the layer has exactly one input, + i.e. if it is connected to one incoming layer. 
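The node-indexed getters above are keyed to call order: each functional-API call of a layer creates one inbound node. A sketch, assuming the standard functional API (illustrative only):

```python
import tensorflow as tf

shared = tf.keras.layers.Dense(4)
a = tf.keras.Input(shape=(8,))
b = tf.keras.Input(shape=(8,))
ya = shared(a)  # creates inbound node 0
yb = shared(b)  # creates inbound node 1

print(shared.get_input_shape_at(0))   # (None, 8)
print(shared.get_output_shape_at(1))  # (None, 4)
# The single-node conveniences (`shared.input`, `shared.input_shape`,
# `shared.input_mask`, ...) are only well-defined while the layer has
# exactly one inbound node; with two nodes, use the *_at variants.
```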
+ + Returns: + Input tensor or list of input tensors. + + Raises: + RuntimeError: If called in Eager mode. + AttributeError: If no inbound nodes are found. + """ + if not self._inbound_nodes: + raise AttributeError( + "Layer " + self.name + " is not connected, no input to return." + ) + return self._get_node_attribute_at_index(0, "input_tensors", "input") + + @property + def output(self): + """Retrieves the output tensor(s) of a layer. + + Only applicable if the layer has exactly one output, + i.e. if it is connected to one incoming layer. + + Returns: + Output tensor or list of output tensors. + + Raises: + AttributeError: if the layer is connected to more than one incoming + layers. + RuntimeError: if called in Eager mode. + """ + if not self._inbound_nodes: + raise AttributeError( + "Layer " + self.name + " has no inbound nodes." + ) + return self._get_node_attribute_at_index(0, "output_tensors", "output") + + @property + def input_shape(self): + """Retrieves the input shape(s) of a layer. + + Only applicable if the layer has exactly one input, + i.e. if it is connected to one incoming layer, or if all inputs + have the same shape. + + Returns: + Input shape, as an integer shape tuple + (or list of shape tuples, one tuple per input tensor). + + Raises: + AttributeError: if the layer has no defined input_shape. + RuntimeError: if called in Eager mode. + """ + if not self._inbound_nodes: + raise AttributeError( + f'The layer "{self.name}" has never been called ' + "and thus has no defined input shape. Note that the " + "`input_shape` property is only available for " + "Functional and Sequential models." + ) + all_input_shapes = set( + [str(node.input_shapes) for node in self._inbound_nodes] + ) + if len(all_input_shapes) == 1: + return self._inbound_nodes[0].input_shapes + else: + raise AttributeError( + 'The layer "' + str(self.name) + " has multiple inbound nodes, " + "with different input shapes. Hence " + 'the notion of "input shape" is ' + "ill-defined for the layer. " + "Use `get_input_shape_at(node_index)` " + "instead." + ) + + def count_params(self): + """Count the total number of scalars composing the weights. + + Returns: + An integer count. + + Raises: + ValueError: if the layer isn't yet built + (in which case its weights aren't yet defined). + """ + if not self.built: + if getattr(self, "_is_graph_network", False): + with tf_utils.maybe_init_scope(self): + self._maybe_build(self.inputs) + else: + raise ValueError( + "You tried to call `count_params` on " + + self.name + + ", but the layer isn't built. " + "You can build it manually via: `" + + self.name + + ".build(batch_input_shape)`." + ) + return layer_utils.count_params(self.weights) + + @property + def output_shape(self): + """Retrieves the output shape(s) of a layer. + + Only applicable if the layer has one output, + or if all outputs have the same shape. + + Returns: + Output shape, as an integer shape tuple + (or list of shape tuples, one tuple per output tensor). + + Raises: + AttributeError: if the layer has no defined output shape. + RuntimeError: if called in Eager mode. + """ + if not self._inbound_nodes: + raise AttributeError( + "The layer has never been called " + "and thus has no defined output shape." + ) + all_output_shapes = set( + [str(node.output_shapes) for node in self._inbound_nodes] + ) + if len(all_output_shapes) == 1: + return self._inbound_nodes[0].output_shapes + else: + raise AttributeError( + 'The layer "%s"' + " has multiple inbound nodes, " + "with different output shapes. 
Hence " + 'the notion of "output shape" is ' + "ill-defined for the layer. " + "Use `get_output_shape_at(node_index)` " + "instead." % self.name + ) + + @property + @doc_controls.do_not_doc_inheritable + def inbound_nodes(self): + """Deprecated, do NOT use! Only for external Keras compatibility .""" + return self._inbound_nodes + + @property + @doc_controls.do_not_doc_inheritable + def outbound_nodes(self): + """Deprecated, do NOT use! Only for external Keras compatibility .""" + return self._outbound_nodes + + ########################################################################### + # Methods & attributes below are public aliases of other methods. # + ########################################################################### + + @property + def variables(self): + """Returns the list of all layer variables/weights. + + Alias of `self.weights`. + + Returns: + A list of variables. + """ + return self.weights + + @property + def trainable_variables(self): + return self.trainable_weights + + @property + def non_trainable_variables(self): + return self.non_trainable_weights + + ############################################################################ + # Methods & attributes below are all private and only used by the framework. + ############################################################################ + + @property + def _inbound_nodes(self): + return self._inbound_nodes_value + + @_inbound_nodes.setter + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _inbound_nodes(self, value): + self._inbound_nodes_value = value + + @property + def _outbound_nodes(self): + return self._outbound_nodes_value + + @_outbound_nodes.setter + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _outbound_nodes(self, value): + self._outbound_nodes_value = value + + def _set_dtype_policy(self, dtype): + """Sets self._dtype_policy.""" + if isinstance(dtype, policy.Policy): + self._dtype_policy = dtype + elif isinstance(dtype, dict): + self._dtype_policy = policy.deserialize(dtype) + elif isinstance(dtype, str) and dtype in ( + "mixed_float16", + "mixed_bfloat16", + ): + # The isinstance check is required since np.dtype raises an error if + # compared to a non-dtype string. + self._dtype_policy = policy.Policy(dtype) + elif dtype: + self._dtype_policy = policy.Policy(tf.as_dtype(dtype).name) + else: + self._dtype_policy = policy.global_policy() + if ( + self._dtype_policy.name == "mixed_float16" + and not loss_scale_optimizer.strategy_supports_loss_scaling() + ): + # Although only loss scaling doesn't support certain strategies, to + # avoid confusion, we disallow the 'mixed_float16' policy with + # unsupported strategies. This is because 'mixed_float16' requires + # loss scaling for numeric stability. + strategy = tf.distribute.get_strategy() + raise ValueError( + "Mixed precision is not supported with the " + "tf.distribute.Strategy: %s. Either stop using mixed " + 'precision by removing the use of the "%s" policy or ' + "use a different Strategy, e.g. a MirroredStrategy." + % (strategy.__class__.__name__, self._dtype_policy.name) + ) + + # Performance optimization: cache the compute dtype as a Dtype object or + # None, so that str to Dtype conversion doesn't happen in + # Layer.__call__. + if self._dtype_policy.compute_dtype: + self._compute_dtype_object = tf.as_dtype( + self._dtype_policy.compute_dtype + ) + else: + self._compute_dtype_object = None + + # TODO(reedwm): Expose this property? + @property + def _compute_dtype(self): + """The layer's compute dtype. 
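The coercions in `_set_dtype_policy` above mean a string dtype, a serialized dict, or a `Policy` object all resolve to a dtype policy. A sketch, assuming the public `tf.keras.mixed_precision` API (TF 2.4+):

```python
import tensorflow as tf

# String dtype or Policy object: both end up as a dtype policy.
d64 = tf.keras.layers.Dense(4, dtype="float64")
mp = tf.keras.layers.Dense(
    4, dtype=tf.keras.mixed_precision.Policy("mixed_float16")
)

print(d64.dtype)  # 'float64' (variable dtype == compute dtype)
# Under mixed_float16, variables stay float32 for numeric stability
# while computation runs in float16:
print(mp.dtype, mp.compute_dtype)  # 'float32' 'float16'
```

This float32-variable/float16-compute split is also why the code above rejects `mixed_float16` under distribution strategies that cannot do loss scaling.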
+
+        Unless mixed-precision is used, this is the same as `Layer.dtype`.
+
+        If `self._autocast` is True, layers will cast floating-point inputs
+        to this.
+
+        Returns:
+          The layer's compute dtype.
+        """
+        return self._dtype_policy.compute_dtype
+
+    def _maybe_cast_inputs(self, inputs):
+        """Maybe casts the inputs to the compute dtype.
+
+        If `self._compute_dtype` is floating-point, and `self._autocast` is
+        True, floating-point inputs are cast to `self._compute_dtype`.
+
+        Args:
+          inputs: Input tensor, or structure of input tensors.
+
+        Returns:
+          `inputs`, but tensors may have been cast to `self._compute_dtype`.
+        """
+        compute_dtype = self._compute_dtype
+        if (
+            self._autocast
+            and compute_dtype
+            and tf.as_dtype(compute_dtype).is_floating
+        ):
+
+            def f(x):
+                """Cast a single Tensor or TensorSpec to the compute dtype."""
+                cast_types = (tf.Tensor, tf.SparseTensor, tf.RaggedTensor)
+                if (
+                    isinstance(x, cast_types)
+                    and x.dtype.is_floating
+                    and x.dtype.base_dtype.name != compute_dtype
+                ):
+                    return tf.cast(x, compute_dtype)
+                elif isinstance(x, tf.TensorSpec) and x.dtype.is_floating:
+                    # Inputs may be TensorSpecs when this function is called
+                    # from model._set_inputs.
+                    return tf.TensorSpec(x.shape, compute_dtype, x.name)
+                else:
+                    return x
+
+            return tf.nest.map_structure(f, inputs)
+        else:
+            return inputs
+
+    # _dtype used to be an attribute set in the constructor. We still expose
+    # it because some clients still use it.
+    # TODO(reedwm): Deprecate, then remove the _dtype property.
+    @property
+    def _dtype(self):
+        # This is equivalent to returning self.dtype. We do not return
+        # self.dtype as it would cause infinite recursion in a few subclasses,
+        # which override "dtype" to return self._dtype.
+        return self._dtype_policy.variable_dtype
+
+    @_dtype.setter
+    def _dtype(self, value):
+        value = tf.as_dtype(value).name
+        self._set_dtype_policy(policy.Policy(value))
+
+    def _name_scope(self):
+        return self.name
+
+    def _init_set_name(self, name, zero_based=True):
+        if not name:
+            self._name = backend.unique_object_name(
+                generic_utils.to_snake_case(self.__class__.__name__),
+                zero_based=zero_based,
+            )
+        else:
+            self._name = name
+
+    def _get_existing_metric(self, name=None):
+        match = [m for m in self._metrics if m.name == name]
+        if not match:
+            return
+        if len(match) > 1:
+            raise ValueError(
+                "Please provide different names for the metrics you have "
+                'added. We found {} metrics with the name: "{}"'.format(
+                    len(match), name
+                )
+            )
+        return match[0]
+
+    def _symbolic_add_metric(self, value, aggregation=None, name=None):
+        base_layer_utils.check_graph_consistency(value, method="add_metric")
+        match = self._get_existing_metric(name)
+        if aggregation is None:
+            # Iterate over the metrics and check if the given metric exists
+            # already. This can happen when a metric instance is created in
+            # subclassed model layer `__init__` and we have tracked that
+            # instance already in model.__setattr__.
+            if match:
+                result_tensor = value
+                metric_obj = match
+            elif hasattr(value, "_metric_obj"):
+                # We track the instance using the metadata on the result
+                # tensor.
+                result_tensor = value
+                metric_obj = result_tensor._metric_obj
+                self._metrics.append(metric_obj)
+            else:
+                raise ValueError(
+                    "We do not support adding an aggregated metric result "
+                    "tensor that is not the output of a "
+                    "`tf.keras.metrics.Metric` metric instance. Without "
+                    "having access to the metric instance we cannot reset the "
+                    "state of a metric after every epoch during training. 
You " + "can create a `tf.keras.metrics.Metric` instance and pass " + "the result here or pass an un-aggregated result with " + "`aggregation` parameter set as `mean`. For example: " + "`self.add_metric(tf.reduce_sum(inputs), " + "name='mean_activation', aggregation='mean')` " + ) + else: + # If a non-aggregated tensor is given as input (ie. `aggregation` is + # explicitly set to `mean`), we wrap the tensor in `Mean` metric. + if match: + result_tensor = match(value) + metric_obj = match + else: + metric_obj, result_tensor = base_layer_utils.create_mean_metric( + value, name + ) + self._metrics.append(metric_obj) + + def _handle_weight_regularization(self, name, variable, regularizer): + """Create lambdas which compute regularization losses.""" + + def _loss_for_variable(v): + """Creates a regularization loss `Tensor` for variable `v`.""" + with backend.name_scope(name + "/Regularizer"): + regularization = regularizer(v) + return regularization + + if base_layer_utils.is_split_variable(variable): + for v in variable: + self.add_loss(functools.partial(_loss_for_variable, v)) + else: + self.add_loss(functools.partial(_loss_for_variable, variable)) + + def _handle_activity_regularization(self, inputs, outputs): + # Apply activity regularization. + # Note that it should be applied every time the layer creates a new + # output, since it is output-specific. + if self._activity_regularizer: + output_list = tf.nest.flatten(outputs) + with backend.name_scope("ActivityRegularizer"): + for output in output_list: + activity_loss = tf.convert_to_tensor( + self._activity_regularizer(output) + ) + batch_size = tf.cast( + tf.compat.v1.shape(output)[0], activity_loss.dtype + ) + # Make activity regularization strength batch-agnostic. + mean_activity_loss = activity_loss / batch_size + base_layer_utils.check_graph_consistency( + mean_activity_loss, method="activity_regularizer" + ) + self.add_loss(mean_activity_loss, inputs=inputs) + + def _set_mask_metadata(self, inputs, outputs, previous_mask): + flat_outputs = tf.nest.flatten(outputs) + + mask_already_computed = getattr( + self, "_compute_output_and_mask_jointly", False + ) or all( + getattr(x, "_keras_mask", None) is not None for x in flat_outputs + ) + + # Only compute the mask if the Layer explicitly supports masking or has + # overridden `compute_mask`. + should_compute_mask = hasattr(self, "compute_mask") and ( + self.supports_masking + or not getattr(self.compute_mask, "_is_default", False) + ) + + if mask_already_computed: + flat_masks = [getattr(x, "_keras_mask", None) for x in flat_outputs] + elif not should_compute_mask: + flat_masks = [None for _ in flat_outputs] + else: + output_masks = self.compute_mask(inputs, previous_mask) + # `compute_mask` can return a single `None` even when a Layer + # has multiple outputs. + if output_masks is None: + flat_masks = [None for _ in flat_outputs] + else: + flat_masks = tf.nest.flatten(output_masks) + + for output, mask in zip(flat_outputs, flat_masks): try: - u = u() - except ValueError as e: - if 'InaccessibleTensorError' in type(e).__name__: - # For one specific case of error we try to raise - # a more meaningful error message about the graph if we can. - # This error is an internal TF symbol that is not - # publicly exposed, so we check the name directly rather - # than using a direct import. - base_layer_utils.check_graph_consistency( - method='add_update', force_raise=True) - raise # check_graph_consistency may not always raise. 
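Note how `_handle_activity_regularization` above divides the regularizer's output by the batch size, making the recorded loss batch-size-agnostic. A small numeric sketch (the `activity_loss` helper is hypothetical, for illustration only):

```python
import tensorflow as tf

def activity_loss(batch_size):
    layer = tf.keras.layers.Dense(
        1,
        kernel_initializer="ones",
        activity_regularizer=tf.keras.regularizers.l2(1.0),
    )
    layer(tf.ones((batch_size, 3)))
    # One activity-regularization loss was recorded for this call.
    return float(layer.losses[0])

# Each sample's output is 3., so the raw l2 penalty is batch_size * 9.;
# the division by batch size makes both calls report 9.0.
print(activity_loss(2), activity_loss(8))
```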
- base_layer_utils.check_graph_consistency(u, method='add_update') - collected_updates.append(u) - return collected_updates - - @property - def losses(self): - """Losses which are associated with this `Layer`. - - Variable regularization tensors are created when this property is accessed, - so it is eager safe: accessing `losses` under a `tf.GradientTape` will - propagate gradients back to the corresponding variables. - - Returns: - A list of tensors. - """ - collected_losses = [] - all_layers = self._flatten_layers() - for layer in all_layers: - # If any eager losses are present, we assume the model to be part of an - # eager training loop (either a custom one or the one used when - # `run_eagerly=True`) and so we always return just the eager losses. - collected_losses.extend(layer._losses) - for regularizer in layer._callable_losses: - loss_tensor = regularizer() - if loss_tensor is not None: - collected_losses.append(loss_tensor) - return collected_losses - - @doc_controls.for_subclass_implementers - def add_loss(self, losses, inputs=None): - """Add loss tensor(s), potentially dependent on layer inputs. - - Some losses (for instance, activity regularization losses) may be dependent - on the inputs passed when calling a layer. Hence, when reusing the same - layer on different inputs `a` and `b`, some entries in `layer.losses` may - be dependent on `a` and some on `b`. This method automatically keeps track - of dependencies. - - This method can be used inside a subclassed layer or model's `call` - function, in which case `losses` should be a Tensor or list of Tensors. - - Example: - - ```python - class MyLayer(tf.keras.layers.Layer): - def call(inputs, self): - self.add_loss(tf.abs(tf.reduce_mean(inputs)), inputs=True) - return inputs - ``` - - This method can also be called directly on a Functional Model during - construction. In this case, any loss Tensors passed to this Model must - be symbolic and be able to be traced back to the model's `Input`s. These - losses become part of the model's topology and are tracked in `get_config`. - - Example: - - ```python - inputs = tf.keras.Input(shape=(10,)) - x = tf.keras.layers.Dense(10)(inputs) - outputs = tf.keras.layers.Dense(1)(x) - model = tf.keras.Model(inputs, outputs) - # Activity regularization. - model.add_loss(tf.abs(tf.reduce_mean(x))) - ``` - - If this is not the case for your loss (if, for example, your loss references - a `Variable` of one of the model's layers), you can wrap your loss in a - zero-argument lambda. These losses are not tracked as part of the model's - topology since they can't be serialized. - - Example: - - ```python - inputs = tf.keras.Input(shape=(10,)) - x = tf.keras.layers.Dense(10)(inputs) - outputs = tf.keras.layers.Dense(1)(x) - model = tf.keras.Model(inputs, outputs) - # Weight regularization. - model.add_loss(lambda: tf.reduce_mean(x.kernel)) - ``` - - Args: - losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses - may also be zero-argument callables which create a loss tensor. - inputs: Ignored when executing eagerly. If anything other than None is - passed, it signals the losses are conditional on some of the layer's - inputs, and thus they should only be run where these inputs are - available. This is the case for activity regularization losses, for - instance. If `None` is passed, the losses are assumed - to be unconditional, and will apply across all dataflows of the layer - (e.g. weight regularization losses). 
- """ - def _tag_unconditional(loss): - """Process the loss and tag it by setting loss._unconditional_loss.""" - if callable(loss): - # We run the loss without autocasting, as regularizers are often - # numerically unstable in float16. - with autocast_variable.enable_auto_cast_variables(None): - loss = loss() - if loss is None: - return None # Will be filtered out when computing the .losses property - if not tf.is_tensor(loss): - loss = tf.convert_to_tensor( - loss, dtype=backend.floatx()) - loss._unconditional_loss = (inputs is None) # pylint: disable=protected-access - return loss - - losses = tf.nest.flatten(losses) - - callable_losses = [] - symbolic_losses = [] - for loss in losses: - if callable(loss): - callable_losses.append(functools.partial(_tag_unconditional, loss)) - continue - if loss is None: - continue - if not tf.is_tensor(loss): - loss = tf.convert_to_tensor( - loss, dtype=backend.floatx()) - # TF Functions should take the eager path. - if (tf_utils.is_symbolic_tensor(loss) and - not base_layer_utils.is_in_tf_function()): - symbolic_losses.append(_tag_unconditional(loss)) - base_layer_utils.check_graph_consistency(loss, method='add_loss') - - self._callable_losses.extend(callable_losses) - - in_call_context = base_layer_utils.call_context().in_call - - if in_call_context: - for symbolic_loss in symbolic_losses: - self._losses.append(symbolic_loss) - else: - for symbolic_loss in symbolic_losses: - if getattr(self, '_is_graph_network', False): - self._graph_network_add_loss(symbolic_loss) + output._keras_mask = mask + except AttributeError: + # C Type such as np.ndarray. + pass + + if tf_utils.are_all_symbolic_tensors(flat_outputs): + for output in flat_outputs: + if getattr(output, "_keras_mask", None) is not None: + # Do not track masks for `TensorFlowOpLayer` construction. + output._keras_mask._keras_history_checked = True + + def _collect_input_masks(self, inputs, args, kwargs): + """Checks if mask argument was passed, else gathers mask from inputs.""" + if self._call_spec.arg_was_passed("mask", args, kwargs): + return self._call_spec.get_arg_value("mask", args, kwargs) + + if not self._should_compute_mask: + return None + + input_masks = tf.nest.map_structure( + lambda t: getattr(t, "_keras_mask", None), inputs + ) + if generic_utils.is_all_none(input_masks): + return None + return input_masks + + def _get_node_attribute_at_index(self, node_index, attr, attr_name): + """Private utility to retrieves an attribute (e.g. inputs) from a node. + + This is used to implement the methods: + - get_input_shape_at + - get_output_shape_at + - get_input_at + etc... + + Args: + node_index: Integer index of the node from which + to retrieve the attribute. + attr: Exact node attribute name. + attr_name: Human-readable attribute name, for error messages. + + Returns: + The layer's attribute `attr` at the node of index `node_index`. + + Raises: + RuntimeError: If the layer has no inbound nodes, or if called in + Eager mode. + ValueError: If the index provided does not match any node. + """ + if not self._inbound_nodes: + raise RuntimeError( + "The layer has never been called and thus has no defined " + + attr_name + + "." + ) + if not len(self._inbound_nodes) > node_index: + raise ValueError( + "Asked to get " + + attr_name + + " at node " + + str(node_index) + + ", but the layer has only " + + str(len(self._inbound_nodes)) + + " inbound nodes." 
+ ) + values = getattr(self._inbound_nodes[node_index], attr) + if isinstance(values, list) and len(values) == 1: + return values[0] else: - # Possible a loss was added in a Layer's `build`. - self._losses.append(symbolic_loss) - - @property - def metrics(self): - collected_metrics = [] - for layer in self._flatten_layers(): - collected_metrics.extend(layer._metrics) - return collected_metrics - - @doc_controls.for_subclass_implementers - def add_metric(self, value, aggregation=None, name=None): - """Adds metric tensor to the layer. - - Args: - value: Metric tensor. - aggregation: Sample-wise metric reduction function. If `aggregation=None`, - it indicates that the metric tensor provided has been aggregated - already. eg, `bin_acc = BinaryAccuracy(name='acc')` followed by - `model.add_metric(bin_acc(y_true, y_pred))`. If aggregation='mean', the - given metric tensor will be sample-wise reduced using `mean` function. - eg, `model.add_metric(tf.reduce_sum(outputs), name='output_mean', - aggregation='mean')`. - name: String metric name. - - Raises: - ValueError: If `aggregation` is anything other than None or `mean`. - """ - if aggregation is not None and aggregation != 'mean': - raise ValueError( - 'We currently support only `mean` sample-wise metric aggregation. ' - 'You provided aggregation=`%s`' % aggregation) - - from_metric_obj = hasattr(value, '_metric_obj') - is_symbolic = tf_utils.is_symbolic_tensor(value) - in_call_context = base_layer_utils.call_context().in_call - - if name is None and not from_metric_obj: - # Eg. `self.add_metric(math_ops.reduce_sum(x), aggregation='mean')` - # In eager mode, we use metric name to lookup a metric. Without a name, - # a new Mean metric wrapper will be created on every model/layer call. - # So, we raise an error when no name is provided. - # We will do the same for symbolic mode for consistency although a name - # will be generated if no name is provided. - - # We will not raise this error in the foll use case for the sake of - # consistency as name in provided in the metric constructor. - # mean = metrics.Mean(name='my_metric') - # model.add_metric(mean(outputs)) - raise ValueError('Please provide a name for your metric like ' - '`self.add_metric(tf.reduce_sum(inputs), ' - 'name=\'mean_activation\', aggregation=\'mean\')`') - elif from_metric_obj: - name = value._metric_obj.name - - if in_call_context: - # TF Function path should take the eager path. - self._symbolic_add_metric(value, aggregation, name) - else: - if not is_symbolic: - raise ValueError('Expected a symbolic Tensor for the metric value, ' - 'received: ' + str(value)) - - # Possible a metric was added in a Layer's `build`. - if not getattr(self, '_is_graph_network', False): - with backend.get_graph().as_default(): - self._symbolic_add_metric(value, aggregation, name) - return - - if from_metric_obj: - raise ValueError('Using the result of calling a `Metric` object ' - 'when calling `add_metric` on a Functional ' - 'Model is not supported. Please pass the ' - 'Tensor to monitor directly.') - - # Insert layers into the Keras Graph Network. - self._graph_network_add_metric(value, aggregation, name) - - @doc_controls.for_subclass_implementers - def add_update(self, updates): - """Add update op(s), potentially dependent on layer inputs. - - Weight updates (for instance, the updates of the moving mean and variance - in a BatchNormalization layer) may be dependent on the inputs passed - when calling a layer. 
Hence, when reusing the same layer on - different inputs `a` and `b`, some entries in `layer.updates` may be - dependent on `a` and some on `b`. This method automatically keeps track - of dependencies. - - The `get_updates_for` method allows to retrieve the updates relevant to a - specific set of inputs. - - This call is ignored when eager execution is enabled (in that case, variable - updates are run on the fly and thus do not need to be tracked for later - execution). - - Args: - updates: Update op, or list/tuple of update ops, or zero-arg callable - that returns an update op. A zero-arg callable should be passed in - order to disable running the updates by setting `trainable=False` - on this Layer, when executing in Eager mode. - """ - call_context = base_layer_utils.call_context() - - if (tf.distribute.has_strategy() and - tf.distribute.in_cross_replica_context() and - # When saving the model, the distribution strategy context should be - # ignored, following the default path for adding updates. - not call_context.saving): - # Updates don't need to be run in a cross-replica context. - return - - updates = generic_utils.to_list(updates) - - if call_context.in_call: - relevant_inputs = call_context.inputs - else: - inbound_nodes = getattr(self, '_inbound_nodes', []) - relevant_inputs = [node.input_tensors for node in inbound_nodes] - - def process_update(x): - """Standardize update ops. - - Args: - x: Tensor, op, or callable. - - Returns: - An update op. - """ - if callable(x): - update = lambda: process_update(x()) - return update() - elif isinstance(x, tf.Operation): - update = x - elif hasattr(x, 'op'): - update = x.op - else: - update = tf.convert_to_tensor(x) - - reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, [update]) - update._unconditional_update = update not in reachable - return update - - updates = [process_update(x) for x in updates] - self._updates.extend(updates) - - def set_weights(self, weights): - """Sets the weights of the layer, from Numpy arrays. - - The weights of a layer represent the state of the layer. This function - sets the weight values from numpy arrays. The weight values should be - passed in the order they are created by the layer. Note that the layer's - weights must be instantiated before calling this function by calling - the layer. - - For example, a Dense layer returns a list of two values-- per-output - weights and the bias value. These can be used to set the weights of another - Dense layer: - - >>> a = tf.keras.layers.Dense(1, - ... kernel_initializer=tf.constant_initializer(1.)) - >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) - >>> a.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - >>> b = tf.keras.layers.Dense(1, - ... kernel_initializer=tf.constant_initializer(2.)) - >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) - >>> b.get_weights() - [array([[2.], - [2.], - [2.]], dtype=float32), array([0.], dtype=float32)] - >>> b.set_weights(a.get_weights()) - >>> b.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - - Args: - weights: a list of Numpy arrays. The number - of arrays and their shape must match - number of the dimensions of the weights - of the layer (i.e. it should match the - output of `get_weights`). - - Raises: - ValueError: If the provided weights list does not match the - layer's specifications. 
- """ - params = self.weights - - expected_num_weights = 0 - for param in params: - if isinstance(param, base_layer_utils.TrackableWeightHandler): - expected_num_weights += param.num_tensors - else: - expected_num_weights += 1 - - if expected_num_weights != len(weights): - raise ValueError( - 'You called `set_weights(weights)` on layer "%s" ' - 'with a weight list of length %s, but the layer was ' - 'expecting %s weights. Provided weights: %s...' % - (self.name, len(weights), expected_num_weights, str(weights)[:50])) - - weight_index = 0 - weight_value_tuples = [] - for param in params: - if isinstance(param, base_layer_utils.TrackableWeightHandler): - num_tensors = param.num_tensors - tensors = weights[weight_index:weight_index + num_tensors] - param.set_weights(tensors) - weight_index += num_tensors - else: - weight = weights[weight_index] - weight_shape = weight.shape if hasattr(weight, 'shape') else () - ref_shape = param.shape - if not ref_shape.is_compatible_with(weight_shape): - raise ValueError( - 'Layer weight shape %s not compatible with provided weight ' - 'shape %s' % (ref_shape, weight_shape)) - weight_value_tuples.append((param, weight)) - weight_index += 1 - - backend.batch_set_value(weight_value_tuples) - - def get_weights(self): - """Returns the current weights of the layer. - - The weights of a layer represent the state of the layer. This function - returns both trainable and non-trainable weight values associated with this - layer as a list of Numpy arrays, which can in turn be used to load state - into similarly parameterized layers. - - For example, a Dense layer returns a list of two values-- per-output - weights and the bias value. These can be used to set the weights of another - Dense layer: - - >>> a = tf.keras.layers.Dense(1, - ... kernel_initializer=tf.constant_initializer(1.)) - >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) - >>> a.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - >>> b = tf.keras.layers.Dense(1, - ... kernel_initializer=tf.constant_initializer(2.)) - >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) - >>> b.get_weights() - [array([[2.], - [2.], - [2.]], dtype=float32), array([0.], dtype=float32)] - >>> b.set_weights(a.get_weights()) - >>> b.get_weights() - [array([[1.], - [1.], - [1.]], dtype=float32), array([0.], dtype=float32)] - - Returns: - Weights values as a list of numpy arrays. - """ - weights = self.weights - output_weights = [] - for weight in weights: - if isinstance(weight, base_layer_utils.TrackableWeightHandler): - output_weights.extend(weight.get_tensors()) - else: - output_weights.append(weight) - return backend.batch_get_value(output_weights) - - def get_updates_for(self, inputs): - """Retrieves updates relevant to a specific set of inputs. - - Args: - inputs: Input tensor or list/tuple of input tensors. - - Returns: - List of update ops of the layer that depend on `inputs`. - """ - if inputs is None: - # Requesting unconditional updates. - return [u for u in self.updates if u._unconditional_update] - - # Requesting input-conditional updates. - updates = [u for u in self.updates if not u._unconditional_update] - inputs = tf.nest.flatten(inputs) - reachable = tf_utils.get_reachable_from_inputs(inputs, updates) - return [u for u in updates if u in reachable] - - def get_losses_for(self, inputs): - """Retrieves losses relevant to a specific set of inputs. - - Args: - inputs: Input tensor or list/tuple of input tensors. 
- - Returns: - List of loss tensors of the layer that depend on `inputs`. - """ - if inputs is None: - # Requesting unconditional losses. - return [l for l in self.losses if l._unconditional_loss] - - # Requesting input-conditional losses. - losses = [l for l in self.losses if not l._unconditional_loss] - inputs = tf.nest.flatten(inputs) - reachable = tf_utils.get_reachable_from_inputs(inputs, losses) - return [l for l in losses if l in reachable] - - def get_input_mask_at(self, node_index): - """Retrieves the input mask tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. - - Returns: - A mask tensor - (or list of tensors if the layer has multiple inputs). - """ - inputs = self.get_input_at(node_index) - if isinstance(inputs, list): - return [getattr(x, '_keras_mask', None) for x in inputs] - else: - return getattr(inputs, '_keras_mask', None) - - def get_output_mask_at(self, node_index): - """Retrieves the output mask tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. - - Returns: - A mask tensor - (or list of tensors if the layer has multiple outputs). - """ - output = self.get_output_at(node_index) - if isinstance(output, list): - return [getattr(x, '_keras_mask', None) for x in output] - else: - return getattr(output, '_keras_mask', None) - - @property - def input_mask(self): - """Retrieves the input mask tensor(s) of a layer. - - Only applicable if the layer has exactly one inbound node, - i.e. if it is connected to one incoming layer. - - Returns: - Input mask tensor (potentially None) or list of input - mask tensors. - - Raises: - AttributeError: if the layer is connected to - more than one incoming layers. - """ - inputs = self.input - if isinstance(inputs, list): - return [getattr(x, '_keras_mask', None) for x in inputs] - else: - return getattr(inputs, '_keras_mask', None) - - @property - def output_mask(self): - """Retrieves the output mask tensor(s) of a layer. - - Only applicable if the layer has exactly one inbound node, - i.e. if it is connected to one incoming layer. - - Returns: - Output mask tensor (potentially None) or list of output - mask tensors. - - Raises: - AttributeError: if the layer is connected to - more than one incoming layers. - """ - output = self.output - if isinstance(output, list): - return [getattr(x, '_keras_mask', None) for x in output] - else: - return getattr(output, '_keras_mask', None) - - def get_input_shape_at(self, node_index): - """Retrieves the input shape(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. - - Returns: - A shape tuple - (or list of shape tuples if the layer has multiple inputs). - - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'input_shapes', - 'input shape') - - def get_output_shape_at(self, node_index): - """Retrieves the output shape(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first time the layer was called. 
- - Returns: - A shape tuple - (or list of shape tuples if the layer has multiple outputs). - - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'output_shapes', - 'output shape') - - def get_input_at(self, node_index): - """Retrieves the input tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first input node of the layer. - - Returns: - A tensor (or list of tensors if the layer has multiple inputs). - - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'input_tensors', - 'input') - - def get_output_at(self, node_index): - """Retrieves the output tensor(s) of a layer at a given node. - - Args: - node_index: Integer, index of the node - from which to retrieve the attribute. - E.g. `node_index=0` will correspond to the - first output node of the layer. - - Returns: - A tensor (or list of tensors if the layer has multiple outputs). - - Raises: - RuntimeError: If called in Eager mode. - """ - return self._get_node_attribute_at_index(node_index, 'output_tensors', - 'output') - - @property - def input(self): - """Retrieves the input tensor(s) of a layer. - - Only applicable if the layer has exactly one input, - i.e. if it is connected to one incoming layer. - - Returns: - Input tensor or list of input tensors. - - Raises: - RuntimeError: If called in Eager mode. - AttributeError: If no inbound nodes are found. - """ - if not self._inbound_nodes: - raise AttributeError('Layer ' + self.name + - ' is not connected, no input to return.') - return self._get_node_attribute_at_index(0, 'input_tensors', 'input') - - @property - def output(self): - """Retrieves the output tensor(s) of a layer. - - Only applicable if the layer has exactly one output, - i.e. if it is connected to one incoming layer. - - Returns: - Output tensor or list of output tensors. - - Raises: - AttributeError: if the layer is connected to more than one incoming - layers. - RuntimeError: if called in Eager mode. - """ - if not self._inbound_nodes: - raise AttributeError('Layer ' + self.name + ' has no inbound nodes.') - return self._get_node_attribute_at_index(0, 'output_tensors', 'output') - - @property - def input_shape(self): - """Retrieves the input shape(s) of a layer. - - Only applicable if the layer has exactly one input, - i.e. if it is connected to one incoming layer, or if all inputs - have the same shape. - - Returns: - Input shape, as an integer shape tuple - (or list of shape tuples, one tuple per input tensor). - - Raises: - AttributeError: if the layer has no defined input_shape. - RuntimeError: if called in Eager mode. - """ - if not self._inbound_nodes: - raise AttributeError(f'The layer "{self.name}" has never been called ' - 'and thus has no defined input shape. Note that the ' - '`input_shape` property is only available for ' - 'Functional and Sequential models.') - all_input_shapes = set( - [str(node.input_shapes) for node in self._inbound_nodes]) - if len(all_input_shapes) == 1: - return self._inbound_nodes[0].input_shapes - else: - raise AttributeError('The layer "' + str(self.name) + - ' has multiple inbound nodes, ' - 'with different input shapes. Hence ' - 'the notion of "input shape" is ' - 'ill-defined for the layer. ' - 'Use `get_input_shape_at(node_index)` ' - 'instead.') - - def count_params(self): - """Count the total number of scalars composing the weights. 
- - Returns: - An integer count. - - Raises: - ValueError: if the layer isn't yet built - (in which case its weights aren't yet defined). - """ - if not self.built: - if getattr(self, '_is_graph_network', False): - with tf_utils.maybe_init_scope(self): - self._maybe_build(self.inputs) - else: - raise ValueError('You tried to call `count_params` on ' + self.name + - ', but the layer isn\'t built. ' - 'You can build it manually via: `' + self.name + - '.build(batch_input_shape)`.') - return layer_utils.count_params(self.weights) - - @property - def output_shape(self): - """Retrieves the output shape(s) of a layer. - - Only applicable if the layer has one output, - or if all outputs have the same shape. - - Returns: - Output shape, as an integer shape tuple - (or list of shape tuples, one tuple per output tensor). - - Raises: - AttributeError: if the layer has no defined output shape. - RuntimeError: if called in Eager mode. - """ - if not self._inbound_nodes: - raise AttributeError('The layer has never been called ' - 'and thus has no defined output shape.') - all_output_shapes = set( - [str(node.output_shapes) for node in self._inbound_nodes]) - if len(all_output_shapes) == 1: - return self._inbound_nodes[0].output_shapes - else: - raise AttributeError('The layer "%s"' - ' has multiple inbound nodes, ' - 'with different output shapes. Hence ' - 'the notion of "output shape" is ' - 'ill-defined for the layer. ' - 'Use `get_output_shape_at(node_index)` ' - 'instead.' % self.name) - - @property - @doc_controls.do_not_doc_inheritable - def inbound_nodes(self): - """Deprecated, do NOT use! Only for compatibility with external Keras.""" - return self._inbound_nodes - - @property - @doc_controls.do_not_doc_inheritable - def outbound_nodes(self): - """Deprecated, do NOT use! Only for compatibility with external Keras.""" - return self._outbound_nodes - - ############################################################################## - # Methods & attributes below are public aliases of other methods. # - ############################################################################## - - @property - def variables(self): - """Returns the list of all layer variables/weights. - - Alias of `self.weights`. - - Returns: - A list of variables. - """ - return self.weights - - @property - def trainable_variables(self): - return self.trainable_weights - - @property - def non_trainable_variables(self): - return self.non_trainable_weights - - ############################################################################## - # Methods & attributes below are all private and only used by the framework. 
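To make the weight accessors above concrete, here is a minimal standalone sketch (assuming TensorFlow 2.x; the layer and shapes are illustrative, not part of this patch):

import tensorflow as tf

layer = tf.keras.layers.Dense(4)
layer.build((None, 3))  # creates a (3, 4) kernel and a (4,) bias

# `variables` is a documented alias of `weights`.
assert layer.variables == layer.weights

# `count_params()` sums scalar elements across all weights: 3*4 + 4 = 16.
assert layer.count_params() == 16

# `get_weights()`/`set_weights()` round-trip layer state as NumPy arrays.
state = layer.get_weights()
layer.set_weights(state)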
# - ############################################################################## - - @property - def _inbound_nodes(self): - return self._inbound_nodes_value - - @_inbound_nodes.setter - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _inbound_nodes(self, value): - self._inbound_nodes_value = value - - @property - def _outbound_nodes(self): - return self._outbound_nodes_value - - @_outbound_nodes.setter - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _outbound_nodes(self, value): - self._outbound_nodes_value = value - - def _set_dtype_policy(self, dtype): - """Sets self._dtype_policy.""" - if isinstance(dtype, policy.Policy): - self._dtype_policy = dtype - elif isinstance(dtype, dict): - self._dtype_policy = policy.deserialize(dtype) - elif isinstance(dtype, str) and dtype in ('mixed_float16', - 'mixed_bfloat16'): - # The isinstance check is required since np.dtype raises an error if - # compared to a non-dtype string. - self._dtype_policy = policy.Policy(dtype) - elif dtype: - self._dtype_policy = policy.Policy(tf.as_dtype(dtype).name) - else: - self._dtype_policy = policy.global_policy() - if (self._dtype_policy.name == 'mixed_float16' and - not loss_scale_optimizer.strategy_supports_loss_scaling()): - # Although only loss scaling doesn't support certain strategies, to avoid - # confusion, we disallow the 'mixed_float16' policy with unsupported - # strategies. This is because 'mixed_float16' requires loss scaling for - # numeric stability. - strategy = tf.distribute.get_strategy() - raise ValueError('Mixed precision is not supported with the ' - 'tf.distribute.Strategy: %s. Either stop using mixed ' - 'precision by removing the use of the "%s" policy or ' - 'use a different Strategy, e.g. a MirroredStrategy.' % - (strategy.__class__.__name__, self._dtype_policy.name)) - - # Performance optimization: cache the compute dtype as a Dtype object or - # None, so that str to Dtype conversion doesn't happen in Layer.__call__. - if self._dtype_policy.compute_dtype: - self._compute_dtype_object = tf.as_dtype( - self._dtype_policy.compute_dtype) - else: - self._compute_dtype_object = None - - # TODO(reedwm): Expose this property? - @property - def _compute_dtype(self): - """The layer's compute dtype. - - Unless mixed-precision is used, this is the same as `Layer.dtype`. - - If self._autocast is True, layer's will cast floating-point inputs to this. - - Returns: - The layer's compute dtype. - """ - return self._dtype_policy.compute_dtype - - def _maybe_cast_inputs(self, inputs): - """Maybe casts the inputs to the compute dtype. - - If self._compute_dtype is floating-point, and self_autocast is True, - floating-point inputs are casted to self._compute_dtype. - - Args: - inputs: Input tensor, or structure of input tensors. - - Returns: - `inputs`, but tensors may have been casted to self._compute_dtype - """ - compute_dtype = self._compute_dtype - if (self._autocast and compute_dtype and - tf.as_dtype(compute_dtype).is_floating): - def f(x): - """Cast a single Tensor or TensorSpec to the compute dtype.""" - cast_types = (tf.Tensor, tf.SparseTensor, - tf.RaggedTensor) - if (isinstance(x, cast_types) and x.dtype.is_floating and - x.dtype.base_dtype.name != compute_dtype): - return tf.cast(x, compute_dtype) - elif isinstance(x, tf.TensorSpec) and x.dtype.is_floating: - # Inputs may be TensorSpecs when this function is called from - # model._set_inputs. 
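As a sketch of the dtype-policy behavior that `_set_dtype_policy` and `_maybe_cast_inputs` implement (observable effects only; assumes TF 2.x with the mixed-precision API available):

import tensorflow as tf

# Under a mixed policy, variables are kept in float32 while computation
# (and autocasting of floating-point inputs) uses float16.
layer = tf.keras.layers.Dense(2, dtype="mixed_float16")
print(layer.dtype_policy.compute_dtype)   # float16
print(layer.dtype_policy.variable_dtype)  # float32

y = layer(tf.ones((1, 3)))
print(y.dtype)  # float16: float inputs were cast to the compute dtype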
- return tf.TensorSpec(x.shape, compute_dtype, x.name) + return values + + def _maybe_build(self, inputs): + # Check input assumptions set before layer building, e.g. input rank. + if not self.built: + input_spec.assert_input_compatibility( + self.input_spec, inputs, self.name + ) + input_list = tf.nest.flatten(inputs) + if input_list and self._dtype_policy.compute_dtype is None: + try: + dtype = input_list[0].dtype.base_dtype.name + except AttributeError: + pass + else: + self._set_dtype_policy(policy.Policy(dtype)) + input_shapes = None + if all(hasattr(x, "shape") for x in input_list): + input_shapes = tf.nest.map_structure(lambda x: x.shape, inputs) + # Only call `build` if the user has manually overridden the build + # method. + if not hasattr(self.build, "_is_default"): + # Any setup work performed only once should happen in an + # `init_scope` to avoid creating symbolic Tensors that will + # later pollute any eager operations. + with tf_utils.maybe_init_scope(self): + self.build(input_shapes) + # We must set also ensure that the layer is marked as built, and the + # build shape is stored since user defined build functions may not + # be calling `super.build()` + Layer.build(self, input_shapes) + + # Optionally load weight values specified at layer instantiation. + if self._initial_weights is not None: + self.set_weights(self._initial_weights) + self._initial_weights = None + + def _symbolic_call(self, inputs): + input_shapes = tf.nest.map_structure(lambda x: x.shape, inputs) + output_shapes = self.compute_output_shape(input_shapes) + + def _make_placeholder_like(shape): + ph = backend.placeholder(shape=shape, dtype=self.dtype) + ph._keras_mask = None + return ph + + return tf.nest.map_structure(_make_placeholder_like, output_shapes) + + def _get_trainable_state(self): + """Get the `trainable` state of each sublayer. + + Returns: + A dict mapping all sublayers to their `trainable` value. + """ + layers = self._flatten_layers(include_self=False, recursive=False) + trainable_state = {self: self.trainable} + for l in layers: + trainable_state.update(l._get_trainable_state()) + return trainable_state + + def _set_trainable_state(self, trainable_state): + """Set `trainable` state for each sublayer.""" + if self in trainable_state: + self.trainable = trainable_state[self] + layers = self._flatten_layers(include_self=False, recursive=False) + for l in layers: + if l in trainable_state: + l._set_trainable_state(trainable_state) + + @property + def _obj_reference_counts(self): + """A dict counting the number of attributes referencing an object.""" + self._maybe_create_attribute( + "_obj_reference_counts_dict", + object_identity.ObjectIdentityDictionary(), + ) + return self._obj_reference_counts_dict + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _maybe_create_attribute(self, name, default_value): + """Create attribute (with the default value) if it hasn't been created. + + This is useful for fields that is used for tracking purpose, + _trainable_weights, or _layers. Note that user could create a layer + subclass and assign an internal field before invoking the + Layer.__init__(), the __setattr__() need to create the tracking fields + and __init__() need to not override them. + + Args: + name: String, the name of the attribute. + default_value: Object, the default value of the attribute. 
+ """ + if not hasattr(self, name): + self.__setattr__(name, default_value) + + def __delattr__(self, name): + # For any super.__delattr__() call, we will directly use the + # implementation in Trackable and skip the behavior in AutoTrackable. + # The Layer was originally use Trackable as base class, the change of + # using Module as base class forced us to have AutoTrackable in the + # class hierarchy. + # + # TODO(b/180760306) Keeping the status quo of skipping _delattr__ and + # __setattr__ in AutoTrackable may be unsustainable. + existing_value = getattr(self, name, None) + + # If this value is replacing an existing object assigned to an + # attribute, we should clean it out to avoid leaking memory. First we + # check if there are other attributes referencing it. + reference_counts = self._obj_reference_counts + if existing_value not in reference_counts: + super(tf.__internal__.tracking.AutoTrackable, self).__delattr__( + name + ) + return + + reference_count = reference_counts[existing_value] + if reference_count > 1: + # There are other remaining references. We can't remove this object + # from _layers etc. + reference_counts[existing_value] = reference_count - 1 + super(tf.__internal__.tracking.AutoTrackable, self).__delattr__( + name + ) + return else: - return x - return tf.nest.map_structure(f, inputs) - else: - return inputs - - # _dtype used to be an attribute set in the constructor. We still expose it - # because some clients still use it. - # TODO(reedwm): Deprecate, then remove the _dtype property. - @property - def _dtype(self): - # This is equivalent to returning self.dtype . We do not return self.dtype - # as it would cause infinite recursion in a few subclasses, which override - # "dtype" to return self._dtype. - return self._dtype_policy.variable_dtype - - @_dtype.setter - def _dtype(self, value): - value = tf.as_dtype(value).name - self._set_dtype_policy(policy.Policy(value)) - - def _name_scope(self): # pylint: disable=method-hidden - return self.name - - def _init_set_name(self, name, zero_based=True): - if not name: - self._name = backend.unique_object_name( - generic_utils.to_snake_case(self.__class__.__name__), - zero_based=zero_based) - else: - self._name = name - - def _get_existing_metric(self, name=None): - match = [m for m in self._metrics if m.name == name] - if not match: - return - if len(match) > 1: - raise ValueError( - 'Please provide different names for the metrics you have added. ' - 'We found {} metrics with the name: "{}"'.format(len(match), name)) - return match[0] - - def _symbolic_add_metric(self, value, aggregation=None, name=None): - base_layer_utils.check_graph_consistency(value, method='add_metric') - match = self._get_existing_metric(name) - if aggregation is None: - # Iterate over the metrics and check if the given metric exists already. - # This can happen when a metric instance is created in subclassed model - # layer `__init__` and we have tracked that instance already in - # model.__setattr__. - if match: - result_tensor = value - metric_obj = match - elif hasattr(value, '_metric_obj'): - # We track the instance using the metadata on the result tensor. - result_tensor = value - metric_obj = result_tensor._metric_obj - self._metrics.append(metric_obj) - else: - raise ValueError( - 'We do not support adding an aggregated metric result tensor that ' - 'is not the output of a `tf.keras.metrics.Metric` metric instance. 
' - 'Without having access to the metric instance we cannot reset the ' - 'state of a metric after every epoch during training. You can ' - 'create a `tf.keras.metrics.Metric` instance and pass the result ' - 'here or pass an un-aggregated result with `aggregation` parameter ' - 'set as `mean`. For example: `self.add_metric(tf.reduce_sum(inputs)' - ', name=\'mean_activation\', aggregation=\'mean\')`') - else: - # If a non-aggregated tensor is given as input (ie. `aggregation` is - # explicitly set to `mean`), we wrap the tensor in `Mean` metric. - if match: - result_tensor = match(value) - metric_obj = match - else: - metric_obj, result_tensor = base_layer_utils.create_mean_metric( - value, name) - self._metrics.append(metric_obj) - - def _handle_weight_regularization(self, name, variable, regularizer): - """Create lambdas which compute regularization losses.""" - - def _loss_for_variable(v): - """Creates a regularization loss `Tensor` for variable `v`.""" - with backend.name_scope(name + '/Regularizer'): - regularization = regularizer(v) - return regularization - - if base_layer_utils.is_split_variable(variable): - for v in variable: - self.add_loss(functools.partial(_loss_for_variable, v)) - else: - self.add_loss(functools.partial(_loss_for_variable, variable)) - - def _handle_activity_regularization(self, inputs, outputs): - # Apply activity regularization. - # Note that it should be applied every time the layer creates a new - # output, since it is output-specific. - if self._activity_regularizer: - output_list = tf.nest.flatten(outputs) - with backend.name_scope('ActivityRegularizer'): - for output in output_list: - activity_loss = tf.convert_to_tensor( - self._activity_regularizer(output)) - batch_size = tf.cast( - tf.compat.v1.shape(output)[0], activity_loss.dtype) - # Make activity regularization strength batch-agnostic. - mean_activity_loss = activity_loss / batch_size - base_layer_utils.check_graph_consistency( - mean_activity_loss, method='activity_regularizer') - self.add_loss(mean_activity_loss, inputs=inputs) - - def _set_mask_metadata(self, inputs, outputs, previous_mask): - flat_outputs = tf.nest.flatten(outputs) - - mask_already_computed = ( - getattr(self, '_compute_output_and_mask_jointly', False) or - all(getattr(x, '_keras_mask', None) is not None for x in flat_outputs)) - - # Only compute the mask if the Layer explicitly supports masking or has - # overridden `compute_mask`. - should_compute_mask = ( - hasattr(self, 'compute_mask') and - (self.supports_masking or - not getattr(self.compute_mask, '_is_default', False))) - - if mask_already_computed: - flat_masks = [getattr(x, '_keras_mask', None) for x in flat_outputs] - elif not should_compute_mask: - flat_masks = [None for _ in flat_outputs] - else: - output_masks = self.compute_mask(inputs, previous_mask) - # `compute_mask` can return a single `None` even when a Layer - # has multiple outputs. - if output_masks is None: - flat_masks = [None for _ in flat_outputs] - else: - flat_masks = tf.nest.flatten(output_masks) - - for output, mask in zip(flat_outputs, flat_masks): - try: - output._keras_mask = mask - except AttributeError: - # C Type such as np.ndarray. - pass - - if tf_utils.are_all_symbolic_tensors(flat_outputs): - for output in flat_outputs: - if getattr(output, '_keras_mask', None) is not None: - # Do not track masks for `TensorFlowOpLayer` construction. 
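The mask bookkeeping above is easiest to see from the outside. A small sketch (Embedding is simply a convenient mask-producing layer, not something this diff touches):

import tensorflow as tf

# Layers attach masks to their outputs as `_keras_mask`; downstream layers
# pick them up again via `_collect_input_masks`.
emb = tf.keras.layers.Embedding(input_dim=10, output_dim=4, mask_zero=True)
out = emb(tf.constant([[1, 2, 0, 0]]))
print(out._keras_mask.numpy())  # [[ True  True False False]]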
- output._keras_mask._keras_history_checked = True - - def _collect_input_masks(self, inputs, args, kwargs): - """Checks if `mask` argument was passed, else gathers mask from inputs.""" - if self._call_spec.arg_was_passed('mask', args, kwargs): - return self._call_spec.get_arg_value('mask', args, kwargs) - - if not self._should_compute_mask: - return None - - input_masks = tf.nest.map_structure( - lambda t: getattr(t, '_keras_mask', None), inputs) - if generic_utils.is_all_none(input_masks): - return None - return input_masks - - def _get_node_attribute_at_index(self, node_index, attr, attr_name): - """Private utility to retrieves an attribute (e.g. inputs) from a node. - - This is used to implement the methods: - - get_input_shape_at - - get_output_shape_at - - get_input_at - etc... - - Args: - node_index: Integer index of the node from which - to retrieve the attribute. - attr: Exact node attribute name. - attr_name: Human-readable attribute name, for error messages. - - Returns: - The layer's attribute `attr` at the node of index `node_index`. - - Raises: - RuntimeError: If the layer has no inbound nodes, or if called in Eager - mode. - ValueError: If the index provided does not match any node. - """ - if not self._inbound_nodes: - raise RuntimeError('The layer has never been called ' - 'and thus has no defined ' + attr_name + '.') - if not len(self._inbound_nodes) > node_index: - raise ValueError('Asked to get ' + attr_name + ' at node ' + - str(node_index) + ', but the layer has only ' + - str(len(self._inbound_nodes)) + ' inbound nodes.') - values = getattr(self._inbound_nodes[node_index], attr) - if isinstance(values, list) and len(values) == 1: - return values[0] - else: - return values - - def _maybe_build(self, inputs): - # Check input assumptions set before layer building, e.g. input rank. - if not self.built: - input_spec.assert_input_compatibility( - self.input_spec, inputs, self.name) - input_list = tf.nest.flatten(inputs) - if input_list and self._dtype_policy.compute_dtype is None: + # This is the last remaining reference. + del reference_counts[existing_value] + + super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) + + if isinstance(existing_value, Layer) or base_layer_utils.has_weights( + existing_value + ): + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + "_self_tracked_trackables", + [ + l + for l in self._self_tracked_trackables + if l is not existing_value + ], + ) + if isinstance(existing_value, tf.Variable): + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + "_trainable_weights", + [w for w in self._trainable_weights if w is not existing_value], + ) + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + "_non_trainable_weights", + [ + w + for w in self._non_trainable_weights + if w is not existing_value + ], + ) + + def __setattr__(self, name, value): + if ( + name == "_self_setattr_tracking" + or not getattr(self, "_self_setattr_tracking", True) + # Exclude @property.setters from tracking + or hasattr(self.__class__, name) + ): + try: + super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( + name, value + ) + except AttributeError: + raise AttributeError( + ( + 'Can\'t set the attribute "{}", likely because it ' + "conflicts with an existing read-only @property of the " + "object. Please choose a different name." + ).format(name) + ) + return + + # Keep track of trackable objects, for the needs of + # `Network.save_weights`. 
+ value = tf.__internal__.tracking.sticky_attribute_assignment( + trackable=self, value=value, name=name + ) + + reference_counts = self._obj_reference_counts + reference_counts[value] = reference_counts.get(value, 0) + 1 + + # Clean out the old attribute, which clears _layers and + # _trainable_weights if necessary. try: - dtype = input_list[0].dtype.base_dtype.name + self.__delattr__(name) except AttributeError: - pass - else: - self._set_dtype_policy(policy.Policy(dtype)) - input_shapes = None - if all(hasattr(x, 'shape') for x in input_list): - input_shapes = tf.nest.map_structure(lambda x: x.shape, inputs) - # Only call `build` if the user has manually overridden the build method. - if not hasattr(self.build, '_is_default'): - # Any setup work performed only once should happen in an `init_scope` - # to avoid creating symbolic Tensors that will later pollute any eager - # operations. - with tf_utils.maybe_init_scope(self): - self.build(input_shapes) - # We must set also ensure that the layer is marked as built, and the build - # shape is stored since user defined build functions may not be calling - # `super.build()` - Layer.build(self, input_shapes) - - # Optionally load weight values specified at layer instantiation. - if self._initial_weights is not None: - self.set_weights(self._initial_weights) - self._initial_weights = None - - def _symbolic_call(self, inputs): - input_shapes = tf.nest.map_structure(lambda x: x.shape, inputs) - output_shapes = self.compute_output_shape(input_shapes) - - def _make_placeholder_like(shape): - ph = backend.placeholder(shape=shape, dtype=self.dtype) - ph._keras_mask = None - return ph - - return tf.nest.map_structure(_make_placeholder_like, output_shapes) - - def _get_trainable_state(self): - """Get the `trainable` state of each sublayer. - - Returns: - A dict mapping all sublayers to their `trainable` value. - """ - layers = self._flatten_layers(include_self=False, recursive=False) - trainable_state = {self: self.trainable} - for l in layers: - trainable_state.update(l._get_trainable_state()) - return trainable_state - - def _set_trainable_state(self, trainable_state): - """Set `trainable` state for each sublayer.""" - if self in trainable_state: - self.trainable = trainable_state[self] - layers = self._flatten_layers(include_self=False, recursive=False) - for l in layers: - if l in trainable_state: - l._set_trainable_state(trainable_state) - - @property - def _obj_reference_counts(self): - """A dictionary counting the number of attributes referencing an object.""" - self._maybe_create_attribute('_obj_reference_counts_dict', - object_identity.ObjectIdentityDictionary()) - return self._obj_reference_counts_dict - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _maybe_create_attribute(self, name, default_value): - """Create the attribute with the default value if it hasn't been created. - - This is useful for fields that is used for tracking purpose, - _trainable_weights, or _layers. Note that user could create a layer subclass - and assign an internal field before invoking the Layer.__init__(), the - __setattr__() need to create the tracking fields and __init__() need to not - override them. - - Args: - name: String, the name of the attribute. - default_value: Object, the default value of the attribute. 
- """ - if not hasattr(self, name): - self.__setattr__(name, default_value) - - def __delattr__(self, name): - # For any super.__delattr__() call, we will directly use the implementation - # in Trackable and skip the behavior in AutoTrackable. The Layer was - # originally use Trackable as base class, the change of using Module as base - # class forced us to have AutoTrackable in the class hierarchy. - # - # TODO(b/180760306) Keeping the status quo of skipping _delattr__ and - # __setattr__ in AutoTrackable may be unsustainable. - existing_value = getattr(self, name, None) - - # If this value is replacing an existing object assigned to an attribute, we - # should clean it out to avoid leaking memory. First we check if there are - # other attributes referencing it. - reference_counts = self._obj_reference_counts - if existing_value not in reference_counts: - super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) # pylint: disable=bad-super-call - return - - reference_count = reference_counts[existing_value] - if reference_count > 1: - # There are other remaining references. We can't remove this object from - # _layers etc. - reference_counts[existing_value] = reference_count - 1 - super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) # pylint: disable=bad-super-call - return - else: - # This is the last remaining reference. - del reference_counts[existing_value] - - super(tf.__internal__.tracking.AutoTrackable, self).__delattr__(name) # pylint: disable=bad-super-call - - if (isinstance(existing_value, Layer) - or base_layer_utils.has_weights(existing_value)): - super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( # pylint: disable=bad-super-call - '_self_tracked_trackables', - [l for l in self._self_tracked_trackables if l is not existing_value]) - if isinstance(existing_value, tf.Variable): - super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( # pylint: disable=bad-super-call - '_trainable_weights', - [w for w in self._trainable_weights if w is not existing_value]) - super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( # pylint: disable=bad-super-call - '_non_trainable_weights', - [w for w in self._non_trainable_weights if w is not existing_value]) - - def __setattr__(self, name, value): - if (name == '_self_setattr_tracking' or - not getattr(self, '_self_setattr_tracking', True) or - # Exclude @property.setters from tracking - hasattr(self.__class__, name)): - try: + pass + + # Keep track of metric instance created in subclassed layer. + from keras import metrics as metrics_module + + for val in tf.nest.flatten(value): + if isinstance(val, metrics_module.Metric) and hasattr( + self, "_metrics" + ): + self._metrics.append(val) + + # TODO(scottzhu): Need to track Module object as well for weight + # tracking. Be careful about metric if it becomes a Module in future. + # Append value to self._layers if relevant + if getattr(self, "_auto_track_sub_layers", True) and ( + isinstance(value, Layer) or base_layer_utils.has_weights(value) + ): + self._maybe_create_attribute("_self_tracked_trackables", []) + # We need to check object identity to avoid de-duplicating empty + # container types which compare equal. + if not any( + (layer is value for layer in self._self_tracked_trackables) + ): + self._self_tracked_trackables.append(value) + if hasattr(value, "_use_resource_variables"): + # Legacy layers (V1 tf.layers) must always use + # resource variables. 
+ value._use_resource_variables = True + + # Append value to list of trainable / non-trainable weights if relevant + # TODO(b/125122625): This won't pick up on any variables added to a + # list/dict after creation. + for val in tf.nest.flatten(value): + if not isinstance(val, tf.Variable): + continue + + # Users may add extra weights/variables simply by assigning them to + # attributes (invalid for graph networks) + self._maybe_create_attribute("_trainable_weights", []) + self._maybe_create_attribute("_non_trainable_weights", []) + if val.trainable: + if any(val is w for w in self._trainable_weights): + continue + self._trainable_weights.append(val) + else: + if any(val is w for w in self._non_trainable_weights): + continue + self._non_trainable_weights.append(val) + + backend.track_variable(val) + + # TODO(b/180760306) Skip the auto trackable from tf.Module to keep + # status quo. See the comment at __delattr__. super(tf.__internal__.tracking.AutoTrackable, self).__setattr__( - name, value) # pylint: disable=bad-super-call - except AttributeError: - raise AttributeError( - ('Can\'t set the attribute "{}", likely because it conflicts with ' - 'an existing read-only @property of the object. Please choose a ' - 'different name.').format(name)) - return - - # Keep track of trackable objects, for the needs of `Network.save_weights`. - value = tf.__internal__.tracking.sticky_attribute_assignment( - trackable=self, value=value, name=name) - - reference_counts = self._obj_reference_counts - reference_counts[value] = reference_counts.get(value, 0) + 1 - - # Clean out the old attribute, which clears _layers and _trainable_weights - # if necessary. - try: - self.__delattr__(name) - except AttributeError: - pass - - # Keep track of metric instance created in subclassed layer. - from keras import metrics as metrics_module # pylint: disable=g-import-not-at-top - for val in tf.nest.flatten(value): - if isinstance(val, metrics_module.Metric) and hasattr(self, '_metrics'): - self._metrics.append(val) - - # TODO(scottzhu): Need to track Module object as well for weight tracking. - # Be careful about metric if it becomes a Module in future. - # Append value to self._layers if relevant - if (getattr(self, '_auto_track_sub_layers', True) and - (isinstance(value, Layer) or base_layer_utils.has_weights(value))): - self._maybe_create_attribute('_self_tracked_trackables', []) - # We need to check object identity to avoid de-duplicating empty - # container types which compare equal. - if not any((layer is value for layer in self._self_tracked_trackables)): - self._self_tracked_trackables.append(value) - if hasattr(value, '_use_resource_variables'): - # Legacy layers (V1 tf.layers) must always use - # resource variables. - value._use_resource_variables = True - - # Append value to list of trainable / non-trainable weights if relevant - # TODO(b/125122625): This won't pick up on any variables added to a - # list/dict after creation. 
- for val in tf.nest.flatten(value): - if not isinstance(val, tf.Variable): - continue - - # Users may add extra weights/variables - # simply by assigning them to attributes (invalid for graph networks) - self._maybe_create_attribute('_trainable_weights', []) - self._maybe_create_attribute('_non_trainable_weights', []) - if val.trainable: - if any(val is w for w in self._trainable_weights): - continue - self._trainable_weights.append(val) - else: - if any(val is w for w in self._non_trainable_weights): - continue - self._non_trainable_weights.append(val) - - backend.track_variable(val) - - # TODO(b/180760306) Skip the auto trackable from tf.Module to keep status - # quo. See the comment at __delattr__. - super(tf.__internal__.tracking.AutoTrackable, self).__setattr__(name, value) # pylint: disable=bad-super-call - - # This is a hack so that the is_layer (within - # training/trackable/layer_utils.py) check doesn't get the weights attr. - # TODO(b/110718070): Remove when fixed. - def _is_layer(self): - return True - - @property - @layer_utils.cached_per_instance - def _should_compute_mask(self): - return ('mask' in self._call_spec.arg_names or - getattr(self, 'compute_mask', None) is not None) - - def _dedup_weights(self, weights): - """Dedupe weights while maintaining order as much as possible.""" - output, seen_ids = [], set() - for w in weights: - if id(w) not in seen_ids: - output.append(w) - # Track the Variable's identity to avoid __eq__ issues. - seen_ids.add(id(w)) - - return output - - # SavedModel properties. Please see keras/saving/saved_model for details. - - @property - def _trackable_saved_model_saver(self): - return layer_serialization.LayerSavedModelSaver(self) - - @property - def _object_identifier(self): - return self._trackable_saved_model_saver.object_identifier - - @property - def _tracking_metadata(self): - return self._trackable_saved_model_saver.tracking_metadata - - def _trackable_children(self, save_type='checkpoint', **kwargs): - if save_type == 'savedmodel': - cache = kwargs['cache'] - # TODO(b/213628533): This must be called before super() to ensure - # that any input shape changes are applied before getting the config of - # the model. - children = self._trackable_saved_model_saver.trackable_children(cache) - else: - children = {} - children.update(super()._trackable_children(save_type, **kwargs)) - return children - - def __getstate__(self): - # Override to support `copy.deepcopy` and pickling. - # Thread-local objects cannot be copied in Python 3, so pop these. - # Thread-local objects are used to cache losses in MirroredStrategy, and - # so shouldn't be copied. - state = self.__dict__.copy() - state.pop('_thread_local', None) - return state - - def __setstate__(self, state): - state['_thread_local'] = threading.local() - # Bypass Trackable logic as `__dict__` already contains this info. - object.__setattr__(self, '__dict__', state) + name, value + ) + + # This is a hack so that the is_layer (within + # training/trackable/layer_utils.py) check doesn't get the weights attr. + # TODO(b/110718070): Remove when fixed. 
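The net effect of this `__setattr__`/`__delattr__` machinery, in a minimal sketch (hypothetical subclass, assuming TF 2.x):

import tensorflow as tf

class Scaler(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        # Plain attribute assignment is intercepted by __setattr__, which
        # appends the variable to _trainable_weights and starts tracking it.
        self.scale = tf.Variable(2.0)

layer = Scaler()
assert len(layer.trainable_weights) == 1

# __delattr__ removes the variable from the tracked weight lists again.
del layer.scale
assert not layer.trainable_weights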
+ def _is_layer(self): + return True + + @property + @layer_utils.cached_per_instance + def _should_compute_mask(self): + return ( + "mask" in self._call_spec.arg_names + or getattr(self, "compute_mask", None) is not None + ) + + def _dedup_weights(self, weights): + """Dedupe weights while maintaining order as much as possible.""" + output, seen_ids = [], set() + for w in weights: + if id(w) not in seen_ids: + output.append(w) + # Track the Variable's identity to avoid __eq__ issues. + seen_ids.add(id(w)) + + return output + + # SavedModel properties. Please see keras/saving/saved_model for details. + + @property + def _trackable_saved_model_saver(self): + return layer_serialization.LayerSavedModelSaver(self) + + @property + def _object_identifier(self): + return self._trackable_saved_model_saver.object_identifier + + @property + def _tracking_metadata(self): + return self._trackable_saved_model_saver.tracking_metadata + + def _trackable_children(self, save_type="checkpoint", **kwargs): + if save_type == "savedmodel": + cache = kwargs["cache"] + # TODO(b/213628533): This must be called before super() to ensure + # that any input shape changes are applied before getting the config + # of the model. + children = self._trackable_saved_model_saver.trackable_children( + cache + ) + else: + children = {} + children.update(super()._trackable_children(save_type, **kwargs)) + return children + + def __getstate__(self): + # Override to support `copy.deepcopy` and pickling. + # Thread-local objects cannot be copied in Python 3, so pop these. + # Thread-local objects are used to cache losses in MirroredStrategy, and + # so shouldn't be copied. + state = self.__dict__.copy() + state.pop("_thread_local", None) + return state + + def __setstate__(self, state): + state["_thread_local"] = threading.local() + # Bypass Trackable logic as `__dict__` already contains this info. + object.__setattr__(self, "__dict__", state) diff --git a/keras/engine/base_preprocessing_layer.py b/keras/engine/base_preprocessing_layer.py index 0df5fec54506..bdd32405ee0f 100644 --- a/keras/engine/base_preprocessing_layer.py +++ b/keras/engine/base_preprocessing_layer.py @@ -16,286 +16,296 @@ import abc +import tensorflow.compat.v2 as tf + from keras.engine import data_adapter from keras.engine.base_layer import Layer from keras.utils import version_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.eager import context from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls - keras_kpl_gauge = tf.__internal__.monitoring.BoolGauge( - '/tensorflow/api/keras/layers/preprocessing', - 'keras preprocessing layers usage', 'method') + "/tensorflow/api/keras/layers/preprocessing", + "keras preprocessing layers usage", + "method", +) -@keras_export('keras.layers.experimental.preprocessing.PreprocessingLayer') +@keras_export("keras.layers.experimental.preprocessing.PreprocessingLayer") class PreprocessingLayer(Layer, metaclass=abc.ABCMeta): - """Base class for Preprocessing Layers. - - **Don't use this class directly: it's an abstract base class!** You may - be looking for one of the many built-in - [preprocessing layers](https://keras.io/guides/preprocessing_layers/) - instead. - - Preprocessing layers are layers whose state gets computed before model - training starts. They do not get updated during training. - Most preprocessing layers implement an `adapt()` method for state computation. + """Base class for Preprocessing Layers. 
- The `PreprocessingLayer` class is the base class you would subclass to - implement your own preprocessing layers. - """ - _must_restore_from_config = True + **Don't use this class directly: it's an abstract base class!** You may + be looking for one of the many built-in + [preprocessing layers](https://keras.io/guides/preprocessing_layers/) + instead. - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._is_compiled = False - self._is_adapted = False + Preprocessing layers are layers whose state gets computed before model + training starts. They do not get updated during training. Most + preprocessing layers implement an `adapt()` method for state computation. - # Sets `is_adapted=False` when `reset_state` is called. - self._reset_state_impl = self.reset_state - self.reset_state = self._reset_state_wrapper - - self._adapt_function = None - - @property - def is_adapted(self): - """Whether the layer has been fit to data already.""" - return self._is_adapted - - @doc_controls.do_not_generate_docs - def update_state(self, data): - """Accumulates statistics for the preprocessing layer. - - Arguments: - data: A mini-batch of inputs to the layer. + The `PreprocessingLayer` class is the base class you would subclass to + implement your own preprocessing layers. """ - raise NotImplementedError - - @doc_controls.do_not_generate_docs - def reset_state(self): # pylint: disable=method-hidden - """Resets the statistics of the preprocessing layer.""" - raise NotImplementedError - @doc_controls.do_not_generate_docs - def finalize_state(self): - """Finalize the statistics for the preprocessing layer. - - This method is called at the end of `adapt` or after restoring a serialized - preprocessing layer's state. This method handles any one-time operations - that should occur on the layer's state before `Layer.__call__`. - """ - pass - - @doc_controls.do_not_generate_docs - def make_adapt_function(self): - """Creates a function to execute one step of `adapt`. - - This method can be overridden to support custom adapt logic. - This method is called by `PreprocessingLayer.adapt`. - - Typically, this method directly controls `tf.function` settings, - and delegates the actual state update logic to - `PreprocessingLayer.update_state`. - - This function is cached the first time `PreprocessingLayer.adapt` - is called. The cache is cleared whenever `PreprocessingLayer.compile` - is called. - - Returns: - Function. The function created by this method should accept a - `tf.data.Iterator`, retrieve a batch, and update the state of the - layer. - """ - if self._adapt_function is not None: - return self._adapt_function - - def adapt_step(iterator): - data = next(iterator) - self._adapt_maybe_build(data) - self.update_state(data) - - if self._steps_per_execution.numpy().item() == 1: - adapt_fn = adapt_step - else: - - def adapt_fn(iterator): - for _ in tf.range(self._steps_per_execution): - adapt_step(iterator) - - if not self._run_eagerly: - adapt_fn = tf.function(adapt_fn) - - self._adapt_function = adapt_fn - return self._adapt_function - - def compile(self, run_eagerly=None, steps_per_execution=None): - """Configures the layer for `adapt`. - - Arguments: - run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s logic - will not be wrapped in a `tf.function`. Recommended to leave this as - `None` unless your `Model` cannot be run inside a `tf.function`. - steps_per_execution: Int. Defaults to 1. The number of batches to run - during each `tf.function` call. 
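A short sketch of the compile-then-adapt flow this method configures (Normalization stands in for any preprocessing layer; the numbers are arbitrary):

import numpy as np
import tensorflow as tf

norm = tf.keras.layers.Normalization(axis=None)
# Configure adapt() before running it: keep the update traced into a
# tf.function and process two batches per function call.
norm.compile(run_eagerly=False, steps_per_execution=2)
norm.adapt(np.arange(8, dtype="float32"), batch_size=2)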
Running multiple batches inside a - single `tf.function` call can greatly improve performance on TPUs or - small models with a large Python overhead. - """ - if steps_per_execution is None: - steps_per_execution = 1 - self._configure_steps_per_execution(steps_per_execution) - - if run_eagerly is None: - run_eagerly = self.dynamic - self._run_eagerly = run_eagerly - - self._is_compiled = True - - def adapt(self, data, batch_size=None, steps=None): - """Fits the state of the preprocessing layer to the data being passed. - - After calling `adapt` on a layer, a preprocessing layer's state will not - update during training. In order to make preprocessing layers efficient in - any distribution context, they are kept constant with respect to any - compiled `tf.Graph`s that call the layer. This does not affect the layer use - when adapting each layer only once, but if you adapt a layer multiple times - you will need to take care to re-compile any compiled functions as follows: - - * If you are adding a preprocessing layer to a `keras.Model`, you need to - call `model.compile` after each subsequent call to `adapt`. - * If you are calling a preprocessing layer inside `tf.data.Dataset.map`, - you should call `map` again on the input `tf.data.Dataset` after each - `adapt`. - * If you are using a `tf.function` directly which calls a preprocessing - layer, you need to call `tf.function` again on your callable after - each subsequent call to `adapt`. - - `tf.keras.Model` example with multiple adapts: - - >>> layer = tf.keras.layers.Normalization( - ... axis=None) - >>> layer.adapt([0, 2]) - >>> model = tf.keras.Sequential(layer) - >>> model.predict([0, 1, 2]) - array([-1., 0., 1.], dtype=float32) - >>> layer.adapt([-1, 1]) - >>> model.compile() # This is needed to re-compile model.predict! - >>> model.predict([0, 1, 2]) - array([0., 1., 2.], dtype=float32) - - `tf.data.Dataset` example with multiple adapts: - - >>> layer = tf.keras.layers.Normalization( - ... axis=None) - >>> layer.adapt([0, 2]) - >>> input_ds = tf.data.Dataset.range(3) - >>> normalized_ds = input_ds.map(layer) - >>> list(normalized_ds.as_numpy_iterator()) - [array([-1.], dtype=float32), - array([0.], dtype=float32), - array([1.], dtype=float32)] - >>> layer.adapt([-1, 1]) - >>> normalized_ds = input_ds.map(layer) # Re-map over the input dataset. - >>> list(normalized_ds.as_numpy_iterator()) - [array([0.], dtype=float32), - array([1.], dtype=float32), - array([2.], dtype=float32)] - - `adapt()` is meant only as a single machine utility to compute layer state. - To analyze a dataset that cannot fit on a single machine, see - [Tensorflow Transform](https://www.tensorflow.org/tfx/transform/get_started) - for a multi-machine, map-reduce solution. - - Arguments: - data: The data to train on. It can be passed either as a tf.data - Dataset, or as a numpy array. - batch_size: Integer or `None`. - Number of samples per state update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - steps: Integer or `None`. - Total number of steps (batches of samples) - When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps' is None, the epoch will run until - the input dataset is exhausted. 
When passing an infinitely - repeating dataset, you must specify the `steps` argument. This - argument is not supported with array inputs. - """ - _disallow_inside_tf_function('adapt') - if not version_utils.should_use_v2(): - raise RuntimeError('`adapt` is only supported in tensorflow v2.') # pylint: disable=g-doc-exception - if not self._is_compiled: - self.compile() # Compile with defaults. - if self.built: - self.reset_state() - data_handler = data_adapter.DataHandler( - data, - batch_size=batch_size, - steps_per_epoch=steps, - epochs=1, - steps_per_execution=self._steps_per_execution, - distribute=False) - self._adapt_function = self.make_adapt_function() - for _, iterator in data_handler.enumerate_epochs(): - with data_handler.catch_stop_iteration(): - for _ in data_handler.steps(): - self._adapt_function(iterator) - if data_handler.should_sync: - context.async_wait() - self.finalize_state() - self._is_adapted = True - - def _reset_state_wrapper(self): - """Calls `reset_state` and sets `adapted` to `False`.""" - self._reset_state_impl() - self._is_adapted = False - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _configure_steps_per_execution(self, steps_per_execution): - self._steps_per_execution = tf.Variable( - steps_per_execution, - dtype='int64', - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) - - # TODO(omalleyt): Unify this logic with `Layer._maybe_build`. - def _adapt_maybe_build(self, data): - if not self.built: - try: - # If this is a Numpy array or tensor, we can get shape from .shape. - # If not, an attribute error will be thrown. - data_shape = data.shape - data_shape_nones = tuple([None] * len(data.shape)) - except AttributeError: - # The input has an unknown number of dimensions. - data_shape = None - data_shape_nones = None - - # TODO (b/159261555): move this to base layer build. - batch_input_shape = getattr(self, '_batch_input_shape', None) - if batch_input_shape is None: - # Set the number of dimensions. - self._batch_input_shape = data_shape_nones - self.build(data_shape) - self.built = True + _must_restore_from_config = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._is_compiled = False + self._is_adapted = False + + # Sets `is_adapted=False` when `reset_state` is called. + self._reset_state_impl = self.reset_state + self.reset_state = self._reset_state_wrapper + + self._adapt_function = None + + @property + def is_adapted(self): + """Whether the layer has been fit to data already.""" + return self._is_adapted + + @doc_controls.do_not_generate_docs + def update_state(self, data): + """Accumulates statistics for the preprocessing layer. + + Arguments: + data: A mini-batch of inputs to the layer. + """ + raise NotImplementedError + + @doc_controls.do_not_generate_docs + def reset_state(self): + """Resets the statistics of the preprocessing layer.""" + raise NotImplementedError + + @doc_controls.do_not_generate_docs + def finalize_state(self): + """Finalize the statistics for the preprocessing layer. + + This method is called at the end of `adapt` or after restoring a + serialized preprocessing layer's state. This method handles any one-time + operations that should occur on the layer's state before + `Layer.__call__`. + """ + pass + + @doc_controls.do_not_generate_docs + def make_adapt_function(self): + """Creates a function to execute one step of `adapt`. + + This method can be overridden to support custom adapt logic. + This method is called by `PreprocessingLayer.adapt`. 
+ + Typically, this method directly controls `tf.function` settings, + and delegates the actual state update logic to + `PreprocessingLayer.update_state`. + + This function is cached the first time `PreprocessingLayer.adapt` + is called. The cache is cleared whenever `PreprocessingLayer.compile` + is called. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, retrieve a batch, and update the state of the + layer. + """ + if self._adapt_function is not None: + return self._adapt_function + + def adapt_step(iterator): + data = next(iterator) + self._adapt_maybe_build(data) + self.update_state(data) + + if self._steps_per_execution.numpy().item() == 1: + adapt_fn = adapt_step + else: + + def adapt_fn(iterator): + for _ in tf.range(self._steps_per_execution): + adapt_step(iterator) + + if not self._run_eagerly: + adapt_fn = tf.function(adapt_fn) + + self._adapt_function = adapt_fn + return self._adapt_function + + def compile(self, run_eagerly=None, steps_per_execution=None): + """Configures the layer for `adapt`. + + Arguments: + run_eagerly: Bool. If `True`, this `Model`'s + logic will not be wrapped in a `tf.function`. Recommended to leave + this as `None` unless your `Model` cannot be run inside a + `tf.function`. Defaults to `False`. + steps_per_execution: Int. The number of batches to run + during each `tf.function` call. Running multiple batches inside a + single `tf.function` call can greatly improve performance on TPUs or + small models with a large Python overhead. Defaults to `1`. + """ + if steps_per_execution is None: + steps_per_execution = 1 + self._configure_steps_per_execution(steps_per_execution) + + if run_eagerly is None: + run_eagerly = self.dynamic + self._run_eagerly = run_eagerly + + self._is_compiled = True + + def adapt(self, data, batch_size=None, steps=None): + """Fits the state of the preprocessing layer to the data being passed. + + After calling `adapt` on a layer, a preprocessing layer's state will not + update during training. In order to make preprocessing layers efficient + in any distribution context, they are kept constant with respect to any + compiled `tf.Graph`s that call the layer. This does not affect the layer + use when adapting each layer only once, but if you adapt a layer + multiple times you will need to take care to re-compile any compiled + functions as follows: + + * If you are adding a preprocessing layer to a `keras.Model`, you need + to call `model.compile` after each subsequent call to `adapt`. + * If you are calling a preprocessing layer inside + `tf.data.Dataset.map`, you should call `map` again on the input + `tf.data.Dataset` after each `adapt`. + * If you are using a `tf.function` directly which calls a preprocessing + layer, you need to call `tf.function` again on your callable after + each subsequent call to `adapt`. + + `tf.keras.Model` example with multiple adapts: + + >>> layer = tf.keras.layers.Normalization( + ... axis=None) + >>> layer.adapt([0, 2]) + >>> model = tf.keras.Sequential(layer) + >>> model.predict([0, 1, 2]) + array([-1., 0., 1.], dtype=float32) + >>> layer.adapt([-1, 1]) + >>> model.compile() # This is needed to re-compile model.predict! + >>> model.predict([0, 1, 2]) + array([0., 1., 2.], dtype=float32) + + `tf.data.Dataset` example with multiple adapts: + + >>> layer = tf.keras.layers.Normalization( + ... 
axis=None) + >>> layer.adapt([0, 2]) + >>> input_ds = tf.data.Dataset.range(3) + >>> normalized_ds = input_ds.map(layer) + >>> list(normalized_ds.as_numpy_iterator()) + [array([-1.], dtype=float32), + array([0.], dtype=float32), + array([1.], dtype=float32)] + >>> layer.adapt([-1, 1]) + >>> normalized_ds = input_ds.map(layer) # Re-map over the input dataset. + >>> list(normalized_ds.as_numpy_iterator()) + [array([0.], dtype=float32), + array([1.], dtype=float32), + array([2.], dtype=float32)] + + `adapt()` is meant only as a single machine utility to compute layer + state. To analyze a dataset that cannot fit on a single machine, see + [Tensorflow Transform]( + https://www.tensorflow.org/tfx/transform/get_started) + for a multi-machine, map-reduce solution. + + Arguments: + data: The data to train on. It can be passed either as a tf.data + Dataset, or as a numpy array. + batch_size: Integer or `None`. + Number of samples per state update. If unspecified, + `batch_size` will default to 32. Do not specify the + `batch_size` if your data is in the form of datasets, + generators, or `keras.utils.Sequence` instances (since they + generate batches). + steps: Integer or `None`. + Total number of steps (batches of samples) + When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps' is None, the epoch will run until + the input dataset is exhausted. When passing an infinitely + repeating dataset, you must specify the `steps` argument. This + argument is not supported with array inputs. + """ + _disallow_inside_tf_function("adapt") + if not version_utils.should_use_v2(): + raise RuntimeError("`adapt` is only supported in tensorflow v2.") + if not self._is_compiled: + self.compile() # Compile with defaults. + if self.built: + self.reset_state() + data_handler = data_adapter.DataHandler( + data, + batch_size=batch_size, + steps_per_epoch=steps, + epochs=1, + steps_per_execution=self._steps_per_execution, + distribute=False, + ) + self._adapt_function = self.make_adapt_function() + for _, iterator in data_handler.enumerate_epochs(): + with data_handler.catch_stop_iteration(): + for _ in data_handler.steps(): + self._adapt_function(iterator) + if data_handler.should_sync: + context.async_wait() + self.finalize_state() + self._is_adapted = True + + def _reset_state_wrapper(self): + """Calls `reset_state` and sets `adapted` to `False`.""" + self._reset_state_impl() + self._is_adapted = False + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _configure_steps_per_execution(self, steps_per_execution): + self._steps_per_execution = tf.Variable( + steps_per_execution, + dtype="int64", + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + ) + + # TODO(omalleyt): Unify this logic with `Layer._maybe_build`. + def _adapt_maybe_build(self, data): + if not self.built: + try: + # If this is a Numpy array or tensor, we can get shape from + # .shape. If not, an attribute error will be thrown. + data_shape = data.shape + data_shape_nones = tuple([None] * len(data.shape)) + except AttributeError: + # The input has an unknown number of dimensions. + data_shape = None + data_shape_nones = None + + # TODO (b/159261555): move this to base layer build. + batch_input_shape = getattr(self, "_batch_input_shape", None) + if batch_input_shape is None: + # Set the number of dimensions. 
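And a sketch of the `is_adapted`/`reset_state` bookkeeping handled by `_reset_state_wrapper` (again using Normalization purely for illustration):

import numpy as np
import tensorflow as tf

norm = tf.keras.layers.Normalization(axis=None)
norm.adapt(np.array([0.0, 2.0]))
print(norm.is_adapted)  # True

# reset_state is wrapped at __init__ time, so calling it also flips
# is_adapted back to False.
norm.reset_state()
print(norm.is_adapted)  # False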
+ self._batch_input_shape = data_shape_nones + self.build(data_shape) + self.built = True def _disallow_inside_tf_function(method_name): - """Disallow calling a method inside a `tf.function`.""" - if tf.inside_function(): - error_msg = ( - 'Detected a call to `PreprocessingLayer.{method_name}` inside a ' - '`tf.function`. `PreprocessingLayer.{method_name} is a high-level ' - 'endpoint that manages its own `tf.function`. Please move the call ' - 'to `PreprocessingLayer.{method_name}` outside of all enclosing ' - '`tf.function`s. Note that you can call a `PreprocessingLayer` ' - 'directly on `Tensor`s inside a `tf.function` like: `layer(x)`, ' - 'or update its state like: `layer.update_state(x)`.').format( - method_name=method_name) - raise RuntimeError(error_msg) + """Disallow calling a method inside a `tf.function`.""" + if tf.inside_function(): + error_msg = ( + "Detected a call to `PreprocessingLayer.{method_name}` inside a " + "`tf.function`. `PreprocessingLayer.{method_name}` is a high-level " + "endpoint that manages its own `tf.function`. Please move the call " + "to `PreprocessingLayer.{method_name}` outside of all enclosing " + "`tf.function`s. Note that you can call a `PreprocessingLayer` " + "directly on `Tensor`s inside a `tf.function` like: `layer(x)`, " + "or update its state like: `layer.update_state(x)`." + ).format(method_name=method_name) + raise RuntimeError(error_msg) diff --git a/keras/engine/base_preprocessing_layer_test.py b/keras/engine/base_preprocessing_layer_test.py index f065c9325d38..af4344fd5ea6 100644 --- a/keras/engine/base_preprocessing_layer_test.py +++ b/keras/engine/base_preprocessing_layer_test.py @@ -16,223 +16,235 @@ import os +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.engine import base_preprocessing_layer from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf # Define a test-only implementation of BasePreprocessingLayer to validate # its correctness directly. class AddingPreprocessingLayer(base_preprocessing_layer.PreprocessingLayer): + def build(self, input_shape): + super().build(input_shape) + self.sum = tf.Variable(0.0, dtype=tf.float32) - def build(self, input_shape): - super().build(input_shape) - self.sum = tf.Variable(0., dtype=tf.float32) + def update_state(self, data): + self.sum.assign_add(tf.reduce_sum(tf.cast(data, tf.float32))) - def update_state(self, data): - self.sum.assign_add(tf.reduce_sum(tf.cast(data, tf.float32))) + def reset_state(self): + self.sum.assign(0.0) - def reset_state(self): # pylint: disable=method-hidden - self.sum.assign(0.) + def set_total(self, sum_value): + """This is an example of how a subclass would implement a direct setter. - def set_total(self, sum_value): - """This is an example of how a subclass would implement a direct setter. + Args: + sum_value: The total to set. + """ + self.sum.assign(sum_value) - Args: - sum_value: The total to set.
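The guard above hinges on `tf.inside_function()`; a minimal sketch of the same pattern for any helper that manages its own `tf.function` (the `fit_state` name is hypothetical):

    import tensorflow as tf

    def fit_state(data):
        # Refuse to be traced into an outer tf.function: this helper is
        # assumed to create and cache its own tf.function internally.
        if tf.inside_function():
            raise RuntimeError(
                "`fit_state` manages its own `tf.function`; call it eagerly."
            )
        return tf.reduce_sum(data)

    print(fit_state(tf.constant([1.0, 2.0])))  # Eager call: fine, prints 3.0.

    @tf.function
    def traced(x):
        return fit_state(x)

    # traced(tf.constant([1.0]))  # Would raise RuntimeError at trace time.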
- """ - self.sum.assign(sum_value) - - def call(self, inputs): - return inputs + self.sum + def call(self, inputs): + return inputs + self.sum @test_combinations.run_all_keras_modes(always_skip_v1=True) class PreprocessingLayerTest(test_combinations.TestCase): - - def test_adapt_bad_input_fails(self): - """Test that non-Dataset/Numpy inputs cause a reasonable error.""" - input_dataset = {"foo": 0} - - layer = AddingPreprocessingLayer() - if tf.executing_eagerly(): - with self.assertRaisesRegex(ValueError, "Failed to find data adapter"): + def test_adapt_bad_input_fails(self): + """Test that non-Dataset/Numpy inputs cause a reasonable error.""" + input_dataset = {"foo": 0} + + layer = AddingPreprocessingLayer() + if tf.executing_eagerly(): + with self.assertRaisesRegex( + ValueError, "Failed to find data adapter" + ): + layer.adapt(input_dataset) + else: + with self.assertRaisesRegex(ValueError, "requires a"): + layer.adapt(input_dataset) + + def test_adapt_infinite_dataset_fails(self): + """Test that preproc layers fail if an infinite dataset is passed.""" + input_dataset = tf.data.Dataset.from_tensor_slices( + np.array([[1], [2], [3], [4], [5], [0]]) + ).repeat() + + layer = AddingPreprocessingLayer() + if tf.executing_eagerly(): + with self.assertRaisesRegex(ValueError, "infinite dataset"): + layer.adapt(input_dataset) + else: + with self.assertRaisesRegex( + ValueError, ".*infinite number of elements.*" + ): + layer.adapt(input_dataset) + + def test_setter_update(self): + """Test the prototyped setter method.""" + input_data = keras.Input(shape=(1,)) + layer = AddingPreprocessingLayer() + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = test_utils.should_run_eagerly() + + layer.set_total(15) + + self.assertAllEqual([[16], [17], [18]], model.predict([1.0, 2.0, 3.0])) + + def test_pre_build_adapt_update_numpy(self): + """Test that preproc layers can adapt() before build() is called.""" + input_dataset = np.array([1, 2, 3, 4, 5]) + + layer = AddingPreprocessingLayer() layer.adapt(input_dataset) - else: - with self.assertRaisesRegex(ValueError, "requires a"): - layer.adapt(input_dataset) - - def test_adapt_infinite_dataset_fails(self): - """Test that preproc layers fail if an infinite dataset is passed.""" - input_dataset = tf.data.Dataset.from_tensor_slices( - np.array([[1], [2], [3], [4], [5], [0]])).repeat() - - layer = AddingPreprocessingLayer() - if tf.executing_eagerly(): - with self.assertRaisesRegex(ValueError, "infinite dataset"): - layer.adapt(input_dataset) - else: - with self.assertRaisesRegex(ValueError, - ".*infinite number of elements.*"): - layer.adapt(input_dataset) - - def test_setter_update(self): - """Test the prototyped setter method.""" - input_data = keras.Input(shape=(1,)) - layer = AddingPreprocessingLayer() - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() - - layer.set_total(15) - - self.assertAllEqual([[16], [17], [18]], model.predict([1., 2., 3.])) - - def test_pre_build_adapt_update_numpy(self): - """Test that preproc layers can adapt() before build() is called.""" - input_dataset = np.array([1, 2, 3, 4, 5]) - - layer = AddingPreprocessingLayer() - layer.adapt(input_dataset) - - input_data = keras.Input(shape=(1,)) - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() - self.assertAllEqual([[16], [17], [18]], model.predict([1., 2., 3.])) + input_data = 
keras.Input(shape=(1,)) + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = test_utils.should_run_eagerly() - def test_post_build_adapt_update_numpy(self): - """Test that preproc layers can adapt() after build() is called.""" - input_dataset = np.array([1, 2, 3, 4, 5]) + self.assertAllEqual([[16], [17], [18]], model.predict([1.0, 2.0, 3.0])) - input_data = keras.Input(shape=(1,)) - layer = AddingPreprocessingLayer() - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() + def test_post_build_adapt_update_numpy(self): + """Test that preproc layers can adapt() after build() is called.""" + input_dataset = np.array([1, 2, 3, 4, 5]) - layer.adapt(input_dataset) + input_data = keras.Input(shape=(1,)) + layer = AddingPreprocessingLayer() + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = test_utils.should_run_eagerly() - self.assertAllEqual([[16], [17], [18]], model.predict([1., 2., 3.])) - - def test_pre_build_adapt_update_dataset(self): - """Test that preproc layers can adapt() before build() is called.""" - input_dataset = tf.data.Dataset.from_tensor_slices( - np.array([[1], [2], [3], [4], [5], [0]])) - - layer = AddingPreprocessingLayer() - layer.adapt(input_dataset) - - input_data = keras.Input(shape=(1,)) - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() - - self.assertAllEqual([[16], [17], [18]], model.predict([1., 2., 3.])) - - def test_post_build_adapt_update_dataset(self): - """Test that preproc layers can adapt() after build() is called.""" - input_dataset = tf.data.Dataset.from_tensor_slices( - np.array([[1], [2], [3], [4], [5], [0]])) - - input_data = keras.Input(shape=(1,)) - layer = AddingPreprocessingLayer() - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() - - layer.adapt(input_dataset) - - self.assertAllEqual([[16], [17], [18]], model.predict([1., 2., 3.])) - - def test_weight_based_state_transfer(self): - """Test that preproc layers can transfer state via get/set weights..""" - - def get_model(): - input_data = keras.Input(shape=(1,)) - layer = AddingPreprocessingLayer() - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() - return (model, layer) - - input_dataset = np.array([1, 2, 3, 4, 5]) - model, layer = get_model() - layer.adapt(input_dataset) - self.assertAllEqual([[16], [17], [18]], model.predict([1., 2., 3.])) - - # Create a new model and verify it has no state carryover. - weights = model.get_weights() - model_2, _ = get_model() - self.assertAllEqual([[1], [2], [3]], model_2.predict([1., 2., 3.])) + layer.adapt(input_dataset) - # Transfer state from model to model_2 via get/set weights. 
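Outside the test harness, the get/set-weights transfer used here reduces to a few lines; a sketch with `Normalization` standing in for any adaptable layer (this assumes its adapted state is exposed as weights, which is what the test relies on):

    import numpy as np
    import tensorflow as tf

    src = tf.keras.layers.Normalization(axis=None)
    src.adapt(np.array([0.0, 2.0]))       # mean=1, variance=1.

    dst = tf.keras.layers.Normalization(axis=None)
    dst.adapt(np.array([10.0, 30.0]))     # Unrelated state, just to build it.
    dst.set_weights(src.get_weights())    # Copy the adapted state across.

    print(dst(np.array([1.0])))           # [0.], identical to src's output.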
- model_2.set_weights(weights) - self.assertAllEqual([[16], [17], [18]], model_2.predict([1., 2., 3.])) + self.assertAllEqual([[16], [17], [18]], model.predict([1.0, 2.0, 3.0])) - def test_loading_without_providing_class_fails(self): - input_data = keras.Input(shape=(1,)) - layer = AddingPreprocessingLayer() - output = layer(input_data) - model = keras.Model(input_data, output) + def test_pre_build_adapt_update_dataset(self): + """Test that preproc layers can adapt() before build() is called.""" + input_dataset = tf.data.Dataset.from_tensor_slices( + np.array([[1], [2], [3], [4], [5], [0]]) + ) - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + layer = AddingPreprocessingLayer() + layer.adapt(input_dataset) - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") + input_data = keras.Input(shape=(1,)) + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = test_utils.should_run_eagerly() - with self.assertRaisesRegex(ValueError, - "Unknown layer: AddingPreprocessingLayer"): - _ = keras.models.load_model(output_path) + self.assertAllEqual([[16], [17], [18]], model.predict([1.0, 2.0, 3.0])) - def test_adapt_sets_input_shape_rank(self): - """Check that `.adapt()` sets the `input_shape`'s rank.""" - # Shape: (3,1,2) - adapt_dataset = np.array([[[1., 2.]], [[3., 4.]], [[5., 6.]]], - dtype=np.float32) + def test_post_build_adapt_update_dataset(self): + """Test that preproc layers can adapt() after build() is called.""" + input_dataset = tf.data.Dataset.from_tensor_slices( + np.array([[1], [2], [3], [4], [5], [0]]) + ) - layer = AddingPreprocessingLayer() - layer.adapt(adapt_dataset) + input_data = keras.Input(shape=(1,)) + layer = AddingPreprocessingLayer() + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = test_utils.should_run_eagerly() - input_dataset = np.array([[[1., 2.], [3., 4.]], [[3., 4.], [5., 6.]]], - dtype=np.float32) - layer(input_dataset) + layer.adapt(input_dataset) - model = keras.Sequential([layer]) - self.assertTrue(model.built) - self.assertEqual(model.input_shape, (None, None, None)) + self.assertAllEqual([[16], [17], [18]], model.predict([1.0, 2.0, 3.0])) - def test_adapt_doesnt_overwrite_input_shape(self): - """Check that `.adapt()` doesn't change the `input_shape`.""" - # Shape: (3, 1, 2) - adapt_dataset = np.array([[[1., 2.]], [[3., 4.]], [[5., 6.]]], - dtype=np.float32) + def test_weight_based_state_transfer(self): + """Test that preproc layers can transfer state via get/set weights..""" - layer = AddingPreprocessingLayer(input_shape=[1, 2]) - layer.adapt(adapt_dataset) + def get_model(): + input_data = keras.Input(shape=(1,)) + layer = AddingPreprocessingLayer() + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = test_utils.should_run_eagerly() + return (model, layer) - model = keras.Sequential([layer]) - self.assertTrue(model.built) - self.assertEqual(model.input_shape, (None, 1, 2)) + input_dataset = np.array([1, 2, 3, 4, 5]) + model, layer = get_model() + layer.adapt(input_dataset) + self.assertAllEqual([[16], [17], [18]], model.predict([1.0, 2.0, 3.0])) + + # Create a new model and verify it has no state carryover. + weights = model.get_weights() + model_2, _ = get_model() + self.assertAllEqual([[1], [2], [3]], model_2.predict([1.0, 2.0, 3.0])) + + # Transfer state from model to model_2 via get/set weights. 
+ model_2.set_weights(weights) + self.assertAllEqual( + [[16], [17], [18]], model_2.predict([1.0, 2.0, 3.0]) + ) + + def test_loading_without_providing_class_fails(self): + input_data = keras.Input(shape=(1,)) + layer = AddingPreprocessingLayer() + output = layer(input_data) + model = keras.Model(input_data, output) + + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + with self.assertRaisesRegex( + ValueError, "Unknown layer: 'AddingPreprocessingLayer'" + ): + _ = keras.models.load_model(output_path) + + def test_adapt_sets_input_shape_rank(self): + """Check that `.adapt()` sets the `input_shape`'s rank.""" + # Shape: (3,1,2) + adapt_dataset = np.array( + [[[1.0, 2.0]], [[3.0, 4.0]], [[5.0, 6.0]]], dtype=np.float32 + ) + + layer = AddingPreprocessingLayer() + layer.adapt(adapt_dataset) + + input_dataset = np.array( + [[[1.0, 2.0], [3.0, 4.0]], [[3.0, 4.0], [5.0, 6.0]]], + dtype=np.float32, + ) + layer(input_dataset) + + model = keras.Sequential([layer]) + self.assertTrue(model.built) + self.assertEqual(model.input_shape, (None, None, None)) + + def test_adapt_doesnt_overwrite_input_shape(self): + """Check that `.adapt()` doesn't change the `input_shape`.""" + # Shape: (3, 1, 2) + adapt_dataset = np.array( + [[[1.0, 2.0]], [[3.0, 4.0]], [[5.0, 6.0]]], dtype=np.float32 + ) + + layer = AddingPreprocessingLayer(input_shape=[1, 2]) + layer.adapt(adapt_dataset) + + model = keras.Sequential([layer]) + self.assertTrue(model.built) + self.assertEqual(model.input_shape, (None, 1, 2)) class PreprocessingLayerV1Test(test_combinations.TestCase): + def test_adapt_fails(self): + """Test that calling adapt leads to a runtime error.""" + input_dataset = {"foo": 0} - def test_adapt_fails(self): - """Test that calling adapt leads to a runtime error.""" - input_dataset = {"foo": 0} - - with tf.Graph().as_default(): - layer = AddingPreprocessingLayer() - with self.assertRaisesRegex(RuntimeError, - "`adapt` is only supported in tensorflow v2"): - layer.adapt(input_dataset) + with tf.Graph().as_default(): + layer = AddingPreprocessingLayer() + with self.assertRaisesRegex( + RuntimeError, "`adapt` is only supported in tensorflow v2" + ): + layer.adapt(input_dataset) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/engine/compile_utils.py b/keras/engine/compile_utils.py index 3b487e15d388..5d443654ced9 100644 --- a/keras/engine/compile_utils.py +++ b/keras/engine/compile_utils.py @@ -12,805 +12,869 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-classes-have-attributes + """Utilities for `Model.compile`.""" import copy + +import tensorflow.compat.v2 as tf + from keras import losses as losses_mod from keras import metrics as metrics_mod -from keras.saving.experimental import saving_lib +from keras.saving import saving_lib from keras.utils import generic_utils from keras.utils import losses_utils from keras.utils import tf_utils -import tensorflow.compat.v2 as tf class Container: - """Base Container class.""" + """Base Container class.""" - def __init__(self, output_names=None): - self._output_names = output_names + def __init__(self, output_names=None, mesh=None): + self._output_names = output_names + # Used by DTensor layout map use case. 
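The v1 behaviour checked in this last test comes from the `version_utils.should_use_v2()` guard in `adapt`; sketched standalone, assuming `AddingPreprocessingLayer` as defined at the top of this file:

    import tensorflow.compat.v2 as tf

    layer = AddingPreprocessingLayer()
    with tf.Graph().as_default():
        try:
            layer.adapt({"foo": 0})
        except RuntimeError as err:
            print(err)  # `adapt` is only supported in tensorflow v2.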
Can be removed after DTensor + # based distribution strategy. + self._mesh = mesh - def build(self, y_pred): - if self._output_names is None: - # In Subclass API, output names like 'output_1' are used for - # `Metric` names. - self._output_names = create_pseudo_output_names(y_pred) + def build(self, y_pred): + if self._output_names is None: + # In Subclass API, output names like 'output_1' are used for + # `Metric` names. + self._output_names = create_pseudo_output_names(y_pred) - def _conform_to_outputs(self, outputs, struct): - """Convenience method to conform `struct` to `outputs` structure. + def _conform_to_outputs(self, outputs, struct): + """Convenience method to conform `struct` to `outputs` structure. - Mappings performed: + Mappings performed: - (1) Map a dict to a list of outputs, using the output names. - (2) Fill missing keys in a dict w/ `None`s. - (3) Map a single item to all outputs. + (1) Map a dict to a list of outputs, using the output names. + (2) Fill missing keys in a dict w/ `None`s. + (3) Map a single item to all outputs. - Args: - outputs: Model predictions. - struct: Arbitrary nested structure (e.g. of labels, sample_weights, - losses, or metrics). + Args: + outputs: Model predictions. + struct: Arbitrary nested structure (e.g. of labels, sample_weights, + losses, or metrics). - Returns: - Mapping of `struct` to `outputs` structure. - """ - struct = map_to_output_names(outputs, self._output_names, struct) - struct = map_missing_dict_keys(outputs, struct) - # Allow passing one object that applies to all outputs. - if not tf.nest.is_nested(struct) and tf.nest.is_nested(outputs): - struct = tf.nest.map_structure(lambda _: struct, outputs) - return struct + Returns: + Mapping of `struct` to `outputs` structure. + """ + struct = map_to_output_names(outputs, self._output_names, struct) + struct = map_missing_dict_keys(outputs, struct) + # Allow passing one object that applies to all outputs. + if not tf.nest.is_nested(struct) and tf.nest.is_nested(outputs): + struct = tf.nest.map_structure(lambda _: struct, outputs) + return struct - def _maybe_broadcast_to_outputs(self, outputs, objects): - """Determines if losses / metrics should be applied to all outputs. + def _maybe_broadcast_to_outputs(self, outputs, objects): + """Determines if losses / metrics should be applied to all outputs. - NOTE: This method should only be called for Metrics / Losses, not for - y_true / sample_weight. + NOTE: This method should only be called for Metrics / Losses, not for + y_true / sample_weight. - Args: - outputs: Model predictions. - objects: Arbitrary nested structure (e.g. of losses or metrics) + Args: + outputs: Model predictions. + objects: Arbitrary nested structure (e.g. of losses or metrics) - Returns: - Arbitrary nested structure of objects, maybe copied to each output. + Returns: + Arbitrary nested structure of objects, maybe copied to each output. - Applies a Loss / Metric to all outputs. - """ - if not self._should_broadcast(objects): - return objects + Applies a Loss / Metric to all outputs. + """ + if not self._should_broadcast(objects): + return objects - # When there is more than one Model output, this is needed to keep - # each Metric / Loss separate. When there is only one Model output, - # the user-supplied object should be used. - should_copy_objects = len(tf.nest.flatten(outputs)) > 1 + # When there is more than one Model output, this is needed to keep + # each Metric / Loss separate. When there is only one Model output, + # the user-supplied object should be used. 
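At the `Model.compile` level, the conform/broadcast rules above mean a single user-supplied loss serves every output, while a dict keyed by output name is mapped onto the output list; a short sketch (the two-output model is illustrative):

    import tensorflow as tf

    inp = tf.keras.Input(shape=(4,))
    out_a = tf.keras.layers.Dense(1, name="a")(inp)
    out_b = tf.keras.layers.Dense(1, name="b")(inp)
    model = tf.keras.Model(inp, [out_a, out_b])

    model.compile(optimizer="sgd", loss="mse")         # Broadcast to a and b.
    model.compile(optimizer="sgd", loss={"a": "mse"})  # "b" gets None, no loss.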
+ should_copy_objects = len(tf.nest.flatten(outputs)) > 1 - def _broadcast_fn(): - if should_copy_objects: - return tf.nest.map_structure(self._copy_object, objects) - return objects + def _broadcast_fn(): + if should_copy_objects: + return tf.nest.map_structure(self._copy_object, objects) + return objects - return tf.nest.map_structure(lambda _: _broadcast_fn(), outputs) + return tf.nest.map_structure(lambda _: _broadcast_fn(), outputs) - def _should_broadcast(self, objects): - raise NotImplementedError + def _should_broadcast(self, objects): + raise NotImplementedError - def _copy_object(self, obj): - raise NotImplementedError + def _copy_object(self, obj): + raise NotImplementedError class LossesContainer(Container): - """A container class for losses passed to `Model.compile()`. - - Args: - losses: Struct of loss function(s). See `Model.compile()` doc for more - information. - loss_weights: Weights of the losses contributions of different model - outputs. See `Model.compile()` doc for more information. - output_names: List of string. Per-output metric names. - total_loss_mean: A `keras.metrics.Mean` instance that is used to track the - mean of all losses (including compiled and regularization losses). - """ - - def __init__(self, - losses, - loss_weights=None, - output_names=None, - total_loss_mean=None): - super(LossesContainer, self).__init__(output_names=output_names) - - # Keep user-supplied values untouched for recompiling and serialization. - self._user_losses = losses - self._user_loss_weights = loss_weights - - self._losses = losses - self._loss_weights = loss_weights - self._per_output_metrics = None # Per-output losses become metrics. - - # Mean of the total loss. - self._total_loss_mean = total_loss_mean or metrics_mod.Mean(name='loss') - self._built = False - - def get_config(self): - # In case `self._losses` is a single string where we convert it to a list. 
- self._losses = tf.nest.flatten(self._losses) - return { - 'losses': [ - saving_lib.serialize_keras_object(obj) - for obj in self._losses - if obj is not None - ], - 'total_loss_mean': - saving_lib.serialize_keras_object(self._total_loss_mean) - } - - @classmethod - def from_config(cls, config): - """Returns the `LossesContainer` instance given the `config`.""" - deserialized_config = {} - for key, value in config.items(): - if isinstance(value, list): - deserialized_config[key] = [ - saving_lib.deserialize_keras_object(item) for item in value - ] - else: - deserialized_config[key] = saving_lib.deserialize_keras_object(value) - return cls(**deserialized_config) - - @property - def metrics(self): - """Per-output loss metrics.""" - if not self._built: - return [] - per_output_metrics = [ - metric_obj for metric_obj in tf.nest.flatten(self._per_output_metrics) - if metric_obj is not None - ] - return [self._total_loss_mean] + per_output_metrics - - def build(self, y_pred): - """One-time setup of loss objects.""" - super(LossesContainer, self).build(y_pred) - - self._losses = self._maybe_broadcast_to_outputs(y_pred, self._losses) - self._losses = self._conform_to_outputs(y_pred, self._losses) - self._losses = tf.nest.map_structure(self._get_loss_object, self._losses) - self._losses = tf.nest.flatten(self._losses) - - self._loss_weights = self._maybe_broadcast_to_outputs( - y_pred, self._loss_weights) - self._loss_weights = self._conform_to_outputs(y_pred, self._loss_weights) - self._loss_weights = tf.nest.flatten(self._loss_weights) - - self._create_metrics() - self._built = True - - @property - def built(self): - return self._built - - def _create_metrics(self): - """Creates per-output loss metrics, but only for multi-output Models.""" - if len(self._output_names) == 1: - self._per_output_metrics = [None] - else: - self._per_output_metrics = [] - for loss_obj, output_name in zip(self._losses, self._output_names): - if loss_obj is None: - self._per_output_metrics.append(None) - else: - self._per_output_metrics.append( - metrics_mod.Mean(output_name + '_loss')) - - def __call__(self, - y_true, - y_pred, - sample_weight=None, - regularization_losses=None): - """Computes the overall loss. - - Args: - y_true: An arbitrary structure of Tensors representing the ground truth. - y_pred: An arbitrary structure of Tensors representing a Model's outputs. - sample_weight: An arbitrary structure of Tensors representing the - per-sample loss weights. If one Tensor is passed, it is used for all - losses. If multiple Tensors are passed, the structure should match - `y_pred`. - regularization_losses: Additional losses to be added to the total loss. - - Returns: - The total loss as a `tf.Tensor`, or `None` if no loss results. - """ - y_true = self._conform_to_outputs(y_pred, y_true) - sample_weight = self._conform_to_outputs(y_pred, sample_weight) - - if not self._built: - self.build(y_pred) - - y_pred = tf.nest.flatten(y_pred) - y_true = tf.nest.flatten(y_true) - sample_weight = tf.nest.flatten(sample_weight) - - loss_values = [] # Used for gradient calculation. - total_loss_mean_values = [] # Used for loss metric calculation. - batch_dim = None - zip_args = (y_true, y_pred, sample_weight, self._losses, self._loss_weights, - self._per_output_metrics) - for y_t, y_p, sw, loss_obj, loss_weight, metric_obj in zip(*zip_args): - if y_t is None or loss_obj is None: # Ok to have no loss for an output. 
- continue - - y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw) - sw = apply_mask(y_p, sw, get_mask(y_p)) - loss_value = loss_obj(y_t, y_p, sample_weight=sw) - - total_loss_mean_value = loss_value - # Correct for the `Mean` loss metrics counting each replica as a batch. - if loss_obj.reduction == losses_utils.ReductionV2.SUM: - total_loss_mean_value *= tf.distribute.get_strategy( - ).num_replicas_in_sync - - if batch_dim is None: - if tf_utils.is_ragged(y_t): - batch_dim = y_t.nrows() - else: - batch_dim = tf.shape(y_t)[0] - - if metric_obj is not None: - metric_obj.update_state(total_loss_mean_value, sample_weight=batch_dim) - - if loss_weight is not None: - loss_value *= loss_weight - total_loss_mean_value *= loss_weight - - if (loss_obj.reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE or - loss_obj.reduction == losses_utils.ReductionV2.AUTO): - loss_value = losses_utils.scale_loss_for_distribution(loss_value) - - loss_values.append(loss_value) - total_loss_mean_values.append(total_loss_mean_value) - - if regularization_losses: - regularization_losses = losses_utils.cast_losses_to_common_dtype( - regularization_losses) - reg_loss = tf.add_n(regularization_losses) - total_loss_mean_values.append(reg_loss) - loss_values.append(losses_utils.scale_loss_for_distribution(reg_loss)) - - if loss_values: - total_loss_mean_values = losses_utils.cast_losses_to_common_dtype( - total_loss_mean_values) - total_total_loss_mean_value = tf.add_n(total_loss_mean_values) - self._total_loss_mean.update_state( - total_total_loss_mean_value, sample_weight=batch_dim) - - loss_values = losses_utils.cast_losses_to_common_dtype(loss_values) - total_loss = tf.add_n(loss_values) - return total_loss - else: - return None - - def reset_state(self): - """Resets the state of loss metrics.""" - if not self._built: - return - metrics = [self._total_loss_mean] + tf.nest.flatten( - self._per_output_metrics) - for metric_obj in metrics: - if metric_obj is not None: - metric_obj.reset_state() - - def _get_loss_object(self, loss): - """Returns a `Loss` object. - - Converts the user-supplied loss to a `Loss` object. Also allows - `SUM_OVER_BATCH_SIZE` reduction to be used for this loss. + """A container class for losses passed to `Model.compile()`. Args: - loss: A string, function, or `Loss` object. - - Returns: - A `Loss` object. + losses: Struct of loss function(s). See `Model.compile()` doc for more + information. + loss_weights: Weights of the losses contributions of different model + outputs. See `Model.compile()` doc for more information. + output_names: List of string. Per-output metric names. + total_loss_mean: A `keras.metrics.Mean` instance that is used to track the + mean of all losses (including compiled and regularization losses). """ - if loss is None: - return None # Ok to have no loss for an output. - - loss = losses_mod.get(loss) - if not isinstance(loss, losses_mod.Loss): - loss_name = get_custom_object_name(loss) - if loss_name is None: - raise ValueError( - f'Loss should be a callable, received: {loss}') - loss = losses_mod.LossFunctionWrapper(loss, name=loss_name) - loss._allow_sum_over_batch_size = True # pylint: disable=protected-access - return loss - - def _should_broadcast(self, obj): - return not tf.nest.is_nested(obj) - def _copy_object(self, obj): - return obj # Losses don't need to be copied. 
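To make the control flow above concrete, the container can be exercised directly; a sketch against the internal `keras.engine.compile_utils` module as laid out in this diff (internal API, subject to change):

    import tensorflow as tf
    from keras.engine import compile_utils

    container = compile_utils.LossesContainer(losses="mse", output_names=["out"])
    y_true = tf.constant([[1.0], [3.0]])
    y_pred = tf.constant([[1.0], [1.0]])

    total = container(y_true, y_pred)  # Builds on first call, then computes.
    print(float(total))                # 2.0: mean of per-sample errors 0 and 4.
    print([m.name for m in container.metrics])  # ['loss']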
+ def __init__( + self, + losses, + loss_weights=None, + output_names=None, + total_loss_mean=None, + mesh=None, + ): + super(LossesContainer, self).__init__( + output_names=output_names, mesh=mesh + ) + + # Keep user-supplied values untouched for recompiling and serialization. + self._user_losses = losses + self._user_loss_weights = loss_weights + + self._losses = losses + self._loss_weights = loss_weights + self._per_output_metrics = None # Per-output losses become metrics. + + # Mean of the total loss. + self._total_loss_mean = total_loss_mean or metrics_mod.Mean( + name="loss", mesh=self._mesh + ) + self._built = False + + def get_config(self): + # In case `self._losses` is a single string where we convert it to a + # list. + self._losses = tf.nest.flatten(self._losses) + return { + "losses": [ + saving_lib.serialize_keras_object(obj) + for obj in self._losses + if obj is not None + ], + "total_loss_mean": saving_lib.serialize_keras_object( + self._total_loss_mean + ), + } + + @classmethod + def from_config(cls, config): + """Returns the `LossesContainer` instance given the `config`.""" + deserialized_config = {} + for key, value in config.items(): + if isinstance(value, list): + deserialized_config[key] = [ + saving_lib.deserialize_keras_object(item) for item in value + ] + else: + deserialized_config[key] = saving_lib.deserialize_keras_object( + value + ) + return cls(**deserialized_config) + + @property + def metrics(self): + """Per-output loss metrics.""" + if not self._built: + return [] + per_output_metrics = [ + metric_obj + for metric_obj in tf.nest.flatten(self._per_output_metrics) + if metric_obj is not None + ] + return [self._total_loss_mean] + per_output_metrics + + def build(self, y_pred): + """One-time setup of loss objects.""" + super(LossesContainer, self).build(y_pred) + + self._losses = self._maybe_broadcast_to_outputs(y_pred, self._losses) + self._losses = self._conform_to_outputs(y_pred, self._losses) + self._losses = tf.nest.map_structure( + self._get_loss_object, self._losses + ) + self._losses = tf.nest.flatten(self._losses) + + self._loss_weights = self._maybe_broadcast_to_outputs( + y_pred, self._loss_weights + ) + self._loss_weights = self._conform_to_outputs( + y_pred, self._loss_weights + ) + self._loss_weights = tf.nest.flatten(self._loss_weights) + + self._create_metrics() + self._built = True + + @property + def built(self): + return self._built + + def _create_metrics(self): + """Creates per-output loss metrics, but only for multi-output Models.""" + if len(self._output_names) == 1: + self._per_output_metrics = [None] + else: + self._per_output_metrics = [] + for loss_obj, output_name in zip(self._losses, self._output_names): + if loss_obj is None: + self._per_output_metrics.append(None) + else: + self._per_output_metrics.append( + metrics_mod.Mean(output_name + "_loss", mesh=self._mesh) + ) + + def __call__( + self, y_true, y_pred, sample_weight=None, regularization_losses=None + ): + """Computes the overall loss. + + Args: + y_true: An arbitrary structure of Tensors representing the ground + truth. + y_pred: An arbitrary structure of Tensors representing a Model's + outputs. + sample_weight: An arbitrary structure of Tensors representing the + per-sample loss weights. If one Tensor is passed, it is used for all + losses. If multiple Tensors are passed, the structure should match + `y_pred`. + regularization_losses: Additional losses to be added to the total + loss. + + Returns: + The total loss as a `tf.Tensor`, or `None` if no loss results. 
+ """ + y_true = self._conform_to_outputs(y_pred, y_true) + sample_weight = self._conform_to_outputs(y_pred, sample_weight) + + if not self._built: + self.build(y_pred) + + y_pred = tf.nest.flatten(y_pred) + y_true = tf.nest.flatten(y_true) + sample_weight = tf.nest.flatten(sample_weight) + + loss_values = [] # Used for gradient calculation. + total_loss_mean_values = [] # Used for loss metric calculation. + batch_dim = None + zip_args = ( + y_true, + y_pred, + sample_weight, + self._losses, + self._loss_weights, + self._per_output_metrics, + ) + for y_t, y_p, sw, loss_obj, loss_weight, metric_obj in zip(*zip_args): + if ( + y_t is None or loss_obj is None + ): # Ok to have no loss for an output. + continue + + y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw) + sw = losses_utils.apply_mask(y_p, sw, losses_utils.get_mask(y_p)) + loss_value = loss_obj(y_t, y_p, sample_weight=sw) + + total_loss_mean_value = loss_value + # Correct for the `Mean` loss metrics counting each replica as a + # batch. + if loss_obj.reduction == losses_utils.ReductionV2.SUM: + total_loss_mean_value *= ( + tf.distribute.get_strategy().num_replicas_in_sync + ) + + if batch_dim is None: + if tf_utils.is_ragged(y_t): + batch_dim = y_t.nrows() + else: + batch_dim = tf.shape(y_t)[0] + + if metric_obj is not None: + metric_obj.update_state( + total_loss_mean_value, sample_weight=batch_dim + ) + + if loss_weight is not None: + loss_value *= loss_weight + total_loss_mean_value *= loss_weight + + if ( + loss_obj.reduction + == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE + or loss_obj.reduction == losses_utils.ReductionV2.AUTO + ): + loss_value = losses_utils.scale_loss_for_distribution( + loss_value + ) + + loss_values.append(loss_value) + total_loss_mean_values.append(total_loss_mean_value) + + if regularization_losses: + regularization_losses = losses_utils.cast_losses_to_common_dtype( + regularization_losses + ) + reg_loss = tf.add_n(regularization_losses) + total_loss_mean_values.append(reg_loss) + loss_values.append( + losses_utils.scale_loss_for_distribution(reg_loss) + ) + + if loss_values: + total_loss_mean_values = losses_utils.cast_losses_to_common_dtype( + total_loss_mean_values + ) + total_total_loss_mean_value = tf.add_n(total_loss_mean_values) + self._total_loss_mean.update_state( + total_total_loss_mean_value, sample_weight=batch_dim + ) + + loss_values = losses_utils.cast_losses_to_common_dtype(loss_values) + total_loss = tf.add_n(loss_values) + return total_loss + else: + return None + + def reset_state(self): + """Resets the state of loss metrics.""" + if not self._built: + return + metrics = [self._total_loss_mean] + tf.nest.flatten( + self._per_output_metrics + ) + for metric_obj in metrics: + if metric_obj is not None: + metric_obj.reset_state() + + def _get_loss_object(self, loss): + """Returns a `Loss` object. + + Converts the user-supplied loss to a `Loss` object. Also allows + `SUM_OVER_BATCH_SIZE` reduction to be used for this loss. + + Args: + loss: A string, function, or `Loss` object. + + Returns: + A `Loss` object. + """ + if loss is None: + return None # Ok to have no loss for an output. 
+ + loss = losses_mod.get(loss) + if not isinstance(loss, losses_mod.Loss): + loss_name = get_custom_object_name(loss) + if loss_name is None: + raise ValueError(f"Loss should be a callable, received: {loss}") + loss = losses_mod.LossFunctionWrapper(loss, name=loss_name) + loss._allow_sum_over_batch_size = True + return loss + + def _should_broadcast(self, obj): + return not tf.nest.is_nested(obj) + + def _copy_object(self, obj): + return obj # Losses don't need to be copied. class MetricsContainer(Container): - """A container class for metrics passed to `Model.compile`.""" - - def __init__(self, metrics=None, weighted_metrics=None, output_names=None, - from_serialized=False): - """Initializes a container for metrics. - - Arguments: - metrics: see the `metrics` argument from `tf.keras.Model.compile`. - weighted_metrics: see the `weighted_metrics` argument from - `tf.keras.Model.compile`. - output_names: A list of strings of names of outputs for the model. - from_serialized: Whether the model being compiled is from a serialized - model. Used to avoid redundantly applying pre-processing renaming - steps. - """ - super(MetricsContainer, self).__init__(output_names=output_names) + """A container class for metrics passed to `Model.compile`.""" + + def __init__( + self, + metrics=None, + weighted_metrics=None, + output_names=None, + from_serialized=False, + mesh=None, + ): + """Initializes a container for metrics. + + Arguments: + metrics: see the `metrics` argument from `tf.keras.Model.compile`. + weighted_metrics: see the `weighted_metrics` argument from + `tf.keras.Model.compile`. + output_names: A list of strings of names of outputs for the model. + from_serialized: Whether the model being compiled is from a serialized + model. Used to avoid redundantly applying pre-processing renaming + steps. + """ + super(MetricsContainer, self).__init__( + output_names=output_names, mesh=mesh + ) + + self._check_duplicated_metrics(metrics, weighted_metrics) + # Keep user-supplied values untouched for recompiling and serialization. + self._user_metrics = metrics + self._user_weighted_metrics = weighted_metrics + + self._metrics = metrics + self._weighted_metrics = weighted_metrics + self._built = False + + self._from_serialized = from_serialized + + def _check_duplicated_metrics(self, metrics, weighted_metrics): + """Raise an error when user-provided metrics contain duplicates. + + Note that metrics are stateful containers; a shared metric instance + between model.metric and model.weighted_metric will cause the same + instance to be updated twice, and report a wrong value. + + Args: + metrics: User provided metrics list. + weighted_metrics: User provided weighted metrics list. + + Raises: + ValueError, when duplicated metric instances are discovered in the + user-provided metrics and weighted metrics. + """ + seen = set() + duplicated = [] + for x in tf.nest.flatten(metrics) + tf.nest.flatten(weighted_metrics): + # We only check Metric objects. String and function objects + # will be converted to unique Metric instances. + if not isinstance(x, metrics_mod.Metric): + continue + if x in seen: + duplicated.append(x) + seen.add(x) + + if duplicated: + raise ValueError( + "Found duplicated metrics object in the user provided " + "metrics and weighted metrics. This will cause the same " + "metric object to be updated multiple times, and report " + "wrong results. 
\n" + f"Duplicated items: {duplicated}" + ) + + @property + def metrics(self): + """All metrics in this container.""" + if not self._built: + return [] + return self._metrics_in_order + + @property + def unweighted_metrics(self): + """Metrics in the container that should not be passed sample_weight.""" + if not self._built: + return None + return tf.nest.flatten(self._metrics) + + @property + def weighted_metrics(self): + """Metrics in this container that should be passed `sample_weight`.""" + if not self._built: + return None + return tf.nest.flatten(self._weighted_metrics) + + def build(self, y_pred, y_true): + """One-time setup of metric objects.""" + super(MetricsContainer, self).build(y_pred) + + self._metrics = self._maybe_broadcast_to_outputs(y_pred, self._metrics) + self._metrics = self._conform_to_outputs(y_pred, self._metrics) + + self._weighted_metrics = self._maybe_broadcast_to_outputs( + y_pred, self._weighted_metrics + ) + self._weighted_metrics = self._conform_to_outputs( + y_pred, self._weighted_metrics + ) + + # Standardize on tuple since `tf.data` turns lists into `Tensor`s. + y_pred = tf.__internal__.nest.list_to_tuple(y_pred) + y_true = tf.__internal__.nest.list_to_tuple(y_true) + self._metrics = tf.__internal__.nest.list_to_tuple(self._metrics) + self._weighted_metrics = tf.__internal__.nest.list_to_tuple( + self._weighted_metrics + ) + + # Convert to `Metric` objects, potentially disambiguating based on + # output properties. + self._metrics = tf.__internal__.nest.map_structure_up_to( + y_pred, self._get_metric_objects, self._metrics, y_true, y_pred + ) + self._weighted_metrics = tf.__internal__.nest.map_structure_up_to( + y_pred, + self._get_metric_objects, + self._weighted_metrics, + y_true, + y_pred, + ) + + self._metrics = tf.__internal__.nest.flatten_up_to( + y_pred, self._metrics, check_types=False + ) + self._weighted_metrics = tf.__internal__.nest.flatten_up_to( + y_pred, self._weighted_metrics, check_types=False + ) + + # Assumes metrics, weighted_metrics have been flattened up to outputs. + # + # If we are loading a model that has been already serialized, we do not + # want to re-apply any pre-processing metric renaming steps. + if not self._from_serialized: + self._set_metric_names() + self._create_ordered_metrics() + self._built = True + + @property + def built(self): + return self._built + + def _set_metric_names(self): + """Sets unique metric names.""" + # For multi-output models, prepend the output name to the metric name. + # For weighted metrics, prepend "weighted_" if the name would be + # non-unique. + + metric_names = set() + is_multi_output = len(self._output_names) > 1 + zip_args = (self._output_names, self._metrics, self._weighted_metrics) + for output_name, output_metrics, weighted_output_metrics in zip( + *zip_args + ): + for m in output_metrics: + if m is None: + continue + if is_multi_output: + m._name = output_name + "_" + m._name + if m._name in metric_names: + raise ValueError( + f"Found two metrics with the same name: {m._name}. " + "All the metrics added to the model need to have " + "unique names." 
+ ) + metric_names.add(m._name) + + for wm in weighted_output_metrics: + if wm is None: + continue + if is_multi_output: + if output_name + "_" + wm._name in metric_names: + wm._name = output_name + "_weighted_" + wm._name + else: + wm._name = output_name + "_" + wm._name + elif wm._name in metric_names: + wm._name = "weighted_" + wm._name + + if wm._name in metric_names: + raise ValueError( + "Found two weighted metrics with the same name: " + f"{wm._name}. All the metrics added to the model need " + "to have unique names." + ) + metric_names.add(wm._name) + + def _create_ordered_metrics(self): + """Cache the flat order needed when returning metrics, for backwards compat.""" + self._metrics_in_order = [] + for output_metrics, output_weighted_metrics in zip( + self._metrics, self._weighted_metrics + ): + for m in tf.nest.flatten(output_metrics): + if m is not None: + self._metrics_in_order.append(m) + for wm in tf.nest.flatten(output_weighted_metrics): + if wm is not None: + self._metrics_in_order.append(wm) + + def update_state(self, y_true, y_pred, sample_weight=None): + """Updates the state of per-output metrics.""" + y_true = self._conform_to_outputs(y_pred, y_true) + sample_weight = self._conform_to_outputs(y_pred, sample_weight) + + if not self._built: + self.build(y_pred, y_true) + + y_pred = tf.nest.flatten(y_pred) + y_true = tf.nest.flatten(y_true) if y_true is not None else [] + sample_weight = tf.nest.flatten(sample_weight) + + zip_args = ( + y_true, + y_pred, + sample_weight, + self._metrics, + self._weighted_metrics, + ) + for y_t, y_p, sw, metric_objs, weighted_metric_objs in zip(*zip_args): + # Ok to have no metrics for an output. + if y_t is None or ( + all(m is None for m in metric_objs) + and all(wm is None for wm in weighted_metric_objs) + ): + continue + + y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw) + mask = losses_utils.get_mask(y_p) + sw = losses_utils.apply_mask(y_p, sw, mask) + + for metric_obj in metric_objs: + if metric_obj is None: + continue + metric_obj.update_state(y_t, y_p, sample_weight=mask) + + for weighted_metric_obj in weighted_metric_objs: + if weighted_metric_obj is None: + continue + weighted_metric_obj.update_state(y_t, y_p, sample_weight=sw) + + def reset_state(self): + """Resets the state of all `Metric`s in this container.""" + if self._built: + metrics = self._metrics_in_order + else: + # If the user supplied `Metric` objects directly, we should + # reset those. This could also contain `str`s or `function`s + # though. + metrics = tf.nest.flatten(self._user_metrics) + tf.nest.flatten( + self._user_weighted_metrics + ) + + for metric_obj in metrics: + if isinstance(metric_obj, metrics_mod.Metric): + metric_obj.reset_state() + + def _get_metric_objects(self, metrics, y_t, y_p): + """Convert user-supplied metrics to `Metric` objects.""" + metrics = tf.nest.flatten(metrics) + return [self._get_metric_object(m, y_t, y_p) for m in metrics] + + def _get_metric_object(self, metric, y_t, y_p): + """Converts user-supplied metric to a `Metric` object. + + Args: + metric: A string, function, or `Metric` object. + y_t: Sample of label. + y_p: Sample of output. + + Returns: + A `Metric` object. + """ + if metric is None: + return None # Ok to have no metric for an output. + + # Convenience feature for selecting between binary, categorical, + # and sparse categorical.
+ if str(metric).lower() not in ["accuracy", "acc", "crossentropy", "ce"]: + metric_obj = metrics_mod.get(metric) + else: + y_t_rank = len(y_t.shape.as_list()) + y_p_rank = len(y_p.shape.as_list()) + y_t_last_dim = y_t.shape.as_list()[-1] + y_p_last_dim = y_p.shape.as_list()[-1] + + is_binary = y_p_last_dim == 1 + is_sparse_categorical = ( + y_t_rank < y_p_rank or y_t_last_dim == 1 and y_p_last_dim > 1 + ) + + if str(metric).lower() in ["accuracy", "acc"]: + if is_binary: + metric_obj = metrics_mod.binary_accuracy + elif is_sparse_categorical: + metric_obj = metrics_mod.sparse_categorical_accuracy + else: + metric_obj = metrics_mod.categorical_accuracy + else: + if is_binary: + metric_obj = metrics_mod.binary_crossentropy + elif is_sparse_categorical: + metric_obj = metrics_mod.sparse_categorical_crossentropy + else: + metric_obj = metrics_mod.categorical_crossentropy + + if isinstance(metric_obj, losses_mod.Loss): + metric_obj._allow_sum_over_batch_size = True + + if not isinstance(metric_obj, metrics_mod.Metric): + if isinstance(metric, str): + metric_name = metric + else: + metric_name = get_custom_object_name(metric) + if metric_name is None: + raise ValueError( + f"Metric should be a callable, received: {metric}" + ) + + metric_obj = metrics_mod.MeanMetricWrapper( + metric_obj, name=metric_name, mesh=self._mesh + ) + return metric_obj + + def _should_broadcast(self, obj): + # e.g. 'mse'. + if not tf.nest.is_nested(obj): + return True + # e.g. ['mse'] or ['mse', 'mae']. + return isinstance(obj, (list, tuple)) and not any( + tf.nest.is_nested(o) for o in obj + ) + + def _copy_object(self, obj): + if isinstance(obj, metrics_mod.Metric): + return obj.__class__.from_config(obj.get_config()) + return obj # Can be a function or `None`. - self._check_duplicated_metrics(metrics, weighted_metrics) - # Keep user-supplied values untouched for recompiling and serialization. - self._user_metrics = metrics - self._user_weighted_metrics = weighted_metrics - self._metrics = metrics - self._weighted_metrics = weighted_metrics - self._built = False +def create_pseudo_output_names(outputs): + """Create pseudo output names for a subclassed Model.""" + return _create_pseudo_names(outputs, prefix="output_") - self._from_serialized = from_serialized - def _check_duplicated_metrics(self, metrics, weighted_metrics): - """Check and raise error when user provided metrics has any duplications. +def create_pseudo_input_names(inputs): + """Create pseudo input names for a subclassed Model.""" + return _create_pseudo_names(inputs, prefix="input_") - Note that metrics are stateful container, a shared metric instance between - model.metric and model.weighted_metric will make the same intance to be - udpated twice, and report wrong value. - Args: - metrics: User provided metrics list. - weighted_metrics: User provided weighted metrics list. +def _create_pseudo_names(tensors, prefix): + """Creates pseudo {input | output} names for subclassed Models. - Raises: - ValueError, when duplicated metrics instance discovered in user provided - metrics and weighted metrics. + Warning: this function should only be used to define default + names for `Metrics` and `SavedModel`. No other use cases should + rely on a `Model`'s input or output names. - """ - seen = set() - duplicated = [] - for x in tf.nest.flatten(metrics) + tf.nest.flatten(weighted_metrics): - # We only check metrics object. The string and function objects - # will be converted to unique Metric instance. - if not isinstance(x, metrics_mod.Metric): - continue - if x in seen: - duplicated.append(x) - seen.add(x) - - if duplicated: - raise ValueError('Found duplicated metrics object in the user provided ' - 'metrics and weighted metrics. 
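The shape-driven selection above can be restated in isolation; a plain-Python sketch of the decision rule (the function name is illustrative, not Keras API):

    def resolve_accuracy(y_t_shape, y_p_shape):
        # Mirrors _get_metric_object: binary if predictions have one unit,
        # sparse-categorical if labels look like integer indices.
        is_binary = y_p_shape[-1] == 1
        is_sparse = (len(y_t_shape) < len(y_p_shape)
                     or (y_t_shape[-1] == 1 and y_p_shape[-1] > 1))
        if is_binary:
            return "binary_accuracy"
        if is_sparse:
            return "sparse_categorical_accuracy"
        return "categorical_accuracy"

    print(resolve_accuracy((32, 1), (32, 1)))    # binary_accuracy
    print(resolve_accuracy((32, 1), (32, 10)))   # sparse_categorical_accuracy
    print(resolve_accuracy((32, 10), (32, 10)))  # categorical_accuracy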
This will cause the same ' - 'metric object to be updated multiple times, and report ' - 'wrong results. \n' - f'Duplicated items: {duplicated}') - - @property - def metrics(self): - """All metrics in this container.""" - if not self._built: - return [] - return self._metrics_in_order - - @property - def unweighted_metrics(self): - """Metrics in this container that should not be passed `sample_weight`.""" - if not self._built: - return None - return tf.nest.flatten(self._metrics) - - @property - def weighted_metrics(self): - """Metrics in this container that should be passed `sample_weight`.""" - if not self._built: - return None - return tf.nest.flatten(self._weighted_metrics) - - def build(self, y_pred, y_true): - """One-time setup of metric objects.""" - super(MetricsContainer, self).build(y_pred) - - self._metrics = self._maybe_broadcast_to_outputs(y_pred, self._metrics) - self._metrics = self._conform_to_outputs(y_pred, self._metrics) - - self._weighted_metrics = self._maybe_broadcast_to_outputs( - y_pred, self._weighted_metrics) - self._weighted_metrics = self._conform_to_outputs(y_pred, - self._weighted_metrics) - - # Standardize on tuple since `tf.data` turns lists into `Tensor`s. - y_pred = tf.__internal__.nest.list_to_tuple(y_pred) - y_true = tf.__internal__.nest.list_to_tuple(y_true) - self._metrics = tf.__internal__.nest.list_to_tuple(self._metrics) - self._weighted_metrics = tf.__internal__.nest.list_to_tuple( - self._weighted_metrics) - - # Convert to `Metric` objects, potentially disambiguating based on output - # properties. - self._metrics = tf.__internal__.nest.map_structure_up_to( - y_pred, - self._get_metric_objects, - self._metrics, - y_true, - y_pred) - self._weighted_metrics = tf.__internal__.nest.map_structure_up_to( - y_pred, - self._get_metric_objects, - self._weighted_metrics, - y_true, - y_pred) - - self._metrics = tf.__internal__.nest.flatten_up_to( - y_pred, self._metrics, check_types=False) - self._weighted_metrics = tf.__internal__.nest.flatten_up_to( - y_pred, self._weighted_metrics, check_types=False) - - # Assumes metrics, weighted_metrics have been flattened up to outputs. - # - # If we are loading a model that has been already serialized, we do not - # want to re-apply any pre-processing metric renaming steps. - if not self._from_serialized: - self._set_metric_names() - self._create_ordered_metrics() - self._built = True - - @property - def built(self): - return self._built - - def _set_metric_names(self): - """Sets unique metric names.""" - # For multi-output models, prepend the output name to the metric name. - # For weighted metrics, prepend "weighted_" if the name would be non-unique. - # pylint: disable=protected-access - metric_names = set() - is_multi_output = len(self._output_names) > 1 - zip_args = (self._output_names, self._metrics, self._weighted_metrics) - for output_name, output_metrics, weighted_output_metrics in zip(*zip_args): - for m in output_metrics: - if m is None: - continue - if is_multi_output: - m._name = output_name + '_' + m._name - if m._name in metric_names: - raise ValueError( - f'Found two metrics with the same name: {m._name}. 
' - 'All the metrics added to the model need to have unique names.') - metric_names.add(m._name) - - for wm in weighted_output_metrics: - if wm is None: - continue - if is_multi_output: - if output_name + '_' + wm._name in metric_names: - wm._name = output_name + '_weighted_' + wm._name - else: - wm._name = output_name + '_' + wm._name - elif wm._name in metric_names: - wm._name = 'weighted_' + wm._name - - if wm._name in metric_names: - raise ValueError( - f'Found two weighted metrics with the same name: {wm._name}.' - 'All the metrics added to the model need to have unique names.') - metric_names.add(wm._name) - # pylint: enable=protected-access - - def _create_ordered_metrics(self): - """Cache the flat order needed when returning metrics, for backwards compat.""" - self._metrics_in_order = [] - for output_metrics, output_weighted_metrics in zip(self._metrics, - self._weighted_metrics): - for m in tf.nest.flatten(output_metrics): - if m is not None: - self._metrics_in_order.append(m) - for wm in tf.nest.flatten(output_weighted_metrics): - if wm is not None: - self._metrics_in_order.append(wm) - - def update_state(self, y_true, y_pred, sample_weight=None): - """Updates the state of per-output metrics.""" - y_true = self._conform_to_outputs(y_pred, y_true) - sample_weight = self._conform_to_outputs(y_pred, sample_weight) - - if not self._built: - self.build(y_pred, y_true) - - y_pred = tf.nest.flatten(y_pred) - y_true = tf.nest.flatten(y_true) if y_true is not None else [] - sample_weight = tf.nest.flatten(sample_weight) - - zip_args = (y_true, y_pred, sample_weight, self._metrics, - self._weighted_metrics) - for y_t, y_p, sw, metric_objs, weighted_metric_objs in zip(*zip_args): - # Ok to have no metrics for an output. - if (y_t is None or (all(m is None for m in metric_objs) and - all(wm is None for wm in weighted_metric_objs))): - continue - - y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw) - mask = get_mask(y_p) - sw = apply_mask(y_p, sw, mask) - - for metric_obj in metric_objs: - if metric_obj is None: - continue - metric_obj.update_state(y_t, y_p, sample_weight=mask) - - for weighted_metric_obj in weighted_metric_objs: - if weighted_metric_obj is None: - continue - weighted_metric_obj.update_state(y_t, y_p, sample_weight=sw) - - def reset_state(self): - """Resets the state of all `Metric`s in this container.""" - if self._built: - metrics = self._metrics_in_order - else: - # If the user supplied `Metric` objects directly, we should - # reset those. This could also contain `str`s or `function`s - # though. - metrics = tf.nest.flatten(self._user_metrics) + tf.nest.flatten( - self._user_weighted_metrics) + Warning: this function should only be used to define default + names for `Metics` and `SavedModel`. No other use cases should + rely on a `Model`'s input or output names. + + Example with dict: - for metric_obj in metrics: - if isinstance(metric_obj, metrics_mod.Metric): - metric_obj.reset_state() + `{'a': [x1, x2], 'b': x3}` becomes: + `['a_1', 'a_2', 'b']` - def _get_metric_objects(self, metrics, y_t, y_p): - """Convert user-supplied metrics to `Metric` objects.""" - metrics = tf.nest.flatten(metrics) - return [self._get_metric_object(m, y_t, y_p) for m in metrics] + Example with list: - def _get_metric_object(self, metric, y_t, y_p): - """Converts user-supplied metric to a `Metric` object. + `[x, y]` becomes: + `['output_1', 'output_2']` Args: - metric: A string, function, or `Metric` object. - y_t: Sample of label. - y_p: Sample of output. + tensors: `Model`'s outputs or inputs. 
+ prefix: 'output_' for outputs, 'input_' for inputs. Returns: - A `Metric` object. + Flattened list of pseudo names. """ - if metric is None: - return None # Ok to have no metric for an output. - # Convenience feature for selecting b/t binary, categorical, - # and sparse categorical. - if str(metric).lower() not in ['accuracy', 'acc', 'crossentropy', 'ce']: - metric_obj = metrics_mod.get(metric) - else: - y_t_rank = len(y_t.shape.as_list()) - y_p_rank = len(y_p.shape.as_list()) - y_t_last_dim = y_t.shape.as_list()[-1] - y_p_last_dim = y_p.shape.as_list()[-1] - - is_binary = y_p_last_dim == 1 - is_sparse_categorical = ( - y_t_rank < y_p_rank or y_t_last_dim == 1 and y_p_last_dim > 1) - - if str(metric).lower() in ['accuracy', 'acc']: - if is_binary: - metric_obj = metrics_mod.binary_accuracy - elif is_sparse_categorical: - metric_obj = metrics_mod.sparse_categorical_accuracy - else: - metric_obj = metrics_mod.categorical_accuracy - else: - if is_binary: - metric_obj = metrics_mod.binary_crossentropy - elif is_sparse_categorical: - metric_obj = metrics_mod.sparse_categorical_crossentropy + def one_index(ele): + # Start with "output_1" instead of "output_0". + if isinstance(ele, int): + return ele + 1 + return ele + + flat_paths = list(tf.__internal__.nest.yield_flat_paths(tensors)) + flat_paths = tf.nest.map_structure(one_index, flat_paths) + names = [] + for path in flat_paths: + if not path: + name = prefix + "1" # Single output. else: - metric_obj = metrics_mod.categorical_crossentropy - - if isinstance(metric_obj, losses_mod.Loss): - metric_obj._allow_sum_over_batch_size = True # pylint: disable=protected-access - - if not isinstance(metric_obj, metrics_mod.Metric): - if isinstance(metric, str): - metric_name = metric - else: - metric_name = get_custom_object_name(metric) - if metric_name is None: - raise ValueError( - f'Metric should be a callable, received: {metric}') - - metric_obj = metrics_mod.MeanMetricWrapper(metric_obj, name=metric_name) - - return metric_obj - - def _should_broadcast(self, obj): - # e.g. 'mse'. - if not tf.nest.is_nested(obj): - return True - # e.g. ['mse'] or ['mse', 'mae']. - return (isinstance(obj, (list, tuple)) and - not any(tf.nest.is_nested(o) for o in obj)) - - def _copy_object(self, obj): - if isinstance(obj, metrics_mod.Metric): - return obj.__class__.from_config(obj.get_config()) - return obj # Can be a function or `None`. - - -def create_pseudo_output_names(outputs): - """Create pseudo output names for a subclassed Model.""" - return _create_pseudo_names(outputs, prefix='output_') - - -def create_pseudo_input_names(inputs): - """Create pseudo input names for a subclassed Model.""" - return _create_pseudo_names(inputs, prefix='input_') - - -def _create_pseudo_names(tensors, prefix): - """Creates pseudo {input | output} names for subclassed Models. + name = "_".join(str(p) for p in path) + if isinstance(path[0], int): + name = prefix + name + names.append(name) + return names - Warning: this function should only be used to define default - names for `Metics` and `SavedModel`. No other use cases should - rely on a `Model`'s input or output names. - Example with dict: - - `{'a': [x1, x2], 'b': x3}` becomes: - `['a_1', 'a_2', 'b']` - - Example with list: +def map_to_output_names(y_pred, output_names, struct): + """Maps a dict to a list using `output_names` as keys. - `[x, y]` becomes: - `['output_1', 'output_2']` + This is a convenience feature only. 
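Worked examples of the path flattening above, matching the docstring (internal module, shown for reference):

    from keras.engine import compile_utils

    # Dict: keys become prefixes, list positions become 1-based suffixes.
    print(compile_utils.create_pseudo_output_names({"a": [0, 1], "b": 2}))
    # ['a_1', 'a_2', 'b']

    # Flat list: positional names with the "output_" prefix.
    print(compile_utils.create_pseudo_output_names([0, 1]))
    # ['output_1', 'output_2']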
When a `Model`'s outputs
+    are a list, you can specify per-output losses and metrics as
+    a dict, where the keys are the output names. If you specify
+    per-output losses and metrics via the same structure as the
+    `Model`'s outputs (recommended), no mapping is performed.
 
-  Args:
-    tensors: `Model`'s outputs or inputs.
-    prefix: 'output_' for outputs, 'input_' for inputs.
+    For the Functional API, the output names are the names of the
+    last layer of each output. For the Subclass API, the output names
+    are determined by `create_pseudo_output_names` (for example:
+    `['output_1', 'output_2']` for a list of outputs).
 
-  Returns:
-    Flattened list of pseudo names.
-  """
+    This mapping preserves backwards compatibility for `compile` and
+    `fit`.
 
-  def one_index(ele):
-    # Start with "output_1" instead of "output_0".
-    if isinstance(ele, int):
-      return ele + 1
-    return ele
+    Args:
+      y_pred: Sample outputs of the Model, to determine if this convenience
+        feature should be applied (`struct` is returned unmodified if `y_pred`
+        isn't a flat list).
+      output_names: List. The names of the outputs of the Model.
+      struct: The structure to map.
 
-  flat_paths = list(tf.__internal__.nest.yield_flat_paths(tensors))
-  flat_paths = tf.nest.map_structure(one_index, flat_paths)
-  names = []
-  for path in flat_paths:
-    if not path:
-      name = prefix + '1'  # Single output.
+    Returns:
+        `struct` mapped to a list in same order as `output_names`.
+    """
+    single_output = not tf.nest.is_nested(y_pred)
+    outputs_are_flat_list = (
+        not single_output
+        and isinstance(y_pred, (list, tuple))
+        and not any(tf.nest.is_nested(y_p) for y_p in y_pred)
+    )
+
+    if (single_output or outputs_are_flat_list) and isinstance(struct, dict):
+        output_names = output_names or create_pseudo_output_names(y_pred)
+        struct = copy.copy(struct)
+        new_struct = [struct.pop(name, None) for name in output_names]
+        if struct:
+            raise ValueError(
+                "Found unexpected losses or metrics that do not correspond "
+                f"to any Model output: {struct.keys()}. "
+                f"Valid model output names: {output_names}. "
+                f"Received struct is: {struct}."
+            )
+        if len(new_struct) == 1:
+            return new_struct[0]
+        return new_struct
     else:
-      name = '_'.join(str(p) for p in path)
-      if isinstance(path[0], int):
-        name = prefix + name
-    names.append(name)
-  return names
+        return struct
-
-
-def map_to_output_names(y_pred, output_names, struct):
-  """Maps a dict to a list using `output_names` as keys.
-
-  This is a convenience feature only. When a `Model`'s outputs
-  are a list, you can specify per-output losses and metrics as
-  a dict, where the keys are the output names. If you specify
-  per-output losses and metrics via the same structure as the
-  `Model`'s outputs (recommended), no mapping is performed.
-
-  For the Functional API, the output names are the names of the
-  last layer of each output. For the Subclass API, the output names
-  are determined by `create_pseudo_output_names` (For example:
-  `['output_1', 'output_2']` for a list of outputs).
-
-  This mapping preserves backwards compatibility for `compile` and
-  `fit`.
-
-  Args:
-    y_pred: Sample outputs of the Model, to determine if this convenience
-      feature should be applied (`struct` is returned unmodified if `y_pred`
-      isn't a flat list).
-    output_names: List. The names of the outputs of the Model.
-    struct: The structure to map.
-
-  Returns:
-    `struct` mapped to a list in same order as `output_names`.
- """ - single_output = not tf.nest.is_nested(y_pred) - outputs_are_flat_list = (not single_output and - isinstance(y_pred, (list, tuple)) and - not any(tf.nest.is_nested(y_p) for y_p in y_pred)) - - if (single_output or outputs_are_flat_list) and isinstance(struct, dict): - output_names = output_names or create_pseudo_output_names(y_pred) +def map_missing_dict_keys(y_pred, struct): + """Replaces missing dict keys in `struct` with `None` placeholders.""" + if not isinstance(y_pred, dict) or not isinstance(struct, dict): + return struct struct = copy.copy(struct) - new_struct = [struct.pop(name, None) for name in output_names] - if struct: - raise ValueError( - 'Found unexpected losses or metrics that do not correspond ' - f'to any Model output: {struct.keys()}. ' - f'Valid mode output names: {output_names}. ' - f'Received struct is: {struct}.') - if len(new_struct) == 1: - return new_struct[0] - return new_struct - else: + for k in y_pred.keys(): + if k not in struct: + struct[k] = None return struct -def map_missing_dict_keys(y_pred, struct): - """Replaces missing dict keys in `struct` with `None` placeholders.""" - if not isinstance(y_pred, dict) or not isinstance(struct, dict): - return struct - struct = copy.copy(struct) - for k in y_pred.keys(): - if k not in struct: - struct[k] = None - return struct +def match_dtype_and_rank(y_t, y_p, sw): + """Match dtype and rank of predictions.""" + if y_t.shape.rank == 1 and y_p.shape.rank == 2: + y_t = tf.expand_dims(y_t, axis=-1) + if sw is not None: + if sw.shape.rank == 1 and y_p.shape.rank == 2: + sw = tf.expand_dims(sw, axis=-1) + # Dtype. + # This is required mainly for custom loss functions which do not take care + # casting dtypes. + if (y_t.dtype.is_floating and y_p.dtype.is_floating) or ( + y_t.dtype.is_integer and y_p.dtype.is_integer + ): + y_t = tf.cast(y_t, y_p.dtype) -def match_dtype_and_rank(y_t, y_p, sw): - """Match dtype and rank of predictions.""" - if y_t.shape.rank == 1 and y_p.shape.rank == 2: - y_t = tf.expand_dims(y_t, axis=-1) - if sw is not None: - if sw.shape.rank == 1 and y_p.shape.rank == 2: - sw = tf.expand_dims(sw, axis=-1) - - # Dtype. - # This is required mainly for custom loss functions which do not take care - # casting dtypes. - if ((y_t.dtype.is_floating and y_p.dtype.is_floating) or - (y_t.dtype.is_integer and y_p.dtype.is_integer)): - y_t = tf.cast(y_t, y_p.dtype) - - if sw is not None: - sw = tf.cast(sw, y_p.dtype) - return y_t, y_p, sw - - -def get_mask(y_p): - """Returns Keras mask from tensor.""" - return getattr(y_p, '_keras_mask', None) - - -def apply_mask(y_p, sw, mask): - """Applies any mask on predictions to sample weights.""" - if mask is not None: - mask = tf.cast(mask, y_p.dtype) if sw is not None: - mask, _, sw = ( - losses_utils.squeeze_or_expand_dimensions(mask, sample_weight=sw)) - sw *= mask - else: - sw = mask - return sw + sw = tf.cast(sw, y_p.dtype) + return y_t, y_p, sw def get_custom_object_name(obj): - """Returns the name to use for a custom loss or metric callable. - - Args: - obj: Custom loss of metric callable - - Returns: - Name to use, or `None` if the object was not recognized. - """ - if hasattr(obj, 'name'): # Accept `Loss` instance as `Metric`. - return obj.name - elif hasattr(obj, '__name__'): # Function. - return obj.__name__ - elif hasattr(obj, '__class__'): # Class instance. - return generic_utils.to_snake_case(obj.__class__.__name__) - else: # Unrecognized object. - return None + """Returns the name to use for a custom loss or metric callable. 
+
+    Args:
+      obj: Custom loss or metric callable
+
+    Returns:
+      Name to use, or `None` if the object was not recognized.
+    """
+    if hasattr(obj, "name"):  # Accept `Loss` instance as `Metric`.
+        return obj.name
+    elif hasattr(obj, "__name__"):  # Function.
+        return obj.__name__
+    elif hasattr(obj, "__class__"):  # Class instance.
+        return generic_utils.to_snake_case(obj.__class__.__name__)
+    else:  # Unrecognized object.
+        return None
diff --git a/keras/engine/compile_utils_test.py b/keras/engine/compile_utils_test.py
index e62a0a4bb117..557d6e2b4e23 100644
--- a/keras/engine/compile_utils_test.py
+++ b/keras/engine/compile_utils_test.py
@@ -15,833 +15,874 @@
 """Tests for compile utilities."""
 
 import tensorflow.compat.v2 as tf
+
 from keras import backend
-from keras.testing_infra import test_combinations
 from keras import losses as losses_mod
 from keras import metrics as metrics_mod
 from keras.engine import compile_utils
+from keras.testing_infra import test_combinations
 
 
 class LossesContainerTest(test_combinations.TestCase):
+    def test_single_loss(self):
+        loss_container = compile_utils.LossesContainer("mse")
+        y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5))
+        total_loss = loss_container(y_t, y_p)
-
-  def test_single_loss(self):
-    loss_container = compile_utils.LossesContainer('mse')
-    y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5))
-    total_loss = loss_container(y_t, y_p)
-
-    self.assertTrue(loss_container._built)
-    self.assertLen(loss_container._losses, 1)
-    self.assertIsInstance(total_loss, tf.Tensor)
-    self.assertEqual(total_loss.numpy(), 1.)
-    self.assertLen(loss_container.metrics, 1)
+        self.assertTrue(loss_container._built)
+        self.assertLen(loss_container._losses, 1)
+        self.assertIsInstance(total_loss, tf.Tensor)
+        self.assertEqual(total_loss.numpy(), 1.0)
+        self.assertLen(loss_container.metrics, 1)
 
-    loss_metric = loss_container.metrics[0]
-    self.assertEqual(loss_metric.name, 'loss')
-    self.assertEqual(loss_metric.result().numpy(), 1.)
+        loss_metric = loss_container.metrics[0]
+        self.assertEqual(loss_metric.name, "loss")
+        self.assertEqual(loss_metric.result().numpy(), 1.0)
 
-    loss_container.reset_state()
-    self.assertEqual(loss_metric.result().numpy(), 0.)
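+        # reset_state() zeroes every `Metric` tracked by the container, so
+        # the aggregated loss metric reads 0.0 again after this call.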
+ loss_container.reset_state() + self.assertEqual(loss_metric.result().numpy(), 0.0) - def test_loss_list(self): - loss_container = compile_utils.LossesContainer(['mse', 'mae'], [1, 0.5]) + def test_loss_list(self): + loss_container = compile_utils.LossesContainer(["mse", "mae"], [1, 0.5]) - y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] - y_p = [tf.ones((10, 1)), tf.ones((10, 1))] - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] + y_p = [tf.ones((10, 1)), tf.ones((10, 1))] + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - total_loss = loss_container(y_t, y_p, sample_weight=sw) + total_loss = loss_container(y_t, y_p, sample_weight=sw) - self.assertEqual(loss_container._output_names, ['output_1', 'output_2']) + self.assertEqual(loss_container._output_names, ["output_1", "output_2"]) - self.assertLen(loss_container._losses, 2) - self.assertEqual(total_loss.numpy(), 0.25) + self.assertLen(loss_container._losses, 2) + self.assertEqual(total_loss.numpy(), 0.25) - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertEqual(loss_metric.result().numpy(), 0.25) + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertEqual(loss_metric.result().numpy(), 0.25) - output_1_metric = loss_container.metrics[1] - self.assertEqual(output_1_metric.name, 'output_1_loss') - self.assertEqual(output_1_metric.result().numpy(), 0) - - output_2_metric = loss_container.metrics[2] - self.assertEqual(output_2_metric.name, 'output_2_loss') - self.assertEqual(output_2_metric.result().numpy(), 0.5) - - loss_container.reset_state() - self.assertEqual(loss_metric.result().numpy(), 0) - self.assertEqual(output_1_metric.result().numpy(), 0) - self.assertEqual(output_2_metric.result().numpy(), 0) - - def test_loss_dict(self): - loss_container = compile_utils.LossesContainer( - { - 'out1': 'mse', - 'out2': 'mae' - }, { - 'out1': 1, - 'out2': 0.5 - }) - - y_t = {'out1': tf.ones((10, 1)), 'out2': tf.zeros((10, 1))} - y_p = {'out1': tf.ones((10, 1)), 'out2': tf.ones((10, 1))} - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - - self.assertLen(loss_container._losses, 2) - self.assertIsInstance(total_loss, tf.Tensor) - self.assertEqual(total_loss.numpy(), 0.25) - self.assertLen(loss_container.metrics, 3) - - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertEqual(loss_metric.result().numpy(), 0.25) - - out1_metric = loss_container.metrics[1] - self.assertEqual(out1_metric.name, 'out1_loss') - self.assertEqual(out1_metric.result().numpy(), 0) - - out2_metric = loss_container.metrics[2] - self.assertEqual(out2_metric.name, 'out2_loss') - self.assertEqual(out2_metric.result().numpy(), 0.5) - - loss_container.reset_state() - self.assertEqual(loss_metric.result().numpy(), 0) - self.assertEqual(out1_metric.result().numpy(), 0) - self.assertEqual(out2_metric.result().numpy(), 0) - - def test_loss_partial_dict_with_output_names(self): - loss_container = compile_utils.LossesContainer( - {'out2': 'mae'}, {'out2': 1.}, output_names=['out1', 'out2']) - - y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] - y_p = [tf.ones((10, 1)), tf.ones((10, 1))] - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - - self.assertEqual(total_loss.numpy(), 0.5) - self.assertLen(loss_container.metrics, 2) - - loss_metric = 
loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertEqual(loss_metric.result().numpy(), 0.5) - - out2_metric = loss_container.metrics[1] - self.assertEqual(out2_metric.name, 'out2_loss') - self.assertEqual(out2_metric.result().numpy(), 0.5) - - def test_loss_dict_with_nones(self): - loss_container = compile_utils.LossesContainer({ - 'out1': None, - 'out2': 'mae' - }) - - y_t = {'out1': tf.ones((10, 1)), 'out2': tf.zeros((10, 1))} - y_p = {'out1': tf.ones((10, 1)), 'out2': tf.ones((10, 1))} - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - - self.assertIsInstance(total_loss, tf.Tensor) - self.assertEqual(total_loss.numpy(), 0.5) - self.assertLen(loss_container.metrics, 2) - - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertEqual(loss_metric.result().numpy(), 0.5) - - out2_metric = loss_container.metrics[1] - self.assertEqual(out2_metric.name, 'out2_loss') - self.assertEqual(out2_metric.result().numpy(), 0.5) - - def test_nested_structure(self): - loss_container = compile_utils.LossesContainer( - { - 'b': ['mse', None], - 'a': 'mae' - }, loss_weights={ - 'b': [0.5, 0], - 'a': 1 - }) - - y_t = { - 'b': [tf.ones((10, 1)), - tf.zeros((10, 1))], - 'a': tf.zeros((10, 1)) - } - y_p = { - 'b': [tf.zeros((10, 1)), - tf.zeros((10, 1))], - 'a': tf.ones((10, 1)) - } - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - self.assertIsInstance(total_loss, tf.Tensor) - self.assertEqual(total_loss.numpy(), 0.75) - self.assertLen(loss_container.metrics, 3) - - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertEqual(loss_metric.result().numpy(), 0.75) - - a_metric = loss_container.metrics[1] - self.assertEqual(a_metric.name, 'a_loss') - self.assertEqual(a_metric.result().numpy(), 0.5) - - b_1_metric = loss_container.metrics[2] - self.assertEqual(b_1_metric.name, 'b_1_loss') - self.assertEqual(b_1_metric.result().numpy(), 0.5) - - def test_no_input_mutation(self): - loss = {'a': 'mae'} - loss_container = compile_utils.LossesContainer(loss) - - y_t = {'a': tf.zeros((10, 1))} - y_p = {'a': tf.ones((10, 1)), 'b': tf.zeros((10, 1))} - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - self.assertIsInstance(total_loss, tf.Tensor) - self.assertEqual(total_loss.numpy(), 0.5) - self.assertLen(loss, 1) - - def test_broadcast_single_loss(self): - loss_container = compile_utils.LossesContainer('mse') - - y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] - y_p = [tf.ones((10, 1)), tf.ones((10, 1))] - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - self.assertEqual(total_loss.numpy(), 0.5) - self.assertLen(loss_container.metrics, 3) - - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertEqual(loss_metric.result().numpy(), 0.5) - - output_1_metric = loss_container.metrics[1] - self.assertEqual(output_1_metric.name, 'output_1_loss') - self.assertEqual(output_1_metric.result().numpy(), 0.) 
- - output_2_metric = loss_container.metrics[2] - self.assertEqual(output_2_metric.name, 'output_2_loss') - self.assertEqual(output_2_metric.result().numpy(), 0.5) - - def test_missing_label_with_no_loss(self): - # It's ok to exclude a label if that label has no - # losses or metrics associated with it. - loss_container = compile_utils.LossesContainer({ - 'output1': 'mse', - 'output3': 'mae' - }) - - y_p = { - 'output1': tf.convert_to_tensor([[0], [1], [2]]), - 'output2': tf.convert_to_tensor([[3], [4], [5]]), - 'output3': tf.convert_to_tensor([[6], [7], [8]]) - } - y_t = { - 'output1': tf.convert_to_tensor([[1], [2], [3]]), - 'output3': tf.convert_to_tensor([[4], [5], [6]]) - } - - total_loss = loss_container(y_t, y_p) - self.assertEqual(total_loss.numpy(), 3.) - self.assertLen(loss_container.metrics, 3) - - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertEqual(loss_metric.result().numpy(), 3.) - - output_1_metric = loss_container.metrics[1] - self.assertEqual(output_1_metric.name, 'output1_loss') - self.assertEqual(output_1_metric.result().numpy(), 1.) - - output_3_metric = loss_container.metrics[2] - self.assertEqual(output_3_metric.name, 'output3_loss') - self.assertEqual(output_3_metric.result().numpy(), 2.) - - def test_mismatched_dtypes(self): - y_t = tf.constant([1, 9, 2, -5], shape=(2, 2)) - y_p = tf.constant([4, 8, 12, 8], - shape=(2, 2), - dtype=tf.float32) - - def my_mae(labels, preds): - self.assertEqual(labels.dtype, tf.int32) - self.assertEqual(preds.dtype, tf.float32) - labels = tf.cast(labels, preds.dtype) - return backend.mean(tf.abs(preds - labels), axis=-1) - - loss_container = compile_utils.LossesContainer(my_mae) - total_loss = loss_container(y_t, y_p) - self.assertEqual(total_loss.dtype, tf.float32) - - def test_integer_dtypes(self): - y_t = tf.constant([1, 9, 2, -5], shape=(2, 2)) - y_p = tf.constant([4, 8, 12, 8], shape=(2, 2), dtype=tf.int64) - - def my_mae(labels, preds): - self.assertEqual(labels.dtype, tf.int64) - self.assertEqual(preds.dtype, tf.int64) - return backend.mean(tf.abs(preds - labels), axis=-1) - - loss_container = compile_utils.LossesContainer(my_mae) - total_loss = loss_container(y_t, y_p) - self.assertEqual(total_loss.dtype, tf.int64) - - def test_float_dtypes(self): - y_t = tf.constant([1, 9, 2, -5], - shape=(2, 2), - dtype=tf.float32) - y_p = tf.constant([4, 8, 12, 8], - shape=(2, 2), - dtype=tf.float64) - - def my_mae(labels, preds): - self.assertEqual(labels.dtype, tf.float64) - self.assertEqual(preds.dtype, tf.float64) - return backend.mean(tf.abs(preds - labels), axis=-1) - - loss_container = compile_utils.LossesContainer(my_mae) - total_loss = loss_container(y_t, y_p) - self.assertIsInstance(total_loss, tf.Tensor) - self.assertEqual(total_loss.dtype, tf.float64) - - def test_loss_masking(self): - loss_container = compile_utils.LossesContainer('mae') - y_p = tf.constant([[[1], [1]], [[0], [0]]], dtype=tf.float32) - y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) - y_p._keras_mask = tf.constant([[1, 0], [1, 0]], - dtype=tf.float32) - - total_loss = loss_container(y_t, y_p) - self.assertAlmostEqual(total_loss.numpy(), .25) # sum over batch size - - self.assertLen(loss_container.metrics, 1) - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertAlmostEqual(loss_metric.result().numpy(), .25) - - def test_loss_sample_weight(self): - loss_container = compile_utils.LossesContainer('mae') - y_p = tf.constant([[[1], [1]], [[0], [0]]], 
dtype=tf.float32) - y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) - sw = tf.constant([[.2, .3], [.5, 0]], dtype=tf.float32) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - # (0 * .2 + 0 * .3 + 1 * .5 + 1 * 0) / 4 - self.assertAlmostEqual(total_loss.numpy(), .125) - - self.assertLen(loss_container.metrics, 1) - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertAlmostEqual(loss_metric.result().numpy(), .125) - - def test_loss_masking_sample_weight(self): - loss_container = compile_utils.LossesContainer('mae') - y_p = tf.constant([[[1], [1]], [[0], [0]]], dtype=tf.float32) - y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) - sw = tf.constant([[.2, .3], [.5, 0]], dtype=tf.float32) - y_p._keras_mask = tf.constant([[1, 0], [1, 0]], - dtype=tf.float32) - - total_loss = loss_container(y_t, y_p, sample_weight=sw) - # (0 * .2 + 1 * .5) / 4 - self.assertAlmostEqual(total_loss.numpy(), .125) # sum over batch size - - self.assertLen(loss_container.metrics, 1) - loss_metric = loss_container.metrics[0] - self.assertEqual(loss_metric.name, 'loss') - self.assertAlmostEqual(loss_metric.result().numpy(), .125) - - def test_custom_loss_callables(self): - - def custom_loss_fn(y_true, y_pred): - return tf.reduce_sum(y_true - y_pred) - - class CustomLossClass: - - def __call__(self, y_true, y_pred): - return tf.reduce_sum(y_true - y_pred) - - loss_container = compile_utils.LossesContainer( - [custom_loss_fn, CustomLossClass()]) - y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) - loss_container(y_t, y_p) - - self.assertEqual(loss_container._losses[0].name, 'custom_loss_fn') - self.assertEqual(loss_container._losses[1].name, 'custom_loss_class') - - def test_ragged_tensor_output(self): - """Ensure that ragged tensors can be passed as targets and predictions.""" - - def custom_loss_fn(y_true, y_pred): - """MSE supports RaggedTensors directly.""" - return losses_mod.mse(y_true, y_pred) - - class CustomLossClass(losses_mod.Loss): - """User defined loss function must implement RaggedTensor support.""" - - def call(self, y_true, y_pred): - losses = tf.ragged.map_flat_values( - tf.math.squared_difference, y_true, y_pred) - return tf.reduce_mean(losses) - - loss_container = compile_utils.LossesContainer( - [custom_loss_fn, CustomLossClass()]) - - v_t = tf.constant([[3., 4.], [1., 2.], [3., 5.]]) - v_p = tf.constant([[3.1, 4.], [1., 2.], [3., 5.]]) - - y_t = tf.expand_dims( - tf.RaggedTensor.from_row_splits(v_t, [0, 2, 3]), 0) - y_p = tf.expand_dims( - tf.RaggedTensor.from_row_splits(v_p, [0, 2, 3]), 0) - total_loss = loss_container(y_t, y_p) - - self.assertIsInstance(total_loss, tf.Tensor) - self.assertEqual(loss_container._losses[0].name, 'custom_loss_fn') + output_1_metric = loss_container.metrics[1] + self.assertEqual(output_1_metric.name, "output_1_loss") + self.assertEqual(output_1_metric.result().numpy(), 0) + + output_2_metric = loss_container.metrics[2] + self.assertEqual(output_2_metric.name, "output_2_loss") + self.assertEqual(output_2_metric.result().numpy(), 0.5) + + loss_container.reset_state() + self.assertEqual(loss_metric.result().numpy(), 0) + self.assertEqual(output_1_metric.result().numpy(), 0) + self.assertEqual(output_2_metric.result().numpy(), 0) + + def test_loss_dict(self): + loss_container = compile_utils.LossesContainer( + {"out1": "mse", "out2": "mae"}, {"out1": 1, "out2": 0.5} + ) + + y_t = {"out1": tf.ones((10, 1)), "out2": tf.zeros((10, 1))} + y_p = {"out1": tf.ones((10, 1)), "out2": tf.ones((10, 1))} + 
sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + + self.assertLen(loss_container._losses, 2) + self.assertIsInstance(total_loss, tf.Tensor) + self.assertEqual(total_loss.numpy(), 0.25) + self.assertLen(loss_container.metrics, 3) + + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertEqual(loss_metric.result().numpy(), 0.25) + + out1_metric = loss_container.metrics[1] + self.assertEqual(out1_metric.name, "out1_loss") + self.assertEqual(out1_metric.result().numpy(), 0) + + out2_metric = loss_container.metrics[2] + self.assertEqual(out2_metric.name, "out2_loss") + self.assertEqual(out2_metric.result().numpy(), 0.5) + + loss_container.reset_state() + self.assertEqual(loss_metric.result().numpy(), 0) + self.assertEqual(out1_metric.result().numpy(), 0) + self.assertEqual(out2_metric.result().numpy(), 0) + + def test_loss_partial_dict_with_output_names(self): + loss_container = compile_utils.LossesContainer( + {"out2": "mae"}, {"out2": 1.0}, output_names=["out1", "out2"] + ) + + y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] + y_p = [tf.ones((10, 1)), tf.ones((10, 1))] + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + + self.assertEqual(total_loss.numpy(), 0.5) + self.assertLen(loss_container.metrics, 2) + + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertEqual(loss_metric.result().numpy(), 0.5) + + out2_metric = loss_container.metrics[1] + self.assertEqual(out2_metric.name, "out2_loss") + self.assertEqual(out2_metric.result().numpy(), 0.5) + + def test_loss_dict_with_nones(self): + loss_container = compile_utils.LossesContainer( + {"out1": None, "out2": "mae"} + ) + + y_t = {"out1": tf.ones((10, 1)), "out2": tf.zeros((10, 1))} + y_p = {"out1": tf.ones((10, 1)), "out2": tf.ones((10, 1))} + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + + self.assertIsInstance(total_loss, tf.Tensor) + self.assertEqual(total_loss.numpy(), 0.5) + self.assertLen(loss_container.metrics, 2) + + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertEqual(loss_metric.result().numpy(), 0.5) + + out2_metric = loss_container.metrics[1] + self.assertEqual(out2_metric.name, "out2_loss") + self.assertEqual(out2_metric.result().numpy(), 0.5) + + def test_nested_structure(self): + loss_container = compile_utils.LossesContainer( + {"b": ["mse", None], "a": "mae"}, + loss_weights={"b": [0.5, 0], "a": 1}, + ) + + y_t = { + "b": [tf.ones((10, 1)), tf.zeros((10, 1))], + "a": tf.zeros((10, 1)), + } + y_p = { + "b": [tf.zeros((10, 1)), tf.zeros((10, 1))], + "a": tf.ones((10, 1)), + } + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + self.assertIsInstance(total_loss, tf.Tensor) + self.assertEqual(total_loss.numpy(), 0.75) + self.assertLen(loss_container.metrics, 3) + + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertEqual(loss_metric.result().numpy(), 0.75) + + a_metric = loss_container.metrics[1] + self.assertEqual(a_metric.name, "a_loss") + self.assertEqual(a_metric.result().numpy(), 0.5) + + b_1_metric = loss_container.metrics[2] + self.assertEqual(b_1_metric.name, "b_1_loss") + self.assertEqual(b_1_metric.result().numpy(), 0.5) + + def test_no_input_mutation(self): 
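+        # `LossesContainer` must copy the user-supplied dict rather than
+        # mutate it, even though `y_p` carries an extra key ("b") that the
+        # loss dict never references.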
+ loss = {"a": "mae"} + loss_container = compile_utils.LossesContainer(loss) + + y_t = {"a": tf.zeros((10, 1))} + y_p = {"a": tf.ones((10, 1)), "b": tf.zeros((10, 1))} + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + self.assertIsInstance(total_loss, tf.Tensor) + self.assertEqual(total_loss.numpy(), 0.5) + self.assertLen(loss, 1) + + def test_broadcast_single_loss(self): + loss_container = compile_utils.LossesContainer("mse") + + y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] + y_p = [tf.ones((10, 1)), tf.ones((10, 1))] + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + self.assertEqual(total_loss.numpy(), 0.5) + self.assertLen(loss_container.metrics, 3) + + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertEqual(loss_metric.result().numpy(), 0.5) + + output_1_metric = loss_container.metrics[1] + self.assertEqual(output_1_metric.name, "output_1_loss") + self.assertEqual(output_1_metric.result().numpy(), 0.0) + + output_2_metric = loss_container.metrics[2] + self.assertEqual(output_2_metric.name, "output_2_loss") + self.assertEqual(output_2_metric.result().numpy(), 0.5) + + def test_missing_label_with_no_loss(self): + # It's ok to exclude a label if that label has no + # losses or metrics associated with it. + loss_container = compile_utils.LossesContainer( + {"output1": "mse", "output3": "mae"} + ) + + y_p = { + "output1": tf.convert_to_tensor([[0], [1], [2]]), + "output2": tf.convert_to_tensor([[3], [4], [5]]), + "output3": tf.convert_to_tensor([[6], [7], [8]]), + } + y_t = { + "output1": tf.convert_to_tensor([[1], [2], [3]]), + "output3": tf.convert_to_tensor([[4], [5], [6]]), + } + + total_loss = loss_container(y_t, y_p) + self.assertEqual(total_loss.numpy(), 3.0) + self.assertLen(loss_container.metrics, 3) + + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertEqual(loss_metric.result().numpy(), 3.0) + + output_1_metric = loss_container.metrics[1] + self.assertEqual(output_1_metric.name, "output1_loss") + self.assertEqual(output_1_metric.result().numpy(), 1.0) + + output_3_metric = loss_container.metrics[2] + self.assertEqual(output_3_metric.name, "output3_loss") + self.assertEqual(output_3_metric.result().numpy(), 2.0) + + def test_mismatched_dtypes(self): + y_t = tf.constant([1, 9, 2, -5], shape=(2, 2)) + y_p = tf.constant([4, 8, 12, 8], shape=(2, 2), dtype=tf.float32) + + def my_mae(labels, preds): + self.assertEqual(labels.dtype, tf.int32) + self.assertEqual(preds.dtype, tf.float32) + labels = tf.cast(labels, preds.dtype) + return backend.mean(tf.abs(preds - labels), axis=-1) + + loss_container = compile_utils.LossesContainer(my_mae) + total_loss = loss_container(y_t, y_p) + self.assertEqual(total_loss.dtype, tf.float32) + + def test_integer_dtypes(self): + y_t = tf.constant([1, 9, 2, -5], shape=(2, 2)) + y_p = tf.constant([4, 8, 12, 8], shape=(2, 2), dtype=tf.int64) + + def my_mae(labels, preds): + self.assertEqual(labels.dtype, tf.int64) + self.assertEqual(preds.dtype, tf.int64) + return backend.mean(tf.abs(preds - labels), axis=-1) + + loss_container = compile_utils.LossesContainer(my_mae) + total_loss = loss_container(y_t, y_p) + self.assertEqual(total_loss.dtype, tf.int64) + + def test_float_dtypes(self): + y_t = tf.constant([1, 9, 2, -5], shape=(2, 2), dtype=tf.float32) + y_p = tf.constant([4, 8, 12, 8], shape=(2, 2), dtype=tf.float64) + + def 
my_mae(labels, preds):
+            self.assertEqual(labels.dtype, tf.float64)
+            self.assertEqual(preds.dtype, tf.float64)
+            return backend.mean(tf.abs(preds - labels), axis=-1)
+
+        loss_container = compile_utils.LossesContainer(my_mae)
+        total_loss = loss_container(y_t, y_p)
+        self.assertIsInstance(total_loss, tf.Tensor)
+        self.assertEqual(total_loss.dtype, tf.float64)
+
+    @test_combinations.generate(
+        test_combinations.combine(
+            input_type=["dense", "masked", "ragged"],
+            reduction=["auto", "sum"],
+            use_sample_weights=[True, False],
+        ),
+    )
+    def test_loss_consistency(self, input_type, reduction, use_sample_weights):
+        y_p = tf.ragged.constant(
+            [[[1], [1], [1]], [[1], [1]]], dtype=tf.float32
+        )
+        y_t = tf.ragged.constant(
+            [[[1], [0], [0]], [[1], [1]]], dtype=tf.float32
+        )
+
+        if input_type == "masked":
+            mask = tf.ones_like(y_p).to_tensor()
+            y_p = y_p.to_tensor()
+            y_t = y_t.to_tensor()
+            y_p._keras_mask = mask
+        elif input_type == "dense":
+            y_p = y_p.to_tensor()
+            y_t = y_t.to_tensor()
+
+        if input_type == "dense":
+            count = 6
+        else:
+            count = 5
+
+        if use_sample_weights:
+            wrong = 4
+            maybe_sample_weight = {
+                "sample_weight": tf.constant([[2], [1]], dtype=tf.float32)
+            }
+        else:
+            wrong = 2
+            maybe_sample_weight = {}
+
+        expected = wrong
+        if reduction != "sum":
+            expected /= count
+
+        loss_obj = losses_mod.MeanAbsoluteError(reduction=reduction)
+
+        result = loss_obj(y_t, y_p, **maybe_sample_weight)
+        self.assertAlmostEqual(result.numpy(), expected)
+
+        container = compile_utils.LossesContainer(loss_obj)
+        container_result = container(y_t, y_p, **maybe_sample_weight)
+        self.assertAlmostEqual(container_result.numpy(), expected)
+
+    def test_loss_masking(self):
+        loss_container = compile_utils.LossesContainer("mae")
+        y_p = tf.constant([[[1], [1]], [[0], [0]]], dtype=tf.float32)
+        y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32)
+        # The reduction is "sum_over_batch_size", where "batch size" means
+        # the number of elements being summed, not the literal batch size:
+        # here, the number of valid (unmasked) elements. Since the mask keeps
+        # two valid items, the divisor is 2.
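+        # Masked MAE: the unmasked errors are |1 - 1| = 0 and |1 - 0| = 1,
+        # so the total loss is 1 / 2 = 0.5.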
+ y_p._keras_mask = tf.constant([[1, 0], [1, 0]], dtype=tf.float32) + + total_loss = loss_container(y_t, y_p) + self.assertAlmostEqual(total_loss.numpy(), 0.5) # sum over num valid + + self.assertLen(loss_container.metrics, 1) + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertAlmostEqual(loss_metric.result().numpy(), 0.5) + + def test_loss_sample_weight(self): + loss_container = compile_utils.LossesContainer("mae") + y_p = tf.constant([[[1], [1]], [[0], [0]]], dtype=tf.float32) + y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) + sw = tf.constant([[0.2, 0.3], [0.5, 0]], dtype=tf.float32) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + # (0 * .2 + 0 * .3 + 1 * .5 + 1 * 0) / 4 + self.assertAlmostEqual(total_loss.numpy(), 0.125) + + self.assertLen(loss_container.metrics, 1) + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertAlmostEqual(loss_metric.result().numpy(), 0.125) + + def test_loss_masking_sample_weight(self): + loss_container = compile_utils.LossesContainer("mae") + y_p = tf.constant([[[1], [1]], [[0], [0]]], dtype=tf.float32) + y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) + sw = tf.constant([[0.2, 0.3], [0.5, 0]], dtype=tf.float32) + y_p._keras_mask = tf.constant([[1, 0], [1, 0]], dtype=tf.float32) + + total_loss = loss_container(y_t, y_p, sample_weight=sw) + # (0 * .2 + 1 * .5) / 2 + self.assertAlmostEqual(total_loss.numpy(), 0.25) # sum over num valid + + self.assertLen(loss_container.metrics, 1) + loss_metric = loss_container.metrics[0] + self.assertEqual(loss_metric.name, "loss") + self.assertAlmostEqual(loss_metric.result().numpy(), 0.25) + + def test_custom_loss_callables(self): + def custom_loss_fn(y_true, y_pred): + return tf.reduce_sum(y_true - y_pred) + + class CustomLossClass: + def __call__(self, y_true, y_pred): + return tf.reduce_sum(y_true - y_pred) + + loss_container = compile_utils.LossesContainer( + [custom_loss_fn, CustomLossClass()] + ) + y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) + loss_container(y_t, y_p) + + self.assertEqual(loss_container._losses[0].name, "custom_loss_fn") + self.assertEqual(loss_container._losses[1].name, "custom_loss_class") + + def test_ragged_tensor_output(self): + """Ensure ragged tensors can be passed as targets and predictions.""" + + def custom_loss_fn(y_true, y_pred): + """MSE supports RaggedTensors directly.""" + return losses_mod.mse(y_true, y_pred) + + class CustomLossClass(losses_mod.Loss): + """User defined loss func must implement RaggedTensor support.""" + + def call(self, y_true, y_pred): + losses = tf.ragged.map_flat_values( + tf.math.squared_difference, y_true, y_pred + ) + return tf.reduce_mean(losses) + + loss_container = compile_utils.LossesContainer( + [custom_loss_fn, CustomLossClass()] + ) + + v_t = tf.constant([[3.0, 4.0], [1.0, 2.0], [3.0, 5.0]]) + v_p = tf.constant([[3.1, 4.0], [1.0, 2.0], [3.0, 5.0]]) + + y_t = tf.expand_dims(tf.RaggedTensor.from_row_splits(v_t, [0, 2, 3]), 0) + y_p = tf.expand_dims(tf.RaggedTensor.from_row_splits(v_p, [0, 2, 3]), 0) + total_loss = loss_container(y_t, y_p) + + self.assertIsInstance(total_loss, tf.Tensor) + self.assertEqual(loss_container._losses[0].name, "custom_loss_fn") class MetricsContainerTest(test_combinations.TestCase): - - def test_single_metric(self): - metric_container = compile_utils.MetricsContainer('mse') - y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) - metric_container.update_state(y_t, y_p) - - 
self.assertLen(metric_container.metrics, 1) - metric = metric_container.metrics[0] - self.assertEqual(metric.name, 'mse') - self.assertEqual(metric.result().numpy(), 1.) - - metric_container.reset_state() - self.assertEqual(metric.result().numpy(), 0.) - - def test_list_of_metrics_one_output(self): - metric_container = compile_utils.MetricsContainer(['mse', 'mae']) - y_t, y_p = 2 * tf.ones((10, 5)), tf.zeros((10, 5)) - metric_container.update_state(y_t, y_p) - self.assertLen(metric_container.metrics, 2) - - mse_metric = metric_container.metrics[0] - self.assertEqual(mse_metric.name, 'mse') - self.assertEqual(mse_metric.result().numpy(), 4.) - - mae_metric = metric_container.metrics[1] - self.assertEqual(mae_metric.name, 'mae') - self.assertEqual(mae_metric.result().numpy(), 2.) - - metric_container.reset_state() - self.assertEqual(mse_metric.result().numpy(), 0.) - self.assertEqual(mae_metric.result().numpy(), 0.) - - def test_list_of_metrics_list_of_outputs(self): - metric_container = compile_utils.MetricsContainer( - metrics=['mse', 'mae'], # Should broadcast to both outputs. - weighted_metrics=['accuracy']) # Should broadcast to both outputs. - - y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] - y_p = [tf.ones((10, 1)), 2 * tf.ones((10, 1))] - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - metric_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metric_container.metrics, 6) - - mse_metric = metric_container.metrics[0] - self.assertEqual(mse_metric.name, 'output_1_mse') - self.assertEqual(mse_metric.result().numpy(), 0.) - - mse_metric = metric_container.metrics[1] - self.assertEqual(mse_metric.name, 'output_1_mae') - self.assertEqual(mse_metric.result().numpy(), 0.) - - acc_metric_1 = metric_container.metrics[2] - self.assertEqual(acc_metric_1.name, 'output_1_accuracy') - self.assertEqual(acc_metric_1.result().numpy(), 1.) - self.assertEqual(acc_metric_1._fn, metrics_mod.binary_accuracy) - - mae_metric = metric_container.metrics[3] - self.assertEqual(mae_metric.name, 'output_2_mse') - self.assertEqual(mae_metric.result().numpy(), 4.) - - mae_metric = metric_container.metrics[4] - self.assertEqual(mae_metric.name, 'output_2_mae') - self.assertEqual(mae_metric.result().numpy(), 2.) - - acc_metric_2 = metric_container.metrics[5] - self.assertEqual(acc_metric_2.name, 'output_2_accuracy') - self.assertEqual(acc_metric_2.result().numpy(), 0.) 
- self.assertEqual(acc_metric_2._fn, metrics_mod.binary_accuracy) - - weighted_metrics = metric_container.weighted_metrics - self.assertLen(weighted_metrics, 2) - self.assertEqual(weighted_metrics[0].name, 'output_1_accuracy') - self.assertEqual(weighted_metrics[1].name, 'output_2_accuracy') - - unweighted_metrics = metric_container.unweighted_metrics - self.assertLen(unweighted_metrics, 4) - self.assertEqual(unweighted_metrics[0].name, 'output_1_mse') - self.assertEqual(unweighted_metrics[1].name, 'output_1_mae') - self.assertEqual(unweighted_metrics[2].name, 'output_2_mse') - self.assertEqual(unweighted_metrics[3].name, 'output_2_mae') - - def test_metric_dict(self): - metric_container = compile_utils.MetricsContainer( - metrics={ - 'out1': 'mse', - 'out2': 'mae' - }, - weighted_metrics={ - 'out1': 'mse', - 'out2': 'mae' - }) - - y_t = {'out1': tf.ones((10, 1)), 'out2': tf.zeros((10, 1))} - y_p = {'out1': tf.ones((10, 1)), 'out2': 2 * tf.ones((10, 1))} - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - metric_container.update_state(y_t, y_p, sample_weight=sw) - - mse_metric = metric_container.metrics[0] - self.assertEqual(mse_metric.name, 'out1_mse') - self.assertEqual(mse_metric.result().numpy(), 0.) - - weighted_mse_metric = metric_container.metrics[1] - self.assertEqual(weighted_mse_metric.name, 'out1_weighted_mse') - self.assertEqual(weighted_mse_metric.result().numpy(), 0.) - - mae_metric = metric_container.metrics[2] - self.assertEqual(mae_metric.name, 'out2_mae') - self.assertEqual(mae_metric.result().numpy(), 2.) - - weighted_mae_metric = metric_container.metrics[3] - self.assertEqual(weighted_mae_metric.name, 'out2_weighted_mae') - self.assertEqual(weighted_mae_metric.result().numpy(), 2.) - - metric_container.reset_state() - self.assertEqual(mse_metric.result().numpy(), 0.) - self.assertEqual(weighted_mse_metric.result().numpy(), 0.) - self.assertEqual(mae_metric.result().numpy(), 0.) - self.assertEqual(weighted_mae_metric.result().numpy(), 0.) - - def test_metric_partial_dict_with_output_names(self): - metric_container = compile_utils.MetricsContainer( - {'out2': 'mae'}, output_names=['out1', 'out2']) - - y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] - y_p = [tf.ones((10, 1)), tf.ones((10, 1))] - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - metric_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metric_container.metrics, 1) - - mae_metric = metric_container.metrics[0] - self.assertEqual(mae_metric.name, 'out2_mae') - self.assertEqual(mae_metric.result().numpy(), 1.) - - def test_metric_partial_dict_with_nones(self): - metric_container = compile_utils.MetricsContainer({ - 'out1': None, - 'out2': 'mae' - }) - - y_t = {'out1': tf.ones((10, 1)), 'out2': tf.zeros((10, 1))} - y_p = {'out1': tf.ones((10, 1)), 'out2': tf.ones((10, 1))} - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - metric_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metric_container.metrics, 1) - - mae_metric = metric_container.metrics[0] - self.assertEqual(mae_metric.name, 'out2_mae') - self.assertEqual(mae_metric.result().numpy(), 1.) 
- - def test_nested_structure(self): - metric_container = compile_utils.MetricsContainer( - metrics={ - 'b': ['mse', None], - 'a': 'mae' - }, - weighted_metrics={ - 'b': [None, None], - 'a': 'mse' - }) - - y_t = { - 'b': [2 * tf.ones((10, 1)), - tf.zeros((10, 1))], - 'a': tf.zeros((10, 1)) - } - y_p = { - 'b': [tf.zeros((10, 1)), - tf.zeros((10, 1))], - 'a': tf.ones((10, 1)) - } - sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - - metric_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metric_container.metrics, 3) - - a_mae_metric = metric_container.metrics[0] - self.assertEqual(a_mae_metric.name, 'a_mae') - self.assertEqual(a_mae_metric.result().numpy(), 1.) - - weighted_a_mae_metric = metric_container.metrics[1] - self.assertEqual(weighted_a_mae_metric.name, 'a_mse') - self.assertEqual(weighted_a_mae_metric.result().numpy(), 1.) - - b_1_mse_metric = metric_container.metrics[2] - self.assertEqual(b_1_mse_metric.name, 'b_1_mse') - self.assertEqual(b_1_mse_metric.result().numpy(), 4.) - - def test_no_input_mutation(self): - metric = {'a': 'mae'} - metric_container = compile_utils.MetricsContainer(metric) - - y_t = {'a': tf.zeros((10, 1))} - y_p = {'a': tf.ones((10, 1)), 'b': tf.zeros((10, 1))} - - metric_container.update_state(y_t, y_p) - self.assertLen(metric, 1) - mae_metric = metric_container.metrics[0] - self.assertEqual(mae_metric.result().numpy(), 1.) - - def test_crossentropy(self): - metric_container = compile_utils.MetricsContainer('crossentropy') - y_t, y_p = tf.ones((10, 1)), tf.ones((10, 1)) - metric_container.update_state(y_t, y_p) - self.assertEqual(metric_container.metrics[0]._fn, - metrics_mod.binary_crossentropy) - - metric_container = compile_utils.MetricsContainer('crossentropy') - y_t, y_p = tf.ones((10, 1)), tf.ones((10, 20)) - self.assertEqual(y_p.shape.as_list()[-1], 20) - metric_container.update_state(y_t, y_p) - self.assertEqual(metric_container.metrics[0]._fn, - metrics_mod.sparse_categorical_crossentropy) - - metric_container = compile_utils.MetricsContainer('crossentropy') - y_t, y_p = tf.ones((10, 20)), tf.ones((10, 20)) - metric_container.update_state(y_t, y_p) - self.assertEqual(metric_container.metrics[0]._fn, - metrics_mod.categorical_crossentropy) - - def test_accuracy(self): - metric_container = compile_utils.MetricsContainer('accuracy') - y_t, y_p = tf.ones((10, 1)), tf.ones((10, 1)) - metric_container.update_state(y_t, y_p) - self.assertEqual(metric_container.metrics[0]._fn, - metrics_mod.binary_accuracy) - - metric_container = compile_utils.MetricsContainer('Accuracy') - y_t, y_p = tf.ones((10, 1)), tf.ones((10, 1)) - metric_container.update_state(y_t, y_p) - self.assertEqual(metric_container.metrics[0]._fn, - metrics_mod.binary_accuracy) - - metric_container = compile_utils.MetricsContainer('accuracy') - y_t, y_p = tf.ones((10, 1)), tf.ones((10, 20)) - self.assertEqual(y_p.shape.as_list()[-1], 20) - metric_container.update_state(y_t, y_p) - self.assertEqual(metric_container.metrics[0]._fn, - metrics_mod.sparse_categorical_accuracy) - - metric_container = compile_utils.MetricsContainer('accuracy') - y_t, y_p = tf.ones((10, 20)), tf.ones((10, 20)) - metric_container.update_state(y_t, y_p) - self.assertEqual(metric_container.metrics[0]._fn, - metrics_mod.categorical_accuracy) - - def test_metric_weighting(self): - metric_container = compile_utils.MetricsContainer( - metrics=['mae'], weighted_metrics=['mae']) - - y_t = tf.convert_to_tensor([[0], [3], [0]]) - y_p = tf.convert_to_tensor([[0], [0], [0]]) - sw = 
tf.convert_to_tensor([[1], [0], [1]]) - - metric_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metric_container.metrics, 2) - - mae_metric = metric_container.metrics[0] - self.assertEqual(mae_metric.name, 'mae') - self.assertEqual(mae_metric.result().numpy(), 1.) - - weighted_mae_metric = metric_container.metrics[1] - self.assertEqual(weighted_mae_metric.name, 'weighted_mae') - self.assertEqual(weighted_mae_metric.result().numpy(), 0.) - - def test_broadcast_metrics_to_dict(self): - metric_container = compile_utils.MetricsContainer(metrics=['mae']) - - y_p = {'output': tf.convert_to_tensor([[0], [1], [2]])} - y_t = {'output': tf.convert_to_tensor([[1], [2], [3]])} - metric_container.update_state(y_t, y_p) - - mae_metric = metric_container.metrics[0] - self.assertEqual(mae_metric.name, 'mae') - self.assertEqual(mae_metric.result().numpy(), 1.) - - def test_broadcast_metrics_to_dict_with_output_names(self): - metric_container = compile_utils.MetricsContainer( - metrics=['mae'], output_names=['output']) - - y_p = tf.convert_to_tensor([[0], [1], [2]]) - y_t = {'output': tf.convert_to_tensor([[1], [2], [3]])} - metric_container.update_state(y_t, y_p) - - mae_metric = metric_container.metrics[0] - self.assertEqual(mae_metric.name, 'mae') - self.assertEqual(mae_metric.result().numpy(), 1.) - - def test_missing_label_with_no_metrics(self): - # It's ok to exclude a label if that label has no - # losses or metrics associated with it. - metric_container = compile_utils.MetricsContainer(metrics={ - 'output1': 'mae', - 'output3': 'mse' - }) - - y_p = { - 'output1': tf.convert_to_tensor([[0], [1], [2]]), - 'output2': tf.convert_to_tensor([[3], [4], [5]]), - 'output3': tf.convert_to_tensor([[6], [7], [8]]) - } - y_t = { - 'output1': tf.convert_to_tensor([[1], [2], [3]]), - 'output3': tf.convert_to_tensor([[4], [5], [6]]) - } - - metric_container.update_state(y_t, y_p) - self.assertLen(metric_container.metrics, 2) - - mae_metric = metric_container.metrics[0] - self.assertEqual(mae_metric.name, 'output1_mae') - self.assertEqual(mae_metric.result().numpy(), 1.) - - mse_metric = metric_container.metrics[1] - self.assertEqual(mse_metric.name, 'output3_mse') - self.assertEqual(mse_metric.result().numpy(), 4.) 
- - def test_metrics_masking(self): - metrics_container = compile_utils.MetricsContainer( - metrics=['mae'], weighted_metrics=['mse']) - y_p = tf.constant([[[1], [1]], [[0], [0]]], dtype=tf.float32) - y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) - y_p._keras_mask = tf.constant([[1, 1], [0, 0]], - dtype=tf.float32) - - metrics_container.update_state(y_t, y_p) - self.assertLen(metrics_container.metrics, 2) - - mae_metric = metrics_container.metrics[0] - self.assertEqual(mae_metric.name, 'mae') - self.assertAlmostEqual(mae_metric.result().numpy(), 0) - - weighted_mae_metric = metrics_container.metrics[1] - self.assertEqual(weighted_mae_metric.name, 'mse') - self.assertAlmostEqual(weighted_mae_metric.result().numpy(), 0) - - def test_metrics_sample_weight(self): - metrics_container = compile_utils.MetricsContainer( - metrics=['mae'], weighted_metrics=['mse']) - y_p = tf.constant([[[1], [1]], [[0], [1]]], dtype=tf.float32) - y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) - sw = tf.constant([[.2, .3], [.5, 0]], dtype=tf.float32) - - metrics_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metrics_container.metrics, 2) - - mae_metric = metrics_container.metrics[0] - self.assertEqual(mae_metric.name, 'mae') - self.assertAlmostEqual(mae_metric.result().numpy(), .25) # 1 / 4 - - weighted_mae_metric = metrics_container.metrics[1] - self.assertEqual(weighted_mae_metric.name, 'mse') - self.assertAlmostEqual(weighted_mae_metric.result().numpy(), .5) # .5 / 1 - - def test_metrics_masking_sample_weight(self): - metrics_container = compile_utils.MetricsContainer( - metrics=['mae'], weighted_metrics=['mse']) - y_p = tf.constant([[[1], [1]], [[0], [1]]], dtype=tf.float32) - y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) - sw = tf.constant([[.3, .2], [.2, .3]], dtype=tf.float32) - y_p._keras_mask = tf.constant([[1, 0], [1, 0]], - dtype=tf.float32) - - metrics_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metrics_container.metrics, 2) - - mae_metric = metrics_container.metrics[0] - self.assertEqual(mae_metric.name, 'mae') - self.assertAlmostEqual(mae_metric.result().numpy(), .5) # 1 / .5 - - weighted_mae_metric = metrics_container.metrics[1] - self.assertEqual(weighted_mae_metric.name, 'mse') - self.assertAlmostEqual(weighted_mae_metric.result().numpy(), .2 / .5) - - def test_loss_class_as_metric_with_distribution(self): - distribution = tf.distribute.OneDeviceStrategy('/device:CPU:0') - with distribution.scope(): - metric_container = compile_utils.MetricsContainer( - losses_mod.MeanSquaredError()) - y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) - metric_container.update_state(y_t, y_p) - - self.assertLen(metric_container.metrics, 1) - metric = metric_container.metrics[0] - self.assertEqual(metric.name, 'mean_squared_error') - self.assertEqual(metric.result().numpy(), 1.) 
- - def test_custom_metric_callables(self): - - def custom_metric_fn(y_true, y_pred): - return tf.reduce_sum(y_true - y_pred) - - class CustomMetricClass: - - def __call__(self, y_true, y_pred): - return tf.reduce_sum(y_true - y_pred) - - metric_container = compile_utils.MetricsContainer( - [custom_metric_fn, CustomMetricClass()]) - y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) - metric_container.update_state(y_t, y_p) - - self.assertEqual(metric_container.metrics[0].name, 'custom_metric_fn') - self.assertEqual(metric_container.metrics[1].name, 'custom_metric_class') - - def test_reset_state_existing_metric_before_built(self): - metric = metrics_mod.Mean() - metric.update_state([2.0, 4.0]) - self.assertEqual(metric.result().numpy(), 3.0) - - metric_container = compile_utils.MetricsContainer(metric) - metric_container.reset_state() - self.assertEqual(metric.result().numpy(), 0.0) - - def test_duplicated_metric_instance(self): - mean_obj = metrics_mod.Mean() - metric = mean_obj - with self.assertRaisesRegex(ValueError, 'Found duplicated metrics'): - compile_utils.MetricsContainer(metrics=metric, weighted_metrics=metric) - - # duplicated string should be fine - metric = 'acc' - compile_utils.MetricsContainer(metrics=metric, weighted_metrics=metric) - - # complicated structure - metric = [mean_obj, 'acc'] - weighted_metric = {'output1': mean_obj, 'output2': 'acc'} - with self.assertRaisesRegex(ValueError, 'Found duplicated metrics'): - compile_utils.MetricsContainer( - metrics=metric, weighted_metrics=weighted_metric) - - -if __name__ == '__main__': - tf.compat.v1.enable_eager_execution() - tf.test.main() + def test_single_metric(self): + metric_container = compile_utils.MetricsContainer("mse") + y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) + metric_container.update_state(y_t, y_p) + + self.assertLen(metric_container.metrics, 1) + metric = metric_container.metrics[0] + self.assertEqual(metric.name, "mse") + self.assertEqual(metric.result().numpy(), 1.0) + + metric_container.reset_state() + self.assertEqual(metric.result().numpy(), 0.0) + + def test_list_of_metrics_one_output(self): + metric_container = compile_utils.MetricsContainer(["mse", "mae"]) + y_t, y_p = 2 * tf.ones((10, 5)), tf.zeros((10, 5)) + metric_container.update_state(y_t, y_p) + self.assertLen(metric_container.metrics, 2) + + mse_metric = metric_container.metrics[0] + self.assertEqual(mse_metric.name, "mse") + self.assertEqual(mse_metric.result().numpy(), 4.0) + + mae_metric = metric_container.metrics[1] + self.assertEqual(mae_metric.name, "mae") + self.assertEqual(mae_metric.result().numpy(), 2.0) + + metric_container.reset_state() + self.assertEqual(mse_metric.result().numpy(), 0.0) + self.assertEqual(mae_metric.result().numpy(), 0.0) + + def test_list_of_metrics_list_of_outputs(self): + metric_container = compile_utils.MetricsContainer( + metrics=["mse", "mae"], # Should broadcast to both outputs. + weighted_metrics=["accuracy"], + ) # Should broadcast to both outputs. 
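+        # Each entry in a flat metric list is applied to every output; the
+        # per-output copies are disambiguated as "output_<N>_<metric>".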
+ + y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] + y_p = [tf.ones((10, 1)), 2 * tf.ones((10, 1))] + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + metric_container.update_state(y_t, y_p, sample_weight=sw) + self.assertLen(metric_container.metrics, 6) + + mse_metric = metric_container.metrics[0] + self.assertEqual(mse_metric.name, "output_1_mse") + self.assertEqual(mse_metric.result().numpy(), 0.0) + + mse_metric = metric_container.metrics[1] + self.assertEqual(mse_metric.name, "output_1_mae") + self.assertEqual(mse_metric.result().numpy(), 0.0) + + acc_metric_1 = metric_container.metrics[2] + self.assertEqual(acc_metric_1.name, "output_1_accuracy") + self.assertEqual(acc_metric_1.result().numpy(), 1.0) + self.assertEqual(acc_metric_1._fn, metrics_mod.binary_accuracy) + + mae_metric = metric_container.metrics[3] + self.assertEqual(mae_metric.name, "output_2_mse") + self.assertEqual(mae_metric.result().numpy(), 4.0) + + mae_metric = metric_container.metrics[4] + self.assertEqual(mae_metric.name, "output_2_mae") + self.assertEqual(mae_metric.result().numpy(), 2.0) + + acc_metric_2 = metric_container.metrics[5] + self.assertEqual(acc_metric_2.name, "output_2_accuracy") + self.assertEqual(acc_metric_2.result().numpy(), 0.0) + self.assertEqual(acc_metric_2._fn, metrics_mod.binary_accuracy) + + weighted_metrics = metric_container.weighted_metrics + self.assertLen(weighted_metrics, 2) + self.assertEqual(weighted_metrics[0].name, "output_1_accuracy") + self.assertEqual(weighted_metrics[1].name, "output_2_accuracy") + + unweighted_metrics = metric_container.unweighted_metrics + self.assertLen(unweighted_metrics, 4) + self.assertEqual(unweighted_metrics[0].name, "output_1_mse") + self.assertEqual(unweighted_metrics[1].name, "output_1_mae") + self.assertEqual(unweighted_metrics[2].name, "output_2_mse") + self.assertEqual(unweighted_metrics[3].name, "output_2_mae") + + def test_metric_dict(self): + metric_container = compile_utils.MetricsContainer( + metrics={"out1": "mse", "out2": "mae"}, + weighted_metrics={"out1": "mse", "out2": "mae"}, + ) + + y_t = {"out1": tf.ones((10, 1)), "out2": tf.zeros((10, 1))} + y_p = {"out1": tf.ones((10, 1)), "out2": 2 * tf.ones((10, 1))} + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + metric_container.update_state(y_t, y_p, sample_weight=sw) + + mse_metric = metric_container.metrics[0] + self.assertEqual(mse_metric.name, "out1_mse") + self.assertEqual(mse_metric.result().numpy(), 0.0) + + weighted_mse_metric = metric_container.metrics[1] + self.assertEqual(weighted_mse_metric.name, "out1_weighted_mse") + self.assertEqual(weighted_mse_metric.result().numpy(), 0.0) + + mae_metric = metric_container.metrics[2] + self.assertEqual(mae_metric.name, "out2_mae") + self.assertEqual(mae_metric.result().numpy(), 2.0) + + weighted_mae_metric = metric_container.metrics[3] + self.assertEqual(weighted_mae_metric.name, "out2_weighted_mae") + self.assertEqual(weighted_mae_metric.result().numpy(), 2.0) + + metric_container.reset_state() + self.assertEqual(mse_metric.result().numpy(), 0.0) + self.assertEqual(weighted_mse_metric.result().numpy(), 0.0) + self.assertEqual(mae_metric.result().numpy(), 0.0) + self.assertEqual(weighted_mae_metric.result().numpy(), 0.0) + + def test_metric_partial_dict_with_output_names(self): + metric_container = compile_utils.MetricsContainer( + {"out2": "mae"}, output_names=["out1", "out2"] + ) + + y_t = [tf.ones((10, 1)), tf.zeros((10, 1))] + y_p = [tf.ones((10, 1)), tf.ones((10, 1))] + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 
1, 1, 1, 1]) + + metric_container.update_state(y_t, y_p, sample_weight=sw) + self.assertLen(metric_container.metrics, 1) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, "out2_mae") + self.assertEqual(mae_metric.result().numpy(), 1.0) + + def test_metric_partial_dict_with_nones(self): + metric_container = compile_utils.MetricsContainer( + {"out1": None, "out2": "mae"} + ) + + y_t = {"out1": tf.ones((10, 1)), "out2": tf.zeros((10, 1))} + y_p = {"out1": tf.ones((10, 1)), "out2": tf.ones((10, 1))} + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + metric_container.update_state(y_t, y_p, sample_weight=sw) + self.assertLen(metric_container.metrics, 1) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, "out2_mae") + self.assertEqual(mae_metric.result().numpy(), 1.0) + + def test_nested_structure(self): + metric_container = compile_utils.MetricsContainer( + metrics={"b": ["mse", None], "a": "mae"}, + weighted_metrics={"b": [None, None], "a": "mse"}, + ) + + y_t = { + "b": [2 * tf.ones((10, 1)), tf.zeros((10, 1))], + "a": tf.zeros((10, 1)), + } + y_p = { + "b": [tf.zeros((10, 1)), tf.zeros((10, 1))], + "a": tf.ones((10, 1)), + } + sw = tf.convert_to_tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + + metric_container.update_state(y_t, y_p, sample_weight=sw) + self.assertLen(metric_container.metrics, 3) + + a_mae_metric = metric_container.metrics[0] + self.assertEqual(a_mae_metric.name, "a_mae") + self.assertEqual(a_mae_metric.result().numpy(), 1.0) + + weighted_a_mae_metric = metric_container.metrics[1] + self.assertEqual(weighted_a_mae_metric.name, "a_mse") + self.assertEqual(weighted_a_mae_metric.result().numpy(), 1.0) + + b_1_mse_metric = metric_container.metrics[2] + self.assertEqual(b_1_mse_metric.name, "b_1_mse") + self.assertEqual(b_1_mse_metric.result().numpy(), 4.0) + + def test_no_input_mutation(self): + metric = {"a": "mae"} + metric_container = compile_utils.MetricsContainer(metric) + + y_t = {"a": tf.zeros((10, 1))} + y_p = {"a": tf.ones((10, 1)), "b": tf.zeros((10, 1))} + + metric_container.update_state(y_t, y_p) + self.assertLen(metric, 1) + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.result().numpy(), 1.0) + + def test_crossentropy(self): + metric_container = compile_utils.MetricsContainer("crossentropy") + y_t, y_p = tf.ones((10, 1)), tf.ones((10, 1)) + metric_container.update_state(y_t, y_p) + self.assertEqual( + metric_container.metrics[0]._fn, metrics_mod.binary_crossentropy + ) + + metric_container = compile_utils.MetricsContainer("crossentropy") + y_t, y_p = tf.ones((10, 1)), tf.ones((10, 20)) + self.assertEqual(y_p.shape.as_list()[-1], 20) + metric_container.update_state(y_t, y_p) + self.assertEqual( + metric_container.metrics[0]._fn, + metrics_mod.sparse_categorical_crossentropy, + ) + + metric_container = compile_utils.MetricsContainer("crossentropy") + y_t, y_p = tf.ones((10, 20)), tf.ones((10, 20)) + metric_container.update_state(y_t, y_p) + self.assertEqual( + metric_container.metrics[0]._fn, + metrics_mod.categorical_crossentropy, + ) + + def test_accuracy(self): + metric_container = compile_utils.MetricsContainer("accuracy") + y_t, y_p = tf.ones((10, 1)), tf.ones((10, 1)) + metric_container.update_state(y_t, y_p) + self.assertEqual( + metric_container.metrics[0]._fn, metrics_mod.binary_accuracy + ) + + metric_container = compile_utils.MetricsContainer("Accuracy") + y_t, y_p = tf.ones((10, 1)), tf.ones((10, 1)) + metric_container.update_state(y_t, y_p) + self.assertEqual( + 
metric_container.metrics[0]._fn, metrics_mod.binary_accuracy + ) + + metric_container = compile_utils.MetricsContainer("accuracy") + y_t, y_p = tf.ones((10, 1)), tf.ones((10, 20)) + self.assertEqual(y_p.shape.as_list()[-1], 20) + metric_container.update_state(y_t, y_p) + self.assertEqual( + metric_container.metrics[0]._fn, + metrics_mod.sparse_categorical_accuracy, + ) + + metric_container = compile_utils.MetricsContainer("accuracy") + y_t, y_p = tf.ones((10, 20)), tf.ones((10, 20)) + metric_container.update_state(y_t, y_p) + self.assertEqual( + metric_container.metrics[0]._fn, metrics_mod.categorical_accuracy + ) + + def test_metric_weighting(self): + metric_container = compile_utils.MetricsContainer( + metrics=["mae"], weighted_metrics=["mae"] + ) + + y_t = tf.convert_to_tensor([[0], [3], [0]]) + y_p = tf.convert_to_tensor([[0], [0], [0]]) + sw = tf.convert_to_tensor([[1], [0], [1]]) + + metric_container.update_state(y_t, y_p, sample_weight=sw) + self.assertLen(metric_container.metrics, 2) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, "mae") + self.assertEqual(mae_metric.result().numpy(), 1.0) + + weighted_mae_metric = metric_container.metrics[1] + self.assertEqual(weighted_mae_metric.name, "weighted_mae") + self.assertEqual(weighted_mae_metric.result().numpy(), 0.0) + + def test_broadcast_metrics_to_dict(self): + metric_container = compile_utils.MetricsContainer(metrics=["mae"]) + + y_p = {"output": tf.convert_to_tensor([[0], [1], [2]])} + y_t = {"output": tf.convert_to_tensor([[1], [2], [3]])} + metric_container.update_state(y_t, y_p) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, "mae") + self.assertEqual(mae_metric.result().numpy(), 1.0) + + def test_broadcast_metrics_to_dict_with_output_names(self): + metric_container = compile_utils.MetricsContainer( + metrics=["mae"], output_names=["output"] + ) + + y_p = tf.convert_to_tensor([[0], [1], [2]]) + y_t = {"output": tf.convert_to_tensor([[1], [2], [3]])} + metric_container.update_state(y_t, y_p) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, "mae") + self.assertEqual(mae_metric.result().numpy(), 1.0) + + def test_missing_label_with_no_metrics(self): + # It's ok to exclude a label if that label has no + # losses or metrics associated with it. 
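The `test_crossentropy` and `test_accuracy` cases above exercise the lazy resolution of string shorthands: `MetricsContainer` only commits to a concrete metric function once it has seen the shapes of `y_true` and `y_pred`. As a rough sketch of the rule those assertions pin down (using a hypothetical `pick_accuracy` helper, not the real Keras internals):

```python
# Sketch only: mirrors what the assertions above check, not the actual
# compile_utils implementation.
def pick_accuracy(y_true_last_dim, y_pred_last_dim):
    if y_pred_last_dim == 1:
        return "binary_accuracy"
    elif y_true_last_dim == 1:
        return "sparse_categorical_accuracy"
    else:
        return "categorical_accuracy"

assert pick_accuracy(1, 1) == "binary_accuracy"
assert pick_accuracy(1, 20) == "sparse_categorical_accuracy"
assert pick_accuracy(20, 20) == "categorical_accuracy"
```

The same shape-based dispatch applies to the "crossentropy" shorthand, as the earlier test shows.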
+ metric_container = compile_utils.MetricsContainer( + metrics={"output1": "mae", "output3": "mse"} + ) + + y_p = { + "output1": tf.convert_to_tensor([[0], [1], [2]]), + "output2": tf.convert_to_tensor([[3], [4], [5]]), + "output3": tf.convert_to_tensor([[6], [7], [8]]), + } + y_t = { + "output1": tf.convert_to_tensor([[1], [2], [3]]), + "output3": tf.convert_to_tensor([[4], [5], [6]]), + } + + metric_container.update_state(y_t, y_p) + self.assertLen(metric_container.metrics, 2) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, "output1_mae") + self.assertEqual(mae_metric.result().numpy(), 1.0) + + mse_metric = metric_container.metrics[1] + self.assertEqual(mse_metric.name, "output3_mse") + self.assertEqual(mse_metric.result().numpy(), 4.0) + + def test_metrics_masking(self): + metrics_container = compile_utils.MetricsContainer( + metrics=["mae"], weighted_metrics=["mse"] + ) + y_p = tf.constant([[[1], [1]], [[0], [0]]], dtype=tf.float32) + y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) + y_p._keras_mask = tf.constant([[1, 1], [0, 0]], dtype=tf.float32) + + metrics_container.update_state(y_t, y_p) + self.assertLen(metrics_container.metrics, 2) + + mae_metric = metrics_container.metrics[0] + self.assertEqual(mae_metric.name, "mae") + self.assertAlmostEqual(mae_metric.result().numpy(), 0) + + weighted_mae_metric = metrics_container.metrics[1] + self.assertEqual(weighted_mae_metric.name, "mse") + self.assertAlmostEqual(weighted_mae_metric.result().numpy(), 0) + + def test_metrics_sample_weight(self): + metrics_container = compile_utils.MetricsContainer( + metrics=["mae"], weighted_metrics=["mse"] + ) + y_p = tf.constant([[[1], [1]], [[0], [1]]], dtype=tf.float32) + y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) + sw = tf.constant([[0.2, 0.3], [0.5, 0]], dtype=tf.float32) + + metrics_container.update_state(y_t, y_p, sample_weight=sw) + self.assertLen(metrics_container.metrics, 2) + + mae_metric = metrics_container.metrics[0] + self.assertEqual(mae_metric.name, "mae") + self.assertAlmostEqual(mae_metric.result().numpy(), 0.25) # 1 / 4 + + weighted_mae_metric = metrics_container.metrics[1] + self.assertEqual(weighted_mae_metric.name, "mse") + self.assertAlmostEqual( + weighted_mae_metric.result().numpy(), 0.5 + ) # .5 / 1 + + def test_metrics_masking_sample_weight(self): + metrics_container = compile_utils.MetricsContainer( + metrics=["mae"], weighted_metrics=["mse"] + ) + y_p = tf.constant([[[1], [1]], [[0], [1]]], dtype=tf.float32) + y_t = tf.constant([[[1], [1]], [[1], [1]]], dtype=tf.float32) + sw = tf.constant([[0.3, 0.2], [0.2, 0.3]], dtype=tf.float32) + y_p._keras_mask = tf.constant([[1, 0], [1, 0]], dtype=tf.float32) + + metrics_container.update_state(y_t, y_p, sample_weight=sw) + self.assertLen(metrics_container.metrics, 2) + + mae_metric = metrics_container.metrics[0] + self.assertEqual(mae_metric.name, "mae") + self.assertAlmostEqual(mae_metric.result().numpy(), 0.5) # 1 / .5 + + weighted_mae_metric = metrics_container.metrics[1] + self.assertEqual(weighted_mae_metric.name, "mse") + self.assertAlmostEqual(weighted_mae_metric.result().numpy(), 0.2 / 0.5) + + def test_loss_class_as_metric_with_distribution(self): + distribution = tf.distribute.OneDeviceStrategy("/device:CPU:0") + with distribution.scope(): + metric_container = compile_utils.MetricsContainer( + losses_mod.MeanSquaredError() + ) + y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) + metric_container.update_state(y_t, y_p) + + self.assertLen(metric_container.metrics, 
1) + metric = metric_container.metrics[0] + self.assertEqual(metric.name, "mean_squared_error") + self.assertEqual(metric.result().numpy(), 1.0) + + def test_custom_metric_callables(self): + def custom_metric_fn(y_true, y_pred): + return tf.reduce_sum(y_true - y_pred) + + class CustomMetricClass: + def __call__(self, y_true, y_pred): + return tf.reduce_sum(y_true - y_pred) + + metric_container = compile_utils.MetricsContainer( + [custom_metric_fn, CustomMetricClass()] + ) + y_t, y_p = tf.ones((10, 5)), tf.zeros((10, 5)) + metric_container.update_state(y_t, y_p) + + self.assertEqual(metric_container.metrics[0].name, "custom_metric_fn") + self.assertEqual( + metric_container.metrics[1].name, "custom_metric_class" + ) + + def test_reset_state_existing_metric_before_built(self): + metric = metrics_mod.Mean() + metric.update_state([2.0, 4.0]) + self.assertEqual(metric.result().numpy(), 3.0) + + metric_container = compile_utils.MetricsContainer(metric) + metric_container.reset_state() + self.assertEqual(metric.result().numpy(), 0.0) + + def test_duplicated_metric_instance(self): + mean_obj = metrics_mod.Mean() + metric = mean_obj + with self.assertRaisesRegex(ValueError, "Found duplicated metrics"): + compile_utils.MetricsContainer( + metrics=metric, weighted_metrics=metric + ) + + # duplicated string should be fine + metric = "acc" + compile_utils.MetricsContainer(metrics=metric, weighted_metrics=metric) + + # complicated structure + metric = [mean_obj, "acc"] + weighted_metric = {"output1": mean_obj, "output2": "acc"} + with self.assertRaisesRegex(ValueError, "Found duplicated metrics"): + compile_utils.MetricsContainer( + metrics=metric, weighted_metrics=weighted_metric + ) + + +if __name__ == "__main__": + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/engine/control_flow_test.py b/keras/engine/control_flow_test.py index 6ac7586b03e0..161e05d24960 100644 --- a/keras/engine/control_flow_test.py +++ b/keras/engine/control_flow_test.py @@ -14,117 +14,118 @@ # ============================================================================== """Tests for dynamic control flow behavior with Keras.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras +from keras.engine import base_layer +from keras.optimizers.legacy import rmsprop from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.engine import base_layer -from keras.optimizers.optimizer_v2 import rmsprop class ControlFlowLayer1(base_layer.Layer): - """Layer with an `if` condition in call.""" + """Layer with an `if` condition in call.""" - def call(self, inputs): - if tf.reduce_sum(inputs) > 0: - return tf.sqrt(inputs) - else: - return tf.square(inputs) + def call(self, inputs): + if tf.reduce_sum(inputs) > 0: + return tf.sqrt(inputs) + else: + return tf.square(inputs) class ControlFlowLayer2(base_layer.Layer): - """Layer with a `for` loop in call.""" + """Layer with a `for` loop in call.""" - def call(self, inputs): - samples = tf.TensorArray( - dtype=tf.float32, size=tf.shape(inputs)[0]) - i = 0 - for sample in inputs: - samples = samples.write(i, tf.square(sample)) - i += 1 - return samples.stack() + def call(self, inputs): + samples = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) + i = 0 + for sample in inputs: + samples = samples.write(i, tf.square(sample)) + i += 1 + return samples.stack() class NestedControlFlowLayer(base_layer.Layer): - """Layer nested with a control flow 
layer.""" + """Layer nested with a control flow layer.""" - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.layer = ControlFlowLayer1() + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.layer = ControlFlowLayer1() - def call(self, inputs): - return self.layer(inputs) + def call(self, inputs): + return self.layer(inputs) class ControlFlowModel(keras.Model): - """Model with an `if` condition in call.""" + """Model with an `if` condition in call.""" - def call(self, inputs): - if tf.reduce_sum(inputs) > 0: - return tf.sqrt(inputs) - else: - return tf.square(inputs) + def call(self, inputs): + if tf.reduce_sum(inputs) > 0: + return tf.sqrt(inputs) + else: + return tf.square(inputs) class NestedControlFlowModel(keras.Model): - """Model with an `if` condition in call using a control flow layer.""" + """Model with an `if` condition in call using a control flow layer.""" - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.layer = NestedControlFlowLayer() + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.layer = NestedControlFlowLayer() - def call(self, inputs): - inputs = self.layer(inputs) - if tf.reduce_sum(inputs) > 0: - return tf.sqrt(inputs) - else: - return tf.square(inputs) + def call(self, inputs): + inputs = self.layer(inputs) + if tf.reduce_sum(inputs) > 0: + return tf.sqrt(inputs) + else: + return tf.square(inputs) class FunctionControlFlowModel(keras.Model): - """Model with control flow where `call` is wrapped in function already.""" + """Model with control flow where `call` is wrapped in function already.""" - @tf.function - def call(self, inputs): - if tf.reduce_sum(inputs) > 0: - return tf.sqrt(inputs) - else: - return tf.square(inputs) + @tf.function + def call(self, inputs): + if tf.reduce_sum(inputs) > 0: + return tf.sqrt(inputs) + else: + return tf.square(inputs) @test_combinations.run_all_keras_modes class AutographWrapperTest(test_combinations.TestCase): - - @test_combinations.run_with_all_model_types - @parameterized.named_parameters(('with_if', ControlFlowLayer1), - ('with_for', ControlFlowLayer2), - ('nested', NestedControlFlowLayer)) - def test_control_flow_layer(self, layer_class): - model = test_utils.get_model_from_layers([layer_class()], - input_shape=(3,)) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - - @parameterized.named_parameters( - ('with_if', ControlFlowModel), ('nested', NestedControlFlowModel), - ('wrapped_in_function', FunctionControlFlowModel)) - def test_control_flow_model(self, model_class): - model = model_class() - model.compile(rmsprop.RMSprop(0.001), loss='mse') - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - - def test_control_flow_in_deferred_sequential_model(self): - model = keras.Sequential( - [ControlFlowLayer1(), - keras.layers.Dense(3), - ControlFlowLayer2()]) - model.compile(rmsprop.RMSprop(0.001), loss='mse') - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - - -if __name__ == '__main__': - tf.test.main() + @test_combinations.run_with_all_model_types + @parameterized.named_parameters( + ("with_if", ControlFlowLayer1), + ("with_for", ControlFlowLayer2), + ("nested", NestedControlFlowLayer), + ) + def test_control_flow_layer(self, layer_class): + model = test_utils.get_model_from_layers( + [layer_class()], input_shape=(3,) + ) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + model.train_on_batch(np.random.random((2, 3)), 
np.random.random((2, 3))) + + @parameterized.named_parameters( + ("with_if", ControlFlowModel), + ("nested", NestedControlFlowModel), + ("wrapped_in_function", FunctionControlFlowModel), + ) + def test_control_flow_model(self, model_class): + model = model_class() + model.compile(rmsprop.RMSprop(0.001), loss="mse") + model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) + + def test_control_flow_in_deferred_sequential_model(self): + model = keras.Sequential( + [ControlFlowLayer1(), keras.layers.Dense(3), ControlFlowLayer2()] + ) + model.compile(rmsprop.RMSprop(0.001), loss="mse") + model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/correctness_test.py b/keras/engine/correctness_test.py index dd66f556e507..6b16e247cea9 100644 --- a/keras/engine/correctness_test.py +++ b/keras/engine/correctness_test.py @@ -14,10 +14,9 @@ # ============================================================================== """Tests for numerical correctness.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras from keras.testing_infra import test_combinations @@ -25,117 +24,118 @@ class MultiInputSubclassed(keras.Model): - """Subclassed Model that adds its inputs and then adds a bias.""" + """Subclassed Model that adds its inputs and then adds a bias.""" - def __init__(self): - super().__init__() - self.add = keras.layers.Add() - self.bias = test_utils.Bias() + def __init__(self): + super().__init__() + self.add = keras.layers.Add() + self.bias = test_utils.Bias() - def call(self, inputs): - added = self.add(inputs) - return self.bias(added) + def call(self, inputs): + added = self.add(inputs) + return self.bias(added) def multi_input_functional(): - """Functional Model that adds its inputs and then adds a bias.""" - input_1 = keras.Input(shape=(1,)) - input_2 = keras.Input(shape=(1,)) - input_3 = keras.Input(shape=(1,)) - added = keras.layers.Add()([input_1, input_2, input_3]) - output = test_utils.Bias()(added) - return keras.Model([input_1, input_2, input_3], output) + """Functional Model that adds its inputs and then adds a bias.""" + input_1 = keras.Input(shape=(1,)) + input_2 = keras.Input(shape=(1,)) + input_3 = keras.Input(shape=(1,)) + added = keras.layers.Add()([input_1, input_2, input_3]) + output = test_utils.Bias()(added) + return keras.Model([input_1, input_2, input_3], output) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes class SimpleBiasTest(test_combinations.TestCase): + def _get_simple_bias_model(self): + model = test_utils.get_model_from_layers( + [test_utils.Bias()], input_shape=(1,) + ) + model.compile( + keras.optimizers.legacy.gradient_descent.SGD(0.1), + "mae", + run_eagerly=test_utils.should_run_eagerly(), + ) + return model - def _get_simple_bias_model(self): - model = test_utils.get_model_from_layers([test_utils.Bias()], - input_shape=(1,)) - model.compile( - keras.optimizers.optimizer_v2.gradient_descent.SGD(0.1), - 'mae', - run_eagerly=test_utils.should_run_eagerly()) - return model + def test_simple_bias_fit(self): + x = np.array([[0.0], [1.0], [2.0]]) + y = np.array([[0.5], [2.0], [3.5]]) + model = self._get_simple_bias_model() - def test_simple_bias_fit(self): - x = np.array([[0.], [1.], [2.]]) - y = np.array([[0.5], [2.], [3.5]]) - model = self._get_simple_bias_model() + history = model.fit(x, y, batch_size=3, epochs=5) + 
self.assertAllClose(history.history["loss"], [1.0, 0.9, 0.8, 0.7, 0.6]) - history = model.fit(x, y, batch_size=3, epochs=5) - self.assertAllClose(history.history['loss'], [1., 0.9, 0.8, 0.7, 0.6]) + def test_simple_bias_evaluate(self): + x = np.array([[0.0], [1.0], [2.0]]) + y = np.array([[1.0], [3.0], [5.0]]) + model = self._get_simple_bias_model() - def test_simple_bias_evaluate(self): - x = np.array([[0.], [1.], [2.]]) - y = np.array([[1.], [3.], [5.]]) - model = self._get_simple_bias_model() + loss = model.evaluate(x, y, batch_size=1) + self.assertAlmostEqual(loss, 2.0) - loss = model.evaluate(x, y, batch_size=1) - self.assertAlmostEqual(loss, 2.) + def test_simple_bias_predict(self): + x = np.array([[0.0], [1.0], [2.0]]) + model = self._get_simple_bias_model() - def test_simple_bias_predict(self): - x = np.array([[0.], [1.], [2.]]) - model = self._get_simple_bias_model() - - pred = model.predict(x, batch_size=1) - self.assertAllClose(x, pred) + pred = model.predict(x, batch_size=1) + self.assertAllClose(x, pred) @test_combinations.run_all_keras_modes class MultipleInputTest(test_combinations.TestCase): - - def _get_multiple_input_model(self, subclassed=True): - if subclassed: - model = MultiInputSubclassed() - else: - model = multi_input_functional() - model.compile( - keras.optimizers.optimizer_v2.gradient_descent.SGD(0.1), - 'mae', - run_eagerly=test_utils.should_run_eagerly()) - return model - - @parameterized.named_parameters(('subclassed', True), ('functional', False)) - def test_multiple_input_fit(self, subclassed): - x = [ - np.array([[1.], [2.], [3.]]), - np.array([[4.], [5.], [6.]]), - np.array([[7.], [8.], [9.]]) - ] - y = np.array([[12.5], [16.], [19.5]]) - - model = self._get_multiple_input_model(subclassed) - history = model.fit(x, y, batch_size=3, epochs=5) - self.assertAllClose(history.history['loss'], [1., 0.9, 0.8, 0.7, 0.6]) - - @parameterized.named_parameters(('subclassed', True), ('functional', False)) - def test_multiple_input_evaluate(self, subclassed): - x = [ - np.array([[1.], [2.], [3.]]), - np.array([[4.], [5.], [6.]]), - np.array([[7.], [8.], [9.]]) - ] - y = np.array([[13.], [17.], [21.]]) - - model = self._get_multiple_input_model(subclassed) - loss = model.evaluate(x, y, batch_size=3) - self.assertAlmostEqual(loss, 2.) 
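The expected loss histories in these bias tests follow from a one-line gradient argument: `test_utils.Bias` adds a single scalar `b` (initialized to zero), every residual `y - (x + b)` starts positive, so the MAE gradient with respect to `b` is exactly -1, and plain SGD with learning rate 0.1 raises `b` by 0.1 per epoch, shaving 0.1 off the loss each time. A quick NumPy check of that arithmetic (a sketch of the math, not the test itself):

```python
import numpy as np

# Data from test_simple_bias_fit: the residuals y - x are [0.5, 1.0, 1.5].
x = np.array([0.0, 1.0, 2.0])
y = np.array([0.5, 2.0, 3.5])
b, lr, losses = 0.0, 0.1, []
for _ in range(5):
    losses.append(np.mean(np.abs(y - (x + b))))  # MAE before the update
    b += lr  # d(MAE)/db = -1 while every residual stays positive
print(losses)  # ~= [1.0, 0.9, 0.8, 0.7, 0.6]
```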
- - @parameterized.named_parameters(('subclassed', True), ('functional', False)) - def test_multiple_input_predict(self, subclassed): - x = [ - np.array([[1.], [2.], [3.]]), - np.array([[4.], [5.], [6.]]), - np.array([[7.], [8.], [9.]]) - ] - - model = self._get_multiple_input_model(subclassed) - pred = model.predict(x, batch_size=1) - self.assertAllClose(pred, [[12.], [15.], [18.]]) - - -if __name__ == '__main__': - tf.test.main() + def _get_multiple_input_model(self, subclassed=True): + if subclassed: + model = MultiInputSubclassed() + else: + model = multi_input_functional() + model.compile( + keras.optimizers.legacy.gradient_descent.SGD(0.1), + "mae", + run_eagerly=test_utils.should_run_eagerly(), + ) + return model + + @parameterized.named_parameters(("subclassed", True), ("functional", False)) + def test_multiple_input_fit(self, subclassed): + x = [ + np.array([[1.0], [2.0], [3.0]]), + np.array([[4.0], [5.0], [6.0]]), + np.array([[7.0], [8.0], [9.0]]), + ] + y = np.array([[12.5], [16.0], [19.5]]) + + model = self._get_multiple_input_model(subclassed) + history = model.fit(x, y, batch_size=3, epochs=5) + self.assertAllClose(history.history["loss"], [1.0, 0.9, 0.8, 0.7, 0.6]) + + @parameterized.named_parameters(("subclassed", True), ("functional", False)) + def test_multiple_input_evaluate(self, subclassed): + x = [ + np.array([[1.0], [2.0], [3.0]]), + np.array([[4.0], [5.0], [6.0]]), + np.array([[7.0], [8.0], [9.0]]), + ] + y = np.array([[13.0], [17.0], [21.0]]) + + model = self._get_multiple_input_model(subclassed) + loss = model.evaluate(x, y, batch_size=3) + self.assertAlmostEqual(loss, 2.0) + + @parameterized.named_parameters(("subclassed", True), ("functional", False)) + def test_multiple_input_predict(self, subclassed): + x = [ + np.array([[1.0], [2.0], [3.0]]), + np.array([[4.0], [5.0], [6.0]]), + np.array([[7.0], [8.0], [9.0]]), + ] + + model = self._get_multiple_input_model(subclassed) + pred = model.predict(x, batch_size=1) + self.assertAllClose(pred, [[12.0], [15.0], [18.0]]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/data_adapter.py b/keras/engine/data_adapter.py index 00f8c41e4ab9..517684e75590 100644 --- a/keras/engine/data_adapter.py +++ b/keras/engine/data_adapter.py @@ -14,8 +14,6 @@ # ============================================================================== """Adapter module that convert different input data objects into tf.dataset.""" -import tensorflow.compat.v2 as tf - import abc import contextlib import functools @@ -24,1711 +22,1985 @@ import random import numpy as np -from tensorflow.python.eager import context +import tensorflow.compat.v2 as tf + from keras import backend +from keras.distribute import distributed_training_utils from keras.engine import training_utils from keras.utils import data_utils from keras.utils import dataset_creator from keras.utils import tf_utils -from tensorflow.python.distribute.input_lib import DistributedDataset + +# isort: off +from tensorflow.python.distribute.input_lib import ( + DistributedDataset, +) +from tensorflow.python.eager import context from tensorflow.python.framework import type_spec from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export +from tensorflow.python.data.ops import ( + from_sparse_tensor_slices_op, +) +from tensorflow.python.data.ops import from_generator_op +from tensorflow.python.data.ops import range_op +from tensorflow.python.data.ops import from_tensors_op +from tensorflow.python.data.ops import 
from_tensor_slices_op try: - import pandas as pd # pylint: disable=g-import-not-at-top + import pandas as pd except ImportError: - pd = None + pd = None keras_data_adapter_gauge = tf.__internal__.monitoring.BoolGauge( - "/tensorflow/api/keras/data_adapters", "keras data adapter usage", "method") + "/tensorflow/api/keras/data_adapters", "keras data adapter usage", "method" +) class DataAdapter(object, metaclass=abc.ABCMeta): - """Base class for input data adapter. - - In TF 2.0, tf.data is the preferred API for user to feed in data. In order - to simplify the training code path, all the input data object will be - converted to `tf.data.Dataset` if possible. - - Note that since this class is mainly targeted for TF 2.0, it might have a lot - of assumptions under the hood, e.g. eager context by default, distribution - strategy, etc. In the meantime, some legacy feature support might be dropped, - eg, Iterator from dataset API in v1, etc. - - The sample usage of this class is like: - - ``` - x = tf.data.Dataset.range(100) - adapter_cls = [NumpyArrayDataAdapter, ..., DatasetAdapter] - applicable_adapters = [cls for cls in adapter_cls if cls.can_handle(x)] - if len(applicable_adapters) != 1: - raise ValueError("Expect only one adapter class to handle the input") - - dataset = applicable_adapters[0](x).get_dataset() - for data in dataset: - # training - ``` - """ - - @staticmethod - def can_handle(x, y=None): - """Whether the current DataAdapter could handle the input x and y. - - Structure wise, x and y can be single object, or list of objects if there - multiple input/output, or dictionary of objects when the input/output are - named. - - Args: - x: input features. - y: target labels. Note that y could be None in the case of prediction. - - Returns: - boolean - """ - raise NotImplementedError - - @abc.abstractmethod - def __init__(self, x, y=None, **kwargs): - """Create a DataAdapter based on data inputs. - - The caller must make sure to call `can_handle()` first before invoking this - method. Provide unsupported data type will result into unexpected behavior. - - Args: - x: input features. - y: target labels. Note that y could be None in the case of prediction. - **kwargs: Other keyword arguments for DataAdapter during the construction - of the tf.dataset.Dataset. For example: - - Numpy data might have `sample_weights` which will be used for - weighting the loss function during training. - - Numpy data might need to have `batch_size` parameter when constructing - the dataset and iterator. - - Certain input might need to be distribution strategy aware. When - `distribution_strategy` is passed, the created dataset need to respect - the strategy. - DataAdapter might choose to ignore any keyword argument if it doesn't - use it, or raise exception if any required argument is not provided. - """ - if not self.can_handle(x, y): - raise ValueError("{} Cannot handle input {}, {}".format( - self.__class__, x, y)) - - @abc.abstractmethod - def get_dataset(self): - """Get a dataset instance for the current DataAdapter. - - Note that the dataset returned does not repeat for epoch, so caller might - need to create new iterator for the same dataset at the beginning of the - epoch. This behavior might change in the future. - - Returns: - A `tf.data.Dataset`. Caller might use the dataset in different - context, e.g. iter(dataset) in eager to get the value directly, or in - graph mode, provide the iterator tensor to Keras model function. 
- """ - raise NotImplementedError - - @abc.abstractmethod - def get_size(self): - """Return the size (number of batches) for the dataset created. - - For certain type of the data input, the number of batches is known, eg for - Numpy data, the size is same as (number_of_element / batch_size). Whereas - for dataset or python generator, the size is unknown since it may or may not - have an end state. - - Returns: - int, the number of batches for the dataset, or None if it is unknown. The - caller could use this to control the loop of training, show progress bar, - or handle unexpected StopIteration error. - """ - raise NotImplementedError - - @abc.abstractmethod - def batch_size(self): - """Return the batch size of the dataset created. - - For certain type of the data input, the batch size is known, and even - required, like numpy array. Whereas for dataset, the batch is unknown - unless we take a peek. - - Returns: - int, the batch size of the dataset, or None if it is unknown. - """ - raise NotImplementedError - - def representative_batch_size(self): - """Return a representative size for batches in the dataset. - - This is not guaranteed to be the batch size for all batches in the - dataset. It just needs to be a rough approximation for batch sizes in - the dataset. - - Returns: - int, a representative size for batches found in the dataset, - or None if it is unknown. - """ - return self.batch_size() - - @abc.abstractmethod - def has_partial_batch(self): - """Whether the dataset has partial batch at the end.""" - raise NotImplementedError - - @abc.abstractmethod - def partial_batch_size(self): - """The size of the final partial batch for dataset. - - Will return None if has_partial_batch is False or batch_size is None. + """Base class for input data adapter. + + In TF 2.0, tf.data is the preferred API for user to feed in data. In order + to simplify the training code path, all the input data object will be + converted to `tf.data.Dataset` if possible. + + Note that since this class is mainly targeted for TF 2.0, it might have a + lot of assumptions under the hood, e.g. eager context by default, + distribution strategy, etc. In the meantime, some legacy feature support + might be dropped, eg, Iterator from dataset API in v1, etc. + + The sample usage of this class is like: + + ``` + x = tf.data.Dataset.range(100) + adapter_cls = [NumpyArrayDataAdapter, ..., DatasetAdapter] + applicable_adapters = [cls for cls in adapter_cls if cls.can_handle(x)] + if len(applicable_adapters) != 1: + raise ValueError("Expect only one adapter class to handle the input") + + dataset = applicable_adapters[0](x).get_dataset() + for data in dataset: + # training + ``` """ - raise NotImplementedError - @abc.abstractmethod - def should_recreate_iterator(self): - """Returns whether a new iterator should be created every epoch.""" - raise NotImplementedError - - def get_samples(self): - """Returns number of samples in the data, or `None`.""" - if not self.get_size() or not self.batch_size(): - return None - total_sample = self.get_size() * self.batch_size() - if self.has_partial_batch(): - total_sample -= (self.batch_size() - self.partial_batch_size()) - return total_sample - - def on_epoch_end(self): - """A hook called after each epoch.""" - pass + @staticmethod + def can_handle(x, y=None): + """Whether the current DataAdapter could handle the input x and y. 
+ + Structure-wise, x and y can each be a single object, a list of objects if + there are multiple inputs/outputs, or a dictionary of objects when the + inputs/outputs are named. + + Args: + x: input features. + y: target labels. Note that y could be None in the case of prediction. + + Returns: + boolean + """ + raise NotImplementedError + + @abc.abstractmethod + def __init__(self, x, y=None, **kwargs): + """Create a DataAdapter based on data inputs. + + The caller must make sure to call `can_handle()` first before invoking + this method. Providing an unsupported data type will result in + unexpected behavior. + + Args: + x: input features. + y: target labels. Note that y could be None in the case of prediction. + **kwargs: Other keyword arguments for DataAdapter during the + construction of the tf.data.Dataset. For example: + - Numpy data might have `sample_weights` which will be used for + weighting the loss function during training. + - Numpy data might need to have a `batch_size` parameter when + constructing the dataset and iterator. + - Certain input might need to be distribution strategy aware. When + `distribution_strategy` is passed, the created dataset needs to + respect the strategy. + DataAdapter might choose to ignore any keyword argument if it + doesn't use it, or raise an exception if any required argument is + not provided. + """ + if not self.can_handle(x, y): + raise ValueError(f"{self.__class__} Cannot handle input {x}, {y}") + + @abc.abstractmethod + def get_dataset(self): + """Get a dataset instance for the current DataAdapter. + + Note that the dataset returned does not repeat across epochs, so the + caller might need to create a new iterator for the same dataset at the + beginning of each epoch. This behavior might change in the future. + + Returns: + A `tf.data.Dataset`. The caller might use the dataset in different + contexts, e.g. iter(dataset) in eager mode to get the values directly, + or, in graph mode, providing the iterator tensor to the Keras model + function. + """ + raise NotImplementedError + + @abc.abstractmethod + def get_size(self): + """Return the size (number of batches) for the dataset created. + + For certain types of data input, the number of batches is known, e.g. + for Numpy data, the size is (number_of_elements / batch_size). + Whereas for a dataset or a Python generator, the size is unknown since + it may or may not have an end state. + + Returns: + int, the number of batches for the dataset, or None if it is unknown. + The caller can use this to control the training loop, show a progress + bar, or handle an unexpected StopIteration error. + """ + raise NotImplementedError + + @abc.abstractmethod + def batch_size(self): + """Return the batch size of the dataset created. + + For certain types of data input, the batch size is known, and even + required, e.g. for a Numpy array. Whereas for a dataset, the batch size + is unknown unless we take a peek. + + Returns: + int, the batch size of the dataset, or None if it is unknown. + """ + raise NotImplementedError + + def representative_batch_size(self): + """Return a representative size for batches in the dataset. + + This is not guaranteed to be the batch size for all batches in the + dataset. It just needs to be a rough approximation for batch sizes in + the dataset. + + Returns: + int, a representative size for batches found in the dataset, + or None if it is unknown.
+ """ + return self.batch_size() + + @abc.abstractmethod + def has_partial_batch(self): + """Whether the dataset has partial batch at the end.""" + raise NotImplementedError + + @abc.abstractmethod + def partial_batch_size(self): + """The size of the final partial batch for dataset. + + Will return None if has_partial_batch is False or batch_size is None. + """ + raise NotImplementedError + + @abc.abstractmethod + def should_recreate_iterator(self): + """Returns whether a new iterator should be created every epoch.""" + raise NotImplementedError + + def get_samples(self): + """Returns number of samples in the data, or `None`.""" + if not self.get_size() or not self.batch_size(): + return None + total_sample = self.get_size() * self.batch_size() + if self.has_partial_batch(): + total_sample -= self.batch_size() - self.partial_batch_size() + return total_sample + + def on_epoch_end(self): + """A hook called after each epoch.""" + pass class TensorLikeDataAdapter(DataAdapter): - """Adapter that handles Tensor-like objects, e.g. EagerTensor and NumPy.""" - - @staticmethod - def can_handle(x, y=None): - # TODO(kaftan): Check performance implications of using a flatten - # here for other types of inputs. - flat_inputs = tf.nest.flatten(x) - if y is not None: - flat_inputs += tf.nest.flatten(y) - - tensor_types = _get_tensor_types() - - def _is_tensor(v): - if isinstance(v, tensor_types): - return True - return False - - return all(_is_tensor(v) for v in flat_inputs) - - def __init__(self, - x, - y=None, - sample_weights=None, - sample_weight_modes=None, - batch_size=None, - epochs=1, - steps=None, - shuffle=False, - **kwargs): - super().__init__(x, y, **kwargs) - x, y, sample_weights = _process_tensorlike((x, y, sample_weights)) - sample_weight_modes = broadcast_sample_weight_modes( - sample_weights, sample_weight_modes) - - # If sample_weights are not specified for an output use 1.0 as weights. - (sample_weights, _, _) = training_utils.handle_partial_sample_weights( - y, sample_weights, sample_weight_modes, check_all_flat=True) - - inputs = pack_x_y_sample_weight(x, y, sample_weights) - - num_samples = set(int(i.shape[0]) for i in tf.nest.flatten(inputs)).pop() - _check_data_cardinality(inputs) - - # If batch_size is not passed but steps is, calculate from the input data. - # Default to 32 for backwards compat. - if not batch_size: - batch_size = int(math.ceil(num_samples / steps)) if steps else 32 - - self._size = int(math.ceil(num_samples / batch_size)) - self._batch_size = batch_size - - num_full_batches = int(num_samples // batch_size) - self._partial_batch_size = num_samples % batch_size - - if isinstance(shuffle, str): - shuffle = shuffle.lower() - - self._shuffle = shuffle - # Vectorized version of shuffle. - # This is a performance improvement over using `from_tensor_slices`. - # The indices of the data are shuffled and batched, and these indices - # are then zipped with the data and used to extract a batch of the data - # at each step. The performance improvements here come from: - # 1. vectorized batch using gather - # 2. parallelized map - # 3. pipelined permutation generation - # 4. optimized permutation batching - # 5. disabled static optimizations - - indices_dataset = tf.data.Dataset.range(1) - if shuffle != "batch": - indices_dataset = indices_dataset.repeat(epochs) - - def permutation(_): - # It turns out to be more performant to make a new set of indices rather - # than reusing the same range Tensor. (presumably because of buffer - # forwarding.) 
- indices = tf.range(num_samples, dtype=tf.int64) - if shuffle and shuffle != "batch": - indices = tf.random.shuffle(indices) - return indices - - # We prefetch a single element. Computing large permutations can take quite - # a while so we don't want to wait for prefetching over an epoch boundary to - # trigger the next permutation. On the other hand, too many simultaneous - # shuffles can contend on a hardware level and degrade all performance. - indices_dataset = indices_dataset.map(permutation).prefetch(1) - - def slice_batch_indices(indices): - """Convert a Tensor of indices into a dataset of batched indices. - - This step can be accomplished in several ways. The most natural is to - slice the Tensor in a Dataset map. (With a condition on the upper index to - handle the partial batch.) However it turns out that coercing the Tensor - into a shape which is divisible by the batch size (and handling the last - partial batch separately) allows for a much more favorable memory access - pattern and improved performance. - - Args: - indices: Tensor which determines the data order for an entire epoch. - - Returns: - A Dataset of batched indices. - """ - num_in_full_batch = num_full_batches * batch_size - first_k_indices = tf.slice(indices, [0], [num_in_full_batch]) - first_k_indices = tf.reshape( - first_k_indices, [num_full_batches, batch_size]) - - flat_dataset = tf.data.Dataset.from_tensor_slices(first_k_indices) - if self._partial_batch_size: - index_remainder = tf.data.Dataset.from_tensors(tf.slice( - indices, [num_in_full_batch], [self._partial_batch_size])) - flat_dataset = flat_dataset.concatenate(index_remainder) - - if shuffle == "batch": - # 1024 is a magic constant that has not been properly evaluated - flat_dataset = flat_dataset.shuffle(1024).repeat(epochs) - return flat_dataset - - indices_dataset = indices_dataset.flat_map(slice_batch_indices) - - dataset = self.slice_inputs(indices_dataset, inputs) - - if shuffle == "batch": - def shuffle_batch(*batch): - return tf.nest.map_structure(tf.random.shuffle, batch) - dataset = dataset.map(shuffle_batch) - - self._dataset = dataset - - def slice_inputs(self, indices_dataset, inputs): - """Slice inputs into a Dataset of batches. - - Given a Dataset of batch indices and the unsliced inputs, - this step slices the inputs in a parallelized fashion - and produces a dataset of input batches. - - Args: - indices_dataset: A Dataset of batched indices - inputs: A python data structure that contains the inputs, targets, - and possibly sample weights. - - Returns: - A Dataset of input batches matching the batch indices. - """ - dataset = tf.data.Dataset.zip(( - indices_dataset, - tf.data.Dataset.from_tensors(inputs).repeat() - )) + """Adapter that handles Tensor-like objects, e.g. EagerTensor and NumPy.""" - def grab_batch(i, data): - return tf.nest.map_structure(lambda d: tf.gather(d, i, axis=0), data) + @staticmethod + def can_handle(x, y=None): + # TODO(kaftan): Check performance implications of using a flatten + # here for other types of inputs. + flat_inputs = tf.nest.flatten(x) + if y is not None: + flat_inputs += tf.nest.flatten(y) - dataset = dataset.map( - grab_batch, num_parallel_calls=tf.data.AUTOTUNE) + tensor_types = _get_tensor_types() - # Default optimizations are disabled to avoid the overhead of (unnecessary) - # input pipeline graph serialization and deserialization - options = tf.data.Options() - options.experimental_optimization.apply_default_optimizations = False - if self._shuffle: - # See b/141490660 for more details. 
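The comment block above describes the vectorized shuffle: instead of `from_tensor_slices` over the data itself, the adapter shuffles and batches indices, zips them with the (repeated) full inputs, and gathers one batch per step. A toy re-creation of that zip-and-gather pattern, as a sketch under simplified assumptions rather than the adapter's actual code:

```python
import tensorflow as tf

features = tf.range(8, dtype=tf.float32)  # stand-in for the full inputs
# Two batches of four shuffled indices each.
indices = tf.data.Dataset.from_tensor_slices(
    tf.reshape(tf.random.shuffle(tf.range(8, dtype=tf.int64)), [2, 4])
)
data = tf.data.Dataset.from_tensors(features).repeat()
batches = tf.data.Dataset.zip((indices, data)).map(
    lambda i, d: tf.gather(d, i, axis=0),
    num_parallel_calls=tf.data.AUTOTUNE,
)
for batch in batches:
    print(batch.numpy())  # four elements of `features`, in shuffled order
```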
- options.experimental_external_state_policy = ( - tf.data.experimental.ExternalStatePolicy.IGNORE) - dataset = dataset.with_options(options) - return dataset + def _is_tensor(v): + if isinstance(v, tensor_types): + return True + return False - def get_dataset(self): - return self._dataset + return all(_is_tensor(v) for v in flat_inputs) - def get_size(self): - return self._size - - def batch_size(self): - return self._batch_size - - def has_partial_batch(self): - return self._partial_batch_size > 0 + def __init__( + self, + x, + y=None, + sample_weights=None, + sample_weight_modes=None, + batch_size=None, + epochs=1, + steps=None, + shuffle=False, + **kwargs, + ): + super().__init__(x, y, **kwargs) + x, y, sample_weights = _process_tensorlike((x, y, sample_weights)) + sample_weight_modes = broadcast_sample_weight_modes( + sample_weights, sample_weight_modes + ) + + # If sample_weights are not specified for an output use 1.0 as weights. + (sample_weights, _, _) = training_utils.handle_partial_sample_weights( + y, sample_weights, sample_weight_modes, check_all_flat=True + ) + + inputs = pack_x_y_sample_weight(x, y, sample_weights) + + num_samples = set( + int(i.shape[0]) for i in tf.nest.flatten(inputs) + ).pop() + _check_data_cardinality(inputs) + + # If batch_size is not passed but steps is, calculate from the input + # data. Defaults to `32` for backwards compatibility. + if not batch_size: + batch_size = int(math.ceil(num_samples / steps)) if steps else 32 + + self._size = int(math.ceil(num_samples / batch_size)) + self._batch_size = batch_size + + num_full_batches = int(num_samples // batch_size) + self._partial_batch_size = num_samples % batch_size + + if isinstance(shuffle, str): + shuffle = shuffle.lower() + + self._shuffle = shuffle + # Vectorized version of shuffle. + # This is a performance improvement over using `from_tensor_slices`. + # The indices of the data are shuffled and batched, and these indices + # are then zipped with the data and used to extract a batch of the data + # at each step. The performance improvements here come from: + # 1. vectorized batch using gather + # 2. parallelized map + # 3. pipelined permutation generation + # 4. optimized permutation batching + # 5. disabled static optimizations + + indices_dataset = tf.data.Dataset.range(1) + if shuffle != "batch": + indices_dataset = indices_dataset.repeat(epochs) + + def permutation(_): + # It turns out to be more performant to make a new set of indices + # rather than reusing the same range Tensor. (presumably because of + # buffer forwarding.) + indices = tf.range(num_samples, dtype=tf.int64) + if shuffle and shuffle != "batch": + indices = tf.random.shuffle(indices) + return indices + + # We prefetch a single element. Computing large permutations can take + # quite a while so we don't want to wait for prefetching over an epoch + # boundary to trigger the next permutation. On the other hand, too many + # simultaneous shuffles can contend on a hardware level and degrade all + # performance. + indices_dataset = indices_dataset.map(permutation).prefetch(1) + + def slice_batch_indices(indices): + """Convert a Tensor of indices into a dataset of batched indices. + + This step can be accomplished in several ways. The most natural is + to slice the Tensor in a Dataset map. (With a condition on the upper + index to handle the partial batch.) 
However it turns out that + coercing the Tensor into a shape which is divisible by the batch + size (and handling the last partial batch separately) allows for a + much more favorable memory access pattern and improved performance. + + Args: + indices: Tensor which determines the data order for an entire + epoch. + + Returns: + A Dataset of batched indices. + """ + num_in_full_batch = num_full_batches * batch_size + first_k_indices = tf.slice(indices, [0], [num_in_full_batch]) + first_k_indices = tf.reshape( + first_k_indices, [num_full_batches, batch_size] + ) + + flat_dataset = tf.data.Dataset.from_tensor_slices(first_k_indices) + if self._partial_batch_size: + index_remainder = tf.data.Dataset.from_tensors( + tf.slice( + indices, [num_in_full_batch], [self._partial_batch_size] + ) + ) + flat_dataset = flat_dataset.concatenate(index_remainder) + + if shuffle == "batch": + # 1024 is a magic constant that has not been properly evaluated + flat_dataset = flat_dataset.shuffle(1024).repeat(epochs) + return flat_dataset + + indices_dataset = indices_dataset.flat_map(slice_batch_indices) + + dataset = self.slice_inputs(indices_dataset, inputs) + + if shuffle == "batch": + + def shuffle_batch(*batch): + return tf.nest.map_structure(tf.random.shuffle, batch) + + dataset = dataset.map(shuffle_batch) + + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.DATA + ) + dataset = dataset.with_options(options) + + self._dataset = dataset.prefetch(tf.data.AUTOTUNE) + + def slice_inputs(self, indices_dataset, inputs): + """Slice inputs into a Dataset of batches. + + Given a Dataset of batch indices and the unsliced inputs, + this step slices the inputs in a parallelized fashion + and produces a dataset of input batches. + + Args: + indices_dataset: A Dataset of batched indices + inputs: A python data structure that contains the inputs, targets, + and possibly sample weights. + + Returns: + A Dataset of input batches matching the batch indices. + """ + dataset = tf.data.Dataset.zip( + (indices_dataset, tf.data.Dataset.from_tensors(inputs).repeat()) + ) + + def grab_batch(i, data): + return tf.nest.map_structure( + lambda d: tf.gather(d, i, axis=0), data + ) + + dataset = dataset.map(grab_batch, num_parallel_calls=tf.data.AUTOTUNE) + + # Default optimizations are disabled to avoid the overhead of + # (unnecessary) input pipeline graph serialization and deserialization + options = tf.data.Options() + options.experimental_optimization.apply_default_optimizations = False + if self._shuffle: + # See b/141490660 for more details. + options.experimental_external_state_policy = ( + tf.data.experimental.ExternalStatePolicy.IGNORE + ) + dataset = dataset.with_options(options) + return dataset + + def get_dataset(self): + return self._dataset + + def get_size(self): + return self._size - def partial_batch_size(self): - return self._partial_batch_size or None + def batch_size(self): + return self._batch_size + + def has_partial_batch(self): + return self._partial_batch_size > 0 + + def partial_batch_size(self): + return self._partial_batch_size or None - def should_recreate_iterator(self): - # An infinite dataset is always created here. - return False + def should_recreate_iterator(self): + # An infinite dataset is always created here. + return False class GenericArrayLikeDataAdapter(TensorLikeDataAdapter): - """Adapter that handles array-like data without forcing it into memory. 
- - This adapter handles array-like datasets that may be too big to fully - fit into memory. - - Specifically, this adapter handles any Python class which implements: - `__get_item__`, `__len__`, `shape`, and `dtype` with the same meanings - as Numpy, but it ignores any case where all the inputs are Tensors or Numpy - arrays (because that case is handled by the base TensorLikeDataAdapter). - - It ignores scipy sparse matrices and Composite Tensors because those are - handled by the CompositeTensorDataAdapter. - - It also does not handle lists/tuples of scalars, because those are handled - by the ListsOfScalarsDataAdapter. - """ - - @staticmethod - def can_handle(x, y=None): - flat_inputs = tf.nest.flatten(x) - if y is not None: - flat_inputs += tf.nest.flatten(y) - - def _is_array_like(v): - """Return True if v is a Tensor, array, or is array-like.""" - return ( - hasattr(v, "__getitem__") and - hasattr(v, "shape") and - hasattr(v, "dtype") and - hasattr(v, "__len__") - ) - - if (not TensorLikeDataAdapter.can_handle(x, y) and - not CompositeTensorDataAdapter.can_handle(x, y)): - return all(_is_array_like(v) for v in flat_inputs) - else: - return False - - def __init__(self, *args, **kwargs): - logging.warning( - "Keras is training/fitting/evaluating on array-like data. Keras may " - "not be optimized for this format, so if your input data format is " - "supported by TensorFlow I/O (https://github.com/tensorflow/io) we " - "recommend using that to load a Dataset instead.") - - super().__init__(*args, **kwargs) + """Adapter that handles array-like data without forcing it into memory. - def slice_inputs(self, indices_dataset, inputs): - """Slice inputs into a Dataset of batches. + This adapter handles array-like datasets that may be too big to fully + fit into memory. - Given a Dataset of batch indices and the unsliced inputs, - this step slices the inputs in a parallelized fashion - and produces a dataset of input batches. + Specifically, this adapter handles any Python class which implements: + `__get_item__`, `__len__`, `shape`, and `dtype` with the same meanings + as Numpy, but it ignores any case where all the inputs are Tensors or Numpy + arrays (because that case is handled by the base TensorLikeDataAdapter). - Args: - indices_dataset: A Dataset of batched indices - inputs: A python data structure that contains the inputs, targets, - and possibly sample weights. + It ignores scipy sparse matrices and Composite Tensors because those are + handled by the CompositeTensorDataAdapter. - Returns: - A Dataset of input batches matching the batch indices. + It also does not handle lists/tuples of scalars, because those are handled + by the ListsOfScalarsDataAdapter. """ - flat_inputs = tf.nest.flatten(inputs) - def dynamic_shape_like(t): - shape = list(t.shape) - shape[0] = None - return tuple(shape) - - flat_dtypes = [inp.dtype for inp in flat_inputs] - contiguous = True - if self._shuffle and self._shuffle != "batch": - contiguous = False - - def grab_batch(indices): - """Grab a batch of data from the inputs.""" - # This uses a py_function to avoid converting the array-like - # into a Tensor before slicing it, because converting the array-like - # to a Tensor may force it into memory.. 
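For orientation, the inputs this adapter targets are objects that quack like arrays but should not be eagerly converted to Tensors, which is why the slicing here goes through `tf.py_function`. Something like this hypothetical `LazyArray` (name and behavior invented purely for illustration) is the kind of input `can_handle` accepts:

```python
import numpy as np

class LazyArray:
    """Array-like that materializes only the rows actually requested."""

    def __init__(self, n_rows, n_cols=4):
        self.shape = (n_rows, n_cols)
        self.dtype = np.dtype(np.float32)

    def __len__(self):
        return self.shape[0]

    def __getitem__(self, idx):
        # Pretend each requested row is computed or loaded on demand,
        # rather than held in memory up front.
        rows = np.atleast_1d(idx)
        return np.ones((rows.size, self.shape[1]), dtype=self.dtype)
```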
- def py_method(ind): - def slice_array(data): - return training_utils.slice_arrays(data, ind.numpy(), - contiguous=contiguous) - return [slice_array(inp) for inp in flat_inputs] - - flat_out = tf.py_function(py_method, [indices], flat_dtypes) - for v, original_inp in zip(flat_out, flat_inputs): - v.set_shape(dynamic_shape_like(original_inp)) - return tf.nest.pack_sequence_as(inputs, flat_out) - - dataset = indices_dataset.map( - grab_batch, num_parallel_calls=tf.data.AUTOTUNE) - - return dataset + + @staticmethod + def can_handle(x, y=None): + flat_inputs = tf.nest.flatten(x) + if y is not None: + flat_inputs += tf.nest.flatten(y) + + def _is_array_like(v): + """Return True if v is a Tensor, array, or is array-like.""" + return ( + hasattr(v, "__getitem__") + and hasattr(v, "shape") + and hasattr(v, "dtype") + and hasattr(v, "__len__") + ) + + if not TensorLikeDataAdapter.can_handle( + x, y + ) and not CompositeTensorDataAdapter.can_handle(x, y): + return all(_is_array_like(v) for v in flat_inputs) + else: + return False + + def __init__(self, *args, **kwargs): + logging.warning( + "Keras is training/fitting/evaluating on array-like data. Keras " + "may not be optimized for this format, so if your input data " + "format is supported by TensorFlow I/O " + "(https://github.com/tensorflow/io) we recommend using that to " + "load a Dataset instead." + ) + + super().__init__(*args, **kwargs) + + def slice_inputs(self, indices_dataset, inputs): + """Slice inputs into a Dataset of batches. + + Given a Dataset of batch indices and the unsliced inputs, + this step slices the inputs in a parallelized fashion + and produces a dataset of input batches. + + Args: + indices_dataset: A Dataset of batched indices + inputs: A python data structure that contains the inputs, targets, + and possibly sample weights. + + Returns: + A Dataset of input batches matching the batch indices. + """ + flat_inputs = tf.nest.flatten(inputs) + + def dynamic_shape_like(t): + shape = list(t.shape) + shape[0] = None + return tuple(shape) + + flat_dtypes = [inp.dtype for inp in flat_inputs] + contiguous = True + if self._shuffle and self._shuffle != "batch": + contiguous = False + + def grab_batch(indices): + """Grab a batch of data from the inputs.""" + # This uses a py_function to avoid converting the array-like + # into a Tensor before slicing it, because converting the array-like + # to a Tensor may force it into memory.. 
+ def py_method(ind): + def slice_array(data): + return training_utils.slice_arrays( + data, ind.numpy(), contiguous=contiguous + ) + + return [slice_array(inp) for inp in flat_inputs] + + flat_out = tf.py_function(py_method, [indices], flat_dtypes) + for v, original_inp in zip(flat_out, flat_inputs): + v.set_shape(dynamic_shape_like(original_inp)) + return tf.nest.pack_sequence_as(inputs, flat_out) + + dataset = indices_dataset.map( + grab_batch, num_parallel_calls=tf.data.AUTOTUNE + ) + + return dataset class DatasetCreatorAdapter(DataAdapter): - """Adapter that handles dataset functions.""" - - def __init__(self, x, y, steps=None, distribution_strategy=None, **kwargs): - super().__init__(x, **kwargs) - - if not isinstance(x, dataset_creator.DatasetCreator): - raise TypeError("The input of a `DatasetCreatorAdapter` should be a " - "`DatasetCreator` but it received type {}.".format( - type(x))) - if steps is None: - raise ValueError("When using a " - "`tf.keras.utils.experimental.DatasetCreator`, " - "`steps_per_epoch`, `validation_steps` or `steps` " - "argument must be provided in `Model.fit`, " - "`Model.evaluate`, or `Model.predict`.") - self.dataset_creator = x - self.steps = steps - self.strategy = distribution_strategy - - @staticmethod - def can_handle(x, y=None): - if isinstance(x, dataset_creator.DatasetCreator): - assert y is None - return True - - def should_recreate_iterator(self): - # We expect users to shuffle the dataset in their `dataset_fn` supplied to - # `DatasetCreator`. Since that is a buffered shuffle, we intend to not reset - # the dataset so the batches that are not shuffled can still be pulled. - return False - - def get_size(self): - return None # To be inferred by `DataHandler`. - - def get_dataset(self): - return self.strategy.distribute_datasets_from_function( - self.dataset_creator, options=self.dataset_creator.input_options) - - def batch_size(self): - raise NotImplementedError() - - def has_partial_batch(self): - raise NotImplementedError() - - def partial_batch_size(self): - raise NotImplementedError() + """Adapter that handles dataset functions.""" + + def __init__(self, x, y, steps=None, distribution_strategy=None, **kwargs): + super().__init__(x, **kwargs) + + if not isinstance(x, dataset_creator.DatasetCreator): + raise TypeError( + "The input of a `DatasetCreatorAdapter` should be a " + "`DatasetCreator` but it received type {}.".format(type(x)) + ) + if steps is None: + if not kwargs.get("pss_evaluation_shards"): + raise ValueError( + "When using a " + "`tf.keras.utils.experimental.DatasetCreator`, " + "`steps_per_epoch`, `validation_steps`, `steps`, or " + "`pss_evaluation_shards` argument must be provided in " + "`Model.fit`, `Model.evaluate`, or `Model.predict`." + ) + self.dataset_creator = x + self.steps = steps + self.strategy = distribution_strategy + + @staticmethod + def can_handle(x, y=None): + if isinstance(x, dataset_creator.DatasetCreator): + assert y is None + return True + + def should_recreate_iterator(self): + # We expect users to shuffle the dataset in their `dataset_fn` supplied + # to `DatasetCreator`. Since that is a buffered shuffle, we intend to + # not reset the dataset so the batches that are not shuffled can still + # be pulled. + return False + + def get_size(self): + return None # To be inferred by `DataHandler`. 
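Because `get_size` above intentionally returns `None`, the step count must come from the caller, which is exactly what the `steps is None` check in the constructor enforces. A sketch of the calling convention, assuming a user-supplied `dataset_fn` (in practice `DatasetCreator` is used together with parameter-server distribution, so the `fit` call is shown commented out):

```python
import tensorflow as tf

def dataset_fn(input_context):
    # Hypothetical toy pipeline; a real dataset_fn would shard and
    # shuffle according to `input_context`.
    ds = tf.data.Dataset.from_tensor_slices(
        (tf.ones((8, 4)), tf.ones((8, 1)))
    )
    return ds.repeat().batch(2)

creator = tf.keras.utils.experimental.DatasetCreator(dataset_fn)
# Since no size can be inferred, a step count is mandatory:
# model.fit(creator, epochs=2, steps_per_epoch=4)
```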
+ + def get_dataset(self): + return self.strategy.distribute_datasets_from_function( + self.dataset_creator, options=self.dataset_creator.input_options + ) + + def batch_size(self): + raise NotImplementedError() + + def has_partial_batch(self): + raise NotImplementedError() + + def partial_batch_size(self): + raise NotImplementedError() class CompositeTensorDataAdapter(DataAdapter): - """Adapter that handles composite tensor.""" - - @staticmethod - def can_handle(x, y=None): - flat_inputs = tf.nest.flatten(x) - if y is not None: - flat_inputs += tf.nest.flatten(y) - - def _is_composite(v): - # Dataset/iterator/DistributedDataset inherits from CompositeTensor but - # should be handled by DatasetAdapter and GeneratorAdapter. - if (tf_utils.is_extension_type(v) and - not isinstance(v, - (tf.data.Dataset, tf.data.Iterator)) and - not _is_distributed_dataset(v)): - return True - # Support Scipy sparse tensors if scipy is installed - return _is_scipy_sparse(v) - - def _is_tensor_or_composite(v): - if isinstance(v, (tf.Tensor, np.ndarray)): + """Adapter that handles composite tensor.""" + + @staticmethod + def can_handle(x, y=None): + flat_inputs = tf.nest.flatten(x) + if y is not None: + flat_inputs += tf.nest.flatten(y) + + def _is_composite(v): + # Dataset/iterator/DistributedDataset inherits from CompositeTensor + # but should be handled by DatasetAdapter and GeneratorAdapter. + if ( + tf_utils.is_extension_type(v) + and not isinstance(v, (tf.data.Dataset, tf.data.Iterator)) + and not _is_distributed_dataset(v) + ): + return True + # Support Scipy sparse tensors if scipy is installed + return _is_scipy_sparse(v) + + def _is_tensor_or_composite(v): + if isinstance(v, (tf.Tensor, np.ndarray)): + return True + return _is_composite(v) + + return any(_is_composite(v) for v in flat_inputs) and all( + _is_tensor_or_composite(v) for v in flat_inputs + ) + + def __init__( + self, + x, + y=None, + sample_weights=None, + sample_weight_modes=None, + batch_size=None, + steps=None, + shuffle=False, + **kwargs, + ): + super().__init__(x, y, **kwargs) + x, y, sample_weights = _process_tensorlike((x, y, sample_weights)) + sample_weight_modes = broadcast_sample_weight_modes( + sample_weights, sample_weight_modes + ) + + # If sample_weights are not specified for an output use 1.0 as weights. + (sample_weights, _, _) = training_utils.handle_partial_sample_weights( + y, sample_weights, sample_weight_modes, check_all_flat=True + ) + + inputs = pack_x_y_sample_weight(x, y, sample_weights) + + dataset = tf.data.Dataset.from_tensor_slices(inputs) + num_samples = int(tf.nest.flatten(x)[0].shape[0]) + if shuffle: + dataset = dataset.shuffle(num_samples) + + # If batch_size is not passed but steps is, calculate from the input + # data. Defaults to `32` for backwards compatibility. 
+ if not batch_size: + batch_size = int(math.ceil(num_samples / steps)) if steps else 32 + + dataset = dataset.batch(batch_size) + self._size = int(math.ceil(num_samples / batch_size)) + self._batch_size = batch_size + self._has_partial_batch = self._size != (num_samples // batch_size) + + self._partial_batch_size = None + if self._has_partial_batch: + self._partial_batch_size = ( + num_samples - (self._size - 1) * self._batch_size + ) + + self._dataset = dataset.prefetch(tf.data.AUTOTUNE) + + def get_dataset(self): + return self._dataset + + def get_size(self): + return self._size + + def batch_size(self): + return self._batch_size + + def has_partial_batch(self): + return self._has_partial_batch + + def partial_batch_size(self): + return self._partial_batch_size + + def should_recreate_iterator(self): return True - return _is_composite(v) - - return (any(_is_composite(v) for v in flat_inputs) and - all(_is_tensor_or_composite(v) for v in flat_inputs)) - - def __init__(self, - x, - y=None, - sample_weights=None, - sample_weight_modes=None, - batch_size=None, - steps=None, - shuffle=False, - **kwargs): - super().__init__(x, y, **kwargs) - x, y, sample_weights = _process_tensorlike((x, y, sample_weights)) - sample_weight_modes = broadcast_sample_weight_modes( - sample_weights, sample_weight_modes) - - # If sample_weights are not specified for an output use 1.0 as weights. - (sample_weights, _, _) = training_utils.handle_partial_sample_weights( - y, sample_weights, sample_weight_modes, check_all_flat=True) - - inputs = pack_x_y_sample_weight(x, y, sample_weights) - - dataset = tf.data.Dataset.from_tensor_slices(inputs) - num_samples = int(tf.nest.flatten(x)[0].shape[0]) - if shuffle: - dataset = dataset.shuffle(num_samples) - - # If batch_size is not passed but steps is, calculate from the input data. - # Default to 32 for backwards compatibility. 
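The partial-batch bookkeeping above is easy to check by hand; a small sketch with made-up numbers (100 samples batched by 32):

```python
import math

num_samples, batch_size = 100, 32

size = int(math.ceil(num_samples / batch_size))  # 4 batches in total
# A partial batch exists exactly when ceil and floor division disagree.
has_partial_batch = size != (num_samples // batch_size)

partial_batch_size = None
if has_partial_batch:
    # Whatever remains after the (size - 1) full batches: 100 - 96 = 4.
    partial_batch_size = num_samples - (size - 1) * batch_size

assert (size, has_partial_batch, partial_batch_size) == (4, True, 4)
```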
- if not batch_size: - batch_size = int(math.ceil(num_samples / steps)) if steps else 32 - - dataset = dataset.batch(batch_size) - self._size = int(math.ceil(num_samples / batch_size)) - self._batch_size = batch_size - self._has_partial_batch = (self._size != (num_samples // batch_size)) - - self._partial_batch_size = None - if self._has_partial_batch: - self._partial_batch_size = ( - num_samples - (self._size - 1) * self._batch_size) - - self._dataset = dataset - - def get_dataset(self): - return self._dataset - - def get_size(self): - return self._size - - def batch_size(self): - return self._batch_size - - def has_partial_batch(self): - return self._has_partial_batch - - def partial_batch_size(self): - return self._partial_batch_size - - def should_recreate_iterator(self): - return True class ListsOfScalarsDataAdapter(DataAdapter): - """Adapter that handles lists of scalars and lists of lists of scalars.""" - - @staticmethod - def can_handle(x, y=None): - handles_x = ListsOfScalarsDataAdapter._is_list_of_scalars(x) - handles_y = True - if y is not None: - handles_y = ListsOfScalarsDataAdapter._is_list_of_scalars(y) - return handles_x and handles_y - - @staticmethod - def _is_list_of_scalars(inp): - if isinstance(inp, (float, int, str, bytes, bytearray)): - return True - if isinstance(inp, (list, tuple)) and inp: - return ListsOfScalarsDataAdapter._is_list_of_scalars(inp[0]) - return False - - def __init__(self, - x, - y=None, - sample_weights=None, - sample_weight_modes=None, - batch_size=None, - shuffle=False, - **kwargs): - super().__init__(x, y, **kwargs) - x = np.asarray(x) - if y is not None: - y = np.asarray(y) - if sample_weights is not None: - sample_weights = np.asarray(sample_weights) - sample_weight_modes = broadcast_sample_weight_modes( - sample_weights, sample_weight_modes) - - self._internal_adapter = TensorLikeDataAdapter( + """Adapter that handles lists of scalars and lists of lists of scalars.""" + + @staticmethod + def can_handle(x, y=None): + handles_x = ListsOfScalarsDataAdapter._is_list_of_scalars(x) + handles_y = True + if y is not None: + handles_y = ListsOfScalarsDataAdapter._is_list_of_scalars(y) + return handles_x and handles_y + + @staticmethod + def _is_list_of_scalars(inp): + if isinstance(inp, (float, int, str, bytes, bytearray)): + return True + if isinstance(inp, (list, tuple)) and inp: + return ListsOfScalarsDataAdapter._is_list_of_scalars(inp[0]) + return False + + def __init__( + self, x, - y=y, - sample_weights=sample_weights, - sample_weight_modes=sample_weight_modes, - batch_size=batch_size, - shuffle=shuffle, - **kwargs) - - def get_dataset(self): - return self._internal_adapter.get_dataset() - - def get_size(self): - return self._internal_adapter.get_size() - - def batch_size(self): - return self._internal_adapter.batch_size() - - def has_partial_batch(self): - return self._internal_adapter.has_partial_batch() - - def partial_batch_size(self): - return self._internal_adapter.partial_batch_size() - - def should_recreate_iterator(self): - return True + y=None, + sample_weights=None, + sample_weight_modes=None, + batch_size=None, + shuffle=False, + **kwargs, + ): + super().__init__(x, y, **kwargs) + x = np.asarray(x) + if y is not None: + y = np.asarray(y) + if sample_weights is not None: + sample_weights = np.asarray(sample_weights) + sample_weight_modes = broadcast_sample_weight_modes( + sample_weights, sample_weight_modes + ) + + self._internal_adapter = TensorLikeDataAdapter( + x, + y=y, + sample_weights=sample_weights, + 
sample_weight_modes=sample_weight_modes, + batch_size=batch_size, + shuffle=shuffle, + **kwargs, + ) + + def get_dataset(self): + return self._internal_adapter.get_dataset() + + def get_size(self): + return self._internal_adapter.get_size() + + def batch_size(self): + return self._internal_adapter.batch_size() + + def has_partial_batch(self): + return self._internal_adapter.has_partial_batch() + + def partial_batch_size(self): + return self._internal_adapter.partial_batch_size() + + def should_recreate_iterator(self): + return True class DatasetAdapter(DataAdapter): - """Adapter that handles `tf.data.Dataset`.""" - - @staticmethod - def can_handle(x, y=None): - return (isinstance(x, (tf.compat.v1.data.Dataset, tf.data.Dataset)) or - _is_distributed_dataset(x)) - - def __init__(self, - x, - y=None, - sample_weights=None, - steps=None, - **kwargs): - super().__init__(x, y, **kwargs) - # Note that the dataset instance is immutable, its fine to reuse the user - # provided dataset. - self._dataset = x - - # The user-provided steps. - self._user_steps = steps - - self._validate_args(y, sample_weights, steps) - - def get_dataset(self): - return self._dataset - - def get_size(self): - return # Inferred in `DataHandler`. - - def batch_size(self): - return None - - def has_partial_batch(self): - return False - - def partial_batch_size(self): - return None - - def should_recreate_iterator(self): - # Since DistributedDatasets have no cardinality, the user must provide - # all steps that need to be run, calling `.repeat()` as needed. - if _is_distributed_dataset(self._dataset): - return False - - # If user doesn't supply `steps`, or if they supply `steps` that - # exactly equals the size of the `Dataset`, create a new iterator - # each epoch. - return (self._user_steps is None or - tf.data.experimental.cardinality(self._dataset).numpy() == self._user_steps) - - def _validate_args(self, y, sample_weights, steps): - """Validates `__init__` arguments.""" - # Arguments that shouldn't be passed. - if not is_none_or_empty(y): - raise ValueError("`y` argument is not supported when using " - "dataset as input.") - if not is_none_or_empty(sample_weights): - raise ValueError("`sample_weight` argument is not supported when using " - "dataset as input.") - - if steps is None: - if _is_distributed_dataset(self._dataset): - raise ValueError("When providing a distributed dataset, you must " - "specify the number of steps to run.") - - size = tf.data.experimental.cardinality(self._dataset).numpy() - if size == tf.data.experimental.INFINITE_CARDINALITY and steps is None: - raise ValueError( - "When providing an infinite dataset, you must specify " - "the number of steps to run (if you did not intend to " - "create an infinite dataset, make sure to not call " - "`repeat()` on the dataset).") + """Adapter that handles `tf.data.Dataset`.""" + + @staticmethod + def can_handle(x, y=None): + return isinstance( + x, (tf.compat.v1.data.Dataset, tf.data.Dataset) + ) or _is_distributed_dataset(x) + + def __init__(self, x, y=None, sample_weights=None, steps=None, **kwargs): + super().__init__(x, y, **kwargs) + # Note that the dataset instance is immutable, it's fine to reuse the + # user-provided dataset. + self._dataset = x + + # The user-provided steps. + self._user_steps = steps + + self._validate_args( + y, sample_weights, steps, kwargs.get("pss_evaluation_shards") + ) + + def get_dataset(self): + return self._dataset + + def get_size(self): + return # Inferred in `DataHandler`.
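The `DatasetAdapter.should_recreate_iterator` rule here (recreate the iterator each epoch unless the user asked for fewer steps than the dataset holds) can be illustrated on a toy finite dataset; this is a standalone sketch of the logic, not the adapter itself:

```python
import tensorflow as tf

dataset = tf.data.Dataset.range(10).batch(2)
# A finite dataset reports an exact cardinality: 5 batches here.
cardinality = tf.data.experimental.cardinality(dataset).numpy()

def should_recreate_iterator(user_steps):
    # Fresh iterator per epoch when `steps` is unset or spans the whole
    # dataset; otherwise keep it so unread batches carry into the next
    # epoch.
    return user_steps is None or cardinality == user_steps

assert should_recreate_iterator(None)
assert should_recreate_iterator(5)
assert not should_recreate_iterator(2)  # partial epoch: reuse the iterator
```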
+ + def batch_size(self): + return None + + def has_partial_batch(self): + return False + + def partial_batch_size(self): + return None + + def should_recreate_iterator(self): + # Since DistributedDatasets have no cardinality, the user must provide + # all steps that need to be run, calling `.repeat()` as needed. + if _is_distributed_dataset(self._dataset): + return False + + # If user doesn't supply `steps`, or if they supply `steps` that + # exactly equals the size of the `Dataset`, create a new iterator + # each epoch. + return ( + self._user_steps is None + or tf.data.experimental.cardinality(self._dataset).numpy() + == self._user_steps + ) + + def _validate_args(self, y, sample_weights, steps, pss_evaluation_shards): + """Validates `__init__` arguments.""" + # Arguments that shouldn't be passed. + if not is_none_or_empty(y): + raise ValueError( + "`y` argument is not supported when using dataset as input." + ) + if not is_none_or_empty(sample_weights): + raise ValueError( + "`sample_weight` argument is not supported when using " + "dataset as input." + ) + + if steps is None: + if _is_distributed_dataset(self._dataset): + if not pss_evaluation_shards: + raise ValueError( + "When providing a distributed dataset, you must " + "specify the number of steps to run." + ) + else: + size = tf.data.experimental.cardinality(self._dataset).numpy() + if size == tf.data.experimental.INFINITE_CARDINALITY: + if pss_evaluation_shards: + raise ValueError( + "When performing exact evaluation, the dataset " + "must be finite. Make sure not to call `repeat()` " + "on your dataset." + ) + else: + raise ValueError( + "When providing an infinite dataset, you must " + "specify the number of steps to run (if you did " + "not intend to create an infinite dataset, make " + "sure to not call `repeat()` on the dataset)." + ) class GeneratorDataAdapter(DataAdapter): - """Adapter that handles python generators and iterators.""" + """Adapter that handles python generators and iterators.""" - @staticmethod - def can_handle(x, y=None): - return ((hasattr(x, "__next__") or hasattr(x, "next")) + @staticmethod + def can_handle(x, y=None): + return ( + (hasattr(x, "__next__") or hasattr(x, "next")) and hasattr(x, "__iter__") - and not isinstance(x, data_utils.Sequence)) - - def __init__(self, - x, - y=None, - sample_weights=None, - workers=1, - use_multiprocessing=False, - max_queue_size=10, - model=None, - **kwargs): - # Generators should never shuffle as exhausting the generator in order to - # shuffle the batches is inefficient. - kwargs.pop("shuffle", None) - - if not is_none_or_empty(y): - raise ValueError("`y` argument is not supported when using " - "python generator as input.") - if not is_none_or_empty(sample_weights): - raise ValueError("`sample_weight` argument is not supported when using " - "python generator as input.") - - super().__init__(x, y, **kwargs) - - # Since we have to know the dtype of the python generator when we build the - # dataset, we have to look at a batch to infer the structure. - peek, x = self._peek_and_restore(x) - peek = self._standardize_batch(peek) - peek = _process_tensorlike(peek) - - # Need to build the Model on concrete input shapes. - if model is not None and not model.built: - concrete_x, _, _ = unpack_x_y_sample_weight(peek) - try: - model.distribute_strategy.run( - lambda x: model(x, training=False), args=(concrete_x,)) - except NotImplementedError: - # The above call may fail if the model is a container-like class that - # does not implement its own forward pass (e.g. 
a GAN or VAE where the - # forward pass is handled by subcomponents). - # Such a model does not need to be built. - pass - - self._first_batch_size = int(tf.nest.flatten(peek)[0].shape[0]) - - def _get_tensor_spec(t): - # TODO(b/226395276): Remove _with_tensor_ranks_only usage. - return type_spec.type_spec_from_value(t)._with_tensor_ranks_only() # pylint: disable=protected-access - - output_signature = tf.nest.map_structure(_get_tensor_spec, peek) - - # Note that dataset API takes a callable that creates a generator object, - # rather than generator itself, which is why we define a function here. - generator_fn = self._handle_multiprocessing(x, workers, use_multiprocessing, - max_queue_size) - - def wrapped_generator(): - for data in generator_fn(): - yield self._standardize_batch(data) - - dataset = tf.data.Dataset.from_generator( - wrapped_generator, output_signature=output_signature) - - if workers == 1 and not use_multiprocessing: - dataset = dataset.prefetch(1) - - self._dataset = dataset - - def _standardize_batch(self, data): - """Standardizes a batch output by a generator.""" - # Removes `None`s. - x, y, sample_weight = unpack_x_y_sample_weight(data) - data = pack_x_y_sample_weight(x, y, sample_weight) - - data = tf.__internal__.nest.list_to_tuple(data) + and not isinstance(x, data_utils.Sequence) + ) - def _convert_dtype(t): - if (isinstance(t, np.ndarray) and issubclass(t.dtype.type, np.floating)): - return np.array(t, dtype=backend.floatx()) - return t - - data = tf.nest.map_structure(_convert_dtype, data) - return data - - @staticmethod - def _peek_and_restore(x): - peek = next(x) - return peek, itertools.chain([peek], x) - - def _handle_multiprocessing(self, x, workers, use_multiprocessing, - max_queue_size): - """Create a callable, possibly including an Enqueuer.""" - if workers > 1 or (workers > 0 and use_multiprocessing): - def generator_fn(): - enqueuer = data_utils.GeneratorEnqueuer( - x, use_multiprocessing=use_multiprocessing) - enqueuer.start(workers=workers, max_queue_size=max_queue_size) - return enqueuer.get() - else: - generator_fn = lambda: x - return generator_fn - - def get_dataset(self): - return self._dataset - - def get_size(self): - return None - - def batch_size(self): - return None - - def representative_batch_size(self): - return self._first_batch_size + def __init__( + self, + x, + y=None, + sample_weights=None, + workers=1, + use_multiprocessing=False, + max_queue_size=10, + model=None, + **kwargs, + ): + # Generators should never shuffle as exhausting the generator in order + # to shuffle the batches is inefficient. + kwargs.pop("shuffle", None) + + if not is_none_or_empty(y): + raise ValueError( + "`y` argument is not supported when using " + "python generator as input." + ) + if not is_none_or_empty(sample_weights): + raise ValueError( + "`sample_weight` argument is not supported when using " + "python generator as input." + ) + + super().__init__(x, y, **kwargs) + + # Since we have to know the dtype of the python generator when we build + # the dataset, we have to look at a batch to infer the structure. + peek, x = self._peek_and_restore(x) + peek = self._standardize_batch(peek) + peek = _process_tensorlike(peek) + + # Need to build the Model on concrete input shapes. 
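The `_peek_and_restore` trick used above (inspect one batch to infer structure and dtypes without consuming it) is just `itertools.chain`; a standalone sketch of the pattern:

```python
import itertools

def peek_and_restore(generator):
    # Pull one element to inspect dtypes/shapes, then splice it back in
    # front so the downstream consumer still sees the full stream.
    peek = next(generator)
    return peek, itertools.chain([peek], generator)

gen = iter(range(3))
first, gen = peek_and_restore(gen)
assert first == 0
assert list(gen) == [0, 1, 2]  # nothing was lost by peeking
```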
+ if model is not None and not model.built: + concrete_x, _, _ = unpack_x_y_sample_weight(peek) + try: + model.distribute_strategy.run( + lambda x: model(x, training=False), args=(concrete_x,) + ) + except NotImplementedError: + # The above call may fail if the model is a container-like class + # that does not implement its own forward pass (e.g. a GAN or + # VAE where the forward pass is handled by subcomponents). Such + # a model does not need to be built. + pass + + self._first_batch_size = int(tf.nest.flatten(peek)[0].shape[0]) + + def _get_tensor_spec(t): + # TODO(b/226395276): Remove _with_tensor_ranks_only usage. + return type_spec.type_spec_from_value(t)._with_tensor_ranks_only() + + output_signature = tf.nest.map_structure(_get_tensor_spec, peek) + + # Note that dataset API takes a callable that creates a generator + # object, rather than generator itself, which is why we define a + # function here. + generator_fn = self._handle_multiprocessing( + x, workers, use_multiprocessing, max_queue_size + ) + + def wrapped_generator(): + for data in generator_fn(): + yield self._standardize_batch(data) + + dataset = tf.data.Dataset.from_generator( + wrapped_generator, output_signature=output_signature + ) + + if workers == 1 and not use_multiprocessing: + dataset = dataset.prefetch(1) + + self._dataset = dataset.prefetch(tf.data.AUTOTUNE) + + def _standardize_batch(self, data): + """Standardizes a batch output by a generator.""" + # Removes `None`s. + x, y, sample_weight = unpack_x_y_sample_weight(data) + data = pack_x_y_sample_weight(x, y, sample_weight) + + data = tf.__internal__.nest.list_to_tuple(data) + + def _convert_dtype(t): + if isinstance(t, np.ndarray) and issubclass( + t.dtype.type, np.floating + ): + return np.array(t, dtype=backend.floatx()) + return t + + data = tf.nest.map_structure(_convert_dtype, data) + return data + + @staticmethod + def _peek_and_restore(x): + peek = next(x) + return peek, itertools.chain([peek], x) + + def _handle_multiprocessing( + self, x, workers, use_multiprocessing, max_queue_size + ): + """Create a callable, possibly including an Enqueuer.""" + if workers > 1 or (workers > 0 and use_multiprocessing): + + def generator_fn(): + enqueuer = data_utils.GeneratorEnqueuer( + x, use_multiprocessing=use_multiprocessing + ) + enqueuer.start(workers=workers, max_queue_size=max_queue_size) + return enqueuer.get() + + else: + generator_fn = lambda: x + return generator_fn + + def get_dataset(self): + return self._dataset + + def get_size(self): + return None + + def batch_size(self): + return None - def has_partial_batch(self): - return False + def representative_batch_size(self): + return self._first_batch_size - def partial_batch_size(self): - return + def has_partial_batch(self): + return False + + def partial_batch_size(self): + return - def should_recreate_iterator(self): - return False + def should_recreate_iterator(self): + return False class KerasSequenceAdapter(GeneratorDataAdapter): - """Adapter that handles `keras.utils.Sequence`.""" - - @staticmethod - def can_handle(x, y=None): - return isinstance(x, data_utils.Sequence) - - def __init__(self, - x, - y=None, - sample_weights=None, - shuffle=False, - workers=1, - use_multiprocessing=False, - max_queue_size=10, - model=None, - **kwargs): - if not is_none_or_empty(y): - raise ValueError("`y` argument is not supported when using " - "`keras.utils.Sequence` as input.") - if not is_none_or_empty(sample_weights): - raise ValueError("`sample_weight` argument is not supported when using " - 
"`keras.utils.Sequence` as input.") - - self._shuffle_sequence = shuffle - self._keras_sequence = x - self._enqueuer = None - super().__init__( - x, - shuffle=False, # Shuffle is handed in the _make_callable override. - workers=workers, - use_multiprocessing=use_multiprocessing, - max_queue_size=max_queue_size, - model=model, - **kwargs) - - @staticmethod - def _peek_and_restore(x): - return x[0], x - - def _handle_multiprocessing(self, x, workers, use_multiprocessing, - max_queue_size): - if workers > 1 or (workers > 0 and use_multiprocessing): - def generator_fn(): - self._enqueuer = data_utils.OrderedEnqueuer( - x, use_multiprocessing=use_multiprocessing, - shuffle=self._shuffle_sequence) - self._enqueuer.start(workers=workers, max_queue_size=max_queue_size) - return self._enqueuer.get() - else: - def generator_fn(): - order = range(len(x)) - if self._shuffle_sequence: - # Match the shuffle convention in OrderedEnqueuer. - order = list(order) - random.shuffle(order) - - for i in order: - yield x[i] + """Adapter that handles `keras.utils.Sequence`.""" - return generator_fn + @staticmethod + def can_handle(x, y=None): + return isinstance(x, data_utils.Sequence) - def get_size(self): - return len(self._keras_sequence) - - def should_recreate_iterator(self): - return True + def __init__( + self, + x, + y=None, + sample_weights=None, + shuffle=False, + workers=1, + use_multiprocessing=False, + max_queue_size=10, + model=None, + **kwargs, + ): + if not is_none_or_empty(y): + raise ValueError( + "`y` argument is not supported when using " + "`keras.utils.Sequence` as input." + ) + if not is_none_or_empty(sample_weights): + raise ValueError( + "`sample_weight` argument is not supported when using " + "`keras.utils.Sequence` as input." + ) + + self._shuffle_sequence = shuffle + self._keras_sequence = x + self._enqueuer = None + super().__init__( + x, + shuffle=False, # Shuffle is handed in the _make_callable override. + workers=workers, + use_multiprocessing=use_multiprocessing, + max_queue_size=max_queue_size, + model=model, + **kwargs, + ) + + @staticmethod + def _peek_and_restore(x): + return x[0], x + + def _handle_multiprocessing( + self, x, workers, use_multiprocessing, max_queue_size + ): + if workers > 1 or (workers > 0 and use_multiprocessing): + + def generator_fn(): + self._enqueuer = data_utils.OrderedEnqueuer( + x, + use_multiprocessing=use_multiprocessing, + shuffle=self._shuffle_sequence, + ) + self._enqueuer.start( + workers=workers, max_queue_size=max_queue_size + ) + return self._enqueuer.get() + + else: + + def generator_fn(): + order = range(len(x)) + if self._shuffle_sequence: + # Match the shuffle convention in OrderedEnqueuer. 
+ order = list(order) + random.shuffle(order) + + for i in order: + yield x[i] + + return generator_fn + + def get_size(self): + return len(self._keras_sequence) + + def should_recreate_iterator(self): + return True - def on_epoch_end(self): - if self._enqueuer: - self._enqueuer.stop() - self._keras_sequence.on_epoch_end() + def on_epoch_end(self): + if self._enqueuer: + self._enqueuer.stop() + self._keras_sequence.on_epoch_end() ALL_ADAPTER_CLS = [ - ListsOfScalarsDataAdapter, TensorLikeDataAdapter, - GenericArrayLikeDataAdapter, DatasetAdapter, GeneratorDataAdapter, - KerasSequenceAdapter, CompositeTensorDataAdapter, DatasetCreatorAdapter + ListsOfScalarsDataAdapter, + TensorLikeDataAdapter, + GenericArrayLikeDataAdapter, + DatasetAdapter, + GeneratorDataAdapter, + KerasSequenceAdapter, + CompositeTensorDataAdapter, + DatasetCreatorAdapter, +] + +UNSHARDABLE_DATASET_TYPES = [ + from_generator_op._GeneratorDataset, + range_op._RangeDataset, + from_sparse_tensor_slices_op._SparseTensorSliceDataset, + from_tensors_op._TensorDataset, + from_tensor_slices_op._TensorSliceDataset, ] def select_data_adapter(x, y): - """Selects a data adapter that can handle a given x and y.""" - adapter_cls = [cls for cls in ALL_ADAPTER_CLS if cls.can_handle(x, y)] - if not adapter_cls: - # TODO(scottzhu): This should be a less implementation-specific error. - raise ValueError( - "Failed to find data adapter that can handle " - "input: {}, {}".format( - _type_name(x), _type_name(y))) - elif len(adapter_cls) > 1: - raise RuntimeError( - "Data adapters should be mutually exclusive for " - "handling inputs. Found multiple adapters {} to handle " - "input: {}, {}".format( - adapter_cls, _type_name(x), _type_name(y))) - # Instrument the data adapter usage before returning it - keras_data_adapter_gauge.get_cell(adapter_cls[0].__name__).set(True) - return adapter_cls[0] + """Selects a data adapter that can handle a given x and y.""" + adapter_cls = [cls for cls in ALL_ADAPTER_CLS if cls.can_handle(x, y)] + if not adapter_cls: + # TODO(scottzhu): This should be a less implementation-specific error. + raise ValueError( + "Failed to find data adapter that can handle input: {}, {}".format( + _type_name(x), _type_name(y) + ) + ) + elif len(adapter_cls) > 1: + raise RuntimeError( + "Data adapters should be mutually exclusive for " + "handling inputs. 
Found multiple adapters {} to handle " + "input: {}, {}".format(adapter_cls, _type_name(x), _type_name(y)) + ) + # Instrument the data adapter usage before returning it + keras_data_adapter_gauge.get_cell(adapter_cls[0].__name__).set(True) + return adapter_cls[0] def _type_name(x): - """Generates a description of the type of an object.""" - if isinstance(x, dict): - key_types = set(_type_name(key) for key in x.keys()) - val_types = set(_type_name(key) for key in x.values()) - return "({} containing {} keys and {} values)".format( - type(x), key_types, val_types) - if isinstance(x, (list, tuple)): - types = set(_type_name(val) for val in x) - return "({} containing values of types {})".format( - type(x), types) - return str(type(x)) + """Generates a description of the type of an object.""" + if isinstance(x, dict): + key_types = set(_type_name(key) for key in x.keys()) + val_types = set(_type_name(key) for key in x.values()) + return f"({type(x)} containing {key_types} keys and {val_types} values)" + if isinstance(x, (list, tuple)): + types = set(_type_name(val) for val in x) + return f"({type(x)} containing values of types {types})" + return str(type(x)) def _process_tensorlike(inputs): - """Process tensor-like inputs. + """Process tensor-like inputs. - This function: + This function: - (1) Converts `Numpy` arrays to `Tensor`s. - (2) Converts `Scipy` sparse matrices to `SparseTensor`s. - (3) Converts `pandas.Series` to `Tensor`s - (4) Converts `list`s to `tuple`s (for `tf.data` support). + (1) Converts `Numpy` arrays to `Tensor`s. + (2) Converts `Scipy` sparse matrices to `SparseTensor`s. + (3) Converts `pandas.Series` to `Tensor`s + (4) Converts `list`s to `tuple`s (for `tf.data` support). - Args: - inputs: Structure of `Tensor`s, `NumPy` arrays, or tensor-like. + Args: + inputs: Structure of `Tensor`s, `NumPy` arrays, or tensor-like. - Returns: - Structure of `Tensor`s or tensor-like. - """ + Returns: + Structure of `Tensor`s or tensor-like. + """ - def _convert_single_tensor(x): - if _is_pandas_series(x): - x = np.expand_dims(x.to_numpy(), axis=-1) + def _convert_single_tensor(x): + if _is_pandas_series(x): + x = np.expand_dims(x.to_numpy(), axis=-1) - if isinstance(x, np.ndarray): - dtype = None - if issubclass(x.dtype.type, np.floating): - dtype = backend.floatx() - return tf.convert_to_tensor(x, dtype=dtype) - elif _is_scipy_sparse(x): - return _scipy_sparse_to_sparse_tensor(x) - return x + if isinstance(x, np.ndarray): + dtype = None + if issubclass(x.dtype.type, np.floating): + dtype = backend.floatx() + return tf.convert_to_tensor(x, dtype=dtype) + elif _is_scipy_sparse(x): + return _scipy_sparse_to_sparse_tensor(x) + return x - inputs = tf.nest.map_structure(_convert_single_tensor, inputs) - return tf.__internal__.nest.list_to_tuple(inputs) + inputs = tf.nest.map_structure(_convert_single_tensor, inputs) + return tf.__internal__.nest.list_to_tuple(inputs) def is_none_or_empty(inputs): - # util method to check if the input is a None or a empty list. - # the python "not" check will raise an error like below if the input is a - # numpy array - # "The truth value of an array with more than one element is ambiguous. - # Use a.any() or a.all()" - return inputs is None or not tf.nest.flatten(inputs) + # util method to check if the input is None or an empty list. + # the python "not" check will raise an error like below if the input is a + # numpy array + # "The truth value of an array with more than one element is ambiguous.
+ # Use a.any() or a.all()" + return inputs is None or not tf.nest.flatten(inputs) def broadcast_sample_weight_modes(target_structure, sample_weight_modes): - """Match sample_weight_modes structure with output structure.""" - if target_structure is None or not tf.nest.flatten(target_structure): + """Match sample_weight_modes structure with output structure.""" + if target_structure is None or not tf.nest.flatten(target_structure): + return sample_weight_modes + + if isinstance(sample_weight_modes, str): + if isinstance(target_structure, dict): + return {key: sample_weight_modes for key in target_structure.keys()} + return [sample_weight_modes for _ in target_structure] + + if sample_weight_modes: + try: + tf.nest.assert_same_structure( + training_utils.list_to_tuple(target_structure), + training_utils.list_to_tuple(sample_weight_modes), + ) + except (ValueError, TypeError): + target_str = str( + tf.nest.map_structure(lambda _: "...", target_structure) + ) + mode_str = str( + tf.nest.map_structure(lambda _: "...", sample_weight_modes) + ) + + # Attempt to coerce sample_weight_modes to the target structure. + # This implicitly depends on the fact that Model flattens outputs + # for its internal representation. + try: + sample_weight_modes = tf.nest.pack_sequence_as( + target_structure, tf.nest.flatten(sample_weight_modes) + ) + logging.warning( + "sample_weight modes were coerced from\n " + "{}\n to \n {}".format(target_str, mode_str) + ) + except (ValueError, TypeError): + raise ValueError( + "Unable to match target structure and sample_weight_modes " + "structure:\n {}\n to \n {}".format( + target_str, mode_str + ) + ) + return sample_weight_modes - if isinstance(sample_weight_modes, str): - if isinstance(target_structure, dict): - return {key: sample_weight_modes for key in target_structure.keys()} - return [sample_weight_modes for _ in target_structure] - if sample_weight_modes: - try: - tf.nest.assert_same_structure( - training_utils.list_to_tuple(target_structure), - training_utils.list_to_tuple(sample_weight_modes)) - except (ValueError, TypeError): - target_str = str(tf.nest.map_structure(lambda _: "...", target_structure)) - mode_str = str( - tf.nest.map_structure(lambda _: "...", sample_weight_modes)) - - # Attempt to coerce sample_weight_modes to the target structure. This - # implicitly depends on the fact that Model flattens outputs for its - # internal representation. - try: - sample_weight_modes = tf.nest.pack_sequence_as( - target_structure, tf.nest.flatten(sample_weight_modes)) +class DataHandler: + """Handles iterating over epoch-level `tf.data.Iterator` objects.""" + + def __init__( + self, + x, + y=None, + sample_weight=None, + batch_size=None, + steps_per_epoch=None, + initial_epoch=0, + epochs=1, + shuffle=False, + class_weight=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + model=None, + steps_per_execution=None, + distribute=True, + pss_evaluation_shards=0, + ): + """Initializes a `DataHandler`. + + Arguments: + x: See `Model.fit`. + y: See `Model.fit`. + sample_weight: See `Model.fit`. + batch_size: See `Model.fit`. + steps_per_epoch: See `Model.fit`. + initial_epoch: See `Model.fit`. + epochs: See `Model.fit`. + shuffle: See `Model.fit`. + class_weight: See `Model.fit`. + max_queue_size: See `Model.fit`. + workers: See `Model.fit`. + use_multiprocessing: See `Model.fit`. + model: The `Model` instance. Needed in order to correctly `build` the + `Model` using generator-like inputs (see `GeneratorDataAdapter`). 
+ steps_per_execution: See `Model.compile`. + distribute: Whether to distribute the `tf.dataset`. + `PreprocessingLayer.adapt` does not support distributed datasets, + `Model` should always set this to `True`. + pss_evaluation_shards: See `Model.fit`. + """ + + self._initial_epoch = initial_epoch + self._initial_step = 0 + self._epochs = epochs + self._insufficient_data = False + self._model = model + + if steps_per_epoch == 0: + raise ValueError( + "Unexpected value for `steps_per_epoch`. Received value is 0. " + "Please check the docstring for `model.fit()` for supported " + "values." + ) + + self._steps_per_epoch = steps_per_epoch + + # `steps_per_execution_value` is the cached initial value. + # `steps_per_execution` is mutable and may be changed by the DataAdapter + # to handle partial executions. + if steps_per_execution is None: + self._steps_per_execution = tf.Variable(1) + else: + self._steps_per_execution = steps_per_execution + + adapter_cls = select_data_adapter(x, y) + self._adapter = adapter_cls( + x, + y, + batch_size=batch_size, + steps=steps_per_epoch, + epochs=epochs - initial_epoch, + sample_weights=sample_weight, + shuffle=shuffle, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + distribution_strategy=tf.distribute.get_strategy(), + model=model, + pss_evaluation_shards=pss_evaluation_shards, + ) + + strategy = tf.distribute.get_strategy() + + self._current_step = 0 + self._step_increment = self._steps_per_execution.numpy().item() - 1 + self._insufficient_data = False + + self._configure_dataset_and_inferred_steps( + strategy, x, steps_per_epoch, class_weight, distribute + ) + + if self._inferred_steps == 0: + raise ValueError("Expected input data to be non-empty.") + + def _configure_dataset_and_inferred_steps( + self, strategy, x, steps_per_epoch, class_weight, distribute + ): + """Configure the `_dataset` and `_inferred_steps` attributes.""" + del x + dataset = self._adapter.get_dataset() + if class_weight: + dataset = dataset.map(_make_class_weight_map_fn(class_weight)) + self._inferred_steps = self._infer_steps(steps_per_epoch, dataset) + + # `PreprocessingLayer.adapt` does not currently support distributed + # datasets, so we pass `distribute=False` there. + if distribute and not _is_distributed_dataset(dataset): + dataset = strategy.experimental_distribute_dataset(dataset) + self._dataset = dataset + self._validate_data_handler() + + def enumerate_epochs(self): + """Yields `(epoch, tf.data.Iterator)`.""" + with self._truncate_execution_to_epoch(): + data_iterator = iter(self._dataset) + for epoch in range(self._initial_epoch, self._epochs): + if self._insufficient_data: # Set by `catch_stop_iteration`. 
+ break + if self._adapter.should_recreate_iterator(): + data_iterator = iter(self._dataset) + if not isinstance(self._dataset, DistributedDataset): + steps = self._infer_steps( + self._steps_per_epoch, self._dataset + ) + if steps is not None: + self._inferred_steps = steps + yield epoch, data_iterator + self._adapter.on_epoch_end() + + @contextlib.contextmanager + def _truncate_execution_to_epoch(self): + """Truncates steps per execution to at most one epoch.""" + should_truncate = ( + self._inferred_steps is not None + and self._steps_per_execution.numpy().item() > self._inferred_steps + ) + original_value = self._steps_per_execution.numpy().item() + try: + if should_truncate: + self._steps_per_execution.assign(self._inferred_steps) + yield + finally: + if should_truncate: + self._steps_per_execution.assign(original_value) + + def sync(self): + context.async_wait() + + @contextlib.contextmanager + def catch_stop_iteration(self): + """Catches errors when an iterator runs out of data.""" + with distributed_training_utils.maybe_preemption_handler_scope( + self._model + ): + try: + yield + self.sync() + except (StopIteration, tf.errors.OutOfRangeError): + if self._inferred_steps is None: + self._inferred_steps = self._current_step + else: + self._insufficient_data = True + total_epochs = self._epochs - self._initial_epoch + logging.warning( + "Your input ran out of data; interrupting training. " + "Make sure that your dataset or generator can generate " + "at least `steps_per_epoch * epochs` batches (in this " + "case, {} batches). You may need to use the repeat() " + "function when building your dataset.".format( + total_epochs * self._inferred_steps + ) + ) + + def steps(self): + """Yields steps for the current epoch.""" + self._current_step = self._initial_step + self._initial_step = 0 + # `self._inferred_steps` can be changed by `catch_stop_iteration`. + while ( + self._inferred_steps is None + or self._current_step < self._inferred_steps + ): + if self._insufficient_data: # Set by `catch_stop_iteration`. + break + original_spe = self._steps_per_execution.numpy().item() + can_run_full_execution = ( + original_spe == 1 + or self._inferred_steps is None + or self._inferred_steps - self._current_step >= original_spe + ) + + if can_run_full_execution: + self._step_increment = original_spe - 1 + yield self._current_step + self._current_step += original_spe + else: + # Last partial execution. + steps_remaining = self._inferred_steps - self._current_step + self._steps_per_execution.assign(steps_remaining) + self._step_increment = steps_remaining - 1 + yield self._current_step + self._current_step += steps_remaining + self._steps_per_execution.assign(original_spe) + + @property + def step_increment(self): + """The number to increment the step for `on_batch_end` methods.""" + return self._step_increment + + @property + def inferred_steps(self): + """The inferred steps per epoch of the created `Dataset`. + + This will be `None` in the case where: + + (1) A `Dataset` of unknown cardinality was passed to the `DataHandler`, + (2) `steps_per_epoch` was not provided, and + (3) The first epoch of iteration has not yet completed. + + Returns: + The inferred steps per epoch of the created `Dataset`. + """ + return self._inferred_steps + + @property + def should_sync(self): + # Catch OutOfRangeError for Datasets of unknown size. + # This blocks until the batch has finished executing. + # TODO(b/150292341): Allow multiple async steps here. 
+ return self._inferred_steps is None + + def _log_indefinite_training_warning(self): logging.warning( - "sample_weight modes were coerced from\n {}\n to \n {}" - .format(target_str, mode_str)) - except (ValueError, TypeError): - raise ValueError( - "Unable to match target structure and sample_weight_modes " - "structure:\n {}\n to \n {}".format(target_str, mode_str)) + "The training loop will run indefinitely since you have " + "set `steps_per_epoch=-1`. Please use batch-level " + "callbacks to save checkpoints or log training progress, " + "etc" + ) + + def _infer_steps(self, steps, dataset): + """Infers steps_per_epoch needed to loop through a dataset.""" + if steps == -1: + self._log_indefinite_training_warning() + return None + + if steps is not None: + return steps + + adapter_steps = self._adapter.get_size() + if adapter_steps is not None: + return adapter_steps + + # tf.distribute's `PerWorkerDataset` does not inherit from + # `tf.data.Dataset` and in those cases we give up on inferring steps. + if not isinstance(dataset, tf.data.Dataset): + return None + + size = tf.data.experimental.cardinality(dataset) + if size == tf.data.experimental.INFINITE_CARDINALITY and steps is None: + raise ValueError( + "When passing an infinitely repeating dataset, please specify " + "a `steps_per_epoch` value so that epoch level " + "callbacks continue to work. The value can be arbitrary, or a " + "number that you think correctly defines the size of an epoch. " + "Epoch-level callbacks will then be called at this interval." + ) + if size >= 0: + return size.numpy().item() + return None + + @property + def _samples(self): + return self._adapter.get_samples() + + def _validate_data_handler(self): + # TODO(b/152094471): Support this with DistIter.get_next_as_optional. + if ( + self._steps_per_execution.numpy().item() > 1 + and self._inferred_steps is None + ): + raise ValueError( + "Could not infer the size of the data. With " + "`steps_per_execution > 1`, you must specify the number of " + "steps to run." + ) - return sample_weight_modes +class _ClusterCoordinatorDataHandler(DataHandler): + """A `DataHandler` that is compatible with `ClusterCoordinator`.""" + + def __init__(self, x, y=None, **kwargs): + if not _is_distributed_dataset(x) and not isinstance( + x, (dataset_creator.DatasetCreator, tf.data.Dataset) + ): + x = self._convert_to_dataset_creator(x, y, **kwargs) + + super().__init__(x=x, **kwargs) + + def _convert_to_dataset_creator(self, x, y, **kwargs): + """Converts non-tf.data.Dataset to `DatasetCreator` instances.""" + + def _dataset_fn(input_context): + del input_context + data_adapter_cls = select_data_adapter(x, y) + return data_adapter_cls(x=x, y=y, **kwargs).get_dataset() + + # This check is needed because types like `tf.data.Dataset` don't work + # with PSS yet. So only apply this logic to the types we can support. + if isinstance(x, _get_tensor_types()) and isinstance( + y, _get_tensor_types() + ): + return dataset_creator.DatasetCreator(_dataset_fn) + else: + raise NotImplementedError( + "Only `tf.keras.utils.experimental.DatasetCreator`, " + "`tf.Tensor`, numpy arrays and pandas dataframes are " + "supported types at this time." 
+ ) + + def _configure_dataset_and_inferred_steps( + self, strategy, x, steps_per_epoch, class_weight, distribute + ): + if isinstance(x, dataset_creator.DatasetCreator): + + def per_worker_dataset_fn(): + + return strategy.distribute_datasets_from_function( + x, options=x.input_options + ) + + coordinator = self._model._cluster_coordinator + self._dataset = coordinator.create_per_worker_dataset( + per_worker_dataset_fn + ) + else: + assert distribute + if not _is_distributed_dataset(x): + x = strategy.experimental_distribute_dataset(x) + + coordinator = self._model._cluster_coordinator + self._dataset = coordinator.create_per_worker_dataset(x) + + if steps_per_epoch == -1: + self._inferred_steps = None + self._log_indefinite_training_warning() + else: + self._inferred_steps = steps_per_epoch + + def sync(self): + self._model._cluster_coordinator.join() + + +class _ClusterCoordinatorExactEvalDataHandler(_ClusterCoordinatorDataHandler): + def __init__(self, x, y=None, **kwargs): + super().__init__(x=x, **kwargs) + self._total_shards = kwargs.get("pss_evaluation_shards") + + def _warn_if_not_file_shardable(self, dataset): + # Traverse backwards to find source dataset and check if that is one of + # the unshardable types + # TODO(b/268521864): expand this to inspect dataset function graphs and + # use the auto-sharding logic rather than re-creating it here. + cur_dataset = dataset + while hasattr(cur_dataset, "_input_dataset"): + cur_dataset = cur_dataset._input_dataset + if type(cur_dataset) in UNSHARDABLE_DATASET_TYPES: + logging.warning( + "Found source dataset of type {}. This type is not " + "efficiently shardable, so exact evaluation may be " + "slower than inexact evaluation. Try converting to " + "a TFRecord or other file-based dataset if " + "performance is a concern.".format(type(cur_dataset)) + ) + + def _configure_dataset_and_inferred_steps( + self, strategy, x, steps_per_epoch, class_weight, distribute + ): + if isinstance(x, dataset_creator.DatasetCreator): + + def per_worker_dataset_fn(): + ddf = strategy.distribute_datasets_from_function( + x, options=x.input_options + ) + return ddf + + coordinator = self._model._cluster_coordinator + self._dataset = coordinator.create_per_worker_dataset( + per_worker_dataset_fn + ) + logging.info("dataset element spec: %r", self._dataset.element_spec) + self._dataset = self._dataset.build() + else: + # TODO(b/268226218): Support DistributedDataset input + if not _is_distributed_dataset(x): + self._warn_if_not_file_shardable(x) + x = strategy.experimental_distribute_dataset(x) + + coordinator = self._model._cluster_coordinator + self._dataset = coordinator.create_per_worker_dataset(x) + self._dataset = self._dataset.build() + + if steps_per_epoch == -1: + self._inferred_steps = None + self._log_indefinite_training_warning() + else: + self._inferred_steps = steps_per_epoch + + def enumerate_epochs(self): + """Yields `(epoch, dataset)`.""" + for epoch in range(self._initial_epoch, self._epochs): + yield epoch, self._dataset + self._adapter.on_epoch_end() + + def steps(self): + """Yields steps for the current epoch.""" + for step in range(self._total_shards): + yield step + + +@keras_export("keras.__internal__.utils.get_data_handler", v1=[]) +def get_data_handler(*args, **kwargs): + """Creates a `DataHandler`, providing standardized access to a `Dataset`. + + See `DataHandler` for the list and definition of the arguments. See the + implementation of `Model.fit()`, `evaluate()`, or `predict()` methods + for complete usage examples. 
As a rule of thumb, `get_data_handler()` accepts + the same inputs as the `x` argument of `Model.fit()`. + + Example: + + ```python + def step(iterator): + data = next(iterator) + # result <= Do something with data + return result + tf_step = tf.function(step, reduce_retracing=True) + + # Assume x is a tf.data Dataset. + data_handler = data_adapter.get_data_handler(x=x) + # Epoch iteration + for epo_idx, iterator in data_handler.enumerate_epochs(): + # Stop on dataset exhaustion. + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): # Step iteration + step_result = step(iterator) + ``` + + Args: + *args: Arguments passed to the `DataHandler` constructor. + **kwargs: Arguments passed to the `DataHandler` constructor. + + Returns: + A `DataHandler` object. If the model's cluster coordinator is set (e.g. the + model was defined under a parameter-server strategy), returns a + `_ClusterCoordinatorDataHandler`. -class DataHandler: - """Handles iterating over epoch-level `tf.data.Iterator` objects.""" - - def __init__(self, - x, - y=None, - sample_weight=None, - batch_size=None, - steps_per_epoch=None, - initial_epoch=0, - epochs=1, - shuffle=False, - class_weight=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - model=None, - steps_per_execution=None, - distribute=True): - """Initializes a `DataHandler`. - - Arguments: - x: See `Model.fit`. - y: See `Model.fit`. - sample_weight: See `Model.fit`. - batch_size: See `Model.fit`. - steps_per_epoch: See `Model.fit`. - initial_epoch: See `Model.fit`. - epochs: See `Model.fit`. - shuffle: See `Model.fit`. - class_weight: See `Model.fit`. - max_queue_size: See `Model.fit`. - workers: See `Model.fit`. - use_multiprocessing: See `Model.fit`. - model: The `Model` instance. Needed in order to correctly `build` the - `Model` using generator-like inputs (see `GeneratorDataAdapter`). - steps_per_execution: See `Model.compile`. - distribute: Whether to distribute the `tf.dataset`. - `PreprocessingLayer.adapt` does not support distributed datasets, - `Model` should always set this to `True`. """ + if getattr(kwargs["model"], "_cluster_coordinator", None): + if kwargs.get("pss_evaluation_shards"): + return _ClusterCoordinatorExactEvalDataHandler(*args, **kwargs) + return _ClusterCoordinatorDataHandler(*args, **kwargs) + return DataHandler(*args, **kwargs) - self._initial_epoch = initial_epoch - self._initial_step = 0 - self._epochs = epochs - self._insufficient_data = False - self._model = model - self._steps_per_epoch = steps_per_epoch +def _make_class_weight_map_fn(class_weight): + """Applies class weighting to a `Dataset`. - # `steps_per_execution_value` is the cached initial value. - # `steps_per_execution` is mutable and may be changed by the DataAdapter - # to handle partial executions. - if steps_per_execution is None: - self._steps_per_execution = tf.Variable(1) - else: - self._steps_per_execution = steps_per_execution + The `Dataset` is assumed to be in format `(x, y)` or `(x, y, sw)`, where + `y` must be a single `Tensor`.
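A hand-rolled sketch of the class-weight lookup that `_make_class_weight_map_fn` builds, for the simple case of rank-1 sparse integer labels (the real function also handles one-hot and higher-rank targets, and merges with an existing `sample_weight`); all data here is made up:

```python
import tensorflow as tf

class_weight = {0: 0.2, 1: 0.6, 2: 0.3}
# Weight table indexed by class id.
class_weight_tensor = tf.convert_to_tensor(
    [class_weight[c] for c in sorted(class_weight)]
)

def class_weights_map_fn(x, y):
    # Sparse integer labels index straight into the weight table.
    sw = tf.gather(class_weight_tensor, tf.cast(y, tf.int64))
    return x, y, sw

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.ones((4, 8)), tf.constant([0, 2, 1, 1]))
).batch(2)
# Each element now yields (x, y, sample_weight) with weights
# 0.2, 0.3, 0.6, 0.6 for the four samples.
dataset = dataset.map(class_weights_map_fn)
```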
- adapter_cls = select_data_adapter(x, y) - self._adapter = adapter_cls( - x, - y, - batch_size=batch_size, - steps=steps_per_epoch, - epochs=epochs - initial_epoch, - sample_weights=sample_weight, - shuffle=shuffle, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - distribution_strategy=tf.distribute.get_strategy(), - model=model) - - strategy = tf.distribute.get_strategy() - - self._current_step = 0 - self._step_increment = self._steps_per_execution.numpy().item() - 1 - self._insufficient_data = False - - self._configure_dataset_and_inferred_steps(strategy, x, steps_per_epoch, - class_weight, distribute) - - def _configure_dataset_and_inferred_steps(self, strategy, x, steps_per_epoch, - class_weight, distribute): - """Configure the `_dataset` and `_inferred_steps` attributes.""" - del x - dataset = self._adapter.get_dataset() - if class_weight: - dataset = dataset.map(_make_class_weight_map_fn(class_weight)) - self._inferred_steps = self._infer_steps(steps_per_epoch, dataset) - - # `PreprocessingLayer.adapt` does not currently support distributed - # datasets, so we pass `distribute=False` there. - if distribute and not _is_distributed_dataset(dataset): - dataset = strategy.experimental_distribute_dataset(dataset) - self._dataset = dataset - self._validate_data_handler() - - def enumerate_epochs(self): - """Yields `(epoch, tf.data.Iterator)`.""" - with self._truncate_execution_to_epoch(): - data_iterator = iter(self._dataset) - for epoch in range(self._initial_epoch, self._epochs): - if self._insufficient_data: # Set by `catch_stop_iteration`. - break - if self._adapter.should_recreate_iterator(): - data_iterator = iter(self._dataset) - if not isinstance(self._dataset, DistributedDataset): - steps = self._infer_steps(self._steps_per_epoch, self._dataset) - if steps is not None: - self._inferred_steps = steps - yield epoch, data_iterator - self._adapter.on_epoch_end() - - @contextlib.contextmanager - def _truncate_execution_to_epoch(self): - """Truncates steps per execution to at most one epoch.""" - should_truncate = ( - self._inferred_steps is not None and - self._steps_per_execution.numpy().item() > self._inferred_steps) - original_value = self._steps_per_execution.numpy().item() - try: - if should_truncate: - self._steps_per_execution.assign(self._inferred_steps) - yield - finally: - if should_truncate: - self._steps_per_execution.assign(original_value) - - def sync(self): - context.async_wait() - - @contextlib.contextmanager - def catch_stop_iteration(self): - """Catches errors when an iterator runs out of data.""" - try: - yield - self.sync() - except (StopIteration, tf.errors.OutOfRangeError): - if self._inferred_steps is None: - self._inferred_steps = self._current_step - else: - self._insufficient_data = True - total_epochs = self._epochs - self._initial_epoch - logging.warning( - "Your input ran out of data; interrupting training. " - "Make sure that your dataset or generator can generate at " - "least `steps_per_epoch * epochs` batches (in this case, " - "{} batches). You may need to use the repeat() function " - "when building your dataset.".format(total_epochs * - self._inferred_steps)) - - def steps(self): - """Yields steps for the current epoch.""" - self._current_step = self._initial_step - # `self._inferred_steps` can be changed by `catch_stop_iteration`. - while (self._inferred_steps is None or - self._current_step < self._inferred_steps): - if self._insufficient_data: # Set by `catch_stop_iteration`. 
- break - original_spe = self._steps_per_execution.numpy().item() - can_run_full_execution = ( - original_spe == 1 or - self._inferred_steps is None or - self._inferred_steps - self._current_step >= - original_spe) - - if can_run_full_execution: - self._step_increment = original_spe - 1 - yield self._current_step - self._current_step += original_spe - else: - # Last partial execution. - steps_remaining = self._inferred_steps - self._current_step - self._steps_per_execution.assign(steps_remaining) - self._step_increment = steps_remaining - 1 - yield self._current_step - self._current_step += steps_remaining - self._steps_per_execution.assign(original_spe) - - @property - def step_increment(self): - """The number to increment the step for `on_batch_end` methods.""" - return self._step_increment - - @property - def inferred_steps(self): - """The inferred steps per epoch of the created `Dataset`. - - This will be `None` in the case where: - - (1) A `Dataset` of unknown cardinality was passed to the `DataHandler`, and - (2) `steps_per_epoch` was not provided, and - (3) The first epoch of iteration has not yet completed. + Args: + class_weight: A map where the keys are integer class ids and values are + the class weights, e.g. `{0: 0.2, 1: 0.6, 2: 0.3}` Returns: - The inferred steps per epoch of the created `Dataset`. + A function that can be used with `tf.data.Dataset.map` to apply class + weighting. """ - return self._inferred_steps - - @property - def should_sync(self): - # Catch OutOfRangeError for Datasets of unknown size. - # This blocks until the batch has finished executing. - # TODO(b/150292341): Allow multiple async steps here. - return self._inferred_steps is None - - def _log_indefinite_training_warning(self): - logging.warning("The training loop will run indefinitely since you have " - "set `steps_per_epoch=-1`. Please use batch-level " - "callbacks to save checkpoints or log training progress, " - "etc") - - def _infer_steps(self, steps, dataset): - """Infers steps_per_epoch needed to loop through a dataset.""" - if steps == -1: - self._log_indefinite_training_warning() - return None - - if steps is not None: - return steps - - adapter_steps = self._adapter.get_size() - if adapter_steps is not None: - return adapter_steps - - size = tf.data.experimental.cardinality(dataset) - if size == tf.data.experimental.INFINITE_CARDINALITY and steps is None: - raise ValueError( - "When passing an infinitely repeating dataset, please specify a " - "`steps_per_epoch` value so that epoch level " - "callbacks continue to work. The value can be arbitrary, or a number " - "that you think correctly defines the size of an epoch. " - "Epoch-level callbacks will then be called at this interval.") - if size >= 0: - return size.numpy().item() - return None - - @property - def _samples(self): - return self._adapter.get_samples() - - def _validate_data_handler(self): - # TODO(b/152094471): Support this with DistIter.get_next_as_optional. - if self._steps_per_execution.numpy().item( - ) > 1 and self._inferred_steps is None: - raise ValueError( - "Could not infer the size of the data. 
With " - "`steps_per_execution > 1`, you must specify the number of steps " - "to run.") - + class_ids = list(sorted(class_weight.keys())) + expected_class_ids = list(range(len(class_ids))) + if class_ids != expected_class_ids: + error_msg = ( + "Expected `class_weight` to be a dict with keys from 0 to one less " + "than the number of classes, found {}" + ).format(class_weight) + raise ValueError(error_msg) + + class_weight_tensor = tf.convert_to_tensor( + [class_weight[int(c)] for c in class_ids] + ) + + def _class_weights_map_fn(*data): + """Convert `class_weight` to `sample_weight`.""" + x, y, sw = unpack_x_y_sample_weight(data) + + if tf.nest.is_nested(y): + raise ValueError( + "`class_weight` is only supported for Models with a single " + "output." + ) + + if y.shape.rank >= 2: + y_classes = tf.__internal__.smart_cond.smart_cond( + backend.shape(y)[-1] > 1, + lambda: backend.argmax(y, axis=-1), + lambda: tf.cast(tf.round(tf.squeeze(y, axis=-1)), tf.int64), + ) + else: + # Special casing for rank 1, where we can guarantee sparse encoding. + y_classes = tf.cast(tf.round(y), tf.int64) + + cw = tf.gather(class_weight_tensor, y_classes) + if sw is not None: + cw = tf.cast(cw, sw.dtype) + # `class_weight` and `sample_weight` are multiplicative. + # If class_weight has more than 2 dimensions, we need to reshape + # sample_weight to make broadcasting possible for multiplication. + rank_delta = cw.shape.rank - sw.shape.rank + sw = tf.reshape(sw, sw.shape + [1] * rank_delta) + sw = sw * cw + else: + sw = cw + return x, y, sw + + return _class_weights_map_fn -class _ClusterCoordinatorDataHandler(DataHandler): - """A `DataHandler` that is compatible with `ClusterCoordinator`.""" - def __init__(self, x, y=None, **kwargs): - if (not _is_distributed_dataset(x) and - not isinstance(x, (dataset_creator.DatasetCreator, tf.data.Dataset))): - x = self._convert_to_dataset_creator(x, y, **kwargs) - - super().__init__(x=x, **kwargs) +def train_validation_split(arrays, validation_split): + """Split arrays into train and validation subsets in deterministic order. - def _convert_to_dataset_creator(self, x, y, **kwargs): - """Converts non-tf.data.Dataset to `DatasetCreator` instances.""" + The last part of data will become validation data. - def _dataset_fn(input_context): - del input_context - data_adapter_cls = select_data_adapter(x, y) - return data_adapter_cls(x=x, y=y, **kwargs).get_dataset() + Args: + arrays: Tensors to split. Allowed inputs are arbitrarily nested structures + of Tensors and NumPy arrays. + validation_split: Float between 0 and 1. The proportion of the dataset to + include in the validation split. The rest of the dataset will be + included in the training split. + Returns: + `(train_arrays, validation_arrays)` + """ - # This check is needed because types like `tf.data.Dataset` don't work with - # PSS yet. So only apply this logic to the types we can support. 
- if (isinstance(x, _get_tensor_types()) and - isinstance(y, _get_tensor_types())): - return dataset_creator.DatasetCreator(_dataset_fn) - else: - raise NotImplementedError( - "Only `tf.keras.utils.experimental.DatasetCreator`, `tf.Tensor`, " - "numpy arrays and pandas dataframes are supported types at this " - "time.") + def _can_split(t): + tensor_types = _get_tensor_types() + return isinstance(t, tensor_types) or t is None - def _configure_dataset_and_inferred_steps(self, strategy, x, steps_per_epoch, - class_weight, distribute): - if isinstance(x, dataset_creator.DatasetCreator): + flat_arrays = tf.nest.flatten(arrays) + unsplitable = [type(t) for t in flat_arrays if not _can_split(t)] + if unsplitable: + raise ValueError( + "`validation_split` is only supported for Tensors or NumPy " + "arrays, found following types in the input: {}".format(unsplitable) + ) - def per_worker_dataset_fn(): + if all(t is None for t in flat_arrays): + return arrays, arrays - return strategy.distribute_datasets_from_function( - x, options=x.input_options) + first_non_none = None + for t in flat_arrays: + if t is not None: + first_non_none = t + break - self._dataset = self._model._cluster_coordinator.create_per_worker_dataset( # pylint: disable=protected-access - per_worker_dataset_fn) - else: - assert distribute - if not _is_distributed_dataset(x): - x = strategy.experimental_distribute_dataset(x) + # Assumes all arrays have the same batch shape or are `None`. + batch_dim = int(first_non_none.shape[0]) + split_at = int(math.floor(batch_dim * (1.0 - validation_split))) - self._dataset = self._model._cluster_coordinator.create_per_worker_dataset( # pylint: disable=protected-access - x) + if split_at == 0 or split_at == batch_dim: + raise ValueError( + "Training data contains {batch_dim} samples, which is not " + "sufficient to split it into a validation and training set as " + "specified by `validation_split={validation_split}`. Either " + "provide more data, or a different value for the " + "`validation_split` argument.".format( + batch_dim=batch_dim, validation_split=validation_split + ) + ) + + def _split(t, start, end): + if t is None: + return t + return t[start:end] + + train_arrays = tf.nest.map_structure( + functools.partial(_split, start=0, end=split_at), arrays + ) + val_arrays = tf.nest.map_structure( + functools.partial(_split, start=split_at, end=batch_dim), arrays + ) + + return train_arrays, val_arrays - if steps_per_epoch == -1: - self._inferred_steps = None - self._log_indefinite_training_warning() - else: - self._inferred_steps = steps_per_epoch - def sync(self): - self._model._cluster_coordinator.join() # pylint: disable=protected-access +@keras_export("keras.utils.unpack_x_y_sample_weight", v1=[]) +def unpack_x_y_sample_weight(data): + """Unpacks user-provided data tuple. -@keras_export("keras.__internal__.utils.get_data_handler", v1=[]) -def get_data_handler(*args, **kwargs): - """Creates a `DataHandler`, providing standardized access to a `Dataset`. + This is a convenience utility to be used when overriding + `Model.train_step`, `Model.test_step`, or `Model.predict_step`. + This utility makes it easy to support data of the form `(x,)`, + `(x, y)`, or `(x, y, sample_weight)`. - See `DataHandler` for the list and definition of the arguments. See the - implementation of `Model.fit()`, `evaluate()`, or `predict()` methods - for complete usage examples. As a rule of tumb, `get_data_handler()` accepts - the same inputs as the `x` argument of `Model.fit()`. 
+ Standalone usage: - Example: + >>> features_batch = tf.ones((10, 5)) + >>> labels_batch = tf.zeros((10, 5)) + >>> data = (features_batch, labels_batch) + >>> # `y` and `sample_weight` will default to `None` if not provided. + >>> x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data) + >>> sample_weight is None + True - ```python - def step(iterator): - data = next(iterator) - # result <= Do something with data - return result - tf_step = tf.function(step, reduce_retracing=True) + Example in overridden `Model.train_step`: - # Assume x is a tf.data Dataset. - data_handler = data_adapter.get_data_handler(x=x) - for epo_idx, iterator in data_handler.enumerate_epochs(): # Epoch iteration - with data_handler.catch_stop_iteration(): # Stop on dataset exhaustion. - for step in data_handler.steps(): # Step iteration - step_result = step(iterator) - ``` + ```python + class MyModel(tf.keras.Model): - Args: - *args: Arguments passed to the `DataHandler` constructor. - **kwargs: Arguments passed to the `DataHandler` constructor. + def train_step(self, data): + # If `sample_weight` is not provided, all samples will be weighted + # equally. + x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data) - Returns: - A `DataHandler` object. If the model's cluster coordinate is set (e.g. the - model was defined under a parameter-server strategy), returns a - `_ClusterCoordinatorDataHandler`. + with tf.GradientTape() as tape: + y_pred = self(x, training=True) + loss = self.compiled_loss( + y, y_pred, sample_weight, regularization_losses=self.losses) + trainable_variables = self.trainable_variables + gradients = tape.gradient(loss, trainable_variables) + self.optimizer.apply_gradients(zip(gradients, trainable_variables)) - """ - if getattr(kwargs["model"], "_cluster_coordinator", None): - return _ClusterCoordinatorDataHandler(*args, **kwargs) - return DataHandler(*args, **kwargs) + self.compiled_metrics.update_state(y, y_pred, sample_weight) + return {m.name: m.result() for m in self.metrics} + ``` + Args: + data: A tuple of the form `(x,)`, `(x, y)`, or `(x, y, sample_weight)`. -def _make_class_weight_map_fn(class_weight): - """Applies class weighting to a `Dataset`. - - The `Dataset` is assumed to be in format `(x, y)` or `(x, y, sw)`, where - `y` must be a single `Tensor`. - - Args: - class_weight: A map where the keys are integer class ids and values are - the class weights, e.g. `{0: 0.2, 1: 0.6, 2: 0.3}` - - Returns: - A function that can be used with `tf.data.Dataset.map` to apply class - weighting. 
- """ - class_ids = list(sorted(class_weight.keys())) - expected_class_ids = list(range(len(class_ids))) - if class_ids != expected_class_ids: - error_msg = ( - "Expected `class_weight` to be a dict with keys from 0 to one less " - "than the number of classes, found {}").format(class_weight) - raise ValueError(error_msg) - - class_weight_tensor = tf.convert_to_tensor( - [class_weight[int(c)] for c in class_ids]) - - def _class_weights_map_fn(*data): - """Convert `class_weight` to `sample_weight`.""" - x, y, sw = unpack_x_y_sample_weight(data) - - if tf.nest.is_nested(y): - raise ValueError( - "`class_weight` is only supported for Models with a single output.") - - if y.shape.rank > 2: - raise ValueError("`class_weight` not supported for " - "3+ dimensional targets.") - - y_classes = tf.__internal__.smart_cond.smart_cond( - y.shape.rank == 2 and backend.shape(y)[1] > 1, - lambda: backend.argmax(y, axis=1), - lambda: tf.cast(backend.reshape(y, (-1,)), tf.int64)) - - cw = tf.gather(class_weight_tensor, y_classes) - if sw is not None: - cw = tf.cast(cw, sw.dtype) - # `class_weight` and `sample_weight` are multiplicative. - sw = sw * cw + Returns: + The unpacked tuple, with `None`s for `y` and `sample_weight` if they are + not provided. + """ + if isinstance(data, list): + data = tuple(data) + if not isinstance(data, tuple): + return (data, None, None) + elif len(data) == 1: + return (data[0], None, None) + elif len(data) == 2: + return (data[0], data[1], None) + elif len(data) == 3: + return (data[0], data[1], data[2]) else: - sw = cw - return x, y, sw + error_msg = ( + "Data is expected to be in format `x`, `(x,)`, `(x, y)`, " + "or `(x, y, sample_weight)`, found: {}" + ).format(data) + raise ValueError(error_msg) - return _class_weights_map_fn +@keras_export("keras.utils.pack_x_y_sample_weight", v1=[]) +def pack_x_y_sample_weight(x, y=None, sample_weight=None): + """Packs user-provided data into a tuple. -def train_validation_split(arrays, validation_split): - """Split arrays into train and validation subsets in deterministic order. - - The last part of data will become validation data. - - Args: - arrays: Tensors to split. Allowed inputs are arbitrarily nested structures - of Tensors and NumPy arrays. - validation_split: Float between 0 and 1. The proportion of the dataset to - include in the validation split. The rest of the dataset will be included - in the training split. - Returns: - `(train_arrays, validation_arrays)` - """ - - def _can_split(t): - tensor_types = _get_tensor_types() - return isinstance(t, tensor_types) or t is None - - flat_arrays = tf.nest.flatten(arrays) - unsplitable = [type(t) for t in flat_arrays if not _can_split(t)] - if unsplitable: - raise ValueError( - "`validation_split` is only supported for Tensors or NumPy " - "arrays, found following types in the input: {}".format(unsplitable)) - - if all(t is None for t in flat_arrays): - return arrays, arrays - - first_non_none = None - for t in flat_arrays: - if t is not None: - first_non_none = t - break - - # Assumes all arrays have the same batch shape or are `None`. - batch_dim = int(first_non_none.shape[0]) - split_at = int(math.floor(batch_dim * (1. - validation_split))) - - if split_at == 0 or split_at == batch_dim: - raise ValueError( - "Training data contains {batch_dim} samples, which is not sufficient " - "to split it into a validation and training set as specified by " - "`validation_split={validation_split}`. Either provide more data, or a " - "different value for the `validation_split` argument." 
.format( - batch_dim=batch_dim, validation_split=validation_split)) - - def _split(t, start, end): - if t is None: - return t - return t[start:end] - - train_arrays = tf.nest.map_structure( - functools.partial(_split, start=0, end=split_at), arrays) - val_arrays = tf.nest.map_structure( - functools.partial(_split, start=split_at, end=batch_dim), arrays) - - return train_arrays, val_arrays + This is a convenience utility for packing data into the tuple formats + that `Model.fit` uses. + Standalone usage: -@keras_export("keras.utils.unpack_x_y_sample_weight", v1=[]) -def unpack_x_y_sample_weight(data): - """Unpacks user-provided data tuple. - - This is a convenience utility to be used when overriding - `Model.train_step`, `Model.test_step`, or `Model.predict_step`. - This utility makes it easy to support data of the form `(x,)`, - `(x, y)`, or `(x, y, sample_weight)`. - - Standalone usage: - - >>> features_batch = tf.ones((10, 5)) - >>> labels_batch = tf.zeros((10, 5)) - >>> data = (features_batch, labels_batch) - >>> # `y` and `sample_weight` will default to `None` if not provided. - >>> x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data) - >>> sample_weight is None - True - - Example in overridden `Model.train_step`: - - ```python - class MyModel(tf.keras.Model): - - def train_step(self, data): - # If `sample_weight` is not provided, all samples will be weighted - # equally. - x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data) - - with tf.GradientTape() as tape: - y_pred = self(x, training=True) - loss = self.compiled_loss( - y, y_pred, sample_weight, regularization_losses=self.losses) - trainable_variables = self.trainable_variables - gradients = tape.gradient(loss, trainable_variables) - self.optimizer.apply_gradients(zip(gradients, trainable_variables)) - - self.compiled_metrics.update_state(y, y_pred, sample_weight) - return {m.name: m.result() for m in self.metrics} - ``` - - Args: - data: A tuple of the form `(x,)`, `(x, y)`, or `(x, y, sample_weight)`. - - Returns: - The unpacked tuple, with `None`s for `y` and `sample_weight` if they are not - provided. - """ - if isinstance(data, list): - data = tuple(data) - if not isinstance(data, tuple): - return (data, None, None) - elif len(data) == 1: - return (data[0], None, None) - elif len(data) == 2: - return (data[0], data[1], None) - elif len(data) == 3: - return (data[0], data[1], data[2]) - else: - error_msg = ("Data is expected to be in format `x`, `(x,)`, `(x, y)`, " - "or `(x, y, sample_weight)`, found: {}").format(data) - raise ValueError(error_msg) + >>> x = tf.ones((10, 1)) + >>> data = tf.keras.utils.pack_x_y_sample_weight(x) + >>> isinstance(data, tf.Tensor) + True + >>> y = tf.ones((10, 1)) + >>> data = tf.keras.utils.pack_x_y_sample_weight(x, y) + >>> isinstance(data, tuple) + True + >>> x, y = data + Args: + x: Features to pass to `Model`. + y: Ground-truth targets to pass to `Model`. + sample_weight: Sample weight for each element. -@keras_export("keras.utils.pack_x_y_sample_weight", v1=[]) -def pack_x_y_sample_weight(x, y=None, sample_weight=None): - """Packs user-provided data into a tuple. - - This is a convenience utility for packing data into the tuple formats - that `Model.fit` uses. 
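Editor's note: `train_validation_split` (rewrapped above) always carves the validation subset off the tail of the data, using `split_at = floor(batch_dim * (1 - validation_split))`. A quick usage sketch, assuming the module is importable as `keras.engine.data_adapter` (as the tests below do):

```python
import numpy as np
from keras.engine import data_adapter

x = np.arange(10).reshape(10, 1)
y = np.arange(10)

# The returned pair mirrors the input structure: (train, val), each a (x, y).
(x_train, y_train), (x_val, y_val) = data_adapter.train_validation_split(
    (x, y), validation_split=0.2
)
print(x_train.shape, x_val.shape)  # (8, 1) (2, 1)
print(y_val)                       # [8 9] -- deterministically, the tail
```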
- - Standalone usage: - - >>> x = tf.ones((10, 1)) - >>> data = tf.keras.utils.pack_x_y_sample_weight(x) - >>> isinstance(data, tf.Tensor) - True - >>> y = tf.ones((10, 1)) - >>> data = tf.keras.utils.pack_x_y_sample_weight(x, y) - >>> isinstance(data, tuple) - True - >>> x, y = data - - Args: - x: Features to pass to `Model`. - y: Ground-truth targets to pass to `Model`. - sample_weight: Sample weight for each element. - - Returns: - Tuple in the format used in `Model.fit`. - """ - if y is None: - # For single x-input, we do no tuple wrapping since in this case - # there is no ambiguity. This also makes NumPy and Dataset - # consistent in that the user does not have to wrap their Dataset - # data in an unnecessary tuple - if not tf.nest.is_nested(x): - return x + Returns: + Tuple in the format used in `Model.fit`. + """ + if y is None: + # For single x-input, we do no tuple wrapping since in this case + # there is no ambiguity. This also makes NumPy and Dataset + # consistent in that the user does not have to wrap their Dataset + # data in an unnecessary tuple. + if not isinstance(x, tuple or list): + return x + else: + return (x,) + elif sample_weight is None: + return (x, y) + else: + return (x, y, sample_weight) + + +def single_batch_iterator( + strategy, x, y=None, sample_weight=None, class_weight=None +): + """Creates a single-batch dataset.""" + x, y, sample_weight = _process_tensorlike((x, y, sample_weight)) + if y is None: + data = (x,) + elif sample_weight is None: + data = (x, y) else: - return (x,) - elif sample_weight is None: - return (x, y) - else: - return (x, y, sample_weight) - - -def single_batch_iterator(strategy, - x, - y=None, - sample_weight=None, - class_weight=None): - """Creates a single-batch dataset.""" - x, y, sample_weight = _process_tensorlike((x, y, sample_weight)) - if y is None: - data = (x,) - elif sample_weight is None: - data = (x, y) - else: - data = (x, y, sample_weight) - - _check_data_cardinality(data) - dataset = tf.data.Dataset.from_tensors(data) - if class_weight: - dataset = dataset.map(_make_class_weight_map_fn(class_weight)) - dataset = strategy.experimental_distribute_dataset(dataset) - return iter(dataset) + data = (x, y, sample_weight) + + _check_data_cardinality(data) + dataset = tf.data.Dataset.from_tensors(data) + if class_weight: + dataset = dataset.map(_make_class_weight_map_fn(class_weight)) + dataset = strategy.experimental_distribute_dataset(dataset) + return iter(dataset) def _check_data_cardinality(data): - num_samples = set(int(i.shape[0]) for i in tf.nest.flatten(data)) - if len(num_samples) > 1: - msg = "Data cardinality is ambiguous:\n" - for label, single_data in zip(["x", "y", "sample_weight"], data): - msg += " {} sizes: {}\n".format( - label, ", ".join(str(i.shape[0]) - for i in tf.nest.flatten(single_data))) - msg += "Make sure all arrays contain the same number of samples." - raise ValueError(msg) + num_samples = set(int(i.shape[0]) for i in tf.nest.flatten(data)) + if len(num_samples) > 1: + msg = "Data cardinality is ambiguous:\n" + for label, single_data in zip(["x", "y", "sample_weight"], data): + msg += " {} sizes: {}\n".format( + label, + ", ".join( + str(i.shape[0]) for i in tf.nest.flatten(single_data) + ), + ) + msg += "Make sure all arrays contain the same number of samples." 
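Editor's note: one subtlety in the `pack_x_y_sample_weight` body above is that `isinstance(x, tuple or list)` is equivalent to `isinstance(x, tuple)`, since `or` returns its first truthy operand; only tuple inputs are re-wrapped, while a bare list passes through as-is. A round-trip sketch of the documented pack/unpack behavior:

```python
import tensorflow as tf

x = tf.ones((10, 1))
y = tf.zeros((10, 1))
sw = tf.fill((10,), 0.5)

data = tf.keras.utils.pack_x_y_sample_weight(x, y, sw)  # -> (x, y, sw)
x2, y2, sw2 = tf.keras.utils.unpack_x_y_sample_weight(data)
assert x2 is x and y2 is y and sw2 is sw

# A bare feature tensor passes through unwrapped; unpack fills in `None`s.
x3, y3, sw3 = tf.keras.utils.unpack_x_y_sample_weight(
    tf.keras.utils.pack_x_y_sample_weight(x)
)
assert x3 is x and y3 is None and sw3 is None
```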
+ raise ValueError(msg) def _get_tensor_types(): - if pd is None: - return (tf.Tensor, np.ndarray) - else: - return (tf.Tensor, np.ndarray, pd.Series, pd.DataFrame) + if pd is None: + return (tf.Tensor, np.ndarray) + else: + return (tf.Tensor, np.ndarray, pd.Series, pd.DataFrame) def _is_scipy_sparse(x): - try: - from scipy.sparse import issparse # pylint: disable=g-import-not-at-top + try: + from scipy.sparse import issparse - return issparse(x) - except ImportError: - return False + return issparse(x) + except ImportError: + return False def _is_pandas_series(x): - if pd is None: - return False - else: - return isinstance(x, pd.Series) + if pd is None: + return False + else: + return isinstance(x, pd.Series) def _scipy_sparse_to_sparse_tensor(t): - """Converts a SciPy sparse matrix to a SparseTensor.""" - sparse_coo = t.tocoo() - row, col = sparse_coo.row, sparse_coo.col - data, shape = sparse_coo.data, sparse_coo.shape - if issubclass(data.dtype.type, np.floating): - data = data.astype(backend.floatx()) - indices = np.concatenate( - (np.expand_dims(row, axis=1), np.expand_dims(col, axis=1)), axis=1) - return tf.SparseTensor(indices, data, shape) + """Converts a SciPy sparse matrix to a SparseTensor.""" + sparse_coo = t.tocoo() + row, col = sparse_coo.row, sparse_coo.col + data, shape = sparse_coo.data, sparse_coo.shape + if issubclass(data.dtype.type, np.floating): + data = data.astype(backend.floatx()) + indices = np.concatenate( + (np.expand_dims(row, axis=1), np.expand_dims(col, axis=1)), axis=1 + ) + return tf.SparseTensor(indices, data, shape) def _is_distributed_dataset(ds): - return isinstance(ds, tf.distribute.DistributedDataset) + return isinstance( + ds, + ( + tf.distribute.DistributedDataset, + tf.experimental.dtensor.DTensorDataset, + ), + ) diff --git a/keras/engine/data_adapter_test.py b/keras/engine/data_adapter_test.py index f0aa594326dc..2a480b385b96 100644 --- a/keras/engine/data_adapter_test.py +++ b/keras/engine/data_adapter_test.py @@ -14,1303 +14,1566 @@ # ============================================================================== """DataAdapter tests.""" -import tensorflow.compat.v2 as tf - import math -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras +from keras.engine import data_adapter from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.engine import data_adapter from keras.utils import data_utils + +# isort: off from tensorflow.python.eager import context class DummyArrayLike: - """Dummy array-like object.""" + """Dummy array-like object.""" - def __init__(self, data): - self.data = data + def __init__(self, data): + self.data = data - def __len__(self): - return len(self.data) + def __len__(self): + return len(self.data) - def __getitem__(self, key): - return self.data[key] + def __getitem__(self, key): + return self.data[key] - @property - def shape(self): - return self.data.shape + @property + def shape(self): + return self.data.shape - @property - def dtype(self): - return self.data.dtype + @property + def dtype(self): + return self.data.dtype def fail_on_convert(x, **kwargs): - _ = x - _ = kwargs - raise TypeError('Cannot convert DummyArrayLike to a tensor') + _ = x + _ = kwargs + raise TypeError("Cannot convert DummyArrayLike to a tensor") + + tf.register_tensor_conversion_function(DummyArrayLike, fail_on_convert) class DataAdapterTestBase(test_combinations.TestCase): - - def setUp(self): - 
super().setUp() - self.batch_size = 5 - self.numpy_input = np.zeros((50, 10)) - self.numpy_target = np.ones(50) - self.tensor_input = tf.constant(2.0, shape=(50, 10)) - self.tensor_target = tf.ones((50,)) - self.arraylike_input = DummyArrayLike(self.numpy_input) - self.arraylike_target = DummyArrayLike(self.numpy_target) - self.dataset_input = tf.data.Dataset.from_tensor_slices( - (self.numpy_input, self.numpy_target)).shuffle(50).batch( - self.batch_size) - - def generator(): - while True: - yield (np.zeros((self.batch_size, 10)), np.ones(self.batch_size)) - self.generator_input = generator() - self.iterator_input = data_utils.threadsafe_generator(generator)() - self.sequence_input = TestSequence(batch_size=self.batch_size, - feature_shape=10) - self.text_input = [['abc']] - self.bytes_input = [[b'abc']] - self.model = keras.models.Sequential( - [keras.layers.Dense(8, input_shape=(10,), activation='softmax')]) + def setUp(self): + super().setUp() + self.batch_size = 5 + self.numpy_input = np.zeros((50, 10)) + self.numpy_target = np.ones(50) + self.tensor_input = tf.constant(2.0, shape=(50, 10)) + self.tensor_target = tf.ones((50,)) + self.arraylike_input = DummyArrayLike(self.numpy_input) + self.arraylike_target = DummyArrayLike(self.numpy_target) + self.dataset_input = ( + tf.data.Dataset.from_tensor_slices( + (self.numpy_input, self.numpy_target) + ) + .shuffle(50) + .batch(self.batch_size) + ) + + def generator(): + while True: + yield ( + np.zeros((self.batch_size, 10)), + np.ones(self.batch_size), + ) + + self.generator_input = generator() + self.iterator_input = data_utils.threadsafe_generator(generator)() + self.sequence_input = TestSequence( + batch_size=self.batch_size, feature_shape=10 + ) + self.text_input = [["abc"]] + self.bytes_input = [[b"abc"]] + self.model = keras.models.Sequential( + [keras.layers.Dense(8, input_shape=(10,), activation="softmax")] + ) class TestSequence(data_utils.Sequence): + def __init__(self, batch_size, feature_shape): + self.batch_size = batch_size + self.feature_shape = feature_shape - def __init__(self, batch_size, feature_shape): - self.batch_size = batch_size - self.feature_shape = feature_shape + def __getitem__(self, item): + return ( + np.zeros((self.batch_size, self.feature_shape)), + np.ones((self.batch_size,)), + ) - def __getitem__(self, item): - return (np.zeros((self.batch_size, self.feature_shape)), - np.ones((self.batch_size,))) - - def __len__(self): - return 10 + def __len__(self): + return 10 class TestSparseSequence(TestSequence): - - def __getitem__(self, item): - indices = [[row, self.feature_shape - 1] for row in range(self.batch_size)] - values = [1 for row in range(self.batch_size)] - st = tf.SparseTensor(indices, values, (self.batch_size, self.feature_shape)) - return (st, np.ones((self.batch_size,))) + def __getitem__(self, item): + indices = [ + [row, self.feature_shape - 1] for row in range(self.batch_size) + ] + values = [1 for row in range(self.batch_size)] + st = tf.SparseTensor( + indices, values, (self.batch_size, self.feature_shape) + ) + return (st, np.ones((self.batch_size,))) class TestRaggedSequence(TestSequence): - - def __getitem__(self, item): - values = np.random.randint(0, self.feature_shape, - (self.batch_size, 2)).reshape(-1) - row_lengths = np.full(self.batch_size, 2) - rt = tf.RaggedTensor.from_row_lengths(values, row_lengths) - return (rt, np.ones((self.batch_size,))) + def __getitem__(self, item): + values = np.random.randint( + 0, self.feature_shape, (self.batch_size, 2) + ).reshape(-1) + row_lengths 
= np.full(self.batch_size, 2) + rt = tf.RaggedTensor.from_row_lengths(values, row_lengths) + return (rt, np.ones((self.batch_size,))) class TestBatchSequence(data_utils.Sequence): - - def __init__(self, batch_size, feature_shape, epochs=2): - """Creates a keras.utils.Sequence with increasing batch_size. - - Args: - batch_size (Union[int, List[int]]): Can be a list containing two values: - start and end batch_size - feature_shape (int): Number of features in a sample - epochs (int, optional): Number of epochs - """ - self.batch_size = batch_size - self.feature_shape = feature_shape - - self._epochs = epochs - # we use `on_epoch_end` method to prepare data for the next epoch - # set current epoch to `-1`, so that `on_epoch_end` will increase it to `0` - self._current_epoch = -1 - # actual batch size will be set inside `on_epoch_end` - self._current_batch_size = 0 - - self.on_epoch_end() - - def __len__(self): - """Number of batches in the Sequence. - - Returns: int - The number of batches in the Sequence. - """ - # data was rebalanced, so need to recalculate number of examples - num_examples = 20 - batch_size = self._current_batch_size - return num_examples // batch_size + int( - num_examples % batch_size > - 0) # = math.ceil(num_examples / batch_size ) - - def __getitem__(self, index): - """Gets batch at position `index`. - - Arguments: - index (int): position of the batch in the Sequence. - Returns: Tuple[Any, Any] A batch (tuple of input data and target data). - """ - # return input and target data, as our target data is inside the input - # data return None for the target data - return (np.zeros((self._current_batch_size, self.feature_shape)), - np.ones((self._current_batch_size,))) - - def on_epoch_end(self): - """Updates the data after every epoch.""" - self._current_epoch += 1 - if self._current_epoch < self._epochs: - self._current_batch_size = self._linearly_increasing_batch_size() - - def _linearly_increasing_batch_size(self): - """Linearly increase batch size with every epoch. - - The idea comes from https://arxiv.org/abs/1711.00489. - - Returns: int - The batch size to use in this epoch. - """ - if not isinstance(self.batch_size, list): - return int(self.batch_size) - - if self._epochs > 1: - return int(self.batch_size[0] + self._current_epoch * - (self.batch_size[1] - self.batch_size[0]) / (self._epochs - 1)) - else: - return int(self.batch_size[0]) + def __init__(self, batch_size, feature_shape, epochs=2): + """Creates a keras.utils.Sequence with increasing batch_size. + + Args: + batch_size (Union[int, List[int]]): Can be a list containing two + values: start and end batch_size + feature_shape (int): Number of features in a sample + epochs (int, optional): Number of epochs + """ + self.batch_size = batch_size + self.feature_shape = feature_shape + + self._epochs = epochs + # we use `on_epoch_end` method to prepare data for the next epoch set + # current epoch to `-1`, so that `on_epoch_end` will increase it to `0` + self._current_epoch = -1 + # actual batch size will be set inside `on_epoch_end` + self._current_batch_size = 0 + + self.on_epoch_end() + + def __len__(self): + """Number of batches in the Sequence. + + Returns: int + The number of batches in the Sequence. 
+ """ + # data was rebalanced, so need to recalculate number of examples + num_examples = 20 + batch_size = self._current_batch_size + return num_examples // batch_size + int( + num_examples % batch_size > 0 + ) # = math.ceil(num_examples / batch_size ) + + def __getitem__(self, index): + """Gets batch at position `index`. + + Arguments: + index (int): position of the batch in the Sequence. + Returns: Tuple[Any, Any] A batch (tuple of input data and target data). + """ + # return input and target data, as our target data is inside the input + # data return None for the target data + return ( + np.zeros((self._current_batch_size, self.feature_shape)), + np.ones((self._current_batch_size,)), + ) + + def on_epoch_end(self): + """Updates the data after every epoch.""" + self._current_epoch += 1 + if self._current_epoch < self._epochs: + self._current_batch_size = self._linearly_increasing_batch_size() + + def _linearly_increasing_batch_size(self): + """Linearly increase batch size with every epoch. + + The idea comes from https://arxiv.org/abs/1711.00489. + + Returns: int + The batch size to use in this epoch. + """ + if not isinstance(self.batch_size, list): + return int(self.batch_size) + + if self._epochs > 1: + return int( + self.batch_size[0] + + self._current_epoch + * (self.batch_size[1] - self.batch_size[0]) + / (self._epochs - 1) + ) + else: + return int(self.batch_size[0]) class TensorLikeDataAdapterTest(DataAdapterTestBase): - - def setUp(self): - super().setUp() - self.adapter_cls = data_adapter.TensorLikeDataAdapter - - def test_can_handle_numpy(self): - self.assertTrue(self.adapter_cls.can_handle(self.numpy_input)) - self.assertTrue( - self.adapter_cls.can_handle(self.numpy_input, self.numpy_target)) - - self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) - self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) - self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - self.assertFalse(self.adapter_cls.can_handle(self.text_input)) - self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) - - def test_size_numpy(self): - adapter = self.adapter_cls( - self.numpy_input, self.numpy_target, batch_size=5) - self.assertEqual(adapter.get_size(), 10) - self.assertFalse(adapter.has_partial_batch()) - - def test_batch_size_numpy(self): - adapter = self.adapter_cls( - self.numpy_input, self.numpy_target, batch_size=5) - self.assertEqual(adapter.batch_size(), 5) - - def test_partial_batch_numpy(self): - adapter = self.adapter_cls( - self.numpy_input, self.numpy_target, batch_size=4) - self.assertEqual(adapter.get_size(), 13) # 50/4 - self.assertTrue(adapter.has_partial_batch()) - self.assertEqual(adapter.partial_batch_size(), 2) - - def test_epochs(self): - num_epochs = 3 - adapter = self.adapter_cls( - self.numpy_input, self.numpy_target, batch_size=5, epochs=num_epochs) - ds_iter = iter(adapter.get_dataset()) - num_batches_per_epoch = self.numpy_input.shape[0] // 5 - for _ in range(num_batches_per_epoch * num_epochs): - next(ds_iter) - with self.assertRaises(StopIteration): - next(ds_iter) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training_numpy(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.numpy_input, self.numpy_target, batch_size=5) - - def test_can_handle_pandas(self): - try: - import pandas as pd # pylint: disable=g-import-not-at-top - except ImportError: - self.skipTest('Skipping test because pandas 
is not installed.') - self.assertTrue(self.adapter_cls.can_handle(pd.DataFrame(self.numpy_input))) - self.assertTrue( - self.adapter_cls.can_handle(pd.DataFrame(self.numpy_input)[0])) - self.assertTrue( - self.adapter_cls.can_handle( - pd.DataFrame(self.numpy_input), - pd.DataFrame(self.numpy_input)[0])) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training_pandas(self): - try: - import pandas as pd # pylint: disable=g-import-not-at-top - except ImportError: - self.skipTest('Skipping test because pandas is not installed.') - input_a = keras.Input(shape=(3,), name='input_a') - input_b = keras.Input(shape=(3,), name='input_b') - input_c = keras.Input(shape=(1,), name='input_b') - - x = keras.layers.Dense(4, name='dense_1')(input_a) - y = keras.layers.Dense(3, name='dense_2')(input_b) - z = keras.layers.Dense(1, name='dense_3')(input_c) - - model_1 = keras.Model(inputs=input_a, outputs=x) - model_2 = keras.Model(inputs=[input_a, input_b], outputs=[x, y]) - model_3 = keras.Model(inputs=input_c, outputs=z) - - model_1.compile(optimizer='rmsprop', loss='mse') - model_2.compile(optimizer='rmsprop', loss='mse') - model_3.compile(optimizer='rmsprop', loss='mse') - - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - input_a_df = pd.DataFrame(input_a_np) - input_b_df = pd.DataFrame(input_b_np) - - output_a_df = pd.DataFrame(np.random.random((10, 4))) - output_b_df = pd.DataFrame(np.random.random((10, 3))) - output_c_series = pd.DataFrame(np.random.random((10, 4)))[0] - - model_1.fit(input_a_df, - output_a_df) - model_2.fit([input_a_df, input_b_df], - [output_a_df, output_b_df]) - model_3.fit(input_a_df[[0]], - output_c_series) - model_1.fit([input_a_df], - [output_a_df]) - model_1.fit({'input_a': input_a_df}, - output_a_df) - model_2.fit({'input_a': input_a_df, 'input_b': input_b_df}, - [output_a_df, output_b_df]) - - model_1.evaluate(input_a_df, - output_a_df) - model_2.evaluate([input_a_df, input_b_df], - [output_a_df, output_b_df]) - model_3.evaluate(input_a_df[[0]], - output_c_series) - model_1.evaluate([input_a_df], - [output_a_df]) - model_1.evaluate({'input_a': input_a_df}, - output_a_df) - model_2.evaluate({'input_a': input_a_df, 'input_b': input_b_df}, - [output_a_df, output_b_df]) - - # Verify predicting on pandas vs numpy returns the same result - predict_1_pandas = model_1.predict(input_a_df) - predict_2_pandas = model_2.predict([input_a_df, input_b_df]) - predict_3_pandas = model_3.predict(input_a_df[[0]]) - predict_3_pandas_batch = model_3.predict_on_batch(input_a_df[0]) - - predict_1_numpy = model_1.predict(input_a_np) - predict_2_numpy = model_2.predict([input_a_np, input_b_np]) - predict_3_numpy = model_3.predict(np.asarray(input_a_df[0])) - - self.assertAllClose(predict_1_numpy, predict_1_pandas) - self.assertAllClose(predict_2_numpy, predict_2_pandas) - self.assertAllClose(predict_3_numpy, predict_3_pandas_batch) - self.assertAllClose(predict_3_numpy, predict_3_pandas) - - # Extra ways to pass in dataframes - model_1.predict([input_a_df]) - model_1.predict({'input_a': input_a_df}) - model_2.predict({'input_a': input_a_df, 'input_b': input_b_df}) - - def test_can_handle(self): - self.assertTrue(self.adapter_cls.can_handle(self.tensor_input)) - self.assertTrue( - self.adapter_cls.can_handle(self.tensor_input, self.tensor_target)) - - self.assertFalse(self.adapter_cls.can_handle(self.arraylike_input)) - self.assertFalse( - self.adapter_cls.can_handle(self.arraylike_input, - self.arraylike_target)) - 
self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) - self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) - self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - self.assertFalse(self.adapter_cls.can_handle(self.text_input)) - self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.tensor_input, self.tensor_target, batch_size=5) - - def test_size(self): - adapter = self.adapter_cls( - self.tensor_input, self.tensor_target, batch_size=5) - self.assertEqual(adapter.get_size(), 10) - self.assertFalse(adapter.has_partial_batch()) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_shuffle_correctness(self): - num_samples = 100 - batch_size = 32 - x = np.arange(num_samples) - np.random.seed(99) - adapter = self.adapter_cls( - x, y=None, batch_size=batch_size, shuffle=True, epochs=2) - - def _get_epoch(ds_iter): - ds_data = [] - for _ in range(int(math.ceil(num_samples / batch_size))): - ds_data.append(next(ds_iter).numpy()) - return np.concatenate(ds_data) - - ds_iter = iter(adapter.get_dataset()) - - # First epoch. - epoch_data = _get_epoch(ds_iter) - # Check that shuffling occurred. - self.assertNotAllClose(x, epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(epoch_data)) - - # Second epoch. - second_epoch_data = _get_epoch(ds_iter) - # Check that shuffling occurred. - self.assertNotAllClose(x, second_epoch_data) - # Check that shuffling is different across epochs. - self.assertNotAllClose(epoch_data, second_epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(second_epoch_data)) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_batch_shuffle_correctness(self): - num_samples = 100 - batch_size = 6 - x = np.arange(num_samples) - np.random.seed(99) - adapter = self.adapter_cls( - x, y=None, batch_size=batch_size, shuffle='batch', epochs=2) - - def _get_epoch_batches(ds_iter): - ds_data = [] - for _ in range(int(math.ceil(num_samples / batch_size))): - ds_data.append(next(ds_iter)[0].numpy()) - return ds_data - - ds_iter = iter(adapter.get_dataset()) - - # First epoch. - epoch_batch_data = _get_epoch_batches(ds_iter) - epoch_data = np.concatenate(epoch_batch_data) - - def _verify_batch(batch): - # Verify that a batch contains only contiguous data, and that it has - # been shuffled. - shuffled_batch = np.sort(batch) - self.assertNotAllClose(batch, shuffled_batch) - for i in range(1, len(batch)): - self.assertEqual(shuffled_batch[i-1] + 1, shuffled_batch[i]) - - # Assert that the data within each batch remains contiguous - for batch in epoch_batch_data: - _verify_batch(batch) - - # Check that individual batches are unshuffled - # Check that shuffling occurred. - self.assertNotAllClose(x, epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(epoch_data)) - - # Second epoch. - second_epoch_batch_data = _get_epoch_batches(ds_iter) - second_epoch_data = np.concatenate(second_epoch_batch_data) - - # Assert that the data within each batch remains contiguous - for batch in second_epoch_batch_data: - _verify_batch(batch) - - # Check that shuffling occurred. 
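Editor's note: stepping back to `TestBatchSequence` defined earlier, its `_linearly_increasing_batch_size` interpolates between a start and end batch size across epochs (after https://arxiv.org/abs/1711.00489). A standalone sketch of that schedule:

```python
def linear_batch_size(start, end, epoch, epochs):
    # Linear interpolation from `start` (epoch 0) to `end` (last epoch).
    if epochs > 1:
        return int(start + epoch * (end - start) / (epochs - 1))
    return int(start)

print([linear_batch_size(5, 10, e, epochs=2) for e in range(2)])  # [5, 10]
```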
- self.assertNotAllClose(x, second_epoch_data) - # Check that shuffling is different across epochs. - self.assertNotAllClose(epoch_data, second_epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(second_epoch_data)) - - @parameterized.named_parameters( - ('batch_size_5', 5, None, 5), - ('batch_size_50', 50, 4, 50), # Sanity check: batch_size takes precedence - ('steps_1', None, 1, 50), - ('steps_4', None, 4, 13), - ) - def test_batch_size(self, batch_size_in, steps, batch_size_out): - adapter = self.adapter_cls( - self.tensor_input, self.tensor_target, batch_size=batch_size_in, - steps=steps) - self.assertEqual(adapter.batch_size(), batch_size_out) - - @parameterized.named_parameters( - ('batch_size_5', 5, None, 10, 0), - ('batch_size_4', 4, None, 13, 2), - ('steps_1', None, 1, 1, 0), - ('steps_5', None, 5, 5, 0), - ('steps_4', None, 4, 4, 11), - ) - def test_partial_batch( - self, batch_size_in, steps, size, partial_batch_size): - adapter = self.adapter_cls( - self.tensor_input, self.tensor_target, batch_size=batch_size_in, - steps=steps) - self.assertEqual(adapter.get_size(), size) # 50/steps - self.assertEqual(adapter.has_partial_batch(), bool(partial_batch_size)) - self.assertEqual(adapter.partial_batch_size(), partial_batch_size or None) + def setUp(self): + super().setUp() + self.adapter_cls = data_adapter.TensorLikeDataAdapter + + def test_can_handle_numpy(self): + self.assertTrue(self.adapter_cls.can_handle(self.numpy_input)) + self.assertTrue( + self.adapter_cls.can_handle(self.numpy_input, self.numpy_target) + ) + + self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) + self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) + self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) + self.assertFalse(self.adapter_cls.can_handle(self.text_input)) + self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) + + def test_size_numpy(self): + adapter = self.adapter_cls( + self.numpy_input, self.numpy_target, batch_size=5 + ) + self.assertEqual(adapter.get_size(), 10) + self.assertFalse(adapter.has_partial_batch()) + + def test_batch_size_numpy(self): + adapter = self.adapter_cls( + self.numpy_input, self.numpy_target, batch_size=5 + ) + self.assertEqual(adapter.batch_size(), 5) + + def test_partial_batch_numpy(self): + adapter = self.adapter_cls( + self.numpy_input, self.numpy_target, batch_size=4 + ) + self.assertEqual(adapter.get_size(), 13) # 50/4 + self.assertTrue(adapter.has_partial_batch()) + self.assertEqual(adapter.partial_batch_size(), 2) + + def test_epochs(self): + num_epochs = 3 + adapter = self.adapter_cls( + self.numpy_input, self.numpy_target, batch_size=5, epochs=num_epochs + ) + ds_iter = iter(adapter.get_dataset()) + num_batches_per_epoch = self.numpy_input.shape[0] // 5 + for _ in range(num_batches_per_epoch * num_epochs): + next(ds_iter) + with self.assertRaises(StopIteration): + next(ds_iter) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training_numpy(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit(self.numpy_input, self.numpy_target, batch_size=5) + + def test_can_handle_pandas(self): + try: + import pandas as pd + except ImportError: + self.skipTest("Skipping test because pandas is not installed.") + self.assertTrue( + self.adapter_cls.can_handle(pd.DataFrame(self.numpy_input)) + ) + self.assertTrue( + 
self.adapter_cls.can_handle(pd.DataFrame(self.numpy_input)[0]) + ) + self.assertTrue( + self.adapter_cls.can_handle( + pd.DataFrame(self.numpy_input), + pd.DataFrame(self.numpy_input)[0], + ) + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training_pandas(self): + try: + import pandas as pd + except ImportError: + self.skipTest("Skipping test because pandas is not installed.") + input_a = keras.Input(shape=(3,), name="input_a") + input_b = keras.Input(shape=(3,), name="input_b") + input_c = keras.Input(shape=(1,), name="input_b") + + x = keras.layers.Dense(4, name="dense_1")(input_a) + y = keras.layers.Dense(3, name="dense_2")(input_b) + z = keras.layers.Dense(1, name="dense_3")(input_c) + + model_1 = keras.Model(inputs=input_a, outputs=x) + model_2 = keras.Model(inputs=[input_a, input_b], outputs=[x, y]) + model_3 = keras.Model(inputs=input_c, outputs=z) + + model_1.compile(optimizer="rmsprop", loss="mse") + model_2.compile(optimizer="rmsprop", loss="mse") + model_3.compile(optimizer="rmsprop", loss="mse") + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + input_a_df = pd.DataFrame(input_a_np) + input_b_df = pd.DataFrame(input_b_np) + + output_a_df = pd.DataFrame(np.random.random((10, 4))) + output_b_df = pd.DataFrame(np.random.random((10, 3))) + output_c_series = pd.DataFrame(np.random.random((10, 4)))[0] + + model_1.fit(input_a_df, output_a_df) + model_2.fit([input_a_df, input_b_df], [output_a_df, output_b_df]) + model_3.fit(input_a_df[[0]], output_c_series) + model_1.fit([input_a_df], [output_a_df]) + model_1.fit({"input_a": input_a_df}, output_a_df) + model_2.fit( + {"input_a": input_a_df, "input_b": input_b_df}, + [output_a_df, output_b_df], + ) + + model_1.evaluate(input_a_df, output_a_df) + model_2.evaluate([input_a_df, input_b_df], [output_a_df, output_b_df]) + model_3.evaluate(input_a_df[[0]], output_c_series) + model_1.evaluate([input_a_df], [output_a_df]) + model_1.evaluate({"input_a": input_a_df}, output_a_df) + model_2.evaluate( + {"input_a": input_a_df, "input_b": input_b_df}, + [output_a_df, output_b_df], + ) + + # Verify predicting on pandas vs numpy returns the same result + predict_1_pandas = model_1.predict(input_a_df) + predict_2_pandas = model_2.predict([input_a_df, input_b_df]) + predict_3_pandas = model_3.predict(input_a_df[[0]]) + predict_3_pandas_batch = model_3.predict_on_batch(input_a_df[0]) + + predict_1_numpy = model_1.predict(input_a_np) + predict_2_numpy = model_2.predict([input_a_np, input_b_np]) + predict_3_numpy = model_3.predict(np.asarray(input_a_df[0])) + + self.assertAllClose(predict_1_numpy, predict_1_pandas) + self.assertAllClose(predict_2_numpy, predict_2_pandas) + self.assertAllClose(predict_3_numpy, predict_3_pandas_batch) + self.assertAllClose(predict_3_numpy, predict_3_pandas) + + # Extra ways to pass in dataframes + model_1.predict([input_a_df]) + model_1.predict({"input_a": input_a_df}) + model_2.predict({"input_a": input_a_df, "input_b": input_b_df}) + + def test_can_handle(self): + self.assertTrue(self.adapter_cls.can_handle(self.tensor_input)) + self.assertTrue( + self.adapter_cls.can_handle(self.tensor_input, self.tensor_target) + ) + + self.assertFalse(self.adapter_cls.can_handle(self.arraylike_input)) + self.assertFalse( + self.adapter_cls.can_handle( + self.arraylike_input, self.arraylike_target + ) + ) + self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) + self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) + 
self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) + self.assertFalse(self.adapter_cls.can_handle(self.text_input)) + self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit(self.tensor_input, self.tensor_target, batch_size=5) + + def test_size(self): + adapter = self.adapter_cls( + self.tensor_input, self.tensor_target, batch_size=5 + ) + self.assertEqual(adapter.get_size(), 10) + self.assertFalse(adapter.has_partial_batch()) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_shuffle_correctness(self): + num_samples = 100 + batch_size = 32 + x = np.arange(num_samples) + np.random.seed(99) + adapter = self.adapter_cls( + x, y=None, batch_size=batch_size, shuffle=True, epochs=2 + ) + + def _get_epoch(ds_iter): + ds_data = [] + for _ in range(int(math.ceil(num_samples / batch_size))): + ds_data.append(next(ds_iter).numpy()) + return np.concatenate(ds_data) + + ds_iter = iter(adapter.get_dataset()) + + # First epoch. + epoch_data = _get_epoch(ds_iter) + # Check that shuffling occurred. + self.assertNotAllClose(x, epoch_data) + # Check that each elements appears, and only once. + self.assertAllClose(x, np.sort(epoch_data)) + + # Second epoch. + second_epoch_data = _get_epoch(ds_iter) + # Check that shuffling occurred. + self.assertNotAllClose(x, second_epoch_data) + # Check that shuffling is different across epochs. + self.assertNotAllClose(epoch_data, second_epoch_data) + # Check that each elements appears, and only once. + self.assertAllClose(x, np.sort(second_epoch_data)) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_batch_shuffle_correctness(self): + num_samples = 100 + batch_size = 6 + x = np.arange(num_samples) + np.random.seed(99) + adapter = self.adapter_cls( + x, y=None, batch_size=batch_size, shuffle="batch", epochs=2 + ) + + def _get_epoch_batches(ds_iter): + ds_data = [] + for _ in range(int(math.ceil(num_samples / batch_size))): + ds_data.append(next(ds_iter)[0].numpy()) + return ds_data + + ds_iter = iter(adapter.get_dataset()) + + # First epoch. + epoch_batch_data = _get_epoch_batches(ds_iter) + epoch_data = np.concatenate(epoch_batch_data) + + def _verify_batch(batch): + # Verify that a batch contains only contiguous data, and that it has + # been shuffled. + shuffled_batch = np.sort(batch) + self.assertNotAllClose(batch, shuffled_batch) + for i in range(1, len(batch)): + self.assertEqual(shuffled_batch[i - 1] + 1, shuffled_batch[i]) + + # Assert that the data within each batch remains contiguous + for batch in epoch_batch_data: + _verify_batch(batch) + + # Check that individual batches are unshuffled + # Check that shuffling occurred. + self.assertNotAllClose(x, epoch_data) + # Check that each elements appears, and only once. + self.assertAllClose(x, np.sort(epoch_data)) + + # Second epoch. + second_epoch_batch_data = _get_epoch_batches(ds_iter) + second_epoch_data = np.concatenate(second_epoch_batch_data) + + # Assert that the data within each batch remains contiguous + for batch in second_epoch_batch_data: + _verify_batch(batch) + + # Check that shuffling occurred. + self.assertNotAllClose(x, second_epoch_data) + # Check that shuffling is different across epochs. 
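Editor's note: the `shuffle="batch"` assertions in these tests encode the contract that every emitted batch holds a contiguous run of samples, shuffled within the batch, and that the overall epoch order differs from the input. A NumPy sketch of one permutation satisfying those assertions (not necessarily the exact permutation Keras draws), assuming `batch_size` divides `num_samples`:

```python
import numpy as np

num_samples, batch_size = 12, 3
index = np.arange(num_samples).reshape(-1, batch_size)  # contiguous blocks
np.random.shuffle(index)      # shuffle the order of the blocks...
for row in index:
    np.random.shuffle(row)    # ...and the order within each block
print(index.reshape(-1))      # e.g. [ 7  6  8  2  0  1 10 11  9  4  3  5]
```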
+ self.assertNotAllClose(epoch_data, second_epoch_data) + # Check that each elements appears, and only once. + self.assertAllClose(x, np.sort(second_epoch_data)) + + @parameterized.named_parameters( + ("batch_size_5", 5, None, 5), + ( + "batch_size_50", + 50, + 4, + 50, + ), # Sanity check: batch_size takes precedence + ("steps_1", None, 1, 50), + ("steps_4", None, 4, 13), + ) + def test_batch_size(self, batch_size_in, steps, batch_size_out): + adapter = self.adapter_cls( + self.tensor_input, + self.tensor_target, + batch_size=batch_size_in, + steps=steps, + ) + self.assertEqual(adapter.batch_size(), batch_size_out) + + @parameterized.named_parameters( + ("batch_size_5", 5, None, 10, 0), + ("batch_size_4", 4, None, 13, 2), + ("steps_1", None, 1, 1, 0), + ("steps_5", None, 5, 5, 0), + ("steps_4", None, 4, 4, 11), + ) + def test_partial_batch( + self, batch_size_in, steps, size, partial_batch_size + ): + adapter = self.adapter_cls( + self.tensor_input, + self.tensor_target, + batch_size=batch_size_in, + steps=steps, + ) + self.assertEqual(adapter.get_size(), size) # 50/steps + self.assertEqual(adapter.has_partial_batch(), bool(partial_batch_size)) + self.assertEqual( + adapter.partial_batch_size(), partial_batch_size or None + ) class IncreasingBatchSizeAdapterTest(test_combinations.TestCase): + def setUp(self): + super(IncreasingBatchSizeAdapterTest, self).setUp() + self.adapter_cls = data_adapter.KerasSequenceAdapter + + self.epochs = 2 + self.increasing_batch_size = [5, 10] + self.sequence_input = TestBatchSequence( + batch_size=self.increasing_batch_size, + feature_shape=10, + epochs=self.epochs, + ) + self.model = keras.models.Sequential( + [keras.layers.Dense(8, input_shape=(10,), activation="softmax")] + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training_with_test_batch_sequence(self): + """Ensures TestBatchSequence works as expected.""" + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + + # Check state before fit() + self.assertEqual(self.sequence_input._current_epoch, 0) + self.assertEqual(self.sequence_input._current_batch_size, 5) + + # Execute fit() + self.model.fit(self.sequence_input, epochs=self.epochs) + + # Check state after fit() + self.assertEqual(self.sequence_input._current_epoch, 2) + self.assertEqual(self.sequence_input._current_batch_size, 10) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training_with_increasing_batch_size(self): + """Ensures data_adapters DataHandler & DataAdapter work as expected.""" + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.stop_training = False + self.model.train_function = self.model.make_train_function() + + # Check state before fit() + self.assertEqual(self.sequence_input._current_epoch, 0) + self.assertEqual(self.sequence_input._current_batch_size, 5) + data_handler = data_adapter.get_data_handler( + self.sequence_input, + epochs=self.epochs, + model=self.model, + ) + self.assertEqual( + data_handler.inferred_steps, 4 + ) # 20 samples / 5 bs = 4 + + # Execute fit()-loop + for epoch, iterator in data_handler.enumerate_epochs(): + self.model.reset_metrics() + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + with tf.profiler.experimental.Trace( + "train", + epoch_num=epoch, + step_num=step, + batch_size=self.sequence_input._current_batch_size, + _r=1, + ): + if 
data_handler.should_sync: + context.async_wait() + if self.model.stop_training: + break + + # Check state after fit() + self.assertEqual( + data_handler.inferred_steps, 2 + ) # 20 samples / 10 bs = 2 - def setUp(self): - super(IncreasingBatchSizeAdapterTest, self).setUp() - self.adapter_cls = data_adapter.KerasSequenceAdapter - self.epochs = 2 - self.increasing_batch_size = [5, 10] - self.sequence_input = TestBatchSequence( - batch_size=self.increasing_batch_size, - feature_shape=10, - epochs=self.epochs, +class GenericArrayLikeDataAdapterTest(DataAdapterTestBase): + def setUp(self): + super().setUp() + self.adapter_cls = data_adapter.GenericArrayLikeDataAdapter + + def test_can_handle_some_numpy(self): + self.assertTrue(self.adapter_cls.can_handle(self.arraylike_input)) + self.assertTrue( + self.adapter_cls.can_handle( + self.arraylike_input, self.arraylike_target + ) + ) + + # Because adapters are mutually exclusive, don't handle cases + # where all the data is numpy or an eagertensor + self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) + self.assertFalse( + self.adapter_cls.can_handle(self.numpy_input, self.numpy_target) + ) + self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) + self.assertFalse( + self.adapter_cls.can_handle(self.tensor_input, self.tensor_target) + ) + + # But do handle mixes that include generic arraylike data + self.assertTrue( + self.adapter_cls.can_handle(self.numpy_input, self.arraylike_target) + ) + self.assertTrue( + self.adapter_cls.can_handle(self.arraylike_input, self.numpy_target) + ) + self.assertTrue( + self.adapter_cls.can_handle( + self.arraylike_input, self.tensor_target + ) + ) + self.assertTrue( + self.adapter_cls.can_handle( + self.tensor_input, self.arraylike_target + ) + ) + + self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) + self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) + self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) + self.assertFalse(self.adapter_cls.can_handle(self.text_input)) + self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) + + def test_size(self): + adapter = self.adapter_cls( + self.arraylike_input, self.arraylike_target, batch_size=5 + ) + self.assertEqual(adapter.get_size(), 10) + self.assertFalse(adapter.has_partial_batch()) + + def test_epochs(self): + num_epochs = 3 + adapter = self.adapter_cls( + self.arraylike_input, + self.numpy_target, + batch_size=5, + epochs=num_epochs, + ) + ds_iter = iter(adapter.get_dataset()) + num_batches_per_epoch = self.numpy_input.shape[0] // 5 + for _ in range(num_batches_per_epoch * num_epochs): + next(ds_iter) + with self.assertRaises(StopIteration): + next(ds_iter) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training(self): + # First verify that DummyArrayLike can't be converted to a Tensor + with self.assertRaises(TypeError): + tf.convert_to_tensor(self.arraylike_input) + + # Then train on the array like. + # It should not be converted to a tensor directly (which would force it + # into memory), only the sliced data should be converted. 
+ self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit( + self.arraylike_input, self.arraylike_target, batch_size=5 + ) + self.model.fit( + self.arraylike_input, + self.arraylike_target, + shuffle=True, + batch_size=5, + ) + self.model.fit( + self.arraylike_input, + self.arraylike_target, + shuffle="batch", + batch_size=5, + ) + self.model.evaluate( + self.arraylike_input, self.arraylike_target, batch_size=5 + ) + self.model.predict(self.arraylike_input, batch_size=5) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training_numpy_target(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit(self.arraylike_input, self.numpy_target, batch_size=5) + self.model.fit( + self.arraylike_input, self.numpy_target, shuffle=True, batch_size=5 + ) + self.model.fit( + self.arraylike_input, + self.numpy_target, + shuffle="batch", + batch_size=5, + ) + self.model.evaluate( + self.arraylike_input, self.numpy_target, batch_size=5 + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training_tensor_target(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit(self.arraylike_input, self.tensor_target, batch_size=5) + self.model.fit( + self.arraylike_input, self.tensor_target, shuffle=True, batch_size=5 + ) + self.model.fit( + self.arraylike_input, + self.tensor_target, + shuffle="batch", + batch_size=5, + ) + self.model.evaluate( + self.arraylike_input, self.tensor_target, batch_size=5 + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_shuffle_correctness(self): + num_samples = 100 + batch_size = 32 + x = DummyArrayLike(np.arange(num_samples)) + np.random.seed(99) + adapter = self.adapter_cls( + x, y=None, batch_size=batch_size, shuffle=True, epochs=2 + ) + + def _get_epoch(ds_iter): + ds_data = [] + for _ in range(int(math.ceil(num_samples / batch_size))): + ds_data.append(next(ds_iter).numpy()) + return np.concatenate(ds_data) + + ds_iter = iter(adapter.get_dataset()) + + # First epoch. + epoch_data = _get_epoch(ds_iter) + # Check that shuffling occurred. + self.assertNotAllClose(x, epoch_data) + # Check that each elements appears, and only once. + self.assertAllClose(x, np.sort(epoch_data)) + + # Second epoch. + second_epoch_data = _get_epoch(ds_iter) + # Check that shuffling occurred. + self.assertNotAllClose(x, second_epoch_data) + # Check that shuffling is different across epochs. + self.assertNotAllClose(epoch_data, second_epoch_data) + # Check that each elements appears, and only once. + self.assertAllClose(x, np.sort(second_epoch_data)) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_batch_shuffle_correctness(self): + num_samples = 100 + batch_size = 6 + x = DummyArrayLike(np.arange(num_samples)) + np.random.seed(99) + adapter = self.adapter_cls( + x, y=None, batch_size=batch_size, shuffle="batch", epochs=2 + ) + + def _get_epoch_batches(ds_iter): + ds_data = [] + for _ in range(int(math.ceil(num_samples / batch_size))): + ds_data.append(next(ds_iter)[0].numpy()) + return ds_data + + ds_iter = iter(adapter.get_dataset()) + + # First epoch. 
+ epoch_batch_data = _get_epoch_batches(ds_iter) + epoch_data = np.concatenate(epoch_batch_data) + + def _verify_batch(batch): + # Verify that a batch contains only contiguous data, but that it has + # been shuffled. + shuffled_batch = np.sort(batch) + self.assertNotAllClose(batch, shuffled_batch) + for i in range(1, len(batch)): + self.assertEqual(shuffled_batch[i - 1] + 1, shuffled_batch[i]) + + # Assert that the data within each batch is shuffled contiguous data + for batch in epoch_batch_data: + _verify_batch(batch) + + # Check that the epoch as a whole was shuffled. + self.assertNotAllClose(x, epoch_data) + # Check that each element appears once and only once. + self.assertAllClose(x, np.sort(epoch_data)) + + # Second epoch. + second_epoch_batch_data = _get_epoch_batches(ds_iter) + second_epoch_data = np.concatenate(second_epoch_batch_data) + + # Assert that the data within each batch remains contiguous + for batch in second_epoch_batch_data: + _verify_batch(batch) + + # Check that shuffling occurred. + self.assertNotAllClose(x, second_epoch_data) + # Check that shuffling is different across epochs. + self.assertNotAllClose(epoch_data, second_epoch_data) + # Check that each element appears once and only once. + self.assertAllClose(x, np.sort(second_epoch_data)) + + @parameterized.named_parameters( + ("batch_size_5", 5, None, 5), + ( + "batch_size_50", + 50, + 4, + 50, + ), # Sanity check: batch_size takes precedence + ("steps_1", None, 1, 50), + ("steps_4", None, 4, 13), ) - self.model = keras.models.Sequential( - [keras.layers.Dense(8, input_shape=(10,), activation='softmax')]) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training_with_test_batch_sequence(self): - """Ensures TestBatchSequence works as expected.""" - self.model.compile( - loss='sparse_categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - - # Check state before fit() - self.assertEqual(self.sequence_input._current_epoch, 0) - self.assertEqual(self.sequence_input._current_batch_size, 5) - - # Execute fit() - self.model.fit(self.sequence_input, epochs=self.epochs) - - # Check state after fit() - self.assertEqual(self.sequence_input._current_epoch, 2) - self.assertEqual(self.sequence_input._current_batch_size, 10) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training_with_increasing_batch_size(self): - """Ensures data_adapters DataHandler & DataAdapter work as expected.""" - self.model.compile( - loss='sparse_categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.stop_training = False - self.model.train_function = self.model.make_train_function() - - # Check state before fit() - self.assertEqual(self.sequence_input._current_epoch, 0) - self.assertEqual(self.sequence_input._current_batch_size, 5) - data_handler = data_adapter.get_data_handler( - self.sequence_input, - epochs=self.epochs, - model=self.model, + def test_batch_size(self, batch_size_in, steps, batch_size_out): + adapter = self.adapter_cls( + self.arraylike_input, + self.arraylike_target, + batch_size=batch_size_in, + steps=steps, + ) + self.assertEqual(adapter.batch_size(), batch_size_out) + + @parameterized.named_parameters( + ("batch_size_5", 5, None, 10, 0), + ("batch_size_4", 4, None, 13, 2), + ("steps_1", None, 1, 1, 0), + ("steps_5", None, 5, 5, 0), + ("steps_4", None, 4, 4, 11), ) - self.assertEqual(data_handler.inferred_steps, 4) # 20 samples / 5 bs = 4 - - # Execute
fit()-loop - for epoch, iterator in data_handler.enumerate_epochs(): - self.model.reset_metrics() - with data_handler.catch_stop_iteration(): - for step in data_handler.steps(): - with tf.profiler.experimental.Trace( - 'train', - epoch_num=epoch, - step_num=step, - batch_size=self.sequence_input._current_batch_size, - _r=1, - ): - if data_handler.should_sync: - context.async_wait() - if self.model.stop_training: - break - - # Check state after fit() - self.assertEqual(data_handler.inferred_steps, 2) # 20 samples / 10 bs = 2 - - -class GenericArrayLikeDataAdapterTest(DataAdapterTestBase): - - def setUp(self): - super().setUp() - self.adapter_cls = data_adapter.GenericArrayLikeDataAdapter - - def test_can_handle_some_numpy(self): - self.assertTrue(self.adapter_cls.can_handle( - self.arraylike_input)) - self.assertTrue( - self.adapter_cls.can_handle(self.arraylike_input, - self.arraylike_target)) - - # Because adapters are mutually exclusive, don't handle cases - # where all the data is numpy or an eagertensor - self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) - self.assertFalse( - self.adapter_cls.can_handle(self.numpy_input, - self.numpy_target)) - self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) - self.assertFalse( - self.adapter_cls.can_handle(self.tensor_input, self.tensor_target)) - - # But do handle mixes that include generic arraylike data - self.assertTrue( - self.adapter_cls.can_handle(self.numpy_input, - self.arraylike_target)) - self.assertTrue( - self.adapter_cls.can_handle(self.arraylike_input, - self.numpy_target)) - self.assertTrue( - self.adapter_cls.can_handle(self.arraylike_input, - self.tensor_target)) - self.assertTrue( - self.adapter_cls.can_handle(self.tensor_input, - self.arraylike_target)) - - self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) - self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) - self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - self.assertFalse(self.adapter_cls.can_handle(self.text_input)) - self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) - - def test_size(self): - adapter = self.adapter_cls( - self.arraylike_input, - self.arraylike_target, batch_size=5) - self.assertEqual(adapter.get_size(), 10) - self.assertFalse(adapter.has_partial_batch()) - - def test_epochs(self): - num_epochs = 3 - adapter = self.adapter_cls( - self.arraylike_input, - self.numpy_target, batch_size=5, epochs=num_epochs) - ds_iter = iter(adapter.get_dataset()) - num_batches_per_epoch = self.numpy_input.shape[0] // 5 - for _ in range(num_batches_per_epoch * num_epochs): - next(ds_iter) - with self.assertRaises(StopIteration): - next(ds_iter) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training(self): - # First verify that DummyArrayLike can't be converted to a Tensor - with self.assertRaises(TypeError): - tf.convert_to_tensor(self.arraylike_input) - - # Then train on the array like. - # It should not be converted to a tensor directly (which would force it into - # memory), only the sliced data should be converted. 
- self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.arraylike_input, - self.arraylike_target, batch_size=5) - self.model.fit(self.arraylike_input, - self.arraylike_target, - shuffle=True, batch_size=5) - self.model.fit(self.arraylike_input, - self.arraylike_target, - shuffle='batch', batch_size=5) - self.model.evaluate(self.arraylike_input, - self.arraylike_target, batch_size=5) - self.model.predict(self.arraylike_input, batch_size=5) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training_numpy_target(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.arraylike_input, - self.numpy_target, batch_size=5) - self.model.fit(self.arraylike_input, - self.numpy_target, shuffle=True, - batch_size=5) - self.model.fit(self.arraylike_input, - self.numpy_target, shuffle='batch', - batch_size=5) - self.model.evaluate(self.arraylike_input, - self.numpy_target, batch_size=5) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training_tensor_target(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.arraylike_input, - self.tensor_target, batch_size=5) - self.model.fit(self.arraylike_input, - self.tensor_target, shuffle=True, - batch_size=5) - self.model.fit(self.arraylike_input, - self.tensor_target, shuffle='batch', - batch_size=5) - self.model.evaluate(self.arraylike_input, - self.tensor_target, batch_size=5) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_shuffle_correctness(self): - num_samples = 100 - batch_size = 32 - x = DummyArrayLike(np.arange(num_samples)) - np.random.seed(99) - adapter = self.adapter_cls( - x, y=None, batch_size=batch_size, shuffle=True, epochs=2) - - def _get_epoch(ds_iter): - ds_data = [] - for _ in range(int(math.ceil(num_samples / batch_size))): - ds_data.append(next(ds_iter).numpy()) - return np.concatenate(ds_data) - - ds_iter = iter(adapter.get_dataset()) - - # First epoch. - epoch_data = _get_epoch(ds_iter) - # Check that shuffling occurred. - self.assertNotAllClose(x, epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(epoch_data)) - - # Second epoch. - second_epoch_data = _get_epoch(ds_iter) - # Check that shuffling occurred. - self.assertNotAllClose(x, second_epoch_data) - # Check that shuffling is different across epochs. - self.assertNotAllClose(epoch_data, second_epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(second_epoch_data)) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_batch_shuffle_correctness(self): - num_samples = 100 - batch_size = 6 - x = DummyArrayLike(np.arange(num_samples)) - np.random.seed(99) - adapter = self.adapter_cls( - x, y=None, batch_size=batch_size, shuffle='batch', epochs=2) - - def _get_epoch_batches(ds_iter): - ds_data = [] - for _ in range(int(math.ceil(num_samples / batch_size))): - ds_data.append(next(ds_iter)[0].numpy()) - return ds_data - - ds_iter = iter(adapter.get_dataset()) - - # First epoch. - epoch_batch_data = _get_epoch_batches(ds_iter) - epoch_data = np.concatenate(epoch_batch_data) - - def _verify_batch(batch): - # Verify that a batch contains only contiguous data, but that it has - # been shuffled. 
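The docstring above compresses the shuffle="batch" contract: every batch is a contiguous block of the source array whose internal order has been permuted, and the blocks themselves also appear in shuffled order over the epoch. The per-batch property as a standalone check (illustrative only, assuming 1-D np.arange-style data as in these tests):

import numpy as np

def is_shuffled_contiguous_block(batch):
    # A contiguous block sorts to consecutive values; a shuffled one
    # differs from its own sorted order.
    ordered = np.sort(batch)
    consecutive = bool((np.diff(ordered) == 1).all())
    return consecutive and not np.array_equal(batch, ordered)

assert is_shuffled_contiguous_block(np.array([12, 10, 13, 11]))
assert not is_shuffled_contiguous_block(np.array([3, 1, 9, 2]))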
- shuffled_batch = np.sort(batch) - self.assertNotAllClose(batch, shuffled_batch) - for i in range(1, len(batch)): - self.assertEqual(shuffled_batch[i-1] + 1, shuffled_batch[i]) - - # Assert that the data within each batch is shuffled contiguous data - for batch in epoch_batch_data: - _verify_batch(batch) - - # Check that individual batches are unshuffled - # Check that shuffling occurred. - self.assertNotAllClose(x, epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(epoch_data)) - - # Second epoch. - second_epoch_batch_data = _get_epoch_batches(ds_iter) - second_epoch_data = np.concatenate(second_epoch_batch_data) - - # Assert that the data within each batch remains contiguous - for batch in second_epoch_batch_data: - _verify_batch(batch) - - # Check that shuffling occurred. - self.assertNotAllClose(x, second_epoch_data) - # Check that shuffling is different across epochs. - self.assertNotAllClose(epoch_data, second_epoch_data) - # Check that each elements appears, and only once. - self.assertAllClose(x, np.sort(second_epoch_data)) - - @parameterized.named_parameters( - ('batch_size_5', 5, None, 5), - ('batch_size_50', 50, 4, 50), # Sanity check: batch_size takes precedence - ('steps_1', None, 1, 50), - ('steps_4', None, 4, 13), - ) - def test_batch_size(self, batch_size_in, steps, batch_size_out): - adapter = self.adapter_cls( - self.arraylike_input, - self.arraylike_target, batch_size=batch_size_in, - steps=steps) - self.assertEqual(adapter.batch_size(), batch_size_out) - - @parameterized.named_parameters( - ('batch_size_5', 5, None, 10, 0), - ('batch_size_4', 4, None, 13, 2), - ('steps_1', None, 1, 1, 0), - ('steps_5', None, 5, 5, 0), - ('steps_4', None, 4, 4, 11), - ) - def test_partial_batch( - self, batch_size_in, steps, size, partial_batch_size): - adapter = self.adapter_cls( - self.arraylike_input, self.arraylike_target, - batch_size=batch_size_in, - steps=steps) - self.assertEqual(adapter.get_size(), size) # 50/steps - self.assertEqual(adapter.has_partial_batch(), bool(partial_batch_size)) - self.assertEqual(adapter.partial_batch_size(), partial_batch_size or None) + def test_partial_batch( + self, batch_size_in, steps, size, partial_batch_size + ): + adapter = self.adapter_cls( + self.arraylike_input, + self.arraylike_target, + batch_size=batch_size_in, + steps=steps, + ) + self.assertEqual(adapter.get_size(), size) # 50/steps + self.assertEqual(adapter.has_partial_batch(), bool(partial_batch_size)) + self.assertEqual( + adapter.partial_batch_size(), partial_batch_size or None + ) class DatasetAdapterTest(DataAdapterTestBase): - - def setUp(self): - super().setUp() - self.adapter_cls = data_adapter.DatasetAdapter - - def test_can_handle(self): - self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) - self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) - self.assertTrue(self.adapter_cls.can_handle(self.dataset_input)) - self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) - self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training(self): - dataset = self.adapter_cls(self.dataset_input).get_dataset() - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(dataset) - - def test_size(self): - adapter = self.adapter_cls(self.dataset_input) - self.assertIsNone(adapter.get_size()) - - def test_batch_size(self): - adapter = 
self.adapter_cls(self.dataset_input) - self.assertIsNone(adapter.batch_size()) - - def test_partial_batch(self): - adapter = self.adapter_cls(self.dataset_input) - self.assertFalse(adapter.has_partial_batch()) - self.assertIsNone(adapter.partial_batch_size()) - - def test_invalid_targets_argument(self): - with self.assertRaisesRegex(ValueError, r'`y` argument is not supported'): - self.adapter_cls(self.dataset_input, y=self.dataset_input) - - def test_invalid_sample_weights_argument(self): - with self.assertRaisesRegex(ValueError, - r'`sample_weight` argument is not supported'): - self.adapter_cls(self.dataset_input, sample_weights=self.dataset_input) + def setUp(self): + super().setUp() + self.adapter_cls = data_adapter.DatasetAdapter + + def test_can_handle(self): + self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) + self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) + self.assertTrue(self.adapter_cls.can_handle(self.dataset_input)) + self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) + self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training(self): + dataset = self.adapter_cls(self.dataset_input).get_dataset() + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit(dataset) + + def test_size(self): + adapter = self.adapter_cls(self.dataset_input) + self.assertIsNone(adapter.get_size()) + + def test_batch_size(self): + adapter = self.adapter_cls(self.dataset_input) + self.assertIsNone(adapter.batch_size()) + + def test_partial_batch(self): + adapter = self.adapter_cls(self.dataset_input) + self.assertFalse(adapter.has_partial_batch()) + self.assertIsNone(adapter.partial_batch_size()) + + def test_invalid_targets_argument(self): + with self.assertRaisesRegex( + ValueError, r"`y` argument is not supported" + ): + self.adapter_cls(self.dataset_input, y=self.dataset_input) + + def test_invalid_sample_weights_argument(self): + with self.assertRaisesRegex( + ValueError, r"`sample_weight` argument is not supported" + ): + self.adapter_cls( + self.dataset_input, sample_weights=self.dataset_input + ) class GeneratorDataAdapterTest(DataAdapterTestBase): - - def setUp(self): - super().setUp() - self.adapter_cls = data_adapter.GeneratorDataAdapter - - def test_can_handle(self): - self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) - self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) - self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) - self.assertTrue(self.adapter_cls.can_handle(self.generator_input)) - self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - self.assertFalse(self.adapter_cls.can_handle(self.text_input)) - self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.generator_input, steps_per_epoch=10) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @test_utils.run_v2_only - @data_utils.dont_use_multiprocessing_pool - def test_with_multiprocessing_training(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.iterator_input, workers=1, 
use_multiprocessing=True, - max_queue_size=10, steps_per_epoch=10) - # Fit twice to ensure there isn't any duplication that prevent the worker - # from starting. - self.model.fit(self.iterator_input, workers=1, use_multiprocessing=True, - max_queue_size=10, steps_per_epoch=10) - - def test_size(self): - adapter = self.adapter_cls(self.generator_input) - self.assertIsNone(adapter.get_size()) - - def test_batch_size(self): - adapter = self.adapter_cls(self.generator_input) - self.assertEqual(adapter.batch_size(), None) - self.assertEqual(adapter.representative_batch_size(), 5) - - def test_partial_batch(self): - adapter = self.adapter_cls(self.generator_input) - self.assertFalse(adapter.has_partial_batch()) - self.assertIsNone(adapter.partial_batch_size()) - - def test_invalid_targets_argument(self): - with self.assertRaisesRegex(ValueError, r'`y` argument is not supported'): - self.adapter_cls(self.generator_input, y=self.generator_input) - - def test_invalid_sample_weights_argument(self): - with self.assertRaisesRegex(ValueError, - r'`sample_weight` argument is not supported'): - self.adapter_cls( - self.generator_input, sample_weights=self.generator_input) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_not_shuffled(self): - def generator(): - for i in range(10): - yield np.ones((1, 1)) * i - - adapter = self.adapter_cls(generator(), shuffle=True) - for i, data in enumerate(adapter.get_dataset()): - self.assertEqual(i, data[0].numpy().flatten()) - - def test_model_without_forward_pass(self): - - class MyModel(keras.Model): - - def train_step(self, data): - return {'loss': 0.} - - def test_step(self, data): - return {'loss': 0.} - - model = MyModel() - model.compile('rmsprop') - model.fit(self.generator_input, steps_per_epoch=5) - out = model.evaluate(self.generator_input, steps=5) - self.assertEqual(out, 0) + def setUp(self): + super().setUp() + self.adapter_cls = data_adapter.GeneratorDataAdapter + + def test_can_handle(self): + self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) + self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) + self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) + self.assertTrue(self.adapter_cls.can_handle(self.generator_input)) + self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) + self.assertFalse(self.adapter_cls.can_handle(self.text_input)) + self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit(self.generator_input, steps_per_epoch=10) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @test_utils.run_v2_only + @data_utils.dont_use_multiprocessing_pool + def test_with_multiprocessing_training(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit( + self.iterator_input, + workers=1, + use_multiprocessing=True, + max_queue_size=10, + steps_per_epoch=10, + ) + # Fit twice to ensure there isn't any duplication that prevent the + # worker from starting. 
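For reference, the user-facing shape of generator input is small. A minimal sketch (hypothetical model and data shapes; steps_per_epoch is mandatory because a plain Python generator exposes no length):

import numpy as np
from tensorflow import keras

def batch_generator():
    # Yields (inputs, targets) batches forever; fit() relies on
    # steps_per_epoch to delimit each epoch.
    while True:
        x = np.random.random((5, 10))
        y = np.random.randint(0, 2, size=(5,))
        yield x, y

model = keras.Sequential(
    [keras.layers.Dense(2, input_shape=(10,), activation="softmax")]
)
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd")
model.fit(batch_generator(), steps_per_epoch=10, epochs=2)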
+ self.model.fit( + self.iterator_input, + workers=1, + use_multiprocessing=True, + max_queue_size=10, + steps_per_epoch=10, + ) + + def test_size(self): + adapter = self.adapter_cls(self.generator_input) + self.assertIsNone(adapter.get_size()) + + def test_batch_size(self): + adapter = self.adapter_cls(self.generator_input) + self.assertEqual(adapter.batch_size(), None) + self.assertEqual(adapter.representative_batch_size(), 5) + + def test_partial_batch(self): + adapter = self.adapter_cls(self.generator_input) + self.assertFalse(adapter.has_partial_batch()) + self.assertIsNone(adapter.partial_batch_size()) + + def test_invalid_targets_argument(self): + with self.assertRaisesRegex( + ValueError, r"`y` argument is not supported" + ): + self.adapter_cls(self.generator_input, y=self.generator_input) + + def test_invalid_sample_weights_argument(self): + with self.assertRaisesRegex( + ValueError, r"`sample_weight` argument is not supported" + ): + self.adapter_cls( + self.generator_input, sample_weights=self.generator_input + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_not_shuffled(self): + def generator(): + for i in range(10): + yield np.ones((1, 1)) * i + + adapter = self.adapter_cls(generator(), shuffle=True) + for i, data in enumerate(adapter.get_dataset()): + self.assertEqual(i, data[0].numpy().flatten()) + + def test_model_without_forward_pass(self): + class MyModel(keras.Model): + def train_step(self, data): + return {"loss": 0.0} + + def test_step(self, data): + return {"loss": 0.0} + + model = MyModel() + model.compile("rmsprop") + model.fit(self.generator_input, steps_per_epoch=5) + out = model.evaluate(self.generator_input, steps=5) + self.assertEqual(out, 0) class KerasSequenceAdapterTest(DataAdapterTestBase): - - def setUp(self): - super().setUp() - self.adapter_cls = data_adapter.KerasSequenceAdapter - - def test_can_handle(self): - self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) - self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) - self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) - self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) - self.assertTrue(self.adapter_cls.can_handle(self.sequence_input)) - self.assertFalse(self.adapter_cls.can_handle(self.text_input)) - self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_training(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.sequence_input) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @test_utils.run_v2_only - @data_utils.dont_use_multiprocessing_pool - def test_with_multiprocessing_training(self): - self.model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.model.fit(self.sequence_input, workers=1, use_multiprocessing=True, - max_queue_size=10, steps_per_epoch=10) - # Fit twice to ensure there isn't any duplication that prevent the worker - # from starting. 
- self.model.fit(self.sequence_input, workers=1, use_multiprocessing=True, - max_queue_size=10, steps_per_epoch=10) - - def test_size(self): - adapter = self.adapter_cls(self.sequence_input) - self.assertEqual(adapter.get_size(), 10) - - def test_batch_size(self): - adapter = self.adapter_cls(self.sequence_input) - self.assertEqual(adapter.batch_size(), None) - self.assertEqual(adapter.representative_batch_size(), 5) - - def test_partial_batch(self): - adapter = self.adapter_cls(self.sequence_input) - self.assertFalse(adapter.has_partial_batch()) - self.assertIsNone(adapter.partial_batch_size()) - - def test_invalid_targets_argument(self): - with self.assertRaisesRegex(ValueError, r'`y` argument is not supported'): - self.adapter_cls(self.sequence_input, y=self.sequence_input) - - def test_invalid_sample_weights_argument(self): - with self.assertRaisesRegex(ValueError, - r'`sample_weight` argument is not supported'): - self.adapter_cls(self.sequence_input, sample_weights=self.sequence_input) + def setUp(self): + super().setUp() + self.adapter_cls = data_adapter.KerasSequenceAdapter + + def test_can_handle(self): + self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) + self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) + self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) + self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) + self.assertTrue(self.adapter_cls.can_handle(self.sequence_input)) + self.assertFalse(self.adapter_cls.can_handle(self.text_input)) + self.assertFalse(self.adapter_cls.can_handle(self.bytes_input)) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_training(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit(self.sequence_input) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @test_utils.run_v2_only + @data_utils.dont_use_multiprocessing_pool + def test_with_multiprocessing_training(self): + self.model.compile( + loss="sparse_categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.model.fit( + self.sequence_input, + workers=1, + use_multiprocessing=True, + max_queue_size=10, + steps_per_epoch=10, + ) + # Fit twice to ensure there isn't any duplication that prevent the + # worker from starting. 
+ self.model.fit( + self.sequence_input, + workers=1, + use_multiprocessing=True, + max_queue_size=10, + steps_per_epoch=10, + ) + + def test_size(self): + adapter = self.adapter_cls(self.sequence_input) + self.assertEqual(adapter.get_size(), 10) + + def test_batch_size(self): + adapter = self.adapter_cls(self.sequence_input) + self.assertEqual(adapter.batch_size(), None) + self.assertEqual(adapter.representative_batch_size(), 5) + + def test_partial_batch(self): + adapter = self.adapter_cls(self.sequence_input) + self.assertFalse(adapter.has_partial_batch()) + self.assertIsNone(adapter.partial_batch_size()) + + def test_invalid_targets_argument(self): + with self.assertRaisesRegex( + ValueError, r"`y` argument is not supported" + ): + self.adapter_cls(self.sequence_input, y=self.sequence_input) + + def test_invalid_sample_weights_argument(self): + with self.assertRaisesRegex( + ValueError, r"`sample_weight` argument is not supported" + ): + self.adapter_cls( + self.sequence_input, sample_weights=self.sequence_input + ) class KerasSequenceAdapterSparseTest(KerasSequenceAdapterTest): - - def setUp(self): - super().setUp() - self.sequence_input = TestSparseSequence(self.batch_size, 10) + def setUp(self): + super().setUp() + self.sequence_input = TestSparseSequence(self.batch_size, 10) class KerasSequenceAdapterRaggedTest(KerasSequenceAdapterTest): + def setUp(self): + super().setUp() + self.sequence_input = TestRaggedSequence(self.batch_size, 10) - def setUp(self): - super().setUp() - self.sequence_input = TestRaggedSequence(self.batch_size, 10) - - self.model = keras.models.Sequential([ - keras.layers.Input(shape=(None,), ragged=True), - keras.layers.Embedding(10, 10), - keras.layers.Lambda(tf.reduce_mean, arguments=dict(axis=1)), - keras.layers.Dense(8, input_shape=(10,), activation='relu'), - ]) + self.model = keras.models.Sequential( + [ + keras.layers.Input(shape=(None,), ragged=True), + keras.layers.Embedding(10, 10), + keras.layers.Lambda(tf.reduce_mean, arguments=dict(axis=1)), + keras.layers.Dense(8, input_shape=(10,), activation="relu"), + ] + ) class DataHandlerTest(test_combinations.TestCase): - - def test_finite_dataset_with_steps_per_epoch(self): - data = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3]).batch(1) - # User can choose to only partially consume `Dataset`. - data_handler = data_adapter.DataHandler( - data, initial_epoch=0, epochs=2, steps_per_epoch=2) - self.assertEqual(data_handler.inferred_steps, 2) - self.assertFalse(data_handler._adapter.should_recreate_iterator()) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator).numpy()) - returned_data.append(epoch_data) - self.assertEqual(returned_data, [[0, 1], [2, 3]]) - - def test_finite_dataset_without_steps_per_epoch(self): - data = tf.data.Dataset.from_tensor_slices([0, 1, 2]).batch(1) - data_handler = data_adapter.DataHandler(data, initial_epoch=0, epochs=2) - self.assertEqual(data_handler.inferred_steps, 3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator).numpy()) - returned_data.append(epoch_data) - self.assertEqual(returned_data, [[0, 1, 2], [0, 1, 2]]) - - def test_finite_dataset_with_steps_per_epoch_exact_size(self): - data = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3]).batch(1) - # If user specifies exact size of `Dataset` as `steps_per_epoch`, - # create a new iterator each epoch. 
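Recreating the iterator matters because a fully consumed tf.data iterator can only keep raising StopIteration. The enumerate_epochs/steps idiom these DataHandler tests drive is essentially the loop Model.fit runs internally; a condensed sketch (data_adapter is a private Keras module, so the import path is an assumption):

import tensorflow as tf
from keras.engine import data_adapter

data = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3]).batch(1)
handler = data_adapter.DataHandler(
    data, initial_epoch=0, epochs=2, steps_per_epoch=2
)
for epoch, iterator in handler.enumerate_epochs():
    with handler.catch_stop_iteration():
        for step in handler.steps():
            batch = next(iterator)  # one training step's worth of data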
- data_handler = data_adapter.DataHandler( - data, initial_epoch=0, epochs=2, steps_per_epoch=4) - self.assertTrue(data_handler._adapter.should_recreate_iterator()) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator).numpy()) - returned_data.append(epoch_data) - self.assertEqual(returned_data, [[0, 1, 2, 3], [0, 1, 2, 3]]) - - def test_infinite_dataset_with_steps_per_epoch(self): - data = tf.data.Dataset.from_tensor_slices([0, 1, 2]).batch(1).repeat() - data_handler = data_adapter.DataHandler( - data, initial_epoch=0, epochs=2, steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator).numpy()) - returned_data.append(epoch_data) - self.assertEqual(returned_data, [[0, 1, 2], [0, 1, 2]]) - - def test_unknown_cardinality_dataset_with_steps_per_epoch(self): - ds = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3, 4, 5, 6]) - filtered_ds = ds.filter(lambda x: x < 4) - self.assertEqual( - tf.data.experimental.cardinality(filtered_ds).numpy(), tf.data.experimental.UNKNOWN_CARDINALITY) - - # User can choose to only partially consume `Dataset`. - data_handler = data_adapter.DataHandler( - filtered_ds, initial_epoch=0, epochs=2, steps_per_epoch=2) - self.assertFalse(data_handler._adapter.should_recreate_iterator()) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, [[0, 1], [2, 3]]) - self.assertEqual(data_handler.inferred_steps, 2) - - def test_unknown_cardinality_dataset_without_steps_per_epoch(self): - ds = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3, 4, 5, 6]) - filtered_ds = ds.filter(lambda x: x < 4) - self.assertEqual( - tf.data.experimental.cardinality(filtered_ds).numpy(), tf.data.experimental.UNKNOWN_CARDINALITY) - - data_handler = data_adapter.DataHandler( - filtered_ds, initial_epoch=0, epochs=2) - self.assertEqual(data_handler.inferred_steps, None) - self.assertTrue(data_handler._adapter.should_recreate_iterator()) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - with data_handler.catch_stop_iteration(): - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, [[0, 1, 2, 3], [0, 1, 2, 3]]) - self.assertEqual(data_handler.inferred_steps, 4) - - def test_insufficient_data(self): - ds = tf.data.Dataset.from_tensor_slices([0, 1]) - ds = ds.filter(lambda *args, **kwargs: True) - data_handler = data_adapter.DataHandler( - ds, initial_epoch=0, epochs=2, steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - with data_handler.catch_stop_iteration(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertTrue(data_handler._insufficient_data) - self.assertEqual(returned_data, [[0, 1]]) - - def test_numpy(self): - x = np.array([0, 1, 2]) - y = np.array([0, 2, 4]) - sw = np.array([0, 4, 8]) - data_handler = data_adapter.DataHandler( - x=x, y=y, sample_weight=sw, batch_size=1, epochs=2) - returned_data = [] - for 
_, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, - [[(0, 0, 0), (1, 2, 4), - (2, 4, 8)], [(0, 0, 0), (1, 2, 4), (2, 4, 8)]]) - - def test_generator(self): - - def generator(): - for _ in range(2): - for step in range(3): - yield (tf.convert_to_tensor([step]),) - - data_handler = data_adapter.DataHandler( - generator(), epochs=2, steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, [[([0],), ([1],), - ([2],)], [([0],), ([1],), ([2],)]]) - - def test_composite_tensor(self): - st = tf.SparseTensor( - indices=[[0, 0], [1, 0], [2, 0]], values=[0, 1, 2], dense_shape=[3, 1]) - data_handler = data_adapter.DataHandler(st, epochs=2, steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate( - tf.nest.map_structure(tf.sparse.to_dense, returned_data)) - self.assertEqual(returned_data, [[([0],), ([1],), - ([2],)], [([0],), ([1],), ([2],)]]) - - def test_iterator(self): - def generator(): - for _ in range(2): - for step in range(3): - yield (tf.convert_to_tensor([step]),) - - it = iter(tf.data.Dataset.from_generator( - generator, output_types=('float32',))) - data_handler = data_adapter.DataHandler(it, epochs=2, steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, [[([0],), ([1],), ([2],)], - [([0],), ([1],), ([2],)]]) - - def test_list_of_scalars(self): - data_handler = data_adapter.DataHandler([[0], [1], [2]], - epochs=2, - steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, [[([0],), ([1],), - ([2],)], [([0],), ([1],), ([2],)]]) - - def test_class_weight_user_errors(self): - with self.assertRaisesRegex(ValueError, 'to be a dict with keys'): - data_adapter.DataHandler( - x=[[0], [1], [2]], - y=[[2], [1], [0]], - batch_size=1, - sample_weight=[[1.], [2.], [4.]], - class_weight={ - 0: 0.5, - 1: 1., - 3: 1.5 # Skips class `2`. 
- }) - - with self.assertRaisesRegex(ValueError, 'with a single output'): - data_adapter.DataHandler( - x=np.ones((10, 1)), - y=[np.ones((10, 1)), np.zeros((10, 1))], - batch_size=2, - class_weight={ - 0: 0.5, - 1: 1., - 2: 1.5 - }) - - @parameterized.named_parameters(('numpy', True), ('dataset', False)) - def test_single_x_input_no_tuple_wrapping(self, use_numpy): - x = np.ones((10, 1)) - - if use_numpy: - batch_size = 2 - else: - x = tf.data.Dataset.from_tensor_slices(x).batch(2) - batch_size = None - - data_handler = data_adapter.DataHandler(x, batch_size=batch_size) - for _, iterator in data_handler.enumerate_epochs(): - for _ in data_handler.steps(): - # Check that single x input is not wrapped in a tuple. - self.assertIsInstance(next(iterator), tf.Tensor) + def test_finite_dataset_with_steps_per_epoch(self): + data = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3]).batch(1) + # User can choose to only partially consume `Dataset`. + data_handler = data_adapter.DataHandler( + data, initial_epoch=0, epochs=2, steps_per_epoch=2 + ) + self.assertEqual(data_handler.inferred_steps, 2) + self.assertFalse(data_handler._adapter.should_recreate_iterator()) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator).numpy()) + returned_data.append(epoch_data) + self.assertEqual(returned_data, [[0, 1], [2, 3]]) + + def test_finite_dataset_without_steps_per_epoch(self): + data = tf.data.Dataset.from_tensor_slices([0, 1, 2]).batch(1) + data_handler = data_adapter.DataHandler(data, initial_epoch=0, epochs=2) + self.assertEqual(data_handler.inferred_steps, 3) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator).numpy()) + returned_data.append(epoch_data) + self.assertEqual(returned_data, [[0, 1, 2], [0, 1, 2]]) + + def test_finite_dataset_with_steps_per_epoch_exact_size(self): + data = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3]).batch(1) + # If user specifies exact size of `Dataset` as `steps_per_epoch`, + # create a new iterator each epoch. + data_handler = data_adapter.DataHandler( + data, initial_epoch=0, epochs=2, steps_per_epoch=4 + ) + self.assertTrue(data_handler._adapter.should_recreate_iterator()) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator).numpy()) + returned_data.append(epoch_data) + self.assertEqual(returned_data, [[0, 1, 2, 3], [0, 1, 2, 3]]) + + def test_infinite_dataset_with_steps_per_epoch(self): + data = tf.data.Dataset.from_tensor_slices([0, 1, 2]).batch(1).repeat() + data_handler = data_adapter.DataHandler( + data, initial_epoch=0, epochs=2, steps_per_epoch=3 + ) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator).numpy()) + returned_data.append(epoch_data) + self.assertEqual(returned_data, [[0, 1, 2], [0, 1, 2]]) + + def test_unknown_cardinality_dataset_with_steps_per_epoch(self): + ds = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3, 4, 5, 6]) + filtered_ds = ds.filter(lambda x: x < 4) + self.assertEqual( + tf.data.experimental.cardinality(filtered_ds).numpy(), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + # User can choose to only partially consume `Dataset`. 
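The partial-consumption behavior asserted below follows from not recreating the iterator between epochs; the same effect with a plain Python iterator, as a minimal analogy:

# Taking two items per "epoch" without recreating the iterator makes
# each epoch resume where the previous one stopped.
it = iter([0, 1, 2, 3])
epoch_1 = [next(it) for _ in range(2)]  # [0, 1]
epoch_2 = [next(it) for _ in range(2)]  # [2, 3]
assert epoch_1 == [0, 1] and epoch_2 == [2, 3]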
+ data_handler = data_adapter.DataHandler( + filtered_ds, initial_epoch=0, epochs=2, steps_per_epoch=2 + ) + self.assertFalse(data_handler._adapter.should_recreate_iterator()) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + self.assertEqual(returned_data, [[0, 1], [2, 3]]) + self.assertEqual(data_handler.inferred_steps, 2) + + def test_unknown_cardinality_dataset_without_steps_per_epoch(self): + ds = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3, 4, 5, 6]) + filtered_ds = ds.filter(lambda x: x < 4) + self.assertEqual( + tf.data.experimental.cardinality(filtered_ds).numpy(), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + data_handler = data_adapter.DataHandler( + filtered_ds, initial_epoch=0, epochs=2 + ) + self.assertEqual(data_handler.inferred_steps, None) + self.assertTrue(data_handler._adapter.should_recreate_iterator()) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + with data_handler.catch_stop_iteration(): + for _ in data_handler.steps(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + self.assertEqual(returned_data, [[0, 1, 2, 3], [0, 1, 2, 3]]) + self.assertEqual(data_handler.inferred_steps, 4) + + def test_insufficient_data(self): + ds = tf.data.Dataset.from_tensor_slices([0, 1]) + ds = ds.filter(lambda *args, **kwargs: True) + data_handler = data_adapter.DataHandler( + ds, initial_epoch=0, epochs=2, steps_per_epoch=3 + ) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + with data_handler.catch_stop_iteration(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + self.assertTrue(data_handler._insufficient_data) + self.assertEqual(returned_data, [[0, 1]]) + + def test_numpy(self): + x = np.array([0, 1, 2]) + y = np.array([0, 2, 4]) + sw = np.array([0, 4, 8]) + data_handler = data_adapter.DataHandler( + x=x, y=y, sample_weight=sw, batch_size=1, epochs=2 + ) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + self.assertEqual( + returned_data, + [ + [(0, 0, 0), (1, 2, 4), (2, 4, 8)], + [(0, 0, 0), (1, 2, 4), (2, 4, 8)], + ], + ) + + def test_generator(self): + def generator(): + for _ in range(2): + for step in range(3): + yield (tf.convert_to_tensor([step]),) + + data_handler = data_adapter.DataHandler( + generator(), epochs=2, steps_per_epoch=3 + ) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + self.assertEqual( + returned_data, [[([0],), ([1],), ([2],)], [([0],), ([1],), ([2],)]] + ) + + def test_composite_tensor(self): + st = tf.SparseTensor( + indices=[[0, 0], [1, 0], [2, 0]], + values=[0, 1, 2], + dense_shape=[3, 1], + ) + data_handler = data_adapter.DataHandler(st, epochs=2, steps_per_epoch=3) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + 
epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate( + tf.nest.map_structure(tf.sparse.to_dense, returned_data) + ) + self.assertEqual( + returned_data, [[([0],), ([1],), ([2],)], [([0],), ([1],), ([2],)]] + ) + + def test_iterator(self): + def generator(): + for _ in range(2): + for step in range(3): + yield (tf.convert_to_tensor([step]),) + + it = iter( + tf.data.Dataset.from_generator(generator, output_types=("float32",)) + ) + data_handler = data_adapter.DataHandler(it, epochs=2, steps_per_epoch=3) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + self.assertEqual( + returned_data, [[([0],), ([1],), ([2],)], [([0],), ([1],), ([2],)]] + ) + + def test_list_of_scalars(self): + data_handler = data_adapter.DataHandler( + [[0], [1], [2]], epochs=2, steps_per_epoch=3 + ) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + self.assertEqual( + returned_data, [[([0],), ([1],), ([2],)], [([0],), ([1],), ([2],)]] + ) + + def test_class_weight_user_errors(self): + with self.assertRaisesRegex(ValueError, "to be a dict with keys"): + data_adapter.DataHandler( + x=[[0], [1], [2]], + y=[[2], [1], [0]], + batch_size=1, + sample_weight=[[1.0], [2.0], [4.0]], + class_weight={0: 0.5, 1: 1.0, 3: 1.5}, # Skips class `2`. + ) + + with self.assertRaisesRegex(ValueError, "with a single output"): + data_adapter.DataHandler( + x=np.ones((10, 1)), + y=[np.ones((10, 1)), np.zeros((10, 1))], + batch_size=2, + class_weight={0: 0.5, 1: 1.0, 2: 1.5}, + ) + + @parameterized.named_parameters(("one_hot", True), ("sparse", False)) + def test_class_weights_applied(self, one_hot): + num_channels = 3 + num_classes = 5 + batch_size = 2 + image_width = 8 + + input_shape = (batch_size, image_width, image_width, num_channels) + output_shape = (batch_size, image_width, image_width) + + x = tf.random.uniform(input_shape) + sparse_y = tf.random.uniform( + output_shape, maxval=num_classes, dtype=tf.int32 + ) + + if one_hot: + y = tf.one_hot(sparse_y, num_classes) + else: + y = tf.expand_dims(sparse_y, axis=-1) + + # Class weight is equal to class number + 1 + class_weight = dict([(x, x + 1) for x in range(num_classes)]) + + sample_weight = np.array([1, 2]) + + data_handler = data_adapter.DataHandler( + x=x, + y=y, + class_weight=class_weight, + sample_weight=sample_weight, + batch_size=batch_size, + epochs=1, + ) + returned_data = [] + for _, iterator in data_handler.enumerate_epochs(): + epoch_data = [] + for _ in data_handler.steps(): + epoch_data.append(next(iterator)) + returned_data.append(epoch_data) + returned_data = self.evaluate(returned_data) + + # We had only 1 batch and 1 epoch, so we extract x, y, sample_weight + result_x, result_y, result_sample_weight = returned_data[0][0] + self.assertAllEqual(x, result_x) + self.assertAllEqual(y, result_y) + + # Because class weight = class + 1, resulting class weight = y + 1 + # Sample weight is 1 for the first sample, 2 for the second, + # so we double the expected sample weight for the second sample. 
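The same arithmetic in plain numpy, as a small worked example with hypothetical labels (independent of the test's random data):

import numpy as np

class_weight = {c: c + 1 for c in range(5)}
labels = np.array([[0, 2, 4], [1, 3, 0]])  # two samples
per_element = np.vectorize(class_weight.get)(labels)  # equals labels + 1
sample_weight = np.array([1, 2])
combined = per_element * sample_weight[:, None]
# The first sample keeps the bare class weights; the second sample's
# weights are doubled by its sample weight of 2.
assert (combined[0] == labels[0] + 1).all()
assert (combined[1] == 2 * (labels[1] + 1)).all()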
+ self.assertAllEqual(sparse_y[0] + 1, result_sample_weight[0]) + self.assertAllEqual(2 * (sparse_y[1] + 1), result_sample_weight[1]) + + @parameterized.named_parameters(("numpy", True), ("dataset", False)) + def test_single_x_input_no_tuple_wrapping(self, use_numpy): + x = np.ones((10, 1)) + + if use_numpy: + batch_size = 2 + else: + x = tf.data.Dataset.from_tensor_slices(x).batch(2) + batch_size = None + + data_handler = data_adapter.DataHandler(x, batch_size=batch_size) + for _, iterator in data_handler.enumerate_epochs(): + for _ in data_handler.steps(): + # Check that single x input is not wrapped in a tuple. + self.assertIsInstance(next(iterator), tf.Tensor) + + def test_error_if_zero_steps_per_epoch(self): + data = tf.data.Dataset.from_tensor_slices([0, 1, 2, 3]).batch(1) + + with self.assertRaisesRegex( + ValueError, + "Unexpected value for `steps_per_epoch`. Received value is 0.", + ): + data_adapter.DataHandler( + data, initial_epoch=0, epochs=2, steps_per_epoch=0 + ) + + def test_error_if_empty_array_input_data(self): + x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) + y = np.array([0, 1, 1, 0]) + idx = [] + + with self.assertRaisesWithLiteralMatch( + ValueError, + "Expected input data to be non-empty.", + ): + data_adapter.DataHandler(x[idx], y[idx]) + + def test_error_if_empty_dataset_input_data(self): + data = tf.data.Dataset.from_tensor_slices([]).batch(1) + + with self.assertRaisesWithLiteralMatch( + ValueError, + "Expected input data to be non-empty.", + ): + data_adapter.DataHandler(data) class TestValidationSplit(test_combinations.TestCase): - - @parameterized.named_parameters(('numpy_arrays', True), ('tensors', False)) - def test_validation_split_unshuffled(self, use_numpy): - if use_numpy: - x = np.array([0, 1, 2, 3, 4]) - y = np.array([0, 2, 4, 6, 8]) - sw = np.array([0, 4, 8, 12, 16]) - else: - x = tf.convert_to_tensor([0, 1, 2, 3, 4]) - y = tf.convert_to_tensor([0, 2, 4, 6, 8]) - sw = tf.convert_to_tensor([0, 4, 8, 12, 16]) - - (train_x, train_y, train_sw), (val_x, val_y, val_sw) = ( - data_adapter.train_validation_split((x, y, sw), validation_split=0.2)) - - if use_numpy: - train_x = tf.convert_to_tensor(train_x) - train_y = tf.convert_to_tensor(train_y) - train_sw = tf.convert_to_tensor(train_sw) - val_x = tf.convert_to_tensor(val_x) - val_y = tf.convert_to_tensor(val_y) - val_sw = tf.convert_to_tensor(val_sw) - - self.assertEqual(train_x.numpy().tolist(), [0, 1, 2, 3]) - self.assertEqual(train_y.numpy().tolist(), [0, 2, 4, 6]) - self.assertEqual(train_sw.numpy().tolist(), [0, 4, 8, 12]) - - self.assertEqual(val_x.numpy().tolist(), [4]) - self.assertEqual(val_y.numpy().tolist(), [8]) - self.assertEqual(val_sw.numpy().tolist(), [16]) - - def test_validation_split_user_error(self): - with self.assertRaisesRegex(ValueError, 'is only supported for Tensors'): - data_adapter.train_validation_split( - lambda: np.ones((10, 1)), validation_split=0.2) - - def test_validation_split_examples_too_few(self): - with self.assertRaisesRegex(ValueError, 'not sufficient to split it'): - data_adapter.train_validation_split( - np.ones((1, 10)), validation_split=0.2) - - def test_validation_split_none(self): - train_sw, val_sw = data_adapter.train_validation_split( - None, validation_split=0.2) - self.assertIsNone(train_sw) - self.assertIsNone(val_sw) - - (_, train_sw), (_, val_sw) = data_adapter.train_validation_split( - (np.ones((10, 1)), None), validation_split=0.2) - self.assertIsNone(train_sw) - self.assertIsNone(val_sw) + @parameterized.named_parameters(("numpy_arrays", True), 
("tensors", False)) + def test_validation_split_unshuffled(self, use_numpy): + if use_numpy: + x = np.array([0, 1, 2, 3, 4]) + y = np.array([0, 2, 4, 6, 8]) + sw = np.array([0, 4, 8, 12, 16]) + else: + x = tf.convert_to_tensor([0, 1, 2, 3, 4]) + y = tf.convert_to_tensor([0, 2, 4, 6, 8]) + sw = tf.convert_to_tensor([0, 4, 8, 12, 16]) + + (train_x, train_y, train_sw), ( + val_x, + val_y, + val_sw, + ) = data_adapter.train_validation_split( + (x, y, sw), validation_split=0.2 + ) + + if use_numpy: + train_x = tf.convert_to_tensor(train_x) + train_y = tf.convert_to_tensor(train_y) + train_sw = tf.convert_to_tensor(train_sw) + val_x = tf.convert_to_tensor(val_x) + val_y = tf.convert_to_tensor(val_y) + val_sw = tf.convert_to_tensor(val_sw) + + self.assertEqual(train_x.numpy().tolist(), [0, 1, 2, 3]) + self.assertEqual(train_y.numpy().tolist(), [0, 2, 4, 6]) + self.assertEqual(train_sw.numpy().tolist(), [0, 4, 8, 12]) + + self.assertEqual(val_x.numpy().tolist(), [4]) + self.assertEqual(val_y.numpy().tolist(), [8]) + self.assertEqual(val_sw.numpy().tolist(), [16]) + + def test_validation_split_user_error(self): + with self.assertRaisesRegex( + ValueError, "is only supported for Tensors" + ): + data_adapter.train_validation_split( + lambda: np.ones((10, 1)), validation_split=0.2 + ) + + def test_validation_split_examples_too_few(self): + with self.assertRaisesRegex(ValueError, "not sufficient to split it"): + data_adapter.train_validation_split( + np.ones((1, 10)), validation_split=0.2 + ) + + def test_validation_split_none(self): + train_sw, val_sw = data_adapter.train_validation_split( + None, validation_split=0.2 + ) + self.assertIsNone(train_sw) + self.assertIsNone(val_sw) + + (_, train_sw), (_, val_sw) = data_adapter.train_validation_split( + (np.ones((10, 1)), None), validation_split=0.2 + ) + self.assertIsNone(train_sw) + self.assertIsNone(val_sw) class ListsOfScalarsDataAdapterTest(DataAdapterTestBase): + def setUp(self): + super().setUp() + self.adapter_cls = data_adapter.ListsOfScalarsDataAdapter - def setUp(self): - super().setUp() - self.adapter_cls = data_adapter.ListsOfScalarsDataAdapter - - def test_can_list_inputs(self): - self.assertTrue(self.adapter_cls.can_handle(self.text_input)) - self.assertTrue(self.adapter_cls.can_handle(self.bytes_input)) + def test_can_list_inputs(self): + self.assertTrue(self.adapter_cls.can_handle(self.text_input)) + self.assertTrue(self.adapter_cls.can_handle(self.bytes_input)) - self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) - self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) - self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) - self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) - self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - self.assertFalse(self.adapter_cls.can_handle([])) + self.assertFalse(self.adapter_cls.can_handle(self.numpy_input)) + self.assertFalse(self.adapter_cls.can_handle(self.tensor_input)) + self.assertFalse(self.adapter_cls.can_handle(self.dataset_input)) + self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) + self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) + self.assertFalse(self.adapter_cls.can_handle([])) class TestDataAdapterUtils(DataAdapterTestBase): - - def test_unpack_x_y_sample_weight_with_tuple_and_list(self): - tuple_version = data_adapter.unpack_x_y_sample_weight( - (self.tensor_input, self.tensor_target)) - list_version = data_adapter.unpack_x_y_sample_weight( - [self.tensor_input, self.tensor_target]) 
- self.assertEqual(tuple_version, list_version) - - -if __name__ == '__main__': - tf.compat.v1.enable_eager_execution() - tf.test.main() + def test_unpack_x_y_sample_weight_with_tuple_and_list(self): + tuple_version = data_adapter.unpack_x_y_sample_weight( + (self.tensor_input, self.tensor_target) + ) + list_version = data_adapter.unpack_x_y_sample_weight( + [self.tensor_input, self.tensor_target] + ) + self.assertEqual(tuple_version, list_version) + + def test_unpack_pack_dict(self): + # A dictionary can be unambiguously represented without a tuple. + x = {"key": self.tensor_input} + packed_x = data_adapter.pack_x_y_sample_weight(x) + self.assertEqual(packed_x, x) + unpacked_x, _, _ = data_adapter.unpack_x_y_sample_weight(x) + self.assertEqual(unpacked_x, x) + + +if __name__ == "__main__": + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/engine/deferred_sequential_test.py b/keras/engine/deferred_sequential_test.py index f2133adcae7f..8d72abbef0d6 100644 --- a/keras/engine/deferred_sequential_test.py +++ b/keras/engine/deferred_sequential_test.py @@ -14,204 +14,226 @@ # ============================================================================== """Tests specific to deferred-build `Sequential` models.""" -import tensorflow.compat.v2 as tf - import os import unittest + import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils try: - import h5py # pylint:disable=g-import-not-at-top + import h5py except ImportError: - h5py = None + h5py = None @test_utils.run_v2_only class TestDeferredSequential(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_build_behavior(self): - # Test graph network creation after __call__ - model = get_model() - model(np.random.random((2, 6))) - self.assertLen(model.weights, 4) - self.assertTrue(model._is_graph_network) - self.assertLen(model.inputs, 1) - self.assertLen(model.outputs, 1) - self.assertEqual(model.inputs[0].shape.as_list(), [2, 6]) - self.assertEqual(model.outputs[0].shape.as_list(), [2, 2]) - - # Test effect of new __call__ with a different shape - model(np.random.random((3, 6))) - self.assertLen(model.inputs, 1) - self.assertLen(model.outputs, 1) - self.assertEqual(model.inputs[0].shape.as_list(), [None, 6]) - self.assertEqual(model.outputs[0].shape.as_list(), [None, 2]) - model(np.random.random((4, 6))) - self.assertLen(model.inputs, 1) - self.assertLen(model.outputs, 1) - self.assertEqual(model.inputs[0].shape.as_list(), [None, 6]) - self.assertEqual(model.outputs[0].shape.as_list(), [None, 2]) - - # Test graph network creation after build - model = get_model() - model.build((None, 6)) - self.assertLen(model.weights, 4) - self.assertTrue(model._is_graph_network) - self.assertLen(model.inputs, 1) - self.assertLen(model.outputs, 1) - self.assertEqual(model.inputs[0].shape.as_list(), [None, 6]) - self.assertEqual(model.outputs[0].shape.as_list(), [None, 2]) - - # Test graph network creation after compile/fit - model = get_model() - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=[keras.metrics.CategoricalAccuracy()], - run_eagerly=test_utils.should_run_eagerly()) - model.fit(np.zeros((2, 6)), np.zeros((2, 2))) - self.assertLen(model.weights, 4) - self.assertTrue(model._is_graph_network) - self.assertLen(model.inputs, 1) - self.assertLen(model.outputs, 1) - # Inconsistency here: with eager `fit`, the model is built with shape - # (2, 6), but with graph 
function `fit`, it is built with shape `(None, 6)`. - # This is likely due to our assumption "the batch size should be dynamic" - # at the level of `Model`. TODO(fchollet): investigate and resolve. - self.assertEqual(model.inputs[0].shape.as_list()[-1], 6) - self.assertEqual(model.outputs[0].shape.as_list()[-1], 2) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_add_and_pop(self): - model = get_model() - model.build((None, 6)) - self.assertTrue(model.built) - self.assertTrue(model._is_graph_network) - self.assertLen(model.layers, 3) - self.assertLen(model.weights, 4) - model.pop() - self.assertTrue(model.built) - self.assertTrue(model._is_graph_network) - self.assertLen(model.layers, 2) - self.assertLen(model.weights, 2) - model.add(keras.layers.Dense(2)) - self.assertTrue(model.built) - self.assertTrue(model._is_graph_network) - self.assertLen(model.layers, 3) - self.assertLen(model.weights, 4) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_feature_extraction(self): - # This tests layer connectivity reset when rebuilding - model = get_model() - model(np.random.random((3, 6))) # First build - model(np.random.random((4, 6))) # Triggers a rebuild - # Classic feature extractor pattern - extractor = keras.Model(inputs=model.inputs, - outputs=[layer.output for layer in model.layers]) - # Check that inputs and outputs are connected - _ = extractor(np.random.random((4, 6))) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_saving_savedmodel(self): - model = get_model() - model(np.random.random((3, 6))) # Build model - - path = os.path.join(self.get_temp_dir(), 'model_path') - model.save(path) - new_model = keras.models.load_model(path) - model_layers = model._flatten_layers(include_self=True, recursive=False) - new_model_layers = new_model._flatten_layers( - include_self=True, recursive=False) - for layer1, layer2 in zip(model_layers, new_model_layers): - self.assertEqual(layer1.name, layer2.name) - for w1, w2 in zip(layer1.weights, layer2.weights): - self.assertAllClose(w1, w2) - - @unittest.skipIf(h5py is None, 'Test requires h5py') - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_saving_h5(self): - path = os.path.join(self.get_temp_dir(), 'model_path.h5') - model = get_model() - model(np.random.random((3, 6))) # Build model - - path = os.path.join(self.get_temp_dir(), 'model_path.h5') - model.save(path) - new_model = keras.models.load_model(path) - model_layers = model._flatten_layers(include_self=True, recursive=False) - new_model_layers = new_model._flatten_layers( - include_self=True, recursive=False) - for layer1, layer2 in zip(model_layers, new_model_layers): - self.assertEqual(layer1.name, layer2.name) - for w1, w2 in zip(layer1.weights, layer2.weights): - self.assertAllClose(w1, w2) - - @test_combinations.run_all_keras_modes - def test_shared_layer(self): - # This tests that preexisting layer connectivity is preserved - # when auto-building graph networks - shared_layer = keras.layers.Dense(2) - m1 = keras.Sequential([shared_layer]) - m1(np.random.random((3, 6))) - m2 = keras.Sequential([shared_layer]) - m2(np.random.random((3, 6))) - # Nesting case - shared_layer = keras.layers.Dense(2) - m1 = keras.Sequential([shared_layer]) - m2 = keras.Sequential([shared_layer, m1]) - m2(np.random.random((3, 2))) - - @test_combinations.run_all_keras_modes - def test_loss_layer(self): - class LossLayer(keras.layers.Layer): - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - return 
inputs - - # Test loss layer alone - model = keras.Sequential([LossLayer()]) - model.compile('rmsprop', run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(np.ones((2, 2))) - self.assertAllClose(loss, 4.) - model(np.random.random((4, 2))) # Triggers a rebuild - loss = model.train_on_batch(np.ones((1, 2))) - self.assertAllClose(loss, 2.) - - # Test loss layer combined with another layer - model = keras.Sequential([ - keras.layers.Dense(1, kernel_initializer='ones'), - LossLayer()]) - model.compile('rmsprop', run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(np.ones((2, 2))) - self.assertAllClose(loss, 4.) - model(np.random.random((4, 2))) # Triggers a rebuild - loss = model.train_on_batch(np.ones((1, 2))) - self.assertLess(loss, 2.) - - # Test loss layer combined with external loss - model = keras.Sequential([ - keras.layers.Dense(1, kernel_initializer='ones'), - LossLayer()]) - model.compile('rmsprop', 'mse', - run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(np.ones((2, 2)), np.ones((2, 2))) - model(np.random.random((4, 2))) # Triggers a rebuild - loss = model.train_on_batch(np.ones((1, 2)), np.ones((1, 2))) + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_build_behavior(self): + # Test graph network creation after __call__ + model = get_model() + model(np.random.random((2, 6))) + self.assertLen(model.weights, 4) + self.assertTrue(model._is_graph_network) + self.assertLen(model.inputs, 1) + self.assertLen(model.outputs, 1) + self.assertEqual(model.inputs[0].shape.as_list(), [2, 6]) + self.assertEqual(model.outputs[0].shape.as_list(), [2, 2]) + + # Test effect of new __call__ with a different shape + model(np.random.random((3, 6))) + self.assertLen(model.inputs, 1) + self.assertLen(model.outputs, 1) + self.assertEqual(model.inputs[0].shape.as_list(), [None, 6]) + self.assertEqual(model.outputs[0].shape.as_list(), [None, 2]) + model(np.random.random((4, 6))) + self.assertLen(model.inputs, 1) + self.assertLen(model.outputs, 1) + self.assertEqual(model.inputs[0].shape.as_list(), [None, 6]) + self.assertEqual(model.outputs[0].shape.as_list(), [None, 2]) + + # Test graph network creation after build + model = get_model() + model.build((None, 6)) + self.assertLen(model.weights, 4) + self.assertTrue(model._is_graph_network) + self.assertLen(model.inputs, 1) + self.assertLen(model.outputs, 1) + self.assertEqual(model.inputs[0].shape.as_list(), [None, 6]) + self.assertEqual(model.outputs[0].shape.as_list(), [None, 2]) + + # Test graph network creation after compile/fit + model = get_model() + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=[keras.metrics.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(np.zeros((2, 6)), np.zeros((2, 2))) + self.assertLen(model.weights, 4) + self.assertTrue(model._is_graph_network) + self.assertLen(model.inputs, 1) + self.assertLen(model.outputs, 1) + # Inconsistency here: with eager `fit`, the model is built with shape + # (2, 6), but with graph function `fit`, it is built with shape `(None, + # 6)`. This is likely due to our assumption "the batch size should be + # dynamic" at the level of `Model`. TODO(fchollet): investigate and + # resolve. 
+ self.assertEqual(model.inputs[0].shape.as_list()[-1], 6) + self.assertEqual(model.outputs[0].shape.as_list()[-1], 2) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_add_and_pop(self): + model = get_model() + model.build((None, 6)) + self.assertTrue(model.built) + self.assertTrue(model._is_graph_network) + self.assertLen(model.layers, 3) + self.assertLen(model.weights, 4) + model.pop() + self.assertTrue(model.built) + self.assertTrue(model._is_graph_network) + self.assertLen(model.layers, 2) + self.assertLen(model.weights, 2) + model.add(keras.layers.Dense(2)) + self.assertTrue(model.built) + self.assertTrue(model._is_graph_network) + self.assertLen(model.layers, 3) + self.assertLen(model.weights, 4) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_feature_extraction(self): + # This tests layer connectivity reset when rebuilding + model = get_model() + model(np.random.random((3, 6))) # First build + model(np.random.random((4, 6))) # Triggers a rebuild + # Classic feature extractor pattern + extractor = keras.Model( + inputs=model.inputs, + outputs=[layer.output for layer in model.layers], + ) + # Check that inputs and outputs are connected + _ = extractor(np.random.random((4, 6))) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_saving_keras_v3(self): + model = get_model() + model(np.random.random((3, 6))) # Build model + + path = os.path.join(self.get_temp_dir(), "model_path.keras") + model.save(path) + new_model = keras.models.load_model(path) + model_layers = model._flatten_layers(include_self=True, recursive=False) + new_model_layers = new_model._flatten_layers( + include_self=True, recursive=False + ) + for layer1, layer2 in zip(model_layers, new_model_layers): + self.assertEqual(layer1.name, layer2.name) + for w1, w2 in zip(layer1.weights, layer2.weights): + self.assertAllClose(w1, w2) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_saving_savedmodel(self): + model = get_model() + model(np.random.random((3, 6))) # Build model + + path = os.path.join(self.get_temp_dir(), "model_path") + model.save(path) + new_model = keras.models.load_model(path) + model_layers = model._flatten_layers(include_self=True, recursive=False) + new_model_layers = new_model._flatten_layers( + include_self=True, recursive=False + ) + for layer1, layer2 in zip(model_layers, new_model_layers): + self.assertEqual(layer1.name, layer2.name) + for w1, w2 in zip(layer1.weights, layer2.weights): + self.assertAllClose(w1, w2) + + @unittest.skipIf(h5py is None, "Test requires h5py") + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_saving_h5(self): + path = os.path.join(self.get_temp_dir(), "model_path.h5") + model = get_model() + model(np.random.random((3, 6))) # Build model + + path = os.path.join(self.get_temp_dir(), "model_path.h5") + model.save(path) + new_model = keras.models.load_model(path) + model_layers = model._flatten_layers(include_self=True, recursive=False) + new_model_layers = new_model._flatten_layers( + include_self=True, recursive=False + ) + for layer1, layer2 in zip(model_layers, new_model_layers): + self.assertEqual(layer1.name, layer2.name) + for w1, w2 in zip(layer1.weights, layer2.weights): + self.assertAllClose(w1, w2) + + @test_combinations.run_all_keras_modes + def test_shared_layer(self): + # This tests that preexisting layer connectivity is preserved + # when auto-building graph networks + shared_layer = keras.layers.Dense(2) + m1 = 
keras.Sequential([shared_layer]) + m1(np.random.random((3, 6))) + m2 = keras.Sequential([shared_layer]) + m2(np.random.random((3, 6))) + # Nesting case + shared_layer = keras.layers.Dense(2) + m1 = keras.Sequential([shared_layer]) + m2 = keras.Sequential([shared_layer, m1]) + m2(np.random.random((3, 2))) + + @test_combinations.run_all_keras_modes + def test_loss_layer(self): + class LossLayer(keras.layers.Layer): + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + return inputs + + # Test loss layer alone + model = keras.Sequential([LossLayer()]) + model.compile("rmsprop", run_eagerly=test_utils.should_run_eagerly()) + loss = model.train_on_batch(np.ones((2, 2))) + self.assertAllClose(loss, 4.0) + model(np.random.random((4, 2))) # Triggers a rebuild + loss = model.train_on_batch(np.ones((1, 2))) + self.assertAllClose(loss, 2.0) + + # Test loss layer combined with another layer + model = keras.Sequential( + [keras.layers.Dense(1, kernel_initializer="ones"), LossLayer()] + ) + model.compile("rmsprop", run_eagerly=test_utils.should_run_eagerly()) + loss = model.train_on_batch(np.ones((2, 2))) + self.assertAllClose(loss, 4.0) + model(np.random.random((4, 2))) # Triggers a rebuild + loss = model.train_on_batch(np.ones((1, 2))) + self.assertLess(loss, 2.0) + + # Test loss layer combined with external loss + model = keras.Sequential( + [keras.layers.Dense(1, kernel_initializer="ones"), LossLayer()] + ) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + loss = model.train_on_batch(np.ones((2, 2)), np.ones((2, 2))) + model(np.random.random((4, 2))) # Triggers a rebuild + loss = model.train_on_batch(np.ones((1, 2)), np.ones((1, 2))) def get_model(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, name='first_layer')) - model.add(keras.layers.Dropout(0.3, name='dp')) - model.add(keras.layers.Dense(2, name='last_layer')) - return model + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, name="first_layer")) + model.add(keras.layers.Dropout(0.3, name="dp")) + model.add(keras.layers.Dense(2, name="last_layer")) + return model -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/feature_columns_integration_test.py b/keras/engine/feature_columns_integration_test.py index e8e0d1dec186..427a8c70b696 100644 --- a/keras/engine/feature_columns_integration_test.py +++ b/keras/engine/feature_columns_integration_test.py @@ -14,286 +14,307 @@ # ============================================================================== """Tests specific to Feature Columns integration.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras -from keras.testing_infra import test_combinations from keras import metrics as metrics_module -from keras.testing_infra import test_utils from keras.feature_column import dense_features as df +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils from keras.utils import np_utils class TestDNNModel(keras.models.Model): + def __init__(self, feature_columns, units, name=None, **kwargs): + super().__init__(name=name, **kwargs) + self._input_layer = df.DenseFeatures( + feature_columns, name="input_layer" + ) + self._dense_layer = keras.layers.Dense(units, name="dense_layer") - def __init__(self, feature_columns, units, name=None, **kwargs): - super().__init__(name=name, **kwargs) - self._input_layer = df.DenseFeatures(feature_columns, name='input_layer') - 
self._dense_layer = keras.layers.Dense(units, name='dense_layer') - - def call(self, features): - net = self._input_layer(features) - net = self._dense_layer(net) - return net + def call(self, features): + net = self._input_layer(features) + net = self._dense_layer(net) + return net class FeatureColumnsIntegrationTest(test_combinations.TestCase): - """Most Sequential model API tests are covered in `training_test.py`. - - """ - - @test_combinations.run_all_keras_modes - def test_sequential_model(self): - columns = [tf.feature_column.numeric_column('a')] - model = keras.models.Sequential([ - df.DenseFeatures(columns), - keras.layers.Dense(64, activation='relu'), - keras.layers.Dense(20, activation='softmax') - ]) - model.compile( - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - - x = {'a': np.random.random((10, 1))} - y = np.random.randint(20, size=(10, 1)) - y = np_utils.to_categorical(y, num_classes=20) - model.fit(x, y, epochs=1, batch_size=5) - model.fit(x, y, epochs=1, batch_size=5) - model.evaluate(x, y, batch_size=5) - model.predict(x, batch_size=5) - - @test_combinations.run_all_keras_modes - def test_sequential_model_with_ds_input(self): - columns = [tf.feature_column.numeric_column('a')] - model = keras.models.Sequential([ - df.DenseFeatures(columns), - keras.layers.Dense(64, activation='relu'), - keras.layers.Dense(20, activation='softmax') - ]) - model.compile( - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - - y = np.random.randint(20, size=(100, 1)) - y = np_utils.to_categorical(y, num_classes=20) - x = {'a': np.random.random((100, 1))} - ds1 = tf.data.Dataset.from_tensor_slices(x) - ds2 = tf.data.Dataset.from_tensor_slices(y) - ds = tf.data.Dataset.zip((ds1, ds2)).batch(5) - model.fit(ds, steps_per_epoch=1) - model.fit(ds, steps_per_epoch=1) - model.evaluate(ds, steps=1) - model.predict(ds, steps=1) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_sequential_model_with_crossed_column(self): - feature_columns = [] - age_buckets = tf.feature_column.bucketized_column( - tf.feature_column.numeric_column('age'), - boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) - feature_columns.append(age_buckets) - - # indicator cols - thal = tf.feature_column.categorical_column_with_vocabulary_list( - 'thal', ['fixed', 'normal', 'reversible']) - - crossed_feature = tf.feature_column.crossed_column([age_buckets, thal], - hash_bucket_size=1000) - crossed_feature = tf.feature_column.indicator_column(crossed_feature) - feature_columns.append(crossed_feature) - - feature_layer = df.DenseFeatures(feature_columns) - - model = keras.models.Sequential([ - feature_layer, - keras.layers.Dense(128, activation='relu'), - keras.layers.Dense(128, activation='relu'), - keras.layers.Dense(1, activation='sigmoid') - ]) - - age_data = np.random.randint(10, 100, size=100) - thal_data = np.random.choice(['fixed', 'normal', 'reversible'], size=100) - inp_x = {'age': age_data, 'thal': thal_data} - inp_y = np.random.randint(0, 1, size=100) - ds = tf.data.Dataset.from_tensor_slices((inp_x, inp_y)).batch(5) - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy'],) - model.fit(ds, epochs=1) - model.fit(ds, epochs=1) - model.evaluate(ds) - model.predict(ds) - - @test_combinations.run_all_keras_modes - def test_subclassed_model_with_feature_columns(self): - col_a = tf.feature_column.numeric_column('a') 
- col_b = tf.feature_column.numeric_column('b') - - dnn_model = TestDNNModel([col_a, col_b], 20) - - dnn_model.compile( - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - - x = {'a': np.random.random((10, 1)), 'b': np.random.random((10, 1))} - y = np.random.randint(20, size=(10, 1)) - y = np_utils.to_categorical(y, num_classes=20) - dnn_model.fit(x=x, y=y, epochs=1, batch_size=5) - dnn_model.fit(x=x, y=y, epochs=1, batch_size=5) - dnn_model.evaluate(x=x, y=y, batch_size=5) - dnn_model.predict(x=x, batch_size=5) - - @test_combinations.run_all_keras_modes - def test_subclassed_model_with_feature_columns_with_ds_input(self): - col_a = tf.feature_column.numeric_column('a') - col_b = tf.feature_column.numeric_column('b') - - dnn_model = TestDNNModel([col_a, col_b], 20) - - dnn_model.compile( - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - - y = np.random.randint(20, size=(100, 1)) - y = np_utils.to_categorical(y, num_classes=20) - x = {'a': np.random.random((100, 1)), 'b': np.random.random((100, 1))} - ds1 = tf.data.Dataset.from_tensor_slices(x) - ds2 = tf.data.Dataset.from_tensor_slices(y) - ds = tf.data.Dataset.zip((ds1, ds2)).batch(5) - dnn_model.fit(ds, steps_per_epoch=1) - dnn_model.fit(ds, steps_per_epoch=1) - dnn_model.evaluate(ds, steps=1) - dnn_model.predict(ds, steps=1) - - # TODO(kaftan) seems to throw an error when enabled. - @test_combinations.run_all_keras_modes - def DISABLED_test_function_model_feature_layer_input(self): - col_a = tf.feature_column.numeric_column('a') - col_b = tf.feature_column.numeric_column('b') - - feature_layer = df.DenseFeatures([col_a, col_b], name='fc') - dense = keras.layers.Dense(4) - - # This seems problematic.... We probably need something for DenseFeatures - # the way Input is for InputLayer. - output = dense(feature_layer) - - model = keras.models.Model([feature_layer], [output]) - - optimizer = 'rmsprop' - loss = 'mse' - loss_weights = [1., 0.5] - model.compile( - optimizer, - loss, - metrics=[metrics_module.CategoricalAccuracy(), 'mae'], - loss_weights=loss_weights) - - data = ({'a': np.arange(10), 'b': np.arange(10)}, np.arange(10, 20)) - model.fit(*data, epochs=1) - - # TODO(kaftan) seems to throw an error when enabled. - @test_combinations.run_all_keras_modes - def DISABLED_test_function_model_multiple_feature_layer_inputs(self): - col_a = tf.feature_column.numeric_column('a') - col_b = tf.feature_column.numeric_column('b') - col_c = tf.feature_column.numeric_column('c') - - fc1 = df.DenseFeatures([col_a, col_b], name='fc1') - fc2 = df.DenseFeatures([col_b, col_c], name='fc2') - dense = keras.layers.Dense(4) - - # This seems problematic.... We probably need something for DenseFeatures - # the way Input is for InputLayer. 
- output = dense(fc1) + dense(fc2) - - model = keras.models.Model([fc1, fc2], [output]) - - optimizer = 'rmsprop' - loss = 'mse' - loss_weights = [1., 0.5] - model.compile( - optimizer, - loss, - metrics=[metrics_module.CategoricalAccuracy(), 'mae'], - loss_weights=loss_weights) - - data_list = ([{ - 'a': np.arange(10), - 'b': np.arange(10) - }, { - 'b': np.arange(10), - 'c': np.arange(10) - }], np.arange(10, 100)) - model.fit(*data_list, epochs=1) - - data_bloated_list = ([{ - 'a': np.arange(10), - 'b': np.arange(10), - 'c': np.arange(10) - }, { - 'a': np.arange(10), - 'b': np.arange(10), - 'c': np.arange(10) - }], np.arange(10, 100)) - model.fit(*data_bloated_list, epochs=1) - - data_dict = ({ - 'fc1': { - 'a': np.arange(10), - 'b': np.arange(10) - }, - 'fc2': { - 'b': np.arange(10), - 'c': np.arange(10) - } - }, np.arange(10, 100)) - model.fit(*data_dict, epochs=1) - - data_bloated_dict = ({ - 'fc1': { - 'a': np.arange(10), - 'b': np.arange(10), - 'c': np.arange(10) - }, - 'fc2': { - 'a': np.arange(10), - 'b': np.arange(10), - 'c': np.arange(10) + """Most Sequential model API tests are covered in `training_test.py`.""" + + @test_combinations.run_all_keras_modes + def test_sequential_model(self): + columns = [tf.feature_column.numeric_column("a")] + model = keras.models.Sequential( + [ + df.DenseFeatures(columns), + keras.layers.Dense(64, activation="relu"), + keras.layers.Dense(20, activation="softmax"), + ] + ) + model.compile( + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = {"a": np.random.random((10, 1))} + y = np.random.randint(20, size=(10, 1)) + y = np_utils.to_categorical(y, num_classes=20) + model.fit(x, y, epochs=1, batch_size=5) + model.fit(x, y, epochs=1, batch_size=5) + model.evaluate(x, y, batch_size=5) + model.predict(x, batch_size=5) + + @test_combinations.run_all_keras_modes + def test_sequential_model_with_ds_input(self): + columns = [tf.feature_column.numeric_column("a")] + model = keras.models.Sequential( + [ + df.DenseFeatures(columns), + keras.layers.Dense(64, activation="relu"), + keras.layers.Dense(20, activation="softmax"), + ] + ) + model.compile( + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + y = np.random.randint(20, size=(100, 1)) + y = np_utils.to_categorical(y, num_classes=20) + x = {"a": np.random.random((100, 1))} + ds1 = tf.data.Dataset.from_tensor_slices(x) + ds2 = tf.data.Dataset.from_tensor_slices(y) + ds = tf.data.Dataset.zip((ds1, ds2)).batch(5) + model.fit(ds, steps_per_epoch=1) + model.fit(ds, steps_per_epoch=1) + model.evaluate(ds, steps=1) + model.predict(ds, steps=1) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_sequential_model_with_crossed_column(self): + feature_columns = [] + age_buckets = tf.feature_column.bucketized_column( + tf.feature_column.numeric_column("age"), + boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65], + ) + feature_columns.append(age_buckets) + + # indicator cols + thal = tf.feature_column.categorical_column_with_vocabulary_list( + "thal", ["fixed", "normal", "reversible"] + ) + + crossed_feature = tf.feature_column.crossed_column( + [age_buckets, thal], hash_bucket_size=1000 + ) + crossed_feature = tf.feature_column.indicator_column(crossed_feature) + feature_columns.append(crossed_feature) + + feature_layer = df.DenseFeatures(feature_columns) + + model = keras.models.Sequential( + [ + feature_layer, + 
keras.layers.Dense(128, activation="relu"), + keras.layers.Dense(128, activation="relu"), + keras.layers.Dense(1, activation="sigmoid"), + ] + ) + + age_data = np.random.randint(10, 100, size=100) + thal_data = np.random.choice( + ["fixed", "normal", "reversible"], size=100 + ) + inp_x = {"age": age_data, "thal": thal_data} + inp_y = np.random.randint(0, 1, size=100) + ds = tf.data.Dataset.from_tensor_slices((inp_x, inp_y)).batch(5) + model.compile( + optimizer="adam", + loss="binary_crossentropy", + metrics=["accuracy"], + ) + model.fit(ds, epochs=1) + model.fit(ds, epochs=1) + model.evaluate(ds) + model.predict(ds) + + @test_combinations.run_all_keras_modes + def test_subclassed_model_with_feature_columns(self): + col_a = tf.feature_column.numeric_column("a") + col_b = tf.feature_column.numeric_column("b") + + dnn_model = TestDNNModel([col_a, col_b], 20) + + dnn_model.compile( + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = {"a": np.random.random((10, 1)), "b": np.random.random((10, 1))} + y = np.random.randint(20, size=(10, 1)) + y = np_utils.to_categorical(y, num_classes=20) + dnn_model.fit(x=x, y=y, epochs=1, batch_size=5) + dnn_model.fit(x=x, y=y, epochs=1, batch_size=5) + dnn_model.evaluate(x=x, y=y, batch_size=5) + dnn_model.predict(x=x, batch_size=5) + + @test_combinations.run_all_keras_modes + def test_subclassed_model_with_feature_columns_with_ds_input(self): + col_a = tf.feature_column.numeric_column("a") + col_b = tf.feature_column.numeric_column("b") + + dnn_model = TestDNNModel([col_a, col_b], 20) + + dnn_model.compile( + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + y = np.random.randint(20, size=(100, 1)) + y = np_utils.to_categorical(y, num_classes=20) + x = {"a": np.random.random((100, 1)), "b": np.random.random((100, 1))} + ds1 = tf.data.Dataset.from_tensor_slices(x) + ds2 = tf.data.Dataset.from_tensor_slices(y) + ds = tf.data.Dataset.zip((ds1, ds2)).batch(5) + dnn_model.fit(ds, steps_per_epoch=1) + dnn_model.fit(ds, steps_per_epoch=1) + dnn_model.evaluate(ds, steps=1) + dnn_model.predict(ds, steps=1) + + # TODO(kaftan) seems to throw an error when enabled. + @test_combinations.run_all_keras_modes + def DISABLED_test_function_model_feature_layer_input(self): + col_a = tf.feature_column.numeric_column("a") + col_b = tf.feature_column.numeric_column("b") + + feature_layer = df.DenseFeatures([col_a, col_b], name="fc") + dense = keras.layers.Dense(4) + + # This seems problematic.... We probably need something for + # DenseFeatures the way Input is for InputLayer. + output = dense(feature_layer) + + model = keras.models.Model([feature_layer], [output]) + + optimizer = "rmsprop" + loss = "mse" + loss_weights = [1.0, 0.5] + model.compile( + optimizer, + loss, + metrics=[metrics_module.CategoricalAccuracy(), "mae"], + loss_weights=loss_weights, + ) + + data = ({"a": np.arange(10), "b": np.arange(10)}, np.arange(10, 20)) + model.fit(*data, epochs=1) + + # TODO(kaftan) seems to throw an error when enabled. 
+ @test_combinations.run_all_keras_modes + def DISABLED_test_function_model_multiple_feature_layer_inputs(self): + col_a = tf.feature_column.numeric_column("a") + col_b = tf.feature_column.numeric_column("b") + col_c = tf.feature_column.numeric_column("c") + + fc1 = df.DenseFeatures([col_a, col_b], name="fc1") + fc2 = df.DenseFeatures([col_b, col_c], name="fc2") + dense = keras.layers.Dense(4) + + # This seems problematic.... We probably need something for + # DenseFeatures the way Input is for InputLayer. + output = dense(fc1) + dense(fc2) + + model = keras.models.Model([fc1, fc2], [output]) + + optimizer = "rmsprop" + loss = "mse" + loss_weights = [1.0, 0.5] + model.compile( + optimizer, + loss, + metrics=[metrics_module.CategoricalAccuracy(), "mae"], + loss_weights=loss_weights, + ) + + data_list = ( + [ + {"a": np.arange(10), "b": np.arange(10)}, + {"b": np.arange(10), "c": np.arange(10)}, + ], + np.arange(10, 100), + ) + model.fit(*data_list, epochs=1) + + data_bloated_list = ( + [ + {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}, + {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}, + ], + np.arange(10, 100), + ) + model.fit(*data_bloated_list, epochs=1) + + data_dict = ( + { + "fc1": {"a": np.arange(10), "b": np.arange(10)}, + "fc2": {"b": np.arange(10), "c": np.arange(10)}, + }, + np.arange(10, 100), + ) + model.fit(*data_dict, epochs=1) + + data_bloated_dict = ( + { + "fc1": { + "a": np.arange(10), + "b": np.arange(10), + "c": np.arange(10), + }, + "fc2": { + "a": np.arange(10), + "b": np.arange(10), + "c": np.arange(10), + }, + }, + np.arange(10, 100), + ) + model.fit(*data_bloated_dict, epochs=1) + + @test_combinations.run_all_keras_modes + def test_string_input(self): + x = { + "age": np.random.random((1024, 1)), + "cabin": np.array(["a"] * 1024), } - }, np.arange(10, 100)) - model.fit(*data_bloated_dict, epochs=1) - - @test_combinations.run_all_keras_modes - def test_string_input(self): - x = {'age': np.random.random((1024, 1)), - 'cabin': np.array(['a'] * 1024)} - y = np.random.randint(2, size=(1024, 1)) - ds1 = tf.data.Dataset.from_tensor_slices(x) - ds2 = tf.data.Dataset.from_tensor_slices(y) - dataset = tf.data.Dataset.zip((ds1, ds2)).batch(4) - categorical_cols = [tf.feature_column.categorical_column_with_hash_bucket('cabin', 10)] - feature_cols = ([tf.feature_column.numeric_column('age')] - + [tf.feature_column.indicator_column(cc) for cc in categorical_cols]) - layers = [df.DenseFeatures(feature_cols), - keras.layers.Dense(128), - keras.layers.Dense(1)] - - model = keras.models.Sequential(layers) - model.compile(optimizer='sgd', - loss=keras.losses.BinaryCrossentropy()) - model.fit(dataset) - - -if __name__ == '__main__': - tf.test.main() + y = np.random.randint(2, size=(1024, 1)) + ds1 = tf.data.Dataset.from_tensor_slices(x) + ds2 = tf.data.Dataset.from_tensor_slices(y) + dataset = tf.data.Dataset.zip((ds1, ds2)).batch(4) + categorical_cols = [ + tf.feature_column.categorical_column_with_hash_bucket("cabin", 10) + ] + feature_cols = [tf.feature_column.numeric_column("age")] + [ + tf.feature_column.indicator_column(cc) for cc in categorical_cols + ] + layers = [ + df.DenseFeatures(feature_cols), + keras.layers.Dense(128), + keras.layers.Dense(1), + ] + + model = keras.models.Sequential(layers) + model.compile(optimizer="sgd", loss=keras.losses.BinaryCrossentropy()) + model.fit(dataset) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/functional.py b/keras/engine/functional.py index 727f90d3c4fc..1dd8ba006fe7 100644 --- 
a/keras/engine/functional.py
+++ b/keras/engine/functional.py
@@ -12,14 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-# pylint: disable=protected-access
-"""A `Network` is way to compose layers: the topological form of a `Model`."""
+"""A `Network` is a way to compose layers: the topological form of a `Model`."""
import collections
import copy
import itertools
import warnings
+
+import tensorflow.compat.v2 as tf
+
from keras import backend
from keras.dtensor import layout_map as layout_map_lib
from keras.engine import base_layer
@@ -30,1448 +32,1672 @@ from keras.engine import node as node_module
from keras.engine import training as training_lib
from keras.engine import training_utils
-from keras.saving.saved_model import json_utils
-from keras.saving.saved_model import network_serialization
+from keras.saving import serialization_lib
+from keras.saving.legacy import serialization
+from keras.saving.legacy.saved_model import json_utils
+from keras.saving.legacy.saved_model import network_serialization
+from keras.saving.legacy.saved_model import utils as saved_model_utils
from keras.utils import generic_utils
from keras.utils import tf_inspect
from keras.utils import tf_utils
-import tensorflow.compat.v2 as tf
+
+# isort: off
from tensorflow.python.platform import tf_logging as logging
from tensorflow.tools.docs import doc_controls
-# pylint: disable=g-classes-have-attributes
class Functional(training_lib.Model):
- """A `Functional` model is a `Model` defined as a directed graph of layers.
-
- Three types of `Model` exist: subclassed `Model`, `Functional` model,
- and `Sequential` (a special case of `Functional`).
- In general, more Keras features are supported with `Functional`
- than with subclassed `Model`s, specifically:
-
- - Model cloning (`keras.models.clone`)
- - Serialization (`model.get_config()/from_config`, `model.to_json()`
- - Whole-model saving (`model.save()`)
-
- A `Functional` model can be instantiated by passing two arguments to
- `__init__`. The first argument is the `keras.Input` Tensors that represent
- the inputs to the model. The second argument specifies the output
- tensors that represent the outputs of this model. Both arguments can be a
- nested structure of tensors.
-
- Example:
-
- ```
- inputs = {'x1': keras.Input(shape=(10,)), 'x2': keras.Input(shape=(1,))}
- t = keras.layers.Dense(1, activation='relu')(inputs['x1'])
- outputs = keras.layers.Add()([t, inputs['x2'])
- model = keras.Model(inputs, outputs)
- ```
-
- A `Functional` model constructed using the Functional API can also include raw
- TensorFlow functions, with the exception of functions that create Variables
- or assign ops.
-
- Example:
-
- ```python
- inputs = keras.Input(shape=(10,))
- x = keras.layers.Dense(1)(inputs)
- outputs = tf.nn.relu(x)
- model = keras.Model(inputs, outputs)
- ```
-
- A new `Functional` model can also be created by using the
- intermediate tensors. This enables you to quickly extract sub-components
- of the model.
- - Example: - - ```python - inputs = keras.Input(shape=(None, None, 3)) - processed = keras.layers.RandomCrop(width=32, height=32)(inputs) - conv = keras.layers.Conv2D(filters=2, kernel_size=3)(processed) - pooling = keras.layers.GlobalAveragePooling2D()(conv) - feature = keras.layers.Dense(10)(pooling) - - full_model = keras.Model(inputs, feature) - backbone = keras.Model(processed, conv) - activations = keras.Model(conv, feature) - ``` - - Note that the `backbone` and `activations` models are not - created with `keras.Input` objects, but with the tensors that are originated - from `keras.Inputs` objects. Under the hood, the layers and weights will - be shared across these models, so that user can train the `full_model`, and - use `backbone` or `activations` to do feature extraction. - The inputs and outputs of the model can be nested structures of tensors as - well, and the created models are standard `Functional` model that support - all the existing API. - - Args: - inputs: List of input tensors (must be created via `tf.keras.Input()` or - originated from `tf.keras.Input()`). - outputs: List of output tensors. - name: String, optional. Name of the model. - trainable: Boolean, optional. If the model's variables should be trainable. - """ - - # See tf.Module for the usage of this property. - # The key of _layer_call_argspecs is a layer. tf.Module._flatten will fail to - # flatten the key since it is trying to convert Trackable/Layer to a string. - _TF_MODULE_IGNORED_PROPERTIES = frozenset(itertools.chain( - ('_layer_call_argspecs', '_compiled_trainable_state', - '_output_mask_cache', '_output_tensor_cache', '_output_shape_cache'), - training_lib.Model._TF_MODULE_IGNORED_PROPERTIES - )) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, inputs, outputs, name=None, trainable=True, - **kwargs): - # This is used by the Model class, since we have some logic to swap the - # class in the __new__ method, which will lead to __init__ get invoked - # twice. Using the skip_init to skip one of the invocation of __init__ to - # avoid any side effects - skip_init = kwargs.pop('skip_init', False) - if skip_init: - return - generic_utils.validate_kwargs(kwargs, {}) - super().__init__(name=name, trainable=trainable) - # Check if the inputs contain any intermediate `KerasTensor` (not created - # by tf.keras.Input()). In this case we need to clone the `Node` and - # `KerasTensor` objects to mimic rebuilding a new model from new inputs. - # This feature is only enabled in TF2 not in v1 graph mode. - if tf.compat.v1.executing_eagerly_outside_functions(): - if not all([functional_utils.is_input_keras_tensor(t) - for t in tf.nest.flatten(inputs)]): - inputs, outputs = functional_utils.clone_graph_nodes(inputs, outputs) - self._init_graph_network(inputs, outputs) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _init_graph_network(self, inputs, outputs): - # This method is needed for Sequential to reinitialize graph network when - # layer is added or removed. - - base_layer.keras_api_gauge.get_cell('Functional').set(True) - self._is_graph_network = True - - # Normalize and set self.inputs, self.outputs. 
- if isinstance(inputs, list) and len(tf.nest.flatten(inputs)) == 1:
- inputs = inputs[0]
- if isinstance(outputs, list) and len(tf.nest.flatten(outputs)) == 1:
- outputs = outputs[0]
- self._nested_inputs = inputs
- self._nested_outputs = outputs
- self.inputs = tf.nest.flatten(inputs)
- self.outputs = tf.nest.flatten(outputs)
-
- # Models constructed with a single Tensor or list of Tensors can
- # be called with a dict, where the keys of the dict are the names
- # of the `Input` objects. Extra keys are ignored with warning.
- if not tf.nest.is_nested(self._nested_inputs):
- self._enable_dict_to_input_mapping = True
- elif (isinstance(self._nested_inputs, (list, tuple)) and
- not any(tf.nest.is_nested(t) for t in self._nested_inputs)):
- self._enable_dict_to_input_mapping = True
- elif (isinstance(self._nested_inputs, dict) and
- not any(tf.nest.is_nested(t) for t in self._nested_inputs.values())):
- self._enable_dict_to_input_mapping = True
- else:
- self._enable_dict_to_input_mapping = False
+ """A `Functional` model is a `Model` defined as a directed graph of layers.
+
+ Three types of `Model` exist: subclassed `Model`, `Functional` model,
+ and `Sequential` (a special case of `Functional`).
+ In general, more Keras features are supported with `Functional`
+ than with subclassed `Model`s, specifically:
+
+ - Model cloning (`keras.models.clone`)
+ - Serialization (`model.get_config()/from_config`, `model.to_json()`)
+ - Whole-model saving (`model.save()`)
+
+ A `Functional` model can be instantiated by passing two arguments to
+ `__init__`. The first argument is the `keras.Input` Tensors that represent
+ the inputs to the model. The second argument specifies the output
+ tensors that represent the outputs of this model. Both arguments can be a
+ nested structure of tensors.
+
+ Example:
+
+ ```
+ inputs = {'x1': keras.Input(shape=(10,)), 'x2': keras.Input(shape=(1,))}
+ t = keras.layers.Dense(1, activation='relu')(inputs['x1'])
+ outputs = keras.layers.Add()([t, inputs['x2']])
+ model = keras.Model(inputs, outputs)
+ ```
+
+ A `Functional` model constructed using the Functional API can also include
+ raw TensorFlow functions, with the exception of functions that create
+ Variables or assign ops.
+
+ Example:
+
+ ```python
+ inputs = keras.Input(shape=(10,))
+ x = keras.layers.Dense(1)(inputs)
+ outputs = tf.nn.relu(x)
+ model = keras.Model(inputs, outputs)
+ ```
+
+ A new `Functional` model can also be created by using the
+ intermediate tensors. This enables you to quickly extract sub-components
+ of the model.
+
+ Example:
+
+ ```python
+ inputs = keras.Input(shape=(None, None, 3))
+ processed = keras.layers.RandomCrop(width=32, height=32)(inputs)
+ conv = keras.layers.Conv2D(filters=2, kernel_size=3)(processed)
+ pooling = keras.layers.GlobalAveragePooling2D()(conv)
+ feature = keras.layers.Dense(10)(pooling)
+
+ full_model = keras.Model(inputs, feature)
+ backbone = keras.Model(processed, conv)
+ activations = keras.Model(conv, feature)
+ ```
+
+ Note that the `backbone` and `activations` models are not
+ created with `keras.Input` objects, but with tensors that originate
+ from `keras.Input` objects. Under the hood, the layers and weights will
+ be shared across these models, so that the user can train the `full_model`,
+ and use `backbone` or `activations` to do feature extraction.
+ The inputs and outputs of the model can be nested structures of tensors as
+ well, and the created models are standard `Functional` models that support
+ all the existing APIs.
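As a hedged, self-contained illustration of the dict-input pattern this docstring describes (the layer names and shapes below are illustrative assumptions, not part of this change):

```python
import numpy as np
import keras

# Dict inputs: each key maps to a keras.Input; the same keys are used
# when calling the model with data.
inputs = {
    "x1": keras.Input(shape=(10,), name="x1"),
    "x2": keras.Input(shape=(1,), name="x2"),
}
t = keras.layers.Dense(1, activation="relu")(inputs["x1"])
outputs = keras.layers.Add()([t, inputs["x2"]])
model = keras.Model(inputs, outputs)

# The model can be called with a dict keyed by the Input names.
out = model({
    "x1": np.ones((2, 10), dtype="float32"),
    "x2": np.ones((2, 1), dtype="float32"),
})
print(out.shape)  # (2, 1)
```

Extra dict keys are ignored with a warning, which is what the `_enable_dict_to_input_mapping` logic further down implements.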
- if not tf.compat.v1.executing_eagerly_outside_functions(): - if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs): - base_layer_utils.create_keras_history(self._nested_outputs) - - self._validate_graph_inputs_and_outputs() - - # A Network does not create weights of its own, thus it is already - # built. - self.built = True - self._build_input_shape = tf.nest.map_structure(lambda x: x.shape, inputs) - self._compute_output_and_mask_jointly = True - # `_expects_training_arg` is True since the `training` argument is always - # present in the signature of the `call` method of a graph network. - self._call_spec.expects_training_arg = True - self._call_spec.expects_mask_arg = True - # A graph network does not autocast inputs, as its layers will cast them - # instead. - self._autocast = False - - self._input_layers = [] - self._output_layers = [] - self._input_coordinates = [] - self._output_coordinates = [] - - # This is for performance optimization when calling the Network on new - # inputs. Every time the Network is called on a set on input tensors, - # we compute the output tensors, output masks and output shapes in one pass, - # then cache them here. When any of these outputs is queried later, we - # retrieve it from there instead of recomputing it. - self._output_mask_cache = {} - self._output_tensor_cache = {} - self._output_shape_cache = {} - - # Build self._output_layers: - for x in self.outputs: - layer, node_index, tensor_index = x._keras_history # pylint: disable=protected-access - self._output_layers.append(layer) - self._output_coordinates.append((layer, node_index, tensor_index)) - - # Build self._input_layers: - for x in self.inputs: - layer, node_index, tensor_index = x._keras_history # pylint: disable=protected-access - # It's supposed to be an input layer, so only one node - # and one tensor output. - assert node_index == 0 - assert tensor_index == 0 - self._input_layers.append(layer) - self._input_coordinates.append((layer, node_index, tensor_index)) - - # Keep track of the network's nodes and layers. - nodes, nodes_by_depth, layers, _ = _map_graph_network( - self.inputs, self.outputs) - self._network_nodes = nodes - self._nodes_by_depth = nodes_by_depth - self._self_tracked_trackables = layers - self._layer_call_argspecs = {} - for layer in self._self_tracked_trackables: - self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) - - # Build self.input_names and self.output_names. - self._set_output_names() - self.input_names = [] - self._feed_input_names = [] - self._feed_inputs = [] - self._feed_input_shapes = [] - for layer in self._input_layers: - self.input_names.append(layer.name) - if layer.is_placeholder: - self._feed_input_names.append(layer.name) - # Use batch_input_shape here because non-eager composite tensors may not - # have a shape attribute that's meaningful (sparse, for instance, has - # a tensor that's non-constant and needs to be fed). This means that - # input layers that create placeholders will need to have the - # batch_input_shape attr to allow for input shape validation. - self._feed_input_shapes.append(layer._batch_input_shape) - self._feed_inputs.append(layer.input) - - self._compute_tensor_usage_count() - self._set_save_spec(self._nested_inputs) - tf_utils.assert_no_legacy_layers(self.layers) - - # Note that this method is used by both functional and sequential models, - # so we can't just have this method in functional.__init__, which will miss - # the coverage of sequential model. 
- if self._layout_map is not None: - layout_map_lib._map_functional_model_variable(self, self._layout_map) - - @property - def input(self): - """Retrieves the input tensor(s) of a layer. - - Only applicable if the layer has exactly one input, - i.e. if it is connected to one incoming layer. - - Returns: - Input tensor or list of input tensors. - - Raises: - RuntimeError: If called in Eager mode. - AttributeError: If no inbound nodes are found. + Args: + inputs: List of input tensors (must be created via `tf.keras.Input()` or + originated from `tf.keras.Input()`). + outputs: List of output tensors. + name: String, optional. Name of the model. + trainable: Boolean, optional. If the model's variables should be + trainable. """ - return self._nested_inputs - - @property - def input_shape(self): - """Retrieves the input shape(s) of a layer. - Only applicable if the layer has exactly one input, - i.e. if it is connected to one incoming layer, or if all inputs - have the same shape. + # See tf.Module for the usage of this property. + # The key of _layer_call_argspecs is a layer. tf.Module._flatten will fail + # to flatten the key since it is trying to convert Trackable/Layer to a + # string. + _TF_MODULE_IGNORED_PROPERTIES = frozenset( + itertools.chain( + ( + "_layer_call_argspecs", + "_output_mask_cache", + "_output_tensor_cache", + "_output_shape_cache", + ), + training_lib.Model._TF_MODULE_IGNORED_PROPERTIES, + ) + ) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__(self, inputs, outputs, name=None, trainable=True, **kwargs): + # This is used by the Model class, since we have some logic to swap the + # class in the __new__ method, which will lead to __init__ get invoked + # twice. Using the skip_init to skip one of the invocation of __init__ + # to avoid any side effects + skip_init = kwargs.pop("skip_init", False) + if skip_init: + return + generic_utils.validate_kwargs(kwargs, {}) + super().__init__(name=name, trainable=trainable) + # Check if the inputs contain any intermediate `KerasTensor` (not + # created by tf.keras.Input()). In this case we need to clone the `Node` + # and `KerasTensor` objects to mimic rebuilding a new model from new + # inputs. This feature is only enabled in TF2 not in v1 graph mode. + if tf.compat.v1.executing_eagerly_outside_functions(): + if not all( + [ + functional_utils.is_input_keras_tensor(t) + for t in tf.nest.flatten(inputs) + ] + ): + inputs, outputs = functional_utils.clone_graph_nodes( + inputs, outputs + ) + self._init_graph_network(inputs, outputs) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _init_graph_network(self, inputs, outputs): + # This method is needed for Sequential to reinitialize graph network + # when layer is added or removed. + + base_layer.keras_api_gauge.get_cell("Functional").set(True) + self._is_graph_network = True + + # Normalize and set self.inputs, self.outputs. + if isinstance(inputs, list) and len(tf.nest.flatten(inputs)) == 1: + inputs = inputs[0] + if isinstance(outputs, list) and len(tf.nest.flatten(outputs)) == 1: + outputs = outputs[0] + self._nested_inputs = inputs + self._nested_outputs = outputs + self.inputs = tf.nest.flatten(inputs) + self.outputs = tf.nest.flatten(outputs) + + # Models constructed with a single Tensor or list of Tensors can + # be called with a dict, where the keys of the dict are the names + # of the `Input` objects. Extra keys are ignored with warning. 
+ if not tf.nest.is_nested(self._nested_inputs):
+ self._enable_dict_to_input_mapping = True
+ elif isinstance(self._nested_inputs, (list, tuple)) and not any(
+ tf.nest.is_nested(t) for t in self._nested_inputs
+ ):
+ self._enable_dict_to_input_mapping = True
+ elif isinstance(self._nested_inputs, dict) and not any(
+ tf.nest.is_nested(t) for t in self._nested_inputs.values()
+ ):
+ self._enable_dict_to_input_mapping = True
+ else:
+ self._enable_dict_to_input_mapping = False
+
+ if not tf.compat.v1.executing_eagerly_outside_functions():
+ if any(
+ not hasattr(tensor, "_keras_history") for tensor in self.outputs
+ ):
+ base_layer_utils.create_keras_history(self._nested_outputs)
+
+ self._validate_graph_inputs_and_outputs()
+
+ # A Network does not create weights of its own, thus it is already
+ # built.
+ self.built = True
+ self._build_input_shape = tf.nest.map_structure(
+ lambda x: x.shape, inputs
+ )
+ self._compute_output_and_mask_jointly = True
+ # `_expects_training_arg` is True since the `training` argument is
+ # always present in the signature of the `call` method of a graph
+ # network.
+ self._call_spec.expects_training_arg = True
+ self._call_spec.expects_mask_arg = True
+ # A graph network does not autocast inputs, as its layers will cast them
+ # instead.
+ self._autocast = False
+
+ self._input_layers = []
+ self._output_layers = []
+ self._input_coordinates = []
+ self._output_coordinates = []
+
+ # This is for performance optimization when calling the Network on new
+ # inputs. Every time the Network is called on a set of input tensors, we
+ # compute the output tensors, output masks and output shapes in one
+ # pass, then cache them here. When any of these outputs is queried
+ # later, we retrieve it from there instead of recomputing it.
+ self._output_mask_cache = {}
+ self._output_tensor_cache = {}
+ self._output_shape_cache = {}
+
+ # Build self._output_layers:
+ for x in self.outputs:
+ (
+ layer,
+ node_index,
+ tensor_index,
+ ) = x._keras_history
+ self._output_layers.append(layer)
+ self._output_coordinates.append((layer, node_index, tensor_index))
+
+ # Build self._input_layers:
+ for x in self.inputs:
+ (
+ layer,
+ node_index,
+ tensor_index,
+ ) = x._keras_history
+ # It's supposed to be an input layer, so only one node
+ # and one tensor output.
+ assert node_index == 0
+ assert tensor_index == 0
+ self._input_layers.append(layer)
+ self._input_coordinates.append((layer, node_index, tensor_index))
+
+ # Keep track of the network's nodes and layers.
+ nodes, nodes_by_depth, layers, _ = _map_graph_network(
+ self.inputs, self.outputs
+ )
+ self._network_nodes = nodes
+ self._nodes_by_depth = nodes_by_depth
+ self._self_tracked_trackables = layers
+ self._layer_call_argspecs = {}
+ for layer in self._self_tracked_trackables:
+ self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(
+ layer.call
+ )
+
+ # Build self.input_names and self.output_names.
+ self._set_output_names()
+ self.input_names = []
+ self._feed_input_names = []
+ self._feed_inputs = []
+ self._feed_input_shapes = []
+ for layer in self._input_layers:
+ self.input_names.append(layer.name)
+ if layer.is_placeholder:
+ self._feed_input_names.append(layer.name)
+ # Use batch_input_shape here because non-eager composite tensors
+ # may not have a shape attribute that's meaningful (sparse, for
+ # instance, has a tensor that's non-constant and needs to be
+ # fed). This means that input layers that create placeholders
+ # will need to have the batch_input_shape attr to allow for
+ # input shape validation.
+ self._feed_input_shapes.append(layer._batch_input_shape)
+ self._feed_inputs.append(layer.input)
+
+ self._compute_tensor_usage_count()
+ self._set_save_spec(self._nested_inputs)
+ tf_utils.assert_no_legacy_layers(self.layers)
+
+ # Note that this method is used by both functional and sequential
+ # models, so we can't just have this method in functional.__init__,
+ # which will miss the coverage of sequential model.
+ if self._layout_map is not None:
+ layout_map_lib._map_functional_model_variable(
+ self, self._layout_map
+ )
+
+ @property
+ def input(self):
+ """Retrieves the input tensor(s) of a layer.
+
+ Only applicable if the layer has exactly one input,
+ i.e. if it is connected to one incoming layer.
+
+ Returns:
+ Input tensor or list of input tensors.
+
+ Raises:
+ RuntimeError: If called in Eager mode.
+ AttributeError: If no inbound nodes are found.
+ """
+ return self._nested_inputs
+
+ @property
+ def input_shape(self):
+ """Retrieves the input shape(s) of a layer.
+
+ Only applicable if the layer has exactly one input,
+ i.e. if it is connected to one incoming layer, or if all inputs
+ have the same shape.
+
+ Returns:
+ Input shape, as an integer shape tuple
+ (or list of shape tuples, one tuple per input tensor).
+
+ Raises:
+ AttributeError: if the layer has no defined input_shape.
+ RuntimeError: if called in Eager mode.
+ """
+ return tf.nest.map_structure(backend.int_shape, self.input)
+
+ @property
+ def input_spec(self):
+ if hasattr(self, "_manual_input_spec"):
+ return self._manual_input_spec
+ if isinstance(self._nested_inputs, (dict, list, tuple)) and len(
+ self._nested_inputs
+ ) != len(self.inputs):
+ # Case where we have a nested structure.
+ # In such a case we can't safely run any checks.
+ return None
+ if isinstance(self._nested_inputs, dict):
+ # Case where `_nested_inputs` is a plain dict of Inputs.
+ names = sorted(self._nested_inputs.keys())
+ return [
+ input_spec.InputSpec(
+ shape=shape_with_no_batch_size(self._nested_inputs[name]),
+ allow_last_axis_squeeze=True,
+ name=name,
+ )
+ for name in names
+ ]
+ else:
+ # Single input, or list / tuple of inputs.
+ # The data may be passed as a dict keyed by input name.
+ return [
+ input_spec.InputSpec(
+ shape=shape_with_no_batch_size(x),
+ allow_last_axis_squeeze=True,
+ name=x._keras_history.layer.name,
+ )
+ for x in self.inputs
+ ]
+
+ @input_spec.setter
+ def input_spec(self, value):
+ self._manual_input_spec = value
+
+ @property
+ def output(self):
+ """Retrieves the output tensor(s) of a layer.
+
+ Only applicable if the layer has exactly one output,
+ i.e. if it is connected to one incoming layer.
+
+ Returns:
+ Output tensor or list of output tensors.
+
+ Raises:
+ AttributeError: if the layer is connected to more than one incoming
+ layer.
+ RuntimeError: if called in Eager mode.
+ """
+ return self._nested_outputs
+
+ @property
+ def output_shape(self):
+ """Retrieves the output shape(s) of a layer.
+
+ Only applicable if the layer has one output,
+ or if all outputs have the same shape.
+
+ Returns:
+ Output shape, as an integer shape tuple
+ (or list of shape tuples, one tuple per output tensor).
+
+ Raises:
+ AttributeError: if the layer has no defined output shape.
+ RuntimeError: if called in Eager mode.
+ """ + return tf.nest.map_structure(backend.int_shape, self.output) + + def _set_output_names(self): + """Assigns unique names to the Network's outputs. + + Output layers with multiple output tensors would otherwise lead to + duplicate names in self.output_names. + """ + uniquified = [] + output_names = set() + prefix_count = {} + for layer in self._output_layers: + proposal = layer.name + while proposal in output_names: + existing_count = prefix_count.get(layer.name, 1) + proposal = f"{layer.name}_{existing_count}" + prefix_count[layer.name] = existing_count + 1 + output_names.add(proposal) + uniquified.append(proposal) + self.output_names = uniquified + + @property + def _layer_checkpoint_dependencies(self): + """Dictionary of layer dependencies to be included in the checkpoint.""" + weight_layer_index = 0 + + dependencies = collections.OrderedDict() + for layer_index, layer in enumerate(self.layers): + try: + if layer.weights: + # Keep a separate index for layers which have weights. This + # allows users to insert Layers without weights anywhere in + # the network without breaking checkpoints. + dependencies[ + "layer_with_weights-%d" % weight_layer_index + ] = layer + weight_layer_index += 1 + except ValueError: + # The layer might have weights, but may not be built yet. We + # just treat it as layer without weight. + pass + + # Even if it doesn't have weights, we should still track everything + # in case it has/will have Trackable dependencies. + dependencies["layer-%d" % layer_index] = layer + return dependencies + + def _trackable_children(self, save_type="checkpoint", **kwargs): + dependencies = self._layer_checkpoint_dependencies + dependencies.update(super()._trackable_children(save_type, **kwargs)) + return dependencies + + def _lookup_dependency(self, name, cached_dependencies=None): + if cached_dependencies: + return cached_dependencies.get(name) + # Fall back to slow lookup (`layer_checkpoint_dependencies` does a + # thorough check of all layer to see if they contain weights.) + layer_dependencies = self._layer_checkpoint_dependencies + if name in layer_dependencies: + return layer_dependencies[name] + return super()._lookup_dependency(name) + + def _handle_deferred_layer_dependencies(self, layers): + """Handles layer checkpoint dependencies that are added after init.""" + layer_checkpoint_dependencies = self._layer_checkpoint_dependencies + layer_to_name = {v: k for k, v in layer_checkpoint_dependencies.items()} + for layer in layers: + if layer in layer_to_name: + self._handle_deferred_dependencies( + name=layer_to_name[layer], trackable=layer + ) + + @property + def _should_compute_mask(self): + return True + + def compute_mask(self, inputs, mask): + # TODO(omalleyt): b/123540974 This function is not really safe to call + # by itself because it will duplicate any updates and losses in graph + # mode by `call`ing the Layers again. + output_tensors = self._run_internal_graph(inputs, mask=mask) + return tf.nest.map_structure( + lambda t: getattr(t, "_keras_mask", None), output_tensors + ) + + @doc_controls.do_not_doc_inheritable + def call(self, inputs, training=None, mask=None): + """Calls the model on new inputs. + + In this case `call` just reapplies + all ops in the graph to the new inputs + (e.g. build a new computational graph from the provided inputs). + + Args: + inputs: A tensor or list of tensors. + training: Boolean or boolean scalar tensor, indicating whether to + run the `Network` in training mode or inference mode. + mask: A mask or list of masks. 
A mask can be + either a tensor or None (no mask). + + Returns: + A tensor if there is a single output, or + a list of tensors if there are more than one outputs. + """ + return self._run_internal_graph(inputs, training=training, mask=mask) + + def compute_output_shape(self, input_shape): + # Convert any shapes in tuple format to TensorShapes. + input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) + + if len(tf.nest.flatten(input_shape)) != len( + tf.nest.flatten(self._input_layers) + ): + raise ValueError( + f"Invalid `input_shape` argument {input_shape}: " + f"the model expects {len(self._input_layers)} " + "input tensors." + ) - Returns: - Input shape, as an integer shape tuple - (or list of shape tuples, one tuple per input tensor). + # Use the tuple of TensorShape as the cache key, since a tuple is hashable + # and can be used as a hash key. + try: + cache_key = tuple( + tf_utils.convert_shapes(input_shape, to_tuples=True) + ) + if cache_key in self._output_shape_cache: + # Cache hit. Return shapes as TensorShapes. + return self._output_shape_cache[cache_key] + except ValueError: + # In case there are unknown TensorShapes, e.g. for sparse tensor inputs, + # we skip the caching since the shape is unknown. + pass + + layers_to_output_shapes = {} + for layer, shape in zip( + self._input_layers, tf.nest.flatten(input_shape) + ): + # It's an input layer: then `compute_output_shape` is identity, + # and there is only one node and one tensor. + shape_key = layer.name + "_0_0" + layers_to_output_shapes[shape_key] = shape - Raises: - AttributeError: if the layer has no defined input_shape. - RuntimeError: if called in Eager mode. - """ - return tf.nest.map_structure(backend.int_shape, self.input) - - @property - def input_spec(self): - if hasattr(self, '_manual_input_spec'): - return self._manual_input_spec - if (isinstance(self._nested_inputs, (dict, list, tuple)) and - len(self._nested_inputs) != len(self.inputs)): - # Case where we have a nested structure. - # In such a case we can't safely run any checks. - return None - if isinstance(self._nested_inputs, dict): - # Case where `_nested_inputs` is a plain dict of Inputs. - names = sorted(self._nested_inputs.keys()) - return [input_spec.InputSpec( - shape=shape_with_no_batch_size(self._nested_inputs[name]), - allow_last_axis_squeeze=True, name=name) for name in names] - else: - # Single input, or list / tuple of inputs. - # The data may be passed as a dict keyed by input name. - return [input_spec.InputSpec( - shape=shape_with_no_batch_size(x), allow_last_axis_squeeze=True, - name=x._keras_history.layer.name) for x in self.inputs] + depth_keys = list(self._nodes_by_depth.keys()) + depth_keys.sort(reverse=True) + # Iterate over nodes, by depth level. + if len(depth_keys) > 1: + for depth in depth_keys: + nodes = self._nodes_by_depth[depth] + for node in nodes: + layer = node.layer + if layer in self._input_layers: + # We've already covered the input layers + # a few lines above. + continue + # Get the input shapes for the first argument of the node + layer_input_shapes = [] + layer_inputs = node.call_args[0] + for layer_input in tf.nest.flatten(layer_inputs): + kh = layer_input._keras_history + input_layer_key = kh.layer.name + "_%s_%s" % ( + kh.node_index, + kh.tensor_index, + ) + layer_input_shapes.append( + layers_to_output_shapes[input_layer_key] + ) + layer_input_shapes = tf.nest.pack_sequence_as( + layer_inputs, layer_input_shapes + ) + # Layers expect shapes to be tuples for + # `compute_output_shape`.
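`compute_output_shape` walks the graph symbolically, so shapes (including an unknown batch dimension) propagate without executing any ops, and each distinct input-shape tuple gets its own cache entry. A short usage sketch, assuming TF 2.x:

import tensorflow as tf

inp = tf.keras.Input(shape=(16,))
out = tf.keras.layers.Dense(4)(tf.keras.layers.Dense(8)(inp))
model = tf.keras.Model(inp, out)
print(model.compute_output_shape((None, 16)))  # (None, 4)
print(model.compute_output_shape((32, 16)))    # (32, 4); a separate cache entry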
+ layer_input_shapes = tf_utils.convert_shapes( + layer_input_shapes, to_tuples=True + ) + layer_output_shapes = layer.compute_output_shape( + layer_input_shapes + ) + # Convert back to TensorShapes. + layer_output_shapes = tf_utils.convert_shapes( + layer_output_shapes, to_tuples=False + ) + + node_index = layer._inbound_nodes.index(node) + for j, shape in enumerate( + tf.nest.flatten(layer_output_shapes) + ): + shape_key = layer.name + f"_{node_index}_{j}" + layers_to_output_shapes[shape_key] = shape + + # Read final output shapes from layers_to_output_shapes. + output_shapes = [] + for i in range(len(self._output_layers)): + layer, node_index, tensor_index = self._output_coordinates[i] + shape_key = layer.name + f"_{node_index}_{tensor_index}" + output_shapes.append(layers_to_output_shapes[shape_key]) + output_shapes = tf.nest.pack_sequence_as( + self._nested_outputs, output_shapes + ) + # Store in cache. + self._output_shape_cache[cache_key] = output_shapes + + # Return shapes as TensorShapes. + return output_shapes + + def _init_set_name(self, name, zero_based=True): + if not name: + cls_name = self.__class__.__name__ + if self.__class__ == Functional: + # Hide the functional class name from the user, since it's not a + # publicly visible class. Use "Model" instead. + cls_name = "Model" + self._name = backend.unique_object_name( + generic_utils.to_snake_case(cls_name), zero_based=zero_based + ) + else: + self._name = name - @input_spec.setter - def input_spec(self, value): - self._manual_input_spec = value + def _run_internal_graph(self, inputs, training=None, mask=None): + """Computes output tensors for new inputs. - @property - def output(self): - """Retrieves the output tensor(s) of a layer. + # Note: + - Can be run on non-Keras tensors. - Only applicable if the layer has exactly one output, - i.e. if it is connected to one incoming layer. + Args: + inputs: Tensor or nested structure of Tensors. + training: Boolean learning phase. + mask: (Optional) Tensor or nested structure of Tensors. - Returns: - Output tensor or list of output tensors. + Returns: + output_tensors + """ + inputs = self._flatten_to_reference_inputs(inputs) + if mask is None: + masks = [None] * len(inputs) + else: + masks = self._flatten_to_reference_inputs(mask) + for input_t, mask in zip(inputs, masks): + input_t._keras_mask = mask + + # Dictionary mapping reference tensors to computed tensors. + tensor_dict = {} + tensor_usage_count = self._tensor_usage_count + for x, y in zip(self.inputs, inputs): + y = self._conform_to_reference_input(y, ref_input=x) + x_id = str(id(x)) + tensor_dict[x_id] = [y] * tensor_usage_count[x_id] + + nodes_by_depth = self._nodes_by_depth + depth_keys = list(nodes_by_depth.keys()) + depth_keys.sort(reverse=True) + + for depth in depth_keys: + nodes = nodes_by_depth[depth] + for node in nodes: + if node.is_input: + continue # Input tensors already exist. + + if any(t_id not in tensor_dict for t_id in node.flat_input_ids): + continue # Node is not computable, try skipping. + + args, kwargs = node.map_arguments(tensor_dict) + outputs = node.layer(*args, **kwargs) + + # Update tensor_dict.
+ for x_id, y in zip( + node.flat_output_ids, tf.nest.flatten(outputs) + ): + tensor_dict[x_id] = [y] * tensor_usage_count[x_id] + + output_tensors = [] + for x in self.outputs: + x_id = str(id(x)) + assert x_id in tensor_dict, "Could not compute output " + str(x) + output_tensors.append(tensor_dict[x_id].pop()) + + return tf.nest.pack_sequence_as(self._nested_outputs, output_tensors) + + def _flatten_to_reference_inputs(self, tensors): + """Maps `tensors` to their respective `keras.Input`.""" + if self._enable_dict_to_input_mapping and isinstance(tensors, dict): + ref_inputs = self._nested_inputs + if not tf.nest.is_nested(ref_inputs): + ref_inputs = [self._nested_inputs] + if isinstance(ref_inputs, dict): + # In the case that the graph is constructed with dict input + # tensors, we will use the original dict key to map with the + # keys in the input data. Note that the model.inputs is using + # nest.flatten to process the input tensors, which means the + # dict input tensors are ordered by their keys. + ref_input_names = sorted(ref_inputs.keys()) + else: + ref_input_names = [ + inp._keras_history.layer.name for inp in ref_inputs + ] + + # Raise a warning if there is more input data than input + # tensors. + if len(tensors) > len(ref_input_names): + warnings.warn( + "Input dict contained keys {} which did not match any " + "model input. They will be ignored by the model.".format( + [n for n in tensors.keys() if n not in ref_input_names] + ), + stacklevel=2, + ) + + try: + # Flatten in the order `Input`s were passed during Model + # construction. + return [tensors[n] for n in ref_input_names] + except KeyError: + # TODO(b/151582614) + return tf.nest.flatten(tensors) + + # Otherwise both self.inputs and tensors will already be in same order. + return tf.nest.flatten(tensors) - Raises: - AttributeError: if the layer is connected to more than one incoming - layers. - RuntimeError: if called in Eager mode. - """ - return self._nested_outputs + def _conform_to_reference_input(self, tensor, ref_input): + """Set shape and dtype based on `keras.Input`s.""" + if isinstance(tensor, tf.Tensor): + # Allow (None,) and (None, 1) Tensors to be passed interchangeably. + # Use the shape specified by the `keras.Input`. + t_shape = tensor.shape + t_rank = t_shape.rank + ref_shape = ref_input.shape + ref_rank = ref_shape.rank + keras_history = getattr(tensor, "_keras_history", None) + if t_rank is not None and ref_rank is not None: + # Should squeeze last dimension. True if tensor is (BATCH, ..., + # 1) and reference is (BATCH, ...). + if t_rank == ref_rank + 1 and t_shape[-1] == 1: + tensor = tf.squeeze(tensor, axis=-1) + # Should expand last dimension. True if tensor is (BATCH, ...) + # and reference is (BATCH, ..., 1). + elif t_rank == ref_rank - 1 and ref_shape[-1] == 1: + tensor = tf.expand_dims(tensor, axis=-1) + if keras_history is not None: # Restore keras history. + tensor._keras_history = keras_history + + # Dtype casting. + tensor = tf.cast(tensor, dtype=ref_input.dtype) + elif tf_utils.is_extension_type(tensor): + # Dtype casting (If the extension type has a non-variant dtype and + # supports being cast). Only cast if necessary (since some + # extension types may not implement tf.cast).
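The rank adjustment in `_conform_to_reference_input` above is what lets `(batch, ..., 1)` and `(batch, ...)` tensors be passed interchangeably. A hedged sketch of the effect, assuming TF 2.x (the `allow_last_axis_squeeze` specs built earlier cooperate with this tolerance):

import tensorflow as tf

inp = tf.keras.Input(shape=(3,))
model = tf.keras.Model(inp, tf.keras.layers.Dense(1)(inp))
# A trailing axis of size 1 is squeezed away to match the (None, 3) reference.
print(model(tf.ones((2, 3, 1))).shape)  # (2, 1)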
+ tensor_dtype = getattr(tensor, "dtype", None) + ref_input_dtype = getattr(ref_input, "dtype", None) + if ( + ref_input_dtype is not None + and tensor_dtype is not None + and tensor_dtype != ref_input_dtype + and ref_input_dtype != tf.variant + ): + tensor = tf.cast(tensor, dtype=ref_input_dtype) + + return tensor + + @generic_utils.default + def get_config(self): + # Prepare base arguments + config = { + "name": self.name, + "trainable": self.trainable, + } - @property - def output_shape(self): - """Retrieves the output shape(s) of a layer. + if saved_model_utils.in_tf_saved_model_scope(): + # SavedModel special case: need to preserve legacy (potentially + # incorrect) behavior. + return copy.deepcopy(get_network_config(self, config=config)) + + # Check whether the class has a constructor compatible with a Functional + # model or if it has a custom constructor. + if has_functional_like_constructor(self.__class__): + # Only return a Functional config if the constructor is the same + # as that of a Functional model. This excludes subclassed Functional + # models with a custom __init__. + config = copy.deepcopy(get_network_config(self, config=config)) + else: + # Try to autogenerate config + xtra_args = set(config.keys()) + if getattr(self, "_auto_get_config", False): + config.update(self._auto_config.config) + # Remove args not explicitly supported + argspec = tf_inspect.getfullargspec(self.__init__) + if argspec.varkw != "kwargs": + for key in xtra_args - xtra_args.intersection(argspec.args[1:]): + config.pop(key, None) + return config + + def get_weight_paths(self): + result = {} + for layer in self.layers: + ( + descendants, + object_paths_dict, + ) = tf.__internal__.tracking.ObjectGraphView( + layer + ).breadth_first_traversal() + for descendant in descendants: + if isinstance(descendant, tf.Variable): + trackable_references = object_paths_dict[descendant] + object_path = ".".join( + [t.name for t in trackable_references] + ) + result[layer.name + "." + object_path] = descendant + return result + + def _validate_graph_inputs_and_outputs(self): + """Validates the inputs and outputs of a Graph Network.""" + # Check for redundancy in inputs. + if len({id(i) for i in self.inputs}) != len(self.inputs): + raise ValueError( + "The list of inputs passed to the model " + "contains the same input multiple times. " + "All inputs should only appear once. " + f"Received inputs={self.inputs}" + ) + + for x in self.inputs: + # Check that x has appropriate `_keras_history` metadata. + if not hasattr(x, "_keras_history"): + cls_name = self.__class__.__name__ + raise ValueError( + f"Input tensors to a {cls_name} model " + "must come from `tf.keras.Input`. " + f"Received inputs={x} (missing previous layer metadata)." + ) + # Check that x is an input tensor. + + layer = x._keras_history.layer + if len(layer._inbound_nodes) > 1 or ( + layer._inbound_nodes and not layer._inbound_nodes[0].is_input + ): + cls_name = self.__class__.__name__ + logging.warning( + f"{cls_name} model inputs must come from " + "`tf.keras.Input` (thus holding past layer metadata). " + "They cannot be the output of " + "a previous non-Input layer. " + "Here, a tensor specified as " + f'input to "{self.name}" was not an Input tensor, ' + f'it was generated by layer "{layer.name}".\n' + "Note that input tensors are " + "instantiated via `tensor = tf.keras.Input(shape)`.\n" + f"The tensor that caused the issue was: {x}" + ) + + # Check compatibility of batch sizes of Input Layers.
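`get_weight_paths` above flattens every variable reachable from each layer into a dict keyed by `<layer name>.<variable path>`. A small usage sketch, assuming a TF version (2.9+) where `Model.get_weight_paths` is available:

import tensorflow as tf

inp = tf.keras.Input(shape=(4,))
model = tf.keras.Model(inp, tf.keras.layers.Dense(2, name="head")(inp))
for path, variable in model.get_weight_paths().items():
    print(path, variable.shape)  # e.g. "head.kernel (4, 2)" and "head.bias (2,)"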
+ input_batch_sizes = set( + [ + training_utils.get_static_batch_size(x._keras_history.layer) + for x in self.inputs + ] + ) + input_batch_sizes.discard(None) + if len(input_batch_sizes) > 1: + logging.warning( + "Found incompatible static batch sizes among the " + f"inputs. Batch sizes: {sorted(input_batch_sizes)}" + ) + + for x in self.outputs: + if not hasattr(x, "_keras_history"): + cls_name = self.__class__.__name__ + raise ValueError( + f"Output tensors of a {cls_name} model must be " + "the output of a TensorFlow `Layer` " + f"(thus holding past layer metadata). Found: {x}" + ) + + def _insert_layers(self, layers, relevant_nodes=None): + """Inserts Layers into the Network after Network creation. + + This is only valid for Keras Graph Networks. Layers added via this + function will be included in the `call` computation and `get_config` of + this Network. They will not be added to the Network's outputs. + + Args: + layers: Arbitrary nested structure of Layers. Layers must be reachable + from one or more of the `keras.Input` Tensors that correspond to + this Network's inputs. + relevant_nodes: Nodes from the Layers that should be considered part + of this Network. If `None`, all Nodes will be considered part of + this Network. + + Raises: + ValueError: If the layers depend on `Input`s not found in this Model. + """ + layers = tf.nest.flatten(layers) + tf_utils.assert_no_legacy_layers(layers) + node_to_depth = {} + for depth, nodes in self._nodes_by_depth.items(): + node_to_depth.update({node: depth for node in nodes}) + # The nodes of these Layers that are relevant to this Network. If not + # provided, assume all Nodes are relevant + if not relevant_nodes: + relevant_nodes = tf.nest.flatten( + [layer._inbound_nodes for layer in layers] + ) + network_nodes = set(relevant_nodes + list(node_to_depth.keys())) + + def _get_min_depth(node): + """Gets the minimum depth at which node can be computed.""" + min_depth = 0 + for layer, node_id, _, _ in node.iterate_inbound(): + inbound_node = layer._inbound_nodes[node_id] + if inbound_node in node_to_depth: + min_depth = min(min_depth, node_to_depth[inbound_node]) + elif inbound_node not in network_nodes: + continue + else: + # Previous relevant nodes haven't been processed yet. + return None + # New node is one shallower than its shallowest input. + return min_depth - 1 + + # Insert nodes into `_nodes_by_depth` and other node attrs. + unprocessed_nodes = copy.copy(relevant_nodes) + i = 0 + while unprocessed_nodes: + i += 1 + # Do a sanity check. This can occur if `Input`s from outside this + # Model are being relied on. + if i > 10000: + raise ValueError( + "Layers could not be added due to missing dependencies." + ) + + node = unprocessed_nodes.pop(0) + depth = _get_min_depth(node) + if depth is None: # Defer until inbound nodes are processed. + unprocessed_nodes.append(node) + continue + node_key = _make_node_key( + node.layer.name, node.layer._inbound_nodes.index(node) + ) + if node_key not in self._network_nodes: + node_to_depth[node] = depth + self._network_nodes.add(node_key) + self._nodes_by_depth[depth].append(node) + + # Insert layers and update other layer attrs. 
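Note that the static batch-size check above only logs a warning; model construction still succeeds. A sketch of the triggering condition, assuming TF 2.x:

import tensorflow as tf

a = tf.keras.Input(batch_size=8, shape=(2,))
b = tf.keras.Input(batch_size=16, shape=(2,))
model = tf.keras.Model([a, b], tf.keras.layers.Concatenate()([a, b]))
# Logs: "Found incompatible static batch sizes among the inputs.
#        Batch sizes: [8, 16]"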
+ layer_set = set(self._self_tracked_trackables) + deferred_layers = [] + for layer in layers: + if layer not in layer_set: + self._self_tracked_trackables.append(layer) + deferred_layers.append(layer) + self._layer_call_argspecs[layer] = tf_inspect.getfullargspec( + layer.call + ) + layer_set.add(layer) + self._handle_deferred_layer_dependencies(deferred_layers) + + self._compute_tensor_usage_count() + + def _compute_tensor_usage_count(self): + """Compute the number of tensor usages for all the output tensors of layers. + + The computed tensor usage count is saved as `self._tensor_usage_count`. + This is later used for saving memory in eager computation by releasing + no-longer-needed tensors as early as possible. + """ + tensor_usage_count = collections.Counter() + available_tensors = set(str(id(tensor)) for tensor in self.inputs) + + depth_keys = list(self._nodes_by_depth.keys()) + depth_keys.sort(reverse=True) + depth_keys = depth_keys[1:] + + for depth in depth_keys: + for node in self._nodes_by_depth[depth]: + input_tensors = { + str(id(tensor)) + for tensor in tf.nest.flatten(node.keras_inputs) + } + if input_tensors.issubset(available_tensors): + for tensor in tf.nest.flatten(node.keras_inputs): + tensor_usage_count[str(id(tensor))] += 1 + + for output_tensor in tf.nest.flatten(node.outputs): + available_tensors.add(str(id(output_tensor))) + + for tensor in self.outputs: + tensor_usage_count[str(id(tensor))] += 1 - Only applicable if the layer has one output, - or if all outputs have the same shape. + self._tensor_usage_count = tensor_usage_count + + def _assert_weights_created(self): + # Override the implementation in Model. + # The Functional model should always have weights created already. + return + + def _graph_network_add_loss(self, symbolic_loss): + new_nodes, new_layers = _map_subgraph_network( + self.inputs, [symbolic_loss] + ) + # Losses must be keyed on inputs no matter what in order to be supported + # in DistributionStrategy. + add_loss_layer = base_layer.AddLoss( + unconditional=False, dtype=symbolic_loss.dtype + ) + add_loss_layer(symbolic_loss) + new_nodes.extend(add_loss_layer.inbound_nodes) + new_layers.append(add_loss_layer) + self._insert_layers(new_layers, new_nodes) + + def _graph_network_add_metric(self, value, aggregation, name): + new_nodes, new_layers = _map_subgraph_network(self.inputs, [value]) + add_metric_layer = base_layer.AddMetric( + aggregation, name, dtype=value.dtype + ) + add_metric_layer(value) + new_nodes.extend(add_metric_layer.inbound_nodes) + new_layers.append(add_metric_layer) + self._insert_layers(new_layers, new_nodes) + + @property + def _trackable_saved_model_saver(self): + return network_serialization.NetworkSavedModelSaver(self) + + def _get_save_spec(self, dynamic_batch=True, inputs_only=True): + if getattr(self, "_has_explicit_input_shape", True): + # Functional models and Sequential models that have an explicit + # input shape should use the batch size set by the input layer. + dynamic_batch = False + return super()._get_save_spec(dynamic_batch, inputs_only) - Returns: - Output shape, as an integer shape tuple - (or list of shape tuples, one tuple per output tensor). - Raises: - AttributeError: if the layer has no defined output shape. - RuntimeError: if called in Eager mode. - """ - return tf.nest.map_structure(backend.int_shape, self.output) +def _make_node_key(layer_name, node_index): + return layer_name + "_ib-" + str(node_index) - def _set_output_names(self): - """Assigns unique names to the Network's outputs.
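A standalone plain-Python rendering of the counting rule in `_compute_tensor_usage_count` (a toy graph with hypothetical names, not real Keras objects): each tensor is counted once per consuming node plus once per model output, so eager execution can release it after its final use.

import collections

# node -> tensors it consumes (toy data; "dense_out" is read twice by "add").
node_inputs = {"dense": ["x"], "add": ["dense_out", "dense_out"]}
usage = collections.Counter()
for consumed in node_inputs.values():
    for tensor in consumed:
        usage[tensor] += 1
usage["add_out"] += 1  # each model output is consumed once more by the caller
print(usage)  # Counter({'dense_out': 2, 'x': 1, 'add_out': 1})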
- Output layers with multiple output tensors would otherwise lead to duplicate - names in self.output_names. - """ - uniquified = [] - output_names = set() - prefix_count = {} - for layer in self._output_layers: - proposal = layer.name - while proposal in output_names: - existing_count = prefix_count.get(layer.name, 1) - proposal = '{}_{}'.format(layer.name, existing_count) - prefix_count[layer.name] = existing_count + 1 - output_names.add(proposal) - uniquified.append(proposal) - self.output_names = uniquified - - @property - def _layer_checkpoint_dependencies(self): - """Dictionary of layer dependencies to be included in the checkpoint.""" - weight_layer_index = 0 - - dependencies = collections.OrderedDict() - for layer_index, layer in enumerate(self.layers): - try: - if layer.weights: - # Keep a separate index for layers which have weights. This allows - # users to insert Layers without weights anywhere in the network - # without breaking checkpoints. - dependencies['layer_with_weights-%d' % weight_layer_index] = layer - weight_layer_index += 1 - except ValueError: - # The layer might have weights, but may not be built yet. We just treat - # it as layer without weight. - pass - - # Even if it doesn't have weights, we should still track everything in - # case it has/will have Trackable dependencies. - dependencies['layer-%d' % layer_index] = layer - return dependencies - - def _trackable_children(self, save_type='checkpoint', **kwargs): - dependencies = self._layer_checkpoint_dependencies - dependencies.update( - super()._trackable_children(save_type, **kwargs)) - return dependencies - - def _lookup_dependency(self, name): - layer_dependencies = self._layer_checkpoint_dependencies - if name in layer_dependencies: - return layer_dependencies[name] - return super()._lookup_dependency(name) - - def _handle_deferred_layer_dependencies(self, layers): - """Handles layer checkpoint dependencies that are added after init.""" - layer_checkpoint_dependencies = self._layer_checkpoint_dependencies - layer_to_name = {v: k for k, v in layer_checkpoint_dependencies.items()} - for layer in layers: - if layer in layer_to_name: - self._handle_deferred_dependencies(name=layer_to_name[layer], - trackable=layer) - - @property - def _should_compute_mask(self): - return True - - def compute_mask(self, inputs, mask): - # TODO(omalleyt): b/123540974 This function is not really safe to call - # by itself because it will duplicate any updates and losses in graph - # mode by `call`ing the Layers again. - output_tensors = self._run_internal_graph(inputs, mask=mask) - return tf.nest.map_structure(lambda t: getattr(t, '_keras_mask', None), - output_tensors) - - @doc_controls.do_not_doc_inheritable - def call(self, inputs, training=None, mask=None): - """Calls the model on new inputs. - - In this case `call` just reapplies - all ops in the graph to the new inputs - (e.g. build a new computational graph from the provided inputs). +def _map_graph_network(inputs, outputs): + """Validates a network's topology and gathers its layers and nodes. Args: - inputs: A tensor or list of tensors. - training: Boolean or boolean scalar tensor, indicating whether to run - the `Network` in training mode or inference mode. - mask: A mask or list of masks. A mask can be - either a tensor or None (no mask). + inputs: List of input tensors. + outputs: List of output tensors. Returns: - A tensor if there is a single output, or - a list of tensors if there are more than one outputs.
+ A tuple `(nodes, nodes_by_depth, layers, layers_by_depth)`. + - nodes: list of Node instances. + - nodes_by_depth: dict mapping ints (depth) to lists of node instances. + - layers: list of Layer instances. + - layers_by_depth: dict mapping ints (depth) to lists of layer instances. + + Raises: + ValueError: In case the network is not valid (e.g. disconnected graph). """ - return self._run_internal_graph( - inputs, training=training, mask=mask) - - def compute_output_shape(self, input_shape): - # Convert any shapes in tuple format to TensorShapes. - input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) - - if (len(tf.nest.flatten(input_shape)) != - len(tf.nest.flatten(self._input_layers))): - raise ValueError(f'Invalid `input_shape` argument {input_shape}: ' - f'the model expects {len(self._input_layers)} ' - 'input tensors.') - - # Use the tuple of TensorShape as the cache key, since tuple is hashable - # and can be used as hash key. - try: - cache_key = tuple(tf_utils.convert_shapes(input_shape, to_tuples=True)) - if cache_key in self._output_shape_cache: - # Cache hit. Return shapes as TensorShapes. - return self._output_shape_cache[cache_key] - except ValueError: - # In case there are unknown TensorShape, eg for sparse tensor input, - # We skip the caching since the shape is unknown. - pass - - layers_to_output_shapes = {} - for layer, shape in zip(self._input_layers, tf.nest.flatten(input_shape)): - # It's an input layer: then `compute_output_shape` is identity, - # and there is only one node and one tensor.. - shape_key = layer.name + '_0_0' - layers_to_output_shapes[shape_key] = shape - - depth_keys = list(self._nodes_by_depth.keys()) + # "depth" is number of layers between output Node and the Node. + # Nodes are ordered from inputs -> outputs. + nodes_in_decreasing_depth, layer_indices = _build_map(outputs) + network_nodes = { + _make_node_key(node.layer.name, node.layer._inbound_nodes.index(node)) + for node in nodes_in_decreasing_depth + } + + nodes_depths = {} # dict {node: depth value} + layers_depths = {} # dict {layer: depth value} + + for node in reversed(nodes_in_decreasing_depth): + # If the depth is not set, the node has no outbound nodes (depth 0). + depth = nodes_depths.setdefault(node, 0) + + # Update the depth of the corresponding layer + previous_depth = layers_depths.get(node.layer, 0) + # If we've seen this layer before at a higher depth, + # we should use that depth instead of the node depth. + # This is necessary for shared layers that have inputs at different + # depth levels in the graph. + depth = max(depth, previous_depth) + layers_depths[node.layer] = depth + nodes_depths[node] = depth + + # Update the depth of inbound nodes. + # The "depth" of a node is the max of the depths + # of all nodes it is connected to + 1. + for node_dep in node.parent_nodes: + previous_depth = nodes_depths.get(node_dep, 0) + nodes_depths[node_dep] = max(depth + 1, previous_depth) + + # Handle inputs that are not connected to outputs. + # We do not error out here because the inputs may be used to compute losses + # and metrics. 
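The depth bookkeeping in `_map_graph_network` reduces to one backward pass: output nodes sit at depth 0 and each node pushes its parents to at least `depth + 1`. A plain-Python sketch over a hypothetical three-node chain (not real Node objects):

def compute_depths(nodes_inputs_to_outputs, parents):
    depths = {}
    for node in reversed(nodes_inputs_to_outputs):
        depth = depths.setdefault(node, 0)  # no consumers seen yet -> depth 0
        for parent in parents.get(node, []):
            depths[parent] = max(depth + 1, depths.get(parent, 0))
    return depths

# x -> dense -> out, in the inputs-to-outputs order that _build_map produces.
print(compute_depths(["x", "dense", "out"], {"out": ["dense"], "dense": ["x"]}))
# {'out': 0, 'dense': 1, 'x': 2}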
+ for input_t in inputs: + input_layer = input_t._keras_history[0] + if input_layer not in layers_depths: + layers_depths[input_layer] = 0 + layer_indices[input_layer] = -1 + nodes_depths[input_layer._inbound_nodes[0]] = 0 + network_nodes.add(_make_node_key(input_layer.name, 0)) + + # Build a dict {depth: list of nodes with this depth} + nodes_by_depth = collections.defaultdict(list) + for node, depth in nodes_depths.items(): + nodes_by_depth[depth].append(node) + + # Build a dict {depth: list of layers with this depth} + layers_by_depth = collections.defaultdict(list) + for layer, depth in layers_depths.items(): + layers_by_depth[depth].append(layer) + + # Get sorted list of layer depths. + depth_keys = list(layers_by_depth.keys()) depth_keys.sort(reverse=True) - # Iterate over nodes, by depth level. - if len(depth_keys) > 1: - for depth in depth_keys: - nodes = self._nodes_by_depth[depth] - for node in nodes: - layer = node.layer - if layer in self._input_layers: - # We've already covered the input layers - # a few lines above. - continue - # Get the input shapes for the first argument of the node - layer_input_shapes = [] - layer_inputs = node.call_args[0] - for layer_input in tf.nest.flatten(layer_inputs): - kh = layer_input._keras_history - input_layer_key = kh.layer.name + '_%s_%s' % (kh.node_index, - kh.tensor_index) - layer_input_shapes.append(layers_to_output_shapes[input_layer_key]) - layer_input_shapes = tf.nest.pack_sequence_as(layer_inputs, - layer_input_shapes) - # Layers expect shapes to be tuples for `compute_output_shape`. - layer_input_shapes = tf_utils.convert_shapes( - layer_input_shapes, to_tuples=True) - layer_output_shapes = layer.compute_output_shape(layer_input_shapes) - # Convert back to TensorShapes. - layer_output_shapes = tf_utils.convert_shapes( - layer_output_shapes, to_tuples=False) - - node_index = layer._inbound_nodes.index(node) # pylint: disable=protected-access - for j, shape in enumerate(tf.nest.flatten(layer_output_shapes)): - shape_key = layer.name + '_%s_%s' % (node_index, j) - layers_to_output_shapes[shape_key] = shape - - # Read final output shapes from layers_to_output_shapes. - output_shapes = [] - for i in range(len(self._output_layers)): - layer, node_index, tensor_index = self._output_coordinates[i] - shape_key = layer.name + '_%s_%s' % (node_index, tensor_index) - output_shapes.append(layers_to_output_shapes[shape_key]) - output_shapes = tf.nest.pack_sequence_as(self._nested_outputs, - output_shapes) - # Store in cache. - self._output_shape_cache[cache_key] = output_shapes - - # Return shapes as TensorShapes. - return output_shapes - - def _init_set_name(self, name, zero_based=True): - if not name: - cls_name = self.__class__.__name__ - if self.__class__ == Functional: - # Hide the functional class name from user, since its not a public - # visible class. Use "Model" instead, - cls_name = 'Model' - self._name = backend.unique_object_name( - generic_utils.to_snake_case(cls_name), - zero_based=zero_based) - else: - self._name = name - def _run_internal_graph(self, inputs, training=None, mask=None): - """Computes output tensors for new inputs. - - # Note: - - Can be run on non-Keras tensors. - - Args: - inputs: Tensor or nested structure of Tensors. - training: Boolean learning phase. - mask: (Optional) Tensor or nested structure of Tensors. + # Set self.layers ordered by depth. 
+ layers = [] + for depth in depth_keys: + layers_for_depth = layers_by_depth[depth] + # Network.layers needs to have a deterministic order: + # here we order them by traversal order. + layers_for_depth.sort(key=lambda x: layer_indices[x]) + layers.extend(layers_for_depth) - Returns: - output_tensors - """ - inputs = self._flatten_to_reference_inputs(inputs) - if mask is None: - masks = [None] * len(inputs) - else: - masks = self._flatten_to_reference_inputs(mask) - for input_t, mask in zip(inputs, masks): - input_t._keras_mask = mask - - # Dictionary mapping reference tensors to computed tensors. - tensor_dict = {} - tensor_usage_count = self._tensor_usage_count - for x, y in zip(self.inputs, inputs): - y = self._conform_to_reference_input(y, ref_input=x) - x_id = str(id(x)) - tensor_dict[x_id] = [y] * tensor_usage_count[x_id] - - nodes_by_depth = self._nodes_by_depth + # Get sorted list of node depths. depth_keys = list(nodes_by_depth.keys()) depth_keys.sort(reverse=True) - for depth in depth_keys: - nodes = nodes_by_depth[depth] - for node in nodes: - if node.is_input: - continue # Input tensors already exist. - - if any(t_id not in tensor_dict for t_id in node.flat_input_ids): - continue # Node is not computable, try skipping. + # Check that all tensors required are computable. + # computable_tensors: all tensors in the graph + # that can be computed from the inputs provided. + computable_tensors = set() + for x in inputs: + computable_tensors.add(id(x)) - args, kwargs = node.map_arguments(tensor_dict) - outputs = node.layer(*args, **kwargs) + layers_with_complete_input = [] # To provide a better error msg. + for depth in depth_keys: + for node in nodes_by_depth[depth]: + layer = node.layer + if layer and not node.is_input: + for x in tf.nest.flatten(node.keras_inputs): + if id(x) not in computable_tensors: + raise ValueError( + "Graph disconnected: cannot obtain value for " + f'tensor {x} at layer "{layer.name}". ' + "The following previous layers were accessed " + f"without issue: {layers_with_complete_input}" + ) + for x in tf.nest.flatten(node.outputs): + computable_tensors.add(id(x)) + layers_with_complete_input.append(layer.name) + + # Ensure name unicity, which will be crucial for serialization + # (since serialized nodes refer to layers by their name). + all_names = [layer.name for layer in layers] + for name in all_names: + if all_names.count(name) != 1: + raise ValueError( + f'The name "{name}" is used {all_names.count(name)} ' + "times in the model. All layer names should be unique." + ) + return network_nodes, nodes_by_depth, layers, layers_by_depth - # Update tensor_dict. - for x_id, y in zip(node.flat_output_ids, tf.nest.flatten(outputs)): - tensor_dict[x_id] = [y] * tensor_usage_count[x_id] - output_tensors = [] - for x in self.outputs: - x_id = str(id(x)) - assert x_id in tensor_dict, 'Could not compute output ' + str(x) - output_tensors.append(tensor_dict[x_id].pop()) - - return tf.nest.pack_sequence_as(self._nested_outputs, output_tensors) - - def _flatten_to_reference_inputs(self, tensors): - """Maps `tensors` to their respective `keras.Input`.""" - if self._enable_dict_to_input_mapping and isinstance(tensors, dict): - ref_inputs = self._nested_inputs - if not tf.nest.is_nested(ref_inputs): - ref_inputs = [self._nested_inputs] - if isinstance(ref_inputs, dict): - # In the case that the graph is constructed with dict input tensors, - # We will use the original dict key to map with the keys in the input - # data. 
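The computability check above is the source of the familiar "Graph disconnected" error. A minimal reproduction, assuming TF 2.x:

import tensorflow as tf

a = tf.keras.Input(shape=(3,))
b = tf.keras.Input(shape=(3,))
out = tf.keras.layers.Add()([a, b])
try:
    tf.keras.Model(inputs=a, outputs=out)  # `b` is not declared as an input
except ValueError as e:
    print(e)  # Graph disconnected: cannot obtain value for tensor ...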
Note that the model.inputs is using nest.flatten to process the - # input tensors, which means the dict input tensors are ordered by their - # keys. - ref_input_names = sorted(ref_inputs.keys()) - else: - ref_input_names = [inp._keras_history.layer.name for inp in ref_inputs] - - # Raise an warning if there are more input data comparing to input tensor - if len(tensors) > len(ref_input_names): - warnings.warn( - 'Input dict contained keys {} which did not match any model input. ' - 'They will be ignored by the model.'.format( - [n for n in tensors.keys() if n not in ref_input_names]), - stacklevel=2) - - try: - # Flatten in the order `Input`s were passed during Model construction. - return [tensors[n] for n in ref_input_names] - except KeyError: - # TODO(b/151582614) - return tf.nest.flatten(tensors) +def _build_map(outputs): + """This method topologically sorts nodes in order from inputs to outputs. - # Otherwise both self.inputs and tensors will already be in same order. - return tf.nest.flatten(tensors) - - def _conform_to_reference_input(self, tensor, ref_input): - """Set shape and dtype based on `keras.Input`s.""" - if isinstance(tensor, tf.Tensor): - # Allow (None,) and (None, 1) Tensors to be passed interchangeably. Use - # the shape specified by the `keras.Input`. - t_shape = tensor.shape - t_rank = t_shape.rank - ref_shape = ref_input.shape - ref_rank = ref_shape.rank - keras_history = getattr(tensor, '_keras_history', None) - if t_rank is not None and ref_rank is not None: - # Should squeeze last dimension. - # True if tensor is (BATCH, ..., 1) and reference is (BATCH, ...). - if (t_rank == ref_rank + 1 and t_shape[-1] == 1): - tensor = tf.squeeze(tensor, axis=-1) - # Should expand last_dimension. - # True if tensor is (BATCH, ...) and reference is (BATCH, ..., 1). - elif (t_rank == ref_rank - 1 and ref_shape[-1] == 1): - tensor = tf.expand_dims(tensor, axis=-1) - if keras_history is not None: # Restore keras history. - tensor._keras_history = keras_history - - # Add shape hints to Tensors that may have None shape dims but have shapes - # defined by the `keras.Input` (not applicable in eager mode). - if not tf.executing_eagerly(): - try: - tensor.set_shape(tensor.shape.merge_with(ref_input.shape)) - except ValueError: - logging.warning( - 'Model was constructed with shape {} for input {}, but it was ' - 'called on an input with incompatible shape {}.'.format( - ref_input.shape, ref_input, tensor.shape)) - - # Dtype casting. - tensor = tf.cast(tensor, dtype=ref_input.dtype) - elif tf_utils.is_extension_type(tensor): - # Dtype casting (If the extension type has a non-variant dtype and - # supports being cast). Only cast if necessary (since some extension - # types may not implement tf.cast). - tensor_dtype = getattr(tensor, 'dtype', None) - ref_input_dtype = getattr(ref_input, 'dtype', None) - if (ref_input_dtype is not None and tensor_dtype is not None and - tensor_dtype != ref_input_dtype and ref_input_dtype != tf.variant): - tensor = tf.cast(tensor, dtype=ref_input_dtype) - - return tensor - - def get_config(self): - return copy.deepcopy(get_network_config(self)) - - def _validate_graph_inputs_and_outputs(self): - """Validates the inputs and outputs of a Graph Network.""" - # Check for redundancy in inputs. - if len({id(i) for i in self.inputs}) != len(self.inputs): - raise ValueError('The list of inputs passed to the model ' - 'contains the same input multiple times. ' - 'All inputs should only appear once.' 
- f'Received inputs={self.inputs}') - - for x in self.inputs: - # Check that x has appropriate `_keras_history` metadata. - if not hasattr(x, '_keras_history'): - cls_name = self.__class__.__name__ - raise ValueError( - f'Input tensors to a {cls_name} model ' - 'must come from `tf.keras.Input`. ' - f'Received inputs={x} (missing previous layer metadata).') - # Check that x is an input tensor. - # pylint: disable=protected-access - layer = x._keras_history.layer - if len(layer._inbound_nodes) > 1 or ( - layer._inbound_nodes and not layer._inbound_nodes[0].is_input): - cls_name = self.__class__.__name__ - logging.warning(f'{cls_name} model inputs must come from ' - '`tf.keras.Input` (thus holding past layer metadata). ' - 'They cannot be the output of ' - 'a previous non-Input layer. ' - 'Here, a tensor specified as ' - f'input to "{self.name}" was not an Input tensor, ' - f'it was generated by layer "{layer.name}".\n' - 'Note that input tensors are ' - 'instantiated via `tensor = tf.keras.Input(shape)`.\n' - f'The tensor that caused the issue was: {x}') - - # Check compatibility of batch sizes of Input Layers. - input_batch_sizes = set([ - training_utils.get_static_batch_size(x._keras_history.layer) - for x in self.inputs]) - input_batch_sizes.discard(None) - if len(input_batch_sizes) > 1: - logging.warning('Found incompatible static batch sizes among the ' - f'inputs. Batch sizes: {sorted(input_batch_sizes)}') - - for x in self.outputs: - if not hasattr(x, '_keras_history'): - cls_name = self.__class__.__name__ - raise ValueError(f'Output tensors of a {cls_name} model must be ' - 'the output of a TensorFlow `Layer` ' - f'(thus holding past layer metadata). Found: {x}') - - def _insert_layers(self, layers, relevant_nodes=None): - """Inserts Layers into the Network after Network creation. - - This is only valid for Keras Graph Networks. Layers added via this function - will be included in the `call` computation and `get_config` of this Network. - They will not be added to the Network's outputs. + It uses a depth-first search to topologically sort nodes that appear in the + _keras_history connectivity metadata of `outputs`. Args: - layers: Arbitrary nested structure of Layers. Layers must be reachable - from one or more of the `keras.Input` Tensors that correspond to this - Network's inputs. - relevant_nodes: Nodes from the Layers that should be considered part of - this Network. If `None`, all Nodes will be considered part of this - Network. + outputs: the output tensors whose _keras_history metadata should be + walked. This may be an arbitrary nested structure. - Raises: - ValueError: If the layers depend on `Input`s not found in this Model. - """ - layers = tf.nest.flatten(layers) - tf_utils.assert_no_legacy_layers(layers) - node_to_depth = {} - for depth, nodes in self._nodes_by_depth.items(): - node_to_depth.update({node: depth for node in nodes}) - # The nodes of these Layers that are relevant to this Network. 
If not - # provided, assume all Nodes are relevant - if not relevant_nodes: - relevant_nodes = tf.nest.flatten( - [layer._inbound_nodes for layer in layers]) - network_nodes = set(relevant_nodes + list(node_to_depth.keys())) - - def _get_min_depth(node): - """Gets the minimum depth at which node can be computed.""" - min_depth = 0 - for layer, node_id, _, _ in node.iterate_inbound(): - inbound_node = layer._inbound_nodes[node_id] - if inbound_node in node_to_depth: - min_depth = min(min_depth, node_to_depth[inbound_node]) - elif inbound_node not in network_nodes: - continue - else: - # Previous relevant nodes haven't been processed yet. - return None - # New node is one shallower than its shallowest input. - return min_depth - 1 - - # Insert nodes into `_nodes_by_depth` and other node attrs. - unprocessed_nodes = copy.copy(relevant_nodes) - i = 0 - while unprocessed_nodes: - i += 1 - # Do a sanity check. This can occur if `Input`s from outside this Model - # are being relied on. - if i > 10000: - raise ValueError('Layers could not be added due to missing ' - 'dependencies.') - - node = unprocessed_nodes.pop(0) - depth = _get_min_depth(node) - if depth is None: # Defer until inbound nodes are processed. - unprocessed_nodes.append(node) - continue - node_key = _make_node_key(node.layer.name, - node.layer._inbound_nodes.index(node)) - if node_key not in self._network_nodes: - node_to_depth[node] = depth - self._network_nodes.add(node_key) - self._nodes_by_depth[depth].append(node) - - # Insert layers and update other layer attrs. - layer_set = set(self._self_tracked_trackables) - deferred_layers = [] - for layer in layers: - if layer not in layer_set: - self._self_tracked_trackables.append(layer) - deferred_layers.append(layer) - self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) - layer_set.add(layer) - self._handle_deferred_layer_dependencies(deferred_layers) - - self._compute_tensor_usage_count() - - def _compute_tensor_usage_count(self): - """Compute the #. of tensor usages for all the output tensors of layers. - - The computed tensor usage count is saved as `self._tensor_usage_count`. This - is later used for saving memory in eager computation by releasing - no-longer-needed tensors as early as possible. + Returns: + A tuple like (ordered_nodes, layer_to_first_traversal_index) + ordered_nodes: list of nodes appearing in the keras history, topologically + sorted from original inputs to the `outputs`. + (If outputs have different sets of ancestors, the inputs to one output + may appear after a different output). + layer_to_first_traversal_index: + A dict mapping layer to the traversal index in the DFS where it is + seen. Note: if a layer is shared by several nodes, the dict will only + store the index corresponding to the *first* time the layer is seen.
""" - tensor_usage_count = collections.Counter() - available_tensors = set(str(id(tensor)) for tensor in self.inputs) - - depth_keys = list(self._nodes_by_depth.keys()) - depth_keys.sort(reverse=True) - depth_keys = depth_keys[1:] - - for depth in depth_keys: - for node in self._nodes_by_depth[depth]: - input_tensors = { - str(id(tensor)) for tensor in tf.nest.flatten(node.keras_inputs) - } - if input_tensors.issubset(available_tensors): - for tensor in tf.nest.flatten(node.keras_inputs): - tensor_usage_count[str(id(tensor))] += 1 - - for output_tensor in tf.nest.flatten(node.outputs): - available_tensors.add(str(id(output_tensor))) - - for tensor in self.outputs: - tensor_usage_count[str(id(tensor))] += 1 - - self._tensor_usage_count = tensor_usage_count - - def _assert_weights_created(self): - # Override the implementation in Model. - # The Functional model should always have weight created already. - return - - def _graph_network_add_loss(self, symbolic_loss): - new_nodes, new_layers = _map_subgraph_network(self.inputs, [symbolic_loss]) - # Losses must be keyed on inputs no matter what in order to be supported in - # DistributionStrategy. - add_loss_layer = base_layer.AddLoss( - unconditional=False, dtype=symbolic_loss.dtype) - add_loss_layer(symbolic_loss) - new_nodes.extend(add_loss_layer.inbound_nodes) - new_layers.append(add_loss_layer) - self._insert_layers(new_layers, new_nodes) - - def _graph_network_add_metric(self, value, aggregation, name): - new_nodes, new_layers = _map_subgraph_network(self.inputs, [value]) - add_metric_layer = base_layer.AddMetric( - aggregation, name, dtype=value.dtype) - add_metric_layer(value) - new_nodes.extend(add_metric_layer.inbound_nodes) - new_layers.append(add_metric_layer) - self._insert_layers(new_layers, new_nodes) - - @property - def _trackable_saved_model_saver(self): - return network_serialization.NetworkSavedModelSaver(self) - - def _get_save_spec(self, dynamic_batch=True, inputs_only=True): - if getattr(self, '_has_explicit_input_shape', True): - # Functional models and Sequential models that have an explicit input - # shape should use the batch size set by the input layer. - dynamic_batch = False - return super()._get_save_spec(dynamic_batch, inputs_only) - - -def _make_node_key(layer_name, node_index): - return layer_name + '_ib-' + str(node_index) - - -def _map_graph_network(inputs, outputs): - """Validates a network's topology and gather its layers and nodes. - - Args: - inputs: List of input tensors. - outputs: List of outputs tensors. - - Returns: - A tuple `(nodes, nodes_by_depth, layers, layers_by_depth)`. - - nodes: list of Node instances. - - nodes_by_depth: dict mapping ints (depth) to lists of node instances. - - layers: list of Layer instances. - - layers_by_depth: dict mapping ints (depth) to lists of layer instances. - - Raises: - ValueError: In case the network is not valid (e.g. disconnected graph). - """ - # "depth" is number of layers between output Node and the Node. - # Nodes are ordered from inputs -> outputs. - nodes_in_decreasing_depth, layer_indices = _build_map(outputs) - network_nodes = { - _make_node_key(node.layer.name, node.layer._inbound_nodes.index(node)) - for node in nodes_in_decreasing_depth - } - - nodes_depths = {} # dict {node: depth value} - layers_depths = {} # dict {layer: depth value} - - for node in reversed(nodes_in_decreasing_depth): - # If the depth is not set, the node has no outbound nodes (depth 0). 
- depth = nodes_depths.setdefault(node, 0) - - # Update the depth of the corresponding layer - previous_depth = layers_depths.get(node.layer, 0) - # If we've seen this layer before at a higher depth, - # we should use that depth instead of the node depth. - # This is necessary for shared layers that have inputs at different - # depth levels in the graph. - depth = max(depth, previous_depth) - layers_depths[node.layer] = depth - nodes_depths[node] = depth - - # Update the depth of inbound nodes. - # The "depth" of a node is the max of the depths - # of all nodes it is connected to + 1. - for node_dep in node.parent_nodes: - previous_depth = nodes_depths.get(node_dep, 0) - nodes_depths[node_dep] = max(depth + 1, previous_depth) - - # Handle inputs that are not connected to outputs. - # We do not error out here because the inputs may be used to compute losses - # and metrics. - for input_t in inputs: - input_layer = input_t._keras_history[0] - if input_layer not in layers_depths: - layers_depths[input_layer] = 0 - layer_indices[input_layer] = -1 - nodes_depths[input_layer._inbound_nodes[0]] = 0 - network_nodes.add(_make_node_key(input_layer.name, 0)) - - # Build a dict {depth: list of nodes with this depth} - nodes_by_depth = collections.defaultdict(list) - for node, depth in nodes_depths.items(): - nodes_by_depth[depth].append(node) - - # Build a dict {depth: list of layers with this depth} - layers_by_depth = collections.defaultdict(list) - for layer, depth in layers_depths.items(): - layers_by_depth[depth].append(layer) - - # Get sorted list of layer depths. - depth_keys = list(layers_by_depth.keys()) - depth_keys.sort(reverse=True) - - # Set self.layers ordered by depth. - layers = [] - for depth in depth_keys: - layers_for_depth = layers_by_depth[depth] - # Network.layers needs to have a deterministic order: - # here we order them by traversal order. - layers_for_depth.sort(key=lambda x: layer_indices[x]) - layers.extend(layers_for_depth) - - # Get sorted list of node depths. - depth_keys = list(nodes_by_depth.keys()) - depth_keys.sort(reverse=True) - - # Check that all tensors required are computable. - # computable_tensors: all tensors in the graph - # that can be computed from the inputs provided. - computable_tensors = set() - for x in inputs: - computable_tensors.add(id(x)) - - layers_with_complete_input = [] # To provide a better error msg. - for depth in depth_keys: - for node in nodes_by_depth[depth]: - layer = node.layer - if layer and not node.is_input: - for x in tf.nest.flatten(node.keras_inputs): - if id(x) not in computable_tensors: - raise ValueError( - f'Graph disconnected: cannot obtain value for tensor {x} ' - f'at layer "{layer.name}". The following previous layers ' - f'were accessed without issue: {layers_with_complete_input}') - for x in tf.nest.flatten(node.outputs): - computable_tensors.add(id(x)) - layers_with_complete_input.append(layer.name) - - # Ensure name unicity, which will be crucial for serialization - # (since serialized nodes refer to layers by their name). - all_names = [layer.name for layer in layers] - for name in all_names: - if all_names.count(name) != 1: - raise ValueError( - f'The name "{name}" is used {all_names.count(name)} ' - 'times in the model. All layer names should be unique.') - return network_nodes, nodes_by_depth, layers, layers_by_depth - - -def _build_map(outputs): - """This method topologically sorts nodes in order from inputs to outputs. 
- - It uses a depth-first search to topologically sort nodes that appear in the - _keras_history connectivity metadata of `outputs`. - - Args: - outputs: the output tensors whose _keras_history metadata should be walked. - This may be an arbitrary nested structure. - - Returns: - A tuple like (ordered_nodes, layer_to_first_traversal_index) - ordered_nodes: list of nodes appearing in the keras history, topologically - sorted from original inputs to the `outputs`. - (If outputs have different sets of ancestors, the inputs to one output - may appear after a different output). - layer_to_first_traversal_index: - A dict mapping layer to the traversal index in the DFS where it is - seen. Note: if a layer is shared by several nodes, the dict will only - store the index corresponding to the *first* time the layer seen. - """ - finished_nodes = set() - nodes_in_progress = set() - nodes_in_decreasing_depth = [] # nodes from inputs -> outputs. - layer_indices = {} # layer -> in traversal order. - for output in tf.nest.flatten(outputs): - _build_map_helper(output, finished_nodes, nodes_in_progress, - nodes_in_decreasing_depth, layer_indices) - return nodes_in_decreasing_depth, layer_indices - - -def _build_map_helper(tensor, finished_nodes, nodes_in_progress, - nodes_in_decreasing_depth, layer_indices): - """Recursive helper for `_build_map`.""" - layer, node_index, _ = tensor._keras_history # pylint: disable=protected-access - node = layer._inbound_nodes[node_index] # pylint: disable=protected-access - - # Don't repeat work for shared subgraphs - if node in finished_nodes: - return - - # Prevent cycles. - if node in nodes_in_progress: - raise ValueError(f'Tensor {tensor} from layer "{layer.name}" ' - 'is part of a cycle.') - - # Store the traversal order for layer sorting. - if layer not in layer_indices: - layer_indices[layer] = len(layer_indices) - - # Propagate to all previous tensors connected to this node. - nodes_in_progress.add(node) - if not node.is_input: - for tensor in node.keras_inputs: - _build_map_helper(tensor, finished_nodes, nodes_in_progress, - nodes_in_decreasing_depth, layer_indices) - - finished_nodes.add(node) - nodes_in_progress.remove(node) - nodes_in_decreasing_depth.append(node) + finished_nodes = set() + nodes_in_progress = set() + nodes_in_decreasing_depth = [] # nodes from inputs -> outputs. + layer_indices = {} # layer -> in traversal order. + for output in tf.nest.flatten(outputs): + _build_map_helper( + output, + finished_nodes, + nodes_in_progress, + nodes_in_decreasing_depth, + layer_indices, + ) + return nodes_in_decreasing_depth, layer_indices + + +def _build_map_helper( + tensor, + finished_nodes, + nodes_in_progress, + nodes_in_decreasing_depth, + layer_indices, +): + """Recursive helper for `_build_map`.""" + ( + layer, + node_index, + _, + ) = tensor._keras_history + node = layer._inbound_nodes[node_index] + + # Don't repeat work for shared subgraphs + if node in finished_nodes: + return + + # Prevent cycles. + if node in nodes_in_progress: + raise ValueError( + f'Tensor {tensor} from layer "{layer.name}" is part of a cycle.' + ) + + # Store the traversal order for layer sorting. + if layer not in layer_indices: + layer_indices[layer] = len(layer_indices) + + # Propagate to all previous tensors connected to this node. 
+ nodes_in_progress.add(node) + if not node.is_input: + for tensor in node.keras_inputs: + _build_map_helper( + tensor, + finished_nodes, + nodes_in_progress, + nodes_in_decreasing_depth, + layer_indices, + ) + + finished_nodes.add(node) + nodes_in_progress.remove(node) + nodes_in_decreasing_depth.append(node) def _map_subgraph_network(inputs, outputs): - """Returns the nodes and layers in the topology from `inputs` to `outputs`. + """Returns the nodes and layers in the topology from `inputs` to `outputs`. - Args: - inputs: List of input tensors. - outputs: List of output tensors. + Args: + inputs: List of input tensors. + outputs: List of output tensors. - Returns: - A tuple of List{Node] and List[Layer]. - """ - if not tf.compat.v1.executing_eagerly_outside_functions(): - base_layer_utils.create_keras_history(outputs) - # Keep only nodes and layers in the topology between inputs and outputs. - _, nodes_by_depth, layers, _ = _map_graph_network(inputs, outputs) - return tf.nest.flatten([nodes for nodes in nodes_by_depth.values()]), layers + Returns: + A tuple of List[Node] and List[Layer]. + """ + if not tf.compat.v1.executing_eagerly_outside_functions(): + base_layer_utils.create_keras_history(outputs) + # Keep only nodes and layers in the topology between inputs and outputs. + _, nodes_by_depth, layers, _ = _map_graph_network(inputs, outputs) + return tf.nest.flatten([nodes for nodes in nodes_by_depth.values()]), layers def _should_skip_first_node(layer): - """Returns True if the first layer node should not be saved or loaded.""" - # Networks that are constructed with an Input layer/shape start with a - # pre-existing node linking their input to output. This node is excluded from - # the network config. - if layer._self_tracked_trackables: - return (isinstance(layer, Functional) and + """Returns True if the first layer node should not be saved or loaded.""" + # Networks that are constructed with an Input layer/shape start with a + # pre-existing node linking their input to output. This node is excluded + # from the network config. + if not hasattr(layer, "_self_tracked_trackables"): + # Special case for serialization of Functional models without + # defined input shape argument. + return isinstance(layer, Functional) + if layer._self_tracked_trackables: + return ( + isinstance(layer, Functional) # Filter out Sequential models without an input shape. - isinstance(layer._self_tracked_trackables[0], - input_layer_module.InputLayer)) - else: - return isinstance(layer, Functional) + and isinstance( + layer._self_tracked_trackables[0], input_layer_module.InputLayer + ) + ) + else: + return isinstance(layer, Functional) def connect_ancillary_layers(model, created_layers): - """Adds layers that are not connected to the outputs to the model.""" - # Layers not connected to outputs, such as those added in `add_loss`. - ancillary_layers = [ - layer for layer in created_layers.values() if layer not in model.layers - ] - if ancillary_layers: - relevant_nodes = tf.nest.flatten([ - layer.inbound_nodes[1:] - if _should_skip_first_node(layer) else layer.inbound_nodes - for layer in created_layers.values() - ]) - model._insert_layers(ancillary_layers, relevant_nodes) - return model + """Adds layers that are not connected to the outputs to the model.""" + # Layers not connected to outputs, such as those added in `add_loss`.
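`_build_map_helper` is a textbook post-order depth-first search with cycle detection. A self-contained plain-Python rendering over a hypothetical `parents` mapping (not the real Node objects):

def topo_sort(outputs, parents):
    finished, in_progress, ordered = set(), set(), []

    def visit(node):
        if node in finished:
            return  # shared subgraph already handled
        if node in in_progress:
            raise ValueError(f"{node} is part of a cycle.")
        in_progress.add(node)
        for parent in parents.get(node, []):
            visit(parent)
        in_progress.remove(node)
        finished.add(node)
        ordered.append(node)  # post-order: parents land before the node

    for output in outputs:
        visit(output)
    return ordered

print(topo_sort(["out"], {"out": ["h"], "h": ["x"]}))  # ['x', 'h', 'out']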
+ ancillary_layers = [ + layer for layer in created_layers.values() if layer not in model.layers + ] + if ancillary_layers: + relevant_nodes = tf.nest.flatten( + [ + layer.inbound_nodes[1:] + if _should_skip_first_node(layer) + else layer.inbound_nodes + for layer in created_layers.values() + ] + ) + model._insert_layers(ancillary_layers, relevant_nodes) + return model def reconstruct_from_config(config, custom_objects=None, created_layers=None): - """Reconstructs graph from config object. - - Args: - config: Dictionary returned from Network.get_config() - custom_objects: Optional dictionary mapping names (strings) to custom - classes or functions to be considered during deserialization. - created_layers: Optional dictionary mapping names to Layer objects. Any - layer not in this dictionary will be created and added to the dict. - This function will add new nodes to all layers (excluding InputLayers), - instead of re-using pre-existing nodes in the layers. - - Returns: - Tuple of (input tensors, output tensors, dictionary of created layers) - """ - # Layer instances created during the graph reconstruction process. - created_layers = created_layers or collections.OrderedDict() - - # Maps input data (tuple of inbound layer name, node index) from the config - # to node indices in the newly generated model. The node indices may be - # different if the layers have already been called previously. - node_index_map = {} - node_count_by_layer = {} - - # Dictionary mapping layer instances to - # node data that specifies a layer call. - # It acts as a queue that maintains any unprocessed - # layer call until it becomes possible to process it - # (i.e. until the input tensors to the call all exist). - unprocessed_nodes = collections.defaultdict(list) - - def get_node_index(layer, config_node_index): - """Returns node index in layer (might differ from config_node_index).""" - if isinstance(layer, input_layer_module.InputLayer): - return 0 - return node_index_map.get((layer.name, config_node_index), None) - - def _deserialize_keras_tensors(kwargs, layer_map): - """Deserializes Keras Tensors passed to `call`..""" - - def _deserialize_keras_tensor(t): - """Deserializes a single Keras Tensor passed to `call`.""" - if isinstance(t, tf_utils.ListWrapper): - t = t.as_list() - layer_name = t[0] - node_index = t[1] - tensor_index = t[2] - - layer = layer_map[layer_name] - new_node_index = get_node_index(layer, node_index) - if new_node_index is None: - # The inbound node may not have been processed yet, - # (This can happen e.g. if it depends on a different set - # of inputs than those that have been processed already). - # raise an IndexError so that the current node puts itself - # back on the unprocessed queue. - # Caution: This may lead to infinite loops for malformed - # network configurations! (or when there is a bug in - # the network config loading code). - raise IndexError - node = layer._inbound_nodes[new_node_index] - return tf.nest.flatten(node.outputs)[tensor_index] - return t - - kwargs = tf_utils.convert_inner_node_data(kwargs, wrap=True) - return tf.nest.map_structure(_deserialize_keras_tensor, kwargs) - - def process_node(layer, node_data): - """Deserialize a node. + """Reconstructs graph from config object. Args: - layer: layer instance. - node_data: Nested structure of `ListWrapper`. + config: Dictionary returned from Network.get_config() + custom_objects: Optional dictionary mapping names (strings) to custom + classes or functions to be considered during deserialization. 
+ created_layers: Optional dictionary mapping names to Layer objects. Any + layer not in this dictionary will be created and added to the dict. + This function will add new nodes to all layers (excluding InputLayers), + instead of re-using pre-existing nodes in the layers. Returns: - Whether the node was processed (i.e. the layer was called on the inputs - specified by the node data) - - Raises: - ValueError: In case of improperly formatted `node_data`. + Tuple of (input tensors, output tensors, dictionary of created layers) """ + # Layer instances created during the graph reconstruction process. + created_layers = created_layers or collections.OrderedDict() + + # Maps input data (tuple of inbound layer name, node index) from the config + # to node indices in the newly generated model. The node indices may be + # different if the layers have already been called previously. + node_index_map = {} + node_count_by_layer = {} + + # Dictionary mapping layer instances to + # node data that specifies a layer call. + # It acts as a queue that maintains any unprocessed + # layer call until it becomes possible to process it + # (i.e. until the input tensors to the call all exist). + unprocessed_nodes = collections.defaultdict(list) + + def get_node_index(layer, config_node_index): + """Returns node index in layer (might differ from config_node_index).""" + if isinstance(layer, input_layer_module.InputLayer): + return 0 + return node_index_map.get((layer.name, config_node_index), None) + + def _deserialize_keras_tensors(kwargs, layer_map): + """Deserializes Keras Tensors passed to `call`.""" + + def _deserialize_keras_tensor(t): + """Deserializes a single Keras Tensor passed to `call`.""" + if isinstance(t, tf_utils.ListWrapper): + t = t.as_list() + layer_name = t[0] + node_index = t[1] + tensor_index = t[2] + + layer = layer_map[layer_name] + new_node_index = get_node_index(layer, node_index) + if new_node_index is None: + # The inbound node may not have been processed yet, + # (This can happen e.g. if it depends on a different set + # of inputs than those that have been processed already). + # raise an IndexError so that the current node puts itself + # back on the unprocessed queue. + # Caution: This may lead to infinite loops for malformed + # network configurations! (or when there is a bug in + # the network config loading code). + raise IndexError + node = layer._inbound_nodes[new_node_index] + return tf.nest.flatten(node.outputs)[tensor_index] + return t + + kwargs = tf_utils.convert_inner_node_data(kwargs, wrap=True) + return tf.nest.map_structure(_deserialize_keras_tensor, kwargs) + + def process_node(layer, node_data): + """Deserialize a node. + + Args: + layer: layer instance. + node_data: Nested structure of `ListWrapper`. + + Returns: + Whether the node was processed (i.e. the layer was called on the + inputs specified by the node data) + + Raises: + ValueError: In case of improperly formatted `node_data`.
+ """ + input_tensors = [] + for input_data in tf.nest.flatten(node_data): + input_data = input_data.as_list() + if len(input_data) == 3: + kwargs = {} + elif len(input_data) == 4: + kwargs = input_data[3] + try: + kwargs = _deserialize_keras_tensors(kwargs, created_layers) + except IndexError: + # Happens if keras tensors in kwargs are still unprocessed + return False + else: + raise ValueError("Improperly formatted model config.") + + if input_data[0] != node_module._CONSTANT_VALUE: + inbound_layer_name = input_data[0] + inbound_node_index = input_data[1] + inbound_tensor_index = input_data[2] + inbound_layer = created_layers[inbound_layer_name] + inbound_node_index = get_node_index( + inbound_layer, inbound_node_index + ) + + if inbound_node_index is None: + return False + inbound_node = inbound_layer._inbound_nodes[inbound_node_index] + input_tensors.append( + tf.nest.flatten(inbound_node.outputs)[inbound_tensor_index] + ) + else: + # We received a constant w/ no Keras history attached, + # which means it is a constant tensor input. + # Input is a constant value. + # Format = [_CONSTANT_VALUE, -1, const_val, kwargs] + assert input_data[1] == -1 + assert len(input_data) >= 3 + const_val = input_data[2] + if ( + isinstance(const_val, tuple) + and len(const_val) == 2 + and const_val[0] == node_module._COMPOSITE_TYPE + ): + # It is a composite tensor. + input_tensors.append(json_utils.decode(const_val[1])) + else: + input_tensors.append(const_val) + input_tensors = tf.nest.pack_sequence_as(node_data, input_tensors) + # Call layer on its inputs, thus creating the node + # and building the layer if needed. + if input_tensors is not None: + if ( + not hasattr(layer, "_preserve_input_structure_in_config") + or not layer._preserve_input_structure_in_config + ): + input_tensors = base_layer_utils.unnest_if_single_tensor( + input_tensors + ) + output_tensors = layer(input_tensors, **kwargs) + + # Update node index map. + output_index = tf.nest.flatten(output_tensors)[ + 0 + ]._keras_history.node_index + node_index_map[ + (layer.name, node_count_by_layer[layer]) + ] = output_index + node_count_by_layer[layer] += 1 + return True + + def process_layer(layer_data): + """Deserializes a layer, then call it on appropriate inputs. + + Args: + layer_data: layer config dict. + + Raises: + ValueError: In case of improperly formatted `layer_data` dict. + """ + layer_name = layer_data["name"] + + if layer_name in created_layers: + layer = created_layers[layer_name] + else: + # Instantiate layer. + from keras.layers import deserialize as deserialize_layer + + layer = deserialize_layer(layer_data, custom_objects=custom_objects) + created_layers[layer_name] = layer + + node_count_by_layer[layer] = int(_should_skip_first_node(layer)) + + # Gather layer inputs and convert to `ListWrapper` objects. + inbound_nodes_data = layer_data["inbound_nodes"] + inbound_nodes_data = tf_utils.convert_inner_node_data( + inbound_nodes_data, wrap=True + ) + for node_data in inbound_nodes_data: + # We don't process nodes (i.e. make layer calls) + # on the fly because the inbound node may not yet exist, + # in case of layer shared at different topological depths + # (e.g. a model such as A(B(A(B(x))))) + unprocessed_nodes[layer].append(node_data) + + # First, we create all layers and enqueue nodes to be processed + for layer_data in config["layers"]: + process_layer(layer_data) + # Then we process nodes in order of layer depth. 
+ # Nodes that cannot yet be processed (if the inbound node + # does not yet exist) are re-enqueued, and the process + # is repeated until all nodes are processed. + while unprocessed_nodes: + for layer_data in config["layers"]: + layer = created_layers[layer_data["name"]] + if layer in unprocessed_nodes: + layer_nodes = unprocessed_nodes.pop(layer) + while layer_nodes: + node_data = layer_nodes[0] + if process_node(layer, node_data): + layer_nodes.pop(0) + else: + # If a node can't be processed, stop processing the + # nodes of the current layer to maintain node ordering. + unprocessed_nodes[layer] = layer_nodes + break + input_tensors = [] - for input_data in tf.nest.flatten(node_data): - input_data = input_data.as_list() - if len(input_data) == 3: - kwargs = {} - elif len(input_data) == 4: - kwargs = input_data[3] - try: - kwargs = _deserialize_keras_tensors(kwargs, created_layers) - except IndexError: - # Happens if keras tensors in kwargs are still unprocessed - return False - else: - raise ValueError('Improperly formatted model config.') - - if input_data[0] != node_module._CONSTANT_VALUE: - inbound_layer_name = input_data[0] - inbound_node_index = input_data[1] - inbound_tensor_index = input_data[2] - inbound_layer = created_layers[inbound_layer_name] - inbound_node_index = get_node_index(inbound_layer, inbound_node_index) - - if inbound_node_index is None: - return False - inbound_node = inbound_layer._inbound_nodes[inbound_node_index] + output_tensors = [] + + input_layers = tf_utils.convert_inner_node_data( + config["input_layers"], wrap=True + ) + for layer_data in tf.nest.flatten(input_layers): + layer_name, node_index, tensor_index = layer_data.as_list() + assert layer_name in created_layers + layer = created_layers[layer_name] + node_index = get_node_index(layer, node_index) + layer_output_tensors = layer._inbound_nodes[node_index].output_tensors input_tensors.append( - tf.nest.flatten(inbound_node.outputs)[inbound_tensor_index]) - else: - # We received a constant w/ no Keras history attached, - # which means it is a constant tensor input. - # Input is a constant value. - # Format = [_CONSTANT_VALUE, -1, const_val, kwargs] - assert input_data[1] == -1 - assert len(input_data) >= 3 - const_val = input_data[2] - if (isinstance(const_val, tuple) and - len(const_val) == 2 and - const_val[0] == node_module._COMPOSITE_TYPE): - # It is a composite tensor. - input_tensors.append(json_utils.decode(const_val[1])) - else: - input_tensors.append(const_val) - input_tensors = tf.nest.pack_sequence_as(node_data, input_tensors) - # Call layer on its inputs, thus creating the node - # and building the layer if needed. - if input_tensors is not None: - if not layer._preserve_input_structure_in_config: - input_tensors = ( - base_layer_utils.unnest_if_single_tensor(input_tensors)) - output_tensors = layer(input_tensors, **kwargs) - - # Update node index map. - output_index = (tf.nest.flatten(output_tensors)[0]. - _keras_history.node_index) - node_index_map[(layer.name, node_count_by_layer[layer])] = output_index - node_count_by_layer[layer] += 1 - return True - - def process_layer(layer_data): - """Deserializes a layer, then call it on appropriate inputs. 
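The `[layer_name, node_index, tensor_index, kwargs]` entries unpacked by `process_node` above are exactly what a functional model's config stores under `inbound_nodes`, and the deferred queue exists because an entry may reference a node that has not been replayed yet. The format can be inspected through the public API (output indicative):

    import json
    import tensorflow as tf
    from tensorflow import keras

    inputs = keras.Input(shape=(4,), name="in")
    x = keras.layers.Dense(3, name="hidden")(inputs)
    model = keras.Model(inputs, keras.layers.Dense(1, name="out")(x))

    config = model.get_config()
    # Each entry is [inbound_layer_name, node_index, tensor_index, kwargs];
    # reconstruct_from_config() replays these entries as layer calls.
    print(json.dumps(config["layers"][2]["inbound_nodes"]))
    # -> [[["hidden", 0, 0, {}]]]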
+ tf.nest.flatten(layer_output_tensors)[tensor_index] + ) + + output_layers = tf_utils.convert_inner_node_data( + config["output_layers"], wrap=True + ) + for layer_data in tf.nest.flatten(output_layers): + layer_name, node_index, tensor_index = layer_data.as_list() + assert layer_name in created_layers + layer = created_layers[layer_name] + node_index = get_node_index(layer, node_index) + layer_output_tensors = layer._inbound_nodes[node_index].output_tensors + output_tensors.append( + tf.nest.flatten(layer_output_tensors)[tensor_index] + ) + + input_tensors = tf.nest.pack_sequence_as(input_layers, input_tensors) + output_tensors = tf.nest.pack_sequence_as(output_layers, output_tensors) + return input_tensors, output_tensors, created_layers + + +def get_network_config(network, serialize_layer_fn=None, config=None): + """Build the config, which consists of the node graph and serialized layers. Args: - layer_data: layer config dict. + network: A Network object. + serialize_layer_fn: Function used to serialize layers. + config: A dict to append more config entries into. If None, start with a + new dict for the config. - Raises: - ValueError: In case of improperly formatted `layer_data` dict. + Returns: + Config dictionary. """ - layer_name = layer_data['name'] - - if layer_name in created_layers: - layer = created_layers[layer_name] - else: - # Instantiate layer. - from keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top - - layer = deserialize_layer(layer_data, custom_objects=custom_objects) - created_layers[layer_name] = layer - - node_count_by_layer[layer] = int(_should_skip_first_node(layer)) - - # Gather layer inputs and convert to `ListWrapper` objects. - inbound_nodes_data = layer_data['inbound_nodes'] - inbound_nodes_data = tf_utils.convert_inner_node_data( - inbound_nodes_data, wrap=True) - for node_data in inbound_nodes_data: - # We don't process nodes (i.e. make layer calls) - # on the fly because the inbound node may not yet exist, - # in case of layer shared at different topological depths - # (e.g. a model such as A(B(A(B(x))))) - unprocessed_nodes[layer].append(node_data) - - # First, we create all layers and enqueue nodes to be processed - for layer_data in config['layers']: - process_layer(layer_data) - # Then we process nodes in order of layer depth. - # Nodes that cannot yet be processed (if the inbound node - # does not yet exist) are re-enqueued, and the process - # is repeated until all nodes are processed. - while unprocessed_nodes: - for layer_data in config['layers']: - layer = created_layers[layer_data['name']] - if layer in unprocessed_nodes: - layer_nodes = unprocessed_nodes.pop(layer) - while layer_nodes: - node_data = layer_nodes[0] - if process_node(layer, node_data): - layer_nodes.pop(0) - else: - # If a node can't be processed, stop processing the nodes of - # the current layer to maintain node ordering. 
- unprocessed_nodes[layer] = layer_nodes - break - - input_tensors = [] - output_tensors = [] - - input_layers = tf_utils.convert_inner_node_data( - config['input_layers'], wrap=True) - for layer_data in tf.nest.flatten(input_layers): - layer_name, node_index, tensor_index = layer_data.as_list() - assert layer_name in created_layers - layer = created_layers[layer_name] - node_index = get_node_index(layer, node_index) - layer_output_tensors = layer._inbound_nodes[node_index].output_tensors - input_tensors.append(tf.nest.flatten(layer_output_tensors)[tensor_index]) - - output_layers = tf_utils.convert_inner_node_data( - config['output_layers'], wrap=True) - for layer_data in tf.nest.flatten(output_layers): - layer_name, node_index, tensor_index = layer_data.as_list() - assert layer_name in created_layers - layer = created_layers[layer_name] - node_index = get_node_index(layer, node_index) - layer_output_tensors = layer._inbound_nodes[node_index].output_tensors - output_tensors.append(tf.nest.flatten(layer_output_tensors)[tensor_index]) - - input_tensors = tf.nest.pack_sequence_as(input_layers, input_tensors) - output_tensors = tf.nest.pack_sequence_as(output_layers, output_tensors) - return input_tensors, output_tensors, created_layers - - -def get_network_config(network, serialize_layer_fn=None): - """Builds the config, which consists of the node graph and serialized layers. - - Args: - network: A Network object. - serialize_layer_fn: Function used to serialize layers. - - Returns: - Config dictionary. - """ - serialize_layer_fn = ( - serialize_layer_fn or generic_utils.serialize_keras_object) - config = { - 'name': network.name, - } - node_conversion_map = {} - for layer in network.layers: - kept_nodes = 1 if _should_skip_first_node(layer) else 0 - for original_node_index, node in enumerate(layer._inbound_nodes): - node_key = _make_node_key(layer.name, original_node_index) - if node_key in network._network_nodes: - node_conversion_map[node_key] = kept_nodes - kept_nodes += 1 - layer_configs = [] - - with generic_utils.SharedObjectSavingScope(): - for layer in network.layers: # From the earliest layers on. - filtered_inbound_nodes = [] - for original_node_index, node in enumerate(layer._inbound_nodes): - node_key = _make_node_key(layer.name, original_node_index) - if node_key in network._network_nodes and not node.is_input: - # The node is relevant to the model: - # add to filtered_inbound_nodes. - node_data = node.serialize(_make_node_key, node_conversion_map) - filtered_inbound_nodes.append(node_data) - - layer_config = serialize_layer_fn(layer) - layer_config['name'] = layer.name - layer_config['inbound_nodes'] = filtered_inbound_nodes - layer_configs.append(layer_config) - config['layers'] = layer_configs - - # Gather info about inputs and outputs. - model_inputs = [] - for i in range(len(network._input_layers)): - layer, node_index, tensor_index = network._input_coordinates[i] - node_key = _make_node_key(layer.name, node_index) - if node_key not in network._network_nodes: - continue - new_node_index = node_conversion_map[node_key] - model_inputs.append( - tf_utils.ListWrapper([layer.name, new_node_index, tensor_index])) - model_inputs = tf.nest.pack_sequence_as(network._nested_inputs, model_inputs) - # Preserve external Keras compat for Models with single input. 
- if not tf.nest.is_nested(model_inputs): - model_inputs = [model_inputs] - model_inputs = tf_utils.convert_inner_node_data(model_inputs) - config['input_layers'] = model_inputs - - model_outputs = [] - for i in range(len(network._output_layers)): - layer, node_index, tensor_index = network._output_coordinates[i] - node_key = _make_node_key(layer.name, node_index) - if node_key not in network._network_nodes: - continue - new_node_index = node_conversion_map[node_key] - model_outputs.append( - tf_utils.ListWrapper([layer.name, new_node_index, tensor_index])) - model_outputs = tf.nest.pack_sequence_as(network._nested_outputs, model_outputs) - # Preserve external Keras compat for Models with single output. - if not tf.nest.is_nested(model_outputs): - model_outputs = [model_outputs] - model_outputs = tf_utils.convert_inner_node_data(model_outputs) - config['output_layers'] = model_outputs - return config + config = config or {} + serialize_obj_fn = serialization_lib.serialize_keras_object + set_layers_legacy = False + # To be removed after full affected g3 user migration to Keras V3 Saving. + if getattr(network, "use_legacy_config", False): + serialize_obj_fn = serialization.serialize_keras_object + set_layers_legacy = True + serialize_layer_fn = serialize_layer_fn or serialize_obj_fn + config["name"] = network.name + node_conversion_map = {} + for layer in network.layers: + kept_nodes = 1 if _should_skip_first_node(layer) else 0 + for original_node_index, node in enumerate(layer._inbound_nodes): + node_key = _make_node_key(layer.name, original_node_index) + if node_key in network._network_nodes: + node_conversion_map[node_key] = kept_nodes + kept_nodes += 1 + layer_configs = [] + + with serialization.SharedObjectSavingScope(): + for layer in network.layers: # From the earliest layers on. + filtered_inbound_nodes = [] + for original_node_index, node in enumerate(layer._inbound_nodes): + node_key = _make_node_key(layer.name, original_node_index) + if node_key in network._network_nodes and not node.is_input: + # The node is relevant to the model: + # add to filtered_inbound_nodes. + node_data = node.serialize( + _make_node_key, node_conversion_map + ) + filtered_inbound_nodes.append(node_data) + + if isinstance(layer, Functional) and set_layers_legacy: + layer.use_legacy_config = True + layer_config = serialize_layer_fn(layer) + layer_config["name"] = layer.name + layer_config["inbound_nodes"] = filtered_inbound_nodes + layer_configs.append(layer_config) + config["layers"] = layer_configs + + # Gather info about inputs and outputs. + model_inputs = [] + for i in range(len(network._input_layers)): + layer, node_index, tensor_index = network._input_coordinates[i] + node_key = _make_node_key(layer.name, node_index) + if node_key not in network._network_nodes: + continue + new_node_index = node_conversion_map[node_key] + model_inputs.append( + tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]) + ) + model_inputs = tf.nest.pack_sequence_as( + network._nested_inputs, model_inputs + ) + # Preserve external Keras compat for Models with single input. 
+ if not tf.nest.is_nested(model_inputs): + model_inputs = [model_inputs] + model_inputs = tf_utils.convert_inner_node_data(model_inputs) + config["input_layers"] = model_inputs + + model_outputs = [] + for i in range(len(network._output_layers)): + layer, node_index, tensor_index = network._output_coordinates[i] + node_key = _make_node_key(layer.name, node_index) + if node_key not in network._network_nodes: + continue + new_node_index = node_conversion_map[node_key] + model_outputs.append( + tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]) + ) + model_outputs = tf.nest.pack_sequence_as( + network._nested_outputs, model_outputs + ) + # Preserve external Keras compat for Models with single output. + if not tf.nest.is_nested(model_outputs): + model_outputs = [model_outputs] + model_outputs = tf_utils.convert_inner_node_data(model_outputs) + config["output_layers"] = model_outputs + return config def shape_with_no_batch_size(x): - if x.shape.rank is None: - return None - shape = x.shape.as_list() - if shape: - shape[0] = None - return shape + if x.shape.rank is None: + return None + shape = x.shape.as_list() + if shape: + shape[0] = None + return shape class ModuleWrapper(base_layer.Layer): - """Wrapper for `tf.Module`s to support the Functional and Sequential API.""" - - def __init__(self, module, method_name=None, **kwargs): - """Initializes the wrapper Layer for this module. - - Args: - module: The `tf.Module` instance to be wrapped. - method_name: (Optional) str. The name of the method to use as the forward - pass of the module. If not set, defaults to '__call__' if defined, or - 'call'. - **kwargs: Additional keywrod arguments. See `tf.keras.layers.Layer`. - - Raises: - ValueError: If `method` is not defined on `module`. - """ - super().__init__(**kwargs) - if method_name is None: - if hasattr(module, '__call__'): - method_name = '__call__' - elif hasattr(module, 'call'): - method_name = 'call' - if method_name is None or not hasattr(module, method_name): - raise ValueError('{} is not defined on object {}'.format( - method_name, module)) - - self._module = module - self._method_name = method_name - - # Check if module.__call__ has a `training` arg or accepts `**kwargs`. - method = getattr(module, method_name) - method_arg_spec = tf_inspect.getfullargspec(method) - self._call_spec.expects_training_arg = ('training' in method_arg_spec.args - or - method_arg_spec.varkw is not None) - self._call_spec.expects_mask_arg = ('mask' in method_arg_spec.args or - method_arg_spec.varkw is not None) - - def call(self, *args, **kwargs): - if 'training' in kwargs and not self._expects_training_arg: - kwargs.pop('training') - if 'mask' in kwargs and not self._expects_mask_arg: - kwargs.pop('mask') - return getattr(self._module, self._method_name)(*args, **kwargs) + """Wrapper for `tf.Module`s to support the Functional and Sequential API.""" + + def __init__(self, module, method_name=None, **kwargs): + """Initializes the wrapper Layer for this module. + + Args: + module: The `tf.Module` instance to be wrapped. + method_name: (Optional) str. The name of the method to use as the + forward pass of the module. If not set, becomes '__call__' if + defined, or 'call'. Defaults to `None`. + **kwargs: Additional keyword arguments. See `tf.keras.layers.Layer`. + + Raises: + ValueError: If `method` is not defined on `module`.
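`get_network_config` above and `reconstruct_from_config` earlier are the two halves of the functional-model round trip exposed publicly as `get_config`/`from_config`. A minimal round-trip check with the public API:

    import numpy as np
    import tensorflow as tf
    from tensorflow import keras

    inputs = keras.Input(shape=(4,))
    model = keras.Model(inputs, keras.layers.Dense(2)(inputs))

    # Serialize the node graph and layers, then rebuild and copy weights.
    clone = keras.Model.from_config(model.get_config())
    clone.set_weights(model.get_weights())

    x = np.ones((1, 4), dtype="float32")
    np.testing.assert_allclose(model(x).numpy(), clone(x).numpy())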
+ """ + super().__init__(**kwargs) + if method_name is None: + if hasattr(module, "__call__"): + method_name = "__call__" + elif hasattr(module, "call"): + method_name = "call" + if method_name is None or not hasattr(module, method_name): + raise ValueError(f"{method_name} is not defined on object {module}") + + self._module = module + self._method_name = method_name + + # Check if module.__call__ has a `training` arg or accepts `**kwargs`. + method = getattr(module, method_name) + method_arg_spec = tf_inspect.getfullargspec(method) + self._call_spec.expects_training_arg = ( + "training" in method_arg_spec.args + or method_arg_spec.varkw is not None + ) + self._call_spec.expects_mask_arg = ( + "mask" in method_arg_spec.args or method_arg_spec.varkw is not None + ) + + def call(self, *args, **kwargs): + if "training" in kwargs and not self._expects_training_arg: + kwargs.pop("training") + if "mask" in kwargs and not self._expects_mask_arg: + kwargs.pop("mask") + return getattr(self._module, self._method_name)(*args, **kwargs) + + +def has_functional_like_constructor(cls): + init_args = tf_inspect.getfullargspec(cls.__init__).args[1:] + functional_init_args = tf_inspect.getfullargspec(Functional.__init__).args[ + 1: + ] + if init_args == functional_init_args: + return True + return False diff --git a/keras/engine/functional_test.py b/keras/engine/functional_test.py index 6ae73b8948d0..302eae9d82bb 100644 --- a/keras/engine/functional_test.py +++ b/keras/engine/functional_test.py @@ -11,11 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#,============================================================================ +# ,============================================================================ """Tests for layer graphs construction & handling.""" import warnings +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import layers from keras import losses @@ -25,2562 +28,2697 @@ from keras.engine import input_layer as input_layer_lib from keras.engine import sequential from keras.engine import training as training_lib +from keras.saving import object_registration +from keras.saving.legacy import save from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import layer_utils from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf - - +# isort: off +from tensorflow.python.checkpoint.checkpoint import ( + Checkpoint, +) from tensorflow.python.framework import extension_type -from tensorflow.python.training.tracking.util import Checkpoint - class NetworkConstructionTest(test_combinations.TestCase): - - def test_default_model_name(self): - inputs = input_layer_lib.Input(shape=(1,)) - outputs = layers.Dense(1, activation='relu')(inputs) - model = training_lib.Model(inputs=inputs, outputs=outputs) - self.assertEqual(model.name, 'model') - - model_2 = training_lib.Model(inputs=inputs, outputs=outputs) - self.assertEqual(model_2.name, 'model_1') - - model_3 = training_lib.Model(inputs=inputs, outputs=outputs) - self.assertEqual(model_3.name, 'model_2') - - def test_get_updates(self): - - class MyLayer(layers.Layer): - - def build(self, input_shape): - self.a = self.add_weight('a', - (1, 1), - 'float32', - trainable=False) - self.b = self.add_weight('b', - (1, 1), - 'float32', - trainable=False) - self.add_update(tf.compat.v1.assign_add( - 
self.a, [[1.]], name='unconditional_update')) - self.built = True - - def call(self, inputs): - self.add_update( - tf.compat.v1.assign_add(self.b, inputs, name='conditional_update')) - return inputs + 1 - - with tf.Graph().as_default(): - x1 = input_layer_lib.Input(shape=(1,)) - layer = MyLayer() - _ = layer(x1) - - self.assertEqual(len(layer.updates), 2) - - x2 = input_layer_lib.Input(shape=(1,)) - y2 = layer(x2) - - self.assertEqual(len(layer.updates), 3) - - network = functional.Functional(x2, y2) - self.assertEqual(len(network.updates), 3) - - x3 = input_layer_lib.Input(shape=(1,)) - _ = layer(x3) - self.assertEqual(len(network.updates), 4) - - x4 = input_layer_lib.Input(shape=(1,)) - _ = network(x4) - self.assertEqual(len(network.updates), 5) - - network.add_update(tf.compat.v1.assign_add(layer.a, [[1]])) - self.assertEqual(len(network.updates), 6) - - network.add_update(tf.compat.v1.assign_add(layer.b, x4)) - self.assertEqual(len(network.updates), 7) - - @test_combinations.generate(test_combinations.combine(mode=['graph'])) - def test_get_updates_bn(self): - x1 = input_layer_lib.Input(shape=(1,)) - layer = layers.BatchNormalization() - _ = layer(x1) - - self.assertEqual(len(layer.updates), 2) - - def test_get_layer(self): - # create a simple network - x = input_layer_lib.Input(shape=(32,)) - dense_a = layers.Dense(4, name='dense_a') - dense_b = layers.Dense(2, name='dense_b') - y = dense_b(dense_a(x)) - network = functional.Functional(x, y, name='dense_network') - - # test various get_layer by index - self.assertEqual(network.get_layer(index=1), dense_a) - - # test invalid get_layer by index - with self.assertRaisesRegex( - ValueError, 'Was asked to retrieve layer at index ' + str(3) + - ' but model only has ' + str(len(network.layers)) + ' layers.'): - network.get_layer(index=3) - - # test that only one between name and index is requested - with self.assertRaisesRegex(ValueError, - 'Provide only a layer name or a layer index'): - network.get_layer(index=1, name='dense_b') - - # test that a name or an index must be provided - with self.assertRaisesRegex(ValueError, - 'Provide either a layer name or layer index.'): - network.get_layer() - - # test various get_layer by name - self.assertEqual(network.get_layer(name='dense_a'), dense_a) - - # test invalid get_layer by name - with self.assertRaisesRegex(ValueError, 'No such layer: dense_c.'): - network.get_layer(name='dense_c') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testTopologicalAttributes(self): - # test layer attributes / methods related to cross-layer connectivity. 
- a = input_layer_lib.Input(shape=(32,), name='input_a') - b = input_layer_lib.Input(shape=(32,), name='input_b') - - # test input, output, input_shape, output_shape - test_layer = layers.Dense(16, name='test_layer') - a_test = test_layer(a) - self.assertIs(test_layer.input, a) - self.assertIs(test_layer.output, a_test) - self.assertEqual(test_layer.input_shape, (None, 32)) - self.assertEqual(test_layer.output_shape, (None, 16)) - - # test `get_*_at` methods - dense = layers.Dense(16, name='dense_1') - a_2 = dense(a) - b_2 = dense(b) - - self.assertIs(dense.get_input_at(0), a) - self.assertIs(dense.get_input_at(1), b) - self.assertIs(dense.get_output_at(0), a_2) - self.assertIs(dense.get_output_at(1), b_2) - self.assertEqual(dense.get_input_shape_at(0), (None, 32)) - self.assertEqual(dense.get_input_shape_at(1), (None, 32)) - self.assertEqual(dense.get_output_shape_at(0), (None, 16)) - self.assertEqual(dense.get_output_shape_at(1), (None, 16)) - - # Test invalid value for attribute retrieval. - with self.assertRaises(ValueError): - dense.get_input_at(2) - with self.assertRaises(AttributeError): - new_dense = layers.Dense(16) - _ = new_dense.input - with self.assertRaises(AttributeError): - new_dense = layers.Dense(16) - _ = new_dense.output - with self.assertRaises(AttributeError): - new_dense = layers.Dense(16) - _ = new_dense.output_shape - with self.assertRaises(AttributeError): - new_dense = layers.Dense(16) - _ = new_dense.input_shape - with self.assertRaises(AttributeError): - new_dense = layers.Dense(16) - a = input_layer_lib.Input(shape=(3, 32)) - a = input_layer_lib.Input(shape=(5, 32)) - a_2 = dense(a) - b_2 = dense(b) - _ = new_dense.input_shape - with self.assertRaises(AttributeError): - new_dense = layers.Dense(16) - a = input_layer_lib.Input(shape=(3, 32)) - a = input_layer_lib.Input(shape=(5, 32)) - a_2 = dense(a) - b_2 = dense(b) - _ = new_dense.output_shape - - def _assertAllIs(self, a, b): - self.assertTrue(all(x is y for x, y in zip(a, b))) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testTopologicalAttributesMultiOutputLayer(self): - - class PowersLayer(layers.Layer): - - def call(self, inputs): - return [inputs**2, inputs**3] - - x = input_layer_lib.Input(shape=(32,)) - test_layer = PowersLayer() - p1, p2 = test_layer(x) # pylint: disable=not-callable - - self.assertIs(test_layer.input, x) - self._assertAllIs(test_layer.output, [p1, p2]) - self.assertEqual(test_layer.input_shape, (None, 32)) - self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testTopologicalAttributesMultiInputLayer(self): - - class AddLayer(layers.Layer): - - def call(self, inputs): - assert len(inputs) == 2 - return inputs[0] + inputs[1] - - a = input_layer_lib.Input(shape=(32,)) - b = input_layer_lib.Input(shape=(32,)) - test_layer = AddLayer() - y = test_layer([a, b]) # pylint: disable=not-callable - - self._assertAllIs(test_layer.input, [a, b]) - self.assertIs(test_layer.output, y) - self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)]) - self.assertEqual(test_layer.output_shape, (None, 32)) - - def testBasicNetwork(self): - with tf.Graph().as_default(): - # minimum viable network - x = input_layer_lib.Input(shape=(32,)) - dense = layers.Dense(2) - y = dense(x) - network = functional.Functional(x, y, name='dense_network') - - # test basic attributes - self.assertEqual(network.name, 'dense_network') - 
self.assertEqual(len(network.layers), 2) # InputLayer + Dense - self.assertEqual(network.layers[1], dense) - self._assertAllIs(network.weights, dense.weights) - self._assertAllIs(network.trainable_weights, dense.trainable_weights) - self._assertAllIs(network.non_trainable_weights, - dense.non_trainable_weights) - - # test callability on Input - x_2 = input_layer_lib.Input(shape=(32,)) - y_2 = network(x_2) - self.assertEqual(y_2.shape.as_list(), [None, 2]) - - # test callability on regular tensor - x_2 = tf.compat.v1.placeholder(dtype='float32', shape=(None, 32)) - y_2 = network(x_2) - self.assertEqual(y_2.shape.as_list(), [None, 2]) - - # test network `trainable` attribute - network.trainable = False - self._assertAllIs(network.weights, dense.weights) - self.assertEqual(network.trainable_weights, []) - self._assertAllIs(network.non_trainable_weights, - dense.trainable_weights + dense.non_trainable_weights) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_trainable_weights(self): - a = layers.Input(shape=(2,)) - b = layers.Dense(1)(a) - model = training_lib.Model(a, b) - - weights = model.weights - self._assertAllIs(model.trainable_weights, weights) - self.assertListEqual(model.non_trainable_weights, []) - - model.trainable = False - self.assertListEqual(model.trainable_weights, []) - self._assertAllIs(model.non_trainable_weights, weights) - - model.trainable = True - self._assertAllIs(model.trainable_weights, weights) - self.assertListEqual(model.non_trainable_weights, []) - - model.layers[1].trainable = False - self.assertListEqual(model.trainable_weights, []) - self._assertAllIs(model.non_trainable_weights, weights) - - # sequential model - model = sequential.Sequential() - model.add(layers.Dense(1, input_dim=2)) - weights = model.weights - - self._assertAllIs(model.trainable_weights, weights) - self.assertListEqual(model.non_trainable_weights, []) - - model.trainable = False - self.assertListEqual(model.trainable_weights, []) - self._assertAllIs(model.non_trainable_weights, weights) - - model.trainable = True - self._assertAllIs(model.trainable_weights, weights) - self.assertListEqual(model.non_trainable_weights, []) - - model.layers[0].trainable = False - self.assertListEqual(model.trainable_weights, []) - self._assertAllIs(model.non_trainable_weights, weights) - - def test_layer_call_arguments(self): - with tf.Graph().as_default(): - # Test the ability to pass and serialize arguments to `call`. 
- inp = layers.Input(shape=(2,)) - x = layers.Dense(3)(inp) - x = layers.Dropout(0.5)(x, training=True) - model = training_lib.Model(inp, x) - # Would be `dropout/cond/Merge` by default - self.assertIn('dropout', model.output.op.name) - - # Test that argument is kept when applying the model - inp2 = layers.Input(shape=(2,)) - out2 = model(inp2) - self.assertIn('dropout', out2.op.name) - - # Test that argument is kept after loading a model - config = model.get_config() - model = training_lib.Model.from_config(config) - self.assertIn('dropout', model.output.op.name) - - def test_node_construction(self): - # test basics - a = layers.Input(shape=(32,), name='input_a') - b = layers.Input(shape=(32,), name='input_b') - - with self.assertRaises(ValueError): - _ = layers.Input(shape=(32,), batch_shape=(10, 32)) - with self.assertRaises(ValueError): - _ = layers.Input(shape=(32,), unknown_kwarg=None) - - self.assertListEqual(a.shape.as_list(), [None, 32]) - a_layer, a_node_index, a_tensor_index = a._keras_history - b_layer, _, _ = b._keras_history - self.assertEqual(len(a_layer._inbound_nodes), 1) - self.assertEqual(a_tensor_index, 0) - node = a_layer._inbound_nodes[a_node_index] - self.assertEqual(node.outbound_layer, a_layer) - - self.assertListEqual(node.inbound_layers, []) - self.assertListEqual(node.input_tensors, [a]) - self.assertListEqual(node.input_shapes, [(None, 32)]) - self.assertListEqual(node.output_tensors, [a]) - self.assertListEqual(node.output_shapes, [(None, 32)]) - - dense = layers.Dense(16, name='dense_1') - a_2 = dense(a) - b_2 = dense(b) - - self.assertEqual(len(dense._inbound_nodes), 2) - self.assertEqual(len(dense._outbound_nodes), 0) - self.assertEqual(dense._inbound_nodes[0].inbound_layers, a_layer) - self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense) - self.assertEqual(dense._inbound_nodes[1].inbound_layers, b_layer) - self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense) - self.assertIs(dense._inbound_nodes[0].input_tensors, a) - self.assertIs(dense._inbound_nodes[1].input_tensors, b) - - # test layer properties - test_layer = layers.Dense(16, name='test_layer') - a_test = test_layer(a) - self.assertListEqual(test_layer.kernel.shape.as_list(), [32, 16]) - self.assertIs(test_layer.input, a) - self.assertIs(test_layer.output, a_test) - self.assertEqual(test_layer.input_shape, (None, 32)) - self.assertEqual(test_layer.output_shape, (None, 16)) - - self.assertIs(dense.get_input_at(0), a) - self.assertIs(dense.get_input_at(1), b) - self.assertIs(dense.get_output_at(0), a_2) - self.assertIs(dense.get_output_at(1), b_2) - self.assertEqual(dense.get_input_shape_at(0), (None, 32)) - self.assertEqual(dense.get_input_shape_at(1), (None, 32)) - self.assertEqual(dense.get_output_shape_at(0), (None, 16)) - self.assertEqual(dense.get_output_shape_at(1), (None, 16)) - self.assertEqual(dense.get_input_mask_at(0), None) - self.assertEqual(dense.get_input_mask_at(1), None) - self.assertEqual(dense.get_output_mask_at(0), None) - self.assertEqual(dense.get_output_mask_at(1), None) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_multi_input_layer(self): - with self.cached_session(): - # test multi-input layer - a = layers.Input(shape=(32,), name='input_a') - b = layers.Input(shape=(32,), name='input_b') - - dense = layers.Dense(16, name='dense_1') - a_2 = dense(a) - b_2 = dense(b) - - merged = layers.concatenate([a_2, b_2], name='merge') - self.assertListEqual(merged.shape.as_list(), [None, 16 * 2]) - merge_layer, 
merge_node_index, merge_tensor_index = merged._keras_history - - self.assertEqual(merge_node_index, 0) - self.assertEqual(merge_tensor_index, 0) - - self.assertEqual(len(merge_layer._inbound_nodes), 1) - self.assertEqual(len(merge_layer._outbound_nodes), 0) - - self.assertEqual(len(merge_layer._inbound_nodes[0].input_tensors), 2) - self.assertEqual(len(merge_layer._inbound_nodes[0].inbound_layers), 2) - - c = layers.Dense(64, name='dense_2')(merged) - d = layers.Dense(5, name='dense_3')(c) - - model = training_lib.Model(inputs=[a, b], outputs=[c, d], name='model') - self.assertEqual(len(model.layers), 6) - output_shapes = model.compute_output_shape([(None, 32), (None, 32)]) - self.assertListEqual(output_shapes[0].as_list(), [None, 64]) - self.assertListEqual(output_shapes[1].as_list(), [None, 5]) - self.assertListEqual( - model.compute_mask([a, b], [None, None]), [None, None]) - - # we don't check names of first 2 layers (inputs) because - # ordering of same-level layers is not fixed - self.assertListEqual([l.name for l in model.layers][2:], - ['dense_1', 'merge', 'dense_2', 'dense_3']) - self.assertListEqual([l.name for l in model._input_layers], - ['input_a', 'input_b']) - self.assertListEqual([l.name for l in model._output_layers], - ['dense_2', 'dense_3']) - - # actually run model - fn = backend.function(model.inputs, model.outputs) - input_a_np = np.random.random((10, 32)) - input_b_np = np.random.random((10, 32)) - fn_outputs = fn([input_a_np, input_b_np]) - self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)]) - - # test get_source_inputs - self._assertAllIs(layer_utils.get_source_inputs(c), [a, b]) - - # serialization / deserialization - json_config = model.to_json() - recreated_model = models.model_from_json(json_config) - recreated_model.compile('rmsprop', 'mse') - - self.assertListEqual([l.name for l in recreated_model.layers][2:], - ['dense_1', 'merge', 'dense_2', 'dense_3']) - self.assertListEqual([l.name for l in recreated_model._input_layers], - ['input_a', 'input_b']) - self.assertListEqual([l.name for l in recreated_model._output_layers], - ['dense_2', 'dense_3']) - - fn = backend.function(recreated_model.inputs, recreated_model.outputs) - input_a_np = np.random.random((10, 32)) - input_b_np = np.random.random((10, 32)) - fn_outputs = fn([input_a_np, input_b_np]) - self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)]) - - def test_multi_output_layer_output_names(self): - inp = layers.Input(name='inp', shape=(None,), dtype=tf.float32) - - class _MultiOutput(layers.Layer): - - def call(self, x): - return x + 1., x + 2. 
- - out = _MultiOutput(name='out')(inp) - model = training_lib.Model(inp, out) - self.assertEqual(['out', 'out_1'], model.output_names) - self.assertAllClose([2., 3.], model(1.)) - - def test_recursion(self): - with tf.Graph().as_default(), self.cached_session(): - a = layers.Input(shape=(32,), name='input_a') - b = layers.Input(shape=(32,), name='input_b') - - dense = layers.Dense(16, name='dense_1') - a_2 = dense(a) - b_2 = dense(b) - merged = layers.concatenate([a_2, b_2], name='merge') - c = layers.Dense(64, name='dense_2')(merged) - d = layers.Dense(5, name='dense_3')(c) - - model = training_lib.Model(inputs=[a, b], outputs=[c, d], name='model') - - e = layers.Input(shape=(32,), name='input_e') - f = layers.Input(shape=(32,), name='input_f') - self.assertEqual(len(model.inputs), 2) - g, h = model([e, f]) - self.assertEqual(len(model.inputs), 2) - self.assertEqual(g.name, 'model/dense_2/BiasAdd:0') - - self.assertListEqual(g.shape.as_list(), c.shape.as_list()) - self.assertListEqual(h.shape.as_list(), d.shape.as_list()) - - # test separate manipulation of different layer outputs - i = layers.Dense(7, name='dense_4')(h) - - final_model = training_lib.Model( - inputs=[e, f], outputs=[i, g], name='final') - self.assertEqual(len(final_model.inputs), 2) - self.assertEqual(len(final_model.outputs), 2) - self.assertEqual(len(final_model.layers), 4) - - # we don't check names of first 2 layers (inputs) because - # ordering of same-level layers is not fixed - self.assertListEqual([layer.name for layer in final_model.layers][2:], - ['model', 'dense_4']) - self.assertListEqual( - model.compute_mask([e, f], [None, None]), [None, None]) - self.assertListEqual( - final_model.compute_output_shape([(10, 32), (10, 32)]), [(10, 7), - (10, 64)]) - - # run recursive model - fn = backend.function(final_model.inputs, final_model.outputs) - input_a_np = np.random.random((10, 32)) - input_b_np = np.random.random((10, 32)) - fn_outputs = fn([input_a_np, input_b_np]) - self.assertListEqual([x.shape for x in fn_outputs], [(10, 7), (10, 64)]) - - # test serialization - model_config = final_model.get_config() - recreated_model = models.Model.from_config(model_config) - - fn = backend.function(recreated_model.inputs, recreated_model.outputs) - input_a_np = np.random.random((10, 32)) - input_b_np = np.random.random((10, 32)) - fn_outputs = fn([input_a_np, input_b_np]) - self.assertListEqual([x.shape for x in fn_outputs], [(10, 7), (10, 64)]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_multi_input_multi_output_recursion(self): - with self.cached_session(): - # test multi-input multi-output - a = layers.Input(shape=(32,), name='input_a') - b = layers.Input(shape=(32,), name='input_b') - - dense = layers.Dense(16, name='dense_1') - a_2 = dense(a) - b_2 = dense(b) - merged = layers.concatenate([a_2, b_2], name='merge') - c = layers.Dense(64, name='dense_2')(merged) - d = layers.Dense(5, name='dense_3')(c) - - model = training_lib.Model(inputs=[a, b], outputs=[c, d], name='model') - - j = layers.Input(shape=(32,), name='input_j') - k = layers.Input(shape=(32,), name='input_k') - _, n = model([j, k]) - - o = layers.Input(shape=(32,), name='input_o') - p = layers.Input(shape=(32,), name='input_p') - q, _ = model([o, p]) - - self.assertListEqual(n.shape.as_list(), [None, 5]) - self.assertListEqual(q.shape.as_list(), [None, 64]) - s = layers.concatenate([n, q], name='merge_nq') - self.assertListEqual(s.shape.as_list(), [None, 64 + 5]) - - # test with single output as 
1-elem list - multi_io_model = training_lib.Model([j, k, o, p], [s]) - - fn = backend.function(multi_io_model.inputs, multi_io_model.outputs) - fn_outputs = fn([ - np.random.random((10, 32)), np.random.random((10, 32)), - np.random.random((10, 32)), np.random.random((10, 32)) - ]) - self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)]) - - # test with single output as tensor - multi_io_model = training_lib.Model([j, k, o, p], s) - - fn = backend.function(multi_io_model.inputs, multi_io_model.outputs) - fn_outputs = fn([ - np.random.random((10, 32)), np.random.random((10, 32)), - np.random.random((10, 32)), np.random.random((10, 32)) - ]) - # note that the output of the function will still be a 1-elem list - self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)]) - - # test serialization - model_config = multi_io_model.get_config() - recreated_model = models.Model.from_config(model_config) - - fn = backend.function(recreated_model.inputs, recreated_model.outputs) - fn_outputs = fn([ - np.random.random((10, 32)), np.random.random((10, 32)), - np.random.random((10, 32)), np.random.random((10, 32)) - ]) - # note that the output of the function will still be a 1-elem list - self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)]) - - config = model.get_config() - models.Model.from_config(config) - - model.summary() - json_str = model.to_json() - models.model_from_json(json_str) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_invalid_graphs(self): - a = layers.Input(shape=(32,), name='input_a') - b = layers.Input(shape=(32,), name='input_b') - - dense = layers.Dense(16, name='dense_1') - a_2 = dense(a) - b_2 = dense(b) - merged = layers.concatenate([a_2, b_2], name='merge') - c = layers.Dense(64, name='dense_2')(merged) - d = layers.Dense(5, name='dense_3')(c) - - model = training_lib.Model(inputs=[a, b], outputs=[c, d], name='model') - - # disconnected graph - j = layers.Input(shape=(32,), name='input_j') - k = layers.Input(shape=(32,), name='input_k') - m, n = model([j, k]) - with self.assertRaises(Exception): - training_lib.Model([j], [m, n]) - - # redundant outputs - j = layers.Input(shape=(32,), name='input_j') - k = layers.Input(shape=(32,), name='input_k') - m, n = model([j, k]) - - training_lib.Model([j, k], [m, n, n]) - - # redundant inputs - j = layers.Input(shape=(32,), name='input_j') - k = layers.Input(shape=(32,), name='input_k') - m, n = model([j, k]) - with self.assertRaises(Exception): - training_lib.Model([j, k, j], [m, n]) - - # i have not idea what I'm doing: garbage as inputs/outputs - j = layers.Input(shape=(32,), name='input_j') - k = layers.Input(shape=(32,), name='input_k') - m, n = model([j, k]) - with self.assertRaises(Exception): - training_lib.Model([j, k], [m, n, 0]) - - def test_raw_tf_compatibility(self): - with tf.Graph().as_default(): - # test calling layers/models on TF tensors - a = layers.Input(shape=(32,), name='input_a') - b = layers.Input(shape=(32,), name='input_b') - - dense = layers.Dense(16, name='dense_1') - a_2 = dense(a) - b_2 = dense(b) - merged = layers.concatenate([a_2, b_2], name='merge') - c = layers.Dense(64, name='dense_2')(merged) - d = layers.Dense(5, name='dense_3')(c) - - model = training_lib.Model(inputs=[a, b], outputs=[c, d], name='model') - - j = layers.Input(shape=(32,), name='input_j') - k = layers.Input(shape=(32,), name='input_k') - self.assertEqual(len(model.inputs), 2) - m, n = model([j, k]) - self.assertEqual(len(model.inputs), 2) - tf_model = 
training_lib.Model([j, k], [m, n]) - - j_tf = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 32)) - k_tf = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 32)) - m_tf, n_tf = tf_model([j_tf, k_tf]) - self.assertListEqual(m_tf.shape.as_list(), [None, 64]) - self.assertListEqual(n_tf.shape.as_list(), [None, 5]) - - # test merge - layers.concatenate([j_tf, k_tf], axis=1) - layers.add([j_tf, k_tf]) - - # test tensor input - x = tf.compat.v1.placeholder(shape=(None, 2), dtype=tf.float32) - layers.InputLayer(input_tensor=x) - - x = layers.Input(tensor=x) - layers.Dense(2)(x) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_basic_masking(self): - a = layers.Input(shape=(10, 32), name='input_a') - b = layers.Masking()(a) - model = training_lib.Model(a, b) - self.assertEqual(model.output_mask.shape.as_list(), [None, 10]) - - def testMaskingSingleInput(self): - - class MaskedLayer(layers.Layer): - - def call(self, inputs, mask=None): - if mask is not None: - return inputs * mask - return inputs - - def compute_mask(self, inputs, mask=None): - return tf.ones_like(inputs) - - if tf.executing_eagerly(): - a = tf.constant([2] * 32) - mask = tf.constant([0, 1] * 16) - a._keras_mask = mask - b = MaskedLayer()(a) - self.assertTrue(hasattr(b, '_keras_mask')) - self.assertAllEqual( - self.evaluate(tf.ones_like(mask)), - self.evaluate(getattr(b, '_keras_mask'))) - self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b)) - else: - x = input_layer_lib.Input(shape=(32,)) - y = MaskedLayer()(x) # pylint: disable=not-callable - network = functional.Functional(x, y) - - # test callability on Input - x_2 = input_layer_lib.Input(shape=(32,)) - y_2 = network(x_2) - self.assertEqual(y_2.shape.as_list(), [None, 32]) - - # test callability on regular tensor - x_2 = tf.compat.v1.placeholder(dtype='float32', shape=(None, 32)) - y_2 = network(x_2) - self.assertEqual(y_2.shape.as_list(), [None, 32]) - - def test_activity_regularization_with_model_composition(self): - - def reg(x): - return tf.reduce_sum(x) - - net_a_input = input_layer_lib.Input((2,)) - net_a = net_a_input - net_a = layers.Dense( - 2, kernel_initializer='ones', use_bias=False, activity_regularizer=reg)( - net_a) - model_a = training_lib.Model([net_a_input], [net_a]) - - net_b_input = input_layer_lib.Input((2,)) - net_b = model_a(net_b_input) - model_b = training_lib.Model([net_b_input], [net_b]) - - model_b.compile(optimizer='sgd', loss=None) - x = np.ones((1, 2)) - loss = model_b.evaluate(x) - self.assertEqual(loss, 4.) 
- - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_layer_sharing_at_heterogenous_depth(self): - x_val = np.random.random((10, 5)) - - x = input_layer_lib.Input(shape=(5,)) - a = layers.Dense(5, name='A') - b = layers.Dense(5, name='B') - output = a(b(a(b(x)))) - m = training_lib.Model(x, output) - m.run_eagerly = test_utils.should_run_eagerly() - - output_val = m.predict(x_val) - - config = m.get_config() - weights = m.get_weights() - - m2 = models.Model.from_config(config) - m2.set_weights(weights) - - output_val_2 = m2.predict(x_val) - self.assertAllClose(output_val, output_val_2, atol=1e-6) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_layer_sharing_at_heterogenous_depth_with_concat(self): - input_shape = (16, 9, 3) - input_layer = input_layer_lib.Input(shape=input_shape) - - a = layers.Dense(3, name='dense_A') - b = layers.Dense(3, name='dense_B') - c = layers.Dense(3, name='dense_C') - - x1 = b(a(input_layer)) - x2 = a(c(input_layer)) - output = layers.concatenate([x1, x2]) - - m = training_lib.Model(inputs=input_layer, outputs=output) - m.run_eagerly = test_utils.should_run_eagerly() - - x_val = np.random.random((10, 16, 9, 3)) - output_val = m.predict(x_val) - - config = m.get_config() - weights = m.get_weights() - - m2 = models.Model.from_config(config) - m2.set_weights(weights) - - output_val_2 = m2.predict(x_val) - self.assertAllClose(output_val, output_val_2, atol=1e-6) - - def test_layer_sharing_maintains_node_order(self): - # See https://github.com/keras-team/keras/issues/14838. - inp = input_layer_lib.Input(shape=[5], name='main_input') - - zeros = layers.Lambda(tf.zeros_like, name='generate_zeros')(inp) - ones = layers.Lambda(tf.ones_like, name='generate_ones')(inp) - - shared_layer = layers.Layer(name='shared') - - ones_result = shared_layer(ones) - zeros_result = shared_layer(zeros) - zeros_result = layers.Layer(name='blank')(zeros_result) - - m = training_lib.Model( - inputs=[inp], outputs=[zeros_result, ones_result]) - m2 = models.Model.from_config(m.get_config()) - self.assertAllClose( - m2.predict_on_batch(tf.zeros([1, 5])), - m.predict_on_batch(tf.zeros([1, 5]))) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_explicit_training_argument(self): - a = layers.Input(shape=(2,)) - b = layers.Dropout(0.5)(a) - base_model = training_lib.Model(a, b) - - a = layers.Input(shape=(2,)) - b = base_model(a, training=False) - model = training_lib.Model(a, b) - - x = np.ones((100, 2)) - y = np.ones((100, 2)) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(x, y) - self.assertEqual(loss, 0) # In inference mode, output is equal to input. - - a = layers.Input(shape=(2,)) - b = base_model(a, training=True) - model = training_lib.Model(a, b) - preds = model.predict(x) - self.assertEqual(np.min(preds), 0.) # At least one unit was dropped. - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_mask_derived_from_keras_layer(self): - inputs = input_layer_lib.Input((5, 10)) - mask = input_layer_lib.Input((5,)) - outputs = layers.RNN(layers.LSTMCell(100))(inputs, mask=mask) - model = training_lib.Model([inputs, mask], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 5, 10)), np.zeros((10, 5))], - y=np.zeros((10, 100)), - batch_size=2) - # All data is masked, returned values are 0's. 
- self.assertEqual(history.history['loss'][0], 0.0) - history = model.fit( - x=[np.ones((10, 5, 10)), np.ones((10, 5))], - y=np.zeros((10, 100)), - batch_size=2) - # Data is not masked, returned values are random. - self.assertGreater(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config(model.get_config()) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 5, 10)), np.zeros((10, 5))], - y=np.zeros((10, 100)), - batch_size=2) - # All data is masked, returned values are 0's. - self.assertEqual(history.history['loss'][0], 0.0) - history = model.fit( - x=[np.ones((10, 5, 10)), np.ones((10, 5))], - y=np.zeros((10, 100)), - batch_size=2) - # Data is not masked, returned values are random. - self.assertGreater(history.history['loss'][0], 0.0) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_call_arg_derived_from_keras_layer(self): - - class MyAdd(layers.Layer): - - def call(self, x1, x2): - return x1 + x2 - - input1 = input_layer_lib.Input(10) - input2 = input_layer_lib.Input(10) - outputs = MyAdd()(input1, input2) - model = training_lib.Model([input1, input2], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - # Check serialization. - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'MyAdd': MyAdd}) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - @test_combinations.generate( - test_combinations.keras_mode_combinations(mode='eager'),) - def test_only_some_in_first_arg_derived_from_keras_layer_keras_tensors(self): - # This functionality is unsupported in v1 graphs - - class MyAddAll(layers.Layer): - - def call(self, inputs): - x = inputs[0] - for inp in inputs[1:]: - if inp is not None: - x = x + inp - return x - - input1 = input_layer_lib.Input(10) - input2 = input_layer_lib.Input(10) - layer = MyAddAll() - outputs = layer([0.0, input1, None, input2, None]) - model = training_lib.Model([input1, input2], outputs) - self.assertIn(layer, model.layers) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - # Check serialization. - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'MyAddAll': MyAddAll}) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. 
- self.assertEqual(history.history['loss'][0], 0.0) - - @test_combinations.generate( - test_combinations.times( - test_combinations.keras_mode_combinations(), - test_combinations.combine(share_already_used_layer=[True, False]))) - def test_call_kwarg_derived_from_keras_layer(self, share_already_used_layer): - - class MaybeAdd(layers.Layer): - - def call(self, x1, x2=None): - if x2 is not None: - return x1 + x2 - return x1 - - class IdentityLayer(layers.Layer): - - def call(self, x): - return x - - input1 = input_layer_lib.Input(10) - input2 = input_layer_lib.Input(10) - identity_layer = IdentityLayer() - - if share_already_used_layer: - # We have had model serialization/deserialization break in the past: - # when a layer was previously used to construct other functional models - # and had a non-empty list of inbound nodes before being used to define - # the model being serialized/deserialized. - # (The serialization/deserialization was not correctly adjusting - # the node_index serialization/deserialization). - # So, we explicitly test this case. - training_lib.Model([input1], identity_layer(input1)) - - outputs = MaybeAdd()(input1, x2=identity_layer(input2)) - model = training_lib.Model([input1, input2], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config( - model.get_config(), - custom_objects={ - 'MaybeAdd': MaybeAdd, - 'IdentityLayer': IdentityLayer - }) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_call_kwarg_dtype_serialization(self): - - class Double(layers.Layer): - - def call(self, x1, dtype=None): - return tf.cast(x1 + x1, dtype=dtype) - - input1 = input_layer_lib.Input(10) - outputs = Double()(input1, dtype=tf.float16) - model = training_lib.Model([input1], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10))], - y=6 * np.ones((10, 10)), - batch_size=2) - # Check that input was correctly doubled. - self.assertEqual(history.history['loss'][0], 0.0) - - # Check the output dtype - self.assertEqual(model(tf.ones((3, 10))).dtype, tf.float16) - - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'Double': Double}) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10))], - y=6 * np.ones((10, 10)), - batch_size=2) - # Check that input was correctly doubled. 
- self.assertEqual(history.history['loss'][0], 0.0) - - # Check the output dtype - self.assertEqual(model(tf.ones((3, 10))).dtype, tf.float16) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_call_kwarg_nonserializable(self): - - class Double(layers.Layer): - - def call(self, x1, kwarg=None): - return x1 + x1 - - class NonSerializable: - - def __init__(self, foo=None): - self.foo = foo - - input1 = input_layer_lib.Input(10) - outputs = Double()(input1, kwarg=NonSerializable()) - model = training_lib.Model([input1], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[3 * np.ones((10, 10))], - y=6 * np.ones((10, 10)), - batch_size=2) - # Check that input was correctly doubled. - self.assertEqual(history.history['loss'][0], 0.0) - with self.assertRaisesRegex( - TypeError, 'Layer double was passed non-JSON-serializable arguments.'): - model.get_config() - - @test_combinations.generate( - test_combinations.times( - test_combinations.keras_mode_combinations(), - test_combinations.combine(share_already_used_layer=[True, False]))) - def test_call_kwarg_derived_from_keras_layer_and_first_arg_is_constant( - self, share_already_used_layer): - - class IdentityLayer(layers.Layer): - - def call(self, x): - return x - - class MaybeAdd(layers.Layer): - - def call(self, x1, x2=None): - if x2 is not None: - return x1 + x2 - return x1 - - input2 = input_layer_lib.Input(10) - identity_layer = IdentityLayer() - if share_already_used_layer: - # We have had model serialization/deserialization break in the past: - # when a layer was previously used to construct other functional models - # and had a non-empty list of inbound nodes before being used to define - # the model being serialized/deserialized. - # (The serialization/deserialization was not correctly adjusting - # the node_index serialization/deserialization). - # So, we explicitly test this case. - training_lib.Model([input2], identity_layer(input2)) - - outputs = MaybeAdd()(3., x2=identity_layer(input2)) - model = training_lib.Model([input2], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=7 * np.ones((10, 10)), - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config( - model.get_config(), - custom_objects={ - 'MaybeAdd': MaybeAdd, - 'IdentityLayer': IdentityLayer - }) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=7 * np.ones((10, 10)), - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_dont_cast_composite_unless_necessary(self): - if not tf.executing_eagerly(): - return # Creating Keras inputs from a type_spec only supported in eager. - - # TODO(edloper): Change this to tf.experimental.ExtensionTyep once - # it's been released. - class MyType(extension_type.ExtensionType): - # TODO(edloper) Remove _shape and _dtype once Keras has been switched - # to use .shape and .dtype instead. 
- value: tf.Tensor - _shape = property(lambda self: self.value.shape) - shape = property(lambda self: self.value.shape) - _dtype = property(lambda self: self.value.dtype) - dtype = property(lambda self: self.value.dtype) - - class Spec: - _shape = property(lambda self: self.value.shape) - shape = property(lambda self: self.value.shape) - _dtype = property(lambda self: self.value.dtype) - dtype = property(lambda self: self.value.dtype) - - my_spec = MyType.Spec(tf.TensorSpec([5], tf.float32)) - input1 = input_layer_lib.Input(type_spec=my_spec) - model = training_lib.Model([input1], input1) - model.compile(run_eagerly=test_utils.should_run_eagerly()) - model(MyType([1., 2., 3., 4., 5.])) # Does not require cast. - with self.assertRaises((ValueError, TypeError)): - model(MyType([1, 2, 3, 4, 5])) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_composite_call_kwarg_derived_from_keras_layer(self): - - # Create a test layer that accepts composite tensor inputs. - class MaybeAdd(layers.Layer): - - def call(self, x1, x2=None): - # We need to convert this to a tensor for loss calculations - - # losses don't play nicely with ragged tensors yet. - if x2 is not None: - return (x1 + x2).to_tensor(default_value=0) - return x1.to_tensor(default_value=0) - - input1 = input_layer_lib.Input((None,), ragged=True) - input2 = input_layer_lib.Input((None,), ragged=True) - outputs = MaybeAdd()(input1, x2=input2) - model = training_lib.Model([input1, input2], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - input_data = [ - tf.ragged.constant([[3.0, 3.0], [3.0, 3.0], [3.0]]), - tf.ragged.constant([[7.0, 7.0], [7.0, 7.0], [7.0]]) - ] - expected_data = np.array([[10.0, 10.0], [10.0, 10.0], [10.0, 0.0]]) - - history = model.fit(x=input_data, y=expected_data) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'MaybeAdd': MaybeAdd}) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(x=input_data, y=expected_data) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - @test_combinations.generate( - test_combinations.keras_mode_combinations(mode='eager')) - def test_call_some_not_all_nested_in_first_arg_derived_from_keras_layer(self): - # This functionality is unsupported in v1 graphs - - class AddAll(layers.Layer): - - def call(self, x1_x2, x3): - x1, x2 = x1_x2 - out = x1 + x2 - if x3 is not None: - for t in x3.values(): - out += t - return out - - input1 = input_layer_lib.Input(10) - input2 = input_layer_lib.Input(10) - input3 = input_layer_lib.Input(10) - - layer = AddAll() - outputs = layer( - [input1, 4 * tf.ones((1, 10))], - x3={ - 'a': input2, - 'b': input3, - 'c': 5 * tf.ones((1, 10)) - }) - model = training_lib.Model([input1, input2, input3], outputs) - self.assertIn(layer, model.layers) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], - y=15 * np.ones((10, 10)), - batch_size=2) - # Check that all inputs were correctly added. 
- self.assertEqual(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'AddAll': AddAll}) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], - y=15 * np.ones((10, 10)), - batch_size=2) - # Check that all inputs were correctly added. - self.assertEqual(history.history['loss'][0], 0.0) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_call_nested_arg_derived_from_keras_layer(self): - - class AddAll(layers.Layer): - - def call(self, x1, x2, x3=None): - out = x1 + x2 - if x3 is not None: - for t in x3.values(): - out += t - return out - - input1 = input_layer_lib.Input(10) - input2 = input_layer_lib.Input(10) - input3 = input_layer_lib.Input(10) - outputs = AddAll()( - input1, - 4 * tf.ones((1, 10)), - x3={ - 'a': input2, - 'b': input3, - 'c': 5 * tf.ones((1, 10)) - }) - model = training_lib.Model([input1, input2, input3], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], - y=15 * np.ones((10, 10)), - batch_size=2) - # Check that all inputs were correctly added. - self.assertEqual(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'AddAll': AddAll}) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], - y=15 * np.ones((10, 10)), - batch_size=2) - # Check that all inputs were correctly added. - self.assertEqual(history.history['loss'][0], 0.0) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_multi_output_model_with_none_masking(self): - def func(x): - return [x * 0.2, x * 0.3] - - def output_shape(input_shape): - return [input_shape, input_shape] - - i = layers.Input(shape=(3, 2, 1)) - o = layers.Lambda(function=func, output_shape=output_shape)(i) - - self.assertEqual(backend.int_shape(o[0]), (None, 3, 2, 1)) - self.assertEqual(backend.int_shape(o[1]), (None, 3, 2, 1)) - - o = layers.add(o) - model = training_lib.Model(i, o) - model.run_eagerly = test_utils.should_run_eagerly() - - i2 = layers.Input(shape=(3, 2, 1)) - o2 = model(i2) - model2 = training_lib.Model(i2, o2) - model2.run_eagerly = test_utils.should_run_eagerly() - - x = np.random.random((4, 3, 2, 1)) - out = model2.predict(x) - assert out.shape == (4, 3, 2, 1) - self.assertAllClose(out, x * 0.2 + x * 0.3, atol=1e-4) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_constant_initializer_with_numpy(self): - initializer = tf.compat.v1.constant_initializer(np.ones((3, 2))) - model = sequential.Sequential() - model.add(layers.Dense(2, input_shape=(3,), kernel_initializer=initializer)) - model.add(layers.Dense(3)) - model.compile( - loss='mse', - optimizer='sgd', - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - - json_str = model.to_json() - models.model_from_json(json_str) - - def test_subclassed_error_if_init_not_called(self): - - class MyNetwork(training_lib.Model): - - def __init__(self): - self._foo = [layers.Dense(10), layers.Dense(10)] - - with self.assertRaisesRegex(RuntimeError, 'forgot to call'): - MyNetwork() - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - 
def test_int_input_shape(self): - inputs = input_layer_lib.Input(10) - self.assertEqual([None, 10], inputs.shape.as_list()) - - inputs_with_batch = input_layer_lib.Input(batch_size=20, shape=5) - self.assertEqual([20, 5], inputs_with_batch.shape.as_list()) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_model_initialization(self): - # Functional model - inputs = input_layer_lib.Input(shape=(32,)) - outputs = layers.Dense(4)(inputs) - - with self.assertRaisesRegex(TypeError, - 'Keyword argument not understood'): - model = training_lib.Model( - inputs, outputs, name='m', trainable=False, dtype='int64') - with self.assertRaisesRegex(TypeError, - 'Keyword argument not understood'): - model = training_lib.Model( - inputs, outputs, name='m', trainable=False, dynamic=False) - - model = training_lib.Model(inputs, outputs, name='m', trainable=False) - self.assertEqual('m', model.name) - self.assertFalse(model.trainable) - self.assertFalse(model.dynamic) - - class SubclassModel(training_lib.Model): - pass - # Subclassed model - model = SubclassModel( - name='subclassed', trainable=True, dtype='int64', dynamic=True) - self.assertEqual('subclassed', model.name) - self.assertTrue(model.dynamic) - self.assertTrue(model.trainable) - w = model.add_weight( - 'w', [], initializer=tf.compat.v1.constant_initializer(1)) - self.assertEqual(tf.int64, w.dtype) - - def test_disconnected_inputs(self): - input_tensor1 = input_layer_lib.Input(shape=[200], name='a') - input_tensor2 = input_layer_lib.Input(shape=[10], name='b') - output_tensor1 = layers.Dense(units=10)(input_tensor1) - - net = functional.Functional( - inputs=[input_tensor1, input_tensor2], outputs=[output_tensor1]) - net2 = functional.Functional.from_config(net.get_config()) - self.assertLen(net2.inputs, 2) - self.assertEqual('a', net2.layers[0].name) - self.assertEqual('b', net2.layers[1].name) - - @test_combinations.generate(test_combinations.keras_model_type_combinations()) - def test_dependency_tracking(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.trackable = Checkpoint() - self.assertIn('trackable', model._unconditional_dependency_names) - self.assertEqual(model.trackable, model._lookup_dependency('trackable')) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_model_construction_in_tf_function(self): - - d = {'model': None} - - @tf.function - def fn(x): - if d['model'] is None: - # Check that Functional can be built in a `tf.function`. 
+ def test_default_model_name(self): + inputs = input_layer_lib.Input(shape=(1,)) + outputs = layers.Dense(1, activation="relu")(inputs) + model = training_lib.Model(inputs=inputs, outputs=outputs) + self.assertEqual(model.name, "model") + + model_2 = training_lib.Model(inputs=inputs, outputs=outputs) + self.assertEqual(model_2.name, "model_1") + + model_3 = training_lib.Model(inputs=inputs, outputs=outputs) + self.assertEqual(model_3.name, "model_2") + + def test_get_updates(self): + class MyLayer(layers.Layer): + def build(self, input_shape): + self.a = self.add_weight( + "a", (1, 1), "float32", trainable=False + ) + self.b = self.add_weight( + "b", (1, 1), "float32", trainable=False + ) + self.add_update( + tf.compat.v1.assign_add( + self.a, [[1.0]], name="unconditional_update" + ) + ) + self.built = True + + def call(self, inputs): + self.add_update( + tf.compat.v1.assign_add( + self.b, inputs, name="conditional_update" + ) + ) + return inputs + 1 + + with tf.Graph().as_default(): + x1 = input_layer_lib.Input(shape=(1,)) + layer = MyLayer() + _ = layer(x1) + + self.assertEqual(len(layer.updates), 2) + + x2 = input_layer_lib.Input(shape=(1,)) + y2 = layer(x2) + + self.assertEqual(len(layer.updates), 3) + + network = functional.Functional(x2, y2) + self.assertEqual(len(network.updates), 3) + + x3 = input_layer_lib.Input(shape=(1,)) + _ = layer(x3) + self.assertEqual(len(network.updates), 4) + + x4 = input_layer_lib.Input(shape=(1,)) + _ = network(x4) + self.assertEqual(len(network.updates), 5) + + network.add_update(tf.compat.v1.assign_add(layer.a, [[1]])) + self.assertEqual(len(network.updates), 6) + + network.add_update(tf.compat.v1.assign_add(layer.b, x4)) + self.assertEqual(len(network.updates), 7) + + @test_combinations.generate(test_combinations.combine(mode=["graph"])) + def test_get_updates_bn(self): + x1 = input_layer_lib.Input(shape=(1,)) + layer = layers.BatchNormalization() + _ = layer(x1) + + self.assertEqual(len(layer.updates), 2) + + def test_get_layer(self): + # create a simple network + x = input_layer_lib.Input(shape=(32,)) + dense_a = layers.Dense(4, name="dense_a") + dense_b = layers.Dense(2, name="dense_b") + y = dense_b(dense_a(x)) + network = functional.Functional(x, y, name="dense_network") + + # test various get_layer by index + self.assertEqual(network.get_layer(index=1), dense_a) + + # test invalid get_layer by index + with self.assertRaisesRegex( + ValueError, + "Was asked to retrieve layer at index " + + str(3) + + " but model only has " + + str(len(network.layers)) + + " layers.", + ): + network.get_layer(index=3) + + # test that only one between name and index is requested + with self.assertRaisesRegex( + ValueError, "Provide only a layer name or a layer index" + ): + network.get_layer(index=1, name="dense_b") + + # test that a name or an index must be provided + with self.assertRaisesRegex( + ValueError, "Provide either a layer name or layer index." + ): + network.get_layer() + + # test various get_layer by name + self.assertEqual(network.get_layer(name="dense_a"), dense_a) + + # test invalid get_layer by name + with self.assertRaisesRegex(ValueError, "No such layer: dense_c."): + network.get_layer(name="dense_c") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTopologicalAttributes(self): + # test layer attributes / methods related to cross-layer connectivity. 
+ a = input_layer_lib.Input(shape=(32,), name="input_a") + b = input_layer_lib.Input(shape=(32,), name="input_b") + + # test input, output, input_shape, output_shape + test_layer = layers.Dense(16, name="test_layer") + a_test = test_layer(a) + self.assertIs(test_layer.input, a) + self.assertIs(test_layer.output, a_test) + self.assertEqual(test_layer.input_shape, (None, 32)) + self.assertEqual(test_layer.output_shape, (None, 16)) + + # test `get_*_at` methods + dense = layers.Dense(16, name="dense_1") + a_2 = dense(a) + b_2 = dense(b) + + self.assertIs(dense.get_input_at(0), a) + self.assertIs(dense.get_input_at(1), b) + self.assertIs(dense.get_output_at(0), a_2) + self.assertIs(dense.get_output_at(1), b_2) + self.assertEqual(dense.get_input_shape_at(0), (None, 32)) + self.assertEqual(dense.get_input_shape_at(1), (None, 32)) + self.assertEqual(dense.get_output_shape_at(0), (None, 16)) + self.assertEqual(dense.get_output_shape_at(1), (None, 16)) + + # Test invalid value for attribute retrieval. + with self.assertRaises(ValueError): + dense.get_input_at(2) + with self.assertRaises(AttributeError): + new_dense = layers.Dense(16) + _ = new_dense.input + with self.assertRaises(AttributeError): + new_dense = layers.Dense(16) + _ = new_dense.output + with self.assertRaises(AttributeError): + new_dense = layers.Dense(16) + _ = new_dense.output_shape + with self.assertRaises(AttributeError): + new_dense = layers.Dense(16) + _ = new_dense.input_shape + with self.assertRaises(AttributeError): + new_dense = layers.Dense(16) + a = input_layer_lib.Input(shape=(3, 32)) + a = input_layer_lib.Input(shape=(5, 32)) + a_2 = dense(a) + b_2 = dense(b) + _ = new_dense.input_shape + with self.assertRaises(AttributeError): + new_dense = layers.Dense(16) + a = input_layer_lib.Input(shape=(3, 32)) + a = input_layer_lib.Input(shape=(5, 32)) + a_2 = dense(a) + b_2 = dense(b) + _ = new_dense.output_shape + + def _assertAllIs(self, a, b): + self.assertTrue(all(x is y for x, y in zip(a, b))) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTopologicalAttributesMultiOutputLayer(self): + class PowersLayer(layers.Layer): + def call(self, inputs): + return [inputs**2, inputs**3] + + x = input_layer_lib.Input(shape=(32,)) + test_layer = PowersLayer() + p1, p2 = test_layer(x) + + self.assertIs(test_layer.input, x) + self._assertAllIs(test_layer.output, [p1, p2]) + self.assertEqual(test_layer.input_shape, (None, 32)) + self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTopologicalAttributesMultiInputLayer(self): + class AddLayer(layers.Layer): + def call(self, inputs): + assert len(inputs) == 2 + return inputs[0] + inputs[1] + + a = input_layer_lib.Input(shape=(32,)) + b = input_layer_lib.Input(shape=(32,)) + test_layer = AddLayer() + y = test_layer([a, b]) + + self._assertAllIs(test_layer.input, [a, b]) + self.assertIs(test_layer.output, y) + self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)]) + self.assertEqual(test_layer.output_shape, (None, 32)) + + def testBasicNetwork(self): + with tf.Graph().as_default(): + # minimum viable network + x = input_layer_lib.Input(shape=(32,)) + dense = layers.Dense(2) + y = dense(x) + network = functional.Functional(x, y, name="dense_network") + + # test basic attributes + self.assertEqual(network.name, "dense_network") + self.assertEqual(len(network.layers), 2) # InputLayer + Dense + 
self.assertEqual(network.layers[1], dense) + self._assertAllIs(network.weights, dense.weights) + self._assertAllIs( + network.trainable_weights, dense.trainable_weights + ) + self._assertAllIs( + network.non_trainable_weights, dense.non_trainable_weights + ) + + # test callability on Input + x_2 = input_layer_lib.Input(shape=(32,)) + y_2 = network(x_2) + self.assertEqual(y_2.shape.as_list(), [None, 2]) + + # test callability on regular tensor + x_2 = tf.compat.v1.placeholder(dtype="float32", shape=(None, 32)) + y_2 = network(x_2) + self.assertEqual(y_2.shape.as_list(), [None, 2]) + + # test network `trainable` attribute + network.trainable = False + self._assertAllIs(network.weights, dense.weights) + self.assertEqual(network.trainable_weights, []) + self._assertAllIs( + network.non_trainable_weights, + dense.trainable_weights + dense.non_trainable_weights, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_trainable_weights(self): + a = layers.Input(shape=(2,)) + b = layers.Dense(1)(a) + model = training_lib.Model(a, b) + + weights = model.weights + self._assertAllIs(model.trainable_weights, weights) + self.assertListEqual(model.non_trainable_weights, []) + + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + self._assertAllIs(model.non_trainable_weights, weights) + + model.trainable = True + self._assertAllIs(model.trainable_weights, weights) + self.assertListEqual(model.non_trainable_weights, []) + + model.layers[1].trainable = False + self.assertListEqual(model.trainable_weights, []) + self._assertAllIs(model.non_trainable_weights, weights) + + # sequential model + model = sequential.Sequential() + model.add(layers.Dense(1, input_dim=2)) + weights = model.weights + + self._assertAllIs(model.trainable_weights, weights) + self.assertListEqual(model.non_trainable_weights, []) + + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + self._assertAllIs(model.non_trainable_weights, weights) + + model.trainable = True + self._assertAllIs(model.trainable_weights, weights) + self.assertListEqual(model.non_trainable_weights, []) + + model.layers[0].trainable = False + self.assertListEqual(model.trainable_weights, []) + self._assertAllIs(model.non_trainable_weights, weights) + + def test_layer_call_arguments(self): + with tf.Graph().as_default(): + # Test the ability to pass and serialize arguments to `call`. 
+ inp = layers.Input(shape=(2,)) + x = layers.Dense(3)(inp) + x = layers.Dropout(0.5)(x, training=True) + model = training_lib.Model(inp, x) + # Would be `dropout/cond/Merge` by default + self.assertIn("dropout", model.output.op.name) + + # Test that argument is kept when applying the model + inp2 = layers.Input(shape=(2,)) + out2 = model(inp2) + self.assertIn("dropout", out2.op.name) + + # Test that argument is kept after loading a model + config = model.get_config() + model = training_lib.Model.from_config(config) + self.assertIn("dropout", model.output.op.name) + + def test_node_construction(self): + # test basics + a = layers.Input(shape=(32,), name="input_a") + b = layers.Input(shape=(32,), name="input_b") + + with self.assertRaises(ValueError): + _ = layers.Input(shape=(32,), batch_shape=(10, 32)) + with self.assertRaises(ValueError): + _ = layers.Input(shape=(32,), unknown_kwarg=None) + + self.assertListEqual(a.shape.as_list(), [None, 32]) + a_layer, a_node_index, a_tensor_index = a._keras_history + b_layer, _, _ = b._keras_history + self.assertEqual(len(a_layer._inbound_nodes), 1) + self.assertEqual(a_tensor_index, 0) + node = a_layer._inbound_nodes[a_node_index] + self.assertEqual(node.outbound_layer, a_layer) + + self.assertListEqual(node.inbound_layers, []) + self.assertListEqual(node.input_tensors, [a]) + self.assertListEqual(node.input_shapes, [(None, 32)]) + self.assertListEqual(node.output_tensors, [a]) + self.assertListEqual(node.output_shapes, [(None, 32)]) + + dense = layers.Dense(16, name="dense_1") + a_2 = dense(a) + b_2 = dense(b) + + self.assertEqual(len(dense._inbound_nodes), 2) + self.assertEqual(len(dense._outbound_nodes), 0) + self.assertEqual(dense._inbound_nodes[0].inbound_layers, a_layer) + self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense) + self.assertEqual(dense._inbound_nodes[1].inbound_layers, b_layer) + self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense) + self.assertIs(dense._inbound_nodes[0].input_tensors, a) + self.assertIs(dense._inbound_nodes[1].input_tensors, b) + + # test layer properties + test_layer = layers.Dense(16, name="test_layer") + a_test = test_layer(a) + self.assertListEqual(test_layer.kernel.shape.as_list(), [32, 16]) + self.assertIs(test_layer.input, a) + self.assertIs(test_layer.output, a_test) + self.assertEqual(test_layer.input_shape, (None, 32)) + self.assertEqual(test_layer.output_shape, (None, 16)) + + self.assertIs(dense.get_input_at(0), a) + self.assertIs(dense.get_input_at(1), b) + self.assertIs(dense.get_output_at(0), a_2) + self.assertIs(dense.get_output_at(1), b_2) + self.assertEqual(dense.get_input_shape_at(0), (None, 32)) + self.assertEqual(dense.get_input_shape_at(1), (None, 32)) + self.assertEqual(dense.get_output_shape_at(0), (None, 16)) + self.assertEqual(dense.get_output_shape_at(1), (None, 16)) + self.assertEqual(dense.get_input_mask_at(0), None) + self.assertEqual(dense.get_input_mask_at(1), None) + self.assertEqual(dense.get_output_mask_at(0), None) + self.assertEqual(dense.get_output_mask_at(1), None) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_multi_input_layer(self): + with self.cached_session(): + # test multi-input layer + a = layers.Input(shape=(32,), name="input_a") + b = layers.Input(shape=(32,), name="input_b") + + dense = layers.Dense(16, name="dense_1") + a_2 = dense(a) + b_2 = dense(b) + + merged = layers.concatenate([a_2, b_2], name="merge") + self.assertListEqual(merged.shape.as_list(), [None, 16 * 2]) + ( + 
merge_layer, + merge_node_index, + merge_tensor_index, + ) = merged._keras_history + + self.assertEqual(merge_node_index, 0) + self.assertEqual(merge_tensor_index, 0) + + self.assertEqual(len(merge_layer._inbound_nodes), 1) + self.assertEqual(len(merge_layer._outbound_nodes), 0) + + self.assertEqual( + len(merge_layer._inbound_nodes[0].input_tensors), 2 + ) + self.assertEqual( + len(merge_layer._inbound_nodes[0].inbound_layers), 2 + ) + + c = layers.Dense(64, name="dense_2")(merged) + d = layers.Dense(5, name="dense_3")(c) + + model = training_lib.Model( + inputs=[a, b], outputs=[c, d], name="model" + ) + self.assertEqual(len(model.layers), 6) + output_shapes = model.compute_output_shape([(None, 32), (None, 32)]) + self.assertListEqual(output_shapes[0].as_list(), [None, 64]) + self.assertListEqual(output_shapes[1].as_list(), [None, 5]) + self.assertListEqual( + model.compute_mask([a, b], [None, None]), [None, None] + ) + + # we don't check names of first 2 layers (inputs) because + # ordering of same-level layers is not fixed + self.assertListEqual( + [l.name for l in model.layers][2:], + ["dense_1", "merge", "dense_2", "dense_3"], + ) + self.assertListEqual( + [l.name for l in model._input_layers], ["input_a", "input_b"] + ) + self.assertListEqual( + [l.name for l in model._output_layers], ["dense_2", "dense_3"] + ) + + # actually run model + fn = backend.function(model.inputs, model.outputs) + input_a_np = np.random.random((10, 32)) + input_b_np = np.random.random((10, 32)) + fn_outputs = fn([input_a_np, input_b_np]) + self.assertListEqual( + [x.shape for x in fn_outputs], [(10, 64), (10, 5)] + ) + + # test get_source_inputs + self._assertAllIs(layer_utils.get_source_inputs(c), [a, b]) + + # serialization / deserialization + json_config = model.to_json() + recreated_model = models.model_from_json(json_config) + recreated_model.compile("rmsprop", "mse") + + self.assertListEqual( + [l.name for l in recreated_model.layers][2:], + ["dense_1", "merge", "dense_2", "dense_3"], + ) + self.assertListEqual( + [l.name for l in recreated_model._input_layers], + ["input_a", "input_b"], + ) + self.assertListEqual( + [l.name for l in recreated_model._output_layers], + ["dense_2", "dense_3"], + ) + + fn = backend.function( + recreated_model.inputs, recreated_model.outputs + ) + input_a_np = np.random.random((10, 32)) + input_b_np = np.random.random((10, 32)) + fn_outputs = fn([input_a_np, input_b_np]) + self.assertListEqual( + [x.shape for x in fn_outputs], [(10, 64), (10, 5)] + ) + + def test_multi_output_layer_output_names(self): + inp = layers.Input(name="inp", shape=(None,), dtype=tf.float32) + + class _MultiOutput(layers.Layer): + def call(self, x): + return x + 1.0, x + 2.0 + + out = _MultiOutput(name="out")(inp) + model = training_lib.Model(inp, out) + self.assertEqual(["out", "out_1"], model.output_names) + self.assertAllClose([2.0, 3.0], model(1.0)) + + def test_recursion(self): + with tf.Graph().as_default(), self.cached_session(): + a = layers.Input(shape=(32,), name="input_a") + b = layers.Input(shape=(32,), name="input_b") + + dense = layers.Dense(16, name="dense_1") + a_2 = dense(a) + b_2 = dense(b) + merged = layers.concatenate([a_2, b_2], name="merge") + c = layers.Dense(64, name="dense_2")(merged) + d = layers.Dense(5, name="dense_3")(c) + + model = training_lib.Model( + inputs=[a, b], outputs=[c, d], name="model" + ) + + e = layers.Input(shape=(32,), name="input_e") + f = layers.Input(shape=(32,), name="input_f") + self.assertEqual(len(model.inputs), 2) + g, h = model([e, f]) + 
self.assertEqual(len(model.inputs), 2) + self.assertEqual(g.name, "model/dense_2/BiasAdd:0") + + self.assertListEqual(g.shape.as_list(), c.shape.as_list()) + self.assertListEqual(h.shape.as_list(), d.shape.as_list()) + + # test separate manipulation of different layer outputs + i = layers.Dense(7, name="dense_4")(h) + + final_model = training_lib.Model( + inputs=[e, f], outputs=[i, g], name="final" + ) + self.assertEqual(len(final_model.inputs), 2) + self.assertEqual(len(final_model.outputs), 2) + self.assertEqual(len(final_model.layers), 4) + + # we don't check names of first 2 layers (inputs) because + # ordering of same-level layers is not fixed + self.assertListEqual( + [layer.name for layer in final_model.layers][2:], + ["model", "dense_4"], + ) + self.assertListEqual( + model.compute_mask([e, f], [None, None]), [None, None] + ) + self.assertListEqual( + final_model.compute_output_shape([(10, 32), (10, 32)]), + [(10, 7), (10, 64)], + ) + + # run recursive model + fn = backend.function(final_model.inputs, final_model.outputs) + input_a_np = np.random.random((10, 32)) + input_b_np = np.random.random((10, 32)) + fn_outputs = fn([input_a_np, input_b_np]) + self.assertListEqual( + [x.shape for x in fn_outputs], [(10, 7), (10, 64)] + ) + + # test serialization + model_config = final_model.get_config() + recreated_model = models.Model.from_config(model_config) + + fn = backend.function( + recreated_model.inputs, recreated_model.outputs + ) + input_a_np = np.random.random((10, 32)) + input_b_np = np.random.random((10, 32)) + fn_outputs = fn([input_a_np, input_b_np]) + self.assertListEqual( + [x.shape for x in fn_outputs], [(10, 7), (10, 64)] + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_multi_input_multi_output_recursion(self): + with self.cached_session(): + # test multi-input multi-output + a = layers.Input(shape=(32,), name="input_a") + b = layers.Input(shape=(32,), name="input_b") + + dense = layers.Dense(16, name="dense_1") + a_2 = dense(a) + b_2 = dense(b) + merged = layers.concatenate([a_2, b_2], name="merge") + c = layers.Dense(64, name="dense_2")(merged) + d = layers.Dense(5, name="dense_3")(c) + + model = training_lib.Model( + inputs=[a, b], outputs=[c, d], name="model" + ) + + j = layers.Input(shape=(32,), name="input_j") + k = layers.Input(shape=(32,), name="input_k") + _, n = model([j, k]) + + o = layers.Input(shape=(32,), name="input_o") + p = layers.Input(shape=(32,), name="input_p") + q, _ = model([o, p]) + + self.assertListEqual(n.shape.as_list(), [None, 5]) + self.assertListEqual(q.shape.as_list(), [None, 64]) + s = layers.concatenate([n, q], name="merge_nq") + self.assertListEqual(s.shape.as_list(), [None, 64 + 5]) + + # test with single output as 1-elem list + multi_io_model = training_lib.Model([j, k, o, p], [s]) + + fn = backend.function(multi_io_model.inputs, multi_io_model.outputs) + fn_outputs = fn( + [ + np.random.random((10, 32)), + np.random.random((10, 32)), + np.random.random((10, 32)), + np.random.random((10, 32)), + ] + ) + self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)]) + + # test with single output as tensor + multi_io_model = training_lib.Model([j, k, o, p], s) + + fn = backend.function(multi_io_model.inputs, multi_io_model.outputs) + fn_outputs = fn( + [ + np.random.random((10, 32)), + np.random.random((10, 32)), + np.random.random((10, 32)), + np.random.random((10, 32)), + ] + ) + # note that the output of the function will still be a 1-elem list + self.assertListEqual([x.shape for 
x in fn_outputs], [(10, 69)]) + + # test serialization + model_config = multi_io_model.get_config() + recreated_model = models.Model.from_config(model_config) + + fn = backend.function( + recreated_model.inputs, recreated_model.outputs + ) + fn_outputs = fn( + [ + np.random.random((10, 32)), + np.random.random((10, 32)), + np.random.random((10, 32)), + np.random.random((10, 32)), + ] + ) + # note that the output of the function will still be a 1-elem list + self.assertListEqual([x.shape for x in fn_outputs], [(10, 69)]) + + config = model.get_config() + models.Model.from_config(config) + + model.summary() + json_str = model.to_json() + models.model_from_json(json_str) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_invalid_graphs(self): + a = layers.Input(shape=(32,), name="input_a") + b = layers.Input(shape=(32,), name="input_b") + + dense = layers.Dense(16, name="dense_1") + a_2 = dense(a) + b_2 = dense(b) + merged = layers.concatenate([a_2, b_2], name="merge") + c = layers.Dense(64, name="dense_2")(merged) + d = layers.Dense(5, name="dense_3")(c) + + model = training_lib.Model(inputs=[a, b], outputs=[c, d], name="model") + + # disconnected graph + j = layers.Input(shape=(32,), name="input_j") + k = layers.Input(shape=(32,), name="input_k") + m, n = model([j, k]) + with self.assertRaises(Exception): + training_lib.Model([j], [m, n]) + + # redundant outputs + j = layers.Input(shape=(32,), name="input_j") + k = layers.Input(shape=(32,), name="input_k") + m, n = model([j, k]) + + training_lib.Model([j, k], [m, n, n]) + + # redundant inputs + j = layers.Input(shape=(32,), name="input_j") + k = layers.Input(shape=(32,), name="input_k") + m, n = model([j, k]) + with self.assertRaises(Exception): + training_lib.Model([j, k, j], [m, n]) + + # I have no idea what I'm doing: garbage as inputs/outputs + j = layers.Input(shape=(32,), name="input_j") + k = layers.Input(shape=(32,), name="input_k") + m, n = model([j, k]) + with self.assertRaises(Exception): + training_lib.Model([j, k], [m, n, 0]) + + def test_raw_tf_compatibility(self): + with tf.Graph().as_default(): + # test calling layers/models on TF tensors + a = layers.Input(shape=(32,), name="input_a") + b = layers.Input(shape=(32,), name="input_b") + + dense = layers.Dense(16, name="dense_1") + a_2 = dense(a) + b_2 = dense(b) + merged = layers.concatenate([a_2, b_2], name="merge") + c = layers.Dense(64, name="dense_2")(merged) + d = layers.Dense(5, name="dense_3")(c) + + model = training_lib.Model( + inputs=[a, b], outputs=[c, d], name="model" + ) + + j = layers.Input(shape=(32,), name="input_j") + k = layers.Input(shape=(32,), name="input_k") + self.assertEqual(len(model.inputs), 2) + m, n = model([j, k]) + self.assertEqual(len(model.inputs), 2) + tf_model = training_lib.Model([j, k], [m, n]) + + j_tf = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 32)) + k_tf = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 32)) + m_tf, n_tf = tf_model([j_tf, k_tf]) + self.assertListEqual(m_tf.shape.as_list(), [None, 64]) + self.assertListEqual(n_tf.shape.as_list(), [None, 5]) + + # test merge + layers.concatenate([j_tf, k_tf], axis=1) + layers.add([j_tf, k_tf]) + + # test tensor input + x = tf.compat.v1.placeholder(shape=(None, 2), dtype=tf.float32) + layers.InputLayer(input_tensor=x) + + x = layers.Input(tensor=x) + layers.Dense(2)(x) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_basic_masking(self): + a = layers.Input(shape=(10,
32), name="input_a") + b = layers.Masking()(a) + model = training_lib.Model(a, b) + self.assertEqual(model.output_mask.shape.as_list(), [None, 10]) + + def testMaskingSingleInput(self): + class MaskedLayer(layers.Layer): + def call(self, inputs, mask=None): + if mask is not None: + return inputs * mask + return inputs + + def compute_mask(self, inputs, mask=None): + return tf.ones_like(inputs) + + if tf.executing_eagerly(): + a = tf.constant([2] * 32) + mask = tf.constant([0, 1] * 16) + a._keras_mask = mask + b = MaskedLayer()(a) + self.assertTrue(hasattr(b, "_keras_mask")) + self.assertAllEqual( + self.evaluate(tf.ones_like(mask)), + self.evaluate(getattr(b, "_keras_mask")), + ) + self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b)) + else: + x = input_layer_lib.Input(shape=(32,)) + y = MaskedLayer()(x) + network = functional.Functional(x, y) + + # test callability on Input + x_2 = input_layer_lib.Input(shape=(32,)) + y_2 = network(x_2) + self.assertEqual(y_2.shape.as_list(), [None, 32]) + + # test callability on regular tensor + x_2 = tf.compat.v1.placeholder(dtype="float32", shape=(None, 32)) + y_2 = network(x_2) + self.assertEqual(y_2.shape.as_list(), [None, 32]) + + def test_activity_regularization_with_model_composition(self): + def reg(x): + return tf.reduce_sum(x) + + net_a_input = input_layer_lib.Input((2,)) + net_a = net_a_input + net_a = layers.Dense( + 2, + kernel_initializer="ones", + use_bias=False, + activity_regularizer=reg, + )(net_a) + model_a = training_lib.Model([net_a_input], [net_a]) + + net_b_input = input_layer_lib.Input((2,)) + net_b = model_a(net_b_input) + model_b = training_lib.Model([net_b_input], [net_b]) + + model_b.compile(optimizer="sgd", loss=None) + x = np.ones((1, 2)) + loss = model_b.evaluate(x) + self.assertEqual(loss, 4.0) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_layer_sharing_at_heterogenous_depth(self): + x_val = np.random.random((10, 5)) + + x = input_layer_lib.Input(shape=(5,)) + a = layers.Dense(5, name="A") + b = layers.Dense(5, name="B") + output = a(b(a(b(x)))) + m = training_lib.Model(x, output) + m.run_eagerly = test_utils.should_run_eagerly() + + output_val = m.predict(x_val) + + config = m.get_config() + weights = m.get_weights() + + m2 = models.Model.from_config(config) + m2.set_weights(weights) + + output_val_2 = m2.predict(x_val) + self.assertAllClose(output_val, output_val_2, atol=1e-6) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_layer_sharing_at_heterogenous_depth_with_concat(self): + input_shape = (16, 9, 3) + input_layer = input_layer_lib.Input(shape=input_shape) + + a = layers.Dense(3, name="dense_A") + b = layers.Dense(3, name="dense_B") + c = layers.Dense(3, name="dense_C") + + x1 = b(a(input_layer)) + x2 = a(c(input_layer)) + output = layers.concatenate([x1, x2]) + + m = training_lib.Model(inputs=input_layer, outputs=output) + m.run_eagerly = test_utils.should_run_eagerly() + + x_val = np.random.random((10, 16, 9, 3)) + output_val = m.predict(x_val) + + config = m.get_config() + weights = m.get_weights() + + m2 = models.Model.from_config(config) + m2.set_weights(weights) + + output_val_2 = m2.predict(x_val) + self.assertAllClose(output_val, output_val_2, atol=1e-6) + + def test_layer_sharing_maintains_node_order(self): + # See https://github.com/keras-team/keras/issues/14838. 
+ inp = input_layer_lib.Input(shape=[5], name="main_input") + + shared_layer = layers.Layer(name="shared") + + ones_result = shared_layer(tf.ones_like(inp)) + zeros_result = shared_layer(tf.zeros_like(inp)) + zeros_result = layers.Layer(name="blank")(zeros_result) + + m = training_lib.Model( + inputs=[inp], outputs=[zeros_result, ones_result] + ) + m2 = models.Model.from_config(m.get_config()) + self.assertAllClose( + m2.predict_on_batch(tf.zeros([1, 5])), + m.predict_on_batch(tf.zeros([1, 5])), + ) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_explicit_training_argument(self): + a = layers.Input(shape=(2,)) + b = layers.Dropout(0.5)(a) + base_model = training_lib.Model(a, b) + + a = layers.Input(shape=(2,)) + b = base_model(a, training=False) + model = training_lib.Model(a, b) + + x = np.ones((100, 2)) + y = np.ones((100, 2)) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + loss = model.train_on_batch(x, y) + self.assertEqual( + loss, 0 + ) # In inference mode, output is equal to input. + + a = layers.Input(shape=(2,)) + b = base_model(a, training=True) + model = training_lib.Model(a, b) + preds = model.predict(x) + self.assertEqual(np.min(preds), 0.0) # At least one unit was dropped. + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_mask_derived_from_keras_layer(self): + inputs = input_layer_lib.Input((5, 10)) + mask = input_layer_lib.Input((5,)) + outputs = layers.RNN(layers.LSTMCell(100))(inputs, mask=mask) + model = training_lib.Model([inputs, mask], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 5, 10)), np.zeros((10, 5))], + y=np.zeros((10, 100)), + batch_size=2, + ) + # All data is masked, returned values are 0's. + self.assertEqual(history.history["loss"][0], 0.0) + history = model.fit( + x=[np.ones((10, 5, 10)), np.ones((10, 5))], + y=np.zeros((10, 100)), + batch_size=2, + ) + # Data is not masked, returned values are random. + self.assertGreater(history.history["loss"][0], 0.0) + + model = training_lib.Model.from_config(model.get_config()) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 5, 10)), np.zeros((10, 5))], + y=np.zeros((10, 100)), + batch_size=2, + ) + # All data is masked, returned values are 0's. + self.assertEqual(history.history["loss"][0], 0.0) + history = model.fit( + x=[np.ones((10, 5, 10)), np.ones((10, 5))], + y=np.zeros((10, 100)), + batch_size=2, + ) + # Data is not masked, returned values are random. + self.assertGreater(history.history["loss"][0], 0.0) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_call_arg_derived_from_keras_layer(self): + class MyAdd(layers.Layer): + def call(self, x1, x2): + return x1 + x2 + + input1 = input_layer_lib.Input(10) + input2 = input_layer_lib.Input(10) + outputs = MyAdd()(input1, input2) + model = training_lib.Model([input1, input2], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2, + ) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + # Check serialization. 
+ model = training_lib.Model.from_config( + model.get_config(), custom_objects={"MyAdd": MyAdd} + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2, + ) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate( + test_combinations.keras_mode_combinations(mode="eager"), + ) + def test_only_some_in_first_arg_derived_from_keras_layer_keras_tensors( + self, + ): + # This functionality is unsupported in v1 graphs + + class MyAddAll(layers.Layer): + def call(self, inputs): + x = inputs[0] + for inp in inputs[1:]: + if inp is not None: + x = x + inp + return x + + input1 = input_layer_lib.Input(10) + input2 = input_layer_lib.Input(10) + layer = MyAddAll() + outputs = layer([0.0, input1, None, input2, None]) + model = training_lib.Model([input1, input2], outputs) + self.assertIn(layer, model.layers) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2, + ) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + # Check serialization. + model = training_lib.Model.from_config( + model.get_config(), custom_objects={"MyAddAll": MyAddAll} + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2, + ) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate( + test_combinations.times( + test_combinations.keras_mode_combinations(), + test_combinations.combine(share_already_used_layer=[True, False]), + ) + ) + def test_call_kwarg_derived_from_keras_layer( + self, share_already_used_layer + ): + class MaybeAdd(layers.Layer): + def call(self, x1, x2=None): + if x2 is not None: + return x1 + x2 + return x1 + + class IdentityLayer(layers.Layer): + def call(self, x): + return x + + input1 = input_layer_lib.Input(10) + input2 = input_layer_lib.Input(10) + identity_layer = IdentityLayer() + + if share_already_used_layer: + # We have had model serialization/deserialization break in the past: + # when a layer was previously used to construct other functional + # models and had a non-empty list of inbound nodes before being used + # to define the model being serialized/deserialized. (The + # serialization/deserialization was not correctly adjusting the + # node_index serialization/deserialization). So, we explicitly test + # this case. + training_lib.Model([input1], identity_layer(input1)) + + outputs = MaybeAdd()(input1, x2=identity_layer(input2)) + model = training_lib.Model([input1, input2], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2, + ) + # Check that second input was correctly added to first. 
+ self.assertEqual(history.history["loss"][0], 0.0) + + model = training_lib.Model.from_config( + model.get_config(), + custom_objects={ + "MaybeAdd": MaybeAdd, + "IdentityLayer": IdentityLayer, + }, + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2, + ) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_call_kwarg_dtype_serialization(self): + class Double(layers.Layer): + def call(self, x1, dtype=None): + return tf.cast(x1 + x1, dtype=dtype) + + input1 = input_layer_lib.Input(10) + outputs = Double()(input1, dtype=tf.float16) + model = training_lib.Model([input1], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10))], y=6 * np.ones((10, 10)), batch_size=2 + ) + # Check that input was correctly doubled. + self.assertEqual(history.history["loss"][0], 0.0) + + # Check the output dtype + self.assertEqual(model(tf.ones((3, 10))).dtype, tf.float16) + + model = training_lib.Model.from_config( + model.get_config(), custom_objects={"Double": Double} + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10))], y=6 * np.ones((10, 10)), batch_size=2 + ) + # Check that input was correctly doubled. + self.assertEqual(history.history["loss"][0], 0.0) + + # Check the output dtype + self.assertEqual(model(tf.ones((3, 10))).dtype, tf.float16) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_call_kwarg_nonserializable(self): + class Double(layers.Layer): + def call(self, x1, kwarg=None): + return x1 + x1 + + class NonSerializable: + def __init__(self, foo=None): + self.foo = foo + + input1 = input_layer_lib.Input(10) + outputs = Double()(input1, kwarg=NonSerializable()) + model = training_lib.Model([input1], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[3 * np.ones((10, 10))], y=6 * np.ones((10, 10)), batch_size=2 + ) + # Check that input was correctly doubled. + self.assertEqual(history.history["loss"][0], 0.0) + with self.assertRaisesRegex( + TypeError, + "Layer double was passed non-JSON-serializable arguments.", + ): + model.get_config() + + @test_combinations.generate( + test_combinations.times( + test_combinations.keras_mode_combinations(), + test_combinations.combine(share_already_used_layer=[True, False]), + ) + ) + def test_call_kwarg_derived_from_keras_layer_and_first_arg_is_constant( + self, share_already_used_layer + ): + class IdentityLayer(layers.Layer): + def call(self, x): + return x + + class MaybeAdd(layers.Layer): + def call(self, x1, x2=None): + if x2 is not None: + return x1 + x2 + return x1 + + input2 = input_layer_lib.Input(10) + identity_layer = IdentityLayer() + if share_already_used_layer: + # We have had model serialization/deserialization break in the past: + # when a layer was previously used to construct other functional + # models and had a non-empty list of inbound nodes before being used + # to define the model being serialized/deserialized. (The + # serialization/deserialization was not correctly adjusting the + # node_index serialization/deserialization). So, we explicitly test + # this case. 
+ training_lib.Model([input2], identity_layer(input2)) + + outputs = MaybeAdd()(3.0, x2=identity_layer(input2)) + model = training_lib.Model([input2], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=7 * np.ones((10, 10)), y=10 * np.ones((10, 10)), batch_size=2 + ) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + model = training_lib.Model.from_config( + model.get_config(), + custom_objects={ + "MaybeAdd": MaybeAdd, + "IdentityLayer": IdentityLayer, + }, + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=7 * np.ones((10, 10)), y=10 * np.ones((10, 10)), batch_size=2 + ) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_dont_cast_composite_unless_necessary(self): + if not tf.executing_eagerly(): + # Creating Keras inputs from a type_spec only supported in eager. + return + + # TODO(edloper): Change this to tf.experimental.ExtensionType once + # it's been released. + class MyType(extension_type.ExtensionType): + # TODO(edloper) Remove _shape and _dtype once Keras has been + # switched to use .shape and .dtype instead. + value: tf.Tensor + _shape = property(lambda self: self.value.shape) + shape = property(lambda self: self.value.shape) + _dtype = property(lambda self: self.value.dtype) + dtype = property(lambda self: self.value.dtype) + + class Spec: + _shape = property(lambda self: self.value.shape) + shape = property(lambda self: self.value.shape) + _dtype = property(lambda self: self.value.dtype) + dtype = property(lambda self: self.value.dtype) + + my_spec = MyType.Spec(tf.TensorSpec([5], tf.float32)) + input1 = input_layer_lib.Input(type_spec=my_spec) + model = training_lib.Model([input1], input1) + model.compile(run_eagerly=test_utils.should_run_eagerly()) + model(MyType([1.0, 2.0, 3.0, 4.0, 5.0])) # Does not require cast. + with self.assertRaises((ValueError, TypeError)): + model(MyType([1, 2, 3, 4, 5])) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_composite_call_kwarg_derived_from_keras_layer(self): + + # Create a test layer that accepts composite tensor inputs. + class MaybeAdd(layers.Layer): + def call(self, x1, x2=None): + # We need to convert this to a tensor for loss calculations - + # losses don't play nicely with ragged tensors yet. + if x2 is not None: + return (x1 + x2).to_tensor(default_value=0) + return x1.to_tensor(default_value=0) + + input1 = input_layer_lib.Input((None,), ragged=True) + input2 = input_layer_lib.Input((None,), ragged=True) + outputs = MaybeAdd()(input1, x2=input2) + model = training_lib.Model([input1, input2], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + input_data = [ + tf.ragged.constant([[3.0, 3.0], [3.0, 3.0], [3.0]]), + tf.ragged.constant([[7.0, 7.0], [7.0, 7.0], [7.0]]), + ] + expected_data = np.array([[10.0, 10.0], [10.0, 10.0], [10.0, 0.0]]) + + history = model.fit(x=input_data, y=expected_data) + # Check that second input was correctly added to first.
+ self.assertEqual(history.history["loss"][0], 0.0) + + model = training_lib.Model.from_config( + model.get_config(), custom_objects={"MaybeAdd": MaybeAdd} + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit(x=input_data, y=expected_data) + # Check that second input was correctly added to first. + self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate( + test_combinations.keras_mode_combinations(mode="eager") + ) + def test_call_some_not_all_nested_in_first_arg_derived_from_keras_layer( + self, + ): + # This functionality is unsupported in v1 graphs + + class AddAll(layers.Layer): + def call(self, x1_x2, x3): + x1, x2 = x1_x2 + out = x1 + x2 + if x3 is not None: + for t in x3.values(): + out += t + return out + + input1 = input_layer_lib.Input(10) + input2 = input_layer_lib.Input(10) + input3 = input_layer_lib.Input(10) + + layer = AddAll() + outputs = layer( + [input1, 4 * tf.ones((1, 10))], + x3={"a": input2, "b": input3, "c": 5 * tf.ones((1, 10))}, + ) + model = training_lib.Model([input1, input2, input3], outputs) + self.assertIn(layer, model.layers) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], + y=15 * np.ones((10, 10)), + batch_size=2, + ) + # Check that all inputs were correctly added. + self.assertEqual(history.history["loss"][0], 0.0) + + model = training_lib.Model.from_config( + model.get_config(), custom_objects={"AddAll": AddAll} + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], + y=15 * np.ones((10, 10)), + batch_size=2, + ) + # Check that all inputs were correctly added. + self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_call_nested_arg_derived_from_keras_layer(self): + class AddAll(layers.Layer): + def call(self, x1, x2, x3=None): + out = x1 + x2 + if x3 is not None: + for t in x3.values(): + out += t + return out + + input1 = input_layer_lib.Input(10) + input2 = input_layer_lib.Input(10) + input3 = input_layer_lib.Input(10) + outputs = AddAll()( + input1, + 4 * tf.ones((1, 10)), + x3={"a": input2, "b": input3, "c": 5 * tf.ones((1, 10))}, + ) + model = training_lib.Model([input1, input2, input3], outputs) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], + y=15 * np.ones((10, 10)), + batch_size=2, + ) + # Check that all inputs were correctly added. + self.assertEqual(history.history["loss"][0], 0.0) + + model = training_lib.Model.from_config( + model.get_config(), custom_objects={"AddAll": AddAll} + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], + y=15 * np.ones((10, 10)), + batch_size=2, + ) + # Check that all inputs were correctly added. 
+ self.assertEqual(history.history["loss"][0], 0.0) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_multi_output_model_with_none_masking(self): + def func(x): + return [x * 0.2, x * 0.3] + + def output_shape(input_shape): + return [input_shape, input_shape] + + i = layers.Input(shape=(3, 2, 1)) + o = layers.Lambda(function=func, output_shape=output_shape)(i) + + self.assertEqual(backend.int_shape(o[0]), (None, 3, 2, 1)) + self.assertEqual(backend.int_shape(o[1]), (None, 3, 2, 1)) + + o = layers.add(o) + model = training_lib.Model(i, o) + model.run_eagerly = test_utils.should_run_eagerly() + + i2 = layers.Input(shape=(3, 2, 1)) + o2 = model(i2) + model2 = training_lib.Model(i2, o2) + model2.run_eagerly = test_utils.should_run_eagerly() + + x = np.random.random((4, 3, 2, 1)) + out = model2.predict(x) + assert out.shape == (4, 3, 2, 1) + self.assertAllClose(out, x * 0.2 + x * 0.3, atol=1e-4) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_constant_initializer_with_numpy(self): + initializer = tf.compat.v1.constant_initializer(np.ones((3, 2))) + model = sequential.Sequential() + model.add( + layers.Dense(2, input_shape=(3,), kernel_initializer=initializer) + ) + model.add(layers.Dense(3)) + model.compile( + loss="mse", + optimizer="sgd", + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + json_str = model.to_json() + models.model_from_json(json_str) + + def test_subclassed_error_if_init_not_called(self): + class MyNetwork(training_lib.Model): + def __init__(self): + self._foo = [layers.Dense(10), layers.Dense(10)] + + with self.assertRaisesRegex(RuntimeError, "forgot to call"): + MyNetwork() + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_int_input_shape(self): inputs = input_layer_lib.Input(10) - outputs = layers.Dense(1)(inputs) - model = functional.Functional(inputs, outputs) - d['model'] = model - else: - model = d['model'] - - return model(x) - - x = tf.ones((10, 10)) - y = fn(x) - self.assertEqual(y.shape.as_list(), [10, 1]) - - def test_save_spec(self): - """Tests that functional model generates the correct save spec.""" - - class MultiInputModel(training_lib.Model): - - def call(self, x, y): - return x - - inp = input_layer_lib.Input(shape=(1,)) - inp2 = input_layer_lib.Input(shape=(1,), batch_size=5, dtype=tf.int32) - out = MultiInputModel()(inp, inp2) - m = training_lib.Model(inputs={'x': inp, 'y': inp2}, outputs=out) - input_spec = m.save_spec(dynamic_batch=False)[0][0] - self.assertIn('x', input_spec) - self.assertIn('y', input_spec) - self.assertAllEqual([None, 1], input_spec['x'].shape.as_list()) - self.assertAllEqual(tf.float32, input_spec['x'].dtype) - self.assertAllEqual([5, 1], input_spec['y'].shape.as_list()) - self.assertAllEqual(tf.int32, input_spec['y'].dtype) + self.assertEqual([None, 10], inputs.shape.as_list()) + + inputs_with_batch = input_layer_lib.Input(batch_size=20, shape=5) + self.assertEqual([20, 5], inputs_with_batch.shape.as_list()) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_model_initialization(self): + # Functional model + inputs = input_layer_lib.Input(shape=(32,)) + outputs = layers.Dense(4)(inputs) + + with self.assertRaisesRegex( + TypeError, "Keyword argument not understood" + ): + model = training_lib.Model( + inputs, outputs, name="m", trainable=False, dtype="int64" + ) + with self.assertRaisesRegex( + TypeError, "Keyword argument not 
understood" + ): + model = training_lib.Model( + inputs, outputs, name="m", trainable=False, dynamic=False + ) + + model = training_lib.Model(inputs, outputs, name="m", trainable=False) + self.assertEqual("m", model.name) + self.assertFalse(model.trainable) + self.assertFalse(model.dynamic) + + class SubclassModel(training_lib.Model): + pass + + # Subclassed model + model = SubclassModel( + name="subclassed", trainable=True, dtype="int64", dynamic=True + ) + self.assertEqual("subclassed", model.name) + self.assertTrue(model.dynamic) + self.assertTrue(model.trainable) + w = model.add_weight( + "w", [], initializer=tf.compat.v1.constant_initializer(1) + ) + self.assertEqual(tf.int64, w.dtype) + + def test_disconnected_inputs(self): + input_tensor1 = input_layer_lib.Input(shape=[200], name="a") + input_tensor2 = input_layer_lib.Input(shape=[10], name="b") + output_tensor1 = layers.Dense(units=10)(input_tensor1) + + net = functional.Functional( + inputs=[input_tensor1, input_tensor2], outputs=[output_tensor1] + ) + net2 = functional.Functional.from_config(net.get_config()) + self.assertLen(net2.inputs, 2) + self.assertEqual("a", net2.layers[0].name) + self.assertEqual("b", net2.layers[1].name) + + @test_combinations.generate( + test_combinations.keras_model_type_combinations() + ) + def test_dependency_tracking(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.trackable = Checkpoint() + self.assertIn("trackable", model._unconditional_dependency_names) + self.assertEqual(model.trackable, model._lookup_dependency("trackable")) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_model_construction_in_tf_function(self): + + d = {"model": None} + + @tf.function + def fn(x): + if d["model"] is None: + # Check that Functional can be built in a `tf.function`. + inputs = input_layer_lib.Input(10) + outputs = layers.Dense(1)(inputs) + model = functional.Functional(inputs, outputs) + d["model"] = model + else: + model = d["model"] + + return model(x) + + x = tf.ones((10, 10)) + y = fn(x) + self.assertEqual(y.shape.as_list(), [10, 1]) + + def test_save_spec(self): + """Tests that functional model generates the correct save spec.""" + + class MultiInputModel(training_lib.Model): + def call(self, x, y): + return x + + inp = input_layer_lib.Input(shape=(1,)) + inp2 = input_layer_lib.Input(shape=(1,), batch_size=5, dtype=tf.int32) + out = MultiInputModel()(inp, inp2) + m = training_lib.Model(inputs={"x": inp, "y": inp2}, outputs=out) + input_spec = m.save_spec(dynamic_batch=False)[0][0] + self.assertIn("x", input_spec) + self.assertIn("y", input_spec) + self.assertAllEqual([None, 1], input_spec["x"].shape.as_list()) + self.assertAllEqual(tf.float32, input_spec["x"].dtype) + self.assertAllEqual([5, 1], input_spec["y"].shape.as_list()) + self.assertAllEqual(tf.int32, input_spec["y"].dtype) + + def test_layer_ordering_checkpoint_compatibility(self): + class MLPKeras(layers.Layer): + def __init__(self, name: str) -> None: + super(MLPKeras, self).__init__(name=name) + self.layer_1 = layers.Dense( + 10, activation="relu", name=f"{name}_dense_1" + ) + self.layer_2 = layers.Dense( + 10, activation="relu", name=f"{name}_dense_2" + ) + + def call(self, inputs: tf.Tensor) -> tf.Tensor: + return self.layer_2(self.layer_1(inputs)) + + mlp_keras_1 = MLPKeras("mlp_1") + mlp_keras_2 = MLPKeras("mlp_2") + + inputs = input_layer_lib.Input((5,)) + + # Make model which is the sum of two MLPs. 
+ outputs_1 = mlp_keras_1(inputs) + mlp_keras_2(inputs) + functional_model_1 = functional.Functional( + inputs=inputs, outputs=outputs_1 + ) + + ckpt_1 = Checkpoint(model=functional_model_1) + filepath = tf.io.gfile.join(self.get_temp_dir(), "model_1_ckpt") + ckpt_path = ckpt_1.save(filepath) + + # Swap order of MLPs. + outputs_2 = mlp_keras_2(inputs) + mlp_keras_1(inputs) + functional_model_2 = functional.Functional( + inputs=inputs, outputs=outputs_2 + ) + Checkpoint(model=functional_model_2).restore( + ckpt_path + ).assert_consumed() class DeferredModeTest(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testSimpleNetworkBuilding(self): - inputs = input_layer_lib.Input(shape=(32,)) - if tf.executing_eagerly(): - self.assertEqual(inputs.dtype.name, 'float32') - self.assertEqual(inputs.shape.as_list(), [None, 32]) - - x = layers.Dense(2)(inputs) - if tf.executing_eagerly(): - self.assertEqual(x.dtype.name, 'float32') - self.assertEqual(x.shape.as_list(), [None, 2]) - - outputs = layers.Dense(4)(x) - network = functional.Functional(inputs, outputs) - self.assertIsInstance(network, functional.Functional) - - if tf.executing_eagerly(): - # It should be possible to call such a network on EagerTensors. - inputs = tf.constant( - np.random.random((10, 32)).astype('float32')) - outputs = network(inputs) - self.assertEqual(outputs.shape.as_list(), [10, 4]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testMultiIONetworkBuilding(self): - input_a = input_layer_lib.Input(shape=(32,)) - input_b = input_layer_lib.Input(shape=(16,)) - a = layers.Dense(16)(input_a) - - class AddLayer(layers.Layer): - - def call(self, inputs): - return inputs[0] + inputs[1] - - c = AddLayer()([a, input_b]) # pylint: disable=not-callable - c = layers.Dense(2)(c) - - network = functional.Functional([input_a, input_b], [a, c]) - if tf.executing_eagerly(): - a_val = tf.constant( - np.random.random((10, 32)).astype('float32')) - b_val = tf.constant( - np.random.random((10, 16)).astype('float32')) - outputs = network([a_val, b_val]) - self.assertEqual(len(outputs), 2) - self.assertEqual(outputs[0].shape.as_list(), [10, 16]) - self.assertEqual(outputs[1].shape.as_list(), [10, 2]) + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testSimpleNetworkBuilding(self): + inputs = input_layer_lib.Input(shape=(32,)) + if tf.executing_eagerly(): + self.assertEqual(inputs.dtype.name, "float32") + self.assertEqual(inputs.shape.as_list(), [None, 32]) + + x = layers.Dense(2)(inputs) + if tf.executing_eagerly(): + self.assertEqual(x.dtype.name, "float32") + self.assertEqual(x.shape.as_list(), [None, 2]) + + outputs = layers.Dense(4)(x) + network = functional.Functional(inputs, outputs) + self.assertIsInstance(network, functional.Functional) + + if tf.executing_eagerly(): + # It should be possible to call such a network on EagerTensors. 
+ inputs = tf.constant(np.random.random((10, 32)).astype("float32")) + outputs = network(inputs) + self.assertEqual(outputs.shape.as_list(), [10, 4]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testMultiIONetworkBuilding(self): + input_a = input_layer_lib.Input(shape=(32,)) + input_b = input_layer_lib.Input(shape=(16,)) + a = layers.Dense(16)(input_a) + + class AddLayer(layers.Layer): + def call(self, inputs): + return inputs[0] + inputs[1] + + c = AddLayer()([a, input_b]) + c = layers.Dense(2)(c) + + network = functional.Functional([input_a, input_b], [a, c]) + if tf.executing_eagerly(): + a_val = tf.constant(np.random.random((10, 32)).astype("float32")) + b_val = tf.constant(np.random.random((10, 16)).astype("float32")) + outputs = network([a_val, b_val]) + self.assertEqual(len(outputs), 2) + self.assertEqual(outputs[0].shape.as_list(), [10, 16]) + self.assertEqual(outputs[1].shape.as_list(), [10, 2]) class DefaultShapeInferenceBehaviorTest(test_combinations.TestCase): - - def _testShapeInference(self, model, input_shape, expected_output_shape): - input_value = np.random.random(input_shape) - output_value = model.predict(input_value) - self.assertEqual(output_value.shape, expected_output_shape) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testSingleInputCase(self): - - class LayerWithOneInput(layers.Layer): - - def build(self, input_shape): - self.w = tf.ones(shape=(3, 4)) - - def call(self, inputs): - return backend.dot(inputs, self.w) - - inputs = input_layer_lib.Input(shape=(3,)) - layer = LayerWithOneInput() - - if tf.executing_eagerly(): - self.assertEqual( - layer.compute_output_shape((None, 3)).as_list(), [None, 4]) - # As a side-effect, compute_output_shape builds the layer. - self.assertTrue(layer.built) - # We can still query the layer's compute_output_shape with compatible - # input shapes. 
- self.assertEqual( - layer.compute_output_shape((6, 3)).as_list(), [6, 4]) - - outputs = layer(inputs) - model = training_lib.Model(inputs, outputs) - self._testShapeInference(model, (2, 3), (2, 4)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testMultiInputOutputCase(self): - - class MultiInputOutputLayer(layers.Layer): - - def build(self, input_shape): - self.w = tf.ones(shape=(3, 4)) - - def call(self, inputs): - a = backend.dot(inputs[0], self.w) - b = a + inputs[1] - return [a, b] - - input_a = input_layer_lib.Input(shape=(3,)) - input_b = input_layer_lib.Input(shape=(4,)) - output_a, output_b = MultiInputOutputLayer()([input_a, input_b]) - model = training_lib.Model([input_a, input_b], [output_a, output_b]) - output_a_val, output_b_val = model.predict( - [np.random.random((2, 3)), np.random.random((2, 4))]) - self.assertEqual(output_a_val.shape, (2, 4)) - self.assertEqual(output_b_val.shape, (2, 4)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testTrainingArgument(self): - - class LayerWithTrainingArg(layers.Layer): - - def build(self, input_shape): - self.w = tf.ones(shape=(3, 4)) - - def call(self, inputs, training): - return backend.dot(inputs, self.w) - - inputs = input_layer_lib.Input(shape=(3,)) - outputs = LayerWithTrainingArg()(inputs, training=False) - model = training_lib.Model(inputs, outputs) - self._testShapeInference(model, (2, 3), (2, 4)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoneInShape(self): - - class Model(training_lib.Model): - - def __init__(self): - super().__init__() - self.conv1 = layers.Conv2D(8, 3) - self.pool = layers.GlobalAveragePooling2D() - self.fc = layers.Dense(3) - - def call(self, x): - x = self.conv1(x) - x = self.pool(x) - x = self.fc(x) - return x - - model = Model() - model.build(tf.TensorShape((None, None, None, 1))) - self.assertTrue(model.built, 'Model should be built') - self.assertTrue(model.weights, - 'Model should have its weights created as it ' - 'has been built') - sample_input = tf.ones((1, 10, 10, 1)) - output = model(sample_input) - self.assertEqual(output.shape, (1, 3)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoneInShapeWithCompoundModel(self): - - class BasicBlock(training_lib.Model): - - def __init__(self): - super().__init__() - self.conv1 = layers.Conv2D(8, 3) - self.pool = layers.GlobalAveragePooling2D() - self.dense = layers.Dense(3) - - def call(self, x): - x = self.conv1(x) - x = self.pool(x) - x = self.dense(x) - return x - - class CompoundModel(training_lib.Model): - - def __init__(self): - super().__init__() - self.block = BasicBlock() - - def call(self, x): - x = self.block(x) # pylint: disable=not-callable - return x - - model = CompoundModel() - model.build(tf.TensorShape((None, None, None, 1))) - self.assertTrue(model.built, 'Model should be built') - self.assertTrue(model.weights, - 'Model should have its weights created as it ' - 'has been built') - sample_input = tf.ones((1, 10, 10, 1)) - output = model(sample_input) # pylint: disable=not-callable - self.assertEqual(output.shape, (1, 3)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoneInShapeWithFunctionalAPI(self): - - class BasicBlock(training_lib.Model): - # Inheriting from layers.Layer since we are calling this layer - # inside a model created using functional API. 
- - def __init__(self): - super().__init__() - self.conv1 = layers.Conv2D(8, 3) - - def call(self, x): - x = self.conv1(x) - return x - - input_layer = layers.Input(shape=(None, None, 1)) - x = BasicBlock()(input_layer) - x = layers.GlobalAveragePooling2D()(x) - output_layer = layers.Dense(3)(x) - - model = training_lib.Model(inputs=input_layer, outputs=output_layer) - - model.build(tf.TensorShape((None, None, None, 1))) - self.assertTrue(model.built, 'Model should be built') - self.assertTrue(model.weights, - 'Model should have its weights created as it ' - 'has been built') - sample_input = tf.ones((1, 10, 10, 1)) - output = model(sample_input) - self.assertEqual(output.shape, (1, 3)) - - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def test_sequential_as_downstream_of_masking_layer(self): - inputs = layers.Input(shape=(3, 4)) - x = layers.Masking(mask_value=0., input_shape=(3, 4))(inputs) - - s = sequential.Sequential() - s.add(layers.Dense(5, input_shape=(4,))) - - x = layers.TimeDistributed(s)(x) - model = training_lib.Model(inputs=inputs, outputs=x) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - model_input = np.random.randint( - low=1, high=5, size=(10, 3, 4)).astype('float32') - for i in range(4): - model_input[i, i:, :] = 0. - model.fit(model_input, - np.random.random((10, 3, 5)), epochs=1, batch_size=6) - - if not tf.executing_eagerly(): - # Note: this doesn't work in eager due to DeferredTensor/ops compatibility - # issue. - mask_outputs = [model.layers[1].compute_mask(model.layers[1].input)] - mask_outputs += [model.layers[2].compute_mask( - model.layers[2].input, mask_outputs[-1])] - func = backend.function([model.input], mask_outputs) - mask_outputs_val = func([model_input]) - self.assertAllClose(mask_outputs_val[0], np.any(model_input, axis=-1)) - self.assertAllClose(mask_outputs_val[1], np.any(model_input, axis=-1)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_external_keras_serialization_compat_input_layers(self): - inputs = input_layer_lib.Input(shape=(10,)) - outputs = layers.Dense(1)(inputs) - model = training_lib.Model(inputs, outputs) - config = model.get_config() - # Checks that single inputs and outputs are still saved as 1-element lists. - # Saving as 1-element lists or not is equivalent in TF Keras, but only the - # 1-element list format is supported in TF.js and keras-team/Keras. - self.assertLen(config['input_layers'], 1) - self.assertLen(config['output_layers'], 1) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_external_keras_serialization_compat_inbound_nodes(self): - # Check single Tensor input. - inputs = input_layer_lib.Input(shape=(10,), name='in') - outputs = layers.Dense(1)(inputs) - model = training_lib.Model(inputs, outputs) - config = model.get_config() - self.assertEqual(config['layers'][1]['inbound_nodes'], [[['in', 0, 0, {}]]]) - - # Check multiple Tensor input. 
- inputs1 = input_layer_lib.Input(shape=(10,), name='in1') - inputs2 = input_layer_lib.Input(shape=(10,), name='in2') - outputs = layers.Add()([inputs1, inputs2]) - model = training_lib.Model([inputs1, inputs2], outputs) - config = model.get_config() - self.assertEqual(config['layers'][2]['inbound_nodes'], - [[['in1', 0, 0, {}], ['in2', 0, 0, {}]]]) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_dict_inputs_tensors(self): - # Note that this test is running with v2 eager only, since the v1 - # will behave differently wrt to dict input for training. - inputs = { - 'sentence2': input_layer_lib.Input( - shape=(), name='a', dtype=tf.string), - 'sentence1': input_layer_lib.Input( - shape=(), name='b', dtype=tf.string), - } - strlen = layers.Lambda(tf.strings.length) - diff = layers.Subtract()( - [strlen(inputs['sentence1']), strlen(inputs['sentence2'])]) - diff = tf.cast(diff, tf.float32) - model = training_lib.Model(inputs, diff) - - extra_keys = { - 'sentence1': tf.constant(['brown fox', 'lazy dog']), - 'sentence2': tf.constant(['owl', 'cheeky cat']), - 'label': tf.constant([0, 1]), - } - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - model(extra_keys) - self.assertIn('ignored by the model', str(w[-1].message)) - - model.compile('sgd', 'mse') - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - model.fit(extra_keys, y=tf.constant([0, 1]), steps_per_epoch=1) - self.assertIn('ignored by the model', str(w[-1].message)) - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - model.evaluate(extra_keys, tf.constant([0, 1])) - self.assertIn('ignored by the model', str(w[-1].message)) - - # Make sure the model inputs are sorted with the dict keys. - self.assertEqual(model.inputs[0]._keras_history.layer.name, 'b') - self.assertEqual(model.inputs[1]._keras_history.layer.name, 'a') + def _testShapeInference(self, model, input_shape, expected_output_shape): + input_value = np.random.random(input_shape) + output_value = model.predict(input_value) + self.assertEqual(output_value.shape, expected_output_shape) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testSingleInputCase(self): + class LayerWithOneInput(layers.Layer): + def build(self, input_shape): + self.w = tf.ones(shape=(3, 4)) + + def call(self, inputs): + return backend.dot(inputs, self.w) + + inputs = input_layer_lib.Input(shape=(3,)) + layer = LayerWithOneInput() + + if tf.executing_eagerly(): + self.assertEqual( + layer.compute_output_shape((None, 3)).as_list(), [None, 4] + ) + # As a side-effect, compute_output_shape builds the layer. + self.assertTrue(layer.built) + # We can still query the layer's compute_output_shape with + # compatible input shapes. 
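+ # Here the batch dimension differs from the earlier query, but the + # last dimension still matches the built kernel shape (3, 4).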
+ self.assertEqual( + layer.compute_output_shape((6, 3)).as_list(), [6, 4] + ) + + outputs = layer(inputs) + model = training_lib.Model(inputs, outputs) + self._testShapeInference(model, (2, 3), (2, 4)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testMultiInputOutputCase(self): + class MultiInputOutputLayer(layers.Layer): + def build(self, input_shape): + self.w = tf.ones(shape=(3, 4)) + + def call(self, inputs): + a = backend.dot(inputs[0], self.w) + b = a + inputs[1] + return [a, b] + + input_a = input_layer_lib.Input(shape=(3,)) + input_b = input_layer_lib.Input(shape=(4,)) + output_a, output_b = MultiInputOutputLayer()([input_a, input_b]) + model = training_lib.Model([input_a, input_b], [output_a, output_b]) + output_a_val, output_b_val = model.predict( + [np.random.random((2, 3)), np.random.random((2, 4))] + ) + self.assertEqual(output_a_val.shape, (2, 4)) + self.assertEqual(output_b_val.shape, (2, 4)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTrainingArgument(self): + class LayerWithTrainingArg(layers.Layer): + def build(self, input_shape): + self.w = tf.ones(shape=(3, 4)) + + def call(self, inputs, training): + return backend.dot(inputs, self.w) + + inputs = input_layer_lib.Input(shape=(3,)) + outputs = LayerWithTrainingArg()(inputs, training=False) + model = training_lib.Model(inputs, outputs) + self._testShapeInference(model, (2, 3), (2, 4)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoneInShape(self): + class Model(training_lib.Model): + def __init__(self): + super().__init__() + self.conv1 = layers.Conv2D(8, 3) + self.pool = layers.GlobalAveragePooling2D() + self.fc = layers.Dense(3) + + def call(self, x): + x = self.conv1(x) + x = self.pool(x) + x = self.fc(x) + return x + + model = Model() + model.build(tf.TensorShape((None, None, None, 1))) + self.assertTrue(model.built, "Model should be built") + self.assertTrue( + model.weights, + "Model should have its weights created as it has been built", + ) + sample_input = tf.ones((1, 10, 10, 1)) + output = model(sample_input) + self.assertEqual(output.shape, (1, 3)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoneInShapeWithCompoundModel(self): + class BasicBlock(training_lib.Model): + def __init__(self): + super().__init__() + self.conv1 = layers.Conv2D(8, 3) + self.pool = layers.GlobalAveragePooling2D() + self.dense = layers.Dense(3) + + def call(self, x): + x = self.conv1(x) + x = self.pool(x) + x = self.dense(x) + return x + + class CompoundModel(training_lib.Model): + def __init__(self): + super().__init__() + self.block = BasicBlock() + + def call(self, x): + x = self.block(x) + return x + + model = CompoundModel() + model.build(tf.TensorShape((None, None, None, 1))) + self.assertTrue(model.built, "Model should be built") + self.assertTrue( + model.weights, + "Model should have its weights created as it has been built", + ) + sample_input = tf.ones((1, 10, 10, 1)) + output = model(sample_input) + self.assertEqual(output.shape, (1, 3)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoneInShapeWithFunctionalAPI(self): + class BasicBlock(training_lib.Model): + # Inheriting from layers.Layer since we are calling this layer + # inside a model created using functional API. 
+ + def __init__(self): + super().__init__() + self.conv1 = layers.Conv2D(8, 3) + + def call(self, x): + x = self.conv1(x) + return x + + input_layer = layers.Input(shape=(None, None, 1)) + x = BasicBlock()(input_layer) + x = layers.GlobalAveragePooling2D()(x) + output_layer = layers.Dense(3)(x) + + model = training_lib.Model(inputs=input_layer, outputs=output_layer) + + model.build(tf.TensorShape((None, None, None, 1))) + self.assertTrue(model.built, "Model should be built") + self.assertTrue( + model.weights, + "Model should have its weights created as it has been built", + ) + sample_input = tf.ones((1, 10, 10, 1)) + output = model(sample_input) + self.assertEqual(output.shape, (1, 3)) + + @test_combinations.generate(test_combinations.keras_mode_combinations()) + def test_sequential_as_downstream_of_masking_layer(self): + inputs = layers.Input(shape=(3, 4)) + x = layers.Masking(mask_value=0.0, input_shape=(3, 4))(inputs) + + s = sequential.Sequential() + s.add(layers.Dense(5, input_shape=(4,))) + + x = layers.TimeDistributed(s)(x) + model = training_lib.Model(inputs=inputs, outputs=x) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + + model_input = np.random.randint(low=1, high=5, size=(10, 3, 4)).astype( + "float32" + ) + for i in range(4): + model_input[i, i:, :] = 0.0 + model.fit( + model_input, np.random.random((10, 3, 5)), epochs=1, batch_size=6 + ) + + if not tf.executing_eagerly(): + # Note: this doesn't work in eager due to DeferredTensor/ops + # compatibility issue. + mask_outputs = [model.layers[1].compute_mask(model.layers[1].input)] + mask_outputs += [ + model.layers[2].compute_mask( + model.layers[2].input, mask_outputs[-1] + ) + ] + func = backend.function([model.input], mask_outputs) + mask_outputs_val = func([model_input]) + self.assertAllClose( + mask_outputs_val[0], np.any(model_input, axis=-1) + ) + self.assertAllClose( + mask_outputs_val[1], np.any(model_input, axis=-1) + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_external_keras_serialization_compat_input_layers(self): + inputs = input_layer_lib.Input(shape=(10,)) + outputs = layers.Dense(1)(inputs) + model = training_lib.Model(inputs, outputs) + config = model.get_config() + # Checks that single inputs and outputs are still saved as 1-element + # lists. Saving as 1-element lists or not is equivalent in TF Keras, + # but only the 1-element list format is supported in TF.js and + # keras-team/Keras. 
+ self.assertLen(config["input_layers"], 1) + self.assertLen(config["output_layers"], 1) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + @test_utils.run_v2_only + def test_save_load_with_single_elem_list_inputs_saved_model(self): + class MyLayer(layers.Layer): + def __init__(self): + super().__init__() + self._preserve_input_structure_in_config = True + + def call(self, inputs): + return inputs[0] + + inputs = input_layer_lib.Input(shape=(3,)) + layer = MyLayer() + outputs = layer([inputs]) + + model = training_lib.Model(inputs=inputs, outputs=outputs) + model.save("/tmp/km2") + + save.load_model("/tmp/km2") + + @test_utils.run_v2_only + def test_save_load_with_single_elem_list_inputs_keras_v3(self): + @object_registration.register_keras_serializable() + class MyLayer(layers.Layer): + def __init__(self): + super().__init__() + self._preserve_input_structure_in_config = True + + def call(self, inputs): + return inputs[0] + + inputs = input_layer_lib.Input(shape=(3,)) + layer = MyLayer() + outputs = layer([inputs]) + + model = training_lib.Model(inputs=inputs, outputs=outputs) + model.save("/tmp/model.keras") + + models.load_model("/tmp/model.keras") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_external_keras_serialization_compat_inbound_nodes(self): + # Check single Tensor input. + inputs = input_layer_lib.Input(shape=(10,), name="in") + outputs = layers.Dense(1)(inputs) + model = training_lib.Model(inputs, outputs) + config = model.get_config() + self.assertEqual( + config["layers"][1]["inbound_nodes"], [[["in", 0, 0, {}]]] + ) + + # Check multiple Tensor input. + inputs1 = input_layer_lib.Input(shape=(10,), name="in1") + inputs2 = input_layer_lib.Input(shape=(10,), name="in2") + outputs = layers.Add()([inputs1, inputs2]) + model = training_lib.Model([inputs1, inputs2], outputs) + config = model.get_config() + self.assertEqual( + config["layers"][2]["inbound_nodes"], + [[["in1", 0, 0, {}], ["in2", 0, 0, {}]]], + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_dict_inputs_tensors(self): + # Note that this test is running with v2 eager only, since v1 + # behaves differently w.r.t. dict inputs for training. + inputs = { + "sentence2": input_layer_lib.Input( + shape=(), name="a", dtype=tf.string + ), + "sentence1": input_layer_lib.Input( + shape=(), name="b", dtype=tf.string + ), + } + strlen = layers.Lambda(tf.strings.length) + diff = layers.Subtract()( + [strlen(inputs["sentence1"]), strlen(inputs["sentence2"])] + ) + diff = tf.cast(diff, tf.float32) + model = training_lib.Model(inputs, diff) + + extra_keys = { + "sentence1": tf.constant(["brown fox", "lazy dog"]), + "sentence2": tf.constant(["owl", "cheeky cat"]), + "label": tf.constant([0, 1]), + } + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + model(extra_keys) + self.assertIn("ignored by the model", str(w[-1].message)) + + model.compile("sgd", "mse") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + model.fit(extra_keys, y=tf.constant([0, 1]), steps_per_epoch=1) + self.assertIn("ignored by the model", str(w[-1].message)) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + model.evaluate(extra_keys, tf.constant([0, 1])) + self.assertIn("ignored by the model", str(w[-1].message)) + + # Make sure the model inputs are sorted with the dict keys.
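+ # Sorted keys are ("sentence1", "sentence2"), which map to the input + # layers named "b" and "a" respectively.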
+ self.assertEqual(model.inputs[0]._keras_history.layer.name, "b") + self.assertEqual(model.inputs[1]._keras_history.layer.name, "a") class GraphUtilsTest(tf.test.TestCase): - - def testGetReachableFromInputs(self): - - with tf.Graph().as_default(), self.cached_session(): - pl_1 = tf.compat.v1.placeholder(shape=None, dtype='float32') - pl_2 = tf.compat.v1.placeholder(shape=None, dtype='float32') - pl_3 = tf.compat.v1.placeholder(shape=None, dtype='float32') - x_1 = pl_1 + pl_2 - x_2 = pl_2 * 2 - x_3 = pl_3 + 1 - x_4 = x_1 + x_2 - x_5 = x_3 * pl_1 - - self.assertEqual( - tf_utils.get_reachable_from_inputs([pl_1]), - {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op}) - self.assertEqual( - tf_utils.get_reachable_from_inputs([pl_1, pl_2]), - {pl_1, pl_2, x_1, x_2, x_4, x_5, x_1.op, x_2.op, x_4.op, x_5.op}) - self.assertEqual( - tf_utils.get_reachable_from_inputs([pl_3]), - {pl_3, x_3, x_5, x_3.op, x_5.op}) - self.assertEqual( - tf_utils.get_reachable_from_inputs([x_3]), {x_3, x_5, x_5.op}) + def testGetReachableFromInputs(self): + + with tf.Graph().as_default(), self.cached_session(): + pl_1 = tf.compat.v1.placeholder(shape=None, dtype="float32") + pl_2 = tf.compat.v1.placeholder(shape=None, dtype="float32") + pl_3 = tf.compat.v1.placeholder(shape=None, dtype="float32") + x_1 = pl_1 + pl_2 + x_2 = pl_2 * 2 + x_3 = pl_3 + 1 + x_4 = x_1 + x_2 + x_5 = x_3 * pl_1 + + self.assertEqual( + tf_utils.get_reachable_from_inputs([pl_1]), + {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op}, + ) + self.assertEqual( + tf_utils.get_reachable_from_inputs([pl_1, pl_2]), + { + pl_1, + pl_2, + x_1, + x_2, + x_4, + x_5, + x_1.op, + x_2.op, + x_4.op, + x_5.op, + }, + ) + self.assertEqual( + tf_utils.get_reachable_from_inputs([pl_3]), + {pl_3, x_3, x_5, x_3.op, x_5.op}, + ) + self.assertEqual( + tf_utils.get_reachable_from_inputs([x_3]), {x_3, x_5, x_5.op} + ) class NestedNetworkTest(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_nested_inputs_network(self): - inputs = { - 'x1': input_layer_lib.Input(shape=(1,)), - 'x2': input_layer_lib.Input(shape=(1,)) - } - outputs = layers.Add()([inputs['x1'], inputs['x2']]) - network = functional.Functional(inputs, outputs) - - network = functional.Functional.from_config(network.get_config()) - - result_tensor = network({ - 'x1': tf.ones((1, 1), 'float32'), - 'x2': tf.ones((1, 1), 'float32') - }) - result = self.evaluate(result_tensor) - self.assertAllEqual(result, [[2.]]) - - # TODO(b/122726584): Investigate why concrete batch is flaky in some builds. 
- output_shape = network.compute_output_shape({ - 'x1': (None, 1), - 'x2': (None, 1) - }) - self.assertListEqual(output_shape.as_list(), [None, 1]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_nested_outputs_network(self): - inputs = input_layer_lib.Input(shape=(1,)) - outputs = { - 'x+x': layers.Add()([inputs, inputs]), - 'x*x': layers.Multiply()([inputs, inputs]) - } - - network = functional.Functional(inputs, outputs) - - network = functional.Functional.from_config(network.get_config()) - - result_tensor = network(tf.ones((1, 1), 'float32')) - result = self.evaluate(result_tensor) - self.assertAllEqual(result['x+x'], [[2.]]) - self.assertAllEqual(result['x*x'], [[1.]]) - - output_shape = network.compute_output_shape((None, 1)) - self.assertListEqual(output_shape['x+x'].as_list(), [None, 1]) - self.assertListEqual(output_shape['x*x'].as_list(), [None, 1]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_nested_network_inside_network(self): - inner_inputs = { - 'x1': input_layer_lib.Input(shape=(1,)), - 'x2': input_layer_lib.Input(shape=(1,)) - } - inner_outputs = { - 'x1+x2': layers.Add()([inner_inputs['x1'], inner_inputs['x2']]), - 'x1*x2': layers.Multiply()([inner_inputs['x1'], inner_inputs['x2']]) - } - inner_network = functional.Functional( - inner_inputs, inner_outputs) - - inputs = [ - input_layer_lib.Input(shape=(1,)), - input_layer_lib.Input(shape=(1,)) - ] - middle = inner_network({'x1': inputs[0], 'x2': inputs[1]}) - outputs = layers.Add()([middle['x1+x2'], middle['x1*x2']]) - network = functional.Functional(inputs, outputs) - - network = functional.Functional.from_config(network.get_config()) - - # Computes: `(x1+x2) + (x1*x2)` - result_tensor = network( - [tf.ones((1, 1), 'float32'), - tf.ones((1, 1), 'float32')]) - result = self.evaluate(result_tensor) - self.assertAllEqual(result, [[3.]]) - - output_shape = network.compute_output_shape([(None, 1), (None, 1)]) - self.assertListEqual(output_shape.as_list(), [None, 1]) - - @test_combinations.generate(test_combinations.combine(mode=['graph'])) - def test_updates_with_direct_call(self): - inputs = input_layer_lib.Input(shape=(10,)) - x = layers.BatchNormalization()(inputs) - x = layers.Dense(10)(x) - model = training_lib.Model(inputs, x) - - ph = backend.placeholder(shape=(10, 10)) - model(ph) - - self.assertLen(model.updates, 4) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_dict_mapping_input(self): - - class ReturnFirst(layers.Layer): - - def call(self, inputs): - b, _ = inputs - return b - - # Checks that inputs are put in same order as the - # Model was constructed with. 
- b = input_layer_lib.Input(shape=(10,), name='b') - a = input_layer_lib.Input(shape=(10,), name='a') - outputs = ReturnFirst()([b, a]) - - b_val = tf.ones((10, 10)) - a_val = tf.zeros((10, 10)) - - model = training_lib.Model([b, a], outputs) - res = model({'a': a_val, 'b': b_val}) - self.assertAllClose(self.evaluate(res), self.evaluate(b_val)) - - reversed_model = training_lib.Model([a, b], outputs) - res = reversed_model({'a': a_val, 'b': b_val}) - self.assertAllClose(self.evaluate(res), self.evaluate(b_val)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_dict_mapping_single_input(self): - b = input_layer_lib.Input(shape=(1,), name='b') - outputs = b * 2 - model = training_lib.Model(b, outputs) - - b_val = tf.ones((1, 1)) - extra_val = tf.ones((1, 10)) - - inputs = {'a': extra_val, 'b': b_val} - res = model(inputs) - - # Check that 'b' was used and 'a' was ignored. - self.assertEqual(res.shape.as_list(), [1, 1]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_nested_dict_mapping(self): - a = input_layer_lib.Input(shape=(1,), dtype='int32', name='a') - b = input_layer_lib.Input(shape=(1,), dtype='int32', name='b') - c = input_layer_lib.Input(shape=(1,), dtype='int32', name='c') - d = input_layer_lib.Input(shape=(1,), dtype='int32', name='d') - inputs = {'a': (a, b), 'c': (c, d)} - outputs = 1000 * a + 100 * b + 10 * c + d - model = training_lib.Model(inputs, outputs) - - a_val = tf.ones((1, 1), dtype='int32') - b_val = 2 * tf.ones((1, 1), dtype='int32') - c_val = 3 * tf.ones((1, 1), dtype='int32') - d_val = 4 * tf.ones((1, 1), dtype='int32') - - inputs_val = {'a': (a_val, b_val), 'c': (c_val, d_val)} - res = model(inputs_val) - - # Check that inputs were flattened in the correct order. - self.assertFalse(model._enable_dict_to_input_mapping) - self.assertEqual(self.evaluate(res), [1234]) + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_nested_inputs_network(self): + inputs = { + "x1": input_layer_lib.Input(shape=(1,)), + "x2": input_layer_lib.Input(shape=(1,)), + } + outputs = layers.Add()([inputs["x1"], inputs["x2"]]) + network = functional.Functional(inputs, outputs) + + network = functional.Functional.from_config(network.get_config()) + + result_tensor = network( + {"x1": tf.ones((1, 1), "float32"), "x2": tf.ones((1, 1), "float32")} + ) + result = self.evaluate(result_tensor) + self.assertAllEqual(result, [[2.0]]) + + # TODO(b/122726584): Investigate why concrete batch is flaky in some + # builds. 
+ output_shape = network.compute_output_shape( + {"x1": (None, 1), "x2": (None, 1)} + ) + self.assertListEqual(output_shape.as_list(), [None, 1]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_nested_outputs_network(self): + inputs = input_layer_lib.Input(shape=(1,)) + outputs = { + "x+x": layers.Add()([inputs, inputs]), + "x*x": layers.Multiply()([inputs, inputs]), + } + + network = functional.Functional(inputs, outputs) + + network = functional.Functional.from_config(network.get_config()) + + result_tensor = network(tf.ones((1, 1), "float32")) + result = self.evaluate(result_tensor) + self.assertAllEqual(result["x+x"], [[2.0]]) + self.assertAllEqual(result["x*x"], [[1.0]]) + + output_shape = network.compute_output_shape((None, 1)) + self.assertListEqual(output_shape["x+x"].as_list(), [None, 1]) + self.assertListEqual(output_shape["x*x"].as_list(), [None, 1]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_nested_network_inside_network(self): + inner_inputs = { + "x1": input_layer_lib.Input(shape=(1,)), + "x2": input_layer_lib.Input(shape=(1,)), + } + inner_outputs = { + "x1+x2": layers.Add()([inner_inputs["x1"], inner_inputs["x2"]]), + "x1*x2": layers.Multiply()( + [inner_inputs["x1"], inner_inputs["x2"]] + ), + } + inner_network = functional.Functional(inner_inputs, inner_outputs) + + inputs = [ + input_layer_lib.Input(shape=(1,)), + input_layer_lib.Input(shape=(1,)), + ] + middle = inner_network({"x1": inputs[0], "x2": inputs[1]}) + outputs = layers.Add()([middle["x1+x2"], middle["x1*x2"]]) + network = functional.Functional(inputs, outputs) + + network = functional.Functional.from_config(network.get_config()) + + # Computes: `(x1+x2) + (x1*x2)` + result_tensor = network( + [tf.ones((1, 1), "float32"), tf.ones((1, 1), "float32")] + ) + result = self.evaluate(result_tensor) + self.assertAllEqual(result, [[3.0]]) + + output_shape = network.compute_output_shape([(None, 1), (None, 1)]) + self.assertListEqual(output_shape.as_list(), [None, 1]) + + @test_combinations.generate(test_combinations.combine(mode=["graph"])) + def test_updates_with_direct_call(self): + inputs = input_layer_lib.Input(shape=(10,)) + x = layers.BatchNormalization()(inputs) + x = layers.Dense(10)(x) + model = training_lib.Model(inputs, x) + + ph = backend.placeholder(shape=(10, 10)) + model(ph) + + self.assertLen(model.updates, 4) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_dict_mapping_input(self): + class ReturnFirst(layers.Layer): + def call(self, inputs): + b, _ = inputs + return b + + # Checks that inputs are put in same order as the + # Model was constructed with. 
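+ # The Model below is constructed with [b, a], so feeding a dict keyed + # by input name must still deliver b's value to the first position.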
+ b = input_layer_lib.Input(shape=(10,), name="b") + a = input_layer_lib.Input(shape=(10,), name="a") + outputs = ReturnFirst()([b, a]) + + b_val = tf.ones((10, 10)) + a_val = tf.zeros((10, 10)) + + model = training_lib.Model([b, a], outputs) + res = model({"a": a_val, "b": b_val}) + self.assertAllClose(self.evaluate(res), self.evaluate(b_val)) + + reversed_model = training_lib.Model([a, b], outputs) + res = reversed_model({"a": a_val, "b": b_val}) + self.assertAllClose(self.evaluate(res), self.evaluate(b_val)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_dict_mapping_single_input(self): + b = input_layer_lib.Input(shape=(1,), name="b") + outputs = b * 2 + model = training_lib.Model(b, outputs) + + b_val = tf.ones((1, 1)) + extra_val = tf.ones((1, 10)) + + inputs = {"a": extra_val, "b": b_val} + res = model(inputs) + + # Check that 'b' was used and 'a' was ignored. + self.assertEqual(res.shape.as_list(), [1, 1]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_nested_dict_mapping(self): + a = input_layer_lib.Input(shape=(1,), dtype="int32", name="a") + b = input_layer_lib.Input(shape=(1,), dtype="int32", name="b") + c = input_layer_lib.Input(shape=(1,), dtype="int32", name="c") + d = input_layer_lib.Input(shape=(1,), dtype="int32", name="d") + inputs = {"a": (a, b), "c": (c, d)} + outputs = 1000 * a + 100 * b + 10 * c + d + model = training_lib.Model(inputs, outputs) + + a_val = tf.ones((1, 1), dtype="int32") + b_val = 2 * tf.ones((1, 1), dtype="int32") + c_val = 3 * tf.ones((1, 1), dtype="int32") + d_val = 4 * tf.ones((1, 1), dtype="int32") + + inputs_val = {"a": (a_val, b_val), "c": (c_val, d_val)} + res = model(inputs_val) + + # Check that inputs were flattened in the correct order. + self.assertFalse(model._enable_dict_to_input_mapping) + self.assertEqual(self.evaluate(res), [1234]) @test_combinations.generate(test_combinations.keras_mode_combinations()) class AddLossTest(test_combinations.TestCase): - - def test_add_loss_outside_call_only_loss(self): - inputs = input_layer_lib.Input((10,)) - mid = layers.Dense(10)(inputs) - outputs = layers.Dense(1)(mid) - model = training_lib.Model(inputs, outputs) - model.add_loss(tf.reduce_mean(outputs)) - self.assertLen(model.losses, 1) - - initial_weights = model.get_weights() - - x = np.ones((10, 10)) - model.compile( - 'sgd', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, batch_size=2, epochs=1) - - model2 = model.from_config(model.get_config()) - model2.compile( - 'sgd', - run_eagerly=test_utils.should_run_eagerly()) - model2.set_weights(initial_weights) - model2.fit(x, batch_size=2, epochs=1) - - # The TFOpLayer and the AddLoss layer are serialized. 
- self.assertLen(model2.layers, 5) - self.assertAllClose(model.get_weights(), model2.get_weights()) - - def test_add_loss_outside_call_multiple_losses(self): - inputs = input_layer_lib.Input((10,)) - x1 = layers.Dense(10)(inputs) - x2 = layers.Dense(10)(x1) - outputs = layers.Dense(1)(x2) - model = training_lib.Model(inputs, outputs) - model.add_loss(tf.reduce_sum(x1 * x2)) - model.add_loss(tf.reduce_mean(outputs)) - self.assertLen(model.losses, 2) - - initial_weights = model.get_weights() - - x, y = np.ones((10, 10)), np.ones((10, 1)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, batch_size=2, epochs=1) - - model2 = model.from_config(model.get_config()) - model2.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model2.set_weights(initial_weights) - model2.fit(x, y, batch_size=2, epochs=1) - - self.assertAllClose(model.get_weights(), model2.get_weights()) - - def test_add_loss_crossentropy_backtracking(self): - inputs = input_layer_lib.Input((2,)) - labels = input_layer_lib.Input((1,)) - outputs = layers.Dense(1, activation='sigmoid')(inputs) - model = functional.Functional([inputs, labels], outputs) - model.add_loss(losses.binary_crossentropy(labels, outputs)) - model.compile('adam') - x = np.random.random((2, 2)) - y = np.random.random((2, 1)) - model.fit([x, y]) - - inputs = input_layer_lib.Input((2,)) - labels = input_layer_lib.Input((2,)) - outputs = layers.Dense(2, activation='softmax')(inputs) - model = functional.Functional([inputs, labels], outputs) - model.add_loss(losses.categorical_crossentropy(labels, outputs)) - model.compile('adam') - x = np.random.random((2, 2)) - y = np.random.random((2, 2)) - model.fit([x, y]) - - inputs = input_layer_lib.Input((2,)) - labels = input_layer_lib.Input((1,), dtype='int32') - outputs = layers.Dense(2, activation='softmax')(inputs) - model = functional.Functional([inputs, labels], outputs) - model.add_loss(losses.sparse_categorical_crossentropy(labels, outputs)) - model.compile('adam') - x = np.random.random((2, 2)) - y = np.random.randint(0, 2, size=(2, 1)) - model.fit([x, y]) + def test_add_loss_outside_call_only_loss(self): + inputs = input_layer_lib.Input((10,)) + mid = layers.Dense(10)(inputs) + outputs = layers.Dense(1)(mid) + model = training_lib.Model(inputs, outputs) + model.add_loss(tf.reduce_mean(outputs)) + self.assertLen(model.losses, 1) + + initial_weights = model.get_weights() + + x = np.ones((10, 10)) + model.compile("sgd", run_eagerly=test_utils.should_run_eagerly()) + model.fit(x, batch_size=2, epochs=1) + + model2 = model.from_config(model.get_config()) + model2.compile("sgd", run_eagerly=test_utils.should_run_eagerly()) + model2.set_weights(initial_weights) + model2.fit(x, batch_size=2, epochs=1) + + # The TFOpLayer and the AddLoss layer are serialized. 
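+ # Five layers total: the input layer, the two Dense layers, the + # TFOpLayer wrapping tf.reduce_mean, and the AddLoss layer.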
+ self.assertLen(model2.layers, 5) + self.assertAllClose(model.get_weights(), model2.get_weights()) + + def test_add_loss_outside_call_multiple_losses(self): + inputs = input_layer_lib.Input((10,)) + x1 = layers.Dense(10)(inputs) + x2 = layers.Dense(10)(x1) + outputs = layers.Dense(1)(x2) + model = training_lib.Model(inputs, outputs) + model.add_loss(tf.reduce_sum(x1 * x2)) + model.add_loss(tf.reduce_mean(outputs)) + self.assertLen(model.losses, 2) + + initial_weights = model.get_weights() + + x, y = np.ones((10, 10)), np.ones((10, 1)) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(x, y, batch_size=2, epochs=1) + + model2 = model.from_config(model.get_config()) + model2.compile( + "sgd", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + model2.set_weights(initial_weights) + model2.fit(x, y, batch_size=2, epochs=1) + + self.assertAllClose(model.get_weights(), model2.get_weights()) + + def test_add_loss_crossentropy_backtracking(self): + inputs = input_layer_lib.Input((2,)) + labels = input_layer_lib.Input((1,)) + outputs = layers.Dense(1, activation="sigmoid")(inputs) + model = functional.Functional([inputs, labels], outputs) + model.add_loss(losses.binary_crossentropy(labels, outputs)) + model.compile("adam") + x = np.random.random((2, 2)) + y = np.random.random((2, 1)) + model.fit([x, y]) + + inputs = input_layer_lib.Input((2,)) + labels = input_layer_lib.Input((2,)) + outputs = layers.Dense(2, activation="softmax")(inputs) + model = functional.Functional([inputs, labels], outputs) + model.add_loss(losses.categorical_crossentropy(labels, outputs)) + model.compile("adam") + x = np.random.random((2, 2)) + y = np.random.random((2, 2)) + model.fit([x, y]) + + inputs = input_layer_lib.Input((2,)) + labels = input_layer_lib.Input((1,), dtype="int32") + outputs = layers.Dense(2, activation="softmax")(inputs) + model = functional.Functional([inputs, labels], outputs) + model.add_loss(losses.sparse_categorical_crossentropy(labels, outputs)) + model.compile("adam") + x = np.random.random((2, 2)) + y = np.random.randint(0, 2, size=(2, 1)) + model.fit([x, y]) @test_combinations.generate(test_combinations.keras_mode_combinations()) class WeightAccessTest(test_combinations.TestCase): + def test_functional_model(self): + inputs = input_layer_lib.Input((10,)) + x1 = layers.Dense(10)(inputs) + x2 = layers.Dense(10)(x1) + outputs = layers.Dense(1)(x2) + model = training_lib.Model(inputs, outputs) - def test_functional_model(self): - inputs = input_layer_lib.Input((10,)) - x1 = layers.Dense(10)(inputs) - x2 = layers.Dense(10)(x1) - outputs = layers.Dense(1)(x2) - model = training_lib.Model(inputs, outputs) - - self.assertEqual(len(model.weights), 6) - - def test_sequential_model_with_input_shape(self): - x1 = layers.Dense(10, input_shape=(10,)) - x2 = layers.Dense(10) - x3 = layers.Dense(1) - model = sequential.Sequential([x1, x2, x3]) + self.assertEqual(len(model.weights), 6) - self.assertEqual(len(model.weights), 6) + def test_sequential_model_with_input_shape(self): + x1 = layers.Dense(10, input_shape=(10,)) + x2 = layers.Dense(10) + x3 = layers.Dense(1) + model = sequential.Sequential([x1, x2, x3]) - def test_sequential_model_without_input_shape(self): - x1 = layers.Dense(10) - x2 = layers.Dense(10) - x3 = layers.Dense(1) - model = sequential.Sequential([x1, x2, x3]) + self.assertEqual(len(model.weights), 6) - with self.assertRaisesRegex( - ValueError, 'Weights for model .* have not yet been created'): - _ = model.weights + def 
test_sequential_model_without_input_shape(self): + x1 = layers.Dense(10) + x2 = layers.Dense(10) + x3 = layers.Dense(1) + model = sequential.Sequential([x1, x2, x3]) - def test_subclass_model_with_build_method(self): + with self.assertRaisesRegex( + ValueError, "Weights for model .* have not yet been created" + ): + _ = model.weights - class SubclassModel(models.Model): + def test_subclass_model_with_build_method(self): + class SubclassModel(models.Model): + def build(self, input_shape): + self.w = self.add_weight( + shape=input_shape[-1], initializer="ones" + ) - def build(self, input_shape): - self.w = self.add_weight(shape=input_shape[-1], initializer='ones') + def call(self, inputs): + return inputs * self.w - def call(self, inputs): - return inputs * self.w + model = SubclassModel() - model = SubclassModel() + with self.assertRaisesRegex( + ValueError, "Weights for model .* have not yet been created" + ): + _ = model.weights - with self.assertRaisesRegex( - ValueError, 'Weights for model .* have not yet been created'): - _ = model.weights + model(input_layer_lib.Input((10,))) + self.assertEqual(len(model.weights), 1) - model(input_layer_lib.Input((10,))) - self.assertEqual(len(model.weights), 1) + def test_subclass_model_without_build_method(self): + class SubclassModel(models.Model): + def __init__(self): + super().__init__() + self.w = self.add_weight(shape=(), initializer="ones") - def test_subclass_model_without_build_method(self): + def call(self, inputs): + return inputs * self.w - class SubclassModel(models.Model): + model = SubclassModel() + self.assertEqual(len(model.weights), 1) - def __init__(self): - super().__init__() - self.w = self.add_weight(shape=(), initializer='ones') - def call(self, inputs): - return inputs * self.w - - model = SubclassModel() - self.assertEqual(len(model.weights), 1) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class DTypeTest(test_combinations.TestCase): + @test_utils.enable_v2_dtype_behavior + def test_graph_network_dtype(self): + inputs = input_layer_lib.Input((10,)) + outputs = layers.Dense(10)(inputs) + network = functional.Functional(inputs, outputs) + self.assertEqual(network.dtype, "float32") - @test_utils.enable_v2_dtype_behavior - def test_graph_network_dtype(self): - inputs = input_layer_lib.Input((10,)) - outputs = layers.Dense(10)(inputs) - network = functional.Functional(inputs, outputs) - self.assertEqual(network.dtype, 'float32') - - @test_utils.enable_v2_dtype_behavior - def test_subclassed_network_dtype(self): + @test_utils.enable_v2_dtype_behavior + def test_subclassed_network_dtype(self): + class IdentityNetwork(training_lib.Model): + def call(self, inputs): + return inputs - class IdentityNetwork(training_lib.Model): + network = IdentityNetwork() + self.assertEqual(network.dtype, "float32") + self.assertEqual(network(tf.constant(1, "float64")).dtype, "float32") - def call(self, inputs): - return inputs + network = IdentityNetwork(dtype="float16") + self.assertEqual(network.dtype, "float16") + self.assertEqual(network(tf.constant(1, "float64")).dtype, "float16") - network = IdentityNetwork() - self.assertEqual(network.dtype, 'float32') - self.assertEqual(network(tf.constant(1, 'float64')).dtype, 'float32') - - network = IdentityNetwork(dtype='float16') - self.assertEqual(network.dtype, 'float16') - self.assertEqual(network(tf.constant(1, 'float64')).dtype, 'float16') - - network = 
IdentityNetwork(autocast=False) - self.assertEqual(network.dtype, 'float32') - self.assertEqual(network(tf.constant(1, 'float64')).dtype, 'float64') + network = IdentityNetwork(autocast=False) + self.assertEqual(network.dtype, "float32") + self.assertEqual(network(tf.constant(1, "float64")).dtype, "float64") class AttrTrackingLayer(base_layer.Layer): - """Count how many times `dynamic` and `stateful` are called. + """Count how many times `dynamic` and `stateful` are called. + + These counts are used to test that the attribute cache behaves as expected. + """ - These counts are used to test that the attribute cache behaves as expected. - """ - def __init__(self, *args, **kwargs): - self.stateful_count = 0 - self.dynamic_count = 0 - super().__init__(*args, **kwargs) + def __init__(self, *args, **kwargs): + self.stateful_count = 0 + self.dynamic_count = 0 + super().__init__(*args, **kwargs) - @base_layer.Layer.stateful.getter - def stateful(self): - self.stateful_count += 1 - return super().stateful + @base_layer.Layer.stateful.getter + def stateful(self): + self.stateful_count += 1 + return super().stateful - @property - def dynamic(self): - self.dynamic_count += 1 - return super().dynamic + @property + def dynamic(self): + self.dynamic_count += 1 + return super().dynamic -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CacheCorrectnessTest(test_combinations.TestCase): + def layer_and_network_test(self): + # Top level layer + network = functional.Functional() + + layer_0 = AttrTrackingLayer() + + sub_network = functional.Functional() + layer_1 = AttrTrackingLayer(dynamic=True) + layer_2 = AttrTrackingLayer() + sub_network.sub_layers = [layer_1, layer_2] + + network.sub_layer = layer_0 + + for _ in range(2): + self.assertEqual(network.dynamic, False) + self.assertEqual(network.stateful, False) + + # The second pass should be a cache hit. + self.assertEqual(layer_0.dynamic_count, 1) + self.assertEqual(layer_0.stateful_count, 1) + + # Mutations of the sub-layer should force recalculation of the network's + # stateful attribute. (mutations bubble up.) + layer_0.stateful = True + self.assertEqual(network.stateful, True) + self.assertEqual(layer_0.stateful_count, 2) + + layer_0.stateful = False + self.assertEqual(network.stateful, False) + self.assertEqual(layer_0.stateful_count, 3) + + # But changing stateful should not affect dynamic. + self.assertEqual(network.dynamic, False) + self.assertEqual(layer_0.dynamic_count, 1) + + network.sub_network = sub_network + + # Adding to the topology should invalidate the cache and reflect in the + # top level network. + self.assertEqual(network.dynamic, True) + self.assertEqual(layer_0.dynamic_count, 2) + self.assertEqual(layer_1.dynamic_count, 1) + + # Still dynamic, but we need to recompute. + sub_network.sub_layers.pop() + self.assertEqual(network.dynamic, True) + self.assertEqual(layer_0.dynamic_count, 3) + self.assertEqual(layer_1.dynamic_count, 2) + + # Now that we've removed the dynamic layer deep in the layer hierarchy, + # we need to make sure that that bubbles up through all the levels. + sub_network.sub_layers.pop() + self.assertEqual(network.dynamic, False) + self.assertEqual(layer_0.dynamic_count, 4) + self.assertEqual(layer_1.dynamic_count, 2) + + # Now check with a tracked dict. 
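+ # (Dict attributes are auto-tracked just like lists, so the same + # cache-invalidation behavior is expected.)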
+ sub_network.sub_layers = { + "layer_1": layer_1, + "layer_2": layer_2, + } + + self.assertEqual(network.dynamic, True) + self.assertEqual(layer_0.dynamic_count, 5) + self.assertEqual(layer_1.dynamic_count, 3) + + # In-place assignment should still invalidate the cache. + sub_network.sub_layers["layer_1"] = layer_1 + self.assertEqual(network.dynamic, True) + self.assertEqual(layer_0.dynamic_count, 6) + self.assertEqual(layer_1.dynamic_count, 4) + + sub_network.sub_layers["layer_1"] = None + for _ in range(2): + self.assertEqual(network.dynamic, False) + self.assertEqual(layer_0.dynamic_count, 7) + self.assertEqual(layer_1.dynamic_count, 4) + + layer_3 = AttrTrackingLayer() + layer_3.stateful = True + + sub_network.sub_layers = None + self.assertEqual(network.dynamic, False) + self.assertEqual(network.stateful, False) + + # Test duplicate layers. + sub_network.sub_layers = [layer_1, layer_1, layer_1, layer_3] + self.assertEqual(network.dynamic, True) + self.assertEqual(network.stateful, True) + + for _ in range(3): + sub_network.sub_layers.pop() + self.assertEqual(network.dynamic, True) + self.assertEqual(network.stateful, False) + + sub_network.sub_layers.pop() + self.assertEqual(network.dynamic, False) + self.assertEqual(network.stateful, False) + + def test_compute_output_shape_cache(self): + # See https://github.com/tensorflow/tensorflow/issues/32029. + x = input_layer_lib.Input(shape=(None, 32)) + dense = layers.Dense(2) + y = dense(x) + network = functional.Functional(x, y, name="dense_network") + + for i in range(999, 1024): + self.assertEqual( + network.compute_output_shape((1, i, 32)), (1, i, 2) + ) + + def test_2d_inputs_squeezed_to_1d(self): + input_1d = input_layer_lib.Input(shape=()) + outputs = input_1d * 2.0 + net = functional.Functional(input_1d, outputs) + + x = np.ones((10, 1)) + y = net(x) + self.assertEqual(y.shape.rank, 1) + + def test_1d_inputs_expanded_to_2d(self): + input_1d = input_layer_lib.Input(shape=(1,)) + outputs = input_1d * 2.0 + net = functional.Functional(input_1d, outputs) + + x = np.ones((10,)) + y = net(x) + self.assertEqual(y.shape.rank, 2) + + def test_training_passed_during_construction(self): + def _call(inputs, training): + if training is None: + return inputs * -1.0 + elif training: + return inputs + else: + return inputs * 0.0 + + class MyLayer(base_layer.Layer): + def call(self, inputs, training=True): + return _call(inputs, training) + + my_layer = MyLayer() + x = np.ones((1, 10)) + + # Hard-coded `true` value passed during construction is respected. + inputs = input_layer_lib.Input(10) + outputs = my_layer(inputs, training=True) + network = functional.Functional(inputs, outputs) + self.assertAllEqual(network(x, training=True), _call(x, True)) + self.assertAllEqual(network(x, training=False), _call(x, True)) + self.assertAllEqual(network(x), _call(x, True)) - def layer_and_network_test(self): - # Top level layer - network = functional.Functional() - - layer_0 = AttrTrackingLayer() - - sub_network = functional.Functional() - layer_1 = AttrTrackingLayer(dynamic=True) - layer_2 = AttrTrackingLayer() - sub_network.sub_layers = [layer_1, layer_2] - - network.sub_layer = layer_0 - - for _ in range(2): - self.assertEqual(network.dynamic, False) - self.assertEqual(network.stateful, False) - - # The second pass should be a cache hit. - self.assertEqual(layer_0.dynamic_count, 1) - self.assertEqual(layer_0.stateful_count, 1) - - # Mutations of the sub-layer should force recalculation of the network's - # stateful attribute. (mutations bubble up.) 
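For context on the `compute_output_shape` cache regression referenced in `test_compute_output_shape_cache` above (tensorflow/tensorflow#32029), here is a public-API sketch of the property the test pins down: each distinct input shape must yield its own output shape even though results are cached internally.

```python
# Sketch of the cached shape-inference behavior, public tf.keras API only.
import tensorflow as tf

x = tf.keras.Input(shape=(None, 32))
y = tf.keras.layers.Dense(2)(x)
net = tf.keras.Model(x, y, name="dense_network")

for i in (999, 1000, 1023):
    # A stale cache entry here would return the shape of a previous call.
    assert tuple(net.compute_output_shape((1, i, 32))) == (1, i, 2)
```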
- layer_0.stateful = True - self.assertEqual(network.stateful, True) - self.assertEqual(layer_0.stateful_count, 2) - - layer_0.stateful = False - self.assertEqual(network.stateful, False) - self.assertEqual(layer_0.stateful_count, 3) - - # But changing stateful should not affect dynamic. - self.assertEqual(network.dynamic, False) - self.assertEqual(layer_0.dynamic_count, 1) - - network.sub_network = sub_network - - # Adding to the topology should invalidate the cache and reflect in the top - # level network. - self.assertEqual(network.dynamic, True) - self.assertEqual(layer_0.dynamic_count, 2) - self.assertEqual(layer_1.dynamic_count, 1) - - # Still dynamic, but we need to recompute. - sub_network.sub_layers.pop() - self.assertEqual(network.dynamic, True) - self.assertEqual(layer_0.dynamic_count, 3) - self.assertEqual(layer_1.dynamic_count, 2) - - # Now that we've removed the dynamic layer deep in the layer hierarchy, we - # need to make sure that that bubbles up through all the levels. - sub_network.sub_layers.pop() - self.assertEqual(network.dynamic, False) - self.assertEqual(layer_0.dynamic_count, 4) - self.assertEqual(layer_1.dynamic_count, 2) - - # Now check with a tracked dict. - sub_network.sub_layers = { - "layer_1": layer_1, - "layer_2": layer_2, - } - - self.assertEqual(network.dynamic, True) - self.assertEqual(layer_0.dynamic_count, 5) - self.assertEqual(layer_1.dynamic_count, 3) - - # In-place assignment should still invalidate the cache. - sub_network.sub_layers["layer_1"] = layer_1 - self.assertEqual(network.dynamic, True) - self.assertEqual(layer_0.dynamic_count, 6) - self.assertEqual(layer_1.dynamic_count, 4) - - sub_network.sub_layers["layer_1"] = None - for _ in range(2): - self.assertEqual(network.dynamic, False) - self.assertEqual(layer_0.dynamic_count, 7) - self.assertEqual(layer_1.dynamic_count, 4) - - layer_3 = AttrTrackingLayer() - layer_3.stateful = True - - sub_network.sub_layers = None - self.assertEqual(network.dynamic, False) - self.assertEqual(network.stateful, False) - - # Test duplicate layers. - sub_network.sub_layers = [layer_1, layer_1, layer_1, layer_3] - self.assertEqual(network.dynamic, True) - self.assertEqual(network.stateful, True) - - for _ in range(3): - sub_network.sub_layers.pop() - self.assertEqual(network.dynamic, True) - self.assertEqual(network.stateful, False) - - sub_network.sub_layers.pop() - self.assertEqual(network.dynamic, False) - self.assertEqual(network.stateful, False) - - def test_compute_output_shape_cache(self): - # See https://github.com/tensorflow/tensorflow/issues/32029. - x = input_layer_lib.Input(shape=(None, 32)) - dense = layers.Dense(2) - y = dense(x) - network = functional.Functional(x, y, name='dense_network') - - for i in range(999, 1024): - self.assertEqual(network.compute_output_shape((1, i, 32)), (1, i, 2)) - - def test_2d_inputs_squeezed_to_1d(self): - input_1d = input_layer_lib.Input(shape=()) - outputs = input_1d * 2. - net = functional.Functional(input_1d, outputs) - - x = np.ones((10, 1)) - y = net(x) - self.assertEqual(y.shape.rank, 1) - - def test_1d_inputs_expanded_to_2d(self): - input_1d = input_layer_lib.Input(shape=(1,)) - outputs = input_1d * 2. 
- net = functional.Functional(input_1d, outputs) - - x = np.ones((10,)) - y = net(x) - self.assertEqual(y.shape.rank, 2) - - def test_training_passed_during_construction(self): - - def _call(inputs, training): - if training is None: - return inputs * -1.0 - elif training: - return inputs - else: - return inputs * 0.0 - - class MyLayer(base_layer.Layer): - - def call(self, inputs, training=True): - return _call(inputs, training) - - my_layer = MyLayer() - x = np.ones((1, 10)) - - # Hard-coded `true` value passed during construction is respected. - inputs = input_layer_lib.Input(10) - outputs = my_layer(inputs, training=True) - network = functional.Functional(inputs, outputs) - self.assertAllEqual(network(x, training=True), _call(x, True)) - self.assertAllEqual(network(x, training=False), _call(x, True)) - self.assertAllEqual(network(x), _call(x, True)) - - # Hard-coded `false` value passed during construction is respected. - inputs = input_layer_lib.Input(10) - outputs = my_layer(inputs, training=False) - network = functional.Functional(inputs, outputs) - self.assertAllEqual(network(x, training=True), _call(x, False)) - self.assertAllEqual(network(x, training=False), _call(x, False)) - self.assertAllEqual(network(x), _call(x, False)) - - if tf.executing_eagerly(): - # In v2, construction still works when no `training` is specified - # When no value passed during construction, it uses the local default. - inputs = input_layer_lib.Input(10) - outputs = my_layer(inputs) - network = functional.Functional(inputs, outputs) - self.assertAllEqual(network(x, training=True), _call(x, True)) - self.assertAllEqual(network(x, training=False), _call(x, False)) - self.assertAllEqual(network(x), _call(x, True)) # Use local default - - # `None` value passed positionally during construction is ignored at runtime - inputs = input_layer_lib.Input(10) - outputs = my_layer(inputs, None) - network = functional.Functional(inputs, outputs) - self.assertAllEqual(network(x, training=True), _call(x, True)) - self.assertAllEqual(network(x, training=False), _call(x, False)) - if tf.executing_eagerly(): - self.assertAllEqual(network(x), _call(x, True)) # Use local default - else: - # in v1 training would have defaulted to using the `None` inside the layer - # if training is not passed at runtime - self.assertAllEqual(network(x), _call(x, None)) - - # `None` value passed as kwarg during construction is ignored at runtime. - inputs = input_layer_lib.Input(10) - outputs = my_layer(inputs, training=None) - network = functional.Functional(inputs, outputs) - self.assertAllEqual(network(x, training=True), _call(x, True)) - self.assertAllEqual(network(x, training=False), _call(x, False)) - if tf.executing_eagerly(): - self.assertAllEqual(network(x), _call(x, True)) # Use local default - else: - # in v1 training would have defaulted to using the `None` inside the layer - # if training is not passed at runtime - self.assertAllEqual(network(x), _call(x, None)) + # Hard-coded `false` value passed during construction is respected. + inputs = input_layer_lib.Input(10) + outputs = my_layer(inputs, training=False) + network = functional.Functional(inputs, outputs) + self.assertAllEqual(network(x, training=True), _call(x, False)) + self.assertAllEqual(network(x, training=False), _call(x, False)) + self.assertAllEqual(network(x), _call(x, False)) + + if tf.executing_eagerly(): + # In v2, construction still works when no `training` is specified + # When no value passed during construction, it uses the local + # default. 
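The `training` semantics asserted by this test are subtle enough to restate with a standalone sketch (public tf.keras API; the layer name is illustrative): a concrete `training` value passed while tracing the functional graph is baked into the model, while `None` defers to the call-time value or the layer's local default.

```python
import numpy as np
import tensorflow as tf


class ScaleOnTrain(tf.keras.layers.Layer):  # illustrative layer
    def call(self, inputs, training=True):
        if training is None:
            return inputs * -1.0
        return inputs if training else inputs * 0.0


inputs = tf.keras.Input((10,))
# A hard-coded `training=True` at construction time is baked in...
outputs = ScaleOnTrain()(inputs, training=True)
model = tf.keras.Model(inputs, outputs)

x = np.ones((1, 10), "float32")
# ...so the runtime flag cannot override it.
assert np.allclose(model(x, training=False), x)
assert np.allclose(model(x), x)
```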
+ inputs = input_layer_lib.Input(10) + outputs = my_layer(inputs) + network = functional.Functional(inputs, outputs) + self.assertAllEqual(network(x, training=True), _call(x, True)) + self.assertAllEqual(network(x, training=False), _call(x, False)) + self.assertAllEqual(network(x), _call(x, True)) # Use local default + + # `None` value passed positionally during construction is ignored at + # runtime + inputs = input_layer_lib.Input(10) + outputs = my_layer(inputs, None) + network = functional.Functional(inputs, outputs) + self.assertAllEqual(network(x, training=True), _call(x, True)) + self.assertAllEqual(network(x, training=False), _call(x, False)) + if tf.executing_eagerly(): + self.assertAllEqual(network(x), _call(x, True)) # Use local default + else: + # in v1 training would have defaulted to using the `None` inside the + # layer if training is not passed at runtime + self.assertAllEqual(network(x), _call(x, None)) + + # `None` value passed as kwarg during construction is ignored at + # runtime. + inputs = input_layer_lib.Input(10) + outputs = my_layer(inputs, training=None) + network = functional.Functional(inputs, outputs) + self.assertAllEqual(network(x, training=True), _call(x, True)) + self.assertAllEqual(network(x, training=False), _call(x, False)) + if tf.executing_eagerly(): + self.assertAllEqual(network(x), _call(x, True)) # Use local default + else: + # in v1 training would have defaulted to using the `None` inside the + # layer if training is not passed at runtime + self.assertAllEqual(network(x), _call(x, None)) class InputsOutputsErrorTest(test_combinations.TestCase): - - @test_utils.enable_v2_dtype_behavior - def test_input_error(self): - inputs = input_layer_lib.Input((10,)) - outputs = layers.Dense(10)(inputs) - with self.assertRaisesRegex( - TypeError, "('Keyword argument not understood:', 'input')"): - models.Model(input=inputs, outputs=outputs) - - @test_utils.enable_v2_dtype_behavior - def test_output_error(self): - inputs = input_layer_lib.Input((10,)) - outputs = layers.Dense(10)(inputs) - with self.assertRaisesRegex( - TypeError, "('Keyword argument not understood:', 'output')"): - models.Model(inputs=inputs, output=outputs) - - def test_input_spec(self): - if not tf.executing_eagerly(): - return - inputs = input_layer_lib.Input((10,)) - outputs = layers.Dense(10)(inputs) - model = models.Model(inputs, outputs) - with self.assertRaisesRegex( - ValueError, r'.*expected shape=.*'): - model(np.zeros((3, 11))) - - def test_input_spec_list_of_inputs(self): - if not tf.executing_eagerly(): - return - input_1 = input_layer_lib.Input((10,), name='1') - input_2 = input_layer_lib.Input((5,), name='2') - x = layers.Concatenate()([input_1, input_2]) - outputs = layers.Dense(10)(x) - model = models.Model([input_1, input_2], outputs) - with self.assertRaisesRegex( - ValueError, r'.*expects 2 input.*'): - model(np.zeros((3, 10))) - with self.assertRaisesRegex( - ValueError, r'.*expects 2 input.*'): - model([np.zeros((3, 10)), np.zeros((3, 5)), np.zeros((3, 10))]) - with self.assertRaisesRegex( - ValueError, r'.*expected shape=.*'): - model([np.zeros((3, 10)), np.zeros((3, 6))]) - - # Test passing data via dict keyed by input name - with self.assertRaisesRegex( - ValueError, r'Missing data for input.*'): - model({'1': np.zeros((3, 10))}) - with self.assertRaisesRegex( - ValueError, r'.*expected shape=.*'): - model({'1': np.zeros((3, 10)), '2': np.zeros((3, 6))}) - - def test_input_spec_dict(self): - if not tf.executing_eagerly(): - return - input_1 = 
input_layer_lib.Input((10,)) - input_2 = input_layer_lib.Input((5,)) - x = layers.Concatenate()([input_1, input_2]) - outputs = layers.Dense(10)(x) - model = models.Model({'1': input_1, '2': input_2}, outputs) - with self.assertRaisesRegex( - ValueError, r'Missing data for input.*'): - model({'1': np.zeros((3, 10))}) - with self.assertRaisesRegex( - ValueError, r'.*expected shape=.*'): - model({'1': np.zeros((3, 10)), '2': np.zeros((3, 6))}) + @test_utils.enable_v2_dtype_behavior + def test_input_error(self): + inputs = input_layer_lib.Input((10,)) + outputs = layers.Dense(10)(inputs) + with self.assertRaisesRegex( + TypeError, "('Keyword argument not understood:', 'input')" + ): + models.Model(input=inputs, outputs=outputs) + + @test_utils.enable_v2_dtype_behavior + def test_output_error(self): + inputs = input_layer_lib.Input((10,)) + outputs = layers.Dense(10)(inputs) + with self.assertRaisesRegex( + TypeError, "('Keyword argument not understood:', 'output')" + ): + models.Model(inputs=inputs, output=outputs) + + def test_input_spec(self): + if not tf.executing_eagerly(): + return + inputs = input_layer_lib.Input((10,)) + outputs = layers.Dense(10)(inputs) + model = models.Model(inputs, outputs) + with self.assertRaisesRegex(ValueError, r".*expected shape=.*"): + model(np.zeros((3, 11))) + + def test_input_spec_list_of_inputs(self): + if not tf.executing_eagerly(): + return + input_1 = input_layer_lib.Input((10,), name="1") + input_2 = input_layer_lib.Input((5,), name="2") + x = layers.Concatenate()([input_1, input_2]) + outputs = layers.Dense(10)(x) + model = models.Model([input_1, input_2], outputs) + with self.assertRaisesRegex(ValueError, r".*expects 2 input.*"): + model(np.zeros((3, 10))) + with self.assertRaisesRegex(ValueError, r".*expects 2 input.*"): + model([np.zeros((3, 10)), np.zeros((3, 5)), np.zeros((3, 10))]) + with self.assertRaisesRegex(ValueError, r".*expected shape=.*"): + model([np.zeros((3, 10)), np.zeros((3, 6))]) + + # Test passing data via dict keyed by input name + with self.assertRaisesRegex(ValueError, r"Missing data for input.*"): + model({"1": np.zeros((3, 10))}) + with self.assertRaisesRegex(ValueError, r".*expected shape=.*"): + model({"1": np.zeros((3, 10)), "2": np.zeros((3, 6))}) + + def test_input_spec_dict(self): + if not tf.executing_eagerly(): + return + input_1 = input_layer_lib.Input((10,)) + input_2 = input_layer_lib.Input((5,)) + x = layers.Concatenate()([input_1, input_2]) + outputs = layers.Dense(10)(x) + model = models.Model({"1": input_1, "2": input_2}, outputs) + with self.assertRaisesRegex(ValueError, r"Missing data for input.*"): + model({"1": np.zeros((3, 10))}) + with self.assertRaisesRegex(ValueError, r".*expected shape=.*"): + model({"1": np.zeros((3, 10)), "2": np.zeros((3, 6))}) class FunctionalSubclassModel(training_lib.Model): - - def __init__(self, *args, **kwargs): - self.foo = {'foo': 'bar'} # Make sure users can assign dict attributes - my_input = input_layer_lib.Input(shape=(16,)) - dense = layers.Dense(32, activation='relu') - output = dense(my_input) - outputs = {'output': output} - super().__init__(inputs=[my_input], outputs=outputs, *args, **kwargs) + def __init__(self, *args, **kwargs): + self.foo = {"foo": "bar"} # Make sure users can assign dict attributes + my_input = input_layer_lib.Input(shape=(16,)) + dense = layers.Dense(32, activation="relu") + output = dense(my_input) + outputs = {"output": output} + super().__init__(inputs=[my_input], outputs=outputs, *args, **kwargs) class MixinClass: + def __init__(self, foo, 
**kwargs): + self._foo = foo + super().__init__(**kwargs) - def __init__(self, foo, **kwargs): - self._foo = foo - super().__init__(**kwargs) - - def get_foo(self): - return self._foo + def get_foo(self): + return self._foo class SubclassedModel(training_lib.Model): + def __init__(self, bar, **kwargs): + self._bar = bar + super().__init__(**kwargs) - def __init__(self, bar, **kwargs): - self._bar = bar - super().__init__(**kwargs) - - def get_bar(self): - return self._bar + def get_bar(self): + return self._bar class MultipleInheritanceModelTest(test_combinations.TestCase): - - def testFunctionalSubclass(self): - m = FunctionalSubclassModel() - # Some smoke test for the weights and output shape of the model - self.assertLen(m.weights, 2) - self.assertEqual(m.outputs[0].shape.as_list(), [None, 32]) - - def testFunctionalSubclassPreMixin(self): - class MixedFunctionalSubclassModel(MixinClass, FunctionalSubclassModel): - pass - - m = MixedFunctionalSubclassModel(foo='123') - self.assertTrue(m._is_graph_network) - self.assertLen(m.weights, 2) - self.assertEqual(m.outputs[0].shape.as_list(), [None, 32]) - self.assertEqual(m.get_foo(), '123') - - def testFunctionalSubclassPostMixin(self): - # Make sure the the mixin class is also init correct when the order changed. - - class MixedFunctionalSubclassModel(FunctionalSubclassModel, MixinClass): - pass - - m = MixedFunctionalSubclassModel(foo='123') - self.assertTrue(m._is_graph_network) - self.assertLen(m.weights, 2) - self.assertEqual(m.outputs[0].shape.as_list(), [None, 32]) - self.assertEqual(m.get_foo(), '123') - - def testSubclassModelPreMixin(self): - class MixedSubclassModel(MixinClass, SubclassedModel): - pass - - m = MixedSubclassModel(foo='123', bar='456') - self.assertFalse(m._is_graph_network) - self.assertEqual(m.get_foo(), '123') - self.assertEqual(m.get_bar(), '456') - - -if __name__ == '__main__': - tf.test.main() + def testFunctionalSubclass(self): + m = FunctionalSubclassModel() + # Some smoke test for the weights and output shape of the model + self.assertLen(m.weights, 2) + self.assertEqual(m.outputs[0].shape.as_list(), [None, 32]) + + def testFunctionalSubclassPreMixin(self): + class MixedFunctionalSubclassModel(MixinClass, FunctionalSubclassModel): + pass + + m = MixedFunctionalSubclassModel(foo="123") + self.assertTrue(m._is_graph_network) + self.assertLen(m.weights, 2) + self.assertEqual(m.outputs[0].shape.as_list(), [None, 32]) + self.assertEqual(m.get_foo(), "123") + + def testFunctionalSubclassPostMixin(self): + # Make sure the mixin class is also initialized correctly when the + # order is changed.
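Before the reformatted test below, it is worth noting the pattern these mixin tests depend on: `MixinClass` and `SubclassedModel` use cooperative multiple inheritance, where each `__init__` consumes its own keyword arguments and forwards the rest, so construction works for either MRO order. A minimal standalone sketch (illustrative names):

```python
class LabelMixin:  # stands in for MixinClass
    def __init__(self, label, **kwargs):
        self._label = label
        super().__init__(**kwargs)  # keep forwarding along the MRO


class Base:  # stands in for the Model base class
    def __init__(self, **kwargs):
        super().__init__(**kwargs)


class MixinFirst(LabelMixin, Base):
    pass


class MixinLast(Base, LabelMixin):
    pass


assert MixinFirst(label="123")._label == "123"
assert MixinLast(label="456")._label == "456"
```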
+ + class MixedFunctionalSubclassModel(FunctionalSubclassModel, MixinClass): + pass + + m = MixedFunctionalSubclassModel(foo="123") + self.assertTrue(m._is_graph_network) + self.assertLen(m.weights, 2) + self.assertEqual(m.outputs[0].shape.as_list(), [None, 32]) + self.assertEqual(m.get_foo(), "123") + + def testSubclassModelPreMixin(self): + class MixedSubclassModel(MixinClass, SubclassedModel): + pass + + m = MixedSubclassModel(foo="123", bar="456") + self.assertFalse(m._is_graph_network) + self.assertEqual(m.get_foo(), "123") + self.assertEqual(m.get_bar(), "456") + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/functional_utils.py b/keras/engine/functional_utils.py index bd4e2e77eafa..bfc4acc4104a 100644 --- a/keras/engine/functional_utils.py +++ b/keras/engine/functional_utils.py @@ -14,235 +14,247 @@ # ============================================================================== """Utilities for keras functional model.""" +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine import input_layer as input_layer_module from keras.engine import keras_tensor from keras.engine import node as node_module -import tensorflow.compat.v2 as tf - _KERAS_TENSOR_TYPE_CHECK_ERROR_MSG = ( - 'Found unexpected instance while processing input tensors for keras ' - 'functional model. Expecting KerasTensor which is from tf.keras.Input() ' - 'or output from keras layer call(). Got: {}') + "Found unexpected instance while processing input tensors for keras " + "functional model. Expecting KerasTensor which is from tf.keras.Input() " + "or output from keras layer call(). Got: {}" +) def is_input_keras_tensor(tensor): - """Check if tensor is directly generated from `tf.keras.Input`. + """Check if tensor is directly generated from `tf.keras.Input`. - This check is useful when constructing the functional model, since we will - need to clone Nodes and KerasTensors if the model is building from non input - tensor. + This check is useful when constructing the functional model, since we will + need to clone Nodes and KerasTensors if the model is building from non input + tensor. - Args: - tensor: A `KerasTensor` as inputs to the functional model. + Args: + tensor: A `KerasTensor` as inputs to the functional model. - Returns: - bool. Whether the tensor is directly generated from `tf.keras.Input`. + Returns: + bool. Whether the tensor is directly generated from `tf.keras.Input`. - Raises: - ValueError: if the tensor is not a KerasTensor instance. - """ - if not node_module.is_keras_tensor(tensor): - raise ValueError(_KERAS_TENSOR_TYPE_CHECK_ERROR_MSG.format(tensor)) - return tensor.node.is_input + Raises: + ValueError: if the tensor is not a KerasTensor instance. + """ + if not node_module.is_keras_tensor(tensor): + raise ValueError(_KERAS_TENSOR_TYPE_CHECK_ERROR_MSG.format(tensor)) + return tensor.node.is_input def find_nodes_by_inputs_and_outputs(inputs, outputs): - """Fetch all Nodes in the graph defined by "inputs" and "outputs". - - This method is used to find and then clone Nodes when creating a new - sub-model from an existing functional model. - - Args: - inputs: A nested structure of KerasTensor to use as model inputs. - outputs: A nested structure of KerasTensor to use as model outputs. - - Returns: - A list of Nodes that are connected to the inputs and outputs. - - Raises: - ValueError: when inputs and outputs are disconnected or in case of - unexpected objects in the inputs/outputs. 
- """ - # We walk the graph bottom up, starting from output nodes, and keep tracing - # the upstream node, until we find all the inputs nodes. We don't use top - # down search here since we don't know whether a certain node is in the graph - # between inputs and outputs, e.g. a functional graph could have multiple - # outputs, and the user could choose a subset of them to build the model. - # The bottom up approach will ensure all the nodes we visit are actually - # in use. If we reach the top and didn't find the nodes in the `inputs`, - # that's an error, since the user didn't specify the correct inputs. - start_keras_tensors = tf.nest.flatten(outputs) - end_keras_tensors = tf.nest.flatten(inputs) - - for t in start_keras_tensors + end_keras_tensors: - if not node_module.is_keras_tensor(t): - raise ValueError(_KERAS_TENSOR_TYPE_CHECK_ERROR_MSG.format(t)) - end_ids = set([id(kt) for kt in end_keras_tensors]) - # Track all the end tensors we found so far, if we didn't reach all the - # user-specified keras inputs after we finish the search, then that's an - # error since the inputs are disconnected from the outputs. - end_ids_found = set() - - nodes_to_visit = [] - nodes_in_graph = [] - node_id_visited = set() - for t in start_keras_tensors: - nodes_to_visit.append(t.node) - - while nodes_to_visit: - node = nodes_to_visit.pop(0) - if id(node) in node_id_visited: - continue - node_id_visited.add(id(node)) - nodes_in_graph.append(node) - # Any input keras_tensor that produce the current node. - for kt in node.keras_inputs: - if id(kt) in end_ids: - # We found the inputs of the model, stop tracing upstream nodes - end_ids_found.add(id(kt)) - continue - - inbound_node = kt.node - # In case this is the tf.keras.Input node, we have reached the end of the - # tracing of upstream nodes. Any further tracing will just be an - # infinite loop. we should raise an error here since we didn't find the - # input in the user-specified inputs. - if inbound_node.is_input: - raise ValueError('Found input tensor cannot be reached given provided ' - 'output tensors. Please make sure the tensor {} is ' - 'included in the model inputs when building ' - 'functional model.'.format(kt)) - nodes_to_visit.append(inbound_node) - - # Do a final check and make sure we have reached all the user-specified inputs - if end_ids != end_ids_found: - unvisited_inputs = [kt for kt in end_keras_tensors - if id(kt) not in end_ids_found] - raise ValueError('Found unvisited input tensors that are disconnected from ' - 'the outputs: {}'.format(unvisited_inputs)) - return nodes_in_graph + """Fetch all Nodes in the graph defined by "inputs" and "outputs". + + This method is used to find and then clone Nodes when creating a new + sub-model from an existing functional model. + + Args: + inputs: A nested structure of KerasTensor to use as model inputs. + outputs: A nested structure of KerasTensor to use as model outputs. + + Returns: + A list of Nodes that are connected to the inputs and outputs. + + Raises: + ValueError: when inputs and outputs are disconnected or in case of + unexpected objects in the inputs/outputs. + """ + # We walk the graph bottom up, starting from output nodes, and keep tracing + # the upstream node, until we find all the inputs nodes. We don't use top + # down search here since we don't know whether a certain node is in the + # graph between inputs and outputs, e.g. a functional graph could have + # multiple outputs, and the user could choose a subset of them to build the + # model. 
The bottom up approach will ensure all the nodes we visit are + # actually in use. If we reach the top and didn't find the nodes in the + # `inputs`, that's an error, since the user didn't specify the correct + # inputs. + start_keras_tensors = tf.nest.flatten(outputs) + end_keras_tensors = tf.nest.flatten(inputs) + + for t in start_keras_tensors + end_keras_tensors: + if not node_module.is_keras_tensor(t): + raise ValueError(_KERAS_TENSOR_TYPE_CHECK_ERROR_MSG.format(t)) + end_ids = set([id(kt) for kt in end_keras_tensors]) + # Track all the end tensors we found so far, if we didn't reach all the + # user-specified keras inputs after we finish the search, then that's an + # error since the inputs are disconnected from the outputs. + end_ids_found = set() + + nodes_to_visit = [] + nodes_in_graph = [] + node_id_visited = set() + for t in start_keras_tensors: + nodes_to_visit.append(t.node) + + while nodes_to_visit: + node = nodes_to_visit.pop(0) + if id(node) in node_id_visited: + continue + node_id_visited.add(id(node)) + nodes_in_graph.append(node) + # Any input keras_tensor that produce the current node. + for kt in node.keras_inputs: + if id(kt) in end_ids: + # We found the inputs of the model, stop tracing upstream nodes + end_ids_found.add(id(kt)) + continue + + inbound_node = kt.node + # In case this is the tf.keras.Input node, we have reached the end + # of the tracing of upstream nodes. Any further tracing will just be + # an infinite loop. we should raise an error here since we didn't + # find the input in the user-specified inputs. + if inbound_node.is_input: + raise ValueError( + "Found input tensor cannot be reached given provided " + "output tensors. Please make sure the tensor {} is " + "included in the model inputs when building " + "functional model.".format(kt) + ) + nodes_to_visit.append(inbound_node) + + # Do a final check and make sure we have reached all the user-specified + # inputs + if end_ids != end_ids_found: + unvisited_inputs = [ + kt for kt in end_keras_tensors if id(kt) not in end_ids_found + ] + raise ValueError( + "Found unvisited input tensors that are disconnected from " + "the outputs: {}".format(unvisited_inputs) + ) + return nodes_in_graph def clone_graph_nodes(inputs, outputs): - """Clone the `Node` between the inputs and output tensors. - - This function is used to create a new functional model from any intermediate - keras tensors. The clone of the nodes mimic the behavior of reconstructing the - functional graph network by re-executing all the __call__ methods. The cloned - nodes will be appended to the layers. - - Note that a new tf.keras.Inputs will be created for any items in the `inputs` - - Args: - inputs: A nested structure of keras_tensors. - outputs: A nested structure of keras_tensors. - - Returns: - A pair of inputs and outputs, with cloned keras_tensors. They can be used to - create a new functional model. - """ - nodes_to_clone = find_nodes_by_inputs_and_outputs(inputs, outputs) - cloned_inputs = [] - cloned_outputs = [] - # We not only need to create copies of Nodes (mimic the calls), also need to - # clone keras_tensors to avoid the override of _keras_history attached on the - # keras_tensor. The following dict is used to track any keras tensor we cloned - # The key is the string ID of the original keras tensor, and value is the - # cloned keras_tensor instance. - kt_id_mapping = {} - - for kt_input in tf.nest.flatten(inputs): - if kt_input.node.is_input: - # For any existing keras_tensor from tf.keras.Input, we leave them as is. 
- cloned_inputs.append(kt_input) - kt_id_mapping[id(kt_input)] = kt_input - else: - # We need to create a new tf.keras.Input for any intermediate keras_tensor - cpy = _clone_keras_tensor(kt_input) - cloned_input = input_layer_module.Input(tensor=cpy) - cloned_inputs.append(cloned_input) - kt_id_mapping[id(kt_input)] = cloned_input - cloned_inputs = tf.nest.pack_sequence_as(inputs, cloned_inputs) - - for kt_output in tf.nest.flatten(outputs): - cpy = _clone_keras_tensor(kt_output) - # We reuse the _keras_history here, which contains the old information. It - # is used in the Node constructor to check if the tensor "is_keras_tensor()" - # The history will be override by the Node constructor anyway for the - # corresponding layer output anyway. - cpy._keras_history = kt_output._keras_history # pylint: disable=protected-access - cloned_outputs.append(cpy) - kt_id_mapping[id(kt_output)] = cpy - cloned_outputs = tf.nest.pack_sequence_as(outputs, cloned_outputs) - - for node in nodes_to_clone: - # Clone any keras_tensors to avoid override of _keras_history - # Or reuse an existing keras_tensor if it has already been cloned. - output_copy = clone_keras_tensors(node.output_tensors, kt_id_mapping) - call_args_copy = clone_keras_tensors(node.call_args, kt_id_mapping) - call_kwargs_copy = clone_keras_tensors(node.call_kwargs, kt_id_mapping) - # Creating new nodes based on the existing node information. - # Node wires itself to inbound and outbound layers. - # The Node constructor actually updates this layer's self._inbound_nodes, - # sets _keras_history on the outputs, and adds itself to the - # `_outbound_nodes` of the layers that produced the inputs to this - # layer call. - node_module.Node(node.layer, - call_args=call_args_copy, - call_kwargs=call_kwargs_copy, - outputs=output_copy) - return cloned_inputs, cloned_outputs + """Clone the `Node` between the inputs and output tensors. + + This function is used to create a new functional model from any intermediate + keras tensors. Cloning the nodes mimics the behavior of reconstructing the + functional graph network by re-executing all the __call__ methods. The + cloned nodes will be appended to the layers. + + Note that a new tf.keras.Input will be created for any item in `inputs`. + + Args: + inputs: A nested structure of keras_tensors. + outputs: A nested structure of keras_tensors. + + Returns: + A pair of inputs and outputs, with cloned keras_tensors. They can be used + to create a new functional model. + """ + nodes_to_clone = find_nodes_by_inputs_and_outputs(inputs, outputs) + cloned_inputs = [] + cloned_outputs = [] + # We not only need to create copies of Nodes (to mimic the calls), but + # also need to clone the keras_tensors to avoid overriding the + # _keras_history attached to them. The following dict tracks every keras + # tensor we have cloned. The key is the ID of the original keras tensor + # and the value is the cloned keras_tensor instance. + kt_id_mapping = {} + + for kt_input in tf.nest.flatten(inputs): + if kt_input.node.is_input: + # For any existing keras_tensor from tf.keras.Input, we leave them + # as is.
+ cloned_inputs.append(kt_input) + kt_id_mapping[id(kt_input)] = kt_input + else: + # We need to create a new tf.keras.Input for any intermediate + # keras_tensor + cpy = _clone_keras_tensor(kt_input) + cloned_input = input_layer_module.Input(tensor=cpy) + cloned_inputs.append(cloned_input) + kt_id_mapping[id(kt_input)] = cloned_input + cloned_inputs = tf.nest.pack_sequence_as(inputs, cloned_inputs) + + for kt_output in tf.nest.flatten(outputs): + cpy = _clone_keras_tensor(kt_output) + # We reuse the _keras_history here, which contains the old information. + # It is used in the Node constructor to check if the tensor + # "is_keras_tensor()". The history will be overridden by the Node + # constructor for the corresponding layer output anyway. + cpy._keras_history = kt_output._keras_history + cloned_outputs.append(cpy) + kt_id_mapping[id(kt_output)] = cpy + cloned_outputs = tf.nest.pack_sequence_as(outputs, cloned_outputs) + + for node in nodes_to_clone: + # Clone any keras_tensors to avoid overriding _keras_history, or reuse + # an existing keras_tensor if it has already been cloned. + output_copy = clone_keras_tensors(node.output_tensors, kt_id_mapping) + call_args_copy = clone_keras_tensors(node.call_args, kt_id_mapping) + call_kwargs_copy = clone_keras_tensors(node.call_kwargs, kt_id_mapping) + # Creating new nodes based on the existing node information. Node wires + # itself to inbound and outbound layers. The Node constructor actually + # updates this layer's self._inbound_nodes, sets _keras_history on the + # outputs, and adds itself to the `_outbound_nodes` of the layers that + # produced the inputs to this layer call. + node_module.Node( + node.layer, + call_args=call_args_copy, + call_kwargs=call_kwargs_copy, + outputs=output_copy, + ) + return cloned_inputs, cloned_outputs def clone_keras_tensors(args, keras_tensor_mapping): - """Clone the keras tensors from the inputs. - - For any KerasTensor instance in the `args`, a new copy of KerasTensor will - be created if it has not been cloned yet (by checking the - `keras_tensor_mapping`). For any other types, the instance will be unchanged. - This function is useful for cloning the Nodes since KerasTensor can't be - reused across the models. - - Args: - args: A nested structure of objects, which could contain KerasTensor. - keras_tensor_mapping: A dict contains the ID of original KerasTensor, and - the cloned KerasTensor instance. The dict will be updated with newly - copied KerasTensor instances within this method. - Returns: - Same structure as inputs, with KerasTensor cloned. - """ - result = [] - for obj in tf.nest.flatten(args): - if node_module.is_keras_tensor(obj): - if id(obj) in keras_tensor_mapping: - cpy = keras_tensor_mapping[id(obj)] - else: - # Create copy of keras_tensor if we haven't done it before - cpy = _clone_keras_tensor(obj) - cpy._keras_history = obj._keras_history # pylint: disable=protected-access - keras_tensor_mapping[id(obj)] = cpy - result.append(cpy) - else: - result.append(obj) - return tf.nest.pack_sequence_as(args, result) + """Clone the keras tensors from the inputs. + + For any KerasTensor instance in the `args`, a new copy of KerasTensor will + be created if it has not been cloned yet (by checking the + `keras_tensor_mapping`). For any other types, the instance will be + unchanged. This function is useful for cloning the Nodes since KerasTensor + can't be reused across models. + + Args: + args: A nested structure of objects, which could contain KerasTensor.
+ keras_tensor_mapping: A dict contains the ID of original KerasTensor, and + the cloned KerasTensor instance. The dict will be updated with newly + copied KerasTensor instances within this method. + Returns: + Same structure as inputs, with KerasTensor cloned. + """ + result = [] + for obj in tf.nest.flatten(args): + if node_module.is_keras_tensor(obj): + if id(obj) in keras_tensor_mapping: + cpy = keras_tensor_mapping[id(obj)] + else: + # Create copy of keras_tensor if we haven't done it before + cpy = _clone_keras_tensor(obj) + cpy._keras_history = obj._keras_history + keras_tensor_mapping[id(obj)] = cpy + result.append(cpy) + else: + result.append(obj) + return tf.nest.pack_sequence_as(args, result) def _clone_keras_tensor(kt): - """Create an identical keras_tensor based on the input. - - We use keras_tensor_to_placeholder and keras_tensor_from_tensor to make sure - inferred shape are not lost during the copy. - - Args: - kt: the input KerasTensor. - - Returns: - An identical copy of the input KerasTensor. - """ - # Create a scratch graph since we don't intend to use the placeholders. - with backend._scratch_graph() as scratch_graph: # pylint: disable=protected-access - with scratch_graph.as_default(): - placeholder = keras_tensor.keras_tensor_to_placeholder(kt) - return keras_tensor.keras_tensor_from_tensor(placeholder) + """Create an identical keras_tensor based on the input. + + We use keras_tensor_to_placeholder and keras_tensor_from_tensor to make sure + inferred shape are not lost during the copy. + + Args: + kt: the input KerasTensor. + + Returns: + An identical copy of the input KerasTensor. + """ + # Create a scratch graph since we don't intend to use the placeholders. + with backend._scratch_graph() as scratch_graph: + with scratch_graph.as_default(): + placeholder = keras_tensor.keras_tensor_to_placeholder(kt) + return keras_tensor.keras_tensor_from_tensor(placeholder) diff --git a/keras/engine/functional_utils_test.py b/keras/engine/functional_utils_test.py index aeb6dc163d9f..3d5be79a157c 100644 --- a/keras/engine/functional_utils_test.py +++ b/keras/engine/functional_utils_test.py @@ -11,200 +11,257 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
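Taken together, `find_nodes_by_inputs_and_outputs`, `clone_graph_nodes`, and `clone_keras_tensors` are what let `Model` accept intermediate tensors. A short public-API sketch of the effect (the test file below exercises this in depth):

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(8,))
x = tf.keras.layers.Dense(32)(inputs)  # intermediate KerasTensor
y = tf.keras.layers.Dense(16)(x)

# `x` is not a `tf.keras.Input`, so Keras walks the graph bottom-up from
# `y`, clones the Nodes it visits, and creates a fresh Input for `x`.
sub_model = tf.keras.Model(x, y)
assert len(sub_model.layers) == 2  # one InputLayer plus the second Dense
```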
-#,============================================================================ +# ,============================================================================ """Tests for functional_utils.""" import collections import os +import numpy as np +import tensorflow.compat.v2 as tf + from keras import layers from keras import models from keras.engine import functional_utils from keras.engine import input_layer as input_layer_lib from keras.testing_infra import test_combinations -import numpy as np -import tensorflow.compat.v2 as tf - @test_combinations.run_all_keras_modes(always_skip_v1=True) class FunctionalModelSlideTest(test_combinations.TestCase): - - def test_find_nodes_by_inputs_and_outputs(self): - inputs = input_layer_lib.Input((10,)) - unconnected_inputs = input_layer_lib.Input((10,)) - x = layers.Dense(8)(inputs) - y = layers.Dense(6)(x) - output = layers.Dense(4)(y) - - nodes_in_graph = functional_utils.find_nodes_by_inputs_and_outputs( - x, output) - self.assertLen(nodes_in_graph, 2) - expected_nodes = [output.node, y.node] - self.assertCountEqual(nodes_in_graph, expected_nodes) - - # Make sure we raise error if we specify invalid input/output pair - with self.assertRaisesRegex( - ValueError, 'Found input tensor cannot be reached'): - functional_utils.find_nodes_by_inputs_and_outputs(output, x) - - with self.assertRaisesRegex( - ValueError, 'Found input tensor cannot be reached'): - functional_utils.find_nodes_by_inputs_and_outputs(unconnected_inputs, - output) - - with self.assertRaisesRegex( - ValueError, 'Found unvisited input tensors that are disconnected'): - functional_utils.find_nodes_by_inputs_and_outputs( - [inputs, unconnected_inputs], output) - - def test_find_nodes_by_inputs_and_outputs_with_complicated_network(self): - input1 = input_layer_lib.Input((10,)) - input2 = input_layer_lib.Input((10,)) - input3 = input_layer_lib.Input((10,)) - unconnected_input = input_layer_lib.Input((10,)) - - dense1 = layers.Dense(4, name='dense1') - dense2 = layers.Dense(4, name='dense2') - # dense1 are shared between input1 and input2 - a = dense1(input1) - b = dense1(input2) - - c = layers.Add()([a, b]) - d = dense2(input3) - e = layers.Add()([c, d]) - # There are 5 nodes (invoke of __call__) in the graph. 
- - nodes = functional_utils.find_nodes_by_inputs_and_outputs(input1, a) - self.assertCountEqual(nodes, [a.node]) - - nodes = functional_utils.find_nodes_by_inputs_and_outputs(input2, b) - self.assertCountEqual(nodes, [b.node]) - - nodes = functional_utils.find_nodes_by_inputs_and_outputs([input2, input1], - c) - # This should contains 2 dense call and 1 add - self.assertCountEqual(nodes, [a.node, b.node, c.node]) - - # Missing input3 - with self.assertRaisesRegex( - ValueError, 'Found input tensor cannot be reached'): - functional_utils.find_nodes_by_inputs_and_outputs([input1, input2], e) - - nodes = functional_utils.find_nodes_by_inputs_and_outputs( - [input1, input2, input3], e) - self.assertCountEqual(nodes, [a.node, b.node, c.node, d.node, e.node]) - - # Make sure we can create from intermediate tensors - nodes = functional_utils.find_nodes_by_inputs_and_outputs([a, b, input3], e) - self.assertCountEqual(nodes, [c.node, d.node, e.node]) - # Also make sure we can add intermediate outputs - nodes = functional_utils.find_nodes_by_inputs_and_outputs([a, b, input3], - [d, e]) - self.assertCountEqual(nodes, [c.node, d.node, e.node]) - - # input1 and 2 are not needed for computing d - with self.assertRaisesRegex( - ValueError, 'Found unvisited input tensors that are disconnected'): - functional_utils.find_nodes_by_inputs_and_outputs( - [input1, input2, input3], d) - - with self.assertRaisesRegex( - ValueError, 'Found unvisited input tensors that are disconnected'): - functional_utils.find_nodes_by_inputs_and_outputs( - [a, b, input3, unconnected_input], [e, d, c]) - - def test_build_model_from_intermediate_tensor(self): - batch_size = 4 - inputs = input_layer_lib.Input(shape=(8,)) - layer1 = layers.Dense(32) - layer2 = layers.Dense(16) - x = layer1(inputs) - y = layer2(x) - model = models.Model(x, y) - # Make sure a new node is attached to layer2, which mimic y = layer2(x) - self.assertLen(layer2.inbound_nodes, 2) - - self.assertIsInstance(model, models.Model) - # The model only contains 1 dense layer and 1 input layer. - self.assertLen(model.layers, 2) - self.assertIs(model.layers[1], layer2) - - model.compile('rmsprop', 'mse') - model.fit(np.random.randn(batch_size, 32), np.random.randn(batch_size, 16)) - # Test for model saving - output_path = os.path.join(self.get_temp_dir(), 'tf_keras_saved_model') - model.save(output_path, save_format='tf') - loaded_model = models.load_model(output_path) - self.assertEqual(model.summary(), loaded_model.summary()) - - # Also make sure the original inputs and y can still be used to build model - new_model = models.Model(inputs, y) - # Make sure no new node is attached to layer2 - self.assertLen(layer2.inbound_nodes, 2) - - self.assertLen(new_model.layers, 3) - self.assertIs(new_model.layers[1], layer1) - self.assertIs(new_model.layers[2], layer2) - - def test_build_model_from_intermediate_tensor_with_complicated_model(self): - # The topology is like below: - # input1 -> dense1 -> a - # + -> c - + --> d - + --> output - # input2 -> dense1 -> b -------^ ^ - # input3 -> dense2 -> e -----------------| - batch_size = 8 - input1 = input_layer_lib.Input((2,)) - input2 = input_layer_lib.Input((2,)) - input3 = input_layer_lib.Input((8,)) - - dense1 = layers.Dense(8, name='dense1') - dense2 = layers.Dense(8, name='dense2') - - # dense1 are shared between input1 and input2 - a = dense1(input1) - b = dense1(input2) - - c = layers.Add()([a, b]) - # d has a residual connection from b. 
- d = layers.Add()([b, c]) - e = dense2(input3) - output = layers.Add()([d, e]) - - # We skip the input2 here and use b instead. - model = models.Model([input1, b, input3], output) - # Make sure we have 8 layers, 3 for inputs, 2 for dense and 3 for Add. - # Note that dense1 is still in use by input1. - self.assertLen(model.layers, 8) - # Since the layers are not ordered, let's check class of the layers to make - # sure it match the expectation. - class_count = collections.Counter([l.__class__ for l in model.layers]) - self.assertEqual(class_count[input_layer_lib.InputLayer], 3) - self.assertEqual(class_count[layers.Dense], 2) - self.assertEqual(class_count[layers.Add], 3) - - model.compile('rmsprop', 'mse') - model.fit([np.random.randn(batch_size, 2), - np.random.randn(batch_size, 8), # The shape of b is (batch, 8) - np.random.randn(batch_size, 8)], - np.random.randn(batch_size, 8)) - output_path = os.path.join(self.get_temp_dir(), 'tf_keras_saved_model') - model.save(output_path, save_format='tf') - loaded_model = models.load_model(output_path) - self.assertEqual(model.summary(), loaded_model.summary()) - - model2 = models.Model([a, b], d) - # 2 input layers and 2 Add layer. - self.assertLen(model2.layers, 4) - class_count = collections.Counter([l.__class__ for l in model2.layers]) - self.assertEqual(class_count[input_layer_lib.InputLayer], 2) - self.assertEqual(class_count[layers.Add], 2) - - model2.compile('rmsprop', 'mse') - model2.fit([np.random.randn(batch_size, 8), - np.random.randn(batch_size, 8)], - np.random.randn(batch_size, 8)) - - -if __name__ == '__main__': - tf.test.main() + def test_find_nodes_by_inputs_and_outputs(self): + inputs = input_layer_lib.Input((10,)) + unconnected_inputs = input_layer_lib.Input((10,)) + x = layers.Dense(8)(inputs) + y = layers.Dense(6)(x) + output = layers.Dense(4)(y) + + nodes_in_graph = functional_utils.find_nodes_by_inputs_and_outputs( + x, output + ) + self.assertLen(nodes_in_graph, 2) + expected_nodes = [output.node, y.node] + self.assertCountEqual(nodes_in_graph, expected_nodes) + + # Make sure we raise error if we specify invalid input/output pair + with self.assertRaisesRegex( + ValueError, "Found input tensor cannot be reached" + ): + functional_utils.find_nodes_by_inputs_and_outputs(output, x) + + with self.assertRaisesRegex( + ValueError, "Found input tensor cannot be reached" + ): + functional_utils.find_nodes_by_inputs_and_outputs( + unconnected_inputs, output + ) + + with self.assertRaisesRegex( + ValueError, "Found unvisited input tensors that are disconnected" + ): + functional_utils.find_nodes_by_inputs_and_outputs( + [inputs, unconnected_inputs], output + ) + + def test_find_nodes_by_inputs_and_outputs_with_complicated_network(self): + input1 = input_layer_lib.Input((10,)) + input2 = input_layer_lib.Input((10,)) + input3 = input_layer_lib.Input((10,)) + unconnected_input = input_layer_lib.Input((10,)) + + dense1 = layers.Dense(4, name="dense1") + dense2 = layers.Dense(4, name="dense2") + # dense1 are shared between input1 and input2 + a = dense1(input1) + b = dense1(input2) + + c = layers.Add()([a, b]) + d = dense2(input3) + e = layers.Add()([c, d]) + # There are 5 nodes (invoke of __call__) in the graph. 
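The node count in the comment above follows from one `Node` being recorded per layer `__call__`; sharing `dense1` across two inputs therefore contributes two of the five nodes. A quick standalone check (using the same public `inbound_nodes` attribute these tests rely on):

```python
import tensorflow as tf

shared = tf.keras.layers.Dense(4, name="dense1")
a = shared(tf.keras.Input((10,)))
b = shared(tf.keras.Input((10,)))
assert len(shared.inbound_nodes) == 2  # one Node per __call__
```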
+ + nodes = functional_utils.find_nodes_by_inputs_and_outputs(input1, a) + self.assertCountEqual(nodes, [a.node]) + + nodes = functional_utils.find_nodes_by_inputs_and_outputs(input2, b) + self.assertCountEqual(nodes, [b.node]) + + nodes = functional_utils.find_nodes_by_inputs_and_outputs( + [input2, input1], c + ) + # This should contain 2 dense calls and 1 add + self.assertCountEqual(nodes, [a.node, b.node, c.node]) + + # Missing input3 + with self.assertRaisesRegex( + ValueError, "Found input tensor cannot be reached" + ): + functional_utils.find_nodes_by_inputs_and_outputs( + [input1, input2], e + ) + + nodes = functional_utils.find_nodes_by_inputs_and_outputs( + [input1, input2, input3], e + ) + self.assertCountEqual(nodes, [a.node, b.node, c.node, d.node, e.node]) + + # Make sure we can create from intermediate tensors + nodes = functional_utils.find_nodes_by_inputs_and_outputs( + [a, b, input3], e + ) + self.assertCountEqual(nodes, [c.node, d.node, e.node]) + # Also make sure we can add intermediate outputs + nodes = functional_utils.find_nodes_by_inputs_and_outputs( + [a, b, input3], [d, e] + ) + self.assertCountEqual(nodes, [c.node, d.node, e.node]) + + # input1 and 2 are not needed for computing d + with self.assertRaisesRegex( + ValueError, "Found unvisited input tensors that are disconnected" + ): + functional_utils.find_nodes_by_inputs_and_outputs( + [input1, input2, input3], d + ) + + with self.assertRaisesRegex( + ValueError, "Found unvisited input tensors that are disconnected" + ): + functional_utils.find_nodes_by_inputs_and_outputs( + [a, b, input3, unconnected_input], [e, d, c] + ) + + def test_build_model_from_intermediate_tensor(self): + batch_size = 4 + inputs = input_layer_lib.Input(shape=(8,)) + layer1 = layers.Dense(32) + layer2 = layers.Dense(16) + x = layer1(inputs) + y = layer2(x) + model = models.Model(x, y) + # Make sure a new node is attached to layer2, which mimics y = layer2(x) + self.assertLen(layer2.inbound_nodes, 2) + + self.assertIsInstance(model, models.Model) + # The model only contains 1 dense layer and 1 input layer. + self.assertLen(model.layers, 2) + self.assertIs(model.layers[1], layer2) + + model.compile("rmsprop", "mse") + model.fit( + np.random.randn(batch_size, 32), np.random.randn(batch_size, 16) + ) + + # Also make sure the original inputs and y can still be used to build + # a model + new_model = models.Model(inputs, y) + # Make sure no new node is attached to layer2 + self.assertLen(layer2.inbound_nodes, 2) + + self.assertLen(new_model.layers, 3) + self.assertIs(new_model.layers[1], layer1) + self.assertIs(new_model.layers[2], layer2) + + # Test for model saving + with self.subTest("savedmodel"): + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model" + ) + model.save(output_path, save_format="tf") + loaded_model = models.load_model(output_path) + self.assertEqual(model.summary(), loaded_model.summary()) + + with self.subTest("keras_v3"): + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving."
+ ) + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_v3_model.keras" + ) + model.save(output_path, save_format="keras_v3") + loaded_model = models.load_model(output_path) + self.assertEqual(model.summary(), loaded_model.summary()) + + def test_build_model_from_intermediate_tensor_with_complicated_model(self): + # The topology is like below: + # input1 -> dense1 -> a + # + -> c - + --> d - + --> output + # input2 -> dense1 -> b -------^ ^ + # input3 -> dense2 -> e -----------------| + batch_size = 8 + input1 = input_layer_lib.Input((2,)) + input2 = input_layer_lib.Input((2,)) + input3 = input_layer_lib.Input((8,)) + + dense1 = layers.Dense(8, name="dense1") + dense2 = layers.Dense(8, name="dense2") + + # dense1 are shared between input1 and input2 + a = dense1(input1) + b = dense1(input2) + + c = layers.Add()([a, b]) + # d has a residual connection from b. + d = layers.Add()([b, c]) + e = dense2(input3) + output = layers.Add()([d, e]) + + # We skip the input2 here and use b instead. + model = models.Model([input1, b, input3], output) + # Make sure we have 8 layers, 3 for inputs, 2 for dense and 3 for Add. + # Note that dense1 is still in use by input1. + self.assertLen(model.layers, 8) + # Since the layers are not ordered, let's check class of the layers to + # make sure it match the expectation. + class_count = collections.Counter([l.__class__ for l in model.layers]) + self.assertEqual(class_count[input_layer_lib.InputLayer], 3) + self.assertEqual(class_count[layers.Dense], 2) + self.assertEqual(class_count[layers.Add], 3) + + model.compile("rmsprop", "mse") + model.fit( + [ + np.random.randn(batch_size, 2), + np.random.randn(batch_size, 8), # The shape of b is (batch, 8) + np.random.randn(batch_size, 8), + ], + np.random.randn(batch_size, 8), + ) + + model2 = models.Model([a, b], d) + # 2 input layers and 2 Add layer. + self.assertLen(model2.layers, 4) + class_count = collections.Counter([l.__class__ for l in model2.layers]) + self.assertEqual(class_count[input_layer_lib.InputLayer], 2) + self.assertEqual(class_count[layers.Add], 2) + + model2.compile("rmsprop", "mse") + model2.fit( + [np.random.randn(batch_size, 8), np.random.randn(batch_size, 8)], + np.random.randn(batch_size, 8), + ) + + with self.subTest("savedmodel"): + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model" + ) + model.save(output_path, save_format="tf") + loaded_model = models.load_model(output_path) + self.assertEqual(model.summary(), loaded_model.summary()) + + with self.subTest("keras_v3"): + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving." + ) + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_v3_model.keras" + ) + model.save(output_path, save_format="keras_v3") + loaded_model = models.load_model(output_path) + self.assertEqual(model.summary(), loaded_model.summary()) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/input_layer.py b/keras/engine/input_layer.py index fd0e196d443d..b4f57818fb3d 100644 --- a/keras/engine/input_layer.py +++ b/keras/engine/input_layer.py @@ -12,252 +12,289 @@ # See the License for the specific language governing permissions and # limitations under the License. 
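The two `subTest` blocks above share one round-trip pattern; a condensed sketch using the same public save formats (`keras_v3` requires TF2, per the `skipTest` guard; the toy model is illustrative):

```python
import os
import tempfile

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
tmp = tempfile.mkdtemp()

# SavedModel format and the newer `.keras` archive format, as exercised
# by the subtests above.
model.save(os.path.join(tmp, "saved_model"), save_format="tf")
model.save(os.path.join(tmp, "model.keras"), save_format="keras_v3")

reloaded = tf.keras.models.load_model(os.path.join(tmp, "model.keras"))
assert reloaded.layers[0].units == 4
```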
# ============================================================================== -# pylint: disable=protected-access + """Input layer code (`Input` and `InputLayer`).""" import tensorflow.compat.v2 as tf + from keras import backend from keras.distribute import distributed_training_utils from keras.engine import base_layer from keras.engine import keras_tensor from keras.engine import node as node_module -from keras.saving.saved_model import layer_serialization +from keras.saving import serialization_lib +from keras.saving.legacy.saved_model import layer_serialization from keras.utils import tf_utils from keras.utils import traceback_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export def _assert_other_arg_none(arg_name, arg): - if arg is not None: - raise ValueError('When `type_spec` is not None, all other args ' - 'except `name` must be None, ' - 'but %s is not None.' % arg_name) + if arg is not None: + raise ValueError( + "When `type_spec` is not None, all other args " + "except `name` must be None, " + "but %s is not None." % arg_name + ) -@keras_export('keras.layers.InputLayer') +@keras_export("keras.layers.InputLayer") class InputLayer(base_layer.Layer): - """Layer to be used as an entry point into a Network (a graph of layers). - - It can either wrap an existing tensor (pass an `input_tensor` argument) - or create a placeholder tensor (pass arguments `input_shape`, and - optionally, `dtype`). - - It is generally recommend to use the Keras Functional model via `Input`, - (which creates an `InputLayer`) without directly using `InputLayer`. - - When using `InputLayer` with the Keras Sequential model, it can be skipped by - moving the `input_shape` parameter to the first layer after the `InputLayer`. - - This class can create placeholders for `tf.Tensors`, `tf.SparseTensors`, and - `tf.RaggedTensors` by choosing `sparse=True` or `ragged=True`. Note that - `sparse` and `ragged` can't be configured to `True` at the same time. - Usage: - - ```python - # With explicit InputLayer. - model = tf.keras.Sequential([ - tf.keras.layers.InputLayer(input_shape=(4,)), - tf.keras.layers.Dense(8)]) - model.compile(tf.optimizers.RMSprop(0.001), loss='mse') - model.fit(np.zeros((10, 4)), - np.ones((10, 8))) - - # Without InputLayer and let the first layer to have the input_shape. - # Keras will add a input for the model behind the scene. - model = tf.keras.Sequential([ - tf.keras.layers.Dense(8, input_shape=(4,))]) - model.compile(tf.optimizers.RMSprop(0.001), loss='mse') - model.fit(np.zeros((10, 4)), - np.ones((10, 8))) - ``` - - Args: - input_shape: Shape tuple (not including the batch axis), or `TensorShape` - instance (not including the batch axis). - batch_size: Optional input batch size (integer or `None`). - dtype: Optional datatype of the input. When not provided, the Keras - default `float` type will be used. - input_tensor: Optional tensor to use as layer input. If set, the layer - will use the `tf.TypeSpec` of this tensor rather - than creating a new placeholder tensor. - sparse: Boolean, whether the placeholder created is meant to be sparse. - Default to `False`. - ragged: Boolean, whether the placeholder created is meant to be ragged. - In this case, values of `None` in the `shape` argument represent - ragged dimensions. For more information about `tf.RaggedTensor`, see - [this guide](https://www.tensorflow.org/guide/ragged_tensor). - Default to `False`. - type_spec: A `tf.TypeSpec` object to create Input from. This `tf.TypeSpec` - represents the entire batch. 
When provided, all other args except - name must be `None`. - name: Optional name of the layer (string). - """ - - @traceback_utils.filter_traceback - def __init__(self, - input_shape=None, - batch_size=None, - dtype=None, - input_tensor=None, - sparse=None, - name=None, - ragged=None, - type_spec=None, - **kwargs): - self._init_input_shape = input_shape - self._init_batch_size = batch_size - self._init_dtype = dtype - self._init_sparse = sparse - self._init_ragged = ragged - self._init_type_spec = type_spec - - strategy = tf.distribute.get_strategy() - if strategy and batch_size is not None and \ - distributed_training_utils.global_batch_size_supported(strategy): - if batch_size % strategy.num_replicas_in_sync != 0: - raise ValueError('The `batch_size` argument ({}) must be divisible by ' - 'the number of replicas ({})'.format( - batch_size, strategy.num_replicas_in_sync)) - batch_size = batch_size // strategy.num_replicas_in_sync - - if 'batch_input_shape' in kwargs: - batch_input_shape = kwargs.pop('batch_input_shape') - if input_shape and batch_input_shape: - raise ValueError('Only provide the input_shape OR ' - 'batch_input_shape argument to ' - 'InputLayer, not both at the same time.') - # Set the input shape and batch size from the batch_input_shape. - # Note that batch_input_shape can be None (unknown rank) or [] (scalar), - # in which case the batch size must be None. - if batch_input_shape: - batch_size = batch_input_shape[0] - input_shape = batch_input_shape[1:] - if kwargs: - raise ValueError(f'Unrecognized keyword arguments: {list(kwargs.keys())}') - - if sparse and ragged: - raise ValueError( - 'Cannot set both sparse and ragged to True in a Keras input.') - - if not name: - prefix = 'input' - name = prefix + '_' + str(backend.get_uid(prefix)) - - if not dtype: - if input_tensor is None: - dtype = backend.floatx() - else: - dtype = backend.dtype(input_tensor) - elif input_tensor is not None and input_tensor.dtype != dtype: - raise ValueError( - '`input_tensor.dtype` differs from `dtype`. Received: ' - f'input_tensor.dtype={input_tensor.dtype} ' - f'but expected dtype={dtype}') - super().__init__(dtype=dtype, name=name) - self.built = True - self.sparse = True if sparse else False - self.ragged = True if ragged else False - self.batch_size = batch_size - self.supports_masking = True - - if isinstance(input_shape, tf.TensorShape): - input_shape = tuple(input_shape.as_list()) - elif isinstance(input_shape, int): - input_shape = (input_shape,) - - if type_spec is not None: - args_that_must_be_none = [ - ('(input_)shape', self._init_input_shape), - ('batch_size', self._init_batch_size), - ('dtype', self._init_dtype), - ('input_tensor', input_tensor), - ('sparse', self._init_sparse), - ('ragged', self._init_ragged), - ] - for arg_name, arg in args_that_must_be_none: - _assert_other_arg_none(arg_name, arg) - if not tf.compat.v1.executing_eagerly_outside_functions(): - raise ValueError('Creating Keras inputs from a type_spec is only ' - 'supported when eager execution is enabled.') - input_tensor = keras_tensor.keras_tensor_from_type_spec(type_spec) - if isinstance(input_tensor, keras_tensor.SparseKerasTensor): - self.sparse = True - if isinstance(input_tensor, keras_tensor.RaggedKerasTensor): - self.ragged = True - self.is_placeholder = True - try: - self._batch_input_shape = tuple(input_tensor.shape.as_list()) - except ValueError: - # If the shape cannot be represented as a tuple (e.g. 
unknown rank) - self._batch_input_shape = None - elif input_tensor is None: - if input_shape is not None: - batch_input_shape = (batch_size,) + tuple(input_shape) - else: - batch_input_shape = None - graph = backend.get_graph() - with graph.as_default(): - input_tensor = backend.placeholder( - shape=batch_input_shape, - dtype=dtype, - name=self.name, - sparse=sparse, - ragged=ragged) - - self.is_placeholder = True - self._batch_input_shape = batch_input_shape - else: - if tf.compat.v1.executing_eagerly_outside_functions(): - if not isinstance(input_tensor, keras_tensor.KerasTensor): - input_tensor = keras_tensor.keras_tensor_from_tensor(input_tensor) - else: - if not tf_utils.is_symbolic_tensor(input_tensor): - raise ValueError('You should not pass an EagerTensor to `Input`. ' - 'For example, instead of creating an ' - '`InputLayer`, you should instantiate your model ' - 'and directly call it on your input.') - self.is_placeholder = False - try: - self._batch_input_shape = tuple(input_tensor.shape.as_list()) - except ValueError: - # If the shape cannot be represented as a tuple (e.g. unknown rank) - self._batch_input_shape = None - # Create an input node. - input_tensor._keras_mask = None - node_module.Node(layer=self, outputs=input_tensor) - - # Store type spec - if isinstance(input_tensor, keras_tensor.KerasTensor) or ( - tf_utils.is_extension_type(input_tensor)): - self._type_spec = input_tensor._type_spec # pylint: disable=protected-access - else: - self._type_spec = tf.TensorSpec( - shape=input_tensor.shape, dtype=input_tensor.dtype, name=self.name) - - def get_config(self): - if self._init_type_spec is not None: - config = { - 'name': self.name, - 'type_spec': self._init_type_spec - } - else: - config = { - 'batch_input_shape': self._batch_input_shape, - 'dtype': self.dtype, - 'sparse': self.sparse, - 'ragged': self.ragged, - 'name': self.name, - } - return config - - @property - def _trackable_saved_model_saver(self): - return layer_serialization.InputLayerSavedModelSaver(self) - - -@keras_export('keras.Input', 'keras.layers.Input') + """Layer to be used as an entry point into a Network (a graph of layers). + + It can either wrap an existing tensor (pass an `input_tensor` argument) + or create a placeholder tensor (pass arguments `input_shape`, and + optionally, `dtype`). + + It is generally recommended to use the Keras Functional model via `Input`, + (which creates an `InputLayer`) without directly using `InputLayer`. + + When using `InputLayer` with the Keras Sequential model, it can be skipped + by moving the `input_shape` parameter to the first layer after the + `InputLayer`. + + This class can create placeholders for `tf.Tensors`, `tf.SparseTensors`, and + `tf.RaggedTensors` by choosing `sparse=True` or `ragged=True`. Note that + `sparse` and `ragged` can't be configured to `True` at the same time. + Usage: + + ```python + # With explicit InputLayer. + model = tf.keras.Sequential([ + tf.keras.layers.InputLayer(input_shape=(4,)), + tf.keras.layers.Dense(8)]) + model.compile(tf.keras.optimizers.RMSprop(0.001), loss='mse') + model.fit(np.zeros((10, 4)), + np.ones((10, 8))) + + # Without InputLayer, let the first layer have the input_shape. + # Keras will add an input for the model behind the scenes.
+ model = tf.keras.Sequential([ + tf.keras.layers.Dense(8, input_shape=(4,))]) + model.compile(tf.keras.optimizers.RMSprop(0.001), loss='mse') + model.fit(np.zeros((10, 4)), + np.ones((10, 8))) + ``` + + Args: + input_shape: Shape tuple (not including the batch axis), or + `TensorShape` instance (not including the batch axis). + batch_size: Optional input batch size (integer or `None`). + dtype: Optional datatype of the input. When not provided, the Keras + default `float` type will be used. + input_tensor: Optional tensor to use as layer input. If set, the layer + will use the `tf.TypeSpec` of this tensor rather + than creating a new placeholder tensor. + sparse: Boolean, whether the placeholder created is meant to be sparse. + Defaults to `False`. + ragged: Boolean, whether the placeholder created is meant to be ragged. + In this case, values of `None` in the `shape` argument represent + ragged dimensions. For more information about `tf.RaggedTensor`, see + [this guide](https://www.tensorflow.org/guide/ragged_tensor). + Defaults to `False`. + type_spec: A `tf.TypeSpec` object to create Input from. This + `tf.TypeSpec` represents the entire batch. When provided, all other + args except name must be `None`. + name: Optional name of the layer (string). + """ + + @traceback_utils.filter_traceback + def __init__( + self, + input_shape=None, + batch_size=None, + dtype=None, + input_tensor=None, + sparse=None, + name=None, + ragged=None, + type_spec=None, + **kwargs, + ): + self._init_input_shape = input_shape + self._init_batch_size = batch_size + self._init_dtype = dtype + self._init_sparse = sparse + self._init_ragged = ragged + self._init_type_spec = type_spec + + strategy = tf.distribute.get_strategy() + if ( + strategy + and batch_size is not None + and distributed_training_utils.global_batch_size_supported(strategy) + ): + if batch_size % strategy.num_replicas_in_sync != 0: + raise ValueError( + "The `batch_size` argument ({}) must be divisible by " + "the number of replicas ({})".format( + batch_size, strategy.num_replicas_in_sync + ) + ) + batch_size = batch_size // strategy.num_replicas_in_sync + + if "batch_input_shape" in kwargs: + batch_input_shape = kwargs.pop("batch_input_shape") + if input_shape and batch_input_shape: + raise ValueError( + "Only provide the input_shape OR " + "batch_input_shape argument to " + "InputLayer, not both at the same time." + ) + # Set the input shape and batch size from the batch_input_shape. + # Note that batch_input_shape can be None (unknown rank) or [] + # (scalar), in which case the batch size must be None. + if batch_input_shape: + batch_size = batch_input_shape[0] + input_shape = batch_input_shape[1:] + if kwargs: + raise ValueError( + f"Unrecognized keyword arguments: {list(kwargs.keys())}" + ) + + if sparse and ragged: + raise ValueError( + "Cannot set both sparse and ragged to True in a Keras input." + ) + + if not name: + prefix = "input" + name = prefix + "_" + str(backend.get_uid(prefix)) + + if not dtype: + if input_tensor is None: + dtype = backend.floatx() + else: + dtype = backend.dtype(input_tensor) + elif input_tensor is not None and input_tensor.dtype != dtype: + raise ValueError( + "`input_tensor.dtype` differs from `dtype`. 
Received: " + f"input_tensor.dtype={input_tensor.dtype} " + f"but expected dtype={dtype}" + ) + super().__init__(dtype=dtype, name=name) + self.built = True + self.sparse = True if sparse else False + self.ragged = True if ragged else False + self.batch_size = batch_size + self.supports_masking = True + + if isinstance(input_shape, tf.TensorShape): + input_shape = tuple(input_shape.as_list()) + elif isinstance(input_shape, int): + input_shape = (input_shape,) + + if type_spec is not None: + args_that_must_be_none = [ + ("(input_)shape", self._init_input_shape), + ("batch_size", self._init_batch_size), + ("dtype", self._init_dtype), + ("input_tensor", input_tensor), + ("sparse", self._init_sparse), + ("ragged", self._init_ragged), + ] + for arg_name, arg in args_that_must_be_none: + _assert_other_arg_none(arg_name, arg) + if not tf.compat.v1.executing_eagerly_outside_functions(): + raise ValueError( + "Creating Keras inputs from a type_spec is only " + "supported when eager execution is enabled." + ) + # Needed for type_spec deserialization since TypeSpec objects + # are not Keras-native (not automatically deserialized). + if isinstance(type_spec, dict): + type_spec = serialization_lib.deserialize_keras_object( + type_spec + ) + input_tensor = keras_tensor.keras_tensor_from_type_spec(type_spec) + if isinstance(input_tensor, keras_tensor.SparseKerasTensor): + self.sparse = True + if isinstance(input_tensor, keras_tensor.RaggedKerasTensor): + self.ragged = True + self.is_placeholder = True + try: + self._batch_input_shape = tuple(input_tensor.shape.as_list()) + except ValueError: + # If the shape cannot be represented as a tuple (e.g. unknown + # rank) + self._batch_input_shape = None + elif input_tensor is None: + if input_shape is not None: + batch_input_shape = (batch_size,) + tuple(input_shape) + else: + batch_input_shape = None + graph = backend.get_graph() + with graph.as_default(): + input_tensor = backend.placeholder( + shape=batch_input_shape, + dtype=dtype, + name=self.name, + sparse=sparse, + ragged=ragged, + ) + + self.is_placeholder = True + self._batch_input_shape = batch_input_shape + else: + if tf.compat.v1.executing_eagerly_outside_functions(): + if not isinstance(input_tensor, keras_tensor.KerasTensor): + input_tensor = keras_tensor.keras_tensor_from_tensor( + input_tensor + ) + else: + if not tf_utils.is_symbolic_tensor(input_tensor): + raise ValueError( + "You should not pass an EagerTensor to `Input`. " + "For example, instead of creating an " + "`InputLayer`, you should instantiate your model " + "and directly call it on your input." + ) + self.is_placeholder = False + try: + self._batch_input_shape = tuple(input_tensor.shape.as_list()) + except ValueError: + # If the shape cannot be represented as a tuple (e.g. unknown + # rank) + self._batch_input_shape = None + # Create an input node. 
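+ # `Node` wires this layer into the functional graph and stamps + # `_keras_history` on the tensor, which is how downstream layers + # trace their inputs back to this `InputLayer`.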
+ input_tensor._keras_mask = None + node_module.Node(layer=self, outputs=input_tensor) + + # Store type spec + if isinstance(input_tensor, keras_tensor.KerasTensor) or ( + tf_utils.is_extension_type(input_tensor) + ): + self._type_spec = input_tensor._type_spec + else: + self._type_spec = tf.TensorSpec( + shape=input_tensor.shape, + dtype=input_tensor.dtype, + name=self.name, + ) + + def get_config(self): + if self._init_type_spec is not None: + config = {"name": self.name, "type_spec": self._init_type_spec} + else: + config = { + "batch_input_shape": self._batch_input_shape, + "dtype": self.dtype, + "sparse": self.sparse, + "ragged": self.ragged, + "name": self.name, + } + return config + + @property + def _trackable_saved_model_saver(self): + return layer_serialization.InputLayerSavedModelSaver(self) + + +@keras_export("keras.Input", "keras.layers.Input") @traceback_utils.filter_traceback -def Input( # pylint: disable=invalid-name +def Input( shape=None, batch_size=None, name=None, @@ -266,131 +303,161 @@ def Input( # pylint: disable=invalid-name tensor=None, ragged=None, type_spec=None, - **kwargs): - """`Input()` is used to instantiate a Keras tensor. - - A Keras tensor is a symbolic tensor-like object, - which we augment with certain attributes that allow us to build a Keras model - just by knowing the inputs and outputs of the model. - - For instance, if `a`, `b` and `c` are Keras tensors, - it becomes possible to do: - `model = Model(input=[a, b], output=c)` - - Args: - shape: A shape tuple (integers), not including the batch size. - For instance, `shape=(32,)` indicates that the expected input - will be batches of 32-dimensional vectors. Elements of this tuple - can be None; 'None' elements represent dimensions where the shape is - not known. - batch_size: optional static batch size (integer). - name: An optional name string for the layer. - Should be unique in a model (do not reuse the same name twice). - It will be autogenerated if it isn't provided. - dtype: The data type expected by the input, as a string - (`float32`, `float64`, `int32`...) - sparse: A boolean specifying whether the placeholder to be created is - sparse. Only one of 'ragged' and 'sparse' can be True. Note that, - if `sparse` is False, sparse tensors can still be passed into the - input - they will be densified with a default value of 0. - tensor: Optional existing tensor to wrap into the `Input` layer. - If set, the layer will use the `tf.TypeSpec` of this tensor rather - than creating a new placeholder tensor. - ragged: A boolean specifying whether the placeholder to be created is - ragged. Only one of 'ragged' and 'sparse' can be True. In this case, - values of 'None' in the 'shape' argument represent ragged dimensions. - For more information about RaggedTensors, see - [this guide](https://www.tensorflow.org/guide/ragged_tensors). - type_spec: A `tf.TypeSpec` object to create the input placeholder from. - When provided, all other args except name must be None. - **kwargs: deprecated arguments support. Supports `batch_shape` and - `batch_input_shape`. - - Returns: - A `tensor`. - - Example: - - ```python - # this is a logistic regression in Keras - x = Input(shape=(32,)) - y = Dense(16, activation='softmax')(x) - model = Model(x, y) - ``` - - Note that even if eager execution is enabled, - `Input` produces a symbolic tensor-like object (i.e. a placeholder). 
- This symbolic tensor-like object can be used with lower-level - TensorFlow ops that take tensors as inputs, as such: - - ```python - x = Input(shape=(32,)) - y = tf.square(x) # This op will be treated like a layer - model = Model(x, y) - ``` - - (This behavior does not work for higher-order TensorFlow APIs such as - control flow and being directly watched by a `tf.GradientTape`). - - However, the resulting model will not track any variables that were - used as inputs to TensorFlow ops. All variable usages must happen within - Keras layers to make sure they will be tracked by the model's weights. - - The Keras Input can also create a placeholder from an arbitrary `tf.TypeSpec`, - e.g: - - ```python - x = Input(type_spec=tf.RaggedTensorSpec(shape=[None, None], - dtype=tf.float32, ragged_rank=1)) - y = x.values - model = Model(x, y) - ``` - When passing an arbitrary `tf.TypeSpec`, it must represent the signature of an - entire batch instead of just one example. - - Raises: - ValueError: If both `sparse` and `ragged` are provided. - ValueError: If both `shape` and (`batch_input_shape` or `batch_shape`) are - provided. - ValueError: If `shape`, `tensor` and `type_spec` are None. - ValueError: If arguments besides `type_spec` are non-None while `type_spec` - is passed. - ValueError: if any unrecognized parameters are provided. - """ - if sparse and ragged: - raise ValueError( - 'Cannot set both `sparse` and `ragged` to `True` in a Keras `Input`.') - - input_layer_config = {'name': name, 'dtype': dtype, 'sparse': sparse, - 'ragged': ragged, 'input_tensor': tensor, - 'type_spec': type_spec} - - batch_input_shape = kwargs.pop('batch_input_shape', - kwargs.pop('batch_shape', None)) - if shape is not None and batch_input_shape is not None: - raise ValueError('Only provide the `shape` OR `batch_input_shape` argument ' - 'to Input, not both at the same time.') - if (batch_input_shape is None and shape is None and tensor is None - and type_spec is None): - raise ValueError('Please provide to Input a `shape` ' - 'or a `tensor` or a `type_spec` argument. Note that ' - '`shape` does not include the batch ' - 'dimension.') - if kwargs: - raise ValueError(f'Unrecognized keyword arguments: {list(kwargs.keys())}') - - if batch_input_shape: - shape = batch_input_shape[1:] - input_layer_config.update({'batch_input_shape': batch_input_shape}) - else: - input_layer_config.update( - {'batch_size': batch_size, 'input_shape': shape}) - input_layer = InputLayer(**input_layer_config) - - # Return tensor including `_keras_history`. - # Note that in this case train_output and test_output are the same pointer. - outputs = input_layer._inbound_nodes[0].outputs - if isinstance(outputs, list) and len(outputs) == 1: - return outputs[0] - else: - return outputs + **kwargs, +): + """`Input()` is used to instantiate a Keras tensor. + + A Keras tensor is a symbolic tensor-like object, which we augment with + certain attributes that allow us to build a Keras model just by knowing the + inputs and outputs of the model. + + For instance, if `a`, `b` and `c` are Keras tensors, + it becomes possible to do: + `model = Model(input=[a, b], output=c)` + + Args: + shape: A shape tuple (integers), not including the batch size. + For instance, `shape=(32,)` indicates that the expected input + will be batches of 32-dimensional vectors. Elements of this tuple + can be None; 'None' elements represent dimensions where the shape is + not known. + batch_size: optional static batch size (integer). + name: An optional name string for the layer. 
+ Should be unique in a model (do not reuse the same name twice). + It will be autogenerated if it isn't provided. + dtype: The data type expected by the input, as a string + (`float32`, `float64`, `int32`...) + sparse: A boolean specifying whether the placeholder to be created is + sparse. Only one of 'ragged' and 'sparse' can be True. Note that, + if `sparse` is False, sparse tensors can still be passed into the + input - they will be densified with a default value of 0. + tensor: Optional existing tensor to wrap into the `Input` layer. + If set, the layer will use the `tf.TypeSpec` of this tensor rather + than creating a new placeholder tensor. + ragged: A boolean specifying whether the placeholder to be created is + ragged. Only one of 'ragged' and 'sparse' can be True. In this case, + values of 'None' in the 'shape' argument represent ragged + dimensions. For more information about RaggedTensors, see + [this guide](https://www.tensorflow.org/guide/ragged_tensor). + type_spec: A `tf.TypeSpec` object to create the input placeholder from. + When provided, all other args except name must be None. + **kwargs: deprecated arguments support. Supports `batch_shape` and + `batch_input_shape`. + + Returns: + A `tensor`. + + Example: + + ```python + # this is a logistic regression in Keras + x = Input(shape=(32,)) + y = Dense(16, activation='softmax')(x) + model = Model(x, y) + ``` + + Note that even if eager execution is enabled, + `Input` produces a symbolic tensor-like object (i.e. a placeholder). + This symbolic tensor-like object can be used with lower-level + TensorFlow ops that take tensors as inputs, as such: + + ```python + x = Input(shape=(32,)) + y = tf.square(x) # This op will be treated like a layer + model = Model(x, y) + ``` + + (This behavior does not work for higher-order TensorFlow APIs such as + control flow and being directly watched by a `tf.GradientTape`). + + However, the resulting model will not track any variables that were + used as inputs to TensorFlow ops. All variable usages must happen within + Keras layers to make sure they will be tracked by the model's weights. + + The Keras Input can also create a placeholder from an arbitrary + `tf.TypeSpec`, e.g: + + ```python + x = Input(type_spec=tf.RaggedTensorSpec(shape=[None, None], + dtype=tf.float32, ragged_rank=1)) + y = x.values + model = Model(x, y) + ``` + When passing an arbitrary `tf.TypeSpec`, it must represent the signature of + an entire batch instead of just one example. + + Raises: + ValueError: If both `sparse` and `ragged` are provided. + ValueError: If both `shape` and (`batch_input_shape` or `batch_shape`) are + provided. + ValueError: If `shape`, `tensor` and `type_spec` are None. + ValueError: If arguments besides `type_spec` are non-None while + `type_spec` is passed. + ValueError: if any unrecognized parameters are provided. + """ + if sparse and ragged: + raise ValueError( + "Cannot set both `sparse` and `ragged` to `True` in a " + "Keras `Input`." 
+ ) + + has_spec_name = ( + name is None and type_spec is not None and hasattr(type_spec, "name") + ) + + if has_spec_name: + name = type_spec.name + + input_layer_config = { + "name": name, + "dtype": dtype, + "sparse": sparse, + "ragged": ragged, + "input_tensor": tensor, + "type_spec": type_spec, + } + + batch_input_shape = kwargs.pop( + "batch_input_shape", kwargs.pop("batch_shape", None) + ) + if shape is not None and batch_input_shape is not None: + raise ValueError( + "Only provide the `shape` OR `batch_input_shape` argument " + "to Input, not both at the same time." + ) + if ( + batch_input_shape is None + and shape is None + and tensor is None + and type_spec is None + ): + raise ValueError( + "Please provide to Input a `shape` " + "or a `tensor` or a `type_spec` argument. Note that " + "`shape` does not include the batch " + "dimension." + ) + if kwargs: + raise ValueError( + f"Unrecognized keyword arguments: {list(kwargs.keys())}" + ) + + if batch_input_shape: + shape = batch_input_shape[1:] + input_layer_config.update({"batch_input_shape": batch_input_shape}) + else: + input_layer_config.update( + {"batch_size": batch_size, "input_shape": shape} + ) + input_layer = InputLayer(**input_layer_config) + + # Return tensor including `_keras_history`. + # Note that in this case train_output and test_output are the same pointer. + outputs = input_layer._inbound_nodes[0].outputs + if isinstance(outputs, list) and len(outputs) == 1: + output = outputs[0] + else: + output = outputs + if has_spec_name and hasattr(output, "_name"): + output._name = input_layer.name + return output diff --git a/keras/engine/input_layer_test.py b/keras/engine/input_layer_test.py index 142119fb3ee1..636d6aa4faee 100644 --- a/keras/engine/input_layer_test.py +++ b/keras/engine/input_layer_test.py @@ -11,359 +11,456 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#,============================================================================ +# ,============================================================================ """Tests for InputLayer construction.""" + import tensorflow.compat.v2 as tf -from tensorflow.python.framework import type_spec + +from keras import Sequential from keras import backend -from keras.testing_infra import test_combinations +from keras import models from keras.engine import functional from keras.engine import input_layer as input_layer_lib +from keras.layers import Dense from keras.layers import core -from keras.saving import model_config - - -class TwoTensors(tf.__internal__.CompositeTensor): - """A simple value type to test TypeSpec. - - Contains two tensors (x, y) and a string (color). The color value is a - stand-in for any extra type metadata we might need to store. +from keras.saving.legacy import model_config +from keras.saving.serialization_lib import SafeModeScope +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils - This value type contains no single dtype. 
- """ +# isort: off +from tensorflow.python.framework import type_spec +from tensorflow.python.framework import type_spec_registry - def __init__(self, x, y, color='red', assign_variant_dtype=False): - assert isinstance(color, str) - self.x = tf.convert_to_tensor(x) - self.y = tf.convert_to_tensor(y) - self.color = color - self.shape = tf.TensorShape(None) - self._shape = tf.TensorShape(None) - if assign_variant_dtype: - self.dtype = tf.variant - self._assign_variant_dtype = assign_variant_dtype - def _type_spec(self): - return TwoTensorsSpecNoOneDtype( - self.x.shape, self.x.dtype, self.y.shape, - self.y.dtype, color=self.color, - assign_variant_dtype=self._assign_variant_dtype) +class TwoTensors(tf.__internal__.CompositeTensor): + """A simple value type to test TypeSpec. + + Contains two tensors (x, y) and a string (color). The color value is a + stand-in for any extra type metadata we might need to store. + + This value type contains no single dtype. + """ + + def __init__(self, x, y, color="red", assign_variant_dtype=False): + assert isinstance(color, str) + self.x = tf.convert_to_tensor(x) + self.y = tf.convert_to_tensor(y) + self.color = color + self.shape = tf.TensorShape(None) + self._shape = tf.TensorShape(None) + if assign_variant_dtype: + self.dtype = tf.variant + self._assign_variant_dtype = assign_variant_dtype + + def _type_spec(self): + return TwoTensorsSpecNoOneDtype( + self.x.shape, + self.x.dtype, + self.y.shape, + self.y.dtype, + color=self.color, + assign_variant_dtype=self._assign_variant_dtype, + ) def as_shape(shape): - """Converts the given object to a TensorShape.""" - if isinstance(shape, tf.TensorShape): - return shape - else: - return tf.TensorShape(shape) + """Converts the given object to a TensorShape.""" + if isinstance(shape, tf.TensorShape): + return shape + else: + return tf.TensorShape(shape) -@type_spec.register('tf.TwoTensorsSpec') +@type_spec_registry.register("tf.TwoTensorsSpec") class TwoTensorsSpecNoOneDtype(tf.TypeSpec): - """A TypeSpec for the TwoTensors value type.""" - - def __init__( - self, x_shape, x_dtype, y_shape, y_dtype, color='red', - assign_variant_dtype=False): - self.x_shape = as_shape(x_shape) - self.x_dtype = tf.as_dtype(x_dtype) - self.y_shape = as_shape(y_shape) - self.y_dtype = tf.as_dtype(y_dtype) - self.color = color - self.shape = tf.TensorShape(None) - self._shape = tf.TensorShape(None) - if assign_variant_dtype: - self.dtype = tf.variant - self._assign_variant_dtype = assign_variant_dtype + """A TypeSpec for the TwoTensors value type.""" + + def __init__( + self, + x_shape, + x_dtype, + y_shape, + y_dtype, + color="red", + assign_variant_dtype=False, + ): + self.x_shape = as_shape(x_shape) + self.x_dtype = tf.as_dtype(x_dtype) + self.y_shape = as_shape(y_shape) + self.y_dtype = tf.as_dtype(y_dtype) + self.color = color + self.shape = tf.TensorShape(None) + self._shape = tf.TensorShape(None) + if assign_variant_dtype: + self.dtype = tf.variant + self._assign_variant_dtype = assign_variant_dtype + + value_type = property(lambda self: TwoTensors) + + @property + def _component_specs(self): + return ( + tf.TensorSpec(self.x_shape, self.x_dtype), + tf.TensorSpec(self.y_shape, self.y_dtype), + ) + + def _to_components(self, value): + return (value.x, value.y) + + def _from_components(self, components): + x, y = components + return TwoTensors(x, y, self.color) + + def _serialize(self): + return ( + self.x_shape, + self.x_dtype, + self.y_shape, + self.y_dtype, + self.color, + ) + + @classmethod + def from_value(cls, value): + return 
cls( + value.x.shape, + value.x.dtype, + value.y.shape, + value.y.dtype, + value.color, + ) - value_type = property(lambda self: TwoTensors) - @property - def _component_specs(self): - return (tf.TensorSpec(self.x_shape, self.x_dtype), - tf.TensorSpec(self.y_shape, self.y_dtype)) +type_spec.register_type_spec_from_value_converter( + TwoTensors, TwoTensorsSpecNoOneDtype.from_value +) - def _to_components(self, value): - return (value.x, value.y) - def _from_components(self, components): - x, y = components - return TwoTensors(x, y, self.color) +class InputLayerTest(test_combinations.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicOutputShapeNoBatchSize(self): + # Create a Keras Input + x = input_layer_lib.Input(shape=(32,), name="input_a") + self.assertAllEqual(x.shape.as_list(), [None, 32]) - def _serialize(self): - return (self.x_shape, self.x_dtype, self.y_shape, self.y_dtype, self.color) + # Verify you can construct and use a model w/ this input + model = functional.Functional(x, x * 2.0) + self.assertAllEqual(model(tf.ones((3, 32))), tf.ones((3, 32)) * 2.0) - @classmethod - def from_value(cls, value): - return cls(value.x.shape, value.x.dtype, value.y.shape, value.y.dtype, - value.color) + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicOutputShapeWithBatchSize(self): + # Create a Keras Input + x = input_layer_lib.Input(batch_size=6, shape=(32,), name="input_b") + self.assertAllEqual(x.shape.as_list(), [6, 32]) + # Verify you can construct and use a model w/ this input + model = functional.Functional(x, x * 2.0) + self.assertAllEqual(model(tf.ones(x.shape)), tf.ones(x.shape) * 2.0) -type_spec.register_type_spec_from_value_converter( - TwoTensors, TwoTensorsSpecNoOneDtype.from_value) + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testBasicOutputShapeNoBatchSizeInTFFunction(self): + model = None + @tf.function + def run_model(inp): + nonlocal model + if not model: + # Create a Keras Input + x = input_layer_lib.Input(shape=(8,), name="input_a") + self.assertAllEqual(x.shape.as_list(), [None, 8]) -class InputLayerTest(test_combinations.TestCase): + # Verify you can construct and use a model w/ this input + model = functional.Functional(x, x * 2.0) + return model(inp) - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testBasicOutputShapeNoBatchSize(self): - # Create a Keras Input - x = input_layer_lib.Input(shape=(32,), name='input_a') - self.assertAllEqual(x.shape.as_list(), [None, 32]) - - # Verify you can construct and use a model w/ this input - model = functional.Functional(x, x * 2.0) - self.assertAllEqual(model(tf.ones((3, 32))), - tf.ones((3, 32)) * 2.0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testBasicOutputShapeWithBatchSize(self): - # Create a Keras Input - x = input_layer_lib.Input(batch_size=6, shape=(32,), name='input_b') - self.assertAllEqual(x.shape.as_list(), [6, 32]) - - # Verify you can construct and use a model w/ this input - model = functional.Functional(x, x * 2.0) - self.assertAllEqual(model(tf.ones(x.shape)), - tf.ones(x.shape) * 2.0) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testBasicOutputShapeNoBatchSizeInTFFunction(self): - model = None - @tf.function - def run_model(inp): - nonlocal model - if not model: - # Create a Keras Input - x = input_layer_lib.Input(shape=(8,), 
name='input_a') - self.assertAllEqual(x.shape.as_list(), [None, 8]) + self.assertAllEqual(run_model(tf.ones((10, 8))), tf.ones((10, 8)) * 2.0) - # Verify you can construct and use a model w/ this input + @test_combinations.run_all_keras_modes + def testBasicOutputShapeWithBatchSizeAndNoneDimensionsPlaceholder(self): + x = input_layer_lib.Input((2, 3), batch_size=4, dtype=tf.float32) model = functional.Functional(x, x * 2.0) - return model(inp) - - self.assertAllEqual(run_model(tf.ones((10, 8))), - tf.ones((10, 8)) * 2.0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInputTensorArg(self): - # Create a Keras Input - x = input_layer_lib.Input(tensor=tf.zeros((7, 32))) - self.assertAllEqual(x.shape.as_list(), [7, 32]) - - # Verify you can construct and use a model w/ this input - model = functional.Functional(x, x * 2.0) - self.assertAllEqual(model(tf.ones(x.shape)), - tf.ones(x.shape) * 2.0) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testInputTensorArgInTFFunction(self): - # We use a mutable model container instead of a model python variable, - # because python 2.7 does not have `nonlocal` - model_container = {} - - @tf.function - def run_model(inp): - if not model_container: + output = model(backend.placeholder(shape=[None, None, 3])) + # batch size and dimension defined in Input should not be applied + self.assertAllEqual(output.shape.as_list(), [None, None, 3]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInputTensorArg(self): # Create a Keras Input - x = input_layer_lib.Input(tensor=tf.zeros((10, 16))) - self.assertAllEqual(x.shape.as_list(), [10, 16]) + x = input_layer_lib.Input(tensor=tf.zeros((7, 32))) + self.assertAllEqual(x.shape.as_list(), [7, 32]) # Verify you can construct and use a model w/ this input - model_container['model'] = functional.Functional(x, x * 3.0) - return model_container['model'](inp) - - self.assertAllEqual(run_model(tf.ones((10, 16))), - tf.ones((10, 16)) * 3.0) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testCompositeInputTensorArg(self): - # Create a Keras Input - rt = tf.RaggedTensor.from_row_splits( - values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) - x = input_layer_lib.Input(tensor=rt) - - # Verify you can construct and use a model w/ this input - model = functional.Functional(x, x * 2) - - # And that the model works - rt = tf.RaggedTensor.from_row_splits( - values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) - self.assertAllEqual(model(rt), rt * 2) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testCompositeInputTensorArgInTFFunction(self): - # We use a mutable model container instead of a model python variable, - # because python 2.7 does not have `nonlocal` - model_container = {} - - @tf.function - def run_model(inp): - if not model_container: + model = functional.Functional(x, x * 2.0) + self.assertAllEqual(model(tf.ones(x.shape)), tf.ones(x.shape) * 2.0) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testInputTensorArgInTFFunction(self): + # We use a mutable model container instead of a model python variable, + # because python 2.7 does not have `nonlocal` + model_container = {} + + @tf.function + def run_model(inp): + if not model_container: + # Create a Keras Input + x = input_layer_lib.Input(tensor=tf.zeros((10, 16))) + self.assertAllEqual(x.shape.as_list(), [10, 16]) + + # 
Verify you can construct and use a model w/ this input + model_container["model"] = functional.Functional(x, x * 3.0) + return model_container["model"](inp) + + self.assertAllEqual( + run_model(tf.ones((10, 16))), tf.ones((10, 16)) * 3.0 + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testCompositeInputTensorArg(self): # Create a Keras Input rt = tf.RaggedTensor.from_row_splits( - values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) + values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8] + ) x = input_layer_lib.Input(tensor=rt) # Verify you can construct and use a model w/ this input - model_container['model'] = functional.Functional(x, x * 3) - return model_container['model'](inp) - - # And verify the model works - rt = tf.RaggedTensor.from_row_splits( - values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) - self.assertAllEqual(run_model(rt), rt * 3) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testNoMixingArgsWithTypeSpecArg(self): - with self.assertRaisesRegexp( - ValueError, 'all other args except `name` must be None'): - input_layer_lib.Input( - shape=(4, 7), - type_spec=tf.TensorSpec((2, 7, 32), tf.float32)) - with self.assertRaisesRegexp( - ValueError, 'all other args except `name` must be None'): - input_layer_lib.Input( - batch_size=4, - type_spec=tf.TensorSpec((7, 32), tf.float32)) - with self.assertRaisesRegexp( - ValueError, 'all other args except `name` must be None'): - input_layer_lib.Input( - dtype=tf.int64, - type_spec=tf.TensorSpec((7, 32), tf.float32)) - with self.assertRaisesRegexp( - ValueError, 'all other args except `name` must be None'): - input_layer_lib.Input( - sparse=True, - type_spec=tf.TensorSpec((7, 32), tf.float32)) - with self.assertRaisesRegexp( - ValueError, 'all other args except `name` must be None'): - input_layer_lib.Input( - ragged=True, - type_spec=tf.TensorSpec((7, 32), tf.float32)) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testTypeSpecArg(self): - # Create a Keras Input - x = input_layer_lib.Input( - type_spec=tf.TensorSpec((7, 32), tf.float32)) - self.assertAllEqual(x.shape.as_list(), [7, 32]) - - # Verify you can construct and use a model w/ this input - model = functional.Functional(x, x * 2.0) - self.assertAllEqual(model(tf.ones(x.shape)), - tf.ones(x.shape) * 2.0) - - # Test serialization / deserialization - model = functional.Functional.from_config(model.get_config()) - self.assertAllEqual(model(tf.ones(x.shape)), - tf.ones(x.shape) * 2.0) - - model = model_config.model_from_json(model.to_json()) - self.assertAllEqual(model(tf.ones(x.shape)), - tf.ones(x.shape) * 2.0) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testTypeSpecArgInTFFunction(self): - # We use a mutable model container instead of a model python variable, - # because python 2.7 does not have `nonlocal` - model_container = {} - - @tf.function - def run_model(inp): - if not model_container: + model = functional.Functional(x, x * 2) + + # And that the model works + rt = tf.RaggedTensor.from_row_splits( + values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8] + ) + self.assertAllEqual(model(rt), rt * 2) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testCompositeInputTensorArgInTFFunction(self): + # We use a mutable model container instead of a model python variable, + # because python 2.7 does not have `nonlocal` + model_container = {} + + @tf.function + def 
run_model(inp): + if not model_container: + # Create a Keras Input + rt = tf.RaggedTensor.from_row_splits( + values=[3, 1, 4, 1, 5, 9, 2, 6], + row_splits=[0, 4, 4, 7, 8, 8], + ) + x = input_layer_lib.Input(tensor=rt) + + # Verify you can construct and use a model w/ this input + model_container["model"] = functional.Functional(x, x * 3) + return model_container["model"](inp) + + # And verify the model works + rt = tf.RaggedTensor.from_row_splits( + values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8] + ) + self.assertAllEqual(run_model(rt), rt * 3) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testNoMixingArgsWithTypeSpecArg(self): + with self.assertRaisesRegexp( + ValueError, "all other args except `name` must be None" + ): + input_layer_lib.Input( + shape=(4, 7), type_spec=tf.TensorSpec((2, 7, 32), tf.float32) + ) + with self.assertRaisesRegexp( + ValueError, "all other args except `name` must be None" + ): + input_layer_lib.Input( + batch_size=4, type_spec=tf.TensorSpec((7, 32), tf.float32) + ) + with self.assertRaisesRegexp( + ValueError, "all other args except `name` must be None" + ): + input_layer_lib.Input( + dtype=tf.int64, type_spec=tf.TensorSpec((7, 32), tf.float32) + ) + with self.assertRaisesRegexp( + ValueError, "all other args except `name` must be None" + ): + input_layer_lib.Input( + sparse=True, type_spec=tf.TensorSpec((7, 32), tf.float32) + ) + with self.assertRaisesRegexp( + ValueError, "all other args except `name` must be None" + ): + input_layer_lib.Input( + ragged=True, type_spec=tf.TensorSpec((7, 32), tf.float32) + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testTypeSpecArg(self): # Create a Keras Input - x = input_layer_lib.Input( - type_spec=tf.TensorSpec((10, 16), tf.float32)) - self.assertAllEqual(x.shape.as_list(), [10, 16]) + x = input_layer_lib.Input(type_spec=tf.TensorSpec((7, 32), tf.float32)) + self.assertAllEqual(x.shape.as_list(), [7, 32]) # Verify you can construct and use a model w/ this input - model_container['model'] = functional.Functional(x, x * 3.0) - return model_container['model'](inp) - - self.assertAllEqual(run_model(tf.ones((10, 16))), - tf.ones((10, 16)) * 3.0) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testCompositeTypeSpecArg(self): - # Create a Keras Input - rt = tf.RaggedTensor.from_row_splits( - values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) - x = input_layer_lib.Input(type_spec=rt._type_spec) - - # Verify you can construct and use a model w/ this input - model = functional.Functional(x, x * 2) - - # And that the model works - rt = tf.RaggedTensor.from_row_splits( - values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) - self.assertAllEqual(model(rt), rt * 2) - - # Test serialization / deserialization - model = functional.Functional.from_config(model.get_config()) - self.assertAllEqual(model(rt), rt * 2) - model = model_config.model_from_json(model.to_json()) - self.assertAllEqual(model(rt), rt * 2) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testCompositeTypeSpecArgInTFFunction(self): - # We use a mutable model container instead of a model pysthon variable, - # because python 2.7 does not have `nonlocal` - model_container = {} - - @tf.function - def run_model(inp): - if not model_container: + model = functional.Functional(x, x * 2.0) + self.assertAllEqual(model(tf.ones(x.shape)), tf.ones(x.shape) * 2.0) + + # Test serialization / deserialization 
+ model = functional.Functional.from_config(model.get_config()) + self.assertAllEqual(model(tf.ones(x.shape)), tf.ones(x.shape) * 2.0) + + model = model_config.model_from_json(model.to_json()) + self.assertAllEqual(model(tf.ones(x.shape)), tf.ones(x.shape) * 2.0) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testTypeSpecArgInTFFunction(self): + # We use a mutable model container instead of a model python variable, + # because python 2.7 does not have `nonlocal` + model_container = {} + + @tf.function + def run_model(inp): + if not model_container: + # Create a Keras Input + x = input_layer_lib.Input( + type_spec=tf.TensorSpec((10, 16), tf.float32) + ) + self.assertAllEqual(x.shape.as_list(), [10, 16]) + + # Verify you can construct and use a model w/ this input + model_container["model"] = functional.Functional(x, x * 3.0) + return model_container["model"](inp) + + self.assertAllEqual( + run_model(tf.ones((10, 16))), tf.ones((10, 16)) * 3.0 + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testCompositeTypeSpecArg(self): # Create a Keras Input rt = tf.RaggedTensor.from_row_splits( - values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) + values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8] + ) x = input_layer_lib.Input(type_spec=rt._type_spec) # Verify you can construct and use a model w/ this input - model_container['model'] = functional.Functional(x, x * 3) - return model_container['model'](inp) - - # And verify the model works - rt = tf.RaggedTensor.from_row_splits( - values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) - self.assertAllEqual(run_model(rt), rt * 3) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testCompositeTypeSpecArgWithoutDtype(self): - for assign_variant_dtype in [False, True]: - # Create a Keras Input - spec = TwoTensorsSpecNoOneDtype( - (1, 2, 3), tf.float32, (1, 2, 3), tf.int64, - assign_variant_dtype=assign_variant_dtype) - x = input_layer_lib.Input(type_spec=spec) - - def lambda_fn(tensors): - return (tf.cast(tensors.x, tf.float64) - + tf.cast(tensors.y, tf.float64)) - # Verify you can construct and use a model w/ this input - model = functional.Functional(x, core.Lambda(lambda_fn)(x)) - - # And that the model works - two_tensors = TwoTensors(tf.ones((1, 2, 3)) * 2.0, - tf.ones(1, 2, 3)) - self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors)) - - # Test serialization / deserialization - model = functional.Functional.from_config(model.get_config()) - self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors)) - model = model_config.model_from_json(model.to_json()) - self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors)) - - def test_serialize_with_unknown_rank(self): - inp = backend.placeholder(shape=None, dtype=tf.string) - x = input_layer_lib.InputLayer(input_tensor=inp, dtype=tf.string) - loaded = input_layer_lib.InputLayer.from_config(x.get_config()) - self.assertIsNone(loaded._batch_input_shape) - - -if __name__ == '__main__': - tf.test.main() + model = functional.Functional(x, x * 2) + + # And that the model works + rt = tf.RaggedTensor.from_row_splits( + values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8] + ) + self.assertAllEqual(model(rt), rt * 2) + + # Test serialization / deserialization + model = functional.Functional.from_config(model.get_config()) + self.assertAllEqual(model(rt), rt * 2) + model = model_config.model_from_json(model.to_json()) + self.assertAllEqual(model(rt), rt * 
2) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testCompositeTypeSpecArgInTFFunction(self): + # We use a mutable model container instead of a model python variable, + # because python 2.7 does not have `nonlocal` + model_container = {} + + @tf.function + def run_model(inp): + if not model_container: + # Create a Keras Input + rt = tf.RaggedTensor.from_row_splits( + values=[3, 1, 4, 1, 5, 9, 2, 6], + row_splits=[0, 4, 4, 7, 8, 8], + ) + x = input_layer_lib.Input(type_spec=rt._type_spec) + + # Verify you can construct and use a model w/ this input + model_container["model"] = functional.Functional(x, x * 3) + return model_container["model"](inp) + + # And verify the model works + rt = tf.RaggedTensor.from_row_splits( + values=[3, 21, 4, 1, 53, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8] + ) + self.assertAllEqual(run_model(rt), rt * 3) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testCompositeTypeSpecArgWithoutDtype(self): + for assign_variant_dtype in [False, True]: + # Create a Keras Input + spec = TwoTensorsSpecNoOneDtype( + (1, 2, 3), + tf.float32, + (1, 2, 3), + tf.int64, + assign_variant_dtype=assign_variant_dtype, + ) + x = input_layer_lib.Input(type_spec=spec) + + def lambda_fn(tensors): + return tf.cast(tensors.x, tf.float64) + tf.cast( + tensors.y, tf.float64 + ) + + # Verify you can construct and use a model w/ this input + model = functional.Functional(x, core.Lambda(lambda_fn)(x)) + + # And that the model works + two_tensors = TwoTensors(tf.ones((1, 2, 3)) * 2.0, tf.ones(1, 2, 3)) + self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors)) + + # Test serialization / deserialization + with SafeModeScope(safe_mode=False): + model = functional.Functional.from_config(model.get_config()) + self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors)) + model = model_config.model_from_json(model.to_json()) + self.assertAllEqual(model(two_tensors), lambda_fn(two_tensors)) + + def test_serialize_with_unknown_rank(self): + inp = backend.placeholder(shape=None, dtype=tf.string) + x = input_layer_lib.InputLayer(input_tensor=inp, dtype=tf.string) + loaded = input_layer_lib.InputLayer.from_config(x.get_config()) + self.assertIsNone(loaded._batch_input_shape) + + @test_utils.run_v2_only + def test_typespec_naming_propagation(self): + type_spec = tf.TensorSpec(name="test", shape=(None, None, 2)) + input1 = input_layer_lib.Input(type_spec=type_spec) + self.assertEqual(input1.name, "test") + + @test_utils.run_v2_only + def test_save_input_naming(self): + x = input_layer_lib.Input(shape=(10,), name="features") + y = Dense(1)(x) + model = functional.Functional(x, y) + self.assertEqual(model.layers[0].name, "features") + save_path = self.get_temp_dir() + "/basic_model.keras" + model.save(save_path) + reloaded_model = models.load_model(save_path) + self.assertEqual(reloaded_model.layers[0].name, "features") + + @test_utils.run_v2_only + def test_export_input_naming(self): + model = Sequential( + layers=[ + input_layer_lib.Input(shape=(8,), name="features"), + Dense(1), + ] + ) + x = tf.random.normal((8, 8)) + model(x) + + export_path = self.get_temp_dir() + "test_model" + model.export(export_path) + reloaded_artifact = tf.saved_model.load(export_path) + self.assertEqual( + reloaded_artifact.signatures._signatures["serve"]._arg_keywords[-1], + "features", + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/input_spec.py b/keras/engine/input_spec.py index 354b0b7e0f46..1e18c83cd0df 100644 ---
a/keras/engine/input_spec.py +++ b/keras/engine/input_spec.py @@ -12,269 +12,305 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access -# pylint: disable=g-classes-have-attributes + + """Contains the InputSpec class.""" import tensorflow.compat.v2 as tf + from keras import backend + +# isort: off from tensorflow.python.util.tf_export import keras_export from tensorflow.python.util.tf_export import tf_export -@keras_export('keras.layers.InputSpec', - v1=['keras.layers.InputSpec', - 'keras.__internal__.legacy.layers.InputSpec']) -@tf_export(v1=['layers.InputSpec']) +@keras_export( + "keras.layers.InputSpec", + v1=["keras.layers.InputSpec", "keras.__internal__.legacy.layers.InputSpec"], +) +@tf_export(v1=["layers.InputSpec"]) class InputSpec: - """Specifies the rank, dtype and shape of every input to a layer. - - Layers can expose (if appropriate) an `input_spec` attribute: - an instance of `InputSpec`, or a nested structure of `InputSpec` instances - (one per input tensor). These objects enable the layer to run input - compatibility checks for input structure, input rank, input shape, and - input dtype. - - A None entry in a shape is compatible with any dimension, - a None shape is compatible with any shape. - - Args: - dtype: Expected DataType of the input. - shape: Shape tuple, expected shape of the input - (may include None for unchecked axes). Includes the batch size. - ndim: Integer, expected rank of the input. - max_ndim: Integer, maximum rank of the input. - min_ndim: Integer, minimum rank of the input. - axes: Dictionary mapping integer axes to - a specific dimension value. - allow_last_axis_squeeze: If True, then allow inputs of rank N+1 as long - as the last axis of the input is 1, as well as inputs of rank N-1 - as long as the last axis of the spec is 1. - name: Expected key corresponding to this input when passing data as - a dictionary. - - Example: - - ```python - class MyLayer(Layer): - def __init__(self): - super(MyLayer, self).__init__() - # The layer will accept inputs with shape (?, 28, 28) & (?, 28, 28, 1) - # and raise an appropriate error message otherwise. - self.input_spec = InputSpec( - shape=(None, 28, 28, 1), - allow_last_axis_squeeze=True) - ``` - """ - - def __init__(self, - dtype=None, - shape=None, - ndim=None, - max_ndim=None, - min_ndim=None, - axes=None, - allow_last_axis_squeeze=False, - name=None): - self.dtype = tf.as_dtype(dtype).name if dtype is not None else None - shape = tf.TensorShape(shape) - if shape.rank is None: - shape = None - else: - shape = tuple(shape.as_list()) - if shape is not None: - self.ndim = len(shape) - self.shape = shape - else: - self.ndim = ndim - self.shape = None - self.max_ndim = max_ndim - self.min_ndim = min_ndim - self.name = name - self.allow_last_axis_squeeze = allow_last_axis_squeeze - try: - axes = axes or {} - self.axes = {int(k): axes[k] for k in axes} - except (ValueError, TypeError): - raise TypeError('Argument `axes` must be a dict with integer keys. 
' - f'Received: axes={axes}') - - if self.axes and (self.ndim is not None or self.max_ndim is not None): - max_dim = (self.ndim if self.ndim else self.max_ndim) - 1 - max_axis = max(self.axes) - if max_axis > max_dim: - raise ValueError('Axis {} is greater than the maximum allowed value: {}' - .format(max_axis, max_dim)) - - def __repr__(self): - spec = [('dtype=' + str(self.dtype)) if self.dtype else '', - ('shape=' + str(self.shape)) if self.shape else '', - ('ndim=' + str(self.ndim)) if self.ndim else '', - ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '', - ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '', - ('axes=' + str(self.axes)) if self.axes else ''] - return 'InputSpec(%s)' % ', '.join(x for x in spec if x) - - def get_config(self): - return { - 'dtype': self.dtype, - 'shape': self.shape, - 'ndim': self.ndim, - 'max_ndim': self.max_ndim, - 'min_ndim': self.min_ndim, - 'axes': self.axes} - - @classmethod - def from_config(cls, config): - return cls(**config) + """Specifies the rank, dtype and shape of every input to a layer. + + Layers can expose (if appropriate) an `input_spec` attribute: + an instance of `InputSpec`, or a nested structure of `InputSpec` instances + (one per input tensor). These objects enable the layer to run input + compatibility checks for input structure, input rank, input shape, and + input dtype. + + A None entry in a shape is compatible with any dimension, + a None shape is compatible with any shape. + + Args: + dtype: Expected DataType of the input. + shape: Shape tuple, expected shape of the input + (may include None for unchecked axes). Includes the batch size. + ndim: Integer, expected rank of the input. + max_ndim: Integer, maximum rank of the input. + min_ndim: Integer, minimum rank of the input. + axes: Dictionary mapping integer axes to + a specific dimension value. + allow_last_axis_squeeze: If True, then allow inputs of rank N+1 as long + as the last axis of the input is 1, as well as inputs of rank N-1 + as long as the last axis of the spec is 1. + name: Expected key corresponding to this input when passing data as + a dictionary. + + Example: + + ```python + class MyLayer(Layer): + def __init__(self): + super(MyLayer, self).__init__() + # The layer will accept inputs with + # shape (?, 28, 28) & (?, 28, 28, 1) + # and raise an appropriate error message otherwise. + self.input_spec = InputSpec( + shape=(None, 28, 28, 1), + allow_last_axis_squeeze=True) + ``` + """ + + def __init__( + self, + dtype=None, + shape=None, + ndim=None, + max_ndim=None, + min_ndim=None, + axes=None, + allow_last_axis_squeeze=False, + name=None, + ): + self.dtype = tf.as_dtype(dtype).name if dtype is not None else None + shape = tf.TensorShape(shape) + if shape.rank is None: + shape = None + else: + shape = tuple(shape.as_list()) + if shape is not None: + self.ndim = len(shape) + self.shape = shape + else: + self.ndim = ndim + self.shape = None + self.max_ndim = max_ndim + self.min_ndim = min_ndim + self.name = name + self.allow_last_axis_squeeze = allow_last_axis_squeeze + try: + axes = axes or {} + self.axes = {int(k): axes[k] for k in axes} + except (ValueError, TypeError): + raise TypeError( + "Argument `axes` must be a dict with integer keys. 
" + f"Received: axes={axes}" + ) + + if self.axes and (self.ndim is not None or self.max_ndim is not None): + max_dim = (self.ndim if self.ndim else self.max_ndim) - 1 + max_axis = max(self.axes) + if max_axis > max_dim: + raise ValueError( + "Axis {} is greater than the maximum " + "allowed value: {}".format(max_axis, max_dim) + ) + + def __repr__(self): + spec = [ + ("dtype=" + str(self.dtype)) if self.dtype else "", + ("shape=" + str(self.shape)) if self.shape else "", + ("ndim=" + str(self.ndim)) if self.ndim else "", + ("max_ndim=" + str(self.max_ndim)) if self.max_ndim else "", + ("min_ndim=" + str(self.min_ndim)) if self.min_ndim else "", + ("axes=" + str(self.axes)) if self.axes else "", + ] + return f"InputSpec({', '.join(x for x in spec if x)})" + + def get_config(self): + return { + "dtype": self.dtype, + "shape": self.shape, + "ndim": self.ndim, + "max_ndim": self.max_ndim, + "min_ndim": self.min_ndim, + "axes": self.axes, + } + + @classmethod + def from_config(cls, config): + return cls(**config) def to_tensor_shape(spec): - """Returns a tf.TensorShape object that matches the shape specifications. + """Returns a tf.TensorShape object that matches the shape specifications. - If the InputSpec's shape or ndim is defined, this method will return a fully - or partially-known shape. Otherwise, the returned TensorShape is None. + If the InputSpec's shape or ndim is defined, this method will return a fully + or partially-known shape. Otherwise, the returned TensorShape is None. - Args: - spec: an InputSpec object. + Args: + spec: an InputSpec object. - Returns: - a tf.TensorShape object - """ - if spec.ndim is None and spec.shape is None: - return tf.TensorShape(None) - elif spec.shape is not None: - return tf.TensorShape(spec.shape) - else: - shape = [None] * spec.ndim - for a in spec.axes: - shape[a] = spec.axes[a] # Assume that axes is defined - return tf.TensorShape(shape) + Returns: + a tf.TensorShape object + """ + if spec.ndim is None and spec.shape is None: + return tf.TensorShape(None) + elif spec.shape is not None: + return tf.TensorShape(spec.shape) + else: + shape = [None] * spec.ndim + for a in spec.axes: + shape[a] = spec.axes[a] # Assume that axes is defined + return tf.TensorShape(shape) def assert_input_compatibility(input_spec, inputs, layer_name): - """Checks compatibility between the layer and provided inputs. - - This checks that the tensor(s) `inputs` verify the input assumptions - of a layer (if any). If not, a clear and actional exception gets raised. - - Args: - input_spec: An InputSpec instance, list of InputSpec instances, a nested - structure of InputSpec instances, or None. - inputs: Input tensor, list of input tensors, or a nested structure of - input tensors. - layer_name: String, name of the layer (for error message formatting). - - Raises: - ValueError: in case of mismatch between - the provided inputs and the expectations of the layer. - """ - if not input_spec: - return - - input_spec = tf.nest.flatten(input_spec) - if isinstance(inputs, dict): - # Flatten `inputs` by reference order if input spec names are provided - names = [spec.name for spec in input_spec] - if all(names): - list_inputs = [] - for name in names: - if name not in inputs: - raise ValueError(f'Missing data for input "{name}". ' - 'You passed a data dictionary with keys ' - f'{list(inputs.keys())}. 
' - f'Expected the following keys: {names}') - list_inputs.append(inputs[name]) - inputs = list_inputs - - inputs = tf.nest.flatten(inputs) - for x in inputs: - # Having a shape/dtype is the only commonality of the various tensor-like - # objects that may be passed. The most common kind of invalid type we are - # guarding for is a Layer instance (Functional API), which does not - # have a `shape` attribute. - if not hasattr(x, 'shape'): - raise TypeError(f'Inputs to a layer should be tensors. Got: {x}') - - if len(inputs) != len(input_spec): - raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),' - f' but it received {len(inputs)} input tensors. ' - f'Inputs received: {inputs}') - for input_index, (x, spec) in enumerate(zip(inputs, input_spec)): - if spec is None: - continue - - shape = tf.TensorShape(x.shape) - if shape.rank is None: - return - # Check ndim. - if spec.ndim is not None and not spec.allow_last_axis_squeeze: - ndim = shape.rank - if ndim != spec.ndim: - raise ValueError(f'Input {input_index} of layer "{layer_name}" ' - 'is incompatible with the layer: ' - f'expected ndim={spec.ndim}, found ndim={ndim}. ' - f'Full shape received: {tuple(shape)}') - if spec.max_ndim is not None: - ndim = x.shape.rank - if ndim is not None and ndim > spec.max_ndim: - raise ValueError(f'Input {input_index} of layer "{layer_name}" ' - 'is incompatible with the layer: ' - f'expected max_ndim={spec.max_ndim}, ' - f'found ndim={ndim}') - if spec.min_ndim is not None: - ndim = x.shape.rank - if ndim is not None and ndim < spec.min_ndim: - raise ValueError(f'Input {input_index} of layer "{layer_name}" ' - 'is incompatible with the layer: ' - f'expected min_ndim={spec.min_ndim}, ' - f'found ndim={ndim}. ' - f'Full shape received: {tuple(shape)}') - # Check dtype. - if spec.dtype is not None: - if x.dtype.name != spec.dtype: - raise ValueError(f'Input {input_index} of layer "{layer_name}" ' - 'is incompatible with the layer: ' - f'expected dtype={spec.dtype}, ' - f'found dtype={x.dtype}') - - # Check specific shape axes. - shape_as_list = shape.as_list() - if spec.axes: - for axis, value in spec.axes.items(): - if hasattr(value, 'value'): - value = value.value - if value is not None and shape_as_list[int(axis)] not in {value, None}: - raise ValueError( - f'Input {input_index} of layer "{layer_name}" is ' - f'incompatible with the layer: expected axis {axis} ' - f'of input shape to have value {value}, ' - f'but received input with shape {display_shape(x.shape)}') - # Check shape. - if spec.shape is not None and shape.rank is not None: - spec_shape = spec.shape - if spec.allow_last_axis_squeeze: - if shape_as_list and shape_as_list[-1] == 1: - shape_as_list = shape_as_list[:-1] - if spec_shape and spec_shape[-1] == 1: - spec_shape = spec_shape[:-1] - for spec_dim, dim in zip(spec_shape, shape_as_list): - if spec_dim is not None and dim is not None: - if spec_dim != dim: - raise ValueError(f'Input {input_index} of layer "{layer_name}" is ' - 'incompatible with the layer: ' - f'expected shape={spec.shape}, ' - f'found shape={display_shape(x.shape)}') + """Checks compatibility between the layer and provided inputs. + + This checks that the tensor(s) `inputs` verify the input assumptions + of a layer (if any). If not, a clear and actional exception gets raised. + + Args: + input_spec: An InputSpec instance, list of InputSpec instances, a nested + structure of InputSpec instances, or None. + inputs: Input tensor, list of input tensors, or a nested structure of + input tensors. 
+ layer_name: String, name of the layer (for error message formatting). + + Raises: + ValueError: in case of mismatch between + the provided inputs and the expectations of the layer. + """ + if not input_spec: + return + + input_spec = tf.nest.flatten(input_spec) + if isinstance(inputs, dict): + # Flatten `inputs` by reference order if input spec names are provided + names = [spec.name for spec in input_spec] + if all(names): + list_inputs = [] + for name in names: + if name not in inputs: + raise ValueError( + f'Missing data for input "{name}". ' + "You passed a data dictionary with keys " + f"{list(inputs.keys())}. " + f"Expected the following keys: {names}" + ) + list_inputs.append(inputs[name]) + inputs = list_inputs + + inputs = tf.nest.flatten(inputs) + for x in inputs: + # Having a shape/dtype is the only commonality of the various + # tensor-like objects that may be passed. The most common kind of + # invalid type we are guarding for is a Layer instance (Functional API), + # which does not have a `shape` attribute. + if not hasattr(x, "shape"): + raise TypeError( + f"Inputs to a layer should be tensors. Got '{x}' " + f"(of type {type(x)}) as input for layer '{layer_name}'." + ) + + if len(inputs) != len(input_spec): + raise ValueError( + f'Layer "{layer_name}" expects {len(input_spec)} input(s),' + f" but it received {len(inputs)} input tensors. " + f"Inputs received: {inputs}" + ) + for input_index, (x, spec) in enumerate(zip(inputs, input_spec)): + if spec is None: + continue + + shape = tf.TensorShape(x.shape) + if shape.rank is None: + return + # Check ndim. + if spec.ndim is not None and not spec.allow_last_axis_squeeze: + ndim = shape.rank + if ndim != spec.ndim: + raise ValueError( + f'Input {input_index} of layer "{layer_name}" ' + "is incompatible with the layer: " + f"expected ndim={spec.ndim}, found ndim={ndim}. " + f"Full shape received: {tuple(shape)}" + ) + if spec.max_ndim is not None: + ndim = x.shape.rank + if ndim is not None and ndim > spec.max_ndim: + raise ValueError( + f'Input {input_index} of layer "{layer_name}" ' + "is incompatible with the layer: " + f"expected max_ndim={spec.max_ndim}, " + f"found ndim={ndim}" + ) + if spec.min_ndim is not None: + ndim = x.shape.rank + if ndim is not None and ndim < spec.min_ndim: + raise ValueError( + f'Input {input_index} of layer "{layer_name}" ' + "is incompatible with the layer: " + f"expected min_ndim={spec.min_ndim}, " + f"found ndim={ndim}. " + f"Full shape received: {tuple(shape)}" + ) + # Check dtype. + if spec.dtype is not None: + if x.dtype.name != spec.dtype: + raise ValueError( + f'Input {input_index} of layer "{layer_name}" ' + "is incompatible with the layer: " + f"expected dtype={spec.dtype}, " + f"found dtype={x.dtype}" + ) + + # Check specific shape axes. + shape_as_list = shape.as_list() + if spec.axes: + for axis, value in spec.axes.items(): + if hasattr(value, "value"): + value = value.value + if value is not None and shape_as_list[int(axis)] not in { + value, + None, + }: + raise ValueError( + f'Input {input_index} of layer "{layer_name}" is ' + f"incompatible with the layer: expected axis {axis} " + f"of input shape to have value {value}, " + "but received input with " + f"shape {display_shape(x.shape)}" + ) + # Check shape. 
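(Editor's note: the ndim/dtype/axes checks above are what fire when a layer's `input_spec` rejects an input. A minimal sketch of how they surface to users; the `PairwiseDot` layer and its spec are hypothetical, not part of this diff:)

```python
import tensorflow as tf
from tensorflow.keras import layers


class PairwiseDot(layers.Layer):
    """Hypothetical layer: requires rank-2 inputs with a last axis of 4."""

    def __init__(self):
        super().__init__()
        self.input_spec = layers.InputSpec(ndim=2, axes={-1: 4})

    def call(self, inputs):
        return tf.matmul(inputs, inputs, transpose_b=True)


layer = PairwiseDot()
layer(tf.zeros((2, 4)))  # passes: ndim == 2 and axis -1 has value 4
# layer(tf.zeros((2, 3))) would raise ValueError: "... expected axis -1
# of input shape to have value 4 ..."
```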
+ if spec.shape is not None and shape.rank is not None: + spec_shape = spec.shape + if spec.allow_last_axis_squeeze: + if shape_as_list and shape_as_list[-1] == 1: + shape_as_list = shape_as_list[:-1] + if spec_shape and spec_shape[-1] == 1: + spec_shape = spec_shape[:-1] + for spec_dim, dim in zip(spec_shape, shape_as_list): + if spec_dim is not None and dim is not None: + if spec_dim != dim: + raise ValueError( + f'Input {input_index} of layer "{layer_name}" is ' + "incompatible with the layer: " + f"expected shape={spec.shape}, " + f"found shape={display_shape(x.shape)}" + ) def display_shape(shape): - return str(tuple(shape.as_list())) + return str(tuple(shape.as_list())) def to_tensor_spec(input_spec, default_dtype=None): - """Converts a Keras InputSpec object to a TensorSpec.""" - default_dtype = default_dtype or backend.floatx() - if isinstance(input_spec, InputSpec): - dtype = input_spec.dtype or default_dtype - return tf.TensorSpec(to_tensor_shape(input_spec), dtype) - return tf.TensorSpec(None, default_dtype) + """Converts a Keras InputSpec object to a TensorSpec.""" + default_dtype = default_dtype or backend.floatx() + if isinstance(input_spec, InputSpec): + dtype = input_spec.dtype or default_dtype + return tf.TensorSpec(to_tensor_shape(input_spec), dtype) + return tf.TensorSpec(None, default_dtype) diff --git a/keras/engine/input_spec_test.py b/keras/engine/input_spec_test.py index 2fb54f39bd2a..95f295ff5309 100644 --- a/keras/engine/input_spec_test.py +++ b/keras/engine/input_spec_test.py @@ -24,44 +24,46 @@ class InputSpecTest(tf.test.TestCase): - - def test_axes_initialization(self): - input_spec.InputSpec(shape=[1, None, 2, 3], axes={3: 5, '2': 2}) - with self.assertRaisesRegex(ValueError, 'Axis 4 is greater than'): - input_spec.InputSpec(shape=[1, None, 2, 3], axes={4: 5}) - with self.assertRaisesRegex(TypeError, 'Argument `axes` must be a dict'): - input_spec.InputSpec(shape=[1, None, 2, 3], axes={'string': 5}) + def test_axes_initialization(self): + input_spec.InputSpec(shape=[1, None, 2, 3], axes={3: 5, "2": 2}) + with self.assertRaisesRegex(ValueError, "Axis 4 is greater than"): + input_spec.InputSpec(shape=[1, None, 2, 3], axes={4: 5}) + with self.assertRaisesRegex( + TypeError, "Argument `axes` must be a dict" + ): + input_spec.InputSpec(shape=[1, None, 2, 3], axes={"string": 5}) class InputSpecToTensorShapeTest(tf.test.TestCase): - - def test_defined_shape(self): - spec = input_spec.InputSpec(shape=[1, None, 2, 3]) - self.assertAllEqual( - [1, None, 2, 3], input_spec.to_tensor_shape(spec).as_list()) - - def test_defined_ndims(self): - spec = input_spec.InputSpec(ndim=5) - self.assertAllEqual( - [None] * 5, input_spec.to_tensor_shape(spec).as_list()) - - spec = input_spec.InputSpec(ndim=0) - self.assertAllEqual( - [], input_spec.to_tensor_shape(spec).as_list()) - - spec = input_spec.InputSpec(ndim=3, axes={1: 3, -1: 2}) - self.assertAllEqual( - [None, 3, 2], input_spec.to_tensor_shape(spec).as_list()) - - def test_undefined_shapes(self): - spec = input_spec.InputSpec(max_ndim=5) - with self.assertRaisesRegex(ValueError, 'unknown TensorShape'): - input_spec.to_tensor_shape(spec).as_list() - - spec = input_spec.InputSpec(min_ndim=5, max_ndim=5) - with self.assertRaisesRegex(ValueError, 'unknown TensorShape'): - input_spec.to_tensor_shape(spec).as_list() - - -if __name__ == '__main__': - tf.test.main() + def test_defined_shape(self): + spec = input_spec.InputSpec(shape=[1, None, 2, 3]) + self.assertAllEqual( + [1, None, 2, 3], input_spec.to_tensor_shape(spec).as_list() 
+ ) + + def test_defined_ndims(self): + spec = input_spec.InputSpec(ndim=5) + self.assertAllEqual( + [None] * 5, input_spec.to_tensor_shape(spec).as_list() + ) + + spec = input_spec.InputSpec(ndim=0) + self.assertAllEqual([], input_spec.to_tensor_shape(spec).as_list()) + + spec = input_spec.InputSpec(ndim=3, axes={1: 3, -1: 2}) + self.assertAllEqual( + [None, 3, 2], input_spec.to_tensor_shape(spec).as_list() + ) + + def test_undefined_shapes(self): + spec = input_spec.InputSpec(max_ndim=5) + with self.assertRaisesRegex(ValueError, "unknown TensorShape"): + input_spec.to_tensor_shape(spec).as_list() + + spec = input_spec.InputSpec(min_ndim=5, max_ndim=5) + with self.assertRaisesRegex(ValueError, "unknown TensorShape"): + input_spec.to_tensor_shape(spec).as_list() + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/keras_tensor.py b/keras/engine/keras_tensor.py index 7b225bb92def..cc04cc26c25b 100644 --- a/keras/engine/keras_tensor.py +++ b/keras/engine/keras_tensor.py @@ -14,12 +14,13 @@ # ============================================================================== """Keras Input Tensor used to track functional API Topology.""" -from keras.utils import object_identity import tensorflow.compat.v2 as tf -from tensorflow.python.data.util import structure +from keras.utils import object_identity -# pylint: disable=g-classes-have-attributes +# isort: off +from tensorflow.python.data.util import structure +from tensorflow.python.util.tf_export import keras_export # Tensorflow tensors have a maximum rank of 254 @@ -29,463 +30,516 @@ _MAX_TENSOR_RANK = 254 +@keras_export("keras.__internal__.KerasTensor", v1=[]) class KerasTensor: - """A representation of a Keras in/output during Functional API construction. - - `KerasTensor`s are tensor-like objects that represent the symbolic inputs - and outputs of Keras layers during Functional model construction. They are - comprised of the `tf.TypeSpec` of the (Composite)Tensor that will be - consumed/produced in the corresponding location of the Functional model. - - KerasTensors are intended as a private API, so users should never need to - directly instantiate `KerasTensor`s. - - **Building Functional Models with KerasTensors** - `tf.keras.Input` produces `KerasTensor`s that represent the symbolic inputs - to your model. - - Passing a `KerasTensor` to a `tf.keras.Layer` `__call__` lets the layer know - that you are building a Functional model. The layer __call__ will - infer the output signature and return `KerasTensor`s with `tf.TypeSpec`s - corresponding to the symbolic outputs of that layer call. These output - `KerasTensor`s will have all of the internal KerasHistory metadata attached - to them that Keras needs to construct a Functional Model. - - Currently, layers infer the output signature by: - * creating a scratch `FuncGraph` - * making placeholders in the scratch graph that match the input typespecs - * Calling `layer.call` on these placeholders - * extracting the signatures of the outputs before clearing the scratch graph - - (Note: names assigned to KerasTensors by this process are not guaranteed to - be unique, and are subject to implementation details). - - `tf.nest` methods are used to insure all of the inputs/output data - structures get maintained, with elements swapped between KerasTensors and - placeholders. - - In rare cases (such as when directly manipulating shapes using Keras layers), - the layer may be able to partially infer the value of the output in addition - to just inferring the signature. 
- When this happens, the returned KerasTensor will also contain the inferred - value information. Follow-on layers can use this information. - during their own output signature inference. - E.g. if one layer produces a symbolic `KerasTensor` that the next layer uses - as the shape of its outputs, partially knowing the value helps infer the - output shape. - - **Automatically converting TF APIs to layers**: - If you passing a `KerasTensor` to a TF API that supports dispatching, - Keras will automatically turn that API call into a lambda - layer in the Functional model, and return KerasTensors representing the - symbolic outputs. - - Most TF APIs that take only tensors as input and produce output tensors - will support dispatching. - - Calling a `tf.function` does not support dispatching, so you cannot pass - `KerasTensor`s as inputs to a `tf.function`. - - Higher-order APIs that take methods which produce tensors (e.g. `tf.while`, - `tf.map_fn`, `tf.cond`) also do not currently support dispatching. So, you - cannot directly pass KerasTensors as inputs to these APIs either. If you - want to use these APIs inside of a Functional model, you must put them inside - of a custom layer. - - Args: - type_spec: The `tf.TypeSpec` for the symbolic input created by - `tf.keras.Input`, or symbolically inferred for the output - during a symbolic layer `__call__`. - inferred_value: (Optional) a non-symbolic static value, possibly partially - specified, that could be symbolically inferred for the outputs during - a symbolic layer `__call__`. This will generally only happen when - grabbing and manipulating `tf.int32` shapes directly as tensors. - Statically inferring values in this way and storing them in the - KerasTensor allows follow-on layers to infer output signatures - more effectively. (e.g. when using a symbolic shape tensor to later - construct a tensor with that shape). - name: (optional) string name for this KerasTensor. Names automatically - generated by symbolic layer `__call__`s are not guaranteed to be unique, - and are subject to implementation details. 
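(Editor's note: the dispatching behavior this docstring describes is observable from ordinary Functional-model code. A rough sketch, assuming standalone `tf.keras` on TF 2.x:)

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(4,))      # a symbolic KerasTensor
x = tf.math.square(inputs)               # dispatched: wrapped as an op layer
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)  # tf.math.square appears as a layer
model.summary()
```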
- """ - - def __init__(self, type_spec, inferred_value=None, name=None): - """Constructs a KerasTensor.""" - if not isinstance(type_spec, tf.TypeSpec): - raise ValueError('KerasTensors must be constructed with a `tf.TypeSpec`.') - - self._type_spec = type_spec - self._inferred_value = inferred_value - self._name = name - - if not isinstance(type_spec, structure.NoneTensorSpec): - if not hasattr(type_spec, 'shape'): - raise ValueError( - 'KerasTensor only supports TypeSpecs that have a shape field; got ' - f'{type(type_spec).__qualname__}, which does not have a shape.') - if not isinstance(type_spec.shape, tf.TensorShape): - raise TypeError( - "KerasTensor requires that wrapped TypeSpec's shape is a " - f'TensorShape; got TypeSpec {type(type_spec).__qualname__}, whose ' - 'shape field has unexpected type ' - f'{type(type_spec.dtype).__qualname__}.') - - @property - def type_spec(self): - """Returns the `tf.TypeSpec` symbolically inferred for this Keras output.""" - return self._type_spec - - @property - def shape(self): - """Returns the `TensorShape` symbolically inferred for this Keras output.""" - return self._type_spec.shape - - @classmethod - def from_tensor(cls, tensor): - """Convert a traced (composite)tensor to a representative KerasTensor.""" - if isinstance(tensor, tf.Tensor): - name = getattr(tensor, 'name', None) - type_spec = tf.type_spec_from_value(tensor) - inferred_value = None - if (type_spec.dtype == tf.int32 and type_spec.shape.rank is not None - and type_spec.shape.rank < 2): - # If this tensor might be representing shape information, - # (dtype=int32, rank of 0 or 1, not too large to represent a shape) - # we attempt to capture any value information tensorflow's - # shape handling can extract from the current scratch graph. - # - # Even though keras layers each trace in their own scratch - # graph, this shape value info extraction allows us to capture - # a sizable and useful subset of the C++ shape value inference TF can do - # if all tf ops appear in the same graph when using shape ops. 
- # - # Examples of things this cannot infer concrete dimensions for - # that the full single-graph C++ shape inference sometimes can are: - # * cases where the shape tensor is cast out of int32 before being - # manipulated w/ floating point numbers then converted back - # * cases where int32 tensors w/ rank >= 2 are manipulated before being - # used as a shape tensor - # * cases where int32 tensors too large to represent shapes are - # manipulated to a smaller size before being used as a shape tensor - inferred_value = tf.ones(shape=tensor).shape - if inferred_value.dims: - inferred_value = inferred_value.as_list() - if len(inferred_value) > _MAX_TENSOR_RANK: - inferred_value = None - else: - inferred_value = None - - return KerasTensor(type_spec, inferred_value=inferred_value, name=name) - else: - # Fallback to the generic arbitrary-typespec KerasTensor - name = getattr(tensor, 'name', None) - type_spec = tf.type_spec_from_value(tensor) - return cls(type_spec, name=name) - - @classmethod - def from_type_spec(cls, type_spec, name=None): - return cls(type_spec=type_spec, name=name) - - def _to_placeholder(self): - """Convert this KerasTensor to a placeholder in a graph.""" - # If there is an inferred value for this tensor, inject the inferred value - if self._inferred_value is not None: - # If we suspect this KerasTensor might be representing a shape tensor, - # and we were able to extract value information with TensorFlow's shape - # handling when making the KerasTensor, we construct the placeholder by - # re-injecting the inferred value information into the graph. We - # do this injection through the shape of a placeholder, because that - # allows us to specify partially-unspecified shape values. - # - # See the comment on value extraction inside `from_tensor` for more info. - inferred_value = tf.shape( - tf.compat.v1.placeholder( - shape=self._inferred_value, dtype=tf.int32)) - if self.type_spec.shape.rank == 0: - # `tf.shape` always returns a rank-1, we may need to turn it back to a - # scalar. - inferred_value = inferred_value[0] - return inferred_value - - # Use the generic conversion from typespec to a placeholder. - def component_to_placeholder(component): - return tf.compat.v1.placeholder(component.dtype, component.shape) - - return tf.nest.map_structure( - component_to_placeholder, self.type_spec, expand_composites=True) - - def get_shape(self): - return self.shape - - def __len__(self): - raise TypeError('Keras symbolic inputs/outputs do not ' - 'implement `__len__`. You may be ' - 'trying to pass Keras symbolic inputs/outputs ' - 'to a TF API that does not register dispatching, ' - 'preventing Keras from automatically ' - 'converting the API call to a lambda layer ' - 'in the Functional Model. This error will also get raised ' - 'if you try asserting a symbolic input/output directly.') - - @property - def op(self): - raise TypeError('Keras symbolic inputs/outputs do not ' - 'implement `op`. You may be ' - 'trying to pass Keras symbolic inputs/outputs ' - 'to a TF API that does not register dispatching, ' - 'preventing Keras from automatically ' - 'converting the API call to a lambda layer ' - 'in the Functional Model.') - - def __hash__(self): - raise TypeError(f'Tensors are unhashable (this tensor: {self}). ' - 'Instead, use tensor.ref() as the key.') - - # Note: This enables the KerasTensor's overloaded "right" binary - # operators to run when the left operand is an ndarray, because it - # accords the Tensor class higher priority than an ndarray, or a - # numpy matrix. 
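(Editor's note: the `__array_priority__` setting discussed in this comment block has a concrete effect when an ndarray is the left operand. A minimal sketch:)

```python
import numpy as np
import tensorflow as tf

inputs = tf.keras.Input(shape=(3,))
# NumPy sees __array_priority__ = 100 on the right operand and defers to
# KerasTensor.__radd__ rather than coercing the KerasTensor to an ndarray:
x = np.ones((3,), dtype="float32") + inputs  # still a symbolic KerasTensor
```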
-  # In the future explore changing this to using numpy's __numpy_ufunc__
-  # mechanism, which allows more control over how Tensors interact
-  # with ndarrays.
-  __array_priority__ = 100
-
-  def __array__(self, dtype=None):
-    raise TypeError(
-        f'You are passing {self}, an intermediate Keras symbolic input/output, '
-        'to a TF API that does not allow registering custom dispatchers, such '
-        'as `tf.cond`, `tf.function`, gradient tapes, or `tf.map_fn`. '
-        'Keras Functional model construction only supports '
-        'TF API calls that *do* support dispatching, such as `tf.math.add` or '
-        '`tf.reshape`. '
-        'Other APIs cannot be called directly on symbolic Keras'
-        'inputs/outputs. You can work around '
-        'this limitation by putting the operation in a custom Keras layer '
-        '`call` and calling that layer '
-        'on this symbolic input/output.')
-
-  @property
-  def is_tensor_like(self):
-    return True
-
-  def set_shape(self, shape):
-    """Updates the shape of this KerasTensor. Mimics `tf.Tensor.set_shape()`."""
-    if not isinstance(shape, tf.TensorShape):
-      shape = tf.TensorShape(shape)
-    if not self.shape.is_compatible_with(shape):
-      raise ValueError(
-          f"Keras symbolic input/output's shape {self.shape} is not "
-          f"compatible with supplied shape {shape}.")
-    else:
-      shape = self.shape.merge_with(shape)
-    self._type_spec = type_spec_with_shape(self._type_spec, shape)
-
-  def __str__(self):
-    symbolic_description = ''
-    inferred_value_string = ''
-    name_string = ''
-
-    if hasattr(self, '_keras_history'):
-      layer = self._keras_history.layer
-      symbolic_description = (
-          ', description="created by layer \'%s\'"' % (layer.name,))
-    if self._inferred_value is not None:
-      inferred_value_string = (
-          ', inferred_value=%s' % self._inferred_value)
-    if self.name is not None:
-      name_string = ', name=\'%s\'' % self._name
-    return 'KerasTensor(type_spec=%s%s%s%s)' % (
-        self.type_spec, inferred_value_string,
-        name_string, symbolic_description)
-
-  def __repr__(self):
-    symbolic_description = ''
-    inferred_value_string = ''
-    if isinstance(self.type_spec, tf.TensorSpec):
-      type_spec_string = 'shape=%s dtype=%s' % (self.shape, self.dtype.name)
-    else:
-      type_spec_string = 'type_spec=%s' % self.type_spec
-
-    if hasattr(self, '_keras_history'):
-      layer = self._keras_history.layer
-      symbolic_description = ' (created by layer \'%s\')' % (layer.name,)
-    if self._inferred_value is not None:
-      inferred_value_string = (
-          ' inferred_value=%s' % self._inferred_value)
-    return '<KerasTensor: %s%s%s>' % (
-        type_spec_string, inferred_value_string, symbolic_description)
-
-  @property
-  def dtype(self):
-    """Returns the `dtype` symbolically inferred for this Keras output."""
-    type_spec = self._type_spec
-    if not hasattr(type_spec, 'dtype'):
-      raise AttributeError(
-          f'KerasTensor wraps TypeSpec {type(type_spec).__qualname__}, '
-          'which does not have a dtype.')
-    if not isinstance(type_spec.dtype, tf.DType):
-      raise TypeError(
-          "KerasTensor requires that wrapped TypeSpec's dtype is a DType; got "
-          f'TypeSpec {type(type_spec).__qualname__}, whose dtype field has '
-          f'unexpected type {type(type_spec.dtype).__qualname__}.')
-    return type_spec.dtype
-
-  def ref(self):
-    """Returns a hashable reference object to this KerasTensor.
-
-    The primary use case for this API is to put KerasTensors in a
-    set/dictionary. We can't put tensors in a set/dictionary as
-    `tensor.__hash__()` is not available and tensor equality (`==`) is supposed
-    to produce a tensor representing if the two inputs are equal.
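(Editor's note: since `__hash__` raises, `ref()` is the supported way to key a set or dict by a KerasTensor, as the docstring above says. Sketch:)

```python
import tensorflow as tf

a = tf.keras.Input(shape=(2,))
b = tf.keras.Input(shape=(2,))
registry = {a.ref(): "first", b.ref(): "second"}  # hashable wrappers
assert registry[a.ref()] == "first"
```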
- - See the documentation of `tf.Tensor.ref()` for more info. - """ - return object_identity.Reference(self) - - @property - def node(self): - """Find the corresponding `Node` that produce this keras_tensor. - - During functional model construction, Keras will attach `KerasHistory` to - keras tensor to track the connectivity between calls of layers. Return - None if there isn't any KerasHistory attached to this tensor. - """ - if hasattr(self, '_keras_history'): - layer, node_index, _ = self._keras_history - return layer.inbound_nodes[node_index] - return None - - def __iter__(self): - shape = None - if self.shape.ndims is not None: - shape = [dim.value for dim in self.shape.dims] - - if shape is None: - raise TypeError('Cannot iterate over a Tensor with unknown shape.') - if not shape: - raise TypeError('Cannot iterate over a scalar.') - if shape[0] is None: - raise TypeError( - 'Cannot iterate over a Tensor with unknown first dimension.') - return _KerasTensorIterator(self, shape[0]) - - @property - def name(self): - """Returns the (non-unique, optional) name of this symbolic Keras value.""" - return self._name - - @classmethod - def _overload_all_operators(cls, tensor_class): # pylint: disable=invalid-name - """Register overloads for all operators.""" - for operator in tf.Tensor.OVERLOADABLE_OPERATORS: - cls._overload_operator(tensor_class, operator) - - # We include `experimental_ref` for versions of TensorFlow that - # still include the deprecated method in Tensors. - if hasattr(tensor_class, 'experimental_ref'): - cls._overload_operator(tensor_class, 'experimental_ref') - - @classmethod - def _overload_operator(cls, tensor_class, operator): # pylint: disable=invalid-name - """Overload an operator with the same implementation as a base Tensor class. - - We pull the operator out of the class dynamically to avoid ordering issues. + """A representation of a Keras in/output during Functional API construction. + + `KerasTensor`s are tensor-like objects that represent the symbolic inputs + and outputs of Keras layers during Functional model construction. They are + comprised of the `tf.TypeSpec` of the (Composite)Tensor that will be + consumed/produced in the corresponding location of the Functional model. + + KerasTensors are intended as a private API, so users should never need to + directly instantiate `KerasTensor`s. + + **Building Functional Models with KerasTensors** + `tf.keras.Input` produces `KerasTensor`s that represent the symbolic inputs + to your model. + + Passing a `KerasTensor` to a `tf.keras.Layer` `__call__` lets the layer know + that you are building a Functional model. The layer __call__ will + infer the output signature and return `KerasTensor`s with `tf.TypeSpec`s + corresponding to the symbolic outputs of that layer call. These output + `KerasTensor`s will have all of the internal KerasHistory metadata attached + to them that Keras needs to construct a Functional Model. + + Currently, layers infer the output signature by: + * creating a scratch `FuncGraph` + * making placeholders in the scratch graph that match the input typespecs + * Calling `layer.call` on these placeholders + * extracting the signatures of the outputs before clearing the scratch + graph + + (Note: names assigned to KerasTensors by this process are not guaranteed to + be unique, and are subject to implementation details). + + `tf.nest` methods are used to insure all of the inputs/output data + structures get maintained, with elements swapped between KerasTensors and + placeholders. 
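(Editor's note: the `tf.nest` round-tripping mentioned at the end of this paragraph looks roughly like this from the outside; a sketch, not code from this diff:)

```python
import tensorflow as tf

inputs = {"a": tf.keras.Input(shape=(2,)), "b": tf.keras.Input(shape=(3,))}
flat = tf.nest.flatten(inputs)                    # [KerasTensor, KerasTensor]
rebuilt = tf.nest.pack_sequence_as(inputs, flat)  # dict structure preserved
```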
+
+    In rare cases (such as when directly manipulating shapes using Keras
+    layers), the layer may be able to partially infer the value of the output
+    in addition to just inferring the signature.
+    When this happens, the returned KerasTensor will also contain the inferred
+    value information. Follow-on layers can use this information
+    during their own output signature inference.
+    E.g. if one layer produces a symbolic `KerasTensor` that the next layer
+    uses as the shape of its outputs, partially knowing the value helps infer
+    the output shape.
+
+    **Automatically converting TF APIs to layers**:
+    If you pass a `KerasTensor` to a TF API that supports dispatching,
+    Keras will automatically turn that API call into a lambda
+    layer in the Functional model, and return KerasTensors representing the
+    symbolic outputs.
+
+    Most TF APIs that take only tensors as input and produce output tensors
+    will support dispatching.
+
+    Calling a `tf.function` does not support dispatching, so you cannot pass
+    `KerasTensor`s as inputs to a `tf.function`.
+
+    Higher-order APIs that take methods which produce tensors (e.g. `tf.while`,
+    `tf.map_fn`, `tf.cond`) also do not currently support dispatching. So, you
+    cannot directly pass KerasTensors as inputs to these APIs either. If you
+    want to use these APIs inside of a Functional model, you must put them
+    inside of a custom layer.

     Args:
-      tensor_class: The (Composite)Tensor to get the method from.
-      operator: string. The operator name.
+      type_spec: The `tf.TypeSpec` for the symbolic input created by
+        `tf.keras.Input`, or symbolically inferred for the output
+        during a symbolic layer `__call__`.
+      inferred_value: (Optional) a non-symbolic static value, possibly partially
+        specified, that could be symbolically inferred for the outputs during
+        a symbolic layer `__call__`. This will generally only happen when
+        grabbing and manipulating `tf.int32` shapes directly as tensors.
+        Statically inferring values in this way and storing them in the
+        KerasTensor allows follow-on layers to infer output signatures
+        more effectively. (e.g. when using a symbolic shape tensor to later
+        construct a tensor with that shape).
+      name: (optional) string name for this KerasTensor. Names automatically
+        generated by symbolic layer `__call__`s are not guaranteed to be unique,
+        and are subject to implementation details.
     """
-    tensor_oper = getattr(tensor_class, operator)
-    # Compatibility with Python 2:
-    # Python 2 unbound methods have type checks for the first arg,
-    # so we need to extract the underlying function
-    tensor_oper = getattr(tensor_oper, '__func__', tensor_oper)
-
-    setattr(cls, operator, tensor_oper)
+    def __init__(self, type_spec, inferred_value=None, name=None):
+        """Constructs a KerasTensor."""
+        if not isinstance(type_spec, tf.TypeSpec):
+            raise ValueError(
+                "KerasTensors must be constructed with a `tf.TypeSpec`."
+            )
+
+        self._type_spec = type_spec
+        self._inferred_value = inferred_value
+        self._name = name
+
+        if not isinstance(type_spec, structure.NoneTensorSpec):
+            if not hasattr(type_spec, "shape"):
+                raise ValueError(
+                    "KerasTensor only supports TypeSpecs that have a shape "
+                    f"field; got {type(type_spec).__qualname__}, "
+                    "which does not have a shape."
+                )
+            if not isinstance(type_spec.shape, tf.TensorShape):
+                raise TypeError(
+                    "KerasTensor requires that wrapped TypeSpec's shape is a "
+                    f"TensorShape; got TypeSpec {type(type_spec).__qualname__}"
+                    ", whose shape field has unexpected type "
+                    f"{type(type_spec.shape).__qualname__}."
+ ) + + @property + def type_spec(self): + """Returns the `tf.TypeSpec` symbolically inferred for Keras output.""" + return self._type_spec + + @property + def shape(self): + """Returns the `TensorShape` symbolically inferred for Keras output.""" + return self._type_spec.shape + + @classmethod + def from_tensor(cls, tensor): + """Convert a traced (composite)tensor to a representative + KerasTensor.""" + if isinstance(tensor, tf.Tensor): + name = getattr(tensor, "name", None) + type_spec = tf.type_spec_from_value(tensor) + inferred_value = None + if ( + type_spec.dtype == tf.int32 + and type_spec.shape.rank is not None + and type_spec.shape.rank < 2 + ): + # If this tensor might be representing shape information, + # (dtype=int32, rank of 0 or 1, not too large to represent a + # shape) we attempt to capture any value information + # tensorflow's shape handling can extract from the current + # scratch graph. + # + # Even though keras layers each trace in their own scratch + # graph, this shape value info extraction allows us to capture a + # sizable and useful subset of the C++ shape value inference TF + # can do if all tf ops appear in the same graph when using shape + # ops. + # + # Examples of things this cannot infer concrete dimensions for + # that the full single-graph C++ shape inference sometimes can + # are: + # * cases where the shape tensor is cast out of int32 before + # being manipulated w/ floating point numbers then converted + # back + # * cases where int32 tensors w/ rank >= 2 are manipulated + # before being used as a shape tensor + # * cases where int32 tensors too large to represent shapes are + # manipulated to a smaller size before being used as a shape + # tensor + inferred_value = tf.ones(shape=tensor).shape + if inferred_value.dims: + inferred_value = inferred_value.as_list() + if len(inferred_value) > _MAX_TENSOR_RANK: + inferred_value = None + else: + inferred_value = None + + return KerasTensor( + type_spec, inferred_value=inferred_value, name=name + ) + else: + # Fallback to the generic arbitrary-typespec KerasTensor + name = getattr(tensor, "name", None) + type_spec = tf.type_spec_from_value(tensor) + return cls(type_spec, name=name) + + @classmethod + def from_type_spec(cls, type_spec, name=None): + return cls(type_spec=type_spec, name=name) + + def _to_placeholder(self): + """Convert this KerasTensor to a placeholder in a graph.""" + # If there is an inferred value for this tensor, inject the inferred + # value + if self._inferred_value is not None: + # If we suspect this KerasTensor might be representing a shape + # tensor, and we were able to extract value information with + # TensorFlow's shape handling when making the KerasTensor, we + # construct the placeholder by re-injecting the inferred value + # information into the graph. We do this injection through the shape + # of a placeholder, because that allows us to specify + # partially-unspecified shape values. + # + # See the comment on value extraction inside `from_tensor` for more + # info. + inferred_value = tf.shape( + tf.compat.v1.placeholder( + shape=self._inferred_value, dtype=tf.int32 + ) + ) + if self.type_spec.shape.rank == 0: + # `tf.shape` always returns a rank-1, we may need to turn it + # back to a scalar. + inferred_value = inferred_value[0] + return inferred_value + + # Use the generic conversion from typespec to a placeholder. 
+        def component_to_placeholder(component):
+            return tf.compat.v1.placeholder(component.dtype, component.shape)
+
+        return tf.nest.map_structure(
+            component_to_placeholder, self.type_spec, expand_composites=True
+        )
+
+    def get_shape(self):
+        return self.shape
+
+    def __len__(self):
+        raise TypeError(
+            "Keras symbolic inputs/outputs do not "
+            "implement `__len__`. You may be "
+            "trying to pass Keras symbolic inputs/outputs "
+            "to a TF API that does not register dispatching, "
+            "preventing Keras from automatically "
+            "converting the API call to a lambda layer "
+            "in the Functional Model. This error will also get raised "
+            "if you try asserting a symbolic input/output directly."
+        )
+
+    @property
+    def op(self):
+        raise TypeError(
+            "Keras symbolic inputs/outputs do not "
+            "implement `op`. You may be "
+            "trying to pass Keras symbolic inputs/outputs "
+            "to a TF API that does not register dispatching, "
+            "preventing Keras from automatically "
+            "converting the API call to a lambda layer "
+            "in the Functional Model."
+        )
+
+    def __hash__(self):
+        raise TypeError(
+            f"Tensors are unhashable (this tensor: {self}). "
+            "Instead, use tensor.ref() as the key."
+        )
+
+    # Note: This enables the KerasTensor's overloaded "right" binary
+    # operators to run when the left operand is an ndarray, because it
+    # accords the Tensor class higher priority than an ndarray, or a
+    # numpy matrix.
+    # In the future explore changing this to using numpy's __numpy_ufunc__
+    # mechanism, which allows more control over how Tensors interact
+    # with ndarrays.
+    __array_priority__ = 100
+
+    def __array__(self, dtype=None):
+        raise TypeError(
+            f"You are passing {self}, an intermediate Keras symbolic "
+            "input/output, to a TF API that does not allow registering custom "
+            "dispatchers, such as `tf.cond`, `tf.function`, gradient tapes, "
+            "or `tf.map_fn`. Keras Functional model construction only supports "
+            "TF API calls that *do* support dispatching, such as `tf.math.add` "
+            "or `tf.reshape`. "
+            "Other APIs cannot be called directly on symbolic Keras "
+            "inputs/outputs. You can work around "
+            "this limitation by putting the operation in a custom Keras layer "
+            "`call` and calling that layer "
+            "on this symbolic input/output."
+        )
+
+    @property
+    def is_tensor_like(self):
+        return True
+
+    def set_shape(self, shape):
+        """Updates the shape of this KerasTensor. Mimics
+        `tf.Tensor.set_shape()`."""
+        if not isinstance(shape, tf.TensorShape):
+            shape = tf.TensorShape(shape)
+        if not self.shape.is_compatible_with(shape):
+            raise ValueError(
+                f"Keras symbolic input/output's shape {self.shape} is not "
+                f"compatible with supplied shape {shape}."
+            )
+        else:
+            shape = self.shape.merge_with(shape)
+        self._type_spec = type_spec_with_shape(self._type_spec, shape)
+
+    def __str__(self):
+        symbolic_description = ""
+        inferred_value_string = ""
+        name_string = ""
+
+        if hasattr(self, "_keras_history"):
+            layer = self._keras_history.layer
+            symbolic_description = ", description=\"created by layer '%s'\"" % (
+                layer.name,
+            )
+        if self._inferred_value is not None:
+            inferred_value_string = f", inferred_value={self._inferred_value}"
+        if self.name is not None:
+            name_string = f", name='{self._name}'"
+        return "KerasTensor(type_spec=%s%s%s%s)" % (
+            self.type_spec,
+            inferred_value_string,
+            name_string,
+            symbolic_description,
+        )
+
+    def __repr__(self):
+        symbolic_description = ""
+        inferred_value_string = ""
+        if isinstance(self.type_spec, tf.TensorSpec):
+            type_spec_string = f"shape={self.shape} dtype={self.dtype.name}"
+        else:
+            type_spec_string = f"type_spec={self.type_spec}"
+
+        if hasattr(self, "_keras_history"):
+            layer = self._keras_history.layer
+            symbolic_description = f" (created by layer '{layer.name}')"
+        if self._inferred_value is not None:
+            inferred_value_string = f" inferred_value={self._inferred_value}"
+        return "<KerasTensor: %s%s%s>" % (
+            type_spec_string,
+            inferred_value_string,
+            symbolic_description,
+        )
+
+    @property
+    def dtype(self):
+        """Returns the `dtype` symbolically inferred for this Keras output."""
+        type_spec = self._type_spec
+        if not hasattr(type_spec, "dtype"):
+            raise AttributeError(
+                f"KerasTensor wraps TypeSpec {type(type_spec).__qualname__}, "
+                "which does not have a dtype."
+            )
+        if not isinstance(type_spec.dtype, tf.DType):
+            raise TypeError(
+                "KerasTensor requires that wrapped TypeSpec's dtype is a "
+                f"DType; got TypeSpec {type(type_spec).__qualname__}, whose "
+                "dtype field has unexpected type "
+                f"{type(type_spec.dtype).__qualname__}."
+            )
+        return type_spec.dtype
+
+    def ref(self):
+        """Returns a hashable reference object to this KerasTensor.
+
+        The primary use case for this API is to put KerasTensors in a
+        set/dictionary. We can't put tensors in a set/dictionary as
+        `tensor.__hash__()` is not available and tensor equality (`==`) is
+        supposed to produce a tensor representing if the two inputs are equal.
+
+        See the documentation of `tf.Tensor.ref()` for more info.
+        """
+        return object_identity.Reference(self)
+
+    @property
+    def node(self):
+        """Finds the corresponding `Node` that produces this keras_tensor.
+
+        During functional model construction, Keras will attach `KerasHistory`
+        to keras tensor to track the connectivity between calls of layers.
+        Return None if there isn't any KerasHistory attached to this tensor.
+        """
+        if hasattr(self, "_keras_history"):
+            layer, node_index, _ = self._keras_history
+            return layer.inbound_nodes[node_index]
+        return None
+
+    def __iter__(self):
+        shape = None
+        if self.shape.ndims is not None:
+            shape = [dim.value for dim in self.shape.dims]
+
+        if shape is None:
+            raise TypeError("Cannot iterate over a Tensor with unknown shape.")
+        if not shape:
+            raise TypeError("Cannot iterate over a scalar.")
+        if shape[0] is None:
+            raise TypeError(
+                "Cannot iterate over a Tensor with unknown first dimension."
+ ) + return _KerasTensorIterator(self, shape[0]) + + @property + def name(self): + """Returns the (non-unique, optional) name of this symbolic Keras + value.""" + return self._name + + @classmethod + def _overload_all_operators(cls, tensor_class): + """Register overloads for all operators.""" + for operator in tf.Tensor.OVERLOADABLE_OPERATORS: + cls._overload_operator(tensor_class, operator) + + # We include `experimental_ref` for versions of TensorFlow that + # still include the deprecated method in Tensors. + if hasattr(tensor_class, "experimental_ref"): + cls._overload_operator(tensor_class, "experimental_ref") + + @classmethod + def _overload_operator(cls, tensor_class, operator): + """Overload operator with the same implementation as the Tensor class. + + We pull the operator out of the class dynamically to avoid ordering + issues. + + Args: + tensor_class: The (Composite)Tensor to get the method from. + operator: string. The operator name. + """ + tensor_oper = getattr(tensor_class, operator) + + # Compatibility with Python 2: + # Python 2 unbound methods have type checks for the first arg, + # so we need to extract the underlying function + tensor_oper = getattr(tensor_oper, "__func__", tensor_oper) + + setattr(cls, operator, tensor_oper) + + +KerasTensor._overload_all_operators(tf.Tensor) + + +@keras_export("keras.__internal__.SparseKerasTensor", v1=[]) +class SparseKerasTensor(KerasTensor): + """A specialized KerasTensor representation for `tf.sparse.SparseTensor`s. + Specifically, it specializes the conversion to a placeholder in order + to maintain dense shape information. + """ -KerasTensor._overload_all_operators(tf.Tensor) # pylint: disable=protected-access + def _to_placeholder(self): + spec = self.type_spec + # nest.map_structure loses dense shape information for sparse tensors. + # So, we special-case sparse placeholder creation. + # This only preserves shape information for top-level sparse tensors; + # not for sparse tensors that are nested inside another composite + # tensor. + return tf.compat.v1.sparse_placeholder( + dtype=spec.dtype, shape=spec.shape + ) -class SparseKerasTensor(KerasTensor): - """A specialized KerasTensor representation for `tf.sparse.SparseTensor`s. - Specifically, it specializes the conversion to a placeholder in order - to maintain dense shape information. - """ +@keras_export("keras.__internal__.RaggedKerasTensor", v1=[]) +class RaggedKerasTensor(KerasTensor): + """A specialized KerasTensor representation for `tf.RaggedTensor`s. - def _to_placeholder(self): - spec = self.type_spec + Specifically, it: - # nest.map_structure loses dense shape information for sparse tensors. - # So, we special-case sparse placeholder creation. - # This only preserves shape information for top-level sparse tensors; - # not for sparse tensors that are nested inside another composite - # tensor. - return tf.compat.v1.sparse_placeholder(dtype=spec.dtype, shape=spec.shape) + 1. Specializes the conversion to a placeholder in order + to maintain shape information for non-ragged dimensions. + 2. Overloads the KerasTensor's operators with the RaggedTensor versions + when they don't match the `tf.Tensor` versions + 3. Exposes some of the instance method/attribute that are unique to + the RaggedTensor API (such as ragged_rank). 
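(Editor's note: the `RaggedKerasTensor` behaviors listed in this docstring are reachable through `tf.keras.Input(..., ragged=True)`. A sketch:)

```python
import tensorflow as tf

x = tf.keras.Input(shape=(None,), ragged=True)  # a RaggedKerasTensor
print(x.ragged_rank)  # 1, via the property exposed above
y = x * 2             # uses the RaggedTensor __mul__ overload registered below
```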
+ """ + def _to_placeholder(self): + ragged_spec = self.type_spec + if ragged_spec.ragged_rank == 0 or ragged_spec.shape.rank is None: + return super()._to_placeholder() + + flat_shape = ragged_spec.shape[ragged_spec.ragged_rank :] + result = tf.compat.v1.placeholder(ragged_spec.dtype, flat_shape) + + known_num_splits = [] + prod = 1 + for axis_size in ragged_spec.shape: + if prod is not None: + if axis_size is None or ( + getattr(axis_size, "value", True) is None + ): + prod = None + else: + prod = prod * axis_size + known_num_splits.append(prod) + + for axis in range(ragged_spec.ragged_rank, 0, -1): + axis_size = ragged_spec.shape[axis] + if axis_size is None or (getattr(axis_size, "value", True) is None): + num_splits = known_num_splits[axis - 1] + if num_splits is not None: + num_splits = num_splits + 1 + splits = tf.compat.v1.placeholder( + ragged_spec.row_splits_dtype, [num_splits] + ) + result = tf.RaggedTensor.from_row_splits( + result, splits, validate=False + ) + else: + rowlen = tf.constant(axis_size, ragged_spec.row_splits_dtype) + result = tf.RaggedTensor.from_uniform_row_length( + result, rowlen, validate=False + ) + return result + + @property + def ragged_rank(self): + return self.type_spec.ragged_rank -class RaggedKerasTensor(KerasTensor): - """A specialized KerasTensor representation for `tf.RaggedTensor`s. - - Specifically, it: - - 1. Specializes the conversion to a placeholder in order - to maintain shape information for non-ragged dimensions. - 2. Overloads the KerasTensor's operators with the RaggedTensor versions - when they don't match the `tf.Tensor` versions - 3. Exposes some of the instance method/attribute that are unique to - the RaggedTensor API (such as ragged_rank). - """ - - def _to_placeholder(self): - ragged_spec = self.type_spec - if ragged_spec.ragged_rank == 0 or ragged_spec.shape.rank is None: - return super()._to_placeholder() - - flat_shape = ragged_spec.shape[ragged_spec.ragged_rank:] - result = tf.compat.v1.placeholder(ragged_spec.dtype, flat_shape) - - known_num_splits = [] - prod = 1 - for axis_size in ragged_spec.shape: - if prod is not None: - if axis_size is None or ( - getattr(axis_size, 'value', True) is None): - prod = None - else: - prod = prod * axis_size - known_num_splits.append(prod) - - for axis in range(ragged_spec.ragged_rank, 0, -1): - axis_size = ragged_spec.shape[axis] - if axis_size is None or (getattr(axis_size, 'value', True) is None): - num_splits = known_num_splits[axis-1] - if num_splits is not None: - num_splits = num_splits + 1 - splits = tf.compat.v1.placeholder( - ragged_spec.row_splits_dtype, [num_splits]) - result = tf.RaggedTensor.from_row_splits( - result, splits, validate=False) - else: - rowlen = tf.constant(axis_size, ragged_spec.row_splits_dtype) - result = tf.RaggedTensor.from_uniform_row_length( - result, rowlen, validate=False) - return result - - @property - def ragged_rank(self): - return self.type_spec.ragged_rank # Overload slicing -RaggedKerasTensor._overload_operator(tf.RaggedTensor, '__getitem__') # pylint: disable=protected-access +RaggedKerasTensor._overload_operator(tf.RaggedTensor, "__getitem__") # Overload math ops -RaggedKerasTensor._overload_operator(tf.RaggedTensor, '__add__') # pylint: disable=protected-access -RaggedKerasTensor._overload_operator(tf.RaggedTensor, '__radd__') # pylint: disable=protected-access -RaggedKerasTensor._overload_operator(tf.RaggedTensor, '__mul__') # pylint: disable=protected-access -RaggedKerasTensor._overload_operator(tf.RaggedTensor, '__rmul__') # pylint: 
disable=protected-access +RaggedKerasTensor._overload_operator(tf.RaggedTensor, "__add__") +RaggedKerasTensor._overload_operator(tf.RaggedTensor, "__radd__") +RaggedKerasTensor._overload_operator(tf.RaggedTensor, "__mul__") +RaggedKerasTensor._overload_operator(tf.RaggedTensor, "__rmul__") # TODO(b/161487382): @@ -495,27 +549,27 @@ def ragged_rank(self): # This is needed to not break Tensorflow probability # while they finish migrating to composite tensors. class UserRegisteredSpec(tf.TypeSpec): - """TypeSpec to represent user-registered symbolic objects.""" + """TypeSpec to represent user-registered symbolic objects.""" - def __init__(self, shape, dtype): - self.shape = shape - self._dtype = dtype - self.dtype = dtype + def __init__(self, shape, dtype): + self.shape = shape + self._dtype = dtype + self.dtype = dtype - def _component_specs(self): - raise NotImplementedError + def _component_specs(self): + raise NotImplementedError - def _from_components(self, components): - raise NotImplementedError + def _from_components(self, components): + raise NotImplementedError - def _serialize(self): - raise NotImplementedError + def _serialize(self): + raise NotImplementedError - def _to_components(self, value): - raise NotImplementedError + def _to_components(self, value): + raise NotImplementedError - def value_type(self): - raise NotImplementedError + def value_type(self): + raise NotImplementedError # TODO(b/161487382): @@ -525,46 +579,49 @@ def value_type(self): # This is needed to not break Tensorflow probability # while they finish migrating to composite tensors. class UserRegisteredTypeKerasTensor(KerasTensor): - """KerasTensor that represents legacy register_symbolic_tensor_type.""" + """KerasTensor that represents legacy register_symbolic_tensor_type.""" - def __init__(self, user_registered_symbolic_object): - x = user_registered_symbolic_object - self._user_registered_symbolic_object = x - type_spec = UserRegisteredSpec(x.shape, x.dtype) - name = getattr(x, 'name', None) + def __init__(self, user_registered_symbolic_object): + x = user_registered_symbolic_object + self._user_registered_symbolic_object = x + type_spec = UserRegisteredSpec(x.shape, x.dtype) + name = getattr(x, "name", None) - super().__init__(type_spec, name) + super().__init__(type_spec, name) - @classmethod - def from_tensor(cls, tensor): - return cls(tensor) + @classmethod + def from_tensor(cls, tensor): + return cls(tensor) - @classmethod - def from_type_spec(cls, type_spec, name=None): - raise NotImplementedError('You cannot instantiate a KerasTensor ' - 'directly from TypeSpec: %s' % type_spec) + @classmethod + def from_type_spec(cls, type_spec, name=None): + raise NotImplementedError( + "You cannot instantiate a KerasTensor directly from TypeSpec: %s" + % type_spec + ) - def _to_placeholder(self): - return self._user_registered_symbolic_object + def _to_placeholder(self): + return self._user_registered_symbolic_object class _KerasTensorIterator: - """Iterates over the leading dim of a KerasTensor. Performs 0 error checks.""" + """Iterates over the leading dim of a KerasTensor. 
Performs 0 error + checks.""" - def __init__(self, tensor, dim0): - self._tensor = tensor - self._index = 0 - self._limit = dim0 + def __init__(self, tensor, dim0): + self._tensor = tensor + self._index = 0 + self._limit = dim0 - def __iter__(self): - return self + def __iter__(self): + return self - def __next__(self): - if self._index == self._limit: - raise StopIteration - result = self._tensor[self._index] - self._index += 1 - return result + def __next__(self): + if self._index == self._limit: + raise StopIteration + result = self._tensor[self._index] + self._index += 1 + return result # Specify the mappings of tensor class to KerasTensor class. @@ -579,76 +636,83 @@ def __next__(self): (tf.Tensor, KerasTensor), (tf.SparseTensor, SparseKerasTensor), (tf.RaggedTensor, RaggedKerasTensor), - (object, KerasTensor) + (object, KerasTensor), ] def register_keras_tensor_specialization(cls, keras_tensor_subclass): - """Register a specialized KerasTensor subclass for a Tensor type.""" - # We always leave (object, KerasTensor) at the end as a generic fallback - keras_tensor_classes.insert(-1, (cls, keras_tensor_subclass)) + """Register a specialized KerasTensor subclass for a Tensor type.""" + # We always leave (object, KerasTensor) at the end as a generic fallback + keras_tensor_classes.insert(-1, (cls, keras_tensor_subclass)) def keras_tensor_to_placeholder(x): - """Construct a graph placeholder to represent a KerasTensor when tracing.""" - if isinstance(x, KerasTensor): - return x._to_placeholder() # pylint: disable=protected-access - else: - return x + """Construct a graph placeholder to represent a KerasTensor when tracing.""" + if isinstance(x, KerasTensor): + return x._to_placeholder() + else: + return x def keras_tensor_from_tensor(tensor): - """Convert a traced (composite)tensor to a representative KerasTensor.""" - # Create a specialized KerasTensor that supports instance methods, - # operators, and additional value inference if possible - keras_tensor_cls = None - for tensor_type, cls in keras_tensor_classes: - if isinstance(tensor, tensor_type): - keras_tensor_cls = cls - break + """Convert a traced (composite)tensor to a representative KerasTensor.""" + # Create a specialized KerasTensor that supports instance methods, + # operators, and additional value inference if possible + keras_tensor_cls = None + for tensor_type, cls in keras_tensor_classes: + if isinstance(tensor, tensor_type): + keras_tensor_cls = cls + break - out = keras_tensor_cls.from_tensor(tensor) + out = keras_tensor_cls.from_tensor(tensor) - if hasattr(tensor, '_keras_mask'): - out._keras_mask = keras_tensor_from_tensor(tensor._keras_mask) # pylint: disable=protected-access - return out + if getattr(tensor, "_keras_mask", None) is not None: + out._keras_mask = keras_tensor_from_tensor(tensor._keras_mask) + return out def keras_tensor_from_type_spec(type_spec, name=None): - """Convert a TypeSpec to a representative KerasTensor.""" - # Create a specialized KerasTensor that supports instance methods, - # operators, and additional value inference if possible - keras_tensor_cls = None - value_type = type_spec.value_type - for tensor_type, cls in keras_tensor_classes: - if issubclass(value_type, tensor_type): - keras_tensor_cls = cls - break + """Convert a TypeSpec to a representative KerasTensor.""" + # Create a specialized KerasTensor that supports instance methods, + # operators, and additional value inference if possible + keras_tensor_cls = None + value_type = type_spec.value_type + for tensor_type, cls in 
keras_tensor_classes: + if issubclass(value_type, tensor_type): + keras_tensor_cls = cls + break - return keras_tensor_cls.from_type_spec(type_spec, name=name) + return keras_tensor_cls.from_type_spec(type_spec, name=name) def type_spec_with_shape(spec, shape): - """Returns a copy of TypeSpec `spec` with its shape set to `shape`.""" - if isinstance(spec, tf.TensorSpec): - # pylint: disable=protected-access - # TODO(b/203201161) Figure out why mutation is needed here, and remove it. - # (TensorSpec objects should be immutable; and we should not be modifying - # private fields.) - shape = tf.TensorShape(shape) - spec._shape = shape - return spec - elif isinstance(spec, tf.RaggedTensorSpec): - return tf.RaggedTensorSpec(shape, spec.dtype, spec.ragged_rank, - spec.row_splits_dtype, - spec.flat_values_spec) - elif isinstance(spec, tf.SparseTensorSpec): - return tf.SparseTensorSpec(shape, spec.dtype) - elif hasattr(spec, 'with_shape'): - # TODO(edloper): Consider adding .with_shape method to TensorSpec, - # RaggedTensorSpec, and SparseTensorSpec. - return spec.with_shape(shape) - else: - # TODO(edloper): Consider moving this check to the KerasTensor constructor. - raise ValueError('Keras requires TypeSpec to have a `with_shape` method ' - 'that returns a copy of `self` with an updated shape.') + """Returns a copy of TypeSpec `spec` with its shape set to `shape`.""" + if isinstance(spec, tf.TensorSpec): + + # TODO(b/203201161) Figure out why mutation is needed here, and remove + # it. (TensorSpec objects should be immutable; and we should not be + # modifying private fields.) + shape = tf.TensorShape(shape) + spec._shape = shape + return spec + elif isinstance(spec, tf.RaggedTensorSpec): + return tf.RaggedTensorSpec( + shape, + spec.dtype, + spec.ragged_rank, + spec.row_splits_dtype, + spec.flat_values_spec, + ) + elif isinstance(spec, tf.SparseTensorSpec): + return tf.SparseTensorSpec(shape, spec.dtype) + elif hasattr(spec, "with_shape"): + # TODO(edloper): Consider adding .with_shape method to TensorSpec, + # RaggedTensorSpec, and SparseTensorSpec. + return spec.with_shape(shape) + else: + # TODO(edloper): Consider moving this check to the KerasTensor + # constructor. + raise ValueError( + "Keras requires TypeSpec to have a `with_shape` method " + "that returns a copy of `self` with an updated shape." + ) diff --git a/keras/engine/keras_tensor_test.py b/keras/engine/keras_tensor_test.py index bd0b4f271454..6f08689c7ebf 100644 --- a/keras/engine/keras_tensor_test.py +++ b/keras/engine/keras_tensor_test.py @@ -13,211 +13,265 @@ # limitations under the License. 
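For orientation, a minimal sketch (not part of the patch) of how the helpers above are meant to be used, assuming TF 2.x and the module layout in this diff:

import tensorflow.compat.v2 as tf

from keras.engine import keras_tensor

# Build a symbolic KerasTensor straight from a TypeSpec; the most specific
# entry in `keras_tensor_classes` wins, with (object, KerasTensor) kept at
# the end as the generic fallback.
kt = keras_tensor.keras_tensor_from_type_spec(tf.TensorSpec([None, 3], tf.float32))

# `type_spec_with_shape` is the helper behind `KerasTensor.set_shape` (cf.
# `test_set_shape_error` below, which triggers its error message); for dense
# specs it currently mutates the spec in place, per the TODO(b/203201161).
spec = keras_tensor.type_spec_with_shape(tf.TensorSpec([8, None], tf.int32), [8, 3])
assert spec == tf.TensorSpec([8, 3], tf.int32)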
# ============================================================================== """InputSpec tests.""" -# pylint: disable=g-bad-import-order -import tensorflow.compat.v2 as tf +import tensorflow.compat.v2 as tf from absl.testing import parameterized -from keras.testing_infra import test_combinations + from keras import layers -from keras.testing_infra import test_utils from keras.engine import keras_tensor from keras.engine import training +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils class CustomTypeSpec(tf.TypeSpec): - """Stubbed-out custom type spec, for testing.""" + """Stubbed-out custom type spec, for testing.""" - def __init__(self, shape, dtype): - self.shape = tf.TensorShape(shape) - self.dtype = tf.dtypes.as_dtype(dtype) + def __init__(self, shape, dtype): + self.shape = tf.TensorShape(shape) + self.dtype = tf.dtypes.as_dtype(dtype) - # Stub implementations for all the TypeSpec methods: - value_type = None - _to_components = lambda self, value: None - _from_components = lambda self, components: None - _component_specs = property(lambda self: None) - _serialize = lambda self: (self.shape, self.dtype) + # Stub implementations for all the TypeSpec methods: + value_type = None + _to_components = lambda self, value: None + _from_components = lambda self, components: None + _component_specs = property(lambda self: None) + _serialize = lambda self: (self.shape, self.dtype) class CustomTypeSpec2(CustomTypeSpec): - """Adds a with_shape method to CustomTypeSpec.""" + """Adds a with_shape method to CustomTypeSpec.""" - def with_shape(self, new_shape): - return CustomTypeSpec2(new_shape, self.dtype) + def with_shape(self, new_shape): + return CustomTypeSpec2(new_shape, self.dtype) @test_utils.run_v2_only class KerasTensorTest(test_combinations.TestCase): + def test_repr_and_string(self): + kt = keras_tensor.KerasTensor( + type_spec=tf.TensorSpec(shape=(1, 2, 3), dtype=tf.float32) + ) + expected_str = ( + "KerasTensor(type_spec=TensorSpec(shape=(1, 2, 3), " + "dtype=tf.float32, name=None))" + ) + expected_repr = "" + self.assertEqual(expected_str, str(kt)) + self.assertEqual(expected_repr, repr(kt)) + + kt = keras_tensor.KerasTensor( + type_spec=tf.TensorSpec(shape=(2,), dtype=tf.int32), + inferred_value=[2, 3], + ) + expected_str = ( + "KerasTensor(type_spec=TensorSpec(shape=(2,), " + "dtype=tf.int32, name=None), inferred_value=[2, 3])" + ) + expected_repr = ( + "" + ) + self.assertEqual(expected_str, str(kt)) + self.assertEqual(expected_repr, repr(kt)) + + kt = keras_tensor.KerasTensor( + type_spec=tf.SparseTensorSpec(shape=(1, 2, 3), dtype=tf.float32) + ) + expected_str = ( + "KerasTensor(type_spec=SparseTensorSpec(" + "TensorShape([1, 2, 3]), tf.float32))" + ) + expected_repr = ( + "" + ) + self.assertEqual(expected_str, str(kt)) + self.assertEqual(expected_repr, repr(kt)) + + inp = layers.Input(shape=(3, 5)) + kt = layers.Dense(10)(inp) + expected_str = ( + "KerasTensor(type_spec=TensorSpec(shape=(None, 3, 10), " + "dtype=tf.float32, name=None), name='dense/BiasAdd:0', " + "description=\"created by layer 'dense'\")" + ) + expected_repr = ( + "" + ) + self.assertEqual(expected_str, str(kt)) + self.assertEqual(expected_repr, repr(kt)) + + kt = tf.reshape(kt, shape=(3, 5, 2)) + expected_str = ( + "KerasTensor(type_spec=TensorSpec(shape=(3, 5, 2), " + "dtype=tf.float32, name=None), name='tf.reshape/Reshape:0', " + "description=\"created by layer 'tf.reshape'\")" + ) + expected_repr = ( + "" + ) + self.assertEqual(expected_str, str(kt)) + 
self.assertEqual(expected_repr, repr(kt)) + + kts = tf.unstack(kt) + for i in range(3): + expected_str = ( + "KerasTensor(type_spec=TensorSpec(shape=(5, 2), " + "dtype=tf.float32, name=None), name='tf.unstack/unstack:%s', " + "description=\"created by layer 'tf.unstack'\")" % (i,) + ) + expected_repr = ( + "" + ) + self.assertEqual(expected_str, str(kts[i])) + self.assertEqual(expected_repr, repr(kts[i])) + + @parameterized.parameters( + {"property_name": "values"}, + {"property_name": "indices"}, + {"property_name": "dense_shape"}, + ) + def test_sparse_instance_property(self, property_name): + inp = layers.Input(shape=[3], sparse=True) + out = getattr(inp, property_name) + model = training.Model(inp, out) + + x = tf.SparseTensor( + [[0, 0], [0, 1], [1, 1], [1, 2]], [1, 2, 3, 4], [2, 3] + ) + expected_property = getattr(x, property_name) + self.assertAllEqual(model(x), expected_property) + + # Test that it works with serialization and deserialization as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected_property) + + @parameterized.parameters( + [ + (tf.TensorSpec([2, 3], tf.int32), [2, 3]), + (tf.RaggedTensorSpec([2, None]), [2, None]), + (tf.SparseTensorSpec([8]), [8]), + (CustomTypeSpec([3, 8], tf.int32), [3, 8]), + ] + ) + def test_shape(self, spec, expected_shape): + kt = keras_tensor.KerasTensor(spec) + self.assertEqual(kt.shape.as_list(), expected_shape) + + @parameterized.parameters( + [ + (tf.TensorSpec([8, 3], tf.int32), [8, 3], [8, 3]), + (tf.TensorSpec([None, 3], tf.int32), [8, 3], [8, 3]), + (tf.TensorSpec([8, 3], tf.int32), [None, 3], [8, 3]), + (tf.TensorSpec(None, tf.int32), [8, 3], [8, 3]), + (tf.TensorSpec(None, tf.int32), [8, None], [8, None]), + (tf.TensorSpec(None, tf.int32), None, None), + (tf.RaggedTensorSpec([2, None, None]), [2, None, 5], [2, None, 5]), + (tf.SparseTensorSpec([8]), [8], [8]), + (CustomTypeSpec2([3, None], tf.int32), [3, 8], [3, 8]), + ] + ) + def test_set_shape(self, spec, new_shape, expected_shape): + kt = keras_tensor.KerasTensor(spec) + kt.set_shape(new_shape) + if expected_shape is None: + self.assertIsNone(kt.type_spec.shape.rank) + else: + self.assertEqual(kt.type_spec.shape.as_list(), expected_shape) + self.assertTrue(kt.type_spec.is_compatible_with(spec)) + + @parameterized.parameters( + [ + (layers.Input(shape=[3, 4], batch_size=7), tf.reshape), + (layers.Input(shape=[3, 4], ragged=True, batch_size=7), tf.reshape), + ( + layers.Input(shape=[3, 4], sparse=True, batch_size=7), + tf.sparse.reshape, + ), + ] + ) + def test_reshape(self, inp, reshape_op): + out = reshape_op(inp, shape=[7, 4, 3]) + self.assertEqual(out.type_spec.shape.as_list(), [7, 4, 3]) + + def test_set_shape_error(self): + spec = CustomTypeSpec([3, None], tf.int32) + kt = keras_tensor.KerasTensor(spec) + with self.assertRaisesRegex( + ValueError, "Keras requires TypeSpec to have a `with_shape` method" + ): + kt.set_shape([3, 3]) + + def test_set_shape_equals_expected_shape(self): + # Tests b/203201161: DenseSpec has both a _shape and a _shape_tuple + # field, and we need to be sure both get updated. + kt = keras_tensor.KerasTensor(tf.TensorSpec([8, None], tf.int32)) + kt.set_shape([8, 3]) + self.assertEqual(kt.type_spec, tf.TensorSpec([8, 3], tf.int32)) + + def test_type_spec_with_shape_equals_expected_shape(self): + # Tests b/203201161: DenseSpec has both a _shape and a _shape_tuple + # field, and we need to be sure both get updated. 
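+ # For dense specs, `type_spec_with_shape` currently mutates the spec's
+ # private `_shape` and returns the same object (see the
+ # TODO(b/203201161) in keras_tensor.py), so `spec1` below ends up
+ # updated in place as well.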
+ spec1 = tf.TensorSpec([8, None], tf.int32) + spec2 = keras_tensor.type_spec_with_shape(spec1, [8, 3]) + expected = tf.TensorSpec([8, 3], tf.int32) + self.assertEqual(spec2, expected) + + def test_missing_shape_error(self): + spec = CustomTypeSpec(None, tf.int32) + del spec.shape + with self.assertRaisesRegex( + ValueError, + "KerasTensor only supports TypeSpecs that have a shape field; .*", + ): + keras_tensor.KerasTensor(spec) + + def test_wrong_shape_type_error(self): + spec = CustomTypeSpec(None, tf.int32) + spec.shape = "foo" + with self.assertRaisesRegex( + TypeError, + "KerasTensor requires that wrapped TypeSpec's shape is a " + "TensorShape; .*", + ): + keras_tensor.KerasTensor(spec) + + def test_missing_dtype_error(self): + spec = CustomTypeSpec(None, tf.int32) + del spec.dtype + kt = keras_tensor.KerasTensor(spec) + with self.assertRaisesRegex( + AttributeError, + "KerasTensor wraps TypeSpec .* which does not have a dtype.", + ): + kt.dtype + + def test_wrong_dtype_type_error(self): + spec = CustomTypeSpec(None, tf.int32) + spec.dtype = "foo" + kt = keras_tensor.KerasTensor(spec) + with self.assertRaisesRegex( + TypeError, + "KerasTensor requires that wrapped TypeSpec's dtype is a DType; .*", + ): + kt.dtype + + def test_from_tensor_mask_tensor_is_none(self): + tensor = tf.constant([1.0]) + kt = keras_tensor.keras_tensor_from_tensor(tensor) + self.assertIsNone(getattr(kt, "_keras_mask", None)) - def test_repr_and_string(self): - kt = keras_tensor.KerasTensor( - type_spec=tf.TensorSpec(shape=(1, 2, 3), dtype=tf.float32)) - expected_str = ("KerasTensor(type_spec=TensorSpec(shape=(1, 2, 3), " - "dtype=tf.float32, name=None))") - expected_repr = "" - self.assertEqual(expected_str, str(kt)) - self.assertEqual(expected_repr, repr(kt)) - - kt = keras_tensor.KerasTensor( - type_spec=tf.TensorSpec(shape=(2,), dtype=tf.int32), - inferred_value=[2, 3]) - expected_str = ("KerasTensor(type_spec=TensorSpec(shape=(2,), " - "dtype=tf.int32, name=None), inferred_value=[2, 3])") - expected_repr = ( - "") - self.assertEqual(expected_str, str(kt)) - self.assertEqual(expected_repr, repr(kt)) - - kt = keras_tensor.KerasTensor( - type_spec=tf.SparseTensorSpec( - shape=(1, 2, 3), dtype=tf.float32)) - expected_str = ("KerasTensor(type_spec=SparseTensorSpec(" - "TensorShape([1, 2, 3]), tf.float32))") - expected_repr = ( - "") - self.assertEqual(expected_str, str(kt)) - self.assertEqual(expected_repr, repr(kt)) - - inp = layers.Input(shape=(3, 5)) - kt = layers.Dense(10)(inp) - expected_str = ( - "KerasTensor(type_spec=TensorSpec(shape=(None, 3, 10), " - "dtype=tf.float32, name=None), name='dense/BiasAdd:0', " - "description=\"created by layer 'dense'\")") - expected_repr = ( - "") - self.assertEqual(expected_str, str(kt)) - self.assertEqual(expected_repr, repr(kt)) - - kt = tf.reshape(kt, shape=(3, 5, 2)) - expected_str = ( - "KerasTensor(type_spec=TensorSpec(shape=(3, 5, 2), dtype=tf.float32, " - "name=None), name='tf.reshape/Reshape:0', description=\"created " - "by layer 'tf.reshape'\")") - expected_repr = ("") - self.assertEqual(expected_str, str(kt)) - self.assertEqual(expected_repr, repr(kt)) - - kts = tf.unstack(kt) - for i in range(3): - expected_str = ( - "KerasTensor(type_spec=TensorSpec(shape=(5, 2), dtype=tf.float32, " - "name=None), name='tf.unstack/unstack:%s', description=\"created " - "by layer 'tf.unstack'\")" % (i,)) - expected_repr = ("") - self.assertEqual(expected_str, str(kts[i])) - self.assertEqual(expected_repr, repr(kts[i])) - - @parameterized.parameters( - {"property_name": 
"values"}, - {"property_name": "indices"}, - {"property_name": "dense_shape"}, - ) - def test_sparse_instance_property(self, property_name): - inp = layers.Input(shape=[3], sparse=True) - out = getattr(inp, property_name) - model = training.Model(inp, out) - - x = tf.SparseTensor([[0, 0], [0, 1], [1, 1], [1, 2]], [1, 2, 3, 4], [2, 3]) - expected_property = getattr(x, property_name) - self.assertAllEqual(model(x), expected_property) - - # Test that it works with serialization and deserialization as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected_property) - - @parameterized.parameters([ - (tf.TensorSpec([2, 3], tf.int32), [2, 3]), - (tf.RaggedTensorSpec([2, None]), [2, None]), - (tf.SparseTensorSpec([8]), [8]), - (CustomTypeSpec([3, 8], tf.int32), [3, 8]), - ]) - def test_shape(self, spec, expected_shape): - kt = keras_tensor.KerasTensor(spec) - self.assertEqual(kt.shape.as_list(), expected_shape) - - @parameterized.parameters([ - (tf.TensorSpec([8, 3], tf.int32), [8, 3], [8, 3]), - (tf.TensorSpec([None, 3], tf.int32), [8, 3], [8, 3]), - (tf.TensorSpec([8, 3], tf.int32), [None, 3], [8, 3]), - (tf.TensorSpec(None, tf.int32), [8, 3], [8, 3]), - (tf.TensorSpec(None, tf.int32), [8, None], [8, None]), - (tf.TensorSpec(None, tf.int32), None, None), - (tf.RaggedTensorSpec([2, None, None]), [2, None, 5], [2, None, 5]), - (tf.SparseTensorSpec([8]), [8], [8]), - (CustomTypeSpec2([3, None], tf.int32), [3, 8], [3, 8]), - ]) - def test_set_shape(self, spec, new_shape, expected_shape): - kt = keras_tensor.KerasTensor(spec) - kt.set_shape(new_shape) - if expected_shape is None: - self.assertIsNone(kt.type_spec.shape.rank) - else: - self.assertEqual(kt.type_spec.shape.as_list(), expected_shape) - self.assertTrue(kt.type_spec.is_compatible_with(spec)) - - def test_set_shape_error(self): - spec = CustomTypeSpec([3, None], tf.int32) - kt = keras_tensor.KerasTensor(spec) - with self.assertRaisesRegex( - ValueError, "Keras requires TypeSpec to have a `with_shape` method"): - kt.set_shape([3, 3]) - - def test_set_shape_equals_expected_shape(self): - # Tests b/203201161: DenseSpec has both a _shape and a _shape_tuple field, - # and we need to be sure both get updated. - kt = keras_tensor.KerasTensor(tf.TensorSpec([8, None], tf.int32)) - kt.set_shape([8, 3]) - self.assertEqual(kt.type_spec, tf.TensorSpec([8, 3], tf.int32)) - - def test_type_spec_with_shape_equals_expected_shape(self): - # Tests b/203201161: DenseSpec has both a _shape and a _shape_tuple field, - # and we need to be sure both get updated. 
- spec1 = tf.TensorSpec([8, None], tf.int32) - spec2 = keras_tensor.type_spec_with_shape(spec1, [8, 3]) - expected = tf.TensorSpec([8, 3], tf.int32) - self.assertEqual(spec2, expected) - - def test_missing_shape_error(self): - spec = CustomTypeSpec(None, tf.int32) - del spec.shape - with self.assertRaisesRegex( - ValueError, - "KerasTensor only supports TypeSpecs that have a shape field; .*"): - keras_tensor.KerasTensor(spec) - - def test_wrong_shape_type_error(self): - spec = CustomTypeSpec(None, tf.int32) - spec.shape = "foo" - with self.assertRaisesRegex( - TypeError, "KerasTensor requires that wrapped TypeSpec's shape is a " - "TensorShape; .*"): - keras_tensor.KerasTensor(spec) - - def test_missing_dtype_error(self): - spec = CustomTypeSpec(None, tf.int32) - del spec.dtype - kt = keras_tensor.KerasTensor(spec) - with self.assertRaisesRegex( - AttributeError, - "KerasTensor wraps TypeSpec .* which does not have a dtype."): - kt.dtype # pylint: disable=pointless-statement - - def test_wrong_dtype_type_error(self): - spec = CustomTypeSpec(None, tf.int32) - spec.dtype = "foo" - kt = keras_tensor.KerasTensor(spec) - with self.assertRaisesRegex( - TypeError, - "KerasTensor requires that wrapped TypeSpec's dtype is a DType; .*"): - kt.dtype # pylint: disable=pointless-statement + def test_from_tensor_mask_tensor_is_not_none(self): + tensor = tf.constant([1.0]) + tensor._keras_mask = tf.constant([1.0]) + kt = keras_tensor.keras_tensor_from_tensor(tensor) + self.assertIsInstance(kt._keras_mask, keras_tensor.KerasTensor) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/engine/node.py b/keras/engine/node.py index 2647f44d614a..946b9fce32b2 100644 --- a/keras/engine/node.py +++ b/keras/engine/node.py @@ -12,306 +12,333 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access -# pylint: disable=g-classes-have-attributes -"""Contains the `Node` class.""" -import tensorflow.compat.v2 as tf + +"""Contains the `Node` class.""" import collections import copy import json + import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine import base_layer_utils -from keras.saving.saved_model import json_utils +from keras.saving.legacy.saved_model import json_utils from keras.utils import tf_utils -_CONSTANT_VALUE = '_CONSTANT_VALUE' +_CONSTANT_VALUE = "_CONSTANT_VALUE" # Using dict to avoid conflict with constant string tensor. -_COMPOSITE_TYPE = {'_TYPE': 'COMPOSITE'} +_COMPOSITE_TYPE = {"_TYPE": "COMPOSITE"} class Node: - """A `Node` describes a layer `__call__()` event. - - A Functional model is a DAG with `Node` instances as nodes, and `KerasTensor` - instances as edges. Nodes aren't `Layer` instances, because a single layer - could be called multiple times, which would result in graph cycles. - - A `__call__()` event involves input tensors (and other input arguments), - the layer that was called, and the resulting output tensors. - A `Node` will include all this information. - - Since a single `Layer` could be called multiple times, the `Node` instances - are stored on layers as a list. Each time a layer is called - a node is added to `layer._inbound_nodes`. Each time the output of a layer is - used by another layer, a node is added to `layer._outbound_nodes`. 
- - Every `KerasTensor` instance has a `KerasHistory` object attached, - which tracks the `Node` that records the `__call__()` event that created - the tensor. By recursively walking through `Node` instances - via the `KerasHistory` metadata of `KerasTensor` instances, once can - retrieve the entire DAG of a Functional model. - - Args: - layer: The layer that was called in the `Layer.__call__()` - event that this node represents. - call_args: The positional arguments the layer was called with. - call_kwargs: The keyword arguments the layer was called with. - outputs: The output tensors of the `Layer.__call__()` - """ - - def __init__(self, - layer, - call_args=None, - call_kwargs=None, - outputs=None): - call_args = [] if call_args is None else call_args - call_kwargs = {} if call_kwargs is None else call_kwargs - outputs = [] if outputs is None else outputs - - self.layer = layer - self.is_input = not call_args and not call_kwargs - - # These arguments are user-provided. Copy the structures here so that - # future user modifications do not affect the node's metadata. - # We copy using map_structure rather than python's shallow or deep copy, - # because the args can be data structures (so shallow copy is - # insufficient), but individual values might not support copy.copy - # or be too expensive to deep copy. - call_args = tf.nest.map_structure(lambda t: t, call_args) - call_kwargs = tf.nest.map_structure(lambda t: t, call_kwargs) - self.outputs = tf.nest.map_structure(lambda t: t, outputs) - self.call_args = call_args - self.call_kwargs = call_kwargs - - # Cached for performance. - self._flat_arguments = tf.nest.flatten((self.call_args, self.call_kwargs)) - # Used to avoid expensive `nest` operations in the most common case. - self._single_positional_tensor_passed = (not self.call_kwargs and len( - self.call_args) == 1 and tf.is_tensor(self.call_args[0])) - - if not tf.compat.v1.executing_eagerly_outside_functions(): - # Create TensorFlowOpLayers if needed (in TF1) - for obj in self._flat_arguments: - if (isinstance(obj, tf.Tensor) and - base_layer_utils.needs_keras_history( - obj, ignore_call_context=True)): - base_layer_utils.create_keras_history(obj) - - self._keras_inputs = [] - self._keras_inputs_ids_and_indices = [] - for i, ele in enumerate(self._flat_arguments): - if is_keras_tensor(ele): - self._keras_inputs.append(ele) - kt_id = str(id(ele)) - kt_index = i - self._keras_inputs_ids_and_indices.append((kt_id, kt_index)) - - # Wire up Node to Layers. - self.layer._inbound_nodes.append(self) - for kt in self.keras_inputs: - inbound_layer = kt._keras_history.layer - if inbound_layer is not None: # `None` for `Input` tensors. - inbound_layer._outbound_nodes.append(self) - - # Set metadata on outputs. - node_index = len(self.layer._inbound_nodes) - 1 - for i, tensor in enumerate(tf.nest.flatten(outputs)): - tensor._keras_history = KerasHistory( - layer=layer, node_index=node_index, tensor_index=i) - - # Cached for performance. 
- self.flat_input_ids = [str(id(t)) for t in self._keras_inputs] - self.flat_output_ids = [str(id(t)) for t in tf.nest.flatten(self.outputs)] - - @property - def keras_inputs(self): - """Tensors input to this node that can be traced back to a `keras.Input`.""" - return self._keras_inputs - - @property - def parent_nodes(self): - """Returns all the `Node`s whose output this node immediately depends on.""" - node_deps = [] - for kt in self.keras_inputs: - layer = kt._keras_history.layer - node_index = kt._keras_history.node_index - if layer is not None: # `None` for `Input` tensors. - node_deps.append(layer._inbound_nodes[node_index]) - return node_deps - - def iterate_inbound(self): - """Yields tuples representing the data inbound from other nodes. - - Yields: - tuples like: (inbound_layer, node_index, tensor_index, tensor). + """A `Node` describes a layer `__call__()` event. + + A Functional model is a DAG with `Node` instances as nodes, and + `KerasTensor` instances as edges. Nodes aren't `Layer` instances, because a + single layer could be called multiple times, which would result in graph + cycles. + + A `__call__()` event involves input tensors (and other input arguments), + the layer that was called, and the resulting output tensors. + A `Node` will include all this information. + + Since a single `Layer` could be called multiple times, the `Node` instances + are stored on layers as a list. Each time a layer is called, a node is added + to `layer._inbound_nodes`. Each time the output of a layer is used by + another layer, a node is added to `layer._outbound_nodes`. + + Every `KerasTensor` instance has a `KerasHistory` object attached, + which tracks the `Node` that records the `__call__()` event that created + the tensor. By recursively walking through `Node` instances + via the `KerasHistory` metadata of `KerasTensor` instances, one can + retrieve the entire DAG of a Functional model. + + Args: + layer: The layer that was called in the `Layer.__call__()` + event that this node represents. + call_args: The positional arguments the layer was called with. + call_kwargs: The keyword arguments the layer was called with. + outputs: The output tensors of the `Layer.__call__()` """ - for kt in self.keras_inputs: - keras_history = kt._keras_history - layer = keras_history.layer - node_index = keras_history.node_index - tensor_index = keras_history.tensor_index - yield layer, node_index, tensor_index, kt - - def map_arguments(self, tensor_dict): - """Maps Keras Tensors to computed Tensors using `tensor_dict`.""" - if self._single_positional_tensor_passed: - # Performance optimization for most common case. - kt_id, _ = self._keras_inputs_ids_and_indices[0] - return (tensor_dict[kt_id].pop(),), {} - else: - flat_arguments = copy.copy(self._flat_arguments) - for kt_id, kt_index in self._keras_inputs_ids_and_indices: - flat_arguments[kt_index] = tensor_dict[kt_id].pop() - - args, kwargs = tf.nest.pack_sequence_as((self.call_args, self.call_kwargs), - flat_arguments) - return args, kwargs - - def serialize(self, make_node_key, node_conversion_map): - """Serializes `Node` for Functional API's `get_config`.""" - # Serialization still special-cases first argument. - args, kwargs = self.call_args, self.call_kwargs - inputs, args, kwargs = self.layer._call_spec.split_out_first_arg( - args, kwargs) - - # Treat everything other than first argument as a kwarg.
- arguments = dict(zip(self.layer._call_spec.arg_names[1:], args)) - arguments.update(kwargs) - kwargs = arguments - - def _serialize_keras_tensor(t): - """Serializes a single Tensor passed to `call`.""" - if hasattr(t, '_keras_history'): - kh = t._keras_history - node_index = kh.node_index - node_key = make_node_key(kh.layer.name, node_index) - new_node_index = node_conversion_map.get(node_key, 0) - return [kh.layer.name, new_node_index, kh.tensor_index] - - if isinstance(t, np.ndarray): - return t.tolist() - - if isinstance(t, tf.Tensor): - return backend.get_value(t).tolist() - - # Not using json_utils to serialize both constant Tensor and constant - # CompositeTensor for saving format backward compatibility. - if isinstance(t, tf.__internal__.CompositeTensor): - return (_COMPOSITE_TYPE, json_utils.Encoder().encode(t)) - - return t - - kwargs = tf.nest.map_structure(_serialize_keras_tensor, kwargs) - try: - json.dumps(kwargs, default=json_utils.get_json_type) - except TypeError: - kwarg_types = tf.nest.map_structure(type, kwargs) - raise TypeError('Layer ' + self.layer.name + - ' was passed non-JSON-serializable arguments. ' + - 'Arguments had types: ' + - str(kwarg_types) + '. They cannot be serialized out ' - 'when saving the model.') - - # `kwargs` is added to each Tensor in the first arg. This should be - # changed in a future version of the serialization format. - def serialize_first_arg_tensor(t): - if is_keras_tensor(t): - kh = t._keras_history - node_index = kh.node_index - node_key = make_node_key(kh.layer.name, node_index) - new_node_index = node_conversion_map.get(node_key, 0) - data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] - else: - # If an element in the first call argument did not originate as a - # keras tensor and is a constant value, we save it using the format - # ['_CONSTANT_VALUE', -1, serialized_tensor_or_python_constant] - # (potentially including serialized kwargs in an optional 4th argument). - data = [_CONSTANT_VALUE, -1, _serialize_keras_tensor(t), kwargs] - return tf_utils.ListWrapper(data) - - data = tf.nest.map_structure(serialize_first_arg_tensor, inputs) - if (not tf.nest.is_nested(data) and - not self.layer._preserve_input_structure_in_config): - data = [data] - data = tf_utils.convert_inner_node_data(data) - return data - - ############################################################# - # Properties for Backwards compatibility. - # These only check the first input argument - # As nodes are internal, they may be removed in the future. - ############################################################# - - @property - def input_tensors(self): - if self.is_input: - return [self.outputs] # Used in `Layer.input`. - return self.call_args[0] - - @property - def output_tensors(self): - if self.is_input: - return [self.outputs] # Used in `Layer.input`. 
- return self.outputs - - @property - def input_shapes(self): - input_shapes = tf.nest.map_structure(backend.int_shape, self.input_tensors) - if len(input_shapes) == 1 and not self.is_input: - return input_shapes[0] - return input_shapes - - @property - def output_shapes(self): - return tf.nest.map_structure(backend.int_shape, self.output_tensors) - - @property - def outbound_layer(self): - return self.layer - - @property - def inbound_layers(self): - """Return all layers that feed into the current node.""" - if self.is_input: - return [] - tensor_call_args = [x for x in self._flat_arguments - if tf.is_tensor(x) and hasattr(x, '_keras_history')] - inbound_layers = tf.nest.map_structure(lambda t: t._keras_history.layer, - tensor_call_args) - if len(inbound_layers) == 1: - return inbound_layers[0] - return inbound_layers + + def __init__(self, layer, call_args=None, call_kwargs=None, outputs=None): + call_args = [] if call_args is None else call_args + call_kwargs = {} if call_kwargs is None else call_kwargs + outputs = [] if outputs is None else outputs + + self.layer = layer + self.is_input = not call_args and not call_kwargs + + # These arguments are user-provided. Copy the structures here so that + # future user modifications do not affect the node's metadata. + # We copy using map_structure rather than python's shallow or deep copy, + # because the args can be data structures (so shallow copy is + # insufficient), but individual values might not support copy.copy + # or be too expensive to deep copy. + call_args = tf.nest.map_structure(lambda t: t, call_args) + call_kwargs = tf.nest.map_structure(lambda t: t, call_kwargs) + self.outputs = tf.nest.map_structure(lambda t: t, outputs) + self.call_args = call_args + self.call_kwargs = call_kwargs + + # Cached for performance. + self._flat_arguments = tf.nest.flatten( + (self.call_args, self.call_kwargs) + ) + # Used to avoid expensive `nest` operations in the most common case. + self._single_positional_tensor_passed = ( + not self.call_kwargs + and len(self.call_args) == 1 + and tf.is_tensor(self.call_args[0]) + ) + + if not tf.compat.v1.executing_eagerly_outside_functions(): + # Create TensorFlowOpLayers if needed (in TF1) + for obj in self._flat_arguments: + if isinstance( + obj, tf.Tensor + ) and base_layer_utils.needs_keras_history( + obj, ignore_call_context=True + ): + base_layer_utils.create_keras_history(obj) + + self._keras_inputs = [] + self._keras_inputs_ids_and_indices = [] + for i, ele in enumerate(self._flat_arguments): + if is_keras_tensor(ele): + self._keras_inputs.append(ele) + kt_id = str(id(ele)) + kt_index = i + self._keras_inputs_ids_and_indices.append((kt_id, kt_index)) + + # Wire up Node to Layers. + self.layer._inbound_nodes.append(self) + for kt in self.keras_inputs: + inbound_layer = kt._keras_history.layer + if inbound_layer is not None: # `None` for `Input` tensors. + inbound_layer._outbound_nodes.append(self) + + # Set metadata on outputs. + node_index = len(self.layer._inbound_nodes) - 1 + for i, tensor in enumerate(tf.nest.flatten(outputs)): + tensor._keras_history = KerasHistory( + layer=layer, node_index=node_index, tensor_index=i + ) + + # Cached for performance. 
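+ # These `str(id(...))` keys mirror `_keras_inputs_ids_and_indices`
+ # above: during functional-model execution, computed tensors are
+ # stored and fetched in a `tensor_dict` keyed by the object identity
+ # of the corresponding KerasTensors (see `map_arguments` below).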
+ self.flat_input_ids = [str(id(t)) for t in self._keras_inputs] + self.flat_output_ids = [ + str(id(t)) for t in tf.nest.flatten(self.outputs) + ] + + @property + def keras_inputs(self): + """Tensors input to this node that can be traced back to a + `keras.Input`.""" + return self._keras_inputs + + @property + def parent_nodes(self): + """Returns all the `Node`s whose output this node immediately depends + on.""" + node_deps = [] + for kt in self.keras_inputs: + layer = kt._keras_history.layer + node_index = kt._keras_history.node_index + if layer is not None: # `None` for `Input` tensors. + node_deps.append(layer._inbound_nodes[node_index]) + return node_deps + + def iterate_inbound(self): + """Yields tuples representing the data inbound from other nodes. + + Yields: + tuples like: (inbound_layer, node_index, tensor_index, tensor). + """ + for kt in self.keras_inputs: + keras_history = kt._keras_history + layer = keras_history.layer + node_index = keras_history.node_index + tensor_index = keras_history.tensor_index + yield layer, node_index, tensor_index, kt + + def map_arguments(self, tensor_dict): + """Maps Keras Tensors to computed Tensors using `tensor_dict`.""" + if self._single_positional_tensor_passed: + # Performance optimization for most common case. + kt_id, _ = self._keras_inputs_ids_and_indices[0] + return (tensor_dict[kt_id].pop(),), {} + else: + flat_arguments = copy.copy(self._flat_arguments) + for kt_id, kt_index in self._keras_inputs_ids_and_indices: + flat_arguments[kt_index] = tensor_dict[kt_id].pop() + + args, kwargs = tf.nest.pack_sequence_as( + (self.call_args, self.call_kwargs), flat_arguments + ) + return args, kwargs + + def serialize(self, make_node_key, node_conversion_map): + """Serializes `Node` for Functional API's `get_config`.""" + # Serialization still special-cases first argument. + args, kwargs = self.call_args, self.call_kwargs + inputs, args, kwargs = self.layer._call_spec.split_out_first_arg( + args, kwargs + ) + + # Treat everything other than first argument as a kwarg. + arguments = dict(zip(self.layer._call_spec.arg_names[1:], args)) + arguments.update(kwargs) + kwargs = arguments + + def _serialize_keras_tensor(t): + """Serializes a single Tensor passed to `call`.""" + if hasattr(t, "_keras_history"): + kh = t._keras_history + node_index = kh.node_index + node_key = make_node_key(kh.layer.name, node_index) + new_node_index = node_conversion_map.get(node_key, 0) + return [kh.layer.name, new_node_index, kh.tensor_index] + + if isinstance(t, np.ndarray): + return t.tolist() + + if isinstance(t, tf.Tensor): + return backend.get_value(t).tolist() + + # Not using json_utils to serialize both constant Tensor and + # constant CompositeTensor for saving format backward compatibility. + if isinstance(t, tf.__internal__.CompositeTensor): + return (_COMPOSITE_TYPE, json_utils.Encoder().encode(t)) + + return t + + kwargs = tf.nest.map_structure(_serialize_keras_tensor, kwargs) + try: + json.dumps(kwargs, default=json_utils.get_json_type) + except TypeError: + kwarg_types = tf.nest.map_structure(type, kwargs) + raise TypeError( + "Layer " + + self.layer.name + + " was passed non-JSON-serializable arguments. " + + "Arguments had types: " + + str(kwarg_types) + + ". They cannot be serialized out when saving the model." + ) + + # `kwargs` is added to each Tensor in the first arg. This should be + # changed in a future version of the serialization format. 
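+ # Illustratively, for a layer called once on a single `keras.Input`,
+ # each first-arg entry serializes to
+ # [inbound_layer_name, node_index, tensor_index, kwargs],
+ # e.g. ["input_1", 0, 0, {}], while a constant first argument becomes
+ # ["_CONSTANT_VALUE", -1, serialized_value, kwargs].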
+ def serialize_first_arg_tensor(t): + if is_keras_tensor(t): + kh = t._keras_history + node_index = kh.node_index + node_key = make_node_key(kh.layer.name, node_index) + new_node_index = node_conversion_map.get(node_key, 0) + data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] + else: + # If an element in the first call argument did not originate as + # a keras tensor and is a constant value, we save it using the + # format ['_CONSTANT_VALUE', -1, + # serialized_tensor_or_python_constant] (potentially including + # serialized kwargs in an optional 4th argument). + data = [_CONSTANT_VALUE, -1, _serialize_keras_tensor(t), kwargs] + return tf_utils.ListWrapper(data) + + data = tf.nest.map_structure(serialize_first_arg_tensor, inputs) + if ( + not tf.nest.is_nested(data) + and not self.layer._preserve_input_structure_in_config + ): + data = [data] + data = tf_utils.convert_inner_node_data(data) + return data + + ############################################################# + # Properties for Backwards compatibility. + # These only check the first input argument + # As nodes are internal, they may be removed in the future. + ############################################################# + + @property + def input_tensors(self): + if self.is_input: + return [self.outputs] # Used in `Layer.input`. + return self.call_args[0] + + @property + def output_tensors(self): + if self.is_input: + return [self.outputs] # Used in `Layer.input`. + return self.outputs + + @property + def input_shapes(self): + input_shapes = tf.nest.map_structure( + backend.int_shape, self.input_tensors + ) + if len(input_shapes) == 1 and not self.is_input: + return input_shapes[0] + return input_shapes + + @property + def output_shapes(self): + return tf.nest.map_structure(backend.int_shape, self.output_tensors) + + @property + def outbound_layer(self): + return self.layer + + @property + def inbound_layers(self): + """Return all layers that feed into the current node.""" + if self.is_input: + return [] + tensor_call_args = [ + x + for x in self._flat_arguments + if tf.is_tensor(x) and hasattr(x, "_keras_history") + ] + inbound_layers = tf.nest.map_structure( + lambda t: t._keras_history.layer, tensor_call_args + ) + if len(inbound_layers) == 1: + return inbound_layers[0] + return inbound_layers class KerasHistory( - collections.namedtuple('KerasHistory', - ['layer', 'node_index', 'tensor_index'])): - """Tracks the Layer call that created a Tensor, for Keras Graph Networks. - - During construction of Keras Graph Networks, this metadata is added to - each Tensor produced as the output of a Layer, starting with an - `InputLayer`. This allows Keras to track how each Tensor was produced, and - this information is later retraced by the `keras.engine.Network` class to - reconstruct the Keras Graph Network. - - Attributes: - layer: The Layer that produced the Tensor. - node_index: The specific call to the Layer that produced this Tensor. Layers - can be called multiple times in order to share weights. A new node is - created every time a Layer is called. The corresponding node that - represents the call event that produced the Tensor can be found at - `layer._inbound_nodes[node_index]`. - tensor_index: The output index for this Tensor. Always zero if the Layer - that produced this Tensor only has one output. Nested structures of - Tensors are deterministically assigned an index via `nest.flatten`. - """ - # Added to maintain memory and performance characteristics of `namedtuple` - # while subclassing. 
- __slots__ = () + collections.namedtuple( + "KerasHistory", ["layer", "node_index", "tensor_index"] + ) +): + """Tracks the Layer call that created a Tensor, for Keras Graph Networks. + + During construction of Keras Graph Networks, this metadata is added to + each Tensor produced as the output of a Layer, starting with an + `InputLayer`. This allows Keras to track how each Tensor was produced, and + this information is later retraced by the `keras.engine.Network` class to + reconstruct the Keras Graph Network. + + Attributes: + layer: The Layer that produced the Tensor. + node_index: The specific call to the Layer that produced this Tensor. + Layers can be called multiple times in order to share weights. A new + node is created every time a Layer is called. The corresponding node + that represents the call event that produced the Tensor can be found at + `layer._inbound_nodes[node_index]`. + tensor_index: The output index for this Tensor. Always zero if the Layer + that produced this Tensor only has one output. Nested structures of + Tensors are deterministically assigned an index via `nest.flatten`. + """ + + # Added to maintain memory and performance characteristics of `namedtuple` + # while subclassing. + __slots__ = () def is_keras_tensor(obj): - return hasattr(obj, '_keras_history') + return hasattr(obj, "_keras_history") diff --git a/keras/engine/node_test.py b/keras/engine/node_test.py index 4f2c30590433..5fa822e30131 100644 --- a/keras/engine/node_test.py +++ b/keras/engine/node_test.py @@ -11,148 +11,162 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#,============================================================================ +# ,============================================================================ """Tests for layer graphs construction & handling.""" +import tensorflow.compat.v2 as tf + from keras.engine import base_layer from keras.engine import node as node_module from keras.testing_infra import test_combinations -import tensorflow.compat.v2 as tf class DummyTensor(tf.__internal__.types.Tensor): + def __init__(self, shape=None): + self._shape = shape - def __init__(self, shape=None): - self._shape = shape - - @property - def shape(self): - return self._shape + @property + def shape(self): + return self._shape class DummyLayer(base_layer.Layer): - pass + pass class NetworkConstructionTest(test_combinations.TestCase): - - def test_chained_node_construction(self): - # test basics - a = DummyTensor(shape=(None, 32)) - b = DummyTensor(shape=(None, 32)) - - a_layer = DummyLayer() - node = node_module.Node(a_layer, outputs=a) - self.assertEqual(node.outbound_layer, a_layer) - - self.assertTrue(node.is_input) - self.assertListEqual(node.inbound_layers, []) - self.assertListEqual(node.input_tensors, [a]) - self.assertListEqual(node.input_shapes, [(None, 32)]) - self.assertListEqual(node.output_tensors, [a]) - self.assertListEqual(node.output_shapes, [(None, 32)]) - - b_layer = DummyLayer() - node_module.Node(b_layer, outputs=b) - - dense = DummyLayer() - a_2 = DummyTensor() - node_a = node_module.Node(layer=dense, call_args=(a,), outputs=a_2) - b_2 = DummyTensor() - node_b = node_module.Node(layer=dense, call_args=(b,), outputs=b_2) - - # test the node attributes - self.assertFalse(node_a.is_input) - self.assertFalse(node_b.is_input) - self.assertEqual(node_a.call_args, (a,)) - self.assertEqual(node_a.call_kwargs, {}) - 
self.assertEqual(node_a.outputs, a_2) - - # Test the layer wiring - self.assertLen(dense._inbound_nodes, 2) - self.assertLen(dense._outbound_nodes, 0) - self.assertEqual(dense._inbound_nodes, [node_a, node_b]) - self.assertEqual(dense._inbound_nodes[0].inbound_layers, a_layer) - self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense) - self.assertEqual(dense._inbound_nodes[1].inbound_layers, b_layer) - self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense) - self.assertIs(dense._inbound_nodes[0].input_tensors, a) - self.assertIs(dense._inbound_nodes[1].input_tensors, b) - - def test_multi_input_node(self): - # test multi-input layer - a = DummyTensor() - b = DummyTensor() - - dense = DummyLayer() - a_2 = DummyTensor() - node_module.Node(layer=dense, call_args=(a,), outputs=a_2) - b_2 = DummyTensor() - node_module.Node(layer=dense, call_args=(b,), outputs=b_2) - - concat_layer = DummyLayer() - merged = DummyTensor() - node_module.Node(layer=concat_layer, call_args=([a_2, b_2],), - outputs=merged) - - merge_layer, merge_node_index, merge_tensor_index = merged._keras_history - - self.assertEqual(merge_node_index, 0) - self.assertEqual(merge_tensor_index, 0) - - self.assertLen(merge_layer._inbound_nodes, 1) - self.assertLen(merge_layer._outbound_nodes, 0) - - self.assertLen(merge_layer._inbound_nodes[0].input_tensors, 2) - self.assertEqual(merge_layer._inbound_nodes[0].input_tensors, [a_2, b_2]) - self.assertLen(merge_layer._inbound_nodes[0].inbound_layers, 2) - - def test_arg_and_kwarg_mix(self): - input_layer = DummyLayer() - input_layer_2 = DummyLayer() - a = DummyTensor() - node_a = node_module.Node(layer=input_layer, outputs=a) - b = DummyTensor() - node_b = node_module.Node(layer=input_layer_2, outputs=b) - - arg_2 = DummyTensor() - arg_3 = DummyTensor() - node_c = node_module.Node(layer=input_layer, outputs=arg_3) - - kwarg_x = DummyTensor() - kwarg_y = DummyTensor() - node_d = node_module.Node(layer=input_layer, outputs=kwarg_y) - - merge_layer = DummyLayer() - merged = DummyTensor() - node = node_module.Node(layer=merge_layer, - call_args=([a, b], arg_2, arg_3), - call_kwargs={'x': kwarg_x, 'y': kwarg_y}, - outputs=merged) - - merge_layer, merge_node_index, merge_tensor_index = merged._keras_history - - # Check the saved call args/kwargs - self.assertEqual(([a, b], arg_2, arg_3), node.call_args) - self.assertEqual({'x': kwarg_x, 'y': kwarg_y}, node.call_kwargs) - - # Only the inputs that were produced by input nodes should appear in - # keras_tensors - self.assertEqual({a, b, arg_3, kwarg_y}, set(node.keras_inputs)) - self.assertEqual(set(node.parent_nodes), {node_a, node_b, node_c, node_d}) - - # Check the layer wirings - self.assertEqual(merge_node_index, 0) - self.assertEqual(merge_tensor_index, 0) - self.assertLen(merge_layer._inbound_nodes, 1) - self.assertLen(merge_layer._outbound_nodes, 0) - self.assertLen(input_layer._outbound_nodes, 3) - self.assertLen(input_layer_2._outbound_nodes, 1) - - self.assertLen(merge_layer._inbound_nodes[0].input_tensors, 2) - self.assertEqual(merge_layer._inbound_nodes[0].input_tensors, [a, b]) - self.assertLen(merge_layer._inbound_nodes[0].inbound_layers, 4) - - -if __name__ == '__main__': - tf.test.main() + def test_chained_node_construction(self): + # test basics + a = DummyTensor(shape=(None, 32)) + b = DummyTensor(shape=(None, 32)) + + a_layer = DummyLayer() + node = node_module.Node(a_layer, outputs=a) + self.assertEqual(node.outbound_layer, a_layer) + + self.assertTrue(node.is_input) + 
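+ # `is_input` is True because the node was created with no call args
+ # or kwargs (cf. `Node.__init__` above); such input nodes report
+ # their outputs as their own input tensors, which the assertions
+ # below rely on.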
self.assertListEqual(node.inbound_layers, []) + self.assertListEqual(node.input_tensors, [a]) + self.assertListEqual(node.input_shapes, [(None, 32)]) + self.assertListEqual(node.output_tensors, [a]) + self.assertListEqual(node.output_shapes, [(None, 32)]) + + b_layer = DummyLayer() + node_module.Node(b_layer, outputs=b) + + dense = DummyLayer() + a_2 = DummyTensor() + node_a = node_module.Node(layer=dense, call_args=(a,), outputs=a_2) + b_2 = DummyTensor() + node_b = node_module.Node(layer=dense, call_args=(b,), outputs=b_2) + + # test the node attributes + self.assertFalse(node_a.is_input) + self.assertFalse(node_b.is_input) + self.assertEqual(node_a.call_args, (a,)) + self.assertEqual(node_a.call_kwargs, {}) + self.assertEqual(node_a.outputs, a_2) + + # Test the layer wiring + self.assertLen(dense._inbound_nodes, 2) + self.assertLen(dense._outbound_nodes, 0) + self.assertEqual(dense._inbound_nodes, [node_a, node_b]) + self.assertEqual(dense._inbound_nodes[0].inbound_layers, a_layer) + self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense) + self.assertEqual(dense._inbound_nodes[1].inbound_layers, b_layer) + self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense) + self.assertIs(dense._inbound_nodes[0].input_tensors, a) + self.assertIs(dense._inbound_nodes[1].input_tensors, b) + + def test_multi_input_node(self): + # test multi-input layer + a = DummyTensor() + b = DummyTensor() + + dense = DummyLayer() + a_2 = DummyTensor() + node_module.Node(layer=dense, call_args=(a,), outputs=a_2) + b_2 = DummyTensor() + node_module.Node(layer=dense, call_args=(b,), outputs=b_2) + + concat_layer = DummyLayer() + merged = DummyTensor() + node_module.Node( + layer=concat_layer, call_args=([a_2, b_2],), outputs=merged + ) + + ( + merge_layer, + merge_node_index, + merge_tensor_index, + ) = merged._keras_history + + self.assertEqual(merge_node_index, 0) + self.assertEqual(merge_tensor_index, 0) + + self.assertLen(merge_layer._inbound_nodes, 1) + self.assertLen(merge_layer._outbound_nodes, 0) + + self.assertLen(merge_layer._inbound_nodes[0].input_tensors, 2) + self.assertEqual( + merge_layer._inbound_nodes[0].input_tensors, [a_2, b_2] + ) + self.assertLen(merge_layer._inbound_nodes[0].inbound_layers, 2) + + def test_arg_and_kwarg_mix(self): + input_layer = DummyLayer() + input_layer_2 = DummyLayer() + a = DummyTensor() + node_a = node_module.Node(layer=input_layer, outputs=a) + b = DummyTensor() + node_b = node_module.Node(layer=input_layer_2, outputs=b) + + arg_2 = DummyTensor() + arg_3 = DummyTensor() + node_c = node_module.Node(layer=input_layer, outputs=arg_3) + + kwarg_x = DummyTensor() + kwarg_y = DummyTensor() + node_d = node_module.Node(layer=input_layer, outputs=kwarg_y) + + merge_layer = DummyLayer() + merged = DummyTensor() + node = node_module.Node( + layer=merge_layer, + call_args=([a, b], arg_2, arg_3), + call_kwargs={"x": kwarg_x, "y": kwarg_y}, + outputs=merged, + ) + + ( + merge_layer, + merge_node_index, + merge_tensor_index, + ) = merged._keras_history + + # Check the saved call args/kwargs + self.assertEqual(([a, b], arg_2, arg_3), node.call_args) + self.assertEqual({"x": kwarg_x, "y": kwarg_y}, node.call_kwargs) + + # Only the inputs that were produced by input nodes should appear in + # keras_tensors + self.assertEqual({a, b, arg_3, kwarg_y}, set(node.keras_inputs)) + self.assertEqual( + set(node.parent_nodes), {node_a, node_b, node_c, node_d} + ) + + # Check the layer wirings + self.assertEqual(merge_node_index, 0) + self.assertEqual(merge_tensor_index, 0) + 
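+ # (The merge node appends itself to a producer's `_outbound_nodes`
+ # once per consumed tensor, so `input_layer`, which produced `a`,
+ # `arg_3`, and `kwarg_y`, is expected to have three outbound entries
+ # below.)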
self.assertLen(merge_layer._inbound_nodes, 1) + self.assertLen(merge_layer._outbound_nodes, 0) + self.assertLen(input_layer._outbound_nodes, 3) + self.assertLen(input_layer_2._outbound_nodes, 1) + + self.assertLen(merge_layer._inbound_nodes[0].input_tensors, 2) + self.assertEqual(merge_layer._inbound_nodes[0].input_tensors, [a, b]) + self.assertLen(merge_layer._inbound_nodes[0].inbound_layers, 4) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/partial_batch_padding_handler.py b/keras/engine/partial_batch_padding_handler.py index 998526f6c1c5..a67fa70de6d1 100644 --- a/keras/engine/partial_batch_padding_handler.py +++ b/keras/engine/partial_batch_padding_handler.py @@ -14,92 +14,101 @@ # ============================================================================== """Utility object to handler partial batches for TPUStrategy.""" +import numpy as np import tensorflow.compat.v2 as tf -# pylint: disable=protected-access -import numpy as np from keras import backend class PartialBatchPaddingHandler: - """A container that holds info about partial batches for `predict()`.""" - - def __init__(self, output_shape): - self.padded_batch_size = 0 - self.padding_mask = tf.zeros(0) - self.output_shape = output_shape - - def get_real_batch_size(self, dataset_batch): - """Returns the number of elements in a potentially partial batch.""" - if isinstance(dataset_batch, (tuple, list)): - dataset_batch = dataset_batch[0] - - assert tf.nest.flatten(dataset_batch) - - def _find_any_tensor(batch_features): - tensors = [ - x for x in tf.nest.flatten(batch_features) if tf.is_tensor(x) - ] - if not tensors: - raise ValueError('Cannot find any Tensor in features dict.') - return tensors[0] - - return backend.cast(backend.shape(_find_any_tensor(dataset_batch))[0], - dtype='int64') - - def update_mask(self, padding_mask, dataset_batch): - """Calculate and cache the amount of padding required for a batch.""" - original_batch_size = self.get_real_batch_size(dataset_batch) - missing_count = self.padded_batch_size - original_batch_size - mask = backend.concatenate([tf.ones(original_batch_size), - tf.zeros(missing_count)], axis=0) - return backend.concatenate([padding_mask, mask], axis=0) - - def pad_batch(self, *dataset_batch_elements): - """Pads out the batch dimension of a tensor to the complete batch size.""" - def _pad(batch): - """Helper function to pad nested data within each batch elements.""" - padded_dict_batch = {} - if isinstance(batch, dict): - for key, value in batch.items(): - padded_dict_batch[key] = _pad(value) - return padded_dict_batch - - rank = len(batch.shape) - assert rank > 0 - missing_count = (self.padded_batch_size - - self.get_real_batch_size(batch)) - padding = backend.stack([[0, missing_count]] + [[0, 0]] * (rank - 1)) - return tf.pad(batch, padding, 'constant') - - if len(dataset_batch_elements) == 1: - return _pad(dataset_batch_elements[0]) - - batch_elements = [] - for batch_element in dataset_batch_elements: - batch_elements.append(_pad(batch_element)) - return tuple(batch_elements) - - def apply_mask(self, prediction_result): - """Removes prediction output that corresponds to padded input.""" - padding_mask = backend.get_value(self.padding_mask) - assert len(padding_mask.shape) == 1 - - if len(self.output_shape) == 1: - prediction = np.take(prediction_result, - np.nonzero( - padding_mask[:len(prediction_result)]), - axis=0) - if prediction.shape[0] == 1: - prediction = np.squeeze(prediction, axis=0) - return prediction - - else: - predictions = [] - for i in 
range(len(self.output_shape)): - prediction = prediction_result[i] - prediction = np.take(prediction, np.nonzero( - padding_mask[:len(prediction)]), axis=0) - predictions.append(np.squeeze(prediction)) - - return predictions + """A container that holds info about partial batches for `predict()`.""" + + def __init__(self, output_shape): + self.padded_batch_size = 0 + self.padding_mask = tf.zeros(0) + self.output_shape = output_shape + + def get_real_batch_size(self, dataset_batch): + """Returns the number of elements in a potentially partial batch.""" + if isinstance(dataset_batch, (tuple, list)): + dataset_batch = dataset_batch[0] + + assert tf.nest.flatten(dataset_batch) + + def _find_any_tensor(batch_features): + tensors = [ + x for x in tf.nest.flatten(batch_features) if tf.is_tensor(x) + ] + if not tensors: + raise ValueError("Cannot find any Tensor in features dict.") + return tensors[0] + + return backend.cast( + backend.shape(_find_any_tensor(dataset_batch))[0], dtype="int64" + ) + + def update_mask(self, padding_mask, dataset_batch): + """Calculate and cache the amount of padding required for a batch.""" + original_batch_size = self.get_real_batch_size(dataset_batch) + missing_count = self.padded_batch_size - original_batch_size + mask = backend.concatenate( + [tf.ones(original_batch_size), tf.zeros(missing_count)], axis=0 + ) + return backend.concatenate([padding_mask, mask], axis=0) + + def pad_batch(self, *dataset_batch_elements): + """Pads the batch dimension of a tensor to the complete batch size.""" + + def _pad(batch): + """Helper function to pad nested data within each batch elements.""" + padded_dict_batch = {} + if isinstance(batch, dict): + for key, value in batch.items(): + padded_dict_batch[key] = _pad(value) + return padded_dict_batch + + rank = len(batch.shape) + assert rank > 0 + missing_count = self.padded_batch_size - self.get_real_batch_size( + batch + ) + padding = backend.stack( + [[0, missing_count]] + [[0, 0]] * (rank - 1) + ) + return tf.pad(batch, padding, "constant") + + if len(dataset_batch_elements) == 1: + return _pad(dataset_batch_elements[0]) + + batch_elements = [] + for batch_element in dataset_batch_elements: + batch_elements.append(_pad(batch_element)) + return tuple(batch_elements) + + def apply_mask(self, prediction_result): + """Removes prediction output that corresponds to padded input.""" + padding_mask = backend.get_value(self.padding_mask) + assert len(padding_mask.shape) == 1 + + if len(self.output_shape) == 1: + prediction = np.take( + prediction_result, + np.nonzero(padding_mask[: len(prediction_result)]), + axis=0, + ) + if prediction.shape[0] == 1: + prediction = np.squeeze(prediction, axis=0) + return prediction + + else: + predictions = [] + for i in range(len(self.output_shape)): + prediction = prediction_result[i] + prediction = np.take( + prediction, + np.nonzero(padding_mask[: len(prediction)]), + axis=0, + ) + predictions.append(np.squeeze(prediction)) + + return predictions diff --git a/keras/engine/ragged_keras_tensor_test.py b/keras/engine/ragged_keras_tensor_test.py index c31908b05c47..cad4e02e281b 100644 --- a/keras/engine/ragged_keras_tensor_test.py +++ b/keras/engine/ragged_keras_tensor_test.py @@ -14,365 +14,357 @@ # ============================================================================== """RaggedKerasTensor tests.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations + from keras import layers -from 
keras.testing_infra import test_utils from keras.engine import training +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils @test_utils.run_v2_only class RaggedKerasTensorTest(test_combinations.TestCase): - - @parameterized.parameters( - {'batch_size': None, 'shape': (None, 5), 'ragged_rank': 1}, - {'batch_size': None, 'shape': (None, 3, 5), 'ragged_rank': 1}, - {'batch_size': None, 'shape': (5, None), 'ragged_rank': 2}, - {'batch_size': None, 'shape': (3, 5, None), 'ragged_rank': 3}, - {'batch_size': None, 'shape': (None, 3, 5, None), 'ragged_rank': 4}, - {'batch_size': None, 'shape': (2, 3, None, 4, 5, None), 'ragged_rank': 6}, - {'batch_size': 8, 'shape': (None, 5), 'ragged_rank': 1}, - {'batch_size': 9, 'shape': (None, 3, 5), 'ragged_rank': 1}, - {'batch_size': 1, 'shape': (5, None), 'ragged_rank': 2}, - {'batch_size': 4, 'shape': (3, 5, None), 'ragged_rank': 3}, - {'batch_size': 7, 'shape': (None, 3, 5, None), 'ragged_rank': 4}, - {'batch_size': 12, 'shape': (2, 3, None, 4, 5, None), 'ragged_rank': 6}, - ) - def test_to_placeholder(self, shape, batch_size, ragged_rank): - inp = layers.Input(shape=shape, batch_size=batch_size, ragged=True) - self.assertEqual(inp.ragged_rank, ragged_rank) - self.assertAllEqual(inp.shape, [batch_size] + list(shape)) - with tf.__internal__.FuncGraph('test').as_default(): - placeholder = inp._to_placeholder() - self.assertEqual(placeholder.ragged_rank, ragged_rank) - self.assertAllEqual(placeholder.shape, [batch_size] + list(shape)) - - def test_add(self): - inp = layers.Input(shape=[None], ragged=True) - out = inp + inp - model = training.Model(inp, out) - - x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) - self.assertAllEqual(model(x), x + x) - - def test_mul(self): - inp = layers.Input(shape=[None], ragged=True) - out = inp * inp - model = training.Model(inp, out) - - x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) - self.assertAllEqual(model(x), x * x) - - def test_sub(self): - inp = layers.Input(shape=[None], ragged=True) - out = inp - inp - model = training.Model(inp, out) - - x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) - self.assertAllEqual(model(x), x - x) - - def test_div(self): - inp = layers.Input(shape=[None], ragged=True) - out = inp / inp - model = training.Model(inp, out) - - x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) - self.assertAllEqual(model(x), x / x) - - def test_getitem(self): - # Test slicing / getitem - inp = layers.Input(shape=(None, 2), ragged=True) - out = inp[:, :2] - model = training.Model(inp, out) - - x = tf.RaggedTensor.from_row_lengths( - tf.cast(np.random.randn(6, 2), dtype=tf.float32), [3, 1, 2]) - expected = x[:, :2] - - self.assertAllEqual(model(x), expected) - - # Test that models w/ slicing are correctly serialized/deserialized - config = model.get_config() - model = training.Model.from_config(config) - - self.assertAllEqual(model(x), expected) - - @parameterized.parameters( - {'property_name': 'values'}, - {'property_name': 'flat_values'}, - {'property_name': 'row_splits'}, - {'property_name': 'nested_row_splits'}, - ) - def test_instance_property(self, property_name): - inp = layers.Input(shape=[None], ragged=True) - out = getattr(inp, property_name) - model = training.Model(inp, out) - - x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) - expected_property = getattr(x, property_name) - self.assertAllEqual(model(x), expected_property) - - # Test that it works with serialization and deserialization as well - model_config = model.get_config() - model2 = 
training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected_property) - - @parameterized.parameters( - {'name': 'value_rowids'}, - {'name': 'nested_value_rowids'}, - {'name': 'nrows'}, - {'name': 'row_starts'}, - {'name': 'row_limits'}, - {'name': 'row_lengths'}, - {'name': 'nested_row_lengths'}, - {'name': 'bounding_shape'}, - { - 'name': 'with_values', - 'args': [[1, 2, 3, 4, 5, 6]] - }, - { - 'name': 'with_flat_values', - 'kwargs': { - 'new_values': [1, 2, 3, 4, 5, 6] - } - }, - { - 'name': 'with_row_splits_dtype', - 'kwargs': { - 'dtype': tf.int32 - } - }, - { - 'name': 'merge_dims', - 'args': [0], - 'kwargs': { - 'inner_axis': 1 - } - }, - {'name': 'to_tensor'}, - {'name': 'to_sparse'}, - ) - def test_instance_method(self, name, args=None, kwargs=None): - if not args: - args = [] - if not kwargs: - kwargs = {} - - inp = layers.Input(shape=[None], ragged=True) - out = getattr(inp, name)(*args, **kwargs) - model = training.Model(inp, out) - - x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) - expected_property = getattr(x, name)(*args, **kwargs) - # We expand composites before checking equality because - # assertAllEqual otherwise wouldn't work for SparseTensor outputs - for a, b in zip(tf.nest.flatten(model(x), expand_composites=True), - tf.nest.flatten(expected_property, expand_composites=True)): - self.assertAllEqual(a, b) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - for a, b in zip(tf.nest.flatten(model2(x), expand_composites=True), - tf.nest.flatten(expected_property, expand_composites=True)): - self.assertAllEqual(a, b) + @parameterized.parameters( + {"batch_size": None, "shape": (None, 5), "ragged_rank": 1}, + {"batch_size": None, "shape": (None, 3, 5), "ragged_rank": 1}, + {"batch_size": None, "shape": (5, None), "ragged_rank": 2}, + {"batch_size": None, "shape": (3, 5, None), "ragged_rank": 3}, + {"batch_size": None, "shape": (None, 3, 5, None), "ragged_rank": 4}, + { + "batch_size": None, + "shape": (2, 3, None, 4, 5, None), + "ragged_rank": 6, + }, + {"batch_size": 8, "shape": (None, 5), "ragged_rank": 1}, + {"batch_size": 9, "shape": (None, 3, 5), "ragged_rank": 1}, + {"batch_size": 1, "shape": (5, None), "ragged_rank": 2}, + {"batch_size": 4, "shape": (3, 5, None), "ragged_rank": 3}, + {"batch_size": 7, "shape": (None, 3, 5, None), "ragged_rank": 4}, + {"batch_size": 12, "shape": (2, 3, None, 4, 5, None), "ragged_rank": 6}, + ) + def test_to_placeholder(self, shape, batch_size, ragged_rank): + inp = layers.Input(shape=shape, batch_size=batch_size, ragged=True) + self.assertEqual(inp.ragged_rank, ragged_rank) + self.assertAllEqual(inp.shape, [batch_size] + list(shape)) + with tf.__internal__.FuncGraph("test").as_default(): + placeholder = inp._to_placeholder() + self.assertEqual(placeholder.ragged_rank, ragged_rank) + self.assertAllEqual(placeholder.shape, [batch_size] + list(shape)) + + def test_add(self): + inp = layers.Input(shape=[None], ragged=True) + out = inp + inp + model = training.Model(inp, out) + + x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) + self.assertAllEqual(model(x), x + x) + + def test_mul(self): + inp = layers.Input(shape=[None], ragged=True) + out = inp * inp + model = training.Model(inp, out) + + x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) + self.assertAllEqual(model(x), x * x) + + def test_sub(self): + inp = layers.Input(shape=[None], ragged=True) + out = inp - inp + model = training.Model(inp, out) + + x = 
tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) + self.assertAllEqual(model(x), x - x) + + def test_div(self): + inp = layers.Input(shape=[None], ragged=True) + out = inp / inp + model = training.Model(inp, out) + + x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) + self.assertAllEqual(model(x), x / x) + + def test_getitem(self): + # Test slicing / getitem + inp = layers.Input(shape=(None, 2), ragged=True) + out = inp[:, :2] + model = training.Model(inp, out) + + x = tf.RaggedTensor.from_row_lengths( + tf.cast(np.random.randn(6, 2), dtype=tf.float32), [3, 1, 2] + ) + expected = x[:, :2] + + self.assertAllEqual(model(x), expected) + + # Test that models w/ slicing are correctly serialized/deserialized + config = model.get_config() + model = training.Model.from_config(config) + + self.assertAllEqual(model(x), expected) + + @parameterized.parameters( + {"property_name": "values"}, + {"property_name": "flat_values"}, + {"property_name": "row_splits"}, + {"property_name": "nested_row_splits"}, + ) + def test_instance_property(self, property_name): + inp = layers.Input(shape=[None], ragged=True) + out = getattr(inp, property_name) + model = training.Model(inp, out) + + x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) + expected_property = getattr(x, property_name) + self.assertAllEqual(model(x), expected_property) + + # Test that it works with serialization and deserialization as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected_property) + + @parameterized.parameters( + {"name": "value_rowids"}, + {"name": "nested_value_rowids"}, + {"name": "nrows"}, + {"name": "row_starts"}, + {"name": "row_limits"}, + {"name": "row_lengths"}, + {"name": "nested_row_lengths"}, + {"name": "bounding_shape"}, + {"name": "with_values", "args": [[1, 2, 3, 4, 5, 6]]}, + { + "name": "with_flat_values", + "kwargs": {"new_values": [1, 2, 3, 4, 5, 6]}, + }, + {"name": "with_row_splits_dtype", "kwargs": {"dtype": tf.int32}}, + {"name": "merge_dims", "args": [0], "kwargs": {"inner_axis": 1}}, + {"name": "to_tensor"}, + {"name": "to_sparse"}, + ) + def test_instance_method(self, name, args=None, kwargs=None): + if not args: + args = [] + if not kwargs: + kwargs = {} + + inp = layers.Input(shape=[None], ragged=True) + out = getattr(inp, name)(*args, **kwargs) + model = training.Model(inp, out) + + x = tf.ragged.constant([[3, 4], [1, 2], [3, 5]]) + expected_property = getattr(x, name)(*args, **kwargs) + # We expand composites before checking equality because + # assertAllEqual otherwise wouldn't work for SparseTensor outputs + for a, b in zip( + tf.nest.flatten(model(x), expand_composites=True), + tf.nest.flatten(expected_property, expand_composites=True), + ): + self.assertAllEqual(a, b) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + for a, b in zip( + tf.nest.flatten(model2(x), expand_composites=True), + tf.nest.flatten(expected_property, expand_composites=True), + ): + self.assertAllEqual(a, b) @test_utils.run_v2_only class RaggedTensorClassMethodAsLayerTest(test_combinations.TestCase): - - def test_from_value_rowids(self): - inp = layers.Input(shape=[None]) - out = tf.RaggedTensor.from_value_rowids( - inp, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5) - model = training.Model(inp, out) - - x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) - expected = tf.RaggedTensor.from_value_rowids( - x, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5) - 
self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_row_splits(self): - inp = layers.Input(shape=[None]) - out = tf.RaggedTensor.from_row_splits( - inp, row_splits=[0, 4, 4, 7, 8, 8]) - model = training.Model(inp, out) - - x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) - expected = tf.RaggedTensor.from_row_splits( - x, row_splits=[0, 4, 4, 7, 8, 8]) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_row_lengths(self): - inp = layers.Input(shape=[None]) - out = tf.RaggedTensor.from_row_lengths( - inp, row_lengths=[4, 0, 3, 1, 0]) - model = training.Model(inp, out) - - x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) - expected = tf.RaggedTensor.from_row_lengths( - x, row_lengths=[4, 0, 3, 1, 0]) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_row_starts(self): - inp = layers.Input(shape=[None]) - out = tf.RaggedTensor.from_row_starts( - inp, row_starts=[0, 4, 4, 7, 8]) - model = training.Model(inp, out) - - x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) - expected = tf.RaggedTensor.from_row_starts( - x, row_starts=[0, 4, 4, 7, 8]) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_row_limits(self): - row_limits = tf.constant([2, 2, 5, 6, 7], tf.int64) - - inp = layers.Input(shape=[None], dtype=tf.string) - out = tf.RaggedTensor.from_row_limits( - inp, row_limits, validate=False) - model = training.Model(inp, out) - - x = tf.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g']) - expected = tf.RaggedTensor.from_row_limits( - x, row_limits, validate=False) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_uniform_row_length(self): - inp = layers.Input(shape=[None]) - out = tf.RaggedTensor.from_uniform_row_length(inp, 2, 8) - model = training.Model(inp, out) - - x = tf.constant( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]) - expected = tf.RaggedTensor.from_uniform_row_length(x, 2, 8) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_nested_value_row_ids(self): - nested_value_rowids = [ - tf.constant([0, 0, 1, 3, 3], tf.int64), - tf.constant([0, 0, 2, 2, 2, 3, 4], tf.int64) - ] - inp = layers.Input(shape=[None], dtype=tf.string) - out = tf.RaggedTensor.from_nested_value_rowids( - inp, nested_value_rowids) - model = training.Model(inp, out) - - x = tf.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g']) - expected = tf.RaggedTensor.from_nested_value_rowids( - x, nested_value_rowids) - 
self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_nested_row_splits(self): - nested_row_splits = [ - tf.constant([0, 2, 3, 3, 5], tf.int64), - tf.constant([0, 2, 2, 5, 6, 7], tf.int64) - ] - inp = layers.Input(shape=[None], dtype=tf.string) - out = tf.RaggedTensor.from_nested_row_splits( - inp, nested_row_splits) - model = training.Model(inp, out) - - x = tf.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g']) - expected = tf.RaggedTensor.from_nested_row_splits( - x, nested_row_splits) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_nested_row_lengths(self): - nested_row_lengths = [ - tf.constant([2, 1, 0, 2], tf.int64), - tf.constant([2, 0, 3, 1, 1], tf.int64) - ] - inp = layers.Input(shape=[None], dtype=tf.string) - out = tf.RaggedTensor.from_nested_row_lengths( - inp, nested_row_lengths) - model = training.Model(inp, out) - - x = tf.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g']) - expected = tf.RaggedTensor.from_nested_row_lengths( - x, nested_row_lengths) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_tensor(self): - inp = layers.Input(shape=[None], ragged=False) - out = tf.RaggedTensor.from_tensor(inp) - model = training.Model(inp, out) - - x = tf.constant([[3., 4.], [1., 2.], [3., 5.]]) - expected = tf.RaggedTensor.from_tensor(x) - self.assertAllEqual(model(x), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(x), expected) - - def test_from_sparse(self): - inp = layers.Input(shape=[None], sparse=True, dtype=tf.string) - out = tf.RaggedTensor.from_sparse(inp) - model = training.Model(inp, out) - - indices = [[0, 0], [1, 0], [1, 1], [2, 0]] - values = [b'a', b'b', b'c', b'd'] - shape = [4, 5] - sp_value = tf.SparseTensor(indices, values, shape) - - expected = tf.RaggedTensor.from_sparse(sp_value) - self.assertAllEqual(model(sp_value), expected) - - # Test that the model can serialize and deserialize as well - model_config = model.get_config() - model2 = training.Model.from_config(model_config) - self.assertAllEqual(model2(sp_value), expected) - - -if __name__ == '__main__': - tf.test.main() + def test_from_value_rowids(self): + inp = layers.Input(shape=[None]) + out = tf.RaggedTensor.from_value_rowids( + inp, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5 + ) + model = training.Model(inp, out) + + x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) + expected = tf.RaggedTensor.from_value_rowids( + x, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5 + ) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_row_splits(self): + inp = layers.Input(shape=[None]) + out = tf.RaggedTensor.from_row_splits( + inp, row_splits=[0, 4, 4, 7, 8, 8] + ) + model = 
training.Model(inp, out) + + x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) + expected = tf.RaggedTensor.from_row_splits( + x, row_splits=[0, 4, 4, 7, 8, 8] + ) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_row_lengths(self): + inp = layers.Input(shape=[None]) + out = tf.RaggedTensor.from_row_lengths(inp, row_lengths=[4, 0, 3, 1, 0]) + model = training.Model(inp, out) + + x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) + expected = tf.RaggedTensor.from_row_lengths( + x, row_lengths=[4, 0, 3, 1, 0] + ) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_row_starts(self): + inp = layers.Input(shape=[None]) + out = tf.RaggedTensor.from_row_starts(inp, row_starts=[0, 4, 4, 7, 8]) + model = training.Model(inp, out) + + x = tf.constant([3, 1, 4, 1, 5, 9, 2, 6]) + expected = tf.RaggedTensor.from_row_starts( + x, row_starts=[0, 4, 4, 7, 8] + ) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_row_limits(self): + row_limits = tf.constant([2, 2, 5, 6, 7], tf.int64) + + inp = layers.Input(shape=[None], dtype=tf.string) + out = tf.RaggedTensor.from_row_limits(inp, row_limits, validate=False) + model = training.Model(inp, out) + + x = tf.constant(["a", "b", "c", "d", "e", "f", "g"]) + expected = tf.RaggedTensor.from_row_limits( + x, row_limits, validate=False + ) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_uniform_row_length(self): + inp = layers.Input(shape=[None]) + out = tf.RaggedTensor.from_uniform_row_length(inp, 2, 8) + model = training.Model(inp, out) + + x = tf.constant([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]) + expected = tf.RaggedTensor.from_uniform_row_length(x, 2, 8) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_nested_value_row_ids(self): + nested_value_rowids = [ + tf.constant([0, 0, 1, 3, 3], tf.int64), + tf.constant([0, 0, 2, 2, 2, 3, 4], tf.int64), + ] + inp = layers.Input(shape=[None], dtype=tf.string) + out = tf.RaggedTensor.from_nested_value_rowids(inp, nested_value_rowids) + model = training.Model(inp, out) + + x = tf.constant(["a", "b", "c", "d", "e", "f", "g"]) + expected = tf.RaggedTensor.from_nested_value_rowids( + x, nested_value_rowids + ) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_nested_row_splits(self): + nested_row_splits = [ + tf.constant([0, 2, 3, 3, 5], tf.int64), + tf.constant([0, 2, 2, 5, 6, 7], tf.int64), + ] + inp = 
layers.Input(shape=[None], dtype=tf.string) + out = tf.RaggedTensor.from_nested_row_splits(inp, nested_row_splits) + model = training.Model(inp, out) + + x = tf.constant(["a", "b", "c", "d", "e", "f", "g"]) + expected = tf.RaggedTensor.from_nested_row_splits(x, nested_row_splits) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_nested_row_lengths(self): + nested_row_lengths = [ + tf.constant([2, 1, 0, 2], tf.int64), + tf.constant([2, 0, 3, 1, 1], tf.int64), + ] + inp = layers.Input(shape=[None], dtype=tf.string) + out = tf.RaggedTensor.from_nested_row_lengths(inp, nested_row_lengths) + model = training.Model(inp, out) + + x = tf.constant(["a", "b", "c", "d", "e", "f", "g"]) + expected = tf.RaggedTensor.from_nested_row_lengths( + x, nested_row_lengths + ) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_tensor(self): + inp = layers.Input(shape=[None], ragged=False) + out = tf.RaggedTensor.from_tensor(inp) + model = training.Model(inp, out) + + x = tf.constant([[3.0, 4.0], [1.0, 2.0], [3.0, 5.0]]) + expected = tf.RaggedTensor.from_tensor(x) + self.assertAllEqual(model(x), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(x), expected) + + def test_from_sparse(self): + inp = layers.Input(shape=[None], sparse=True, dtype=tf.string) + out = tf.RaggedTensor.from_sparse(inp) + model = training.Model(inp, out) + + indices = [[0, 0], [1, 0], [1, 1], [2, 0]] + values = [b"a", b"b", b"c", b"d"] + shape = [4, 5] + sp_value = tf.SparseTensor(indices, values, shape) + + expected = tf.RaggedTensor.from_sparse(sp_value) + self.assertAllEqual(model(sp_value), expected) + + # Test that the model can serialize and deserialize as well + model_config = model.get_config() + model2 = training.Model.from_config(model_config) + self.assertAllEqual(model2(sp_value), expected) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/saving.py b/keras/engine/saving.py index fdddf130cee5..f72fe1c22165 100644 --- a/keras/engine/saving.py +++ b/keras/engine/saving.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access + """Model saving utilities. Everything has been moved to keras/saving/. This file will be deleted soon. """ -from keras.saving import * # pylint: disable=wildcard-import +from keras.saving import * # noqa: F401,F403 diff --git a/keras/engine/sequential.py b/keras/engine/sequential.py index 6fc7208efb96..137926b97c84 100644 --- a/keras/engine/sequential.py +++ b/keras/engine/sequential.py @@ -12,503 +12,541 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -# pylint: disable=protected-access + """Home of the `Sequential` model.""" +import copy + import tensorflow.compat.v2 as tf -import copy from keras import layers as layer_module from keras.engine import base_layer from keras.engine import functional from keras.engine import input_layer +from keras.engine import training from keras.engine import training_utils -from keras.saving.saved_model import model_serialization +from keras.saving import serialization_lib +from keras.saving.legacy import serialization as legacy_serialization +from keras.saving.legacy.saved_model import model_serialization from keras.utils import generic_utils from keras.utils import layer_utils from keras.utils import tf_inspect from keras.utils import tf_utils from keras.utils import traceback_utils -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export -SINGLE_LAYER_OUTPUT_ERROR_MSG = ('All layers in a Sequential model should have ' - 'a single output tensor. For multi-output ' - 'layers, use the functional API.') +SINGLE_LAYER_OUTPUT_ERROR_MSG = ( + "All layers in a Sequential model should have " + "a single output tensor. For multi-output " + "layers, use the functional API." +) -@keras_export('keras.Sequential', 'keras.models.Sequential') +@keras_export("keras.Sequential", "keras.models.Sequential") class Sequential(functional.Functional): - """`Sequential` groups a linear stack of layers into a `tf.keras.Model`. - - `Sequential` provides training and inference features on this model. - - Examples: - - ```python - # Optionally, the first layer can receive an `input_shape` argument: - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(8, input_shape=(16,))) - # Afterwards, we do automatic shape inference: - model.add(tf.keras.layers.Dense(4)) - - # This is identical to the following: - model = tf.keras.Sequential() - model.add(tf.keras.Input(shape=(16,))) - model.add(tf.keras.layers.Dense(8)) - - # Note that you can also omit the `input_shape` argument. - # In that case the model doesn't have any weights until the first call - # to a training/evaluation method (since it isn't yet built): - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(8)) - model.add(tf.keras.layers.Dense(4)) - # model.weights not created yet - - # Whereas if you specify the input shape, the model gets built - # continuously as you are adding layers: - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(8, input_shape=(16,))) - model.add(tf.keras.layers.Dense(4)) - len(model.weights) - # Returns "4" - - # When using the delayed-build pattern (no input shape specified), you can - # choose to manually build your model by calling - # `build(batch_input_shape)`: - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(8)) - model.add(tf.keras.layers.Dense(4)) - model.build((None, 16)) - len(model.weights) - # Returns "4" - - # Note that when using the delayed-build pattern (no input shape specified), - # the model gets built the first time you call `fit`, `eval`, or `predict`, - # or the first time you call the model on some input data. 
- model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(8)) - model.add(tf.keras.layers.Dense(1)) - model.compile(optimizer='sgd', loss='mse') - # This builds the model for the first time: - model.fit(x, y, batch_size=32, epochs=10) - ``` - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - @traceback_utils.filter_traceback - def __init__(self, layers=None, name=None): - """Creates a `Sequential` model instance. - - Args: - layers: Optional list of layers to add to the model. - name: Optional name for the model. - """ - # Skip the init in FunctionalModel since model doesn't have input/output yet - super(functional.Functional, self).__init__( # pylint: disable=bad-super-call - name=name, autocast=False) - base_layer.keras_api_gauge.get_cell('Sequential').set(True) - self.supports_masking = True - self._compute_output_and_mask_jointly = True - self._auto_track_sub_layers = False - self._inferred_input_shape = None - self._has_explicit_input_shape = False - self._input_dtype = None - self._layer_call_argspecs = {} - self._created_nodes = set() - # Flag that indicate whether the sequential network topology has been - # created. It is false when there isn't any layer, or the layers don't - # have an input shape. - self._graph_initialized = False - - # Unfortunately some Sequential models using custom layers or FeatureColumn - # layers have multiple inputs. This is fundamentally incompatible with - # most of the Sequential API, and we have to disable a number of features - # for such models. - self._use_legacy_deferred_behavior = False - - # Add to the model any layers passed to the constructor. - if layers: - if not isinstance(layers, (list, tuple)): - layers = [layers] - for layer in layers: - self.add(layer) - - @property - def layers(self): - # Historically, `sequential.layers` only returns layers that were added - # via `add`, and omits the auto-generated `InputLayer` that comes at the - # bottom of the stack. - # `Trackable` manages the `_layers` attributes and does filtering - # over it. - layers = super().layers - if layers and isinstance(layers[0], input_layer.InputLayer): - return layers[1:] - return layers[:] - - @tf.__internal__.tracking.no_automatic_dependency_tracking - @traceback_utils.filter_traceback - def add(self, layer): - """Adds a layer instance on top of the layer stack. - - Args: - layer: layer instance. - - Raises: - TypeError: If `layer` is not a layer instance. - ValueError: In case the `layer` argument does not - know its input shape. - ValueError: In case the `layer` argument has - multiple output tensors, or is already connected - somewhere else (forbidden in `Sequential` models). + """`Sequential` groups a linear stack of layers into a `tf.keras.Model`. + + `Sequential` provides training and inference features on this model. + + Examples: + + ```python + model = tf.keras.Sequential() + model.add(tf.keras.Input(shape=(16,))) + model.add(tf.keras.layers.Dense(8)) + + # Note that you can also omit the initial `Input`. 
+ # In that case the model doesn't have any weights until the first call + # to a training/evaluation method (since it isn't yet built): + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(8)) + model.add(tf.keras.layers.Dense(4)) + # model.weights not created yet + + # Whereas if you specify an `Input`, the model gets built + # continuously as you are adding layers: + model = tf.keras.Sequential() + model.add(tf.keras.Input(shape=(16,))) + model.add(tf.keras.layers.Dense(4)) + len(model.weights) + # Returns "2" + + # When using the delayed-build pattern (no input shape specified), you can + # choose to manually build your model by calling + # `build(batch_input_shape)`: + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(8)) + model.add(tf.keras.layers.Dense(4)) + model.build((None, 16)) + len(model.weights) + # Returns "4" + + # Note that when using the delayed-build pattern (no input shape specified), + # the model gets built the first time you call `fit`, `eval`, or `predict`, + # or the first time you call the model on some input data. + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(8)) + model.add(tf.keras.layers.Dense(1)) + model.compile(optimizer='sgd', loss='mse') + # This builds the model for the first time: + model.fit(x, y, batch_size=32, epochs=10) + ``` """ - # If we are passed a Keras tensor created by keras.Input(), we can extract - # the input layer from its keras history and use that without any loss of - # generality. - if hasattr(layer, '_keras_history'): - origin_layer = layer._keras_history[0] - if isinstance(origin_layer, input_layer.InputLayer): - layer = origin_layer - - if isinstance(layer, tf.Module): - if not isinstance(layer, base_layer.Layer): - layer = functional.ModuleWrapper(layer) - else: - raise TypeError('The added layer must be an instance of class Layer. ' - f'Received: layer={layer} of type {type(layer)}.') - - tf_utils.assert_no_legacy_layers([layer]) - if not self._is_layer_name_unique(layer): - raise ValueError( - 'All layers added to a Sequential model ' - f'should have unique names. Name "{layer.name}" is already the name ' - 'of a layer in this model. Update the `name` argument ' - 'to pass a unique name.') - - self.built = False - set_inputs = False - self._maybe_create_attribute('_self_tracked_trackables', []) - if not self._self_tracked_trackables: - if isinstance(layer, input_layer.InputLayer): - # Case where the user passes an Input or InputLayer layer via `add`. - set_inputs = True - else: - batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer) - if batch_shape: - # Instantiate an input layer. - x = input_layer.Input( - batch_shape=batch_shape, dtype=dtype, name=layer.name + '_input') - # This will build the current layer - # and create the node connecting the current layer - # to the input layer we just created. - layer(x) - set_inputs = True - - if set_inputs: - outputs = tf.nest.flatten(layer._inbound_nodes[-1].outputs) - if len(outputs) != 1: - raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG) - self.outputs = outputs - self.inputs = layer_utils.get_source_inputs(self.outputs[0]) - self.built = True - self._has_explicit_input_shape = True - - elif self.outputs: - # If the model is being built continuously on top of an input layer: - # refresh its output. 
- output_tensor = layer(self.outputs[0]) - if len(tf.nest.flatten(output_tensor)) != 1: - raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG) - self.outputs = [output_tensor] - self.built = True - - if set_inputs or self._graph_initialized: - self._init_graph_network(self.inputs, self.outputs) - self._graph_initialized = True - else: - self._self_tracked_trackables.append(layer) - self._handle_deferred_layer_dependencies([layer]) - - self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - @traceback_utils.filter_traceback - def pop(self): - """Removes the last layer in the model. - - Raises: - TypeError: if there are no layers in the model. - """ - if not self.layers: - raise TypeError('There are no layers in the model.') - - layer = self._self_tracked_trackables.pop() - self._layer_call_argspecs.pop(layer) - if not self.layers: - self.outputs = None - self.inputs = None - self.built = False - self._inferred_input_shape = None - self._has_explicit_input_shape = False - self._graph_initialized = False - elif self._graph_initialized: - self.layers[-1]._outbound_nodes = [] - self.outputs = [self.layers[-1].output] - self._init_graph_network(self.inputs, self.outputs) - self.built = True - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _build_graph_network_for_inferred_shape(self, - input_shape, - input_dtype=None): - if input_shape is None or not self.layers: - return - if not tf.__internal__.tf2.enabled() or not tf.compat.v1.executing_eagerly_outside_functions(): - # This behavior is disabled in V1 or when eager execution is disabled. - return - if (not self._has_explicit_input_shape and - not self._use_legacy_deferred_behavior): - # Determine whether the input shape is novel, i.e. whether the model - # should be rebuilt. - input_shape = tuple(input_shape) - if self._inferred_input_shape is None: - new_shape = input_shape - else: - new_shape = relax_input_shape(self._inferred_input_shape, input_shape) - if (new_shape is not None and new_shape != self._inferred_input_shape): - # A novel shape has been received: we need to rebuild the model. - # In case we are inside a graph function, we step out of it. - with tf.init_scope(): - inputs = input_layer.Input( - batch_shape=new_shape, - dtype=input_dtype, - name=self.layers[0].name + '_input') - layer_input = inputs - created_nodes = set() - for layer in self.layers: - # Clear nodes previously created via this method. This prevents - # node accumulation and ensures that e.g. `layer.output` is - # always connected to `model.inputs` - # (this is important e.g. for the feature extraction use case). - # We don't just do `layer._inbound_nodes = []` in order - # not to break shared layers added to Sequential models (which is - # technically illegal as per the `add()` docstring, - # but wasn't previously disabled). - clear_previously_created_nodes(layer, self._created_nodes) - try: - # Create Functional API connection by calling the current layer - layer_output = layer(layer_input) - except: # pylint:disable=bare-except - # Functional API calls may fail for a number of reasons: - # 1) The layer may be buggy. In this case it will be easier for - # the user to debug if we fail on the first call on concrete data, - # instead of our own call on a symbolic input. - # 2) The layer is dynamic (graph-incompatible) and hasn't - # overridden `compute_output_shape`. In this case, it is - # impossible to build a graph network. 
- # 3) The layer is otherwise incompatible with the Functional API - # (e.g. this is the case for some probabilistic layers that rely - # on hacks and that do not return tensors). - # In all these cases, we should avoid creating a graph network - # (or we simply can't). - self._use_legacy_deferred_behavior = True - return - if len(tf.nest.flatten(layer_output)) != 1: - raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG) - # Keep track of nodes just created above - track_nodes_created_by_last_call(layer, created_nodes) - layer_input = layer_output - outputs = layer_output - self._created_nodes = created_nodes - try: - # Initialize a graph Network. This call will never fail for - # a stack of valid Keras layers. - # However some users have layers that are fundamentally incompatible - # with the Functional API, which do not return tensors. In this - # case, we fall back to the legacy deferred behavior. - # TODO(fchollet): consider raising here, as we should not be - # supporting such layers. - self._init_graph_network(inputs, outputs) - self._graph_initialized = True - except: # pylint:disable=bare-except - self._use_legacy_deferred_behavior = True - self._inferred_input_shape = new_shape - - @generic_utils.default - def build(self, input_shape=None): - if self._graph_initialized: - self._init_graph_network(self.inputs, self.outputs) - else: - if input_shape is None: - raise ValueError('You must provide an `input_shape` argument.') - self._build_graph_network_for_inferred_shape(input_shape) - if not self.built: - input_shape = tuple(input_shape) - self._build_input_shape = input_shape - super().build(input_shape) - self.built = True - - def call(self, inputs, training=None, mask=None): # pylint: disable=redefined-outer-name - # If applicable, update the static input shape of the model. - if not self._has_explicit_input_shape: - if not tf.is_tensor(inputs) and not isinstance( - inputs, tf.Tensor): - # This is a Sequential with multiple inputs. This is technically an - # invalid use case of Sequential, but we tolerate it for backwards - # compatibility. - self._use_legacy_deferred_behavior = True - self._build_input_shape = tf.nest.map_structure( - _get_shape_tuple, inputs) - if tf.__internal__.tf2.enabled(): - logging.warning('Layers in a Sequential model should only have a ' - f'single input tensor. Received: inputs={inputs}. ' - 'Consider rewriting this model with the Functional ' - 'API.') - else: - self._build_graph_network_for_inferred_shape(inputs.shape, inputs.dtype) - - if self._graph_initialized: - if not self.built: - self._init_graph_network(self.inputs, self.outputs) - return super().call(inputs, training=training, mask=mask) - - outputs = inputs # handle the corner case where self.layers is empty - for layer in self.layers: - # During each iteration, `inputs` are the inputs to `layer`, and `outputs` - # are the outputs of `layer` applied to `inputs`. At the end of each - # iteration `inputs` is set to `outputs` to prepare for the next layer. - kwargs = {} - argspec = self._layer_call_argspecs[layer].args - if 'mask' in argspec: - kwargs['mask'] = mask - if 'training' in argspec: - kwargs['training'] = training - - outputs = layer(inputs, **kwargs) - - if len(tf.nest.flatten(outputs)) != 1: - raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG) - # `outputs` will be the inputs to the next layer. 
- inputs = outputs - mask = getattr(outputs, '_keras_mask', None) - return outputs - - def compute_output_shape(self, input_shape): - shape = input_shape - for layer in self.layers: - shape = layer.compute_output_shape(shape) - return shape - - def compute_mask(self, inputs, mask): - # TODO(omalleyt): b/123540974 This function is not really safe to call - # by itself because it will duplicate any updates and losses in graph - # mode by `call`ing the Layers again. - outputs = self.call(inputs, mask=mask) # pylint: disable=unexpected-keyword-arg - return getattr(outputs, '_keras_mask', None) - - def get_config(self): - layer_configs = [] - for layer in super().layers: - # `super().layers` include the InputLayer if available (it is filtered out - # of `self.layers`). Note that `self._self_tracked_trackables` is managed - # by the tracking infrastructure and should not be used. - layer_configs.append(generic_utils.serialize_keras_object(layer)) - config = { - 'name': self.name, - 'layers': copy.deepcopy(layer_configs) - } - if not self._is_graph_network and self._build_input_shape is not None: - config['build_input_shape'] = self._build_input_shape - return config - - @classmethod - def from_config(cls, config, custom_objects=None): - if 'name' in config: - name = config['name'] - build_input_shape = config.get('build_input_shape') - layer_configs = config['layers'] - else: - name = None - build_input_shape = None - layer_configs = config - model = cls(name=name) - for layer_config in layer_configs: - layer = layer_module.deserialize(layer_config, - custom_objects=custom_objects) - model.add(layer) - if (not model.inputs and build_input_shape and - isinstance(build_input_shape, (tuple, list))): - model.build(build_input_shape) - return model - - @property - def input_spec(self): - if hasattr(self, '_manual_input_spec'): - return self._manual_input_spec - if self._has_explicit_input_shape: - return super().input_spec - return None - - @input_spec.setter - def input_spec(self, value): - self._manual_input_spec = value - - @property - def _trackable_saved_model_saver(self): - return model_serialization.SequentialSavedModelSaver(self) - def _is_layer_name_unique(self, layer): - for ref_layer in self.layers: - if layer.name == ref_layer.name and ref_layer is not layer: - return False - return True + @tf.__internal__.tracking.no_automatic_dependency_tracking + @traceback_utils.filter_traceback + def __init__(self, layers=None, name=None): + """Creates a `Sequential` model instance. + + Args: + layers: Optional list of layers to add to the model. + name: Optional name for the model. + """ + # Skip the init in FunctionalModel since the model doesn't have + # input/output yet + super(functional.Functional, self).__init__(name=name, autocast=False) + base_layer.keras_api_gauge.get_cell("Sequential").set(True) + self.supports_masking = True + self._compute_output_and_mask_jointly = True + self._auto_track_sub_layers = False + self._inferred_input_shape = None + self._has_explicit_input_shape = False + self._input_dtype = None + self._layer_call_argspecs = {} + self._created_nodes = set() + # Flag that indicates whether the sequential network topology has been + # created. It is false when there isn't any layer, or the layers don't + # have an input shape. + self._graph_initialized = False + + # Unfortunately some Sequential models using custom layers or + # FeatureColumn layers have multiple inputs.
This is fundamentally + # incompatible with most of the Sequential API, and we have to disable a + # number of features for such models. + self._use_legacy_deferred_behavior = False + + # Add to the model any layers passed to the constructor. + if layers: + if not isinstance(layers, (list, tuple)): + layers = [layers] + for layer in layers: + self.add(layer) + + @property + def layers(self): + # Historically, `sequential.layers` only returns layers that were added + # via `add`, and omits the auto-generated `InputLayer` that comes at the + # bottom of the stack. + # `Trackable` manages the `_layers` attributes and does filtering + # over it. + layers = super().layers + if layers and isinstance(layers[0], input_layer.InputLayer): + return layers[1:] + return layers[:] + + @tf.__internal__.tracking.no_automatic_dependency_tracking + @traceback_utils.filter_traceback + def add(self, layer): + """Adds a layer instance on top of the layer stack. + + Args: + layer: layer instance. + + Raises: + TypeError: If `layer` is not a layer instance. + ValueError: In case the `layer` argument does not + know its input shape. + ValueError: In case the `layer` argument has + multiple output tensors, or is already connected + somewhere else (forbidden in `Sequential` models). + """ + # If we are passed a Keras tensor created by keras.Input(), we can + # extract the input layer from its keras history and use that without + # any loss of generality. + if hasattr(layer, "_keras_history"): + origin_layer = layer._keras_history[0] + if isinstance(origin_layer, input_layer.InputLayer): + layer = origin_layer + + if isinstance(layer, tf.Module): + if not isinstance(layer, base_layer.Layer): + layer = functional.ModuleWrapper(layer) + else: + raise TypeError( + "The added layer must be an instance of class Layer. " + f"Received: layer={layer} of type {type(layer)}." + ) + + tf_utils.assert_no_legacy_layers([layer]) + if not self._is_layer_name_unique(layer): + raise ValueError( + "All layers added to a Sequential model " + f'should have unique names. Name "{layer.name}" is already ' + "the name of a layer in this model. Update the `name` argument " + "to pass a unique name." + ) + + self.built = False + set_inputs = False + self._maybe_create_attribute("_self_tracked_trackables", []) + if not self._self_tracked_trackables: + if isinstance(layer, input_layer.InputLayer): + # Case where the user passes an Input or InputLayer layer via + # `add`. + set_inputs = True + else: + batch_shape, dtype = training_utils.get_input_shape_and_dtype( + layer + ) + if batch_shape: + # Instantiate an input layer. + x = input_layer.Input( + batch_shape=batch_shape, + dtype=dtype, + name=layer.name + "_input", + ) + # This will build the current layer + # and create the node connecting the current layer + # to the input layer we just created. + layer(x) + set_inputs = True + + if set_inputs: + outputs = tf.nest.flatten(layer._inbound_nodes[-1].outputs) + if len(outputs) != 1: + raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG) + self.outputs = outputs + self.inputs = layer_utils.get_source_inputs(self.outputs[0]) + self.built = True + self._has_explicit_input_shape = True + + elif self.outputs: + # If the model is being built continuously on top of an input layer: + # refresh its output.
+ output_tensor = layer(self.outputs[0]) + if len(tf.nest.flatten(output_tensor)) != 1: + raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG) + self.outputs = [output_tensor] + self.built = True + + if set_inputs or self._graph_initialized: + self._init_graph_network(self.inputs, self.outputs) + self._graph_initialized = True + else: + self._self_tracked_trackables.append(layer) + self._handle_deferred_layer_dependencies([layer]) + + self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + @traceback_utils.filter_traceback + def pop(self): + """Removes the last layer in the model. + + Raises: + TypeError: if there are no layers in the model. + """ + if not self.layers: + raise TypeError("There are no layers in the model.") + + layer = self._self_tracked_trackables.pop() + self._layer_call_argspecs.pop(layer) + if not self.layers: + self.outputs = None + self.inputs = None + self.built = False + self._inferred_input_shape = None + self._has_explicit_input_shape = False + self._graph_initialized = False + elif self._graph_initialized: + self.layers[-1]._outbound_nodes = [] + self.outputs = [self.layers[-1].output] + self._init_graph_network(self.inputs, self.outputs) + self.built = True + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _build_graph_network_for_inferred_shape( + self, input_shape, input_dtype=None + ): + if input_shape is None or not self.layers: + return + if ( + not tf.__internal__.tf2.enabled() + or not tf.compat.v1.executing_eagerly_outside_functions() + ): + # This behavior is disabled in V1 or when eager execution is + # disabled. + return + if ( + not self._has_explicit_input_shape + and not self._use_legacy_deferred_behavior + ): + # Determine whether the input shape is novel, i.e. whether the model + # should be rebuilt. + input_shape = tuple(input_shape) + if self._inferred_input_shape is None: + new_shape = input_shape + else: + new_shape = relax_input_shape( + self._inferred_input_shape, input_shape + ) + if ( + new_shape is not None + and new_shape != self._inferred_input_shape + ): + # A novel shape has been received: we need to rebuild the model. + # In case we are inside a graph function, we step out of it. + with tf.init_scope(): + inputs = input_layer.Input( + batch_shape=new_shape, + dtype=input_dtype, + name=self.layers[0].name + "_input", + ) + layer_input = inputs + created_nodes = set() + for layer in self.layers: + # Clear nodes previously created via this method. This + # prevents node accumulation and ensures that e.g. + # `layer.output` is always connected to `model.inputs` + # (this is important e.g. for the feature extraction use + # case). We don't just do `layer._inbound_nodes = []` + # in order not to break shared layers added to + # Sequential models (which is technically illegal as per + # the `add()` docstring, but wasn't previously + # disabled). + clear_previously_created_nodes( + layer, self._created_nodes + ) + try: + # Create Functional API connection by calling the + # current layer + layer_output = layer(layer_input) + except: # noqa: E722 + # Functional API calls may fail for a number of + # reasons: 1) The layer may be buggy. In this case + # it will be easier for the user to debug if we fail + # on the first call on concrete data, instead of our + # own call on a symbolic input. 2) The layer is + # dynamic (graph-incompatible) and hasn't overridden + # `compute_output_shape`. In this case, it is + # impossible to build a graph network. 
3) The layer + # is otherwise incompatible with the Functional API + # (e.g. this is the case for some probabilistic + # layers that rely on hacks and that do not return + # tensors). In all these cases, we should avoid + # creating a graph network (or we simply can't). + self._use_legacy_deferred_behavior = True + return + if len(tf.nest.flatten(layer_output)) != 1: + raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG) + # Keep track of nodes just created above + track_nodes_created_by_last_call(layer, created_nodes) + layer_input = layer_output + outputs = layer_output + self._created_nodes = created_nodes + try: + # Initialize a graph Network. This call will never fail + # for a stack of valid Keras layers. However some users + # have layers that are fundamentally incompatible with + # the Functional API, which do not return tensors. In + # this case, we fall back to the legacy deferred + # behavior. + # TODO(fchollet): consider raising here, as we should + # not be supporting such layers. + self._init_graph_network(inputs, outputs) + self._graph_initialized = True + except: # noqa: E722 + self._use_legacy_deferred_behavior = True + self._inferred_input_shape = new_shape + + @generic_utils.default + def build(self, input_shape=None): + if self._graph_initialized: + self._init_graph_network(self.inputs, self.outputs) + else: + if input_shape is None: + raise ValueError("You must provide an `input_shape` argument.") + self._build_graph_network_for_inferred_shape(input_shape) + if not self.built: + input_shape = tuple(input_shape) + self._build_input_shape = input_shape + super().build(input_shape) + self.built = True - def _assert_weights_created(self): - if self._graph_initialized: - return - # When the graph has not been initialized, use the Model's implementation to - # to check if the weights has been created. - super(functional.Functional, self)._assert_weights_created() # pylint: disable=bad-super-call + def call(self, inputs, training=None, mask=None): + # If applicable, update the static input shape of the model. + if not self._has_explicit_input_shape: + if not tf.is_tensor(inputs) and not isinstance(inputs, tf.Tensor): + # This is a Sequential with multiple inputs. This is technically + # an invalid use case of Sequential, but we tolerate it for + # backwards compatibility. + self._use_legacy_deferred_behavior = True + self._build_input_shape = tf.nest.map_structure( + _get_shape_tuple, inputs + ) + else: + self._build_graph_network_for_inferred_shape( + inputs.shape, inputs.dtype + ) + + if self._graph_initialized: + if not self.built: + self._init_graph_network(self.inputs, self.outputs) + return super().call(inputs, training=training, mask=mask) + + outputs = inputs # handle the corner case where self.layers is empty + for layer in self.layers: + # During each iteration, `inputs` are the inputs to `layer`, and + # `outputs` are the outputs of `layer` applied to `inputs`. At the + # end of each iteration `inputs` is set to `outputs` to prepare for + # the next layer. 
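A note on the loop body that follows: `call()` only forwards `training` and `mask` to a layer whose `call` signature actually accepts them, using the argspecs cached in `_layer_call_argspecs` at `add()` time. Below is a minimal standalone sketch of that dispatch pattern; `PlainLayer`, `DropoutLike`, and `forward` are hypothetical names used for illustration, not Keras code:

```python
import inspect


class PlainLayer:
    """A stand-in layer whose call() accepts no extra keyword arguments."""

    def call(self, inputs):
        return inputs


class DropoutLike:
    """A stand-in layer whose call() opts in to the `training` flag."""

    def call(self, inputs, training=None):
        return [x * 0.5 for x in inputs] if training else inputs


def forward(layers, inputs, training=None, mask=None):
    outputs = inputs
    for layer in layers:
        # Inspect the signature (Keras caches this per layer) and forward
        # only the keyword arguments the layer can accept.
        argspec = inspect.getfullargspec(layer.call).args
        kwargs = {}
        if "mask" in argspec:
            kwargs["mask"] = mask
        if "training" in argspec:
            kwargs["training"] = training
        outputs = layer.call(inputs, **kwargs)
        inputs = outputs  # this layer's outputs feed the next layer
    return outputs


print(forward([PlainLayer(), DropoutLike()], [2.0, 4.0], training=True))
# [1.0, 2.0]
```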
+ kwargs = {} + argspec = self._layer_call_argspecs[layer].args + if "mask" in argspec: + kwargs["mask"] = mask + if "training" in argspec: + kwargs["training"] = training + + outputs = layer(inputs, **kwargs) + + inputs = outputs + + def _get_mask_from_keras_tensor(kt): + return getattr(kt, "_keras_mask", None) + + mask = tf.nest.map_structure(_get_mask_from_keras_tensor, outputs) + return outputs + + def compute_output_shape(self, input_shape): + shape = input_shape + for layer in self.layers: + shape = layer.compute_output_shape(shape) + return shape + + def compute_mask(self, inputs, mask): + # TODO(omalleyt): b/123540974 This function is not really safe to call + # by itself because it will duplicate any updates and losses in graph + # mode by `call`ing the Layers again. + outputs = self.call(inputs, mask=mask) + return getattr(outputs, "_keras_mask", None) + + def get_config(self): + layer_configs = [] + serialize_obj_fn = serialization_lib.serialize_keras_object + if getattr(self, "use_legacy_config", None): + serialize_obj_fn = legacy_serialization.serialize_keras_object + for layer in super().layers: + # `super().layers` includes the InputLayer if available (it is + # filtered out of `self.layers`). Note that + # `self._self_tracked_trackables` is managed by the tracking + # infrastructure and should not be used. + layer_configs.append(serialize_obj_fn(layer)) + config = training.Model.get_config(self) + config["name"] = self.name + config["layers"] = copy.deepcopy(layer_configs) + if not self._is_graph_network and self._build_input_shape is not None: + config["build_input_shape"] = self._build_input_shape + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + if "name" in config: + name = config["name"] + build_input_shape = config.get("build_input_shape") + layer_configs = config["layers"] + else: + name = None + build_input_shape = None + layer_configs = config + model = cls(name=name) + for layer_config in layer_configs: + use_legacy_format = "module" not in layer_config + layer = layer_module.deserialize( + layer_config, + custom_objects=custom_objects, + use_legacy_format=use_legacy_format, + ) + model.add(layer) + + if ( + not model.inputs + and build_input_shape + and isinstance(build_input_shape, (tuple, list)) + ): + model.build(build_input_shape) + + return model + + @property + def input_spec(self): + if hasattr(self, "_manual_input_spec"): + return self._manual_input_spec + if self._has_explicit_input_shape: + return super().input_spec + return None + + @input_spec.setter + def input_spec(self, value): + self._manual_input_spec = value + + @property + def _trackable_saved_model_saver(self): + return model_serialization.SequentialSavedModelSaver(self) + + def _is_layer_name_unique(self, layer): + for ref_layer in self.layers: + if layer.name == ref_layer.name and ref_layer is not layer: + return False + return True + + def _assert_weights_created(self): + if self._graph_initialized: + return + # When the graph has not been initialized, use the Model's + # implementation to check if the weights have been created.
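An aside on the `get_config`/`from_config` pair above: `get_config` now starts from `training.Model.get_config` and serializes each layer (optionally via the legacy serializer), while `from_config` accepts either the full config dict or, on the legacy path, a bare list of layer configs. A minimal round-trip sketch, assuming a working TF2 installation; the layer name `d1` is arbitrary:

```python
import tensorflow as tf

model = tf.keras.Sequential(
    [tf.keras.Input(shape=(16,)), tf.keras.layers.Dense(4, name="d1")]
)
config = model.get_config()  # dict carrying "name" and "layers" entries
clone = tf.keras.Sequential.from_config(config)

# The InputLayer is filtered out of `.layers`, so only the Dense remains.
assert [layer.name for layer in clone.layers] == ["d1"]
```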
+ super(functional.Functional, self)._assert_weights_created() def _get_shape_tuple(t): - if hasattr(t, 'shape'): - shape = t.shape - if isinstance(shape, tuple): - return shape - if shape.rank is not None: - return tuple(shape.as_list()) + if hasattr(t, "shape"): + shape = t.shape + if isinstance(shape, tuple): + return shape + if shape.rank is not None: + return tuple(shape.as_list()) + return None return None - return None def relax_input_shape(shape_1, shape_2): - if shape_1 is None or shape_2 is None: - return None - if len(shape_1) != len(shape_2): - return None - return tuple(None if d1 != d2 else d1 for d1, d2 in zip(shape_1, shape_2)) + if shape_1 is None or shape_2 is None: + return None + if len(shape_1) != len(shape_2): + return None + return tuple(None if d1 != d2 else d1 for d1, d2 in zip(shape_1, shape_2)) def clear_previously_created_nodes(layer, created_nodes): - """Remove nodes from `created_nodes` from the layer's inbound_nodes.""" - for node in layer._inbound_nodes: - prev_layers = node.inbound_layers - for prev_layer in tf.nest.flatten(prev_layers): - prev_layer._outbound_nodes = [ - n for n in prev_layer._outbound_nodes - if n not in created_nodes] - layer._inbound_nodes = [ - n for n in layer._inbound_nodes if n not in created_nodes] + """Remove nodes from `created_nodes` from the layer's inbound_nodes.""" + for node in layer._inbound_nodes: + prev_layers = node.inbound_layers + for prev_layer in tf.nest.flatten(prev_layers): + prev_layer._outbound_nodes = [ + n for n in prev_layer._outbound_nodes if n not in created_nodes + ] + layer._inbound_nodes = [ + n for n in layer._inbound_nodes if n not in created_nodes + ] def track_nodes_created_by_last_call(layer, created_nodes): - """Adds to `created_nodes` the nodes created by the last call to `layer`.""" - if not layer._inbound_nodes: - return - created_nodes.add(layer._inbound_nodes[-1]) - prev_layers = layer._inbound_nodes[-1].inbound_layers - for prev_layer in tf.nest.flatten(prev_layers): - if prev_layer._outbound_nodes: - created_nodes.add(prev_layer._outbound_nodes[-1]) + """Adds to `created_nodes` the nodes created by the last call to `layer`.""" + if not layer._inbound_nodes: + return + created_nodes.add(layer._inbound_nodes[-1]) + prev_layers = layer._inbound_nodes[-1].inbound_layers + for prev_layer in tf.nest.flatten(prev_layers): + if prev_layer._outbound_nodes: + created_nodes.add(prev_layer._outbound_nodes[-1]) diff --git a/keras/engine/sequential_test.py b/keras/engine/sequential_test.py index 11b22397da44..54097e71b42b 100644 --- a/keras/engine/sequential_test.py +++ b/keras/engine/sequential_test.py @@ -14,555 +14,638 @@ # ============================================================================== """Tests specific to `Sequential` model.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras -from tensorflow.python.framework import test_util as tf_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + class TestSequential(test_combinations.TestCase): - """Most Sequential model API tests are covered in `training_test.py`. 
- """ - - @test_combinations.run_all_keras_modes - def test_basic_methods(self): - model = keras.models.Sequential() - model.add(keras.layers.Dense(1, input_dim=2)) - model.add(keras.layers.Dropout(0.3, name='dp')) - model.add(keras.layers.Dense(2, kernel_regularizer='l2', - kernel_constraint='max_norm')) - self.assertEqual(len(model.layers), 3) - self.assertEqual(len(model.weights), 2 * 2) - self.assertEqual(model.get_layer(name='dp').name, 'dp') - - @test_combinations.run_all_keras_modes - def test_input_defined_first_layer(self): - model = keras.models.Sequential() - model.add(keras.Input(shape=(2,), name='input_layer')) - model.add(keras.layers.Dense(1)) - model.add(keras.layers.Dropout(0.3, name='dp')) - model.add(keras.layers.Dense(2, kernel_regularizer='l2', - kernel_constraint='max_norm')) - self.assertLen(model.layers, 3) - self.assertLen(model.weights, 2 * 2) - self.assertEqual(model.get_layer(name='dp').name, 'dp') - - @test_combinations.run_all_keras_modes - def test_single_layer_in_init(self): - model = keras.models.Sequential(keras.layers.Dense(1)) - self.assertLen(model.layers, 1) - - @test_combinations.run_all_keras_modes - def test_sequential_pop(self): - num_hidden = 5 - input_dim = 3 - batch_size = 5 - num_classes = 2 - - model = test_utils.get_small_sequential_mlp( - num_hidden, num_classes, input_dim) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - x = np.random.random((batch_size, input_dim)) - y = np.random.random((batch_size, num_classes)) - model.fit(x, y, epochs=1) - model.pop() - self.assertEqual(len(model.layers), 1) - self.assertEqual(model.output_shape, (None, num_hidden)) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - y = np.random.random((batch_size, num_hidden)) - model.fit(x, y, epochs=1) - - # Test popping single-layer model - model = keras.models.Sequential() - model.add(keras.layers.Dense(num_hidden, input_dim=input_dim)) - model.pop() - self.assertEqual(model.layers, []) - self.assertEqual(model.outputs, None) - - # Invalid use case - model = keras.models.Sequential() - with self.assertRaises(TypeError): - model.pop() - - @test_combinations.run_all_keras_modes - def test_sequential_deferred_build_with_np_arrays(self): - num_hidden = 5 - input_dim = 3 - batch_size = 5 - num_classes = 2 - - model = test_utils.get_small_sequential_mlp(num_hidden, num_classes) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=[keras.metrics.CategoricalAccuracy()], - run_eagerly=test_utils.should_run_eagerly()) - self.assertEqual(len(model.layers), 2) - with self.assertRaisesRegex( - ValueError, 'Weights for model .* have not yet been created'): - len(model.weights) - self.assertFalse(model.built) - - x = np.random.random((batch_size, input_dim)) - y = np.random.random((batch_size, num_classes)) - model.fit(x, y, epochs=1) - self.assertTrue(model.built) - self.assertEqual(len(model.weights), 2 * 2) - - @test_combinations.run_all_keras_modes - def test_sequential_deferred_build_with_dataset_iterators(self): - num_hidden = 5 - input_dim = 3 - num_classes = 2 - num_samples = 50 - steps_per_epoch = 10 - - model = test_utils.get_small_sequential_mlp(num_hidden, num_classes) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=[keras.metrics.CategoricalAccuracy()], - run_eagerly=test_utils.should_run_eagerly()) - self.assertEqual(len(model.layers), 2) - with self.assertRaisesRegex( - ValueError, 'Weights for model .* have not yet been created'): 
- len(model.weights) - self.assertFalse(model.built) - - x = tf.ones((num_samples, input_dim)) - y = tf.zeros((num_samples, num_classes)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - model.fit(dataset, epochs=1, steps_per_epoch=steps_per_epoch) - self.assertTrue(model.built) - self.assertEqual(len(model.weights), 2 * 2) - - # TODO(kaftan) This test fails w/ run_with_all_keras_modes. File ticket - @parameterized.parameters((True,), (False,)) - def test_training_and_eval_methods_on_symbolic_tensors(self, deferred): - with tf.Graph().as_default(), self.cached_session(): - - def get_model(): - if deferred: - model = test_utils.get_small_sequential_mlp(10, 4) - else: - model = test_utils.get_small_sequential_mlp(10, 4, input_dim=3) + """Most Sequential model API tests are covered in `training_test.py`.""" + + @test_combinations.run_all_keras_modes + def test_basic_methods(self): + model = keras.models.Sequential() + model.add(keras.layers.Dense(1, input_dim=2)) + model.add(keras.layers.Dropout(0.3, name="dp")) + model.add( + keras.layers.Dense( + 2, kernel_regularizer="l2", kernel_constraint="max_norm" + ) + ) + self.assertEqual(len(model.layers), 3) + self.assertEqual(len(model.weights), 2 * 2) + self.assertEqual(model.get_layer(name="dp").name, "dp") + + @test_combinations.run_all_keras_modes + def test_input_defined_first_layer(self): + model = keras.models.Sequential() + model.add(keras.Input(shape=(2,), name="input_layer")) + model.add(keras.layers.Dense(1)) + model.add(keras.layers.Dropout(0.3, name="dp")) + model.add( + keras.layers.Dense( + 2, kernel_regularizer="l2", kernel_constraint="max_norm" + ) + ) + self.assertLen(model.layers, 3) + self.assertLen(model.weights, 2 * 2) + self.assertEqual(model.get_layer(name="dp").name, "dp") + + @test_combinations.run_all_keras_modes + def test_single_layer_in_init(self): + model = keras.models.Sequential(keras.layers.Dense(1)) + self.assertLen(model.layers, 1) + + @test_combinations.run_all_keras_modes + def test_sequential_pop(self): + num_hidden = 5 + input_dim = 3 + batch_size = 5 + num_classes = 2 + + model = test_utils.get_small_sequential_mlp( + num_hidden, num_classes, input_dim + ) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + x = np.random.random((batch_size, input_dim)) + y = np.random.random((batch_size, num_classes)) + model.fit(x, y, epochs=1) + model.pop() + self.assertEqual(len(model.layers), 1) + self.assertEqual(model.output_shape, (None, num_hidden)) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + y = np.random.random((batch_size, num_hidden)) + model.fit(x, y, epochs=1) + + # Test popping single-layer model + model = keras.models.Sequential() + model.add(keras.layers.Dense(num_hidden, input_dim=input_dim)) + model.pop() + self.assertEqual(model.layers, []) + self.assertEqual(model.outputs, None) + + # Invalid use case + model = keras.models.Sequential() + with self.assertRaises(TypeError): + model.pop() + + @test_combinations.run_all_keras_modes + def test_sequential_deferred_build_with_np_arrays(self): + num_hidden = 5 + input_dim = 3 + batch_size = 5 + num_classes = 2 + + model = test_utils.get_small_sequential_mlp(num_hidden, num_classes) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=[keras.metrics.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertEqual(len(model.layers), 2) 
+ with self.assertRaisesRegex( + ValueError, "Weights for model .* have not yet been created" + ): + len(model.weights) + self.assertFalse(model.built) + + x = np.random.random((batch_size, input_dim)) + y = np.random.random((batch_size, num_classes)) + model.fit(x, y, epochs=1) + self.assertTrue(model.built) + self.assertEqual(len(model.weights), 2 * 2) + + @test_combinations.run_all_keras_modes + def test_sequential_deferred_build_with_dataset_iterators(self): + num_hidden = 5 + input_dim = 3 + num_classes = 2 + num_samples = 50 + steps_per_epoch = 10 + + model = test_utils.get_small_sequential_mlp(num_hidden, num_classes) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=[keras.metrics.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertEqual(len(model.layers), 2) + with self.assertRaisesRegex( + ValueError, "Weights for model .* have not yet been created" + ): + len(model.weights) + self.assertFalse(model.built) + + x = tf.ones((num_samples, input_dim)) + y = tf.zeros((num_samples, num_classes)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + model.fit(dataset, epochs=1, steps_per_epoch=steps_per_epoch) + self.assertTrue(model.built) + self.assertEqual(len(model.weights), 2 * 2) + + # TODO(kaftan) This test fails w/ run_with_all_keras_modes. File ticket + @parameterized.parameters((True,), (False,)) + def test_training_and_eval_methods_on_symbolic_tensors(self, deferred): + with tf.Graph().as_default(), self.cached_session(): + + def get_model(): + if deferred: + model = test_utils.get_small_sequential_mlp(10, 4) + else: + model = test_utils.get_small_sequential_mlp( + 10, 4, input_dim=3 + ) + model.compile( + optimizer="rmsprop", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + return model + + inputs = keras.backend.zeros(shape=(10, 3)) + targets = keras.backend.zeros(shape=(10, 4)) + + model = get_model() + model.fit(inputs, targets, epochs=10, steps_per_epoch=30) + + model = get_model() + model.evaluate(inputs, targets, steps=2, verbose=0) + + model = get_model() + model.predict(inputs, steps=2) + + model = get_model() + model.train_on_batch(inputs, targets) + + model = get_model() + model.test_on_batch(inputs, targets) + + model = get_model() + model.fit( + inputs, + targets, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=(inputs, targets), + validation_steps=2, + ) + + @test_combinations.run_all_keras_modes + def test_invalid_use_cases(self): + # Added objects must be layer instances + with self.assertRaises(TypeError): + model = keras.models.Sequential() + model.add(None) + + @test_combinations.run_all_keras_modes + def test_nested_sequential_trainability(self): + input_dim = 20 + num_units = 10 + num_classes = 2 + + inner_model = keras.models.Sequential() + inner_model.add(keras.layers.Dense(num_units, input_shape=(input_dim,))) + + model = keras.models.Sequential() + model.add(inner_model) + model.add(keras.layers.Dense(num_classes)) + + self.assertEqual(len(model.layers), 2) + + self.assertEqual(len(model.trainable_weights), 4) + inner_model.trainable = False + self.assertEqual(len(model.trainable_weights), 2) + inner_model.trainable = True + self.assertEqual(len(model.trainable_weights), 4) + + @test_combinations.run_all_keras_modes + def test_sequential_update_disabling(self): + val_a = np.random.random((10, 4)) + val_out = np.random.random((10, 4)) + + model = keras.models.Sequential() + 
model.add(keras.layers.BatchNormalization(input_shape=(4,))) + + model.trainable = False + model.compile("sgd", "mse") + + x1 = model.predict(val_a) + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + self.assertAllClose(x1, x2, atol=1e-7) + + model.trainable = True + model.compile("sgd", "mse") + + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + assert np.abs(np.sum(x1 - x2)) > 1e-5 + + @test_combinations.run_all_keras_modes + def test_sequential_deferred_build_serialization(self): + num_hidden = 5 + input_dim = 3 + batch_size = 5 + num_classes = 2 + + model = test_utils.get_small_sequential_mlp(num_hidden, num_classes) model.compile( - optimizer='rmsprop', - loss='categorical_crossentropy', - metrics=['accuracy']) - return model - - inputs = keras.backend.zeros(shape=(10, 3)) - targets = keras.backend.zeros(shape=(10, 4)) - - model = get_model() - model.fit(inputs, targets, epochs=10, steps_per_epoch=30) - - model = get_model() - model.evaluate(inputs, targets, steps=2, verbose=0) - - model = get_model() - model.predict(inputs, steps=2) - - model = get_model() - model.train_on_batch(inputs, targets) - - model = get_model() - model.test_on_batch(inputs, targets) - - model = get_model() - model.fit( - inputs, - targets, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_data=(inputs, targets), - validation_steps=2) - - @test_combinations.run_all_keras_modes - def test_invalid_use_cases(self): - # Added objects must be layer instances - with self.assertRaises(TypeError): - model = keras.models.Sequential() - model.add(None) - - @test_combinations.run_all_keras_modes - def test_nested_sequential_trainability(self): - input_dim = 20 - num_units = 10 - num_classes = 2 - - inner_model = keras.models.Sequential() - inner_model.add(keras.layers.Dense(num_units, input_shape=(input_dim,))) - - model = keras.models.Sequential() - model.add(inner_model) - model.add(keras.layers.Dense(num_classes)) - - self.assertEqual(len(model.layers), 2) - - self.assertEqual(len(model.trainable_weights), 4) - inner_model.trainable = False - self.assertEqual(len(model.trainable_weights), 2) - inner_model.trainable = True - self.assertEqual(len(model.trainable_weights), 4) - - @test_combinations.run_all_keras_modes - def test_sequential_update_disabling(self): - val_a = np.random.random((10, 4)) - val_out = np.random.random((10, 4)) - - model = keras.models.Sequential() - model.add(keras.layers.BatchNormalization(input_shape=(4,))) - - model.trainable = False - model.compile('sgd', 'mse') - - x1 = model.predict(val_a) - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - self.assertAllClose(x1, x2, atol=1e-7) - - model.trainable = True - model.compile('sgd', 'mse') - - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - assert np.abs(np.sum(x1 - x2)) > 1e-5 - - @test_combinations.run_all_keras_modes - def test_sequential_deferred_build_serialization(self): - num_hidden = 5 - input_dim = 3 - batch_size = 5 - num_classes = 2 - - model = test_utils.get_small_sequential_mlp(num_hidden, num_classes) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=[keras.metrics.CategoricalAccuracy()], - run_eagerly=test_utils.should_run_eagerly()) - self.assertFalse(model.built) - - x = np.random.random((batch_size, input_dim)) - y = np.random.random((batch_size, num_classes)) - model.train_on_batch(x, y) - self.assertTrue(model.built) - - config = model.get_config() - new_model = keras.models.Sequential.from_config(config) - new_model.compile( - 
loss='mse', - optimizer='rmsprop', - metrics=[keras.metrics.CategoricalAccuracy()], - run_eagerly=test_utils.should_run_eagerly()) - x = np.random.random((batch_size, input_dim)) - y = np.random.random((batch_size, num_classes)) - new_model.train_on_batch(x, y) - self.assertEqual(len(new_model.layers), 2) - self.assertEqual(len(new_model.weights), 4) - - @test_combinations.run_all_keras_modes - def test_sequential_shape_inference_deferred(self): - model = test_utils.get_small_sequential_mlp(4, 5) - output_shape = model.compute_output_shape((None, 7)) - self.assertEqual(tuple(output_shape.as_list()), (None, 5)) - - @test_combinations.run_all_keras_modes - def test_sequential_build_deferred(self): - model = test_utils.get_small_sequential_mlp(4, 5) - - model.build((None, 10)) - self.assertTrue(model.built) - self.assertEqual(len(model.weights), 4) - - # Test with nested model - model = test_utils.get_small_sequential_mlp(4, 3) - inner_model = test_utils.get_small_sequential_mlp(4, 5) - model.add(inner_model) - - model.build((None, 10)) - self.assertTrue(model.built) - self.assertEqual(len(model.weights), 8) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_sequential_deferred_manual_build(self): - model = test_utils.get_small_sequential_mlp(4, 5) - self.assertFalse(model.built) - model(tf.zeros([1, 2])) - self.assertTrue(model.built) - model.compile( - 'rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((1, 2)), np.zeros((1, 5))) - - @test_combinations.run_all_keras_modes - def test_sequential_nesting(self): - model = test_utils.get_small_sequential_mlp(4, 3) - inner_model = test_utils.get_small_sequential_mlp(4, 5) - model.add(inner_model) - - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - x = np.random.random((2, 6)) - y = np.random.random((2, 5)) - model.fit(x, y, epochs=1) - - @tf_test_utils.run_v1_only('Behavior changed in V2.') - def test_variable_names_deferred(self): - model = keras.models.Sequential([keras.layers.Dense(3)]) - model.add(keras.layers.Dense(2)) - model(tf.ones([2, 4])) - # Note that for regular sequential models (wrapping graph network), - # the layers' weights are built - # without the model name as prefix (because the Functional API __call__ - # reset the name scope). This is fixable, but it would be - # backwards incompatible. 
- self.assertEqual( - ['sequential/dense/kernel:0', 'sequential/dense/bias:0', - 'sequential/dense_1/kernel:0', 'sequential/dense_1/bias:0'], - [v.name for v in model.variables]) - - @test_combinations.run_all_keras_modes - def test_input_assumptions_propagation(self): - model = keras.models.Sequential() - model.add(keras.layers.Dense(1)) - if tf.executing_eagerly(): - with self.assertRaisesRegex(ValueError, - 'expected min_ndim=2, found ndim=0'): - model(1.0) - - @test_combinations.run_all_keras_modes - def test_string_input(self): - seq = keras.Sequential([ - keras.layers.InputLayer(input_shape=(1,), dtype=tf.string), - keras.layers.Lambda(lambda x: x[0]) - ]) - seq.run_eagerly = test_utils.should_run_eagerly() - preds = seq.predict([['tensorflow eager']]) - self.assertEqual(preds.shape, (1,)) - - @test_combinations.run_all_keras_modes - def test_multi_output_layer_not_accepted(self): - - class MultiOutputLayer(keras.layers.Layer): - - def call(self, inputs): - return inputs, inputs - - with self.assertRaisesRegex(ValueError, - 'should have a single output tensor'): - keras.Sequential([MultiOutputLayer(input_shape=(3,))]) - - with self.assertRaisesRegex(ValueError, - 'should have a single output tensor'): - keras.Sequential([ - keras.layers.Dense(1, input_shape=(3,)), - MultiOutputLayer()]) - - # Should also raise error in a deferred build mode - with self.assertRaisesRegex(ValueError, - 'should have a single output tensor'): - keras.Sequential([MultiOutputLayer()])(np.zeros((10, 10))) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_layer_add_after_compile_deferred(self): - model = keras.Sequential([keras.layers.Dense(3)]) - self.assertFalse(model.built) - - model.compile('adam', loss='mse') - model.fit(np.random.random((1, 3)), np.random.random((1, 3))) - self.assertTrue(model.built) - - model.add(keras.layers.Dense(3)) - - model.compile('adam', loss='mse') - model.fit(np.random.random((1, 3)), np.random.random((1, 3))) - self.assertTrue(model.built) - - def test_sequential_layer_tracking(self): - """Test that Sequential only tracks layers added in init or `.add`.""" - layer = keras.layers.Dense(1) - model = keras.Sequential([layer]) - self.assertEqual( - list(model._flatten_layers(include_self=False, recursive=False))[-1], - layer) - - model.a = [keras.layers.Dense(3)] # should not be added to the layers list. - self.assertEqual( - list(model._flatten_layers(include_self=False, recursive=False))[-1], - layer) - - layer2 = keras.layers.Dense(2) - model.add(layer2) - self.assertEqual( - list(model._flatten_layers(include_self=False, recursive=False))[-1], - layer2) - - model.a = [keras.layers.Dense(3)] # should not be added to the layers list. 
- self.assertEqual( - list(model._flatten_layers(include_self=False, recursive=False))[-1], - layer2) - - model.pop() - self.assertEqual( - list(model._flatten_layers(include_self=False, recursive=False))[-1], - layer) - - def test_config_preserves_input_layer(self): - model = keras.Sequential([ - keras.Input((None,), name='my_embedding_input', dtype='int32'), - keras.layers.Embedding(32, 32), - keras.layers.Dense(3), - ]) - config = model.get_config() - new_model = keras.Sequential.from_config(config) - self.assertTrue(new_model.built) - layers = list( - new_model._flatten_layers(include_self=False, recursive=False)) - self.assertEqual(layers[0].dtype, 'int32') - self.assertEqual(layers[0].name, 'my_embedding_input') - - def test_name_unicity(self): - model = keras.Sequential() - model.add(keras.layers.Dense(3, name='specific_name')) - with self.assertRaisesRegex(ValueError, 'should have unique names'): - model.add(keras.layers.Dense(3, name='specific_name')) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_tf_module_call(self): - - class MyModule(tf.Module): - - def __init__(self): - self.v = tf.Variable(2.) - - def __call__(self, x): - return self.v * x - - model = keras.Sequential() - model.add(MyModule()) - model.compile('sgd', 'mse') - x, y = np.ones((10, 1)), np.ones((10, 1)) - model.fit(x, y, batch_size=2) - self.assertLen(model.trainable_variables, 1) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_tf_module_training(self): - - class MyModule(tf.Module): - - def __init__(self): - self.v = tf.Variable(2.) - - def call(self, x, training=None): - # training should be set by Sequential. - assert training is not None - return self.v * x - - model = keras.Sequential() - model.add(MyModule()) - model.compile('sgd', 'mse') - x, y = np.ones((10, 1)), np.ones((10, 1)) - model.fit(x, y, batch_size=2) - self.assertLen(model.trainable_variables, 1) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_tf_module_error(self): - - class MyModule(tf.Module): - - def __init__(self): - self.v = tf.Variable(2.) 
- - model = keras.Sequential() - with self.assertRaisesRegex(ValueError, 'is not defined'): - model.add(MyModule()) + loss="mse", + optimizer="rmsprop", + metrics=[keras.metrics.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertFalse(model.built) + + x = np.random.random((batch_size, input_dim)) + y = np.random.random((batch_size, num_classes)) + model.train_on_batch(x, y) + self.assertTrue(model.built) + + config = model.get_config() + new_model = keras.models.Sequential.from_config(config) + new_model.compile( + loss="mse", + optimizer="rmsprop", + metrics=[keras.metrics.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + x = np.random.random((batch_size, input_dim)) + y = np.random.random((batch_size, num_classes)) + new_model.train_on_batch(x, y) + self.assertEqual(len(new_model.layers), 2) + self.assertEqual(len(new_model.weights), 4) + + @test_combinations.run_all_keras_modes + def test_sequential_shape_inference_deferred(self): + model = test_utils.get_small_sequential_mlp(4, 5) + output_shape = model.compute_output_shape((None, 7)) + self.assertEqual(tuple(output_shape.as_list()), (None, 5)) + + @test_combinations.run_all_keras_modes + def test_sequential_build_deferred(self): + model = test_utils.get_small_sequential_mlp(4, 5) + + model.build((None, 10)) + self.assertTrue(model.built) + self.assertEqual(len(model.weights), 4) + + # Test with nested model + model = test_utils.get_small_sequential_mlp(4, 3) + inner_model = test_utils.get_small_sequential_mlp(4, 5) + model.add(inner_model) + + model.build((None, 10)) + self.assertTrue(model.built) + self.assertEqual(len(model.weights), 8) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_sequential_deferred_manual_build(self): + model = test_utils.get_small_sequential_mlp(4, 5) + self.assertFalse(model.built) + model(tf.zeros([1, 2])) + self.assertTrue(model.built) + model.compile( + "rmsprop", loss="mse", run_eagerly=test_utils.should_run_eagerly() + ) + model.train_on_batch(np.zeros((1, 2)), np.zeros((1, 5))) + @test_combinations.run_all_keras_modes + def test_sequential_nesting(self): + model = test_utils.get_small_sequential_mlp(4, 3) + inner_model = test_utils.get_small_sequential_mlp(4, 5) + model.add(inner_model) -class TestSequentialEagerIntegration(test_combinations.TestCase): + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + x = np.random.random((2, 6)) + y = np.random.random((2, 5)) + model.fit(x, y, epochs=1) + + @tf_test_utils.run_v1_only("Behavior changed in V2.") + def test_variable_names_deferred(self): + model = keras.models.Sequential([keras.layers.Dense(3)]) + model.add(keras.layers.Dense(2)) + model(tf.ones([2, 4])) + # Note that for regular sequential models (wrapping graph network), + # the layers' weights are built + # without the model name as prefix (because the Functional API __call__ + # reset the name scope). This is fixable, but it would be + # backwards incompatible. 
+ self.assertEqual( + [ + "sequential/dense/kernel:0", + "sequential/dense/bias:0", + "sequential/dense_1/kernel:0", + "sequential/dense_1/bias:0", + ], + [v.name for v in model.variables], + ) + + @test_combinations.run_all_keras_modes + def test_input_assumptions_propagation(self): + model = keras.models.Sequential() + model.add(keras.layers.Dense(1)) + if tf.executing_eagerly(): + with self.assertRaisesRegex( + ValueError, "expected min_ndim=2, found ndim=0" + ): + model(1.0) + + @test_combinations.run_all_keras_modes + def test_string_input(self): + seq = keras.Sequential( + [ + keras.layers.InputLayer(input_shape=(1,), dtype=tf.string), + keras.layers.Lambda(lambda x: x[0]), + ] + ) + seq.run_eagerly = test_utils.should_run_eagerly() + preds = seq.predict([["tensorflow eager"]]) + self.assertEqual(preds.shape, (1,)) + + @test_combinations.run_all_keras_modes + def test_multi_output_layer_not_accepted(self): + class MultiOutputLayer(keras.layers.Layer): + def call(self, inputs): + return inputs, inputs + + with self.assertRaisesRegex( + ValueError, "should have a single output tensor" + ): + keras.Sequential([MultiOutputLayer(input_shape=(3,))]) + + with self.assertRaisesRegex( + ValueError, "should have a single output tensor" + ): + keras.Sequential( + [keras.layers.Dense(1, input_shape=(3,)), MultiOutputLayer()] + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_layer_add_after_compile_deferred(self): + model = keras.Sequential([keras.layers.Dense(3)]) + self.assertFalse(model.built) + + model.compile("adam", loss="mse") + model.fit(np.random.random((1, 3)), np.random.random((1, 3))) + self.assertTrue(model.built) + + model.add(keras.layers.Dense(3)) + + model.compile("adam", loss="mse") + model.fit(np.random.random((1, 3)), np.random.random((1, 3))) + self.assertTrue(model.built) + + def test_sequential_layer_tracking(self): + """Test that Sequential only tracks layers added in init or `.add`.""" + layer = keras.layers.Dense(1) + model = keras.Sequential([layer]) + self.assertEqual( + list(model._flatten_layers(include_self=False, recursive=False))[ + -1 + ], + layer, + ) + + model.a = [ + keras.layers.Dense(3) + ] # should not be added to the layers list. + self.assertEqual( + list(model._flatten_layers(include_self=False, recursive=False))[ + -1 + ], + layer, + ) + + layer2 = keras.layers.Dense(2) + model.add(layer2) + self.assertEqual( + list(model._flatten_layers(include_self=False, recursive=False))[ + -1 + ], + layer2, + ) + + model.a = [ + keras.layers.Dense(3) + ] # should not be added to the layers list. 
+ self.assertEqual( + list(model._flatten_layers(include_self=False, recursive=False))[ + -1 + ], + layer2, + ) + + model.pop() + self.assertEqual( + list(model._flatten_layers(include_self=False, recursive=False))[ + -1 + ], + layer, + ) + + def test_config_preserves_input_layer(self): + model = keras.Sequential( + [ + keras.Input((None,), name="my_embedding_input", dtype="int32"), + keras.layers.Embedding(32, 32), + keras.layers.Dense(3), + ] + ) + config = model.get_config() + new_model = keras.Sequential.from_config(config) + self.assertTrue(new_model.built) + layers = list( + new_model._flatten_layers(include_self=False, recursive=False) + ) + self.assertEqual(layers[0].dtype, "int32") + self.assertEqual(layers[0].name, "my_embedding_input") + + def test_name_unicity(self): + model = keras.Sequential() + model.add(keras.layers.Dense(3, name="specific_name")) + with self.assertRaisesRegex(ValueError, "should have unique names"): + model.add(keras.layers.Dense(3, name="specific_name")) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_tf_module_call(self): + class MyModule(tf.Module): + def __init__(self): + self.v = tf.Variable(2.0) + + def __call__(self, x): + return self.v * x + + model = keras.Sequential() + model.add(MyModule()) + model.compile("sgd", "mse") + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, batch_size=2) + self.assertLen(model.trainable_variables, 1) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_tf_module_training(self): + class MyModule(tf.Module): + def __init__(self): + self.v = tf.Variable(2.0) + + def call(self, x, training=None): + # training should be set by Sequential. + assert training is not None + return self.v * x + + model = keras.Sequential() + model.add(MyModule()) + model.compile("sgd", "mse") + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, batch_size=2) + self.assertLen(model.trainable_variables, 1) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_tf_module_error(self): + class MyModule(tf.Module): + def __init__(self): + self.v = tf.Variable(2.0) + + model = keras.Sequential() + with self.assertRaisesRegex(ValueError, "is not defined"): + model.add(MyModule()) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_multi_inputs_outputs(self): + model = keras.Sequential( + [ + ImageAugmentLayer(), + ImageAugmentLayer(), + ] + ) + + image_inputs = tf.ones((2, 512, 512, 3)) + label_inputs = tf.ones((2, 2)) + + output = model({"images": image_inputs, "labels": label_inputs}) + self.assertAllClose(output["images"], image_inputs) + self.assertAllClose(output["labels"], label_inputs) + + model.compile(loss="mse") + model.fit( + x={"images": image_inputs, "labels": label_inputs}, + y={"images": image_inputs, "labels": label_inputs}, + steps_per_epoch=1, + ) + self.assertIsNone(model.inputs) + self.assertIsNone(model.outputs) + + # Use the same model with image input only + model({"images": image_inputs}) + model.fit( + x={"images": image_inputs}, + y={"images": image_inputs}, + steps_per_epoch=1, + ) + + model(image_inputs) + model.fit(x=image_inputs, y=image_inputs, steps_per_epoch=1) - @test_combinations.run_all_keras_modes - def test_defun_on_call(self): - # Check that one can subclass Sequential and place the `call` in a `defun`. 
- class MySequential(keras.Sequential): +class TestSequentialEagerIntegration(test_combinations.TestCase): + @test_combinations.run_all_keras_modes + def test_defun_on_call(self): + # Check that one can subclass Sequential and place the `call` in a + # `defun`. - def __init__(self, name=None): - super().__init__(name=name) - self.call = tf.function(self.call) + class MySequential(keras.Sequential): + def __init__(self, name=None): + super().__init__(name=name) + self.call = tf.function(self.call) - model = MySequential() - model.add(keras.layers.Dense(4, activation='relu')) - model.add(keras.layers.Dense(5, activation='softmax')) + model = MySequential() + model.add(keras.layers.Dense(4, activation="relu")) + model.add(keras.layers.Dense(5, activation="softmax")) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.random.random((2, 6)) + y = np.random.random((2, 5)) + model.fit(x, y, epochs=1) + + @test_combinations.run_all_keras_modes + def test_build_before_fit(self): + # Fix for b/112433577 + model = test_utils.get_small_sequential_mlp(4, 5) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) - x = np.random.random((2, 6)) - y = np.random.random((2, 5)) - model.fit(x, y, epochs=1) + model.build((None, 6)) - @test_combinations.run_all_keras_modes - def test_build_before_fit(self): - # Fix for b/112433577 - model = test_utils.get_small_sequential_mlp(4, 5) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) + x = np.random.random((2, 6)) + y = np.random.random((2, 5)) + model.fit(x, y, epochs=1) - model.build((None, 6)) + @test_combinations.run_all_keras_modes + def test_build_empty_network(self): + x = np.random.random((2, 6)) + y = np.random.random((2, 5)) + model = keras.Sequential() - x = np.random.random((2, 6)) - y = np.random.random((2, 5)) - model.fit(x, y, epochs=1) + # Make sure an empty sequential model can still work with build(). + model.build((None, 6)) + self.assertTrue(model.built) - @test_combinations.run_all_keras_modes - def test_build_empty_network(self): - x = np.random.random((2, 6)) - y = np.random.random((2, 5)) - model = keras.Sequential() + model.add(keras.layers.Dense(5, input_shape=(6,))) - # Make sure an empty sequential model can still work with build(). 
- model.build((None, 6)) - self.assertTrue(model.built) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y) - model.add(keras.layers.Dense(5, input_shape=(6,))) + model.pop() + self.assertFalse(model.built) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y) + model.build((None, 6)) + self.assertTrue(model.built) - model.pop() - self.assertFalse(model.built) - model.build((None, 6)) - self.assertTrue(model.built) +class ImageAugmentLayer(keras.layers.Layer): + def call(self, inputs): + return inputs -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/training.py b/keras/engine/training.py index 510d8c2d5fb5..f8b2dbcfa2a3 100644 --- a/keras/engine/training.py +++ b/keras/engine/training.py @@ -17,13 +17,22 @@ import copy import itertools import json -import os import warnings import weakref +import numpy as np +import tensorflow.compat.v2 as tf +from tensorflow.python.distribute import distribute_utils +from tensorflow.python.distribute import input_ops +from tensorflow.python.eager import context +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import keras_export +from tensorflow.tools.docs import doc_controls + from keras import backend from keras import callbacks as callbacks_module from keras import optimizers +from keras.dtensor import dtensor_api from keras.dtensor import layout_map as layout_map_lib from keras.engine import base_layer from keras.engine import base_layer_utils @@ -31,3559 +40,4439 @@ from keras.engine import data_adapter from keras.engine import input_layer as input_layer_module from keras.engine import training_utils +from keras.metrics import base_metric from keras.mixed_precision import loss_scale_optimizer as lso +from keras.optimizers import optimizer from keras.optimizers import optimizer_v1 -from keras.optimizers.optimizer_experimental import optimizer as optimizer_experimental -from keras.saving import hdf5_format from keras.saving import pickle_utils -from keras.saving import save -from keras.saving import saving_utils -from keras.saving.experimental import saving_lib -from keras.saving.saved_model import json_utils -from keras.saving.saved_model import model_serialization +from keras.saving import saving_api +from keras.saving import saving_lib +from keras.saving import serialization_lib +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model import json_utils +from keras.saving.legacy.saved_model import model_serialization from keras.utils import generic_utils from keras.utils import io_utils from keras.utils import layer_utils +from keras.utils import steps_per_execution_tuning +from keras.utils import tf_inspect from keras.utils import tf_utils from keras.utils import traceback_utils from keras.utils import version_utils from keras.utils.mode_keys import ModeKeys -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.python.eager import context -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.util.tf_export import keras_export -from tensorflow.tools.docs import doc_controls - -# pylint: disable=g-import-not-at-top try: - import h5py + import h5py except ImportError: - h5py = None -# pylint: enable=g-import-not-at-top + h5py = None -@keras_export('keras.Model', 'keras.models.Model') +@keras_export("keras.Model", 
"keras.models.Model") class Model(base_layer.Layer, version_utils.ModelVersionSelector): - """`Model` groups layers into an object with training and inference features. - - Args: - inputs: The input(s) of the model: a `keras.Input` object or list of - `keras.Input` objects. - outputs: The output(s) of the model. See Functional API example below. - name: String, the name of the model. - - There are two ways to instantiate a `Model`: - - 1 - With the "Functional API", where you start from `Input`, - you chain layer calls to specify the model's forward pass, - and finally you create your model from inputs and outputs: - - ```python - import tensorflow as tf - - inputs = tf.keras.Input(shape=(3,)) - x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs) - outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) - model = tf.keras.Model(inputs=inputs, outputs=outputs) - ``` - - Note: Only dicts, lists, and tuples of input tensors are supported. Nested - inputs are not supported (e.g. lists of list or dicts of dict). - - A new Functional API model can also be created by using the - intermediate tensors. This enables you to quickly extract sub-components - of the model. - - Example: - - ```python - inputs = keras.Input(shape=(None, None, 3)) - processed = keras.layers.RandomCrop(width=32, height=32)(inputs) - conv = keras.layers.Conv2D(filters=2, kernel_size=3)(processed) - pooling = keras.layers.GlobalAveragePooling2D()(conv) - feature = keras.layers.Dense(10)(pooling) - - full_model = keras.Model(inputs, feature) - backbone = keras.Model(processed, conv) - activations = keras.Model(conv, feature) - ``` - - Note that the `backbone` and `activations` models are not - created with `keras.Input` objects, but with the tensors that are originated - from `keras.Inputs` objects. Under the hood, the layers and weights will - be shared across these models, so that user can train the `full_model`, and - use `backbone` or `activations` to do feature extraction. - The inputs and outputs of the model can be nested structures of tensors as - well, and the created models are standard Functional API models that support - all the existing APIs. - - 2 - By subclassing the `Model` class: in that case, you should define your - layers in `__init__()` and you should implement the model's forward pass - in `call()`. - - ```python - import tensorflow as tf - - class MyModel(tf.keras.Model): - - def __init__(self): - super().__init__() - self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) - self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) - - def call(self, inputs): - x = self.dense1(inputs) - return self.dense2(x) - - model = MyModel() - ``` - - If you subclass `Model`, you can optionally have - a `training` argument (boolean) in `call()`, which you can use to specify - a different behavior in training and inference: - - ```python - import tensorflow as tf - - class MyModel(tf.keras.Model): - - def __init__(self): - super().__init__() - self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) - self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) - self.dropout = tf.keras.layers.Dropout(0.5) - - def call(self, inputs, training=False): - x = self.dense1(inputs) - if training: - x = self.dropout(x, training=training) - return self.dense2(x) - - model = MyModel() - ``` - - Once the model is created, you can config the model with losses and metrics - with `model.compile()`, train the model with `model.fit()`, or use the model - to do prediction with `model.predict()`. 
- """ - _TF_MODULE_IGNORED_PROPERTIES = frozenset( - itertools.chain(('_train_counter', '_test_counter', '_predict_counter', - '_steps_per_execution'), - base_layer.Layer._TF_MODULE_IGNORED_PROPERTIES)) # pylint: disable=protected-access - _SCALAR_UPRANKING_ON = False - - def __new__(cls, *args, **kwargs): - # Signature detection - if is_functional_model_init_params(args, kwargs) and cls == Model: - # Functional model - from keras.engine import functional # pylint: disable=g-import-not-at-top - return functional.Functional(skip_init=True, *args, **kwargs) - else: - return super(Model, cls).__new__(cls, *args, **kwargs) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - @traceback_utils.filter_traceback - def __init__(self, *args, **kwargs): - self._is_model_for_instrumentation = True - base_layer.keras_api_gauge.get_cell('model').set(True) - - # Special case for Subclassed Functional Model, which we couldn't detect - # when __new__ is called. We only realize it is a functional model when it - # calls super.__init__ with input and output tensor. - from keras.engine import functional # pylint: disable=g-import-not-at-top - if (is_functional_model_init_params(args, kwargs) and - not isinstance(self, functional.Functional)): - # Filter the kwargs for multiple inheritance. - supported_kwargs = ['inputs', 'outputs', 'name', 'trainable', 'skip_init'] - model_kwargs = {k: kwargs[k] for k in kwargs if k in supported_kwargs} - other_kwargs = {k: kwargs[k] for k in kwargs if k not in supported_kwargs} - inject_functional_model_class(self.__class__) - functional.Functional.__init__(self, *args, **model_kwargs) - - # In case there is any multiple inheritance here, we need to call the - # __init__ for any class that appears after the Functional class. - clz_to_init = [] - found_functional_class = False - for clz in self.__class__.__bases__: - if issubclass(clz, functional.Functional): - found_functional_class = True - continue - if found_functional_class: - clz_to_init.append(clz) - - if clz_to_init: - for clz in clz_to_init: - clz.__init__(self, *args, **other_kwargs) - elif other_kwargs: - # In case there are unused kwargs, we should raise an error to user, in - # case they have a typo in the param name. - raise TypeError( - 'The following keyword arguments passed to `Model` aren\'t ' - 'supported: {}.'.format(other_kwargs)) - return - - base_layer.keras_api_gauge.get_cell('Model subclass').set(True) - # The following are implemented as property functions: - # self.trainable_weights - # self.non_trainable_weights - # `inputs` / `outputs` will only appear in kwargs if either are misspelled. - generic_utils.validate_kwargs(kwargs, { - 'trainable', 'dtype', 'dynamic', 'name', 'autocast', 'inputs', 'outputs' - }) - super().__init__(**kwargs) - # By default, Model is a subclass model, which is not in graph network. - self._is_graph_network = False - - self.inputs = None - self.outputs = None - self.input_names = None - self.output_names = None - # stop_training is used by callback to stop training when error happens - self.stop_training = False - self.history = None - # These objects are used in the default `Model.compile`. They are not - # guaranteed to be set after `Model.compile` is called, as users can - # override compile with custom logic. - self.compiled_loss = None - self.compiled_metrics = None - - # This is True for Sequential networks and Functional networks. - self._compute_output_and_mask_jointly = False - - # Don't reset compilation if already done. 
This may occur if calling - # `__init__` (or `_init_graph_network`) on an already-compiled model - # such as a Sequential model. Sequential models may need to rebuild - # themselves after compilation. - self._maybe_create_attribute('_is_compiled', False) - self._maybe_create_attribute('optimizer', None) - - # Model must be created under scope of DistStrat it will be trained with. - if tf.distribute.has_strategy(): - self._distribution_strategy = tf.distribute.get_strategy() - else: - self._distribution_strategy = None - - self._cluster_coordinator = None - - # Defaults to value of `tf.config.experimental_functions_run_eagerly`. - self._run_eagerly = None - # Initialize cache attrs. - self._reset_compile_cache() - - # Fault-tolerance handler. Set in `ModelCheckpoint`. - self._training_state = None - self._saved_model_inputs_spec = None - self._saved_model_arg_spec = None - self._checkpoint = tf.train.Checkpoint(root=weakref.ref(self)) - - self._steps_per_execution = None - - self._init_batch_counters() - self._base_model_initialized = True - - # `jit_compile` starts off with None as default and gets overwritten by the - # value specified in `Model.compile`, and this is effective for `fit`, - # `evaluate`, and `predict`. - self._jit_compile = None - - self._layout_map = layout_map_lib.get_current_layout_map() - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _init_batch_counters(self): - # Untracked Variables, used to keep track of mini-batches seen in `fit`, - # `evaluate`, and `predict`. - agg = tf.VariableAggregation.ONLY_FIRST_REPLICA - self._train_counter = tf.Variable(0, dtype='int64', aggregation=agg) - self._test_counter = tf.Variable(0, dtype='int64', aggregation=agg) - self._predict_counter = tf.Variable( - 0, dtype='int64', aggregation=agg) - - def __setattr__(self, name, value): - if not getattr(self, '_self_setattr_tracking', True): - super().__setattr__(name, value) - return - - if all( - isinstance(v, (base_layer.Layer, tf.Variable)) or - base_layer_utils.has_weights(v) for v in tf.nest.flatten(value)): - try: - self._base_model_initialized - except AttributeError: - raise RuntimeError( - 'It looks like you are subclassing `Model` and you ' - 'forgot to call `super().__init__()`.' - ' Always start with this line.') - - super().__setattr__(name, value) - - def __reduce__(self): - if self.built: - return (pickle_utils.deserialize_model_from_bytecode, - pickle_utils.serialize_model_as_bytecode(self)) - else: - # SavedModel (and hence serialize_model_as_bytecode) only support - # built models, but if the model is not built, - # it may be possible to serialize as a plain Python object, - # as long as the constituent parts (layers, optimizers, losses, etc.) - # can be serialized as plain Python objects. - # Thus we call up the superclass hierarchy to get an implementation of - # __reduce__ that can pickle this Model as a plain Python object. 
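The comment above describes the two pickling paths: a built model round-trips through its SavedModel bytecode, while an unbuilt one falls back to plain Python-object pickling. A sketch of the user-facing behavior, assuming a TF/Keras version with this pickle support:

```python
import copy
import pickle

import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])
model(np.zeros((1, 3), dtype="float32"))  # build the model first

restored = pickle.loads(pickle.dumps(model))  # SavedModel-bytecode path
clone = copy.deepcopy(model)                  # uses the same machinery
print(restored.built, clone.built)            # True True
```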
- return super().__reduce__() - - def __deepcopy__(self, memo): - if self.built: - new = pickle_utils.deserialize_model_from_bytecode( - *pickle_utils.serialize_model_as_bytecode(self)) - memo[id(self)] = new - else: - # See comment in __reduce__ for explanation - deserializer, serialized, *rest = super().__reduce__() - new = deserializer(*serialized) - memo[id(self)] = new - if rest: - state = copy.deepcopy(rest[0], memo=memo) - new.__setstate__(state) - return new - - def __copy__(self): - return self.__deepcopy__({}) - - @generic_utils.default - def build(self, input_shape): - """Builds the model based on input shapes received. - - This is to be used for subclassed models, which do not know at instantiation - time what their inputs look like. - - This method only exists for users who want to call `model.build()` in a - standalone way (as a substitute for calling the model on real data to - build it). It will never be called by the framework (and thus it will - never throw unexpected errors in an unrelated workflow). + """A model grouping layers into an object with training/inference features. Args: - input_shape: Single tuple, `TensorShape` instance, or list/dict of shapes, - where shapes are tuples, integers, or `TensorShape` instances. + inputs: The input(s) of the model: a `keras.Input` object or a + combination of `keras.Input` objects in a dict, list or tuple. + outputs: The output(s) of the model: a tensor that originated from + `keras.Input` objects or a combination of such tensors in a dict, + list or tuple. See Functional API example below. + name: String, the name of the model. - Raises: - ValueError: - 1. In case of invalid user-provided data (not of type tuple, - list, `TensorShape`, or dict). - 2. If the model requires call arguments that are agnostic - to the input shapes (positional or keyword arg in call signature). - 3. If not all layers were properly built. - 4. If float type inputs are not supported within the layers. - - In each of these cases, the user should build their model by calling it - on real tensor data. - """ - if self._is_graph_network: - super().build(input_shape) - return - - if input_shape is None: - raise ValueError('Input shape must be defined when calling `build()` on ' - 'a `Model` subclass.') - valid_types = (tuple, list, tf.TensorShape, dict) - if not isinstance(input_shape, valid_types): - raise ValueError('Specified input shape is not one of the valid types. ' - 'Please specify a batch input shape of type tuple or ' - 'list of input shapes. User provided ' - 'input type: {}.'.format(type(input_shape))) - - if input_shape and not self.inputs: - # We create placeholders for the `None`s in the shape and build the model - # in a Graph. Since tf.Variable is compatible with both eager execution - # and graph building, the variables created after building the model in - # a Graph are still valid when executing eagerly. 
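The `build()` body in this hunk creates placeholders for the `None` dimensions and traces `call()` in a graph, so variables exist before any real data is seen. A minimal sketch of the standalone-`build()` workflow this enables for a subclassed model:

```python
import tensorflow as tf

class TwoLayer(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.d1 = tf.keras.layers.Dense(4)
        self.d2 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        return self.d2(self.d1(inputs))

model = TwoLayer()
model.build((None, 8))  # `None` is the batch dim; a placeholder is generated
print(model.built, len(model.weights))  # True 4
```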
- if tf.executing_eagerly(): - graph = tf.__internal__.FuncGraph('build_graph') - else: - graph = backend.get_graph() - with graph.as_default(): - if (isinstance(input_shape, list) and - all(d is None or isinstance(d, int) for d in input_shape)): - input_shape = tuple(input_shape) - if isinstance(input_shape, list): - x = [base_layer_utils.generate_placeholders_from_shape(shape) - for shape in input_shape] - elif isinstance(input_shape, dict): - x = { - k: base_layer_utils.generate_placeholders_from_shape(shape) - for k, shape in input_shape.items() - } - else: - x = base_layer_utils.generate_placeholders_from_shape(input_shape) - - kwargs = {} - call_signature = self._call_spec.full_argspec - call_args = call_signature.args - # Exclude `self`, `inputs`, and any argument with a default value. - if len(call_args) > 2: - if call_signature.defaults: - call_args = call_args[2:-len(call_signature.defaults)] - else: - call_args = call_args[2:] - for arg in call_args: - if arg == 'training': - # Case where `training` is a positional arg with no default. - kwargs['training'] = False - else: - # Has invalid call signature with unknown positional arguments. - raise ValueError( - 'Currently, you cannot build your model if it has ' - 'positional or keyword arguments that are not ' - 'inputs to the model, but are required for its ' - '`call()` method. Instead, in order to instantiate ' - 'and build your model, `call()` your model on real ' - 'tensor data with all expected call arguments. The argument ' - 'for `call()` can be a single list/tuple that contains ' - 'multiple inputs.') - elif len(call_args) < 2: - # Signature without `inputs`. - raise ValueError( - 'You can only call `build()` on a model if its `call()` ' - 'method accepts an `inputs` argument.') - try: - self.call(x, **kwargs) - except (tf.errors.InvalidArgumentError, TypeError) as e: - raise ValueError('You cannot build your model by calling `build` ' - 'if your layers do not support float type inputs. ' - 'Instead, in order to instantiate and build your ' - 'model, call your model on real tensor data (of ' - 'the correct dtype).\n\nThe actual error from ' - f'`call` is: {e}.') - super().build(input_shape) - - @traceback_utils.filter_traceback - def __call__(self, *args, **kwargs): - if self._layout_map is not None and not self.built: - # Note that this method is only overridden for DTensor and layout - # injection purpose. - # Capture the inputs and create graph input as replacement for model - # to initialize its weights first. - copied_args = copy.copy(args) - copied_kwargs = copy.copy(kwargs) - - inputs, copied_args, copied_kwargs = self._call_spec.split_out_first_arg( - copied_args, copied_kwargs) - - def _convert_to_graph_inputs(x): - if isinstance(x, (tf.Tensor, np.ndarray, float, int)): - x = tf.convert_to_tensor(x) - return input_layer_module.Input(x.shape) - - # TODO(scottzhu): maybe better handle mask and training flag. - inputs = tf.nest.map_structure(_convert_to_graph_inputs, inputs) - copied_args = tf.nest.map_structure(_convert_to_graph_inputs, copied_args) - copied_kwargs = tf.nest.map_structure( - _convert_to_graph_inputs, copied_kwargs) - - # pylint: disable=g-import-not-at-top - with layout_map_lib.layout_map_scope(self._layout_map): - # We ignore the result here. 
- super().__call__(inputs, *copied_args, **copied_kwargs) - - layout_map_lib._map_subclass_model_variable(self, self._layout_map) - - return super().__call__(*args, **kwargs) - - @doc_controls.doc_in_current_and_subclasses - def call(self, inputs, training=None, mask=None): - """Calls the model on new inputs and returns the outputs as tensors. - - In this case `call()` just reapplies - all ops in the graph to the new inputs - (e.g. build a new computational graph from the provided inputs). - - Note: This method should not be called directly. It is only meant to be - overridden when subclassing `tf.keras.Model`. - To call a model on an input, always use the `__call__()` method, - i.e. `model(inputs)`, which relies on the underlying `call()` method. + There are two ways to instantiate a `Model`: - Args: - inputs: Input tensor, or dict/list/tuple of input tensors. - training: Boolean or boolean scalar tensor, indicating whether to run - the `Network` in training mode or inference mode. - mask: A mask or list of masks. A mask can be either a boolean tensor or - None (no mask). For more details, check the guide - [here](https://www.tensorflow.org/guide/keras/masking_and_padding). - - Returns: - A tensor if there is a single output, or - a list of tensors if there are more than one outputs. - """ - raise NotImplementedError('Unimplemented `tf.keras.Model.call()`: if you ' - 'intend to create a `Model` with the Functional ' - 'API, please provide `inputs` and `outputs` ' - 'arguments. Otherwise, subclass `Model` with an ' - 'overridden `call()` method.') - - @traceback_utils.filter_traceback - def compile(self, - optimizer='rmsprop', - loss=None, - metrics=None, - loss_weights=None, - weighted_metrics=None, - run_eagerly=None, - steps_per_execution=None, - jit_compile=None, - **kwargs): - """Configures the model for training. - - Example: + 1 - With the "Functional API", where you start from `Input`, + you chain layer calls to specify the model's forward pass, + and finally you create your model from inputs and outputs: ```python - model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), - loss=tf.keras.losses.BinaryCrossentropy(), - metrics=[tf.keras.metrics.BinaryAccuracy(), - tf.keras.metrics.FalseNegatives()]) - ``` + import tensorflow as tf - Args: - optimizer: String (name of optimizer) or optimizer instance. See - `tf.keras.optimizers`. - loss: Loss function. May be a string (name of loss function), or - a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss - function is any callable with the signature `loss = fn(y_true, - y_pred)`, where `y_true` are the ground truth values, and - `y_pred` are the model's predictions. - `y_true` should have shape - `(batch_size, d0, .. dN)` (except in the case of - sparse loss functions such as - sparse categorical crossentropy which expects integer arrays of shape - `(batch_size, d0, .. dN-1)`). - `y_pred` should have shape `(batch_size, d0, .. dN)`. - The loss function should return a float tensor. - If a custom `Loss` instance is - used and reduction is set to `None`, return value has shape - `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss - values; otherwise, it is a scalar. If the model has multiple outputs, - you can use a different loss on each output by passing a dictionary - or a list of losses. The loss value that will be minimized by the - model will then be the sum of all individual losses, unless - `loss_weights` is specified. 
- metrics: List of metrics to be evaluated by the model during training - and testing. Each of this can be a string (name of a built-in - function), function or a `tf.keras.metrics.Metric` instance. See - `tf.keras.metrics`. Typically you will use `metrics=['accuracy']`. A - function is any callable with the signature `result = fn(y_true, - y_pred)`. To specify different metrics for different outputs of a - multi-output model, you could also pass a dictionary, such as - `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`. - You can also pass a list to specify a metric or a list of metrics - for each output, such as `metrics=[['accuracy'], ['accuracy', 'mse']]` - or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the - strings 'accuracy' or 'acc', we convert this to one of - `tf.keras.metrics.BinaryAccuracy`, - `tf.keras.metrics.CategoricalAccuracy`, - `tf.keras.metrics.SparseCategoricalAccuracy` based on the loss - function used and the model output shape. We do a similar - conversion for the strings 'crossentropy' and 'ce' as well. - The metrics passed here are evaluated without sample weighting; if you - would like sample weighting to apply, you can specify your - metrics via the `weighted_metrics` argument instead. - loss_weights: Optional list or dictionary specifying scalar coefficients - (Python floats) to weight the loss contributions of different model - outputs. The loss value that will be minimized by the model will then - be the *weighted sum* of all individual losses, weighted by the - `loss_weights` coefficients. - If a list, it is expected to have a 1:1 mapping to the model's - outputs. If a dict, it is expected to map output names (strings) - to scalar coefficients. - weighted_metrics: List of metrics to be evaluated and weighted by - `sample_weight` or `class_weight` during training and testing. - run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s - logic will not be wrapped in a `tf.function`. Recommended to leave - this as `None` unless your `Model` cannot be run inside a - `tf.function`. `run_eagerly=True` is not supported when using - `tf.distribute.experimental.ParameterServerStrategy`. - steps_per_execution: Int. Defaults to 1. The number of batches to run - during each `tf.function` call. Running multiple batches inside a - single `tf.function` call can greatly improve performance on TPUs or - small models with a large Python overhead. At most, one full epoch - will be run each execution. If a number larger than the size of the - epoch is passed, the execution will be truncated to the size of the - epoch. Note that if `steps_per_execution` is set to `N`, - `Callback.on_batch_begin` and `Callback.on_batch_end` methods will - only be called every `N` batches (i.e. before/after each `tf.function` - execution). - jit_compile: If `True`, compile the model training step with XLA. - [XLA](https://www.tensorflow.org/xla) is an optimizing compiler for - machine learning. - `jit_compile` is not enabled for by default. - This option cannot be enabled with `run_eagerly=True`. - Note that `jit_compile=True` - may not necessarily work for all models. - For more information on supported operations please refer to the - [XLA documentation](https://www.tensorflow.org/xla). - Also refer to - [known XLA issues](https://www.tensorflow.org/xla/known_issues) for - more details. - **kwargs: Arguments supported for backwards compatibility only. 
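The argument descriptions above cover per-output losses, `loss_weights`, and `steps_per_execution`. A short sketch tying them together for a two-output functional model (the output names `a`/`b` are arbitrary):

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(3,))
out_a = tf.keras.layers.Dense(1, name="a")(inputs)
out_b = tf.keras.layers.Dense(4, name="b", activation="softmax")(inputs)
model = tf.keras.Model(inputs, [out_a, out_b])

model.compile(
    optimizer="rmsprop",
    loss={"a": "mse", "b": "categorical_crossentropy"},  # per-output losses
    loss_weights={"a": 1.0, "b": 0.2},  # minimized value is the weighted sum
    metrics={"a": ["mae"], "b": ["accuracy"]},
    steps_per_execution=4,  # run 4 batches per tf.function call
)
```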
- """ - base_layer.keras_api_gauge.get_cell('compile').set(True) - with self.distribute_strategy.scope(): - if 'experimental_steps_per_execution' in kwargs: - logging.warning('The argument `steps_per_execution` is no longer ' - 'experimental. Pass `steps_per_execution` instead of ' - '`experimental_steps_per_execution`.') - if not steps_per_execution: - steps_per_execution = kwargs.pop('experimental_steps_per_execution') - - # When compiling from an already-serialized model, we do not want to - # reapply some processing steps (e.g. metric renaming for multi-output - # models, which have prefixes added for each corresponding output name). - from_serialized = kwargs.pop('from_serialized', False) - - self._validate_compile(optimizer, metrics, **kwargs) - self._run_eagerly = run_eagerly - - self.optimizer = self._get_optimizer(optimizer) - if isinstance(loss, compile_utils.LossesContainer): - self.compiled_loss = loss - else: - self.compiled_loss = compile_utils.LossesContainer( - loss, loss_weights, output_names=self.output_names) - self.compiled_metrics = compile_utils.MetricsContainer( - metrics, weighted_metrics, output_names=self.output_names, - from_serialized=from_serialized) - - self._configure_steps_per_execution(steps_per_execution or 1) - - # Initializes attrs that are reset each time `compile` is called. - self._reset_compile_cache() - self._is_compiled = True - self.loss = loss or {} - if (self._run_eagerly or self.dynamic) and jit_compile: - raise ValueError( - 'You cannot enable `run_eagerly` and `jit_compile` ' - 'at the same time.') - else: - self._jit_compile = jit_compile - - def _get_optimizer(self, optimizer): - """Wraps `optimizer` in `LossScaleOptimizer` if necessary.""" - def _get_single_optimizer(opt): - opt = optimizers.get(opt) - if (self.dtype_policy.name == 'mixed_float16' and - not isinstance(opt, lso.LossScaleOptimizer)): - # Loss scaling is necessary with mixed_float16 for models to converge to - # the same accuracy as with float32. - opt = lso.LossScaleOptimizer(opt) - return opt - - return tf.nest.map_structure(_get_single_optimizer, optimizer) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _reset_compile_cache(self): - self.train_function = None - self.test_function = None - self.predict_function = None - # Used to cache the `tf.function`'ed `train_function` to be logged in - # TensorBoard, since the original `train_function` is not necessarily - # a `tf.function` (e.g., with ParameterServerStrategy, the `train_function` - # is a scheduling of the actual training function to a remote worker). - self.train_tf_function = None - - # Used to cache `trainable` attr of `Layer`s for `fit`. - self._compiled_trainable_state = self._get_trainable_state() - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _configure_steps_per_execution(self, steps_per_execution): - self._steps_per_execution = tf.Variable( - steps_per_execution, - dtype='int64', - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) - - @property - def _should_compute_mask(self): - return False + inputs = tf.keras.Input(shape=(3,)) + x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs) + outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) + model = tf.keras.Model(inputs=inputs, outputs=outputs) + ``` - @property - def metrics(self): - """Returns the model's metrics added using `compile()`, `add_metric()` APIs. - - Note: Metrics passed to `compile()` are available only after a `keras.Model` - has been trained/evaluated on actual data. 
-
-    Examples:
-
-    >>> inputs = tf.keras.layers.Input(shape=(3,))
-    >>> outputs = tf.keras.layers.Dense(2)(inputs)
-    >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
-    >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"])
-    >>> [m.name for m in model.metrics]
-    []
-
-    >>> x = np.random.random((2, 3))
-    >>> y = np.random.randint(0, 2, (2, 2))
-    >>> model.fit(x, y)
-    >>> [m.name for m in model.metrics]
-    ['loss', 'mae']
-
-    >>> inputs = tf.keras.layers.Input(shape=(3,))
-    >>> d = tf.keras.layers.Dense(2, name='out')
-    >>> output_1 = d(inputs)
-    >>> output_2 = d(inputs)
-    >>> model = tf.keras.models.Model(
-    ...    inputs=inputs, outputs=[output_1, output_2])
-    >>> model.add_metric(
-    ...    tf.reduce_sum(output_2), name='mean', aggregation='mean')
-    >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"])
-    >>> model.fit(x, (y, y))
-    >>> [m.name for m in model.metrics]
-    ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae',
-    'out_1_acc', 'mean']
+    Note: Only dicts, lists, and tuples of input tensors are supported. Nested
+    inputs are not supported (e.g. lists of lists or dicts of dicts).
-    """
-    metrics = []
-    if self._is_compiled:
-      # TODO(omalleyt): Track `LossesContainer` and `MetricsContainer` objects
-      # so that attr names are not load-bearing.
-      if self.compiled_loss is not None:
-        metrics += self.compiled_loss.metrics
-      if self.compiled_metrics is not None:
-        metrics += self.compiled_metrics.metrics
-
-    for l in self._flatten_layers():
-      metrics.extend(l._metrics)  # pylint: disable=protected-access
-    return metrics
-
-  @property
-  def metrics_names(self):
-    """Returns the model's display labels for all outputs.
-
-    Note: `metrics_names` are available only after a `keras.Model` has been
-    trained/evaluated on actual data.
-
-    Examples:
-
-    >>> inputs = tf.keras.layers.Input(shape=(3,))
-    >>> outputs = tf.keras.layers.Dense(2)(inputs)
-    >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
-    >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"])
-    >>> model.metrics_names
-    []
-
-    >>> x = np.random.random((2, 3))
-    >>> y = np.random.randint(0, 2, (2, 2))
-    >>> model.fit(x, y)
-    >>> model.metrics_names
-    ['loss', 'mae']
-
-    >>> inputs = tf.keras.layers.Input(shape=(3,))
-    >>> d = tf.keras.layers.Dense(2, name='out')
-    >>> output_1 = d(inputs)
-    >>> output_2 = d(inputs)
-    >>> model = tf.keras.models.Model(
-    ...    inputs=inputs, outputs=[output_1, output_2])
-    >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"])
-    >>> model.fit(x, (y, y))
-    >>> model.metrics_names
-    ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae',
-    'out_1_acc']
+    A new Functional API model can also be created by using the
+    intermediate tensors. This enables you to quickly extract sub-components
+    of the model.
-    """
-
-    # This property includes all output names including `loss` and per-output
-    # losses for backward compatibility.
-    return [m.name for m in self.metrics]
+    Example:
-
-  @property
-  def distribute_strategy(self):
-    """The `tf.distribute.Strategy` this model was created under."""
-    return self._distribution_strategy or tf.distribute.get_strategy()
+    ```python
+    inputs = keras.Input(shape=(None, None, 3))
+    processed = keras.layers.RandomCrop(width=32, height=32)(inputs)
+    conv = keras.layers.Conv2D(filters=2, kernel_size=3)(processed)
+    pooling = keras.layers.GlobalAveragePooling2D()(conv)
+    feature = keras.layers.Dense(10)(pooling)
+
+    full_model = keras.Model(inputs, feature)
+    backbone = keras.Model(processed, conv)
+    activations = keras.Model(conv, feature)
+    ```
-  @property
-  def run_eagerly(self):
-    """Settable attribute indicating whether the model should run eagerly.
+    Note that the `backbone` and `activations` models are not
+    created with `keras.Input` objects, but with the tensors that originate
+    from `keras.Input` objects. Under the hood, the layers and weights will
+    be shared across these models, so that users can train the `full_model`,
+    and use `backbone` or `activations` for feature extraction.
+    The inputs and outputs of the model can be nested structures of tensors
+    as well, and the created models are standard Functional API models that
+    support all the existing APIs.
-    Running eagerly means that your model will be run step by step,
-    like Python code. Your model might run slower, but it should become easier
-    for you to debug it by stepping into individual layer calls.
+    2 - By subclassing the `Model` class: in that case, you should define your
+    layers in `__init__()` and you should implement the model's forward pass
+    in `call()`.
-    By default, we will attempt to compile your model to a static graph to
-    deliver the best execution performance.
+    ```python
+    import tensorflow as tf
-    Returns:
-      Boolean, whether the model should run eagerly.
-    """
-    if self.dynamic and self._run_eagerly is False:  # pylint:disable=g-bool-id-comparison
-      # TODO(fchollet): consider using py_func to enable this.
-      raise ValueError('Your model contains layers that can only be '
-                       'successfully run in eager execution (layers '
-                       'constructed with `dynamic=True`). '
-                       'You cannot set `run_eagerly=False`.')
-
-    if self._cluster_coordinator and self._run_eagerly:
-      raise ValueError('When using `Model` with `ParameterServerStrategy`, '
-                       '`run_eagerly` is not supported.')
-
-    # Run eagerly logic, by priority:
-    # (1) Dynamic models must be run eagerly.
-    # (2) Explicitly setting run_eagerly causes a Model to be run eagerly.
-    # (3) Not explicitly setting run_eagerly defaults to TF's global setting.
-    return (self.dynamic or self._run_eagerly or
-            (tf.config.functions_run_eagerly() and
-             self._run_eagerly is None))
-
-  @run_eagerly.setter
-  def run_eagerly(self, value):
-    self._run_eagerly = value
-
-  def _validate_target_and_loss(self, y, loss):
-    """Raises error if target or loss is not found.
-
-    This method verifies that the target and loss are properly populated
-    when applicable, or raises errors.
+    class MyModel(tf.keras.Model):
-    Args:
-      y: the target for training.
-      loss: the total loss tensor including loss added via `compile` and
-        `add_loss`.
-    """
+      def __init__(self):
+        super().__init__()
+        self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu)
+        self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)
-    # `self.loss` references the loss added via `compile` call. If users have
-    # provided such, the target must be provided; otherwise it's a user error.
- # Note that `self.loss` does not include losses added via `add_loss`, and it - # is a valid use when such loss from `add_loss` exists and target does not. - if self.loss and y is None: - raise ValueError( - 'Target data is missing. Your model was compiled with ' - f'loss={self.loss}, ' - 'and therefore expects target data to be provided in `fit()`.') - - # For training, there must be compiled loss or regularization loss to exist - # in order to apply the gradients. If one is not found, it means no loss - # was supplied via `compile` or `add_loss`. - elif loss is None: - raise ValueError( - 'No loss found. You may have forgotten to provide a `loss` argument ' - 'in the `compile()` method.') - - def train_step(self, data): - """The logic for one training step. - - This method can be overridden to support custom training logic. - For concrete examples of how to override this method see - [Customizing what happends in fit](https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit). - This method is called by `Model.make_train_function`. - - This method should contain the mathematical logic for one step of training. - This typically includes the forward pass, loss calculation, backpropagation, - and metric updates. - - Configuration details for *how* this logic is run (e.g. `tf.function` and - `tf.distribute.Strategy` settings), should be left to - `Model.make_train_function`, which can also be overridden. + def call(self, inputs): + x = self.dense1(inputs) + return self.dense2(x) - Args: - data: A nested structure of `Tensor`s. + model = MyModel() + ``` - Returns: - A `dict` containing values that will be passed to - `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the - values of the `Model`'s metrics are returned. Example: - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - # Run forward pass. - with tf.GradientTape() as tape: - y_pred = self(x, training=True) - loss = self.compute_loss(x, y, y_pred, sample_weight) - self._validate_target_and_loss(y, loss) - # Run backwards pass. - self.optimizer.minimize(loss, self.trainable_variables, tape=tape) - return self.compute_metrics(x, y, y_pred, sample_weight) - - def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): - """Compute the total loss, validate it, and return it. - - Subclasses can optionally override this method to provide custom loss - computation logic. 
+    If you subclass `Model`, you can optionally have
+    a `training` argument (boolean) in `call()`, which you can use to specify
+    a different behavior in training and inference:
-    Example:
     ```python
-    class MyModel(tf.keras.Model):
+    import tensorflow as tf
-      def __init__(self, *args, **kwargs):
-        super(MyModel, self).__init__(*args, **kwargs)
-        self.loss_tracker = tf.keras.metrics.Mean(name='loss')
-
-      def compute_loss(self, x, y, y_pred, sample_weight):
-        loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y))
-        loss += tf.add_n(self.losses)
-        self.loss_tracker.update_state(loss)
-        return loss
-
-      def reset_metrics(self):
-        self.loss_tracker.reset_states()
-
-      @property
-      def metrics(self):
-        return [self.loss_tracker]
+    class MyModel(tf.keras.Model):
-    tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,))
-    dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1)
+      def __init__(self):
+        super().__init__()
+        self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu)
+        self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)
+        self.dropout = tf.keras.layers.Dropout(0.5)
-    inputs = tf.keras.layers.Input(shape=(10,), name='my_input')
-    outputs = tf.keras.layers.Dense(10)(inputs)
-    model = MyModel(inputs, outputs)
-    model.add_loss(tf.reduce_sum(outputs))
+      def call(self, inputs, training=False):
+        x = self.dense1(inputs)
+        if training:
+          x = self.dropout(x, training=training)
+        return self.dense2(x)
-    optimizer = tf.keras.optimizers.SGD()
-    model.compile(optimizer, loss='mse', steps_per_execution=10)
-    model.fit(dataset, epochs=2, steps_per_epoch=10)
-    print('My custom loss: ', model.loss_tracker.result().numpy())
+    model = MyModel()
     ```
-    Args:
-      x: Input data.
-      y: Target data.
-      y_pred: Predictions returned by the model (output of `model(x)`)
-      sample_weight: Sample weights for weighting the loss function.
-
-    Returns:
-      The total loss as a `tf.Tensor`, or `None` if no loss results (which is
-      the case when called by `Model.test_step`).
+    Once the model is created, you can configure the model with losses and
+    metrics with `model.compile()`, train the model with `model.fit()`, or
+    use the model to make predictions with `model.predict()`.
     """
-    del x  # The default implementation does not use `x`.
-    return self.compiled_loss(
-        y, y_pred, sample_weight, regularization_losses=self.losses)
-
-  def compute_metrics(self, x, y, y_pred, sample_weight):
-    """Update metric states and collect all metrics to be returned.
+    _TF_MODULE_IGNORED_PROPERTIES = frozenset(
+        itertools.chain(
+            (
+                "_train_counter",
+                "_test_counter",
+                "_predict_counter",
+                "_steps_per_execution",
+                "_compiled_trainable_state",
+            ),
+            base_layer.Layer._TF_MODULE_IGNORED_PROPERTIES,
+        )
+    )
+    _SCALAR_UPRANKING_ON = False
-
-    Subclasses can optionally override this method to provide custom metric
-    updating and collection logic.
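The class docstring above closes by pointing at the `compile()` / `fit()` / `predict()` workflow. As a minimal end-to-end sketch of that workflow (illustrative only, not part of this patch: it reuses the subclassed `MyModel` with the `training`-aware `call()` from the example above, and the data shapes and hyperparameters are assumptions):

```python
import numpy as np
import tensorflow as tf

model = MyModel()  # the subclassed example defined in the docstring above

# Configure training: optimizer, loss, and metrics.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Train on random placeholder data. The feature width (8) is arbitrary;
# dense1 builds against it. Labels are ints in [0, 5) to match the
# 5-unit softmax head.
x = np.random.random((32, 8)).astype("float32")
y = np.random.randint(0, 5, size=(32,))
model.fit(x, y, epochs=2, batch_size=8)

# Run inference: one row of 5 probabilities per sample.
preds = model.predict(x)
print(preds.shape)  # (32, 5)
```

Because `call()` declares a `training` argument, `fit()` runs the dropout branch (`training=True`) while `predict()` skips it (`training=False`).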
+ def __new__(cls, *args, **kwargs): + # Signature detection + if is_functional_model_init_params(args, kwargs) and cls == Model: + # Functional model + from keras.engine import functional - Example: - ```python - class MyModel(tf.keras.Sequential): + return functional.Functional(skip_init=True, *args, **kwargs) + else: + return super(Model, cls).__new__(cls, *args, **kwargs) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + @traceback_utils.filter_traceback + def __init__(self, *args, **kwargs): + self._is_model_for_instrumentation = True + base_layer.keras_api_gauge.get_cell("model").set(True) + + # Special case for Subclassed Functional Model, which we couldn't detect + # when __new__ is called. We only realize it is a functional model when + # it calls super.__init__ with input and output tensor. + from keras.engine import functional + + if is_functional_model_init_params(args, kwargs) and not isinstance( + self, functional.Functional + ): + # Filter the kwargs for multiple inheritance. + supported_kwargs = [ + "inputs", + "outputs", + "name", + "trainable", + "skip_init", + ] + model_kwargs = { + k: kwargs[k] for k in kwargs if k in supported_kwargs + } + other_kwargs = { + k: kwargs[k] for k in kwargs if k not in supported_kwargs + } + inject_functional_model_class(self.__class__) + functional.Functional.__init__(self, *args, **model_kwargs) + + # In case there is any multiple inheritance here, we need to call + # the __init__ for any class that appears after the Functional + # class. + clz_to_init = [] + found_functional_class = False + for clz in self.__class__.__bases__: + if issubclass(clz, functional.Functional): + found_functional_class = True + continue + if found_functional_class: + clz_to_init.append(clz) + + if clz_to_init: + for clz in clz_to_init: + clz.__init__(self, *args, **other_kwargs) + elif other_kwargs: + # In case there are unused kwargs, we should raise an error to + # user, in case they have a typo in the param name. + raise TypeError( + "The following keyword arguments passed to `Model` aren't " + "supported: {}.".format(other_kwargs) + ) + return + + base_layer.keras_api_gauge.get_cell("Model subclass").set(True) + # The following are implemented as property functions: + # self.trainable_weights + # self.non_trainable_weights + # `inputs` / `outputs` will only appear in kwargs if either are + # misspelled. + generic_utils.validate_kwargs( + kwargs, + { + "trainable", + "dtype", + "dynamic", + "name", + "autocast", + "inputs", + "outputs", + }, + ) + super().__init__(**kwargs) + # By default, Model is a subclass model, which is not in graph network. + self._is_graph_network = False + + self.inputs = None + self.outputs = None + self.input_names = None + self.output_names = None + # stop_training is used by callback to stop training when error happens + self.stop_training = False + self.history = None + # These objects are used in the default `Model.compile`. They are not + # guaranteed to be set after `Model.compile` is called, as users can + # override compile with custom logic. + self.compiled_loss = None + self.compiled_metrics = None + + # This is True for Sequential networks and Functional networks. + self._compute_output_and_mask_jointly = False + + # Don't reset compilation if already done. This may occur if calling + # `__init__` (or `_init_graph_network`) on an already-compiled model + # such as a Sequential model. Sequential models may need to rebuild + # themselves after compilation. 
+ self._maybe_create_attribute("_is_compiled", False) + self._maybe_create_attribute("optimizer", None) + + # Model must be created under scope of DistStrat it will be trained + # with. + if tf.distribute.has_strategy(): + self._distribution_strategy = tf.distribute.get_strategy() + else: + self._distribution_strategy = None + self._distribute_reduction_method = None + + self._cluster_coordinator = None + + # Defaults to value of `tf.config.experimental_functions_run_eagerly`. + self._run_eagerly = None + # Initialize cache attrs. + self._reset_compile_cache() + + # Fault-tolerance handler. Set in `ModelCheckpoint`. + self._training_state = None + self._saved_model_inputs_spec = None + self._saved_model_arg_spec = None + self._checkpoint = tf.train.Checkpoint(root=weakref.ref(self)) + + self._steps_per_execution = None + self._steps_per_execution_tuner = None + self._autotune_steps_per_execution = False + + self._layout_map = layout_map_lib.get_current_layout_map() + + self._init_batch_counters() + self._base_model_initialized = True + + # `jit_compile` starts off with None as default and gets overwritten by + # the value specified in `Model.compile`, and this is effective for + # `fit`, `evaluate`, and `predict`. + self._jit_compile = None + + def _create_counter_variable(self, init_value): + """Helper function for counter variable creation. + + For the DTensor use case with layout map, since the variable are not + tracked by model, they can't be visited by the layout map, and need to + be properly initialized as DVariable. + """ + # This function should be removed after we move to the strategy based + # implementation for DTensor. + if self._layout_map is None: + agg = tf.VariableAggregation.ONLY_FIRST_REPLICA + return tf.Variable(init_value, dtype="int64", aggregation=agg) + else: + layout = dtensor_api.Layout.replicated( + mesh=self._layout_map.get_default_mesh(), rank=0 + ) + return dtensor_api.DVariable( + init_value, dtype="int64", layout=layout + ) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _init_batch_counters(self): + # Untracked Variables, used to keep track of mini-batches seen in `fit`, + # `evaluate`, and `predict`. + if not tf.inside_function(): + # Creating variables inside tf.function is not allowed, hence + # these would otherwise prevent users from creating Keras layers + # inside tf.function. + # These variables are not connected to outputs so they have no + # effect on graph generation anyway. + + self._train_counter = self._create_counter_variable(0) + self._test_counter = self._create_counter_variable(0) + self._predict_counter = self._create_counter_variable(0) + + def __setattr__(self, name, value): + if not getattr(self, "_self_setattr_tracking", True): + super().__setattr__(name, value) + return + + if all( + isinstance(v, (base_layer.Layer, tf.Variable)) + or base_layer_utils.has_weights(v) + for v in tf.nest.flatten(value) + ): + try: + self._base_model_initialized + except AttributeError: + raise RuntimeError( + "It looks like you are subclassing `Model` and you " + "forgot to call `super().__init__()`." + " Always start with this line." 
+ ) + + super().__setattr__(name, value) + + def __reduce__(self): + if self.built: + return ( + pickle_utils.deserialize_model_from_bytecode, + (pickle_utils.serialize_model_as_bytecode(self),), + ) + else: + # SavedModel (and hence serialize_model_as_bytecode) only support + # built models, but if the model is not built, + # it may be possible to serialize as a plain Python object, + # as long as the constituent parts (layers, optimizers, losses, + # etc.) can be serialized as plain Python objects. Thus we call up + # the superclass hierarchy to get an implementation of __reduce__ + # that can pickle this Model as a plain Python object. + return super().__reduce__() + + def __deepcopy__(self, memo): + if self.built: + new = pickle_utils.deserialize_model_from_bytecode( + pickle_utils.serialize_model_as_bytecode(self) + ) + memo[id(self)] = new + else: + # See comment in __reduce__ for explanation + deserializer, serialized, *rest = super().__reduce__() + new = deserializer(*serialized) + memo[id(self)] = new + if rest: + state = copy.deepcopy(rest[0], memo=memo) + new.__setstate__(state) + return new + + def __copy__(self): + return self.__deepcopy__({}) + + @generic_utils.default + def build(self, input_shape): + """Builds the model based on input shapes received. + + This is to be used for subclassed models, which do not know at + instantiation time what their inputs look like. + + This method only exists for users who want to call `model.build()` in a + standalone way (as a substitute for calling the model on real data to + build it). It will never be called by the framework (and thus it will + never throw unexpected errors in an unrelated workflow). + + Args: + input_shape: Single tuple, `TensorShape` instance, or list/dict of + shapes, where shapes are tuples, integers, or `TensorShape` + instances. + + Raises: + ValueError: + 1. In case of invalid user-provided data (not of type tuple, + list, `TensorShape`, or dict). + 2. If the model requires call arguments that are agnostic + to the input shapes (positional or keyword arg in call + signature). + 3. If not all layers were properly built. + 4. If float type inputs are not supported within the layers. + + In each of these cases, the user should build their model by calling + it on real tensor data. + """ + if self._is_graph_network: + super().build(input_shape) + return + + if input_shape is None: + raise ValueError( + "Input shape must be defined when calling `build()` on " + "a `Model` subclass." + ) + valid_types = (tuple, list, tf.TensorShape, dict) + if not isinstance(input_shape, valid_types): + raise ValueError( + "Specified input shape is not one of the valid types. " + "Please specify a batch input shape of type tuple or " + "list of input shapes. User provided " + "input type: {}.".format(type(input_shape)) + ) + + if input_shape and not self.inputs: + # We create placeholders for the `None`s in the shape and build the + # model in a Graph. Since tf.Variable is compatible with both eager + # execution and graph building, the variables created after building + # the model in a Graph are still valid when executing eagerly. 
+ if tf.executing_eagerly(): + graph = tf.__internal__.FuncGraph("build_graph") + else: + graph = backend.get_graph() + with graph.as_default(): + if isinstance(input_shape, list) and all( + d is None or isinstance(d, int) for d in input_shape + ): + input_shape = tuple(input_shape) + if isinstance(input_shape, list): + x = [ + base_layer_utils.generate_placeholders_from_shape(shape) + for shape in input_shape + ] + elif isinstance(input_shape, dict): + x = { + k: base_layer_utils.generate_placeholders_from_shape( + shape + ) + for k, shape in input_shape.items() + } + else: + x = base_layer_utils.generate_placeholders_from_shape( + input_shape + ) + + kwargs = {} + call_signature = self._call_spec.full_argspec + call_args = call_signature.args + # Exclude `self`, `inputs`, and any argument with a default + # value. + if len(call_args) > 2: + if call_signature.defaults: + call_args = call_args[2 : -len(call_signature.defaults)] + else: + call_args = call_args[2:] + for arg in call_args: + if arg == "training": + # Case where `training` is a positional arg with no + # default. + kwargs["training"] = False + else: + # Has invalid call signature with unknown positional + # arguments. + raise ValueError( + "Currently, you cannot build your model if it " + "has positional or keyword arguments that are " + "not inputs to the model, but are required for " + "its `call()` method. Instead, in order to " + "instantiate and build your model, `call()` " + "your model on real tensor data with all " + "expected call arguments. The argument " + "for `call()` can be a single list/tuple that " + "contains multiple inputs." + ) + elif len(call_args) < 2: + # Signature without `inputs`. + raise ValueError( + "You can only call `build()` on a model if its " + "`call()` method accepts an `inputs` argument." + ) + try: + self.call(x, **kwargs) + except (tf.errors.InvalidArgumentError, TypeError) as e: + raise ValueError( + "You cannot build your model by calling `build` " + "if your layers do not support float type inputs. " + "Instead, in order to instantiate and build your " + "model, call your model on real tensor data (of " + "the correct dtype).\n\nThe actual error from " + f"`call` is: {e}." + ) + super().build(input_shape) + + @traceback_utils.filter_traceback + def __call__(self, *args, **kwargs): + if self._layout_map is not None and not self.built: + # Note that this method is only overridden for DTensor and layout + # injection purpose. + # Capture the inputs and create graph input as replacement for model + # to initialize its weights first. + copied_args = copy.copy(args) + copied_kwargs = copy.copy(kwargs) + + ( + inputs, + copied_args, + copied_kwargs, + ) = self._call_spec.split_out_first_arg(copied_args, copied_kwargs) + + def _convert_to_graph_inputs(x): + if isinstance(x, (tf.Tensor, np.ndarray, float, int)): + x = tf.convert_to_tensor(x) + return input_layer_module.Input(x.shape) + + # TODO(scottzhu): maybe better handle mask and training flag. + inputs = tf.nest.map_structure(_convert_to_graph_inputs, inputs) + copied_args = tf.nest.map_structure( + _convert_to_graph_inputs, copied_args + ) + copied_kwargs = tf.nest.map_structure( + _convert_to_graph_inputs, copied_kwargs + ) + + with layout_map_lib.layout_map_scope(self._layout_map): + # We ignore the result here. 
+            super().__call__(inputs, *copied_args, **copied_kwargs)
+
+            layout_map_lib._map_subclass_model_variable(self, self._layout_map)
+
+        return super().__call__(*args, **kwargs)
+
+    @doc_controls.doc_in_current_and_subclasses
+    def call(self, inputs, training=None, mask=None):
+        """Calls the model on new inputs and returns the outputs as tensors.
+
+        In this case `call()` just reapplies
+        all ops in the graph to the new inputs
+        (e.g. build a new computational graph from the provided inputs).
+
+        Note: This method should not be called directly. It is only meant to be
+        overridden when subclassing `tf.keras.Model`.
+        To call a model on an input, always use the `__call__()` method,
+        i.e. `model(inputs)`, which relies on the underlying `call()` method.
+
+        Args:
+            inputs: Input tensor, or dict/list/tuple of input tensors.
+            training: Boolean or boolean scalar tensor, indicating whether to
+                run the `Network` in training mode or inference mode.
+            mask: A mask or list of masks. A mask can be either a boolean tensor
+                or None (no mask). For more details, check the guide
+                [here](https://www.tensorflow.org/guide/keras/masking_and_padding).
+
+        Returns:
+            A tensor if there is a single output, or
+            a list of tensors if there are more than one outputs.
+        """
+        raise NotImplementedError(
+            "Unimplemented `tf.keras.Model.call()`: if you "
+            "intend to create a `Model` with the Functional "
+            "API, please provide `inputs` and `outputs` "
+            "arguments. Otherwise, subclass `Model` with an "
+            "overridden `call()` method."
+        )
+
+    @traceback_utils.filter_traceback
+    def compile(
+        self,
+        optimizer="rmsprop",
+        loss=None,
+        metrics=None,
+        loss_weights=None,
+        weighted_metrics=None,
+        run_eagerly=None,
+        steps_per_execution=None,
+        jit_compile=None,
+        pss_evaluation_shards=0,
+        **kwargs,
+    ):
+        """Configures the model for training.
+
+        Example:
+
+        ```python
+        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
+                      loss=tf.keras.losses.BinaryCrossentropy(),
+                      metrics=[tf.keras.metrics.BinaryAccuracy(),
+                               tf.keras.metrics.FalseNegatives()])
+        ```
+
+        Args:
+            optimizer: String (name of optimizer) or optimizer instance. See
+                `tf.keras.optimizers`.
+            loss: Loss function. May be a string (name of loss function), or
+                a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss
+                function is any callable with the signature `loss = fn(y_true,
+                y_pred)`, where `y_true` are the ground truth values, and
+                `y_pred` are the model's predictions.
+                `y_true` should have shape
+                `(batch_size, d0, .. dN)` (except in the case of
+                sparse loss functions such as
+                sparse categorical crossentropy which expects integer arrays of
+                shape `(batch_size, d0, .. dN-1)`).
+                `y_pred` should have shape `(batch_size, d0, .. dN)`.
+                The loss function should return a float tensor.
+                If a custom `Loss` instance is
+                used and reduction is set to `None`, return value has shape
+                `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss
+                values; otherwise, it is a scalar. If the model has multiple
+                outputs, you can use a different loss on each output by passing a
+                dictionary or a list of losses. The loss value that will be
+                minimized by the model will then be the sum of all individual
+                losses, unless `loss_weights` is specified.
+            metrics: List of metrics to be evaluated by the model during
+                training and testing. Each of these can be a string (name of a
+                built-in function), function or a `tf.keras.metrics.Metric`
+                instance. See `tf.keras.metrics`. Typically you will use
+                `metrics=['accuracy']`.
+                A function is any callable with the signature `result = fn(y_true,
+                y_pred)`. To specify different metrics for different outputs of a
+                multi-output model, you could also pass a dictionary, such as
+                `metrics={'output_a':'accuracy', 'output_b':['accuracy', 'mse']}`.
+                You can also pass a list to specify a metric or a list of metrics
+                for each output, such as
+                `metrics=[['accuracy'], ['accuracy', 'mse']]`
+                or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the
+                strings 'accuracy' or 'acc', we convert this to one of
+                `tf.keras.metrics.BinaryAccuracy`,
+                `tf.keras.metrics.CategoricalAccuracy`,
+                `tf.keras.metrics.SparseCategoricalAccuracy` based on the shapes
+                of the targets and of the model output. We do a similar
+                conversion for the strings 'crossentropy' and 'ce' as well.
+                The metrics passed here are evaluated without sample weighting; if
+                you would like sample weighting to apply, you can specify your
+                metrics via the `weighted_metrics` argument instead.
+            loss_weights: Optional list or dictionary specifying scalar
+                coefficients (Python floats) to weight the loss contributions of
+                different model outputs. The loss value that will be minimized by
+                the model will then be the *weighted sum* of all individual
+                losses, weighted by the `loss_weights` coefficients. If a list,
+                it is expected to have a 1:1 mapping to the model's outputs. If a
+                dict, it is expected to map output names (strings) to scalar
+                coefficients.
+            weighted_metrics: List of metrics to be evaluated and weighted by
+                `sample_weight` or `class_weight` during training and testing.
+            run_eagerly: Bool. If `True`, this `Model`'s logic will not be
+                wrapped in a `tf.function`. Recommended to leave this as `None`
+                unless your `Model` cannot be run inside a `tf.function`.
+                `run_eagerly=True` is not supported when using
+                `tf.distribute.experimental.ParameterServerStrategy`. Defaults to
+                `False`.
+            steps_per_execution: Int or `'auto'`. The number of batches to
+                run during each `tf.function` call. If set to "auto", keras will
+                automatically tune `steps_per_execution` during runtime. Running
+                multiple batches inside a single `tf.function` call can greatly
+                improve performance on TPUs, when used with distributed strategies
+                such as `ParameterServerStrategy`, or with small models with a
+                large Python overhead. At most, one full epoch will be run each
+                execution. If a number larger than the size of the epoch is
+                passed, the execution will be truncated to the size of the epoch.
+                Note that if `steps_per_execution` is set to `N`,
+                `Callback.on_batch_begin` and `Callback.on_batch_end` methods will
+                only be called every `N` batches (i.e. before/after each
+                `tf.function` execution). Defaults to `1`.
+            jit_compile: If `True`, compile the model training step with XLA.
+                [XLA](https://www.tensorflow.org/xla) is an optimizing compiler
+                for machine learning.
+                `jit_compile` is not enabled by default.
+                Note that `jit_compile=True`
+                may not necessarily work for all models.
+                For more information on supported operations please refer to the
+                [XLA documentation](https://www.tensorflow.org/xla).
+                Also refer to
+                [known XLA issues](https://www.tensorflow.org/xla/known_issues)
+                for more details.
+            pss_evaluation_shards: Integer or 'auto'. Used for
+                `tf.distribute.ParameterServerStrategy` training only. This arg
+                sets the number of shards to split the dataset into, to enable an
+                exact visitation guarantee for evaluation, meaning the model will
+                be applied to each dataset element exactly once, even if workers
+                fail. The dataset must be sharded to ensure separate workers do
+                not process the same data. The number of shards should be at
+                least the number of workers for good performance. A value of
+                'auto' turns on exact evaluation and uses a heuristic for the
+                number of shards based on the number of workers. A value of 0
+                means that no visitation guarantee is provided. NOTE: Custom
+                implementations of `Model.test_step` will be ignored when doing
+                exact evaluation. Defaults to `0`.
+            **kwargs: Arguments supported for backwards compatibility only.
+        """
+        if jit_compile and not tf_utils.can_jit_compile(warn=True):
+            jit_compile = False
+        base_layer.keras_api_gauge.get_cell("compile").set(True)
+        self._compile_config = serialization_lib.Config(
+            optimizer=optimizer,
+            loss=loss,
+            metrics=metrics,
+            loss_weights=loss_weights,
+            weighted_metrics=weighted_metrics,
+            run_eagerly=run_eagerly,
+            steps_per_execution=steps_per_execution,
+            jit_compile=jit_compile,
+        )
+        with self.distribute_strategy.scope():
+            if "experimental_steps_per_execution" in kwargs:
+                logging.warning(
+                    "The argument `steps_per_execution` is no longer "
+                    "experimental. Pass `steps_per_execution` instead of "
+                    "`experimental_steps_per_execution`."
+                )
+                if not steps_per_execution:
+                    steps_per_execution = kwargs.pop(
+                        "experimental_steps_per_execution"
+                    )
+
+            # When compiling from an already-serialized model, we do not want to
+            # reapply some processing steps (e.g. metric renaming for
+            # multi-output models, which have prefixes added for each
+            # corresponding output name).
+            from_serialized = kwargs.pop("from_serialized", False)
+
+            self._validate_compile(optimizer, metrics, **kwargs)
+            self._run_eagerly = run_eagerly
+
+            self.optimizer = self._get_optimizer(optimizer)
+
+            mesh = None
+            if self._layout_map is not None:
+                mesh = self._layout_map.get_default_mesh()
+
+            if isinstance(loss, compile_utils.LossesContainer):
+                self.compiled_loss = loss
+            else:
+                self.compiled_loss = compile_utils.LossesContainer(
+                    loss,
+                    loss_weights,
+                    output_names=self.output_names,
+                    mesh=mesh,
+                )
+            self.compiled_metrics = compile_utils.MetricsContainer(
+                metrics,
+                weighted_metrics,
+                output_names=self.output_names,
+                from_serialized=from_serialized,
+                mesh=mesh,
+            )
+
+            if steps_per_execution == "auto":
+                if self._steps_per_execution is None:
+                    self._configure_steps_per_execution(1)
+                self._steps_per_execution_tuner = (
+                    steps_per_execution_tuning.StepsPerExecutionTuner(
+                        self.optimizer, self._steps_per_execution
+                    )
+                )
+                self._autotune_steps_per_execution = True
+            else:
+                self._configure_steps_per_execution(steps_per_execution or 1)
+
+            self._pss_evaluation_shards = self._infer_exact_eval_shards(
+                pss_evaluation_shards
+            )
+
+            # Initializes attrs that are reset each time `compile` is called.
+            self._reset_compile_cache()
+            self._is_compiled = True
+            self.loss = loss or {}
+            if (self._run_eagerly or self.dynamic) and jit_compile:
+                raise ValueError(
+                    "You cannot enable `run_eagerly` and `jit_compile` "
+                    "at the same time."
+ ) + else: + self._jit_compile = jit_compile + + def _get_optimizer(self, optimizer): + """Wraps `optimizer` in `LossScaleOptimizer` if necessary.""" + + def _get_single_optimizer(opt): + opt = optimizers.get(opt) + if self.dtype_policy.name == "mixed_float16" and not isinstance( + opt, lso.BaseLossScaleOptimizer + ): + # Loss scaling is necessary with mixed_float16 for models to + # converge to the same accuracy as with float32. + opt = lso.BaseLossScaleOptimizer(opt) + return opt + + return tf.nest.map_structure(_get_single_optimizer, optimizer) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _reset_compile_cache(self): + self.train_function = None + self.test_function = None + self.predict_function = None + # Used to cache the `tf.function`'ed `train_function` to be logged in + # TensorBoard, since the original `train_function` is not necessarily + # a `tf.function` (e.g., with ParameterServerStrategy, the + # `train_function` is a scheduling of the actual training function to a + # remote worker). + self.train_tf_function = None + + # Used to cache `trainable` attr of `Layer`s for `fit`. + self._compiled_trainable_state = self._get_trainable_state() + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _configure_steps_per_execution(self, steps_per_execution): + self._steps_per_execution = self._create_counter_variable( + steps_per_execution + ) + + @property + def _should_compute_mask(self): + return False + + @property + def metrics(self): + """Return metrics added using `compile()` or `add_metric()`. + + Note: Metrics passed to `compile()` are available only after a + `keras.Model` has been trained/evaluated on actual data. + + Examples: + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> outputs = tf.keras.layers.Dense(2)(inputs) + >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) + >>> [m.name for m in model.metrics] + [] + + >>> x = np.random.random((2, 3)) + >>> y = np.random.randint(0, 2, (2, 2)) + >>> model.fit(x, y) + >>> [m.name for m in model.metrics] + ['loss', 'mae'] + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> d = tf.keras.layers.Dense(2, name='out') + >>> output_1 = d(inputs) + >>> output_2 = d(inputs) + >>> model = tf.keras.models.Model( + ... inputs=inputs, outputs=[output_1, output_2]) + >>> model.add_metric( + ... tf.reduce_sum(output_2), name='mean', aggregation='mean') + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"]) + >>> model.fit(x, (y, y)) + >>> [m.name for m in model.metrics] + ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae', + 'out_1_acc', 'mean'] + + """ + metrics = [] + if self._is_compiled: + if self.compiled_loss is not None: + metrics += self.compiled_loss.metrics + if self.compiled_metrics is not None: + metrics += self.compiled_metrics.metrics + + for l in self._flatten_layers(): + metrics.extend(l._metrics) + return metrics + + @property + def metrics_names(self): + """Returns the model's display labels for all outputs. + + Note: `metrics_names` are available only after a `keras.Model` has been + trained/evaluated on actual data. 
+
+        Examples:
+
+        >>> inputs = tf.keras.layers.Input(shape=(3,))
+        >>> outputs = tf.keras.layers.Dense(2)(inputs)
+        >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
+        >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"])
+        >>> model.metrics_names
+        []
+
+        >>> x = np.random.random((2, 3))
+        >>> y = np.random.randint(0, 2, (2, 2))
+        >>> model.fit(x, y)
+        >>> model.metrics_names
+        ['loss', 'mae']
+
+        >>> inputs = tf.keras.layers.Input(shape=(3,))
+        >>> d = tf.keras.layers.Dense(2, name='out')
+        >>> output_1 = d(inputs)
+        >>> output_2 = d(inputs)
+        >>> model = tf.keras.models.Model(
+        ...    inputs=inputs, outputs=[output_1, output_2])
+        >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"])
+        >>> model.fit(x, (y, y))
+        >>> model.metrics_names
+        ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae',
+        'out_1_acc']
+
+        """
+
+        # This property includes all output names including `loss` and
+        # per-output losses for backward compatibility.
+        return [m.name for m in self.metrics]
+
+    @property
+    def distribute_strategy(self):
+        """The `tf.distribute.Strategy` this model was created under."""
+        return self._distribution_strategy or tf.distribute.get_strategy()
+
+    @property
+    def run_eagerly(self):
+        """Settable attribute indicating whether the model should run eagerly.
+
+        Running eagerly means that your model will be run step by step,
+        like Python code. Your model might run slower, but it should become
+        easier for you to debug it by stepping into individual layer calls.
+
+        By default, we will attempt to compile your model to a static graph to
+        deliver the best execution performance.
+
+        Returns:
+            Boolean, whether the model should run eagerly.
+        """
+        if self.dynamic and self._run_eagerly == False:
+            # TODO(fchollet): consider using py_func to enable this.
+            raise ValueError(
+                "Your model contains layers that can only be "
+                "successfully run in eager execution (layers "
+                "constructed with `dynamic=True`). "
+                "You cannot set `run_eagerly=False`."
+            )
+
+        if self._cluster_coordinator and self._run_eagerly:
+            raise ValueError(
+                "When using `Model` with `ParameterServerStrategy`, "
+                "`run_eagerly` is not supported."
+            )
+
+        # Run eagerly logic, by priority:
+        # (1) Dynamic models must be run eagerly.
+        # (2) Explicitly setting run_eagerly causes a Model to be run eagerly.
+        # (3) Not explicitly setting run_eagerly defaults to TF's global
+        #     setting.
+        return (
+            self.dynamic
+            or self._run_eagerly
+            or (tf.config.functions_run_eagerly() and self._run_eagerly is None)
+        )
+
+    @run_eagerly.setter
+    def run_eagerly(self, value):
+        self._run_eagerly = value
+
+    @property
+    def autotune_steps_per_execution(self):
+        """Settable property to enable tuning for `steps_per_execution`."""
+        return self._autotune_steps_per_execution
+
+    @autotune_steps_per_execution.setter
+    def autotune_steps_per_execution(self, value):
+        self._autotune_steps_per_execution = value
+        if value and self._steps_per_execution_tuner is None:
+            if self._steps_per_execution is None:
+                self._configure_steps_per_execution(1)
+            self._steps_per_execution_tuner = (
+                steps_per_execution_tuning.StepsPerExecutionTuner(
+                    self.optimizer, self._steps_per_execution
+                )
+            )
+
+    @property
+    def steps_per_execution(self):
+        """Settable `steps_per_execution` variable.
Requires a compiled model.""" + return self._steps_per_execution + + @steps_per_execution.setter + def steps_per_execution(self, value): + if self._steps_per_execution is None: + self._configure_steps_per_execution(value) + else: + self._steps_per_execution.assign(value) + + @property + def jit_compile(self): + """Specify whether to compile the model with XLA. + + [XLA](https://www.tensorflow.org/xla) is an optimizing compiler + for machine learning. `jit_compile` is not enabled by default. + Note that `jit_compile=True` may not necessarily work for all models. + + For more information on supported operations please refer to the + [XLA documentation](https://www.tensorflow.org/xla). Also refer to + [known XLA issues](https://www.tensorflow.org/xla/known_issues) + for more details. + """ + return self._jit_compile + + @jit_compile.setter + def jit_compile(self, value): + # Function remains cached with previous jit_compile settings + if self._jit_compile == value: + # Avoid resetting compiler cache if possible if the value is the + # same + return + # Check if TensorFlow is compiled with XLA before setting the value + if value and not tf_utils.can_jit_compile(warn=True): + self._jit_compile = False + return + + self._jit_compile = value + # Setting `jit_compile` should invalidate previously cached functions. + self._reset_compile_cache() + + @property + def distribute_reduction_method(self): + """The method employed to reduce per-replica values during training. + + Unless specified, the value "auto" will be assumed, indicating that + the reduction strategy should be chosen based on the current + running environment. + See `reduce_per_replica` function for more details. + + """ + return self._distribute_reduction_method or "auto" + + @distribute_reduction_method.setter + def distribute_reduction_method(self, value): + self._distribute_reduction_method = value + + def _validate_target_and_loss(self, y, loss): + """Raises error if target or loss is not found. + + This method verifies that the target and loss are properly populated + when applicable, or raises errors. + + Args: + y: the target for training. + loss: the total loss tensor including loss added via `compile` and + `add_loss`. + """ + + # `self.loss` references the loss added via `compile` call. If users + # have provided such, the target must be provided; otherwise it's a user + # error. Note that `self.loss` does not include losses added via + # `add_loss`, and it is a valid use when such loss from `add_loss` + # exists and target does not. + if self.loss and y is None: + raise ValueError( + "Target data is missing. Your model was compiled with " + f"loss={self.loss}, " + "and therefore expects target data to be provided in `fit()`." + ) + + # For training, there must be compiled loss or regularization loss to + # exist in order to apply the gradients. If one is not found, it means + # no loss was supplied via `compile` or `add_loss`. + elif loss is None: + raise ValueError( + "No loss found. You may have forgotten to provide a `loss` " + "argument in the `compile()` method." + ) + + def train_step(self, data): + """The logic for one training step. + + This method can be overridden to support custom training logic. + For concrete examples of how to override this method see + [Customizing what happens in fit]( + https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit). + This method is called by `Model.make_train_function`. + + This method should contain the mathematical logic for one step of + training. 
This typically includes the forward pass, loss calculation, + backpropagation, and metric updates. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings), should be left to + `Model.make_train_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the + values of the `Model`'s metrics are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + # Run forward pass. + with tf.GradientTape() as tape: + y_pred = self(x, training=True) + loss = self.compute_loss(x, y, y_pred, sample_weight) + self._validate_target_and_loss(y, loss) + # Run backwards pass. + self.optimizer.minimize(loss, self.trainable_variables, tape=tape) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): + """Compute the total loss, validate it, and return it. + + Subclasses can optionally override this method to provide custom loss + computation logic. + + Example: + ```python + class MyModel(tf.keras.Model): + + def __init__(self, *args, **kwargs): + super(MyModel, self).__init__(*args, **kwargs) + self.loss_tracker = tf.keras.metrics.Mean(name='loss') + + def compute_loss(self, x, y, y_pred, sample_weight): + loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y)) + loss += tf.add_n(self.losses) + self.loss_tracker.update_state(loss) + return loss + + def reset_metrics(self): + self.loss_tracker.reset_states() + + @property + def metrics(self): + return [self.loss_tracker] + + tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) + dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) + + inputs = tf.keras.layers.Input(shape=(10,), name='my_input') + outputs = tf.keras.layers.Dense(10)(inputs) + model = MyModel(inputs, outputs) + model.add_loss(tf.reduce_sum(outputs)) + + optimizer = tf.keras.optimizers.SGD() + model.compile(optimizer, loss='mse', steps_per_execution=10) + model.fit(dataset, epochs=2, steps_per_epoch=10) + print('My custom loss: ', model.loss_tracker.result().numpy()) + ``` + + Args: + x: Input data. + y: Target data. + y_pred: Predictions returned by the model (output of `model(x)`) + sample_weight: Sample weights for weighting the loss function. + + Returns: + The total loss as a `tf.Tensor`, or `None` if no loss results (which + is the case when called by `Model.test_step`). + """ + del x # The default implementation does not use `x`. + return self.compiled_loss( + y, y_pred, sample_weight, regularization_losses=self.losses + ) + + def compute_metrics(self, x, y, y_pred, sample_weight): + """Update metric states and collect all metrics to be returned. + + Subclasses can optionally override this method to provide custom metric + updating and collection logic. + + Example: + ```python + class MyModel(tf.keras.Sequential): + + def compute_metrics(self, x, y, y_pred, sample_weight): + + # This super call updates `self.compiled_metrics` and returns + # results for all metrics listed in `self.metrics`. + metric_results = super(MyModel, self).compute_metrics( + x, y, y_pred, sample_weight) + + # Note that `self.custom_metric` is not listed in `self.metrics`. 
+            self.custom_metric.update_state(x, y, y_pred, sample_weight)
+            metric_results['custom_metric_name'] = self.custom_metric.result()
+            return metric_results
+        ```
+
+        Args:
+            x: Input data.
+            y: Target data.
+            y_pred: Predictions returned by the model (output of `model.call(x)`)
+            sample_weight: Sample weights for weighting the loss function.
+
+        Returns:
+            A `dict` containing values that will be passed to
+            `tf.keras.callbacks.CallbackList.on_train_batch_end()`. Typically, the
+            values of the metrics listed in `self.metrics` are returned. Example:
+            `{'loss': 0.2, 'accuracy': 0.7}`.
+        """
+        del x  # The default implementation does not use `x`.
+        self.compiled_metrics.update_state(y, y_pred, sample_weight)
+        return self.get_metrics_result()
+
+    def get_metrics_result(self):
+        """Returns the model's metrics values as a dict.
+
+        If any of the metric results is a dict (containing multiple metrics),
+        each of them is added to the top-level dict returned by this method.
+
+        Returns:
+            A `dict` containing values of the metrics listed in `self.metrics`.
+            Example:
+            `{'loss': 0.2, 'accuracy': 0.7}`.
+        """
+        # Collect metrics to return
+        return_metrics = {}
+        for metric in self.metrics:
+            result = metric.result()
+            if isinstance(result, dict):
+                return_metrics.update(result)
+            else:
+                return_metrics[metric.name] = result
+        return return_metrics
+
+    def _validate_and_get_metrics_result(self, logs):
+        """Returns model metrics as a dict if the keys match with input logs.
+
+        When the training / evaluation is performed with asynchronous steps,
+        such as the case with `tf.distribute.ParameterServerStrategy`, the last
+        scheduled `train / test_step` may not give the latest metrics because
+        it is not guaranteed to be executed last. This method gets metrics from
+        the model directly instead of relying on the return from the last step
+        function.
+
+        It logs a warning if the metric results could not be overridden when
+        used with `tf.distribute.ParameterServerStrategy`.
+
+        When the user has custom train / test step functions, the metrics
+        returned may be different from `Model.metrics`. In those instances,
+        this function will be a no-op and return the logs.
+
+        Args:
+            logs: A `dict` of metrics returned by the train / test step function.
+
+        Returns:
+            A `dict` containing values of the metrics listed in `self.metrics`
+            when logs and model metrics keys match. Otherwise it returns input
+            `logs`.
+        """
+        PSS_WARN_MSG = "Could not get Model metric results. \
+        Using the results of last step function could lead to incorrect \
+        results when used with ParameterServerStrategy"
+        try:
+            metric_logs = self.get_metrics_result()
+        except TypeError:
+            if self._cluster_coordinator:
+                logging.warning(PSS_WARN_MSG)
+        else:
+            # Verify that train / test step logs passed and metric logs have
+            # matching keys. Could be different when using custom step functions
+            if isinstance(logs, dict) and set(logs.keys()) == set(
+                metric_logs.keys()
+            ):
+                logs = tf_utils.sync_to_numpy_or_python_type(metric_logs)
+            elif self._cluster_coordinator:
+                logging.warning(PSS_WARN_MSG)
+        return logs

-  def compute_metrics(self, x, y, y_pred, sample_weight):
+    def _aggregate_exact_metrics(self, logs):
+        # When doing exact evaluation, `logs` is a list of each data shard's
+        # metric variables, which will be used to update the metrics.
+        for shard_result in logs:
+            for metric in self.metrics:
+                if metric.name not in shard_result.keys():
+                    logging.log_first_n(
+                        logging.WARN,
+                        f"No matching result found for metric {metric.name}. 
" + "This metric's computed result may be incorrect.", + 3, + ) + continue + metric_result = shard_result[metric.name] + if len(metric_result) != len(metric.weights): + raise ValueError( + f"Expected {len(metric.weights)} variables in result " + f"for metric {metric.name}, but found " + f"{len(metric_result)}." + ) + for weight, val in zip(metric.weights, metric_result): + weight.assign_add(val) + return self.get_metrics_result() + + def make_train_function(self, force=False): + """Creates a function that executes one step of training. + + This method can be overridden to support custom training logic. + This method is called by `Model.fit` and `Model.train_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual training + logic to `Model.train_step`. + + This function is cached the first time `Model.fit` or + `Model.train_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the train function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_train_batch_end`, such as + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self.train_function is not None and not force: + return self.train_function + + def step_function(model, iterator): + """Runs a single training step.""" + + def run_step(data): + outputs = model.train_step(data) + # Ensure counter is updated only if `train_step` succeeds. + with tf.control_dependencies(_minimum_control_deps(outputs)): + model._train_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function( + run_step, jit_compile=True, reduce_retracing=True + ) + data = next(iterator) + outputs = model.distribute_strategy.run(run_step, args=(data,)) + outputs = reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + # Special case if steps_per_execution is one. + if ( + self._steps_per_execution is None + or self._steps_per_execution.numpy().item() == 1 + and not self.autotune_steps_per_execution + ): + + def train_function(iterator): + """Runs a training execution with a single step.""" + return step_function(self, iterator) + + if not self.run_eagerly: + train_function = tf.function( + train_function, reduce_retracing=True + ) + self.train_tf_function = train_function + + if self._cluster_coordinator: + self.train_function = ( + lambda it: self._cluster_coordinator.schedule( + train_function, args=(it,) + ) + ) + else: + self.train_function = train_function + + # If we're using a coordinator, use the value of + # self._steps_per_execution at the time the function is + # called/scheduled, and not when it is actually executed. 
+ elif self._cluster_coordinator: + + def train_function(iterator, steps_per_execution): + """Runs a training execution with multiple steps.""" + for _ in tf.range(steps_per_execution): + outputs = step_function(self, iterator) + return outputs + + if not self.run_eagerly: + train_function = tf.function( + train_function, reduce_retracing=True + ) + self.train_tf_function = train_function + + self.train_function = lambda it: self._cluster_coordinator.schedule( + train_function, args=(it, self._steps_per_execution.value()) + ) + else: - # This super call updates `self.compiled_metrics` and returns results - # for all metrics listed in `self.metrics`. - metric_results = super(MyModel, self).compute_metrics( - x, y, y_pred, sample_weight) + def train_function(iterator): + """Runs a training execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(self, iterator) + return outputs - # Note that `self.custom_metric` is not listed in `self.metrics`. - self.custom_metric.update_state(x, y, y_pred, sample_weight) - metric_results['custom_metric_name'] = self.custom_metric.result() - return metric_results - ``` + if not self.run_eagerly: + train_function = tf.function( + train_function, reduce_retracing=True + ) + self.train_tf_function = train_function + self.train_function = train_function - Args: - x: Input data. - y: Target data. - y_pred: Predictions returned by the model (output of `model.call(x)`) - sample_weight: Sample weights for weighting the loss function. + return self.train_function - Returns: - A `dict` containing values that will be passed to - `tf.keras.callbacks.CallbackList.on_train_batch_end()`. Typically, the - values of the metrics listed in `self.metrics` are returned. Example: - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - del x # The default implementation does not use `x`. - self.compiled_metrics.update_state(y, y_pred, sample_weight) - # Collect metrics to return - return_metrics = {} - for metric in self.metrics: - result = metric.result() - if isinstance(result, dict): - return_metrics.update(result) - else: - return_metrics[metric.name] = result - return return_metrics - - def make_train_function(self, force=False): - """Creates a function that executes one step of training. - - This method can be overridden to support custom training logic. - This method is called by `Model.fit` and `Model.train_on_batch`. - - Typically, this method directly controls `tf.function` and - `tf.distribute.Strategy` settings, and delegates the actual training - logic to `Model.train_step`. - - This function is cached the first time `Model.fit` or - `Model.train_on_batch` is called. The cache is cleared whenever - `Model.compile` is called. You can skip the cache and generate again the - function with `force=True`. + @traceback_utils.filter_traceback + def fit( + self, + x=None, + y=None, + batch_size=None, + epochs=1, + verbose="auto", + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_batch_size=None, + validation_freq=1, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Trains the model for a fixed number of epochs (dataset iterations). + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). 
+ - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. Should return a tuple + of either `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + - A generator or `keras.utils.Sequence` returning `(inputs, + targets)` or `(inputs, targets, sample_weights)`. + - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a + callable that takes a single argument of type + `tf.distribute.InputContext`, and returns a `tf.data.Dataset`. + `DatasetCreator` should be used when users prefer to specify the + per-replica batching and sharding logic for the `Dataset`. + See `tf.keras.utils.experimental.DatasetCreator` doc for more + information. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given below. If these + include `sample_weights` as a third component, note that sample + weighting applies to the `weighted_metrics` argument but not the + `metrics` argument in `compile()`. If using + `tf.distribute.experimental.ParameterServerStrategy`, only + `DatasetCreator` type is supported for `x`. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset, generator, + or `keras.utils.Sequence` instance, `y` should + not be specified (since targets will be obtained from `x`). + batch_size: Integer or `None`. + Number of samples per gradient update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` + instances (since they generate batches). + epochs: Integer. Number of epochs to train the model. + An epoch is an iteration over the entire `x` and `y` + data provided + (unless the `steps_per_epoch` flag is set to + something other than None). + Note that in conjunction with `initial_epoch`, + `epochs` is to be understood as "final epoch". + The model is not trained for a number of iterations + given by `epochs`, but merely until the epoch + of index `epochs` is reached. + verbose: 'auto', 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = one line per epoch. + 'auto' becomes 1 for most cases, but 2 when used with + `ParameterServerStrategy`. Note that the progress bar is not + particularly useful when logged to a file, so verbose=2 is + recommended when not running interactively (e.g., in a production + environment). Defaults to 'auto'. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during training. + See `tf.keras.callbacks`. Note + `tf.keras.callbacks.ProgbarLogger` and + `tf.keras.callbacks.History` callbacks are created automatically + and need not be passed into `model.fit`. + `tf.keras.callbacks.ProgbarLogger` is created or not based on + the `verbose` argument to `model.fit`. + Callbacks with batch-level calls are currently unsupported with + `tf.distribute.experimental.ParameterServerStrategy`, and users + are advised to implement epoch-level calls instead with an + appropriate `steps_per_epoch` value. + validation_split: Float between 0 and 1. + Fraction of the training data to be used as validation data. + The model will set apart this fraction of the training data, + will not train on it, and will evaluate + the loss and any model metrics + on this data at the end of each epoch. 
+ The validation data is selected from the last samples + in the `x` and `y` data provided, before shuffling. This + argument is not supported when `x` is a dataset, generator or + `keras.utils.Sequence` instance. + If both `validation_data` and `validation_split` are provided, + `validation_data` will override `validation_split`. + `validation_split` is not yet supported with + `tf.distribute.experimental.ParameterServerStrategy`. + validation_data: Data on which to evaluate + the loss and any model metrics at the end of each epoch. + The model will not be trained on this data. Note + that the validation loss of data provided using + `validation_split` or `validation_data` is not affected by + regularization layers like noise and dropout. + `validation_data` will override `validation_split`. + `validation_data` could be: + - A tuple `(x_val, y_val)` of Numpy arrays or tensors. + - A tuple `(x_val, y_val, val_sample_weights)` of NumPy + arrays. + - A `tf.data.Dataset`. + - A Python generator or `keras.utils.Sequence` returning + `(inputs, targets)` or `(inputs, targets, sample_weights)`. + `validation_data` is not yet supported with + `tf.distribute.experimental.ParameterServerStrategy`. + shuffle: Boolean (whether to shuffle the training data + before each epoch) or str (for 'batch'). This argument is + ignored when `x` is a generator or a `tf.data.Dataset` object. + 'batch' is a special option for dealing + with the limitations of HDF5 data; it shuffles in batch-sized + chunks. Has no effect when `steps_per_epoch` is not `None`. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) value, used for weighting the loss function + (during training only). + This can be useful to tell the model to + "pay more attention" to samples from + an under-represented class. When `class_weight` is specified + and targets have a rank of 2 or greater, either `y` must be + one-hot encoded, or an explicit final dimension of `1` must + be included for sparse class labels. + sample_weight: Optional Numpy array of weights for + the training samples, used for weighting the loss function + (during training only). You can either pass a flat (1D) + Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), + or in the case of temporal data, + you can pass a 2D array with shape + `(samples, sequence_length)`, + to apply a different weight to every timestep of every sample. + This argument is not supported when `x` is a dataset, generator, + or `keras.utils.Sequence` instance; instead, provide the + sample_weights as the third element of `x`. + Note that sample weighting does not apply to metrics specified + via the `metrics` argument in `compile()`. To apply sample + weighting to your metrics, you can specify them via the + `weighted_metrics` in `compile()` instead. + initial_epoch: Integer. + Epoch at which to start training + (useful for resuming a previous training run). + steps_per_epoch: Integer or `None`. + Total number of steps (batches of samples) + before declaring one epoch finished and starting the + next epoch. When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps_per_epoch' + is None, the epoch will run until the input dataset is + exhausted. 
When passing an infinitely repeating dataset, you + must specify the `steps_per_epoch` argument. If + `steps_per_epoch=-1` the training will run indefinitely with an + infinitely repeating dataset. This argument is not supported + with array inputs. + When using `tf.distribute.experimental.ParameterServerStrategy`: + * `steps_per_epoch=None` is not supported. + validation_steps: Only relevant if `validation_data` is provided and + is a `tf.data` dataset. Total number of steps (batches of + samples) to draw before stopping when performing validation + at the end of every epoch. If 'validation_steps' is None, + validation will run until the `validation_data` dataset is + exhausted. In the case of an infinitely repeated dataset, it + will run into an infinite loop. If 'validation_steps' is + specified and only part of the dataset will be consumed, the + evaluation will start from the beginning of the dataset at each + epoch. This ensures that the same validation samples are used + every time. + validation_batch_size: Integer or `None`. + Number of samples per validation batch. + If unspecified, will default to `batch_size`. + Do not specify the `validation_batch_size` if your data is in + the form of datasets, generators, or `keras.utils.Sequence` + instances (since they generate batches). + validation_freq: Only relevant if validation data is provided. + Integer or `collections.abc.Container` instance (e.g. list, tuple, + etc.). If an integer, specifies how many training epochs to run + before a new validation run is performed, e.g. `validation_freq=2` + runs validation every 2 epochs. If a Container, specifies the + epochs on which to run validation, e.g. + `validation_freq=[1, 2, 10]` runs validation at the end of the + 1st, 2nd, and 10th epochs. + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the generator + queue. If unspecified, `max_queue_size` will default to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up + when using process-based threading. If unspecified, `workers` + will default to 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + + Unpacking behavior for iterator-like inputs: + A common pattern is to pass a tf.data.Dataset, generator, or + tf.keras.utils.Sequence to the `x` argument of fit, which will in fact + yield not only features (x) but optionally targets (y) and sample + weights. Keras requires that the output of such iterator-likes be + unambiguous. The iterator should return a tuple of length 1, 2, or 3, + where the optional second and third elements will be used for y and + sample_weight respectively. Any other type provided will be wrapped in + a length one tuple, effectively treating everything as 'x'. When + yielding dicts, they should still adhere to the top-level tuple + structure. + e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate + features, targets, and weights from the keys of a single dict. + A notable unsupported data type is the namedtuple. The reason is + that it behaves like both an ordered datatype (tuple) and a mapping + datatype (dict). 
So given a namedtuple of the form: + `namedtuple("example_tuple", ["y", "x"])` + it is ambiguous whether to reverse the order of the elements when + interpreting the value. Even worse is a tuple of the form: + `namedtuple("other_tuple", ["x", "y", "z"])` + where it is unclear if the tuple was intended to be unpacked into x, + y, and sample_weight or passed through as a single element to `x`. As + a result, the data processing code will simply raise a ValueError if it + encounters a namedtuple (along with instructions to remedy the + issue). + + Returns: + A `History` object. Its `History.history` attribute is + a record of training loss values and metrics values + at successive epochs, as well as validation loss values + and validation metrics values (if applicable). + + Raises: + RuntimeError: 1. If the model was never compiled, or + 2. if `model.fit` is wrapped in `tf.function`. + + ValueError: In case of mismatch between the provided input data + and what the model expects or when the input data is empty. + """ + base_layer.keras_api_gauge.get_cell("fit").set(True) + # Legacy graph support is contained in `training_v1.Model`. + version_utils.disallow_legacy_graph("Model", "fit") + self._assert_compile_was_called() + self._check_call_args("fit") + _disallow_inside_tf_function("fit") + + verbose = _get_verbosity(verbose, self.distribute_strategy) + + if validation_split and validation_data is None: + # Create the validation data using the training data. Only supported + # for `Tensor` and `NumPy` input. + ( + x, + y, + sample_weight, + ), validation_data = data_adapter.train_validation_split( + (x, y, sample_weight), validation_split=validation_split + ) + + if validation_data: + ( + val_x, + val_y, + val_sample_weight, + ) = data_adapter.unpack_x_y_sample_weight(validation_data) + + if self.distribute_strategy._should_use_with_coordinator: + self._cluster_coordinator = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + self.distribute_strategy + ) + ) + + with self.distribute_strategy.scope(), training_utils.RespectCompiledTrainableState( # noqa: E501 + self + ): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + data_handler = data_adapter.get_data_handler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps_per_epoch, + initial_epoch=initial_epoch, + epochs=epochs, + shuffle=shuffle, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + ) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self, + verbose=verbose, + epochs=epochs, + steps=data_handler.inferred_steps, + ) + + self.stop_training = False + self.train_function = self.make_train_function() + self._train_counter.assign(0) + callbacks.on_train_begin() + training_logs = None + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + # Handle fault-tolerance for multi-worker. + # TODO(omalleyt): Fix the ordering issues that mean this has to + # happen after `callbacks.on_train_begin`. 
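To make the unpacking rules described in the `fit` docstring concrete, here is a small sketch (hypothetical names and shapes) of the supported dict-inside-tuple pattern:

```python
import numpy as np
import tensorflow as tf

x0 = np.random.rand(8, 4).astype("float32")
x1 = np.random.rand(8, 4).astype("float32")
y = np.random.rand(8, 1).astype("float32")

# A 2-tuple is unpacked as (inputs, targets); the dict of named inputs
# stays inside the top-level tuple and maps onto the Input names below.
ds = tf.data.Dataset.from_tensor_slices(({"x0": x0, "x1": x1}, y)).batch(4)

in0 = tf.keras.Input(shape=(4,), name="x0")
in1 = tf.keras.Input(shape=(4,), name="x1")
out = tf.keras.layers.Dense(1)(tf.keras.layers.concatenate([in0, in1]))
model = tf.keras.Model([in0, in1], out)
model.compile(optimizer="sgd", loss="mse")
model.fit(ds, epochs=1, verbose=0)
```

Swapping the outer tuple for a namedtuple would trigger the ValueError discussed above, since the data-handling code cannot tell positional unpacking from a single mapping-like `x`.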
+ steps_per_epoch_inferred = ( + steps_per_epoch or data_handler.inferred_steps + ) + ( + data_handler._initial_epoch, + data_handler._initial_step, + ) = self._maybe_load_initial_counters_from_ckpt( + steps_per_epoch_inferred, initial_epoch + ) + logs = None + for epoch, iterator in data_handler.enumerate_epochs(): + self.reset_metrics() + callbacks.on_epoch_begin(epoch) + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + with tf.profiler.experimental.Trace( + "train", + epoch_num=epoch, + step_num=step, + batch_size=batch_size, + _r=1, + ): + callbacks.on_train_batch_begin(step) + tmp_logs = self.train_function(iterator) + if data_handler.should_sync: + context.async_wait() + # No error, now safe to assign to logs. + logs = tmp_logs + end_step = step + data_handler.step_increment + callbacks.on_train_batch_end(end_step, logs) + if self.stop_training: + break + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if logs is None: + raise ValueError( + "Unexpected result of `train_function` " + "(Empty logs). This could be due to issues in the input " + "pipeline that resulted in an empty dataset. " + "Otherwise, please use " + "`Model.compile(..., run_eagerly=True)`, or " + "`tf.config.run_functions_eagerly(True)` for more " + "information on what went wrong, or file an " + "issue/bug to `tf.keras`." + ) + # Override with model metrics instead of last step logs + logs = self._validate_and_get_metrics_result(logs) + epoch_logs = copy.copy(logs) + + # Run validation. + if validation_data and self._should_eval( + epoch, validation_freq + ): + if self._pss_evaluation_shards: + self._disallow_exact_eval_with_add_metrics() + # Create data_handler for evaluation and cache it. + if getattr(self, "_eval_data_handler", None) is None: + self._eval_data_handler = data_adapter.get_data_handler( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps_per_epoch=validation_steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + pss_evaluation_shards=self._pss_evaluation_shards, + ) + val_logs = self.evaluate( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps=validation_steps, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + return_dict=True, + _use_cached_eval_dataset=True, + ) + val_logs = { + "val_" + name: val for name, val in val_logs.items() + } + epoch_logs.update(val_logs) + + callbacks.on_epoch_end(epoch, epoch_logs) + training_logs = epoch_logs + if self.stop_training: + break + + if isinstance(self.optimizer, optimizer.Optimizer) and epochs > 0: + self.optimizer.finalize_variable_values( + self.trainable_variables + ) + + # If eval data_handler exists, delete it after all epochs are done. + if getattr(self, "_eval_data_handler", None) is not None: + del self._eval_data_handler + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_train_end(logs=training_logs) + return self.history + + def test_step(self, data): + """The logic for one evaluation step. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model.make_test_function`. + + This function should contain the mathematical logic for one step of + evaluation. 
+ This typically includes the forward pass, loss calculation, and metrics + updates. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings) should be left to + `Model.make_test_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_test_batch_end`. Typically, the + values of the `Model`'s metrics are returned. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + + y_pred = self(x, training=False) + # Updates stateful loss metrics. + self.compute_loss(x, y, y_pred, sample_weight) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def _make_test_function_exact(self): + if getattr(self, "_shard_test_function", None): + return self._shard_test_function + + def step_function(batch): + def run_step(data): + # TODO(b/272050910): Use sample_weight for weighted metrics. + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight( + data + ) + y_pred = self(x, training=False) + return x, y, y_pred, sample_weight + + if self._jit_compile: + run_step = tf.function( + run_step, jit_compile=True, reduce_retracing=True + ) + + outputs = self.distribute_strategy.run(run_step, args=(batch,)) + outputs = reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + def shard_test_function(dataset, total_shards, shard_idx): + # Copy loss and metric variables to the worker and work with them + # locally. This ensures each shard function is atomic: if a worker + # is preempted, the intermediate progress is discarded and that + # shard is retried. This in turn guarantees exactly-once visitation. + local_unweighted_metrics, local_weighted_metrics = [], [] + with tf_utils.with_metric_local_vars_scope(): + # TODO(jmullenbach): implement and use a clone for + # `MetricsContainer` and use its `update_state` method directly. 
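A custom override of the default `test_step` shown above follows the same shape. A minimal sketch (hypothetical subclass; `unpack_x_y_sample_weight` is the public `tf.keras.utils` helper, while the built-in version routes through `compute_loss`/`compute_metrics` instead):

```python
import tensorflow as tf

class MyModel(tf.keras.Model):
    def test_step(self, data):
        x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
        y_pred = self(x, training=False)
        # Update the stateful loss metric(s), then the compiled metrics.
        self.compiled_loss(y, y_pred, sample_weight)
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
        return self.get_metrics_result()
```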
+ for metric in self.compiled_metrics.unweighted_metrics: + if metric is not None: + local_unweighted_metrics.append( + base_metric.clone_metric(metric) + ) + for metric in self.compiled_metrics.weighted_metrics: + if metric is not None: + local_weighted_metrics.append( + base_metric.clone_metric(metric) + ) + local_loss = compile_utils.LossesContainer.from_config( + self.compiled_loss.get_config() + ) + + dataset = input_ops.auto_shard_dataset( + dataset, total_shards, shard_idx + ) + iterator = iter(dataset) + with distribute_utils.cache_variable_reads(): + for batch in iterator: + x, y, y_pred, sample_weight = step_function(batch) + for weighted_metric in local_weighted_metrics: + weighted_metric.update_state(y, y_pred, sample_weight) + for unweighted_metric in local_unweighted_metrics: + unweighted_metric.update_state(y, y_pred) + local_loss(y, y_pred, sample_weight) + local_metrics = ( + local_unweighted_metrics + + local_weighted_metrics + + local_loss.metrics + ) + outputs = {metric.name: metric.weights for metric in local_metrics} + with tf.control_dependencies(_minimum_control_deps(outputs)): + self._test_counter.assign_add(1) + return outputs + + if not self.run_eagerly: + shard_test_function = tf.function( + shard_test_function, reduce_retracing=True + ) + + self._shard_test_function = ( + lambda *args: self._cluster_coordinator.schedule( + shard_test_function, + args=args, + ) + ) + return self._shard_test_function + + def make_test_function(self, force=False): + """Creates a function that executes one step of evaluation. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model.evaluate` and `Model.test_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual evaluation + logic to `Model.test_step`. + + This function is cached the first time `Model.evaluate` or + `Model.test_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and regenerate the + function with `force=True`. + + Args: + force: Whether to regenerate the test function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_test_batch_end`. + """ + if self.test_function is not None and not force: + return self.test_function + + def step_function(model, iterator): + """Runs a single evaluation step.""" + + def run_step(data): + outputs = model.test_step(data) + # Ensure counter is updated only if `test_step` succeeds. + with tf.control_dependencies(_minimum_control_deps(outputs)): + model._test_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function( + run_step, jit_compile=True, reduce_retracing=True + ) + + data = next(iterator) + outputs = model.distribute_strategy.run(run_step, args=(data,)) + outputs = reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + # Special case if steps_per_execution is one. 
+ if ( + self._steps_per_execution is None + or self._steps_per_execution.numpy().item() == 1 + and not self.autotune_steps_per_execution + ): + + def test_function(iterator): + """Runs a test execution with a single step.""" + return step_function(self, iterator) + + if not self.run_eagerly: + test_function = tf.function( + test_function, reduce_retracing=True + ) + + if self._cluster_coordinator: + self.test_function = ( + lambda it: self._cluster_coordinator.schedule( + test_function, args=(it,) + ) + ) + else: + self.test_function = test_function + + # If we're using a coordinator, use the value of + # self._steps_per_execution at the time the function is + # called/scheduled, and not when it is actually executed. + elif self._cluster_coordinator: + + def test_function(iterator, steps_per_execution): + """Runs a test execution with multiple steps.""" + for _ in tf.range(steps_per_execution): + outputs = step_function(self, iterator) + return outputs + + if not self.run_eagerly: + test_function = tf.function( + test_function, reduce_retracing=True + ) + + self.test_function = lambda it: self._cluster_coordinator.schedule( + test_function, args=(it, self._steps_per_execution.value()) + ) + else: - Args: - force: Whether to regenerate the train function and skip the cached - function if available. + def test_function(iterator): + """Runs a test execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(self, iterator) + return outputs - Returns: - Function. The function created by this method should accept a - `tf.data.Iterator`, and return a `dict` containing values that will - be passed to `tf.keras.Callbacks.on_train_batch_end`, such as - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - if self.train_function is not None and not force: - return self.train_function - - def step_function(model, iterator): - """Runs a single training step.""" - - def run_step(data): - outputs = model.train_step(data) - # Ensure counter is updated only if `train_step` succeeds. - with tf.control_dependencies(_minimum_control_deps(outputs)): - model._train_counter.assign_add(1) # pylint: disable=protected-access - return outputs - - if self._jit_compile: - run_step = tf.function( - run_step, jit_compile=True, reduce_retracing=True) - data = next(iterator) - outputs = model.distribute_strategy.run(run_step, args=(data,)) - outputs = reduce_per_replica( - outputs, self.distribute_strategy, reduction='first') - return outputs - - # Special case if steps_per_execution is one. - if (self._steps_per_execution is None or - self._steps_per_execution.numpy().item() == 1): - - def train_function(iterator): - """Runs a training execution with a single step.""" - return step_function(self, iterator) - - if not self.run_eagerly: - train_function = tf.function( - train_function, reduce_retracing=True) - self.train_tf_function = train_function - - if self._cluster_coordinator: - self.train_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - train_function, args=(it,)) - else: - self.train_function = train_function - - # If we're using a coordinator, use the value of self._steps_per_execution - # at the time the function is called/scheduled, and not when it is actually - # executed. 
- elif self._cluster_coordinator: - - def train_function(iterator, steps_per_execution): - """Runs a training execution with multiple steps.""" - for _ in tf.range(steps_per_execution): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - train_function = tf.function( - train_function, reduce_retracing=True) - self.train_tf_function = train_function - - self.train_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - train_function, - args=(it, self._steps_per_execution.value())) - else: + if not self.run_eagerly: + test_function = tf.function( + test_function, reduce_retracing=True + ) + self.test_function = test_function - def train_function(iterator): - """Runs a training execution with multiple steps.""" - for _ in tf.range(self._steps_per_execution): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - train_function = tf.function( - train_function, reduce_retracing=True) - self.train_tf_function = train_function - self.train_function = train_function - - return self.train_function - - @traceback_utils.filter_traceback - def fit(self, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose='auto', - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_batch_size=None, - validation_freq=1, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - """Trains the model for a fixed number of epochs (iterations on a dataset). + return self.test_function - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. Should return a tuple - of either `(inputs, targets)` or - `(inputs, targets, sample_weights)`. - - A generator or `keras.utils.Sequence` returning `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a - callable that takes a single argument of type - `tf.distribute.InputContext`, and returns a `tf.data.Dataset`. - `DatasetCreator` should be used when users prefer to specify the - per-replica batching and sharding logic for the `Dataset`. - See `tf.keras.utils.experimental.DatasetCreator` doc for more - information. - A more detailed description of unpacking behavior for iterator types - (Dataset, generator, Sequence) is given below. If these include - `sample_weights` as a third component, note that sample weighting - applies to the `weighted_metrics` argument but not the `metrics` - argument in `compile()`. If using - `tf.distribute.experimental.ParameterServerStrategy`, only - `DatasetCreator` type is supported for `x`. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset, generator, - or `keras.utils.Sequence` instance, `y` should - not be specified (since targets will be obtained from `x`). - batch_size: Integer or `None`. - Number of samples per gradient update. - If unspecified, `batch_size` will default to 32. 
- Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - epochs: Integer. Number of epochs to train the model. - An epoch is an iteration over the entire `x` and `y` - data provided - (unless the `steps_per_epoch` flag is set to - something other than None). - Note that in conjunction with `initial_epoch`, - `epochs` is to be understood as "final epoch". - The model is not trained for a number of iterations - given by `epochs`, but merely until the epoch - of index `epochs` is reached. - verbose: 'auto', 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = one line per epoch. - 'auto' defaults to 1 for most cases, but 2 when used with - `ParameterServerStrategy`. Note that the progress bar is not - particularly useful when logged to a file, so verbose=2 is - recommended when not running interactively (eg, in a production - environment). - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during training. - See `tf.keras.callbacks`. Note `tf.keras.callbacks.ProgbarLogger` - and `tf.keras.callbacks.History` callbacks are created automatically - and need not be passed into `model.fit`. - `tf.keras.callbacks.ProgbarLogger` is created or not based on - `verbose` argument to `model.fit`. - Callbacks with batch-level calls are currently unsupported with - `tf.distribute.experimental.ParameterServerStrategy`, and users are - advised to implement epoch-level calls instead with an appropriate - `steps_per_epoch` value. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - The model will set apart this fraction of the training data, - will not train on it, and will evaluate - the loss and any model metrics - on this data at the end of each epoch. - The validation data is selected from the last samples - in the `x` and `y` data provided, before shuffling. This argument is - not supported when `x` is a dataset, generator or - `keras.utils.Sequence` instance. - If both `validation_data` and `validation_split` are provided, - `validation_data` will override `validation_split`. - `validation_split` is not yet supported with - `tf.distribute.experimental.ParameterServerStrategy`. - validation_data: Data on which to evaluate - the loss and any model metrics at the end of each epoch. - The model will not be trained on this data. Thus, note the fact - that the validation loss of data provided using `validation_split` - or `validation_data` is not affected by regularization layers like - noise and dropout. - `validation_data` will override `validation_split`. - `validation_data` could be: - - A tuple `(x_val, y_val)` of Numpy arrays or tensors. - - A tuple `(x_val, y_val, val_sample_weights)` of NumPy arrays. - - A `tf.data.Dataset`. - - A Python generator or `keras.utils.Sequence` returning - `(inputs, targets)` or `(inputs, targets, sample_weights)`. - `validation_data` is not yet supported with - `tf.distribute.experimental.ParameterServerStrategy`. - shuffle: Boolean (whether to shuffle the training data - before each epoch) or str (for 'batch'). This argument is ignored - when `x` is a generator or an object of tf.data.Dataset. - 'batch' is a special option for dealing - with the limitations of HDF5 data; it shuffles in batch-sized - chunks. Has no effect when `steps_per_epoch` is not `None`. 
- class_weight: Optional dictionary mapping class indices (integers) - to a weight (float) value, used for weighting the loss function - (during training only). - This can be useful to tell the model to - "pay more attention" to samples from - an under-represented class. - sample_weight: Optional Numpy array of weights for - the training samples, used for weighting the loss function - (during training only). You can either pass a flat (1D) - Numpy array with the same length as the input samples - (1:1 mapping between weights and samples), - or in the case of temporal data, - you can pass a 2D array with shape - `(samples, sequence_length)`, - to apply a different weight to every timestep of every sample. This - argument is not supported when `x` is a dataset, generator, or - `keras.utils.Sequence` instance, instead provide the sample_weights - as the third element of `x`. - Note that sample weighting does not apply to metrics specified - via the `metrics` argument in `compile()`. To apply sample weighting - to your metrics, you can specify them via the `weighted_metrics` in - `compile()` instead. - initial_epoch: Integer. - Epoch at which to start training - (useful for resuming a previous training run). - steps_per_epoch: Integer or `None`. - Total number of steps (batches of samples) - before declaring one epoch finished and starting the - next epoch. When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps_per_epoch' - is None, the epoch will run until the input dataset is exhausted. - When passing an infinitely repeating dataset, you must specify the - `steps_per_epoch` argument. If `steps_per_epoch=-1` the training - will run indefinitely with an infinitely repeating dataset. - This argument is not supported with array inputs. - When using `tf.distribute.experimental.ParameterServerStrategy`: - * `steps_per_epoch=None` is not supported. - validation_steps: Only relevant if `validation_data` is provided and - is a `tf.data` dataset. Total number of steps (batches of - samples) to draw before stopping when performing validation - at the end of every epoch. If 'validation_steps' is None, validation - will run until the `validation_data` dataset is exhausted. In the - case of an infinitely repeated dataset, it will run into an - infinite loop. If 'validation_steps' is specified and only part of - the dataset will be consumed, the evaluation will start from the - beginning of the dataset at each epoch. This ensures that the same - validation samples are used every time. - validation_batch_size: Integer or `None`. - Number of samples per validation batch. - If unspecified, will default to `batch_size`. - Do not specify the `validation_batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - validation_freq: Only relevant if validation data is provided. Integer - or `collections.abc.Container` instance (e.g. list, tuple, etc.). - If an integer, specifies how many training epochs to run before a - new validation run is performed, e.g. `validation_freq=2` runs - validation every 2 epochs. If a Container, specifies the epochs on - which to run validation, e.g. `validation_freq=[1, 2, 10]` runs - validation at the end of the 1st, 2nd, and 10th epochs. - max_queue_size: Integer. 
Used for generator or `keras.utils.Sequence` - input only. Maximum size for the generator queue. - If unspecified, `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up - when using process-based threading. If unspecified, `workers` - will default to 1. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children processes. - - Unpacking behavior for iterator-like inputs: - A common pattern is to pass a tf.data.Dataset, generator, or - tf.keras.utils.Sequence to the `x` argument of fit, which will in fact - yield not only features (x) but optionally targets (y) and sample weights. - Keras requires that the output of such iterator-likes be unambiguous. The - iterator should return a tuple of length 1, 2, or 3, where the optional - second and third elements will be used for y and sample_weight - respectively. Any other type provided will be wrapped in a length one - tuple, effectively treating everything as 'x'. When yielding dicts, they - should still adhere to the top-level tuple structure. - e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate - features, targets, and weights from the keys of a single dict. - A notable unsupported data type is the namedtuple. The reason is that - it behaves like both an ordered datatype (tuple) and a mapping - datatype (dict). So given a namedtuple of the form: - `namedtuple("example_tuple", ["y", "x"])` - it is ambiguous whether to reverse the order of the elements when - interpreting the value. Even worse is a tuple of the form: - `namedtuple("other_tuple", ["x", "y", "z"])` - where it is unclear if the tuple was intended to be unpacked into x, y, - and sample_weight or passed through as a single element to `x`. As a - result the data processing code will simply raise a ValueError if it - encounters a namedtuple. (Along with instructions to remedy the issue.) + @traceback_utils.filter_traceback + def evaluate( + self, + x=None, + y=None, + batch_size=None, + verbose="auto", + sample_weight=None, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + return_dict=False, + **kwargs, + ): + """Returns the loss value & metrics values for the model in test mode. + + Computation is done in batches (see the `batch_size` arg.) + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. Should return a tuple + of either `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + - A generator or `keras.utils.Sequence` returning `(inputs, + targets)` or `(inputs, targets, sample_weights)`. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given in the `Unpacking + behavior for iterator-like inputs` section of `Model.fit`. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). 
It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). + If `x` is a dataset, generator or `keras.utils.Sequence` instance, + `y` should not be specified (since targets will be obtained from + the iterator/dataset). + batch_size: Integer or `None`. Number of samples per batch of + computation. If unspecified, `batch_size` will default to 32. Do + not specify the `batch_size` if your data is in the form of a + dataset, generators, or `keras.utils.Sequence` instances (since + they generate batches). + verbose: `"auto"`, 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = single line. + `"auto"` becomes 1 for most cases, and 2 when used with + `ParameterServerStrategy`. Note that the progress bar is not + particularly useful when logged to a file, so `verbose=2` is + recommended when not running interactively (e.g. in a production + environment). Defaults to 'auto'. + sample_weight: Optional Numpy array of weights for the test samples, + used for weighting the loss function. You can either pass a flat + (1D) Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), or in the case of + temporal data, you can pass a 2D array with shape `(samples, + sequence_length)`, to apply a different weight to every + timestep of every sample. This argument is not supported when + `x` is a dataset; instead, pass sample weights as the third + element of `x`. + steps: Integer or `None`. Total number of steps (batches of samples) + before declaring the evaluation round finished. Ignored with the + default value of `None`. If x is a `tf.data` dataset and `steps` + is None, 'evaluate' will run until the dataset is exhausted. This + argument is not supported with array inputs. + callbacks: List of `keras.callbacks.Callback` instances. List of + callbacks to apply during evaluation. See + [callbacks](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks). + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the generator + queue. If unspecified, `max_queue_size` will default to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default to + 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + **kwargs: Unused at this time. + + See the discussion of `Unpacking behavior for iterator-like inputs` for + `Model.fit`. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.evaluate` is wrapped in a `tf.function`. 
+ """ + base_layer.keras_api_gauge.get_cell("evaluate").set(True) + version_utils.disallow_legacy_graph("Model", "evaluate") + self._assert_compile_was_called() + self._check_call_args("evaluate") + self._check_sample_weight_warning(x, sample_weight) + _disallow_inside_tf_function("evaluate") + use_cached_eval_dataset = kwargs.pop("_use_cached_eval_dataset", False) + if kwargs: + raise TypeError(f"Invalid keyword arguments: {list(kwargs.keys())}") + + if self.distribute_strategy._should_use_with_coordinator: + self._cluster_coordinator = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + self.distribute_strategy + ) + ) + + verbose = _get_verbosity(verbose, self.distribute_strategy) + if self._pss_evaluation_shards: + self._disallow_exact_eval_with_add_metrics() + with self.distribute_strategy.scope(): + # Use cached evaluation data only when it's called in `Model.fit` + if ( + use_cached_eval_dataset + and getattr(self, "_eval_data_handler", None) is not None + ): + data_handler = self._eval_data_handler + else: + # Creates a `tf.data.Dataset` and handles batch and epoch + # iteration. + data_handler = data_adapter.get_data_handler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + pss_evaluation_shards=self._pss_evaluation_shards, + ) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self, + verbose=verbose, + epochs=1, + steps=data_handler.inferred_steps, + ) + + # Initialize to prevent errors if 0 epochs are evaluated. + logs = {} + + test_function_runner = self._get_test_function_runner(callbacks) + self._test_counter.assign(0) + callbacks.on_test_begin() + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + for ( + _, + dataset_or_iterator, + ) in data_handler.enumerate_epochs(): # Single epoch. + self.reset_metrics() + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + with tf.profiler.experimental.Trace( + "test", step_num=step, _r=1 + ): + callbacks.on_test_batch_begin(step) + logs = test_function_runner.run_step( + dataset_or_iterator, + data_handler, + step, + self._pss_evaluation_shards, + ) + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + # Override with model metrics instead of last step logs + if self._pss_evaluation_shards: + logs = self._aggregate_exact_metrics(logs) + else: + logs = self._validate_and_get_metrics_result(logs) + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_test_end(logs=logs) - Returns: - A `History` object. Its `History.history` attribute is - a record of training loss values and metrics values - at successive epochs, as well as validation loss values - and validation metrics values (if applicable). 
+ if return_dict: + return logs + else: + return flatten_metrics_in_order(logs, self.metrics_names) + + def _disallow_exact_eval_with_add_metrics(self): + metrics_from_add_metric = [ + metric + for layer in self._flatten_layers() + for metric in layer._metrics + ] + compiled_metrics = self.compiled_metrics.metrics + if any( + [ + metric not in compiled_metrics + for metric in metrics_from_add_metric + ] + ): + raise ValueError( + "Detected that a metric was added to this model " + "via `Model.add_metric`. This is not currently " + "supported when using exact evaluation with " + "`tf.distribute.ParameterServerStrategy`." + ) + + def _infer_exact_eval_shards(self, pss_evaluation_shards): + if not self.distribute_strategy._should_use_with_coordinator: + return 0 + if pss_evaluation_shards == "auto": + # TODO(b/264265138) evaluate and improve this heuristic + return self.distribute_strategy._num_workers * 5 + return pss_evaluation_shards + + def _get_test_function_runner(self, callbacks): + if ( + self._pss_evaluation_shards + and self.distribute_strategy._should_use_with_coordinator + ): + self.test_function = self._make_test_function_exact() + test_function_runner = _ExactTestFunction( + self.test_function, callbacks + ) + else: + self.test_function = self.make_test_function() + test_function_runner = _TestFunction(self.test_function, callbacks) + return test_function_runner + + def predict_step(self, data): + """The logic for one inference step. + + This method can be overridden to support custom inference logic. + This method is called by `Model.make_predict_function`. + + This method should contain the mathematical logic for one step of + inference. This typically includes the forward pass. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings) should be left to + `Model.make_predict_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + The result of one inference step, typically the output of calling the + `Model` on data. + """ + x, _, _ = data_adapter.unpack_x_y_sample_weight(data) + return self(x, training=False) + + def make_predict_function(self, force=False): + """Creates a function that executes one step of inference. + + This method can be overridden to support custom inference logic. + This method is called by `Model.predict` and `Model.predict_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual inference + logic to `Model.predict_step`. + + This function is cached the first time `Model.predict` or + `Model.predict_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and regenerate the + function with `force=True`. + + Args: + force: Whether to regenerate the predict function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return the outputs of the `Model`. + """ + if self.predict_function is not None and not force: + return self.predict_function + + def step_function(model, iterator): + """Runs a single predict step.""" + + def run_step(data): + outputs = model.predict_step(data) + # Ensure counter is updated only if `predict_step` succeeds. 
+ with tf.control_dependencies(_minimum_control_deps(outputs)): + model._predict_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function( + run_step, jit_compile=True, reduce_retracing=True + ) + + data = next(iterator) + outputs = model.distribute_strategy.run(run_step, args=(data,)) + outputs = reduce_per_replica( + outputs, self.distribute_strategy, reduction="concat" + ) + return outputs + + # Special case if steps_per_execution is one. + if ( + self._steps_per_execution is None + or self._steps_per_execution.numpy().item() == 1 + and not self.autotune_steps_per_execution + ): + + def predict_function(iterator): + """Runs a prediction execution with a single step.""" + return step_function(self, iterator) - Raises: - RuntimeError: 1. If the model was never compiled or, - 2. If `model.fit` is wrapped in `tf.function`. + else: - ValueError: In case of mismatch between the provided input data - and what the model expects or when the input data is empty. - """ - base_layer.keras_api_gauge.get_cell('fit').set(True) - # Legacy graph support is contained in `training_v1.Model`. - version_utils.disallow_legacy_graph('Model', 'fit') - self._assert_compile_was_called() - self._check_call_args('fit') - _disallow_inside_tf_function('fit') - - verbose = _get_verbosity(verbose, self.distribute_strategy) - - if validation_split and validation_data is None: - # Create the validation data using the training data. Only supported for - # `Tensor` and `NumPy` input. - (x, y, sample_weight), validation_data = ( - data_adapter.train_validation_split( - (x, y, sample_weight), validation_split=validation_split)) - - if validation_data: - val_x, val_y, val_sample_weight = ( - data_adapter.unpack_x_y_sample_weight(validation_data)) - - if self.distribute_strategy._should_use_with_coordinator: # pylint: disable=protected-access - self._cluster_coordinator = tf.distribute.experimental.coordinator.ClusterCoordinator( - self.distribute_strategy) - - with self.distribute_strategy.scope(), \ - training_utils.RespectCompiledTrainableState(self): - # Creates a `tf.data.Dataset` and handles batch and epoch iteration. - data_handler = data_adapter.get_data_handler( - x=x, - y=y, - sample_weight=sample_weight, - batch_size=batch_size, - steps_per_epoch=steps_per_epoch, - initial_epoch=initial_epoch, - epochs=epochs, - shuffle=shuffle, - class_weight=class_weight, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - model=self, - steps_per_execution=self._steps_per_execution) - - # Container that configures and calls `tf.keras.Callback`s. - if not isinstance(callbacks, callbacks_module.CallbackList): - callbacks = callbacks_module.CallbackList( - callbacks, - add_history=True, - add_progbar=verbose != 0, - model=self, - verbose=verbose, - epochs=epochs, - steps=data_handler.inferred_steps) - - self.stop_training = False - self.train_function = self.make_train_function() - self._train_counter.assign(0) - callbacks.on_train_begin() - training_logs = None - # Handle fault-tolerance for multi-worker. - # TODO(omalleyt): Fix the ordering issues that mean this has to - # happen after `callbacks.on_train_begin`. 
- data_handler._initial_epoch = ( # pylint: disable=protected-access - self._maybe_load_initial_epoch_from_ckpt(initial_epoch)) - logs = None - for epoch, iterator in data_handler.enumerate_epochs(): - self.reset_metrics() - callbacks.on_epoch_begin(epoch) - with data_handler.catch_stop_iteration(): - data_handler._initial_step = self._maybe_load_initial_step_from_ckpt() # pylint: disable=protected-access - for step in data_handler.steps(): - with tf.profiler.experimental.Trace( - 'train', - epoch_num=epoch, - step_num=step, + def predict_function(iterator): + """Runs a prediction execution with multiple steps.""" + outputs = step_function(self, iterator) + for _ in tf.range(self._steps_per_execution - 1): + tf.autograph.experimental.set_loop_options( + shape_invariants=[ + ( + outputs, + tf.nest.map_structure( + lambda t: tf_utils.get_tensor_spec( + t, dynamic_batch=True + ).shape, + outputs, + ), + ) + ] + ) + step_outputs = step_function(self, iterator) + outputs = tf.nest.map_structure( + lambda t1, t2: concat([t1, t2]), outputs, step_outputs + ) + return outputs + + if not self.run_eagerly: + predict_function = tf.function( + predict_function, reduce_retracing=True + ) + self.predict_function = predict_function + + return self.predict_function + + @traceback_utils.filter_traceback + def predict( + self, + x, + batch_size=None, + verbose="auto", + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Generates output predictions for the input samples. + + Computation is done in batches. This method is designed for batch + processing of large numbers of inputs. It is not intended for use inside + loops that iterate over your data and process small numbers of inputs + at a time. + + For small numbers of inputs that fit in one batch, + directly use `__call__()` for faster execution, e.g., + `model(x)`, or `model(x, training=False)` if you have layers such as + `tf.keras.layers.BatchNormalization` that behave differently during + inference. You may pair the individual model call with a `tf.function` + for additional performance inside your inner loop. + If you need access to numpy array values instead of tensors after your + model call, you can use `tensor.numpy()` to get the numpy array value of + an eager tensor. + + Also, note that test loss is not affected by + regularization layers like noise and dropout. + + Note: See [this FAQ entry]( + https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call) + for more details about the difference between `Model` methods + `predict()` and `__call__()`. + + Args: + x: Input samples. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A `tf.data` dataset. + - A generator or `keras.utils.Sequence` instance. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given in the `Unpacking + behavior for iterator-like inputs` section of `Model.fit`. + batch_size: Integer or `None`. + Number of samples per batch. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of dataset, generators, or `keras.utils.Sequence` instances + (since they generate batches). + verbose: `"auto"`, 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = single line. 
+                `"auto"` becomes 1 for most cases, and 2 when used with
+                `ParameterServerStrategy`. Note that the progress bar is not
+                particularly useful when logged to a file, so `verbose=2` is
+                recommended when not running interactively (e.g. in a
+                production environment). Defaults to `"auto"`.
+            steps: Total number of steps (batches of samples)
+                before declaring the prediction round finished.
+                Ignored with the default value of `None`. If `x` is a `tf.data`
+                dataset and `steps` is `None`, `predict()` will
+                run until the input dataset is exhausted.
+            callbacks: List of `keras.callbacks.Callback` instances.
+                List of callbacks to apply during prediction.
+                See [callbacks](
+                https://www.tensorflow.org/api_docs/python/tf/keras/callbacks).
+            max_queue_size: Integer. Used for generator or
+                `keras.utils.Sequence` input only. Maximum size for the
+                generator queue. If unspecified, `max_queue_size` will default
+                to 10.
+            workers: Integer. Used for generator or `keras.utils.Sequence`
+                input only. Maximum number of processes to spin up when using
+                process-based threading. If unspecified, `workers` will default
+                to 1.
+            use_multiprocessing: Boolean. Used for generator or
+                `keras.utils.Sequence` input only. If `True`, use process-based
+                threading. If unspecified, `use_multiprocessing` will default to
+                `False`. Note that because this implementation relies on
+                multiprocessing, you should not pass non-pickleable arguments to
+                the generator as they can't be passed easily to children
+                processes.
+
+        See the discussion of `Unpacking behavior for iterator-like inputs` for
+        `Model.fit`. Note that Model.predict uses the same interpretation rules
+        as `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for
+        all three methods.
+
+        Returns:
+            Numpy array(s) of predictions.
+
+        Raises:
+            RuntimeError: If `model.predict` is wrapped in a `tf.function`.
+            ValueError: In case of mismatch between the provided
+                input data and the model's expectations,
+                or in case a stateful model receives a number of samples
+                that is not a multiple of the batch size.
+        """
+        base_layer.keras_api_gauge.get_cell("predict").set(True)
+        version_utils.disallow_legacy_graph("Model", "predict")
+        self._check_call_args("predict")
+        _disallow_inside_tf_function("predict")
+
+        # TODO(yashkatariya): Cache model on the coordinator for faster
+        # prediction. If running under PSS, then swap it with OneDeviceStrategy
+        # so that execution will run on the coordinator.
+        original_pss_strategy = None
+        if self.distribute_strategy._should_use_with_coordinator:
+            original_pss_strategy = self.distribute_strategy
+            self._distribution_strategy = None
+
+        # Cluster coordinator is set by `.fit()` and `.evaluate()` which is not
+        # needed in `.predict()` because all the predictions happen on the
+        # coordinator/locally.
+        if self._cluster_coordinator:
+            self._cluster_coordinator = None
+
+        verbose = _get_verbosity(verbose, self.distribute_strategy)
+        outputs = None
+        with self.distribute_strategy.scope():
+            # Creates a `tf.data.Dataset` and handles batch and epoch iteration.
+ dataset_types = (tf.compat.v1.data.Dataset, tf.data.Dataset) + if ( + self._in_multi_worker_mode() + or _is_tpu_multi_host(self.distribute_strategy) + ) and isinstance(x, dataset_types): + try: + options = tf.data.Options() + data_option = tf.data.experimental.AutoShardPolicy.DATA + options.experimental_distribute.auto_shard_policy = ( + data_option + ) + x = x.with_options(options) + except ValueError: + warnings.warn( + "Using Model.predict with MultiWorkerMirroredStrategy " + "or TPUStrategy and AutoShardPolicy.FILE might lead to " + "out-of-order result. Consider setting it to " + "AutoShardPolicy.DATA.", + stacklevel=2, + ) + + data_handler = data_adapter.get_data_handler( + x=x, batch_size=batch_size, - _r=1): - callbacks.on_train_batch_begin(step) - tmp_logs = self.train_function(iterator) - if data_handler.should_sync: - context.async_wait() - logs = tmp_logs # No error, now safe to assign to logs. - end_step = step + data_handler.step_increment - callbacks.on_train_batch_end(end_step, logs) - if self.stop_training: - break - - logs = tf_utils.sync_to_numpy_or_python_type(logs) - if logs is None: - raise ValueError('Unexpected result of `train_function` ' - '(Empty logs). Please use ' - '`Model.compile(..., run_eagerly=True)`, or ' - '`tf.config.run_functions_eagerly(True)` for more ' - 'information of where went wrong, or file a ' - 'issue/bug to `tf.keras`.') - epoch_logs = copy.copy(logs) - - # Run validation. - if validation_data and self._should_eval(epoch, validation_freq): - # Create data_handler for evaluation and cache it. - if getattr(self, '_eval_data_handler', None) is None: - self._eval_data_handler = data_adapter.get_data_handler( - x=val_x, - y=val_y, - sample_weight=val_sample_weight, - batch_size=validation_batch_size or batch_size, - steps_per_epoch=validation_steps, + steps_per_epoch=steps, initial_epoch=0, epochs=1, max_queue_size=max_queue_size, workers=workers, use_multiprocessing=use_multiprocessing, model=self, - steps_per_execution=self._steps_per_execution) - val_logs = self.evaluate( - x=val_x, - y=val_y, - sample_weight=val_sample_weight, - batch_size=validation_batch_size or batch_size, - steps=validation_steps, - callbacks=callbacks, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - return_dict=True, - _use_cached_eval_dataset=True) - val_logs = {'val_' + name: val for name, val in val_logs.items()} - epoch_logs.update(val_logs) - - callbacks.on_epoch_end(epoch, epoch_logs) - training_logs = epoch_logs - if self.stop_training: - break - - if isinstance(self.optimizer, optimizer_experimental.Optimizer): - self.optimizer.finalize_variable_values(self.trainable_variables) - - # If eval data_handler exists, delete it after all epochs are done. - if getattr(self, '_eval_data_handler', None) is not None: - del self._eval_data_handler - callbacks.on_train_end(logs=training_logs) - return self.history - - def test_step(self, data): - """The logic for one evaluation step. - - This method can be overridden to support custom evaluation logic. - This method is called by `Model.make_test_function`. - - This function should contain the mathematical logic for one step of - evaluation. - This typically includes the forward pass, loss calculation, and metrics - updates. - - Configuration details for *how* this logic is run (e.g. `tf.function` and - `tf.distribute.Strategy` settings), should be left to - `Model.make_test_function`, which can also be overridden. - - Args: - data: A nested structure of `Tensor`s. 
- - Returns: - A `dict` containing values that will be passed to - `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the - values of the `Model`'s metrics are returned. - """ - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - - y_pred = self(x, training=False) - # Updates stateful loss metrics. - self.compute_loss(x, y, y_pred, sample_weight) - return self.compute_metrics(x, y, y_pred, sample_weight) - - def make_test_function(self, force=False): - """Creates a function that executes one step of evaluation. - - This method can be overridden to support custom evaluation logic. - This method is called by `Model.evaluate` and `Model.test_on_batch`. - - Typically, this method directly controls `tf.function` and - `tf.distribute.Strategy` settings, and delegates the actual evaluation - logic to `Model.test_step`. - - This function is cached the first time `Model.evaluate` or - `Model.test_on_batch` is called. The cache is cleared whenever - `Model.compile` is called. You can skip the cache and generate again the - function with `force=True`. - - Args: - force: Whether to regenerate the test function and skip the cached - function if available. - - Returns: - Function. The function created by this method should accept a - `tf.data.Iterator`, and return a `dict` containing values that will - be passed to `tf.keras.Callbacks.on_test_batch_end`. - """ - if self.test_function is not None and not force: - return self.test_function - - def step_function(model, iterator): - """Runs a single evaluation step.""" - - def run_step(data): - outputs = model.test_step(data) - # Ensure counter is updated only if `test_step` succeeds. - with tf.control_dependencies(_minimum_control_deps(outputs)): - model._test_counter.assign_add(1) # pylint: disable=protected-access - return outputs - - if self._jit_compile: - run_step = tf.function( - run_step, jit_compile=True, reduce_retracing=True) - - data = next(iterator) - outputs = model.distribute_strategy.run(run_step, args=(data,)) - outputs = reduce_per_replica( - outputs, self.distribute_strategy, reduction='first') - return outputs - - # Special case if steps_per_execution is one. - if (self._steps_per_execution is None or - self._steps_per_execution.numpy().item() == 1): - - def test_function(iterator): - """Runs a test execution with a single step.""" - return step_function(self, iterator) - - if not self.run_eagerly: - test_function = tf.function( - test_function, reduce_retracing=True) - - if self._cluster_coordinator: - self.test_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - test_function, args=(it,)) - else: - self.test_function = test_function - - # If we're using a coordinator, use the value of self._steps_per_execution - # at the time the function is called/scheduled, and not when it is actually - # executed. - elif self._cluster_coordinator: - - def test_function(iterator, steps_per_execution): - """Runs a test execution with multiple steps.""" - for _ in tf.range(steps_per_execution): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - test_function = tf.function( - test_function, reduce_retracing=True) - - self.test_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - test_function, - args=(it, self._steps_per_execution.value())) - else: + steps_per_execution=self._steps_per_execution, + ) + + # Container that configures and calls `tf.keras.Callback`s. 
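For context on the sharding tweak in the hunk above (switching a distributed input dataset to `AutoShardPolicy.DATA` so prediction results stay in order), here is a hedged, self-contained sketch of the same `tf.data` options call; the dataset is an assumed stand-in for the user-supplied `x`:

```python
import tensorflow as tf

# Toy dataset standing in for the user-supplied input.
dataset = tf.data.Dataset.from_tensor_slices(tf.ones((16, 3))).batch(4)

# Shard by elements rather than by files, as the warning above suggests;
# each worker then reads every n-th element and batch order is preserved.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = (
    tf.data.experimental.AutoShardPolicy.DATA
)
dataset = dataset.with_options(options)
```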
+            if not isinstance(callbacks, callbacks_module.CallbackList):
+                callbacks = callbacks_module.CallbackList(
+                    callbacks,
+                    add_history=True,
+                    add_progbar=verbose != 0,
+                    model=self,
+                    verbose=verbose,
+                    epochs=1,
+                    steps=data_handler.inferred_steps,
+                )
+
+            self.predict_function = self.make_predict_function()
+            self._predict_counter.assign(0)
+            callbacks.on_predict_begin()
+            if self.autotune_steps_per_execution:
+                self._steps_per_execution_tuner.start()
+            batch_outputs = None
+            for _, iterator in data_handler.enumerate_epochs():  # Single epoch.
+                with data_handler.catch_stop_iteration():
+                    for step in data_handler.steps():
+                        callbacks.on_predict_batch_begin(step)
+                        tmp_batch_outputs = self.predict_function(iterator)
+                        if data_handler.should_sync:
+                            context.async_wait()
+                        batch_outputs = (
+                            tmp_batch_outputs  # No error, now safe to assign.
+                        )
+                        if outputs is None:
+                            outputs = tf.nest.map_structure(
+                                lambda batch_output: [batch_output],
+                                batch_outputs,
+                            )
+                        else:
+                            tf.__internal__.nest.map_structure_up_to(
+                                batch_outputs,
+                                lambda output, batch_output: output.append(
+                                    batch_output
+                                ),
+                                outputs,
+                                batch_outputs,
+                            )
+                        end_step = step + data_handler.step_increment
+                        callbacks.on_predict_batch_end(
+                            end_step, {"outputs": batch_outputs}
+                        )
+            if batch_outputs is None:
+                raise ValueError(
+                    "Unexpected result of `predict_function` "
+                    "(Empty batch_outputs). Please use "
+                    "`Model.compile(..., run_eagerly=True)`, or "
+                    "`tf.config.run_functions_eagerly(True)` for more "
+                    "information about where it went wrong, or file an "
+                    "issue/bug with `tf.keras`."
+                )
+            if self.autotune_steps_per_execution:
+                self._steps_per_execution_tuner.stop()
+            callbacks.on_predict_end()
+            all_outputs = tf.__internal__.nest.map_structure_up_to(
+                batch_outputs, potentially_ragged_concat, outputs
+            )
+
+        # If originally PSS strategy was used, then replace it back since
+        # predict is running under `OneDeviceStrategy` after the swap and once
+        # it's done we need to replace it back to PSS again.
+        if original_pss_strategy is not None:
+            self._distribution_strategy = original_pss_strategy
+
+        return tf_utils.sync_to_numpy_or_python_type(all_outputs)
+
+    def reset_metrics(self):
+        """Resets the state of all the metrics in the model.
+
+        Examples:
+
+        >>> inputs = tf.keras.layers.Input(shape=(3,))
+        >>> outputs = tf.keras.layers.Dense(2)(inputs)
+        >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
+        >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"])
+
+        >>> x = np.random.random((2, 3))
+        >>> y = np.random.randint(0, 2, (2, 2))
+        >>> _ = model.fit(x, y, verbose=0)
+        >>> assert all(float(m.result()) for m in model.metrics)
+
+        >>> model.reset_metrics()
+        >>> assert all(float(m.result()) == 0 for m in model.metrics)
+
+        """
+        for m in self.metrics:
+            m.reset_state()
+
+    def train_on_batch(
+        self,
+        x,
+        y=None,
+        sample_weight=None,
+        class_weight=None,
+        reset_metrics=True,
+        return_dict=False,
+    ):
+        """Runs a single gradient update on a single batch of data.
+
+        Args:
+            x: Input data. It could be:
+                - A Numpy array (or array-like), or a list of arrays
+                    (in case the model has multiple inputs).
+                - A TensorFlow tensor, or a list of tensors
+                    (in case the model has multiple inputs).
+                - A dict mapping input names to the corresponding array/tensors,
+                    if the model has named inputs.
+            y: Target data. Like the input data `x`, it could be either Numpy
+                array(s) or TensorFlow tensor(s).
+ sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. In the case + of temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) to apply to the model's loss for the samples + from this class during training. This can be useful to tell the + model to "pay more attention" to samples from an under-represented + class. When `class_weight` is specified and targets have a rank of + 2 or greater, either `y` must be one-hot encoded, or an explicit + final dimension of `1` must be included for sparse class labels. + reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + + Returns: + Scalar training loss + (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.train_on_batch` is wrapped in a `tf.function`. + """ + self._assert_compile_was_called() + self._check_call_args("train_on_batch") + _disallow_inside_tf_function("train_on_batch") + if reset_metrics: + self.reset_metrics() + with self.distribute_strategy.scope(), training_utils.RespectCompiledTrainableState( # noqa: E501 + self + ): + iterator = data_adapter.single_batch_iterator( + self.distribute_strategy, x, y, sample_weight, class_weight + ) + self.train_function = self.make_train_function() + logs = self.train_function(iterator) - def test_function(iterator): - """Runs a test execution with multiple steps.""" - for _ in tf.range(self._steps_per_execution): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - test_function = tf.function( - test_function, reduce_retracing=True) - self.test_function = test_function - - return self.test_function - - @traceback_utils.filter_traceback - def evaluate(self, - x=None, - y=None, - batch_size=None, - verbose='auto', - sample_weight=None, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - return_dict=False, - **kwargs): - """Returns the loss value & metrics values for the model in test mode. - - Computation is done in batches (see the `batch_size` arg.) + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if return_dict: + return logs + else: + return flatten_metrics_in_order(logs, self.metrics_names) - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. Should return a tuple - of either `(inputs, targets)` or - `(inputs, targets, sample_weights)`. - - A generator or `keras.utils.Sequence` returning `(inputs, targets)` - or `(inputs, targets, sample_weights)`. 
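Since the `+` hunk above rewires `train_on_batch` around a single-batch iterator and the cached `train_function`, a short usage sketch may help; the model and data below are assumed examples, not part of this change:

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
model.compile(optimizer="sgd", loss="mse", metrics=["mae"])

x = np.random.random((32, 3))
y = np.random.random((32, 1))

# One gradient update on one batch; return_dict=True keys the results by
# metric name instead of ordering them by `model.metrics_names`.
logs = model.train_on_batch(x, y, return_dict=True)
print(logs)  # e.g. {'loss': 0.27, 'mae': 0.42}
```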
- A more detailed description of unpacking behavior for iterator types - (Dataset, generator, Sequence) is given in the `Unpacking behavior - for iterator-like inputs` section of `Model.fit`. - y: Target data. Like the input data `x`, it could be either Numpy - array(s) or TensorFlow tensor(s). It should be consistent with `x` - (you cannot have Numpy inputs and tensor targets, or inversely). If - `x` is a dataset, generator or `keras.utils.Sequence` instance, `y` - should not be specified (since targets will be obtained from the - iterator/dataset). - batch_size: Integer or `None`. Number of samples per batch of - computation. If unspecified, `batch_size` will default to 32. Do not - specify the `batch_size` if your data is in the form of a dataset, - generators, or `keras.utils.Sequence` instances (since they generate - batches). - verbose: `"auto"`, 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = single line. - `"auto"` defaults to 1 for most cases, and to 2 when used with - `ParameterServerStrategy`. Note that the progress bar is not - particularly useful when logged to a file, so `verbose=2` is - recommended when not running interactively (e.g. in a production - environment). - sample_weight: Optional Numpy array of weights for the test samples, - used for weighting the loss function. You can either pass a flat (1D) - Numpy array with the same length as the input samples - (1:1 mapping between weights and samples), or in the case of - temporal data, you can pass a 2D array with shape `(samples, - sequence_length)`, to apply a different weight to every timestep - of every sample. This argument is not supported when `x` is a - dataset, instead pass sample weights as the third element of `x`. - steps: Integer or `None`. Total number of steps (batches of samples) - before declaring the evaluation round finished. Ignored with the - default value of `None`. If x is a `tf.data` dataset and `steps` is - None, 'evaluate' will run until the dataset is exhausted. This - argument is not supported with array inputs. - callbacks: List of `keras.callbacks.Callback` instances. List of - callbacks to apply during evaluation. See - [callbacks](/api_docs/python/tf/keras/callbacks). - max_queue_size: Integer. Used for generator or `keras.utils.Sequence` - input only. Maximum size for the generator queue. If unspecified, - `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up when using process-based - threading. If unspecified, `workers` will default to 1. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to the - generator as they can't be passed easily to children processes. - return_dict: If `True`, loss and metric results are returned as a dict, - with each key being the name of the metric. If `False`, they are - returned as a list. - **kwargs: Unused at this time. - - See the discussion of `Unpacking behavior for iterator-like inputs` for - `Model.fit`. + def test_on_batch( + self, + x, + y=None, + sample_weight=None, + reset_metrics=True, + return_dict=False, + ): + """Test the model on a single batch of samples. + + Args: + x: Input data. 
It could be: + - A Numpy array (or array-like), or a list of arrays (in case the + model has multiple inputs). + - A TensorFlow tensor, or a list of tensors (in case the model has + multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). + sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. In the case + of temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. + reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.test_on_batch` is wrapped in a + `tf.function`. + """ + self._assert_compile_was_called() + self._check_call_args("test_on_batch") + _disallow_inside_tf_function("test_on_batch") + if reset_metrics: + self.reset_metrics() + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator( + self.distribute_strategy, x, y, sample_weight + ) + self.test_function = self.make_test_function() + logs = self.test_function(iterator) - Returns: - Scalar test loss (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if return_dict: + return logs + else: + return flatten_metrics_in_order(logs, self.metrics_names) + + def predict_on_batch(self, x): + """Returns predictions for a single batch of samples. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays (in case the + model has multiple inputs). + - A TensorFlow tensor, or a list of tensors (in case the model has + multiple inputs). + + Returns: + Numpy array(s) of predictions. + + Raises: + RuntimeError: If `model.predict_on_batch` is wrapped in a + `tf.function`. + """ + self._check_call_args("predict_on_batch") + _disallow_inside_tf_function("predict_on_batch") + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator( + self.distribute_strategy, x + ) + self.predict_function = self.make_predict_function() + outputs = self.predict_function(iterator) + return tf_utils.sync_to_numpy_or_python_type(outputs) + + @doc_controls.do_not_generate_docs + def fit_generator( + self, + generator, + steps_per_epoch=None, + epochs=1, + verbose=1, + callbacks=None, + validation_data=None, + validation_steps=None, + validation_freq=1, + class_weight=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + shuffle=True, + initial_epoch=0, + ): + """Fits the model on data yielded batch-by-batch by a Python generator. 
+ + DEPRECATED: + `Model.fit` now supports generators, so there is no longer any need to + use this endpoint. + """ + warnings.warn( + "`Model.fit_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.fit`, which supports generators.", + stacklevel=2, + ) + return self.fit( + generator, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=validation_steps, + validation_freq=validation_freq, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + shuffle=shuffle, + initial_epoch=initial_epoch, + ) - Raises: - RuntimeError: If `model.evaluate` is wrapped in a `tf.function`. - """ - base_layer.keras_api_gauge.get_cell('evaluate').set(True) - version_utils.disallow_legacy_graph('Model', 'evaluate') - self._assert_compile_was_called() - self._check_call_args('evaluate') - self._check_sample_weight_warning(x, sample_weight) - _disallow_inside_tf_function('evaluate') - use_cached_eval_dataset = kwargs.pop('_use_cached_eval_dataset', False) - if kwargs: - raise TypeError(f'Invalid keyword arguments: {list(kwargs.keys())}') - - if self.distribute_strategy._should_use_with_coordinator: # pylint: disable=protected-access - self._cluster_coordinator = tf.distribute.experimental.coordinator.ClusterCoordinator( - self.distribute_strategy) - - verbose = _get_verbosity(verbose, self.distribute_strategy) - with self.distribute_strategy.scope(): - # Use cached evaluation data only when it's called in `Model.fit` - if (use_cached_eval_dataset - and getattr(self, '_eval_data_handler', None) is not None): - data_handler = self._eval_data_handler - else: - # Creates a `tf.data.Dataset` and handles batch and epoch iteration. - data_handler = data_adapter.get_data_handler( - x=x, - y=y, - sample_weight=sample_weight, - batch_size=batch_size, - steps_per_epoch=steps, - initial_epoch=0, - epochs=1, + @doc_controls.do_not_generate_docs + def evaluate_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Evaluates the model on a data generator. + + DEPRECATED: + `Model.evaluate` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.evaluate_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.evaluate`, which supports generators.", + stacklevel=2, + ) + self._check_call_args("evaluate_generator") + + return self.evaluate( + generator, + steps=steps, max_queue_size=max_queue_size, workers=workers, use_multiprocessing=use_multiprocessing, - model=self, - steps_per_execution=self._steps_per_execution) - - # Container that configures and calls `tf.keras.Callback`s. - if not isinstance(callbacks, callbacks_module.CallbackList): - callbacks = callbacks_module.CallbackList( - callbacks, - add_history=True, - add_progbar=verbose != 0, - model=self, verbose=verbose, - epochs=1, - steps=data_handler.inferred_steps) - - logs = {} - self.test_function = self.make_test_function() - self._test_counter.assign(0) - callbacks.on_test_begin() - for _, iterator in data_handler.enumerate_epochs(): # Single epoch. 
- self.reset_metrics() - with data_handler.catch_stop_iteration(): - for step in data_handler.steps(): - with tf.profiler.experimental.Trace('test', step_num=step, _r=1): - callbacks.on_test_batch_begin(step) - tmp_logs = self.test_function(iterator) - if data_handler.should_sync: - context.async_wait() - logs = tmp_logs # No error, now safe to assign to logs. - end_step = step + data_handler.step_increment - callbacks.on_test_batch_end(end_step, logs) - logs = tf_utils.sync_to_numpy_or_python_type(logs) - callbacks.on_test_end(logs=logs) - - if return_dict: - return logs - else: - return flatten_metrics_in_order(logs, self.metrics_names) - - def predict_step(self, data): - """The logic for one inference step. - - This method can be overridden to support custom inference logic. - This method is called by `Model.make_predict_function`. - - This method should contain the mathematical logic for one step of inference. - This typically includes the forward pass. + callbacks=callbacks, + ) - Configuration details for *how* this logic is run (e.g. `tf.function` and - `tf.distribute.Strategy` settings), should be left to - `Model.make_predict_function`, which can also be overridden. - - Args: - data: A nested structure of `Tensor`s. - - Returns: - The result of one inference step, typically the output of calling the - `Model` on data. - """ - x, _, _ = data_adapter.unpack_x_y_sample_weight(data) - return self(x, training=False) - - def make_predict_function(self, force=False): - """Creates a function that executes one step of inference. - - This method can be overridden to support custom inference logic. - This method is called by `Model.predict` and `Model.predict_on_batch`. - - Typically, this method directly controls `tf.function` and - `tf.distribute.Strategy` settings, and delegates the actual evaluation - logic to `Model.predict_step`. - - This function is cached the first time `Model.predict` or - `Model.predict_on_batch` is called. The cache is cleared whenever - `Model.compile` is called. You can skip the cache and generate again the - function with `force=True`. - - Args: - force: Whether to regenerate the predict function and skip the cached - function if available. - - Returns: - Function. The function created by this method should accept a - `tf.data.Iterator`, and return the outputs of the `Model`. - """ - if self.predict_function is not None and not force: - return self.predict_function - - def step_function(model, iterator): - """Runs a single evaluation step.""" - - def run_step(data): - outputs = model.predict_step(data) - # Ensure counter is updated only if `test_step` succeeds. - with tf.control_dependencies(_minimum_control_deps(outputs)): - model._predict_counter.assign_add(1) # pylint: disable=protected-access - return outputs - - if self._jit_compile: - run_step = tf.function( - run_step, jit_compile=True, reduce_retracing=True) - - data = next(iterator) - outputs = model.distribute_strategy.run(run_step, args=(data,)) - outputs = reduce_per_replica( - outputs, self.distribute_strategy, reduction='concat') - return outputs - - # Special case if steps_per_execution is one. 
- if (self._steps_per_execution is None or - self._steps_per_execution.numpy().item() == 1): - - def predict_function(iterator): - """Runs an evaluation execution with a single step.""" - return step_function(self, iterator) - - else: - - def predict_function(iterator): - """Runs an evaluation execution with multiple steps.""" - outputs = step_function(self, iterator) - for _ in tf.range(self._steps_per_execution - 1): - tf.autograph.experimental.set_loop_options(shape_invariants=[( - outputs, - tf.nest.map_structure( - lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=True). - shape, outputs))]) - step_outputs = step_function(self, iterator) - outputs = tf.nest.map_structure(lambda t1, t2: concat([t1, t2]), - outputs, step_outputs) - return outputs - - if not self.run_eagerly: - predict_function = tf.function( - predict_function, reduce_retracing=True) - self.predict_function = predict_function - - return self.predict_function - - @traceback_utils.filter_traceback - def predict(self, - x, - batch_size=None, - verbose='auto', - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - """Generates output predictions for the input samples. - - Computation is done in batches. This method is designed for batch processing - of large numbers of inputs. It is not intended for use inside of loops - that iterate over your data and process small numbers of inputs at a time. - - For small numbers of inputs that fit in one batch, - directly use `__call__()` for faster execution, e.g., - `model(x)`, or `model(x, training=False)` if you have layers such as - `tf.keras.layers.BatchNormalization` that behave differently during - inference. You may pair the individual model call with a `tf.function` - for additional performance inside your inner loop. - If you need access to numpy array values instead of tensors after your - model call, you can use `tensor.numpy()` to get the numpy array value of - an eager tensor. - - Also, note the fact that test loss is not affected by - regularization layers like noise and dropout. - - Note: See [this FAQ entry]( - https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call) - for more details about the difference between `Model` methods `predict()` - and `__call__()`. - - Args: - x: Input samples. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A `tf.data` dataset. - - A generator or `keras.utils.Sequence` instance. - A more detailed description of unpacking behavior for iterator types - (Dataset, generator, Sequence) is given in the `Unpacking behavior - for iterator-like inputs` section of `Model.fit`. - batch_size: Integer or `None`. - Number of samples per batch. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of dataset, generators, or `keras.utils.Sequence` instances - (since they generate batches). - verbose: `"auto"`, 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = single line. - `"auto"` defaults to 1 for most cases, and to 2 when used with - `ParameterServerStrategy`. Note that the progress bar is not - particularly useful when logged to a file, so `verbose=2` is - recommended when not running interactively (e.g. in a production - environment). - steps: Total number of steps (batches of samples) - before declaring the prediction round finished. 
- Ignored with the default value of `None`. If x is a `tf.data` - dataset and `steps` is None, `predict()` will - run until the input dataset is exhausted. - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during prediction. - See [callbacks](/api_docs/python/tf/keras/callbacks). - max_queue_size: Integer. Used for generator or `keras.utils.Sequence` - input only. Maximum size for the generator queue. - If unspecified, `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up when using - process-based threading. If unspecified, `workers` will default - to 1. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children processes. - - See the discussion of `Unpacking behavior for iterator-like inputs` for - `Model.fit`. Note that Model.predict uses the same interpretation rules as - `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all - three methods. - - Returns: - Numpy array(s) of predictions. - - Raises: - RuntimeError: If `model.predict` is wrapped in a `tf.function`. - ValueError: In case of mismatch between the provided - input data and the model's expectations, - or in case a stateful model receives a number of samples - that is not a multiple of the batch size. - """ - base_layer.keras_api_gauge.get_cell('predict').set(True) - version_utils.disallow_legacy_graph('Model', 'predict') - self._check_call_args('predict') - _disallow_inside_tf_function('predict') - - # TODO(yashkatariya): Cache model on the coordinator for faster prediction. - # If running under PSS, then swap it with OneDeviceStrategy so that - # execution will run on the coordinator. - original_pss_strategy = None - if self.distribute_strategy._should_use_with_coordinator: # pylint: disable=protected-access - original_pss_strategy = self.distribute_strategy - self._distribution_strategy = None - - # Cluster coordinator is set by `.fit()` and `.evaluate()` which is not - # needed in `.predict()` because all the predictions happen on the - # coordinator/locally. - if self._cluster_coordinator: - self._cluster_coordinator = None - - verbose = _get_verbosity(verbose, self.distribute_strategy) - outputs = None - with self.distribute_strategy.scope(): - # Creates a `tf.data.Dataset` and handles batch and epoch iteration. - dataset_types = (tf.compat.v1.data.Dataset, tf.data.Dataset) - if (self._in_multi_worker_mode() or _is_tpu_multi_host( - self.distribute_strategy)) and isinstance(x, dataset_types): - try: - options = tf.data.Options() - data_option = tf.data.experimental.AutoShardPolicy.DATA - options.experimental_distribute.auto_shard_policy = data_option - x = x.with_options(options) - except ValueError: - warnings.warn( - 'Using Model.predict with MultiWorkerMirroredStrategy or ' - 'TPUStrategy and AutoShardPolicy.FILE might lead to out-of-order ' - 'result. 
Consider setting it to AutoShardPolicy.DATA.', - stacklevel=2) - - data_handler = data_adapter.get_data_handler( - x=x, - batch_size=batch_size, - steps_per_epoch=steps, - initial_epoch=0, - epochs=1, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - model=self, - steps_per_execution=self._steps_per_execution) - - # Container that configures and calls `tf.keras.Callback`s. - if not isinstance(callbacks, callbacks_module.CallbackList): - callbacks = callbacks_module.CallbackList( - callbacks, - add_history=True, - add_progbar=verbose != 0, - model=self, + @doc_controls.do_not_generate_docs + def predict_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Generates predictions for the input samples from a data generator. + + DEPRECATED: + `Model.predict` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.predict_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.predict`, which supports generators.", + stacklevel=2, + ) + return self.predict( + generator, + steps=steps, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, verbose=verbose, - epochs=1, - steps=data_handler.inferred_steps) - - self.predict_function = self.make_predict_function() - self._predict_counter.assign(0) - callbacks.on_predict_begin() - batch_outputs = None - for _, iterator in data_handler.enumerate_epochs(): # Single epoch. - with data_handler.catch_stop_iteration(): - for step in data_handler.steps(): - callbacks.on_predict_batch_begin(step) - tmp_batch_outputs = self.predict_function(iterator) - if data_handler.should_sync: - context.async_wait() - batch_outputs = tmp_batch_outputs # No error, now safe to assign. - if outputs is None: - outputs = tf.nest.map_structure(lambda batch_output: [batch_output], - batch_outputs) - else: - tf.__internal__.nest.map_structure_up_to( - batch_outputs, - lambda output, batch_output: output.append(batch_output), - outputs, batch_outputs) - end_step = step + data_handler.step_increment - callbacks.on_predict_batch_end(end_step, {'outputs': batch_outputs}) - if batch_outputs is None: - raise ValueError('Unexpected result of `predict_function` ' - '(Empty batch_outputs). Please use ' - '`Model.compile(..., run_eagerly=True)`, or ' - '`tf.config.run_functions_eagerly(True)` for more ' - 'information of where went wrong, or file a ' - 'issue/bug to `tf.keras`.') - callbacks.on_predict_end() - all_outputs = tf.__internal__.nest.map_structure_up_to( - batch_outputs, potentially_ragged_concat, outputs) - - # If originally PSS strategy was used, then replace it back since predict - # is running under `OneDeviceStrategy` after the swap and once its done - # we need to replace it back to PSS again. - if original_pss_strategy is not None: - self._distribution_strategy = original_pss_strategy - - return tf_utils.sync_to_numpy_or_python_type(all_outputs) - - def reset_metrics(self): - """Resets the state of all the metrics in the model. 
- - Examples: - - >>> inputs = tf.keras.layers.Input(shape=(3,)) - >>> outputs = tf.keras.layers.Dense(2)(inputs) - >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) - >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) - - >>> x = np.random.random((2, 3)) - >>> y = np.random.randint(0, 2, (2, 2)) - >>> _ = model.fit(x, y, verbose=0) - >>> assert all(float(m.result()) for m in model.metrics) - - >>> model.reset_metrics() - >>> assert all(float(m.result()) == 0 for m in model.metrics) - - """ - for m in self.metrics: - m.reset_state() - - def train_on_batch(self, - x, - y=None, - sample_weight=None, - class_weight=None, - reset_metrics=True, - return_dict=False): - """Runs a single gradient update on a single batch of data. - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - y: Target data. Like the input data `x`, it could be either Numpy - array(s) or TensorFlow tensor(s). - sample_weight: Optional array of the same length as x, containing - weights to apply to the model's loss for each sample. In the case of - temporal data, you can pass a 2D array with shape (samples, - sequence_length), to apply a different weight to every timestep of - every sample. - class_weight: Optional dictionary mapping class indices (integers) to a - weight (float) to apply to the model's loss for the samples from this - class during training. This can be useful to tell the model to "pay - more attention" to samples from an under-represented class. - reset_metrics: If `True`, the metrics returned will be only for this - batch. If `False`, the metrics will be statefully accumulated across - batches. - return_dict: If `True`, loss and metric results are returned as a dict, - with each key being the name of the metric. If `False`, they are - returned as a list. + callbacks=callbacks, + ) + + ###################################################################### + # Functions below are not training related. They are for model weights + # tracking, save/load, serialization, etc. + ###################################################################### + + @property + def trainable_weights(self): + self._assert_weights_created() + if not self._trainable: + return [] + trainable_variables = [] + for trackable_obj in self._self_tracked_trackables: + trainable_variables += trackable_obj.trainable_variables + trainable_variables += self._trainable_weights + return self._dedup_weights(trainable_variables) + + @property + def non_trainable_weights(self): + self._assert_weights_created() + non_trainable_variables = [] + for trackable_obj in self._self_tracked_trackables: + non_trainable_variables += trackable_obj.non_trainable_variables + + if not self._trainable: + # Return order is all trainable vars, then all non-trainable vars. 
+                trainable_variables = []
+                for trackable_obj in self._self_tracked_trackables:
+                    trainable_variables += trackable_obj.trainable_variables
+
+                non_trainable_variables = (
+                    trainable_variables
+                    + self._trainable_weights
+                    + non_trainable_variables
+                    + self._non_trainable_weights
+                )
+        else:
+            non_trainable_variables = (
+                non_trainable_variables + self._non_trainable_weights
+            )
+
+        return self._dedup_weights(non_trainable_variables)
+
+    def get_weights(self):
+        """Retrieves the weights of the model.
+
+        Returns:
+            A flat list of Numpy arrays.
+        """
+        with self.distribute_strategy.scope():
+            return super().get_weights()
+
+    @traceback_utils.filter_traceback
+    def save(self, filepath, overwrite=True, save_format=None, **kwargs):
+        """Saves a model as a TensorFlow SavedModel or HDF5 file.
+
+        See the [Serialization and Saving guide](
+        https://keras.io/guides/serialization_and_saving/) for details.
+
+        Args:
+            filepath: `str` or `pathlib.Path` object. Path where to save the
+                model.
+            overwrite: Whether we should overwrite any existing model at the
+                target location, or instead ask the user via an interactive
+                prompt.
+            save_format: Either `"keras"`, `"tf"`, `"h5"`,
+                indicating whether to save the model
+                in the native Keras format (`.keras`),
+                in the TensorFlow SavedModel format
+                (referred to as "SavedModel" below),
+                or in the legacy HDF5 format (`.h5`).
+                Defaults to `"tf"` in TF 2.X, and `"h5"` in TF 1.X.
+
+        SavedModel format arguments:
+            include_optimizer: Only applied to SavedModel and legacy HDF5
+                formats. If False, do not save the optimizer state.
+                Defaults to `True`.
+            signatures: Only applies to SavedModel format. Signatures to save
+                with the SavedModel. See the `signatures` argument in
+                `tf.saved_model.save` for details.
+            options: Only applies to SavedModel format.
+                `tf.saved_model.SaveOptions` object that specifies SavedModel
+                saving options.
+            save_traces: Only applies to SavedModel format. When enabled, the
+                SavedModel will store the function traces for each layer. This
+                can be disabled, so that only the configs of each layer are
+                stored. Defaults to `True`.
+                Disabling this will decrease serialization time
+                and reduce file size, but it requires that all custom
+                layers/models implement a `get_config()` method.
+
+        Example:
+
+        ```python
+        model = tf.keras.Sequential([
+            tf.keras.layers.Dense(5, input_shape=(3,)),
+            tf.keras.layers.Softmax()])
+        model.save("model.keras")
+        loaded_model = tf.keras.models.load_model("model.keras")
+        x = tf.random.uniform((10, 3))
+        assert np.allclose(model.predict(x), loaded_model.predict(x))
+        ```
+
+        Note that `model.save()` is an alias for `tf.keras.models.save_model()`.
+        """
+        saving_api.save_model(
+            self,
+            filepath=filepath,
+            overwrite=overwrite,
+            save_format=save_format,
+            **kwargs,
+        )
+
+    @traceback_utils.filter_traceback
+    def save_weights(
+        self, filepath, overwrite=True, save_format=None, options=None
+    ):
+        """Saves all layer weights.
+
+        Either saves in HDF5 or in TensorFlow format based on the `save_format`
+        argument.
+
+        When saving in HDF5 format, the weight file has:
+            - `layer_names` (attribute), a list of strings
+                (ordered names of model layers).
+            - For every layer, a `group` named `layer.name`
+                - For every such layer group, a group attribute `weight_names`,
+                    a list of strings
+                    (ordered names of weights tensor of the layer).
+                - For every weight in the layer, a dataset
+                    storing the weight value, named after the weight tensor.
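A minimal sketch of the HDF5 round trip described by the layout above; the file name and the one-layer model are assumptions for illustration:

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])
# The ".h5" suffix selects the HDF5 layout described above over the
# TensorFlow checkpoint format.
model.save_weights("weights.h5")

# A model with the same topology can load the file layer by layer.
clone = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])
clone.load_weights("weights.h5")
```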
+
+        When saving in TensorFlow format, all objects referenced by the network
+        are saved in the same format as `tf.train.Checkpoint`, including any
+        `Layer` instances or `Optimizer` instances assigned to object
+        attributes. For networks constructed from inputs and outputs using
+        `tf.keras.Model(inputs, outputs)`, `Layer` instances used by the network
+        are tracked/saved automatically. For user-defined classes which inherit
+        from `tf.keras.Model`, `Layer` instances must be assigned to object
+        attributes, typically in the constructor. See the documentation of
+        `tf.train.Checkpoint` and `tf.keras.Model` for details.
+
+        While the formats are the same, do not mix `save_weights` and
+        `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should
+        be loaded using `Model.load_weights`. Checkpoints saved using
+        `tf.train.Checkpoint.save` should be restored using the corresponding
+        `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over
+        `save_weights` for training checkpoints.
+
+        The TensorFlow format matches objects and variables by starting at a
+        root object, `self` for `save_weights`, and greedily matching attribute
+        names. For `Model.save` this is the `Model`, and for `Checkpoint.save`
+        this is the `Checkpoint` even if the `Checkpoint` has a model attached.
+        This means saving a `tf.keras.Model` using `save_weights` and loading
+        into a `tf.train.Checkpoint` with a `Model` attached (or vice versa)
+        will not match the `Model`'s variables. See the
+        [guide to training checkpoints](
+        https://www.tensorflow.org/guide/checkpoint) for details on
+        the TensorFlow format.
+
+        Args:
+            filepath: String or PathLike, path to the file to save the weights
+                to. When saving in TensorFlow format, this is the prefix used
+                for checkpoint files (multiple files are generated). Note that
+                the '.h5' suffix causes weights to be saved in HDF5 format.
+            overwrite: Whether to silently overwrite any existing file at the
+                target location, or provide the user with a manual prompt.
+            save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
+                '.keras' will default to HDF5 if `save_format` is `None`.
+                Otherwise, `None` becomes 'tf'. Defaults to `None`.
+            options: Optional `tf.train.CheckpointOptions` object that specifies
+                options for saving weights.
+
+        Raises:
+            ImportError: If `h5py` is not available when attempting to save in
+                HDF5 format.
+        """
+        saving_api.save_weights(
+            self,
+            filepath=filepath,
+            overwrite=overwrite,
+            save_format=save_format,
+            options=options,
+        )
+
+    @traceback_utils.filter_traceback
+    def load_weights(
+        self, filepath, skip_mismatch=False, by_name=False, options=None
+    ):
+        """Loads all layer weights from a saved file.
+
+        The saved file could be a SavedModel file, a `.keras` file (v3 saving
+        format), or a file created via `model.save_weights()`.
+
+        By default, weights are loaded based on the network's
+        topology. This means the architecture should be the same as when the
+        weights were saved. Note that layers that don't have weights are not
+        taken into account in the topological ordering, so adding or removing
+        layers is fine as long as they don't have weights.
+
+        **Partial weight loading**
+
+        If you have modified your model, for instance by adding a new layer
+        (with weights) or by changing the shape of the weights of a layer,
+        you can choose to ignore errors and continue loading
+        by setting `skip_mismatch=True`. In this case any layer with
+        mismatching weights will be skipped. A warning will be displayed
+        for each skipped layer.
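As a hedged sketch of the partial-loading behavior just described (the layer names, shapes, and file name below are illustrative assumptions):

```python
import tensorflow as tf

base = tf.keras.Sequential(
    [tf.keras.layers.Dense(4, input_shape=(3,), name="feature")]
)
base.save_weights("base.h5")

# Same layer name but a different output width: with skip_mismatch=True
# the mismatching layer is skipped with a warning instead of raising.
wider = tf.keras.Sequential(
    [tf.keras.layers.Dense(8, input_shape=(3,), name="feature")]
)
wider.load_weights("base.h5", by_name=True, skip_mismatch=True)
```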
+
+        **Weight loading by name**
+
+        If your weights are saved as a `.h5` file created
+        via `model.save_weights()`, you can use the argument `by_name=True`.
+
+        In this case, weights are loaded into layers only if they share
+        the same name. This is useful for fine-tuning or transfer-learning
+        models where some of the layers have changed.
+
+        Note that only topological loading (`by_name=False`) is supported when
+        loading weights from the `.keras` v3 format or from the TensorFlow
+        SavedModel format.
+
+        Args:
+            filepath: String, path to the weights file to load. For weight files
+                in TensorFlow format, this is the file prefix (the same as was
+                passed to `save_weights()`). This can also be a path to a
+                SavedModel or a `.keras` file (v3 saving format) saved
+                via `model.save()`.
+            skip_mismatch: Boolean, whether to skip loading of layers where
+                there is a mismatch in the number of weights, or a mismatch in
+                the shape of the weights.
+            by_name: Boolean, whether to load weights by name or by topological
+                order. Only topological loading is supported for weight files in
+                the `.keras` v3 format or in the TensorFlow SavedModel format.
+            options: Optional `tf.train.CheckpointOptions` object that specifies
+                options for loading weights (only valid for a SavedModel file).
+        """
+        return saving_api.load_weights(
+            self,
+            filepath=filepath,
+            by_name=by_name,
+            skip_mismatch=skip_mismatch,
+            options=options,
+        )
+
+    def _updated_config(self):
+        """Util shared between different serialization methods.
+
+        Returns:
+            Model config with Keras version information added.
+        """
+        from keras import __version__ as keras_version
+
+        config = self.get_config()
+        model_config = {
+            "class_name": self.__class__.__name__,
+            "config": config,
+            "keras_version": keras_version,
+            "backend": backend.backend(),
+        }
+        return model_config
+
+    @generic_utils.default
+    def get_config(self):
+        """Returns the config of the `Model`.
+
+        Config is a Python dictionary (serializable) containing the
+        configuration of an object, which in this case is a `Model`. This allows
+        the `Model` to be reinstantiated later (without its trained weights)
+        from this configuration.
+
+        Note that `get_config()` does not guarantee to return a fresh copy of
+        the dict every time it is called. The callers should make a copy of the
+        returned dict if they want to modify it.
+
+        Developers of subclassed `Model` are advised to override this method,
+        and continue to update the dict from `super(MyModel, self).get_config()`
+        to provide the proper configuration of this `Model`. The default config
+        will return config dict for init parameters if they are basic types.
+        Raises `NotImplementedError` in cases where a custom
+        `get_config()` implementation is required for the subclassed model.
+
+        Returns:
+            Python dictionary containing the configuration of this `Model`.
+        """
+        # If the subclass doesn't implement `get_config()`, parse from init
+        # args; otherwise default to an empty dict.
+        if generic_utils.is_default(self.get_config):
+            try:
+                config = base_layer.Layer.get_config(self)
+            except NotImplementedError:
+                config = {}
+                logging.warning(
+                    "Model's `__init__()` arguments contain non-serializable "
+                    "objects. Please implement a `get_config()` method in the "
+                    "subclassed Model for proper saving and loading. "
+                    "Defaulting to empty config."
+                )
+        else:
+            config = {}
+        return config
+
+    @classmethod
+    def from_config(cls, config, custom_objects=None):
+        # `from_config` assumes `cls` is either `Functional` or a child
+        # class of `Functional`.
+        # In the case that `cls` is meant to behave like a child
+        # class of `Functional` but only inherits from the `Model` class, we
+        # have to call `cls(...)` instead of `Functional.from_config`.
+        from keras.engine import functional
+
+        with serialization.SharedObjectLoadingScope():
+            functional_config_keys = [
+                "name",
+                "layers",
+                "input_layers",
+                "output_layers",
+            ]
+            is_functional_config = all(
+                key in config for key in functional_config_keys
+            )
+            argspec = tf_inspect.getfullargspec(cls.__init__)
+            functional_init_args = tf_inspect.getfullargspec(
+                functional.Functional.__init__
+            ).args[1:]
+            revivable_as_functional = (
+                cls in {functional.Functional, Model}
+                or argspec.args[1:] == functional_init_args
+                or (argspec.varargs == "args" and argspec.varkw == "kwargs")
+            )
+            if is_functional_config and revivable_as_functional:
+                # Revive Functional model
+                # (but not Functional subclasses with a custom __init__)
+                inputs, outputs, layers = functional.reconstruct_from_config(
+                    config, custom_objects
+                )
+                model = cls(
+                    inputs=inputs, outputs=outputs, name=config.get("name")
+                )
+                functional.connect_ancillary_layers(model, layers)
-    Returns:
-        Scalar training loss
-        (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
+            else:
+                # Either the model has a custom __init__, or the config
+                # does not contain all the information necessary to
+                # revive a Functional model. This happens when the user creates
+                # subclassed models where `get_config()` is returning
+                # insufficient information to be considered a Functional model.
+                # In this case, we fall back to provide all config into the
+                # constructor of the class.
+                try:
+                    model = cls(**config)
+                except TypeError as e:
+                    raise TypeError(
+                        "Unable to revive model from config. When overriding "
+                        "the `get_config()` method, make sure that the "
+                        "returned config contains all items used as arguments "
+                        f"in the constructor to {cls}, "
+                        "which is the default behavior. "
+                        "You can override this default behavior by defining a "
+                        "`from_config(cls, config)` class method to specify "
+                        "how to create an "
+                        f"instance of {cls.__name__} from its config.\n\n"
+                        f"Received config={config}\n\n"
+                        f"Error encountered during deserialization: {e}"
+                    )
+            return model
+
+    def to_json(self, **kwargs):
+        """Returns a JSON string containing the network configuration.
+
+        To load a network from a JSON save file, use
+        `keras.models.model_from_json(json_string, custom_objects={})`.
+
+        Args:
+            **kwargs: Additional keyword arguments to be passed to
+                `json.dumps()`.
+
+        Returns:
+            A JSON string.
+        """
+        model_config = self._updated_config()
+        return json.dumps(
+            model_config, default=json_utils.get_json_type, **kwargs
+        )
+
+    def to_yaml(self, **kwargs):
+        """Returns a yaml string containing the network configuration.
+
+        Note: Since TF 2.6, this method is no longer supported and will raise a
+        RuntimeError.
+
+        To load a network from a yaml save file, use
+        `keras.models.model_from_yaml(yaml_string, custom_objects={})`.
+
+        `custom_objects` should be a dictionary mapping
+        the names of custom losses / layers / etc to the corresponding
+        functions / classes.
+
+        Args:
+            **kwargs: Additional keyword arguments
+                to be passed to `yaml.dump()`.
+
+        Returns:
+            A YAML string.
+ + Raises: + RuntimeError: raised unconditionally, as this method poses a + security risk. + """ + raise RuntimeError( + "Method `model.to_yaml()` has been removed due to security risk of " + "arbitrary code execution. Please use `model.to_json()` instead." + ) + + def reset_states(self): + for layer in self.layers: + if hasattr(layer, "reset_states") and getattr( + layer, "stateful", False + ): + layer.reset_states() + + @property + @doc_controls.do_not_generate_docs + def state_updates(self): + """Deprecated, do NOT use! + + Returns the `updates` from all layers that are stateful. + + This is useful for separating training updates and + state updates, e.g. when we need to update a layer's internal state + during prediction. + + Returns: + A list of update ops. + """ + warnings.warn( + "`Model.state_updates` will be removed in a future version. " + "This property should not be used in TensorFlow 2.0, " + "as `updates` are applied automatically.", + stacklevel=2, + ) + state_updates = [] + for layer in self.layers: + if getattr(layer, "stateful", False): + if hasattr(layer, "updates"): + state_updates += layer.updates + return state_updates + + @property + def weights(self): + """Returns the list of all layer variables/weights. + + Note: This will not track the weights of nested `tf.Modules` that are + not themselves Keras layers. + + Returns: + A list of variables. + """ + return self._dedup_weights(self._undeduplicated_weights) + + @property + def _undeduplicated_weights(self): + """Returns the undeduplicated list of all layer variables/weights.""" + self._assert_weights_created() + weights = [] + for layer in self._self_tracked_trackables: + weights += layer.variables + weights += self._trainable_weights + self._non_trainable_weights + return weights + + def summary( + self, + line_length=None, + positions=None, + print_fn=None, + expand_nested=False, + show_trainable=False, + layer_range=None, + ): + """Prints a string summary of the network. + + Args: + line_length: Total length of printed lines + (e.g. set this to adapt the display to different + terminal window sizes). + positions: Relative or absolute positions of log elements + in each line. If not provided, becomes + `[0.3, 0.6, 0.7, 1.]`. Defaults to `None`. + print_fn: Print function to use. By default, prints to `stdout`. + If `stdout` doesn't work in your environment, pass `print` + instead. It will be called on each line of the summary. + You can set it to a custom function + in order to capture the string summary. + expand_nested: Whether to expand the nested models. + Defaults to `False`. + show_trainable: Whether to show if a layer is trainable. + Defaults to `False`. + layer_range: a list or tuple of 2 strings, the starting layer name + and ending layer name (both inclusive), indicating the range of + layers to be printed in the summary. It also accepts regex + patterns instead of exact names. In that case, the start + predicate will be the first element that matches + `layer_range[0]` and the end predicate will be the last element + that matches `layer_range[1]`. By default (`None`), all layers + of the model are considered. + + Raises: + ValueError: if `summary()` is called before the model is built. + """ + if not self.built: + raise ValueError( + "This model has not yet been built. " + "Build the model first by calling `build()` or by calling " + "the model on a batch of data."
+ + ) + layer_utils.print_summary( + self, + line_length=line_length, + positions=positions, + print_fn=print_fn, + expand_nested=expand_nested, + show_trainable=show_trainable, + layer_range=layer_range, + ) + + @property + def layers(self): + return list(self._flatten_layers(include_self=False, recursive=False)) + + @layers.setter + def layers(self, _): + raise AttributeError( + "`Model.layers` attribute is reserved and should not be used. " + "Please use another name." + ) + + def get_layer(self, name=None, index=None): + """Retrieves a layer based on either its name (unique) or index. + + Provide only one of `name` or `index`; passing both raises a + `ValueError`. Indices are based on order of horizontal graph + traversal (bottom-up). + + Args: + name: String, name of layer. + index: Integer, index of layer. + + Returns: + A layer instance. + """ + # TODO(fchollet): We could build a dictionary based on layer names + # since they are constant, but we have not done that yet. + if index is not None and name is not None: + raise ValueError( + "Provide only a layer name or a layer index. Received: " + f"index={index}, name={name}." + ) + + if index is not None: + if len(self.layers) <= index: + raise ValueError( + f"Was asked to retrieve layer at index {index}" + f" but model only has {len(self.layers)}" + " layers." + ) + else: + return self.layers[index] + + if name is not None: + for layer in self.layers: + if layer.name == name: + return layer + raise ValueError( + f"No such layer: {name}. Existing layers are: " + f"{list(layer.name for layer in self.layers)}." + ) + raise ValueError( + "Provide either a layer name or layer index at `get_layer`." + ) + + def get_weight_paths(self): + """Retrieve all the variables and their paths for the model. + + The variable path (string) is a stable key to identify a `tf.Variable` + instance owned by the model. It can be used to specify variable-specific + configurations (e.g. DTensor, quantization) from a global view. + + This method returns a dict with weight object paths as keys + and the corresponding `tf.Variable` instances as values. + + Note that if the model is a subclassed model and the weights haven't + been initialized, an empty dict will be returned. + + Returns: + A dict where keys are variable paths and values are `tf.Variable` + instances.
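A minimal sketch of the `get_layer()` lookup rules documented above; the layer names here are hypothetical:

```python
import tensorflow as tf

model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(4, name="hidden"),
        tf.keras.layers.Dense(1, name="out"),
    ]
)

# Lookup by name or by index returns the same layer objects.
assert model.get_layer(name="hidden") is model.layers[0]
assert model.get_layer(index=1).name == "out"

# Passing both `name` and `index` raises a ValueError.
try:
    model.get_layer(name="hidden", index=1)
except ValueError:
    pass
```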
+ + Example: + + ```python + class SubclassModel(tf.keras.Model): + + def __init__(self, name=None): + super().__init__(name=name) + self.d1 = tf.keras.layers.Dense(10) + self.d2 = tf.keras.layers.Dense(20) + + def call(self, inputs): + x = self.d1(inputs) + return self.d2(x) + + model = SubclassModel() + model(tf.zeros((10, 10))) + weight_paths = model.get_weight_paths() + # weight_paths: + # { + # 'd1.kernel': model.d1.kernel, + # 'd1.bias': model.d1.bias, + # 'd2.kernel': model.d2.kernel, + # 'd2.bias': model.d2.bias, + # } + + # Functional model + inputs = tf.keras.Input((10,), batch_size=10) + x = tf.keras.layers.Dense(20, name='d1')(inputs) + output = tf.keras.layers.Dense(30, name='d2')(x) + model = tf.keras.Model(inputs, output) + d1 = model.layers[1] + d2 = model.layers[2] + weight_paths = model.get_weight_paths() + # weight_paths: + # { + # 'd1.kernel': d1.kernel, + # 'd1.bias': d1.bias, + # 'd2.kernel': d2.kernel, + # 'd2.bias': d2.bias, + # } + ``` + """ + result = {} + ( + descendants, + object_paths_dict, + ) = tf.__internal__.tracking.ObjectGraphView( + self + ).breadth_first_traversal() + for descendant in descendants: + if isinstance(descendant, tf.Variable): + trackable_references = object_paths_dict[descendant] + object_path = ".".join([t.name for t in trackable_references]) + result[object_path] = descendant + return result + + def get_compile_config(self): + """Returns a serialized config with information for compiling the model. + + This method returns a config dictionary containing all the information + (optimizer, loss, metrics, etc.) with which the model was compiled. + + Returns: + A dict containing information for compiling the model. + """ + if self._is_compiled and hasattr(self, "_compile_config"): + return self._compile_config.serialize() + + def compile_from_config(self, config): + """Compiles the model with the information given in config. + + This method uses the information in the config (optimizer, loss, + metrics, etc.) to compile the model. + + Args: + config: Dict containing information for compiling the model. + """ + has_overridden_compile = self.__class__.compile != Model.compile + if has_overridden_compile: + logging.warning( + "`compile()` was not called as part of model loading " + "because the model's `compile()` method is custom. " + "All subclassed Models that have `compile()` " + "overridden should also override " + "`get_compile_config()` and `compile_from_config(config)`. " + "Alternatively, you can " + "call `compile()` manually after loading." + ) + return + config = saving_lib.deserialize_keras_object(config) + self.compile(**config) + if hasattr(self, "optimizer") and self.built: + # Create optimizer variables. + self.optimizer.build(self.trainable_variables) + + def export(self, filepath): + """Create a SavedModel artifact for inference (e.g. via TF-Serving). + + This method lets you export a model to a lightweight SavedModel artifact + that contains the model's forward pass only (its `call()` method) + and can be served via e.g. TF-Serving. The forward pass is registered + under the name `serve()` (see example below). + + The original code of the model (including any custom layers you may + have used) is *no longer* necessary to reload the artifact -- it is + entirely standalone. + + Args: + filepath: `str` or `pathlib.Path` object. Path where to save + the artifact. + + Example: + + ```python + # Create the artifact + model.export("path/to/location") + + # Later, in a different process / environment... 
+ reloaded_artifact = tf.saved_model.load("path/to/location") + predictions = reloaded_artifact.serve(input_data) + ``` + + If you would like to customize your serving endpoints, you can + use the lower-level `keras.export.ExportArchive` class. The `export()` + method relies on `ExportArchive` internally. + """ + from keras.export import export_lib + + export_lib.export_model(self, filepath) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _set_save_spec(self, inputs, args=None, kwargs=None): + """Defines the save spec so that serialization can trace `call()`. + + The TensorSpecs of the call function `inputs`, `args`, and `kwargs` are + saved into a tuple of `([inputs] + args, kwargs)`. The input + `TensorSpec` names are updated to match the built `input_names`. + + The specs can be retrieved with the `save_spec` property. + + Args: + inputs: possibly nested inputs passed into the call function. + args: a list of positional arguments passed into call. + kwargs: a dictionary of keyword arguments passed into call. + """ + if self._saved_model_inputs_spec is not None: + return # Already set. + args = args or [] + kwargs = kwargs or {} + + input_names = self.input_names + if not input_names: + input_names = compile_utils.create_pseudo_input_names(inputs) + + flat_inputs = tf.nest.flatten(inputs) + inputs_spec = [] + for name, tensor in zip(input_names, flat_inputs): + inputs_spec.append( + tf_utils.get_tensor_spec(tensor, dynamic_batch=False, name=name) + ) + inputs_spec = tf.nest.pack_sequence_as(inputs, inputs_spec) + super()._set_save_spec(inputs_spec, args, kwargs) + + # Store the input shapes + if ( + self.__class__.__name__ == "Sequential" + and self._build_input_shape is None + ): + self._build_input_shape = tf.nest.map_structure( + lambda x: None if x is None else x.shape, inputs_spec + ) + + def save_spec(self, dynamic_batch=True): + """Returns the `tf.TensorSpec` of call args as a tuple `(args, kwargs)`. + + This value is automatically defined after calling the model for the + first time. Afterwards, you can use it when exporting the model for + serving: + + ```python + model = tf.keras.Model(...) + + @tf.function + def serve(*args, **kwargs): + outputs = model(*args, **kwargs) + # Apply postprocessing steps, or add additional outputs. + ... + return outputs + + # arg_specs is `[tf.TensorSpec(...), ...]`. kwarg_specs, in this + # example, is an empty dict since functional models do not use keyword + # arguments. + arg_specs, kwarg_specs = model.save_spec() + + model.save(path, signatures={ + 'serving_default': serve.get_concrete_function(*arg_specs, + **kwarg_specs) + }) + ``` + + Args: + dynamic_batch: Whether to set the batch sizes of all the returned + `tf.TensorSpec` to `None`. (Note that when defining functional or + Sequential models with `tf.keras.Input([...], batch_size=X)`, the + batch size will always be preserved). Defaults to `True`. + Returns: + If the model inputs are defined, returns a tuple `(args, kwargs)`. All + elements in `args` and `kwargs` are `tf.TensorSpec`. + If the model inputs are not defined, returns `None`. + The model inputs are automatically set when calling the model, + `model.fit`, `model.evaluate` or `model.predict`. + """ + return self._get_save_spec(dynamic_batch, inputs_only=False) + + def _assert_weights_created(self): + """Asserts that all the weights for the model have been created. + + For a non-dynamic model, the weights must already be created after the + layer has been called. 
For a dynamic model, the exact list of weights + can never be known for certain since it may change at any time during + execution. + + We run this check right before accessing weights or getting the Numpy + value for the current weights. Otherwise, if the layer has never been + called, the user would just get an empty list, which is misleading. + + Raises: + ValueError: if the weights of the network have not yet been created. + """ + if self.dynamic: + return + + if ( + "build" in self.__class__.__dict__ + and self.__class__ != Model + and not self.built + ): + # For any model that has customized build() method but hasn't been + # invoked yet, this will cover both sequential and subclass model. + # Also make sure to exclude Model class itself which has build() + # defined. + raise ValueError( + f"Weights for model '{self.name}' have not yet been " + "created. " + "Weights are created when the model is first called on " + "inputs or `build()` is called with an `input_shape`." + ) + + def _check_call_args(self, method_name): + """Check that `call()` has only one positional arg.""" + # Always allow first arg, regardless of arg name. + fullargspec = self._call_spec.full_argspec + if fullargspec.defaults: + positional_args = fullargspec.args[: -len(fullargspec.defaults)] + else: + positional_args = fullargspec.args + if "training" in positional_args: + positional_args.remove("training") + + # self and first arg can be positional. + if len(positional_args) > 2: + extra_args = positional_args[2:] + raise ValueError( + f"Models passed to `{method_name}` can only have `training` " + "and the first argument in `call()` as positional arguments, " + f"found: {extra_args}." + ) + + def _validate_compile(self, optimizer, metrics, **kwargs): + """Performs validation checks for the default `compile()`.""" + if any( + isinstance(opt, optimizer_v1.Optimizer) + for opt in tf.nest.flatten(optimizer) + ): + raise ValueError( + f"`tf.compat.v1.keras` Optimizer ({optimizer}) is " + "not supported when eager execution is enabled. Use a " + "`tf.keras` Optimizer instead, or disable eager " + "execution." + ) + + kwargs.pop("cloning", None) # Legacy DistStrat argument, never used. + kwargs.pop("experimental_run_tf_function", None) # Always `True`. + distribute_arg = kwargs.pop("distribute", None) + if distribute_arg is not None: + raise ValueError( + "`distribute` argument in compile is not available in TF 2.0. " + "Please create the model under the `strategy.scope()`. " + f"Received: {distribute_arg}." + ) + target_tensor_arg = kwargs.pop("target_tensors", None) + if target_tensor_arg is not None: + raise ValueError( + "`target_tensors` argument is not supported when executing " + f"eagerly. Received: {target_tensor_arg}." + ) + invalid_kwargs = set(kwargs) - {"sample_weight_mode"} + if invalid_kwargs: + raise TypeError( + "Invalid keyword argument(s) in `compile()`: " + f"{(invalid_kwargs,)}. Valid keyword arguments include " + '"cloning", "experimental_run_tf_function", "distribute",' + ' "target_tensors", or "sample_weight_mode".' + ) + + # Model must be created and compiled with the same DistStrat. + if self.built and tf.distribute.has_strategy(): + strategy = tf.distribute.get_strategy() + for v in self.variables: + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Variable ({v}) was not created in the distribution " + f"strategy scope of ({strategy}). It is most likely " + "because some layers, model, or optimizer was being " + "created outside the distribution strategy scope. 
Try " + "to make sure your code looks similar " + "to the following.\nwith strategy.scope():\n" + " model=_create_model()\n" + " model.compile(...)" + ) + + # Model metrics must be created in the same distribution strategy scope + # as the model. + strategy = self.distribute_strategy + for metric in tf.nest.flatten(metrics): + for v in getattr(metric, "variables", []): + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Metric ({metric}) passed to `model.compile` was " + "created inside a different distribution strategy " + "scope than the model. All metrics must be created " + "in the same distribution strategy " + f"scope as the model (in this case {strategy}). " + "If you pass in a string identifier for a metric to " + "compile, the metric will automatically be created " + "in the correct distribution strategy scope." + ) + + # Model metrics must be created in the same distribution strategy scope + # as the model. + for opt in tf.nest.flatten(optimizer): + for v in getattr(opt, "_weights", []): + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Optimizer ({optimizer}) passed to `model.compile` " + "was created inside a different distribution strategy " + "scope than the model. All optimizers must be created " + "in the same distribution strategy scope as the model " + f"(in this case {strategy}). If you pass in a string " + "identifier for an optimizer to compile, the optimizer " + "will automatically be created in the correct " + "distribution strategy scope." + ) + + def _maybe_load_initial_counters_from_ckpt( + self, steps_per_epoch, initial_epoch + ): + """Maybe load initial epoch from ckpt, considering worker recovery. + + Refer to tensorflow/python/keras/distribute/worker_training_state.py + for more information. + + Args: + steps_per_epoch: The number of step per epoch. + initial_epoch: The original initial_epoch user passes in `fit()`. + mode: The mode for running `model.fit()`. + + Returns: + If the training is recovering from previous failure under multi-worker + training setting, return the (epoch, step) the training is supposed to + continue at. Otherwise, return the `initial_epoch, initial_step` the + user passes in. + """ + initial_step = 0 + if self._training_state is not None: + return self._training_state.maybe_load_initial_counters_from_ckpt( + steps_per_epoch, initial_epoch, mode=ModeKeys.TRAIN + ) + return (initial_epoch, initial_step) + + def _assert_compile_was_called(self): + # Checks whether `compile` has been called. If it has been called, + # then the optimizer is set. This is different from whether the + # model is compiled + # (i.e. whether the model is built and its inputs/outputs are set). + if not self._is_compiled: + raise RuntimeError( + "You must compile your model before " + "training/testing. " + "Use `model.compile(optimizer, loss)`." + ) + + def _check_sample_weight_warning(self, x, sample_weight): + # Datasets can include sample weight, by returning a tuple with the + # structure of `(x, y, sample_weight)`. + sample_weight_present = sample_weight is not None or ( + isinstance(x, tf.data.Dataset) + and isinstance(x.element_spec, tuple) + and len(x.element_spec) == 3 + ) + + if ( + sample_weight_present + and self.compiled_metrics._user_weighted_metrics is None + ): + logging.warning( + "`evaluate()` received a value for `sample_weight`, but " + "`weighted_metrics` were not provided. Did you mean to pass " + "metrics to `weighted_metrics` in `compile()`? 
If this is " + "intentional you can pass `weighted_metrics=[]` to `compile()` " + "in order to silence this warning." + ) + + def _set_inputs(self, inputs, outputs=None, training=None): + """This method is for compat with Modelv1. Only inputs are needed + here.""" + self._set_save_spec(inputs) + + @property + def _trackable_saved_model_saver(self): + return model_serialization.ModelSavedModelSaver(self) + + def _trackable_children(self, save_type="checkpoint", **kwargs): + if save_type == "savedmodel": + # SavedModel needs to ignore the execution functions. + train_function = self.train_function + test_function = self.test_function + predict_function = self.predict_function + train_tf_function = self.train_tf_function + self.train_function = None + self.test_function = None + self.predict_function = None + self.train_tf_function = None + + children = super()._trackable_children(save_type, **kwargs) + + if save_type == "savedmodel": + self.train_function = train_function + self.test_function = test_function + self.predict_function = predict_function + self.train_tf_function = train_tf_function + + return children + + def _should_eval(self, epoch, validation_freq): + epoch = epoch + 1 # one-index the user-facing epoch. + if isinstance(validation_freq, int): + return epoch % validation_freq == 0 + elif isinstance(validation_freq, list): + return epoch in validation_freq + else: + raise ValueError( + "Expected `validation_freq` to be a list or int. " + f"Received: validation_freq={validation_freq} of the " + f"type {type(validation_freq)}." + ) + + ###################################################################### + # Functions below exist only as v1 / v2 compatibility shims. + ###################################################################### + + def _get_compile_args(self, user_metrics=True): + """Used for saving or cloning a Model. + + Args: + user_metrics: Whether to return user-supplied metrics or `Metric` + objects. If True, returns the user-supplied metrics. + Defaults to `True`. + + Returns: + Dictionary of arguments that were used when compiling the model. 
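As a standalone sketch of the `validation_freq` semantics implemented by `_should_eval()` above (the `should_eval` helper below is illustrative, not part of this diff):

```python
def should_eval(epoch, validation_freq):
    # Mirrors `Model._should_eval`: user-facing epochs are one-indexed.
    epoch = epoch + 1
    if isinstance(validation_freq, int):
        return epoch % validation_freq == 0
    return epoch in validation_freq  # assumes a list of epoch numbers

# Run validation every 2nd epoch, or only at epochs 1 and 5 (one-indexed).
assert [e for e in range(6) if should_eval(e, 2)] == [1, 3, 5]
assert [e for e in range(6) if should_eval(e, [1, 5])] == [0, 4]
```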
+ """ + self._assert_compile_was_called() + saved_metrics = self.compiled_metrics._user_metrics + saved_weighted_metrics = self.compiled_metrics._user_weighted_metrics + + if not user_metrics: + if saved_metrics is not None: + saved_metrics = self.compiled_metrics._metrics + if saved_weighted_metrics is not None: + saved_weighted_metrics = self.compiled_metrics._weighted_metrics + + compile_args = { + "optimizer": self.optimizer, + "loss": self.compiled_loss._user_losses, + "metrics": saved_metrics, + "weighted_metrics": saved_weighted_metrics, + "loss_weights": self.compiled_loss._user_loss_weights, + } + return compile_args + + def _get_callback_model(self): + return self + + def _in_multi_worker_mode(self): + return self.distribute_strategy.extended._in_multi_worker_mode() + + @property + def _compile_was_called(self): + return self._is_compiled + + def _save_experimental(self, filepath): + return saving_lib.save_model(self, filepath) + + +class _TestFunction: + def __init__(self, function, callbacks): + self._function = function + self._callbacks = callbacks + + def run_step(self, dataset_or_iterator, data_handler, step, unused_shards): + tmp_logs = self._function(dataset_or_iterator) + if data_handler.should_sync: + context.async_wait() + logs = tmp_logs + end_step = step + data_handler.step_increment + self._callbacks.on_test_batch_end(end_step, logs) + return logs - Raises: - RuntimeError: If `model.train_on_batch` is wrapped in a `tf.function`. - """ - self._assert_compile_was_called() - self._check_call_args('train_on_batch') - _disallow_inside_tf_function('train_on_batch') - if reset_metrics: - self.reset_metrics() - with self.distribute_strategy.scope(), \ - training_utils.RespectCompiledTrainableState(self): - iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, - y, sample_weight, - class_weight) - self.train_function = self.make_train_function() - logs = self.train_function(iterator) - - logs = tf_utils.sync_to_numpy_or_python_type(logs) - if return_dict: - return logs - else: - return flatten_metrics_in_order(logs, self.metrics_names) - def test_on_batch(self, - x, - y=None, - sample_weight=None, - reset_metrics=True, - return_dict=False): - """Test the model on a single batch of samples. +class _ExactTestFunction(_TestFunction): + def __init__(self, function, callbacks): + super().__init__(function, callbacks) + self._logs = [] + + def run_step(self, dataset_or_iterator, data_handler, step, shards): + tmp_logs = self._function( + dataset_or_iterator, + tf.constant(shards, dtype=tf.int64), + tf.constant(step, dtype=tf.int64), + ) + if data_handler.should_sync: + context.async_wait() + self._logs.append(tmp_logs) + return self._logs + + +def reduce_per_replica(values, strategy, reduction): + """Attempt to reduce the structure `values` to single values. + + Given `values` (a `tf.Tensor` or a `PerReplica` structure), + which represents the values across all the replicas, `reduce_per_replica` + attempts to "reduce" those values and returns the corresponding structure + that represents only single values. + + Currently, `reduce_per_replica` is only used for reducing the metric results + from `tf.distribute.Strategy.run()`. Depending on the underlying + `Strategy` implementation, `values` may be a `PerReplica` object, + which can be thought of as a collection of values across the replicas, + or a `tf.Tensor`, if the strategy has already conducted the reduction + for the downstream library. 
+ + There are five possible outcomes of reduction: + + 1) if `values` is a structure of simple `tf.Tensor`s, meaning that + reduction is not actually needed, `reduce_per_replica` returns the + structure as-is. + 2) else, if `reduction="auto"`, then the best reduction strategy is + chosen based on the current environment. This should only be used + for training cases (`fit()`). + 3) else, if `reduction="first"`, then `reduce_per_replica` + returns the values of the first replica. This is used in the case of + training and evaluation, where `values` is expected to hold the same + value across the replicas as a result of `Strategy`'s synchronization + across the replicas. + `reduce_per_replica` does not synchronize the values. + 4) else, if `reduction="sum"`, then `reduce_per_replica` returns the sum + of values for all replicas. This may be used in the custom training loop + case, where each replica contains different values which are not + synchronized. + 5) else, if `reduction="concat"`, then `reduce_per_replica` + returns the concatenation of the values across the replicas, along the + axis of dimension 0. This is used in the inference case (`predict()`). Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays (in case the - model has multiple inputs). - - A TensorFlow tensor, or a list of tensors (in case the model has - multiple inputs). - y: Target data. Like the input data `x`, it could be either Numpy - array(s) or TensorFlow tensor(s). It should be consistent with `x` - (you cannot have Numpy inputs and tensor targets, or inversely). - sample_weight: Optional array of the same length as x, containing - weights to apply to the model's loss for each sample. In the case of - temporal data, you can pass a 2D array with shape (samples, - sequence_length), to apply a different weight to every timestep of - every sample. - reset_metrics: If `True`, the metrics returned will be only for this - batch. If `False`, the metrics will be statefully accumulated across - batches. - return_dict: If `True`, loss and metric results are returned as a dict, - with each key being the name of the metric. If `False`, they are - returned as a list. + values: Structure of `PerReplica` objects or `tf.Tensor`s. `tf.Tensor`s + are returned as-is. + strategy: `tf.distribute.Strategy` object. + reduction: One of `"auto"`, `"first"`, `"concat"`, or `"sum"`. + `"auto"` will select `"first"` when used under a TPUStrategy, or + `"sum"` otherwise. Returns: - Scalar test loss (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. + Structure of `Tensor`s, representing the result of reduction. Raises: - RuntimeError: If `model.test_on_batch` is wrapped in a `tf.function`. + ValueError: if the reduction method is not supported.
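As a rough illustration of the three concrete reduction modes listed above, with plain tensors standing in for the local results of a `PerReplica` value (the real helper obtains these via `strategy.experimental_local_results()`):

```python
import tensorflow as tf

# Hypothetical per-replica results from two replicas.
replica_values = [tf.constant([1.0, 2.0]), tf.constant([3.0, 4.0])]

first = replica_values[0]                         # reduction="first"
summed = tf.add_n(replica_values)                 # reduction="sum" (elementwise here)
concatenated = tf.concat(replica_values, axis=0)  # reduction="concat"

print(first.numpy())         # [1. 2.]
print(summed.numpy())        # [4. 6.]
print(concatenated.numpy())  # [1. 2. 3. 4.]
```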
""" - self._assert_compile_was_called() - self._check_call_args('test_on_batch') - _disallow_inside_tf_function('test_on_batch') - if reset_metrics: - self.reset_metrics() - with self.distribute_strategy.scope(): - iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, - y, sample_weight) - self.test_function = self.make_test_function() - logs = self.test_function(iterator) - - logs = tf_utils.sync_to_numpy_or_python_type(logs) - if return_dict: - return logs - else: - return flatten_metrics_in_order(logs, self.metrics_names) - def predict_on_batch(self, x): - """Returns predictions for a single batch of samples. + if reduction == "auto": + reduction = "first" if backend.is_tpu_strategy(strategy) else "sum" + + def _reduce(v): + """Reduce a single `PerReplica` object.""" + if _collective_all_reduce_multi_worker(strategy): + if reduction == "concat": + return _multi_worker_concat(v, strategy) + elif reduction == "sum": + return strategy.reduce("SUM", v, axis=None) + + if _is_dtensor_per_replica_instance(v): + return _reduce_dtensor_per_replica(v, strategy, reduction) + elif not _is_per_replica_instance(v): + return v + elif reduction == "first": + return strategy.experimental_local_results(v)[0] + elif reduction == "concat": + if _is_tpu_multi_host(strategy): + return _tpu_multi_host_concat(v, strategy) + else: + return concat(strategy.experimental_local_results(v)) + elif reduction == "sum": + return tf.reduce_sum(strategy.experimental_local_results(v)) + else: + raise ValueError( + '`reduction` must be "first", "concat", "sum", or "auto". ' + f"Received: reduction={reduction}." + ) - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays (in case the - model has multiple inputs). - - A TensorFlow tensor, or a list of tensors (in case the model has - multiple inputs). + return tf.nest.map_structure(_reduce, values) - Returns: - Numpy array(s) of predictions. - Raises: - RuntimeError: If `model.predict_on_batch` is wrapped in a `tf.function`. - """ - self._check_call_args('predict_on_batch') - _disallow_inside_tf_function('predict_on_batch') - with self.distribute_strategy.scope(): - iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x) - self.predict_function = self.make_predict_function() - outputs = self.predict_function(iterator) - return tf_utils.sync_to_numpy_or_python_type(outputs) - - @doc_controls.do_not_generate_docs - def fit_generator(self, - generator, - steps_per_epoch=None, - epochs=1, - verbose=1, - callbacks=None, - validation_data=None, - validation_steps=None, - validation_freq=1, - class_weight=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - shuffle=True, - initial_epoch=0): - """Fits the model on data yielded batch-by-batch by a Python generator. - - DEPRECATED: - `Model.fit` now supports generators, so there is no longer any need to use - this endpoint. - """ - warnings.warn( - '`Model.fit_generator` is deprecated and ' - 'will be removed in a future version. 
' - 'Please use `Model.fit`, which supports generators.', - stacklevel=2) - return self.fit( - generator, - steps_per_epoch=steps_per_epoch, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - validation_data=validation_data, - validation_steps=validation_steps, - validation_freq=validation_freq, - class_weight=class_weight, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - shuffle=shuffle, - initial_epoch=initial_epoch) - - @doc_controls.do_not_generate_docs - def evaluate_generator(self, - generator, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - verbose=0): - """Evaluates the model on a data generator. - - DEPRECATED: - `Model.evaluate` now supports generators, so there is no longer any need - to use this endpoint. - """ - warnings.warn( - '`Model.evaluate_generator` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `Model.evaluate`, which supports generators.', - stacklevel=2) - self._check_call_args('evaluate_generator') - - return self.evaluate( - generator, - steps=steps, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - verbose=verbose, - callbacks=callbacks) - - @doc_controls.do_not_generate_docs - def predict_generator(self, - generator, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - verbose=0): - """Generates predictions for the input samples from a data generator. - - DEPRECATED: - `Model.predict` now supports generators, so there is no longer any need - to use this endpoint. - """ - warnings.warn( - '`Model.predict_generator` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `Model.predict`, which supports generators.', - stacklevel=2) - return self.predict( - generator, - steps=steps, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - verbose=verbose, - callbacks=callbacks) - - ###################################################################### - # Functions below are not training related. They are for model weights - # tracking, save/load, serialization, etc. - ###################################################################### - - @property - def trainable_weights(self): - self._assert_weights_created() - if not self._trainable: - return [] - trainable_variables = [] - for trackable_obj in self._self_tracked_trackables: - trainable_variables += trackable_obj.trainable_variables - trainable_variables += self._trainable_weights - return self._dedup_weights(trainable_variables) - - @property - def non_trainable_weights(self): - self._assert_weights_created() - non_trainable_variables = [] - for trackable_obj in self._self_tracked_trackables: - non_trainable_variables += trackable_obj.non_trainable_variables - - if not self._trainable: - # Return order is all trainable vars, then all non-trainable vars. 
- trainable_variables = [] - for trackable_obj in self._self_tracked_trackables: - trainable_variables += trackable_obj.trainable_variables - - non_trainable_variables = ( - trainable_variables + self._trainable_weights + - non_trainable_variables + self._non_trainable_weights) +def concat(tensors, axis=0): + """Concats `tensor`s along `axis`.""" + if isinstance(tensors[0], tf.SparseTensor): + return tf.sparse.concat(axis=axis, sp_inputs=tensors) + elif _is_scalar(tensors[0]): + return tf.stack(tensors, axis=axis) else: - non_trainable_variables = ( - non_trainable_variables + self._non_trainable_weights) - - return self._dedup_weights(non_trainable_variables) - - def get_weights(self): - """Retrieves the weights of the model. - - Returns: - A flat list of Numpy arrays. - """ - with self.distribute_strategy.scope(): - return super().get_weights() - - @traceback_utils.filter_traceback - def save(self, - filepath, - overwrite=True, - include_optimizer=True, - save_format=None, - signatures=None, - options=None, - save_traces=True): - # pylint: disable=line-too-long - """Saves the model to Tensorflow SavedModel or a single HDF5 file. - - Please see `tf.keras.models.save_model` or the - [Serialization and Saving guide](https://keras.io/guides/serialization_and_saving/) - for details. - - Args: - filepath: String, PathLike, path to SavedModel or H5 file to save the - model. - overwrite: Whether to silently overwrite any existing file at the - target location, or provide the user with a manual prompt. - include_optimizer: If True, save optimizer's state together. - save_format: Either `'tf'` or `'h5'`, indicating whether to save the - model to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, - and 'h5' in TF 1.X. - signatures: Signatures to save with the SavedModel. Applicable to the - 'tf' format only. Please see the `signatures` argument in - `tf.saved_model.save` for details. - options: (only applies to SavedModel format) - `tf.saved_model.SaveOptions` object that specifies options for - saving to SavedModel. - save_traces: (only applies to SavedModel format) When enabled, the - SavedModel will store the function traces for each layer. This - can be disabled, so that only the configs of each layer are stored. - Defaults to `True`. Disabling this will decrease serialization time - and reduce file size, but it requires that all custom layers/models - implement a `get_config()` method. + return tf.concat(tensors, axis=axis) - Example: - ```python - from keras.models import load_model - - model.save('my_model.h5') # creates a HDF5 file 'my_model.h5' - del model # deletes the existing model - - # returns a compiled model - # identical to the previous one - model = load_model('my_model.h5') - ``` - """ - # pylint: enable=line-too-long - save.save_model(self, filepath, overwrite, include_optimizer, save_format, - signatures, options, save_traces) - - @traceback_utils.filter_traceback - def save_weights(self, - filepath, - overwrite=True, - save_format=None, - options=None): - """Saves all layer weights. - - Either saves in HDF5 or in TensorFlow format based on the `save_format` - argument. - - When saving in HDF5 format, the weight file has: - - `layer_names` (attribute), a list of strings - (ordered names of model layers). - - For every layer, a `group` named `layer.name` - - For every such layer group, a group attribute `weight_names`, - a list of strings - (ordered names of weights tensor of the layer). 
- - For every weight in the layer, a dataset - storing the weight value, named after the weight tensor. - - When saving in TensorFlow format, all objects referenced by the network are - saved in the same format as `tf.train.Checkpoint`, including any `Layer` - instances or `Optimizer` instances assigned to object attributes. For - networks constructed from inputs and outputs using `tf.keras.Model(inputs, - outputs)`, `Layer` instances used by the network are tracked/saved - automatically. For user-defined classes which inherit from `tf.keras.Model`, - `Layer` instances must be assigned to object attributes, typically in the - constructor. See the documentation of `tf.train.Checkpoint` and - `tf.keras.Model` for details. - - While the formats are the same, do not mix `save_weights` and - `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should be - loaded using `Model.load_weights`. Checkpoints saved using - `tf.train.Checkpoint.save` should be restored using the corresponding - `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over - `save_weights` for training checkpoints. - - The TensorFlow format matches objects and variables by starting at a root - object, `self` for `save_weights`, and greedily matching attribute - names. For `Model.save` this is the `Model`, and for `Checkpoint.save` this - is the `Checkpoint` even if the `Checkpoint` has a model attached. This - means saving a `tf.keras.Model` using `save_weights` and loading into a - `tf.train.Checkpoint` with a `Model` attached (or vice versa) will not match - the `Model`'s variables. See the - [guide to training checkpoints](https://www.tensorflow.org/guide/checkpoint) - for details on the TensorFlow format. - - Args: - filepath: String or PathLike, path to the file to save the weights to. - When saving in TensorFlow format, this is the prefix used for - checkpoint files (multiple files are generated). Note that the '.h5' - suffix causes weights to be saved in HDF5 format. - overwrite: Whether to silently overwrite any existing file at the - target location, or provide the user with a manual prompt. - save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or - '.keras' will default to HDF5 if `save_format` is `None`. Otherwise - `None` defaults to 'tf'. - options: Optional `tf.train.CheckpointOptions` object that specifies - options for saving weights. - - Raises: - ImportError: If `h5py` is not available when attempting to save in HDF5 - format. - """ - self._assert_weights_created() - filepath = io_utils.path_to_string(filepath) - filepath_is_h5 = saving_utils.is_hdf5_filepath(filepath) - if save_format is None: - if filepath_is_h5: - save_format = 'h5' - else: - save_format = 'tf' - else: - user_format = save_format.lower().strip() - if user_format in ('tensorflow', 'tf'): - save_format = 'tf' - elif user_format in ('hdf5', 'h5', 'keras'): - save_format = 'h5' - else: - raise ValueError( - f'Unknown format. Received: `save_format`={save_format}. Was ' - 'expecting one of {"tf", "h5"}.') - if save_format == 'tf' and filepath_is_h5: - raise ValueError( - 'save_weights got save_format="tf"/"tensorflow", but the ' - f'filepath ({filepath}) looks like an HDF5 file. ' - 'Omit the ".h5"/".keras" when saving in TensorFlow format.') - - if save_format == 'h5' and h5py is None: - raise ImportError( - '`save_weights` requires h5py when saving in hdf5, but h5py is not ' - 'available. 
Try installing h5py package.') - if save_format == 'tf': - check_filepath = filepath + '.index' - else: - check_filepath = filepath - # If file exists and should not be overwritten: - if not overwrite and os.path.isfile(check_filepath): - proceed = io_utils.ask_to_proceed_with_overwrite(check_filepath) - if not proceed: - return - if save_format == 'h5': - with h5py.File(filepath, 'w') as f: - hdf5_format.save_weights_to_hdf5_group(f, self) - else: - if not tf.executing_eagerly(): - # Call `get_session` to initialize any uninitialized variables. - backend.get_session() - self._checkpoint.write(filepath, options=options) - - # Record this checkpoint so it's visible from tf.train.latest_checkpoint. - tf.__internal__.train.update_checkpoint_state( - save_dir=os.path.dirname(filepath), - model_checkpoint_path=filepath, - save_relative_paths=True, - all_model_checkpoint_paths=[filepath]) - - @traceback_utils.filter_traceback - def load_weights(self, - filepath, - by_name=False, - skip_mismatch=False, - options=None): - """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. - - If `by_name` is False weights are loaded based on the network's - topology. This means the architecture should be the same as when the weights - were saved. Note that layers that don't have weights are not taken into - account in the topological ordering, so adding or removing layers is fine as - long as they don't have weights. - - If `by_name` is True, weights are loaded into layers only if they share the - same name. This is useful for fine-tuning or transfer-learning models where - some of the layers have changed. - - Only topological loading (`by_name=False`) is supported when loading weights - from the TensorFlow format. Note that topological loading differs slightly - between TensorFlow and HDF5 formats for user-defined classes inheriting from - `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the - TensorFlow format loads based on the object-local names of attributes to - which layers are assigned in the `Model`'s constructor. +def potentially_ragged_concat(tensors): + """Concats `Tensor`s along their first dimension. Args: - filepath: String, path to the weights file to load. For weight files in - TensorFlow format, this is the file prefix (the same as was passed - to `save_weights`). This can also be a path to a SavedModel - saved from `model.save`. - by_name: Boolean, whether to load weights by name or by topological - order. Only topological loading is supported for weight files in - TensorFlow format. - skip_mismatch: Boolean, whether to skip loading of layers where there is - a mismatch in the number of weights, or a mismatch in the shape of - the weight (only valid when `by_name=True`). - options: Optional `tf.train.CheckpointOptions` object that specifies - options for loading weights. + tensors: List of `Tensor`s. Returns: - When loading a weight file in TensorFlow format, returns the same status - object as `tf.train.Checkpoint.restore`. When graph building, restore - ops are run automatically as soon as the network is built (on first call - for user-defined classes inheriting from `Model`, immediately if it is - already built). - - When loading weights in HDF5 format, returns `None`. - - Raises: - ImportError: If `h5py` is not available and the weight file is in HDF5 - format. - ValueError: If `skip_mismatch` is set to `True` when `by_name` is - `False`. 
+ Concatenation of the inputs along the first dimension -- of type `Tensor` + if all input shapes are compatible, or `RaggedTensor` if not. """ - if backend.is_tpu_strategy(self._distribution_strategy): - if (self._distribution_strategy.extended.steps_per_run > 1 and - (not saving_utils.is_hdf5_filepath(filepath))): - spr = self._distribution_strategy.extended.steps_per_run - raise ValueError('Load weights is not implemented with TPUStrategy ' - 'with `steps_per_run` greater than 1. The ' - f'`steps_per_run` is {spr}') - if skip_mismatch and not by_name: - raise ValueError( - 'When calling model.load_weights, skip_mismatch can only be set to ' - 'True when by_name is True.') - - filepath, save_format = _detect_save_format(filepath) - if save_format == 'tf': - status = self._checkpoint.read(filepath, options) - if by_name: - raise NotImplementedError( - 'Weights may only be loaded based on topology into Models when ' - 'loading TensorFlow-formatted weights (got by_name=True to ' - 'load_weights).') - if not tf.executing_eagerly(): - session = backend.get_session() - # Restore existing variables (if any) immediately, and set up a - # streaming restore for any variables created in the future. - tf.__internal__.tracking.streaming_restore(status=status, - session=session) - status.assert_nontrivial_match() - else: - status = None - if h5py is None: - raise ImportError( - '`load_weights` requires h5py package when loading weights from ' - 'HDF5. Try installing h5py.') - if not self._is_graph_network and not self.built: - raise ValueError( - 'Unable to load weights saved in HDF5 format into a subclassed ' - 'Model which has not created its variables yet. Call the Model ' - 'first, then load the weights.') - self._assert_weights_created() - with h5py.File(filepath, 'r') as f: - if 'layer_names' not in f.attrs and 'model_weights' in f: - f = f['model_weights'] - if by_name: - hdf5_format.load_weights_from_hdf5_group_by_name( - f, self, skip_mismatch) + if len(tensors) == 1: + return tensors[0] + if isinstance(tensors[0], tf.SparseTensor): + return tf.sparse.concat(axis=0, sp_inputs=tensors) + elif isinstance(tensors[0], tf.RaggedTensor): + return tf.concat(tensors, axis=0) + elif not tf.__internal__.tf2.enabled(): + return tf.concat(tensors, axis=0) + + non_batch_shapes = tf.stack([tf.shape(tensor)[1:] for tensor in tensors]) + constant_dims = tf.math.reduce_all( + non_batch_shapes == non_batch_shapes[:1], axis=0 + ) + if tf.math.reduce_all(constant_dims).numpy().item(): + # All non-batch dims are constant + if _is_scalar(tensors[0]): + return tf.stack(tensors, axis=0) else: - hdf5_format.load_weights_from_hdf5_group(f, self) - - # Perform any layer defined finalization of the layer state. - for layer in self.layers: - layer.finalize_state() - return status + return tf.concat(tensors, axis=0) - def _updated_config(self): - """Util shared between different serialization methods. - - Returns: - Model config with Keras version information added. - """ - from keras import __version__ as keras_version # pylint: disable=g-import-not-at-top - - config = self.get_config() - model_config = { - 'class_name': self.__class__.__name__, - 'config': config, - 'keras_version': keras_version, - 'backend': backend.backend() - } - return model_config - - def get_config(self): - """Returns the config of the `Model`. - - Config is a Python dictionary (serializable) containing the configuration of - an object, which in this case is a `Model`. 
This allows the `Model` to be - be reinstantiated later (without its trained weights) from this - configuration. - - Note that `get_config()` does not guarantee to return a fresh copy of dict - every time it is called. The callers should make a copy of the returned dict - if they want to modify it. - - Developers of subclassed `Model` are advised to override this method, and - continue to update the dict from `super(MyModel, self).get_config()` - to provide the proper configuration of this `Model`. The default config - is an empty dict. Optionally, raise `NotImplementedError` to allow Keras to - attempt a default serialization. - - Returns: - Python dictionary containing the configuration of this `Model`. - """ - - # Return an empty dict here because otherwise subclass model developers may - # see their model's `__init__()` be fed with unexpected keyword argument, if - # their `__init__()` takes no argument for example, and they don't override - # `from_config()`, which would use `cls(**config)` as a result. - config = {} - - if saving_lib._ENABLED: # pylint: disable=protected-access - if self.optimizer: - config['optimizer'] = saving_lib.serialize_keras_object(self.optimizer) - if self.compiled_loss: - config['loss'] = saving_lib.serialize_keras_object(self.compiled_loss) - if self.built: - config['input_shape'] = self._build_input_shape - - return config - - @classmethod - def from_config(cls, config, custom_objects=None): - # `from_config` assumes `cls` is either `Functional` or a child class of - # `Functional`. In the case that `cls` is meant to behave like a child class - # of `Functional` but only inherits from the `Model` class, we have to call - # `cls(...)` instead of `Functional.from_config`. - from keras.engine import functional # pylint: disable=g-import-not-at-top - with generic_utils.SharedObjectLoadingScope(): - functional_model_keys = [ - 'name', 'layers', 'input_layers', 'output_layers' - ] - if all(key in config for key in functional_model_keys): - inputs, outputs, layers = functional.reconstruct_from_config( - config, custom_objects) - model = cls(inputs=inputs, outputs=outputs, name=config.get('name')) - functional.connect_ancillary_layers(model, layers) - return model - - # The config does not contain all the information necessary to revive a - # Functional model. This happens when the user creates subclassed models - # where `get_config()` is returning insufficient information to be - # considered a Functional model. In this case, we fall back to provide - # all config into the constructor of the class. - optimizer, loss = None, None - - optimizer_dict = config.pop('optimizer', {}) - if optimizer_dict: - optimizer = saving_lib.deserialize_keras_object(optimizer_dict) - - loss_dict = config.pop('loss', {}) - if loss_dict: - loss = saving_lib.deserialize_keras_object(loss_dict) - - input_shape = config.pop('input_shape', {}) - - try: - model = cls(**config) - except TypeError as e: - raise TypeError('Unable to revive model from config. When overriding ' - 'the `get_config()`, make sure that the returned ' - 'config contains all items used as arguments in the ' - f'constructor to {cls}, which is the default behavior. ' - 'You can override this default behavior by defining a ' - '`from_config` method to specify how to create an ' - f'instance of {cls.__name__} from the config. 
\n\n' - f'Error encountered during deserialization:\n{e}') - - if saving_lib._ENABLED: # pylint: disable=protected-access - - if optimizer or loss: - model.compile(optimizer=optimizer, loss=loss) - - if input_shape: - model.build(input_shape) - - return model - - def to_json(self, **kwargs): - """Returns a JSON string containing the network configuration. - - To load a network from a JSON save file, use - `keras.models.model_from_json(json_string, custom_objects={})`. - - Args: - **kwargs: Additional keyword arguments to be passed to `json.dumps()`. - - Returns: - A JSON string. - """ - model_config = self._updated_config() - return json.dumps( - model_config, default=json_utils.get_json_type, **kwargs) - - def to_yaml(self, **kwargs): - """Returns a yaml string containing the network configuration. - - Note: Since TF 2.6, this method is no longer supported and will raise a - RuntimeError. - - To load a network from a yaml save file, use - `keras.models.model_from_yaml(yaml_string, custom_objects={})`. - - `custom_objects` should be a dictionary mapping - the names of custom losses / layers / etc to the corresponding - functions / classes. - - Args: - **kwargs: Additional keyword arguments - to be passed to `yaml.dump()`. - - Returns: - A YAML string. - - Raises: - RuntimeError: announces that the method poses a security risk - """ - raise RuntimeError( - 'Method `model.to_yaml()` has been removed due to security risk of ' - 'arbitrary code execution. Please use `model.to_json()` instead.' + # First, identify constant inner dimensions by finding the + # rightmost dimension that is not constant + constant_inner_dimensions = ( + constant_dims.numpy().tolist()[::-1].index(False) ) - - def reset_states(self): - for layer in self.layers: - if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False): - layer.reset_states() - - @property - @doc_controls.do_not_generate_docs - def state_updates(self): - """Deprecated, do NOT use! - - Returns the `updates` from all layers that are stateful. - - This is useful for separating training updates and - state updates, e.g. when we need to update a layer's internal state - during prediction. - - Returns: - A list of update ops. - """ - warnings.warn( - '`Model.state_updates` will be removed in a future version. ' - 'This property should not be used in TensorFlow 2.0, ' - 'as `updates` are applied automatically.', - stacklevel=2) - state_updates = [] - for layer in self.layers: - if getattr(layer, 'stateful', False): - if hasattr(layer, 'updates'): - state_updates += layer.updates - return state_updates - - @property - def weights(self): - """Returns the list of all layer variables/weights. - - Note: This will not track the weights of nested `tf.Modules` that are not - themselves Keras layers. - - Returns: - A list of variables. - """ - return self._dedup_weights(self._undeduplicated_weights) - - @property - def _undeduplicated_weights(self): - """Returns the undeduplicated list of all layer variables/weights.""" - self._assert_weights_created() - weights = [] - for layer in self._self_tracked_trackables: - weights += layer.variables - weights += (self._trainable_weights + self._non_trainable_weights) - return weights - - def summary(self, - line_length=None, - positions=None, - print_fn=None, - expand_nested=False, - show_trainable=False): - """Prints a string summary of the network. - - Args: - line_length: Total length of printed lines - (e.g. set this to adapt the display to different - terminal window sizes). 
- positions: Relative or absolute positions of log elements - in each line. If not provided, - defaults to `[.33, .55, .67, 1.]`. - print_fn: Print function to use. Defaults to `print`. - It will be called on each line of the summary. - You can set it to a custom function - in order to capture the string summary. - expand_nested: Whether to expand the nested models. - If not provided, defaults to `False`. - show_trainable: Whether to show if a layer is trainable. - If not provided, defaults to `False`. - - Raises: - ValueError: if `summary()` is called before the model is built. - """ - if not self.built: - raise ValueError( - 'This model has not yet been built. ' - 'Build the model first by calling `build()` or by calling ' - 'the model on a batch of data.') - layer_utils.print_summary( - self, - line_length=line_length, - positions=positions, - print_fn=print_fn, - expand_nested=expand_nested, - show_trainable=show_trainable) - - @property - def layers(self): - return list(self._flatten_layers(include_self=False, recursive=False)) - - @layers.setter - def layers(self, _): - raise AttributeError( - '`Model.layers` attribute is reserved and should not be used. ' - 'Please use another name.') - - def get_layer(self, name=None, index=None): - """Retrieves a layer based on either its name (unique) or index. - - If `name` and `index` are both provided, `index` will take precedence. - Indices are based on order of horizontal graph traversal (bottom-up). - - Args: - name: String, name of layer. - index: Integer, index of layer. - - Returns: - A layer instance. - """ - # TODO(fchollet): We could build a dictionary based on layer names - # since they are constant, but we have not done that yet. - if index is not None and name is not None: - raise ValueError('Provide only a layer name or a layer index. Received: ' - f'index={index}, name={name}.') - - if index is not None: - if len(self.layers) <= index: - raise ValueError(f'Was asked to retrieve layer at index {index}' - f' but model only has {len(self.layers)}' - ' layers.') - else: - return self.layers[index] - - if name is not None: - for layer in self.layers: - if layer.name == name: - return layer - raise ValueError(f'No such layer: {name}. Existing layers are: ' - f'{list(layer.name for layer in self.layers)}.') - raise ValueError('Provide either a layer name or layer index at ' - '`get_layer`.') - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _set_save_spec(self, inputs, args=None, kwargs=None): - """Defines the save spec so that serialization is able to trace model call. - - The TensorSpecs of the call function `inputs`, `args`, and `kwargs` are - saved into a tuple of `([inputs] + args, kwargs)`. The input `TensorSpec` - names are updated to match the built `input_names`. - - The specs can be retrieved with the `save_spec` property. - - Args: - inputs: possibly nested inputs passed into the call function. - args: a list of positional arguments passed into call. - kwargs: a dictionary of keyword arguments passed into call. - """ - if self._saved_model_inputs_spec is not None: - return # Already set. 
- args = args or [] - kwargs = kwargs or {} - - input_names = self.input_names - if not input_names: - input_names = compile_utils.create_pseudo_input_names(inputs) - - flat_inputs = tf.nest.flatten(inputs) - inputs_spec = [] - for name, tensor in zip(input_names, flat_inputs): - inputs_spec.append( - tf_utils.get_tensor_spec(tensor, dynamic_batch=False, name=name)) - inputs_spec = tf.nest.pack_sequence_as(inputs, inputs_spec) - super()._set_save_spec(inputs_spec, args, kwargs) - - # Store the input shapes - if (self.__class__.__name__ == 'Sequential' and - self._build_input_shape is None): - self._build_input_shape = tf.nest.map_structure( - lambda x: None if x is None else x.shape, inputs_spec) - - def save_spec(self, dynamic_batch=True): - """Returns the `tf.TensorSpec` of call inputs as a tuple `(args, kwargs)`. - - This value is automatically defined after calling the model for the first - time. Afterwards, you can use it when exporting the model for serving: - - ```python - model = tf.keras.Model(...) - - @tf.function - def serve(*args, **kwargs): - outputs = model(*args, **kwargs) - # Apply postprocessing steps, or add additional outputs. - ... - return outputs - - # arg_specs is `[tf.TensorSpec(...), ...]`. kwarg_specs, in this example, is - # an empty dict since functional models do not use keyword arguments. - arg_specs, kwarg_specs = model.save_spec() - - model.save(path, signatures={ - 'serving_default': serve.get_concrete_function(*arg_specs, **kwarg_specs) - }) - ``` - - Args: - dynamic_batch: Whether to set the batch sizes of all the returned - `tf.TensorSpec` to `None`. (Note that when defining functional or - Sequential models with `tf.keras.Input([...], batch_size=X)`, the - batch size will always be preserved). Defaults to `True`. - Returns: - If the model inputs are defined, returns a tuple `(args, kwargs)`. All - elements in `args` and `kwargs` are `tf.TensorSpec`. - If the model inputs are not defined, returns `None`. - The model inputs are automatically set when calling the model, - `model.fit`, `model.evaluate` or `model.predict`. - """ - return self._get_save_spec(dynamic_batch, inputs_only=False) - - def _assert_weights_created(self): - """Asserts that all the weights for the model have been created. - - For a non-dynamic model, the weights must already be created after the - layer has been called. For a dynamic model, the exact list of weights can - never be known for certain since it may change at any time during execution. - - We run this check right before accessing weights or getting the Numpy value - for the current weights. Otherwise, if the layer has never been called, - the user would just get an empty list, which is misleading. - - Raises: - ValueError: if the weights of the network have not yet been created. - """ - if self.dynamic: - return - - if ('build' in self.__class__.__dict__ and - self.__class__ != Model and - not self.built): - # For any model that has customized build() method but hasn't - # been invoked yet, this will cover both sequential and subclass model. - # Also make sure to exclude Model class itself which has build() defined. - raise ValueError(f'Weights for model {self.name} have not yet been ' - 'created. ' - 'Weights are created when the Model is first called on ' - 'inputs or `build()` is called with an `input_shape`.') - - def _check_call_args(self, method_name): - """Check that `call()` has only one positional arg.""" - # Always allow first arg, regardless of arg name. 
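A hedged sketch (not part of the diff) of the invariant `_assert_weights_created` checks: a subclassed model with a custom `build()` exposes no weights until it is built. The class name and shapes below are illustrative.

```python
from tensorflow import keras

class TinyModel(keras.Model):
    def build(self, input_shape):
        self.dense = keras.layers.Dense(1)
        self.dense.build(input_shape)  # create the kernel/bias variables
        super().build(input_shape)

    def call(self, x):
        return self.dense(x)

model = TinyModel()
# Reading `model.weights` here would raise a ValueError: the custom
# build() has not run yet, so no variables exist.
model.build(input_shape=(None, 3))
print(len(model.weights))  # 2: kernel and bias
```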
- fullargspec = self._call_spec.full_argspec - if fullargspec.defaults: - positional_args = fullargspec.args[:-len(fullargspec.defaults)] - else: - positional_args = fullargspec.args - if 'training' in positional_args: - positional_args.remove('training') - - # self and first arg can be positional. - if len(positional_args) > 2: - extra_args = positional_args[2:] - raise ValueError( - f'Models passed to `{method_name}` can only have `training` ' - 'and the first argument in `call()` as positional arguments, ' - f'found: {extra_args}.') - - def _validate_compile(self, optimizer, metrics, **kwargs): - """Performs validation checks for the default `compile()`.""" - if any( - isinstance(opt, optimizer_v1.Optimizer) - for opt in tf.nest.flatten(optimizer)): - raise ValueError( - f'`tf.compat.v1.keras` Optimizer ({optimizer}) is ' - 'not supported when eager execution is enabled. Use a ' - '`tf.keras` Optimizer instead, or disable eager ' - 'execution.') - - kwargs.pop('cloning', None) # Legacy DistStrat argument, never used. - kwargs.pop('experimental_run_tf_function', None) # Always `True`. - distribute_arg = kwargs.pop('distribute', None) - if distribute_arg is not None: - raise ValueError( - '`distribute` argument in compile is not available in TF 2.0. Please ' - 'create the model under the `strategy.scope()`. Received: ' - f'{distribute_arg}.') - target_tensor_arg = kwargs.pop('target_tensors', None) - if target_tensor_arg is not None: - raise ValueError( - '`target_tensors` argument is not supported when executing eagerly. ' - f'Received: {target_tensor_arg}.') - invalid_kwargs = set(kwargs) - {'sample_weight_mode'} - if invalid_kwargs: - raise TypeError('Invalid keyword argument(s) in `compile()`: ' - f'{(invalid_kwargs,)}. Valid keyword arguments include ' - '"cloning", "experimental_run_tf_function", "distribute",' - ' "target_tensors", or "sample_weight_mode".') - - # Model must be created and compiled with the same DistStrat. - if self.built and tf.distribute.has_strategy(): - strategy = tf.distribute.get_strategy() - for v in self.variables: - if not strategy.extended.variable_created_in_scope(v): - raise ValueError( - f'Variable ({v}) was not created in the distribution strategy ' - f'scope of ({strategy}). It is most likely because some ' - 'layers, model, or optimizer was being created outside the ' - 'distribution strategy scope. Try to make sure your code looks ' - 'similar to the following.\n' - 'with strategy.scope():\n' - ' model=_create_model()\n' - ' model.compile(...)') - - # Model metrics must be created in the same distribution strategy scope - # as the model. - strategy = self.distribute_strategy - for metric in tf.nest.flatten(metrics): - for v in getattr(metric, 'variables', []): - if not strategy.extended.variable_created_in_scope(v): - raise ValueError( - f'Metric ({metric}) passed to `model.compile` was created inside ' - 'a different distribution strategy scope than the model. All ' - 'metrics must be created in the same distribution strategy ' - f'scope as the model (in this case {strategy}). If you pass in a ' - 'string identifier for a metric to compile, the metric will ' - 'automatically be created in the correct distribution ' - 'strategy scope.' - ) - - # Model metrics must be created in the same distribution strategy scope - # as the model. 
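The error messages above prescribe a usage pattern; a minimal sketch of it (illustrative model, not from the diff):

```python
import tensorflow as tf
from tensorflow import keras

strategy = tf.distribute.MirroredStrategy()

# The model, its optimizer, and any metric variables must all be
# created inside the same strategy scope, or compile() raises.
with strategy.scope():
    model = keras.Sequential([keras.layers.Dense(1, input_shape=(4,))])
    model.compile(
        optimizer="sgd",
        loss="mse",
        metrics=[keras.metrics.MeanAbsoluteError()],
    )
```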
- for opt in tf.nest.flatten(optimizer): - for v in getattr(opt, '_weights', []): - if not strategy.extended.variable_created_in_scope(v): - raise ValueError( - f'Optimizer ({optimizer}) passed to `model.compile` was created ' - 'inside a different distribution strategy scope than the model. ' - 'All optimizers must be created in the same distribution ' - f'strategy scope as the model (in this case {strategy}). If you ' - 'pass in a string identifier for an optimizer to compile, the ' - 'optimizer will automatically be created in the correct ' - 'distribution strategy scope.' - ) - - def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch): - """Maybe load initial epoch from ckpt considering possible worker recovery. - - Refer to tensorflow/python/keras/distribute/worker_training_state.py - for more information. - - Args: - initial_epoch: The original initial_epoch user passes in in `fit()`. - - Returns: - If the training is recovering from previous failure under multi-worker - training setting, return the epoch the training is supposed to continue - at. Otherwise, return the `initial_epoch` the user passes in. - """ - if self._training_state is not None: - return self._training_state.maybe_load_initial_epoch_from_ckpt( - initial_epoch, mode=ModeKeys.TRAIN) - - return initial_epoch - - def _maybe_load_initial_step_from_ckpt(self): - if getattr(self, '_callback_step', 0) > 0: - return self._callback_step.numpy() + 1 - - return 0 - - def _assert_compile_was_called(self): - # Checks whether `compile` has been called. If it has been called, - # then the optimizer is set. This is different from whether the - # model is compiled - # (i.e. whether the model is built and its inputs/outputs are set). - if not self._is_compiled: - raise RuntimeError('You must compile your model before ' - 'training/testing. ' - 'Use `model.compile(optimizer, loss)`.') - - def _check_sample_weight_warning(self, x, sample_weight): - # Datasets can include sample weight, by returning a tuple with the - # structure of `(x, y, sample_weight)`. - sample_weight_present = sample_weight is not None or ( - isinstance(x, tf.data.Dataset) and isinstance(x.element_spec, tuple) and - len(x.element_spec) == 3) - - # pylint: disable=protected-access - if (sample_weight_present and - self.compiled_metrics._user_weighted_metrics is None): - logging.warning( - '`evaluate()` received a value for `sample_weight`, but ' - '`weighted_metrics` were not provided. Did you mean to pass metrics ' - 'to `weighted_metrics` in `compile()`? If this is intentional ' - 'you can pass `weighted_metrics=[]` to `compile()` in order to ' - 'silence this warning.') - - def _set_inputs(self, inputs, outputs=None, training=None): - """This method is for compat with Modelv1. Only inputs are needed here.""" - self._set_save_spec(inputs) - - @property - def _trackable_saved_model_saver(self): - return model_serialization.ModelSavedModelSaver(self) - - def _trackable_children(self, save_type='checkpoint', **kwargs): - if save_type == 'savedmodel': - # SavedModel needs to ignore the execution functions. 
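A short sketch (not part of the diff) of the opt-out that `_check_sample_weight_warning` suggests; the data shapes are arbitrary.

```python
import numpy as np
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(4,))])
# An explicit empty `weighted_metrics` list silences the warning that
# is otherwise logged when `sample_weight` is supplied.
model.compile(optimizer="sgd", loss="mse", weighted_metrics=[])

x = np.random.rand(8, 4)
y = np.random.rand(8, 1)
model.evaluate(x, y, sample_weight=np.ones(8), verbose=0)
```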
- train_function = self.train_function - test_function = self.test_function - predict_function = self.predict_function - train_tf_function = self.train_tf_function - self.train_function = None - self.test_function = None - self.predict_function = None - self.train_tf_function = None - - children = super()._trackable_children(save_type, **kwargs) - - if save_type == 'savedmodel': - self.train_function = train_function - self.test_function = test_function - self.predict_function = predict_function - self.train_tf_function = train_tf_function - - return children - - def _should_eval(self, epoch, validation_freq): - epoch = epoch + 1 # one-index the user-facing epoch. - if isinstance(validation_freq, int): - return epoch % validation_freq == 0 - elif isinstance(validation_freq, list): - return epoch in validation_freq + # If there are constant inner dimensions, define a constant inner shape + if constant_inner_dimensions == 0: + constant_inner_shape = None else: - raise ValueError('Expected `validation_freq` to be a list or int. ' - f'Received: validation_freq={validation_freq} of the ' - f'type {type(validation_freq)}.') - - ###################################################################### - # Functions below exist only as v1 / v2 compatibility shims. - ###################################################################### - - def _get_compile_args(self, user_metrics=True): - """Used for saving or cloning a Model. - - Args: - user_metrics: Whether to return user-supplied metrics or `Metric` objects. - Defaults to returning the user-supplied metrics. - - Returns: - Dictionary of arguments that were used when compiling the model. - """ - self._assert_compile_was_called() - # pylint: disable=protected-access - - saved_metrics = self.compiled_metrics._user_metrics - saved_weighted_metrics = self.compiled_metrics._user_weighted_metrics - - if not user_metrics: - if saved_metrics is not None: - saved_metrics = self.compiled_metrics._metrics - if saved_weighted_metrics is not None: - saved_weighted_metrics = self.compiled_metrics._weighted_metrics - - compile_args = { - 'optimizer': self.optimizer, - 'loss': self.compiled_loss._user_losses, - 'metrics': saved_metrics, - 'weighted_metrics': saved_weighted_metrics, - 'loss_weights': self.compiled_loss._user_loss_weights, - } - # pylint: enable=protected-access - return compile_args - - def _get_callback_model(self): - return self - - def _in_multi_worker_mode(self): - return self.distribute_strategy.extended._in_multi_worker_mode() # pylint: disable=protected-access - - @property - def _compile_was_called(self): - return self._is_compiled - - def _save_new(self, dirpath): - return saving_lib.save(self, dirpath) - - -def reduce_per_replica(values, strategy, reduction='first'): - """Reduce PerReplica objects. - - Args: - values: Structure of `PerReplica` objects or `Tensor`s. `Tensor`s are - returned as-is. - strategy: `tf.distribute.Strategy` object. - reduction: One of 'first', 'concat'. - - Returns: - Structure of `Tensor`s. 
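For context (not part of the diff), `_should_eval` above is what gives `validation_freq` its two accepted forms; a sketch with illustrative data:

```python
import numpy as np
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
model.compile(optimizer="sgd", loss="mse")
x, y = np.random.rand(32, 2), np.random.rand(32, 1)

# An int validates every N epochs; a list names the exact (1-indexed)
# epochs on which to validate.
model.fit(x, y, validation_split=0.25, epochs=4, validation_freq=2, verbose=0)
model.fit(x, y, validation_split=0.25, epochs=10, validation_freq=[1, 2, 10], verbose=0)
```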
- """ - - def _reduce(v): - """Reduce a single `PerReplica` object.""" - if reduction == 'concat' and _collective_all_reduce_multi_worker(strategy): - return _multi_worker_concat(v, strategy) - if not _is_per_replica_instance(v): - return v - elif reduction == 'first': - return strategy.experimental_local_results(v)[0] - elif reduction == 'concat': - if _is_tpu_multi_host(strategy): - return _tpu_multi_host_concat(v, strategy) - else: - return concat(strategy.experimental_local_results(v)) + constant_inner_shape = tensors[0].shape[-constant_inner_dimensions:] + return tf.ragged.constant( + [tensor.numpy() for tensor in tensors], inner_shape=constant_inner_shape + ).merge_dims(0, 1) + + +def _reduce_dtensor_per_replica(value, strategy, reduction): + # Note that this function could happen in graph, so we can't just access + # the per-replica.values(), which will trigger unpack in graph and result + # into error. + # For now we will perform ops on dtensor instance directly on a global + # context. + dtensor = value._dtensor + if reduction == "first": + num_replica = strategy.num_replicas_in_sync + return tf.split(dtensor, num_replica, axis=0)[0] + elif reduction == "concat": + # Since dtensor is already in global context, the concat is a no-op + return dtensor + elif reduction == "sum": + return tf.reduce_sum(dtensor) else: - raise ValueError('`reduction` must be "first" or "concat". Received: ' - f'reduction={reduction}.') - - return tf.nest.map_structure(_reduce, values) - - -def concat(tensors, axis=0): - """Concats `tensor`s along `axis`.""" - if isinstance(tensors[0], tf.SparseTensor): - return tf.sparse.concat(axis=axis, sp_inputs=tensors) - return tf.concat(tensors, axis=axis) - - -def potentially_ragged_concat(tensors): - """Concats `Tensor`s along their first dimension. - - Args: - tensors: List of `Tensor`s. - - Returns: - Concatenation of the inputs along the first dimension -- of type `Tensor` - if all input shapes are compatible, or `RaggedTensor` if not. - """ - if len(tensors) == 1: - return tensors[0] - if isinstance(tensors[0], tf.SparseTensor): - return tf.sparse.concat(axis=0, sp_inputs=tensors) - elif isinstance(tensors[0], tf.RaggedTensor): - return tf.concat(tensors, axis=0) - elif not tf.__internal__.tf2.enabled(): - return tf.concat(tensors, axis=0) - - non_batch_shapes = tf.stack([tf.shape(tensor)[1:] for tensor in tensors]) - constant_dims = tf.math.reduce_all( - non_batch_shapes == non_batch_shapes[:1], axis=0) - if tf.math.reduce_all(constant_dims).numpy().item(): - # All non-batch dims are constant - return tf.concat(tensors, axis=0) - - # First, identify constant inner dimensions by finding the - # rightmost dimension that is not constant - constant_inner_dimensions = constant_dims.numpy().tolist()[::-1].index(False) - # If there are constant inner dimensions, define a constant inner shape - if constant_inner_dimensions == 0: - constant_inner_shape = None - else: - constant_inner_shape = tensors[0].shape[-constant_inner_dimensions:] - return tf.ragged.constant([tensor.numpy() for tensor in tensors], - inner_shape=constant_inner_shape).merge_dims(0, 1) + raise ValueError( + '`reduction` must be one of "first", "concat", "sum", or "auto". ' + f"Received: reduction={reduction}." 
+ ) def _get_verbosity(verbose, distribute_strategy): - """Find the right verbosity value for 'auto'.""" - if verbose == 1 and distribute_strategy._should_use_with_coordinator: # pylint: disable=protected-access - raise ValueError( - '`verbose=1` is not allowed with `ParameterServerStrategy` for ' - f'performance reasons. Received: verbose={verbose}') - if verbose == 'auto': - if (distribute_strategy._should_use_with_coordinator or # pylint: disable=protected-access - not io_utils.is_interactive_logging_enabled()): - # Default to epoch-level logging for PSStrategy or using absl logging. - return 2 - else: - return 1 # Default to batch-level logging otherwise. - return verbose + """Find the right verbosity value for 'auto'.""" + if verbose == 1 and distribute_strategy._should_use_with_coordinator: + raise ValueError( + "`verbose=1` is not allowed with `ParameterServerStrategy` for " + f"performance reasons. Received: verbose={verbose}" + ) + if verbose == "auto": + if ( + distribute_strategy._should_use_with_coordinator + or not io_utils.is_interactive_logging_enabled() + ): + # Defaults to epoch-level logging for PSStrategy or using absl + # logging. + return 2 + else: + return 1 # Defaults to batch-level logging otherwise. + return verbose def _is_tpu_multi_host(strategy): - return (backend.is_tpu_strategy(strategy) and - strategy.extended.num_hosts > 1) + return backend.is_tpu_strategy(strategy) and strategy.extended.num_hosts > 1 def _tpu_multi_host_concat(v, strategy): - """Correctly order TPU PerReplica objects.""" - replicas = strategy.experimental_local_results(v) - # When distributed datasets are created from Tensors / NumPy, - # TPUStrategy.experimental_distribute_dataset shards data in - # (Replica, Host) order, and TPUStrategy.experimental_local_results returns - # it in (Host, Replica) order. - # TODO(b/150317897): Figure out long-term plan here. - num_replicas_per_host = strategy.extended.num_replicas_per_host - ordered_replicas = [] - for replica_id in range(num_replicas_per_host): - ordered_replicas += replicas[replica_id::num_replicas_per_host] - return concat(ordered_replicas) + """Correctly order TPU PerReplica objects.""" + replicas = strategy.experimental_local_results(v) + # When distributed datasets are created from Tensors / NumPy, + # TPUStrategy.experimental_distribute_dataset shards data in + # (Replica, Host) order, and TPUStrategy.experimental_local_results returns + # it in (Host, Replica) order. + # TODO(b/150317897): Figure out long-term plan here. 
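A hedged illustration (not part of the diff) of the ragged fallback in `potentially_ragged_concat` above, built from the same public ops the function uses; the shapes are made up.

```python
import tensorflow as tf

# Two prediction batches whose middle dimension varies but whose
# innermost dimension (4) is constant.
a = tf.zeros([2, 3, 4])
b = tf.zeros([2, 5, 4])

# Mirrors the fallback above: wrap the batches in a RaggedTensor with
# the constant inner shape, then merge the two leading dimensions.
merged = tf.ragged.constant(
    [a.numpy(), b.numpy()], inner_shape=(4,)
).merge_dims(0, 1)
print(merged.shape)  # (4, None, 4)
```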
+ num_replicas_per_host = strategy.extended.num_replicas_per_host + ordered_replicas = [] + for replica_id in range(num_replicas_per_host): + ordered_replicas += replicas[replica_id::num_replicas_per_host] + return concat(ordered_replicas) def _collective_all_reduce_multi_worker(strategy): - return (isinstance(strategy, - tf.distribute.MultiWorkerMirroredStrategy) - ) and strategy.extended._in_multi_worker_mode() # pylint: disable=protected-access + return ( + isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) + ) and strategy.extended._in_multi_worker_mode() # TODO(wxinyi): merge this with _tpu_multi_host_concat once we have all_gather # for all strategies def _multi_worker_concat(v, strategy): - """Order PerReplica objects for CollectiveAllReduceStrategy and concat.""" - replicas = strategy.gather(v, axis=0) - # v might not have the same shape on different replicas - if _is_per_replica_instance(v): - shapes = tf.concat([ - tf.expand_dims(tf.shape(single_value)[0], axis=0) - for single_value in v.values - ], axis=0) - all_shapes = strategy.gather(shapes, axis=0) - else: - # v is a tensor. This may happen when, say, we have 2x1 multi-worker. - all_shapes = strategy.gather( - tf.expand_dims(tf.shape(v)[0], axis=0), axis=0) - - replicas = tf.split( - replicas, - num_or_size_splits=all_shapes, - num=strategy.num_replicas_in_sync) - ordered_replicas = [] - num_replicas_per_worker = len(strategy.extended.worker_devices) - for replica_id in range(num_replicas_per_worker): - ordered_replicas += replicas[replica_id::num_replicas_per_worker] - return concat(ordered_replicas) + """Order PerReplica objects for CollectiveAllReduceStrategy and concat.""" + replicas = strategy.gather(v, axis=0) + # v might not have the same shape on different replicas + if _is_per_replica_instance(v): + shapes = tf.concat( + [ + tf.expand_dims(tf.shape(single_value)[0], axis=0) + for single_value in v.values + ], + axis=0, + ) + all_shapes = strategy.gather(shapes, axis=0) + else: + # v is a tensor. This may happen when, say, we have 2x1 multi-worker. + all_shapes = strategy.gather( + tf.expand_dims(tf.shape(v)[0], axis=0), axis=0 + ) + + replicas = tf.split( + replicas, + num_or_size_splits=all_shapes, + num=strategy.num_replicas_in_sync, + ) + ordered_replicas = [] + num_replicas_per_worker = len(strategy.extended.worker_devices) + for replica_id in range(num_replicas_per_worker): + ordered_replicas += replicas[replica_id::num_replicas_per_worker] + return concat(ordered_replicas) def _is_scalar(x): - return isinstance(x, (tf.Tensor, tf.Variable)) and x.shape.rank == 0 + return isinstance(x, (tf.Tensor, tf.Variable)) and x.shape.rank == 0 def _minimum_control_deps(outputs): - """Returns the minimum control dependencies to ensure step succeeded.""" - if tf.executing_eagerly(): - return [] # Control dependencies not needed. - outputs = tf.nest.flatten(outputs, expand_composites=True) - for out in outputs: - # Variables can't be control dependencies. - if not isinstance(out, tf.Variable): - return [out] # Return first Tensor or Op from outputs. - return [] # No viable Tensor or Op to use for control deps. + """Returns the minimum control dependencies to ensure step succeeded.""" + if tf.executing_eagerly(): + return [] # Control dependencies not needed. + outputs = tf.nest.flatten(outputs, expand_composites=True) + for out in outputs: + # Variables can't be control dependencies. + if not isinstance(out, tf.Variable): + return [out] # Return first Tensor or Op from outputs. 
+ return [] # No viable Tensor or Op to use for control deps. def _disallow_inside_tf_function(method_name): - if tf.inside_function(): - error_msg = ( - 'Detected a call to `Model.{method_name}` inside a `tf.function`. ' - '`Model.{method_name} is a high-level endpoint that manages its own ' - '`tf.function`. Please move the call to `Model.{method_name}` outside ' - 'of all enclosing `tf.function`s. Note that you can call a `Model` ' - 'directly on `Tensor`s inside a `tf.function` like: `model(x)`.' - ).format(method_name=method_name) - raise RuntimeError(error_msg) - - -def _detect_save_format(filepath): - """Returns path to weights file and save format.""" - - filepath = io_utils.path_to_string(filepath) - if saving_utils.is_hdf5_filepath(filepath): - return filepath, 'h5' - - # Filepath could be a TensorFlow checkpoint file prefix or SavedModel - # directory. It's possible for filepath to be both a prefix and directory. - # Prioritize checkpoint over SavedModel. - if _is_readable_tf_checkpoint(filepath): - save_format = 'tf' - elif tf.saved_model.contains_saved_model(filepath): - ckpt_path = os.path.join(filepath, tf.saved_model.VARIABLES_DIRECTORY, - tf.saved_model.VARIABLES_FILENAME) - if _is_readable_tf_checkpoint(ckpt_path): - filepath = ckpt_path - save_format = 'tf' - else: - raise ValueError('Unable to load weights. filepath {} appears to be a ' - 'SavedModel directory, but checkpoint either doesn\'t ' - 'exist, or is incorrectly formatted.'.format(filepath)) - else: - # Not a TensorFlow checkpoint. This filepath is likely an H5 file that - # doesn't have the hdf5/keras extensions. - save_format = 'h5' - return filepath, save_format - - -def _is_readable_tf_checkpoint(filepath): - try: - tf.compat.v1.train.NewCheckpointReader(filepath) - return True - except tf.errors.DataLossError: - # The checkpoint is not readable in TensorFlow format. - return False + if tf.inside_function(): + error_msg = ( + "Detected a call to `Model.{method_name}` inside a `tf.function`. " + "`Model.{method_name} is a high-level endpoint that manages its " + "own `tf.function`. Please move the call to `Model.{method_name}` " + "outside of all enclosing `tf.function`s. Note that you can call a " + "`Model` directly on `Tensor`s inside a `tf.function` like: " + "`model(x)`." 
+ ).format(method_name=method_name) + raise RuntimeError(error_msg) def flatten_metrics_in_order(logs, metrics_names): - """Turns the `logs` dict into a list as per key order of `metrics_names`.""" - results = [] - for name in metrics_names: - if name in logs: - results.append(logs[name]) - for key in sorted(logs.keys()): - if key not in metrics_names: - results.append(logs[key]) - if len(results) == 1: - return results[0] - return results + """Turns the `logs` dict into a list as per key order of `metrics_names`.""" + results = [] + for name in metrics_names: + if name in logs: + results.append(logs[name]) + for key in sorted(logs.keys()): + if key not in metrics_names: + results.append(logs[key]) + if len(results) == 1: + return results[0] + return results def _is_per_replica_instance(obj): - return (isinstance(obj, tf.distribute.DistributedValues) and - isinstance(obj, tf.__internal__.CompositeTensor)) + return isinstance(obj, tf.distribute.DistributedValues) and isinstance( + obj, tf.__internal__.CompositeTensor + ) -def disable_multi_worker(method): - """Decorator that disallows multi-worker use of `method`.""" +def _is_dtensor_per_replica_instance(obj): + # This is a temp check for DTensorDistributedValue, which is not public API + # yet. + # TODO(scottzhu): Move to more stable API when dtensor based strategy is + # ready. + return isinstance(obj, tf.distribute.DistributedValues) and hasattr( + obj, "_dtensor" + ) - def _method_wrapper(self, *args, **kwargs): - if self._in_multi_worker_mode(): # pylint: disable=protected-access - raise ValueError(f'{method.__name__} is not supported in multi-worker ' - 'mode. Please use a non-multi-worker ' - '`tf.distribute.Strategy` such as ' - '`tf.distribute.MirroredStrategy`.') - return method(self, *args, **kwargs) - return tf.__internal__.decorator.make_decorator( - target=method, decorator_func=_method_wrapper) +def disable_multi_worker(method): + """Decorator that disallows multi-worker use of `method`.""" + + def _method_wrapper(self, *args, **kwargs): + if self._in_multi_worker_mode(): + raise ValueError( + f"{method.__name__} is not supported in multi-worker " + "mode. Please use a non-multi-worker " + "`tf.distribute.Strategy` such as " + "`tf.distribute.MirroredStrategy`." + ) + return method(self, *args, **kwargs) + + return tf.__internal__.decorator.make_decorator( + target=method, decorator_func=_method_wrapper + ) def inject_functional_model_class(cls): - """Inject `Functional` into the hierarchy of this class if needed.""" - from keras.engine import functional # pylint: disable=g-import-not-at-top - from keras.engine import training_v1 # pylint: disable=g-import-not-at-top - if cls == Model or cls == training_v1.Model: - return functional.Functional - # In case there is any multiple inheritance, we stop injecting the - # class if keras model is not in its class hierarchy. - if cls == object: - return object - - cls.__bases__ = tuple(inject_functional_model_class(base) - for base in cls.__bases__) - # Trigger any `__new__` class swapping that needed to happen on `Functional` - # but did not because functional was not in the class hierarchy. - cls.__new__(cls) + """Inject `Functional` into the hierarchy of this class if needed.""" + from keras.engine import functional + from keras.engine import training_v1 + + if cls == Model or cls == training_v1.Model: + return functional.Functional + # In case there is any multiple inheritance, we stop injecting the + # class if keras model is not in its class hierarchy. 
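A minimal sketch (not part of the diff) of the boundary `_disallow_inside_tf_function` draws; the model and shapes are illustrative.

```python
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(4,))])

@tf.function
def forward(x):
    # Calling the model directly inside a tf.function is allowed...
    return model(x)

forward(tf.zeros([2, 4]))
# ...whereas calling `model.predict(x)` inside `forward` would raise a
# RuntimeError, since `predict` manages its own tf.function.
```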
+ if cls == object: + return object + + cls.__bases__ = tuple( + inject_functional_model_class(base) for base in cls.__bases__ + ) + # Trigger any `__new__` class swapping that needed to happen on `Functional` + # but did not because functional was not in the class hierarchy. + cls.__new__(cls) - return cls + return cls def is_functional_model_init_params(args, kwargs): - return (len(args) == 2 or - len(args) == 1 and 'outputs' in kwargs or - 'inputs' in kwargs and 'outputs' in kwargs) + # Both inputs and outputs in args + if len(args) == 2: + return True + # Both inputs in args, outputs in kwargs + if len(args) == 1 and "outputs" in kwargs: + return True + # Both in kwargs + if "inputs" in kwargs and "outputs" in kwargs: + return True + return False diff --git a/keras/engine/training_arrays_test.py b/keras/engine/training_arrays_test.py index f94d6b46c79b..cf85bafc3a25 100644 --- a/keras/engine/training_arrays_test.py +++ b/keras/engine/training_arrays_test.py @@ -14,226 +14,255 @@ # ============================================================================== """Tests for model.fit calls with a Dataset object passed as validation_data.""" -import tensorflow.compat.v2 as tf - import io import sys from unittest import mock -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from tensorflow.python.framework import test_util as tf_test_utils from keras.engine import data_adapter +from keras.layers import core from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.layers import core from keras.utils import io_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + def _create_dataset(num_samples, batch_size): - input_data = np.random.rand(num_samples, 1) - expected_data = input_data * 3 - dataset = tf.data.Dataset.from_tensor_slices((input_data, expected_data)) - return dataset.shuffle(10 * batch_size).batch(batch_size) + input_data = np.random.rand(num_samples, 1) + expected_data = input_data * 3 + dataset = tf.data.Dataset.from_tensor_slices((input_data, expected_data)) + return dataset.shuffle(10 * batch_size).batch(batch_size) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes(always_skip_v1=True) -class ValidationDatasetAndValidationSplit(test_combinations.TestCase, - parameterized.TestCase): - """Verifies when validation_data is provided validation_split is ignored. - - The validation_split arg can't be passed in v1 mode because - training_utils_v1.py:validate_dataset_input will raise a ValueError that - validation_split is not supported when input x is a dataset or a dataset - iterator. - """ - - @parameterized.named_parameters(("with_default_falsey_validation_split", 0.), - ("with_non_falsey_validation_split", 0.1)) - def test_ignore_validation_split_when_validation_dataset_is_present( - self, validation_split): - # Create a model that learns y=Mx. - layers = [core.Dense(1)] - model = test_utils.get_model_from_layers(layers, input_shape=(1,)) - model.compile(loss="mse", optimizer="adam", metrics=["mean_absolute_error"]) - - train_dataset = _create_dataset(num_samples=200, batch_size=10) - eval_dataset = _create_dataset(num_samples=50, batch_size=25) - - # Make sure model.fit doesn't raise an error because of the mocking alone. 
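For reference (not part of the diff), the three call shapes the rewritten predicate accepts, shown with illustrative tensors:

```python
from tensorflow import keras

inp = keras.Input(shape=(4,))
out = keras.layers.Dense(1)(inp)

# Each spelling is detected as a functional-model constructor call.
m1 = keras.Model(inp, out)                 # both positional
m2 = keras.Model(inp, outputs=out)         # inputs positional
m3 = keras.Model(inputs=inp, outputs=out)  # both keyword
```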
- mock_train_validation_split_return = ((train_dataset, None, None), - eval_dataset) - - with mock.patch.object( - data_adapter, - "train_validation_split", - return_value=mock_train_validation_split_return - ) as mock_train_validation_split: - model.fit( - x=train_dataset, - validation_split=validation_split, - validation_data=eval_dataset, - epochs=2) - mock_train_validation_split.assert_not_called() - - history = model.fit( - x=train_dataset, validation_data=eval_dataset, epochs=2) - evaluation = model.evaluate(x=eval_dataset) - - # See test_validation_dataset_with_no_step_arg for details. - self.assertAlmostEqual( - history.history["val_mean_absolute_error"][-1], - evaluation[-1], - places=5) +class ValidationDatasetAndValidationSplit( + test_combinations.TestCase, parameterized.TestCase +): + """Verifies when validation_data is provided validation_split is ignored. + + The validation_split arg can't be passed in v1 mode because + training_utils_v1.py:validate_dataset_input will raise a ValueError that + validation_split is not supported when input x is a dataset or a dataset + iterator. + """ + + @parameterized.named_parameters( + ("with_default_falsey_validation_split", 0.0), + ("with_non_falsey_validation_split", 0.1), + ) + def test_ignore_validation_split_when_validation_dataset_is_present( + self, validation_split + ): + # Create a model that learns y=Mx. + layers = [core.Dense(1)] + model = test_utils.get_model_from_layers(layers, input_shape=(1,)) + model.compile( + loss="mse", optimizer="adam", metrics=["mean_absolute_error"] + ) + + train_dataset = _create_dataset(num_samples=200, batch_size=10) + eval_dataset = _create_dataset(num_samples=50, batch_size=25) + + # Make sure model.fit doesn't raise an error because of the mocking + # alone. + mock_train_validation_split_return = ( + (train_dataset, None, None), + eval_dataset, + ) + + with mock.patch.object( + data_adapter, + "train_validation_split", + return_value=mock_train_validation_split_return, + ) as mock_train_validation_split: + model.fit( + x=train_dataset, + validation_split=validation_split, + validation_data=eval_dataset, + epochs=2, + ) + mock_train_validation_split.assert_not_called() + + history = model.fit( + x=train_dataset, validation_data=eval_dataset, epochs=2 + ) + evaluation = model.evaluate(x=eval_dataset) + + # See test_validation_dataset_with_no_step_arg for details. + self.assertAlmostEqual( + history.history["val_mean_absolute_error"][-1], + evaluation[-1], + places=5, + ) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes class ValidationDatasetNoLimitTest(test_combinations.TestCase): - - def test_validation_dataset_with_no_step_arg(self): - # Create a model that learns y=Mx. - layers = [core.Dense(1)] - model = test_utils.get_model_from_layers(layers, input_shape=(1,)) - model.compile(loss="mse", optimizer="adam", metrics=["mean_absolute_error"]) - - train_dataset = _create_dataset(num_samples=200, batch_size=10) - eval_dataset = _create_dataset(num_samples=50, batch_size=25) - - history = model.fit(x=train_dataset, validation_data=eval_dataset, epochs=2) - evaluation = model.evaluate(x=eval_dataset) - - # If the fit call used the entire dataset, then the final val MAE error - # from the fit history should be equal to the final element in the output - # of evaluating the model on the same eval dataset. 
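A condensed sketch (not part of the diff) of the behavior this test pins down, with arbitrary synthetic data:

```python
import numpy as np
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(1,))])
model.compile(loss="mse", optimizer="adam")

x = np.random.rand(200, 1)
train_ds = tf.data.Dataset.from_tensor_slices((x, x * 3)).batch(10)
eval_ds = tf.data.Dataset.from_tensor_slices((x[:50], x[:50] * 3)).batch(25)

# With `validation_data` present, `validation_split` is silently
# ignored instead of raising, even for dataset inputs.
model.fit(train_ds, validation_split=0.1, validation_data=eval_ds, epochs=1, verbose=0)
```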
- self.assertAlmostEqual(history.history["val_mean_absolute_error"][-1], - evaluation[-1], places=5) - - -class PrintTrainingInfoTest(test_combinations.TestCase, - parameterized.TestCase): - - @tf_test_utils.run_v1_only("Only relevant in graph mode.") - def test_print_info_with_datasets(self): - """Print training info should work with val datasets (b/133391839).""" - - model = keras.models.Sequential([keras.layers.Dense(1, input_shape=(1,))]) - model.compile(loss="mse", optimizer="sgd") - - dataset = tf.data.Dataset.from_tensors( - ([1.], [1.])).repeat(100).batch(10) - - val_dataset = tf.data.Dataset.from_tensors( - ([1.], [1.])).repeat(50).batch(10) - - mock_stdout = io.StringIO() - io_utils.enable_interactive_logging() - with tf.compat.v1.test.mock.patch.object(sys, "stdout", mock_stdout): - model.fit(dataset, epochs=2, validation_data=val_dataset) - - self.assertIn( - "Train on 10 steps, validate on 5 steps", mock_stdout.getvalue()) - - @parameterized.named_parameters( - ("with_validation", True), ("without_validation", False)) - @tf_test_utils.run_v1_only("Only relevant in graph mode.") - def test_print_info_with_numpy(self, do_validation): - """Print training info should work with val datasets (b/133391839).""" - - model = keras.models.Sequential([keras.layers.Dense(1, input_shape=(2,))]) - model.compile(loss="mse", optimizer="sgd") - - dataset = np.arange(200).reshape(100, 2) - - if do_validation: - val_data = (np.arange(100).reshape(50, 2), np.arange(50).reshape(50, 1)) - else: - val_data = None - - mock_stdout = io.StringIO() - with tf.compat.v1.test.mock.patch.object(sys, "stdout", mock_stdout): - model.fit(dataset, batch_size=10, epochs=2, validation_data=val_data) - - self.assertIn("Train on 100 samples", mock_stdout.getvalue()) - - if do_validation: - self.assertIn(", validate on 50 samples", mock_stdout.getvalue()) - - @test_combinations.run_all_keras_modes - def test_dict_float64_input(self): - - class MyModel(keras.Model): - - def __init__(self): - super().__init__(self) - self.dense1 = keras.layers.Dense(10, activation="relu") - self.dense2 = keras.layers.Dense(10, activation="relu") - self.concat = keras.layers.Concatenate() - self.dense3 = keras.layers.Dense(1, activation="sigmoid") - - def call(self, inputs): - d1 = self.dense1(inputs["one"]) - d2 = self.dense2(inputs["two"]) - concat = self.concat([d1, d2]) - return self.dense3(concat) - - model = MyModel() - model.compile( - loss="mae", - optimizer="adam", - run_eagerly=test_utils.should_run_eagerly()) - - model.fit( - x={ - "one": np.random.rand(100, 10, 1), - "two": np.random.rand(100, 10, 1) - }, - y=np.random.rand(100, 10, 1)) - - def test_dict_validation_input(self): - """Test case for GitHub issue 30122.""" - train_input_0 = np.random.rand(1000, 1) - train_input_1 = np.random.rand(1000, 1) - train_labels = np.random.rand(1000, 1) - val_input_0 = np.random.rand(1000, 1) - val_input_1 = np.random.rand(1000, 1) - val_labels = np.random.rand(1000, 1) - - input_0 = keras.Input(shape=(None,), name="input_0") - input_1 = keras.Input(shape=(None,), name="input_1") - - class my_model(keras.Model): - - def __init__(self): - super().__init__(self) - self.hidden_layer_0 = keras.layers.Dense(100, activation="relu") - self.hidden_layer_1 = keras.layers.Dense(100, activation="relu") - self.concat = keras.layers.Concatenate() - self.out_layer = keras.layers.Dense(1, activation="sigmoid") - - def call(self, inputs=[input_0, input_1]): - activation_0 = self.hidden_layer_0(inputs["input_0"]) - activation_1 = 
self.hidden_layer_1(inputs["input_1"]) - concat = self.concat([activation_0, activation_1]) - return self.out_layer(concat) - - model = my_model() - model.compile(loss="mae", optimizer="adam") - - model.fit( - x={ - "input_0": train_input_0, - "input_1": train_input_1 - }, - y=train_labels, - validation_data=({ - "input_0": val_input_0, - "input_1": val_input_1 - }, val_labels)) + def test_validation_dataset_with_no_step_arg(self): + # Create a model that learns y=Mx. + layers = [core.Dense(1)] + model = test_utils.get_model_from_layers(layers, input_shape=(1,)) + model.compile( + loss="mse", optimizer="adam", metrics=["mean_absolute_error"] + ) + + train_dataset = _create_dataset(num_samples=200, batch_size=10) + eval_dataset = _create_dataset(num_samples=50, batch_size=25) + + history = model.fit( + x=train_dataset, validation_data=eval_dataset, epochs=2 + ) + evaluation = model.evaluate(x=eval_dataset) + + # If the fit call used the entire dataset, then the final val MAE error + # from the fit history should be equal to the final element in the + # output of evaluating the model on the same eval dataset. + self.assertAlmostEqual( + history.history["val_mean_absolute_error"][-1], + evaluation[-1], + places=5, + ) + + +class PrintTrainingInfoTest(test_combinations.TestCase, parameterized.TestCase): + @tf_test_utils.run_v1_only("Only relevant in graph mode.") + def test_print_info_with_datasets(self): + """Print training info should work with val datasets (b/133391839).""" + + model = keras.models.Sequential( + [keras.layers.Dense(1, input_shape=(1,))] + ) + model.compile(loss="mse", optimizer="sgd") + + dataset = ( + tf.data.Dataset.from_tensors(([1.0], [1.0])).repeat(100).batch(10) + ) + + val_dataset = ( + tf.data.Dataset.from_tensors(([1.0], [1.0])).repeat(50).batch(10) + ) + + mock_stdout = io.StringIO() + io_utils.enable_interactive_logging() + with tf.compat.v1.test.mock.patch.object(sys, "stdout", mock_stdout): + model.fit(dataset, epochs=2, validation_data=val_dataset) + + self.assertIn( + "Train on 10 steps, validate on 5 steps", mock_stdout.getvalue() + ) + + @parameterized.named_parameters( + ("with_validation", True), ("without_validation", False) + ) + @tf_test_utils.run_v1_only("Only relevant in graph mode.") + def test_print_info_with_numpy(self, do_validation): + """Print training info should work with val datasets (b/133391839).""" + + model = keras.models.Sequential( + [keras.layers.Dense(1, input_shape=(2,))] + ) + model.compile(loss="mse", optimizer="sgd") + + dataset = np.arange(200).reshape(100, 2) + + if do_validation: + val_data = ( + np.arange(100).reshape(50, 2), + np.arange(50).reshape(50, 1), + ) + else: + val_data = None + + mock_stdout = io.StringIO() + with tf.compat.v1.test.mock.patch.object(sys, "stdout", mock_stdout): + model.fit( + dataset, batch_size=10, epochs=2, validation_data=val_data + ) + + self.assertIn("Train on 100 samples", mock_stdout.getvalue()) + + if do_validation: + self.assertIn(", validate on 50 samples", mock_stdout.getvalue()) + + @test_combinations.run_all_keras_modes + def test_dict_float64_input(self): + class MyModel(keras.Model): + def __init__(self): + super().__init__(self) + self.dense1 = keras.layers.Dense(10, activation="relu") + self.dense2 = keras.layers.Dense(10, activation="relu") + self.concat = keras.layers.Concatenate() + self.dense3 = keras.layers.Dense(1, activation="sigmoid") + + def call(self, inputs): + d1 = self.dense1(inputs["one"]) + d2 = self.dense2(inputs["two"]) + concat = self.concat([d1, d2]) + return 
self.dense3(concat) + + model = MyModel() + model.compile( + loss="mae", + optimizer="adam", + run_eagerly=test_utils.should_run_eagerly(), + ) + + model.fit( + x={ + "one": np.random.rand(100, 10, 1), + "two": np.random.rand(100, 10, 1), + }, + y=np.random.rand(100, 10, 1), + ) + + def test_dict_validation_input(self): + """Test case for GitHub issue 30122.""" + train_input_0 = np.random.rand(1000, 1) + train_input_1 = np.random.rand(1000, 1) + train_labels = np.random.rand(1000, 1) + val_input_0 = np.random.rand(1000, 1) + val_input_1 = np.random.rand(1000, 1) + val_labels = np.random.rand(1000, 1) + + input_0 = keras.Input(shape=(None,), name="input_0") + input_1 = keras.Input(shape=(None,), name="input_1") + + class my_model(keras.Model): + def __init__(self): + super().__init__(self) + self.hidden_layer_0 = keras.layers.Dense(100, activation="relu") + self.hidden_layer_1 = keras.layers.Dense(100, activation="relu") + self.concat = keras.layers.Concatenate() + self.out_layer = keras.layers.Dense(1, activation="sigmoid") + + def call(self, inputs=[input_0, input_1]): + activation_0 = self.hidden_layer_0(inputs["input_0"]) + activation_1 = self.hidden_layer_1(inputs["input_1"]) + concat = self.concat([activation_0, activation_1]) + return self.out_layer(concat) + + model = my_model() + model.compile(loss="mae", optimizer="adam") + + model.fit( + x={"input_0": train_input_0, "input_1": train_input_1}, + y=train_labels, + validation_data=( + {"input_0": val_input_0, "input_1": val_input_1}, + val_labels, + ), + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/engine/training_arrays_v1.py b/keras/engine/training_arrays_v1.py index 463511009263..a3920e2a1a6b 100644 --- a/keras/engine/training_arrays_v1.py +++ b/keras/engine/training_arrays_v1.py @@ -14,693 +14,795 @@ # ============================================================================== """Part of the Keras training engine related to plain array data.""" -import tensorflow.compat.v2 as tf -# pylint: disable=protected-access - import functools import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import callbacks as cbks from keras.distribute import distributed_training_utils_v1 from keras.engine import training_utils_v1 +from keras.utils import io_utils from keras.utils.generic_utils import make_batches from keras.utils.generic_utils import slice_arrays -from keras.utils import io_utils from keras.utils.mode_keys import ModeKeys + +# isort: off from tensorflow.python.platform import tf_logging as logging + try: - from scipy.sparse import issparse # pylint: disable=g-import-not-at-top + from scipy.sparse import issparse except ImportError: - issparse = None - - -def model_iteration(model, - inputs, - targets=None, - sample_weights=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - val_inputs=None, - val_targets=None, - val_sample_weights=None, - shuffle=True, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - mode=ModeKeys.TRAIN, - validation_in_fit=False, - prepared_feed_values_from_dataset=False, - steps_name='steps', - **kwargs): - """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. - - Args: - model: Keras Model instance. - inputs: Either a list or dictionary of arrays, or a dataset instance. - targets: List/dictionary of input arrays. - sample_weights: Optional list of sample weight arrays. - batch_size: Integer batch size or None if unknown. 
- epochs: Number of times to iterate over the data - verbose: 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = one line per epoch. - Note that the progress bar is not particularly useful when - logged to a file, so verbose=2 is recommended when not running - interactively (eg, in a production environment). - callbacks: List of callbacks to be called during training - val_inputs: Either a list or dictionary of arrays, or a dataset instance. - val_targets: List/dictionary of target arrays. - val_sample_weights: Optional list of sample weight arrays. - shuffle: Whether to shuffle the data at the beginning of each epoch - concatenation of list the display names of the outputs of `f` and the - list of display names of the outputs of `f_val`. - initial_epoch: Epoch at which to start training (useful for resuming a - previous training run) - steps_per_epoch: Total number of steps (batches of samples) before - declaring one epoch finished and starting the next epoch. Ignored with - the default value of `None`. - validation_steps: Number of steps to run validation for (only if doing - validation from data tensors). Ignored with the default value of - `None`. - validation_freq: Only relevant if validation data is provided. Integer or - `collections.abc.Container` instance (e.g. list, tuple, etc.). If an - integer, specifies how many training epochs to run before a new - validation run is performed, e.g. `validation_freq=2` runs - validation every 2 epochs. If a Container, specifies the epochs on - which to run validation, e.g. `validation_freq=[1, 2, 10]` runs - validation at the end of the 1st, 2nd, and 10th epochs. - mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. - validation_in_fit: if true, then this method is invoked from within - training iteration (for validation). In the case where `val_inputs` is - a dataset, this flag indicates that its iterator and feed values are - already created so should properly reuse resources. - prepared_feed_values_from_dataset: if True, `inputs` is a list of feed - tensors returned from `_prepare_feed_values` call on the validation - dataset, so do not call it again on `inputs`. Should only be used for - inline validation (i.e., only if `validation_in_fit` is also True). - steps_name: The string name of the steps argument, either `steps`, - `validation_steps`, or `steps_per_epoch`. Only used for error message - formatting. - **kwargs: Additional arguments for backwards compatibility. - - Returns: - - In TRAIN mode: `History` object. - - In TEST mode: Evaluation metrics. - - In PREDICT mode: Outputs of the Model called on inputs. - - Raises: - ValueError: in case of invalid arguments. - """ - # Backwards compatibility. - if 'steps' in kwargs: - steps_per_epoch = kwargs.pop('steps') - if kwargs: - raise TypeError('Unknown arguments: %s' % (kwargs,)) - - # In case we were passed a dataset, we extract symbolic tensors from it. - reset_dataset_after_each_epoch = False - input_iterator = None - is_dataset = isinstance(inputs, - (tf.compat.v1.data.Dataset, tf.data.Dataset)) - # TODO(fchollet): consider moving `steps_per_epoch` inference to - # _standardize_user_data and set reset_dataset_after_each_epoch as an - # attribute on the dataset instance. 
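As the docstring above notes, `verbose=2` suits non-interactive runs; a small sketch (not part of the diff) with throwaway data:

```python
import numpy as np
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
model.compile(optimizer="sgd", loss="mse")
x, y = np.random.rand(100, 2), np.random.rand(100, 1)

# verbose=2 prints one line per epoch, keeping redirected logs readable
# where a progress bar (verbose=1) would produce noise.
model.fit(x, y, epochs=2, verbose=2)
```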
- if is_dataset: - if steps_per_epoch is None: - reset_dataset_after_each_epoch = True - steps_per_epoch = training_utils_v1.infer_steps_for_dataset( - model, inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name) - input_iterator = _get_iterator(inputs, model._distribution_strategy) - - # Enter tf.distribute.Strategy scope. - if model._distribution_strategy: - scope = distributed_training_utils_v1.distributed_scope( - strategy=model._distribution_strategy, - learning_phase=(1 if mode == ModeKeys.TRAIN else 0)) - scope.__enter__() - - use_steps = is_dataset or steps_per_epoch is not None - do_validation = val_inputs is not None - - # Prepare input data. - inputs = input_iterator or inputs - if validation_in_fit and prepared_feed_values_from_dataset: - # When invoking validation in training loop, avoid creating iterator and - # list of feed values for the same validation dataset multiple times (which - # essentially would call `iterator.get_next()` that slows down execution and - # leads to OOM errors eventually. - ins = inputs - else: - ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode) - # `ins` is a function when a distribute strategy is used in Eager mode. In - # that case `is_dataset` is True. The code branches that have requirements - # about the type of `ins` do not trigger in the distributed case. - - if not is_dataset: - num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size, - steps_per_epoch) - else: - num_samples_or_steps = steps_per_epoch - - # Update sample_weight_mode of the model if sample_weights is specified by the - # user. We need to call this function after we have a handle on the inputs - # (both numpy arrays and datasets) in order to determine if the user has - # specified sample_weights. - _update_sample_weight_mode(model, mode, ins) - - # Get step function and loop type. As part of building the execution - # function we recompile the metrics based on the updated - # sample_weight_mode value. - f = _make_execution_function(model, mode) - - # Prepare validation data. Hold references to the iterator and the input list - # to properly reinitialize and reuse in multiple validation passes. - val_iterator = None - if isinstance(val_inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset)): - if validation_steps is None: - # Because we pass an iterator feed instead of a Dataset to the eval - # model_iteration() call, it will not trigger the dataset-input path - # that determines the number of steps required. To avoid this issue, - # set validation_steps here if validation_steps is None. - validation_steps = training_utils_v1.infer_steps_for_dataset( - model, - val_inputs, - validation_steps, - epochs=epochs, - steps_name='validation_steps') - val_iterator = _get_iterator(val_inputs, model._distribution_strategy) - val_inputs = _prepare_feed_values( - model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST) - # Get num steps for printing. - val_samples_or_steps = validation_steps - else: - # Get num samples for printing. - val_samples_or_steps = val_inputs and tf.nest.flatten( - val_inputs)[0].shape[0] or None - - if mode == ModeKeys.TRAIN and verbose: - _print_train_info(num_samples_or_steps, val_samples_or_steps, is_dataset) - - # Configure callbacks. 
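A public-API approximation (not the helper itself, and not part of the diff) of the decision `infer_steps_for_dataset` makes when inferring `steps_per_epoch`:

```python
import tensorflow as tf

ds = tf.data.Dataset.range(100).batch(10)

# A finite, known cardinality yields a concrete step count; an unknown
# or infinite one (e.g. after `ds.repeat()`) leaves steps as None, so
# the loop runs until the dataset raises OutOfRangeError.
card = tf.data.experimental.cardinality(ds)
if (card == tf.data.experimental.UNKNOWN_CARDINALITY
        or card == tf.data.experimental.INFINITE_CARDINALITY):
    steps = None
else:
    steps = int(card)
print(steps)  # 10
```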
- count_mode = 'steps' if use_steps else 'samples' - callbacks = cbks.configure_callbacks( - callbacks, - model, - do_validation=do_validation, - batch_size=batch_size, - epochs=epochs, - steps_per_epoch=steps_per_epoch, - samples=num_samples_or_steps, - count_mode=count_mode, - verbose=verbose, - mode=mode) - - # Find beforehand arrays that need sparse-to-dense conversion. - if issparse is not None and not use_steps: - indices_for_conversion_to_dense = [] - feed = _get_model_feed(model, mode) - for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)): - if issparse(input_data) and not backend.is_sparse(feed_tensor): - indices_for_conversion_to_dense.append(i) - - # Select aggregation method. - if mode == ModeKeys.PREDICT: - aggregator = training_utils_v1.OutputsAggregator( - use_steps, - num_samples=None if steps_per_epoch else num_samples_or_steps, - steps=steps_per_epoch) - else: - aggregator = training_utils_v1.MetricsAggregator( - use_steps, - num_samples=None if steps_per_epoch else num_samples_or_steps, - steps=steps_per_epoch) - - if model._compile_distribution: - distributed_training_utils_v1._copy_weights_to_distributed_model( - model, mode) - - callbacks.model.stop_training = False - callbacks._call_begin_hook(mode) - - initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) - - for epoch in range(initial_epoch, epochs): - if callbacks.model.stop_training: - break - - # Setup work for each epoch - epoch_logs = {} - if mode != ModeKeys.PREDICT: - # Collecting and resetting metrics has non-zero cost and will needlessly - # slow down model.predict. - model.reset_metrics() - if mode == ModeKeys.TRAIN: - callbacks.on_epoch_begin(epoch, epoch_logs) - - if use_steps: - # Step-wise loop. - if steps_per_epoch is None: - # Loop over dataset until `OutOfRangeError` is raised. - target_steps = np.inf - else: - # Loop over dataset for the specified number of steps. - target_steps = steps_per_epoch - - step = 0 - while step < target_steps: - batch_logs = {'batch': step, 'size': 1} - callbacks._call_batch_hook(mode, 'begin', step, batch_logs) - - # Get outputs. - try: - # `ins` can be callable in tf.distribute.Strategy + eager case. - if not callable(ins) or (model._distribution_strategy and - not distributed_training_utils_v1 - .is_distributing_by_cloning(model)): - actual_inputs = ins - else: - actual_inputs = ins() - batch_outs = f(actual_inputs) - except tf.errors.OutOfRangeError: - if is_dataset: - # The dataset passed by the user ran out of batches. - # Now we know the cardinality of the dataset. - # If steps_per_epoch was specified, then running out of data is - # unexpected, so we stop training and inform the user. - if steps_per_epoch: - callbacks.model.stop_training = True - logging.warning( - 'Your dataset ran out of data; interrupting training. ' - 'Make sure that your dataset can generate at least ' - '`%s * epochs` batches (in this case, %d batches). ' - 'You may need to use the repeat() function when ' - 'building your dataset.' - % (steps_name, steps_per_epoch * epochs)) - elif step > 0: - steps_per_epoch = step - aggregator.steps = steps_per_epoch - else: - # We ran out of batches while the user passed an iterator (legacy). - callbacks.model.stop_training = True - logging.warning( - 'Your dataset iterator ran out of data; ' - 'interrupting training. Make sure that your iterator ' - 'can generate at least `%s * epochs` ' - 'batches (in this case, %d batches). You may need to' - 'use the repeat() function when building your ' - 'dataset.' 
% (steps_name, steps_per_epoch * epochs)) - break - - if not isinstance(batch_outs, list): - batch_outs = [batch_outs] - - if model._distribution_strategy: - batch_outs = ( - distributed_training_utils_v1._per_replica_aggregate_batch( - model._distribution_strategy, batch_outs, model, mode)) - - # Aggregate results. - if step == 0: - aggregator.create(batch_outs) - aggregator.aggregate(batch_outs) - - # Callbacks batch end. - batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) - callbacks._call_batch_hook(mode, 'end', step, batch_logs) - step += 1 - - if callbacks.model.stop_training: - break + issparse = None + + +def model_iteration( + model, + inputs, + targets=None, + sample_weights=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + val_inputs=None, + val_targets=None, + val_sample_weights=None, + shuffle=True, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + mode=ModeKeys.TRAIN, + validation_in_fit=False, + prepared_feed_values_from_dataset=False, + steps_name="steps", + **kwargs, +): + """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. + + Args: + model: Keras Model instance. + inputs: Either a list or dictionary of arrays, or a dataset instance. + targets: List/dictionary of input arrays. + sample_weights: Optional list of sample weight arrays. + batch_size: Integer batch size or None if unknown. + epochs: Number of times to iterate over the data + verbose: 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = one line per epoch. + Note that the progress bar is not particularly useful when + logged to a file, so verbose=2 is recommended when not running + interactively (eg, in a production environment). + callbacks: List of callbacks to be called during training + val_inputs: Either a list or dictionary of arrays, or a dataset + instance. + val_targets: List/dictionary of target arrays. + val_sample_weights: Optional list of sample weight arrays. + shuffle: Whether to shuffle the data at the beginning of each epoch + concatenation of list the display names of the outputs of `f` and the + list of display names of the outputs of `f_val`. + initial_epoch: Epoch at which to start training (useful for resuming a + previous training run) + steps_per_epoch: Total number of steps (batches of samples) before + declaring one epoch finished and starting the next epoch. Ignored with + the default value of `None`. + validation_steps: Number of steps to run validation for (only if doing + validation from data tensors). Ignored with the default value of + `None`. + validation_freq: Only relevant if validation data is provided. Integer + or `collections.abc.Container` instance (e.g. list, tuple, etc.). If + an integer, specifies how many training epochs to run before a new + validation run is performed, e.g. `validation_freq=2` runs validation + every 2 epochs. If a Container, specifies the epochs on which to run + validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the + end of the 1st, 2nd, and 10th epochs. + mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. + validation_in_fit: if true, then this method is invoked from within + training iteration (for validation). In the case where `val_inputs` is + a dataset, this flag indicates that its iterator and feed values are + already created so should properly reuse resources. 
+ prepared_feed_values_from_dataset: if True, `inputs` is a list of feed
+ tensors returned from a `_prepare_feed_values` call on the validation
+ dataset, so do not call it again on `inputs`. Should only be used for
+ inline validation (i.e., only if `validation_in_fit` is also True).
+ steps_name: The string name of the steps argument, either `steps`,
+ `validation_steps`, or `steps_per_epoch`. Only used for error message
+ formatting.
+ **kwargs: Additional arguments for backwards compatibility.
+
+ Returns:
+ - In TRAIN mode: `History` object.
+ - In TEST mode: Evaluation metrics.
+ - In PREDICT mode: Outputs of the Model called on inputs.
+
+ Raises:
+ ValueError: in case of invalid arguments.
+ """
+ # Backwards compatibility.
+ if "steps" in kwargs:
+ steps_per_epoch = kwargs.pop("steps")
+ if kwargs:
+ raise TypeError(f"Unknown arguments: {kwargs}")
+
+ # In case we were passed a dataset, we extract symbolic tensors from it.
+ reset_dataset_after_each_epoch = False
+ input_iterator = None
+ is_dataset = isinstance(
+ inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset)
+ )
+ # TODO(fchollet): consider moving `steps_per_epoch` inference to
+ # _standardize_user_data and set reset_dataset_after_each_epoch as an
+ # attribute on the dataset instance.
+ if is_dataset:
+ if steps_per_epoch is None:
+ reset_dataset_after_each_epoch = True
+ steps_per_epoch = training_utils_v1.infer_steps_for_dataset(
+ model,
+ inputs,
+ steps_per_epoch,
+ epochs=epochs,
+ steps_name=steps_name,
+ )
+ input_iterator = _get_iterator(inputs, model._distribution_strategy)
+
+ # Enter tf.distribute.Strategy scope.
+ if model._distribution_strategy:
+ scope = distributed_training_utils_v1.distributed_scope(
+ strategy=model._distribution_strategy,
+ learning_phase=(1 if mode == ModeKeys.TRAIN else 0),
+ )
+ scope.__enter__()
+
+ use_steps = is_dataset or steps_per_epoch is not None
+ do_validation = val_inputs is not None
+
+ # Prepare input data.
+ inputs = input_iterator or inputs
+ if validation_in_fit and prepared_feed_values_from_dataset:
+ # When invoking validation in the training loop, avoid creating the
+ # iterator and the list of feed values for the same validation
+ # dataset multiple times (which would essentially call
+ # `iterator.get_next()`, slowing down execution and eventually
+ # leading to OOM errors).
+ ins = inputs
 else:
- # Sample-wise loop.
- index_array = np.arange(num_samples_or_steps)
- if shuffle == 'batch':
- index_array = training_utils_v1.batch_shuffle(index_array, batch_size)
- elif shuffle:
- np.random.shuffle(index_array)
- batches = make_batches(num_samples_or_steps, batch_size)
- for batch_index, (batch_start, batch_end) in enumerate(batches):
- batch_ids = index_array[batch_start:batch_end]
- # Slice into a batch.
- if len(batches) == 1:
- # If we only have one batch, do not slice. This takes care of
- # composite tensors in non-Dataset modes; we currently don't support
- # slicing them.
- # TODO(b/133517906): Add slicing support.
- ins_batch = ins
- else:
- try:
- if ins and isinstance(ins[-1], int):
- # Do not slice the training phase flag.
- ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
- else:
- ins_batch = slice_arrays(ins, batch_ids)
- except TypeError:
- raise TypeError('TypeError while preparing batch. '
- 'If using HDF5 input data, '
- 'pass shuffle="batch".')
-
- # Sparse to dense conversion.
- if issparse is not None:
- for i in indices_for_conversion_to_dense:
- ins_batch[i] = ins_batch[i].toarray()
-
- # Callbacks batch_begin.
- batch_logs = {'batch': batch_index, 'size': len(batch_ids)} - callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs) - - # Get outputs. - batch_outs = f(ins_batch) - if not isinstance(batch_outs, list): - batch_outs = [batch_outs] - - # Aggregate results. - if batch_index == 0: - aggregator.create(batch_outs) - aggregator.aggregate(batch_outs, batch_start, batch_end) - - # Callbacks batch end. - batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) - callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs) - - if callbacks.model.stop_training: - break - - aggregator.finalize() - results = aggregator.results - epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) - if len(results) == 1: - results = results[0] - - # Run the test loop every `validation_freq` epochs during training. - if (do_validation and - training_utils_v1.should_run_validation(validation_freq, epoch) and - not callbacks.model.stop_training): - - if model._compile_distribution: - # Since we create a new clone from the original model we need to copy - # the weights back to the original model before we can run validation. - distributed_training_utils_v1._copy_weights_to_original_model( - model, ModeKeys.TRAIN) - - val_results = model_iteration( - model, - val_inputs, - targets=val_targets, - sample_weights=val_sample_weights, - batch_size=batch_size, - steps_per_epoch=validation_steps, - callbacks=callbacks, - verbose=0, - mode=ModeKeys.TEST, - validation_in_fit=True, - prepared_feed_values_from_dataset=(val_iterator is not None), - steps_name='validation_steps') - if not isinstance(val_results, list): - val_results = [val_results] - epoch_logs = cbks.make_logs( - model, epoch_logs, val_results, mode, prefix='val_') - if val_iterator and epoch < epochs - 1: - _reinitialize_iterator(val_iterator, model._distribution_strategy) + ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode) + # `ins` is a function when a distribute strategy is used in Eager mode. + # In that case `is_dataset` is True. The code branches that have + # requirements about the type of `ins` do not trigger in the distributed + # case. + + if not is_dataset: + num_samples_or_steps = _get_num_samples_or_steps( + ins, batch_size, steps_per_epoch + ) + else: + num_samples_or_steps = steps_per_epoch + + # Update sample_weight_mode of the model if sample_weights is specified by + # the user. We need to call this function after we have a handle on the + # inputs (both numpy arrays and datasets) in order to determine if the user + # has specified sample_weights. + _update_sample_weight_mode(model, mode, ins) + + # Get step function and loop type. As part of building the execution + # function we recompile the metrics based on the updated + # sample_weight_mode value. + f = _make_execution_function(model, mode) + + # Prepare validation data. Hold references to the iterator and the input + # list to properly reinitialize and reuse in multiple validation passes. + val_iterator = None + if isinstance(val_inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset)): + if validation_steps is None: + # Because we pass an iterator feed instead of a Dataset to the eval + # model_iteration() call, it will not trigger the dataset-input path + # that determines the number of steps required. To avoid this issue, + # set validation_steps here if validation_steps is None. 
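The inference referenced in the comment above boils down to asking the dataset for its cardinality. A rough standalone sketch of that idea, assuming eager mode and only public `tf.data` APIs (the real `training_utils_v1.infer_steps_for_dataset` helper additionally handles distribution strategies and error reporting):

import tensorflow as tf

def infer_steps(dataset):
    """Return the number of batches in `dataset`, or None if not knowable."""
    cardinality = tf.data.experimental.cardinality(dataset)
    if cardinality == tf.data.experimental.UNKNOWN_CARDINALITY:
        return None  # e.g. after filter(): loop until OutOfRangeError.
    if cardinality == tf.data.experimental.INFINITE_CARDINALITY:
        return None  # e.g. after repeat(): an explicit steps value is needed.
    return int(cardinality.numpy())

ds = tf.data.Dataset.from_tensor_slices(list(range(100))).batch(10)
assert infer_steps(ds) == 10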
+ validation_steps = training_utils_v1.infer_steps_for_dataset( + model, + val_inputs, + validation_steps, + epochs=epochs, + steps_name="validation_steps", + ) + val_iterator = _get_iterator(val_inputs, model._distribution_strategy) + val_inputs = _prepare_feed_values( + model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST + ) + # Get num steps for printing. + val_samples_or_steps = validation_steps + else: + # Get num samples for printing. + val_samples_or_steps = ( + val_inputs and tf.nest.flatten(val_inputs)[0].shape[0] or None + ) + + if mode == ModeKeys.TRAIN and verbose: + _print_train_info( + num_samples_or_steps, val_samples_or_steps, is_dataset + ) + + # Configure callbacks. + count_mode = "steps" if use_steps else "samples" + callbacks = cbks.configure_callbacks( + callbacks, + model, + do_validation=do_validation, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + samples=num_samples_or_steps, + count_mode=count_mode, + verbose=verbose, + mode=mode, + ) + + # Find beforehand arrays that need sparse-to-dense conversion. + if issparse is not None and not use_steps: + indices_for_conversion_to_dense = [] + feed = _get_model_feed(model, mode) + for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)): + if issparse(input_data) and not backend.is_sparse(feed_tensor): + indices_for_conversion_to_dense.append(i) + + # Select aggregation method. + if mode == ModeKeys.PREDICT: + aggregator = training_utils_v1.OutputsAggregator( + use_steps, + num_samples=None if steps_per_epoch else num_samples_or_steps, + steps=steps_per_epoch, + ) + else: + aggregator = training_utils_v1.MetricsAggregator( + use_steps, + num_samples=None if steps_per_epoch else num_samples_or_steps, + steps=steps_per_epoch, + ) - if mode == ModeKeys.TRAIN: - # Epochs only apply to `fit`. - callbacks.on_epoch_end(epoch, epoch_logs) + if model._compile_distribution: + distributed_training_utils_v1._copy_weights_to_distributed_model( + model, mode + ) - # Reinitialize dataset iterator for the next epoch. - if reset_dataset_after_each_epoch and epoch < epochs - 1: - _reinitialize_iterator(input_iterator, model._distribution_strategy) + callbacks.model.stop_training = False + callbacks._call_begin_hook(mode) - model._successful_loop_finish = True - callbacks._call_end_hook(mode) + initial_epoch = model._maybe_load_initial_epoch_from_ckpt( + initial_epoch, mode + ) - if model._distribution_strategy: - if model._compile_distribution: - # TODO(priyag, psv): Copy back metrics to the original model as well? - distributed_training_utils_v1._copy_weights_to_original_model(model, mode) - scope.__exit__(None, None, None) + for epoch in range(initial_epoch, epochs): + if callbacks.model.stop_training: + break + + # Setup work for each epoch + epoch_logs = {} + if mode != ModeKeys.PREDICT: + # Collecting and resetting metrics has non-zero cost and will + # needlessly slow down model.predict. + model.reset_metrics() + if mode == ModeKeys.TRAIN: + callbacks.on_epoch_begin(epoch, epoch_logs) + + if use_steps: + # Step-wise loop. + if steps_per_epoch is None: + # Loop over dataset until `OutOfRangeError` is raised. + target_steps = np.inf + else: + # Loop over dataset for the specified number of steps. + target_steps = steps_per_epoch + + step = 0 + while step < target_steps: + batch_logs = {"batch": step, "size": 1} + callbacks._call_batch_hook(mode, "begin", step, batch_logs) + + # Get outputs. + try: + # `ins` can be callable in tf.distribute.Strategy + eager + # case. 
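To make the callable-`ins` comment above concrete: under a distribution strategy in eager mode, the prepared feed values arrive as a zero-argument callable so that fresh per-replica values can be fetched on every step; otherwise `ins` is a concrete list. A minimal sketch of that contract with stand-in names (the real branch below also special-cases cloning-based distribution):

def run_step(step_fn, ins):
    # Resolve the feed values: invoke the provider if one was given,
    # otherwise use the concrete list as-is.
    actual_inputs = ins() if callable(ins) else ins
    return step_fn(actual_inputs)

assert run_step(sum, [1, 2, 3]) == 6        # concrete feed values
assert run_step(sum, lambda: [4, 5]) == 9   # callable provider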
+ if not callable(ins) or (
+ model._distribution_strategy
+ and not distributed_training_utils_v1.is_distributing_by_cloning( # noqa: E501
+ model
+ )
+ ):
+ actual_inputs = ins
+ else:
+ actual_inputs = ins()
+ batch_outs = f(actual_inputs)
+ except tf.errors.OutOfRangeError:
+ if is_dataset:
+ # The dataset passed by the user ran out of batches.
+ # Now we know the cardinality of the dataset. If
+ # steps_per_epoch was specified, then running out of
+ # data is unexpected, so we stop training and inform the
+ # user.
+ if steps_per_epoch:
+ callbacks.model.stop_training = True
+ logging.warning(
+ "Your dataset ran out of data; interrupting "
+ "training. Make sure that your dataset can "
+ "generate at least `%s * epochs` batches (in "
+ "this case, %d batches). You may need to use "
+ "the repeat() function when building your "
+ "dataset."
+ % (steps_name, steps_per_epoch * epochs)
+ )
+ elif step > 0:
+ steps_per_epoch = step
+ aggregator.steps = steps_per_epoch
+ else:
+ # We ran out of batches while the user passed an
+ # iterator (legacy).
+ callbacks.model.stop_training = True
+ logging.warning(
+ "Your dataset iterator ran out of data; "
+ "interrupting training. Make sure that your "
+ "iterator can generate at least `%s * epochs` "
+ "batches (in this case, %d batches). You may need "
+ "to use the repeat() function when building your "
+ "dataset." % (steps_name, steps_per_epoch * epochs)
+ )
+ break
+
+ if not isinstance(batch_outs, list):
+ batch_outs = [batch_outs]
+
+ if model._distribution_strategy:
+ batch_outs = distributed_training_utils_v1._per_replica_aggregate_batch( # noqa: E501
+ model._distribution_strategy, batch_outs, model, mode
+ )
+
+ # Aggregate results.
+ if step == 0:
+ aggregator.create(batch_outs)
+ aggregator.aggregate(batch_outs)
+
+ # Callbacks batch end.
+ batch_logs = cbks.make_logs(
+ model, batch_logs, batch_outs, mode
+ )
+ callbacks._call_batch_hook(mode, "end", step, batch_logs)
+ step += 1
+
+ if callbacks.model.stop_training:
+ break
+ else:
+ # Sample-wise loop.
+ index_array = np.arange(num_samples_or_steps)
+ if shuffle == "batch":
+ index_array = training_utils_v1.batch_shuffle(
+ index_array, batch_size
+ )
+ elif shuffle:
+ np.random.shuffle(index_array)
+ batches = make_batches(num_samples_or_steps, batch_size)
+ for batch_index, (batch_start, batch_end) in enumerate(batches):
+ batch_ids = index_array[batch_start:batch_end]
+ # Slice into a batch.
+ if len(batches) == 1:
+ # If we only have one batch, do not slice. This takes care
+ # of composite tensors in non-Dataset modes; we currently
+ # don't support slicing them.
+ # TODO(b/133517906): Add slicing support.
+ ins_batch = ins
+ else:
+ try:
+ if ins and isinstance(ins[-1], int):
+ # Do not slice the training phase flag.
+ ins_batch = slice_arrays(ins[:-1], batch_ids) + [
+ ins[-1]
+ ]
+ else:
+ ins_batch = slice_arrays(ins, batch_ids)
+ except TypeError:
+ raise TypeError(
+ "TypeError while preparing batch. "
+ "If using HDF5 input data, "
+ 'pass shuffle="batch".'
+ )
+
+ # Sparse to dense conversion.
+ if issparse is not None:
+ for i in indices_for_conversion_to_dense:
+ ins_batch[i] = ins_batch[i].toarray()
+
+ # Callbacks batch_begin.
+ batch_logs = {"batch": batch_index, "size": len(batch_ids)}
+ callbacks._call_batch_hook(
+ mode, "begin", batch_index, batch_logs
+ )
+
+ # Get outputs.
+ batch_outs = f(ins_batch)
+ if not isinstance(batch_outs, list):
+ batch_outs = [batch_outs]
+
+ # Aggregate results.
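The aggregation here follows a small create/aggregate/finalize protocol: `create` allocates result slots from the structure of the first batch, `aggregate` folds in each batch, and `finalize` produces the final values. A toy mean-style aggregator sketching that protocol (hypothetical class; the real `OutputsAggregator`/`MetricsAggregator` also handle stateful metrics, sample counts, and step mode):

class ToyMeanAggregator:
    """Averages each output position across batches."""

    def create(self, batch_outs):
        # Allocate one accumulator slot per output.
        self.results = [0.0] * len(batch_outs)
        self.num_batches = 0

    def aggregate(self, batch_outs):
        for i, out in enumerate(batch_outs):
            self.results[i] += out
        self.num_batches += 1

    def finalize(self):
        self.results = [r / self.num_batches for r in self.results]

agg = ToyMeanAggregator()
for step, batch_outs in enumerate([[1.0, 10.0], [3.0, 30.0]]):
    if step == 0:
        agg.create(batch_outs)
    agg.aggregate(batch_outs)
agg.finalize()
assert agg.results == [2.0, 20.0]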
+ if batch_index == 0:
+ aggregator.create(batch_outs)
+ aggregator.aggregate(batch_outs, batch_start, batch_end)
+
+ # Callbacks batch end.
+ batch_logs = cbks.make_logs(
+ model, batch_logs, batch_outs, mode
+ )
+ callbacks._call_batch_hook(mode, "end", batch_index, batch_logs)
+
+ if callbacks.model.stop_training:
+ break
+
+ aggregator.finalize()
+ results = aggregator.results
+ epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
+ if len(results) == 1:
+ results = results[0]
+
+ # Run the test loop every `validation_freq` epochs during training.
+ if (
+ do_validation
+ and training_utils_v1.should_run_validation(validation_freq, epoch)
+ and not callbacks.model.stop_training
+ ):
+
+ if model._compile_distribution:
+ # Since we create a new clone from the original model we need to
+ # copy the weights back to the original model before we can run
+ # validation.
+ distributed_training_utils_v1._copy_weights_to_original_model(
+ model, ModeKeys.TRAIN
+ )
+
+ val_results = model_iteration(
+ model,
+ val_inputs,
+ targets=val_targets,
+ sample_weights=val_sample_weights,
+ batch_size=batch_size,
+ steps_per_epoch=validation_steps,
+ callbacks=callbacks,
+ verbose=0,
+ mode=ModeKeys.TEST,
+ validation_in_fit=True,
+ prepared_feed_values_from_dataset=(val_iterator is not None),
+ steps_name="validation_steps",
+ )
+ if not isinstance(val_results, list):
+ val_results = [val_results]
+ epoch_logs = cbks.make_logs(
+ model, epoch_logs, val_results, mode, prefix="val_"
+ )
+ if val_iterator and epoch < epochs - 1:
+ _reinitialize_iterator(
+ val_iterator, model._distribution_strategy
+ )
+
+ if mode == ModeKeys.TRAIN:
+ # Epochs only apply to `fit`.
+ callbacks.on_epoch_end(epoch, epoch_logs)
+
+ # Reinitialize dataset iterator for the next epoch.
+ if reset_dataset_after_each_epoch and epoch < epochs - 1:
+ _reinitialize_iterator(input_iterator, model._distribution_strategy)
+
+ model._successful_loop_finish = True
+ callbacks._call_end_hook(mode)
+
+ if model._distribution_strategy:
+ if model._compile_distribution:
+ # TODO(priyag, psv): Copy back metrics to the original model as
+ # well?
+ distributed_training_utils_v1._copy_weights_to_original_model( + model, mode + ) + scope.__exit__(None, None, None) - if mode == ModeKeys.TRAIN: - return model.history - return results + if mode == ModeKeys.TRAIN: + return model.history + return results def _get_model_feed(model, mode): - if mode == ModeKeys.PREDICT: - feed = model._feed_inputs - else: - feed = ( - model._feed_inputs + model._feed_targets + model._feed_sample_weights) - return feed + if mode == ModeKeys.PREDICT: + feed = model._feed_inputs + else: + feed = ( + model._feed_inputs + + model._feed_targets + + model._feed_sample_weights + ) + return feed def _print_train_info(num_samples_or_steps, val_samples_or_steps, is_dataset): - increment = 'steps' if is_dataset else 'samples' - msg = 'Train on {0} {increment}'.format( - num_samples_or_steps, increment=increment) - if val_samples_or_steps: - msg += ', validate on {0} {increment}'.format( - val_samples_or_steps, increment=increment) - io_utils.print_msg(msg) + increment = "steps" if is_dataset else "samples" + msg = f"Train on {num_samples_or_steps} {increment}" + if val_samples_or_steps: + msg += f", validate on {val_samples_or_steps} {increment}" + io_utils.print_msg(msg) def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch): - """Returns total number of samples (when training in batch mode) or steps.""" - if steps_per_epoch: - return steps_per_epoch - return training_utils_v1.check_num_samples(ins, batch_size, steps_per_epoch, - 'steps_per_epoch') + """Returns total number of samples when training in batch mode or steps.""" + if steps_per_epoch: + return steps_per_epoch + return training_utils_v1.check_num_samples( + ins, batch_size, steps_per_epoch, "steps_per_epoch" + ) def _prepare_feed_values(model, inputs, targets, sample_weights, mode): - """Prepare feed values to the model execution function. - - Args: - model: Model to prepare feed values for. - inputs: List or dict of model inputs. - targets: Optional list of model targets. - sample_weights: Optional list of sample weight arrays. - mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. - - Returns: - Feed values for the model in the given mode. - """ - if model._distribution_strategy: - if isinstance(inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset)): - inputs = distributed_training_utils_v1.get_iterator( - inputs, model._distribution_strategy) - - def get_distributed_inputs(): - return distributed_training_utils_v1._prepare_feed_values( - model, inputs, targets, sample_weights, mode) - - # In the eager case, we want to call the input method per step, so return - # a lambda from here that can be called. Note that this is applicable only - # in Distribution Strategy case as it follows the same code path for both - # eager and graph modes. - # TODO(priyag,omalleyt): Either we should move the training DS with - # IteratorBase to use training_generator code path, or figure out how to - # set a symbolic Iterator out of a Dataset when in eager mode. - if tf.executing_eagerly(): - return get_distributed_inputs - else: - return get_distributed_inputs() + """Prepare feed values to the model execution function. - if isinstance(inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset, - tf.compat.v1.data.Iterator)): - inputs, targets, sample_weights = model._standardize_user_data( - inputs, - extract_tensors_from_dataset=True) + Args: + model: Model to prepare feed values for. + inputs: List or dict of model inputs. + targets: Optional list of model targets. 
+ sample_weights: Optional list of sample weight arrays. + mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. + + Returns: + Feed values for the model in the given mode. + """ + if model._distribution_strategy: + if isinstance(inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset)): + inputs = distributed_training_utils_v1.get_iterator( + inputs, model._distribution_strategy + ) + + def get_distributed_inputs(): + return distributed_training_utils_v1._prepare_feed_values( + model, inputs, targets, sample_weights, mode + ) + + # In the eager case, we want to call the input method per step, so + # return a lambda from here that can be called. Note that this is + # applicable only in Distribution Strategy case as it follows the same + # code path for both eager and graph modes. + # TODO(priyag,omalleyt): Either we should move the training DS with + # IteratorBase to use training_generator code path, or figure out how to + # set a symbolic Iterator out of a Dataset when in eager mode. + if tf.executing_eagerly(): + return get_distributed_inputs + else: + return get_distributed_inputs() - inputs = training_utils_v1.ModelInputs(inputs).as_list() - targets = list(targets or []) - sample_weights = list(sample_weights or []) - ins = inputs + targets + sample_weights - if mode == ModeKeys.TRAIN and not isinstance( - backend.symbolic_learning_phase(), int): - ins += [True] # Add learning phase value. - return ins + if isinstance( + inputs, + ( + tf.compat.v1.data.Dataset, + tf.data.Dataset, + tf.compat.v1.data.Iterator, + ), + ): + inputs, targets, sample_weights = model._standardize_user_data( + inputs, extract_tensors_from_dataset=True + ) + + inputs = training_utils_v1.ModelInputs(inputs).as_list() + targets = list(targets or []) + sample_weights = list(sample_weights or []) + ins = inputs + targets + sample_weights + if mode == ModeKeys.TRAIN and not isinstance( + backend.symbolic_learning_phase(), int + ): + ins += [True] # Add learning phase value. + return ins def _get_iterator(inputs, distribution_strategy=None): - if distribution_strategy: - return distributed_training_utils_v1.get_iterator( - inputs, distribution_strategy) - return training_utils_v1.get_iterator(inputs) + if distribution_strategy: + return distributed_training_utils_v1.get_iterator( + inputs, distribution_strategy + ) + return training_utils_v1.get_iterator(inputs) def _reinitialize_iterator(iterator, distribution_strategy=None): - if distribution_strategy: - distributed_training_utils_v1.initialize_iterator( - iterator, distribution_strategy) - else: - training_utils_v1.initialize_iterator(iterator) + if distribution_strategy: + distributed_training_utils_v1.initialize_iterator( + iterator, distribution_strategy + ) + else: + training_utils_v1.initialize_iterator(iterator) def _make_execution_function(model, mode): - """Makes function to run one step of model execution.""" - if model._distribution_strategy: - return distributed_training_utils_v1._make_execution_function(model, mode) - return model._make_execution_function(mode) + """Makes function to run one step of model execution.""" + if model._distribution_strategy: + return distributed_training_utils_v1._make_execution_function( + model, mode + ) + return model._make_execution_function(mode) def _update_sample_weight_mode(model, mode, inputs): - """Updates the sample_weight_mode of a given model.""" - # Add a quick return to prevent us from calling model._feed_targets that - # accesses certain model properties that may not be set in the `PREDICT` mode. 
- if mode == ModeKeys.PREDICT: - return - - sample_weights = None - # `inputs` is the model's inputs + targets + sample_weights + - # learning phase placeholder if specified. To update the sample_weight_mode - # we need to determine if the user has passed sample weights as part of the - # input. - if not callable(inputs): - sample_weights = inputs[len(model._feed_inputs) + len(model._feed_targets):] - has_learning_phase_pl = (mode == ModeKeys.TRAIN and - not isinstance(backend.symbolic_learning_phase(), - int)) - if has_learning_phase_pl: - sample_weights = sample_weights[:-1] - model._update_sample_weight_modes(sample_weights=sample_weights) - - # Call the DistributionStrategy specific function to update the - # sample_weight_mode on the model. - if model._distribution_strategy: - distributed_training_utils_v1._update_sample_weight_modes(model, mode, - sample_weights) + """Updates the sample_weight_mode of a given model.""" + # Add a quick return to prevent us from calling model._feed_targets that + # accesses certain model properties that may not be set in the `PREDICT` + # mode. + if mode == ModeKeys.PREDICT: + return + + sample_weights = None + # `inputs` is the model's inputs + targets + sample_weights + + # learning phase placeholder if specified. To update the sample_weight_mode + # we need to determine if the user has passed sample weights as part of the + # input. + if not callable(inputs): + sample_weights = inputs[ + len(model._feed_inputs) + len(model._feed_targets) : + ] + has_learning_phase_pl = mode == ModeKeys.TRAIN and not isinstance( + backend.symbolic_learning_phase(), int + ) + if has_learning_phase_pl: + sample_weights = sample_weights[:-1] + model._update_sample_weight_modes(sample_weights=sample_weights) + + # Call the DistributionStrategy specific function to update the + # sample_weight_mode on the model. + if model._distribution_strategy: + distributed_training_utils_v1._update_sample_weight_modes( + model, mode, sample_weights + ) + # For backwards compatibility for internal users of these loops. fit_loop = functools.partial(model_iteration, mode=ModeKeys.TRAIN) test_loop = functools.partial( - model_iteration, mode=ModeKeys.TEST, shuffle=False) + model_iteration, mode=ModeKeys.TEST, shuffle=False +) predict_loop = functools.partial( - model_iteration, mode=ModeKeys.PREDICT, shuffle=False) + model_iteration, mode=ModeKeys.PREDICT, shuffle=False +) class ArrayLikeTrainingLoop(training_utils_v1.TrainingLoop): - """TrainingLoop that handle inputs like array. - - This is the default handler for most of the input data types, includes - symbolic tensors or Numpy array-like, Datasets and iterators in graph mode - (since they generate symbolic tensors). This Function is used to handle model - with `run_eagerly` = False. 
- """ - - def fit(self, - model, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - **kwargs): - batch_size = model._validate_or_infer_batch_size(batch_size, - steps_per_epoch, x) - - x, y, sample_weights = model._standardize_user_data( - x, - y, - sample_weight=sample_weight, - class_weight=class_weight, - batch_size=batch_size, - check_steps=True, - steps_name='steps_per_epoch', - steps=steps_per_epoch, - validation_split=validation_split, - shuffle=shuffle) - - if validation_data: - val_x, val_y, val_sample_weights = model._prepare_validation_data( - validation_data, batch_size, validation_steps) - elif validation_split and 0. < validation_split < 1.: - (x, y, sample_weights, val_x, val_y, val_sample_weights - ) = training_utils_v1.split_training_and_validation_data( - x, y, sample_weights, validation_split) - else: - if validation_steps: - raise ValueError('`validation_steps` should not be specified if ' - '`validation_data` is None.') - val_x, val_y, val_sample_weights = None, None, None + """TrainingLoop that handle inputs like array. - return fit_loop( + This is the default handler for most of the input data types, includes + symbolic tensors or Numpy array-like, Datasets and iterators in graph mode + (since they generate symbolic tensors). This Function is used to handle + model with `run_eagerly` = False. + """ + + def fit( + self, model, - inputs=x, - targets=y, - sample_weights=sample_weights, - batch_size=batch_size, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - val_inputs=val_x, - val_targets=val_y, - val_sample_weights=val_sample_weights, - shuffle=shuffle, - initial_epoch=initial_epoch, - steps_per_epoch=steps_per_epoch, - validation_steps=validation_steps, - validation_freq=validation_freq, - steps_name='steps_per_epoch') - - def evaluate(self, - model, - x=None, - y=None, - batch_size=None, - verbose=1, - sample_weight=None, - steps=None, - callbacks=None, - **kwargs): - batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) - x, y, sample_weights = model._standardize_user_data( - x, - y, - sample_weight=sample_weight, - batch_size=batch_size, - check_steps=True, - steps_name='steps', - steps=steps) - return test_loop( + x=None, + y=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + **kwargs, + ): + batch_size = model._validate_or_infer_batch_size( + batch_size, steps_per_epoch, x + ) + + x, y, sample_weights = model._standardize_user_data( + x, + y, + sample_weight=sample_weight, + class_weight=class_weight, + batch_size=batch_size, + check_steps=True, + steps_name="steps_per_epoch", + steps=steps_per_epoch, + validation_split=validation_split, + shuffle=shuffle, + ) + + if validation_data: + val_x, val_y, val_sample_weights = model._prepare_validation_data( + validation_data, batch_size, validation_steps + ) + elif validation_split and 0.0 < validation_split < 1.0: + ( + x, + y, + sample_weights, + val_x, + val_y, + val_sample_weights, + ) = training_utils_v1.split_training_and_validation_data( + x, y, sample_weights, validation_split + ) + else: + if validation_steps: + raise ValueError( + 
"`validation_steps` should not be specified if " + "`validation_data` is None." + ) + val_x, val_y, val_sample_weights = None, None, None + + return fit_loop( + model, + inputs=x, + targets=y, + sample_weights=sample_weights, + batch_size=batch_size, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + val_inputs=val_x, + val_targets=val_y, + val_sample_weights=val_sample_weights, + shuffle=shuffle, + initial_epoch=initial_epoch, + steps_per_epoch=steps_per_epoch, + validation_steps=validation_steps, + validation_freq=validation_freq, + steps_name="steps_per_epoch", + ) + + def evaluate( + self, model, - inputs=x, - targets=y, - sample_weights=sample_weights, - batch_size=batch_size, - verbose=verbose, - steps=steps, - callbacks=callbacks) - - def predict(self, - model, - x, - batch_size=None, - verbose=0, - steps=None, - callbacks=None, - **kwargs): - batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) - x, _, _ = model._standardize_user_data( - x, check_steps=True, steps_name='steps', steps=steps) - return predict_loop( + x=None, + y=None, + batch_size=None, + verbose=1, + sample_weight=None, + steps=None, + callbacks=None, + **kwargs, + ): + batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) + x, y, sample_weights = model._standardize_user_data( + x, + y, + sample_weight=sample_weight, + batch_size=batch_size, + check_steps=True, + steps_name="steps", + steps=steps, + ) + return test_loop( + model, + inputs=x, + targets=y, + sample_weights=sample_weights, + batch_size=batch_size, + verbose=verbose, + steps=steps, + callbacks=callbacks, + ) + + def predict( + self, model, x, - batch_size=batch_size, - verbose=verbose, - steps=steps, - callbacks=callbacks) + batch_size=None, + verbose=0, + steps=None, + callbacks=None, + **kwargs, + ): + batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) + x, _, _ = model._standardize_user_data( + x, check_steps=True, steps_name="steps", steps=steps + ) + return predict_loop( + model, + x, + batch_size=batch_size, + verbose=verbose, + steps=steps, + callbacks=callbacks, + ) diff --git a/keras/engine/training_dataset_test.py b/keras/engine/training_dataset_test.py index 55335d95699f..07d5d839c72f 100644 --- a/keras/engine/training_dataset_test.py +++ b/keras/engine/training_dataset_test.py @@ -14,558 +14,621 @@ # ============================================================================== """Tests for training routines.""" -import tensorflow.compat.v2 as tf - import io import sys import numpy as np +import tensorflow.compat.v2 as tf import keras from keras import callbacks -from keras.testing_infra import test_combinations from keras import metrics as metrics_module +from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import io_utils + +# isort: off from tensorflow.python.platform import tf_logging as logging class BatchCounterCallback(callbacks.Callback): + def __init__(self): + self.batch_begin_count = 0 + self.batch_end_count = 0 - def __init__(self): - self.batch_begin_count = 0 - self.batch_end_count = 0 - - def on_batch_begin(self, *args, **kwargs): - self.batch_begin_count += 1 + def on_batch_begin(self, *args, **kwargs): + self.batch_begin_count += 1 - def on_batch_end(self, *args, **kwargs): - self.batch_end_count += 1 + def on_batch_end(self, *args, **kwargs): + self.batch_end_count += 1 class TestTrainingWithDataset(test_combinations.TestCase): - - @test_combinations.run_with_all_model_types - 
@test_combinations.run_all_keras_modes - def test_calling_model_on_same_dataset(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = 'rmsprop' - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, - loss, - metrics=metrics, - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - # Call fit with validation data - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_data=dataset, - validation_steps=2) - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_data=dataset, - validation_steps=2) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_training_and_eval_methods_on_dataset(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = 'rmsprop' - loss = 'mse' - metrics = ['mae', metrics_module.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics, - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat() # Infinite dataset. - dataset = dataset.batch(10) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - model.evaluate(dataset, steps=2, verbose=1) - model.predict(dataset, steps=2) - - # Test with validation data - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_data=dataset, - validation_steps=2) - - # Test with validation split - with self.assertRaises(ValueError): - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - validation_split=0.5, - validation_steps=2) - - # Test with sample weight. - sample_weight = np.random.random((10,)) - with self.assertRaisesRegex( - ValueError, r'`sample_weight` argument is not supported .+dataset'): - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - sample_weight=sample_weight) - - with self.assertRaisesRegex( - ValueError, '(you should not specify a target)|' - '(`y` argument is not supported when using dataset as input.)'): - model.fit(dataset, dataset, epochs=1, steps_per_epoch=2, verbose=0) - - # With an infinite dataset, `steps_per_epoch`/`steps` argument is required. 
- with self.assertRaises(ValueError): - model.fit(dataset, epochs=1, verbose=0) - with self.assertRaises(ValueError): - model.evaluate(dataset, verbose=0) - with self.assertRaises(ValueError): - model.predict(dataset, verbose=0) - - @test_combinations.run_with_all_model_types(exclude_models='sequential') - @test_combinations.run_all_keras_modes - def test_training_and_eval_methods_on_multi_input_output_dataset(self): - input_a = keras.layers.Input(shape=(3,), name='input_1') - input_b = keras.layers.Input(shape=(3,), name='input_2') - dense = keras.layers.Dense(4, name='dense') - dropout = keras.layers.Dropout(0.5, name='dropout') - branch_a = [input_a, dense] - branch_b = [input_b, dense, dropout] - - model = test_utils.get_multi_io_model(branch_a, branch_b) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - input_a_np = np.random.random((10, 3)).astype(dtype=np.float32) - input_b_np = np.random.random((10, 3)).astype(dtype=np.float32) - output_d_np = np.random.random((10, 4)).astype(dtype=np.float32) - output_e_np = np.random.random((10, 4)).astype(dtype=np.float32) - - # Test with tuples - dataset_tuple = tf.data.Dataset.from_tensor_slices( - ((input_a_np, input_b_np), (output_d_np, output_e_np))) - dataset_tuple = dataset_tuple.repeat(100) - dataset_tuple = dataset_tuple.batch(10) - - model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1) - model.evaluate(dataset_tuple, steps=2, verbose=1) - - # Test with dict - input_dict = {'input_1': input_a_np, 'input_2': input_b_np} - if test_utils.get_model_type() == 'subclass': - output_dict = {'output_1': output_d_np, 'output_2': output_e_np} - else: - output_dict = {'dense': output_d_np, 'dropout': output_e_np} - - dataset_dict = tf.data.Dataset.from_tensor_slices( - (input_dict, output_dict)) - dataset_dict = dataset_dict.repeat(100) - dataset_dict = dataset_dict.batch(10) - - model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1) - model.evaluate(dataset_dict, steps=2, verbose=1) - - predict_dataset_dict = tf.data.Dataset.from_tensor_slices(input_dict) - predict_dataset_dict = predict_dataset_dict.repeat(100) - predict_dataset_dict = predict_dataset_dict.batch(10) - model.predict(predict_dataset_dict, steps=1) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_dataset_with_sample_weights(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = 'rmsprop' - loss = 'mse' - metrics = ['mae', metrics_module.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics, - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - sample_weights = np.ones((10), np.float32) - dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets, sample_weights)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - model.evaluate(dataset, steps=2, verbose=1) - model.predict(dataset, steps=2) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_dataset_with_sample_weights_correctness(self): - x = keras.layers.Input(shape=(1,), name='input') - y = keras.layers.Dense( - 1, kernel_initializer='ones', bias_initializer='zeros', name='dense')( - x) - model = keras.Model(x, y) - optimizer = 'rmsprop' - loss = 'mse' - model.compile(optimizer, loss) - inputs = np.array([[0], [1], [2], [3]], np.float32) - targets = 
np.array([[2], [4], [6], [8]], np.float32) - sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) - ds = tf.data.Dataset.from_tensor_slices( - (inputs, targets, sample_weights)).batch(2) - result = model.evaluate(ds, verbose=1) - # The per sample loss is multiplied by the corresponding sample weight. The - # average of these weighted losses is the return value of the `evaluate` - # call. For example, in the test above the average weighted loss is - # calculated in the following manner: - # ((2-0)^2) * 0.25 + ((4-1)^2) * 0.5 + ((6-2)^2 * 0.75) + ((8-3)^2 * 1) - # equals 42.5 / 4 = 10.625 - self.assertEqual(result, 10.625) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_dataset_with_sparse_labels(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = 'rmsprop' - model.compile( - optimizer, - loss='sparse_categorical_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((10, 3), dtype=np.float32) - targets = np.random.randint(0, 4, size=10, dtype=np.int32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - - @test_combinations.run_all_keras_modes - def test_dataset_fit_correctness(self): - - class SumLayer(keras.layers.Layer): - - def build(self, _): - self.w = self.add_weight('w', ()) - - def call(self, inputs): - return keras.backend.sum(inputs, axis=1, keepdims=True) + self.w * 0 - - model = keras.Sequential([SumLayer(input_shape=(2,))]) - model.compile( - 'rmsprop', loss='mae', run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((40, 2), dtype=np.float32) - inputs[10:20, :] = 2 - inputs[20:30, :] = 1 - inputs[30:, :] = 4 - targets = np.zeros((40, 1), dtype=np.float32) - - # Test correctness with `steps_per_epoch`. - train_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - val_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - history = model.fit( - train_dataset, - epochs=2, - steps_per_epoch=2, - verbose=1, - validation_data=val_dataset, - validation_steps=2) - self.assertAllClose(history.history['loss'], - [inputs[:20].sum() / 20, inputs[20:].sum() / 20]) - # The validation dataset will be reset at the end of each validation run. - self.assertAllClose(history.history['val_loss'], - [inputs[:20].sum() / 20, inputs[:20].sum() / 20]) - - # Test correctness with dataset reset. - train_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - val_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - history = model.fit( - train_dataset, epochs=2, verbose=1, validation_data=val_dataset) - self.assertAllClose( - history.history['loss'], - [inputs.sum() / 40, inputs.sum() / 40]) - self.assertAllClose( - history.history['val_loss'], - [inputs.sum() / 40, inputs.sum() / 40]) - - def test_dataset_input_shape_validation(self): - with tf.compat.v1.get_default_graph().as_default(), self.cached_session(): - model = test_utils.get_small_functional_mlp(1, 4, input_dim=3) - model.compile(optimizer='rmsprop', loss='mse') - - # User forgets to batch the dataset - inputs = np.zeros((10, 3)) - targets = np.zeros((10, 4)) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - - with self.assertRaisesRegex( - ValueError, - r'expected (.*?) 
to have shape \(3,\) but got array with shape \(1,\)' - ): - model.train_on_batch(dataset) - - # Wrong input shape - inputs = np.zeros((10, 5)) - targets = np.zeros((10, 4)) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - with self.assertRaisesRegex(ValueError, - r'expected (.*?) to have shape \(3,\)'): - model.train_on_batch(dataset) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_finite_dataset_known_cardinality_no_steps_arg(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.compile( - 'rmsprop', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((100, 3), dtype=np.float32) - targets = np.random.randint(0, 4, size=100, dtype=np.int32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.batch(10) - - batch_counter = BatchCounterCallback() - history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter]) - - self.assertLen(history.history['loss'], 2) - self.assertEqual(batch_counter.batch_end_count, 20) - model.evaluate(dataset) - out = model.predict(dataset) - self.assertEqual(out.shape[0], 100) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_finite_dataset_unknown_cardinality_no_steps_arg(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.compile( - 'rmsprop', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((100, 3), dtype=np.float32) - targets = np.random.randint(0, 4, size=100, dtype=np.int32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.filter(lambda x, y: True).batch(10) - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - - batch_counter = BatchCounterCallback() - history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter]) - - self.assertLen(history.history['loss'], 2) - self.assertEqual(batch_counter.batch_end_count, 20) - model.evaluate(dataset) - out = model.predict(dataset) - self.assertEqual(out.shape[0], 100) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_finite_dataset_unknown_cardinality_no_step_with_train_and_val(self): - - class CaptureStdout: - - def __enter__(self): - self._stdout = sys.stdout - string_io = io.StringIO() - sys.stdout = string_io - self._stringio = string_io - return self - - def __exit__(self, *args): - self.output = self._stringio.getvalue() - sys.stdout = self._stdout - - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.compile( - 'rmsprop', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((100, 3), dtype=np.float32) - targets = np.random.randint(0, 4, size=100, dtype=np.int32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.filter(lambda x, y: True).batch(10) - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - - batch_counter = BatchCounterCallback() - io_utils.enable_interactive_logging() - with CaptureStdout() as capture: - history = model.fit( - dataset, - epochs=2, - callbacks=[batch_counter], - validation_data=dataset.take(3)) - - lines = capture.output.splitlines() - - self.assertIn('10/10', lines[-1]) - - self.assertLen(history.history['loss'], 2) - 
self.assertEqual(batch_counter.batch_begin_count, 21) - self.assertEqual(batch_counter.batch_end_count, 20) - model.evaluate(dataset) - out = model.predict(dataset) - self.assertEqual(out.shape[0], 100) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_finite_dataset_unknown_cardinality_out_of_data(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.compile( - 'rmsprop', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((100, 3), dtype=np.float32) - targets = np.random.randint(0, 4, size=100, dtype=np.int32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.filter(lambda x, y: True).batch(10) - self.assertEqual( - keras.backend.get_value(tf.data.experimental.cardinality(dataset)), - tf.data.experimental.UNKNOWN_CARDINALITY) - - batch_counter = BatchCounterCallback() - with tf.compat.v1.test.mock.patch.object(logging, 'warning') as mock_log: - # steps_per_epoch (200) is greater than the dataset size (100). As this is - # unexpected, training will stop and not make it to the second epoch. - history = model.fit( - dataset, - epochs=2, - verbose=1, - callbacks=[batch_counter], - steps_per_epoch=200) - self.assertIn('ran out of data; interrupting training.', - str(mock_log.call_args)) - self.assertIn( - 'can generate at least ' - '`steps_per_epoch * epochs` batches (in this case, 400 batches). ' - 'You may need to use the repeat() function when ' - 'building your dataset.', str(mock_log.call_args)) - - self.assertLen(history.history['loss'], 1) - self.assertEqual(batch_counter.batch_end_count, 10) - model.evaluate(dataset) - out = model.predict(dataset) - self.assertEqual(out.shape[0], 100) - - @test_combinations.run_all_keras_modes - def test_with_external_loss(self): - inp = keras.Input(shape=(4,), name='inp1') - out = keras.layers.Dense(2)(inp) - model = keras.Model(inp, out) - model.add_loss(tf.reduce_mean(out)) - model.compile('rmsprop') - x = np.ones((10, 4)) - - # dataset contains only features, no labels. - dataset = tf.data.Dataset.from_tensor_slices(x).repeat(10).batch(10) - model.fit(dataset) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_train_eval_with_steps(self): - # See b/142880049 for more details. - inp = keras.Input(shape=(4,), name='inp1') - out = keras.layers.Dense(2)(inp) - model = keras.Model(inp, out) - model.compile( - 'rmsprop', loss='mse', run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((100, 4), dtype=np.float32) - targets = np.random.randint(0, 2, size=100, dtype=np.int32) - training_ds = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).repeat().batch(10) - - # Create eval dataset with generator, so that dataset won't contain the - # overall size metadata. Without eval_steps, we expect to run through all - # the data in this dataset every epoch. - def gen(): - for _ in range(100): - yield (np.zeros(4, dtype=np.float32), - np.random.randint(0, 2, size=1, dtype=np.int32)) - - eval_ds = tf.data.Dataset.from_generator( - generator=gen, - output_types=('float64', 'int32'), - output_shapes=([4], [1])).batch(100) - batch_counter = BatchCounterCallback() - - model.fit( - training_ds, - steps_per_epoch=10, - epochs=10, - validation_data=eval_ds, - callbacks=[batch_counter]) - - # Expect 10 batch from training per epoch. 
- self.assertEqual(batch_counter.batch_end_count, 100) + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_calling_model_on_same_dataset(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + optimizer = "rmsprop" + loss = "mse" + metrics = ["mae"] + model.compile( + optimizer, + loss, + metrics=metrics, + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.zeros((10, 3), np.float32) + targets = np.zeros((10, 4), np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + # Call fit with validation data + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=dataset, + validation_steps=2, + ) + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=dataset, + validation_steps=2, + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_training_and_eval_methods_on_dataset(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + optimizer = "rmsprop" + loss = "mse" + metrics = ["mae", metrics_module.CategoricalAccuracy()] + model.compile( + optimizer, + loss, + metrics=metrics, + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.zeros((10, 3), np.float32) + targets = np.zeros((10, 4), np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat() # Infinite dataset. + dataset = dataset.batch(10) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + model.evaluate(dataset, steps=2, verbose=1) + model.predict(dataset, steps=2) + + # Test with validation data + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=dataset, + validation_steps=2, + ) + + # Test with validation split + with self.assertRaises(ValueError): + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_split=0.5, + validation_steps=2, + ) + + # Test with sample weight. + sample_weight = np.random.random((10,)) + with self.assertRaisesRegex( + ValueError, r"`sample_weight` argument is not supported .+dataset" + ): + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + sample_weight=sample_weight, + ) + + with self.assertRaisesRegex( + ValueError, + "(you should not specify a target)|" + "(`y` argument is not supported when using dataset as input.)", + ): + model.fit(dataset, dataset, epochs=1, steps_per_epoch=2, verbose=0) + + # With an infinite dataset, `steps_per_epoch`/`steps` argument is + # required. 
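As a quick standalone check of why the comment above holds (sketch, eager mode): `repeat()` without a count gives the dataset infinite cardinality, so no steps value can be inferred and an explicit argument is required:

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices([0.0, 1.0]).repeat().batch(2)
assert (
    tf.data.experimental.cardinality(ds)
    == tf.data.experimental.INFINITE_CARDINALITY
)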
+ with self.assertRaises(ValueError): + model.fit(dataset, epochs=1, verbose=0) + with self.assertRaises(ValueError): + model.evaluate(dataset, verbose=0) + with self.assertRaises(ValueError): + model.predict(dataset, verbose=0) + + @test_combinations.run_with_all_model_types(exclude_models="sequential") + @test_combinations.run_all_keras_modes + def test_training_and_eval_methods_on_multi_input_output_dataset(self): + input_a = keras.layers.Input(shape=(3,), name="input_1") + input_b = keras.layers.Input(shape=(3,), name="input_2") + dense = keras.layers.Dense(4, name="dense") + dropout = keras.layers.Dropout(0.5, name="dropout") + branch_a = [input_a, dense] + branch_b = [input_b, dense, dropout] + + model = test_utils.get_multi_io_model(branch_a, branch_b) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + + input_a_np = np.random.random((10, 3)).astype(dtype=np.float32) + input_b_np = np.random.random((10, 3)).astype(dtype=np.float32) + output_d_np = np.random.random((10, 4)).astype(dtype=np.float32) + output_e_np = np.random.random((10, 4)).astype(dtype=np.float32) + + # Test with tuples + dataset_tuple = tf.data.Dataset.from_tensor_slices( + ((input_a_np, input_b_np), (output_d_np, output_e_np)) + ) + dataset_tuple = dataset_tuple.repeat(100) + dataset_tuple = dataset_tuple.batch(10) + + model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1) + model.evaluate(dataset_tuple, steps=2, verbose=1) + + # Test with dict + input_dict = {"input_1": input_a_np, "input_2": input_b_np} + if test_utils.get_model_type() == "subclass": + output_dict = {"output_1": output_d_np, "output_2": output_e_np} + else: + output_dict = {"dense": output_d_np, "dropout": output_e_np} + + dataset_dict = tf.data.Dataset.from_tensor_slices( + (input_dict, output_dict) + ) + dataset_dict = dataset_dict.repeat(100) + dataset_dict = dataset_dict.batch(10) + + model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1) + model.evaluate(dataset_dict, steps=2, verbose=1) + + predict_dataset_dict = tf.data.Dataset.from_tensor_slices(input_dict) + predict_dataset_dict = predict_dataset_dict.repeat(100) + predict_dataset_dict = predict_dataset_dict.batch(10) + model.predict(predict_dataset_dict, steps=1) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_dataset_with_sample_weights(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + optimizer = "rmsprop" + loss = "mse" + metrics = ["mae", metrics_module.CategoricalAccuracy()] + model.compile( + optimizer, + loss, + metrics=metrics, + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.zeros((10, 3), np.float32) + targets = np.zeros((10, 4), np.float32) + sample_weights = np.ones((10), np.float32) + dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets, sample_weights) + ) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + model.evaluate(dataset, steps=2, verbose=1) + model.predict(dataset, steps=2) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_dataset_with_sample_weights_correctness(self): + x = keras.layers.Input(shape=(1,), name="input") + y = keras.layers.Dense( + 1, kernel_initializer="ones", bias_initializer="zeros", name="dense" + )(x) + model = keras.Model(x, y) + optimizer = "rmsprop" + loss = "mse" + model.compile(optimizer, loss) + inputs = np.array([[0], [1], [2], [3]], np.float32) + 
targets = np.array([[2], [4], [6], [8]], np.float32) + sample_weights = np.array([0.25, 0.5, 0.75, 1], np.float32) + ds = tf.data.Dataset.from_tensor_slices( + (inputs, targets, sample_weights) + ).batch(2) + result = model.evaluate(ds, verbose=1) + # The per sample loss is multiplied by the corresponding sample weight. + # The average of these weighted losses is the return value of the + # `evaluate` call. For example, in the test above the average weighted + # loss is calculated in the following manner: + # ((2-0)^2) * 0.25 + ((4-1)^2) * 0.5 + ((6-2)^2 * 0.75) + ((8-3)^2 * 1) + # equals 42.5 / 4 = 10.625 + self.assertEqual(result, 10.625) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_dataset_with_sparse_labels(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + optimizer = "rmsprop" + model.compile( + optimizer, + loss="sparse_categorical_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.random.randint(0, 4, size=10, dtype=np.int32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + + @test_combinations.run_all_keras_modes + def test_dataset_fit_correctness(self): + class SumLayer(keras.layers.Layer): + def build(self, _): + self.w = self.add_weight("w", ()) + + def call(self, inputs): + return ( + keras.backend.sum(inputs, axis=1, keepdims=True) + + self.w * 0 + ) + + model = keras.Sequential([SumLayer(input_shape=(2,))]) + model.compile( + "rmsprop", loss="mae", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs = np.zeros((40, 2), dtype=np.float32) + inputs[10:20, :] = 2 + inputs[20:30, :] = 1 + inputs[30:, :] = 4 + targets = np.zeros((40, 1), dtype=np.float32) + + # Test correctness with `steps_per_epoch`. + train_dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets) + ).batch(10) + val_dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets) + ).batch(10) + history = model.fit( + train_dataset, + epochs=2, + steps_per_epoch=2, + verbose=1, + validation_data=val_dataset, + validation_steps=2, + ) + self.assertAllClose( + history.history["loss"], + [inputs[:20].sum() / 20, inputs[20:].sum() / 20], + ) + # The validation dataset will be reset at the end of each validation + # run. + self.assertAllClose( + history.history["val_loss"], + [inputs[:20].sum() / 20, inputs[:20].sum() / 20], + ) + + # Test correctness with dataset reset. 
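For reference, the 10.625 expectation asserted in the sample-weights test above can be re-derived with plain NumPy (standalone sketch mirroring the in-test comment's arithmetic; not part of this change):

import numpy as np

preds = np.array([0.0, 1.0, 2.0, 3.0])  # Dense(1), kernel=ones, bias=zeros
targets = np.array([2.0, 4.0, 6.0, 8.0])
weights = np.array([0.25, 0.5, 0.75, 1.0])

# Mean over the batch of per-sample squared errors scaled by their weights.
assert np.mean(weights * (targets - preds) ** 2) == 10.625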
+ train_dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets) + ).batch(10) + val_dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets) + ).batch(10) + history = model.fit( + train_dataset, epochs=2, verbose=1, validation_data=val_dataset + ) + self.assertAllClose( + history.history["loss"], [inputs.sum() / 40, inputs.sum() / 40] + ) + self.assertAllClose( + history.history["val_loss"], [inputs.sum() / 40, inputs.sum() / 40] + ) + + def test_dataset_input_shape_validation(self): + with tf.compat.v1.get_default_graph().as_default(), self.cached_session(): # noqa: E501 + model = test_utils.get_small_functional_mlp(1, 4, input_dim=3) + model.compile(optimizer="rmsprop", loss="mse") + + # User forgets to batch the dataset + inputs = np.zeros((10, 3)) + targets = np.zeros((10, 4)) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + + with self.assertRaisesRegex( + ValueError, + r"expected (.*?) to have shape \(3,\) " + r"but got array with shape \(1,\)", + ): + model.train_on_batch(dataset) + + # Wrong input shape + inputs = np.zeros((10, 5)) + targets = np.zeros((10, 4)) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + with self.assertRaisesRegex( + ValueError, r"expected (.*?) to have shape \(3,\)" + ): + model.train_on_batch(dataset) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_finite_dataset_known_cardinality_no_steps_arg(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs = np.zeros((100, 3), dtype=np.float32) + targets = np.random.randint(0, 4, size=100, dtype=np.int32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.batch(10) + + batch_counter = BatchCounterCallback() + history = model.fit( + dataset, epochs=2, verbose=1, callbacks=[batch_counter] + ) + + self.assertLen(history.history["loss"], 2) + self.assertEqual(batch_counter.batch_end_count, 20) + model.evaluate(dataset) + out = model.predict(dataset) + self.assertEqual(out.shape[0], 100) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_finite_dataset_unknown_cardinality_no_steps_arg(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs = np.zeros((100, 3), dtype=np.float32) + targets = np.random.randint(0, 4, size=100, dtype=np.int32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.filter(lambda x, y: True).batch(10) + self.assertEqual( + keras.backend.get_value(tf.data.experimental.cardinality(dataset)), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + batch_counter = BatchCounterCallback() + history = model.fit( + dataset, epochs=2, verbose=1, callbacks=[batch_counter] + ) + + self.assertLen(history.history["loss"], 2) + self.assertEqual(batch_counter.batch_end_count, 20) + model.evaluate(dataset) + out = model.predict(dataset) + self.assertEqual(out.shape[0], 100) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_finite_dataset_unknown_cardinality_no_step_with_train_and_val( + self, + ): + class CaptureStdout: + def __enter__(self): + self._stdout = sys.stdout + string_io = io.StringIO() + sys.stdout = string_io + 
self._stringio = string_io + return self + + def __exit__(self, *args): + self.output = self._stringio.getvalue() + sys.stdout = self._stdout + + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs = np.zeros((100, 3), dtype=np.float32) + targets = np.random.randint(0, 4, size=100, dtype=np.int32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.filter(lambda x, y: True).batch(10) + self.assertEqual( + keras.backend.get_value(tf.data.experimental.cardinality(dataset)), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + batch_counter = BatchCounterCallback() + io_utils.enable_interactive_logging() + with CaptureStdout() as capture: + history = model.fit( + dataset, + epochs=2, + callbacks=[batch_counter], + validation_data=dataset.take(3), + ) + + lines = capture.output.splitlines() + + self.assertIn("10/10", lines[-1]) + + self.assertLen(history.history["loss"], 2) + self.assertEqual(batch_counter.batch_begin_count, 21) + self.assertEqual(batch_counter.batch_end_count, 20) + model.evaluate(dataset) + out = model.predict(dataset) + self.assertEqual(out.shape[0], 100) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_finite_dataset_unknown_cardinality_out_of_data(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs = np.zeros((100, 3), dtype=np.float32) + targets = np.random.randint(0, 4, size=100, dtype=np.int32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.filter(lambda x, y: True).batch(10) + self.assertEqual( + keras.backend.get_value(tf.data.experimental.cardinality(dataset)), + tf.data.experimental.UNKNOWN_CARDINALITY, + ) + + batch_counter = BatchCounterCallback() + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_log: + # steps_per_epoch (200) is greater than the dataset size (100). As + # this is unexpected, training will stop and not make it to the + # second epoch. + history = model.fit( + dataset, + epochs=2, + verbose=1, + callbacks=[batch_counter], + steps_per_epoch=200, + ) + self.assertIn( + "ran out of data; interrupting training.", + str(mock_log.call_args), + ) + self.assertIn( + "can generate at least " + "`steps_per_epoch * epochs` batches (in this case, " + "400 batches). You may need to use the repeat() function when " + "building your dataset.", + str(mock_log.call_args), + ) + + self.assertLen(history.history["loss"], 1) + self.assertEqual(batch_counter.batch_end_count, 10) + model.evaluate(dataset) + out = model.predict(dataset) + self.assertEqual(out.shape[0], 100) + + @test_combinations.run_all_keras_modes + def test_with_external_loss(self): + inp = keras.Input(shape=(4,), name="inp1") + out = keras.layers.Dense(2)(inp) + model = keras.Model(inp, out) + model.add_loss(tf.reduce_mean(out)) + model.compile("rmsprop") + x = np.ones((10, 4)) + + # dataset contains only features, no labels. + dataset = tf.data.Dataset.from_tensor_slices(x).repeat(10).batch(10) + model.fit(dataset) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_train_eval_with_steps(self): + # See b/142880049 for more details. 
+        inp = keras.Input(shape=(4,), name="inp1")
+        out = keras.layers.Dense(2)(inp)
+        model = keras.Model(inp, out)
+        model.compile(
+            "rmsprop", loss="mse", run_eagerly=test_utils.should_run_eagerly()
+        )
+
+        inputs = np.zeros((100, 4), dtype=np.float32)
+        targets = np.random.randint(0, 2, size=100, dtype=np.int32)
+        training_ds = (
+            tf.data.Dataset.from_tensor_slices((inputs, targets))
+            .repeat()
+            .batch(10)
+        )
+
+        # Create the eval dataset with a generator, so that the dataset won't
+        # contain the overall size metadata. Without eval_steps, we expect to
+        # run through all the data in this dataset every epoch.
+        def gen():
+            for _ in range(100):
+                yield (
+                    np.zeros(4, dtype=np.float32),
+                    np.random.randint(0, 2, size=1, dtype=np.int32),
+                )
+
+        eval_ds = tf.data.Dataset.from_generator(
+            generator=gen,
+            output_types=("float64", "int32"),
+            output_shapes=([4], [1]),
+        ).batch(100)
+        batch_counter = BatchCounterCallback()
+
+        model.fit(
+            training_ds,
+            steps_per_epoch=10,
+            epochs=10,
+            validation_data=eval_ds,
+            callbacks=[batch_counter],
+        )
+
+        # Expect 10 batches from training per epoch.
+        self.assertEqual(batch_counter.batch_end_count, 100)
 
 
 class TestMetricsWithDatasets(test_combinations.TestCase):
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  def test_metrics_correctness_with_dataset(self):
-    layers = [
-        keras.layers.Dense(
-            8, activation='relu', input_dim=4, kernel_initializer='ones'),
-        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones')
-    ]
-
-    model = test_utils.get_model_from_layers(layers, (4,))
-
-    model.compile(
-        loss='binary_crossentropy',
-        metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer='rmsprop',
-        run_eagerly=test_utils.should_run_eagerly())
-
-    np.random.seed(123)
-    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
-    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
-    dataset = tf.data.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.batch(10)
-    outs = model.evaluate(dataset, steps=10)
-    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
-    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
-
-    y = np.zeros((100, 1), dtype=np.float32)
-    dataset = tf.data.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    outs = model.evaluate(dataset, steps=10)
-    self.assertEqual(outs[1], 0.)
-    self.assertEqual(outs[2], 0.)
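A note on why the tests above go out of their way to hide dataset size: `tf.data` reports cardinality only when it can prove it, and both `filter()` and `from_generator` defeat that. A small standalone sketch (not part of the diff) of what the cardinality API returns in each case:

```python
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(list(range(100))).batch(10)
print(tf.data.experimental.cardinality(ds).numpy())  # 10

# filter() can drop an unknown number of elements, so cardinality becomes
# UNKNOWN even when the predicate keeps everything.
filtered = ds.filter(lambda x: True)
print(
    tf.data.experimental.cardinality(filtered)
    == tf.data.experimental.UNKNOWN_CARDINALITY
)  # True

# from_generator() carries no size metadata at all, matching the eval
# dataset built in the test above.
gen_ds = tf.data.Dataset.from_generator(
    lambda: iter(range(3)), output_types=tf.int32
)
print(
    tf.data.experimental.cardinality(gen_ds)
    == tf.data.experimental.UNKNOWN_CARDINALITY
)  # True
```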
- - -if __name__ == '__main__': - tf.test.main() + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_metrics_correctness_with_dataset(self): + layers = [ + keras.layers.Dense( + 8, activation="relu", input_dim=4, kernel_initializer="ones" + ), + keras.layers.Dense( + 1, activation="sigmoid", kernel_initializer="ones" + ), + ] + + model = test_utils.get_model_from_layers(layers, (4,)) + + model.compile( + loss="binary_crossentropy", + metrics=["accuracy", metrics_module.BinaryAccuracy()], + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + + np.random.seed(123) + x = np.random.randint(10, size=(100, 4)).astype(np.float32) + y = np.random.randint(2, size=(100, 1)).astype(np.float32) + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.batch(10) + outs = model.evaluate(dataset, steps=10) + self.assertEqual(np.around(outs[1], decimals=1), 0.5) + self.assertEqual(np.around(outs[2], decimals=1), 0.5) + + y = np.zeros((100, 1), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + outs = model.evaluate(dataset, steps=10) + self.assertEqual(outs[1], 0.0) + self.assertEqual(outs[2], 0.0) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/training_distributed_v1.py b/keras/engine/training_distributed_v1.py index 70e8cfaaecb3..dc600160d658 100644 --- a/keras/engine/training_distributed_v1.py +++ b/keras/engine/training_distributed_v1.py @@ -14,11 +14,9 @@ # ============================================================================== """Part of the Keras training engine related to distributed training.""" +import numpy as np import tensorflow.compat.v2 as tf -# pylint: disable=protected-access -import numpy as np -from tensorflow.python.distribute import input_lib from keras import backend from keras import callbacks as cbks from keras.distribute import distribute_coordinator_utils as dc @@ -28,761 +26,898 @@ from keras.engine import training_utils_v1 from keras.utils.generic_utils import Progbar from keras.utils.mode_keys import ModeKeys + +# isort: off +from tensorflow.python.distribute import input_lib from tensorflow.python.platform import tf_logging as logging def _per_replica_execution_function(model, mode): - exec_func = model._make_execution_function(mode) - return (exec_func.inputs, exec_func.outputs, exec_func.updates_op, - exec_func.session_kwargs) + exec_func = model._make_execution_function(mode) + return ( + exec_func.inputs, + exec_func.outputs, + exec_func.updates_op, + exec_func.session_kwargs, + ) def _build_model(strategy, model, mode, inputs, targets=None): - if model._compile_distribution: - dist_utils.clone_model_on_replicas( - model, strategy, mode, inputs=inputs, targets=targets) - else: - dist_utils._build_distributed_network(model, strategy, mode, inputs, - targets) + if model._compile_distribution: + dist_utils.clone_model_on_replicas( + model, strategy, mode, inputs=inputs, targets=targets + ) + else: + dist_utils._build_distributed_network( + model, strategy, mode, inputs, targets + ) def _make_train_step_fn(model, mode, strategy, output_labels): - """Create step fn. - - Args: - model: a Keras Model instance. - mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. - strategy: a `tf.distribute.Strategy` instance. - output_labels: the output labels for the step function. - - Returns: - A step function to run by `tf.distribute.Strategy`. 
- """ - - def _step_fn(ctx, inputs): - """A step fn that returns update ops.""" - if isinstance(inputs, (tuple, list)) and len(inputs) == 2: - inputs, targets = inputs - else: - targets = None - - # When input feature is a dictionary of tensors, dictionary is flattended - # to an array and passed as a model input. This results in input mismatch - # when model input layer names are not sorted in alphabetical order as - # `nest.flatten()`sorts dictionary elements by keys. As so, transform input - # tensors into an array and order it along `model._feed_input_names`. - if isinstance(inputs, dict): - inputs = [inputs[input_name] for input_name in model._feed_input_names] - - _build_model(strategy, model, mode, inputs, targets) - - (grouped_inputs, grouped_outputs, grouped_updates, - grouped_session_args) = strategy.extended.call_for_each_replica( - _per_replica_execution_function, - args=(dist_utils.get_distributed_model(model, mode), mode)) - (all_inputs, all_outputs, all_updates, - all_session_args) = dist_utils.unwrap_values(strategy, grouped_inputs, - grouped_outputs, - grouped_updates, - grouped_session_args) - combined_fn = backend.function( - all_inputs, - all_outputs, - updates=all_updates, - name='distributed_' + str(mode) + '_function', - **all_session_args) - - for label, output in zip(output_labels, combined_fn.outputs): - if label == 'loss': - reduce_op = tf.distribute.ReduceOp.SUM - else: - # We reduce all other metrics using mean for now. This is temporary - # workaround until new metrics are in place. - reduce_op = tf.distribute.ReduceOp.MEAN - ctx.set_last_step_output(label, output, reduce_op) - - # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn: - # feed_dict, session kwargs, run options, run_metadata for now. These should - # be handled appropriately - return combined_fn.updates_op - - return _step_fn - - -def experimental_tpu_fit_loop(model, - dataset, - epochs=100, - verbose=1, - callbacks=None, - initial_epoch=0, - steps_per_epoch=None, - val_dataset=None, - validation_steps=None, - validation_freq=1): - """Fit loop for training with TPU tf.distribute.Strategy. - - Args: - model: Keras Model instance. - dataset: Dataset that returns inputs and targets - epochs: Number of times to iterate over the data - verbose: Integer, Verbosity mode, 0, 1 or 2 - callbacks: List of callbacks to be called during training - initial_epoch: Epoch at which to start training - (useful for resuming a previous training run) - steps_per_epoch: Total number of steps (batches of samples) - before declaring one epoch finished and starting the - next epoch. Ignored with the default value of `None`. - val_dataset: Dataset for validation data. - validation_steps: Number of steps to run validation for - (only if doing validation from data tensors). - Ignored with the default value of `None`. - validation_freq: Only relevant if validation data is provided. Integer or - `collections.abc.Container` instance (e.g. list, tuple, etc.). If an - integer, specifies how many training epochs to run before a new - validation run is performed, e.g. `validation_freq=2` runs - validation every 2 epochs. If a Container, specifies the epochs on - which to run validation, e.g. `validation_freq=[1, 2, 10]` runs - validation at the end of the 1st, 2nd, and 10th epochs. - - Returns: - Returns `None`. - - Raises: - ValueError: in case of invalid arguments. 
- """ - mode = ModeKeys.TRAIN - - current_strategy = model._distribution_strategy - iteration_value = min(steps_per_epoch, - current_strategy.extended.steps_per_run) - steps_per_run = backend.variable( - value=iteration_value, - dtype='int32', - name='steps_per_run') - - # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. - iterator = dist_utils.get_iterator(dataset, current_strategy) - - scope = dist_utils.distributed_scope( - strategy=current_strategy, learning_phase=1) - scope.__enter__() - - out_labels = model.metrics_names or [] - - step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy, - out_labels) - - # Add initial dummy values for loss and other metric tensors. - initial_loop_values = {} - initial_loop_values['loss'] = tf.constant(1e7) - for m in model._get_training_eval_metrics(): - tensor = m.result() - initial_loop_values[m.name] = tf.zeros(tensor.shape, tensor.dtype) - - ctx = current_strategy.extended.experimental_run_steps_on_iterator( - step_fn, iterator, iterations=steps_per_run, - initial_loop_values=initial_loop_values) - train_op = ctx.run_op - output_tensors = ctx.last_step_outputs - - do_validation = bool(validation_steps) - - if model._compile_distribution: - dist_utils._copy_weights_to_distributed_model(model, mode) - - callbacks = cbks.configure_callbacks( - callbacks, - model, - do_validation=do_validation, - epochs=epochs, - steps_per_epoch=steps_per_epoch, - verbose=verbose, - count_mode='steps', - mode=mode) - - # Calculate the steps each time on the device. - steps_to_run = ([current_strategy.extended.steps_per_run] * - (steps_per_epoch // - current_strategy.extended.steps_per_run)) - if steps_per_epoch % current_strategy.extended.steps_per_run: - steps_to_run.append( - steps_per_epoch % current_strategy.extended.steps_per_run) - target_steps = len(steps_to_run) - - callbacks._call_begin_hook(mode) - - initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) - - for epoch in range(initial_epoch, epochs): + """Create step fn. + + Args: + model: a Keras Model instance. + mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. + strategy: a `tf.distribute.Strategy` instance. + output_labels: the output labels for the step function. + + Returns: + A step function to run by `tf.distribute.Strategy`. + """ + + def _step_fn(ctx, inputs): + """A step fn that returns update ops.""" + if isinstance(inputs, (tuple, list)) and len(inputs) == 2: + inputs, targets = inputs + else: + targets = None + + # When input feature is a dictionary of tensors, dictionary is + # flattended to an array and passed as a model input. This results in + # input mismatch when model input layer names are not sorted in + # alphabetical order as `nest.flatten()`sorts dictionary elements by + # keys. As so, transform input tensors into an array and order it along + # `model._feed_input_names`. 
+        if isinstance(inputs, dict):
+            inputs = [
+                inputs[input_name] for input_name in model._feed_input_names
+            ]
+
+        _build_model(strategy, model, mode, inputs, targets)
+
+        (
+            grouped_inputs,
+            grouped_outputs,
+            grouped_updates,
+            grouped_session_args,
+        ) = strategy.extended.call_for_each_replica(
+            _per_replica_execution_function,
+            args=(dist_utils.get_distributed_model(model, mode), mode),
+        )
+        (
+            all_inputs,
+            all_outputs,
+            all_updates,
+            all_session_args,
+        ) = dist_utils.unwrap_values(
+            strategy,
+            grouped_inputs,
+            grouped_outputs,
+            grouped_updates,
+            grouped_session_args,
+        )
+        combined_fn = backend.function(
+            all_inputs,
+            all_outputs,
+            updates=all_updates,
+            name="distributed_" + str(mode) + "_function",
+            **all_session_args
+        )
+
+        for label, output in zip(output_labels, combined_fn.outputs):
+            if label == "loss":
+                reduce_op = tf.distribute.ReduceOp.SUM
+            else:
+                # We reduce all other metrics using mean for now. This is a
+                # temporary workaround until new metrics are in place.
+                reduce_op = tf.distribute.ReduceOp.MEAN
+            ctx.set_last_step_output(label, output, reduce_op)
+
+        # TODO(priyag, sourabhbajaj): Ignoring these things from the
+        # combined_fn: feed_dict, session kwargs, run options, run_metadata
+        # for now. These should be handled appropriately.
+        return combined_fn.updates_op
+
+    return _step_fn
+
+
+def experimental_tpu_fit_loop(
+    model,
+    dataset,
+    epochs=100,
+    verbose=1,
+    callbacks=None,
+    initial_epoch=0,
+    steps_per_epoch=None,
+    val_dataset=None,
+    validation_steps=None,
+    validation_freq=1,
+):
+    """Fit loop for training with TPU tf.distribute.Strategy.
+
+    Args:
+      model: Keras Model instance.
+      dataset: Dataset that returns inputs and targets
+      epochs: Number of times to iterate over the data
+      verbose: Integer, Verbosity mode, 0, 1 or 2
+      callbacks: List of callbacks to be called during training
+      initial_epoch: Epoch at which to start training
+        (useful for resuming a previous training run)
+      steps_per_epoch: Total number of steps (batches of samples)
+        before declaring one epoch finished and starting the
+        next epoch. Ignored with the default value of `None`.
+      val_dataset: Dataset for validation data.
+      validation_steps: Number of steps to run validation for
+        (only if doing validation from data tensors).
+        Ignored with the default value of `None`.
+      validation_freq: Only relevant if validation data is provided. Integer
+        or `collections.abc.Container` instance (e.g. list, tuple, etc.). If
+        an integer, specifies how many training epochs to run before a new
+        validation run is performed, e.g. `validation_freq=2` runs
+        validation every 2 epochs. If a Container, specifies the epochs on
+        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+        validation at the end of the 1st, 2nd, and 10th epochs.
+
+    Returns:
+      Returns `None`.
+
+    Raises:
+      ValueError: in case of invalid arguments.
+    """
+    mode = ModeKeys.TRAIN
+
+    current_strategy = model._distribution_strategy
+    iteration_value = min(
+        steps_per_epoch, current_strategy.extended.steps_per_run
+    )
+    steps_per_run = backend.variable(
+        value=iteration_value, dtype="int32", name="steps_per_run"
+    )
+
+    # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
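The fit loop below chunks `steps_per_epoch` into device-sized runs of `current_strategy.extended.steps_per_run` (the `steps_to_run` list). A quick standalone sketch of that arithmetic, with hypothetical numbers:

```python
# Hypothetical values: 10 steps per epoch, device runs 4 steps at a time.
steps_per_epoch = 10
steps_per_run = 4  # stand-in for current_strategy.extended.steps_per_run

steps_to_run = [steps_per_run] * (steps_per_epoch // steps_per_run)
if steps_per_epoch % steps_per_run:
    steps_to_run.append(steps_per_epoch % steps_per_run)

assert steps_to_run == [4, 4, 2]  # three device invocations per epoch
```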
+    iterator = dist_utils.get_iterator(dataset, current_strategy)
+
+    scope = dist_utils.distributed_scope(
+        strategy=current_strategy, learning_phase=1
+    )
+    scope.__enter__()
+
+    out_labels = model.metrics_names or []
+
+    step_fn = _make_train_step_fn(
+        model, ModeKeys.TRAIN, current_strategy, out_labels
+    )
+
+    # Add initial dummy values for loss and other metric tensors.
+    initial_loop_values = {}
+    initial_loop_values["loss"] = tf.constant(1e7)
+    for m in model._get_training_eval_metrics():
+        tensor = m.result()
+        initial_loop_values[m.name] = tf.zeros(tensor.shape, tensor.dtype)
+
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
+        step_fn,
+        iterator,
+        iterations=steps_per_run,
+        initial_loop_values=initial_loop_values,
+    )
+    train_op = ctx.run_op
+    output_tensors = ctx.last_step_outputs
+
+    do_validation = bool(validation_steps)
+
+    if model._compile_distribution:
+        dist_utils._copy_weights_to_distributed_model(model, mode)
+
+    callbacks = cbks.configure_callbacks(
+        callbacks,
+        model,
+        do_validation=do_validation,
+        epochs=epochs,
+        steps_per_epoch=steps_per_epoch,
+        verbose=verbose,
+        count_mode="steps",
+        mode=mode,
+    )
+
+    # Calculate the steps each time on the device.
+    steps_to_run = [current_strategy.extended.steps_per_run] * (
+        steps_per_epoch // current_strategy.extended.steps_per_run
+    )
+    if steps_per_epoch % current_strategy.extended.steps_per_run:
+        steps_to_run.append(
+            steps_per_epoch % current_strategy.extended.steps_per_run
+        )
+    target_steps = len(steps_to_run)
+
+    callbacks._call_begin_hook(mode)
+
+    initial_epoch = model._maybe_load_initial_epoch_from_ckpt(
+        initial_epoch, mode
+    )
+
+    for epoch in range(initial_epoch, epochs):
+        dist_utils._reset_metrics(model)
+        callbacks.on_epoch_begin(epoch)
+        epoch_logs = {}
+        step_index = 0
+        prev_step_count = None
+        current_step = 0
+        while current_step < target_steps:
+            step_count = steps_to_run[current_step]
+            batch_logs = {
+                "batch": step_index,
+                "size": 1,
+                "num_steps": step_count,
+            }
+            callbacks._call_batch_hook(mode, "begin", step_index, batch_logs)
+            if prev_step_count is None or step_count != prev_step_count:
+                backend.get_session().run(steps_per_run.assign(step_count))
+                prev_step_count = step_count
+            try:
+                _, outputs = backend.batch_get_value([train_op, output_tensors])
+            except tf.errors.OutOfRangeError:
+                logging.warning(
+                    "Your dataset iterator ran out of data; "
+                    "interrupting training. Make sure that your dataset "
+                    "can generate at least `steps_per_epoch * epochs` "
+                    "batches (in this case, %d batches)."
+                    % (steps_per_epoch * epochs)
+                )
+                break
+
+            batch_logs.update(outputs)
+            callbacks._call_batch_hook(mode, "end", step_index, batch_logs)
+            step_index = step_index + step_count
+            current_step += 1
+
+        if callbacks.model.stop_training:
+            break
+
+        if do_validation and training_utils_v1.should_run_validation(
+            validation_freq, epoch
+        ):
+            logging.info("Running validation at fit epoch: %s", epoch)
+
+            if model._compile_distribution:
+                # Since we create a new clone from the original model we need to
+                # copy the weights back to the original model before we can run
+                # validation.
+                dist_utils._copy_weights_to_original_model(
+                    model, ModeKeys.TRAIN
+                )
+
+            val_outs = experimental_tpu_test_loop(
+                model,
+                val_dataset,
+                steps=validation_steps,
+                verbose=verbose,
+                callbacks=callbacks,
+            )
+            if not isinstance(val_outs, list):
+                val_outs = [val_outs]
+            # Same labels assumed.
+ for label, val_out in zip(out_labels, val_outs): + epoch_logs["val_" + label] = val_out + + callbacks.on_epoch_end(epoch, epoch_logs) + if callbacks.model.stop_training: + break + model._successful_loop_finish = True + callbacks._call_end_hook(mode) + + if model._compile_distribution: + # Copy the weights back from the replicated model to the original model. + dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN) + scope.__exit__(None, None, None) + return model.history + + +def experimental_tpu_test_loop( + model, dataset, verbose=0, steps=None, callbacks=None +): + """Test loop for evaluating with TPU tf.distribute.Strategy. + + Args: + model: Keras Model instance. + dataset: Dataset for input data. + verbose: Integer, Verbosity mode 0 or 1. + steps: Total number of steps (batches of samples) + before declaring predictions finished. + Ignored with the default value of `None`. + callbacks: List of callbacks to be called during training + + Returns: + Scalar loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the outputs. + """ + mode = ModeKeys.TEST + current_strategy = model._distribution_strategy + iterator = dist_utils.get_iterator(dataset, current_strategy) + + scope = dist_utils.distributed_scope( + strategy=current_strategy, learning_phase=0 + ) + scope.__enter__() + + out_labels = model.metrics_names + + def _test_step_fn(inputs): + """A fn that returns output of single test step.""" + if isinstance(inputs, (tuple, list)) and len(inputs) == 2: + inputs, targets = inputs + else: + targets = None + + ( + tf.distribute.get_replica_context().merge_call( + _build_model, args=(model, mode, inputs, targets) + ) + ) + + (_, outputs, updates, _) = _per_replica_execution_function( + dist_utils.get_distributed_model(model, mode), mode + ) + with tf.control_dependencies([updates]): + return [tf.identity(out) for out in outputs] + + test_input_data = iterator.get_next() + per_replica_outputs = current_strategy.run( + _test_step_fn, args=(test_input_data,) + ) + output_tensors = {} + for label, output in zip(out_labels, per_replica_outputs): + if label == "loss": + reduce_op = tf.distribute.ReduceOp.SUM + else: + # We reduce all other metrics using mean for now. This is temporary + # workaround until new metrics are in place. + reduce_op = tf.distribute.ReduceOp.MEAN + output_tensors[label] = current_strategy.reduce( + reduce_op, output, axis=None + ) + test_op = tf.group(list(output_tensors.values())) + + if verbose >= 1: + progbar = Progbar(target=steps) + + if model._compile_distribution: + dist_utils._copy_weights_to_distributed_model(model, mode) + dist_utils._reset_metrics(model) - callbacks.on_epoch_begin(epoch) - epoch_logs = {} - step_index = 0 - prev_step_count = None + + callbacks = cbks.configure_callbacks( + callbacks, + model, + do_validation=False, + epochs=1, + steps_per_epoch=steps, + verbose=verbose, + count_mode="steps", + mode=ModeKeys.TEST, + ) + callbacks._call_begin_hook(mode) + + outs = [0.0] * len(model.metrics_names) + if steps is not None: + target_steps = steps + else: + raise ValueError( + "Number of steps could not be inferred from the data, " + "please pass the steps argument." 
+        )
+
+    current_step = 0
     while current_step < target_steps:
-      step_count = steps_to_run[current_step]
-      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
-      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
-      if prev_step_count is None or step_count != prev_step_count:
-        backend.get_session().run(steps_per_run.assign(step_count))
-        prev_step_count = step_count
-      try:
-        _, outputs = backend.batch_get_value([train_op, output_tensors])
-      except tf.errors.OutOfRangeError:
-        logging.warning('Your dataset iterator ran out of data; '
-                        'interrupting training. Make sure that your dataset '
-                        'can generate at least `steps_per_epoch * epochs` '
-                        'batches (in this case, %d batches).' %
-                        steps_per_epoch * epochs)
-        break
-
-      batch_logs.update(outputs)
-      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
-      step_index = step_index + step_count
-      current_step += 1
-
-    if callbacks.model.stop_training:
-      break
-
-    if (do_validation and
-        training_utils_v1.should_run_validation(validation_freq, epoch)):
-      logging.info('Running validation at fit epoch: %s', epoch)
-
-      if model._compile_distribution:
-        # Since we create a new clone from the original model we need to copy
-        # the weights back to the original model before we can run validation.
-        dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)
+        batch_logs = {"batch": current_step, "size": 1}
+        callbacks._call_batch_hook(mode, "begin", current_step, batch_logs)
+        try:
+            _, batch_outs = backend.batch_get_value([test_op, output_tensors])
+        except tf.errors.OutOfRangeError:
+            warning_msg = (
+                "Make sure that your dataset can generate at least "
+                "`steps` batches (in this case, {} batches).".format(steps)
+            )
+
+            logging.warning(
+                "Your dataset iterator ran out of data; "
+                "interrupting evaluation. " + warning_msg
+            )
+            target_steps = current_step
+            break
+        for i, label in enumerate(model.metrics_names):
+            if i == 0:
+                # Loss is a stateless metric.
+                outs[i] += batch_outs[label]
+            else:
+                # For all stateful metrics, the aggregation is handled by
+                # mirrored vars.
+                outs[i] = batch_outs[label]
+
+        batch_logs = callbacks.make_logs(model, batch_logs, outs, mode)
+        callbacks._call_batch_hook(mode, "end", current_step, batch_logs)
+        if verbose == 1:
+            progbar.update(current_step + 1)
+        current_step += 1
+
+    if verbose >= 1:
+        # Progress bar finishes at the end.
+        progbar.update(target_steps)
+    callbacks._call_end_hook(mode)
+
+    scope.__exit__(None, None, None)
+    if len(outs) > 0:
+        outs[0] /= target_steps
+
+    if len(outs) == 1:
+        return outs[0]
+    return outs
+
+
+def experimental_tpu_predict_loop(
+    model, dataset, verbose=0, steps=None, callbacks=None
+):
+    """Predict loop for predicting with TPU tf.distribute.Strategy.
+
+    Args:
+      model: Keras Model instance.
+      dataset: Dataset for input data.
+      verbose: Integer, Verbosity mode 0 or 1.
+      steps: Total number of steps (batches of samples)
+        before declaring `_predict_loop` finished.
+        Ignored with the default value of `None`.
+      callbacks: List of callbacks to be called during training
+
+    Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions
+      (if the model has multiple outputs).
+    """
+    mode = ModeKeys.PREDICT
+    dataset_fully_shaped = dist_utils.is_dataset_shape_fully_defined(dataset)
+    padding_handler = None
+    if not dataset_fully_shaped:
+        # TODO(hongjunchoi): Investigate whether operations from
+        # PartialBatchPaddingHandler are unnecessarily pruned out
+        # during graph optimization.
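The padding logic that follows re-batches with `drop_remainder=True`, and the reason is worth spelling out: only a drop-remainder batch has a statically known batch dimension. A standalone illustration (not part of the diff):

```python
import tensorflow as tf

# A 10-element dataset batched by 4 ends with a partial batch of 2, so the
# element spec has an unknown (None) batch dimension.
ds = tf.data.Dataset.range(10).batch(4)
print(ds.element_spec.shape)  # (None,)

# With drop_remainder=True every emitted batch has exactly 4 elements, so
# the batch dimension is statically known.
ds_static = tf.data.Dataset.range(10).batch(4, drop_remainder=True)
print(ds_static.element_spec.shape)  # (4,)
```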
+        padding_handler = padding_util.PartialBatchPaddingHandler(
+            model._feed_output_shapes
+        )
+        batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(
+            dataset
+        )
+        padding_handler.padded_batch_size = batch_size
+        padding_handler.padding_mask = dataset.reduce(
+            padding_handler.padding_mask, padding_handler.update_mask
+        )
+
+        dataset = dataset.map(padding_handler.pad_batch)
+        dataset = dataset.unbatch()
+        # At this point, it is guaranteed that the dataset does not
+        # have partial batches. Thus, we set `drop_remainder=True` to
+        # get static shape information about the elements in the dataset.
+        dataset = dataset.batch(batch_size, drop_remainder=True)
+
+        if prefetch_buffer is not None:
+            dataset = dataset.prefetch(prefetch_buffer)
+
+    current_strategy = model._distribution_strategy
+    iterator = dist_utils.get_iterator(dataset, current_strategy)
+
+    scope = dist_utils.distributed_scope(
+        strategy=current_strategy, learning_phase=0
+    )
+    scope.__enter__()
+
+    def _predict_step_fn(inputs):
+        """A fn that returns output of single prediction step."""
+
+        (
+            tf.distribute.get_replica_context().merge_call(
+                _build_model, args=(model, mode, inputs)
+            )
+        )
+
+        (_, outputs, updates, _) = _per_replica_execution_function(
+            dist_utils.get_distributed_model(model, mode), mode
+        )
+
+        with tf.control_dependencies([updates]):
+            return [tf.identity(out) for out in outputs]
+
+    # TODO(hongjunchoi): When a numpy array is passed as an input to
+    # `predict()`, use it directly to avoid accumulating unnecessary input
+    # pipeline ops.
+    predict_input_data = iterator.get_next()
+    per_replica_outputs = current_strategy.run(
+        _predict_step_fn, args=(predict_input_data,)
+    )
+    output_tensors = dist_utils.flatten_per_replica_values(
+        current_strategy, per_replica_outputs
+    )
+
+    if verbose >= 1:
+        progbar = Progbar(target=steps)
+
+    if model._compile_distribution:
+        dist_utils._copy_weights_to_distributed_model(model, mode)
+
+    dist_utils._reset_metrics(model)
-
- """ - mode = ModeKeys.TEST - current_strategy = model._distribution_strategy - iterator = dist_utils.get_iterator(dataset, current_strategy) - - scope = dist_utils.distributed_scope( - strategy=current_strategy, learning_phase=0) - scope.__enter__() - - out_labels = model.metrics_names - - def _test_step_fn(inputs): - """A fn that returns output of single test step.""" - if isinstance(inputs, (tuple, list)) and len(inputs) == 2: - inputs, targets = inputs + callbacks = cbks.configure_callbacks( + callbacks, + model, + do_validation=False, + epochs=1, + steps_per_epoch=steps, + verbose=verbose, + count_mode="steps", + mode=mode, + ) + callbacks._call_begin_hook(mode) + + # Since we do not know how many samples we will see, we cannot pre-allocate + # the returned Numpy arrays. Instead, we store one array per batch seen + # and concatenate them upon returning. + num_model_outputs = len(model.output_names) + unconcatenated_outs = [[] for _ in range(num_model_outputs)] + if steps is not None: + target_steps = steps else: - targets = None - - (tf.distribute.get_replica_context().merge_call( - _build_model, args=(model, mode, inputs, targets))) - - (_, outputs, updates, _) = _per_replica_execution_function( - dist_utils.get_distributed_model(model, mode), mode) - with tf.control_dependencies([updates]): - return [tf.identity(out) for out in outputs] - - test_input_data = iterator.get_next() - per_replica_outputs = current_strategy.run( - _test_step_fn, args=(test_input_data,)) - output_tensors = {} - for label, output in zip(out_labels, per_replica_outputs): - if label == 'loss': - reduce_op = tf.distribute.ReduceOp.SUM + raise ValueError( + "Number of steps could not be inferred from the data, " + "please pass the steps argument." + ) + + current_step = 0 + while current_step < target_steps: + batch_logs = {"batch": current_step, "size": 1} + callbacks._call_batch_hook(mode, "begin", current_step, batch_logs) + try: + predict_ops = tf.group(output_tensors) + _, batch_outs = backend.batch_get_value( + [predict_ops, output_tensors] + ) + + except tf.errors.OutOfRangeError: + warning_msg = ( + "Make sure that your dataset can generate at least " + "`steps` batches (in this case, {} batches).".format(steps) + ) + + logging.warning( + "Your dataset iterator ran out of data; " + "interrupting evaluation. " + warning_msg + ) + break + + # TODO(priyag): maybe need to unwrap the outputs first for + # MirroredStrategy. + for i in range(num_model_outputs): + output_start_index = i * current_strategy.num_replicas_in_sync + output_end_index = ( + output_start_index + current_strategy.num_replicas_in_sync + ) + single_model_output = batch_outs[ + output_start_index:output_end_index + ] + unconcatenated_outs[i].extend(single_model_output) + + batch_logs = callbacks.make_logs(model, batch_logs, batch_outs, mode) + callbacks._call_batch_hook(mode, "end", current_step, batch_logs) + if verbose == 1: + progbar.update(current_step + 1) + current_step += 1 + + if verbose >= 1: + # Progress bar finishes at the end. + progbar.update(current_step) + + callbacks._call_end_hook(mode) + + scope.__exit__(None, None, None) + + if len(unconcatenated_outs) == 1: + prediction_result = np.concatenate(unconcatenated_outs[0], axis=0) else: - # We reduce all other metrics using mean for now. This is temporary - # workaround until new metrics are in place. 
- reduce_op = tf.distribute.ReduceOp.MEAN - output_tensors[label] = current_strategy.reduce(reduce_op, output, - axis=None) - test_op = tf.group(list(output_tensors.values())) - - if verbose >= 1: - progbar = Progbar(target=steps) - - if model._compile_distribution: - dist_utils._copy_weights_to_distributed_model(model, mode) - - dist_utils._reset_metrics(model) - - callbacks = cbks.configure_callbacks( - callbacks, - model, - do_validation=False, - epochs=1, - steps_per_epoch=steps, - verbose=verbose, - count_mode='steps', - mode=ModeKeys.TEST) - callbacks._call_begin_hook(mode) - - outs = [0.] * len(model.metrics_names) - if steps is not None: - target_steps = steps - else: - raise ValueError('Number of steps could not be inferred from the data, ' - 'please pass the steps argument.') - - current_step = 0 - while current_step < target_steps: - batch_logs = {'batch': current_step, 'size': 1} - callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) - try: - _, batch_outs = backend.batch_get_value([test_op, output_tensors]) - except tf.errors.OutOfRangeError: - warning_msg = ( - 'Make sure that your dataset can generate at least ' - '`steps` batches (in this case, {} batches).'.format(steps)) - - logging.warning('Your dataset iterator ran out of data; ' - 'interrupting evaluation. ' + warning_msg) - target_steps = current_step - break - for i, label in enumerate(model.metrics_names): - if i == 0: - # Loss is stateless metrics. - outs[i] += batch_outs[label] - else: - # For all stateful metrics, the aggregation is handled by mirrored vars. - outs[i] = batch_outs[label] - - batch_logs = cbks.make_logs(model, batch_logs, outs, mode) - callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) - if verbose == 1: - progbar.update(current_step + 1) - current_step += 1 - - if verbose >= 1: - # Progress bar finishes at the end. - progbar.update(target_steps) - callbacks._call_end_hook(mode) - - scope.__exit__(None, None, None) - if len(outs) >= 0: - outs[0] /= (target_steps) - - if len(outs) == 1: - return outs[0] - return outs - - -def experimental_tpu_predict_loop(model, - dataset, - verbose=0, - steps=None, - callbacks=None): - """Predict loop for predicting with TPU tf.distribute.Strategy. - - Args: - model: Keras Model instance. - dataset: Dataset for input data. - verbose: Integer, Verbosity mode 0 or 1. - steps: Total number of steps (batches of samples) - before declaring `_predict_loop` finished. - Ignored with the default value of `None`. - callbacks: List of callbacks to be called during training - - Returns: - Array of predictions (if the model has a single output) - or list of arrays of predictions - (if the model has multiple outputs). - """ - mode = ModeKeys.PREDICT - dataset_fully_shaped = dist_utils.is_dataset_shape_fully_defined(dataset) - padding_handler = None - if not dataset_fully_shaped: - # TODO(hongjunchoi): Investigate whether operations from - # PartialBatchPaddingHandler are unnecessarily pruned out - # during graph optimization. - padding_handler = padding_util.PartialBatchPaddingHandler( - model._feed_output_shapes) - batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset) - padding_handler.padded_batch_size = batch_size - padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask, - padding_handler.update_mask) - - dataset = dataset.map(padding_handler.pad_batch) - dataset = dataset.unbatch() - # Upon this point, it is guaranteed that the dataset does not - # have partial batches. 
Thus, we set `drop_remainder=True` to - # get static shape information about the elements in the dataset. - dataset = dataset.batch(batch_size, drop_remainder=True) - - if prefetch_buffer is not None: - dataset = dataset.prefetch(prefetch_buffer) - - current_strategy = model._distribution_strategy - iterator = dist_utils.get_iterator(dataset, current_strategy) - - scope = dist_utils.distributed_scope( - strategy=current_strategy, learning_phase=0) - scope.__enter__() - - def _predict_step_fn(inputs): - """A fn that returns output of single prediction step.""" - - (tf.distribute.get_replica_context().merge_call( - _build_model, args=(model, mode, inputs))) - - (_, outputs, updates, _) = _per_replica_execution_function( - dist_utils.get_distributed_model(model, mode), mode) - - with tf.control_dependencies([updates]): - return [tf.identity(out) for out in outputs] - - # TODO(hongjunchoi): When numpy array is passed as an input to `predict()` - # use numpy arrays directly to avoid cumulating unnecessary input pipeline - # ops. - predict_input_data = iterator.get_next() - per_replica_outputs = current_strategy.run( - _predict_step_fn, args=(predict_input_data,)) - output_tensors = dist_utils.flatten_per_replica_values( - current_strategy, per_replica_outputs) - - if verbose >= 1: - progbar = Progbar(target=steps) - - if model._compile_distribution: - dist_utils._copy_weights_to_distributed_model(model, mode) - - dist_utils._reset_metrics(model) - - callbacks = cbks.configure_callbacks( - callbacks, - model, - do_validation=False, - epochs=1, - steps_per_epoch=steps, - verbose=verbose, - count_mode='steps', - mode=mode) - callbacks._call_begin_hook(mode) - - # Since we do not know how many samples we will see, we cannot pre-allocate - # the returned Numpy arrays. Instead, we store one array per batch seen - # and concatenate them upon returning. - num_model_outputs = len(model.output_names) - unconcatenated_outs = [[] for _ in range(num_model_outputs)] - if steps is not None: - target_steps = steps - else: - raise ValueError('Number of steps could not be inferred from the data, ' - 'please pass the steps argument.') - - current_step = 0 - while current_step < target_steps: - batch_logs = {'batch': current_step, 'size': 1} - callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs) - try: - predict_ops = tf.group(output_tensors) - _, batch_outs = backend.batch_get_value([predict_ops, output_tensors]) - - except tf.errors.OutOfRangeError: - warning_msg = ( - 'Make sure that your dataset can generate at least ' - '`steps` batches (in this case, {} batches).'.format(steps)) - - logging.warning('Your dataset iterator ran out of data; ' - 'interrupting evaluation. ' + warning_msg) - break - - # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy. - for i in range(num_model_outputs): - output_start_index = i * current_strategy.num_replicas_in_sync - output_end_index = ( - output_start_index + current_strategy.num_replicas_in_sync) - single_model_output = batch_outs[output_start_index:output_end_index] - unconcatenated_outs[i].extend(single_model_output) - - batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) - callbacks._call_batch_hook(mode, 'end', current_step, batch_logs) - if verbose == 1: - progbar.update(current_step + 1) - current_step += 1 - - if verbose >= 1: - # Progress bar finishes at the end. 
- progbar.update(current_step) - - callbacks._call_end_hook(mode) - - scope.__exit__(None, None, None) - - if len(unconcatenated_outs) == 1: - prediction_result = np.concatenate(unconcatenated_outs[0], axis=0) - else: - prediction_result = [ - np.concatenate(out, axis=0) for out in unconcatenated_outs - ] - - if padding_handler: - prediction_result = padding_handler.apply_mask(prediction_result) - - return prediction_result + prediction_result = [ + np.concatenate(out, axis=0) for out in unconcatenated_outs + ] + + if padding_handler: + prediction_result = padding_handler.apply_mask(prediction_result) + + return prediction_result class DistributionSingleWorkerTrainingLoop(training_utils_v1.TrainingLoop): - """Training loop for distribution strategy with single worker.""" - - def fit(self, - model, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - **kwargs): - """Fit loop for Distribution Strategies.""" - dist_utils.validate_callbacks(input_callbacks=callbacks, - optimizer=model.optimizer) - dist_utils.validate_inputs(x, y) - - batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size( - model._distribution_strategy, - x, - batch_size, - steps_per_epoch, - ModeKeys.TRAIN, - validation_split=validation_split) - batch_size = model._validate_or_infer_batch_size( - batch_size, steps_per_epoch, x) - dataset = model._distribution_standardize_user_data( - x, y, - sample_weight=sample_weight, - class_weight=class_weight, - batch_size=batch_size, - validation_split=validation_split, - shuffle=shuffle, - epochs=epochs) - if not dist_utils.is_distributing_by_cloning(model): - with model._distribution_strategy.scope(): - (dataset, _, _) = model._standardize_user_data( - dataset, + """Training loop for distribution strategy with single worker.""" + + def fit( + self, + model, + x=None, + y=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + **kwargs + ): + """Fit loop for Distribution Strategies.""" + dist_utils.validate_callbacks( + input_callbacks=callbacks, optimizer=model.optimizer + ) + dist_utils.validate_inputs(x, y) + + batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size( + model._distribution_strategy, + x, + batch_size, + steps_per_epoch, + ModeKeys.TRAIN, + validation_split=validation_split, + ) + batch_size = model._validate_or_infer_batch_size( + batch_size, steps_per_epoch, x + ) + dataset = model._distribution_standardize_user_data( + x, + y, sample_weight=sample_weight, class_weight=class_weight, batch_size=batch_size, validation_split=validation_split, - shuffle=shuffle) - - val_dataset = None - if validation_data: - val_x, val_y, val_sample_weights = ( - training_utils_v1.unpack_validation_data(validation_data)) - dist_utils.validate_inputs(val_x, val_y) - _, validation_steps = dist_utils.process_batch_and_step_size( - model._distribution_strategy, val_x, batch_size, validation_steps, - ModeKeys.TEST) - - val_dataset = model._distribution_standardize_user_data( - val_x, val_y, - sample_weight=val_sample_weights, - class_weight=None, - batch_size=batch_size, - validation_split=validation_split, - shuffle=shuffle, - 
allow_partial_batch=True) - elif validation_split: - raise ValueError('validation_split argument is not supported with ' - 'distribution strategies.') - - if backend.is_tpu_strategy(model._distribution_strategy): - steps_per_epoch = training_utils_v1.infer_steps_for_dataset( - model, dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch') - if steps_per_epoch is None: - raise ValueError('Number of steps could not be inferred from the data, ' - 'please pass the steps_per_epoch argument.') - - if not tf.executing_eagerly(): - # Run TPU training in a custom loop in graph mode. - return experimental_tpu_fit_loop( + shuffle=shuffle, + epochs=epochs, + ) + if not dist_utils.is_distributing_by_cloning(model): + with model._distribution_strategy.scope(): + (dataset, _, _) = model._standardize_user_data( + dataset, + sample_weight=sample_weight, + class_weight=class_weight, + batch_size=batch_size, + validation_split=validation_split, + shuffle=shuffle, + ) + + val_dataset = None + if validation_data: + ( + val_x, + val_y, + val_sample_weights, + ) = training_utils_v1.unpack_validation_data(validation_data) + dist_utils.validate_inputs(val_x, val_y) + _, validation_steps = dist_utils.process_batch_and_step_size( + model._distribution_strategy, + val_x, + batch_size, + validation_steps, + ModeKeys.TEST, + ) + + val_dataset = model._distribution_standardize_user_data( + val_x, + val_y, + sample_weight=val_sample_weights, + class_weight=None, + batch_size=batch_size, + validation_split=validation_split, + shuffle=shuffle, + allow_partial_batch=True, + ) + elif validation_split: + raise ValueError( + "validation_split argument is not supported with " + "distribution strategies." + ) + + if backend.is_tpu_strategy(model._distribution_strategy): + steps_per_epoch = training_utils_v1.infer_steps_for_dataset( + model, + dataset, + steps_per_epoch, + epochs, + steps_name="steps_per_epoch", + ) + if steps_per_epoch is None: + raise ValueError( + "Number of steps could not be inferred from the data, " + "please pass the steps_per_epoch argument." + ) + + if not tf.executing_eagerly(): + # Run TPU training in a custom loop in graph mode. 
+ return experimental_tpu_fit_loop( + model, + dataset, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + val_dataset=val_dataset, + initial_epoch=initial_epoch, + steps_per_epoch=steps_per_epoch, + validation_steps=validation_steps, + validation_freq=validation_freq, + ) + + return training_arrays_v1.fit_loop( model, dataset, + batch_size=batch_size, epochs=epochs, verbose=verbose, callbacks=callbacks, - val_dataset=val_dataset, + val_inputs=val_dataset, + shuffle=shuffle, initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch, validation_steps=validation_steps, - validation_freq=validation_freq) + validation_freq=validation_freq, + steps_name="steps_per_epoch", + ) - return training_arrays_v1.fit_loop( + def evaluate( + self, model, - dataset, - batch_size=batch_size, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - val_inputs=val_dataset, - shuffle=shuffle, - initial_epoch=initial_epoch, - steps_per_epoch=steps_per_epoch, - validation_steps=validation_steps, - validation_freq=validation_freq, - steps_name='steps_per_epoch') - - def evaluate(self, - model, - x=None, - y=None, - batch_size=None, - verbose=1, - sample_weight=None, - steps=None, - callbacks=None, - **kwargs): - """Evaluate loop for Distribution Strategies.""" - dist_utils.validate_inputs(x, y) - batch_size, steps = dist_utils.process_batch_and_step_size( - model._distribution_strategy, x, batch_size, steps, ModeKeys.TEST) - batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) - dataset = model._distribution_standardize_user_data( - x, y, - sample_weight=sample_weight, - batch_size=batch_size, - allow_partial_batch=True) - - if backend.is_tpu_strategy(model._distribution_strategy): - steps = training_utils_v1.infer_steps_for_dataset( - model, dataset, steps, steps_name='steps') - if steps is None: - raise ValueError('Number of steps could not be inferred from the data, ' - 'please pass the steps argument.') - - if not tf.executing_eagerly(): - # Run TPU evaluation in a custom loop in graph mode. - return experimental_tpu_test_loop( - model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) - - return training_arrays_v1.test_loop( + x=None, + y=None, + batch_size=None, + verbose=1, + sample_weight=None, + steps=None, + callbacks=None, + **kwargs + ): + """Evaluate loop for Distribution Strategies.""" + dist_utils.validate_inputs(x, y) + batch_size, steps = dist_utils.process_batch_and_step_size( + model._distribution_strategy, x, batch_size, steps, ModeKeys.TEST + ) + batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) + dataset = model._distribution_standardize_user_data( + x, + y, + sample_weight=sample_weight, + batch_size=batch_size, + allow_partial_batch=True, + ) + + if backend.is_tpu_strategy(model._distribution_strategy): + steps = training_utils_v1.infer_steps_for_dataset( + model, dataset, steps, steps_name="steps" + ) + if steps is None: + raise ValueError( + "Number of steps could not be inferred from the data, " + "please pass the steps argument." + ) + + if not tf.executing_eagerly(): + # Run TPU evaluation in a custom loop in graph mode. 
+ return experimental_tpu_test_loop( + model, + dataset, + verbose=verbose, + steps=steps, + callbacks=callbacks, + ) + + return training_arrays_v1.test_loop( + model, + inputs=dataset, + batch_size=batch_size, + verbose=verbose, + steps=steps, + callbacks=callbacks, + ) + + def predict( + self, model, - inputs=dataset, - batch_size=batch_size, - verbose=verbose, - steps=steps, - callbacks=callbacks) - - def predict(self, - model, - x, - batch_size=None, - verbose=0, - steps=None, - callbacks=None, - **kwargs): - """Predict loop for Distribution Strategies.""" - dist_utils.validate_inputs(x=x, y=None) - batch_size, steps = dist_utils.process_batch_and_step_size( - model._distribution_strategy, x, batch_size, steps, ModeKeys.PREDICT) - batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) - dataset = model._distribution_standardize_user_data( x, - batch_size=batch_size, - allow_partial_batch=True) - if backend.is_tpu_strategy(model._distribution_strategy): - steps = training_utils_v1.infer_steps_for_dataset( - model, dataset, steps, steps_name='steps') - if steps is None: - raise ValueError('Number of steps could not be inferred from the data, ' - 'please pass the steps argument.') - if not tf.executing_eagerly(): - return experimental_tpu_predict_loop( - model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) - return training_arrays_v1.predict_loop( - model, - dataset, - batch_size=batch_size, - verbose=verbose, - steps=steps, - callbacks=callbacks) + batch_size=None, + verbose=0, + steps=None, + callbacks=None, + **kwargs + ): + """Predict loop for Distribution Strategies.""" + dist_utils.validate_inputs(x=x, y=None) + batch_size, steps = dist_utils.process_batch_and_step_size( + model._distribution_strategy, x, batch_size, steps, ModeKeys.PREDICT + ) + batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) + dataset = model._distribution_standardize_user_data( + x, batch_size=batch_size, allow_partial_batch=True + ) + if backend.is_tpu_strategy(model._distribution_strategy): + steps = training_utils_v1.infer_steps_for_dataset( + model, dataset, steps, steps_name="steps" + ) + if steps is None: + raise ValueError( + "Number of steps could not be inferred from the data, " + "please pass the steps argument." 
+                )
+        if not tf.executing_eagerly():
+            return experimental_tpu_predict_loop(
+                model,
+                dataset,
+                verbose=verbose,
+                steps=steps,
+                callbacks=callbacks,
+            )
+        return training_arrays_v1.predict_loop(
+            model,
+            dataset,
+            batch_size=batch_size,
+            verbose=verbose,
+            steps=steps,
+            callbacks=callbacks,
+        )
 
 
 def _train_with_multi_worker(method):
-  """Decorator that handles multi worker training with distribution strategy."""
+    """Decorator for multi-worker training with a distribution strategy."""
 
-  def wrapper(model, **kwargs):
-    def _worker_fn(_):
-      callbacks = kwargs.pop('callbacks', None)
-      filtered_callbacks = dist_utils.filter_distributed_callbacks(
-          callbacks, model)
-      kwargs['callbacks'] = filtered_callbacks
-      return method(model, **kwargs)
+    def wrapper(model, **kwargs):
+        def _worker_fn(_):
+            callbacks = kwargs.pop("callbacks", None)
+            filtered_callbacks = dist_utils.filter_distributed_callbacks(
+                callbacks, model
+            )
+            kwargs["callbacks"] = filtered_callbacks
+            return method(model, **kwargs)
 
-    return dc.run_distribute_coordinator(
-        _worker_fn,
-        model._distribution_strategy)
+        return dc.run_distribute_coordinator(
+            _worker_fn, model._distribution_strategy
+        )
 
-  return wrapper
+    return wrapper
 
 
 class DistributionMultiWorkerTrainingLoop(training_utils_v1.TrainingLoop):
-  """Training loop for distribution strategy with multiple worker."""
+    """Training loop for distribution strategy with multiple workers."""
 
-  def __init__(self, single_worker_loop):
-    self._single_worker_loop = single_worker_loop
+    def __init__(self, single_worker_loop):
+        self._single_worker_loop = single_worker_loop
 
-  def fit(self, *args, **kwargs):
-    return _train_with_multi_worker(self._single_worker_loop.fit)(
-        *args, **kwargs)
+    def fit(self, *args, **kwargs):
+        return _train_with_multi_worker(self._single_worker_loop.fit)(
+            *args, **kwargs
+        )
 
-  def evaluate(self, *args, **kwargs):
-    return _train_with_multi_worker(self._single_worker_loop.evaluate)(
-        *args, **kwargs)
+    def evaluate(self, *args, **kwargs):
+        return _train_with_multi_worker(self._single_worker_loop.evaluate)(
+            *args, **kwargs
+        )
 
-  def predict(self, *args, **kwargs):
-    # Currently predict is still using the single worker implementation.
-    return self._single_worker_loop.predict(*args, **kwargs)
+    def predict(self, *args, **kwargs):
+        # Currently predict is still using the single worker implementation.
+        return self._single_worker_loop.predict(*args, **kwargs)
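The `_train_with_multi_worker` decorator above re-invokes the wrapped single-worker loop inside `dc.run_distribute_coordinator`, after filtering out callbacks that should not run on every worker. A minimal, self-contained sketch of the same wrapper pattern, with `run_coordinator` and `keep_callback` as hypothetical stand-ins for the TF coordinator and the callback filter (not the Keras implementation itself):

def keep_callback(cb):
    # Hypothetical filter: keep a callback unless it is marked chief-only.
    return not getattr(cb, "_chief_worker_only", False)


def train_with_coordinator(method, run_coordinator, strategy):
    """Wrap a single-worker training method so each worker re-runs it."""

    def wrapper(model, **kwargs):
        def _worker_fn(_):
            # Per worker: drop callbacks that should only run on the chief.
            callbacks = kwargs.pop("callbacks", None) or []
            kwargs["callbacks"] = [cb for cb in callbacks if keep_callback(cb)]
            return method(model, **kwargs)

        return run_coordinator(_worker_fn, strategy)

    return wrapper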
diff --git a/keras/engine/training_eager_test.py b/keras/engine/training_eager_test.py
index 0b4ecd42c91d..317ca1f790dc 100644
--- a/keras/engine/training_eager_test.py
+++ b/keras/engine/training_eager_test.py
@@ -14,341 +14,404 @@
 # ==============================================================================
 """Tests for training routines."""
 
+import numpy as np
 import tensorflow.compat.v2 as tf
-
 from absl.testing import parameterized
-import numpy as np
 
 import keras
-from keras.testing_infra import test_combinations
 from keras import metrics as metrics_module
+from keras.optimizers.legacy import rmsprop
+from keras.testing_infra import test_combinations
 from keras.testing_infra import test_utils
-from keras.optimizers.optimizer_v2 import rmsprop
 
 
 class TrainingTest(test_combinations.TestCase):
-
-  @test_combinations.run_all_keras_modes(always_skip_v1=True)
-  def test_dynamic_model_has_trainable_weights(self):
-    if not tf.executing_eagerly():
-      # Only test Eager modes, as Graph mode is not relevant for dynamic models.
-      return
-
-    class DynamicModel(keras.Model):
-
-      def __init__(self):
-        super().__init__(dynamic=True)
-        self.dense = keras.layers.Dense(
-            1, kernel_initializer='zeros', bias_initializer='ones')
-
-      def call(self, inputs):
-        return self.dense(inputs)
-
-    model = DynamicModel()
-    model.compile(
-        'rmsprop', 'mae',
-        run_eagerly=True)
-    hist = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
-    self.assertEqual(hist.history['loss'][-1], 1)
-    self.assertEqual(len(model.trainable_weights), 2)
-    loss = model.train_on_batch(np.zeros((1, 1)), np.zeros((1, 1)))
-    # The loss must have been updated if the trainable weights are taken into
-    # account during tracking.
-    self.assertLess(loss, 1)
-
-  @test_combinations.run_with_all_model_types(exclude_models='sequential')
-  @test_combinations.run_all_keras_modes
-  def test_model_methods_with_eager_tensors_multi_io(self):
-    if not tf.executing_eagerly():
-      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
-      # symbolic tensors has different requirements.
-      return
-
-    input_a = keras.layers.Input(shape=(3,), name='input_a')
-    input_b = keras.layers.Input(shape=(3,), name='input_b')
-
-    dense = keras.layers.Dense(4, name='dense')
-    dropout = keras.layers.Dropout(0.5, name='dropout')
-
-    model = test_utils.get_multi_io_model(
-        [input_a, dense], [input_b, dense, dropout])
-
-    optimizer = rmsprop.RMSprop(learning_rate=0.001)
-    loss = 'mse'
-    loss_weights = [1., 0.5]
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(
-        optimizer,
-        loss,
-        metrics=metrics,
-        loss_weights=loss_weights,
-        run_eagerly=test_utils.should_run_eagerly(),
-        sample_weight_mode=None)
-
-    input_a = tf.zeros(shape=(10, 3))
-    input_b = tf.zeros(shape=(10, 3))
-    target_a = tf.zeros(shape=(10, 4))
-    target_b = tf.zeros(shape=(10, 4))
-
-    model.fit(
-        [input_a, input_b], [target_a, target_b],
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    # Test: no shuffle.
-    model.fit(
-        [input_a, input_b], [target_a, target_b],
-        epochs=1,
-        batch_size=5,
-        verbose=0,
-        shuffle=False)
-    # Test: validation data.
-    model.fit([input_a, input_b], [target_a, target_b],
-              epochs=1, batch_size=2, verbose=0,
-              validation_data=([input_a, input_b], [target_a, target_b]))
-    model.train_on_batch([input_a, input_b], [target_a, target_b])
-    model.predict([input_a, input_b], batch_size=5)
-    model.evaluate([input_a, input_b], [target_a, target_b],
-                   batch_size=2, verbose=0)
-    model.test_on_batch([input_a, input_b], [target_a, target_b])
-
-    # Test: mix np and tensors.
-    input_b = np.zeros(shape=(10, 3)).astype('float32')
-    target_b = np.zeros(shape=(10, 4)).astype('float32')
-    model.fit(
-        [input_a, input_b], [target_a, target_b],
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    model.fit([input_a, input_b], [target_a, target_b],
-              epochs=1, batch_size=2, verbose=0,
-              validation_data=([input_a, input_b], [target_a, target_b]))
-    model.fit(
-        [input_a, input_b], [target_a, target_b],
-        epochs=1,
-        batch_size=5,
-        verbose=0,
-        shuffle=False)
-    model.train_on_batch([input_a, input_b], [target_a, target_b])
-    model.predict([input_a, input_b], batch_size=5)
-    model.evaluate([input_a, input_b], [target_a, target_b],
-                   batch_size=2, verbose=0)
-    model.test_on_batch([input_a, input_b], [target_a, target_b])
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  def test_model_methods_with_eager_tensors_single_io(self):
-    if not tf.executing_eagerly():
-      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
-      # symbolic tensors has different requirements.
-      return
-
-    model = test_utils.get_small_mlp(10, 4, 3)
-
-    optimizer = rmsprop.RMSprop(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(
-        optimizer,
-        loss,
-        metrics=metrics,
-        run_eagerly=test_utils.should_run_eagerly())
-
-    inputs = tf.zeros(shape=(10, 3))
-    targets = tf.zeros(shape=(10, 4))
-
-    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
-    model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
-    model.fit(inputs, targets, epochs=1, batch_size=4, verbose=0,
-              validation_data=(inputs, targets))
-    model.evaluate(inputs, targets, batch_size=2, verbose=0)
-    model.predict(inputs, batch_size=2)
-    model.train_on_batch(inputs, targets)
-    model.test_on_batch(inputs, targets)
-
-  @test_combinations.run_with_all_model_types
-  def test_model_fit_and_validation_with_missing_arg_errors(self):
-    model = test_utils.get_small_mlp(10, 4, 3)
-    model.compile(optimizer=rmsprop.RMSprop(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=True)
-
-    x = tf.zeros(shape=(10, 3))
-    y = tf.zeros(shape=(10, 4))
-    dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
-    validation_dataset = tf.data.Dataset.from_tensor_slices(
-        (x, y)).repeat().batch(5)  # Infinite dataset.
-
-    model.fit(dataset, epochs=1, verbose=0)
-
-    # Step argument is required for infinite datasets.
-    with self.assertRaises(ValueError):
-      model.fit(dataset, steps_per_epoch=2, epochs=1, verbose=0,
-                validation_data=validation_dataset)
-    with self.assertRaises(ValueError):
-      model.fit(dataset, steps_per_epoch=2, epochs=1, verbose=0,
-                validation_data=validation_dataset)
-
-  # TODO(b/120931266): Enable test on subclassed models after bug causing an
-  # extra dimension to be added to predict outputs is fixed.
-  @test_combinations.run_with_all_model_types(exclude_models='subclass')
-  def test_generator_methods(self):
-    model = test_utils.get_small_mlp(10, 4, 3)
-    optimizer = rmsprop.RMSprop(learning_rate=0.001)
-    model.compile(
-        optimizer,
-        loss='mse',
-        metrics=['mae', metrics_module.CategoricalAccuracy()],
-        run_eagerly=True)
-
-    x = np.random.random((10, 3))
-    y = np.random.random((10, 4))
-
-    def numpy_iterator():
-      while True:
-        yield x, y
-
-    model.fit_generator(numpy_iterator(), steps_per_epoch=3, epochs=1)
-    model.evaluate_generator(numpy_iterator(), steps=3)
-
-    def inference_numpy_iterator():
-      while True:
-        yield x
-
-    out = model.predict_generator(inference_numpy_iterator(), steps=3)
-    self.assertEqual(out.shape, (30, 4))
+    @test_combinations.run_all_keras_modes(always_skip_v1=True)
+    def test_dynamic_model_has_trainable_weights(self):
+        if not tf.executing_eagerly():
+            # Only test Eager modes, as Graph mode is not relevant for dynamic
+            # models.
+            return
+
+        class DynamicModel(keras.Model):
+            def __init__(self):
+                super().__init__(dynamic=True)
+                self.dense = keras.layers.Dense(
+                    1, kernel_initializer="zeros", bias_initializer="ones"
+                )
+
+            def call(self, inputs):
+                return self.dense(inputs)
+
+        model = DynamicModel()
+        model.compile("rmsprop", "mae", run_eagerly=True)
+        hist = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+        self.assertEqual(hist.history["loss"][-1], 1)
+        self.assertEqual(len(model.trainable_weights), 2)
+        loss = model.train_on_batch(np.zeros((1, 1)), np.zeros((1, 1)))
+        # The loss must have been updated if the trainable weights are taken
+        # into account during tracking.
+        self.assertLess(loss, 1)
+
+    @test_combinations.run_with_all_model_types(exclude_models="sequential")
+    @test_combinations.run_all_keras_modes
+    def test_model_methods_with_eager_tensors_multi_io(self):
+        if not tf.executing_eagerly():
+            # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+            # symbolic tensors has different requirements.
+            return
+
+        input_a = keras.layers.Input(shape=(3,), name="input_a")
+        input_b = keras.layers.Input(shape=(3,), name="input_b")
+
+        dense = keras.layers.Dense(4, name="dense")
+        dropout = keras.layers.Dropout(0.5, name="dropout")
+
+        model = test_utils.get_multi_io_model(
+            [input_a, dense], [input_b, dense, dropout]
+        )
+
+        optimizer = rmsprop.RMSprop(learning_rate=0.001)
+        loss = "mse"
+        loss_weights = [1.0, 0.5]
+        metrics = ["mae", metrics_module.CategoricalAccuracy()]
+        model.compile(
+            optimizer,
+            loss,
+            metrics=metrics,
+            loss_weights=loss_weights,
+            run_eagerly=test_utils.should_run_eagerly(),
+            sample_weight_mode=None,
+        )
+
+        input_a = tf.zeros(shape=(10, 3))
+        input_b = tf.zeros(shape=(10, 3))
+        target_a = tf.zeros(shape=(10, 4))
+        target_b = tf.zeros(shape=(10, 4))
+
+        model.fit(
+            [input_a, input_b],
+            [target_a, target_b],
+            epochs=1,
+            batch_size=5,
+            verbose=0,
+        )
+        # Test: no shuffle.
+        model.fit(
+            [input_a, input_b],
+            [target_a, target_b],
+            epochs=1,
+            batch_size=5,
+            verbose=0,
+            shuffle=False,
+        )
+        # Test: validation data.
+        model.fit(
+            [input_a, input_b],
+            [target_a, target_b],
+            epochs=1,
+            batch_size=2,
+            verbose=0,
+            validation_data=([input_a, input_b], [target_a, target_b]),
+        )
+        model.train_on_batch([input_a, input_b], [target_a, target_b])
+        model.predict([input_a, input_b], batch_size=5)
+        model.evaluate(
+            [input_a, input_b], [target_a, target_b], batch_size=2, verbose=0
+        )
+        model.test_on_batch([input_a, input_b], [target_a, target_b])
+
+        # Test: mix np and tensors.
+        input_b = np.zeros(shape=(10, 3)).astype("float32")
+        target_b = np.zeros(shape=(10, 4)).astype("float32")
+        model.fit(
+            [input_a, input_b],
+            [target_a, target_b],
+            epochs=1,
+            batch_size=5,
+            verbose=0,
+        )
+        model.fit(
+            [input_a, input_b],
+            [target_a, target_b],
+            epochs=1,
+            batch_size=2,
+            verbose=0,
+            validation_data=([input_a, input_b], [target_a, target_b]),
+        )
+        model.fit(
+            [input_a, input_b],
+            [target_a, target_b],
+            epochs=1,
+            batch_size=5,
+            verbose=0,
+            shuffle=False,
+        )
+        model.train_on_batch([input_a, input_b], [target_a, target_b])
+        model.predict([input_a, input_b], batch_size=5)
+        model.evaluate(
+            [input_a, input_b], [target_a, target_b], batch_size=2, verbose=0
+        )
+        model.test_on_batch([input_a, input_b], [target_a, target_b])
+
+    @test_combinations.run_with_all_model_types
+    @test_combinations.run_all_keras_modes
+    def test_model_methods_with_eager_tensors_single_io(self):
+        if not tf.executing_eagerly():
+            # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+            # symbolic tensors has different requirements.
+            return
+
+        model = test_utils.get_small_mlp(10, 4, 3)
+
+        optimizer = rmsprop.RMSprop(learning_rate=0.001)
+        loss = "mse"
+        metrics = ["mae", metrics_module.CategoricalAccuracy()]
+        model.compile(
+            optimizer,
+            loss,
+            metrics=metrics,
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+
+        inputs = tf.zeros(shape=(10, 3))
+        targets = tf.zeros(shape=(10, 4))
+
+        model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
+        model.fit(
+            inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False
+        )
+        model.fit(
+            inputs,
+            targets,
+            epochs=1,
+            batch_size=4,
+            verbose=0,
+            validation_data=(inputs, targets),
+        )
+        model.evaluate(inputs, targets, batch_size=2, verbose=0)
+        model.predict(inputs, batch_size=2)
+        model.train_on_batch(inputs, targets)
+        model.test_on_batch(inputs, targets)
+
+    @test_combinations.run_with_all_model_types
+    def test_model_fit_and_validation_with_missing_arg_errors(self):
+        model = test_utils.get_small_mlp(10, 4, 3)
+        model.compile(
+            optimizer=rmsprop.RMSprop(learning_rate=0.001),
+            loss="mse",
+            run_eagerly=True,
+        )
+
+        x = tf.zeros(shape=(10, 3))
+        y = tf.zeros(shape=(10, 4))
+        dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
+        validation_dataset = (
+            tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(5)
+        )  # Infinite dataset.
+
+        model.fit(dataset, epochs=1, verbose=0)
+
+        # Step argument is required for infinite datasets.
+        with self.assertRaises(ValueError):
+            model.fit(
+                dataset,
+                steps_per_epoch=2,
+                epochs=1,
+                verbose=0,
+                validation_data=validation_dataset,
+            )
+        with self.assertRaises(ValueError):
+            model.fit(
+                dataset,
+                steps_per_epoch=2,
+                epochs=1,
+                verbose=0,
+                validation_data=validation_dataset,
+            )
+
+    # TODO(b/120931266): Enable test on subclassed models after bug causing an
+    # extra dimension to be added to predict outputs is fixed.
+    @test_combinations.run_with_all_model_types(exclude_models="subclass")
+    def test_generator_methods(self):
+        model = test_utils.get_small_mlp(10, 4, 3)
+        optimizer = rmsprop.RMSprop(learning_rate=0.001)
+        model.compile(
+            optimizer,
+            loss="mse",
+            metrics=["mae", metrics_module.CategoricalAccuracy()],
+            run_eagerly=True,
+        )
+
+        x = np.random.random((10, 3))
+        y = np.random.random((10, 4))
+
+        def numpy_iterator():
+            while True:
+                yield x, y
+
+        model.fit_generator(numpy_iterator(), steps_per_epoch=3, epochs=1)
+        model.evaluate_generator(numpy_iterator(), steps=3)
+
+        def inference_numpy_iterator():
+            while True:
+                yield x
+
+        out = model.predict_generator(inference_numpy_iterator(), steps=3)
+        self.assertEqual(out.shape, (30, 4))
 
 
 class CorrectnessTest(test_combinations.TestCase):
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  @parameterized.named_parameters([
-      ('', dict()),
-      ('_clipvalue_inf', {'clipvalue': 999999}),
-      ('_clipnorm_inf', {'clipnorm': 999999}),
-  ])
-  def test_loss_correctness(self, optimizer_kwargs):
-    # Test that training loss is the same in eager and graph
-    # (by comparing it to a reference value in a deterministic case)
-    layers = [
-        keras.layers.Dense(3, activation='relu',
-                           kernel_initializer='ones'),
-        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones')]
-    model = test_utils.get_model_from_layers(layers, input_shape=(4,))
-    model.compile(
-        loss='sparse_categorical_crossentropy',
-        optimizer=rmsprop.RMSprop(learning_rate=0.001, **optimizer_kwargs),
-        run_eagerly=test_utils.should_run_eagerly())
-    x = np.ones((100, 4))
-    np.random.seed(123)
-    y = np.random.randint(0, 1, size=(100, 1))
-    history = model.fit(x, y, epochs=1, batch_size=10)
-    self.assertAlmostEqual(history.history['loss'][-1], 0.5836, 4)
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  def test_loss_correctness_clipvalue_zero(self):
-    # Test that training loss is the same in eager and graph
-    # (by comparing it to a reference value in a deterministic case)
-    # And confirm that setting clipvalue to zero stops all training
-    layers = [
-        keras.layers.Dense(3, activation='relu',
-                           kernel_initializer='ones'),
-        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones')]
-    model = test_utils.get_model_from_layers(layers, input_shape=(4,))
-    model.compile(
-        loss='sparse_categorical_crossentropy',
-        optimizer=rmsprop.RMSprop(learning_rate=0.001, clipvalue=0.0),
-        run_eagerly=test_utils.should_run_eagerly())
-    x = np.ones((100, 4))
-    np.random.seed(123)
-    y = np.random.randint(0, 1, size=(100, 1))
-    history = model.fit(x, y, epochs=3, batch_size=10)
-    self.assertAlmostEqual(history.history['loss'][-3], 0.6931, 4)
-    self.assertAlmostEqual(history.history['loss'][-2], 0.6931, 4)
-    self.assertAlmostEqual(history.history['loss'][-1], 0.6931, 4)
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  def test_loss_correctness_with_iterator(self):
-    # Test that training loss is the same in eager and graph
-    # (by comparing it to a reference value in a deterministic case)
-    layers = [
-        keras.layers.Dense(3, activation='relu',
-                           kernel_initializer='ones'),
-        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones')]
-    model = test_utils.get_model_from_layers(layers, input_shape=(4,))
-    model.compile(
-        loss='sparse_categorical_crossentropy',
-        optimizer=rmsprop.RMSprop(learning_rate=0.001),
-        run_eagerly=test_utils.should_run_eagerly())
-    x = np.ones((100, 4), dtype=np.float32)
-    np.random.seed(123)
-    y = np.random.randint(0, 1, size=(100, 1))
-    dataset = tf.data.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    history = model.fit(dataset, epochs=1, steps_per_epoch=10)
-    self.assertAlmostEqual(history.history['loss'][-1], 0.5836, 4)
-
-  @parameterized.named_parameters([
-      ('_None', None, 0., 4.),
-      ('_False', False, 4., 4.),
-      ('_True', True, 0., 0.),
-  ])
-  def test_nested_model_learning_phase(self, training,
-                                       expected_training_loss,
-                                       expected_validation_loss):
-    """Tests that learning phase is correctly set in an intermediate layer."""
-
-    def _make_unregularized_model():
-      inputs = keras.Input((4,))
-      # Zero out activations when `training=True`.
-      x = keras.layers.Dropout(1. - 1. / (1 << 24))(inputs)
-      x = keras.layers.Dense(
-          10,
-          activation='relu',
-          trainable=False,
-          bias_initializer='zeros',
-          kernel_initializer='ones')(
-              x)  # Just sum together all the activations.
-      outputs = keras.layers.Dense(3)(x)
-      return keras.Model(inputs, outputs)
-
-    def _regularize_model(unregularized_model):
-      # Regularize the most recent activations of a post-dropout layer.
-      sample_activations = unregularized_model.get_layer(
-          index=-2).get_output_at(-1)
-      regularization_loss = keras.backend.mean(sample_activations)
-      unregularized_model.add_loss(regularization_loss)
-      unregularized_model.add_metric(
-          regularization_loss, aggregation='mean', name='regularization_loss')
-      inputs = keras.Input(unregularized_model.inputs[0].shape[1:])
-      logits = unregularized_model(inputs, training=training)
-      outputs = keras.activations.softmax(logits)
-      model = keras.Model(inputs, outputs)
-      return model
-
-    # Make and compile models.
-    model = _regularize_model(_make_unregularized_model())
-    model.compile('sgd', 'sparse_categorical_crossentropy')
-    # Prepare fake data.
-    x = np.ones((20, 4)).astype(np.float32)
-    y = np.random.randint(0, 3, size=(20,)).astype(np.int64)
-    dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2)
-    results = model.evaluate(dataset)
-    evaluation_results = dict(zip(model.metrics_names, results))
-    # Rate of dropout depends on the learning phase.
-    self.assertEqual(evaluation_results['regularization_loss'],
-                     expected_validation_loss)
-    history = model.fit(dataset, epochs=2, validation_data=dataset).history
-    self.assertAllEqual(history['regularization_loss'],
-                        [expected_training_loss] * 2)
-    self.assertAllEqual(history['val_regularization_loss'],
-                        [expected_validation_loss] * 2)
-
-
-if __name__ == '__main__':
-  tf.compat.v1.enable_eager_execution()
-  tf.test.main()
+    @test_combinations.run_with_all_model_types
+    @test_combinations.run_all_keras_modes
+    @parameterized.named_parameters(
+        [
+            ("", dict()),
+            ("_clipvalue_inf", {"clipvalue": 999999}),
+            ("_clipnorm_inf", {"clipnorm": 999999}),
+        ]
+    )
+    def test_loss_correctness(self, optimizer_kwargs):
+        # Test that training loss is the same in eager and graph
+        # (by comparing it to a reference value in a deterministic case)
+        layers = [
+            keras.layers.Dense(3, activation="relu", kernel_initializer="ones"),
+            keras.layers.Dense(
+                2, activation="softmax", kernel_initializer="ones"
+            ),
+        ]
+        model = test_utils.get_model_from_layers(layers, input_shape=(4,))
+        model.compile(
+            loss="sparse_categorical_crossentropy",
+            optimizer=rmsprop.RMSprop(learning_rate=0.001, **optimizer_kwargs),
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        x = np.ones((100, 4))
+        np.random.seed(123)
+        y = np.random.randint(0, 1, size=(100, 1))
+        history = model.fit(x, y, epochs=1, batch_size=10)
+        self.assertAlmostEqual(history.history["loss"][-1], 0.5836, 4)
+
+    @test_combinations.run_with_all_model_types
+    @test_combinations.run_all_keras_modes
+    def test_loss_correctness_clipvalue_zero(self):
+        # Test that training loss is the same in eager and graph
+        # (by comparing it to a reference value in a deterministic case)
+        # And confirm that setting clipvalue to zero stops all training
+        layers = [
+            keras.layers.Dense(3, activation="relu", kernel_initializer="ones"),
+            keras.layers.Dense(
+                2, activation="softmax", kernel_initializer="ones"
+            ),
+        ]
+        model = test_utils.get_model_from_layers(layers, input_shape=(4,))
+        model.compile(
+            loss="sparse_categorical_crossentropy",
+            optimizer=rmsprop.RMSprop(learning_rate=0.001, clipvalue=0.0),
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        x = np.ones((100, 4))
+        np.random.seed(123)
+        y = np.random.randint(0, 1, size=(100, 1))
+        history = model.fit(x, y, epochs=3, batch_size=10)
+        self.assertAlmostEqual(history.history["loss"][-3], 0.6931, 4)
+        self.assertAlmostEqual(history.history["loss"][-2], 0.6931, 4)
+        self.assertAlmostEqual(history.history["loss"][-1], 0.6931, 4)
+
+    @test_combinations.run_with_all_model_types
+    @test_combinations.run_all_keras_modes
+    def test_loss_correctness_with_iterator(self):
+        # Test that training loss is the same in eager and graph
+        # (by comparing it to a reference value in a deterministic case)
+        layers = [
+            keras.layers.Dense(3, activation="relu", kernel_initializer="ones"),
+            keras.layers.Dense(
+                2, activation="softmax", kernel_initializer="ones"
+            ),
+        ]
+        model = test_utils.get_model_from_layers(layers, input_shape=(4,))
+        model.compile(
+            loss="sparse_categorical_crossentropy",
+            optimizer=rmsprop.RMSprop(learning_rate=0.001),
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        x = np.ones((100, 4), dtype=np.float32)
+        np.random.seed(123)
+        y = np.random.randint(0, 1, size=(100, 1))
+        dataset = tf.data.Dataset.from_tensor_slices((x, y))
+        dataset = dataset.repeat(100)
+        dataset = dataset.batch(10)
+        history = model.fit(dataset, epochs=1, steps_per_epoch=10)
+        self.assertAlmostEqual(history.history["loss"][-1], 0.5836, 4)
+
+    @parameterized.named_parameters(
+        [
+            ("_None", None, 0.0, 4.0),
+            ("_False", False, 4.0, 4.0),
+            ("_True", True, 0.0, 0.0),
+        ]
+    )
+    def test_nested_model_learning_phase(
+        self, training, expected_training_loss, expected_validation_loss
+    ):
+        """Tests that learning phase is set in an intermediate layer."""
+
+        def _make_unregularized_model():
+            inputs = keras.Input((4,))
+            # Zero out activations when `training=True`.
+            x = keras.layers.Dropout(1.0 - 1.0 / (1 << 24))(inputs)
+            x = keras.layers.Dense(
+                10,
+                activation="relu",
+                trainable=False,
+                bias_initializer="zeros",
+                kernel_initializer="ones",
+            )(
+                x
+            )  # Just sum together all the activations.
+            outputs = keras.layers.Dense(3)(x)
+            return keras.Model(inputs, outputs)
+
+        def _regularize_model(unregularized_model):
+            # Regularize the most recent activations of a post-dropout layer.
+            sample_activations = unregularized_model.get_layer(
+                index=-2
+            ).get_output_at(-1)
+            regularization_loss = keras.backend.mean(sample_activations)
+            unregularized_model.add_loss(regularization_loss)
+            unregularized_model.add_metric(
+                regularization_loss,
+                aggregation="mean",
+                name="regularization_loss",
+            )
+            inputs = keras.Input(unregularized_model.inputs[0].shape[1:])
+            logits = unregularized_model(inputs, training=training)
+            outputs = keras.activations.softmax(logits)
+            model = keras.Model(inputs, outputs)
+            return model
+
+        # Make and compile models.
+        model = _regularize_model(_make_unregularized_model())
+        model.compile("sgd", "sparse_categorical_crossentropy")
+        # Prepare fake data.
+        x = np.ones((20, 4)).astype(np.float32)
+        y = np.random.randint(0, 3, size=(20,)).astype(np.int64)
+        dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2)
+        results = model.evaluate(dataset)
+        evaluation_results = dict(zip(model.metrics_names, results))
+        # Rate of dropout depends on the learning phase.
+        self.assertEqual(
+            evaluation_results["regularization_loss"], expected_validation_loss
+        )
+        history = model.fit(dataset, epochs=2, validation_data=dataset).history
+        self.assertAllEqual(
+            history["regularization_loss"], [expected_training_loss] * 2
+        )
+        self.assertAllEqual(
+            history["val_regularization_loss"], [expected_validation_loss] * 2
+        )
+
+
+if __name__ == "__main__":
+    tf.compat.v1.enable_eager_execution()
+    tf.test.main()
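The expected values in `test_nested_model_learning_phase` above can be sanity-checked by hand: with `kernel_initializer="ones"` and all-ones inputs of shape (20, 4), every ReLU activation of the 10-unit dense layer is 4.0, so the mean-activation regularization loss is 4.0 when the near-1.0 dropout rate passes inputs through (inference) and 0.0 when it zeroes them out (training). A back-of-the-envelope check in plain NumPy (not part of the test file):

import numpy as np

x = np.ones((20, 4), dtype="float32")  # fake data used by the test
w = np.ones((4, 10), dtype="float32")  # kernel_initializer="ones"
acts = np.maximum(x @ w, 0.0)          # relu(dense(x)); every entry is 4.0
print(acts.mean())                     # 4.0 == expected_validation_loss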
diff --git a/keras/engine/training_eager_v1.py b/keras/engine/training_eager_v1.py
index 8d02110610c7..427b816f8478 100644
--- a/keras/engine/training_eager_v1.py
+++ b/keras/engine/training_eager_v1.py
@@ -14,351 +14,392 @@
 # ==============================================================================
 """Keras training and evaluation routines for eager execution."""
 
-import tensorflow.compat.v2 as tf
-# pylint: disable=protected-access
-
 import numpy as np
+import tensorflow.compat.v2 as tf
 
-from tensorflow.python.eager.backprop import GradientTape
 from keras import backend
 from keras.engine import training_utils
 from keras.engine import training_utils_v1
 from keras.mixed_precision import loss_scale_optimizer
 from keras.utils import losses_utils
+
+# isort: off
+from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.platform import tf_logging as logging
 
 
 def _eager_loss_fn(outputs, targets, loss_fn, output_name):
-  with backend.name_scope(output_name + '_loss'):
-    loss = loss_fn(targets, outputs)
-    return loss
+    with backend.name_scope(output_name + "_loss"):
+        loss = loss_fn(targets, outputs)
+        return loss
 
 
 def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
-  """Calculates the metrics for each output of the given model.
-
-  Args:
-    model: The model on which metrics are being calculated.
-    outputs: The outputs of the given model.
-    targets: The predictions or targets of the given model.
-    sample_weights: Optional list of sample weights for each output.
-    masks: Optional list of masks for each output.
-
-  Returns:
-    Returns the metric results for each output of the model.
-  """
-  outputs = tf.nest.flatten(outputs)
-  targets = tf.nest.flatten(targets)
-  # Invoke all(weighted and unweighted) metrics.
-  metric_results = []
-  if targets:
-    # Insert None values corresponding to the targets that need to be skipped
-    # on the model.
-    if len(model._targets) != len(targets):
-      new_targets = [
-          None if t is None else targets.pop(0) for t in model._targets
-      ]
-      targets = new_targets
-
-    metric_results = model._handle_metrics(
-        outputs,
-        targets=targets,
-        sample_weights=sample_weights,
-        masks=masks,
-        return_weighted_and_unweighted_metrics=True,
-        skip_target_masks=model._prepare_skip_target_masks())
-
-  # Add metric results from the `add_metric` metrics.
-  metric_results.extend([
-      m.result()
-      for m in model.metrics
-      if m not in model._compile_metric_functions
-  ])
-  return metric_results
+    """Calculates the metrics for each output of the given model.
+
+    Args:
+        model: The model on which metrics are being calculated.
+        outputs: The outputs of the given model.
+        targets: The predictions or targets of the given model.
+        sample_weights: Optional list of sample weights for each output.
+        masks: Optional list of masks for each output.
+
+    Returns:
+        Returns the metric results for each output of the model.
+    """
+    outputs = tf.nest.flatten(outputs)
+    targets = tf.nest.flatten(targets)
+    # Invoke all (weighted and unweighted) metrics.
+    metric_results = []
+    if targets:
+        # Insert None values corresponding to the targets that need to be
+        # skipped on the model.
+        if len(model._targets) != len(targets):
+            new_targets = [
+                None if t is None else targets.pop(0) for t in model._targets
+            ]
+            targets = new_targets
+
+        metric_results = model._handle_metrics(
+            outputs,
+            targets=targets,
+            sample_weights=sample_weights,
+            masks=masks,
+            return_weighted_and_unweighted_metrics=True,
+            skip_target_masks=model._prepare_skip_target_masks(),
+        )
+
+    # Add metric results from the `add_metric` metrics.
+    metric_results.extend(
+        [
+            m.result()
+            for m in model.metrics
+            if m not in model._compile_metric_functions
+        ]
+    )
+    return metric_results
+
+
+def _model_loss(
+    model,
+    inputs,
+    targets,
+    output_loss_metrics=None,
+    sample_weights=None,
+    training=False,
+):
+    """Calculates the loss for a given model.
+
+    Args:
+        model: The model on which metrics are being calculated.
+        inputs: Either a dictionary of inputs to the model or a list of input
+            arrays.
+        targets: List of target arrays.
+        output_loss_metrics: List of metrics that are used to aggregate output
+            loss values.
+        sample_weights: Optional list of sample weight arrays.
+        training: Whether the model should be run in inference or training mode.
+
+    Returns:
+        Returns the model output, total loss, loss value calculated using the
+        specified loss function and masks for each output. The total loss
+        includes regularization losses and applies masking and sample weighting
+        to the loss value.
+    """
+    # TODO(psv): Dedup code here with graph mode prepare_total_loss() fn.
+    # Used to keep track of the total loss value (stateless).
+    # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+    #      loss_weight_2 * output_2_loss_fn(...) +
+    #      layer losses.
+    total_loss = 0
+    kwargs = {}
+    if model._expects_training_arg:
+        kwargs["training"] = training
+    if len(inputs) == 1 and not isinstance(inputs, dict):
+        inputs = inputs[0]
+
+    # Allow mixed `NumPy` and `EagerTensor` input here.
+    if any(
+        isinstance(input_t, (np.ndarray, float, int))
+        for input_t in tf.nest.flatten(inputs)
+    ):
+        inputs = tf.nest.map_structure(tf.convert_to_tensor, inputs)
+
+    outs = model(inputs, **kwargs)
+    outs = tf.nest.flatten(outs)
+
+    if targets:
+        targets = training_utils_v1.cast_if_floating_dtype_and_mismatch(
+            targets, outs
+        )
+    # TODO(sallymatson/psv): check if we should do same mismatch fix for weights
+    if sample_weights:
+        sample_weights = [
+            training_utils_v1.cast_if_floating_dtype(tf.convert_to_tensor(val))
+            if val is not None
+            else None
+            for val in sample_weights
+        ]
+
+    masks = [getattr(t, "_keras_mask", None) for t in outs]
+    targets = tf.nest.flatten(targets)
+
+    # Used to keep track of individual output losses.
+    output_losses = []
+
+    with backend.name_scope("loss"):
+        loss_fns = [
+            loss_fn for loss_fn in model.loss_functions if loss_fn is not None
+        ]
+        custom_losses = model.losses  # Regularization losses
+
+        if not loss_fns and not custom_losses:
+            if training:
+                raise ValueError(
+                    "The model cannot be trained "
+                    "because it has no loss to optimize."
+                )
+            else:
+                raise ValueError(
+                    "The model cannot be evaluated "
+                    "because it has no loss to compute."
+                )
+
+        for i, loss_fn in enumerate(loss_fns):
+            weights = sample_weights[i] if sample_weights else None
+            mask = masks[i]
+            with backend.name_scope(model.output_names[i] + "_loss"):
+                if mask is not None:
+                    mask = tf.cast(mask, outs[i].dtype)
+                    # Update weights with mask.
+                    if weights is None:
+                        weights = mask
+                    else:
+                        # Update dimensions of weights to match with mask if
+                        # possible.
+                        weights = tf.cast(weights, outs[i].dtype)
+                        (
+                            mask,
+                            _,
+                            weights,
+                        ) = losses_utils.squeeze_or_expand_dimensions(
+                            mask, sample_weight=weights
+                        )
+                        weights *= mask
+
+                if hasattr(loss_fn, "reduction"):
+                    per_sample_losses = loss_fn.call(targets[i], outs[i])
+                    weighted_losses = losses_utils.compute_weighted_loss(
+                        per_sample_losses,
+                        sample_weight=weights,
+                        reduction=losses_utils.ReductionV2.NONE,
+                    )
+                    loss_reduction = loss_fn.reduction
+
+                    # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE`
+                    # for all compile use cases.
+                    if loss_reduction == losses_utils.ReductionV2.AUTO:
+                        loss_reduction = (
+                            losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
+                        )
+
+                    # Compute the stateless loss value.
+                    output_loss = losses_utils.reduce_weighted_loss(
+                        weighted_losses, reduction=loss_reduction
+                    )
+                else:
+                    # Compute the stateless loss value for a custom loss class.
+                    # Here we assume that the class takes care of loss reduction
+                    # because if this class returns a vector value we cannot
+                    # differentiate between use case where a custom optimizer
+                    # expects a vector loss value vs unreduced per-sample loss
+                    # value.
+                    output_loss = loss_fn(
+                        targets[i], outs[i], sample_weight=weights
+                    )
+                    loss_reduction = (
+                        losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
+                    )
+
+                # If the number of outputs is 1 then we don't append the loss
+                # metric associated with each model output. When there are
+                # multiple outputs associated with a model, each output's loss
+                # is calculated and returned as part of the loss_metrics.
+                if len(model.outputs) > 1:
+                    # Keep track of the stateful output loss result.
+                    output_losses.append(output_loss_metrics[i](output_loss))
+
+                # Scale output loss for distribution. For custom losses we
+                # assume reduction was mean.
+                if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE:
+                    output_loss = losses_utils.scale_loss_for_distribution(
+                        output_loss
+                    )
+                total_loss += model._loss_weights_list[i] * output_loss
+
+        # Add regularization losses
+        if custom_losses:
+            total_loss += losses_utils.scale_loss_for_distribution(
+                tf.add_n(custom_losses)
+            )
+    return outs, total_loss, output_losses, masks
 
 
-def _model_loss(model,
+def _process_single_batch(
+    model,
+    inputs,
+    targets,
+    output_loss_metrics=None,
+    sample_weights=None,
+    training=False,
+):
+    """Calculate the loss and gradient for one input batch.
+
+    The model weights are updated if training is set to True.
+
+    Args:
+        model: Model whose loss has to be calculated.
+        inputs: List of input arrays.
+        targets: List of target arrays.
+        output_loss_metrics: List of metrics that are used to aggregate output
+            loss values.
+        sample_weights: Optional list of sample weight arrays.
+        training: The boolean represents if the weights of the model are
+            updated. 'fit' methods will set this to True while 'evaluate'
+            methods will set this to False.
+
+    Returns:
+        output of the model, total loss, the loss and the mask
+        associated with each output.
+
+    Raises:
+        ValueError: If the model has no loss to optimize.
+    """
+    with backend.eager_learning_phase_scope(
+        1 if training else 0
+    ), training_utils.RespectCompiledTrainableState(model):
+        with GradientTape() as tape:
+            outs, total_loss, output_losses, masks = _model_loss(
+                model,
                 inputs,
                 targets,
-                output_loss_metrics=None,
-                sample_weights=None,
-                training=False):
-  """Calculates the loss for a given model.
-
-  Args:
-    model: The model on which metrics are being calculated.
-    inputs: Either a dictionary of inputs to the model or a list of input
-      arrays.
-    targets: List of target arrays.
-    output_loss_metrics: List of metrics that are used to aggregated output
-      loss values.
-    sample_weights: Optional list of sample weight arrays.
-    training: Whether the model should be run in inference or training mode.
-
-  Returns:
-    Returns the model output, total loss, loss value calculated using the
-    specified loss function and masks for each output. The total loss includes
-    regularization losses and applies masking and sample weighting
-    to the loss value.
-  """
-  # TODO(psv): Dedup code here with graph mode prepare_total_loss() fn.
-  # Used to keep track of the total loss value (stateless).
-  # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
-  #      loss_weight_2 * output_2_loss_fn(...) +
-  #      layer losses.
-  total_loss = 0
-  kwargs = {}
-  if model._expects_training_arg:
-    kwargs['training'] = training
-  if len(inputs) == 1 and not isinstance(inputs, dict):
-    inputs = inputs[0]
-
-  # Allow mixed `NumPy` and `EagerTensor` input here.
-  if any(
-      isinstance(input_t, (np.ndarray, float, int))
-      for input_t in tf.nest.flatten(inputs)):
-    inputs = tf.nest.map_structure(tf.convert_to_tensor, inputs)
-
-  outs = model(inputs, **kwargs)
-  outs = tf.nest.flatten(outs)
-
-  if targets:
-    targets = training_utils_v1.cast_if_floating_dtype_and_mismatch(
-        targets, outs)
-  # TODO(sallymatson/psv): check if we should do same mismatch fix for weights
-  if sample_weights:
-    sample_weights = [
-        training_utils_v1.cast_if_floating_dtype(
-            tf.convert_to_tensor(val))
-        if val is not None else None for val in sample_weights
-    ]
-
-  masks = [getattr(t, '_keras_mask', None) for t in outs]
-  targets = tf.nest.flatten(targets)
-
-  # Used to keep track of individual output losses.
-  output_losses = []
-
-  with backend.name_scope('loss'):
-    loss_fns = [
-        loss_fn for loss_fn in model.loss_functions if loss_fn is not None
-    ]
-    custom_losses = model.losses  # Regularization losses
-
-    if not loss_fns and not custom_losses:
-      if training:
-        raise ValueError('The model cannot be trained '
-                         'because it has no loss to optimize.')
-      else:
-        raise ValueError('The model cannot be evaluated '
-                         'because it has no loss to compute.')
-
-    for i, loss_fn in enumerate(loss_fns):
-      weights = sample_weights[i] if sample_weights else None
-      mask = masks[i]
-      with backend.name_scope(model.output_names[i] + '_loss'):
-        if mask is not None:
-          mask = tf.cast(mask, outs[i].dtype)
-          # Update weights with mask.
-          if weights is None:
-            weights = mask
-          else:
-            # Update dimensions of weights to match with mask if possible.
-            weights = tf.cast(weights, outs[i].dtype)
-            mask, _, weights = (
-                losses_utils.squeeze_or_expand_dimensions(
-                    mask, sample_weight=weights))
-            weights *= mask
-
-        if hasattr(loss_fn, 'reduction'):
-          per_sample_losses = loss_fn.call(targets[i], outs[i])
-          weighted_losses = losses_utils.compute_weighted_loss(
-              per_sample_losses,
-              sample_weight=weights,
-              reduction=losses_utils.ReductionV2.NONE)
-          loss_reduction = loss_fn.reduction
-
-          # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all
-          # compile use cases.
-          if loss_reduction == losses_utils.ReductionV2.AUTO:
-            loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
-
-          # Compute the stateless loss value.
-          output_loss = losses_utils.reduce_weighted_loss(
-              weighted_losses, reduction=loss_reduction)
-        else:
-          # Compute the stateless loss value for a custom loss class.
-          # Here we assume that the class takes care of loss reduction
-          # because if this class returns a vector value we cannot
-          # differentiate between use case where a custom optimizer
-          # expects a vector loss value vs unreduced per-sample loss value.
-          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
-          loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
-
-        # If the number of outputs is 1 then we don't append the loss metric
-        # associated with each model output. When there are multiple outputs
-        # associated with a model, each output's loss is calculated and returned
-        # as part of the loss_metrics.
-        if len(model.outputs) > 1:
-          # Keep track of the stateful output loss result.
-          output_losses.append(output_loss_metrics[i](output_loss))
-
-        # Scale output loss for distribution. For custom losses we assume
-        # reduction was mean.
-        if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE:
-          output_loss = losses_utils.scale_loss_for_distribution(output_loss)
-        total_loss += model._loss_weights_list[i] * output_loss
-
-    # Add regularization losses
-    if custom_losses:
-      total_loss += losses_utils.scale_loss_for_distribution(
-          tf.add_n(custom_losses))
-  return outs, total_loss, output_losses, masks
-
-
-def _process_single_batch(model,
-                          inputs,
-                          targets,
-                          output_loss_metrics=None,
-                          sample_weights=None,
-                          training=False):
-  """Calculate the loss and gradient for one input batch.
-
-  The model weights are updated if training is set to True.
-
-  Args:
-    model: Model whose loss has to be calculated.
-    inputs: List of input arrays.
-    targets: List of target arrays.
-    output_loss_metrics: List of metrics that are used to aggregated output
-      loss values.
-    sample_weights: Optional list of sample weight arrays.
-    training: The boolean represents if the weights of the model are updated.
-      'fit' methods will set this to True while 'evaluate' methods will
-      set this to False.
-
-  Returns:
-    output of the model, total loss, the loss and the mask
-    associated with each output.
-
-  Raises:
-    ValueError: If the model has no loss to optimize.
-  """
-  with backend.eager_learning_phase_scope(1 if training else 0), \
-      training_utils.RespectCompiledTrainableState(model):
-    with GradientTape() as tape:
-      outs, total_loss, output_losses, masks = (
-          _model_loss(
-              model,
-              inputs,
-              targets,
-              output_loss_metrics=output_loss_metrics,
-              sample_weights=sample_weights,
-              training=training))
-      if isinstance(model.optimizer, loss_scale_optimizer.LossScaleOptimizer):
-        scaled_total_loss = model.optimizer.get_scaled_loss(total_loss)
-      else:
-        scaled_total_loss = total_loss
-    if training:
-      trainable_weights = model.trainable_weights
-      if trainable_weights:
-        # TODO(tanzheny) b/132690565: Provide mechanism for user to override
-        # model.train_on_batch.
-        if hasattr(model, '_backwards'):
-          model._backwards(tape, scaled_total_loss)
-        else:
-          grads = tape.gradient(scaled_total_loss, trainable_weights)
-          if isinstance(model.optimizer,
-                        loss_scale_optimizer.LossScaleOptimizer):
-            grads = model.optimizer.get_unscaled_gradients(grads)
-          model.optimizer.apply_gradients(zip(grads, trainable_weights))
-      else:
-        logging.warning('The list of trainable weights is empty. Make sure that'
-                        ' you are not setting model.trainable to False before '
-                        'compiling the model.')
-    return outs, total_loss, output_losses, masks
-
-
-def train_on_batch(model,
-                   inputs,
-                   targets,
-                   sample_weights=None,
-                   output_loss_metrics=None):
-  """Calculates the loss and gradient updates for one input batch.
-
-  Args:
-    model: Model whose loss has to be calculated.
-    inputs: Input batch data.
-    targets: Target batch data.
-    sample_weights: Sample weight batch data.
-    output_loss_metrics: List of metrics that are used to aggregated output
-      loss values.
-
-  Returns:
-    Dict with three items:
-      'total_loss': list with a single tensor for overall loss,
-      'output_losses': list of tensors for loss corresponding to each of the
-        model output. Could be a empty list when model has only one output.
-      'metrics': list of tensors for metric specified.
-  """
-  inputs = training_utils_v1.cast_to_model_input_dtypes(inputs, model)
-  outs, total_loss, output_losses, masks = (
-      _process_single_batch(
-          model,
-          inputs,
-          targets,
-          sample_weights=sample_weights,
-          training=True,
-          output_loss_metrics=output_loss_metrics))
-  if not isinstance(outs, list):
-    outs = [outs]
-  metrics_results = _eager_metrics_fn(
-      model, outs, targets, sample_weights=sample_weights, masks=masks)
-  total_loss = tf.nest.flatten(total_loss)
-  return {'total_loss': total_loss,
-          'output_losses': output_losses,
-          'metrics': metrics_results}
-
-
-def test_on_batch(model,
-                  inputs,
-                  targets,
-                  sample_weights=None,
-                  output_loss_metrics=None):
-  """Calculates the loss for one input batch.
-
-  Args:
-    model: Model whose loss has to be calculated.
-    inputs: Input batch data.
-    targets: Target batch data.
-    sample_weights: Sample weight batch data.
-    output_loss_metrics: List of metrics that are used to aggregated output
-      loss values.
-
-  Returns:
-    Dict with three items:
-      'total_loss': single tensor for overall loss,
-      'output_losses': list of tensors for loss corresponding to each of the
-        model output. Could be a empty list when model has only one output.
-      'metrics': list of tensors for metric specified.
-  """
-  inputs = training_utils_v1.cast_to_model_input_dtypes(inputs, model)
-
-  with backend.eager_learning_phase_scope(0):
-    outs, total_loss, output_losses, masks = (
-        _model_loss(
+                output_loss_metrics=output_loss_metrics,
+                sample_weights=sample_weights,
+                training=training,
+            )
+            if isinstance(
+                model.optimizer, loss_scale_optimizer.LossScaleOptimizer
+            ):
+                scaled_total_loss = model.optimizer.get_scaled_loss(total_loss)
+            else:
+                scaled_total_loss = total_loss
+        if training:
+            trainable_weights = model.trainable_weights
+            if trainable_weights:
+                # TODO(tanzheny) b/132690565: Provide mechanism for user to
+                # override model.train_on_batch.
+                if hasattr(model, "_backwards"):
+                    model._backwards(tape, scaled_total_loss)
+                else:
+                    grads = tape.gradient(scaled_total_loss, trainable_weights)
+                    if isinstance(
+                        model.optimizer, loss_scale_optimizer.LossScaleOptimizer
+                    ):
+                        grads = model.optimizer.get_unscaled_gradients(grads)
+                    model.optimizer.apply_gradients(
+                        zip(grads, trainable_weights)
+                    )
+            else:
+                logging.warning(
+                    "The list of trainable weights is empty. Make sure that"
+                    " you are not setting model.trainable to False before "
+                    "compiling the model."
+                )
+        return outs, total_loss, output_losses, masks
+
+
+def train_on_batch(
+    model, inputs, targets, sample_weights=None, output_loss_metrics=None
+):
+    """Calculates the loss and gradient updates for one input batch.
+
+    Args:
+        model: Model whose loss has to be calculated.
+        inputs: Input batch data.
+        targets: Target batch data.
+        sample_weights: Sample weight batch data.
+        output_loss_metrics: List of metrics that are used to aggregate output
+            loss values.
+
+    Returns:
+        Dict with three items:
+            'total_loss': list with a single tensor for overall loss,
+            'output_losses': list of tensors for loss corresponding to each of
+                the model outputs. Could be an empty list when the model has
+                only one output.
+            'metrics': list of tensors for the metrics specified.
+    """
+    inputs = training_utils_v1.cast_to_model_input_dtypes(inputs, model)
+    outs, total_loss, output_losses, masks = _process_single_batch(
+        model,
+        inputs,
+        targets,
+        sample_weights=sample_weights,
+        training=True,
+        output_loss_metrics=output_loss_metrics,
+    )
+    if not isinstance(outs, list):
+        outs = [outs]
+    metrics_results = _eager_metrics_fn(
+        model, outs, targets, sample_weights=sample_weights, masks=masks
+    )
+    total_loss = tf.nest.flatten(total_loss)
+    return {
+        "total_loss": total_loss,
+        "output_losses": output_losses,
+        "metrics": metrics_results,
+    }
+
+
+def test_on_batch(
+    model, inputs, targets, sample_weights=None, output_loss_metrics=None
+):
+    """Calculates the loss for one input batch.
+
+    Args:
+        model: Model whose loss has to be calculated.
+        inputs: Input batch data.
+        targets: Target batch data.
+        sample_weights: Sample weight batch data.
+        output_loss_metrics: List of metrics that are used to aggregate output
+            loss values.
+
+    Returns:
+        Dict with three items:
+            'total_loss': single tensor for overall loss,
+            'output_losses': list of tensors for loss corresponding to each of
+                the model outputs. Could be an empty list when the model has
+                only one output.
+            'metrics': list of tensors for the metrics specified.
+ """ + inputs = training_utils_v1.cast_to_model_input_dtypes(inputs, model) + + with backend.eager_learning_phase_scope(0): + outs, total_loss, output_losses, masks = _model_loss( model, inputs, targets, sample_weights=sample_weights, training=False, - output_loss_metrics=output_loss_metrics)) - if not isinstance(outs, list): - outs = [outs] - metrics_results = _eager_metrics_fn( - model, outs, targets, sample_weights=sample_weights, masks=masks) - total_loss = tf.nest.flatten(total_loss) - - return {'total_loss': total_loss, - 'output_losses': output_losses, - 'metrics': metrics_results} + output_loss_metrics=output_loss_metrics, + ) + if not isinstance(outs, list): + outs = [outs] + metrics_results = _eager_metrics_fn( + model, outs, targets, sample_weights=sample_weights, masks=masks + ) + total_loss = tf.nest.flatten(total_loss) + + return { + "total_loss": total_loss, + "output_losses": output_losses, + "metrics": metrics_results, + } diff --git a/keras/engine/training_generator_test.py b/keras/engine/training_generator_test.py index 3c64c36eaea5..70c32ca78d66 100644 --- a/keras/engine/training_generator_test.py +++ b/keras/engine/training_generator_test.py @@ -14,515 +14,594 @@ # ============================================================================== """Tests for training routines.""" -import tensorflow.compat.v2 as tf - import itertools -from absl.testing import parameterized import numpy as np -from keras.testing_infra import test_combinations +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras import layers as layers_module from keras import losses from keras import metrics as metrics_module -from keras.testing_infra import test_utils from keras.engine import input_layer from keras.engine import training from keras.engine import training_generator_v1 -from keras.optimizers.optimizer_v2 import rmsprop +from keras.optimizers.legacy import rmsprop +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils from keras.utils import data_utils def custom_generator(mode=2): - batch_size = 10 - num_samples = 50 - arr_data = np.random.random((num_samples, 2)) - arr_labels = np.random.random((num_samples, 4)) - arr_weights = np.random.random((num_samples,)) - i = 0 - while True: - batch_index = i * batch_size % num_samples - i += 1 - start = batch_index - end = start + batch_size - x = arr_data[start: end] - y = arr_labels[start: end] - w = arr_weights[start: end] - if mode == 1: - yield x - elif mode == 2: - yield x, y - else: - yield x, y, w + batch_size = 10 + num_samples = 50 + arr_data = np.random.random((num_samples, 2)) + arr_labels = np.random.random((num_samples, 4)) + arr_weights = np.random.random((num_samples,)) + i = 0 + while True: + batch_index = i * batch_size % num_samples + i += 1 + start = batch_index + end = start + batch_size + x = arr_data[start:end] + y = arr_labels[start:end] + w = arr_weights[start:end] + if mode == 1: + yield x + elif mode == 2: + yield x, y + else: + yield x, y, w def custom_generator_changing_batch_size(mode=2): - batch_size = 10 - cur_batch_size = 11 - num_samples = 50 - arr_data = np.random.random((num_samples, 2)) - arr_labels = np.random.random((num_samples, 4)) - arr_weights = np.random.random((num_samples,)) - i = 0 - while True: - if cur_batch_size > 1: - cur_batch_size -= 1 - batch_index = i * batch_size % num_samples - i += 1 - start = batch_index - end = start + cur_batch_size - x = arr_data[start: end] - y = arr_labels[start: end] - w = arr_weights[start: 
diff --git a/keras/engine/training_generator_test.py b/keras/engine/training_generator_test.py
index 3c64c36eaea5..70c32ca78d66 100644
--- a/keras/engine/training_generator_test.py
+++ b/keras/engine/training_generator_test.py
@@ -14,515 +14,594 @@
 # ==============================================================================
 """Tests for training routines."""
 
-import tensorflow.compat.v2 as tf
-
 import itertools
-from absl.testing import parameterized
+
 import numpy as np
-from keras.testing_infra import test_combinations
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
+
 from keras import layers as layers_module
 from keras import losses
 from keras import metrics as metrics_module
-from keras.testing_infra import test_utils
 from keras.engine import input_layer
 from keras.engine import training
 from keras.engine import training_generator_v1
-from keras.optimizers.optimizer_v2 import rmsprop
+from keras.optimizers.legacy import rmsprop
+from keras.testing_infra import test_combinations
+from keras.testing_infra import test_utils
 from keras.utils import data_utils
 
 
 def custom_generator(mode=2):
-  batch_size = 10
-  num_samples = 50
-  arr_data = np.random.random((num_samples, 2))
-  arr_labels = np.random.random((num_samples, 4))
-  arr_weights = np.random.random((num_samples,))
-  i = 0
-  while True:
-    batch_index = i * batch_size % num_samples
-    i += 1
-    start = batch_index
-    end = start + batch_size
-    x = arr_data[start: end]
-    y = arr_labels[start: end]
-    w = arr_weights[start: end]
-    if mode == 1:
-      yield x
-    elif mode == 2:
-      yield x, y
-    else:
-      yield x, y, w
+    batch_size = 10
+    num_samples = 50
+    arr_data = np.random.random((num_samples, 2))
+    arr_labels = np.random.random((num_samples, 4))
+    arr_weights = np.random.random((num_samples,))
+    i = 0
+    while True:
+        batch_index = i * batch_size % num_samples
+        i += 1
+        start = batch_index
+        end = start + batch_size
+        x = arr_data[start:end]
+        y = arr_labels[start:end]
+        w = arr_weights[start:end]
+        if mode == 1:
+            yield x
+        elif mode == 2:
+            yield x, y
+        else:
+            yield x, y, w
 
 
 def custom_generator_changing_batch_size(mode=2):
-  batch_size = 10
-  cur_batch_size = 11
-  num_samples = 50
-  arr_data = np.random.random((num_samples, 2))
-  arr_labels = np.random.random((num_samples, 4))
-  arr_weights = np.random.random((num_samples,))
-  i = 0
-  while True:
-    if cur_batch_size > 1:
-      cur_batch_size -= 1
-    batch_index = i * batch_size % num_samples
-    i += 1
-    start = batch_index
-    end = start + cur_batch_size
-    x = arr_data[start: end]
-    y = arr_labels[start: end]
-    w = arr_weights[start: end]
-    if mode == 1:
-      yield x
-    elif mode == 2:
-      yield x, y
-    else:
-      yield x, y, w
+    batch_size = 10
+    cur_batch_size = 11
+    num_samples = 50
+    arr_data = np.random.random((num_samples, 2))
+    arr_labels = np.random.random((num_samples, 4))
+    arr_weights = np.random.random((num_samples,))
+    i = 0
+    while True:
+        if cur_batch_size > 1:
+            cur_batch_size -= 1
+        batch_index = i * batch_size % num_samples
+        i += 1
+        start = batch_index
+        end = start + cur_batch_size
+        x = arr_data[start:end]
+        y = arr_labels[start:end]
+        w = arr_weights[start:end]
+        if mode == 1:
+            yield x
+        elif mode == 2:
+            yield x, y
+        else:
+            yield x, y, w
+
 
 custom_generator_threads = data_utils.threadsafe_generator(custom_generator)
 
 
 class TestGeneratorMethods(test_combinations.TestCase):
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  @data_utils.dont_use_multiprocessing_pool
-  def test_fit_generator_method(self):
-    model = test_utils.get_small_mlp(
-        num_hidden=3, num_classes=4, input_dim=2)
-    model.compile(
-        loss='mse',
-        optimizer=rmsprop.RMSprop(1e-3),
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
-
-    model.fit_generator(custom_generator_threads(),
-                        steps_per_epoch=5,
-                        epochs=1,
-                        verbose=1,
-                        max_queue_size=10,
-                        workers=4,
-                        use_multiprocessing=True)
-    model.fit_generator(custom_generator(),
-                        steps_per_epoch=5,
-                        epochs=1,
-                        verbose=1,
-                        max_queue_size=10,
-                        use_multiprocessing=False)
-    model.fit_generator(custom_generator(),
-                        steps_per_epoch=5,
-                        epochs=1,
-                        verbose=1,
-                        max_queue_size=10,
-                        use_multiprocessing=False,
-                        validation_data=custom_generator(),
-                        validation_steps=10)
-    model.fit_generator(custom_generator(),
-                        steps_per_epoch=5,
-                        validation_data=custom_generator(),
-                        validation_steps=1,
-                        workers=0)
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  @data_utils.dont_use_multiprocessing_pool
-  def test_evaluate_generator_method(self):
-    model = test_utils.get_small_mlp(
-        num_hidden=3, num_classes=4, input_dim=2)
-    model.compile(
-        loss='mse',
-        optimizer=rmsprop.RMSprop(1e-3),
-        metrics=['mae', metrics_module.CategoricalAccuracy()],
-        run_eagerly=test_utils.should_run_eagerly())
-
-    model.evaluate_generator(custom_generator_threads(),
-                             steps=5,
-                             max_queue_size=10,
-                             workers=2,
-                             verbose=1,
-                             use_multiprocessing=True)
-    model.evaluate_generator(custom_generator(),
-                             steps=5,
-                             max_queue_size=10,
-                             use_multiprocessing=False)
-    model.evaluate_generator(custom_generator(),
-                             steps=5,
-                             max_queue_size=10,
-                             use_multiprocessing=False,
-                             workers=0)
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  @data_utils.dont_use_multiprocessing_pool
-  def test_predict_generator_method(self):
-    model = test_utils.get_small_mlp(
-        num_hidden=3, num_classes=4, input_dim=2)
-    model.run_eagerly = test_utils.should_run_eagerly()
-
-    model.predict_generator(custom_generator_threads(),
-                            steps=5,
-                            max_queue_size=10,
-                            workers=2,
-                            use_multiprocessing=True)
-    model.predict_generator(custom_generator(),
-                            steps=5,
-                            max_queue_size=10,
-                            use_multiprocessing=False)
-    model.predict_generator(custom_generator(),
-                            steps=5,
-                            max_queue_size=10,
-                            workers=0)
-    # Test generator with just inputs (no targets)
-    model.predict_generator(custom_generator_threads(mode=1),
-                            steps=5,
-                            max_queue_size=10,
-                            workers=2,
-                            use_multiprocessing=True)
-    model.predict_generator(custom_generator(mode=1),
-                            steps=5,
-                            max_queue_size=10,
-                            use_multiprocessing=False)
-    model.predict_generator(custom_generator(mode=1),
-                            steps=5,
-                            max_queue_size=10,
-                            workers=0)
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  def test_generator_methods_with_sample_weights(self):
-    model = test_utils.get_small_mlp(
-        num_hidden=3, num_classes=4, input_dim=2)
-    model.compile(
-        loss='mse',
-        optimizer=rmsprop.RMSprop(1e-3),
-        metrics=['mae', metrics_module.CategoricalAccuracy()],
-        run_eagerly=test_utils.should_run_eagerly())
-
-    model.fit_generator(custom_generator(mode=3),
-                        steps_per_epoch=5,
-                        epochs=1,
-                        verbose=1,
-                        max_queue_size=10,
-                        use_multiprocessing=False)
-    model.fit_generator(custom_generator(mode=3),
-                        steps_per_epoch=5,
-                        epochs=1,
-                        verbose=1,
-                        max_queue_size=10,
-                        use_multiprocessing=False,
-                        validation_data=custom_generator(mode=3),
-                        validation_steps=10)
-    model.predict_generator(custom_generator(mode=3),
-                            steps=5,
-                            max_queue_size=10,
-                            use_multiprocessing=False)
-    model.evaluate_generator(custom_generator(mode=3),
-                             steps=5,
-                             max_queue_size=10,
-                             use_multiprocessing=False)
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  def test_generator_methods_invalid_use_case(self):
-    def invalid_generator():
-      while 1:
-        yield (0, 0, 0, 0)
-
-    model = test_utils.get_small_mlp(
-        num_hidden=3, num_classes=4, input_dim=2)
-    model.compile(
-        loss='mse',
-        optimizer=rmsprop.RMSprop(1e-3),
-        run_eagerly=test_utils.should_run_eagerly())
-
-    with self.assertRaises(ValueError):
-      model.fit_generator(invalid_generator(),
-                          steps_per_epoch=5,
-                          epochs=1,
-                          verbose=1,
-                          max_queue_size=10,
-                          use_multiprocessing=False)
-    with self.assertRaises(ValueError):
-      model.fit_generator(custom_generator(),
-                          steps_per_epoch=5,
-                          epochs=1,
-                          verbose=1,
-                          max_queue_size=10,
-                          use_multiprocessing=False,
-                          validation_data=invalid_generator(),
-                          validation_steps=10)
-    with self.assertRaises(ValueError):
-      model.predict_generator(invalid_generator(),
-                              steps=5,
-                              max_queue_size=10,
-                              use_multiprocessing=False)
-    with self.assertRaises(ValueError):
-      model.evaluate_generator(invalid_generator(),
-                               steps=5,
-                               max_queue_size=10,
-                               use_multiprocessing=False)
-
-  @test_combinations.run_with_all_model_types
-  @test_combinations.run_all_keras_modes
-  def test_generator_input_to_fit_eval_predict(self):
-    val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
-
-    def ones_generator():
-      while True:
-        yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
-
-    model = test_utils.get_small_mlp(
-        num_hidden=10, num_classes=1, input_dim=10)
-
-    model.compile(
-        rmsprop.RMSprop(0.001),
-        'binary_crossentropy',
-        run_eagerly=test_utils.should_run_eagerly())
-    model.fit(
-        ones_generator(),
-        steps_per_epoch=2,
-        validation_data=val_data,
-        epochs=2)
-    model.evaluate(ones_generator(), steps=2)
-    model.predict(ones_generator(), steps=2)
-
-    # Test with a changing batch size
-    model = test_utils.get_small_mlp(
-        num_hidden=3, num_classes=4, input_dim=2)
-    model.compile(
-        loss='mse',
-        optimizer=rmsprop.RMSprop(1e-3),
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
-    model.fit_generator(custom_generator_changing_batch_size(),
-                        steps_per_epoch=5,
-                        epochs=1,
-                        verbose=1,
-                        max_queue_size=10,
-                        use_multiprocessing=False)
-    model.fit_generator(custom_generator_changing_batch_size(),
-                        steps_per_epoch=5,
-                        epochs=1,
-                        verbose=1,
-                        max_queue_size=10,
-                        use_multiprocessing=False,
-                        validation_data=custom_generator_changing_batch_size(),
-                        validation_steps=10)
custom_generator_changing_batch_size(), - steps_per_epoch=5, - validation_data=custom_generator_changing_batch_size(), - validation_steps=10, - epochs=2) - model.evaluate(custom_generator_changing_batch_size(), steps=5) - model.predict(custom_generator_changing_batch_size(), steps=5) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - @data_utils.dont_use_multiprocessing_pool - def test_generator_dynamic_shapes(self): - - x = [ - 'I think juice is great', - 'unknown is the best language since slicedbread', 'a a a a a a a', - 'matmul', 'Yaks are also quite nice' - ] - y = [1, 0, 0, 1, 1] - - vocab = { - word: i + 1 for i, word in - enumerate( - sorted(set(itertools.chain(*[i.split() for i in x])))) - } - - def data_gen(batch_size=2): - np.random.seed(0) - data = list(zip(x, y)) * 10 - np.random.shuffle(data) - - def pack_and_pad(queue): - x = [[vocab[j] for j in i[0].split()] for i in queue] - pad_len = max(len(i) for i in x) - x = np.array([i + [0] * (pad_len - len(i)) for i in x]) - y = np.array([i[1] for i in queue]) - del queue[:] - return x, y[:, np.newaxis] - - queue = [] - for i, element in enumerate(data): - queue.append(element) - if not (i + 1) % batch_size: - yield pack_and_pad(queue) - - if queue: - # Last partial batch - yield pack_and_pad(queue) - - model = test_utils.get_model_from_layers([ - layers_module.Embedding(input_dim=len(vocab) + 1, output_dim=4), - layers_module.SimpleRNN(units=1), - layers_module.Activation('sigmoid') - ], input_shape=(None,)) - - model.compile(loss=losses.binary_crossentropy, optimizer='sgd') - model.fit(data_gen(), epochs=1, steps_per_epoch=5) + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @data_utils.dont_use_multiprocessing_pool + def test_fit_generator_method(self): + model = test_utils.get_small_mlp( + num_hidden=3, num_classes=4, input_dim=2 + ) + model.compile( + loss="mse", + optimizer=rmsprop.RMSprop(1e-3), + metrics=["mae", metrics_module.CategoricalAccuracy()], + ) + + model.fit_generator( + custom_generator_threads(), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + workers=4, + use_multiprocessing=True, + ) + model.fit_generator( + custom_generator(), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + ) + model.fit_generator( + custom_generator(), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + validation_data=custom_generator(), + validation_steps=10, + ) + model.fit_generator( + custom_generator(), + steps_per_epoch=5, + validation_data=custom_generator(), + validation_steps=1, + workers=0, + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @data_utils.dont_use_multiprocessing_pool + def test_evaluate_generator_method(self): + model = test_utils.get_small_mlp( + num_hidden=3, num_classes=4, input_dim=2 + ) + model.compile( + loss="mse", + optimizer=rmsprop.RMSprop(1e-3), + metrics=["mae", metrics_module.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + + model.evaluate_generator( + custom_generator_threads(), + steps=5, + max_queue_size=10, + workers=2, + verbose=1, + use_multiprocessing=True, + ) + model.evaluate_generator( + custom_generator(), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + ) + model.evaluate_generator( + custom_generator(), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + workers=0, + ) + + 
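# The `fit_generator`/`evaluate_generator`/`predict_generator` calls
# exercised above are the legacy endpoints; as
# `test_generator_input_to_fit_eval_predict` below shows, the same
# generators can be passed straight to `fit`/`evaluate`/`predict`. A
# minimal sketch reusing the helpers defined in this test file:
model = test_utils.get_small_mlp(num_hidden=3, num_classes=4, input_dim=2)
model.compile(loss="mse", optimizer=rmsprop.RMSprop(1e-3))
model.fit(custom_generator(), steps_per_epoch=5, epochs=1)  # (x, y) batches
model.evaluate(custom_generator(), steps=5)
model.predict(custom_generator(mode=1), steps=5)  # mode=1 yields inputs only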
@test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @data_utils.dont_use_multiprocessing_pool + def test_predict_generator_method(self): + model = test_utils.get_small_mlp( + num_hidden=3, num_classes=4, input_dim=2 + ) + model.run_eagerly = test_utils.should_run_eagerly() + + model.predict_generator( + custom_generator_threads(), + steps=5, + max_queue_size=10, + workers=2, + use_multiprocessing=True, + ) + model.predict_generator( + custom_generator(), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + ) + model.predict_generator( + custom_generator(), steps=5, max_queue_size=10, workers=0 + ) + # Test generator with just inputs (no targets) + model.predict_generator( + custom_generator_threads(mode=1), + steps=5, + max_queue_size=10, + workers=2, + use_multiprocessing=True, + ) + model.predict_generator( + custom_generator(mode=1), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + ) + model.predict_generator( + custom_generator(mode=1), steps=5, max_queue_size=10, workers=0 + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_generator_methods_with_sample_weights(self): + model = test_utils.get_small_mlp( + num_hidden=3, num_classes=4, input_dim=2 + ) + model.compile( + loss="mse", + optimizer=rmsprop.RMSprop(1e-3), + metrics=["mae", metrics_module.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + + model.fit_generator( + custom_generator(mode=3), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + ) + model.fit_generator( + custom_generator(mode=3), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + validation_data=custom_generator(mode=3), + validation_steps=10, + ) + model.predict_generator( + custom_generator(mode=3), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + ) + model.evaluate_generator( + custom_generator(mode=3), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_generator_methods_invalid_use_case(self): + def invalid_generator(): + while 1: + yield (0, 0, 0, 0) + + model = test_utils.get_small_mlp( + num_hidden=3, num_classes=4, input_dim=2 + ) + model.compile( + loss="mse", + optimizer=rmsprop.RMSprop(1e-3), + run_eagerly=test_utils.should_run_eagerly(), + ) + + with self.assertRaises(ValueError): + model.fit_generator( + invalid_generator(), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + ) + with self.assertRaises(ValueError): + model.fit_generator( + custom_generator(), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + validation_data=invalid_generator(), + validation_steps=10, + ) + with self.assertRaises(ValueError): + model.predict_generator( + invalid_generator(), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + ) + with self.assertRaises(ValueError): + model.evaluate_generator( + invalid_generator(), + steps=5, + max_queue_size=10, + use_multiprocessing=False, + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_generator_input_to_fit_eval_predict(self): + val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32) + + def ones_generator(): + while True: + yield np.ones([10, 10], np.float32), np.ones( + [10, 1], np.float32 + ) + + model = 
test_utils.get_small_mlp( + num_hidden=10, num_classes=1, input_dim=10 + ) + + model.compile( + rmsprop.RMSprop(0.001), + "binary_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + ones_generator(), + steps_per_epoch=2, + validation_data=val_data, + epochs=2, + ) + model.evaluate(ones_generator(), steps=2) + model.predict(ones_generator(), steps=2) + + # Test with a changing batch size + model = test_utils.get_small_mlp( + num_hidden=3, num_classes=4, input_dim=2 + ) + model.compile( + loss="mse", + optimizer=rmsprop.RMSprop(1e-3), + metrics=["mae", metrics_module.CategoricalAccuracy()], + ) + model.fit_generator( + custom_generator_changing_batch_size(), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + ) + model.fit_generator( + custom_generator_changing_batch_size(), + steps_per_epoch=5, + epochs=1, + verbose=1, + max_queue_size=10, + use_multiprocessing=False, + validation_data=custom_generator_changing_batch_size(), + validation_steps=10, + ) + + model.fit( + custom_generator_changing_batch_size(), + steps_per_epoch=5, + validation_data=custom_generator_changing_batch_size(), + validation_steps=10, + epochs=2, + ) + model.evaluate(custom_generator_changing_batch_size(), steps=5) + model.predict(custom_generator_changing_batch_size(), steps=5) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @data_utils.dont_use_multiprocessing_pool + def test_generator_dynamic_shapes(self): + + x = [ + "I think juice is great", + "unknown is the best language since slicedbread", + "a a a a a a a", + "matmul", + "Yaks are also quite nice", + ] + y = [1, 0, 0, 1, 1] + + vocab = { + word: i + 1 + for i, word in enumerate( + sorted(set(itertools.chain(*[i.split() for i in x]))) + ) + } + + def data_gen(batch_size=2): + np.random.seed(0) + data = list(zip(x, y)) * 10 + np.random.shuffle(data) + + def pack_and_pad(queue): + x = [[vocab[j] for j in i[0].split()] for i in queue] + pad_len = max(len(i) for i in x) + x = np.array([i + [0] * (pad_len - len(i)) for i in x]) + y = np.array([i[1] for i in queue]) + del queue[:] + return x, y[:, np.newaxis] + + queue = [] + for i, element in enumerate(data): + queue.append(element) + if not (i + 1) % batch_size: + yield pack_and_pad(queue) + + if queue: + # Last partial batch + yield pack_and_pad(queue) + + model = test_utils.get_model_from_layers( + [ + layers_module.Embedding(input_dim=len(vocab) + 1, output_dim=4), + layers_module.SimpleRNN(units=1), + layers_module.Activation("sigmoid"), + ], + input_shape=(None,), + ) + + model.compile(loss=losses.binary_crossentropy, optimizer="sgd") + model.fit(data_gen(), epochs=1, steps_per_epoch=5) class TestGeneratorMethodsWithSequences(test_combinations.TestCase): - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - @data_utils.dont_use_multiprocessing_pool - def test_training_with_sequences(self): - - class DummySequence(data_utils.Sequence): - - def __getitem__(self, idx): - return np.zeros([10, 2]), np.ones([10, 4]) - - def __len__(self): - return 10 - - model = test_utils.get_small_mlp( - num_hidden=3, num_classes=4, input_dim=2) - model.compile(loss='mse', optimizer=rmsprop.RMSprop(1e-3)) - - model.fit_generator(DummySequence(), - steps_per_epoch=10, - validation_data=custom_generator(), - validation_steps=1, - max_queue_size=10, - workers=0, - use_multiprocessing=True) - model.fit_generator(DummySequence(), - steps_per_epoch=10, - 
validation_data=custom_generator(), - validation_steps=1, - max_queue_size=10, - workers=0, - use_multiprocessing=False) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - @data_utils.dont_use_multiprocessing_pool - def test_sequence_input_to_fit_eval_predict(self): - val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32) - - class CustomSequence(data_utils.Sequence): - - def __getitem__(self, idx): - return np.ones([10, 10], np.float32), np.ones([10, 1], np.float32) - - def __len__(self): - return 2 - - class CustomSequenceChangingBatchSize(data_utils.Sequence): - - def __getitem__(self, idx): - batch_size = 10 - idx - return (np.ones([batch_size, 10], np.float32), - np.ones([batch_size, 1], np.float32)) - - def __len__(self): - return 2 - - model = test_utils.get_small_mlp( - num_hidden=10, num_classes=1, input_dim=10) - - model.compile(rmsprop.RMSprop(0.001), 'binary_crossentropy') - model.fit(CustomSequence(), validation_data=val_data, epochs=2) - model.evaluate(CustomSequence()) - model.predict(CustomSequence()) - - with self.assertRaisesRegex(ValueError, '`y` argument is not supported'): - model.fit(CustomSequence(), y=np.ones([10, 1])) - - with self.assertRaisesRegex(ValueError, - '`sample_weight` argument is not supported'): - model.fit(CustomSequence(), sample_weight=np.ones([10, 1])) - - model.compile(rmsprop.RMSprop(0.001), 'binary_crossentropy') - model.fit(CustomSequenceChangingBatchSize(), - validation_data=val_data, epochs=2) - model.evaluate(CustomSequenceChangingBatchSize()) - model.predict(CustomSequenceChangingBatchSize()) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_sequence_on_epoch_end(self): - - class MySequence(data_utils.Sequence): - - def __init__(self): - self.epochs = 0 - - def __getitem__(self, idx): - return np.ones([10, 10], np.float32), np.ones([10, 1], np.float32) - - def __len__(self): - return 2 - - def on_epoch_end(self): - self.epochs += 1 - - inputs = input_layer.Input(10) - outputs = layers_module.Dense(1)(inputs) - model = training.Model(inputs, outputs) - model.compile('sgd', 'mse') - my_seq = MySequence() - model.fit(my_seq, epochs=2) - self.assertEqual(my_seq.epochs, 2) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @data_utils.dont_use_multiprocessing_pool + def test_training_with_sequences(self): + class DummySequence(data_utils.Sequence): + def __getitem__(self, idx): + return np.zeros([10, 2]), np.ones([10, 4]) + + def __len__(self): + return 10 + + model = test_utils.get_small_mlp( + num_hidden=3, num_classes=4, input_dim=2 + ) + model.compile(loss="mse", optimizer=rmsprop.RMSprop(1e-3)) + + model.fit_generator( + DummySequence(), + steps_per_epoch=10, + validation_data=custom_generator(), + validation_steps=1, + max_queue_size=10, + workers=0, + use_multiprocessing=True, + ) + model.fit_generator( + DummySequence(), + steps_per_epoch=10, + validation_data=custom_generator(), + validation_steps=1, + max_queue_size=10, + workers=0, + use_multiprocessing=False, + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @data_utils.dont_use_multiprocessing_pool + def test_sequence_input_to_fit_eval_predict(self): + val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32) + + class CustomSequence(data_utils.Sequence): + def __getitem__(self, idx): + return np.ones([10, 10], np.float32), 
np.ones( + [10, 1], np.float32 + ) + + def __len__(self): + return 2 + + class CustomSequenceChangingBatchSize(data_utils.Sequence): + def __getitem__(self, idx): + batch_size = 10 - idx + return ( + np.ones([batch_size, 10], np.float32), + np.ones([batch_size, 1], np.float32), + ) + + def __len__(self): + return 2 + + model = test_utils.get_small_mlp( + num_hidden=10, num_classes=1, input_dim=10 + ) + + model.compile(rmsprop.RMSprop(0.001), "binary_crossentropy") + model.fit(CustomSequence(), validation_data=val_data, epochs=2) + model.evaluate(CustomSequence()) + model.predict(CustomSequence()) + + with self.assertRaisesRegex( + ValueError, "`y` argument is not supported" + ): + model.fit(CustomSequence(), y=np.ones([10, 1])) + + with self.assertRaisesRegex( + ValueError, "`sample_weight` argument is not supported" + ): + model.fit(CustomSequence(), sample_weight=np.ones([10, 1])) + + model.compile(rmsprop.RMSprop(0.001), "binary_crossentropy") + model.fit( + CustomSequenceChangingBatchSize(), + validation_data=val_data, + epochs=2, + ) + model.evaluate(CustomSequenceChangingBatchSize()) + model.predict(CustomSequenceChangingBatchSize()) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_sequence_on_epoch_end(self): + class MySequence(data_utils.Sequence): + def __init__(self): + self.epochs = 0 + + def __getitem__(self, idx): + return np.ones([10, 10], np.float32), np.ones( + [10, 1], np.float32 + ) + + def __len__(self): + return 2 + + def on_epoch_end(self): + self.epochs += 1 + + inputs = input_layer.Input(10) + outputs = layers_module.Dense(1)(inputs) + model = training.Model(inputs, outputs) + model.compile("sgd", "mse") + my_seq = MySequence() + model.fit(my_seq, epochs=2) + self.assertEqual(my_seq.epochs, 2) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class TestConvertToGeneratorLike(tf.test.TestCase, parameterized.TestCase): - simple_inputs = (np.ones((10, 10)), np.ones((10, 1))) - nested_inputs = ((np.ones((10, 10)), np.ones((10, 20))), (np.ones((10, 1)), - np.ones((10, 3)))) - - def _make_dataset(self, inputs, batches): - return tf.data.Dataset.from_tensors(inputs).repeat(batches) - - def _make_iterator(self, inputs, batches): - return tf.compat.v1.data.make_one_shot_iterator( - self._make_dataset(inputs, batches)) - - def _make_generator(self, inputs, batches): - - def _gen(): - for _ in range(batches): - yield inputs - - return _gen() - - def _make_numpy(self, inputs, _): - return inputs - - @parameterized.named_parameters( - ('simple_dataset', _make_dataset, simple_inputs), - ('simple_iterator', _make_iterator, simple_inputs), - ('simple_generator', _make_generator, simple_inputs), - ('simple_numpy', _make_numpy, simple_inputs), - ('nested_dataset', _make_dataset, nested_inputs), - ('nested_iterator', _make_iterator, nested_inputs), - ('nested_generator', _make_generator, nested_inputs), - ('nested_numpy', _make_numpy, nested_inputs)) - def test_convert_to_generator_like(self, input_fn, inputs): - expected_batches = 5 - data = input_fn(self, inputs, expected_batches) - - # Dataset and Iterator not supported in Legacy Graph mode. 
- if (not tf.executing_eagerly() and - isinstance(data, (tf.data.Dataset, tf.compat.v1.data.Iterator))): - return - - generator, steps = training_generator_v1.convert_to_generator_like( - data, batch_size=2, steps_per_epoch=expected_batches) - self.assertEqual(steps, expected_batches) - - for _ in range(expected_batches): - outputs = next(generator) - tf.nest.assert_same_structure(outputs, inputs) - - -if __name__ == '__main__': - tf.test.main() + simple_inputs = (np.ones((10, 10)), np.ones((10, 1))) + nested_inputs = ( + (np.ones((10, 10)), np.ones((10, 20))), + (np.ones((10, 1)), np.ones((10, 3))), + ) + + def _make_dataset(self, inputs, batches): + return tf.data.Dataset.from_tensors(inputs).repeat(batches) + + def _make_iterator(self, inputs, batches): + return tf.compat.v1.data.make_one_shot_iterator( + self._make_dataset(inputs, batches) + ) + + def _make_generator(self, inputs, batches): + def _gen(): + for _ in range(batches): + yield inputs + + return _gen() + + def _make_numpy(self, inputs, _): + return inputs + + @parameterized.named_parameters( + ("simple_dataset", _make_dataset, simple_inputs), + ("simple_iterator", _make_iterator, simple_inputs), + ("simple_generator", _make_generator, simple_inputs), + ("simple_numpy", _make_numpy, simple_inputs), + ("nested_dataset", _make_dataset, nested_inputs), + ("nested_iterator", _make_iterator, nested_inputs), + ("nested_generator", _make_generator, nested_inputs), + ("nested_numpy", _make_numpy, nested_inputs), + ) + def test_convert_to_generator_like(self, input_fn, inputs): + expected_batches = 5 + data = input_fn(self, inputs, expected_batches) + + # Dataset and Iterator not supported in Legacy Graph mode. + if not tf.executing_eagerly() and isinstance( + data, (tf.data.Dataset, tf.compat.v1.data.Iterator) + ): + return + + generator, steps = training_generator_v1.convert_to_generator_like( + data, batch_size=2, steps_per_epoch=expected_batches + ) + self.assertEqual(steps, expected_batches) + + for _ in range(expected_batches): + outputs = next(generator) + tf.nest.assert_same_structure(outputs, inputs) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/training_generator_v1.py b/keras/engine/training_generator_v1.py index ae9e7ec6e457..4b82fad14d81 100644 --- a/keras/engine/training_generator_v1.py +++ b/keras/engine/training_generator_v1.py @@ -15,13 +15,12 @@ """Part of the Keras training engine related to Python generators of array data. """ -import tensorflow.compat.v2 as tf -# pylint: disable=protected-access - import functools import math import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import callbacks as cbks from keras.engine import training_utils @@ -29,796 +28,926 @@ from keras.utils import data_utils from keras.utils import generic_utils from keras.utils.mode_keys import ModeKeys + +# isort: off from tensorflow.python.platform import tf_logging as logging -def model_iteration(model, - data, - steps_per_epoch=None, - epochs=1, - verbose=1, - callbacks=None, - validation_data=None, - validation_steps=None, - validation_freq=1, - class_weight=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - shuffle=False, - initial_epoch=0, - mode=ModeKeys.TRAIN, - batch_size=None, - steps_name='steps', - **kwargs): - """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. - - Args: - model: Keras Model instance. - data: Either a tuple of NumPy/Tensor inputs (i.e. 
`(x,)` or `(x, y)` or - `(x, y, sample_weights)`) or a generator or - `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. - steps_per_epoch: Total number of steps (batches of samples) before - declaring one epoch finished and starting the next epoch. Ignored with - the default value of `None`. - epochs: Number of times to iterate over the data. - verbose: 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = one line per epoch. - Note that the progress bar is not particularly useful when - logged to a file, so verbose=2 is recommended when not running - interactively (eg, in a production environment). - callbacks: List of callbacks to be called during training. - validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or - `(x, y)` or `(x, y, sample_weights)`) or a generator or - `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. - validation_steps: Total number of steps (batches of samples) before - declaring validation finished. - validation_freq: Only relevant if validation data is provided. Integer or - `collections.abc.Container` instance (e.g. list, tuple, etc.). If an - integer, specifies how many training epochs to run before a new - validation run is performed, e.g. `validation_freq=2` runs - validation every 2 epochs. If a Container, specifies the epochs on - which to run validation, e.g. `validation_freq=[1, 2, 10]` runs - validation at the end of the 1st, 2nd, and 10th epochs. - class_weight: Dictionary mapping class indices to a weight for the class. - max_queue_size: Integer. Maximum size for the generator queue. If - unspecified, `max_queue_size` will default to 10. - workers: Integer. Maximum number of processes to spin up when using - process-based threading. If unspecified, `workers` will default to 1. If - 0, will execute the generator on the main thread. - use_multiprocessing: Boolean. If `True`, use process-based threading. If - unspecified, `use_multiprocessing` will default to `False`. Note that - because this implementation relies on multiprocessing, you should not - pass non-picklable arguments to the generator as they can't be passed - easily to children processes. - shuffle: Boolean. Whether to shuffle the order of the batches at the - beginning of each epoch. Only used with instances of `Sequence` - (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not - `None`. - initial_epoch: Epoch at which to start training (useful for resuming a - previous training run). - mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. - batch_size: Integer batch size or None if unknown. Will only be used if - `data` is in NumPy/Tensor format. - steps_name: The string name of the steps argument, either `steps`, - `validation_steps`, or `steps_per_epoch`. Only used for error message - formatting. - **kwargs: Additional arguments for backwards compatibility. `steps` is - accepted as an alias for `steps_per_epoch`. - - Returns: - - In TRAIN mode: `History` object. - - In TEST mode: Evaluation metrics. - - In PREDICT mode: Outputs of the Model called on inputs. - - Raises: - ValueError: in case of invalid arguments. - """ - if 'steps' in kwargs: - steps_per_epoch = kwargs['steps'] - - # Determine the number of steps per epoch and whether we should reset the - # dataset at the end of each epoch. 
- reset_dataset_after_each_epoch = False - original_dataset = None - is_dataset = isinstance(data, (tf.data.Dataset, tf.compat.v1.data.Dataset)) - if is_dataset: - original_dataset = data - if steps_per_epoch is None: - reset_dataset_after_each_epoch = True - steps_per_epoch = training_utils_v1.infer_steps_for_dataset( - model, data, steps_per_epoch, epochs=epochs, steps_name=steps_name) - - # Convert to a format that supports `next(generator)`. - generator, steps_per_epoch = convert_to_generator_like( - data, - steps_per_epoch=steps_per_epoch, - batch_size=batch_size, - epochs=epochs - initial_epoch, - shuffle=shuffle) - - do_validation = validation_data is not None - is_sequence = isinstance(generator, data_utils.Sequence) - _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers, - steps_per_epoch, validation_data, validation_steps, mode, - kwargs) - - batch_function = _make_execution_function( - model, mode, class_weight=class_weight) - - # Create the queue for the generator. - enqueuer = None - if not is_dataset: - generator, enqueuer = _make_enqueued_generator( - generator, - workers=workers, - use_multiprocessing=use_multiprocessing, - max_queue_size=max_queue_size, - shuffle=shuffle) - - num_samples_or_steps, use_steps = _get_num_samples_or_steps( - data, steps_per_epoch) - - count_mode = 'steps' if use_steps else 'samples' - callbacks = cbks.configure_callbacks( - callbacks, - model, - do_validation=do_validation, - epochs=epochs, - steps_per_epoch=steps_per_epoch, - batch_size=batch_size, - samples=num_samples_or_steps, - count_mode=count_mode, - verbose=verbose, - mode=mode) - - if mode == ModeKeys.PREDICT: - aggregator = training_utils_v1.OutputsAggregator( - True, steps=steps_per_epoch) - else: - aggregator = training_utils_v1.MetricsAggregator( - True, steps=steps_per_epoch) - - should_set_learning_phase = tf.executing_eagerly() and model.run_eagerly - if should_set_learning_phase: - learning_phase_scope = backend.eager_learning_phase_scope( - 1 if mode == ModeKeys.TRAIN else 0) - learning_phase_scope.__enter__() - - callbacks.model.stop_training = False - callbacks._call_begin_hook(mode) - - initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) - - for epoch in range(initial_epoch, epochs): - if callbacks.model.stop_training: - break - - # Setup work for each epoch. - model.reset_metrics() - epoch_logs = {} - if mode == ModeKeys.TRAIN: - callbacks.on_epoch_begin(epoch, epoch_logs) +def model_iteration( + model, + data, + steps_per_epoch=None, + epochs=1, + verbose=1, + callbacks=None, + validation_data=None, + validation_steps=None, + validation_freq=1, + class_weight=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + shuffle=False, + initial_epoch=0, + mode=ModeKeys.TRAIN, + batch_size=None, + steps_name="steps", + **kwargs, +): + """Loop function for arrays of data with modes TRAIN/TEST/PREDICT. + + Args: + model: Keras Model instance. + data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or + `(x, y, sample_weights)`) or a generator or + `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. + steps_per_epoch: Total number of steps (batches of samples) before + declaring one epoch finished and starting the next epoch. Ignored with + the default value of `None`. + epochs: Number of times to iterate over the data. + verbose: 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = one line per epoch. 
+ Note that the progress bar is not particularly useful when + logged to a file, so verbose=2 is recommended when not running + interactively (e.g., in a production environment). + callbacks: List of callbacks to be called during training. + validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or + `(x, y)` or `(x, y, sample_weights)`) or a generator or + `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. + validation_steps: Total number of steps (batches of samples) before + declaring validation finished. + validation_freq: Only relevant if validation data is provided. Integer + or `collections.abc.Container` instance (e.g. list, tuple, etc.). If + an integer, specifies how many training epochs to run before a new + validation run is performed, e.g. `validation_freq=2` runs validation + every 2 epochs. If a Container, specifies the epochs on which to run + validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the + end of the 1st, 2nd, and 10th epochs. + class_weight: Dictionary mapping class indices to a weight for the + class. + max_queue_size: Integer. Maximum size for the generator queue. If + unspecified, `max_queue_size` will default to 10. + workers: Integer. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default to 1. + If 0, will execute the generator on the main thread. + use_multiprocessing: Boolean. If `True`, use process-based threading. If + unspecified, `use_multiprocessing` will default to `False`. Note that + because this implementation relies on multiprocessing, you should not + pass non-pickleable arguments to the generator as they can't be passed + easily to child processes. + shuffle: Boolean. Whether to shuffle the order of the batches at the + beginning of each epoch. Only used with instances of `Sequence` + (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not + `None`. + initial_epoch: Epoch at which to start training (useful for resuming a + previous training run). + mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. + batch_size: Integer batch size or None if unknown. Will only be used if + `data` is in NumPy/Tensor format. + steps_name: The string name of the steps argument, either `steps`, + `validation_steps`, or `steps_per_epoch`. Only used for error message + formatting. + **kwargs: Additional arguments for backwards compatibility. `steps` is + accepted as an alias for `steps_per_epoch`. + + Returns: + - In TRAIN mode: `History` object. + - In TEST mode: Evaluation metrics. + - In PREDICT mode: Outputs of the Model called on inputs. + + Raises: + ValueError: in case of invalid arguments. + """ + if "steps" in kwargs: + steps_per_epoch = kwargs["steps"] + + # Determine the number of steps per epoch and whether we should reset the + # dataset at the end of each epoch. + reset_dataset_after_each_epoch = False + original_dataset = None + is_dataset = isinstance(data, (tf.data.Dataset, tf.compat.v1.data.Dataset)) + if is_dataset: + original_dataset = data + if steps_per_epoch is None: + reset_dataset_after_each_epoch = True + steps_per_epoch = training_utils_v1.infer_steps_for_dataset( + model, + data, + steps_per_epoch, + epochs=epochs, + steps_name=steps_name, + ) + + # Convert to a format that supports `next(generator)`. 
+ generator, steps_per_epoch = convert_to_generator_like( + data, + steps_per_epoch=steps_per_epoch, + batch_size=batch_size, + epochs=epochs - initial_epoch, + shuffle=shuffle, + ) + + do_validation = validation_data is not None + is_sequence = isinstance(generator, data_utils.Sequence) + _validate_arguments( + is_sequence, + is_dataset, + use_multiprocessing, + workers, + steps_per_epoch, + validation_data, + validation_steps, + mode, + kwargs, + ) + + batch_function = _make_execution_function( + model, mode, class_weight=class_weight + ) + + # Create the queue for the generator. + enqueuer = None + if not is_dataset: + generator, enqueuer = _make_enqueued_generator( + generator, + workers=workers, + use_multiprocessing=use_multiprocessing, + max_queue_size=max_queue_size, + shuffle=shuffle, + ) + + num_samples_or_steps, use_steps = _get_num_samples_or_steps( + data, steps_per_epoch + ) + + count_mode = "steps" if use_steps else "samples" + callbacks = cbks.configure_callbacks( + callbacks, + model, + do_validation=do_validation, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + batch_size=batch_size, + samples=num_samples_or_steps, + count_mode=count_mode, + verbose=verbose, + mode=mode, + ) - if steps_per_epoch is None: - # Loop over dataset until `OutOfRangeError` is raised. - target_steps = np.inf + if mode == ModeKeys.PREDICT: + aggregator = training_utils_v1.OutputsAggregator( + True, steps=steps_per_epoch + ) else: - # Loop over dataset for the specified number of steps. - target_steps = steps_per_epoch - - step = 0 - while step < target_steps: - batch_data = _get_next_batch(generator) - if batch_data is None: - if is_dataset: - # The dataset passed by the user ran out of batches. - # Now we know the cardinality of the dataset. - # If steps_per_epoch was specified, then running out of data is - # unexpected, so we stop training and inform the user. - if steps_per_epoch: - callbacks.model.stop_training = True - logging.warning( - 'Your dataset ran out of data; interrupting training. ' - 'Make sure that your dataset can generate at least ' - '`%s * epochs` batches (in this case, %d batches). ' - 'You may need to use the repeat() function when ' - 'building your dataset.' - % (steps_name, steps_per_epoch * epochs)) - elif step > 0: - steps_per_epoch = step - aggregator.steps = steps_per_epoch + aggregator = training_utils_v1.MetricsAggregator( + True, steps=steps_per_epoch + ) + + should_set_learning_phase = tf.executing_eagerly() and model.run_eagerly + if should_set_learning_phase: + learning_phase_scope = backend.eager_learning_phase_scope( + 1 if mode == ModeKeys.TRAIN else 0 + ) + learning_phase_scope.__enter__() + + callbacks.model.stop_training = False + callbacks._call_begin_hook(mode) + + initial_epoch = model._maybe_load_initial_epoch_from_ckpt( + initial_epoch, mode + ) + + for epoch in range(initial_epoch, epochs): + if callbacks.model.stop_training: + break + + # Setup work for each epoch. + model.reset_metrics() + epoch_logs = {} + if mode == ModeKeys.TRAIN: + callbacks.on_epoch_begin(epoch, epoch_logs) + + if steps_per_epoch is None: + # Loop over dataset until `OutOfRangeError` is raised. + target_steps = np.inf else: - # We ran out of batches while the user passed an iterator (legacy). - callbacks.model.stop_training = True - logging.warning( - 'Your dataset iterator ran out of data; ' - 'interrupting training. Make sure that your iterator ' - 'can generate at least `%s * epochs` ' - 'batches (in this case, %d batches). 
You may need to' - 'use the repeat() function when building your ' - 'dataset.' % (steps_name, steps_per_epoch * epochs)) - break - - # `batch_size` used for validation data if validation - # data is NumPy/EagerTensors. - batch_size = int(tf.nest.flatten(batch_data)[0].shape[0]) - - # Callbacks batch begin. - batch_logs = {'batch': step, 'size': batch_size} - callbacks._call_batch_hook(mode, 'begin', step, batch_logs) - - is_deferred = not model._is_compiled - batch_outs = batch_function(*batch_data) - if not isinstance(batch_outs, list): - batch_outs = [batch_outs] - - if step == 0: - aggregator.create(batch_outs) - - if is_deferred: - # Set callbacks params. We do this here when model is compiled only - # in the first iteration of this loop (deferred build scenario). - cbks.set_callback_parameters( - callbacks, - model, - do_validation=do_validation, - batch_size=batch_size, - epochs=epochs, - steps_per_epoch=steps_per_epoch, - samples=num_samples_or_steps, - verbose=verbose, - mode=mode) - - # Aggregate results. - aggregator.aggregate(batch_outs) - - # Callbacks batch end. - batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) - callbacks._call_batch_hook(mode, 'end', step, batch_logs) - step += 1 - - if callbacks.model.stop_training: - break - - aggregator.finalize() - results = aggregator.results - epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) - if len(results) == 1: - results = results[0] - - # Run the test loop every epoch during training. - if (do_validation and - training_utils_v1.should_run_validation(validation_freq, epoch) and - not callbacks.model.stop_training): - val_results = model_iteration( - model, - validation_data, - steps_per_epoch=validation_steps, - batch_size=batch_size, - class_weight=class_weight, - workers=workers, - use_multiprocessing=use_multiprocessing, - max_queue_size=max_queue_size, - callbacks=callbacks, - verbose=verbose, - mode=ModeKeys.TEST, - steps_name='validation_steps') - - if not isinstance(val_results, list): - val_results = [val_results] - epoch_logs = cbks.make_logs( - model, epoch_logs, val_results, mode, prefix='val_') + # Loop over dataset for the specified number of steps. + target_steps = steps_per_epoch + + step = 0 + while step < target_steps: + batch_data = _get_next_batch(generator) + if batch_data is None: + if is_dataset: + # The dataset passed by the user ran out of batches. Now we + # know the cardinality of the dataset. If steps_per_epoch + # was specified, then running out of data is unexpected, so + # we stop training and inform the user. + if steps_per_epoch: + callbacks.model.stop_training = True + logging.warning( + "Your dataset ran out of data; interrupting " + "training. Make sure that your dataset can " + "generate at least `%s * epochs` batches (in " + "this case, %d batches). You may need to use " + "the repeat() function when building your dataset." + % (steps_name, steps_per_epoch * epochs) + ) + elif step > 0: + steps_per_epoch = step + aggregator.steps = steps_per_epoch + else: + # We ran out of batches while the user passed an iterator + # (legacy). + callbacks.model.stop_training = True + logging.warning( + "Your dataset iterator ran out of data; " + "interrupting training. Make sure that your iterator " + "can generate at least `%s * epochs` " + "batches (in this case, %d batches). You may need to " + "use the repeat() function when building your " + "dataset." 
% (steps_name, steps_per_epoch * epochs) + ) + break + + # `batch_size` used for validation data if validation + # data is NumPy/EagerTensors. + batch_size = int(tf.nest.flatten(batch_data)[0].shape[0]) + + # Callbacks batch begin. + batch_logs = {"batch": step, "size": batch_size} + callbacks._call_batch_hook(mode, "begin", step, batch_logs) + + is_deferred = not model._is_compiled + batch_outs = batch_function(*batch_data) + if not isinstance(batch_outs, list): + batch_outs = [batch_outs] + + if step == 0: + aggregator.create(batch_outs) + + if is_deferred: + # Set callbacks params. We do this here when model is + # compiled only in the first iteration of this loop + # (deferred build scenario). + cbks.set_callback_parameters( + callbacks, + model, + do_validation=do_validation, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + samples=num_samples_or_steps, + verbose=verbose, + mode=mode, + ) + + # Aggregate results. + aggregator.aggregate(batch_outs) + + # Callbacks batch end. + batch_logs = cbks.make_logs( + model, batch_logs, batch_outs, mode + ) + callbacks._call_batch_hook(mode, "end", step, batch_logs) + step += 1 + + if callbacks.model.stop_training: + break + + aggregator.finalize() + results = aggregator.results + epoch_logs = cbks.make_logs(model, epoch_logs, results, mode) + if len(results) == 1: + results = results[0] + + # Run the test loop every epoch during training. + if ( + do_validation + and training_utils_v1.should_run_validation(validation_freq, epoch) + and not callbacks.model.stop_training + ): + val_results = model_iteration( + model, + validation_data, + steps_per_epoch=validation_steps, + batch_size=batch_size, + class_weight=class_weight, + workers=workers, + use_multiprocessing=use_multiprocessing, + max_queue_size=max_queue_size, + callbacks=callbacks, + verbose=verbose, + mode=ModeKeys.TEST, + steps_name="validation_steps", + ) + + if not isinstance(val_results, list): + val_results = [val_results] + epoch_logs = cbks.make_logs( + model, epoch_logs, val_results, mode, prefix="val_" + ) + + if mode == ModeKeys.TRAIN: + # Epochs only apply to `fit`. + callbacks.on_epoch_end(epoch, epoch_logs) + + # Recreate dataset iterator for the next epoch. + if reset_dataset_after_each_epoch and epoch < epochs - 1: + generator = tf.compat.v1.data.make_one_shot_iterator( + original_dataset + ) + + model._successful_loop_finish = True + callbacks._call_end_hook(mode) + + if enqueuer is not None: + enqueuer.stop() + + if should_set_learning_phase: + learning_phase_scope.__exit__(None, None, None) if mode == ModeKeys.TRAIN: - return model.history - return results + return model.history + return results # Maintain compatibility with the existing names. 
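# The aliases defined next pre-bind `mode` (and `shuffle` for the eval and
# predict variants) onto `model_iteration` via `functools.partial`, so
# `evaluate_generator(model, data, steps=5)` is the same call as
# `model_iteration(model, data, steps=5, mode=ModeKeys.TEST, shuffle=False)`.
# A self-contained illustration of the pattern (hypothetical names):
import functools

def run(data, mode="train", shuffle=True):
    return mode, shuffle, data

run_eval = functools.partial(run, mode="test", shuffle=False)
assert run_eval([1, 2]) == ("test", False, [1, 2])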
fit_generator = functools.partial(model_iteration, mode=ModeKeys.TRAIN) evaluate_generator = functools.partial( - model_iteration, mode=ModeKeys.TEST, shuffle=False) + model_iteration, mode=ModeKeys.TEST, shuffle=False +) predict_generator = functools.partial( - model_iteration, mode=ModeKeys.PREDICT, shuffle=False) + model_iteration, mode=ModeKeys.PREDICT, shuffle=False +) def _get_next_batch(generator): - """Retrieves the next batch of input data.""" - try: - generator_output = next(generator) - except (StopIteration, tf.errors.OutOfRangeError): - return None - - if not isinstance(generator_output, tuple): - # Always wrap in a tuple. - generator_output = (generator_output,) - if len(generator_output) not in [1, 2, 3]: - raise ValueError( - 'Output of generator should be a tuple of 1 or 2 or 3 ' - 'elements: (input,) or (input, target) or ' - '(input, target, sample_weights). Received {}'.format(generator_output)) - return generator_output - - -def _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers, - steps_per_epoch, validation_data, validation_steps, - mode, kwargs): - """Raises errors if arguments are invalid. - - Args: - is_sequence: Boolean, whether data is a `keras.utils.data_utils.Sequence` - instance. - is_dataset: Boolean, whether data is a dataset instance. - use_multiprocessing: Boolean. If `True`, use process-based threading. If - unspecified, `use_multiprocessing` will default to `False`. Note that - because this implementation relies on multiprocessing, you should not pass - non-picklable arguments to the generator as they can't be passed easily to - children processes. - workers: Integer. Maximum number of processes to spin up when using - process-based threading. If unspecified, `workers` will default to 1. If - 0, will execute the generator on the main thread. - steps_per_epoch: Total number of steps (batches of samples) before declaring - one epoch finished and starting the next epoch. Ignored with the default - value of `None`. - validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, - y)` or `(x, y, sample_weights)`) or a generator or - `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. - validation_steps: Total number of steps (batches of samples) before - declaring validation finished. - mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. - kwargs: Additional arguments for backwards compatibility. - - Raises: - ValueError: If `steps_per_epoch` or `validation_steps` are not passed - for data types that require them, or if unrecognized keyword - arguments are passed. - """ - if not is_sequence and use_multiprocessing and workers > 1: - logging.warning( - UserWarning('Using a generator with `use_multiprocessing=True`' - ' and multiple workers may duplicate your data.' 
- ' Please consider using the `keras.utils.Sequence`' - ' class.')) - - if steps_per_epoch is None and not is_dataset: - arg_name = 'steps_per_epoch' if mode == ModeKeys.TRAIN else 'steps' - raise ValueError('Please specify the number of steps via the ' - '`{}` argument.'.format(arg_name)) - - val_gen = ( - data_utils.is_generator_or_sequence(validation_data) or - isinstance(validation_data, tf.data.Iterator)) - if (val_gen and not isinstance(validation_data, data_utils.Sequence) and - not validation_steps): - raise ValueError('Please specify the `validation_steps` argument.') - - if any(k != 'steps' for k in kwargs): - raise ValueError('Invalid arguments passed: {}'.format( - [k for k in kwargs if k != 'steps'])) - - -def convert_to_generator_like(data, - batch_size=None, - steps_per_epoch=None, - epochs=1, - shuffle=False): - """Make a generator out of NumPy or EagerTensor inputs. - - Args: - data: Either a generator or `keras.utils.data_utils.Sequence` object or - `Dataset`, `Iterator`, or a {1,2,3}-tuple of NumPy arrays or EagerTensors. - If a tuple, the elements represent `(x, y, sample_weights)` and may be - `None` or `[None]`. - batch_size: Used when creating a generator out of tuples of NumPy arrays or - EagerTensors. - steps_per_epoch: Steps of the generator to run each epoch. If `None` the - number of steps will be read from the data (for - `keras.utils.data_utils.Sequence` types). - epochs: Total number of epochs to run. - shuffle: Whether the data should be shuffled. - - Returns: - - Generator, `keras.utils.data_utils.Sequence`, or `Iterator`. - - Raises: - - ValueError: If `batch_size` is not provided for NumPy or EagerTensor - inputs. - """ - if isinstance(data, tuple): - # Scrub `Nones` that might have been passed for `targets`, `sample_weights`. - data = tuple( - ele for ele in data if not all(e is None for e in tf.nest.flatten(ele))) - - if data_utils.is_generator_or_sequence(data) or isinstance( - data, tf.data.Iterator): - if isinstance(data, data_utils.Sequence): - if steps_per_epoch is None: - steps_per_epoch = len(data) - return data, steps_per_epoch - if isinstance(data, tf.data.Dataset): - return tf.compat.v1.data.make_one_shot_iterator(data), steps_per_epoch - - # Create generator from NumPy or EagerTensor Input. - num_samples = int(tf.nest.flatten(data)[0].shape[0]) - if batch_size is None: - raise ValueError( - 'When passing input data as arrays, do not specify ' - '`steps_per_epoch`/`steps` argument. 
Please use `batch_size` instead.') - steps_per_epoch = int(math.ceil(num_samples / batch_size)) - - def _gen(data): - """Makes a generator out of a structure of NumPy/EagerTensors.""" - index_array = np.arange(num_samples) - for _ in range(epochs): - if shuffle: - np.random.shuffle(index_array) - batches = generic_utils.make_batches(num_samples, batch_size) - for (batch_start, batch_end) in batches: - batch_ids = index_array[batch_start:batch_end] - flat_batch_data = training_utils.slice_arrays( - tf.nest.flatten(data), batch_ids, contiguous=(not shuffle)) - yield tf.nest.pack_sequence_as(data, flat_batch_data) - - return _gen(data), steps_per_epoch - - -def _make_enqueued_generator(generator, - workers=1, - use_multiprocessing=False, - max_queue_size=10, - shuffle=False): - """Create a buffered queue of next elements of the generator.""" - is_sequence = isinstance(generator, data_utils.Sequence) - enqueuer = None - if workers > 0: - if is_sequence: - enqueuer = data_utils.OrderedEnqueuer( - generator, use_multiprocessing=use_multiprocessing, shuffle=shuffle) - else: - enqueuer = data_utils.GeneratorEnqueuer( - generator, use_multiprocessing=use_multiprocessing) - enqueuer.start(workers=workers, max_queue_size=max_queue_size) - output_generator = enqueuer.get() - else: - if is_sequence: - output_generator = data_utils.iter_sequence_infinite(generator) + """Retrieves the next batch of input data.""" + try: + generator_output = next(generator) + except (StopIteration, tf.errors.OutOfRangeError): + return None + + if not isinstance(generator_output, tuple): + # Always wrap in a tuple. + generator_output = (generator_output,) + if len(generator_output) not in [1, 2, 3]: + raise ValueError( + "Output of generator should be a tuple of 1 or 2 or 3 " + "elements: (input,) or (input, target) or " + "(input, target, sample_weights). Received {}".format( + generator_output + ) + ) + return generator_output + + +def _validate_arguments( + is_sequence, + is_dataset, + use_multiprocessing, + workers, + steps_per_epoch, + validation_data, + validation_steps, + mode, + kwargs, +): + """Raises errors if arguments are invalid. + + Args: + is_sequence: Boolean, whether data is a `keras.utils.data_utils.Sequence` + instance. + is_dataset: Boolean, whether data is a dataset instance. + use_multiprocessing: Boolean. If `True`, use process-based threading. If + unspecified, `use_multiprocessing` will default to `False`. Note that + because this implementation relies on multiprocessing, you should not + pass non-pickleable arguments to the generator as they can't be passed + easily to child processes. + workers: Integer. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default to 1. If + 0, will execute the generator on the main thread. + steps_per_epoch: Total number of steps (batches of samples) before + declaring one epoch finished and starting the next epoch. Ignored with + the default value of `None`. + validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or + `(x, y)` or `(x, y, sample_weights)`) or a generator or + `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset. + validation_steps: Total number of steps (batches of samples) before + declaring validation finished. + mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. + kwargs: Additional arguments for backwards compatibility. 
+ + Raises: + ValueError: If `steps_per_epoch` or `validation_steps` are not passed + for data types that require them, or if unrecognized keyword + arguments are passed. + """ + if not is_sequence and use_multiprocessing and workers > 1: + logging.warning( + UserWarning( + "Using a generator with `use_multiprocessing=True`" + " and multiple workers may duplicate your data." + " Please consider using the `keras.utils.Sequence`" + " class." + ) + ) + + if steps_per_epoch is None and not is_dataset: + arg_name = "steps_per_epoch" if mode == ModeKeys.TRAIN else "steps" + raise ValueError( + f"Please specify the number of steps via the `{arg_name}` argument." + ) + + val_gen = data_utils.is_generator_or_sequence( + validation_data + ) or isinstance(validation_data, tf.data.Iterator) + if ( + val_gen + and not isinstance(validation_data, data_utils.Sequence) + and not validation_steps + ): + raise ValueError("Please specify the `validation_steps` argument.") + + if any(k != "steps" for k in kwargs): + raise ValueError( + f"Invalid arguments passed: {[k for k in kwargs if k != 'steps']}" + ) + + +def convert_to_generator_like( + data, batch_size=None, steps_per_epoch=None, epochs=1, shuffle=False +): + """Make a generator out of NumPy or EagerTensor inputs. + + Args: + data: Either a generator or `keras.utils.data_utils.Sequence` object or + `Dataset`, `Iterator`, or a {1,2,3}-tuple of NumPy arrays or + EagerTensors. If a tuple, the elements represent `(x, y, + sample_weights)` and may be `None` or `[None]`. + batch_size: Used when creating a generator out of tuples of NumPy arrays + or EagerTensors. + steps_per_epoch: Steps of the generator to run each epoch. If `None` the + number of steps will be read from the data (for + `keras.utils.data_utils.Sequence` types). + epochs: Total number of epochs to run. + shuffle: Whether the data should be shuffled. + + Returns: + - Generator, `keras.utils.data_utils.Sequence`, or `Iterator`. + + Raises: + - ValueError: If `batch_size` is not provided for NumPy or EagerTensor + inputs. + """ + if isinstance(data, tuple): + # Scrub `Nones` that might have been passed for `targets`, + # `sample_weights`. + data = tuple( + ele + for ele in data + if not all(e is None for e in tf.nest.flatten(ele)) + ) + + if data_utils.is_generator_or_sequence(data) or isinstance( + data, tf.data.Iterator + ): + if isinstance(data, data_utils.Sequence): + if steps_per_epoch is None: + steps_per_epoch = len(data) + return data, steps_per_epoch + if isinstance(data, tf.data.Dataset): + return tf.compat.v1.data.make_one_shot_iterator(data), steps_per_epoch + + # Create generator from NumPy or EagerTensor Input. + num_samples = int(tf.nest.flatten(data)[0].shape[0]) + if batch_size is None: + raise ValueError( + "When passing input data as arrays, do not specify " + "`steps_per_epoch`/`steps` argument. " + "Please use `batch_size` instead." 
+ ) + steps_per_epoch = int(math.ceil(num_samples / batch_size)) + + def _gen(data): + """Makes a generator out of a structure of NumPy/EagerTensors.""" + index_array = np.arange(num_samples) + for _ in range(epochs): + if shuffle: + np.random.shuffle(index_array) + batches = generic_utils.make_batches(num_samples, batch_size) + for batch_start, batch_end in batches: + batch_ids = index_array[batch_start:batch_end] + flat_batch_data = training_utils.slice_arrays( + tf.nest.flatten(data), batch_ids, contiguous=(not shuffle) + ) + yield tf.nest.pack_sequence_as(data, flat_batch_data) + + return _gen(data), steps_per_epoch + + +def _make_enqueued_generator( + generator, + workers=1, + use_multiprocessing=False, + max_queue_size=10, + shuffle=False, +): + """Create a buffered queue of next elements of the generator.""" + is_sequence = isinstance(generator, data_utils.Sequence) + enqueuer = None + if workers > 0: + if is_sequence: + enqueuer = data_utils.OrderedEnqueuer( + generator, + use_multiprocessing=use_multiprocessing, + shuffle=shuffle, + ) + else: + enqueuer = data_utils.GeneratorEnqueuer( + generator, use_multiprocessing=use_multiprocessing + ) + enqueuer.start(workers=workers, max_queue_size=max_queue_size) + output_generator = enqueuer.get() else: - output_generator = generator - return output_generator, enqueuer + if is_sequence: + output_generator = data_utils.iter_sequence_infinite(generator) + else: + output_generator = generator + return output_generator, enqueuer def _make_execution_function(model, mode, class_weight=None): - """Makes function to run one step of model execution.""" - if mode == ModeKeys.TRAIN: - f = functools.partial(model.train_on_batch, class_weight=class_weight) - elif mode == ModeKeys.TEST: - f = model.test_on_batch - else: - # Match signature of other modes to allow - # 1, 2, or 3-tuples from generator - def predict_on_batch(x, y=None, sample_weights=None): # pylint: disable=unused-argument - return model.predict_on_batch(x) + """Makes function to run one step of model execution.""" + if mode == ModeKeys.TRAIN: + f = functools.partial(model.train_on_batch, class_weight=class_weight) + elif mode == ModeKeys.TEST: + f = model.test_on_batch + else: + # Match signature of other modes to allow + # 1, 2, or 3-tuples from generator + def predict_on_batch(x, y=None, sample_weights=None): + return model.predict_on_batch(x) - f = predict_on_batch + f = predict_on_batch - # Maintain stateful metrics across batch-level calls. - if mode != ModeKeys.PREDICT: - f = functools.partial(f, reset_metrics=False) + # Maintain stateful metrics across batch-level calls. + if mode != ModeKeys.PREDICT: + f = functools.partial(f, reset_metrics=False) - return f + return f def _get_num_samples_or_steps(data, steps_per_epoch): - """Returns number of samples or steps, and whether to use steps count mode.""" - flat_inputs = tf.nest.flatten(data) - if hasattr(flat_inputs[0], 'shape'): - return int(flat_inputs[0].shape[0]), False - return steps_per_epoch, True + """Returns number of samples or steps, and whether to use steps count + mode.""" + flat_inputs = tf.nest.flatten(data) + if hasattr(flat_inputs[0], "shape"): + return int(flat_inputs[0].shape[0]), False + return steps_per_epoch, True class GeneratorOrSequenceTrainingLoop(training_utils_v1.TrainingLoop): - """Generator-like. - - Input is Python generator, or Sequence object. 
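# Usage sketch for `convert_to_generator_like` above with array inputs
# (illustrative shapes; assumes the helper is imported from this module):
import math
import numpy as np

x = np.ones((10, 4))
y = np.ones((10, 1))
gen, steps = convert_to_generator_like((x, y), batch_size=2)
assert steps == math.ceil(10 / 2)  # ceil(num_samples / batch_size) == 5
x_batch, y_batch = next(gen)  # contiguous slice of 2 samples (no shuffle)
assert x_batch.shape == (2, 4) and y_batch.shape == (2, 1)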
- - The difference between this class and `GeneratorLikeTrainingFunction` is that - this class only handles inputs that with x, y and sample_weight fused into one - param. - """ - - def fit(self, - model, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - model._validate_or_infer_batch_size(batch_size, steps_per_epoch, x) - training_utils_v1.check_generator_arguments( - y, sample_weight, validation_split=validation_split) - return fit_generator( + """Generator-like. + + Input is a Python generator or a Sequence object. + + The difference between this class and `GeneratorLikeTrainingFunction` is + that this class only handles inputs with x, y, and sample_weight fused + into one param. + """ + + def fit( + self, model, - x, - steps_per_epoch=steps_per_epoch, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - validation_data=validation_data, - validation_steps=validation_steps, - validation_freq=validation_freq, - class_weight=class_weight, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - shuffle=shuffle, - initial_epoch=initial_epoch, - steps_name='steps_per_epoch') - - def evaluate(self, - model, - x=None, - y=None, - batch_size=None, - verbose=1, - sample_weight=None, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - model._validate_or_infer_batch_size(batch_size, steps, x) - training_utils_v1.check_generator_arguments(y, sample_weight) - return evaluate_generator( + x=None, + y=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + model._validate_or_infer_batch_size(batch_size, steps_per_epoch, x) + training_utils_v1.check_generator_arguments( + y, sample_weight, validation_split=validation_split + ) + return fit_generator( + model, + x, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=validation_steps, + validation_freq=validation_freq, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + shuffle=shuffle, + initial_epoch=initial_epoch, + steps_name="steps_per_epoch", + ) + + def evaluate( + self, model, - x, - steps=steps, - verbose=verbose, - callbacks=callbacks, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - - def predict(self, - model, - x, - batch_size=None, - verbose=0, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - model._validate_or_infer_batch_size(batch_size, steps, x) - return predict_generator( + x=None, + y=None, + batch_size=None, + verbose=1, + sample_weight=None, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + model._validate_or_infer_batch_size(batch_size, steps, x) + training_utils_v1.check_generator_arguments(y, sample_weight) + return evaluate_generator( + model, + x, + steps=steps, 
+ verbose=verbose, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + ) + + def predict( + self, model, x, - steps=steps, - verbose=verbose, - callbacks=callbacks, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) + batch_size=None, + verbose=0, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + model._validate_or_infer_batch_size(batch_size, steps, x) + return predict_generator( + model, + x, + steps=steps, + verbose=verbose, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + ) class EagerDatasetOrIteratorTrainingLoop(training_utils_v1.TrainingLoop): - """A non-distributed Dataset or iterator in eager execution.""" - - def fit(self, - model, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - **kwargs): - model._validate_or_infer_batch_size(batch_size, steps_per_epoch, x) - # Make sure that y, sample_weights, validation_split are not passed. - training_utils_v1.validate_dataset_input(x, y, sample_weight, - validation_split) - if (isinstance(x, (tf.compat.v1.data.Dataset, tf.data.Dataset)) and - shuffle): - training_utils_v1.verify_dataset_shuffled(x) - - return fit_generator( + """A non-distributed Dataset or iterator in eager execution.""" + + def fit( + self, + model, + x=None, + y=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + **kwargs, + ): + model._validate_or_infer_batch_size(batch_size, steps_per_epoch, x) + # Make sure that y, sample_weights, validation_split are not passed. + training_utils_v1.validate_dataset_input( + x, y, sample_weight, validation_split + ) + if ( + isinstance(x, (tf.compat.v1.data.Dataset, tf.data.Dataset)) + and shuffle + ): + training_utils_v1.verify_dataset_shuffled(x) + + return fit_generator( + model, + x, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=validation_steps, + validation_freq=validation_freq, + class_weight=class_weight, + workers=0, + shuffle=shuffle, + initial_epoch=initial_epoch, + steps_name="steps_per_epoch", + ) + + def evaluate( + self, + model, + x=None, + y=None, + batch_size=None, + verbose=1, + sample_weight=None, + steps=None, + callbacks=None, + **kwargs, + ): + model._validate_or_infer_batch_size(batch_size, steps, x) + # Make sure that y, sample_weights, validation_split are not passed. 
+ training_utils_v1.validate_dataset_input(x, y, sample_weight) + return evaluate_generator( + model, + x, + steps=steps, + verbose=verbose, + workers=0, + callbacks=callbacks, + ) + + def predict( + self, model, x, - steps_per_epoch=steps_per_epoch, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - validation_data=validation_data, - validation_steps=validation_steps, - validation_freq=validation_freq, - class_weight=class_weight, - workers=0, - shuffle=shuffle, - initial_epoch=initial_epoch, - steps_name='steps_per_epoch') - - def evaluate(self, - model, - x=None, - y=None, - batch_size=None, - verbose=1, - sample_weight=None, - steps=None, - callbacks=None, - **kwargs): - model._validate_or_infer_batch_size(batch_size, steps, x) - # Make sure that y, sample_weights, validation_split are not passed. - training_utils_v1.validate_dataset_input(x, y, sample_weight) - return evaluate_generator( - model, x, steps=steps, verbose=verbose, workers=0, callbacks=callbacks) - - def predict(self, - model, - x, - batch_size=None, - verbose=0, - steps=None, - callbacks=None, - **kwargs): - model._validate_or_infer_batch_size(batch_size, steps, x) - return predict_generator( - model, x, steps=steps, verbose=verbose, workers=0, callbacks=callbacks) + batch_size=None, + verbose=0, + steps=None, + callbacks=None, + **kwargs, + ): + model._validate_or_infer_batch_size(batch_size, steps, x) + return predict_generator( + model, + x, + steps=steps, + verbose=verbose, + workers=0, + callbacks=callbacks, + ) class GeneratorLikeTrainingLoop(training_utils_v1.TrainingLoop): - """TrainingLoop that handle inputs like python generator. - - This is the default handler for most of the input data types, includes - symbolic tensors or Numpy array-like, Datasets and iterators in graph mode - (since they generate symbolic tensors). This Function is used to handle model - with `run_eagerly` = True. - """ - - def fit(self, - model, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - **kwargs): - batch_size = model._validate_or_infer_batch_size(batch_size, - steps_per_epoch, x) - x, y, sample_weights = model._standardize_user_data( - x, - y, - sample_weight=sample_weight, - class_weight=class_weight, - batch_size=batch_size, - check_steps=True, - steps_name='steps_per_epoch', - steps=steps_per_epoch, - validation_split=validation_split, - shuffle=shuffle) - - if validation_data: - validation_data = model._prepare_validation_data(validation_data, - batch_size, - validation_steps) - elif validation_split and 0. < validation_split < 1.: - (x, y, sample_weights, val_x, val_y, - val_sample_weights) = ( - training_utils_v1.split_training_and_validation_data( - x, y, sample_weights, validation_split)) - validation_data = (val_x, val_y, val_sample_weights) - else: - if validation_steps: - raise ValueError('`validation_steps` should not be specified if ' - '`validation_data` is None.') + """TrainingLoop that handles inputs like a Python generator.
- return fit_generator( - model, (x, y, sample_weights), - steps_per_epoch=steps_per_epoch, - batch_size=batch_size, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - validation_data=validation_data, - validation_steps=validation_steps, - validation_freq=validation_freq, - workers=0, - shuffle=shuffle, - initial_epoch=initial_epoch, - steps_name='steps_per_epoch') - - def evaluate(self, - model, - x=None, - y=None, - batch_size=None, - verbose=1, - sample_weight=None, - steps=None, - callbacks=None, - **kwargs): - batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) - x, y, sample_weights = model._standardize_user_data( - x, - y, - sample_weight=sample_weight, - batch_size=batch_size, - check_steps=True, - steps_name='steps', - steps=steps) - return evaluate_generator( - model, (x, y, sample_weights), - steps=steps, - batch_size=batch_size, - verbose=verbose, - workers=0, - callbacks=callbacks) - - def predict(self, - model, - x, - batch_size=None, - verbose=0, - steps=None, - callbacks=None, - **kwargs): - batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) - x, _, _ = model._standardize_user_data( - x, check_steps=True, steps_name='steps', steps=steps) - return predict_generator( + This is the default handler for most of the input data types, including + symbolic tensors or Numpy array-likes, Datasets and iterators in graph mode + (since they generate symbolic tensors). This function is used to handle + models with `run_eagerly` = True. + """ + + def fit( + self, + model, + x=None, + y=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + **kwargs, + ): + batch_size = model._validate_or_infer_batch_size( + batch_size, steps_per_epoch, x + ) + x, y, sample_weights = model._standardize_user_data( + x, + y, + sample_weight=sample_weight, + class_weight=class_weight, + batch_size=batch_size, + check_steps=True, + steps_name="steps_per_epoch", + steps=steps_per_epoch, + validation_split=validation_split, + shuffle=shuffle, + ) + + if validation_data: + validation_data = model._prepare_validation_data( + validation_data, batch_size, validation_steps + ) + elif validation_split and 0.0 < validation_split < 1.0: + ( + x, + y, + sample_weights, + val_x, + val_y, + val_sample_weights, + ) = training_utils_v1.split_training_and_validation_data( + x, y, sample_weights, validation_split + ) + validation_data = (val_x, val_y, val_sample_weights) + else: + if validation_steps: + raise ValueError( + "`validation_steps` should not be specified if " + "`validation_data` is None."
+ ) + + return fit_generator( + model, + (x, y, sample_weights), + steps_per_epoch=steps_per_epoch, + batch_size=batch_size, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=validation_steps, + validation_freq=validation_freq, + workers=0, + shuffle=shuffle, + initial_epoch=initial_epoch, + steps_name="steps_per_epoch", + ) + + def evaluate( + self, + model, + x=None, + y=None, + batch_size=None, + verbose=1, + sample_weight=None, + steps=None, + callbacks=None, + **kwargs, + ): + batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) + x, y, sample_weights = model._standardize_user_data( + x, + y, + sample_weight=sample_weight, + batch_size=batch_size, + check_steps=True, + steps_name="steps", + steps=steps, + ) + return evaluate_generator( + model, + (x, y, sample_weights), + steps=steps, + batch_size=batch_size, + verbose=verbose, + workers=0, + callbacks=callbacks, + ) + + def predict( + self, model, x, - steps=steps, - batch_size=batch_size, - verbose=verbose, - workers=0, - callbacks=callbacks) + batch_size=None, + verbose=0, + steps=None, + callbacks=None, + **kwargs, + ): + batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) + x, _, _ = model._standardize_user_data( + x, check_steps=True, steps_name="steps", steps=steps + ) + return predict_generator( + model, + x, + steps=steps, + batch_size=batch_size, + verbose=verbose, + workers=0, + callbacks=callbacks, + ) diff --git a/keras/engine/training_gpu_test.py b/keras/engine/training_gpu_test.py index 0972670f9105..cfa3eb5b394c 100644 --- a/keras/engine/training_gpu_test.py +++ b/keras/engine/training_gpu_test.py @@ -14,113 +14,151 @@ # ============================================================================== """Tests for training routines.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np + from keras import backend -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.engine import input_layer from keras.engine import training from keras.layers.convolutional import Conv2D +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils class TrainingGPUTest(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_model_with_crossentropy_losses_channels_first(self): - """Tests use of all crossentropy losses with `channels_first`. - - Tests `sparse_categorical_crossentropy`, `categorical_crossentropy`, - and `binary_crossentropy`. - Verifies that evaluate gives the same result with either `channels_first` - or `channels_last` image_data_format. 
- """ - def prepare_simple_model(input_tensor, loss_name, target): - axis = 1 if backend.image_data_format() == 'channels_first' else -1 - loss = None - num_channels = None - activation = None - if loss_name == 'sparse_categorical_crossentropy': - loss = lambda y_true, y_pred: backend.sparse_categorical_crossentropy( # pylint: disable=g-long-lambda - y_true, y_pred, axis=axis) - num_channels = int(np.amax(target) + 1) - activation = 'softmax' - elif loss_name == 'categorical_crossentropy': - loss = lambda y_true, y_pred: backend.categorical_crossentropy( # pylint: disable=g-long-lambda - y_true, y_pred, axis=axis) - num_channels = target.shape[axis] - activation = 'softmax' - elif loss_name == 'binary_crossentropy': - loss = lambda y_true, y_pred: backend.binary_crossentropy( # pylint: disable=g-long-lambda, unnecessary-lambda - y_true, y_pred) - num_channels = target.shape[axis] - activation = 'sigmoid' - - predictions = Conv2D(num_channels, - 1, - activation=activation, - kernel_initializer='ones', - bias_initializer='ones')(input_tensor) - simple_model = training.Model(inputs=input_tensor, outputs=predictions) - simple_model.compile(optimizer='rmsprop', loss=loss) - return simple_model - - if tf.test.is_gpu_available(cuda_only=True): - with test_utils.use_gpu(): - losses_to_test = ['sparse_categorical_crossentropy', - 'categorical_crossentropy', 'binary_crossentropy'] - - data_channels_first = np.array([[[[8., 7.1, 0.], [4.5, 2.6, 0.55], - [0.9, 4.2, 11.2]]]], dtype=np.float32) - # Labels for testing 4-class sparse_categorical_crossentropy, 4-class - # categorical_crossentropy, and 2-class binary_crossentropy: - labels_channels_first = [np.array([[[[0, 1, 3], [2, 1, 0], [2, 2, 1]]]], dtype=np.float32), # pylint: disable=line-too-long - np.array([[[[0, 1, 0], [0, 1, 0], [0, 0, 0]], - [[1, 0, 0], [0, 0, 1], [0, 1, 0]], - [[0, 0, 0], [1, 0, 0], [0, 0, 1]], - [[0, 0, 1], [0, 0, 0], [1, 0, 0]]]], dtype=np.float32), # pylint: disable=line-too-long - np.array([[[[0, 1, 0], [0, 1, 0], [0, 0, 1]], - [[1, 0, 1], [1, 0, 1], [1, 1, 0]]]], dtype=np.float32)] # pylint: disable=line-too-long - # Compute one loss for each loss function in the list `losses_to_test`: - loss_channels_last = [0., 0., 0.] - loss_channels_first = [0., 0., 0.] 
- - old_data_format = backend.image_data_format() - - # Evaluate a simple network with channels last, with all three loss - # functions: - backend.set_image_data_format('channels_last') - data = np.moveaxis(data_channels_first, 1, -1) - for index, loss_function in enumerate(losses_to_test): - labels = np.moveaxis(labels_channels_first[index], 1, -1) - inputs = input_layer.Input(shape=(3, 3, 1)) - model = prepare_simple_model(inputs, loss_function, labels) - loss_channels_last[index] = model.evaluate(x=data, y=labels, - batch_size=1, verbose=0) - - # Evaluate the same network with channels first, with all three loss - # functions: - backend.set_image_data_format('channels_first') - data = data_channels_first - for index, loss_function in enumerate(losses_to_test): - labels = labels_channels_first[index] - inputs = input_layer.Input(shape=(1, 3, 3)) - model = prepare_simple_model(inputs, loss_function, labels) - loss_channels_first[index] = model.evaluate(x=data, y=labels, - batch_size=1, verbose=0) - - backend.set_image_data_format(old_data_format) - - np.testing.assert_allclose( - loss_channels_first, - loss_channels_last, - rtol=1e-06, - err_msg='{}{}'.format('Computed different losses for ', - 'channels_first and channels_last')) - - -if __name__ == '__main__': - tf.test.main() + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_model_with_crossentropy_losses_channels_first(self): + """Tests use of all crossentropy losses with `channels_first`. + + Tests `sparse_categorical_crossentropy`, `categorical_crossentropy`, + and `binary_crossentropy`. + Verifies that evaluate gives the same result with either + `channels_first` or `channels_last` image_data_format. + """ + + def prepare_simple_model(input_tensor, loss_name, target): + axis = 1 if backend.image_data_format() == "channels_first" else -1 + loss = None + num_channels = None + activation = None + if loss_name == "sparse_categorical_crossentropy": + loss = lambda y_true, y_pred: backend.sparse_categorical_crossentropy( # noqa: E501 + y_true, y_pred, axis=axis + ) + num_channels = int(np.amax(target) + 1) + activation = "softmax" + elif loss_name == "categorical_crossentropy": + loss = lambda y_true, y_pred: backend.categorical_crossentropy( + y_true, y_pred, axis=axis + ) + num_channels = target.shape[axis] + activation = "softmax" + elif loss_name == "binary_crossentropy": + loss = lambda y_true, y_pred: backend.binary_crossentropy( + y_true, y_pred + ) + num_channels = target.shape[axis] + activation = "sigmoid" + + predictions = Conv2D( + num_channels, + 1, + activation=activation, + kernel_initializer="ones", + bias_initializer="ones", + )(input_tensor) + simple_model = training.Model( + inputs=input_tensor, outputs=predictions + ) + simple_model.compile(optimizer="rmsprop", loss=loss) + return simple_model + + if tf.test.is_gpu_available(cuda_only=True): + with test_utils.use_gpu(): + losses_to_test = [ + "sparse_categorical_crossentropy", + "categorical_crossentropy", + "binary_crossentropy", + ] + + data_channels_first = np.array( + [[[[8.0, 7.1, 0.0], [4.5, 2.6, 0.55], [0.9, 4.2, 11.2]]]], + dtype=np.float32, + ) + # Labels for testing 4-class sparse_categorical_crossentropy, + # 4-class categorical_crossentropy, and 2-class + # binary_crossentropy: + labels_channels_first = [ + np.array( + [[[[0, 1, 3], [2, 1, 0], [2, 2, 1]]]], dtype=np.float32 + ), + np.array( + [ + [ + [[0, 1, 0], [0, 1, 0], [0, 0, 0]], + [[1, 0, 0], [0, 0, 1], [0, 1, 0]], + [[0, 0, 0], [1, 0, 0], [0, 0, 1]], 
+ [[0, 0, 1], [0, 0, 0], [1, 0, 0]], + ] + ], + dtype=np.float32, + ), + np.array( + [ + [ + [[0, 1, 0], [0, 1, 0], [0, 0, 1]], + [[1, 0, 1], [1, 0, 1], [1, 1, 0]], + ] + ], + dtype=np.float32, + ), + ] + # Compute one loss for each loss function in the list + # `losses_to_test`: + loss_channels_last = [0.0, 0.0, 0.0] + loss_channels_first = [0.0, 0.0, 0.0] + + old_data_format = backend.image_data_format() + + # Evaluate a simple network with channels last, with all three + # loss functions: + backend.set_image_data_format("channels_last") + data = np.moveaxis(data_channels_first, 1, -1) + for index, loss_function in enumerate(losses_to_test): + labels = np.moveaxis(labels_channels_first[index], 1, -1) + inputs = input_layer.Input(shape=(3, 3, 1)) + model = prepare_simple_model(inputs, loss_function, labels) + loss_channels_last[index] = model.evaluate( + x=data, y=labels, batch_size=1, verbose=0 + ) + + # Evaluate the same network with channels first, with all three + # loss functions: + backend.set_image_data_format("channels_first") + data = data_channels_first + for index, loss_function in enumerate(losses_to_test): + labels = labels_channels_first[index] + inputs = input_layer.Input(shape=(1, 3, 3)) + model = prepare_simple_model(inputs, loss_function, labels) + loss_channels_first[index] = model.evaluate( + x=data, y=labels, batch_size=1, verbose=0 + ) + + backend.set_image_data_format(old_data_format) + + np.testing.assert_allclose( + loss_channels_first, + loss_channels_last, + rtol=1e-06, + err_msg="{}{}".format( + "Computed different losses for ", + "channels_first and channels_last", + ), + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/training_integration_test.py b/keras/engine/training_integration_test.py index f3516718ad12..8b6050c396bc 100644 --- a/keras/engine/training_integration_test.py +++ b/keras/engine/training_integration_test.py @@ -14,13 +14,12 @@ # ============================================================================== """End-to-end tests for a variety of small models.""" -import tensorflow.compat.v2 as tf - import collections import itertools -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras from keras.testing_infra import test_combinations @@ -28,8 +27,8 @@ def _conv2d_filter(**kwargs): - """Convolution with non-default strides and dilation rate is not supported.""" - return kwargs['strides'] <= 1 or kwargs['dilation_rate'] <= 1 + """Conv with non-default strides and dilation rate is not supported.""" + return kwargs["strides"] <= 1 or kwargs["dilation_rate"] <= 1 # Scheme: (layer_class, data_shape, fuzz_dims, constructor_args, filter_fn) @@ -51,147 +50,211 @@ def _conv2d_filter(**kwargs): # constructor args, and prevents generation of contradictory combinations. # A True return value indicates a valid test. 
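# For illustration only, a minimal hypothetical entry in the scheme above # (not one of the cases generated below) could look like: #     (keras.layers.ReLU, (4,), (False,), #      collections.OrderedDict([("max_value", [None, 1.0])]), None) # i.e. a 1-D input of width 4, no fuzzed dims, two constructor variants, # and no filter function, so every generated combination is considered valid.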
_LAYERS_TO_TEST = [ - (keras.layers.Dense, (1,), (False,), collections.OrderedDict([ - ('units', [1])]), None), - (keras.layers.Activation, (2, 2), (True, True), collections.OrderedDict([ - ('activation', ['relu'])]), None), - (keras.layers.Dropout, (16,), (False,), collections.OrderedDict([ - ('rate', [0.25])]), None), - (keras.layers.BatchNormalization, (8, 8, 3), (True, True, False), - collections.OrderedDict([ - ('axis', [3]), - ('center', [True, False]), - ('scale', [True, False]) - ]), None), - (keras.layers.Conv1D, (8, 8), (False, False), collections.OrderedDict([ - ('filters', [1]), - ('kernel_size', [1, 3]), - ('strides', [1, 2]), - ('padding', ['valid', 'same']), - ('use_bias', [True]), - ('kernel_regularizer', ['l2']), - ('data_format', ['channels_last']) - ]), None), - (keras.layers.Conv2D, (8, 8, 3), (True, True, False), - collections.OrderedDict([ - ('filters', [1]), - ('kernel_size', [1, 3]), - ('strides', [1, 2]), - ('padding', ['valid', 'same']), - ('use_bias', [True, False]), - ('kernel_regularizer', ['l2']), - ('dilation_rate', [1, 2]), - ('data_format', ['channels_last']) - ]), _conv2d_filter), - (keras.layers.LSTM, (4, 4), (False, False), collections.OrderedDict([ - ('units', [1]), - ('kernel_regularizer', ['l2']), - ('dropout', [0, 0.5]), - ('stateful', [True, False]), - ('unroll', [True, False]), - ('return_sequences', [True, False]) - ]), None), + ( + keras.layers.Dense, + (1,), + (False,), + collections.OrderedDict([("units", [1])]), + None, + ), + ( + keras.layers.Activation, + (2, 2), + (True, True), + collections.OrderedDict([("activation", ["relu"])]), + None, + ), + ( + keras.layers.Dropout, + (16,), + (False,), + collections.OrderedDict([("rate", [0.25])]), + None, + ), + ( + keras.layers.BatchNormalization, + (8, 8, 3), + (True, True, False), + collections.OrderedDict( + [("axis", [3]), ("center", [True, False]), ("scale", [True, False])] + ), + None, + ), + ( + keras.layers.Conv1D, + (8, 8), + (False, False), + collections.OrderedDict( + [ + ("filters", [1]), + ("kernel_size", [1, 3]), + ("strides", [1, 2]), + ("padding", ["valid", "same"]), + ("use_bias", [True]), + ("kernel_regularizer", ["l2"]), + ("data_format", ["channels_last"]), + ] + ), + None, + ), + ( + keras.layers.Conv2D, + (8, 8, 3), + (True, True, False), + collections.OrderedDict( + [ + ("filters", [1]), + ("kernel_size", [1, 3]), + ("strides", [1, 2]), + ("padding", ["valid", "same"]), + ("use_bias", [True, False]), + ("kernel_regularizer", ["l2"]), + ("dilation_rate", [1, 2]), + ("data_format", ["channels_last"]), + ] + ), + _conv2d_filter, + ), + ( + keras.layers.LSTM, + (4, 4), + (False, False), + collections.OrderedDict( + [ + ("units", [1]), + ("kernel_regularizer", ["l2"]), + ("dropout", [0, 0.5]), + ("stateful", [True, False]), + ("unroll", [True, False]), + ("return_sequences", [True, False]), + ] + ), + None, + ), ] def _gather_test_cases(): - cases = [] - for layer_type, inp_shape, fuzz_dims, arg_dict, filter_fn in _LAYERS_TO_TEST: - arg_combinations = [[(k, i) for i in v] for k, v in arg_dict.items()] # pylint: disable=g-complex-comprehension - for arguments in itertools.product(*arg_combinations): - layer_kwargs = {k: v for k, v in arguments} - if filter_fn is not None and not filter_fn(**layer_kwargs): - continue - - name = '_{}_{}'.format(layer_type.__name__, - '_'.join('{}_{}'.format(*i) for i in arguments)) - cases.append((name, layer_type, inp_shape, fuzz_dims, layer_kwargs)) - return cases + cases = [] + for ( + layer_type, + inp_shape, + fuzz_dims, + arg_dict, + filter_fn, 
+ ) in _LAYERS_TO_TEST: + arg_combinations = [[(k, i) for i in v] for k, v in arg_dict.items()] + for arguments in itertools.product(*arg_combinations): + layer_kwargs = {k: v for k, v in arguments} + if filter_fn is not None and not filter_fn(**layer_kwargs): + continue + + name = "_{}_{}".format( + layer_type.__name__, + "_".join("{}_{}".format(*i) for i in arguments), + ) + cases.append((name, layer_type, inp_shape, fuzz_dims, layer_kwargs)) + return cases OUTPUT_TEST_CASES = _gather_test_cases() class CoreLayerIntegrationTest(test_combinations.TestCase): - """Test that layers and models produce the correct tensor types.""" - - # In v1 graph there are only symbolic tensors. - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @parameterized.named_parameters(*OUTPUT_TEST_CASES) - def test_layer_output_type(self, layer_to_test, input_shape, _, layer_kwargs): - layer = layer_to_test(**layer_kwargs) - - input_data = np.ones(shape=(2,) + input_shape, dtype=np.float32) - layer_result = layer(input_data) - - inp = keras.layers.Input(shape=input_shape, batch_size=2) - model = keras.models.Model(inp, layer_to_test(**layer_kwargs)(inp)) - model_result = model(input_data) - - for x in [layer_result, model_result]: - if not isinstance(x, tf.Tensor): - raise ValueError('Tensor or EagerTensor expected, got type {}' - .format(type(x))) - - if isinstance(x, tf.__internal__.EagerTensor) != tf.executing_eagerly(): - expected_type = (tf.__internal__.EagerTensor if tf.executing_eagerly() - else tf.Tensor) - raise ValueError('Expected type {}, got type {}' - .format(expected_type, type(x))) - - def _run_fit_eval_predict(self, layer_to_test, input_shape, data_shape, - layer_kwargs): - batch_size = 2 - run_eagerly = test_utils.should_run_eagerly() - - def map_fn(_): - x = keras.backend.random_uniform(shape=data_shape) - y = keras.backend.random_uniform(shape=(1,)) - return x, y - - dataset = tf.data.Dataset.range(4).map(map_fn).batch(batch_size) - - inp = keras.layers.Input(shape=input_shape, batch_size=batch_size) - layer = layer_to_test(**layer_kwargs)(inp) - - # Condense the output down to a single scalar. 
- layer = keras.layers.Flatten()(layer) - layer = keras.layers.Lambda( - lambda x: tf.reduce_mean(x, keepdims=True))(layer) - layer = keras.layers.Dense(1, activation=None)(layer) - model = keras.models.Model(inp, layer) - - model.compile(loss='mse', optimizer='sgd', run_eagerly=run_eagerly) - model.fit(dataset, verbose=2, epochs=2) - - model.compile(loss='mse', optimizer='sgd', run_eagerly=run_eagerly) - model.fit(dataset.repeat(2), verbose=2, epochs=2, steps_per_epoch=2) - - eval_dataset = tf.data.Dataset.range(4).map(map_fn).batch(batch_size) - model.evaluate(eval_dataset, verbose=2) - - def pred_map_fn(_): - return keras.backend.random_uniform(shape=data_shape) - - pred_dataset = tf.data.Dataset.range(4) - pred_dataset = pred_dataset.map(pred_map_fn).batch(batch_size) - model.predict(pred_dataset, verbose=2) - - @test_combinations.run_all_keras_modes(always_skip_v1=False) - @parameterized.named_parameters(*OUTPUT_TEST_CASES) - def test_model_loops(self, layer_to_test, input_shape, fuzz_dims, - layer_kwargs): - self._run_fit_eval_predict(layer_to_test, input_shape, - input_shape, layer_kwargs) - - if any(fuzz_dims): - fuzzed_shape = [] - for dim, should_fuzz in zip(input_shape, fuzz_dims): - fuzzed_shape.append(None if should_fuzz else dim) - - self._run_fit_eval_predict(layer_to_test, fuzzed_shape, - input_shape, layer_kwargs) - - -if __name__ == '__main__': - tf.test.main() + """Test that layers and models produce the correct tensor types.""" + + # In v1 graph there are only symbolic tensors. + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @parameterized.named_parameters(*OUTPUT_TEST_CASES) + def test_layer_output_type( + self, layer_to_test, input_shape, _, layer_kwargs + ): + layer = layer_to_test(**layer_kwargs) + + input_data = np.ones(shape=(2,) + input_shape, dtype=np.float32) + layer_result = layer(input_data) + + inp = keras.layers.Input(shape=input_shape, batch_size=2) + model = keras.models.Model(inp, layer_to_test(**layer_kwargs)(inp)) + model_result = model(input_data) + + for x in [layer_result, model_result]: + if not isinstance(x, tf.Tensor): + raise ValueError( + f"Tensor or EagerTensor expected, got type {type(x)}" + ) + + if ( + isinstance(x, tf.__internal__.EagerTensor) + != tf.executing_eagerly() + ): + expected_type = ( + tf.__internal__.EagerTensor + if tf.executing_eagerly() + else tf.Tensor + ) + raise ValueError( + f"Expected type {expected_type}, got type {type(x)}" + ) + + def _run_fit_eval_predict( + self, layer_to_test, input_shape, data_shape, layer_kwargs + ): + batch_size = 2 + run_eagerly = test_utils.should_run_eagerly() + + def map_fn(_): + x = keras.backend.random_uniform(shape=data_shape) + y = keras.backend.random_uniform(shape=(1,)) + return x, y + + dataset = tf.data.Dataset.range(4).map(map_fn).batch(batch_size) + + inp = keras.layers.Input(shape=input_shape, batch_size=batch_size) + layer = layer_to_test(**layer_kwargs)(inp) + + # Condense the output down to a single scalar. 
+ layer = keras.layers.Flatten()(layer) + layer = keras.layers.Lambda(lambda x: tf.reduce_mean(x, keepdims=True))( + layer + ) + layer = keras.layers.Dense(1, activation=None)(layer) + model = keras.models.Model(inp, layer) + + model.compile(loss="mse", optimizer="sgd", run_eagerly=run_eagerly) + model.fit(dataset, verbose=2, epochs=2) + + model.compile(loss="mse", optimizer="sgd", run_eagerly=run_eagerly) + model.fit(dataset.repeat(2), verbose=2, epochs=2, steps_per_epoch=2) + + eval_dataset = tf.data.Dataset.range(4).map(map_fn).batch(batch_size) + model.evaluate(eval_dataset, verbose=2) + + def pred_map_fn(_): + return keras.backend.random_uniform(shape=data_shape) + + pred_dataset = tf.data.Dataset.range(4) + pred_dataset = pred_dataset.map(pred_map_fn).batch(batch_size) + model.predict(pred_dataset, verbose=2) + + @test_combinations.run_all_keras_modes(always_skip_v1=False) + @parameterized.named_parameters(*OUTPUT_TEST_CASES) + def test_model_loops( + self, layer_to_test, input_shape, fuzz_dims, layer_kwargs + ): + self._run_fit_eval_predict( + layer_to_test, input_shape, input_shape, layer_kwargs + ) + + if any(fuzz_dims): + fuzzed_shape = [] + for dim, should_fuzz in zip(input_shape, fuzz_dims): + fuzzed_shape.append(None if should_fuzz else dim) + + self._run_fit_eval_predict( + layer_to_test, fuzzed_shape, input_shape, layer_kwargs + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/training_test.py b/keras/engine/training_test.py index 3227b076adb2..579367c3c24d 100644 --- a/keras/engine/training_test.py +++ b/keras/engine/training_test.py @@ -20,7 +20,10 @@ import sys import tempfile +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras import backend from keras import layers as layers_module @@ -32,4267 +35,5096 @@ from keras.engine import training as training_module from keras.engine import training_utils_v1 from keras.layers.preprocessing import string_lookup -from keras.optimizers import optimizer_v2 -from keras.optimizers.optimizer_experimental import sgd as sgd_experimental +from keras.mixed_precision import policy +from keras.optimizers import legacy as optimizer_legacy +from keras.optimizers import rmsprop +from keras.optimizers import sgd as sgd_experimental from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import data_utils from keras.utils import io_utils from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training.rmsprop import RMSPropOptimizer +from tensorflow.python.training.rmsprop import ( + RMSPropOptimizer, +) try: - import scipy.sparse as scipy_sparse # pylint: disable=g-import-not-at-top + import scipy.sparse as scipy_sparse except ImportError: - scipy_sparse = None + scipy_sparse = None class TrainingTest(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_model_instrumentation(self): - layers = [ - layers_module.Dense(10, dtype=np.float64), - layers_module.Dense(10, dtype=np.float64) - ] - model = test_utils.get_model_from_layers(layers, input_shape=(1,)) - - self.assertTrue(model._instrumented_keras_api) - 
self.assertTrue(model._instrumented_keras_model_class) - self.assertFalse(model._instrumented_keras_layer_class) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_fit_training_arg(self): - - class ReturnTraining(layers_module.Layer): - - def call(self, inputs, training): - if training: - return inputs + tf.constant([100], 'float32') - else: - return inputs + tf.constant([0], 'float32') - - model = sequential.Sequential([ReturnTraining()]) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - hist = model.fit(x=np.array([0.]), y=np.array([0.])) - self.assertAllClose(hist.history['loss'][0], 10000) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_fit_on_empty(self): - model = sequential.Sequential([layers_module.Dense(1)]) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - with self.assertRaisesRegex(ValueError, - 'Unexpected result of `train_function`.*'): - model.fit(x=np.array([]), y=np.array([])) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_compile_fit_with_jit_compile(self): - # Test with jit_compile = True - model = sequential.Sequential([layers_module.Dense(1)]) - model.compile( - 'sgd', loss='mse', run_eagerly=False, jit_compile=True) - x, y = np.ones((10, 1)), np.ones((10, 1)) - model.fit(x, y, epochs=2) - # Test fcompile fit for a RNN model - model = sequential.Sequential() - model.add( - layers_module.TimeDistributed( - layers_module.Embedding(5, 6, mask_zero=True), - input_shape=(None, None))) # N by t_1 by t_2 by 6 - model.add( - layers_module.TimeDistributed( - layers_module.SimpleRNN(7, return_sequences=True))) - model.add( - layers_module.TimeDistributed( - layers_module.SimpleRNN(8, return_sequences=False))) - model.add(layers_module.SimpleRNN(1, return_sequences=False)) - model.compile(optimizer='sgd', loss='mse', jit_compile=True) - model_input = np.random.randint( - low=1, high=5, size=(10, 3, 4), dtype='int32') - for i in range(4): - model_input[i, i:, i:] = 0 - model.fit(model_input, np.random.random((10, 1)), epochs=1, batch_size=10) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_compile_fit_evaluate_predict_with_mirrored_strategy(self): - # Test with jit_compile = True - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = sequential.Sequential([layers_module.Dense(1)]) - model.compile('sgd', loss='mse', run_eagerly=False, jit_compile=True) - x, y = np.ones((10, 1)), np.ones((10, 1)) - model.fit(x, y, epochs=2) - model.evaluate(x, y) - model.predict(x) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_verify_xla_compile_with_jit_compile(self): - vocab_data = ['earth', 'wind', 'and', 'fire'] - input_array = np.array([['earth', 'wind', 'and', 'fire'], - ['fire', 'and', 'earth', 'michigan']]) - expected_output = np.array([[1, 2, 3, 4], [4, 3, 1, 0]]) - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup(vocabulary=vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - model.compile('sgd', loss='mse', run_eagerly=False, jit_compile=True) - # Added a string op unsupported by XLA compiler to make sure that an - # error is thrown, This ensures that the graph is indeed being compiled - # using XLA - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'Graph execution 
error'): - model.fit(input_array, expected_output, epochs=1) - model.predict(input_array) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_jit_compile_for_compile_evaluate_predict(self): - # Test with jit_compile = True for model.compile(), model.evaluate(), - # model.predict() - model = sequential.Sequential([layers_module.Dense(1)]) - self.assertIsNone(model._jit_compile) - model.compile('sgd', loss='mse', run_eagerly=False, jit_compile=True) - self.assertTrue(model._jit_compile) - x, y = np.ones((10, 1)), np.ones((10, 1)) - model.fit(x, y, epochs=2) - model.evaluate(x, y) - model.predict(x) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_fit_without_loss_at_compile(self): - model = sequential.Sequential([layers_module.Dense(1)]) - model.compile('sgd', run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 1)), np.ones((10, 1)) - with self.assertRaisesRegex(ValueError, 'No loss found..*'): - model.fit(x, y, epochs=2) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_fit_without_loss_at_compile_but_with_add_loss(self): - - class MyModel(sequential.Sequential): - - def call(self, x): - self.add_loss(tf.reduce_sum(x)) - return x - - model = MyModel([layers_module.Dense(1)]) - model.compile('sgd', run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 1)), np.ones((10, 1)) - model.fit(x, y, epochs=2) - - @test_combinations.run_all_keras_modes - def test_run_eagerly_setting(self): - model = sequential.Sequential([layers_module.Dense(1)]) - run_eagerly = test_utils.should_run_eagerly() - model.compile('sgd', 'mse', run_eagerly=run_eagerly) - self.assertEqual(model.run_eagerly, run_eagerly) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @parameterized.named_parameters( - ('train_on_batch', 'train_on_batch'), - ('test_on_batch', 'test_on_batch'), - ('predict_on_batch', 'predict_on_batch'), - ('fit', 'fit'), - ('evaluate', 'evaluate'), - ('predict', 'predict'), - ) - def test_disallow_methods_inside_tf_function(self, method_name): - model = sequential.Sequential([layers_module.Dense(1)]) - run_eagerly = test_utils.should_run_eagerly() - model.compile('sgd', 'mse', run_eagerly=run_eagerly) - - @tf.function - def my_fn(): - getattr(model, method_name)(1) - - error_msg = 'inside a `tf.function`' - with self.assertRaisesRegex(RuntimeError, error_msg): - my_fn() - - @test_combinations.run_all_keras_modes - def test_fit_and_validate_learning_phase(self): - - class ReturnTraining(layers_module.Layer): - - def call(self, inputs): - return backend.in_train_phase(lambda: tf.ones_like(inputs), - lambda: tf.zeros_like(inputs)) - - model = sequential.Sequential([ReturnTraining(input_shape=(2,))]) - model.compile( - 'sgd', - loss='mae', - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.ones((40, 2), dtype=np.float32) - targets = np.ones((40, 1), dtype=np.float32) - - # Test correctness with `steps_per_epoch`. - train_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - val_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - history = model.fit( - train_dataset, epochs=2, verbose=1, validation_data=val_dataset) - - # The training loss should be 0.0 - self.assertAllClose(history.history['loss'][0], 0.0) - # The validation loss should be 1.0. 
- self.assertAllClose(history.history['val_loss'][0], 1.0) - - @test_combinations.run_all_keras_modes( - always_skip_v1=True) - def test_warn_on_evaluate(self): - i = layers_module.Input((1,)) - x = np.ones((100, 1)) - y = np.ones((100, 1)) - sample_weight = np.ones((100,)) - model = training_module.Model(i, i) - model.compile(loss='mse', metrics=['mse']) - - logging.set_verbosity(2) - with self.assertLogs(level=2) as logs: - model.evaluate(x, y, sample_weight=sample_weight) - self.assertTrue( - any('`evaluate()` received a value for `sample_weight`' in log - for log in logs.output)) - - @test_combinations.run_all_keras_modes( - always_skip_v1=True) - def test_sample_weight_warning_disable(self): - i = layers_module.Input((1,)) - x = np.ones((100, 1)) - y = np.ones((100, 1)) - sample_weight = np.ones((100,)) - model = training_module.Model(i, i) - model.compile(loss='mse', metrics=['mse'], weighted_metrics=[]) - - logging.set_verbosity(2) - with self.assertLogs(level=2) as logs: - model.evaluate(x, y, sample_weight=sample_weight) - self.assertFalse( - any('`evaluate()` received a value for `sample_weight`' in log - for log in logs.output)) - - @test_combinations.run_all_keras_modes( - always_skip_v1=True) - def test_warn_on_evaluate_with_tf_dataset(self): - i = layers_module.Input((1,)) - - x = tf.ones((100, 1), tf.float32) - y = tf.ones((100, 1), tf.float32) - sample_weight = tf.ones((100,), dtype=tf.float32) - val_dataset = tf.data.Dataset.from_tensor_slices( - (x, y, sample_weight)).batch(10) - model = training_module.Model(i, i) - model.compile(loss='mse', metrics=['mse']) - - logging.set_verbosity(2) - with self.assertLogs(level=2) as logs: - model.evaluate(val_dataset) - self.assertTrue( - any('`evaluate()` received a value for `sample_weight`' in log - for log in logs.output)) - - @test_combinations.run_all_keras_modes - def test_fit_and_validate_training_arg(self): - - class ReturnTraining(layers_module.Layer): - - def call(self, inputs, training=None): - return backend.in_train_phase( - lambda: tf.ones_like(inputs), - lambda: tf.zeros_like(inputs), - training=training) - - model = sequential.Sequential([ReturnTraining(input_shape=(2,))]) - model.compile( - 'sgd', - loss='mae', - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.ones((40, 2), dtype=np.float32) - targets = np.ones((40, 1), dtype=np.float32) - - # Test correctness with `steps_per_epoch`. - train_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - val_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - history = model.fit( - train_dataset, epochs=2, verbose=1, validation_data=val_dataset) - - # The training loss should be 0.0 - self.assertAllClose(history.history['loss'][0], 0.0) - # The validation loss should be 1.0. 
- self.assertAllClose(history.history['val_loss'][0], 1.0) - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_target_dtype_matches_output(self): - - def loss_fn(labels, preds): - self.assertEqual(labels.dtype, preds.dtype) - return labels - preds - - layers = [ - layers_module.Dense(10, dtype=np.float64), - layers_module.Dense(10, dtype=np.float64) - ] - model = test_utils.get_model_from_layers(layers, input_shape=(1,)) - inputs = np.ones(shape=(10, 1), dtype=np.float64) - targets = np.ones(shape=(10, 1), dtype=np.float64) - model.compile( - 'sgd', - loss=loss_fn, - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(inputs, targets) - model.test_on_batch(inputs, targets) - self.assertEqual(model.predict(inputs).dtype, np.float64) - - @test_combinations.run_all_keras_modes - def test_fit_and_validate_nested_training_arg(self): - - class NestedReturnTraining(layers_module.Layer): - - def call(self, inputs, training=None): - return backend.in_train_phase( - lambda: tf.ones_like(inputs), - lambda: tf.zeros_like(inputs), - training=training) - - class ReturnTraining(layers_module.Layer): - - def __init__(self, input_shape=None, **kwargs): - super().__init__(input_shape=input_shape, **kwargs) - self._nested_layer = None - - def build(self, input_shape): - self._nested_layer = NestedReturnTraining() - self.built = True - - def call(self, inputs): - return self._nested_layer(inputs) - - model = sequential.Sequential([ReturnTraining(input_shape=(2,))]) - model.compile( - 'sgd', - loss='mae', - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.ones((40, 2), dtype=np.float32) - targets = np.ones((40, 1), dtype=np.float32) - - # Test correctness with `steps_per_epoch`. - train_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - val_dataset = tf.data.Dataset.from_tensor_slices( - (inputs, targets)).batch(10) - history = model.fit( - train_dataset, epochs=2, verbose=1, validation_data=val_dataset) - - # The training loss should be 0.0 - self.assertAllClose(history.history['loss'][0], 0.0) - # The validation loss should be 1.0. 
- self.assertAllClose(history.history['val_loss'][0], 1.0) - - @test_combinations.run_with_all_model_types(exclude_models='sequential') - @test_combinations.run_all_keras_modes - def test_fit_on_arrays(self): - input_a = layers_module.Input(shape=(3,), name='input_a') - input_b = layers_module.Input(shape=(3,), name='input_b') - - dense = layers_module.Dense(4, name='dense') - dropout = layers_module.Dropout(0.5, name='dropout') - branch_a = [input_a, dense] - branch_b = [input_b, dense, dropout] - - model = test_utils.get_multi_io_model(branch_a, branch_b) - - optimizer = RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - loss_weights = [1., 0.5] - model.compile( - optimizer, - loss, - metrics=[metrics_module.CategoricalAccuracy(), 'mae'], - loss_weights=loss_weights, - run_eagerly=test_utils.should_run_eagerly()) - - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - - output_d_np = np.random.random((10, 4)) - output_e_np = np.random.random((10, 4)) - - # Test fit at different verbosity - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - batch_size=5, - verbose=0) - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - batch_size=5, - verbose=1) - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=2, - batch_size=5, - verbose=2) - model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) - - # Test with validation data - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - validation_data=([input_a_np, input_b_np], [output_d_np, - output_e_np]), - epochs=1, - batch_size=5, - verbose=0) - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - validation_data=([input_a_np, input_b_np], [output_d_np, - output_e_np]), - epochs=2, - batch_size=5, - verbose=1) - model.fit([input_a_np, input_b_np], [output_d_np, output_e_np], - validation_data=([input_a_np, - input_b_np], [output_d_np, output_e_np]), - epochs=2, - batch_size=5, - verbose=2) - model.fit([input_a_np, input_b_np], [output_d_np, output_e_np], - validation_data=[[input_a_np, input_b_np], - [output_d_np, output_e_np]], - epochs=2, - batch_size=5, - verbose=2) - # Test with validation split - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=2, - batch_size=5, - verbose=0, - validation_split=0.2) - - if test_utils.get_model_type() == 'functional': - # Test with dictionary inputs - model.fit( - { - 'input_a': input_a_np, - 'input_b': input_b_np - }, { - 'dense': output_d_np, - 'dropout': output_e_np - }, - epochs=1, - batch_size=5, - verbose=0) - model.fit( - { - 'input_a': input_a_np, - 'input_b': input_b_np - }, { - 'dense': output_d_np, - 'dropout': output_e_np - }, - epochs=1, - batch_size=5, - verbose=1) - model.fit( - { - 'input_a': input_a_np, - 'input_b': input_b_np - }, { - 'dense': output_d_np, - 'dropout': output_e_np - }, - validation_data=({ - 'input_a': input_a_np, - 'input_b': input_b_np - }, { - 'dense': output_d_np, - 'dropout': output_e_np - }), - epochs=1, - batch_size=5, - verbose=0) - model.train_on_batch({ - 'input_a': input_a_np, - 'input_b': input_b_np - }, { - 'dense': output_d_np, - 'dropout': output_e_np - }) - - # Test with lists for loss, metrics - loss = ['mae', 'mse'] - model.compile( - optimizer, - loss, - metrics=[metrics_module.CategoricalAccuracy(), 'mae'], - run_eagerly=test_utils.should_run_eagerly()) - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - batch_size=5, - verbose=0) - 
- # Test with dictionaries for loss, metrics, loss weights - if test_utils.get_model_type() == 'functional': - loss = {'dense': 'mse', 'dropout': 'mae'} - loss_weights = {'dense': 1., 'dropout': 0.5} - metrics = { - 'dense': 'mse', - 'dropout': metrics_module.CategoricalAccuracy() - } - model.compile( - optimizer, - loss, - metrics=metrics, - loss_weights=loss_weights, - run_eagerly=test_utils.should_run_eagerly()) - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - batch_size=5, - verbose=0) - - # Build single-input model - x = layers_module.Input(shape=(3,), name='input_a') - y = layers_module.Dense(4)(x) - model = training_module.Model(x, y) - model.compile( - optimizer, - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - # This will work - model.fit([input_a_np], output_d_np, epochs=1) - - # Test model on a list of floats - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 4)) - - # Test execution on inputs that are lists of scalars. - # TF2 and TF1 have slightly different semantics: - if tf.executing_eagerly(): - # In TF2 to avoid any ambiguity when there are nested lists - # the entire input gets converted to a - # single numpy array (& it only works in the case of a single io model) - model.fit(np.ndarray.tolist(input_a_np), + @test_combinations.run_all_keras_modes + @test_combinations.run_with_all_model_types + def test_model_instrumentation(self): + layers = [ + layers_module.Dense(10, dtype=np.float64), + layers_module.Dense(10, dtype=np.float64), + ] + model = test_utils.get_model_from_layers(layers, input_shape=(1,)) + + self.assertTrue(model._instrumented_keras_api) + self.assertTrue(model._instrumented_keras_model_class) + self.assertFalse(model._instrumented_keras_layer_class) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_fit_training_arg(self): + class ReturnTraining(layers_module.Layer): + def call(self, inputs, training): + if training: + return inputs + tf.constant([100], "float32") + else: + return inputs + tf.constant([0], "float32") + + model = sequential.Sequential([ReturnTraining()]) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + hist = model.fit(x=np.array([0.0]), y=np.array([0.0])) + self.assertAllClose(hist.history["loss"][0], 10000) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_fit_on_empty(self): + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + with self.assertRaisesRegex( + ValueError, "Expected input data to be non-empty." 
+ ): + model.fit(x=np.array([]), y=np.array([])) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_compile_fit_with_jit_compile(self): + # Test with jit_compile = True + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile("sgd", loss="mse", run_eagerly=False, jit_compile=True) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + # Test compile/fit for an RNN model + model = sequential.Sequential() + model.add( + layers_module.TimeDistributed( + layers_module.Embedding(5, 6, mask_zero=True), + input_shape=(None, None), + ) + ) # N by t_1 by t_2 by 6 + model.add( + layers_module.TimeDistributed( + layers_module.SimpleRNN(7, return_sequences=True) + ) + ) + model.add( + layers_module.TimeDistributed( + layers_module.SimpleRNN(8, return_sequences=False) + ) + ) + model.add(layers_module.SimpleRNN(1, return_sequences=False)) + model.compile(optimizer="sgd", loss="mse", jit_compile=True) + model_input = np.random.randint( + low=1, high=5, size=(10, 3, 4), dtype="int32" + ) + for i in range(4): + model_input[i, i:, i:] = 0 + model.fit( + model_input, np.random.random((10, 1)), epochs=1, batch_size=10 + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_compile_fit_evaluate_predict_with_mirrored_strategy(self): + # Test with jit_compile = True + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile("sgd", loss="mse", run_eagerly=False, jit_compile=True) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + model.evaluate(x, y) + model.predict(x) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_distribution_reduction_method_sum_default_train_step(self): + + strategy = tf.distribute.MirroredStrategy( + ["/cpu:1", "/cpu:2", "/cpu:3", "/cpu:4"] + ) + BATCH_SIZE = 10 + + # A model that always outputs `1`: + with strategy.scope(): + inputs = layers_module.Input(shape=(1,), name="my_input") + outputs = layers_module.Dense( + units=1, kernel_initializer="zeros", bias_initializer="ones" + )(inputs) + model = training_module.Model(inputs, outputs) + + model.trainable = False + model.compile(optimizer="sgd", loss="mean_absolute_error") + + # Data points are always equal to `2`: + x, y = 2 * np.ones((40, 1)), 2 * np.ones((40, 1)) + + # For every output x_i = 1, every target y_i = 2, + # loss_i = |1-2| = 1; and + # loss_total = sum([1, 1, ..., 1]) / BATCH_SIZE = 1.0 + history = model.fit(x, y, epochs=1, batch_size=BATCH_SIZE) + self.assertAllClose(history.history["loss"][-1], 1.0) + + eval_output = model.evaluate(x, y, batch_size=BATCH_SIZE) + self.assertAllClose(eval_output, 1.0) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_distribution_reduction_method_sum_custom_train_step(self): + + strategy = tf.distribute.MirroredStrategy( + ["/cpu:1", "/cpu:2", "/cpu:3", "/cpu:4"] + ) + BATCH_SIZE = 10 + + class MyModel(training_module.Model): + @staticmethod + def reduce_loss(loss_value, global_batch_size): + REDUCTION_AXES = range(1, backend.ndim(loss_value)) + loss_value = tf.reduce_mean(loss_value, axis=REDUCTION_AXES) + return tf.nn.compute_average_loss( + loss_value, global_batch_size=global_batch_size + ) + + def train_step(self, data): + loss_value = tf.ones_like(data[0]) + return { + "loss": MyModel.reduce_loss( + loss_value, global_batch_size=BATCH_SIZE + ) + } + + def test_step(self, data): + loss_value = tf.ones_like(data[0]) + return { + "metric":
MyModel.reduce_loss( + loss_value, global_batch_size=BATCH_SIZE + ) + } + + with strategy.scope(): + inputs = layers_module.Input(shape=(1,), name="my_input") + outputs = layers_module.Dense(1)(inputs) + model = MyModel(inputs, outputs) + + model.compile() + + x, y = np.ones((40, 1)), np.ones((40, 1)) + history = model.fit(x, y, epochs=2, batch_size=BATCH_SIZE) + self.assertAllClose(history.history["loss"][-1], 1.0) + + eval_output = model.evaluate(x, y, batch_size=BATCH_SIZE) + self.assertAllClose(eval_output, 1.0) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_verify_xla_compile_with_jit_compile(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = np.array([[1, 2, 3, 4], [4, 3, 1, 0]]) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + model.compile( + "sgd", loss="mse", run_eagerly=False, jit_compile=True + ) + # Added a string op unsupported by the XLA compiler to make sure that an + # error is thrown. This ensures that the graph is indeed being + # compiled using XLA + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "Graph execution error" + ): + model.fit(input_array, expected_output, epochs=1) + model.predict(input_array) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_jit_compile_for_compile_evaluate_predict(self): + # Test with jit_compile = True for model.compile(), model.evaluate(), + # model.predict() + model = sequential.Sequential([layers_module.Dense(1)]) + self.assertIsNone(model._jit_compile) + model.compile("sgd", loss="mse", run_eagerly=False, jit_compile=True) + self.assertTrue(model._jit_compile) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + model.evaluate(x, y) + model.predict(x) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_jit_compile_true_for_evaluate_predict_but_false_for_compile(self): + # Test with jit_compile = True for model.compile(), model.evaluate(), + # model.predict() + model = sequential.Sequential([layers_module.Dense(1)]) + self.assertIsNone(model._jit_compile) + self.assertIsNone(model.jit_compile) + model.compile("sgd", loss="mse") + model.jit_compile = True + self.assertTrue(model._jit_compile) + self.assertTrue(model.jit_compile) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + model.evaluate(x, y) + model.predict(x) + self.assertTrue(model._jit_compile) + self.assertTrue(model.jit_compile) + model.compile("sgd", loss="mse", jit_compile=False) + self.assertFalse(model._jit_compile) + self.assertFalse(model.jit_compile) + model.compile("sgd", loss="mse", jit_compile=True) + self.assertTrue(model._jit_compile) + self.assertTrue(model.jit_compile) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_predict_xla_compile_with_jit_compile_setter_false_then_true(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + # Added a string op unsupported by the XLA compiler to make sure that an + #
error is thrown, This ensures that the graph is indeed being + # compiled using XLA + layer = string_lookup.StringLookup(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + # Compiled without jit_compile + model.predict(input_array) + model.jit_compile = True + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "Graph execution error" + ): + model.predict(input_array) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_fit_without_loss_at_compile(self): + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile("sgd", run_eagerly=test_utils.should_run_eagerly()) + x, y = np.ones((10, 1)), np.ones((10, 1)) + with self.assertRaisesRegex(ValueError, "No loss found..*"): + model.fit(x, y, epochs=2) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_fit_without_loss_at_compile_but_with_add_loss(self): + class MyModel(sequential.Sequential): + def call(self, x): + self.add_loss(tf.reduce_sum(x)) + return x + + model = MyModel([layers_module.Dense(1)]) + model.compile("sgd", run_eagerly=test_utils.should_run_eagerly()) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + + @test_combinations.run_all_keras_modes + def test_run_eagerly_setting(self): + model = sequential.Sequential([layers_module.Dense(1)]) + run_eagerly = test_utils.should_run_eagerly() + model.compile("sgd", "mse", run_eagerly=run_eagerly) + self.assertEqual(model.run_eagerly, run_eagerly) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @parameterized.named_parameters( + ("train_on_batch", "train_on_batch"), + ("test_on_batch", "test_on_batch"), + ("predict_on_batch", "predict_on_batch"), + ("fit", "fit"), + ("evaluate", "evaluate"), + ("predict", "predict"), + ) + def test_disallow_methods_inside_tf_function(self, method_name): + model = sequential.Sequential([layers_module.Dense(1)]) + run_eagerly = test_utils.should_run_eagerly() + model.compile("sgd", "mse", run_eagerly=run_eagerly) + + @tf.function + def my_fn(): + getattr(model, method_name)(1) + + error_msg = "inside a `tf.function`" + with self.assertRaisesRegex(RuntimeError, error_msg): + my_fn() + + @test_combinations.run_all_keras_modes + def test_fit_and_validate_learning_phase(self): + class ReturnTraining(layers_module.Layer): + def call(self, inputs): + return backend.in_train_phase( + lambda: tf.ones_like(inputs), lambda: tf.zeros_like(inputs) + ) + + model = sequential.Sequential([ReturnTraining(input_shape=(2,))]) + model.compile( + "sgd", loss="mae", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs = np.ones((40, 2), dtype=np.float32) + targets = np.ones((40, 1), dtype=np.float32) + + # Test correctness with `steps_per_epoch`. + train_dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets) + ).batch(10) + val_dataset = tf.data.Dataset.from_tensor_slices( + (inputs, targets) + ).batch(10) + history = model.fit( + train_dataset, epochs=2, verbose=1, validation_data=val_dataset + ) + + # The training loss should be 0.0 + self.assertAllClose(history.history["loss"][0], 0.0) + # The validation loss should be 1.0. 
+        self.assertAllClose(history.history["val_loss"][0], 1.0)
+
+    @test_combinations.run_all_keras_modes(always_skip_v1=True)
+    def test_warn_on_evaluate(self):
+        i = layers_module.Input((1,))
+        x = np.ones((100, 1))
+        y = np.ones((100, 1))
+        sample_weight = np.ones((100,))
+        model = training_module.Model(i, i)
+        model.compile(loss="mse", metrics=["mse"])
+
+        logging.set_verbosity(2)
+        with self.assertLogs(level=2) as logs:
+            model.evaluate(x, y, sample_weight=sample_weight)
+        self.assertTrue(
+            any(
+                "`evaluate()` received a value for `sample_weight`" in log
+                for log in logs.output
+            )
+        )
+
+    @test_combinations.run_all_keras_modes(always_skip_v1=True)
+    def test_sample_weight_warning_disable(self):
+        i = layers_module.Input((1,))
+        x = np.ones((100, 1))
+        y = np.ones((100, 1))
+        sample_weight = np.ones((100,))
+        model = training_module.Model(i, i)
+        model.compile(loss="mse", metrics=["mse"], weighted_metrics=[])
+
+        logging.set_verbosity(2)
+        with self.assertLogs(level=2) as logs:
+            model.evaluate(x, y, sample_weight=sample_weight)
+        self.assertFalse(
+            any(
+                "`evaluate()` received a value for `sample_weight`" in log
+                for log in logs.output
+            )
+        )
+
+    @test_combinations.run_all_keras_modes(always_skip_v1=True)
+    def test_warn_on_evaluate_with_tf_dataset(self):
+        i = layers_module.Input((1,))
+
+        x = tf.ones((100, 1), tf.float32)
+        y = tf.ones((100, 1), tf.float32)
+        sample_weight = tf.ones((100,), dtype=tf.float32)
+        val_dataset = tf.data.Dataset.from_tensor_slices(
+            (x, y, sample_weight)
+        ).batch(10)
+        model = training_module.Model(i, i)
+        model.compile(loss="mse", metrics=["mse"])
+
+        logging.set_verbosity(2)
+        with self.assertLogs(level=2) as logs:
+            model.evaluate(val_dataset)
+        self.assertTrue(
+            any(
+                "`evaluate()` received a value for `sample_weight`" in log
+                for log in logs.output
+            )
+        )
+
+    @test_combinations.run_all_keras_modes
+    def test_fit_and_validate_training_arg(self):
+        class ReturnTraining(layers_module.Layer):
+            def call(self, inputs, training=None):
+                return backend.in_train_phase(
+                    lambda: tf.ones_like(inputs),
+                    lambda: tf.zeros_like(inputs),
+                    training=training,
+                )
+
+        model = sequential.Sequential([ReturnTraining(input_shape=(2,))])
+        model.compile(
+            "sgd", loss="mae", run_eagerly=test_utils.should_run_eagerly()
+        )
+
+        inputs = np.ones((40, 2), dtype=np.float32)
+        targets = np.ones((40, 1), dtype=np.float32)
+
+        # Test correctness with `steps_per_epoch`.
+        train_dataset = tf.data.Dataset.from_tensor_slices(
+            (inputs, targets)
+        ).batch(10)
+        val_dataset = tf.data.Dataset.from_tensor_slices(
+            (inputs, targets)
+        ).batch(10)
+        history = model.fit(
+            train_dataset, epochs=2, verbose=1, validation_data=val_dataset
+        )
+
+        # The training loss should be 0.0
+        self.assertAllClose(history.history["loss"][0], 0.0)
+        # The validation loss should be 1.0.
+        self.assertAllClose(history.history["val_loss"][0], 1.0)
+
+    @test_combinations.run_all_keras_modes
+    @test_combinations.run_with_all_model_types
+    def test_target_dtype_matches_output(self):
+        def loss_fn(labels, preds):
+            self.assertEqual(labels.dtype, preds.dtype)
+            return labels - preds
+
+        layers = [
+            layers_module.Dense(10, dtype=np.float64),
+            layers_module.Dense(10, dtype=np.float64),
+        ]
+        model = test_utils.get_model_from_layers(layers, input_shape=(1,))
+        inputs = np.ones(shape=(10, 1), dtype=np.float64)
+        targets = np.ones(shape=(10, 1), dtype=np.float64)
+        model.compile(
+            "sgd", loss=loss_fn, run_eagerly=test_utils.should_run_eagerly()
+        )
+        model.train_on_batch(inputs, targets)
+        model.test_on_batch(inputs, targets)
+        self.assertEqual(model.predict(inputs).dtype, np.float64)
+
+    @test_combinations.run_all_keras_modes
+    def test_fit_and_validate_nested_training_arg(self):
+        class NestedReturnTraining(layers_module.Layer):
+            def call(self, inputs, training=None):
+                return backend.in_train_phase(
+                    lambda: tf.ones_like(inputs),
+                    lambda: tf.zeros_like(inputs),
+                    training=training,
+                )
+
+        class ReturnTraining(layers_module.Layer):
+            def __init__(self, input_shape=None, **kwargs):
+                super().__init__(input_shape=input_shape, **kwargs)
+                self._nested_layer = None
+
+            def build(self, input_shape):
+                self._nested_layer = NestedReturnTraining()
+                self.built = True
+
+            def call(self, inputs):
+                return self._nested_layer(inputs)
+
+        model = sequential.Sequential([ReturnTraining(input_shape=(2,))])
+        model.compile(
+            "sgd", loss="mae", run_eagerly=test_utils.should_run_eagerly()
+        )
+
+        inputs = np.ones((40, 2), dtype=np.float32)
+        targets = np.ones((40, 1), dtype=np.float32)
+
+        # Test correctness with `steps_per_epoch`.
+        train_dataset = tf.data.Dataset.from_tensor_slices(
+            (inputs, targets)
+        ).batch(10)
+        val_dataset = tf.data.Dataset.from_tensor_slices(
+            (inputs, targets)
+        ).batch(10)
+        history = model.fit(
+            train_dataset, epochs=2, verbose=1, validation_data=val_dataset
+        )
+
+        # The training loss should be 0.0
+        self.assertAllClose(history.history["loss"][0], 0.0)
+        # The validation loss should be 1.0.
+ self.assertAllClose(history.history["val_loss"][0], 1.0) + + @test_combinations.run_with_all_model_types(exclude_models="sequential") + @test_combinations.run_all_keras_modes + def test_fit_on_arrays(self): + input_a = layers_module.Input(shape=(3,), name="input_a") + input_b = layers_module.Input(shape=(3,), name="input_b") + + dense = layers_module.Dense(4, name="dense") + dropout = layers_module.Dropout(0.5, name="dropout") + branch_a = [input_a, dense] + branch_b = [input_b, dense, dropout] + + model = test_utils.get_multi_io_model(branch_a, branch_b) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = "mse" + loss_weights = [1.0, 0.5] + model.compile( + optimizer, + loss, + metrics=[metrics_module.CategoricalAccuracy(), "mae"], + loss_weights=loss_weights, + run_eagerly=test_utils.should_run_eagerly(), + ) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + # Test fit at different verbosity + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0, + ) + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=1, + ) + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=2, + ) + model.train_on_batch( + [input_a_np, input_b_np], [output_d_np, output_e_np] + ) + + # Test with validation data + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + validation_data=( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + ), + epochs=1, + batch_size=5, + verbose=0, + ) + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + validation_data=( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + ), + epochs=2, + batch_size=5, + verbose=1, + ) + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + validation_data=( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + ), + epochs=2, + batch_size=5, + verbose=2, + ) + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + validation_data=[ + [input_a_np, input_b_np], + [output_d_np, output_e_np], + ], + epochs=2, + batch_size=5, + verbose=2, + ) + # Test with validation split + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=0, + validation_split=0.2, + ) + + if test_utils.get_model_type() == "functional": + # Test with dictionary inputs + model.fit( + {"input_a": input_a_np, "input_b": input_b_np}, + {"dense": output_d_np, "dropout": output_e_np}, + epochs=1, + batch_size=5, + verbose=0, + ) + model.fit( + {"input_a": input_a_np, "input_b": input_b_np}, + {"dense": output_d_np, "dropout": output_e_np}, + epochs=1, + batch_size=5, + verbose=1, + ) + model.fit( + {"input_a": input_a_np, "input_b": input_b_np}, + {"dense": output_d_np, "dropout": output_e_np}, + validation_data=( + {"input_a": input_a_np, "input_b": input_b_np}, + {"dense": output_d_np, "dropout": output_e_np}, + ), + epochs=1, + batch_size=5, + verbose=0, + ) + model.train_on_batch( + {"input_a": input_a_np, "input_b": input_b_np}, + {"dense": output_d_np, "dropout": output_e_np}, + ) + + # Test with lists for loss, metrics + loss = ["mae", "mse"] + model.compile( + optimizer, + loss, + metrics=[metrics_module.CategoricalAccuracy(), "mae"], + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + [input_a_np, input_b_np], + [output_d_np, 
output_e_np], + epochs=1, + batch_size=5, + verbose=0, + ) + + # Test with dictionaries for loss, metrics, loss weights + if test_utils.get_model_type() == "functional": + loss = {"dense": "mse", "dropout": "mae"} + loss_weights = {"dense": 1.0, "dropout": 0.5} + metrics = { + "dense": "mse", + "dropout": metrics_module.CategoricalAccuracy(), + } + model.compile( + optimizer, + loss, + metrics=metrics, + loss_weights=loss_weights, + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0, + ) + + # Build single-input model + x = layers_module.Input(shape=(3,), name="input_a") + y = layers_module.Dense(4)(x) + model = training_module.Model(x, y) + model.compile( + optimizer, loss="mse", run_eagerly=test_utils.should_run_eagerly() + ) + # This will work + model.fit([input_a_np], output_d_np, epochs=1) + + # Test model on a list of floats + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 4)) + + # Test execution on inputs that are lists of scalars. + # TF2 and TF1 have slightly different semantics: + if tf.executing_eagerly(): + # In TF2 to avoid any ambiguity when there are nested lists + # the entire input gets converted to a + # single numpy array (& it only works in the case of a single io + # model) + model.fit( + np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np), epochs=2, batch_size=5, - verbose=2) - else: - # In TF1 there was logic to try disambiguating between the individual - # inputs when lists are nested. This allowed multi-io functional models - # to support lists of scalars as input, but it caused ambiguity issues - # for subclass models & made it trickier to pass multi-dimensional inputs - # as lists of scalars to single io models. This was an excessive amount - # of complexity for what boiled down to a convenience method we were - # mainly just using for writing tests. - model.fit([np.ndarray.tolist(input_a_np)], + verbose=2, + ) + else: + # In TF1 there was logic to try disambiguating between the + # individual inputs when lists are nested. This allowed multi-io + # functional models to support lists of scalars as input, but it + # caused ambiguity issues for subclass models & made it trickier to + # pass multi-dimensional inputs as lists of scalars to single io + # models. This was an excessive amount of complexity for what boiled + # down to a convenience method we were mainly just using for writing + # tests. 
+ model.fit( + [np.ndarray.tolist(input_a_np)], [np.ndarray.tolist(input_b_np)], epochs=2, batch_size=5, - verbose=2) - - @test_combinations.run_all_keras_modes - def test_evaluate_predict_on_arrays(self): - a = layers_module.Input(shape=(3,), name='input_a') - b = layers_module.Input(shape=(3,), name='input_b') - - dense = layers_module.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = layers_module.Dropout(0.5, name='dropout')(c) - - model = training_module.Model([a, b], [d, e]) - - optimizer = RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - loss_weights = [1., 0.5] - model.compile( - optimizer, - loss, - metrics=['mae', metrics_module.CategoricalAccuracy()], - loss_weights=loss_weights, - sample_weight_mode=None, - run_eagerly=test_utils.should_run_eagerly()) - - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - - output_d_np = np.random.random((10, 4)) - output_e_np = np.random.random((10, 4)) - - # Test evaluate at different verbosity - out = model.evaluate( - [input_a_np, input_b_np], [output_d_np, output_e_np], - batch_size=5, - verbose=0) - self.assertEqual(len(out), 7) - out = model.evaluate( - [input_a_np, input_b_np], [output_d_np, output_e_np], - batch_size=5, - verbose=1) - self.assertEqual(len(out), 7) - out = model.evaluate( - [input_a_np, input_b_np], [output_d_np, output_e_np], - batch_size=5, - verbose=2) - self.assertEqual(len(out), 7) - out = model.test_on_batch([input_a_np, input_b_np], - [output_d_np, output_e_np]) - self.assertEqual(len(out), 7) - - # Test evaluate with dictionary inputs - model.evaluate( - { - 'input_a': input_a_np, - 'input_b': input_b_np - }, { - 'dense': output_d_np, - 'dropout': output_e_np - }, - batch_size=5, - verbose=0) - model.evaluate( - { - 'input_a': input_a_np, - 'input_b': input_b_np - }, { - 'dense': output_d_np, - 'dropout': output_e_np - }, - batch_size=5, - verbose=1) - - # Test predict - out = model.predict([input_a_np, input_b_np], batch_size=5) - self.assertEqual(len(out), 2) - out = model.predict({'input_a': input_a_np, 'input_b': input_b_np}) - self.assertEqual(len(out), 2) - out = model.predict_on_batch({ - 'input_a': input_a_np, - 'input_b': input_b_np - }) - self.assertEqual(len(out), 2) - - def _make_sequence_input_functions(self, input_type): - # train and test - xy_namedtuple = collections.namedtuple('xy_namedtuple', ['x', 'y']) - - # predict - x_namedtuple = collections.namedtuple('x_namedtuple', ['x']) - - if input_type == 'dataset': - dataset = tf.data.Dataset.range(16).map( - lambda _: tf.ones(shape=(1,))) - - xy_dataset = tf.data.Dataset.zip((dataset, dataset)).batch(4) - x_dataset = dataset.batch(4) - def xy_function(use_namedtuple): - return xy_dataset.map(xy_namedtuple) if use_namedtuple else xy_dataset - - def x_function(use_namedtuple): - return x_dataset.map(x_namedtuple) if use_namedtuple else x_dataset - - return xy_function, x_function - - elif input_type == 'generator': - def xy_generator(use_namedtuple): - x, y = np.ones((4, 1)), np.ones((4, 1)) - for _ in range(4): - if use_namedtuple: - yield xy_namedtuple(x, y) - else: - yield x, y - - def x_generator(use_namedtuple): - x = np.ones((4, 1)) - for _ in range(4): - if use_namedtuple: - yield x_namedtuple(x) - else: - yield x - - return xy_generator, x_generator - - elif input_type == 'sequence': - class XYSequence(data_utils.Sequence): - - def __init__(self, use_namedtuple): - self._use_namedtuple = use_namedtuple - super().__init__() - - def __getitem__(self, idx): - x, y = np.ones((4, 1)), np.ones((4, 1)) 
- if self._use_namedtuple: - return xy_namedtuple(x, y) - return x, y - - def __len__(self): - return 4 - - class XSequence(data_utils.Sequence): - - def __init__(self, use_namedtuple): - self._use_namedtuple = use_namedtuple - super().__init__() - - def __getitem__(self, idx): - x = np.ones((4, 1)) - if self._use_namedtuple: - return x_namedtuple(x) - return x - - def __len__(self): - return 4 - - return XYSequence, XSequence - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @test_combinations.run_with_all_model_types - @parameterized.named_parameters( - ('dataset', 'dataset'), - ('generator', 'generator'), - ('sequence', 'sequence'), - ) - def test_sequence_input_types(self, input_type): - """Ensure that namedtuples and tuples are plumbed identically.""" - if not tf.executing_eagerly(): - self.skipTest('Improved checking is only present in data_adapter.') - - xy_function, x_function = self._make_sequence_input_functions(input_type) - fit_kwargs, evaluate_kwargs, predict_kwargs = {}, {}, {} - if input_type == 'generator': - fit_kwargs['steps_per_epoch'] = 4 - evaluate_kwargs['steps'] = 4 - predict_kwargs['steps'] = 4 - - model = test_utils.get_small_mlp(1, 1, 1) - model.compile( - loss='mse', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - - model.fit(xy_function(use_namedtuple=False), **fit_kwargs) - model.evaluate(xy_function(use_namedtuple=False), **evaluate_kwargs) - model.predict(x_function(use_namedtuple=False), **predict_kwargs) - - @test_combinations.run_all_keras_modes - def test_custom_mapping_in_config(self): - - class MyModel(training_module.Model): - - def call(self, inputs): - return inputs - - def get_config(self): - self.a = {} - return {'a': self.a} - - model = MyModel() - self.assertIn('{"a": {}}', model.to_json()) - - def test_training_on_sparse_data_with_dense_placeholders_v1(self): - with tf.Graph().as_default(): - if scipy_sparse is None: - return - - test_inputs = [ - scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2) - ] - test_outputs = [ - scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5) - ] - in1 = layers_module.Input(shape=(3,)) - in2 = layers_module.Input(shape=(3,)) - out1 = layers_module.Dropout(0.5, name='dropout')(in1) - out2 = layers_module.Dense(4, name='dense_1')(in2) - model = training_module.Model([in1, in2], [out1, out2]) - model.predict(test_inputs, batch_size=2) - optimizer = 'rmsprop' - model.compile( - optimizer, - 'mse', - metrics=['mae', metrics_module.CategoricalAccuracy()]) - model.fit(test_inputs, test_outputs, - epochs=1, batch_size=2, validation_split=0.5) - model.evaluate(test_inputs, test_outputs, batch_size=2) - - @test_combinations.run_all_keras_modes - def test_compile_with_sparse_placeholders(self): - inputs = layers_module.Input(shape=(10,), sparse=True) - weights = tf.Variable( - np.ones((10, 1)).astype(np.float32), name='weights') - weights_mult = lambda x: tf.sparse.sparse_dense_matmul(x, weights) - output_layer = layers_module.Lambda(weights_mult)(inputs) - model = training_module.Model([inputs], output_layer) - model.compile( - loss='binary_crossentropy', - optimizer='adam', - metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - - @test_combinations.run_all_keras_modes - def test_that_trainable_disables_updates(self): - val_a = np.random.random((10, 4)) - val_out = np.random.random((10, 4)) - - a = layers_module.Input(shape=(4,)) - layer = layers_module.BatchNormalization(input_shape=(4,)) - b = layer(a) - model = training_module.Model(a, 
b) - - model.trainable = False - if not tf.compat.v1.executing_eagerly_outside_functions(): - self.assertEmpty(model.updates) - - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - if not tf.compat.v1.executing_eagerly_outside_functions(): - self.assertEmpty(model.updates) - - x1 = model.predict(val_a) - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - self.assertAllClose(x1, x2, atol=1e-7) - - model.trainable = True - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - if not tf.compat.v1.executing_eagerly_outside_functions(): - self.assertAllGreater(len(model.updates), 0) - - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - assert np.abs(np.sum(x1 - x2)) > 1e-5 - - layer.trainable = False - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - if not tf.compat.v1.executing_eagerly_outside_functions(): - self.assertEmpty(model.updates) - - x1 = model.predict(val_a) - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - self.assertAllClose(x1, x2, atol=1e-7) - - def test_weight_deduplication_in_methods(self): - inp = layers_module.Input(shape=(1,)) - bn = layers_module.BatchNormalization() - d = layers_module.Dense(1) - - m0 = training_module.Model(inp, d(bn(inp))) - m1 = training_module.Model(inp, d(bn(inp))) - - x0 = m0(inp) - x1 = m1(inp) - x = layers_module.Add()([x0, x1]) - - model = training_module.Model(inp, x) - self.assertLen(model.trainable_weights, 4) - self.assertLen(model.non_trainable_weights, 2) - self.assertLen(model.weights, 6) - - @test_combinations.run_all_keras_modes - def test_weight_deduplication(self): - - class WatchingLayer(layers_module.Layer): - - def __init__(self, dense_to_track): - # This will cause the kernel and bias to be double counted, effectively - # doubling the learning rate if weights are not deduped. - self._kernel = dense_to_track.kernel - self._bias = dense_to_track.bias - super().__init__() - - inp = layers_module.Input(shape=(1,)) - dense_layer = layers_module.Dense(1) - dense_output = dense_layer(inp) # This will build the dense kernel - - # Deterministically set weights to make the test repeatable. - dense_layer.set_weights([np.ones((1, 1)), np.zeros((1,))]) - output = WatchingLayer(dense_layer)(dense_output) - - model = training_module.Model(inp, output) - - # 0.25 is the edge of the radius of convergence for the double apply case. - # At lr=0.24, the double apply case will very slowly descend while the - # correct case will drop very quickly. - model.compile( - loss='mse', - optimizer=optimizer_v2.gradient_descent.SGD(0.24), - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((64 * 2,)) - y = 4.5 * x - 3. - - history = model.fit(x, y, batch_size=64, epochs=2, verbose=2) - - # If the gradient apply is duplicated then the loss after 2 epochs will - # be ~0.15, compared to the correct answer of O(1e-7). - self.assertLess(history.history['loss'][-1], 1e-6) - - @test_combinations.run_all_keras_modes - def test_weight_shared_across_layers(self): - - class AddWeightLayer(layers_module.Layer): - - def __init__(self, trainable_var, non_trainable_var): - self.trainable_var = trainable_var - self.non_trainable_var = non_trainable_var - super().__init__() - - def call(self, inputs): - return inputs + self.trainable_var - - class LayerWithWeightSharedLayers(layers_module.Layer): - - def __init__(self): - super().__init__() - shared_trainable_var = tf.Variable(1.) 
- shared_non_trainable_var = tf.Variable( - 1., trainable=False) - self.layer1 = AddWeightLayer(shared_trainable_var, - shared_non_trainable_var) - self.layer2 = AddWeightLayer(shared_trainable_var, - shared_non_trainable_var) - - def call(self, inputs): - return self.layer2(self.layer1(inputs)) - - l = LayerWithWeightSharedLayers() - layers = list(l._flatten_layers(include_self=False, recursive=False)) - self.assertEqual(layers, [l.layer1, l.layer2]) - self.assertEqual(l.variables, - [l.layer1.trainable_var, l.layer1.non_trainable_var]) - self.assertEqual(l.trainable_variables, [l.layer1.trainable_var]) - self.assertEqual(l.non_trainable_variables, [l.layer1.non_trainable_var]) - self.assertLen(l.get_weights(), 2) - - @test_combinations.run_all_keras_modes - def test_weight_tracking_for_template(self): - def variable_scoped_function(trainable=True): - return tf.compat.v1.get_variable( - 'dummy', shape=[1], trainable=trainable, - initializer=tf.compat.v1.zeros_initializer()) - def nested_template(): - nested1 = tf.compat.v1.make_template('nested', variable_scoped_function) - nested2 = tf.compat.v1.make_template('nested', variable_scoped_function) - v1 = nested1() - v2 = nested2() - - # nested1 and nested2 should not share variables - self.assertIsNot(v1, v2) - - # Variables created by nested1 should be isolated from variables - # created by nested2. - self.assertEqual(1, len(nested1.variables)) - self.assertEqual(1, len(nested2.variables)) - self.assertIs(nested1.variables[0], v1) - self.assertIs(nested2.variables[0], v2) - self.assertEqual(1, len(nested1.trainable_variables)) - self.assertEqual(1, len(nested2.trainable_variables)) - self.assertIs(nested1.trainable_variables[0], v1) - self.assertIs(nested2.trainable_variables[0], v2) - self.assertEqual(len(nested1.non_trainable_variables), 0) - self.assertEqual(len(nested2.non_trainable_variables), 0) - return v1, v2 - - tmpl1 = tf.compat.v1.make_template('s1', nested_template) - tmpl2 = tf.compat.v1.make_template('s1', nested_template) - - v1, v2 = tmpl1() - v5, v6 = tmpl2() - - model = training_module.Model() - model.template = tmpl1 - self.assertEqual(2, len(model.variables)) - self.assertIs(model.variables[0], v1) - self.assertIs(model.variables[1], v2) - self.assertEqual(2, len(model.variables)) - self.assertIs(model.trainable_variables[0], v1) - self.assertIs(model.trainable_variables[1], v2) - self.assertEqual(len(model.non_trainable_variables), 0) - model.templates = [tmpl2] - for v, w in zip(model.variables, [v1, v2, v5, v6]): - self.assertIs(v, w) - for v, w in zip(model.trainable_variables, [v1, v2, v5, v6]): - self.assertIs(v, w) - self.assertEqual(len(model.non_trainable_variables), 0) - # Make sure losses, layers, and updates aren't broken by having a Template - # in the mix, which does not expose any updates or losses. 
- self.assertEqual([], model.layers) - self.assertEqual([], model.updates) - self.assertEqual([], model.losses) - self.assertEqual([], model.templates.layers) - self.assertEqual([], model.templates.updates) - self.assertEqual([], model.templates.losses) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_logs_passed_to_callbacks(self): - input_dim = 5 - num_classes = 1 - - class TestCallback(Callback): - - def __init__(self): - super().__init__() - self.epoch_end_logs = None - self.batch_end_logs = None - self.epoch_end_call_count = 0 - self.batch_end_call_count = 0 - - def on_epoch_end(self, epoch, logs=None): - self.epoch_end_logs = logs - self.epoch_end_call_count += 1 - - def on_batch_end(self, batch, logs=None): - self.batch_end_logs = logs - self.batch_end_call_count += 1 - - model = test_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=num_classes, input_dim=input_dim) - model.compile( - loss='binary_crossentropy', - metrics=['acc'], - weighted_metrics=['mae'], - optimizer=RMSPropOptimizer(learning_rate=0.01), - run_eagerly=test_utils.should_run_eagerly()) - - np.random.seed(1337) - (x_train, y_train), (_, _) = test_utils.get_test_data( - train_samples=10, - test_samples=10, - input_shape=(input_dim,), - num_classes=num_classes) - - test_callback = TestCallback() - model.fit( - x_train, - y_train, - batch_size=2, - epochs=2, - verbose=0, - callbacks=[test_callback], - validation_data=(x_train, y_train)) - self.assertEqual(test_callback.batch_end_call_count, 10) - self.assertEqual(test_callback.epoch_end_call_count, 2) - - self.assertSetEqual( - set(test_callback.batch_end_logs.keys()), set(['acc', 'loss', 'mae'])) - self.assertSetEqual( - set(test_callback.epoch_end_logs.keys()), - set(['acc', 'loss', 'mae', 'val_acc', 'val_loss', 'val_mae'])) - - @test_combinations.run_all_keras_modes - def test_mismatched_output_shape_and_target_shape(self): - model = sequential.Sequential([ - layers_module.Dense(2, input_shape=(3, 4)), - layers_module.Dense(5), - ]) - model.compile( - RMSPropOptimizer(learning_rate=0.001), - loss='sparse_categorical_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - # Test with Numpy data - x_train = np.random.random((10, 3, 4)).astype(np.float32) - y_train = np.random.randint(0, 5, size=(10, 3)).astype(np.float32) - model.fit(x_train, y_train, batch_size=5, epochs=1) - - # Test with iterator - dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - dataset = dataset.repeat(10) - dataset = dataset.batch(10) - model.fit(dataset, epochs=1, steps_per_epoch=2) - - if tf.executing_eagerly(): - # Test with eager execution - model.compile(RMSPropOptimizer(learning_rate=0.001), - loss='sparse_categorical_crossentropy', - run_eagerly=True) - model.fit(x_train, y_train, batch_size=5, epochs=1) - - # Test with eager execution and iterator - model.fit(dataset, epochs=1, steps_per_epoch=2) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_losses_in_defun(self): - layer = layers_module.Dense(1, kernel_regularizer='l1') - layer(tf.ones([1, 10])) - - @tf.function - def get_losses(): - return layer.losses - - self.assertAllEqual( - self.evaluate(layer.losses), self.evaluate(get_losses())) - - @test_combinations.run_all_keras_modes - def test_logging(self): - mock_stdout = io.StringIO() - model = sequential.Sequential() - model.add(layers_module.Dense(10, activation='relu')) - model.add(layers_module.Dense(1, activation='sigmoid')) - model.compile( - RMSPropOptimizer(learning_rate=0.001), - 
loss='binary_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - io_utils.enable_interactive_logging() - with tf.compat.v1.test.mock.patch.object(sys, 'stdout', mock_stdout): - model.fit( - np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'), epochs=10) - self.assertTrue('Epoch 5/10' in mock_stdout.getvalue()) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_training_with_loss_instance(self): - a = layers_module.Input(shape=(3,), name='input_a') - b = layers_module.Input(shape=(3,), name='input_b') - - dense = layers_module.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = layers_module.Dropout(0.5, name='dropout')(c) - - model = training_module.Model([a, b], [d, e]) - loss_weights = [1., 0.5] - model.compile( - RMSPropOptimizer(learning_rate=0.001), - loss=losses.MeanSquaredError(), - metrics=[metrics_module.CategoricalAccuracy(), 'mae'], - loss_weights=loss_weights) - - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - - output_d_np = np.random.random((10, 4)) - output_e_np = np.random.random((10, 4)) - - model.fit([input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - batch_size=5) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_static_batch_in_input_layer(self): - if tf.executing_eagerly(): - self.skipTest('Not inferred in eager.') - - class Counter(Callback): - - def __init__(self): - self.batches = 0 - - def on_batch_end(self, batch, logs=None): - self.batches += 1 - - x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32') - - for batch_size, expected_batches in [(None, 2), (4, 16)]: - inputs = input_layer.Input(batch_size=batch_size, shape=(10,)) - outputs = layers_module.Dense(1, activation='sigmoid')(inputs) - model = training_module.Model(inputs, outputs) - - model.compile(optimizer_v2.adam.Adam(0.001), 'binary_crossentropy') - counter = Counter() - model.fit(x, y, callbacks=[counter]) - self.assertEqual(counter.batches, expected_batches) - - model = sequential.Sequential( - [layers_module.Dense(1, batch_input_shape=(batch_size, 10))]) - model.compile(optimizer_v2.adam.Adam(0.001), 'binary_crossentropy') - counter = Counter() - model.fit(x, y, callbacks=[counter]) - self.assertEqual(counter.batches, expected_batches) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_static_batch_in_input_layer_consistency_checks(self): - if tf.executing_eagerly(): - self.skipTest('Not inferred in eager.') - x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32') - - inputs = input_layer.Input(batch_size=2, shape=(10,)) - outputs = layers_module.Dense(1, activation='sigmoid')(inputs) - model = training_module.Model(inputs, outputs) - model.compile(optimizer_v2.adam.Adam(0.001), 'binary_crossentropy') - with self.assertRaisesRegex(ValueError, - 'incompatible with the specified batch size'): - model.fit(x, y, batch_size=4) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_compatible_batch_size_functional_model(self): - - class MyLayer(layers_module.Layer): - - def call(self, inputs): - return tf.concat(inputs, axis=0) - - input1 = input_layer.Input(batch_size=2, shape=(10,)) - input2 = input_layer.Input(batch_size=3, shape=(10,)) - outputs = MyLayer()([input1, input2]) - with tf.compat.v1.test.mock.patch.object( - logging, 'warning') as mock_warn: - training_module.Model([input1, input2], outputs) - self.assertEqual( - 
mock_warn.call_args_list[0][0][0], - 'Found incompatible static batch sizes among the inputs. ' - 'Batch sizes: [2, 3]') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_calling_subclass_model_on_different_datasets(self): - - class SubclassedModel(training_module.Model): - - def call(self, inputs): - return inputs * 2 - - model = SubclassedModel() - dataset_one = tf.data.Dataset.from_tensor_slices([[0], [1]]).batch(2) - dataset_two = tf.data.Dataset.from_tensor_slices( - [[3], [4], [5], [6], [7], [8]]).batch(2) - self.assertAllEqual([[0], [2]], model.predict(dataset_one, steps=1)) - self.assertAllEqual([[6], [8], [10], [12]], - model.predict(dataset_two, steps=2)) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_training_on_sparse_categorical_crossentropy_loss_with_softmax(self): - np.random.seed(1337) - train_x = np.ones((100, 4)) - train_y = np.random.randint(0, 1, size=(100, 1)) - - reference_model = test_utils.get_small_sequential_mlp(16, 2, - input_dim=4) - reference_model.compile(loss='sparse_categorical_crossentropy', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=True) - fixed_weights = reference_model.get_weights() - reference_model_loss = reference_model.train_on_batch(train_x, train_y) - - test_model = test_utils.get_small_sequential_mlp(16, 2, input_dim=4) - test_model.compile(loss='sparse_categorical_crossentropy', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=False) - test_model.set_weights(fixed_weights) - test_model_loss = test_model.train_on_batch(train_x, train_y) - self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_training_on_categorical_crossentropy_loss_with_softmax(self): - np.random.seed(1337) - train_x = np.ones((100, 4)) - train_y = np_utils.to_categorical( - np.random.randint(0, 1, size=(100, 1)), 2) - - reference_model = test_utils.get_small_sequential_mlp(16, 2, - input_dim=4) - reference_model.compile(loss='categorical_crossentropy', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=True) - fixed_weights = reference_model.get_weights() - reference_model_loss = reference_model.train_on_batch(train_x, train_y) - - test_model = test_utils.get_small_sequential_mlp(16, 2, input_dim=4) - test_model.compile(loss='categorical_crossentropy', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=False) - test_model.set_weights(fixed_weights) - test_model_loss = test_model.train_on_batch(train_x, train_y) - self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_training_on_binary_crossentropy_loss(self): - train_x = np.ones((100, 4), dtype=np.float32) - train_y = np.ones((100, 1), dtype=np.float32) - reference_model = test_utils.get_small_sequential_mlp(16, 1, - input_dim=4) - reference_model.compile(loss='binary_crossentropy', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=True) - fixed_weights = reference_model.get_weights() - reference_model_loss = reference_model.train_on_batch(train_x, train_y) - - test_model = test_utils.get_small_sequential_mlp(16, 1, input_dim=4) - test_model.compile(loss='binary_crossentropy', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=False) - test_model.set_weights(fixed_weights) - test_model_loss = test_model.train_on_batch(train_x, train_y) - 
self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - ('default', 1, 4), ('integer_two', 2, 2), ('integer_four', 4, 1), - ('simple_list', [1, 3, 4], 3), ('duplicated_list', [4, 2, 2], 2)) - def test_validation_freq(self, validation_freq, expected_runs): - x, y = np.ones((10, 10)), np.ones((10, 1)) - model = test_utils.get_small_mlp(2, 1, 10) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - class ValCounter(Callback): - - def __init__(self): - self.val_runs = 0 - - def on_test_begin(self, logs=None): - self.val_runs += 1 - - val_counter = ValCounter() - model.fit( - x, - y, - epochs=4, - validation_data=(x, y), - validation_freq=validation_freq, - callbacks=[val_counter]) - self.assertEqual(val_counter.val_runs, expected_runs) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_validation_steps_without_data(self): - if tf.executing_eagerly(): - self.skipTest('Check removed in new `fit`') - x, y = np.ones((10, 10)), np.ones((10, 1)) - model = test_utils.get_small_mlp(2, 1, 10) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - with self.assertRaisesRegex( - ValueError, '`validation_steps` should not be specified if ' - '`validation_data` is None.'): - model.fit(x, y, epochs=4, validation_data=None, validation_steps=3) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_layer_with_variable_output(self): - - class VariableOutputLayer(layers_module.Layer): - - def build(self, input_shape): - self.v = self.add_weight('output_var', shape=(2, 5), initializer='ones') - - def call(self, inputs): - return self.v - - model = test_utils.get_model_from_layers( - [VariableOutputLayer(), layers_module.Dense(1)], input_shape=(10,)) - # TODO(omalleyt): Make this work with `run_eagerly=True`. - model.compile('sgd', 'mse', run_eagerly=False) - model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, epochs=5) - - self.assertLen(model.trainable_variables, 3) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - @test_utils.enable_v2_dtype_behavior - def test_model_dtype(self): - - class AssertTypeLayer(layers_module.Layer): - - def call(self, inputs): - assert inputs.dtype.name == self.dtype, ( - 'Input tensor has type %s which does not match assert type %s' % - (inputs.dtype.name, self.assert_type)) - return inputs + 1. 
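A minimal sketch (not part of the patch) of the `validation_freq` semantics that `test_validation_freq` above pins down, assuming the public `tf.keras` API in TF 2.x: an integer N runs validation every N epochs, while a list runs it only at the listed epoch numbers.

import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(10,))])
model.compile("sgd", "mse")
x, y = np.ones((10, 10)), np.ones((10, 1))

# Integer form: validate every 2nd epoch -> validation runs after epochs 2, 4.
model.fit(x, y, epochs=4, validation_data=(x, y), validation_freq=2, verbose=0)

# List form: validate only at these epochs -> validation runs after 1, 3, 4.
model.fit(
    x, y, epochs=4, validation_data=(x, y), validation_freq=[1, 3, 4], verbose=0
)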
- - for dtype in ('float16', 'float32', 'float64'): - model = test_utils.get_model_from_layers( - [AssertTypeLayer(dtype=dtype)], input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((10, 10)) - y = np.ones((10, 10)) - model.fit(x, y) - model.test_on_batch(x, y) - model(x) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - @test_utils.enable_v2_dtype_behavior - def test_model_input_dtype(self): - model = test_utils.get_small_mlp(1, 10, 10) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x = np.ones((10, 10)).astype(np.float64) - y = np.ones((10, 10)).astype(np.float64) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) - model.fit(dataset) - self.assertEqual(model._compute_dtype, 'float32') - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_subclassed_model_with_training_arg(self): - - class LayerWithTrainingArg(layers_module.Layer): - - def call(self, inputs, training=None): - self.training = training - return inputs - - class ModelWithTrainingArg(training_module.Model): - - def __init__(self): - super().__init__() - self.l1 = LayerWithTrainingArg() - - def call(self, inputs, training=None): - self.training = training - inputs = self.l1(inputs, training=training) - return inputs - - x = np.zeros((1, 2)) - model = ModelWithTrainingArg() - model.compile( - loss='mse', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, x, epochs=1) - - if tf.executing_eagerly(): - expected_training_arg = True - else: - expected_training_arg = backend.symbolic_learning_phase() - - self.assertIs(model.training, expected_training_arg) - self.assertIs(model.l1.training, expected_training_arg) - - @test_combinations.run_all_keras_modes - def test_error_when_model_is_not_compiled(self): - inputs = input_layer.Input(shape=(1,)) - outputs = layers_module.Dense(1)(inputs) - model = training_module.Model(inputs, outputs) - with self.assertRaisesRegex(RuntimeError, 'must compile your model'): - model.fit(np.ones((1, 1)), np.ones((1, 1))) - - class MyModel(training_module.Model): - - def call(self, x): - self.add_loss(tf.reduce_sum(x)) - return x - - model = MyModel() - with self.assertRaisesRegex(RuntimeError, 'must compile your model'): - model.fit(np.random.random((32, 1)), epochs=2) - - @test_combinations.run_all_keras_modes - @test_utils.enable_v2_dtype_behavior - def test_losses_of_different_dtypes(self): - inp = input_layer.Input(shape=(2,)) - out_1 = layers_module.Dense( - 2, dtype='float32', kernel_regularizer='l2')( - inp) - out_2 = layers_module.Dense( - 2, dtype='float16', kernel_regularizer='l2')( - inp) - model = training_module.Model(inp, [out_1, out_2]) - extra_loss = tf.reduce_sum(tf.cast(out_2, 'float64')) - model.add_loss(extra_loss) - model.compile('sgd', ['mse', 'mse'], - run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 2)), np.ones((10, 2)) - model.fit(x, [y, y]) - - @test_combinations.run_all_keras_modes - @test_utils.enable_v2_dtype_behavior - def test_losses_of_different_dtypes_with_subclassed_model(self): - - class MyModel(training_module.Model): - - def build(self, _): - self.dense = layers_module.Dense(2) - - def call(self, inputs): - self.add_loss(tf.cast(tf.nn.l2_loss(inputs), 'float64')) - return self.dense(inputs) - - model = MyModel(dtype='float32') - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 2)), np.ones((10, 2)) 
- model.fit(x, y) - - @test_combinations.run_all_keras_modes - @test_utils.enable_v2_dtype_behavior - def test_regularizer_of_different_dtype(self): - inp = input_layer.Input(shape=(2,)) - - def regularizer(weight): - return tf.cast(tf.nn.l2_loss(weight), 'float64') - - out = layers_module.Dense( - 2, dtype='float32', kernel_regularizer=regularizer)( - inp) - model = training_module.Model(inp, out) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 2)), np.ones((10, 2)) - model.fit(x, y) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_outputs_are_floats(self): - x, y = np.ones((10, 1)), np.ones((10, 1)) - model = sequential.Sequential([layers_module.Dense(1)]) - model.compile('sgd', 'mse', metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit(x, y, epochs=2) - self.assertIsInstance(history.history['loss'][0], float) - self.assertIsInstance(history.history['accuracy'][0], float) - - loss, accuracy = model.train_on_batch(x, y) - self.assertIsInstance(loss, float) - self.assertIsInstance(accuracy, float) - - loss, accuracy = model.evaluate(x, y) - self.assertIsInstance(loss, float) - self.assertIsInstance(accuracy, float) - - loss, accuracy = model.test_on_batch(x, y) - self.assertIsInstance(loss, float) - self.assertIsInstance(accuracy, float) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_int_output(self): - x, y = np.ones((10, 1)), np.ones((10, 1)) - model = sequential.Sequential([layers_module.Dense(1)]) - - class MyMetric(metrics_module.Metric): - - def update_state(self, y_true, y_pred, sample_weight=None): - del y_true, y_pred, sample_weight - - def result(self): - return tf.constant(1, dtype='int64') - - model.compile('sgd', 'mse', metrics=[MyMetric()], - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(x, y, epochs=2) - self.assertIsInstance(history.history['my_metric'][0], int) - - @test_combinations.run_all_keras_modes - def test_calling_aggregate_gradient(self): - - class _Optimizer(optimizer_v2.gradient_descent.SGD): - """Mock optimizer to check if _aggregate_gradient is called.""" - - _HAS_AGGREGATE_GRAD = True - - def __init__(self): - self.aggregate_gradients_called = False - super().__init__(name='MyOptimizer') - - def _aggregate_gradients(self, grads): - self.aggregate_gradients_called = True - return super()._aggregate_gradients(grads) - - mock_optimizer = _Optimizer() - - model = sequential.Sequential() - model.add(layers_module.Dense(10, activation='relu')) - - model.compile(mock_optimizer, 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 10)), np.ones((10, 10)) - model.fit(x, y) - self.assertEqual(model.optimizer.aggregate_gradients_called, True) - - class _OptimizerOverrideApplyGradients(_Optimizer): - """Override apply_gradients. - - To test the case where the optimizer does not define the - experimental_aggregate_gradients parameter. 
- """ - - _HAS_AGGREGATE_GRAD = False - - def apply_gradients(self, grads_and_vars, name=None): # pylint: disable=useless-super-delegation - return super().apply_gradients(grads_and_vars, name) - - mock_optimizer = _OptimizerOverrideApplyGradients() - model.compile(mock_optimizer, 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 10)), np.ones((10, 10)) - model.fit(x, y) - self.assertEqual(model.optimizer.aggregate_gradients_called, True) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_gradients_are_none(self): - - class DenseWithExtraWeight(layers_module.Dense): - - def build(self, input_shape): - # Gradients w.r.t. extra_weights are None - self.extra_weight_1 = self.add_weight('extra_weight_1', shape=(), - initializer='ones') - super().build(input_shape) - self.extra_weight_2 = self.add_weight('extra_weight_2', shape=(), - initializer='ones') - - model = sequential.Sequential([DenseWithExtraWeight(4, input_shape=(4,))]) - # Test clipping can handle None gradients - opt = optimizer_v2.adam.Adam(clipnorm=1.0, clipvalue=1.0) - model.compile(opt, 'mse', run_eagerly=test_utils.should_run_eagerly()) - inputs = np.random.normal(size=(64, 4)) - targets = np.random.normal(size=(64, 4)) - old_kernel = model.get_weights()[1] - model.fit(inputs, targets) - new_kernel = model.get_weights()[1] - self.assertNotAllEqual(old_kernel, new_kernel) - - @test_combinations.run_all_keras_modes - def test_layer_ordering(self): - - class MyLayer(layers_module.Layer): - pass - - class MyModel(training_module.Model): - - def __init__(self, name): - super().__init__(name=name) - - self.weight = tf.Variable(0, name=name) - - self.direct_sublayer = MyLayer(name='direct') - self.direct_sublayer.d = {'d': MyLayer(name='direct/dict')} - - self.dict_sublayer = {'d': MyLayer(name='dict')} - self.dict_sublayer['d'].direct = MyLayer(name='dict/direct') - - model = MyModel('model') - # All sublayers, including self and recursive sublayers. - self.assertEqual(['model', 'direct', 'direct/dict', 'dict', 'dict/direct'], - [l.name for l in model._flatten_layers()]) - # Only direct sublayers, including those in data structures. - self.assertEqual(['direct', 'dict'], [l.name for l in model.layers]) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_trainable_state_setting(self): - - class UpdateLayer(layers_module.Layer): - - def __init__(self): - super().__init__() - self.v = tf.Variable(0., trainable=False) - - def call(self, x): - self.add_update(lambda: self.v.assign_add(1.)) - return x * self.v - - layer = UpdateLayer() - model_with_updates = sequential.Sequential([layer]) - model_with_updates.compile( - 'sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - layer.trainable = False - model_without_updates = sequential.Sequential([layer]) - model_without_updates.compile( - 'sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - x, y = np.ones((10, 1)), np.ones((10, 1)) - - self.assertEqual(self.evaluate(layer.v), 0.) - model_with_updates.fit(x, y, batch_size=10) - # assign_add called. - self.assertEqual(self.evaluate(layer.v), 1.) - model_without_updates.fit(x, y, batch_size=10) - # assign_add not called. - self.assertEqual(self.evaluate(layer.v), 1.) 
- - @test_combinations.run_all_keras_modes( - always_skip_v1=True) - @parameterized.named_parameters( - ('numpy_array', 'numpy_array'), - ('dataset_array', 'dataset_array'), - ('dataset_dict', 'dataset_dict')) - def test_single_input_no_tuple_wrapping(self, input_type): - x = np.ones((10, 1)) - - if input_type == 'numpy_array': - batch_size = 3 - expected_data_type = tf.Tensor - elif input_type == 'dataset_array': - x = tf.data.Dataset.from_tensor_slices(x).batch(3) - batch_size = None - expected_data_type = tf.Tensor - else: - x = {'my_input': x} - x = tf.data.Dataset.from_tensor_slices(x).batch(3) - batch_size = None - expected_data_type = dict - - test_case = self - - class MyModel(training_module.Model): - - def train_step(self, data): - # No tuple wrapping for single x input and no targets. - test_case.assertIsInstance(data, expected_data_type) - return super().train_step(data) - - def test_step(self, data): - test_case.assertIsInstance(data, expected_data_type) - return super().test_step(data) - - def predict_step(self, data): - test_case.assertIsInstance(data, expected_data_type) - return super().predict_step(data) - - inputs = layers_module.Input(shape=(1,), name='my_input') - outputs = layers_module.Dense(1)(inputs) - model = MyModel(inputs, outputs) - model.add_loss(tf.reduce_sum(outputs)) - model.compile('sgd') - model.fit(x, batch_size=batch_size) - model.evaluate(x, batch_size=batch_size) - model.predict(x, batch_size=batch_size) - - @test_combinations.run_all_keras_modes( - always_skip_v1=True) - @parameterized.named_parameters( - ('custom_metrics', False, True), - ('compiled_metrics', True, False), - ('both_compiled_and_custom_metrics', True, True)) - def test_evaluate_with_custom_test_step( - self, use_compiled_metrics, use_custom_metrics): - - class MyModel(training_module.Model): - - def test_step(self, data): - x, y = data - pred = self(x) - metrics = {} - if use_compiled_metrics: - self.compiled_metrics.update_state(y, pred) - self.compiled_loss(y, pred) - for metric in self.metrics: - metrics[metric.name] = metric.result() - if use_custom_metrics: - custom_metrics = { - 'mean': tf.reduce_mean(pred), - 'sum': tf.reduce_sum(pred) - } - metrics.update(custom_metrics) - return metrics - - inputs = layers_module.Input((2,)) - outputs = layers_module.Dense(3)(inputs) - model = MyModel(inputs, outputs) - if use_compiled_metrics: - model.compile('adam', 'mse', metrics=['mae', 'mape'], - run_eagerly=test_utils.should_run_eagerly()) - else: - model.compile('adam', 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x = np.random.random((4, 2)) - y = np.random.random((4, 3)) - results_list = model.evaluate(x, y) - results_dict = model.evaluate(x, y, return_dict=True) - self.assertLen(results_list, len(results_dict)) - if use_compiled_metrics and use_custom_metrics: - self.assertLen(results_list, 5) - self.assertEqual(results_list, - [results_dict['loss'], - results_dict['mae'], results_dict['mape'], - results_dict['mean'], results_dict['sum']]) - if use_compiled_metrics and not use_custom_metrics: - self.assertLen(results_list, 3) - self.assertEqual(results_list, - [results_dict['loss'], - results_dict['mae'], results_dict['mape']]) - if not use_compiled_metrics and use_custom_metrics: - self.assertLen(results_list, 2) - self.assertEqual(results_list, - [results_dict['mean'], results_dict['sum']]) - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_model_make_function(self): - layers = [ - layers_module.Dense(10, 
dtype=np.float64), - layers_module.Dense(10, dtype=np.float64) - ] - model = test_utils.get_model_from_layers(layers, input_shape=(1,)) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - original_train_function = model.make_train_function() - self.assertIsNotNone(original_train_function) - self.assertEqual(model.make_train_function(), original_train_function) - # Check that we regenerate it without reusing the cached version. - self.assertNotEqual( - model.make_train_function(force=True), original_train_function) - - original_test_function = model.make_test_function() - self.assertIsNotNone(original_test_function) - self.assertEqual(model.make_test_function(), original_test_function) - # Check that we regenerate it without reusing the cached version. - self.assertNotEqual( - model.make_test_function(force=True), original_test_function) - - original_predict_function = model.make_predict_function() - self.assertIsNotNone(original_predict_function) - self.assertEqual(model.make_predict_function(), original_predict_function) - # Check that we regenerate it without reusing the cached version. - self.assertNotEqual( - model.make_predict_function(force=True), original_predict_function) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_custom_compute_metrics(self): - - class CustomMetric(metrics_module.Mean): - - def sq_diff_plus_x(self, x, y_true, y_pred): - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - sq_diff_plus_x = tf.add(x, tf.math.squared_difference(y_pred, y_true)) - return backend.mean(sq_diff_plus_x, axis=-1) - - def update_state(self, x, y_true, y_pred, sample_weight=None): - matches = self.sq_diff_plus_x(x, y_true, y_pred) - return super().update_state(matches) - - class MyModel(sequential.Sequential): - - def compute_metrics(self, x, y, y_pred, sample_weight): - metric_results = super().compute_metrics(x, y, y_pred, - sample_weight) - self.custom_metric.update_state(x, y, y_pred, sample_weight) - metric_results['custom_metric_name'] = self.custom_metric.result() - return metric_results - - tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) - dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) - model = MyModel([layers_module.Dense(10)]) - model.custom_metric = CustomMetric('my_metric') - initial_result = model.custom_metric.result() - optimizer = optimizer_v2.gradient_descent.SGD() - model.compile(optimizer, loss='mse', steps_per_execution=10) - model.fit(dataset, epochs=2, steps_per_epoch=10, verbose=2) - after_fit_result = model.custom_metric.result() - - self.assertEqual(self.evaluate(initial_result), 0.0) - self.assertNotEqual(self.evaluate(initial_result), - self.evaluate(after_fit_result)) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_custom_compute_loss(self): - - class MyModel(training_module.Model): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.loss_metric = metrics_module.Mean(name='loss') - - def compute_loss(self, x, y, y_pred, sample_weight): - loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y)) - loss += tf.add_n(self.losses) - self.loss_metric.update_state(loss) - return loss - - def reset_metrics(self): - self.loss_metric.reset_states() - - @property - def metrics(self): - return [self.loss_metric] - - tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) - dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) - - inputs = 
layers_module.Input(shape=(10,), name='my_input') - outputs = layers_module.Dense(10)(inputs) - model = MyModel(inputs, outputs) - model.add_loss(tf.reduce_sum(outputs)) - - optimizer = optimizer_v2.gradient_descent.SGD() - model.compile(optimizer, loss='mse', steps_per_execution=10) - history = model.fit(dataset, epochs=2, steps_per_epoch=10) - self.assertLen(history.history['loss'], 2) - self.assertAllClose(history.history['loss'][1], model.loss_metric.result()) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_ema_overwrite(self): - - model = sequential.Sequential() - model.add(input_layer.Input(shape=(4,))) - model.add(layers_module.Dense(1, activation='relu')) - - tensors = tf.random.uniform((4, 4)), tf.random.uniform((4,)) - dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) - - optimizer = sgd_experimental.SGD(use_ema=True, ema_momentum=1) - model.compile(optimizer, loss='mse', steps_per_execution=10) - initial_value = tf.Variable(model.trainable_variables[0]) - history = model.fit(dataset, epochs=2, steps_per_epoch=10) - self.assertLen(history.history['loss'], 2) - self.assertAllClose(initial_value, model.trainable_variables[0]) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_get_verbosity(self): - class MyStrategy(tf.distribute.Strategy): - - def __init__(self): - self._should_use_with_coordinator = True - with self.assertRaisesRegex(ValueError, '`verbose=1` is not allowed'): - training_module._get_verbosity(1, MyStrategy()) - - io_utils.enable_interactive_logging() - self.assertEqual(training_module._get_verbosity('auto', MyStrategy()), 2) - self.assertEqual(training_module._get_verbosity( - 'auto', tf.distribute.MirroredStrategy()), 1) - self.assertEqual(training_module._get_verbosity( - 2, tf.distribute.MirroredStrategy()), 2) - - io_utils.disable_interactive_logging() - self.assertEqual(training_module._get_verbosity( - 'auto', tf.distribute.MirroredStrategy()), 2) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_save_spec(self): - - class Model(training_module.Model): - - def call(self, arg_input_1, arg_input_2, keyword_input, training=None): - return 0 - - # Test subclassed model save specs. - model = Model() - model(tf.ones([1, 1]), tf.ones([2, 2]), keyword_input=tf.ones([3, 3]), - training=False) - spec = model.save_spec(dynamic_batch=False) - self.assertEqual(spec[0][0].shape.as_list(), [1, 1]) - self.assertEqual(spec[0][1].shape.as_list(), [2, 2]) - self.assertEqual(spec[1]['keyword_input'].shape.as_list(), [3, 3]) - spec = model.save_spec(dynamic_batch=True) - self.assertEqual(spec[0][0].shape.as_list(), [None, 1]) - - # Test functional model save specs. - input_1 = layers_module.Input((1,), batch_size=1) - input_2 = layers_module.Input((2,), batch_size=2) - input_3 = layers_module.Input((3,), batch_size=3) - output = model(input_1, input_2, keyword_input=input_3, training=True) - functional = training_module.Model([input_1, input_2, input_3], output) - # Functional models should ignore dynamic_batch if the input layers have a - # known batch size. 
- spec = functional.save_spec(dynamic_batch=True) - input_specs = spec[0][0] - self.assertEqual(input_specs[0].shape.as_list(), [1, 1]) - self.assertEqual(input_specs[1].shape.as_list(), [2, 2]) - self.assertEqual(input_specs[2].shape.as_list(), [3, 3]) - - -class TestExceptionsAndWarnings(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @test_combinations.run_with_all_model_types - def test_fit_on_no_output(self): - inputs = layers_module.Input((3,)) - outputs = layers_module.Dense(2)(inputs) - model = training_module.Model(inputs, outputs) - model.compile('rmsprop', 'mse') - x = np.zeros((32, 3)) - with self.assertRaisesRegex(ValueError, 'Target data is missing..*'): - model.fit(x) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @test_combinations.run_with_all_model_types - def test_fit_on_wrong_output_type(self): - inputs1 = layers_module.Input((3,), name='a') - inputs2 = layers_module.Input((3,), name='b') - x = layers_module.Concatenate()([inputs1, inputs2]) - outputs = layers_module.Dense(2, name='c')(x) - model = training_module.Model([inputs1, inputs2], outputs) - model.compile('rmsprop', 'mse') - x = np.zeros((32, 3)) - y = np.zeros((32, 2)) - with self.assertRaisesRegex(ValueError, 'Target data is missing..*'): - model.fit({'a': x, 'b': x, 'c': y}) - - @test_combinations.run_all_keras_modes - def test_compile_warning_for_loss_missing_output(self): - with self.cached_session(): - inp = layers_module.Input(shape=(16,), name='input_a') - out_1 = layers_module.Dense(8, name='dense_1')(inp) - out_2 = layers_module.Dense( - 3, activation='softmax', name='dense_2')( - out_1) - model = training_module.Model(inputs=[inp], outputs=[out_1, out_2]) - optimizer = RMSPropOptimizer(learning_rate=0.001) - - model.compile( - optimizer, - loss={ - 'dense_2': 'categorical_crossentropy', - }, - metrics={ - 'dense_2': 'categorical_accuracy', - 'dense_1': metrics_module.CategoricalAccuracy(), - }, - run_eagerly=test_utils.should_run_eagerly()) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_predict_error_with_empty_x(self): - inputs = layers_module.Input(shape=(2,)) - outputs = layers_module.Dense(4)(inputs) - model = training_module.Model(inputs=inputs, outputs=outputs) - model.compile(loss='mse') - - with self.assertRaisesRegex(ValueError, - 'Unexpected result of `predict_function`.*'): - model.predict(np.array([])) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @parameterized.named_parameters( - ('dynamic', 0, False), - ('dynamic_multistep', 10, False), - ('static', 0, True), - ('static_multistep', 10, True), - ) - def test_predict_structured(self, spe, static_batch): - inputs = layers_module.Input(shape=(2,)) - outputs = layers_module.Dense(2)(inputs) - model = training_module.Model( - inputs=inputs, - outputs={'out': outputs}, + verbose=2, + ) + + @test_combinations.run_all_keras_modes + def test_evaluate_predict_on_arrays(self): + a = layers_module.Input(shape=(3,), name="input_a") + b = layers_module.Input(shape=(3,), name="input_b") + + dense = layers_module.Dense(4, name="dense") + c = dense(a) + d = dense(b) + e = layers_module.Dropout(0.5, name="dropout")(c) + + model = training_module.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = "mse" + loss_weights = [1.0, 0.5] + model.compile( + optimizer, + loss, + metrics=["mae", metrics_module.CategoricalAccuracy()], + loss_weights=loss_weights, + sample_weight_mode=None, + 
run_eagerly=test_utils.should_run_eagerly(), + ) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + # Test evaluate at different verbosity + out = model.evaluate( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + batch_size=5, + verbose=0, + ) + self.assertEqual(len(out), 7) + out = model.evaluate( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + batch_size=5, + verbose=1, + ) + self.assertEqual(len(out), 7) + out = model.evaluate( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + batch_size=5, + verbose=2, + ) + self.assertEqual(len(out), 7) + out = model.test_on_batch( + [input_a_np, input_b_np], [output_d_np, output_e_np] + ) + self.assertEqual(len(out), 7) + + # Test evaluate with dictionary inputs + model.evaluate( + {"input_a": input_a_np, "input_b": input_b_np}, + {"dense": output_d_np, "dropout": output_e_np}, + batch_size=5, + verbose=0, + ) + model.evaluate( + {"input_a": input_a_np, "input_b": input_b_np}, + {"dense": output_d_np, "dropout": output_e_np}, + batch_size=5, + verbose=1, + ) + + # Test predict + out = model.predict([input_a_np, input_b_np], batch_size=5) + self.assertEqual(len(out), 2) + out = model.predict({"input_a": input_a_np, "input_b": input_b_np}) + self.assertEqual(len(out), 2) + out = model.predict_on_batch( + {"input_a": input_a_np, "input_b": input_b_np} + ) + self.assertEqual(len(out), 2) + + def _make_sequence_input_functions(self, input_type): + # train and test + xy_namedtuple = collections.namedtuple("xy_namedtuple", ["x", "y"]) + + # predict + x_namedtuple = collections.namedtuple("x_namedtuple", ["x"]) + + if input_type == "dataset": + dataset = tf.data.Dataset.range(16).map( + lambda _: tf.ones(shape=(1,)) + ) + + xy_dataset = tf.data.Dataset.zip((dataset, dataset)).batch(4) + x_dataset = dataset.batch(4) + + def xy_function(use_namedtuple): + return ( + xy_dataset.map(xy_namedtuple) + if use_namedtuple + else xy_dataset + ) + + def x_function(use_namedtuple): + return ( + x_dataset.map(x_namedtuple) if use_namedtuple else x_dataset + ) + + return xy_function, x_function + + elif input_type == "generator": + + def xy_generator(use_namedtuple): + x, y = np.ones((4, 1)), np.ones((4, 1)) + for _ in range(4): + if use_namedtuple: + yield xy_namedtuple(x, y) + else: + yield x, y + + def x_generator(use_namedtuple): + x = np.ones((4, 1)) + for _ in range(4): + if use_namedtuple: + yield x_namedtuple(x) + else: + yield x + + return xy_generator, x_generator + + elif input_type == "sequence": + + class XYSequence(data_utils.Sequence): + def __init__(self, use_namedtuple): + self._use_namedtuple = use_namedtuple + super().__init__() + + def __getitem__(self, idx): + x, y = np.ones((4, 1)), np.ones((4, 1)) + if self._use_namedtuple: + return xy_namedtuple(x, y) + return x, y + + def __len__(self): + return 4 + + class XSequence(data_utils.Sequence): + def __init__(self, use_namedtuple): + self._use_namedtuple = use_namedtuple + super().__init__() + + def __getitem__(self, idx): + x = np.ones((4, 1)) + if self._use_namedtuple: + return x_namedtuple(x) + return x + + def __len__(self): + return 4 + + return XYSequence, XSequence + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @test_combinations.run_with_all_model_types + @parameterized.named_parameters( + ("dataset", "dataset"), + ("generator", "generator"), + ("sequence", "sequence"), ) - model.compile( - loss='mse', - 
steps_per_execution=spe, - run_eagerly=test_utils.should_run_eagerly(), + def test_sequence_input_types(self, input_type): + """Ensure that namedtuples and tuples are plumbed identically.""" + if not tf.executing_eagerly(): + self.skipTest("Improved checking is only present in data_adapter.") + + xy_function, x_function = self._make_sequence_input_functions( + input_type + ) + fit_kwargs, evaluate_kwargs, predict_kwargs = {}, {}, {} + if input_type == "generator": + fit_kwargs["steps_per_epoch"] = 4 + evaluate_kwargs["steps"] = 4 + predict_kwargs["steps"] = 4 + + model = test_utils.get_small_mlp(1, 1, 1) + model.compile( + loss="mse", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + + model.fit(xy_function(use_namedtuple=False), **fit_kwargs) + model.evaluate(xy_function(use_namedtuple=False), **evaluate_kwargs) + model.predict(x_function(use_namedtuple=False), **predict_kwargs) + + @test_combinations.run_all_keras_modes + def test_custom_mapping_in_config(self): + class MyModel(training_module.Model): + def call(self, inputs): + return inputs + + def get_config(self): + self.a = {} + return {"a": self.a} + + model = MyModel() + self.assertIn('{"a": {}}', model.to_json()) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_get_config_default(self): + class MyModel(training_module.Model): + def __init__(self, units): + super().__init__() + self.units = units + + def call(self, inputs): + return inputs + + # Test default config with named args + model = MyModel(units=10) + config = model.get_config() + self.assertLen(config, 1) + self.assertEqual(config["units"], 10) + model = model.from_config(config) + self.assertDictEqual(model.get_config(), config) + + # Test default config with positional args + model = MyModel(10) + config = model.get_config() + self.assertLen(config, 1) + self.assertEqual(config["units"], 10) + model = model.from_config(config) + self.assertDictEqual(model.get_config(), config) + + # Test non-serializable + model = MyModel(units=np.int32(10)) + config = model.get_config() + self.assertNotIn("units", config) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_get_config_kwargs(self): + class MyModel(training_module.Model): + def __init__(self, units, **kwargs): + super().__init__() + self.units = units + + def call(self, inputs): + return inputs + + model = MyModel(10, extra=1) + config = model.get_config() + # config = {'name': 'my_model', 'trainable': True, 'dtype': 'float32', + # 'extra': 1, 'units': 10} + self.assertLen(config, 5) + self.assertEqual(config["units"], 10) + self.assertEqual(config["extra"], 1) + model = model.from_config(config) + self.assertDictEqual(model.get_config(), config) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_get_config_override(self): + class MyModel(training_module.Model): + def __init__(self, units): + super().__init__() + self.units = units + + def call(self, inputs): + return inputs + + def get_config(self): + config = {"units": int(self.units)} + config.update(super().get_config()) + return config + + model = MyModel(units=np.int32(10)) + config = model.get_config() + self.assertLen(config, 1) + self.assertEqual(config["units"], 10) + model = model.from_config(config) + self.assertDictEqual(model.get_config(), config) + + def test_training_on_sparse_data_with_dense_placeholders_v1(self): + with tf.Graph().as_default(): + if scipy_sparse is None: + return + + test_inputs = [ + scipy_sparse.random(6, 3, density=0.25).tocsr() + for _ in
range(2) + ] + test_outputs = [ + scipy_sparse.random(6, i, density=0.25).tocsr() + for i in range(3, 5) + ] + in1 = layers_module.Input(shape=(3,)) + in2 = layers_module.Input(shape=(3,)) + out1 = layers_module.Dropout(0.5, name="dropout")(in1) + out2 = layers_module.Dense(4, name="dense_1")(in2) + model = training_module.Model([in1, in2], [out1, out2]) + model.predict(test_inputs, batch_size=2) + optimizer = "rmsprop" + model.compile( + optimizer, + "mse", + metrics=["mae", metrics_module.CategoricalAccuracy()], + ) + model.fit( + test_inputs, + test_outputs, + epochs=1, + batch_size=2, + validation_split=0.5, + ) + model.evaluate(test_inputs, test_outputs, batch_size=2) + + @test_combinations.run_all_keras_modes + def test_compile_with_sparse_placeholders(self): + inputs = layers_module.Input(shape=(10,), sparse=True) + weights = tf.Variable( + np.ones((10, 1)).astype(np.float32), name="weights" + ) + weights_mult = lambda x: tf.sparse.sparse_dense_matmul(x, weights) + output_layer = layers_module.Lambda(weights_mult)(inputs) + model = training_module.Model([inputs], output_layer) + model.compile( + loss="binary_crossentropy", + optimizer="adam", + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + @test_combinations.run_all_keras_modes + def test_that_trainable_disables_updates(self): + val_a = np.random.random((10, 4)) + val_out = np.random.random((10, 4)) + + a = layers_module.Input(shape=(4,)) + layer = layers_module.BatchNormalization(input_shape=(4,)) + b = layer(a) + model = training_module.Model(a, b) + + model.trainable = False + if not tf.compat.v1.executing_eagerly_outside_functions(): + self.assertEmpty(model.updates) + + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + if not tf.compat.v1.executing_eagerly_outside_functions(): + self.assertEmpty(model.updates) + + x1 = model.predict(val_a) + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + self.assertAllClose(x1, x2, atol=1e-7) + + model.trainable = True + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + if not tf.compat.v1.executing_eagerly_outside_functions(): + self.assertAllGreater(len(model.updates), 0) + + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + assert np.abs(np.sum(x1 - x2)) > 1e-5 + + layer.trainable = False + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + if not tf.compat.v1.executing_eagerly_outside_functions(): + self.assertEmpty(model.updates) + + x1 = model.predict(val_a) + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + self.assertAllClose(x1, x2, atol=1e-7) + + def test_weight_deduplication_in_methods(self): + inp = layers_module.Input(shape=(1,)) + bn = layers_module.BatchNormalization() + d = layers_module.Dense(1) + + m0 = training_module.Model(inp, d(bn(inp))) + m1 = training_module.Model(inp, d(bn(inp))) + + x0 = m0(inp) + x1 = m1(inp) + x = layers_module.Add()([x0, x1]) + + model = training_module.Model(inp, x) + self.assertLen(model.trainable_weights, 4) + self.assertLen(model.non_trainable_weights, 2) + self.assertLen(model.weights, 6) + + @test_combinations.run_all_keras_modes + def test_weight_deduplication(self): + class WatchingLayer(layers_module.Layer): + def __init__(self, dense_to_track): + # This will cause the kernel and bias to be double counted, + # effectively doubling the learning rate if weights are not + # deduped. 
+ self._kernel = dense_to_track.kernel + self._bias = dense_to_track.bias + super().__init__() + + inp = layers_module.Input(shape=(1,)) + dense_layer = layers_module.Dense(1) + dense_output = dense_layer(inp) # This will build the dense kernel + + # Deterministically set weights to make the test repeatable. + dense_layer.set_weights([np.ones((1, 1)), np.zeros((1,))]) + output = WatchingLayer(dense_layer)(dense_output) + + model = training_module.Model(inp, output) + + # 0.25 is the edge of the radius of convergence for the double apply + # case. At lr=0.24, the double apply case will very slowly descend + # while the correct case will drop very quickly. + model.compile( + loss="mse", + optimizer=optimizer_legacy.gradient_descent.SGD(0.24), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((64 * 2,)) + y = 4.5 * x - 3.0 + + history = model.fit(x, y, batch_size=64, epochs=2, verbose=2) + + # If the gradient apply is duplicated then the loss after 2 epochs will + # be ~0.15, compared to the correct answer of O(1e-7). + self.assertLess(history.history["loss"][-1], 1e-6) + + @test_combinations.run_all_keras_modes + def test_weight_shared_across_layers(self): + class AddWeightLayer(layers_module.Layer): + def __init__(self, trainable_var, non_trainable_var): + self.trainable_var = trainable_var + self.non_trainable_var = non_trainable_var + super().__init__() + + def call(self, inputs): + return inputs + self.trainable_var + + class LayerWithWeightSharedLayers(layers_module.Layer): + def __init__(self): + super().__init__() + shared_trainable_var = tf.Variable(1.0) + shared_non_trainable_var = tf.Variable(1.0, trainable=False) + self.layer1 = AddWeightLayer( + shared_trainable_var, shared_non_trainable_var + ) + self.layer2 = AddWeightLayer( + shared_trainable_var, shared_non_trainable_var + ) + + def call(self, inputs): + return self.layer2(self.layer1(inputs)) + + l = LayerWithWeightSharedLayers() + layers = list(l._flatten_layers(include_self=False, recursive=False)) + self.assertEqual(layers, [l.layer1, l.layer2]) + self.assertEqual( + l.variables, [l.layer1.trainable_var, l.layer1.non_trainable_var] + ) + self.assertEqual(l.trainable_variables, [l.layer1.trainable_var]) + self.assertEqual( + l.non_trainable_variables, [l.layer1.non_trainable_var] + ) + self.assertLen(l.get_weights(), 2) + + @test_combinations.run_all_keras_modes + def test_weight_tracking_for_template(self): + def variable_scoped_function(trainable=True): + return tf.compat.v1.get_variable( + "dummy", + shape=[1], + trainable=trainable, + initializer=tf.compat.v1.zeros_initializer(), + ) + + def nested_template(): + nested1 = tf.compat.v1.make_template( + "nested", variable_scoped_function + ) + nested2 = tf.compat.v1.make_template( + "nested", variable_scoped_function + ) + v1 = nested1() + v2 = nested2() + + # nested1 and nested2 should not share variables + self.assertIsNot(v1, v2) + + # Variables created by nested1 should be isolated from variables + # created by nested2. 
+ self.assertEqual(1, len(nested1.variables)) + self.assertEqual(1, len(nested2.variables)) + self.assertIs(nested1.variables[0], v1) + self.assertIs(nested2.variables[0], v2) + self.assertEqual(1, len(nested1.trainable_variables)) + self.assertEqual(1, len(nested2.trainable_variables)) + self.assertIs(nested1.trainable_variables[0], v1) + self.assertIs(nested2.trainable_variables[0], v2) + self.assertEqual(len(nested1.non_trainable_variables), 0) + self.assertEqual(len(nested2.non_trainable_variables), 0) + return v1, v2 + + tmpl1 = tf.compat.v1.make_template("s1", nested_template) + tmpl2 = tf.compat.v1.make_template("s1", nested_template) + + v1, v2 = tmpl1() + v5, v6 = tmpl2() + + model = training_module.Model() + model.template = tmpl1 + self.assertEqual(2, len(model.variables)) + self.assertIs(model.variables[0], v1) + self.assertIs(model.variables[1], v2) + self.assertEqual(2, len(model.variables)) + self.assertIs(model.trainable_variables[0], v1) + self.assertIs(model.trainable_variables[1], v2) + self.assertEqual(len(model.non_trainable_variables), 0) + model.templates = [tmpl2] + for v, w in zip(model.variables, [v1, v2, v5, v6]): + self.assertIs(v, w) + for v, w in zip(model.trainable_variables, [v1, v2, v5, v6]): + self.assertIs(v, w) + self.assertEqual(len(model.non_trainable_variables), 0) + # Make sure losses, layers, and updates aren't broken by having a + # Template in the mix, which does not expose any updates or losses. + self.assertEqual([], model.layers) + self.assertEqual([], model.updates) + self.assertEqual([], model.losses) + self.assertEqual([], model.templates.layers) + self.assertEqual([], model.templates.updates) + self.assertEqual([], model.templates.losses) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_logs_passed_to_callbacks(self): + input_dim = 5 + num_classes = 1 + + class TestCallback(Callback): + def __init__(self): + super().__init__() + self.epoch_end_logs = None + self.batch_end_logs = None + self.epoch_end_call_count = 0 + self.batch_end_call_count = 0 + + def on_epoch_end(self, epoch, logs=None): + self.epoch_end_logs = logs + self.epoch_end_call_count += 1 + + def on_batch_end(self, batch, logs=None): + self.batch_end_logs = logs + self.batch_end_call_count += 1 + + model = test_utils.get_small_sequential_mlp( + num_hidden=10, num_classes=num_classes, input_dim=input_dim + ) + model.compile( + loss="binary_crossentropy", + metrics=["acc"], + weighted_metrics=["mae"], + optimizer=RMSPropOptimizer(learning_rate=0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + + np.random.seed(1337) + (x_train, y_train), (_, _) = test_utils.get_test_data( + train_samples=10, + test_samples=10, + input_shape=(input_dim,), + num_classes=num_classes, + ) + + test_callback = TestCallback() + model.fit( + x_train, + y_train, + batch_size=2, + epochs=2, + verbose=0, + callbacks=[test_callback], + validation_data=(x_train, y_train), + ) + self.assertEqual(test_callback.batch_end_call_count, 10) + self.assertEqual(test_callback.epoch_end_call_count, 2) + + self.assertSetEqual( + set(test_callback.batch_end_logs.keys()), + set(["acc", "loss", "mae"]), + ) + self.assertSetEqual( + set(test_callback.epoch_end_logs.keys()), + set(["acc", "loss", "mae", "val_acc", "val_loss", "val_mae"]), + ) + + @test_combinations.run_all_keras_modes + def test_mismatched_output_shape_and_target_shape(self): + model = sequential.Sequential( + [ + layers_module.Dense(2, input_shape=(3, 4)), + layers_module.Dense(5), + ] + ) + model.compile( + 
RMSPropOptimizer(learning_rate=0.001), + loss="sparse_categorical_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + # Test with Numpy data + x_train = np.random.random((10, 3, 4)).astype(np.float32) + y_train = np.random.randint(0, 5, size=(10, 3)).astype(np.float32) + model.fit(x_train, y_train, batch_size=5, epochs=1) + + # Test with iterator + dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + dataset = dataset.repeat(10) + dataset = dataset.batch(10) + model.fit(dataset, epochs=1, steps_per_epoch=2) + + if tf.executing_eagerly(): + # Test with eager execution + model.compile( + RMSPropOptimizer(learning_rate=0.001), + loss="sparse_categorical_crossentropy", + run_eagerly=True, + ) + model.fit(x_train, y_train, batch_size=5, epochs=1) + + # Test with eager execution and iterator + model.fit(dataset, epochs=1, steps_per_epoch=2) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_losses_in_defun(self): + layer = layers_module.Dense(1, kernel_regularizer="l1") + layer(tf.ones([1, 10])) + + @tf.function + def get_losses(): + return layer.losses + + self.assertAllEqual( + self.evaluate(layer.losses), self.evaluate(get_losses()) + ) + + @test_combinations.run_all_keras_modes + def test_logging(self): + mock_stdout = io.StringIO() + model = sequential.Sequential() + model.add(layers_module.Dense(10, activation="relu")) + model.add(layers_module.Dense(1, activation="sigmoid")) + model.compile( + RMSPropOptimizer(learning_rate=0.001), + loss="binary_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + io_utils.enable_interactive_logging() + with tf.compat.v1.test.mock.patch.object(sys, "stdout", mock_stdout): + model.fit( + np.ones((10, 10), "float32"), + np.ones((10, 1), "float32"), + epochs=10, + ) + self.assertTrue("Epoch 5/10" in mock_stdout.getvalue()) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) ) - xdata = np.random.uniform(size=(8, 2)).astype(np.float32) - dataset = tf.data.Dataset.from_tensor_slices((xdata, xdata)) - dataset = dataset.batch(8, drop_remainder=static_batch) - ret = model.predict(dataset, steps=1) - tf.nest.assert_same_structure(ret, {'out': ''}) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_on_batch_error_inconsistent_batch_size(self): - input_node1 = layers_module.Input(shape=(5,)) - input_node2 = layers_module.Input(shape=(5,)) - output_node = layers_module.Concatenate()([input_node1, input_node2]) - output_node = layers_module.Dense(4)(output_node) - model = training_module.Model([input_node1, input_node2], output_node) - model.compile(loss='mse') + def test_training_with_loss_instance(self): + a = layers_module.Input(shape=(3,), name="input_a") + b = layers_module.Input(shape=(3,), name="input_b") + + dense = layers_module.Dense(4, name="dense") + c = dense(a) + d = dense(b) + e = layers_module.Dropout(0.5, name="dropout")(c) + + model = training_module.Model([a, b], [d, e]) + loss_weights = [1.0, 0.5] + model.compile( + RMSPropOptimizer(learning_rate=0.001), + loss=losses.MeanSquaredError(), + metrics=[metrics_module.CategoricalAccuracy(), "mae"], + loss_weights=loss_weights, + ) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=1, + batch_size=5, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", 
"eager"]) + ) + def test_static_batch_in_input_layer(self): + if tf.executing_eagerly(): + self.skipTest("Not inferred in eager.") + + class Counter(Callback): + def __init__(self): + self.batches = 0 + + def on_batch_end(self, batch, logs=None): + self.batches += 1 + + x, y = np.ones((64, 10), "float32"), np.ones((64, 1), "float32") + + for batch_size, expected_batches in [(None, 2), (4, 16)]: + inputs = input_layer.Input(batch_size=batch_size, shape=(10,)) + outputs = layers_module.Dense(1, activation="sigmoid")(inputs) + model = training_module.Model(inputs, outputs) + + model.compile( + optimizer_legacy.adam.Adam(0.001), "binary_crossentropy" + ) + counter = Counter() + model.fit(x, y, callbacks=[counter]) + self.assertEqual(counter.batches, expected_batches) + + model = sequential.Sequential( + [layers_module.Dense(1, batch_input_shape=(batch_size, 10))] + ) + model.compile( + optimizer_legacy.adam.Adam(0.001), "binary_crossentropy" + ) + counter = Counter() + model.fit(x, y, callbacks=[counter]) + self.assertEqual(counter.batches, expected_batches) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_static_batch_in_input_layer_consistency_checks(self): + if tf.executing_eagerly(): + self.skipTest("Not inferred in eager.") + x, y = np.ones((64, 10), "float32"), np.ones((64, 1), "float32") + + inputs = input_layer.Input(batch_size=2, shape=(10,)) + outputs = layers_module.Dense(1, activation="sigmoid")(inputs) + model = training_module.Model(inputs, outputs) + model.compile(optimizer_legacy.adam.Adam(0.001), "binary_crossentropy") + with self.assertRaisesRegex( + ValueError, "incompatible with the specified batch size" + ): + model.fit(x, y, batch_size=4) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_compatible_batch_size_functional_model(self): + class MyLayer(layers_module.Layer): + def call(self, inputs): + return tf.concat(inputs, axis=0) + + input1 = input_layer.Input(batch_size=2, shape=(10,)) + input2 = input_layer.Input(batch_size=3, shape=(10,)) + outputs = MyLayer()([input1, input2]) + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_warn: + training_module.Model([input1, input2], outputs) + self.assertEqual( + mock_warn.call_args_list[0][0][0], + "Found incompatible static batch sizes among the inputs. 
" + "Batch sizes: [2, 3]", + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_calling_subclass_model_on_different_datasets(self): + class SubclassedModel(training_module.Model): + def call(self, inputs): + return inputs * 2 + + model = SubclassedModel() + dataset_one = tf.data.Dataset.from_tensor_slices([[0], [1]]).batch(2) + dataset_two = tf.data.Dataset.from_tensor_slices( + [[3], [4], [5], [6], [7], [8]] + ).batch(2) + self.assertAllEqual([[0], [2]], model.predict(dataset_one, steps=1)) + self.assertAllEqual( + [[6], [8], [10], [12]], model.predict(dataset_two, steps=2) + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_training_on_sparse_categorical_crossentropy_loss_with_softmax( + self, + ): + np.random.seed(1337) + train_x = np.ones((100, 4)) + train_y = np.random.randint(0, 1, size=(100, 1)) + + reference_model = test_utils.get_small_sequential_mlp( + 16, 2, input_dim=4 + ) + reference_model.compile( + loss="sparse_categorical_crossentropy", + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=True, + ) + fixed_weights = reference_model.get_weights() + reference_model_loss = reference_model.train_on_batch(train_x, train_y) + + test_model = test_utils.get_small_sequential_mlp(16, 2, input_dim=4) + test_model.compile( + loss="sparse_categorical_crossentropy", + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=False, + ) + test_model.set_weights(fixed_weights) + test_model_loss = test_model.train_on_batch(train_x, train_y) + self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_training_on_categorical_crossentropy_loss_with_softmax(self): + np.random.seed(1337) + train_x = np.ones((100, 4)) + train_y = np_utils.to_categorical( + np.random.randint(0, 1, size=(100, 1)), 2 + ) + + reference_model = test_utils.get_small_sequential_mlp( + 16, 2, input_dim=4 + ) + reference_model.compile( + loss="categorical_crossentropy", + optimizer=rmsprop.RMSprop(learning_rate=0.001), + run_eagerly=True, + ) + fixed_weights = reference_model.get_weights() + reference_model_loss = reference_model.train_on_batch(train_x, train_y) + + test_model = test_utils.get_small_sequential_mlp(16, 2, input_dim=4) + test_model.compile( + loss="categorical_crossentropy", + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=False, + ) + test_model.set_weights(fixed_weights) + test_model_loss = test_model.train_on_batch(train_x, train_y) + self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_training_on_binary_crossentropy_loss(self): + train_x = np.ones((100, 4), dtype=np.float32) + train_y = np.ones((100, 1), dtype=np.float32) + reference_model = test_utils.get_small_sequential_mlp( + 16, 1, input_dim=4 + ) + reference_model.compile( + loss="binary_crossentropy", + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=True, + ) + fixed_weights = reference_model.get_weights() + reference_model_loss = reference_model.train_on_batch(train_x, train_y) + + test_model = test_utils.get_small_sequential_mlp(16, 1, input_dim=4) + test_model.compile( + loss="binary_crossentropy", + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=False, + ) + test_model.set_weights(fixed_weights) + test_model_loss = test_model.train_on_batch(train_x, train_y) + 
self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + ("default", 1, 4), + ("integer_two", 2, 2), + ("integer_four", 4, 1), + ("simple_list", [1, 3, 4], 3), + ("duplicated_list", [4, 2, 2], 2), + ) + def test_validation_freq(self, validation_freq, expected_runs): + x, y = np.ones((10, 10)), np.ones((10, 1)) + model = test_utils.get_small_mlp(2, 1, 10) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + + class ValCounter(Callback): + def __init__(self): + self.val_runs = 0 + + def on_test_begin(self, logs=None): + self.val_runs += 1 + + val_counter = ValCounter() + model.fit( + x, + y, + epochs=4, + validation_data=(x, y), + validation_freq=validation_freq, + callbacks=[val_counter], + ) + self.assertEqual(val_counter.val_runs, expected_runs) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_validation_steps_without_data(self): + if tf.executing_eagerly(): + self.skipTest("Check removed in new `fit`") + x, y = np.ones((10, 10)), np.ones((10, 1)) + model = test_utils.get_small_mlp(2, 1, 10) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + + with self.assertRaisesRegex( + ValueError, + "`validation_steps` should not be specified if " + "`validation_data` is None.", + ): + model.fit(x, y, epochs=4, validation_data=None, validation_steps=3) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_layer_with_variable_output(self): + class VariableOutputLayer(layers_module.Layer): + def build(self, input_shape): + self.v = self.add_weight( + "output_var", shape=(2, 5), initializer="ones" + ) + + def call(self, inputs): + return self.v + + model = test_utils.get_model_from_layers( + [VariableOutputLayer(), layers_module.Dense(1)], input_shape=(10,) + ) + # TODO(omalleyt): Make this work with `run_eagerly=True`. 
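+ # Per the TODO above, this test currently passes only in graph + # mode, so the model is compiled with run_eagerly=False.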
+ model.compile("sgd", "mse", run_eagerly=False) + model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, epochs=5) + + self.assertLen(model.trainable_variables, 3) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @test_utils.enable_v2_dtype_behavior + def test_model_dtype(self): + class AssertTypeLayer(layers_module.Layer): + def call(self, inputs): + assert inputs.dtype.name == self.dtype, ( + "Input tensor has type %s which does not match assert " + "type %s" % (inputs.dtype.name, self.assert_type) + ) + return inputs + 1.0 + + for dtype in ("float16", "float32", "float64"): + model = test_utils.get_model_from_layers( + [AssertTypeLayer(dtype=dtype)], input_shape=(10,) + ) + model.compile( + "sgd", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + x = np.ones((10, 10)) + y = np.ones((10, 10)) + model.fit(x, y) + model.test_on_batch(x, y) + model(x) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @test_utils.enable_v2_dtype_behavior + def test_model_input_dtype(self): + model = test_utils.get_small_mlp(1, 10, 10) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + x = np.ones((10, 10)).astype(np.float64) + y = np.ones((10, 10)).astype(np.float64) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) + model.fit(dataset) + self.assertEqual(model._compute_dtype, "float32") + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_subclassed_model_with_training_arg(self): + class LayerWithTrainingArg(layers_module.Layer): + def call(self, inputs, training=None): + self.training = training + return inputs + + class ModelWithTrainingArg(training_module.Model): + def __init__(self): + super().__init__() + self.l1 = LayerWithTrainingArg() + + def call(self, inputs, training=None): + self.training = training + inputs = self.l1(inputs, training=training) + return inputs + + x = np.zeros((1, 2)) + model = ModelWithTrainingArg() + model.compile( + loss="mse", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, x, epochs=1) + + if tf.executing_eagerly(): + expected_training_arg = True + else: + expected_training_arg = backend.symbolic_learning_phase() + + self.assertIs(model.training, expected_training_arg) + self.assertIs(model.l1.training, expected_training_arg) + + @test_combinations.run_all_keras_modes + def test_error_when_model_is_not_compiled(self): + inputs = input_layer.Input(shape=(1,)) + outputs = layers_module.Dense(1)(inputs) + model = training_module.Model(inputs, outputs) + with self.assertRaisesRegex(RuntimeError, "must compile your model"): + model.fit(np.ones((1, 1)), np.ones((1, 1))) + + class MyModel(training_module.Model): + def call(self, x): + self.add_loss(tf.reduce_sum(x)) + return x + + model = MyModel() + with self.assertRaisesRegex(RuntimeError, "must compile your model"): + model.fit(np.random.random((32, 1)), epochs=2) + + @test_combinations.run_all_keras_modes + @test_utils.enable_v2_dtype_behavior + def test_losses_of_different_dtypes(self): + inp = input_layer.Input(shape=(2,)) + out_1 = layers_module.Dense( + 2, dtype="float32", kernel_regularizer="l2" + )(inp) + out_2 = layers_module.Dense( + 2, dtype="float16", kernel_regularizer="l2" + )(inp) + model = training_module.Model(inp, [out_1, out_2]) + extra_loss = tf.reduce_sum(tf.cast(out_2, "float64")) + model.add_loss(extra_loss) + model.compile( + "sgd", ["mse", "mse"], run_eagerly=test_utils.should_run_eagerly() + ) + x, y = 
np.ones((10, 2)), np.ones((10, 2)) + model.fit(x, [y, y]) + + @test_combinations.run_all_keras_modes + @test_utils.enable_v2_dtype_behavior + def test_losses_of_different_dtypes_with_subclassed_model(self): + class MyModel(training_module.Model): + def build(self, _): + self.dense = layers_module.Dense(2) + + def call(self, inputs): + self.add_loss(tf.cast(tf.nn.l2_loss(inputs), "float64")) + return self.dense(inputs) + + model = MyModel(dtype="float32") + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + x, y = np.ones((10, 2)), np.ones((10, 2)) + model.fit(x, y) + + @test_combinations.run_all_keras_modes + @test_utils.enable_v2_dtype_behavior + def test_regularizer_of_different_dtype(self): + inp = input_layer.Input(shape=(2,)) + + def regularizer(weight): + return tf.cast(tf.nn.l2_loss(weight), "float64") + + out = layers_module.Dense( + 2, dtype="float32", kernel_regularizer=regularizer + )(inp) + model = training_module.Model(inp, out) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + x, y = np.ones((10, 2)), np.ones((10, 2)) + model.fit(x, y) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_outputs_are_floats(self): + x, y = np.ones((10, 1)), np.ones((10, 1)) + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile( + "sgd", + "mse", + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit(x, y, epochs=2) + self.assertIsInstance(history.history["loss"][0], float) + self.assertIsInstance(history.history["accuracy"][0], float) + + loss, accuracy = model.train_on_batch(x, y) + self.assertIsInstance(loss, float) + self.assertIsInstance(accuracy, float) + + loss, accuracy = model.evaluate(x, y) + self.assertIsInstance(loss, float) + self.assertIsInstance(accuracy, float) + + loss, accuracy = model.test_on_batch(x, y) + self.assertIsInstance(loss, float) + self.assertIsInstance(accuracy, float) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_int_output(self): + x, y = np.ones((10, 1)), np.ones((10, 1)) + model = sequential.Sequential([layers_module.Dense(1)]) + + class MyMetric(metrics_module.Metric): + def update_state(self, y_true, y_pred, sample_weight=None): + del y_true, y_pred, sample_weight + + def result(self): + return tf.constant(1, dtype="int64") + + model.compile( + "sgd", + "mse", + metrics=[MyMetric()], + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit(x, y, epochs=2) + self.assertIsInstance(history.history["my_metric"][0], int) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @test_utils.enable_v2_dtype_behavior + def test_mixed_precision(self): + x, y = np.ones((10, 1)), np.ones((10, 1)) + policy.set_global_policy("mixed_float16") + model = sequential.Sequential([layers_module.Dense(1)]) + optimizer = sgd_experimental.SGD() + model.compile( + optimizer, + "mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=2) + policy.set_global_policy("float32") + + @test_combinations.run_all_keras_modes + def test_calling_aggregate_gradient(self): + class _Optimizer(optimizer_legacy.gradient_descent.SGD): + """Mock optimizer to check if _aggregate_gradient is called.""" + + _HAS_AGGREGATE_GRAD = True + + def __init__(self): + self.aggregate_gradients_called = False + super().__init__(name="MyOptimizer") + + def _aggregate_gradients(self, grads): + self.aggregate_gradients_called = True + return super()._aggregate_gradients(grads) + + mock_optimizer = 
_Optimizer() + + model = sequential.Sequential() + model.add(layers_module.Dense(10, activation="relu")) + + model.compile( + mock_optimizer, "mse", run_eagerly=test_utils.should_run_eagerly() + ) + x, y = np.ones((10, 10)), np.ones((10, 10)) + model.fit(x, y) + self.assertEqual(model.optimizer.aggregate_gradients_called, True) + + class _OptimizerOverrideApplyGradients(_Optimizer): + """Override apply_gradients. + + To test the case where the optimizer does not define the + experimental_aggregate_gradients parameter. + """ + + _HAS_AGGREGATE_GRAD = False + + def apply_gradients(self, grads_and_vars, name=None): + return super().apply_gradients(grads_and_vars, name) + + mock_optimizer = _OptimizerOverrideApplyGradients() + model.compile( + mock_optimizer, "mse", run_eagerly=test_utils.should_run_eagerly() + ) + x, y = np.ones((10, 10)), np.ones((10, 10)) + model.fit(x, y) + self.assertEqual(model.optimizer.aggregate_gradients_called, True) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_gradients_are_none(self): + class DenseWithExtraWeight(layers_module.Dense): + def build(self, input_shape): + # Gradients w.r.t. extra_weights are None + self.extra_weight_1 = self.add_weight( + "extra_weight_1", shape=(), initializer="ones" + ) + super().build(input_shape) + self.extra_weight_2 = self.add_weight( + "extra_weight_2", shape=(), initializer="ones" + ) + + model = sequential.Sequential( + [DenseWithExtraWeight(4, input_shape=(4,))] + ) + # Test clipping can handle None gradients + opt = optimizer_legacy.adam.Adam(clipnorm=1.0, clipvalue=1.0) + model.compile(opt, "mse", run_eagerly=test_utils.should_run_eagerly()) + inputs = np.random.normal(size=(64, 4)) + targets = np.random.normal(size=(64, 4)) + old_kernel = model.get_weights()[1] + model.fit(inputs, targets) + new_kernel = model.get_weights()[1] + self.assertNotAllEqual(old_kernel, new_kernel) + + @test_combinations.run_all_keras_modes + def test_layer_ordering(self): + class MyLayer(layers_module.Layer): + pass + + class MyModel(training_module.Model): + def __init__(self, name): + super().__init__(name=name) + + self.weight = tf.Variable(0, name=name) + + self.direct_sublayer = MyLayer(name="direct") + self.direct_sublayer.d = {"d": MyLayer(name="direct/dict")} + + self.dict_sublayer = {"d": MyLayer(name="dict")} + self.dict_sublayer["d"].direct = MyLayer(name="dict/direct") + + model = MyModel("model") + # All sublayers, including self and recursive sublayers. + self.assertEqual( + ["model", "direct", "direct/dict", "dict", "dict/direct"], + [l.name for l in model._flatten_layers()], + ) + # Only direct sublayers, including those in data structures. 
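+ # The nested "direct/dict" and "dict/direct" layers are reachable + # only through _flatten_layers(), not through model.layers.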
+ self.assertEqual(["direct", "dict"], [l.name for l in model.layers]) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_trainable_state_setting(self): + class UpdateLayer(layers_module.Layer): + def __init__(self): + super().__init__() + self.v = tf.Variable(0.0, trainable=False) + + def call(self, x): + self.add_update(lambda: self.v.assign_add(1.0)) + return x * self.v + + layer = UpdateLayer() + model_with_updates = sequential.Sequential([layer]) + model_with_updates.compile( + "sgd", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + layer.trainable = False + model_without_updates = sequential.Sequential([layer]) + model_without_updates.compile( + "sgd", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + x, y = np.ones((10, 1)), np.ones((10, 1)) + + self.assertEqual(self.evaluate(layer.v), 0.0) + model_with_updates.fit(x, y, batch_size=10) + # assign_add called. + self.assertEqual(self.evaluate(layer.v), 1.0) + model_without_updates.fit(x, y, batch_size=10) + # assign_add not called. + self.assertEqual(self.evaluate(layer.v), 1.0) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @parameterized.named_parameters( + ("numpy_array", "numpy_array"), + ("dataset_array", "dataset_array"), + ("dataset_dict", "dataset_dict"), + ) + def test_single_input_no_tuple_wrapping(self, input_type): + x = np.ones((10, 1)) + + if input_type == "numpy_array": + batch_size = 3 + expected_data_type = tf.Tensor + elif input_type == "dataset_array": + x = tf.data.Dataset.from_tensor_slices(x).batch(3) + batch_size = None + expected_data_type = tf.Tensor + else: + x = {"my_input": x} + x = tf.data.Dataset.from_tensor_slices(x).batch(3) + batch_size = None + expected_data_type = dict + + test_case = self + + class MyModel(training_module.Model): + def train_step(self, data): + # No tuple wrapping for single x input and no targets. 
+ test_case.assertIsInstance(data, expected_data_type) + return super().train_step(data) + + def test_step(self, data): + test_case.assertIsInstance(data, expected_data_type) + return super().test_step(data) + + def predict_step(self, data): + test_case.assertIsInstance(data, expected_data_type) + return super().predict_step(data) + + inputs = layers_module.Input(shape=(1,), name="my_input") + outputs = layers_module.Dense(1)(inputs) + model = MyModel(inputs, outputs) + model.add_loss(tf.reduce_sum(outputs)) + model.compile("sgd") + model.fit(x, batch_size=batch_size) + model.evaluate(x, batch_size=batch_size) + model.predict(x, batch_size=batch_size) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @parameterized.named_parameters( + ("custom_metrics", False, True), + ("compiled_metrics", True, False), + ("both_compiled_and_custom_metrics", True, True), + ) + def test_evaluate_with_custom_test_step( + self, use_compiled_metrics, use_custom_metrics + ): + class MyModel(training_module.Model): + def test_step(self, data): + x, y = data + pred = self(x) + metrics = {} + if use_compiled_metrics: + self.compiled_metrics.update_state(y, pred) + self.compiled_loss(y, pred) + for metric in self.metrics: + metrics[metric.name] = metric.result() + if use_custom_metrics: + custom_metrics = { + "mean": tf.reduce_mean(pred), + "sum": tf.reduce_sum(pred), + } + metrics.update(custom_metrics) + return metrics + + inputs = layers_module.Input((2,)) + outputs = layers_module.Dense(3)(inputs) + model = MyModel(inputs, outputs) + if use_compiled_metrics: + model.compile( + "adam", + "mse", + metrics=["mae", "mape"], + run_eagerly=test_utils.should_run_eagerly(), + ) + else: + model.compile( + "adam", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + x = np.random.random((4, 2)) + y = np.random.random((4, 3)) + results_list = model.evaluate(x, y) + results_dict = model.evaluate(x, y, return_dict=True) + self.assertLen(results_list, len(results_dict)) + if use_compiled_metrics and use_custom_metrics: + self.assertLen(results_list, 5) + self.assertEqual( + results_list, + [ + results_dict["loss"], + results_dict["mae"], + results_dict["mape"], + results_dict["mean"], + results_dict["sum"], + ], + ) + if use_compiled_metrics and not use_custom_metrics: + self.assertLen(results_list, 3) + self.assertEqual( + results_list, + [ + results_dict["loss"], + results_dict["mae"], + results_dict["mape"], + ], + ) + if not use_compiled_metrics and use_custom_metrics: + self.assertLen(results_list, 2) + self.assertEqual( + results_list, [results_dict["mean"], results_dict["sum"]] + ) + + @test_combinations.run_all_keras_modes + @test_combinations.run_with_all_model_types + def test_model_make_function(self): + layers = [ + layers_module.Dense(10, dtype=np.float64), + layers_module.Dense(10, dtype=np.float64), + ] + model = test_utils.get_model_from_layers(layers, input_shape=(1,)) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + + original_train_function = model.make_train_function() + self.assertIsNotNone(original_train_function) + self.assertEqual(model.make_train_function(), original_train_function) + # Check that we regenerate it without reusing the cached version. 
+ self.assertNotEqual( + model.make_train_function(force=True), original_train_function + ) + + original_test_function = model.make_test_function() + self.assertIsNotNone(original_test_function) + self.assertEqual(model.make_test_function(), original_test_function) + # Check that we regenerate it without reusing the cached version. + self.assertNotEqual( + model.make_test_function(force=True), original_test_function + ) + + original_predict_function = model.make_predict_function() + self.assertIsNotNone(original_predict_function) + self.assertEqual( + model.make_predict_function(), original_predict_function + ) + # Check that we regenerate it without reusing the cached version. + self.assertNotEqual( + model.make_predict_function(force=True), original_predict_function + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_custom_compute_metrics(self): + class CustomMetric(metrics_module.Mean): + def sq_diff_plus_x(self, x, y_true, y_pred): + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + sq_diff_plus_x = tf.add( + x, tf.math.squared_difference(y_pred, y_true) + ) + return backend.mean(sq_diff_plus_x, axis=-1) + + def update_state(self, x, y_true, y_pred, sample_weight=None): + matches = self.sq_diff_plus_x(x, y_true, y_pred) + return super().update_state(matches) + + class MyModel(sequential.Sequential): + def compute_metrics(self, x, y, y_pred, sample_weight): + metric_results = super().compute_metrics( + x, y, y_pred, sample_weight + ) + self.custom_metric.update_state(x, y, y_pred, sample_weight) + metric_results[ + "custom_metric_name" + ] = self.custom_metric.result() + return metric_results + + tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) + dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) + model = MyModel([layers_module.Dense(10)]) + model.custom_metric = CustomMetric("my_metric") + initial_result = model.custom_metric.result() + optimizer = optimizer_legacy.gradient_descent.SGD() + model.compile(optimizer, loss="mse", steps_per_execution=10) + model.fit(dataset, epochs=2, steps_per_epoch=10, verbose=2) + after_fit_result = model.custom_metric.result() + + self.assertEqual(self.evaluate(initial_result), 0.0) + self.assertNotEqual( + self.evaluate(initial_result), self.evaluate(after_fit_result) + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_custom_compute_loss(self): + class MyModel(training_module.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.loss_metric = metrics_module.Mean(name="loss") + + def compute_loss(self, x, y, y_pred, sample_weight): + loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y)) + loss += tf.add_n(self.losses) + self.loss_metric.update_state(loss) + return loss + + def reset_metrics(self): + self.loss_metric.reset_states() + + @property + def metrics(self): + return [self.loss_metric] + + tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) + dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) + + inputs = layers_module.Input(shape=(10,), name="my_input") + outputs = layers_module.Dense(10)(inputs) + model = MyModel(inputs, outputs) + model.add_loss(tf.reduce_sum(outputs)) + + optimizer = optimizer_legacy.gradient_descent.SGD() + model.compile(optimizer, loss="mse", steps_per_execution=10) + history = model.fit(dataset, epochs=2, steps_per_epoch=10) + self.assertLen(history.history["loss"], 2) + self.assertAllClose( + history.history["loss"][1], 
model.loss_metric.result() + ) - with self.assertRaisesRegex(ValueError, 'Data cardinality is ambiguous'): - model.train_on_batch([np.ones((10, 5)), np.ones((10, 5))], - np.ones((11, 4))) + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @parameterized.named_parameters( + ("mixed_float16", "mixed_float16"), ("float32", "float32") + ) + def test_ema_overwrite(self, test_policy): + if not tf.__internal__.tf2.enabled(): + self.skipTest("EMA optimizer is only available in TF2.") + policy.set_global_policy(test_policy) + model = sequential.Sequential() + model.add(input_layer.Input(shape=(4,))) + model.add(layers_module.Dense(1, activation="relu")) + + tensors = tf.random.uniform((4, 4)), tf.random.uniform((4,)) + dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) + + optimizer = sgd_experimental.SGD(use_ema=True, ema_momentum=1) + model.compile(optimizer, loss="mse", steps_per_execution=10) + initial_value = tf.Variable(model.trainable_variables[0]) + history = model.fit(dataset, epochs=2, steps_per_epoch=10) + self.assertLen(history.history["loss"], 2) + self.assertAllClose(initial_value, model.trainable_variables[0]) + policy.set_global_policy("float32") + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_get_verbosity(self): + class MyStrategy(tf.distribute.Strategy): + def __init__(self): + self._should_use_with_coordinator = True + + with self.assertRaisesRegex(ValueError, "`verbose=1` is not allowed"): + training_module._get_verbosity(1, MyStrategy()) + + io_utils.enable_interactive_logging() + self.assertEqual( + training_module._get_verbosity("auto", MyStrategy()), 2 + ) + self.assertEqual( + training_module._get_verbosity( + "auto", tf.distribute.MirroredStrategy() + ), + 1, + ) + self.assertEqual( + training_module._get_verbosity(2, tf.distribute.MirroredStrategy()), + 2, + ) + + io_utils.disable_interactive_logging() + self.assertEqual( + training_module._get_verbosity( + "auto", tf.distribute.MirroredStrategy() + ), + 2, + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_save_spec(self): + class Model(training_module.Model): + def call( + self, arg_input_1, arg_input_2, keyword_input, training=None + ): + return 0 + + # Test subclassed model save specs. + model = Model() + model( + tf.ones([1, 1]), + tf.ones([2, 2]), + keyword_input=tf.ones([3, 3]), + training=False, + ) + spec = model.save_spec(dynamic_batch=False) + self.assertEqual(spec[0][0].shape.as_list(), [1, 1]) + self.assertEqual(spec[0][1].shape.as_list(), [2, 2]) + self.assertEqual(spec[1]["keyword_input"].shape.as_list(), [3, 3]) + spec = model.save_spec(dynamic_batch=True) + self.assertEqual(spec[0][0].shape.as_list(), [None, 1]) + + # Test functional model save specs. + input_1 = layers_module.Input((1,), batch_size=1) + input_2 = layers_module.Input((2,), batch_size=2) + input_3 = layers_module.Input((3,), batch_size=3) + output = model(input_1, input_2, keyword_input=input_3, training=True) + functional = training_module.Model([input_1, input_2, input_3], output) + # Functional models should ignore dynamic_batch if the input layers have + # a known batch size. 
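+ # dynamic_batch=True is requested below, but the returned specs + # keep the static batch sizes 1, 2, and 3.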
+ spec = functional.save_spec(dynamic_batch=True) + input_specs = spec[0][0] + self.assertEqual(input_specs[0].shape.as_list(), [1, 1]) + self.assertEqual(input_specs[1].shape.as_list(), [2, 2]) + self.assertEqual(input_specs[2].shape.as_list(), [3, 3]) + + +class TestAutotuneSPE(test_combinations.TestCase): + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_compile_fit_with_jit_compile(self): + # Test with jit_compile = True + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile( + "sgd", + loss="mse", + run_eagerly=False, + jit_compile=True, + steps_per_execution="auto", + ) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + # Test compile and fit for an RNN model + model = sequential.Sequential() + model.add( + layers_module.TimeDistributed( + layers_module.Embedding(5, 6, mask_zero=True), + input_shape=(None, None), + ) + ) # N by t_1 by t_2 by 6 + model.add( + layers_module.TimeDistributed( + layers_module.SimpleRNN(7, return_sequences=True) + ) + ) + model.add( + layers_module.TimeDistributed( + layers_module.SimpleRNN(8, return_sequences=False) + ) + ) + model.add(layers_module.SimpleRNN(1, return_sequences=False)) + model.compile( + optimizer="sgd", + loss="mse", + jit_compile=True, + steps_per_execution="auto", + ) + model_input = np.random.randint( + low=1, high=5, size=(10, 3, 4), dtype="int32" + ) + for i in range(4): + model_input[i, i:, i:] = 0 + model.fit( + model_input, np.random.random((10, 1)), epochs=1, batch_size=10 + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_compile_fit_evaluate_predict_with_mirrored_strategy(self): + # Test with jit_compile = True + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile( + "sgd", + loss="mse", + run_eagerly=False, + jit_compile=True, + steps_per_execution="auto", + ) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + model.evaluate(x, y) + model.predict(x) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_spe_tune_compile_fit_then_false_predict(self): + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile( + "sgd", + loss="mse", + run_eagerly=False, + jit_compile=True, + steps_per_execution="auto", + ) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + model.evaluate(x, y) + model.autotune_steps_per_execution = False + model.predict(x) + assert model.autotune_steps_per_execution == False + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_spe_tune_set_after_compile(self): + model = sequential.Sequential([layers_module.Dense(1)]) + model.compile( + "sgd", + loss="mse", + run_eagerly=False, + jit_compile=True, + steps_per_execution=5, + ) + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) + assert model._steps_per_execution_tuner is None + model.autotune_steps_per_execution = True + model.fit(x, y, epochs=2) + assert model.steps_per_execution.numpy().item() == 5 + assert model._steps_per_execution_tuner + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_spe_tune_set_before_compile(self): + model = sequential.Sequential([layers_module.Dense(1)]) + model.steps_per_execution = 5 + model.compile( + "sgd", + loss="mse", + run_eagerly=False, + jit_compile=True, + steps_per_execution="auto", + ) + assert
model.steps_per_execution.numpy().item() == 5 + assert model._steps_per_execution_tuner + + x, y = np.ones((10, 1)), np.ones((10, 1)) + model.fit(x, y, epochs=2) - with self.assertRaisesRegex(ValueError, 'Data cardinality is ambiguous'): - model.test_on_batch([np.ones((10, 5)), np.ones((10, 5))], - np.ones((11, 4))) - with self.assertRaisesRegex(ValueError, 'Data cardinality is ambiguous'): - model.predict_on_batch([np.ones((10, 5)), np.ones((11, 5))]) +class TestExceptionsAndWarnings(test_combinations.TestCase): + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @test_combinations.run_with_all_model_types + def test_fit_on_no_output(self): + inputs = layers_module.Input((3,)) + outputs = layers_module.Dense(2)(inputs) + model = training_module.Model(inputs, outputs) + model.compile("rmsprop", "mse") + x = np.zeros((32, 3)) + with self.assertRaisesRegex(ValueError, "Target data is missing..*"): + model.fit(x) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @test_combinations.run_with_all_model_types + def test_fit_on_wrong_output_type(self): + inputs1 = layers_module.Input((3,), name="a") + inputs2 = layers_module.Input((3,), name="b") + x = layers_module.Concatenate()([inputs1, inputs2]) + outputs = layers_module.Dense(2, name="c")(x) + model = training_module.Model([inputs1, inputs2], outputs) + model.compile("rmsprop", "mse") + x = np.zeros((32, 3)) + y = np.zeros((32, 2)) + with self.assertRaisesRegex(ValueError, "Target data is missing..*"): + model.fit({"a": x, "b": x, "c": y}) + + @test_combinations.run_all_keras_modes + def test_compile_warning_for_loss_missing_output(self): + with self.cached_session(): + inp = layers_module.Input(shape=(16,), name="input_a") + out_1 = layers_module.Dense(8, name="dense_1")(inp) + out_2 = layers_module.Dense( + 3, activation="softmax", name="dense_2" + )(out_1) + model = training_module.Model(inputs=[inp], outputs=[out_1, out_2]) + optimizer = RMSPropOptimizer(learning_rate=0.001) + + model.compile( + optimizer, + loss={ + "dense_2": "categorical_crossentropy", + }, + metrics={ + "dense_2": "categorical_accuracy", + "dense_1": metrics_module.CategoricalAccuracy(), + }, + run_eagerly=test_utils.should_run_eagerly(), + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_predict_error_with_empty_x(self): + inputs = layers_module.Input(shape=(2,)) + outputs = layers_module.Dense(4)(inputs) + model = training_module.Model(inputs=inputs, outputs=outputs) + model.compile(loss="mse") + + with self.assertRaisesRegex( + ValueError, "Expected input data to be non-empty." 
+ ): + model.predict(np.array([])) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @parameterized.named_parameters( + ("dynamic", 0, False), + ("dynamic_multistep", 10, False), + ("static", 0, True), + ("static_multistep", 10, True), + ) + def test_predict_structured(self, spe, static_batch): + inputs = layers_module.Input(shape=(2,)) + outputs = layers_module.Dense(2)(inputs) + model = training_module.Model( + inputs=inputs, + outputs={"out": outputs}, + ) + model.compile( + loss="mse", + steps_per_execution=spe, + run_eagerly=test_utils.should_run_eagerly(), + ) + xdata = np.random.uniform(size=(8, 2)).astype(np.float32) + dataset = tf.data.Dataset.from_tensor_slices((xdata, xdata)) + dataset = dataset.batch(8, drop_remainder=static_batch) + ret = model.predict(dataset, steps=1) + tf.nest.assert_same_structure(ret, {"out": ""}) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_on_batch_error_inconsistent_batch_size(self): + input_node1 = layers_module.Input(shape=(5,)) + input_node2 = layers_module.Input(shape=(5,)) + output_node = layers_module.Concatenate()([input_node1, input_node2]) + output_node = layers_module.Dense(4)(output_node) + model = training_module.Model([input_node1, input_node2], output_node) + model.compile(loss="mse") + + with self.assertRaisesRegex( + ValueError, "Data cardinality is ambiguous" + ): + model.train_on_batch( + [np.ones((10, 5)), np.ones((10, 5))], np.ones((11, 4)) + ) + + with self.assertRaisesRegex( + ValueError, "Data cardinality is ambiguous" + ): + model.test_on_batch( + [np.ones((10, 5)), np.ones((10, 5))], np.ones((11, 4)) + ) + + with self.assertRaisesRegex( + ValueError, "Data cardinality is ambiguous" + ): + model.predict_on_batch([np.ones((10, 5)), np.ones((11, 5))]) class LossWeightingTest(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes - def test_class_weights(self): - num_classes = 5 - batch_size = 5 - epochs = 10 - weighted_class = 3 - weight = .5 - train_samples = 1000 - test_samples = 1000 - input_dim = 5 - learning_rate = 0.001 - - model = test_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=num_classes, input_dim=input_dim) - model.compile( - loss='categorical_crossentropy', - metrics=['acc', metrics_module.CategoricalAccuracy()], - weighted_metrics=['mae', metrics_module.CategoricalAccuracy()], - optimizer=RMSPropOptimizer(learning_rate=learning_rate), - run_eagerly=test_utils.should_run_eagerly()) - - np.random.seed(1337) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=train_samples, - test_samples=test_samples, - input_shape=(input_dim,), - num_classes=num_classes) - int_y_test = y_test.copy() - # convert class vectors to binary class matrices - y_train = np_utils.to_categorical(y_train, num_classes) - y_test = np_utils.to_categorical(y_test, num_classes) - test_ids = np.where(int_y_test == np.array(weighted_class))[0] - - class_weight = dict([(i, 1.) 
for i in range(num_classes)]) - class_weight[weighted_class] = weight - - model.fit( - x_train, - y_train, - batch_size=batch_size, - epochs=epochs // 3, - verbose=0, - class_weight=class_weight, - validation_data=(x_train, y_train)) - model.fit( - x_train, - y_train, - batch_size=batch_size, - epochs=epochs // 2, - verbose=0, - class_weight=class_weight) - model.fit( - x_train, - y_train, - batch_size=batch_size, - epochs=epochs // 2, - verbose=0, - class_weight=class_weight, - validation_split=0.1) - - model.train_on_batch( - x_train[:batch_size], y_train[:batch_size], class_weight=class_weight) - ref_score = model.evaluate(x_test, y_test, verbose=0) # pylint: disable=unused-variable - score = model.evaluate( # pylint: disable=unused-variable - x_test[test_ids, :], y_test[test_ids, :], verbose=0) - # TODO(b/152990697): Fix the class weights test here. - # self.assertLess(score[0], ref_score[0]) - - @test_combinations.run_all_keras_modes - def test_temporal_sample_weights(self): - num_classes = 5 - batch_size = 5 - epochs = 10 - weighted_class = 3 - weight = 10. - train_samples = 1000 - test_samples = 1000 - input_dim = 5 - timesteps = 3 - learning_rate = 0.001 - - with self.cached_session(): - model = sequential.Sequential() - model.add( - layers_module.TimeDistributed( - layers_module.Dense(num_classes), - input_shape=(timesteps, input_dim))) - model.add(layers_module.Activation('softmax')) - - np.random.seed(1337) - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=train_samples, - test_samples=test_samples, - input_shape=(input_dim,), - num_classes=num_classes) - int_y_test = y_test.copy() - int_y_train = y_train.copy() - # convert class vectors to binary class matrices - y_train = np_utils.to_categorical(y_train, num_classes) - y_test = np_utils.to_categorical(y_test, num_classes) - test_ids = np.where(int_y_test == np.array(weighted_class))[0] - - sample_weight = np.ones((y_train.shape[0])) - sample_weight[int_y_train == weighted_class] = weight - - temporal_x_train = np.reshape(x_train, (len(x_train), 1, - x_train.shape[1])) - temporal_x_train = np.repeat(temporal_x_train, timesteps, axis=1) - temporal_x_test = np.reshape(x_test, (len(x_test), 1, x_test.shape[1])) - temporal_x_test = np.repeat(temporal_x_test, timesteps, axis=1) - - temporal_y_train = np.reshape(y_train, (len(y_train), 1, - y_train.shape[1])) - temporal_y_train = np.repeat(temporal_y_train, timesteps, axis=1) - temporal_y_test = np.reshape(y_test, (len(y_test), 1, y_test.shape[1])) - temporal_y_test = np.repeat(temporal_y_test, timesteps, axis=1) - - temporal_sample_weight = np.reshape(sample_weight, (len(sample_weight), - 1)) - temporal_sample_weight = np.repeat( - temporal_sample_weight, timesteps, axis=1) - - model.compile( - RMSPropOptimizer(learning_rate=learning_rate), - loss='categorical_crossentropy', - metrics=['acc', metrics_module.CategoricalAccuracy()], - weighted_metrics=['mae', metrics_module.CategoricalAccuracy()], - sample_weight_mode='temporal', - run_eagerly=test_utils.should_run_eagerly()) - - model.fit( - temporal_x_train, - temporal_y_train, - batch_size=batch_size, - epochs=epochs // 3, - verbose=0, - sample_weight=temporal_sample_weight) - model.fit( - temporal_x_train, - temporal_y_train, - batch_size=batch_size, - epochs=epochs // 3, - verbose=0, - sample_weight=temporal_sample_weight, - validation_split=0.1) - - model.train_on_batch( - temporal_x_train[:batch_size], - temporal_y_train[:batch_size], - sample_weight=temporal_sample_weight[:batch_size]) - 
model.test_on_batch( - temporal_x_train[:batch_size], - temporal_y_train[:batch_size], - sample_weight=temporal_sample_weight[:batch_size]) - ref_score = model.evaluate(temporal_x_test, temporal_y_test, verbose=0) - if not tf.executing_eagerly(): - score = model.evaluate( - temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0) - self.assertLess(score[0], ref_score[0]) - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types(exclude_models='sequential') - def test_fit_with_incorrect_weights(self): - input_a = layers_module.Input(shape=(3,), name='input_a') - input_b = layers_module.Input(shape=(3,), name='input_b') - - dense = layers_module.Dense(2, name='output_1') - dropout = layers_module.Dropout(0.5, name='output_2') - branch_a = [input_a, dense] - branch_b = [input_b, dense, dropout] - - model = test_utils.get_multi_io_model(branch_a, branch_b) - model.compile( - optimizer='adam', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - x = np.random.random((10, 3)) - y = np.random.random((10, 2)) - - with self.assertRaises(ValueError): - model.fit([x, x], [y, y], epochs=1, sample_weight={'unknown': x}) - - with self.assertRaises(ValueError): - model.fit([x, x], [y, y], epochs=1, class_weight={'unknown': 1}) - - @test_combinations.run_all_keras_modes - def test_default_sample_weight(self): - """Verifies that fit works without having to set sample_weight.""" - num_classes = 5 - input_dim = 5 - timesteps = 3 - learning_rate = 0.001 - - with self.cached_session(): - model = sequential.Sequential() - model.add( - layers_module.TimeDistributed( - layers_module.Dense(num_classes), - input_shape=(timesteps, input_dim))) - - x = np.random.random((10, timesteps, input_dim)) - y = np.random.random((10, timesteps, num_classes)) - optimizer = RMSPropOptimizer(learning_rate=learning_rate) - - # sample_weight_mode is a list and mode value is None - model.compile( - optimizer, - loss='mse', - sample_weight_mode=[None], - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, batch_size=10) - - # sample_weight_mode is a list and mode value is `temporal` - model.compile( - optimizer, - loss='mse', - sample_weight_mode=['temporal'], - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, batch_size=10) - - # sample_weight_mode is a dict and mode value is None - model.compile( - optimizer, - loss='mse', - sample_weight_mode={'time_distributed': None}, - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, batch_size=10) - - # sample_weight_mode is a dict and mode value is `temporal` - model.compile( - optimizer, - loss='mse', - sample_weight_mode={'time_distributed': 'temporal'}, - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, batch_size=10) - - # sample_weight_mode is a not a list/dict and mode value is None - model.compile( - optimizer, - loss='mse', - sample_weight_mode=None, - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, batch_size=10) - - # sample_weight_mode is a not a list/dict and mode value is `temporal` - model.compile( - optimizer, - loss='mse', - sample_weight_mode='temporal', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, batch_size=10) - - def test_sample_weight_tensor(self): - """Tests that sample weight may be defined as a tensor in the graph.""" - with tf.compat.v1.get_default_graph().as_default(): - # Create a simple pass-through model - inputs = layers_module.Input(shape=1, name='input_layer') - 
model = training_module.Model(inputs=inputs, outputs=inputs) - model.compile( - loss='mean_absolute_error', - optimizer='adam') - - # Prepare sample weights iterator tensor - sample_weights = tf.constant( - [[0, .4, 1, 1], [2, .4, .3, 1]]) - dataset = tf.data.Dataset.from_tensor_slices(sample_weights) - sample_weights = tf.compat.v1.data.make_one_shot_iterator( - dataset).get_next() - sample_weights = training_utils_v1.standardize_sample_weights( - sample_weights, model.output_names) - - # Update model loss with sample weight tensor. - model._compile_weights_loss_and_weighted_metrics(sample_weights) - - feeds = {'input_layer:0': [[0], [0], [0], [0]], - 'input_layer_target:0': [[1], [1], [1], [1]]} - with self.cached_session() as sess: - self.assertAllClose( - (.4 + 1 + 1) / 4, sess.run(model.total_loss, feed_dict=feeds)) - self.assertAllClose( - (2+ .4 + .3 + 1) / 4, sess.run(model.total_loss, feed_dict=feeds)) + @test_combinations.run_all_keras_modes + def test_class_weights(self): + num_classes = 5 + batch_size = 5 + epochs = 10 + weighted_class = 3 + weight = 0.5 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + learning_rate = 0.001 + + model = test_utils.get_small_sequential_mlp( + num_hidden=10, num_classes=num_classes, input_dim=input_dim + ) + model.compile( + loss="categorical_crossentropy", + metrics=["acc", metrics_module.CategoricalAccuracy()], + weighted_metrics=["mae", metrics_module.CategoricalAccuracy()], + optimizer=RMSPropOptimizer(learning_rate=learning_rate), + run_eagerly=test_utils.should_run_eagerly(), + ) + + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes, + ) + int_y_test = y_test.copy() + # convert class vectors to binary class matrices + y_train = np_utils.to_categorical(y_train, num_classes) + y_test = np_utils.to_categorical(y_test, num_classes) + test_ids = np.where(int_y_test == np.array(weighted_class))[0] + + class_weight = dict([(i, 1.0) for i in range(num_classes)]) + class_weight[weighted_class] = weight + + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + class_weight=class_weight, + validation_data=(x_train, y_train), + ) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 2, + verbose=0, + class_weight=class_weight, + ) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 2, + verbose=0, + class_weight=class_weight, + validation_split=0.1, + ) + + model.train_on_batch( + x_train[:batch_size], + y_train[:batch_size], + class_weight=class_weight, + ) + ref_score = model.evaluate(x_test, y_test, verbose=0) # noqa: F841 + score = model.evaluate( # noqa: F841 + x_test[test_ids, :], y_test[test_ids, :], verbose=0 + ) + # TODO(b/152990697): Fix the class weights test here. 
+ # self.assertLess(score[0], ref_score[0]) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_segmentation_class_weights(self): + num_channels = 3 + num_classes = 5 + batch_size = 2 + image_width = 8 + + input_shape = (batch_size, image_width, image_width, num_channels) + output_shape = (batch_size, image_width, image_width, num_classes) + + model = sequential.Sequential([layers_module.Conv2D(num_classes, 1)]) + + model.compile( + loss="categorical_crossentropy", + metrics=["acc", metrics_module.CategoricalAccuracy()], + weighted_metrics=["mae", metrics_module.CategoricalAccuracy()], + optimizer="adam", + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = tf.random.uniform(input_shape) + y = tf.random.uniform(output_shape, dtype=tf.int32, maxval=num_classes) + + # Class weights are just the class value + 1 + class_weight = dict([(i, i + 1) for i in range(num_classes)]) + + # This test simply asserts that the model can be compiled and fit + # can run without error. Verification that the class weights are + # applied correctly is performed in data_adapter_test. + model.fit(x, y, class_weight=class_weight, steps_per_epoch=1) + + sample_weight = np.array([x + 1 for x in range(batch_size)]) + model.fit( + x, + y, + class_weight=class_weight, + sample_weight=sample_weight, + steps_per_epoch=1, + ) + + @test_combinations.run_all_keras_modes + def test_temporal_sample_weights(self): + num_classes = 5 + batch_size = 5 + epochs = 10 + weighted_class = 3 + weight = 10.0 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + timesteps = 3 + learning_rate = 0.001 + + with self.cached_session(): + model = sequential.Sequential() + model.add( + layers_module.TimeDistributed( + layers_module.Dense(num_classes), + input_shape=(timesteps, input_dim), + ) + ) + model.add(layers_module.Activation("softmax")) + + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes, + ) + int_y_test = y_test.copy() + int_y_train = y_train.copy() + # convert class vectors to binary class matrices + y_train = np_utils.to_categorical(y_train, num_classes) + y_test = np_utils.to_categorical(y_test, num_classes) + test_ids = np.where(int_y_test == np.array(weighted_class))[0] + + sample_weight = np.ones((y_train.shape[0])) + sample_weight[int_y_train == weighted_class] = weight + + temporal_x_train = np.reshape( + x_train, (len(x_train), 1, x_train.shape[1]) + ) + temporal_x_train = np.repeat(temporal_x_train, timesteps, axis=1) + temporal_x_test = np.reshape( + x_test, (len(x_test), 1, x_test.shape[1]) + ) + temporal_x_test = np.repeat(temporal_x_test, timesteps, axis=1) + + temporal_y_train = np.reshape( + y_train, (len(y_train), 1, y_train.shape[1]) + ) + temporal_y_train = np.repeat(temporal_y_train, timesteps, axis=1) + temporal_y_test = np.reshape( + y_test, (len(y_test), 1, y_test.shape[1]) + ) + temporal_y_test = np.repeat(temporal_y_test, timesteps, axis=1) + + temporal_sample_weight = np.reshape( + sample_weight, (len(sample_weight), 1) + ) + temporal_sample_weight = np.repeat( + temporal_sample_weight, timesteps, axis=1 + ) + + model.compile( + RMSPropOptimizer(learning_rate=learning_rate), + loss="categorical_crossentropy", + metrics=["acc", metrics_module.CategoricalAccuracy()], + weighted_metrics=["mae", metrics_module.CategoricalAccuracy()], + sample_weight_mode="temporal", + run_eagerly=test_utils.should_run_eagerly(), + ) + + 
model.fit( + temporal_x_train, + temporal_y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + sample_weight=temporal_sample_weight, + ) + model.fit( + temporal_x_train, + temporal_y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + sample_weight=temporal_sample_weight, + validation_split=0.1, + ) + + model.train_on_batch( + temporal_x_train[:batch_size], + temporal_y_train[:batch_size], + sample_weight=temporal_sample_weight[:batch_size], + ) + model.test_on_batch( + temporal_x_train[:batch_size], + temporal_y_train[:batch_size], + sample_weight=temporal_sample_weight[:batch_size], + ) + ref_score = model.evaluate( + temporal_x_test, temporal_y_test, verbose=0 + ) + if not tf.executing_eagerly(): + score = model.evaluate( + temporal_x_test[test_ids], + temporal_y_test[test_ids], + verbose=0, + ) + self.assertLess(score[0], ref_score[0]) + + @test_combinations.run_all_keras_modes + @test_combinations.run_with_all_model_types(exclude_models="sequential") + def test_fit_with_incorrect_weights(self): + input_a = layers_module.Input(shape=(3,), name="input_a") + input_b = layers_module.Input(shape=(3,), name="input_b") + + dense = layers_module.Dense(2, name="output_1") + dropout = layers_module.Dropout(0.5, name="output_2") + branch_a = [input_a, dense] + branch_b = [input_b, dense, dropout] + + model = test_utils.get_multi_io_model(branch_a, branch_b) + model.compile( + optimizer="adam", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + x = np.random.random((10, 3)) + y = np.random.random((10, 2)) + + with self.assertRaises(ValueError): + model.fit([x, x], [y, y], epochs=1, sample_weight={"unknown": x}) + + with self.assertRaises(ValueError): + model.fit([x, x], [y, y], epochs=1, class_weight={"unknown": 1}) + + @test_combinations.run_all_keras_modes + def test_default_sample_weight(self): + """Verifies that fit works without having to set sample_weight.""" + num_classes = 5 + input_dim = 5 + timesteps = 3 + learning_rate = 0.001 + + with self.cached_session(): + model = sequential.Sequential() + model.add( + layers_module.TimeDistributed( + layers_module.Dense(num_classes), + input_shape=(timesteps, input_dim), + ) + ) + + x = np.random.random((10, timesteps, input_dim)) + y = np.random.random((10, timesteps, num_classes)) + optimizer = RMSPropOptimizer(learning_rate=learning_rate) + + # sample_weight_mode is a list and mode value is None + model.compile( + optimizer, + loss="mse", + sample_weight_mode=[None], + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, batch_size=10) + + # sample_weight_mode is a list and mode value is `temporal` + model.compile( + optimizer, + loss="mse", + sample_weight_mode=["temporal"], + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, batch_size=10) + + # sample_weight_mode is a dict and mode value is None + model.compile( + optimizer, + loss="mse", + sample_weight_mode={"time_distributed": None}, + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, batch_size=10) + + # sample_weight_mode is a dict and mode value is `temporal` + model.compile( + optimizer, + loss="mse", + sample_weight_mode={"time_distributed": "temporal"}, + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, batch_size=10) + + # sample_weight_mode is not a list/dict and mode value is None + model.compile( + optimizer, + loss="mse", + sample_weight_mode=None, + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y,
epochs=1, batch_size=10) + + # sample_weight_mode is not a list/dict and mode value is + # `temporal` + model.compile( + optimizer, + loss="mse", + sample_weight_mode="temporal", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, batch_size=10) + + def test_sample_weight_tensor(self): + """Tests that sample weight may be defined as a tensor in the graph.""" + with tf.compat.v1.get_default_graph().as_default(): + # Create a simple pass-through model + inputs = layers_module.Input(shape=1, name="input_layer") + model = training_module.Model(inputs=inputs, outputs=inputs) + model.compile(loss="mean_absolute_error", optimizer="adam") + + # Prepare sample weights iterator tensor + sample_weights = tf.constant([[0, 0.4, 1, 1], [2, 0.4, 0.3, 1]]) + dataset = tf.data.Dataset.from_tensor_slices(sample_weights) + sample_weights = tf.compat.v1.data.make_one_shot_iterator( + dataset + ).get_next() + sample_weights = training_utils_v1.standardize_sample_weights( + sample_weights, model.output_names + ) + + # Update model loss with sample weight tensor. + model._compile_weights_loss_and_weighted_metrics(sample_weights) + + feeds = { + "input_layer:0": [[0], [0], [0], [0]], + "input_layer_target:0": [[1], [1], [1], [1]], + } + with self.cached_session() as sess: + self.assertAllClose( + (0.4 + 1 + 1) / 4, + sess.run(model.total_loss, feed_dict=feeds), + ) + self.assertAllClose( + (2 + 0.4 + 0.3 + 1) / 4, + sess.run(model.total_loss, feed_dict=feeds), + ) @test_combinations.run_all_keras_modes class MaskingTest(test_combinations.TestCase): - - def _get_model(self, input_shape=None): - layers = [ - layers_module.Masking(mask_value=0), - layers_module.TimeDistributed( - layers_module.Dense(1, kernel_initializer='one')) - ] - model = test_utils.get_model_from_layers(layers, input_shape) - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=test_utils.should_run_eagerly()) - return model - - @test_combinations.run_with_all_model_types - def test_masking(self): - model = self._get_model(input_shape=(2, 1)) - x = np.array([[[1], [1]], [[0], [0]]]) - y = np.array([[[1], [1]], [[1], [1]]]) - loss = model.train_on_batch(x, y) - self.assertEqual(loss, 0) - - @test_combinations.run_with_all_model_types(exclude_models='functional') - def test_masking_deferred(self): - model = self._get_model() - x = np.array([[[1], [1]], [[0], [0]]]) - y = np.array([[[1], [1]], [[1], [1]]]) - loss = model.train_on_batch(x, y) - self.assertEqual(loss, 0) - - def test_mask_argument_in_layer(self): - # Test that the mask argument gets correctly passed to a layer in the - # functional API.
- - class CustomMaskedLayer(layers_module.Layer): - - def __init__(self): - super().__init__() - self.supports_masking = True - - def call(self, inputs, mask=None): - assert mask is not None - return inputs - - def compute_output_shape(self, input_shape): - return input_shape - - x = np.random.random((5, 3)) - inputs = layers_module.Input((3,)) - masked = layers_module.Masking(mask_value=0)(inputs) - outputs = CustomMaskedLayer()(masked) - - model = training_module.Model(inputs, outputs) - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=test_utils.should_run_eagerly()) - y = np.random.random((5, 3)) - model.train_on_batch(x, y) + def _get_model(self, input_shape=None): + layers = [ + layers_module.Masking(mask_value=0), + layers_module.TimeDistributed( + layers_module.Dense(1, kernel_initializer="one") + ), + ] + model = test_utils.get_model_from_layers(layers, input_shape) + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=test_utils.should_run_eagerly(), + ) + return model + + @test_combinations.run_with_all_model_types + def test_masking(self): + model = self._get_model(input_shape=(2, 1)) + x = np.array([[[1], [1]], [[0], [0]]]) + y = np.array([[[1], [1]], [[1], [1]]]) + loss = model.train_on_batch(x, y) + self.assertEqual(loss, 0) + + @test_combinations.run_with_all_model_types(exclude_models="functional") + def test_masking_deferred(self): + model = self._get_model() + x = np.array([[[1], [1]], [[0], [0]]]) + y = np.array([[[1], [1]], [[1], [1]]]) + loss = model.train_on_batch(x, y) + self.assertEqual(loss, 0) + + def test_mask_argument_in_layer(self): + # Test that the mask argument gets correctly passed to a layer in the + # functional API. + + class CustomMaskedLayer(layers_module.Layer): + def __init__(self): + super().__init__() + self.supports_masking = True + + def call(self, inputs, mask=None): + assert mask is not None + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + x = np.random.random((5, 3)) + inputs = layers_module.Input((3,)) + masked = layers_module.Masking(mask_value=0)(inputs) + outputs = CustomMaskedLayer()(masked) + + model = training_module.Model(inputs, outputs) + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=test_utils.should_run_eagerly(), + ) + y = np.random.random((5, 3)) + model.train_on_batch(x, y) @test_combinations.run_all_keras_modes class TestDynamicTrainability(test_combinations.TestCase): - - def test_trainable_warning(self): - x = np.random.random((5, 3)) - y = np.random.random((5, 2)) - - model = sequential.Sequential() - model.add(layers_module.Dense(2, input_dim=3)) - model.trainable = False - model.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.trainable = True - model.train_on_batch(x, y) - self.assertRaises(Warning) - - def test_trainable_argument(self): - with self.cached_session(): - x = np.random.random((5, 3)) - y = np.random.random((5, 2)) - - model = sequential.Sequential() - model.add(layers_module.Dense(2, input_dim=3, trainable=False)) - model.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - out = model.predict(x) - model.train_on_batch(x, y) - out_2 = model.predict(x) - self.assertAllClose(out, out_2) - - # test with nesting - inputs = layers_module.Input(shape=(3,)) - output = model(inputs) - model = training_module.Model(inputs, output) - model.compile( - 'rmsprop', - 'mse', - 
run_eagerly=test_utils.should_run_eagerly()) - out = model.predict(x) - model.train_on_batch(x, y) - out_2 = model.predict(x) - self.assertAllClose(out, out_2) - - def test_layer_trainability_switch(self): - # with constructor argument, in Sequential - model = sequential.Sequential() - model.add(layers_module.Dense(2, trainable=False, input_dim=1)) - self.assertListEqual(model.trainable_weights, []) - - # by setting the `trainable` argument, in Sequential - model = sequential.Sequential() - layer = layers_module.Dense(2, input_dim=1) - model.add(layer) - self.assertListEqual(model.trainable_weights, layer.trainable_weights) - layer.trainable = False - self.assertListEqual(model.trainable_weights, []) - - # with constructor argument, in Model - x = layers_module.Input(shape=(1,)) - y = layers_module.Dense(2, trainable=False)(x) - model = training_module.Model(x, y) - self.assertListEqual(model.trainable_weights, []) - - # by setting the `trainable` argument, in Model - x = layers_module.Input(shape=(1,)) - layer = layers_module.Dense(2) - y = layer(x) - model = training_module.Model(x, y) - self.assertListEqual(model.trainable_weights, layer.trainable_weights) - layer.trainable = False - self.assertListEqual(model.trainable_weights, []) - - def test_model_trainability_switch(self): - # a non-trainable model has no trainable weights - x = layers_module.Input(shape=(1,)) - y = layers_module.Dense(2)(x) - model = training_module.Model(x, y) - model.trainable = False - self.assertListEqual(model.trainable_weights, []) - - # same for Sequential - model = sequential.Sequential() - model.add(layers_module.Dense(2, input_dim=1)) - model.trainable = False - self.assertListEqual(model.trainable_weights, []) - - def test_nested_model_trainability(self): - # a Sequential inside a Model - inner_model = sequential.Sequential() - inner_model.add(layers_module.Dense(2, input_dim=1)) - - x = layers_module.Input(shape=(1,)) - y = inner_model(x) - outer_model = training_module.Model(x, y) - self.assertListEqual(outer_model.trainable_weights, - inner_model.trainable_weights) - inner_model.trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - inner_model.trainable = True - inner_model.layers[-1].trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - - # a Sequential inside a Sequential - inner_model = sequential.Sequential() - inner_model.add(layers_module.Dense(2, input_dim=1)) - outer_model = sequential.Sequential() - outer_model.add(inner_model) - self.assertListEqual(outer_model.trainable_weights, - inner_model.trainable_weights) - inner_model.trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - inner_model.trainable = True - inner_model.layers[-1].trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - - # a Model inside a Model - x = layers_module.Input(shape=(1,)) - y = layers_module.Dense(2)(x) - inner_model = training_module.Model(x, y) - x = layers_module.Input(shape=(1,)) - y = inner_model(x) - outer_model = training_module.Model(x, y) - self.assertListEqual(outer_model.trainable_weights, - inner_model.trainable_weights) - inner_model.trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - inner_model.trainable = True - inner_model.layers[-1].trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - - # a Model inside a Sequential - x = layers_module.Input(shape=(1,)) - y = layers_module.Dense(2)(x) - inner_model = training_module.Model(x, y) - outer_model = 
sequential.Sequential() - outer_model.add(inner_model) - self.assertListEqual(outer_model.trainable_weights, - inner_model.trainable_weights) - inner_model.trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - inner_model.trainable = True - inner_model.layers[-1].trainable = False - self.assertListEqual(outer_model.trainable_weights, []) - - def test_gan_workflow(self): - shared_layer = layers_module.BatchNormalization() - - inputs1 = input_layer.Input(10) - outputs1 = shared_layer(inputs1) - model1 = training_module.Model(inputs1, outputs1) - shared_layer.trainable = False - model1.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - inputs2 = input_layer.Input(10) - outputs2 = shared_layer(inputs2) - model2 = training_module.Model(inputs2, outputs2) - shared_layer.trainable = True - model2.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - x, y = np.ones((10, 10)), np.ones((10, 10)) - - out1_0 = model1.predict_on_batch(x) - model1.train_on_batch(x, y) - out1_1 = model1.predict_on_batch(x) - self.assertAllClose(out1_0, out1_1) - - out2_0 = model2.predict_on_batch(x) - model2.train_on_batch(x, y) - out2_1 = model2.predict_on_batch(x) - self.assertNotAllClose(out2_0, out2_1) - - def test_toggle_value(self): - input_0 = layers_module.Input(shape=(1,)) - dense_0 = layers_module.Dense( - 1, kernel_initializer='ones', bias_initializer='ones') - dense_1 = layers_module.Dense( - 1, kernel_initializer='ones', bias_initializer='ones') - result = layers_module.Add()([dense_0(input_0), dense_1(input_0)]) - model = training_module.Model(input_0, result) - dense_0.trainable = False - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((10, 1)) - y = 5 * x + 2 - model.train_on_batch(x, y) - dense_0.trainable = True - model.train_on_batch(x, y) - kernel, bias = dense_0.get_weights() - self.assertAllEqual([kernel[0, 0], bias[0]], [1., 1.]) - - kernel, bias = dense_1.get_weights() - self.assertAllClose([kernel[0, 0], bias[0]], [1.1176, 1.1176]) + def test_trainable_warning(self): + x = np.random.random((5, 3)) + y = np.random.random((5, 2)) + + model = sequential.Sequential() + model.add(layers_module.Dense(2, input_dim=3)) + model.trainable = False + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + model.trainable = True + model.train_on_batch(x, y) + self.assertRaises(Warning) + + def test_trainable_argument(self): + with self.cached_session(): + x = np.random.random((5, 3)) + y = np.random.random((5, 2)) + + model = sequential.Sequential() + model.add(layers_module.Dense(2, input_dim=3, trainable=False)) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + out = model.predict(x) + model.train_on_batch(x, y) + out_2 = model.predict(x) + self.assertAllClose(out, out_2) + + # test with nesting + inputs = layers_module.Input(shape=(3,)) + output = model(inputs) + model = training_module.Model(inputs, output) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + out = model.predict(x) + model.train_on_batch(x, y) + out_2 = model.predict(x) + self.assertAllClose(out, out_2) + + def test_layer_trainability_switch(self): + # with constructor argument, in Sequential + model = sequential.Sequential() + model.add(layers_module.Dense(2, trainable=False, input_dim=1)) + self.assertListEqual(model.trainable_weights, []) + + # by setting the `trainable` argument, in Sequential + model = 
sequential.Sequential() + layer = layers_module.Dense(2, input_dim=1) + model.add(layer) + self.assertListEqual(model.trainable_weights, layer.trainable_weights) + layer.trainable = False + self.assertListEqual(model.trainable_weights, []) + + # with constructor argument, in Model + x = layers_module.Input(shape=(1,)) + y = layers_module.Dense(2, trainable=False)(x) + model = training_module.Model(x, y) + self.assertListEqual(model.trainable_weights, []) + + # by setting the `trainable` argument, in Model + x = layers_module.Input(shape=(1,)) + layer = layers_module.Dense(2) + y = layer(x) + model = training_module.Model(x, y) + self.assertListEqual(model.trainable_weights, layer.trainable_weights) + layer.trainable = False + self.assertListEqual(model.trainable_weights, []) + + def test_model_trainability_switch(self): + # a non-trainable model has no trainable weights + x = layers_module.Input(shape=(1,)) + y = layers_module.Dense(2)(x) + model = training_module.Model(x, y) + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + + # same for Sequential + model = sequential.Sequential() + model.add(layers_module.Dense(2, input_dim=1)) + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + + def test_nested_model_trainability(self): + # a Sequential inside a Model + inner_model = sequential.Sequential() + inner_model.add(layers_module.Dense(2, input_dim=1)) + + x = layers_module.Input(shape=(1,)) + y = inner_model(x) + outer_model = training_module.Model(x, y) + self.assertListEqual( + outer_model.trainable_weights, inner_model.trainable_weights + ) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Sequential inside a Sequential + inner_model = sequential.Sequential() + inner_model.add(layers_module.Dense(2, input_dim=1)) + outer_model = sequential.Sequential() + outer_model.add(inner_model) + self.assertListEqual( + outer_model.trainable_weights, inner_model.trainable_weights + ) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Model inside a Model + x = layers_module.Input(shape=(1,)) + y = layers_module.Dense(2)(x) + inner_model = training_module.Model(x, y) + x = layers_module.Input(shape=(1,)) + y = inner_model(x) + outer_model = training_module.Model(x, y) + self.assertListEqual( + outer_model.trainable_weights, inner_model.trainable_weights + ) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Model inside a Sequential + x = layers_module.Input(shape=(1,)) + y = layers_module.Dense(2)(x) + inner_model = training_module.Model(x, y) + outer_model = sequential.Sequential() + outer_model.add(inner_model) + self.assertListEqual( + outer_model.trainable_weights, inner_model.trainable_weights + ) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + def test_gan_workflow(self): + shared_layer = layers_module.BatchNormalization() + + inputs1 = 
input_layer.Input(10) + outputs1 = shared_layer(inputs1) + model1 = training_module.Model(inputs1, outputs1) + shared_layer.trainable = False + model1.compile( + "sgd", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs2 = input_layer.Input(10) + outputs2 = shared_layer(inputs2) + model2 = training_module.Model(inputs2, outputs2) + shared_layer.trainable = True + model2.compile( + "sgd", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + x, y = np.ones((10, 10)), np.ones((10, 10)) + + out1_0 = model1.predict_on_batch(x) + model1.train_on_batch(x, y) + out1_1 = model1.predict_on_batch(x) + self.assertAllClose(out1_0, out1_1) + + out2_0 = model2.predict_on_batch(x) + model2.train_on_batch(x, y) + out2_1 = model2.predict_on_batch(x) + self.assertNotAllClose(out2_0, out2_1) + + def test_toggle_value(self): + input_0 = layers_module.Input(shape=(1,)) + dense_0 = layers_module.Dense( + 1, kernel_initializer="ones", bias_initializer="ones" + ) + dense_1 = layers_module.Dense( + 1, kernel_initializer="ones", bias_initializer="ones" + ) + result = layers_module.Add()([dense_0(input_0), dense_1(input_0)]) + model = training_module.Model(input_0, result) + dense_0.trainable = False + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + + x = np.ones((10, 1)) + y = 5 * x + 2 + model.train_on_batch(x, y) + dense_0.trainable = True + model.train_on_batch(x, y) + kernel, bias = dense_0.get_weights() + self.assertAllEqual([kernel[0, 0], bias[0]], [1.0, 1.0]) + + kernel, bias = dense_1.get_weights() + self.assertAllClose([kernel[0, 0], bias[0]], [1.1176, 1.1176]) class TestTrainingWithDataTensors(test_combinations.TestCase): - - def test_training_and_eval_methods_on_symbolic_tensors_single_io(self): - with tf.Graph().as_default(): - x = layers_module.Input(shape=(3,), name='input') - y = layers_module.Dense(4, name='dense')(x) - model = training_module.Model(x, y) - - optimizer = RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - model.compile( - optimizer, - loss, - metrics=['mae', metrics_module.CategoricalAccuracy()]) - - inputs = backend.zeros(shape=(10, 3)) - targets = backend.zeros(shape=(10, 4)) - - model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0) - model.evaluate(inputs, targets, steps=2, verbose=0) - model.predict(inputs, steps=2) - model.train_on_batch(inputs, targets) - model.test_on_batch(inputs, targets) - model.fit(inputs, targets, - epochs=1, steps_per_epoch=2, verbose=0, - validation_data=(inputs, targets), validation_steps=2) - - # Test with dynamic shape - inputs = tf.compat.v1.placeholder_with_default( - np.zeros((2, 3)), shape=tf.TensorShape([None, 3])) - targets = tf.compat.v1.placeholder_with_default( - np.zeros((2, 4)), shape=tf.TensorShape([None, 4])) - self.assertEqual(inputs.shape.dims[0].value, None) - model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0) - model.evaluate(inputs, targets, steps=2, verbose=0) - model.predict(inputs, steps=2) - model.train_on_batch(inputs, targets) - model.test_on_batch(inputs, targets) - model.fit(inputs, targets, - epochs=1, steps_per_epoch=2, verbose=0, - validation_data=(inputs, targets), validation_steps=2) - - def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self): - a = layers_module.Input(shape=(3,), name='input_a') - b = layers_module.Input(shape=(3,), name='input_b') - - dense = layers_module.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = layers_module.Dropout(0.5, name='dropout')(c) - - model = training_module.Model([a, b], [d, e]) 
- - optimizer = 'rmsprop' - loss = 'mse' - loss_weights = [1., 0.5] - model.compile( - optimizer, - loss, - metrics=['mae', metrics_module.CategoricalAccuracy()], - loss_weights=loss_weights) - - input_a_tf = tf.zeros(shape=(10, 3)) - input_b_tf = tf.zeros(shape=(10, 3)) - - output_d_tf = tf.zeros(shape=(10, 4)) - output_e_tf = tf.zeros(shape=(10, 4)) - - model.fit([input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - epochs=1, - steps_per_epoch=2, - verbose=0) - model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf]) - - # Test with dictionary inputs - model.fit({ - 'input_a': input_a_tf, - 'input_b': input_b_tf - }, { - 'dense': output_d_tf, - 'dropout': output_e_tf - }, - epochs=1, - steps_per_epoch=2, - verbose=0) - model.fit({ - 'input_a': input_a_tf, - 'input_b': input_b_tf - }, { - 'dense': output_d_tf, - 'dropout': output_e_tf - }, - validation_data=({ - 'input_a': input_a_tf, - 'input_b': input_b_tf - }, { - 'dense': output_d_tf, - 'dropout': output_e_tf - }), - epochs=1, - steps_per_epoch=2, - validation_steps=2, - verbose=0) - model.train_on_batch({ - 'input_a': input_a_tf, - 'input_b': input_b_tf - }, { - 'dense': output_d_tf, - 'dropout': output_e_tf - }) - - # Test with validation data - model.fit([input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - validation_data=([input_a_tf, - input_b_tf], [output_d_tf, output_e_tf]), - epochs=1, - steps_per_epoch=2, - validation_steps=2, - verbose=0) - # Test evaluation / prediction methods - model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - steps=2, - verbose=0) - model.predict([input_a_tf, input_b_tf], steps=2) - model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf]) - - @tf_test_utils.run_deprecated_v1 - def test_model_with_input_feed_tensor(self): - """We test building a model with a TF variable as input. - - We should be able to call fit, evaluate, predict, - by only passing them data for the placeholder inputs - in the model. 
- """ - with tf.Graph().as_default(), self.cached_session(): - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - - output_a_np = np.random.random((10, 4)) - output_b_np = np.random.random((10, 3)) - - input_v = tf.Variable(input_a_np, dtype='float32') - self.evaluate(tf.compat.v1.variables_initializer([input_v])) - a = input_layer.Input(tensor=input_v) - b = input_layer.Input(shape=(3,), name='input_b') - - a_2 = layers_module.Dense(4, name='dense_1')(a) - dp = layers_module.Dropout(0.5, name='dropout') - b_2 = dp(b) - - model = training_module.Model([a, b], [a_2, b_2]) - model.summary() - - optimizer = 'rmsprop' - loss = 'mse' - loss_weights = [1., 0.5] - model.compile(optimizer, loss, metrics=['mean_squared_error'], + def test_training_and_eval_methods_on_symbolic_tensors_single_io(self): + with tf.Graph().as_default(): + x = layers_module.Input(shape=(3,), name="input") + y = layers_module.Dense(4, name="dense")(x) + model = training_module.Model(x, y) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = "mse" + model.compile( + optimizer, + loss, + metrics=["mae", metrics_module.CategoricalAccuracy()], + ) + + inputs = backend.zeros(shape=(10, 3)) + targets = backend.zeros(shape=(10, 4)) + + model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0) + model.evaluate(inputs, targets, steps=2, verbose=0) + model.predict(inputs, steps=2) + model.train_on_batch(inputs, targets) + model.test_on_batch(inputs, targets) + model.fit( + inputs, + targets, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=(inputs, targets), + validation_steps=2, + ) + + # Test with dynamic shape + inputs = tf.compat.v1.placeholder_with_default( + np.zeros((2, 3)), shape=tf.TensorShape([None, 3]) + ) + targets = tf.compat.v1.placeholder_with_default( + np.zeros((2, 4)), shape=tf.TensorShape([None, 4]) + ) + self.assertEqual(inputs.shape.dims[0].value, None) + model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0) + model.evaluate(inputs, targets, steps=2, verbose=0) + model.predict(inputs, steps=2) + model.train_on_batch(inputs, targets) + model.test_on_batch(inputs, targets) + model.fit( + inputs, + targets, + epochs=1, + steps_per_epoch=2, + verbose=0, + validation_data=(inputs, targets), + validation_steps=2, + ) + + def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self): + a = layers_module.Input(shape=(3,), name="input_a") + b = layers_module.Input(shape=(3,), name="input_b") + + dense = layers_module.Dense(4, name="dense") + c = dense(a) + d = dense(b) + e = layers_module.Dropout(0.5, name="dropout")(c) + + model = training_module.Model([a, b], [d, e]) + + optimizer = "rmsprop" + loss = "mse" + loss_weights = [1.0, 0.5] + model.compile( + optimizer, + loss, + metrics=["mae", metrics_module.CategoricalAccuracy()], + loss_weights=loss_weights, + ) + + input_a_tf = tf.zeros(shape=(10, 3)) + input_b_tf = tf.zeros(shape=(10, 3)) + + output_d_tf = tf.zeros(shape=(10, 4)) + output_e_tf = tf.zeros(shape=(10, 4)) + + model.fit( + [input_a_tf, input_b_tf], + [output_d_tf, output_e_tf], + epochs=1, + steps_per_epoch=2, + verbose=0, + ) + model.train_on_batch( + [input_a_tf, input_b_tf], [output_d_tf, output_e_tf] + ) + + # Test with dictionary inputs + model.fit( + {"input_a": input_a_tf, "input_b": input_b_tf}, + {"dense": output_d_tf, "dropout": output_e_tf}, + epochs=1, + steps_per_epoch=2, + verbose=0, + ) + model.fit( + {"input_a": input_a_tf, "input_b": input_b_tf}, + {"dense": output_d_tf, "dropout": output_e_tf}, + 
validation_data=( + {"input_a": input_a_tf, "input_b": input_b_tf}, + {"dense": output_d_tf, "dropout": output_e_tf}, + ), + epochs=1, + steps_per_epoch=2, + validation_steps=2, + verbose=0, + ) + model.train_on_batch( + {"input_a": input_a_tf, "input_b": input_b_tf}, + {"dense": output_d_tf, "dropout": output_e_tf}, + ) + + # Test with validation data + model.fit( + [input_a_tf, input_b_tf], + [output_d_tf, output_e_tf], + validation_data=( + [input_a_tf, input_b_tf], + [output_d_tf, output_e_tf], + ), + epochs=1, + steps_per_epoch=2, + validation_steps=2, + verbose=0, + ) + # Test evaluation / prediction methods + model.evaluate( + [input_a_tf, input_b_tf], + [output_d_tf, output_e_tf], + steps=2, + verbose=0, + ) + model.predict([input_a_tf, input_b_tf], steps=2) + model.test_on_batch( + [input_a_tf, input_b_tf], [output_d_tf, output_e_tf] + ) + + @tf_test_utils.run_deprecated_v1 + def test_model_with_input_feed_tensor(self): + """We test building a model with a TF variable as input. + + We should be able to call fit, evaluate, predict, + by only passing them data for the placeholder inputs + in the model. + """ + with tf.Graph().as_default(), self.cached_session(): + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_a_np = np.random.random((10, 4)) + output_b_np = np.random.random((10, 3)) + + input_v = tf.Variable(input_a_np, dtype="float32") + self.evaluate(tf.compat.v1.variables_initializer([input_v])) + a = input_layer.Input(tensor=input_v) + b = input_layer.Input(shape=(3,), name="input_b") + + a_2 = layers_module.Dense(4, name="dense_1")(a) + dp = layers_module.Dropout(0.5, name="dropout") + b_2 = dp(b) + + model = training_module.Model([a, b], [a_2, b_2]) + model.summary() + + optimizer = "rmsprop" + loss = "mse" + loss_weights = [1.0, 0.5] + model.compile( + optimizer, + loss, + metrics=["mean_squared_error"], + loss_weights=loss_weights, + sample_weight_mode=None, + ) + + # test train_on_batch + out = model.train_on_batch(input_b_np, [output_a_np, output_b_np]) + out = model.train_on_batch( + {"input_b": input_b_np}, [output_a_np, output_b_np] + ) + out = model.test_on_batch( + {"input_b": input_b_np}, [output_a_np, output_b_np] + ) + out = model.predict_on_batch({"input_b": input_b_np}) + + # test fit + out = model.fit( + {"input_b": input_b_np}, + [output_a_np, output_b_np], + epochs=1, + batch_size=10, + ) + out = model.fit( + input_b_np, [output_a_np, output_b_np], epochs=1, batch_size=10 + ) + + # test evaluate + out = model.evaluate( + {"input_b": input_b_np}, + [output_a_np, output_b_np], + batch_size=10, + ) + out = model.evaluate( + input_b_np, [output_a_np, output_b_np], batch_size=10 + ) + + # test predict + out = model.predict({"input_b": input_b_np}, batch_size=10) + out = model.predict(input_b_np, batch_size=10) + self.assertEqual(len(out), 2) + + # Now test a model with a single input + # i.e. we don't pass any data to fit the model. 
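+ # (The model's only input is fed by the `input_v` variable, so x may + # be None, [], or {}; fit/evaluate/predict are driven purely by the + # steps arguments below.)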
+ self.evaluate(tf.compat.v1.variables_initializer([input_v])) + a = input_layer.Input(tensor=input_v) + a_2 = layers_module.Dense(4, name="dense_1")(a) + a_2 = layers_module.Dropout(0.5, name="dropout")(a_2) + model = training_module.Model(a, a_2) + model.summary() + + optimizer = "rmsprop" + loss = "mse" + model.compile(optimizer, loss, metrics=["mean_squared_error"]) + + # test train_on_batch + out = model.train_on_batch(None, output_a_np) + out = model.train_on_batch(None, output_a_np) + out = model.test_on_batch(None, output_a_np) + out = model.predict_on_batch(None) + out = model.train_on_batch([], output_a_np) + out = model.train_on_batch({}, output_a_np) + + # test fit + _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=3) + _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=3) + + # test evaluate + _ = model.evaluate(None, output_a_np, steps=3) + _ = model.evaluate(None, output_a_np, steps=3) + + # test predict + out = model.predict(None, steps=3) + out = model.predict(None, steps=3) + self.assertEqual(out.shape, (10 * 3, 4)) + + # Same, without learning phase + # i.e. we don't pass any data to fit the model. + self.evaluate(tf.compat.v1.variables_initializer([input_v])) + a = input_layer.Input(tensor=input_v) + a_2 = layers_module.Dense(4, name="dense_1")(a) + model = training_module.Model(a, a_2) + model.summary() + + optimizer = "rmsprop" + loss = "mse" + model.compile(optimizer, loss, metrics=["mean_squared_error"]) + + # test train_on_batch + out = model.train_on_batch(None, output_a_np) + out = model.train_on_batch(None, output_a_np) + out = model.test_on_batch(None, output_a_np) + out = model.predict_on_batch(None) + out = model.train_on_batch([], output_a_np) + out = model.train_on_batch({}, output_a_np) + + # test fit + _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=10) + _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=10) + + # test evaluate + _ = model.evaluate(None, output_a_np, steps=10) + _ = model.evaluate(None, output_a_np, steps=10) + + # test predict + out = model.predict(None, steps=3) + out = model.predict(None, steps=3) + self.assertEqual(out.shape, (10 * 3, 4)) + + @test_combinations.run_all_keras_modes + def test_model_with_partial_loss(self): + with self.cached_session(): + a = input_layer.Input(shape=(3,), name="input_a") + a_2 = layers_module.Dense(4, name="dense_1")(a) + dp = layers_module.Dropout(0.5, name="dropout") + a_3 = dp(a_2) + model = training_module.Model(a, [a_2, a_3]) + + optimizer = "rmsprop" + loss = {"dropout": "mse"} + model.compile(optimizer, loss, metrics=["mae"]) + + input_a_np = np.random.random((10, 3)) + output_a_np = np.random.random((10, 4)) + + # test train_on_batch + _ = model.train_on_batch(input_a_np, output_a_np) + _ = model.test_on_batch(input_a_np, output_a_np) + # fit + _ = model.fit(input_a_np, output_a_np) + # evaluate + _ = model.evaluate(input_a_np, output_a_np) + + # Same without dropout. 
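+ # (Only the dense_2 output gets a loss here; dense_1 contributes just + # a mae metric, so training still runs with a partial loss mapping.)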
+ a = input_layer.Input(shape=(3,), name="input_a") + a_2 = layers_module.Dense(4, name="dense_1")(a) + a_3 = layers_module.Dense(4, name="dense_2")(a_2) + model = training_module.Model(a, [a_2, a_3]) + + optimizer = "rmsprop" + loss = {"dense_2": "mse"} + model.compile(optimizer, loss, metrics={"dense_1": "mae"}) + + # test train_on_batch + _ = model.train_on_batch(input_a_np, output_a_np) + _ = model.test_on_batch(input_a_np, output_a_np) + # fit + _ = model.fit(input_a_np, output_a_np) + # evaluate + _ = model.evaluate(input_a_np, output_a_np) + + def test_model_with_external_loss(self): + with tf.Graph().as_default(), self.cached_session(): + # None loss, only regularization loss. + a = input_layer.Input(shape=(3,), name="input_a") + a_2 = layers_module.Dense( + 4, + name="dense_1", + kernel_regularizer="l1", + bias_regularizer="l2", + )(a) + dp = layers_module.Dropout(0.5, name="dropout") + a_3 = dp(a_2) + + model = training_module.Model(a, [a_2, a_3]) + + optimizer = "rmsprop" + loss = None + model.compile(optimizer, loss, metrics=["mae"]) + + input_a_np = np.random.random((10, 3)) + + # test train_on_batch + out = model.train_on_batch(input_a_np, None) + out = model.test_on_batch(input_a_np, None) + # fit + out = model.fit(input_a_np, None) + # evaluate + out = model.evaluate(input_a_np, None) + + # No dropout, external loss. + a = input_layer.Input(shape=(3,), name="input_a") + a_2 = layers_module.Dense(4, name="dense_1")(a) + a_3 = layers_module.Dense(4, name="dense_2")(a) + + model = training_module.Model(a, [a_2, a_3]) + model.add_loss(backend.mean(a_3 + a_2)) + + optimizer = "rmsprop" + loss = None + model.compile(optimizer, loss, metrics=["mae"]) + + # test train_on_batch + out = model.train_on_batch(input_a_np, None) + out = model.test_on_batch(input_a_np, None) + # fit + out = model.fit(input_a_np, None) + # evaluate + out = model.evaluate(input_a_np, None) + + # Test model with no external data at all. + input_v = tf.Variable(input_a_np, dtype="float32") + self.evaluate(tf.compat.v1.variables_initializer([input_v])) + a = input_layer.Input(tensor=input_v) + a_2 = layers_module.Dense(4, name="dense_1")(a) + a_2 = layers_module.Dropout(0.5, name="dropout")(a_2) + model = training_module.Model(a, a_2) + model.add_loss(backend.mean(a_2)) + + model.compile( + optimizer="rmsprop", loss=None, metrics=["mean_squared_error"] + ) + + # test train_on_batch + out = model.train_on_batch(None, None) + out = model.test_on_batch(None, None) + out = model.predict_on_batch(None) + + # Test multi-output model with no external data at all. 
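+ # (As in the single-output case, the input is fed by `input_v` and + # the loss comes from add_loss, so train/test/predict below pass no + # data at all.)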
+ self.evaluate(tf.compat.v1.variables_initializer([input_v])) + a = input_layer.Input(tensor=input_v) + a_1 = layers_module.Dense(4, name="dense_1")(a) + a_2 = layers_module.Dropout(0.5, name="dropout")(a_1) + model = training_module.Model(a, [a_1, a_2]) + model.add_loss(backend.mean(a_2)) + + model.compile( + optimizer="rmsprop", loss=None, metrics=["mean_squared_error"] + ) + + # test train_on_batch + out = model.train_on_batch(None, None) + out = model.test_on_batch(None, None) + out = model.predict_on_batch(None) + + out = model.predict(None, steps=3) + self.assertEqual(len(out), 2) + self.assertEqual(out[0].shape, (10 * 3, 4)) + self.assertEqual(out[1].shape, (10 * 3, 4)) + + def test_target_tensors(self): + with tf.Graph().as_default(), self.cached_session(): + # single-output, as list + model = sequential.Sequential() + model.add(layers_module.Dense(4, input_shape=(4,), name="dense")) + input_val = np.random.random((10, 4)) + target_val = np.random.random((10, 4)) + target = backend.variable(target_val) + model.compile( + optimizer="rmsprop", loss="mse", target_tensors=[target] + ) + model.train_on_batch(input_val, None) + + # single-output, as single tensor + model.compile( + optimizer="rmsprop", loss="mse", target_tensors=target + ) + model.train_on_batch(input_val, None) + + # single-output, as dict + model.compile( + optimizer="rmsprop", + loss="mse", + target_tensors={"dense": target}, + ) + model.train_on_batch(input_val, None) + + # test invalid arguments + with self.assertRaises(TypeError): + model.compile( + optimizer="rmsprop", loss="mse", target_tensors=set() + ) + with self.assertRaises(ValueError): + model.compile( + optimizer="rmsprop", + loss="mse", + target_tensors=[target, target], + ) + with self.assertRaises(ValueError): + model.compile( + optimizer="rmsprop", + loss="mse", + target_tensors={"dense2": None}, + ) + with self.assertRaises(ValueError): + model.compile( + optimizer="rmsprop", loss="mse", target_tensors=[target] + ) + model.train_on_batch(input_val, target_val) + + # multi-output, as list + input_val = np.random.random((10, 4)) + target_val_a = np.random.random((10, 4)) + target_val_b = np.random.random((10, 4)) + target_a = backend.variable(target_val_a) + target_b = backend.variable(target_val_b) + + inputs = layers_module.Input(shape=(4,)) + output_a = layers_module.Dense(4, name="dense_a")(inputs) + output_b = layers_module.Dense(4, name="dense_b")(inputs) + model = training_module.Model(inputs, [output_a, output_b]) + model.compile( + optimizer="rmsprop", + loss="mse", + target_tensors=[target_a, target_b], + ) + model.train_on_batch(input_val, None) + + # multi-output, as dict + model.compile( + optimizer="rmsprop", + loss="mse", + target_tensors={"dense_a": target_a, "dense_b": target_b}, + ) + model.train_on_batch(input_val, None) + + # test with sample weights + model.compile( + optimizer="rmsprop", + loss="mse", + metrics=["mae", metrics_module.CategoricalAccuracy()], + target_tensors=[target_a, target_b], + ) + model.train_on_batch( + input_val, + None, + sample_weight={"dense_a": np.random.random((10,))}, + ) + + def test_model_custom_target_tensors(self): + with tf.Graph().as_default(), self.cached_session(): + a = input_layer.Input(shape=(3,), name="input_a") + b = input_layer.Input(shape=(3,), name="input_b") + + a_2 = layers_module.Dense(4, name="dense_1")(a) + dp = layers_module.Dropout(0.5, name="dropout") + b_2 = dp(b) + + y = backend.placeholder([10, 4], name="y") + y1 = backend.placeholder([10, 3], name="y1") + y2 = 
backend.placeholder([7, 5], name="y2") + model = training_module.Model([a, b], [a_2, b_2]) + + optimizer = "rmsprop" + loss = "mse" + loss_weights = [1.0, 0.5] + + # test list of target tensors + with self.assertRaises(ValueError): + model.compile( + optimizer, + loss, + metrics=[], loss_weights=loss_weights, - sample_weight_mode=None) - - # test train_on_batch - out = model.train_on_batch(input_b_np, - [output_a_np, output_b_np]) - out = model.train_on_batch({'input_b': input_b_np}, - [output_a_np, output_b_np]) - out = model.test_on_batch({'input_b': input_b_np}, - [output_a_np, output_b_np]) - out = model.predict_on_batch({'input_b': input_b_np}) - - # test fit - out = model.fit({'input_b': input_b_np}, - [output_a_np, output_b_np], epochs=1, batch_size=10) - out = model.fit(input_b_np, - [output_a_np, output_b_np], epochs=1, batch_size=10) - - # test evaluate - out = model.evaluate({'input_b': input_b_np}, - [output_a_np, output_b_np], batch_size=10) - out = model.evaluate(input_b_np, - [output_a_np, output_b_np], batch_size=10) - - # test predict - out = model.predict({'input_b': input_b_np}, batch_size=10) - out = model.predict(input_b_np, batch_size=10) - self.assertEqual(len(out), 2) - - # Now test a model with a single input - # i.e. we don't pass any data to fit the model. - self.evaluate(tf.compat.v1.variables_initializer([input_v])) - a = input_layer.Input(tensor=input_v) - a_2 = layers_module.Dense(4, name='dense_1')(a) - a_2 = layers_module.Dropout(0.5, name='dropout')(a_2) - model = training_module.Model(a, a_2) - model.summary() - - optimizer = 'rmsprop' - loss = 'mse' - model.compile(optimizer, loss, metrics=['mean_squared_error']) - - # test train_on_batch - out = model.train_on_batch(None, - output_a_np) - out = model.train_on_batch(None, - output_a_np) - out = model.test_on_batch(None, - output_a_np) - out = model.predict_on_batch(None) - out = model.train_on_batch([], - output_a_np) - out = model.train_on_batch({}, - output_a_np) - - # test fit - _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=3) - _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=3) - - # test evaluate - _ = model.evaluate(None, output_a_np, steps=3) - _ = model.evaluate(None, output_a_np, steps=3) - - # test predict - out = model.predict(None, steps=3) - out = model.predict(None, steps=3) - self.assertEqual(out.shape, (10 * 3, 4)) - - # Same, without learning phase - # i.e. we don't pass any data to fit the model. 
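[Editor's sketch] target_tensors, exercised here and in test_target_tensors above, is a graph-mode-only compile argument (note the tests wrap themselves in tf.Graph().as_default()): instead of letting Keras create target placeholders, you hand compile() an existing tensor and pass y=None when training. A sketch under that assumption, using the TF1 compatibility path:

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # target_tensors only exists on the graph-mode path

model = tf.keras.Sequential(
    [tf.keras.layers.Dense(4, input_shape=(4,), name="dense")]
)
target = tf.keras.backend.variable(np.random.random((10, 4)))

# Targets are read from `target`, so y is None in the train call.
model.compile(optimizer="rmsprop", loss="mse", target_tensors=[target])
model.train_on_batch(np.random.random((10, 4)), None)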
- self.evaluate(tf.compat.v1.variables_initializer([input_v])) - a = input_layer.Input(tensor=input_v) - a_2 = layers_module.Dense(4, name='dense_1')(a) - model = training_module.Model(a, a_2) - model.summary() - - optimizer = 'rmsprop' - loss = 'mse' - model.compile(optimizer, loss, metrics=['mean_squared_error']) - - # test train_on_batch - out = model.train_on_batch(None, - output_a_np) - out = model.train_on_batch(None, - output_a_np) - out = model.test_on_batch(None, - output_a_np) - out = model.predict_on_batch(None) - out = model.train_on_batch([], - output_a_np) - out = model.train_on_batch({}, - output_a_np) - - # test fit - _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=10) - _ = model.fit(None, output_a_np, epochs=1, steps_per_epoch=10) - - # test evaluate - _ = model.evaluate(None, output_a_np, steps=10) - _ = model.evaluate(None, output_a_np, steps=10) - - # test predict - out = model.predict(None, steps=3) - out = model.predict(None, steps=3) - self.assertEqual(out.shape, (10 * 3, 4)) - - @test_combinations.run_all_keras_modes - def test_model_with_partial_loss(self): - with self.cached_session(): - a = input_layer.Input(shape=(3,), name='input_a') - a_2 = layers_module.Dense(4, name='dense_1')(a) - dp = layers_module.Dropout(0.5, name='dropout') - a_3 = dp(a_2) - model = training_module.Model(a, [a_2, a_3]) - - optimizer = 'rmsprop' - loss = {'dropout': 'mse'} - model.compile(optimizer, loss, metrics=['mae']) - - input_a_np = np.random.random((10, 3)) - output_a_np = np.random.random((10, 4)) - - # test train_on_batch - _ = model.train_on_batch(input_a_np, output_a_np) - _ = model.test_on_batch(input_a_np, output_a_np) - # fit - _ = model.fit(input_a_np, output_a_np) - # evaluate - _ = model.evaluate(input_a_np, output_a_np) - - # Same without dropout. - a = input_layer.Input(shape=(3,), name='input_a') - a_2 = layers_module.Dense(4, name='dense_1')(a) - a_3 = layers_module.Dense(4, name='dense_2')(a_2) - model = training_module.Model(a, [a_2, a_3]) - - optimizer = 'rmsprop' - loss = {'dense_2': 'mse'} - model.compile(optimizer, loss, metrics={'dense_1': 'mae'}) - - # test train_on_batch - _ = model.train_on_batch(input_a_np, output_a_np) - _ = model.test_on_batch(input_a_np, output_a_np) - # fit - _ = model.fit(input_a_np, output_a_np) - # evaluate - _ = model.evaluate(input_a_np, output_a_np) - - def test_model_with_external_loss(self): - with tf.Graph().as_default(), self.cached_session(): - # None loss, only regularization loss. - a = input_layer.Input(shape=(3,), name='input_a') - a_2 = layers_module.Dense( - 4, name='dense_1', kernel_regularizer='l1', bias_regularizer='l2')( - a) - dp = layers_module.Dropout(0.5, name='dropout') - a_3 = dp(a_2) - - model = training_module.Model(a, [a_2, a_3]) - - optimizer = 'rmsprop' - loss = None - model.compile(optimizer, loss, metrics=['mae']) - - input_a_np = np.random.random((10, 3)) - - # test train_on_batch - out = model.train_on_batch(input_a_np, None) - out = model.test_on_batch(input_a_np, None) - # fit - out = model.fit(input_a_np, None) - # evaluate - out = model.evaluate(input_a_np, None) - - # No dropout, external loss. 
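[Editor's note] On the shape assertions in these tensor-fed tests: when the input is bound to a tensor (Input(tensor=input_v) over a 10-row variable), predict(None, steps=3) evaluates the graph once per step and concatenates the per-step outputs, which is where (10 * 3, 4) comes from:

# Why the tests assert out.shape == (10 * 3, 4).
rows_per_step = 10  # the bound input variable holds 10 rows
steps = 3           # predict(None, steps=3) runs the graph 3 times
width = 4           # Dense(4) output width
assert (rows_per_step * steps, width) == (30, 4)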
- a = input_layer.Input(shape=(3,), name='input_a') - a_2 = layers_module.Dense(4, name='dense_1')(a) - a_3 = layers_module.Dense(4, name='dense_2')(a) - - model = training_module.Model(a, [a_2, a_3]) - model.add_loss(backend.mean(a_3 + a_2)) - - optimizer = 'rmsprop' - loss = None - model.compile(optimizer, loss, metrics=['mae']) - - # test train_on_batch - out = model.train_on_batch(input_a_np, None) - out = model.test_on_batch(input_a_np, None) - # fit - out = model.fit(input_a_np, None) - # evaluate - out = model.evaluate(input_a_np, None) - - # Test model with no external data at all. - input_v = tf.Variable(input_a_np, dtype='float32') - self.evaluate(tf.compat.v1.variables_initializer([input_v])) - a = input_layer.Input(tensor=input_v) - a_2 = layers_module.Dense(4, name='dense_1')(a) - a_2 = layers_module.Dropout(0.5, name='dropout')(a_2) - model = training_module.Model(a, a_2) - model.add_loss(backend.mean(a_2)) - - model.compile(optimizer='rmsprop', - loss=None, - metrics=['mean_squared_error']) - - # test train_on_batch - out = model.train_on_batch(None, None) - out = model.test_on_batch(None, None) - out = model.predict_on_batch(None) - - # Test multi-output model with no external data at all. - self.evaluate(tf.compat.v1.variables_initializer([input_v])) - a = input_layer.Input(tensor=input_v) - a_1 = layers_module.Dense(4, name='dense_1')(a) - a_2 = layers_module.Dropout(0.5, name='dropout')(a_1) - model = training_module.Model(a, [a_1, a_2]) - model.add_loss(backend.mean(a_2)) - - model.compile(optimizer='rmsprop', - loss=None, - metrics=['mean_squared_error']) - - # test train_on_batch - out = model.train_on_batch(None, None) - out = model.test_on_batch(None, None) - out = model.predict_on_batch(None) - - out = model.predict(None, steps=3) - self.assertEqual(len(out), 2) - self.assertEqual(out[0].shape, (10 * 3, 4)) - self.assertEqual(out[1].shape, (10 * 3, 4)) - - def test_target_tensors(self): - with tf.Graph().as_default(), self.cached_session(): - # single-output, as list - model = sequential.Sequential() - model.add(layers_module.Dense(4, input_shape=(4,), name='dense')) - input_val = np.random.random((10, 4)) - target_val = np.random.random((10, 4)) - target = backend.variable(target_val) - model.compile(optimizer='rmsprop', loss='mse', target_tensors=[target]) - model.train_on_batch(input_val, None) - - # single-output, as single tensor - model.compile(optimizer='rmsprop', loss='mse', target_tensors=target) - model.train_on_batch(input_val, None) - - # single-output, as dict - model.compile(optimizer='rmsprop', loss='mse', - target_tensors={'dense': target}) - model.train_on_batch(input_val, None) - - # test invalid arguments - with self.assertRaises(TypeError): - model.compile(optimizer='rmsprop', loss='mse', - target_tensors=set()) - with self.assertRaises(ValueError): - model.compile(optimizer='rmsprop', loss='mse', - target_tensors=[target, target]) - with self.assertRaises(ValueError): - model.compile(optimizer='rmsprop', loss='mse', - target_tensors={'dense2': None}) - with self.assertRaises(ValueError): - model.compile(optimizer='rmsprop', loss='mse', - target_tensors=[target]) - model.train_on_batch(input_val, target_val) - - # multi-output, as list - input_val = np.random.random((10, 4)) - target_val_a = np.random.random((10, 4)) - target_val_b = np.random.random((10, 4)) - target_a = backend.variable(target_val_a) - target_b = backend.variable(target_val_b) - - inputs = layers_module.Input(shape=(4,)) - output_a = layers_module.Dense(4, 
name='dense_a')(inputs) - output_b = layers_module.Dense(4, name='dense_b')(inputs) - model = training_module.Model(inputs, [output_a, output_b]) - model.compile(optimizer='rmsprop', loss='mse', - target_tensors=[target_a, target_b]) - model.train_on_batch(input_val, None) - - # multi-output, as dict - model.compile(optimizer='rmsprop', loss='mse', - target_tensors={'dense_a': target_a, - 'dense_b': target_b}) - model.train_on_batch(input_val, None) - - # test with sample weights - model.compile( - optimizer='rmsprop', - loss='mse', - metrics=['mae', metrics_module.CategoricalAccuracy()], - target_tensors=[target_a, target_b]) - model.train_on_batch(input_val, None, - sample_weight={'dense_a': np.random.random((10,))}) - - def test_model_custom_target_tensors(self): - with tf.Graph().as_default(), self.cached_session(): - a = input_layer.Input(shape=(3,), name='input_a') - b = input_layer.Input(shape=(3,), name='input_b') - - a_2 = layers_module.Dense(4, name='dense_1')(a) - dp = layers_module.Dropout(0.5, name='dropout') - b_2 = dp(b) - - y = backend.placeholder([10, 4], name='y') - y1 = backend.placeholder([10, 3], name='y1') - y2 = backend.placeholder([7, 5], name='y2') - model = training_module.Model([a, b], [a_2, b_2]) - - optimizer = 'rmsprop' - loss = 'mse' - loss_weights = [1., 0.5] - - # test list of target tensors - with self.assertRaises(ValueError): - model.compile(optimizer, loss, metrics=[], loss_weights=loss_weights, - sample_weight_mode=None, target_tensors=[y, y1, y2]) - model.compile(optimizer, loss, metrics=[], loss_weights=loss_weights, - sample_weight_mode=None, target_tensors=[y, y1]) - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - - output_a_np = np.random.random((10, 4)) - output_b_np = np.random.random((10, 3)) - - _ = model.train_on_batch([input_a_np, input_b_np], - [output_a_np, output_b_np], { - 'dense_1': np.random.random((10,)), - 'dropout': np.random.random((10,)) - }) - # test dictionary of target_tensors - with self.assertRaises(ValueError): - model.compile(optimizer, loss, - metrics=[], - loss_weights=loss_weights, - sample_weight_mode=None, - target_tensors={'does_not_exist': y2}) - # test dictionary of target_tensors - model.compile(optimizer, loss, + sample_weight_mode=None, + target_tensors=[y, y1, y2], + ) + model.compile( + optimizer, + loss, + metrics=[], + loss_weights=loss_weights, + sample_weight_mode=None, + target_tensors=[y, y1], + ) + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_a_np = np.random.random((10, 4)) + output_b_np = np.random.random((10, 3)) + + _ = model.train_on_batch( + [input_a_np, input_b_np], + [output_a_np, output_b_np], + { + "dense_1": np.random.random((10,)), + "dropout": np.random.random((10,)), + }, + ) + # test dictionary of target_tensors + with self.assertRaises(ValueError): + model.compile( + optimizer, + loss, metrics=[], loss_weights=loss_weights, sample_weight_mode=None, - target_tensors={'dense_1': y, 'dropout': y1}) - _ = model.train_on_batch([input_a_np, input_b_np], - [output_a_np, output_b_np], { - 'dense_1': np.random.random((10,)), - 'dropout': np.random.random((10,)) - }) - - # test with custom TF placeholder as target - pl_target_a = tf.compat.v1.placeholder('float32', shape=(None, 4)) - model.compile(optimizer='rmsprop', loss='mse', - target_tensors={'dense_1': pl_target_a}) - model.train_on_batch([input_a_np, input_b_np], - [output_a_np, output_b_np]) + target_tensors={"does_not_exist": y2}, + ) + # test dictionary of 
target_tensors + model.compile( + optimizer, + loss, + metrics=[], + loss_weights=loss_weights, + sample_weight_mode=None, + target_tensors={"dense_1": y, "dropout": y1}, + ) + _ = model.train_on_batch( + [input_a_np, input_b_np], + [output_a_np, output_b_np], + { + "dense_1": np.random.random((10,)), + "dropout": np.random.random((10,)), + }, + ) + + # test with custom TF placeholder as target + pl_target_a = tf.compat.v1.placeholder("float32", shape=(None, 4)) + model.compile( + optimizer="rmsprop", + loss="mse", + target_tensors={"dense_1": pl_target_a}, + ) + model.train_on_batch( + [input_a_np, input_b_np], [output_a_np, output_b_np] + ) class TestTrainingWithMetrics(test_combinations.TestCase): - """Training tests related to metrics.""" - - @test_combinations.run_all_keras_modes - def test_metrics_names(self): - a = layers_module.Input(shape=(3,), name='input_a') - b = layers_module.Input(shape=(3,), name='input_b') - - dense = layers_module.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = layers_module.Dropout(0.5, name='dropout')(c) - - model = training_module.Model([a, b], [d, e]) - - optimizer = RMSPropOptimizer(learning_rate=0.001) - metrics = ['mse', metrics_module.BinaryAccuracy()] - model.compile( - optimizer, - loss='mae', - metrics=metrics, - run_eagerly=test_utils.should_run_eagerly()) - - mse_metric = 'mse' if tf.executing_eagerly() else 'mean_squared_error' - reference_metric_names = [ - 'loss', 'dense_loss', 'dropout_loss', 'dense_' + mse_metric, - 'dense_binary_accuracy', 'dropout_' + mse_metric, - 'dropout_binary_accuracy' - ] - - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - - output_d_np = np.random.random((10, 4)) - output_e_np = np.random.random((10, 4)) - - model.fit([input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - batch_size=5) - self.assertEqual(reference_metric_names, model.metrics_names) - - @test_combinations.run_all_keras_modes - def test_metric_state_reset_between_fit_and_evaluate(self): - model = sequential.Sequential() - model.add(layers_module.Dense(3, activation='relu', input_dim=4)) - model.add(layers_module.Dense(1, activation='sigmoid')) - acc_obj = metrics_module.BinaryAccuracy() - model.compile( - loss='mae', - metrics=[acc_obj], - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=test_utils.should_run_eagerly()) - - x_train = np.random.random((100, 4)) - y_train = np.random.random((100, 1)) - model.fit(x_train, y_train, batch_size=5, epochs=2) - self.assertEqual(self.evaluate(acc_obj.count), 100) - - x_test = np.random.random((10, 4)) - y_test = np.random.random((10, 1)) - model.evaluate(x_test, y_test, batch_size=5) - self.assertEqual(self.evaluate(acc_obj.count), 10) - - @test_combinations.run_all_keras_modes - def test_metric_state_reset_between_test_on_batch_and_evaluate(self): - model = sequential.Sequential() - model.add(layers_module.Dense(3, activation='relu', input_dim=4)) - model.add(layers_module.Dense(1, activation='sigmoid')) - acc_obj = metrics_module.BinaryAccuracy() - model.compile( - loss='mae', - metrics=[acc_obj], - optimizer=RMSPropOptimizer(learning_rate=0.001), - run_eagerly=test_utils.should_run_eagerly()) - - x_test = np.random.random((10, 4)) - y_test = np.random.random((10, 1)) - loss, acc = model.test_on_batch(x_test[:2], y_test[:2]) - loss_eval, acc_eval = model.evaluate(x_test, y_test) - loss_1, acc_1 = model.test_on_batch(x_test[:2], y_test[:2]) - loss_eval_1, acc_eval_1 = model.evaluate(x_test, y_test) - self.assertEqual(loss, loss_1) - 
self.assertEqual(acc, acc_1) - self.assertEqual(loss_eval, loss_eval_1) - self.assertEqual(acc_eval, acc_eval_1) - - @test_combinations.run_with_all_model_types(exclude_models=['sequential']) - @test_combinations.run_all_keras_modes - def test_metrics_valid_compile_input_formats(self): - inp_1 = layers_module.Input(shape=(1,), name='input_1') - inp_2 = layers_module.Input(shape=(1,), name='input_2') - x = layers_module.Dense(3, kernel_initializer='ones', trainable=False) - out_1 = layers_module.Dense( - 1, kernel_initializer='ones', name='output_1', trainable=False) - out_2 = layers_module.Dense( - 1, kernel_initializer='ones', name='output_2', trainable=False) - - branch_a = [inp_1, x, out_1] - branch_b = [inp_2, x, out_2] - model = test_utils.get_multi_io_model(branch_a, branch_b) - - # list of metrics. - model.compile( - optimizer='rmsprop', - loss='mse', - metrics=[metrics_module.MeanSquaredError()], - weighted_metrics=[metrics_module.MeanSquaredError()], - run_eagerly=test_utils.should_run_eagerly()) - - # list of list of metrics. - model.compile( - optimizer='rmsprop', - loss='mse', - metrics=[ - metrics_module.MeanSquaredError(), - [metrics_module.MeanSquaredError(), - metrics_module.Accuracy()] - ], - weighted_metrics=[ - metrics_module.MeanSquaredError(), - [metrics_module.MeanSquaredError(), - metrics_module.Accuracy()] - ], - run_eagerly=test_utils.should_run_eagerly()) - - # dict of metrics. - model.compile( - optimizer='rmsprop', - loss='mse', - metrics={ - 'output_1': - metrics_module.MeanSquaredError(), - 'output_2': [ + """Training tests related to metrics.""" + + @test_combinations.run_all_keras_modes + def test_metrics_names(self): + a = layers_module.Input(shape=(3,), name="input_a") + b = layers_module.Input(shape=(3,), name="input_b") + + dense = layers_module.Dense(4, name="dense") + c = dense(a) + d = dense(b) + e = layers_module.Dropout(0.5, name="dropout")(c) + + model = training_module.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + metrics = ["mse", metrics_module.BinaryAccuracy()] + model.compile( + optimizer, + loss="mae", + metrics=metrics, + run_eagerly=test_utils.should_run_eagerly(), + ) + + mse_metric = "mse" if tf.executing_eagerly() else "mean_squared_error" + reference_metric_names = [ + "loss", + "dense_loss", + "dropout_loss", + "dense_" + mse_metric, + "dense_binary_accuracy", + "dropout_" + mse_metric, + "dropout_binary_accuracy", + ] + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=1, + batch_size=5, + ) + self.assertEqual(reference_metric_names, model.metrics_names) + + @test_combinations.run_all_keras_modes + def test_metric_state_reset_between_fit_and_evaluate(self): + model = sequential.Sequential() + model.add(layers_module.Dense(3, activation="relu", input_dim=4)) + model.add(layers_module.Dense(1, activation="sigmoid")) + acc_obj = metrics_module.BinaryAccuracy() + model.compile( + loss="mae", + metrics=[acc_obj], + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x_train = np.random.random((100, 4)) + y_train = np.random.random((100, 1)) + model.fit(x_train, y_train, batch_size=5, epochs=2) + self.assertEqual(self.evaluate(acc_obj.count), 100) + + x_test = np.random.random((10, 4)) + y_test = np.random.random((10, 1)) + model.evaluate(x_test, y_test, 
batch_size=5) + self.assertEqual(self.evaluate(acc_obj.count), 10) + + @test_combinations.run_all_keras_modes + def test_metric_state_reset_between_test_on_batch_and_evaluate(self): + model = sequential.Sequential() + model.add(layers_module.Dense(3, activation="relu", input_dim=4)) + model.add(layers_module.Dense(1, activation="sigmoid")) + acc_obj = metrics_module.BinaryAccuracy() + model.compile( + loss="mae", + metrics=[acc_obj], + optimizer=RMSPropOptimizer(learning_rate=0.001), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x_test = np.random.random((10, 4)) + y_test = np.random.random((10, 1)) + loss, acc = model.test_on_batch(x_test[:2], y_test[:2]) + loss_eval, acc_eval = model.evaluate(x_test, y_test) + loss_1, acc_1 = model.test_on_batch(x_test[:2], y_test[:2]) + loss_eval_1, acc_eval_1 = model.evaluate(x_test, y_test) + self.assertEqual(loss, loss_1) + self.assertEqual(acc, acc_1) + self.assertEqual(loss_eval, loss_eval_1) + self.assertEqual(acc_eval, acc_eval_1) + + @test_combinations.run_with_all_model_types(exclude_models=["sequential"]) + @test_combinations.run_all_keras_modes + def test_metrics_valid_compile_input_formats(self): + inp_1 = layers_module.Input(shape=(1,), name="input_1") + inp_2 = layers_module.Input(shape=(1,), name="input_2") + x = layers_module.Dense(3, kernel_initializer="ones", trainable=False) + out_1 = layers_module.Dense( + 1, kernel_initializer="ones", name="output_1", trainable=False + ) + out_2 = layers_module.Dense( + 1, kernel_initializer="ones", name="output_2", trainable=False + ) + + branch_a = [inp_1, x, out_1] + branch_b = [inp_2, x, out_2] + model = test_utils.get_multi_io_model(branch_a, branch_b) + + # list of metrics. + model.compile( + optimizer="rmsprop", + loss="mse", + metrics=[metrics_module.MeanSquaredError()], + weighted_metrics=[metrics_module.MeanSquaredError()], + run_eagerly=test_utils.should_run_eagerly(), + ) + + # list of list of metrics. + model.compile( + optimizer="rmsprop", + loss="mse", + metrics=[ metrics_module.MeanSquaredError(), - metrics_module.Accuracy() + [metrics_module.MeanSquaredError(), metrics_module.Accuracy()], ], - }, - weighted_metrics={ - 'output_1': + weighted_metrics=[ metrics_module.MeanSquaredError(), - 'output_2': [ - metrics_module.MeanSquaredError(), - metrics_module.Accuracy() + [metrics_module.MeanSquaredError(), metrics_module.Accuracy()], ], - }, - run_eagerly=test_utils.should_run_eagerly()) - - @test_combinations.run_all_keras_modes - def test_metrics_masking(self): - np.random.seed(1337) - model = sequential.Sequential() - model.add(layers_module.Masking(mask_value=0, input_shape=(2, 1))) - model.add( - layers_module.TimeDistributed( - layers_module.Dense(1, kernel_initializer='ones'))) - model.compile( - RMSPropOptimizer(learning_rate=0.001), - loss='mse', - weighted_metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - - # verify that masking is applied. - x = np.array([[[1], [1]], [[1], [1]], [[0], [0]]]) - y = np.array([[[1], [1]], [[0], [1]], [[1], [1]]]) - scores = model.train_on_batch(x, y) - self.assertArrayNear(scores, [0.25, 0.75], 0.1) - - # verify that masking is combined with sample weights. 
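[Editor's sketch] test_metric_state_reset_between_fit_and_evaluate above pins down a subtle contract: a stateful metric object passed to compile() is reset at each epoch boundary and again by evaluate(), so its count variable reflects only the most recent pass over the data. A public-API sketch of the same behavior (eager mode assumed):

import numpy as np
import tensorflow as tf

acc = tf.keras.metrics.BinaryAccuracy()
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(1, activation="sigmoid", input_shape=(4,))]
)
model.compile(optimizer="rmsprop", loss="mae", metrics=[acc])

x_train = np.random.random((100, 4))
y_train = np.random.random((100, 1))
model.fit(x_train, y_train, batch_size=5, epochs=2, verbose=0)
print(acc.count.numpy())  # 100: state covers the last epoch only

x_test = np.random.random((10, 4))
y_test = np.random.random((10, 1))
model.evaluate(x_test, y_test, batch_size=5, verbose=0)
print(acc.count.numpy())  # 10: evaluate() reset the state first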
- w = np.array([3, 2, 4]) - scores = model.train_on_batch(x, y, sample_weight=w) - self.assertArrayNear(scores, [0.3328, 0.8], 0.001) - - @test_combinations.run_all_keras_modes - def test_add_metric_with_tensor_on_model(self): - x = layers_module.Input(shape=(1,)) - y = layers_module.Dense(1, kernel_initializer='ones')(x) - model = training_module.Model(x, y) - model.add_metric( - tf.reduce_sum(y), name='metric_1', aggregation='mean') - - if tf.executing_eagerly(): - # This is not a use case in v1 graph mode. - mean_result = metrics_module.Mean()(y) - with self.assertRaisesRegex( - ValueError, 'Expected a symbolic Tensor for the metric value'): - model.add_metric(mean_result, name='metric_2') - else: - with self.assertRaisesRegex( - ValueError, 'Using the result of calling a `Metric` object '): - with backend.get_graph().as_default(): - model.add_metric(metrics_module.Mean(name='metric_2')(y)) - - model.compile( - 'sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.ones(shape=(10, 1)) - targets = np.ones(shape=(10, 1)) - history = model.fit( - inputs, - targets, - epochs=2, - batch_size=5, - validation_data=(inputs, targets)) - self.assertEqual(history.history['metric_1'][-1], 5) - self.assertEqual(history.history['val_metric_1'][-1], 5) - - eval_results = model.evaluate(inputs, targets, batch_size=5) - self.assertEqual(eval_results[-1], 5) - - model.predict(inputs, batch_size=5) - model.train_on_batch(inputs, targets) - model.test_on_batch(inputs, targets) - - @test_combinations.run_all_keras_modes - def test_add_metric_in_model_call(self): - - class TestModel(training_module.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - self.mean = metrics_module.Mean(name='metric_1') - - def call(self, x): - self.add_metric( - tf.reduce_sum(x), name='metric_2', aggregation='mean') - # Provide same name as in the instance created in __init__ - # for eager mode - self.add_metric(self.mean(x), name='metric_1') - return self.dense1(x) - - model = TestModel() - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0) - self.assertAlmostEqual(history.history['val_metric_1'][-1], 1, 0) - self.assertAlmostEqual(history.history['metric_2'][-1], 5, 0) - self.assertAlmostEqual(history.history['val_metric_2'][-1], 5, 0) - - eval_results = model.evaluate(x, y, batch_size=5) - self.assertAlmostEqual(eval_results[1], 1, 0) - self.assertAlmostEqual(eval_results[2], 5, 0) - - model.predict(x, batch_size=5) - model.train_on_batch(x, y) - model.test_on_batch(x, y) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_add_metric_in_layer_call(self): - - class TestLayer(layers_module.Layer): - - def build(self, input_shape): - self.a = self.add_weight( - 'a', (1, 1), initializer='ones', trainable=False) - self.built = True - - def call(self, inputs): - self.add_metric( - tf.reduce_sum(inputs), name='metric_1', aggregation='mean') - return inputs + 1 - - layers = [ - TestLayer(input_shape=(1,)), - layers_module.Dense(2, kernel_initializer='ones') - ] - model = test_utils.get_model_from_layers(layers, input_shape=(1,)) - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(0.01), - 
run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - self.assertEqual(history.history['metric_1'][-1], 5) - self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_model_metrics_list(self): - - class LayerWithAddMetric(layers_module.Layer): - - def __init__(self): - super().__init__() - self.dense = layers_module.Dense(1, kernel_initializer='ones') - - def __call__(self, inputs): - outputs = self.dense(inputs) - self.add_metric( - tf.reduce_sum(outputs), name='metric_1', aggregation='mean') - return outputs - - class LayerWithNestedAddMetricLayer(layers_module.Layer): - - def __init__(self): - super().__init__() - self.layer = LayerWithAddMetric() - - def call(self, inputs): - outputs = self.layer(inputs) - self.add_metric( - tf.reduce_sum(outputs), name='metric_2', aggregation='mean') - return outputs - - x = layers_module.Input(shape=(1,)) - y = LayerWithNestedAddMetricLayer()(x) - - model = training_module.Model(x, y) - model.add_metric( - tf.reduce_sum(y), name='metric_3', aggregation='mean') - - if tf.executing_eagerly(): - # This is not a use case in v1 graph mode. - mean_result = metrics_module.Mean()(y) - with self.assertRaisesRegex( - ValueError, 'Expected a symbolic Tensor for the metric value'): - model.add_metric(mean_result, name='metric_4') - - else: - with self.assertRaisesRegex( - ValueError, 'Using the result of calling a `Metric` object '): - with backend.get_graph().as_default(): - model.add_metric(metrics_module.Mean(name='metric_4')(y)) - - model.compile( - 'sgd', - loss='mse', - metrics=[metrics_module.Accuracy('metric_4')], - run_eagerly=test_utils.should_run_eagerly()) - - model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) - - # Verify that the metrics added using `compile` and `add_metric` API are - # included - self.assertEqual([m.name for m in model.metrics], - ['loss', 'metric_4', 'metric_2', 'metric_1', 'metric_3']) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_model_metrics_list_in_call(self): - - class TestModel(training_module.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - - def call(self, x): - self.add_metric( - tf.reduce_sum(x), name='metric_1', aggregation='mean') - return self.dense1(x) - - model = TestModel() - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(0.01), - metrics=[metrics_module.Accuracy('acc')], - run_eagerly=test_utils.should_run_eagerly()) - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - - self.assertEqual([m.name for m in model.metrics], - ['loss', 'acc', 'metric_1']) - - @test_combinations.run_all_keras_modes - def test_multiple_add_metric_calls(self): - - class TestModel(training_module.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - self.mean1 = metrics_module.Mean(name='metric_1') - self.mean2 = metrics_module.Mean(name='metric_2') - - def call(self, x): - self.add_metric(self.mean2(x), name='metric_2') - self.add_metric(self.mean1(x), name='metric_1') - self.add_metric( - tf.reduce_sum(x), name='metric_3', aggregation='mean') - return self.dense1(x) - - model = TestModel() - self.assertListEqual([m.name for m in 
model.metrics], - ['metric_1', 'metric_2']) - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0) - self.assertAlmostEqual(history.history['metric_2'][-1], 1, 0) - self.assertAlmostEqual(history.history['metric_3'][-1], 5, 0) - - eval_results = model.evaluate(x, y, batch_size=5) - self.assertArrayNear(eval_results[1:4], [1, 1, 5], 0.1) - - model.predict(x, batch_size=5) - model.train_on_batch(x, y) - model.test_on_batch(x, y) - - @test_combinations.run_all_keras_modes - def test_multiple_add_metric_calls_layer(self): - - class TestLayer(layers_module.Layer): - - def __init__(self): - super().__init__(name='test_layer') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - self.m1 = metrics_module.Mean(name='m_1') - self.m2 = [ - metrics_module.Mean(name='m_2'), - metrics_module.Mean(name='m_3') + run_eagerly=test_utils.should_run_eagerly(), + ) + + # dict of metrics. + model.compile( + optimizer="rmsprop", + loss="mse", + metrics={ + "output_1": metrics_module.MeanSquaredError(), + "output_2": [ + metrics_module.MeanSquaredError(), + metrics_module.Accuracy(), + ], + }, + weighted_metrics={ + "output_1": metrics_module.MeanSquaredError(), + "output_2": [ + metrics_module.MeanSquaredError(), + metrics_module.Accuracy(), + ], + }, + run_eagerly=test_utils.should_run_eagerly(), + ) + + @test_combinations.run_all_keras_modes + def test_metrics_masking(self): + np.random.seed(1337) + model = sequential.Sequential() + model.add(layers_module.Masking(mask_value=0, input_shape=(2, 1))) + model.add( + layers_module.TimeDistributed( + layers_module.Dense(1, kernel_initializer="ones") + ) + ) + model.compile( + RMSPropOptimizer(learning_rate=0.001), + loss="mse", + weighted_metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + # verify that masking is applied. + x = np.array( + # third row is masked + [[[1], [1]], [[1], [1]], [[0], [0]]] + ) + y = np.array([[[1], [1]], [[0], [1]], [[1], [1]]]) + + scores = model.test_on_batch(x, y) + self.assertArrayNear(scores, [0.25, 0.75], 0.0001) + + # verify that masking is combined with sample weights. + w = np.array([3, 2, 4]) + scores = model.test_on_batch(x, y, sample_weight=w) + self.assertArrayNear(scores, [0.5, 0.8], 0.0001) + + scores = model.train_on_batch(x, y) + self.assertArrayNear(scores, [0.25, 0.75], 0.0001) + + scores = model.train_on_batch(x, y, sample_weight=w) + self.assertArrayNear(scores, [0.5 - 0.001037, 0.8], 0.0001) + + @test_combinations.run_all_keras_modes + def test_add_metric_with_tensor_on_model(self): + x = layers_module.Input(shape=(1,)) + y = layers_module.Dense(1, kernel_initializer="ones")(x) + model = training_module.Model(x, y) + model.add_metric(tf.reduce_sum(y), name="metric_1", aggregation="mean") + + if tf.executing_eagerly(): + # This is not a use case in v1 graph mode. 
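[Editor's note] The expected scores in test_metrics_masking above can be checked by hand. The ones-initialized TimeDistributed(Dense(1)) passes each timestep value straight through, and the third sample ([[0], [0]]) is masked out, so only two samples count. One consistent reading of the weighting (weights scale the per-sample values; the masked sample's weight is dropped) reproduces every asserted number, and the final train_on_batch expectation is ~0.001 lower only because the preceding train step has already nudged the kernel away from ones:

# Unweighted: sample 1 is perfect, sample 2 has squared errors [1, 0].
per_sample_mse = [0.0, 0.5]
loss = sum(per_sample_mse) / 2            # 0.25
per_sample_acc = [1.0, 0.5]               # sample 2: 1 of 2 timesteps right
weighted_acc = sum(per_sample_acc) / 2    # 0.75
assert (loss, weighted_acc) == (0.25, 0.75)

# With sample_weight w = [3, 2, 4]; the masked sample's weight is ignored.
w = [3.0, 2.0]
loss_w = (w[0] * 0.0 + w[1] * 0.5) / 2             # 0.5
acc_w = (w[0] * 1.0 + w[1] * 0.5) / (w[0] + w[1])  # 0.8
assert (loss_w, acc_w) == (0.5, 0.8)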
+ mean_result = metrics_module.Mean()(y) + with self.assertRaisesRegex( + ValueError, "Expected a symbolic Tensor for the metric value" + ): + model.add_metric(mean_result, name="metric_2") + else: + with self.assertRaisesRegex( + ValueError, "Using the result of calling a `Metric` object " + ): + with backend.get_graph().as_default(): + model.add_metric(metrics_module.Mean(name="metric_2")(y)) + + model.compile( + "sgd", loss="mse", run_eagerly=test_utils.should_run_eagerly() + ) + + inputs = np.ones(shape=(10, 1)) + targets = np.ones(shape=(10, 1)) + history = model.fit( + inputs, + targets, + epochs=2, + batch_size=5, + validation_data=(inputs, targets), + ) + self.assertEqual(history.history["metric_1"][-1], 5) + self.assertEqual(history.history["val_metric_1"][-1], 5) + + eval_results = model.evaluate(inputs, targets, batch_size=5) + self.assertEqual(eval_results[-1], 5) + + model.predict(inputs, batch_size=5) + model.train_on_batch(inputs, targets) + model.test_on_batch(inputs, targets) + + @test_combinations.run_all_keras_modes + def test_add_metric_in_model_call(self): + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + self.mean = metrics_module.Mean(name="metric_1") + + def call(self, x): + self.add_metric( + tf.reduce_sum(x), name="metric_2", aggregation="mean" + ) + # Provide same name as in the instance created in __init__ + # for eager mode + self.add_metric(self.mean(x), name="metric_1") + return self.dense1(x) + + model = TestModel() + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + history = model.fit( + x, y, epochs=2, batch_size=5, validation_data=(x, y) + ) + self.assertAlmostEqual(history.history["metric_1"][-1], 1, 0) + self.assertAlmostEqual(history.history["val_metric_1"][-1], 1, 0) + self.assertAlmostEqual(history.history["metric_2"][-1], 5, 0) + self.assertAlmostEqual(history.history["val_metric_2"][-1], 5, 0) + + eval_results = model.evaluate(x, y, batch_size=5) + self.assertAlmostEqual(eval_results[1], 1, 0) + self.assertAlmostEqual(eval_results[2], 5, 0) + + model.predict(x, batch_size=5) + model.train_on_batch(x, y) + model.test_on_batch(x, y) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_add_metric_in_layer_call(self): + class TestLayer(layers_module.Layer): + def build(self, input_shape): + self.a = self.add_weight( + "a", (1, 1), initializer="ones", trainable=False + ) + self.built = True + + def call(self, inputs): + self.add_metric( + tf.reduce_sum(inputs), name="metric_1", aggregation="mean" + ) + return inputs + 1 + + layers = [ + TestLayer(input_shape=(1,)), + layers_module.Dense(2, kernel_initializer="ones"), ] - self.m3 = { - 'mean4': metrics_module.Mean(name='m_4'), - 'mean5': metrics_module.Mean(name='m_5') - } - - def call(self, x): - self.add_metric(self.m2[0](x)) - self.add_metric(self.m2[1](x)) - self.add_metric(self.m1(x)) - self.add_metric(self.m3['mean4'](x)) - self.add_metric(self.m3['mean5'](x)) - self.add_metric(tf.reduce_sum(x), name='m_6', aggregation='mean') - return self.dense1(x) - - layer = TestLayer() - self.assertListEqual([m.name for m in layer.metrics], - ['m_1', 'm_2', 'm_3', 'm_4', 'm_5']) - - layer(np.ones((10, 10))) - self.assertListEqual([m.name for m in layer.metrics], - ['m_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6']) - - 
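[Editor's sketch] test_add_metric_in_layer_call above attaches a metric from inside a layer's call(); any model built from the layer then tracks and reports it automatically. A self-contained sketch of that pattern (layer and metric names here are illustrative):

import numpy as np
import tensorflow as tf

class SumTracker(tf.keras.layers.Layer):
    # Illustrative layer: reports the sum of its inputs as a mean-aggregated
    # metric, then applies a trivial transformation.
    def call(self, inputs):
        self.add_metric(
            tf.reduce_sum(inputs), name="input_sum", aggregation="mean"
        )
        return inputs + 1

model = tf.keras.Sequential(
    [
        SumTracker(input_shape=(1,)),
        tf.keras.layers.Dense(2, kernel_initializer="ones"),
    ]
)
model.compile(optimizer="rmsprop", loss="mse")
history = model.fit(
    np.ones((10, 1)), np.ones((10, 2)), batch_size=5, verbose=0
)
print(history.history["input_sum"])  # ~[5.0]: each batch of 5 ones sums to 5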
@test_combinations.run_all_keras_modes - def test_duplicate_metric_name_in_add_metric(self): - - class TestModel(training_module.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - self.mean = metrics_module.Mean(name='metric_1') - self.mean2 = metrics_module.Mean(name='metric_1') - - def call(self, x): - self.add_metric(self.mean(x), name='metric_1') - return self.dense1(x) - - model = TestModel() - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - with self.assertRaisesRegex( - ValueError, - 'Please provide different names for the metrics you have added. ' - 'We found 2 metrics with the name: "metric_1"'): - model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - - @test_combinations.run_all_keras_modes - def test_add_metric_without_name(self): - - class TestModel(training_module.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - - def call(self, x): - self.add_metric(tf.reduce_sum(x), aggregation='mean') - return self.dense1(x) - - model = TestModel() - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - - with self.assertRaisesRegex(ValueError, - 'Please provide a name for your metric like'): - model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - - @test_combinations.run_all_keras_modes - def test_add_metric_correctness(self): - inputs = input_layer.Input(shape=(1,)) - targets = input_layer.Input(shape=(1,)) - - class Bias(layers_module.Layer): - - def build(self, input_shape): - self.bias = self.add_weight('bias', (1,), initializer='zeros') - self.mae = metrics_module.MeanAbsoluteError(name='mae_1') - - def call(self, inputs): - inputs, targets = inputs - outputs = inputs + self.bias - self.add_metric(self.mae(targets, outputs), name='mae_1') - return outputs - - outputs = Bias()([inputs, targets]) - model = training_module.Model([inputs, targets], outputs) - - model.add_metric( - metrics_module.mean_absolute_error(targets, outputs), - name='mae_2', - aggregation='mean') - - model.compile( - loss='mae', - optimizer=optimizer_v2.gradient_descent.SGD(0.1), - metrics=[metrics_module.MeanAbsoluteError(name='mae_3')], - run_eagerly=test_utils.should_run_eagerly()) - - x = np.array([[0.], [1.], [2.]]) - y = np.array([[0.5], [2.], [3.5]]) - history = model.fit([x, y], y, batch_size=3, epochs=5) - - expected_val = [1., 0.9, 0.8, 0.7, 0.6] - for key in ['loss', 'mae_1', 'mae_2', 'mae_3']: - self.assertAllClose(history.history[key], expected_val, 1e-3) - - @test_combinations.run_all_keras_modes - def test_add_metric_order(self): - - class MyLayer(layers_module.Layer): - - def call(self, inputs, training=None, mask=None): - self.add_metric( - tf.ones([32]) * 2.0, name='two', aggregation='mean') - return inputs + model = test_utils.get_model_from_layers(layers, input_shape=(1,)) + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + history = model.fit( + x, y, epochs=2, batch_size=5, validation_data=(x, y) + ) + self.assertEqual(history.history["metric_1"][-1], 5) + self.assertAlmostEqual(history.history["val_metric_1"][-1], 5, 0) + + 
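[Editor's sketch] The model.metrics assertions in the tests that follow pin a specific composition order: the loss object first, then compile()-time metrics, then metrics registered via add_metric(). A small sketch of inspecting that list after one training step (names illustrative; the exact ordering is what the tests below assert):

import numpy as np
import tensorflow as tf

x = tf.keras.Input(shape=(1,))
y = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(x, y)
model.add_metric(tf.reduce_sum(y), name="output_sum", aggregation="mean")

model.compile(optimizer="sgd", loss="mse", metrics=["mae"])
model.fit(np.ones((4, 1)), np.ones((4, 1)), verbose=0)

# Loss first, then compile() metrics, then add_metric entries.
print([m.name for m in model.metrics])  # e.g. ['loss', 'mae', 'output_sum']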
@test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_model_metrics_list(self): + class LayerWithAddMetric(layers_module.Layer): + def __init__(self): + super().__init__() + self.dense = layers_module.Dense(1, kernel_initializer="ones") + + def __call__(self, inputs): + outputs = self.dense(inputs) + self.add_metric( + tf.reduce_sum(outputs), name="metric_1", aggregation="mean" + ) + return outputs + + class LayerWithNestedAddMetricLayer(layers_module.Layer): + def __init__(self): + super().__init__() + self.layer = LayerWithAddMetric() + + def call(self, inputs): + outputs = self.layer(inputs) + self.add_metric( + tf.reduce_sum(outputs), name="metric_2", aggregation="mean" + ) + return outputs + + x = layers_module.Input(shape=(1,)) + y = LayerWithNestedAddMetricLayer()(x) + + model = training_module.Model(x, y) + model.add_metric(tf.reduce_sum(y), name="metric_3", aggregation="mean") + + if tf.executing_eagerly(): + # This is not a use case in v1 graph mode. + mean_result = metrics_module.Mean()(y) + with self.assertRaisesRegex( + ValueError, "Expected a symbolic Tensor for the metric value" + ): + model.add_metric(mean_result, name="metric_4") - class MyModel(training_module.Model): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._sampler = MyLayer(name='sampler') - - def call(self, inputs, training=None, mask=None): - z = self._sampler(inputs) - self.add_metric( - tf.ones([32]) * 1.0, name='one', aggregation='mean') - self.add_metric( - tf.ones([32]) * 3.0, name='three', aggregation='mean') - return z - - xdata = np.random.uniform(size=[32, 16]).astype(np.float32) - dataset_train = tf.data.Dataset.from_tensor_slices((xdata, xdata)) - dataset_train = dataset_train.batch(32, drop_remainder=True) - - model = MyModel() - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(dataset_train, epochs=3) - self.assertDictEqual( - history.history, { - 'loss': [0.0, 0.0, 0.0], - 'three': [3.0, 3.0, 3.0], - 'two': [2.0, 2.0, 2.0], - 'one': [1.0, 1.0, 1.0] - }) - - @test_combinations.run_all_keras_modes - def test_add_metric_aggregation_mean(self): - - class TestModel(training_module.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - - def call(self, x): - self.add_metric( - tf.reduce_sum(x), name='metric_1', aggregation='mean') - return self.dense1(x) - - model = TestModel() - model.compile( - 'rmsprop', 'mse', run_eagerly=test_utils.should_run_eagerly()) - model.fit(np.ones(shape=(10, 1)), np.ones(shape=(10, 2)), batch_size=5) - - @test_combinations.run_all_keras_modes - def test_add_metric_aggregation_none(self): - - class TestModel(training_module.Model): - - def __init__(self): - super().__init__(name='test_model') - self.dense1 = layers_module.Dense(2, kernel_initializer='ones') - self.mean = metrics_module.Mean(name='metric_1') - - def call(self, x): - self.add_metric(self.mean(x), name='metric_1', aggregation=None) - return self.dense1(x) - - model = TestModel() - model.compile( - 'rmsprop', 'mse', run_eagerly=test_utils.should_run_eagerly()) - model.fit(np.ones(shape=(10, 1)), np.ones(shape=(10, 2)), batch_size=5) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def DISABLED_test_add_metric_invalid_aggregation(self): - # TODO(psv): Re-enable test once it is fixed. 
- x = layers_module.Input(shape=(1,)) - y = layers_module.Dense(1, kernel_initializer='ones')(x) - model = training_module.Model(x, y) - with self.assertRaisesRegex(ValueError, - 'only `mean` sample-wise metric aggregation'): - model.add_metric( - tf.reduce_sum(y), name='metric_1', aggregation='sum') - - with self.assertRaisesRegex(ValueError, - 'only `mean` sample-wise metric aggregation'): - model.add_metric( - tf.reduce_sum(y), name='metric_1', aggregation=None) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_calling_evaluate_in_callback_during_fit(self): - # Check fix for a bug that caused `evaluate` to hit a cached dataset - # when run from inside a fit callback. - x = layers_module.Input(shape=(2,)) - y = layers_module.Dense(2, kernel_initializer='ones', use_bias=False)(x) - model = training_module.Model(x, y) - - ones = np.ones((10, 2), dtype=np.float32) - zeros = np.zeros((10, 2), dtype=np.float32) - train_ds = tf.data.Dataset.from_tensor_slices( - (ones, ones)).batch(5) - val_ds_1 = tf.data.Dataset.from_tensor_slices( - (ones, ones)).batch(5) - val_ds_2 = tf.data.Dataset.from_tensor_slices( - (zeros, zeros)).batch(5) - model.compile('sgd', 'mse', run_eagerly=test_utils.should_run_eagerly()) - - class MyCallback(Callback): - - def on_epoch_end(self, *args, **kwargs): - eval_result = self.model.evaluate(val_ds_2) - if abs(eval_result) > 1e-7: - raise AssertionError( - 'Expected to hit the zeros dataset but got high loss value of %s' - % eval_result) - - history = model.fit( - train_ds, validation_data=val_ds_1, callbacks=[MyCallback()]) - # Evaluate at the end of fit should hit the ones dataset (cached) - self.assertGreater(abs(history.history['val_loss'][-1]), 0.1) - # Standalone call to evaluate should not hit the cached dataset - eval_result = model.evaluate(val_ds_2) - self.assertLess(abs(eval_result), 1e-7) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_model_with_nested_compiled_model(self): - - class LayerWithAddMetric(layers_module.Layer): - - def __init__(self): - super().__init__() - self.dense = layers_module.Dense(1, kernel_initializer='ones') - - def call(self, inputs): - outputs = self.dense(inputs) - self.add_metric( - tf.reduce_sum(outputs), name='mean', aggregation='mean') - return outputs - - x = layers_module.Input(shape=(1,)) - y = LayerWithAddMetric()(x) - - inner_model = training_module.Model(x, y) - inner_model.add_metric( - tf.reduce_sum(y), name='mean1', aggregation='mean') - - inner_model.compile( - 'sgd', - loss='mse', - metrics=[metrics_module.Accuracy('acc')], - run_eagerly=test_utils.should_run_eagerly()) - inner_model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) - - self.assertEqual([m.name for m in inner_model.metrics], - ['loss', 'acc', 'mean', 'mean1']) - - x = layers_module.Input(shape=[1]) - y = inner_model(x) - outer_model = training_module.Model(x, y) - outer_model.add_metric( - tf.reduce_sum(y), name='mean2', aggregation='mean') - - outer_model.compile( - 'sgd', - loss='mse', - metrics=[metrics_module.Accuracy('acc2')], - run_eagerly=test_utils.should_run_eagerly()) - outer_model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) - self.assertEqual([m.name for m in outer_model.metrics], - ['loss', 'acc2', 'mean', 'mean1', 'mean2']) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_model_with_metric_class_that_returns_dict(self): - x = layers_module.Input(shape=(2,)) - y = layers_module.Dense(3)(x) - model = training_module.Model(x, y) - - class 
DictMetric(metrics_module.Metric): - - def __init__(self): - super().__init__() - self.sample_count = tf.Variable(0) - self.l2_sum = tf.Variable(0.) - - def update_state(self, y_true, y_pred, sample_weight=None): - self.l2_sum.assign_add( - tf.reduce_sum(tf.square(y_true - y_pred))) - self.sample_count.assign_add(tf.shape(y_true)[0]) - - def reset_state(self): - self.sample_count.assign(0) - self.l2_sum.assign(0.) - - def result(self): - mse = self.l2_sum / tf.cast(self.sample_count, 'float32') - rmse = tf.sqrt(mse) - return {'my_mse': mse, - 'my_rmse': rmse} - - model.compile('sgd', - 'mse', - metrics=['mae', DictMetric()], - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit(np.ones((10, 2)), np.ones((10, 3))) - self.assertEqual(list(history.history.keys()), - ['loss', 'mae', 'my_mse', 'my_rmse']) - list_evaluate_res = model.evaluate( - np.ones((10, 2)), np.ones((10, 3))) - self.assertEqual(len(list_evaluate_res), 4) - dict_evaluate_res = model.evaluate( - np.ones((10, 2)), np.ones((10, 3)), return_dict=True) - self.assertEqual(list(dict_evaluate_res.keys()), - ['loss', 'mae', 'my_mse', 'my_rmse']) - list_train_on_batch_res = model.train_on_batch( - np.ones((10, 2)), np.ones((10, 3))) - self.assertEqual(len(list_train_on_batch_res), 4) - dict_train_on_batch_res = model.train_on_batch( - np.ones((10, 2)), np.ones((10, 3)), return_dict=True) - self.assertEqual(list(dict_train_on_batch_res.keys()), - ['loss', 'mae', 'my_mse', 'my_rmse']) - list_test_on_batch_res = model.test_on_batch( - np.ones((10, 2)), np.ones((10, 3))) - self.assertEqual(len(list_test_on_batch_res), 4) - dict_test_on_batch_res = model.test_on_batch( - np.ones((10, 2)), np.ones((10, 3)), return_dict=True) - self.assertEqual(list(dict_test_on_batch_res.keys()), - ['loss', 'mae', 'my_mse', 'my_rmse']) + else: + with self.assertRaisesRegex( + ValueError, "Using the result of calling a `Metric` object " + ): + with backend.get_graph().as_default(): + model.add_metric(metrics_module.Mean(name="metric_4")(y)) + + model.compile( + "sgd", + loss="mse", + metrics=[metrics_module.Accuracy("metric_4")], + run_eagerly=test_utils.should_run_eagerly(), + ) + + model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) + + # Verify that the metrics added using `compile` and `add_metric` API are + # included + self.assertEqual( + [m.name for m in model.metrics], + ["loss", "metric_4", "metric_2", "metric_1", "metric_3"], + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_model_metrics_list_in_call(self): + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + + def call(self, x): + self.add_metric( + tf.reduce_sum(x), name="metric_1", aggregation="mean" + ) + return self.dense1(x) + + model = TestModel() + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(0.01), + metrics=[metrics_module.Accuracy("acc")], + run_eagerly=test_utils.should_run_eagerly(), + ) + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) + + self.assertEqual( + [m.name for m in model.metrics], ["loss", "acc", "metric_1"] + ) + + @test_combinations.run_all_keras_modes + def test_multiple_add_metric_calls(self): + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + self.mean1 = metrics_module.Mean(name="metric_1") + 
self.mean2 = metrics_module.Mean(name="metric_2") + + def call(self, x): + self.add_metric(self.mean2(x), name="metric_2") + self.add_metric(self.mean1(x), name="metric_1") + self.add_metric( + tf.reduce_sum(x), name="metric_3", aggregation="mean" + ) + return self.dense1(x) + + model = TestModel() + self.assertListEqual( + [m.name for m in model.metrics], ["metric_1", "metric_2"] + ) + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + history = model.fit( + x, y, epochs=2, batch_size=5, validation_data=(x, y) + ) + self.assertAlmostEqual(history.history["metric_1"][-1], 1, 0) + self.assertAlmostEqual(history.history["metric_2"][-1], 1, 0) + self.assertAlmostEqual(history.history["metric_3"][-1], 5, 0) + + eval_results = model.evaluate(x, y, batch_size=5) + self.assertArrayNear(eval_results[1:4], [1, 1, 5], 0.1) + + model.predict(x, batch_size=5) + model.train_on_batch(x, y) + model.test_on_batch(x, y) + + @test_combinations.run_all_keras_modes + def test_multiple_add_metric_calls_layer(self): + class TestLayer(layers_module.Layer): + def __init__(self): + super().__init__(name="test_layer") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + self.m1 = metrics_module.Mean(name="m_1") + self.m2 = [ + metrics_module.Mean(name="m_2"), + metrics_module.Mean(name="m_3"), + ] + self.m3 = { + "mean4": metrics_module.Mean(name="m_4"), + "mean5": metrics_module.Mean(name="m_5"), + } + + def call(self, x): + self.add_metric(self.m2[0](x)) + self.add_metric(self.m2[1](x)) + self.add_metric(self.m1(x)) + self.add_metric(self.m3["mean4"](x)) + self.add_metric(self.m3["mean5"](x)) + self.add_metric( + tf.reduce_sum(x), name="m_6", aggregation="mean" + ) + return self.dense1(x) + + layer = TestLayer() + self.assertListEqual( + [m.name for m in layer.metrics], ["m_1", "m_2", "m_3", "m_4", "m_5"] + ) + + layer(np.ones((10, 10))) + self.assertListEqual( + [m.name for m in layer.metrics], + ["m_1", "m_2", "m_3", "m_4", "m_5", "m_6"], + ) + + @test_combinations.run_all_keras_modes + def test_duplicate_metric_name_in_add_metric(self): + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + self.mean = metrics_module.Mean(name="metric_1") + self.mean2 = metrics_module.Mean(name="metric_1") + + def call(self, x): + self.add_metric(self.mean(x), name="metric_1") + return self.dense1(x) + + model = TestModel() + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + with self.assertRaisesRegex( + ValueError, + "Please provide different names for the metrics you have added. 
" + 'We found 2 metrics with the name: "metric_1"', + ): + model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) + + @test_combinations.run_all_keras_modes + def test_add_metric_without_name(self): + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + + def call(self, x): + self.add_metric(tf.reduce_sum(x), aggregation="mean") + return self.dense1(x) + + model = TestModel() + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + + with self.assertRaisesRegex( + ValueError, "Please provide a name for your metric like" + ): + model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) + + @test_combinations.run_all_keras_modes + def test_add_metric_correctness(self): + inputs = input_layer.Input(shape=(1,)) + targets = input_layer.Input(shape=(1,)) + + class Bias(layers_module.Layer): + def build(self, input_shape): + self.bias = self.add_weight("bias", (1,), initializer="zeros") + self.mae = metrics_module.MeanAbsoluteError(name="mae_1") + + def call(self, inputs): + inputs, targets = inputs + outputs = inputs + self.bias + self.add_metric(self.mae(targets, outputs), name="mae_1") + return outputs + + outputs = Bias()([inputs, targets]) + model = training_module.Model([inputs, targets], outputs) + + model.add_metric( + metrics_module.mean_absolute_error(targets, outputs), + name="mae_2", + aggregation="mean", + ) + + model.compile( + loss="mae", + optimizer=optimizer_legacy.gradient_descent.SGD(0.1), + metrics=[metrics_module.MeanAbsoluteError(name="mae_3")], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.array([[0.0], [1.0], [2.0]]) + y = np.array([[0.5], [2.0], [3.5]]) + history = model.fit([x, y], y, batch_size=3, epochs=5) + + expected_val = [1.0, 0.9, 0.8, 0.7, 0.6] + for key in ["loss", "mae_1", "mae_2", "mae_3"]: + self.assertAllClose(history.history[key], expected_val, 1e-3) + + @test_combinations.run_all_keras_modes + def test_add_metric_order(self): + class MyLayer(layers_module.Layer): + def call(self, inputs, training=None, mask=None): + self.add_metric( + tf.ones([32]) * 2.0, name="two", aggregation="mean" + ) + return inputs + + class MyModel(training_module.Model): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._sampler = MyLayer(name="sampler") + + def call(self, inputs, training=None, mask=None): + z = self._sampler(inputs) + self.add_metric( + tf.ones([32]) * 1.0, name="one", aggregation="mean" + ) + self.add_metric( + tf.ones([32]) * 3.0, name="three", aggregation="mean" + ) + return z + + xdata = np.random.uniform(size=[32, 16]).astype(np.float32) + dataset_train = tf.data.Dataset.from_tensor_slices((xdata, xdata)) + dataset_train = dataset_train.batch(32, drop_remainder=True) + + model = MyModel() + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit(dataset_train, epochs=3) + self.assertDictEqual( + history.history, + { + "loss": [0.0, 0.0, 0.0], + "three": [3.0, 3.0, 3.0], + "two": [2.0, 2.0, 2.0], + "one": [1.0, 1.0, 1.0], + }, + ) + + @test_combinations.run_all_keras_modes + def test_add_metric_aggregation_mean(self): + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + + def call(self, x): + 
self.add_metric( + tf.reduce_sum(x), name="metric_1", aggregation="mean" + ) + return self.dense1(x) + + model = TestModel() + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + model.fit(np.ones(shape=(10, 1)), np.ones(shape=(10, 2)), batch_size=5) + + @test_combinations.run_all_keras_modes + def test_add_metric_aggregation_none(self): + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + self.mean = metrics_module.Mean(name="metric_1") + + def call(self, x): + self.add_metric(self.mean(x), name="metric_1", aggregation=None) + return self.dense1(x) + + model = TestModel() + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + model.fit(np.ones(shape=(10, 1)), np.ones(shape=(10, 2)), batch_size=5) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def DISABLED_test_add_metric_invalid_aggregation(self): + # TODO(psv): Re-enable test once it is fixed. + x = layers_module.Input(shape=(1,)) + y = layers_module.Dense(1, kernel_initializer="ones")(x) + model = training_module.Model(x, y) + with self.assertRaisesRegex( + ValueError, "only `mean` sample-wise metric aggregation" + ): + model.add_metric( + tf.reduce_sum(y), name="metric_1", aggregation="sum" + ) + + with self.assertRaisesRegex( + ValueError, "only `mean` sample-wise metric aggregation" + ): + model.add_metric( + tf.reduce_sum(y), name="metric_1", aggregation=None + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_calling_evaluate_in_callback_during_fit(self): + # Check fix for a bug that caused `evaluate` to hit a cached dataset + # when run from inside a fit callback. + x = layers_module.Input(shape=(2,)) + y = layers_module.Dense(2, kernel_initializer="ones", use_bias=False)(x) + model = training_module.Model(x, y) + + ones = np.ones((10, 2), dtype=np.float32) + zeros = np.zeros((10, 2), dtype=np.float32) + train_ds = tf.data.Dataset.from_tensor_slices((ones, ones)).batch(5) + val_ds_1 = tf.data.Dataset.from_tensor_slices((ones, ones)).batch(5) + val_ds_2 = tf.data.Dataset.from_tensor_slices((zeros, zeros)).batch(5) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + + class MyCallback(Callback): + def on_epoch_end(self, *args, **kwargs): + eval_result = self.model.evaluate(val_ds_2) + if abs(eval_result) > 1e-7: + raise AssertionError( + "Expected to hit the zeros dataset but got high loss " + "value of %s" % eval_result + ) + + history = model.fit( + train_ds, validation_data=val_ds_1, callbacks=[MyCallback()] + ) + # Evaluate at the end of fit should hit the ones dataset (cached) + self.assertGreater(abs(history.history["val_loss"][-1]), 0.1) + # Standalone call to evaluate should not hit the cached dataset + eval_result = model.evaluate(val_ds_2) + self.assertLess(abs(eval_result), 1e-7) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_model_with_nested_compiled_model(self): + class LayerWithAddMetric(layers_module.Layer): + def __init__(self): + super().__init__() + self.dense = layers_module.Dense(1, kernel_initializer="ones") + + def call(self, inputs): + outputs = self.dense(inputs) + self.add_metric( + tf.reduce_sum(outputs), name="mean", aggregation="mean" + ) + return outputs + + x = layers_module.Input(shape=(1,)) + y = LayerWithAddMetric()(x) + + inner_model = training_module.Model(x, y) + inner_model.add_metric( + tf.reduce_sum(y), name="mean1", 
aggregation="mean" + ) + + inner_model.compile( + "sgd", + loss="mse", + metrics=[metrics_module.Accuracy("acc")], + run_eagerly=test_utils.should_run_eagerly(), + ) + inner_model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) + + self.assertEqual( + [m.name for m in inner_model.metrics], + ["loss", "acc", "mean", "mean1"], + ) + + x = layers_module.Input(shape=[1]) + y = inner_model(x) + outer_model = training_module.Model(x, y) + outer_model.add_metric( + tf.reduce_sum(y), name="mean2", aggregation="mean" + ) + + outer_model.compile( + "sgd", + loss="mse", + metrics=[metrics_module.Accuracy("acc2")], + run_eagerly=test_utils.should_run_eagerly(), + ) + outer_model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) + self.assertEqual( + [m.name for m in outer_model.metrics], + ["loss", "acc2", "mean", "mean1", "mean2"], + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_model_with_metric_class_that_returns_dict(self): + x = layers_module.Input(shape=(2,)) + y = layers_module.Dense(3)(x) + model = training_module.Model(x, y) + + class DictMetric(metrics_module.Metric): + def __init__(self): + super().__init__() + self.sample_count = tf.Variable(0) + self.l2_sum = tf.Variable(0.0) + + def update_state(self, y_true, y_pred, sample_weight=None): + self.l2_sum.assign_add( + tf.reduce_sum(tf.square(y_true - y_pred)) + ) + self.sample_count.assign_add(tf.shape(y_true)[0]) + + def reset_state(self): + self.sample_count.assign(0) + self.l2_sum.assign(0.0) + + def result(self): + mse = self.l2_sum / tf.cast(self.sample_count, "float32") + rmse = tf.sqrt(mse) + return {"my_mse": mse, "my_rmse": rmse} + + model.compile( + "sgd", + "mse", + metrics=["mae", DictMetric()], + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit(np.ones((10, 2)), np.ones((10, 3))) + self.assertEqual( + list(history.history.keys()), ["loss", "mae", "my_mse", "my_rmse"] + ) + list_evaluate_res = model.evaluate(np.ones((10, 2)), np.ones((10, 3))) + self.assertEqual(len(list_evaluate_res), 4) + dict_evaluate_res = model.evaluate( + np.ones((10, 2)), np.ones((10, 3)), return_dict=True + ) + self.assertEqual( + list(dict_evaluate_res.keys()), ["loss", "mae", "my_mse", "my_rmse"] + ) + list_train_on_batch_res = model.train_on_batch( + np.ones((10, 2)), np.ones((10, 3)) + ) + self.assertEqual(len(list_train_on_batch_res), 4) + dict_train_on_batch_res = model.train_on_batch( + np.ones((10, 2)), np.ones((10, 3)), return_dict=True + ) + self.assertEqual( + list(dict_train_on_batch_res.keys()), + ["loss", "mae", "my_mse", "my_rmse"], + ) + list_test_on_batch_res = model.test_on_batch( + np.ones((10, 2)), np.ones((10, 3)) + ) + self.assertEqual(len(list_test_on_batch_res), 4) + dict_test_on_batch_res = model.test_on_batch( + np.ones((10, 2)), np.ones((10, 3)), return_dict=True + ) + self.assertEqual( + list(dict_test_on_batch_res.keys()), + ["loss", "mae", "my_mse", "my_rmse"], + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_add_metric_in_model_call_that_returns_dict(self): + class DictMetric(metrics_module.Metric): + def __init__(self): + super().__init__() + self.sample_count = tf.Variable(0) + self.l2_sum = tf.Variable(0.0) + + def update_state(self, y_true, y_pred, sample_weight=None): + self.l2_sum.assign_add( + tf.reduce_sum(tf.square(y_true - y_pred)) + ) + self.sample_count.assign_add(tf.shape(y_true)[0]) + + def reset_state(self): + self.sample_count.assign(0) + self.l2_sum.assign(0.0) + + def result(self): + mse = self.l2_sum / 
tf.cast(self.sample_count, "float32") + rmse = tf.sqrt(mse) + return {"my_mse": mse, "my_rmse": rmse} + + class TestModel(training_module.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense1 = layers_module.Dense(2, kernel_initializer="ones") + self.dict_metric = DictMetric() + + def call(self, x): + self.add_metric( + tf.reduce_sum(x), name="metric_2", aggregation="mean" + ) + # Provide same name as in the instance created in __init__ + # for eager mode + self.add_metric(self.dict_metric(x, 1 - x), name="metric_1") + return self.dense1(x) + + model = TestModel() + model.compile( + loss="mse", + optimizer=RMSPropOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones(shape=(10, 1)) + y = np.ones(shape=(10, 2)) + history = model.fit( + x, y, epochs=2, batch_size=5, validation_data=(x, y) + ) + self.assertAlmostEqual(history.history["metric_2"][-1], 5, 0) + self.assertAlmostEqual(history.history["val_metric_2"][-1], 5, 0) + self.assertAlmostEqual(history.history["my_mse"][-1], 1, 0) + self.assertAlmostEqual(history.history["val_my_mse"][-1], 1, 0) + self.assertAlmostEqual(history.history["my_rmse"][-1], 1, 0) + self.assertAlmostEqual(history.history["val_my_rmse"][-1], 1, 0) + + eval_results = model.evaluate(x, y, batch_size=5, return_dict=True) + self.assertAlmostEqual(eval_results["metric_2"], 5, 0) + self.assertAlmostEqual(eval_results["my_mse"], 1, 0) + self.assertAlmostEqual(eval_results["my_rmse"], 1, 0) + + model.predict(x, batch_size=5) + model.train_on_batch(x, y) + model.test_on_batch(x, y) class BareUpdateLayer(layers_module.Layer): + def build(self, input_shape): + self.counter = self.add_weight( + "counter", + dtype="int32", + shape=(), + initializer="zeros", + trainable=False, + ) - def build(self, input_shape): - self.counter = self.add_weight( - 'counter', - dtype='int32', - shape=(), - initializer='zeros', - trainable=False) - - def call(self, inputs): - tf.compat.v1.assign_add(self.counter, 1) - return tf.cast(self.counter, inputs.dtype) * inputs + def call(self, inputs): + tf.compat.v1.assign_add(self.counter, 1) + return tf.cast(self.counter, inputs.dtype) * inputs class LambdaUpdateLayer(layers_module.Layer): + def build(self, input_shape): + self.counter = self.add_weight( + "counter", + dtype="int32", + shape=(), + initializer="zeros", + trainable=False, + ) - def build(self, input_shape): - self.counter = self.add_weight( - 'counter', - dtype='int32', - shape=(), - initializer='zeros', - trainable=False) - - def call(self, inputs): - # Make sure update isn't run twice. - self.add_update(lambda: tf.compat.v1.assign_add(self.counter, 1)) - return tf.cast(self.counter, inputs.dtype) * inputs + def call(self, inputs): + # Make sure update isn't run twice. 
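The update-carrying layers defined here can be exercised standalone. A sketch of the same counter idea using the public tf.keras API (an assumption; the tests use `tf.compat.v1.assign_add`, while `Variable.assign_add` is the TF2 equivalent): the counter is bumped once per executed batch, so fitting 10 samples with batch_size=2 leaves it at 5.

import numpy as np
import tensorflow as tf


class CountingLayer(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.counter = self.add_weight(
            "counter", shape=(), dtype="int32",
            initializer="zeros", trainable=False,
        )

    def call(self, inputs):
        self.counter.assign_add(1)  # one increment per executed step
        return tf.cast(self.counter, inputs.dtype) * inputs


layer = CountingLayer()
model = tf.keras.Sequential([layer, tf.keras.layers.Dense(1)])
model.compile("sgd", "mse")
model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, verbose=0)
print(int(layer.counter.numpy()))  # 10 samples / batch_size 2 -> 5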
+ self.add_update(lambda: tf.compat.v1.assign_add(self.counter, 1)) + return tf.cast(self.counter, inputs.dtype) * inputs class NestedUpdateLayer(layers_module.Layer): + def build(self, input_shape): + self.layer = BareUpdateLayer() + self.layer.build(input_shape) - def build(self, input_shape): - self.layer = BareUpdateLayer() - self.layer.build(input_shape) + @property + def counter(self): + return self.layer.counter - @property - def counter(self): - return self.layer.counter - - def call(self, inputs): - return self.layer(inputs) + def call(self, inputs): + return self.layer(inputs) class SubgraphUpdateLayer(layers_module.Layer): + def build(self, input_shape): + self.counter = self.add_weight( + "counter", + dtype="int32", + shape=(), + initializer="zeros", + trainable=False, + ) + + def call(self, inputs, training=None): + if training is None: + training = backend.learning_phase() - def build(self, input_shape): - self.counter = self.add_weight( - 'counter', - dtype='int32', - shape=(), - initializer='zeros', - trainable=False) - - def call(self, inputs, training=None): - if training is None: - training = backend.learning_phase() - - if training: - self.counter.assign(self.counter + 1) - return inputs + if training: + self.counter.assign(self.counter + 1) + return inputs @test_combinations.run_all_keras_modes(always_skip_v1=True) class TestAutoUpdates(test_combinations.TestCase): - - @test_combinations.run_with_all_model_types - @parameterized.named_parameters( - ('bare_update', BareUpdateLayer), - ('lambda_update', LambdaUpdateLayer), - ('nested_update', NestedUpdateLayer)) - def test_updates_in_model(self, layer_builder): - layer = layer_builder() - x, y = np.ones((10, 10)), np.ones((10, 1)) - model = test_utils.get_model_from_layers( - [layer, layers_module.Dense(1)], input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, batch_size=2, epochs=1) - self.assertEqual(self.evaluate(layer.counter), 5) - - @test_combinations.run_with_all_model_types - def test_lambda_updates_trainable_false(self): - x, y = np.ones((10, 10)), np.ones((10, 1)) - layer = LambdaUpdateLayer() - model = test_utils.get_model_from_layers( - [layer, layers_module.Dense(1)], input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, batch_size=2, epochs=1) - self.assertEqual(self.evaluate(layer.counter), 5) - layer.trainable = False - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, batch_size=2, epochs=1) - self.assertEqual(self.evaluate(layer.counter), 5) - - @test_combinations.run_with_all_model_types - def test_subgraph_updates_in_model(self): - layer = SubgraphUpdateLayer() - x, y = np.ones((10, 10)), np.ones((10, 1)) - model = test_utils.get_model_from_layers( - [layer, layers_module.Dense(1)], input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, batch_size=2, epochs=1) - self.assertEqual(self.evaluate(layer.counter), 5) - - @parameterized.named_parameters( - ('bare_update', BareUpdateLayer), - ('lambda_update', LambdaUpdateLayer), - ('nested_update', NestedUpdateLayer)) - def test_updates_standalone_layer(self, layer_builder): - layer = layer_builder() - y = layer(np.ones((10, 10))) - self.evaluate(layer.counter.initializer) - self.evaluate(y) - self.assertEqual(self.evaluate(layer.counter), 1) - - def test_trainable_false_standalone_layer(self): - layer = 
LambdaUpdateLayer() - y = layer(np.ones((10, 10))) - self.evaluate(layer.counter.initializer) - self.evaluate(y) - self.assertEqual(self.evaluate(layer.counter), 1) - layer.trainable = False - y = layer(np.ones((10, 10))) - self.evaluate(y) - self.assertEqual(self.evaluate(layer.counter), 1) - - @test_combinations.run_with_all_model_types - def test_batchnorm_trainable_false(self): - bn = layers_module.BatchNormalization() - model = test_utils.get_model_from_layers([bn, layers_module.Dense(1)], - input_shape=(10,)) - bn.trainable = False - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 10)), np.ones((10, 1)) - model.fit(x, y, batch_size=2, epochs=1) - self.assertAllEqual(self.evaluate(bn.moving_mean), np.zeros((10,))) - self.assertAllEqual(self.evaluate(bn.moving_variance), np.ones((10,))) + @test_combinations.run_with_all_model_types + @parameterized.named_parameters( + ("bare_update", BareUpdateLayer), + ("lambda_update", LambdaUpdateLayer), + ("nested_update", NestedUpdateLayer), + ) + def test_updates_in_model(self, layer_builder): + layer = layer_builder() + x, y = np.ones((10, 10)), np.ones((10, 1)) + model = test_utils.get_model_from_layers( + [layer, layers_module.Dense(1)], input_shape=(10,) + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(x, y, batch_size=2, epochs=1) + self.assertEqual(self.evaluate(layer.counter), 5) + + @test_combinations.run_with_all_model_types + def test_lambda_updates_trainable_false(self): + x, y = np.ones((10, 10)), np.ones((10, 1)) + layer = LambdaUpdateLayer() + model = test_utils.get_model_from_layers( + [layer, layers_module.Dense(1)], input_shape=(10,) + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(x, y, batch_size=2, epochs=1) + self.assertEqual(self.evaluate(layer.counter), 5) + layer.trainable = False + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(x, y, batch_size=2, epochs=1) + self.assertEqual(self.evaluate(layer.counter), 5) + + @test_combinations.run_with_all_model_types + def test_subgraph_updates_in_model(self): + layer = SubgraphUpdateLayer() + x, y = np.ones((10, 10)), np.ones((10, 1)) + model = test_utils.get_model_from_layers( + [layer, layers_module.Dense(1)], input_shape=(10,) + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(x, y, batch_size=2, epochs=1) + self.assertEqual(self.evaluate(layer.counter), 5) + + @parameterized.named_parameters( + ("bare_update", BareUpdateLayer), + ("lambda_update", LambdaUpdateLayer), + ("nested_update", NestedUpdateLayer), + ) + def test_updates_standalone_layer(self, layer_builder): + layer = layer_builder() + y = layer(np.ones((10, 10))) + self.evaluate(layer.counter.initializer) + self.evaluate(y) + self.assertEqual(self.evaluate(layer.counter), 1) + + def test_trainable_false_standalone_layer(self): + layer = LambdaUpdateLayer() + y = layer(np.ones((10, 10))) + self.evaluate(layer.counter.initializer) + self.evaluate(y) + self.assertEqual(self.evaluate(layer.counter), 1) + layer.trainable = False + y = layer(np.ones((10, 10))) + self.evaluate(y) + self.assertEqual(self.evaluate(layer.counter), 1) + + @test_combinations.run_with_all_model_types + def test_batchnorm_trainable_false(self): + bn = layers_module.BatchNormalization() + model = test_utils.get_model_from_layers( + [bn, layers_module.Dense(1)], input_shape=(10,) + ) + bn.trainable = False + model.compile("sgd", "mse", 
run_eagerly=test_utils.should_run_eagerly()) + x, y = np.ones((10, 10)), np.ones((10, 1)) + model.fit(x, y, batch_size=2, epochs=1) + self.assertAllEqual(self.evaluate(bn.moving_mean), np.zeros((10,))) + self.assertAllEqual(self.evaluate(bn.moving_variance), np.ones((10,))) class TestFunctionTracing(test_combinations.TestCase): + def _seq_model_and_data(self): + model = sequential.Sequential( + [layers_module.Dense(4, activation="relu")] + ) + model.compile(loss="mse", optimizer="rmsprop") + x = np.random.random((10, 6)) + y = np.random.random((10, 4)) + return model, x, y + + @test_combinations.run_all_keras_modes( + always_skip_v1=True, always_skip_eager=True + ) + def test_no_tracing_between_epoch(self): + if _is_oss(): + self.skipTest("b/198729465") - def _seq_model_and_data(self): - model = sequential.Sequential([layers_module.Dense(4, activation='relu')]) - model.compile(loss='mse', optimizer='rmsprop') - x = np.random.random((10, 6)) - y = np.random.random((10, 4)) - return model, x, y - - @test_combinations.run_all_keras_modes( - always_skip_v1=True, always_skip_eager=True) - def test_no_tracing_between_epoch(self): - if _is_oss(): - self.skipTest('b/198729465') - - model, x, y = self._seq_model_and_data() + model, x, y = self._seq_model_and_data() - logging.set_verbosity(1) - with self.assertLogs(level=1) as logs: - model.fit(x, y, epochs=10, batch_size=5, validation_data=(x, y)) + logging.set_verbosity(1) + with self.assertLogs(level=1) as logs: + model.fit(x, y, epochs=10, batch_size=5, validation_data=(x, y)) - new_func_graph = 'INFO:absl:Creating new FuncGraph for Python function' - self.assertEqual(sum(new_func_graph in log for log in logs.output), 9) + new_func_graph = "INFO:absl:Creating new FuncGraph for Python function" + self.assertEqual(sum(new_func_graph in log for log in logs.output), 9) - @test_combinations.run_all_keras_modes( - always_skip_v1=True, always_skip_eager=True) - def test_evaluate_no_cached_data(self): - if _is_oss(): - self.skipTest('b/198729465') + @test_combinations.run_all_keras_modes( + always_skip_v1=True, always_skip_eager=True + ) + def test_evaluate_no_cached_data(self): + if _is_oss(): + self.skipTest("b/198729465") - model, x, y = self._seq_model_and_data() + model, x, y = self._seq_model_and_data() - new_func_graph = 'INFO:absl:Creating new FuncGraph for Python function' - logging.set_verbosity(1) - with self.assertLogs(level=1) as eval_logs: - for _ in range(6): - model.evaluate(x, y, batch_size=5) - self.assertEqual(sum(new_func_graph in log for log in eval_logs.output), 20) + new_func_graph = "INFO:absl:Creating new FuncGraph for Python function" + logging.set_verbosity(1) + with self.assertLogs(level=1) as eval_logs: + for _ in range(6): + model.evaluate(x, y, batch_size=5) + self.assertEqual( + sum(new_func_graph in log for log in eval_logs.output), 20 + ) class TestBuildCustomModel(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes - def test_build_list_of_inputs(self): - - class MyModel(training_module.Model): - - def __init__(self): - super().__init__() - self.l1 = layers_module.Dense(1) - self.l2 = layers_module.Dense(2) - - def call(self, x): - a, b = x - return self.l1(a) + self.l2(b) - - # List of tuples - model = MyModel() - model.build([(None, 1), (None, 2)]) - self.assertEqual(model.l1.kernel.shape.as_list(), [1, 1]) - self.assertEqual(model.l2.kernel.shape.as_list(), [2, 2]) - # List of lists - model = MyModel() - model.build([[None, 1], [None, 2]]) - self.assertEqual(model.l1.kernel.shape.as_list(), [1, 
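The frozen-BatchNormalization check above is easy to reproduce. A sketch assuming the public tf.keras API: with `trainable = False` set before `compile`, the moving statistics stay at their initial values (mean 0, variance 1) even after training steps run.

import numpy as np
import tensorflow as tf

bn = tf.keras.layers.BatchNormalization()
model = tf.keras.Sequential([bn, tf.keras.layers.Dense(1)])
bn.trainable = False  # must precede compile to be respected by fit
model.compile("sgd", "mse")
model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, verbose=0)
print(bn.moving_mean.numpy())      # all zeros: no update was applied
print(bn.moving_variance.numpy())  # all ones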
1]) - self.assertEqual(model.l2.kernel.shape.as_list(), [2, 2]) - - @test_combinations.run_all_keras_modes - def test_build_single_inputs(self): - - class MyModel(training_module.Model): - - def __init__(self): - super().__init__() - self.l1 = layers_module.Dense(1) - - def call(self, x): - return self.l1(x) - - model = MyModel() - model.build((None, 1)) - self.assertEqual(model.l1.kernel.shape.as_list(), [1, 1]) - model = MyModel() - model.build([None, 1]) - self.assertEqual(model.l1.kernel.shape.as_list(), [1, 1]) - - @test_combinations.run_all_keras_modes - def test_build_dict_inputs(self): - - class MyModel(training_module.Model): - - def __init__(self): - super().__init__() - self.l1 = layers_module.Dense(1) - - def call(self, inputs): - return self.l1(inputs['x']) - - model = MyModel() - model.build({'x': [None, 16]}) - self.assertEqual(model.l1.kernel.shape.as_list(), [16, 1]) - - def test_save_top_level_model_weights_h5(self): - - class MyModel(training_module.Model): - - def __init__(self): - super().__init__() - self.class_token = self.add_weight(shape=(1,), name='class_token') - self.inner_layer = layers_module.Dense(1) - - def call(self, inputs): - return self.inner_layer(inputs) * self.class_token - - h5_file = tempfile.mktemp('.h5') - m1 = MyModel() - m1.build((1, 1)) - m1.save_weights(h5_file) - - m2 = MyModel() - m2.build((1, 1)) - m2.load_weights(h5_file) - self.assertAllEqual(m1.get_weights(), m2.get_weights()) - m2.load_weights(h5_file, by_name=True) - self.assertAllEqual(m1.get_weights(), m2.get_weights()) + @test_combinations.run_all_keras_modes + def test_build_list_of_inputs(self): + class MyModel(training_module.Model): + def __init__(self): + super().__init__() + self.l1 = layers_module.Dense(1) + self.l2 = layers_module.Dense(2) + + def call(self, x): + a, b = x + return self.l1(a) + self.l2(b) + + # List of tuples + model = MyModel() + model.build([(None, 1), (None, 2)]) + self.assertEqual(model.l1.kernel.shape.as_list(), [1, 1]) + self.assertEqual(model.l2.kernel.shape.as_list(), [2, 2]) + # List of lists + model = MyModel() + model.build([[None, 1], [None, 2]]) + self.assertEqual(model.l1.kernel.shape.as_list(), [1, 1]) + self.assertEqual(model.l2.kernel.shape.as_list(), [2, 2]) + + @test_combinations.run_all_keras_modes + def test_build_single_inputs(self): + class MyModel(training_module.Model): + def __init__(self): + super().__init__() + self.l1 = layers_module.Dense(1) + + def call(self, x): + return self.l1(x) + + model = MyModel() + model.build((None, 1)) + self.assertEqual(model.l1.kernel.shape.as_list(), [1, 1]) + model = MyModel() + model.build([None, 1]) + self.assertEqual(model.l1.kernel.shape.as_list(), [1, 1]) + + @test_combinations.run_all_keras_modes + def test_build_dict_inputs(self): + class MyModel(training_module.Model): + def __init__(self): + super().__init__() + self.l1 = layers_module.Dense(1) + + def call(self, inputs): + return self.l1(inputs["x"]) + + model = MyModel() + model.build({"x": [None, 16]}) + self.assertEqual(model.l1.kernel.shape.as_list(), [16, 1]) + + def test_save_top_level_model_weights_h5(self): + class MyModel(training_module.Model): + def __init__(self): + super().__init__() + self.class_token = self.add_weight( + shape=(1,), name="class_token" + ) + self.inner_layer = layers_module.Dense(1) + + def call(self, inputs): + return self.inner_layer(inputs) * self.class_token + + h5_file = tempfile.mktemp(".h5") + m1 = MyModel() + m1.build((1, 1)) + m1.save_weights(h5_file) + + m2 = MyModel() + m2.build((1, 1)) + 
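The `Model.build` variants tested above accept any shape structure matching what `call` expects: a single tuple or list, a list of shapes, or a dict of shapes. A minimal sketch (public tf.keras API assumed):

import tensorflow as tf


class TwoInput(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.l1 = tf.keras.layers.Dense(1)
        self.l2 = tf.keras.layers.Dense(2)

    def call(self, x):
        a, b = x
        return self.l1(a) + self.l2(b)


model = TwoInput()
model.build([(None, 1), (None, 2)])  # list of shape tuples, matching call()
print(model.l1.kernel.shape)  # (1, 1)
print(model.l2.kernel.shape)  # (2, 2)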
m2.load_weights(h5_file) + self.assertAllEqual(m1.get_weights(), m2.get_weights()) + m2.load_weights(h5_file, by_name=True) + self.assertAllEqual(m1.get_weights(), m2.get_weights()) class ScalarDataModelTest(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_scalar_loss_reduction(self): - - class MyModel(training_module.Model): - - def __init__(self): - super().__init__() - self.w = self.add_weight(initializer='ones', name='kernel') - self.b = self.add_weight(initializer='zeros', name='bias') - - def call(self, inputs): - return inputs * self.w + self.b - - model = MyModel() - model.compile(optimizer_v2.gradient_descent.SGD(1e-2), - loss='mse', - metrics=['binary_accuracy']) - # learn y = x * 2 + 0.5 - x = np.array([3, 5, 5, 3, 5], dtype='float32') - y = x * 2 + 0.5 - x2d = np.expand_dims(x, axis=-1) - y2d = np.expand_dims(y, axis=-1) - loss, acc = model.evaluate(x, y) - loss2d, acc2d = model.evaluate(x2d, y2d) - self.assertAllClose([loss, acc], [loss2d, acc2d], atol=1e-6) - model.fit(x, y, epochs=20) - preds = model.predict(x) - self.assertEqual(preds.shape, (5,)) - self.assertAllClose(preds, y, atol=2e-1) + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_scalar_loss_reduction(self): + class MyModel(training_module.Model): + def __init__(self): + super().__init__() + self.w = self.add_weight(initializer="ones", name="kernel") + self.b = self.add_weight(initializer="zeros", name="bias") + + def call(self, inputs): + return inputs * self.w + self.b + + model = MyModel() + model.compile( + optimizer_legacy.gradient_descent.SGD(1e-2), + loss="mse", + metrics=["binary_accuracy"], + ) + # learn y = x * 2 + 0.5 + x = np.array([3, 5, 5, 3, 5], dtype="float32") + y = x * 2 + 0.5 + x2d = np.expand_dims(x, axis=-1) + y2d = np.expand_dims(y, axis=-1) + loss, acc = model.evaluate(x, y) + loss2d, acc2d = model.evaluate(x2d, y2d) + self.assertAllClose([loss, acc], [loss2d, acc2d], atol=1e-6) + model.fit(x, y, epochs=20) + preds = model.predict(x) + self.assertEqual(preds.shape, (5,)) + self.assertAllClose(preds, y, atol=2e-1) + + +# Class used for testing. 
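The scalar-loss-reduction test above hinges on a model whose weights are bare scalars, so rank-1 inputs and their expanded rank-2 versions must evaluate identically and `predict` preserves the input rank. A standalone sketch under the public tf.keras API assumption (`add_weight` with no shape defaults to a scalar):

import numpy as np
import tensorflow as tf


class ScalarAffine(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.w = self.add_weight(name="kernel", initializer="ones")  # shape ()
        self.b = self.add_weight(name="bias", initializer="zeros")   # shape ()

    def call(self, inputs):
        return inputs * self.w + self.b


model = ScalarAffine()
model.compile(tf.keras.optimizers.SGD(1e-2), loss="mse")
x = np.array([3.0, 5.0, 5.0, 3.0, 5.0], dtype="float32")
y = x * 2 + 0.5  # the affine map the model should learn
model.fit(x, y, epochs=20, verbose=0)
print(model.predict(x, verbose=0).shape)  # (5,) -- input rank is preserved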
+class SubclassModel(training_module.Model): + def __init__(self, name=None): + super().__init__(name=name) + self.d1 = layers_module.Dense(1000) + self.d2 = layers_module.Dense(1000) + self.dropout = layers_module.Dropout(0.1) + + def call(self, inputs, training=None): + x = self.d1(inputs) + x = self.dropout(x, training=training) + return self.d2(x) + + +class TestVariableObjectPathMapping(test_combinations.TestCase): + def test_subclass_model_get_weight_paths(self): + model = SubclassModel() + # Make sure the object path produce nothing when weights are not + # initialized + self.assertEmpty(model.get_weight_paths()) + + model(tf.zeros((10, 10))) + mapping = model.get_weight_paths() + self.assertEqual( + mapping.keys(), {"d1.kernel", "d1.bias", "d2.kernel", "d2.bias"} + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_functional_model_get_weight_paths(self): + inputs = input_layer.Input(shape=(10,)) + x = layers_module.Dense(100, name="d1")(inputs) + output = layers_module.Dense(200, name="d2", activation="softmax")(x) + model = training_module.Model(inputs, output) + mapping = model.get_weight_paths() + self.assertEqual( + mapping.keys(), {"d1.kernel", "d1.bias", "d2.kernel", "d2.bias"} + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_sequential_model_get_weight_paths(self): + model = sequential.Sequential( + [ + layers_module.Dense(100, name="d1", input_shape=(10,)), + layers_module.Dense(200, name="d2", activation="softmax"), + ] + ) + mapping = model.get_weight_paths() + self.assertEqual( + mapping.keys(), {"d1.kernel", "d1.bias", "d2.kernel", "d2.bias"} + ) def _is_oss(): - """Returns whether the test is run under OSS.""" - return len(sys.argv) >= 1 and 'bazel' in sys.argv[0] + """Returns whether the test is run under OSS.""" + return len(sys.argv) >= 1 and "bazel" in sys.argv[0] -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/training_utils.py b/keras/engine/training_utils.py index 617713b543e5..4e298157378b 100644 --- a/keras/engine/training_utils.py +++ b/keras/engine/training_utils.py @@ -14,206 +14,225 @@ # ============================================================================== """Training-related utilities.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np from keras.utils import generic_utils def slice_arrays(arrays, indices, contiguous=True): - """Slices batches out of provided arrays (workaround for eager tensors). - - Unfortunately eager tensors don't have the same slicing behavior as - Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), - hence we cannot use `generic_utils.slice_arrays` directly - and we have to implement this workaround based on `concat`. This has a - performance cost. - - Args: - arrays: Single array or list of arrays. - indices: List of indices in the array that should be included in the output - batch. - contiguous: Boolean flag indicating whether the indices are contiguous. - - Returns: - Slice of data (either single array or list of arrays). - """ - converted_to_list = False - if not isinstance(arrays, list): - converted_to_list = True - arrays = [arrays] - if any(tf.is_tensor(x) for x in arrays): - if not contiguous: - entries = [[x[i:i + 1] for i in indices] for x in arrays] - slices = [tf.concat(x, axis=0) for x in entries] + """Slices batches out of provided arrays (workaround for eager tensors). 
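The workaround `slice_arrays` implements can be shown in isolation: eager tensors slice like symbolic tensors, not like NumPy arrays, so non-contiguous batch indices are gathered by concatenating one-element slices (the contiguous case is a plain `x[indices[0]:indices[-1] + 1]` slice). A sketch, public TF API assumed:

import tensorflow as tf

x = tf.constant([[0.0], [1.0], [2.0], [3.0]])
indices = [0, 2, 3]  # non-contiguous batch indices
batch = tf.concat([x[i:i + 1] for i in indices], axis=0)
print(batch.numpy().ravel())  # [0. 2. 3.]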
+ + Unfortunately eager tensors don't have the same slicing behavior as + Numpy arrays (they follow the same slicing behavior as symbolic TF tensors), + hence we cannot use `generic_utils.slice_arrays` directly + and we have to implement this workaround based on `concat`. This has a + performance cost. + + Args: + arrays: Single array or list of arrays. + indices: List of indices in the array that should be included in the + output batch. + contiguous: Boolean flag indicating whether the indices are contiguous. + + Returns: + Slice of data (either single array or list of arrays). + """ + converted_to_list = False + if not isinstance(arrays, list): + converted_to_list = True + arrays = [arrays] + if any(tf.is_tensor(x) for x in arrays): + if not contiguous: + entries = [[x[i : i + 1] for i in indices] for x in arrays] + slices = [tf.concat(x, axis=0) for x in entries] + else: + slices = [x[indices[0] : indices[-1] + 1] for x in arrays] else: - slices = [x[indices[0]:indices[-1] + 1] for x in arrays] - else: - slices = generic_utils.slice_arrays(arrays, indices) - - if converted_to_list: - slices = slices[0] - return slices - - -def handle_partial_sample_weights(outputs, sample_weights, sample_weight_modes, - check_all_flat=False): - """Adds 1.0 as sample weights for the outputs for which there is no weight. - - Args: - outputs: List of model outputs. - sample_weights: List of sample weight inputs. - sample_weight_modes: List of sample weight modes or None. - check_all_flat: Ensure that inputs are not nested structures. This is not - a free check, so we may not want to run it eagerly every iteration. - - Returns: - Tuple of sample weights, one sample weight for every output, and booleans - describing the raw sample weights. - """ - any_sample_weight = sample_weights is not None and any( - w is not None for w in sample_weights) - partial_sample_weight = any_sample_weight and any( - w is None for w in sample_weights) - - if not any_sample_weight: - return None, any_sample_weight, partial_sample_weight - - if not partial_sample_weight: - return sample_weights, any_sample_weight, partial_sample_weight - - if check_all_flat: - tf.nest.assert_same_structure( - list_to_tuple(sample_weights), - list_to_tuple(tf.nest.flatten(sample_weights))) - tf.nest.assert_same_structure( - list_to_tuple(outputs), - list_to_tuple(tf.nest.flatten(outputs))) - if sample_weight_modes is not None: - tf.nest.assert_same_structure( - sample_weight_modes, tf.nest.flatten(sample_weight_modes)) - - new_sample_weights = [] - for i, sw in enumerate(sample_weights): - if sw is None: - as_numpy = isinstance(outputs[i], np.ndarray) - output = outputs[i] - output_shape = output.shape if as_numpy else tf.shape(output) - - is_temporal = ( - sample_weight_modes is not None and - sample_weight_modes[i] == 'temporal') - sw_shape = (output_shape[0], - output_shape[1]) if is_temporal else (output_shape[0],) - - new_sample_weights.append( - np.ones(sw_shape) if as_numpy else tf.ones(sw_shape)) - + slices = generic_utils.slice_arrays(arrays, indices) + + if converted_to_list: + slices = slices[0] + return slices + + +def handle_partial_sample_weights( + outputs, sample_weights, sample_weight_modes, check_all_flat=False +): + """Adds 1.0 as sample weights for the outputs for which there is no weight. + + Args: + outputs: List of model outputs. + sample_weights: List of sample weight inputs. + sample_weight_modes: List of sample weight modes or None. + check_all_flat: Ensure that inputs are not nested structures. 
This is not + a free check, so we may not want to run it eagerly every iteration. + + Returns: + Tuple of sample weights, one sample weight for every output, and booleans + describing the raw sample weights. + """ + if not isinstance(sample_weights, (list, tuple)): + any_sample_weight = sample_weights is not None + partial_sample_weight = any_sample_weight and sample_weights is None else: - new_sample_weights.append(sw) - return (list_to_tuple(new_sample_weights), - any_sample_weight, partial_sample_weight) + any_sample_weight = sample_weights is not None and any( + w is not None for w in sample_weights + ) + partial_sample_weight = any_sample_weight and any( + w is None for w in sample_weights + ) + + if not any_sample_weight: + return None, any_sample_weight, partial_sample_weight + + if not partial_sample_weight: + return sample_weights, any_sample_weight, partial_sample_weight + + if check_all_flat: + tf.nest.assert_same_structure( + list_to_tuple(sample_weights), + list_to_tuple(tf.nest.flatten(sample_weights)), + ) + tf.nest.assert_same_structure( + list_to_tuple(outputs), list_to_tuple(tf.nest.flatten(outputs)) + ) + if sample_weight_modes is not None: + tf.nest.assert_same_structure( + sample_weight_modes, tf.nest.flatten(sample_weight_modes) + ) + + new_sample_weights = [] + for i, sw in enumerate(sample_weights): + if sw is None: + as_numpy = isinstance(outputs[i], np.ndarray) + output = outputs[i] + output_shape = output.shape if as_numpy else tf.shape(output) + + is_temporal = ( + sample_weight_modes is not None + and sample_weight_modes[i] == "temporal" + ) + sw_shape = ( + (output_shape[0], output_shape[1]) + if is_temporal + else (output_shape[0],) + ) + + new_sample_weights.append( + np.ones(sw_shape) if as_numpy else tf.ones(sw_shape) + ) + + else: + new_sample_weights.append(sw) + return ( + list_to_tuple(new_sample_weights), + any_sample_weight, + partial_sample_weight, + ) class RespectCompiledTrainableState: - """Set and restore trainable state if it has changed since compile. - - The keras API guarantees that the value of each Layer's `trainable` property - at `Model.compile` time will be used when training that model. In order to - respect this requirement, it may be necessary to set the trainable value of - layers to their compile time values before beginning a training endpoint and - restore the values before returning from said endpoint. This scope checks if - any layer's trainable state has changed since Model compile, and performs this - set and un-set bookkeeping. - - However, the trainable state of a layer changes quite infrequently, if ever, - for many kinds of workflows. Moreover, updating every layer in a model is an - expensive operation. As a result, we will only explicitly set and unset the - trainable state of a model if a trainable value has changed since compile. - """ - - def __init__(self, model): - self._model = model - self._current_trainable_state = None - self._compiled_trainable_state = None - self._should_set_trainable = False - - def __enter__(self): - self._current_trainable_state = self._model._get_trainable_state() # pylint: disable=protected-access - self._compiled_trainable_state = self._model._compiled_trainable_state # pylint: disable=protected-access - - # Check to see if any layer's trainable state has changed since `compile`. 
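The fill-with-ones behavior of `handle_partial_sample_weights` above can be summarized with plain NumPy: any output lacking an explicit weight gets an all-ones weight of matching batch shape (and, in "temporal" mode, batch-by-steps shape), so downstream loss code can treat weights as always present. A simplified sketch, non-temporal case only:

import numpy as np

outputs = [np.zeros((4, 3)), np.zeros((4, 5))]
sample_weights = [np.array([1.0, 2.0, 3.0, 4.0]), None]
filled = [
    sw if sw is not None else np.ones(out.shape[:1])  # ones for missing weights
    for out, sw in zip(outputs, sample_weights)
]
print([w.shape for w in filled])  # [(4,), (4,)]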
- for layer, trainable in self._compiled_trainable_state.items(): - if (layer in self._current_trainable_state and - trainable != self._current_trainable_state[layer]): - self._should_set_trainable = True - break - - # If so, restore the model to its compiled state. - if self._should_set_trainable: - self._model._set_trainable_state(self._compiled_trainable_state) # pylint: disable=protected-access - - def __exit__(self, type_arg, value_arg, traceback_arg): - # If we set the values to their compiled state in __enter__, we need to - # restore the original values before leaving the scope. - if self._should_set_trainable: - self._model._set_trainable_state(self._current_trainable_state) # pylint: disable=protected-access - return False # False values do not suppress exceptions + """Set and restore trainable state if it has changed since compile. + + The keras API guarantees that the value of each Layer's `trainable` property + at `Model.compile` time will be used when training that model. In order to + respect this requirement, it may be necessary to set the trainable value of + layers to their compile time values before beginning a training endpoint and + restore the values before returning from said endpoint. This scope checks if + any layer's trainable state has changed since Model compile, and performs + this set and un-set bookkeeping. + + However, the trainable state of a layer changes quite infrequently, if ever, + for many kinds of workflows. Moreover, updating every layer in a model is an + expensive operation. As a result, we will only explicitly set and unset the + trainable state of a model if a trainable value has changed since compile. + """ + + def __init__(self, model): + self._model = model + self._current_trainable_state = None + self._compiled_trainable_state = None + self._should_set_trainable = False + + def __enter__(self): + self._current_trainable_state = self._model._get_trainable_state() + self._compiled_trainable_state = self._model._compiled_trainable_state + + # Check to see if any layer's trainable state has changed since + # `compile`. + for layer, trainable in self._compiled_trainable_state.items(): + if ( + layer in self._current_trainable_state + and trainable != self._current_trainable_state[layer] + ): + self._should_set_trainable = True + break + + # If so, restore the model to its compiled state. + if self._should_set_trainable: + self._model._set_trainable_state(self._compiled_trainable_state) + + def __exit__(self, type_arg, value_arg, traceback_arg): + # If we set the values to their compiled state in __enter__, we need to + # restore the original values before leaving the scope. + if self._should_set_trainable: + self._model._set_trainable_state(self._current_trainable_state) + return False # False values do not suppress exceptions # Allow use of methods not exposed to the user. -# pylint: disable=protected-access -def get_input_shape_and_dtype(layer): - """Retrieves input shape and input dtype of layer if applicable. - Args: - layer: Layer (or model) instance. - Returns: - Tuple (input_shape, input_dtype). Both could be None if the layer - does not have a defined input shape. +def get_input_shape_and_dtype(layer): + """Retrieves input shape and input dtype of layer if applicable. - Raises: - ValueError: in case an empty Sequential or Functional model is passed. - """ + Args: + layer: Layer (or model) instance. 
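The set-and-restore bookkeeping of `RespectCompiledTrainableState` can be sketched as a plain context manager. This is a hypothetical, simplified helper (`respect_trainable`, top-level layers only), not the Keras implementation: snapshot the live flags, force the compile-time values only when something actually differs, then restore on exit.

import contextlib


@contextlib.contextmanager
def respect_trainable(model, compiled_state):
    # Snapshot the live flags and compare against the compile-time ones.
    current = {layer: layer.trainable for layer in model.layers}
    changed = any(
        layer in current and current[layer] != flag
        for layer, flag in compiled_state.items()
    )
    try:
        if changed:  # only touch layers when a flag actually differs
            for layer, flag in compiled_state.items():
                layer.trainable = flag
        yield
    finally:
        if changed:  # restore the pre-existing flags on the way out
            for layer, flag in current.items():
                layer.trainable = flag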
- def _is_graph_model(layer): - return ((hasattr(layer, '_is_graph_network') and layer._is_graph_network) or - layer.__class__.__name__ == 'Sequential') + Returns: + Tuple (input_shape, input_dtype). Both could be None if the layer + does not have a defined input shape. - # In case of nested models: recover the first layer - # of the deepest model to infer input shape and dtype. - # Subclassed Models may not have been built so can't be checked. - while _is_graph_model(layer): - if not layer.layers: - raise ValueError('An empty Model cannot be used as a Layer.') - layer = layer.layers[0] + Raises: + ValueError: in case an empty Sequential or Functional model is passed. + """ - if getattr(layer, '_batch_input_shape', None): - return layer._batch_input_shape, layer.dtype - return None, None + def _is_graph_model(layer): + return ( + hasattr(layer, "_is_graph_network") and layer._is_graph_network + ) or layer.__class__.__name__ == "Sequential" + # In case of nested models: recover the first layer + # of the deepest model to infer input shape and dtype. + # Subclassed Models may not have been built so can't be checked. + while _is_graph_model(layer): + if not layer.layers: + raise ValueError("An empty Model cannot be used as a Layer.") + layer = layer.layers[0] -# pylint: enable=protected-access + if getattr(layer, "_batch_input_shape", None): + return layer._batch_input_shape, layer.dtype + return None, None def get_static_batch_size(layer): - """Gets the static batch size of a Layer. + """Gets the static batch size of a Layer. - Args: - layer: a `Layer` instance. + Args: + layer: a `Layer` instance. - Returns: - The static batch size of a Layer. - """ - batch_input_shape, _ = get_input_shape_and_dtype(layer) - if batch_input_shape is not None: - return tf.compat.v1.Dimension(batch_input_shape[0]).value - return None + Returns: + The static batch size of a Layer. + """ + batch_input_shape, _ = get_input_shape_and_dtype(layer) + if batch_input_shape is not None: + return tf.compat.v1.Dimension(batch_input_shape[0]).value + return None def list_to_tuple(maybe_list): - """Datasets will stack the list of tensor, so switch them to tuples.""" - if isinstance(maybe_list, list): - return tuple(maybe_list) - return maybe_list + """Datasets will stack the list of tensor, so switch them to tuples.""" + if isinstance(maybe_list, list): + return tuple(maybe_list) + return maybe_list diff --git a/keras/engine/training_utils_v1.py b/keras/engine/training_utils_v1.py index 371e86b027e9..48cfdd4c02f3 100644 --- a/keras/engine/training_utils_v1.py +++ b/keras/engine/training_utils_v1.py @@ -14,8 +14,6 @@ # ============================================================================== """Training-related utilities.""" -import tensorflow.compat.v2 as tf - import abc import atexit import collections @@ -25,6 +23,8 @@ import time import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import callbacks as cbks from keras import losses @@ -33,255 +33,281 @@ from keras.utils import generic_utils from keras.utils import losses_utils from keras.utils import tf_inspect + +# isort: off from tensorflow.python.platform import tf_logging as logging def is_composite_or_composite_value(tensor): - """Returns true if 'tensor' is a CompositeTensor or a CT Value object.""" - # TODO(b/125094323): This should be isinstance(CompositeTensor) or - # isinstance(CompositeTensorValue) once we support that. 
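The reason `list_to_tuple` above exists can be demonstrated directly with `tf.data` (public TF API assumed): a list is converted into a single stacked tensor, while a tuple is kept as separate components, so list-structured data must be switched to tuples before building a dataset.

import tensorflow as tf

a = tf.zeros((4, 2))
b = tf.ones((4, 2))
as_list = tf.data.Dataset.from_tensor_slices([a, b])   # stacked into one tensor
as_tuple = tf.data.Dataset.from_tensor_slices((a, b))  # kept as two components
print(as_list.element_spec)   # TensorSpec(shape=(4, 2), dtype=tf.float32, ...)
print(as_tuple.element_spec)  # (TensorSpec(shape=(2,), ...), TensorSpec(shape=(2,), ...))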
- return isinstance( - tensor, - (tf.__internal__.CompositeTensor, tf.compat.v1.SparseTensorValue, - tf.compat.v1.ragged.RaggedTensorValue)) + """Returns true if 'tensor' is a CompositeTensor or a CT Value object.""" + # TODO(b/125094323): This should be isinstance(CompositeTensor) or + # isinstance(CompositeTensorValue) once we support that. + return isinstance( + tensor, + ( + tf.__internal__.CompositeTensor, + tf.compat.v1.SparseTensorValue, + tf.compat.v1.ragged.RaggedTensorValue, + ), + ) class Aggregator(object, metaclass=abc.ABCMeta): - """Abstract base class used to aggregate batch-level outputs of a loop. - - Attributes: - use_steps: Whether the loop is using `step` or `batch_size`. - num_samples: Total number of samples: `batch_size * num_batches`. - steps: Total number of steps. - batch_size: Batch size. It is used for validation checks between inputs and - outputs. - results: What to return at the end of the aggregation loop. - """ - - def __init__(self, use_steps, num_samples=None, steps=None, batch_size=None): - self.use_steps = use_steps - self.num_samples = num_samples - self.steps = steps - self.batch_size = batch_size - self.results = [] - - @abc.abstractmethod - def create(self, batch_outs): - """Creates the initial results from the first batch outputs. - - Args: - batch_outs: A list of batch-level outputs. + """Abstract base class used to aggregate batch-level outputs of a loop. + + Attributes: + use_steps: Whether the loop is using `step` or `batch_size`. + num_samples: Total number of samples: `batch_size * num_batches`. + steps: Total number of steps. + batch_size: Batch size. It is used for validation checks between inputs + and outputs. + results: What to return at the end of the aggregation loop. """ - raise NotImplementedError('Must be implemented in subclasses.') - @abc.abstractmethod - def aggregate(self, batch_outs, batch_start=None, batch_end=None): - """Aggregates batch-level results into total results. + def __init__( + self, use_steps, num_samples=None, steps=None, batch_size=None + ): + self.use_steps = use_steps + self.num_samples = num_samples + self.steps = steps + self.batch_size = batch_size + self.results = [] + + @abc.abstractmethod + def create(self, batch_outs): + """Creates the initial results from the first batch outputs. + + Args: + batch_outs: A list of batch-level outputs. + """ + raise NotImplementedError("Must be implemented in subclasses.") + + @abc.abstractmethod + def aggregate(self, batch_outs, batch_start=None, batch_end=None): + """Aggregates batch-level results into total results. + + Args: + batch_outs: A list of batch-level outputs. + batch_start: The start index of this batch. Always `None` if + `use_steps` is `True`. + batch_end: The end index of this batch. Always `None` if `use_steps` + is `True`. + """ + raise NotImplementedError("Must be implemented in subclasses.") + + @abc.abstractmethod + def finalize(self): + """Prepares the total results to be returned.""" + raise NotImplementedError("Must be implemented in subclasses.") - Args: - batch_outs: A list of batch-level outputs. - batch_start: The start index of this batch. Always `None` if `use_steps` - is `True`. - batch_end: The end index of this batch. Always `None` if `use_steps` is - `True`. 
- """ - raise NotImplementedError('Must be implemented in subclasses.') - @abc.abstractmethod - def finalize(self): - """Prepares the total results to be returned.""" - raise NotImplementedError('Must be implemented in subclasses.') +class MetricsAggregator(Aggregator): + """Aggregator that calculates loss and metrics info. + Attributes: + use_steps: Whether the loop is using `step` or `batch_size`. + num_samples: Total number of samples: `batch_size*num_batches`. + steps: Total number of steps, ie number of times to iterate over a dataset + to cover all samples. + """ -class MetricsAggregator(Aggregator): - """Aggregator that calculates loss and metrics info. - - Attributes: - use_steps: Whether the loop is using `step` or `batch_size`. - num_samples: Total number of samples: `batch_size*num_batches`. - steps: Total number of steps, ie number of times to iterate over a dataset - to cover all samples. - """ - - def __init__(self, use_steps, num_samples=None, steps=None): - super().__init__( - use_steps=use_steps, - num_samples=num_samples, - steps=steps, - batch_size=None) - - def create(self, batch_outs): - self.results = [0.] * len(batch_outs) - - def aggregate(self, batch_outs, batch_start=None, batch_end=None): - # Loss. - if self.use_steps: - self.results[0] += batch_outs[0] - else: - self.results[0] += batch_outs[0] * (batch_end - batch_start) - # Metrics (always stateful, just grab current values.) - self.results[1:] = batch_outs[1:] + def __init__(self, use_steps, num_samples=None, steps=None): + super().__init__( + use_steps=use_steps, + num_samples=num_samples, + steps=steps, + batch_size=None, + ) + + def create(self, batch_outs): + self.results = [0.0] * len(batch_outs) + + def aggregate(self, batch_outs, batch_start=None, batch_end=None): + # Loss. + if self.use_steps: + self.results[0] += batch_outs[0] + else: + self.results[0] += batch_outs[0] * (batch_end - batch_start) + # Metrics (always stateful, just grab current values.) + self.results[1:] = batch_outs[1:] - def finalize(self): - if not self.results: - raise ValueError('Empty training data.') - self.results[0] /= (self.num_samples or self.steps) + def finalize(self): + if not self.results: + raise ValueError("Empty training data.") + self.results[0] /= self.num_samples or self.steps def _append_sparse_tensor_value(target, to_append): - """Append sparse tensor value objects.""" - # Make sure the sparse tensors are of the same size (except for the 0th dim). - if len(target.dense_shape) != len(to_append.dense_shape): - raise RuntimeError( - 'Unable to concatenate %s and %s. The inner dense shapes do not ' - 'have the same number of dimensions (%s vs %s)' % - (target, to_append, target.dense_shape, to_append.dense_shape)) - - if target.dense_shape[1:] != to_append.dense_shape[1:]: - raise RuntimeError( - 'Unable to concatenate %s and %s. The inner dense shapes do not ' - 'match inner dimensions (%s vs %s)' % - (target, to_append, target.dense_shape[1:], to_append.dense_shape[1:])) - - # Add the to_append indices to target, updating the 0th value, and keeping - # track of the maximum so we know the final dense_shape of this tensor. - base_dim0_value = target.dense_shape[0] - max_dim0_value = target.dense_shape[0] - new_indices = target.indices - for index in to_append.indices: - # Here, we iterate through the sparse indices of the tensor to append. 
For - # each index, we update its zeroth value (the batch index) by adding the - # number of batch items in the tensor we are appending to (so an index - # of [0, 0, 1] for a value that is being appended to a tensor with 0th dim - # size 3 would become [3, 0, 1].) - index[0] += base_dim0_value - max_dim0_value = max(max_dim0_value, index[0]) - new_indices = np.append(new_indices, [index], axis=0) - - # Extend the values array to contain all of the appended values. These will - # be in the same order as the indices added above. - new_values = np.concatenate((target.values, to_append.values), axis=0) - - # Create a new dense shape by replacing the value for the 0th dimension - # with the new max dim0 value. - new_dense_shape = list(target.dense_shape) - new_dense_shape[0] = max_dim0_value + 1 - new_dense_shape = tuple(new_dense_shape) - - return tf.compat.v1.SparseTensorValue( - indices=new_indices, values=new_values, dense_shape=new_dense_shape) + """Append sparse tensor value objects.""" + # Make sure the sparse tensors are of the same size (except for the 0th + # dim). + if len(target.dense_shape) != len(to_append.dense_shape): + raise RuntimeError( + "Unable to concatenate %s and %s. The inner dense shapes do not " + "have the same number of dimensions (%s vs %s)" + % (target, to_append, target.dense_shape, to_append.dense_shape) + ) + + if target.dense_shape[1:] != to_append.dense_shape[1:]: + raise RuntimeError( + "Unable to concatenate %s and %s. The inner dense shapes do not " + "match inner dimensions (%s vs %s)" + % ( + target, + to_append, + target.dense_shape[1:], + to_append.dense_shape[1:], + ) + ) + + # Add the to_append indices to target, updating the 0th value, and keeping + # track of the maximum so we know the final dense_shape of this tensor. + base_dim0_value = target.dense_shape[0] + max_dim0_value = target.dense_shape[0] + new_indices = target.indices + for index in to_append.indices: + # Here, we iterate through the sparse indices of the tensor to append. + # For each index, we update its zeroth value (the batch index) by adding + # the number of batch items in the tensor we are appending to (so an + # index of [0, 0, 1] for a value that is being appended to a tensor with + # 0th dim size 3 would become [3, 0, 1].) + index[0] += base_dim0_value + max_dim0_value = max(max_dim0_value, index[0]) + new_indices = np.append(new_indices, [index], axis=0) + + # Extend the values array to contain all of the appended values. These will + # be in the same order as the indices added above. + new_values = np.concatenate((target.values, to_append.values), axis=0) + + # Create a new dense shape by replacing the value for the 0th dimension + # with the new max dim0 value. + new_dense_shape = list(target.dense_shape) + new_dense_shape[0] = max_dim0_value + 1 + new_dense_shape = tuple(new_dense_shape) + + return tf.compat.v1.SparseTensorValue( + indices=new_indices, values=new_values, dense_shape=new_dense_shape + ) def _append_ragged_tensor_value(target, to_append): - """Append ragged tensor value objects.""" - # Make sure the ragged tensors are of the same size (save for the 0th dim). 
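The index bookkeeping `_append_sparse_tensor_value` performs by hand for V1 value objects is exactly what `tf.sparse.concat` does in eager mode: shift the appended tensor's batch indices by the target's leading dimension. A sketch, public TF API assumed:

import tensorflow as tf

a = tf.SparseTensor(indices=[[0, 0]], values=[1.0], dense_shape=[1, 3])
b = tf.SparseTensor(indices=[[0, 1]], values=[2.0], dense_shape=[2, 3])
ab = tf.sparse.concat(axis=0, sp_inputs=[a, b])
# b's batch index 0 is shifted by a's leading dim (1), giving [[0, 0], [1, 1]].
print(ab.indices.numpy().tolist())      # [[0, 0], [1, 1]]
print(ab.dense_shape.numpy().tolist())  # [3, 3]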
- if len(target.shape) != len(to_append.shape): - raise RuntimeError('Unable to concatenate %s and %s' % (target, to_append)) - - if target.shape[1:] != to_append.shape[1:]: - raise RuntimeError('Unable to concatenate %s and %s' % (target, to_append)) - - adjusted_row_splits = to_append.row_splits[1:] + target.row_splits[-1] - new_row_splits = np.append(target.row_splits, adjusted_row_splits) - if isinstance(target.values, tf.compat.v1.ragged.RaggedTensorValue): - new_values = _append_ragged_tensor_value(target.values, to_append.values) - else: - new_values = np.concatenate((target.values, to_append.values), axis=0) + """Append ragged tensor value objects.""" + # Make sure the ragged tensors are of the same size (save for the 0th dim). + if len(target.shape) != len(to_append.shape): + raise RuntimeError(f"Unable to concatenate {target} and {to_append}") + + if target.shape[1:] != to_append.shape[1:]: + raise RuntimeError(f"Unable to concatenate {target} and {to_append}") + + adjusted_row_splits = to_append.row_splits[1:] + target.row_splits[-1] + new_row_splits = np.append(target.row_splits, adjusted_row_splits) + if isinstance(target.values, tf.compat.v1.ragged.RaggedTensorValue): + new_values = _append_ragged_tensor_value( + target.values, to_append.values + ) + else: + new_values = np.concatenate((target.values, to_append.values), axis=0) - return tf.compat.v1.ragged.RaggedTensorValue(new_values, new_row_splits) + return tf.compat.v1.ragged.RaggedTensorValue(new_values, new_row_splits) def _append_composite_tensor(target, to_append): - """Helper function to append composite tensors to each other in the 0 axis. - - In order to support batching within a fit/evaluate/predict call, we need - to be able to aggregate within a CompositeTensor. Unfortunately, the CT - API currently does not make this easy - especially in V1 mode, where we're - working with CompositeTensor Value objects that have no connection with the - CompositeTensors that created them. - - Args: - target: CompositeTensor or CompositeTensor value object that will be - appended to. - to_append: CompositeTensor or CompositeTensor value object to append to. - 'target'. - - Returns: - A CompositeTensor or CompositeTensor value object. - - Raises: - RuntimeError: if concatenation is not possible. - """ - if type(target) is not type(to_append): - raise RuntimeError('Unable to concatenate %s and %s' % - (type(target), type(to_append))) - - # Perform type-specific concatenation. - # TODO(b/125094323): This should be replaced by a simple call to - # target.append() that should work on all of the below classes. - - # If we're seeing a CompositeTensor here, we know it's because we're in - # Eager mode (or else we'd have evaluated the CT to a CT Value object - # already). Therefore, it's safe to call concat() on it without evaluating - # the result any further. If not - that is, if we're seeing a - # SparseTensorValue or a RaggedTensorValue - we need to hand-update it - # since we're outside of the graph anyways. - if isinstance(target, tf.SparseTensor): - # We need to invoke the sparse version of concatenate here - tf.concat - # won't work. 
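The row_splits arithmetic in `_append_ragged_tensor_value` above reduces to a shift-and-append, mirrored here with plain NumPy: the appended tensor's splits (minus the leading 0) are offset by the target's last split.

import numpy as np

target_splits = np.array([0, 2, 3])      # rows [x0 x1], [x2]
to_append_splits = np.array([0, 1, 4])   # rows [y0], [y1 y2 y3]
shifted = to_append_splits[1:] + target_splits[-1]  # [4, 7]
new_splits = np.append(target_splits, shifted)
print(new_splits.tolist())  # [0, 2, 3, 4, 7] -- five rows total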
- return tf.compat.v1.sparse_concat(sp_inputs=[target, to_append], axis=0) - elif isinstance(target, tf.RaggedTensor): - return tf.concat([target, to_append], axis=0) - elif isinstance(target, tf.compat.v1.SparseTensorValue): - return _append_sparse_tensor_value(target, to_append) - elif isinstance(target, tf.compat.v1.ragged.RaggedTensorValue): - return _append_ragged_tensor_value(target, to_append) - else: - raise RuntimeError('Attempted to concatenate unsupported object %s.' % - type(target)) + """Helper function to append composite tensors to each other in the 0 axis. + In order to support batching within a fit/evaluate/predict call, we need + to be able to aggregate within a CompositeTensor. Unfortunately, the CT + API currently does not make this easy - especially in V1 mode, where we're + working with CompositeTensor Value objects that have no connection with the + CompositeTensors that created them. -class ConcatAggregator(Aggregator): - """Combine tensor-likes which cannot be merged on the fly. - - This class expects to aggregate a single tensor-like rather than a nested - structure of tensor-likes. - """ - - def __init__(self, batch_size): - self.composite = None - super().__init__( - use_steps=True, num_samples=None, steps=None, batch_size=batch_size) - - def create(self, batch_element): - self.composite = is_composite_or_composite_value(batch_element) - - def aggregate(self, batch_element, batch_start=None, batch_end=None): - - # TODO(psv): Add num_samples check here to detect when output batch - # #samples is < batch size and != input batch #samples. - if self.batch_size and self.batch_size < batch_element.shape[0]: - raise ValueError( - 'Mismatch between expected batch size and model output batch size. ' - 'Output shape = {}, expected output shape = shape {}'.format( - batch_element.shape, - (self.batch_size,) + batch_element.shape[1:])) - self.results.append(batch_element) - - def finalize(self): - # Special case of single batch inference which skips a copy. - if len(self.results) == 1: - self.results = self.results[0] - - elif self.composite: - # TODO(taylorrobie): efficiently concatenate. - results = self.results[0] - for r in self.results[1:]: - results = _append_composite_tensor(results, r) - self.results = results + Args: + target: CompositeTensor or CompositeTensor value object that will be + appended to. + to_append: CompositeTensor or CompositeTensor value object to append to. + 'target'. + Returns: + A CompositeTensor or CompositeTensor value object. + + Raises: + RuntimeError: if concatenation is not possible. + """ + if type(target) is not type(to_append): + raise RuntimeError( + f"Unable to concatenate {type(target)} and {type(to_append)}" + ) + + # Perform type-specific concatenation. + # TODO(b/125094323): This should be replaced by a simple call to + # target.append() that should work on all of the below classes. + + # If we're seeing a CompositeTensor here, we know it's because we're in + # Eager mode (or else we'd have evaluated the CT to a CT Value object + # already). Therefore, it's safe to call concat() on it without evaluating + # the result any further. If not - that is, if we're seeing a + # SparseTensorValue or a RaggedTensorValue - we need to hand-update it + # since we're outside of the graph anyways. + if isinstance(target, tf.SparseTensor): + # We need to invoke the sparse version of concatenate here - tf.concat + # won't work. 
+ return tf.compat.v1.sparse_concat(sp_inputs=[target, to_append], axis=0) + elif isinstance(target, tf.RaggedTensor): + return tf.concat([target, to_append], axis=0) + elif isinstance(target, tf.compat.v1.SparseTensorValue): + return _append_sparse_tensor_value(target, to_append) + elif isinstance(target, tf.compat.v1.ragged.RaggedTensorValue): + return _append_ragged_tensor_value(target, to_append) else: - self.results = np.concatenate(self.results, axis=0) + raise RuntimeError( + f"Attempted to concatenate unsupported object {type(target)}." + ) + + +class ConcatAggregator(Aggregator): + """Combine tensor-likes which cannot be merged on the fly. + + This class expects to aggregate a single tensor-like rather than a nested + structure of tensor-likes. + """ + + def __init__(self, batch_size): + self.composite = None + super().__init__( + use_steps=True, num_samples=None, steps=None, batch_size=batch_size + ) + + def create(self, batch_element): + self.composite = is_composite_or_composite_value(batch_element) + + def aggregate(self, batch_element, batch_start=None, batch_end=None): + + # TODO(psv): Add num_samples check here to detect when output batch + # #samples is < batch size and != input batch #samples. + if self.batch_size and self.batch_size < batch_element.shape[0]: + raise ValueError( + "Mismatch between expected batch size and model output batch " + "size. Output shape = {}, " + "expected output shape = shape {}".format( + batch_element.shape, + (self.batch_size,) + batch_element.shape[1:], + ) + ) + self.results.append(batch_element) + + def finalize(self): + # Special case of single batch inference which skips a copy. + if len(self.results) == 1: + self.results = self.results[0] + + elif self.composite: + # TODO(taylorrobie): efficiently concatenate. + results = self.results[0] + for r in self.results[1:]: + results = _append_composite_tensor(results, r) + self.results = results + + else: + self.results = np.concatenate(self.results, axis=0) _COPY_THREADS = 4 @@ -289,1652 +315,1912 @@ def finalize(self): def get_copy_pool(): - """Shared threadpool for copying arrays. + """Shared threadpool for copying arrays. - Pool instantiation takes ~ 2ms, so a singleton pool is used rather than - creating a pool per SliceAggregator. + Pool instantiation takes ~ 2ms, so a singleton pool is used rather than + creating a pool per SliceAggregator. - Returns: - The global copy threadpool. - """ - global _COPY_POOL - if _COPY_POOL is None: - _COPY_POOL = multiprocessing.pool.ThreadPool(_COPY_THREADS) - atexit.register(_COPY_POOL.close) - return _COPY_POOL + Returns: + The global copy threadpool. + """ + global _COPY_POOL + if _COPY_POOL is None: + _COPY_POOL = multiprocessing.pool.ThreadPool(_COPY_THREADS) + atexit.register(_COPY_POOL.close) + return _COPY_POOL class SliceAggregator(Aggregator): - """Combine arrays where the final size is known. - - This class expects to aggregate a single tensor-like rather than a nested - structure of tensor-likes. - - NumPy copies are an operation that threads handle quite well because all of - the heavy lifting is in c and does not need the GIL. Moreover, we can perform - lock-free writes to the same buffer in multiple threads because the nature of - result aggregation guarantees that either the indices are disjoint or the - aggregator will throw an exception in finalize. Moreover, because aggregation - is performed on the slowest varying dimension, assignments for a given batch - will write to contiguous blocks of memory, further minimizing contention. 
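# The heart of ConcatAggregator.finalize() for plain ndarrays, restated as a
# self-contained sketch (the real class routes composite values through
# _append_composite_tensor and skips the copy when only one batch was seen).
import numpy as np

results = []                                    # stands in for self.results
for batch in (np.ones((32, 4)), np.ones((32, 4)), np.ones((16, 4))):
    results.append(batch)                       # aggregate(): collect batches
final = results[0] if len(results) == 1 else np.concatenate(results, axis=0)
print(final.shape)                              # (80, 4)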
- - There is, however, some scheduling and context switching overhead which will - offset the gains from pipelining the slice assignment. Below a given threshold - it is faster to simply assign in the main thread rather than enqueue the - assignment in a side thread. The exact threshold will vary from system to - system, but the time is not very sensitive to the exact transition so a value - of 2 ** 14 was chosen which should be reasonable on most systems. - """ - - _BINARY_SIZE_THRESHOLD = 2 ** 14 - _MAX_COPY_SECONDS = 300 - - def __init__(self, num_samples, batch_size): - self._async_copies = [] - self._pool = get_copy_pool() - self._errors = [] - super().__init__( - use_steps=False, - num_samples=num_samples, - steps=None, - batch_size=batch_size) - - def create(self, batch_element): - # This step does not need to be pipelined because NumPy empty array - # initialization is effectively instantaneous. - shape = (self.num_samples,) + batch_element.shape[1:] - dtype = batch_element.dtype - - self.results = np.empty(shape=shape, dtype=dtype) - - def aggregate(self, batch_element, batch_start, batch_end): - # Fail early. - if self._errors: - raise self._errors[0] + """Combine arrays where the final size is known. + + This class expects to aggregate a single tensor-like rather than a nested + structure of tensor-likes. + + NumPy copies are an operation that threads handle quite well because all of + the heavy lifting is in c and does not need the GIL. Moreover, we can + perform lock-free writes to the same buffer in multiple threads because the + nature of result aggregation guarantees that either the indices are disjoint + or the aggregator will throw an exception in finalize. Moreover, because + aggregation is performed on the slowest varying dimension, assignments for a + given batch will write to contiguous blocks of memory, further minimizing + contention. + + There is, however, some scheduling and context switching overhead which will + offset the gains from pipelining the slice assignment. Below a given + threshold it is faster to simply assign in the main thread rather than + enqueue the assignment in a side thread. The exact threshold will vary from + system to system, but the time is not very sensitive to the exact transition + so a value of 2 ** 14 was chosen which should be reasonable on most systems. + """ - # In the special case of single batch inference, no copy is needed. - if batch_end - batch_start == self.num_samples: - if self.num_samples != batch_element.shape[0]: - raise ValueError( - 'Mismatch between expected batch size and model output batch size. ' - 'Output shape = {}, expected output shape = shape {}'.format( - batch_element.shape, self.results.shape)) - - self.results = batch_element - return - - # This is an approximate threshold, so we don't need to consider the number - # of bytes per element. 
-    num_elements = np.prod(batch_element.shape)
-    if num_elements < self._BINARY_SIZE_THRESHOLD:
-      self.results[batch_start:batch_end] = batch_element
-    else:
-      is_finished = threading.Event()
-      self._pool.apply_async(
-          self._slice_assign,
-          args=(batch_element, batch_start, batch_end, is_finished))
-      self._async_copies.append(is_finished)
-
-  def _slice_assign(self, batch_element, batch_start, batch_end, is_finished):
-    """Legacy utility method to slice input arrays."""
-    try:
-      self.results[batch_start:batch_end] = batch_element
+    _BINARY_SIZE_THRESHOLD = 2**14
+    _MAX_COPY_SECONDS = 300
+
+    def __init__(self, num_samples, batch_size):
+        self._async_copies = []
+        self._pool = get_copy_pool()
+        self._errors = []
+        super().__init__(
+            use_steps=False,
+            num_samples=num_samples,
+            steps=None,
+            batch_size=batch_size,
+        )
+
+    def create(self, batch_element):
+        # This step does not need to be pipelined because NumPy empty array
+        # initialization is effectively instantaneous.
+        shape = (self.num_samples,) + batch_element.shape[1:]
+        dtype = batch_element.dtype
+
+        self.results = np.empty(shape=shape, dtype=dtype)
+
+    def aggregate(self, batch_element, batch_start, batch_end):
+        # Fail early.
+        if self._errors:
+            raise self._errors[0]
+
+        # In the special case of single batch inference, no copy is needed.
+        if batch_end - batch_start == self.num_samples:
+            if self.num_samples != batch_element.shape[0]:
+                raise ValueError(
+                    "Mismatch between expected batch size and model "
+                    "output batch size. Output shape = {}, "
+                    "expected output shape = shape {}".format(
+                        batch_element.shape, self.results.shape
+                    )
+                )
+
+            self.results = batch_element
+            return
+
+        # This is an approximate threshold, so we don't need to consider the
+        # number of bytes per element.
+        num_elements = np.prod(batch_element.shape)
+        if num_elements < self._BINARY_SIZE_THRESHOLD:
+            self.results[batch_start:batch_end] = batch_element
+        else:
+            is_finished = threading.Event()
+            self._pool.apply_async(
+                self._slice_assign,
+                args=(batch_element, batch_start, batch_end, is_finished),
+            )
+            self._async_copies.append(is_finished)
+
+    def _slice_assign(self, batch_element, batch_start, batch_end, is_finished):
+        """Legacy utility method to slice input arrays."""
+        try:
+            self.results[batch_start:batch_end] = batch_element
+
+        except Exception as e:
+            # `_slice_assign` should only be called in threads and exceptions
+            # raised in threads do not carry over to the main thread. So
+            # instead we perform a broad catch in the thread and then store
+            # the exception to be re-raised in the main thread.
+            self._errors.append(e)
+
+        finally:
+            is_finished.set()
+
+    def finalize(self):
+        start_time = time.time()
+        for is_finished in self._async_copies:
+            timeout = max(
+                [0.0, self._MAX_COPY_SECONDS - (time.time() - start_time)]
+            )
+            if not is_finished.wait(timeout):
+                raise ValueError("Timed out waiting for copy to complete.")
+
+        if self._errors:
+            raise self._errors[0]

-    except Exception as e:  # pylint: disable=broad-except
-      # `_slice_assign` should only be called in threads and exceptions raised
-      # in threads do not carry over to the main thread. So instead we perform a
-      # a broad catch in the thread and then store the exception to be re-raised
-      # in the main thread.
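# A stripped-down sketch of the pipelined slice assignment above: small
# batches are copied inline, large ones are handed to the shared thread
# pool with an Event per copy so finalize() can wait. Sizes here are toy
# values; the real cutoff is _BINARY_SIZE_THRESHOLD = 2 ** 14 elements.
import multiprocessing.pool
import threading
import numpy as np

pool = multiprocessing.pool.ThreadPool(4)
buf = np.empty((6, 2), dtype=np.float64)        # create(): preallocate once
pending = []

def slice_assign(batch, start, end, finished):
    buf[start:end] = batch                      # disjoint ranges: lock-free
    finished.set()

for start in (0, 3):
    batch = np.full((3, 2), float(start))
    event = threading.Event()
    pool.apply_async(slice_assign, args=(batch, start, start + 3, event))
    pending.append(event)

for event in pending:                           # finalize(): wait for copies
    assert event.wait(300)
print(buf[:, 0].tolist())                       # [0.0, 0.0, 0.0, 3.0, 3.0, 3.0]
pool.close()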
- self._errors.append(e) - finally: - is_finished.set() +class OutputsAggregator(Aggregator): + """Aggregator that concatenates outputs.""" + + _structure = None + + def create(self, batch_outs): + # SparseTensorValue is a named tuple which nest will flatten, so we need + # to guard it to properly handle the structure. + self._structure = tf.__internal__.nest.get_traverse_shallow_structure( + lambda x: not is_composite_or_composite_value(x), batch_outs + ) + batch_outs = tf.__internal__.nest.flatten_up_to( + self._structure, batch_outs + ) + + for batch_element in batch_outs: + if is_composite_or_composite_value(batch_element): + # If the output is not a ndarray, it will be either a composite + # tensor or a composite tensor's Value object. In either case, + # we can't allocate an array to hold the object - we'll handle + # it later. + self.results.append(ConcatAggregator(self.batch_size)) + elif isinstance(batch_element, np.ndarray): + self.results.append( + ( + ConcatAggregator(self.batch_size) + if self.use_steps + else SliceAggregator(self.num_samples, self.batch_size) + ) + ) + else: + # This is not a ndarray, a CompositeTensor, or a + # CompositeTensorValue. Fail fast rather than trying to + # concatenate it. + raise RuntimeError( + "Attempted to aggregate unsupported object {}.".format( + batch_element + ) + ) + + self.results[-1].create(batch_element) + + def aggregate(self, batch_outs, batch_start=None, batch_end=None): + batch_outs = tf.__internal__.nest.flatten_up_to( + self._structure, batch_outs + ) + for batch_element, result in zip(batch_outs, self.results): + result.aggregate(batch_element, batch_start, batch_end) + + def finalize(self): + for result in self.results: + result.finalize() + self.results = [i.results for i in self.results] + self.results = tf.nest.pack_sequence_as(self._structure, self.results) - def finalize(self): - start_time = time.time() - for is_finished in self._async_copies: - timeout = max([0., self._MAX_COPY_SECONDS - (time.time() - start_time)]) - if not is_finished.wait(timeout): - raise ValueError('Timed out waiting for copy to complete.') - if self._errors: - raise self._errors[0] +def get_progbar(model, count_mode, include_metrics=True): + """Get Progbar.""" + if include_metrics: + stateful_metric_names = getattr(model, "metrics_names", None) + if stateful_metric_names: + stateful_metric_names = stateful_metric_names[1:] # Exclude `loss` + else: + stateful_metric_names = None + return cbks.ProgbarLogger( + count_mode, stateful_metrics=stateful_metric_names + ) -class OutputsAggregator(Aggregator): - """Aggregator that concatenates outputs.""" - - _structure = None - - def create(self, batch_outs): - # SparseTensorValue is a named tuple which nest will flatten, so we need - # to guard it to properly handle the structure. - self._structure = tf.__internal__.nest.get_traverse_shallow_structure( - lambda x: not is_composite_or_composite_value(x), batch_outs) - batch_outs = tf.__internal__.nest.flatten_up_to(self._structure, batch_outs) - - for batch_element in batch_outs: - if is_composite_or_composite_value(batch_element): - # If the output is not a ndarray, it will be either a composite tensor - # or a composite tensor's Value object. In either case, we can't - # allocate an array to hold the object - we'll handle it later. 
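# The flatten/aggregate/repack pattern OutputsAggregator is built on, shown
# with the public tf.nest API (the class itself uses an internal shallow
# structure walk so composite values are treated as leaves, not recursed
# into).
import numpy as np
import tensorflow as tf

batch_outs = {"main": np.zeros((8, 4)), "aux": np.zeros((8, 1))}
flat = tf.nest.flatten(batch_outs)                           # one leaf per output
aggregated = [np.concatenate([f, f], axis=0) for f in flat]  # two batches
packed = tf.nest.pack_sequence_as(batch_outs, aggregated)
print({k: packed[k].shape for k in sorted(packed)})  # {'aux': (16, 1), 'main': (16, 4)}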
- self.results.append(ConcatAggregator(self.batch_size)) - elif isinstance(batch_element, np.ndarray): - self.results.append( - (ConcatAggregator(self.batch_size) if self.use_steps else - SliceAggregator(self.num_samples, self.batch_size))) - else: - # This is not a ndarray, a CompositeTensor, or a CompositeTensorValue. - # Fail fast rather than trying to concatenate it. - raise RuntimeError('Attempted to aggregate unsupported object {}.' - .format(batch_element)) - - self.results[-1].create(batch_element) - - def aggregate(self, batch_outs, batch_start=None, batch_end=None): - batch_outs = tf.__internal__.nest.flatten_up_to(self._structure, batch_outs) - for batch_element, result in zip(batch_outs, self.results): - result.aggregate(batch_element, batch_start, batch_end) - - def finalize(self): - for result in self.results: - result.finalize() - self.results = [i.results for i in self.results] - self.results = tf.nest.pack_sequence_as(self._structure, self.results) +def check_num_samples(ins, batch_size=None, steps=None, steps_name="steps"): + """Determine the number of samples provided for training and evaluation. + The number of samples is not defined when running with `steps`, + in which case the number of samples is set to `None`. -def get_progbar(model, count_mode, include_metrics=True): - """Get Progbar.""" - if include_metrics: - stateful_metric_names = getattr(model, 'metrics_names', None) - if stateful_metric_names: - stateful_metric_names = stateful_metric_names[1:] # Exclude `loss` - else: - stateful_metric_names = None - return cbks.ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names) - - -def check_num_samples(ins, batch_size=None, steps=None, steps_name='steps'): - """Determine the number of samples provided for training and evaluation. - - The number of samples is not defined when running with `steps`, - in which case the number of samples is set to `None`. - - Args: - ins: List of tensors to be fed to the Keras function. - batch_size: Integer batch size or `None` if not defined. - steps: Total number of steps (batches of samples) before declaring - `_predict_loop` finished. Ignored with the default value of `None`. - steps_name: The public API's parameter name for `steps`. - - Raises: - ValueError: when `steps` is `None` and the attribute `ins.shape` - does not exist. Also raises ValueError when `steps` is not `None` - and `batch_size` is not `None` because they are mutually - exclusive. - - Returns: - When steps is `None`, returns the number of samples to be - processed based on the size of the first dimension of the - first input numpy array. When steps is not `None` and - `batch_size` is `None`, returns `None`. - """ - if steps is not None and batch_size is not None: - raise ValueError('If ' + steps_name + - ' is set, the `batch_size` must be None.') - if check_steps_argument(ins, steps, steps_name): - return None + Args: + ins: List of tensors to be fed to the Keras function. + batch_size: Integer batch size or `None` if not defined. + steps: Total number of steps (batches of samples) before declaring + `_predict_loop` finished. Ignored with the default value of `None`. + steps_name: The public API's parameter name for `steps`. + + Raises: + ValueError: when `steps` is `None` and the attribute `ins.shape` + does not exist. Also raises ValueError when `steps` is not `None` + and `batch_size` is not `None` because they are mutually + exclusive. 
+ + Returns: + When steps is `None`, returns the number of samples to be + processed based on the size of the first dimension of the + first input numpy array. When steps is not `None` and + `batch_size` is `None`, returns `None`. + """ + if steps is not None and batch_size is not None: + raise ValueError( + "If " + steps_name + " is set, the `batch_size` must be None." + ) + if check_steps_argument(ins, steps, steps_name): + return None - if hasattr(ins[0], 'shape'): - return int(ins[0].shape[0]) - return None # Edge case where ins == [static_learning_phase] + if hasattr(ins[0], "shape"): + return int(ins[0].shape[0]) + return None # Edge case where ins == [static_learning_phase] def standardize_single_array(x, expected_shape=None): - """Expand data of shape (x,) to (x, 1), unless len(expected_shape)==1.""" - if x is None: - return None + """Expand data of shape (x,) to (x, 1), unless len(expected_shape)==1.""" + if x is None: + return None - if is_composite_or_composite_value(x): + if is_composite_or_composite_value(x): + return x + + if isinstance(x, int): + raise ValueError( + f"Expected an array data type but received an integer: {x}" + ) + + if ( + x.shape is not None + and len(x.shape) == 1 + and (expected_shape is None or len(expected_shape) != 1) + ): + if tf.is_tensor(x): + x = tf.compat.v1.expand_dims(x, axis=1) + else: + x = np.expand_dims(x, 1) return x - if isinstance(x, int): - raise ValueError( - 'Expected an array data type but received an integer: {}'.format(x)) - if (x.shape is not None and len(x.shape) == 1 and - (expected_shape is None or len(expected_shape) != 1)): - if tf.is_tensor(x): - x = tf.compat.v1.expand_dims(x, axis=1) +def get_composite_shape(tensor): + """Returns the shape of the passed composite tensor.""" + if isinstance(tensor, tf.compat.v1.SparseTensorValue): + # SparseTensorValues use a 'dense_shape' attribute + return tensor.dense_shape else: - x = np.expand_dims(x, 1) - return x + return tensor.shape -def get_composite_shape(tensor): - """Returns the shape of the passed composite tensor.""" - if isinstance(tensor, tf.compat.v1.SparseTensorValue): - # SparseTensorValues use a 'dense_shape' attribute - return tensor.dense_shape - else: - return tensor.shape - - -def standardize_input_data(data, - names, - shapes=None, - check_batch_axis=True, - exception_prefix=''): - """Normalizes inputs and targets provided by users. - - Users may pass data as a list of arrays, dictionary of arrays, - or as a single array. We normalize this to an ordered list of - arrays (same order as `names`), while checking that the provided - arrays have shapes that match the network's expectations. - - Args: - data: User-provided input data (polymorphic). - names: List of expected array names. - shapes: Optional list of expected array shapes. - check_batch_axis: Boolean; whether to check that the batch axis of the - arrays matches the expected value found in `shapes`. - exception_prefix: String prefix used for exception formatting. - - Returns: - List of standardized input arrays (one array per model input). - - Raises: - ValueError: in case of improperly formatted user-provided data. - """ - try: - data_len = len(data) - except TypeError: - # For instance if data is `None` or a symbolic Tensor. 
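# A quick illustration of the expansion rule in standardize_single_array
# above: a rank-1 array becomes a column vector unless the model itself
# expects a rank-1 input.
import numpy as np

x = np.array([1.0, 2.0, 3.0])
print(np.expand_dims(x, 1).shape)  # (3, 1) -- the default expansion
# With an expected_shape of length 1, x would be returned unchanged as (3,).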
- data_len = None - - if not names: - if data_len and not isinstance(data, dict): - raise ValueError( - 'Error when checking model ' + exception_prefix + ': ' - 'expected no data, but got:', data) - return [] - if data is None: - return [None for _ in range(len(names))] - - if isinstance(data, dict): +def standardize_input_data( + data, names, shapes=None, check_batch_axis=True, exception_prefix="" +): + """Normalizes inputs and targets provided by users. + + Users may pass data as a list of arrays, dictionary of arrays, + or as a single array. We normalize this to an ordered list of + arrays (same order as `names`), while checking that the provided + arrays have shapes that match the network's expectations. + + Args: + data: User-provided input data (polymorphic). + names: List of expected array names. + shapes: Optional list of expected array shapes. + check_batch_axis: Boolean; whether to check that the batch axis of the + arrays matches the expected value found in `shapes`. + exception_prefix: String prefix used for exception formatting. + + Returns: + List of standardized input arrays (one array per model input). + + Raises: + ValueError: in case of improperly formatted user-provided data. + """ try: - data = [ - data[x].values - if data[x].__class__.__name__ == 'DataFrame' else data[x] - for x in names - ] - except KeyError as e: - raise ValueError('No data provided for "' + e.args[0] + '". Need data ' - 'for each key in: ' + str(names)) - elif isinstance(data, (list, tuple)): - if isinstance(data[0], (list, tuple)): - data = [np.asarray(d) for d in data] - elif len(names) == 1 and isinstance(data[0], (float, int)): - data = [np.asarray(data)] - else: - data = [ - x.values if x.__class__.__name__ == 'DataFrame' else x for x in data - ] - else: - data = data.values if data.__class__.__name__ == 'DataFrame' else data - data = [data] - - if shapes is not None: - data = [ - standardize_single_array(x, shape) for (x, shape) in zip(data, shapes) - ] - else: - data = [standardize_single_array(x) for x in data] - - if len(data) != len(names): - if data and hasattr(data[0], 'shape'): - raise ValueError('Error when checking model ' + exception_prefix + - ': the list of Numpy arrays that you are passing to ' - 'your model is not the size the model expected. ' - 'Expected to see ' + str(len(names)) + ' array(s), ' + - 'for inputs ' + str(names) + ' but instead got the ' - 'following list of ' + str(len(data)) + ' arrays: ' + - str(data)[:200] + '...') - elif len(names) > 1: - raise ValueError('Error when checking model ' + exception_prefix + - ': you are passing a list as input to your model, ' - 'but the model expects a list of ' + str(len(names)) + - ' Numpy arrays instead. The list you passed was: ' + - str(data)[:200]) - elif len(data) == 1 and not hasattr(data[0], 'shape'): - raise TypeError('Error when checking model ' + exception_prefix + - ': data should be a Numpy array, or list/dict of ' - 'Numpy arrays. Found: ' + str(data)[:200] + '...') - elif len(names) == 1: - data = [np.asarray(data)] - - # Check shapes compatibility. - if shapes: - for i in range(len(names)): - if shapes[i] is not None: - if tf.is_tensor(data[i]): - tensorshape = data[i].shape - if not tensorshape: - continue - data_shape = tuple(tensorshape.as_list()) - elif is_composite_or_composite_value(data[i]): - tensorshape = get_composite_shape(data[i]) - data_shape = tuple(tensorshape.as_list()) + data_len = len(data) + except TypeError: + # For instance if data is `None` or a symbolic Tensor. 
+ data_len = None + + if not names: + if data_len and not isinstance(data, dict): + raise ValueError( + "Error when checking model " + + exception_prefix + + ": expected no data, but got:", + data, + ) + return [] + if data is None: + return [None for _ in range(len(names))] + + if isinstance(data, dict): + try: + data = [ + data[x].values + if data[x].__class__.__name__ == "DataFrame" + else data[x] + for x in names + ] + except KeyError as e: + raise ValueError( + 'No data provided for "' + + e.args[0] + + '". Need data for each key in: ' + + str(names) + ) + elif isinstance(data, (list, tuple)): + if isinstance(data[0], (list, tuple)): + data = [np.asarray(d) for d in data] + elif len(names) == 1 and isinstance(data[0], (float, int)): + data = [np.asarray(data)] else: - data_shape = data[i].shape - - shape = shapes[i] - if len(data_shape) != len(shape): - raise ValueError('Error when checking ' + exception_prefix + - ': expected ' + names[i] + ' to have ' + - str(len(shape)) + ' dimensions, but got array ' - 'with shape ' + str(data_shape)) - if not check_batch_axis: - data_shape = data_shape[1:] - shape = shape[1:] - for dim, ref_dim in zip(data_shape, shape): - if ref_dim != dim and ref_dim is not None and dim is not None: - raise ValueError('Error when checking ' + exception_prefix + - ': expected ' + names[i] + ' to have shape ' + - str(shape) + ' but got array with shape ' + - str(data_shape)) - return data + data = [ + x.values if x.__class__.__name__ == "DataFrame" else x + for x in data + ] + else: + data = data.values if data.__class__.__name__ == "DataFrame" else data + data = [data] + + if shapes is not None: + data = [ + standardize_single_array(x, shape) + for (x, shape) in zip(data, shapes) + ] + else: + data = [standardize_single_array(x) for x in data] + + if len(data) != len(names): + if data and hasattr(data[0], "shape"): + raise ValueError( + "Error when checking model " + + exception_prefix + + ": the list of Numpy arrays that you are passing to " + "your model is not the size the model expected. " + "Expected to see " + + str(len(names)) + + " array(s), " + + "for inputs " + + str(names) + + " but instead got the following list of " + + str(len(data)) + + " arrays: " + + str(data)[:200] + + "..." + ) + elif len(names) > 1: + raise ValueError( + "Error when checking model " + + exception_prefix + + ": you are passing a list as input to your model, " + "but the model expects a list of " + + str(len(names)) + + " Numpy arrays instead. The list you passed was: " + + str(data)[:200] + ) + elif len(data) == 1 and not hasattr(data[0], "shape"): + raise TypeError( + "Error when checking model " + + exception_prefix + + ": data should be a Numpy array, or list/dict of " + "Numpy arrays. Found: " + str(data)[:200] + "..." + ) + elif len(names) == 1: + data = [np.asarray(data)] + + # Check shapes compatibility. 
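# What the dict branch of standardize_input_data above amounts to for a
# two-input model; the input names are hypothetical and the DataFrame
# unwrapping is elided.
import numpy as np

names = ["input_a", "input_b"]                  # model input names (assumed)
data = {"input_b": np.zeros((4, 2)), "input_a": np.ones((4, 3))}
ordered = [data[x] for x in names]              # reordered to match `names`
print([a.shape for a in ordered])               # [(4, 3), (4, 2)]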
+ if shapes: + for i in range(len(names)): + if shapes[i] is not None: + if tf.is_tensor(data[i]): + tensorshape = data[i].shape + if not tensorshape: + continue + data_shape = tuple(tensorshape.as_list()) + elif is_composite_or_composite_value(data[i]): + tensorshape = get_composite_shape(data[i]) + data_shape = tuple(tensorshape.as_list()) + else: + data_shape = data[i].shape + + shape = shapes[i] + if len(data_shape) != len(shape): + raise ValueError( + "Error when checking " + + exception_prefix + + ": expected " + + names[i] + + " to have " + + str(len(shape)) + + " dimensions, but got array with shape " + + str(data_shape) + ) + if not check_batch_axis: + data_shape = data_shape[1:] + shape = shape[1:] + for dim, ref_dim in zip(data_shape, shape): + if ( + ref_dim != dim + and ref_dim is not None + and dim is not None + ): + raise ValueError( + "Error when checking " + + exception_prefix + + ": expected " + + names[i] + + " to have shape " + + str(shape) + + " but got array with shape " + + str(data_shape) + ) + return data def standardize_sample_or_class_weights(x_weight, output_names, weight_type): - """Maps `sample_weight` or `class_weight` to model outputs. - - Args: - x_weight: User-provided `sample_weight` or `class_weight` argument. - output_names: List of output names (strings) in the model. - weight_type: A string used purely for exception printing. - - Returns: - A list of `sample_weight` or `class_weight` where there are exactly - one element per model output. - - Raises: - ValueError: In case of invalid user-provided argument. - """ - if x_weight is None or (isinstance(x_weight, (list, tuple)) and - len(x_weight) == 0): # pylint: disable=g-explicit-length-test - return [None for _ in output_names] - if len(output_names) == 1: - if isinstance(x_weight, (list, tuple)) and len(x_weight) == 1: - return x_weight - if isinstance(x_weight, dict) and output_names[0] in x_weight: - return [x_weight[output_names[0]]] + """Maps `sample_weight` or `class_weight` to model outputs. + + Args: + x_weight: User-provided `sample_weight` or `class_weight` argument. + output_names: List of output names (strings) in the model. + weight_type: A string used purely for exception printing. + + Returns: + A list of `sample_weight` or `class_weight` where there are exactly + one element per model output. + + Raises: + ValueError: In case of invalid user-provided argument. + """ + if x_weight is None or ( + isinstance(x_weight, (list, tuple)) and len(x_weight) == 0 + ): + return [None for _ in output_names] + if len(output_names) == 1: + if isinstance(x_weight, (list, tuple)) and len(x_weight) == 1: + return x_weight + if isinstance(x_weight, dict) and output_names[0] in x_weight: + return [x_weight[output_names[0]]] + else: + return [x_weight] + if isinstance(x_weight, (list, tuple)): + if len(x_weight) != len(output_names): + raise ValueError( + "Provided `" + + weight_type + + "` was a list of " + + str(len(x_weight)) + + " elements, but the model has " + + str(len(output_names)) + + " outputs. You should provide one `" + + weight_type + + "`array per model output." 
+ ) + return x_weight + if isinstance(x_weight, collections.abc.Mapping): + generic_utils.check_for_unexpected_keys( + weight_type, x_weight, output_names + ) + x_weights = [] + for name in output_names: + x_weights.append(x_weight.get(name)) + return x_weights else: - return [x_weight] - if isinstance(x_weight, (list, tuple)): - if len(x_weight) != len(output_names): - raise ValueError('Provided `' + weight_type + '` was a list of ' + - str(len(x_weight)) + ' elements, but the model has ' + - str(len(output_names)) + ' outputs. ' - 'You should provide one `' + weight_type + '`' - 'array per model output.') - return x_weight - if isinstance(x_weight, collections.abc.Mapping): - generic_utils.check_for_unexpected_keys(weight_type, x_weight, output_names) - x_weights = [] - for name in output_names: - x_weights.append(x_weight.get(name)) - return x_weights - else: - raise TypeError('The model has multiple outputs, so `' + weight_type + '` ' - 'should be either a list or a dict. ' - 'Provided `' + weight_type + '` type not understood: ' + - str(x_weight)) + raise TypeError( + "The model has multiple outputs, so `" + + weight_type + + "` should be either a list or a dict. Provided `" + + weight_type + + "` type not understood: " + + str(x_weight) + ) def standardize_class_weights(class_weight, output_names): - return standardize_sample_or_class_weights(class_weight, output_names, - 'class_weight') + return standardize_sample_or_class_weights( + class_weight, output_names, "class_weight" + ) def standardize_sample_weights(sample_weight, output_names): - return standardize_sample_or_class_weights(sample_weight, output_names, - 'sample_weight') + return standardize_sample_or_class_weights( + sample_weight, output_names, "sample_weight" + ) def check_array_lengths(inputs, targets, weights=None): - """Does user input validation for numpy arrays. + """Does user input validation for numpy arrays. - Args: - inputs: list of Numpy arrays of inputs. - targets: list of Numpy arrays of targets. - weights: list of Numpy arrays of sample weights. + Args: + inputs: list of Numpy arrays of inputs. + targets: list of Numpy arrays of targets. + weights: list of Numpy arrays of sample weights. - Raises: - ValueError: in case of incorrectly formatted data. - """ + Raises: + ValueError: in case of incorrectly formatted data. + """ - def is_tensor_or_composite_tensor(x): - return tf.is_tensor(x) or is_composite_or_composite_value(x) + def is_tensor_or_composite_tensor(x): + return tf.is_tensor(x) or is_composite_or_composite_value(x) - def set_of_lengths(x): - # Returns a set with the variation between - # different shapes, with None => 0 - if x is None: - return {} - else: - return set([ - y.shape[0] - for y in x - if y is not None and not is_tensor_or_composite_tensor(y) - ]) - - set_x = set_of_lengths(inputs) - set_y = set_of_lengths(targets) - set_w = set_of_lengths(weights) - if len(set_x) > 1: - raise ValueError('All input arrays (x) should have ' - 'the same number of samples. Got array shapes: ' + - str([x.shape for x in inputs])) - if len(set_y) > 1: - raise ValueError('All target arrays (y) should have ' - 'the same number of samples. Got array shapes: ' + - str([y.shape for y in targets])) - if set_x and set_y and list(set_x)[0] != list(set_y)[0]: - raise ValueError('Input arrays should have ' - 'the same number of samples as target arrays. 
' - 'Found ' + str(list(set_x)[0]) + ' input samples ' - 'and ' + str(list(set_y)[0]) + ' target samples.') - if len(set_w) > 1: - raise ValueError('All sample_weight arrays should have ' - 'the same number of samples. Got array shapes: ' + - str([w.shape for w in weights])) - if set_y and set_w and list(set_y)[0] != list(set_w)[0]: - raise ValueError('Sample_weight arrays should have ' - 'the same number of samples as target arrays. Got ' + - str(list(set_y)[0]) + ' input samples and ' + - str(list(set_w)[0]) + ' target samples.') + def set_of_lengths(x): + # Returns a set with the variation between + # different shapes, with None => 0 + if x is None: + return {} + else: + return set( + [ + y.shape[0] + for y in x + if y is not None and not is_tensor_or_composite_tensor(y) + ] + ) + + set_x = set_of_lengths(inputs) + set_y = set_of_lengths(targets) + set_w = set_of_lengths(weights) + if len(set_x) > 1: + raise ValueError( + "All input arrays (x) should have " + "the same number of samples. Got array shapes: " + + str([x.shape for x in inputs]) + ) + if len(set_y) > 1: + raise ValueError( + "All target arrays (y) should have " + "the same number of samples. Got array shapes: " + + str([y.shape for y in targets]) + ) + if set_x and set_y and list(set_x)[0] != list(set_y)[0]: + raise ValueError( + "Input arrays should have " + "the same number of samples as target arrays. " + "Found " + + str(list(set_x)[0]) + + " input samples and " + + str(list(set_y)[0]) + + " target samples." + ) + if len(set_w) > 1: + raise ValueError( + "All sample_weight arrays should have " + "the same number of samples. Got array shapes: " + + str([w.shape for w in weights]) + ) + if set_y and set_w and list(set_y)[0] != list(set_w)[0]: + raise ValueError( + "Sample_weight arrays should have " + "the same number of samples as target arrays. Got " + + str(list(set_y)[0]) + + " input samples and " + + str(list(set_w)[0]) + + " target samples." + ) def check_loss_and_target_compatibility(targets, loss_fns, output_shapes): - """Does validation on the compatibility of targets and loss functions. - - This helps prevent users from using loss functions incorrectly. This check - is purely for UX purposes. - - Args: - targets: list of Numpy arrays of targets. - loss_fns: list of loss functions. - output_shapes: list of shapes of model outputs. - - Raises: - ValueError: if a loss function or target array - is incompatible with an output. - """ - key_loss_fns = { - losses.mean_squared_error, losses.binary_crossentropy, - losses.categorical_crossentropy - } - key_loss_classes = (losses.MeanSquaredError, losses.BinaryCrossentropy, - losses.CategoricalCrossentropy) - for y, loss, shape in zip(targets, loss_fns, output_shapes): - if y is None or loss is None or tf.is_tensor(y): - continue - if losses.is_categorical_crossentropy(loss): - if y.shape[-1] == 1: - raise ValueError('You are passing a target array of shape ' + - str(y.shape) + - ' while using as loss `categorical_crossentropy`. ' - '`categorical_crossentropy` expects ' - 'targets to be binary matrices (1s and 0s) ' - 'of shape (samples, classes). 
' - 'If your targets are integer classes, ' - 'you can convert them to the expected format via:\n' - '```\n' - 'from keras.utils import to_categorical\n' - 'y_binary = to_categorical(y_int)\n' - '```\n' - '\n' - 'Alternatively, you can use the loss function ' - '`sparse_categorical_crossentropy` instead, ' - 'which does expect integer targets.') - - is_loss_wrapper = isinstance(loss, losses.LossFunctionWrapper) - if (isinstance(loss, key_loss_classes) or (is_loss_wrapper and - (loss.fn in key_loss_fns))): - for target_dim, out_dim in zip(y.shape[1:], shape[1:]): - if out_dim is not None and target_dim != out_dim: - loss_name = loss.name - if loss_name is None: - loss_type = loss.fn if is_loss_wrapper else type(loss) - loss_name = loss_type.__name__ - raise ValueError('A target array with shape ' + str(y.shape) + - ' was passed for an output of shape ' + str(shape) + - ' while using as loss `' + loss_name + '`. ' - 'This loss expects targets to have the same shape ' - 'as the output.') - - -def collect_per_output_metric_info(metrics, - output_names, - output_shapes, - loss_fns, - from_serialized=False, - is_weighted=False): - """Maps metric names and functions to model outputs. - - Args: - metrics: a list or a list of lists or a dict of metric functions. - output_names: a list of the names (strings) of model outputs. - output_shapes: a list of the shapes (strings) of model outputs. - loss_fns: a list of the loss functions corresponding to the model outputs. - from_serialized: whether the model the metrics are being sourced from is - being initialized from a serialized format. - is_weighted: Boolean indicating whether the given metrics are weighted. - - Returns: - A list (one entry per model output) of dicts. - For instance, if the model has 2 outputs, and for the first output - we want to compute "binary_accuracy" and "binary_crossentropy", - and just "binary_accuracy" for the second output, - the list would look like: `[{ - 'acc': binary_accuracy(), - 'ce': binary_crossentropy(), - }, { - 'acc': binary_accuracy(), - }]` - - Raises: - TypeError: if an incorrect type is passed for the `metrics` argument. - """ - if not metrics: - return [{} for _ in output_names] - - if isinstance(metrics, list): - any_sub_list = any(isinstance(m, list) for m in metrics) - if any_sub_list: - if len(metrics) != len(output_names): - raise ValueError('When passing a list of lists as `metrics`, ' - 'it should have one entry per model output. ' - 'The model has ' + str(len(output_names)) + - ' outputs, but you passed metrics=' + str(metrics)) - # User has provided a list of len = len(outputs). - nested_metrics = [generic_utils.to_list(m) for m in metrics] - else: - # If it is a single list we then apply all metrics to all outputs. - if len(output_names) > 1: + """Does validation on the compatibility of targets and loss functions. + + This helps prevent users from using loss functions incorrectly. This check + is purely for UX purposes. + + Args: + targets: list of Numpy arrays of targets. + loss_fns: list of loss functions. + output_shapes: list of shapes of model outputs. + + Raises: + ValueError: if a loss function or target array + is incompatible with an output. 
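# The conversion suggested by the error message above, spelled out: integer
# class targets become one-hot matrices before categorical_crossentropy.
import numpy as np
from keras.utils import to_categorical

y_int = np.array([0, 2, 1])
print(to_categorical(y_int))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]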
+ """ + key_loss_fns = { + losses.mean_squared_error, + losses.binary_crossentropy, + losses.categorical_crossentropy, + } + key_loss_classes = ( + losses.MeanSquaredError, + losses.BinaryCrossentropy, + losses.CategoricalCrossentropy, + ) + for y, loss, shape in zip(targets, loss_fns, output_shapes): + if y is None or loss is None or tf.is_tensor(y): + continue + if losses.is_categorical_crossentropy(loss): + if y.shape[-1] == 1: + raise ValueError( + "You are passing a target array of shape " + + str(y.shape) + + " while using as loss `categorical_crossentropy`. " + "`categorical_crossentropy` expects " + "targets to be binary matrices (1s and 0s) " + "of shape (samples, classes). " + "If your targets are integer classes, " + "you can convert them to the expected format via:\n" + "```\n" + "from keras.utils import to_categorical\n" + "y_binary = to_categorical(y_int)\n" + "```\n" + "\n" + "Alternatively, you can use the loss function " + "`sparse_categorical_crossentropy` instead, " + "which does expect integer targets." + ) + + is_loss_wrapper = isinstance(loss, losses.LossFunctionWrapper) + if isinstance(loss, key_loss_classes) or ( + is_loss_wrapper and (loss.fn in key_loss_fns) + ): + for target_dim, out_dim in zip(y.shape[1:], shape[1:]): + if out_dim is not None and target_dim != out_dim: + loss_name = loss.name + if loss_name is None: + loss_type = loss.fn if is_loss_wrapper else type(loss) + loss_name = loss_type.__name__ + raise ValueError( + "A target array with shape " + + str(y.shape) + + " was passed for an output of shape " + + str(shape) + + " while using as loss `" + + loss_name + + "`. " + "This loss expects targets to have the same shape " + "as the output." + ) + + +def collect_per_output_metric_info( + metrics, + output_names, + output_shapes, + loss_fns, + from_serialized=False, + is_weighted=False, +): + """Maps metric names and functions to model outputs. + + Args: + metrics: a list or a list of lists or a dict of metric functions. + output_names: a list of the names (strings) of model outputs. + output_shapes: a list of the shapes (strings) of model outputs. + loss_fns: a list of the loss functions corresponding to the model + outputs. + from_serialized: whether the model the metrics are being sourced from is + being initialized from a serialized format. + is_weighted: Boolean indicating whether the given metrics are weighted. + + Returns: + A list (one entry per model output) of dicts. + For instance, if the model has 2 outputs, and for the first output + we want to compute "binary_accuracy" and "binary_crossentropy", + and just "binary_accuracy" for the second output, + the list would look like: `[{ + 'acc': binary_accuracy(), + 'ce': binary_crossentropy(), + }, { + 'acc': binary_accuracy(), + }]` + + Raises: + TypeError: if an incorrect type is passed for the `metrics` argument. + """ + if not metrics: + return [{} for _ in output_names] + + if isinstance(metrics, list): + any_sub_list = any(isinstance(m, list) for m in metrics) + if any_sub_list: + if len(metrics) != len(output_names): + raise ValueError( + "When passing a list of lists as `metrics`, " + "it should have one entry per model output. " + "The model has " + + str(len(output_names)) + + " outputs, but you passed metrics=" + + str(metrics) + ) + # User has provided a list of len = len(outputs). + nested_metrics = [generic_utils.to_list(m) for m in metrics] + else: + # If it is a single list we then apply all metrics to all outputs. 
+ if len(output_names) > 1: + nested_metrics = [] + for _ in output_names: + nested_metrics.append( + [metrics_module.clone_metric(m) for m in metrics] + ) + else: + nested_metrics = [metrics] + elif isinstance(metrics, collections.abc.Mapping): + generic_utils.check_for_unexpected_keys( + "metrics", metrics, output_names + ) nested_metrics = [] - for _ in output_names: - nested_metrics.append( - [metrics_module.clone_metric(m) for m in metrics]) - else: - nested_metrics = [metrics] - elif isinstance(metrics, collections.abc.Mapping): - generic_utils.check_for_unexpected_keys('metrics', metrics, output_names) - nested_metrics = [] - for name in output_names: - output_metrics = generic_utils.to_list(metrics.get(name, [])) - nested_metrics.append(output_metrics) - else: - raise TypeError('Type of `metrics` argument not understood. ' - 'Expected a list or dictionary, found: ' + str(metrics)) - - per_output_metrics = [] - for i, metrics in enumerate(nested_metrics): - metrics_dict = collections.OrderedDict() - for metric in metrics: - metric_name = get_metric_name(metric, is_weighted) - metric_fn = get_metric_function( - metric, output_shape=output_shapes[i], loss_fn=loss_fns[i]) - metric_fn._from_serialized = from_serialized # pylint: disable=protected-access - - # If the metric function is not stateful, we create a stateful version. - if not isinstance(metric_fn, metrics_module.Metric): - metric_fn = metrics_module.MeanMetricWrapper( - metric_fn, name=metric_name) - # If the metric is being revived from something stateless, such as a - # string (e.g. "accuracy"), we may need to later reapply transformations - # such as renaming. - metric_fn._from_serialized = False # pylint: disable=protected-access - metrics_dict[metric_name] = metric_fn - per_output_metrics.append(metrics_dict) - - return per_output_metrics + for name in output_names: + output_metrics = generic_utils.to_list(metrics.get(name, [])) + nested_metrics.append(output_metrics) + else: + raise TypeError( + "Type of `metrics` argument not understood. " + "Expected a list or dictionary, found: " + str(metrics) + ) + + per_output_metrics = [] + for i, metrics in enumerate(nested_metrics): + metrics_dict = collections.OrderedDict() + for metric in metrics: + metric_name = get_metric_name(metric, is_weighted) + metric_fn = get_metric_function( + metric, output_shape=output_shapes[i], loss_fn=loss_fns[i] + ) + metric_fn._from_serialized = from_serialized + + # If the metric function is not stateful, we create a stateful + # version. + if not isinstance(metric_fn, metrics_module.Metric): + metric_fn = metrics_module.MeanMetricWrapper( + metric_fn, name=metric_name + ) + # If the metric is being revived from something stateless, such + # as a string (e.g. "accuracy"), we may need to later reapply + # transformations such as renaming. + metric_fn._from_serialized = False + metrics_dict[metric_name] = metric_fn + per_output_metrics.append(metrics_dict) + + return per_output_metrics def batch_shuffle(index_array, batch_size): - """Shuffles an array in a batch-wise fashion. - - Useful for shuffling HDF5 arrays - (where one cannot access arbitrary indices). - - Args: - index_array: array of indices to be shuffled. - batch_size: integer. - - Returns: - The `index_array` array, shuffled in a batch-wise fashion. 
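# The batch-wise shuffle described above, on a toy index array: whole
# batches move as units, order inside a batch is preserved, and the
# leftover tail stays at the end. The seed is illustrative only.
import numpy as np

np.random.seed(0)
index_array = np.arange(7)
batch_size = 2
batch_count = len(index_array) // batch_size           # 3 full batches
last_batch = index_array[batch_count * batch_size:]    # [6], the remainder
body = index_array[: batch_count * batch_size].reshape((batch_count, batch_size))
np.random.shuffle(body)                                # shuffle rows only
print(np.append(body.flatten(), last_batch))           # e.g. [2 3 0 1 4 5 6]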
- """ - batch_count = int(len(index_array) / batch_size) - # to reshape we need to be cleanly divisible by batch size - # we stash extra items and reappend them after shuffling - last_batch = index_array[batch_count * batch_size:] - index_array = index_array[:batch_count * batch_size] - index_array = index_array.reshape((batch_count, batch_size)) - np.random.shuffle(index_array) - index_array = index_array.flatten() - return np.append(index_array, last_batch) - - -def standardize_weights(y, - sample_weight=None, - class_weight=None, - sample_weight_mode=None): - """Performs sample weight validation and standardization. - - Everything gets normalized to a single sample-wise (or timestep-wise) - weight array. If both `sample_weight` and `class_weight` are provided, - the weights are multiplied. - - Args: - y: Numpy array or Tensor of model targets to be weighted. - sample_weight: User-provided `sample_weight` argument. - class_weight: User-provided `class_weight` argument. - sample_weight_mode: One of `None` or `"temporal"`. `"temporal"` indicated - that we expect 2D weight data that will be applied to the last 2 - dimensions of the targets (i.e. we are weighting timesteps, not - samples). - - Returns: - A numpy array of target weights, one entry per sample to weight. - - Raises: - ValueError: In case of invalid user-provided arguments. - """ - # Iterator may return sample_weight as 1-tuple - if isinstance(sample_weight, tuple): - sample_weight = sample_weight[0] - if sample_weight_mode is not None and sample_weight_mode != 'samplewise': - if sample_weight_mode != 'temporal': - raise ValueError('"sample_weight_mode ' - 'should be None or "temporal". ' - 'Found: ' + str(sample_weight_mode)) - if len(y.shape) < 3: - raise ValueError('Found a sample_weight array for ' - 'an input with shape ' + str(y.shape) + '. ' - 'Timestep-wise sample weighting (use of ' - 'sample_weight_mode="temporal") is restricted to ' - 'outputs that are at least 3D, i.e. that have ' - 'a time dimension.') - if sample_weight is not None and len(sample_weight.shape) != 2: - raise ValueError('Found a sample_weight array with shape ' + - str(sample_weight.shape) + '. ' - 'In order to use timestep-wise sample weighting, ' - 'you should pass a 2D sample_weight array.') - else: - if sample_weight is not None and len(sample_weight.shape) != 1: - raise ValueError( - 'Found a sample_weight array with shape {}. In order to ' - 'use timestep-wise sample weights, you should specify ' - 'sample_weight_mode="temporal" in compile(); founssd "{}" ' - 'instead. If you just mean to use sample-wise weights, ' - 'make sure your sample_weight array is 1D.'.format( - sample_weight.shape, sample_weight_mode)) - - if sample_weight is not None: - if len(sample_weight.shape) > len(y.shape): - raise ValueError('Found a sample_weight with shape' + - str(sample_weight.shape) + '.' - 'Expected sample_weight with rank ' - 'less than or equal to ' + str(len(y.shape))) - - if (not tf.is_tensor(sample_weight) and - y.shape[:sample_weight.ndim] != sample_weight.shape): - raise ValueError('Found a sample_weight array with shape ' + - str(sample_weight.shape) + ' for an input with shape ' + - str(y.shape) + '. ' - 'sample_weight cannot be broadcast.') - - # Class weights applied per-sample. - class_sample_weight = None - if isinstance(class_weight, dict): - if len(y.shape) > 2: - raise ValueError('`class_weight` not supported for ' - '3+ dimensional targets.') - - if tf.is_tensor(y): - # Few classes are expected, so densifying is reasonable. 
- keys = np.array(sorted(class_weight.keys())) - values = np.array([class_weight[i] for i in keys]) - weight_vector = np.zeros(np.max(keys) + 1) - weight_vector[:] = np.nan - weight_vector[keys] = values - - y_classes = tf.__internal__.smart_cond.smart_cond( - len(y.shape.as_list()) == 2 and backend.shape(y)[1] > 1, - lambda: backend.argmax(y, axis=1), - lambda: tf.cast(backend.reshape(y, (-1,)), tf.int64)) - class_sample_weight = tf.compat.v1.gather(weight_vector, y_classes) - tf.debugging.check_numerics( - class_sample_weight, - 'Invalid classes or class weights detected. NaN values indicate that ' - 'an appropriate class weight could not be determined.') - class_sample_weight = tf.cast(class_sample_weight, backend.floatx()) - if sample_weight is not None: - sample_weight = tf.cast( - tf.convert_to_tensor(sample_weight), - backend.floatx()) - else: - y_classes = y - if len(y.shape) == 2: - if y.shape[1] > 1: - y_classes = np.argmax(y, axis=1) - elif y.shape[1] == 1: - y_classes = np.reshape(y, y.shape[0]) - - class_sample_weight = np.asarray( - [class_weight[cls] for cls in y_classes if cls in class_weight]) - - if len(class_sample_weight) != len(y_classes): - # subtract the sets to pick all missing classes - existing_classes = set(y_classes) - existing_class_weight = set(class_weight.keys()) - raise ValueError( - '`class_weight` must contain all classes in the data.' - ' The classes %s exist in the data but not in ' - '`class_weight`.' % (existing_classes - existing_class_weight)) + """Shuffles an array in a batch-wise fashion. - if class_sample_weight is not None and sample_weight is not None: - # Multiply weights if both are provided. - return class_sample_weight * sample_weight - if sample_weight is not None: - return sample_weight - if class_sample_weight is not None: - return class_sample_weight - return None + Useful for shuffling HDF5 arrays + (where one cannot access arbitrary indices). + + Args: + index_array: array of indices to be shuffled. + batch_size: integer. + + Returns: + The `index_array` array, shuffled in a batch-wise fashion. + """ + batch_count = int(len(index_array) / batch_size) + # to reshape we need to be cleanly divisible by batch size + # we stash extra items and reappend them after shuffling + last_batch = index_array[batch_count * batch_size :] + index_array = index_array[: batch_count * batch_size] + index_array = index_array.reshape((batch_count, batch_size)) + np.random.shuffle(index_array) + index_array = index_array.flatten() + return np.append(index_array, last_batch) + + +def standardize_weights( + y, sample_weight=None, class_weight=None, sample_weight_mode=None +): + """Performs sample weight validation and standardization. + + Everything gets normalized to a single sample-wise (or timestep-wise) + weight array. If both `sample_weight` and `class_weight` are provided, + the weights are multiplied. + + Args: + y: Numpy array or Tensor of model targets to be weighted. + sample_weight: User-provided `sample_weight` argument. + class_weight: User-provided `class_weight` argument. + sample_weight_mode: One of `None` or `"temporal"`. `"temporal"` + indicated that we expect 2D weight data that will be applied to the + last 2 dimensions of the targets (i.e. we are weighting timesteps, not + samples). + + Returns: + A numpy array of target weights, one entry per sample to weight. + + Raises: + ValueError: In case of invalid user-provided arguments. 
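# The shape contract enforced above for sample_weight_mode="temporal",
# with toy shapes: 3D (or higher) targets take a 2D (samples, timesteps)
# weight array.
import numpy as np

y = np.zeros((4, 10, 2))           # (samples, timesteps, features)
sample_weight = np.ones((4, 10))   # one weight per sample and timestep
assert len(y.shape) >= 3 and sample_weight.ndim == 2
assert y.shape[: sample_weight.ndim] == sample_weight.shape  # broadcastable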
+ """ + # Iterator may return sample_weight as 1-tuple + if isinstance(sample_weight, tuple): + sample_weight = sample_weight[0] + if sample_weight_mode is not None and sample_weight_mode != "samplewise": + if sample_weight_mode != "temporal": + raise ValueError( + '"sample_weight_mode should be None or "temporal". Found: ' + + str(sample_weight_mode) + ) + if len(y.shape) < 3: + raise ValueError( + "Found a sample_weight array for an input with shape " + + str(y.shape) + + ". " + "Timestep-wise sample weighting (use of " + 'sample_weight_mode="temporal") is restricted to ' + "outputs that are at least 3D, i.e. that have " + "a time dimension." + ) + if sample_weight is not None and len(sample_weight.shape) != 2: + raise ValueError( + "Found a sample_weight array with shape " + + str(sample_weight.shape) + + ". " + "In order to use timestep-wise sample weighting, " + "you should pass a 2D sample_weight array." + ) + else: + if sample_weight is not None and len(sample_weight.shape) != 1: + raise ValueError( + "Found a sample_weight array with shape {}. In order to " + "use timestep-wise sample weights, you should specify " + 'sample_weight_mode="temporal" in compile(); founssd "{}" ' + "instead. If you just mean to use sample-wise weights, " + "make sure your sample_weight array is 1D.".format( + sample_weight.shape, sample_weight_mode + ) + ) + + if sample_weight is not None: + if len(sample_weight.shape) > len(y.shape): + raise ValueError( + "Found a sample_weight with shape" + + str(sample_weight.shape) + + ".Expected sample_weight with rank less than or equal to " + + str(len(y.shape)) + ) + + if ( + not tf.is_tensor(sample_weight) + and y.shape[: sample_weight.ndim] != sample_weight.shape + ): + raise ValueError( + "Found a sample_weight array with shape " + + str(sample_weight.shape) + + " for an input with shape " + + str(y.shape) + + ". sample_weight cannot be broadcast." + ) + + # Class weights applied per-sample. + class_sample_weight = None + if isinstance(class_weight, dict): + if len(y.shape) > 2: + raise ValueError( + "`class_weight` not supported for 3+ dimensional targets." + ) + + if tf.is_tensor(y): + # Few classes are expected, so densifying is reasonable. + keys = np.array(sorted(class_weight.keys())) + values = np.array([class_weight[i] for i in keys]) + weight_vector = np.zeros(np.max(keys) + 1) + weight_vector[:] = np.nan + weight_vector[keys] = values + + y_classes = tf.__internal__.smart_cond.smart_cond( + len(y.shape.as_list()) == 2 and backend.shape(y)[1] > 1, + lambda: backend.argmax(y, axis=1), + lambda: tf.cast(backend.reshape(y, (-1,)), tf.int64), + ) + class_sample_weight = tf.compat.v1.gather(weight_vector, y_classes) + tf.debugging.check_numerics( + class_sample_weight, + "Invalid classes or class weights detected. 
NaN values " + "indicate that an appropriate class weight could not be " + "determined.", + ) + class_sample_weight = tf.cast(class_sample_weight, backend.floatx()) + if sample_weight is not None: + sample_weight = tf.cast( + tf.convert_to_tensor(sample_weight), backend.floatx() + ) + else: + y_classes = y + if len(y.shape) == 2: + if y.shape[1] > 1: + y_classes = np.argmax(y, axis=1) + elif y.shape[1] == 1: + y_classes = np.reshape(y, y.shape[0]) + + class_sample_weight = np.asarray( + [class_weight[cls] for cls in y_classes if cls in class_weight] + ) + + if len(class_sample_weight) != len(y_classes): + # subtract the sets to pick all missing classes + existing_classes = set(y_classes) + existing_class_weight = set(class_weight.keys()) + raise ValueError( + "`class_weight` must contain all classes in the data." + " The classes %s exist in the data but not in " + "`class_weight`." + % (existing_classes - existing_class_weight) + ) + + if class_sample_weight is not None and sample_weight is not None: + # Multiply weights if both are provided. + return class_sample_weight * sample_weight + if sample_weight is not None: + return sample_weight + if class_sample_weight is not None: + return class_sample_weight + return None def has_symbolic_tensors(ls): - if tf.executing_eagerly(): - return False - return has_tensors(ls) + if tf.executing_eagerly(): + return False + return has_tensors(ls) def has_tensors(ls): - """Returns true if `ls` contains tensors.""" - # Note: at some point in time ragged tensors didn't count as tensors, so this - # returned false for ragged tensors. Making this return true fails some tests - # which would then require a steps_per_epoch argument. - if isinstance(ls, (list, tuple)): - return any( - tf.is_tensor(v) and - not isinstance(v, tf.RaggedTensor) for v in ls) - if isinstance(ls, dict): - return any( - tf.is_tensor(v) and - not isinstance(v, tf.RaggedTensor) - for _, v in ls.items()) - return tf.is_tensor(ls) and not isinstance( - ls, tf.RaggedTensor) + """Returns true if `ls` contains tensors.""" + # Note: at some point in time ragged tensors didn't count as tensors, so + # this returned false for ragged tensors. Making this return true fails some + # tests which would then require a steps_per_epoch argument. + if isinstance(ls, (list, tuple)): + return any( + tf.is_tensor(v) and not isinstance(v, tf.RaggedTensor) for v in ls + ) + if isinstance(ls, dict): + return any( + tf.is_tensor(v) and not isinstance(v, tf.RaggedTensor) + for _, v in ls.items() + ) + return tf.is_tensor(ls) and not isinstance(ls, tf.RaggedTensor) def get_metric_name(metric, weighted=False): - """Returns the name corresponding to the given metric input. - - Args: - metric: Metric function name or reference. - weighted: Boolean indicating if the given metric is weighted. - - Returns: - The metric name. - """ - if tf.__internal__.tf2.enabled(): - # We keep the string that the user has set in compile as the metric name. - if isinstance(metric, str): - return metric - - metric = metrics_module.get(metric) - return metric.name if hasattr(metric, 'name') else metric.__name__ - else: - metric_name_prefix = 'weighted_' if weighted else '' - if metric in ('accuracy', 'acc', 'crossentropy', 'ce'): - if metric in ('accuracy', 'acc'): - suffix = 'acc' - elif metric in ('crossentropy', 'ce'): - suffix = 'ce' + """Returns the name corresponding to the given metric input. + + Args: + metric: Metric function name or reference. + weighted: Boolean indicating if the given metric is weighted. 
+ + Returns: + The metric name. + """ + if tf.__internal__.tf2.enabled(): + # We keep the string that the user has set in compile as the metric + # name. + if isinstance(metric, str): + return metric + + metric = metrics_module.get(metric) + return metric.name if hasattr(metric, "name") else metric.__name__ else: - metric_fn = metrics_module.get(metric) - # Get metric name as string - if hasattr(metric_fn, 'name'): - suffix = metric_fn.name - else: - suffix = metric_fn.__name__ - metric_name = metric_name_prefix + suffix - return metric_name + metric_name_prefix = "weighted_" if weighted else "" + if metric in ("accuracy", "acc", "crossentropy", "ce"): + if metric in ("accuracy", "acc"): + suffix = "acc" + elif metric in ("crossentropy", "ce"): + suffix = "ce" + else: + metric_fn = metrics_module.get(metric) + # Get metric name as string + if hasattr(metric_fn, "name"): + suffix = metric_fn.name + else: + suffix = metric_fn.__name__ + metric_name = metric_name_prefix + suffix + return metric_name def get_metric_function(metric, output_shape=None, loss_fn=None): - """Returns the metric function corresponding to the given metric input. + """Returns the metric function corresponding to the given metric input. - Args: - metric: Metric function name or reference. - output_shape: The shape of the output that this metric will be calculated - for. - loss_fn: The loss function used. - - Returns: - The metric function. - """ - if metric not in ['accuracy', 'acc', 'crossentropy', 'ce']: - return metrics_module.get(metric) - - is_sparse_categorical_crossentropy = ( - isinstance(loss_fn, losses.SparseCategoricalCrossentropy) or - (isinstance(loss_fn, losses.LossFunctionWrapper) and - loss_fn.fn == losses.sparse_categorical_crossentropy)) - - is_binary_crossentropy = ( - isinstance(loss_fn, losses.BinaryCrossentropy) or - (isinstance(loss_fn, losses.LossFunctionWrapper) and - loss_fn.fn == losses.binary_crossentropy)) - - if metric in ['accuracy', 'acc']: - if output_shape[-1] == 1 or is_binary_crossentropy: - return metrics_module.binary_accuracy - elif is_sparse_categorical_crossentropy: - return metrics_module.sparse_categorical_accuracy - # If the output_shape[-1] is not 1, then we know output is `categorical`. - # We assume it is sparse categorical only if loss is explicitly given - # as sparse categorical crossentropy loss. - return metrics_module.categorical_accuracy - else: - if output_shape[-1] == 1 or is_binary_crossentropy: - return metrics_module.binary_crossentropy - elif is_sparse_categorical_crossentropy: - return metrics_module.sparse_categorical_crossentropy - return metrics_module.categorical_crossentropy - - -def call_metric_function(metric_fn, - y_true, - y_pred=None, - weights=None, - mask=None): - """Invokes metric function and returns the metric result tensor.""" - if mask is not None: - mask = tf.cast(mask, y_pred.dtype) - if weights is None: - # Use mask as sample weight. - weights = mask + Args: + metric: Metric function name or reference. + output_shape: The shape of the output that this metric will be + calculated for. + loss_fn: The loss function used. + + Returns: + The metric function. 
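+
+    Example (an illustrative sketch; the output shape below is made up):
+
+        # With a binary head (last dimension 1), the shorthand 'acc'
+        # resolves to binary accuracy rather than categorical accuracy.
+        fn = get_metric_function('acc', output_shape=(None, 1))
+        # fn is metrics_module.binary_accuracy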
+ """ + if metric not in ["accuracy", "acc", "crossentropy", "ce"]: + return metrics_module.get(metric) + + is_sparse_categorical_crossentropy = isinstance( + loss_fn, losses.SparseCategoricalCrossentropy + ) or ( + isinstance(loss_fn, losses.LossFunctionWrapper) + and loss_fn.fn == losses.sparse_categorical_crossentropy + ) + + is_binary_crossentropy = isinstance(loss_fn, losses.BinaryCrossentropy) or ( + isinstance(loss_fn, losses.LossFunctionWrapper) + and loss_fn.fn == losses.binary_crossentropy + ) + + if metric in ["accuracy", "acc"]: + if output_shape[-1] == 1 or is_binary_crossentropy: + return metrics_module.binary_accuracy + elif is_sparse_categorical_crossentropy: + return metrics_module.sparse_categorical_accuracy + # If the output_shape[-1] is not 1, then we know output is + # `categorical`. We assume it is sparse categorical only if loss is + # explicitly given as sparse categorical crossentropy loss. + return metrics_module.categorical_accuracy else: - # Update dimensions of weights to match with mask. - weights = tf.cast(weights, dtype=y_pred.dtype) - mask, _, weights = losses_utils.squeeze_or_expand_dimensions( - mask, sample_weight=weights) - weights *= mask + if output_shape[-1] == 1 or is_binary_crossentropy: + return metrics_module.binary_crossentropy + elif is_sparse_categorical_crossentropy: + return metrics_module.sparse_categorical_crossentropy + return metrics_module.categorical_crossentropy + + +def call_metric_function( + metric_fn, y_true, y_pred=None, weights=None, mask=None +): + """Invokes metric function and returns the metric result tensor.""" + if mask is not None: + mask = tf.cast(mask, y_pred.dtype) + if weights is None: + # Use mask as sample weight. + weights = mask + else: + # Update dimensions of weights to match with mask. + weights = tf.cast(weights, dtype=y_pred.dtype) + mask, _, weights = losses_utils.squeeze_or_expand_dimensions( + mask, sample_weight=weights + ) + weights *= mask - if y_pred is not None: - return metric_fn(y_true, y_pred, sample_weight=weights) - # `Mean` metric only takes a single value. - return metric_fn(y_true, sample_weight=weights) + if y_pred is not None: + return metric_fn(y_true, y_pred, sample_weight=weights) + # `Mean` metric only takes a single value. + return metric_fn(y_true, sample_weight=weights) def get_loss_function(loss): - """Returns the loss corresponding to the loss input in `compile` API.""" - if loss is None or isinstance(loss, losses.Loss): - return loss + """Returns the loss corresponding to the loss input in `compile` API.""" + if loss is None or isinstance(loss, losses.Loss): + return loss - if tf_inspect.isclass(loss) and issubclass(loss, losses.Loss): - # It is not safe to assume that the loss takes no constructor arguments. - raise ValueError( - 'Received uninstantiated Loss class: {}\nPlease call loss ""classes ' - 'before passing them to Model.compile.'.format(loss)) - - # Deserialize loss configuration, if needed. - if isinstance(loss, collections.abc.Mapping): - loss = losses.get(loss) + if tf_inspect.isclass(loss) and issubclass(loss, losses.Loss): + # It is not safe to assume that the loss takes no constructor arguments. + raise ValueError( + "Received uninstantiated Loss class: {}\n" + "Please call loss classes " + "before passing them to Model.compile.".format(loss) + ) + + # Deserialize loss configuration, if needed. + if isinstance(loss, collections.abc.Mapping): + loss = losses.get(loss) + + # Custom callable class. 
+    if callable(loss) and not hasattr(loss, "__name__"):
+        return loss
+
+    # Wrap loss function with signature `(y_true, y_pred, **kwargs)`
+    # in `LossFunctionWrapper` class.
+    loss_fn = losses.get(loss)
+
+    # For losses which are given as strings/functions in the compile API,
+    # we always set the loss reduction type to be `SUM_OVER_BATCH_SIZE`
+    # (both in distribution strategy context and otherwise).
+    return losses.LossFunctionWrapper(
+        loss_fn,
+        name=loss_fn.__name__,
+        reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+    )
-  # Custom callable class.
-  if callable(loss) and not hasattr(loss, '__name__'):
-    return loss
-
-  # Wrap loss function with signature `(y_true, y_pred, **kwargs)`
-  # in `LossFunctionWrapper` class.
-  loss_fn = losses.get(loss)
-
-  # For losses which are given as strings/functions in the compile API,
-  # we always set the loss reduction type to be `SUM_OVER_BATCH_SIZE`
-  # (both in distribution strategy context and otherwise).
-  return losses.LossFunctionWrapper(
-      loss_fn,
-      name=loss_fn.__name__,
-      reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)
+def validate_dataset_input(x, y, sample_weight, validation_split=None):
+    """Validates user input arguments when a dataset iterator is passed.
+
+    Args:
+        x: Input data. A `tf.data` dataset or iterator.
+        y: Target data. It could be either Numpy array(s) or TensorFlow
+            tensor(s). Expected to be `None` when `x` is a dataset iterator.
+        sample_weight: An optional sample-weight array passed by the user to
+            weight the importance of each sample in `x`. Expected to be
+            `None` when `x` is a dataset iterator.
+        validation_split: Float between 0 and 1. Fraction of the training
+            data to be used as validation data. Expected to be `None` when
+            `x` is a dataset iterator.
+
+    Raises:
+        ValueError: if argument `y` or `sample_weight` or `validation_split`
+            are provided by user.
+    """
+    if y is not None:
+        raise ValueError(
+            "You passed a dataset or dataset iterator (%s) as "
+            "input `x` to your model. In that case, you should "
+            "not specify a target (`y`) argument, since the dataset "
+            "or dataset iterator generates both input data and "
+            "target data. "
+            "Received: %s" % (x, y)
+        )
+    if sample_weight is not None:
+        raise ValueError(
+            "`sample_weight` argument is not supported when input "
+            "`x` is a dataset or a dataset iterator. Instead, you "
+            "can provide sample_weight as the third element of your "
+            "dataset, i.e. (inputs, targets, sample_weight). "
+            "Received: x=%s, sample_weight=%s" % (x, sample_weight)
+        )
+    if validation_split is not None and validation_split != 0.0:
+        raise ValueError(
+            "`validation_split` argument is not supported when "
+            "input `x` is a dataset or a dataset iterator. "
+            "Received: x=%s, validation_split=%f" % (x, validation_split)
+        )
+
+
+def validate_input_types(inp, orig_inp, allow_dict=True, field_name="inputs"):
+    """Helper function to validate either inputs or targets."""
+    if isinstance(inp, (list, tuple)):
+        if not all(isinstance(v, np.ndarray) or tf.is_tensor(v) for v in inp):
+            raise ValueError(
+                "Please provide as model inputs either a single array or a "
+                f"list of arrays. You passed: {field_name}={str(orig_inp)}"
+            )
+    elif isinstance(inp, dict):
+        if not allow_dict:
+            raise ValueError(
+                f"You cannot pass a dictionary as model {field_name}."
+            )
+    elif not isinstance(inp, np.ndarray) and not tf.is_tensor(inp):
+        raise ValueError(
+            "Please provide as model inputs either a single array or a "
+            "list of arrays. You passed: {}={}".format(field_name, orig_inp)
You passed: {}={}".format(field_name, orig_inp) + ) -def validate_dataset_input(x, y, sample_weight, validation_split=None): - """Validates user input arguments when a dataset iterator is passed. - - Args: - x: Input data. A `tf.data` dataset or iterator. - y: Target data. It could be either Numpy array(s) or TensorFlow tensor(s). - Expected to be `None` when `x` is a dataset iterator. - sample_weight: An optional sample-weight array passed by the user to weight - the importance of each sample in `x`. Expected to be `None` when `x` is a - dataset iterator - validation_split: Float between 0 and 1. Fraction of the training data to be - used as validation data. Expected to be `None` when `x` is a dataset - iterator. - - Raises: - ValueError: if argument `y` or `sample_weight` or `validation_split` are - provided by user. - """ - if y is not None: - raise ValueError('You passed a dataset or dataset iterator (%s) as ' - 'input `x` to your model. In that case, you should ' - 'not specify a target (`y`) argument, since the dataset ' - 'or dataset iterator generates both input data and ' - 'target data. ' - 'Received: %s' % (x, y)) - if sample_weight is not None: - raise ValueError('`sample_weight` argument is not supported when input ' - '`x` is a dataset or a dataset iterator. Instead, you' - 'can provide sample_weight as the third element of your' - 'dataset, i.e. (inputs, targets, sample_weight). ' - 'Received: x=%s, sample_weight=%s' % (x, sample_weight)) - if validation_split is not None and validation_split != 0.0: - raise ValueError( - '`validation_split` argument is not supported when ' - 'input `x` is a dataset or a dataset iterator. ' - 'Received: x=%s, validation_split=%f' % (x, validation_split)) - - -def validate_input_types(inp, orig_inp, allow_dict=True, field_name='inputs'): - """Helper function to validate either inputs or targets.""" - if isinstance(inp, (list, tuple)): - if not all(isinstance(v, np.ndarray) or - tf.is_tensor(v) for v in inp): - raise ValueError( - 'Please provide as model inputs either a single array or a list of ' - 'arrays. You passed: {}={}'.format(field_name, str(orig_inp))) - elif isinstance(inp, dict): - if not allow_dict: - raise ValueError( - 'You cannot pass a dictionary as model {}.'.format(field_name)) - elif not isinstance(inp, np.ndarray) and not tf.is_tensor(inp): - raise ValueError( - 'Please provide as model inputs either a single array or a list of ' - 'arrays. You passed: {}={}'.format(field_name, orig_inp)) - - -def check_generator_arguments(y=None, sample_weight=None, - validation_split=None): - """Validates arguments passed when using a generator.""" - if y is not None: - raise ValueError('`y` argument is not supported when data is' - 'a generator or Sequence instance. Instead pass targets' - ' as the second element of the generator.') - if sample_weight is not None: - raise ValueError('`sample_weight` argument is not supported when data is' - 'a generator or Sequence instance. Instead pass sample' - ' weights as the third element of the generator.') - if validation_split: - raise ValueError('If your data is in the form of a Python generator, ' - 'you cannot use `validation_split`.') +def check_generator_arguments( + y=None, sample_weight=None, validation_split=None +): + """Validates arguments passed when using a generator.""" + if y is not None: + raise ValueError( + "`y` argument is not supported when data is" + "a generator or Sequence instance. Instead pass targets" + " as the second element of the generator." 
+        )
+    if sample_weight is not None:
+        raise ValueError(
+            "`sample_weight` argument is not supported when data is "
+            "a generator or Sequence instance. Instead pass sample"
+            " weights as the third element of the generator."
+        )
+    if validation_split:
+        raise ValueError(
+            "If your data is in the form of a Python generator, "
+            "you cannot use `validation_split`."
+        )


def check_steps_argument(input_data, steps, steps_name):
-  """Validates `steps` argument based on input data's type.
-
-  The cases when `steps` value must be provided are when
-    1. input data passed is an iterator.
-    2. model was built on top of symbolic tensors, input data is not
-       required and is `None`.
-    3. input data passed is a symbolic tensor.
-
-  Args:
-    input_data: Input data. Can be Numpy array(s) or TensorFlow tensor(s) or
-      tf.data.Dataset iterator or `None`.
-    steps: Integer or `None`. Total number of steps (batches of samples) to
-      execute.
-    steps_name: The public API's parameter name for `steps`.
-
-  Returns:
-    boolean, True if `steps` argument is required, else False.
-
-  Raises:
-    ValueError: if `steps` argument is required for given input data type
-      but not provided.
-  """
-  is_x_iterator = isinstance(
-      input_data, (tf.compat.v1.data.Iterator, tf.data.Iterator))
-  if (input_data is None or is_x_iterator or has_symbolic_tensors(input_data) or
-      (isinstance(input_data, list) and not input_data)):
-    if steps is None:
-      input_type_str = 'a Dataset iterator' if is_x_iterator else 'data tensors'
-      raise ValueError('When using {input_type} as input to a model, you should'
-                       ' specify the `{steps_name}` argument.'.format(
-                           input_type=input_type_str, steps_name=steps_name))
-    return True
-
-  if isinstance(input_data, (tf.compat.v1.data.Dataset, tf.data.Dataset)):
-    return True
-
-  if steps is not None:
-    list_types = (np.ndarray, list, tuple)
-    if (isinstance(input_data, list_types) or
-        (isinstance(input_data, dict) and
-         any(isinstance(v, list_types) for v in input_data.values()))):
-      logging.warning('When passing input data as arrays, do not specify '
-                      '`steps_per_epoch`/`steps` argument. '
-                      'Please use `batch_size` instead.')
-      return False
+    """Validates `steps` argument based on input data's type.
+
+    The cases when `steps` value must be provided are when
+      1. input data passed is an iterator.
+      2. model was built on top of symbolic tensors, input data is not
+         required and is `None`.
+      3. input data passed is a symbolic tensor.
+
+    Args:
+        input_data: Input data. Can be Numpy array(s) or TensorFlow tensor(s)
+            or tf.data.Dataset iterator or `None`.
+        steps: Integer or `None`. Total number of steps (batches of samples)
+            to execute.
+        steps_name: The public API's parameter name for `steps`.
+
+    Returns:
+        boolean, True if `steps` argument is required, else False.
+
+    Raises:
+        ValueError: if `steps` argument is required for given input data type
+            but not provided.
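+
+    Example (an illustrative sketch):
+
+        # A tf.data.Dataset requires steps handling, so this returns True.
+        ds = tf.data.Dataset.from_tensor_slices(np.zeros((100, 10)))
+        check_steps_argument(ds, steps=None, steps_name='steps')  # True
+        # For plain Numpy input, `steps` is not needed; the result is falsy.
+        check_steps_argument(np.zeros((100, 10)), None, 'steps')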
+ """ + is_x_iterator = isinstance( + input_data, (tf.compat.v1.data.Iterator, tf.data.Iterator) + ) + if ( + input_data is None + or is_x_iterator + or has_symbolic_tensors(input_data) + or (isinstance(input_data, list) and not input_data) + ): + if steps is None: + input_type_str = ( + "a Dataset iterator" if is_x_iterator else "data tensors" + ) + raise ValueError( + "When using {input_type} as input to a model, you should" + " specify the `{steps_name}` argument.".format( + input_type=input_type_str, steps_name=steps_name + ) + ) + return True + + if isinstance(input_data, (tf.compat.v1.data.Dataset, tf.data.Dataset)): + return True + + if steps is not None: + list_types = (np.ndarray, list, tuple) + if isinstance(input_data, list_types) or ( + isinstance(input_data, dict) + and any(isinstance(v, list_types) for v in input_data.values()) + ): + logging.warning( + "When passing input data as arrays, do not specify " + "`steps_per_epoch`/`steps` argument. " + "Please use `batch_size` instead." + ) + return False def cast_single_tensor(x, dtype=None): - if isinstance(x, np.ndarray): - x = tf.convert_to_tensor(x) - dtype = dtype or backend.floatx() - if x.dtype.is_floating: - return tf.cast(x, dtype=dtype) - return x + if isinstance(x, np.ndarray): + x = tf.convert_to_tensor(x) + dtype = dtype or backend.floatx() + if x.dtype.is_floating: + return tf.cast(x, dtype=dtype) + return x def cast_if_floating_dtype_and_mismatch(targets, outputs): - """Returns target data tensors using correct datatype. - - Checks that each target and output pair are the same datatype. If not, casts - the target to the output's datatype. - - Args: - targets: tensor or list of targets. - outputs: tensor or list of outputs. - - Returns: - Targets in appropriate datatype. - """ - if tf.is_tensor(targets): - # There is one target, so output[0] should be the only output. - return cast_single_tensor(targets, dtype=outputs[0].dtype) - new_targets = [] - for target, out in zip(targets, outputs): - if isinstance(target, np.ndarray): - target = tf.convert_to_tensor(target) - if target.dtype != out.dtype: - new_targets.append(cast_single_tensor(target, dtype=out.dtype)) - else: - new_targets.append(target) - return new_targets + """Returns target data tensors using correct datatype. + + Checks that each target and output pair are the same datatype. If not, casts + the target to the output's datatype. + + Args: + targets: tensor or list of targets. + outputs: tensor or list of outputs. + + Returns: + Targets in appropriate datatype. + """ + if tf.is_tensor(targets): + # There is one target, so output[0] should be the only output. + return cast_single_tensor(targets, dtype=outputs[0].dtype) + new_targets = [] + for target, out in zip(targets, outputs): + if isinstance(target, np.ndarray): + target = tf.convert_to_tensor(target) + if target.dtype != out.dtype: + new_targets.append(cast_single_tensor(target, dtype=out.dtype)) + else: + new_targets.append(target) + return new_targets def cast_if_floating_dtype(x, dtype=None): - """Casts the given data tensors to the default floating point type. + """Casts the given data tensors to the default floating point type. - Casts only if the input is already a floating point type. - Args: - x: tensor or list/tuple of tensors. - dtype: The dtype to which Tensors should be cast. + Casts only if the input is already a floating point type. + Args: + x: tensor or list/tuple of tensors. + dtype: The dtype to which Tensors should be cast. - Returns: - Converted input. 
- """ - return tf.nest.map_structure(functools.partial(cast_single_tensor, dtype=dtype), - x) + Returns: + Converted input. + """ + return tf.nest.map_structure( + functools.partial(cast_single_tensor, dtype=dtype), x + ) def cast_to_model_input_dtypes(x, model): - """Casts the given data tensors to the dtypes of the model inputs. + """Casts the given data tensors to the dtypes of the model inputs. - Args: - x: tensor or list/tuple of tensors. - model: The model. + Args: + x: tensor or list/tuple of tensors. + model: The model. - Returns: - Converted input. Each tensor is casted to the corresponding input in - `model.inputs`. - """ - input_dtypes = tf.nest.map_structure(lambda t: t.dtype, model.inputs) - return tf.nest.map_structure(tf.cast, x, input_dtypes) + Returns: + Converted input. Each tensor is casted to the corresponding input in + `model.inputs`. + """ + input_dtypes = tf.nest.map_structure(lambda t: t.dtype, model.inputs) + return tf.nest.map_structure(tf.cast, x, input_dtypes) def prepare_sample_weight_modes(training_endpoints, sample_weight_mode): - """Prepares sample weight modes for the model. - - Args: - training_endpoints: List of model _TrainingEndpoints. - sample_weight_mode: sample weight mode user input passed from compile API. - - Raises: - ValueError: In case of invalid `sample_weight_mode` input. - """ - - if isinstance(sample_weight_mode, collections.abc.Mapping): - generic_utils.check_for_unexpected_keys( - 'sample_weight_mode', sample_weight_mode, - [e.output_name for e in training_endpoints]) - - for end_point in training_endpoints: - if not end_point.should_skip_target_weights(): - if end_point.output_name not in sample_weight_mode: - raise ValueError('Output ' + end_point.output_name + - 'missing from `_sample_weight_modes` dictionary') - else: - end_point.sample_weight_mode = sample_weight_mode.get( - end_point.output_name) - elif isinstance(sample_weight_mode, (list, tuple)): - if len(sample_weight_mode) != len(training_endpoints): - raise ValueError('When passing a list as sample_weight_mode, ' - 'it should have one entry per model output. ' - 'The model has ' + str(len(training_endpoints)) + - ' outputs, but you passed ' + - str(len(sample_weight_mode)) + '_sample_weight_modes.') - for mode, endpoint in zip(sample_weight_mode, training_endpoints): - if not endpoint.should_skip_target_weights(): - endpoint.sample_weight_mode = mode - else: - for endpoint in training_endpoints: - if not endpoint.should_skip_target_weights(): - endpoint.sample_weight_mode = sample_weight_mode + """Prepares sample weight modes for the model. + + Args: + training_endpoints: List of model _TrainingEndpoints. + sample_weight_mode: sample weight mode user input passed from compile API. + + Raises: + ValueError: In case of invalid `sample_weight_mode` input. 
+ """ + + if isinstance(sample_weight_mode, collections.abc.Mapping): + generic_utils.check_for_unexpected_keys( + "sample_weight_mode", + sample_weight_mode, + [e.output_name for e in training_endpoints], + ) + + for end_point in training_endpoints: + if not end_point.should_skip_target_weights(): + if end_point.output_name not in sample_weight_mode: + raise ValueError( + "Output " + + end_point.output_name + + "missing from `_sample_weight_modes` dictionary" + ) + else: + end_point.sample_weight_mode = sample_weight_mode.get( + end_point.output_name + ) + elif isinstance(sample_weight_mode, (list, tuple)): + if len(sample_weight_mode) != len(training_endpoints): + raise ValueError( + "When passing a list as sample_weight_mode, " + "it should have one entry per model output. " + "The model has " + + str(len(training_endpoints)) + + " outputs, but you passed " + + str(len(sample_weight_mode)) + + "_sample_weight_modes." + ) + for mode, endpoint in zip(sample_weight_mode, training_endpoints): + if not endpoint.should_skip_target_weights(): + endpoint.sample_weight_mode = mode + else: + for endpoint in training_endpoints: + if not endpoint.should_skip_target_weights(): + endpoint.sample_weight_mode = sample_weight_mode def prepare_loss_functions(loss, output_names): - """Converts loss to a list of loss functions. - - Args: - loss: String (name of objective function), objective function or - `tf.losses.Loss` instance. See `tf.losses`. If the model has multiple - outputs, you can use a different loss on each output by passing a - dictionary or a list of losses. The loss value that will be minimized by - the model will then be the sum of all individual losses. - output_names: List of model output names. - - Returns: - A list of loss objective functions. - - Raises: - ValueError: If loss is a dict with keys not in model output names, - or if loss is a list with len not equal to model outputs. - """ - if isinstance(loss, collections.abc.Mapping): - generic_utils.check_for_unexpected_keys('loss', loss, output_names) - loss_functions = [] - for name in output_names: - if name not in loss: - logging.warning( - 'Output {0} missing from loss dictionary. We assume ' - 'this was done on purpose. The fit and evaluate APIs will not be ' - 'expecting any data to be passed to {0}.'.format(name)) - loss_functions.append(get_loss_function(loss.get(name, None))) - elif isinstance(loss, str): - loss_functions = [get_loss_function(loss) for _ in output_names] - elif isinstance(loss, collections.abc.Sequence): - if len(loss) != len(output_names): - raise ValueError('When passing a list as loss, it should have one entry ' - 'per model outputs. The model has {} outputs, but you ' - 'passed loss={}'.format(len(output_names), loss)) - loss_functions = tf.nest.map_structure(get_loss_function, loss) - else: - loss_functions = [get_loss_function(loss) for _ in range(len(output_names))] - - return loss_functions + """Converts loss to a list of loss functions. + + Args: + loss: String (name of objective function), objective function or + `tf.keras.losses.Loss` instance. See `tf.keras.losses`. + If the model has multiple + outputs, you can use a different loss on each output by passing a + dictionary or a list of losses. The loss value that will be minimized + by the model will then be the sum of all individual losses. + output_names: List of model output names. + + Returns: + A list of loss objective functions. 
+
+    Raises:
+        ValueError: If loss is a dict with keys not in model output names,
+            or if loss is a list with len not equal to model outputs.
+    """
+    if isinstance(loss, collections.abc.Mapping):
+        generic_utils.check_for_unexpected_keys("loss", loss, output_names)
+        loss_functions = []
+        for name in output_names:
+            if name not in loss:
+                logging.warning(
+                    f"Output {name} missing from loss dictionary. We assume "
+                    "this was done on purpose. The fit and evaluate APIs "
+                    f"will not be expecting any data to be passed to {name}."
+                )
+            loss_functions.append(get_loss_function(loss.get(name, None)))
+    elif isinstance(loss, str):
+        loss_functions = [get_loss_function(loss) for _ in output_names]
+    elif isinstance(loss, collections.abc.Sequence):
+        if len(loss) != len(output_names):
+            raise ValueError(
+                "When passing a list as loss, it should have one entry "
+                "per model outputs. The model has {} outputs, but you "
+                "passed loss={}".format(len(output_names), loss)
+            )
+        loss_functions = tf.nest.map_structure(get_loss_function, loss)
+    else:
+        loss_functions = [
+            get_loss_function(loss) for _ in range(len(output_names))
+        ]
+
+    return loss_functions


def prepare_loss_weights(training_endpoints, loss_weights=None):
-  """Converts loss weights to a list of loss weights.
+    """Converts loss weights to a list of loss weights.

-  The result loss weights will be populated on the training endpoint.
+    The result loss weights will be populated on the training endpoint.

-  Args:
-    training_endpoints: List of model training endpoints.
-    loss_weights: Optional list or dictionary specifying scalar coefficients
-      (Python floats) to weight the loss contributions of different model
-      outputs. The loss value that will be minimized by the model will then be
-      the *weighted sum* of all individual losses, weighted by the
+    Args:
+        training_endpoints: List of model training endpoints.
+        loss_weights: Optional list or dictionary specifying scalar coefficients
+            (Python floats) to weight the loss contributions of different model
+            outputs. The loss value that will be minimized by the model will then
+            be the *weighted sum* of all individual losses, weighted by the
      `loss_weights` coefficients. If a list, it is expected to have a 1:1
-      mapping to the model's outputs. If a dict, it is expected to map
-      output names (strings) to scalar coefficients.
+            mapping to the model's outputs. If a dict, it is expected to map
+            output names (strings) to scalar coefficients.
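+
+    Example (an illustrative sketch; `endpoints` is a hypothetical list of
+    the model's internal training endpoints):
+
+        # A dict maps output names to weights; outputs missing from the
+        # dict default to a loss weight of 1.0.
+        prepare_loss_weights(endpoints, loss_weights={'out_a': 0.5})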
+
+    Raises:
+        ValueError: If loss weight is a dict with key not in model output
+            names, or if loss is a list with len not equal to model outputs.
+    """
+    if loss_weights is None:
+        for e in training_endpoints:
+            e.loss_weight = 1.0
+    elif isinstance(loss_weights, collections.abc.Mapping):
+        generic_utils.check_for_unexpected_keys(
+            "loss_weights",
+            loss_weights,
+            [e.output_name for e in training_endpoints],
+        )
+        for e in training_endpoints:
+            e.loss_weight = loss_weights.get(e.output_name, 1.0)
+    elif isinstance(loss_weights, list):
+        if len(loss_weights) != len(training_endpoints):
+            raise ValueError(
+                "When passing a list as loss_weights, "
+                "it should have one entry per model output. "
+                "The model has "
+                + str(len(training_endpoints))
+                + " outputs, but you passed loss_weights="
+                + str(loss_weights)
+            )
+        for w, e in zip(loss_weights, training_endpoints):
+            e.loss_weight = w
+    else:
+        raise TypeError(
+            "Could not interpret loss_weights argument: "
+            + str(loss_weights)
+            + " - expected a list or a dict."
+        )


# TODO(rohanj): This is a hack to get around not depending on feature_column and
# create a cyclical dependency. Figure out a cleaner solution
def is_feature_layer(layer):
-  """Returns whether `layer` is a FeatureLayer or not."""
-  return getattr(layer, '_is_feature_layer', False)
+    """Returns whether `layer` is a FeatureLayer or not."""
+    return getattr(layer, "_is_feature_layer", False)


def is_eager_dataset_or_iterator(data):
-  return tf.executing_eagerly() and isinstance(
-      data, (tf.compat.v1.data.Dataset, tf.data.Dataset,
-             tf.data.Iterator))
+    return tf.executing_eagerly() and isinstance(
+        data, (tf.compat.v1.data.Dataset, tf.data.Dataset, tf.data.Iterator)
+    )


-# pylint: disable=protected-access
def get_dataset_graph_def(dataset):
-  if tf.executing_eagerly():
-    graph_def_str = dataset._as_serialized_graph().numpy()
-  else:
-    graph_def_str = backend.get_value(dataset._as_serialized_graph())
-  return tf.compat.v1.GraphDef().FromString(graph_def_str)
+    if tf.executing_eagerly():
+        graph_def_str = dataset._as_serialized_graph().numpy()
+    else:
+        graph_def_str = backend.get_value(dataset._as_serialized_graph())
+    return tf.compat.v1.GraphDef().FromString(graph_def_str)


def verify_dataset_shuffled(x):
-  """Verifies that the dataset is shuffled.
-
-  Args:
-    x: Dataset passed as an input to the model.
-
-  Returns:
-    boolean, whether the input dataset is shuffled or not.
-  """
-  assert isinstance(x, tf.data.Dataset)
-  graph_def = get_dataset_graph_def(x)
-  for node in graph_def.node:
-    if node.op.startswith('ShuffleDataset'):
-      return True
-  # Also check graph_def.library.function for ds.interleave or ds.flat_map
-  for function in graph_def.library.function:
-    for node in function.node_def:
-      if node.op.startswith('ShuffleDataset'):
-        return True
-  logging.warning('Expected a shuffled dataset but input dataset `x` is '
-                  'not shuffled. Please invoke `shuffle()` on input dataset.')
-  return False
+    """Verifies that the dataset is shuffled.
+
+    Args:
+        x: Dataset passed as an input to the model.
+
+    Returns:
+        boolean, whether the input dataset is shuffled or not.
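+
+    Example (an illustrative sketch):
+
+        verify_dataset_shuffled(tf.data.Dataset.range(10).shuffle(10))
+        # -> True
+        verify_dataset_shuffled(tf.data.Dataset.range(10))
+        # -> warns and returns False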
+ """ + assert isinstance(x, tf.data.Dataset) + graph_def = get_dataset_graph_def(x) + for node in graph_def.node: + if node.op.startswith("ShuffleDataset"): + return True + # Also check graph_def.library.function for ds.interleave or ds.flat_map + for function in graph_def.library.function: + for node in function.node_def: + if node.op.startswith("ShuffleDataset"): + return True + logging.warning( + "Expected a shuffled dataset but input dataset `x` is " + "not shuffled. Please invoke `shuffle()` on input dataset." + ) + return False def is_dataset_or_iterator(data): - return isinstance(data, (tf.compat.v1.data.Dataset, tf.data.Dataset, - tf.compat.v1.data.Iterator, tf.data.Iterator)) + return isinstance( + data, + ( + tf.compat.v1.data.Dataset, + tf.data.Dataset, + tf.compat.v1.data.Iterator, + tf.data.Iterator, + ), + ) def get_iterator(dataset): - """Create and initialize an iterator from a dataset.""" - if tf.executing_eagerly(): - iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) - else: - iterator = tf.compat.v1.data.make_initializable_iterator(dataset) - initialize_iterator(iterator) - return iterator + """Create and initialize an iterator from a dataset.""" + if tf.executing_eagerly(): + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + else: + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + initialize_iterator(iterator) + return iterator def initialize_iterator(iterator): - if not tf.executing_eagerly(): - init_op = iterator.initializer - backend.get_session((init_op,)).run(init_op) + if not tf.executing_eagerly(): + init_op = iterator.initializer + backend.get_session((init_op,)).run(init_op) def extract_tensors_from_dataset(dataset): - """Extract a tuple of tensors `inputs, targets, sample_weight` from a dataset. + """Extract tuple of tensors `inputs, targets, sample_weight` from a dataset. - Args: - dataset: Dataset instance. + Args: + dataset: Dataset instance. - Returns: - Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None. - """ - iterator = get_iterator(dataset) - inputs, targets, sample_weight = unpack_iterator_input(iterator) - return inputs, targets, sample_weight + Returns: + Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None. + """ + iterator = get_iterator(dataset) + inputs, targets, sample_weight = unpack_iterator_input(iterator) + return inputs, targets, sample_weight def unpack_iterator_input(iterator): - """Convert a dataset iterator to a tuple of tensors `x, y, sample_weights`. - - Args: - iterator: Instance of a dataset iterator. - - Returns: - Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None. - """ - try: - next_element = iterator.get_next() - except tf.errors.OutOfRangeError: - raise RuntimeError('Your dataset iterator ran out of data; ' - 'Make sure that your dataset can generate ' - 'required number of samples.') - - if isinstance(next_element, (list, tuple)): - if len(next_element) not in [2, 3]: - raise ValueError( - 'Please provide model inputs as a list or tuple of 2 or 3 ' - 'elements: (input, target) or (input, target, sample_weights) ' - 'Received %s' % next_element) - if len(next_element) == 2: - x, y = next_element - weights = None - else: - x, y, weights = next_element - else: - x = next_element - y = None - weights = None - return x, y, weights - - -def infer_steps_for_dataset(model, - dataset, - steps, - epochs=1, - steps_name='steps'): - """Infers steps_per_epoch needed to loop through a dataset. - - Args: - model: Keras model instance. 
- dataset: Input data of type tf.data.Dataset. - steps: Number of steps to draw from the dataset (may be None if unknown). - epochs: Number of times to iterate over the dataset. - steps_name: The string name of the steps argument, either `steps`, - `validation_steps`, or `steps_per_epoch`. Only used for error message - formatting. - - Returns: - Integer or `None`. Inferred number of steps to loop through the dataset. - `None` is returned if 1) the size of the dataset is unknown and `steps` was - not specified, or 2) this is multi-worker training and auto sharding is - enabled. - - Raises: - ValueError: In case of invalid argument values. - """ - assert isinstance(dataset, tf.data.Dataset) - if (model._in_multi_worker_mode() and - (dataset.options().experimental_distribute.auto_shard_policy != - tf.data.experimental.AutoShardPolicy.OFF)): - # If the dataset would be auto-sharded, we should not infer a local - # steps_per_epoch due to the possible imbalanced sharding between workers. - return None - - size = backend.get_value(tf.data.experimental.cardinality(dataset)) - if size == tf.data.experimental.INFINITE_CARDINALITY and steps is None: - raise ValueError('When passing an infinitely repeating dataset, you ' - 'must specify the `%s` argument.' % (steps_name,)) - if size >= 0: - if steps is not None and steps * epochs > size: - if epochs > 1: - raise ValueError('The dataset you passed contains %s batches, but you ' - 'passed `epochs=%s` and `%s=%s`, which is a total of ' - '%s steps. We cannot draw that many steps from this ' - 'dataset. We suggest to set `%s=%s`.' % - (size, epochs, steps_name, steps, steps * epochs, - steps_name, size // epochs)) - else: - raise ValueError('The dataset you passed contains %s batches, but you ' - 'passed `%s=%s`. We cannot draw that many steps from ' - 'this dataset. We suggest to set `%s=%s`.' % - (size, steps_name, steps, steps_name, size)) - if steps is None: - if size >= 0: - return size - return None - return steps + """Convert a dataset iterator to a tuple of tensors `x, y, sample_weights`. + Args: + iterator: Instance of a dataset iterator. -class ModelInputs: - """Encapsulates model inputs. + Returns: + Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None. + """ + try: + next_element = iterator.get_next() + except tf.errors.OutOfRangeError: + raise RuntimeError( + "Your dataset iterator ran out of data; " + "Make sure that your dataset can generate " + "required number of samples." + ) + + if isinstance(next_element, (list, tuple)): + if len(next_element) not in [2, 3]: + raise ValueError( + "Please provide model inputs as a list or tuple of 2 or 3 " + "elements: (input, target) or (input, target, sample_weights) " + "Received %s" % next_element + ) + if len(next_element) == 2: + x, y = next_element + weights = None + else: + x, y, weights = next_element + else: + x = next_element + y = None + weights = None + return x, y, weights - Allows for transforming model inputs while keeping the same structure. - """ - def __init__(self, inputs): - self._inputs = inputs - self._is_dict = isinstance(self._inputs, dict) - self._is_single_input = not isinstance(self._inputs, (list, tuple, dict)) +def infer_steps_for_dataset( + model, dataset, steps, epochs=1, steps_name="steps" +): + """Infers steps_per_epoch needed to loop through a dataset. - self._flattened_inputs = [] - self._input_names = [] + Args: + model: Keras model instance. + dataset: Input data of type tf.data.Dataset. 
+ steps: Number of steps to draw from the dataset (may be None if + unknown). + epochs: Number of times to iterate over the dataset. + steps_name: The string name of the steps argument, either `steps`, + `validation_steps`, or `steps_per_epoch`. Only used for error message + formatting. + + Returns: + Integer or `None`. Inferred number of steps to loop through the dataset. + `None` is returned if 1) the size of the dataset is unknown and `steps` + was not specified, or 2) this is multi-worker training and auto sharding + is enabled. + + Raises: + ValueError: In case of invalid argument values. + """ + assert isinstance(dataset, tf.data.Dataset) + if model._in_multi_worker_mode() and ( + dataset.options().experimental_distribute.auto_shard_policy + != tf.data.experimental.AutoShardPolicy.OFF + ): + # If the dataset would be auto-sharded, we should not infer a local + # steps_per_epoch due to the possible imbalanced sharding between + # workers. + return None + + size = backend.get_value(tf.data.experimental.cardinality(dataset)) + if size == tf.data.experimental.INFINITE_CARDINALITY and steps is None: + raise ValueError( + "When passing an infinitely repeating dataset, you " + "must specify the `%s` argument." % (steps_name,) + ) + if size >= 0: + if steps is not None and steps * epochs > size: + if epochs > 1: + raise ValueError( + "The dataset you passed contains %s batches, but you " + "passed `epochs=%s` and `%s=%s`, which is a total of " + "%s steps. We cannot draw that many steps from this " + "dataset. We suggest to set `%s=%s`." + % ( + size, + epochs, + steps_name, + steps, + steps * epochs, + steps_name, + size // epochs, + ) + ) + else: + raise ValueError( + "The dataset you passed contains %s batches, but you " + "passed `%s=%s`. We cannot draw that many steps from " + "this dataset. We suggest to set `%s=%s`." + % (size, steps_name, steps, steps_name, size) + ) + if steps is None: + if size >= 0: + return size + return None + return steps - if self._is_dict: - for k in sorted(self._inputs.keys()): - self._flattened_inputs.append(self._inputs[k]) - self._input_names.append(k) - else: - self._flattened_inputs = tf.nest.flatten(self._inputs) - self._input_names = [ - 'input_%d' % (i + 1) for i in range(len(self._flattened_inputs)) - ] - def get_input_names(self): - """Returns keys to name inputs by. +class ModelInputs: + """Encapsulates model inputs. - In case inputs provided were a list, tuple or single entry, we make up a - key 'input_%d'. For dictionary case, we return a sorted list of keys. + Allows for transforming model inputs while keeping the same structure. """ - return self._input_names - - def get_symbolic_inputs(self, return_single_as_list=False): - """Returns inputs to be set as self.inputs for a model.""" - # TODO(karmel): There is a side-effect here where what you get - # with as_list and as_dict depends on whether you have called this - # method first, since it modifies in place. - for i, (k, v) in enumerate(zip(self._input_names, self._flattened_inputs)): - if isinstance(v, (list, float, int)): - v = np.asarray(v) - if v.ndim == 1: - v = np.expand_dims(v, 1) - - if isinstance(v, np.ndarray): - # We fix the placeholder shape except the batch size. - # This is suboptimal, but it is the best we can do with the info - # we have. The user should call `model._set_inputs(placeholders)` - # to specify custom placeholders if the need arises. 
- shape = (None,) + tuple(v.shape[1:]) - if shape == (None,): - shape = (None, 1) - dtype = tf.as_dtype(v.dtype) - if dtype.is_floating: - dtype = backend.floatx() - v = backend.placeholder(shape=shape, name=k, dtype=dtype) - elif isinstance(v, tf.TensorSpec): - shape = (None,) + tuple(v.shape.as_list()[1:]) - if shape == (None,): - shape = (None, 1) - v = backend.placeholder(shape=shape, name=k, dtype=v.dtype) - - self._flattened_inputs[i] = v - - if self._is_dict: - return dict(zip(self._input_names, self._flattened_inputs)) - if self._is_single_input and not return_single_as_list: - return self._flattened_inputs[0] - return self._flattened_inputs - - def as_dict(self): - """An iterable over a dictionary version of inputs.""" - for k, v in zip(self._input_names, self._flattened_inputs): - yield k, v - - def as_list(self): - """Returning the inputs as a list.""" - return self._flattened_inputs + def __init__(self, inputs): + self._inputs = inputs + self._is_dict = isinstance(self._inputs, dict) + self._is_single_input = not isinstance( + self._inputs, (list, tuple, dict) + ) -# Allow use of methods not exposed to the user. -# pylint: disable=protected-access + self._flattened_inputs = [] + self._input_names = [] + + if self._is_dict: + for k in sorted(self._inputs.keys()): + self._flattened_inputs.append(self._inputs[k]) + self._input_names.append(k) + else: + self._flattened_inputs = tf.nest.flatten(self._inputs) + self._input_names = [ + "input_%d" % (i + 1) for i in range(len(self._flattened_inputs)) + ] + + def get_input_names(self): + """Returns keys to name inputs by. + + In case inputs provided were a list, tuple or single entry, we make up a + key 'input_%d'. For dictionary case, we return a sorted list of keys. + """ + return self._input_names + + def get_symbolic_inputs(self, return_single_as_list=False): + """Returns inputs to be set as self.inputs for a model.""" + # TODO(karmel): There is a side-effect here where what you get + # with as_list and as_dict depends on whether you have called this + # method first, since it modifies in place. + for i, (k, v) in enumerate( + zip(self._input_names, self._flattened_inputs) + ): + if isinstance(v, (list, float, int)): + v = np.asarray(v) + if v.ndim == 1: + v = np.expand_dims(v, 1) + + if isinstance(v, np.ndarray): + # We fix the placeholder shape except the batch size. + # This is suboptimal, but it is the best we can do with the info + # we have. The user should call + # `model._set_inputs(placeholders)` to specify custom + # placeholders if the need arises. + shape = (None,) + tuple(v.shape[1:]) + if shape == (None,): + shape = (None, 1) + dtype = tf.as_dtype(v.dtype) + if dtype.is_floating: + dtype = backend.floatx() + v = backend.placeholder(shape=shape, name=k, dtype=dtype) + elif isinstance(v, tf.TensorSpec): + shape = (None,) + tuple(v.shape.as_list()[1:]) + if shape == (None,): + shape = (None, 1) + v = backend.placeholder(shape=shape, name=k, dtype=v.dtype) + + self._flattened_inputs[i] = v + + if self._is_dict: + return dict(zip(self._input_names, self._flattened_inputs)) + if self._is_single_input and not return_single_as_list: + return self._flattened_inputs[0] + return self._flattened_inputs + + def as_dict(self): + """An iterable over a dictionary version of inputs.""" + for k, v in zip(self._input_names, self._flattened_inputs): + yield k, v + + def as_list(self): + """Returning the inputs as a list.""" + return self._flattened_inputs -# pylint: enable=protected-access +# Allow use of methods not exposed to the user. 
def generic_output_names(outputs_list): - return ['output_%d' % (i + 1) for i in range(len(outputs_list))] + return ["output_%d" % (i + 1) for i in range(len(outputs_list))] def should_run_validation(validation_freq, epoch): - """Checks if validation should be run this epoch. + """Checks if validation should be run this epoch. - Args: - validation_freq: Integer or list. If an integer, specifies how many training - epochs to run before a new validation run is performed. If a list, - specifies the epochs on which to run validation. - epoch: Integer, the number of the training epoch just completed. + Args: + validation_freq: Integer or list. If an integer, specifies how many + training epochs to run before a new validation run is performed. If a + list, specifies the epochs on which to run validation. + epoch: Integer, the number of the training epoch just completed. - Returns: - Bool, True if validation should be run. + Returns: + Bool, True if validation should be run. - Raises: - ValueError: if `validation_freq` is an Integer and less than 1, or if - it is neither an Integer nor a Sequence. - """ - # `epoch` is 0-indexed internally but 1-indexed in the public API. - one_indexed_epoch = epoch + 1 + Raises: + ValueError: if `validation_freq` is an Integer and less than 1, or if + it is neither an Integer nor a Sequence. + """ + # `epoch` is 0-indexed internally but 1-indexed in the public API. + one_indexed_epoch = epoch + 1 - if isinstance(validation_freq, int): - if validation_freq < 1: - raise ValueError('`validation_freq` can not be less than 1.') - return one_indexed_epoch % validation_freq == 0 + if isinstance(validation_freq, int): + if validation_freq < 1: + raise ValueError("`validation_freq` can not be less than 1.") + return one_indexed_epoch % validation_freq == 0 - if not isinstance(validation_freq, collections.abc.Container): - raise ValueError('`validation_freq` must be an Integer or ' - '`collections.abc.Container` (e.g. list, tuple, etc.)') - return one_indexed_epoch in validation_freq + if not isinstance(validation_freq, collections.abc.Container): + raise ValueError( + "`validation_freq` must be an Integer or " + "`collections.abc.Container` (e.g. list, tuple, etc.)" + ) + return one_indexed_epoch in validation_freq def split_training_and_validation_data(x, y, sample_weights, validation_split): - """Split input data into train/eval section based on validation_split.""" - if has_symbolic_tensors(x): - raise ValueError('If your data is in the form of symbolic tensors, ' - 'you cannot use `validation_split`.') - if hasattr(x[0], 'shape'): - split_at = int(x[0].shape[0] * (1. - validation_split)) - else: - split_at = int(len(x[0]) * (1. - validation_split)) - x, val_x = (generic_utils.slice_arrays(x, 0, split_at), - generic_utils.slice_arrays(x, split_at)) - y, val_y = (generic_utils.slice_arrays(y, 0, split_at), - generic_utils.slice_arrays(y, split_at)) - if sample_weights: - sample_weights, val_sample_weights = ( - generic_utils.slice_arrays(sample_weights, 0, split_at), - generic_utils.slice_arrays(sample_weights, split_at), + """Split input data into train/eval section based on validation_split.""" + if has_symbolic_tensors(x): + raise ValueError( + "If your data is in the form of symbolic tensors, " + "you cannot use `validation_split`." 
+    )
+    if hasattr(x[0], "shape"):
+        split_at = int(x[0].shape[0] * (1.0 - validation_split))
+    else:
+        split_at = int(len(x[0]) * (1.0 - validation_split))
+    x, val_x = (
+        generic_utils.slice_arrays(x, 0, split_at),
+        generic_utils.slice_arrays(x, split_at),
+    )
+    y, val_y = (
+        generic_utils.slice_arrays(y, 0, split_at),
+        generic_utils.slice_arrays(y, split_at),
    )
-  else:
-    val_sample_weights = None
-  return x, y, sample_weights, val_x, val_y, val_sample_weights
+    if sample_weights:
+        sample_weights, val_sample_weights = (
+            generic_utils.slice_arrays(sample_weights, 0, split_at),
+            generic_utils.slice_arrays(sample_weights, split_at),
+        )
+    else:
+        val_sample_weights = None
+    return x, y, sample_weights, val_x, val_y, val_sample_weights


def unpack_validation_data(validation_data, raise_if_ambiguous=True):
-  """Unpack validation data based input type.
-
-  The validation data is not touched if its dataset or dataset iterator.
-  For other type of input (Numpy or tensor), it will be unpacked into tuple of
-  3 which is x, y and sample weights.
-
-  Args:
-    validation_data: dataset, dataset iterator, or numpy, tensor tuple.
-    raise_if_ambiguous: boolean on whether to fail if validation_data cannot be
-      parsed. Otherwise simply return validation_data, None, None and defer the
-      decision to the caller.
-
-  Returns:
-    tuple of 3, (x, y, sample_weights) for numpy and tensor input.
-  """
-  if (isinstance(validation_data, (tf.compat.v1.data.Iterator,
-                                   tf.data.Iterator,
-                                   tf.data.Dataset,
-                                   data_utils.Sequence))
-      or not hasattr(validation_data, '__len__')):
-    val_x = validation_data
-    val_y = None
-    val_sample_weight = None
-  elif len(validation_data) == 2:
-    try:
-      val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-      val_sample_weight = None
-    except ValueError:
-      val_x, val_y, val_sample_weight = validation_data, None, None
-  elif len(validation_data) == 3:
-    try:
-      val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-    except ValueError:
-      val_x, val_y, val_sample_weight = validation_data, None, None
-  else:
-    if raise_if_ambiguous:
-      raise ValueError(
-          'When passing a `validation_data` argument, '
-          'it must contain either 2 items (x_val, y_val), '
-          'or 3 items (x_val, y_val, val_sample_weights), '
-          'or alternatively it could be a dataset or a '
-          'dataset or a dataset iterator. '
-          'However we received `validation_data=%s`' % validation_data)
-    val_x, val_y, val_sample_weight = validation_data, None, None
-  return val_x, val_y, val_sample_weight
+    """Unpack validation data based on input type.
+
+    The validation data is not touched if it is a dataset or dataset
+    iterator. For other types of input (Numpy or tensor), it will be
+    unpacked into a tuple of 3: x, y, and sample weights.
+
+    Args:
+        validation_data: dataset, dataset iterator, or numpy, tensor tuple.
+        raise_if_ambiguous: boolean on whether to fail if validation_data
+            cannot be parsed. Otherwise simply return validation_data, None,
+            None and defer the decision to the caller.
+
+    Returns:
+        tuple of 3, (x, y, sample_weights) for numpy and tensor input.
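+
+    Example (an illustrative sketch with made-up arrays):
+
+        x_val, y_val = np.zeros((10, 3)), np.zeros((10,))
+        # A 2-tuple unpacks to (x, y, None).
+        val_x, val_y, val_w = unpack_validation_data((x_val, y_val))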
+ """ + if isinstance( + validation_data, + ( + tf.compat.v1.data.Iterator, + tf.data.Iterator, + tf.data.Dataset, + data_utils.Sequence, + ), + ) or not hasattr(validation_data, "__len__"): + val_x = validation_data + val_y = None + val_sample_weight = None + elif len(validation_data) == 2: + try: + ( + val_x, + val_y, + ) = validation_data + val_sample_weight = None + except ValueError: + val_x, val_y, val_sample_weight = validation_data, None, None + elif len(validation_data) == 3: + try: + ( + val_x, + val_y, + val_sample_weight, + ) = validation_data + except ValueError: + val_x, val_y, val_sample_weight = validation_data, None, None + else: + if raise_if_ambiguous: + raise ValueError( + "When passing a `validation_data` argument, " + "it must contain either 2 items (x_val, y_val), " + "or 3 items (x_val, y_val, val_sample_weights), " + "or alternatively it could be a dataset or a " + "dataset or a dataset iterator. " + "However we received `validation_data=%s`" % validation_data + ) + val_x, val_y, val_sample_weight = validation_data, None, None + return val_x, val_y, val_sample_weight class TrainingLoop: - """TrainingLoop is a wrapper class around the training logic. - - This class is trying to encapsulate the different logic of fit/eval/predict - with regard to different data input and model condition. - - Note that TrainingLoop is stateless, which means it doesn't contain any - internal field and can be reused with different model and inputs. - """ - - def fit(self, - model, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - **kwargs): - """Train the model with the inputs and targets.""" - raise NotImplementedError() - - def evaluate(self, - model, - x=None, - y=None, - batch_size=None, - verbose=1, - sample_weight=None, - steps=None, - callbacks=None, - **kwargs): - """Returns the loss value & metrics values for the model in test mode.""" - raise NotImplementedError() - - def predict(self, - model, - x, - batch_size=None, - verbose=0, - steps=None, - callbacks=None, - **kwargs): - raise NotImplementedError() + """TrainingLoop is a wrapper class around the training logic. + + This class is trying to encapsulate the different logic of fit/eval/predict + with regard to different data input and model condition. + + Note that TrainingLoop is stateless, which means it doesn't contain any + internal field and can be reused with different model and inputs. 
+ """ + + def fit( + self, + model, + x=None, + y=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + **kwargs, + ): + """Train the model with the inputs and targets.""" + raise NotImplementedError() + + def evaluate( + self, + model, + x=None, + y=None, + batch_size=None, + verbose=1, + sample_weight=None, + steps=None, + callbacks=None, + **kwargs, + ): + """Returns the loss value & metrics values for the model in test + mode.""" + raise NotImplementedError() + + def predict( + self, + model, + x, + batch_size=None, + verbose=0, + steps=None, + callbacks=None, + **kwargs, + ): + raise NotImplementedError() diff --git a/keras/engine/training_utils_v1_test.py b/keras/engine/training_utils_v1_test.py index cd7aed6bdc37..d4cfb802765c 100644 --- a/keras/engine/training_utils_v1_test.py +++ b/keras/engine/training_utils_v1_test.py @@ -14,414 +14,492 @@ # ============================================================================== """Tests for training utility functions.""" -import tensorflow.compat.v2 as tf - import functools import multiprocessing.pool import time -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras import backend -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.engine import keras_tensor from keras.engine import training_utils_v1 +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + +# isort: off from tensorflow.python.platform import tf_logging as logging class ModelInputsTest(tf.test.TestCase): - - def test_single_thing(self): - a = np.ones(10) - model_inputs = training_utils_v1.ModelInputs(a) - self.assertEqual(['input_1'], model_inputs.get_input_names()) - vals = model_inputs.get_symbolic_inputs() - self.assertTrue(tf.is_tensor(vals)) - vals = model_inputs.get_symbolic_inputs(return_single_as_list=True) - self.assertEqual(1, len(vals)) - self.assertTrue(tf.is_tensor(vals[0])) - self.assertEqual(backend.floatx(), vals[0].dtype) - - def test_single_thing_eager(self): - if not tf.executing_eagerly(): - self.skipTest('Run in eager mode only.') - a = np.ones(10, dtype=np.int32) - model_inputs = training_utils_v1.ModelInputs(a) - self.assertEqual(['input_1'], model_inputs.get_input_names()) - val = model_inputs.get_symbolic_inputs() - self.assertIsInstance(val, keras_tensor.KerasTensor) - vals = model_inputs.get_symbolic_inputs(return_single_as_list=True) - self.assertEqual(1, len(vals)) - self.assertIsInstance(vals[0], keras_tensor.KerasTensor) - self.assertEqual(tf.int32, vals[0].dtype) - - def test_list(self): - a = [np.ones(10), np.ones(20)] - model_inputs = training_utils_v1.ModelInputs(a) - self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names()) - vals = model_inputs.get_symbolic_inputs() - self.assertTrue(tf.is_tensor(vals[0])) - self.assertTrue(tf.is_tensor(vals[1])) - - def test_list_eager(self): - if not tf.executing_eagerly(): - self.skipTest('Run in eager mode only.') - a = [np.ones(10), np.ones(20)] - model_inputs = training_utils_v1.ModelInputs(a) - self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names()) - vals = model_inputs.get_symbolic_inputs() - self.assertIsInstance(vals[0], keras_tensor.KerasTensor) - 
self.assertIsInstance(vals[1], keras_tensor.KerasTensor) - - def test_dict(self): - a = {'b': np.ones(10), 'a': np.ones(20)} - model_inputs = training_utils_v1.ModelInputs(a) - self.assertEqual(['a', 'b'], model_inputs.get_input_names()) - vals = model_inputs.get_symbolic_inputs() - self.assertTrue(tf.is_tensor(vals['a'])) - self.assertTrue(tf.is_tensor(vals['b'])) - - def test_dict_eager(self): - if not tf.executing_eagerly(): - self.skipTest('Run in eager mode only.') - a = {'b': np.ones(10), 'a': np.ones(20)} - model_inputs = training_utils_v1.ModelInputs(a) - self.assertEqual(['a', 'b'], model_inputs.get_input_names()) - vals = model_inputs.get_symbolic_inputs() - self.assertIsInstance(vals['a'], keras_tensor.KerasTensor) - self.assertIsInstance(vals['b'], keras_tensor.KerasTensor) + def test_single_thing(self): + a = np.ones(10) + model_inputs = training_utils_v1.ModelInputs(a) + self.assertEqual(["input_1"], model_inputs.get_input_names()) + vals = model_inputs.get_symbolic_inputs() + self.assertTrue(tf.is_tensor(vals)) + vals = model_inputs.get_symbolic_inputs(return_single_as_list=True) + self.assertEqual(1, len(vals)) + self.assertTrue(tf.is_tensor(vals[0])) + self.assertEqual(backend.floatx(), vals[0].dtype) + + def test_single_thing_eager(self): + if not tf.executing_eagerly(): + self.skipTest("Run in eager mode only.") + a = np.ones(10, dtype=np.int32) + model_inputs = training_utils_v1.ModelInputs(a) + self.assertEqual(["input_1"], model_inputs.get_input_names()) + val = model_inputs.get_symbolic_inputs() + self.assertIsInstance(val, keras_tensor.KerasTensor) + vals = model_inputs.get_symbolic_inputs(return_single_as_list=True) + self.assertEqual(1, len(vals)) + self.assertIsInstance(vals[0], keras_tensor.KerasTensor) + self.assertEqual(tf.int32, vals[0].dtype) + + def test_list(self): + a = [np.ones(10), np.ones(20)] + model_inputs = training_utils_v1.ModelInputs(a) + self.assertEqual(["input_1", "input_2"], model_inputs.get_input_names()) + vals = model_inputs.get_symbolic_inputs() + self.assertTrue(tf.is_tensor(vals[0])) + self.assertTrue(tf.is_tensor(vals[1])) + + def test_list_eager(self): + if not tf.executing_eagerly(): + self.skipTest("Run in eager mode only.") + a = [np.ones(10), np.ones(20)] + model_inputs = training_utils_v1.ModelInputs(a) + self.assertEqual(["input_1", "input_2"], model_inputs.get_input_names()) + vals = model_inputs.get_symbolic_inputs() + self.assertIsInstance(vals[0], keras_tensor.KerasTensor) + self.assertIsInstance(vals[1], keras_tensor.KerasTensor) + + def test_dict(self): + a = {"b": np.ones(10), "a": np.ones(20)} + model_inputs = training_utils_v1.ModelInputs(a) + self.assertEqual(["a", "b"], model_inputs.get_input_names()) + vals = model_inputs.get_symbolic_inputs() + self.assertTrue(tf.is_tensor(vals["a"])) + self.assertTrue(tf.is_tensor(vals["b"])) + + def test_dict_eager(self): + if not tf.executing_eagerly(): + self.skipTest("Run in eager mode only.") + a = {"b": np.ones(10), "a": np.ones(20)} + model_inputs = training_utils_v1.ModelInputs(a) + self.assertEqual(["a", "b"], model_inputs.get_input_names()) + vals = model_inputs.get_symbolic_inputs() + self.assertIsInstance(vals["a"], keras_tensor.KerasTensor) + self.assertIsInstance(vals["b"], keras_tensor.KerasTensor) class DatasetUtilsTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - # pylint: disable=g-long-lambda - ('Batch', lambda: tf.data.Dataset.range(5).batch(2)), - ('Cache', lambda: tf.data.Dataset.range(5).cache()), - ('Concatenate', lambda: 
tf.data.Dataset.range(5).concatenate( - tf.data.Dataset.range(5))), - ('FlatMap', lambda: tf.data.Dataset.range(5).flat_map( - lambda _: tf.data.Dataset.from_tensors(0))), - ('FlatMap_Shuffle', lambda: tf.data.Dataset.range(5).flat_map( - lambda _: tf.data.Dataset.from_tensors(0).shuffle(1)), True), - ('Filter', lambda: tf.data.Dataset.range(5).filter(lambda _: True)), - ('FixedLengthRecordDatasetV2', - lambda: tf.data.FixedLengthRecordDataset([], 42)), - ('FromTensors', lambda: tf.data.Dataset.from_tensors(0)), - ('FromTensorSlices', - lambda: tf.data.Dataset.from_tensor_slices([0, 0, 0])), - ('Interleave', lambda: tf.data.Dataset.range(5).interleave( - lambda _: tf.data.Dataset.from_tensors(0), cycle_length=1)), - ('Interleave_Shuffle', lambda: tf.data.Dataset.range(5).interleave( - lambda _: tf.data.Dataset.from_tensors(0).shuffle(1), - cycle_length=1), True), - ('Map', lambda: tf.data.Dataset.range(5).map(lambda x: x)), - ('Options', - lambda: tf.data.Dataset.range(5).with_options(tf.data.Options()) - ), - ('PaddedBatch', lambda: tf.data.Dataset.range(5).padded_batch(2, [])), - ('ParallelInterleave', lambda: tf.data.Dataset.range(5).interleave( - lambda _: tf.data.Dataset.from_tensors(0), - cycle_length=1, - num_parallel_calls=1)), - ('ParallelMap', lambda: tf.data.Dataset.range(5).map( - lambda x: x, num_parallel_calls=1)), - ('Prefetch', lambda: tf.data.Dataset.range(5).prefetch(1)), - ('Range', lambda: tf.data.Dataset.range(0)), - ('Repeat', lambda: tf.data.Dataset.range(0).repeat(0)), - ('Shuffle', lambda: tf.data.Dataset.range(5).shuffle(1), True), - ('Skip', lambda: tf.data.Dataset.range(5).skip(2)), - ('Take', lambda: tf.data.Dataset.range(5).take(2)), - ('TextLineDataset', lambda: tf.data.TextLineDataset([])), - ('TFRecordDataset', lambda: tf.data.TFRecordDataset([])), - ('Window', lambda: tf.data.Dataset.range(5).window(2)), - ('Zip', lambda: tf.data.Dataset.zip(tf.data.Dataset.range(5))), - # pylint: enable=g-long-lambda - ) - def test_verify_dataset_shuffled(self, dataset_fn, expect_shuffled=False): - dataset = dataset_fn() - - if not expect_shuffled: - with tf.compat.v1.test.mock.patch.object(logging, 'warning') as mock_log: - shuffled = training_utils_v1.verify_dataset_shuffled(dataset) - self.assertRegex( - str(mock_log.call_args), 'input dataset `x` is not shuffled.') - self.assertFalse(shuffled) - else: - self.assertTrue(training_utils_v1.verify_dataset_shuffled(dataset)) + @parameterized.named_parameters( + ("Batch", lambda: tf.data.Dataset.range(5).batch(2)), + ("Cache", lambda: tf.data.Dataset.range(5).cache()), + ( + "Concatenate", + lambda: tf.data.Dataset.range(5).concatenate( + tf.data.Dataset.range(5) + ), + ), + ( + "FlatMap", + lambda: tf.data.Dataset.range(5).flat_map( + lambda _: tf.data.Dataset.from_tensors(0) + ), + ), + ( + "FlatMap_Shuffle", + lambda: tf.data.Dataset.range(5).flat_map( + lambda _: tf.data.Dataset.from_tensors(0).shuffle(1) + ), + True, + ), + ("Filter", lambda: tf.data.Dataset.range(5).filter(lambda _: True)), + ( + "FixedLengthRecordDatasetV2", + lambda: tf.data.FixedLengthRecordDataset([], 42), + ), + ("FromTensors", lambda: tf.data.Dataset.from_tensors(0)), + ( + "FromTensorSlices", + lambda: tf.data.Dataset.from_tensor_slices([0, 0, 0]), + ), + ( + "Interleave", + lambda: tf.data.Dataset.range(5).interleave( + lambda _: tf.data.Dataset.from_tensors(0), cycle_length=1 + ), + ), + ( + "Interleave_Shuffle", + lambda: tf.data.Dataset.range(5).interleave( + lambda _: tf.data.Dataset.from_tensors(0).shuffle(1), + cycle_length=1, + ), + 
True, + ), + ("Map", lambda: tf.data.Dataset.range(5).map(lambda x: x)), + ( + "Options", + lambda: tf.data.Dataset.range(5).with_options(tf.data.Options()), + ), + ("PaddedBatch", lambda: tf.data.Dataset.range(5).padded_batch(2, [])), + ( + "ParallelInterleave", + lambda: tf.data.Dataset.range(5).interleave( + lambda _: tf.data.Dataset.from_tensors(0), + cycle_length=1, + num_parallel_calls=1, + ), + ), + ( + "ParallelMap", + lambda: tf.data.Dataset.range(5).map( + lambda x: x, num_parallel_calls=1 + ), + ), + ("Prefetch", lambda: tf.data.Dataset.range(5).prefetch(1)), + ("Range", lambda: tf.data.Dataset.range(0)), + ("Repeat", lambda: tf.data.Dataset.range(0).repeat(0)), + ("Shuffle", lambda: tf.data.Dataset.range(5).shuffle(1), True), + ("Skip", lambda: tf.data.Dataset.range(5).skip(2)), + ("Take", lambda: tf.data.Dataset.range(5).take(2)), + ("TextLineDataset", lambda: tf.data.TextLineDataset([])), + ("TFRecordDataset", lambda: tf.data.TFRecordDataset([])), + ("Window", lambda: tf.data.Dataset.range(5).window(2)), + ("Zip", lambda: tf.data.Dataset.zip(tf.data.Dataset.range(5))), + ) + def test_verify_dataset_shuffled(self, dataset_fn, expect_shuffled=False): + dataset = dataset_fn() + + if not expect_shuffled: + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_log: + shuffled = training_utils_v1.verify_dataset_shuffled(dataset) + self.assertRegex( + str(mock_log.call_args), + "input dataset `x` is not shuffled.", + ) + self.assertFalse(shuffled) + else: + self.assertTrue(training_utils_v1.verify_dataset_shuffled(dataset)) class StandardizeWeightsTest(test_combinations.TestCase): - - def test_sample_weights(self): - y = np.array([0, 1, 0, 0, 2]) - sample_weights = np.array([0.5, 1., 1., 0., 2.]) - weights = training_utils_v1.standardize_weights(y, sample_weights) - self.assertAllClose(weights, sample_weights) - - def test_class_weights(self): - y = np.array([0, 1, 0, 0, 2]) - class_weights = {0: 0.5, 1: 1., 2: 1.5} - weights = training_utils_v1.standardize_weights( - y, class_weight=class_weights) - self.assertAllClose(weights, np.array([0.5, 1., 0.5, 0.5, 1.5])) - - def test_sample_weights_and_class_weights(self): - y = np.array([0, 1, 0, 0, 2]) - sample_weights = np.array([0.5, 1., 1., 0., 2.]) - class_weights = {0: 0.5, 1: 1., 2: 1.5} - weights = training_utils_v1.standardize_weights(y, sample_weights, - class_weights) - expected = sample_weights * np.array([0.5, 1., 0.5, 0.5, 1.5]) - self.assertAllClose(weights, expected) - - def test_dataset_with_class_weight(self): - model = test_utils.get_small_functional_mlp(1, 4, input_dim=3) - model.compile('rmsprop', 'mse') - - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - class_weight_np = np.array([0.25, 0.25, 0.25, 0.25]) - class_weight = dict(enumerate(class_weight_np)) - - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=1, - class_weight=class_weight) + def test_sample_weights(self): + y = np.array([0, 1, 0, 0, 2]) + sample_weights = np.array([0.5, 1.0, 1.0, 0.0, 2.0]) + weights = training_utils_v1.standardize_weights(y, sample_weights) + self.assertAllClose(weights, sample_weights) + + def test_class_weights(self): + y = np.array([0, 1, 0, 0, 2]) + class_weights = {0: 0.5, 1: 1.0, 2: 1.5} + weights = training_utils_v1.standardize_weights( + y, class_weight=class_weights + ) + self.assertAllClose(weights, np.array([0.5, 1.0, 0.5, 
0.5, 1.5])) + + def test_sample_weights_and_class_weights(self): + y = np.array([0, 1, 0, 0, 2]) + sample_weights = np.array([0.5, 1.0, 1.0, 0.0, 2.0]) + class_weights = {0: 0.5, 1: 1.0, 2: 1.5} + weights = training_utils_v1.standardize_weights( + y, sample_weights, class_weights + ) + expected = sample_weights * np.array([0.5, 1.0, 0.5, 0.5, 1.5]) + self.assertAllClose(weights, expected) + + def test_dataset_with_class_weight(self): + model = test_utils.get_small_functional_mlp(1, 4, input_dim=3) + model.compile("rmsprop", "mse") + + inputs = np.zeros((10, 3), np.float32) + targets = np.zeros((10, 4), np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + class_weight_np = np.array([0.25, 0.25, 0.25, 0.25]) + class_weight = dict(enumerate(class_weight_np)) + + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=1, + class_weight=class_weight, + ) class MonitoredPool(multiprocessing.pool.ThreadPool): + def __init__(self, *args, **kwargs): + self._apply_counter = 0 + self._func_wrapper = None + super().__init__(*args, **kwargs) - def __init__(self, *args, **kwargs): - self._apply_counter = 0 - self._func_wrapper = None - super().__init__(*args, **kwargs) - - def apply_async(self, func, *args, **kwargs): - self._apply_counter += 1 - if self._func_wrapper: - func = self._func_wrapper(func) # pylint: disable=not-callable - return super().apply_async(func, *args, **kwargs) + def apply_async(self, func, *args, **kwargs): + self._apply_counter += 1 + if self._func_wrapper: + func = self._func_wrapper(func) + return super().apply_async(func, *args, **kwargs) def add_sleep(f): - @functools.wraps(f) - def wrapped(*args, **kwargs): - time.sleep(1.) - return f(*args, **kwargs) - return wrapped + @functools.wraps(f) + def wrapped(*args, **kwargs): + time.sleep(1.0) + return f(*args, **kwargs) + + return wrapped def cause_error(f): - @functools.wraps(f) - def wrapped(batch_element, batch_start, batch_end, is_finished): # pylint: disable=unused-argument - # Induce a TypeError during assignment. - return f(None, None, None, is_finished) - return wrapped + @functools.wraps(f) + def wrapped(batch_element, batch_start, batch_end, is_finished): + # Induce a TypeError during assignment. 
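+ # (`f` is the slice-copy callback submitted to the aggregator's copy + # pool; passing None for the array and slice bounds makes the buffer + # assignment fail, so the test can verify that an error raised on the + # pool is re-raised for the caller.)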
+ return f(None, None, None, is_finished) + return wrapped -_TEST_DATA = np.array(( - (3, 1, 3, 1, 2, 0, 3, 3, 1, 2), - (0, 1, 2, 1, 3, 0, 0, 1, 3, 0), - (3, 2, 1, 1, 1, 1, 1, 3, 2, 3), - (2, 2, 0, 1, 0, 3, 3, 2, 1, 1), - (3, 0, 3, 3, 3, 2, 1, 0, 0, 1), - (1, 0, 3, 3, 3, 2, 1, 2, 3, 1),)) +_TEST_DATA = np.array( + ( + (3, 1, 3, 1, 2, 0, 3, 3, 1, 2), + (0, 1, 2, 1, 3, 0, 0, 1, 3, 0), + (3, 2, 1, 1, 1, 1, 1, 3, 2, 3), + (2, 2, 0, 1, 0, 3, 3, 2, 1, 1), + (3, 0, 3, 3, 3, 2, 1, 0, 0, 1), + (1, 0, 3, 3, 3, 2, 1, 2, 3, 1), + ) +) -class AggregationTest(test_combinations.TestCase): - def setUp(self): - super().setUp() - self._old_pool = training_utils_v1._COPY_POOL - self._old_threshold = ( - training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD) - self._old_timeout = training_utils_v1.SliceAggregator._MAX_COPY_SECONDS - training_utils_v1._COPY_POOL = MonitoredPool( - training_utils_v1._COPY_THREADS) - - def tearDown(self): - super().tearDown() - training_utils_v1._COPY_POOL = self._old_pool - training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = ( - self._old_threshold) - training_utils_v1.SliceAggregator._MAX_COPY_SECONDS = self._old_timeout - - def _run_with_steps(self): - aggregator = training_utils_v1.OutputsAggregator(use_steps=True) - for i, batch in enumerate(np.array_split(_TEST_DATA, 4)): - if i == 0: - aggregator.create(batch) - aggregator.aggregate(batch) - - assert len(aggregator.results) == 1 - assert isinstance(aggregator.results[0], training_utils_v1.ConcatAggregator) - - aggregator.finalize() - return aggregator.results - - def _run_without_steps(self): - aggregator = training_utils_v1.OutputsAggregator( - use_steps=False, num_samples=6) - - batch_start = 0 - for i, batch in enumerate(np.array_split(_TEST_DATA, 4)): - if i == 0: - aggregator.create(batch) - - batch_end = batch_start + batch.shape[0] - aggregator.aggregate(batch, batch_start, batch_end) - batch_start = batch_end - - assert len(aggregator.results) == 1 - assert isinstance(aggregator.results[0], training_utils_v1.SliceAggregator) - - aggregator.finalize() - return aggregator.results - - def test_with_steps(self): - self.assertAllEqual(self._run_with_steps(), _TEST_DATA) - - def test_without_steps(self): - self.assertAllEqual(self._run_without_steps(), _TEST_DATA) - - def test_nested_aggregation(self): - aggregator = training_utils_v1.OutputsAggregator( - use_steps=False, num_samples=6) - - batches = np.array_split(_TEST_DATA, 4) - batch_start = 0 - for i, batch in enumerate(zip(batches, batches)): - if i == 0: - aggregator.create(batch) - - batch_end = batch_start + batch[0].shape[0] - aggregator.aggregate(batch, batch_start, batch_end) - batch_start = batch_end - - assert len(aggregator.results) == 2 - aggregator.finalize() - self.assertAllEqual(aggregator.results, (_TEST_DATA, _TEST_DATA)) - - def test_concat_single_batch(self): - aggregator = training_utils_v1.OutputsAggregator(use_steps=True) - data = _TEST_DATA.copy() - aggregator.create(data) - assert len(aggregator.results) == 1 - assert isinstance(aggregator.results[0], training_utils_v1.ConcatAggregator) - - aggregator.aggregate(data) - aggregator.finalize() - assert aggregator.results is data # No copy. 
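The single-batch assertions here (and in the reformatted copies further down) pin down the aggregators' fast path: with exactly one batch, `finalize()` returns the caller's array itself, with no concatenation and no copy. A rough sketch of that contract in plain numpy (illustrative only; the real `ConcatAggregator`/`SliceAggregator` additionally offload large copies to a thread pool):

import numpy as np

def aggregate(batches, num_samples=None):
    if len(batches) == 1:
        return batches[0]  # single batch: hand back the input array, no copy
    if num_samples is None:
        # steps mode (ConcatAggregator): sizes unknown, concatenate at the end
        return np.concatenate(batches, axis=0)
    # sample mode (SliceAggregator): total size known up front, so write each
    # batch into a preallocated buffer instead of concatenating
    out = np.empty((num_samples,) + batches[0].shape[1:], batches[0].dtype)
    start = 0
    for batch in batches:
        out[start:start + batch.shape[0]] = batch
        start += batch.shape[0]
    return out

data = np.arange(60).reshape(6, 10)
batches = np.array_split(data, 4)
assert (aggregate(batches, num_samples=6) == data).all()
assert aggregate([data]) is data  # the "No copy" fast path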
- - def test_slice_single_batch(self): - aggregator = training_utils_v1.OutputsAggregator( - use_steps=False, num_samples=6) - data = _TEST_DATA.copy() - aggregator.create(data) - assert len(aggregator.results) == 1 - assert isinstance(aggregator.results[0], training_utils_v1.SliceAggregator) - - aggregator.aggregate(data, 0, 6) - aggregator.finalize() - assert aggregator.results is data # No copy. - - def test_async_copy(self): - training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = 15 - self.assertAllEqual(self._run_without_steps(), _TEST_DATA) - - # Two of the four batches will have 20 elements and two will have 10. - self.assertEqual(training_utils_v1._COPY_POOL._apply_counter, 2) - - def test_async_copy_timeout(self): - training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = 15 - training_utils_v1.SliceAggregator._MAX_COPY_SECONDS = 0.1 - training_utils_v1._COPY_POOL._func_wrapper = add_sleep - with self.assertRaisesRegex(ValueError, 'Timed out waiting for copy'): - self._run_without_steps() - - def test_async_copy_reraise(self): - training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = 15 - training_utils_v1.SliceAggregator._MAX_COPY_SECONDS = 1. - training_utils_v1._COPY_POOL._func_wrapper = cause_error - with self.assertRaisesRegex(TypeError, 'NoneType'): - self._run_without_steps() +class AggregationTest(test_combinations.TestCase): + def setUp(self): + super().setUp() + self._old_pool = training_utils_v1._COPY_POOL + self._old_threshold = ( + training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD + ) + self._old_timeout = training_utils_v1.SliceAggregator._MAX_COPY_SECONDS + training_utils_v1._COPY_POOL = MonitoredPool( + training_utils_v1._COPY_THREADS + ) + + def tearDown(self): + super().tearDown() + training_utils_v1._COPY_POOL = self._old_pool + training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = ( + self._old_threshold + ) + training_utils_v1.SliceAggregator._MAX_COPY_SECONDS = self._old_timeout + + def _run_with_steps(self): + aggregator = training_utils_v1.OutputsAggregator(use_steps=True) + for i, batch in enumerate(np.array_split(_TEST_DATA, 4)): + if i == 0: + aggregator.create(batch) + aggregator.aggregate(batch) + + assert len(aggregator.results) == 1 + assert isinstance( + aggregator.results[0], training_utils_v1.ConcatAggregator + ) + + aggregator.finalize() + return aggregator.results + + def _run_without_steps(self): + aggregator = training_utils_v1.OutputsAggregator( + use_steps=False, num_samples=6 + ) + + batch_start = 0 + for i, batch in enumerate(np.array_split(_TEST_DATA, 4)): + if i == 0: + aggregator.create(batch) + + batch_end = batch_start + batch.shape[0] + aggregator.aggregate(batch, batch_start, batch_end) + batch_start = batch_end + + assert len(aggregator.results) == 1 + assert isinstance( + aggregator.results[0], training_utils_v1.SliceAggregator + ) + + aggregator.finalize() + return aggregator.results + + def test_with_steps(self): + self.assertAllEqual(self._run_with_steps(), _TEST_DATA) + + def test_without_steps(self): + self.assertAllEqual(self._run_without_steps(), _TEST_DATA) + + def test_nested_aggregation(self): + aggregator = training_utils_v1.OutputsAggregator( + use_steps=False, num_samples=6 + ) + + batches = np.array_split(_TEST_DATA, 4) + batch_start = 0 + for i, batch in enumerate(zip(batches, batches)): + if i == 0: + aggregator.create(batch) + + batch_end = batch_start + batch[0].shape[0] + aggregator.aggregate(batch, batch_start, batch_end) + batch_start = batch_end + + assert len(aggregator.results) == 2 + 
aggregator.finalize() + self.assertAllEqual(aggregator.results, (_TEST_DATA, _TEST_DATA)) + + def test_concat_single_batch(self): + aggregator = training_utils_v1.OutputsAggregator(use_steps=True) + data = _TEST_DATA.copy() + aggregator.create(data) + assert len(aggregator.results) == 1 + assert isinstance( + aggregator.results[0], training_utils_v1.ConcatAggregator + ) + + aggregator.aggregate(data) + aggregator.finalize() + assert aggregator.results is data # No copy. + + def test_slice_single_batch(self): + aggregator = training_utils_v1.OutputsAggregator( + use_steps=False, num_samples=6 + ) + data = _TEST_DATA.copy() + aggregator.create(data) + assert len(aggregator.results) == 1 + assert isinstance( + aggregator.results[0], training_utils_v1.SliceAggregator + ) + + aggregator.aggregate(data, 0, 6) + aggregator.finalize() + assert aggregator.results is data # No copy. + + def test_async_copy(self): + training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = 15 + self.assertAllEqual(self._run_without_steps(), _TEST_DATA) + + # Two of the four batches will have 20 elements and two will have 10. + self.assertEqual(training_utils_v1._COPY_POOL._apply_counter, 2) + + def test_async_copy_timeout(self): + training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = 15 + training_utils_v1.SliceAggregator._MAX_COPY_SECONDS = 0.1 + training_utils_v1._COPY_POOL._func_wrapper = add_sleep + with self.assertRaisesRegex(ValueError, "Timed out waiting for copy"): + self._run_without_steps() + + def test_async_copy_reraise(self): + training_utils_v1.SliceAggregator._BINARY_SIZE_THRESHOLD = 15 + training_utils_v1.SliceAggregator._MAX_COPY_SECONDS = 1.0 + training_utils_v1._COPY_POOL._func_wrapper = cause_error + with self.assertRaisesRegex(TypeError, "NoneType"): + self._run_without_steps() class CompositeTensorTestUtils(test_combinations.TestCase): - - def test_is_composite(self): - # Validate that all composite tensor and value types return true. - self.assertTrue( - training_utils_v1.is_composite_or_composite_value( - tf.SparseTensor([[0, 0]], [1], [1, 1]))) - self.assertTrue( - training_utils_v1.is_composite_or_composite_value( - tf.compat.v1.SparseTensorValue([[0, 0]], [1], [1, 1]))) - self.assertTrue( - training_utils_v1.is_composite_or_composite_value( - tf.RaggedTensor.from_row_splits( - np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64)))) - self.assertTrue( - training_utils_v1.is_composite_or_composite_value( - tf.compat.v1.ragged.RaggedTensorValue( - np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64)))) - - # Test that numpy arrays and tensors return false. 
- self.assertFalse( - training_utils_v1.is_composite_or_composite_value(np.ndarray([0, 1]))) - self.assertFalse( - training_utils_v1.is_composite_or_composite_value( - tf.convert_to_tensor([3, 1]))) - - def test_sparse_concatenation(self): - tensor_1 = tf.SparseTensor([[0, 0]], [1], [1, 1]) - tensor_2 = tf.SparseTensor([[0, 0]], [2], [1, 1]) - concatenated_tensor = training_utils_v1._append_composite_tensor( - tensor_1, tensor_2) - evaluated_tensor = self.evaluate(concatenated_tensor) - self.assertAllEqual(evaluated_tensor.indices, [[0, 0], [1, 0]]) - self.assertAllEqual(evaluated_tensor.values, [1, 2]) - self.assertAllEqual(evaluated_tensor.dense_shape, [2, 1]) - - def test_sparse_value_concatenation(self): - tensor_1 = tf.compat.v1.SparseTensorValue([[0, 0]], [1], [1, 1]) - tensor_2 = tf.compat.v1.SparseTensorValue([[0, 0]], [2], [1, 1]) - concatenated_tensor = training_utils_v1._append_composite_tensor( - tensor_1, tensor_2) - self.assertAllEqual(concatenated_tensor.indices, [[0, 0], [1, 0]]) - self.assertAllEqual(concatenated_tensor.values, [1, 2]) - self.assertAllEqual(concatenated_tensor.dense_shape, [2, 1]) - - def test_ragged_concatenation(self): - tensor_1 = tf.RaggedTensor.from_row_splits( - np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64)) - tensor_2 = tf.RaggedTensor.from_row_splits( - np.array([3, 4, 5]), np.array([0, 2, 3], dtype=np.int64)) - concatenated_tensor = training_utils_v1._append_composite_tensor( - tensor_1, tensor_2) - evaluated_tensor = self.evaluate(concatenated_tensor) - - self.assertAllEqual(evaluated_tensor.values, [0, 1, 2, 3, 4, 5]) - self.assertAllEqual(evaluated_tensor.row_splits, [0, 1, 3, 5, 6]) - - def test_ragged_value_concatenation(self): - tensor_1 = tf.compat.v1.ragged.RaggedTensorValue( - np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64)) - tensor_2 = tf.compat.v1.ragged.RaggedTensorValue( - np.array([3, 4, 5]), np.array([0, 2, 3], dtype=np.int64)) - concatenated_tensor = training_utils_v1._append_composite_tensor( - tensor_1, tensor_2) - - self.assertAllEqual(concatenated_tensor.values, [0, 1, 2, 3, 4, 5]) - self.assertAllEqual(concatenated_tensor.row_splits, [0, 1, 3, 5, 6]) - - -if __name__ == '__main__': - tf.test.main() + def test_is_composite(self): + # Validate that all composite tensor and value types return true. + self.assertTrue( + training_utils_v1.is_composite_or_composite_value( + tf.SparseTensor([[0, 0]], [1], [1, 1]) + ) + ) + self.assertTrue( + training_utils_v1.is_composite_or_composite_value( + tf.compat.v1.SparseTensorValue([[0, 0]], [1], [1, 1]) + ) + ) + self.assertTrue( + training_utils_v1.is_composite_or_composite_value( + tf.RaggedTensor.from_row_splits( + np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64) + ) + ) + ) + self.assertTrue( + training_utils_v1.is_composite_or_composite_value( + tf.compat.v1.ragged.RaggedTensorValue( + np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64) + ) + ) + ) + + # Test that numpy arrays and tensors return false. 
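+ # ("Composite" here means backed by several dense tensors, e.g. a + # SparseTensor's indices/values/dense_shape triple; a plain ndarray or + # a dense Tensor is a single value and so is not composite.)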
+ self.assertFalse( + training_utils_v1.is_composite_or_composite_value( + np.ndarray([0, 1]) + ) + ) + self.assertFalse( + training_utils_v1.is_composite_or_composite_value( + tf.convert_to_tensor([3, 1]) + ) + ) + + def test_sparse_concatenation(self): + tensor_1 = tf.SparseTensor([[0, 0]], [1], [1, 1]) + tensor_2 = tf.SparseTensor([[0, 0]], [2], [1, 1]) + concatenated_tensor = training_utils_v1._append_composite_tensor( + tensor_1, tensor_2 + ) + evaluated_tensor = self.evaluate(concatenated_tensor) + self.assertAllEqual(evaluated_tensor.indices, [[0, 0], [1, 0]]) + self.assertAllEqual(evaluated_tensor.values, [1, 2]) + self.assertAllEqual(evaluated_tensor.dense_shape, [2, 1]) + + def test_sparse_value_concatenation(self): + tensor_1 = tf.compat.v1.SparseTensorValue([[0, 0]], [1], [1, 1]) + tensor_2 = tf.compat.v1.SparseTensorValue([[0, 0]], [2], [1, 1]) + concatenated_tensor = training_utils_v1._append_composite_tensor( + tensor_1, tensor_2 + ) + self.assertAllEqual(concatenated_tensor.indices, [[0, 0], [1, 0]]) + self.assertAllEqual(concatenated_tensor.values, [1, 2]) + self.assertAllEqual(concatenated_tensor.dense_shape, [2, 1]) + + def test_ragged_concatenation(self): + tensor_1 = tf.RaggedTensor.from_row_splits( + np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64) + ) + tensor_2 = tf.RaggedTensor.from_row_splits( + np.array([3, 4, 5]), np.array([0, 2, 3], dtype=np.int64) + ) + concatenated_tensor = training_utils_v1._append_composite_tensor( + tensor_1, tensor_2 + ) + evaluated_tensor = self.evaluate(concatenated_tensor) + + self.assertAllEqual(evaluated_tensor.values, [0, 1, 2, 3, 4, 5]) + self.assertAllEqual(evaluated_tensor.row_splits, [0, 1, 3, 5, 6]) + + def test_ragged_value_concatenation(self): + tensor_1 = tf.compat.v1.ragged.RaggedTensorValue( + np.array([0, 1, 2]), np.array([0, 1, 3], dtype=np.int64) + ) + tensor_2 = tf.compat.v1.ragged.RaggedTensorValue( + np.array([3, 4, 5]), np.array([0, 2, 3], dtype=np.int64) + ) + concatenated_tensor = training_utils_v1._append_composite_tensor( + tensor_1, tensor_2 + ) + + self.assertAllEqual(concatenated_tensor.values, [0, 1, 2, 3, 4, 5]) + self.assertAllEqual(concatenated_tensor.row_splits, [0, 1, 3, 5, 6]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/engine/training_v1.py b/keras/engine/training_v1.py index 371feb42b0ed..3324e1c2b707 100644 --- a/keras/engine/training_v1.py +++ b/keras/engine/training_v1.py @@ -13,17 +13,15 @@ # limitations under the License. 
# ============================================================================== """V1 Training-related part of the Keras engine.""" -# pylint: disable=g-classes-have-attributes -import tensorflow.compat.v2 as tf - import collections import warnings import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import losses from keras import metrics as metrics_module -from keras.optimizers import optimizer_v1 from keras import optimizers from keras.distribute import distributed_training_utils from keras.distribute import distributed_training_utils_v1 @@ -36,3160 +34,3599 @@ from keras.engine import training_utils from keras.engine import training_utils_v1 from keras.mixed_precision import loss_scale_optimizer -from keras.optimizers.optimizer_v2 import optimizer_v2 -from keras.saving import saving_utils -from keras.saving.saved_model import model_serialization +from keras.optimizers import optimizer_v1 +from keras.optimizers.legacy import optimizer_v2 +from keras.saving.legacy import saving_utils +from keras.saving.legacy.saved_model import model_serialization from keras.utils import data_utils from keras.utils import layer_utils from keras.utils import losses_utils from keras.utils import tf_inspect from keras.utils import tf_utils from keras.utils.mode_keys import ModeKeys + +# isort: off from tensorflow.python.platform import tf_logging as logging try: - from scipy.sparse import issparse # pylint: disable=g-import-not-at-top + from scipy.sparse import issparse except ImportError: - issparse = None + issparse = None class Model(training_lib.Model): - """`Model` groups layers into an object with training and inference features. - - There are two ways to instantiate a `Model`: - - 1 - With the "functional API", where you start from `Input`, - you chain layer calls to specify the model's forward pass, - and finally you create your model from inputs and outputs: + """A model groups layers into an object with training & inference features. - ```python - import tensorflow as tf + There are two ways to instantiate a `Model`: - inputs = tf.keras.Input(shape=(3,)) - x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs) - outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) - model = tf.keras.Model(inputs=inputs, outputs=outputs) - ``` + 1 - With the "functional API", where you start from `Input`, + you chain layer calls to specify the model's forward pass, + and finally you create your model from inputs and outputs: - 2 - By subclassing the `Model` class: in that case, you should define your - layers in `__init__` and you should implement the model's forward pass - in `call`. + ```python + import tensorflow as tf - ```python - import tensorflow as tf + inputs = tf.keras.Input(shape=(3,)) + x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs) + outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) + model = tf.keras.Model(inputs=inputs, outputs=outputs) + ``` - class MyModel(tf.keras.Model): + 2 - By subclassing the `Model` class: in that case, you should define your + layers in `__init__` and you should implement the model's forward pass + in `call`. 
- def __init__(self): - super().__init__() - self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) - self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) + ```python + import tensorflow as tf - def call(self, inputs): - x = self.dense1(inputs) - return self.dense2(x) + class MyModel(tf.keras.Model): - model = MyModel() - ``` + def __init__(self): + super().__init__() + self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) - If you subclass `Model`, you can optionally have - a `training` argument (boolean) in `call`, which you can use to specify - a different behavior in training and inference: + def call(self, inputs): + x = self.dense1(inputs) + return self.dense2(x) - ```python - import tensorflow as tf + model = MyModel() + ``` - class MyModel(tf.keras.Model): + If you subclass `Model`, you can optionally have + a `training` argument (boolean) in `call`, which you can use to specify + a different behavior in training and inference: - def __init__(self): - super().__init__() - self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) - self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) - self.dropout = tf.keras.layers.Dropout(0.5) + ```python + import tensorflow as tf - def call(self, inputs, training=False): - x = self.dense1(inputs) - if training: - x = self.dropout(x, training=training) - return self.dense2(x) + class MyModel(tf.keras.Model): - model = MyModel() - ``` - """ + def __init__(self): + super().__init__() + self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) + self.dropout = tf.keras.layers.Dropout(0.5) - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # initializing _distribution_strategy here since it is possible to call - # predict on a model without compiling it. - self._distribution_strategy = None - self._compile_time_distribution_strategy = None - if (tf.compat.v1.executing_eagerly_outside_functions() and - tf.distribute.has_strategy()): - self._set_strategy( - tf.distribute.get_strategy()) + def call(self, inputs, training=False): + x = self.dense1(inputs) + if training: + x = self.dropout(x, training=training) + return self.dense2(x) - # This flag is used to track if the user is using the deprecated path of - # passing distribution strategy to compile rather than creating the model - # under distribution strategy scope. - self._compile_distribution = False - - self._run_eagerly = None - self._experimental_run_tf_function = ( - tf.compat.v1.executing_eagerly_outside_functions()) - - self._v1_compile_was_called = False - - def _init_batch_counters(self): - pass # Batch counters should not be created in legacy graph mode. - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _set_strategy(self, strategy): - self._compile_time_distribution_strategy = strategy - - def get_weights(self): - """Retrieves the weights of the model. - - Returns: - A flat list of Numpy arrays. + model = MyModel() + ``` """ - strategy = (self._distribution_strategy or - self._compile_time_distribution_strategy) - if strategy: - with strategy.scope(): - return base_layer.Layer.get_weights(self) - return base_layer.Layer.get_weights(self) - def load_weights(self, filepath, by_name=False, skip_mismatch=False): - """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. - - If `by_name` is False weights are loaded based on the network's - topology. 
This means the architecture should be the same as when the weights - were saved. Note that layers that don't have weights are not taken into - account in the topological ordering, so adding or removing layers is fine as - long as they don't have weights. - - If `by_name` is True, weights are loaded into layers only if they share the - same name. This is useful for fine-tuning or transfer-learning models where - some of the layers have changed. - - Only topological loading (`by_name=False`) is supported when loading weights - from the TensorFlow format. Note that topological loading differs slightly - between TensorFlow and HDF5 formats for user-defined classes inheriting from - `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the - TensorFlow format loads based on the object-local names of attributes to - which layers are assigned in the `Model`'s constructor. - - Args: - filepath: String, path to the weights file to load. For weight files in - TensorFlow format, this is the file prefix (the same as was passed - to `save_weights`). - by_name: Boolean, whether to load weights by name or by topological - order. Only topological loading is supported for weight files in - TensorFlow format. - skip_mismatch: Boolean, whether to skip loading of layers where there is - a mismatch in the number of weights, or a mismatch in the shape of - the weight (only valid when `by_name=True`). - - Returns: - When loading a weight file in TensorFlow format, returns the same status - object as `tf.train.Checkpoint.restore`. When graph building, restore - ops are run automatically as soon as the network is built (on first call - for user-defined classes inheriting from `Model`, immediately if it is - already built). - - When loading weights in HDF5 format, returns `None`. - - Raises: - ImportError: If h5py is not available and the weight file is in HDF5 - format. - ValueError: If `skip_mismatch` is set to `True` when `by_name` is - `False`. - """ - if backend.is_tpu_strategy(self._distribution_strategy): - if (self._distribution_strategy.extended.steps_per_run > 1 and - (not saving_utils.is_hdf5_filepath(filepath))): # pylint: disable=protected-access - raise ValueError('Load weights is not yet supported with TPUStrategy ' - 'with steps_per_run greater than 1.') - return super().load_weights(filepath, by_name, skip_mismatch) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def compile(self, - optimizer='rmsprop', - loss=None, - metrics=None, - loss_weights=None, - sample_weight_mode=None, - weighted_metrics=None, - target_tensors=None, - distribute=None, - **kwargs): - """Configures the model for training. + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # initializing _distribution_strategy here since it is possible to call + # predict on a model without compiling it. + self._distribution_strategy = None + self._compile_time_distribution_strategy = None + if ( + tf.compat.v1.executing_eagerly_outside_functions() + and tf.distribute.has_strategy() + ): + self._set_strategy(tf.distribute.get_strategy()) + + # This flag is used to track if the user is using the deprecated path of + # passing distribution strategy to compile rather than creating the + # model under distribution strategy scope. 
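+ # (i.e. the deprecated `compile(..., distribute=strategy)` call, which + # is rejected when TF 2 behavior is enabled, versus building the model + # inside `with strategy.scope():`.)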
+ self._compile_distribution = False + + self._run_eagerly = None + self._experimental_run_tf_function = ( + tf.compat.v1.executing_eagerly_outside_functions() + ) + + self._v1_compile_was_called = False + + def _init_batch_counters(self): + pass # Batch counters should not be created in legacy graph mode. + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _set_strategy(self, strategy): + self._compile_time_distribution_strategy = strategy + + def get_weights(self): + """Retrieves the weights of the model. + + Returns: + A flat list of Numpy arrays. + """ + strategy = ( + self._distribution_strategy + or self._compile_time_distribution_strategy + ) + if strategy: + with strategy.scope(): + return base_layer.Layer.get_weights(self) + return base_layer.Layer.get_weights(self) - Args: - optimizer: String (name of optimizer) or optimizer instance. - See `tf.keras.optimizers`. - loss: String (name of objective function), objective function or - `tf.keras.losses.Loss` instance. See `tf.keras.losses`. An objective - function is any callable with the signature - `scalar_loss = fn(y_true, y_pred)`. If the model has multiple - outputs, you can use a different loss on each output by passing a - dictionary or a list of losses. The loss value that will be - minimized by the model will then be the sum of all individual - losses. - metrics: List of metrics to be evaluated by the model during training - and testing. Typically you will use `metrics=['accuracy']`. - To specify different metrics for different outputs of a - multi-output model, you could also pass a dictionary, such as - `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`. - You can also pass a list (len = len(outputs)) of lists of metrics - such as `metrics=[['accuracy'], ['accuracy', 'mse']]` or - `metrics=['accuracy', ['accuracy', 'mse']]`. - loss_weights: Optional list or dictionary specifying scalar - coefficients (Python floats) to weight the loss contributions - of different model outputs. - The loss value that will be minimized by the model - will then be the *weighted sum* of all individual losses, - weighted by the `loss_weights` coefficients. - If a list, it is expected to have a 1:1 mapping - to the model's outputs. If a tensor, it is expected to map - output names (strings) to scalar coefficients. - sample_weight_mode: If you need to do timestep-wise - sample weighting (2D weights), set this to `"temporal"`. - `None` defaults to sample-wise weights (1D). - If the model has multiple outputs, you can use a different - `sample_weight_mode` on each output by passing a - dictionary or a list of modes. - weighted_metrics: List of metrics to be evaluated and weighted - by sample_weight or class_weight during training and testing. - target_tensors: By default, Keras will create placeholders for the - model's target, which will be fed with the target data during - training. If instead you would like to use your own - target tensors (in turn, Keras will not expect external - Numpy data for these targets at training time), you - can specify them via the `target_tensors` argument. It can be - a single tensor (for a single-output model), a list of tensors, - or a dict mapping output names to target tensors. - distribute: NOT SUPPORTED IN TF 2.0, please create and compile the - model under distribution strategy scope instead of passing it to - compile. - **kwargs: Any additional arguments. - - Raises: - ValueError: In case of invalid arguments for - `optimizer`, `loss`, `metrics` or `sample_weight_mode`. 
- """ - self._assert_built_as_v1() - self._run_eagerly = kwargs.pop('run_eagerly', None) - self._experimental_run_tf_function = kwargs.pop( - 'experimental_run_tf_function', True) - self._v1_compile_was_called = True - - # Prepare Session arguments (legacy). - kwargs.pop('cloning', None) # Legacy DistStrat argument, never used. - self._from_serialized = kwargs.pop('from_serialized', False) - allowed_kwargs = {'feed_dict', 'fetches', 'options', 'run_metadata'} - unknown_kwargs = set(kwargs.keys()) - allowed_kwargs - if unknown_kwargs: - raise TypeError( - 'Invalid keyword argument(s) in `compile`: %s' % (unknown_kwargs,)) - self._function_kwargs = kwargs - if self._function_kwargs: - self._experimental_run_tf_function = False - if self.run_eagerly: - raise ValueError( - 'Session keyword arguments are not supported ' - 'when `run_eagerly=True`. You passed the following ' - 'Session arguments: %s' % (self._function_kwargs,)) - - self._set_optimizer(optimizer) - is_any_keras_optimizer_v1 = any( - (isinstance(opt, optimizer_v1.Optimizer) - and not isinstance(opt, optimizer_v1.TFOptimizer) - ) for opt in tf.nest.flatten(self.optimizer)) - - if is_any_keras_optimizer_v1 and tf.compat.v1.executing_eagerly_outside_functions(): - raise ValueError('`tf.compat.v1.keras` Optimizer (', optimizer, ') is ' - 'not supported when eager execution is enabled. Use a ' - '`tf.keras` Optimizer instead, or disable eager ' - 'execution.') - - if ((target_tensors is not None) - or not tf.compat.v1.executing_eagerly_outside_functions()): - # Fallback out of things that aren't supported with v2 loops - self._experimental_run_tf_function = False - - if distribute is not None: - if tf.__internal__.tf2.enabled() or self._experimental_run_tf_function: - raise ValueError( - 'Distribute argument in compile is not available in TF 2.0 please ' - 'create the model under the distribution strategy scope.') - logging.warning('Distribute argument in compile is deprecated please ' - 'create the model under the distribution strategy scope.') - self._distribution_strategy = distribute - self._compile_distribution = True - else: - if tf.distribute.has_strategy(): - # When the user builds the model in the DS scope and cross replica - # context we want distribution strategy to be set but when building the - # replica copies of the models internally we should not be compiling - # with distribution strategy and use the default compilation path. - if tf.distribute.in_cross_replica_context(): - self._distribution_strategy = ( - tf.distribute.get_strategy()) - - if isinstance(self._distribution_strategy, - tf.compat.v1.distribute.experimental.ParameterServerStrategy): - raise NotImplementedError( - '`tf.compat.v1.distribute.experimental.ParameterServerStrategy` ' - 'currently only works with the tf.Estimator API') - - if isinstance(self._distribution_strategy, - tf.distribute.experimental.ParameterServerStrategy): - raise NotImplementedError( - '`tf.distribute.experimental.ParameterServerStrategy` is only ' - 'supported in TF2.') - - if not self._experimental_run_tf_function: - self._validate_compile_param_for_distribution_strategy(self.run_eagerly, - sample_weight_mode, - target_tensors, - weighted_metrics) - # We've disabled automatic dependency tracking for this method, but do want - # to add a checkpoint dependency on the optimizer if it's trackable. 
- if isinstance(self.optimizer, tf.__internal__.tracking.Trackable): - self._track_trackable( - self.optimizer, name='optimizer', overwrite=True) - self.loss = loss or {} - self.loss_weights = loss_weights - self.sample_weight_mode = sample_weight_mode - self._compile_metrics = metrics or [] - self._compile_weighted_metrics = weighted_metrics - if self.run_eagerly and target_tensors is not None: - raise ValueError( - 'target_tensors argument is not supported when ' - 'running a model eagerly.') - - # _training_endpoints contains a list of _TrainingEndpoint object, which has - # all the model output/target/loss and related metadata. - self._training_endpoints = [] - - # Used to freeze the behavior of the Model once `compile` has been called. - self._compiled_trainable_state = self._get_trainable_state() - - # Set tf.distribute.Strategy specific parameters. - self._distributed_model_cache = {} - self._distributed_function_cache = {} - - # Clear any `_eager_losses` that was added. - self._clear_losses() - - if (not tf.executing_eagerly() and - self._distribution_strategy is not None): - # Ensures a Session is created and configured correctly for Distribution - # Strategy. - backend.configure_and_create_distributed_session( - self._distribution_strategy) - # Initialize model metric attributes. - self._init_metric_attributes() - if not self.built or not self.inputs or not self.outputs: - # Model is not compilable because it does not know its number of inputs - # and outputs, nor their shapes and names. We will compile after the first - # time the model gets called on training data. - return - self._is_compiled = True - base_layer.keras_api_gauge.get_cell('compile').set(True) - - # Prepare list of loss functions, same size of model outputs. - self.loss_functions = training_utils_v1.prepare_loss_functions( - self.loss, self.output_names) - - target_tensors = self._process_target_tensor_for_compile(target_tensors) - - for o, n, l, t in zip(self.outputs, self.output_names, - self.loss_functions, target_tensors): - endpoint = _TrainingEndpoint(o, n, l) - endpoint.create_training_target(t, run_eagerly=self.run_eagerly) - self._training_endpoints.append(endpoint) - - # Prepare list loss weights, same size of model outputs. - training_utils_v1.prepare_loss_weights(self._training_endpoints, - loss_weights) - - # Initialization for Eager mode execution. - if self.run_eagerly: - self._compile_eagerly(metrics, weighted_metrics, sample_weight_mode) - return - - with backend.get_graph().as_default(): - # Save all metric attributes per output of the model. - self._cache_output_metric_attributes(metrics, weighted_metrics) - - # Set metric attributes on model. - self._set_metric_attributes() - - # Invoke metric functions (unweighted) for all the outputs. - self._handle_metrics( - self.outputs, - targets=self._targets, - skip_target_masks=self._prepare_skip_target_masks(), - masks=self._prepare_output_masks()) - - # Prepare sample weight modes. List with the same length as model outputs. - training_utils_v1.prepare_sample_weight_modes( - self._training_endpoints, sample_weight_mode) - - # Creates the model loss and weighted metrics sub-graphs. - self._compile_weights_loss_and_weighted_metrics() - - # Functions for train, test and predict will - # be compiled lazily when required. - # This saves time when the user is not using all functions. - self.train_function = None - self.test_function = None - self.predict_function = None - - # Collected trainable weights, sorted in topological order. 
- self._collected_trainable_weights = self.trainable_weights - - # Validate all variables were correctly created in distribution scope. - if self._distribution_strategy and not self._compile_distribution: - for v in self.variables: - strategy = self._distribution_strategy - if not strategy.extended.variable_created_in_scope(v): + def load_weights(self, filepath, by_name=False, skip_mismatch=False): + """Loads all layer weights, either from a TensorFlow or an HDF5 file. + + If `by_name` is False weights are loaded based on the network's + topology. This means the architecture should be the same as when the + weights were saved. Note that layers that don't have weights are not + taken into account in the topological ordering, so adding or removing + layers is fine as long as they don't have weights. + + If `by_name` is True, weights are loaded into layers only if they share + the same name. This is useful for fine-tuning or transfer-learning + models where some of the layers have changed. + + Only topological loading (`by_name=False`) is supported when loading + weights from the TensorFlow format. Note that topological loading + differs slightly between TensorFlow and HDF5 formats for user-defined + classes inheriting from `tf.keras.Model`: HDF5 loads based on a + flattened list of weights, while the TensorFlow format loads based on + the object-local names of attributes to which layers are assigned in the + `Model`'s constructor. + + Args: + filepath: String, path to the weights file to load. For weight files + in TensorFlow format, this is the file prefix (the same as was + passed to `save_weights`). + by_name: Boolean, whether to load weights by name or by topological + order. Only topological loading is supported for weight files in + TensorFlow format. + skip_mismatch: Boolean, whether to skip loading of layers where + there is a mismatch in the number of weights, or a mismatch in + the shape of the weight (only valid when `by_name=True`). + + Returns: + When loading a weight file in TensorFlow format, returns the same + status object as `tf.train.Checkpoint.restore`. When graph building, + restore ops are run automatically as soon as the network is built + (on first call for user-defined classes inheriting from `Model`, + immediately if it is already built). + + When loading weights in HDF5 format, returns `None`. + + Raises: + ImportError: If h5py is not available and the weight file is in HDF5 + format. + ValueError: If `skip_mismatch` is set to `True` when `by_name` is + `False`. + """ + if backend.is_tpu_strategy(self._distribution_strategy): + if self._distribution_strategy.extended.steps_per_run > 1 and ( + not saving_utils.is_hdf5_filepath(filepath) + ): + raise ValueError( + "Load weights is not yet supported with TPUStrategy " + "with steps_per_run greater than 1." + ) + return super().load_weights( + filepath, by_name=by_name, skip_mismatch=skip_mismatch + ) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def compile( + self, + optimizer="rmsprop", + loss=None, + metrics=None, + loss_weights=None, + sample_weight_mode=None, + weighted_metrics=None, + target_tensors=None, + distribute=None, + **kwargs, + ): + """Configures the model for training. + + Args: + optimizer: String (name of optimizer) or optimizer instance. + See `tf.keras.optimizers`. + loss: String (name of objective function), objective function or + `tf.keras.losses.Loss` instance. See `tf.keras.losses`. 
An + objective function is any callable with the signature + `scalar_loss = fn(y_true, y_pred)`. If the model has multiple + outputs, you can use a different loss on each output by passing + a dictionary or a list of losses. The loss value that will be + minimized by the model will then be the sum of all individual + losses. + metrics: List of metrics to be evaluated by the model during + training and testing. Typically you will use + `metrics=['accuracy']`. To specify different metrics for + different outputs of a multi-output model, you could also pass a + dictionary, such as `metrics={'output_a': 'accuracy', + 'output_b': ['accuracy', 'mse']}`. You can also pass a list + (len = len(outputs)) of lists of metrics such as + `metrics=[['accuracy'], ['accuracy', 'mse']]` or + `metrics=['accuracy', ['accuracy', 'mse']]`. + loss_weights: Optional list or dictionary specifying scalar + coefficients (Python floats) to weight the loss contributions + of different model outputs. + The loss value that will be minimized by the model + will then be the *weighted sum* of all individual losses, + weighted by the `loss_weights` coefficients. + If a list, it is expected to have a 1:1 mapping + to the model's outputs. If a dict, it is expected to map + output names (strings) to scalar coefficients. + sample_weight_mode: If you need to do timestep-wise + sample weighting (2D weights), set this to `"temporal"`. + `None` becomes sample-wise weights (1D). + If the model has multiple outputs, you can use a different + `sample_weight_mode` on each output by passing a + dictionary or a list of modes. Defaults to `None`. + weighted_metrics: List of metrics to be evaluated and weighted + by sample_weight or class_weight during training and testing. + target_tensors: By default, Keras will create placeholders for the + model's target, which will be fed with the target data during + training. If instead you would like to use your own + target tensors (in turn, Keras will not expect external + Numpy data for these targets at training time), you + can specify them via the `target_tensors` argument. It can be + a single tensor (for a single-output model), a list of tensors, + or a dict mapping output names to target tensors. + distribute: NOT SUPPORTED IN TF 2.0, please create and compile the + model under distribution strategy scope instead of passing it to + compile. + **kwargs: Any additional arguments. + + Raises: + ValueError: In case of invalid arguments for + `optimizer`, `loss`, `metrics` or `sample_weight_mode`. + """ + self._assert_built_as_v1() + self._run_eagerly = kwargs.pop("run_eagerly", None) + self._experimental_run_tf_function = kwargs.pop( + "experimental_run_tf_function", True + ) + self._v1_compile_was_called = True + + # Prepare Session arguments (legacy). + kwargs.pop("cloning", None) # Legacy DistStrat argument, never used. + self._from_serialized = kwargs.pop("from_serialized", False) + allowed_kwargs = {"feed_dict", "fetches", "options", "run_metadata"} + unknown_kwargs = set(kwargs.keys()) - allowed_kwargs + if unknown_kwargs: + raise TypeError( + f"Invalid keyword argument(s) in `compile`: {unknown_kwargs}" + ) + self._function_kwargs = kwargs + if self._function_kwargs: + self._experimental_run_tf_function = False + if self.run_eagerly: + raise ValueError( + "Session keyword arguments are not supported " + "when `run_eagerly=True`.
You passed the following " + "Session arguments: %s" % (self._function_kwargs,) + ) + + self._set_optimizer(optimizer) + is_any_keras_optimizer_v1 = any( + ( + isinstance(opt, optimizer_v1.Optimizer) + and not isinstance(opt, optimizer_v1.TFOptimizer) + ) + for opt in tf.nest.flatten(self.optimizer) + ) + + if ( + is_any_keras_optimizer_v1 + and tf.compat.v1.executing_eagerly_outside_functions() + ): raise ValueError( - 'Variable (%s) was not created in the distribution strategy ' - 'scope of (%s). It is most likely due to not all layers or ' - 'the model or optimizer being created outside the distribution ' - 'strategy scope. Try to make sure your code looks similar ' - 'to the following.\n' - 'with strategy.scope():\n' - ' model=_create_model()\n' - ' model.compile(...)'% (v, strategy)) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _init_distributed_function_cache_if_not_compiled(self): - if not hasattr(self, '_distributed_function_cache'): - self._distributed_function_cache = {} - - @property - def metrics(self): - """Returns the model's metrics added using `compile`, `add_metric` APIs.""" - metrics = [] - if self._is_compiled: - if not hasattr(self, '_v1_compile_was_called'): - # See b/155687393 for more details, the model is created as a v2 - # instance but converted to v1. Fallback to use base Model to retrieve - # the metrics. - return super().metrics - metrics += self._compile_metric_functions - metrics.extend(self._metrics) - metrics.extend( - _get_metrics_from_layers( - list(self._flatten_layers(include_self=False, recursive=False)))) - return metrics - - @property - def metrics_names(self): - """Returns the model's display labels for all outputs.""" - - # This property includes all output names including `loss` and per-output - # losses for backward compatibility. - metrics_names = ['loss'] - if self._is_compiled: - if not hasattr(self, '_v1_compile_was_called'): - # See b/155687393 for more details, the model is created as a v2 - # instance but converted to v1. Fallback to use base Model to retrieve - # the metrics name - return super().metrics_names - - # Add output loss metric names to the metric names list. - if len(self._training_endpoints) > 1: - metrics_names.extend([ - e.loss_name() - for e in self._training_endpoints - if not e.should_skip_target() - ]) - - # Add all metric names. - metrics_names += [m.name for m in self.metrics] - return metrics_names - - @property - def run_eagerly(self): - """Settable attribute indicating whether the model should run eagerly. - - Running eagerly means that your model will be run step by step, - like Python code. Your model might run slower, but it should become easier - for you to debug it by stepping into individual layer calls. - - By default, we will attempt to compile your model to a static graph to - deliver the best execution performance. - - Returns: - Boolean, whether the model should run eagerly. - """ - if self._run_eagerly is True and not tf.executing_eagerly(): - raise ValueError('You can only set `run_eagerly=True` if eager execution ' - 'is enabled.') - if not self.dynamic: - if self._run_eagerly is None: - # Respect `tf.config.run_functions_eagerly` unless - # `run_eagerly` was explicitly passed to `compile`. - return tf.config.functions_run_eagerly() - else: - return self._run_eagerly - else: - if not tf.executing_eagerly(): - raise ValueError('Your model contains layers that can only be ' - 'successfully run in eager execution (layers ' - 'constructed with `dynamic=True`). 
' - 'You must enable eager execution with ' - '`tf.enable_eager_execution()`.') - if self._run_eagerly is False: - # TODO(fchollet): consider using py_func to enable this. - raise ValueError('Your model contains layers that can only be ' - 'successfully run in eager execution (layers ' - 'constructed with `dynamic=True`). ' - 'You cannot set `run_eagerly=False`.') - return tf.executing_eagerly() - - @run_eagerly.setter - def run_eagerly(self, value): - self._run_eagerly = value - - def _select_training_loop(self, inputs): - """Select training loop for fit/eval/predict based on the inputs.""" - # TODO(kaftan) or TODO(scottzhu): This check should eventually be nicely - # integrated into the data adapters in the v2 loop. We can't do this yet - # because we currently have to fall back for unhandled data types. - if isinstance(inputs, (tf.compat.v1.data.Iterator, - tf.data.Iterator)): - raise ValueError('For performance reasons Keras `fit`, `evaluate` and' - '`predict` accept tf.data `Datasets` as input but not ' - 'iterators that have been manually generated from ' - 'Datasets by users. Please directly pass in the ' - 'original `Dataset` object instead of passing in ' - '`iter(dataset)`.') - - # Case 1: distribution strategy. - if self._distribution_strategy: - if self._in_multi_worker_mode(): - return training_distributed_v1.DistributionMultiWorkerTrainingLoop( - training_distributed_v1.DistributionSingleWorkerTrainingLoop()) - else: - return training_distributed_v1.DistributionSingleWorkerTrainingLoop() - - # Case 2: generator-like. Input is Python generator, or Sequence object, - # or a non-distributed Dataset or iterator in eager execution. - if data_utils.is_generator_or_sequence(inputs): - return training_generator_v1.GeneratorOrSequenceTrainingLoop() - if training_utils_v1.is_eager_dataset_or_iterator(inputs): - return training_generator_v1.EagerDatasetOrIteratorTrainingLoop() - - # Case 3: Symbolic tensors or Numpy array-like. - # This includes Datasets and iterators in graph mode (since they - # generate symbolic tensors). - if self.run_eagerly: - return training_generator_v1.GeneratorLikeTrainingLoop() - else: - return training_arrays_v1.ArrayLikeTrainingLoop() - - def fit(self, - x=None, - y=None, - batch_size=None, - epochs=1, - verbose=1, - callbacks=None, - validation_split=0., - validation_data=None, - shuffle=True, - class_weight=None, - sample_weight=None, - initial_epoch=0, - steps_per_epoch=None, - validation_steps=None, - validation_freq=1, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - **kwargs): - """Trains the model for a fixed number of epochs (iterations on a dataset). - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. Should return a tuple - of either `(inputs, targets)` or - `(inputs, targets, sample_weights)`. - - A generator or `keras.utils.Sequence` returning `(inputs, targets)` - or `(inputs, targets, sample weights)`. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). 
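# A minimal sketch of the per-output `loss` / `loss_weights` / `metrics`
# mappings from the `compile` contract documented above; the two-output
# model and its layer names are illustrative, not from this diff.
import tensorflow as tf

inputs = tf.keras.Input(shape=(8,))
out_a = tf.keras.layers.Dense(1, name="output_a")(inputs)
out_b = tf.keras.layers.Dense(3, activation="softmax", name="output_b")(inputs)
model = tf.keras.Model(inputs, [out_a, out_b])

model.compile(
    optimizer="rmsprop",
    loss={"output_a": "mse", "output_b": "categorical_crossentropy"},
    # Total loss minimized = 1.0 * loss_a + 0.2 * loss_b.
    loss_weights={"output_a": 1.0, "output_b": 0.2},
    metrics={"output_a": ["mae"], "output_b": ["accuracy"]},
)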
If `x` is a dataset, generator, - or `keras.utils.Sequence` instance, `y` should - not be specified (since targets will be obtained from `x`). - batch_size: Integer or `None`. - Number of samples per gradient update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of symbolic tensors, datasets, - generators, or `keras.utils.Sequence` instances (since they generate - batches). - epochs: Integer. Number of epochs to train the model. - An epoch is an iteration over the entire `x` and `y` - data provided. - Note that in conjunction with `initial_epoch`, - `epochs` is to be understood as "final epoch". - The model is not trained for a number of iterations - given by `epochs`, but merely until the epoch - of index `epochs` is reached. - verbose: 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = one line per epoch. - Note that the progress bar is not particularly useful when - logged to a file, so verbose=2 is recommended when not running - interactively (eg, in a production environment). - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during training. - See `tf.keras.callbacks`. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - The model will set apart this fraction of the training data, - will not train on it, and will evaluate - the loss and any model metrics - on this data at the end of each epoch. - The validation data is selected from the last samples - in the `x` and `y` data provided, before shuffling. This argument is - not supported when `x` is a dataset, generator or - `keras.utils.Sequence` instance. - validation_data: Data on which to evaluate - the loss and any model metrics at the end of each epoch. - The model will not be trained on this data. - `validation_data` will override `validation_split`. - `validation_data` could be: - - tuple `(x_val, y_val)` of Numpy arrays or tensors - - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays - - dataset - For the first two cases, `batch_size` must be provided. - For the last case, `validation_steps` could be provided. - shuffle: Boolean (whether to shuffle the training data - before each epoch) or str (for 'batch'). - 'batch' is a special option for dealing with the - limitations of HDF5 data; it shuffles in batch-sized chunks. - Has no effect when `steps_per_epoch` is not `None`. - class_weight: Optional dictionary mapping class indices (integers) - to a weight (float) value, used for weighting the loss function - (during training only). - This can be useful to tell the model to - "pay more attention" to samples from - an under-represented class. - sample_weight: Optional Numpy array of weights for - the training samples, used for weighting the loss function - (during training only). You can either pass a flat (1D) - Numpy array with the same length as the input samples - (1:1 mapping between weights and samples), - or in the case of temporal data, - you can pass a 2D array with shape - `(samples, sequence_length)`, - to apply a different weight to every timestep of every sample. - In this case you should make sure to specify - `sample_weight_mode="temporal"` in `compile()`. This argument is not - supported when `x` is a dataset, generator, or - `keras.utils.Sequence` instance, instead provide the sample_weights - as the third element of `x`. - initial_epoch: Integer. - Epoch at which to start training - (useful for resuming a previous training run). 
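# A small sketch of the `callbacks` and `validation_split` arguments
# described above, using the standard `EarlyStopping` callback; the
# arrays are synthetic placeholders.
import numpy as np
import tensorflow as tf

x = np.random.rand(100, 8).astype("float32")
y = np.random.rand(100, 1).astype("float32")

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
model.compile("rmsprop", "mse")

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)
model.fit(
    x, y,
    validation_split=0.2,    # last 20% of the arrays, taken before shuffling
    epochs=50,
    callbacks=[early_stop],  # may end training well before epoch 50
    verbose=0,
)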
- steps_per_epoch: Integer or `None`. - Total number of steps (batches of samples) - before declaring one epoch finished and starting the - next epoch. When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps_per_epoch' - is None, the epoch will run until the input dataset is exhausted. - This argument is not supported with array inputs. - validation_steps: Only relevant if `validation_data` is provided and - is a `tf.data` dataset. Total number of steps (batches of - samples) to draw before stopping when performing validation - at the end of every epoch. If 'validation_steps' is None, validation - will run until the `validation_data` dataset is exhausted. In the - case of a infinite dataset, it will run into a infinite loop. - If 'validation_steps' is specified and only part of the dataset - will be consumed, the evaluation will start from the beginning of - the dataset at each epoch. This ensures that the same validation - samples are used every time. - validation_freq: Only relevant if validation data is provided. Integer - or `collections.abc.Container` instance (e.g. list, tuple, etc.). - If an integer, specifies how many training epochs to run before a - new validation run is performed, e.g. `validation_freq=2` runs - validation every 2 epochs. If a Container, specifies the epochs on - which to run validation, e.g. `validation_freq=[1, 2, 10]` runs - validation at the end of the 1st, 2nd, and 10th epochs. - max_queue_size: Integer. Used for generator or `keras.utils.Sequence` - input only. Maximum size for the generator queue. - If unspecified, `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up - when using process-based threading. If unspecified, `workers` - will default to 1. If 0, will execute the generator on the main - thread. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children processes. - **kwargs: Used for backwards compatibility. + "`tf.compat.v1.keras` Optimizer (", + optimizer, + ") is " + "not supported when eager execution is enabled. Use a " + "`tf.keras` Optimizer instead, or disable eager " + "execution.", + ) + + if ( + target_tensors is not None + ) or not tf.compat.v1.executing_eagerly_outside_functions(): + # Fallback out of things that aren't supported with v2 loops + self._experimental_run_tf_function = False + + if distribute is not None: + if ( + tf.__internal__.tf2.enabled() + or self._experimental_run_tf_function + ): + raise ValueError( + "Distribute argument in compile is not available in TF 2.0 " + "please create the model under the distribution strategy " + "scope." + ) + logging.warning( + "Distribute argument in compile is deprecated please " + "create the model under the distribution strategy scope." 
+ ) + self._distribution_strategy = distribute + self._compile_distribution = True + else: + if tf.distribute.has_strategy(): + # When the user builds the model in the DS scope and cross + # replica context we want distribution strategy to be set but + # when building the replica copies of the models internally we + # should not be compiling with distribution strategy and use the + # default compilation path. + if tf.distribute.in_cross_replica_context(): + self._distribution_strategy = tf.distribute.get_strategy() + + if isinstance( + self._distribution_strategy, + tf.compat.v1.distribute.experimental.ParameterServerStrategy, + ): + raise NotImplementedError( + "`tf.compat.v1.distribute.experimental.ParameterServerStrategy`" + " currently only works with the tf.Estimator API" + ) + + if isinstance( + self._distribution_strategy, + tf.distribute.experimental.ParameterServerStrategy, + ): + raise NotImplementedError( + "`tf.distribute.experimental.ParameterServerStrategy` is only " + "supported in TF2." + ) + + if not self._experimental_run_tf_function: + self._validate_compile_param_for_distribution_strategy( + self.run_eagerly, + sample_weight_mode, + target_tensors, + weighted_metrics, + ) + # We've disabled automatic dependency tracking for this method, but do + # want to add a checkpoint dependency on the optimizer if it's + # trackable. + if isinstance(self.optimizer, tf.__internal__.tracking.Trackable): + self._track_trackable( + self.optimizer, name="optimizer", overwrite=True + ) + self.loss = loss or {} + self.loss_weights = loss_weights + self.sample_weight_mode = sample_weight_mode + self._compile_metrics = metrics or [] + self._compile_weighted_metrics = weighted_metrics + if self.run_eagerly and target_tensors is not None: + raise ValueError( + "target_tensors argument is not supported when " + "running a model eagerly." + ) + + # _training_endpoints contains a list of _TrainingEndpoint object, which + # has all the model output/target/loss and related metadata. + self._training_endpoints = [] + + # Used to freeze the behavior of the Model once `compile` has been + # called. + self._compiled_trainable_state = self._get_trainable_state() + + # Set tf.distribute.Strategy specific parameters. + self._distributed_model_cache = {} + self._distributed_function_cache = {} + + # Clear any `_eager_losses` that was added. + self._clear_losses() + + if ( + not tf.executing_eagerly() + and self._distribution_strategy is not None + ): + # Ensures a Session is created and configured correctly for + # Distribution Strategy. + backend.configure_and_create_distributed_session( + self._distribution_strategy + ) + # Initialize model metric attributes. + self._init_metric_attributes() + if not self.built or not self.inputs or not self.outputs: + # Model is not compilable because it does not know its number of + # inputs and outputs, nor their shapes and names. We will compile + # after the first time the model gets called on training data. + return + self._is_compiled = True + base_layer.keras_api_gauge.get_cell("compile").set(True) + + # Prepare list of loss functions, same size of model outputs. 
+ self.loss_functions = training_utils_v1.prepare_loss_functions( + self.loss, self.output_names + ) + + target_tensors = self._process_target_tensor_for_compile(target_tensors) + + for o, n, l, t in zip( + self.outputs, self.output_names, self.loss_functions, target_tensors + ): + endpoint = _TrainingEndpoint(o, n, l) + endpoint.create_training_target(t, run_eagerly=self.run_eagerly) + self._training_endpoints.append(endpoint) + + # Prepare list loss weights, same size of model outputs. + training_utils_v1.prepare_loss_weights( + self._training_endpoints, loss_weights + ) + + # Initialization for Eager mode execution. + if self.run_eagerly: + self._compile_eagerly(metrics, weighted_metrics, sample_weight_mode) + return + + with backend.get_graph().as_default(): + # Save all metric attributes per output of the model. + self._cache_output_metric_attributes(metrics, weighted_metrics) + + # Set metric attributes on model. + self._set_metric_attributes() + + # Invoke metric functions (unweighted) for all the outputs. + self._handle_metrics( + self.outputs, + targets=self._targets, + skip_target_masks=self._prepare_skip_target_masks(), + masks=self._prepare_output_masks(), + ) + + # Prepare sample weight modes. List with the same length as model + # outputs. + training_utils_v1.prepare_sample_weight_modes( + self._training_endpoints, sample_weight_mode + ) + + # Creates the model loss and weighted metrics sub-graphs. + self._compile_weights_loss_and_weighted_metrics() + + # Functions for train, test and predict will + # be compiled lazily when required. + # This saves time when the user is not using all functions. + self.train_function = None + self.test_function = None + self.predict_function = None + + # Collected trainable weights, sorted in topological order. + self._collected_trainable_weights = self.trainable_weights + + # Validate all variables were correctly created in distribution + # scope. + if self._distribution_strategy and not self._compile_distribution: + for v in self.variables: + strategy = self._distribution_strategy + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + "Variable (%s) was not created in the distribution " + "strategy scope of (%s). It is most likely due to " + "not all layers or the model or optimizer being " + "created outside the distribution strategy scope. " + "Try to make sure your code looks similar " + "to the following.\n" + "with strategy.scope():\n" + " model=_create_model()\n" + " model.compile(...)" % (v, strategy) + ) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _init_distributed_function_cache_if_not_compiled(self): + if not hasattr(self, "_distributed_function_cache"): + self._distributed_function_cache = {} + + @property + def metrics(self): + """Returns the model's metrics added using `compile`, `add_metric` + APIs.""" + metrics = [] + if self._is_compiled: + if not hasattr(self, "_v1_compile_was_called"): + # See b/155687393 for more details, the model is created as a v2 + # instance but converted to v1. Fallback to use base Model to + # retrieve the metrics. 
+ return super().metrics + metrics += self._compile_metric_functions + metrics.extend(self._metrics) + metrics.extend( + _get_metrics_from_layers( + list(self._flatten_layers(include_self=False, recursive=False)) + ) + ) + return metrics + + @property + def metrics_names(self): + """Returns the model's display labels for all outputs.""" + + # This property includes all output names including `loss` and + # per-output losses for backward compatibility. + metrics_names = ["loss"] + if self._is_compiled: + if not hasattr(self, "_v1_compile_was_called"): + # See b/155687393 for more details, the model is created as a v2 + # instance but converted to v1. Fallback to use base Model to + # retrieve the metrics name + return super().metrics_names + + # Add output loss metric names to the metric names list. + if len(self._training_endpoints) > 1: + metrics_names.extend( + [ + e.loss_name() + for e in self._training_endpoints + if not e.should_skip_target() + ] + ) + + # Add all metric names. + metrics_names += [m.name for m in self.metrics] + return metrics_names + + @property + def run_eagerly(self): + """Settable attribute indicating whether the model should run eagerly. + + Running eagerly means that your model will be run step by step, + like Python code. Your model might run slower, but it should become + easier for you to debug it by stepping into individual layer calls. + + By default, we will attempt to compile your model to a static graph to + deliver the best execution performance. + + Returns: + Boolean, whether the model should run eagerly. + """ + if self._run_eagerly is True and not tf.executing_eagerly(): + raise ValueError( + "You can only set `run_eagerly=True` if eager execution " + "is enabled." + ) + if not self.dynamic: + if self._run_eagerly is None: + # Respect `tf.config.run_functions_eagerly` unless + # `run_eagerly` was explicitly passed to `compile`. + return tf.config.functions_run_eagerly() + else: + return self._run_eagerly + else: + if not tf.executing_eagerly(): + raise ValueError( + "Your model contains layers that can only be " + "successfully run in eager execution (layers " + "constructed with `dynamic=True`). " + "You must enable eager execution with " + "`tf.enable_eager_execution()`." + ) + if self._run_eagerly is False: + # TODO(fchollet): consider using py_func to enable this. + raise ValueError( + "Your model contains layers that can only be " + "successfully run in eager execution (layers " + "constructed with `dynamic=True`). " + "You cannot set `run_eagerly=False`." + ) + return tf.executing_eagerly() + + @run_eagerly.setter + def run_eagerly(self, value): + self._run_eagerly = value + + def _select_training_loop(self, inputs): + """Select training loop for fit/eval/predict based on the inputs.""" + # TODO(kaftan) or TODO(scottzhu): This check should eventually be nicely + # integrated into the data adapters in the v2 loop. We can't do this yet + # because we currently have to fall back for unhandled data types. + if isinstance(inputs, (tf.compat.v1.data.Iterator, tf.data.Iterator)): + raise ValueError( + "For performance reasons Keras `fit`, `evaluate` and" + "`predict` accept tf.data `Datasets` as input but not " + "iterators that have been manually generated from " + "Datasets by users. Please directly pass in the " + "original `Dataset` object instead of passing in " + "`iter(dataset)`." + ) + + # Case 1: distribution strategy. 
+ if self._distribution_strategy: + if self._in_multi_worker_mode(): + return training_distributed_v1.DistributionMultiWorkerTrainingLoop( # noqa: E501 + training_distributed_v1.DistributionSingleWorkerTrainingLoop() # noqa: E501 + ) + else: + return ( + training_distributed_v1.DistributionSingleWorkerTrainingLoop() # noqa: E501 + ) + + # Case 2: generator-like. Input is Python generator, or Sequence object, + # or a non-distributed Dataset or iterator in eager execution. + if data_utils.is_generator_or_sequence(inputs): + return training_generator_v1.GeneratorOrSequenceTrainingLoop() + if training_utils_v1.is_eager_dataset_or_iterator(inputs): + return training_generator_v1.EagerDatasetOrIteratorTrainingLoop() + + # Case 3: Symbolic tensors or Numpy array-like. + # This includes Datasets and iterators in graph mode (since they + # generate symbolic tensors). + if self.run_eagerly: + return training_generator_v1.GeneratorLikeTrainingLoop() + else: + return training_arrays_v1.ArrayLikeTrainingLoop() - Returns: - A `History` object. Its `History.history` attribute is - a record of training loss values and metrics values - at successive epochs, as well as validation loss values - and validation metrics values (if applicable). - - Raises: - RuntimeError: If the model was never compiled. - ValueError: In case of mismatch between the provided input data - and what the model expects. - """ - self._assert_built_as_v1() - base_layer.keras_api_gauge.get_cell('fit').set(True) - # Legacy support - if 'nb_epoch' in kwargs: - logging.warning( - 'The `nb_epoch` argument in `fit` has been renamed `epochs`.') - epochs = kwargs.pop('nb_epoch') - if kwargs: - raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) - self._assert_compile_was_called() - self._check_call_args('fit') - - func = self._select_training_loop(x) - return func.fit( + def fit( self, - x=x, - y=y, - batch_size=batch_size, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - validation_split=validation_split, - validation_data=validation_data, - shuffle=shuffle, - class_weight=class_weight, - sample_weight=sample_weight, - initial_epoch=initial_epoch, - steps_per_epoch=steps_per_epoch, - validation_steps=validation_steps, - validation_freq=validation_freq, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - - def evaluate(self, - x=None, - y=None, - batch_size=None, - verbose=1, - sample_weight=None, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - """Returns the loss value & metrics values for the model in test mode. - - Computation is done in batches (see the `batch_size` arg.) - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - - A generator or `keras.utils.Sequence` instance. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). - If `x` is a dataset, generator or - `keras.utils.Sequence` instance, `y` should not be specified (since - targets will be obtained from the iterator/dataset). - batch_size: Integer or `None`. - Number of samples per batch of computation. 
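# A sketch of the pattern the distribution-strategy checks above expect:
# build *and* compile the model inside `strategy.scope()`, so every
# variable is created in that scope.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = tf.keras.Sequential(
        [tf.keras.layers.Dense(1, input_shape=(8,))]
    )
    model.compile("rmsprop", "mse")
# Creating layers or the optimizer outside the scope is what triggers the
# "Variable (...) was not created in the distribution strategy scope" error.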
- If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of symbolic tensors, dataset, - generators, or `keras.utils.Sequence` instances (since they generate - batches). - verbose: 0 or 1. Verbosity mode. - 0 = silent, 1 = progress bar. - sample_weight: Optional Numpy array of weights for - the test samples, used for weighting the loss function. - You can either pass a flat (1D) - Numpy array with the same length as the input samples - (1:1 mapping between weights and samples), - or in the case of temporal data, - you can pass a 2D array with shape - `(samples, sequence_length)`, - to apply a different weight to every timestep of every sample. - In this case you should make sure to specify - `sample_weight_mode="temporal"` in `compile()`. This argument is not - supported when `x` is a dataset, instead pass - sample weights as the third element of `x`. - steps: Integer or `None`. - Total number of steps (batches of samples) - before declaring the evaluation round finished. - Ignored with the default value of `None`. - If x is a `tf.data` dataset and `steps` is - None, 'evaluate' will run until the dataset is exhausted. - This argument is not supported with array inputs. - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during evaluation. - See [callbacks](/api_docs/python/tf/keras/callbacks). - max_queue_size: Integer. Used for generator or `keras.utils.Sequence` - input only. Maximum size for the generator queue. - If unspecified, `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up when using - process-based threading. If unspecified, `workers` will default - to 1. If 0, will execute the generator on the main thread. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children processes. - - Returns: - Scalar test loss (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. - - Raises: - ValueError: in case of invalid arguments. - """ - self._assert_built_as_v1() - base_layer.keras_api_gauge.get_cell('evaluate').set(True) - self._assert_compile_was_called() - self._check_call_args('evaluate') - - func = self._select_training_loop(x) - return func.evaluate( + x=None, + y=None, + batch_size=None, + epochs=1, + verbose=1, + callbacks=None, + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + **kwargs, + ): + """Trains the model for a fixed number of epochs (dataset iterations). + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. 
Should return a tuple + of either `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + - A generator or `keras.utils.Sequence` returning `(inputs, + targets)` or `(inputs, targets, sample weights)`. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset, generator, + or `keras.utils.Sequence` instance, `y` should + not be specified (since targets will be obtained from `x`). + batch_size: Integer or `None`. + Number of samples per gradient update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of symbolic tensors, datasets, + generators, or `keras.utils.Sequence` instances (since they + generate batches). + epochs: Integer. Number of epochs to train the model. + An epoch is an iteration over the entire `x` and `y` + data provided. + Note that in conjunction with `initial_epoch`, + `epochs` is to be understood as "final epoch". + The model is not trained for a number of iterations + given by `epochs`, but merely until the epoch + of index `epochs` is reached. + verbose: 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = one line per epoch. + Note that the progress bar is not particularly useful when + logged to a file, so verbose=2 is recommended when not running + interactively (eg, in a production environment). + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during training. + See `tf.keras.callbacks`. + validation_split: Float between 0 and 1. + Fraction of the training data to be used as validation data. + The model will set apart this fraction of the training data, + will not train on it, and will evaluate + the loss and any model metrics + on this data at the end of each epoch. + The validation data is selected from the last samples + in the `x` and `y` data provided, before shuffling. This + argument is not supported when `x` is a dataset, generator or + `keras.utils.Sequence` instance. + validation_data: Data on which to evaluate + the loss and any model metrics at the end of each epoch. + The model will not be trained on this data. + `validation_data` will override `validation_split`. + `validation_data` could be: + - tuple `(x_val, y_val)` of Numpy arrays or tensors + - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays + - dataset + For the first two cases, `batch_size` must be provided. + For the last case, `validation_steps` could be provided. + shuffle: Boolean (whether to shuffle the training data + before each epoch) or str (for 'batch'). + 'batch' is a special option for dealing with the + limitations of HDF5 data; it shuffles in batch-sized chunks. + Has no effect when `steps_per_epoch` is not `None`. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) value, used for weighting the loss function + (during training only). + This can be useful to tell the model to + "pay more attention" to samples from + an under-represented class. + sample_weight: Optional Numpy array of weights for + the training samples, used for weighting the loss function + (during training only). 
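# A sketch of the `tf.data` input contract above: the dataset yields
# `(inputs, targets)` tuples, so neither `y` nor `batch_size` is passed;
# the data here is synthetic.
import numpy as np
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(
    (
        np.random.rand(64, 8).astype("float32"),
        np.random.rand(64, 1).astype("float32"),
    )
).shuffle(64).batch(16)  # the dataset itself generates the batches

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
model.compile("rmsprop", "mse")
model.fit(ds, epochs=2, verbose=0)  # pass the Dataset, never `iter(ds)`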
You can either pass a flat (1D) + Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), + or in the case of temporal data, + you can pass a 2D array with shape + `(samples, sequence_length)`, + to apply a different weight to every timestep of every sample. + In this case you should make sure to specify + `sample_weight_mode="temporal"` in `compile()`. This argument is + not supported when `x` is a dataset, generator, or + `keras.utils.Sequence` instance, instead provide the + sample_weights as the third element of `x`. + initial_epoch: Integer. + Epoch at which to start training + (useful for resuming a previous training run). + steps_per_epoch: Integer or `None`. + Total number of steps (batches of samples) + before declaring one epoch finished and starting the + next epoch. When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps_per_epoch' + is None, the epoch will run until the input dataset is + exhausted. This argument is not supported with array inputs. + validation_steps: Only relevant if `validation_data` is provided and + is a `tf.data` dataset. Total number of steps (batches of + samples) to draw before stopping when performing validation at + the end of every epoch. If 'validation_steps' is None, + validation will run until the `validation_data` dataset is + exhausted. In the case of an infinite dataset, it will run into an + infinite loop. If 'validation_steps' is specified and only part + of the dataset will be consumed, the evaluation will start from + the beginning of the dataset at each epoch. This ensures that + the same validation samples are used every time. + validation_freq: Only relevant if validation data is provided. + Integer or `collections.abc.Container` instance (e.g. list, + tuple, etc.). If an integer, specifies how many training epochs + to run before a new validation run is performed, e.g. + `validation_freq=2` runs validation every 2 epochs. If a + Container, specifies the epochs on which to run validation, e.g. + `validation_freq=[1, 2, 10]` runs validation at the end of the + 1st, 2nd, and 10th epochs. + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the + generator queue. If unspecified, `max_queue_size` will default + to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up + when using process-based threading. If unspecified, `workers` + will default to 1. If 0, will execute the generator on the main + thread. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + **kwargs: Used for backwards compatibility. + + Returns: + A `History` object. Its `History.history` attribute is + a record of training loss values and metrics values + at successive epochs, as well as validation loss values + and validation metrics values (if applicable). + + Raises: + RuntimeError: If the model was never compiled.
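# A sketch of `validation_data` and `validation_freq` as documented above;
# the arrays are synthetic placeholders.
import numpy as np
import tensorflow as tf

x = np.random.rand(64, 8).astype("float32")
y = np.random.rand(64, 1).astype("float32")
x_val = np.random.rand(16, 8).astype("float32")
y_val = np.random.rand(16, 1).astype("float32")

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
model.compile("rmsprop", "mse")
model.fit(
    x, y,
    epochs=10,
    validation_data=(x_val, y_val),  # overrides any `validation_split`
    validation_freq=2,               # validate after epochs 2, 4, 6, ...
    verbose=0,
)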
+ ValueError: In case of mismatch between the provided input data + and what the model expects. + """ + self._assert_built_as_v1() + base_layer.keras_api_gauge.get_cell("fit").set(True) + # Legacy support + if "nb_epoch" in kwargs: + logging.warning( + "The `nb_epoch` argument in `fit` has been renamed `epochs`." + ) + epochs = kwargs.pop("nb_epoch") + if kwargs: + raise TypeError("Unrecognized keyword arguments: " + str(kwargs)) + self._assert_compile_was_called() + self._check_call_args("fit") + + func = self._select_training_loop(x) + return func.fit( + self, + x=x, + y=y, + batch_size=batch_size, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_split=validation_split, + validation_data=validation_data, + shuffle=shuffle, + class_weight=class_weight, + sample_weight=sample_weight, + initial_epoch=initial_epoch, + steps_per_epoch=steps_per_epoch, + validation_steps=validation_steps, + validation_freq=validation_freq, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + ) + + def evaluate( self, - x=x, - y=y, - batch_size=batch_size, - verbose=verbose, - sample_weight=sample_weight, - steps=steps, - callbacks=callbacks, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - - def predict(self, - x, - batch_size=None, - verbose=0, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - """Generates output predictions for the input samples. - - Computation is done in batches (see the `batch_size` arg.) - - Args: - x: Input samples. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A `tf.data` dataset. - - A generator or `keras.utils.Sequence` instance. - batch_size: Integer or `None`. - Number of samples per batch of computation. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of symbolic tensors, dataset, - generators, or `keras.utils.Sequence` instances (since they generate - batches). - verbose: Verbosity mode, 0 or 1. - steps: Total number of steps (batches of samples) - before declaring the prediction round finished. - Ignored with the default value of `None`. If x is a `tf.data` - dataset and `steps` is None, `predict` will - run until the input dataset is exhausted. - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during prediction. - See [callbacks](/api_docs/python/tf/keras/callbacks). - max_queue_size: Integer. Used for generator or `keras.utils.Sequence` - input only. Maximum size for the generator queue. - If unspecified, `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up when using - process-based threading. If unspecified, `workers` will default - to 1. If 0, will execute the generator on the main thread. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children processes. 
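# A sketch of the `History` return value described above: its `.history`
# dict maps loss/metric names to per-epoch lists; data is synthetic.
import numpy as np
import tensorflow as tf

x = np.random.rand(32, 8).astype("float32")
y = np.random.rand(32, 1).astype("float32")

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
model.compile("rmsprop", "mse", metrics=["mae"])
history = model.fit(x, y, epochs=3, validation_split=0.25, verbose=0)
print(sorted(history.history))  # ['loss', 'mae', 'val_loss', 'val_mae']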
- + x=None, + y=None, + batch_size=None, + verbose=1, + sample_weight=None, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Returns the loss value & metrics values for the model in test mode. + + Computation is done in batches (see the `batch_size` arg.) + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. + - A generator or `keras.utils.Sequence` instance. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). + If `x` is a dataset, generator or + `keras.utils.Sequence` instance, `y` should not be specified + (since targets will be obtained from the iterator/dataset). + batch_size: Integer or `None`. + Number of samples per batch of computation. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of symbolic tensors, dataset, + generators, or `keras.utils.Sequence` instances (since they + generate batches). + verbose: 0 or 1. Verbosity mode. + 0 = silent, 1 = progress bar. + sample_weight: Optional Numpy array of weights for + the test samples, used for weighting the loss function. + You can either pass a flat (1D) + Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), + or in the case of temporal data, + you can pass a 2D array with shape + `(samples, sequence_length)`, + to apply a different weight to every timestep of every sample. + In this case you should make sure to specify + `sample_weight_mode="temporal"` in `compile()`. This argument is + not supported when `x` is a dataset, instead pass sample weights + as the third element of `x`. + steps: Integer or `None`. + Total number of steps (batches of samples) + before declaring the evaluation round finished. + Ignored with the default value of `None`. + If x is a `tf.data` dataset and `steps` is + None, 'evaluate' will run until the dataset is exhausted. + This argument is not supported with array inputs. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during evaluation. + See [callbacks](/api_docs/python/tf/keras/callbacks). + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the + generator queue. If unspecified, `max_queue_size` will default + to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default + to 1. If 0, will execute the generator on the main thread. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). 
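# A sketch of the `evaluate` return contract above: the scalar outputs
# line up with `model.metrics_names`; data is synthetic.
import numpy as np
import tensorflow as tf

x = np.random.rand(32, 8).astype("float32")
y = np.random.rand(32, 1).astype("float32")

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
model.compile("rmsprop", "mse", metrics=["mae"])
model.fit(x, y, epochs=1, verbose=0)

results = model.evaluate(x, y, verbose=0)       # [loss, mae]
print(dict(zip(model.metrics_names, results)))  # e.g. {'loss': ..., 'mae': ...}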
The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + ValueError: in case of invalid arguments. + """ + self._assert_built_as_v1() + base_layer.keras_api_gauge.get_cell("evaluate").set(True) + self._assert_compile_was_called() + self._check_call_args("evaluate") + + func = self._select_training_loop(x) + return func.evaluate( + self, + x=x, + y=y, + batch_size=batch_size, + verbose=verbose, + sample_weight=sample_weight, + steps=steps, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + ) + + def predict( + self, + x, + batch_size=None, + verbose=0, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Generates output predictions for the input samples. + + Computation is done in batches (see the `batch_size` arg.) + + Args: + x: Input samples. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A `tf.data` dataset. + - A generator or `keras.utils.Sequence` instance. + batch_size: Integer or `None`. + Number of samples per batch of computation. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of symbolic tensors, dataset, + generators, or `keras.utils.Sequence` instances (since they + generate batches). + verbose: Verbosity mode, 0 or 1. + steps: Total number of steps (batches of samples) + before declaring the prediction round finished. + Ignored with the default value of `None`. If x is a `tf.data` + dataset and `steps` is None, `predict` will + run until the input dataset is exhausted. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during prediction. + See [callbacks](/api_docs/python/tf/keras/callbacks). + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the + generator queue. If unspecified, `max_queue_size` will default + to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default + to 1. If 0, will execute the generator on the main thread. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + + + Returns: + Numpy array(s) of predictions. + + Raises: + ValueError: In case of mismatch between the provided + input data and the model's expectations, + or in case a stateful model receives a number of samples + that is not a multiple of the batch size. 
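# A sketch of `predict` on NumPy input per the contract above: one row of
# predictions per input sample; the model and data are illustrative.
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(8,))])
preds = model.predict(
    np.random.rand(5, 8).astype("float32"), batch_size=4, verbose=0
)
print(preds.shape)  # (5, 2)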
+ """ + self._assert_built_as_v1() + base_layer.keras_api_gauge.get_cell("predict").set(True) + self._check_call_args("predict") + + func = self._select_training_loop(x) + return func.predict( + self, + x=x, + batch_size=batch_size, + verbose=verbose, + steps=steps, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + ) + + def reset_metrics(self): + """Resets the state of metrics.""" + metrics = self._get_training_eval_metrics() + for m in metrics: + m.reset_state() - Returns: - Numpy array(s) of predictions. + # Reset metrics on all the distributed (cloned) models. + if self._distribution_strategy: + distributed_training_utils_v1._reset_metrics(self) - Raises: - ValueError: In case of mismatch between the provided - input data and the model's expectations, - or in case a stateful model receives a number of samples - that is not a multiple of the batch size. - """ - self._assert_built_as_v1() - base_layer.keras_api_gauge.get_cell('predict').set(True) - self._check_call_args('predict') + def train_on_batch( + self, + x, + y=None, + sample_weight=None, + class_weight=None, + reset_metrics=True, + ): + """Runs a single gradient update on a single batch of data. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). + If `x` is a dataset, `y` should not be specified + (since targets will be obtained from the iterator). + sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. In the case + of temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. In this case you should make sure to specify + sample_weight_mode="temporal" in compile(). This argument is not + supported when `x` is a dataset. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) to apply to the model's loss for the samples + from this class during training. This can be useful to tell the + model to "pay more attention" to samples from an under-represented + class. + reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + + Returns: + Scalar training loss + (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + ValueError: In case of invalid user-provided arguments. + """ + self._assert_compile_was_called() + self._check_call_args("train_on_batch") + + # If at this point we are in the replica context, then it is okay to + # execute the Eager code path. The expected way to get here is to call + # `fit` that calls `train_on_batch` on each replica. 
+ if ( + self._distribution_strategy + and tf.distribute.in_cross_replica_context() + ): + raise NotImplementedError( + "`train_on_batch` is not supported for models " + "distributed with tf.distribute.Strategy." + ) + # Validate and standardize user data. + x, y, sample_weights = self._standardize_user_data( + x, + y, + sample_weight=sample_weight, + class_weight=class_weight, + extract_tensors_from_dataset=True, + ) + + # If `self._distribution_strategy` is True, then we are in a replica + # context at this point because of the check above. `train_on_batch` is + # being run for each replica by `self._distribution_strategy` and the + # same code path as Eager is expected to be taken. + if self.run_eagerly or self._distribution_strategy: + output_dict = training_eager_v1.train_on_batch( + self, + x, + y, + sample_weights=sample_weights, + output_loss_metrics=self._output_loss_metrics, + ) + outputs = ( + output_dict["total_loss"] + + output_dict["output_losses"] + + output_dict["metrics"] + ) + outputs = [_non_none_constant_value(v) for v in outputs] + else: + x = training_utils_v1.ModelInputs(x).as_list() + ins = x + list(y or []) + list(sample_weights or []) + + if not isinstance(backend.symbolic_learning_phase(), int): + ins += [True] # Add learning phase value. + + self._update_sample_weight_modes(sample_weights=sample_weights) + self._make_train_function() + outputs = self.train_function(ins) + + if reset_metrics: + self.reset_metrics() + + if len(outputs) == 1: + return outputs[0] + return outputs + + def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True): + """Test the model on a single batch of samples. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset `y` should + not be specified (since targets will be obtained from the + iterator). + sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. + In the case of temporal data, you can pass a 2D array + with shape (samples, sequence_length), + to apply a different weight to every timestep of every sample. + In this case you should make sure to specify + sample_weight_mode="temporal" in compile(). This argument is not + supported when `x` is a dataset. + reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + ValueError: In case of invalid user-provided arguments. + """ + self._assert_compile_was_called() + self._check_call_args("test_on_batch") + + if ( + self._distribution_strategy + and tf.distribute.in_cross_replica_context() + ): + raise NotImplementedError( + "`test_on_batch` is not supported for models " + "distributed with tf.distribute.Strategy." 
+ ) + # Validate and standardize user data. + x, y, sample_weights = self._standardize_user_data( + x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True + ) + + # If `self._distribution_strategy` is True, then we are in a replica + # context at this point. + if self.run_eagerly or self._distribution_strategy: + output_dict = training_eager_v1.test_on_batch( + self, + x, + y, + sample_weights=sample_weights, + output_loss_metrics=self._output_loss_metrics, + ) + outputs = ( + output_dict["total_loss"] + + output_dict["output_losses"] + + output_dict["metrics"] + ) + outputs = [_non_none_constant_value(v) for v in outputs] + else: + x = training_utils_v1.ModelInputs(x).as_list() + inputs = x + list(y or []) + list(sample_weights or []) + + self._update_sample_weight_modes(sample_weights=sample_weights) + self._make_test_function() + outputs = self.test_function(inputs) + + if reset_metrics: + self.reset_metrics() + + if len(outputs) == 1: + return outputs[0] + return outputs + + def predict_on_batch(self, x): + """Returns predictions for a single batch of samples. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A `tf.data` dataset. + + Returns: + Numpy array(s) of predictions. + + Raises: + ValueError: In case of mismatch between given number of inputs and + expectations of the model. + """ + self._check_call_args("predict_on_batch") + + if ( + self._distribution_strategy + and tf.distribute.in_cross_replica_context() + ): + raise NotImplementedError( + "`predict_on_batch` is not supported for models distributed " + "with tf.distribute.Strategy." + ) + # Validate and standardize user data. + inputs, _, _ = self._standardize_user_data( + x, extract_tensors_from_dataset=True + ) + # If `self._distribution_strategy` is True, then we are in a replica + # context at this point. + if self.run_eagerly or self._distribution_strategy: + inputs = training_utils_v1.cast_if_floating_dtype(inputs) + if isinstance(inputs, collections.abc.Sequence): + # Unwrap lists with only one input, as we do when training on + # batch + if len(inputs) == 1: + inputs = inputs[0] + + return self(inputs) + + self._make_predict_function() + outputs = self.predict_function(inputs) + + if len(outputs) == 1: + return outputs[0] + return outputs + + def fit_generator( + self, + generator, + steps_per_epoch=None, + epochs=1, + verbose=1, + callbacks=None, + validation_data=None, + validation_steps=None, + validation_freq=1, + class_weight=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + shuffle=True, + initial_epoch=0, + ): + """Fits the model on data yielded batch-by-batch by a Python generator. + + DEPRECATED: + `Model.fit` now supports generators, so there is no longer any need to + use this endpoint. + """ + warnings.warn( + "`model.fit_generator` is deprecated and " + "will be removed in a future version. 
" + "Please use `Model.fit`, which supports generators.", + stacklevel=2, + ) + return self.fit( + generator, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=validation_steps, + validation_freq=validation_freq, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + shuffle=shuffle, + initial_epoch=initial_epoch, + ) + + def evaluate_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Evaluates the model on a data generator. + + DEPRECATED: + `Model.evaluate` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.evaluate_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.evaluate`, which supports generators.", + stacklevel=2, + ) + self._check_call_args("evaluate_generator") + + return self.evaluate( + generator, + steps=steps, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + verbose=verbose, + callbacks=callbacks, + ) + + def predict_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Generates predictions for the input samples from a data generator. + + DEPRECATED: + `Model.predict` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.predict_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.predict`, which supports generators.", + stacklevel=2, + ) + return self.predict( + generator, + steps=steps, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + verbose=verbose, + callbacks=callbacks, + ) + + def _check_call_args(self, method_name): + """Check that `call` has only one positional arg.""" + # Always allow first arg, regardless of arg name. + fullargspec = self._call_spec.full_argspec + if fullargspec.defaults: + positional_args = fullargspec.args[: -len(fullargspec.defaults)] + else: + positional_args = fullargspec.args + if "training" in positional_args: + positional_args.remove("training") - func = self._select_training_loop(x) - return func.predict( + # self and first arg can be positional. + if len(positional_args) > 2: + extra_args = positional_args[2:] + raise ValueError( + "Models passed to `" + + method_name + + "` can only have `training` " + "and the first argument in `call` as positional arguments, " + "found: " + str(extra_args) + "." + ) + + def _set_optimizer(self, optimizer): + """Sets self.optimizer. + + Sets self.optimizer to `optimizer`, potentially wrapping it with a + LossScaleOptimizer. + + Args: + optimizer: The optimizer(s) to assign to self.optimizer. + """ + if isinstance(optimizer, (list, tuple)): + self.optimizer = [optimizers.get(opt) for opt in optimizer] + else: + self.optimizer = optimizers.get(optimizer) + + if self._dtype_policy.name == "mixed_float16" and not isinstance( + self.optimizer, loss_scale_optimizer.LossScaleOptimizer + ): + if isinstance(self.optimizer, list): + raise ValueError( + 'When the "mixed_float16" dtype policy is used, you ' + "can only pass a single optimizer. 
Using policy %s "
+                    "and got optimizers: %s"
+                    % (self._dtype_policy, self.optimizer)
+                )
+            if not isinstance(self.optimizer, optimizer_v2.OptimizerV2):
+                raise ValueError(
+                    '"optimizer" must be an instance of '
+                    "tf.keras.optimizers.legacy.Optimizer when a dtype policy "
+                    "with a loss scale is used, but got: %s. Using policy: "
+                    "%s" % (self.optimizer, self._dtype_policy)
+                )
+            self.optimizer = loss_scale_optimizer.LossScaleOptimizer(
+                self.optimizer
+            )
+
+    def _prepare_validation_data(
+        self, validation_data, batch_size, validation_steps
+    ):
+        """Unpack and check the validation data."""
+        (
+            val_x,
+            val_y,
+            val_sample_weights,
+        ) = training_utils_v1.unpack_validation_data(validation_data)
+        return self._standardize_user_data(
+            val_x,
+            val_y,
+            sample_weight=val_sample_weights,
+            batch_size=batch_size,
+            steps=validation_steps,
+            steps_name="validation_steps",
+        )
+
+    def _validate_compile_param_for_distribution_strategy(
+        self, run_eagerly, sample_weight_mode, target_tensors, weighted_metrics
+    ):
+        # Validate that arguments passed by the user to `compile` are supported
+        # by tf.distribute.Strategy.
+        if self._distribution_strategy:
+            if sample_weight_mode:
+                raise NotImplementedError(
+                    "sample_weight_mode is not supported with "
+                    "tf.distribute.Strategy."
+                )
+            if weighted_metrics:
+                raise NotImplementedError(
+                    "weighted_metrics is not supported with "
+                    "tf.distribute.Strategy."
+                )
+            if target_tensors:
+                raise ValueError(
+                    "target_tensors is not supported with "
+                    "tf.distribute.Strategy."
+                )
+
+            if run_eagerly:
+                raise ValueError(
+                    "We currently do not support enabling `run_eagerly` with "
+                    "distribution strategy."
+                )
+
+            if distributed_training_utils_v1.is_distributing_by_cloning(
+                self
+            ) and (not self.built or not self.inputs or not self.outputs):
+                raise ValueError(
+                    "We currently do not support distribution strategy with a "
+                    "`Sequential` model that is created without `input_shape`/"
+                    "`input_dim` set in its first layer or a subclassed model."
+                )
+
+    def _process_target_tensor_for_compile(self, target_tensors):
+        if self.run_eagerly:
+            # Target tensors are not supported with run_eagerly. Create a list
+            # with None as placeholder for each output.
+            return [None for _ in self.output_names]
+
+        if target_tensors is not None and not (
+            isinstance(target_tensors, list) and target_tensors == []
+        ):
+            if isinstance(target_tensors, list):
+                if len(target_tensors) != len(self.outputs):
+                    raise ValueError(
+                        "When passing a list as `target_tensors`, "
+                        "it should have one entry per model output. "
+                        "The model has %s outputs, "
+                        "but you passed target_tensors=%s"
+                        % (len(self.outputs), target_tensors)
+                    )
+            elif isinstance(target_tensors, dict):
+                unexpected_target_tensor_names = set(
+                    target_tensors.keys()
+                ).difference(self.output_names)
+                if unexpected_target_tensor_names:
+                    raise ValueError(
+                        "Unknown entry in `target_tensors` dictionary: "
+                        '"{name}". 
' + "Only expected the following keys: {keys}".format( + name=unexpected_target_tensor_names, + keys=str(self.output_names), + ) + ) + tmp_target_tensors = [] + for name in self.output_names: + tmp_target_tensors.append(target_tensors.get(name, None)) + target_tensors = tmp_target_tensors + elif tf.is_tensor(target_tensors): + target_tensors = [target_tensors] + else: + raise TypeError( + "Expected `target_tensors` to be a list or tuple or " + "dict or a single tensor, but got:", + target_tensors, + ) + else: + # In case target tensor is empty or None, create a list with Nones + # that has same length as self.output_names. With that, the None + # check of target tensor can be skipped downstream. + target_tensors = [None for _ in self.output_names] + return target_tensors + + def _compile_eagerly(self, metrics, weighted_metrics, sample_weight_mode): + # Prepare sample weight modes. List with the same length as model + # outputs. + training_utils_v1.prepare_sample_weight_modes( + self._training_endpoints, sample_weight_mode + ) + # Prepare sample weights. + self._prepare_sample_weights() + # Save all metric attributes per output of the model. + self._cache_output_metric_attributes(metrics, weighted_metrics) + self.total_loss = None + # Set metric attributes on model. + self._set_metric_attributes() + + self._collected_trainable_weights = self.trainable_weights + + def _update_sample_weight_modes(self, sample_weights=None): + """Updates sample weight modes based on training/eval inputs. + + Sample weight placeholders will be created for all or no outputs + based on whether sample_weight is provided for any output. + + If model contains `_sample_weight_modes` we check if the input + `sample_weights` corresponds to the sample weight modes. + 1. Set sample weight mode to be 'temporal' for output i, if `compile` + sample_weight_mode was set to `temporal` and sample weight inputs + are given for one or more outputs. + 2. Set sample weight mode to be 'samplewise' for output i, if + `compile` sample_weight_mode was not set and sample weight inputs + are given for one or more outputs. + 3. Reset sample weight mode to None for output i if sample weight mode + was set but there is no sample weight input. + + Args: + sample_weights: List of sample weights of the same length as model + outputs or None. + """ + if not self._is_compiled: + return + if sample_weights and any(s is not None for s in sample_weights): + for endpoint in self._training_endpoints: + endpoint.sample_weight_mode = ( + endpoint.sample_weight_mode or "samplewise" + ) + else: + for endpoint in self._training_endpoints: + endpoint.sample_weight_mode = None + + def _recompile_weights_loss_and_weighted_metrics(self): + if not self._is_compiled: + return False + recompile = any( + e.sample_weights_mismatch() for e in self._training_endpoints + ) + + if recompile: + self._compile_weights_loss_and_weighted_metrics() + return recompile + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _compile_weights_loss_and_weighted_metrics(self, sample_weights=None): + """Compiles the model loss and weighted metric sub-graphs. + + This may be used to set graph tensors as sample weights (instead of + creating placeholders). This functionality is necessary for + `tf.keras.estimator.model_to_estimator`, which calls Keras models in a + v1 graph, and creates iterator tensors for inputs, targets, and sample + weights. + + Args: + sample_weights: List of tensors to use as the sample weights. Must be + the same length as the number of outputs. 
If left as `None`,
+                placeholders are used instead.
+        """
+        with backend.get_graph().as_default():
+            if sample_weights is not None:
+                self._update_sample_weight_modes(sample_weights)
+            self._prepare_sample_weights(sample_weights)
+
+            masks = self._prepare_output_masks()
+
+            # Compute weighted metrics.
+            self._handle_metrics(
+                self.outputs,
+                targets=self._targets,
+                skip_target_masks=self._prepare_skip_target_masks(),
+                sample_weights=self.sample_weights,
+                masks=masks,
+                return_weighted_metrics=True,
+            )
+
+            # Compute total loss.
+            # Used to keep track of the total loss value (stateless).
+            # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+            #                   loss_weight_2 * output_2_loss_fn(...) +
+            #                   layer losses.
+            self.total_loss = self._prepare_total_loss(masks)
+
+    def _prepare_skip_target_masks(self):
+        """Boolean mask for whether target in output list should be skipped.
+
+        If the loss function corresponding to a model output is None, then this
+        output will be skipped during total loss calculation and feed targets
+        preparation.
+
+        Returns:
+            A boolean list for whether the corresponding target in the output list
+            should be skipped during loss calculation.
+        """
+        return [l is None for l in self.loss_functions]
+
+    def _prepare_output_masks(self):
+        """Returns masks corresponding to model outputs."""
+        return [getattr(x, "_keras_mask", None) for x in self.outputs]
+
+    def _prepare_total_loss(self, masks):
+        """Computes total loss from loss functions.
+
+        Args:
+            masks: List of mask values corresponding to each model output.
+
+        Returns:
+            A scalar tensor with the model's total loss.
+
+        Raises:
+            TypeError: If model run_eagerly is True.
+        """
+        if self.run_eagerly:
+            raise TypeError(
+                "total loss cannot be computed when compiled with "
+                "run_eagerly = True."
+            )
+        loss_list = []
+        with backend.name_scope("loss"):
+            for endpoint, mask in zip(self._training_endpoints, masks):
+                if endpoint.should_skip_target():
+                    continue
+                y_true = endpoint.training_target.target
+                y_pred = endpoint.output
+                loss_fn = endpoint.loss_fn
+                loss_weight = endpoint.loss_weight
+                loss_name = endpoint.loss_name()
+                sample_weight = endpoint.sample_weight
+
+                with backend.name_scope(loss_name):
+                    if mask is not None:
+                        mask = tf.cast(mask, y_pred.dtype)
+                        # Update weights with mask.
+                        if sample_weight is None:
+                            sample_weight = mask
+                        else:
+                            # Update dimensions of weights to match with mask if
+                            # possible.
+                            (
+                                mask,
+                                _,
+                                sample_weight,
+                            ) = losses_utils.squeeze_or_expand_dimensions(
+                                mask, sample_weight=sample_weight
+                            )
+
+                    if hasattr(loss_fn, "reduction"):
+                        per_sample_losses = loss_fn.call(y_true, y_pred)
+                        sample_weight = losses_utils.apply_valid_mask(
+                            per_sample_losses,
+                            sample_weight,
+                            mask,
+                            loss_fn.reduction,
+                        )
+                        weighted_losses = losses_utils.compute_weighted_loss(
+                            per_sample_losses,
+                            sample_weight=sample_weight,
+                            reduction=losses_utils.ReductionV2.NONE,
+                        )
+                        loss_reduction = loss_fn.reduction
+
+                        # `AUTO` loss reduction defaults to
+                        # `SUM_OVER_BATCH_SIZE` for all compile use cases.
+                        if loss_reduction == losses_utils.ReductionV2.AUTO:
+                            loss_reduction = (
+                                losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
+                            )
+
+                        # Compute the stateless loss value.
+                        output_loss = losses_utils.reduce_weighted_loss(
+                            weighted_losses, reduction=loss_reduction
+                        )
+                    else:
+                        # Compute the stateless loss value for a custom loss
+                        # class. 
Here we assume that the class takes care of + # loss reduction because if this class returns a vector + # value we cannot differentiate between use case where a + # custom optimizer expects a vector loss value vs + # unreduced per-sample loss value. + output_loss = loss_fn( + y_true, y_pred, sample_weight=sample_weight + ) + loss_reduction = ( + losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE + ) + + if len(self.outputs) > 1: + # Keep track of stateful result tensor for the loss. + endpoint.output_loss_metric(output_loss) + + # Scale output loss for distribution. For custom losses we + # assume reduction was mean. + if ( + loss_reduction + == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE + ): + output_loss = losses_utils.scale_loss_for_distribution( + output_loss + ) + + loss_list.append(loss_weight * output_loss) + if not loss_list and not self.losses: + raise ValueError( + "The model cannot be compiled " + "because it has no loss to optimize." + ) + + # Add regularization penalties and other layer-specific losses. + custom_losses = self.get_losses_for(None) + self.get_losses_for( + self.inputs + ) + if custom_losses: + total_custom_loss = tf.add_n( + losses_utils.cast_losses_to_common_dtype(custom_losses) + ) + loss_list.append( + losses_utils.scale_loss_for_distribution(total_custom_loss) + ) + + loss_list = losses_utils.cast_losses_to_common_dtype(loss_list) + if loss_list: + total_loss = tf.add_n(loss_list) + else: + total_loss = 0.0 + return total_loss + + def _get_callback_model(self): + """Returns the Callback Model for this Model.""" + + if hasattr(self, "_replicated_model") and self._replicated_model: + # When using training_distributed, we set the callback model + # to an instance of the `DistributedModel` that we create in + # the `compile` call. The `DistributedModel` is initialized + # with the first replicated model. We need to set the callback + # model to a DistributedModel to allow us to override saving + # and loading weights when we checkpoint the model during training. + return self._replicated_model + if hasattr(self, "callback_model") and self.callback_model: + return self.callback_model + return self + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _make_callback_model(self, grouped_model): + first_replicated_model = self._distribution_strategy.unwrap( + grouped_model + )[0] + # We initialize the callback model with the first replicated model. + self._replicated_model = DistributedCallbackModel( + first_replicated_model + ) + self._replicated_model.set_original_model(self) + + def _validate_or_infer_batch_size(self, batch_size, steps, x): + """Validates that `batch_size` provided is consistent with InputLayer. + + It's possible that the user specified a static batch size in their + InputLayer. If so, this method checks the provided `batch_size` and `x` + arguments are consistent with this static batch size. Also, if + `batch_size` is `None`, this method will attempt to infer the batch size + from the static batch size of the InputLayer. Lastly, ValueError will be + raised if `x` is a tf.data.Dataset and `batch_size` is specified as we + expect users to provide batched datasets. + + Args: + batch_size: The batch_size provided as an argument to + fit/evaluate/predict. + steps: The steps provided as an argument to fit/evaluate/predict. + x: The data passed as `x` to fit/evaluate/predict. + + Returns: + The validated batch_size, auto-inferred from the first layer if not + provided. 
+        """
+        if isinstance(
+            x, (tf.compat.v1.data.Dataset, tf.data.Dataset, data_utils.Sequence)
+        ) or tf_inspect.isgenerator(x):
+            if batch_size is not None:
+                raise ValueError(
+                    "The `batch_size` argument must not be specified for the "
+                    "given input type. Received input: "
+                    "{}, batch_size: {}".format(x, batch_size)
+                )
+            return
+
+        # Avoids the override in Sequential.layers which filters Input layers.
+        # (Which are often the very layers that we're after.)
+        layers = self._flatten_layers(include_self=False, recursive=False)
+        first_layer = next(layers, None)
+        if first_layer:
+            # The per-replica static batch size.
+            static_batch_size = training_utils.get_static_batch_size(
+                first_layer
+            )
+            if static_batch_size is not None:
+
+                # Determine number of times the user-supplied batch size will be
+                # split.
+                if (
+                    self._distribution_strategy
+                    and distributed_training_utils.global_batch_size_supported(
+                        self._distribution_strategy
+                    )
+                ):
+                    num_splits_for_ds = (
+                        self._distribution_strategy.num_replicas_in_sync
+                    )
+                else:
+                    num_splits_for_ds = 1
+
+                # Check `batch_size` argument is consistent with InputLayer.
+                if batch_size is not None:
+                    if batch_size % num_splits_for_ds != 0:
+                        raise ValueError(
+                            "The `batch_size` argument ({}) must be divisible "
+                            "by the number of replicas ({})".format(
+                                batch_size, num_splits_for_ds
+                            )
+                        )
+                    per_replica_batch_size = batch_size // num_splits_for_ds
+
+                    if per_replica_batch_size != static_batch_size:
+                        raise ValueError(
+                            "The `batch_size` argument value {} is "
+                            "incompatible with the specified batch size of "
+                            "your Input Layer: {}".format(
+                                per_replica_batch_size, static_batch_size
+                            )
+                        )
+
+                # Check Dataset/Iterator batch size is consistent with
+                # InputLayer.
+                if isinstance(
+                    x,
+                    (
+                        tf.data.Dataset,
+                        tf.compat.v1.data.Iterator,
+                        tf.data.Iterator,
+                    ),
+                ):
+                    ds_batch_size = tf.compat.v1.Dimension(
+                        tf.nest.flatten(tf.compat.v1.data.get_output_shapes(x))[
+                            0
+                        ][0]
+                    ).value
+                    if ds_batch_size is not None:
+                        if ds_batch_size % num_splits_for_ds != 0:
+                            raise ValueError(
+                                "The batch output shape of your `Dataset` {} "
+                                "is not divisible by the number of "
+                                "replicas {}".format(
+                                    ds_batch_size, num_splits_for_ds
+                                )
+                            )
+
+                        ds_per_replica_batch_size = (
+                            ds_batch_size // num_splits_for_ds
+                        )
+                        if ds_per_replica_batch_size != static_batch_size:
+                            raise ValueError(
+                                "The batch output shape of your `Dataset` is "
+                                "{}, which is incompatible with the specified "
+                                "batch size of your Input Layer: {}".format(
+                                    ds_per_replica_batch_size, static_batch_size
+                                )
+                            )
+
+                # Set inferred batch size from the InputLayer.
+                if steps is None:
+                    batch_size = static_batch_size * num_splits_for_ds
+
+        if batch_size is None and steps is None:
+            # Backwards compatibility
+            batch_size = 32
+        return batch_size
+
+    def _prepare_sample_weights(self, sample_weights=None):
+        """Sets sample weight attribute on the model."""
+        # List with the same length as model outputs.
+        if sample_weights is not None:
+            if len(sample_weights) != len(self._training_endpoints):
+                raise ValueError(
+                    "Provided sample weights must have the same length as the "
+                    "number of outputs. 
Expected: {}, got: {}.".format( + len(self._training_endpoints), len(sample_weights) + ) + ) + else: + sample_weights = [None] * len(self._training_endpoints) + for endpoint, weight in zip(self._training_endpoints, sample_weights): + endpoint.populate_sample_weight(weight, endpoint.sample_weight_mode) + + def _cache_output_metric_attributes(self, metrics, weighted_metrics): + """Caches metric name and function attributes for every model output.""" + output_shapes = [] + for output in self.outputs: + if output is None or output.shape.rank is None: + output_shapes.append(None) + else: + output_shapes.append(output.shape.as_list()) + self._per_output_metrics = ( + training_utils_v1.collect_per_output_metric_info( + metrics, + self.output_names, + output_shapes, + self.loss_functions, + from_serialized=self._from_serialized, + ) + ) + self._per_output_weighted_metrics = ( + training_utils_v1.collect_per_output_metric_info( + weighted_metrics, + self.output_names, + output_shapes, + self.loss_functions, + from_serialized=self._from_serialized, + is_weighted=True, + ) + ) + + def _add_unique_metric_name(self, metric_name, metric_fn, output_index): + """Makes the metric name unique. + + If there are multiple outputs for which the metrics are calculated, + the metric names have to be made unique by appending an integer. + + Args: + metric_name: Metric name that corresponds to the metric specified by + the user. For example: 'acc'. + metric_fn: The Metric object. + output_index: The index of the model output for which the metric name + is being added. + + Returns: + string, name of the model's unique metric name + """ + # For multi-output models, prepend the output names to the metric name. + if len(self.output_names) > 1: + # If we're loading from an already-serialized model, we've already + # prepended the output name, and we don't want to do it again. + # + # Alternatively, we may be receiving a stateless metric (e.g. the + # string "accuracy") rather than a `Metric` object, in which case we + # want to prepend the output name even if we are loading a + # serialized model. + if not getattr(metric_fn, "_from_serialized", False): + metric_name = f"{self.output_names[output_index]}_{metric_name}" + + j = 1 + base_metric_name = metric_name + while metric_name in self.metrics_names: + metric_name = "%s_%d" % (base_metric_name, j) + j += 1 + + return metric_name + + def _init_metric_attributes(self): + """Initialized model metric attributes.""" + # List of stateful metric functions. Used for resetting metric state + # during training/eval. + self._compile_metric_functions = [] + + def _set_per_output_metric_attributes(self, metrics_dict, output_index): + """Sets the metric attributes on the model for the given output. + + Args: + metrics_dict: A dict with metric names as keys and metric fns as + values. + output_index: The index of the model output for which the metric + attributes are added. + + Returns: + Metrics dict updated with unique metric names as keys. + """ + updated_metrics_dict = collections.OrderedDict() + for metric_name, metric_fn in metrics_dict.items(): + metric_name = self._add_unique_metric_name( + metric_name, metric_fn, output_index + ) + + # Update the name on the metric class to be the unique generated + # name. + metric_fn._name = metric_name + updated_metrics_dict[metric_name] = metric_fn + # Keep track of metric name and function. 
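The naming scheme documented in `_add_unique_metric_name` above (prepend the output name for multi-output models, then suffix an integer until unique) can be summarized with a standalone sketch; `unique_metric_name` is a hypothetical helper, not the actual method:

    def unique_metric_name(metric_name, output_name, existing_names):
        # Multi-output models prepend the output name first...
        name = f"{output_name}_{metric_name}"
        base, j = name, 1
        # ...then append an integer until the name is unused.
        while name in existing_names:
            name = "%s_%d" % (base, j)
            j += 1
        return name

    print(unique_metric_name("acc", "dense", {"dense_acc"}))  # dense_acc_1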
+ self._compile_metric_functions.append(metric_fn) + return updated_metrics_dict + + def _set_metric_attributes(self): + """Sets the metric attributes on the model for all the model outputs.""" + updated_per_output_metrics = [] + updated_per_output_weighted_metrics = [] + for i, endpoint in enumerate(self._training_endpoints): + if endpoint.should_skip_target(): + updated_per_output_metrics.append(self._per_output_metrics[i]) + updated_per_output_weighted_metrics.append( + self._per_output_weighted_metrics[i] + ) + continue + updated_per_output_metrics.append( + self._set_per_output_metric_attributes( + self._per_output_metrics[i], i + ) + ) + updated_per_output_weighted_metrics.append( + self._set_per_output_metric_attributes( + self._per_output_weighted_metrics[i], i + ) + ) + + # Create a metric wrapper for each output loss. This computes mean of an + # output loss across mini-batches (irrespective of how we reduce within + # a batch). + if len(self._training_endpoints) > 1: + for endpoint in self._training_endpoints: + if not endpoint.should_skip_target(): + endpoint.output_loss_metric = metrics_module.Mean( + name=endpoint.loss_name() + ) + + self._per_output_metrics = updated_per_output_metrics + self._per_output_weighted_metrics = updated_per_output_weighted_metrics + + def _handle_per_output_metrics( + self, metrics_dict, y_true, y_pred, mask, weights=None + ): + """Calls metric functions for a single output. + + Args: + metrics_dict: A dict with metric names as keys and metric fns as + values. + y_true: Target output. + y_pred: Predicted output. + mask: Computed mask value for the current output. + weights: Weights to be applied on the current output. + + Returns: + A list of metric result tensors. + """ + metric_results = [] + for metric_name, metric_fn in metrics_dict.items(): + with backend.name_scope(metric_name): + metric_result = training_utils_v1.call_metric_function( + metric_fn, y_true, y_pred, weights=weights, mask=mask + ) + metric_results.append(metric_result) + return metric_results + + def _handle_metrics( + self, + outputs, + targets=None, + skip_target_masks=None, + sample_weights=None, + masks=None, + return_weighted_metrics=False, + return_weighted_and_unweighted_metrics=False, + ): + """Handles calling metric functions. + + Args: + outputs: List of outputs (predictions). + targets: List of targets. + skip_target_masks: Optional. List of boolean for whether the + corresponding target should be ignored or not. + sample_weights: Optional list of sample weight arrays. + masks: List of computed output mask values. + return_weighted_metrics: Flag that indicates whether weighted metrics + should be computed instead of unweighted metrics. This flag is + ignored when `return_weighted_and_unweighted_metrics` is enabled. + return_weighted_and_unweighted_metrics: Flag that is used to indicate + whether both weighted and unweighted metrics should be computed. + When this is not enabled, we use `return_weighted_metrics` param to + indicate whether weighted or unweighted metrics should be returned. + + Returns: + A list of metric result tensors. + """ + # TODO(scottzhu): Update this to use the new training_endpoints. + # Currently the eager and graph logic is bit different. + skip_target_masks = skip_target_masks or [False] * len(outputs) + metric_results = [] + with backend.name_scope("metrics"): + # Invoke all metrics added using `compile`. 
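The weighted/unweighted split that `_handle_metrics` manages above corresponds to the `metrics` and `weighted_metrics` arguments of `compile`; a hypothetical call wiring up both lists:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
    model.compile(
        optimizer="sgd",
        loss="mse",
        metrics=["mae"],           # unweighted, handled per output
        weighted_metrics=["mse"],  # weighted by sample_weight and masks
    )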
+ for i in range(len(outputs)): + if skip_target_masks[i]: + continue + output = outputs[i] if outputs else None + target = targets[i] if targets else None + output_mask = masks[i] if masks else None + + if ( + return_weighted_and_unweighted_metrics + or not return_weighted_metrics + ): + metric_results.extend( + self._handle_per_output_metrics( + self._per_output_metrics[i], + target, + output, + output_mask, + ) + ) + if ( + return_weighted_and_unweighted_metrics + or return_weighted_metrics + ): + metric_results.extend( + self._handle_per_output_metrics( + self._per_output_weighted_metrics[i], + target, + output, + output_mask, + weights=sample_weights[i] + if sample_weights + else None, + ) + ) + return metric_results + + def _check_trainable_weights_consistency(self): + """Check trainable weights count consistency. + + This will raise a warning if `trainable_weights` and + `_collected_trainable_weights` are inconsistent (i.e. have different + number of parameters). + Inconsistency will typically arise when one modifies `model.trainable` + without calling `model.compile` again. + """ + if not hasattr(self, "_collected_trainable_weights"): + return + + if len(self.trainable_weights) != len( + self._collected_trainable_weights + ): + logging.log_first_n( + logging.WARN, + "Discrepancy between trainable weights and collected" + " trainable weights, did you set `model.trainable`" + " without calling `model.compile` after ?", + 1, + ) + + def _make_train_function(self): + has_recompiled = self._recompile_weights_loss_and_weighted_metrics() + self._check_trainable_weights_consistency() + if isinstance(self.optimizer, list): + raise ValueError( + "The `optimizer` in `compile` should be a single optimizer." + ) + # If we have re-compiled the loss/weighted metric sub-graphs then create + # train function even if one exists already. This is because + # `_feed_sample_weights` list has been updated on re-compile. + if getattr(self, "train_function", None) is None or has_recompiled: + # Restore the compiled trainable state. + current_trainable_state = self._get_trainable_state() + self._set_trainable_state(self._compiled_trainable_state) + + inputs = ( + self._feed_inputs + + self._feed_targets + + self._feed_sample_weights + ) + if not isinstance(backend.symbolic_learning_phase(), int): + inputs += [backend.symbolic_learning_phase()] + + with backend.get_graph().as_default(): + with backend.name_scope("training"): + # Training updates + updates = self.optimizer.get_updates( + params=self._collected_trainable_weights, + loss=self.total_loss, + ) + # Unconditional updates + updates += self.get_updates_for(None) + # Conditional updates relevant to this model + updates += self.get_updates_for(self.inputs) + + metrics = self._get_training_eval_metrics() + metrics_tensors = [ + m._call_result + for m in metrics + if hasattr(m, "_call_result") + ] + + with backend.name_scope("training"): + # Gets loss and metrics. Updates weights at each call. + fn = backend.function( + inputs, + [self.total_loss] + metrics_tensors, + updates=updates, + name="train_function", + **self._function_kwargs, + ) + setattr(self, "train_function", fn) + + # Restore the current trainable state + self._set_trainable_state(current_trainable_state) + + def _make_test_function(self): + has_recompiled = self._recompile_weights_loss_and_weighted_metrics() + # If we have re-compiled the loss/weighted metric sub-graphs then create + # test function even if one exists already. 
This is because + # `_feed_sample_weights` list has been updated on re-compile. + if getattr(self, "test_function", None) is None or has_recompiled: + inputs = ( + self._feed_inputs + + self._feed_targets + + self._feed_sample_weights + ) + + with backend.get_graph().as_default(): + metrics = self._get_training_eval_metrics() + metrics_tensors = [ + m._call_result + for m in metrics + if hasattr(m, "_call_result") + ] + + with backend.name_scope("evaluation"): + updates = self.state_updates + # Return loss and metrics, no gradient updates. + # Does update the network states. + fn = backend.function( + inputs, + [self.total_loss] + metrics_tensors, + updates=updates, + name="test_function", + **self._function_kwargs, + ) + setattr(self, "test_function", fn) + + def _make_predict_function(self): + if not hasattr(self, "predict_function"): + self.predict_function = None + if self.predict_function is None: + inputs = self._feed_inputs + # Gets network outputs. Does not update weights. + # Does update the network states. + kwargs = getattr(self, "_function_kwargs", {}) + with backend.name_scope(ModeKeys.PREDICT): + self.predict_function = backend.function( + inputs, + self.outputs, + updates=self.state_updates, + name="predict_function", + **kwargs, + ) + + def _make_execution_function(self, mode): + if mode == ModeKeys.TRAIN: + self._make_train_function() + return self.train_function + if mode == ModeKeys.TEST: + self._make_test_function() + return self.test_function + if mode == ModeKeys.PREDICT: + self._make_predict_function() + return self.predict_function + + def _distribution_standardize_user_data( self, - x=x, - batch_size=batch_size, - verbose=verbose, - steps=steps, - callbacks=callbacks, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - - def reset_metrics(self): - """Resets the state of metrics.""" - metrics = self._get_training_eval_metrics() - for m in metrics: - m.reset_state() - - # Reset metrics on all the distributed (cloned) models. - if self._distribution_strategy: - distributed_training_utils_v1._reset_metrics(self) # pylint: disable=protected-access - - def train_on_batch(self, - x, - y=None, - sample_weight=None, - class_weight=None, - reset_metrics=True): - """Runs a single gradient update on a single batch of data. + x, + y=None, + sample_weight=None, + class_weight=None, + batch_size=None, + validation_split=0.0, + shuffle=False, + epochs=1, + allow_partial_batch=False, + ): + """Runs validation checks on input and target data passed by the user. + + This is called when using tf.distribute.Strategy to train, evaluate or + serve the model. + + Args: + x: Input data. A numpy array or `tf.data` dataset. + y: Target data. A numpy array or None if x is a `tf.data` dataset. + sample_weight: An optional sample-weight array passed by the user to + weight the importance of each sample in `x`. + class_weight: An optional class-weight array by the user to + weight the importance of samples in `x` based on the class they + belong to, as conveyed by `y`. + batch_size: Integer batch size. If provided, it is used to run + additional validation checks on stateful models. + validation_split: Float between 0 and 1. + Fraction of the training data to be used as validation data. + shuffle: Boolean whether to shuffle the training data before each + epoch. + epochs: Integer epochs. If > 1, repeat the numpy training data epochs + times when converting to training dataset. 
+ allow_partial_batch: Boolean whether to enforce that all batches have + the same size. + + Returns: + Dataset instance. + + Raises: + ValueError: In case of invalid user-provided data. + RuntimeError: If the model was never compiled. + """ + if class_weight: + raise NotImplementedError( + "`class_weight` is currently not supported " + "when using tf.distribute.Strategy." + ) + + if ( + sample_weight is not None + and sample_weight.all() + and backend.is_tpu_strategy(self._distribution_strategy) + ): + raise NotImplementedError( + "`sample_weight` is currently not supported " + "when using TPUStrategy." + ) + + # Validates `steps` and `shuffle` arguments right at the beginning + # since we use it to construct the dataset object. + # TODO(anjalisridhar): Remove this check once we refactor the + # _standardize_user_data code path. This check is already present + # elsewhere in the codebase. + if isinstance(x, tf.data.Dataset): + if shuffle: + training_utils_v1.verify_dataset_shuffled(x) + + strategy = self._distribution_strategy + with strategy.scope(): + # We should be sure to call get_session() inside the + # strategy.scope() so the strategy can affect the session options. + if tf.compat.v1.executing_eagerly_outside_functions(): + session = None + else: + session = backend.get_session() + + first_x_value = tf.nest.flatten(x)[0] + if isinstance(first_x_value, np.ndarray): + x = training_utils.list_to_tuple(x) + if y is not None: + y = training_utils.list_to_tuple(y) + if sample_weight is not None: + sample_weight = training_utils.list_to_tuple( + sample_weight + ) + in_tuple = (x, y, sample_weight) + else: + in_tuple = (x, y) + else: + in_tuple = x + + ds = strategy.extended.experimental_make_numpy_dataset( + in_tuple, session=session + ) + if shuffle: + # We want a buffer size that is larger than the batch size + # provided by the user and provides sufficient randomness. + # Note that larger numbers introduce more memory usage based + # on the size of each sample. + ds = ds.shuffle(max(1024, batch_size * 8)) + if epochs > 1: + ds = ds.repeat(epochs) + + # We need to use the drop_remainder argument to get a known + # static input shape which is required for TPUs. + drop_remainder = ( + not allow_partial_batch + and strategy.extended.experimental_require_static_shapes + ) + + # TODO(b/131720208): We still drop remainder here if number of + # examples is divisible by batch size, as sometimes dynamic + # padder will time out with keras.metrics.CategoricalAccuracy() + # metric. + if backend.is_tpu_strategy(strategy) and not drop_remainder: + dataset_size = first_x_value.shape[0] + if dataset_size % batch_size == 0: + drop_remainder = True + + x = ds.batch(batch_size, drop_remainder=drop_remainder) + else: + assert isinstance(x, tf.data.Dataset) + training_utils_v1.validate_dataset_input( + x, y, sample_weight, validation_split + ) + return x - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays + def _standardize_user_data( + self, + x, + y=None, + sample_weight=None, + class_weight=None, + batch_size=None, + check_steps=False, + steps_name="steps", + steps=None, + validation_split=0.0, + shuffle=False, + extract_tensors_from_dataset=False, + ): + """Runs validation checks on input and target data passed by the user. + + Also standardizes the data to lists of arrays, in order. + + Also builds and compiles the model on the fly if it is a subclassed + model that has never been called before (and thus has no + inputs/outputs). 
+ + This is a purely internal method, subject to refactoring at any time. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors + - A TensorFlow tensor, or a list of tensors (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, + - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset. - y: Target data. Like the input data `x`, it could be either Numpy - array(s) or TensorFlow tensor(s). It should be consistent with `x` - (you cannot have Numpy inputs and tensor targets, or inversely). If - `x` is a dataset, `y` should not be specified - (since targets will be obtained from the iterator). - sample_weight: Optional array of the same length as x, containing - weights to apply to the model's loss for each sample. In the case of - temporal data, you can pass a 2D array with shape (samples, - sequence_length), to apply a different weight to every timestep of - every sample. In this case you should make sure to specify - sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset. - class_weight: Optional dictionary mapping class indices (integers) to a - weight (float) to apply to the model's loss for the samples from this - class during training. This can be useful to tell the model to "pay - more attention" to samples from an under-represented class. - reset_metrics: If `True`, the metrics returned will be only for this - batch. If `False`, the metrics will be statefully accumulated across - batches. - - Returns: - Scalar training loss - (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. - - Raises: - ValueError: In case of invalid user-provided arguments. - """ - self._assert_compile_was_called() - self._check_call_args('train_on_batch') - - # If at this point we are in the replica context, then it is okay to execute - # the Eager code path. The expected way to get here is to call `fit` that - # calls `train_on_batch` on each replica. - if (self._distribution_strategy and - tf.distribute.in_cross_replica_context()): - raise NotImplementedError('`train_on_batch` is not supported for models ' - 'distributed with tf.distribute.Strategy.') - # Validate and standardize user data. - x, y, sample_weights = self._standardize_user_data( - x, y, sample_weight=sample_weight, class_weight=class_weight, - extract_tensors_from_dataset=True) - - # If `self._distribution_strategy` is True, then we are in a replica context - # at this point because of the check above. `train_on_batch` is being run - # for each replica by `self._distribution_strategy` and the same code path - # as Eager is expected to be taken. 
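As the comment above notes, `run_eagerly` and the replica context share the eager code path; compiling with `run_eagerly=True` is a hypothetical way to exercise that branch directly:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(2,))])
    model.compile("sgd", "mse", run_eagerly=True)  # forces the eager branch
    model.train_on_batch(tf.zeros([4, 2]), tf.zeros([4, 1]))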
- if self.run_eagerly or self._distribution_strategy: - output_dict = training_eager_v1.train_on_batch( - self, - x, - y, - sample_weights=sample_weights, - output_loss_metrics=self._output_loss_metrics) - outputs = (output_dict['total_loss'] + output_dict['output_losses'] - + output_dict['metrics']) - outputs = [_non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - else: - x = training_utils_v1.ModelInputs(x).as_list() - ins = x + list(y or []) + list(sample_weights or []) - - if not isinstance(backend.symbolic_learning_phase(), int): - ins += [True] # Add learning phase value. - - self._update_sample_weight_modes(sample_weights=sample_weights) - self._make_train_function() - outputs = self.train_function(ins) # pylint: disable=not-callable - - if reset_metrics: - self.reset_metrics() - - if len(outputs) == 1: - return outputs[0] - return outputs - - def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True): - """Test the model on a single batch of samples. - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset `y` should - not be specified (since targets will be obtained from the iterator). - sample_weight: Optional array of the same length as x, containing - weights to apply to the model's loss for each sample. - In the case of temporal data, you can pass a 2D array - with shape (samples, sequence_length), - to apply a different weight to every timestep of every sample. - In this case you should make sure to specify - sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset. - reset_metrics: If `True`, the metrics returned will be only for this - batch. If `False`, the metrics will be statefully accumulated across - batches. - - Returns: - Scalar test loss (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. - - Raises: - ValueError: In case of invalid user-provided arguments. - """ - self._assert_compile_was_called() - self._check_call_args('test_on_batch') - - if (self._distribution_strategy and - tf.distribute.in_cross_replica_context()): - raise NotImplementedError('`test_on_batch` is not supported for models ' - 'distributed with tf.distribute.Strategy.') - # Validate and standardize user data. - x, y, sample_weights = self._standardize_user_data( - x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True) - - # If `self._distribution_strategy` is True, then we are in a replica context - # at this point. 
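For the distribution-strategy branch referenced here, a model typically picks up its strategy by being built under the strategy's scope (a hypothetical setup; in this v1 code path, `fit` then drives the `*_on_batch` methods once per replica):

    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        dist_model = tf.keras.Sequential(
            [tf.keras.layers.Dense(1, input_shape=(2,))]
        )
        dist_model.compile("sgd", "mse")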
- if self.run_eagerly or self._distribution_strategy: - output_dict = training_eager_v1.test_on_batch( - self, - x, - y, - sample_weights=sample_weights, - output_loss_metrics=self._output_loss_metrics) - outputs = (output_dict['total_loss'] + output_dict['output_losses'] - + output_dict['metrics']) - outputs = [_non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - else: - x = training_utils_v1.ModelInputs(x).as_list() - inputs = x + list(y or []) + list(sample_weights or []) - - self._update_sample_weight_modes(sample_weights=sample_weights) - self._make_test_function() - outputs = self.test_function(inputs) # pylint: disable=not-callable - - if reset_metrics: - self.reset_metrics() - - if len(outputs) == 1: - return outputs[0] - return outputs - - def predict_on_batch(self, x): - """Returns predictions for a single batch of samples. - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A `tf.data` dataset. - - Returns: - Numpy array(s) of predictions. + - A `tf.data` dataset. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset, `y` should not + be specified (since targets will be obtained from the iterator). + sample_weight: An optional sample-weight array passed by the user to + weight the importance of each sample in `x`. + class_weight: An optional class-weight array by the user to + weight the importance of samples in `x` based on the class they + belong to, as conveyed by `y`. If both `sample_weight` and + `class_weight` are provided, the weights are multiplied. + batch_size: Integer batch size. If provided, it is used to run + additional validation checks on stateful models. + check_steps: boolean, True if we want to check for validity of `steps` + and False, otherwise. For example, when we are standardizing one + batch of data for train_on_batch/predict_on_batch/test_on_batch + APIs, `steps` value is not required and we should not check for its + validity in these cases. + steps_name: The public API's parameter name for `steps`. + steps: Integer or `None`. Total number of steps (batches of samples) + to execute. + validation_split: Float between 0 and 1. + Fraction of the training data to be used as validation data. + shuffle: Boolean whether to shuffle the training data before each + epoch. + extract_tensors_from_dataset: Boolean. When `x` is a dataset instance, + this indicates whether to extract actual tensors from the dataset or + instead output the dataset instance itself. + Set to True when calling from `train_on_batch`/etc. + + Returns: + A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a + dict or not), target arrays, sample-weight arrays. If the model's + input and targets are symbolic, these lists are empty (since the model + takes no user-provided data, instead the data comes from the symbolic + inputs/targets). + + Raises: + ValueError: In case of invalid user-provided data. + RuntimeError: If the model was never compiled. + """ + if isinstance(x, (tf.compat.v1.data.Dataset, tf.data.Dataset)): + # Graph mode dataset. We'll pass the dataset as-is (unless + # `extract_tensors_from_dataset` is True, in which case we extract + # the tensors from the dataset and we output them. 
+ training_utils_v1.validate_dataset_input( + x, y, sample_weight, validation_split + ) + if shuffle: + training_utils_v1.verify_dataset_shuffled(x) + + is_dataset = True + if extract_tensors_from_dataset: + # We do this for `train_on_batch`/etc. + ( + x, + y, + sample_weight, + ) = training_utils_v1.extract_tensors_from_dataset(x) + elif isinstance(x, tf.compat.v1.data.Iterator): + # Graph mode iterator. We extract the symbolic tensors. + training_utils_v1.validate_dataset_input( + x, y, sample_weight, validation_split + ) + iterator = x + x, y, sample_weight = training_utils_v1.unpack_iterator_input( + iterator + ) + is_dataset = True + else: + is_dataset = False + + # Validates `steps` argument based on x's type. + if check_steps: + training_utils_v1.check_steps_argument(x, steps, steps_name) + + # First, we build the model on the fly if necessary. + if not self.inputs: + all_inputs, y_input, dict_inputs = self._build_model_with_inputs( + x, y + ) + is_build_called = True + else: + all_inputs = [] + # Whether this is a subclassed model that expects dictionary inputs + # rather than list inputs (e.g. FeatureColumn-based models). + dict_inputs = isinstance(self.inputs, dict) + is_build_called = False + y_input = y + + # Second, we compile the model on the fly if necessary, mostly for + # subclass models. + is_compile_called = False + if not self._is_compiled and self.optimizer: + self._compile_from_inputs(all_inputs, y_input, x, y) + is_compile_called = True + + # In graph mode, if we had just set inputs and targets as symbolic + # tensors by invoking build and compile on the model respectively, we do + # not have to feed anything to the model. Model already has input and + # target data as part of the graph. Note: in this case, `any` and `all` + # are equivalent since we disallow mixed symbolic/value inputs. + + # self.run_eagerly is not free to compute, so we want to reuse the + # value. + run_eagerly = self.run_eagerly + + if ( + not run_eagerly + and is_build_called + and is_compile_called + and not is_dataset + and any(_is_symbolic_tensor(v) for v in all_inputs) + ): + return [], [], None + + return self._standardize_tensors( + x, + y, + sample_weight, + run_eagerly=run_eagerly, + dict_inputs=dict_inputs, + is_dataset=is_dataset, + class_weight=class_weight, + batch_size=batch_size, + ) + + def _standardize_tensors( + self, + x, + y, + sample_weight, + run_eagerly, + dict_inputs, + is_dataset, + class_weight=None, + batch_size=None, + ): + if run_eagerly: + # In eager mode, do not do shape validation + # since the network has no input nodes (placeholders) to be fed. + feed_input_names = self.input_names + feed_input_shapes = None + elif not self._is_graph_network: + # Case: symbolic-mode subclassed network. Do not do shape + # validation. + feed_input_names = self._feed_input_names + feed_input_shapes = None + else: + # Case: symbolic-mode graph network. + # In this case, we run extensive shape validation checks. + feed_input_names = self._feed_input_names + feed_input_shapes = self._feed_input_shapes + + # Standardize the inputs. + if not isinstance(x, (tf.compat.v1.data.Dataset, tf.data.Dataset)): + # TODO(fchollet): run static checks with dataset output shape(s). + x = training_utils_v1.standardize_input_data( + x, + feed_input_names, + feed_input_shapes, + check_batch_axis=False, # Don't enforce the batch size. + exception_prefix="input", + ) + + # Get typespecs for the input data and sanitize it if necessary. 
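The structure check that `_standardize_tensors` performs in the non-dataset branch pairs each flattened user input with the corresponding model input; in isolation it behaves like this hypothetical snippet:

    import tensorflow as tf

    flat_inputs = tf.nest.flatten({"a": tf.zeros([2, 3])})
    flat_expected = tf.nest.flatten({"a": tf.keras.Input(shape=(3,))})
    for a, b in zip(flat_inputs, flat_expected):
        # Raises if, e.g., a composite (sparse/ragged) input is paired
        # with a dense model input, since composites expand to components.
        tf.nest.assert_same_structure(a, b, expand_composites=True)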
+ # TODO(momernick): This should be capable of doing full input validation + # at all times - validate that this is so and refactor the + # standardization code. + if isinstance(x, tf.data.Dataset): + x_shapes = tf.data.experimental.get_structure(x) + if isinstance(x_shapes, tuple): + # If the output of a Dataset is a tuple, we assume it's either + # of the form (x_data, y_data) or (x_data, y_data, + # sample_weights). In either case, we only care about x_data + # here. + x_shapes = x_shapes[0] + else: + flat_inputs = tf.nest.flatten(x) + flat_expected_inputs = tf.nest.flatten(self.inputs) + converted_x = [] + for a, b in zip(flat_inputs, flat_expected_inputs): + converted_x.append(_convert_scipy_sparse_tensor(a, b)) + x = tf.nest.pack_sequence_as(x, converted_x) + + # Convert ResourceVariables to tensors so nest.assert_same_structure + # below won't fail with Variable and Tensor. + x_tensors = tf_utils.convert_variables_to_tensors(x) + x_shapes = tf.nest.map_structure( + tf_utils.type_spec_from_value, x_tensors + ) + + flat_inputs = tf.nest.flatten(x_shapes) + # Convert ResourceVariables to tensors so nest.assert_same_structure + # below won't fail with Variable and Tensor. + flat_expected_inputs = tf.nest.flatten( + tf_utils.convert_variables_to_tensors(self.inputs) + ) + for a, b in zip(flat_inputs, flat_expected_inputs): + tf.nest.assert_same_structure(a, b, expand_composites=True) - Raises: - ValueError: In case of mismatch between given number of inputs and - expectations of the model. - """ - self._check_call_args('predict_on_batch') - - if (self._distribution_strategy and - tf.distribute.in_cross_replica_context()): - raise NotImplementedError( - '`predict_on_batch` is not supported for models distributed with' - ' tf.distribute.Strategy.') - # Validate and standardize user data. - inputs, _, _ = self._standardize_user_data( - x, extract_tensors_from_dataset=True) - # If `self._distribution_strategy` is True, then we are in a replica context - # at this point. - if self.run_eagerly or self._distribution_strategy: - inputs = training_utils_v1.cast_if_floating_dtype(inputs) - if isinstance(inputs, collections.abc.Sequence): - # Unwrap lists with only one input, as we do when training on batch - if len(inputs) == 1: - inputs = inputs[0] - - return self(inputs) # pylint: disable=not-callable - - self._make_predict_function() - outputs = self.predict_function(inputs) - - if len(outputs) == 1: - return outputs[0] - return outputs - - def fit_generator(self, - generator, - steps_per_epoch=None, - epochs=1, - verbose=1, - callbacks=None, - validation_data=None, - validation_steps=None, - validation_freq=1, - class_weight=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - shuffle=True, - initial_epoch=0): - """Fits the model on data yielded batch-by-batch by a Python generator. - - DEPRECATED: - `Model.fit` now supports generators, so there is no longer any need to use - this endpoint. - """ - warnings.warn( - '`model.fit_generator` is deprecated and ' - 'will be removed in a future version. 
' - 'Please use `Model.fit`, which supports generators.', - stacklevel=2) - return self.fit( - generator, - steps_per_epoch=steps_per_epoch, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - validation_data=validation_data, - validation_steps=validation_steps, - validation_freq=validation_freq, - class_weight=class_weight, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - shuffle=shuffle, - initial_epoch=initial_epoch) - - def evaluate_generator(self, - generator, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - verbose=0): - """Evaluates the model on a data generator. - - DEPRECATED: - `Model.evaluate` now supports generators, so there is no longer any need - to use this endpoint. - """ - warnings.warn( - '`Model.evaluate_generator` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `Model.evaluate`, which supports generators.', - stacklevel=2) - self._check_call_args('evaluate_generator') - - return self.evaluate( - generator, - steps=steps, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - verbose=verbose, - callbacks=callbacks) - - def predict_generator(self, - generator, - steps=None, - callbacks=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - verbose=0): - """Generates predictions for the input samples from a data generator. - - DEPRECATED: - `Model.predict` now supports generators, so there is no longer any need - to use this endpoint. - """ - warnings.warn( - '`Model.predict_generator` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `Model.predict`, which supports generators.', - stacklevel=2) - return self.predict( - generator, - steps=steps, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - verbose=verbose, - callbacks=callbacks) - - def _check_call_args(self, method_name): - """Check that `call` has only one positional arg.""" - # Always allow first arg, regardless of arg name. - fullargspec = self._call_spec.full_argspec - if fullargspec.defaults: - positional_args = fullargspec.args[:-len(fullargspec.defaults)] - else: - positional_args = fullargspec.args - if 'training' in positional_args: - positional_args.remove('training') + if y is not None: + # Prepare self._sample_weight_modes. List with the same length as + # model outputs. + training_utils_v1.prepare_sample_weight_modes( + self._training_endpoints, self.sample_weight_mode + ) + feed_output_names = self._feed_output_names + feed_sample_weight_modes = self._sample_weight_modes + if not self._is_graph_network: + feed_output_shapes = None + else: + feed_output_shapes = self._feed_output_shapes + + # Standardize the outputs. + y = training_utils_v1.standardize_input_data( + y, + feed_output_names, + # Don't enforce target shapes to match output shapes. + # Precise checks will be run in + # `check_loss_and_target_compatibility`. + shapes=None, + check_batch_axis=False, # Don't enforce the batch size. + exception_prefix="target", + ) + + # Generate sample-wise weight values given the `sample_weight` and + # `class_weight` arguments. 
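Conceptually, the `class_weight` handling referenced in this comment expands a per-class dictionary into per-sample weights (and, per the docstring above, multiplies them with `sample_weight` when both are given); a standalone sketch, not the internal helper:

    import numpy as np

    y = np.array([0, 1, 1, 0])
    class_weight = {0: 1.0, 1: 2.0}

    # Each sample inherits the weight of its class.
    sample_weights = np.array([class_weight[int(label)] for label in y])
    print(sample_weights)  # [1. 2. 2. 1.]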
+ sample_weights = training_utils_v1.standardize_sample_weights( + sample_weight, feed_output_names + ) + class_weights = training_utils_v1.standardize_class_weights( + class_weight, feed_output_names + ) + + sample_weights = [ + training_utils_v1.standardize_weights(ref, sw, cw, mode) + for (ref, sw, cw, mode) in zip( + y, sample_weights, class_weights, feed_sample_weight_modes + ) + ] + # Check that all arrays have the same length. + if not self._distribution_strategy: + training_utils_v1.check_array_lengths(x, y, sample_weights) + if self._is_graph_network and not run_eagerly: + # Additional checks to avoid users mistakenly using improper + # loss fns. + training_utils_v1.check_loss_and_target_compatibility( + y, self._feed_loss_fns, feed_output_shapes + ) + + sample_weights, _, _ = training_utils.handle_partial_sample_weights( + y, sample_weights, feed_sample_weight_modes, check_all_flat=True + ) + else: + y = [] + sample_weights = None + + if self.stateful and batch_size and not is_dataset: + # Check that for stateful networks, number of samples is a multiple + # of the static batch size. + if x[0].shape[0] % batch_size != 0: + raise ValueError( + "In a stateful network, " + "you should only pass inputs with " + "a number of samples that can be " + "divided by the batch size. Found: " + + str(x[0].shape[0]) + + " samples" + ) + + # If dictionary inputs were provided, we return a dictionary as well. + if dict_inputs and not isinstance( + x, (tf.compat.v1.data.Dataset, tf.data.Dataset) + ): + x = dict(zip(feed_input_names, x)) + return x, y, sample_weights + + def _build_model_with_inputs(self, inputs, targets): + """Build the model (set model inputs/outputs), mainly for subclass + model.""" + processed_inputs = [] + is_dict_inputs = False + orig_inputs = inputs + # We need to use `inputs` to set the model inputs. + # If input data is a dataset iterator in graph mode or if it is an eager + # iterator and only one batch of samples is required, we fetch the data + # tensors from the iterator and then standardize them. + if isinstance(inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset)): + inputs, targets, _ = training_utils_v1.extract_tensors_from_dataset( + inputs + ) + # We type-check that `inputs` and `targets` are either single arrays + # or lists of arrays, and extract a flat list of inputs from the passed + # structure. + training_utils_v1.validate_input_types(inputs, orig_inputs) + + if isinstance(inputs, (list, tuple)): + processed_inputs += list(inputs) + elif isinstance(inputs, dict): + is_dict_inputs = True + keys = sorted(inputs.keys()) + processed_inputs = [inputs[k] for k in keys] + else: + processed_inputs.append(inputs) + # Now that we have a flat set of inputs, we make sure that none of them + # are CompositeTensors or CompositeTensorValues of any type (or scipy + # sparse arrays, which we treat as SparseTensor values). We cannot + # safely infer input data from an arbitrary composite tensor, so we + # don't try - users should explicitly add composite tensor inputs to + # their subclassed models. + for input_tensor in processed_inputs: + if training_utils_v1.is_composite_or_composite_value( + input_tensor + ) and not isinstance(input_tensor, tf.Variable): + # TODO(b/132691975): Document subclass-model CT input handling. + raise ValueError( + "All SparseTensor and RaggedTensor inputs must be " + "explicitly declared using a keras.Input() with " + "sparse=True or ragged=True. We found an undeclared " + "input %s. 
For Sequential models, please add a " + "keras.Input() as your first Layer. For subclassed models, " + "please call self._set_inputs() on your input set, which " + "you can create using keras.Input() for each input to your " + "model." % (input_tensor,) + ) + # Build the model using the retrieved inputs (value or symbolic). + # If values are generated from a dataset, then in symbolic-mode + # placeholders will be created to match the value shapes. + if isinstance( + orig_inputs, + ( + tf.compat.v1.data.Dataset, + tf.data.Dataset, + tf.compat.v1.data.Iterator, + ), + ): + if not self.inputs: + # For subclassed models, a robust input spec is not available so + # we must cast to the model dtype. + inputs = training_utils_v1.cast_if_floating_dtype( + inputs, self.dtype + ) + + def create_tensor_spec(t): + return tf.TensorSpec(t.shape, t.dtype) + + cast_inputs = tf.nest.map_structure(create_tensor_spec, inputs) + elif training_utils_v1.has_tensors(inputs): + cast_inputs = training_utils_v1.cast_if_floating_dtype(inputs) + else: + cast_inputs = inputs + self._set_inputs(cast_inputs) + return processed_inputs, targets, is_dict_inputs + + def _compile_from_inputs( + self, all_inputs, target, orig_inputs, orig_target + ): + if target is not None: + # We need to use `y` to set the model targets. + if training_utils_v1.has_tensors(target): + target = training_utils_v1.cast_if_floating_dtype_and_mismatch( + target, self.outputs + ) + training_utils_v1.validate_input_types( + target, orig_target, allow_dict=False, field_name="target" + ) + if isinstance(target, (list, tuple)): + all_inputs += list(target) + else: + all_inputs.append(target) + # Type check that all inputs are *either* value *or* symbolic. + # TODO(fchollet): this check could be removed in Eager mode? + if any(tf.is_tensor(v) for v in all_inputs): + if not all(tf.is_tensor(v) for v in all_inputs): + raise ValueError( + "Do not pass inputs that mix Numpy arrays and " + "TensorFlow tensors. " + "You passed: x=" + + str(orig_inputs) + + "; y=" + + str(orig_target) + ) + is_dataset = isinstance( + orig_inputs, + ( + tf.compat.v1.data.Dataset, + tf.data.Dataset, + tf.compat.v1.data.Iterator, + ), + ) + if is_dataset or tf.executing_eagerly(): + target_tensors = None + else: + # Handle target tensors if any passed. + if target is not None: + if not isinstance(target, (list, tuple)): + target = [target] + target_tensors = [v for v in target if _is_symbolic_tensor(v)] + else: + target_tensors = None + + self.compile( + optimizer=self.optimizer, + loss=self.loss, + metrics=self._compile_metrics, + weighted_metrics=self._compile_weighted_metrics, + loss_weights=self.loss_weights, + target_tensors=target_tensors, + sample_weight_mode=self.sample_weight_mode, + run_eagerly=self.run_eagerly, + experimental_run_tf_function=self._experimental_run_tf_function, + ) + + # TODO(omalleyt): Consider changing to a more descriptive function name. + def _set_inputs(self, inputs, outputs=None, training=None): + """Set model's input and output specs based on the input data received. + + This is to be used for Model subclasses, which do not know at + instantiation time what their inputs look like. + + Args: + inputs: Single array, or list of arrays. The arrays could be + placeholders, Numpy arrays, data tensors, or TensorSpecs. + - if placeholders: the model is built on top of these placeholders, + and we expect Numpy data to be fed for them when calling + `fit`/etc. 
+ - if Numpy data or TensorShapes: we create placeholders matching the + TensorShapes or shapes of the Numpy arrays. We expect Numpy data + to be fed for these placeholders when calling `fit`/etc. + - if data tensors: the model is built on top of these tensors. + We do not expect any Numpy data to be provided when calling + `fit`/etc. + outputs: None, a data tensor, or a list of tensors. If None, the + outputs will be determined by invoking `self.call()`, otherwise the + provided value will be used. + training: Boolean or None. Only relevant in symbolic mode. Specifies + whether to build the model's graph in inference mode (False), + training mode (True), or using the Keras learning phase (None). + Raises: + ValueError: If dict inputs are passed to a Sequential Model where the + first layer isn't FeatureLayer. + """ + self._set_save_spec(inputs) + inputs = self._set_input_attrs(inputs) + + if outputs is None: + kwargs = {} + if self._expects_training_arg: + # In V2 mode, feeding `training=None` is not allowed because any + # value explicitly passed by the user is respected, even + # `None`. + if ( + training is None + and not tf.compat.v1.executing_eagerly_outside_functions() + ): + training = backend.learning_phase() + if training is not None: + kwargs["training"] = training + try: + outputs = self(inputs, **kwargs) + except NotImplementedError: + # This Model or a submodel is dynamic and hasn't overridden + # `compute_output_shape`. + outputs = None + + self._set_output_attrs(outputs) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _set_input_attrs(self, inputs): + """Sets attributes related to the inputs of the Model.""" + if self.inputs: + raise ValueError("Model inputs are already set.") + + if self.__class__.__name__ == "Sequential" and not self.built: + if tf.is_tensor(inputs): + input_shape = (None,) + tuple(inputs.shape.as_list()[1:]) + elif isinstance(inputs, tf.TensorShape): + input_shape = (None,) + tuple(inputs.as_list()[1:]) + elif isinstance(inputs, dict): + # We assert that the first layer is a FeatureLayer. + if not training_utils_v1.is_feature_layer(self.layers[0]): + raise ValueError( + "Passing a dictionary input to a Sequential Model " + "which doesn't have FeatureLayer as the first layer" + " is an error." + ) + input_shape = (None,) + else: + input_shape = (None,) + tuple(inputs.shape[1:]) + self._build_input_shape = input_shape + + # Cast inputs to the compute dtype. This is primarily used + # when saving to determine the correct dtype in the input signature. + inputs = self._maybe_cast_inputs(inputs) + + # On-the-fly setting of symbolic model inputs (either by using the + # tensor provided, or by creating a placeholder if Numpy data was + # provided).
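Stepping back to the Sequential shape-inference branches a few lines up: the rule is simply to drop the concrete batch dimension and keep the per-sample shape. A minimal standalone sketch (hypothetical helper name):

    import numpy as np
    import tensorflow as tf

    def infer_build_shape(inputs):
        # Mirrors the branches above: replace the leading batch size
        # with None and keep the rest of the shape.
        if tf.is_tensor(inputs):
            return (None,) + tuple(inputs.shape.as_list()[1:])
        if isinstance(inputs, tf.TensorShape):
            return (None,) + tuple(inputs.as_list()[1:])
        return (None,) + tuple(np.asarray(inputs).shape[1:])

    print(infer_build_shape(np.zeros((32, 28, 28))))    # (None, 28, 28)
    print(infer_build_shape(tf.TensorShape([32, 10])))  # (None, 10)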
+ model_inputs = training_utils_v1.ModelInputs(inputs) + inputs = model_inputs.get_symbolic_inputs() + self.inputs = model_inputs.get_symbolic_inputs( + return_single_as_list=True + ) + self.input_names = model_inputs.get_input_names() + + self._feed_inputs = [] + self._feed_input_names = [] + self._feed_input_shapes = [] + + for k, v in model_inputs.as_dict(): + if backend.is_placeholder(v): + self._feed_input_names.append(k) + self._feed_inputs.append(v) + self._feed_input_shapes.append(backend.int_shape(v)) + + return inputs + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _set_output_attrs(self, outputs): + """Sets attributes related to the outputs of the Model.""" + # NOTE(taylorrobie): This convention cannot be changed without updating + # the data adapter since it assumes nest.flatten ordering. + outputs = tf.nest.flatten(outputs) + self.outputs = outputs + self.output_names = training_utils_v1.generic_output_names(outputs) + # TODO(scottzhu): Should we cleanup the self._training_endpoints here? + self.built = True + + @property + def _targets(self): + """The output target tensors for the model.""" + return [ + e.training_target.target + for e in self._training_endpoints + if e.has_training_target() + ] - # self and first arg can be positional. - if len(positional_args) > 2: - extra_args = positional_args[2:] - raise ValueError( - 'Models passed to `' + method_name + '` can only have `training` ' - 'and the first argument in `call` as positional arguments, ' - 'found: ' + str(extra_args) + '.') + @property + def _feed_targets(self): + return [ + e.training_target.target + for e in self._training_endpoints + if e.has_feedable_training_target() + ] - def _set_optimizer(self, optimizer): - """Sets self.optimizer. + @property + def _feed_output_names(self): + return [ + e.output_name + for e in self._training_endpoints + if e.has_feedable_training_target() + ] - Sets self.optimizer to `optimizer`, potentially wrapping it with a - LossScaleOptimizer. + @property + def _feed_output_shapes(self): + return [ + e.feed_output_shape + for e in self._training_endpoints + if e.has_feedable_training_target() + ] - Args: - optimizer: The optimizer(s) to assign to self.optimizer. - """ - if isinstance(optimizer, (list, tuple)): - self.optimizer = [optimizers.get(opt) for opt in optimizer] - else: - self.optimizer = optimizers.get(optimizer) - - if (self._dtype_policy.name == 'mixed_float16' and - not isinstance(self.optimizer, - loss_scale_optimizer.LossScaleOptimizer)): - if isinstance(self.optimizer, list): - raise ValueError('When the "mixed_float16" dtype policy is used, you ' - 'can only pass a single optimizer. Using policy %s ' - 'and got optimizers: %s' % - self._dtype_policy, self.optimizer) - if not isinstance(self.optimizer, optimizer_v2.OptimizerV2): - raise ValueError('"optimizer" must be an instance of ' - 'tf.keras.optimizers.Optimizer when a dype policy ' - 'with a loss scale used, but got: %s. 
Using policy: ' - '%s' % - (self.optimizer, self._dtype_policy)) - self.optimizer = loss_scale_optimizer.LossScaleOptimizer(self.optimizer) - - def _prepare_validation_data(self, validation_data, batch_size, - validation_steps): - """Unpack and check the validation data.""" - val_x, val_y, val_sample_weights = training_utils_v1.unpack_validation_data( - validation_data) - return self._standardize_user_data( - val_x, - val_y, - sample_weight=val_sample_weights, - batch_size=batch_size, - steps=validation_steps, - steps_name='validation_steps') - - def _validate_compile_param_for_distribution_strategy( - self, run_eagerly, sample_weight_mode, target_tensors, weighted_metrics): - # Validate that arguments passed by the user to `compile` are supported by - # tf.distribute.Strategy. - if self._distribution_strategy: - if sample_weight_mode: - raise NotImplementedError('sample_weight_mode is not supported with ' - 'tf.distribute.Strategy.') - if weighted_metrics: - raise NotImplementedError('weighted_metrics is not supported with ' - 'tf.distribute.Strategy.') - if target_tensors: - raise ValueError('target_tensors is not supported with ' - 'tf.distribute.Strategy.') - - if run_eagerly: - raise ValueError( - 'We currently do not support enabling `run_eagerly` with ' - 'distribution strategy.') - - if (distributed_training_utils_v1.is_distributing_by_cloning(self) and - (not self.built or not self.inputs or not self.outputs)): - raise ValueError( - 'We currently do not support distribution strategy with a ' - '`Sequential` model that is created without `input_shape`/' - '`input_dim` set in its first layer or a subclassed model.') - - def _process_target_tensor_for_compile(self, target_tensors): - if self.run_eagerly: - # target tensor is not supported with run_eagerly. Create a list with None - # as placeholder for each output. - return [None for _ in self.output_names] - - if target_tensors is not None and not (isinstance(target_tensors, list) and - target_tensors == []): # pylint: disable=g-explicit-bool-comparison - if isinstance(target_tensors, list): - if len(target_tensors) != len(self.outputs): - raise ValueError( - 'When passing a list as `target_tensors`, ' - 'it should have one entry per model output. ' - 'The model has %s outputs, but you passed target_tensors=%s' % - (len(self.outputs), target_tensors)) - elif isinstance(target_tensors, dict): - unexpected_target_tensor_names = set(target_tensors.keys()).difference( - self.output_names) - if unexpected_target_tensor_names: - raise ValueError( - 'Unknown entry in `target_tensors` dictionary: "{name}". ' - 'Only expected the following keys: {keys}'.format( - name=unexpected_target_tensor_names, - keys=str(self.output_names))) - tmp_target_tensors = [] - for name in self.output_names: - tmp_target_tensors.append(target_tensors.get(name, None)) - target_tensors = tmp_target_tensors - elif tf.is_tensor(target_tensors): - target_tensors = [target_tensors] - else: - raise TypeError('Expected `target_tensors` to be a list or tuple or ' - 'dict or a single tensor, but got:', target_tensors) - else: - # In case target tensor is empty or None, create a list with Nones - # that has same length as self.output_names. With that, the None check of - # target tensor can be skipped downstream. - target_tensors = [None for _ in self.output_names] - return target_tensors - - def _compile_eagerly(self, metrics, weighted_metrics, sample_weight_mode): - # Prepare sample weight modes. List with the same length as model outputs. 
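For context on the `sample_weight_mode` plumbing being reworked here: `'temporal'` expects one weight per sample per timestep, while the default samplewise mode expects one weight per sample. A usage sketch against the legacy v1 loop this file implements (shapes are illustrative; `sample_weight_mode` is a v1-era `compile()` argument):

    import numpy as np
    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()  # route through the v1 training loop

    model = tf.keras.Sequential(
        [tf.keras.layers.Dense(1, input_shape=(10, 4))]
    )
    model.compile(optimizer="sgd", loss="mse", sample_weight_mode="temporal")

    x = np.random.rand(8, 10, 4).astype("float32")
    y = np.random.rand(8, 10, 1).astype("float32")
    w = np.ones((8, 10))  # one weight per (sample, timestep); samplewise
                          # mode would expect shape (8,) instead
    model.fit(x, y, sample_weight=w, epochs=1, verbose=0)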
- training_utils_v1.prepare_sample_weight_modes( - self._training_endpoints, sample_weight_mode) - # Prepare sample weights. - self._prepare_sample_weights() - # Save all metric attributes per output of the model. - self._cache_output_metric_attributes(metrics, weighted_metrics) - self.total_loss = None - # Set metric attributes on model. - self._set_metric_attributes() - - self._collected_trainable_weights = self.trainable_weights - - def _update_sample_weight_modes(self, sample_weights=None): - """Updates sample weight modes based on training/eval inputs. - - Sample weight placeholders will be created for all or no outputs - based on whether sample_weight is provided for any output. - - If model contains `_sample_weight_modes` we check if the input - `sample_weights` corresponds to the sample weight modes. - 1. Set sample weight mode to be 'temporal' for output i, if `compile` - sample_weight_mode was set to `temporal` and sample weight inputs - are given for one or more outputs. - 2. Set sample weight mode to be 'samplewise' for output i, if `compile` - sample_weight_mode was not set and sample weight inputs are given for - one or more outputs. - 3. Reset sample weight mode to None for output i if sample weight mode - was set but there is no sample weight input. + @property + def _feed_loss_fns(self): + return [ + e.loss_fn + for e in self._training_endpoints + if e.has_feedable_training_target() + ] - Args: - sample_weights: List of sample weights of the same length as model outputs - or None. - """ - if not self._is_compiled: - return - if sample_weights and any(s is not None for s in sample_weights): - for endpoint in self._training_endpoints: - endpoint.sample_weight_mode = ( - endpoint.sample_weight_mode or 'samplewise') - else: - for endpoint in self._training_endpoints: - endpoint.sample_weight_mode = None + @property + def _loss_weights_list(self): + return [e.loss_weight for e in self._training_endpoints] + + @property + def _output_loss_metrics(self): + if hasattr(self, "_training_endpoints"): + return [ + e.output_loss_metric + for e in self._training_endpoints + if e.output_loss_metric is not None + ] + return None + + @property + def sample_weights(self): + return [e.sample_weight for e in self._training_endpoints] + + @property + def _sample_weight_modes(self): + return [e.sample_weight_mode for e in self._training_endpoints] + + @property + def _feed_sample_weights(self): + return [ + e.sample_weight + for e in self._training_endpoints + if e.sample_weight is not None + ] - def _recompile_weights_loss_and_weighted_metrics(self): - if not self._is_compiled: - return False - recompile = any( - e.sample_weights_mismatch() for e in self._training_endpoints) + def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode): + """Maybe load 1st epoch from checkpoint, considering worker recovery. + + Refer to tensorflow/python/keras/distribute/worker_training_state.py + for more information. + + Args: + initial_epoch: The original initial_epoch the user passes to `fit()`. + mode: The mode for running `model.fit()`. + + Returns: + If the training is recovering from a previous failure under a + multi-worker training setting, return the epoch the training is + supposed to continue at. Otherwise, return the `initial_epoch` the + user passes in.
+ """ + if self._training_state is not None: + return self._training_state.maybe_load_initial_epoch_from_ckpt( + initial_epoch, mode + ) + return initial_epoch + + def _get_training_eval_metrics(self): + """Returns all the metrics that are to be reported. + + This includes the output loss metrics, compile metrics/weighted metrics, + add_metric metrics. + """ + metrics = [] + metrics.extend(getattr(self, "_output_loss_metrics", None) or []) + metrics.extend(getattr(self, "metrics", None) or []) + return metrics + + def _assert_compile_was_called(self): + # Checks whether `compile` has been called. If it has been called, + # then the optimizer is set. This is different from whether the + # model is compiled + # (i.e. whether the model is built and its inputs/outputs are set). + if not self._compile_was_called: + raise RuntimeError( + "You must compile your model before " + "training/testing. " + "Use `model.compile(optimizer, loss)`." + ) + + def _in_multi_worker_mode(self): + """Method to infer if this `Model` is working in multi-worker settings. + + Multi-worker training refers to the setup where the training is + distributed across multiple workers, as opposed to the case where + only a local process performs the training. This function is + used to infer for example whether or not a distribute coordinator + should be run, and thus TensorFlow servers should be started for + communication with other servers in the cluster, or whether or not + saving/restoring checkpoints is relevant for preemption fault tolerance. + + Experimental. Signature and implementation are subject to change. + + Returns: + Whether this model indicates it's working in multi-worker settings. + """ + strategy = self._distribution_strategy + + # Otherwise, use the strategy whose scope this is in. + if not strategy and tf.distribute.has_strategy(): + strategy = tf.distribute.get_strategy() + return strategy and strategy.extended._in_multi_worker_mode() + + @property + def _trackable_saved_model_saver(self): + return model_serialization.ModelSavedModelSaver(self) + + def _get_compile_args(self, user_metrics=True): + del user_metrics + self._assert_compile_was_called() + kwargs = { + "loss": self.loss, + "metrics": self._compile_metrics, + "loss_weights": self.loss_weights, + "sample_weight_mode": self.sample_weight_mode, + "weighted_metrics": self._compile_weighted_metrics, + } + return kwargs + + @property + def _compile_was_called(self): + return self._v1_compile_was_called - if recompile: - self._compile_weights_loss_and_weighted_metrics() - return recompile - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _compile_weights_loss_and_weighted_metrics(self, sample_weights=None): - """Compiles the model loss and weighted metric sub-graphs. +class DistributedCallbackModel(Model): + """Model that is used for callbacks with tf.distribute.Strategy.""" + + def __init__(self, model): + super().__init__() + self.optimizer = model.optimizer + + def set_original_model(self, orig_model): + self._original_model = orig_model + + def save_weights(self, filepath, overwrite=True, save_format=None): + self._replicated_model.save_weights( + filepath, overwrite=overwrite, save_format=save_format + ) + + def save(self, filepath, overwrite=True, include_optimizer=True): + # save weights from the distributed model to the original model + distributed_model_weights = self.get_weights() + self._original_model.set_weights(distributed_model_weights) + # TODO(anjalisridhar): Do we need to save the original model here? 
+ # Saving the first replicated model works as well. + self._original_model.save( + filepath, overwrite=True, include_optimizer=False + ) + + def load_weights(self, filepath, by_name=False): + self._original_model.load_weights(filepath, by_name=False) + # Copy the weights from the original model to each of the replicated + # models. + orig_model_weights = self._original_model.get_weights() + distributed_training_utils_v1.set_weights( + self._original_model._distribution_strategy, + self, + orig_model_weights, + ) + + def __getattr__(self, item): + # Allowed attributes of the model that can be accessed by the user + # during a callback. + if item not in ("_setattr_tracking", "_layers"): + logging.warning( + "You are accessing attribute " + item + " of the " + "DistributedCallbackModel that may not have been set " + "correctly." + ) + return super().__getattr__(item) - This may be used to set graph tensors as sample weights (instead of creating - placeholders). This functionality is necessary for - `tf.keras.estimator.model_to_estimator`, which calls Keras models in a v1 - graph, and creates iterator tensors for inputs, targets, and sample weights. - Args: - sample_weights: List of tensors to use as the sample weights. Must be the - same length as the number of outputs. If left as `None`, placeholders - are used instead. - """ - with backend.get_graph().as_default(): - if sample_weights is not None: - self._update_sample_weight_modes(sample_weights) - self._prepare_sample_weights(sample_weights) - - masks = self._prepare_output_masks() - - # Compute weighted metrics. - self._handle_metrics( - self.outputs, - targets=self._targets, - skip_target_masks=self._prepare_skip_target_masks(), - sample_weights=self.sample_weights, - masks=masks, - return_weighted_metrics=True) - - # Compute total loss. - # Used to keep track of the total loss value (stateless). - # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) + - # loss_weight_2 * output_2_loss_fn(...) + - # layer losses. - self.total_loss = self._prepare_total_loss(masks) - - def _prepare_skip_target_masks(self): - """Boolean mask for whether the target in the output list should be skipped. - - If the loss function corresponding to a model output is None, then this - output will be skipped during total loss calculation and feed targets - preparation. +class _TrainingEndpoint: + """A container for the training output/target and related entities. - Returns: - A boolean list for whether the corresponding target in the output list - should be skipped during loss calculation. + In the case of a model with multiple outputs, there is a one-to-one mapping + between model output (y_pred), model target (y_true), loss, metrics, etc. + By unifying these entities into one class, the different entities can access + information about each other, rather than having to reach into separate + per-output attribute lists on the model. """ - return [l is None for l in self.loss_functions] - - def _prepare_output_masks(self): - """Returns masks corresponding to model outputs.""" - return [getattr(x, '_keras_mask', None) for x in self.outputs] - - def _prepare_total_loss(self, masks): - """Computes total loss from loss functions. - - Args: - masks: List of mask values corresponding to each model output. - - Returns: - A list of loss weights of python floats. - Raises: - TypeError: If model run_eagerly is True.
- """ - if self.run_eagerly: - raise TypeError('total loss can not be computed when compiled with ' - 'run_eagerly = True.') - loss_list = [] - with backend.name_scope('loss'): - for endpoint, mask in zip(self._training_endpoints, masks): - if endpoint.should_skip_target(): - continue - y_true = endpoint.training_target.target - y_pred = endpoint.output - loss_fn = endpoint.loss_fn - loss_weight = endpoint.loss_weight - loss_name = endpoint.loss_name() - sample_weight = endpoint.sample_weight - - with backend.name_scope(loss_name): - if mask is not None: - mask = tf.cast(mask, y_pred.dtype) - # Update weights with mask. - if sample_weight is None: - sample_weight = mask + def __init__( + self, + output, + output_name, + loss_fn, + loss_weight=None, + training_target=None, + output_loss_metric=None, + sample_weight=None, + sample_weight_mode=None, + ): + """Initialize the _TrainingEndpoint. + + Note that the output and output_name should be stable as long as the + model structure doesn't change. The training_target suppose to be + mutable since the information is provided via `compile()` + + Args: + output: the output tensor of the model. + output_name: the unique name of the output tensor. + loss_fn: the loss function for the output tensor. + loss_weight: float, the weights for the loss. + training_target: the _TrainingTarget for the model. + output_loss_metric: the metric object for the loss function. + sample_weight: the weights for how a sample is weighted during metric + and loss calculation. Could be None. + sample_weight_mode: string, 'temporal', 'samplewise' or None. The mode + for how the sample_weight is populated. + """ + self._output = output + self._output_name = output_name + self._loss_fn = loss_fn + self._loss_weight = loss_weight + self._training_target = training_target + self._output_loss_metric = output_loss_metric + self._sample_weight = sample_weight + self._sample_weight_mode = sample_weight_mode + + @property + def output(self): + return self._output + + @property + def output_name(self): + return self._output_name + + @property + def shape(self): + return backend.int_shape(self.output) + + @property + def loss_fn(self): + return self._loss_fn + + @property + def loss_weight(self): + return self._loss_weight + + @loss_weight.setter + def loss_weight(self, value): + self._loss_weight = value + + @property + def training_target(self): + return self._training_target + + @training_target.setter + def training_target(self, value): + self._training_target = value + + def create_training_target(self, target, run_eagerly=False): + """Create training_target instance and update the self.training_target. + + Note that the input target should just be a tensor or None, and + corresponding training target will be created based on the output and + loss_fn. + + Args: + target: the target tensor for the current output. Could be None. + run_eagerly: boolean, whether the model is in run_eagerly mode. + + Raises: + ValueError if the training_target field for the current instance has + already been populated. + """ + if self.has_training_target(): + raise ValueError( + "The training_target field for the _TrainingEndpoint " + "instance has already been populated" + ) + if run_eagerly: + # When run_eagerly, the target tensor is ignored, and the None + # placeholder is created instead. 
+ self.training_target = _TrainingTarget( + None, feedable=True, skip_target_weights=False + ) + return + + if self.should_skip_target(): + self.training_target = _TrainingTarget(None) + else: + if target is not None and not backend.is_placeholder(target): + feedable = False + skip_target_weights = True else: - # Update dimensions of weights to match with mask if possible. - mask, _, sample_weight = ( - losses_utils.squeeze_or_expand_dimensions( - mask, sample_weight=sample_weight)) - sample_weight *= mask - - if hasattr(loss_fn, 'reduction'): - per_sample_losses = loss_fn.call(y_true, y_pred) - weighted_losses = losses_utils.compute_weighted_loss( - per_sample_losses, - sample_weight=sample_weight, - reduction=losses_utils.ReductionV2.NONE) - loss_reduction = loss_fn.reduction - - # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all - # compile use cases. - if loss_reduction == losses_utils.ReductionV2.AUTO: - loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE - - # Compute the stateless loss value. - output_loss = losses_utils.reduce_weighted_loss( - weighted_losses, reduction=loss_reduction) - else: - # Compute the stateless loss value for a custom loss class. - # Here we assume that the class takes care of loss reduction - # because if this class returns a vector value we cannot - # differentiate between use case where a custom optimizer - # expects a vector loss value vs unreduced per-sample loss value. - output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight) - loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE - - if len(self.outputs) > 1: - # Keep track of stateful result tensor for the loss. - endpoint.output_loss_metric(output_loss) - - # Scale output loss for distribution. For custom losses we assume - # reduction was mean. - if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: - output_loss = losses_utils.scale_loss_for_distribution(output_loss) - - loss_list.append(loss_weight * output_loss) - if not loss_list and not self.losses: - raise ValueError('The model cannot be compiled ' - 'because it has no loss to optimize.') - - # Add regularization penalties and other layer-specific losses. - custom_losses = self.get_losses_for(None) + self.get_losses_for( - self.inputs) - if custom_losses: - total_custom_loss = tf.add_n( - losses_utils.cast_losses_to_common_dtype(custom_losses)) - loss_list.append( - losses_utils.scale_loss_for_distribution(total_custom_loss)) - - loss_list = losses_utils.cast_losses_to_common_dtype(loss_list) - if loss_list: - total_loss = tf.add_n(loss_list) - else: - total_loss = 0. - return total_loss - - def _get_callback_model(self): - """Returns the Callback Model for this Model.""" - - if hasattr(self, '_replicated_model') and self._replicated_model: - # When using training_distributed, we set the callback model - # to an instance of the `DistributedModel` that we create in - # the `compile` call. The `DistributedModel` is initialized - # with the first replicated model. We need to set the callback - # model to a DistributedModel to allow us to override saving - # and loading weights when we checkpoint the model during training. 
- return self._replicated_model - if hasattr(self, 'callback_model') and self.callback_model: - return self.callback_model - return self - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _make_callback_model(self, grouped_model): - first_replicated_model = self._distribution_strategy.unwrap( - grouped_model)[0] - # We initialize the callback model with the first replicated model. - self._replicated_model = DistributedCallbackModel(first_replicated_model) - self._replicated_model.set_original_model(self) - - def _validate_or_infer_batch_size(self, batch_size, steps, x): - """Validates that the `batch_size` provided is consistent with InputLayer. - - It's possible that the user specified a static batch size in their - InputLayer. If so, this method checks the provided `batch_size` and `x` - arguments are consistent with this static batch size. Also, if - `batch_size` is `None`, this method will attempt to infer the batch size - from the static batch size of the InputLayer. Lastly, ValueError will be - raised if `x` is a tf.data.Dataset and `batch_size` is specified as we - expect users to provide batched datasets. - - Args: - batch_size: The batch_size provided as an argument to - fit/evaluate/predict. - steps: The steps provided as an argument to fit/evaluate/predict. - x: The data passed as `x` to fit/evaluate/predict. - - Returns: - The validated batch_size, auto-inferred from the first layer if not - provided. - """ - if (isinstance(x, (tf.compat.v1.data.Dataset, - tf.data.Dataset, - data_utils.Sequence)) or - tf_inspect.isgenerator(x)): - if batch_size is not None: - raise ValueError( - 'The `batch_size` argument must not be specified for the given ' - 'input type. Received input: {}, batch_size: {}'.format( - x, batch_size)) - return - - # Avoids the override in Sequential.layers which filters Input layers. - # (Which are often the very layers that we're after.) - layers = self._flatten_layers(include_self=False, recursive=False) - first_layer = next(layers, None) - if first_layer: - # The per-replica static batch size. - static_batch_size = training_utils.get_static_batch_size(first_layer) - if static_batch_size is not None: - - # Determine number of times the user-supplied batch size will be split. 
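The removed checks below encode simple arithmetic worth spelling out (illustrative numbers):

    # A global batch is split evenly across replicas in sync; the
    # per-replica value is what must agree with any static batch size
    # declared on the InputLayer.
    global_batch_size = 64
    num_replicas_in_sync = 8
    assert global_batch_size % num_replicas_in_sync == 0
    per_replica_batch_size = global_batch_size // num_replicas_in_sync  # 8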
- if (self._distribution_strategy and - distributed_training_utils.global_batch_size_supported( - self._distribution_strategy)): - num_splits_for_ds = self._distribution_strategy.num_replicas_in_sync + feedable = True + skip_target_weights = False + + if target is None: + target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get( + self.loss_fn, backend.dtype(self.output) + ) + + target = backend.placeholder( + ndim=len(self.shape), + name=self.output_name + "_target", + sparse=backend.is_sparse(self.output), + dtype=target_dtype, + ) + + self.training_target = _TrainingTarget( + target, + feedable=feedable, + skip_target_weights=skip_target_weights, + ) + + @property + def output_loss_metric(self): + return self._output_loss_metric + + @output_loss_metric.setter + def output_loss_metric(self, value): + self._output_loss_metric = value + + @property + def sample_weight(self): + return self._sample_weight + + @sample_weight.setter + def sample_weight(self, value): + self._sample_weight = value + + @property + def sample_weight_mode(self): + return self._sample_weight_mode + + @sample_weight_mode.setter + def sample_weight_mode(self, value): + self._sample_weight_mode = value + + def should_skip_target(self): + return self._loss_fn is None + + def should_skip_target_weights(self): + return ( + self.should_skip_target() + or self.training_target is None + or self.training_target.skip_target_weights + ) + + def has_training_target(self): + return self.training_target is not None + + def has_feedable_training_target(self): + return ( + not self.should_skip_target() + and self.training_target is not None + and self.training_target.feedable + ) + + def loss_name(self): + if self._loss_fn is not None: + return self._output_name + "_loss" + return None + + @property + def feed_output_shape(self): + """The output shape for the feedable target.""" + if not self.has_feedable_training_target(): + return None + + if ( + ( + isinstance(self.loss_fn, losses.LossFunctionWrapper) + and self.loss_fn.fn == losses.sparse_categorical_crossentropy + ) + ) or (isinstance(self.loss_fn, losses.SparseCategoricalCrossentropy)): + if backend.image_data_format() == "channels_first": + return (self.shape[0], 1) + self.shape[2:] + else: + return self.shape[:-1] + (1,) + elif not isinstance(self.loss_fn, losses.Loss) or ( + isinstance(self.loss_fn, losses.LossFunctionWrapper) + and (getattr(losses, self.loss_fn.fn.__name__, None) is None) + ): + # If the given loss is not an instance of the `Loss` class (custom + # class) or if the loss function that is wrapped is not in the + # `losses` module, then it is a user-defined loss and we make no + # assumptions about it. + return None else: - num_splits_for_ds = 1 - - # Check `batch_size` argument is consistent with InputLayer. - if batch_size is not None: - if batch_size % num_splits_for_ds != 0: - raise ValueError('The `batch_size` argument ({}) must be divisible ' - 'the by number of replicas ({})'.format( - batch_size, num_splits_for_ds)) - per_replica_batch_size = batch_size // num_splits_for_ds - - if per_replica_batch_size != static_batch_size: - raise ValueError('The `batch_size` argument value {} is ' - 'incompatible with the specified batch size of ' - 'your Input Layer: {}'.format( - per_replica_batch_size, static_batch_size)) - - # Check Dataset/Iterator batch size is consistent with InputLayer. 
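The auto-created target placeholder above picks its dtype from the loss via `losses.LABEL_DTYPES_FOR_LOSSES` (e.g. integer labels for sparse categorical crossentropy) and mirrors the output's sparseness. A rough sketch of the same idea using the public backend API, in v1 graph mode:

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()
    backend = tf.keras.backend

    # For a softmax output trained with sparse categorical crossentropy,
    # the target placeholder is integer-typed rather than float.
    target = backend.placeholder(ndim=2, dtype="int64", name="out_target")
    print(target.dtype)  # <dtype: 'int64'>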
- if isinstance(x, (tf.data.Dataset, tf.compat.v1.data.Iterator, - tf.data.Iterator)): - ds_batch_size = tf.compat.v1.Dimension( - tf.nest.flatten(tf.compat.v1.data.get_output_shapes(x))[0][0]).value - if ds_batch_size is not None: - if ds_batch_size % num_splits_for_ds != 0: - raise ValueError( - 'The batch output shape of your `Dataset` {} ' - 'cannot be divisible by number of replicas {}'.format( - ds_batch_size, num_splits_for_ds)) - - ds_per_replica_batch_size = ds_batch_size // num_splits_for_ds - if ds_per_replica_batch_size != static_batch_size: - raise ValueError('The batch output shape of your `Dataset` is ' - '{}, which is incompatible with the specified ' - 'batch size of your Input Layer: {}'.format( - ds_per_replica_batch_size, - static_batch_size)) - - # Set inferred batch size from the InputLayer. - if steps is None: - batch_size = static_batch_size * num_splits_for_ds - - if batch_size is None and steps is None: - # Backwards compatibility - batch_size = 32 - return batch_size - - def _prepare_sample_weights(self, sample_weights=None): - """Sets sample weight attribute on the model.""" - # List with the same length as model outputs. - if sample_weights is not None: - if len(sample_weights) != len(self._training_endpoints): - raise ValueError('Provided sample weights must have same length as the ' - 'number of outputs. Expected: {}, got: {}.'.format( - len(self._training_endpoints), - len(sample_weights))) - else: - sample_weights = [None] * len(self._training_endpoints) - for endpoint, weight in zip(self._training_endpoints, sample_weights): - endpoint.populate_sample_weight(weight, endpoint.sample_weight_mode) - - def _cache_output_metric_attributes(self, metrics, weighted_metrics): - """Caches metric name and function attributes for every model output.""" - output_shapes = [] - for output in self.outputs: - if output is None or output.shape.rank is None: - output_shapes.append(None) - else: - output_shapes.append(output.shape.as_list()) - self._per_output_metrics = training_utils_v1.collect_per_output_metric_info( - metrics, self.output_names, output_shapes, self.loss_functions, - from_serialized=self._from_serialized) - self._per_output_weighted_metrics = ( - training_utils_v1.collect_per_output_metric_info( - weighted_metrics, - self.output_names, - output_shapes, - self.loss_functions, - from_serialized=self._from_serialized, - is_weighted=True)) - - def _add_unique_metric_name(self, metric_name, metric_fn, output_index): - """Makes the metric name unique. - - If there are multiple outputs for which the metrics are calculated, the - metric names have to be made unique by appending an integer. + return self.shape + + def sample_weights_mismatch(self): + """Check if the sample weight and the mode match or not.""" + # If there is a mismatch between sample weight mode and the placeholders + # created, then recompile the sub-graphs that depend on sample weights. 
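Spelled out, the condition implemented just below is an exclusive-or over "a mode is set" and "a weight placeholder exists" (illustrative restatement):

    def mismatch(sample_weight_mode, sample_weight):
        # Recompile is needed when exactly one of the two is set.
        return (sample_weight_mode is not None and sample_weight is None) or (
            sample_weight_mode is None and sample_weight is not None
        )

    assert mismatch("temporal", None)   # mode set, placeholder missing
    assert not mismatch(None, None)     # neither set: nothing to rebuild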
+ return ( + self.sample_weight_mode is not None and self.sample_weight is None + ) or ( + self.sample_weight_mode is None and self.sample_weight is not None + ) + + def populate_sample_weight(self, sample_weight, sample_weight_mode): + """Populate the sample weight and based on the sample weight mode.""" + if sample_weight is None and ( + self.should_skip_target_weights() + or sample_weight_mode is None + or tf.executing_eagerly() + ): + self._sample_weight = None + return + + assert sample_weight_mode in ["temporal", "samplewise"] + if sample_weight_mode == "temporal": + default_value = [[1.0]] + shape = [None, None] + else: + # sample_weight_mode == 'samplewise' + default_value = [1.0] + shape = [None] + + if sample_weight is not None: + if not sample_weight.shape.is_compatible_with(shape): + raise ValueError( + "Received sample weight with shape {}. Expected shape " + "{}.".format(sample_weight.shape, shape) + ) + self._sample_weight = sample_weight + else: + self._sample_weight = tf.compat.v1.placeholder_with_default( + tf.constant(default_value, dtype=backend.floatx()), + shape=shape, + name=self.output_name + "_sample_weights", + ) - Args: - metric_name: Metric name that corresponds to the metric specified by the - user. For example: 'acc'. - metric_fn: The Metric object. - output_index: The index of the model output for which the metric name is - being added. - Returns: - string, name of the model's unique metric name - """ - # For multi-output models, prepend the output names to the metric name. - if len(self.output_names) > 1: - # If we're loading from an already-serialized model, we've already - # prepended the output name, and we don't want to do it again. - # - # Alternatively, we may be receiving a stateless metric (e.g. the string - # "accuracy") rather than a `Metric` object, in which case we want to - # prepend the output name even if we are loading a serialized model. - if not getattr(metric_fn, '_from_serialized', False): - metric_name = '%s_%s' % (self.output_names[output_index], metric_name) - - j = 1 - base_metric_name = metric_name - while metric_name in self.metrics_names: - metric_name = '%s_%d' % (base_metric_name, j) - j += 1 - - return metric_name - - def _init_metric_attributes(self): - """Initialized model metric attributes.""" - # List of stateful metric functions. Used for resetting metric state during - # training/eval. - self._compile_metric_functions = [] - - def _set_per_output_metric_attributes(self, metrics_dict, output_index): - """Sets the metric attributes on the model for the given output. +class _TrainingTarget: + """Container for a target tensor (y_true) and its metadata (shape, loss...). Args: - metrics_dict: A dict with metric names as keys and metric fns as values. - output_index: The index of the model output for which the metric - attributes are added. - - Returns: - Metrics dict updated with unique metric names as keys. + target: A target tensor for the model. It may be `None` if the + output is excluded from loss computation. It is still kept as None + since each output of the model should have a corresponding target. If + the target is None, the rest of the attributes will be None as well. + feedable: Boolean, whether the target is feedable (requires data to be + passed in `fit` or `train_on_batch`), or not (model compiled with + `target_tensors` argument). + skip_target_weights: Boolean, whether the target should be skipped during + weights calculation. 
""" - updated_metrics_dict = collections.OrderedDict() - for metric_name, metric_fn in metrics_dict.items(): - metric_name = self._add_unique_metric_name( - metric_name, metric_fn, output_index) - - # Update the name on the metric class to be the unique generated name. - metric_fn._name = metric_name # pylint: disable=protected-access - updated_metrics_dict[metric_name] = metric_fn - # Keep track of metric name and function. - self._compile_metric_functions.append(metric_fn) - return updated_metrics_dict - - def _set_metric_attributes(self): - """Sets the metric attributes on the model for all the model outputs.""" - updated_per_output_metrics = [] - updated_per_output_weighted_metrics = [] - for i, endpoint in enumerate(self._training_endpoints): - if endpoint.should_skip_target(): - updated_per_output_metrics.append(self._per_output_metrics[i]) - updated_per_output_weighted_metrics.append( - self._per_output_weighted_metrics[i]) - continue - updated_per_output_metrics.append( - self._set_per_output_metric_attributes(self._per_output_metrics[i], - i)) - updated_per_output_weighted_metrics.append( - self._set_per_output_metric_attributes( - self._per_output_weighted_metrics[i], i)) - - # Create a metric wrapper for each output loss. This computes mean of an - # output loss across mini-batches (irrespective of how we reduce within a - # batch). - if len(self._training_endpoints) > 1: - for endpoint in self._training_endpoints: - if not endpoint.should_skip_target(): - endpoint.output_loss_metric = metrics_module.Mean( - name=endpoint.loss_name()) - - self._per_output_metrics = updated_per_output_metrics - self._per_output_weighted_metrics = updated_per_output_weighted_metrics - - def _handle_per_output_metrics(self, - metrics_dict, - y_true, - y_pred, - mask, - weights=None): - """Calls metric functions for a single output. - - Args: - metrics_dict: A dict with metric names as keys and metric fns as values. - y_true: Target output. - y_pred: Predicted output. - mask: Computed mask value for the current output. - weights: Weights to be applied on the current output. - Returns: - A list of metric result tensors. - """ - metric_results = [] - for metric_name, metric_fn in metrics_dict.items(): - with backend.name_scope(metric_name): - metric_result = training_utils_v1.call_metric_function( - metric_fn, y_true, y_pred, weights=weights, mask=mask) - metric_results.append(metric_result) - return metric_results - - def _handle_metrics(self, - outputs, - targets=None, - skip_target_masks=None, - sample_weights=None, - masks=None, - return_weighted_metrics=False, - return_weighted_and_unweighted_metrics=False): - """Handles calling metric functions. + def __init__(self, target, feedable=False, skip_target_weights=True): + self._target = target + self._feedable = feedable + self._skip_target_weights = skip_target_weights - Args: - outputs: List of outputs (predictions). - targets: List of targets. - skip_target_masks: Optional. List of boolean for whether the corresponding - target should be ignored or not. - sample_weights: Optional list of sample weight arrays. - masks: List of computed output mask values. - return_weighted_metrics: Flag that indicates whether weighted metrics - should be computed instead of unweighted metrics. This flag is ignored - when `return_weighted_and_unweighted_metrics` is enabled. - return_weighted_and_unweighted_metrics: Flag that is used to indicate - whether both weighted and unweighted metrics should be computed. 
When - this is not enabled, we use `return_weighted_metrics` param to indicate - whether weighted or unweighted metrics should be returned. + @property + def target(self): + return self._target - Returns: - A list of metric result tensors. - """ - # TODO(scottzhu): Update this to use the new training_endpoints. Currently - # the eager and graph logic is bit different. - skip_target_masks = skip_target_masks or [False] * len(outputs) - metric_results = [] - with backend.name_scope('metrics'): - # Invoke all metrics added using `compile`. - for i in range(len(outputs)): - if skip_target_masks[i]: - continue - output = outputs[i] if outputs else None - target = targets[i] if targets else None - output_mask = masks[i] if masks else None - - if (return_weighted_and_unweighted_metrics or - not return_weighted_metrics): - metric_results.extend( - self._handle_per_output_metrics(self._per_output_metrics[i], - target, output, output_mask)) - if return_weighted_and_unweighted_metrics or return_weighted_metrics: - metric_results.extend( - self._handle_per_output_metrics( - self._per_output_weighted_metrics[i], - target, - output, - output_mask, - weights=sample_weights[i] if sample_weights else None)) - return metric_results - - def _check_trainable_weights_consistency(self): - """Check trainable weights count consistency. - - This will raise a warning if `trainable_weights` and - `_collected_trainable_weights` are inconsistent (i.e. have different - number of parameters). - Inconsistency will typically arise when one modifies `model.trainable` - without calling `model.compile` again. - """ - if not hasattr(self, '_collected_trainable_weights'): - return - - if len(self.trainable_weights) != len(self._collected_trainable_weights): - logging.log_first_n( - logging.WARN, 'Discrepancy between trainable weights and collected' - ' trainable weights, did you set `model.trainable`' - ' without calling `model.compile` after ?', 1) - - def _make_train_function(self): - has_recompiled = self._recompile_weights_loss_and_weighted_metrics() - self._check_trainable_weights_consistency() - if isinstance(self.optimizer, list): - raise ValueError('The `optimizer` in `compile` should be a single ' - 'optimizer.') - # If we have re-compiled the loss/weighted metric sub-graphs then create - # train function even if one exists already. This is because - # `_feed_sample_weights` list has been updated on re-compile. - if getattr(self, 'train_function', None) is None or has_recompiled: - # Restore the compiled trainable state. 
- current_trainable_state = self._get_trainable_state() - self._set_trainable_state(self._compiled_trainable_state) - - inputs = (self._feed_inputs + - self._feed_targets + - self._feed_sample_weights) - if not isinstance(backend.symbolic_learning_phase(), int): - inputs += [backend.symbolic_learning_phase()] - - with backend.get_graph().as_default(): - with backend.name_scope('training'): - # Training updates - updates = self.optimizer.get_updates( - params=self._collected_trainable_weights, loss=self.total_loss) - # Unconditional updates - updates += self.get_updates_for(None) - # Conditional updates relevant to this model - updates += self.get_updates_for(self.inputs) + @property + def feedable(self): + return self._feedable - metrics = self._get_training_eval_metrics() - metrics_tensors = [ - m._call_result for m in metrics if hasattr(m, '_call_result') # pylint: disable=protected-access - ] + @property + def skip_target_weights(self): + return self._skip_target_weights - with backend.name_scope('training'): - # Gets loss and metrics. Updates weights at each call. - fn = backend.function( - inputs, [self.total_loss] + metrics_tensors, - updates=updates, - name='train_function', - **self._function_kwargs) - setattr(self, 'train_function', fn) - - # Restore the current trainable state - self._set_trainable_state(current_trainable_state) - - def _make_test_function(self): - has_recompiled = self._recompile_weights_loss_and_weighted_metrics() - # If we have re-compiled the loss/weighted metric sub-graphs then create - # test function even if one exists already. This is because - # `_feed_sample_weights` list has been updated on re-compile. - if getattr(self, 'test_function', None) is None or has_recompiled: - inputs = (self._feed_inputs + - self._feed_targets + - self._feed_sample_weights) - - with backend.get_graph().as_default(): - metrics = self._get_training_eval_metrics() - metrics_tensors = [ - m._call_result for m in metrics if hasattr(m, '_call_result') # pylint: disable=protected-access - ] - with backend.name_scope('evaluation'): - updates = self.state_updates - # Return loss and metrics, no gradient updates. - # Does update the network states. - fn = backend.function( - inputs, [self.total_loss] + metrics_tensors, - updates=updates, - name='test_function', - **self._function_kwargs) - setattr(self, 'test_function', fn) - - def _make_predict_function(self): - if not hasattr(self, 'predict_function'): - self.predict_function = None - if self.predict_function is None: - inputs = self._feed_inputs - # Gets network outputs. Does not update weights. - # Does update the network states. - kwargs = getattr(self, '_function_kwargs', {}) - with backend.name_scope(ModeKeys.PREDICT): - self.predict_function = backend.function( - inputs, - self.outputs, - updates=self.state_updates, - name='predict_function', - **kwargs) - - def _make_execution_function(self, mode): - if mode == ModeKeys.TRAIN: - self._make_train_function() - return self.train_function - if mode == ModeKeys.TEST: - self._make_test_function() - return self.test_function - if mode == ModeKeys.PREDICT: - self._make_predict_function() - return self.predict_function - - def _distribution_standardize_user_data(self, - x, - y=None, - sample_weight=None, - class_weight=None, - batch_size=None, - validation_split=0., - shuffle=False, - epochs=1, - allow_partial_batch=False): - """Runs validation checks on input and target data passed by the user. 
- - This is called when using tf.distribute.Strategy to train, evaluate or serve - the model. +def _is_symbolic_tensor(x): + return tf.is_tensor(x) - Args: - x: Input data. A numpy array or `tf.data` dataset. - y: Target data. A numpy array or None if x is a `tf.data` dataset. - sample_weight: An optional sample-weight array passed by the user to - weight the importance of each sample in `x`. - class_weight: An optional class-weight array by the user to - weight the importance of samples in `x` based on the class they belong - to, as conveyed by `y`. - batch_size: Integer batch size. If provided, it is used to run additional - validation checks on stateful models. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - shuffle: Boolean whether to shuffle the training data before each epoch. - epochs: Integer epochs. If > 1, repeat the numpy training data epochs - times when converting to training dataset. - allow_partial_batch: Boolean whether to enforce that all batches have the - same size. - Returns: - Dataset instance. +def _convert_scipy_sparse_tensor(value, expected_input): + """Handle scipy sparse tensor conversions. - Raises: - ValueError: In case of invalid user-provided data. - RuntimeError: If the model was never compiled. - """ - if class_weight: - raise NotImplementedError('`class_weight` is currently not supported ' - 'when using tf.distribute.Strategy.') - - if (sample_weight is not None and sample_weight.all() and - backend.is_tpu_strategy(self._distribution_strategy)): - raise NotImplementedError('`sample_weight` is currently not supported ' - 'when using TPUStrategy.') - - # Validates `steps` and `shuffle` arguments right at the beginning - # since we use it to construct the dataset object. - # TODO(anjalisridhar): Remove this check once we refactor the - # _standardize_user_data code path. This check is already present elsewhere - # in the codebase. - if isinstance(x, tf.data.Dataset): - if shuffle: - training_utils_v1.verify_dataset_shuffled(x) - - strategy = self._distribution_strategy - with strategy.scope(): - # We should be sure to call get_session() inside the strategy.scope() - # so the strategy can affect the session options. - if tf.compat.v1.executing_eagerly_outside_functions(): - session = None - else: - session = backend.get_session() - - first_x_value = tf.nest.flatten(x)[0] - if isinstance(first_x_value, np.ndarray): - x = training_utils.list_to_tuple(x) - if y is not None: - y = training_utils.list_to_tuple(y) - if sample_weight is not None: - sample_weight = training_utils.list_to_tuple(sample_weight) - in_tuple = (x, y, sample_weight) - else: - in_tuple = (x, y) - else: - in_tuple = x - - ds = strategy.extended.experimental_make_numpy_dataset(in_tuple, - session=session) - if shuffle: - # We want a buffer size that is larger than the batch size provided by - # the user and provides sufficient randomness. Note that larger - # numbers introduce more memory usage based on the size of each - # sample. - ds = ds.shuffle(max(1024, batch_size * 8)) - if epochs > 1: - ds = ds.repeat(epochs) - - # We need to use the drop_remainder argument to get a known static - # input shape which is required for TPUs. - drop_remainder = (not allow_partial_batch and - strategy.extended.experimental_require_static_shapes) - - # TODO(b/131720208): We still drop remainder here if number of examples - # is divisible by batch size, as sometimes dynamic padder will time out - # with keras.metrics.CategoricalAccuracy() metric. 
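For reference, the `drop_remainder` behaviour relied on here comes straight from `tf.data`: with it, every batch has a fully static shape (which TPUs require), at the cost of discarding the final partial batch:

    import tensorflow as tf

    ds = tf.data.Dataset.range(10).batch(4, drop_remainder=True)
    for batch in ds:
        print(batch.numpy())  # [0 1 2 3], then [4 5 6 7]; the last two
                              # elements are dropped to keep shapes static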
- if backend.is_tpu_strategy(strategy) and not drop_remainder: - dataset_size = first_x_value.shape[0] - if dataset_size % batch_size == 0: - drop_remainder = True - - x = ds.batch(batch_size, drop_remainder=drop_remainder) - else: - assert isinstance(x, tf.data.Dataset) - training_utils_v1.validate_dataset_input(x, y, sample_weight, - validation_split) - return x - - def _standardize_user_data(self, - x, - y=None, - sample_weight=None, - class_weight=None, - batch_size=None, - check_steps=False, - steps_name='steps', - steps=None, - validation_split=0., - shuffle=False, - extract_tensors_from_dataset=False): - """Runs validation checks on input and target data passed by the user. - - Also standardizes the data to lists of arrays, in order. - - Also builds and compiles the model on the fly if it is a subclassed model - that has never been called before (and thus has no inputs/outputs). - - This is a purely internal method, subject to refactoring at any time. + This method takes a value 'value' and returns the proper conversion. If + value is a scipy sparse tensor and the expected input is a dense tensor, + we densify 'value'. If value is a scipy sparse tensor and the expected input + is a TF SparseTensor, we convert 'value' to a SparseTensor. If 'value' is + not a scipy sparse tensor, or scipy is not imported, we pass it through + unchanged. Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset, `y` should not be - specified (since targets will be obtained from the iterator). - sample_weight: An optional sample-weight array passed by the user to - weight the importance of each sample in `x`. - class_weight: An optional class-weight array by the user to - weight the importance of samples in `x` based on the class they belong - to, as conveyed by `y`. If both `sample_weight` and `class_weight` are - provided, the weights are multiplied. - batch_size: Integer batch size. If provided, it is used to run additional - validation checks on stateful models. - check_steps: boolean, True if we want to check for validity of `steps` and - False, otherwise. For example, when we are standardizing one batch of - data for train_on_batch/predict_on_batch/test_on_batch APIs, `steps` - value is not required and we should not check for its validity in these - cases. - steps_name: The public API's parameter name for `steps`. - steps: Integer or `None`. Total number of steps (batches of samples) to - execute. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - shuffle: Boolean whether to shuffle the training data before each epoch. - extract_tensors_from_dataset: Boolean. When `x` is a dataset instance, - this indicates whether to extract actual tensors from the dataset or - instead output the dataset instance itself. - Set to True when calling from `train_on_batch`/etc. + value: An object that may be a scipy sparse tensor + expected_input: The expected input placeholder. 
     Returns:
-      A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a
-      dict or not), target arrays, sample-weight arrays.
-      If the model's input and targets are symbolic, these lists are empty
-      (since the model takes no user-provided data, instead the data comes
-      from the symbolic inputs/targets).
-
-    Raises:
-      ValueError: In case of invalid user-provided data.
-      RuntimeError: If the model was never compiled.
+        The possibly-converted 'value'.
     """
-    if isinstance(x, (tf.compat.v1.data.Dataset, tf.data.Dataset)):
-      # Graph mode dataset. We'll pass the dataset as-is (unless
-      # `extract_tensors_from_dataset` is True, in which case we extract
-      # the tensors from the dataset and output them).
-      training_utils_v1.validate_dataset_input(x, y, sample_weight,
-                                               validation_split)
-      if shuffle:
-        training_utils_v1.verify_dataset_shuffled(x)
-
-      is_dataset = True
-      if extract_tensors_from_dataset:
-        # We do this for `train_on_batch`/etc.
-        x, y, sample_weight = training_utils_v1.extract_tensors_from_dataset(x)
-    elif isinstance(x, tf.compat.v1.data.Iterator):
-      # Graph mode iterator. We extract the symbolic tensors.
-      training_utils_v1.validate_dataset_input(x, y, sample_weight,
-                                               validation_split)
-      iterator = x
-      x, y, sample_weight = training_utils_v1.unpack_iterator_input(iterator)
-      is_dataset = True
+    if issparse is not None and issparse(value):
+        if backend.is_sparse(expected_input):
+            sparse_coo = value.tocoo()
+            row, col = sparse_coo.row, sparse_coo.col
+            data, shape = sparse_coo.data, sparse_coo.shape
+            indices = np.concatenate(
+                (np.expand_dims(row, 1), np.expand_dims(col, 1)), 1
+            )
+            return tf.SparseTensor(indices, data, shape)
+        else:
+            if tf.compat.v1.executing_eagerly_outside_functions():
+                # In TF2 we do not silently densify sparse matrices.
+                raise ValueError(
+                    "A SciPy sparse matrix was passed to a model "
+                    "that expects dense inputs. Please densify your "
+                    "inputs first, such as by calling `x.toarray()`."
+                )
+            return value.toarray()
     else:
-      is_dataset = False
+        return value
 
-    # Validates `steps` argument based on x's type.
-    if check_steps:
-      training_utils_v1.check_steps_argument(x, steps, steps_name)
-    # First, we build the model on the fly if necessary.
-    if not self.inputs:
-      all_inputs, y_input, dict_inputs = self._build_model_with_inputs(x, y)
-      is_build_called = True
-    else:
-      all_inputs = []
-      # Whether this is a subclassed model that expects dictionary inputs
-      # rather than list inputs (e.g. FeatureColumn-based models).
-      dict_inputs = isinstance(self.inputs, dict)
-      is_build_called = False
-      y_input = y
-
-    # Second, we compile the model on the fly if necessary, mostly for
-    # subclass models.
-    is_compile_called = False
-    if not self._is_compiled and self.optimizer:
-      self._compile_from_inputs(all_inputs, y_input, x, y)
-      is_compile_called = True
-
-    # In graph mode, if we had just set inputs and targets as symbolic tensors
-    # by invoking build and compile on the model respectively, we do not have
-    # to feed anything to the model. The model already has input and target
-    # data as part of the graph.
-    # Note: in this case, `any` and `all` are equivalent since we disallow
-    # mixed symbolic/value inputs.
-
-    # self.run_eagerly is not free to compute, so we want to reuse the value.
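For reference, a minimal sketch of the COO-to-`tf.SparseTensor` path that the new `_convert_scipy_sparse_tensor` above takes; the input matrix is invented for the demo:

```python
import numpy as np
import tensorflow as tf
from scipy.sparse import csr_matrix

value = csr_matrix(np.array([[0.0, 2.0], [3.0, 0.0]]))
coo = value.tocoo()
# Stack (row, col) pairs into an (nnz, 2) index matrix, as the function does.
indices = np.concatenate(
    (np.expand_dims(coo.row, 1), np.expand_dims(coo.col, 1)), 1
)
st = tf.SparseTensor(indices, coo.data, coo.shape)
print(tf.sparse.to_dense(st).numpy())  # [[0. 2.], [3. 0.]]
```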
- run_eagerly = self.run_eagerly - - if (not run_eagerly and is_build_called and is_compile_called and - not is_dataset and any(_is_symbolic_tensor(v) for v in all_inputs)): - return [], [], None - - return self._standardize_tensors( - x, y, sample_weight, - run_eagerly=run_eagerly, - dict_inputs=dict_inputs, - is_dataset=is_dataset, - class_weight=class_weight, - batch_size=batch_size) - - def _standardize_tensors(self, x, y, sample_weight, run_eagerly, dict_inputs, - is_dataset, class_weight=None, batch_size=None): - if run_eagerly: - # In eager mode, do not do shape validation - # since the network has no input nodes (placeholders) to be fed. - feed_input_names = self.input_names - feed_input_shapes = None - elif not self._is_graph_network: - # Case: symbolic-mode subclassed network. Do not do shape validation. - feed_input_names = self._feed_input_names - feed_input_shapes = None - else: - # Case: symbolic-mode graph network. - # In this case, we run extensive shape validation checks. - feed_input_names = self._feed_input_names - feed_input_shapes = self._feed_input_shapes - - # Standardize the inputs. - if not isinstance(x, (tf.compat.v1.data.Dataset, tf.data.Dataset)): - # TODO(fchollet): run static checks with dataset output shape(s). - x = training_utils_v1.standardize_input_data( - x, - feed_input_names, - feed_input_shapes, - check_batch_axis=False, # Don't enforce the batch size. - exception_prefix='input') - - # Get typespecs for the input data and sanitize it if necessary. - # TODO(momernick): This should be capable of doing full input validation - # at all times - validate that this is so and refactor the standardization - # code. - if isinstance(x, tf.data.Dataset): - x_shapes = tf.data.experimental.get_structure(x) - if isinstance(x_shapes, tuple): - # If the output of a Dataset is a tuple, we assume it's either of the - # form (x_data, y_data) or (x_data, y_data, sample_weights). In either - # case, we only care about x_data here. - x_shapes = x_shapes[0] - else: - flat_inputs = tf.nest.flatten(x, expand_composites=False) - flat_expected_inputs = tf.nest.flatten(self.inputs, expand_composites=False) - converted_x = [] - for (a, b) in zip(flat_inputs, flat_expected_inputs): - converted_x.append(_convert_scipy_sparse_tensor(a, b)) - x = tf.nest.pack_sequence_as(x, converted_x, expand_composites=False) - - def _type_spec_from_value(value): - """Grab type_spec without converting array-likes to tensors.""" - if tf_utils.is_extension_type(value): - return value._type_spec # pylint: disable=protected-access - # Get a TensorSpec for array-like data without - # converting the data to a Tensor - if hasattr(value, 'shape') and hasattr(value, 'dtype'): - return tf.TensorSpec(value.shape, value.dtype) - else: - return tf.type_spec_from_value(value) - - x_shapes = tf.nest.map_structure(_type_spec_from_value, x) - - flat_inputs = tf.nest.flatten(x_shapes, expand_composites=False) - flat_expected_inputs = tf.nest.flatten(self.inputs, expand_composites=False) - for (a, b) in zip(flat_inputs, flat_expected_inputs): - tf.nest.assert_same_structure(a, b, expand_composites=True) - - if y is not None: - # Prepare self._sample_weight_modes. List with the same length as - # model outputs. 
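The `_type_spec_from_value` helper deleted above avoids materializing tensors when all it needs is a spec. A rough illustration with standard TensorFlow APIs (the values are invented):

```python
import numpy as np
import tensorflow as tf

x = np.zeros((2, 3), dtype=np.float32)
# Array-likes expose shape/dtype, so a TensorSpec can be built directly
# without converting the data to a Tensor first.
print(tf.TensorSpec(x.shape, x.dtype))
# Composite and other values fall back to tf.type_spec_from_value.
print(tf.type_spec_from_value(tf.ragged.constant([[1], [2, 3]])))
```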
- training_utils_v1.prepare_sample_weight_modes(self._training_endpoints, - self.sample_weight_mode) - feed_output_names = self._feed_output_names - feed_sample_weight_modes = self._sample_weight_modes - if not self._is_graph_network: - feed_output_shapes = None - else: - feed_output_shapes = self._feed_output_shapes - - # Standardize the outputs. - y = training_utils_v1.standardize_input_data( - y, - feed_output_names, - # Don't enforce target shapes to match output shapes. - # Precise checks will be run in `check_loss_and_target_compatibility`. - shapes=None, - check_batch_axis=False, # Don't enforce the batch size. - exception_prefix='target') - - # Generate sample-wise weight values given the `sample_weight` and - # `class_weight` arguments. - sample_weights = training_utils_v1.standardize_sample_weights( - sample_weight, feed_output_names) - class_weights = training_utils_v1.standardize_class_weights( - class_weight, feed_output_names) - - sample_weights = [ - training_utils_v1.standardize_weights(ref, sw, cw, mode) - for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights, - feed_sample_weight_modes) - ] - # Check that all arrays have the same length. - if not self._distribution_strategy: - training_utils_v1.check_array_lengths(x, y, sample_weights) - if self._is_graph_network and not run_eagerly: - # Additional checks to avoid users mistakenly using improper loss fns. - training_utils_v1.check_loss_and_target_compatibility( - y, self._feed_loss_fns, feed_output_shapes) - - sample_weights, _, _ = training_utils.handle_partial_sample_weights( - y, sample_weights, feed_sample_weight_modes, check_all_flat=True) - else: - y = [] - sample_weights = None - - if self.stateful and batch_size and not is_dataset: - # Check that for stateful networks, number of samples is a multiple - # of the static batch size. - if x[0].shape[0] % batch_size != 0: - raise ValueError('In a stateful network, ' - 'you should only pass inputs with ' - 'a number of samples that can be ' - 'divided by the batch size. Found: ' + - str(x[0].shape[0]) + ' samples') - - # If dictionary inputs were provided, we return a dictionary as well. - if dict_inputs and not isinstance(x, (tf.compat.v1.data.Dataset, - tf.data.Dataset)): - x = dict(zip(feed_input_names, x)) - return x, y, sample_weights - - def _build_model_with_inputs(self, inputs, targets): - """Build the model (set model inputs/outputs), mainly for subclass model.""" - processed_inputs = [] - is_dict_inputs = False - orig_inputs = inputs - # We need to use `inputs` to set the model inputs. - # If input data is a dataset iterator in graph mode or if it is an eager - # iterator and only one batch of samples is required, we fetch the data - # tensors from the iterator and then standardize them. - if isinstance(inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset)): - inputs, targets, _ = training_utils_v1.extract_tensors_from_dataset( - inputs) - # We type-check that `inputs` and `targets` are either single arrays - # or lists of arrays, and extract a flat list of inputs from the passed - # structure. 
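The stateful-model branch above boils down to a divisibility check. A toy restatement (the numbers are invented):

```python
num_samples, batch_size = 96, 32
# Stateful networks carry state across batches, so every batch must be full.
assert num_samples % batch_size == 0, (
    "In a stateful network, the number of samples must be divisible by "
    "the batch size."
)
```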
- training_utils_v1.validate_input_types(inputs, orig_inputs) - - if isinstance(inputs, (list, tuple)): - processed_inputs += list(inputs) - elif isinstance(inputs, dict): - is_dict_inputs = True - keys = sorted(inputs.keys()) - processed_inputs = [inputs[k] for k in keys] - else: - processed_inputs.append(inputs) - # Now that we have a flat set of inputs, we make sure that none of them - # are CompositeTensors or CompositeTensorValues of any type (or scipy - # sparse arrays, which we treat as SparseTensor values). We cannot safely - # infer input data from an arbitrary composite tensor, so we don't try - - # users should explicitly add composite tensor inputs to their subclassed - # models. - for input_tensor in processed_inputs: - if training_utils_v1.is_composite_or_composite_value(input_tensor): - # TODO(b/132691975): Document subclass-model CT input handling. - raise ValueError( - 'All SparseTensor and RaggedTensor inputs must be explicitly ' - 'declared using a keras.Input() with sparse=True or ragged=True. ' - 'We found an undeclared input %s. For Sequential models, please ' - 'add a keras.Input() as your first Layer. For subclassed models, ' - 'please call self._set_inputs() on your input set, which you can ' - 'create using keras.Input() for each input to your model.' % - (input_tensor,)) - # Build the model using the retrieved inputs (value or symbolic). - # If values are generated from a dataset, then in symbolic-mode - # placeholders will be created to match the value shapes. - if isinstance(orig_inputs, (tf.compat.v1.data.Dataset, tf.data.Dataset, - tf.compat.v1.data.Iterator)): - if not self.inputs: - # For subclassed models, a robust input spec is not available so we - # must cast to the model dtype. - inputs = training_utils_v1.cast_if_floating_dtype(inputs, self.dtype) - - def create_tensor_spec(t): - return tf.TensorSpec(t.shape, t.dtype) - - cast_inputs = tf.nest.map_structure(create_tensor_spec, inputs) - elif training_utils_v1.has_tensors(inputs): - cast_inputs = training_utils_v1.cast_if_floating_dtype(inputs) - else: - cast_inputs = inputs - self._set_inputs(cast_inputs) - return processed_inputs, targets, is_dict_inputs - - def _compile_from_inputs(self, all_inputs, target, orig_inputs, orig_target): - if target is not None: - # We need to use `y` to set the model targets. - if training_utils_v1.has_tensors(target): - target = training_utils_v1.cast_if_floating_dtype_and_mismatch( - target, self.outputs) - training_utils_v1.validate_input_types( - target, orig_target, allow_dict=False, field_name='target') - if isinstance(target, (list, tuple)): - all_inputs += list(target) - else: - all_inputs.append(target) - # Type check that all inputs are *either* value *or* symbolic. - # TODO(fchollet): this check could be removed in Eager mode? - if any(tf.is_tensor(v) for v in all_inputs): - if not all(tf.is_tensor(v) for v in all_inputs): - raise ValueError('Do not pass inputs that mix Numpy arrays and ' - 'TensorFlow tensors. ' - 'You passed: x=' + str(orig_inputs) + - '; y=' + str(orig_target)) - is_dataset = isinstance(orig_inputs, (tf.compat.v1.data.Dataset, - tf.data.Dataset, - tf.compat.v1.data.Iterator)) - if is_dataset or tf.executing_eagerly(): - target_tensors = None - else: - # Handle target tensors if any passed. 
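The any/all check in `_compile_from_inputs` above rejects mixed symbolic and NumPy inputs. A minimal sketch of the same condition (the inputs are invented):

```python
import numpy as np
import tensorflow as tf

all_inputs = [np.zeros((2, 2)), tf.zeros((2, 2))]
if any(tf.is_tensor(v) for v in all_inputs) and not all(
    tf.is_tensor(v) for v in all_inputs
):
    print("would raise: do not mix NumPy arrays and TensorFlow tensors")
```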
- if target is not None: - if not isinstance(target, (list, tuple)): - target = [target] - target_tensors = [v for v in target if _is_symbolic_tensor(v)] - else: - target_tensors = None - - self.compile( - optimizer=self.optimizer, - loss=self.loss, - metrics=self._compile_metrics, - weighted_metrics=self._compile_weighted_metrics, - loss_weights=self.loss_weights, - target_tensors=target_tensors, - sample_weight_mode=self.sample_weight_mode, - run_eagerly=self.run_eagerly, - experimental_run_tf_function=self._experimental_run_tf_function) - - # TODO(omalleyt): Consider changing to a more descriptive function name. - def _set_inputs(self, inputs, outputs=None, training=None): - """Set model's input and output specs based on the input data received. - - This is to be used for Model subclasses, which do not know at instantiation - time what their inputs look like. +def _get_metrics_from_layers(layers): + """Returns list of metrics from the given layers. - Args: - inputs: Single array, or list of arrays. The arrays could be placeholders, - Numpy arrays, data tensors, or TensorSpecs. - - if placeholders: the model is built on top of these placeholders, - and we expect Numpy data to be fed for them when calling `fit`/etc. - - if Numpy data or TensorShapes: we create placeholders matching the - TensorShapes or shapes of the Numpy arrays. We expect Numpy data to be - fed for these placeholders when calling `fit`/etc. - - if data tensors: the model is built on top of these tensors. - We do not expect any Numpy data to be provided when calling `fit`/etc. - outputs: None, a data tensor, or a list of tensors. If None, the - outputs will be determined by invoking `self.call()`, otherwise the - provided value will be used. - training: Boolean or None. Only relevant in symbolic mode. Specifies - whether to build the model's graph in inference mode (False), training - mode (True), or using the Keras learning phase (None). - Raises: - ValueError: If dict inputs are passed to a Sequential Model where the - first layer isn't FeatureLayer. - """ - self._set_save_spec(inputs) - inputs = self._set_input_attrs(inputs) - - if outputs is None: - kwargs = {} - if self._expects_training_arg: - # In V2 mode, feeding `training=None` is not allowed because any value - # explicitly passed by the user is respected, even `None`.` - if training is None and not tf.compat.v1.executing_eagerly_outside_functions(): - training = backend.learning_phase() - if training is not None: - kwargs['training'] = training - try: - outputs = self(inputs, **kwargs) - except NotImplementedError: - # This Model or a submodel is dynamic and hasn't overridden - # `compute_output_shape`. - outputs = None - - self._set_output_attrs(outputs) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _set_input_attrs(self, inputs): - """Sets attributes related to the inputs of the Model.""" - if self.inputs: - raise ValueError('Model inputs are already set.') - - if self.__class__.__name__ == 'Sequential' and not self.built: - if tf.is_tensor(inputs): - input_shape = (None,) + tuple(inputs.shape.as_list()[1:]) - elif isinstance(inputs, tf.TensorShape): - input_shape = (None,) + tuple(inputs.as_list()[1:]) - elif isinstance(inputs, dict): - # We assert that the first layer is a FeatureLayer. 
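The Sequential branch of `_set_input_attrs` above infers an input shape by replacing the batch axis with `None`. A one-line restatement (the array is invented):

```python
import numpy as np

x = np.zeros((32, 28, 28))
# Keep the trailing dims, leave the batch dimension unknown.
input_shape = (None,) + tuple(x.shape[1:])
print(input_shape)  # (None, 28, 28)
```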
- if not training_utils_v1.is_feature_layer(self.layers[0]): - raise ValueError('Passing a dictionary input to a Sequential Model ' - 'which doesn\'t have FeatureLayer as the first layer' - ' is an error.') - input_shape = (None,) - else: - input_shape = (None,) + tuple(inputs.shape[1:]) - self._build_input_shape = input_shape - - # Cast inputs to the compute dtype. This is primarily used - # when saving to determine the correct dtype in the input signature. - inputs = self._maybe_cast_inputs(inputs) - - # On-the-fly setting of symbolic model inputs (either by using the tensor - # provided, or by creating a placeholder if Numpy data was provided). - model_inputs = training_utils_v1.ModelInputs(inputs) - inputs = model_inputs.get_symbolic_inputs() - self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True) - self.input_names = model_inputs.get_input_names() - - self._feed_inputs = [] - self._feed_input_names = [] - self._feed_input_shapes = [] - - for k, v in model_inputs.as_dict(): - if backend.is_placeholder(v): - self._feed_input_names.append(k) - self._feed_inputs.append(v) - self._feed_input_shapes.append(backend.int_shape(v)) - - return inputs - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _set_output_attrs(self, outputs): - """Sets attributes related to the outputs of the Model.""" - # NOTE(taylorrobie): This convention cannot be changed without updating the - # data adapter since it assumes nest.flatten ordering. - outputs = tf.nest.flatten(outputs) - self.outputs = outputs - self.output_names = training_utils_v1.generic_output_names(outputs) - # TODO(scottzhu): Should we cleanup the self._training_endpoints here? - self.built = True - - @property - def _targets(self): - """The output target tensors for the model.""" - return [ - e.training_target.target - for e in self._training_endpoints - if e.has_training_target() - ] - - @property - def _feed_targets(self): - return [ - e.training_target.target - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _feed_output_names(self): - return [ - e.output_name - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _feed_output_shapes(self): - return [ - e.feed_output_shape - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _feed_loss_fns(self): - return [ - e.loss_fn - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _loss_weights_list(self): - return [e.loss_weight for e in self._training_endpoints] - - @property - def _output_loss_metrics(self): - if hasattr(self, '_training_endpoints'): - return [ - e.output_loss_metric - for e in self._training_endpoints - if e.output_loss_metric is not None - ] - return None - - @property - def sample_weights(self): - return [e.sample_weight for e in self._training_endpoints] - - @property - def _sample_weight_modes(self): - return [e.sample_weight_mode for e in self._training_endpoints] - - @property - def _feed_sample_weights(self): - return [e.sample_weight for e in self._training_endpoints - if e.sample_weight is not None] - - def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode): - """Maybe load initial epoch from ckpt considering possible worker recovery. - - Refer to tensorflow/python/keras/distribute/worker_training_state.py - for more information. + This will not include the `compile` metrics of a model layer. 
     Args:
-      initial_epoch: The original initial_epoch the user passes to `fit()`.
-      mode: The mode for running `model.fit()`.
+        layers: List of layers.
 
     Returns:
-      If the training is recovering from a previous failure under a
-      multi-worker training setting, return the epoch the training is
-      supposed to continue at. Otherwise, return the `initial_epoch` the user
-      passes in.
-    """
-    if self._training_state is not None:
-      return self._training_state.maybe_load_initial_epoch_from_ckpt(
-          initial_epoch, mode)
-    return initial_epoch
-
-  def _get_training_eval_metrics(self):
-    """Returns all the metrics that are to be reported.
-
-    This includes the output loss metrics, compile metrics/weighted metrics,
-    add_metric metrics.
+        List of metrics.
     """
     metrics = []
-    metrics.extend(getattr(self, '_output_loss_metrics', None) or [])
-    metrics.extend(getattr(self, 'metrics', None) or [])
+    layers = layer_utils.filter_empty_layer_containers(layers)
+    for layer in layers:
+        if isinstance(layer, Model):
+            # We cannot call 'metrics' on the model because we do not want
+            # to include the metrics that were added in the `compile` API
+            # of a nested model.
+            metrics.extend(layer._metrics)
+            metrics.extend(_get_metrics_from_layers(layer.layers))
+        else:
+            metrics.extend(layer.metrics)
     return metrics
 
-  def _assert_compile_was_called(self):
-    # Checks whether `compile` has been called. If it has been called,
-    # then the optimizer is set. This is different from whether the
-    # model is compiled
-    # (i.e. whether the model is built and its inputs/outputs are set).
-    if not self._compile_was_called:
-      raise RuntimeError('You must compile your model before '
-                         'training/testing. '
-                         'Use `model.compile(optimizer, loss)`.')
-
-  def _in_multi_worker_mode(self):
-    """Method to infer if this `Model` is working in multi-worker settings.
-
-    Multi-worker training refers to the setup where the training is
-    distributed across multiple workers, as opposed to the case where
-    only a local process performs the training. This function is
-    used to infer for example whether or not a distribute coordinator
-    should be run, and thus TensorFlow servers should be started for
-    communication with other servers in the cluster, or whether or not
-    saving/restoring checkpoints is relevant for preemption fault tolerance.
-
-    Experimental. Signature and implementation are subject to change.
-
-    Returns:
-      Whether this model indicates it's working in multi-worker settings.
-    """
-    strategy = self._distribution_strategy
-
-    # Otherwise, use the strategy whose scope this is in.
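To illustrate why the new `_get_metrics_from_layers` recurses with `layer._metrics` rather than `layer.metrics`: a nested model's `metrics` property can include its own `compile()` metrics, which this walk deliberately skips. A hedged sketch (the model names are invented, and the helper itself is module-private, so it is only described in comments):

```python
import tensorflow as tf

inner = tf.keras.Sequential([tf.keras.layers.Dense(4)])
inner.compile("sgd", "mse", metrics=["mae"])  # compile metric: skipped

outer_layers = [inner, tf.keras.layers.Dense(1)]
# _get_metrics_from_layers(outer_layers) reads inner._metrics and then
# recurses into inner.layers, so the "mae" compile metric is not collected.
```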
-    if not strategy and tf.distribute.has_strategy():
-      strategy = tf.distribute.get_strategy()
-    return strategy and strategy.extended._in_multi_worker_mode()  # pylint: disable=protected-access
-
-  @property
-  def _trackable_saved_model_saver(self):
-    return model_serialization.ModelSavedModelSaver(self)
-
-  def _get_compile_args(self, user_metrics=True):
-    del user_metrics
-    self._assert_compile_was_called()
-    kwargs = {
-        'loss': self.loss,
-        'metrics': self._compile_metrics,
-        'loss_weights': self.loss_weights,
-        'sample_weight_mode': self.sample_weight_mode,
-        'weighted_metrics': self._compile_weighted_metrics,
-    }
-    return kwargs
-
-  @property
-  def _compile_was_called(self):
-    return self._v1_compile_was_called
-
-
-class DistributedCallbackModel(Model):
-  """Model that is used for callbacks with tf.distribute.Strategy."""
-
-  def __init__(self, model):
-    super().__init__()
-    self.optimizer = model.optimizer
-
-  def set_original_model(self, orig_model):
-    self._original_model = orig_model
-
-  def save_weights(self, filepath, overwrite=True, save_format=None):
-    self._replicated_model.save_weights(filepath, overwrite=overwrite,
-                                        save_format=save_format)
-
-  def save(self, filepath, overwrite=True, include_optimizer=True):
-    # Save weights from the distributed model to the original model.
-    distributed_model_weights = self.get_weights()
-    self._original_model.set_weights(distributed_model_weights)
-    # TODO(anjalisridhar): Do we need to save the original model here?
-    # Saving the first replicated model works as well.
-    self._original_model.save(filepath, overwrite=True, include_optimizer=False)
-
-  def load_weights(self, filepath, by_name=False):
-    self._original_model.load_weights(filepath, by_name=False)
-    # Copy the weights from the original model to each of the replicated
-    # models.
-    orig_model_weights = self._original_model.get_weights()
-    distributed_training_utils_v1.set_weights(
-        self._original_model._distribution_strategy, self,  # pylint: disable=protected-access
-        orig_model_weights)
-
-  def __getattr__(self, item):
-    # Allowed attributes of the model that can be accessed by the user
-    # during a callback.
-    if item not in ('_setattr_tracking', '_layers'):
-      logging.warning('You are accessing attribute ' + item + ' of the '
-                      'DistributedCallbackModel that may not have been set '
-                      'correctly.')
-    return super().__getattr__(item)
-
-
-class _TrainingEndpoint:
-  """A container for the training output/target and related entities.
-
-  In the case of a model with multiple outputs, there is a one-to-one mapping
-  between model output (y_pred), model target (y_true), loss, metrics etc.
-  By unifying these entities into one class, each entity can access
-  information about the others, rather than reaching into separate lists of
-  attributes on the model.
-  """
-
-  def __init__(self,
-               output,
-               output_name,
-               loss_fn,
-               loss_weight=None,
-               training_target=None,
-               output_loss_metric=None,
-               sample_weight=None,
-               sample_weight_mode=None):
-    """Initialize the _TrainingEndpoint.
-
-    Note that the output and output_name should be stable as long as the model
-    structure doesn't change. The training_target is supposed to be mutable
-    since the information is provided via `compile()`.
-
-    Args:
-      output: the output tensor of the model.
-      output_name: the unique name of the output tensor.
-      loss_fn: the loss function for the output tensor.
-      loss_weight: float, the weights for the loss.
-      training_target: the _TrainingTarget for the model.
- output_loss_metric: the metric object for the loss function. - sample_weight: the weights for how a sample is weighted during metric and - loss calculation. Could be None. - sample_weight_mode: string, 'temporal', 'samplewise' or None. The mode for - how the sample_weight is populated. - """ - self._output = output - self._output_name = output_name - self._loss_fn = loss_fn - self._loss_weight = loss_weight - self._training_target = training_target - self._output_loss_metric = output_loss_metric - self._sample_weight = sample_weight - self._sample_weight_mode = sample_weight_mode - - @property - def output(self): - return self._output - - @property - def output_name(self): - return self._output_name - - @property - def shape(self): - return backend.int_shape(self.output) - - @property - def loss_fn(self): - return self._loss_fn - - @property - def loss_weight(self): - return self._loss_weight - - @loss_weight.setter - def loss_weight(self, value): - self._loss_weight = value - - @property - def training_target(self): - return self._training_target - - @training_target.setter - def training_target(self, value): - self._training_target = value - - def create_training_target(self, target, run_eagerly=False): - """Create training_target instance and update the self.training_target. - - Note that the input target should just be a tensor or None, and - corresponding training target will be created based on the output and - loss_fn. - - Args: - target: the target tensor for the current output. Could be None. - run_eagerly: boolean, whether the model is in run_eagerly mode. - - Raises: - ValueError if the training_target field for the current instance has - already been populated. - """ - if self.has_training_target(): - raise ValueError('The training_target field for the _TrainingEndpoint ' - 'instance has already been populated') - if run_eagerly: - # When run_eagerly, the target tensor is ignored, and the None placeholder - # is created instead. 
- self.training_target = _TrainingTarget( - None, feedable=True, skip_target_weights=False) - return - - if self.should_skip_target(): - self.training_target = _TrainingTarget(None) - else: - if target is not None and not backend.is_placeholder(target): - feedable = False - skip_target_weights = True - else: - feedable = True - skip_target_weights = False - - if target is None: - target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get( - self.loss_fn, backend.dtype(self.output)) - - target = backend.placeholder( - ndim=len(self.shape), - name=self.output_name + '_target', - sparse=backend.is_sparse(self.output), - dtype=target_dtype) - - self.training_target = _TrainingTarget( - target, - feedable=feedable, - skip_target_weights=skip_target_weights) - - @property - def output_loss_metric(self): - return self._output_loss_metric - - @output_loss_metric.setter - def output_loss_metric(self, value): - self._output_loss_metric = value - - @property - def sample_weight(self): - return self._sample_weight - - @sample_weight.setter - def sample_weight(self, value): - self._sample_weight = value - - @property - def sample_weight_mode(self): - return self._sample_weight_mode - - @sample_weight_mode.setter - def sample_weight_mode(self, value): - self._sample_weight_mode = value - - def should_skip_target(self): - return self._loss_fn is None - - def should_skip_target_weights(self): - return (self.should_skip_target() or self.training_target is None or - self.training_target.skip_target_weights) - - def has_training_target(self): - return self.training_target is not None - - def has_feedable_training_target(self): - return (not self.should_skip_target() and - self.training_target is not None and self.training_target.feedable) - - def loss_name(self): - if self._loss_fn is not None: - return self._output_name + '_loss' - return None - - @property - def feed_output_shape(self): - """The output shape for the feedable target.""" - if not self.has_feedable_training_target(): - return None - - if ((isinstance(self.loss_fn, losses.LossFunctionWrapper) and - self.loss_fn.fn == losses.sparse_categorical_crossentropy)) or ( - isinstance(self.loss_fn, losses.SparseCategoricalCrossentropy)): - if backend.image_data_format() == 'channels_first': - return (self.shape[0], 1) + self.shape[2:] - else: - return self.shape[:-1] + (1,) - elif (not isinstance(self.loss_fn, losses.Loss) or - (isinstance(self.loss_fn, losses.LossFunctionWrapper) and - (getattr(losses, self.loss_fn.fn.__name__, None) is None))): - # If the given loss is not an instance of the `Loss` class (custom - # class) or if the loss function that is wrapped is not in the - # `losses` module, then it is a user-defined loss and we make no - # assumptions about it. - return None - else: - return self.shape - - def sample_weights_mismatch(self): - """Check if the sample weight and the mode match or not.""" - # If there is a mismatch between sample weight mode and the placeholders - # created, then recompile the sub-graphs that depend on sample weights. 
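The sparse-categorical-crossentropy branch of `feed_output_shape` above collapses the class axis to 1, since targets hold integer class indices rather than one-hot vectors. A plain-Python restatement (the shapes are invented):

```python
output_shape = (None, 32, 32, 10)       # channels_last logits
print(output_shape[:-1] + (1,))         # (None, 32, 32, 1)

cf_shape = (None, 10, 32, 32)           # channels_first logits
print((cf_shape[0], 1) + cf_shape[2:])  # (None, 1, 32, 32)
```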
- return ( - (self.sample_weight_mode is not None and self.sample_weight is None) or - (self.sample_weight_mode is None and self.sample_weight is not None)) - - def populate_sample_weight(self, sample_weight, sample_weight_mode): - """Populate the sample weight and based on the sample weight mode.""" - if (sample_weight is None and - (self.should_skip_target_weights() or sample_weight_mode is None or - tf.executing_eagerly())): - self._sample_weight = None - return - - assert sample_weight_mode in ['temporal', 'samplewise'] - if sample_weight_mode == 'temporal': - default_value = [[1.]] - shape = [None, None] - else: - # sample_weight_mode == 'samplewise' - default_value = [1.] - shape = [None] - - if sample_weight is not None: - if not sample_weight.shape.is_compatible_with(shape): - raise ValueError('Received sample weight with shape {}. Expected shape ' - '{}.'.format(sample_weight.shape, shape)) - self._sample_weight = sample_weight - else: - self._sample_weight = tf.compat.v1.placeholder_with_default( - tf.constant(default_value, dtype=backend.floatx()), - shape=shape, - name=self.output_name + '_sample_weights') - - -class _TrainingTarget: - """Container for a target tensor (y_true) and its metadata (shape, loss...). - - Args: - target: A target tensor for the model. It may be `None` if the - output is excluded from loss computation. It is still kept as None - since each output of the model should have a corresponding target. If - the target is None, the rest of the attributes will be None as well. - feedable: Boolean, whether the target is feedable (requires data to be - passed in `fit` or `train_on_batch`), or not (model compiled with - `target_tensors` argument). - skip_target_weights: Boolean, whether the target should be skipped during - weights calculation. - """ - - def __init__(self, target, feedable=False, skip_target_weights=True): - self._target = target - self._feedable = feedable - self._skip_target_weights = skip_target_weights - - @property - def target(self): - return self._target - - @property - def feedable(self): - return self._feedable - - @property - def skip_target_weights(self): - return self._skip_target_weights - - -def _is_symbolic_tensor(x): - return tf.is_tensor(x) - - -def _convert_scipy_sparse_tensor(value, expected_input): - """Handle scipy sparse tensor conversions. - - This method takes a value 'value' and returns the proper conversion. If - value is a scipy sparse tensor and the expected input is a dense tensor, - we densify 'value'. If value is a scipy sparse tensor and the expected input - is a TF SparseTensor, we convert 'value' to a SparseTensor. If 'value' is - not a scipy sparse tensor, or scipy is not imported, we pass it through - unchanged. - - Args: - value: An object that may be a scipy sparse tensor - expected_input: The expected input placeholder. - - Returns: - The possibly-converted 'value'. - """ - if issparse is not None and issparse(value): - if backend.is_sparse(expected_input): - sparse_coo = value.tocoo() - row, col = sparse_coo.row, sparse_coo.col - data, shape = sparse_coo.data, sparse_coo.shape - indices = np.concatenate((np.expand_dims(row, 1), np.expand_dims(col, 1)), - 1) - return tf.SparseTensor(indices, data, shape) - else: - if tf.compat.v1.executing_eagerly_outside_functions(): - # In TF2 we do not silently densify sparse matrices. - raise ValueError('A SciPy sparse matrix was passed to a model ' - 'that expects dense inputs. 
Please densify your ' - 'inputs first, such as by calling `x.toarray().') - return value.toarray() - else: - return value - - -def _get_metrics_from_layers(layers): - """Returns list of metrics from the given layers. - - This will not include the `compile` metrics of a model layer. - - Args: - layers: List of layers. - - Returns: - List of metrics. - """ - metrics = [] - layers = layer_utils.filter_empty_layer_containers(layers) - for layer in layers: - if isinstance(layer, Model): - # We cannot call 'metrics' on the model because we do not want to - # include the metrics that were added in compile API of a nested model. - metrics.extend(layer._metrics) # pylint: disable=protected-access - metrics.extend(_get_metrics_from_layers(layer.layers)) - else: - metrics.extend(layer.metrics) - return metrics - def _non_none_constant_value(v): - constant_value = tf.get_static_value(v) - return constant_value if constant_value is not None else v + constant_value = tf.get_static_value(v) + return constant_value if constant_value is not None else v diff --git a/keras/estimator/BUILD b/keras/estimator/BUILD index 6d6ffd441685..6b871702e627 100644 --- a/keras/estimator/BUILD +++ b/keras/estimator/BUILD @@ -1,7 +1,10 @@ # Description: # Contains Keras models to Estimator converter +# Placeholder: load unaliased py_library + package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], diff --git a/keras/estimator/__init__.py b/keras/estimator/__init__.py index b5efcbc14647..00fa3c96e2d0 100644 --- a/keras/estimator/__init__.py +++ b/keras/estimator/__init__.py @@ -16,6 +16,7 @@ import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export # Keras has undeclared dependency on tensorflow/estimator:estimator_py. @@ -23,345 +24,365 @@ # everything will work as normal. _model_to_estimator_usage_gauge = tf.__internal__.monitoring.BoolGauge( - '/tensorflow/api/keras/model_to_estimator', - 'Whether tf.keras.estimator.model_to_estimator() is called.', 'version') + "/tensorflow/api/keras/model_to_estimator", + "Whether tf.keras.estimator.model_to_estimator() is called.", + "version", +) # LINT.IfChange -@keras_export(v1=['keras.estimator.model_to_estimator']) +@keras_export(v1=["keras.estimator.model_to_estimator"]) def model_to_estimator( keras_model=None, keras_model_path=None, custom_objects=None, model_dir=None, config=None, - checkpoint_format='saver', + checkpoint_format="saver", metric_names_map=None, - export_outputs=None): - """Constructs an `Estimator` instance from given keras model. + export_outputs=None, +): + """Constructs an `Estimator` instance from given keras model. - If you use infrastructure or other tooling that relies on Estimators, you can - still build a Keras model and use model_to_estimator to convert the Keras - model to an Estimator for use with downstream systems. + If you use infrastructure or other tooling that relies on Estimators, you + can still build a Keras model and use model_to_estimator to convert the + Keras model to an Estimator for use with downstream systems. - For usage example, please see: - [Creating estimators from Keras Models]( + For usage example, please see: + [Creating estimators from Keras Models]( https://www.tensorflow.org/guide/estimator#create_an_estimator_from_a_keras_model). - Sample Weights: - Estimators returned by `model_to_estimator` are configured so that they can - handle sample weights (similar to `keras_model.fit(x, y, sample_weights)`). 
- - To pass sample weights when training or evaluating the Estimator, the first - item returned by the input function should be a dictionary with keys - `features` and `sample_weights`. Example below: - - ```python - keras_model = tf.keras.Model(...) - keras_model.compile(...) - - estimator = tf.keras.estimator.model_to_estimator(keras_model) - - def input_fn(): - return dataset_ops.Dataset.from_tensors( - ({'features': features, 'sample_weights': sample_weights}, - targets)) - - estimator.train(input_fn, steps=1) - ``` - - Example with customized export signature: - ```python - inputs = {'a': tf.keras.Input(..., name='a'), - 'b': tf.keras.Input(..., name='b')} - outputs = {'c': tf.keras.layers.Dense(..., name='c')(inputs['a']), - 'd': tf.keras.layers.Dense(..., name='d')(inputs['b'])} - keras_model = tf.keras.Model(inputs, outputs) - keras_model.compile(...) - export_outputs = {'c': tf.estimator.export.RegressionOutput, - 'd': tf.estimator.export.ClassificationOutput} - - estimator = tf.keras.estimator.model_to_estimator( - keras_model, export_outputs=export_outputs) - - def input_fn(): - return dataset_ops.Dataset.from_tensors( - ({'features': features, 'sample_weights': sample_weights}, - targets)) - - estimator.train(input_fn, steps=1) - ``` - - Args: - keras_model: A compiled Keras model object. This argument is mutually - exclusive with `keras_model_path`. Estimator's `model_fn` uses the - structure of the model to clone the model. Defaults to `None`. - keras_model_path: Path to a compiled Keras model saved on disk, in HDF5 - format, which can be generated with the `save()` method of a Keras model. - This argument is mutually exclusive with `keras_model`. - Defaults to `None`. - custom_objects: Dictionary for cloning customized objects. This is - used with classes that is not part of this pip package. For example, if - user maintains a `relu6` class that inherits from `tf.keras.layers.Layer`, - then pass `custom_objects={'relu6': relu6}`. Defaults to `None`. - model_dir: Directory to save `Estimator` model parameters, graph, summary - files for TensorBoard, etc. If unset a directory will be created with - `tempfile.mkdtemp` - config: `RunConfig` to config `Estimator`. Allows setting up things in - `model_fn` based on configuration such as `num_ps_replicas`, or - `model_dir`. Defaults to `None`. If both `config.model_dir` and the - `model_dir` argument (above) are specified the `model_dir` **argument** - takes precedence. - checkpoint_format: Sets the format of the checkpoint saved by the estimator - when training. May be `saver` or `checkpoint`, depending on whether to - save checkpoints from `tf.train.Saver` or `tf.train.Checkpoint`. This - argument currently defaults to `saver`. When 2.0 is released, the default - will be `checkpoint`. Estimators use name-based `tf.train.Saver` - checkpoints, while Keras models use object-based checkpoints from - `tf.train.Checkpoint`. Currently, saving object-based checkpoints from - `model_to_estimator` is only supported by Functional and Sequential - models. Defaults to 'saver'. - metric_names_map: Optional dictionary mapping Keras model output metric - names to custom names. This can be used to override the default Keras - model output metrics names in a multi IO model use case and provide custom - names for the `eval_metric_ops` in Estimator. - The Keras model metric names can be obtained using `model.metrics_names` - excluding any loss metrics such as total loss and output losses. 
- For example, if your Keras model has two outputs `out_1` and `out_2`, - with `mse` loss and `acc` metric, then `model.metrics_names` will be - `['loss', 'out_1_loss', 'out_2_loss', 'out_1_acc', 'out_2_acc']`. - The model metric names excluding the loss metrics will be - `['out_1_acc', 'out_2_acc']`. - export_outputs: Optional dictionary. This can be used to override the - default Keras model output exports in a multi IO model use case and - provide custom names for the `export_outputs` in - `tf.estimator.EstimatorSpec`. Default is None, which is equivalent to - {'serving_default': `tf.estimator.export.PredictOutput`}. If not None, - the keys must match the keys of `model.output_names`. - A dict `{name: output}` where: - * name: An arbitrary name for this output. - * output: an `ExportOutput` class such as `ClassificationOutput`, - `RegressionOutput`, or `PredictOutput`. Single-headed models only need - to specify one entry in this dictionary. Multi-headed models should - specify one entry for each head, one of which must be named using - `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY` - If no entry is provided, a default `PredictOutput` mapping to - `predictions` will be created. - - Returns: - An Estimator from given keras model. - - Raises: - ValueError: If neither keras_model nor keras_model_path was given. - ValueError: If both keras_model and keras_model_path was given. - ValueError: If the keras_model_path is a GCS URI. - ValueError: If keras_model has not been compiled. - ValueError: If an invalid checkpoint_format was given. - """ - - try: - from tensorflow_estimator.python.estimator import keras_lib # pylint: disable=g-import-not-at-top - except ImportError: - raise NotImplementedError( - 'tf.keras.estimator.model_to_estimator function not available in your ' - 'installation.') - _model_to_estimator_usage_gauge.get_cell('v1').set(True) - return keras_lib.model_to_estimator( # pylint:disable=unexpected-keyword-arg - keras_model=keras_model, - keras_model_path=keras_model_path, - custom_objects=custom_objects, - model_dir=model_dir, - config=config, - checkpoint_format=checkpoint_format, - use_v2_estimator=False, - metric_names_map=metric_names_map, - export_outputs=export_outputs) - - -@keras_export('keras.estimator.model_to_estimator', v1=[]) -def model_to_estimator_v2(keras_model=None, - keras_model_path=None, - custom_objects=None, - model_dir=None, - config=None, - checkpoint_format='checkpoint', - metric_names_map=None, - export_outputs=None): - """Constructs an `Estimator` instance from given keras model. - - If you use infrastructure or other tooling that relies on Estimators, you can - still build a Keras model and use model_to_estimator to convert the Keras - model to an Estimator for use with downstream systems. - - For usage example, please see: - [Creating estimators from Keras Models]( + Sample Weights: + Estimators returned by `model_to_estimator` are configured so that they can + handle sample weights (similar to `keras_model.fit(x, y, sample_weights)`). + + To pass sample weights when training or evaluating the Estimator, the first + item returned by the input function should be a dictionary with keys + `features` and `sample_weights`. Example below: + + ```python + keras_model = tf.keras.Model(...) + keras_model.compile(...) 
+ + estimator = tf.keras.estimator.model_to_estimator(keras_model) + + def input_fn(): + return dataset_ops.Dataset.from_tensors( + ({'features': features, 'sample_weights': sample_weights}, + targets)) + + estimator.train(input_fn, steps=1) + ``` + + Example with customized export signature: + ```python + inputs = {'a': tf.keras.Input(..., name='a'), + 'b': tf.keras.Input(..., name='b')} + outputs = {'c': tf.keras.layers.Dense(..., name='c')(inputs['a']), + 'd': tf.keras.layers.Dense(..., name='d')(inputs['b'])} + keras_model = tf.keras.Model(inputs, outputs) + keras_model.compile(...) + export_outputs = {'c': tf.estimator.export.RegressionOutput, + 'd': tf.estimator.export.ClassificationOutput} + + estimator = tf.keras.estimator.model_to_estimator( + keras_model, export_outputs=export_outputs) + + def input_fn(): + return dataset_ops.Dataset.from_tensors( + ({'features': features, 'sample_weights': sample_weights}, + targets)) + + estimator.train(input_fn, steps=1) + ``` + + Args: + keras_model: A compiled Keras model object. This argument is mutually + exclusive with `keras_model_path`. Estimator's `model_fn` uses the + structure of the model to clone the model. Defaults to `None`. + keras_model_path: Path to a compiled Keras model saved on disk, in HDF5 + format, which can be generated with the `save()` method of a Keras + model. This argument is mutually exclusive with `keras_model`. + Defaults to `None`. + custom_objects: Dictionary for cloning customized objects. This is + used with classes that is not part of this pip package. For example, if + user maintains a `relu6` class that inherits from + `tf.keras.layers.Layer`, then pass `custom_objects={'relu6': relu6}`. + Defaults to `None`. + model_dir: Directory to save `Estimator` model parameters, graph, summary + files for TensorBoard, etc. If unset a directory will be created with + `tempfile.mkdtemp` + config: `RunConfig` to config `Estimator`. Allows setting up things in + `model_fn` based on configuration such as `num_ps_replicas`, or + `model_dir`. If both `config.model_dir` and the + `model_dir` argument (above) are specified the `model_dir` **argument** + takes precedence. Defaults to `None`. + checkpoint_format: Sets the format of the checkpoint saved by the + estimator when training. May be `saver` or `checkpoint`, depending on + whether to save checkpoints from `tf.train.Saver` or + `tf.train.Checkpoint`. Estimators use name-based `tf.train.Saver` + checkpoints, while Keras models use object-based checkpoints from + `tf.train.Checkpoint`. Currently, saving object-based checkpoints + from `model_to_estimator` is only supported by Functional and + Sequential models. Defaults to 'saver'. + metric_names_map: Optional dictionary mapping Keras model output metric + names to custom names. This can be used to override the default Keras + model output metrics names in a multi IO model use case and provide + custom names for the `eval_metric_ops` in Estimator. + The Keras model metric names can be obtained using `model.metrics_names` + excluding any loss metrics such as total loss and output losses. + For example, if your Keras model has two outputs `out_1` and `out_2`, + with `mse` loss and `acc` metric, then `model.metrics_names` will be + `['loss', 'out_1_loss', 'out_2_loss', 'out_1_acc', 'out_2_acc']`. + The model metric names excluding the loss metrics will be + `['out_1_acc', 'out_2_acc']`. + export_outputs: Optional dictionary. 
This can be used to override the + default Keras model output exports in a multi IO model use case and + provide custom names for the `export_outputs` in + `tf.estimator.EstimatorSpec`. Default is None, which is equivalent to + {'serving_default': `tf.estimator.export.PredictOutput`}. If not None, + the keys must match the keys of `model.output_names`. + A dict `{name: output}` where: + * name: An arbitrary name for this output. + * output: an `ExportOutput` class such as `ClassificationOutput`, + `RegressionOutput`, or `PredictOutput`. Single-headed models only + need to specify one entry in this dictionary. Multi-headed models + should specify one entry for each head, one of which must be named + using + `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY` + If no entry is provided, a default `PredictOutput` mapping to + `predictions` will be created. + + Returns: + An Estimator from given keras model. + + Raises: + ValueError: If neither keras_model nor keras_model_path was given. + ValueError: If both keras_model and keras_model_path was given. + ValueError: If the keras_model_path is a GCS URI. + ValueError: If keras_model has not been compiled. + ValueError: If an invalid checkpoint_format was given. + """ + + try: + # isort: off + from tensorflow_estimator.python.estimator import ( + keras_lib, + ) + except ImportError: + raise NotImplementedError( + "tf.keras.estimator.model_to_estimator function not available in " + "your installation." + ) + _model_to_estimator_usage_gauge.get_cell("v1").set(True) + return keras_lib.model_to_estimator( + keras_model=keras_model, + keras_model_path=keras_model_path, + custom_objects=custom_objects, + model_dir=model_dir, + config=config, + checkpoint_format=checkpoint_format, + use_v2_estimator=False, + metric_names_map=metric_names_map, + export_outputs=export_outputs, + ) + + +@keras_export("keras.estimator.model_to_estimator", v1=[]) +def model_to_estimator_v2( + keras_model=None, + keras_model_path=None, + custom_objects=None, + model_dir=None, + config=None, + checkpoint_format="checkpoint", + metric_names_map=None, + export_outputs=None, +): + """Constructs an `Estimator` instance from given keras model. + + If you use infrastructure or other tooling that relies on Estimators, you + can still build a Keras model and use model_to_estimator to convert the + Keras model to an Estimator for use with downstream systems. + + For usage example, please see: + [Creating estimators from Keras Models]( https://www.tensorflow.org/guide/estimators#creating_estimators_from_keras_models). - Sample Weights: - Estimators returned by `model_to_estimator` are configured so that they can - handle sample weights (similar to `keras_model.fit(x, y, sample_weights)`). - - To pass sample weights when training or evaluating the Estimator, the first - item returned by the input function should be a dictionary with keys - `features` and `sample_weights`. Example below: - - ```python - keras_model = tf.keras.Model(...) - keras_model.compile(...) 
- - estimator = tf.keras.estimator.model_to_estimator(keras_model) - - def input_fn(): - return dataset_ops.Dataset.from_tensors( - ({'features': features, 'sample_weights': sample_weights}, - targets)) - - estimator.train(input_fn, steps=1) - ``` - - Example with customized export signature: - ```python - inputs = {'a': tf.keras.Input(..., name='a'), - 'b': tf.keras.Input(..., name='b')} - outputs = {'c': tf.keras.layers.Dense(..., name='c')(inputs['a']), - 'd': tf.keras.layers.Dense(..., name='d')(inputs['b'])} - keras_model = tf.keras.Model(inputs, outputs) - keras_model.compile(...) - export_outputs = {'c': tf.estimator.export.RegressionOutput, - 'd': tf.estimator.export.ClassificationOutput} - - estimator = tf.keras.estimator.model_to_estimator( - keras_model, export_outputs=export_outputs) - - def input_fn(): - return dataset_ops.Dataset.from_tensors( - ({'features': features, 'sample_weights': sample_weights}, - targets)) - - estimator.train(input_fn, steps=1) - ``` - - Note: We do not support creating weighted metrics in Keras and converting them - to weighted metrics in the Estimator API using `model_to_estimator`. - You will have to create these metrics directly on the estimator spec using the - `add_metrics` function. - - To customize the estimator `eval_metric_ops` names, you can pass in the - `metric_names_map` dictionary mapping the keras model output metric names - to the custom names as follows: - - ```python - input_a = tf.keras.layers.Input(shape=(16,), name='input_a') - input_b = tf.keras.layers.Input(shape=(16,), name='input_b') - dense = tf.keras.layers.Dense(8, name='dense_1') - interm_a = dense(input_a) - interm_b = dense(input_b) - merged = tf.keras.layers.concatenate([interm_a, interm_b], name='merge') - output_a = tf.keras.layers.Dense(3, activation='softmax', name='dense_2')( - merged) - output_b = tf.keras.layers.Dense(2, activation='softmax', name='dense_3')( - merged) - keras_model = tf.keras.models.Model( - inputs=[input_a, input_b], outputs=[output_a, output_b]) - keras_model.compile( - loss='categorical_crossentropy', - optimizer='rmsprop', - metrics={ - 'dense_2': 'categorical_accuracy', - 'dense_3': 'categorical_accuracy' - }) - - metric_names_map = { - 'dense_2_categorical_accuracy': 'acc_1', - 'dense_3_categorical_accuracy': 'acc_2', - } - keras_est = tf.keras.estimator.model_to_estimator( + Sample Weights: + Estimators returned by `model_to_estimator` are configured so that they can + handle sample weights (similar to `keras_model.fit(x, y, sample_weights)`). + + To pass sample weights when training or evaluating the Estimator, the first + item returned by the input function should be a dictionary with keys + `features` and `sample_weights`. Example below: + + ```python + keras_model = tf.keras.Model(...) + keras_model.compile(...) + + estimator = tf.keras.estimator.model_to_estimator(keras_model) + + def input_fn(): + return dataset_ops.Dataset.from_tensors( + ({'features': features, 'sample_weights': sample_weights}, + targets)) + + estimator.train(input_fn, steps=1) + ``` + + Example with customized export signature: + ```python + inputs = {'a': tf.keras.Input(..., name='a'), + 'b': tf.keras.Input(..., name='b')} + outputs = {'c': tf.keras.layers.Dense(..., name='c')(inputs['a']), + 'd': tf.keras.layers.Dense(..., name='d')(inputs['b'])} + keras_model = tf.keras.Model(inputs, outputs) + keras_model.compile(...) 
+ export_outputs = {'c': tf.estimator.export.RegressionOutput, + 'd': tf.estimator.export.ClassificationOutput} + + estimator = tf.keras.estimator.model_to_estimator( + keras_model, export_outputs=export_outputs) + + def input_fn(): + return dataset_ops.Dataset.from_tensors( + ({'features': features, 'sample_weights': sample_weights}, + targets)) + + estimator.train(input_fn, steps=1) + ``` + + Note: We do not support creating weighted metrics in Keras and converting + them to weighted metrics in the Estimator API using `model_to_estimator`. + You will have to create these metrics directly on the estimator spec using + the `add_metrics` function. + + To customize the estimator `eval_metric_ops` names, you can pass in the + `metric_names_map` dictionary mapping the keras model output metric names + to the custom names as follows: + + ```python + input_a = tf.keras.layers.Input(shape=(16,), name='input_a') + input_b = tf.keras.layers.Input(shape=(16,), name='input_b') + dense = tf.keras.layers.Dense(8, name='dense_1') + interm_a = dense(input_a) + interm_b = dense(input_b) + merged = tf.keras.layers.concatenate([interm_a, interm_b], name='merge') + output_a = tf.keras.layers.Dense(3, activation='softmax', name='dense_2')( + merged) + output_b = tf.keras.layers.Dense(2, activation='softmax', name='dense_3')( + merged) + keras_model = tf.keras.models.Model( + inputs=[input_a, input_b], outputs=[output_a, output_b]) + keras_model.compile( + loss='categorical_crossentropy', + optimizer='rmsprop', + metrics={ + 'dense_2': 'categorical_accuracy', + 'dense_3': 'categorical_accuracy' + }) + + metric_names_map = { + 'dense_2_categorical_accuracy': 'acc_1', + 'dense_3_categorical_accuracy': 'acc_2', + } + keras_est = tf.keras.estimator.model_to_estimator( + keras_model=keras_model, + config=config, + metric_names_map=metric_names_map) + ``` + + Args: + keras_model: A compiled Keras model object. This argument is mutually + exclusive with `keras_model_path`. Estimator's `model_fn` uses the + structure of the model to clone the model. Defaults to `None`. + keras_model_path: Path to a compiled Keras model saved on disk, in HDF5 + format, which can be generated with the `save()` method of a Keras + model. This argument is mutually exclusive with `keras_model`. + Defaults to `None`. + custom_objects: Dictionary for cloning customized objects. This is + used with classes that is not part of this pip package. For example, if + user maintains a `relu6` class that inherits from + `tf.keras.layers.Layer`, then pass `custom_objects={'relu6': relu6}`. + Defaults to `None`. + model_dir: Directory to save `Estimator` model parameters, graph, summary + files for TensorBoard, etc. If unset a directory will be created with + `tempfile.mkdtemp` + config: `RunConfig` to config `Estimator`. Allows setting up things in + `model_fn` based on configuration such as `num_ps_replicas`, or + `model_dir`. If both `config.model_dir` and the + `model_dir` argument (above) are specified the `model_dir` **argument** + takes precedence. Defaults to `None`. + checkpoint_format: Sets the format of the checkpoint saved by the + estimator when training. May be `saver` or `checkpoint`, depending on + whether to save checkpoints from `tf.compat.v1.train.Saver` or + `tf.train.Checkpoint`. The default is `checkpoint`. Estimators use + name-based `tf.train.Saver` checkpoints, while Keras models use + object-based checkpoints from `tf.train.Checkpoint`. 
Currently, saving
+        object-based checkpoints from `model_to_estimator` is only supported by
+        Functional and Sequential models. Defaults to 'checkpoint'.
+      metric_names_map: Optional dictionary mapping Keras model output metric
+        names to custom names. This can be used to override the default Keras
+        model output metric names in a multi IO model use case and provide
+        custom names for the `eval_metric_ops` in Estimator.
+        The Keras model metric names can be obtained using `model.metrics_names`
+        excluding any loss metrics such as total loss and output losses.
+        For example, if your Keras model has two outputs `out_1` and `out_2`,
+        with `mse` loss and `acc` metric, then `model.metrics_names` will be
+        `['loss', 'out_1_loss', 'out_2_loss', 'out_1_acc', 'out_2_acc']`.
+        The model metric names excluding the loss metrics will be
+        `['out_1_acc', 'out_2_acc']`.
+      export_outputs: Optional dictionary. This can be used to override the
+        default Keras model output exports in a multi IO model use case and
+        provide custom names for the `export_outputs` in
+        `tf.estimator.EstimatorSpec`. Default is None, which is equivalent to
+        {'serving_default': `tf.estimator.export.PredictOutput`}. If not None,
+        the keys must match the keys of `model.output_names`.
+        A dict `{name: output}` where:
+        * name: An arbitrary name for this output.
+        * output: an `ExportOutput` class such as `ClassificationOutput`,
+          `RegressionOutput`, or `PredictOutput`. Single-headed models only
+          need to specify one entry in this dictionary. Multi-headed models
+          should specify one entry for each head, one of which must be named
+          using
+          `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`
+        If no entry is provided, a default `PredictOutput` mapping to
+        `predictions` will be created.
+
+    Returns:
+      An Estimator from the given Keras model.
+
+    Raises:
+      ValueError: If neither keras_model nor keras_model_path was given.
+      ValueError: If both keras_model and keras_model_path were given.
+      ValueError: If the keras_model_path is a GCS URI.
+      ValueError: If keras_model has not been compiled.
+      ValueError: If an invalid checkpoint_format was given.
+    """
+
+    try:
+        # isort: off
+        from tensorflow_estimator.python.estimator import (
+            keras_lib,
+        )
+    except ImportError:
+        raise NotImplementedError(
+            "tf.keras.estimator.model_to_estimator function not available in "
+            "your installation."
+        )
+    _model_to_estimator_usage_gauge.get_cell("v2").set(True)
+    return keras_lib.model_to_estimator(
        keras_model=keras_model,
+        keras_model_path=keras_model_path,
+        custom_objects=custom_objects,
+        model_dir=model_dir,
        config=config,
-      metric_names_map=metric_names_map)
-  ```
-
-  Args:
-    keras_model: A compiled Keras model object. This argument is mutually
-      exclusive with `keras_model_path`. Estimator's `model_fn` uses the
-      structure of the model to clone the model. Defaults to `None`.
-    keras_model_path: Path to a compiled Keras model saved on disk, in HDF5
-      format, which can be generated with the `save()` method of a Keras model.
-      This argument is mutually exclusive with `keras_model`.
-      Defaults to `None`.
-    custom_objects: Dictionary for cloning customized objects. This is
-      used with classes that is not part of this pip package. For example, if
-      user maintains a `relu6` class that inherits from `tf.keras.layers.Layer`,
-      then pass `custom_objects={'relu6': relu6}`. Defaults to `None`.
-    model_dir: Directory to save `Estimator` model parameters, graph, summary
-      files for TensorBoard, etc.
If unset a directory will be created with - `tempfile.mkdtemp` - config: `RunConfig` to config `Estimator`. Allows setting up things in - `model_fn` based on configuration such as `num_ps_replicas`, or - `model_dir`. Defaults to `None`. If both `config.model_dir` and the - `model_dir` argument (above) are specified the `model_dir` **argument** - takes precedence. - checkpoint_format: Sets the format of the checkpoint saved by the estimator - when training. May be `saver` or `checkpoint`, depending on whether to - save checkpoints from `tf.compat.v1.train.Saver` or `tf.train.Checkpoint`. - The default is `checkpoint`. Estimators use name-based `tf.train.Saver` - checkpoints, while Keras models use object-based checkpoints from - `tf.train.Checkpoint`. Currently, saving object-based checkpoints from - `model_to_estimator` is only supported by Functional and Sequential - models. Defaults to 'checkpoint'. - metric_names_map: Optional dictionary mapping Keras model output metric - names to custom names. This can be used to override the default Keras - model output metrics names in a multi IO model use case and provide custom - names for the `eval_metric_ops` in Estimator. - The Keras model metric names can be obtained using `model.metrics_names` - excluding any loss metrics such as total loss and output losses. - For example, if your Keras model has two outputs `out_1` and `out_2`, - with `mse` loss and `acc` metric, then `model.metrics_names` will be - `['loss', 'out_1_loss', 'out_2_loss', 'out_1_acc', 'out_2_acc']`. - The model metric names excluding the loss metrics will be - `['out_1_acc', 'out_2_acc']`. - export_outputs: Optional dictionary. This can be used to override the - default Keras model output exports in a multi IO model use case and - provide custom names for the `export_outputs` in - `tf.estimator.EstimatorSpec`. Default is None, which is equivalent to - {'serving_default': `tf.estimator.export.PredictOutput`}. If not None, - the keys must match the keys of `model.output_names`. - A dict `{name: output}` where: - * name: An arbitrary name for this output. - * output: an `ExportOutput` class such as `ClassificationOutput`, - `RegressionOutput`, or `PredictOutput`. Single-headed models only need - to specify one entry in this dictionary. Multi-headed models should - specify one entry for each head, one of which must be named using - `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY` - If no entry is provided, a default `PredictOutput` mapping to - `predictions` will be created. - - Returns: - An Estimator from given keras model. - - Raises: - ValueError: If neither keras_model nor keras_model_path was given. - ValueError: If both keras_model and keras_model_path was given. - ValueError: If the keras_model_path is a GCS URI. - ValueError: If keras_model has not been compiled. - ValueError: If an invalid checkpoint_format was given. 
- """ - - try: - from tensorflow_estimator.python.estimator import keras_lib # pylint: disable=g-import-not-at-top - except ImportError: - raise NotImplementedError( - 'tf.keras.estimator.model_to_estimator function not available in your ' - 'installation.') - _model_to_estimator_usage_gauge.get_cell('v2').set(True) - return keras_lib.model_to_estimator( # pylint:disable=unexpected-keyword-arg - keras_model=keras_model, - keras_model_path=keras_model_path, - custom_objects=custom_objects, - model_dir=model_dir, - config=config, - checkpoint_format=checkpoint_format, - use_v2_estimator=True, - metric_names_map=metric_names_map, - export_outputs=export_outputs) + checkpoint_format=checkpoint_format, + use_v2_estimator=True, + metric_names_map=metric_names_map, + export_outputs=export_outputs, + ) + + # LINT.ThenChange(//tensorflow_estimator/python/estimator/keras_lib.py) diff --git a/keras/saving/experimental/BUILD b/keras/export/BUILD similarity index 55% rename from keras/saving/experimental/BUILD rename to keras/export/BUILD index e0dd9e851600..329076cafce1 100644 --- a/keras/saving/experimental/BUILD +++ b/keras/export/BUILD @@ -1,39 +1,39 @@ # Description: -# Contains the Keras experimental idempotent saving API. +# Contains the Keras save model API (internal TensorFlow version). +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], # TODO(scottzhu): Remove non-keras deps from TF. default_visibility = [ "//keras:friends", - "//third_party/tensorflow/python/distribute:__pkg__", ], licenses = ["notice"], ) py_library( - name = "experimental", + name = "export_lib", srcs = [ - "saving_lib.py", + "export_lib.py", ], srcs_version = "PY3", deps = [ "//:expect_tensorflow_installed", - "//keras/saving/saved_model", - "//keras/utils:generic_utils", ], ) tf_py_test( - name = "saving_lib_test", - size = "small", - srcs = ["saving_lib_test.py"], + name = "export_lib_test", + size = "medium", + srcs = ["export_lib_test.py"], python_version = "PY3", deps = [ + ":export_lib", "//:expect_absl_installed", "//:expect_tensorflow_installed", "//keras", - "//keras/utils:generic_utils", + "//keras/testing_infra:test_combinations", ], ) diff --git a/keras/optimizers/optimizer_experimental/__init__.py b/keras/export/__init__.py similarity index 84% rename from keras/optimizers/optimizer_experimental/__init__.py rename to keras/export/__init__.py index bdf2826104b1..a82948d13416 100644 --- a/keras/optimizers/optimizer_experimental/__init__.py +++ b/keras/export/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,4 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Experimental optimizer package.""" + +from keras.export.export_lib import ExportArchive diff --git a/keras/export/export_lib.py b/keras/export/export_lib.py new file mode 100644 index 000000000000..eb8dc63f83e8 --- /dev/null +++ b/keras/export/export_lib.py @@ -0,0 +1,581 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Library for exporting inference-only Keras models/layers.""" + +import tensorflow.compat.v2 as tf +from tensorflow.python.util.tf_export import keras_export + +from keras.engine import base_layer +from keras.engine import functional +from keras.engine import sequential +from keras.utils import io_utils + + +@keras_export("keras.export.ExportArchive") +class ExportArchive(tf.__internal__.tracking.AutoTrackable): + """ExportArchive is used to write SavedModel artifacts (e.g. for inference). + + If you have a Keras model or layer that you want to export as SavedModel for + serving (e.g. via TensorFlow-Serving), you can use `ExportArchive` + to configure the different serving endpoints you need to make available, + as well as their signatures. Simply instantiate an `ExportArchive`, + use `track()` to register the layer(s) or model(s) to be used, + then use the `add_endpoint()` method to register a new serving endpoint. + When done, use the `write_out()` method to save the artifact. + + The resulting artifact is a SavedModel and can be reloaded via + `tf.saved_model.load`. + + Examples: + + Here's how to export a model for inference. + + ```python + export_archive = ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + name="serve", + fn=model.call, + input_signature=[tf.TensorSpec(shape=(None, 3), dtype=tf.float32)], + ) + export_archive.write_out("path/to/location") + + # Elsewhere, we can reload the artifact and serve it. + # The endpoint we added is available as a method: + serving_model = tf.saved_model.load("path/to/location") + outputs = serving_model.serve(inputs) + ``` + + Here's how to export a model with one endpoint for inference and one + endpoint for a training-mode forward pass (e.g. with dropout on). + + ```python + export_archive = ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + name="call_inference", + fn=lambda x: model.call(x, training=False), + input_signature=[tf.TensorSpec(shape=(None, 3), dtype=tf.float32)], + ) + export_archive.add_endpoint( + name="call_training", + fn=lambda x: model.call(x, training=True), + input_signature=[tf.TensorSpec(shape=(None, 3), dtype=tf.float32)], + ) + export_archive.write_out("path/to/location") + ``` + + **Note on resource tracking:** + + `ExportArchive` is able to automatically track all `tf.Variables` used + by its endpoints, so most of the time calling `.track(model)` + is not strictly required. However, if your model uses lookup layers such + as `IntegerLookup`, `StringLookup`, or `TextVectorization`, + it will need to be tracked explicitly via `.track(model)`. + + Explicit tracking is also required if you need to be able to access + the properties `variables`, `trainable_variables`, or + `non_trainable_variables` on the revived archive. 
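+
+    For example, here is an illustrative sketch of that case (the vocabulary,
+    shapes, and path below are placeholders):
+
+    ```python
+    lookup_model = keras.Sequential([
+        keras.layers.StringLookup(vocabulary=["a", "b", "c"]),
+        keras.layers.Embedding(4, 8),
+    ])
+    lookup_model(tf.constant([["a"]]))  # Build the model first.
+
+    export_archive = ExportArchive()
+    # track() is mandatory here so the lookup table assets get saved.
+    export_archive.track(lookup_model)
+    export_archive.add_endpoint(
+        name="serve",
+        fn=lookup_model.call,
+        input_signature=[tf.TensorSpec(shape=(None, 1), dtype=tf.string)],
+    )
+    export_archive.write_out("path/to/location")
+    ```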
+ """ + + def __init__(self): + self._endpoint_names = [] + self._endpoint_signatures = {} + self.tensorflow_version = tf.__version__ + self.variables = [] + self.trainable_variables = [] + self.non_trainable_variables = [] + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def track(self, resource): + """Track the variables (and other assets) of a layer or model.""" + if not isinstance(resource, tf.__internal__.tracking.Trackable): + raise ValueError( + "Invalid resource type. Expected an instance of a " + "TensorFlow `Trackable` (such as a Keras `Layer` or `Model`). " + f"Received instead an object of type '{type(resource)}'. " + f"Object received: {resource}" + ) + if isinstance(resource, base_layer.Layer): + if not resource.built: + raise ValueError( + "The layer provided has not yet been built. " + "It must be built before export." + ) + + # Layers in `_tracked` are not part of the trackables that get saved, + # because we're creating the attribute in a + # no_automatic_dependency_tracking scope. + if not hasattr(self, "_tracked"): + self._tracked = [] + self._tracked.append(resource) + + if isinstance(resource, base_layer.Layer): + # Variables in the lists below are actually part of the trackables + # that get saved, because the lists are created in __init__. + self.variables += resource.variables + self.trainable_variables += resource.trainable_variables + self.non_trainable_variables += resource.non_trainable_variables + + def add_endpoint(self, name, fn, input_signature=None): + """Register a new serving endpoint. + + Arguments: + name: Str, name of the endpoint. + fn: A function. It should only leverage resources + (e.g. `tf.Variable` objects or `tf.lookup.StaticHashTable` + objects) that are available on the models/layers + tracked by the `ExportArchive` (you can call `.track(model)` + to track a new model). + The shape and dtype of the inputs to the function must be + known. For that purpose, you can either 1) make sure that + `fn` is a `tf.function` that has been called at least once, or + 2) provide an `input_signature` argument that specifies the + shape and dtype of the inputs (see below). + input_signature: Used to specify the shape and dtype of the + inputs to `fn`. List of `tf.TensorSpec` objects (one + per positional input argument of `fn`). Nested arguments are + allowed (see below for an example showing a Functional model + with 2 input arguments). + + Example: + + Adding an endpoint using the `input_signature` argument when the + model has a single input argument: + + ```python + export_archive = ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + name="serve", + fn=model.call, + input_signature=[tf.TensorSpec(shape=(None, 3), dtype=tf.float32)], + ) + ``` + + Adding an endpoint using the `input_signature` argument when the + model has two positional input arguments: + + ```python + export_archive = ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + name="serve", + fn=model.call, + input_signature=[ + tf.TensorSpec(shape=(None, 3), dtype=tf.float32), + tf.TensorSpec(shape=(None, 4), dtype=tf.float32), + ], + ) + ``` + + Adding an endpoint using the `input_signature` argument when the + model has one input argument that is a list of 2 tensors (e.g. 
+ a Functional model with 2 inputs): + + ```python + model = keras.Model(inputs=[x1, x2], outputs=outputs) + + export_archive = ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + name="serve", + fn=model.call, + input_signature=[ + [ + tf.TensorSpec(shape=(None, 3), dtype=tf.float32), + tf.TensorSpec(shape=(None, 4), dtype=tf.float32), + ], + ], + ) + ``` + + This also works with dictionary inputs: + + ```python + model = keras.Model(inputs={"x1": x1, "x2": x2}, outputs=outputs) + + export_archive = ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + name="serve", + fn=model.call, + input_signature=[ + { + "x1": tf.TensorSpec(shape=(None, 3), dtype=tf.float32), + "x2": tf.TensorSpec(shape=(None, 4), dtype=tf.float32), + }, + ], + ) + ``` + + Adding an endpoint that is a `tf.function`: + + ```python + @tf.function() + def serving_fn(x): + return model(x) + + # The function must be traced, i.e. it must be called at least once. + serving_fn(tf.random.normal(shape=(2, 3))) + + export_archive = ExportArchive() + export_archive.track(model) + export_archive.add_endpoint(name="serve", fn=serving_fn) + ``` + """ + if name in self._endpoint_names: + raise ValueError(f"Endpoint name '{name}' is already taken.") + + if input_signature: + decorated_fn = tf.function(fn, input_signature=input_signature) + self._endpoint_signatures[name] = input_signature + else: + if isinstance(fn, tf.types.experimental.GenericFunction): + if not fn._list_all_concrete_functions(): + raise ValueError( + f"The provided tf.function '{fn}' " + "has never been called. " + "To specify the expected shape and dtype " + "of the function's arguments, " + "you must either provide a function that " + "has been called at least once, or alternatively pass " + "an `input_signature` argument in `add_endpoint()`." + ) + decorated_fn = fn + else: + raise ValueError( + "If the `fn` argument provided is not a `tf.function`, " + "you must provide an `input_signature` argument to " + "specify the shape and dtype of the function arguments. " + "Example:\n\n" + "export_archive.add_endpoint(\n" + " name='call',\n" + " fn=model.call,\n" + " input_signature=[\n" + " tf.TensorSpec(\n" + " shape=(None, 224, 224, 3),\n" + " dtype=tf.float32,\n" + " )\n" + " ],\n" + ")" + ) + setattr(self, name, decorated_fn) + self._endpoint_names.append(name) + + def add_variable_collection(self, name, variables): + """Register a set of variables to be retrieved after reloading. + + Arguments: + name: The string name for the collection. + variables: A tuple/list/set of `tf.Variable` instances. + + Example: + + ```python + export_archive = ExportArchive() + export_archive.track(model) + # Register an endpoint + export_archive.add_endpoint( + name="serve", + fn=model.call, + input_signature=[tf.TensorSpec(shape=(None, 3), dtype=tf.float32)], + ) + # Save a variable collection + export_archive.add_variable_collection( + name="optimizer_variables", variables=model.optimizer.variables) + export_archive.write_out("path/to/location") + + # Reload the object + revived_object = tf.saved_model.load("path/to/location") + # Retrieve the variables + optimizer_variables = revived_object.optimizer_variables + ``` + """ + if not isinstance(variables, (list, tuple, set)): + raise ValueError( + "Expected `variables` to be a list/tuple/set. " + f"Received instead object of type '{type(variables)}'." 
+            )
+        if not all(isinstance(v, tf.Variable) for v in variables):
+            raise ValueError(
+                "Expected all elements in `variables` to be "
+                "`tf.Variable` instances. Found instead the following types: "
+                f"{list(set(type(v) for v in variables))}"
+            )
+        setattr(self, name, list(variables))
+
+    def write_out(self, filepath, options=None):
+        """Write the corresponding SavedModel to disk.
+
+        Arguments:
+            filepath: `str` or `pathlib.Path` object.
+                Path where to save the artifact.
+            options: `tf.saved_model.SaveOptions` object that specifies
+                SavedModel saving options.
+
+        **Note on TF-Serving**: all endpoints registered via `add_endpoint()`
+        are made visible for TF-Serving in the SavedModel artifact. In addition,
+        the first endpoint registered is made visible under the alias
+        `"serving_default"` (unless an endpoint with the name
+        `"serving_default"` was already registered manually),
+        since TF-Serving requires this endpoint to be set.
+        """
+        if not self._endpoint_names:
+            raise ValueError(
+                "No endpoints have been set yet. Call add_endpoint()."
+            )
+        self._filter_and_track_resources()
+
+        signatures = {}
+        for name in self._endpoint_names:
+            signatures[name] = self._get_concrete_fn(name)
+        # Add "serving_default" signature key for TFServing
+        if "serving_default" not in self._endpoint_names:
+            signatures["serving_default"] = self._get_concrete_fn(
+                self._endpoint_names[0]
+            )
+        tf.saved_model.save(
+            self, filepath, options=options, signatures=signatures
+        )
+        # Print out available endpoints
+        endpoints = "\n\n".join(
+            _print_signature(getattr(self, name), name)
+            for name in self._endpoint_names
+        )
+        io_utils.print_msg(
+            f"Saved artifact at '{filepath}'. "
+            "The following endpoints are available:\n\n"
+            f"{endpoints}"
+        )
+
+    def _get_concrete_fn(self, endpoint):
+        """Workaround for some SavedModel quirks."""
+        if endpoint in self._endpoint_signatures:
+            return getattr(self, endpoint)
+        else:
+            traces = getattr(self, endpoint)._trackable_children("saved_model")
+            return list(traces.values())[0]
+
+    def _get_variables_used_by_endpoints(self):
+        fns = [self._get_concrete_fn(name) for name in self._endpoint_names]
+        return _list_variables_used_by_fns(fns)
+
+    def _filter_and_track_resources(self):
+        """Track resources used by endpoints / referenced in `track()` calls."""
+        # Start by extracting variables from endpoints.
+        fns = [self._get_concrete_fn(name) for name in self._endpoint_names]
+        tvs, ntvs = _list_variables_used_by_fns(fns)
+        self._all_variables = list(tvs + ntvs)
+
+        # Next, track lookup tables.
+        # Hopefully, one day this will be automated at the tf.function level.
+        self._misc_assets = []
+        from keras.layers.preprocessing.index_lookup import IndexLookup
+
+        if hasattr(self, "_tracked"):
+            for root in self._tracked:
+                descendants = tf.train.TrackableView(root).descendants()
+                for trackable in descendants:
+                    if isinstance(trackable, IndexLookup):
+                        self._misc_assets.append(trackable)
+
+
+def export_model(model, filepath):
+    export_archive = ExportArchive()
+    export_archive.track(model)
+    if isinstance(model, (functional.Functional, sequential.Sequential)):
+        input_signature = tf.nest.map_structure(_make_tensor_spec, model.inputs)
+        if isinstance(input_signature, list) and len(input_signature) > 1:
+            input_signature = [input_signature]
+        export_archive.add_endpoint("serve", model.__call__, input_signature)
+    else:
+        save_spec = model._get_save_spec()
+        if not save_spec:
+            raise ValueError(
+                "The model provided has never been called. 
" + "It must be called at least once before export." + ) + input_signature = [save_spec] + export_archive.add_endpoint("serve", model.__call__, input_signature) + export_archive.write_out(filepath) + + +class ReloadedLayer(base_layer.Layer): + """Reload a Keras model/layer that was saved via SavedModel / ExportArchive. + + Arguments: + filepath: `str` or `pathlib.Path` object. The path to the SavedModel. + call_endpoint: Name of the endpoint to use as the `call()` method + of the reloaded layer. If the SavedModel was created + via `model.export()`, + then the default endpoint name is `'serve'`. In other cases + it may be named `'serving_default'`. + + Example: + + ```python + model.export("path/to/artifact") + reloaded_layer = ReloadedLayer("path/to/artifact") + outputs = reloaded_layer(inputs) + ``` + + The reloaded object can be used like a regular Keras layer, and supports + training/fine-tuning of its trainable weights. Note that the reloaded + object retains none of the internal structure or custom methods of the + original object -- it's a brand new layer created around the saved + function. + + **Limitations:** + + * Only call endpoints with a single `inputs` tensor argument + (which may optionally be a dict/tuple/list of tensors) are supported. + For endpoints with multiple separate input tensor arguments, consider + subclassing `ReloadedLayer` and implementing a `call()` method with a + custom signature. + * If you need training-time behavior to differ from inference-time behavior + (i.e. if you need the reloaded object to support a `training=True` argument + in `__call__()`), make sure that the training-time call function is + saved as a standalone endpoint in the artifact, and provide its name + to the `ReloadedLayer` via the `call_training_endpoint` argument. + """ + + def __init__( + self, + filepath, + call_endpoint="serve", + call_training_endpoint=None, + trainable=True, + name=None, + dtype=None, + ): + # Initialize an empty layer, then add_weight() etc. as needed. + super().__init__(trainable=trainable, name=name, dtype=dtype) + + self._reloaded_obj = tf.saved_model.load(filepath) + + self.filepath = filepath + self.call_endpoint = call_endpoint + self.call_training_endpoint = call_training_endpoint + + # Resolve the call function. + if hasattr(self._reloaded_obj, call_endpoint): + # Case 1: it's set as an attribute. + self.call_endpoint_fn = getattr(self._reloaded_obj, call_endpoint) + elif call_endpoint in self._reloaded_obj.signatures: + # Case 2: it's listed in the `signatures` field. + self.call_endpoint_fn = self._reloaded_obj.signatures[call_endpoint] + else: + raise ValueError( + f"The endpoint '{call_endpoint}' is neither an " + "attribute of the reloaded SavedModel, nor an entry " + "in the `signatures` field of the reloaded SavedModel. " + ) + + # Resolving the training function. + if call_training_endpoint: + if hasattr(self._reloaded_obj, call_training_endpoint): + self.call_training_endpoint_fn = getattr( + self._reloaded_obj, call_training_endpoint + ) + elif call_training_endpoint in self._reloaded_obj.signatures: + self.call_training_endpoint_fn = self._reloaded_obj.signatures[ + call_training_endpoint + ] + else: + raise ValueError( + f"The endpoint '{call_training_endpoint}' is " + "neither an attribute of the reloaded SavedModel, " + "nor an entry in the `signatures` field of " + "the reloaded SavedModel. " + ) + + # Add trainable and non-trainable weights from the call_endpoint_fn. 
+ all_fns = [self.call_endpoint_fn] + if call_training_endpoint: + all_fns.append(self.call_training_endpoint_fn) + tvs, ntvs = _list_variables_used_by_fns(all_fns) + for v in tvs: + self._add_existing_weight(v, trainable=True) + for v in ntvs: + self._add_existing_weight(v, trainable=False) + self.built = True + + def _add_existing_weight(self, weight, trainable): + """Calls add_weight() to register but not create an existing weight.""" + self.add_weight( + name=weight.name, + shape=weight.shape, + dtype=weight.dtype, + trainable=trainable, + getter=lambda *_, **__: weight, + ) + + def call(self, inputs, training=False, **kwargs): + if training: + if self.call_training_endpoint: + return self.call_training_endpoint_fn(inputs, **kwargs) + return self.call_endpoint_fn(inputs, **kwargs) + + def get_config(self): + base_config = super().get_config() + config = { + # Note: this is not intended to be portable. + "filepath": self.filepath, + "call_endpoint": self.call_endpoint, + "call_training_endpoint": self.call_training_endpoint, + } + return {**base_config, **config} + + +def _make_tensor_spec(x): + return tf.TensorSpec(x.shape, dtype=x.dtype, name=x.name) + + +def _print_signature(fn, name): + concrete_fn = fn._list_all_concrete_functions()[0] + pprinted_signature = concrete_fn.pretty_printed_signature(verbose=True) + lines = pprinted_signature.split("\n") + lines = [f"* Endpoint '{name}'"] + lines[1:] + endpoint = "\n".join(lines) + return endpoint + + +def _list_variables_used_by_fns(fns): + trainable_variables = [] + non_trainable_variables = [] + trainable_variables_ids = set() + non_trainable_variables_ids = set() + for fn in fns: + if hasattr(fn, "concrete_functions"): + concrete_functions = fn.concrete_functions + elif hasattr(fn, "get_concrete_function"): + concrete_functions = [fn.get_concrete_function()] + else: + concrete_functions = [fn] + for concrete_fn in concrete_functions: + for v in concrete_fn.trainable_variables: + if id(v) not in trainable_variables_ids: + trainable_variables.append(v) + trainable_variables_ids.add(id(v)) + + for v in concrete_fn.variables: + if ( + id(v) not in trainable_variables_ids + and id(v) not in non_trainable_variables_ids + ): + non_trainable_variables.append(v) + non_trainable_variables_ids.add(id(v)) + return trainable_variables, non_trainable_variables diff --git a/keras/export/export_lib_test.py b/keras/export/export_lib_test.py new file mode 100644 index 000000000000..988b9a14904d --- /dev/null +++ b/keras/export/export_lib_test.py @@ -0,0 +1,625 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for inference-only model/layer exporting utilities.""" +import os + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras.export import export_lib +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + + +def get_model(): + layers = [ + keras.layers.Dense(10, activation="relu"), + keras.layers.BatchNormalization(), + keras.layers.Dense(1, activation="sigmoid"), + ] + model = test_utils.get_model_from_layers(layers, input_shape=(10,)) + return model + + +@test_utils.run_v2_only +class ExportArchiveTest(tf.test.TestCase, parameterized.TestCase): + @test_combinations.run_with_all_model_types + def test_standard_model_export(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + model = get_model() + ref_input = tf.random.normal((3, 10)) + ref_output = model(ref_input).numpy() + + export_lib.export_model(model, temp_filepath) + revived_model = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_output, revived_model.serve(ref_input).numpy(), atol=1e-6 + ) + + @test_combinations.run_with_all_model_types + def test_low_level_model_export(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + + model = get_model() + ref_input = tf.random.normal((3, 10)) + ref_output = model(ref_input).numpy() + + # Test variable tracking + export_archive = export_lib.ExportArchive() + export_archive.track(model) + self.assertLen(export_archive.variables, 8) + self.assertLen(export_archive.trainable_variables, 6) + self.assertLen(export_archive.non_trainable_variables, 2) + + @tf.function() + def my_endpoint(x): + return model(x) + + # Test registering an endpoint that is a tf.function (called) + my_endpoint(ref_input) # Trace fn + + export_archive.add_endpoint( + "call", + my_endpoint, + ) + export_archive.write_out(temp_filepath) + + revived_model = tf.saved_model.load(temp_filepath) + self.assertFalse(hasattr(revived_model, "_tracked")) + self.assertAllClose( + ref_output, revived_model.call(ref_input).numpy(), atol=1e-6 + ) + self.assertLen(revived_model.variables, 8) + self.assertLen(revived_model.trainable_variables, 6) + self.assertLen(revived_model.non_trainable_variables, 2) + + # Test registering an endpoint that is NOT a tf.function + export_archive = export_lib.ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + "call", + model.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 10), + dtype=tf.float32, + ) + ], + ) + export_archive.write_out(temp_filepath) + revived_model = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_output, revived_model.call(ref_input).numpy(), atol=1e-6 + ) + + def test_layer_export(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_layer") + + layer = keras.layers.BatchNormalization() + ref_input = tf.random.normal((3, 10)) + ref_output = layer(ref_input).numpy() # Build layer (important) + + export_archive = export_lib.ExportArchive() + export_archive.track(layer) + export_archive.add_endpoint( + "call", + layer.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 10), + dtype=tf.float32, + ) + ], + ) + export_archive.write_out(temp_filepath) + revived_layer = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_output, revived_layer.call(ref_input).numpy(), atol=1e-6 + ) + + def test_multi_input_output_functional_model(self): + 
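+        # A model with several inputs takes a nested input_signature: a
+        # single positional entry that is a list (or dict) of TensorSpecs.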
temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + x1 = keras.Input((2,)) + x2 = keras.Input((2,)) + y1 = keras.layers.Dense(3)(x1) + y2 = keras.layers.Dense(3)(x2) + model = keras.Model([x1, x2], [y1, y2]) + + ref_inputs = [tf.random.normal((3, 2)), tf.random.normal((3, 2))] + ref_outputs = model(ref_inputs) + + export_archive = export_lib.ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + "serve", + model.call, + input_signature=[ + [ + tf.TensorSpec( + shape=(None, 2), + dtype=tf.float32, + ), + tf.TensorSpec( + shape=(None, 2), + dtype=tf.float32, + ), + ] + ], + ) + export_archive.write_out(temp_filepath) + revived_model = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_outputs[0].numpy(), + revived_model.serve(ref_inputs)[0].numpy(), + atol=1e-6, + ) + self.assertAllClose( + ref_outputs[1].numpy(), + revived_model.serve(ref_inputs)[1].numpy(), + atol=1e-6, + ) + + # Now test dict inputs + model = keras.Model({"x1": x1, "x2": x2}, [y1, y2]) + + ref_inputs = { + "x1": tf.random.normal((3, 2)), + "x2": tf.random.normal((3, 2)), + } + ref_outputs = model(ref_inputs) + + export_archive = export_lib.ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + "serve", + model.call, + input_signature=[ + { + "x1": tf.TensorSpec( + shape=(None, 2), + dtype=tf.float32, + ), + "x2": tf.TensorSpec( + shape=(None, 2), + dtype=tf.float32, + ), + } + ], + ) + export_archive.write_out(temp_filepath) + revived_model = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_outputs[0].numpy(), + revived_model.serve(ref_inputs)[0].numpy(), + atol=1e-6, + ) + self.assertAllClose( + ref_outputs[1].numpy(), + revived_model.serve(ref_inputs)[1].numpy(), + atol=1e-6, + ) + + def test_model_with_lookup_table(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + text_vectorization = keras.layers.TextVectorization() + text_vectorization.adapt(["one two", "three four", "five six"]) + model = keras.Sequential( + [ + text_vectorization, + keras.layers.Embedding(10, 32), + keras.layers.Dense(1), + ] + ) + ref_input = tf.convert_to_tensor(["one two three four"]) + ref_output = model(ref_input).numpy() + + export_lib.export_model(model, temp_filepath) + revived_model = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_output, revived_model.serve(ref_input).numpy(), atol=1e-6 + ) + + def test_track_multiple_layers(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + layer_1 = keras.layers.Dense(2) + ref_input_1 = tf.random.normal((3, 4)) + ref_output_1 = layer_1(ref_input_1).numpy() + layer_2 = keras.layers.Dense(3) + ref_input_2 = tf.random.normal((3, 5)) + ref_output_2 = layer_2(ref_input_2).numpy() + + export_archive = export_lib.ExportArchive() + export_archive.add_endpoint( + "call_1", + layer_1.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 4), + dtype=tf.float32, + ), + ], + ) + export_archive.add_endpoint( + "call_2", + layer_2.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 5), + dtype=tf.float32, + ), + ], + ) + export_archive.write_out(temp_filepath) + revived_layer = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_output_1, + revived_layer.call_1(ref_input_1).numpy(), + atol=1e-6, + ) + self.assertAllClose( + ref_output_2, + revived_layer.call_2(ref_input_2).numpy(), + atol=1e-6, + ) + + def test_non_standard_layer_signature(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_layer") + + layer = 
keras.layers.MultiHeadAttention(2, 2) + x1 = tf.random.normal((3, 2, 2)) + x2 = tf.random.normal((3, 2, 2)) + ref_output = layer(x1, x2).numpy() # Build layer (important) + export_archive = export_lib.ExportArchive() + export_archive.track(layer) + export_archive.add_endpoint( + "call", + layer.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 2, 2), + dtype=tf.float32, + ), + tf.TensorSpec( + shape=(None, 2, 2), + dtype=tf.float32, + ), + ], + ) + export_archive.write_out(temp_filepath) + revived_layer = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_output, + revived_layer.call(query=x1, value=x2).numpy(), + atol=1e-6, + ) + + def test_variable_collection(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + + model = keras.Sequential( + [ + keras.Input((10,)), + keras.layers.Dense(2), + keras.layers.Dense(2), + ] + ) + + # Test variable tracking + export_archive = export_lib.ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + "call", + model.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 10), + dtype=tf.float32, + ) + ], + ) + export_archive.add_variable_collection( + "my_vars", model.layers[1].weights + ) + self.assertLen(export_archive.my_vars, 2) + export_archive.write_out(temp_filepath) + revived_model = tf.saved_model.load(temp_filepath) + self.assertLen(revived_model.my_vars, 2) + + def test_export_model_errors(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + + # Model has not been built + model = keras.Sequential([keras.layers.Dense(2)]) + with self.assertRaisesRegex(ValueError, "It must be built"): + export_lib.export_model(model, temp_filepath) + + # Subclassed model has not been called + class MyModel(keras.Model): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense(2) + + def build(self, input_shape): + self.dense.build(input_shape) + self.built = True + + def call(self, x): + return self.dense(x) + + model = MyModel() + model.build((2, 3)) + with self.assertRaisesRegex(ValueError, "It must be called"): + export_lib.export_model(model, temp_filepath) + + def test_export_archive_errors(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + model = keras.Sequential([keras.layers.Dense(2)]) + model(tf.random.normal((2, 3))) + + # Endpoint name reuse + export_archive = export_lib.ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + "call", + model.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 3), + dtype=tf.float32, + ) + ], + ) + with self.assertRaisesRegex(ValueError, "already taken"): + export_archive.add_endpoint( + "call", + model.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 3), + dtype=tf.float32, + ) + ], + ) + + # Write out with no endpoints + export_archive = export_lib.ExportArchive() + export_archive.track(model) + with self.assertRaisesRegex(ValueError, "No endpoints have been set"): + export_archive.write_out(temp_filepath) + + # Invalid object type + with self.assertRaisesRegex(ValueError, "Invalid resource type"): + export_archive = export_lib.ExportArchive() + export_archive.track("model") + + # Set endpoint with no input signature + export_archive = export_lib.ExportArchive() + export_archive.track(model) + with self.assertRaisesRegex( + ValueError, "you must provide an `input_signature`" + ): + export_archive.add_endpoint( + "call", + model.call, + ) + + # Set endpoint that has never been called + export_archive = 
export_lib.ExportArchive() + export_archive.track(model) + + @tf.function() + def my_endpoint(x): + return model(x) + + export_archive = export_lib.ExportArchive() + export_archive.track(model) + with self.assertRaisesRegex( + ValueError, "you must either provide a function" + ): + export_archive.add_endpoint( + "call", + my_endpoint, + ) + + def test_export_no_assets(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + + # Case where there are legitimately no assets. + model = keras.Sequential([keras.layers.Flatten()]) + model(tf.random.normal((2, 3))) + export_archive = export_lib.ExportArchive() + export_archive.add_endpoint( + "call", + model.call, + input_signature=[ + tf.TensorSpec( + shape=(None, 3), + dtype=tf.float32, + ) + ], + ) + export_archive.write_out(temp_filepath) + + @test_combinations.run_with_all_model_types + def test_model_export_method(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + model = get_model() + ref_input = tf.random.normal((3, 10)) + ref_output = model(ref_input).numpy() + + model.export(temp_filepath) + revived_model = tf.saved_model.load(temp_filepath) + self.assertAllClose( + ref_output, revived_model.serve(ref_input).numpy(), atol=1e-6 + ) + + +@test_utils.run_v2_only +class TestReloadedLayer(tf.test.TestCase, parameterized.TestCase): + @test_combinations.run_with_all_model_types + def test_reloading_export_archive(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + model = get_model() + ref_input = tf.random.normal((3, 10)) + ref_output = model(ref_input).numpy() + + export_lib.export_model(model, temp_filepath) + reloaded_layer = export_lib.ReloadedLayer(temp_filepath) + self.assertAllClose( + reloaded_layer(ref_input).numpy(), ref_output, atol=1e-7 + ) + self.assertLen(reloaded_layer.weights, len(model.weights)) + self.assertLen( + reloaded_layer.trainable_weights, len(model.trainable_weights) + ) + self.assertLen( + reloaded_layer.non_trainable_weights, + len(model.non_trainable_weights), + ) + + # Test fine-tuning + new_model = keras.Sequential([reloaded_layer]) + new_model.compile(optimizer="rmsprop", loss="mse") + x = tf.random.normal((32, 10)) + y = tf.random.normal((32, 1)) + new_model.train_on_batch(x, y) + new_output = reloaded_layer(ref_input).numpy() + self.assertNotAllClose(new_output, ref_output, atol=1e-5) + + # Test that trainable can be set to False + reloaded_layer.trainable = False + new_model.compile(optimizer="rmsprop", loss="mse") + x = tf.random.normal((32, 10)) + y = tf.random.normal((32, 1)) + new_model.train_on_batch(x, y) + # The output must not have changed + self.assertAllClose( + reloaded_layer(ref_input).numpy(), new_output, atol=1e-7 + ) + + @test_combinations.run_with_all_model_types + def test_reloading_default_saved_model(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + model = get_model() + ref_input = tf.random.normal((3, 10)) + ref_output = model(ref_input).numpy() + + tf.saved_model.save(model, temp_filepath) + reloaded_layer = export_lib.ReloadedLayer( + temp_filepath, call_endpoint="serving_default" + ) + # The output is a dict, due to the nature of SavedModel saving. 
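+        # ("serving_default" signatures map each output name to a tensor,
+        # hence the lookup by key below.)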
+ new_output = reloaded_layer(ref_input) + self.assertAllClose( + new_output[list(new_output.keys())[0]].numpy(), + ref_output, + atol=1e-7, + ) + self.assertLen(reloaded_layer.weights, len(model.weights)) + self.assertLen( + reloaded_layer.trainable_weights, len(model.trainable_weights) + ) + self.assertLen( + reloaded_layer.non_trainable_weights, + len(model.non_trainable_weights), + ) + + def test_call_training(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + keras.utils.set_random_seed(1337) + model = keras.Sequential( + [ + keras.Input((10,)), + keras.layers.Dense(10), + keras.layers.Dropout(0.99999), + ] + ) + export_archive = export_lib.ExportArchive() + export_archive.track(model) + export_archive.add_endpoint( + name="call_inference", + fn=lambda x: model(x, training=False), + input_signature=[tf.TensorSpec(shape=(None, 10), dtype=tf.float32)], + ) + export_archive.add_endpoint( + name="call_training", + fn=lambda x: model(x, training=True), + input_signature=[tf.TensorSpec(shape=(None, 10), dtype=tf.float32)], + ) + export_archive.write_out(temp_filepath) + reloaded_layer = export_lib.ReloadedLayer( + temp_filepath, + call_endpoint="call_inference", + call_training_endpoint="call_training", + ) + inference_output = reloaded_layer( + tf.random.normal((1, 10)), training=False + ) + training_output = reloaded_layer( + tf.random.normal((1, 10)), training=True + ) + self.assertAllClose(np.mean(training_output), 0.0, atol=1e-7) + self.assertNotAllClose(np.mean(inference_output), 0.0, atol=1e-7) + + @test_combinations.run_with_all_model_types + def test_serialization(self): + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + model = get_model() + ref_input = tf.random.normal((3, 10)) + ref_output = model(ref_input).numpy() + + export_lib.export_model(model, temp_filepath) + reloaded_layer = export_lib.ReloadedLayer(temp_filepath) + + # Test reinstantiation from config + config = reloaded_layer.get_config() + rereloaded_layer = export_lib.ReloadedLayer.from_config(config) + self.assertAllClose( + rereloaded_layer(ref_input).numpy(), ref_output, atol=1e-7 + ) + + # Test whole model saving with reloaded layer inside + model = keras.Sequential([reloaded_layer]) + temp_model_filepath = os.path.join(self.get_temp_dir(), "m.keras") + model.save(temp_model_filepath, save_format="keras_v3") + reloaded_model = keras.models.load_model( + temp_model_filepath, + custom_objects={"ReloadedLayer": export_lib.ReloadedLayer}, + ) + self.assertAllClose( + reloaded_model(ref_input).numpy(), ref_output, atol=1e-7 + ) + + def test_errors(self): + # Test missing call endpoint + temp_filepath = os.path.join(self.get_temp_dir(), "exported_model") + model = keras.Sequential([keras.Input((2,)), keras.layers.Dense(3)]) + export_lib.export_model(model, temp_filepath) + with self.assertRaisesRegex(ValueError, "The endpoint 'wrong'"): + export_lib.ReloadedLayer(temp_filepath, call_endpoint="wrong") + + # Test missing call training endpoint + with self.assertRaisesRegex(ValueError, "The endpoint 'wrong'"): + export_lib.ReloadedLayer( + temp_filepath, + call_endpoint="serve", + call_training_endpoint="wrong", + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/feature_column/BUILD b/keras/feature_column/BUILD index e9eb317b72b5..6684bc5dafcc 100644 --- a/keras/feature_column/BUILD +++ b/keras/feature_column/BUILD @@ -1,6 +1,8 @@ +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment 
default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/python/feature_column:__subpackages__", # For unit testing diff --git a/keras/feature_column/base_feature_layer.py b/keras/feature_column/base_feature_layer.py index 3e44981260d0..085ccc6c3b55 100644 --- a/keras/feature_column/base_feature_layer.py +++ b/keras/feature_column/base_feature_layer.py @@ -21,196 +21,222 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import collections import re + +import tensorflow.compat.v2 as tf + from keras.engine.base_layer import Layer -from keras.utils import generic_utils +from keras.saving import serialization_lib class _BaseFeaturesLayer(Layer): - """Base class for DenseFeatures and SequenceFeatures. - - Defines common methods and helpers. - - Args: - feature_columns: An iterable containing the FeatureColumns to use as - inputs to your model. - expected_column_type: Expected class for provided feature columns. - trainable: Boolean, whether the layer's variables will be updated via - gradient descent during training. - name: Name to give to the DenseFeatures. - **kwargs: Keyword arguments to construct a layer. - - Raises: - ValueError: if an item in `feature_columns` doesn't match - `expected_column_type`. - """ - - def __init__(self, - feature_columns, - expected_column_type, - trainable, - name, - partitioner=None, - **kwargs): - super().__init__( - name=name, trainable=trainable, **kwargs) - self._feature_columns = _normalize_feature_columns( - feature_columns) - self._state_manager = tf.__internal__.feature_column.StateManager( # pylint: disable=protected-access - self, self.trainable) - self._partitioner = partitioner - for column in self._feature_columns: - if not isinstance(column, expected_column_type): - raise ValueError( - 'Items of feature_columns must be a {}. ' - 'You can wrap a categorical column with an ' - 'embedding_column or indicator_column. Given: {}'.format( - expected_column_type, column)) - - def build(self, _): - for column in self._feature_columns: - with tf.compat.v1.variable_scope( - self.name, partitioner=self._partitioner): - with tf.compat.v1.variable_scope( - _sanitize_column_name_for_variable_scope(column.name)): - column.create_state(self._state_manager) - super().build(None) - - def _output_shape(self, input_shape, num_elements): - """Computes expected output shape of the layer or a column's dense tensor. + """Base class for DenseFeatures and SequenceFeatures. - Args: - input_shape: Tensor or array with batch shape. - num_elements: Size of the last dimension of the output. + Defines common methods and helpers. - Returns: - Tuple with output shape. + Args: + feature_columns: An iterable containing the FeatureColumns to use as + inputs to your model. + expected_column_type: Expected class for provided feature columns. + trainable: Boolean, whether the layer's variables will be updated via + gradient descent during training. + name: Name to give to the DenseFeatures. + **kwargs: Keyword arguments to construct a layer. + + Raises: + ValueError: if an item in `feature_columns` doesn't match + `expected_column_type`. 
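+
+    For example, `DenseFeatures` passes a dense `expected_column_type`, so
+    handing it a plain categorical column raises a `ValueError` suggesting
+    an `embedding_column` or `indicator_column` wrapper instead.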
""" - raise NotImplementedError('Calling an abstract method.') - - def compute_output_shape(self, input_shape): - total_elements = 0 - for column in self._feature_columns: - total_elements += column.variable_shape.num_elements() - return self._target_shape(input_shape, total_elements) - def _process_dense_tensor(self, column, tensor): - """Reshapes the dense tensor output of a column based on expected shape. + def __init__( + self, + feature_columns, + expected_column_type, + trainable, + name, + partitioner=None, + **kwargs + ): + super().__init__(name=name, trainable=trainable, **kwargs) + self._feature_columns = _normalize_feature_columns(feature_columns) + self._state_manager = tf.__internal__.feature_column.StateManager( + self, self.trainable + ) + self._partitioner = partitioner + for column in self._feature_columns: + if not isinstance(column, expected_column_type): + raise ValueError( + "Items of feature_columns must be a {}. " + "You can wrap a categorical column with an " + "embedding_column or indicator_column. Given: {}".format( + expected_column_type, column + ) + ) + + def build(self, _): + for column in self._feature_columns: + with tf.compat.v1.variable_scope( + self.name, partitioner=self._partitioner + ): + with tf.compat.v1.variable_scope( + _sanitize_column_name_for_variable_scope(column.name) + ): + column.create_state(self._state_manager) + super().build(None) + + def _output_shape(self, input_shape, num_elements): + """Computes expected output shape of the dense tensor of the layer. + + Args: + input_shape: Tensor or array with batch shape. + num_elements: Size of the last dimension of the output. + + Returns: + Tuple with output shape. + """ + raise NotImplementedError("Calling an abstract method.") + + def compute_output_shape(self, input_shape): + total_elements = 0 + for column in self._feature_columns: + total_elements += column.variable_shape.num_elements() + return self._target_shape(input_shape, total_elements) + + def _process_dense_tensor(self, column, tensor): + """Reshapes the dense tensor output of a column based on expected shape. + + Args: + column: A DenseColumn or SequenceDenseColumn object. + tensor: A dense tensor obtained from the same column. + + Returns: + Reshaped dense tensor. 
+ """ + num_elements = column.variable_shape.num_elements() + target_shape = self._target_shape(tf.shape(tensor), num_elements) + return tf.reshape(tensor, shape=target_shape) + + def _verify_and_concat_tensors(self, output_tensors): + """Verifies and concatenates the dense output of several columns.""" + _verify_static_batch_size_equality( + output_tensors, self._feature_columns + ) + return tf.concat(output_tensors, -1) + + def get_config(self): + column_configs = [ + tf.__internal__.feature_column.serialize_feature_column(fc) + for fc in self._feature_columns + ] + config = {"feature_columns": column_configs} + config["partitioner"] = serialization_lib.serialize_keras_object( + self._partitioner + ) + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + config_cp = config.copy() + columns_by_name = {} + config_cp["feature_columns"] = [ + tf.__internal__.feature_column.deserialize_feature_column( + c, custom_objects, columns_by_name + ) + for c in config["feature_columns"] + ] + config_cp["partitioner"] = serialization_lib.deserialize_keras_object( + config["partitioner"], custom_objects + ) + + return cls(**config_cp) - Args: - column: A DenseColumn or SequenceDenseColumn object. - tensor: A dense tensor obtained from the same column. - - Returns: - Reshaped dense tensor. - """ - num_elements = column.variable_shape.num_elements() - target_shape = self._target_shape(tf.shape(tensor), num_elements) - return tf.reshape(tensor, shape=target_shape) - def _verify_and_concat_tensors(self, output_tensors): - """Verifies and concatenates the dense output of several columns.""" - _verify_static_batch_size_equality(output_tensors, self._feature_columns) - return tf.concat(output_tensors, -1) +def _sanitize_column_name_for_variable_scope(name): + """Sanitizes user-provided feature names for use as variable scopes.""" + invalid_char = re.compile("[^A-Za-z0-9_.\\-]") + return invalid_char.sub("_", name) - def get_config(self): - column_configs = [tf.__internal__.feature_column.serialize_feature_column(fc) - for fc in self._feature_columns] - config = {'feature_columns': column_configs} - config['partitioner'] = generic_utils.serialize_keras_object( - self._partitioner) - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) +def _verify_static_batch_size_equality(tensors, columns): + """Verify equality between static batch sizes. - @classmethod - def from_config(cls, config, custom_objects=None): - config_cp = config.copy() - columns_by_name = {} - config_cp['feature_columns'] = [tf.__internal__.feature_column.deserialize_feature_column( - c, custom_objects, columns_by_name) for c in config['feature_columns']] - config_cp['partitioner'] = generic_utils.deserialize_keras_object( - config['partitioner'], custom_objects) + Args: + tensors: iterable of input tensors. + columns: Corresponding feature columns. - return cls(**config_cp) + Raises: + ValueError: in case of mismatched batch sizes. + """ + expected_batch_size = None + for i in range(0, len(tensors)): + # bath_size is a Dimension object. 
-  def get_config(self):
-    column_configs = [tf.__internal__.feature_column.serialize_feature_column(fc)
-                      for fc in self._feature_columns]
-    config = {'feature_columns': column_configs}
-    config['partitioner'] = generic_utils.serialize_keras_object(
-        self._partitioner)
-
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+def _verify_static_batch_size_equality(tensors, columns):
+    """Verify equality between static batch sizes.
-  @classmethod
-  def from_config(cls, config, custom_objects=None):
-    config_cp = config.copy()
-    columns_by_name = {}
-    config_cp['feature_columns'] = [tf.__internal__.feature_column.deserialize_feature_column(
-        c, custom_objects, columns_by_name) for c in config['feature_columns']]
-    config_cp['partitioner'] = generic_utils.deserialize_keras_object(
-        config['partitioner'], custom_objects)
+
+    Args:
+        tensors: iterable of input tensors.
+        columns: Corresponding feature columns.
-    return cls(**config_cp)
+
+    Raises:
+        ValueError: in case of mismatched batch sizes.
+    """
+    expected_batch_size = None
+    for i in range(0, len(tensors)):
+        # batch_size is a Dimension object.
+        batch_size = tf.compat.v1.Dimension(
+            tf.compat.dimension_value(tensors[i].shape[0])
+        )
+        if batch_size.value is not None:
+            if expected_batch_size is None:
+                batch_size_column_index = i
+                expected_batch_size = batch_size
+            elif not expected_batch_size.is_compatible_with(batch_size):
+                raise ValueError(
+                    "Batch size (first dimension) of each feature must be "
+                    "same. Batch size of columns ({}, {}): ({}, {})".format(
+                        columns[batch_size_column_index].name,
+                        columns[i].name,
+                        expected_batch_size,
+                        batch_size,
+                    )
+                )
-def _sanitize_column_name_for_variable_scope(name):
-  """Sanitizes user-provided feature names for use as variable scopes."""
-  invalid_char = re.compile('[^A-Za-z0-9_.\\-]')
-  return invalid_char.sub('_', name)
+def _normalize_feature_columns(feature_columns):
+    """Normalizes the `feature_columns` input.
+
+    This method converts the `feature_columns` to list type as best as it can.
+    In addition, verifies the type and other parts of feature_columns, required
+    by downstream library.
-def _verify_static_batch_size_equality(tensors, columns):
-  """Verify equality between static batch sizes.
-
-  Args:
-    tensors: iterable of input tensors.
-    columns: Corresponding feature columns.
-
-  Raises:
-    ValueError: in case of mismatched batch sizes.
-  """
-  expected_batch_size = None
-  for i in range(0, len(tensors)):
-    # bath_size is a Dimension object.
-    batch_size = tf.compat.v1.Dimension(tf.compat.dimension_value(
-        tensors[i].shape[0]))
-    if batch_size.value is not None:
-      if expected_batch_size is None:
-        bath_size_column_index = i
-        expected_batch_size = batch_size
-      elif not expected_batch_size.is_compatible_with(batch_size):
-        raise ValueError(
-            'Batch size (first dimension) of each feature must be same. '
-            'Batch size of columns ({}, {}): ({}, {})'.format(
-                columns[bath_size_column_index].name, columns[i].name,
-                expected_batch_size, batch_size))
+
+    Args:
+        feature_columns: The raw feature columns, usually passed by users.
+
+    Returns:
+        The normalized feature column list.
-def _normalize_feature_columns(feature_columns):
-  """Normalizes the `feature_columns` input.
-
-  This method converts the `feature_columns` to list type as best as it can. In
-  addition, verifies the type and other parts of feature_columns, required by
-  downstream library.
-
-  Args:
-    feature_columns: The raw feature columns, usually passed by users.
-
-  Returns:
-    The normalized feature column list.
-
-  Raises:
-    ValueError: for any invalid inputs, such as empty, duplicated names, etc.
-  """
-  if isinstance(feature_columns, tf.__internal__.feature_column.FeatureColumn):
-    feature_columns = [feature_columns]
-
-  if isinstance(feature_columns, collections.abc.Iterator):
-    feature_columns = list(feature_columns)
-
-  if isinstance(feature_columns, dict):
-    raise ValueError('Expected feature_columns to be iterable, found dict.')
-
-  for column in feature_columns:
-    if not isinstance(column, tf.__internal__.feature_column.FeatureColumn):
-      raise ValueError('Items of feature_columns must be a FeatureColumn. '
-                       'Given (type {}): {}.'.format(type(column), column))
-  if not feature_columns:
-    raise ValueError('feature_columns must not be empty.')
-  name_to_column = {}
-  for column in feature_columns:
-    if column.name in name_to_column:
-      raise ValueError('Duplicate feature column name found for columns: {} '
-                       'and {}. This usually means that these columns refer to '
-                       'same base feature.
Either one must be discarded or a ' - 'duplicated but renamed item must be inserted in ' - 'features dict.'.format(column, - name_to_column[column.name])) - name_to_column[column.name] = column - - return sorted(feature_columns, key=lambda x: x.name) + Raises: + ValueError: for any invalid inputs, such as empty, duplicated names, etc. + """ + if isinstance( + feature_columns, tf.__internal__.feature_column.FeatureColumn + ): + feature_columns = [feature_columns] + + if isinstance(feature_columns, collections.abc.Iterator): + feature_columns = list(feature_columns) + + if isinstance(feature_columns, dict): + raise ValueError("Expected feature_columns to be iterable, found dict.") + + for column in feature_columns: + if not isinstance(column, tf.__internal__.feature_column.FeatureColumn): + raise ValueError( + "Items of feature_columns must be a FeatureColumn. " + "Given (type {}): {}.".format(type(column), column) + ) + if not feature_columns: + raise ValueError("feature_columns must not be empty.") + name_to_column = {} + for column in feature_columns: + if column.name in name_to_column: + raise ValueError( + "Duplicate feature column name found for columns: {} " + "and {}. This usually means that these columns refer to " + "same base feature. Either one must be discarded or a " + "duplicated but renamed item must be inserted in " + "features dict.".format(column, name_to_column[column.name]) + ) + name_to_column[column.name] = column + + return sorted(feature_columns, key=lambda x: x.name) diff --git a/keras/feature_column/dense_features.py b/keras/feature_column/dense_features.py index 9c2b4e868104..f5ae664581cc 100644 --- a/keras/feature_column/dense_features.py +++ b/keras/feature_column/dense_features.py @@ -18,157 +18,174 @@ from __future__ import division from __future__ import print_function +import json + import tensorflow.compat.v2 as tf -import json from keras import backend from keras.feature_column import base_feature_layer as kfc -from keras.saving.saved_model import json_utils +from keras.saving.legacy.saved_model import json_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export(v1=['keras.layers.DenseFeatures']) -class DenseFeatures(kfc._BaseFeaturesLayer): # pylint: disable=protected-access - """A layer that produces a dense `Tensor` based on given `feature_columns`. - - Generally a single example in training data is described with FeatureColumns. - At the first layer of the model, this column-oriented data should be converted - to a single `Tensor`. - - This layer can be called multiple times with different features. - - This is the V1 version of this layer that uses variable_scope's or partitioner - to create variables which works well with PartitionedVariables. Variable - scopes are deprecated in V2, so the V2 version uses name_scopes instead. But - currently that lacks support for partitioned variables. Use this if you need - partitioned variables. Use the partitioner argument if you have a Keras model - and uses `tf.compat.v1.keras.estimator.model_to_estimator` for training. - - Example: - - ```python - price = tf.feature_column.numeric_column('price') - keywords_embedded = tf.feature_column.embedding_column( - tf.feature_column.categorical_column_with_hash_bucket("keywords", 10K), - dimension=16) - columns = [price, keywords_embedded, ...] 
-  partitioner = tf.compat.v1.fixed_size_partitioner(num_shards=4)
-  feature_layer = tf.compat.v1.keras.layers.DenseFeatures(
-      feature_columns=columns, partitioner=partitioner)
-
-  features = tf.io.parse_example(
-      ..., features=tf.feature_column.make_parse_example_spec(columns))
-  dense_tensor = feature_layer(features)
-  for units in [128, 64, 32]:
-    dense_tensor = tf.compat.v1.keras.layers.Dense(
-        units, activation='relu')(dense_tensor)
-  prediction = tf.compat.v1.keras.layers.Dense(1)(dense_tensor)
-  ```
-  """
-
-  def __init__(self,
-               feature_columns,
-               trainable=True,
-               name=None,
-               partitioner=None,
-               **kwargs):
-    """Constructs a DenseFeatures layer.
-
-    Args:
-      feature_columns: An iterable containing the FeatureColumns to use as
-        inputs to your model. All items should be instances of classes derived
-        from `DenseColumn` such as `numeric_column`, `embedding_column`,
-        `bucketized_column`, `indicator_column`. If you have categorical
-        features, you can wrap them with an `embedding_column` or
-        `indicator_column`.
-      trainable: Boolean, whether the layer's variables will be updated via
-        gradient descent during training.
-      name: Name to give to the DenseFeatures.
-      partitioner: Partitioner for input layer. Defaults to None.
-      **kwargs: Keyword arguments to construct a layer.
-
-    Raises:
-      ValueError: if an item in `feature_columns` is not a `DenseColumn`.
+@keras_export(v1=["keras.layers.DenseFeatures"])
+class DenseFeatures(kfc._BaseFeaturesLayer):
+    """A layer that produces a dense `Tensor` based on given `feature_columns`.
+
+    Generally a single example in training data is described with
+    FeatureColumns. At the first layer of the model, this column-oriented data
+    should be converted to a single `Tensor`.
+
+    This layer can be called multiple times with different features.
+
+    This is the V1 version of this layer that uses variable_scopes or a
+    partitioner to create variables, which works well with
+    PartitionedVariables. Variable scopes are deprecated in V2, so the V2
+    version uses name_scopes instead. But currently that lacks support for
+    partitioned variables. Use this if you need partitioned variables. Use the
+    partitioner argument if you have a Keras model and use
+    `tf.compat.v1.keras.estimator.model_to_estimator` for training.
+
+    Example:
+
+    ```python
+    price = tf.feature_column.numeric_column('price')
+    keywords_embedded = tf.feature_column.embedding_column(
+        tf.feature_column.categorical_column_with_hash_bucket(
+            "keywords", 10000),
+        dimension=16)
+    columns = [price, keywords_embedded, ...]
+    partitioner = tf.compat.v1.fixed_size_partitioner(num_shards=4)
+    feature_layer = tf.compat.v1.keras.layers.DenseFeatures(
+        feature_columns=columns, partitioner=partitioner)
+
+    features = tf.io.parse_example(
+        ..., features=tf.feature_column.make_parse_example_spec(columns))
+    dense_tensor = feature_layer(features)
+    for units in [128, 64, 32]:
+        dense_tensor = tf.compat.v1.keras.layers.Dense(
+            units, activation='relu')(dense_tensor)
+    prediction = tf.compat.v1.keras.layers.Dense(1)(dense_tensor)
+    ```
+    """
-    super().__init__(
-        feature_columns=feature_columns,
-        trainable=trainable,
-        name=name,
-        partitioner=partitioner,
-        expected_column_type=tf.__internal__.feature_column.DenseColumn,
-        **kwargs)
-
-  @property
-  def _is_feature_layer(self):
-    return True
-
-  @property
-  def _tracking_metadata(self):
-    """String stored in metadata field in the SavedModel proto.
-
-    Returns:
-      A serialized JSON storing information necessary for recreating this layer.
- """ - metadata = json.loads(super()._tracking_metadata) - metadata['_is_feature_layer'] = True - return json.dumps(metadata, default=json_utils.get_json_type) - - def _target_shape(self, input_shape, total_elements): - return (input_shape[0], total_elements) - - def call(self, features, cols_to_output_tensors=None, training=None): - """Returns a dense tensor corresponding to the `feature_columns`. - - Example usage: - - >>> t1 = tf.feature_column.embedding_column( - ... tf.feature_column.categorical_column_with_hash_bucket("t1", 2), - ... dimension=8) - >>> t2 = tf.feature_column.numeric_column('t2') - >>> feature_layer = tf.compat.v1.keras.layers.DenseFeatures([t1, t2]) - >>> features = {"t1": tf.constant(["a", "b"]), "t2": tf.constant([1, 2])} - >>> dense_tensor = feature_layer(features, training=True) - - Args: - features: A mapping from key to tensors. `FeatureColumn`s look up via - these keys. For example `numeric_column('price')` will look at 'price' - key in this dict. Values can be a `SparseTensor` or a `Tensor` depends - on corresponding `FeatureColumn`. - cols_to_output_tensors: If not `None`, this will be filled with a dict - mapping feature columns to output tensors created. - training: Python boolean or None, indicating whether to the layer is being - run in training mode. This argument is passed to the call method of any - `FeatureColumn` that takes a `training` argument. For example, if a - `FeatureColumn` performed dropout, the column could expose a `training` - argument to control whether the dropout should be applied. If `None`, - defaults to `tf.keras.backend.learning_phase()`. - - - Returns: - A `Tensor` which represents input layer of a model. Its shape - is (batch_size, first_layer_dimension) and its dtype is `float32`. - first_layer_dimension is determined based on given `feature_columns`. - - Raises: - ValueError: If features are not a dictionary. - """ - if training is None: - training = backend.learning_phase() - if not isinstance(features, dict): - raise ValueError('We expected a dictionary here. Instead we got: ', - features) - transformation_cache = tf.__internal__.feature_column.FeatureTransformationCache(features) - output_tensors = [] - for column in self._feature_columns: - with backend.name_scope(column.name): - try: - tensor = column.get_dense_tensor( - transformation_cache, self._state_manager, training=training) - except TypeError: - tensor = column.get_dense_tensor(transformation_cache, - self._state_manager) - processed_tensors = self._process_dense_tensor(column, tensor) - if cols_to_output_tensors is not None: - cols_to_output_tensors[column] = processed_tensors - output_tensors.append(processed_tensors) - return self._verify_and_concat_tensors(output_tensors) + + def __init__( + self, + feature_columns, + trainable=True, + name=None, + partitioner=None, + **kwargs + ): + """Constructs a DenseFeatures layer. + + Args: + feature_columns: An iterable containing the FeatureColumns to use as + inputs to your model. All items should be instances of classes + derived from `DenseColumn` such as `numeric_column`, + `embedding_column`, `bucketized_column`, `indicator_column`. If you + have categorical features, you can wrap them with an + `embedding_column` or `indicator_column`. + trainable: Boolean, whether the layer's variables will be updated via + gradient descent during training. + name: Name to give to the DenseFeatures. + partitioner: Partitioner for input layer. Defaults to `None`. + **kwargs: Keyword arguments to construct a layer. 
+
+        Raises:
+            ValueError: if an item in `feature_columns` is not a `DenseColumn`.
+        """
+        super().__init__(
+            feature_columns=feature_columns,
+            trainable=trainable,
+            name=name,
+            partitioner=partitioner,
+            expected_column_type=tf.__internal__.feature_column.DenseColumn,
+            **kwargs
+        )
+
+    @property
+    def _is_feature_layer(self):
+        return True
+
+    @property
+    def _tracking_metadata(self):
+        """String stored in metadata field in the SavedModel proto.
+
+        Returns:
+            A serialized JSON storing information necessary for recreating
+            this layer.
+        """
+        metadata = json.loads(super()._tracking_metadata)
+        metadata["_is_feature_layer"] = True
+        return json.dumps(metadata, default=json_utils.get_json_type)
+
+    def _target_shape(self, input_shape, total_elements):
+        return (input_shape[0], total_elements)
+
+    def call(self, features, cols_to_output_tensors=None, training=None):
+        """Returns a dense tensor corresponding to the `feature_columns`.
+
+        Example usage:
+
+        >>> t1 = tf.feature_column.embedding_column(
+        ...    tf.feature_column.categorical_column_with_hash_bucket("t1", 2),
+        ...    dimension=8)
+        >>> t2 = tf.feature_column.numeric_column('t2')
+        >>> feature_layer = tf.compat.v1.keras.layers.DenseFeatures([t1, t2])
+        >>> features = {"t1": tf.constant(["a", "b"]),
+        ...             "t2": tf.constant([1, 2])}
+        >>> dense_tensor = feature_layer(features, training=True)
+
+        Args:
+            features: A mapping from key to tensors. `FeatureColumn`s look up
+                via these keys. For example `numeric_column('price')` will
+                look at the 'price' key in this dict. Values can be a
+                `SparseTensor` or a `Tensor`, depending on the corresponding
+                `FeatureColumn`.
+            cols_to_output_tensors: If not `None`, this will be filled with a
+                dict mapping feature columns to output tensors created.
+            training: Python boolean or None, indicating whether the layer is
+                being run in training mode. This argument is passed to the
+                call method of any `FeatureColumn` that takes a `training`
+                argument. For example, if a `FeatureColumn` performed dropout,
+                the column could expose a `training` argument to control
+                whether the dropout should be applied. If `None`, becomes
+                `tf.keras.backend.learning_phase()`. Defaults to `None`.
+
+        Returns:
+            A `Tensor` which represents the input layer of a model. Its shape
+            is (batch_size, first_layer_dimension) and its dtype is `float32`.
+            first_layer_dimension is determined based on given
+            `feature_columns`.
+
+        Raises:
+            ValueError: If features are not a dictionary.
+        """
+        if training is None:
+            training = backend.learning_phase()
+        if not isinstance(features, dict):
+            raise ValueError(
+                "We expected a dictionary here.
Instead we got: ", features + ) + transformation_cache = ( + tf.__internal__.feature_column.FeatureTransformationCache(features) + ) + output_tensors = [] + for column in self._feature_columns: + with backend.name_scope(column.name): + try: + tensor = column.get_dense_tensor( + transformation_cache, + self._state_manager, + training=training, + ) + except TypeError: + tensor = column.get_dense_tensor( + transformation_cache, self._state_manager + ) + processed_tensors = self._process_dense_tensor(column, tensor) + if cols_to_output_tensors is not None: + cols_to_output_tensors[column] = processed_tensors + output_tensors.append(processed_tensors) + return self._verify_and_concat_tensors(output_tensors) diff --git a/keras/feature_column/dense_features_test.py b/keras/feature_column/dense_features_test.py index 135cb3270bb5..a89c0f2566b4 100644 --- a/keras/feature_column/dense_features_test.py +++ b/keras/feature_column/dense_features_test.py @@ -18,1129 +18,1357 @@ from __future__ import division from __future__ import print_function +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np -from tensorflow.python.eager import backprop -from tensorflow.python.framework import test_util as tf_test_utils -from keras.testing_infra import test_combinations + from keras.feature_column import dense_features as df +from keras.testing_infra import test_combinations + +# isort: off +from tensorflow.python.eager import backprop +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) def _initialized_session(config=None): - sess = tf.compat.v1.Session(config=config) - sess.run(tf.compat.v1.global_variables_initializer()) - sess.run(tf.compat.v1.tables_initializer()) - return sess + sess = tf.compat.v1.Session(config=config) + sess.run(tf.compat.v1.global_variables_initializer()) + sess.run(tf.compat.v1.tables_initializer()) + return sess class DenseFeaturesTest(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_retrieving_input(self): - features = {'a': [0.]} - dense_features = df.DenseFeatures(tf.feature_column.numeric_column('a')) - inputs = self.evaluate(dense_features(features)) - self.assertAllClose([[0.]], inputs) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_reuses_variables(self): - sparse_input = tf.SparseTensor( - indices=((0, 0), (1, 0), (2, 0)), values=(0, 1, 2), dense_shape=(3, 3)) - - # Create feature columns (categorical and embedding). - categorical_column = tf.feature_column.categorical_column_with_identity( - key='a', num_buckets=3) - embedding_dimension = 2 - - def _embedding_column_initializer(shape, dtype, partition_info=None): - del shape # unused - del dtype # unused - del partition_info # unused - embedding_values = ( - (1, 0), # id 0 - (0, 1), # id 1 - (1, 1)) # id 2 - return embedding_values - - embedding_column = tf.feature_column.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_embedding_column_initializer) - - dense_features = df.DenseFeatures([embedding_column]) - features = {'a': sparse_input} - - inputs = dense_features(features) - variables = dense_features.variables - - # Sanity check: test that the inputs are correct. - self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs) - - # Check that only one variable was created. 
- self.assertEqual(1, len(variables)) - - # Check that invoking dense_features on the same features does not create - # additional variables - _ = dense_features(features) - self.assertEqual(1, len(variables)) - self.assertIs(variables[0], dense_features.variables[0]) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_dense_feature_with_partitioner(self): - sparse_input = tf.SparseTensor( - indices=((0, 0), (1, 0), (2, 0), (3, 0)), - values=(0, 1, 3, 2), - dense_shape=(4, 4)) - - # Create feature columns (categorical and embedding). - categorical_column = tf.feature_column.categorical_column_with_identity( - key='a', num_buckets=4) - embedding_dimension = 2 - - def _embedding_column_initializer(shape, dtype, partition_info=None): - offset = partition_info._var_offset[0] - del shape # unused - del dtype # unused - if offset == 0: - embedding_values = ( - (1, 0), # id 0 - (0, 1)) # id 1 - else: - embedding_values = ( - (1, 1), # id 2 - (2, 2)) # id 3 - return embedding_values - - embedding_column = tf.feature_column.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_embedding_column_initializer) - - dense_features = df.DenseFeatures( - [embedding_column], partitioner=tf.compat.v1.fixed_size_partitioner(2)) - features = {'a': sparse_input} - - inputs = dense_features(features) - variables = dense_features.variables - - # Sanity check: test that the inputs are correct. - self.assertAllEqual([[1, 0], [0, 1], [2, 2], [1, 1]], inputs) - - # Check that only one variable was created. - self.assertEqual(2, len(variables)) - - # Check that invoking dense_features on the same features does not create - # additional variables - _ = dense_features(features) - self.assertEqual(2, len(variables)) - self.assertIs(variables[0], dense_features.variables[0]) - self.assertIs(variables[1], dense_features.variables[1]) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_feature_column_dense_features_gradient(self): - sparse_input = tf.SparseTensor( - indices=((0, 0), (1, 0), (2, 0)), values=(0, 1, 2), dense_shape=(3, 3)) - - # Create feature columns (categorical and embedding). - categorical_column = tf.feature_column.categorical_column_with_identity( - key='a', num_buckets=3) - embedding_dimension = 2 - - def _embedding_column_initializer(shape, dtype, partition_info=None): - del shape # unused - del dtype # unused - del partition_info # unused - embedding_values = ( - (1, 0), # id 0 - (0, 1), # id 1 - (1, 1)) # id 2 - return embedding_values - - embedding_column = tf.feature_column.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_embedding_column_initializer) - - dense_features = df.DenseFeatures([embedding_column]) - features = {'a': sparse_input} - - def scale_matrix(): - matrix = dense_features(features) - return 2 * matrix - - # Sanity check: Verify that scale_matrix returns the correct output. - self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix()) - - # Check that the returned gradient is correct. 
- grad_function = backprop.implicit_grad(scale_matrix) - grads_and_vars = grad_function() - indexed_slice = grads_and_vars[0][0] - gradient = grads_and_vars[0][0].values - - self.assertAllEqual([0, 1, 2], indexed_slice.indices) - self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient) - - def test_raises_if_empty_feature_columns(self): - with self.assertRaisesRegex(ValueError, - 'feature_columns must not be empty'): - df.DenseFeatures(feature_columns=[])(features={}) - - def test_should_be_dense_column(self): - with self.assertRaisesRegex(ValueError, 'must be a .*DenseColumn'): - df.DenseFeatures(feature_columns=[ - tf.feature_column.categorical_column_with_hash_bucket('wire_cast', 4) - ])( - features={ - 'a': [[0]] - }) - - def test_does_not_support_dict_columns(self): - with self.assertRaisesRegex( - ValueError, 'Expected feature_columns to be iterable, found dict.'): - df.DenseFeatures( - feature_columns={'a': tf.feature_column.numeric_column('a')})( - features={ - 'a': [[0]] - }) - - def test_bare_column(self): - with tf.Graph().as_default(): - features = features = {'a': [0.]} - net = df.DenseFeatures(tf.feature_column.numeric_column('a'))(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[0.]], self.evaluate(net)) - - def test_column_generator(self): - with tf.Graph().as_default(): - features = features = {'a': [0.], 'b': [1.]} - columns = (tf.feature_column.numeric_column(key) for key in features) - net = df.DenseFeatures(columns)(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[0., 1.]], self.evaluate(net)) - - def test_raises_if_duplicate_name(self): - with self.assertRaisesRegex( - ValueError, 'Duplicate feature column name found for columns'): - df.DenseFeatures(feature_columns=[ - tf.feature_column.numeric_column('a'), - tf.feature_column.numeric_column('a') - ])( - features={ - 'a': [[0]] - }) - - def test_one_column(self): - price = tf.feature_column.numeric_column('price') - with tf.Graph().as_default(): - features = {'price': [[1.], [5.]]} - net = df.DenseFeatures([price])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1.], [5.]], self.evaluate(net)) - - def test_multi_dimension(self): - price = tf.feature_column.numeric_column('price', shape=2) - with tf.Graph().as_default(): - features = {'price': [[1., 2.], [5., 6.]]} - net = df.DenseFeatures([price])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net)) - - def test_compute_output_shape(self): - price1 = tf.feature_column.numeric_column('price1', shape=2) - price2 = tf.feature_column.numeric_column('price2', shape=4) - with tf.Graph().as_default(): - features = { - 'price1': [[1., 2.], [5., 6.]], - 'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]] - } - dense_features = df.DenseFeatures([price1, price2]) - self.assertEqual((None, 6), dense_features.compute_output_shape((None,))) - net = dense_features(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]], - self.evaluate(net)) - - def test_raises_if_shape_mismatch(self): - price = 
tf.feature_column.numeric_column('price', shape=2) - with tf.Graph().as_default(): - features = {'price': [[1.], [5.]]} - with self.assertRaisesRegex( - Exception, - r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'): - df.DenseFeatures([price])(features) - - def test_reshaping(self): - price = tf.feature_column.numeric_column('price', shape=[1, 2]) - with tf.Graph().as_default(): - features = {'price': [[[1., 2.]], [[5., 6.]]]} - net = df.DenseFeatures([price])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net)) - - def test_multi_column(self): - price1 = tf.feature_column.numeric_column('price1', shape=2) - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} - net = df.DenseFeatures([price1, price2])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net)) - - def test_cols_to_output_tensors(self): - price1 = tf.feature_column.numeric_column('price1', shape=2) - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - cols_dict = {} - features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} - dense_features = df.DenseFeatures([price1, price2]) - net = dense_features(features, cols_dict) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2.], [5., 6.]], - self.evaluate(cols_dict[price1])) - self.assertAllClose([[3.], [4.]], self.evaluate(cols_dict[price2])) - self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net)) - - def test_column_order(self): - price_a = tf.feature_column.numeric_column('price_a') - price_b = tf.feature_column.numeric_column('price_b') - with tf.Graph().as_default(): - features = { - 'price_a': [[1.]], - 'price_b': [[3.]], - } - net1 = df.DenseFeatures([price_a, price_b])(features) - net2 = df.DenseFeatures([price_b, price_a])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 3.]], self.evaluate(net1)) - self.assertAllClose([[1., 3.]], self.evaluate(net2)) - - def test_fails_for_categorical_column(self): - animal = tf.feature_column.categorical_column_with_identity( - 'animal', num_buckets=4) - with tf.Graph().as_default(): - features = { - 'animal': - tf.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } - with self.assertRaisesRegex(Exception, 'must be a .*DenseColumn'): - df.DenseFeatures([animal])(features) - - def test_static_batch_size_mismatch(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = { - 'price1': [[1.], [5.], [7.]], # batchsize = 3 - 'price2': [[3.], [4.]] # batchsize = 2 - } - with self.assertRaisesRegex( - ValueError, - r'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string - df.DenseFeatures([price1, price2])(features) - - def test_subset_of_static_batch_size_mismatch(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - price3 = 
tf.feature_column.numeric_column('price3') - with tf.Graph().as_default(): - features = { - 'price1': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 3 - 'price2': [[3.], [4.]], # batchsize = 2 - 'price3': [[3.], [4.], [5.]] # batchsize = 3 - } - with self.assertRaisesRegex( - ValueError, - r'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string - df.DenseFeatures([price1, price2, price3])(features) - - def test_runtime_batch_size_mismatch(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = { - 'price1': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 3 - 'price2': [[3.], [4.]] # batchsize = 2 - } - net = df.DenseFeatures([price1, price2])(features) - with _initialized_session() as sess: - with self.assertRaisesRegex(tf.errors.OpError, - 'Dimension 0 in both shapes must be equal|' - 'Dimensions of inputs should match'): - sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]}) - - def test_runtime_batch_size_matches(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = { - 'price1': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 2 - 'price2': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 2 - } - net = df.DenseFeatures([price1, price2])(features) - with _initialized_session() as sess: - sess.run( - net, - feed_dict={ - features['price1']: [[1.], [5.]], - features['price2']: [[1.], [5.]], - }) - - def test_multiple_layers_with_same_embedding_column(self): - some_sparse_column = tf.feature_column.categorical_column_with_hash_bucket( - 'sparse_feature', hash_bucket_size=5) - some_embedding_column = tf.feature_column.embedding_column( - some_sparse_column, dimension=10) - - with tf.Graph().as_default(): - features = { - 'sparse_feature': [['a'], ['x']], - } - all_cols = [some_embedding_column] - df.DenseFeatures(all_cols)(features) - df.DenseFeatures(all_cols)(features) - # Make sure that 2 variables get created in this case. - self.assertEqual( - 2, - len( - tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - expected_var_names = [ - 'dense_features/sparse_feature_embedding/embedding_weights:0', - 'dense_features_1/sparse_feature_embedding/embedding_weights:0' - ] - self.assertCountEqual(expected_var_names, [ - v.name for v in tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - ]) - - @tf_test_utils.run_deprecated_v1 - def test_multiple_layers_with_same_shared_embedding_column(self): - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=3) - categorical_column_b = tf.feature_column.categorical_column_with_identity( - key='bbb', num_buckets=3) - embedding_dimension = 2 - embedding_column_b, embedding_column_a = tf.feature_column.shared_embeddings( - [categorical_column_b, categorical_column_a], - dimension=embedding_dimension) - - with tf.Graph().as_default(): - features = { - 'aaa': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 1, 0), - dense_shape=(2, 2)), - 'bbb': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 1), - dense_shape=(2, 2)), - } - all_cols = [embedding_column_a, embedding_column_b] - df.DenseFeatures(all_cols)(features) - df.DenseFeatures(all_cols)(features) - # Make sure that only 1 variable gets created in this case. 
- self.assertEqual( - 1, - len( - tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - self.assertCountEqual(['aaa_bbb_shared_embedding:0'], [ - v.name for v in tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - ]) - - @tf_test_utils.run_deprecated_v1 - def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self): - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=3) - categorical_column_b = tf.feature_column.categorical_column_with_identity( - key='bbb', num_buckets=3) - embedding_dimension = 2 - embedding_column_b, embedding_column_a = tf.feature_column.shared_embeddings( - [categorical_column_b, categorical_column_a], - dimension=embedding_dimension) - all_cols = [embedding_column_a, embedding_column_b] - - with tf.Graph().as_default(): - features = { - 'aaa': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 1, 0), - dense_shape=(2, 2)), - 'bbb': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 1), - dense_shape=(2, 2)), - } - df.DenseFeatures(all_cols)(features) - # Make sure that only 1 variable gets created in this case. - self.assertEqual( - 1, - len( - tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - - with tf.Graph().as_default(): - features1 = { - 'aaa': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 1, 0), - dense_shape=(2, 2)), - 'bbb': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 1), - dense_shape=(2, 2)), - } - - df.DenseFeatures(all_cols)(features1) - # Make sure that only 1 variable gets created in this case. - self.assertEqual( - 1, - len( - tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - self.assertCountEqual(['aaa_bbb_shared_embedding:0'], [ - v.name for v in tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - ]) - - @tf_test_utils.run_deprecated_v1 - def test_with_1d_sparse_tensor(self): - embedding_values = ( - (1., 2., 3., 4., 5.), # id 0 - (6., 7., 8., 9., 10.), # id 1 - (11., 12., 13., 14., 15.) # id 2 + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) ) - - def _initializer(shape, dtype, partition_info=None): - del shape, dtype, partition_info - return embedding_values - - # price has 1 dimension in dense_features - price = tf.feature_column.numeric_column('price') - - # one_hot_body_style has 3 dims in dense_features. - body_style = tf.feature_column.categorical_column_with_vocabulary_list( - 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) - one_hot_body_style = tf.feature_column.indicator_column(body_style) - - # embedded_body_style has 5 dims in dense_features. - country = tf.feature_column.categorical_column_with_vocabulary_list( - 'country', vocabulary_list=['US', 'JP', 'CA']) - embedded_country = tf.feature_column.embedding_column( - country, dimension=5, initializer=_initializer) - - # Provides 1-dim tensor and dense tensor. 
- features = { - 'price': - tf.constant([ - 11., - 12., - ]), - 'body-style': - tf.SparseTensor( + def test_retrieving_input(self): + features = {"a": [0.0]} + dense_features = df.DenseFeatures(tf.feature_column.numeric_column("a")) + inputs = self.evaluate(dense_features(features)) + self.assertAllClose([[0.0]], inputs) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_reuses_variables(self): + sparse_input = tf.SparseTensor( + indices=((0, 0), (1, 0), (2, 0)), + values=(0, 1, 2), + dense_shape=(3, 3), + ) + + # Create feature columns (categorical and embedding). + categorical_column = tf.feature_column.categorical_column_with_identity( + key="a", num_buckets=3 + ) + embedding_dimension = 2 + + def _embedding_column_initializer(shape, dtype, partition_info=None): + del shape # unused + del dtype # unused + del partition_info # unused + embedding_values = ((1, 0), (0, 1), (1, 1)) # id 0 # id 1 # id 2 + return embedding_values + + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_embedding_column_initializer, + ) + + dense_features = df.DenseFeatures([embedding_column]) + features = {"a": sparse_input} + + inputs = dense_features(features) + variables = dense_features.variables + + # Sanity check: test that the inputs are correct. + self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs) + + # Check that only one variable was created. + self.assertEqual(1, len(variables)) + + # Check that invoking dense_features on the same features does not + # create additional variables + _ = dense_features(features) + self.assertEqual(1, len(variables)) + self.assertIs(variables[0], dense_features.variables[0]) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_dense_feature_with_partitioner(self): + sparse_input = tf.SparseTensor( + indices=((0, 0), (1, 0), (2, 0), (3, 0)), + values=(0, 1, 3, 2), + dense_shape=(4, 4), + ) + + # Create feature columns (categorical and embedding). + categorical_column = tf.feature_column.categorical_column_with_identity( + key="a", num_buckets=4 + ) + embedding_dimension = 2 + + def _embedding_column_initializer(shape, dtype, partition_info=None): + offset = partition_info._var_offset[0] + del shape # unused + del dtype # unused + if offset == 0: + embedding_values = ((1, 0), (0, 1)) # id 0 # id 1 + else: + embedding_values = ((1, 1), (2, 2)) # id 2 # id 3 + return embedding_values + + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_embedding_column_initializer, + ) + + dense_features = df.DenseFeatures( + [embedding_column], + partitioner=tf.compat.v1.fixed_size_partitioner(2), + ) + features = {"a": sparse_input} + + inputs = dense_features(features) + variables = dense_features.variables + + # Sanity check: test that the inputs are correct. + self.assertAllEqual([[1, 0], [0, 1], [2, 2], [1, 1]], inputs) + + # Check that only one variable was created. 
+ self.assertEqual(2, len(variables)) + + # Check that invoking dense_features on the same features does not + # create additional variables + _ = dense_features(features) + self.assertEqual(2, len(variables)) + self.assertIs(variables[0], dense_features.variables[0]) + self.assertIs(variables[1], dense_features.variables[1]) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_feature_column_dense_features_gradient(self): + sparse_input = tf.SparseTensor( + indices=((0, 0), (1, 0), (2, 0)), + values=(0, 1, 2), + dense_shape=(3, 3), + ) + + # Create feature columns (categorical and embedding). + categorical_column = tf.feature_column.categorical_column_with_identity( + key="a", num_buckets=3 + ) + embedding_dimension = 2 + + def _embedding_column_initializer(shape, dtype, partition_info=None): + del shape # unused + del dtype # unused + del partition_info # unused + embedding_values = ((1, 0), (0, 1), (1, 1)) # id 0 # id 1 # id 2 + return embedding_values + + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_embedding_column_initializer, + ) + + dense_features = df.DenseFeatures([embedding_column]) + features = {"a": sparse_input} + + def scale_matrix(): + matrix = dense_features(features) + return 2 * matrix + + # Sanity check: Verify that scale_matrix returns the correct output. + self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix()) + + # Check that the returned gradient is correct. + grad_function = backprop.implicit_grad(scale_matrix) + grads_and_vars = grad_function() + indexed_slice = grads_and_vars[0][0] + gradient = grads_and_vars[0][0].values + + self.assertAllEqual([0, 1, 2], indexed_slice.indices) + self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient) + + def test_raises_if_empty_feature_columns(self): + with self.assertRaisesRegex( + ValueError, "feature_columns must not be empty" + ): + df.DenseFeatures(feature_columns=[])(features={}) + + def test_should_be_dense_column(self): + with self.assertRaisesRegex(ValueError, "must be a .*DenseColumn"): + df.DenseFeatures( + feature_columns=[ + tf.feature_column.categorical_column_with_hash_bucket( + "wire_cast", 4 + ) + ] + )(features={"a": [[0]]}) + + def test_does_not_support_dict_columns(self): + with self.assertRaisesRegex( + ValueError, "Expected feature_columns to be iterable, found dict." 
+ ): + df.DenseFeatures( + feature_columns={"a": tf.feature_column.numeric_column("a")} + )(features={"a": [[0]]}) + + def test_bare_column(self): + with tf.Graph().as_default(): + features = features = {"a": [0.0]} + net = df.DenseFeatures(tf.feature_column.numeric_column("a"))( + features + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[0.0]], self.evaluate(net)) + + def test_column_generator(self): + with tf.Graph().as_default(): + features = features = {"a": [0.0], "b": [1.0]} + columns = ( + tf.feature_column.numeric_column(key) for key in features + ) + net = df.DenseFeatures(columns)(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[0.0, 1.0]], self.evaluate(net)) + + def test_raises_if_duplicate_name(self): + with self.assertRaisesRegex( + ValueError, "Duplicate feature column name found for columns" + ): + df.DenseFeatures( + feature_columns=[ + tf.feature_column.numeric_column("a"), + tf.feature_column.numeric_column("a"), + ] + )(features={"a": [[0]]}) + + def test_one_column(self): + price = tf.feature_column.numeric_column("price") + with tf.Graph().as_default(): + features = {"price": [[1.0], [5.0]]} + net = df.DenseFeatures([price])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0], [5.0]], self.evaluate(net)) + + def test_multi_dimension(self): + price = tf.feature_column.numeric_column("price", shape=2) + with tf.Graph().as_default(): + features = {"price": [[1.0, 2.0], [5.0, 6.0]]} + net = df.DenseFeatures([price])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0, 2.0], [5.0, 6.0]], self.evaluate(net)) + + def test_compute_output_shape(self): + price1 = tf.feature_column.numeric_column("price1", shape=2) + price2 = tf.feature_column.numeric_column("price2", shape=4) + with tf.Graph().as_default(): + features = { + "price1": [[1.0, 2.0], [5.0, 6.0]], + "price2": [[3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 9.0, 10.0]], + } + dense_features = df.DenseFeatures([price1, price2]) + self.assertEqual( + (None, 6), dense_features.compute_output_shape((None,)) + ) + net = dense_features(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose( + [ + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + [5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ], + self.evaluate(net), + ) + + def test_raises_if_shape_mismatch(self): + price = tf.feature_column.numeric_column("price", shape=2) + with tf.Graph().as_default(): + features = {"price": [[1.0], [5.0]]} + with self.assertRaisesRegex( + Exception, + r"Cannot reshape a tensor with 2 elements to shape \[2,2\]", + ): + df.DenseFeatures([price])(features) + + def test_reshaping(self): + price = tf.feature_column.numeric_column("price", shape=[1, 2]) + with tf.Graph().as_default(): + features = {"price": [[[1.0, 2.0]], [[5.0, 6.0]]]} + net = df.DenseFeatures([price])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0, 2.0], [5.0, 6.0]], self.evaluate(net)) + + def test_multi_column(self): + price1 = tf.feature_column.numeric_column("price1", shape=2) + price2 = 
tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": [[1.0, 2.0], [5.0, 6.0]], + "price2": [[3.0], [4.0]], + } + net = df.DenseFeatures([price1, price2])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose( + [[1.0, 2.0, 3.0], [5.0, 6.0, 4.0]], self.evaluate(net) + ) + + def test_cols_to_output_tensors(self): + price1 = tf.feature_column.numeric_column("price1", shape=2) + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + cols_dict = {} + features = { + "price1": [[1.0, 2.0], [5.0, 6.0]], + "price2": [[3.0], [4.0]], + } + dense_features = df.DenseFeatures([price1, price2]) + net = dense_features(features, cols_dict) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose( + [[1.0, 2.0], [5.0, 6.0]], self.evaluate(cols_dict[price1]) + ) + self.assertAllClose( + [[3.0], [4.0]], self.evaluate(cols_dict[price2]) + ) + self.assertAllClose( + [[1.0, 2.0, 3.0], [5.0, 6.0, 4.0]], self.evaluate(net) + ) + + def test_column_order(self): + price_a = tf.feature_column.numeric_column("price_a") + price_b = tf.feature_column.numeric_column("price_b") + with tf.Graph().as_default(): + features = { + "price_a": [[1.0]], + "price_b": [[3.0]], + } + net1 = df.DenseFeatures([price_a, price_b])(features) + net2 = df.DenseFeatures([price_b, price_a])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0, 3.0]], self.evaluate(net1)) + self.assertAllClose([[1.0, 3.0]], self.evaluate(net2)) + + def test_fails_for_categorical_column(self): + animal = tf.feature_column.categorical_column_with_identity( + "animal", num_buckets=4 + ) + with tf.Graph().as_default(): + features = { + "animal": tf.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2] + ) + } + with self.assertRaisesRegex(Exception, "must be a .*DenseColumn"): + df.DenseFeatures([animal])(features) + + def test_static_batch_size_mismatch(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": [[1.0], [5.0], [7.0]], # batchsize = 3 + "price2": [[3.0], [4.0]], # batchsize = 2 + } + with self.assertRaisesRegex( + ValueError, + r"Batch size \(first dimension\) of each feature must be same.", + ): + df.DenseFeatures([price1, price2])(features) + + def test_subset_of_static_batch_size_mismatch(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + price3 = tf.feature_column.numeric_column("price3") + with tf.Graph().as_default(): + features = { + "price1": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 3 + "price2": [[3.0], [4.0]], # batchsize = 2 + "price3": [[3.0], [4.0], [5.0]], # batchsize = 3 + } + with self.assertRaisesRegex( + ValueError, + r"Batch size \(first dimension\) of each feature must be same.", + ): + df.DenseFeatures([price1, price2, price3])(features) + + def test_runtime_batch_size_mismatch(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 3 + "price2": [[3.0], [4.0]], # batchsize = 2 + } 
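Editor's note (outside the diff): because `price1` comes from a placeholder, its batch dimension is unknown statically, so `_verify_static_batch_size_equality` cannot flag the mismatch when the layer is called; the error only surfaces when the concat op actually runs, which is what this test asserts. A condensed sketch of the same failure mode under TF1 graph mode:

```python
import tensorflow.compat.v2 as tf

tf.compat.v1.disable_eager_execution()
a = tf.compat.v1.placeholder(tf.float32, shape=[None, 1])  # unknown batch
b = tf.constant([[3.0], [4.0]])  # static batch size 2
net = tf.concat([a, b], -1)  # static shapes are compatible (None vs 2)
with tf.compat.v1.Session() as sess:
    # Feeding batch size 3 raises the runtime shape error the test expects.
    sess.run(net, feed_dict={a: [[1.0], [5.0], [7.0]]})
```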
+ net = df.DenseFeatures([price1, price2])(features) + with _initialized_session() as sess: + with self.assertRaisesRegex( + tf.errors.OpError, + "Dimension 0 in both shapes must be equal|" + "Dimensions of inputs should match", + ): + sess.run( + net, + feed_dict={features["price1"]: [[1.0], [5.0], [7.0]]}, + ) + + def test_runtime_batch_size_matches(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 2 + "price2": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 2 + } + net = df.DenseFeatures([price1, price2])(features) + with _initialized_session() as sess: + sess.run( + net, + feed_dict={ + features["price1"]: [[1.0], [5.0]], + features["price2"]: [[1.0], [5.0]], + }, + ) + + def test_multiple_layers_with_same_embedding_column(self): + some_sparse_column = ( + tf.feature_column.categorical_column_with_hash_bucket( + "sparse_feature", hash_bucket_size=5 + ) + ) + some_embedding_column = tf.feature_column.embedding_column( + some_sparse_column, dimension=10 + ) + + with tf.Graph().as_default(): + features = { + "sparse_feature": [["a"], ["x"]], + } + all_cols = [some_embedding_column] + df.DenseFeatures(all_cols)(features) + df.DenseFeatures(all_cols)(features) + # Make sure that 2 variables get created in this case. + self.assertEqual( + 2, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + expected_var_names = [ + "dense_features/sparse_feature_embedding/embedding_weights:0", + "dense_features_1/sparse_feature_embedding/embedding_weights:0", + ] + self.assertCountEqual( + expected_var_names, + [ + v.name + for v in tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ], + ) + + @tf_test_utils.run_deprecated_v1 + def test_multiple_layers_with_same_shared_embedding_column(self): + categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=3 + ) + ) + categorical_column_b = ( + tf.feature_column.categorical_column_with_identity( + key="bbb", num_buckets=3 + ) + ) + embedding_dimension = 2 + ( + embedding_column_b, + embedding_column_a, + ) = tf.feature_column.shared_embeddings( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension, + ) + + with tf.Graph().as_default(): + features = { + "aaa": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2), + ), + "bbb": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2), + ), + } + all_cols = [embedding_column_a, embedding_column_b] + df.DenseFeatures(all_cols)(features) + df.DenseFeatures(all_cols)(features) + # Make sure that only 1 variable gets created in this case. 
+ self.assertEqual( + 1, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + self.assertCountEqual( + ["aaa_bbb_shared_embedding:0"], + [ + v.name + for v in tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ], + ) + + @tf_test_utils.run_deprecated_v1 + def test_multiple_layers_with_same_shared_embedding_column_diff_graphs( + self, + ): + categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=3 + ) + ) + categorical_column_b = ( + tf.feature_column.categorical_column_with_identity( + key="bbb", num_buckets=3 + ) + ) + embedding_dimension = 2 + ( + embedding_column_b, + embedding_column_a, + ) = tf.feature_column.shared_embeddings( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension, + ) + all_cols = [embedding_column_a, embedding_column_b] + + with tf.Graph().as_default(): + features = { + "aaa": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2), + ), + "bbb": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2), + ), + } + df.DenseFeatures(all_cols)(features) + # Make sure that only 1 variable gets created in this case. + self.assertEqual( + 1, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + + with tf.Graph().as_default(): + features1 = { + "aaa": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2), + ), + "bbb": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2), + ), + } + + df.DenseFeatures(all_cols)(features1) + # Make sure that only 1 variable gets created in this case. + self.assertEqual( + 1, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + self.assertCountEqual( + ["aaa_bbb_shared_embedding:0"], + [ + v.name + for v in tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ], + ) + + @tf_test_utils.run_deprecated_v1 + def test_with_1d_sparse_tensor(self): + embedding_values = ( + (1.0, 2.0, 3.0, 4.0, 5.0), # id 0 + (6.0, 7.0, 8.0, 9.0, 10.0), # id 1 + (11.0, 12.0, 13.0, 14.0, 15.0), # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + del shape, dtype, partition_info + return embedding_values + + # price has 1 dimension in dense_features + price = tf.feature_column.numeric_column("price") + + # one_hot_body_style has 3 dims in dense_features. + body_style = tf.feature_column.categorical_column_with_vocabulary_list( + "body-style", vocabulary_list=["hardtop", "wagon", "sedan"] + ) + one_hot_body_style = tf.feature_column.indicator_column(body_style) + + # embedded_body_style has 5 dims in dense_features. + country = tf.feature_column.categorical_column_with_vocabulary_list( + "country", vocabulary_list=["US", "JP", "CA"] + ) + embedded_country = tf.feature_column.embedding_column( + country, dimension=5, initializer=_initializer + ) + + # Provides 1-dim tensor and dense tensor. + features = { + "price": tf.constant( + [ + 11.0, + 12.0, + ] + ), + "body-style": tf.SparseTensor( indices=((0,), (1,)), - values=('sedan', 'hardtop'), - dense_shape=(2,)), - # This is dense tensor for the categorical_column. 
- 'country': - tf.constant(['CA', 'US']), - } - self.assertEqual(1, features['price'].shape.ndims) - self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0]) - self.assertEqual(1, features['country'].shape.ndims) - - net = df.DenseFeatures([price, one_hot_body_style, embedded_country])( - features) - self.assertEqual(1 + 3 + 5, net.shape[1]) - with _initialized_session() as sess: - - # Each row is formed by concatenating `embedded_body_style`, - # `one_hot_body_style`, and `price` in order. - self.assertAllEqual([[0., 0., 1., 11., 12., 13., 14., 15., 11.], - [1., 0., 0., 1., 2., 3., 4., 5., 12.]], - sess.run(net)) - - @tf_test_utils.run_deprecated_v1 - def test_with_1d_unknown_shape_sparse_tensor(self): - embedding_values = ( - (1., 2.), # id 0 - (6., 7.), # id 1 - (11., 12.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - del shape, dtype, partition_info - return embedding_values - - # price has 1 dimension in dense_features - price = tf.feature_column.numeric_column('price') - - # one_hot_body_style has 3 dims in dense_features. - body_style = tf.feature_column.categorical_column_with_vocabulary_list( - 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) - one_hot_body_style = tf.feature_column.indicator_column(body_style) - - # embedded_body_style has 5 dims in dense_features. - country = tf.feature_column.categorical_column_with_vocabulary_list( - 'country', vocabulary_list=['US', 'JP', 'CA']) - embedded_country = tf.feature_column.embedding_column( - country, dimension=2, initializer=_initializer) - - # Provides 1-dim tensor and dense tensor. - features = { - 'price': tf.compat.v1.placeholder(tf.float32), - 'body-style': tf.compat.v1.sparse_placeholder(tf.string), - # This is dense tensor for the categorical_column. - 'country': tf.compat.v1.placeholder(tf.string), - } - self.assertIsNone(features['price'].shape.ndims) - self.assertIsNone(features['body-style'].get_shape().ndims) - self.assertIsNone(features['country'].shape.ndims) - - price_data = np.array([11., 12.]) - body_style_data = tf.compat.v1.SparseTensorValue( - indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,)) - country_data = np.array([['US'], ['CA']]) - - net = df.DenseFeatures([price, one_hot_body_style, embedded_country])( - features) - self.assertEqual(1 + 3 + 2, net.shape[1]) - with _initialized_session() as sess: - - # Each row is formed by concatenating `embedded_body_style`, - # `one_hot_body_style`, and `price` in order. 
- self.assertAllEqual( - [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]], - sess.run( - net, - feed_dict={ - features['price']: price_data, - features['body-style']: body_style_data, - features['country']: country_data - })) - - @tf_test_utils.run_deprecated_v1 - def test_with_rank_0_feature(self): - # price has 1 dimension in dense_features - price = tf.feature_column.numeric_column('price') - features = { - 'price': tf.constant(0), - } - self.assertEqual(0, features['price'].shape.ndims) - - # Static rank 0 should fail - with self.assertRaisesRegex(ValueError, 'Feature .* cannot have rank 0'): - df.DenseFeatures([price])(features) - - # Dynamic rank 0 should fail - features = { - 'price': tf.compat.v1.placeholder(tf.float32), - } - net = df.DenseFeatures([price])(features) - self.assertEqual(1, net.shape[1]) - with _initialized_session() as sess: - with self.assertRaisesOpError('Feature .* cannot have rank 0'): - sess.run(net, feed_dict={features['price']: np.array(1)}) + values=("sedan", "hardtop"), + dense_shape=(2,), + ), + # This is dense tensor for the categorical_column. + "country": tf.constant(["CA", "US"]), + } + self.assertEqual(1, features["price"].shape.ndims) + self.assertEqual(1, features["body-style"].dense_shape.get_shape()[0]) + self.assertEqual(1, features["country"].shape.ndims) + + net = df.DenseFeatures([price, one_hot_body_style, embedded_country])( + features + ) + self.assertEqual(1 + 3 + 5, net.shape[1]) + with _initialized_session() as sess: + + # Each row is formed by concatenating `embedded_body_style`, + # `one_hot_body_style`, and `price` in order. + self.assertAllEqual( + [ + [0.0, 0.0, 1.0, 11.0, 12.0, 13.0, 14.0, 15.0, 11.0], + [1.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 12.0], + ], + sess.run(net), + ) + + @tf_test_utils.run_deprecated_v1 + def test_with_1d_unknown_shape_sparse_tensor(self): + embedding_values = ( + (1.0, 2.0), # id 0 + (6.0, 7.0), # id 1 + (11.0, 12.0), # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + del shape, dtype, partition_info + return embedding_values + + # price has 1 dimension in dense_features + price = tf.feature_column.numeric_column("price") + + # one_hot_body_style has 3 dims in dense_features. + body_style = tf.feature_column.categorical_column_with_vocabulary_list( + "body-style", vocabulary_list=["hardtop", "wagon", "sedan"] + ) + one_hot_body_style = tf.feature_column.indicator_column(body_style) + + # embedded_body_style has 5 dims in dense_features. + country = tf.feature_column.categorical_column_with_vocabulary_list( + "country", vocabulary_list=["US", "JP", "CA"] + ) + embedded_country = tf.feature_column.embedding_column( + country, dimension=2, initializer=_initializer + ) + + # Provides 1-dim tensor and dense tensor. + features = { + "price": tf.compat.v1.placeholder(tf.float32), + "body-style": tf.compat.v1.sparse_placeholder(tf.string), + # This is dense tensor for the categorical_column. 
+ "country": tf.compat.v1.placeholder(tf.string), + } + self.assertIsNone(features["price"].shape.ndims) + self.assertIsNone(features["body-style"].get_shape().ndims) + self.assertIsNone(features["country"].shape.ndims) + + price_data = np.array([11.0, 12.0]) + body_style_data = tf.compat.v1.SparseTensorValue( + indices=((0,), (1,)), values=("sedan", "hardtop"), dense_shape=(2,) + ) + country_data = np.array([["US"], ["CA"]]) + + net = df.DenseFeatures([price, one_hot_body_style, embedded_country])( + features + ) + self.assertEqual(1 + 3 + 2, net.shape[1]) + with _initialized_session() as sess: + + # Each row is formed by concatenating `embedded_body_style`, + # `one_hot_body_style`, and `price` in order. + self.assertAllEqual( + [ + [0.0, 0.0, 1.0, 1.0, 2.0, 11.0], + [1.0, 0.0, 0.0, 11.0, 12.0, 12.0], + ], + sess.run( + net, + feed_dict={ + features["price"]: price_data, + features["body-style"]: body_style_data, + features["country"]: country_data, + }, + ), + ) + + @tf_test_utils.run_deprecated_v1 + def test_with_rank_0_feature(self): + # price has 1 dimension in dense_features + price = tf.feature_column.numeric_column("price") + features = { + "price": tf.constant(0), + } + self.assertEqual(0, features["price"].shape.ndims) + + # Static rank 0 should fail + with self.assertRaisesRegex( + ValueError, "Feature .* cannot have rank 0" + ): + df.DenseFeatures([price])(features) + + # Dynamic rank 0 should fail + features = { + "price": tf.compat.v1.placeholder(tf.float32), + } + net = df.DenseFeatures([price])(features) + self.assertEqual(1, net.shape[1]) + with _initialized_session() as sess: + with self.assertRaisesOpError("Feature .* cannot have rank 0"): + sess.run(net, feed_dict={features["price"]: np.array(1)}) class IndicatorColumnTest(tf.test.TestCase): + @tf_test_utils.run_deprecated_v1 + def test_dense_features(self): + animal = tf.feature_column.indicator_column( + tf.feature_column.categorical_column_with_identity( + "animal", num_buckets=4 + ) + ) + with tf.Graph().as_default(): + features = { + "animal": tf.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2] + ) + } + net = df.DenseFeatures([animal])(features) - @tf_test_utils.run_deprecated_v1 - def test_dense_features(self): - animal = tf.feature_column.indicator_column( - tf.feature_column.categorical_column_with_identity( - 'animal', num_buckets=4)) - with tf.Graph().as_default(): - features = { - 'animal': - tf.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } - net = df.DenseFeatures([animal])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) - self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) + self.assertAllClose([[0.0, 1.0, 1.0, 0.0]], self.evaluate(net)) class EmbeddingColumnTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - { - 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True, - 'partition_variables': False, - }, { - 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False, - 'partition_variables': False, - }, { - 'testcase_name': 'use_safe_embedding_lookup_partitioned', - 'use_safe_embedding_lookup': True, - 'partition_variables': True, - }, { - 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', - 'use_safe_embedding_lookup': False, - 'partition_variables': 
True, - }) - @tf_test_utils.run_deprecated_v1 - def test_dense_features(self, use_safe_embedding_lookup, partition_variables): - # Inputs. - vocabulary_size = 4 - sparse_input = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.), # id 2 - (9., 13.) # id 3 - ) - - def _initializer(shape, dtype, partition_info=None): - if partition_variables: - self.assertEqual([vocabulary_size, embedding_dimension], - partition_info.full_shape) - self.assertAllEqual((2, embedding_dimension), shape) - else: - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertIsNone(partition_info) - - self.assertEqual(tf.float32, dtype) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), + @parameterized.named_parameters( + { + "testcase_name": "use_safe_embedding_lookup", + "use_safe_embedding_lookup": True, + "partition_variables": False, + }, + { + "testcase_name": "dont_use_safe_embedding_lookup", + "use_safe_embedding_lookup": False, + "partition_variables": False, + }, + { + "testcase_name": "use_safe_embedding_lookup_partitioned", + "use_safe_embedding_lookup": True, + "partition_variables": True, + }, + { + "testcase_name": "dont_use_safe_embedding_lookup_partitioned", + "use_safe_embedding_lookup": False, + "partition_variables": True, + }, ) - - # Build columns. - categorical_column = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - partitioner = None - if partition_variables: - partitioner = tf.compat.v1.fixed_size_partitioner(2, axis=0) - with tf.compat.v1.variable_scope('vars', partitioner=partitioner): - embedding_column = tf.feature_column.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) - - # Provide sparse input and get dense result. - l = df.DenseFeatures((embedding_column,)) - dense_features = l({'aaa': sparse_input}) - - # Assert expected embedding variable and lookups. 
- global_vars = tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - if partition_variables: - self.assertCountEqual( - ('vars/dense_features/aaa_embedding/embedding_weights/part_0:0', - 'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'), - tuple([v.name for v in global_vars])) - else: - self.assertCountEqual( - ('vars/dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, tf.Variable) - trainable_vars = tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES) - if partition_variables: - self.assertCountEqual( - ('vars/dense_features/aaa_embedding/embedding_weights/part_0:0', - 'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'), - tuple([v.name for v in trainable_vars])) - else: - self.assertCountEqual( - ('vars/dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - if use_safe_embedding_lookup: - self.assertIn( - 'SparseFillEmptyRows', - [x.type for x in tf.compat.v1.get_default_graph().get_operations()]) - else: - self.assertNotIn( - 'SparseFillEmptyRows', - [x.type for x in tf.compat.v1.get_default_graph().get_operations()]) - - @tf_test_utils.run_deprecated_v1 - def test_dense_features_not_trainable(self): - # Inputs. - vocabulary_size = 3 - sparse_input = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(tf.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = tf.feature_column.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - trainable=False) - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures((embedding_column,))({ - 'aaa': sparse_input - }) - - # Assert expected embedding variable and lookups. 
- global_vars = tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - self.assertCountEqual([], - tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + @tf_test_utils.run_deprecated_v1 + def test_dense_features( + self, use_safe_embedding_lookup, partition_variables + ): + # Inputs. + vocabulary_size = 4 + sparse_input = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5), + ) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1.0, 2.0), # id 0 + (3.0, 5.0), # id 1 + (7.0, 11.0), # id 2 + (9.0, 13.0), # id 3 + ) + + def _initializer(shape, dtype, partition_info=None): + if partition_variables: + self.assertEqual( + [vocabulary_size, embedding_dimension], + partition_info.full_shape, + ) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual( + (vocabulary_size, embedding_dimension), shape + ) + self.assertIsNone(partition_info) + + self.assertEqual(tf.float32, dtype) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7.0, 11.0), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, + # 3.5] + (2.0, 3.5), + # example 2, ids [], embedding = [0, 0] + (0.0, 0.0), + # example 3, ids [1], embedding = [3, 5] + (3.0, 5.0), + ) + + # Build columns. + categorical_column = tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + partitioner = None + if partition_variables: + partitioner = tf.compat.v1.fixed_size_partitioner(2, axis=0) + with tf.compat.v1.variable_scope("vars", partitioner=partitioner): + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup, + ) + + # Provide sparse input and get dense result. + l = df.DenseFeatures((embedding_column,)) + dense_features = l({"aaa": sparse_input}) + + # Assert expected embedding variable and lookups. 
+ global_vars = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + if partition_variables: + self.assertCountEqual( + ( + "vars/dense_features/aaa_embedding/embedding_weights/" + "part_0:0", + "vars/dense_features/aaa_embedding/embedding_weights/" + "part_1:0", + ), + tuple([v.name for v in global_vars]), + ) + else: + self.assertCountEqual( + ("vars/dense_features/aaa_embedding/embedding_weights:0",), + tuple([v.name for v in global_vars]), + ) + for v in global_vars: + self.assertIsInstance(v, tf.Variable) + trainable_vars = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ) + if partition_variables: + self.assertCountEqual( + ( + "vars/dense_features/aaa_embedding/embedding_weights/" + "part_0:0", + "vars/dense_features/aaa_embedding/embedding_weights/" + "part_1:0", + ), + tuple([v.name for v in trainable_vars]), + ) + else: + self.assertCountEqual( + ("vars/dense_features/aaa_embedding/embedding_weights:0",), + tuple([v.name for v in trainable_vars]), + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + if use_safe_embedding_lookup: + self.assertIn( + "SparseFillEmptyRows", + [ + x.type + for x in tf.compat.v1.get_default_graph().get_operations() + ], + ) + else: + self.assertNotIn( + "SparseFillEmptyRows", + [ + x.type + for x in tf.compat.v1.get_default_graph().get_operations() + ], + ) + + @tf_test_utils.run_deprecated_v1 + def test_dense_features_not_trainable(self): + # Inputs. + vocabulary_size = 3 + sparse_input = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5), + ) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1.0, 2.0), # id 0 + (3.0, 5.0), # id 1 + (7.0, 11.0), # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(tf.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7.0, 11.0), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, + # 3.5] + (2.0, 3.5), + # example 2, ids [], embedding = [0, 0] + (0.0, 0.0), + # example 3, ids [1], embedding = [3, 5] + (3.0, 5.0), + ) + + # Build columns. + categorical_column = tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + trainable=False, + ) + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures((embedding_column,))( + {"aaa": sparse_input} + ) + + # Assert expected embedding variable and lookups. 
+ global_vars = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + self.assertCountEqual( + ("dense_features/aaa_embedding/embedding_weights:0",), + tuple([v.name for v in global_vars]), + ) + self.assertCountEqual( + [], + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ), + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) class SharedEmbeddingColumnTest(tf.test.TestCase, parameterized.TestCase): - - def _test_dense_features(self, trainable=True): - # Inputs. - vocabulary_size = 3 - sparse_input_a = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 4)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_b = tf.compat.v1.SparseTensorValue( - # example 0, ids [0] - # example 1, ids [] - indices=((0, 0),), - values=(0,), - dense_shape=(2, 5)) - sparse_input_c = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 1), (1, 1), (1, 3)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_d = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [] - indices=((0, 1),), - values=(2,), - dense_shape=(2, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 + def _test_dense_features(self, trainable=True): + # Inputs. + vocabulary_size = 3 + sparse_input_a = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 4)), + values=(2, 0, 1), + dense_shape=(2, 5), + ) + sparse_input_b = tf.compat.v1.SparseTensorValue( + # example 0, ids [0] + # example 1, ids [] + indices=((0, 0),), + values=(0,), + dense_shape=(2, 5), + ) + sparse_input_c = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 1), (1, 1), (1, 3)), + values=(2, 0, 1), + dense_shape=(2, 5), + ) + sparse_input_d = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [] + indices=((0, 1),), + values=(2,), + dense_shape=(2, 5), + ) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1.0, 2.0), # id 0 + (3.0, 5.0), # id 1 + (7.0, 11.0), # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(tf.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0: + # A ids [2], embedding = [7, 11] + # B ids [0], embedding = [1, 2] + # C ids [2], embedding = [7, 11] + # D ids [2], embedding = [7, 11] + (7.0, 11.0, 1.0, 2.0, 7.0, 11.0, 7.0, 11.0), + # example 1: + # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # B ids [], embedding = [0, 0] + # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # D ids [], embedding = [0, 0] + (2.0, 3.5, 0.0, 0.0, 2.0, 3.5, 0.0, 0.0), + ) + + # Build columns. 
+ categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + ) + categorical_column_b = ( + tf.feature_column.categorical_column_with_identity( + key="bbb", num_buckets=vocabulary_size + ) + ) + categorical_column_c = ( + tf.feature_column.categorical_column_with_identity( + key="ccc", num_buckets=vocabulary_size + ) + ) + categorical_column_d = ( + tf.feature_column.categorical_column_with_identity( + key="ddd", num_buckets=vocabulary_size + ) + ) + + ( + embedding_column_a, + embedding_column_b, + ) = tf.feature_column.shared_embeddings( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable, + ) + ( + embedding_column_c, + embedding_column_d, + ) = tf.feature_column.shared_embeddings( + [categorical_column_c, categorical_column_d], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable, + ) + + features = { + "aaa": sparse_input_a, + "bbb": sparse_input_b, + "ccc": sparse_input_c, + "ddd": sparse_input_d, + } + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures( + feature_columns=( + embedding_column_b, + embedding_column_a, + embedding_column_c, + embedding_column_d, + ) + )(features) + + # Assert expected embedding variable and lookups. + global_vars = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + self.assertCountEqual( + ["aaa_bbb_shared_embedding:0", "ccc_ddd_shared_embedding:0"], + tuple([v.name for v in global_vars]), + ) + for v in global_vars: + self.assertIsInstance(v, tf.Variable) + trainable_vars = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ) + if trainable: + self.assertCountEqual( + ["aaa_bbb_shared_embedding:0", "ccc_ddd_shared_embedding:0"], + tuple([v.name for v in trainable_vars]), + ) + else: + self.assertCountEqual([], tuple([v.name for v in trainable_vars])) + shared_embedding_vars = global_vars + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllEqual( + embedding_values, self.evaluate(shared_embedding_vars[0]) + ) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + @tf_test_utils.run_deprecated_v1 + def test_dense_features(self): + self._test_dense_features() + + @tf_test_utils.run_deprecated_v1 + def test_dense_features_no_trainable(self): + self._test_dense_features(trainable=False) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class DenseFeaturesSerializationTest(tf.test.TestCase, parameterized.TestCase): + @parameterized.named_parameters( + ("trainable", True, "trainable"), ("not_trainable", False, "frozen") ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(tf.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. 
- expected_lookups = ( - # example 0: - # A ids [2], embedding = [7, 11] - # B ids [0], embedding = [1, 2] - # C ids [2], embedding = [7, 11] - # D ids [2], embedding = [7, 11] - (7., 11., 1., 2., 7., 11., 7., 11.), - # example 1: - # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # B ids [], embedding = [0, 0] - # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # D ids [], embedding = [0, 0] - (2., 3.5, 0., 0., 2., 3.5, 0., 0.), + def test_get_config(self, trainable, name): + cols = [ + tf.feature_column.numeric_column("a"), + tf.feature_column.embedding_column( + tf.feature_column.categorical_column_with_identity( + key="b", num_buckets=3 + ), + dimension=2, + ), + ] + orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + self.assertEqual(config["name"], orig_layer.name) + self.assertEqual(config["trainable"], trainable) + self.assertLen(config["feature_columns"], 2) + self.assertEqual( + config["feature_columns"][0]["class_name"], "NumericColumn" + ) + self.assertEqual(config["feature_columns"][0]["config"]["shape"], (1,)) + self.assertEqual( + config["feature_columns"][1]["class_name"], "EmbeddingColumn" + ) + + @parameterized.named_parameters( + ("trainable", True, "trainable"), ("not_trainable", False, "frozen") ) - - # Build columns. - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = tf.feature_column.categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - categorical_column_c = tf.feature_column.categorical_column_with_identity( - key='ccc', num_buckets=vocabulary_size) - categorical_column_d = tf.feature_column.categorical_column_with_identity( - key='ddd', num_buckets=vocabulary_size) - - embedding_column_a, embedding_column_b = tf.feature_column.shared_embeddings( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - embedding_column_c, embedding_column_d = tf.feature_column.shared_embeddings( - [categorical_column_c, categorical_column_d], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - - features = { - 'aaa': sparse_input_a, - 'bbb': sparse_input_b, - 'ccc': sparse_input_c, - 'ddd': sparse_input_d - } - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures( - feature_columns=(embedding_column_b, embedding_column_a, - embedding_column_c, embedding_column_d))( - features) - - # Assert expected embedding variable and lookups. 
- global_vars = tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, tf.Variable) - trainable_vars = tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES) - if trainable: - self.assertCountEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in trainable_vars])) - else: - self.assertCountEqual([], tuple([v.name for v in trainable_vars])) - shared_embedding_vars = global_vars - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllEqual(embedding_values, - self.evaluate(shared_embedding_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - @tf_test_utils.run_deprecated_v1 - def test_dense_features(self): - self._test_dense_features() - - @tf_test_utils.run_deprecated_v1 - def test_dense_features_no_trainable(self): - self._test_dense_features(trainable=False) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class DenseFeaturesSerializationTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters(('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_get_config(self, trainable, name): - cols = [ - tf.feature_column.numeric_column('a'), - tf.feature_column.embedding_column( - tf.feature_column.categorical_column_with_identity( - key='b', num_buckets=3), - dimension=2) - ] - orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - self.assertEqual(config['name'], orig_layer.name) - self.assertEqual(config['trainable'], trainable) - self.assertLen(config['feature_columns'], 2) - self.assertEqual(config['feature_columns'][0]['class_name'], - 'NumericColumn') - self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) - self.assertEqual(config['feature_columns'][1]['class_name'], - 'EmbeddingColumn') - - @parameterized.named_parameters(('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_from_config(self, trainable, name): - cols = [ - tf.feature_column.numeric_column('a'), - tf.feature_column.embedding_column( - tf.feature_column.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']), - dimension=2), - tf.feature_column.indicator_column( - tf.feature_column.categorical_column_with_hash_bucket( - key='c', hash_bucket_size=3)) - ] - orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - new_layer = df.DenseFeatures.from_config(config) - - self.assertEqual(new_layer.name, orig_layer.name) - self.assertEqual(new_layer.trainable, trainable) - self.assertLen(new_layer._feature_columns, 3) - self.assertEqual(new_layer._feature_columns[0].name, 'a') - self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) - self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') - self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__) - self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__) - self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__) - - def test_crossed_column(self): - a = tf.feature_column.categorical_column_with_vocabulary_list( - 'a', vocabulary_list=['1', '2', '3']) - b = 
tf.feature_column.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']) - ab = tf.feature_column.crossed_column([a, b], hash_bucket_size=2) - cols = [tf.feature_column.indicator_column(ab)] - - orig_layer = df.DenseFeatures(cols) - config = orig_layer.get_config() - - new_layer = df.DenseFeatures.from_config(config) - - self.assertLen(new_layer._feature_columns, 1) - self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_from_config(self, trainable, name): + cols = [ + tf.feature_column.numeric_column("a"), + tf.feature_column.embedding_column( + tf.feature_column.categorical_column_with_vocabulary_list( + "b", vocabulary_list=["1", "2", "3"] + ), + dimension=2, + ), + tf.feature_column.indicator_column( + tf.feature_column.categorical_column_with_hash_bucket( + key="c", hash_bucket_size=3 + ) + ), + ] + orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + new_layer = df.DenseFeatures.from_config(config) + + self.assertEqual(new_layer.name, orig_layer.name) + self.assertEqual(new_layer.trainable, trainable) + self.assertLen(new_layer._feature_columns, 3) + self.assertEqual(new_layer._feature_columns[0].name, "a") + self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) + self.assertEqual( + new_layer._feature_columns[1].categorical_column.name, "b" + ) + self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__) + self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__) + self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__) + + def test_crossed_column(self): + a = tf.feature_column.categorical_column_with_vocabulary_list( + "a", vocabulary_list=["1", "2", "3"] + ) + b = tf.feature_column.categorical_column_with_vocabulary_list( + "b", vocabulary_list=["1", "2", "3"] + ) + ab = tf.feature_column.crossed_column([a, b], hash_bucket_size=2) + cols = [tf.feature_column.indicator_column(ab)] + + orig_layer = df.DenseFeatures(cols) + config = orig_layer.get_config() + + new_layer = df.DenseFeatures.from_config(config) + + self.assertLen(new_layer._feature_columns, 1) + self.assertEqual(new_layer._feature_columns[0].name, "a_X_b_indicator") + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class SequenceFeatureColumnsTest(tf.test.TestCase): - """Tests DenseFeatures with sequence feature columns.""" - - def test_embedding_column(self): - """Tests that error is raised for sequence embedding column.""" - vocabulary_size = 3 - sparse_input = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = tf.feature_column.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column_a = tf.feature_column.embedding_column( - categorical_column_a, dimension=2) - - input_layer = df.DenseFeatures([embedding_column_a]) - with self.assertRaisesRegex( - ValueError, - r'In embedding_column: aaa_embedding\. 
categorical_column must not be '
-        r'of type SequenceCategoricalColumn\.'):
-      _ = input_layer({'aaa': sparse_input})
-
-  def test_indicator_column(self):
-    """Tests that error is raised for sequence indicator column."""
-    vocabulary_size = 3
-    sparse_input = tf.compat.v1.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-
-    categorical_column_a = tf.feature_column.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = tf.feature_column.indicator_column(
-        categorical_column_a)
-
-    input_layer = df.DenseFeatures([indicator_column_a])
-    with self.assertRaisesRegex(
-        ValueError,
-        r'In indicator_column: aaa_indicator\. categorical_column must not be '
-        r'of type SequenceCategoricalColumn\.'):
-      _ = input_layer({'aaa': sparse_input})
-
-
-if __name__ == '__main__':
-  tf.test.main()
+    """Tests DenseFeatures with sequence feature columns."""
+
+    def test_embedding_column(self):
+        """Tests that error is raised for sequence embedding column."""
+        vocabulary_size = 3
+        sparse_input = tf.compat.v1.SparseTensorValue(
+            # example 0, ids [2]
+            # example 1, ids [0, 1]
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(2, 0, 1),
+            dense_shape=(2, 2),
+        )
+
+        categorical_column_a = (
+            tf.feature_column.sequence_categorical_column_with_identity(
+                key="aaa", num_buckets=vocabulary_size
+            )
+        )
+        embedding_column_a = tf.feature_column.embedding_column(
+            categorical_column_a, dimension=2
+        )
+
+        input_layer = df.DenseFeatures([embedding_column_a])
+        with self.assertRaisesRegex(
+            ValueError,
+            r"In embedding_column: aaa_embedding\. categorical_column must not "
+            r"be of type SequenceCategoricalColumn\.",
+        ):
+            _ = input_layer({"aaa": sparse_input})
+
+    def test_indicator_column(self):
+        """Tests that error is raised for sequence indicator column."""
+        vocabulary_size = 3
+        sparse_input = tf.compat.v1.SparseTensorValue(
+            # example 0, ids [2]
+            # example 1, ids [0, 1]
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(2, 0, 1),
+            dense_shape=(2, 2),
+        )
+
+        categorical_column_a = (
+            tf.feature_column.sequence_categorical_column_with_identity(
+                key="aaa", num_buckets=vocabulary_size
+            )
+        )
+        indicator_column_a = tf.feature_column.indicator_column(
+            categorical_column_a
+        )
+
+        input_layer = df.DenseFeatures([indicator_column_a])
+        with self.assertRaisesRegex(
+            ValueError,
+            r"In indicator_column: aaa_indicator\. categorical_column must not "
+            r"be of type SequenceCategoricalColumn\.",
+        ):
+            _ = input_layer({"aaa": sparse_input})
+
+
+if __name__ == "__main__":
+    tf.test.main()
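For reference, the behavior the shared-embedding tests above pin down is that `tf.feature_column.shared_embeddings` returns columns that all resolve to a single variable, so any number of `DenseFeatures` layers built from them add exactly one `aaa_bbb_shared_embedding` variable per graph. Below is a minimal sketch mirroring those tests, not part of the patch; it assumes graph mode (shared embeddings are unsupported in eager) and the `df` module alias the tests use:

```python
import tensorflow.compat.v2 as tf

from keras.feature_column import dense_features as df  # alias as in the tests

# shared_embeddings is graph-mode only, hence the explicit Graph context.
with tf.Graph().as_default():
    column_a = tf.feature_column.categorical_column_with_identity(
        key="aaa", num_buckets=3
    )
    column_b = tf.feature_column.categorical_column_with_identity(
        key="bbb", num_buckets=3
    )
    emb_a, emb_b = tf.feature_column.shared_embeddings(
        [column_a, column_b], dimension=2
    )
    features = {
        "aaa": tf.SparseTensor(
            indices=((0, 0),), values=(1,), dense_shape=(1, 1)
        ),
        "bbb": tf.SparseTensor(
            indices=((0, 0),), values=(2,), dense_shape=(1, 1)
        ),
    }
    # Two separate layer instances, but still exactly one embedding variable,
    # mirroring the assertions in the tests above.
    df.DenseFeatures([emb_a, emb_b])(features)
    df.DenseFeatures([emb_a, emb_b])(features)
    names = [
        v.name
        for v in tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES
        )
    ]
    assert names == ["aaa_bbb_shared_embedding:0"]
```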
diff --git a/keras/feature_column/dense_features_v2.py b/keras/feature_column/dense_features_v2.py
index 16259f78125a..f731d7163a94 100644
--- a/keras/feature_column/dense_features_v2.py
+++ b/keras/feature_column/dense_features_v2.py
@@ -19,140 +19,146 @@
 from __future__ import print_function
 
 import tensorflow.compat.v2 as tf
+
 from keras.feature_column import base_feature_layer as kfc
 from keras.feature_column import dense_features
 from keras.utils import tf_contextlib
+
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.layers.DenseFeatures', v1=[])
+@keras_export("keras.layers.DenseFeatures", v1=[])
 class DenseFeatures(dense_features.DenseFeatures):
-  """A layer that produces a dense `Tensor` based on given `feature_columns`.
-
-  Generally a single example in training data is described with FeatureColumns.
-  At the first layer of the model, this column oriented data should be converted
-  to a single `Tensor`.
-
-  This layer can be called multiple times with different features.
-
-  This is the V2 version of this layer that uses name_scopes to create
-  variables instead of variable_scopes. But this approach currently lacks
-  support for partitioned variables. In that case, use the V1 version instead.
-
-  Example:
-
-  ```python
-  price = tf.feature_column.numeric_column('price')
-  keywords_embedded = tf.feature_column.embedding_column(
-      tf.feature_column.categorical_column_with_hash_bucket("keywords", 10000),
-      dimensions=16)
-  columns = [price, keywords_embedded, ...]
-  feature_layer = tf.keras.layers.DenseFeatures(columns)
-
-  features = tf.io.parse_example(
-      ..., features=tf.feature_column.make_parse_example_spec(columns))
-  dense_tensor = feature_layer(features)
-  for units in [128, 64, 32]:
-    dense_tensor = tf.keras.layers.Dense(units, activation='relu')(dense_tensor)
-  prediction = tf.keras.layers.Dense(1)(dense_tensor)
-  ```
-  """
-
-  def __init__(self,
-               feature_columns,
-               trainable=True,
-               name=None,
-               **kwargs):
-    """Creates a DenseFeatures object.
-
-    Args:
-      feature_columns: An iterable containing the FeatureColumns to use as
-        inputs to your model. All items should be instances of classes derived
-        from `DenseColumn` such as `numeric_column`, `embedding_column`,
-        `bucketized_column`, `indicator_column`. If you have categorical
-        features, you can wrap them with an `embedding_column` or
-        `indicator_column`.
-      trainable: Boolean, whether the layer's variables will be updated via
-        gradient descent during training.
-      name: Name to give to the DenseFeatures.
-      **kwargs: Keyword arguments to construct a layer.
-
-    Raises:
-      ValueError: if an item in `feature_columns` is not a `DenseColumn`.
+    """A layer that produces a dense `Tensor` based on given `feature_columns`.
+
+    Generally a single example in training data is described with
+    FeatureColumns. At the first layer of the model, this column-oriented data
+    should be converted to a single `Tensor`.
+
+    This layer can be called multiple times with different features.
+
+    This is the V2 version of this layer that uses name_scopes to create
+    variables instead of variable_scopes. But this approach currently lacks
+    support for partitioned variables. In that case, use the V1 version instead.
+
+    Example:
+
+    ```python
+    price = tf.feature_column.numeric_column('price')
+    keywords_embedded = tf.feature_column.embedding_column(
+        tf.feature_column.categorical_column_with_hash_bucket("keywords",
+                                                              10000),
+        dimension=16)
+    columns = [price, keywords_embedded, ...]
+    feature_layer = tf.keras.layers.DenseFeatures(columns)
+
+    features = tf.io.parse_example(
+        ..., features=tf.feature_column.make_parse_example_spec(columns))
+    dense_tensor = feature_layer(features)
+    for units in [128, 64, 32]:
+        dense_tensor = tf.keras.layers.Dense(units, activation='relu')(
+            dense_tensor)
+    prediction = tf.keras.layers.Dense(1)(dense_tensor)
+    ```
     """
-    super().__init__(
-        feature_columns=feature_columns,
-        trainable=trainable,
-        name=name,
-        **kwargs)
-    self._state_manager = _StateManagerImplV2(self, self.trainable)
-
-  def build(self, _):
-    for column in self._feature_columns:
-      with tf.name_scope(column.name):
-        column.create_state(self._state_manager)
-      # We would like to call Layer.build and not _DenseFeaturesHelper.build.
-      # pylint: disable=protected-access
-      super(kfc._BaseFeaturesLayer, self).build(None)  # pylint: disable=bad-super-call
-
-
-class _StateManagerImplV2(tf.__internal__.feature_column.StateManager):  # pylint: disable=protected-access
-  """Manages the state of DenseFeatures."""
-
-  def create_variable(self,
-                      feature_column,
-                      name,
-                      shape,
-                      dtype=None,
-                      trainable=True,
-                      use_resource=True,
-                      initializer=None):
-    if name in self._cols_to_vars_map[feature_column]:
-      raise ValueError('Variable already exists.')
-
-    # We explicitly track these variables since `name` is not guaranteed to be
-    # unique and disable manual tracking that the add_weight call does.
-    with no_manual_dependency_tracking_scope(self._layer):
-      var = self._layer.add_weight(
-          name=name,
-          shape=shape,
-          dtype=dtype,
-          initializer=initializer,
-          trainable=self._trainable and trainable,
-          use_resource=use_resource)
-    if isinstance(var, tf.__internal__.tracking.Trackable):
-      self._layer._track_trackable(var, feature_column.name + '/' + name)  # pylint: disable=protected-access
-    self._cols_to_vars_map[feature_column][name] = var
-    return var
+
+    def __init__(self, feature_columns, trainable=True, name=None, **kwargs):
+        """Creates a DenseFeatures object.
+
+        Args:
+          feature_columns: An iterable containing the FeatureColumns to use as
+            inputs to your model. All items should be instances of classes
+            derived from `DenseColumn` such as `numeric_column`,
+            `embedding_column`, `bucketized_column`, `indicator_column`. If you
+            have categorical features, you can wrap them with an
+            `embedding_column` or `indicator_column`.
+          trainable: Boolean, whether the layer's variables will be updated via
+            gradient descent during training.
+          name: Name to give to the DenseFeatures.
+          **kwargs: Keyword arguments to construct a layer.
+
+        Raises:
+          ValueError: if an item in `feature_columns` is not a `DenseColumn`.
+        """
+        super().__init__(
+            feature_columns=feature_columns,
+            trainable=trainable,
+            name=name,
+            **kwargs
+        )
+        self._state_manager = _StateManagerImplV2(self, self.trainable)
+
+    def build(self, _):
+        for column in self._feature_columns:
+            with tf.name_scope(column.name):
+                column.create_state(self._state_manager)
+        # Call Layer.build directly rather than _DenseFeaturesHelper.build.
+
+        super(kfc._BaseFeaturesLayer, self).build(None)
+
+
+class _StateManagerImplV2(tf.__internal__.feature_column.StateManager):
+    """Manages the state of DenseFeatures."""
+
+    def create_variable(
+        self,
+        feature_column,
+        name,
+        shape,
+        dtype=None,
+        trainable=True,
+        use_resource=True,
+        initializer=None,
+    ):
+        if name in self._cols_to_vars_map[feature_column]:
+            raise ValueError("Variable already exists.")
+
+        # `name` is not guaranteed to be unique across columns, so suppress
+        # the automatic tracking that `add_weight` does and track the variable
+        # explicitly under a column-qualified name below.
+        with no_manual_dependency_tracking_scope(self._layer):
+            var = self._layer.add_weight(
+                name=name,
+                shape=shape,
+                dtype=dtype,
+                initializer=initializer,
+                trainable=self._trainable and trainable,
+                use_resource=use_resource,
+            )
+        if isinstance(var, tf.__internal__.tracking.Trackable):
+            self._layer._track_trackable(var, feature_column.name + "/" + name)
+        self._cols_to_vars_map[feature_column][name] = var
+        return var
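The bookkeeping in `create_variable` above is worth spelling out: a bare `name` such as `"embedding_weights"` can repeat across feature columns, so automatic tracking is suppressed during `add_weight` and the variable is re-registered under the column-qualified key `feature_column.name + "/" + name`. Here is a Keras-free toy sketch of that one-variable-per-(column, name) pattern; the class and names are hypothetical, purely for illustration:

```python
class ToyStateManager:
    """Hypothetical stand-in: stores one value per (column, name) pair."""

    def __init__(self):
        # {column_name: {var_name: value}}, like _cols_to_vars_map above.
        self._cols_to_vars_map = {}

    def create_variable(self, column_name, name, value):
        per_column = self._cols_to_vars_map.setdefault(column_name, {})
        if name in per_column:
            raise ValueError("Variable already exists.")
        # Key by column *and* name: two columns may both call their
        # variable "embedding_weights" without colliding.
        per_column[name] = value
        return value


mgr = ToyStateManager()
mgr.create_variable("aaa_embedding", "embedding_weights", [[1.0, 2.0]])
mgr.create_variable("bbb_embedding", "embedding_weights", [[3.0, 4.0]])  # ok
try:
    mgr.create_variable("aaa_embedding", "embedding_weights", [[0.0, 0.0]])
except ValueError:
    pass  # duplicate (column, name) pairs are rejected
```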
 
 
 @tf_contextlib.contextmanager
 def no_manual_dependency_tracking_scope(obj):
-  """A context that disables manual dependency tracking for the given `obj`.
-
-  Sometimes library methods might track objects on their own and we might want
-  to disable that and do the tracking on our own. One can then use this context
-  manager to disable the tracking the library method does and do your own
-  tracking.
-
-  For example:
-
-  class TestLayer(tf.keras.Layer):
-    def build():
-      with no_manual_dependency_tracking_scope(self):
-        var = self.add_weight("name1")  # Creates a var and doesn't track it
-        self._track_trackable("name2", var)  # We track variable with name `name2`
-
-  Args:
-    obj: A trackable object.
-
-  Yields:
-    a scope in which the object doesn't track dependencies manually.
-  """
-  # pylint: disable=protected-access
-  previous_value = getattr(obj, '_manual_tracking', True)
-  obj._manual_tracking = False
-  try:
-    yield
-  finally:
-    obj._manual_tracking = previous_value
+    """A context that disables manual dependency tracking for the given `obj`.
+
+    Some library methods (for example, `add_weight`) track the objects they
+    create on their own. Use this context manager to suppress that automatic
+    tracking when you want to register the object yourself.
+
+    For example:
+
+    class TestLayer(tf.keras.layers.Layer):
+      def build(self):
+        with no_manual_dependency_tracking_scope(self):
+          var = self.add_weight("name1")  # Creates a var and doesn't track it
+          # We track the variable under the name `name2`
+          self._track_trackable("name2", var)
+
+    Args:
+      obj: A trackable object.
+
+    Yields:
+      A scope in which the object doesn't track dependencies manually.
+    """
+
+    previous_value = getattr(obj, "_manual_tracking", True)
+    obj._manual_tracking = False
+    try:
+        yield
+    finally:
+        obj._manual_tracking = previous_value
diff --git a/keras/feature_column/dense_features_v2_test.py b/keras/feature_column/dense_features_v2_test.py
index d0b2ab342075..d984fced6ba8 100644
--- a/keras/feature_column/dense_features_v2_test.py
+++ b/keras/feature_column/dense_features_v2_test.py
@@ -18,638 +18,790 @@
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 import tensorflow.compat.v2 as tf
 
-import numpy as np
-from tensorflow.python.eager import backprop
-from keras.testing_infra import test_combinations
 from keras.feature_column import dense_features_v2 as df
+from keras.testing_infra import test_combinations
+
+# isort: off
+from tensorflow.python.eager import backprop
 
 
 def _initialized_session(config=None):
-  sess = tf.compat.v1.Session(config=config)
-  sess.run(tf.compat.v1.global_variables_initializer())
-  sess.run(tf.compat.v1.tables_initializer())
-  return sess
+    sess = tf.compat.v1.Session(config=config)
+    sess.run(tf.compat.v1.global_variables_initializer())
+    sess.run(tf.compat.v1.tables_initializer())
+    return sess
 
 
 class DenseFeaturesTest(test_combinations.TestCase):
-
-  @test_combinations.generate(
-      test_combinations.combine(mode=['graph', 'eager']))
-  def test_retrieving_input(self):
-    features = {'a': [0.]}
-    dense_features = df.DenseFeatures(tf.feature_column.numeric_column('a'))
-    inputs = self.evaluate(dense_features(features))
-    self.assertAllClose([[0.]], inputs)
-
-  @test_combinations.generate(test_combinations.combine(mode=['eager']))
-  def test_reuses_variables(self):
-    sparse_input = tf.SparseTensor(
-        indices=((0, 0), (1, 0), (2, 0)),
-        values=(0, 1, 2),
-        dense_shape=(3, 3))
-
-    # Create feature columns (categorical and embedding).
- categorical_column = tf.feature_column.categorical_column_with_identity( - key='a', num_buckets=3) - embedding_dimension = 2 - - def _embedding_column_initializer(shape, dtype, partition_info=None): - del shape # unused - del dtype # unused - del partition_info # unused - embedding_values = ( - (1, 0), # id 0 - (0, 1), # id 1 - (1, 1)) # id 2 - return embedding_values - - embedding_column = tf.feature_column.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_embedding_column_initializer) - - dense_features = df.DenseFeatures([embedding_column]) - features = {'a': sparse_input} - - inputs = dense_features(features) - variables = dense_features.variables - - # Sanity check: test that the inputs are correct. - self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs) - - # Check that only one variable was created. - self.assertEqual(1, len(variables)) - - # Check that invoking dense_features on the same features does not create - # additional variables - _ = dense_features(features) - self.assertEqual(1, len(variables)) - self.assertIs(variables[0], dense_features.variables[0]) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_feature_column_dense_features_gradient(self): - sparse_input = tf.SparseTensor( - indices=((0, 0), (1, 0), (2, 0)), - values=(0, 1, 2), - dense_shape=(3, 3)) - - # Create feature columns (categorical and embedding). - categorical_column = tf.feature_column.categorical_column_with_identity( - key='a', num_buckets=3) - embedding_dimension = 2 - - def _embedding_column_initializer(shape, dtype, partition_info=None): - del shape # unused - del dtype # unused - del partition_info # unused - embedding_values = ( - (1, 0), # id 0 - (0, 1), # id 1 - (1, 1)) # id 2 - return embedding_values - - embedding_column = tf.feature_column.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_embedding_column_initializer) - - dense_features = df.DenseFeatures([embedding_column]) - features = {'a': sparse_input} - - def scale_matrix(): - matrix = dense_features(features) - return 2 * matrix - - # Sanity check: Verify that scale_matrix returns the correct output. - self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix()) - - # Check that the returned gradient is correct. - grad_function = backprop.implicit_grad(scale_matrix) - grads_and_vars = grad_function() - indexed_slice = grads_and_vars[0][0] - gradient = grads_and_vars[0][0].values - - self.assertAllEqual([0, 1, 2], indexed_slice.indices) - self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient) - - def test_dense_feature_with_training_arg(self): - price1 = tf.feature_column.numeric_column('price1', shape=2) - price2 = tf.feature_column.numeric_column('price2') - - # Monkey patch the second numeric column to simulate a column that has - # different behavior by mode. 
- def training_aware_get_dense_tensor(transformation_cache, - state_manager, - training=None): - return transformation_cache.get(price2, state_manager, training=training) - - def training_aware_transform_feature(transformation_cache, - state_manager, - training=None): - input_tensor = transformation_cache.get( - price2.key, state_manager, training=training) - if training: - return input_tensor * 10.0 - else: - return input_tensor * 20.0 - - price2.get_dense_tensor = training_aware_get_dense_tensor - price2.transform_feature = training_aware_transform_feature - with tf.Graph().as_default(): - features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} - train_mode = df.DenseFeatures([price1, price2])(features, training=True) - predict_mode = df.DenseFeatures([price1, price2 - ])(features, training=False) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2., 30.], [5., 6., 40.]], - self.evaluate(train_mode)) - self.assertAllClose([[1., 2., 60.], [5., 6., 80.]], - self.evaluate(predict_mode)) - - def test_raises_if_empty_feature_columns(self): - with self.assertRaisesRegex(ValueError, - 'feature_columns must not be empty'): - df.DenseFeatures(feature_columns=[])(features={}) - - def test_should_be_dense_column(self): - with self.assertRaisesRegex(ValueError, 'must be a .*DenseColumn'): - df.DenseFeatures(feature_columns=[ - tf.feature_column.categorical_column_with_hash_bucket('wire_cast', 4) - ])( - features={ - 'a': [[0]] - }) - - def test_does_not_support_dict_columns(self): - with self.assertRaisesRegex( - ValueError, 'Expected feature_columns to be iterable, found dict.'): - df.DenseFeatures(feature_columns={'a': tf.feature_column.numeric_column('a')})( - features={ - 'a': [[0]] - }) - - def test_bare_column(self): - with tf.Graph().as_default(): - features = features = {'a': [0.]} - net = df.DenseFeatures(tf.feature_column.numeric_column('a'))(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[0.]], self.evaluate(net)) - - def test_column_generator(self): - with tf.Graph().as_default(): - features = features = {'a': [0.], 'b': [1.]} - columns = (tf.feature_column.numeric_column(key) for key in features) - net = df.DenseFeatures(columns)(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[0., 1.]], self.evaluate(net)) - - def test_raises_if_duplicate_name(self): - with self.assertRaisesRegex( - ValueError, 'Duplicate feature column name found for columns'): - df.DenseFeatures( - feature_columns=[tf.feature_column.numeric_column('a'), - tf.feature_column.numeric_column('a')])( - features={ - 'a': [[0]] - }) - - def test_one_column(self): - price = tf.feature_column.numeric_column('price') - with tf.Graph().as_default(): - features = {'price': [[1.], [5.]]} - net = df.DenseFeatures([price])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1.], [5.]], self.evaluate(net)) - - def test_multi_dimension(self): - price = tf.feature_column.numeric_column('price', shape=2) - with tf.Graph().as_default(): - features = {'price': [[1., 2.], [5., 6.]]} - net = df.DenseFeatures([price])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - 
- self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net)) - - def test_compute_output_shape(self): - price1 = tf.feature_column.numeric_column('price1', shape=2) - price2 = tf.feature_column.numeric_column('price2', shape=4) - with tf.Graph().as_default(): - features = { - 'price1': [[1., 2.], [5., 6.]], - 'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]] - } - dense_features = df.DenseFeatures([price1, price2]) - self.assertEqual((None, 6), dense_features.compute_output_shape((None,))) - net = dense_features(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]], - self.evaluate(net)) - - def test_raises_if_shape_mismatch(self): - price = tf.feature_column.numeric_column('price', shape=2) - with tf.Graph().as_default(): - features = {'price': [[1.], [5.]]} - with self.assertRaisesRegex( - Exception, - r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'): - df.DenseFeatures([price])(features) - - def test_reshaping(self): - price = tf.feature_column.numeric_column('price', shape=[1, 2]) - with tf.Graph().as_default(): - features = {'price': [[[1., 2.]], [[5., 6.]]]} - net = df.DenseFeatures([price])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net)) - - def test_multi_column(self): - price1 = tf.feature_column.numeric_column('price1', shape=2) - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} - net = df.DenseFeatures([price1, price2])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net)) - - def test_cols_to_output_tensors(self): - price1 = tf.feature_column.numeric_column('price1', shape=2) - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - cols_dict = {} - features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} - dense_features = df.DenseFeatures([price1, price2]) - net = dense_features(features, cols_dict) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 2.], [5., 6.]], - self.evaluate(cols_dict[price1])) - self.assertAllClose([[3.], [4.]], self.evaluate(cols_dict[price2])) - self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net)) - - def test_column_order(self): - price_a = tf.feature_column.numeric_column('price_a') - price_b = tf.feature_column.numeric_column('price_b') - with tf.Graph().as_default(): - features = { - 'price_a': [[1.]], - 'price_b': [[3.]], - } - net1 = df.DenseFeatures([price_a, price_b])(features) - net2 = df.DenseFeatures([price_b, price_a])(features) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[1., 3.]], self.evaluate(net1)) - self.assertAllClose([[1., 3.]], self.evaluate(net2)) - - def test_fails_for_categorical_column(self): - animal = tf.feature_column.categorical_column_with_identity('animal', num_buckets=4) - with tf.Graph().as_default(): - features = { - 'animal': - tf.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } - with 
self.assertRaisesRegex(Exception, 'must be a .*DenseColumn'): - df.DenseFeatures([animal])(features) - - def test_static_batch_size_mismatch(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = { - 'price1': [[1.], [5.], [7.]], # batchsize = 3 - 'price2': [[3.], [4.]] # batchsize = 2 - } - with self.assertRaisesRegex( - ValueError, - r'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string - df.DenseFeatures([price1, price2])(features) - - def test_subset_of_static_batch_size_mismatch(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - price3 = tf.feature_column.numeric_column('price3') - with tf.Graph().as_default(): - features = { - 'price1': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 3 - 'price2': [[3.], [4.]], # batchsize = 2 - 'price3': [[3.], [4.], [5.]] # batchsize = 3 - } - with self.assertRaisesRegex( - ValueError, - r'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string - df.DenseFeatures([price1, price2, price3])(features) - - def test_runtime_batch_size_mismatch(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = { - 'price1': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 3 - 'price2': [[3.], [4.]] # batchsize = 2 - } - net = df.DenseFeatures([price1, price2])(features) - with _initialized_session() as sess: - with self.assertRaisesRegex(tf.errors.OpError, - 'Dimension 0 in both shapes must be equal|' - 'Dimensions of inputs should match'): - sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]}) - - def test_runtime_batch_size_matches(self): - price1 = tf.feature_column.numeric_column('price1') - price2 = tf.feature_column.numeric_column('price2') - with tf.Graph().as_default(): - features = { - 'price1': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 2 - 'price2': tf.compat.v1.placeholder(dtype=tf.int64), # batchsize = 2 - } - net = df.DenseFeatures([price1, price2])(features) - with _initialized_session() as sess: - sess.run( - net, - feed_dict={ - features['price1']: [[1.], [5.]], - features['price2']: [[1.], [5.]], - }) - - def test_multiple_layers_with_same_embedding_column(self): - some_sparse_column = tf.feature_column.categorical_column_with_hash_bucket( - 'sparse_feature', hash_bucket_size=5) - some_embedding_column = tf.feature_column.embedding_column( - some_sparse_column, dimension=10) - - with tf.Graph().as_default(): - features = { - 'sparse_feature': [['a'], ['x']], - } - all_cols = [some_embedding_column] - df.DenseFeatures(all_cols)(features) - df.DenseFeatures(all_cols)(features) - # Make sure that 2 variables get created in this case. 
- self.assertEqual(2, - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - expected_var_names = [ - 'dense_features/sparse_feature_embedding/embedding_weights:0', - 'dense_features_1/sparse_feature_embedding/embedding_weights:0' - ] - self.assertItemsEqual( - expected_var_names, - [v.name for v in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)]) - - def test_multiple_layers_with_same_shared_embedding_column(self): - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=3) - categorical_column_b = tf.feature_column.categorical_column_with_identity( - key='bbb', num_buckets=3) - embedding_dimension = 2 - - # feature_column.shared_embeddings is not supported in eager. - with tf.Graph().as_default(): - embedding_column_b, embedding_column_a = tf.feature_column.shared_embeddings( - [categorical_column_b, categorical_column_a], - dimension=embedding_dimension) - features = { - 'aaa': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 1, 0), - dense_shape=(2, 2)), - 'bbb': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 1), - dense_shape=(2, 2)), - } - all_cols = [embedding_column_a, embedding_column_b] - df.DenseFeatures(all_cols)(features) - df.DenseFeatures(all_cols)(features) - # Make sure that only 1 variable gets created in this case. - self.assertEqual(1, - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - self.assertItemsEqual( - ['aaa_bbb_shared_embedding:0'], - [v.name for v in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)]) - - def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self): - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=3) - categorical_column_b = tf.feature_column.categorical_column_with_identity( - key='bbb', num_buckets=3) - embedding_dimension = 2 - - # feature_column.shared_embeddings is not supported in eager. - with tf.Graph().as_default(): - embedding_column_b, embedding_column_a = tf.feature_column.shared_embeddings( - [categorical_column_b, categorical_column_a], - dimension=embedding_dimension) - all_cols = [embedding_column_a, embedding_column_b] - features = { - 'aaa': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 1, 0), - dense_shape=(2, 2)), - 'bbb': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 1), - dense_shape=(2, 2)), - } - df.DenseFeatures(all_cols)(features) - # Make sure that only 1 variable gets created in this case. - self.assertEqual(1, - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - - with tf.Graph().as_default(): - features1 = { - 'aaa': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 1, 0), - dense_shape=(2, 2)), - 'bbb': - tf.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 1), - dense_shape=(2, 2)), - } - - df.DenseFeatures(all_cols)(features1) - # Make sure that only 1 variable gets created in this case. - self.assertEqual(1, - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES))) - self.assertItemsEqual( - ['aaa_bbb_shared_embedding:0'], - [v.name for v in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)]) - - def test_with_1d_sparse_tensor(self): - embedding_values = ( - (1., 2., 3., 4., 5.), # id 0 - (6., 7., 8., 9., 10.), # id 1 - (11., 12., 13., 14., 15.) 
# id 2 + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) ) - - def _initializer(shape, dtype, partition_info=None): - del shape, dtype, partition_info - return embedding_values - - # price has 1 dimension in dense_features - price = tf.feature_column.numeric_column('price') - - # one_hot_body_style has 3 dims in dense_features. - body_style = tf.feature_column.categorical_column_with_vocabulary_list( - 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) - one_hot_body_style = tf.feature_column.indicator_column(body_style) - - # embedded_body_style has 5 dims in dense_features. - country = tf.feature_column.categorical_column_with_vocabulary_list( - 'country', vocabulary_list=['US', 'JP', 'CA']) - embedded_country = tf.feature_column.embedding_column( - country, dimension=5, initializer=_initializer) - - with tf.Graph().as_default(): - # Provides 1-dim tensor and dense tensor. - features = { - 'price': - tf.constant([ - 11., - 12., - ]), - 'body-style': - tf.SparseTensor( - indices=((0,), (1,)), - values=('sedan', 'hardtop'), - dense_shape=(2,)), - # This is dense tensor for the categorical_column. - 'country': - tf.constant(['CA', 'US']), - } - self.assertEqual(1, features['price'].shape.ndims) - self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0]) - self.assertEqual(1, features['country'].shape.ndims) - - net = df.DenseFeatures([price, one_hot_body_style, embedded_country])( - features) - self.assertEqual(1 + 3 + 5, net.shape[1]) - with _initialized_session() as sess: - - # Each row is formed by concatenating `embedded_body_style`, - # `one_hot_body_style`, and `price` in order. - self.assertAllEqual([[0., 0., 1., 11., 12., 13., 14., 15., 11.], - [1., 0., 0., 1., 2., 3., 4., 5., 12.]], - sess.run(net)) - - def test_with_1d_unknown_shape_sparse_tensor(self): - embedding_values = ( - (1., 2.), # id 0 - (6., 7.), # id 1 - (11., 12.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - del shape, dtype, partition_info - return embedding_values - - # price has 1 dimension in dense_features - price = tf.feature_column.numeric_column('price') - - # one_hot_body_style has 3 dims in dense_features. - body_style = tf.feature_column.categorical_column_with_vocabulary_list( - 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) - one_hot_body_style = tf.feature_column.indicator_column(body_style) - - # embedded_body_style has 5 dims in dense_features. - country = tf.feature_column.categorical_column_with_vocabulary_list( - 'country', vocabulary_list=['US', 'JP', 'CA']) - embedded_country = tf.feature_column.embedding_column( - country, dimension=2, initializer=_initializer) - - # Provides 1-dim tensor and dense tensor. - with tf.Graph().as_default(): - features = { - 'price': tf.compat.v1.placeholder(tf.float32), - 'body-style': tf.compat.v1.sparse_placeholder(tf.string), - # This is dense tensor for the categorical_column. 
- 'country': tf.compat.v1.placeholder(tf.string), - } - self.assertIsNone(features['price'].shape.ndims) - self.assertIsNone(features['body-style'].get_shape().ndims) - self.assertIsNone(features['country'].shape.ndims) - - price_data = np.array([11., 12.]) - body_style_data = tf.compat.v1.SparseTensorValue( - indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,)) - country_data = np.array([['US'], ['CA']]) - - net = df.DenseFeatures([price, one_hot_body_style, embedded_country])( - features) - self.assertEqual(1 + 3 + 2, net.shape[1]) - with _initialized_session() as sess: - - # Each row is formed by concatenating `embedded_body_style`, - # `one_hot_body_style`, and `price` in order. - self.assertAllEqual( - [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]], - sess.run( - net, - feed_dict={ - features['price']: price_data, - features['body-style']: body_style_data, - features['country']: country_data - })) - - def test_with_rank_0_feature(self): - # price has 1 dimension in dense_features - price = tf.feature_column.numeric_column('price') - features = { - 'price': tf.constant(0), - } - self.assertEqual(0, features['price'].shape.ndims) - - # Static rank 0 should fail - with self.assertRaisesRegex(ValueError, 'Feature .* cannot have rank 0'): - df.DenseFeatures([price])(features) - - with tf.Graph().as_default(): - # Dynamic rank 0 should fail - features = { - 'price': tf.compat.v1.placeholder(tf.float32), - } - net = df.DenseFeatures([price])(features) - self.assertEqual(1, net.shape[1]) - with _initialized_session() as sess: - with self.assertRaisesOpError('Feature .* cannot have rank 0'): - sess.run(net, feed_dict={features['price']: np.array(1)}) - - -if __name__ == '__main__': - tf.test.main() + def test_retrieving_input(self): + features = {"a": [0.0]} + dense_features = df.DenseFeatures(tf.feature_column.numeric_column("a")) + inputs = self.evaluate(dense_features(features)) + self.assertAllClose([[0.0]], inputs) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_reuses_variables(self): + sparse_input = tf.SparseTensor( + indices=((0, 0), (1, 0), (2, 0)), + values=(0, 1, 2), + dense_shape=(3, 3), + ) + + # Create feature columns (categorical and embedding). + categorical_column = tf.feature_column.categorical_column_with_identity( + key="a", num_buckets=3 + ) + embedding_dimension = 2 + + def _embedding_column_initializer(shape, dtype, partition_info=None): + del shape # unused + del dtype # unused + del partition_info # unused + embedding_values = ((1, 0), (0, 1), (1, 1)) # id 0 # id 1 # id 2 + return embedding_values + + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_embedding_column_initializer, + ) + + dense_features = df.DenseFeatures([embedding_column]) + features = {"a": sparse_input} + + inputs = dense_features(features) + variables = dense_features.variables + + # Sanity check: test that the inputs are correct. + self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs) + + # Check that only one variable was created. 
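+ # The lone embedding column contributes exactly one embedding weights variable.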
+ self.assertEqual(1, len(variables)) + + # Check that invoking dense_features on the same features does not + # create additional variables + _ = dense_features(features) + self.assertEqual(1, len(variables)) + self.assertIs(variables[0], dense_features.variables[0]) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_feature_column_dense_features_gradient(self): + sparse_input = tf.SparseTensor( + indices=((0, 0), (1, 0), (2, 0)), + values=(0, 1, 2), + dense_shape=(3, 3), + ) + + # Create feature columns (categorical and embedding). + categorical_column = tf.feature_column.categorical_column_with_identity( + key="a", num_buckets=3 + ) + embedding_dimension = 2 + + def _embedding_column_initializer(shape, dtype, partition_info=None): + del shape # unused + del dtype # unused + del partition_info # unused + embedding_values = ((1, 0), (0, 1), (1, 1)) # id 0 # id 1 # id 2 + return embedding_values + + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_embedding_column_initializer, + ) + + dense_features = df.DenseFeatures([embedding_column]) + features = {"a": sparse_input} + + def scale_matrix(): + matrix = dense_features(features) + return 2 * matrix + + # Sanity check: Verify that scale_matrix returns the correct output. + self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix()) + + # Check that the returned gradient is correct. + grad_function = backprop.implicit_grad(scale_matrix) + grads_and_vars = grad_function() + indexed_slice = grads_and_vars[0][0] + gradient = grads_and_vars[0][0].values + + self.assertAllEqual([0, 1, 2], indexed_slice.indices) + self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient) + + def test_dense_feature_with_training_arg(self): + price1 = tf.feature_column.numeric_column("price1", shape=2) + price2 = tf.feature_column.numeric_column("price2") + + # Monkey patch the second numeric column to simulate a column that has + # different behavior by mode. 
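+ # The patched transform below multiplies inputs by 10.0 in training mode and by 20.0 otherwise.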
+ def training_aware_get_dense_tensor( + transformation_cache, state_manager, training=None + ): + return transformation_cache.get( + price2, state_manager, training=training + ) + + def training_aware_transform_feature( + transformation_cache, state_manager, training=None + ): + input_tensor = transformation_cache.get( + price2.key, state_manager, training=training + ) + if training: + return input_tensor * 10.0 + else: + return input_tensor * 20.0 + + price2.get_dense_tensor = training_aware_get_dense_tensor + price2.transform_feature = training_aware_transform_feature + with tf.Graph().as_default(): + features = { + "price1": [[1.0, 2.0], [5.0, 6.0]], + "price2": [[3.0], [4.0]], + } + train_mode = df.DenseFeatures([price1, price2])( + features, training=True + ) + predict_mode = df.DenseFeatures([price1, price2])( + features, training=False + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose( + [[1.0, 2.0, 30.0], [5.0, 6.0, 40.0]], self.evaluate(train_mode) + ) + self.assertAllClose( + [[1.0, 2.0, 60.0], [5.0, 6.0, 80.0]], + self.evaluate(predict_mode), + ) + + def test_raises_if_empty_feature_columns(self): + with self.assertRaisesRegex( + ValueError, "feature_columns must not be empty" + ): + df.DenseFeatures(feature_columns=[])(features={}) + + def test_should_be_dense_column(self): + with self.assertRaisesRegex(ValueError, "must be a .*DenseColumn"): + df.DenseFeatures( + feature_columns=[ + tf.feature_column.categorical_column_with_hash_bucket( + "wire_cast", 4 + ) + ] + )(features={"a": [[0]]}) + + def test_does_not_support_dict_columns(self): + with self.assertRaisesRegex( + ValueError, "Expected feature_columns to be iterable, found dict." + ): + df.DenseFeatures( + feature_columns={"a": tf.feature_column.numeric_column("a")} + )(features={"a": [[0]]}) + + def test_bare_column(self): + with tf.Graph().as_default(): + features = features = {"a": [0.0]} + net = df.DenseFeatures(tf.feature_column.numeric_column("a"))( + features + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[0.0]], self.evaluate(net)) + + def test_column_generator(self): + with tf.Graph().as_default(): + features = features = {"a": [0.0], "b": [1.0]} + columns = ( + tf.feature_column.numeric_column(key) for key in features + ) + net = df.DenseFeatures(columns)(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[0.0, 1.0]], self.evaluate(net)) + + def test_raises_if_duplicate_name(self): + with self.assertRaisesRegex( + ValueError, "Duplicate feature column name found for columns" + ): + df.DenseFeatures( + feature_columns=[ + tf.feature_column.numeric_column("a"), + tf.feature_column.numeric_column("a"), + ] + )(features={"a": [[0]]}) + + def test_one_column(self): + price = tf.feature_column.numeric_column("price") + with tf.Graph().as_default(): + features = {"price": [[1.0], [5.0]]} + net = df.DenseFeatures([price])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0], [5.0]], self.evaluate(net)) + + def test_multi_dimension(self): + price = tf.feature_column.numeric_column("price", shape=2) + with tf.Graph().as_default(): + features = {"price": [[1.0, 2.0], [5.0, 6.0]]} + net = df.DenseFeatures([price])(features) + + 
self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0, 2.0], [5.0, 6.0]], self.evaluate(net)) + + def test_compute_output_shape(self): + price1 = tf.feature_column.numeric_column("price1", shape=2) + price2 = tf.feature_column.numeric_column("price2", shape=4) + with tf.Graph().as_default(): + features = { + "price1": [[1.0, 2.0], [5.0, 6.0]], + "price2": [[3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 9.0, 10.0]], + } + dense_features = df.DenseFeatures([price1, price2]) + self.assertEqual( + (None, 6), dense_features.compute_output_shape((None,)) + ) + net = dense_features(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose( + [ + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + [5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ], + self.evaluate(net), + ) + + def test_raises_if_shape_mismatch(self): + price = tf.feature_column.numeric_column("price", shape=2) + with tf.Graph().as_default(): + features = {"price": [[1.0], [5.0]]} + with self.assertRaisesRegex( + Exception, + r"Cannot reshape a tensor with 2 elements to shape \[2,2\]", + ): + df.DenseFeatures([price])(features) + + def test_reshaping(self): + price = tf.feature_column.numeric_column("price", shape=[1, 2]) + with tf.Graph().as_default(): + features = {"price": [[[1.0, 2.0]], [[5.0, 6.0]]]} + net = df.DenseFeatures([price])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0, 2.0], [5.0, 6.0]], self.evaluate(net)) + + def test_multi_column(self): + price1 = tf.feature_column.numeric_column("price1", shape=2) + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": [[1.0, 2.0], [5.0, 6.0]], + "price2": [[3.0], [4.0]], + } + net = df.DenseFeatures([price1, price2])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose( + [[1.0, 2.0, 3.0], [5.0, 6.0, 4.0]], self.evaluate(net) + ) + + def test_cols_to_output_tensors(self): + price1 = tf.feature_column.numeric_column("price1", shape=2) + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + cols_dict = {} + features = { + "price1": [[1.0, 2.0], [5.0, 6.0]], + "price2": [[3.0], [4.0]], + } + dense_features = df.DenseFeatures([price1, price2]) + net = dense_features(features, cols_dict) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose( + [[1.0, 2.0], [5.0, 6.0]], self.evaluate(cols_dict[price1]) + ) + self.assertAllClose( + [[3.0], [4.0]], self.evaluate(cols_dict[price2]) + ) + self.assertAllClose( + [[1.0, 2.0, 3.0], [5.0, 6.0, 4.0]], self.evaluate(net) + ) + + def test_column_order(self): + price_a = tf.feature_column.numeric_column("price_a") + price_b = tf.feature_column.numeric_column("price_b") + with tf.Graph().as_default(): + features = { + "price_a": [[1.0]], + "price_b": [[3.0]], + } + net1 = df.DenseFeatures([price_a, price_b])(features) + net2 = df.DenseFeatures([price_b, price_a])(features) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertAllClose([[1.0, 3.0]], self.evaluate(net1)) + self.assertAllClose([[1.0, 3.0]], self.evaluate(net2)) + + def test_fails_for_categorical_column(self): + 
animal = tf.feature_column.categorical_column_with_identity( + "animal", num_buckets=4 + ) + with tf.Graph().as_default(): + features = { + "animal": tf.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2] + ) + } + with self.assertRaisesRegex(Exception, "must be a .*DenseColumn"): + df.DenseFeatures([animal])(features) + + def test_static_batch_size_mismatch(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": [[1.0], [5.0], [7.0]], # batchsize = 3 + "price2": [[3.0], [4.0]], # batchsize = 2 + } + with self.assertRaisesRegex( + ValueError, + r"Batch size \(first dimension\) of each feature must be same.", + ): + df.DenseFeatures([price1, price2])(features) + + def test_subset_of_static_batch_size_mismatch(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + price3 = tf.feature_column.numeric_column("price3") + with tf.Graph().as_default(): + features = { + "price1": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 3 + "price2": [[3.0], [4.0]], # batchsize = 2 + "price3": [[3.0], [4.0], [5.0]], # batchsize = 3 + } + with self.assertRaisesRegex( + ValueError, + r"Batch size \(first dimension\) of each feature must be same.", + ): + df.DenseFeatures([price1, price2, price3])(features) + + def test_runtime_batch_size_mismatch(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 3 + "price2": [[3.0], [4.0]], # batchsize = 2 + } + net = df.DenseFeatures([price1, price2])(features) + with _initialized_session() as sess: + with self.assertRaisesRegex( + tf.errors.OpError, + "Dimension 0 in both shapes must be equal|" + "Dimensions of inputs should match", + ): + sess.run( + net, + feed_dict={features["price1"]: [[1.0], [5.0], [7.0]]}, + ) + + def test_runtime_batch_size_matches(self): + price1 = tf.feature_column.numeric_column("price1") + price2 = tf.feature_column.numeric_column("price2") + with tf.Graph().as_default(): + features = { + "price1": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 2 + "price2": tf.compat.v1.placeholder( + dtype=tf.int64 + ), # batchsize = 2 + } + net = df.DenseFeatures([price1, price2])(features) + with _initialized_session() as sess: + sess.run( + net, + feed_dict={ + features["price1"]: [[1.0], [5.0]], + features["price2"]: [[1.0], [5.0]], + }, + ) + + def test_multiple_layers_with_same_embedding_column(self): + some_sparse_column = ( + tf.feature_column.categorical_column_with_hash_bucket( + "sparse_feature", hash_bucket_size=5 + ) + ) + some_embedding_column = tf.feature_column.embedding_column( + some_sparse_column, dimension=10 + ) + + with tf.Graph().as_default(): + features = { + "sparse_feature": [["a"], ["x"]], + } + all_cols = [some_embedding_column] + df.DenseFeatures(all_cols)(features) + df.DenseFeatures(all_cols)(features) + # Make sure that 2 variables get created in this case. 
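+ # Each DenseFeatures layer builds its own copy of the embedding weights, hence the two variables asserted below.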
+ self.assertEqual( + 2, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + expected_var_names = [ + "dense_features/sparse_feature_embedding/embedding_weights:0", + "dense_features_1/sparse_feature_embedding/embedding_weights:0", + ] + self.assertItemsEqual( + expected_var_names, + [ + v.name + for v in tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ], + ) + + def test_multiple_layers_with_same_shared_embedding_column(self): + categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=3 + ) + ) + categorical_column_b = ( + tf.feature_column.categorical_column_with_identity( + key="bbb", num_buckets=3 + ) + ) + embedding_dimension = 2 + + # feature_column.shared_embeddings is not supported in eager. + with tf.Graph().as_default(): + ( + embedding_column_b, + embedding_column_a, + ) = tf.feature_column.shared_embeddings( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension, + ) + features = { + "aaa": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2), + ), + "bbb": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2), + ), + } + all_cols = [embedding_column_a, embedding_column_b] + df.DenseFeatures(all_cols)(features) + df.DenseFeatures(all_cols)(features) + # Make sure that only 1 variable gets created in this case. + self.assertEqual( + 1, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + self.assertItemsEqual( + ["aaa_bbb_shared_embedding:0"], + [ + v.name + for v in tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ], + ) + + def test_multiple_layers_with_same_shared_embedding_column_diff_graphs( + self, + ): + categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=3 + ) + ) + categorical_column_b = ( + tf.feature_column.categorical_column_with_identity( + key="bbb", num_buckets=3 + ) + ) + embedding_dimension = 2 + + # feature_column.shared_embeddings is not supported in eager. + with tf.Graph().as_default(): + ( + embedding_column_b, + embedding_column_a, + ) = tf.feature_column.shared_embeddings( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension, + ) + all_cols = [embedding_column_a, embedding_column_b] + features = { + "aaa": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2), + ), + "bbb": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2), + ), + } + df.DenseFeatures(all_cols)(features) + # Make sure that only 1 variable gets created in this case. + self.assertEqual( + 1, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + + with tf.Graph().as_default(): + features1 = { + "aaa": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(0, 1, 0), + dense_shape=(2, 2), + ), + "bbb": tf.SparseTensor( + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 1), + dense_shape=(2, 2), + ), + } + + df.DenseFeatures(all_cols)(features1) + # Make sure that only 1 variable gets created in this case. 
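+ # The shared embedding columns still resolve to a single variable in the fresh graph.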
+ self.assertEqual( + 1, + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ), + ) + self.assertItemsEqual( + ["aaa_bbb_shared_embedding:0"], + [ + v.name + for v in tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + ], + ) + + def test_with_1d_sparse_tensor(self): + embedding_values = ( + (1.0, 2.0, 3.0, 4.0, 5.0), # id 0 + (6.0, 7.0, 8.0, 9.0, 10.0), # id 1 + (11.0, 12.0, 13.0, 14.0, 15.0), # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + del shape, dtype, partition_info + return embedding_values + + # price has 1 dimension in dense_features + price = tf.feature_column.numeric_column("price") + + # one_hot_body_style has 3 dims in dense_features. + body_style = tf.feature_column.categorical_column_with_vocabulary_list( + "body-style", vocabulary_list=["hardtop", "wagon", "sedan"] + ) + one_hot_body_style = tf.feature_column.indicator_column(body_style) + + # embedded_body_style has 5 dims in dense_features. + country = tf.feature_column.categorical_column_with_vocabulary_list( + "country", vocabulary_list=["US", "JP", "CA"] + ) + embedded_country = tf.feature_column.embedding_column( + country, dimension=5, initializer=_initializer + ) + + with tf.Graph().as_default(): + # Provides 1-dim tensor and dense tensor. + features = { + "price": tf.constant( + [ + 11.0, + 12.0, + ] + ), + "body-style": tf.SparseTensor( + indices=((0,), (1,)), + values=("sedan", "hardtop"), + dense_shape=(2,), + ), + # This is dense tensor for the categorical_column. + "country": tf.constant(["CA", "US"]), + } + self.assertEqual(1, features["price"].shape.ndims) + self.assertEqual( + 1, features["body-style"].dense_shape.get_shape()[0] + ) + self.assertEqual(1, features["country"].shape.ndims) + + net = df.DenseFeatures( + [price, one_hot_body_style, embedded_country] + )(features) + self.assertEqual(1 + 3 + 5, net.shape[1]) + with _initialized_session() as sess: + + # Each row is formed by concatenating `embedded_body_style`, + # `one_hot_body_style`, and `price` in order. + self.assertAllEqual( + [ + [0.0, 0.0, 1.0, 11.0, 12.0, 13.0, 14.0, 15.0, 11.0], + [1.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 12.0], + ], + sess.run(net), + ) + + def test_with_1d_unknown_shape_sparse_tensor(self): + embedding_values = ( + (1.0, 2.0), # id 0 + (6.0, 7.0), # id 1 + (11.0, 12.0), # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + del shape, dtype, partition_info + return embedding_values + + # price has 1 dimension in dense_features + price = tf.feature_column.numeric_column("price") + + # one_hot_body_style has 3 dims in dense_features. + body_style = tf.feature_column.categorical_column_with_vocabulary_list( + "body-style", vocabulary_list=["hardtop", "wagon", "sedan"] + ) + one_hot_body_style = tf.feature_column.indicator_column(body_style) + + # embedded_body_style has 5 dims in dense_features. + country = tf.feature_column.categorical_column_with_vocabulary_list( + "country", vocabulary_list=["US", "JP", "CA"] + ) + embedded_country = tf.feature_column.embedding_column( + country, dimension=2, initializer=_initializer + ) + + # Provides 1-dim tensor and dense tensor. + with tf.Graph().as_default(): + features = { + "price": tf.compat.v1.placeholder(tf.float32), + "body-style": tf.compat.v1.sparse_placeholder(tf.string), + # This is dense tensor for the categorical_column. 
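+ # (A dense string tensor is accepted here in place of a SparseTensor.)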
+ "country": tf.compat.v1.placeholder(tf.string), + } + self.assertIsNone(features["price"].shape.ndims) + self.assertIsNone(features["body-style"].get_shape().ndims) + self.assertIsNone(features["country"].shape.ndims) + + price_data = np.array([11.0, 12.0]) + body_style_data = tf.compat.v1.SparseTensorValue( + indices=((0,), (1,)), + values=("sedan", "hardtop"), + dense_shape=(2,), + ) + country_data = np.array([["US"], ["CA"]]) + + net = df.DenseFeatures( + [price, one_hot_body_style, embedded_country] + )(features) + self.assertEqual(1 + 3 + 2, net.shape[1]) + with _initialized_session() as sess: + + # Each row is formed by concatenating `embedded_body_style`, + # `one_hot_body_style`, and `price` in order. + self.assertAllEqual( + [ + [0.0, 0.0, 1.0, 1.0, 2.0, 11.0], + [1.0, 0.0, 0.0, 11.0, 12.0, 12.0], + ], + sess.run( + net, + feed_dict={ + features["price"]: price_data, + features["body-style"]: body_style_data, + features["country"]: country_data, + }, + ), + ) + + def test_with_rank_0_feature(self): + # price has 1 dimension in dense_features + price = tf.feature_column.numeric_column("price") + features = { + "price": tf.constant(0), + } + self.assertEqual(0, features["price"].shape.ndims) + + # Static rank 0 should fail + with self.assertRaisesRegex( + ValueError, "Feature .* cannot have rank 0" + ): + df.DenseFeatures([price])(features) + + with tf.Graph().as_default(): + # Dynamic rank 0 should fail + features = { + "price": tf.compat.v1.placeholder(tf.float32), + } + net = df.DenseFeatures([price])(features) + self.assertEqual(1, net.shape[1]) + with _initialized_session() as sess: + with self.assertRaisesOpError("Feature .* cannot have rank 0"): + sess.run(net, feed_dict={features["price"]: np.array(1)}) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/feature_column/sequence_feature_column.py b/keras/feature_column/sequence_feature_column.py index 2d6bf69ef58e..89e4f5cfdb76 100644 --- a/keras/feature_column/sequence_feature_column.py +++ b/keras/feature_column/sequence_feature_column.py @@ -22,16 +22,17 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras import backend from keras.feature_column import base_feature_layer as kfc -from tensorflow.python.util.tf_export import keras_export -# pylint: disable=protected-access +# isort: off +from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.experimental.SequenceFeatures') +@keras_export("keras.experimental.SequenceFeatures") class SequenceFeatures(kfc._BaseFeaturesLayer): - """A layer for sequence input. + """A layer for sequence input. All `feature_columns` must be sequence dense columns with the same `sequence_length`. The output of this method can be fed into sequence @@ -76,104 +77,119 @@ class SequenceFeatures(kfc._BaseFeaturesLayer): rnn_layer = tf.keras.layers.RNN(rnn_cell) outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask) ``` - """ - - def __init__( - self, - feature_columns, - trainable=True, - name=None, - **kwargs): - """"Constructs a SequenceFeatures layer. - - Args: - feature_columns: An iterable of dense sequence columns. Valid columns are - - `embedding_column` that wraps a `sequence_categorical_column_with_*` - - `sequence_numeric_column`. - trainable: Boolean, whether the layer's variables will be updated via - gradient descent during training. - name: Name to give to the SequenceFeatures. - **kwargs: Keyword arguments to construct a layer. 
- - Raises: - ValueError: If any of the `feature_columns` is not a - `SequenceDenseColumn`. """ - super().__init__( - feature_columns=feature_columns, - trainable=trainable, - name=name, - expected_column_type=tf.__internal__.feature_column.SequenceDenseColumn, - **kwargs) - - @property - def _is_feature_layer(self): - return True - - def _target_shape(self, input_shape, total_elements): - return (input_shape[0], input_shape[1], total_elements) - - def call(self, features, training=None): - """Returns sequence input corresponding to the `feature_columns`. - - Args: - features: A dict mapping keys to tensors. - training: Python boolean or None, indicating whether to the layer is being - run in training mode. This argument is passed to the call method of any - `FeatureColumn` that takes a `training` argument. For example, if a - `FeatureColumn` performed dropout, the column could expose a `training` - argument to control whether the dropout should be applied. If `None`, - defaults to `tf.keras.backend.learning_phase()`. - - - Returns: - An `(input_layer, sequence_length)` tuple where: - - input_layer: A float `Tensor` of shape `[batch_size, T, D]`. - `T` is the maximum sequence length for this batch, which could differ - from batch to batch. `D` is the sum of `num_elements` for all - `feature_columns`. - - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence - length for each example. - - Raises: - ValueError: If features are not a dictionary. - """ - if not isinstance(features, dict): - raise ValueError('We expected a dictionary here. Instead we got: ', - features) - if training is None: - training = backend.learning_phase() - transformation_cache = tf.__internal__.feature_column.FeatureTransformationCache(features) - output_tensors = [] - sequence_lengths = [] - - for column in self._feature_columns: - with backend.name_scope(column.name): - try: - dense_tensor, sequence_length = column.get_sequence_dense_tensor( - transformation_cache, self._state_manager, training=training) - except TypeError: - dense_tensor, sequence_length = column.get_sequence_dense_tensor( - transformation_cache, self._state_manager) - # Flattens the final dimension to produce a 3D Tensor. - output_tensors.append(self._process_dense_tensor(column, dense_tensor)) - sequence_lengths.append(sequence_length) - - # Check and process sequence lengths. - kfc._verify_static_batch_size_equality( # pylint: disable=protected-access - sequence_lengths, self._feature_columns) - sequence_length = _assert_all_equal_and_return(sequence_lengths) - - return self._verify_and_concat_tensors(output_tensors), sequence_length + + def __init__(self, feature_columns, trainable=True, name=None, **kwargs): + """Constructs a SequenceFeatures layer. + + Args: + feature_columns: An iterable of dense sequence columns. Valid columns + are + - `embedding_column` that wraps a + `sequence_categorical_column_with_*` + - `sequence_numeric_column`. + trainable: Boolean, whether the layer's variables will be updated via + gradient descent during training. + name: Name to give to the SequenceFeatures. + **kwargs: Keyword arguments to construct a layer. + + Raises: + ValueError: If any of the `feature_columns` is not a + `SequenceDenseColumn`.
+ """ + super().__init__( + feature_columns=feature_columns, + trainable=trainable, + name=name, + expected_column_type=tf.__internal__.feature_column.SequenceDenseColumn, # noqa: E501 + **kwargs + ) + + @property + def _is_feature_layer(self): + return True + + def _target_shape(self, input_shape, total_elements): + return (input_shape[0], input_shape[1], total_elements) + + def call(self, features, training=None): + """Returns sequence input corresponding to the `feature_columns`. + + Args: + features: A dict mapping keys to tensors. + training: Python boolean or None, indicating whether the layer is + being run in training mode. This argument is passed to the call + method of any `FeatureColumn` that takes a `training` argument. For + example, if a `FeatureColumn` performed dropout, the column could + expose a `training` argument to control whether the dropout should + be applied. If `None`, falls back to + `tf.keras.backend.learning_phase()`. Defaults to `None`. + + + Returns: + An `(input_layer, sequence_length)` tuple where: + - input_layer: A float `Tensor` of shape `[batch_size, T, D]`. + `T` is the maximum sequence length for this batch, which could + differ from batch to batch. `D` is the sum of `num_elements` for + all `feature_columns`. + - sequence_length: An int `Tensor` of shape `[batch_size]`. The + sequence length for each example. + + Raises: + ValueError: If features are not a dictionary. + """ + if not isinstance(features, dict): + raise ValueError( + "We expected a dictionary here. Instead we got: ", features + ) + if training is None: + training = backend.learning_phase() + transformation_cache = ( + tf.__internal__.feature_column.FeatureTransformationCache(features) + ) + output_tensors = [] + sequence_lengths = [] + + for column in self._feature_columns: + with backend.name_scope(column.name): + try: + ( + dense_tensor, + sequence_length, + ) = column.get_sequence_dense_tensor( + transformation_cache, + self._state_manager, + training=training, + ) + except TypeError: + ( + dense_tensor, + sequence_length, + ) = column.get_sequence_dense_tensor( + transformation_cache, self._state_manager + ) + # Flattens the final dimension to produce a 3D Tensor. + output_tensors.append( + self._process_dense_tensor(column, dense_tensor) + ) + sequence_lengths.append(sequence_length) + + # Check and process sequence lengths.
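+ # Every column must report the same per-example sequence length; _assert_all_equal_and_return enforces this at run time.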
+ kfc._verify_static_batch_size_equality( + sequence_lengths, self._feature_columns + ) + sequence_length = _assert_all_equal_and_return(sequence_lengths) + + return self._verify_and_concat_tensors(output_tensors), sequence_length def _assert_all_equal_and_return(tensors, name=None): - """Asserts that all tensors are equal and returns the first one.""" - with backend.name_scope(name or 'assert_all_equal'): - if len(tensors) == 1: - return tensors[0] - assert_equal_ops = [] - for t in tensors[1:]: - assert_equal_ops.append(tf.compat.v1.assert_equal(tensors[0], t)) - with tf.control_dependencies(assert_equal_ops): - return tf.identity(tensors[0]) + """Asserts that all tensors are equal and returns the first one.""" + with backend.name_scope(name or "assert_all_equal"): + if len(tensors) == 1: + return tensors[0] + assert_equal_ops = [] + for t in tensors[1:]: + assert_equal_ops.append(tf.compat.v1.assert_equal(tensors[0], t)) + with tf.control_dependencies(assert_equal_ops): + return tf.identity(tensors[0]) diff --git a/keras/feature_column/sequence_feature_column_integration_test.py b/keras/feature_column/sequence_feature_column_integration_test.py index e0a19df1ccf0..b76c04d1facc 100644 --- a/keras/feature_column/sequence_feature_column_integration_test.py +++ b/keras/feature_column/sequence_feature_column_integration_test.py @@ -20,12 +20,6 @@ import tensorflow.compat.v2 as tf - -from google.protobuf import text_format - -from tensorflow.core.example import example_pb2 -from tensorflow.core.example import feature_pb2 -from tensorflow.python.framework import test_util as tf_test_utils from keras import backend from keras.feature_column import dense_features from keras.feature_column import sequence_feature_column as ksfc @@ -33,115 +27,147 @@ from keras.layers.rnn import base_rnn from keras.layers.rnn import simple_rnn +# isort: off +from google.protobuf import text_format +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + class SequenceFeatureColumnIntegrationTest(tf.test.TestCase): + def _make_sequence_example(self): + example = example_pb2.SequenceExample() + example.context.feature["int_ctx"].int64_list.value.extend([5]) + example.context.feature["float_ctx"].float_list.value.extend([123.6]) + for val in range(0, 10, 2): + feat = feature_pb2.Feature() + feat.int64_list.value.extend([val] * val) + example.feature_lists.feature_list["int_list"].feature.extend( + [feat] + ) + for val in range(1, 11, 2): + feat = feature_pb2.Feature() + feat.bytes_list.value.extend([tf.compat.as_bytes(str(val))] * val) + example.feature_lists.feature_list["str_list"].feature.extend( + [feat] + ) - def _make_sequence_example(self): - example = example_pb2.SequenceExample() - example.context.feature['int_ctx'].int64_list.value.extend([5]) - example.context.feature['float_ctx'].float_list.value.extend([123.6]) - for val in range(0, 10, 2): - feat = feature_pb2.Feature() - feat.int64_list.value.extend([val] * val) - example.feature_lists.feature_list['int_list'].feature.extend([feat]) - for val in range(1, 11, 2): - feat = feature_pb2.Feature() - feat.bytes_list.value.extend([tf.compat.as_bytes(str(val))] * val) - example.feature_lists.feature_list['str_list'].feature.extend([feat]) - - return example + return example - def _build_feature_columns(self): - col = tf.feature_column.categorical_column_with_identity('int_ctx', num_buckets=100) - ctx_cols = [ - 
tf.feature_column.embedding_column(col, dimension=10), - tf.feature_column.numeric_column('float_ctx') - ] + def _build_feature_columns(self): + col = tf.feature_column.categorical_column_with_identity( + "int_ctx", num_buckets=100 + ) + ctx_cols = [ + tf.feature_column.embedding_column(col, dimension=10), + tf.feature_column.numeric_column("float_ctx"), + ] - identity_col = tf.feature_column.sequence_categorical_column_with_identity( - 'int_list', num_buckets=10) - bucket_col = tf.feature_column.sequence_categorical_column_with_hash_bucket( - 'bytes_list', hash_bucket_size=100) - seq_cols = [ - tf.feature_column.embedding_column(identity_col, dimension=10), - tf.feature_column.embedding_column(bucket_col, dimension=20) - ] + identity_col = ( + tf.feature_column.sequence_categorical_column_with_identity( + "int_list", num_buckets=10 + ) + ) + bucket_col = ( + tf.feature_column.sequence_categorical_column_with_hash_bucket( + "bytes_list", hash_bucket_size=100 + ) + ) + seq_cols = [ + tf.feature_column.embedding_column(identity_col, dimension=10), + tf.feature_column.embedding_column(bucket_col, dimension=20), + ] - return ctx_cols, seq_cols + return ctx_cols, seq_cols - def test_sequence_example_into_input_layer(self): - examples = [_make_sequence_example().SerializeToString()] * 100 - ctx_cols, seq_cols = self._build_feature_columns() + def test_sequence_example_into_input_layer(self): + examples = [_make_sequence_example().SerializeToString()] * 100 + ctx_cols, seq_cols = self._build_feature_columns() - def _parse_example(example): - ctx, seq = tf.io.parse_single_sequence_example( - example, - context_features=tf.feature_column.make_parse_example_spec(ctx_cols), - sequence_features=tf.feature_column.make_parse_example_spec(seq_cols)) - ctx.update(seq) - return ctx + def _parse_example(example): + ctx, seq = tf.io.parse_single_sequence_example( + example, + context_features=tf.feature_column.make_parse_example_spec( + ctx_cols + ), + sequence_features=tf.feature_column.make_parse_example_spec( + seq_cols + ), + ) + ctx.update(seq) + return ctx - ds = tf.data.Dataset.from_tensor_slices(examples) - ds = ds.map(_parse_example) - ds = ds.batch(20) + ds = tf.data.Dataset.from_tensor_slices(examples) + ds = ds.map(_parse_example) + ds = ds.batch(20) - # Test on a single batch - features = tf.compat.v1.data.make_one_shot_iterator(ds).get_next() + # Test on a single batch + features = tf.compat.v1.data.make_one_shot_iterator(ds).get_next() - # Tile the context features across the sequence features - sequence_input_layer = ksfc.SequenceFeatures(seq_cols) - seq_input, _ = sequence_input_layer(features) - dense_input_layer = dense_features.DenseFeatures(ctx_cols) - ctx_input = dense_input_layer(features) - ctx_input = backend.repeat(ctx_input, tf.shape(seq_input)[1]) - concatenated_input = merging.concatenate([seq_input, ctx_input]) + # Tile the context features across the sequence features + sequence_input_layer = ksfc.SequenceFeatures(seq_cols) + seq_input, _ = sequence_input_layer(features) + dense_input_layer = dense_features.DenseFeatures(ctx_cols) + ctx_input = dense_input_layer(features) + ctx_input = backend.repeat(ctx_input, tf.shape(seq_input)[1]) + concatenated_input = merging.concatenate([seq_input, ctx_input]) - rnn_layer = base_rnn.RNN(simple_rnn.SimpleRNNCell(10)) - output = rnn_layer(concatenated_input) + rnn_layer = base_rnn.RNN(simple_rnn.SimpleRNNCell(10)) + output = rnn_layer(concatenated_input) - with self.cached_session() as sess: - 
sess.run(tf.compat.v1.global_variables_initializer()) - features_r = sess.run(features) - self.assertAllEqual(features_r['int_list'].dense_shape, [20, 3, 6]) + with self.cached_session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + features_r = sess.run(features) + self.assertAllEqual(features_r["int_list"].dense_shape, [20, 3, 6]) - output_r = sess.run(output) - self.assertAllEqual(output_r.shape, [20, 10]) + output_r = sess.run(output) + self.assertAllEqual(output_r.shape, [20, 10]) - @tf_test_utils.run_deprecated_v1 - def test_shared_sequence_non_sequence_into_input_layer(self): - non_seq = tf.feature_column.categorical_column_with_identity('non_seq', - num_buckets=10) - seq = tf.feature_column.sequence_categorical_column_with_identity('seq', - num_buckets=10) - shared_non_seq, shared_seq = tf.feature_column.shared_embeddings( - [non_seq, seq], - dimension=4, - combiner='sum', - initializer=tf.ones_initializer(), - shared_embedding_collection_name='shared') + @tf_test_utils.run_deprecated_v1 + def test_shared_sequence_non_sequence_into_input_layer(self): + non_seq = tf.feature_column.categorical_column_with_identity( + "non_seq", num_buckets=10 + ) + seq = tf.feature_column.sequence_categorical_column_with_identity( + "seq", num_buckets=10 + ) + shared_non_seq, shared_seq = tf.feature_column.shared_embeddings( + [non_seq, seq], + dimension=4, + combiner="sum", + initializer=tf.ones_initializer(), + shared_embedding_collection_name="shared", + ) - seq = tf.SparseTensor( - indices=[[0, 0], [0, 1], [1, 0]], - values=[0, 1, 2], - dense_shape=[2, 2]) - non_seq = tf.SparseTensor( - indices=[[0, 0], [0, 1], [1, 0]], - values=[0, 1, 2], - dense_shape=[2, 2]) - features = {'seq': seq, 'non_seq': non_seq} + seq = tf.SparseTensor( + indices=[[0, 0], [0, 1], [1, 0]], + values=[0, 1, 2], + dense_shape=[2, 2], + ) + non_seq = tf.SparseTensor( + indices=[[0, 0], [0, 1], [1, 0]], + values=[0, 1, 2], + dense_shape=[2, 2], + ) + features = {"seq": seq, "non_seq": non_seq} - # Tile the context features across the sequence features - seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features) - non_seq_input = dense_features.DenseFeatures([shared_non_seq])(features) + # Tile the context features across the sequence features + seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features) + non_seq_input = dense_features.DenseFeatures([shared_non_seq])(features) - with self.cached_session() as sess: - sess.run(tf.compat.v1.global_variables_initializer()) - output_seq, output_seq_length, output_non_seq = sess.run( - [seq_input, seq_length, non_seq_input]) - self.assertAllEqual(output_seq, [[[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [0, 0, 0, 0]]]) - self.assertAllEqual(output_seq_length, [2, 1]) - self.assertAllEqual(output_non_seq, [[2, 2, 2, 2], [1, 1, 1, 1]]) + with self.cached_session() as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + output_seq, output_seq_length, output_non_seq = sess.run( + [seq_input, seq_length, non_seq_input] + ) + self.assertAllEqual( + output_seq, + [[[1, 1, 1, 1], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0]]], + ) + self.assertAllEqual(output_seq_length, [2, 1]) + self.assertAllEqual(output_non_seq, [[2, 2, 2, 2], [1, 1, 1, 1]]) _SEQ_EX_PROTO = """ @@ -248,9 +274,9 @@ def test_shared_sequence_non_sequence_into_input_layer(self): def _make_sequence_example(): - example = example_pb2.SequenceExample() - return text_format.Parse(_SEQ_EX_PROTO, example) + example = example_pb2.SequenceExample() + return 
text_format.Parse(_SEQ_EX_PROTO, example) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/feature_column/sequence_feature_column_test.py b/keras/feature_column/sequence_feature_column_test.py index 26a6d0895ad4..3e5b9ef1878d 100644 --- a/keras/feature_column/sequence_feature_column_test.py +++ b/keras/feature_column/sequence_feature_column_test.py @@ -18,650 +18,971 @@ from __future__ import division from __future__ import print_function +import numpy as np import tensorflow.compat.v2 as tf - - from absl.testing import parameterized -import numpy as np import keras -from keras.testing_infra import test_combinations from keras.feature_column import sequence_feature_column as ksfc -from keras.saving import model_config +from keras.saving.legacy import model_config +from keras.testing_infra import test_combinations def _initialized_session(config=None): - sess = tf.compat.v1.Session(config=config) - sess.run(tf.compat.v1.global_variables_initializer()) - sess.run(tf.compat.v1.tables_initializer()) - return sess + sess = tf.compat.v1.Session(config=config) + sess.run(tf.compat.v1.global_variables_initializer()) + sess.run(tf.compat.v1.tables_initializer()) + return sess -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class SequenceFeaturesTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - {'testcase_name': '2D', - 'sparse_input_args_a': { - # example 0, ids [2] - # example 1, ids [0, 1] - 'indices': ((0, 0), (1, 0), (1, 1)), - 'values': (2, 0, 1), - 'dense_shape': (2, 2)}, - 'sparse_input_args_b': { - # example 0, ids [1] - # example 1, ids [2, 0] - 'indices': ((0, 0), (1, 0), (1, 1)), - 'values': (1, 2, 0), - 'dense_shape': (2, 2)}, - 'expected_input_layer': [ - # example 0, ids_a [2], ids_b [1] - [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]], - # example 1, ids_a [0, 1], ids_b [2, 0] - [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],], - 'expected_sequence_length': [1, 2]}, - {'testcase_name': '3D', - 'sparse_input_args_a': { - # feature 0, ids [[2], [0, 1]] - # feature 1, ids [[0, 0], [1]] - 'indices': ( - (0, 0, 0), (0, 1, 0), (0, 1, 1), - (1, 0, 0), (1, 0, 1), (1, 1, 0)), - 'values': (2, 0, 1, 0, 0, 1), - 'dense_shape': (2, 2, 2)}, - 'sparse_input_args_b': { - # feature 0, ids [[1, 1], [1]] - # feature 1, ids [[2], [0]] - 'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)), - 'values': (1, 1, 1, 2, 0), - 'dense_shape': (2, 2, 2)}, - 'expected_input_layer': [ - # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -] - [[5., 6., 14., 15., 16.], [2., 3., 14., 15., 16.]], - # feature 1, [a: 0, 0, b: 2, -], [a: 1, -, b: 0, -] - [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]], - 'expected_sequence_length': [2, 2]}, - ) - def test_embedding_column( - self, sparse_input_args_a, sparse_input_args_b, expected_input_layer, - expected_sequence_length): - - sparse_input_a = tf.compat.v1.SparseTensorValue(**sparse_input_args_a) - sparse_input_b = tf.compat.v1.SparseTensorValue(**sparse_input_args_b) - vocabulary_size = 3 - embedding_dimension_a = 2 - embedding_values_a = ( - (1., 2.), # id 0 - (3., 4.), # id 1 - (5., 6.) 
# id 2 + @parameterized.named_parameters( + { + "testcase_name": "2D", + "sparse_input_args_a": { + # example 0, ids [2] + # example 1, ids [0, 1] + "indices": ((0, 0), (1, 0), (1, 1)), + "values": (2, 0, 1), + "dense_shape": (2, 2), + }, + "sparse_input_args_b": { + # example 0, ids [1] + # example 1, ids [2, 0] + "indices": ((0, 0), (1, 0), (1, 1)), + "values": (1, 2, 0), + "dense_shape": (2, 2), + }, + "expected_input_layer": [ + # example 0, ids_a [2], ids_b [1] + [[5.0, 6.0, 14.0, 15.0, 16.0], [0.0, 0.0, 0.0, 0.0, 0.0]], + # example 1, ids_a [0, 1], ids_b [2, 0] + [[1.0, 2.0, 17.0, 18.0, 19.0], [3.0, 4.0, 11.0, 12.0, 13.0]], + ], + "expected_sequence_length": [1, 2], + }, + { + "testcase_name": "3D", + "sparse_input_args_a": { + # feature 0, ids [[2], [0, 1]] + # feature 1, ids [[0, 0], [1]] + "indices": ( + (0, 0, 0), + (0, 1, 0), + (0, 1, 1), + (1, 0, 0), + (1, 0, 1), + (1, 1, 0), + ), + "values": (2, 0, 1, 0, 0, 1), + "dense_shape": (2, 2, 2), + }, + "sparse_input_args_b": { + # feature 0, ids [[1, 1], [1]] + # feature 1, ids [[2], [0]] + "indices": ( + (0, 0, 0), + (0, 0, 1), + (0, 1, 0), + (1, 0, 0), + (1, 1, 0), + ), + "values": (1, 1, 1, 2, 0), + "dense_shape": (2, 2, 2), + }, + "expected_input_layer": [ + # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -] + [[5.0, 6.0, 14.0, 15.0, 16.0], [2.0, 3.0, 14.0, 15.0, 16.0]], + # feature 1, [a: 0, 0, b: 2, -], [a: 1, -, b: 0, -] + [[1.0, 2.0, 17.0, 18.0, 19.0], [3.0, 4.0, 11.0, 12.0, 13.0]], + ], + "expected_sequence_length": [2, 2], + }, ) - embedding_dimension_b = 3 - embedding_values_b = ( - (11., 12., 13.), # id 0 - (14., 15., 16.), # id 1 - (17., 18., 19.) # id 2 + def test_embedding_column( + self, + sparse_input_args_a, + sparse_input_args_b, + expected_input_layer, + expected_sequence_length, + ): + + sparse_input_a = tf.compat.v1.SparseTensorValue(**sparse_input_args_a) + sparse_input_b = tf.compat.v1.SparseTensorValue(**sparse_input_args_b) + vocabulary_size = 3 + embedding_dimension_a = 2 + embedding_values_a = ( + (1.0, 2.0), # id 0 + (3.0, 4.0), # id 1 + (5.0, 6.0), # id 2 + ) + embedding_dimension_b = 3 + embedding_values_b = ( + (11.0, 12.0, 13.0), # id 0 + (14.0, 15.0, 16.0), # id 1 + (17.0, 18.0, 19.0), # id 2 + ) + + def _get_initializer(embedding_dimension, embedding_values): + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual( + (vocabulary_size, embedding_dimension), shape + ) + self.assertEqual(tf.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + return _initializer + + categorical_column_a = ( + tf.feature_column.sequence_categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + ) + embedding_column_a = tf.feature_column.embedding_column( + categorical_column_a, + dimension=embedding_dimension_a, + initializer=_get_initializer( + embedding_dimension_a, embedding_values_a + ), + ) + categorical_column_b = ( + tf.feature_column.sequence_categorical_column_with_identity( + key="bbb", num_buckets=vocabulary_size + ) + ) + embedding_column_b = tf.feature_column.embedding_column( + categorical_column_b, + dimension=embedding_dimension_b, + initializer=_get_initializer( + embedding_dimension_b, embedding_values_b + ), + ) + + # Test that columns are reordered alphabetically. 
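+ # The columns are passed as [b, a]; the assertions below expect the "aaa" features to come first in the output.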
+ sequence_input_layer = ksfc.SequenceFeatures( + [embedding_column_b, embedding_column_a] + ) + input_layer, sequence_length = sequence_input_layer( + { + "aaa": sparse_input_a, + "bbb": sparse_input_b, + } + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + weights = sequence_input_layer.weights + self.assertCountEqual( + ( + "sequence_features/aaa_embedding/embedding_weights:0", + "sequence_features/bbb_embedding/embedding_weights:0", + ), + tuple([v.name for v in weights]), + ) + self.assertAllEqual(embedding_values_a, self.evaluate(weights[0])) + self.assertAllEqual(embedding_values_b, self.evaluate(weights[1])) + self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) + self.assertAllEqual( + expected_sequence_length, self.evaluate(sequence_length) + ) + + def test_embedding_column_with_non_sequence_categorical(self): + """Tests that error is raised for non-sequence embedding column.""" + vocabulary_size = 3 + sparse_input = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2), + ) + + categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + ) + embedding_column_a = tf.feature_column.embedding_column( + categorical_column_a, dimension=2 + ) + sequence_input_layer = ksfc.SequenceFeatures([embedding_column_a]) + with self.assertRaisesRegex( + ValueError, + r"In embedding_column: aaa_embedding\. categorical_column must be " + r"of type SequenceCategoricalColumn to use SequenceFeatures\.", + ): + _, _ = sequence_input_layer({"aaa": sparse_input}) + + def test_shared_embedding_column(self): + with tf.Graph().as_default(): + vocabulary_size = 3 + sparse_input_a = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2), + ) + sparse_input_b = tf.compat.v1.SparseTensorValue( + # example 0, ids [1] + # example 1, ids [2, 0] + indices=((0, 0), (1, 0), (1, 1)), + values=(1, 2, 0), + dense_shape=(2, 2), + ) + + embedding_dimension = 2 + embedding_values = ( + (1.0, 2.0), # id 0 + (3.0, 4.0), # id 1 + (5.0, 6.0), # id 2 + ) + + def _get_initializer(embedding_dimension, embedding_values): + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual( + (vocabulary_size, embedding_dimension), shape + ) + self.assertEqual(tf.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + return _initializer + + expected_input_layer = [ + # example 0, ids_a [2], ids_b [1] + [[5.0, 6.0, 3.0, 4.0], [0.0, 0.0, 0.0, 0.0]], + # example 1, ids_a [0, 1], ids_b [2, 0] + [[1.0, 2.0, 5.0, 6.0], [3.0, 4.0, 1.0, 2.0]], + ] + expected_sequence_length = [1, 2] + + categorical_column_a = ( + tf.feature_column.sequence_categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + ) + categorical_column_b = ( + tf.feature_column.sequence_categorical_column_with_identity( + key="bbb", num_buckets=vocabulary_size + ) + ) + # Test that columns are reordered alphabetically. 
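+            # shared_embeddings() yields one column per categorical column,
+            # but they all read from a single shared weight variable
+            # (asserted below as "aaa_bbb_shared_embedding:0").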
+ shared_embedding_columns = tf.feature_column.shared_embeddings( + [categorical_column_b, categorical_column_a], + dimension=embedding_dimension, + initializer=_get_initializer( + embedding_dimension, embedding_values + ), + ) + + sequence_input_layer = ksfc.SequenceFeatures( + shared_embedding_columns + ) + input_layer, sequence_length = sequence_input_layer( + {"aaa": sparse_input_a, "bbb": sparse_input_b} + ) + + global_vars = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + self.assertCountEqual( + ("aaa_bbb_shared_embedding:0",), + tuple([v.name for v in global_vars]), + ) + with _initialized_session() as sess: + self.assertAllEqual( + embedding_values, global_vars[0].eval(session=sess) + ) + self.assertAllEqual( + expected_input_layer, input_layer.eval(session=sess) + ) + self.assertAllEqual( + expected_sequence_length, sequence_length.eval(session=sess) + ) + + def test_shared_embedding_column_with_non_sequence_categorical(self): + """Tests that error is raised for non-sequence shared embedding + column.""" + with tf.Graph().as_default(): + vocabulary_size = 3 + sparse_input_a = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2), + ) + sparse_input_b = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2), + ) + + categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + ) + categorical_column_b = ( + tf.feature_column.categorical_column_with_identity( + key="bbb", num_buckets=vocabulary_size + ) + ) + shared_embedding_columns = tf.feature_column.shared_embeddings( + [categorical_column_a, categorical_column_b], dimension=2 + ) + + sequence_input_layer = ksfc.SequenceFeatures( + shared_embedding_columns + ) + with self.assertRaisesRegex( + ValueError, + r"In embedding_column: aaa_shared_embedding\. 
" + r"categorical_column must " + r"be of type SequenceCategoricalColumn to use " + r"SequenceFeatures\.", + ): + _, _ = sequence_input_layer( + {"aaa": sparse_input_a, "bbb": sparse_input_b} + ) + + @parameterized.named_parameters( + { + "testcase_name": "2D", + "sparse_input_args_a": { + # example 0, ids [2] + # example 1, ids [0, 1] + "indices": ((0, 0), (1, 0), (1, 1)), + "values": (2, 0, 1), + "dense_shape": (2, 2), + }, + "sparse_input_args_b": { + # example 0, ids [1] + # example 1, ids [1, 0] + "indices": ((0, 0), (1, 0), (1, 1)), + "values": (1, 1, 0), + "dense_shape": (2, 2), + }, + "expected_input_layer": [ + # example 0, ids_a [2], ids_b [1] + [[0.0, 0.0, 1.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0]], + # example 1, ids_a [0, 1], ids_b [1, 0] + [[1.0, 0.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 1.0, 0.0]], + ], + "expected_sequence_length": [1, 2], + }, + { + "testcase_name": "3D", + "sparse_input_args_a": { + # feature 0, ids [[2], [0, 1]] + # feature 1, ids [[0, 0], [1]] + "indices": ( + (0, 0, 0), + (0, 1, 0), + (0, 1, 1), + (1, 0, 0), + (1, 0, 1), + (1, 1, 0), + ), + "values": (2, 0, 1, 0, 0, 1), + "dense_shape": (2, 2, 2), + }, + "sparse_input_args_b": { + # feature 0, ids [[1, 1], [1]] + # feature 1, ids [[1], [0]] + "indices": ( + (0, 0, 0), + (0, 0, 1), + (0, 1, 0), + (1, 0, 0), + (1, 1, 0), + ), + "values": (1, 1, 1, 1, 0), + "dense_shape": (2, 2, 2), + }, + "expected_input_layer": [ + # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -] + [[0.0, 0.0, 1.0, 0.0, 2.0], [1.0, 1.0, 0.0, 0.0, 1.0]], + # feature 1, [a: 0, 0, b: 1, -], [a: 1, -, b: 0, -] + [[2.0, 0.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 1.0, 0.0]], + ], + "expected_sequence_length": [2, 2], + }, ) - def _get_initializer(embedding_dimension, embedding_values): - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(tf.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - return _initializer - - categorical_column_a = tf.feature_column.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column_a = tf.feature_column.embedding_column( - categorical_column_a, - dimension=embedding_dimension_a, - initializer=_get_initializer(embedding_dimension_a, embedding_values_a)) - categorical_column_b = tf.feature_column.sequence_categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - embedding_column_b = tf.feature_column.embedding_column( - categorical_column_b, - dimension=embedding_dimension_b, - initializer=_get_initializer(embedding_dimension_b, embedding_values_b)) - - # Test that columns are reordered alphabetically. 
- sequence_input_layer = ksfc.SequenceFeatures( - [embedding_column_b, embedding_column_a]) - input_layer, sequence_length = sequence_input_layer({ - 'aaa': sparse_input_a, 'bbb': sparse_input_b,}) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - weights = sequence_input_layer.weights - self.assertCountEqual( - ('sequence_features/aaa_embedding/embedding_weights:0', - 'sequence_features/bbb_embedding/embedding_weights:0'), - tuple([v.name for v in weights])) - self.assertAllEqual(embedding_values_a, self.evaluate(weights[0])) - self.assertAllEqual(embedding_values_b, self.evaluate(weights[1])) - self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) - self.assertAllEqual( - expected_sequence_length, self.evaluate(sequence_length)) - - def test_embedding_column_with_non_sequence_categorical(self): - """Tests that error is raised for non-sequence embedding column.""" - vocabulary_size = 3 - sparse_input = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column_a = tf.feature_column.embedding_column( - categorical_column_a, dimension=2) - sequence_input_layer = ksfc.SequenceFeatures([embedding_column_a]) - with self.assertRaisesRegex( - ValueError, - r'In embedding_column: aaa_embedding\. categorical_column must be of ' - r'type SequenceCategoricalColumn to use SequenceFeatures\.'): - _, _ = sequence_input_layer({'aaa': sparse_input}) - - def test_shared_embedding_column(self): - with tf.Graph().as_default(): - vocabulary_size = 3 - sparse_input_a = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - sparse_input_b = tf.compat.v1.SparseTensorValue( - # example 0, ids [1] - # example 1, ids [2, 0] - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 0), - dense_shape=(2, 2)) - - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 4.), # id 1 - (5., 6.) # id 2 - ) - - def _get_initializer(embedding_dimension, embedding_values): - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(tf.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - return _initializer - - expected_input_layer = [ - # example 0, ids_a [2], ids_b [1] - [[5., 6., 3., 4.], [0., 0., 0., 0.]], - # example 1, ids_a [0, 1], ids_b [2, 0] - [[1., 2., 5., 6.], [3., 4., 1., 2.]], - ] - expected_sequence_length = [1, 2] - - categorical_column_a = tf.feature_column.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = tf.feature_column.sequence_categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - # Test that columns are reordered alphabetically. 
- shared_embedding_columns = tf.feature_column.shared_embeddings( - [categorical_column_b, categorical_column_a], - dimension=embedding_dimension, - initializer=_get_initializer(embedding_dimension, embedding_values)) - - sequence_input_layer = ksfc.SequenceFeatures(shared_embedding_columns) - input_layer, sequence_length = sequence_input_layer({ - 'aaa': sparse_input_a, 'bbb': sparse_input_b}) - - global_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual( - ('aaa_bbb_shared_embedding:0',), - tuple([v.name for v in global_vars])) - with _initialized_session() as sess: - self.assertAllEqual(embedding_values, - global_vars[0].eval(session=sess)) - self.assertAllEqual(expected_input_layer, - input_layer.eval(session=sess)) + def test_indicator_column( + self, + sparse_input_args_a, + sparse_input_args_b, + expected_input_layer, + expected_sequence_length, + ): + sparse_input_a = tf.compat.v1.SparseTensorValue(**sparse_input_args_a) + sparse_input_b = tf.compat.v1.SparseTensorValue(**sparse_input_args_b) + + vocabulary_size_a = 3 + vocabulary_size_b = 2 + + categorical_column_a = ( + tf.feature_column.sequence_categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size_a + ) + ) + indicator_column_a = tf.feature_column.indicator_column( + categorical_column_a + ) + categorical_column_b = ( + tf.feature_column.sequence_categorical_column_with_identity( + key="bbb", num_buckets=vocabulary_size_b + ) + ) + indicator_column_b = tf.feature_column.indicator_column( + categorical_column_b + ) + # Test that columns are reordered alphabetically. + sequence_input_layer = ksfc.SequenceFeatures( + [indicator_column_b, indicator_column_a] + ) + input_layer, sequence_length = sequence_input_layer( + {"aaa": sparse_input_a, "bbb": sparse_input_b} + ) + + self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) self.assertAllEqual( - expected_sequence_length, sequence_length.eval(session=sess)) - - def test_shared_embedding_column_with_non_sequence_categorical(self): - """Tests that error is raised for non-sequence shared embedding column.""" - with tf.Graph().as_default(): - vocabulary_size = 3 - sparse_input_a = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - sparse_input_b = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = tf.feature_column.categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = tf.feature_column.shared_embeddings( - [categorical_column_a, categorical_column_b], dimension=2) - - sequence_input_layer = ksfc.SequenceFeatures(shared_embedding_columns) - with self.assertRaisesRegex( - ValueError, - r'In embedding_column: aaa_shared_embedding\. 
' - r'categorical_column must ' - r'be of type SequenceCategoricalColumn to use SequenceFeatures\.'): - _, _ = sequence_input_layer({'aaa': sparse_input_a, - 'bbb': sparse_input_b}) - - @parameterized.named_parameters( - {'testcase_name': '2D', - 'sparse_input_args_a': { - # example 0, ids [2] - # example 1, ids [0, 1] - 'indices': ((0, 0), (1, 0), (1, 1)), - 'values': (2, 0, 1), - 'dense_shape': (2, 2)}, - 'sparse_input_args_b': { - # example 0, ids [1] - # example 1, ids [1, 0] - 'indices': ((0, 0), (1, 0), (1, 1)), - 'values': (1, 1, 0), - 'dense_shape': (2, 2)}, - 'expected_input_layer': [ - # example 0, ids_a [2], ids_b [1] - [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]], - # example 1, ids_a [0, 1], ids_b [1, 0] - [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]], - 'expected_sequence_length': [1, 2]}, - {'testcase_name': '3D', - 'sparse_input_args_a': { - # feature 0, ids [[2], [0, 1]] - # feature 1, ids [[0, 0], [1]] - 'indices': ( - (0, 0, 0), (0, 1, 0), (0, 1, 1), - (1, 0, 0), (1, 0, 1), (1, 1, 0)), - 'values': (2, 0, 1, 0, 0, 1), - 'dense_shape': (2, 2, 2)}, - 'sparse_input_args_b': { - # feature 0, ids [[1, 1], [1]] - # feature 1, ids [[1], [0]] - 'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)), - 'values': (1, 1, 1, 1, 0), - 'dense_shape': (2, 2, 2)}, - 'expected_input_layer': [ - # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -] - [[0., 0., 1., 0., 2.], [1., 1., 0., 0., 1.]], - # feature 1, [a: 0, 0, b: 1, -], [a: 1, -, b: 0, -] - [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]], - 'expected_sequence_length': [2, 2]}, - ) - def test_indicator_column( - self, sparse_input_args_a, sparse_input_args_b, expected_input_layer, - expected_sequence_length): - sparse_input_a = tf.compat.v1.SparseTensorValue(**sparse_input_args_a) - sparse_input_b = tf.compat.v1.SparseTensorValue(**sparse_input_args_b) - - vocabulary_size_a = 3 - vocabulary_size_b = 2 - - categorical_column_a = tf.feature_column.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size_a) - indicator_column_a = tf.feature_column.indicator_column(categorical_column_a) - categorical_column_b = tf.feature_column.sequence_categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size_b) - indicator_column_b = tf.feature_column.indicator_column(categorical_column_b) - # Test that columns are reordered alphabetically. - sequence_input_layer = ksfc.SequenceFeatures( - [indicator_column_b, indicator_column_a]) - input_layer, sequence_length = sequence_input_layer({ - 'aaa': sparse_input_a, 'bbb': sparse_input_b}) - - self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) - self.assertAllEqual( - expected_sequence_length, self.evaluate(sequence_length)) - - def test_indicator_column_with_non_sequence_categorical(self): - """Tests that error is raised for non-sequence categorical column.""" - vocabulary_size = 3 - sparse_input = tf.compat.v1.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = tf.feature_column.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - indicator_column_a = tf.feature_column.indicator_column(categorical_column_a) - - sequence_input_layer = ksfc.SequenceFeatures([indicator_column_a]) - with self.assertRaisesRegex( - ValueError, - r'In indicator_column: aaa_indicator\. 
categorical_column must be of ' - r'type SequenceCategoricalColumn to use SequenceFeatures\.'): - _, _ = sequence_input_layer({'aaa': sparse_input}) - - @parameterized.named_parameters( - {'testcase_name': '2D', - 'sparse_input_args': { - # example 0, values [0., 1] - # example 1, [10.] - 'indices': ((0, 0), (0, 1), (1, 0)), - 'values': (0., 1., 10.), - 'dense_shape': (2, 2)}, - 'expected_input_layer': [ - [[0.], [1.]], - [[10.], [0.]]], - 'expected_sequence_length': [2, 1]}, - {'testcase_name': '3D', - 'sparse_input_args': { - # feature 0, ids [[20, 3], [5]] - # feature 1, ids [[3], [8]] - 'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)), - 'values': (20., 3., 5., 3., 8.), - 'dense_shape': (2, 2, 2)}, - 'expected_input_layer': [ - [[20.], [3.], [5.], [0.]], - [[3.], [0.], [8.], [0.]]], - 'expected_sequence_length': [2, 2]}, - ) - def test_numeric_column( - self, sparse_input_args, expected_input_layer, expected_sequence_length): - sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) - - numeric_column = tf.feature_column.sequence_numeric_column('aaa') - - sequence_input_layer = ksfc.SequenceFeatures([numeric_column]) - input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input}) - - self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) - self.assertAllEqual( - expected_sequence_length, self.evaluate(sequence_length)) - - @parameterized.named_parameters( - {'testcase_name': '2D', - 'sparse_input_args': { - # example 0, values [0., 1., 2., 3., 4., 5., 6., 7.] - # example 1, [10., 11., 12., 13.] - 'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), - (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)), - 'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.), - 'dense_shape': (2, 8)}, - 'expected_input_layer': [ - # The output of numeric_column._get_dense_tensor should be flattened. - [[0., 1., 2., 3.], [4., 5., 6., 7.]], - [[10., 11., 12., 13.], [0., 0., 0., 0.]]], - 'expected_sequence_length': [2, 1]}, - {'testcase_name': '3D', - 'sparse_input_args': { - # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]] - # example 1, [[10., 11., 12., 13.], []] - 'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3), - (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3), - (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)), - 'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.), - 'dense_shape': (2, 2, 4)}, - 'expected_input_layer': [ - # The output of numeric_column._get_dense_tensor should be flattened. 
- [[0., 1., 2., 3.], [4., 5., 6., 7.]], - [[10., 11., 12., 13.], [0., 0., 0., 0.]]], - 'expected_sequence_length': [2, 1]}, - ) - def test_numeric_column_multi_dim( - self, sparse_input_args, expected_input_layer, expected_sequence_length): - """Tests SequenceFeatures for multi-dimensional numeric_column.""" - sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) - - numeric_column = tf.feature_column.sequence_numeric_column('aaa', shape=(2, 2)) - - sequence_input_layer = ksfc.SequenceFeatures([numeric_column]) - input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input}) - - self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) - self.assertAllEqual( - expected_sequence_length, self.evaluate(sequence_length)) - - def test_sequence_length_not_equal(self): - """Tests that an error is raised when sequence lengths are not equal.""" - # Input a with sequence_length = [2, 1] - sparse_input_a = tf.compat.v1.SparseTensorValue( - indices=((0, 0), (0, 1), (1, 0)), - values=(0., 1., 10.), - dense_shape=(2, 2)) - # Input b with sequence_length = [1, 1] - sparse_input_b = tf.compat.v1.SparseTensorValue( - indices=((0, 0), (1, 0)), - values=(1., 10.), - dense_shape=(2, 2)) - numeric_column_a = tf.feature_column.sequence_numeric_column('aaa') - numeric_column_b = tf.feature_column.sequence_numeric_column('bbb') - - sequence_input_layer = ksfc.SequenceFeatures( - [numeric_column_a, numeric_column_b]) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - r'Condition x == y did not hold.*'): - _, sequence_length = sequence_input_layer({ - 'aaa': sparse_input_a, - 'bbb': sparse_input_b - }) - self.evaluate(sequence_length) - - @parameterized.named_parameters( - {'testcase_name': '2D', - 'sparse_input_args': { - # example 0, values [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]] - # example 1, [[[10., 11.], [12., 13.]]] - 'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), - (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)), - 'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.), - 'dense_shape': (2, 8)}, - 'expected_shape': [2, 2, 4]}, - {'testcase_name': '3D', - 'sparse_input_args': { - # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]] - # example 1, [[10., 11., 12., 13.], []] - 'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3), - (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3), - (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)), - 'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.), - 'dense_shape': (2, 2, 4)}, - 'expected_shape': [2, 2, 4]}, - ) - def test_static_shape_from_tensors_numeric( - self, sparse_input_args, expected_shape): - """Tests that we return a known static shape when we have one.""" - sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) - numeric_column = tf.feature_column.sequence_numeric_column('aaa', shape=(2, 2)) - - sequence_input_layer = ksfc.SequenceFeatures([numeric_column]) - input_layer, _ = sequence_input_layer({'aaa': sparse_input}) - shape = input_layer.get_shape() - self.assertEqual(shape, expected_shape) - - @parameterized.named_parameters( - {'testcase_name': '2D', - 'sparse_input_args': { - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - 'indices': ((0, 0), (1, 0), (1, 1), (3, 0)), - 'values': (2, 0, 1, 1), - 'dense_shape': (4, 2)}, - 'expected_shape': [4, 2, 3]}, - {'testcase_name': '3D', - 'sparse_input_args': { - # example 0, ids [[2]] - # example 1, ids [[0, 1], [2]] - # example 2, ids [] - # example 3, ids [[1], [0, 2]] - 
'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0), - (3, 0, 0), (3, 1, 0), (3, 1, 1)), - 'values': (2, 0, 1, 2, 1, 0, 2), - 'dense_shape': (4, 2, 2)}, - 'expected_shape': [4, 2, 3]} - ) - def test_static_shape_from_tensors_indicator( - self, sparse_input_args, expected_shape): - """Tests that we return a known static shape when we have one.""" - sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) - categorical_column = tf.feature_column.sequence_categorical_column_with_identity( - key='aaa', num_buckets=3) - indicator_column = tf.feature_column.indicator_column(categorical_column) - - sequence_input_layer = ksfc.SequenceFeatures([indicator_column]) - input_layer, _ = sequence_input_layer({'aaa': sparse_input}) - shape = input_layer.get_shape() - self.assertEqual(shape, expected_shape) - - def test_compute_output_shape(self): - price1 = tf.feature_column.sequence_numeric_column('price1', shape=2) - price2 = tf.feature_column.sequence_numeric_column('price2') - features = { - 'price1': tf.SparseTensor( - indices=[[0, 0, 0], [0, 0, 1], - [0, 1, 0], [0, 1, 1], - [1, 0, 0], [1, 0, 1], - [2, 0, 0], [2, 0, 1], - [3, 0, 0], [3, 0, 1]], - values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.], - dense_shape=(4, 3, 2)), - 'price2': tf.SparseTensor( - indices=[[0, 0], - [0, 1], - [1, 0], - [2, 0], - [3, 0]], - values=[10., 11., 20., 30., 40.], - dense_shape=(4, 3))} - sequence_features = ksfc.SequenceFeatures([price1, price2]) - seq_input, seq_len = sequence_features(features) - self.assertEqual( - sequence_features.compute_output_shape((None, None)), - (None, None, 3)) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertAllClose([[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]], - [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]], - [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]], - [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]], - self.evaluate(seq_input)) - self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class SequenceFeaturesSerializationTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters(('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_get_config(self, trainable, name): - cols = [tf.feature_column.sequence_numeric_column('a')] - orig_layer = ksfc.SequenceFeatures(cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - self.assertEqual(config['name'], orig_layer.name) - self.assertEqual(config['trainable'], trainable) - self.assertLen(config['feature_columns'], 1) - self.assertEqual(config['feature_columns'][0]['class_name'], - 'SequenceNumericColumn') - self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) - - @parameterized.named_parameters(('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_from_config(self, trainable, name): - cols = [tf.feature_column.sequence_numeric_column('a')] - orig_layer = ksfc.SequenceFeatures(cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - new_layer = ksfc.SequenceFeatures.from_config(config) - - self.assertEqual(new_layer.name, orig_layer.name) - self.assertEqual(new_layer.trainable, trainable) - self.assertLen(new_layer._feature_columns, 1) - self.assertEqual(new_layer._feature_columns[0].name, 'a') - - def test_serialization_sequence_features(self): - rating = 
tf.feature_column.sequence_numeric_column('rating') - sequence_feature = ksfc.SequenceFeatures([rating]) - config = keras.layers.serialize(sequence_feature) - - revived = keras.layers.deserialize(config) - self.assertIsInstance(revived, ksfc.SequenceFeatures) + expected_sequence_length, self.evaluate(sequence_length) + ) + + def test_indicator_column_with_non_sequence_categorical(self): + """Tests that error is raised for non-sequence categorical column.""" + vocabulary_size = 3 + sparse_input = tf.compat.v1.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2), + ) + + categorical_column_a = ( + tf.feature_column.categorical_column_with_identity( + key="aaa", num_buckets=vocabulary_size + ) + ) + indicator_column_a = tf.feature_column.indicator_column( + categorical_column_a + ) + + sequence_input_layer = ksfc.SequenceFeatures([indicator_column_a]) + with self.assertRaisesRegex( + ValueError, + r"In indicator_column: aaa_indicator\. categorical_column must be " + r"of type SequenceCategoricalColumn to use SequenceFeatures\.", + ): + _, _ = sequence_input_layer({"aaa": sparse_input}) + + @parameterized.named_parameters( + { + "testcase_name": "2D", + "sparse_input_args": { + # example 0, values [0., 1] + # example 1, [10.] + "indices": ((0, 0), (0, 1), (1, 0)), + "values": (0.0, 1.0, 10.0), + "dense_shape": (2, 2), + }, + "expected_input_layer": [[[0.0], [1.0]], [[10.0], [0.0]]], + "expected_sequence_length": [2, 1], + }, + { + "testcase_name": "3D", + "sparse_input_args": { + # feature 0, ids [[20, 3], [5]] + # feature 1, ids [[3], [8]] + "indices": ( + (0, 0, 0), + (0, 0, 1), + (0, 1, 0), + (1, 0, 0), + (1, 1, 0), + ), + "values": (20.0, 3.0, 5.0, 3.0, 8.0), + "dense_shape": (2, 2, 2), + }, + "expected_input_layer": [ + [[20.0], [3.0], [5.0], [0.0]], + [[3.0], [0.0], [8.0], [0.0]], + ], + "expected_sequence_length": [2, 2], + }, + ) + def test_numeric_column( + self, sparse_input_args, expected_input_layer, expected_sequence_length + ): + sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) + numeric_column = tf.feature_column.sequence_numeric_column("aaa") -class SequenceFeaturesSavingTest(tf.test.TestCase, parameterized.TestCase): + sequence_input_layer = ksfc.SequenceFeatures([numeric_column]) + input_layer, sequence_length = sequence_input_layer( + {"aaa": sparse_input} + ) - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_saving_with_sequence_features(self): - cols = [ - tf.feature_column.sequence_numeric_column('a'), - tf.feature_column.indicator_column( - tf.feature_column.sequence_categorical_column_with_vocabulary_list( - 'b', ['one', 'two'])) - ] - input_layers = { - 'a': - keras.layers.Input(shape=(None, 1), sparse=True, name='a'), - 'b': - keras.layers.Input( - shape=(None, 1), sparse=True, name='b', dtype='string') - } - - fc_layer, _ = ksfc.SequenceFeatures(cols)(input_layers) - # TODO(tibell): Figure out the right dtype and apply masking. 
- # sequence_length_mask = array_ops.sequence_mask(sequence_length) - # x = keras.layers.GRU(32)(fc_layer, mask=sequence_length_mask) - x = keras.layers.GRU(32)(fc_layer) - output = keras.layers.Dense(10)(x) - - model = keras.models.Model(input_layers, output) - - model.compile( - loss=keras.losses.MSE, - optimizer='rmsprop', - metrics=[keras.metrics.categorical_accuracy]) - - config = model.to_json() - loaded_model = model_config.model_from_json(config) - - batch_size = 10 - timesteps = 1 - - values_a = np.arange(10, dtype=np.float32) - indices_a = np.zeros((10, 3), dtype=np.int64) - indices_a[:, 0] = np.arange(10) - inputs_a = tf.SparseTensor(indices_a, values_a, - (batch_size, timesteps, 1)) - - values_b = np.zeros(10, dtype=np.str) - indices_b = np.zeros((10, 3), dtype=np.int64) - indices_b[:, 0] = np.arange(10) - inputs_b = tf.SparseTensor(indices_b, values_b, - (batch_size, timesteps, 1)) - - with self.cached_session(): - # Initialize tables for V1 lookup. - if not tf.executing_eagerly(): + self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) + self.assertAllEqual( + expected_sequence_length, self.evaluate(sequence_length) + ) + + @parameterized.named_parameters( + { + "testcase_name": "2D", + "sparse_input_args": { + # example 0, values [0., 1., 2., 3., 4., 5., 6., 7.] + # example 1, [10., 11., 12., 13.] + "indices": ( + (0, 0), + (0, 1), + (0, 2), + (0, 3), + (0, 4), + (0, 5), + (0, 6), + (0, 7), + (1, 0), + (1, 1), + (1, 2), + (1, 3), + ), + "values": ( + 0.0, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 10.0, + 11.0, + 12.0, + 13.0, + ), + "dense_shape": (2, 8), + }, + "expected_input_layer": [ + # The output of numeric_column._get_dense_tensor should be + # flattened. + [[0.0, 1.0, 2.0, 3.0], [4.0, 5.0, 6.0, 7.0]], + [[10.0, 11.0, 12.0, 13.0], [0.0, 0.0, 0.0, 0.0]], + ], + "expected_sequence_length": [2, 1], + }, + { + "testcase_name": "3D", + "sparse_input_args": { + # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]] + # example 1, [[10., 11., 12., 13.], []] + "indices": ( + (0, 0, 0), + (0, 0, 1), + (0, 0, 2), + (0, 0, 3), + (0, 1, 0), + (0, 1, 1), + (0, 1, 2), + (0, 1, 3), + (1, 0, 0), + (1, 0, 1), + (1, 0, 2), + (1, 0, 3), + ), + "values": ( + 0.0, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 10.0, + 11.0, + 12.0, + 13.0, + ), + "dense_shape": (2, 2, 4), + }, + "expected_input_layer": [ + # The output of numeric_column._get_dense_tensor should be + # flattened. 
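+                # Each (2, 2)-shaped step is therefore emitted as a flat
+                # length-4 vector per timestep.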
+ [[0.0, 1.0, 2.0, 3.0], [4.0, 5.0, 6.0, 7.0]], + [[10.0, 11.0, 12.0, 13.0], [0.0, 0.0, 0.0, 0.0]], + ], + "expected_sequence_length": [2, 1], + }, + ) + def test_numeric_column_multi_dim( + self, sparse_input_args, expected_input_layer, expected_sequence_length + ): + """Tests SequenceFeatures for multi-dimensional numeric_column.""" + sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) + + numeric_column = tf.feature_column.sequence_numeric_column( + "aaa", shape=(2, 2) + ) + + sequence_input_layer = ksfc.SequenceFeatures([numeric_column]) + input_layer, sequence_length = sequence_input_layer( + {"aaa": sparse_input} + ) + + self.assertAllEqual(expected_input_layer, self.evaluate(input_layer)) + self.assertAllEqual( + expected_sequence_length, self.evaluate(sequence_length) + ) + + def test_sequence_length_not_equal(self): + """Tests that an error is raised when sequence lengths are not equal.""" + # Input a with sequence_length = [2, 1] + sparse_input_a = tf.compat.v1.SparseTensorValue( + indices=((0, 0), (0, 1), (1, 0)), + values=(0.0, 1.0, 10.0), + dense_shape=(2, 2), + ) + # Input b with sequence_length = [1, 1] + sparse_input_b = tf.compat.v1.SparseTensorValue( + indices=((0, 0), (1, 0)), values=(1.0, 10.0), dense_shape=(2, 2) + ) + numeric_column_a = tf.feature_column.sequence_numeric_column("aaa") + numeric_column_b = tf.feature_column.sequence_numeric_column("bbb") + + sequence_input_layer = ksfc.SequenceFeatures( + [numeric_column_a, numeric_column_b] + ) + + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, r"Condition x == y did not hold.*" + ): + _, sequence_length = sequence_input_layer( + {"aaa": sparse_input_a, "bbb": sparse_input_b} + ) + self.evaluate(sequence_length) + + @parameterized.named_parameters( + { + "testcase_name": "2D", + "sparse_input_args": { + # example 0, values [[[0., 1.], [2., 3.]], [[4., 5.], [6., + # 7.]]] + # example 1, [[[10., 11.], [12., 13.]]] + "indices": ( + (0, 0), + (0, 1), + (0, 2), + (0, 3), + (0, 4), + (0, 5), + (0, 6), + (0, 7), + (1, 0), + (1, 1), + (1, 2), + (1, 3), + ), + "values": ( + 0.0, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 10.0, + 11.0, + 12.0, + 13.0, + ), + "dense_shape": (2, 8), + }, + "expected_shape": [2, 2, 4], + }, + { + "testcase_name": "3D", + "sparse_input_args": { + # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]] + # example 1, [[10., 11., 12., 13.], []] + "indices": ( + (0, 0, 0), + (0, 0, 1), + (0, 0, 2), + (0, 0, 3), + (0, 1, 0), + (0, 1, 1), + (0, 1, 2), + (0, 1, 3), + (1, 0, 0), + (1, 0, 1), + (1, 0, 2), + (1, 0, 3), + ), + "values": ( + 0.0, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 10.0, + 11.0, + 12.0, + 13.0, + ), + "dense_shape": (2, 2, 4), + }, + "expected_shape": [2, 2, 4], + }, + ) + def test_static_shape_from_tensors_numeric( + self, sparse_input_args, expected_shape + ): + """Tests that we return a known static shape when we have one.""" + sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) + numeric_column = tf.feature_column.sequence_numeric_column( + "aaa", shape=(2, 2) + ) + + sequence_input_layer = ksfc.SequenceFeatures([numeric_column]) + input_layer, _ = sequence_input_layer({"aaa": sparse_input}) + shape = input_layer.get_shape() + self.assertEqual(shape, expected_shape) + + @parameterized.named_parameters( + { + "testcase_name": "2D", + "sparse_input_args": { + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + "indices": ((0, 0), (1, 0), (1, 1), (3, 0)), + "values": (2, 0, 1, 
1), + "dense_shape": (4, 2), + }, + "expected_shape": [4, 2, 3], + }, + { + "testcase_name": "3D", + "sparse_input_args": { + # example 0, ids [[2]] + # example 1, ids [[0, 1], [2]] + # example 2, ids [] + # example 3, ids [[1], [0, 2]] + "indices": ( + (0, 0, 0), + (1, 0, 0), + (1, 0, 1), + (1, 1, 0), + (3, 0, 0), + (3, 1, 0), + (3, 1, 1), + ), + "values": (2, 0, 1, 2, 1, 0, 2), + "dense_shape": (4, 2, 2), + }, + "expected_shape": [4, 2, 3], + }, + ) + def test_static_shape_from_tensors_indicator( + self, sparse_input_args, expected_shape + ): + """Tests that we return a known static shape when we have one.""" + sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args) + categorical_column = ( + tf.feature_column.sequence_categorical_column_with_identity( + key="aaa", num_buckets=3 + ) + ) + indicator_column = tf.feature_column.indicator_column( + categorical_column + ) + + sequence_input_layer = ksfc.SequenceFeatures([indicator_column]) + input_layer, _ = sequence_input_layer({"aaa": sparse_input}) + shape = input_layer.get_shape() + self.assertEqual(shape, expected_shape) + + def test_compute_output_shape(self): + price1 = tf.feature_column.sequence_numeric_column("price1", shape=2) + price2 = tf.feature_column.sequence_numeric_column("price2") + features = { + "price1": tf.SparseTensor( + indices=[ + [0, 0, 0], + [0, 0, 1], + [0, 1, 0], + [0, 1, 1], + [1, 0, 0], + [1, 0, 1], + [2, 0, 0], + [2, 0, 1], + [3, 0, 0], + [3, 0, 1], + ], + values=[ + 0.0, + 1.0, + 10.0, + 11.0, + 100.0, + 101.0, + 200.0, + 201.0, + 300.0, + 301.0, + ], + dense_shape=(4, 3, 2), + ), + "price2": tf.SparseTensor( + indices=[[0, 0], [0, 1], [1, 0], [2, 0], [3, 0]], + values=[10.0, 11.0, 20.0, 30.0, 40.0], + dense_shape=(4, 3), + ), + } + sequence_features = ksfc.SequenceFeatures([price1, price2]) + seq_input, seq_len = sequence_features(features) + self.assertEqual( + sequence_features.compute_output_shape((None, None)), + (None, None, 3), + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) self.evaluate(tf.compat.v1.tables_initializer()) - self.assertLen( - loaded_model.predict({ - 'a': inputs_a, - 'b': inputs_b - }, steps=1), batch_size) + self.assertAllClose( + [ + [[0.0, 1.0, 10.0], [10.0, 11.0, 11.0], [0.0, 0.0, 0.0]], + [[100.0, 101.0, 20.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[200.0, 201.0, 30.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[300.0, 301.0, 40.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ], + self.evaluate(seq_input), + ) + self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class SequenceFeaturesSerializationTest( + tf.test.TestCase, parameterized.TestCase +): + @parameterized.named_parameters( + ("trainable", True, "trainable"), ("not_trainable", False, "frozen") + ) + def test_get_config(self, trainable, name): + cols = [tf.feature_column.sequence_numeric_column("a")] + orig_layer = ksfc.SequenceFeatures(cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + self.assertEqual(config["name"], orig_layer.name) + self.assertEqual(config["trainable"], trainable) + self.assertLen(config["feature_columns"], 1) + self.assertEqual( + config["feature_columns"][0]["class_name"], "SequenceNumericColumn" + ) + self.assertEqual(config["feature_columns"][0]["config"]["shape"], (1,)) + + @parameterized.named_parameters( + ("trainable", True, "trainable"), ("not_trainable", False, "frozen") + ) + def test_from_config(self, trainable, name): + cols = 
[tf.feature_column.sequence_numeric_column("a")] + orig_layer = ksfc.SequenceFeatures(cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + new_layer = ksfc.SequenceFeatures.from_config(config) + self.assertEqual(new_layer.name, orig_layer.name) + self.assertEqual(new_layer.trainable, trainable) + self.assertLen(new_layer._feature_columns, 1) + self.assertEqual(new_layer._feature_columns[0].name, "a") -if __name__ == '__main__': - tf.test.main() + def test_serialization_sequence_features(self): + rating = tf.feature_column.sequence_numeric_column("rating") + sequence_feature = ksfc.SequenceFeatures([rating]) + config = keras.layers.serialize(sequence_feature) + + revived = keras.layers.deserialize(config) + self.assertIsInstance(revived, ksfc.SequenceFeatures) + + +class SequenceFeaturesSavingTest(tf.test.TestCase, parameterized.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_saving_with_sequence_features(self): + cols = [ + tf.feature_column.sequence_numeric_column("a"), + tf.feature_column.indicator_column( + tf.feature_column.sequence_categorical_column_with_vocabulary_list( # noqa: E501 + "b", ["one", "two"] + ) + ), + ] + input_layers = { + "a": keras.layers.Input(shape=(None, 1), sparse=True, name="a"), + "b": keras.layers.Input( + shape=(None, 1), sparse=True, name="b", dtype="string" + ), + } + + fc_layer, _ = ksfc.SequenceFeatures(cols)(input_layers) + # TODO(tibell): Figure out the right dtype and apply masking. + # sequence_length_mask = array_ops.sequence_mask(sequence_length) + # x = keras.layers.GRU(32)(fc_layer, mask=sequence_length_mask) + x = keras.layers.GRU(32)(fc_layer) + output = keras.layers.Dense(10)(x) + + model = keras.models.Model(input_layers, output) + + model.compile( + loss=keras.losses.MSE, + optimizer="rmsprop", + metrics=[keras.metrics.categorical_accuracy], + ) + + config = model.to_json() + loaded_model = model_config.model_from_json(config) + + batch_size = 10 + timesteps = 1 + + values_a = np.arange(10, dtype=np.float32) + indices_a = np.zeros((10, 3), dtype=np.int64) + indices_a[:, 0] = np.arange(10) + inputs_a = tf.SparseTensor( + indices_a, values_a, (batch_size, timesteps, 1) + ) + + values_b = np.zeros(10, dtype=str) + indices_b = np.zeros((10, 3), dtype=np.int64) + indices_b[:, 0] = np.arange(10) + inputs_b = tf.SparseTensor( + indices_b, values_b, (batch_size, timesteps, 1) + ) + + with self.cached_session(): + # Initialize tables for V1 lookup. + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertLen( + loaded_model.predict({"a": inputs_a, "b": inputs_b}, steps=1), + batch_size, + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/initializers/BUILD b/keras/initializers/BUILD index 17b421722145..5dadf380f4c4 100644 --- a/keras/initializers/BUILD +++ b/keras/initializers/BUILD @@ -1,9 +1,11 @@ # Description: # Contains the Keras initializer API (internal TensorFlow version). 
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], @@ -14,14 +16,15 @@ py_library( name = "initializers", srcs = [ "__init__.py", + "initializers.py", "initializers_v1.py", - "initializers_v2.py", ], srcs_version = "PY3", deps = [ "//:expect_tensorflow_installed", "//keras:backend", "//keras/dtensor:utils", + "//keras/saving:serialization_lib", "//keras/utils:generic_utils", "//keras/utils:tf_inspect", ], diff --git a/keras/initializers/__init__.py b/keras/initializers/__init__.py index abb4fa36e46b..0069ca2a082e 100644 --- a/keras/initializers/__init__.py +++ b/keras/initializers/__init__.py @@ -14,108 +14,116 @@ # ============================================================================== """Keras initializer serialization / deserialization.""" -import tensorflow.compat.v2 as tf - import threading +import warnings -from tensorflow.python import tf2 +import tensorflow.compat.v2 as tf + +from keras.initializers import initializers from keras.initializers import initializers_v1 -from keras.initializers import initializers_v2 +from keras.saving import serialization_lib +from keras.saving.legacy import serialization as legacy_serialization from keras.utils import generic_utils from keras.utils import tf_inspect as inspect + +# isort: off +from tensorflow.python import tf2 from tensorflow.python.ops import init_ops from tensorflow.python.util.tf_export import keras_export - # LOCAL.ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() def populate_deserializable_objects(): - """Populates dict ALL_OBJECTS with every built-in initializer. - """ - global LOCAL - if not hasattr(LOCAL, 'ALL_OBJECTS'): + """Populates dict ALL_OBJECTS with every built-in initializer.""" + global LOCAL + if not hasattr(LOCAL, "ALL_OBJECTS"): + LOCAL.ALL_OBJECTS = {} + LOCAL.GENERATED_WITH_V2 = None + + if ( + LOCAL.ALL_OBJECTS + and LOCAL.GENERATED_WITH_V2 == tf.__internal__.tf2.enabled() + ): + # Objects dict is already generated for the proper TF version: + # do nothing. + return + LOCAL.ALL_OBJECTS = {} - LOCAL.GENERATED_WITH_V2 = None - - if LOCAL.ALL_OBJECTS and LOCAL.GENERATED_WITH_V2 == tf.__internal__.tf2.enabled(): - # Objects dict is already generated for the proper TF version: - # do nothing. - return - - LOCAL.ALL_OBJECTS = {} - LOCAL.GENERATED_WITH_V2 = tf.__internal__.tf2.enabled() - - # Compatibility aliases (need to exist in both V1 and V2). 
- LOCAL.ALL_OBJECTS['ConstantV2'] = initializers_v2.Constant - LOCAL.ALL_OBJECTS['GlorotNormalV2'] = initializers_v2.GlorotNormal - LOCAL.ALL_OBJECTS['GlorotUniformV2'] = initializers_v2.GlorotUniform - LOCAL.ALL_OBJECTS['HeNormalV2'] = initializers_v2.HeNormal - LOCAL.ALL_OBJECTS['HeUniformV2'] = initializers_v2.HeUniform - LOCAL.ALL_OBJECTS['IdentityV2'] = initializers_v2.Identity - LOCAL.ALL_OBJECTS['LecunNormalV2'] = initializers_v2.LecunNormal - LOCAL.ALL_OBJECTS['LecunUniformV2'] = initializers_v2.LecunUniform - LOCAL.ALL_OBJECTS['OnesV2'] = initializers_v2.Ones - LOCAL.ALL_OBJECTS['OrthogonalV2'] = initializers_v2.Orthogonal - LOCAL.ALL_OBJECTS['RandomNormalV2'] = initializers_v2.RandomNormal - LOCAL.ALL_OBJECTS['RandomUniformV2'] = initializers_v2.RandomUniform - LOCAL.ALL_OBJECTS['TruncatedNormalV2'] = initializers_v2.TruncatedNormal - LOCAL.ALL_OBJECTS['VarianceScalingV2'] = initializers_v2.VarianceScaling - LOCAL.ALL_OBJECTS['ZerosV2'] = initializers_v2.Zeros - - # Out of an abundance of caution we also include these aliases that have - # a non-zero probability of having been included in saved configs in the past. - LOCAL.ALL_OBJECTS['glorot_normalV2'] = initializers_v2.GlorotNormal - LOCAL.ALL_OBJECTS['glorot_uniformV2'] = initializers_v2.GlorotUniform - LOCAL.ALL_OBJECTS['he_normalV2'] = initializers_v2.HeNormal - LOCAL.ALL_OBJECTS['he_uniformV2'] = initializers_v2.HeUniform - LOCAL.ALL_OBJECTS['lecun_normalV2'] = initializers_v2.LecunNormal - LOCAL.ALL_OBJECTS['lecun_uniformV2'] = initializers_v2.LecunUniform - - if tf.__internal__.tf2.enabled(): - # For V2, entries are generated automatically based on the content of - # initializers_v2.py. - v2_objs = {} - base_cls = initializers_v2.Initializer - generic_utils.populate_dict_with_module_objects( - v2_objs, - [initializers_v2], - obj_filter=lambda x: inspect.isclass(x) and issubclass(x, base_cls)) - for key, value in v2_objs.items(): - LOCAL.ALL_OBJECTS[key] = value - # Functional aliases. - LOCAL.ALL_OBJECTS[generic_utils.to_snake_case(key)] = value - else: - # V1 initializers. - v1_objs = { - 'Constant': tf.compat.v1.constant_initializer, - 'GlorotNormal': tf.compat.v1.glorot_normal_initializer, - 'GlorotUniform': tf.compat.v1.glorot_uniform_initializer, - 'Identity': tf.compat.v1.initializers.identity, - 'Ones': tf.compat.v1.ones_initializer, - 'Orthogonal': tf.compat.v1.orthogonal_initializer, - 'VarianceScaling': tf.compat.v1.variance_scaling_initializer, - 'Zeros': tf.compat.v1.zeros_initializer, - 'HeNormal': initializers_v1.HeNormal, - 'HeUniform': initializers_v1.HeUniform, - 'LecunNormal': initializers_v1.LecunNormal, - 'LecunUniform': initializers_v1.LecunUniform, - 'RandomNormal': initializers_v1.RandomNormal, - 'RandomUniform': initializers_v1.RandomUniform, - 'TruncatedNormal': initializers_v1.TruncatedNormal, - } - for key, value in v1_objs.items(): - LOCAL.ALL_OBJECTS[key] = value - # Functional aliases. - LOCAL.ALL_OBJECTS[generic_utils.to_snake_case(key)] = value - - # More compatibility aliases. - LOCAL.ALL_OBJECTS['normal'] = LOCAL.ALL_OBJECTS['random_normal'] - LOCAL.ALL_OBJECTS['uniform'] = LOCAL.ALL_OBJECTS['random_uniform'] - LOCAL.ALL_OBJECTS['one'] = LOCAL.ALL_OBJECTS['ones'] - LOCAL.ALL_OBJECTS['zero'] = LOCAL.ALL_OBJECTS['zeros'] + LOCAL.GENERATED_WITH_V2 = tf.__internal__.tf2.enabled() + + # Compatibility aliases (need to exist in both V1 and V2). 
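+    # These "V2"-suffixed names may appear in configs saved by earlier
+    # TF/Keras versions, so both spellings must stay deserializable.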
+ LOCAL.ALL_OBJECTS["ConstantV2"] = initializers.Constant + LOCAL.ALL_OBJECTS["GlorotNormalV2"] = initializers.GlorotNormal + LOCAL.ALL_OBJECTS["GlorotUniformV2"] = initializers.GlorotUniform + LOCAL.ALL_OBJECTS["HeNormalV2"] = initializers.HeNormal + LOCAL.ALL_OBJECTS["HeUniformV2"] = initializers.HeUniform + LOCAL.ALL_OBJECTS["IdentityV2"] = initializers.Identity + LOCAL.ALL_OBJECTS["LecunNormalV2"] = initializers.LecunNormal + LOCAL.ALL_OBJECTS["LecunUniformV2"] = initializers.LecunUniform + LOCAL.ALL_OBJECTS["OnesV2"] = initializers.Ones + LOCAL.ALL_OBJECTS["OrthogonalV2"] = initializers.Orthogonal + LOCAL.ALL_OBJECTS["RandomNormalV2"] = initializers.RandomNormal + LOCAL.ALL_OBJECTS["RandomUniformV2"] = initializers.RandomUniform + LOCAL.ALL_OBJECTS["TruncatedNormalV2"] = initializers.TruncatedNormal + LOCAL.ALL_OBJECTS["VarianceScalingV2"] = initializers.VarianceScaling + LOCAL.ALL_OBJECTS["ZerosV2"] = initializers.Zeros + + # Out of an abundance of caution we also include these aliases that have + # a non-zero probability of having been included in saved configs in the + # past. + LOCAL.ALL_OBJECTS["glorot_normalV2"] = initializers.GlorotNormal + LOCAL.ALL_OBJECTS["glorot_uniformV2"] = initializers.GlorotUniform + LOCAL.ALL_OBJECTS["he_normalV2"] = initializers.HeNormal + LOCAL.ALL_OBJECTS["he_uniformV2"] = initializers.HeUniform + LOCAL.ALL_OBJECTS["lecun_normalV2"] = initializers.LecunNormal + LOCAL.ALL_OBJECTS["lecun_uniformV2"] = initializers.LecunUniform + + if tf.__internal__.tf2.enabled(): + # For V2, entries are generated automatically based on the content of + # initializers.py. + v2_objs = {} + base_cls = initializers.Initializer + generic_utils.populate_dict_with_module_objects( + v2_objs, + [initializers], + obj_filter=lambda x: inspect.isclass(x) and issubclass(x, base_cls), + ) + for key, value in v2_objs.items(): + LOCAL.ALL_OBJECTS[key] = value + # Functional aliases. + LOCAL.ALL_OBJECTS[generic_utils.to_snake_case(key)] = value + else: + # V1 initializers. + v1_objs = { + "Constant": tf.compat.v1.constant_initializer, + "GlorotNormal": tf.compat.v1.glorot_normal_initializer, + "GlorotUniform": tf.compat.v1.glorot_uniform_initializer, + "Identity": tf.compat.v1.initializers.identity, + "Ones": tf.compat.v1.ones_initializer, + "Orthogonal": tf.compat.v1.orthogonal_initializer, + "VarianceScaling": tf.compat.v1.variance_scaling_initializer, + "Zeros": tf.compat.v1.zeros_initializer, + "HeNormal": initializers_v1.HeNormal, + "HeUniform": initializers_v1.HeUniform, + "LecunNormal": initializers_v1.LecunNormal, + "LecunUniform": initializers_v1.LecunUniform, + "RandomNormal": initializers_v1.RandomNormal, + "RandomUniform": initializers_v1.RandomUniform, + "TruncatedNormal": initializers_v1.TruncatedNormal, + } + for key, value in v1_objs.items(): + LOCAL.ALL_OBJECTS[key] = value + # Functional aliases. + LOCAL.ALL_OBJECTS[generic_utils.to_snake_case(key)] = value + + # More compatibility aliases. 
+ LOCAL.ALL_OBJECTS["normal"] = LOCAL.ALL_OBJECTS["random_normal"] + LOCAL.ALL_OBJECTS["uniform"] = LOCAL.ALL_OBJECTS["random_uniform"] + LOCAL.ALL_OBJECTS["one"] = LOCAL.ALL_OBJECTS["ones"] + LOCAL.ALL_OBJECTS["zero"] = LOCAL.ALL_OBJECTS["zeros"] # For backwards compatibility, we populate this file with the objects @@ -127,67 +135,91 @@ def populate_deserializable_objects(): # Utility functions -@keras_export('keras.initializers.serialize') -def serialize(initializer): - return generic_utils.serialize_keras_object(initializer) - - -@keras_export('keras.initializers.deserialize') -def deserialize(config, custom_objects=None): - """Return an `Initializer` object from its config.""" - populate_deserializable_objects() - return generic_utils.deserialize_keras_object( - config, - module_objects=LOCAL.ALL_OBJECTS, - custom_objects=custom_objects, - printable_module_name='initializer') - - -@keras_export('keras.initializers.get') +@keras_export("keras.initializers.serialize") +def serialize(initializer, use_legacy_format=False): + populate_deserializable_objects() + if initializer is None: + return None + if not isinstance(initializer, tuple(LOCAL.ALL_OBJECTS.values())): + warnings.warn( + "The `keras.initializers.serialize()` API should only be used for " + "objects of type `keras.initializers.Initializer`. Found an " + f"instance of type {type(initializer)}, which may lead to improper " + "serialization." + ) + if use_legacy_format: + return legacy_serialization.serialize_keras_object(initializer) + + return serialization_lib.serialize_keras_object(initializer) + + +@keras_export("keras.initializers.deserialize") +def deserialize(config, custom_objects=None, use_legacy_format=False): + """Return an `Initializer` object from its config.""" + populate_deserializable_objects() + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + config, + module_objects=LOCAL.ALL_OBJECTS, + custom_objects=custom_objects, + printable_module_name="initializer", + ) + + return serialization_lib.deserialize_keras_object( + config, + module_objects=LOCAL.ALL_OBJECTS, + custom_objects=custom_objects, + printable_module_name="initializer", + ) + + +@keras_export("keras.initializers.get") def get(identifier): - """Retrieve a Keras initializer by the identifier. - - The `identifier` may be the string name of a initializers function or class ( - case-sensitively). - - >>> identifier = 'Ones' - >>> tf.keras.initializers.deserialize(identifier) - <...keras.initializers.initializers_v2.Ones...> - - You can also specify `config` of the initializer to this function by passing - dict containing `class_name` and `config` as an identifier. Also note that the - `class_name` must map to a `Initializer` class. - - >>> cfg = {'class_name': 'Ones', 'config': {}} - >>> tf.keras.initializers.deserialize(cfg) - <...keras.initializers.initializers_v2.Ones...> - - In the case that the `identifier` is a class, this method will return a new - instance of the class by its constructor. - - Args: - identifier: String or dict that contains the initializer name or - configurations. - - Returns: - Initializer instance base on the input identifier. - - Raises: - ValueError: If the input identifier is not a supported type or in a bad - format. 
- """ - - if identifier is None: - return None - if isinstance(identifier, dict): - return deserialize(identifier) - elif isinstance(identifier, str): - identifier = str(identifier) - return deserialize(identifier) - elif callable(identifier): - if inspect.isclass(identifier): - identifier = identifier() - return identifier - else: - raise ValueError('Could not interpret initializer identifier: ' + - str(identifier)) + """Retrieve a Keras initializer by the identifier. + + The `identifier` may be the string name of a initializers function or class + (case-sensitively). + + >>> identifier = 'Ones' + >>> tf.keras.initializers.deserialize(identifier) + <...keras.initializers.initializers.Ones...> + + You can also specify `config` of the initializer to this function by passing + dict containing `class_name` and `config` as an identifier. Also note that + the `class_name` must map to a `Initializer` class. + + >>> cfg = {'class_name': 'Ones', 'config': {}} + >>> tf.keras.initializers.deserialize(cfg) + <...keras.initializers.initializers.Ones...> + + In the case that the `identifier` is a class, this method will return a new + instance of the class by its constructor. + + Args: + identifier: String or dict that contains the initializer name or + configurations. + + Returns: + Initializer instance base on the input identifier. + + Raises: + ValueError: If the input identifier is not a supported type or in a bad + format. + """ + + if identifier is None: + return None + if isinstance(identifier, dict): + use_legacy_format = "module" not in identifier + return deserialize(identifier, use_legacy_format=use_legacy_format) + elif isinstance(identifier, str): + config = {"class_name": str(identifier), "config": {}} + return get(config) + elif callable(identifier): + if inspect.isclass(identifier): + identifier = identifier() + return identifier + else: + raise ValueError( + "Could not interpret initializer identifier: " + str(identifier) + ) diff --git a/keras/initializers/initializers.py b/keras/initializers/initializers.py new file mode 100644 index 000000000000..8fc3da655947 --- /dev/null +++ b/keras/initializers/initializers.py @@ -0,0 +1,1191 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras initializers.""" + +import math +import warnings + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.dtensor import utils +from keras.saving import serialization_lib + +# isort: off +from tensorflow.python.util.tf_export import keras_export + +_PARTITION_SHAPE = "partition_shape" +_PARTITION_OFFSET = "partition_offset" +_LAYOUT = "layout" +_ALLOWED_INITIALIZER_KWARGS = [_PARTITION_SHAPE, _PARTITION_OFFSET, _LAYOUT] + + +@keras_export("keras.initializers.Initializer") +class Initializer: + """Initializer base class: all Keras initializers inherit from this class. 
+
+    Initializers should implement a `__call__()` method with the following
+    signature:
+
+    ```python
+    def __call__(self, shape, dtype=None, **kwargs):
+        # returns a tensor of shape `shape` and dtype `dtype`
+        # containing values drawn from a distribution of your choice.
+        return tf.random.uniform(shape=shape, dtype=dtype)
+    ```
+
+    Optionally, you can also implement the method `get_config()` and the
+    class method `from_config()` in order to support serialization -- just
+    like with any Keras object.
+
+    Here's a simple example: a random normal initializer.
+
+    ```python
+    class ExampleRandomNormal(Initializer):
+        def __init__(self, mean, stddev):
+            self.mean = mean
+            self.stddev = stddev
+
+        def __call__(self, shape, dtype=None, **kwargs):
+            return tf.random.normal(
+                shape, mean=self.mean, stddev=self.stddev, dtype=dtype
+            )
+
+        def get_config(self):  # To support serialization
+            return {"mean": self.mean, "stddev": self.stddev}
+    ```
+
+    Note that we don't have to implement `from_config()` in the example above
+    since the constructor arguments of the class and the keys in the config
+    returned by `get_config()` are the same. In this case, the default
+    `from_config()` works fine.
+    """
+
+    def __call__(self, shape, dtype=None, **kwargs):
+        """Returns a tensor object initialized as specified by the initializer.
+
+        Args:
+            shape: Shape of the tensor.
+            dtype: Optional dtype of the tensor.
+            **kwargs: Additional keyword arguments.
+        """
+        raise NotImplementedError(
+            "Initializer subclasses must implement the `__call__()` method."
+        )
+
+    def get_config(self):
+        """Returns the initializer's configuration as a JSON-serializable dict.
+
+        Returns:
+            A JSON-serializable Python dict.
+        """
+        return {}
+
+    @classmethod
+    def from_config(cls, config):
+        """Instantiates an initializer from a configuration dictionary.
+
+        Example:
+
+        ```python
+        initializer = RandomUniform(-1, 1)
+        config = initializer.get_config()
+        initializer = RandomUniform.from_config(config)
+        ```
+
+        Args:
+            config: A Python dictionary, the output of `get_config()`.
+
+        Returns:
+            An `Initializer` instance.
+        """
+        config.pop("dtype", None)
+        return cls(**config)
+
+    def _warn_reuse(self):
+        if getattr(self, "_used", False):
+            if getattr(self, "seed", None) is None:
+                warnings.warn(
+                    f"The initializer {self.__class__.__name__} is unseeded "
+                    "and being called multiple times, which will return "
+                    "identical values each time (even if the initializer is "
+                    "unseeded). Please update your code to provide a seed to "
+                    "the initializer, or avoid using the same initializer "
+                    "instance more than once."
+                )
+        else:
+            self._used = True
+
+
+@keras_export("keras.initializers.Zeros", "keras.initializers.zeros", v1=[])
+class Zeros(Initializer):
+    """Initializer that generates tensors initialized to 0.
+
+    Also available via the shortcut function `tf.keras.initializers.zeros`.
+
+    Examples:
+
+    >>> # Standalone usage:
+    >>> initializer = tf.keras.initializers.Zeros()
+    >>> values = initializer(shape=(2, 2))
+
+    >>> # Usage in a Keras layer:
+    >>> initializer = tf.keras.initializers.Zeros()
+    >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer)
+    """
+
+    def __call__(self, shape, dtype=None, **kwargs):
+        """Returns a tensor object initialized as specified by the initializer.
+
+        Args:
+            shape: Shape of the tensor.
+            dtype: Optional dtype of the tensor. Only numeric or boolean dtypes
+                are supported.
If not specified, `keras.backend.floatx()` is + used, which defaults to `float32` unless you configured it + otherwise (via `keras.backend.set_floatx(float_dtype)`). + **kwargs: Additional keyword arguments. + """ + _validate_kwargs(self.__class__.__name__, kwargs) + dtype = _get_dtype(dtype) + if not dtype.is_numpy_compatible or dtype == tf.string: + raise ValueError(f"Expected numeric or boolean dtype, got {dtype}.") + if _PARTITION_SHAPE in kwargs: + shape = kwargs[_PARTITION_SHAPE] + layout = kwargs.pop("layout", None) + if layout: + return utils.call_with_layout( + tf.zeros, layout, shape=shape, dtype=dtype + ) + return tf.zeros(shape, dtype) + + +@keras_export("keras.initializers.Ones", "keras.initializers.ones", v1=[]) +class Ones(Initializer): + """Initializer that generates tensors initialized to 1. + + Also available via the shortcut function `tf.keras.initializers.ones`. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.Ones() + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.Ones() + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + """ + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor object initialized as specified by the initializer. + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. Only numeric or boolean dtypes + are supported. If not specified, `keras.backend.floatx()` is + used, which defaults to `float32` unless you configured it + otherwise (via `keras.backend.set_floatx(float_dtype)`). + **kwargs: Additional keyword arguments. + """ + _validate_kwargs(self.__class__.__name__, kwargs) + dtype = _get_dtype(dtype) + if not dtype.is_numpy_compatible or dtype == tf.string: + raise ValueError(f"Expected numeric or boolean dtype, got {dtype}.") + if _PARTITION_SHAPE in kwargs: + shape = kwargs[_PARTITION_SHAPE] + layout = kwargs.pop("layout", None) + if layout: + return utils.call_with_layout( + tf.ones, layout, shape=shape, dtype=dtype + ) + return tf.ones(shape, dtype) + + +@keras_export( + "keras.initializers.Constant", "keras.initializers.constant", v1=[] +) +class Constant(Initializer): + """Initializer that generates tensors with constant values. + + Also available via the shortcut function `tf.keras.initializers.constant`. + + Only scalar values are allowed. + The constant value provided must be convertible to the dtype requested + when calling the initializer. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.Constant(3.) + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.Constant(3.) + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + + Args: + value: A Python scalar. + """ + + def __init__(self, value=0): + self.value = value + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor object initialized to `self.value`. + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. If not specified, + `keras.backend.floatx()` is used, + which defaults to `float32` unless you configured it + otherwise (via `keras.backend.set_floatx(float_dtype)`). + **kwargs: Additional keyword arguments. 
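The `partition_shape`/`partition_offset` keywords accepted here are how partitioned variables request only their own shard; a minimal sketch, with shapes chosen arbitrarily:

```python
import tensorflow.compat.v2 as tf

init = tf.keras.initializers.Zeros()

# The full variable is (4, 2); the initializer is asked for a (2, 2) shard,
# so `partition_shape` replaces `shape` internally.
shard = init(shape=(4, 2), partition_shape=(2, 2), partition_offset=(0, 0))
assert shard.shape == (2, 2)
```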
+ """ + _validate_kwargs(self.__class__.__name__, kwargs) + dtype = _get_dtype(dtype) + if _PARTITION_SHAPE in kwargs: + shape = kwargs[_PARTITION_SHAPE] + layout = kwargs.pop("layout", None) + if layout: + return utils.call_with_layout( + tf.constant, layout, self.value, shape=shape, dtype=dtype + ) + return tf.constant(self.value, dtype=_get_dtype(dtype), shape=shape) + + def get_config(self): + return {"value": self.value} + + @classmethod + def from_config(cls, config): + config.pop("dtype", None) + if "value" in config: + if isinstance(config["value"], dict): + config["value"] = serialization_lib.deserialize_keras_object( + config["value"] + ) + return cls(**config) + + +@keras_export( + "keras.initializers.RandomUniform", + "keras.initializers.random_uniform", + v1=[], +) +class RandomUniform(Initializer): + """Initializer that generates tensors with a uniform distribution. + + Also available via the shortcut function + `tf.keras.initializers.random_uniform`. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.) + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.) + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + + Args: + minval: A python scalar or a scalar tensor. Lower bound of the range of + random values to generate (inclusive). + maxval: A python scalar or a scalar tensor. Upper bound of the range of + random values to generate (exclusive). + seed: A Python integer. Used to make the behavior of the initializer + deterministic. Note that a seeded initializer will produce the same + random values across multiple calls. + """ + + def __init__(self, minval=-0.05, maxval=0.05, seed=None): + self.minval = minval + self.maxval = maxval + self.seed = seed + self._random_generator = backend.RandomGenerator( + seed, rng_type="stateless" + ) + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor object initialized as specified by the initializer. + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. Only floating point and integer + types are supported. If not specified, + `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise + (via `tf.keras.backend.set_floatx(float_dtype)`). + **kwargs: Additional keyword arguments. + """ + _validate_kwargs(self.__class__.__name__, kwargs) + dtype = _get_dtype(dtype) + if not dtype.is_floating and not dtype.is_integer: + raise ValueError(f"Expected float or integer dtype, got {dtype}.") + if _PARTITION_SHAPE in kwargs: + shape = kwargs[_PARTITION_SHAPE] + partition_offset = kwargs.get(_PARTITION_OFFSET, None) + if partition_offset is None: + # We skip the reuse warning for partitioned variable, since the same + # initializer will be called multiple times for each partition. 
+ self._warn_reuse() + nonce = hash(partition_offset) if partition_offset else None + layout = kwargs.pop("layout", None) + if layout: + _ensure_keras_seeded() + return utils.call_with_layout( + self._random_generator.random_uniform, + layout, + shape, + self.minval, + self.maxval, + dtype, + nonce, + ) + return self._random_generator.random_uniform( + shape, self.minval, self.maxval, dtype, nonce + ) + + def get_config(self): + return {"minval": self.minval, "maxval": self.maxval, "seed": self.seed} + + +@keras_export( + "keras.initializers.RandomNormal", "keras.initializers.random_normal", v1=[] +) +class RandomNormal(Initializer): + """Initializer that generates tensors with a normal distribution. + + Also available via the shortcut function + `tf.keras.initializers.random_normal`. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=1.) + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=1.) + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + + Args: + mean: a python scalar or a scalar tensor. Mean of the random values to + generate. + stddev: a python scalar or a scalar tensor. Standard deviation of the + random values to generate. + seed: A Python integer. Used to make the behavior of the initializer + deterministic. Note that a seeded initializer will produce the same + random values across multiple calls. + """ + + def __init__(self, mean=0.0, stddev=0.05, seed=None): + self.mean = mean + self.stddev = stddev + self.seed = seed + self._random_generator = backend.RandomGenerator( + seed, rng_type="stateless" + ) + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor object initialized to random normal values. + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise (via + `tf.keras.backend.set_floatx(float_dtype)`) + **kwargs: Additional keyword arguments. + """ + _validate_kwargs(self.__class__.__name__, kwargs) + dtype = _assert_float_dtype(_get_dtype(dtype)) + if _PARTITION_SHAPE in kwargs: + shape = kwargs[_PARTITION_SHAPE] + partition_offset = kwargs.get(_PARTITION_OFFSET, None) + if partition_offset is None: + # We skip the reuse warning for partitioned variable, since the same + # initializer will be called multiple times for each partition. + self._warn_reuse() + nonce = hash(partition_offset) if partition_offset else None + layout = kwargs.pop("layout", None) + if layout: + _ensure_keras_seeded() + return utils.call_with_layout( + self._random_generator.random_normal, + layout, + shape, + self.mean, + self.stddev, + dtype, + nonce, + ) + return self._random_generator.random_normal( + shape, self.mean, self.stddev, dtype, nonce + ) + + def get_config(self): + return {"mean": self.mean, "stddev": self.stddev, "seed": self.seed} + + +@keras_export( + "keras.initializers.TruncatedNormal", + "keras.initializers.truncated_normal", + v1=[], +) +class TruncatedNormal(Initializer): + """Initializer that generates a truncated normal distribution. + + Also available via the shortcut function + `tf.keras.initializers.truncated_normal`. 
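Before moving on to `TruncatedNormal`, a short sketch of the seed semantics documented for the random initializers above: the RNG is stateless, so a seeded instance repeats its values, and reusing an unseeded instance triggers the `_warn_reuse()` warning (seed value arbitrary):

```python
import tensorflow.compat.v2 as tf

seeded = tf.keras.initializers.RandomNormal(mean=0.0, stddev=1.0, seed=1337)
a = seeded(shape=(2, 2))
b = seeded(shape=(2, 2))
tf.debugging.assert_near(a, b)  # identical draws on every call

unseeded = tf.keras.initializers.RandomNormal()
_ = unseeded(shape=(2, 2))
_ = unseeded(shape=(2, 2))  # second call emits the reuse UserWarning
```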
+ + The values generated are similar to values from a + `tf.keras.initializers.RandomNormal` initializer except that values more + than two standard deviations from the mean are + discarded and re-drawn. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.TruncatedNormal(mean=0., stddev=1.) + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.TruncatedNormal(mean=0., stddev=1.) + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + + Args: + mean: a python scalar or a scalar tensor. Mean of the random values + to generate. + stddev: a python scalar or a scalar tensor. Standard deviation of the + random values to generate before truncation. + seed: A Python integer. Used to make the behavior of the initializer + deterministic. Note that a seeded initializer will produce the same + random values across multiple calls. + """ + + def __init__(self, mean=0.0, stddev=0.05, seed=None): + self.mean = mean + self.stddev = stddev + self.seed = seed + self._random_generator = backend.RandomGenerator( + seed, rng_type="stateless" + ) + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor initialized to random normal values (truncated). + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise (via + `tf.keras.backend.set_floatx(float_dtype)`) + **kwargs: Additional keyword arguments. + """ + _validate_kwargs(self.__class__.__name__, kwargs) + dtype = _assert_float_dtype(_get_dtype(dtype)) + if _PARTITION_SHAPE in kwargs: + shape = kwargs[_PARTITION_SHAPE] + partition_offset = kwargs.get(_PARTITION_OFFSET, None) + if partition_offset is None: + # We skip the reuse warning for partitioned variable, since the same + # initializer will be called multiple times for each partition. + self._warn_reuse() + nonce = hash(partition_offset) if partition_offset else None + layout = kwargs.pop("layout", None) + if layout: + # TODO(scottzhu): Remove this once the forward compat period above + # is expired. + self._random_generator._rng_type = ( + self._random_generator.RNG_STATEFUL + ) + _ensure_keras_seeded() + return utils.call_with_layout( + self._random_generator.truncated_normal, + layout, + shape, + self.mean, + self.stddev, + dtype, + nonce, + ) + return self._random_generator.truncated_normal( + shape, self.mean, self.stddev, dtype, nonce + ) + + def get_config(self): + return {"mean": self.mean, "stddev": self.stddev, "seed": self.seed} + + +@keras_export( + "keras.initializers.VarianceScaling", + "keras.initializers.variance_scaling", + v1=[], +) +class VarianceScaling(Initializer): + """Initializer that adapts its scale to the shape of its input tensors. + + Also available via the shortcut function + `tf.keras.initializers.variance_scaling`. 
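To make the `stddev = sqrt(scale / n)` rule described just below concrete, here is the fan computation for a conv kernel, mirroring the `_compute_fans()` helper defined later in this file (the shape is chosen arbitrarily):

```python
import math

# Kernel shape convention: (..., input_depth, depth).
shape = (3, 3, 16, 32)  # 3x3 conv, 16 -> 32 channels
receptive_field_size = 3 * 3
fan_in = 16 * receptive_field_size   # 144
fan_out = 32 * receptive_field_size  # 288

# mode="fan_in", scale=1.0, distribution="untruncated_normal":
stddev = math.sqrt(1.0 / max(1.0, fan_in))  # sqrt(scale / n)
```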
+ + With `distribution="truncated_normal" or "untruncated_normal"`, samples are + drawn from a truncated/untruncated normal distribution with a mean of zero + and a standard deviation (after truncation, if used) `stddev = sqrt(scale / + n)`, where `n` is: + + - number of input units in the weight tensor, if `mode="fan_in"` + - number of output units, if `mode="fan_out"` + - average of the numbers of input and output units, if `mode="fan_avg"` + + With `distribution="uniform"`, samples are drawn from a uniform distribution + within `[-limit, limit]`, where `limit = sqrt(3 * scale / n)`. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.VarianceScaling( + ... scale=0.1, mode='fan_in', distribution='uniform') + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.VarianceScaling( + ... scale=0.1, mode='fan_in', distribution='uniform') + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + + Args: + scale: Scaling factor (positive float). + mode: One of `"fan_in"`, `"fan_out"`, `"fan_avg"`. + distribution: Random distribution to use. One of `"truncated_normal"`, + `"untruncated_normal"`, or `"uniform"`. + seed: A Python integer. Used to make the behavior of the initializer + deterministic. Note that a seeded initializer will produce the same + random values across multiple calls. + """ + + def __init__( + self, + scale=1.0, + mode="fan_in", + distribution="truncated_normal", + seed=None, + ): + if scale <= 0.0: + raise ValueError( + f"`scale` must be positive float. Received: scale={scale}." + ) + allowed_modes = {"fan_in", "fan_out", "fan_avg"} + if mode not in allowed_modes: + raise ValueError( + f"Invalid `mode` argument: {mode}. " + f"Please use one of the {allowed_modes}." + ) + distribution = distribution.lower() + # Compatibility with keras-team/keras. + if distribution == "normal": + distribution = "truncated_normal" + allowed_distributions = { + "uniform", + "truncated_normal", + "untruncated_normal", + } + if distribution not in allowed_distributions: + raise ValueError( + f"Invalid `distribution` argument: {distribution}." + f"Allowed distributions: {allowed_distributions}." + ) + self.scale = scale + self.mode = mode + self.distribution = distribution + self.seed = seed + self._random_generator = backend.RandomGenerator( + seed, rng_type="stateless" + ) + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor object initialized as specified by the initializer. + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise (via + `tf.keras.backend.set_floatx(float_dtype)`) + **kwargs: Additional keyword arguments. + """ + _validate_kwargs(self.__class__.__name__, kwargs) + dtype = _assert_float_dtype(_get_dtype(dtype)) + if _PARTITION_SHAPE in kwargs: + shape = kwargs[_PARTITION_SHAPE] + partition_offset = kwargs.get(_PARTITION_OFFSET, None) + if partition_offset is None: + # We skip the reuse warning for partitioned variable, since the same + # initializer will be called multiple times for each partition. 
+ self._warn_reuse() + nonce = hash(partition_offset) if partition_offset else None + layout = kwargs.pop("layout", None) + if layout: + _ensure_keras_seeded() + return utils.call_with_layout( + self._generate_init_val, + layout, + shape=shape, + dtype=dtype, + nonce=nonce, + ) + return self._generate_init_val(shape=shape, dtype=dtype, nonce=nonce) + + def _generate_init_val(self, shape, dtype, nonce): + scale = self.scale + fan_in, fan_out = _compute_fans(shape) + if self.mode == "fan_in": + scale /= max(1.0, fan_in) + elif self.mode == "fan_out": + scale /= max(1.0, fan_out) + else: + scale /= max(1.0, (fan_in + fan_out) / 2.0) + if self.distribution == "truncated_normal": + # constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., + # scale=1.) + stddev = math.sqrt(scale) / 0.87962566103423978 + return self._random_generator.truncated_normal( + shape, 0.0, stddev, dtype, nonce + ) + elif self.distribution == "untruncated_normal": + stddev = math.sqrt(scale) + return self._random_generator.random_normal( + shape, 0.0, stddev, dtype, nonce + ) + else: + limit = math.sqrt(3.0 * scale) + return self._random_generator.random_uniform( + shape, -limit, limit, dtype, nonce + ) + + def get_config(self): + return { + "scale": self.scale, + "mode": self.mode, + "distribution": self.distribution, + "seed": self.seed, + } + + +@keras_export( + "keras.initializers.Orthogonal", "keras.initializers.orthogonal", v1=[] +) +class Orthogonal(Initializer): + """Initializer that generates an orthogonal matrix. + + Also available via the shortcut function `tf.keras.initializers.orthogonal`. + + If the shape of the tensor to initialize is two-dimensional, it is + initialized with an orthogonal matrix obtained from the QR decomposition of + a matrix of random numbers drawn from a normal distribution. If the matrix + has fewer rows than columns then the output will have orthogonal rows. + Otherwise, the output will have orthogonal columns. + + If the shape of the tensor to initialize is more than two-dimensional, + a matrix of shape `(shape[0] * ... * shape[n - 2], shape[n - 1])` + is initialized, where `n` is the length of the shape vector. + The matrix is subsequently reshaped to give a tensor of the desired shape. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.Orthogonal() + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.Orthogonal() + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + + Args: + gain: multiplicative factor to apply to the orthogonal matrix + seed: A Python integer. Used to make the behavior of the initializer + deterministic. Note that a seeded initializer will produce the same + random values across multiple calls. + + References: + - [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C) + """ + + def __init__(self, gain=1.0, seed=None): + self.gain = gain + self.seed = seed + self._random_generator = backend.RandomGenerator( + seed, rng_type="stateless" + ) + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor object initialized to an orthogonal matrix. + + Args: + shape: Shape of the tensor. + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise + (via `tf.keras.backend.set_floatx(float_dtype)`) + **kwargs: Additional keyword arguments. 
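The QR construction documented here yields an orthonormal result; a small verification sketch (shape and seed arbitrary):

```python
import tensorflow.compat.v2 as tf

w = tf.keras.initializers.Orthogonal(seed=123)(shape=(8, 4))

# More rows than columns, so the columns are orthonormal (gain=1.0):
# W^T W is the 4x4 identity, up to numerical tolerance.
tf.debugging.assert_near(
    tf.matmul(w, w, transpose_a=True), tf.eye(4), atol=1e-5
)
```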
+ """ + _validate_kwargs( + self.__class__.__name__, kwargs, support_partition=False + ) + dtype = _assert_float_dtype(_get_dtype(dtype)) + # Check the shape + if len(shape) < 2: + raise ValueError( + "The tensor to initialize must be " + "at least two-dimensional. Received: " + f"shape={shape} of rank {len(shape)}." + ) + self._warn_reuse() + layout = kwargs.pop("layout", None) + if layout: + _ensure_keras_seeded() + return utils.call_with_layout( + self._generate_init_val, layout, shape=shape, dtype=dtype + ) + return self._generate_init_val(shape, dtype) + + def _generate_init_val(self, shape, dtype): + # Flatten the input shape with the last dimension remaining + # its original shape so it works for conv2d + num_rows = 1 + for dim in shape[:-1]: + num_rows *= dim + num_cols = shape[-1] + flat_shape = (max(num_cols, num_rows), min(num_cols, num_rows)) + + # Generate a random matrix + a = self._random_generator.random_normal(flat_shape, dtype=dtype) + # Compute the qr factorization + q, r = tf.linalg.qr(a, full_matrices=False) + # Make Q uniform + d = tf.linalg.tensor_diag_part(r) + q *= tf.sign(d) + if num_rows < num_cols: + q = tf.linalg.matrix_transpose(q) + return self.gain * tf.reshape(q, shape) + + def get_config(self): + return {"gain": self.gain, "seed": self.seed} + + +@keras_export( + "keras.initializers.Identity", "keras.initializers.identity", v1=[] +) +class Identity(Initializer): + """Initializer that generates the identity matrix. + + Also available via the shortcut function `tf.keras.initializers.identity`. + + Only usable for generating 2D matrices. + + Examples: + + >>> # Standalone usage: + >>> initializer = tf.keras.initializers.Identity() + >>> values = initializer(shape=(2, 2)) + + >>> # Usage in a Keras layer: + >>> initializer = tf.keras.initializers.Identity() + >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) + + Args: + gain: Multiplicative factor to apply to the identity matrix. + """ + + def __init__(self, gain=1.0): + self.gain = gain + + def __call__(self, shape, dtype=None, **kwargs): + """Returns a tensor object initialized to a 2D identity matrix. + + Args: + shape: Shape of the tensor. It should have exactly rank 2. + dtype: Optional dtype of the tensor. Only floating point types are + supported. If not specified, `tf.keras.backend.floatx()` is used, + which default to `float32` unless you configured it otherwise + (via `tf.keras.backend.set_floatx(float_dtype)`) + **kwargs: Additional keyword arguments. + """ + _validate_kwargs( + self.__class__.__name__, kwargs, support_partition=False + ) + dtype = _assert_float_dtype(_get_dtype(dtype)) + if len(shape) != 2: + raise ValueError( + "Identity matrix initializer can only be used for 2D matrices. " + f"Received: shape={shape} of rank {len(shape)}." + ) + layout = kwargs.pop("layout", None) + if layout: + return utils.call_with_layout( + self._generate_init_val, layout, shape=shape, dtype=dtype + ) + return self._generate_init_val(shape, dtype) + + def _generate_init_val(self, shape, dtype): + initializer = tf.eye(*shape, dtype=dtype) + return self.gain * initializer + + def get_config(self): + return {"gain": self.gain} + + +@keras_export( + "keras.initializers.GlorotUniform", + "keras.initializers.glorot_uniform", + v1=[], +) +class GlorotUniform(VarianceScaling): + """The Glorot uniform initializer, also called Xavier uniform initializer. + + Also available via the shortcut function + `tf.keras.initializers.glorot_uniform`. 
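A worked instance of the `limit = sqrt(6 / (fan_in + fan_out))` formula described next (a sketch; the kernel shape is arbitrary):

```python
import math

import tensorflow.compat.v2 as tf

w = tf.keras.initializers.GlorotUniform(seed=0)(shape=(100, 50))

limit = math.sqrt(6.0 / (100 + 50))  # fan_in=100, fan_out=50 -> limit = 0.2
tf.debugging.assert_less_equal(tf.reduce_max(tf.abs(w)), limit)
```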
+
+    Draws samples from a uniform distribution within `[-limit, limit]`, where
+    `limit = sqrt(6 / (fan_in + fan_out))` (`fan_in` is the number of input
+    units in the weight tensor and `fan_out` is the number of output units).
+
+    Examples:
+
+    >>> # Standalone usage:
+    >>> initializer = tf.keras.initializers.GlorotUniform()
+    >>> values = initializer(shape=(2, 2))
+
+    >>> # Usage in a Keras layer:
+    >>> initializer = tf.keras.initializers.GlorotUniform()
+    >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer)
+
+    Args:
+        seed: A Python integer. Used to make the behavior of the initializer
+            deterministic. Note that a seeded initializer will produce the
+            same random values across multiple calls.
+
+    References:
+        - [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(
+            scale=1.0, mode="fan_avg", distribution="uniform", seed=seed
+        )
+
+    def get_config(self):
+        return {"seed": self.seed}
+
+
+@keras_export(
+    "keras.initializers.GlorotNormal", "keras.initializers.glorot_normal", v1=[]
+)
+class GlorotNormal(VarianceScaling):
+    """The Glorot normal initializer, also called Xavier normal initializer.
+
+    Also available via the shortcut function
+    `tf.keras.initializers.glorot_normal`.
+
+    Draws samples from a truncated normal distribution centered on 0 with
+    `stddev = sqrt(2 / (fan_in + fan_out))` where `fan_in` is the number of
+    input units in the weight tensor and `fan_out` is the number of output
+    units in the weight tensor.
+
+    Examples:
+
+    >>> # Standalone usage:
+    >>> initializer = tf.keras.initializers.GlorotNormal()
+    >>> values = initializer(shape=(2, 2))
+
+    >>> # Usage in a Keras layer:
+    >>> initializer = tf.keras.initializers.GlorotNormal()
+    >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer)
+
+    Args:
+        seed: A Python integer. Used to make the behavior of the initializer
+            deterministic. Note that a seeded initializer will produce the
+            same random values across multiple calls.
+
+    References:
+        - [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(
+            scale=1.0,
+            mode="fan_avg",
+            distribution="truncated_normal",
+            seed=seed,
+        )
+
+    def get_config(self):
+        return {"seed": self.seed}
+
+
+@keras_export(
+    "keras.initializers.LecunNormal", "keras.initializers.lecun_normal", v1=[]
+)
+class LecunNormal(VarianceScaling):
+    """Lecun normal initializer.
+
+    Also available via the shortcut function
+    `tf.keras.initializers.lecun_normal`.
+
+    Initializers allow you to pre-specify an initialization strategy, encoded
+    in the Initializer object, without knowing the shape and dtype of the
+    variable being initialized.
+
+    Draws samples from a truncated normal distribution centered on 0 with
+    `stddev = sqrt(1 / fan_in)` where `fan_in` is the number of input units
+    in the weight tensor.
+
+    Examples:
+
+    >>> # Standalone usage:
+    >>> initializer = tf.keras.initializers.LecunNormal()
+    >>> values = initializer(shape=(2, 2))
+
+    >>> # Usage in a Keras layer:
+    >>> initializer = tf.keras.initializers.LecunNormal()
+    >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer)
+
+    Args:
+        seed: A Python integer. Used to make the behavior of the initializer
+            deterministic. Note that a seeded initializer will produce the
same random values across multiple calls.
+
+    References:
+        - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515)
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(
+            scale=1.0, mode="fan_in", distribution="truncated_normal", seed=seed
+        )
+
+    def get_config(self):
+        return {"seed": self.seed}
+
+
+@keras_export(
+    "keras.initializers.LecunUniform", "keras.initializers.lecun_uniform", v1=[]
+)
+class LecunUniform(VarianceScaling):
+    """Lecun uniform initializer.
+
+    Also available via the shortcut function
+    `tf.keras.initializers.lecun_uniform`.
+
+    Draws samples from a uniform distribution within `[-limit, limit]`, where
+    `limit = sqrt(3 / fan_in)` (`fan_in` is the number of input units in the
+    weight tensor).
+
+    Examples:
+
+    >>> # Standalone usage:
+    >>> initializer = tf.keras.initializers.LecunUniform()
+    >>> values = initializer(shape=(2, 2))
+
+    >>> # Usage in a Keras layer:
+    >>> initializer = tf.keras.initializers.LecunUniform()
+    >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer)
+
+    Args:
+        seed: A Python integer. Used to make the behavior of the initializer
+            deterministic. Note that a seeded initializer will produce the
+            same random values across multiple calls.
+
+    References:
+        - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515)
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(
+            scale=1.0, mode="fan_in", distribution="uniform", seed=seed
+        )
+
+    def get_config(self):
+        return {"seed": self.seed}
+
+
+@keras_export(
+    "keras.initializers.HeNormal", "keras.initializers.he_normal", v1=[]
+)
+class HeNormal(VarianceScaling):
+    """He normal initializer.
+
+    Also available via the shortcut function
+    `tf.keras.initializers.he_normal`.
+
+    It draws samples from a truncated normal distribution centered on 0 with
+    `stddev = sqrt(2 / fan_in)` where `fan_in` is the number of input units
+    in the weight tensor.
+
+    Examples:
+
+    >>> # Standalone usage:
+    >>> initializer = tf.keras.initializers.HeNormal()
+    >>> values = initializer(shape=(2, 2))
+
+    >>> # Usage in a Keras layer:
+    >>> initializer = tf.keras.initializers.HeNormal()
+    >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer)
+
+    Args:
+        seed: A Python integer. Used to make the behavior of the initializer
+            deterministic. Note that a seeded initializer will produce the
+            same random values across multiple calls.
+
+    References:
+        - [He et al., 2015](https://arxiv.org/abs/1502.01852)
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(
+            scale=2.0, mode="fan_in", distribution="truncated_normal", seed=seed
+        )
+
+    def get_config(self):
+        return {"seed": self.seed}
+
+
+@keras_export(
+    "keras.initializers.HeUniform", "keras.initializers.he_uniform", v1=[]
+)
+class HeUniform(VarianceScaling):
+    """He uniform variance scaling initializer.
+
+    Also available via the shortcut function
+    `tf.keras.initializers.he_uniform`.
+
+    Draws samples from a uniform distribution within `[-limit, limit]`, where
+    `limit = sqrt(6 / fan_in)` (`fan_in` is the number of input units in the
+    weight tensor).
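As the constructors in this family show, each of these named initializers is a thin `VarianceScaling` preset, so an equivalent preset with the same seed yields identical stateless draws (seed and shape arbitrary):

```python
import tensorflow.compat.v2 as tf

a = tf.keras.initializers.HeUniform(seed=3)(shape=(4, 4))
b = tf.keras.initializers.VarianceScaling(
    scale=2.0, mode="fan_in", distribution="uniform", seed=3
)(shape=(4, 4))
tf.debugging.assert_near(a, b)  # same preset + same seed -> same values
```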
+
+    Examples:
+
+    >>> # Standalone usage:
+    >>> initializer = tf.keras.initializers.HeUniform()
+    >>> values = initializer(shape=(2, 2))
+
+    >>> # Usage in a Keras layer:
+    >>> initializer = tf.keras.initializers.HeUniform()
+    >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer)
+
+    Args:
+        seed: A Python integer. Used to make the behavior of the initializer
+            deterministic. Note that a seeded initializer will produce the
+            same random values across multiple calls.
+
+    References:
+        - [He et al., 2015](https://arxiv.org/abs/1502.01852)
+    """
+
+    def __init__(self, seed=None):
+        super().__init__(
+            scale=2.0, mode="fan_in", distribution="uniform", seed=seed
+        )
+
+    def get_config(self):
+        return {"seed": self.seed}
+
+
+def _get_dtype(dtype):
+    if dtype is None:
+        dtype = backend.floatx()
+    return tf.as_dtype(dtype)
+
+
+def _assert_float_dtype(dtype):
+    """Validate and return floating point type based on `dtype`.
+
+    `dtype` must be a floating point type.
+
+    Args:
+        dtype: The data type to validate.
+
+    Returns:
+        Validated type.
+
+    Raises:
+        ValueError: if `dtype` is not a floating point type.
+    """
+    dtype = tf.as_dtype(dtype)
+    if not dtype.is_floating:
+        raise ValueError(f"Expected floating point type, got {dtype}.")
+    return dtype
+
+
+def _compute_fans(shape):
+    """Computes the number of input and output units for a weight shape.
+
+    Args:
+        shape: Integer shape tuple or TF tensor shape.
+
+    Returns:
+        A tuple of integer scalars (fan_in, fan_out).
+    """
+    if len(shape) < 1:  # Just to avoid errors for constants.
+        fan_in = fan_out = 1
+    elif len(shape) == 1:
+        fan_in = fan_out = shape[0]
+    elif len(shape) == 2:
+        fan_in = shape[0]
+        fan_out = shape[1]
+    else:
+        # Assuming convolution kernels (2D, 3D, or more).
+        # kernel shape: (..., input_depth, depth)
+        receptive_field_size = 1
+        for dim in shape[:-2]:
+            receptive_field_size *= dim
+        fan_in = shape[-2] * receptive_field_size
+        fan_out = shape[-1] * receptive_field_size
+    return int(fan_in), int(fan_out)
+
+
+def _validate_kwargs(cls_name, kwargs, support_partition=True):
+    invalid_kwargs = [k for k in kwargs if k not in _ALLOWED_INITIALIZER_KWARGS]
+    if invalid_kwargs:
+        raise TypeError(
+            f"Unknown keyword arguments: {invalid_kwargs}. Allowed "
+            f"keyword arguments: {_ALLOWED_INITIALIZER_KWARGS}."
+        )
+    if not support_partition and (
+        _PARTITION_SHAPE in kwargs or _PARTITION_OFFSET in kwargs
+    ):
+        raise ValueError(
+            f"{cls_name} initializer doesn't support "
+            "partition-related arguments."
+        )
+
+
+def _ensure_keras_seeded():
+    """Makes sure the keras.backend global seed generator is set.
+
+    This is important for the DTensor use case, to ensure that each client
+    is initialized with the same seed for `tf.random.Generator`, so that the
+    values created are in sync among all the clients.
+    """
+    if not getattr(backend._SEED_GENERATOR, "generator", None):
+        raise ValueError(
+            "When using DTensor APIs, you need to set the global seed "
+            "before using any Keras initializers. Please make sure "
+            "to call `tf.keras.utils.set_random_seed()` in your code."
+ ) diff --git a/keras/initializers/initializers_test.py b/keras/initializers/initializers_test.py index b460aab6b727..a45f54f6d0de 100644 --- a/keras/initializers/initializers_test.py +++ b/keras/initializers/initializers_test.py @@ -14,296 +14,312 @@ # ============================================================================== """Tests for Keras initializers.""" +import warnings + +import tensorflow.compat.v2 as tf from absl.testing import parameterized -import numpy as np from keras import backend -from keras.testing_infra import test_combinations from keras import initializers from keras import models -from keras.testing_infra import test_utils from keras.engine import input_layer from keras.layers import core +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf +RANDOM_INITIALIZERS = [ + initializers.RandomUniformV2, + initializers.RandomNormalV2, + initializers.OrthogonalV2, + # TODO(scottzhu): Enable this after the forward compat period expires for + # TruncatedNormalV2 + # initializers.TruncatedNormalV2, + initializers.VarianceScalingV2, + initializers.LecunUniformV2, + initializers.LecunNormalV2, + initializers.GlorotUniformV2, + initializers.GlorotNormalV2, + initializers.HeNormalV2, + initializers.HeUniformV2, +] def _compute_fans(shape): - """Computes the number of input and output units for a weight shape. - - Args: - shape: Integer shape tuple or TF tensor shape. - - Returns: - A tuple of integer scalars (fan_in, fan_out). - """ - if len(shape) < 1: # Just to avoid errors for constants. - fan_in = fan_out = 1 - elif len(shape) == 1: - fan_in = fan_out = shape[0] - elif len(shape) == 2: - fan_in = shape[0] - fan_out = shape[1] - else: - # Assuming convolution kernels (2D, 3D, or more). - # kernel shape: (..., input_depth, depth) - receptive_field_size = 1 - for dim in shape[:-2]: - receptive_field_size *= dim - fan_in = shape[-2] * receptive_field_size - fan_out = shape[-1] * receptive_field_size - return int(fan_in), int(fan_out) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + """Computes the number of input and output units for a weight shape. + + Args: + shape: Integer shape tuple or TF tensor shape. + + Returns: + A tuple of integer scalars (fan_in, fan_out). + """ + if len(shape) < 1: # Just to avoid errors for constants. + fan_in = fan_out = 1 + elif len(shape) == 1: + fan_in = fan_out = shape[0] + elif len(shape) == 2: + fan_in = shape[0] + fan_out = shape[1] + else: + # Assuming convolution kernels (2D, 3D, or more). + # kernel shape: (..., input_depth, depth) + receptive_field_size = 1 + for dim in shape[:-2]: + receptive_field_size *= dim + fan_in = shape[-2] * receptive_field_size + fan_out = shape[-1] * receptive_field_size + return int(fan_in), int(fan_out) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class KerasInitializersTest(tf.test.TestCase, parameterized.TestCase): - - def _runner(self, init, shape, target_mean=None, target_std=None, - target_max=None, target_min=None): - # The global seed is set so that we can get the same random streams between - # eager and graph mode when stateful op is used. - tf.random.set_seed(1337) - variable = backend.variable(init(shape)) - output = backend.get_value(variable) - # Test serialization (assumes deterministic behavior). 
- config = init.get_config() - reconstructed_init = init.__class__.from_config(config) - - tf.random.set_seed(1337) - variable = backend.variable(reconstructed_init(shape)) - output_2 = backend.get_value(variable) - self.assertAllClose(output, output_2, atol=1e-4) - - def test_uniform(self): - tensor_shape = (3, 2, 3) - with self.cached_session(): - self._runner( - initializers.RandomUniformV2(minval=-1, maxval=1, seed=124), - tensor_shape, - target_mean=0., - target_max=1, - target_min=-1) - - def test_normal(self): - tensor_shape = (8, 12, 99) - with self.cached_session(): - self._runner( - initializers.RandomNormalV2(mean=0, stddev=1, seed=153), - tensor_shape, - target_mean=0., - target_std=1) - - def test_truncated_normal(self): - tensor_shape = (12, 99, 7) - with self.cached_session(): - self._runner( - initializers.TruncatedNormalV2(mean=0, stddev=1, seed=126), - tensor_shape, - target_mean=0., - target_max=2, - target_min=-2) - - def test_constant(self): - tensor_shape = (5, 6, 4) - with self.cached_session(): - self._runner( - initializers.ConstantV2(2.), - tensor_shape, - target_mean=2, - target_max=2, - target_min=2) - - def test_lecun_uniform(self): - tensor_shape = (5, 6, 4, 2) - with self.cached_session(): - fan_in, _ = _compute_fans(tensor_shape) - std = np.sqrt(1. / fan_in) - self._runner( - initializers.LecunUniformV2(seed=123), - tensor_shape, - target_mean=0., - target_std=std) - - def test_glorot_uniform(self): - tensor_shape = (5, 6, 4, 2) - with self.cached_session(): - fan_in, fan_out = _compute_fans(tensor_shape) - std = np.sqrt(2. / (fan_in + fan_out)) - self._runner( - initializers.GlorotUniformV2(seed=123), - tensor_shape, - target_mean=0., - target_std=std) - - def test_he_uniform(self): - tensor_shape = (5, 6, 4, 2) - with self.cached_session(): - fan_in, _ = _compute_fans(tensor_shape) - std = np.sqrt(2. / fan_in) - self._runner( - initializers.HeUniformV2(seed=123), - tensor_shape, - target_mean=0., - target_std=std) - - def test_lecun_normal(self): - tensor_shape = (5, 6, 4, 2) - with self.cached_session(): - fan_in, _ = _compute_fans(tensor_shape) - std = np.sqrt(1. / fan_in) - self._runner( - initializers.LecunNormalV2(seed=123), - tensor_shape, - target_mean=0., - target_std=std) - - def test_glorot_normal(self): - tensor_shape = (5, 6, 4, 2) - with self.cached_session(): - fan_in, fan_out = _compute_fans(tensor_shape) - std = np.sqrt(2. / (fan_in + fan_out)) - self._runner( - initializers.GlorotNormalV2(seed=123), - tensor_shape, - target_mean=0., - target_std=std) - - def test_he_normal(self): - tensor_shape = (5, 6, 4, 2) - with self.cached_session(): - fan_in, _ = _compute_fans(tensor_shape) - std = np.sqrt(2. / fan_in) - self._runner( - initializers.HeNormalV2(seed=123), - tensor_shape, - target_mean=0., - target_std=std) - - def test_orthogonal(self): - tensor_shape = (20, 20) - with self.cached_session(): - self._runner( - initializers.OrthogonalV2(seed=123), tensor_shape, target_mean=0.) - - def test_identity(self): - with self.cached_session(): - tensor_shape = (3, 4, 5) - with self.assertRaises(ValueError): - self._runner( - initializers.IdentityV2(), - tensor_shape, - target_mean=1. / tensor_shape[0], - target_max=1.) - - tensor_shape = (3, 3) - self._runner( - initializers.IdentityV2(), - tensor_shape, - target_mean=1. / tensor_shape[0], - target_max=1.) - - def test_zero(self): - tensor_shape = (4, 5) - with self.cached_session(): - self._runner( - initializers.ZerosV2(), tensor_shape, target_mean=0., target_max=0.) 
- - def test_one(self): - tensor_shape = (4, 5) - with self.cached_session(): - self._runner( - initializers.OnesV2(), tensor_shape, target_mean=1., target_max=1.) - - def test_default_random_uniform(self): - ru = initializers.get('uniform') - self.assertEqual(ru.minval, -0.05) - self.assertEqual(ru.maxval, 0.05) - - def test_default_random_normal(self): - rn = initializers.get('normal') - self.assertEqual(rn.mean, 0.0) - self.assertEqual(rn.stddev, 0.05) - - def test_default_truncated_normal(self): - tn = initializers.get('truncated_normal') - self.assertEqual(tn.mean, 0.0) - self.assertEqual(tn.stddev, 0.05) - - def test_custom_initializer_saving(self): - - def my_initializer(shape, dtype=None): - return tf.ones(shape, dtype=dtype) - - inputs = input_layer.Input((10,)) - outputs = core.Dense(1, kernel_initializer=my_initializer)(inputs) - model = models.Model(inputs, outputs) - model2 = model.from_config( - model.get_config(), custom_objects={'my_initializer': my_initializer}) - self.assertEqual(model2.layers[1].kernel_initializer, my_initializer) - - @test_utils.run_v2_only - def test_load_external_variance_scaling_v2(self): - external_serialized_json = { - 'class_name': 'VarianceScaling', - 'config': { - 'distribution': 'normal', - 'mode': 'fan_avg', - 'scale': 1.0, - 'seed': None + def _runner( + self, + init, + shape, + ): + # The global seed is set so that we can get the same random streams + # between eager and graph mode when stateful op is used. + tf.random.set_seed(1337) + variable = backend.variable(init(shape)) + output = backend.get_value(variable) + # Test serialization (assumes deterministic behavior). + config = init.get_config() + reconstructed_init = init.__class__.from_config(config) + + tf.random.set_seed(1337) + variable = backend.variable(reconstructed_init(shape)) + output_2 = backend.get_value(variable) + self.assertAllClose(output, output_2, atol=1e-4) + + def test_uniform(self): + tensor_shape = (3, 2, 3) + with self.cached_session(): + self._runner( + initializers.RandomUniformV2(minval=-1, maxval=1, seed=124), + tensor_shape, + ) + + def test_normal(self): + tensor_shape = (8, 12, 99) + with self.cached_session(): + self._runner( + initializers.RandomNormalV2(mean=0, stddev=1, seed=153), + tensor_shape, + ) + + def test_truncated_normal(self): + tensor_shape = (12, 99, 7) + with self.cached_session(): + self._runner( + initializers.TruncatedNormalV2(mean=0, stddev=1, seed=126), + tensor_shape, + ) + + def test_constant(self): + tensor_shape = (5, 6, 4) + with self.cached_session(): + self._runner(initializers.ConstantV2(2.0), tensor_shape) + + def test_lecun_uniform(self): + tensor_shape = (5, 6, 4, 2) + with self.cached_session(): + self._runner(initializers.LecunUniformV2(seed=123), tensor_shape) + + def test_glorot_uniform(self): + tensor_shape = (5, 6, 4, 2) + with self.cached_session(): + self._runner(initializers.GlorotUniformV2(seed=123), tensor_shape) + + def test_he_uniform(self): + tensor_shape = (5, 6, 4, 2) + with self.cached_session(): + self._runner(initializers.HeUniformV2(seed=123), tensor_shape) + + def test_lecun_normal(self): + tensor_shape = (5, 6, 4, 2) + with self.cached_session(): + self._runner(initializers.LecunNormalV2(seed=123), tensor_shape) + + def test_glorot_normal(self): + tensor_shape = (5, 6, 4, 2) + with self.cached_session(): + self._runner(initializers.GlorotNormalV2(seed=123), tensor_shape) + + def test_he_normal(self): + tensor_shape = (5, 6, 4, 2) + with self.cached_session(): + 
self._runner(initializers.HeNormalV2(seed=123), tensor_shape) + + def test_orthogonal(self): + tensor_shape = (20, 20) + with self.cached_session(): + self._runner(initializers.OrthogonalV2(seed=123), tensor_shape) + + def test_identity(self): + with self.cached_session(): + tensor_shape = (3, 4, 5) + with self.assertRaises(ValueError): + self._runner(initializers.IdentityV2(), tensor_shape) + + tensor_shape = (3, 3) + self._runner(initializers.IdentityV2(), tensor_shape) + + def test_zero(self): + tensor_shape = (4, 5) + with self.cached_session(): + self._runner(initializers.ZerosV2(), tensor_shape) + + def test_one(self): + tensor_shape = (4, 5) + with self.cached_session(): + self._runner(initializers.OnesV2(), tensor_shape) + + def test_default_random_uniform(self): + ru = initializers.get("uniform") + self.assertEqual(ru.minval, -0.05) + self.assertEqual(ru.maxval, 0.05) + + def test_default_random_normal(self): + rn = initializers.get("normal") + self.assertEqual(rn.mean, 0.0) + self.assertEqual(rn.stddev, 0.05) + + def test_default_truncated_normal(self): + tn = initializers.get("truncated_normal") + self.assertEqual(tn.mean, 0.0) + self.assertEqual(tn.stddev, 0.05) + + def test_custom_initializer_saving(self): + def my_initializer(shape, dtype=None): + return tf.ones(shape, dtype=dtype) + + inputs = input_layer.Input((10,)) + outputs = core.Dense(1, kernel_initializer=my_initializer)(inputs) + model = models.Model(inputs, outputs) + model2 = model.from_config( + model.get_config(), + custom_objects={"my_initializer": my_initializer}, + ) + self.assertEqual(model2.layers[1].kernel_initializer, my_initializer) + + @test_utils.run_v2_only + def test_load_external_variance_scaling_v2(self): + external_serialized_json = { + "class_name": "VarianceScaling", + "config": { + "distribution": "normal", + "mode": "fan_avg", + "scale": 1.0, + "seed": None, + }, } - } - initializer = initializers.deserialize(external_serialized_json) - self.assertEqual(initializer.distribution, 'truncated_normal') - - @parameterized.named_parameters( - ('Zeros', initializers.ZerosV2, {}), - ('Ones', initializers.OnesV2, {}), - ('Constant', initializers.ConstantV2, {}), - ('RandomUniform', initializers.RandomUniformV2, {}), - ('RandomUniform_seeded', initializers.RandomUniformV2, {'seed': 123}), - ('RandomNormal', initializers.RandomNormalV2, {}), - ('RandomNormal_seeded', initializers.RandomNormalV2, {'seed': 123}), - ('TruncatedNormal', initializers.TruncatedNormalV2, {}), - ('TruncatedNormal_seeded', initializers.TruncatedNormalV2, {'seed': 123}), - ('LecunUniform', initializers.LecunUniformV2, {}), - ('LecunUniform_seeded', initializers.LecunUniformV2, {'seed': 123}), - ('GlorotUniform', initializers.GlorotUniformV2, {}), - ('GlorotUniform_seeded', initializers.GlorotUniformV2, {'seed': 123}), - ('HeUniform', initializers.HeUniformV2, {}), - ('HeUniform_seeded', initializers.HeUniformV2, {'seed': 123}), - ) - def test_partition(self, initializer_cls, kwargs): - with self.cached_session(): - initializer = initializer_cls(**kwargs) - result = initializer( - shape=(4, 2), partition_shape=(2, 2), partition_offset=(0, 0)) - self.assertEqual(result.shape, (2, 2)) - - if hasattr(initializer, 'seed'): - # Make sure the result are different when the partition_shape is same, - # but partition_offset is different, for random related initializers. 
- result_2 = initializer( - shape=(4, 2), partition_shape=(2, 2), partition_offset=(1, 0)) - self.assertNotAllClose(result, result_2) - - # Make sure initializer produce same result when provide same - # partition offset. - # TODO(scottzhu): Enable this assert when initializer is fully stateless - # result_3 = initializer( - # shape=(4, 2), partition_shape=(2, 2), partition_offset=(1, 0)) - # self.assertAllClose(result_2, result_3) - - @parameterized.named_parameters( - ('Orthogonal', initializers.OrthogonalV2), - ('Identity', initializers.IdentityV2), - ) - def test_partition_unsupported(self, initializer_cls): - with self.assertRaisesRegex( - ValueError, - "initializer doesn't support partition-related arguments"): - initializer_cls()( - shape=(4, 2), partition_shape=(2, 2), partition_offset=(0, 0)) - - -if __name__ == '__main__': - tf.test.main() + initializer = initializers.deserialize(external_serialized_json) + self.assertEqual(initializer.distribution, "truncated_normal") + + @parameterized.named_parameters( + ("Zeros", initializers.ZerosV2, {}), + ("Ones", initializers.OnesV2, {}), + ("Constant", initializers.ConstantV2, {}), + ("RandomUniform", initializers.RandomUniformV2, {}), + ("RandomUniform_seeded", initializers.RandomUniformV2, {"seed": 123}), + ("RandomNormal", initializers.RandomNormalV2, {}), + ("RandomNormal_seeded", initializers.RandomNormalV2, {"seed": 123}), + # TODO(scottzhu): Enable these tests after the forward compat period + # expires for TruncatedNormalV2. + # ("TruncatedNormal", initializers.TruncatedNormalV2, {}), + # ( + # "TruncatedNormal_seeded", + # initializers.TruncatedNormalV2, + # {"seed": 123}, + # ), + ("LecunUniform", initializers.LecunUniformV2, {}), + ("LecunUniform_seeded", initializers.LecunUniformV2, {"seed": 123}), + ("GlorotUniform", initializers.GlorotUniformV2, {}), + ("GlorotUniform_seeded", initializers.GlorotUniformV2, {"seed": 123}), + ("HeUniform", initializers.HeUniformV2, {}), + ("HeUniform_seeded", initializers.HeUniformV2, {"seed": 123}), + ) + def test_partition(self, initializer_cls, kwargs): + with self.cached_session(): + initializer = initializer_cls(**kwargs) + result = initializer( + shape=(4, 2), partition_shape=(2, 2), partition_offset=(0, 0) + ) + self.assertEqual(result.shape, (2, 2)) + + if hasattr(initializer, "seed"): + # Make sure the result are different when the partition_shape is + # same, but partition_offset is different, for random related + # initializers. + result_2 = initializer( + shape=(4, 2), + partition_shape=(2, 2), + partition_offset=(1, 0), + ) + self.assertNotAllClose(result, result_2) + + # Make sure initializer produce same result when provide same + # partition offset. 
+ result_3 = initializer( + shape=(4, 2), + partition_shape=(2, 2), + partition_offset=(1, 0), + ) + self.assertAllClose(result_2, result_3) + + @parameterized.named_parameters( + ("Orthogonal", initializers.OrthogonalV2), + ("Identity", initializers.IdentityV2), + ) + def test_partition_unsupported(self, initializer_cls): + with self.assertRaisesRegex( + ValueError, + "initializer doesn't support partition-related arguments", + ): + initializer_cls()( + shape=(4, 2), partition_shape=(2, 2), partition_offset=(0, 0) + ) + + @parameterized.parameters(RANDOM_INITIALIZERS) + def test_stateless(self, initializer_cl): + with self.cached_session(): + initializer = initializer_cl() + output1 = initializer(shape=[2, 3]) + output2 = initializer(shape=[2, 3]) + initializer2 = initializer_cl() + output3 = initializer2(shape=[2, 3]) + output4 = initializer2(shape=[2, 3]) + + self.assertAllClose(output1, output2) + self.assertAllClose(output3, output4) + self.assertNotAllClose(output1, output3) + + with warnings.catch_warnings(record=True) as w: + initializer(shape=[2, 3]) + self.assertLen(w, 1) + self.assertIn("being called multiple times", str(w[0].message)) + + @parameterized.parameters(RANDOM_INITIALIZERS) + def test_seed_stateless(self, initializer_cl): + with self.cached_session(): + seed = 1337 + initializer = initializer_cl(seed=seed) + output1 = initializer(shape=[2, 3]) + output2 = initializer(shape=[2, 3]) + initializer2 = initializer_cl(seed=seed) + output3 = initializer2(shape=[2, 3]) + output4 = initializer2(shape=[2, 3]) + + self.assertAllClose(output1, output2) + self.assertAllClose(output3, output4) + self.assertAllClose(output1, output3) + + # We don't raise warning for seeded initializer. + with warnings.catch_warnings(record=True) as w: + initializer(shape=[2, 3]) + self.assertEmpty(w) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/initializers/initializers_v1.py b/keras/initializers/initializers_v1.py index d48cdfb3d280..ccac2d3a664a 100644 --- a/keras/initializers/initializers_v1.py +++ b/keras/initializers/initializers_v1.py @@ -13,11 +13,12 @@ # limitations under the License. 
# ============================================================================== """Keras initializers for TF 1.""" -# pylint:disable=g-classes-have-attributes + import tensorflow.compat.v2 as tf -from tensorflow.python.util.tf_export import keras_export +# isort: off +from tensorflow.python.util.tf_export import keras_export _v1_zeros_initializer = tf.compat.v1.zeros_initializer _v1_ones_initializer = tf.compat.v1.ones_initializer @@ -28,372 +29,282 @@ _v1_glorot_uniform_initializer = tf.compat.v1.glorot_uniform_initializer _v1_glorot_normal_initializer = tf.compat.v1.glorot_normal_initializer -keras_export(v1=['keras.initializers.Zeros', 'keras.initializers.zeros'], allow_multiple_exports=True)( - _v1_zeros_initializer) -keras_export(v1=['keras.initializers.Ones', 'keras.initializers.ones'], allow_multiple_exports=True)( - _v1_ones_initializer) -keras_export(v1=['keras.initializers.Constant', 'keras.initializers.constant'], allow_multiple_exports=True)( - _v1_constant_initializer) -keras_export(v1=['keras.initializers.VarianceScaling'], allow_multiple_exports=True)( - _v1_variance_scaling_initializer) -keras_export(v1=['keras.initializers.Orthogonal', - 'keras.initializers.orthogonal'], allow_multiple_exports=True)(_v1_orthogonal_initializer) -keras_export(v1=['keras.initializers.Identity', - 'keras.initializers.identity'], allow_multiple_exports=True)(_v1_identity) -keras_export(v1=['keras.initializers.glorot_uniform'], allow_multiple_exports=True)( - _v1_glorot_uniform_initializer) -keras_export(v1=['keras.initializers.glorot_normal'], allow_multiple_exports=True)( - _v1_glorot_normal_initializer) - - -@keras_export(v1=['keras.initializers.RandomNormal', - 'keras.initializers.random_normal', - 'keras.initializers.normal']) +keras_export(v1=["keras.initializers.Zeros", "keras.initializers.zeros"])( + _v1_zeros_initializer +) +keras_export(v1=["keras.initializers.Ones", "keras.initializers.ones"])( + _v1_ones_initializer +) +keras_export(v1=["keras.initializers.Constant", "keras.initializers.constant"])( + _v1_constant_initializer +) +keras_export(v1=["keras.initializers.VarianceScaling"])( + _v1_variance_scaling_initializer +) +keras_export( + v1=["keras.initializers.Orthogonal", "keras.initializers.orthogonal"] +)(_v1_orthogonal_initializer) +keras_export(v1=["keras.initializers.Identity", "keras.initializers.identity"])( + _v1_identity +) +keras_export(v1=["keras.initializers.glorot_uniform"])( + _v1_glorot_uniform_initializer +) +keras_export(v1=["keras.initializers.glorot_normal"])( + _v1_glorot_normal_initializer +) + + +@keras_export( + v1=[ + "keras.initializers.RandomNormal", + "keras.initializers.random_normal", + "keras.initializers.normal", + ] +) class RandomNormal(tf.compat.v1.random_normal_initializer): - """Initializer that generates a normal distribution. - - Args: - mean: a python scalar or a scalar tensor. Mean of the random values to - generate. - stddev: a python scalar or a scalar tensor. Standard deviation of the random - values to generate. - seed: A Python integer. Used to create random seeds. See - `tf.compat.v1.set_random_seed` for behavior. - dtype: Default data type, used if no `dtype` argument is provided when - calling the initializer. Only floating point types are supported. - - @compatibility(TF2) - Although it is a legacy compat.v1 api, - `tf.compat.v1.keras.initializers.RandomNormal` is compatible with eager - execution and `tf.function`. 
- - To switch to native TF2, switch to using - `tf.keras.initializers.RandomNormal` (not from `compat.v1`) and - if you need to change the default dtype use - `tf.keras.backend.set_floatx(float_dtype)` - or pass the dtype when calling the initializer, rather than passing it - when constructing the initializer. - - Random seed behavior: - Also be aware that if you pass a seed to the TF2 initializer - API it will reuse that same seed for every single initialization - (unlike the TF1 initializer) - - #### Structural Mapping to Native TF2 - - Before: - - ```python - initializer = tf.compat.v1.keras.initializers.RandomNormal( - mean=mean, - stddev=stddev, - seed=seed, - dtype=dtype) - - weight_one = tf.Variable(initializer(shape_one)) - weight_two = tf.Variable(initializer(shape_two)) - ``` - - After: - - ```python - initializer = tf.keras.initializers.RandomNormal( - mean=mean, - # seed=seed, # Setting a seed in the native TF2 API - # causes it to produce the same initializations - # across multiple calls of the same initializer. - stddev=stddev) - - weight_one = tf.Variable(initializer(shape_one, dtype=dtype)) - weight_two = tf.Variable(initializer(shape_two, dtype=dtype)) - ``` - - #### How to Map Arguments - - | TF1 Arg Name | TF2 Arg Name | Note | - | :---------------- | :-------------- | :------------------------- | - | `mean` | `mean` | No change to defaults | - | `stddev` | `stddev` | No change to defaults | - | `seed` | `seed` | Different random number generation | - : : : semantics (to change in a : - : : : future version). If set, the TF2 version : - : : : will use stateless random number : - : : : generation which will produce the exact : - : : : same initialization even across multiple : - : : : calls of the initializer instance. the : - : : : `compat.v1` version will generate new : - : : : initializations each time. Do not set : - : : : a seed if you need different : - : : : initializations each time. Instead : - : : : either set a global tf seed with : - : : : `tf.random.set_seed` if you need : - : : : determinism, or initialize each weight: - : : : with a separate initializer instance : - : : : and a different seed. : - | `dtype` | `dtype` | The TF2 native api only takes it | - : : : as a `__call__` arg, not a constructor arg. : - | `partition_info` | - | (`__call__` arg in TF1) Not supported | - - #### Example of fixed-seed behavior differences - - `compat.v1` Fixed seed behavior: - - >>> initializer = tf.compat.v1.keras.initializers.TruncatedNormal(seed=10) - >>> a = initializer(shape=(2, 2)) - >>> b = initializer(shape=(2, 2)) - >>> tf.reduce_sum(a - b) == 0 - - - After: - - >>> initializer = tf.keras.initializers.TruncatedNormal(seed=10) - >>> a = initializer(shape=(2, 2)) - >>> b = initializer(shape=(2, 2)) - >>> tf.reduce_sum(a - b) == 0 - - - @end_compatibility - """ - - def __init__(self, mean=0.0, stddev=0.05, seed=None, dtype=tf.float32): - super().__init__( - mean=mean, stddev=stddev, seed=seed, dtype=dtype) - - -@keras_export(v1=['keras.initializers.RandomUniform', - 'keras.initializers.random_uniform', - 'keras.initializers.uniform']) + """Initializer that generates a normal distribution. + + Args: + mean: a python scalar or a scalar tensor. Mean of the random values to + generate. + stddev: a python scalar or a scalar tensor. Standard deviation of the + random values to generate. + seed: A Python integer. Used to create random seeds. See + `tf.compat.v1.set_random_seed` for behavior. 
+ dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + + @compatibility(TF2) + Although it is a legacy compat.v1 api, + `tf.compat.v1.keras.initializers.RandomNormal` is compatible with eager + execution and `tf.function`. + + To switch to native TF2, switch to using + `tf.keras.initializers.RandomNormal` (not from `compat.v1`) and + if you need to change the default dtype use + `tf.keras.backend.set_floatx(float_dtype)` + or pass the dtype when calling the initializer, rather than passing it + when constructing the initializer. + + Random seed behavior: + Also be aware that if you pass a seed to the TF2 initializer + API it will reuse that same seed for every single initialization + (unlike the TF1 initializer) + + #### Structural Mapping to Native TF2 + + Before: + + ```python + initializer = tf.compat.v1.keras.initializers.RandomNormal( + mean=mean, + stddev=stddev, + seed=seed, + dtype=dtype) + + weight_one = tf.Variable(initializer(shape_one)) + weight_two = tf.Variable(initializer(shape_two)) + ``` + + After: + + ```python + initializer = tf.keras.initializers.RandomNormal( + mean=mean, + # seed=seed, # Setting a seed in the native TF2 API + # causes it to produce the same initializations + # across multiple calls of the same initializer. + stddev=stddev) + + weight_one = tf.Variable(initializer(shape_one, dtype=dtype)) + weight_two = tf.Variable(initializer(shape_two, dtype=dtype)) + ``` + + #### How to Map Arguments + + | TF1 Arg Name | TF2 Arg Name | Note | + | :---------------- | :-------------- | :------------------------- | + | `mean` | `mean` | No change to defaults | + | `stddev` | `stddev` | No change to defaults | + | `seed` | `seed` | Different random number generation | + : : : semantics (to change in a : + : : : future version). If set, the TF2 version : + : : : will use stateless random number : + : : : generation which will produce the exact : + : : : same initialization even across multiple : + : : : calls of the initializer instance. the : + : : : `compat.v1` version will generate new : + : : : initializations each time. Do not set : + : : : a seed if you need different : + : : : initializations each time. Instead : + : : : either set a global tf seed with : + : : : `tf.random.set_seed` if you need : + : : : determinism, or initialize each weight: + : : : with a separate initializer instance : + : : : and a different seed. : + | `dtype` | `dtype` | The TF2 native api only takes it | + : : : as a `__call__` arg, not a constructor arg. 
: + | `partition_info` | - | (`__call__` arg in TF1) Not supported | + + #### Example of fixed-seed behavior differences + + `compat.v1` Fixed seed behavior: + + >>> initializer = tf.compat.v1.keras.initializers.RandomNormal(seed=10) + >>> a = initializer(shape=(2, 2)) + >>> b = initializer(shape=(2, 2)) + >>> tf.reduce_sum(a - b) == 0 + + + After: + + >>> initializer = tf.keras.initializers.RandomNormal(seed=10) + >>> a = initializer(shape=(2, 2)) + >>> b = initializer(shape=(2, 2)) + >>> tf.reduce_sum(a - b) == 0 + + + @end_compatibility + """ + + def __init__(self, mean=0.0, stddev=0.05, seed=None, dtype=tf.float32): + super().__init__(mean=mean, stddev=stddev, seed=seed, dtype=dtype) + + +@keras_export( + v1=[ + "keras.initializers.RandomUniform", + "keras.initializers.random_uniform", + "keras.initializers.uniform", + ] +) class RandomUniform(tf.compat.v1.random_uniform_initializer): - """Initializer that generates tensors with a uniform distribution. - - Args: - minval: A python scalar or a scalar tensor. Lower bound of the range of - random values to generate. - maxval: A python scalar or a scalar tensor. Upper bound of the range of - random values to generate. Defaults to 1 for float types. - seed: A Python integer. Used to create random seeds. See - `tf.compat.v1.set_random_seed` for behavior. - dtype: Default data type, used if no `dtype` argument is provided when - calling the initializer. - - @compatibility(TF2) - Although it is a legacy `compat.v1` api, - `tf.compat.v1.keras.initializers.RandomUniform` is compatible with eager - execution and `tf.function`. - - To switch to native TF2, switch to using - `tf.keras.initializers.RandomUniform` (not from `compat.v1`) and - if you need to change the default dtype use - `tf.keras.backend.set_floatx(float_dtype)` - or pass the dtype when calling the initializer, rather than passing it - when constructing the initializer. - - Random seed behavior: - - Also be aware that if you pass a seed to the TF2 initializer - API it will reuse that same seed for every single initialization - (unlike the TF1 initializer) - - #### Structural Mapping to Native TF2 - - Before: - - ```python - - initializer = tf.compat.v1.keras.initializers.RandomUniform( - minval=minval, - maxval=maxval, - seed=seed, - dtype=dtype) - - weight_one = tf.Variable(initializer(shape_one)) - weight_two = tf.Variable(initializer(shape_two)) - ``` - - After: - - ```python - initializer = tf.keras.initializers.RandomUniform( - minval=minval, - maxval=maxval, - # seed=seed, # Setting a seed in the native TF2 API - # causes it to produce the same initializations - # across multiple calls of the same initializer. - ) - - weight_one = tf.Variable(initializer(shape_one, dtype=dtype)) - weight_two = tf.Variable(initializer(shape_two, dtype=dtype)) - ``` - - #### How to Map Arguments - - | TF1 Arg Name | TF2 Arg Name | Note | - | :---------------- | :-------------- | :------------------------- | - | `minval` | `minval` | No change to defaults | - | `maxval` | `maxval` | No change to defaults | - | `seed` | `seed` | Different random number generation | - : : : semantics (to change in a : - : : : future version). If set, the TF2 version : - : : : will use stateless random number : - : : : generation which will produce the exact : - : : : same initialization even across multiple : - : : : calls of the initializer instance. the : - : : : `compat.v1` version will generate new : - : : : initializations each time. 
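The mapping table above recommends one initializer instance (and one seed) per weight when TF1-style distinct initializations are needed. A hedged sketch of that pattern (shapes and seeds illustrative):

```python
import tensorflow as tf

# TF1: one seeded initializer produced different values on every call.
# TF2 equivalent: give each weight its own instance with its own seed.
shape_one, shape_two = (3, 4), (4, 2)  # illustrative shapes
weight_one = tf.Variable(
    tf.keras.initializers.RandomNormal(stddev=0.05, seed=10)(shape_one)
)
weight_two = tf.Variable(
    tf.keras.initializers.RandomNormal(stddev=0.05, seed=11)(shape_two)
)
```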
Do not set : - : : : a seed if you need different : - : : : initializations each time. Instead : - : : : either set a global tf seed with - : : : `tf.random.set_seed` if you need : - : : : determinism, or initialize each weight : - : : : with a separate initializer instance : - : : : and a different seed. : - | `dtype` | `dtype` | The TF2 native api only takes it | - : : : as a `__call__` arg, not a constructor arg. : - | `partition_info` | - | (`__call__` arg in TF1) Not supported | - - #### Example of fixed-seed behavior differences - - `compat.v1` Fixed seed behavior: - - >>> initializer = tf.compat.v1.keras.initializers.RandomUniform(seed=10) - >>> a = initializer(shape=(2, 2)) - >>> b = initializer(shape=(2, 2)) - >>> tf.reduce_sum(a - b) == 0 - - - After: - - >>> initializer = tf.keras.initializers.RandomUniform(seed=10) - >>> a = initializer(shape=(2, 2)) - >>> b = initializer(shape=(2, 2)) - >>> tf.reduce_sum(a - b) == 0 - - - @end_compatibility - """ - - def __init__(self, minval=-0.05, maxval=0.05, seed=None, - dtype=tf.float32): - super().__init__( - minval=minval, maxval=maxval, seed=seed, dtype=dtype) - - -@keras_export(v1=['keras.initializers.TruncatedNormal', - 'keras.initializers.truncated_normal']) + """Initializer that generates tensors with a uniform distribution. + + Args: + minval: A python scalar or a scalar tensor. Lower bound of the range of + random values to generate. Defaults to `-0.05`. + maxval: A python scalar or a scalar tensor. Upper bound of the range of + random values to generate. Defaults to `0.05`. + seed: A Python integer. Used to create random seeds. See + `tf.compat.v1.set_random_seed` for behavior. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. + + @compatibility(TF2) + Although it is a legacy `compat.v1` api, + `tf.compat.v1.keras.initializers.RandomUniform` is compatible with eager + execution and `tf.function`. + + To switch to native TF2, switch to using + `tf.keras.initializers.RandomUniform` (not from `compat.v1`) and + if you need to change the default dtype use + `tf.keras.backend.set_floatx(float_dtype)` + or pass the dtype when calling the initializer, rather than passing it + when constructing the initializer. + + Random seed behavior: + + Also be aware that if you pass a seed to the TF2 initializer + API it will reuse that same seed for every single initialization + (unlike the TF1 initializer) + + #### Structural Mapping to Native TF2 + + Before: + + ```python + + initializer = tf.compat.v1.keras.initializers.RandomUniform( + minval=minval, + maxval=maxval, + seed=seed, + dtype=dtype) + + weight_one = tf.Variable(initializer(shape_one)) + weight_two = tf.Variable(initializer(shape_two)) + ``` + + After: + + ```python + initializer = tf.keras.initializers.RandomUniform( + minval=minval, + maxval=maxval, + # seed=seed, # Setting a seed in the native TF2 API + # causes it to produce the same initializations + # across multiple calls of the same initializer. + ) + + weight_one = tf.Variable(initializer(shape_one, dtype=dtype)) + weight_two = tf.Variable(initializer(shape_two, dtype=dtype)) + ``` + + #### How to Map Arguments + + | TF1 Arg Name | TF2 Arg Name | Note | + | :---------------- | :-------------- | :------------------------- | + | `minval` | `minval` | No change to defaults | + | `maxval` | `maxval` | No change to defaults | + | `seed` | `seed` | Different random number generation | + : : : semantics (to change in a : + : : : future version). 
If set, the TF2 version : + : : : will use stateless random number : + : : : generation which will produce the exact : + : : : same initialization even across multiple : + : : : calls of the initializer instance. the : + : : : `compat.v1` version will generate new : + : : : initializations each time. Do not set : + : : : a seed if you need different : + : : : initializations each time. Instead : + : : : either set a global tf seed with + : : : `tf.random.set_seed` if you need : + : : : determinism, or initialize each weight : + : : : with a separate initializer instance : + : : : and a different seed. : + | `dtype` | `dtype` | The TF2 native api only takes it | + : : : as a `__call__` arg, not a constructor arg. : + | `partition_info` | - | (`__call__` arg in TF1) Not supported | + + #### Example of fixed-seed behavior differences + + `compat.v1` Fixed seed behavior: + + >>> initializer = tf.compat.v1.keras.initializers.RandomUniform(seed=10) + >>> a = initializer(shape=(2, 2)) + >>> b = initializer(shape=(2, 2)) + >>> tf.reduce_sum(a - b) == 0 + + + After: + + >>> initializer = tf.keras.initializers.RandomUniform(seed=10) + >>> a = initializer(shape=(2, 2)) + >>> b = initializer(shape=(2, 2)) + >>> tf.reduce_sum(a - b) == 0 + + + @end_compatibility + """ + + def __init__(self, minval=-0.05, maxval=0.05, seed=None, dtype=tf.float32): + super().__init__(minval=minval, maxval=maxval, seed=seed, dtype=dtype) + + +@keras_export( + v1=[ + "keras.initializers.TruncatedNormal", + "keras.initializers.truncated_normal", + ] +) class TruncatedNormal(tf.compat.v1.truncated_normal_initializer): - """Initializer that generates a truncated normal distribution. - - These values are similar to values from a `random_normal_initializer` - except that values more than two standard deviations from the mean - are discarded and re-drawn. This is the recommended initializer for - neural network weights and filters. - - Args: - mean: a python scalar or a scalar tensor. Mean of the random values to - generate. - stddev: a python scalar or a scalar tensor. Standard deviation of the - random values to generate. - seed: A Python integer. Used to create random seeds. See - `tf.compat.v1.set_random_seed` for behavior. - dtype: Default data type, used if no `dtype` argument is provided when - calling the initializer. Only floating point types are supported. - - @compatibility(TF2) - Although it is a legacy compat.v1 api, - `tf.compat.v1.keras.initializers.TruncatedNormal` is compatible with eager - execution and `tf.function`. - - To switch to native TF2, switch to using - `tf.keras.initializers.TruncatedNormal` (not from `compat.v1`) and - if you need to change the default dtype use - `tf.keras.backend.set_floatx(float_dtype)` - or pass the dtype when calling the initializer, rather than passing it - when constructing the initializer. 
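The dtype migration described here is mechanical: TF2 drops the constructor argument in favor of a global default or a per-call argument. A sketch, assuming a TF2 runtime:

```python
import tensorflow as tf

# TF1 took dtype in the constructor; TF2 takes it per call, or falls
# back to the global floatx default.
tf.keras.backend.set_floatx("float64")
init = tf.keras.initializers.TruncatedNormal(stddev=0.05)
w64 = init(shape=(2, 2))                     # float64 via floatx default
w16 = init(shape=(2, 2), dtype=tf.float16)   # per-call override
tf.keras.backend.set_floatx("float32")       # restore the default
```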
- - Random seed behavior: - Also be aware that if you pass a seed to the TF2 initializer - API it will reuse that same seed for every single initialization - (unlike the TF1 initializer) - - #### Structural Mapping to Native TF2 - - Before: - - ```python - initializer = tf.compat.v1.keras.initializers.TruncatedNormal( - mean=mean, - stddev=stddev, - seed=seed, - dtype=dtype) - - weight_one = tf.Variable(initializer(shape_one)) - weight_two = tf.Variable(initializer(shape_two)) - ``` - - After: - - ```python - initializer = tf.keras.initializers.TruncatedNormal( - mean=mean, - # seed=seed, # Setting a seed in the native TF2 API - # causes it to produce the same initializations - # across multiple calls of the same initializer. - stddev=stddev) - - weight_one = tf.Variable(initializer(shape_one, dtype=dtype)) - weight_two = tf.Variable(initializer(shape_two, dtype=dtype)) - ``` - - #### How to Map Arguments - - | TF1 Arg Name | TF2 Arg Name | Note | - | :---------------- | :-------------- | :------------------------- | - | `mean` | `mean` | No change to defaults | - | `stddev` | `stddev` | No change to defaults | - | `seed` | `seed` | Different random number generation | - : : : semantics (to change in a : - : : : future version). If set, the TF2 version : - : : : will use stateless random number : - : : : generation which will produce the exact : - : : : same initialization even across multiple : - : : : calls of the initializer instance. the : - : : : `compat.v1` version will generate new : - : : : initializations each time. Do not set : - : : : a seed if you need different : - : : : initializations each time. Instead : - : : : either set a global tf seed with - : : : `tf.random.set_seed` if you need : - : : : determinism, or initialize each weight : - : : : with a separate initializer instance : - : : : and a different seed. : - | `dtype` | `dtype` | The TF2 native api only takes it | - : : : as a `__call__` arg, not a constructor arg. : - | `partition_info` | - | (`__call__` arg in TF1) Not supported | - - #### Example of fixed-seed behavior differences - - `compat.v1` Fixed seed behavior: - - >>> initializer = tf.compat.v1.keras.initializers.TruncatedNormal(seed=10) - >>> a = initializer(shape=(2, 2)) - >>> b = initializer(shape=(2, 2)) - >>> tf.reduce_sum(a - b) == 0 - - - After: - - >>> initializer = tf.keras.initializers.TruncatedNormal(seed=10) - >>> a = initializer(shape=(2, 2)) - >>> b = initializer(shape=(2, 2)) - >>> tf.reduce_sum(a - b) == 0 - - - @end_compatibility - """ - - def __init__(self, mean=0.0, stddev=0.05, seed=None, dtype=tf.float32): """Initializer that generates a truncated normal distribution. + These values are similar to values from a `random_normal_initializer` + except that values more than two standard deviations from the mean + are discarded and re-drawn. This is the recommended initializer for + neural network weights and filters. Args: mean: a python scalar or a scalar tensor. Mean of the random values to @@ -404,50 +315,156 @@ def __init__(self, mean=0.0, stddev=0.05, seed=None, dtype=tf.float32): `tf.compat.v1.set_random_seed` for behavior. dtype: Default data type, used if no `dtype` argument is provided when calling the initializer. Only floating point types are supported. + + @compatibility(TF2) + Although it is a legacy compat.v1 api, + `tf.compat.v1.keras.initializers.TruncatedNormal` is compatible with eager + execution and `tf.function`. 
+ + To switch to native TF2, switch to using + `tf.keras.initializers.TruncatedNormal` (not from `compat.v1`) and + if you need to change the default dtype use + `tf.keras.backend.set_floatx(float_dtype)` + or pass the dtype when calling the initializer, rather than passing it + when constructing the initializer. + + Random seed behavior: + Also be aware that if you pass a seed to the TF2 initializer + API it will reuse that same seed for every single initialization + (unlike the TF1 initializer) + + #### Structural Mapping to Native TF2 + + Before: + + ```python + initializer = tf.compat.v1.keras.initializers.TruncatedNormal( + mean=mean, + stddev=stddev, + seed=seed, + dtype=dtype) + + weight_one = tf.Variable(initializer(shape_one)) + weight_two = tf.Variable(initializer(shape_two)) + ``` + + After: + + ```python + initializer = tf.keras.initializers.TruncatedNormal( + mean=mean, + # seed=seed, # Setting a seed in the native TF2 API + # causes it to produce the same initializations + # across multiple calls of the same initializer. + stddev=stddev) + + weight_one = tf.Variable(initializer(shape_one, dtype=dtype)) + weight_two = tf.Variable(initializer(shape_two, dtype=dtype)) + ``` + + #### How to Map Arguments + + | TF1 Arg Name | TF2 Arg Name | Note | + | :---------------- | :-------------- | :------------------------- | + | `mean` | `mean` | No change to defaults | + | `stddev` | `stddev` | No change to defaults | + | `seed` | `seed` | Different random number generation | + : : : semantics (to change in a : + : : : future version). If set, the TF2 version : + : : : will use stateless random number : + : : : generation which will produce the exact : + : : : same initialization even across multiple : + : : : calls of the initializer instance. the : + : : : `compat.v1` version will generate new : + : : : initializations each time. Do not set : + : : : a seed if you need different : + : : : initializations each time. Instead : + : : : either set a global tf seed with + : : : `tf.random.set_seed` if you need : + : : : determinism, or initialize each weight : + : : : with a separate initializer instance : + : : : and a different seed. : + | `dtype` | `dtype` | The TF2 native api only takes it | + : : : as a `__call__` arg, not a constructor arg. : + | `partition_info` | - | (`__call__` arg in TF1) Not supported | + + #### Example of fixed-seed behavior differences + + `compat.v1` Fixed seed behavior: + + >>> initializer = tf.compat.v1.keras.initializers.TruncatedNormal(seed=10) + >>> a = initializer(shape=(2, 2)) + >>> b = initializer(shape=(2, 2)) + >>> tf.reduce_sum(a - b) == 0 + + + After: + + >>> initializer = tf.keras.initializers.TruncatedNormal(seed=10) + >>> a = initializer(shape=(2, 2)) + >>> b = initializer(shape=(2, 2)) + >>> tf.reduce_sum(a - b) == 0 + + + @end_compatibility """ - super().__init__( - mean=mean, stddev=stddev, seed=seed, dtype=dtype) + def __init__(self, mean=0.0, stddev=0.05, seed=None, dtype=tf.float32): + """Initializer that generates a truncated normal distribution. -@keras_export(v1=['keras.initializers.lecun_normal']) -class LecunNormal(tf.compat.v1.variance_scaling_initializer): - def __init__(self, seed=None): - super().__init__( - scale=1., mode='fan_in', distribution='truncated_normal', seed=seed) + Args: + mean: a python scalar or a scalar tensor. Mean of the random values to + generate. + stddev: a python scalar or a scalar tensor. Standard deviation of the + random values to generate. + seed: A Python integer. Used to create random seeds. 
See + `tf.compat.v1.set_random_seed` for behavior. + dtype: Default data type, used if no `dtype` argument is provided when + calling the initializer. Only floating point types are supported. + """ + super().__init__(mean=mean, stddev=stddev, seed=seed, dtype=dtype) - def get_config(self): - return {'seed': self.seed} +@keras_export(v1=["keras.initializers.lecun_normal"]) +class LecunNormal(tf.compat.v1.variance_scaling_initializer): + def __init__(self, seed=None): + super().__init__( + scale=1.0, mode="fan_in", distribution="truncated_normal", seed=seed + ) -@keras_export(v1=['keras.initializers.lecun_uniform']) -class LecunUniform(tf.compat.v1.variance_scaling_initializer): + def get_config(self): + return {"seed": self.seed} - def __init__(self, seed=None): - super().__init__( - scale=1., mode='fan_in', distribution='uniform', seed=seed) - def get_config(self): - return {'seed': self.seed} +@keras_export(v1=["keras.initializers.lecun_uniform"]) +class LecunUniform(tf.compat.v1.variance_scaling_initializer): + def __init__(self, seed=None): + super().__init__( + scale=1.0, mode="fan_in", distribution="uniform", seed=seed + ) + def get_config(self): + return {"seed": self.seed} -@keras_export(v1=['keras.initializers.he_normal']) -class HeNormal(tf.compat.v1.variance_scaling_initializer): - def __init__(self, seed=None): - super().__init__( - scale=2., mode='fan_in', distribution='truncated_normal', seed=seed) +@keras_export(v1=["keras.initializers.he_normal"]) +class HeNormal(tf.compat.v1.variance_scaling_initializer): + def __init__(self, seed=None): + super().__init__( + scale=2.0, mode="fan_in", distribution="truncated_normal", seed=seed + ) - def get_config(self): - return {'seed': self.seed} + def get_config(self): + return {"seed": self.seed} -@keras_export(v1=['keras.initializers.he_uniform']) +@keras_export(v1=["keras.initializers.he_uniform"]) class HeUniform(tf.compat.v1.variance_scaling_initializer): + def __init__(self, seed=None): + super().__init__( + scale=2.0, mode="fan_in", distribution="uniform", seed=seed + ) - def __init__(self, seed=None): - super().__init__( - scale=2., mode='fan_in', distribution='uniform', seed=seed) - - def get_config(self): - return {'seed': self.seed} + def get_config(self): + return {"seed": self.seed} diff --git a/keras/initializers/initializers_v2.py b/keras/initializers/initializers_v2.py deleted file mode 100644 index 8048f158e99d..000000000000 --- a/keras/initializers/initializers_v2.py +++ /dev/null @@ -1,1098 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
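The He/LeCun classes above are thin wrappers that pin `VarianceScaling` parameters (LeCun: `scale=1.0`, He: `scale=2.0`, both `mode="fan_in"`). A sketch of the same equivalence in the TF2 API (seed value illustrative; with a shared seed the two are expected to match exactly, since `HeNormal` subclasses `VarianceScaling` with exactly these arguments):

```python
import tensorflow as tf

seed = 7
he = tf.keras.initializers.HeNormal(seed=seed)
vs = tf.keras.initializers.VarianceScaling(
    scale=2.0, mode="fan_in", distribution="truncated_normal", seed=seed
)
diff = tf.reduce_max(tf.abs(he(shape=(4, 4)) - vs(shape=(4, 4))))
print(float(diff))  # expected 0.0
```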
-# ============================================================================== -"""Keras initializers for TF 2.""" -# pylint: disable=g-classes-have-attributes, missing-docstring, g-direct-tensorflow-import - -import math - -from keras import backend -from keras.dtensor import utils - -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - -_PARTITION_SHAPE = 'partition_shape' -_PARTITION_OFFSET = 'partition_offset' -_LAYOUT = 'layout' -_ALLOWED_INITIALIZER_KWARGS = [_PARTITION_SHAPE, _PARTITION_OFFSET, _LAYOUT] - - -@keras_export('keras.initializers.Initializer') -class Initializer: - """Initializer base class: all Keras initializers inherit from this class. - - Initializers should implement a `__call__` method with the following - signature: - - ```python - def __call__(self, shape, dtype=None, **kwargs): - # returns a tensor of shape `shape` and dtype `dtype` - # containing values drawn from a distribution of your choice. - ``` - - Optionally, you an also implement the method `get_config` and the class - method `from_config` in order to support serialization -- just like with - any Keras object. - - Here's a simple example: a random normal initializer. - - ```python - import tensorflow as tf - - class ExampleRandomNormal(tf.keras.initializers.Initializer): - - def __init__(self, mean, stddev): - self.mean = mean - self.stddev = stddev - - def __call__(self, shape, dtype=None, **kwargs): - return tf.random.normal( - shape, mean=self.mean, stddev=self.stddev, dtype=dtype) - - def get_config(self): # To support serialization - return {"mean": self.mean, "stddev": self.stddev} - ``` - - Note that we don't have to implement `from_config` in the example above since - the constructor arguments of the class the keys in the config returned by - `get_config` are the same. In this case, the default `from_config` - works fine. - """ - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized as specified by the initializer. - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. - **kwargs: Additional keyword arguments. - """ - raise NotImplementedError('Initializer subclasses must implement the ' - '`__call__()` method.') - - def get_config(self): - """Returns the configuration of the initializer as a JSON-serializable dict. - - Returns: - A JSON-serializable Python dict. - """ - return {} - - @classmethod - def from_config(cls, config): - """Instantiates an initializer from a configuration dictionary. - - Example: - - ```python - initializer = RandomUniform(-1, 1) - config = initializer.get_config() - initializer = RandomUniform.from_config(config) - ``` - - Args: - config: A Python dictionary, the output of `get_config`. - - Returns: - A `tf.keras.initializers.Initializer` instance. - """ - config.pop('dtype', None) - return cls(**config) - - -@keras_export('keras.initializers.Zeros', 'keras.initializers.zeros', v1=[]) -class Zeros(Initializer): - """Initializer that generates tensors initialized to 0. - - Also available via the shortcut function `tf.keras.initializers.zeros`. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.Zeros() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.Zeros() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - """ - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized as specified by the initializer. 
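The base-class docstring above promises a config round trip via `get_config`/`from_config`. A minimal sketch of that round trip (argument values illustrative):

```python
import tensorflow as tf

init = tf.keras.initializers.RandomUniform(minval=-1.0, maxval=1.0, seed=3)
config = init.get_config()  # {'minval': -1.0, 'maxval': 1.0, 'seed': 3}
restored = tf.keras.initializers.RandomUniform.from_config(config)
# Same seed in both instances, so the stateless draws coincide.
print(bool(tf.reduce_all(init(shape=(2, 2)) == restored(shape=(2, 2)))))
```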
- - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. Only numeric or boolean dtypes are - supported. If not specified, `tf.keras.backend.floatx()` is used, - which default to `float32` unless you configured it otherwise - (via `tf.keras.backend.set_floatx(float_dtype)`). - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs) - dtype = _get_dtype(dtype) - if not dtype.is_numpy_compatible or dtype == tf.string: - raise ValueError(f'Expected numeric or boolean dtype, got {dtype}.') - if _PARTITION_SHAPE in kwargs: - shape = kwargs[_PARTITION_SHAPE] - layout = kwargs.pop('layout', None) - if layout: - return utils.call_with_layout(tf.zeros, layout, shape=shape, dtype=dtype) - return tf.zeros(shape, dtype) - - -@keras_export('keras.initializers.Ones', 'keras.initializers.ones', v1=[]) -class Ones(Initializer): - """Initializer that generates tensors initialized to 1. - - Also available via the shortcut function `tf.keras.initializers.ones`. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.Ones() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.Ones() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - """ - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized as specified by the initializer. - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. Only numeric or boolean dtypes are - supported. If not specified, `tf.keras.backend.floatx()` is used, - which default to `float32` unless you configured it otherwise - (via `tf.keras.backend.set_floatx(float_dtype)`). - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs) - dtype = _get_dtype(dtype) - if not dtype.is_numpy_compatible or dtype == tf.string: - raise ValueError(f'Expected numeric or boolean dtype, got {dtype}.') - if _PARTITION_SHAPE in kwargs: - shape = kwargs[_PARTITION_SHAPE] - layout = kwargs.pop('layout', None) - if layout: - return utils.call_with_layout(tf.ones, layout, shape=shape, dtype=dtype) - return tf.ones(shape, dtype) - - -@keras_export('keras.initializers.Constant', - 'keras.initializers.constant', - v1=[]) -class Constant(Initializer): - """Initializer that generates tensors with constant values. - - Also available via the shortcut function `tf.keras.initializers.constant`. - - Only scalar values are allowed. - The constant value provided must be convertible to the dtype requested - when calling the initializer. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.Constant(3.) - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.Constant(3.) - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - value: A Python scalar. - """ - - def __init__(self, value=0): - self.value = value - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized to `self.value`. - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. If not specified, - `tf.keras.backend.floatx()` is used, - which default to `float32` unless you configured it otherwise - (via `tf.keras.backend.set_floatx(float_dtype)`). - **kwargs: Additional keyword arguments. 
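`Zeros`/`Ones` honor `partition_shape` by simply generating the shard-sized tensor, mirroring the `test_partition` case at the top of this section. A sketch (offsets illustrative; for value-deterministic initializers the offset cannot change the result):

```python
import tensorflow as tf

init = tf.keras.initializers.Zeros()
full = init(shape=(4, 2))
# Ask for one (2, 2) shard of the (4, 2) variable.
shard = init(shape=(4, 2), partition_shape=(2, 2), partition_offset=(2, 0))
print(full.shape, shard.shape)  # (4, 2) (2, 2)
```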
- """ - _validate_kwargs(self.__class__.__name__, kwargs) - dtype = _get_dtype(dtype) - if _PARTITION_SHAPE in kwargs: - shape = kwargs[_PARTITION_SHAPE] - layout = kwargs.pop('layout', None) - if layout: - return utils.call_with_layout(tf.constant, layout, self.value, - shape=shape, dtype=dtype) - return tf.constant( - self.value, dtype=_get_dtype(dtype), shape=shape) - - def get_config(self): - return {'value': self.value} - - -@keras_export('keras.initializers.RandomUniform', - 'keras.initializers.random_uniform', - v1=[]) -class RandomUniform(Initializer): - """Initializer that generates tensors with a uniform distribution. - - Also available via the shortcut function - `tf.keras.initializers.random_uniform`. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.) - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.RandomUniform(minval=0., maxval=1.) - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - minval: A python scalar or a scalar tensor. Lower bound of the range of - random values to generate (inclusive). - maxval: A python scalar or a scalar tensor. Upper bound of the range of - random values to generate (exclusive). - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - """ - - def __init__(self, minval=-0.05, maxval=0.05, seed=None): - self.minval = minval - self.maxval = maxval - self.seed = seed - self._random_generator = backend.RandomGenerator(seed) - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized as specified by the initializer. - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. Only floating point and integer - types are supported. If not specified, - `tf.keras.backend.floatx()` is used, - which default to `float32` unless you configured it otherwise - (via `tf.keras.backend.set_floatx(float_dtype)`). - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs) - dtype = _get_dtype(dtype) - if not dtype.is_floating and not dtype.is_integer: - raise ValueError(f'Expected float or integer dtype, got {dtype}.') - if _PARTITION_SHAPE in kwargs: - shape = kwargs[_PARTITION_SHAPE] - partition_offset = kwargs.get(_PARTITION_OFFSET, None) - nonce = hash(partition_offset) if partition_offset else None - layout = kwargs.pop('layout', None) - if layout: - self._random_generator._rng_type = self._random_generator.RNG_STATEFUL - _ensure_keras_seeded() - return utils.call_with_layout( - self._random_generator.random_uniform, layout, shape, self.minval, - self.maxval, dtype, nonce) - return self._random_generator.random_uniform( - shape, self.minval, self.maxval, dtype, nonce) - - def get_config(self): - return { - 'minval': self.minval, - 'maxval': self.maxval, - 'seed': self.seed - } - - -@keras_export('keras.initializers.RandomNormal', - 'keras.initializers.random_normal', - v1=[]) -class RandomNormal(Initializer): - """Initializer that generates tensors with a normal distribution. - - Also available via the shortcut function - `tf.keras.initializers.random_normal`. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=1.) 
- >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=1.) - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - mean: a python scalar or a scalar tensor. Mean of the random values to - generate. - stddev: a python scalar or a scalar tensor. Standard deviation of the random - values to generate. - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - """ - - def __init__(self, mean=0.0, stddev=0.05, seed=None): - self.mean = mean - self.stddev = stddev - self.seed = seed - self._random_generator = backend.RandomGenerator(seed) - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized to random normal values. - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, which - default to `float32` unless you configured it otherwise (via - `tf.keras.backend.set_floatx(float_dtype)`) - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs) - dtype = _assert_float_dtype(_get_dtype(dtype)) - if _PARTITION_SHAPE in kwargs: - shape = kwargs[_PARTITION_SHAPE] - partition_offset = kwargs.get(_PARTITION_OFFSET, None) - nonce = hash(partition_offset) if partition_offset else None - layout = kwargs.pop('layout', None) - if layout: - self._random_generator._rng_type = self._random_generator.RNG_STATEFUL - _ensure_keras_seeded() - return utils.call_with_layout( - self._random_generator.random_normal, layout, shape, self.mean, - self.stddev, dtype, nonce) - return self._random_generator.random_normal( - shape, self.mean, self.stddev, dtype, nonce) - - def get_config(self): - return { - 'mean': self.mean, - 'stddev': self.stddev, - 'seed': self.seed - } - - -@keras_export('keras.initializers.TruncatedNormal', - 'keras.initializers.truncated_normal', - v1=[]) -class TruncatedNormal(Initializer): - """Initializer that generates a truncated normal distribution. - - Also available via the shortcut function - `tf.keras.initializers.truncated_normal`. - - The values generated are similar to values from a - `tf.keras.initializers.RandomNormal` initializer except that values more - than two standard deviations from the mean are - discarded and re-drawn. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.TruncatedNormal(mean=0., stddev=1.) - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.TruncatedNormal(mean=0., stddev=1.) - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - mean: a python scalar or a scalar tensor. Mean of the random values - to generate. - stddev: a python scalar or a scalar tensor. Standard deviation of the - random values to generate before truncation. - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. 
- """ - - def __init__(self, mean=0.0, stddev=0.05, seed=None): - self.mean = mean - self.stddev = stddev - self.seed = seed - self._random_generator = backend.RandomGenerator(seed) - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized to random normal values (truncated). - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, which - default to `float32` unless you configured it otherwise (via - `tf.keras.backend.set_floatx(float_dtype)`) - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs) - dtype = _assert_float_dtype(_get_dtype(dtype)) - if _PARTITION_SHAPE in kwargs: - shape = kwargs[_PARTITION_SHAPE] - partition_offset = kwargs.get(_PARTITION_OFFSET, None) - nonce = hash(partition_offset) if partition_offset else None - layout = kwargs.pop('layout', None) - if layout: - self._random_generator._rng_type = self._random_generator.RNG_STATEFUL - _ensure_keras_seeded() - return utils.call_with_layout( - self._random_generator.truncated_normal, layout, shape, self.mean, - self.stddev, dtype, nonce) - return self._random_generator.truncated_normal( - shape, self.mean, self.stddev, dtype, nonce) - - def get_config(self): - return { - 'mean': self.mean, - 'stddev': self.stddev, - 'seed': self.seed - } - - -@keras_export('keras.initializers.VarianceScaling', - 'keras.initializers.variance_scaling', - v1=[]) -class VarianceScaling(Initializer): - """Initializer capable of adapting its scale to the shape of weights tensors. - - Also available via the shortcut function - `tf.keras.initializers.variance_scaling`. - - With `distribution="truncated_normal" or "untruncated_normal"`, samples are - drawn from a truncated/untruncated normal distribution with a mean of zero and - a standard deviation (after truncation, if used) `stddev = sqrt(scale / n)`, - where `n` is: - - - number of input units in the weight tensor, if `mode="fan_in"` - - number of output units, if `mode="fan_out"` - - average of the numbers of input and output units, if `mode="fan_avg"` - - With `distribution="uniform"`, samples are drawn from a uniform distribution - within `[-limit, limit]`, where `limit = sqrt(3 * scale / n)`. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.VarianceScaling( - ... scale=0.1, mode='fan_in', distribution='uniform') - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.VarianceScaling( - ... scale=0.1, mode='fan_in', distribution='uniform') - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - scale: Scaling factor (positive float). - mode: One of "fan_in", "fan_out", "fan_avg". - distribution: Random distribution to use. One of "truncated_normal", - "untruncated_normal" and "uniform". - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - """ - - def __init__(self, - scale=1.0, - mode='fan_in', - distribution='truncated_normal', - seed=None): - if scale <= 0.: - raise ValueError('`scale` must be positive float. 
' - f'Received: scale={scale}.') - allowed_modes = {'fan_in', 'fan_out', 'fan_avg'} - if mode not in allowed_modes: - raise ValueError(f'Invalid `mode` argument: {mode}. ' - f'Please use one of the {allowed_modes}.') - distribution = distribution.lower() - # Compatibility with keras-team/keras. - if distribution == 'normal': - distribution = 'truncated_normal' - allowed_distributions = { - 'uniform', 'truncated_normal', 'untruncated_normal' - } - if distribution not in allowed_distributions: - raise ValueError(f'Invalid `distribution` argument: {distribution}.' - f'Allowed distributions: {allowed_distributions}.') - self.scale = scale - self.mode = mode - self.distribution = distribution - self.seed = seed - self._random_generator = backend.RandomGenerator(seed) - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized as specified by the initializer. - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, which - default to `float32` unless you configured it otherwise (via - `tf.keras.backend.set_floatx(float_dtype)`) - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs) - dtype = _assert_float_dtype(_get_dtype(dtype)) - if _PARTITION_SHAPE in kwargs: - shape = kwargs[_PARTITION_SHAPE] - partition_offset = kwargs.get(_PARTITION_OFFSET, None) - nonce = hash(partition_offset) if partition_offset else None - layout = kwargs.pop('layout', None) - if layout: - self._random_generator._rng_type = self._random_generator.RNG_STATEFUL - _ensure_keras_seeded() - return utils.call_with_layout( - self._generate_init_val, layout, shape=shape, dtype=dtype, - nonce=nonce) - return self._generate_init_val(shape=shape, dtype=dtype, - nonce=nonce) - - def _generate_init_val(self, shape, dtype, nonce): - scale = self.scale - fan_in, fan_out = _compute_fans(shape) - if self.mode == 'fan_in': - scale /= max(1., fan_in) - elif self.mode == 'fan_out': - scale /= max(1., fan_out) - else: - scale /= max(1., (fan_in + fan_out) / 2.) - if self.distribution == 'truncated_normal': - # constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) - stddev = math.sqrt(scale) / .87962566103423978 - return self._random_generator.truncated_normal( - shape, 0.0, stddev, dtype, nonce) - elif self.distribution == 'untruncated_normal': - stddev = math.sqrt(scale) - return self._random_generator.random_normal( - shape, 0.0, stddev, dtype, nonce) - else: - limit = math.sqrt(3.0 * scale) - return self._random_generator.random_uniform( - shape, -limit, limit, dtype, nonce) - - def get_config(self): - return { - 'scale': self.scale, - 'mode': self.mode, - 'distribution': self.distribution, - 'seed': self.seed - } - - -@keras_export('keras.initializers.Orthogonal', - 'keras.initializers.orthogonal', - v1=[]) -class Orthogonal(Initializer): - """Initializer that generates an orthogonal matrix. - - Also available via the shortcut function `tf.keras.initializers.orthogonal`. - - If the shape of the tensor to initialize is two-dimensional, it is initialized - with an orthogonal matrix obtained from the QR decomposition of a matrix of - random numbers drawn from a normal distribution. - If the matrix has fewer rows than columns then the output will have orthogonal - rows. Otherwise, the output will have orthogonal columns. 
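The `_generate_init_val` logic above reduces to a small amount of arithmetic; a worked example for a Dense kernel with `fan_in=256` under `mode="fan_in"` (numbers illustrative):

```python
import math

scale, fan_in = 1.0, 256
scale /= max(1.0, fan_in)

# truncated_normal: divide by scipy.stats.truncnorm.std(a=-2, b=2) so
# the post-truncation stddev equals sqrt(scale / n).
stddev = math.sqrt(scale) / 0.87962566103423978
# uniform: samples drawn from [-limit, limit].
limit = math.sqrt(3.0 * scale)
print(round(stddev, 6), round(limit, 6))  # ~0.071053 ~0.108253
```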
- - If the shape of the tensor to initialize is more than two-dimensional, - a matrix of shape `(shape[0] * ... * shape[n - 2], shape[n - 1])` - is initialized, where `n` is the length of the shape vector. - The matrix is subsequently reshaped to give a tensor of the desired shape. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.Orthogonal() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.Orthogonal() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - gain: multiplicative factor to apply to the orthogonal matrix - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - - References: - - [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C) - """ - - def __init__(self, gain=1.0, seed=None): - self.gain = gain - self.seed = seed - self._random_generator = backend.RandomGenerator(seed) - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized to an orthogonal matrix. - - Args: - shape: Shape of the tensor. - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, - which default to `float32` unless you configured it otherwise - (via `tf.keras.backend.set_floatx(float_dtype)`) - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs, support_partition=False) - dtype = _assert_float_dtype(_get_dtype(dtype)) - # Check the shape - if len(shape) < 2: - raise ValueError('The tensor to initialize must be ' - 'at least two-dimensional. Received: ' - f'shape={shape} of rank {len(shape)}.') - layout = kwargs.pop('layout', None) - if layout: - self._random_generator._rng_type = self._random_generator.RNG_STATEFUL - _ensure_keras_seeded() - return utils.call_with_layout( - self._generate_init_val, layout, shape=shape, dtype=dtype) - return self._generate_init_val(shape, dtype) - - def _generate_init_val(self, shape, dtype): - # Flatten the input shape with the last dimension remaining - # its original shape so it works for conv2d - num_rows = 1 - for dim in shape[:-1]: - num_rows *= dim - num_cols = shape[-1] - flat_shape = (max(num_cols, num_rows), min(num_cols, num_rows)) - - # Generate a random matrix - a = self._random_generator.random_normal(flat_shape, dtype=dtype) - # Compute the qr factorization - q, r = tf.linalg.qr(a, full_matrices=False) - # Make Q uniform - d = tf.linalg.tensor_diag_part(r) - q *= tf.sign(d) - if num_rows < num_cols: - q = tf.linalg.matrix_transpose(q) - return self.gain * tf.reshape(q, shape) - - def get_config(self): - return {'gain': self.gain, 'seed': self.seed} - - -@keras_export('keras.initializers.Identity', - 'keras.initializers.identity', - v1=[]) -class Identity(Initializer): - """Initializer that generates the identity matrix. - - Also available via the shortcut function `tf.keras.initializers.identity`. - - Only usable for generating 2D matrices. 
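For reference, the QR construction in `Orthogonal._generate_init_val` works standalone. A sketch that mirrors it, substituting plain `tf.random.normal` for the Keras `RandomGenerator` used in the deleted code:

```python
import tensorflow as tf

def orthogonal_init(shape, gain=1.0, dtype=tf.float32):
    # Flatten all but the last dimension, QR-factor a random matrix,
    # fix the sign ambiguity with diag(R), then reshape back.
    num_rows = 1
    for dim in shape[:-1]:
        num_rows *= dim
    num_cols = shape[-1]
    flat_shape = (max(num_cols, num_rows), min(num_cols, num_rows))
    a = tf.random.normal(flat_shape, dtype=dtype)
    q, r = tf.linalg.qr(a, full_matrices=False)
    q *= tf.sign(tf.linalg.tensor_diag_part(r))
    if num_rows < num_cols:
        q = tf.linalg.matrix_transpose(q)
    return gain * tf.reshape(q, shape)

w = orthogonal_init((4, 4))
# Columns are orthonormal: w^T w should be (numerically) the identity.
err = tf.reduce_max(tf.abs(tf.matmul(w, w, transpose_a=True) - tf.eye(4)))
print(float(err))
```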
- - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.Identity() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.Identity() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - gain: Multiplicative factor to apply to the identity matrix. - """ - - def __init__(self, gain=1.0): - self.gain = gain - - def __call__(self, shape, dtype=None, **kwargs): - """Returns a tensor object initialized to a 2D identity matrix. - - Args: - shape: Shape of the tensor. It should have exactly rank 2. - dtype: Optional dtype of the tensor. Only floating point types are - supported. If not specified, `tf.keras.backend.floatx()` is used, - which default to `float32` unless you configured it otherwise - (via `tf.keras.backend.set_floatx(float_dtype)`) - **kwargs: Additional keyword arguments. - """ - _validate_kwargs(self.__class__.__name__, kwargs, support_partition=False) - dtype = _assert_float_dtype(_get_dtype(dtype)) - if len(shape) != 2: - raise ValueError( - 'Identity matrix initializer can only be used for 2D matrices. ' - f'Received: shape={shape} of rank {len(shape)}.') - layout = kwargs.pop('layout', None) - if layout: - return utils.call_with_layout( - self._generate_init_val, layout, shape=shape, dtype=dtype) - return self._generate_init_val(shape, dtype) - - def _generate_init_val(self, shape, dtype): - initializer = tf.eye(*shape, dtype=dtype) - return self.gain * initializer - - def get_config(self): - return {'gain': self.gain} - - -@keras_export('keras.initializers.GlorotUniform', - 'keras.initializers.glorot_uniform', - v1=[]) -class GlorotUniform(VarianceScaling): - """The Glorot uniform initializer, also called Xavier uniform initializer. - - Also available via the shortcut function - `tf.keras.initializers.glorot_uniform`. - - Draws samples from a uniform distribution within `[-limit, limit]`, where - `limit = sqrt(6 / (fan_in + fan_out))` (`fan_in` is the number of input units - in the weight tensor and `fan_out` is the number of output units). - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.GlorotUniform() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.GlorotUniform() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - - References: - - [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html) - """ - - def __init__(self, seed=None): - super().__init__( - scale=1.0, - mode='fan_avg', - distribution='uniform', - seed=seed) - - def get_config(self): - return {'seed': self.seed} - - -@keras_export('keras.initializers.GlorotNormal', - 'keras.initializers.glorot_normal', - v1=[]) -class GlorotNormal(VarianceScaling): - """The Glorot normal initializer, also called Xavier normal initializer. - - Also available via the shortcut function - `tf.keras.initializers.glorot_normal`. - - Draws samples from a truncated normal distribution centered on 0 with `stddev - = sqrt(2 / (fan_in + fan_out))` where `fan_in` is the number of input units in - the weight tensor and `fan_out` is the number of output units in the weight - tensor. 
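The Glorot bound is easy to sanity-check numerically; a sketch (fan sizes illustrative):

```python
import math
import tensorflow as tf

# GlorotUniform == VarianceScaling(scale=1.0, mode="fan_avg",
# distribution="uniform"): samples lie in [-limit, limit] with
# limit = sqrt(6 / (fan_in + fan_out)).
fan_in, fan_out = 300, 100
limit = math.sqrt(6.0 / (fan_in + fan_out))  # sqrt(6/400) ~= 0.1225
values = tf.keras.initializers.GlorotUniform(seed=0)(shape=(fan_in, fan_out))
print(bool(tf.reduce_max(tf.abs(values)) <= limit))  # expected True
```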
- - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.GlorotNormal() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.GlorotNormal() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - - References: - - [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html) - """ - - def __init__(self, seed=None): - super().__init__( - scale=1.0, - mode='fan_avg', - distribution='truncated_normal', - seed=seed) - - def get_config(self): - return {'seed': self.seed} - - -@keras_export('keras.initializers.LecunNormal', - 'keras.initializers.lecun_normal', - v1=[]) -class LecunNormal(VarianceScaling): - """Lecun normal initializer. - - Also available via the shortcut function - `tf.keras.initializers.lecun_normal`. - - Initializers allow you to pre-specify an initialization strategy, encoded in - the Initializer object, without knowing the shape and dtype of the variable - being initialized. - - Draws samples from a truncated normal distribution centered on 0 with `stddev - = sqrt(1 / fan_in)` where `fan_in` is the number of input units in the weight - tensor. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.LecunNormal() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.LecunNormal() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - - References: - - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515) - """ - - def __init__(self, seed=None): - super().__init__( - scale=1., mode='fan_in', distribution='truncated_normal', seed=seed) - - def get_config(self): - return {'seed': self.seed} - - -@keras_export('keras.initializers.LecunUniform', - 'keras.initializers.lecun_uniform', - v1=[]) -class LecunUniform(VarianceScaling): - """Lecun uniform initializer. - - Also available via the shortcut function - `tf.keras.initializers.lecun_uniform`. - - Draws samples from a uniform distribution within `[-limit, limit]`, - where `limit = sqrt(3 / fan_in)` (`fan_in` is the number of input units in the - weight tensor). - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.LecunUniform() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.LecunUniform() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. 
- - References: - - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515) - """ - - def __init__(self, seed=None): - super().__init__( - scale=1., mode='fan_in', distribution='uniform', seed=seed) - - def get_config(self): - return {'seed': self.seed} - - -@keras_export('keras.initializers.HeNormal', - 'keras.initializers.he_normal', - v1=[]) -class HeNormal(VarianceScaling): - """He normal initializer. - - Also available via the shortcut function - `tf.keras.initializers.he_normal`. - - It draws samples from a truncated normal distribution centered on 0 with - `stddev = sqrt(2 / fan_in)` where `fan_in` is the number of input units in the - weight tensor. - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.HeNormal() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.HeNormal() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - - References: - - [He et al., 2015](https://arxiv.org/abs/1502.01852) - """ - - def __init__(self, seed=None): - super().__init__( - scale=2., mode='fan_in', distribution='truncated_normal', seed=seed) - - def get_config(self): - return {'seed': self.seed} - - -@keras_export('keras.initializers.HeUniform', - 'keras.initializers.he_uniform', - v1=[]) -class HeUniform(VarianceScaling): - """He uniform variance scaling initializer. - - Also available via the shortcut function - `tf.keras.initializers.he_uniform`. - - Draws samples from a uniform distribution within `[-limit, limit]`, where - `limit = sqrt(6 / fan_in)` (`fan_in` is the number of input units in the - weight tensor). - - Examples: - - >>> # Standalone usage: - >>> initializer = tf.keras.initializers.HeUniform() - >>> values = initializer(shape=(2, 2)) - - >>> # Usage in a Keras layer: - >>> initializer = tf.keras.initializers.HeUniform() - >>> layer = tf.keras.layers.Dense(3, kernel_initializer=initializer) - - Args: - seed: A Python integer. Used to make the behavior of the initializer - deterministic. Note that a seeded - initializer will not produce the same random values across multiple calls, - but multiple initializers will produce the same sequence when constructed - with the same seed value. - - References: - - [He et al., 2015](https://arxiv.org/abs/1502.01852) - """ - - def __init__(self, seed=None): - super().__init__( - scale=2., mode='fan_in', distribution='uniform', seed=seed) - - def get_config(self): - return {'seed': self.seed} - - -def _get_dtype(dtype): - if dtype is None: - dtype = backend.floatx() - return tf.as_dtype(dtype) - - -def _assert_float_dtype(dtype): - """Validate and return floating point type based on `dtype`. - - `dtype` must be a floating point type. - - Args: - dtype: The data type to validate. - - Returns: - Validated type. - - Raises: - ValueError: if `dtype` is not a floating point type. - """ - dtype = tf.as_dtype(dtype) - if not dtype.is_floating: - raise ValueError(f'Expected floating point type, got {dtype}.') - return dtype - - -def _compute_fans(shape): - """Computes the number of input and output units for a weight shape. - - Args: - shape: Integer shape tuple or TF tensor shape. - - Returns: - A tuple of integer scalars (fan_in, fan_out). 
- """ - if len(shape) < 1: # Just to avoid errors for constants. - fan_in = fan_out = 1 - elif len(shape) == 1: - fan_in = fan_out = shape[0] - elif len(shape) == 2: - fan_in = shape[0] - fan_out = shape[1] - else: - # Assuming convolution kernels (2D, 3D, or more). - # kernel shape: (..., input_depth, depth) - receptive_field_size = 1 - for dim in shape[:-2]: - receptive_field_size *= dim - fan_in = shape[-2] * receptive_field_size - fan_out = shape[-1] * receptive_field_size - return int(fan_in), int(fan_out) - - -def _validate_kwargs(cls_name, kwargs, support_partition=True): - invalid_kwargs = [k for k in kwargs if k not in _ALLOWED_INITIALIZER_KWARGS] - if invalid_kwargs: - raise TypeError(f'Unknown keyword arguments: {invalid_kwargs}. Allowed ' - f'keyword arguments: {_ALLOWED_INITIALIZER_KWARGS}.') - if not support_partition and (_PARTITION_SHAPE in kwargs or - _PARTITION_OFFSET in kwargs): - raise ValueError(f'{cls_name} initializer doesn\'t support ' - 'partition-related arguments.') - - -def _ensure_keras_seeded(): - """Make sure the keras.backend global seed generator is set. - - This is important for DTensor use case to ensure that each client are - initialized with same seed for tf.random.Generator, so that the value created - are in sync among all the clients. - """ - if not getattr(backend._SEED_GENERATOR, 'generator', None): # pylint:disable=protected-access - raise ValueError('When using DTensor APIs, you need to set the global seed ' - 'before using any Keras initializers. Please make sure ' - 'to call `tf.keras.utils.set_random_seed()` in your code.') diff --git a/keras/integration_test/BUILD b/keras/integration_test/BUILD index 9d520a57e65b..348db2520583 100644 --- a/keras/integration_test/BUILD +++ b/keras/integration_test/BUILD @@ -1,12 +1,14 @@ # Description: # Contains Keras integration tests that verify with other TF high level APIs. 
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") load("@org_keras//keras:keras.bzl", "tf_py_test") # buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "tpu_py_test") load("@org_keras//keras:keras.bzl", "distribute_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/tools/pip_package:__pkg__", @@ -89,28 +91,28 @@ cuda_py_test( name = "gradient_checkpoint_test", srcs = ["gradient_checkpoint_test.py"], python_version = "PY3", + tags = ["no_oss"], # TODO(b/249526796) deps = [ "//:expect_tensorflow_installed", "//keras/api:keras_api", ], ) -# cuda_py_test( -# name = "central_storage_strategy_test", -# srcs = ["central_storage_strategy_test.py"], -# python_version = "PY3", -# tags = [ -# "multi_and_single_gpu", -# "no_windows_gpu", # TODO(b/130551176) -# ], -# deps = [ -# "//:expect_absl_installed", -# "//:expect_tensorflow_installed", -# "//third_party/tensorflow/python/distribute:combinations", -# "//third_party/tensorflow/python/distribute:strategy_combinations", -# "//third_party/tensorflow/python/keras/utils:kpl_test_utils", -# ], -# ) +cuda_py_test( + name = "central_storage_strategy_test", + srcs = ["central_storage_strategy_test.py"], + python_version = "PY3", + tags = [ + "multi_and_single_gpu", + "no_windows_gpu", # TODO(b/130551176) + ], + deps = [ + "//:expect_absl_installed", + "//:expect_tensorflow_installed", + "//keras/api:keras_api", + "//keras/utils:kpl_test_utils", + ], +) tpu_py_test( name = "tpu_strategy_test", @@ -145,11 +147,28 @@ tf_py_test( ], ) +distribute_py_test( + name = "ctl_tutorial_test", + srcs = ["ctl_tutorial_test.py"], + main = "ctl_tutorial_test.py", + shard_count = 5, + tags = [ + "multi_and_single_gpu", + "nomultivm", # TODO(b/170502145) + ], + deps = [ + "//:expect_absl_installed", + "//:expect_tensorflow_installed", + "//keras/api:keras_api", + "//keras/distribute:strategy_combinations", + ], +) + distribute_py_test( name = "parameter_server_keras_preprocessing_test", srcs = ["parameter_server_keras_preprocessing_test.py"], python_version = "PY3", - shard_count = 4, # TODO(b/184290570): Investigate why only 1 shard times out. + shard_count = 6, # TODO(b/184290570): Investigate why only 1 shard times out. 
tags = [ "multi_and_single_gpu", "no_oss", # TODO(b/194935930): Flaky test @@ -304,3 +323,77 @@ tf_py_test( "//keras/testing_infra:test_combinations", ], ) + +tf_py_test( + name = "parameter_server_training_metric_test", + srcs = ["parameter_server_training_metric_test.py"], + python_version = "PY3", + tags = [ + "nomac", # TODO(mihaimaruseac): b/127695564 + "notsan", # TODO(b/156029134) + ], + deps = [ + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/api:keras_api", + "//keras/testing_infra:test_combinations", + ], +) + +tf_py_test( + name = "fit_test", + size = "medium", + srcs = ["fit_test.py"], + python_version = "PY3", + shard_count = 28, + deps = [ + "//:expect_tensorflow_installed", + "//keras/api:keras_api", + "//keras/integration_test/models", + "//keras/testing_infra:test_combinations", + ], +) + +tf_py_test( + name = "saving_v3_test", + size = "medium", + srcs = ["saving_v3_test.py"], + python_version = "PY3", + shard_count = 12, + deps = [ + "//:expect_tensorflow_installed", + "//keras/api:keras_api", + "//keras/integration_test/models", + "//keras/testing_infra:test_combinations", + ], +) + +tf_py_test( + name = "py_metric_test", + size = "medium", + srcs = ["py_metric_test.py"], + python_version = "PY3", + shard_count = 2, + deps = [ + "//:expect_tensorflow_installed", + "//keras/api:keras_api", + "//keras/metrics", + "//keras/testing_infra:test_combinations", + ], +) + +tf_py_test( + name = "extension_type_test", + size = "medium", + srcs = ["extension_type_test.py"], + python_version = "PY3", + deps = [ + "//:expect_tensorflow_installed", + "//keras", + "//keras/api:keras_api", + "//keras/engine", + "//keras/engine:input_layer", + "//keras/saving", + ], +) diff --git a/keras/integration_test/central_storage_strategy_test.py b/keras/integration_test/central_storage_strategy_test.py index e0be1235a03c..5c1a670853c6 100644 --- a/keras/integration_test/central_storage_strategy_test.py +++ b/keras/integration_test/central_storage_strategy_test.py @@ -14,73 +14,81 @@ # ============================================================================== """Tests for KPL + CentralStorageStrategy.""" -from absl.testing import parameterized import tensorflow.compat.v2 as tf +from absl.testing import parameterized -from tensorflow.python.distribute import combinations as ds_combinations -from tensorflow.python.distribute import strategy_combinations -from tensorflow.python.framework import test_combinations as combinations -from tensorflow.python.keras.utils import kpl_test_utils +# isort: off +from tensorflow.compat.v2.__internal__.distribute import combinations +from keras.utils import kpl_test_utils # TODO(b/182278926): Combine this test with other strategies. 
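Aside: `CentralStorageStrategy`, which the reformatted test below exercises, keeps variables on a single device and replicates compute across the others. A minimal usage sketch against the public API (the layer size and data here are arbitrary):

```python
import tensorflow as tf

# Variables live in one central location; compute is replicated.
strategy = tf.distribute.experimental.CentralStorageStrategy()

with strategy.scope():
    # Variables created here (the Dense kernel/bias) are centrally stored.
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    model.compile(optimizer="sgd", loss="mse")

x = tf.random.uniform((8, 4))
y = tf.random.uniform((8, 1))
model.fit(x, y, epochs=1, verbose=0)
```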
-@ds_combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.central_storage_strategy_with_gpu_and_cpu, - ], - mode=["eager"])) +@combinations.generate( + tf.__internal__.test.combinations.combine( + distribution=[combinations.central_storage_strategy_with_gpu_and_cpu], + mode=["eager"], + ) +) class CentralStorageStrategyTest(tf.test.TestCase, parameterized.TestCase): + def testTrainAndServeWithKPL(self, distribution): + use_adapt = False + test_utils_obj = kpl_test_utils.DistributeKplTestUtils() + with distribution.scope(): + ( + feature_mapper, + label_mapper, + ) = test_utils_obj.define_kpls_for_training(use_adapt) + model = test_utils_obj.define_model() + optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1) + accuracy = tf.keras.metrics.Accuracy() - def testTrainAndServeWithKPL(self, distribution): - use_adapt = False - test_utils_obj = kpl_test_utils.DistributeKplTestUtils() - with distribution.scope(): - feature_mapper, label_mapper = test_utils_obj.define_kpls_for_training( - use_adapt) - model = test_utils_obj.define_model() - optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1) - accuracy = tf.keras.metrics.Accuracy() - - def dataset_fn(_): - return test_utils_obj.dataset_fn(feature_mapper, label_mapper) + def dataset_fn(_): + return test_utils_obj.dataset_fn(feature_mapper, label_mapper) - @tf.function - def train_step(iterator): - """The step function for one training step.""" + @tf.function + def train_step(iterator): + """The step function for one training step.""" - def step_fn(inputs): - """The computation to run on each replica.""" - features, labels = inputs - with tf.GradientTape() as tape: - pred = model(features, training=True) - loss = tf.keras.losses.binary_crossentropy(labels, pred) - loss = tf.nn.compute_average_loss(loss) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(list(zip(grads, model.trainable_variables))) + def step_fn(inputs): + """The computation to run on each replica.""" + features, labels = inputs + with tf.GradientTape() as tape: + pred = model(features, training=True) + loss = tf.keras.losses.binary_crossentropy(labels, pred) + loss = tf.nn.compute_average_loss(loss) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + list(zip(grads, model.trainable_variables)) + ) - actual_pred = tf.cast(tf.math.greater(pred, 0.5), tf.dtypes.int64) - accuracy.update_state(labels, actual_pred) + actual_pred = tf.cast( + tf.math.greater(pred, 0.5), tf.dtypes.int64 + ) + accuracy.update_state(labels, actual_pred) - distribution.run(step_fn, args=(next(iterator),)) + distribution.run(step_fn, args=(next(iterator),)) - distributed_dataset = distribution.distribute_datasets_from_function( - dataset_fn) - distributed_iterator = iter(distributed_dataset) - num_epochs = 4 - num_steps = 7 - for _ in range(num_epochs): - accuracy.reset_state() - for _ in range(num_steps): - train_step(distributed_iterator) + distributed_dataset = ( + distribution.distribute_datasets_from_function(dataset_fn) + ) + distributed_iterator = iter(distributed_dataset) + num_epochs = 4 + num_steps = 7 + for _ in range(num_epochs): + accuracy.reset_state() + for _ in range(num_steps): + train_step(distributed_iterator) - self.assertGreater(accuracy.result().numpy(), 0.5) - self.assertEqual(optimizer.iterations.numpy(), num_epochs * num_steps) + self.assertGreater(accuracy.result().numpy(), 0.5) + self.assertEqual( + optimizer.iterations.numpy(), num_epochs * num_steps + ) - # Test 
save/load/serving the trained model. - test_utils_obj.test_save_load_serving_model( - model, feature_mapper, test_utils_obj.define_reverse_lookup_layer()) + # Test save/load/serving the trained model. + test_utils_obj.test_save_load_serving_model( + model, feature_mapper, test_utils_obj.define_reverse_lookup_layer() + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/integration_test/ctl_tutorial_test.py b/keras/integration_test/ctl_tutorial_test.py new file mode 100644 index 000000000000..e700d9ed4e93 --- /dev/null +++ b/keras/integration_test/ctl_tutorial_test.py @@ -0,0 +1,451 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests that Custom Training Loop docs match actual behavior. + +The tutorial at https://www.tensorflow.org/tutorials/distribute/custom_training, +defined at +https://github.com/tensorflow/docs/blob/master/site/en/tutorials/distribute/custom_training.ipynb +makes several statements about + + * ways to reduce loss terms to the actual training loss, and + * how they compare to the built-in behavior of Keras Model.fit(). + +This test verifies that these statements match the actual behavior, +under a variety of distribution strategies. +""" + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.distribute import strategy_combinations + + +def make_compute_loss_fn(variant, loss_object, GLOBAL_BATCH_SIZE): + """Returns the `compute_loss()` function as defined in the tutorial.""" + + if variant == "basic": + # The basic form of the loss function, shown verbatim in the tutorial. + def compute_loss(labels, predictions, model_losses): + per_example_loss = loss_object(labels, predictions) + loss = tf.nn.compute_average_loss(per_example_loss) + if model_losses: + loss += tf.nn.scale_regularization_loss(tf.add_n(model_losses)) + return loss + + elif variant == "fixed_batch_size": + # The variant that adds a fixed `global_batch_size=` arg + # (described but not shown verbatim). + def compute_loss(labels, predictions, model_losses): + per_example_loss = loss_object(labels, predictions) + loss = tf.nn.compute_average_loss( + per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE + ) + if model_losses: + loss += tf.nn.scale_regularization_loss(tf.add_n(model_losses)) + return loss + + elif variant == "balanced": + # The variant that scales the loss to balance out varying batch sizes + # (described but not shown verbatim). 
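Aside: the difference between the "basic" and "fixed_batch_size" variants above is easy to see outside any strategy, where the observed global batch size equals the local batch size. A small standalone check of `tf.nn.compute_average_loss` with illustrative values:

```python
import tensorflow as tf

# A singleton batch: one example with per-example loss 1.0.
per_example_loss = tf.constant([1.0])

# "basic": divides by the observed batch size (1), so the lone example
# carries the full weight that a full batch would spread over 4 examples.
print(tf.nn.compute_average_loss(per_example_loss).numpy())  # 1.0

# "fixed_batch_size": dividing by a fixed global batch size of 4 restores
# the per-example weight the example would have had in a full batch.
print(
    tf.nn.compute_average_loss(per_example_loss, global_batch_size=4).numpy()
)  # 0.25
```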
+ def compute_loss(labels, predictions, model_losses): + per_example_loss = loss_object(labels, predictions) + loss = tf.nn.compute_average_loss(per_example_loss) + if model_losses: + loss += tf.nn.scale_regularization_loss(tf.add_n(model_losses)) + observed_global_batch_size = ( + tf.distribute.get_strategy().num_replicas_in_sync + * tf.shape(per_example_loss)[0] + ) + loss *= tf.math.divide( + tf.cast(observed_global_batch_size, tf.float32), + tf.cast(GLOBAL_BATCH_SIZE, tf.float32), + ) + return loss + + else: + raise ValueError(f"Unknown {variant=}") + + return compute_loss + + +def create_dataset(global_batch_size): + """Creates the dataset for ImpliedExampleWeightsTest. + + It contains two batches: the first has full size, the second just 1 element. + The i-th element `(x,y)` has model input `x = onehot(i)` and label `y = 0`. + """ + n = global_batch_size + 1 + ds = tf.data.Dataset.from_tensor_slices((tf.eye(n), tf.zeros([n, 1]))) + ds = ds.batch(global_batch_size) + return ds + + +def create_model(n): + """Creates the model for ImpliedExampleWeightsTest. + + The model has three trainable weights of interest, all initialized to 1.0: + + * "predicting/kernel:0" of shape [n, 1] maps a one-hot encoded input to + the model output. When used with the MeanAbsoluteError loss, an input + onehot(i) produces a gradient onehot(i) for this weight, subject to + the training loop's loss reduction across examples. + * "activity_regularized/kernel:0" of shape [n, 1] has an activity + regularizer loss in the model so that input onehot(i) produces a + gradient of 1/batch_size * onehot(i) for this weight. + * "weight_regularized:0" of shape [1] has a weight regularizer loss in + the model that produces a gradient of 1 for this weight, independent + of batch size. + """ + inputs = tf.keras.Input(shape=(n,), name="inputs") + + predicting = tf.keras.layers.Dense( + 1, use_bias=False, kernel_initializer="ones", name="predicting" + ) + activity_regularized = tf.keras.layers.Dense( + 1, + use_bias=False, + kernel_initializer="ones", + activity_regularizer=tf.keras.regularizers.L1(l1=1.0), + name="activity_regularized", + ) + weight_regularized = tf.keras.layers.Dense( + 1, + kernel_initializer="zeros", + bias_initializer="ones", + bias_regularizer=tf.keras.regularizers.L1(l1=1.0), + name="weight_regularized", + ) + + # Make outputs = predicting(inputs), depending on the other Layers as well. + add = tf.keras.layers.Add(name="add") + multiply = tf.keras.layers.Multiply(name="multiply") + outputs = add( + [ + predicting(inputs), + multiply( + [np.array([[0.0]], np.float32), activity_regularized(inputs)] + ), + multiply( + [np.array([[0.0]], np.float32), weight_regularized(inputs)] + ), + ] + ) + + model = tf.keras.Model(inputs, outputs) + return model + + +def create_loss(**kwargs): + """Returns the loss to be used with the model from create_model().""" + return tf.keras.losses.MeanAbsoluteError(**kwargs) + + +def create_optimizer(learning_rate): + """Returns the optimizer that applies gradients in the most obvious way.""" + return tf.keras.optimizers.SGD(learning_rate) + + +def get_expected_example_weights( + ctl_variant, *, local_batch_size, num_replicas_in_sync +): + """Returns the weights that examples have in the gradient updates seen.""" + + global_batch_size = local_batch_size * num_replicas_in_sync + n = global_batch_size + 1 + num_batches = 2 + + expected = dict( + # Examples in a full batch receive the expected gradient weight, + # independent of the CTL variant. 
+ example_prediction_fullbatch=1.0, + example_activity_fullbatch=1.0, + ) + if ctl_variant == "basic": + # In the basic variant of the CTL, when a batch of size 1 hits a + # replica, the singleton example receives the weight that is + # normally spread evenly across the local_batch_size. + expected["example_prediction_singleton"] = local_batch_size + expected["example_activity_singleton"] = local_batch_size + # Weight regularization applies equally in each batch, + # irrespective of its size. + expected["total_weight_regularization"] = num_batches + elif ctl_variant == "fixed_batch_size": + # In the CTL variant that fixes GLOBAL_BATCH_SIZE for the reduction + # of prediction losses, the weight of a singleton example is + # reverted to normal for prediction, but activity and weight + # regularization behaves as in the "basic" variant. + expected["example_prediction_singleton"] = 1.0 + expected["example_activity_singleton"] = local_batch_size + expected["total_weight_regularization"] = num_batches + elif ctl_variant == "balanced": + # The CTL variant that corrects both prediction and regularization + # losses for the batch size achieves equal weights of examples + # both for the prediction and for an activity regularizer + expected["example_prediction_singleton"] = 1.0 + expected["example_activity_singleton"] = 1.0 + # Weight regularization, in sync with the other loss terms, + # applies proportional to the number of examples. + expected["total_weight_regularization"] = n / global_batch_size + return expected + + +class MaybeStrategyScope: + """Provides a context allowing no distribution strategy.""" + + def __init__(self, strategy): + self._strategy = strategy + self._scope = None + + def __enter__(self): + if self._strategy: + self._scope = self._strategy.scope() + self._scope.__enter__() + + def __exit__(self, exc_type, value, traceback): + if self._strategy: + self._scope.__exit__(exc_type, value, traceback) + self._scope = None + + +class ImpliedExampleWeightsTest(tf.test.TestCase, parameterized.TestCase): + """Tests weights of loss terms depending on batch size and training loop.""" + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + strategy=strategy_combinations.all_strategies + + strategy_combinations.multiworker_strategies + + [None], + ctl_variant=["basic", "fixed_batch_size", "balanced"], + ) + ) + def test_ctl(self, strategy, ctl_variant): + """Tests a variant of the CTL under a distribution strategy.""" + if strategy is None: + num_replicas_in_sync = 1 + else: + num_replicas_in_sync = strategy.num_replicas_in_sync + + local_batch_size = 2 # For a full batch; greater than 1. 
+ global_batch_size = local_batch_size * num_replicas_in_sync + ds = create_dataset(global_batch_size) + if strategy is not None: + ds = strategy.experimental_distribute_dataset(ds) + + n = global_batch_size + 1 + learning_rate = 0.01 + with MaybeStrategyScope(strategy): + model = create_model(n) + loss_object = create_loss(reduction=tf.keras.losses.Reduction.NONE) + compute_loss = make_compute_loss_fn( + ctl_variant, loss_object, global_batch_size + ) + optimizer = create_optimizer(learning_rate) + + def train_step(inputs): + x, labels = inputs + with tf.GradientTape() as tape: + predictions = model(x, training=True) + loss = compute_loss(labels, predictions, model.losses) + gradients = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + zip(gradients, model.trainable_variables) + ) + return loss + + @tf.function + def wrapped_train_step(inputs): + if strategy is None: + return train_step(inputs) + else: + per_replica_losses = strategy.run( + train_step, args=(inputs,) + ) + return strategy.reduce( + tf.distribute.ReduceOp.SUM, + per_replica_losses, + axis=None, + ) + + num_epochs = 1 + num_batches = 0 + for epoch in range(num_epochs): + total_loss = 0.0 + for x in ds: + total_loss += wrapped_train_step(x) + num_batches += 1 + train_loss = total_loss / num_batches + self.assertTrue(tf.math.is_finite(train_loss).numpy()) + + self.assertEqual(num_batches, 2) + + expected = get_expected_example_weights( + ctl_variant, + local_batch_size=local_batch_size, + num_replicas_in_sync=num_replicas_in_sync, + ) + self.assert_implied_example_weights( + model, + **expected, + rtol=1e-6 if strategy is None else 1e-4, + learning_rate=learning_rate, + global_batch_size=global_batch_size, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + strategy=strategy_combinations.all_strategies + + strategy_combinations.multiworker_strategies + + [None], + ) + ) + def test_fit(self, strategy): + """Tests Model.fit().""" + if strategy is None: + num_replicas_in_sync = 1 + else: + num_replicas_in_sync = strategy.num_replicas_in_sync + + local_batch_size = 2 # For a full batch; greater than 1. + global_batch_size = local_batch_size * num_replicas_in_sync + ds = create_dataset(global_batch_size) + + n = global_batch_size + 1 + learning_rate = 0.01 + with MaybeStrategyScope(strategy): + model = create_model(n) + model.compile( + optimizer=create_optimizer(learning_rate), loss=create_loss() + ) + epochs = 1 + steps_per_epoch = 2 + model.fit(ds, epochs=epochs, steps_per_epoch=steps_per_epoch) + + expected = get_expected_example_weights( + ctl_variant="basic", # The tutorial claims this consistency! 
+ local_batch_size=local_batch_size, + num_replicas_in_sync=num_replicas_in_sync, + ) + self.assert_implied_example_weights( + model, + **expected, + rtol=1e-6 if strategy is None else 1e-4, + learning_rate=learning_rate, + global_batch_size=global_batch_size, + ) + + def assert_implied_example_weights( + self, + model, + *, + learning_rate, + global_batch_size, + rtol, + example_prediction_fullbatch, + example_prediction_singleton, + example_activity_fullbatch, + example_activity_singleton, + total_weight_regularization, + ): + """Checks model.weights for the expected effects of training.""" + model_weights = { + v.name: self._get_var_value(v).numpy() + for v in model.trainable_variables + } + + # The total weight received by each one-hot example in the prediction + # loss is the change of its corresponding weight from the initial + # value 1, adjusted for the expected averaging by global_batch_size and + # scaling by SGD's learning_rate. + predicting_kernel = model_weights["predicting/kernel:0"] + example_prediction_weights = ( + (1.0 - predicting_kernel) / learning_rate * global_batch_size + ) + # There was one full batch of examples, followed by a singleton. + self.assertEqual(predicting_kernel.shape, (global_batch_size + 1, 1)) + # Check the examples in the full batch. + actual_example_prediction_fullbatch = self.reduce_assert_equal( + example_prediction_weights[:-1, 0] + ) + self.assertAllClose( + example_prediction_fullbatch, + actual_example_prediction_fullbatch, + rtol=rtol, + ) + # Check the singleton example after the full batch. + actual_example_prediction_singleton = example_prediction_weights[-1, 0] + self.assertAllClose( + example_prediction_singleton, + actual_example_prediction_singleton, + rtol=rtol, + ) + + # Analogous to predictions, check weights for activity regularization. + activity_regularized_kernel = model_weights[ + "activity_regularized/kernel:0" + ] + example_activity_weights = ( + (1.0 - activity_regularized_kernel) + / learning_rate + * global_batch_size + ) + self.assertEqual( + activity_regularized_kernel.shape, (global_batch_size + 1, 1) + ) + actual_example_activity_fullbatch = self.reduce_assert_equal( + example_activity_weights[:-1, 0] + ) + self.assertAllClose( + example_activity_fullbatch, + actual_example_activity_fullbatch, + rtol=rtol, + ) + actual_example_activity_singleton = example_activity_weights[-1, 0] + self.assertAllClose( + example_activity_singleton, + actual_example_activity_singleton, + rtol=rtol, + ) + + # The total weight of weight regularization is the change of this + # (otherwise unused) bias term from its initial value 1, + # adjusted for the expected scaling by SGD's learning_rate. + actual_total_weight_regularization = ( + 1.0 - model_weights["weight_regularized/bias:0"][0] + ) / learning_rate + self.assertAllClose( + total_weight_regularization, + actual_total_weight_regularization, + rtol=rtol, + ) + + def reduce_assert_equal(self, x): + """Returns first element of x and asserts all others are equal.""" + result = x[0] + for i, value in enumerate(x[1:]): + self.assertAllEqual(result, value, msg=f"at position {i=}") + return result + + def _get_var_value(self, var): + """Returns the (unique) value of a (possibly distributed) Variable.""" + if hasattr(var, "values"):  # Distributed.
+ result = self.reduce_assert_equal([v.value() for v in var.values]) + else: + result = var.value() + return result + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/integration_test/custom_object_saving_test.py b/keras/integration_test/custom_object_saving_test.py index a9d8eb97911d..3c20d80d42a2 100644 --- a/keras/integration_test/custom_object_saving_test.py +++ b/keras/integration_test/custom_object_saving_test.py @@ -16,137 +16,136 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import os import sys -from absl.testing import parameterized -from keras.saving.experimental import saving_lib -from keras.testing_infra import test_utils -from keras.utils import generic_utils + import numpy as np import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.testing_infra import test_utils +from keras.utils import get_custom_objects # `tf.print` message is only available in stderr in TF2, which this test checks. @test_utils.run_v2_only class CustomObjectSavingTest(tf.test.TestCase, parameterized.TestCase): - """Test for custom Keras object saving with `register_keras_serializable`.""" - - def setUp(self): - super().setUp() - generic_utils.get_custom_objects().clear() - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], idempotent_saving_enabled=[True, False])) - def test_register_keras_serializable_correct_class(self, - idempotent_saving_enabled): - saving_lib._ENABLED = idempotent_saving_enabled - - train_step_message = 'This is my training step' - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - - @tf.keras.utils.register_keras_serializable('CustomModelX') - class CustomModelX(tf.keras.Model): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.dense1 = MyDense( - 1, - kernel_regularizer=MyRegularizer(0.01), - activity_regularizer=MyRegularizer(0.01)) - - def call(self, inputs): - return self.dense1(inputs) - - def train_step(self, data): - tf.print(train_step_message) - x, y = data - with tf.GradientTape() as tape: - y_pred = self(x) - loss = self.compiled_loss(y, y_pred) - - gradients = tape.gradient(loss, self.trainable_variables) - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - return {} - - def one(self): - return 1 - - @tf.keras.utils.register_keras_serializable('MyDense') - class MyDense(tf.keras.layers.Dense): - - def two(self): - return 2 - - @tf.keras.utils.register_keras_serializable('MyAdam') - class MyAdam(tf.keras.optimizers.Adam): - - def three(self): - return 3 - - @tf.keras.utils.register_keras_serializable('MyLoss') - class MyLoss(tf.keras.losses.MeanSquaredError): - - def four(self): - return 4 - - @tf.keras.utils.register_keras_serializable('MyMetric') - class MyMetric(tf.keras.metrics.MeanAbsoluteError): - - def five(self): - return 5 - - @tf.keras.utils.register_keras_serializable('MyRegularizer') - class MyRegularizer(tf.keras.regularizers.L2): - - def six(self): - return 6 - - @tf.keras.utils.register_keras_serializable('my_sq_diff') - def my_sq_diff(y_true, y_pred): - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - sq_diff_plus_x = tf.math.squared_difference(y_pred, y_true) - return tf.reduce_mean(sq_diff_plus_x, axis=-1) - - subclassed_model = CustomModelX() - subclassed_model.compile( - optimizer=MyAdam(), loss=MyLoss(), metrics=[MyMetric(), 
my_sq_diff]) - - x = np.random.random((100, 32)) - y = np.random.random((100, 1)) - subclassed_model.fit(x, y, epochs=1) - subclassed_model.save(temp_dir, save_format='tf') - - loaded_model = tf.keras.models.load_model(temp_dir) - - # `tf.print` writes to stderr. - with self.captureWritesToStream(sys.stderr) as printed: - loaded_model.fit(x, y, epochs=1) - self.assertRegex(printed.contents(), train_step_message) - - # Check that the custom classes do get used. - self.assertIs(loaded_model.__class__, CustomModelX) - self.assertIs(loaded_model.optimizer.__class__, MyAdam) - self.assertIs(loaded_model.compiled_loss._losses[0].__class__, MyLoss) - self.assertIs(loaded_model.compiled_metrics._metrics[0].__class__, MyMetric) - self.assertIs(loaded_model.compiled_metrics._metrics[1], my_sq_diff) - self.assertIs(loaded_model.layers[0].__class__, MyDense) - self.assertIs(loaded_model.layers[0].activity_regularizer.__class__, - MyRegularizer) - self.assertIs(loaded_model.layers[0].kernel_regularizer.__class__, - MyRegularizer) - - # Check that the custom methods are available. - self.assertEqual(loaded_model.one(), 1) - self.assertEqual(loaded_model.layers[0].two(), 2) - self.assertEqual(loaded_model.optimizer.three(), 3) - self.assertEqual(loaded_model.compiled_loss._losses[0].four(), 4) - self.assertEqual(loaded_model.compiled_metrics._metrics[0].five(), 5) - self.assertEqual(loaded_model.layers[0].activity_regularizer.six(), 6) - self.assertEqual(loaded_model.layers[0].kernel_regularizer.six(), 6) - self.assertEqual(loaded_model.compiled_metrics._metrics[1]([1], [3]), 4) - - -if __name__ == '__main__': - tf.test.main() + """Test for custom Keras object saving with + `register_keras_serializable`.""" + + def setUp(self): + super().setUp() + get_custom_objects().clear() + + def test_register_keras_serializable_correct_class(self): + train_step_message = "This is my training step" + temp_dir = os.path.join(self.get_temp_dir(), "my_model") + + @tf.keras.utils.register_keras_serializable("CustomModelX") + class CustomModelX(tf.keras.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dense1 = MyDense( + 1, + kernel_regularizer=MyRegularizer(0.01), + activity_regularizer=MyRegularizer(0.01), + ) + + def call(self, inputs): + return self.dense1(inputs) + + def train_step(self, data): + tf.print(train_step_message) + x, y = data + with tf.GradientTape() as tape: + y_pred = self(x) + loss = self.compiled_loss(y, y_pred) + + gradients = tape.gradient(loss, self.trainable_variables) + self.optimizer.apply_gradients( + zip(gradients, self.trainable_variables) + ) + return {} + + def one(self): + return 1 + + @tf.keras.utils.register_keras_serializable("MyDense") + class MyDense(tf.keras.layers.Dense): + def two(self): + return 2 + + @tf.keras.utils.register_keras_serializable("MyAdam") + class MyAdam(tf.keras.optimizers.Adam): + def three(self): + return 3 + + @tf.keras.utils.register_keras_serializable("MyLoss") + class MyLoss(tf.keras.losses.MeanSquaredError): + def four(self): + return 4 + + @tf.keras.utils.register_keras_serializable("MyMetric") + class MyMetric(tf.keras.metrics.MeanAbsoluteError): + def five(self): + return 5 + + @tf.keras.utils.register_keras_serializable("MyRegularizer") + class MyRegularizer(tf.keras.regularizers.L2): + def six(self): + return 6 + + @tf.keras.utils.register_keras_serializable("my_sq_diff") + def my_sq_diff(y_true, y_pred): + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + sq_diff_plus_x = 
tf.math.squared_difference(y_pred, y_true) + return tf.reduce_mean(sq_diff_plus_x, axis=-1) + + subclassed_model = CustomModelX() + subclassed_model.compile( + optimizer=MyAdam(), loss=MyLoss(), metrics=[MyMetric(), my_sq_diff] + ) + + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + subclassed_model.fit(x, y, epochs=1) + subclassed_model.save(temp_dir, save_format="tf") + + loaded_model = tf.keras.models.load_model(temp_dir) + + # `tf.print` writes to stderr. + with self.captureWritesToStream(sys.stderr) as printed: + loaded_model.fit(x, y, epochs=1) + self.assertRegex(printed.contents(), train_step_message) + + # Check that the custom classes do get used. + self.assertIs(loaded_model.__class__, CustomModelX) + self.assertIs(loaded_model.optimizer.__class__, MyAdam) + self.assertIs(loaded_model.compiled_loss._losses[0].__class__, MyLoss) + self.assertIs( + loaded_model.compiled_metrics._metrics[0].__class__, MyMetric + ) + self.assertIs(loaded_model.compiled_metrics._metrics[1], my_sq_diff) + self.assertIs(loaded_model.layers[0].__class__, MyDense) + self.assertIs( + loaded_model.layers[0].activity_regularizer.__class__, MyRegularizer + ) + self.assertIs( + loaded_model.layers[0].kernel_regularizer.__class__, MyRegularizer + ) + + # Check that the custom methods are available. + self.assertEqual(loaded_model.one(), 1) + self.assertEqual(loaded_model.layers[0].two(), 2) + self.assertEqual(loaded_model.optimizer.three(), 3) + self.assertEqual(loaded_model.compiled_loss._losses[0].four(), 4) + self.assertEqual(loaded_model.compiled_metrics._metrics[0].five(), 5) + self.assertEqual(loaded_model.layers[0].activity_regularizer.six(), 6) + self.assertEqual(loaded_model.layers[0].kernel_regularizer.six(), 6) + self.assertEqual(loaded_model.compiled_metrics._metrics[1]([1], [3]), 4) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/integration_test/distributed_training_test.py b/keras/integration_test/distributed_training_test.py index 8f2ec67905cc..a0aa112d998b 100644 --- a/keras/integration_test/distributed_training_test.py +++ b/keras/integration_test/distributed_training_test.py @@ -17,7 +17,11 @@ from __future__ import division from __future__ import print_function +import glob +import os + import tensorflow.compat.v2 as tf + ds_combinations = tf.__internal__.distribute.combinations # Note: Strategy combinations are not (yet) public APIs, so they are subject @@ -25,7 +29,7 @@ # TODO(b/188763034): Proceed to export the strategy combinations as public APIs. STRATEGIES = [ ds_combinations.default_strategy, - ds_combinations.mirrored_strategy_with_cpu_1_and_2, + ds_combinations.mirrored_strategy_with_two_cpus, ds_combinations.mirrored_strategy_with_two_gpus, ds_combinations.tpu_strategy, ds_combinations.cloud_tpu_strategy, @@ -38,39 +42,88 @@ @ds_combinations.generate( - tf.__internal__.test.combinations.combine( - strategy=STRATEGIES, mode="eager")) + tf.__internal__.test.combinations.combine(strategy=STRATEGIES, mode="eager") +) class DistributedTrainingTest(tf.test.TestCase): - """Test to demonstrate basic Keras training with a variety of strategies.""" + """Test to demonstrate basic Keras training with a variety of strategies.""" + + def testKerasTrainingAPI(self, strategy): + if not tf.__internal__.tf2.enabled() and isinstance( + strategy, tf.distribute.experimental.ParameterServerStrategy + ): + self.skipTest( + "Parameter Server strategy with dataset creator needs to be run " + "when eager execution is enabled."
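Aside: the custom-object test earlier in this diff hinges on `tf.keras.utils.register_keras_serializable`. A minimal sketch of that mechanism; the package name, class name, and save path below are made up for illustration:

```python
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="Demo")
class TwoXDense(tf.keras.layers.Dense):
    """A registered subclass: load_model can rebuild it without custom_objects."""

    def call(self, inputs):
        return 2.0 * super().call(inputs)


model = tf.keras.Sequential([TwoXDense(1, input_shape=(4,))])
model.save("/tmp/demo_model", save_format="tf")
# Deserialization finds TwoXDense via its "Demo>TwoXDense" registry key.
reloaded = tf.keras.models.load_model("/tmp/demo_model")
assert isinstance(reloaded.layers[0], TwoXDense)
```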
+ ) + + # A `dataset_fn` is required for `Model.fit` to work across all + # strategies. + def dataset_fn(input_context): + batch_size = input_context.get_per_replica_batch_size( + global_batch_size=64 + ) + x = tf.random.uniform((10, 10)) + y = tf.random.uniform((10,)) + dataset = ( + tf.data.Dataset.from_tensor_slices((x, y)).shuffle(10).repeat() + ) + dataset = dataset.shard( + input_context.num_input_pipelines, + input_context.input_pipeline_id, + ) + return dataset.batch(batch_size).prefetch(2) + + with strategy.scope(): + model = tf.keras.Sequential([tf.keras.layers.Dense(10)]) + optimizer = tf.keras.optimizers.SGD() + model.compile(optimizer, loss="mse", steps_per_execution=5) + + x = tf.keras.utils.experimental.DatasetCreator(dataset_fn) - def testKerasTrainingAPI(self, strategy): - if (not tf.__internal__.tf2.enabled() - and isinstance(strategy, - tf.distribute.experimental.ParameterServerStrategy)): - self.skipTest( - "Parameter Server strategy with dataset creator need to be run when " - "eager execution is enabled.") + logdir = os.path.join(self.get_temp_dir(), "logdir") + model.fit( + x, + epochs=2, + steps_per_epoch=20, + callbacks=[ + tf.keras.callbacks.TensorBoard( + logdir, + update_freq=5, + write_steps_per_second=True, + ) + ], + ) - # A `dataset_fn` is required for `Model.fit` to work across all strategies. - def dataset_fn(input_context): - batch_size = input_context.get_per_replica_batch_size( - global_batch_size=64) - x = tf.random.uniform((10, 10)) - y = tf.random.uniform((10,)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(10).repeat() - dataset = dataset.shard( - input_context.num_input_pipelines, input_context.input_pipeline_id) - return dataset.batch(batch_size).prefetch(2) + events_got = [] + for event_file in glob.glob(logdir + "/train/events.out.*"): + for event in tf.compat.v1.train.summary_iterator(event_file): + if not event.summary: + continue + for value in event.summary.value: + if value.tag != "batch_loss": + continue + events_got += [event.step] - with strategy.scope(): - model = tf.keras.Sequential([tf.keras.layers.Dense(10)]) - optimizer = tf.keras.optimizers.SGD() - model.compile(optimizer, loss="mse", steps_per_execution=10) + # total steps = epochs * steps_per_epoch + events_expected = [5, 10, 15, 20, 25, 30, 35, 40] - x = tf.keras.utils.experimental.DatasetCreator(dataset_fn) + if isinstance( + strategy, tf.distribute.experimental.ParameterServerStrategy + ): + # Metrics are not logged with this strategy as they are not + # immediately available on batch end + events_expected = [] + if ( + strategy.cluster_resolver + and strategy.cluster_resolver.task_type == "worker" + ): + # The below assertion is run by both chief and workers when using + # `tf.distribute.MultiWorkerMirroredStrategy`, but only the chief + # will log events. 
+ events_expected = [] - model.fit(x, epochs=2, steps_per_epoch=10) + self.assertEqual(events_got, events_expected) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/integration_test/extension_type_test.py b/keras/integration_test/extension_type_test.py new file mode 100644 index 000000000000..a7a0d050566f --- /dev/null +++ b/keras/integration_test/extension_type_test.py @@ -0,0 +1,102 @@ +"""Test Model inference and save/load with an ExtensionType.""" + +import os +import typing + +import tensorflow.compat.v2 as tf + +import keras +from keras.engine.input_layer import Input +from keras.engine.training import Model +from keras.saving.saving_api import load_model +from keras.testing_infra import test_utils + + +class MaskedTensor(tf.experimental.BatchableExtensionType): + """Example subclass of ExtensionType, used for testing. + + This version adds the Keras-required properties to MaskedTensor and its Spec + class, to test Keras integration. + """ + + __name__ = "tf.test.MaskedTensor.Spec" + + values: typing.Union[tf.Tensor, tf.RaggedTensor] + mask: typing.Union[tf.Tensor, tf.RaggedTensor] + + def __init__(self, values, mask): + if isinstance(values, tf.RaggedTensor): + assert isinstance(mask, tf.RaggedTensor) + assert mask.dtype == tf.dtypes.bool + else: + values = tf.convert_to_tensor(values) + mask = tf.convert_to_tensor(mask, tf.dtypes.bool) + self.values = values + self.mask = mask + + # Required by assert_input_compatibility in keras/engine/input_spec.py + @property + def shape(self): + return self.values.shape + + @property + def dtype(self): + return self.values.dtype + + class Spec: + + # Required by KerasTensor.shape in keras/engine/keras_tensor.py + @property + def shape(self): + return self.values._shape + + +class ExtensionTypeTest(tf.test.TestCase): + @test_utils.run_v2_only + def testKerasModel(self): + mt_spec = MaskedTensor.Spec( + tf.TensorSpec(shape=[None, 1], dtype=tf.dtypes.int32), + tf.TensorSpec(shape=[None, 1], dtype=tf.dtypes.bool), + ) + model_input = Input(type_spec=mt_spec) + model_output = keras.layers.Lambda( + lambda x: tf.identity(x, name="output") + )(model_input) + model = Model(inputs=model_input, outputs=model_output) + mt = MaskedTensor([[1], [2], [3]], [[True], [False], [True]]) + self.assertEqual(model(mt), mt) + ds = tf.data.Dataset.from_tensors(mt) + self.assertEqual(model.predict(ds), mt) + + with self.subTest("keras save"): + path = self.create_tempdir().full_path + model.save(path) + loaded_model = load_model(path) + self.assertEqual(loaded_model.input.type_spec, mt_spec) + self.assertEqual(loaded_model(mt), mt) + + loaded_fn = tf.saved_model.load(path) + self.assertEqual(loaded_fn(mt), mt) + with self.assertRaisesRegex( + ValueError, + "Could not find matching concrete function to call " + "loaded from the SavedModel", + ): + loaded_fn(MaskedTensor([1, 2, 3], [True, False, True])) + + # The serving_fn uses a flattened signature. + serving_fn = loaded_fn.signatures["serving_default"] + self.assertEqual( + serving_fn(args_0=mt.values, args_0_1=mt.mask)["lambda"], mt + ) + + with self.subTest("keras v3"): + path = os.path.join(self.create_tempdir().full_path, "model.keras") + model.save(path) + loaded_model = load_model(path, safe_mode=False) + self.assertEqual(loaded_model.input.type_spec, mt_spec) + self.assertEqual(loaded_model(mt), mt) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/integration_test/fit_test.py
b/keras/integration_test/fit_test.py new file mode 100644 index 000000000000..bbd0134d4cba --- /dev/null +++ b/keras/integration_test/fit_test.py @@ -0,0 +1,101 @@ +"""Test Model.fit across a diverse range of models.""" + +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.integration_test.models import bert +from keras.integration_test.models import dcgan +from keras.integration_test.models import edge_case_model +from keras.integration_test.models import efficientnet_v2 +from keras.integration_test.models import input_spec +from keras.integration_test.models import low_level_model +from keras.integration_test.models import mini_unet +from keras.integration_test.models import mini_xception +from keras.integration_test.models import retinanet +from keras.integration_test.models import structured_data_classification +from keras.integration_test.models import text_classification +from keras.integration_test.models import timeseries_forecasting +from keras.integration_test.models import vae +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + +# from keras.integration_test.models import ctc_speech_rnn +# from keras.integration_test.models import translation + + +def get_dataset(data_specs, batch_size): + values = tf.nest.map_structure(input_spec.spec_to_value, data_specs) + dataset = ( + tf.data.Dataset.from_tensor_slices(values) + .prefetch(batch_size * 2) + .batch(batch_size) + ) + return dataset + + +@test_utils.run_v2_only +class FitTest(test_combinations.TestCase): + @parameterized.named_parameters( + ("bert", bert), + # ("ctc_speech_rnn", ctc_speech_rnn), # Buggy? + ("dcgan", dcgan), + ("edge_case_model", edge_case_model), + ("efficientnet_v2", efficientnet_v2), + ("low_level_model", low_level_model), + ("mini_unet", mini_unet), + ("mini_xception", mini_xception), + ("retinanet", retinanet), + ("structured_data_classification", structured_data_classification), + ("text_classification", text_classification), + ("timeseries_forecasting", timeseries_forecasting), + # ("translation", translation), # Buggy? + ("vae", vae), + ) + def test_fit_on_all_models_with_sync_preprocessing(self, module): + batch_size = 4 + data_specs = module.get_data_spec(batch_size * 3) + dataset = get_dataset(data_specs, batch_size) + + model = module.get_model( + build=True, + compile=True, + jit_compile=False, + include_preprocessing=True, + ) + model.fit(dataset, epochs=1) + + @parameterized.named_parameters( + ("bert", bert), + # ("ctc_speech_rnn", ctc_speech_rnn), # Buggy? + ("dcgan", dcgan), + ("edge_case_model", edge_case_model), + ("efficientnet_v2", efficientnet_v2), + ("low_level_model", low_level_model), + # ("mini_unet", mini_unet), # Not XLA compatible b/c of UpSampling2D + ("mini_xception", mini_xception), + # ("retinanet", retinanet), # Not XLA compatible b/c of UpSampling2D + ("structured_data_classification", structured_data_classification), + ("text_classification", text_classification), + ("timeseries_forecasting", timeseries_forecasting), + # ("translation", translation), # Buggy? 
+ ("vae", vae), + ) + def test_fit_on_all_models_with_async_preprocessing_and_xla(self, module): + batch_size = 4 + data_specs = module.get_data_spec(batch_size * 3) + dataset = get_dataset(data_specs, batch_size) + preprocessor = module.get_input_preprocessor() + if preprocessor is not None: + dataset = dataset.map(lambda x, y: (preprocessor(x), y)) + + model = module.get_model( + build=True, + compile=True, + jit_compile=True, + include_preprocessing=False, + ) + model.fit(dataset, epochs=1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/integration_test/forwardprop_test.py b/keras/integration_test/forwardprop_test.py index e786a16e0190..5ef71e591454 100644 --- a/keras/integration_test/forwardprop_test.py +++ b/keras/integration_test/forwardprop_test.py @@ -15,301 +15,348 @@ import functools -from absl.testing import parameterized import numpy as np import tensorflow.compat.v2 as tf +from absl.testing import parameterized def _jvp(f, primals, tangents): - """Compute the jacobian of `f` at `primals` multiplied by `tangents`.""" - with tf.autodiff.ForwardAccumulator(primals, tangents) as acc: - primals_out = f(*primals) - return primals_out, acc.jvp( - primals_out, unconnected_gradients=tf.UnconnectedGradients.ZERO) + """Compute the jacobian of `f` at `primals` multiplied by `tangents`.""" + with tf.autodiff.ForwardAccumulator(primals, tangents) as acc: + primals_out = f(*primals) + return primals_out, acc.jvp( + primals_out, unconnected_gradients=tf.UnconnectedGradients.ZERO + ) def _jacfwd(f, primals): - """Compute the jacobian of `f` at `primals` using forward-mode autodiff.""" - jac_flat = [] - flat_primals = tf.nest.flatten(primals) - tangent_mask = [tf.zeros_like(primal) for primal in flat_primals] - for primal_index, primal in enumerate(flat_primals): - primal_vector = tf.reshape(primal, [-1]) - primal_vector_length = tf.size(primal_vector) - jac_columns = [] - for element_index in tf.range(primal_vector_length): - mask = tf.one_hot(element_index, primal_vector_length) - tangent_mask[primal_index] = tf.reshape(mask, tf.shape(primal)) - jac_columns.append( - tf.nest.map_structure( - functools.partial(tf.reshape, shape=[-1]), - _jvp(f, primals, tf.nest.pack_sequence_as(primals, - tangent_mask))[1])) - jac_flat.append(tf.stack(jac_columns, axis=1)) - tangent_mask[primal_index] = tf.zeros_like(primal) - return tf.nest.pack_sequence_as(primals, jac_flat) + """Compute the jacobian of `f` at `primals` using forward-mode autodiff.""" + jac_flat = [] + flat_primals = tf.nest.flatten(primals) + tangent_mask = [tf.zeros_like(primal) for primal in flat_primals] + for primal_index, primal in enumerate(flat_primals): + primal_vector = tf.reshape(primal, [-1]) + primal_vector_length = tf.size(primal_vector) + jac_columns = [] + for element_index in tf.range(primal_vector_length): + mask = tf.one_hot(element_index, primal_vector_length) + tangent_mask[primal_index] = tf.reshape(mask, tf.shape(primal)) + jac_columns.append( + tf.nest.map_structure( + functools.partial(tf.reshape, shape=[-1]), + _jvp( + f, + primals, + tf.nest.pack_sequence_as(primals, tangent_mask), + )[1], + ) + ) + jac_flat.append(tf.stack(jac_columns, axis=1)) + tangent_mask[primal_index] = tf.zeros_like(primal) + return tf.nest.pack_sequence_as(primals, jac_flat) def _grad(f, argnums=0): - """Return a function which computes the gradient of `f`.""" + """Return a function which computes the gradient of `f`.""" - def _f(*params): - with tf.GradientTape() as tape: - tape.watch(params) - primals_out = f(*params) 
- return tape.gradient( - primals_out, - params[argnums], - unconnected_gradients=tf.UnconnectedGradients.ZERO) + def _f(*params): + with tf.GradientTape() as tape: + tape.watch(params) + primals_out = f(*params) + return tape.gradient( + primals_out, + params[argnums], + unconnected_gradients=tf.UnconnectedGradients.ZERO, + ) - return _f + return _f def _hvp(f, primals, tangents): - """Compute a forward-over-back Hessian-vector product.""" - with tf.autodiff.ForwardAccumulator(primals, tangents) as acc: - with tf.GradientTape() as tape: - tape.watch(primals) - f_out = f(*primals) - f_out.shape.assert_is_compatible_with([]) - return acc.jvp(tape.gradient(f_out, primals)) + """Compute a forward-over-back Hessian-vector product.""" + with tf.autodiff.ForwardAccumulator(primals, tangents) as acc: + with tf.GradientTape() as tape: + tape.watch(primals) + f_out = f(*primals) + f_out.shape.assert_is_compatible_with([]) + return acc.jvp(tape.gradient(f_out, primals)) def _vectorize_parameters(f, params, use_pfor, dtype): - """Loop over `params`, providing a one-hot mask to `f` for each.""" - parameter_sizes = [tf.size(param) for param in params] - total_size = tf.math.add_n(parameter_sizes) + """Loop over `params`, providing a one-hot mask to `f` for each.""" + parameter_sizes = [tf.size(param) for param in params] + total_size = tf.math.add_n(parameter_sizes) - def _wrapper(index): - full_onehot = tf.one_hot(index, total_size) - split_onehot = tf.split(full_onehot, parameter_sizes) - tangents = [ - tf.reshape(v, tf.shape(param)) - for param, v in zip(params, split_onehot) - ] - return f(tangents) + def _wrapper(index): + full_onehot = tf.one_hot(index, total_size) + split_onehot = tf.split(full_onehot, parameter_sizes) + tangents = [ + tf.reshape(v, tf.shape(param)) + for param, v in zip(params, split_onehot) + ] + return f(tangents) - if use_pfor: - return tf.vectorized_map(_wrapper, tf.range(total_size)) - else: - return tf.map_fn(_wrapper, tf.range(total_size), dtype) + if use_pfor: + return tf.vectorized_map(_wrapper, tf.range(total_size)) + else: + return tf.map_fn(_wrapper, tf.range(total_size), dtype) def _forward_over_back_hessian(f, params, use_pfor, dtype=None): - """Computes the full Hessian matrix for the scalar-valued f(*params). - - Args: - f: A function taking `params` and returning a scalar. - params: A possibly nested structure of tensors. - use_pfor: If true, uses `tf.vectorized_map` calls instead of looping. - dtype: Required if `use_pfor=False`. A possibly nested structure of dtypes - (e.g. `tf.float32`) matching the structure of `f`'s returns. - - Returns: - A possibly nested structure of matrix slices corresponding to `params`. Each - slice has shape [P, p_s] where `p_s` is the number of parameters (`tf.size`) - in the corresponding element of `params` and `P` is the total number of - parameters (`sum_s(p_s)`). The full matrix can be obtained by concatenating - along the second axis. 
- """ - return _vectorize_parameters( - functools.partial(_hvp, f, params), - params, - use_pfor=use_pfor, - dtype=dtype) - - -def _test_gradients(testcase, - f, - primals, - order, - delta=1e-3, - rtol=1e-2, - atol=1e-6): - """Tests forward/backward jacobians of `f`'s [0, `order`)-order gradients.""" - if order < 1: - raise ValueError( - "`order` should be a positive integer, got '{}'.".format(order)) - if order > 1: - _test_gradients( - testcase=testcase, - f=_grad(f), - primals=primals, - order=order - 1, - delta=delta, - rtol=rtol, - atol=atol) - sym_jac_back, num_jac = tf.test.compute_gradient(f, primals, delta=delta) - testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - sym_jac_fwd = _jacfwd(f, primals) - testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) - # And the symbolic computations should be much closer. - testcase.assertAllClose(sym_jac_back, sym_jac_fwd) + """Computes the full Hessian matrix for the scalar-valued f(*params). + + Args: + f: A function taking `params` and returning a scalar. + params: A possibly nested structure of tensors. + use_pfor: If true, uses `tf.vectorized_map` calls instead of looping. + dtype: Required if `use_pfor=False`. A possibly nested structure of dtypes + (e.g. `tf.float32`) matching the structure of `f`'s returns. + + Returns: + A possibly nested structure of matrix slices corresponding to `params`. + Each slice has shape [P, p_s] where `p_s` is the number of parameters + (`tf.size`) in the corresponding element of `params` and `P` is the total + number of parameters (`sum_s(p_s)`). The full matrix can be obtained by + concatenating along the second axis. + """ + return _vectorize_parameters( + functools.partial(_hvp, f, params), + params, + use_pfor=use_pfor, + dtype=dtype, + ) + + +def _test_gradients( + testcase, f, primals, order, delta=1e-3, rtol=1e-2, atol=1e-6 +): + """Tests forward/backward jacobians of `f`'s [0, `order`)-order + gradients.""" + if order < 1: + raise ValueError( + f"`order` should be a positive integer, got '{order}'." + ) + if order > 1: + _test_gradients( + testcase=testcase, + f=_grad(f), + primals=primals, + order=order - 1, + delta=delta, + rtol=rtol, + atol=atol, + ) + sym_jac_back, num_jac = tf.test.compute_gradient(f, primals, delta=delta) + testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) + sym_jac_fwd = _jacfwd(f, primals) + testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) + # And the symbolic computations should be much closer. + testcase.assertAllClose(sym_jac_back, sym_jac_fwd) class ForwardpropTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters([ - ("Dense", [[0.1]], functools.partial(tf.keras.layers.Dense, 5)), - ("Conv2D", - np.reshape( - np.arange(start=-1., stop=1., step=2. / (1 * 2 * 4 * 4)), - [1, 2, 4, 4]), functools.partial(tf.keras.layers.Conv2D, 2, 2), 1e-3) - ]) - def testKerasLayers(self, value, op_fn, atol=1e-6): - layer = op_fn() - input_value = tf.constant(value, dtype=tf.float32) - layer.build(input_value.shape) - # Make sure the test is deterministic by avoiding random variable - # initialization. - for v in layer.trainable_variables: - v.assign( - tf.reshape( - tf.range( - -1., - 1., - 2. / tf.size(v, out_type=tf.float32), - dtype=tf.float32), v.shape)) - _test_gradients( - self, - layer, - [input_value], - atol=atol, - # These are linear, so second-order is pretty boring. 
- order=2) - - @parameterized.named_parameters([ - ("NonFused", [[0.1], [0.2], [-0.3]], - functools.partial(tf.keras.layers.BatchNormalization, fused=False)), - ("Fused", [[[[0.1, 2.]]], [[[0.2, -3.]]], [[[-0.3, 4.]]]], - functools.partial(tf.keras.layers.BatchNormalization, fused=True)) - ]) - def testBatchNorm(self, value, op_fn): - for training in [True, False]: - layer = op_fn() - input_value = tf.constant(value, dtype=tf.float32) - layer.build(input_value.shape) - _test_gradients( - self, - functools.partial(layer, training=training), [input_value], - order=2, - atol=1e-3) - - @parameterized.named_parameters([ - ("NonFused", [[0.1], [0.2], [-0.3]], - functools.partial(tf.keras.layers.BatchNormalization, fused=False)), - ("Fused", [[[[0.1, 2.]]], [[[0.2, -3.]]], [[[-0.3, 4.]]]], - functools.partial(tf.keras.layers.BatchNormalization, fused=True)) - ]) - def testBatchNormLayerParamGrads(self, value, op_fn): - for training in [True, False]: - layer = op_fn() - with tf.GradientTape() as tape: + @parameterized.named_parameters( + [ + ("Dense", [[0.1]], functools.partial(tf.keras.layers.Dense, 5)), + ( + "Conv2D", + np.reshape( + np.arange(start=-1.0, stop=1.0, step=2.0 / (1 * 2 * 4 * 4)), + [1, 2, 4, 4], + ), + functools.partial(tf.keras.layers.Conv2D, 2, 2), + 1e-3, + ), + ] + ) + def testKerasLayers(self, value, op_fn, atol=1e-6): + layer = op_fn() input_value = tf.constant(value, dtype=tf.float32) - tape.watch(input_value) - output = layer(input_value, training=training) - jac_back = tape.jacobian(output, - [input_value] + layer.trainable_variables) - jac_forward = _jacfwd( - lambda *args: layer(args[0], training=training), # pylint:disable=cell-var-from-loop - [input_value] + layer.trainable_variables) - for backward, forward in zip(jac_back, jac_forward): - forward = tf.reshape(forward, tf.shape(backward)) - self.assertAllClose(backward, forward) - - @parameterized.named_parameters([("Function", tf.function), - ("NoFunction", lambda f: f)]) - def testVariablesHVP(self, decorator): - - class _Model(tf.Module): - - def __init__(self): - self._first_dense = tf.keras.layers.Dense(18) - self._conv = tf.keras.layers.Conv2D(2, 2) - self._norm = tf.keras.layers.BatchNormalization() - self._second_dense = tf.keras.layers.Dense(1) - - def __call__(self, x): - x = self._first_dense(x) - x = tf.nn.relu(x) - x = self._norm(x) - x = tf.nn.relu(self._conv(tf.reshape(x, [-1, 2, 3, 3]))) - return self._second_dense(x) - - model = _Model() - - def _loss(): - input_value = tf.constant([[-0.5, 1.], [0.5, -1.]]) - target = tf.constant([[-1.], [2.]]) - return tf.math.reduce_sum((model(input_value) - target)**2.) 
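(A brief aside on the forward-over-back pattern these tests exercise; the snippet below is an illustrative, self-contained sketch with made-up values, not part of the patch. The backward tape produces the gradient, the forward accumulator pushes a tangent through that computation, and `acc.jvp(grad)` is then a Hessian-vector product computed without ever materializing the Hessian; looping this over one-hot tangents, as `_vectorize_parameters` does, recovers the full Hessian one column at a time.)

    import tensorflow as tf

    def f(x):
        # Scalar objective; its Hessian is diag(6 * x).
        return tf.reduce_sum(x ** 3)

    x = tf.constant([0.5, -1.0])
    tangent = tf.constant([1.0, 0.0])  # one-hot tangent selects one Hessian column

    with tf.autodiff.ForwardAccumulator(x, tangent) as acc:
        with tf.GradientTape() as tape:
            tape.watch(x)
            y = f(x)
        grad = tape.gradient(y, x)  # 3 * x**2
    hvp = acc.jvp(grad)  # H @ tangent -> [3.0, 0.0] for these values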
- - @decorator - def _compute_hvps(): - with tf.GradientTape() as tape: - loss = _loss() - vector = tape.gradient(loss, model.trainable_variables) - variable_input_fn = lambda unused_variables: _loss() - forward_over_back_hvp, = _hvp(variable_input_fn, - [model.trainable_variables], [vector]) - with tf.GradientTape(persistent=True) as tape: - tape.watch(model.trainable_variables) - loss = _loss() - first_grads = tape.gradient(loss, model.trainable_variables) - back_over_back_hvp = tape.gradient( - first_grads, model.trainable_variables, output_gradients=vector) - return forward_over_back_hvp, back_over_back_hvp - - self.assertAllClose(*_compute_hvps(), rtol=1e-5, atol=1e-5) - - def testEmbeddingLayerInFunction(self): - - class M(tf.keras.Model): - - def __init__(self): - super().__init__() - self.embed = tf.keras.layers.Embedding(5, 1) - self.proj = tf.keras.layers.Dense(1) - - @tf.function - def call(self, x): - return self.proj(self.embed(x)) - - model = M() - model(tf.zeros([3, 3], dtype=tf.int32)) # pylint: disable=not-callable - parameters = model.embed.variables - tangents = [tf.ones_like(v) for v in parameters] - with tf.autodiff.ForwardAccumulator(parameters, tangents): - # Note that forwardprop runs alongside the original computation. This test - # is just checking that it doesn't crash; correctness is tested in core - # TF. - model(tf.zeros([3, 3], dtype=tf.int32)) # pylint: disable=not-callable + layer.build(input_value.shape) + # Make sure the test is deterministic by avoiding random variable + # initialization. + for v in layer.trainable_variables: + v.assign( + tf.reshape( + tf.range( + -1.0, + 1.0, + 2.0 / tf.size(v, out_type=tf.float32), + dtype=tf.float32, + ), + v.shape, + ) + ) + _test_gradients( + self, + layer, + [input_value], + atol=atol, + # These are linear, so second-order is pretty boring. 
+ order=2, + ) + + @parameterized.named_parameters( + [ + ( + "NonFused", + [[0.1], [0.2], [-0.3]], + functools.partial( + tf.keras.layers.BatchNormalization, fused=False + ), + ), + ( + "Fused", + [[[[0.1, 2.0]]], [[[0.2, -3.0]]], [[[-0.3, 4.0]]]], + functools.partial( + tf.keras.layers.BatchNormalization, fused=True + ), + ), + ] + ) + def testBatchNorm(self, value, op_fn): + for training in [True, False]: + layer = op_fn() + input_value = tf.constant(value, dtype=tf.float32) + layer.build(input_value.shape) + _test_gradients( + self, + functools.partial(layer, training=training), + [input_value], + order=2, + atol=1e-3, + ) + + @parameterized.named_parameters( + [ + ( + "NonFused", + [[0.1], [0.2], [-0.3]], + functools.partial( + tf.keras.layers.BatchNormalization, fused=False + ), + ), + ( + "Fused", + [[[[0.1, 2.0]]], [[[0.2, -3.0]]], [[[-0.3, 4.0]]]], + functools.partial( + tf.keras.layers.BatchNormalization, fused=True + ), + ), + ] + ) + def testBatchNormLayerParamGrads(self, value, op_fn): + for training in [True, False]: + layer = op_fn() + with tf.GradientTape() as tape: + input_value = tf.constant(value, dtype=tf.float32) + tape.watch(input_value) + output = layer(input_value, training=training) + jac_back = tape.jacobian( + output, [input_value] + layer.trainable_variables + ) + jac_forward = _jacfwd( + lambda *args: layer(args[0], training=training), + [input_value] + layer.trainable_variables, + ) + for backward, forward in zip(jac_back, jac_forward): + forward = tf.reshape(forward, tf.shape(backward)) + self.assertAllClose(backward, forward) + + @parameterized.named_parameters( + [("Function", tf.function), ("NoFunction", lambda f: f)] + ) + def testVariablesHVP(self, decorator): + class _Model(tf.Module): + def __init__(self): + self._first_dense = tf.keras.layers.Dense(18) + self._conv = tf.keras.layers.Conv2D(2, 2) + self._norm = tf.keras.layers.BatchNormalization() + self._second_dense = tf.keras.layers.Dense(1) + + def __call__(self, x): + x = self._first_dense(x) + x = tf.nn.relu(x) + x = self._norm(x) + x = tf.nn.relu(self._conv(tf.reshape(x, [-1, 2, 3, 3]))) + return self._second_dense(x) + + model = _Model() + + def _loss(): + input_value = tf.constant([[-0.5, 1.0], [0.5, -1.0]]) + target = tf.constant([[-1.0], [2.0]]) + return tf.math.reduce_sum((model(input_value) - target) ** 2.0) + + @decorator + def _compute_hvps(): + with tf.GradientTape() as tape: + loss = _loss() + vector = tape.gradient(loss, model.trainable_variables) + variable_input_fn = lambda unused_variables: _loss() + (forward_over_back_hvp,) = _hvp( + variable_input_fn, [model.trainable_variables], [vector] + ) + with tf.GradientTape(persistent=True) as tape: + tape.watch(model.trainable_variables) + loss = _loss() + first_grads = tape.gradient(loss, model.trainable_variables) + back_over_back_hvp = tape.gradient( + first_grads, model.trainable_variables, output_gradients=vector + ) + return forward_over_back_hvp, back_over_back_hvp + + self.assertAllClose(*_compute_hvps(), rtol=1e-5, atol=1e-5) + + def testEmbeddingLayerInFunction(self): + class M(tf.keras.Model): + def __init__(self): + super().__init__() + self.embed = tf.keras.layers.Embedding(5, 1) + self.proj = tf.keras.layers.Dense(1) + + @tf.function + def call(self, x): + return self.proj(self.embed(x)) + + model = M() + model(tf.zeros([3, 3], dtype=tf.int32)) + parameters = model.embed.variables + tangents = [tf.ones_like(v) for v in parameters] + with tf.autodiff.ForwardAccumulator(parameters, tangents): + # Note that forwardprop runs 
alongside the original computation. + # This test is just checking that it doesn't crash; correctness is + # tested in core TF. + model(tf.zeros([3, 3], dtype=tf.int32)) class HessianTests(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters([("PFor", True), ("MapFn", False)]) - def testHessianOfVariables(self, use_pfor): - model = tf.keras.layers.Dense(1) - model.build([None, 2]) - - def _loss(*unused_args): - input_value = tf.constant([[-0.5, 1.], [0.5, -1.]]) - target = tf.constant([[-1.], [2.]]) - return tf.math.reduce_sum((model(input_value) - target)**2.) - - kernel_hess, bias_hess = _forward_over_back_hessian( - _loss, [model.kernel, model.bias], - use_pfor=use_pfor, - dtype=[tf.float32, tf.float32]) - # 3 total parameters, the whole hessian is the 3x3 concatenation - self.assertEqual([3, 2, 1], kernel_hess.shape) - self.assertEqual([3, 1], bias_hess.shape) - full_hessian = tf.concat([tf.reshape(kernel_hess, [3, 2]), bias_hess], - axis=1) - # The full Hessian should be symmetric. - self.assertAllClose(full_hessian, tf.transpose(full_hessian)) + @parameterized.named_parameters([("PFor", True), ("MapFn", False)]) + def testHessianOfVariables(self, use_pfor): + model = tf.keras.layers.Dense(1) + model.build([None, 2]) + + def _loss(*unused_args): + input_value = tf.constant([[-0.5, 1.0], [0.5, -1.0]]) + target = tf.constant([[-1.0], [2.0]]) + return tf.math.reduce_sum((model(input_value) - target) ** 2.0) + + kernel_hess, bias_hess = _forward_over_back_hessian( + _loss, + [model.kernel, model.bias], + use_pfor=use_pfor, + dtype=[tf.float32, tf.float32], + ) + # 3 total parameters, the whole hessian is the 3x3 concatenation + self.assertEqual([3, 2, 1], kernel_hess.shape) + self.assertEqual([3, 1], bias_hess.shape) + full_hessian = tf.concat( + [tf.reshape(kernel_hess, [3, 2]), bias_hess], axis=1 + ) + # The full Hessian should be symmetric. + self.assertAllClose(full_hessian, tf.transpose(full_hessian)) if __name__ == "__main__": - if tf.__internal__.tf2.enabled(): - tf.test.main() + if tf.__internal__.tf2.enabled(): + tf.test.main() diff --git a/keras/integration_test/function_test.py b/keras/integration_test/function_test.py index 14e6e14be1b8..ba89f0424e80 100644 --- a/keras/integration_test/function_test.py +++ b/keras/integration_test/function_test.py @@ -19,221 +19,240 @@ class MiniModel(tf.keras.Model): - """Minimal model for mnist. + """Minimal model for mnist. - Useful for testing and debugging on slow TPU simulators. - """ + Useful for testing and debugging on slow TPU simulators. 
+ """ - def __init__(self): - super().__init__(name='') - self.fc = tf.keras.layers.Dense(1, name='fc', kernel_initializer='ones', - bias_initializer='ones') + def __init__(self): + super().__init__(name="") + self.fc = tf.keras.layers.Dense( + 1, name="fc", kernel_initializer="ones", bias_initializer="ones" + ) - def call(self, inputs, training=True): - return self.fc(inputs) + def call(self, inputs, training=True): + return self.fc(inputs) class DefunnedMiniModel(MiniModel): - - @tf.function - def call(self, inputs, training=True): - return super(DefunnedMiniModel, self).call(inputs, training=training) + @tf.function + def call(self, inputs, training=True): + return super(DefunnedMiniModel, self).call(inputs, training=training) class ModelWithOptimizer(tf.keras.Model): - - def __init__(self): - super().__init__() - self.dense = tf.keras.layers.Dense(1) - self.optimizer = tf.keras.optimizers.Adam(0.01) - - @tf.function( - input_signature=(tf.TensorSpec([None, 2], tf.float32), - tf.TensorSpec([None], tf.float32))) - def call(self, x, y): - with tf.GradientTape() as tape: - loss = tf.math.reduce_mean((self.dense(x) - y) ** 2.) - trainable_variables = self.trainable_variables - gradients = tape.gradient(loss, trainable_variables) - self.optimizer.apply_gradients(zip(gradients, trainable_variables)) - return {'loss': loss} - - -class FunctionTest(tf.test.TestCase): - - def testFunctionRelaxationLosesInnerDimWithKerasLayer(self): - layer = tf.keras.layers.Dense(1) - fn = tf.function(reduce_retracing=True)(layer) - - with self.captureWritesToStream(sys.stderr) as printed: - fn(tf.ones((3, 2))) - self.assertNotIn('ValueError', printed.contents()) - with self.captureWritesToStream(sys.stderr) as printed: - # Use batch size 2 to trigger a second cache miss on the shape. - fn(tf.ones((2, 2))) - self.assertNotIn('ValueError', printed.contents()) - - # Shape relaxation passes TensorShape([None, None]), which causes layer - # matmul to fail, due to incompatible dims. What would have been a graph - # build time error (layer would complain about the inner dim being 4). 
- with self.captureWritesToStream(sys.stderr) as printed: - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - r'Matrix size-incompatible'): - fn(tf.ones((3, 4))) - - def testDefunKerasModelCall(self): - model = MiniModel() - model.call = tf.function(model.call) - - x = tf.ones([1, 2]) - y = model(x) # pylint:disable=not-callable - - self.assertAllEqual([[3.0]], self.evaluate(y)) - - # Break the reference cycle between the MiniModel and the defun: - # `MiniModel` --(through its `call` method)--> `Function` - # `Function` --(instancemethod on `MiniModel`)--> `MiniModel` - del model.call - - def testDecoratedMethod(self): - m = DefunnedMiniModel() - instance_call_one = m.call(tf.ones([1, 2]), training=True) - instance_call_two = m.call( - inputs=tf.ones([1, 2]), training=True) - class_call = DefunnedMiniModel.call(m, tf.ones([1, 2]), training=True) - self.assertAllEqual(instance_call_one, instance_call_two) - self.assertAllEqual(instance_call_one, class_call) - - def testDecoratedMethodUniqueFunctionPerInstance(self): - m = DefunnedMiniModel() - n = DefunnedMiniModel() - - class_method_one = DefunnedMiniModel.call - class_method_two = DefunnedMiniModel.call - - m_method_one = m.call - m_method_two = m.call - - n_method_one = n.call - n_method_two = n.call - - self.assertEqual(class_method_one, class_method_two) - self.assertEqual(m_method_one, m_method_two) - self.assertEqual(n_method_one, n_method_two) - self.assertNotEqual(m.call, n.call) - - def testDecoratedMethodGetConcreteFunction(self): - m = DefunnedMiniModel() - instance_call_one = m.call.get_concrete_function( - tf.ones([1, 2]), training=False) - instance_call_two = m.call.get_concrete_function( - inputs=tf.ones([1, 2]), training=False) - self.assertAllEqual(instance_call_one(tf.ones([1, 2])), - instance_call_two(tf.ones([1, 2]))) - - # Also make sure get_concrete_function works on the class method - DefunnedMiniModel.call.get_concrete_function( - m, tf.ones([1, 2]), training=False) - DefunnedMiniModel.call.get_concrete_function( - m, inputs=tf.ones([1, 2]), training=True) - - def testDecoratedMethodVariableCleanup(self): - m = DefunnedMiniModel() - m(tf.ones([1, 2])) # pylint:disable=not-callable - variable_refs = list({v.ref() for v in m.variables}) - self.assertLen(variable_refs, 2) - del m - - # Verifying if the variables are only referenced from variable_refs. - # We expect the reference counter to be 1, but `sys.getrefcount` reports - # one higher reference counter because a temporary is created when we call - # sys.getrefcount(). Hence check if the number returned is 2. 
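(Aside, for readers wondering about the magic constant 2 in the refcount assertions: `sys.getrefcount` counts one extra, temporary reference for its own argument. A minimal standalone illustration, not from the patch:)

    import sys

    class Obj:
        pass

    holder = [Obj()]  # exactly one strong reference, like `variable_refs`
    # The call itself holds a second, temporary reference to the object,
    # so a singly-referenced object reports 2.
    print(sys.getrefcount(holder[0]))  # 2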
- # https://docs.python.org/3/library/sys.html#sys.getrefcount - self.assertEqual(sys.getrefcount(variable_refs[0].deref()), 2) - self.assertEqual(sys.getrefcount(variable_refs[1].deref()), 2) - - def testStandardTrainingLoopInFunction(self): - layer = tf.keras.layers.Dense(2) - dataset = ( - tf.data.Dataset.from_tensors((tf.ones([784]), tf.ones([], tf.int32))) - .map(lambda x, y: (x, y)) - .repeat(10) - .batch(32)) - optimizer = tf.keras.optimizers.Adam() - - @tf.function - def train(): - for x, y in dataset: - with tf.GradientTape() as tape: - out = layer(x) - loss = tf.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=out, labels=y)) - layer_variables = layer.trainable_variables - gradients = tape.gradient(loss, layer_variables) - optimizer.apply_gradients(zip(gradients, layer_variables)) - - train() - - def testEarlyStoppingTrainingLoopInFunction(self): - layer = tf.keras.layers.Dense(2) - dataset = ( - tf.data.Dataset.from_tensors((tf.ones([784]), tf.ones([], tf.int32))) - .map(lambda x, y: (x, y)) - .repeat(10) - .batch(32)) - optimizer = tf.keras.optimizers.Adam() - - @tf.function - def train(): - for x, y in dataset: + def __init__(self): + super().__init__() + self.dense = tf.keras.layers.Dense(1) + self.optimizer = tf.keras.optimizers.Adam(0.01) + + @tf.function( + input_signature=( + tf.TensorSpec([None, 2], tf.float32), + tf.TensorSpec([None], tf.float32), + ) + ) + def call(self, x, y): with tf.GradientTape() as tape: - out = layer(x) - loss = tf.math.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=out, labels=y)) - layer_variables = layer.trainable_variables - gradients = tape.gradient(loss, layer_variables) - optimizer.apply_gradients(zip(gradients, layer_variables)) - if optimizer.iterations > 3: - break + loss = tf.math.reduce_mean((self.dense(x) - y) ** 2.0) + trainable_variables = self.trainable_variables + gradients = tape.gradient(loss, trainable_variables) + self.optimizer.apply_gradients(zip(gradients, trainable_variables)) + return {"loss": loss} - train() - def test_optimizer(self): - x = tf.constant([[3., 4.]]) - y = tf.constant([2.]) - model = ModelWithOptimizer() - model(x, y) # pylint:disable=not-callable +class FunctionTest(tf.test.TestCase): + def testFunctionRelaxationLosesInnerDimWithKerasLayer(self): + layer = tf.keras.layers.Dense(1) + fn = tf.function(reduce_retracing=True)(layer) + + with self.captureWritesToStream(sys.stderr) as printed: + fn(tf.ones((3, 2))) + self.assertNotIn("ValueError", printed.contents()) + with self.captureWritesToStream(sys.stderr) as printed: + # Use batch size 2 to trigger a second cache miss on the shape. + fn(tf.ones((2, 2))) + self.assertNotIn("ValueError", printed.contents()) + + # Shape relaxation passes TensorShape([None, None]), which causes layer + # matmul to fail due to incompatible dims: what would have been a graph + # build time error (the layer would complain about the inner dim + # being 4) instead surfaces as a runtime error. 
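(A condensed, standalone sketch of the behavior the test above pins down, with assumed shapes: once retracing is relaxed, the inner-dimension check moves from trace time to run time.)

    import tensorflow as tf

    layer = tf.keras.layers.Dense(1)
    fn = tf.function(reduce_retracing=True)(layer)

    fn(tf.ones((3, 2)))  # first trace; the kernel is built with shape (2, 1)
    fn(tf.ones((2, 2)))  # second shape relaxes the signature toward (None, None)

    try:
        fn(tf.ones((3, 4)))  # inner dim 4 vs. kernel dim 2
    except tf.errors.InvalidArgumentError:
        print("matmul shape mismatch surfaced at run time, not trace time")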
+ with self.captureWritesToStream(sys.stderr) as printed: + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, r"Matrix size-incompatible" + ): + fn(tf.ones((3, 4))) + + def testDefunKerasModelCall(self): + model = MiniModel() + model.call = tf.function(model.call) + + x = tf.ones([1, 2]) + y = model(x) + + self.assertAllEqual([[3.0]], self.evaluate(y)) + + # Break the reference cycle between the MiniModel and the defun: + # `MiniModel` --(through its `call` method)--> `Function` + # `Function` --(instancemethod on `MiniModel`)--> `MiniModel` + del model.call + + def testDecoratedMethod(self): + m = DefunnedMiniModel() + instance_call_one = m.call(tf.ones([1, 2]), training=True) + instance_call_two = m.call(inputs=tf.ones([1, 2]), training=True) + class_call = DefunnedMiniModel.call(m, tf.ones([1, 2]), training=True) + self.assertAllEqual(instance_call_one, instance_call_two) + self.assertAllEqual(instance_call_one, class_call) + + def testDecoratedMethodUniqueFunctionPerInstance(self): + m = DefunnedMiniModel() + n = DefunnedMiniModel() + + class_method_one = DefunnedMiniModel.call + class_method_two = DefunnedMiniModel.call + + m_method_one = m.call + m_method_two = m.call + + n_method_one = n.call + n_method_two = n.call + + self.assertEqual(class_method_one, class_method_two) + self.assertEqual(m_method_one, m_method_two) + self.assertEqual(n_method_one, n_method_two) + self.assertNotEqual(m.call, n.call) + + def testDecoratedMethodGetConcreteFunction(self): + m = DefunnedMiniModel() + instance_call_one = m.call.get_concrete_function( + tf.ones([1, 2]), training=False + ) + instance_call_two = m.call.get_concrete_function( + inputs=tf.ones([1, 2]), training=False + ) + self.assertAllEqual( + instance_call_one(tf.ones([1, 2])), + instance_call_two(tf.ones([1, 2])), + ) + + # Also make sure get_concrete_function works on the class method + DefunnedMiniModel.call.get_concrete_function( + m, tf.ones([1, 2]), training=False + ) + DefunnedMiniModel.call.get_concrete_function( + m, inputs=tf.ones([1, 2]), training=True + ) + + def testDecoratedMethodVariableCleanup(self): + m = DefunnedMiniModel() + m(tf.ones([1, 2])) + variable_refs = list({v.ref() for v in m.variables}) + self.assertLen(variable_refs, 2) + del m + + # Verifying if the variables are only referenced from variable_refs. + # We expect the reference counter to be 1, but `sys.getrefcount` reports + # one higher reference counter because a temporary is created when we + # call sys.getrefcount(). Hence check if the number returned is 2. 
+ # https://docs.python.org/3/library/sys.html#sys.getrefcount + self.assertEqual(sys.getrefcount(variable_refs[0].deref()), 2) + self.assertEqual(sys.getrefcount(variable_refs[1].deref()), 2) + + def testStandardTrainingLoopInFunction(self): + layer = tf.keras.layers.Dense(2) + dataset = ( + tf.data.Dataset.from_tensors( + (tf.ones([784]), tf.ones([], tf.int32)) + ) + .map(lambda x, y: (x, y)) + .repeat(10) + .batch(32) + ) + optimizer = tf.keras.optimizers.Adam() + + @tf.function + def train(): + for x, y in dataset: + with tf.GradientTape() as tape: + out = layer(x) + loss = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=out, labels=y + ) + ) + layer_variables = layer.trainable_variables + gradients = tape.gradient(loss, layer_variables) + optimizer.apply_gradients(zip(gradients, layer_variables)) + + train() + + def testEarlyStoppingTrainingLoopInFunction(self): + layer = tf.keras.layers.Dense(2) + dataset = ( + tf.data.Dataset.from_tensors( + (tf.ones([784]), tf.ones([], tf.int32)) + ) + .map(lambda x, y: (x, y)) + .repeat(10) + .batch(32) + ) + optimizer = tf.keras.optimizers.Adam() + + @tf.function + def train(): + for x, y in dataset: + with tf.GradientTape() as tape: + out = layer(x) + loss = tf.math.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=out, labels=y + ) + ) + layer_variables = layer.trainable_variables + gradients = tape.gradient(loss, layer_variables) + optimizer.apply_gradients(zip(gradients, layer_variables)) + if optimizer.iterations > 3: + break + + train() + + def test_optimizer(self): + x = tf.constant([[3.0, 4.0]]) + y = tf.constant([2.0]) + model = ModelWithOptimizer() + model(x, y) class AutomaticControlDependenciesTest(tf.test.TestCase): - - def testVariableInitializersCanBeLifted(self): - # The initializer is a stateful op, but using it inside a function should - # *not* create additional dependencies. That's what we're testing. - layer = tf.keras.layers.Dense(1, kernel_initializer='glorot_uniform') - - @tf.function - def fn(x): - # Stateful operation - tf.debugging.Assert(x, ['Error']) - # Variable initialization should be lifted. Prior to the change that - # added this test, the lifting would crash because of an auto control dep - # added on `x`. Note, the error did not happen if we - # manually created a tf.Variable outside of function and used it - # here. Alternatively, creating a tf.Variable inside fn() causes - # a different sort of error that is out of scope for this test. - return layer(tf.convert_to_tensor([[1.0, 1.0]])) - - true = tf.convert_to_tensor(True) - - concrete = fn.get_concrete_function( - tf.TensorSpec(shape=(), dtype=tf.bool)) - self.evaluate(concrete(true)) - self.evaluate(fn(True)) - - -if __name__ == '__main__': - if tf.__internal__.tf2.enabled(): - tf.test.main() + def testVariableInitializersCanBeLifted(self): + # The initializer is a stateful op, but using it inside a function + # should *not* create additional dependencies. That's what we're + # testing. + layer = tf.keras.layers.Dense(1, kernel_initializer="glorot_uniform") + + @tf.function + def fn(x): + # Stateful operation + tf.debugging.Assert(x, ["Error"]) + # Variable initialization should be lifted. Prior to the change + # that added this test, the lifting would crash because of an auto + # control dep added on `x`. Note, the error did not happen if we + # manually created a tf.Variable outside of function and used it + # here. 
Alternatively, creating a tf.Variable inside fn() causes a + # different sort of error that is out of scope for this test. + return layer(tf.convert_to_tensor([[1.0, 1.0]])) + + true = tf.convert_to_tensor(True) + + concrete = fn.get_concrete_function( + tf.TensorSpec(shape=(), dtype=tf.bool) + ) + self.evaluate(concrete(true)) + self.evaluate(fn(True)) + + +if __name__ == "__main__": + if tf.__internal__.tf2.enabled(): + tf.test.main() diff --git a/keras/integration_test/gradient_checkpoint_test.py b/keras/integration_test/gradient_checkpoint_test.py index 691df25c6ad1..50efbbd98920 100644 --- a/keras/integration_test/gradient_checkpoint_test.py +++ b/keras/integration_test/gradient_checkpoint_test.py @@ -17,159 +17,194 @@ import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) from tensorflow.python.platform import test as test_lib layers = tf.keras.layers optimizers = tf.keras.optimizers -def _get_big_cnn_model(img_dim, n_channels, num_partitions, - blocks_per_partition): - """Creates a test model whose activations are significantly larger than model size.""" - model = tf.keras.Sequential() - model.add(layers.Input(shape=(img_dim, img_dim, n_channels))) - for _ in range(num_partitions): - for _ in range(blocks_per_partition): - model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Flatten()) - model.add(layers.Dense(32, activation=tf.nn.relu)) - model.add(layers.Dense(10)) - return model - - -def _get_split_cnn_model(img_dim, n_channels, num_partitions, - blocks_per_partition): - """Creates a test model that is split into `num_partitions` smaller models.""" - models = [tf.keras.Sequential() for _ in range(num_partitions)] - models[0].add(layers.Input(shape=(img_dim, img_dim, n_channels))) - for i in range(num_partitions): - model = models[i] - if i > 0: - last_shape = models[i - 1].layers[-1].output_shape - model.add(layers.Input(shape=last_shape[1:])) - for _ in range(blocks_per_partition): - model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - models[-1].add(layers.Flatten()) - models[-1].add(layers.Dense(32, activation=tf.nn.relu)) - models[-1].add(layers.Dense(10)) - return models +def _get_big_cnn_model( + img_dim, n_channels, num_partitions, blocks_per_partition +): + """Creates a test model whose activations are significantly larger than + model size.""" + model = tf.keras.Sequential() + model.add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for _ in range(num_partitions): + for _ in range(blocks_per_partition): + model.add( + layers.Conv2D(10, 5, padding="same", activation=tf.nn.relu) + ) + model.add(layers.MaxPooling2D((1, 1), padding="same")) + model.add( + layers.Conv2D(40, 5, padding="same", activation=tf.nn.relu) + ) + model.add(layers.MaxPooling2D((1, 
1), padding="same")) + model.add( + layers.Conv2D(20, 5, padding="same", activation=tf.nn.relu) + ) + model.add(layers.MaxPooling2D((1, 1), padding="same")) + model.add(layers.Flatten()) + model.add(layers.Dense(32, activation=tf.nn.relu)) + model.add(layers.Dense(10)) + return model + + +def _get_split_cnn_model( + img_dim, n_channels, num_partitions, blocks_per_partition +): + """Creates a test model that is split into `num_partitions` smaller + models.""" + models = [tf.keras.Sequential() for _ in range(num_partitions)] + models[0].add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for i in range(num_partitions): + model = models[i] + if i > 0: + last_shape = models[i - 1].layers[-1].output_shape + model.add(layers.Input(shape=last_shape[1:])) + for _ in range(blocks_per_partition): + model.add( + layers.Conv2D(10, 5, padding="same", activation=tf.nn.relu) + ) + model.add(layers.MaxPooling2D((1, 1), padding="same")) + model.add( + layers.Conv2D(40, 5, padding="same", activation=tf.nn.relu) + ) + model.add(layers.MaxPooling2D((1, 1), padding="same")) + model.add( + layers.Conv2D(20, 5, padding="same", activation=tf.nn.relu) + ) + model.add(layers.MaxPooling2D((1, 1), padding="same")) + models[-1].add(layers.Flatten()) + models[-1].add(layers.Dense(32, activation=tf.nn.relu)) + models[-1].add(layers.Dense(10)) + return models def _compute_loss(logits, labels): - return tf.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=labels)) + return tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=labels + ) + ) def _limit_gpu_memory(): - """Helper function to limit GPU memory for testing.""" - gpus = tf.config.experimental.list_physical_devices('GPU') - if gpus: - tf.config.experimental.set_virtual_device_configuration( - gpus[0], - [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)]) - return True - return False + """Helper function to limit GPU memory for testing.""" + gpus = tf.config.experimental.list_physical_devices("GPU") + if gpus: + tf.config.experimental.set_virtual_device_configuration( + gpus[0], + [ + tf.config.experimental.VirtualDeviceConfiguration( + memory_limit=2048 + ) + ], + ) + return True + return False def _get_dummy_data(img_dim, n_channels, batch_size): - inputs = tf.ones([batch_size, img_dim, img_dim, n_channels]) - labels = tf.ones([batch_size], dtype=tf.int64) - return inputs, labels + inputs = tf.ones([batch_size, img_dim, img_dim, n_channels]) + labels = tf.ones([batch_size], dtype=tf.int64) + return inputs, labels def _train_no_recompute(n_steps): - """Trains a single large model without gradient checkpointing.""" - img_dim, n_channels, batch_size = 256, 1, 4 - x, y = _get_dummy_data(img_dim, n_channels, batch_size) - model = _get_big_cnn_model( - img_dim, n_channels, num_partitions=3, blocks_per_partition=2) - optimizer = optimizers.SGD() - losses = [] - tr_vars = model.trainable_variables - for _ in range(n_steps): - with tf.GradientTape() as tape: - logits = model(x) - loss = _compute_loss(logits, y) - losses.append(loss) - grads = tape.gradient(loss, tr_vars) # tr_vars - optimizer.apply_gradients(zip(grads, tr_vars)) - del grads - return losses + """Trains a single large model without gradient checkpointing.""" + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + model = _get_big_cnn_model( + img_dim, n_channels, num_partitions=3, blocks_per_partition=2 + ) + optimizer = optimizers.SGD() + losses = [] + 
tr_vars = model.trainable_variables + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits = model(x) + loss = _compute_loss(logits, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses def _train_with_recompute(n_steps): - """Trains a single large model with gradient checkpointing using tf.recompute_grad.""" - img_dim, n_channels, batch_size = 256, 1, 4 - x, y = _get_dummy_data(img_dim, n_channels, batch_size) - # This model is the same model as _get_big_cnn_model but split into 3 parts. - models = _get_split_cnn_model( - img_dim, n_channels, num_partitions=3, blocks_per_partition=2) - model1, model2, model3 = models - # Apply gradient checkpointing to the submodels using tf.recompute_grad. - model1_re = tf.recompute_grad(model1) - model2_re = tf.recompute_grad(model2) - model3_re = tf.recompute_grad(model3) - optimizer = optimizers.SGD() - tr_vars = ( - model1.trainable_variables + model2.trainable_variables + - model3.trainable_variables) - losses = [] - for _ in range(n_steps): - with tf.GradientTape() as tape: - logits1 = model1_re(x) - logits2 = model2_re(logits1) - logits3 = model3_re(logits2) - loss = _compute_loss(logits3, y) - losses.append(loss) - grads = tape.gradient(loss, tr_vars) # tr_vars - optimizer.apply_gradients(zip(grads, tr_vars)) - del grads - return losses + """Trains a single large model with gradient checkpointing using + tf.recompute_grad.""" + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + # This model is the same model as _get_big_cnn_model but split into 3 parts. + models = _get_split_cnn_model( + img_dim, n_channels, num_partitions=3, blocks_per_partition=2 + ) + model1, model2, model3 = models + # Apply gradient checkpointing to the submodels using tf.recompute_grad. 
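(Aside on the pattern applied below; this is an illustrative sketch with arbitrary sizes, not the test's models. Wrapping a block in `tf.recompute_grad` discards its intermediate activations during the forward pass and recomputes them during backprop, trading extra compute for lower peak memory.)

    import tensorflow as tf

    block = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(256, activation="relu"),
    ])
    block.build((None, 128))
    block_ckpt = tf.recompute_grad(block)  # activations recomputed on backprop

    x = tf.random.normal([32, 128])
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(block_ckpt(x))
    grads = tape.gradient(loss, block.trainable_variables)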
+ model1_re = tf.recompute_grad(model1) + model2_re = tf.recompute_grad(model2) + model3_re = tf.recompute_grad(model3) + optimizer = optimizers.SGD() + tr_vars = ( + model1.trainable_variables + + model2.trainable_variables + + model3.trainable_variables + ) + losses = [] + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits1 = model1_re(x) + logits2 = model2_re(logits1) + logits3 = model3_re(logits2) + loss = _compute_loss(logits3, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses @tf_test_utils.with_eager_op_as_function class GradientCheckpointTest(tf.test.TestCase): - - def test_raises_oom_exception(self): - self.skipTest('b/232015009: flaky test') - if not _limit_gpu_memory(): - self.skipTest('No virtual GPUs found') - with self.assertRaises(Exception) as context: - _train_no_recompute(1) - self.assertIsInstance(context.exception, tf.errors.ResourceExhaustedError) - - @tf_test_utils.disable_xla( - 'xla does not support searching for memory-limited solvers.') - def test_does_not_raise_oom_exception(self): - if not _limit_gpu_memory(): - self.skipTest('No virtual GPUs found') - if test_lib.is_built_with_rocm(): - self.skipTest( - 'ROCm MIOpen does not support searching for memory-limited' - 'solvers yet so skip the subtest which would result in OOM.') - n_step = 2 - losses = _train_with_recompute(n_step) - self.assertLen(losses, n_step) - - def tearDown(self): - super().tearDown() - # Make sure all the models created in keras has been deleted and cleared - # from the global keras grpah, also do a force GC to recycle the GPU memory. - tf.keras.backend.clear_session() - gc.collect() - - -if __name__ == '__main__': - tf.test.main() + def test_raises_oom_exception(self): + self.skipTest("b/232015009: flaky test") + if not _limit_gpu_memory(): + self.skipTest("No virtual GPUs found") + with self.assertRaises(Exception) as context: + _train_no_recompute(1) + self.assertIsInstance( + context.exception, tf.errors.ResourceExhaustedError + ) + + @tf_test_utils.disable_xla( + "xla does not support searching for memory-limited solvers." + ) + def test_does_not_raise_oom_exception(self): + if not _limit_gpu_memory(): + self.skipTest("No virtual GPUs found") + if test_lib.is_built_with_rocm(): + self.skipTest( + "ROCm MIOpen does not support searching for memory-limited " + "solvers yet, so skip the subtest which would result in OOM." + ) + n_step = 2 + losses = _train_with_recompute(n_step) + self.assertLen(losses, n_step) + + def tearDown(self): + super().tearDown() + # Make sure all the models created in keras have been deleted and + # cleared from the global keras graph, and force a GC to recycle the + # GPU memory. 
+ tf.keras.backend.clear_session() + gc.collect() + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/integration_test/gradients_test.py b/keras/integration_test/gradients_test.py index 361ed8112744..dd24e9c8d7df 100644 --- a/keras/integration_test/gradients_test.py +++ b/keras/integration_test/gradients_test.py @@ -18,120 +18,122 @@ class TestKerasModelClass(tf.keras.Model): - """A simple tensorflow keras Model class definition.""" + """A simple tensorflow keras Model class definition.""" - def __init__(self, width): - super().__init__() - self.width = width - - def build(self, input_shape): - self.weight = self.add_weight( - name="test_keras_var", - shape=(self.width,), - dtype=tf.float32, - trainable=True, - ) - - def call(self, inputs): - return self.weight * inputs - - -class GradientsTest(tf.test.TestCase): - - def _TestVariablesGradient(self, inputs, test_model, vars_to_grad): - """Returns gradients of `test_model` with respect to `vars_to_grad`.""" - - test_model_re = tf.recompute_grad(test_model) - - with tf.GradientTape(persistent=True) as tape: - tape.watch(vars_to_grad) - out_re = test_model_re(inputs) - out = test_model(inputs) - - grads_re = tape.gradient(out_re, vars_to_grad) - grads = tape.gradient(out, vars_to_grad) - - return grads_re, grads - - def testKerasRecompute(self): - """Checks that recompute_grad works for a simple Keras Model.""" - - test_model = TestKerasModelClass(10) - test_input = tf.constant(tf.zeros((10, 10), dtype=np.float32)) - # Ensures keras model is initialized. - test_model(test_input) # pylint: disable=not-callable - grads_re, grads = self._TestVariablesGradient(test_input, test_model, - test_input) - - grads_re = self.evaluate(grads_re) - grads = self.evaluate(grads) - for g, g_re in zip(grads, grads_re): - self.assertAllClose(g, g_re) - - grads_re, grads = self._TestVariablesGradient(test_input, test_model, - test_model.variables) - - grads_re = self.evaluate(grads_re) - grads = self.evaluate(grads) - for g, g_re in zip(grads, grads_re): - self.assertAllClose(g, g_re) - - def testLSTMBatchJacobian(self): - class HasLSTM(tf.keras.Model): - - def __init__(self): + def __init__(self, width): super().__init__() - self.lstm = tf.keras.layers.LSTM(units=5) - self.dense = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid) - - def call(self, x): - return self.dense(self.lstm(x)) - - m = HasLSTM() - - def jacobian(x): - with tf.GradientTape() as tape: - tape.watch(x) - y = m(x) # pylint: disable=not-callable - return tape.batch_jacobian(y, x) + self.width = width - inp = tf.nn.l2_normalize(tf.ones([1, 2, 3]), axis=[1, 2]) - eager_result = jacobian(inp) - function_result = tf.function(jacobian)(inp) - self.assertAllClose(eager_result, function_result) - backprop_result, numeric_result = tf.test.compute_gradient( - m, [inp], delta=1e-3) - self.assertAllClose(numeric_result, backprop_result, atol=1e-3) - self.assertAllClose(tf.reshape(numeric_result, [-1]), - tf.reshape(eager_result, [-1]), atol=1e-3) + def build(self, input_shape): + self.weight = self.add_weight( + name="test_keras_var", + shape=(self.width,), + dtype=tf.float32, + trainable=True, + ) - def testEmbeddingLookupGradientsHaveKnownShape(self): + def call(self, inputs): + return self.weight * inputs - class MyLayer(tf.keras.layers.Layer): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.embedding = None - - def build(self, input_shape): - self.embedding = tf.Variable(tf.random.uniform([50, 16])) - - def call(self, x): - return 
tf.nn.embedding_lookup(self.embedding, x) - - layer = MyLayer() - - @tf.function - def _run(x): - with tf.GradientTape() as tape: - y = layer(x) - loss = tf.math.reduce_sum(y) - gradients = tape.gradient(loss, layer.weights) - self.assertListEqual(gradients[0].shape.as_list(), [50, 16]) - - _run(tf.random.uniform([4, 16], minval=0, maxval=50, dtype=tf.int64)) +class GradientsTest(tf.test.TestCase): + def _TestVariablesGradient(self, inputs, test_model, vars_to_grad): + """Returns gradients of `test_model` with respect to `vars_to_grad`.""" + + test_model_re = tf.recompute_grad(test_model) + + with tf.GradientTape(persistent=True) as tape: + tape.watch(vars_to_grad) + out_re = test_model_re(inputs) + out = test_model(inputs) + + grads_re = tape.gradient(out_re, vars_to_grad) + grads = tape.gradient(out, vars_to_grad) + + return grads_re, grads + + def testKerasRecompute(self): + """Checks that recompute_grad works for a simple Keras Model.""" + + test_model = TestKerasModelClass(10) + test_input = tf.constant(tf.zeros((10, 10), dtype=np.float32)) + # Ensures keras model is initialized. + test_model(test_input) + grads_re, grads = self._TestVariablesGradient( + test_input, test_model, test_input + ) + + grads_re = self.evaluate(grads_re) + grads = self.evaluate(grads) + for g, g_re in zip(grads, grads_re): + self.assertAllClose(g, g_re) + + grads_re, grads = self._TestVariablesGradient( + test_input, test_model, test_model.variables + ) + + grads_re = self.evaluate(grads_re) + grads = self.evaluate(grads) + for g, g_re in zip(grads, grads_re): + self.assertAllClose(g, g_re) + + def testLSTMBatchJacobian(self): + class HasLSTM(tf.keras.Model): + def __init__(self): + super().__init__() + self.lstm = tf.keras.layers.LSTM(units=5) + self.dense = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid) + + def call(self, x): + return self.dense(self.lstm(x)) + + m = HasLSTM() + + def jacobian(x): + with tf.GradientTape() as tape: + tape.watch(x) + y = m(x) + return tape.batch_jacobian(y, x) + + inp = tf.nn.l2_normalize(tf.ones([1, 2, 3]), axis=[1, 2]) + eager_result = jacobian(inp) + function_result = tf.function(jacobian)(inp) + self.assertAllClose(eager_result, function_result) + backprop_result, numeric_result = tf.test.compute_gradient( + m, [inp], delta=1e-3 + ) + self.assertAllClose(numeric_result, backprop_result, atol=1e-3) + self.assertAllClose( + tf.reshape(numeric_result, [-1]), + tf.reshape(eager_result, [-1]), + atol=1e-3, + ) + + def testEmbeddingLookupGradientsHaveKnownShape(self): + class MyLayer(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.embedding = None + + def build(self, input_shape): + self.embedding = tf.Variable(tf.random.uniform([50, 16])) + + def call(self, x): + return tf.nn.embedding_lookup(self.embedding, x) + + layer = MyLayer() + + @tf.function + def _run(x): + with tf.GradientTape() as tape: + y = layer(x) + loss = tf.math.reduce_sum(y) + gradients = tape.gradient(loss, layer.weights) + self.assertListEqual(gradients[0].shape.as_list(), [50, 16]) + + _run(tf.random.uniform([4, 16], minval=0, maxval=50, dtype=tf.int64)) if __name__ == "__main__": - if tf.__internal__.tf2.enabled(): - tf.test.main() + if tf.__internal__.tf2.enabled(): + tf.test.main() diff --git a/keras/integration_test/legacy_rnn_test.py b/keras/integration_test/legacy_rnn_test.py index 8d006e29ceb3..0b85d3643377 100644 --- a/keras/integration_test/legacy_rnn_test.py +++ b/keras/integration_test/legacy_rnn_test.py @@ -20,366 +20,391 @@ class 
KerasNetworkTFRNNs(tf.keras.Model): + def __init__(self, name=None): + super().__init__(name=name) + self._cell = tf.nn.rnn_cell.MultiRNNCell( + [tf.nn.rnn_cell.LSTMCell(1) for _ in range(2)] + ) - def __init__(self, name=None): - super().__init__(name=name) - self._cell = tf.nn.rnn_cell.MultiRNNCell( - [tf.nn.rnn_cell.LSTMCell(1) for _ in range(2)]) - - def call(self, inputs): - return self._cell(inputs, self._cell.get_initial_state(inputs)) + def call(self, inputs): + return self._cell(inputs, self._cell.get_initial_state(inputs)) class KerasNetworkKerasRNNs(tf.keras.Model): + def __init__(self, name=None): + super().__init__(name=name) + self._cell = tf.keras.layers.StackedRNNCells( + [tf.keras.layers.LSTMCell(1) for _ in range(2)] + ) - def __init__(self, name=None): - super().__init__(name=name) - self._cell = tf.keras.layers.StackedRNNCells( - [tf.keras.layers.LSTMCell(1) for _ in range(2)]) - - def call(self, inputs): - return self._cell(inputs, self._cell.get_initial_state(inputs)) + def call(self, inputs): + return self._cell(inputs, self._cell.get_initial_state(inputs)) class LegacyRNNTest(tf.test.TestCase): - - def setUp(self): - super().setUp() - self._seed = 23489 - np.random.seed(self._seed) - - def testRNNWithKerasSimpleRNNCell(self): - with self.cached_session() as sess: - input_shape = 10 - output_shape = 5 - timestep = 4 - batch = 100 - (x_train, y_train), _ = get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - y_train = tf.keras.utils.to_categorical(y_train) - cell = tf.keras.layers.SimpleRNNCell(output_shape) - - inputs = tf.placeholder( - tf.float32, shape=(None, timestep, input_shape)) - predict = tf.placeholder( - tf.float32, shape=(None, output_shape)) - - outputs, state = tf.nn.dynamic_rnn( - cell, inputs, dtype=tf.float32) - self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape]) - self.assertEqual(state.shape.as_list(), [None, output_shape]) - loss = tf.losses.softmax_cross_entropy(predict, state) - train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) - - sess.run([tf.global_variables_initializer()]) - _, outputs, state = sess.run( - [train_op, outputs, state], {inputs: x_train, predict: y_train}) - - self.assertEqual(len(outputs), batch) - self.assertEqual(len(state), batch) - - def testRNNWithKerasGRUCell(self): - with self.cached_session() as sess: - input_shape = 10 - output_shape = 5 - timestep = 4 - batch = 100 - (x_train, y_train), _ = get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - y_train = tf.keras.utils.to_categorical(y_train) - cell = tf.keras.layers.GRUCell(output_shape) - - inputs = tf.placeholder( - tf.float32, shape=(None, timestep, input_shape)) - predict = tf.placeholder( - tf.float32, shape=(None, output_shape)) - - outputs, state = tf.nn.dynamic_rnn( - cell, inputs, dtype=tf.float32) - self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape]) - self.assertEqual(state.shape.as_list(), [None, output_shape]) - loss = tf.losses.softmax_cross_entropy(predict, state) - train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) - - sess.run([tf.global_variables_initializer()]) - _, outputs, state = sess.run( - [train_op, outputs, state], {inputs: x_train, predict: y_train}) - - self.assertEqual(len(outputs), batch) - self.assertEqual(len(state), batch) - - def testRNNWithKerasLSTMCell(self): - with self.cached_session() as sess: - input_shape = 10 
- output_shape = 5 - timestep = 4 - batch = 100 - (x_train, y_train), _ = get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - y_train = tf.keras.utils.to_categorical(y_train) - cell = tf.keras.layers.LSTMCell(output_shape) - - inputs = tf.placeholder( - tf.float32, shape=(None, timestep, input_shape)) - predict = tf.placeholder( - tf.float32, shape=(None, output_shape)) - - outputs, state = tf.nn.dynamic_rnn( - cell, inputs, dtype=tf.float32) - self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape]) - self.assertEqual(len(state), 2) - self.assertEqual(state[0].shape.as_list(), [None, output_shape]) - self.assertEqual(state[1].shape.as_list(), [None, output_shape]) - loss = tf.losses.softmax_cross_entropy(predict, state[0]) - train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) - - sess.run([tf.global_variables_initializer()]) - _, outputs, state = sess.run( - [train_op, outputs, state], {inputs: x_train, predict: y_train}) - - self.assertEqual(len(outputs), batch) - self.assertEqual(len(state), 2) - self.assertEqual(len(state[0]), batch) - self.assertEqual(len(state[1]), batch) - - def testRNNWithStackKerasCell(self): - with self.cached_session() as sess: - input_shape = 10 - output_shape = 5 - timestep = 4 - batch = 100 - (x_train, y_train), _ = get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - y_train = tf.keras.utils.to_categorical(y_train) - cell = tf.keras.layers.StackedRNNCells( - [tf.keras.layers.LSTMCell(2 * output_shape), - tf.keras.layers.LSTMCell(output_shape)]) - - inputs = tf.placeholder( - tf.float32, shape=(None, timestep, input_shape)) - predict = tf.placeholder( - tf.float32, shape=(None, output_shape)) - - outputs, state = tf.nn.dynamic_rnn( - cell, inputs, dtype=tf.float32) - self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape]) - self.assertEqual(len(state), 2) - state = tf.nest.flatten(state) - self.assertEqual(len(state), 4) - self.assertEqual(state[0].shape.as_list(), [None, 2 * output_shape]) - self.assertEqual(state[1].shape.as_list(), [None, 2 * output_shape]) - self.assertEqual(state[2].shape.as_list(), [None, output_shape]) - self.assertEqual(state[3].shape.as_list(), [None, output_shape]) - loss = tf.losses.softmax_cross_entropy(predict, state[2]) - train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) - - sess.run([tf.global_variables_initializer()]) - _, outputs, state = sess.run( - [train_op, outputs, state], {inputs: x_train, predict: y_train}) - - self.assertEqual(len(outputs), batch) - self.assertEqual(len(state), 4) - for s in state: - self.assertEqual(len(s), batch) - - def testStaticRNNWithKerasSimpleRNNCell(self): - with self.cached_session() as sess: - input_shape = 10 - output_shape = 5 - timestep = 4 - batch = 100 - (x_train, y_train), _ = get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - x_train = np.transpose(x_train, (1, 0, 2)) - y_train = tf.keras.utils.to_categorical(y_train) - cell = tf.keras.layers.SimpleRNNCell(output_shape) - - inputs = [tf.placeholder( - tf.float32, shape=(None, input_shape))] * timestep - predict = tf.placeholder( - tf.float32, shape=(None, output_shape)) - - outputs, state = tf.nn.static_rnn( - cell, inputs, dtype=tf.float32) - self.assertEqual(len(outputs), timestep) - self.assertEqual(outputs[0].shape.as_list(), [None, output_shape]) - 
self.assertEqual(state.shape.as_list(), [None, output_shape]) - loss = tf.losses.softmax_cross_entropy(predict, state) - train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) - - sess.run([tf.global_variables_initializer()]) - feed_dict = {i: d for i, d in zip(inputs, x_train)} - feed_dict[predict] = y_train - _, outputs, state = sess.run( - [train_op, outputs, state], feed_dict) - - self.assertEqual(len(outputs), timestep) - self.assertEqual(len(outputs[0]), batch) - self.assertEqual(len(state), batch) - - def testKerasAndTFRNNLayerOutputComparison(self): - input_shape = 10 - output_shape = 5 - timestep = 4 - batch = 20 - (x_train, _), _ = get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - fix_weights_generator = tf.keras.layers.SimpleRNNCell(output_shape) - fix_weights_generator.build((None, input_shape)) - weights = fix_weights_generator.get_weights() - - with self.session(graph=tf.Graph()) as sess: - inputs = tf.placeholder( - tf.float32, shape=(None, timestep, input_shape)) - cell = tf.keras.layers.SimpleRNNCell(output_shape) - tf_out, tf_state = tf.nn.dynamic_rnn( - cell, inputs, dtype=tf.float32) - cell.set_weights(weights) - [tf_out, tf_state] = sess.run([tf_out, tf_state], {inputs: x_train}) - with self.session(graph=tf.Graph()) as sess: - k_input = tf.keras.Input(shape=(timestep, input_shape), - dtype=tf.float32) - cell = tf.keras.layers.SimpleRNNCell(output_shape) - layer = tf.keras.layers.RNN( - cell, return_sequences=True, return_state=True) - keras_out = layer(k_input) - cell.set_weights(weights) - k_out, k_state = sess.run(keras_out, {k_input: x_train}) - self.assertAllClose(tf_out, k_out) - self.assertAllClose(tf_state, k_state) - - def testSimpleRNNCellAndBasicRNNCellComparison(self): - input_shape = 10 - output_shape = 5 - timestep = 4 - batch = 20 - (x_train, _), _ = get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - fix_weights_generator = tf.keras.layers.SimpleRNNCell(output_shape) - fix_weights_generator.build((None, input_shape)) - # The SimpleRNNCell contains 3 weights: kernel, recurrent_kernel, and bias - # The BasicRNNCell contains 2 weight: kernel and bias, where kernel is - # zipped [kernel, recurrent_kernel] in SimpleRNNCell. 
- keras_weights = fix_weights_generator.get_weights() - kernel, recurrent_kernel, bias = keras_weights - tf_weights = [np.concatenate((kernel, recurrent_kernel)), bias] - - with self.session(graph=tf.Graph()) as sess: - inputs = tf.placeholder( - tf.float32, shape=(None, timestep, input_shape)) - cell = tf.keras.layers.SimpleRNNCell(output_shape) - k_out, k_state = tf.nn.dynamic_rnn( - cell, inputs, dtype=tf.float32) - cell.set_weights(keras_weights) - [k_out, k_state] = sess.run([k_out, k_state], {inputs: x_train}) - with self.session(graph=tf.Graph()) as sess: - inputs = tf.placeholder( - tf.float32, shape=(None, timestep, input_shape)) - cell = tf.nn.rnn_cell.BasicRNNCell(output_shape) - tf_out, tf_state = tf.nn.dynamic_rnn( - cell, inputs, dtype=tf.float32) - cell.set_weights(tf_weights) - [tf_out, tf_state] = sess.run([tf_out, tf_state], {inputs: x_train}) - - self.assertAllClose(tf_out, k_out, atol=1e-5) - self.assertAllClose(tf_state, k_state, atol=1e-5) - - def testRNNCellSerialization(self): - for cell in [ - tf.nn.rnn_cell.LSTMCell(32, use_peepholes=True, cell_clip=True), - tf.nn.rnn_cell.BasicLSTMCell(32, dtype=tf.float32), - tf.nn.rnn_cell.BasicRNNCell(32, activation="relu", dtype=tf.float32), - tf.nn.rnn_cell.GRUCell(32, dtype=tf.float32) - ]: - with self.cached_session(): - x = tf.keras.Input((None, 5)) - layer = tf.keras.layers.RNN(cell) - y = layer(x) - model = tf.keras.models.Model(x, y) - model.compile(optimizer="rmsprop", loss="mse") - - # Test basic case serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - # The custom_objects is important here since rnn_cell_impl is - # not visible as a Keras layer, and also has a name conflict with - # keras.LSTMCell and GRUCell. - layer = tf.keras.layers.RNN.from_config( - config, - custom_objects={ - "BasicRNNCell": tf.nn.rnn_cell.BasicRNNCell, - "GRUCell": tf.nn.rnn_cell.GRUCell, - "LSTMCell": tf.nn.rnn_cell.LSTMCell, - "BasicLSTMCell": tf.nn.rnn_cell.BasicLSTMCell - }) - y = layer(x) - model = tf.keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - def testRNNCellActsLikeKerasRNNCellInProperScope(self): - with tf.layers.experimental.keras_style_scope(): - kn1 = KerasNetworkTFRNNs(name="kn1") - kn2 = KerasNetworkKerasRNNs(name="kn2") - - z = tf.zeros((2, 3)) - - kn1(z) # pylint:disable=not-callable - kn2(z) # pylint:disable=not-callable - - # pylint: disable=protected-access - self.assertTrue(all("kn1" in v.name for v in kn1._cell.variables)) - self.assertTrue(all("kn2" in v.name for v in kn2._cell.variables)) - - with tf.layers.experimental.keras_style_scope(): - kn1_new = KerasNetworkTFRNNs(name="kn1_new") - kn2_new = KerasNetworkKerasRNNs(name="kn2_new") - - kn2_new(z) # pylint:disable=not-callable - # Most importantly, this doesn't fail due to variable scope reuse issues. 
- kn1_new(z) # pylint:disable=not-callable - - self.assertTrue(all("kn1_new" in v.name for v in kn1_new._cell.variables)) - self.assertTrue(all("kn2_new" in v.name for v in kn2_new._cell.variables)) - - -def get_test_data(train_samples, - test_samples, - input_shape, - num_classes): - num_sample = train_samples + test_samples - templates = 2 * num_classes * np.random.random((num_classes,) + input_shape) - y = np.random.randint(0, num_classes, size=(num_sample,)) - x = np.zeros((num_sample,) + input_shape, dtype=np.float32) - for i in range(num_sample): - x[i] = templates[y[i]] + np.random.normal(loc=0, scale=1., size=input_shape) - return ((x[:train_samples], y[:train_samples]), - (x[train_samples:], y[train_samples:])) + def setUp(self): + super().setUp() + self._seed = 23489 + np.random.seed(self._seed) + + def testRNNWithKerasSimpleRNNCell(self): + with self.cached_session() as sess: + input_shape = 10 + output_shape = 5 + timestep = 4 + batch = 100 + (x_train, y_train), _ = get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + y_train = tf.keras.utils.to_categorical(y_train) + cell = tf.keras.layers.SimpleRNNCell(output_shape) + + inputs = tf.placeholder( + tf.float32, shape=(None, timestep, input_shape) + ) + predict = tf.placeholder(tf.float32, shape=(None, output_shape)) + + outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual( + outputs.shape.as_list(), [None, timestep, output_shape] + ) + self.assertEqual(state.shape.as_list(), [None, output_shape]) + loss = tf.keras.losses.categorical_crossentropy(predict, state) + train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) + + sess.run([tf.global_variables_initializer()]) + _, outputs, state = sess.run( + [train_op, outputs, state], {inputs: x_train, predict: y_train} + ) + + self.assertEqual(len(outputs), batch) + self.assertEqual(len(state), batch) + + def testRNNWithKerasGRUCell(self): + with self.cached_session() as sess: + input_shape = 10 + output_shape = 5 + timestep = 4 + batch = 100 + (x_train, y_train), _ = get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + y_train = tf.keras.utils.to_categorical(y_train) + cell = tf.keras.layers.GRUCell(output_shape) + + inputs = tf.placeholder( + tf.float32, shape=(None, timestep, input_shape) + ) + predict = tf.placeholder(tf.float32, shape=(None, output_shape)) + + outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual( + outputs.shape.as_list(), [None, timestep, output_shape] + ) + self.assertEqual(state.shape.as_list(), [None, output_shape]) + loss = tf.keras.losses.categorical_crossentropy(predict, state) + train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) + + sess.run([tf.global_variables_initializer()]) + _, outputs, state = sess.run( + [train_op, outputs, state], {inputs: x_train, predict: y_train} + ) + + self.assertEqual(len(outputs), batch) + self.assertEqual(len(state), batch) + + def testRNNWithKerasLSTMCell(self): + with self.cached_session() as sess: + input_shape = 10 + output_shape = 5 + timestep = 4 + batch = 100 + (x_train, y_train), _ = get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + y_train = tf.keras.utils.to_categorical(y_train) + cell = tf.keras.layers.LSTMCell(output_shape) + + inputs = tf.placeholder( + tf.float32, shape=(None, timestep, 
input_shape) + ) + predict = tf.placeholder(tf.float32, shape=(None, output_shape)) + + outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual( + outputs.shape.as_list(), [None, timestep, output_shape] + ) + self.assertEqual(len(state), 2) + self.assertEqual(state[0].shape.as_list(), [None, output_shape]) + self.assertEqual(state[1].shape.as_list(), [None, output_shape]) + loss = tf.keras.losses.categorical_crossentropy(predict, state[0]) + train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) + + sess.run([tf.global_variables_initializer()]) + _, outputs, state = sess.run( + [train_op, outputs, state], {inputs: x_train, predict: y_train} + ) + + self.assertEqual(len(outputs), batch) + self.assertEqual(len(state), 2) + self.assertEqual(len(state[0]), batch) + self.assertEqual(len(state[1]), batch) + + def testRNNWithStackKerasCell(self): + with self.cached_session() as sess: + input_shape = 10 + output_shape = 5 + timestep = 4 + batch = 100 + (x_train, y_train), _ = get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + y_train = tf.keras.utils.to_categorical(y_train) + cell = tf.keras.layers.StackedRNNCells( + [ + tf.keras.layers.LSTMCell(2 * output_shape), + tf.keras.layers.LSTMCell(output_shape), + ] + ) + + inputs = tf.placeholder( + tf.float32, shape=(None, timestep, input_shape) + ) + predict = tf.placeholder(tf.float32, shape=(None, output_shape)) + + outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual( + outputs.shape.as_list(), [None, timestep, output_shape] + ) + self.assertEqual(len(state), 2) + state = tf.nest.flatten(state) + self.assertEqual(len(state), 4) + self.assertEqual(state[0].shape.as_list(), [None, 2 * output_shape]) + self.assertEqual(state[1].shape.as_list(), [None, 2 * output_shape]) + self.assertEqual(state[2].shape.as_list(), [None, output_shape]) + self.assertEqual(state[3].shape.as_list(), [None, output_shape]) + loss = tf.keras.losses.categorical_crossentropy(predict, state[2]) + train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) + + sess.run([tf.global_variables_initializer()]) + _, outputs, state = sess.run( + [train_op, outputs, state], {inputs: x_train, predict: y_train} + ) + + self.assertEqual(len(outputs), batch) + self.assertEqual(len(state), 4) + for s in state: + self.assertEqual(len(s), batch) + + def testStaticRNNWithKerasSimpleRNNCell(self): + with self.cached_session() as sess: + input_shape = 10 + output_shape = 5 + timestep = 4 + batch = 100 + (x_train, y_train), _ = get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + x_train = np.transpose(x_train, (1, 0, 2)) + y_train = tf.keras.utils.to_categorical(y_train) + cell = tf.keras.layers.SimpleRNNCell(output_shape) + + inputs = [ + tf.placeholder(tf.float32, shape=(None, input_shape)) + ] * timestep + predict = tf.placeholder(tf.float32, shape=(None, output_shape)) + + outputs, state = tf.nn.static_rnn(cell, inputs, dtype=tf.float32) + self.assertEqual(len(outputs), timestep) + self.assertEqual(outputs[0].shape.as_list(), [None, output_shape]) + self.assertEqual(state.shape.as_list(), [None, output_shape]) + loss = tf.keras.losses.categorical_crossentropy(predict, state) + train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss) + + sess.run([tf.global_variables_initializer()]) + feed_dict = {i: d for i, d in zip(inputs, x_train)} + 
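+            # x_train was transposed above to (timestep, batch, input), so
+            # each per-timestep placeholder in `inputs` is fed one slice of
+            # the batch.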
feed_dict[predict] = y_train + _, outputs, state = sess.run([train_op, outputs, state], feed_dict) + + self.assertEqual(len(outputs), timestep) + self.assertEqual(len(outputs[0]), batch) + self.assertEqual(len(state), batch) + + def testKerasAndTFRNNLayerOutputComparison(self): + input_shape = 10 + output_shape = 5 + timestep = 4 + batch = 20 + (x_train, _), _ = get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + fix_weights_generator = tf.keras.layers.SimpleRNNCell(output_shape) + fix_weights_generator.build((None, input_shape)) + weights = fix_weights_generator.get_weights() + + with self.session(graph=tf.Graph()) as sess: + inputs = tf.placeholder( + tf.float32, shape=(None, timestep, input_shape) + ) + cell = tf.keras.layers.SimpleRNNCell(output_shape) + tf_out, tf_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) + cell.set_weights(weights) + [tf_out, tf_state] = sess.run([tf_out, tf_state], {inputs: x_train}) + with self.session(graph=tf.Graph()) as sess: + k_input = tf.keras.Input( + shape=(timestep, input_shape), dtype=tf.float32 + ) + cell = tf.keras.layers.SimpleRNNCell(output_shape) + layer = tf.keras.layers.RNN( + cell, return_sequences=True, return_state=True + ) + keras_out = layer(k_input) + cell.set_weights(weights) + k_out, k_state = sess.run(keras_out, {k_input: x_train}) + self.assertAllClose(tf_out, k_out) + self.assertAllClose(tf_state, k_state) + + def testSimpleRNNCellAndBasicRNNCellComparison(self): + input_shape = 10 + output_shape = 5 + timestep = 4 + batch = 20 + (x_train, _), _ = get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + fix_weights_generator = tf.keras.layers.SimpleRNNCell(output_shape) + fix_weights_generator.build((None, input_shape)) + # The SimpleRNNCell contains 3 weights: kernel, recurrent_kernel, and + # bias. The BasicRNNCell contains 2 weights: kernel and bias, where the + # kernel is the concatenation of the SimpleRNNCell kernel and + # recurrent_kernel.
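+        # A minimal shape sketch of that mapping, using this test's
+        # input_shape=10 and output_shape=5:
+        #   kernel: (10, 5), recurrent_kernel: (5, 5), bias: (5,)
+        #   np.concatenate((kernel, recurrent_kernel)): (15, 5)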
+ keras_weights = fix_weights_generator.get_weights() + kernel, recurrent_kernel, bias = keras_weights + tf_weights = [np.concatenate((kernel, recurrent_kernel)), bias] + + with self.session(graph=tf.Graph()) as sess: + inputs = tf.placeholder( + tf.float32, shape=(None, timestep, input_shape) + ) + cell = tf.keras.layers.SimpleRNNCell(output_shape) + k_out, k_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) + cell.set_weights(keras_weights) + [k_out, k_state] = sess.run([k_out, k_state], {inputs: x_train}) + with self.session(graph=tf.Graph()) as sess: + inputs = tf.placeholder( + tf.float32, shape=(None, timestep, input_shape) + ) + cell = tf.nn.rnn_cell.BasicRNNCell(output_shape) + tf_out, tf_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) + cell.set_weights(tf_weights) + [tf_out, tf_state] = sess.run([tf_out, tf_state], {inputs: x_train}) + + self.assertAllClose(tf_out, k_out, atol=1e-5) + self.assertAllClose(tf_state, k_state, atol=1e-5) + + def testRNNCellSerialization(self): + for cell in [ + tf.nn.rnn_cell.LSTMCell(32, use_peepholes=True, cell_clip=True), + tf.nn.rnn_cell.BasicLSTMCell(32, dtype=tf.float32), + tf.nn.rnn_cell.BasicRNNCell( + 32, activation="relu", dtype=tf.float32 + ), + tf.nn.rnn_cell.GRUCell(32, dtype=tf.float32), + ]: + with self.cached_session(): + x = tf.keras.Input((None, 5)) + layer = tf.keras.layers.RNN(cell) + y = layer(x) + model = tf.keras.models.Model(x, y) + model.compile(optimizer="rmsprop", loss="mse") + + # Test basic case serialization. + x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + # The custom_objects is important here since rnn_cell_impl is + # not visible as a Keras layer, and also has a name conflict + # with keras.LSTMCell and GRUCell. + layer = tf.keras.layers.RNN.from_config( + config, + custom_objects={ + "BasicRNNCell": tf.nn.rnn_cell.BasicRNNCell, + "GRUCell": tf.nn.rnn_cell.GRUCell, + "LSTMCell": tf.nn.rnn_cell.LSTMCell, + "BasicLSTMCell": tf.nn.rnn_cell.BasicLSTMCell, + }, + ) + y = layer(x) + model = tf.keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + def testRNNCellActsLikeKerasRNNCellInProperScope(self): + with tf.layers.experimental.keras_style_scope(): + kn1 = KerasNetworkTFRNNs(name="kn1") + kn2 = KerasNetworkKerasRNNs(name="kn2") + + z = tf.zeros((2, 3)) + + kn1(z) + kn2(z) + + self.assertTrue(all("kn1" in v.name for v in kn1._cell.variables)) + self.assertTrue(all("kn2" in v.name for v in kn2._cell.variables)) + + with tf.layers.experimental.keras_style_scope(): + kn1_new = KerasNetworkTFRNNs(name="kn1_new") + kn2_new = KerasNetworkKerasRNNs(name="kn2_new") + + kn2_new(z) + # Most importantly, this doesn't fail due to variable scope reuse + # issues. 
+ kn1_new(z) + + self.assertTrue( + all("kn1_new" in v.name for v in kn1_new._cell.variables) + ) + self.assertTrue( + all("kn2_new" in v.name for v in kn2_new._cell.variables) + ) + + +def get_test_data(train_samples, test_samples, input_shape, num_classes): + num_sample = train_samples + test_samples + templates = 2 * num_classes * np.random.random((num_classes,) + input_shape) + y = np.random.randint(0, num_classes, size=(num_sample,)) + x = np.zeros((num_sample,) + input_shape, dtype=np.float32) + for i in range(num_sample): + x[i] = templates[y[i]] + np.random.normal( + loc=0, scale=1.0, size=input_shape + ) + return ( + (x[:train_samples], y[:train_samples]), + (x[train_samples:], y[train_samples:]), + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/integration_test/models/BUILD b/keras/integration_test/models/BUILD new file mode 100644 index 000000000000..daf1ba141adb --- /dev/null +++ b/keras/integration_test/models/BUILD @@ -0,0 +1,36 @@ +# Description: +# Contains a collection of diverse Keras models to be used for integration tests. + +# Placeholder: load unaliased py_library + +package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], + default_visibility = [ + "//keras:friends", + ], + licenses = ["notice"], +) + +py_library( + name = "models", + srcs = [ + "__init__.py", + "bert.py", + "ctc_speech_rnn.py", + "dcgan.py", + "edge_case_model.py", + "efficientnet_v2.py", + "input_spec.py", + "low_level_model.py", + "mini_unet.py", + "mini_xception.py", + "retinanet.py", + "structured_data_classification.py", + "text_classification.py", + "timeseries_forecasting.py", + "translation.py", + "vae.py", + ], + srcs_version = "PY3", + deps = ["//:expect_tensorflow_installed"], +) diff --git a/keras/wrappers/__init__.py b/keras/integration_test/models/__init__.py similarity index 100% rename from keras/wrappers/__init__.py rename to keras/integration_test/models/__init__.py diff --git a/keras/integration_test/models/bert.py b/keras/integration_test/models/bert.py new file mode 100644 index 000000000000..ea20aa041dbd --- /dev/null +++ b/keras/integration_test/models/bert.py @@ -0,0 +1,150 @@ +"""Bert model. 
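+
+Like the other modules in this directory, it exposes get_data_spec(),
+get_input_preprocessor(), get_model(), and get_custom_objects(). A minimal
+usage sketch, assuming spec_to_value from models/input_spec.py:
+
+    model = get_model(compile=True)
+    x_spec, y_spec = get_data_spec(batch_size=4)
+    model.fit(spec_to_value(x_spec), spec_to_value(y_spec))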
+ +Adapted from https://keras.io/examples/nlp/masked_language_modeling/ +""" +import numpy as np +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +SEQUENCE_LENGTH = 16 +VOCAB_SIZE = 1000 +EMBED_DIM = 64 +NUM_HEAD = 2 +FF_DIM = 32 +NUM_LAYERS = 2 + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size,), dtype="string"), + InputSpec((batch_size, SEQUENCE_LENGTH, VOCAB_SIZE)), + ) + + +def get_input_preprocessor(): + input_vectorizer = keras.layers.TextVectorization( + max_tokens=VOCAB_SIZE, + output_mode="int", + output_sequence_length=SEQUENCE_LENGTH, + ) + text_ds = tf.data.Dataset.from_tensor_slices( + [ + "Lorem ipsum dolor sit amet", + "consectetur adipiscing elit", + "sed do eiusmod tempor incididunt ut", + "labore et dolore magna aliqua.", + "Ut enim ad minim veniam", + "quis nostrud exercitation ullamco", + "laboris nisi ut aliquip ex ea commodo consequat.", + ] + ) + input_vectorizer.adapt(text_ds) + return input_vectorizer + + +def bert_module(query, key, value, i): + attention_output = keras.layers.MultiHeadAttention( + num_heads=NUM_HEAD, + key_dim=EMBED_DIM // NUM_HEAD, + )(query, key, value) + attention_output = keras.layers.Dropout(0.1)(attention_output) + attention_output = keras.layers.LayerNormalization(epsilon=1e-6)( + query + attention_output + ) + + ffn = keras.Sequential( + [ + keras.layers.Dense(FF_DIM, activation="relu"), + keras.layers.Dense(EMBED_DIM), + ], + ) + ffn_output = ffn(attention_output) + ffn_output = keras.layers.Dropout(0.1)(ffn_output) + sequence_output = keras.layers.LayerNormalization(epsilon=1e-6)( + attention_output + ffn_output + ) + return sequence_output + + +def get_pos_encoding_matrix(max_len, d_emb): + pos_enc = np.array( + [ + [pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)] + if pos != 0 + else np.zeros(d_emb) + for pos in range(max_len) + ] + ) + pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2]) + pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2]) + return pos_enc + + +loss_fn = keras.losses.CategoricalCrossentropy() +loss_tracker = keras.metrics.Mean(name="loss") + + +class MaskedLanguageModel(keras.Model): + def train_step(self, inputs): + if len(inputs) == 3: + features, labels, sample_weight = inputs + else: + features, labels = inputs + sample_weight = None + + with tf.GradientTape() as tape: + predictions = self(features, training=True) + loss = loss_fn(labels, predictions, sample_weight=sample_weight) + + trainable_vars = self.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + loss_tracker.update_state(loss, sample_weight=sample_weight) + return {"loss": loss_tracker.result()} + + @property + def metrics(self): + return [loss_tracker] + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + if include_preprocessing: + inputs = keras.layers.Input((), dtype="string") + x = get_input_preprocessor()(inputs) + else: + inputs = keras.layers.Input((SEQUENCE_LENGTH,), dtype=tf.int64) + x = inputs + word_embeddings = keras.layers.Embedding(VOCAB_SIZE, EMBED_DIM)(x) + position_embeddings = keras.layers.Embedding( + input_dim=SEQUENCE_LENGTH, + output_dim=EMBED_DIM, + weights=[get_pos_encoding_matrix(SEQUENCE_LENGTH, EMBED_DIM)], + trainable=False, + )(tf.range(start=0, limit=SEQUENCE_LENGTH, delta=1)) + embeddings = word_embeddings + position_embeddings + + encoder_output = embeddings + for i in range(NUM_LAYERS): + 
encoder_output = bert_module( + encoder_output, encoder_output, encoder_output, i + ) + + mlm_output = keras.layers.Dense( + VOCAB_SIZE, name="mlm_cls", activation="softmax" + )(encoder_output) + model = MaskedLanguageModel(inputs, mlm_output) + + if compile: + optimizer = keras.optimizers.Adam() + model.compile(optimizer=optimizer, jit_compile=jit_compile) + return model + + +def get_custom_objects(): + return { + "MaskedLanguageModel": MaskedLanguageModel, + } diff --git a/keras/integration_test/models/ctc_speech_rnn.py b/keras/integration_test/models/ctc_speech_rnn.py new file mode 100644 index 000000000000..1324581b8ed4 --- /dev/null +++ b/keras/integration_test/models/ctc_speech_rnn.py @@ -0,0 +1,100 @@ +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +TIMESTEPS = 64 +INPUT_DIM = 50 +OUTPUT_DIM = 40 +NUM_RNN_LAYERS = 2 +RNN_UNITS = 32 + + +def get_input_preprocessor(): + return None + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size, TIMESTEPS, INPUT_DIM)), + InputSpec((batch_size, 1), dtype="int64", range=[0, OUTPUT_DIM]), + ) + + +def ctc_loss(y_true, y_pred): + batch_length = tf.cast(tf.shape(y_true)[0], dtype="int64") + input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") + label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") + + input_length = input_length * tf.ones( + shape=(batch_length, 1), dtype="int64" + ) + label_length = label_length * tf.ones( + shape=(batch_length, 1), dtype="int64" + ) + + return keras.backend.ctc_batch_cost( + y_true, y_pred, input_length, label_length + ) + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + input_spectrogram = keras.layers.Input((None, INPUT_DIM), name="input") + x = keras.layers.Reshape((-1, INPUT_DIM, 1), name="expand_dim")( + input_spectrogram + ) + x = keras.layers.Conv2D( + filters=32, + kernel_size=[11, 41], + strides=[2, 2], + padding="same", + use_bias=False, + name="conv_1", + )(x) + x = keras.layers.BatchNormalization(name="conv_1_bn")(x) + x = keras.layers.ReLU(name="conv_1_relu")(x) + x = keras.layers.Conv2D( + filters=32, + kernel_size=[11, 21], + strides=[1, 2], + padding="same", + use_bias=False, + name="conv_2", + )(x) + x = keras.layers.BatchNormalization(name="conv_2_bn")(x) + x = keras.layers.ReLU(name="conv_2_relu")(x) + x = keras.layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x) + for i in range(1, NUM_RNN_LAYERS + 1): + recurrent = keras.layers.GRU( + units=RNN_UNITS, + activation="tanh", + recurrent_activation="sigmoid", + use_bias=True, + return_sequences=True, + reset_after=True, + name=f"gru_{i}", + ) + x = keras.layers.Bidirectional( + recurrent, name=f"bidirectional_{i}", merge_mode="concat" + )(x) + if i < NUM_RNN_LAYERS: + x = keras.layers.Dropout(rate=0.5)(x) + x = keras.layers.Dense(units=RNN_UNITS * 2, name="dense_1")(x) + x = keras.layers.ReLU(name="dense_1_relu")(x) + x = keras.layers.Dropout(rate=0.5)(x) + output = keras.layers.Dense(units=OUTPUT_DIM + 1, activation="softmax")(x) + model = keras.Model(input_spectrogram, output, name="DeepSpeech_2") + + if compile: + model.compile( + optimizer=keras.optimizers.Adam(learning_rate=1e-4), + loss=ctc_loss, + jit_compile=jit_compile, + ) + return model + + +def get_custom_objects(): + return {"ctc_loss": ctc_loss} diff --git a/keras/integration_test/models/dcgan.py b/keras/integration_test/models/dcgan.py new file mode 100644 index 000000000000..ec23da91b331 --- /dev/null +++ 
b/keras/integration_test/models/dcgan.py @@ -0,0 +1,179 @@ +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec +from keras.saving import serialization_lib + +IMG_SIZE = (64, 64) +LATENT_DIM = 128 + + +def get_data_spec(batch_size): + return InputSpec((batch_size,) + IMG_SIZE + (3,)) + + +def get_input_preprocessor(): + return None + + +class GAN(keras.Model): + def __init__(self, discriminator, generator, latent_dim): + super(GAN, self).__init__() + self.discriminator = discriminator + self.generator = generator + self.latent_dim = latent_dim + + def compile(self, d_optimizer, g_optimizer, loss_fn, jit_compile=False): + super(GAN, self).compile(jit_compile=jit_compile) + self.d_optimizer = d_optimizer + self.g_optimizer = g_optimizer + self.loss_fn = loss_fn + self.d_loss_metric = keras.metrics.Mean(name="d_loss") + self.g_loss_metric = keras.metrics.Mean(name="g_loss") + + @property + def metrics(self): + return [self.d_loss_metric, self.g_loss_metric] + + def train_step(self, real_images): + batch_size = tf.shape(real_images)[0] + random_latent_vectors = tf.random.normal( + shape=(batch_size, self.latent_dim) + ) + generated_images = self.generator(random_latent_vectors) + combined_images = tf.concat([generated_images, real_images], axis=0) + labels = tf.concat( + [tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0 + ) + labels += 0.05 * tf.random.uniform(tf.shape(labels)) + + with tf.GradientTape() as tape: + predictions = self.discriminator(combined_images) + d_loss = self.loss_fn(labels, predictions) + grads = tape.gradient(d_loss, self.discriminator.trainable_weights) + self.d_optimizer.apply_gradients( + zip(grads, self.discriminator.trainable_weights) + ) + + random_latent_vectors = tf.random.normal( + shape=(batch_size, self.latent_dim) + ) + misleading_labels = tf.zeros((batch_size, 1)) + + with tf.GradientTape() as tape: + predictions = self.discriminator( + self.generator(random_latent_vectors) + ) + g_loss = self.loss_fn(misleading_labels, predictions) + grads = tape.gradient(g_loss, self.generator.trainable_weights) + self.g_optimizer.apply_gradients( + zip(grads, self.generator.trainable_weights) + ) + self.d_loss_metric.update_state(d_loss) + self.g_loss_metric.update_state(g_loss) + return { + "d_loss": self.d_loss_metric.result(), + "g_loss": self.g_loss_metric.result(), + } + + def get_config(self): + return { + "discriminator": self.discriminator, + "generator": self.generator, + "latent_dim": self.latent_dim, + } + + @classmethod + def from_config(cls, config): + discriminator = serialization_lib.deserialize_keras_object( + config["discriminator"] + ) + generator = serialization_lib.deserialize_keras_object( + config["generator"] + ) + latent_dim = config["latent_dim"] + return cls(discriminator, generator, latent_dim) + + def get_compile_config(self): + return { + "loss_fn": self.loss_fn, + "d_optimizer": self.d_optimizer, + "g_optimizer": self.g_optimizer, + "jit_compile": self.jit_compile, + } + + def compile_from_config(self, config): + loss_fn = serialization_lib.deserialize_keras_object(config["loss_fn"]) + d_optimizer = serialization_lib.deserialize_keras_object( + config["d_optimizer"] + ) + g_optimizer = serialization_lib.deserialize_keras_object( + config["g_optimizer"] + ) + jit_compile = config["jit_compile"] + self.compile( + loss_fn=loss_fn, + d_optimizer=d_optimizer, + g_optimizer=g_optimizer, + jit_compile=jit_compile, + ) + + +def get_model( + build=False, compile=False, 
jit_compile=False, include_preprocessing=True +): + discriminator = keras.Sequential( + [ + keras.Input(shape=IMG_SIZE + (3,)), + keras.layers.Conv2D(64, kernel_size=4, strides=2, padding="same"), + keras.layers.LeakyReLU(alpha=0.2), + keras.layers.Conv2D(128, kernel_size=4, strides=2, padding="same"), + keras.layers.LeakyReLU(alpha=0.2), + keras.layers.Conv2D(128, kernel_size=4, strides=2, padding="same"), + keras.layers.LeakyReLU(alpha=0.2), + keras.layers.Flatten(), + keras.layers.Dropout(0.2), + keras.layers.Dense(1, activation="sigmoid"), + ], + name="discriminator", + ) + + generator = keras.Sequential( + [ + keras.Input(shape=(LATENT_DIM,)), + keras.layers.Dense(8 * 8 * 128), + keras.layers.Reshape((8, 8, 128)), + keras.layers.Conv2DTranspose( + 128, kernel_size=4, strides=2, padding="same" + ), + keras.layers.LeakyReLU(alpha=0.2), + keras.layers.Conv2DTranspose( + 256, kernel_size=4, strides=2, padding="same" + ), + keras.layers.LeakyReLU(alpha=0.2), + keras.layers.Conv2DTranspose( + 512, kernel_size=4, strides=2, padding="same" + ), + keras.layers.LeakyReLU(alpha=0.2), + keras.layers.Conv2D( + 3, kernel_size=5, padding="same", activation="sigmoid" + ), + ], + name="generator", + ) + + gan = GAN( + discriminator=discriminator, generator=generator, latent_dim=LATENT_DIM + ) + if compile: + gan.compile( + d_optimizer=keras.optimizers.Adam(learning_rate=0.0001), + g_optimizer=keras.optimizers.Adam(learning_rate=0.0001), + loss_fn=keras.losses.BinaryCrossentropy(), + jit_compile=jit_compile, + ) + return gan + + +def get_custom_objects(): + return {"GAN": GAN} diff --git a/keras/integration_test/models/edge_case_model.py b/keras/integration_test/models/edge_case_model.py new file mode 100644 index 000000000000..0fd8d1670424 --- /dev/null +++ b/keras/integration_test/models/edge_case_model.py @@ -0,0 +1,155 @@ +"""Model that incorporates a set of edge case development patterns. 
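+
+Covered patterns:
+
+- a layer whose call() takes two inputs (LinearA)
+- a layer tracking its weights in a dict attribute (LinearB)
+- a layer creating its weights lazily in call() (LinearC)
+- a layer with distinct training/inference behavior and non-trainable
+  updates (BatchNorm)
+- a Functional graph built inside a Model subclass constructor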
+""" + +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +INPUT_DIM = 32 +NUM_CLASSES = 5 + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size, INPUT_DIM)), + InputSpec((batch_size, NUM_CLASSES)), + ) + + +def get_input_preprocessor(): + return None + + +class LinearA(keras.layers.Layer): + """Standard custom layer with 2 call() inputs.""" + + def __init__(self, units=32, input_dim=32): + super().__init__() + self.w = self.add_weight( + shape=(input_dim, units), + initializer="random_normal", + trainable=True, + ) + self.b = self.add_weight( + shape=(units,), initializer="zeros", trainable=True + ) + + def call(self, inputs_1, inputs_2): + return ( + tf.matmul(inputs_1, self.w) + tf.matmul(inputs_2, self.w) + self.b + ) + + +class LinearB(keras.layers.Layer): + """Layer that tracks weights in a dict attribute that gets updated later.""" + + def __init__(self, units=32, input_dim=32, **kwargs): + super().__init__(**kwargs) + w_init = tf.random_normal_initializer() + b_init = tf.zeros_initializer() + self.state = { + "kernel": tf.Variable( + initial_value=w_init(shape=(input_dim, units), dtype="float32"), + trainable=True, + name="kernel", + ) + } + self.state["bias"] = tf.Variable( + initial_value=b_init(shape=(units,), dtype="float32"), + trainable=True, + name="bias", + ) + + def call(self, inputs): + return tf.matmul(inputs, self.state["kernel"]) + self.state["bias"] + + +class LinearC(keras.layers.Layer): + """Layer that creates weights in call().""" + + def __init__(self, units=32, input_dim=32, **kwargs): + super().__init__(**kwargs) + self._custom_built = False + self.units = units + self.input_dim = input_dim + + def call(self, inputs): + if not self._custom_built: + self.w = self.add_weight( + shape=(self.input_dim, self.units), + initializer="random_normal", + trainable=True, + ) + self.b = self.add_weight( + shape=(self.units,), initializer="zeros", trainable=True + ) + self._custom_built = True + return tf.matmul(inputs, self.w) + self.b + + +class BatchNorm(keras.layers.Layer): + """Layer with different training/test behavior and non-trainable updates.""" + + def __init__( + self, scale=True, center=True, epsilon=1e-6, momentum=0.9, **kwargs + ): + super().__init__(**kwargs) + self.scale = scale + self.center = center + self.epsilon = epsilon + self.momentum = momentum + + def build(self, input_shape): + self.var = self.add_weight( + shape=[input_shape[1]], initializer="ones", trainable=False + ) + self.mean = self.add_weight( + shape=[input_shape[1]], initializer="zeros", trainable=False + ) + self.gamma = self.add_weight(shape=[input_shape[1]], initializer="ones") + self.beta = self.add_weight(shape=[input_shape[1]], initializer="zeros") + + def call(self, inputs, training=False): + if training: + mean, var = tf.nn.moments(inputs, axes=[0]) + outputs = (inputs - mean) / (var + self.epsilon) + self.var.assign(self.var * self.momentum + var * 0.1) + self.mean.assign(self.mean * self.momentum + mean * 0.1) + else: + outputs = (inputs - self.mean) / (self.var + self.epsilon) + if self.scale: + outputs *= self.gamma + if self.center: + outputs += self.beta + return outputs + + +class FunctionalSubclassModel(keras.Model): + def __init__(self, **kwargs): + inputs = keras.Input((INPUT_DIM,)) + x = inputs + x = LinearA(32, INPUT_DIM)(x, x) + x = LinearB(32, 32)(x) + x = LinearC(32, 32)(x) + x = BatchNorm()(x) + outputs = keras.layers.Dense(NUM_CLASSES, activation="softmax")(x) + 
super().__init__(inputs, outputs, **kwargs) + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + model = FunctionalSubclassModel() + if compile: + model.compile("rmsprop", "mse", jit_compile=jit_compile) + return model + + +def get_custom_objects(): + return { + "LinearA": LinearA, + "LinearB": LinearB, + "LinearC": LinearC, + "BatchNorm": BatchNorm, + } diff --git a/keras/integration_test/models/efficientnet_v2.py b/keras/integration_test/models/efficientnet_v2.py new file mode 100644 index 000000000000..68e392671908 --- /dev/null +++ b/keras/integration_test/models/efficientnet_v2.py @@ -0,0 +1,315 @@ +"""Image classification with EfficientNetV2 architecture. + +Adapted from the EfficientNetV2 Keras Application. +""" +import math + +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +IMG_SIZE = (96, 96) +NUM_CLASSES = 5 + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size,) + IMG_SIZE + (3,)), + InputSpec((batch_size, NUM_CLASSES)), + ) + + +def get_input_preprocessor(): + return keras.layers.Rescaling(scale=1.0 / 128.0, offset=-1) + + +def round_filters(filters, width_coefficient, min_depth, depth_divisor): + filters *= width_coefficient + minimum_depth = min_depth or depth_divisor + new_filters = max( + minimum_depth, + int(filters + depth_divisor / 2) // depth_divisor * depth_divisor, + ) + return int(new_filters) + + +def MBConvBlock( + input_filters: int, + output_filters: int, + expand_ratio=1, + kernel_size=3, + strides=1, + se_ratio=0.0, + activation="swish", + survival_probability: float = 0.8, +): + def apply(inputs): + filters = input_filters * expand_ratio + if expand_ratio != 1: + x = keras.layers.Conv2D( + filters=filters, + kernel_size=1, + strides=1, + padding="same", + data_format="channels_last", + use_bias=False, + )(inputs) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation(activation)(x) + else: + x = inputs + + x = keras.layers.DepthwiseConv2D( + kernel_size=kernel_size, + strides=strides, + padding="same", + data_format="channels_last", + use_bias=False, + )(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation(activation)(x) + + if 0 < se_ratio <= 1: + filters_se = max(1, int(input_filters * se_ratio)) + se = keras.layers.GlobalAveragePooling2D()(x) + se = keras.layers.Reshape((1, 1, filters))(se) + se = keras.layers.Conv2D( + filters_se, + 1, + padding="same", + activation=activation, + )(se) + se = keras.layers.Conv2D( + filters, + 1, + padding="same", + activation="sigmoid", + )(se) + x = keras.layers.multiply([x, se]) + x = keras.layers.Conv2D( + filters=output_filters, + kernel_size=1, + strides=1, + padding="same", + data_format="channels_last", + use_bias=False, + )(x) + x = keras.layers.BatchNormalization()(x) + + if strides == 1 and input_filters == output_filters: + if survival_probability: + x = keras.layers.Dropout( + survival_probability, + noise_shape=(None, 1, 1, 1), + )(x) + x = keras.layers.add([x, inputs]) + return x + + return apply + + +def FusedMBConvBlock( + input_filters: int, + output_filters: int, + expand_ratio=1, + kernel_size=3, + strides=1, + se_ratio=0.0, + activation="swish", + survival_probability: float = 0.8, +): + def apply(inputs): + filters = input_filters * expand_ratio + if expand_ratio != 1: + x = keras.layers.Conv2D( + filters, + kernel_size=kernel_size, + strides=strides, + data_format="channels_last", + padding="same", + use_bias=False, + )(inputs) + x = 
keras.layers.BatchNormalization()(x) + x = keras.layers.Activation(activation)(x) + else: + x = inputs + + if 0 < se_ratio <= 1: + filters_se = max(1, int(input_filters * se_ratio)) + se = keras.layers.GlobalAveragePooling2D()(x) + se = keras.layers.Reshape((1, 1, filters))(se) + se = keras.layers.Conv2D( + filters_se, + 1, + padding="same", + activation=activation, + )(se) + se = keras.layers.Conv2D( + filters, + 1, + padding="same", + activation="sigmoid", + )(se) + x = keras.layers.multiply([x, se]) + + x = keras.layers.Conv2D( + output_filters, + kernel_size=1 if expand_ratio != 1 else kernel_size, + strides=1 if expand_ratio != 1 else strides, + padding="same", + use_bias=False, + )(x) + x = keras.layers.BatchNormalization()(x) + + if expand_ratio == 1: + x = keras.layers.Activation(activation)(x) + + if strides == 1 and input_filters == output_filters: + if survival_probability: + x = keras.layers.Dropout( + survival_probability, + noise_shape=(None, 1, 1, 1), + )(x) + x = keras.layers.add([x, inputs]) + + return x + + return apply + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + width_coefficient = 1.0 + depth_coefficient = 1.0 + dropout_rate = 0.2 + drop_connect_rate = 0.2 + depth_divisor = 8 + min_depth = 8 + activation = "swish" + blocks_args = [ + { + "kernel_size": 3, + "num_repeat": 2, + "input_filters": 24, + "output_filters": 24, + "expand_ratio": 1, + "se_ratio": 0.0, + "strides": 1, + "conv_type": 1, + }, + { + "kernel_size": 3, + "num_repeat": 4, + "input_filters": 24, + "output_filters": 48, + "expand_ratio": 4, + "se_ratio": 0.0, + "strides": 2, + "conv_type": 1, + }, + { + "conv_type": 1, + "expand_ratio": 4, + "input_filters": 48, + "kernel_size": 3, + "num_repeat": 4, + "output_filters": 64, + "se_ratio": 0, + "strides": 2, + }, + { + "conv_type": 0, + "expand_ratio": 4, + "input_filters": 64, + "kernel_size": 3, + "num_repeat": 6, + "output_filters": 128, + "se_ratio": 0.25, + "strides": 2, + }, + ] + + inputs = keras.layers.Input(shape=IMG_SIZE + (3,)) + if include_preprocessing: + x = get_input_preprocessor()(inputs) + else: + x = inputs + + stem_filters = round_filters( + filters=blocks_args[0]["input_filters"], + width_coefficient=width_coefficient, + min_depth=min_depth, + depth_divisor=depth_divisor, + ) + x = keras.layers.Conv2D( + filters=stem_filters, + kernel_size=3, + strides=2, + padding="same", + use_bias=False, + )(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation(activation, name="stem_activation")(x) + + b = 0 + blocks = float(sum(args["num_repeat"] for args in blocks_args)) + for _, args in enumerate(blocks_args): + args["input_filters"] = round_filters( + filters=args["input_filters"], + width_coefficient=width_coefficient, + min_depth=min_depth, + depth_divisor=depth_divisor, + ) + args["output_filters"] = round_filters( + filters=args["output_filters"], + width_coefficient=width_coefficient, + min_depth=min_depth, + depth_divisor=depth_divisor, + ) + block = {0: MBConvBlock, 1: FusedMBConvBlock}[args.pop("conv_type")] + repeats = int(math.ceil(depth_coefficient * args.pop("num_repeat"))) + for j in range(repeats): + if j > 0: + args["strides"] = 1 + args["input_filters"] = args["output_filters"] + + x = block( + activation=activation, + survival_probability=drop_connect_rate * b / blocks, + **args, + )(x) + b += 1 + + top_filters = round_filters( + filters=1280, + width_coefficient=width_coefficient, + min_depth=min_depth, + depth_divisor=depth_divisor, + ) + x = 
keras.layers.Conv2D( + filters=top_filters, + kernel_size=1, + strides=1, + padding="same", + data_format="channels_last", + use_bias=False, + )(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation(activation=activation, name="top_activation")(x) + x = keras.layers.GlobalAveragePooling2D(name="avg_pool")(x) + x = keras.layers.Dropout(dropout_rate, name="top_dropout")(x) + x = keras.layers.Dense( + NUM_CLASSES, + activation="softmax", + )(x) + model = keras.Model(inputs, x) + if compile: + model.compile( + "adam", loss="categorical_crossentropy", jit_compile=jit_compile + ) + return model + + +def get_custom_objects(): + return {} diff --git a/keras/integration_test/models/input_spec.py b/keras/integration_test/models/input_spec.py new file mode 100644 index 000000000000..5805fcbbc108 --- /dev/null +++ b/keras/integration_test/models/input_spec.py @@ -0,0 +1,24 @@ +"""Class to specify an input's shape/dtype/value range. +""" + +import tensorflow as tf + + +class InputSpec: + def __init__(self, shape, dtype="float32", range=None): + self.shape = shape + self.dtype = dtype + self.range = range + + +def spec_to_value(spec): + shape = spec.shape + dtype = spec.dtype + rg = spec.range or [0, 1] + if dtype == "string": + return tf.constant( + ["some string" for _ in range(shape[0])], dtype="string" + ) + return tf.random.stateless_uniform( + shape, seed=[123, 1], minval=rg[0], maxval=rg[1], dtype=dtype + ) diff --git a/keras/integration_test/models/low_level_model.py b/keras/integration_test/models/low_level_model.py new file mode 100644 index 000000000000..1bf03bbab4eb --- /dev/null +++ b/keras/integration_test/models/low_level_model.py @@ -0,0 +1,162 @@ +"""Model where almost everything is implemented from scratch. + +- Custom layers +- Custom model subclass +- Custom train_step and test_step +- Custom compile() +- Custom learning rate schedule +- Custom metrics +""" + +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +INPUT_DIM = 32 +NUM_CLASSES = 5 + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size, INPUT_DIM)), + InputSpec((batch_size, NUM_CLASSES)), + ) + + +def get_input_preprocessor(): + return None + + +class Linear(keras.layers.Layer): + def __init__(self, units=32, name=None): + super().__init__(name=name) + self.units = units + + def build(self, input_shape): + self.w = self.add_weight( + shape=(input_shape[-1], self.units), + initializer="random_normal", + trainable=True, + name="w", + ) + self.b = self.add_weight( + shape=(self.units,), + initializer="random_normal", + trainable=True, + name="b", + ) + + def call(self, inputs): + return tf.matmul(inputs, self.w) + self.b + + +class BinaryTruePositives(tf.keras.metrics.Metric): + def __init__(self, name="binary_true_positives", **kwargs): + super().__init__(name=name, **kwargs) + self.true_positives = self.add_weight(name="tp", initializer="zeros") + + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.cast(y_true, tf.bool) + y_pred = tf.cast(y_pred, tf.bool) + + values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True)) + values = tf.cast(values, self.dtype) + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, self.dtype) + values = tf.multiply(values, sample_weight) + self.true_positives.assign_add(tf.reduce_sum(values)) + + def result(self): + return self.true_positives + + def reset_state(self): + self.true_positives.assign(0) + + +class 
CustomModel(keras.Model): + def __init__(self): + super().__init__() + self.loss_tracker = keras.metrics.Mean(name="loss") + self.btp_metric = BinaryTruePositives(name="mae") + + self.linear_1 = Linear(32, name="linear_1") + self.linear_2 = Linear(NUM_CLASSES, name="linear_2") + + def call(self, inputs, training=False): + x = self.linear_1(inputs) + x = self.linear_2(x) + return x + + def train_step(self, data): + x, y = data + with tf.GradientTape() as tape: + y_pred = self(x, training=True) + loss = keras.losses.mean_squared_error(y, y_pred) + + trainable_vars = self.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + self.loss_tracker.update_state(loss) + self.btp_metric.update_state(y, y_pred) + return { + "loss": self.loss_tracker.result(), + "btp": self.btp_metric.result(), + } + + def test_step(self, data): + x, y = data + y_pred = self(x, training=True) + loss = keras.losses.mean_squared_error(y, y_pred) + self.loss_tracker.update_state(loss) + self.btp_metric.update_state(y, y_pred) + return { + "loss": self.loss_tracker.result(), + "btp": self.btp_metric.result(), + } + + @property + def metrics(self): + return [self.loss_tracker, self.btp_metric] + + +class CustomLRSchedule(keras.optimizers.schedules.LearningRateSchedule): + def __init__(self, initial_learning_rate): + self.initial_learning_rate = initial_learning_rate + + def __call__(self, step): + return self.initial_learning_rate / tf.cast(step + 1, "float32") + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + } + + +def custom_loss(y_true, y_pred): + return keras.losses.mse(y_true, y_pred) + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + model = CustomModel() + if build: + model(tf.zeros((1, INPUT_DIM))) + if compile: + model.compile( + optimizer=keras.optimizers.Adam(CustomLRSchedule(0.1)), + loss=custom_loss, + jit_compile=jit_compile, + ) + return model + + +def get_custom_objects(): + return { + "Linear": Linear, + "CustomLRSchedule": CustomLRSchedule, + "CustomModel": CustomModel, + "BinaryTruePositives": BinaryTruePositives, + "custom_loss": custom_loss, + } diff --git a/keras/integration_test/models/mini_unet.py b/keras/integration_test/models/mini_unet.py new file mode 100644 index 000000000000..c44662b3f1a8 --- /dev/null +++ b/keras/integration_test/models/mini_unet.py @@ -0,0 +1,80 @@ +"""Segmentation model. 
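+
+A small U-Net-style network: a separable-conv downsampling stack and a
+transposed-conv upsampling stack, joined by residual projections.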
+ +Adapted from https://keras.io/examples/vision/oxford_pets_image_segmentation/ +""" +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +IMG_SIZE = (224, 224) +NUM_CLASSES = 5 + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size,) + IMG_SIZE + (3,)), + InputSpec((batch_size,) + IMG_SIZE + (NUM_CLASSES,)), + ) + + +def get_input_preprocessor(): + return None + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + inputs = keras.Input(shape=IMG_SIZE + (3,)) + x = keras.layers.Conv2D(32, 3, strides=2, padding="same")(inputs) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation("relu")(x) + + previous_block_activation = x + for filters in [64, 128, 256]: + x = keras.layers.Activation("relu")(x) + x = keras.layers.SeparableConv2D(filters, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + + x = keras.layers.Activation("relu")(x) + x = keras.layers.SeparableConv2D(filters, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + + x = keras.layers.MaxPooling2D(3, strides=2, padding="same")(x) + + residual = keras.layers.Conv2D(filters, 1, strides=2, padding="same")( + previous_block_activation + ) + x = keras.layers.add([x, residual]) + previous_block_activation = x + + for filters in [256, 128, 64, 32]: + x = keras.layers.Activation("relu")(x) + x = keras.layers.Conv2DTranspose(filters, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + + x = keras.layers.Activation("relu")(x) + x = keras.layers.Conv2DTranspose(filters, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + + x = keras.layers.UpSampling2D(2)(x) + + residual = keras.layers.UpSampling2D(2)(previous_block_activation) + residual = keras.layers.Conv2D(filters, 1, padding="same")(residual) + x = keras.layers.add([x, residual]) + previous_block_activation = x + + outputs = keras.layers.Conv2D( + NUM_CLASSES, 3, activation="softmax", padding="same" + )(x) + model = keras.Model(inputs, outputs) + if compile: + model.compile( + optimizer="rmsprop", + loss="categorical_crossentropy", + jit_compile=jit_compile, + ) + return model + + +def get_custom_objects(): + return {} diff --git a/keras/integration_test/models/mini_xception.py b/keras/integration_test/models/mini_xception.py new file mode 100644 index 000000000000..456e53390c53 --- /dev/null +++ b/keras/integration_test/models/mini_xception.py @@ -0,0 +1,84 @@ +"""Mini-Xception classification model. 
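+
+A downscaled Xception: stacks of separable convolutions with residual
+connections, plus an optional augmentation-and-rescaling preprocessor.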
+ +Adapted from https://keras.io/examples/vision/image_classification_from_scratch/ +""" +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +IMG_SIZE = (120, 120) + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size,) + IMG_SIZE + (3,)), + InputSpec((batch_size, 1), dtype="int32", range=[0, 2]), + ) + + +def get_input_preprocessor(): + return keras.Sequential( + [ + keras.layers.RandomFlip(), + keras.layers.RandomRotation(0.2), + keras.layers.RandomZoom(0.2), + keras.layers.Rescaling(1.0 / 255), + ] + ) + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + inputs = keras.Input(shape=IMG_SIZE + (3,)) + + if include_preprocessing: + x = get_input_preprocessor()(inputs) + else: + x = inputs + + x = keras.layers.Conv2D(32, 3, strides=2, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation("relu")(x) + + x = keras.layers.Conv2D(64, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation("relu")(x) + + previous_block_activation = x + + for size in [128, 256, 512, 728]: + x = keras.layers.Activation("relu")(x) + x = keras.layers.SeparableConv2D(size, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation("relu")(x) + x = keras.layers.SeparableConv2D(size, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.MaxPooling2D(3, strides=2, padding="same")(x) + + residual = keras.layers.Conv2D(size, 1, strides=2, padding="same")( + previous_block_activation + ) + x = keras.layers.add([x, residual]) + previous_block_activation = x + + x = keras.layers.SeparableConv2D(1024, 3, padding="same")(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation("relu")(x) + + x = keras.layers.GlobalAveragePooling2D()(x) + x = keras.layers.Dropout(0.5)(x) + outputs = keras.layers.Dense(1, activation="sigmoid")(x) + model = keras.Model(inputs, outputs) + if compile: + model.compile( + optimizer="adam", + loss="binary_crossentropy", + metrics=["accuracy"], + jit_compile=jit_compile, + ) + return model + + +def get_custom_objects(): + return {} diff --git a/keras/integration_test/models/retinanet.py b/keras/integration_test/models/retinanet.py new file mode 100644 index 000000000000..188fc3e9947a --- /dev/null +++ b/keras/integration_test/models/retinanet.py @@ -0,0 +1,260 @@ +"""RetinaNet object detection model. 
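+
+A ResNet50 backbone feeding a feature pyramid with shared classification
+and box-regression heads, trained with a focal classification loss and a
+smooth-L1 box loss.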
+ +Adapted from https://keras.io/examples/vision/retinanet/ +""" +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec +from keras.saving import serialization_lib + +NUM_CLASSES = 10 +IMG_SIZE = (224, 224) + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size,) + IMG_SIZE + (3,)), + InputSpec((batch_size, 9441, 5)), + ) + + +def get_input_preprocessor(): + return None + + +def get_backbone(): + backbone = keras.applications.ResNet50( + include_top=False, + input_shape=[None, None, 3], + weights=None, + ) + c3_output, c4_output, c5_output = [ + backbone.get_layer(layer_name).output + for layer_name in [ + "conv3_block4_out", + "conv4_block6_out", + "conv5_block3_out", + ] + ] + return keras.Model( + inputs=[backbone.inputs], outputs=[c3_output, c4_output, c5_output] + ) + + +class FeaturePyramid(keras.layers.Layer): + def __init__(self, backbone=None, **kwargs): + super().__init__(name="FeaturePyramid", **kwargs) + self.backbone = backbone if backbone else get_backbone() + self.conv_c3_1x1 = keras.layers.Conv2D(256, 1, 1, "same") + self.conv_c4_1x1 = keras.layers.Conv2D(256, 1, 1, "same") + self.conv_c5_1x1 = keras.layers.Conv2D(256, 1, 1, "same") + self.conv_c3_3x3 = keras.layers.Conv2D(256, 3, 1, "same") + self.conv_c4_3x3 = keras.layers.Conv2D(256, 3, 1, "same") + self.conv_c5_3x3 = keras.layers.Conv2D(256, 3, 1, "same") + self.conv_c6_3x3 = keras.layers.Conv2D(256, 3, 2, "same") + self.conv_c7_3x3 = keras.layers.Conv2D(256, 3, 2, "same") + self.upsample_2x = keras.layers.UpSampling2D(2) + + def call(self, images, training=False): + c3_output, c4_output, c5_output = self.backbone( + images, training=training + ) + p3_output = self.conv_c3_1x1(c3_output) + p4_output = self.conv_c4_1x1(c4_output) + p5_output = self.conv_c5_1x1(c5_output) + p4_output = p4_output + self.upsample_2x(p5_output) + p3_output = p3_output + self.upsample_2x(p4_output) + p3_output = self.conv_c3_3x3(p3_output) + p4_output = self.conv_c4_3x3(p4_output) + p5_output = self.conv_c5_3x3(p5_output) + p6_output = self.conv_c6_3x3(c5_output) + p7_output = self.conv_c7_3x3(tf.nn.relu(p6_output)) + return p3_output, p4_output, p5_output, p6_output, p7_output + + +def build_head(output_filters, bias_init): + head = keras.Sequential([keras.Input(shape=[None, None, 256])]) + kernel_init = tf.initializers.RandomNormal(0.0, 0.01) + for _ in range(4): + head.add( + keras.layers.Conv2D( + 256, 3, padding="same", kernel_initializer=kernel_init + ) + ) + head.add(keras.layers.ReLU()) + head.add( + keras.layers.Conv2D( + output_filters, + 3, + 1, + padding="same", + kernel_initializer=kernel_init, + bias_initializer=bias_init, + ) + ) + return head + + +class RetinaNet(keras.Model): + def __init__(self, num_classes, backbone=None, **kwargs): + super().__init__(name="RetinaNet", **kwargs) + self.fpn = FeaturePyramid(backbone) + self.num_classes = num_classes + + prior_probability = keras.initializers.Constant( + -tf.math.log((1 - 0.01) / 0.01) + ) + self.cls_head = build_head(9 * num_classes, prior_probability) + self.box_head = build_head(9 * 4, "zeros") + + def call(self, image, training=False): + features = self.fpn(image, training=training) + N = tf.shape(image)[0] + cls_outputs = [] + box_outputs = [] + for feature in features: + box_outputs.append(tf.reshape(self.box_head(feature), [N, -1, 4])) + cls_outputs.append( + tf.reshape(self.cls_head(feature), [N, -1, self.num_classes]) + ) + cls_outputs = tf.concat(cls_outputs, axis=1) + box_outputs = 
tf.concat(box_outputs, axis=1) + return tf.concat([box_outputs, cls_outputs], axis=-1) + + def get_config(self): + return { + "num_classes": self.num_classes, + "backbone": self.fpn.backbone, + } + + @classmethod + def from_config(cls, config): + backbone = serialization_lib.deserialize_keras_object( + config.pop("backbone") + ) + num_classes = config["num_classes"] + retinanet = cls(num_classes=num_classes, backbone=backbone) + retinanet(tf.zeros((1, 32, 32, 3))) # Build model + return retinanet + + +class RetinaNetBoxLoss(keras.losses.Loss): + def __init__(self, delta): + super().__init__(reduction="none", name="RetinaNetBoxLoss") + self._delta = delta + + def call(self, y_true, y_pred): + difference = y_true - y_pred + absolute_difference = tf.abs(difference) + squared_difference = difference**2 + loss = tf.where( + tf.less(absolute_difference, self._delta), + 0.5 * squared_difference, + absolute_difference - 0.5, + ) + return tf.reduce_sum(loss, axis=-1) + + def get_config(self): + return {"delta": self._delta} + + +class RetinaNetClassificationLoss(keras.losses.Loss): + def __init__(self, alpha, gamma): + super().__init__(reduction="none", name="RetinaNetClassificationLoss") + self._alpha = alpha + self._gamma = gamma + + def call(self, y_true, y_pred): + cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits( + labels=y_true, logits=y_pred + ) + probs = tf.nn.sigmoid(y_pred) + alpha = tf.where( + tf.equal(y_true, 1.0), self._alpha, (1.0 - self._alpha) + ) + pt = tf.where(tf.equal(y_true, 1.0), probs, 1 - probs) + loss = alpha * tf.pow(1.0 - pt, self._gamma) * cross_entropy + return tf.reduce_sum(loss, axis=-1) + + def get_config(self): + return {"alpha": self._alpha, "gamma": self._gamma} + + +class RetinaNetLoss(keras.losses.Loss): + def __init__(self, num_classes=80, alpha=0.25, gamma=2.0, delta=1.0): + super().__init__(reduction="auto", name="RetinaNetLoss") + self._clf_loss = RetinaNetClassificationLoss(alpha, gamma) + self._box_loss = RetinaNetBoxLoss(delta) + self._num_classes = num_classes + self._alpha = alpha + self._gamma = gamma + self._delta = delta + + def call(self, y_true, y_pred): + y_pred = tf.cast(y_pred, dtype=tf.float32) + box_labels = y_true[:, :, :4] + box_predictions = y_pred[:, :, :4] + cls_labels = tf.one_hot( + tf.cast(y_true[:, :, 4], dtype=tf.int32), + depth=self._num_classes, + dtype=tf.float32, + ) + cls_predictions = y_pred[:, :, 4:] + positive_mask = tf.cast( + tf.greater(y_true[:, :, 4], -1.0), dtype=tf.float32 + ) + ignore_mask = tf.cast(tf.equal(y_true[:, :, 4], -2.0), dtype=tf.float32) + clf_loss = self._clf_loss(cls_labels, cls_predictions) + box_loss = self._box_loss(box_labels, box_predictions) + clf_loss = tf.where(tf.equal(ignore_mask, 1.0), 0.0, clf_loss) + box_loss = tf.where(tf.equal(positive_mask, 1.0), box_loss, 0.0) + normalizer = tf.reduce_sum(positive_mask, axis=-1) + clf_loss = tf.math.divide_no_nan( + tf.reduce_sum(clf_loss, axis=-1), normalizer + ) + box_loss = tf.math.divide_no_nan( + tf.reduce_sum(box_loss, axis=-1), normalizer + ) + loss = clf_loss + box_loss + return loss + + def get_config(self): + return { + "num_classes": self._num_classes, + "alpha": self._alpha, + "gamma": self._gamma, + "delta": self._delta, + } + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + resnet50_backbone = get_backbone() + loss_fn = RetinaNetLoss(NUM_CLASSES) + model = RetinaNet(NUM_CLASSES, resnet50_backbone) + + if compile: + learning_rates = [2.5e-06, 0.000625, 0.00125, 0.0025, 0.00025, 
2.5e-05] + learning_rate_boundaries = [125, 250, 500, 240000, 360000] + learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay( + boundaries=learning_rate_boundaries, values=learning_rates + ) + optimizer = keras.optimizers.SGD( + learning_rate=learning_rate_fn, momentum=0.9 + ) + model.compile( + loss=loss_fn, optimizer=optimizer, jit_compile=jit_compile + ) + return model + + +def get_custom_objects(): + return { + "RetinaNetLoss": RetinaNetLoss, + "RetinaNetClassificationLoss": RetinaNetClassificationLoss, + "RetinaNetBoxLoss": RetinaNetBoxLoss, + "RetinaNet": RetinaNet, + "FeaturePyramid": FeaturePyramid, + } diff --git a/keras/integration_test/models/structured_data_classification.py b/keras/integration_test/models/structured_data_classification.py new file mode 100644 index 000000000000..e53bfb063696 --- /dev/null +++ b/keras/integration_test/models/structured_data_classification.py @@ -0,0 +1,100 @@ +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + + +def get_data_spec(batch_size): + return ( + { + "num_cat_feat": InputSpec( + (batch_size,), dtype="int32", range=[0, 5] + ), + "string_cat_feat": InputSpec((batch_size,), dtype="string"), + "num_feat": InputSpec((batch_size,)), + }, + InputSpec((batch_size, 1), dtype="int32", range=[0, 2]), + ) + + +def get_input_preprocessor(): + dataset = tf.data.Dataset.from_tensor_slices( + { + "num_cat_feat": [0, 1, 2, 3, 4, 5], + "string_cat_feat": ["zero", "one", "two", "three", "four", "five"], + "num_feat": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], + } + ).batch(3) + + num_cat_feat = keras.Input(shape=(1,), name="num_cat_feat", dtype="int64") + string_cat_feat = keras.Input( + shape=(1,), name="string_cat_feat", dtype="string" + ) + num_feat = keras.Input(shape=(1,), name="num_feat", dtype="float32") + + all_inputs = [ + num_cat_feat, + string_cat_feat, + num_feat, + ] + + all_features = keras.layers.concatenate( + [ + encode_categorical_feature( + num_cat_feat, "num_cat_feat", dataset, False + ), + encode_categorical_feature( + string_cat_feat, "string_cat_feat", dataset, True + ), + encode_numerical_feature(num_feat, "num_feat", dataset), + ] + ) + preprocessor = keras.Model(all_inputs, all_features) + return preprocessor + + +def encode_numerical_feature(feature, name, dataset): + normalizer = keras.layers.Normalization(mean=[1.0], variance=[2.0]) + encoded_feature = normalizer(feature) + return encoded_feature + + +def encode_categorical_feature(feature, name, dataset, is_string): + lookup_class = ( + keras.layers.StringLookup if is_string else keras.layers.IntegerLookup + ) + lookup = lookup_class(output_mode="binary") + feature_ds = dataset.map(lambda x: x[name]) + feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1)) + lookup.adapt(feature_ds) + encoded_feature = lookup(feature) + return encoded_feature + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + preprocessor = get_input_preprocessor() + if include_preprocessing: + all_inputs = preprocessor.inputs + all_features = preprocessor.outputs[0] + else: + all_inputs = keras.Input(shape=preprocessor.outputs[0].shape) + all_features = all_inputs + x = keras.layers.Dense(32, activation="relu")(all_features) + x = keras.layers.Dropout(0.5)(x) + output = keras.layers.Dense(1, activation="sigmoid")(x) + model = keras.Model(all_inputs, output) + + if compile: + model.compile( + "adam", + "binary_crossentropy", + metrics=["accuracy"], + jit_compile=jit_compile, + 
) + return model + + +def get_custom_objects(): + return {} diff --git a/keras/integration_test/models/text_classification.py b/keras/integration_test/models/text_classification.py new file mode 100644 index 000000000000..6da5a2a741dc --- /dev/null +++ b/keras/integration_test/models/text_classification.py @@ -0,0 +1,91 @@ +"""Text classification model. + +Adapted from https://keras.io/examples/nlp/text_classification_from_scratch/ +""" +import re +import string + +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +MAX_FEATURES = 1000 +EMBEDDING_DIM = 64 +SEQUENCE_LENGTH = 32 + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size,), dtype="string"), + InputSpec((batch_size, 1), dtype="int32", range=[0, 2]), + ) + + +def custom_standardization(input_data): + lowercase = tf.strings.lower(input_data) + stripped_html = tf.strings.regex_replace(lowercase, "<br />
", " ") + return tf.strings.regex_replace( + stripped_html, f"[{re.escape(string.punctuation)}]", "" + ) + + +def get_input_preprocessor(): + input_vectorizer = keras.layers.TextVectorization( + standardize=custom_standardization, + max_tokens=MAX_FEATURES, + output_mode="int", + output_sequence_length=SEQUENCE_LENGTH, + ) + text_ds = tf.data.Dataset.from_tensor_slices( + [ + "Lorem ipsum dolor sit amet", + "consectetur adipiscing elit", + "sed do eiusmod tempor incididunt ut", + "labore et dolore magna aliqua.", + "Ut enim ad minim veniam", + "quis nostrud exercitation ullamco", + "laboris nisi ut aliquip ex ea commodo consequat.", + ] + ) + input_vectorizer.adapt(text_ds) + return input_vectorizer + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + if include_preprocessing: + inputs = keras.Input(shape=(), dtype="string") + x = get_input_preprocessor()(inputs) + else: + inputs = keras.Input(shape=(None,), dtype="int64") + x = inputs + x = keras.layers.Embedding(MAX_FEATURES, EMBEDDING_DIM)(x) + x = keras.layers.Dropout(0.5)(x) + x = keras.layers.Conv1D( + 128, 7, padding="valid", activation="relu", strides=3 + )(x) + x = keras.layers.Conv1D( + 128, 7, padding="valid", activation="relu", strides=3 + )(x) + x = keras.layers.GlobalMaxPooling1D()(x) + x = keras.layers.Dense(128, activation="relu")(x) + x = keras.layers.Dropout(0.5)(x) + predictions = keras.layers.Dense( + 1, activation="sigmoid", name="predictions" + )(x) + model = keras.Model(inputs, predictions) + + if compile: + model.compile( + loss="binary_crossentropy", + optimizer="adam", + metrics=["accuracy"], + jit_compile=jit_compile, + ) + return model + + +def get_custom_objects(): + return {"custom_standardization": custom_standardization} diff --git a/keras/integration_test/models/timeseries_forecasting.py b/keras/integration_test/models/timeseries_forecasting.py new file mode 100644 index 000000000000..7f38f0821372 --- /dev/null +++ b/keras/integration_test/models/timeseries_forecasting.py @@ -0,0 +1,41 @@ +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +TIMESTEPS = 32 + + +def get_data_spec(batch_size): + return ( + InputSpec((batch_size, TIMESTEPS, 1)), + InputSpec((batch_size, 1)), + ) + + +def get_input_preprocessor(): + return None + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + model = keras.Sequential( + [ + keras.layers.LSTM(32, return_sequences=True), + keras.layers.LSTM(32), + keras.layers.Dense(1), + ] + ) + if build: + model.build((None, TIMESTEPS, 1)) + if compile: + model.compile( + optimizer=keras.optimizers.Adam(), + loss="mse", + jit_compile=jit_compile, + ) + return model + + +def get_custom_objects(): + return {} diff --git a/keras/integration_test/models/translation.py b/keras/integration_test/models/translation.py new file mode 100644 index 000000000000..b8488600ba7f --- /dev/null +++ b/keras/integration_test/models/translation.py @@ -0,0 +1,225 @@ +"""Machine translation model. 
+ +Adapted from +https://keras.io/examples/nlp/neural_machine_translation_with_transformer/ +""" +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec + +VOCAB_SIZE = 1500 +SEQUENCE_LENGTH = 20 + + +def get_data_spec(batch_size): + return ( + ( + InputSpec((batch_size,), dtype="string"), + InputSpec((batch_size,), dtype="string"), + ), + InputSpec( + (batch_size, SEQUENCE_LENGTH), dtype="int64", range=[0, VOCAB_SIZE] + ), + ) + + +def get_input_preprocessor(): + encoder_input_vectorizer = keras.layers.TextVectorization( + max_tokens=VOCAB_SIZE, + output_mode="int", + output_sequence_length=SEQUENCE_LENGTH, + ) + decoder_input_vectorizer = keras.layers.TextVectorization( + max_tokens=VOCAB_SIZE, + output_mode="int", + output_sequence_length=SEQUENCE_LENGTH, + ) + text_ds = tf.data.Dataset.from_tensor_slices( + [ + "Lorem ipsum dolor sit amet", + "consectetur adipiscing elit", + "sed do eiusmod tempor incididunt ut", + "labore et dolore magna aliqua.", + "Ut enim ad minim veniam", + "quis nostrud exercitation ullamco", + "laboris nisi ut aliquip ex ea commodo consequat.", + ] + ) + encoder_input_vectorizer.adapt(text_ds) + decoder_input_vectorizer.adapt(text_ds) + return lambda x: ( + encoder_input_vectorizer(x[0]), + decoder_input_vectorizer(x[1]), + ) + + +class TransformerEncoder(keras.layers.Layer): + def __init__(self, embed_dim, dense_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.dense_dim = dense_dim + self.num_heads = num_heads + self.attention = keras.layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.dense_proj = keras.Sequential( + [ + keras.layers.Dense(dense_dim, activation="relu"), + keras.layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = keras.layers.LayerNormalization() + self.layernorm_2 = keras.layers.LayerNormalization() + self.supports_masking = True + + def call(self, inputs, mask=None): + if mask is not None: + padding_mask = tf.cast( + mask[:, tf.newaxis, tf.newaxis, :], dtype="int32" + ) + attention_output = self.attention( + query=inputs, value=inputs, key=inputs, attention_mask=padding_mask + ) + proj_input = self.layernorm_1(inputs + attention_output) + proj_output = self.dense_proj(proj_input) + return self.layernorm_2(proj_input + proj_output) + + +class PositionalEmbedding(keras.layers.Layer): + def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs): + super().__init__(**kwargs) + self.token_embeddings = keras.layers.Embedding( + input_dim=vocab_size, output_dim=embed_dim + ) + self.position_embeddings = keras.layers.Embedding( + input_dim=sequence_length, output_dim=embed_dim + ) + self.sequence_length = sequence_length + self.vocab_size = vocab_size + self.embed_dim = embed_dim + + def call(self, inputs): + length = tf.shape(inputs)[-1] + positions = tf.range(start=0, limit=length, delta=1) + embedded_tokens = self.token_embeddings(inputs) + embedded_positions = self.position_embeddings(positions) + return embedded_tokens + embedded_positions + + def compute_mask(self, inputs, mask=None): + return tf.math.not_equal(inputs, 0) + + +class TransformerDecoder(keras.layers.Layer): + def __init__(self, embed_dim, latent_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.latent_dim = latent_dim + self.num_heads = num_heads + self.attention_1 = keras.layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.attention_2 = 
keras.layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.dense_proj = keras.Sequential( + [ + keras.layers.Dense(latent_dim, activation="relu"), + keras.layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = keras.layers.LayerNormalization() + self.layernorm_2 = keras.layers.LayerNormalization() + self.layernorm_3 = keras.layers.LayerNormalization() + self.supports_masking = True + + def call(self, inputs, encoder_outputs, mask=None): + causal_mask = self.get_causal_attention_mask(inputs) + if mask is not None: + padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32") + padding_mask = tf.minimum(padding_mask, causal_mask) + + attention_output_1 = self.attention_1( + query=inputs, value=inputs, key=inputs, attention_mask=causal_mask + ) + out_1 = self.layernorm_1(inputs + attention_output_1) + + attention_output_2 = self.attention_2( + query=out_1, + value=encoder_outputs, + key=encoder_outputs, + attention_mask=padding_mask, + ) + out_2 = self.layernorm_2(out_1 + attention_output_2) + + proj_output = self.dense_proj(out_2) + return self.layernorm_3(out_2 + proj_output) + + def get_causal_attention_mask(self, inputs): + input_shape = tf.shape(inputs) + batch_size, sequence_length = input_shape[0], input_shape[1] + i = tf.range(sequence_length)[:, tf.newaxis] + j = tf.range(sequence_length) + mask = tf.cast(i >= j, dtype="int32") + mask = tf.reshape(mask, (1, input_shape[1], input_shape[1])) + mult = tf.concat( + [ + tf.expand_dims(batch_size, -1), + tf.constant([1, 1], dtype=tf.int32), + ], + axis=0, + ) + return tf.tile(mask, mult) + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + embed_dim = 256 + latent_dim = 256 + num_heads = 2 + + if include_preprocessing: + encoder_inputs = keras.Input(shape=(), dtype="string") + decoder_inputs = keras.Input(shape=(), dtype="string") + encoder_x, decoder_x = get_input_preprocessor()( + (encoder_inputs, decoder_inputs) + ) + else: + encoder_inputs = keras.Input(shape=(None,), dtype="int64") + decoder_inputs = keras.Input(shape=(None,), dtype="int64") + encoder_x = encoder_inputs + decoder_x = decoder_inputs + + x = PositionalEmbedding(SEQUENCE_LENGTH, VOCAB_SIZE, embed_dim)(encoder_x) + encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x) + + encoded_seq_inputs = keras.Input(shape=(None, embed_dim)) + x = PositionalEmbedding(SEQUENCE_LENGTH, VOCAB_SIZE, embed_dim)(decoder_x) + x = TransformerDecoder(embed_dim, latent_dim, num_heads)( + x, encoded_seq_inputs + ) + x = keras.layers.Dropout(0.5)(x) + decoder_outputs = keras.layers.Dense(VOCAB_SIZE, activation="softmax")(x) + decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs) + + decoder_outputs = decoder([decoder_inputs, encoder_outputs]) + model = keras.Model( + [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer" + ) + if compile: + model.compile( + "rmsprop", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + jit_compile=jit_compile, + ) + return model + + +def get_custom_objects(): + return { + "TransformerEncoder": TransformerEncoder, + "TransformerDecoder": TransformerDecoder, + "PositionalEmbedding": PositionalEmbedding, + } diff --git a/keras/integration_test/models/vae.py b/keras/integration_test/models/vae.py new file mode 100644 index 000000000000..f9f08e1420fb --- /dev/null +++ b/keras/integration_test/models/vae.py @@ -0,0 +1,137 @@ +"""Variational autoencoder. 
+ +Adapted from https://keras.io/examples/generative/vae/ +""" + +import tensorflow as tf +from tensorflow import keras + +from keras.integration_test.models.input_spec import InputSpec +from keras.saving import serialization_lib + +IMG_SIZE = (28, 28) +LATENT_DIM = 64 + + +def get_input_preprocessor(): + return None + + +class Sampling(keras.layers.Layer): + def call(self, inputs): + z_mean, z_log_var = inputs + batch = tf.shape(z_mean)[0] + dim = tf.shape(z_mean)[1] + epsilon = tf.keras.backend.random_normal(shape=(batch, dim)) + return z_mean + tf.exp(0.5 * z_log_var) * epsilon + + +class VAE(keras.Model): + def __init__(self, encoder, decoder, **kwargs): + super(VAE, self).__init__(**kwargs) + self.encoder = encoder + self.decoder = decoder + self.total_loss_tracker = keras.metrics.Mean(name="total_loss") + self.reconstruction_loss_tracker = keras.metrics.Mean( + name="reconstruction_loss" + ) + self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss") + + @property + def metrics(self): + return [ + self.total_loss_tracker, + self.reconstruction_loss_tracker, + self.kl_loss_tracker, + ] + + def train_step(self, data): + with tf.GradientTape() as tape: + z_mean, z_log_var, z = self.encoder(data) + reconstruction = self.decoder(z) + reconstruction_loss = tf.reduce_mean( + tf.reduce_sum( + keras.losses.binary_crossentropy(data, reconstruction), + axis=(1, 2), + ) + ) + kl_loss = -0.5 * ( + 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + ) + kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1)) + total_loss = reconstruction_loss + kl_loss + grads = tape.gradient(total_loss, self.trainable_weights) + self.optimizer.apply_gradients(zip(grads, self.trainable_weights)) + self.total_loss_tracker.update_state(total_loss) + self.reconstruction_loss_tracker.update_state(reconstruction_loss) + self.kl_loss_tracker.update_state(kl_loss) + return { + "loss": self.total_loss_tracker.result(), + "reconstruction_loss": self.reconstruction_loss_tracker.result(), + "kl_loss": self.kl_loss_tracker.result(), + } + + def get_config(self): + base_config = super().get_config() + return { + "encoder": self.encoder, + "decoder": self.decoder, + **base_config, + } + + @classmethod + def from_config(cls, config): + encoder = serialization_lib.deserialize_keras_object( + config.pop("encoder") + ) + decoder = serialization_lib.deserialize_keras_object( + config.pop("decoder") + ) + return cls(encoder, decoder, **config) + + +def get_data_spec(batch_size): + return InputSpec((batch_size,) + IMG_SIZE + (1,)) + + +def get_model( + build=False, compile=False, jit_compile=False, include_preprocessing=True +): + encoder_inputs = keras.Input(shape=IMG_SIZE + (1,)) + x = keras.layers.Conv2D( + 32, 3, activation="relu", strides=2, padding="same" + )(encoder_inputs) + x = keras.layers.Conv2D( + 64, 3, activation="relu", strides=2, padding="same" + )(x) + x = keras.layers.Flatten()(x) + x = keras.layers.Dense(16, activation="relu")(x) + z_mean = keras.layers.Dense(LATENT_DIM, name="z_mean")(x) + z_log_var = keras.layers.Dense(LATENT_DIM, name="z_log_var")(x) + z = Sampling()([z_mean, z_log_var]) + encoder = keras.Model( + encoder_inputs, [z_mean, z_log_var, z], name="encoder" + ) + + latent_inputs = keras.Input(shape=(LATENT_DIM,)) + x = keras.layers.Dense(7 * 7 * 64, activation="relu")(latent_inputs) + x = keras.layers.Reshape((7, 7, 64))(x) + x = keras.layers.Conv2DTranspose( + 64, 3, activation="relu", strides=2, padding="same" + )(x) + x = keras.layers.Conv2DTranspose( + 32, 3, activation="relu", strides=2, 
padding="same" + )(x) + decoder_outputs = keras.layers.Conv2DTranspose( + 1, 3, activation="sigmoid", padding="same" + )(x) + decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder") + + vae = VAE(encoder, decoder) + if compile: + vae.compile(optimizer=keras.optimizers.Adam(), jit_compile=jit_compile) + return vae + + +def get_custom_objects(): + return {"VAE": VAE, "Sampling": Sampling} diff --git a/keras/integration_test/module_test.py b/keras/integration_test/module_test.py index 2fe54431d17e..0454d70999b3 100644 --- a/keras/integration_test/module_test.py +++ b/keras/integration_test/module_test.py @@ -17,44 +17,60 @@ class ModuleTest(tf.test.TestCase): + def test_module_discover_layer_variable(self): + m = tf.Module() + m.a = tf.keras.layers.Dense(1) + m.b = tf.keras.layers.Dense(2) - def test_module_discover_layer_variable(self): - m = tf.Module() - m.a = tf.keras.layers.Dense(1) - m.b = tf.keras.layers.Dense(2) + # The weights of the layer has not been created yet. + self.assertEmpty(m.variables) + self.assertLen(m.submodules, 2) - # The weights of the layer has not been created yet. - self.assertEmpty(m.variables) - self.assertLen(m.submodules, 2) + inputs = tf.keras.layers.Input((1,)) + m.a(inputs) + m.b(inputs) - inputs = tf.keras.layers.Input((1,)) - m.a(inputs) - m.b(inputs) + variable_list = m.variables + self.assertLen(variable_list, 4) + self.assertIs(variable_list[0], m.a.kernel) + self.assertIs(variable_list[1], m.a.bias) + self.assertIs(variable_list[2], m.b.kernel) + self.assertIs(variable_list[3], m.b.bias) - variable_list = m.variables - self.assertLen(variable_list, 4) - self.assertIs(variable_list[0], m.a.kernel) - self.assertIs(variable_list[1], m.a.bias) - self.assertIs(variable_list[2], m.b.kernel) - self.assertIs(variable_list[3], m.b.bias) + def test_model_discover_submodule(self): + m = tf.keras.models.Sequential( + layers=[tf.keras.layers.Dense(1), tf.keras.layers.Dense(2)] + ) - def test_model_discover_submodule(self): - m = tf.keras.models.Sequential( - layers=[tf.keras.layers.Dense(1), tf.keras.layers.Dense(2)]) + self.assertEqual(m.submodules, (m.layers[0], m.layers[1])) + m(tf.keras.layers.Input((1,))) + self.assertLen(m.variables, 4) - self.assertEqual(m.submodules, (m.layers[0], m.layers[1])) - m(tf.keras.layers.Input((1,))) - self.assertLen(m.variables, 4) + def test_model_wrapped_in_module_discovers_submodules(self): + linear = tf.keras.models.Sequential( + [tf.keras.layers.Dense(units=1, input_shape=[1])] + ) + linear.compile(optimizer="sgd", loss="mean_squared_error") + m = tf.Module() + m.l = linear + self.assertNotEmpty(m.submodules) + self.assertLen(m.variables, 2) - def test_model_wrapped_in_module_discovers_submodules(self): - linear = tf.keras.models.Sequential( - [tf.keras.layers.Dense(units=1, input_shape=[1])]) - linear.compile(optimizer="sgd", loss="mean_squared_error") - m = tf.Module() - m.l = linear - self.assertNotEmpty(m.submodules) - self.assertLen(m.variables, 2) + def test_subclass_model(self): + class Model(tf.keras.Model): + def __init__(self): + super().__init__() + self.dense = tf.keras.layers.Dense(units=1) + + def call(self, inputs, training=None, mask=None): + return self.dense(inputs) + + model = Model() + self.assertLen(model.submodules, 1) # For the dense layer + model.compile(loss="mse", optimizer="sgd") + # Make sure the compiled metric doesn't break tf.module + self.assertLen(model.submodules, 1) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git 
a/keras/integration_test/multi_worker_tutorial_test.py b/keras/integration_test/multi_worker_tutorial_test.py index 87ca7a7da8f6..31a605efbf12 100644 --- a/keras/integration_test/multi_worker_tutorial_test.py +++ b/keras/integration_test/multi_worker_tutorial_test.py @@ -20,10 +20,11 @@ import unittest import uuid import zipfile -from absl import logging -from absl.testing import parameterized + import numpy as np import tensorflow.compat.v2 as tf +from absl import logging +from absl.testing import parameterized PER_WORKER_BATCH_SIZE = 64 NUM_WORKERS = 2 @@ -32,319 +33,401 @@ def _is_chief(task_type, task_id): - # Note: there are two possible `TF_CONFIG` configuration. - # 1) In addition to `worker` tasks, a `chief` task type is use; - # in this case, this function should be modified to - # `return task_type == 'chief'`. - # 2) Only `worker` task type is used; in this case, worker 0 is - # regarded as the chief. The implementation demonstrated here - # is for this case. - return task_type == 'worker' and task_id == 0 + # Note: there are two possible `TF_CONFIG` configuration. + # 1) In addition to `worker` tasks, a `chief` task type is use; + # in this case, this function should be modified to + # `return task_type == 'chief'`. + # 2) Only `worker` task type is used; in this case, worker 0 is + # regarded as the chief. The implementation demonstrated here + # is for this case. + return task_type == "worker" and task_id == 0 def _get_temp_dir(dirpath, task_id): - base_dirpath = 'workertemp_' + str(task_id) - temp_dir = os.path.join(dirpath, base_dirpath) - tf.io.gfile.makedirs(temp_dir) - return temp_dir + base_dirpath = "workertemp_" + str(task_id) + temp_dir = os.path.join(dirpath, base_dirpath) + tf.io.gfile.makedirs(temp_dir) + return temp_dir def write_filepath(filepath, task_type, task_id): - dirpath = os.path.dirname(filepath) - base = os.path.basename(filepath) - if not _is_chief(task_type, task_id): - dirpath = _get_temp_dir(dirpath, task_id) - return os.path.join(dirpath, base) + dirpath = os.path.dirname(filepath) + base = os.path.basename(filepath) + if not _is_chief(task_type, task_id): + dirpath = _get_temp_dir(dirpath, task_id) + return os.path.join(dirpath, base) class MultiWorkerTutorialTest(parameterized.TestCase, tf.test.TestCase): - """Test of multi-worker training flow in tutorials on tensorflow.org. - - Please see below test method docs for what actual tutorial is being covered. - """ - - # TODO(rchao): Add a test to demonstrate gather with MWMS. - - @contextlib.contextmanager - def skip_fetch_failure_exception(self): - try: - yield - except zipfile.BadZipfile as e: - # There can be a race when multiple processes are downloading the data. - # Skip the test if that results in loading errors. - self.skipTest('Data loading error: Bad magic number for file header.') - except Exception as e: # pylint: disable=broad-except - if 'URL fetch failure' in str(e): - self.skipTest('URL fetch error not considered failure of the test.') - else: - raise - - def mnist_dataset(self): - path_to_use = 'mnist_{}.npz'.format(str(uuid.uuid4())) - with self.skip_fetch_failure_exception(): - (x_train, - y_train), _ = tf.keras.datasets.mnist.load_data(path=path_to_use) - # The `x` arrays are in uint8 and have values in the range [0, 255]. 
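The `TF_CONFIG` convention that `_is_chief` encodes is easiest to see with a concrete value. A minimal sketch of convention (2), where worker 0 doubles as the chief; the addresses and ports are hypothetical:

import json
import os

# Hypothetical two-worker cluster; worker 0 acts as the chief.
os.environ["TF_CONFIG"] = json.dumps(
    {
        "cluster": {"worker": ["localhost:12345", "localhost:23456"]},
        "task": {"type": "worker", "index": 0},
    }
)

def is_chief(task_type, task_id):
    # Same rule as _is_chief in this file.
    return task_type == "worker" and task_id == 0

assert is_chief("worker", 0)
assert not is_chief("worker", 1)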
- # We need to convert them to float32 with values in the range [0, 1] - x_train = x_train / np.float32(255) - y_train = y_train.astype(np.int64) - train_dataset = tf.data.Dataset.from_tensor_slices( - (x_train, y_train)).shuffle(60000) - return train_dataset - - def dataset_fn(self, global_batch_size, input_context): - batch_size = input_context.get_per_replica_batch_size(global_batch_size) - dataset = self.mnist_dataset() - dataset = dataset.shard(input_context.num_input_pipelines, - input_context.input_pipeline_id) - dataset = dataset.batch(batch_size) - return dataset - - def build_cnn_model(self): - return tf.keras.Sequential([ - tf.keras.layers.Input(shape=(28, 28)), - tf.keras.layers.Reshape(target_shape=(28, 28, 1)), - tf.keras.layers.Conv2D(32, 3, activation='relu'), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(10) - ]) - - def build_and_compile_cnn_model(self): - model = self.build_cnn_model() - model.compile( - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), - metrics=['accuracy']) - return model - - @tf.__internal__.test.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], tf_api_version=2)) - def testSingleWorkerModelFit(self): - single_worker_dataset = self.mnist_dataset().batch( - PER_WORKER_BATCH_SIZE) - single_worker_model = self.build_and_compile_cnn_model() - single_worker_model.fit(single_worker_dataset, epochs=NUM_EPOCHS) - - @tf.__internal__.test.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], tf_api_version=2)) - def testMwmsWithModelFit(self, mode): - """Test multi-worker training flow demo'ed in go/multi-worker-with-keras. - - This test should be kept in sync with the code samples in - go/multi-worker-with-keras. - - Args: - mode: Runtime mode. + """Test of multi-worker training flow in tutorials on tensorflow.org. + + Please see below test method docs for what actual tutorial is being covered. """ - def fn(model_path, checkpoint_dir): - global_batch_size = PER_WORKER_BATCH_SIZE * NUM_WORKERS - strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() - with strategy.scope(): - multi_worker_model = self.build_and_compile_cnn_model() - - callbacks = [ - tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(self.get_temp_dir(), 'checkpoint')) - ] - - multi_worker_dataset = strategy.distribute_datasets_from_function( - lambda input_context: self.dataset_fn(global_batch_size, input_context - )) - - multi_worker_model.fit( - multi_worker_dataset, - epochs=NUM_EPOCHS, - steps_per_epoch=50, - callbacks=callbacks) - - task_type, task_id = (strategy.cluster_resolver.task_type, - strategy.cluster_resolver.task_id) - write_model_path = write_filepath(model_path, task_type, task_id) - - multi_worker_model.save(write_model_path) - if not _is_chief(task_type, task_id): - tf.io.gfile.rmtree(os.path.dirname(write_model_path)) - - # Make sure chief finishes saving before non-chief's assertions. 
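How `write_filepath` steers chief and non-chief saves apart, reduced to pure path logic; a sketch with a hypothetical /tmp/model.tf path and no filesystem side effects:

import os

def temp_dir_for(dirpath, task_id):
    # Mirrors _get_temp_dir above, minus the tf.io.gfile.makedirs call.
    return os.path.join(dirpath, "workertemp_" + str(task_id))

def write_path_for(filepath, task_type, task_id):
    dirpath, base = os.path.split(filepath)
    if not (task_type == "worker" and task_id == 0):  # non-chief workers
        dirpath = temp_dir_for(dirpath, task_id)
    return os.path.join(dirpath, base)

print(write_path_for("/tmp/model.tf", "worker", 0))  # /tmp/model.tf
print(write_path_for("/tmp/model.tf", "worker", 1))  # /tmp/workertemp_1/model.tf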
- tf.__internal__.distribute.multi_process_runner.get_barrier().wait() - - if not tf.io.gfile.exists(model_path): - raise RuntimeError() - if tf.io.gfile.exists(write_model_path) != _is_chief(task_type, task_id): - raise RuntimeError() - - with strategy.scope(): - loaded_model = tf.keras.models.load_model(model_path) - loaded_model.fit(multi_worker_dataset, epochs=1, steps_per_epoch=1) - - checkpoint = tf.train.Checkpoint(model=multi_worker_model) - write_checkpoint_dir = write_filepath(checkpoint_dir, task_type, task_id) - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, directory=write_checkpoint_dir, max_to_keep=1) - - checkpoint_manager.save() - if not _is_chief(task_type, task_id): - tf.io.gfile.rmtree(write_checkpoint_dir) - - # Make sure chief finishes saving before non-chief's assertions. - tf.__internal__.distribute.multi_process_runner.get_barrier().wait() - - if not tf.io.gfile.exists(checkpoint_dir): - raise RuntimeError() - if tf.io.gfile.exists(write_checkpoint_dir) != _is_chief( - task_type, task_id): - raise RuntimeError() - - latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) - checkpoint.restore(latest_checkpoint) - multi_worker_model.fit(multi_worker_dataset, epochs=1, steps_per_epoch=1) - - logging.info('testMwmsWithModelFit successfully ends') - - model_path = os.path.join(self.get_temp_dir(), 'model.tf') - checkpoint_dir = os.path.join(self.get_temp_dir(), 'ckpt') - try: - mpr_result = tf.__internal__.distribute.multi_process_runner.run( - fn, - tf.__internal__.distribute.multi_process_runner.create_cluster_spec( - num_workers=NUM_WORKERS), - args=(model_path, checkpoint_dir), - return_output=True) - except tf.errors.UnavailableError: - self.skipTest('Skipping rare disconnection among the workers.') - - self.assertTrue( - any([ - 'testMwmsWithModelFit successfully ends' in msg - for msg in mpr_result.stdout - ])) - - def extract_accuracy(worker_id, input_string): - match = re.match( - r'\[worker\-{}\].*accuracy: (\d+\.\d+).*'.format(worker_id), - input_string) - return None if match is None else float(match.group(1)) - - for worker_id in range(NUM_WORKERS): - accu_result = tf.nest.map_structure( - lambda x: extract_accuracy(worker_id, x), # pylint: disable=cell-var-from-loop - mpr_result.stdout) - self.assertTrue( - any(accu_result), 'Every worker is supposed to have accuracy result.') - - @tf.__internal__.test.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=['eager'], tf_api_version=2)) - def testMwmsWithCtl(self, mode): - """Test multi-worker CTL training flow demo'ed in a to-be-added tutorial.""" - - def proc_func(checkpoint_dir): - global_batch_size = PER_WORKER_BATCH_SIZE * NUM_WORKERS - strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() - try: - - with strategy.scope(): - multi_worker_model = self.build_cnn_model() - - multi_worker_dataset = strategy.distribute_datasets_from_function( - lambda input_context: self.dataset_fn(global_batch_size, # pylint: disable=g-long-lambda - input_context)) - optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001) - train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( - name='train_accuracy') - - @tf.function - def train_step(iterator): - """Training step function.""" - - def step_fn(inputs): - """Per-Replica step function.""" - x, y = inputs - with tf.GradientTape() as tape: - predictions = multi_worker_model(x, training=True) - per_batch_loss = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, - 
reduction=tf.keras.losses.Reduction.NONE)(y, predictions) - loss = tf.nn.compute_average_loss( - per_batch_loss, global_batch_size=global_batch_size) - - grads = tape.gradient(loss, multi_worker_model.trainable_variables) - optimizer.apply_gradients( - zip(grads, multi_worker_model.trainable_variables)) - train_accuracy.update_state(y, predictions) - - return loss - - per_replica_losses = strategy.run(step_fn, args=(next(iterator),)) - return strategy.reduce( - tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) - - epoch = tf.Variable( - initial_value=tf.constant(0, dtype=tf.dtypes.int64), name='epoch') - step_in_epoch = tf.Variable( - initial_value=tf.constant(0, dtype=tf.dtypes.int64), - name='step_in_epoch') - - task_type, task_id = (strategy.cluster_resolver.task_type, - strategy.cluster_resolver.task_id) - checkpoint = tf.train.Checkpoint( - model=multi_worker_model, epoch=epoch, step_in_epoch=step_in_epoch) - write_checkpoint_dir = write_filepath(checkpoint_dir, task_type, - task_id) - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, directory=write_checkpoint_dir, max_to_keep=1) - - latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) - if latest_checkpoint: - checkpoint.restore(latest_checkpoint) - - while epoch.numpy() < NUM_EPOCHS: - iterator = iter(multi_worker_dataset) - total_loss = 0.0 - num_batches = 0 - - while step_in_epoch.numpy() < NUM_STEPS_PER_EPOCH: - total_loss += train_step(iterator) - num_batches += 1 - step_in_epoch.assign_add(1) - - train_loss = total_loss / num_batches - logging.info('Epoch: %d, accuracy: %f, train_loss: %f.', - epoch.numpy(), train_accuracy.result(), train_loss) - - train_accuracy.reset_state() - - checkpoint_manager.save() - if not _is_chief(task_type, task_id): - tf.io.gfile.rmtree(write_checkpoint_dir) - - epoch.assign_add(1) - step_in_epoch.assign(0) - - except tf.errors.UnavailableError as e: - logging.info('UnavailableError occurred: %r', e) - raise unittest.SkipTest('Skipping test due to UnavailableError') - - logging.info('testMwmsWithCtl successfully ends') - - checkpoint_dir = os.path.join(self.get_temp_dir(), 'ckpt') - - mpr_result = tf.__internal__.distribute.multi_process_runner.run( - proc_func, - tf.__internal__.distribute.multi_process_runner.create_cluster_spec( - num_workers=NUM_WORKERS), - return_output=True, - args=(checkpoint_dir,)) - - self.assertTrue( - any([ - 'testMwmsWithCtl successfully ends' in msg - for msg in mpr_result.stdout - ])) - - -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() + + # TODO(rchao): Add a test to demonstrate gather with MWMS. + + @contextlib.contextmanager + def skip_fetch_failure_exception(self): + try: + yield + except zipfile.BadZipfile: + # There can be a race when multiple processes are downloading the + # data. Skip the test if that results in loading errors. + self.skipTest( + "Data loading error: Bad magic number for file header." + ) + except Exception as e: + if "URL fetch failure" in str(e): + self.skipTest( + "URL fetch error not considered failure of the test." + ) + else: + raise + + def mnist_dataset(self): + path_to_use = f"mnist_{str(uuid.uuid4())}.npz" + with self.skip_fetch_failure_exception(): + (x_train, y_train), _ = tf.keras.datasets.mnist.load_data( + path=path_to_use + ) + # The `x` arrays are in uint8 and have values in the range [0, 255]. 
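The `dataset_fn` below splits input with `Dataset.shard`, which hands pipeline `i` every `num_input_pipelines`-th element starting at offset `i`. The selection rule in isolation:

import tensorflow as tf

ds = tf.data.Dataset.range(8)
print(list(ds.shard(num_shards=2, index=0).as_numpy_iterator()))  # [0, 2, 4, 6]
print(list(ds.shard(num_shards=2, index=1).as_numpy_iterator()))  # [1, 3, 5, 7]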
+ # We need to convert them to float32 with values in the range [0, 1] + x_train = x_train / np.float32(255) + y_train = y_train.astype(np.int64) + train_dataset = tf.data.Dataset.from_tensor_slices( + (x_train, y_train) + ).shuffle(60000) + return train_dataset + + def dataset_fn(self, global_batch_size, input_context): + batch_size = input_context.get_per_replica_batch_size(global_batch_size) + dataset = self.mnist_dataset() + dataset = dataset.shard( + input_context.num_input_pipelines, input_context.input_pipeline_id + ) + dataset = dataset.batch(batch_size) + return dataset + + def build_cnn_model(self): + return tf.keras.Sequential( + [ + tf.keras.layers.Input(shape=(28, 28)), + tf.keras.layers.Reshape(target_shape=(28, 28, 1)), + tf.keras.layers.Conv2D(32, 3, activation="relu"), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dense(10), + ] + ) + + def build_and_compile_cnn_model(self): + model = self.build_cnn_model() + model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + ), + optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), + metrics=["accuracy"], + ) + return model + + @tf.__internal__.test.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], tf_api_version=2 + ) + ) + def testSingleWorkerModelFit(self): + single_worker_dataset = self.mnist_dataset().batch( + PER_WORKER_BATCH_SIZE + ) + single_worker_model = self.build_and_compile_cnn_model() + single_worker_model.fit(single_worker_dataset, epochs=NUM_EPOCHS) + + @tf.__internal__.test.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], tf_api_version=2 + ) + ) + def testMwmsWithModelFit(self, mode): + """Test multi-worker training flow demoed in go/multi-worker-with-keras. + + This test should be kept in sync with the code samples in + go/multi-worker-with-keras. + + Args: + mode: Runtime mode. + """ + + def fn(model_path, checkpoint_dir): + global_batch_size = PER_WORKER_BATCH_SIZE * NUM_WORKERS + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + with strategy.scope(): + multi_worker_model = self.build_and_compile_cnn_model() + + callbacks = [ + tf.keras.callbacks.ModelCheckpoint( + filepath=os.path.join(self.get_temp_dir(), "checkpoint") + ) + ] + + multi_worker_dataset = strategy.distribute_datasets_from_function( + lambda input_context: self.dataset_fn( + global_batch_size, input_context + ) + ) + + multi_worker_model.fit( + multi_worker_dataset, + epochs=NUM_EPOCHS, + steps_per_epoch=50, + callbacks=callbacks, + ) + + task_type, task_id = ( + strategy.cluster_resolver.task_type, + strategy.cluster_resolver.task_id, + ) + write_model_path = write_filepath(model_path, task_type, task_id) + + multi_worker_model.save(write_model_path) + if not _is_chief(task_type, task_id): + tf.io.gfile.rmtree(os.path.dirname(write_model_path)) + + # Make sure chief finishes saving before non-chief's assertions. 
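The checkpointing below is a `Checkpoint`/`CheckpointManager` round trip plus the chief-only directory dance; the round trip itself, as a single-process sketch with a hypothetical directory:

import tensorflow as tf

v = tf.Variable(1.0)
ckpt = tf.train.Checkpoint(v=v)
manager = tf.train.CheckpointManager(
    ckpt, directory="/tmp/ckpt_demo", max_to_keep=1
)
manager.save()

v.assign(42.0)  # diverge, then restore from the latest checkpoint
ckpt.restore(tf.train.latest_checkpoint("/tmp/ckpt_demo"))
print(v.numpy())  # 1.0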
+ tf.__internal__.distribute.multi_process_runner.get_barrier().wait() + + if not tf.io.gfile.exists(model_path): + raise RuntimeError() + if tf.io.gfile.exists(write_model_path) != _is_chief( + task_type, task_id + ): + raise RuntimeError() + + with strategy.scope(): + loaded_model = tf.keras.models.load_model(model_path) + loaded_model.fit(multi_worker_dataset, epochs=1, steps_per_epoch=1) + + checkpoint = tf.train.Checkpoint(model=multi_worker_model) + write_checkpoint_dir = write_filepath( + checkpoint_dir, task_type, task_id + ) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, directory=write_checkpoint_dir, max_to_keep=1 + ) + + checkpoint_manager.save() + if not _is_chief(task_type, task_id): + tf.io.gfile.rmtree(write_checkpoint_dir) + + # Make sure chief finishes saving before non-chief's assertions. + tf.__internal__.distribute.multi_process_runner.get_barrier().wait() + + if not tf.io.gfile.exists(checkpoint_dir): + raise RuntimeError() + if tf.io.gfile.exists(write_checkpoint_dir) != _is_chief( + task_type, task_id + ): + raise RuntimeError() + + latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) + checkpoint.restore(latest_checkpoint) + multi_worker_model.fit( + multi_worker_dataset, epochs=1, steps_per_epoch=1 + ) + + logging.info("testMwmsWithModelFit successfully ends") + + model_path = os.path.join(self.get_temp_dir(), "model.tf") + checkpoint_dir = os.path.join(self.get_temp_dir(), "ckpt") + try: + mpr_result = tf.__internal__.distribute.multi_process_runner.run( + fn, + tf.__internal__.distribute.multi_process_runner.create_cluster_spec( # noqa: E501 + num_workers=NUM_WORKERS + ), + args=(model_path, checkpoint_dir), + return_output=True, + ) + except tf.errors.UnavailableError: + self.skipTest("Skipping rare disconnection among the workers.") + + self.assertTrue( + any( + [ + "testMwmsWithModelFit successfully ends" in msg + for msg in mpr_result.stdout + ] + ) + ) + + def extract_accuracy(worker_id, input_string): + match = re.match( + r"\[worker\-{}\].*accuracy: (\d+\.\d+).*".format(worker_id), + input_string, + ) + return None if match is None else float(match.group(1)) + + for worker_id in range(NUM_WORKERS): + accu_result = tf.nest.map_structure( + lambda x: extract_accuracy(worker_id, x), + mpr_result.stdout, + ) + self.assertTrue( + any(accu_result), + "Every worker is supposed to have accuracy result.", + ) + + @tf.__internal__.test.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], tf_api_version=2 + ) + ) + def testMwmsWithCtl(self, mode): + """Test multi-worker CTL training flow demo'ed in a to-be-added + tutorial.""" + + def proc_func(checkpoint_dir): + global_batch_size = PER_WORKER_BATCH_SIZE * NUM_WORKERS + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + try: + + with strategy.scope(): + multi_worker_model = self.build_cnn_model() + + multi_worker_dataset = ( + strategy.distribute_datasets_from_function( + lambda input_context: self.dataset_fn( + global_batch_size, + input_context, + ) + ) + ) + optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001) + train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + name="train_accuracy" + ) + + @tf.function + def train_step(iterator): + """Training step function.""" + + def step_fn(inputs): + """Per-Replica step function.""" + x, y = inputs + with tf.GradientTape() as tape: + predictions = multi_worker_model(x, training=True) + per_batch_loss = ( + tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, + 
reduction=tf.keras.losses.Reduction.NONE, + )(y, predictions) + ) + loss = tf.nn.compute_average_loss( + per_batch_loss, + global_batch_size=global_batch_size, + ) + + grads = tape.gradient( + loss, multi_worker_model.trainable_variables + ) + optimizer.apply_gradients( + zip(grads, multi_worker_model.trainable_variables) + ) + train_accuracy.update_state(y, predictions) + + return loss + + per_replica_losses = strategy.run( + step_fn, args=(next(iterator),) + ) + return strategy.reduce( + tf.distribute.ReduceOp.SUM, + per_replica_losses, + axis=None, + ) + + epoch = tf.Variable( + initial_value=tf.constant(0, dtype=tf.dtypes.int64), + name="epoch", + ) + step_in_epoch = tf.Variable( + initial_value=tf.constant(0, dtype=tf.dtypes.int64), + name="step_in_epoch", + ) + + task_type, task_id = ( + strategy.cluster_resolver.task_type, + strategy.cluster_resolver.task_id, + ) + checkpoint = tf.train.Checkpoint( + model=multi_worker_model, + epoch=epoch, + step_in_epoch=step_in_epoch, + ) + write_checkpoint_dir = write_filepath( + checkpoint_dir, task_type, task_id + ) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, directory=write_checkpoint_dir, max_to_keep=1 + ) + + latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) + if latest_checkpoint: + checkpoint.restore(latest_checkpoint) + + while epoch.numpy() < NUM_EPOCHS: + iterator = iter(multi_worker_dataset) + total_loss = 0.0 + num_batches = 0 + + while step_in_epoch.numpy() < NUM_STEPS_PER_EPOCH: + total_loss += train_step(iterator) + num_batches += 1 + step_in_epoch.assign_add(1) + + train_loss = total_loss / num_batches + logging.info( + "Epoch: %d, accuracy: %f, train_loss: %f.", + epoch.numpy(), + train_accuracy.result(), + train_loss, + ) + + train_accuracy.reset_state() + + checkpoint_manager.save() + if not _is_chief(task_type, task_id): + tf.io.gfile.rmtree(write_checkpoint_dir) + + epoch.assign_add(1) + step_in_epoch.assign(0) + + except tf.errors.UnavailableError as e: + logging.info("UnavailableError occurred: %r", e) + raise unittest.SkipTest("Skipping test due to UnavailableError") + + logging.info("testMwmsWithCtl successfully ends") + + checkpoint_dir = os.path.join(self.get_temp_dir(), "ckpt") + + mpr_result = tf.__internal__.distribute.multi_process_runner.run( + proc_func, + tf.__internal__.distribute.multi_process_runner.create_cluster_spec( + num_workers=NUM_WORKERS + ), + return_output=True, + args=(checkpoint_dir,), + ) + + self.assertTrue( + any( + [ + "testMwmsWithCtl successfully ends" in msg + for msg in mpr_result.stdout + ] + ) + ) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/integration_test/mwms_multi_process_runner_test.py b/keras/integration_test/mwms_multi_process_runner_test.py index 17f72e3d576c..178b843af8d5 100644 --- a/keras/integration_test/mwms_multi_process_runner_test.py +++ b/keras/integration_test/mwms_multi_process_runner_test.py @@ -18,8 +18,9 @@ from __future__ import print_function import os -from absl import logging + import tensorflow.compat.v2 as tf +from absl import logging NUM_WORKERS = 2 NUM_EPOCHS = 2 @@ -27,51 +28,59 @@ class MwmsMultiProcessRunnerTest(tf.test.TestCase): - """Test to demonstrate Keras training with MultiWorkerMirroredStrategy.""" - - def testMwmsWithModelFit(self): - - def worker_fn(): + """Test to demonstrate Keras training with MultiWorkerMirroredStrategy.""" - def dataset_fn(input_context): - del input_context # User should shard data accordingly. Omitted here. 
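The toy dataset built just below is six random (feature, label) rows batched in twos; on its own it yields three batches:

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform((6, 10)), tf.random.uniform((6, 10)))
).batch(2)
for x, y in ds:
    print(x.shape, y.shape)  # (2, 10) (2, 10), three times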
- return tf.data.Dataset.from_tensor_slices((tf.random.uniform( - (6, 10)), tf.random.uniform((6, 10)))).batch(2) + def testMwmsWithModelFit(self): + def worker_fn(): + def dataset_fn(input_context): + # User should shard data accordingly. Omitted here. + del input_context + return tf.data.Dataset.from_tensor_slices( + (tf.random.uniform((6, 10)), tf.random.uniform((6, 10))) + ).batch(2) - strategy = tf.distribute.MultiWorkerMirroredStrategy() - with strategy.scope(): - model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) - model.compile( - loss=tf.keras.losses.CategoricalCrossentropy(), - optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001), - metrics=['accuracy']) + strategy = tf.distribute.MultiWorkerMirroredStrategy() + with strategy.scope(): + model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + model.compile( + loss=tf.keras.losses.CategoricalCrossentropy(), + optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001), + metrics=["accuracy"], + ) - callbacks = [ - tf.keras.callbacks.ModelCheckpoint( - filepath=os.path.join(self.get_temp_dir(), 'checkpoint')) - ] - dataset = strategy.distribute_datasets_from_function(dataset_fn) - model.fit( - dataset, - epochs=NUM_EPOCHS, - steps_per_epoch=NUM_STEPS_PER_EPOCH, - callbacks=callbacks) + callbacks = [ + tf.keras.callbacks.ModelCheckpoint( + filepath=os.path.join(self.get_temp_dir(), "checkpoint") + ) + ] + dataset = strategy.distribute_datasets_from_function(dataset_fn) + model.fit( + dataset, + epochs=NUM_EPOCHS, + steps_per_epoch=NUM_STEPS_PER_EPOCH, + callbacks=callbacks, + ) - logging.info('testMwmsWithModelFit successfully ends') + logging.info("testMwmsWithModelFit successfully ends") - mpr_result = tf.__internal__.distribute.multi_process_runner.run( - worker_fn, - tf.__internal__.distribute.multi_process_runner.create_cluster_spec( - num_workers=NUM_WORKERS), - return_output=True) + mpr_result = tf.__internal__.distribute.multi_process_runner.run( + worker_fn, + tf.__internal__.distribute.multi_process_runner.create_cluster_spec( + num_workers=NUM_WORKERS + ), + return_output=True, + ) - # Verifying the worker functions ended successfully. - self.assertTrue( - any([ - 'testMwmsWithModelFit successfully ends' in msg - for msg in mpr_result.stdout - ])) + # Verifying the worker functions ended successfully. 
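The multi-process runner hands back the workers' captured logs, so the success check is a substring scan; the same check against hypothetical log lines:

stdout = [
    "[worker-0] testMwmsWithModelFit successfully ends",
    "[worker-1] testMwmsWithModelFit successfully ends",
]
assert any("testMwmsWithModelFit successfully ends" in msg for msg in stdout)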
+ self.assertTrue( + any( + [ + "testMwmsWithModelFit successfully ends" in msg + for msg in mpr_result.stdout + ] + ) + ) -if __name__ == '__main__': - tf.__internal__.distribute.multi_process_runner.test_main() +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/integration_test/parameter_server_custom_training_loop_test.py b/keras/integration_test/parameter_server_custom_training_loop_test.py index f30afc56f535..b35393b5bbad 100644 --- a/keras/integration_test/parameter_server_custom_training_loop_test.py +++ b/keras/integration_test/parameter_server_custom_training_loop_test.py @@ -16,10 +16,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import multiprocessing -from absl import logging + import portpicker import tensorflow.compat.v2 as tf +from absl import logging NUM_EPOCHS = 10 NUM_STEPS = 100 @@ -27,108 +29,129 @@ class ParameterServerCustomTrainingLoopTest(tf.test.TestCase): - """Test to demonstrate custom training loop with ParameterServerStrategy.""" - - def create_in_process_cluster(self, num_workers, num_ps): - """Creates and starts local servers and returns the cluster_resolver.""" - worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] - ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] - - cluster_dict = {} - cluster_dict["worker"] = ["localhost:%s" % port for port in worker_ports] - if num_ps > 0: - cluster_dict["ps"] = ["localhost:%s" % port for port in ps_ports] - - cluster_spec = tf.train.ClusterSpec(cluster_dict) - - # Workers need some inter_ops threads to work properly. - worker_config = tf.compat.v1.ConfigProto() - if multiprocessing.cpu_count() < num_workers + 1: - worker_config.inter_op_parallelism_threads = num_workers + 1 - - for i in range(num_workers): - tf.distribute.Server( - cluster_spec, - job_name="worker", - task_index=i, - config=worker_config, - protocol="grpc") - - for i in range(num_ps): - tf.distribute.Server( - cluster_spec, job_name="ps", task_index=i, protocol="grpc") - - return cluster_spec - - def setUp(self): - super().setUp() - - cluster_spec = self.create_in_process_cluster(num_workers=3, num_ps=2) - cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( - cluster_spec, rpc_layer="grpc") - self.strategy = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver) - self.coordinator = ( - tf.distribute.experimental.coordinator.ClusterCoordinator( - self.strategy)) - - def testCustomTrainingLoop(self): - - coordinator, strategy = self.coordinator, self.strategy - - def per_worker_dataset_fn(): - - def dataset_fn(_): - return tf.data.Dataset.from_tensor_slices((tf.random.uniform( - (6, 10)), tf.random.uniform((6, 10)))).batch(2).repeat() - - return strategy.distribute_datasets_from_function(dataset_fn) - - per_worker_dataset = coordinator.create_per_worker_dataset( - per_worker_dataset_fn) - with strategy.scope(): - model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) - optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001) - train_accuracy = tf.keras.metrics.CategoricalAccuracy( - name="train_accuracy") - - @tf.function - def worker_train_fn(iterator): - - def replica_fn(inputs): - """Training loop function.""" - batch_data, labels = inputs - with tf.GradientTape() as tape: - predictions = model(batch_data, training=True) - loss = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.NONE)(labels, predictions) - 
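With `Reduction.NONE` the loss comes back as one value per example instead of a scalar, which is what lets distributed code control the averaging itself. A standalone check with hand-made tensors (the global batch size of 4 is hypothetical):

import tensorflow as tf

labels = tf.constant([[1.0, 0.0], [0.0, 1.0]])
predictions = tf.constant([[0.9, 0.1], [0.2, 0.8]])
per_example = tf.keras.losses.CategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)(labels, predictions)
print(per_example.numpy())  # roughly [0.105, 0.223], one loss per example
# Averaging over the global batch keeps gradients comparable across
# replica counts.
print(float(tf.nn.compute_average_loss(per_example, global_batch_size=4)))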
gradients = tape.gradient(loss, model.trainable_variables) - - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - train_accuracy.update_state(labels, predictions) - - for _ in tf.range(STEPS_PER_EXECUTION): - strategy.run(replica_fn, args=(next(iterator),)) - - for epoch in range(NUM_EPOCHS): - - distributed_iterator = iter(per_worker_dataset) - - for step in range(0, NUM_STEPS, STEPS_PER_EXECUTION): - coordinator.schedule(worker_train_fn, args=(distributed_iterator,)) - logging.info("Epoch %d, step %d scheduled.", epoch, step) - - logging.info("Now joining at epoch %d.", epoch) - coordinator.join() - logging.info( - "Finished joining at epoch %d. Training accuracy: %f. " - "Total iterations: %d", epoch, train_accuracy.result(), - optimizer.iterations.value()) - - if epoch < NUM_EPOCHS - 1: - train_accuracy.reset_states() + """Test to demonstrate custom training loop with ParameterServerStrategy.""" + + def create_in_process_cluster(self, num_workers, num_ps): + """Creates and starts local servers and returns the cluster_resolver.""" + worker_ports = [ + portpicker.pick_unused_port() for _ in range(num_workers) + ] + ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] + + cluster_dict = {} + cluster_dict["worker"] = [f"localhost:{port}" for port in worker_ports] + if num_ps > 0: + cluster_dict["ps"] = [f"localhost:{port}" for port in ps_ports] + + cluster_spec = tf.train.ClusterSpec(cluster_dict) + + # Workers need some inter_ops threads to work properly. + worker_config = tf.compat.v1.ConfigProto() + if multiprocessing.cpu_count() < num_workers + 1: + worker_config.inter_op_parallelism_threads = num_workers + 1 + + for i in range(num_workers): + tf.distribute.Server( + cluster_spec, + job_name="worker", + task_index=i, + config=worker_config, + protocol="grpc", + ) + + for i in range(num_ps): + tf.distribute.Server( + cluster_spec, job_name="ps", task_index=i, protocol="grpc" + ) + + return cluster_spec + + def setUp(self): + super().setUp() + + cluster_spec = self.create_in_process_cluster(num_workers=3, num_ps=2) + cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( + cluster_spec, rpc_layer="grpc" + ) + self.strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver + ) + self.coordinator = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + self.strategy + ) + ) + + def testCustomTrainingLoop(self): + + coordinator, strategy = self.coordinator, self.strategy + + def per_worker_dataset_fn(): + def dataset_fn(_): + return ( + tf.data.Dataset.from_tensor_slices( + (tf.random.uniform((6, 10)), tf.random.uniform((6, 10))) + ) + .batch(2) + .repeat() + ) + + return strategy.distribute_datasets_from_function(dataset_fn) + + per_worker_dataset = coordinator.create_per_worker_dataset( + per_worker_dataset_fn + ) + with strategy.scope(): + model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001) + train_accuracy = tf.keras.metrics.CategoricalAccuracy( + name="train_accuracy" + ) + + @tf.function + def worker_train_fn(iterator): + def replica_fn(inputs): + """Training loop function.""" + batch_data, labels = inputs + with tf.GradientTape() as tape: + predictions = model(batch_data, training=True) + loss = tf.keras.losses.CategoricalCrossentropy( + reduction=tf.keras.losses.Reduction.NONE + )(labels, predictions) + gradients = tape.gradient(loss, model.trainable_variables) + + optimizer.apply_gradients( + zip(gradients, 
model.trainable_variables) + ) + train_accuracy.update_state(labels, predictions) + + for _ in tf.range(STEPS_PER_EXECUTION): + strategy.run(replica_fn, args=(next(iterator),)) + + for epoch in range(NUM_EPOCHS): + + distributed_iterator = iter(per_worker_dataset) + + for step in range(0, NUM_STEPS, STEPS_PER_EXECUTION): + coordinator.schedule( + worker_train_fn, args=(distributed_iterator,) + ) + logging.info("Epoch %d, step %d scheduled.", epoch, step) + + logging.info("Now joining at epoch %d.", epoch) + coordinator.join() + logging.info( + "Finished joining at epoch %d. Training accuracy: %f. " + "Total iterations: %d", + epoch, + train_accuracy.result(), + optimizer.iterations.value(), + ) + + if epoch < NUM_EPOCHS - 1: + train_accuracy.reset_states() if __name__ == "__main__": - if tf.__internal__.tf2.enabled(): - tf.__internal__.distribute.multi_process_runner.test_main() + if tf.__internal__.tf2.enabled(): + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/integration_test/parameter_server_keras_preprocessing_test.py b/keras/integration_test/parameter_server_keras_preprocessing_test.py index 987115683d48..5dcda78fe120 100644 --- a/keras/integration_test/parameter_server_keras_preprocessing_test.py +++ b/keras/integration_test/parameter_server_keras_preprocessing_test.py @@ -18,309 +18,393 @@ import os import random import tempfile -from absl.testing import parameterized -from keras.testing_infra import test_utils + import numpy as np import portpicker import tensorflow.compat.v2 as tf +from absl.testing import parameterized +from keras.testing_infra import test_utils # These vocabularies usually come from TFT or a Beam pipeline. FEATURE_VOCAB = [ - "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong", - "wonder_woman" + "avenger", + "ironman", + "batman", + "hulk", + "spiderman", + "kingkong", + "wonder_woman", ] LABEL_VOCAB = ["yes", "no"] def create_in_process_cluster(num_workers, num_ps): - """Creates and starts local servers and returns the cluster_resolver.""" + """Creates and starts local servers and returns the cluster_resolver.""" - worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] - ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] + worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] + ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] - cluster_dict = {} - cluster_dict["worker"] = ["localhost:%s" % port for port in worker_ports] - if num_ps > 0: - cluster_dict["ps"] = ["localhost:%s" % port for port in ps_ports] + cluster_dict = {} + cluster_dict["worker"] = [f"localhost:{port}" for port in worker_ports] + if num_ps > 0: + cluster_dict["ps"] = [f"localhost:{port}" for port in ps_ports] - cluster_spec = tf.train.ClusterSpec(cluster_dict) + cluster_spec = tf.train.ClusterSpec(cluster_dict) - # Workers need some inter_ops threads to work properly. - worker_config = tf.compat.v1.ConfigProto() - if multiprocessing.cpu_count() < num_workers + 1: - worker_config.inter_op_parallelism_threads = num_workers + 1 + # Workers need some inter_ops threads to work properly. 
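What `create_in_process_cluster` assembles before starting servers is a plain `ClusterSpec` over locally picked ports; with hypothetical port numbers:

import tensorflow as tf

cluster_spec = tf.train.ClusterSpec(
    {
        "worker": ["localhost:12345", "localhost:12346", "localhost:12347"],
        "ps": ["localhost:22345", "localhost:22346"],
    }
)
print(cluster_spec.num_tasks("worker"), cluster_spec.num_tasks("ps"))  # 3 2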
+ worker_config = tf.compat.v1.ConfigProto() + if multiprocessing.cpu_count() < num_workers + 1: + worker_config.inter_op_parallelism_threads = num_workers + 1 - for i in range(num_workers): - tf.distribute.Server( - cluster_spec, - job_name="worker", - task_index=i, - config=worker_config, - protocol="grpc") + for i in range(num_workers): + tf.distribute.Server( + cluster_spec, + job_name="worker", + task_index=i, + config=worker_config, + protocol="grpc", + ) - for i in range(num_ps): - tf.distribute.Server( - cluster_spec, job_name="ps", task_index=i, protocol="grpc") + for i in range(num_ps): + tf.distribute.Server( + cluster_spec, job_name="ps", task_index=i, protocol="grpc" + ) - return cluster_spec + return cluster_spec @test_utils.run_v2_only class KPLTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super().setUp() - - cluster_spec = create_in_process_cluster(num_workers=3, num_ps=2) - cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( - cluster_spec, rpc_layer="grpc") - self.strategy = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver) - self.coordinator = ( - tf.distribute.experimental.coordinator.ClusterCoordinator( - self.strategy)) - - def define_kpls_for_training(self, use_adapt): - # Define KPLs under strategy's scope. Right now, if they have look up - # tables, they will be created on the client. Their variables will be - # created on PS. Ideally they should be cached on each worker since they - # will not be changed in a training step. - if use_adapt: - feature_lookup_layer = ( - tf.keras.layers.StringLookup( - num_oov_indices=1)) - feature_lookup_layer.adapt(FEATURE_VOCAB) - label_lookup_layer = ( - tf.keras.layers.StringLookup( - num_oov_indices=0, mask_token=None)) - label_lookup_layer.adapt(LABEL_VOCAB) - else: - # Do vocab shuffling. - shuffled_vocab = FEATURE_VOCAB.copy() - random.shuffle(shuffled_vocab) - feature_lookup_layer = ( - tf.keras.layers.StringLookup( - vocabulary=shuffled_vocab, num_oov_indices=1)) - label_lookup_layer = ( - tf.keras.layers.StringLookup( - vocabulary=LABEL_VOCAB, num_oov_indices=0, mask_token=None)) - - raw_feature_input = tf.keras.Input( - shape=(3,), dtype=tf.string, name="feature", ragged=True) - feature_id_input = feature_lookup_layer(raw_feature_input) - - # Model creates variables as well. - feature_ps = tf.keras.Model({"features": raw_feature_input}, - feature_id_input) - - raw_label_input = tf.keras.Input(shape=(1,), dtype=tf.string, name="label") - label_id_input = label_lookup_layer(raw_label_input) - label_ps = tf.keras.Model({"label": raw_label_input}, label_id_input) - - return feature_ps, label_ps - - def define_reverse_lookup_layer(self): - # Only needed for serving. - label_inverse_lookup_layer = ( - tf.keras.layers.StringLookup( + def setUp(self): + super().setUp() + + cluster_spec = create_in_process_cluster(num_workers=3, num_ps=2) + cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( + cluster_spec, rpc_layer="grpc" + ) + self.strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver + ) + self.coordinator = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + self.strategy + ) + ) + + def define_kpls_for_training(self, use_adapt): + # Define KPLs under strategy's scope. Right now, if they have look up + # tables, they will be created on the client. Their variables will be + # created on PS. Ideally they should be cached on each worker since they + # will not be changed in a training step. 
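The two branches below build the same kind of lookup layer in two ways: `adapt` derives the vocabulary from data, while `vocabulary=` fixes it up front. Side by side, with a trimmed vocabulary:

import tensorflow as tf

vocab = ["avenger", "ironman", "batman"]

adapted = tf.keras.layers.StringLookup(num_oov_indices=1)
adapted.adapt(vocab)

fixed = tf.keras.layers.StringLookup(vocabulary=vocab, num_oov_indices=1)

# Unknown strings land in the OOV bucket (index 0) either way.
print(adapted(tf.constant(["ironman", "hulk"])).numpy())
print(fixed(tf.constant(["ironman", "hulk"])).numpy())  # [2 0]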
+ if use_adapt: + feature_lookup_layer = tf.keras.layers.StringLookup( + num_oov_indices=1 + ) + feature_lookup_layer.adapt(FEATURE_VOCAB) + label_lookup_layer = tf.keras.layers.StringLookup( + num_oov_indices=0, mask_token=None + ) + label_lookup_layer.adapt(LABEL_VOCAB) + else: + # Do vocab shuffling. + shuffled_vocab = FEATURE_VOCAB.copy() + random.shuffle(shuffled_vocab) + feature_lookup_layer = tf.keras.layers.StringLookup( + vocabulary=shuffled_vocab, num_oov_indices=1 + ) + label_lookup_layer = tf.keras.layers.StringLookup( + vocabulary=LABEL_VOCAB, num_oov_indices=0, mask_token=None + ) + + raw_feature_input = tf.keras.Input( + shape=(3,), dtype=tf.string, name="feature", ragged=True + ) + feature_id_input = feature_lookup_layer(raw_feature_input) + + # Model creates variables as well. + feature_ps = tf.keras.Model( + {"features": raw_feature_input}, feature_id_input + ) + + raw_label_input = tf.keras.Input( + shape=(1,), dtype=tf.string, name="label" + ) + label_id_input = label_lookup_layer(raw_label_input) + label_ps = tf.keras.Model({"label": raw_label_input}, label_id_input) + + return feature_ps, label_ps + + def define_reverse_lookup_layer(self): + # Only needed for serving. + label_inverse_lookup_layer = tf.keras.layers.StringLookup( num_oov_indices=0, mask_token=None, vocabulary=LABEL_VOCAB, - invert=True)) - return label_inverse_lookup_layer - - @tf.__internal__.distribute.combinations.generate( - tf.__internal__.test.combinations.combine( - mode=["eager"], - use_adapt=[True, False], - # TODO(b/1949359300): `load_under_strategy=True` flakily times out. - load_under_strategy=[False])) - def testTrainAndServe(self, use_adapt, load_under_strategy): - - with self.coordinator.strategy.scope(): - - feature_ps, label_ps = self.define_kpls_for_training(use_adapt) - - def dataset_fn(): - - def feature_and_label_gen(): - while True: - features = random.sample(FEATURE_VOCAB, 3) - label = ["yes"] if "avenger" in features else ["no"] - yield {"features": features, "label": label} - - # The dataset will be created on the coordinator. - raw_dataset = tf.data.Dataset.from_generator( - feature_and_label_gen, - output_signature={ - "features": tf.TensorSpec([3], tf.string), - "label": tf.TensorSpec([1], tf.string) - }).shuffle(100).batch(32) - - train_dataset = raw_dataset.map(lambda x: ( # pylint: disable=g-long-lambda - { - "features": feature_ps(x["features"]) - }, label_ps(x["label"]))) - return train_dataset - - # Create the model. The input needs to be compatible with KPLs. - model_input = tf.keras.Input( - shape=(3,), dtype=tf.int64, name="model_input") - - # input_dim includes a mask token and an oov token. 
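The `+ 2` in the embedding size accounts for the mask id and the OOV id on top of the vocabulary itself; sized any smaller, valid lookup ids would overflow the table:

import tensorflow as tf

vocab_size = 7  # len(FEATURE_VOCAB)
emb = tf.keras.layers.Embedding(input_dim=vocab_size + 2, output_dim=20)
ids = tf.constant([[1, 5, 8]])  # 8 is the highest id once mask and OOV exist
print(emb(ids).shape)  # (1, 3, 20)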
- emb_output = tf.keras.layers.Embedding( - input_dim=len(FEATURE_VOCAB) + 2, output_dim=20)( - model_input) - emb_output = tf.reduce_mean(emb_output, axis=1) - dense_output = tf.keras.layers.Dense( - units=1, activation="sigmoid")( - emb_output) - model = tf.keras.Model({"features": model_input}, dense_output) - - optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1) - accuracy = tf.keras.metrics.Accuracy() - - @tf.function - def worker_fn(iterator): - - def replica_fn(iterator): - batch_data, labels = next(iterator) - with tf.GradientTape() as tape: - pred = model(batch_data, training=True) - loss = tf.nn.compute_average_loss( - tf.keras.losses.BinaryCrossentropy( - reduction=tf.keras.losses.Reduction.NONE)(labels, pred)) - gradients = tape.gradient(loss, model.trainable_variables) - - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - - actual_pred = tf.cast(tf.greater(pred, 0.5), tf.int64) - accuracy.update_state(labels, actual_pred) - - self.coordinator.strategy.run(replica_fn, args=(iterator,)) - - distributed_dataset = self.coordinator.create_per_worker_dataset(dataset_fn) - distributed_iterator = iter(distributed_dataset) - for _ in range(4): - accuracy.reset_state() - for _ in range(7): - self.coordinator.schedule(worker_fn, args=(distributed_iterator,)) - self.coordinator.join() - self.assertGreater(accuracy.result().numpy(), 0.5) - - # Create a saved model. - model.feature_ps = feature_ps - model.label_ps = label_ps - model.label_inverse_lookup_layer = self.define_reverse_lookup_layer() - - def create_serving_signature(model): - - @tf.function - def serve_fn(raw_features): - raw_features = tf.expand_dims(raw_features, axis=0) - transformed_features = model.feature_ps(raw_features) - outputs = model(transformed_features) - outputs = tf.squeeze(outputs, axis=0) - outputs = tf.cast(tf.greater(outputs, 0.5), tf.int64) - decoded_outputs = model.label_inverse_lookup_layer(outputs) - return tf.squeeze(decoded_outputs, axis=0) - - # serving does NOT have batch dimension - return serve_fn.get_concrete_function( - tf.TensorSpec(shape=(3), dtype=tf.string, name="example")) - - serving_fn = create_serving_signature(model) - - saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - model.save(saved_model_dir, signatures={"serving_default": serving_fn}) - - if load_under_strategy: - with self.coordinator.strategy.scope(): - - loaded_serving_fn = tf.keras.models.load_model( - saved_model_dir).signatures["serving_default"] - - outputs = [] - for _ in range(7): - outputs.append( - self.coordinator.schedule( - loaded_serving_fn, - args=(tf.constant(["avenger", "ironman", "avenger"]),))) - self.coordinator.join() - for prediction0 in outputs: - self.assertIn(prediction0._get_values()["output_0"], ("yes", "no")) - else: - loaded_serving_fn = tf.keras.models.load_model( - saved_model_dir).signatures["serving_default"] - - # check the result w/ and w/o avenger. - prediction0 = loaded_serving_fn( - tf.constant(["avenger", "ironman", "avenger"]))["output_0"] - self.assertIn(prediction0, ("yes", "no")) - - prediction1 = loaded_serving_fn( - tf.constant(["ironman", "ironman", "unknown"]))["output_0"] - self.assertIn(prediction1, ("yes", "no")) + invert=True, + ) + return label_inverse_lookup_layer + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + use_adapt=[True, False], + test_training_with_loaded=[True, False], + # TODO(b/1949359300): `load_for_serving_under_strategy=True` flakily + # times out. 
+ load_for_serving_under_strategy=[False],
+ )
+ )
+ def testTrainAndLoadAndServe(
+ self,
+ use_adapt,
+ test_training_with_loaded,
+ load_for_serving_under_strategy,
+ ):
+
+ # test_training_with_loaded=False tests distributed training with a
+ # newly constructed KPL, while test_training_with_loaded=True tests
+ # distributed training with a loaded KPL that was also created under
+ # the strategy scope.
+ #
+ # load_for_serving_under_strategy tests serving with a model that is
+ # loaded either under the distribution strategy or outside of it.
+
+ with self.coordinator.strategy.scope():
+
+ feature_ps, label_ps = self.define_kpls_for_training(use_adapt)
+
+ if test_training_with_loaded:
+ saved_kpl_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+ feature_ps_dir = os.path.join(saved_kpl_dir, "feature")
+ label_ps_dir = os.path.join(saved_kpl_dir, "label")
+
+ feature_ps.save(feature_ps_dir)
+ label_ps.save(label_ps_dir)
+
+ del feature_ps, label_ps
+
+ feature_ps = tf.keras.models.load_model(feature_ps_dir)
+ label_ps = tf.keras.models.load_model(label_ps_dir)
+
+ def dataset_fn():
+ def feature_and_label_gen():
+ while True:
+ features = random.sample(FEATURE_VOCAB, 3)
+ label = ["yes"] if "avenger" in features else ["no"]
+ yield {"features": features, "label": label}
+
+ # The dataset will be created on the coordinator.
+ raw_dataset = (
+ tf.data.Dataset.from_generator(
+ feature_and_label_gen,
+ output_signature={
+ "features": tf.TensorSpec([3], tf.string),
+ "label": tf.TensorSpec([1], tf.string),
+ },
+ )
+ .shuffle(100)
+ .batch(32)
+ )
+
+ train_dataset = raw_dataset.map(
+ lambda x: (
+ {"features": feature_ps(x["features"])},
+ label_ps(x["label"]),
+ )
+ )
+ return train_dataset
+
+ # Create the model. The input needs to be compatible with KPLs.
+ model_input = tf.keras.Input(
+ shape=(3,), dtype=tf.int64, name="model_input"
+ )
+
+ # input_dim includes a mask token and an oov token.
+ emb_output = tf.keras.layers.Embedding(
+ input_dim=len(FEATURE_VOCAB) + 2, output_dim=20
+ )(model_input)
+ emb_output = tf.reduce_mean(emb_output, axis=1)
+ dense_output = tf.keras.layers.Dense(units=1, activation="sigmoid")(
+ emb_output
+ )
+ model = tf.keras.Model({"features": model_input}, dense_output)
+
+ optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
+ accuracy = tf.keras.metrics.Accuracy()
+
+ @tf.function
+ def worker_fn(iterator):
+ def replica_fn(iterator):
+ batch_data, labels = next(iterator)
+ with tf.GradientTape() as tape:
+ pred = model(batch_data, training=True)
+ loss = tf.nn.compute_average_loss(
+ tf.keras.losses.BinaryCrossentropy(
+ reduction=tf.keras.losses.Reduction.NONE
+ )(labels, pred)
+ )
+ gradients = tape.gradient(loss, model.trainable_variables)
+
+ optimizer.apply_gradients(
+ zip(gradients, model.trainable_variables)
+ )
+
+ actual_pred = tf.cast(tf.greater(pred, 0.5), tf.int64)
+ accuracy.update_state(labels, actual_pred)
+
+ self.coordinator.strategy.run(replica_fn, args=(iterator,))
+
+ distributed_dataset = self.coordinator.create_per_worker_dataset(
+ dataset_fn
+ )
+ distributed_iterator = iter(distributed_dataset)
+ for _ in range(4):
+ accuracy.reset_state()
+ for _ in range(7):
+ self.coordinator.schedule(
+ worker_fn, args=(distributed_iterator,)
+ )
+ self.coordinator.join()
+ self.assertGreater(accuracy.result().numpy(), 0.5)
+
+ # Create a saved model.
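+ # Note: attaching the KPL models and the inverse lookup layer as
+ # attributes below makes them tracked objects, so their lookup
+ # tables are exported together with the model in the SavedModel.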
+ model.feature_ps = feature_ps + model.label_ps = label_ps + model.label_inverse_lookup_layer = self.define_reverse_lookup_layer() + + def create_serving_signature(model): + @tf.function + def serve_fn(raw_features): + raw_features = tf.expand_dims(raw_features, axis=0) + transformed_features = model.feature_ps(raw_features) + outputs = model(transformed_features) + outputs = tf.squeeze(outputs, axis=0) + outputs = tf.cast(tf.greater(outputs, 0.5), tf.int64) + decoded_outputs = model.label_inverse_lookup_layer(outputs) + return tf.squeeze(decoded_outputs, axis=0) + + # serving does NOT have batch dimension + return serve_fn.get_concrete_function( + tf.TensorSpec(shape=(3), dtype=tf.string, name="example") + ) + + serving_fn = create_serving_signature(model) + + saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + model.save(saved_model_dir, signatures={"serving_default": serving_fn}) + + if load_for_serving_under_strategy: + with self.coordinator.strategy.scope(): + + loaded_serving_fn = tf.keras.models.load_model( + saved_model_dir + ).signatures["serving_default"] + + outputs = [] + for _ in range(7): + outputs.append( + self.coordinator.schedule( + loaded_serving_fn, + args=(tf.constant(["avenger", "ironman", "avenger"]),), + ) + ) + self.coordinator.join() + for prediction0 in outputs: + self.assertIn( + prediction0._get_values()["output_0"], ("yes", "no") + ) + else: + loaded_serving_fn = tf.keras.models.load_model( + saved_model_dir + ).signatures["serving_default"] + + # check the result w/ and w/o avenger. + prediction0 = loaded_serving_fn( + tf.constant(["avenger", "ironman", "avenger"]) + )["output_0"] + self.assertIn(prediction0, ("yes", "no")) + + prediction1 = loaded_serving_fn( + tf.constant(["ironman", "ironman", "unknown"]) + )["output_0"] + self.assertIn(prediction1, ("yes", "no")) @test_utils.run_v2_only -class KPLCreatedInDatasetsFromFunctionTest(tf.test.TestCase, - parameterized.TestCase): - - def setUp(self): - super().setUp() - - cluster_spec = create_in_process_cluster(num_workers=3, num_ps=2) - cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( - cluster_spec, rpc_layer="grpc") - self.strategy = tf.distribute.experimental.ParameterServerStrategy( - cluster_resolver) - self.coordinator = ( - tf.distribute.experimental.coordinator.ClusterCoordinator( - self.strategy)) - - def testKPLCreatedInDatasetsFromFunction(self): - - filepath = os.path.join(self.get_temp_dir(), "vocab") - with open(filepath, "w") as f: - f.write("\n".join(["earth", "wind", "and", "fire"])) - - def per_worker_dataset_fn(): - - def dataset_fn(input_context): - del input_context - lookup_layer = tf.keras.layers.StringLookup( - num_oov_indices=1, vocabulary=filepath) - x = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - y = np.array([0, 1]) - map_fn = lambda x, y: (lookup_layer(x), y) - return tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(10).repeat().batch(2).map(map_fn) - - return self.coordinator.strategy.distribute_datasets_from_function( - dataset_fn) - - per_worker_distribute_dataset = self.coordinator.create_per_worker_dataset( - per_worker_dataset_fn) - per_worker_iter = iter(per_worker_distribute_dataset) - - @tf.function - def worker_fn(iterator): - - def replica_fn(data): - return data - - return self.coordinator.strategy.run(replica_fn, args=(next(iterator),)) - - result = [] - for _ in range(10): - result.append( - self.coordinator.schedule(worker_fn, args=(per_worker_iter,))) - - self.coordinator.join() 
+class KPLCreatedInDatasetsFromFunctionTest( + tf.test.TestCase, parameterized.TestCase +): + def setUp(self): + super().setUp() + + cluster_spec = create_in_process_cluster(num_workers=3, num_ps=2) + cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( + cluster_spec, rpc_layer="grpc" + ) + self.strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver + ) + self.coordinator = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + self.strategy + ) + ) + + def testKPLCreatedInDatasetsFromFunction(self): + + filepath = os.path.join(self.get_temp_dir(), "vocab") + with open(filepath, "w") as f: + f.write("\n".join(["earth", "wind", "and", "fire"])) + + def per_worker_dataset_fn(): + def dataset_fn(input_context): + del input_context + lookup_layer = tf.keras.layers.StringLookup( + num_oov_indices=1, vocabulary=filepath + ) + x = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + y = np.array([0, 1]) + map_fn = lambda x, y: (lookup_layer(x), y) + return ( + tf.data.Dataset.from_tensor_slices((x, y)) + .shuffle(10) + .repeat() + .batch(2) + .map(map_fn) + ) + + return self.coordinator.strategy.distribute_datasets_from_function( + dataset_fn + ) + + per_worker_distribute_dataset = ( + self.coordinator.create_per_worker_dataset(per_worker_dataset_fn) + ) + per_worker_iter = iter(per_worker_distribute_dataset) + + @tf.function + def worker_fn(iterator): + def replica_fn(data): + return data + + return self.coordinator.strategy.run( + replica_fn, args=(next(iterator),) + ) + + result = [] + for _ in range(10): + result.append( + self.coordinator.schedule(worker_fn, args=(per_worker_iter,)) + ) + + self.coordinator.join() if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/integration_test/parameter_server_training_metric_test.py b/keras/integration_test/parameter_server_training_metric_test.py new file mode 100644 index 000000000000..adae47960738 --- /dev/null +++ b/keras/integration_test/parameter_server_training_metric_test.py @@ -0,0 +1,134 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Tests training metrics with the ParameterServer distribution strategy."""
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+
+from keras import layers as layers_module
+from keras import metrics as metrics_module
+from keras.engine import training as training_module
+from keras.testing_infra import test_combinations
+
+# isort: off
+from tensorflow.python.distribute import (
+ multi_process_runner,
+ multi_worker_test_base,
+)
+
+
+class ParameterServerTrainingMetricTest(test_combinations.TestCase):
+ """Test ParameterServerStrategy with Keras model training."""
+
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ cls.cluster = multi_worker_test_base.create_multi_process_cluster(
+ num_workers=2, num_ps=3, rpc_layer="grpc"
+ )
+ cls.cluster_resolver = cls.cluster.cluster_resolver
+
+ @classmethod
+ def tearDownClass(cls):
+ super().tearDownClass()
+ cls.cluster.stop()
+
+ @test_combinations.run_all_keras_modes(always_skip_v1=True)
+ def test_pss_fit_metric_batch_counter(self):
+ """Verify that metric data is complete during fit when using
+ ParameterServerStrategy.
+ """
+ strategy = tf.distribute.ParameterServerStrategy(
+ self.cluster_resolver,
+ variable_partitioner=None,
+ )
+
+ class BatchCount(metrics_module.Sum):
+ def __init__(self, name="batch_count", dtype=tf.int64):
+ super().__init__(name=name, dtype=dtype)
+
+ def update_state(self, y_true, y_pred, sample_weight=None):
+ return super().update_state(1, sample_weight)
+
+ # Build and compile model within strategy scope.
+ with strategy.scope():
+ inputs = layers_module.Input((1,))
+ outputs = layers_module.Dense(1)(inputs)
+ model = training_module.Model(inputs, outputs)
+ model.compile(
+ loss="mse", metrics=[BatchCount()], steps_per_execution=2
+ )
+
+ BATCH_SIZE = 10
+ x, y = np.ones((400, 1)), np.ones((400, 1))
+ val_x, val_y = np.ones((100, 1)), np.ones((100, 1))
+ train_dataset = tf.data.Dataset.from_tensor_slices((x, y))
+ train_dataset = train_dataset.batch(BATCH_SIZE)
+ val_dataset = tf.data.Dataset.from_tensor_slices((val_x, val_y))
+ val_dataset = val_dataset.batch(BATCH_SIZE)
+ train_batch_count = x.shape[0] // BATCH_SIZE
+ val_batch_count = val_x.shape[0] // BATCH_SIZE
+ # Verify that Model.fit doesn't drop any batches.
+ hist = model.fit(
+ train_dataset,
+ steps_per_epoch=train_batch_count,
+ validation_data=val_dataset,
+ validation_steps=val_batch_count,
+ epochs=5,
+ )
+ # Verify that the min and max values of the batch count metric are
+ # accurate.
+ self.assertEqual(max(hist.history["batch_count"]), train_batch_count)
+ self.assertEqual(min(hist.history["batch_count"]), train_batch_count)
+ self.assertEqual(max(hist.history["val_batch_count"]), val_batch_count)
+ self.assertEqual(min(hist.history["val_batch_count"]), val_batch_count)
+
+ @test_combinations.run_all_keras_modes(always_skip_v1=True)
+ def test_pss_evaluate_metric_batch_counter(self):
+ """Verify that metric data is complete during evaluate when using
+ ParameterServerStrategy.
+ """
+ strategy = tf.distribute.ParameterServerStrategy(
+ self.cluster_resolver,
+ variable_partitioner=None,
+ )
+
+ class BatchCount(metrics_module.Sum):
+ def __init__(self, name="batch_count", dtype=tf.int64):
+ super().__init__(name=name, dtype=dtype)
+
+ def update_state(self, y_true, y_pred, sample_weight=None):
+ return super().update_state(1, sample_weight)
+
+ # Build and compile model within strategy scope.
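+ # Note: under ParameterServerStrategy, variables created inside this
+ # scope are placed on the parameter servers rather than on the
+ # workers.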
+ with strategy.scope():
+ inputs = layers_module.Input((1,))
+ outputs = layers_module.Dense(1)(inputs)
+ model = training_module.Model(inputs, outputs)
+ model.compile(
+ loss="mse", metrics=[BatchCount()], steps_per_execution=2
+ )
+
+ BATCH_SIZE = 10
+ x, y = np.ones((400, 1)), np.ones((400, 1))
+ dataset = tf.data.Dataset.from_tensor_slices((x, y))
+ batch_count = x.shape[0] // BATCH_SIZE
+ # Verify that the Model.evaluate batch counter metric is accurate.
+ eval_results = model.evaluate(dataset, steps=batch_count)
+ self.assertEqual(eval_results[-1], batch_count)
+
+
+if __name__ == "__main__":
+ tf.enable_v2_behavior()
+ multi_process_runner.test_main()
diff --git a/keras/integration_test/preprocessing_applied_in_dataset_creator_test.py b/keras/integration_test/preprocessing_applied_in_dataset_creator_test.py
index 152656fb54c1..3c490a1f5800 100644
--- a/keras/integration_test/preprocessing_applied_in_dataset_creator_test.py
+++ b/keras/integration_test/preprocessing_applied_in_dataset_creator_test.py
@@ -18,6 +18,7 @@
from __future__ import print_function
import tensorflow.compat.v2 as tf
+
from keras.integration_test import preprocessing_test_utils as utils
ds_combinations = tf.__internal__.distribute.combinations
@@ -28,7 +29,7 @@
# to API changes and backward-compatibility is not guaranteed.
STRATEGIES = [
ds_combinations.default_strategy,
- ds_combinations.mirrored_strategy_with_cpu_1_and_2,
+ ds_combinations.mirrored_strategy_with_two_cpus,
ds_combinations.mirrored_strategy_with_two_gpus,
ds_combinations.tpu_strategy,
ds_combinations.cloud_tpu_strategy,
@@ -41,34 +42,43 @@
@ds_combinations.generate(
- test_combinations.combine(strategy=STRATEGIES, mode="eager"))
+ test_combinations.combine(strategy=STRATEGIES, mode="eager")
+)
class PreprocessingAppliedInDatasetCreatorTest(tf.test.TestCase):
- """Demonstrate Keras preprocessing layers applied in tf.data.Dataset.map."""
+ """Demonstrate Keras preprocessing layers applied in tf.data.Dataset.map."""
- def testDistributedModelFit(self, strategy):
- if (not tf.__internal__.tf2.enabled()
- and isinstance(strategy,
- tf.distribute.experimental.ParameterServerStrategy)):
- self.skipTest(
- "Parameter Server strategy with dataset creator need to be run when "
- "eager execution is enabled.")
- with strategy.scope():
- preprocessing_model = utils.make_preprocessing_model(self.get_temp_dir())
- training_model = utils.make_training_model()
- training_model.compile(optimizer="sgd", loss="binary_crossentropy")
+ def testDistributedModelFit(self, strategy):
+ if not tf.__internal__.tf2.enabled() and isinstance(
+ strategy, tf.distribute.experimental.ParameterServerStrategy
+ ):
+ self.skipTest(
+ "Parameter Server strategy with dataset creator needs to be "
+ "run when eager execution is enabled."
+ ) + with strategy.scope(): + preprocessing_model = utils.make_preprocessing_model( + self.get_temp_dir() + ) + training_model = utils.make_training_model() + training_model.compile(optimizer="sgd", loss="binary_crossentropy") - def dataset_fn(input_context): - dataset = utils.make_dataset() - dataset = dataset.shard(input_context.num_input_pipelines, - input_context.input_pipeline_id) - batch_size = input_context.get_per_replica_batch_size( - global_batch_size=utils.BATCH_SIZE) - dataset = dataset.batch(batch_size).repeat().prefetch(2) - return dataset.map(lambda x, y: (preprocessing_model(x), y)) + def dataset_fn(input_context): + dataset = utils.make_dataset() + dataset = dataset.shard( + input_context.num_input_pipelines, + input_context.input_pipeline_id, + ) + batch_size = input_context.get_per_replica_batch_size( + global_batch_size=utils.BATCH_SIZE + ) + dataset = dataset.batch(batch_size).repeat().prefetch(2) + return dataset.map(lambda x, y: (preprocessing_model(x), y)) - dataset_creator = tf.keras.utils.experimental.DatasetCreator(dataset_fn) - training_model.fit(dataset_creator, epochs=2, steps_per_epoch=utils.STEPS) + dataset_creator = tf.keras.utils.experimental.DatasetCreator(dataset_fn) + training_model.fit( + dataset_creator, epochs=2, steps_per_epoch=utils.STEPS + ) if __name__ == "__main__": - multi_process_runner.test_main() + multi_process_runner.test_main() diff --git a/keras/integration_test/preprocessing_applied_in_dataset_test.py b/keras/integration_test/preprocessing_applied_in_dataset_test.py index ec73457f4c58..d54f9fdefaf3 100644 --- a/keras/integration_test/preprocessing_applied_in_dataset_test.py +++ b/keras/integration_test/preprocessing_applied_in_dataset_test.py @@ -18,6 +18,7 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.integration_test import preprocessing_test_utils as utils ds_combinations = tf.__internal__.distribute.combinations @@ -30,7 +31,7 @@ # a DatasetCreator when training on a tf.data.Dataset. 
STRATEGIES = [ ds_combinations.default_strategy, - ds_combinations.mirrored_strategy_with_cpu_1_and_2, + ds_combinations.mirrored_strategy_with_two_cpus, ds_combinations.mirrored_strategy_with_two_gpus, ds_combinations.tpu_strategy, ds_combinations.cloud_tpu_strategy, @@ -41,21 +42,24 @@ @ds_combinations.generate( - test_combinations.combine(strategy=STRATEGIES, mode="eager")) + test_combinations.combine(strategy=STRATEGIES, mode="eager") +) class PreprocessingAppliedInDatasetTest(tf.test.TestCase): - """Demonstrate Keras preprocessing layers applied in tf.data.Dataset.map.""" + """Demonstrate Keras preprocessing layers applied in tf.data.Dataset.map.""" - def testDistributedModelFit(self, strategy): - with strategy.scope(): - preprocessing_model = utils.make_preprocessing_model(self.get_temp_dir()) - training_model = utils.make_training_model() - training_model.compile(optimizer="sgd", loss="binary_crossentropy") + def testDistributedModelFit(self, strategy): + with strategy.scope(): + preprocessing_model = utils.make_preprocessing_model( + self.get_temp_dir() + ) + training_model = utils.make_training_model() + training_model.compile(optimizer="sgd", loss="binary_crossentropy") - dataset = utils.make_dataset() - dataset = dataset.batch(utils.BATCH_SIZE) - dataset = dataset.map(lambda x, y: (preprocessing_model(x), y)) - training_model.fit(dataset, epochs=2) + dataset = utils.make_dataset() + dataset = dataset.batch(utils.BATCH_SIZE) + dataset = dataset.map(lambda x, y: (preprocessing_model(x), y)) + training_model.fit(dataset, epochs=2) if __name__ == "__main__": - multi_process_runner.test_main() + multi_process_runner.test_main() diff --git a/keras/integration_test/preprocessing_applied_in_model_test.py b/keras/integration_test/preprocessing_applied_in_model_test.py index 29f338115c6a..4b1a20706955 100644 --- a/keras/integration_test/preprocessing_applied_in_model_test.py +++ b/keras/integration_test/preprocessing_applied_in_model_test.py @@ -18,6 +18,7 @@ from __future__ import print_function import tensorflow.compat.v2 as tf + from keras.integration_test import preprocessing_test_utils as utils ds_combinations = tf.__internal__.distribute.combinations @@ -28,7 +29,7 @@ # to API changes and backward-compatibility is not guaranteed. STRATEGIES = [ ds_combinations.default_strategy, - ds_combinations.mirrored_strategy_with_cpu_1_and_2, + ds_combinations.mirrored_strategy_with_two_cpus, ds_combinations.mirrored_strategy_with_two_gpus, # TODO(b/183044870) TPU strategies with soft placement do not yet work. # ds_combinations.tpu_strategy, @@ -42,37 +43,44 @@ @ds_combinations.generate( - test_combinations.combine(strategy=STRATEGIES, mode="eager")) + test_combinations.combine(strategy=STRATEGIES, mode="eager") +) class PreprocessingAppliedInModelTest(tf.test.TestCase): - """Demonstrate Keras preprocessing layers applied inside a Model.""" + """Demonstrate Keras preprocessing layers applied inside a Model.""" - def testDistributedModelFit(self, strategy): - if (not tf.__internal__.tf2.enabled() - and isinstance(strategy, - tf.distribute.experimental.ParameterServerStrategy)): - self.skipTest( - "Parameter Server strategy with dataset creator need to be run when " - "eager execution is enabled.") - with strategy.scope(): - preprocessing_model = utils.make_preprocessing_model(self.get_temp_dir()) - training_model = utils.make_training_model() - # Merge the two separate models into a single model for training. 
- inputs = preprocessing_model.inputs
- outputs = training_model(preprocessing_model(inputs))
- merged_model = tf.keras.Model(inputs, outputs)
- merged_model.compile(optimizer="sgd", loss="binary_crossentropy")
+ def testDistributedModelFit(self, strategy):
+ if not tf.__internal__.tf2.enabled() and isinstance(
+ strategy, tf.distribute.experimental.ParameterServerStrategy
+ ):
+ self.skipTest(
+ "Parameter Server strategy with dataset creator needs to be "
+ "run when eager execution is enabled."
+ )
+ with strategy.scope():
+ preprocessing_model = utils.make_preprocessing_model(
+ self.get_temp_dir()
+ )
+ training_model = utils.make_training_model()
+ # Merge the two separate models into a single model for training.
+ inputs = preprocessing_model.inputs
+ outputs = training_model(preprocessing_model(inputs))
+ merged_model = tf.keras.Model(inputs, outputs)
+ merged_model.compile(optimizer="sgd", loss="binary_crossentropy")
- def dataset_fn(input_context):
- dataset = utils.make_dataset()
- dataset = dataset.shard(input_context.num_input_pipelines,
- input_context.input_pipeline_id)
- batch_size = input_context.get_per_replica_batch_size(
- global_batch_size=utils.BATCH_SIZE)
- return dataset.batch(batch_size).repeat().prefetch(2)
+ def dataset_fn(input_context):
+ dataset = utils.make_dataset()
+ dataset = dataset.shard(
+ input_context.num_input_pipelines,
+ input_context.input_pipeline_id,
+ )
+ batch_size = input_context.get_per_replica_batch_size(
+ global_batch_size=utils.BATCH_SIZE
+ )
+ return dataset.batch(batch_size).repeat().prefetch(2)
- dataset_creator = tf.keras.utils.experimental.DatasetCreator(dataset_fn)
- merged_model.fit(dataset_creator, epochs=2, steps_per_epoch=utils.STEPS)
+ dataset_creator = tf.keras.utils.experimental.DatasetCreator(dataset_fn)
+ merged_model.fit(dataset_creator, epochs=2, steps_per_epoch=utils.STEPS)
if __name__ == "__main__":
- multi_process_runner.test_main()
+ multi_process_runner.test_main()
diff --git a/keras/integration_test/preprocessing_test_utils.py b/keras/integration_test/preprocessing_test_utils.py
index ace50be24164..8287dc83a348 100644
--- a/keras/integration_test/preprocessing_test_utils.py
+++ b/keras/integration_test/preprocessing_test_utils.py
@@ -17,6 +17,7 @@
import os
import tensorflow.compat.v2 as tf
+
preprocessing = tf.keras.layers
BATCH_SIZE = 64
@@ -26,85 +27,87 @@
def make_dataset():
- """Make a simple structured dataset.
-
- The dataset contains three feature columns.
- - float_col: an unnormalized numeric column.
- - int_col: an column of integer IDs.
- - string_col: a column of fixed vocabulary terms.
-
- Returns:
- The dataset.
- """
- tf.random.set_seed(197011)
- floats = tf.random.uniform((DS_SIZE, 1), maxval=10, dtype="float32")
- # Generate a 100 unique integer values, but over a wide range to showcase a
- # common use case for IntegerLookup.
- ints = tf.random.uniform((DS_SIZE, 1), maxval=VOCAB_SIZE, dtype="int64")
- ints = ints * 1000
- # Use a fixed vocabulary of strings from 0 to 99, to showcase loading a
- # vocabulary from a file.
- strings = tf.random.uniform((DS_SIZE, 1), maxval=VOCAB_SIZE, dtype="int64")
- strings = tf.strings.as_string(strings)
- features = {"float_col": floats, "int_col": ints, "string_col": strings}
- # Random binary label.
- labels = tf.random.uniform((DS_SIZE, 1), maxval=2, dtype="int64")
- ds = tf.data.Dataset.from_tensor_slices((features, labels))
- return ds
+ """Make a simple structured dataset.
+
+ The dataset contains three feature columns.
+ - float_col: an unnormalized numeric column.
+ - int_col: a column of integer IDs.
+ - string_col: a column of fixed vocabulary terms.
+
+ Returns:
+ The dataset.
+ """
+ tf.random.set_seed(197011)
+ floats = tf.random.uniform((DS_SIZE, 1), maxval=10, dtype="float32")
+ # Generate 100 unique integer values, but over a wide range to showcase
+ # a common use case for IntegerLookup.
+ ints = tf.random.uniform((DS_SIZE, 1), maxval=VOCAB_SIZE, dtype="int64")
+ ints = ints * 1000
+ # Use a fixed vocabulary of strings from 0 to 99, to showcase loading a
+ # vocabulary from a file.
+ strings = tf.random.uniform((DS_SIZE, 1), maxval=VOCAB_SIZE, dtype="int64")
+ strings = tf.strings.as_string(strings)
+ features = {"float_col": floats, "int_col": ints, "string_col": strings}
+ # Random binary label.
+ labels = tf.random.uniform((DS_SIZE, 1), maxval=2, dtype="int64")
+ ds = tf.data.Dataset.from_tensor_slices((features, labels))
+ return ds
def make_preprocessing_model(file_dir):
- """Make a standalone preprocessing model."""
- # The name of our keras.Input should match the column name in the dataset.
- float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
- int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
- string_in = tf.keras.Input(shape=(1,), dtype="string", name="string_col")
-
- # We need to batch a dataset before adapting.
- ds = make_dataset().batch(BATCH_SIZE)
- # Normalize floats by adapting the mean and variance of the input.
- normalization = preprocessing.Normalization()
- normalization.adapt(ds.map(lambda features, labels: features["float_col"]))
- float_out = normalization(float_in)
- # Lookup ints by adapting a vocab of integer IDs.
- int_lookup = preprocessing.IntegerLookup()
- int_lookup.adapt(ds.map(lambda features, labels: features["int_col"]))
- int_out = int_lookup(int_in)
- # Lookup strings from a fixed file based vocabulary.
- string_vocab = list(str(i) for i in range(VOCAB_SIZE))
- vocab_file = os.path.join(file_dir, "vocab_file.txt")
- with open(vocab_file, "w") as f:
- f.write("\n".join(string_vocab))
- string_lookup = preprocessing.StringLookup(vocabulary=vocab_file)
- string_out = string_lookup(string_in)
-
- return tf.keras.Model(
- inputs=(float_in, int_in, string_in),
- outputs=(float_out, int_out, string_out))
+ """Make a standalone preprocessing model."""
+ # The name of our keras.Input should match the column name in the dataset.
+ float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
+ int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
+ string_in = tf.keras.Input(shape=(1,), dtype="string", name="string_col")
+
+ # We need to batch a dataset before adapting.
+ ds = make_dataset().batch(BATCH_SIZE)
+ # Normalize floats by adapting the mean and variance of the input.
+ normalization = preprocessing.Normalization()
+ normalization.adapt(ds.map(lambda features, labels: features["float_col"]))
+ float_out = normalization(float_in)
+ # Look up ints by adapting a vocab of integer IDs.
+ int_lookup = preprocessing.IntegerLookup()
+ int_lookup.adapt(ds.map(lambda features, labels: features["int_col"]))
+ int_out = int_lookup(int_in)
+ # Look up strings from a fixed file-based vocabulary.
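+ # Note: with a vocabulary file, the lookup table is initialized
+ # directly from disk, so no adapt() pass over the data is needed.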
+ string_vocab = list(str(i) for i in range(VOCAB_SIZE))
+ vocab_file = os.path.join(file_dir, "vocab_file.txt")
+ with open(vocab_file, "w") as f:
+ f.write("\n".join(string_vocab))
+ string_lookup = preprocessing.StringLookup(vocabulary=vocab_file)
+ string_out = string_lookup(string_in)
+
+ return tf.keras.Model(
+ inputs=(float_in, int_in, string_in),
+ outputs=(float_out, int_out, string_out),
+ )
def make_training_model():
- """Make a trainable model for the preprocessed inputs."""
- float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
- # After preprocessing, both the string and int column are integer ready for
- # embedding.
- int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
- string_in = tf.keras.Input(shape=(1,), dtype="int64", name="string_col")
-
- # Feed the lookup layers into an embedding.
- int_embedding = tf.keras.layers.Embedding(VOCAB_SIZE + 1, 8, input_length=1)
- int_out = int_embedding(int_in)
- int_out = tf.keras.layers.Flatten()(int_out)
- string_embedding = tf.keras.layers.Embedding(
- VOCAB_SIZE + 1, 8, input_length=1)
- string_out = string_embedding(string_in)
- string_out = tf.keras.layers.Flatten()(string_out)
-
- # Concatenate outputs.
- concatate = tf.keras.layers.Concatenate()
- # Feed our preprocessed inputs into a simple MLP.
- x = concatate((float_in, int_out, string_out))
- x = tf.keras.layers.Dense(32, activation="relu")(x)
- x = tf.keras.layers.Dense(32, activation="relu")(x)
- outputs = tf.keras.layers.Dense(1, activation="softmax")(x)
- return tf.keras.Model(inputs=(float_in, int_in, string_in), outputs=outputs)
+ """Make a trainable model for the preprocessed inputs."""
+ float_in = tf.keras.Input(shape=(1,), dtype="float32", name="float_col")
+ # After preprocessing, both the string and int columns are integers
+ # ready for embedding.
+ int_in = tf.keras.Input(shape=(1,), dtype="int64", name="int_col")
+ string_in = tf.keras.Input(shape=(1,), dtype="int64", name="string_col")
+
+ # Feed the lookup layers into an embedding.
+ int_embedding = tf.keras.layers.Embedding(VOCAB_SIZE + 1, 8, input_length=1)
+ int_out = int_embedding(int_in)
+ int_out = tf.keras.layers.Flatten()(int_out)
+ string_embedding = tf.keras.layers.Embedding(
+ VOCAB_SIZE + 1, 8, input_length=1
+ )
+ string_out = string_embedding(string_in)
+ string_out = tf.keras.layers.Flatten()(string_out)
+
+ # Concatenate outputs.
+ concatate = tf.keras.layers.Concatenate()
+ # Feed our preprocessed inputs into a simple MLP.
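+ # Note: the concatenated vector is the 1 float feature plus the two
+ # flattened 8-dim embeddings, i.e. 17 features per example.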
+ x = concatate((float_in, int_out, string_out)) + x = tf.keras.layers.Dense(32, activation="relu")(x) + x = tf.keras.layers.Dense(32, activation="relu")(x) + outputs = tf.keras.layers.Dense(1, activation="softmax")(x) + return tf.keras.Model(inputs=(float_in, int_in, string_in), outputs=outputs) diff --git a/keras/integration_test/py_metric_test.py b/keras/integration_test/py_metric_test.py new file mode 100644 index 000000000000..f07f019ab120 --- /dev/null +++ b/keras/integration_test/py_metric_test.py @@ -0,0 +1,72 @@ +"""Test Model.fit with a PyMetric.""" + +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras import Sequential +from keras import layers +from keras import losses +from keras import metrics +from keras.testing_infra import test_combinations + + +def get_dataset(num_batches=5, batch_size=2): + x = tf.random.uniform((num_batches * batch_size, 100)) + y = tf.random.uniform((num_batches * batch_size, 2)) + dataset = ( + tf.data.Dataset.from_tensor_slices((x, y)) + .prefetch(batch_size * 2) + .batch(batch_size) + ) + return dataset + + +class CountingPyMetric(metrics.PyMetric): + """A test-only PyMetric which simply counts how many results it's seen.""" + + def update_state(self, y_true, y_pred, sample_weight=None): + self.y_pred.append(y_pred) + + def reset_state(self): + self.y_pred = [] + + def result(self): + return len(self.y_pred) + + +class PyMetricTest(test_combinations.TestCase): + @parameterized.named_parameters(("eager", True), ("graph", False)) + def test_fit(self, run_eagerly): + num_batches = 5 + dataset = get_dataset(num_batches=num_batches) + + counting_metric = CountingPyMetric() + + model = Sequential(layers.Dense(2)) + model.compile( + loss=losses.BinaryCrossentropy(), + metrics=[counting_metric], + run_eagerly=run_eagerly, + ) + model.fit(dataset, epochs=1) + + self.assertEqual(counting_metric.result(), num_batches) + + @parameterized.named_parameters(("eager", True), ("graph", False)) + def test_evaluate(self, run_eagerly): + num_batches = 5 + dataset = get_dataset(num_batches=num_batches) + + model = Sequential(layers.Dense(2)) + model.compile( + loss=losses.BinaryCrossentropy(), + metrics=[CountingPyMetric()], + run_eagerly=run_eagerly, + ) + loss, count = model.evaluate(dataset) + + self.assertEqual(count, num_batches) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/integration_test/saved_model_test.py b/keras/integration_test/saved_model_test.py index 81d1c3dfe183..63cbf28fc846 100644 --- a/keras/integration_test/saved_model_test.py +++ b/keras/integration_test/saved_model_test.py @@ -16,224 +16,236 @@ import os import tempfile -from absl.testing import parameterized - import tensorflow.compat.v2 as tf +from absl.testing import parameterized def cycle(obj, cycles, signatures=None): - to_save = obj - # TODO(vbardiovsky): It would be nice if exported protos reached a fixed - # point w.r.t. saving/restoring, ideally after 2nd saving. - for _ in range(cycles): - path = tempfile.mkdtemp(prefix=tf.compat.v1.test.get_temp_dir()) - # If available, we'll run the save and restore preferring the GPU. This - # just makes sure we aren't throwing errors and have enough - # device("CPU") blocks to satisfy the placer. 
- device = "/device:GPU:0" if tf.test.is_gpu_available() else "/device:CPU:0" - with tf.device(device): - tf.saved_model.save(to_save, path, signatures) - loaded = tf.saved_model.load(path) - to_save = loaded - return loaded + to_save = obj + # TODO(vbardiovsky): It would be nice if exported protos reached a fixed + # point w.r.t. saving/restoring, ideally after 2nd saving. + for _ in range(cycles): + path = tempfile.mkdtemp(prefix=tf.compat.v1.test.get_temp_dir()) + # If available, we'll run the save and restore preferring the GPU. This + # just makes sure we aren't throwing errors and have enough + # device("CPU") blocks to satisfy the placer. + device = ( + "/device:GPU:0" if tf.test.is_gpu_available() else "/device:CPU:0" + ) + with tf.device(device): + tf.saved_model.save(to_save, path, signatures) + loaded = tf.saved_model.load(path) + to_save = loaded + return loaded class _ModelWithOptimizer(tf.train.Checkpoint): + def __init__(self): + self.dense = tf.keras.layers.Dense(1) + self.optimizer = tf.keras.optimizers.Adam(0.01) - def __init__(self): - self.dense = tf.keras.layers.Dense(1) - self.optimizer = tf.keras.optimizers.Adam(0.01) - - @tf.function( - input_signature=(tf.TensorSpec([None, 2], tf.float32), - tf.TensorSpec([None], tf.float32))) - def call(self, x, y): - with tf.GradientTape() as tape: - loss = tf.math.reduce_mean((self.dense(x) - y) ** 2.) - trainable_variables = self.dense.trainable_variables - gradients = tape.gradient(loss, trainable_variables) - self.optimizer.apply_gradients(zip(gradients, trainable_variables)) - return {"loss": loss} + @tf.function( + input_signature=( + tf.TensorSpec([None, 2], tf.float32), + tf.TensorSpec([None], tf.float32), + ) + ) + def call(self, x, y): + with tf.GradientTape() as tape: + loss = tf.math.reduce_mean((self.dense(x) - y) ** 2.0) + trainable_variables = self.dense.trainable_variables + gradients = tape.gradient(loss, trainable_variables) + self.optimizer.apply_gradients(zip(gradients, trainable_variables)) + return {"loss": loss} def _import_and_infer(save_dir, inputs, signature_key="serving_default"): - """Import a SavedModel into a TF 1.x-style graph and run `signature_key`.""" - graph = tf.Graph() - with graph.as_default(), tf.compat.v1.Session() as session: - model = tf.compat.v1.saved_model.load(session, ["serve"], save_dir) - return _run_signature(session, model, inputs, signature_key) + """Import a SavedModel into a TF 1.x-style graph and run `signature_key`.""" + graph = tf.Graph() + with graph.as_default(), tf.compat.v1.Session() as session: + model = tf.compat.v1.saved_model.load(session, ["serve"], save_dir) + return _run_signature(session, model, inputs, signature_key) def _run_signature(session, meta_graph_def, inputs, signature_key): - signature = meta_graph_def.signature_def[signature_key] - assert set(inputs.keys()) == set(signature.inputs.keys()) - feed_dict = {} - for arg_name in inputs.keys(): - input_tensor = session.graph.get_tensor_by_name( - signature.inputs[arg_name].name) - feed_dict[input_tensor] = inputs[arg_name] - output_dict = {} - for output_name, output_tensor_info in signature.outputs.items(): - output_dict[output_name] = session.graph.get_tensor_by_name( - output_tensor_info.name) - return session.run(output_dict, feed_dict=feed_dict) + signature = meta_graph_def.signature_def[signature_key] + assert set(inputs.keys()) == set(signature.inputs.keys()) + feed_dict = {} + for arg_name in inputs.keys(): + input_tensor = session.graph.get_tensor_by_name( + signature.inputs[arg_name].name + ) + 
feed_dict[input_tensor] = inputs[arg_name] + output_dict = {} + for output_name, output_tensor_info in signature.outputs.items(): + output_dict[output_name] = session.graph.get_tensor_by_name( + output_tensor_info.name + ) + return session.run(output_dict, feed_dict=feed_dict) class SaveTest(tf.test.TestCase): - - def test_unbuilt_model_does_not_prevent_saving(self): - root = tf.train.Checkpoint( - model=tf.keras.Sequential([tf.keras.layers.Dense(2)])) - tf.saved_model.save(root, os.path.join(self.get_temp_dir(), "saved_model")) - - def test_optimizer(self): - x = tf.constant([[3., 4.]]) - y = tf.constant([2.]) - model = _ModelWithOptimizer() - first_loss = model.call(x, y) - save_dir = os.path.join(self.get_temp_dir(), "saved_model") - tf.saved_model.save(model, save_dir, model.call) - second_loss = model.call(x, y) - self.assertNotEqual(first_loss, second_loss) - self.assertAllClose( - second_loss, - _import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]})) - - def test_single_method_default_signature(self): - model = _ModelWithOptimizer() - x = tf.constant([[3., 4.]]) - y = tf.constant([2.]) - model.call(x, y) - save_dir = os.path.join(self.get_temp_dir(), "saved_model") - tf.saved_model.save(model, save_dir) - self.assertIn("loss", - _import_and_infer(save_dir, - {"x": [[3., 4.]], "y": [2.]})) + def test_unbuilt_model_does_not_prevent_saving(self): + root = tf.train.Checkpoint( + model=tf.keras.Sequential([tf.keras.layers.Dense(2)]) + ) + tf.saved_model.save( + root, os.path.join(self.get_temp_dir(), "saved_model") + ) + + def test_optimizer(self): + x = tf.constant([[3.0, 4.0]]) + y = tf.constant([2.0]) + model = _ModelWithOptimizer() + first_loss = model.call(x, y) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + tf.saved_model.save(model, save_dir, model.call) + second_loss = model.call(x, y) + self.assertNotEqual(first_loss, second_loss) + self.assertAllClose( + second_loss, + _import_and_infer(save_dir, {"x": [[3.0, 4.0]], "y": [2.0]}), + ) + + def test_single_method_default_signature(self): + model = _ModelWithOptimizer() + x = tf.constant([[3.0, 4.0]]) + y = tf.constant([2.0]) + model.call(x, y) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + tf.saved_model.save(model, save_dir) + self.assertIn( + "loss", _import_and_infer(save_dir, {"x": [[3.0, 4.0]], "y": [2.0]}) + ) @parameterized.named_parameters( dict(testcase_name="ReloadOnce", cycles=1), dict(testcase_name="ReloadTwice", cycles=2), - dict(testcase_name="ReloadThrice", cycles=3)) + dict(testcase_name="ReloadThrice", cycles=3), +) class LoadTest(tf.test.TestCase, parameterized.TestCase): - - def test_optimizer(self, cycles): - - class _HasOptimizer(tf.Module): - - def __init__(self): - super().__init__() - self.layer = tf.keras.layers.Dense(1) - self.optimizer = tf.keras.optimizers.Adam(0.01) - - @tf.function - def __call__(self, x): - return self.layer(x) - - @tf.function - def train(self, x, y): - with tf.GradientTape() as tape: - predicted = self(x) - loss = tf.math.reduce_sum(tf.math.abs(y - predicted)) - train_vars = self.layer.trainable_variables - grads = tape.gradient(loss, train_vars) - self.optimizer.apply_gradients(zip(grads, train_vars)) - - root = _HasOptimizer() - train_input = dict(x=tf.constant([[1.]]), - y=tf.constant([[2.]])) - root.train(**train_input) - imported = cycle(root, cycles) - self.assertAllClose(root.optimizer.learning_rate.numpy(), - imported.optimizer.learning_rate.numpy()) - self.assertAllClose(root(tf.constant([[-0.5]])), - imported(tf.constant([[-0.5]]))) - 
root.train(**train_input) - imported.train(**train_input) - self.assertAllClose(root(tf.constant([[-0.5]])), - imported(tf.constant([[-0.5]]))) - - def test_model_with_custom_function_attached(self, cycles): - root = tf.train.Checkpoint( - model=tf.keras.Sequential([tf.keras.layers.Dense(2)])) - - @tf.function - def _use_sequential(x): - return root.model.call(x) - - root.model.traced_call = _use_sequential - - original = root.model.traced_call(tf.zeros([1, 1])).numpy() - root = cycle(root, cycles) - self.assertAllEqual( - original, - root.model.traced_call(tf.zeros([1, 1])).numpy()) + def test_optimizer(self, cycles): + class _HasOptimizer(tf.Module): + def __init__(self): + super().__init__() + self.layer = tf.keras.layers.Dense(1) + self.optimizer = tf.keras.optimizers.Adam(0.01) + + @tf.function + def __call__(self, x): + return self.layer(x) + + @tf.function + def train(self, x, y): + with tf.GradientTape() as tape: + predicted = self(x) + loss = tf.math.reduce_sum(tf.math.abs(y - predicted)) + train_vars = self.layer.trainable_variables + grads = tape.gradient(loss, train_vars) + self.optimizer.apply_gradients(zip(grads, train_vars)) + + root = _HasOptimizer() + train_input = dict(x=tf.constant([[1.0]]), y=tf.constant([[2.0]])) + root.train(**train_input) + imported = cycle(root, cycles) + self.assertAllClose( + root.optimizer.learning_rate.numpy(), + imported.optimizer.learning_rate.numpy(), + ) + self.assertAllClose( + root(tf.constant([[-0.5]])), imported(tf.constant([[-0.5]])) + ) + root.train(**train_input) + imported.train(**train_input) + self.assertAllClose( + root(tf.constant([[-0.5]])), imported(tf.constant([[-0.5]])) + ) + + def test_model_with_custom_function_attached(self, cycles): + root = tf.train.Checkpoint( + model=tf.keras.Sequential([tf.keras.layers.Dense(2)]) + ) + + @tf.function + def _use_sequential(x): + return root.model.call(x) + + root.model.traced_call = _use_sequential + + original = root.model.traced_call(tf.zeros([1, 1])).numpy() + root = cycle(root, cycles) + self.assertAllEqual( + original, root.model.traced_call(tf.zeros([1, 1])).numpy() + ) @parameterized.named_parameters( dict(testcase_name="ReloadOnce", cycles=1), dict(testcase_name="ReloadTwice", cycles=2), - dict(testcase_name="ReloadThrice", cycles=3)) + dict(testcase_name="ReloadThrice", cycles=3), +) class KerasLoadTest(tf.test.TestCase, parameterized.TestCase): - - def test_dense_features_layer(self, cycles): - columns = [ - tf.feature_column.numeric_column("x"), - tf.feature_column.numeric_column("y") - ] - layer = tf.keras.layers.DenseFeatures(columns) - model = tf.keras.Sequential([layer]) - model_input = {"x": tf.constant([[1.]]), - "y": tf.constant([[2.]])} - self.assertAllClose([[1., 2.]], model.predict(model_input, steps=1)) - loaded = cycle(model, cycles) - output, = loaded._default_save_signature(model_input).values() - self.assertAllClose([[1., 2.]], output) - signature_output, = loaded.signatures["serving_default"]( - **model_input).values() - self.assertAllClose([[1., 2.]], signature_output) - - def test_dense_features_layer_fit(self, cycles): - columns = [tf.feature_column.numeric_column("x")] - model = tf.keras.Sequential( - [tf.keras.layers.DenseFeatures(columns), - tf.keras.layers.Dense(1)]) - model_input = {"x": tf.constant([[1.]])} - model.compile(optimizer="adam", loss="mse", run_eagerly=True) - model.fit(model_input, tf.constant([[3.]])) - loaded = cycle(model, cycles) - loaded._default_save_signature(model_input) - loaded.signatures["serving_default"](**model_input) - - 
def test_multi_output_layer(self, cycles):
-
- inp = tf.keras.Input(name="inp", shape=(None,), dtype=tf.float32)
-
- class _MultiOutput(tf.keras.layers.Layer):
-
- def call(self, x):
- return x + 1., x + 2.
-
- out = _MultiOutput(name="out")(inp) # pylint: disable=not-callable
- model = tf.keras.Model(inp, out)
- loaded = cycle(model, cycles)
- self.assertAllClose(
- dict(out=2., out_1=3.),
- loaded.signatures["serving_default"](tf.constant(1.)))
-
- def test_functional_model_with_conv(self, cycles):
- x = tf.keras.Input(name="x", shape=(None, None, 3), dtype=tf.float32)
- conved = tf.keras.layers.Conv2D(
- filters=3, kernel_size=3, dilation_rate=2)(x)
- model = tf.keras.Model([x], conved)
- model_input = tf.ones((1, 10, 10, 3))
- initial_output = model.predict([model_input])
- model = cycle(model, cycles)
- self.assertAllClose(
- [initial_output],
- list(model.signatures["serving_default"](model_input).values()))
+ def test_dense_features_layer(self, cycles):
+ columns = [
+ tf.feature_column.numeric_column("x"),
+ tf.feature_column.numeric_column("y"),
+ ]
+ layer = tf.keras.layers.DenseFeatures(columns)
+ model = tf.keras.Sequential([layer])
+ model_input = {"x": tf.constant([[1.0]]), "y": tf.constant([[2.0]])}
+ self.assertAllClose([[1.0, 2.0]], model.predict(model_input, steps=1))
+ loaded = cycle(model, cycles)
+ (output,) = loaded._default_save_signature(model_input).values()
+ self.assertAllClose([[1.0, 2.0]], output)
+ (signature_output,) = loaded.signatures["serving_default"](
+ **model_input
+ ).values()
+ self.assertAllClose([[1.0, 2.0]], signature_output)
+
+ def test_dense_features_layer_fit(self, cycles):
+ columns = [tf.feature_column.numeric_column("x")]
+ model = tf.keras.Sequential(
+ [tf.keras.layers.DenseFeatures(columns), tf.keras.layers.Dense(1)]
+ )
+ model_input = {"x": tf.constant([[1.0]])}
+ model.compile(optimizer="adam", loss="mse", run_eagerly=True)
+ model.fit(model_input, tf.constant([[3.0]]))
+ loaded = cycle(model, cycles)
+ loaded._default_save_signature(model_input)
+ loaded.signatures["serving_default"](**model_input)
+
+ def test_multi_output_layer(self, cycles):
+
+ inp = tf.keras.Input(name="inp", shape=(None,), dtype=tf.float32)
+
+ class _MultiOutput(tf.keras.layers.Layer):
+ def call(self, x):
+ return x + 1.0, x + 2.0
+
+ out = _MultiOutput(name="out")(inp)
+ model = tf.keras.Model(inp, out)
+ loaded = cycle(model, cycles)
+ self.assertAllClose(
+ dict(out=2.0, out_1=3.0),
+ loaded.signatures["serving_default"](tf.constant(1.0)),
+ )
+
+ def test_functional_model_with_conv(self, cycles):
+ x = tf.keras.Input(name="x", shape=(None, None, 3), dtype=tf.float32)
+ conved = tf.keras.layers.Conv2D(
+ filters=3, kernel_size=3, dilation_rate=2
+ )(x)
+ model = tf.keras.Model([x], conved)
+ model_input = tf.ones((1, 10, 10, 3))
+ initial_output = model.predict([model_input])
+ model = cycle(model, cycles)
+ self.assertAllClose(
+ [initial_output],
+ list(model.signatures["serving_default"](model_input).values()),
+ )
if __name__ == "__main__":
- if tf.__internal__.tf2.enabled():
- tf.test.main()
+ if tf.__internal__.tf2.enabled():
+ tf.test.main()
diff --git a/keras/integration_test/saving_v3_test.py b/keras/integration_test/saving_v3_test.py
new file mode 100644
index 000000000000..de4906cbabbb
--- /dev/null
+++ b/keras/integration_test/saving_v3_test.py
@@ -0,0 +1,130 @@
+"""Test keras_v3 saving and reloading across a diverse range of models."""
+
+import os
+
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
+
+from keras.integration_test.models
import bert +from keras.integration_test.models import dcgan +from keras.integration_test.models import edge_case_model +from keras.integration_test.models import input_spec +from keras.integration_test.models import low_level_model +from keras.integration_test.models import mini_unet +from keras.integration_test.models import mini_xception +from keras.integration_test.models import retinanet +from keras.integration_test.models import structured_data_classification +from keras.integration_test.models import text_classification +from keras.integration_test.models import timeseries_forecasting +from keras.integration_test.models import vae +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + + +def get_dataset(data_specs, batch_size): + values = tf.nest.map_structure(input_spec.spec_to_value, data_specs) + dataset = ( + tf.data.Dataset.from_tensor_slices(values) + .prefetch(batch_size * 2) + .batch(batch_size) + ) + return dataset + + +@test_utils.run_v2_only +class SavingV3Test(test_combinations.TestCase): + @parameterized.named_parameters( + ("bert", bert), + ("edge_case_model", edge_case_model), + # ("efficientnet_v2", efficientnet_v2), # Too expensive to run on CI + ("low_level_model", low_level_model), + ("mini_unet", mini_unet), + ("mini_xception", mini_xception), + ("retinanet", retinanet), + ("structured_data_classification", structured_data_classification), + ("text_classification", text_classification), + ("timeseries_forecasting", timeseries_forecasting), + ) + def test_saving_v3(self, module): + batch_size = 2 + data_specs = module.get_data_spec(batch_size * 2) + dataset = get_dataset(data_specs, batch_size) + for batch in dataset.take(1): + pass + if isinstance(batch, tuple): + batch = batch[0] + + model = module.get_model( + build=True, + compile=True, + jit_compile=False, + include_preprocessing=True, + ) + model.fit(dataset, epochs=1, steps_per_epoch=1) + temp_filepath = os.path.join( + self.get_temp_dir(), f"{module.__name__}.keras" + ) + model.save(temp_filepath, save_format="keras_v3") + with tf.keras.utils.custom_object_scope(module.get_custom_objects()): + new_model = tf.keras.models.load_model(temp_filepath) + + # Test model weights + self.assertIs(new_model.__class__, model.__class__) + self.assertEqual(len(model.get_weights()), len(new_model.get_weights())) + for w1, w2 in zip(model.get_weights(), new_model.get_weights()): + if w1.dtype == "object": + self.assertEqual(str(w1), str(w2)) + else: + self.assertAllClose(w1, w2, atol=1e-6) + + # Test forward pass + self.assertAllClose(new_model(batch), model(batch), atol=1e-6) + + # Test optimizer state + if hasattr(model, "optimizer"): + self.assertEqual( + len(model.optimizer.variables()), + len(new_model.optimizer.variables()), + ) + for v1, v2 in zip( + model.optimizer.variables(), new_model.optimizer.variables() + ): + self.assertAllClose(v1.numpy(), v2.numpy(), atol=1e-6) + + # Test training still works + new_model.fit(dataset, epochs=1, steps_per_epoch=1) + + @parameterized.named_parameters(("dcgan", dcgan), ("vae", vae)) + def test_saving_v3_no_call(self, module): + batch_size = 2 + data_specs = module.get_data_spec(batch_size * 2) + dataset = get_dataset(data_specs, batch_size) + + model = module.get_model( + build=True, + compile=True, + jit_compile=False, + include_preprocessing=True, + ) + temp_filepath = os.path.join( + self.get_temp_dir(), f"{module.__name__}.keras" + ) + model.save(temp_filepath, save_format="keras_v3") + with 
tf.keras.utils.custom_object_scope(module.get_custom_objects()): + new_model = tf.keras.models.load_model(temp_filepath) + + # Test model weights + self.assertIs(new_model.__class__, model.__class__) + self.assertEqual(len(model.get_weights()), len(new_model.get_weights())) + for w1, w2 in zip(model.get_weights(), new_model.get_weights()): + if w1.dtype == "object": + self.assertEqual(str(w1), str(w2)) + else: + self.assertAllClose(w1, w2, atol=1e-6) + + # Test training still works + new_model.fit(dataset, epochs=1, steps_per_epoch=1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/integration_test/tf_trt_test.py b/keras/integration_test/tf_trt_test.py index ba472b264e1c..93f18013ed9b 100644 --- a/keras/integration_test/tf_trt_test.py +++ b/keras/integration_test/tf_trt_test.py @@ -16,52 +16,56 @@ import os import tempfile -from absl import flags - import tensorflow.compat.v2 as tf import tensorflow_text as tf_text +from absl import flags class ConvertResource(tf.test.TestCase): + def testConvertResource(self): + """Test general resource inputs don't crash the converter.""" + if not tf.test.is_built_with_cuda(): + self.skipTest("test is only applicable with CUDA") - def testConvertResource(self): - """Test general resource inputs don't crash the converter.""" - if not tf.test.is_built_with_cuda(): - self.skipTest('test is only applicable with CUDA') - - class TokenizeLayer(tf.keras.layers.Layer): - - def __init__(self, vocab_file): - super().__init__() - serialized_proto = tf.compat.v1.gfile.GFile(vocab_file, "rb").read() - self.tokenizer = tf_text.SentencepieceTokenizer( - model=serialized_proto, add_bos=True, add_eos=True) + class TokenizeLayer(tf.keras.layers.Layer): + def __init__(self, vocab_file): + super().__init__() + serialized_proto = tf.compat.v1.gfile.GFile( + vocab_file, "rb" + ).read() + self.tokenizer = tf_text.SentencepieceTokenizer( + model=serialized_proto, add_bos=True, add_eos=True + ) - def call(self, inputs): - word_ids = self.tokenizer.tokenize(inputs) - word_ids = word_ids.to_tensor(default_value=1, shape=(None, 192)) - return word_ids + def call(self, inputs): + word_ids = self.tokenizer.tokenize(inputs) + word_ids = word_ids.to_tensor( + default_value=1, shape=(None, 192) + ) + return word_ids - vocab_file = os.path.join( - flags.FLAGS['test_srcdir'].value, - 'org_keras/keras', - 'integration_test/data/sentencepiece.pb') - # vocab_file = tf.compat.v1.test.test_src_dir_path( - # "python/keras/integration_test/data/sentencepiece.pb") - output_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + vocab_file = os.path.join( + flags.FLAGS["test_srcdir"].value, + "org_keras/keras", + "integration_test/data/sentencepiece.pb", + ) + # vocab_file = tf.compat.v1.test.test_src_dir_path( + # "python/keras/integration_test/data/sentencepiece.pb") + output_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - # Create and save a Tokenizer - tokenizer = TokenizeLayer(vocab_file) - inputs = tf.keras.layers.Input(shape=(), dtype=tf.dtypes.string) - tokens = tokenizer(inputs) - model = tf.keras.models.Model(inputs=inputs, outputs=tokens) - model.save(output_dir) + # Create and save a Tokenizer + tokenizer = TokenizeLayer(vocab_file) + inputs = tf.keras.layers.Input(shape=(), dtype=tf.dtypes.string) + tokens = tokenizer(inputs) + model = tf.keras.models.Model(inputs=inputs, outputs=tokens) + model.save(output_dir) - converter = tf.experimental.tensorrt.Converter( - input_saved_model_dir=output_dir, - conversion_params=tf.experimental.tensorrt.ConversionParams()) - 
converter.convert() + converter = tf.experimental.tensorrt.Converter( + input_saved_model_dir=output_dir, + conversion_params=tf.experimental.tensorrt.ConversionParams(), + ) + converter.convert() if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/integration_test/tpu_strategy_test.py b/keras/integration_test/tpu_strategy_test.py index ff52374966c1..de02d1e27463 100644 --- a/keras/integration_test/tpu_strategy_test.py +++ b/keras/integration_test/tpu_strategy_test.py @@ -17,10 +17,13 @@ import random import tempfile +import tensorflow.compat.v2 as tf from absl import flags -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) FLAGS = flags.FLAGS flags.DEFINE_string("tpu", "", "Name of TPU to connect to.") @@ -29,213 +32,258 @@ # These vocabularies usually come from TFT or a Beam pipeline. FEATURE_VOCAB = [ - "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong", - "wonder_woman" + "avenger", + "ironman", + "batman", + "hulk", + "spiderman", + "kingkong", + "wonder_woman", ] LABEL_VOCAB = ["yes", "no"] def get_tpu_cluster_resolver(): - resolver = tf.distribute.cluster_resolver.TPUClusterResolver( - tpu=FLAGS.tpu, - zone=FLAGS.zone, - project=FLAGS.project, - ) - return resolver + resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, + zone=FLAGS.zone, + project=FLAGS.project, + ) + return resolver def get_tpu_strategy(): - resolver = get_tpu_cluster_resolver() - tf.config.experimental_connect_to_cluster(resolver) - tf.tpu.experimental.initialize_tpu_system(resolver) - return tf.distribute.experimental.TPUStrategy(resolver) + resolver = get_tpu_cluster_resolver() + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + return tf.distribute.experimental.TPUStrategy(resolver) class TpuStrategyTest(tf.test.TestCase): - - def define_kpls_for_training(self, use_adapt): - if use_adapt: - feature_lookup_layer = ( - tf.keras.layers.StringLookup( - num_oov_indices=1)) - feature_lookup_layer.adapt(FEATURE_VOCAB) - label_lookup_layer = ( - tf.keras.layers.StringLookup( - num_oov_indices=0, mask_token=None)) - label_lookup_layer.adapt(LABEL_VOCAB) - else: - feature_lookup_layer = ( - tf.keras.layers.StringLookup( - vocabulary=FEATURE_VOCAB, num_oov_indices=1)) - label_lookup_layer = ( - tf.keras.layers.StringLookup( - vocabulary=LABEL_VOCAB, num_oov_indices=0, mask_token=None)) - - raw_feature_input = tf.keras.layers.Input( - shape=(3,), dtype=tf.dtypes.string, name="feature", ragged=True) - feature_id_input = feature_lookup_layer(raw_feature_input) - feature_mapper = tf.keras.Model({"features": raw_feature_input}, - feature_id_input) - - raw_label_input = tf.keras.layers.Input( - shape=(1,), dtype=tf.dtypes.string, name="label") - label_id_input = label_lookup_layer(raw_label_input) - label_mapper = tf.keras.Model({"label": raw_label_input}, label_id_input) - - return feature_mapper, label_mapper - - def define_inverse_lookup_layer(self): - # Only needed for serving. 
- label_inverse_lookup_layer = ( - tf.keras.layers.StringLookup( + def define_kpls_for_training(self, use_adapt): + if use_adapt: + feature_lookup_layer = tf.keras.layers.StringLookup( + num_oov_indices=1 + ) + feature_lookup_layer.adapt(FEATURE_VOCAB) + label_lookup_layer = tf.keras.layers.StringLookup( + num_oov_indices=0, mask_token=None + ) + label_lookup_layer.adapt(LABEL_VOCAB) + else: + feature_lookup_layer = tf.keras.layers.StringLookup( + vocabulary=FEATURE_VOCAB, num_oov_indices=1 + ) + label_lookup_layer = tf.keras.layers.StringLookup( + vocabulary=LABEL_VOCAB, num_oov_indices=0, mask_token=None + ) + + raw_feature_input = tf.keras.layers.Input( + shape=(3,), dtype=tf.dtypes.string, name="feature", ragged=True + ) + feature_id_input = feature_lookup_layer(raw_feature_input) + feature_mapper = tf.keras.Model( + {"features": raw_feature_input}, feature_id_input + ) + + raw_label_input = tf.keras.layers.Input( + shape=(1,), dtype=tf.dtypes.string, name="label" + ) + label_id_input = label_lookup_layer(raw_label_input) + label_mapper = tf.keras.Model( + {"label": raw_label_input}, label_id_input + ) + + return feature_mapper, label_mapper + + def define_inverse_lookup_layer(self): + # Only needed for serving. + label_inverse_lookup_layer = tf.keras.layers.StringLookup( num_oov_indices=0, mask_token=None, vocabulary=LABEL_VOCAB, - invert=True)) - return label_inverse_lookup_layer - - def test_keras_metric_outside_strategy_scope_per_replica(self): - if not tf.compat.v1.executing_eagerly(): - self.skipTest("connect_to_cluster() can only be called in eager mode") - strategy = get_tpu_strategy() - metric = tf.keras.metrics.Mean("test_metric", dtype=tf.float32) - - dataset = tf.data.Dataset.range(strategy.num_replicas_in_sync * 2).batch(2) - dataset = strategy.experimental_distribute_dataset(dataset) - - @tf.function - def step_fn(i): - metric.update_state(i) - - with self.assertRaisesRegex( - ValueError, "Trying to run metric.update_state " - "in replica context"): - with strategy.scope(): - for i in dataset: - strategy.run(step_fn, args=(i,)) - - @tf_test_utils.disable_mlir_bridge( - "TODO(b/168036682): Support dynamic padder") - def test_train_and_serve(self): - if not tf.compat.v1.executing_eagerly(): - self.skipTest("connect_to_cluster() can only be called in eager mode") - strategy = get_tpu_strategy() - use_adapt = False - - with strategy.scope(): - feature_mapper, label_mapper = self.define_kpls_for_training(use_adapt) - - def dataset_fn(_): - - def feature_and_label_gen(): - # Generator of dataset. - while True: - features = random.sample(FEATURE_VOCAB, 3) - label = ["yes"] if "avenger" in features else ["no"] - yield {"features": features, "label": label} - - raw_dataset = tf.data.Dataset.from_generator( - feature_and_label_gen, - output_signature={ - "features": tf.TensorSpec([3], tf.dtypes.string), - "label": tf.TensorSpec([1], tf.dtypes.string) - }).shuffle(100).batch(32) - - train_dataset = raw_dataset.map(lambda x: ( # pylint: disable=g-long-lambda - { - "features": feature_mapper(x["features"]) - }, label_mapper(x["label"]))) - return train_dataset - - # Create the model. The input needs to be compatible with KPLs. - model_input = tf.keras.layers.Input( - shape=(3,), dtype=tf.dtypes.int64, name="model_input") - - # input_dim includes a mask token and an oov token. 
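# Why input_dim is len(FEATURE_VOCAB) + 2 below: the lookup can emit one id
# per vocabulary word plus reserved ids for a mask token and an OOV bucket,
# so the Embedding table must have a row for every possible id. A quick
# sanity check (sketch; uses names from this test and assumes a TF 2.x
# StringLookup with a vocabulary_size() method):
lookup = tf.keras.layers.StringLookup(
    vocabulary=FEATURE_VOCAB, num_oov_indices=1
)
assert lookup.vocabulary_size() <= len(FEATURE_VOCAB) + 2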
- emb_output = tf.keras.layers.Embedding( - input_dim=len(FEATURE_VOCAB) + 2, output_dim=20)( - model_input) - emb_output = tf.math.reduce_mean(emb_output, axis=1) - dense_output = tf.keras.layers.Dense( - units=1, activation="sigmoid")( - emb_output) - model = tf.keras.Model({"features": model_input}, dense_output) - - optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1) - accuracy = tf.keras.metrics.Accuracy() - - @tf.function - def train_step(iterator): - """The step function for one training step.""" - - def step_fn(inputs): - """The computation to run on each TPU device.""" - features, labels = inputs - with tf.GradientTape() as tape: - pred = model(features, training=True) - loss = tf.keras.losses.binary_crossentropy(labels, pred) - loss = tf.nn.compute_average_loss(loss) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(list(zip(grads, model.trainable_variables))) - - actual_pred = tf.cast(tf.math.greater(pred, 0.5), tf.dtypes.int64) - accuracy.update_state(labels, actual_pred) - - strategy.run(step_fn, args=(next(iterator),)) - - distributed_dataset = strategy.distribute_datasets_from_function( - dataset_fn) - distributed_iterator = iter(distributed_dataset) - num_epochs = 4 - num_steps = 7 - for _ in range(num_epochs): - accuracy.reset_state() - for _ in range(num_steps): - train_step(distributed_iterator) - - self.assertGreater(accuracy.result().numpy(), 0.5) - self.assertEqual(optimizer.iterations.numpy(), num_epochs * num_steps) - - # Create a saved model. - model.feature_mapper = feature_mapper - model.label_mapper = label_mapper - model.label_inverse_lookup_layer = self.define_inverse_lookup_layer() - - def create_serving_signature(model): + invert=True, + ) + return label_inverse_lookup_layer + + def test_keras_metric_outside_strategy_scope_per_replica(self): + if not tf.compat.v1.executing_eagerly(): + self.skipTest( + "connect_to_cluster() can only be called in eager mode" + ) + strategy = get_tpu_strategy() + metric = tf.keras.metrics.Mean("test_metric", dtype=tf.float32) + + dataset = tf.data.Dataset.range( + strategy.num_replicas_in_sync * 2 + ).batch(2) + dataset = strategy.experimental_distribute_dataset(dataset) @tf.function - def serve_fn(raw_features): - raw_features = tf.expand_dims(raw_features, axis=0) - transformed_features = model.feature_mapper(raw_features) - outputs = model(transformed_features) - outputs = tf.squeeze(outputs, axis=0) - outputs = tf.cast(tf.math.greater(outputs, 0.5), tf.dtypes.int64) - decoded_outputs = model.label_inverse_lookup_layer(outputs) - return tf.squeeze(decoded_outputs, axis=0) - - # Serving does NOT have batch dimension - return serve_fn.get_concrete_function( - tf.TensorSpec(shape=(3), dtype=tf.dtypes.string, name="example")) - - serving_fn = create_serving_signature(model) - - saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - model.save(saved_model_dir, save_format="tf", - signatures={"serving_default": serving_fn}) - - # Test the saved_model. - loaded_serving_fn = tf.keras.models.load_model( - saved_model_dir).signatures["serving_default"] - - # Check model calling with serving signature. 
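# SavedModel signatures are called with tensors and return a dict keyed by
# output name; a single unnamed output defaults to the key "output_0", which
# is why the prediction checks here index the result that way. Sketch using
# names from this test (assumes TF 2.x):
loaded = tf.keras.models.load_model(saved_model_dir)
serving_fn = loaded.signatures["serving_default"]
pred = serving_fn(tf.constant(["avenger", "ironman", "avenger"]))["output_0"]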
- prediction1 = loaded_serving_fn( - tf.constant(["avenger", "ironman", "avenger"]))["output_0"] - self.assertIn(prediction1, ("yes", "no")) - - prediction2 = loaded_serving_fn( - tf.constant(["ironman", "ironman", "unknown"]))["output_0"] - self.assertIn(prediction2, ("yes", "no")) + def step_fn(i): + metric.update_state(i) + + with self.assertRaisesRegex( + ValueError, + "Trying to run metric.update_state in replica context", + ): + with strategy.scope(): + for i in dataset: + strategy.run(step_fn, args=(i,)) + + @tf_test_utils.disable_mlir_bridge( + "TODO(b/168036682): Support dynamic padder" + ) + def test_train_and_serve(self): + if not tf.compat.v1.executing_eagerly(): + self.skipTest( + "connect_to_cluster() can only be called in eager mode" + ) + strategy = get_tpu_strategy() + use_adapt = False + + with strategy.scope(): + feature_mapper, label_mapper = self.define_kpls_for_training( + use_adapt + ) + + def dataset_fn(_): + def feature_and_label_gen(): + # Generator of dataset. + while True: + features = random.sample(FEATURE_VOCAB, 3) + label = ["yes"] if "avenger" in features else ["no"] + yield {"features": features, "label": label} + + raw_dataset = ( + tf.data.Dataset.from_generator( + feature_and_label_gen, + output_signature={ + "features": tf.TensorSpec([3], tf.dtypes.string), + "label": tf.TensorSpec([1], tf.dtypes.string), + }, + ) + .shuffle(100) + .batch(32) + ) + + train_dataset = raw_dataset.map( + lambda x: ( + {"features": feature_mapper(x["features"])}, + label_mapper(x["label"]), + ) + ) + return train_dataset + + # Create the model. The input needs to be compatible with KPLs. + model_input = tf.keras.layers.Input( + shape=(3,), dtype=tf.dtypes.int64, name="model_input" + ) + + # input_dim includes a mask token and an oov token. + emb_output = tf.keras.layers.Embedding( + input_dim=len(FEATURE_VOCAB) + 2, output_dim=20 + )(model_input) + emb_output = tf.math.reduce_mean(emb_output, axis=1) + dense_output = tf.keras.layers.Dense(units=1, activation="sigmoid")( + emb_output + ) + model = tf.keras.Model({"features": model_input}, dense_output) + + optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1) + accuracy = tf.keras.metrics.Accuracy() + + @tf.function + def train_step(iterator): + """The step function for one training step.""" + + def step_fn(inputs): + """The computation to run on each TPU device.""" + features, labels = inputs + with tf.GradientTape() as tape: + pred = model(features, training=True) + loss = tf.keras.losses.binary_crossentropy(labels, pred) + loss = tf.nn.compute_average_loss(loss) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + list(zip(grads, model.trainable_variables)) + ) + + actual_pred = tf.cast( + tf.math.greater(pred, 0.5), tf.dtypes.int64 + ) + accuracy.update_state(labels, actual_pred) + + strategy.run(step_fn, args=(next(iterator),)) + + distributed_dataset = strategy.distribute_datasets_from_function( + dataset_fn + ) + distributed_iterator = iter(distributed_dataset) + num_epochs = 4 + num_steps = 7 + for _ in range(num_epochs): + accuracy.reset_state() + for _ in range(num_steps): + train_step(distributed_iterator) + + self.assertGreater(accuracy.result().numpy(), 0.5) + self.assertEqual( + optimizer.iterations.numpy(), num_epochs * num_steps + ) + + # Create a saved model. 
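# An exported signature must be a ConcreteFunction, i.e. a tf.function traced
# for one specific input signature; passing a TensorSpec with no batch
# dimension is what makes the serving entry point below per-example. Minimal
# sketch of the mechanism (hypothetical function body, assumes TF 2.x):
@tf.function
def serve(x):
    return tf.strings.length(x)

concrete = serve.get_concrete_function(
    tf.TensorSpec(shape=(3,), dtype=tf.string, name="example")
)
# model.save(path, save_format="tf", signatures={"serving_default": concrete})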
+ model.feature_mapper = feature_mapper + model.label_mapper = label_mapper + model.label_inverse_lookup_layer = ( + self.define_inverse_lookup_layer() + ) + + def create_serving_signature(model): + @tf.function + def serve_fn(raw_features): + raw_features = tf.expand_dims(raw_features, axis=0) + transformed_features = model.feature_mapper(raw_features) + outputs = model(transformed_features) + outputs = tf.squeeze(outputs, axis=0) + outputs = tf.cast( + tf.math.greater(outputs, 0.5), tf.dtypes.int64 + ) + decoded_outputs = model.label_inverse_lookup_layer(outputs) + return tf.squeeze(decoded_outputs, axis=0) + + # Serving does NOT have batch dimension + return serve_fn.get_concrete_function( + tf.TensorSpec( + shape=(3), dtype=tf.dtypes.string, name="example" + ) + ) + + serving_fn = create_serving_signature(model) + + saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + model.save( + saved_model_dir, + save_format="tf", + signatures={"serving_default": serving_fn}, + ) + + # Test the saved_model. + loaded_serving_fn = tf.keras.models.load_model( + saved_model_dir + ).signatures["serving_default"] + + # Check model calling with serving signature. + prediction1 = loaded_serving_fn( + tf.constant(["avenger", "ironman", "avenger"]) + )["output_0"] + self.assertIn(prediction1, ("yes", "no")) + + prediction2 = loaded_serving_fn( + tf.constant(["ironman", "ironman", "unknown"]) + )["output_0"] + self.assertIn(prediction2, ("yes", "no")) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/integration_test/vectorized_map_test.py b/keras/integration_test/vectorized_map_test.py index 15c50caea397..5b215280b221 100644 --- a/keras/integration_test/vectorized_map_test.py +++ b/keras/integration_test/vectorized_map_test.py @@ -17,28 +17,28 @@ class VectorizedMapTest(tf.test.TestCase): - - def test_vectorized_map(self): - batch_size = 10 - num_features = 32 - layer = tf.keras.layers.Dense(1) - - def model_fn(arg): - with tf.GradientTape() as g: - inp, label = arg - inp = tf.expand_dims(inp, 0) - label = tf.expand_dims(label, 0) - prediction = layer(inp) - loss = tf.nn.l2_loss(label - prediction) - return g.gradient(loss, (layer.kernel, layer.bias)) - - inputs = tf.random.uniform([batch_size, num_features]) - labels = tf.random.uniform([batch_size, 1]) - per_example_gradients = tf.vectorized_map(model_fn, (inputs, labels)) - self.assertEqual(per_example_gradients[0].shape, - (batch_size, num_features, 1)) - self.assertEqual(per_example_gradients[1].shape, (batch_size, 1)) + def test_vectorized_map(self): + batch_size = 10 + num_features = 32 + layer = tf.keras.layers.Dense(1) + + def model_fn(arg): + with tf.GradientTape() as g: + inp, label = arg + inp = tf.expand_dims(inp, 0) + label = tf.expand_dims(label, 0) + prediction = layer(inp) + loss = tf.nn.l2_loss(label - prediction) + return g.gradient(loss, (layer.kernel, layer.bias)) + + inputs = tf.random.uniform([batch_size, num_features]) + labels = tf.random.uniform([batch_size, 1]) + per_example_gradients = tf.vectorized_map(model_fn, (inputs, labels)) + self.assertEqual( + per_example_gradients[0].shape, (batch_size, num_features, 1) + ) + self.assertEqual(per_example_gradients[1].shape, (batch_size, 1)) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/keras.bzl b/keras/keras.bzl index cbabaf8779ae..4a787d7b9901 100644 --- a/keras/keras.bzl +++ b/keras/keras.bzl @@ -1,5 +1,7 @@ """Keras common starlark macros.""" +# Placeholder: load aliased py_test + # Macro to run Keras py_tests 
against pip installation. def py_test(deps = [], data = [], kernels = [], **kwargs): native.py_test( @@ -152,3 +154,13 @@ def distribute_py_test( args = args, **kwargs ) + +# We are never indexing generated code in the OSS build, but still +# return a select() for consistency. +def if_indexing_source_code( + if_true, # @unused + if_false): + """Return a select() on whether or not we are building for source code indexing.""" + return select({ + "//conditions:default": if_false, + }) diff --git a/keras/kokoro/github/ubuntu/cpu/build.sh b/keras/kokoro/github/ubuntu/cpu/build.sh index c88a25605b3a..a826667f2eb7 100644 --- a/keras/kokoro/github/ubuntu/cpu/build.sh +++ b/keras/kokoro/github/ubuntu/cpu/build.sh @@ -43,6 +43,6 @@ pip uninstall -y keras-nightly bazel test --test_timeout 300,450,1200,3600 --test_output=errors --keep_going \ --define=use_fast_cpp_protos=false \ --build_tests_only \ - --build_tag_filters="-no_oss" \ - --test_tag_filters="-no_oss" \ + --build_tag_filters="-no_oss,-oss_excluded" \ + --test_tag_filters="-no_oss,-oss_excluded" \ -- //keras/... diff --git a/keras/kokoro/github/ubuntu/gpu/build.sh b/keras/kokoro/github/ubuntu/gpu/build.sh index 0095d639bb61..cc7f23bc81dc 100644 --- a/keras/kokoro/github/ubuntu/gpu/build.sh +++ b/keras/kokoro/github/ubuntu/gpu/build.sh @@ -38,11 +38,14 @@ pip install -r requirements.txt # keras code from local workspace. pip uninstall -y keras-nightly -export LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -export TF_CUDA_COMPUTE_CAPABILITIES=6.0 -TF_CUDA_CONFIG_REPO="@ubuntu16.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda" +# LD Library Path needs to be same as TensorFlow Ubuntu Docker build - +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/tf_sig_build_dockerfiles/ +export LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/tensorrt/lib" +CUDA_TOOLKIT_PATH="/usr/local/cuda-11.8" +TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda11.8-cudnn8.6-tensorrt8.4_config_cuda" +TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" -tag_filters="gpu,-no_gpu,-nogpu,-benchmark-test,-no_oss,-oss_serial,-no_gpu_presubmit" +tag_filters="gpu,-no_gpu,-nogpu,-benchmark-test,-no_oss,-oss_excluded,-oss_serial,-no_gpu_presubmit" # There are only 4 GPU available on the local test machine. TF_GPU_COUNT=4 TF_TESTS_PER_GPU=8 @@ -55,13 +58,13 @@ bazel test --test_timeout 300,600,1200,3600 --test_output=errors --keep_going \ --build_tests_only \ --action_env=TF_CUDA_COMPUTE_CAPABILITIES="${TF_CUDA_COMPUTE_CAPABILITIES}" \ --action_env=TF_CUDA_CONFIG_REPO="${TF_CUDA_CONFIG_REPO}" \ - --action_env=TF_CUDA_VERSION=10 \ - --action_env=TF_CUDNN_VERSION=7 \ + --action_env=TF_CUDA_VERSION=11 \ + --action_env=TF_CUDNN_VERSION=8 \ + --action_env=CUDA_TOOLKIT_PATH="${CUDA_TOOLKIT_PATH}" \ --test_env=TF_GPU_COUNT=${TF_GPU_COUNT} \ --test_env=TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU} \ --build_tag_filters="${tag_filters}" \ --test_tag_filters="${tag_filters}" \ --run_under=@org_keras//keras/tools/gpu_build:parallel_gpu_execute \ --local_test_jobs=${LOCAL_TEST_JOBS} \ - --nodistinct_host_configuration \ -- //keras/... diff --git a/keras/layers/BUILD b/keras/layers/BUILD index 9d37404575d3..4c48d7e57c09 100644 --- a/keras/layers/BUILD +++ b/keras/layers/BUILD @@ -1,15 +1,17 @@ # Description: # Contains the Keras layers (internal TensorFlow version). 
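# How a select()-returning macro such as if_indexing_source_code() (added to
# keras.bzl above) is consumed from a BUILD file. Illustrative Starlark only;
# the target and file names are hypothetical:
load("@org_keras//keras:keras.bzl", "if_indexing_source_code")

py_library(
    name = "example_lib",
    srcs = ["example.py"] + if_indexing_source_code(
        if_true = ["generated_index_stub.py"],  # ignored in the OSS build
        if_false = [],
    ),
)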
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], # TODO(scottzhu): Remove non-keras deps from TF. default_visibility = [ "//keras:friends", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", ], licenses = ["notice"], diff --git a/keras/layers/__init__.py b/keras/layers/__init__.py index 3fc21041b185..6812e92aa4ec 100644 --- a/keras/layers/__init__.py +++ b/keras/layers/__init__.py @@ -14,106 +14,57 @@ # ============================================================================== """Keras layers API.""" +# isort: off import tensorflow.compat.v2 as tf -# pylint: disable=g-bad-import-order,g-direct-tensorflow-import,disable=g-import-not-at-top -from tensorflow.python import tf2 +from keras.engine.base_layer import Layer +from keras.engine.base_preprocessing_layer import PreprocessingLayer # Generic layers. from keras.engine.input_layer import Input from keras.engine.input_layer import InputLayer from keras.engine.input_spec import InputSpec -from keras.engine.base_layer import Layer -from keras.engine.base_preprocessing_layer import PreprocessingLayer - -# Image preprocessing layers. -from keras.layers.preprocessing.image_preprocessing import CenterCrop -from keras.layers.preprocessing.image_preprocessing import RandomCrop -from keras.layers.preprocessing.image_preprocessing import RandomFlip -from keras.layers.preprocessing.image_preprocessing import RandomContrast -from keras.layers.preprocessing.image_preprocessing import RandomHeight -from keras.layers.preprocessing.image_preprocessing import RandomRotation -from keras.layers.preprocessing.image_preprocessing import RandomTranslation -from keras.layers.preprocessing.image_preprocessing import RandomWidth -from keras.layers.preprocessing.image_preprocessing import RandomZoom -from keras.layers.preprocessing.image_preprocessing import Resizing -from keras.layers.preprocessing.image_preprocessing import Rescaling - -# Preprocessing layers. -from keras.layers.preprocessing.category_encoding import CategoryEncoding -from keras.layers.preprocessing.discretization import Discretization -from keras.layers.preprocessing.hashing import Hashing -from keras.layers.preprocessing.hashed_crossing import HashedCrossing -from keras.layers.preprocessing.integer_lookup import IntegerLookup -from keras.layers.preprocessing.normalization import Normalization -from keras.layers.preprocessing.string_lookup import StringLookup -from keras.layers.preprocessing.text_vectorization import TextVectorization +from keras.layers.activation.elu import ELU +from keras.layers.activation.leaky_relu import LeakyReLU +from keras.layers.activation.prelu import PReLU # Activations layers. from keras.layers.activation.relu import ReLU from keras.layers.activation.softmax import Softmax -from keras.layers.activation.leaky_relu import LeakyReLU -from keras.layers.activation.prelu import PReLU -from keras.layers.activation.elu import ELU from keras.layers.activation.thresholded_relu import ThresholdedReLU +from keras.layers.attention.additive_attention import AdditiveAttention +from keras.layers.attention.attention import Attention # Attention layers. 
from keras.layers.attention.multi_head_attention import MultiHeadAttention -from keras.layers.attention.attention import Attention -from keras.layers.attention.additive_attention import AdditiveAttention +# Convolution layer aliases. # Convolution layers. from keras.layers.convolutional.conv1d import Conv1D -from keras.layers.convolutional.conv2d import Conv2D -from keras.layers.convolutional.conv3d import Conv3D +from keras.layers.convolutional.conv1d import Convolution1D from keras.layers.convolutional.conv1d_transpose import Conv1DTranspose +from keras.layers.convolutional.conv1d_transpose import Convolution1DTranspose +from keras.layers.convolutional.conv2d import Conv2D +from keras.layers.convolutional.conv2d import Convolution2D from keras.layers.convolutional.conv2d_transpose import Conv2DTranspose +from keras.layers.convolutional.conv2d_transpose import Convolution2DTranspose +from keras.layers.convolutional.conv3d import Conv3D +from keras.layers.convolutional.conv3d import Convolution3D from keras.layers.convolutional.conv3d_transpose import Conv3DTranspose +from keras.layers.convolutional.conv3d_transpose import Convolution3DTranspose from keras.layers.convolutional.depthwise_conv1d import DepthwiseConv1D from keras.layers.convolutional.depthwise_conv2d import DepthwiseConv2D from keras.layers.convolutional.separable_conv1d import SeparableConv1D -from keras.layers.convolutional.separable_conv2d import SeparableConv2D - -# Convolution layer aliases. -from keras.layers.convolutional.conv1d import Convolution1D -from keras.layers.convolutional.conv2d import Convolution2D -from keras.layers.convolutional.conv3d import Convolution3D -from keras.layers.convolutional.conv1d_transpose import Convolution1DTranspose -from keras.layers.convolutional.conv2d_transpose import Convolution2DTranspose -from keras.layers.convolutional.conv3d_transpose import Convolution3DTranspose from keras.layers.convolutional.separable_conv1d import SeparableConvolution1D +from keras.layers.convolutional.separable_conv2d import SeparableConv2D from keras.layers.convolutional.separable_conv2d import SeparableConvolution2D -# Regularization layers. -from keras.layers.regularization.dropout import Dropout -from keras.layers.regularization.spatial_dropout1d import SpatialDropout1D -from keras.layers.regularization.spatial_dropout2d import SpatialDropout2D -from keras.layers.regularization.spatial_dropout3d import SpatialDropout3D -from keras.layers.regularization.gaussian_dropout import GaussianDropout -from keras.layers.regularization.gaussian_noise import GaussianNoise -from keras.layers.regularization.activity_regularization import ActivityRegularization -from keras.layers.regularization.alpha_dropout import AlphaDropout - -# Reshaping layers. 
-from keras.layers.reshaping.cropping1d import Cropping1D -from keras.layers.reshaping.cropping2d import Cropping2D -from keras.layers.reshaping.cropping3d import Cropping3D -from keras.layers.reshaping.flatten import Flatten -from keras.layers.reshaping.permute import Permute -from keras.layers.reshaping.repeat_vector import RepeatVector -from keras.layers.reshaping.reshape import Reshape -from keras.layers.reshaping.up_sampling1d import UpSampling1D -from keras.layers.reshaping.up_sampling2d import UpSampling2D -from keras.layers.reshaping.up_sampling3d import UpSampling3D -from keras.layers.reshaping.zero_padding1d import ZeroPadding1D -from keras.layers.reshaping.zero_padding2d import ZeroPadding2D -from keras.layers.reshaping.zero_padding3d import ZeroPadding3D - # Core layers. from keras.layers.core.activation import Activation from keras.layers.core.dense import Dense from keras.layers.core.einsum_dense import EinsumDense from keras.layers.core.embedding import Embedding +from keras.layers.core.identity import Identity from keras.layers.core.lambda_layer import Lambda from keras.layers.core.masking import Masking from keras.layers.core.tf_op_layer import ClassMethod @@ -123,148 +74,219 @@ from keras.layers.core.tf_op_layer import TFOpLambda # Locally-connected layers. -from keras.layers.locally_connected.locally_connected1d import LocallyConnected1D -from keras.layers.locally_connected.locally_connected2d import LocallyConnected2D +from keras.layers.locally_connected.locally_connected1d import ( + LocallyConnected1D, +) +from keras.layers.locally_connected.locally_connected2d import ( + LocallyConnected2D, +) +# Merging functions. # Merging layers. from keras.layers.merging.add import Add -from keras.layers.merging.subtract import Subtract -from keras.layers.merging.multiply import Multiply +from keras.layers.merging.add import add from keras.layers.merging.average import Average -from keras.layers.merging.maximum import Maximum -from keras.layers.merging.minimum import Minimum +from keras.layers.merging.average import average from keras.layers.merging.concatenate import Concatenate +from keras.layers.merging.concatenate import concatenate from keras.layers.merging.dot import Dot - -# Merging functions. -from keras.layers.merging.add import add -from keras.layers.merging.subtract import subtract -from keras.layers.merging.multiply import multiply -from keras.layers.merging.average import average +from keras.layers.merging.dot import dot +from keras.layers.merging.maximum import Maximum from keras.layers.merging.maximum import maximum +from keras.layers.merging.minimum import Minimum from keras.layers.merging.minimum import minimum -from keras.layers.merging.concatenate import concatenate -from keras.layers.merging.dot import dot +from keras.layers.merging.multiply import Multiply +from keras.layers.merging.multiply import multiply +from keras.layers.merging.subtract import Subtract +from keras.layers.merging.subtract import subtract +from keras.layers.normalization.batch_normalization import ( + SyncBatchNormalization, +) # Normalization layers. 
+from keras.layers.normalization.group_normalization import GroupNormalization from keras.layers.normalization.layer_normalization import LayerNormalization -from keras.layers.normalization.batch_normalization import SyncBatchNormalization from keras.layers.normalization.unit_normalization import UnitNormalization +from keras.layers.normalization.spectral_normalization import ( + SpectralNormalization, +) # noqa: E501 + +# Preprocessing layers. +from keras.layers.preprocessing.category_encoding import CategoryEncoding +from keras.layers.preprocessing.discretization import Discretization +from keras.layers.preprocessing.hashed_crossing import HashedCrossing +from keras.layers.preprocessing.hashing import Hashing + +# Image preprocessing layers. +from keras.layers.preprocessing.image_preprocessing import CenterCrop +from keras.layers.preprocessing.image_preprocessing import RandomBrightness +from keras.layers.preprocessing.image_preprocessing import RandomContrast +from keras.layers.preprocessing.image_preprocessing import RandomCrop +from keras.layers.preprocessing.image_preprocessing import RandomFlip +from keras.layers.preprocessing.image_preprocessing import RandomHeight +from keras.layers.preprocessing.image_preprocessing import RandomRotation +from keras.layers.preprocessing.image_preprocessing import RandomTranslation +from keras.layers.preprocessing.image_preprocessing import RandomWidth +from keras.layers.preprocessing.image_preprocessing import RandomZoom +from keras.layers.preprocessing.image_preprocessing import Rescaling +from keras.layers.preprocessing.image_preprocessing import Resizing +from keras.layers.preprocessing.integer_lookup import IntegerLookup +from keras.layers.preprocessing.normalization import Normalization +from keras.layers.preprocessing.string_lookup import StringLookup +from keras.layers.preprocessing.text_vectorization import TextVectorization +from keras.layers.regularization.activity_regularization import ( + ActivityRegularization, +) +from keras.layers.regularization.alpha_dropout import AlphaDropout + +# Regularization layers. +from keras.layers.regularization.dropout import Dropout +from keras.layers.regularization.gaussian_dropout import GaussianDropout +from keras.layers.regularization.gaussian_noise import GaussianNoise +from keras.layers.regularization.spatial_dropout1d import SpatialDropout1D +from keras.layers.regularization.spatial_dropout2d import SpatialDropout2D +from keras.layers.regularization.spatial_dropout3d import SpatialDropout3D + +# Reshaping layers. 
+from keras.layers.reshaping.cropping1d import Cropping1D +from keras.layers.reshaping.cropping2d import Cropping2D +from keras.layers.reshaping.cropping3d import Cropping3D +from keras.layers.reshaping.flatten import Flatten +from keras.layers.reshaping.permute import Permute +from keras.layers.reshaping.repeat_vector import RepeatVector +from keras.layers.reshaping.reshape import Reshape +from keras.layers.reshaping.up_sampling1d import UpSampling1D +from keras.layers.reshaping.up_sampling2d import UpSampling2D +from keras.layers.reshaping.up_sampling3d import UpSampling3D +from keras.layers.reshaping.zero_padding1d import ZeroPadding1D +from keras.layers.reshaping.zero_padding2d import ZeroPadding2D +from keras.layers.reshaping.zero_padding3d import ZeroPadding3D if tf.__internal__.tf2.enabled(): - from keras.layers.normalization.batch_normalization import BatchNormalization - from keras.layers.normalization.batch_normalization_v1 import BatchNormalization as BatchNormalizationV1 - BatchNormalizationV2 = BatchNormalization + from keras.layers.normalization.batch_normalization import ( + BatchNormalization, + ) + from keras.layers.normalization.batch_normalization_v1 import ( + BatchNormalization as BatchNormalizationV1, + ) + + BatchNormalizationV2 = BatchNormalization else: - from keras.layers.normalization.batch_normalization_v1 import BatchNormalization - from keras.layers.normalization.batch_normalization import BatchNormalization as BatchNormalizationV2 - BatchNormalizationV1 = BatchNormalization + from keras.layers.normalization.batch_normalization import ( + BatchNormalization as BatchNormalizationV2, + ) + from keras.layers.normalization.batch_normalization_v1 import ( + BatchNormalization, + ) + + BatchNormalizationV1 = BatchNormalization # Kernelized layers. from keras.layers.kernelized import RandomFourierFeatures +# Pooling layer aliases. # Pooling layers. from keras.layers.pooling.average_pooling1d import AveragePooling1D +from keras.layers.pooling.average_pooling1d import AvgPool1D from keras.layers.pooling.average_pooling2d import AveragePooling2D +from keras.layers.pooling.average_pooling2d import AvgPool2D from keras.layers.pooling.average_pooling3d import AveragePooling3D -from keras.layers.pooling.max_pooling1d import MaxPooling1D -from keras.layers.pooling.max_pooling2d import MaxPooling2D -from keras.layers.pooling.max_pooling3d import MaxPooling3D +from keras.layers.pooling.average_pooling3d import AvgPool3D from keras.layers.pooling.global_average_pooling1d import GlobalAveragePooling1D +from keras.layers.pooling.global_average_pooling1d import GlobalAvgPool1D from keras.layers.pooling.global_average_pooling2d import GlobalAveragePooling2D +from keras.layers.pooling.global_average_pooling2d import GlobalAvgPool2D from keras.layers.pooling.global_average_pooling3d import GlobalAveragePooling3D +from keras.layers.pooling.global_average_pooling3d import GlobalAvgPool3D +from keras.layers.pooling.global_max_pooling1d import GlobalMaxPool1D from keras.layers.pooling.global_max_pooling1d import GlobalMaxPooling1D +from keras.layers.pooling.global_max_pooling2d import GlobalMaxPool2D from keras.layers.pooling.global_max_pooling2d import GlobalMaxPooling2D +from keras.layers.pooling.global_max_pooling3d import GlobalMaxPool3D from keras.layers.pooling.global_max_pooling3d import GlobalMaxPooling3D - -# Pooling layer aliases. 
-from keras.layers.pooling.average_pooling1d import AvgPool1D -from keras.layers.pooling.average_pooling2d import AvgPool2D -from keras.layers.pooling.average_pooling3d import AvgPool3D from keras.layers.pooling.max_pooling1d import MaxPool1D +from keras.layers.pooling.max_pooling1d import MaxPooling1D from keras.layers.pooling.max_pooling2d import MaxPool2D +from keras.layers.pooling.max_pooling2d import MaxPooling2D from keras.layers.pooling.max_pooling3d import MaxPool3D -from keras.layers.pooling.global_average_pooling1d import GlobalAvgPool1D -from keras.layers.pooling.global_average_pooling2d import GlobalAvgPool2D -from keras.layers.pooling.global_average_pooling3d import GlobalAvgPool3D -from keras.layers.pooling.global_max_pooling1d import GlobalMaxPool1D -from keras.layers.pooling.global_max_pooling2d import GlobalMaxPool2D -from keras.layers.pooling.global_max_pooling3d import GlobalMaxPool3D +from keras.layers.pooling.max_pooling3d import MaxPooling3D +from keras.layers.rnn.abstract_rnn_cell import AbstractRNNCell # Recurrent layers. from keras.layers.rnn.base_rnn import RNN -from keras.layers.rnn.abstract_rnn_cell import AbstractRNNCell -from keras.layers.rnn.stacked_rnn_cells import StackedRNNCells -from keras.layers.rnn.simple_rnn import SimpleRNNCell from keras.layers.rnn.simple_rnn import SimpleRNN +from keras.layers.rnn.simple_rnn import SimpleRNNCell +from keras.layers.rnn.stacked_rnn_cells import StackedRNNCells if tf.__internal__.tf2.enabled(): - from keras.layers.rnn.gru import GRU - from keras.layers.rnn.gru import GRUCell - from keras.layers.rnn.lstm import LSTM - from keras.layers.rnn.lstm import LSTMCell - from keras.layers.rnn.gru_v1 import GRU as GRUV1 - from keras.layers.rnn.gru_v1 import GRUCell as GRUCellV1 - from keras.layers.rnn.lstm_v1 import LSTM as LSTMV1 - from keras.layers.rnn.lstm_v1 import LSTMCell as LSTMCellV1 - GRUV2 = GRU - GRUCellV2 = GRUCell - LSTMV2 = LSTM - LSTMCellV2 = LSTMCell + from keras.layers.rnn.gru import GRU + from keras.layers.rnn.gru import GRUCell + from keras.layers.rnn.gru_v1 import GRU as GRUV1 + from keras.layers.rnn.gru_v1 import GRUCell as GRUCellV1 + from keras.layers.rnn.lstm import LSTM + from keras.layers.rnn.lstm import LSTMCell + from keras.layers.rnn.lstm_v1 import LSTM as LSTMV1 + from keras.layers.rnn.lstm_v1 import LSTMCell as LSTMCellV1 + + GRUV2 = GRU + GRUCellV2 = GRUCell + LSTMV2 = LSTM + LSTMCellV2 = LSTMCell else: - from keras.layers.rnn.gru_v1 import GRU - from keras.layers.rnn.gru_v1 import GRUCell - from keras.layers.rnn.lstm_v1 import LSTM - from keras.layers.rnn.lstm_v1 import LSTMCell - from keras.layers.rnn.gru import GRU as GRUV2 - from keras.layers.rnn.gru import GRUCell as GRUCellV2 - from keras.layers.rnn.lstm import LSTM as LSTMV2 - from keras.layers.rnn.lstm import LSTMCell as LSTMCellV2 - GRUV1 = GRU - GRUCellV1 = GRUCell - LSTMV1 = LSTM - LSTMCellV1 = LSTMCell + from keras.layers.rnn.gru import GRU as GRUV2 + from keras.layers.rnn.gru import GRUCell as GRUCellV2 + from keras.layers.rnn.gru_v1 import GRU + from keras.layers.rnn.gru_v1 import GRUCell + from keras.layers.rnn.lstm import LSTM as LSTMV2 + from keras.layers.rnn.lstm import LSTMCell as LSTMCellV2 + from keras.layers.rnn.lstm_v1 import LSTM + from keras.layers.rnn.lstm_v1 import LSTMCell -# Convolutional-recurrent layers. 
-from keras.layers.rnn.conv_lstm1d import ConvLSTM1D -from keras.layers.rnn.conv_lstm2d import ConvLSTM2D -from keras.layers.rnn.conv_lstm3d import ConvLSTM3D + GRUV1 = GRU + GRUCellV1 = GRUCell + LSTMV1 = LSTM + LSTMCellV1 = LSTMCell -# cuDNN recurrent layers. -from keras.layers.rnn.cudnn_lstm import CuDNNLSTM -from keras.layers.rnn.cudnn_gru import CuDNNGRU +# Serialization functions. +from keras.layers import serialization # Wrapper functions. from keras.layers.rnn.base_wrapper import Wrapper from keras.layers.rnn.bidirectional import Bidirectional -from keras.layers.rnn.time_distributed import TimeDistributed # RNN Cell wrappers. from keras.layers.rnn.cell_wrappers import DeviceWrapper from keras.layers.rnn.cell_wrappers import DropoutWrapper from keras.layers.rnn.cell_wrappers import ResidualWrapper -# Serialization functions. -from keras.layers import serialization +# Convolutional-recurrent layers. +from keras.layers.rnn.conv_lstm1d import ConvLSTM1D +from keras.layers.rnn.conv_lstm2d import ConvLSTM2D +from keras.layers.rnn.conv_lstm3d import ConvLSTM3D +from keras.layers.rnn.cudnn_gru import CuDNNGRU + +# cuDNN recurrent layers. +from keras.layers.rnn.cudnn_lstm import CuDNNLSTM +from keras.layers.rnn.time_distributed import TimeDistributed from keras.layers.serialization import deserialize from keras.layers.serialization import deserialize_from_json -from keras.layers.serialization import serialize from keras.layers.serialization import get_builtin_layer +from keras.layers.serialization import serialize class VersionAwareLayers: - """Utility to be used internally to access layers in a V1/V2-aware fashion. - - When using layers within the Keras codebase, under the constraint that - e.g. `layers.BatchNormalization` should be the `BatchNormalization` version - corresponding to the current runtime (TF1 or TF2), do not simply access - `layers.BatchNormalization` since it would ignore e.g. an early - `compat.v2.disable_v2_behavior()` call. Instead, use an instance - of `VersionAwareLayers` (which you can use just like the `layers` module). - """ - - def __getattr__(self, name): - serialization.populate_deserializable_objects() - if name in serialization.LOCAL.ALL_OBJECTS: - return serialization.LOCAL.ALL_OBJECTS[name] - return super().__getattr__(name) + """Utility to be used internally to access layers in a V1/V2-aware fashion. + + When using layers within the Keras codebase, under the constraint that + e.g. `layers.BatchNormalization` should be the `BatchNormalization` version + corresponding to the current runtime (TF1 or TF2), do not simply access + `layers.BatchNormalization` since it would ignore e.g. an early + `compat.v2.disable_v2_behavior()` call. Instead, use an instance + of `VersionAwareLayers` (which you can use just like the `layers` module). + """ + + def __getattr__(self, name): + serialization.populate_deserializable_objects() + if name in serialization.LOCAL.ALL_OBJECTS: + return serialization.LOCAL.ALL_OBJECTS[name] + return super().__getattr__(name) diff --git a/keras/layers/activation/BUILD b/keras/layers/activation/BUILD index 8ca482de7223..2b81f4897a5f 100644 --- a/keras/layers/activation/BUILD +++ b/keras/layers/activation/BUILD @@ -1,9 +1,11 @@ # Description: # Contains the Keras activation layers. 
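# Usage note for VersionAwareLayers (end of keras/layers/__init__.py above):
# attribute access resolves the class at call time, so the V1/V2 choice
# tracks the active TF behavior rather than import order. Sketch (assumes
# TF 2.x):
from keras.layers import VersionAwareLayers

layers = VersionAwareLayers()
bn = layers.BatchNormalization()  # V2 class under TF2; V1 after an early
                                  # tf.compat.v1.disable_v2_behavior()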
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], diff --git a/keras/layers/activation/__init__.py b/keras/layers/activation/__init__.py index c39011ade3ea..f571762759e4 100644 --- a/keras/layers/activation/__init__.py +++ b/keras/layers/activation/__init__.py @@ -13,11 +13,11 @@ # limitations under the License. # ============================================================================== """Layers that act as activation functions.""" -# pylint: disable=g-bad-import-order -from keras.layers.activation.relu import ReLU -from keras.layers.activation.softmax import Softmax + +from keras.layers.activation.elu import ELU from keras.layers.activation.leaky_relu import LeakyReLU from keras.layers.activation.prelu import PReLU -from keras.layers.activation.elu import ELU +from keras.layers.activation.relu import ReLU +from keras.layers.activation.softmax import Softmax from keras.layers.activation.thresholded_relu import ThresholdedReLU diff --git a/keras/layers/activation/elu.py b/keras/layers/activation/elu.py index 598313325808..8bba10fb7080 100644 --- a/keras/layers/activation/elu.py +++ b/keras/layers/activation/elu.py @@ -13,55 +13,57 @@ # limitations under the License. # ============================================================================== """Exponential Linear Unit activation layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.engine.base_layer import Layer from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ELU') +@keras_export("keras.layers.ELU") class ELU(Layer): - """Exponential Linear Unit. + """Exponential Linear Unit. - It follows: + It follows: - ``` - f(x) = alpha * (exp(x) - 1.) for x < 0 - f(x) = x for x >= 0 - ``` + ``` + f(x) = alpha * (exp(x) - 1.) for x < 0 + f(x) = x for x >= 0 + ``` - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. - Output shape: - Same shape as the input. + Output shape: + Same shape as the input. - Args: - alpha: Scale for the negative factor. - """ + Args: + alpha: Scale for the negative factor. + """ - def __init__(self, alpha=1.0, **kwargs): - super().__init__(**kwargs) - if alpha is None: - raise ValueError( - 'Alpha of an ELU layer cannot be None, expecting a float. ' - f'Received: {alpha}') - self.supports_masking = True - self.alpha = backend.cast_to_floatx(alpha) + def __init__(self, alpha=1.0, **kwargs): + super().__init__(**kwargs) + if alpha is None: + raise ValueError( + "Alpha of an ELU layer cannot be None, expecting a float. 
" + f"Received: {alpha}" + ) + self.supports_masking = True + self.alpha = backend.cast_to_floatx(alpha) - def call(self, inputs): - return backend.elu(inputs, self.alpha) + def call(self, inputs): + return backend.elu(inputs, self.alpha) - def get_config(self): - config = {'alpha': float(self.alpha)} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"alpha": float(self.alpha)} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - return input_shape + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/activation/elu_test.py b/keras/layers/activation/elu_test.py index 14cf9cc53e69..63f20d12b8e4 100644 --- a/keras/layers/activation/elu_test.py +++ b/keras/layers/activation/elu_test.py @@ -14,33 +14,38 @@ # ============================================================================== """Tests for ELU layer.""" +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class ELUTest(test_combinations.TestCase): + def test_elu(self): + for alpha in [0.0, 0.5, -1.0]: + test_utils.layer_test( + keras.layers.ELU, + kwargs={"alpha": alpha}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - def test_elu(self): - for alpha in [0., .5, -1.]: - test_utils.layer_test(keras.layers.ELU, - kwargs={'alpha': alpha}, - input_shape=(2, 3, 4), - supports_masking=True) - - def test_elu_with_invalid_alpha(self): - # Test case for GitHub issue 46993. - with self.assertRaisesRegex( - ValueError, 'Alpha of an ELU layer cannot be None, ' - 'expecting a float. Received: None'): - test_utils.layer_test( - keras.layers.ELU, - kwargs={'alpha': None}, - input_shape=(2, 3, 4), - supports_masking=True) + def test_elu_with_invalid_alpha(self): + # Test case for GitHub issue 46993. + with self.assertRaisesRegex( + ValueError, + "Alpha of an ELU layer cannot be None, " + "expecting a float. Received: None", + ): + test_utils.layer_test( + keras.layers.ELU, + kwargs={"alpha": None}, + input_shape=(2, 3, 4), + supports_masking=True, + ) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/activation/leaky_relu.py b/keras/layers/activation/leaky_relu.py index 4c382dea76be..fa3e373d734c 100644 --- a/keras/layers/activation/leaky_relu.py +++ b/keras/layers/activation/leaky_relu.py @@ -13,67 +13,69 @@ # limitations under the License. # ============================================================================== """Leaky version of a Rectified Linear Unit activation layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.engine.base_layer import Layer from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.LeakyReLU') +@keras_export("keras.layers.LeakyReLU") class LeakyReLU(Layer): - """Leaky version of a Rectified Linear Unit. 
- - It allows a small gradient when the unit is not active: - - ``` - f(x) = alpha * x if x < 0 - f(x) = x if x >= 0 - ``` - - Usage: - - >>> layer = tf.keras.layers.LeakyReLU() - >>> output = layer([-3.0, -1.0, 0.0, 2.0]) - >>> list(output.numpy()) - [-0.9, -0.3, 0.0, 2.0] - >>> layer = tf.keras.layers.LeakyReLU(alpha=0.1) - >>> output = layer([-3.0, -1.0, 0.0, 2.0]) - >>> list(output.numpy()) - [-0.3, -0.1, 0.0, 2.0] - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the batch axis) - when using this layer as the first layer in a model. - - Output shape: - Same shape as the input. - - Args: - alpha: Float >= 0. Negative slope coefficient. Default to 0.3. - - """ - - def __init__(self, alpha=0.3, **kwargs): - super().__init__(**kwargs) - if alpha is None: - raise ValueError( - 'The alpha value of a Leaky ReLU layer cannot be None, ' - f'Expecting a float. Received: {alpha}') - self.supports_masking = True - self.alpha = backend.cast_to_floatx(alpha) - - def call(self, inputs): - return backend.relu(inputs, alpha=self.alpha) - - def get_config(self): - config = {'alpha': float(self.alpha)} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - return input_shape + """Leaky version of a Rectified Linear Unit. + + It allows a small gradient when the unit is not active: + + ``` + f(x) = alpha * x if x < 0 + f(x) = x if x >= 0 + ``` + + Usage: + + >>> layer = tf.keras.layers.LeakyReLU() + >>> output = layer([-3.0, -1.0, 0.0, 2.0]) + >>> list(output.numpy()) + [-0.9, -0.3, 0.0, 2.0] + >>> layer = tf.keras.layers.LeakyReLU(alpha=0.1) + >>> output = layer([-3.0, -1.0, 0.0, 2.0]) + >>> list(output.numpy()) + [-0.3, -0.1, 0.0, 2.0] + + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the batch axis) + when using this layer as the first layer in a model. + + Output shape: + Same shape as the input. + + Args: + alpha: Float >= `0.`. Negative slope coefficient. Defaults to `0.3`. + + """ + + def __init__(self, alpha=0.3, **kwargs): + super().__init__(**kwargs) + if alpha is None: + raise ValueError( + "The alpha value of a Leaky ReLU layer cannot be None, " + f"Expecting a float. 
Received: {alpha}" + ) + self.supports_masking = True + self.alpha = backend.cast_to_floatx(alpha) + + def call(self, inputs): + return backend.relu(inputs, alpha=self.alpha) + + def get_config(self): + config = {"alpha": float(self.alpha)} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/activation/leaky_relu_test.py b/keras/layers/activation/leaky_relu_test.py index 9cbbc809b7fe..13d25699b3c3 100644 --- a/keras/layers/activation/leaky_relu_test.py +++ b/keras/layers/activation/leaky_relu_test.py @@ -14,33 +14,38 @@ # ============================================================================== """Tests for LeakyReLU layer.""" +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class LeakyReLUTest(test_combinations.TestCase): + def test_leaky_relu(self): + for alpha in [0.0, 0.5]: + test_utils.layer_test( + keras.layers.LeakyReLU, + kwargs={"alpha": alpha}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - def test_leaky_relu(self): - for alpha in [0., .5]: - test_utils.layer_test(keras.layers.LeakyReLU, - kwargs={'alpha': alpha}, - input_shape=(2, 3, 4), - supports_masking=True) - - def test_leaky_relu_with_invalid_alpha(self): - # Test case for GitHub issue 46993. - with self.assertRaisesRegex( - ValueError, 'The alpha value of a Leaky ReLU layer ' - 'cannot be None. Expecting a float. Received: None'): - test_utils.layer_test( - keras.layers.LeakyReLU, - kwargs={'alpha': None}, - input_shape=(2, 3, 4), - supports_masking=True) + def test_leaky_relu_with_invalid_alpha(self): + # Test case for GitHub issue 46993. + with self.assertRaisesRegex( + ValueError, + "The alpha value of a Leaky ReLU layer " + "cannot be None. Expecting a float. Received: None", + ): + test_utils.layer_test( + keras.layers.LeakyReLU, + kwargs={"alpha": None}, + input_shape=(2, 3, 4), + supports_masking=True, + ) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/activation/prelu.py b/keras/layers/activation/prelu.py index 94b1738e7c6a..09164599df54 100644 --- a/keras/layers/activation/prelu.py +++ b/keras/layers/activation/prelu.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Parametric Rectified Linear Unit activation layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras import constraints @@ -23,98 +23,102 @@ from keras.engine.input_spec import InputSpec from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.PReLU') +@keras_export("keras.layers.PReLU") class PReLU(Layer): - """Parametric Rectified Linear Unit. - - It follows: - - ``` - f(x) = alpha * x for x < 0 - f(x) = x for x >= 0 - ``` - - where `alpha` is a learned array with the same shape as x. - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. - - Output shape: - Same shape as the input. - - Args: - alpha_initializer: Initializer function for the weights. 
- alpha_regularizer: Regularizer for the weights. - alpha_constraint: Constraint for the weights. - shared_axes: The axes along which to share learnable - parameters for the activation function. - For example, if the incoming feature maps - are from a 2D convolution - with output shape `(batch, height, width, channels)`, - and you wish to share parameters across space - so that each filter only has one set of parameters, - set `shared_axes=[1, 2]`. - """ - - def __init__(self, - alpha_initializer='zeros', - alpha_regularizer=None, - alpha_constraint=None, - shared_axes=None, - **kwargs): - super().__init__(**kwargs) - self.supports_masking = True - self.alpha_initializer = initializers.get(alpha_initializer) - self.alpha_regularizer = regularizers.get(alpha_regularizer) - self.alpha_constraint = constraints.get(alpha_constraint) - if shared_axes is None: - self.shared_axes = None - elif not isinstance(shared_axes, (list, tuple)): - self.shared_axes = [shared_axes] - else: - self.shared_axes = list(shared_axes) - - @tf_utils.shape_type_conversion - def build(self, input_shape): - param_shape = list(input_shape[1:]) - if self.shared_axes is not None: - for i in self.shared_axes: - param_shape[i - 1] = 1 - self.alpha = self.add_weight( - shape=param_shape, - name='alpha', - initializer=self.alpha_initializer, - regularizer=self.alpha_regularizer, - constraint=self.alpha_constraint) - # Set input spec - axes = {} - if self.shared_axes: - for i in range(1, len(input_shape)): - if i not in self.shared_axes: - axes[i] = input_shape[i] - self.input_spec = InputSpec(ndim=len(input_shape), axes=axes) - self.built = True - - def call(self, inputs): - pos = backend.relu(inputs) - neg = -self.alpha * backend.relu(-inputs) - return pos + neg - - def get_config(self): - config = { - 'alpha_initializer': initializers.serialize(self.alpha_initializer), - 'alpha_regularizer': regularizers.serialize(self.alpha_regularizer), - 'alpha_constraint': constraints.serialize(self.alpha_constraint), - 'shared_axes': self.shared_axes - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - return input_shape + """Parametric Rectified Linear Unit. + + It follows: + + ``` + f(x) = alpha * x for x < 0 + f(x) = x for x >= 0 + ``` + + where `alpha` is a learned array with the same shape as x. + + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + + Output shape: + Same shape as the input. + + Args: + alpha_initializer: Initializer function for the weights. + alpha_regularizer: Regularizer for the weights. + alpha_constraint: Constraint for the weights. + shared_axes: The axes along which to share learnable + parameters for the activation function. + For example, if the incoming feature maps + are from a 2D convolution + with output shape `(batch, height, width, channels)`, + and you wish to share parameters across space + so that each filter only has one set of parameters, + set `shared_axes=[1, 2]`. 
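# Concrete effect of shared_axes on the learned alpha, matching the build()
# logic below: every shared axis collapses to size 1 in alpha's shape.
# Sketch (assumes TF 2.x and NHWC conv features):
import tensorflow as tf

p = tf.keras.layers.PReLU(shared_axes=[1, 2])
p.build((None, 8, 8, 16))
print(p.alpha.shape)  # (1, 1, 16): one parameter per channel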
+ """ + + def __init__( + self, + alpha_initializer="zeros", + alpha_regularizer=None, + alpha_constraint=None, + shared_axes=None, + **kwargs + ): + super().__init__(**kwargs) + self.supports_masking = True + self.alpha_initializer = initializers.get(alpha_initializer) + self.alpha_regularizer = regularizers.get(alpha_regularizer) + self.alpha_constraint = constraints.get(alpha_constraint) + if shared_axes is None: + self.shared_axes = None + elif not isinstance(shared_axes, (list, tuple)): + self.shared_axes = [shared_axes] + else: + self.shared_axes = list(shared_axes) + + @tf_utils.shape_type_conversion + def build(self, input_shape): + param_shape = list(input_shape[1:]) + if self.shared_axes is not None: + for i in self.shared_axes: + param_shape[i - 1] = 1 + self.alpha = self.add_weight( + shape=param_shape, + name="alpha", + initializer=self.alpha_initializer, + regularizer=self.alpha_regularizer, + constraint=self.alpha_constraint, + ) + # Set input spec + axes = {} + if self.shared_axes: + for i in range(1, len(input_shape)): + if i not in self.shared_axes: + axes[i] = input_shape[i] + self.input_spec = InputSpec(ndim=len(input_shape), axes=axes) + self.built = True + + def call(self, inputs): + pos = backend.relu(inputs) + neg = -self.alpha * backend.relu(-inputs) + return pos + neg + + def get_config(self): + config = { + "alpha_initializer": initializers.serialize(self.alpha_initializer), + "alpha_regularizer": regularizers.serialize(self.alpha_regularizer), + "alpha_constraint": constraints.serialize(self.alpha_constraint), + "shared_axes": self.shared_axes, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/activation/prelu_test.py b/keras/layers/activation/prelu_test.py index 382bbe66ec6b..0d07f3aa9c51 100644 --- a/keras/layers/activation/prelu_test.py +++ b/keras/layers/activation/prelu_test.py @@ -14,26 +14,31 @@ # ============================================================================== """Tests for PReLU layer.""" +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class PReLUTest(test_combinations.TestCase): + def test_prelu(self): + test_utils.layer_test( + keras.layers.PReLU, + kwargs={}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - def test_prelu(self): - test_utils.layer_test(keras.layers.PReLU, kwargs={}, - input_shape=(2, 3, 4), - supports_masking=True) - - def test_prelu_share(self): - test_utils.layer_test(keras.layers.PReLU, - kwargs={'shared_axes': 1}, - input_shape=(2, 3, 4), - supports_masking=True) + def test_prelu_share(self): + test_utils.layer_test( + keras.layers.PReLU, + kwargs={"shared_axes": 1}, + input_shape=(2, 3, 4), + supports_masking=True, + ) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/activation/relu.py b/keras/layers/activation/relu.py index b714c70c900c..dbb5f2194b1c 100644 --- a/keras/layers/activation/relu.py +++ b/keras/layers/activation/relu.py @@ -13,100 +13,112 @@ # limitations under the License. 
# ============================================================================== """Rectified Linear Unit activation layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.engine.base_layer import Layer from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ReLU') +@keras_export("keras.layers.ReLU") class ReLU(Layer): - """Rectified Linear Unit activation function. - - With default values, it returns element-wise `max(x, 0)`. - - Otherwise, it follows: - - ``` - f(x) = max_value if x >= max_value - f(x) = x if threshold <= x < max_value - f(x) = negative_slope * (x - threshold) otherwise - ``` - - Usage: - - >>> layer = tf.keras.layers.ReLU() - >>> output = layer([-3.0, -1.0, 0.0, 2.0]) - >>> list(output.numpy()) - [0.0, 0.0, 0.0, 2.0] - >>> layer = tf.keras.layers.ReLU(max_value=1.0) - >>> output = layer([-3.0, -1.0, 0.0, 2.0]) - >>> list(output.numpy()) - [0.0, 0.0, 0.0, 1.0] - >>> layer = tf.keras.layers.ReLU(negative_slope=1.0) - >>> output = layer([-3.0, -1.0, 0.0, 2.0]) - >>> list(output.numpy()) - [-3.0, -1.0, 0.0, 2.0] - >>> layer = tf.keras.layers.ReLU(threshold=1.5) - >>> output = layer([-3.0, -1.0, 1.0, 2.0]) - >>> list(output.numpy()) - [0.0, 0.0, 0.0, 2.0] - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the batch axis) - when using this layer as the first layer in a model. - - Output shape: - Same shape as the input. - - Args: - max_value: Float >= 0. Maximum activation value. Default to None, which - means unlimited. - negative_slope: Float >= 0. Negative slope coefficient. Default to 0. - threshold: Float >= 0. Threshold value for thresholded activation. Default - to 0. - """ - - def __init__(self, max_value=None, negative_slope=0., threshold=0., **kwargs): - super().__init__(**kwargs) - if max_value is not None and max_value < 0.: - raise ValueError('max_value of a ReLU layer cannot be a negative ' - f'value. Received: {max_value}') - if negative_slope is None or negative_slope < 0.: - raise ValueError('negative_slope of a ReLU layer cannot be a negative ' - f'value. Received: {negative_slope}') - if threshold is None or threshold < 0.: - raise ValueError('threshold of a ReLU layer cannot be a negative ' - f'value. Received: {threshold}') - - self.supports_masking = True - if max_value is not None: - max_value = backend.cast_to_floatx(max_value) - self.max_value = max_value - self.negative_slope = backend.cast_to_floatx(negative_slope) - self.threshold = backend.cast_to_floatx(threshold) - - def call(self, inputs): - # alpha is used for leaky relu slope in activations instead of - # negative_slope. - return backend.relu(inputs, - alpha=self.negative_slope, - max_value=self.max_value, - threshold=self.threshold) - - def get_config(self): - config = { - 'max_value': self.max_value, - 'negative_slope': self.negative_slope, - 'threshold': self.threshold - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - return input_shape + """Rectified Linear Unit activation function. + + With default values, it returns element-wise `max(x, 0)`. 
+ + Otherwise, it follows: + + ``` + f(x) = max_value if x >= max_value + f(x) = x if threshold <= x < max_value + f(x) = negative_slope * (x - threshold) otherwise + ``` + + Usage: + + >>> layer = tf.keras.layers.ReLU() + >>> output = layer([-3.0, -1.0, 0.0, 2.0]) + >>> list(output.numpy()) + [0.0, 0.0, 0.0, 2.0] + >>> layer = tf.keras.layers.ReLU(max_value=1.0) + >>> output = layer([-3.0, -1.0, 0.0, 2.0]) + >>> list(output.numpy()) + [0.0, 0.0, 0.0, 1.0] + >>> layer = tf.keras.layers.ReLU(negative_slope=1.0) + >>> output = layer([-3.0, -1.0, 0.0, 2.0]) + >>> list(output.numpy()) + [-3.0, -1.0, 0.0, 2.0] + >>> layer = tf.keras.layers.ReLU(threshold=1.5) + >>> output = layer([-3.0, -1.0, 1.0, 2.0]) + >>> list(output.numpy()) + [0.0, 0.0, 0.0, 2.0] + + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the batch axis) + when using this layer as the first layer in a model. + + Output shape: + Same shape as the input. + + Args: + max_value: Float >= 0. Maximum activation value. None means unlimited. + Defaults to `None`. + negative_slope: Float >= 0. Negative slope coefficient. + Defaults to `0.`. + threshold: Float >= 0. Threshold value for thresholded activation. + Defaults to `0.`. + """ + + def __init__( + self, max_value=None, negative_slope=0.0, threshold=0.0, **kwargs + ): + super().__init__(**kwargs) + if max_value is not None and max_value < 0.0: + raise ValueError( + "max_value of a ReLU layer cannot be a negative " + f"value. Received: {max_value}" + ) + if negative_slope is None or negative_slope < 0.0: + raise ValueError( + "negative_slope of a ReLU layer cannot be a negative " + f"value. Received: {negative_slope}" + ) + if threshold is None or threshold < 0.0: + raise ValueError( + "threshold of a ReLU layer cannot be a negative " + f"value. Received: {threshold}" + ) + + self.supports_masking = True + if max_value is not None: + max_value = backend.cast_to_floatx(max_value) + self.max_value = max_value + self.negative_slope = backend.cast_to_floatx(negative_slope) + self.threshold = backend.cast_to_floatx(threshold) + + def call(self, inputs): + # alpha is used for leaky relu slope in activations instead of + # negative_slope. 
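To see how the three `ReLU` arguments interact, here is a small numeric check of the piecewise definition in the docstring above (illustrative only, not part of the diff):

```python
import tensorflow as tf

layer = tf.keras.layers.ReLU(max_value=2.0, negative_slope=0.5, threshold=1.0)
x = tf.constant([-2.0, 0.5, 1.5, 3.0])
# x = -2.0: x < threshold        -> 0.5 * (-2.0 - 1.0) = -1.5
# x =  0.5: x < threshold        -> 0.5 * ( 0.5 - 1.0) = -0.25
# x =  1.5: threshold <= x < max -> 1.5 (identity)
# x =  3.0: x >= max_value       -> clipped to 2.0
print(layer(x).numpy())  # [-1.5, -0.25, 1.5, 2.0]
```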
+ return backend.relu( + inputs, + alpha=self.negative_slope, + max_value=self.max_value, + threshold=self.threshold, + ) + + def get_config(self): + config = { + "max_value": self.max_value, + "negative_slope": self.negative_slope, + "threshold": self.threshold, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/activation/relu_test.py b/keras/layers/activation/relu_test.py index 1d4daad98a63..70ded16275d6 100644 --- a/keras/layers/activation/relu_test.py +++ b/keras/layers/activation/relu_test.py @@ -14,88 +14,104 @@ # ============================================================================== """Tests for ReLU layer.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class ReLUTest(test_combinations.TestCase): + def test_relu(self): + test_utils.layer_test( + keras.layers.ReLU, + kwargs={"max_value": 10}, + input_shape=(2, 3, 4), + supports_masking=True, + ) + x = keras.backend.ones((3, 4)) + if not tf.executing_eagerly(): + # Test that we use `leaky_relu` when appropriate in graph mode. + self.assertIn( + "LeakyRelu", keras.layers.ReLU(negative_slope=0.2)(x).name + ) + # Test that we use `relu` when appropriate in graph mode. + self.assertIn("Relu", keras.layers.ReLU()(x).name) + # Test that we use `relu6` when appropriate in graph mode. + self.assertIn("Relu6", keras.layers.ReLU(max_value=6)(x).name) - def test_relu(self): - test_utils.layer_test(keras.layers.ReLU, - kwargs={'max_value': 10}, - input_shape=(2, 3, 4), - supports_masking=True) - x = keras.backend.ones((3, 4)) - if not tf.executing_eagerly(): - # Test that we use `leaky_relu` when appropriate in graph mode. - self.assertIn('LeakyRelu', keras.layers.ReLU(negative_slope=0.2)(x).name) - # Test that we use `relu` when appropriate in graph mode. - self.assertIn('Relu', keras.layers.ReLU()(x).name) - # Test that we use `relu6` when appropriate in graph mode. - self.assertIn('Relu6', keras.layers.ReLU(max_value=6)(x).name) - - def test_relu_with_invalid_max_value(self): - with self.assertRaisesRegex( - ValueError, 'max_value of a ReLU layer cannot be a negative ' - 'value. Received: -10'): - test_utils.layer_test( - keras.layers.ReLU, - kwargs={'max_value': -10}, - input_shape=(2, 3, 4), - supports_masking=True) + def test_relu_with_invalid_max_value(self): + with self.assertRaisesRegex( + ValueError, + "max_value of a ReLU layer cannot be a negative " + "value. Received: -10", + ): + test_utils.layer_test( + keras.layers.ReLU, + kwargs={"max_value": -10}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - def test_relu_with_invalid_negative_slope(self): - with self.assertRaisesRegex( - ValueError, 'negative_slope of a ReLU layer cannot be a negative ' - 'value. Received: None'): - test_utils.layer_test( - keras.layers.ReLU, - kwargs={'negative_slope': None}, - input_shape=(2, 3, 4), - supports_masking=True) + def test_relu_with_invalid_negative_slope(self): + with self.assertRaisesRegex( + ValueError, + "negative_slope of a ReLU layer cannot be a negative " + "value. 
Received: None", + ): + test_utils.layer_test( + keras.layers.ReLU, + kwargs={"negative_slope": None}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - with self.assertRaisesRegex( - ValueError, 'negative_slope of a ReLU layer cannot be a negative ' - 'value. Received: -10'): - test_utils.layer_test( - keras.layers.ReLU, - kwargs={'negative_slope': -10}, - input_shape=(2, 3, 4), - supports_masking=True) + with self.assertRaisesRegex( + ValueError, + "negative_slope of a ReLU layer cannot be a negative " + "value. Received: -10", + ): + test_utils.layer_test( + keras.layers.ReLU, + kwargs={"negative_slope": -10}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - def test_relu_with_invalid_threshold(self): - with self.assertRaisesRegex( - ValueError, 'threshold of a ReLU layer cannot be a negative ' - 'value. Received: None'): - test_utils.layer_test( - keras.layers.ReLU, - kwargs={'threshold': None}, - input_shape=(2, 3, 4), - supports_masking=True) + def test_relu_with_invalid_threshold(self): + with self.assertRaisesRegex( + ValueError, + "threshold of a ReLU layer cannot be a negative " + "value. Received: None", + ): + test_utils.layer_test( + keras.layers.ReLU, + kwargs={"threshold": None}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - with self.assertRaisesRegex( - ValueError, 'threshold of a ReLU layer cannot be a negative ' - 'value. Received: -10'): - test_utils.layer_test( - keras.layers.ReLU, - kwargs={'threshold': -10}, - input_shape=(2, 3, 4), - supports_masking=True) + with self.assertRaisesRegex( + ValueError, + "threshold of a ReLU layer cannot be a negative " + "value. Received: -10", + ): + test_utils.layer_test( + keras.layers.ReLU, + kwargs={"threshold": -10}, + input_shape=(2, 3, 4), + supports_masking=True, + ) - @test_combinations.run_with_all_model_types - def test_relu_layer_as_activation(self): - layer = keras.layers.Dense(1, activation=keras.layers.ReLU()) - model = test_utils.get_model_from_layers([layer], input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2) + @test_combinations.run_with_all_model_types + def test_relu_layer_as_activation(self): + layer = keras.layers.Dense(1, activation=keras.layers.ReLU()) + model = test_utils.get_model_from_layers([layer], input_shape=(10,)) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/activation/softmax.py b/keras/layers/activation/softmax.py index c72949af6a9b..aed2dbdec6f5 100644 --- a/keras/layers/activation/softmax.py +++ b/keras/layers/activation/softmax.py @@ -13,96 +13,105 @@ # limitations under the License. # ============================================================================== """Softmax activation layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export def _large_compatible_negative(tensor_type): - """Large negative number as Tensor. + """Large negative number as Tensor. 
- This function is necessary because the standard value for epsilon - in this module (-1e9) cannot be represented using tf.float16 + This function is necessary because the standard value for epsilon + in this module (-1e9) cannot be represented using tf.float16 - Args: - tensor_type: a dtype to determine the type. + Args: + tensor_type: a dtype to determine the type. - Returns: - a large negative number. - """ - if tensor_type == tf.float16: - return tf.float16.min - return -1e9 + Returns: + a large negative number. + """ + # In case of dtype=float16 (e.g., for mixed-precision), the largest + # negative number (dtypes.float16.min) is divided by 2, in order to + # avoid overflows when summing negative inputs. + if tensor_type == tf.float16: + return tf.float16.min / 2.0 + return -1e9 -@keras_export('keras.layers.Softmax') +@keras_export("keras.layers.Softmax") class Softmax(Layer): - """Softmax activation function. - - Example without mask: - - >>> inp = np.asarray([1., 2., 1.]) - >>> layer = tf.keras.layers.Softmax() - >>> layer(inp).numpy() - array([0.21194157, 0.5761169 , 0.21194157], dtype=float32) - >>> mask = np.asarray([True, False, True], dtype=bool) - >>> layer(inp, mask).numpy() - array([0.5, 0. , 0.5], dtype=float32) - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. - - Output shape: - Same shape as the input. - - Args: - axis: Integer, or list of Integers, axis along which the softmax - normalization is applied. - Call arguments: - inputs: The inputs, or logits to the softmax layer. - mask: A boolean mask of the same shape as `inputs`. Defaults to `None`. The - mask specifies 1 to keep and 0 to mask. - - Returns: - softmaxed output with the same shape as `inputs`. - """ - - def __init__(self, axis=-1, **kwargs): - super().__init__(**kwargs) - self.supports_masking = True - self.axis = axis - - def call(self, inputs, mask=None): - if mask is not None: - # Since mask is 1.0 for positions we want to keep and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -1e.9 for masked positions. - adder = (1.0 - tf.cast(mask, inputs.dtype)) * ( - _large_compatible_negative(inputs.dtype)) - - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - inputs += adder - if isinstance(self.axis, (tuple, list)): - if len(self.axis) > 1: - return tf.exp(inputs - tf.reduce_logsumexp( - inputs, axis=self.axis, keepdims=True)) - else: - return backend.softmax(inputs, axis=self.axis[0]) - return backend.softmax(inputs, axis=self.axis) - - def get_config(self): - config = {'axis': self.axis} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - return input_shape + """Softmax activation function. + + Example without mask: + + >>> inp = np.asarray([[1., 2., 1.]]) + >>> layer = tf.keras.layers.Softmax() + >>> layer(inp).numpy() + array([[0.21194157, 0.5761169 , 0.21194157]], dtype=float32) + >>> mask = np.asarray([[True, False, True]], dtype=bool) + >>> layer(inp, mask).numpy() + array([[0.5, 0. , 0.5]], dtype=float32) + + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. 
+ + Output shape: + Same shape as the input. + + Args: + axis: Integer, or list of Integers, axis along which the softmax + normalization is applied. + Call arguments: + inputs: The inputs, or logits to the softmax layer. + mask: A boolean mask of the same shape as `inputs`. The mask + specifies 1 to keep and 0 to mask. Defaults to `None`. + + + Returns: + Softmaxed output with the same shape as `inputs`. + """ + + def __init__(self, axis=-1, **kwargs): + super().__init__(**kwargs) + self.supports_masking = True + self.axis = axis + + def call(self, inputs, mask=None): + if mask is not None: + # Since mask is 1.0 for positions we want to keep and 0.0 for masked + # positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -1e9 for masked positions. + adder = (1.0 - tf.cast(mask, inputs.dtype)) * ( + _large_compatible_negative(inputs.dtype) + ) + + # Since we are adding it to the raw scores before the softmax, this + # is effectively the same as removing these entirely. + inputs += adder + if isinstance(self.axis, (tuple, list)): + if len(self.axis) > 1: + return tf.exp( + inputs + - tf.reduce_logsumexp(inputs, axis=self.axis, keepdims=True) + ) + else: + return backend.softmax(inputs, axis=self.axis[0]) + return backend.softmax(inputs, axis=self.axis) + + def get_config(self): + config = {"axis": self.axis} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/activation/softmax_test.py b/keras/layers/activation/softmax_test.py index 0c615791558c..86562425d452 100644 --- a/keras/layers/activation/softmax_test.py +++ b/keras/layers/activation/softmax_test.py @@ -14,21 +14,23 @@ # ============================================================================== """Tests for Softmax layer.""" +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class SoftmaxTest(test_combinations.TestCase): - - def test_softmax(self): - test_utils.layer_test(keras.layers.Softmax, - kwargs={'axis': 1}, - input_shape=(2, 3, 4), - supports_masking=True) + def test_softmax(self): + test_utils.layer_test( + keras.layers.Softmax, + kwargs={"axis": 1}, + input_shape=(2, 3, 4), + supports_masking=True, + ) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/activation/thresholded_relu.py b/keras/layers/activation/thresholded_relu.py index cc3abeb15c76..9d575af1ee2d 100644 --- a/keras/layers/activation/thresholded_relu.py +++ b/keras/layers/activation/thresholded_relu.py @@ -13,60 +13,65 @@ # limitations under the License. # ============================================================================== """Thresholded Rectified Linear Unit activation layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ThresholdedReLU') +@keras_export("keras.layers.ThresholdedReLU") class ThresholdedReLU(Layer): - """Thresholded Rectified Linear Unit. + """Thresholded Rectified Linear Unit. 
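The masking logic in `Softmax.call` above is easy to verify by hand (a minimal sketch, not part of the diff): masked positions receive a large negative additive offset, so they vanish after normalization, and for `float16` the offset is `float16.min / 2` precisely because `-1e9` is not representable in half precision and summing two full `float16.min` offsets would overflow.

```python
import numpy as np
import tensorflow as tf

inp = tf.constant([[1.0, 2.0, 1.0]])
mask = tf.constant([[True, False, True]])
print(tf.keras.layers.Softmax()(inp, mask).numpy())  # ~[[0.5, 0.0, 0.5]]

# Equivalent manual computation: add 0 where kept, -1e9 where masked.
adder = (1.0 - tf.cast(mask, inp.dtype)) * -1e9
print(tf.nn.softmax(inp + adder).numpy())  # same ~[[0.5, 0.0, 0.5]]

# Largest negative float16 value, half of which serves as the offset:
print(np.finfo(np.float16).min)  # -65500.0
```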
- It follows: + It follows: - ``` - f(x) = x for x > theta - f(x) = 0 otherwise` - ``` + ``` + f(x) = x for x > theta + f(x) = 0 otherwise + ``` - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. - Output shape: - Same shape as the input. + Output shape: + Same shape as the input. - Args: - theta: Float >= 0. Threshold location of activation. - """ + Args: + theta: Float >= 0. Threshold location of activation. + """ - def __init__(self, theta=1.0, **kwargs): - super().__init__(**kwargs) - if theta is None: - raise ValueError( - 'Theta of a Thresholded ReLU layer cannot be None, expecting a float.' - f' Received: {theta}') - if theta < 0: - raise ValueError('The theta value of a Thresholded ReLU layer ' - f'should be >=0. Received: {theta}') - self.supports_masking = True - self.theta = backend.cast_to_floatx(theta) + def __init__(self, theta=1.0, **kwargs): + super().__init__(**kwargs) + if theta is None: + raise ValueError( + "Theta of a Thresholded ReLU layer cannot be None, expecting a " + f"float. Received: {theta}" + ) + if theta < 0: + raise ValueError( + "The theta value of a Thresholded ReLU layer " + f"should be >=0. Received: {theta}" + ) + self.supports_masking = True + self.theta = backend.cast_to_floatx(theta) - def call(self, inputs): - dtype = self.compute_dtype - return inputs * tf.cast(tf.greater(inputs, self.theta), dtype) + def call(self, inputs): + dtype = self.compute_dtype + return inputs * tf.cast(tf.greater(inputs, self.theta), dtype) - def get_config(self): - config = {'theta': float(self.theta)} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"theta": float(self.theta)} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - return input_shape + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/activation/thresholded_relu_test.py b/keras/layers/activation/thresholded_relu_test.py index 3a554be59110..f7f4170a4988 100644 --- a/keras/layers/activation/thresholded_relu_test.py +++ b/keras/layers/activation/thresholded_relu_test.py @@ -14,40 +14,48 @@ # ============================================================================== """Tests for ThresholdedReLU layer.""" +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class ThresholdedReLUTest(test_combinations.TestCase): - - def test_thresholded_relu(self): - test_utils.layer_test(keras.layers.ThresholdedReLU, - kwargs={'theta': 0.5}, - input_shape=(2, 3, 4), - supports_masking=True) - - def test_threshold_relu_with_invalid_theta(self): - with self.assertRaisesRegex( - ValueError, 'Theta of a Thresholded ReLU layer cannot ' - 'be None, expecting a float. 
Received: None'): - test_utils.layer_test( - keras.layers.ThresholdedReLU, - kwargs={'theta': None}, - input_shape=(2, 3, 4), - supports_masking=True) - - with self.assertRaisesRegex( - ValueError, 'The theta value of a Thresholded ReLU ' - 'layer should be >=0. Received: -10'): - test_utils.layer_test( - keras.layers.ThresholdedReLU, - kwargs={'theta': -10}, - input_shape=(2, 3, 4), - supports_masking=True) - - -if __name__ == '__main__': - tf.test.main() + def test_thresholded_relu(self): + test_utils.layer_test( + keras.layers.ThresholdedReLU, + kwargs={"theta": 0.5}, + input_shape=(2, 3, 4), + supports_masking=True, + ) + + def test_threshold_relu_with_invalid_theta(self): + with self.assertRaisesRegex( + ValueError, + "Theta of a Thresholded ReLU layer cannot " + "be None, expecting a float. Received: None", + ): + test_utils.layer_test( + keras.layers.ThresholdedReLU, + kwargs={"theta": None}, + input_shape=(2, 3, 4), + supports_masking=True, + ) + + with self.assertRaisesRegex( + ValueError, + "The theta value of a Thresholded ReLU " + "layer should be >=0. Received: -10", + ): + test_utils.layer_test( + keras.layers.ThresholdedReLU, + kwargs={"theta": -10}, + input_shape=(2, 3, 4), + supports_masking=True, + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/attention/BUILD b/keras/layers/attention/BUILD index 14f6b63f5fe4..fffdb146f493 100644 --- a/keras/layers/attention/BUILD +++ b/keras/layers/attention/BUILD @@ -1,15 +1,17 @@ # Description: # Contains the Keras attention layers. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/py/tensorflow_gnn:__subpackages__", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", ], diff --git a/keras/layers/attention/__init__.py b/keras/layers/attention/__init__.py index 1914077daffa..e285718b4f0b 100644 --- a/keras/layers/attention/__init__.py +++ b/keras/layers/attention/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. # ============================================================================== """Keras attention layers.""" -# pylint: disable=g-bad-import-order -from keras.layers.attention.multi_head_attention import MultiHeadAttention -from keras.layers.attention.attention import Attention + from keras.layers.attention.additive_attention import AdditiveAttention +from keras.layers.attention.attention import Attention +from keras.layers.attention.multi_head_attention import MultiHeadAttention diff --git a/keras/layers/attention/additive_attention.py b/keras/layers/attention/additive_attention.py index aa9ee50c8bb4..49b826c11c2f 100644 --- a/keras/layers/attention/additive_attention.py +++ b/keras/layers/attention/additive_attention.py @@ -17,159 +17,161 @@ This file follows the terminology of https://arxiv.org/abs/1706.03762 Figure 2. Attention is formed by three tensors: Query, Key and Value. 
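Before moving on to the attention layers: the `ThresholdedReLU` hunks above implement a hard gate, `x * 1[x > theta]`, which one line of TensorFlow reproduces (an illustrative sketch, not part of the diff):

```python
import tensorflow as tf

layer = tf.keras.layers.ThresholdedReLU(theta=1.0)
x = tf.constant([-1.0, 0.5, 1.0, 2.5])
# Values at or below theta are zeroed; note x = 1.0 is not > theta.
print(layer(x).numpy())  # [0.0, 0.0, 0.0, 2.5]
# Same computation written out by hand:
print((x * tf.cast(x > 1.0, x.dtype)).numpy())
```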
""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import -from keras.layers.attention.base_dense_attention import BaseDenseAttention + import tensorflow.compat.v2 as tf +from keras.layers.attention.base_dense_attention import BaseDenseAttention + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.AdditiveAttention') +@keras_export("keras.layers.AdditiveAttention") class AdditiveAttention(BaseDenseAttention): - """Additive attention layer, a.k.a. Bahdanau-style attention. - - Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor of - shape `[batch_size, Tv, dim]` and `key` tensor of shape - `[batch_size, Tv, dim]`. The calculation follows the steps: - - 1. Reshape `query` and `key` into shapes `[batch_size, Tq, 1, dim]` - and `[batch_size, 1, Tv, dim]` respectively. - 2. Calculate scores with shape `[batch_size, Tq, Tv]` as a non-linear - sum: `scores = tf.reduce_sum(tf.tanh(query + key), axis=-1)` - 3. Use scores to calculate a distribution with shape - `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`. - 4. Use `distribution` to create a linear combination of `value` with - shape `[batch_size, Tq, dim]`: - `return tf.matmul(distribution, value)`. - - Args: - use_scale: If `True`, will create a variable to scale the attention scores. - causal: Boolean. Set to `True` for decoder self-attention. Adds a mask such - that position `i` cannot attend to positions `j > i`. This prevents the - flow of information from the future towards the past. - Defaults to `False`. - dropout: Float between 0 and 1. Fraction of the units to drop for the - attention scores. Defaults to 0.0. - - Call Args: - - inputs: List of the following tensors: - * query: Query `Tensor` of shape `[batch_size, Tq, dim]`. - * value: Value `Tensor` of shape `[batch_size, Tv, dim]`. - * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not - given, will use `value` for both `key` and `value`, which is the - most common case. - mask: List of the following tensors: - * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`. - If given, the output will be zero at the positions where - `mask==False`. - * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`. - If given, will apply the mask such that values at positions where - `mask==False` do not contribute to the result. - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (no dropout). - return_attention_scores: bool, it `True`, returns the attention scores - (after masking and softmax) as an additional output argument. - - Output: - - Attention outputs of shape `[batch_size, Tq, dim]`. - [Optional] Attention scores after masking and softmax with shape - `[batch_size, Tq, Tv]`. - - The meaning of `query`, `value` and `key` depend on the application. In the - case of text similarity, for example, `query` is the sequence embeddings of - the first piece of text and `value` is the sequence embeddings of the second - piece of text. `key` is usually the same tensor as `value`. - - Here is a code example for using `AdditiveAttention` in a CNN+Attention - network: - - ```python - # Variable-length int sequences. - query_input = tf.keras.Input(shape=(None,), dtype='int32') - value_input = tf.keras.Input(shape=(None,), dtype='int32') - - # Embedding lookup. - token_embedding = tf.keras.layers.Embedding(max_tokens, dimension) - # Query embeddings of shape [batch_size, Tq, dimension]. 
- query_embeddings = token_embedding(query_input) - # Value embeddings of shape [batch_size, Tv, dimension]. - value_embeddings = token_embedding(value_input) - - # CNN layer. - cnn_layer = tf.keras.layers.Conv1D( - filters=100, - kernel_size=4, - # Use 'same' padding so outputs have the same shape as inputs. - padding='same') - # Query encoding of shape [batch_size, Tq, filters]. - query_seq_encoding = cnn_layer(query_embeddings) - # Value encoding of shape [batch_size, Tv, filters]. - value_seq_encoding = cnn_layer(value_embeddings) - - # Query-value attention of shape [batch_size, Tq, filters]. - query_value_attention_seq = tf.keras.layers.AdditiveAttention()( - [query_seq_encoding, value_seq_encoding]) - - # Reduce over the sequence axis to produce encodings of shape - # [batch_size, filters]. - query_encoding = tf.keras.layers.GlobalAveragePooling1D()( - query_seq_encoding) - query_value_attention = tf.keras.layers.GlobalAveragePooling1D()( - query_value_attention_seq) - - # Concatenate query and document encodings to produce a DNN input layer. - input_layer = tf.keras.layers.Concatenate()( - [query_encoding, query_value_attention]) - - # Add DNN layers, and create Model. - # ... - ``` - """ - - def __init__(self, use_scale=True, **kwargs): - super().__init__(**kwargs) - self.use_scale = use_scale - - def build(self, input_shape): - v_shape = tf.TensorShape(input_shape[1]) - dim = v_shape[-1] - dim = tf.compat.dimension_value(dim) - if self.use_scale: - self.scale = self.add_weight( - name='scale', - shape=[dim], - initializer='glorot_uniform', - dtype=self.dtype, - trainable=True) - else: - self.scale = None - super().build(input_shape) - - def _calculate_scores(self, query, key): - """Calculates attention scores as a nonlinear sum of query and key. + """Additive attention layer, a.k.a. Bahdanau-style attention. + + Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor + of shape `[batch_size, Tv, dim]` and `key` tensor of shape + `[batch_size, Tv, dim]`. The calculation follows the steps: + + 1. Reshape `query` and `key` into shapes `[batch_size, Tq, 1, dim]` + and `[batch_size, 1, Tv, dim]` respectively. + 2. Calculate scores with shape `[batch_size, Tq, Tv]` as a non-linear + sum: `scores = tf.reduce_sum(tf.tanh(query + key), axis=-1)` + 3. Use scores to calculate a distribution with shape + `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`. + 4. Use `distribution` to create a linear combination of `value` with + shape `[batch_size, Tq, dim]`: + `return tf.matmul(distribution, value)`. Args: - query: Query tensor of shape `[batch_size, Tq, dim]`. - key: Key tensor of shape `[batch_size, Tv, dim]`. - Returns: - Tensor of shape `[batch_size, Tq, Tv]`. + use_scale: If `True`, will create a variable to scale the attention + scores. + dropout: Float between 0 and 1. Fraction of the units to drop for the + attention scores. Defaults to `0.0`. + + Call arguments: + inputs: List of the following tensors: + * query: Query `Tensor` of shape `[batch_size, Tq, dim]`. + * value: Value `Tensor` of shape `[batch_size, Tv, dim]`. + * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. + If not given, will use `value` for both `key` and `value`, + which is the most common case. + mask: List of the following tensors: + * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`. + If given, the output will be zero at the positions where + `mask==False`. + * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`. 
+ If given, will apply the mask such that values at positions + where `mask==False` do not contribute to the result. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (no dropout). + return_attention_scores: bool, it `True`, returns the attention scores + (after masking and softmax) as an additional output argument. + use_causal_mask: Boolean. Set to `True` for decoder self-attention. Adds + a mask such that position `i` cannot attend to positions `j > i`. + This prevents the flow of information from the future towards the + past. Defaults to `False`. + + Output: + + Attention outputs of shape `[batch_size, Tq, dim]`. + [Optional] Attention scores after masking and softmax with shape + `[batch_size, Tq, Tv]`. + + The meaning of `query`, `value` and `key` depend on the application. In the + case of text similarity, for example, `query` is the sequence embeddings of + the first piece of text and `value` is the sequence embeddings of the second + piece of text. `key` is usually the same tensor as `value`. + + Here is a code example for using `AdditiveAttention` in a CNN+Attention + network: + + ```python + # Variable-length int sequences. + query_input = tf.keras.Input(shape=(None,), dtype='int32') + value_input = tf.keras.Input(shape=(None,), dtype='int32') + + # Embedding lookup. + token_embedding = tf.keras.layers.Embedding(max_tokens, dimension) + # Query embeddings of shape [batch_size, Tq, dimension]. + query_embeddings = token_embedding(query_input) + # Value embeddings of shape [batch_size, Tv, dimension]. + value_embeddings = token_embedding(value_input) + + # CNN layer. + cnn_layer = tf.keras.layers.Conv1D( + filters=100, + kernel_size=4, + # Use 'same' padding so outputs have the same shape as inputs. + padding='same') + # Query encoding of shape [batch_size, Tq, filters]. + query_seq_encoding = cnn_layer(query_embeddings) + # Value encoding of shape [batch_size, Tv, filters]. + value_seq_encoding = cnn_layer(value_embeddings) + + # Query-value attention of shape [batch_size, Tq, filters]. + query_value_attention_seq = tf.keras.layers.AdditiveAttention()( + [query_seq_encoding, value_seq_encoding]) + + # Reduce over the sequence axis to produce encodings of shape + # [batch_size, filters]. + query_encoding = tf.keras.layers.GlobalAveragePooling1D()( + query_seq_encoding) + query_value_attention = tf.keras.layers.GlobalAveragePooling1D()( + query_value_attention_seq) + + # Concatenate query and document encodings to produce a DNN input layer. + input_layer = tf.keras.layers.Concatenate()( + [query_encoding, query_value_attention]) + + # Add DNN layers, and create Model. + # ... + ``` """ - # Reshape tensors to enable broadcasting. - # Reshape into [batch_size, Tq, 1, dim]. - q_reshaped = tf.expand_dims(query, axis=-2) - # Reshape into [batch_size, 1, Tv, dim]. - k_reshaped = tf.expand_dims(key, axis=-3) - if self.use_scale: - scale = self.scale - else: - scale = 1. 
- return tf.reduce_sum( - scale * tf.tanh(q_reshaped + k_reshaped), axis=-1) - - def get_config(self): - config = {'use_scale': self.use_scale} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + + def __init__(self, use_scale=True, **kwargs): + super().__init__(**kwargs) + self.use_scale = use_scale + + def build(self, input_shape): + v_shape = tf.TensorShape(input_shape[1]) + dim = v_shape[-1] + dim = tf.compat.dimension_value(dim) + if self.use_scale: + self.scale = self.add_weight( + name="scale", + shape=[dim], + initializer="glorot_uniform", + dtype=self.dtype, + trainable=True, + ) + else: + self.scale = None + super().build(input_shape) + + def _calculate_scores(self, query, key): + """Calculates attention scores as a nonlinear sum of query and key. + + Args: + query: Query tensor of shape `[batch_size, Tq, dim]`. + key: Key tensor of shape `[batch_size, Tv, dim]`. + Returns: + Tensor of shape `[batch_size, Tq, Tv]`. + """ + # Reshape tensors to enable broadcasting. + # Reshape into [batch_size, Tq, 1, dim]. + q_reshaped = tf.expand_dims(query, axis=-2) + # Reshape into [batch_size, 1, Tv, dim]. + k_reshaped = tf.expand_dims(key, axis=-3) + if self.use_scale: + scale = self.scale + else: + scale = 1.0 + return tf.reduce_sum(scale * tf.tanh(q_reshaped + k_reshaped), axis=-1) + + def get_config(self): + config = {"use_scale": self.use_scale} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/attention/additive_attention_test.py b/keras/layers/attention/additive_attention_test.py index e9309f51a471..690053bcf065 100644 --- a/keras/layers/attention/additive_attention_test.py +++ b/keras/layers/attention/additive_attention_test.py @@ -14,266 +14,324 @@ # ============================================================================== """Tests AdditiveAttention layer.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.mixed_precision import policy from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class AdditiveAttentionTest(tf.test.TestCase, parameterized.TestCase): + def test_calculate_scores_one_dim(self): + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Key tensor of shape [1, 1, 1] + k = np.array([[[1.6]]], dtype=np.float32) + attention_layer = keras.layers.AdditiveAttention() + attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) + # Scale tensor of shape [1] + attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) + actual = attention_layer._calculate_scores(query=q, key=k) - def test_calculate_scores_one_dim(self): - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Key tensor of shape [1, 1, 1] - k = np.array([[[1.6]]], dtype=np.float32) - attention_layer = keras.layers.AdditiveAttention() - attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) - # Scale tensor of shape [1] - attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) - actual = attention_layer._calculate_scores(query=q, key=k) - - # Expected tensor of shape [1, 1, 1]. 
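The hand-computed expectations in these tests follow directly from `_calculate_scores` above; here is the same arithmetic in plain NumPy, mirroring the first test case (a sketch, not part of the diff):

```python
import numpy as np

q = np.array([[[1.1]]], dtype=np.float32)  # [batch, Tq, dim]
k = np.array([[[1.6]]], dtype=np.float32)  # [batch, Tv, dim]
scale = np.float32(0.5)
# Broadcast to [batch, Tq, Tv, dim], then reduce the feature axis.
q_reshaped = q[:, :, np.newaxis, :]  # [batch, Tq, 1, dim]
k_reshaped = k[:, np.newaxis, :, :]  # [batch, 1, Tv, dim]
scores = np.sum(scale * np.tanh(q_reshaped + k_reshaped), axis=-1)
print(scores)  # [[[0.49550372]]] == 0.5 * tanh(1.1 + 1.6)
```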
- # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683 - expected = np.array([[[0.49550372683]]], dtype=np.float32) - self.assertAllClose(expected, actual) + # Expected tensor of shape [1, 1, 1]. + # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683 + expected = np.array([[[0.49550372683]]], dtype=np.float32) + self.assertAllClose(expected, actual) - def test_calculate_scores_multi_dim(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Key tensor of shape [1, 3, 4] - k = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - attention_layer = keras.layers.AdditiveAttention() - attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4])) - # Scale tensor of shape [4] - attention_layer.scale = np.array([[[0.5, 0.6, 0.7, 0.8]]], dtype=np.float32) - actual = attention_layer._calculate_scores(query=q, key=k) + def test_calculate_scores_multi_dim(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Key tensor of shape [1, 3, 4] + k = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + attention_layer = keras.layers.AdditiveAttention() + attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4])) + # Scale tensor of shape [4] + attention_layer.scale = np.array( + [[[0.5, 0.6, 0.7, 0.8]]], dtype=np.float32 + ) + actual = attention_layer._calculate_scores(query=q, key=k) - # pylint:disable=line-too-long - # expected000 = 0.5*tanh(1.+1.5) + 0.6*tanh(1.1+1.6) + 0.7*tanh(1.2+1.7) + 0.8*tanh(1.3+1.8) = 2.58044532581 - # expected001 = 0.5*tanh(1.+2.5) + 0.6*tanh(1.1+2.6) + 0.7*tanh(1.2+2.7) + 0.8*tanh(1.3+2.8) = 2.59734317449 - # expected002 = 0.5*tanh(1.+3.5) + 0.6*tanh(1.1+3.6) + 0.7*tanh(1.2+3.7) + 0.8*tanh(1.3+3.8) = 2.59964024652 - # expected010 = 0.5*tanh(2.+1.5) + 0.6*tanh(2.1+1.6) + 0.7*tanh(2.2+1.7) + 0.8*tanh(2.3+1.8) = 2.59734317449 - # expected011 = 0.5*tanh(2.+2.5) + 0.6*tanh(2.1+2.6) + 0.7*tanh(2.2+2.7) + 0.8*tanh(2.3+2.8) = 2.59964024652 - # expected012 = 0.5*tanh(2.+3.5) + 0.6*tanh(2.1+3.6) + 0.7*tanh(2.2+3.7) + 0.8*tanh(2.3+3.8) = 2.59995130916 - # pylint:enable=line-too-long - expected = np.array([[[2.58044532581, 2.59734317449, 2.59964024652], - [2.59734317449, 2.59964024652, 2.59995130916]]], - dtype=np.float32) - self.assertAllClose(expected, actual) + # expected000 = 0.5*tanh(1.+1.5) + 0.6*tanh(1.1+1.6) + \ + # 0.7*tanh(1.2+1.7) + 0.8*tanh(1.3+1.8) = 2.58044532581 + # expected001 = 0.5*tanh(1.+2.5) + 0.6*tanh(1.1+2.6) + \ + # 0.7*tanh(1.2+2.7) + 0.8*tanh(1.3+2.8) = 2.59734317449 + # expected002 = 0.5*tanh(1.+3.5) + 0.6*tanh(1.1+3.6) + \ + # 0.7*tanh(1.2+3.7) + 0.8*tanh(1.3+3.8) = 2.59964024652 + # expected010 = 0.5*tanh(2.+1.5) + 0.6*tanh(2.1+1.6) + \ + # 0.7*tanh(2.2+1.7) + 0.8*tanh(2.3+1.8) = 2.59734317449 + # expected011 = 0.5*tanh(2.+2.5) + 0.6*tanh(2.1+2.6) + \ + # 0.7*tanh(2.2+2.7) + 0.8*tanh(2.3+2.8) = 2.59964024652 + # expected012 = 0.5*tanh(2.+3.5) + 0.6*tanh(2.1+3.6) + \ + # 0.7*tanh(2.2+3.7) + 0.8*tanh(2.3+3.8) = 2.59995130916 + expected = np.array( + [ + [ + [2.58044532581, 2.59734317449, 2.59964024652], + [2.59734317449, 2.59964024652, 2.59995130916], + ] + ], + dtype=np.float32, + ) + self.assertAllClose(expected, actual) - def test_calculate_scores_one_dim_batch_size_two(self): - # Query tensor of shape [2, 1, 1] - q = np.array([[[1.1]], [[2.1]]], dtype=np.float32) - # Key tensor of shape [2, 1, 
1] - k = np.array([[[1.6]], [[2.6]]], dtype=np.float32) - attention_layer = keras.layers.AdditiveAttention() - attention_layer.build(input_shape=([2, 1, 1], [2, 1, 1])) - # Scale tensor of shape [1] - attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) - actual = attention_layer._calculate_scores(query=q, key=k) + def test_calculate_scores_one_dim_batch_size_two(self): + # Query tensor of shape [2, 1, 1] + q = np.array([[[1.1]], [[2.1]]], dtype=np.float32) + # Key tensor of shape [2, 1, 1] + k = np.array([[[1.6]], [[2.6]]], dtype=np.float32) + attention_layer = keras.layers.AdditiveAttention() + attention_layer.build(input_shape=([2, 1, 1], [2, 1, 1])) + # Scale tensor of shape [1] + attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) + actual = attention_layer._calculate_scores(query=q, key=k) - # Expected tensor of shape [2, 1, 1]. - # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683 - # expected100 = 0.5 * tanh(2.1 + 2.6) = 0.49991728277 - expected = np.array([[[0.49550372683]], [[0.49991728277]]], - dtype=np.float32) - self.assertAllClose(expected, actual) + # Expected tensor of shape [2, 1, 1]. + # expected000 = 0.5 * tanh(1.1 + 1.6) = 0.49550372683 + # expected100 = 0.5 * tanh(2.1 + 2.6) = 0.49991728277 + expected = np.array( + [[[0.49550372683]], [[0.49991728277]]], dtype=np.float32 + ) + self.assertAllClose(expected, actual) - def test_shape(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Value tensor of shape [1, 3, 4] - v = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.AdditiveAttention() - actual = attention_layer([q, v], mask=[None, v_mask]) + def test_shape(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Value tensor of shape [1, 3, 4] + v = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.AdditiveAttention() + actual = attention_layer([q, v], mask=[None, v_mask]) - expected_shape = [1, 2, 4] - self.assertAllEqual(expected_shape, tf.shape(actual)) + expected_shape = [1, 2, 4] + self.assertAllEqual(expected_shape, tf.shape(actual)) - def test_shape_no_scale(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Value tensor of shape [1, 3, 4] - v = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.AdditiveAttention(use_scale=False) - actual = attention_layer([q, v], mask=[None, v_mask]) + def test_shape_no_scale(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Value tensor of shape [1, 3, 4] + v = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.AdditiveAttention(use_scale=False) 
+ actual = attention_layer([q, v], mask=[None, v_mask]) - expected_shape = [1, 2, 4] - self.assertAllEqual(expected_shape, tf.shape(actual)) + expected_shape = [1, 2, 4] + self.assertAllEqual(expected_shape, tf.shape(actual)) - def test_shape_with_key(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Value tensor of shape [1, 3, 4] - v = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Key tensor of shape [1, 3, 4] - k = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.AdditiveAttention() - actual = attention_layer([q, v, k], mask=[None, v_mask]) + def test_shape_with_key(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Value tensor of shape [1, 3, 4] + v = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Key tensor of shape [1, 3, 4] + k = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.AdditiveAttention() + actual = attention_layer([q, v, k], mask=[None, v_mask]) - expected_shape = [1, 2, 4] - self.assertAllEqual(expected_shape, tf.shape(actual)) + expected_shape = [1, 2, 4] + self.assertAllEqual(expected_shape, tf.shape(actual)) - def test_multi_dim(self): - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.AdditiveAttention() - attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1])) - # Scale tensor of shape [1] - attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) - actual = attention_layer([q, v], mask=[None, v_mask]) + def test_multi_dim(self): + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Value tensor of shape [1, 3, 1] + v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.AdditiveAttention() + attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1])) + # Scale tensor of shape [1] + attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) + actual = attention_layer([q, v], mask=[None, v_mask]) - # pylint:disable=line-too-long - # Expected scores of shape [1, 1, 3] - # scores = [[[0.5 * tanh(1.1 + 1.6), 0.5 * tanh(1.1 + 0.7), 0.5 * tanh(1.1 - 0.8)]]] - # = [[[0.49550372683, 0.47340300642, 0.14565630622]]] - # Expected attention distribution = softmax(scores) with zeros in - # positions where v_mask == False. - # => attention_distribution000 - # = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642)) - # = 0.50552495521 - # attention_distribution001 - # = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642)) - # = 0.49447504478 - # attention_distribution002 = 0 - # - # Expected tensor of shape [1, 1, 1]. 
- # expected000 = 0.50552495521 * 1.6 + 0.49447504478 * 0.7 - 0 * 0.8 - # = 1.15497245968 - # pylint:enable=line-too-long - expected = np.array([[[1.15497245968]]], dtype=np.float32) - self.assertAllClose(expected, actual) + # Expected scores of shape [1, 1, 3] + # scores = [[[0.5 * tanh(1.1 + 1.6), + # 0.5 * tanh(1.1 + 0.7), + # 0.5 * tanh(1.1 - 0.8)]]] + # = [[[0.49550372683, 0.47340300642, 0.14565630622]]] + # Expected attention distribution = softmax(scores) with zeros in + # positions where v_mask == False. + # => attention_distribution000 + # = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642)) + # = 0.50552495521 + # attention_distribution001 + # = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642)) + # = 0.49447504478 + # attention_distribution002 = 0 + # + # Expected tensor of shape [1, 1, 1]. + # expected000 = 0.50552495521 * 1.6 + 0.49447504478 * 0.7 - 0 * 0.8 + # = 1.15497245968 + expected = np.array([[[1.15497245968]]], dtype=np.float32) + self.assertAllClose(expected, actual) - def test_multi_dim_with_key(self): - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32) - # Key tensor of shape [1, 3, 1] - k = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.AdditiveAttention() - attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1])) - # Scale tensor of shape [1] - attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) - actual = attention_layer([q, v, k], mask=[None, v_mask]) + def test_multi_dim_with_key(self): + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Value tensor of shape [1, 3, 1] + v = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32) + # Key tensor of shape [1, 3, 1] + k = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.AdditiveAttention() + attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1])) + # Scale tensor of shape [1] + attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) + actual = attention_layer([q, v, k], mask=[None, v_mask]) - # pylint:disable=line-too-long - # Expected scores of shape [1, 1, 3] - # scores = [[[0.5 * tanh(1.1 + 1.6), 0.5 * tanh(1.1 + 0.7), 0.5 * tanh(1.1 - 0.8)]]] - # = [[[0.49550372683, 0.47340300642, 0.14565630622]]] - # Expected attention distribution = softmax(scores) with zeros in - # positions where v_mask == False. - # => attention_distribution000 - # = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642)) - # = 0.50552495521 - # attention_distribution001 - # = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642)) - # = 0.49447504478 - # attention_distribution002 = 0 - # - # Expected tensor of shape [1, 1, 1]. - # expected000 = 0.50552495521 * 0.5 + 0.49447504478 * 0.8 - 0 * 0.3 - # = 0.64834251342 - # pylint:enable=line-too-long - expected = np.array([[[0.64834251342]]], dtype=np.float32) - self.assertAllClose(expected, actual) + # Expected scores of shape [1, 1, 3] + # scores = [[[0.5 * tanh(1.1 + 1.6), + # 0.5 * tanh(1.1 + 0.7), + # 0.5 * tanh(1.1 - 0.8)]]] + # = [[[0.49550372683, 0.47340300642, 0.14565630622]]] + # Expected attention distribution = softmax(scores) with zeros in + # positions where v_mask == False. 
+ # => attention_distribution000 + # = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642)) + # = 0.50552495521 + # attention_distribution001 + # = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642)) + # = 0.49447504478 + # attention_distribution002 = 0 + # + # Expected tensor of shape [1, 1, 1]. + # expected000 = 0.50552495521 * 0.5 + 0.49447504478 * 0.8 - 0 * 0.3 + # = 0.64834251342 + expected = np.array([[[0.64834251342]]], dtype=np.float32) + self.assertAllClose(expected, actual) - def test_multi_dim_with_query_mask(self): - # Query tensor of shape [1, 2, 1] - q = np.array([[[1.1], [-0.5]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - # Query mask tensor of shape [1, 2] - q_mask = np.array([[True, False]], dtype=np.bool_) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.AdditiveAttention() - attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1])) - # Scale tensor of shape [1] - attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) - actual = attention_layer([q, v], mask=[q_mask, v_mask]) + def test_multi_dim_with_query_mask(self): + # Query tensor of shape [1, 2, 1] + q = np.array([[[1.1], [-0.5]]], dtype=np.float32) + # Value tensor of shape [1, 3, 1] + v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) + # Query mask tensor of shape [1, 2] + q_mask = np.array([[True, False]], dtype=np.bool_) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.AdditiveAttention() + attention_layer.build(input_shape=([1, 1, 1], [1, 3, 1])) + # Scale tensor of shape [1] + attention_layer.scale = np.array([[[0.5]]], dtype=np.float32) + actual = attention_layer([q, v], mask=[q_mask, v_mask]) - # pylint:disable=line-too-long - # Expected scores of shape [1, 2, 3] - # scores = [[[0.5 * tanh(1.1 + 1.6), 0.5 * tanh(1.1 + 0.7), 0.5 * tanh(1.1 - 0.8)], - # [0.5 * tanh(-0.5 + 1.6), 0.5 * tanh(-0.5 + 0.7), 0.5 * tanh(-0.5 - 0.8)]]] - # = [[[0.49550372683, 0.47340300642, 0.14565630622], - # [0.40024951088, 0.09868766011, -0.43086157965]]] - # Expected attention distribution = softmax(scores) with zeros in - # positions where v_mask == False. - # => attention_distribution000 - # = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642)) - # = 0.50552495521 - # attention_distribution001 - # = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642)) - # = 0.49447504478 - # attention_distribution002 = 0 - # => attention_distribution010 - # = exp(0.40024951088)/(exp(0.40024951088) + exp(0.09868766011)) - # = 0.57482427975 - # attention_distribution011 - # = exp(0.09868766011)/(exp(0.40024951088) + exp(0.09868766011)) - # = 0.42517572025 - # attention_distribution012 = 0 - # - # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False. 
- # expected000 = 0.50552495521 * 1.6 + 0.49447504478 * 0.7 - 0 * 0.8 - # = 1.15497245968 - # expected000 = 0 - # pylint:enable=line-too-long - expected = np.array([[[1.15497245968], [0.]]], dtype=np.float32) - self.assertAllClose(expected, actual) + # Expected scores of shape [1, 2, 3] + # scores = [[[0.5 * tanh(1.1 + 1.6), + # 0.5 * tanh(1.1 + 0.7), + # 0.5 * tanh(1.1 - 0.8)], + # [0.5 * tanh(-0.5 + 1.6), + # 0.5 * tanh(-0.5 + 0.7), + # 0.5 * tanh(-0.5 - 0.8)]]] + # = [[[0.49550372683, 0.47340300642, 0.14565630622], + # [0.40024951088, 0.09868766011, -0.43086157965]]] + # Expected attention distribution = softmax(scores) with zeros in + # positions where v_mask == False. + # => attention_distribution000 + # = exp(0.49550372683)/(exp(0.49550372683) + exp(0.47340300642)) + # = 0.50552495521 + # attention_distribution001 + # = exp(0.47340300642)/(exp(0.49550372683) + exp(0.47340300642)) + # = 0.49447504478 + # attention_distribution002 = 0 + # => attention_distribution010 + # = exp(0.40024951088)/(exp(0.40024951088) + exp(0.09868766011)) + # = 0.57482427975 + # attention_distribution011 + # = exp(0.09868766011)/(exp(0.40024951088) + exp(0.09868766011)) + # = 0.42517572025 + # attention_distribution012 = 0 + # + # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False. + # expected000 = 0.50552495521 * 1.6 + 0.49447504478 * 0.7 - 0 * 0.8 + # = 1.15497245968 + # expected010 = 0 + expected = np.array([[[1.15497245968], [0.0]]], dtype=np.float32) + self.assertAllClose(expected, actual) - def test_serialization(self): - # Test serialization with use_scale - layer = keras.layers.AdditiveAttention(use_scale=True) + def test_serialization(self): + # Test serialization with use_scale + layer = keras.layers.AdditiveAttention(use_scale=True) - config = keras.layers.serialize(layer) - new_layer = keras.layers.deserialize(config) - self.assertEqual(new_layer.use_scale, True) + config = keras.layers.serialize(layer) + new_layer = keras.layers.deserialize(config) + self.assertEqual(new_layer.use_scale, True) - config = layer.get_config() - new_layer = keras.layers.AdditiveAttention.from_config(config) - self.assertEqual(new_layer.use_scale, True) + config = layer.get_config() + new_layer = keras.layers.AdditiveAttention.from_config(config) + self.assertEqual(new_layer.use_scale, True) - @test_utils.enable_v2_dtype_behavior - def test_mixed_float16_policy(self): - # Test case for GitHub issue: - # https://github.com/tensorflow/tensorflow/issues/46064 - with policy.policy_scope('mixed_float16'): - q = tf.cast(tf.random.uniform((2, 3, 4), seed=1), 'float16') - v = tf.cast(tf.random.uniform((2, 3, 4), seed=2), 'float16') - k = tf.cast(tf.random.uniform((2, 3, 4), seed=3), 'float16') - layer = keras.layers.AdditiveAttention(causal=True) - _ = layer([q, v, k]) + @test_utils.enable_v2_dtype_behavior + def test_mixed_float16_policy(self): + # Test case for GitHub issue: + # https://github.com/tensorflow/tensorflow/issues/46064 + with policy.policy_scope("mixed_float16"): + q = tf.cast(tf.random.uniform((2, 3, 4), seed=1), "float16") + v = tf.cast(tf.random.uniform((2, 3, 4), seed=2), "float16") + k = tf.cast(tf.random.uniform((2, 3, 4), seed=3), "float16") + layer = keras.layers.AdditiveAttention() + _ = layer([q, v, k], use_causal_mask=True) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/attention/attention.py b/keras/layers/attention/attention.py index 91036776ee7b..380c2f557696 100644 --- a/keras/layers/attention/attention.py +++ 
b/keras/layers/attention/attention.py @@ -17,179 +17,188 @@ This file follows the terminology of https://arxiv.org/abs/1706.03762 Figure 2. Attention is formed by three tensors: Query, Key and Value. """ -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import -from keras.layers.attention.base_dense_attention import BaseDenseAttention + import tensorflow.compat.v2 as tf +from keras.layers.attention.base_dense_attention import BaseDenseAttention + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Attention') +@keras_export("keras.layers.Attention") class Attention(BaseDenseAttention): - """Dot-product attention layer, a.k.a. Luong-style attention. - - Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor of - shape `[batch_size, Tv, dim]` and `key` tensor of shape - `[batch_size, Tv, dim]`. The calculation follows the steps: - - 1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot - product: `scores = tf.matmul(query, key, transpose_b=True)`. - 2. Use scores to calculate a distribution with shape - `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`. - 3. Use `distribution` to create a linear combination of `value` with - shape `[batch_size, Tq, dim]`: - `return tf.matmul(distribution, value)`. - - Args: - use_scale: If `True`, will create a scalar variable to scale the attention - scores. - causal: Boolean. Set to `True` for decoder self-attention. Adds a mask such - that position `i` cannot attend to positions `j > i`. This prevents the - flow of information from the future towards the past. - Defaults to `False`. - dropout: Float between 0 and 1. Fraction of the units to drop for the - attention scores. Defaults to 0.0. - score_mode: Function to use to compute attention scores, one of - `{"dot", "concat"}`. `"dot"` refers to the dot product between the query - and key vectors. `"concat"` refers to the hyperbolic tangent of the - concatenation of the query and key vectors. - - Call Args: - - inputs: List of the following tensors: - * query: Query `Tensor` of shape `[batch_size, Tq, dim]`. - * value: Value `Tensor` of shape `[batch_size, Tv, dim]`. - * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not - given, will use `value` for both `key` and `value`, which is the - most common case. - mask: List of the following tensors: - * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`. - If given, the output will be zero at the positions where - `mask==False`. - * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`. - If given, will apply the mask such that values at positions where - `mask==False` do not contribute to the result. - return_attention_scores: bool, it `True`, returns the attention scores - (after masking and softmax) as an additional output argument. - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (no dropout). - - Output: - - Attention outputs of shape `[batch_size, Tq, dim]`. - [Optional] Attention scores after masking and softmax with shape - `[batch_size, Tq, Tv]`. - - The meaning of `query`, `value` and `key` depend on the application. In the - case of text similarity, for example, `query` is the sequence embeddings of - the first piece of text and `value` is the sequence embeddings of the second - piece of text. `key` is usually the same tensor as `value`. 
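The three calculation steps in the docstring above reduce to two matrix multiplications with a softmax in between. A minimal sketch of the same computation in raw TF ops (not part of this change; shapes and values are illustrative):

```python
import tensorflow as tf

query = tf.random.normal([2, 3, 4])  # [batch_size, Tq, dim]
value = tf.random.normal([2, 5, 4])  # [batch_size, Tv, dim]
key = value  # the most common case: key is the same tensor as value

# 1. Scores as a query-key dot product: [batch_size, Tq, Tv].
scores = tf.matmul(query, key, transpose_b=True)
# 2. Attention distribution over the Tv positions: [batch_size, Tq, Tv].
distribution = tf.nn.softmax(scores)
# 3. Linear combination of value: [batch_size, Tq, dim].
output = tf.matmul(distribution, value)
```

Up to masking, scaling, and dropout, this is what the layer computes with the default `score_mode="dot"`.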
- - Here is a code example for using `Attention` in a CNN+Attention network: - - ```python - # Variable-length int sequences. - query_input = tf.keras.Input(shape=(None,), dtype='int32') - value_input = tf.keras.Input(shape=(None,), dtype='int32') - - # Embedding lookup. - token_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=64) - # Query embeddings of shape [batch_size, Tq, dimension]. - query_embeddings = token_embedding(query_input) - # Value embeddings of shape [batch_size, Tv, dimension]. - value_embeddings = token_embedding(value_input) - - # CNN layer. - cnn_layer = tf.keras.layers.Conv1D( - filters=100, - kernel_size=4, - # Use 'same' padding so outputs have the same shape as inputs. - padding='same') - # Query encoding of shape [batch_size, Tq, filters]. - query_seq_encoding = cnn_layer(query_embeddings) - # Value encoding of shape [batch_size, Tv, filters]. - value_seq_encoding = cnn_layer(value_embeddings) - - # Query-value attention of shape [batch_size, Tq, filters]. - query_value_attention_seq = tf.keras.layers.Attention()( - [query_seq_encoding, value_seq_encoding]) - - # Reduce over the sequence axis to produce encodings of shape - # [batch_size, filters]. - query_encoding = tf.keras.layers.GlobalAveragePooling1D()( - query_seq_encoding) - query_value_attention = tf.keras.layers.GlobalAveragePooling1D()( - query_value_attention_seq) - - # Concatenate query and document encodings to produce a DNN input layer. - input_layer = tf.keras.layers.Concatenate()( - [query_encoding, query_value_attention]) - - # Add DNN layers, and create Model. - # ... - ``` - """ - - def __init__(self, use_scale=False, score_mode='dot', **kwargs): - super().__init__(**kwargs) - self.use_scale = use_scale - self.score_mode = score_mode - if self.score_mode not in ['dot', 'concat']: - raise ValueError(f'Received: score_mode={score_mode}. Acceptable values ' - 'are: ["dot", "concat"]') - - def build(self, input_shape): - """Creates variable when `use_scale` is True or `score_mode` is `concat`.""" - if self.use_scale: - self.scale = self.add_weight( - name='scale', - shape=(), - initializer='ones', - dtype=self.dtype, - trainable=True) - else: - self.scale = None - if self.score_mode == 'concat': - self.concat_score_weight = self.add_weight( - name='concat_score_weight', - shape=(), - initializer='ones', - dtype=self.dtype, - trainable=True) - else: - self.concat_score_weight = None - super().build(input_shape) - - def _calculate_scores(self, query, key): - """Calculates attention scores as a query-key dot product. + """Dot-product attention layer, a.k.a. Luong-style attention. + + Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor + of shape `[batch_size, Tv, dim]` and `key` tensor of shape + `[batch_size, Tv, dim]`. The calculation follows the steps: + + 1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot + product: `scores = tf.matmul(query, key, transpose_b=True)`. + 2. Use scores to calculate a distribution with shape + `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`. + 3. Use `distribution` to create a linear combination of `value` with + shape `[batch_size, Tq, dim]`: + `return tf.matmul(distribution, value)`. Args: - query: Query tensor of shape `[batch_size, Tq, dim]`. - key: Key tensor of shape `[batch_size, Tv, dim]`. - Returns: - Tensor of shape `[batch_size, Tq, Tv]`. + use_scale: If `True`, will create a scalar variable to scale the + attention scores. + dropout: Float between 0 and 1. 
Fraction of the units to drop for the + attention scores. Defaults to 0.0. + score_mode: Function to use to compute attention scores, one of + `{"dot", "concat"}`. `"dot"` refers to the dot product between the + query and key vectors. `"concat"` refers to the hyperbolic tangent + of the concatenation of the query and key vectors. + + Call arguments: + inputs: List of the following tensors: + * query: Query `Tensor` of shape `[batch_size, Tq, dim]`. + * value: Value `Tensor` of shape `[batch_size, Tv, dim]`. + * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If + not given, will use `value` for both `key` and `value`, which is + the most common case. + mask: List of the following tensors: + * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`. + If given, the output will be zero at the positions where + `mask==False`. + * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`. + If given, will apply the mask such that values at positions + where `mask==False` do not contribute to the result. + return_attention_scores: bool, if `True`, returns the attention scores + (after masking and softmax) as an additional output argument. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (no dropout). + use_causal_mask: Boolean. Set to `True` for decoder self-attention. Adds + a mask such that position `i` cannot attend to positions `j > i`. + This prevents the flow of information from the future towards the + past. + Defaults to `False`. + + Output: + + Attention outputs of shape `[batch_size, Tq, dim]`. + [Optional] Attention scores after masking and softmax with shape + `[batch_size, Tq, Tv]`. + + The meaning of `query`, `value` and `key` depends on the application. In the + case of text similarity, for example, `query` is the sequence embeddings of + the first piece of text and `value` is the sequence embeddings of the second + piece of text. `key` is usually the same tensor as `value`. + + Here is a code example for using `Attention` in a CNN+Attention network: + + ```python + # Variable-length int sequences. + query_input = tf.keras.Input(shape=(None,), dtype='int32') + value_input = tf.keras.Input(shape=(None,), dtype='int32') + + # Embedding lookup. + token_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=64) + # Query embeddings of shape [batch_size, Tq, dimension]. + query_embeddings = token_embedding(query_input) + # Value embeddings of shape [batch_size, Tv, dimension]. + value_embeddings = token_embedding(value_input) + + # CNN layer. + cnn_layer = tf.keras.layers.Conv1D( + filters=100, + kernel_size=4, + # Use 'same' padding so outputs have the same shape as inputs. + padding='same') + # Query encoding of shape [batch_size, Tq, filters]. + query_seq_encoding = cnn_layer(query_embeddings) + # Value encoding of shape [batch_size, Tv, filters]. + value_seq_encoding = cnn_layer(value_embeddings) + + # Query-value attention of shape [batch_size, Tq, filters]. + query_value_attention_seq = tf.keras.layers.Attention()( + [query_seq_encoding, value_seq_encoding]) + + # Reduce over the sequence axis to produce encodings of shape + # [batch_size, filters]. + query_encoding = tf.keras.layers.GlobalAveragePooling1D()( + query_seq_encoding) + query_value_attention = tf.keras.layers.GlobalAveragePooling1D()( + query_value_attention_seq) + + # Concatenate query and document encodings to produce a DNN input layer. 
+ input_layer = tf.keras.layers.Concatenate()( + [query_encoding, query_value_attention]) + + # Add DNN layers, and create Model. + # ... + ``` """ - if self.score_mode == 'dot': - scores = tf.matmul(query, key, transpose_b=True) - if self.scale is not None: - scores *= self.scale - elif self.score_mode == 'concat': - # Reshape tensors to enable broadcasting. - # Reshape into [batch_size, Tq, 1, dim]. - q_reshaped = tf.expand_dims(query, axis=-2) - # Reshape into [batch_size, 1, Tv, dim]. - k_reshaped = tf.expand_dims(key, axis=-3) - if self.scale is not None: - scores = self.concat_score_weight * tf.reduce_sum( - tf.tanh(self.scale * (q_reshaped + k_reshaped)), axis=-1) - else: - scores = self.concat_score_weight * tf.reduce_sum( - tf.tanh(q_reshaped + k_reshaped), axis=-1) - - return scores - - def get_config(self): - config = {'use_scale': self.use_scale, 'score_mode': self.score_mode} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + + def __init__(self, use_scale=False, score_mode="dot", **kwargs): + super().__init__(**kwargs) + self.use_scale = use_scale + self.score_mode = score_mode + if self.score_mode not in ["dot", "concat"]: + raise ValueError( + f"Received: score_mode={score_mode}. Acceptable values " + 'are: ["dot", "concat"]' + ) + + def build(self, input_shape): + """Creates variable when `use_scale` is True or `score_mode` is + `concat`.""" + if self.use_scale: + self.scale = self.add_weight( + name="scale", + shape=(), + initializer="ones", + dtype=self.dtype, + trainable=True, + ) + else: + self.scale = None + if self.score_mode == "concat": + self.concat_score_weight = self.add_weight( + name="concat_score_weight", + shape=(), + initializer="ones", + dtype=self.dtype, + trainable=True, + ) + else: + self.concat_score_weight = None + super().build(input_shape) + + def _calculate_scores(self, query, key): + """Calculates attention scores as a query-key dot product. + + Args: + query: Query tensor of shape `[batch_size, Tq, dim]`. + key: Key tensor of shape `[batch_size, Tv, dim]`. + Returns: + Tensor of shape `[batch_size, Tq, Tv]`. + """ + if self.score_mode == "dot": + scores = tf.matmul(query, key, transpose_b=True) + if self.scale is not None: + scores *= self.scale + elif self.score_mode == "concat": + # Reshape tensors to enable broadcasting. + # Reshape into [batch_size, Tq, 1, dim]. + q_reshaped = tf.expand_dims(query, axis=-2) + # Reshape into [batch_size, 1, Tv, dim]. 
+ k_reshaped = tf.expand_dims(key, axis=-3) + if self.scale is not None: + scores = self.concat_score_weight * tf.reduce_sum( + tf.tanh(self.scale * (q_reshaped + k_reshaped)), axis=-1 + ) + else: + scores = self.concat_score_weight * tf.reduce_sum( + tf.tanh(q_reshaped + k_reshaped), axis=-1 + ) + + return scores + + def get_config(self): + config = {"use_scale": self.use_scale, "score_mode": self.score_mode} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/attention/attention_test.py b/keras/layers/attention/attention_test.py index 1ddc288316b7..43debfb26551 100644 --- a/keras/layers/attention/attention_test.py +++ b/keras/layers/attention/attention_test.py @@ -14,442 +14,572 @@ # ============================================================================== """Tests Attention layer.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers import core from keras.testing_infra import test_combinations -import numpy as np -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class AttentionTest(tf.test.TestCase, parameterized.TestCase): - - def test_calculate_scores_one_dim(self): - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Key tensor of shape [1, 1, 1] - k = np.array([[[1.6]]], dtype=np.float32) - attention_layer = keras.layers.Attention() - attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) - actual = attention_layer._calculate_scores(query=q, key=k) - - # Expected tensor of shape [1, 1, 1]. - # expected000 = 1.1*1.6 = 1.76 - expected = np.array([[[1.76]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_calculate_scores_multi_dim(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Key tensor of shape [1, 3, 4] - k = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - attention_layer = keras.layers.Attention() - attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4])) - actual = attention_layer._calculate_scores(query=q, key=k) - - # Expected tensor of shape [1, 2, 3]. 
- # expected000 = 1.*1.5+1.1*1.6+1.2*1.7+1.3*1.8 = 7.64 - # expected001 = 1.*2.5+1.1*2.6+1.2*2.7+1.3*2.8 = 12.24 - # expected002 = 1.*3.5+1.1*3.6+1.2*3.7+1.3*3.8 = 16.84 - # expected010 = 2.*1.5+2.1*1.6+2.2*1.7+2.3*1.8 = 14.24 - # expected011 = 2.*2.5+2.1*2.6+2.2*2.7+2.3*2.8 = 22.84 - # expected012 = 2.*3.5+2.1*3.6+2.2*3.7+2.3*3.8 = 31.44 - expected = np.array([[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], - dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_calculate_scores_multi_dim_concat(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Key tensor of shape [1, 3, 4] - k = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - attention_layer = keras.layers.Attention(score_mode='concat') - attention_layer.concat_score_weight = 1 - attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4])) - actual = keras.backend.get_value( - attention_layer._calculate_scores(query=q, key=k)) - - # pylint:disable=line-too-long - # expected000 = tanh(1.+1.5) + tanh(1.1+1.6) + tanh(1.2+1.7) + tanh(1.3+1.8) = 3.96753427840 - # expected001 = tanh(1.+2.5) + tanh(1.1+2.6) + tanh(1.2+2.7) + tanh(1.3+2.8) = 3.99558784825 - # expected002 = tanh(1.+3.5) + tanh(1.1+3.6) + tanh(1.2+3.7) + tanh(1.3+3.8) = 3.99940254147 - # expected010 = tanh(2.+1.5) + tanh(2.1+1.6) + tanh(2.2+1.7) + tanh(2.3+1.8) = 3.99558784825 - # expected011 = tanh(2.+2.5) + tanh(2.1+2.6) + tanh(2.2+2.7) + tanh(2.3+2.8) = 3.99940254147 - # expected012 = tanh(2.+3.5) + tanh(2.1+3.6) + tanh(2.2+3.7) + tanh(2.3+3.8) = 3.99991913657 - expected = np.array([[[3.96753427840, 3.99558784825, 3.99940254147], - [3.99558784825, 3.99940254147, 3.99991913657]]], - dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_calculate_scores_one_dim_batch_size_two(self): - # Query tensor of shape [2, 1, 1] - q = np.array([[[1.1]], [[2.1]]], dtype=np.float32) - # Key tensor of shape [2, 1, 1] - k = np.array([[[1.6]], [[2.6]]], dtype=np.float32) - attention_layer = keras.layers.Attention() - attention_layer.build(input_shape=([2, 1, 1], [2, 1, 1])) - actual = attention_layer._calculate_scores(query=q, key=k) - - # Expected tensor of shape [2, 1, 1]. - # expected000 = 1.1*1.6 = 1.76 - # expected100 = 2.1*2.6 = 5.46 - expected = np.array([[[1.76]], [[5.46]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_calculate_scores_one_dim_with_scale(self): - """Tests that scores are multiplied by scale.""" - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Key tensor of shape [1, 1, 1] - k = np.array([[[1.6]]], dtype=np.float32) - attention_layer = keras.layers.Attention(use_scale=True) - attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) - attention_layer.scale = -2. - actual = attention_layer._calculate_scores(query=q, key=k) - - # Expected tensor of shape [1, 1, 1]. 
- # expected000 = -2*1.1*1.6 = -3.52 - expected = np.array([[[-3.52]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_calculate_scores_one_dim_with_scale_concat(self): - """Tests that scores are multiplied by scale.""" - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Key tensor of shape [1, 1, 1] - k = np.array([[[1.6]]], dtype=np.float32) - attention_layer = keras.layers.Attention( - use_scale=True, score_mode='concat') - attention_layer.concat_score_weight = 1 - attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) - attention_layer.scale = 2. - actual = keras.backend.get_value( - attention_layer._calculate_scores(query=q, key=k)) - - # Expected tensor of shape [1, 1, 1]. - # expected000 = tanh(2*(1.1+1.6)) = 0.9999592018254402 - expected = np.array([[[0.999959202]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_shape(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Value tensor of shape [1, 3, 4] - v = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.Attention() - actual = attention_layer([q, v], mask=[None, v_mask]) - - expected_shape = [1, 2, 4] - self.assertAllEqual(expected_shape, tf.shape(actual)) - - def test_shape_concat(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Value tensor of shape [1, 3, 4] - v = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.Attention(score_mode='concat') - attention_layer.concat_score_weight = 1 - actual = attention_layer([q, v], mask=[None, v_mask]) - - expected_shape = [1, 2, 4] - self.assertAllEqual(expected_shape, tf.shape(actual)) - - def test_shape_with_key(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Value tensor of shape [1, 3, 4] - v = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Key tensor of shape [1, 3, 4] - k = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.Attention() - actual = attention_layer([q, v, k], mask=[None, v_mask]) - - expected_shape = [1, 2, 4] - self.assertAllEqual(expected_shape, tf.shape(actual)) - - def test_shape_with_key_concat(self): - # Query tensor of shape [1, 2, 4] - q = np.array([[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32) - # Value tensor of shape [1, 3, 4] - v = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Key tensor of shape [1, 3, 4] - k = np.array( - [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]], - dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.Attention(score_mode='concat') - attention_layer.concat_score_weight = 1 - actual = attention_layer([q, v, k], mask=[None, v_mask]) - - 
expected_shape = [1, 2, 4] - self.assertAllEqual(expected_shape, tf.shape(actual)) - - def test_multi_dim(self): - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.Attention() - actual = attention_layer([q, v], mask=[None, v_mask]) - - # Expected scores of shape [1, 1, 3] - # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]] - # Expected attention distribution = softmax(scores) with zeros in - # positions where v_mask == False. - # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77)) - # = 0.72908792234 - # attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77)) - # = 0.27091207765 - # attention_distribution002 = 0 - # - # Expected tensor of shape [1, 1, 1]. - # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8 - # = 1.3561791301 - expected = np.array([[[1.3561791301]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_multi_dim_with_key(self): - # Query tensor of shape [1, 1, 1] - q = np.array([[[1.1]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32) - # Key tensor of shape [1, 3, 1] - k = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.Attention() - actual = attention_layer([q, v, k], mask=[None, v_mask]) - - # Expected scores of shape [1, 1, 3] - # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]] - # Expected attention distribution = softmax(scores) with zeros in - # positions where v_mask == False. - # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77)) - # = 0.72908792234 - # attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77)) - # = 0.27091207765 - # attention_distribution002 = 0 - # - # Expected tensor of shape [1, 1, 1]. - # expected000 = 0.72908792234 * 0.5 + 0.27091207765 * 0.8 - 0 * 0.3 - # = 0.58127362329 - expected = np.array([[[0.58127362329]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - @parameterized.named_parameters( - ('', False), - ('return_attention_scores', True), - ) - def test_multi_dim_with_query_mask(self, return_attention_scores): - # Query tensor of shape [1, 2, 1] - q = np.array([[[1.1], [-0.5]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - # Query mask tensor of shape [1, 2] - q_mask = np.array([[True, False]], dtype=np.bool_) - # Value mask tensor of shape [1, 3] - v_mask = np.array([[True, True, False]], dtype=np.bool_) - attention_layer = keras.layers.Attention() - if return_attention_scores: - actual, actual_scores = attention_layer( - [q, v], - mask=[q_mask, v_mask], - return_attention_scores=return_attention_scores) - else: - actual = attention_layer([q, v], - mask=[q_mask, v_mask], - return_attention_scores=return_attention_scores) - - # Expected scores of shape [1, 2, 3] - # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8], [-0.5*1.6, -0.5*0.7, 0.5*0.8]]] - # = [[[1.76, 0.77, -0.88], [-0.8, -0.35, 0.4]]] - # Expected attention distribution = softmax(scores) with zeros in - # positions where v_mask == False. 
- # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77)) - # = 0.72908792234 - # attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77)) - # = 0.27091207765 - # attention_distribution002 = 0 - # => attention_distribution010 = exp(-0.8)/(exp(-0.8) + exp(-0.35)) - # = 0.38936076605 - # attention_distribution011 = exp(-0.35)/(exp(-0.8) + exp(-0.35)) - # = 0.61063923394 - # attention_distribution012 = 0 - if return_attention_scores: - expected_scores = np.array([[[0.72908792234, 0.27091207765, 0.], - [0.38936076605, 0.61063923394, 0.]]], - dtype=np.float32) - self.assertAllClose(expected_scores, actual_scores) - # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False. - # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8 - # = 1.3561791301 - # expected000 = 0 - expected = np.array([[[1.3561791301], [0.]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_scale_none(self): - """Tests that scale is None by default.""" - attention_layer = keras.layers.Attention() - attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) - self.assertIsNone(attention_layer.scale) - - def test_scale_init_eager(self): - """Tests that scale initializes to 1 when use_scale=True.""" - if not tf.executing_eagerly(): - self.skipTest('Only run in eager mode') - attention_layer = keras.layers.Attention(use_scale=True) - attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) - self.assertAllClose(1., attention_layer.scale.value()) - - def test_scale_init_graph(self): - """Tests that scale initializes to 1 when use_scale=True.""" - with self.cached_session() as sess: - attention_layer = keras.layers.Attention(use_scale=True) - attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) - sess.run(attention_layer.scale.initializer) - self.assertAllClose(1., attention_layer.scale.value()) - - @parameterized.named_parameters( - ('', False), - ('return_attention_scores', True), - ) - def test_self_attention_causal(self, return_attention_scores): - # Query-value tensor of shape [1, 3, 1] - q = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32) - attention_layer = keras.layers.Attention(causal=True) - if return_attention_scores: - actual, actual_scores = attention_layer( - [q, q], return_attention_scores=return_attention_scores) - else: - actual = attention_layer([q, q], - return_attention_scores=return_attention_scores) - - # Expected scores of shape [1, 3, 3] - # scores = [[0.25, 0.4, -0.15], [0.4, 0.64, -0.24], [-0.15, -0.24, 0.09]] - # Expected attention distribution = softmax(scores) lower triangular - # => attention_distribution00 = [1., 0., 0.] - # attention_distribution01 - # = [exp(0.4), exp(0.64), 0.] / (exp(0.4) + exp(0.64)) - # = [0.44028635073, 0.55971364926, 0.] - # attention_distribution02 - # = [exp(-0.15), exp(-0.24), exp(0.09)] - # / (exp(-0.15) + exp(-0.24) + exp(0.09)) - # = [0.31395396638, 0.28693232061, 0.399113713] - if return_attention_scores: - expected_scores = np.array( - [[[1., 0., 0.], [0.44028635073, 0.55971364926, 0.], - [0.31395396638, 0.28693232061, 0.399113713]]], - dtype=np.float32) - self.assertAllClose(expected_scores, actual_scores) - # Expected tensor of shape [1, 3, 1]. 
- # expected000 = 0.5 - # expected010 = 0.44028635073 * 0.5 + 0.55971364926 * 0.8 - # = 0.66791409477 - # expected020 = 0.31395396638 * 0.5 +0.28693232061 * 0.8 -0.399113713 * 0.3 - # = 0.26678872577 - expected = np.array([[[0.5], [0.66791409477], [0.26678872577]]], - dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_inputs_not_list(self): - attention_layer = keras.layers.Attention() - q = np.array([[[1.1]]], dtype=np.float32) - with self.assertRaisesRegex( - ValueError, 'Attention layer must be called on a list of inputs'): - attention_layer(q) - - def test_inputs_too_short(self): - attention_layer = keras.layers.Attention() - q = np.array([[[1.1]]], dtype=np.float32) - with self.assertRaisesRegex( - ValueError, 'Attention layer accepts inputs list of length 2 or 3'): - attention_layer([q]) - - def test_inputs_too_long(self): - attention_layer = keras.layers.Attention() - q = np.array([[[1.1]]], dtype=np.float32) - with self.assertRaisesRegex( - ValueError, 'Attention layer accepts inputs list of length 2 or 3'): - attention_layer([q, q, q, q]) - - def test_mask_not_list(self): - attention_layer = keras.layers.Attention() - q = np.array([[[1.1]]], dtype=np.float32) - mask = np.array([[True]], dtype=np.bool_) - with self.assertRaisesRegex(ValueError, - 'Attention layer mask must be a list'): - attention_layer([q, q], mask=mask) - - def test_mask_too_short(self): - attention_layer = keras.layers.Attention() - q = np.array([[[1.1]]], dtype=np.float32) - mask = np.array([[True]], dtype=np.bool_) - with self.assertRaisesRegex( - ValueError, 'Attention layer mask must be a list of length 2'): - attention_layer([q, q], mask=[mask]) - - def test_mask_too_long(self): - attention_layer = keras.layers.Attention() - q = np.array([[[1.1]]], dtype=np.float32) - mask = np.array([[True]], dtype=np.bool_) - with self.assertRaisesRegex( - ValueError, 'Attention layer mask must be a list of length 2'): - attention_layer([q, q], mask=[mask, mask, mask]) - - def test_override_mask(self): - attention_layer = keras.layers.Attention() - q = core.Masking()(np.array([[[1.1]]], dtype=np.float32)) - mask = np.array([[False]], dtype=np.bool_) - actual = attention_layer([q, q], mask=[mask, mask]) - self.assertAllClose([[[0]]], actual) - - def test_implicit_mask(self): - attention_layer = keras.layers.Attention() - q = core.Masking(1.1)(np.array([[[1.1], [1]]], dtype=np.float32)) - v = core.Masking(1.2)(np.array([[[1.2], [1]]], dtype=np.float32)) - actual = attention_layer([q, v]) - self.assertAllClose([[[0], [1]]], actual) - - @parameterized.named_parameters( - ('', False), - ('use_scale', True), - ) - def test_serialization(self, use_scale): - # Test serialization with use_scale - layer = keras.layers.Attention(use_scale=use_scale) - - config = keras.layers.serialize(layer) - new_layer = keras.layers.deserialize(config) - self.assertEqual(new_layer.use_scale, use_scale) - - config = layer.get_config() - new_layer = keras.layers.Attention.from_config(config) - self.assertEqual(new_layer.use_scale, use_scale) - - -if __name__ == '__main__': - tf.test.main() + def test_calculate_scores_one_dim(self): + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Key tensor of shape [1, 1, 1] + k = np.array([[[1.6]]], dtype=np.float32) + attention_layer = keras.layers.Attention() + attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) + actual = attention_layer._calculate_scores(query=q, key=k) + + # Expected tensor of shape [1, 1, 1]. 
+ # expected000 = 1.1*1.6 = 1.76 + expected = np.array([[[1.76]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + def test_calculate_scores_multi_dim(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Key tensor of shape [1, 3, 4] + k = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + attention_layer = keras.layers.Attention() + attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4])) + actual = attention_layer._calculate_scores(query=q, key=k) + + # Expected tensor of shape [1, 2, 3]. + # expected000 = 1.*1.5+1.1*1.6+1.2*1.7+1.3*1.8 = 7.64 + # expected001 = 1.*2.5+1.1*2.6+1.2*2.7+1.3*2.8 = 12.24 + # expected002 = 1.*3.5+1.1*3.6+1.2*3.7+1.3*3.8 = 16.84 + # expected010 = 2.*1.5+2.1*1.6+2.2*1.7+2.3*1.8 = 14.24 + # expected011 = 2.*2.5+2.1*2.6+2.2*2.7+2.3*2.8 = 22.84 + # expected012 = 2.*3.5+2.1*3.6+2.2*3.7+2.3*3.8 = 31.44 + expected = np.array( + [[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], dtype=np.float32 + ) + self.assertAllClose(expected, actual) + + def test_calculate_scores_multi_dim_concat(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Key tensor of shape [1, 3, 4] + k = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + attention_layer = keras.layers.Attention(score_mode="concat") + attention_layer.concat_score_weight = 1 + attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4])) + actual = keras.backend.get_value( + attention_layer._calculate_scores(query=q, key=k) + ) + + # expected000 = tanh(1.+1.5) + tanh(1.1+1.6) + \ + # tanh(1.2+1.7) + tanh(1.3+1.8) = 3.96753427840 + # expected001 = tanh(1.+2.5) + tanh(1.1+2.6) + \ + # tanh(1.2+2.7) + tanh(1.3+2.8) = 3.99558784825 + # expected002 = tanh(1.+3.5) + tanh(1.1+3.6) + \ + # tanh(1.2+3.7) + tanh(1.3+3.8) = 3.99940254147 + # expected010 = tanh(2.+1.5) + tanh(2.1+1.6) + \ + # tanh(2.2+1.7) + tanh(2.3+1.8) = 3.99558784825 + # expected011 = tanh(2.+2.5) + tanh(2.1+2.6) + \ + # tanh(2.2+2.7) + tanh(2.3+2.8) = 3.99940254147 + # expected012 = tanh(2.+3.5) + tanh(2.1+3.6) + \ + # tanh(2.2+3.7) + tanh(2.3+3.8) = 3.99991913657 + expected = np.array( + [ + [ + [3.96753427840, 3.99558784825, 3.99940254147], + [3.99558784825, 3.99940254147, 3.99991913657], + ] + ], + dtype=np.float32, + ) + self.assertAllClose(expected, actual) + + def test_calculate_scores_one_dim_batch_size_two(self): + # Query tensor of shape [2, 1, 1] + q = np.array([[[1.1]], [[2.1]]], dtype=np.float32) + # Key tensor of shape [2, 1, 1] + k = np.array([[[1.6]], [[2.6]]], dtype=np.float32) + attention_layer = keras.layers.Attention() + attention_layer.build(input_shape=([2, 1, 1], [2, 1, 1])) + actual = attention_layer._calculate_scores(query=q, key=k) + + # Expected tensor of shape [2, 1, 1]. 
+ # expected000 = 1.1*1.6 = 1.76 + # expected100 = 2.1*2.6 = 5.46 + expected = np.array([[[1.76]], [[5.46]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + def test_calculate_scores_one_dim_with_scale(self): + """Tests that scores are multiplied by scale.""" + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Key tensor of shape [1, 1, 1] + k = np.array([[[1.6]]], dtype=np.float32) + attention_layer = keras.layers.Attention(use_scale=True) + attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) + attention_layer.scale = -2.0 + actual = attention_layer._calculate_scores(query=q, key=k) + + # Expected tensor of shape [1, 1, 1]. + # expected000 = -2*1.1*1.6 = -3.52 + expected = np.array([[[-3.52]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + def test_calculate_scores_one_dim_with_scale_concat(self): + """Tests that scores are multiplied by scale.""" + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Key tensor of shape [1, 1, 1] + k = np.array([[[1.6]]], dtype=np.float32) + attention_layer = keras.layers.Attention( + use_scale=True, score_mode="concat" + ) + attention_layer.concat_score_weight = 1 + attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) + attention_layer.scale = 2.0 + actual = keras.backend.get_value( + attention_layer._calculate_scores(query=q, key=k) + ) + + # Expected tensor of shape [1, 1, 1]. + # expected000 = tanh(2*(1.1+1.6)) = 0.9999592018254402 + expected = np.array([[[0.999959202]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + def test_shape(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Value tensor of shape [1, 3, 4] + v = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.Attention() + actual = attention_layer([q, v], mask=[None, v_mask]) + + expected_shape = [1, 2, 4] + self.assertAllEqual(expected_shape, tf.shape(actual)) + + def test_shape_concat(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Value tensor of shape [1, 3, 4] + v = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.Attention(score_mode="concat") + attention_layer.concat_score_weight = 1 + actual = attention_layer([q, v], mask=[None, v_mask]) + + expected_shape = [1, 2, 4] + self.assertAllEqual(expected_shape, tf.shape(actual)) + + def test_shape_with_key(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Value tensor of shape [1, 3, 4] + v = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Key tensor of shape [1, 3, 4] + k = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.Attention() + actual = attention_layer([q, v, k], mask=[None, v_mask]) + + 
expected_shape = [1, 2, 4] + self.assertAllEqual(expected_shape, tf.shape(actual)) + + def test_shape_with_key_concat(self): + # Query tensor of shape [1, 2, 4] + q = np.array( + [[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]], dtype=np.float32 + ) + # Value tensor of shape [1, 3, 4] + v = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Key tensor of shape [1, 3, 4] + k = np.array( + [ + [ + [1.5, 1.6, 1.7, 1.8], + [2.5, 2.6, 2.7, 2.8], + [3.5, 3.6, 3.7, 3.8], + ] + ], + dtype=np.float32, + ) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.Attention(score_mode="concat") + attention_layer.concat_score_weight = 1 + actual = attention_layer([q, v, k], mask=[None, v_mask]) + + expected_shape = [1, 2, 4] + self.assertAllEqual(expected_shape, tf.shape(actual)) + + def test_multi_dim(self): + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Value tensor of shape [1, 3, 1] + v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.Attention() + actual = attention_layer([q, v], mask=[None, v_mask]) + + # Expected scores of shape [1, 1, 3] + # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]] + # Expected attention distribution = softmax(scores) with zeros in + # positions where v_mask == False. + # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77)) + # = 0.72908792234 + # attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77)) + # = 0.27091207765 + # attention_distribution002 = 0 + # + # Expected tensor of shape [1, 1, 1]. + # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8 + # = 1.3561791301 + expected = np.array([[[1.3561791301]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + def test_multi_dim_with_key(self): + # Query tensor of shape [1, 1, 1] + q = np.array([[[1.1]]], dtype=np.float32) + # Value tensor of shape [1, 3, 1] + v = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32) + # Key tensor of shape [1, 3, 1] + k = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.Attention() + actual = attention_layer([q, v, k], mask=[None, v_mask]) + + # Expected scores of shape [1, 1, 3] + # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]] + # Expected attention distribution = softmax(scores) with zeros in + # positions where v_mask == False. + # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77)) + # = 0.72908792234 + # attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77)) + # = 0.27091207765 + # attention_distribution002 = 0 + # + # Expected tensor of shape [1, 1, 1]. 
+ # expected000 = 0.72908792234 * 0.5 + 0.27091207765 * 0.8 - 0 * 0.3 + # = 0.58127362329 + expected = np.array([[[0.58127362329]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + @parameterized.named_parameters( + ("", False), + ("return_attention_scores", True), + ) + def test_multi_dim_with_query_mask(self, return_attention_scores): + # Query tensor of shape [1, 2, 1] + q = np.array([[[1.1], [-0.5]]], dtype=np.float32) + # Value tensor of shape [1, 3, 1] + v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) + # Query mask tensor of shape [1, 2] + q_mask = np.array([[True, False]], dtype=np.bool_) + # Value mask tensor of shape [1, 3] + v_mask = np.array([[True, True, False]], dtype=np.bool_) + attention_layer = keras.layers.Attention() + if return_attention_scores: + actual, actual_scores = attention_layer( + [q, v], + mask=[q_mask, v_mask], + return_attention_scores=return_attention_scores, + ) + else: + actual = attention_layer( + [q, v], + mask=[q_mask, v_mask], + return_attention_scores=return_attention_scores, + ) + + # Expected scores of shape [1, 2, 3] + # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8], + # [-0.5*1.6, -0.5*0.7, 0.5*0.8]]] + # = [[[1.76, 0.77, -0.88], [-0.8, -0.35, 0.4]]] + # Expected attention distribution = softmax(scores) with zeros in + # positions where v_mask == False. + # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77)) + # = 0.72908792234 + # attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77)) + # = 0.27091207765 + # attention_distribution002 = 0 + # => attention_distribution010 = exp(-0.8)/(exp(-0.8) + exp(-0.35)) + # = 0.38936076605 + # attention_distribution011 = exp(-0.35)/(exp(-0.8) + exp(-0.35)) + # = 0.61063923394 + # attention_distribution012 = 0 + if return_attention_scores: + expected_scores = np.array( + [ + [ + [0.72908792234, 0.27091207765, 0.0], + [0.38936076605, 0.61063923394, 0.0], + ] + ], + dtype=np.float32, + ) + self.assertAllClose(expected_scores, actual_scores) + # Expected tensor of shape [1, 2, 1] with zeros where q_mask == False. 
+ # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8 + # = 1.3561791301 + # expected010 = 0 + expected = np.array([[[1.3561791301], [0.0]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + def test_scale_none(self): + """Tests that scale is None by default.""" + attention_layer = keras.layers.Attention() + attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) + self.assertIsNone(attention_layer.scale) + + def test_scale_init_eager(self): + """Tests that scale initializes to 1 when use_scale=True.""" + if not tf.executing_eagerly(): + self.skipTest("Only run in eager mode") + attention_layer = keras.layers.Attention(use_scale=True) + attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) + self.assertAllClose(1.0, attention_layer.scale.value()) + + def test_scale_init_graph(self): + """Tests that scale initializes to 1 when use_scale=True.""" + with self.cached_session() as sess: + attention_layer = keras.layers.Attention(use_scale=True) + attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1])) + sess.run(attention_layer.scale.initializer) + self.assertAllClose(1.0, attention_layer.scale.value()) + + @parameterized.named_parameters( + ("", False), + ("return_attention_scores", True), + ) + def test_self_attention_causal(self, return_attention_scores): + # Query-value tensor of shape [1, 3, 1] + q = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32) + attention_layer = keras.layers.Attention() + if return_attention_scores: + actual, actual_scores = attention_layer( + [q, q], + return_attention_scores=return_attention_scores, + use_causal_mask=True, + ) + else: + actual = attention_layer( + [q, q], + return_attention_scores=return_attention_scores, + use_causal_mask=True, + ) + + # Expected scores of shape [1, 3, 3] + # scores = [[0.25, 0.4, -0.15], + # [0.4, 0.64, -0.24], + # [-0.15, -0.24, 0.09]] + # Expected attention distribution = softmax(scores) lower triangular + # => attention_distribution00 = [1., 0., 0.] + # attention_distribution01 + # = [exp(0.4), exp(0.64), 0.] / (exp(0.4) + exp(0.64)) + # = [0.44028635073, 0.55971364926, 0.] + # attention_distribution02 + # = [exp(-0.15), exp(-0.24), exp(0.09)] + # / (exp(-0.15) + exp(-0.24) + exp(0.09)) + # = [0.31395396638, 0.28693232061, 0.399113713] + if return_attention_scores: + expected_scores = np.array( + [ + [ + [1.0, 0.0, 0.0], + [0.44028635073, 0.55971364926, 0.0], + [0.31395396638, 0.28693232061, 0.399113713], + ] + ], + dtype=np.float32, + ) + self.assertAllClose(expected_scores, actual_scores) + # Expected tensor of shape [1, 3, 1]. 
+ # expected000 = 0.5 + # expected010 = 0.44028635073 * 0.5 + 0.55971364926 * 0.8 + # = 0.66791409477 + # expected020 = 0.31395396638 * 0.5 + \ + # 0.28693232061 * 0.8 -0.399113713 * 0.3 + # = 0.26678872577 + expected = np.array( + [[[0.5], [0.66791409477], [0.26678872577]]], dtype=np.float32 + ) + self.assertAllClose(expected, actual) + + def test_self_attention_causal_deprecated(self): + """Verify deprecated specification of causal masking still works.""" + # Query-value tensor of shape [1, 3, 1] + q = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32) + attention_layer_new = keras.layers.Attention() + new_scores = attention_layer_new( + [q, q], + use_causal_mask=True, + ) + attention_layer_old = keras.layers.Attention(causal=True) + old_scores = attention_layer_old( + [q, q], + ) + self.assertAllClose(new_scores, old_scores) + + def test_inputs_not_list(self): + attention_layer = keras.layers.Attention() + q = np.array([[[1.1]]], dtype=np.float32) + with self.assertRaisesRegex( + ValueError, "Attention layer must be called on a list of inputs" + ): + attention_layer(q) + + def test_inputs_too_short(self): + attention_layer = keras.layers.Attention() + q = np.array([[[1.1]]], dtype=np.float32) + with self.assertRaisesRegex( + ValueError, "Attention layer accepts inputs list of length 2 or 3" + ): + attention_layer([q]) + + def test_inputs_too_long(self): + attention_layer = keras.layers.Attention() + q = np.array([[[1.1]]], dtype=np.float32) + with self.assertRaisesRegex( + ValueError, "Attention layer accepts inputs list of length 2 or 3" + ): + attention_layer([q, q, q, q]) + + def test_mask_not_list(self): + attention_layer = keras.layers.Attention() + q = np.array([[[1.1]]], dtype=np.float32) + mask = np.array([[True]], dtype=np.bool_) + with self.assertRaisesRegex( + ValueError, "Attention layer mask must be a list" + ): + attention_layer([q, q], mask=mask) + + def test_mask_too_short(self): + attention_layer = keras.layers.Attention() + q = np.array([[[1.1]]], dtype=np.float32) + mask = np.array([[True]], dtype=np.bool_) + with self.assertRaisesRegex( + ValueError, "Attention layer mask must be a list of length 2" + ): + attention_layer([q, q], mask=[mask]) + + def test_mask_too_long(self): + attention_layer = keras.layers.Attention() + q = np.array([[[1.1]]], dtype=np.float32) + mask = np.array([[True]], dtype=np.bool_) + with self.assertRaisesRegex( + ValueError, "Attention layer mask must be a list of length 2" + ): + attention_layer([q, q], mask=[mask, mask, mask]) + + def test_override_mask(self): + attention_layer = keras.layers.Attention() + q = core.Masking()(np.array([[[1.1]]], dtype=np.float32)) + mask = np.array([[False]], dtype=np.bool_) + actual = attention_layer([q, q], mask=[mask, mask]) + self.assertAllClose([[[0]]], actual) + + def test_implicit_mask(self): + attention_layer = keras.layers.Attention() + q = core.Masking(1.1)(np.array([[[1.1], [1]]], dtype=np.float32)) + v = core.Masking(1.2)(np.array([[[1.2], [1]]], dtype=np.float32)) + actual = attention_layer([q, v]) + self.assertAllClose([[[0], [1]]], actual) + + @parameterized.named_parameters( + ("", False), + ("use_scale", True), + ) + def test_serialization(self, use_scale): + # Test serialization with use_scale + layer = keras.layers.Attention(use_scale=use_scale) + + config = keras.layers.serialize(layer) + new_layer = keras.layers.deserialize(config) + self.assertEqual(new_layer.use_scale, use_scale) + + config = layer.get_config() + new_layer = keras.layers.Attention.from_config(config) + 
self.assertEqual(new_layer.use_scale, use_scale) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/attention/base_dense_attention.py b/keras/layers/attention/base_dense_attention.py index 13d48b6a5157..c51907465fd0 100644 --- a/keras/layers/attention/base_dense_attention.py +++ b/keras/layers/attention/base_dense_attention.py @@ -17,217 +17,246 @@ This file follows the terminology of https://arxiv.org/abs/1706.03762 Figure 2. Attention is formed by three tensors: Query, Key and Value. """ -# pylint: disable=g-classes-have-attributes + +import tensorflow.compat.v2 as tf +from absl import logging from keras import backend from keras.engine import base_layer from keras.utils import control_flow_util -import tensorflow.compat.v2 as tf +# isort: off +from tensorflow.python.util.tf_export import keras_export + +@keras_export("keras.__internal__.layers.BaseDenseAttention", v1=[]) class BaseDenseAttention(base_layer.BaseRandomLayer): - """Base Attention class for Dense networks. - - This class is suitable for Dense or CNN networks, and not for RNN networks. - - Implementations of attention mechanisms should inherit from this class, and - reuse the `apply_attention_scores()` method. - - Args: - causal: Boolean. Set to `True` for decoder self-attention. Adds a mask such - that position `i` cannot attend to positions `j > i`. This prevents the - flow of information from the future towards the past. - dropout: Float between 0 and 1. Fraction of the units to drop for the - attention scores. - - Call Args: - - inputs: List of the following tensors: - * query: Query `Tensor` of shape `[batch_size, Tq, dim]`. - * value: Value `Tensor` of shape `[batch_size, Tv, dim]`. - * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not - given, will use `value` for both `key` and `value`, which is the - most common case. - mask: List of the following tensors: - * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`. - If given, the output will be zero at the positions where - `mask==False`. - * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`. - If given, will apply the mask such that values at positions where - `mask==False` do not contribute to the result. - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (no dropout). - return_attention_scores: bool, if `True`, returns the attention scores - (after masking and softmax) as an additional output argument. - - Output: - - Attention outputs of shape `[batch_size, Tq, dim]`. - [Optional] Attention scores after masking and softmax with shape - `[batch_size, Tq, Tv]`. - """ - - def __init__(self, causal=False, dropout=0.0, **kwargs): - super().__init__(**kwargs) - self.causal = causal - self.dropout = dropout - self.supports_masking = True - - def _calculate_scores(self, query, key): - """Calculates attention scores. + """Base Attention class for Dense networks. + + This class is suitable for Dense or CNN networks, and not for RNN networks. + + Implementations of attention mechanisms should inherit from this class, and + reuse the `apply_attention_scores()` method. Args: - query: Query tensor of shape `[batch_size, Tq, dim]`. - key: Key tensor of shape `[batch_size, Tv, dim]`. + dropout: Float between 0 and 1. Fraction of the units to drop for the + attention scores. - Returns: - Tensor of shape `[batch_size, Tq, Tv]`. 
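The `base_dense_attention.py` hunk below moves causal masking from a constructor flag to a call-time flag. A minimal usage sketch of that migration (not part of this change; it mirrors `test_self_attention_causal_deprecated` above, with illustrative values):

```python
import numpy as np
import keras

q = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)

# Deprecated spelling: still works, but now only logs a warning.
out_old = keras.layers.Attention(causal=True)([q, q])
# Preferred spelling after this change: a call-time argument.
out_new = keras.layers.Attention()([q, q], use_causal_mask=True)

# Both apply the same lower-triangular mask to the scores.
np.testing.assert_allclose(out_old, out_new)
```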
+ Call arguments: + inputs: List of the following tensors: + * query: Query `Tensor` of shape `[batch_size, Tq, dim]`. + * value: Value `Tensor` of shape `[batch_size, Tv, dim]`. + * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If + not given, will use `value` for both `key` and `value`, which is + the most common case. + mask: List of the following tensors: + * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`. + If given, the output will be zero at the positions where + `mask==False`. + * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`. + If given, will apply the mask such that values at positions + where `mask==False` do not contribute to the result. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (no dropout). + return_attention_scores: bool, if `True`, returns the attention scores + (after masking and softmax) as an additional output argument. + + Output: + + Attention outputs of shape `[batch_size, Tq, dim]`. + [Optional] Attention scores after masking and softmax with shape + `[batch_size, Tq, Tv]`. """ - def __init__(self, causal=False, dropout=0.0, **kwargs): - super().__init__(**kwargs) - self.causal = causal - self.dropout = dropout - self.supports_masking = True - - def _calculate_scores(self, query, key): - """Calculates attention scores. + def __init__(self, dropout=0.0, **kwargs): + # Deprecated field `causal` determines whether to use causal masking. + # Use `use_causal_mask` in call() method instead. + if "causal" in kwargs: + logging.warning( + "`causal` argument is deprecated. Please use `use_causal_mask` " + "in call() method to specify causal masking." + ) + self.causal = kwargs.pop("causal", False) + super().__init__(**kwargs) + self.dropout = dropout + self.supports_masking = True Args: - query: Query tensor of shape `[batch_size, Tq, dim]`. - key: Key tensor of shape `[batch_size, Tv, dim]`. + def build(self, input_shape): + # Skip RNG initialization if dropout rate is 0. This will let the layer + # be purely stateless, with no reference to any variable. + if self.dropout > 0: + super().build(input_shape) + self.built = True - Returns: - Tensor of shape `[batch_size, Tq, Tv]`. + def _calculate_scores(self, query, key): + """Calculates attention scores. - """ - return NotImplementedError - - def _apply_scores(self, scores, value, scores_mask=None, training=None): - """Applies attention scores to the given value tensor. - - To use this method in your attention layer, follow the steps: - - * Use `query` tensor of shape `[batch_size, Tq]` and `key` tensor of shape - `[batch_size, Tv]` to calculate the attention `scores`. - * Pass `scores` and `value` tensors to this method. The method applies - `scores_mask`, calculates `attention_distribution = softmax(scores)`, then - returns `matmul(attention_distribution, value). - * Apply `query_mask` and return the result. - - Args: - scores: Scores float tensor of shape `[batch_size, Tq, Tv]`. - value: Value tensor of shape `[batch_size, Tv, dim]`. - scores_mask: A boolean mask `Tensor` of shape `[batch_size, 1, Tv]` or - `[batch_size, Tq, Tv]`. If given, scores at positions where - `scores_mask==False` do not contribute to the result. It must contain - at least one `True` value in each line along the last dimension. - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (no dropout). - - Returns: - Tensor of shape `[batch_size, Tq, dim]`. - Attention scores after masking and softmax with shape - `[batch_size, Tq, Tv]`. - """ - if scores_mask is not None: - padding_mask = tf.logical_not(scores_mask) - # Bias so padding positions do not contribute to attention distribution. - # Note 65504. is the max float16 value. - if scores.dtype is tf.float16: - scores -= 65504. 
* tf.cast(padding_mask, dtype=scores.dtype) - else: - scores -= 1.e9 * tf.cast(padding_mask, dtype=scores.dtype) - if training is None: - training = backend.learning_phase() - weights = tf.nn.softmax(scores) - - def dropped_weights(): - return self._random_generator.dropout(weights, rate=self.dropout) - - weights = control_flow_util.smart_cond(training, dropped_weights, - lambda: tf.identity(weights)) - return tf.matmul(weights, value), weights - - # TODO(b/125916026): Consider exposing a __call__ method with named args. - def call(self, - inputs, - mask=None, - training=None, - return_attention_scores=False): - self._validate_call_args(inputs=inputs, mask=mask) - q = inputs[0] - v = inputs[1] - k = inputs[2] if len(inputs) > 2 else v - q_mask = mask[0] if mask else None - v_mask = mask[1] if mask else None - scores = self._calculate_scores(query=q, key=k) - if v_mask is not None: - # Mask of shape [batch_size, 1, Tv]. - v_mask = tf.expand_dims(v_mask, axis=-2) - if self.causal: - # Creates a lower triangular mask, so position i cannot attend to - # positions j>i. This prevents the flow of information from the future - # into the past. - scores_shape = tf.shape(scores) - # causal_mask_shape = [1, Tq, Tv]. - causal_mask_shape = tf.concat( - [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], - axis=0) - causal_mask = _lower_triangular_mask(causal_mask_shape) - else: - causal_mask = None - scores_mask = _merge_masks(v_mask, causal_mask) - result, attention_scores = self._apply_scores( - scores=scores, value=v, scores_mask=scores_mask, training=training) - if q_mask is not None: - # Mask of shape [batch_size, Tq, 1]. - q_mask = tf.expand_dims(q_mask, axis=-1) - result *= tf.cast(q_mask, dtype=result.dtype) - if return_attention_scores: - return result, attention_scores - return result - - def compute_mask(self, inputs, mask=None): - self._validate_call_args(inputs=inputs, mask=mask) - if mask: - q_mask = mask[0] - if q_mask is None: + Args: + query: Query tensor of shape `[batch_size, Tq, dim]`. + key: Key tensor of shape `[batch_size, Tv, dim]`. + + Returns: + Tensor of shape `[batch_size, Tq, Tv]`. + """ + raise NotImplementedError + + def _apply_scores(self, scores, value, scores_mask=None, training=None): + """Applies attention scores to the given value tensor. + + To use this method in your attention layer, follow the steps: + + * Use `query` tensor of shape `[batch_size, Tq]` and `key` tensor of + shape `[batch_size, Tv]` to calculate the attention `scores`. + * Pass `scores` and `value` tensors to this method. The method applies + `scores_mask`, calculates + `attention_distribution = softmax(scores)`, then returns + `matmul(attention_distribution, value)`. + * Apply `query_mask` and return the result. + + Args: + scores: Scores float tensor of shape `[batch_size, Tq, Tv]`. + value: Value tensor of shape `[batch_size, Tv, dim]`. + scores_mask: A boolean mask `Tensor` of shape `[batch_size, 1, Tv]` + or `[batch_size, Tq, Tv]`. If given, scores at positions where + `scores_mask==False` do not contribute to the result. It must + contain at least one `True` value in each line along the last + dimension. + training: Python boolean indicating whether the layer should behave + in training mode (adding dropout) or in inference mode + (no dropout). + + Returns: + Tensor of shape `[batch_size, Tq, dim]`. + Attention scores after masking and softmax with shape + `[batch_size, Tq, Tv]`. 
+ """ + if scores_mask is not None: + padding_mask = tf.logical_not(scores_mask) + # Bias so padding positions do not contribute to attention + # distribution. Note 65504. is the max float16 value. + if scores.dtype is tf.float16: + scores -= 65504.0 * tf.cast(padding_mask, dtype=scores.dtype) + else: + scores -= 1.0e9 * tf.cast(padding_mask, dtype=scores.dtype) + if training is None: + training = backend.learning_phase() + weights = tf.nn.softmax(scores) + + if self.dropout > 0: + + def dropped_weights(): + return self._random_generator.dropout( + weights, rate=self.dropout + ) + + weights = control_flow_util.smart_cond( + training, dropped_weights, lambda: tf.identity(weights) + ) + return tf.matmul(weights, value), weights + + # TODO(b/125916026): Consider exposing a __call__ method with named args. + def call( + self, + inputs, + mask=None, + training=None, + return_attention_scores=False, + use_causal_mask=False, + ): + self._validate_call_args(inputs=inputs, mask=mask) + q = inputs[0] + v = inputs[1] + k = inputs[2] if len(inputs) > 2 else v + q_mask = mask[0] if mask else None + v_mask = mask[1] if mask else None + scores = self._calculate_scores(query=q, key=k) + if v_mask is not None: + # Mask of shape [batch_size, 1, Tv]. + v_mask = tf.expand_dims(v_mask, axis=-2) + if self.causal or use_causal_mask: + # Creates a lower triangular mask, so position i cannot attend to + # positions j>i. This prevents the flow of information from the + # future into the past. + scores_shape = tf.shape(scores) + # causal_mask_shape = [1, Tq, Tv]. + causal_mask_shape = tf.concat( + [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0 + ) + causal_mask = _lower_triangular_mask(causal_mask_shape) + else: + causal_mask = None + scores_mask = _merge_masks(v_mask, causal_mask) + result, attention_scores = self._apply_scores( + scores=scores, value=v, scores_mask=scores_mask, training=training + ) + if q_mask is not None: + # Mask of shape [batch_size, Tq, 1]. + q_mask = tf.expand_dims(q_mask, axis=-1) + result *= tf.cast(q_mask, dtype=result.dtype) + if return_attention_scores: + return result, attention_scores + return result + + def compute_mask(self, inputs, mask=None): + self._validate_call_args(inputs=inputs, mask=mask) + if mask: + q_mask = mask[0] + if q_mask is None: + return None + return tf.convert_to_tensor(q_mask) return None - return tf.convert_to_tensor(q_mask) - return None - - def compute_output_shape(self, input_shape): - # return_attention_scores argument of BaseDenseAttention.call method - # is ignored. Output shape of attention_scores cannot be returned. - return tf.TensorShape(input_shape[0]) - - def _validate_call_args(self, inputs, mask): - """Validates arguments of the call method.""" - class_name = self.__class__.__name__ - if not isinstance(inputs, list): - raise ValueError( - f'{class_name} layer must be called on a list of inputs, ' - 'namely [query, value] or [query, value, key]. ' - f'Received: {inputs}.') - if len(inputs) < 2 or len(inputs) > 3: - raise ValueError( - f'{class_name} layer accepts inputs list of length 2 or 3, ' - 'namely [query, value] or [query, value, key]. ' - f'Received length: {len(inputs)}.') - if mask: - if not isinstance(mask, list): - raise ValueError( - f'{class_name} layer mask must be a list, ' - f'namely [query_mask, value_mask]. Received: {mask}.') - if len(mask) < 2 or len(mask) > len(inputs): - raise ValueError( - f'{class_name} layer mask must be a list of length 2, ' - f'namely [query_mask, value_mask]. 
Received length: {len(mask)}.') - - def get_config(self): - config = { - 'causal': self.causal, - 'dropout': self.dropout, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + + def compute_output_shape(self, input_shape): + # return_attention_scores argument of BaseDenseAttention.call method + # is ignored. Output shape of attention_scores cannot be returned. + return tf.TensorShape(input_shape[0]) + + def _validate_call_args(self, inputs, mask): + """Validates arguments of the call method.""" + class_name = self.__class__.__name__ + if not isinstance(inputs, list): + raise ValueError( + f"{class_name} layer must be called on a list of inputs, " + "namely [query, value] or [query, value, key]. " + f"Received: {inputs}." + ) + if len(inputs) < 2 or len(inputs) > 3: + raise ValueError( + f"{class_name} layer accepts inputs list of length 2 or 3, " + "namely [query, value] or [query, value, key]. " + f"Received length: {len(inputs)}." + ) + if mask: + if not isinstance(mask, list): + raise ValueError( + f"{class_name} layer mask must be a list, " + f"namely [query_mask, value_mask]. Received: {mask}." + ) + if len(mask) < 2 or len(mask) > len(inputs): + raise ValueError( + f"{class_name} layer mask must be a list of length 2, " + "namely [query_mask, value_mask]. " + f"Received length: {len(mask)}." + ) + + def get_config(self): + config = { + "dropout": self.dropout, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) def _lower_triangular_mask(shape): - """Creates a lower-triangular boolean mask over the last 2 dimensions.""" - row_index = tf.cumsum( - tf.ones(shape=shape, dtype=tf.int32), axis=-2) - col_index = tf.cumsum( - tf.ones(shape=shape, dtype=tf.int32), axis=-1) - return tf.greater_equal(row_index, col_index) + """Creates a lower-triangular boolean mask over the last 2 dimensions.""" + row_index = tf.cumsum(tf.ones(shape=shape, dtype=tf.int32), axis=-2) + col_index = tf.cumsum(tf.ones(shape=shape, dtype=tf.int32), axis=-1) + return tf.greater_equal(row_index, col_index) def _merge_masks(x, y): - if x is None: - return y - if y is None: - return x - return tf.logical_and(x, y) + if x is None: + return y + if y is None: + return x + return tf.logical_and(x, y) diff --git a/keras/layers/attention/base_dense_attention_test.py b/keras/layers/attention/base_dense_attention_test.py index 7c8c98504224..86b9f4b05a7d 100644 --- a/keras/layers/attention/base_dense_attention_test.py +++ b/keras/layers/attention/base_dense_attention_test.py @@ -14,163 +14,186 @@ # ============================================================================== """Tests BaseDenseAttention layer.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized -from keras.layers.attention.base_dense_attention import _lower_triangular_mask + from keras.layers.attention.base_dense_attention import BaseDenseAttention +from keras.layers.attention.base_dense_attention import _lower_triangular_mask from keras.testing_infra import test_combinations -import numpy as np -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class BaseDenseAttentionTest(tf.test.TestCase, parameterized.TestCase): - - def test_one_dim_with_mask(self): - # Scores tensor of shape [1, 1, 1] - scores = np.array([[[1.1]]], dtype=np.float32) - # Value tensor of shape [1, 
1, 1] - v = np.array([[[1.6]]], dtype=np.float32) - # Scores mask tensor of shape [1, 1, 1] - scores_mask = np.array([[[True]]], dtype=np.bool_) - actual, actual_scores = BaseDenseAttention()._apply_scores( - scores=scores, value=v, scores_mask=scores_mask) - - # Expected softmax_scores = [[[1]]] - expected_scores = np.array([[[1.]]], dtype=np.float32) - self.assertAllClose(expected_scores, actual_scores) - # Expected tensor of shape [1, 1, 1]. - # expected000 = softmax_scores[0, 0] * 1.6 = 1.6 - expected = np.array([[[1.6]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_one_dim_no_mask(self): - # Scores tensor of shape [1, 1, 1] - scores = np.array([[[1.1]]], dtype=np.float32) - # Value tensor of shape [1, 1, 1] - v = np.array([[[1.6]]], dtype=np.float32) - actual, actual_scores = BaseDenseAttention()._apply_scores( - scores=scores, value=v) - - # Expected softmax_scores = [[[1]]] - expected_scores = np.array([[[1.]]], dtype=np.float32) - self.assertAllClose(expected_scores, actual_scores) - # Expected tensor of shape [1, 1, 1]. - # expected000 = softmax_scores[0, 0] * 1.6 = 1.6 - expected = np.array([[[1.6]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_multi_dim_with_mask(self): - # Scores tensor of shape [1, 1, 3] - scores = np.array([[[1., 0., 1.]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - # Scores mask tensor of shape [1, 1, 3] - scores_mask = np.array([[[True, True, False]]], dtype=np.bool_) - actual, actual_scores = BaseDenseAttention()._apply_scores( - scores=scores, value=v, scores_mask=scores_mask) - - # Expected softmax scores = softmax(scores) with zeros in positions where - # v_mask == False. - # => softmax_scores000 = exp(1)/(exp(1) + exp(0)) = 0.73105857863 - # softmax_scores001 = exp(0)/(exp(1) + exp(0)) = 0.26894142137 - # softmax_scores002 = 0 - expected_scores = np.array([[[0.73105857863, 0.26894142137, 0.]]], - dtype=np.float32) - self.assertAllClose(expected_scores, actual_scores) - # Expected tensor of shape [1, 1, 1]. - # expected000 = 0.73105857863 * 1.6 + 0.26894142137 * 0.7 - 0 * 0.8 - # = 1.35795272077 - expected = np.array([[[1.35795272077]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_multi_dim_no_mask(self): - # Scores tensor of shape [1, 1, 3] - scores = np.array([[[1., 0., 1.]]], dtype=np.float32) - # Value tensor of shape [1, 3, 1] - v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32) - actual, actual_scores = BaseDenseAttention()._apply_scores( - scores=scores, value=v) - - # Expected softmax_scores = softmax(scores). - # => softmax_scores000 = exp(1)/(exp(1) + exp(0) + exp(1)) - # = 0.42231879825 - # softmax_scores001 = exp(0)/(exp(1) + exp(0) + exp(1)) - # = 0.15536240349 - # softmax_scores002 = exp(1)/(exp(1) + exp(0) + exp(1)) - # = 0.42231879825 - expected_scores = np.array( - [[[0.42231879825, 0.15536240349, 0.42231879825]]], dtype=np.float32) - self.assertAllClose(expected_scores, actual_scores) - # Expected tensor of shape [1, 1, 1]. 
- # expected000 = 0.42231879825 * 1.6 + 0.15536240349 * 0.7 - # - 0.42231879825 * 0.8 - # = 0.44660872104 - expected = np.array([[[0.44660872104]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_one_dim_batch_size_two(self): - # Scores tensor of shape [2, 1, 1] - scores = np.array([[[1.1]], [[2.1]]], dtype=np.float32) - # Value tensor of shape [2, 1, 1] - v = np.array([[[1.6]], [[2.6]]], dtype=np.float32) - # Scpres mask tensor of shape [2, 1, 1] - scores_mask = np.array([[[True]], [[True]]], dtype=np.bool_) - actual, actual_scores = BaseDenseAttention()._apply_scores( - scores=scores, value=v, scores_mask=scores_mask) - - # Expected softmax_scores = [[[1]], [[1]]] - expected_scores = np.array([[[1.]], [[1.]]], dtype=np.float32) - self.assertAllClose(expected_scores, actual_scores) - # Expected tensor of shape [2, 1, 1]. - # expected000 = softmax_scores[0, 0] * 1.6 = 1.6 - # expected100 = softmax_scores[1, 0] * 2.6 = 2.6 - expected = np.array([[[1.6]], [[2.6]]], dtype=np.float32) - self.assertAllClose(expected, actual) - - def test_shape_with_dropout(self): - # scores: Scores float tensor of shape `[batch_size, tq, tv]`. - # value: Value tensor of shape `[batch_size, tv, dim]`. - batch_size = 4 - tq = 5 - tv = 6 - dim = 7 - scores = np.ones((batch_size, tq, tv)) - value = np.ones((batch_size, tv, dim)) - actual, actual_scores = BaseDenseAttention( - dropout=0.1)._apply_scores( - scores=scores, value=value, training=False) - - # Expected Tensor of shape `[batch_size, tq, tv]`. - expected_scores_shape = [batch_size, tq, tv] - self.assertAllEqual(expected_scores_shape, tf.shape(actual_scores)) - # Expected Tensor of shape `[batch_size, tq, dim]`. - expected_shape = [batch_size, tq, dim] - self.assertAllEqual(expected_shape, tf.shape(actual)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_one_dim_with_mask(self): + # Scores tensor of shape [1, 1, 1] + scores = np.array([[[1.1]]], dtype=np.float32) + # Value tensor of shape [1, 1, 1] + v = np.array([[[1.6]]], dtype=np.float32) + # Scores mask tensor of shape [1, 1, 1] + scores_mask = np.array([[[True]]], dtype=np.bool_) + actual, actual_scores = BaseDenseAttention()._apply_scores( + scores=scores, value=v, scores_mask=scores_mask + ) + + # Expected softmax_scores = [[[1]]] + expected_scores = np.array([[[1.0]]], dtype=np.float32) + self.assertAllClose(expected_scores, actual_scores) + # Expected tensor of shape [1, 1, 1]. + # expected000 = softmax_scores[0, 0] * 1.6 = 1.6 + expected = np.array([[[1.6]]], dtype=np.float32) + self.assertAllClose(expected, actual) + + def test_one_dim_no_mask(self): + # Scores tensor of shape [1, 1, 1] + scores = np.array([[[1.1]]], dtype=np.float32) + # Value tensor of shape [1, 1, 1] + v = np.array([[[1.6]]], dtype=np.float32) + actual, actual_scores = BaseDenseAttention()._apply_scores( + scores=scores, value=v + ) + + # Expected softmax_scores = [[[1]]] + expected_scores = np.array([[[1.0]]], dtype=np.float32) + self.assertAllClose(expected_scores, actual_scores) + # Expected tensor of shape [1, 1, 1]. 
+        # expected000 = softmax_scores[0, 0] * 1.6 = 1.6
+        expected = np.array([[[1.6]]], dtype=np.float32)
+        self.assertAllClose(expected, actual)
+
+    def test_multi_dim_with_mask(self):
+        # Scores tensor of shape [1, 1, 3]
+        scores = np.array([[[1.0, 0.0, 1.0]]], dtype=np.float32)
+        # Value tensor of shape [1, 3, 1]
+        v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+        # Scores mask tensor of shape [1, 1, 3]
+        scores_mask = np.array([[[True, True, False]]], dtype=np.bool_)
+        actual, actual_scores = BaseDenseAttention()._apply_scores(
+            scores=scores, value=v, scores_mask=scores_mask
+        )
+
+        # Expected softmax scores = softmax(scores) with zeros in positions
+        # where v_mask == False.
+        # => softmax_scores000 = exp(1)/(exp(1) + exp(0)) = 0.73105857863
+        #    softmax_scores001 = exp(0)/(exp(1) + exp(0)) = 0.26894142137
+        #    softmax_scores002 = 0
+        expected_scores = np.array(
+            [[[0.73105857863, 0.26894142137, 0.0]]], dtype=np.float32
+        )
+        self.assertAllClose(expected_scores, actual_scores)
+        # Expected tensor of shape [1, 1, 1].
+        # expected000 = 0.73105857863 * 1.6 + 0.26894142137 * 0.7 - 0 * 0.8
+        #             = 1.35795272077
+        expected = np.array([[[1.35795272077]]], dtype=np.float32)
+        self.assertAllClose(expected, actual)
+
+    def test_multi_dim_no_mask(self):
+        # Scores tensor of shape [1, 1, 3]
+        scores = np.array([[[1.0, 0.0, 1.0]]], dtype=np.float32)
+        # Value tensor of shape [1, 3, 1]
+        v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+        actual, actual_scores = BaseDenseAttention()._apply_scores(
+            scores=scores, value=v
+        )
+
+        # Expected softmax_scores = softmax(scores).
+        # => softmax_scores000 = exp(1)/(exp(1) + exp(0) + exp(1))
+        #                      = 0.42231879825
+        #    softmax_scores001 = exp(0)/(exp(1) + exp(0) + exp(1))
+        #                      = 0.15536240349
+        #    softmax_scores002 = exp(1)/(exp(1) + exp(0) + exp(1))
+        #                      = 0.42231879825
+        expected_scores = np.array(
+            [[[0.42231879825, 0.15536240349, 0.42231879825]]], dtype=np.float32
+        )
+        self.assertAllClose(expected_scores, actual_scores)
+        # Expected tensor of shape [1, 1, 1].
+        # expected000 = 0.42231879825 * 1.6 + 0.15536240349 * 0.7
+        #               - 0.42231879825 * 0.8
+        #             = 0.44660872104
+        expected = np.array([[[0.44660872104]]], dtype=np.float32)
+        self.assertAllClose(expected, actual)
+
+    def test_one_dim_batch_size_two(self):
+        # Scores tensor of shape [2, 1, 1]
+        scores = np.array([[[1.1]], [[2.1]]], dtype=np.float32)
+        # Value tensor of shape [2, 1, 1]
+        v = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
+        # Scores mask tensor of shape [2, 1, 1]
+        scores_mask = np.array([[[True]], [[True]]], dtype=np.bool_)
+        actual, actual_scores = BaseDenseAttention()._apply_scores(
+            scores=scores, value=v, scores_mask=scores_mask
+        )
+
+        # Expected softmax_scores = [[[1]], [[1]]]
+        expected_scores = np.array([[[1.0]], [[1.0]]], dtype=np.float32)
+        self.assertAllClose(expected_scores, actual_scores)
+        # Expected tensor of shape [2, 1, 1].
+        # expected000 = softmax_scores[0, 0] * 1.6 = 1.6
+        # expected100 = softmax_scores[1, 0] * 2.6 = 2.6
+        expected = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
+        self.assertAllClose(expected, actual)
+
+    def test_shape_with_dropout(self):
+        # scores: Scores float tensor of shape `[batch_size, tq, tv]`.
+        # value: Value tensor of shape `[batch_size, tv, dim]`.
+ batch_size = 4 + tq = 5 + tv = 6 + dim = 7 + scores = np.ones((batch_size, tq, tv)) + value = np.ones((batch_size, tv, dim)) + actual, actual_scores = BaseDenseAttention(dropout=0.1)._apply_scores( + scores=scores, value=value, training=False + ) + + # Expected Tensor of shape `[batch_size, tq, tv]`. + expected_scores_shape = [batch_size, tq, tv] + self.assertAllEqual(expected_scores_shape, tf.shape(actual_scores)) + # Expected Tensor of shape `[batch_size, tq, dim]`. + expected_shape = [batch_size, tq, dim] + self.assertAllEqual(expected_shape, tf.shape(actual)) + + def test_skip_rng_init_when_no_dropout(self): + batch_size = 4 + tq = 5 + tv = 6 + dim = 7 + scores = np.ones((batch_size, tq, tv)) + value = np.ones((batch_size, tv, dim)) + layer = BaseDenseAttention() + layer.build(None) # The input shape is not used by this layer + _, _ = layer._apply_scores(scores=scores, value=value, training=True) + # Make sure the rng is not built and no tf.random.Generator created. + self.assertFalse(layer._random_generator._built) + self.assertIsNone(getattr(layer._random_generator, "_generator", None)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LowerTriangularMaskTest(tf.test.TestCase, parameterized.TestCase): - - def test_square_shape(self): - actual = _lower_triangular_mask([3, 3]) - expected = np.array( - [[True, False, False], [True, True, False], [True, True, True]], - dtype=np.bool_) - self.assertAllEqual(expected, actual) - - def test_orthogonal_shape(self): - actual = _lower_triangular_mask([3, 2]) - expected = np.array([[True, False], [True, True], [True, True]], - dtype=np.bool_) - self.assertAllEqual(expected, actual) - - def test_three_dim(self): - actual = _lower_triangular_mask([1, 3, 3]) - expected = np.array( - [[[True, False, False], [True, True, False], [True, True, True]]], - dtype=np.bool_) - self.assertAllEqual(expected, actual) - - -if __name__ == '__main__': - tf.test.main() + def test_square_shape(self): + actual = _lower_triangular_mask([3, 3]) + expected = np.array( + [[True, False, False], [True, True, False], [True, True, True]], + dtype=np.bool_, + ) + self.assertAllEqual(expected, actual) + + def test_orthogonal_shape(self): + actual = _lower_triangular_mask([3, 2]) + expected = np.array( + [[True, False], [True, True], [True, True]], dtype=np.bool_ + ) + self.assertAllEqual(expected, actual) + + def test_three_dim(self): + actual = _lower_triangular_mask([1, 3, 3]) + expected = np.array( + [[[True, False, False], [True, True, False], [True, True, True]]], + dtype=np.bool_, + ) + self.assertAllEqual(expected, actual) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/attention/multi_head_attention.py b/keras/layers/attention/multi_head_attention.py index 49711f29099d..e2b5fc3d76e4 100644 --- a/keras/layers/attention/multi_head_attention.py +++ b/keras/layers/attention/multi_head_attention.py @@ -13,12 +13,15 @@ # limitations under the License. 
 # ==============================================================================
 """Keras-based multi-head attention layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
 import collections
 import math
 import string
 
+import numpy as np
+import tensorflow.compat.v2 as tf
+
 from keras import constraints
 from keras import initializers
 from keras import regularizers
@@ -27,521 +30,702 @@ from keras.layers import core
 from keras.layers import regularization
 from keras.utils import tf_utils
-import numpy as np
-import tensorflow.compat.v2 as tf
+# isort: off
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import keras_export
 
-
 _CHR_IDX = string.ascii_lowercase
 
 
 def _build_attention_equation(rank, attn_axes):
-  """Builds einsum equations for the attention computation.
-
-  Query, key, value inputs after projection are expected to have the shape as:
-  `(bs, <non-attention dims>, <attention dims>, num_heads, channels)`.
-  `bs` and `<non-attention dims>` are treated as `<batch dims>`.
-
-  The attention operations can be generalized:
-  (1) Query-key dot product:
-  `(<batch dims>, <query attention dims>, num_heads, channels), (<batch dims>,
-  <key attention dims>, num_heads, channels) -> (<batch dims>,
-  num_heads, <query attention dims>, <key attention dims>)`
-  (2) Combination:
-  `(<batch dims>, num_heads, <query attention dims>, <key attention dims>),
-  (<batch dims>, <value attention dims>, num_heads, channels) -> (<batch
-  dims>, <query attention dims>, num_heads, channels)`
-
-  Args:
-    rank: Rank of query, key, value tensors.
-    attn_axes: List/tuple of axes, `[-1, rank)`,
-      that attention will be applied to.
-
-  Returns:
-    Einsum equations.
-  """
-  target_notation = _CHR_IDX[:rank]
-  # `batch_dims` includes the head dim.
-  batch_dims = tuple(np.delete(range(rank), attn_axes + (rank - 1,)))
-  letter_offset = rank
-  source_notation = ""
-  for i in range(rank):
-    if i in batch_dims or i == rank - 1:
-      source_notation += target_notation[i]
-    else:
-      source_notation += _CHR_IDX[letter_offset]
-      letter_offset += 1
-
-  product_notation = "".join([target_notation[i] for i in batch_dims] +
-                             [target_notation[i] for i in attn_axes] +
-                             [source_notation[i] for i in attn_axes])
-  dot_product_equation = "%s,%s->%s" % (source_notation, target_notation,
-                                        product_notation)
-  attn_scores_rank = len(product_notation)
-  combine_equation = "%s,%s->%s" % (product_notation, source_notation,
-                                    target_notation)
-  return dot_product_equation, combine_equation, attn_scores_rank
+    """Builds einsum equations for the attention computation.
+
+    Query, key, value inputs after projection are expected to have the shape as:
+    `(bs, <non-attention dims>, <attention dims>, num_heads, channels)`.
+    `bs` and `<non-attention dims>` are treated as `<batch dims>`.
+
+    The attention operations can be generalized:
+    (1) Query-key dot product:
+    `(<batch dims>, <query attention dims>, num_heads, channels), (<batch dims>,
+    <key attention dims>, num_heads, channels) -> (<batch dims>,
+    num_heads, <query attention dims>, <key attention dims>)`
+    (2) Combination:
+    `(<batch dims>, num_heads, <query attention dims>, <key attention dims>),
+    (<batch dims>, <value attention dims>, num_heads, channels) -> (<batch
+    dims>, <query attention dims>, num_heads, channels)`
+
+    Args:
+        rank: Rank of query, key, value tensors.
+        attn_axes: List/tuple of axes, `[-1, rank)`,
+            that attention will be applied to.
+
+    Returns:
+        Einsum equations.
+    """
+    target_notation = _CHR_IDX[:rank]
+    # `batch_dims` includes the head dim.
+ batch_dims = tuple(np.delete(range(rank), attn_axes + (rank - 1,))) + letter_offset = rank + source_notation = "" + for i in range(rank): + if i in batch_dims or i == rank - 1: + source_notation += target_notation[i] + else: + source_notation += _CHR_IDX[letter_offset] + letter_offset += 1 + + product_notation = "".join( + [target_notation[i] for i in batch_dims] + + [target_notation[i] for i in attn_axes] + + [source_notation[i] for i in attn_axes] + ) + dot_product_equation = "%s,%s->%s" % ( + source_notation, + target_notation, + product_notation, + ) + attn_scores_rank = len(product_notation) + combine_equation = "%s,%s->%s" % ( + product_notation, + source_notation, + target_notation, + ) + return dot_product_equation, combine_equation, attn_scores_rank def _build_proj_equation(free_dims, bound_dims, output_dims): - """Builds an einsum equation for projections inside multi-head attention.""" - input_str = "" - kernel_str = "" - output_str = "" - bias_axes = "" - letter_offset = 0 - for i in range(free_dims): - char = _CHR_IDX[i + letter_offset] - input_str += char - output_str += char - - letter_offset += free_dims - for i in range(bound_dims): - char = _CHR_IDX[i + letter_offset] - input_str += char - kernel_str += char - - letter_offset += bound_dims - for i in range(output_dims): - char = _CHR_IDX[i + letter_offset] - kernel_str += char - output_str += char - bias_axes += char - equation = "%s,%s->%s" % (input_str, kernel_str, output_str) - - return equation, bias_axes, len(output_str) + """Builds an einsum equation for projections inside multi-head attention.""" + input_str = "" + kernel_str = "" + output_str = "" + bias_axes = "" + letter_offset = 0 + for i in range(free_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + output_str += char + + letter_offset += free_dims + for i in range(bound_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + kernel_str += char + + letter_offset += bound_dims + for i in range(output_dims): + char = _CHR_IDX[i + letter_offset] + kernel_str += char + output_str += char + bias_axes += char + equation = f"{input_str},{kernel_str}->{output_str}" + + return equation, bias_axes, len(output_str) def _get_output_shape(output_rank, known_last_dims): - return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims) + return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims) @keras_export("keras.layers.MultiHeadAttention") class MultiHeadAttention(Layer): - """MultiHeadAttention layer. - - This is an implementation of multi-headed attention as described in the paper - "Attention is all you Need" (Vaswani et al., 2017). - If `query`, `key,` `value` are the same, then - this is self-attention. Each timestep in `query` attends to the - corresponding sequence in `key`, and returns a fixed-width vector. - - This layer first projects `query`, `key` and `value`. These are - (effectively) a list of tensors of length `num_attention_heads`, where the - corresponding shapes are `(batch_size, , key_dim)`, - `(batch_size, , key_dim)`, - `(batch_size, , value_dim)`. - - Then, the query and key tensors are dot-producted and scaled. These are - softmaxed to obtain attention probabilities. The value tensors are then - interpolated by these probabilities, then concatenated back to a single - tensor. - - Finally, the result tensor with the last dimension as value_dim can take an - linear projection and return. 
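To make the einsum helpers above concrete, here is a small, hand-derived sanity check of the strings they produce for the common case of a rank-3 input and rank-4 projected tensors with `attention_axes=(1,)`. This sketch is editorial (not part of the patch); shapes and values are made up:

```python
import tensorflow as tf

# _build_proj_equation(free_dims=2, bound_dims=1, output_dims=2) yields
# "abc,cde->abde" (bias axes "de"): it projects (B, S, dim) against a
# (dim, num_heads, key_dim) kernel to get (B, S, num_heads, key_dim).
x = tf.random.normal((2, 5, 16))      # (B, S, dim)
w = tf.random.normal((16, 4, 8))      # (dim, num_heads, key_dim)
q = tf.einsum("abc,cde->abde", x, w)
print(q.shape)                        # (2, 5, 4, 8)

# _build_attention_equation(rank=4, attn_axes=(1,)) yields the dot-product
# equation "aecd,abcd->acbe" and the combine equation "acbe,aecd->abcd".
k = tf.einsum("abc,cde->abde", x, w)  # (B, S, num_heads, key_dim)
scores = tf.einsum("aecd,abcd->acbe", k, q)
print(scores.shape)                   # (B, num_heads, T, S) = (2, 4, 5, 5)
out = tf.einsum("acbe,aecd->abcd", scores, k)
print(out.shape)                      # (2, 5, 4, 8)
```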
- - When using MultiHeadAttention inside a custom Layer, the custom Layer must - implement `build()` and call MultiHeadAttention's `_build_from_signature()`. - This enables weights to be restored correctly when the model is loaded. - TODO(b/172609172): link to documentation about calling custom build functions - when used in a custom Layer. - - Examples: - - Performs 1D cross-attention over two sequence inputs with an attention mask. - Returns the additional attention weights over heads. - - >>> layer = MultiHeadAttention(num_heads=2, key_dim=2) - >>> target = tf.keras.Input(shape=[8, 16]) - >>> source = tf.keras.Input(shape=[4, 16]) - >>> output_tensor, weights = layer(target, source, - ... return_attention_scores=True) - >>> print(output_tensor.shape) - (None, 8, 16) - >>> print(weights.shape) - (None, 2, 8, 4) - - Performs 2D self-attention over a 5D input tensor on axes 2 and 3. - - >>> layer = MultiHeadAttention(num_heads=2, key_dim=2, attention_axes=(2, 3)) - >>> input_tensor = tf.keras.Input(shape=[5, 3, 4, 16]) - >>> output_tensor = layer(input_tensor, input_tensor) - >>> print(output_tensor.shape) - (None, 5, 3, 4, 16) - - Args: - num_heads: Number of attention heads. - key_dim: Size of each attention head for query and key. - value_dim: Size of each attention head for value. - dropout: Dropout probability. - use_bias: Boolean, whether the dense layers use bias vectors/matrices. - output_shape: The expected shape of an output tensor, besides the batch and - sequence dims. If not specified, projects back to the key feature dim. - attention_axes: axes over which the attention is applied. `None` means - attention over all axes, but batch, heads, and features. - kernel_initializer: Initializer for dense layer kernels. - bias_initializer: Initializer for dense layer biases. - kernel_regularizer: Regularizer for dense layer kernels. - bias_regularizer: Regularizer for dense layer biases. - activity_regularizer: Regularizer for dense layer activity. - kernel_constraint: Constraint for dense layer kernels. - bias_constraint: Constraint for dense layer kernels. - - Call arguments: - query: Query `Tensor` of shape `(B, T, dim)`. - value: Value `Tensor` of shape `(B, S, dim)`. - key: Optional key `Tensor` of shape `(B, S, dim)`. If not given, will use - `value` for both `key` and `value`, which is the most common case. - attention_mask: a boolean mask of shape `(B, T, S)`, that prevents - attention to certain positions. The boolean mask specifies which query - elements can attend to which key elements, 1 indicates attention and 0 - indicates no attention. Broadcasting can happen for the missing batch - dimensions and the head dimension. - return_attention_scores: A boolean to indicate whether the output should - be `(attention_output, attention_scores)` if `True`, or `attention_output` - if `False`. Defaults to `False`. - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (no dropout). - Defaults to either using the training mode of the parent layer/model, - or False (inference) if there is no parent layer. - - Returns: - attention_output: The result of the computation, of shape `(B, T, E)`, - where `T` is for target sequence shapes and `E` is the query input last - dimension if `output_shape` is `None`. Otherwise, the multi-head outputs - are project to the shape specified by `output_shape`. - attention_scores: [Optional] multi-head attention coefficients over - attention axes. 
- """ - - def __init__(self, - num_heads, - key_dim, - value_dim=None, - dropout=0.0, - use_bias=True, - output_shape=None, - attention_axes=None, - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__(**kwargs) - self._num_heads = num_heads - self._key_dim = key_dim - self._value_dim = value_dim if value_dim else key_dim - self._dropout = dropout - self._use_bias = use_bias - self._output_shape = output_shape - self._kernel_initializer = initializers.get(kernel_initializer) - self._bias_initializer = initializers.get(bias_initializer) - self._kernel_regularizer = regularizers.get(kernel_regularizer) - self._bias_regularizer = regularizers.get(bias_regularizer) - self._activity_regularizer = regularizers.get(activity_regularizer) - self._kernel_constraint = constraints.get(kernel_constraint) - self._bias_constraint = constraints.get(bias_constraint) - if attention_axes is not None and not isinstance(attention_axes, - collections.abc.Sized): - self._attention_axes = (attention_axes,) - else: - self._attention_axes = attention_axes - self._built_from_signature = False - self._query_shape, self._key_shape, self._value_shape = None, None, None - - def get_config(self): - config = { - "num_heads": self._num_heads, - "key_dim": self._key_dim, - "value_dim": self._value_dim, - "dropout": self._dropout, - "use_bias": self._use_bias, - "output_shape": self._output_shape, - "attention_axes": self._attention_axes, - "kernel_initializer": - initializers.serialize(self._kernel_initializer), - "bias_initializer": - initializers.serialize(self._bias_initializer), - "kernel_regularizer": - regularizers.serialize(self._kernel_regularizer), - "bias_regularizer": - regularizers.serialize(self._bias_regularizer), - "activity_regularizer": - regularizers.serialize(self._activity_regularizer), - "kernel_constraint": - constraints.serialize(self._kernel_constraint), - "bias_constraint": - constraints.serialize(self._bias_constraint), - "query_shape": self._query_shape, - "key_shape": self._key_shape, - "value_shape": self._value_shape, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - # If the layer has a different build() function from the Keras default, - # we need to trigger the customized build to create weights. - query_shape = config.pop("query_shape") - key_shape = config.pop("key_shape") - value_shape = config.pop("value_shape") - layer = cls(**config) - if None in [query_shape, key_shape, value_shape]: - logging.warning( - "One of dimensions of the input shape is missing. It should have been" - " memorized when the layer was serialized. " - "%s is created without weights.", - str(cls)) - else: - layer._build_from_signature(query_shape, value_shape, key_shape) # pylint: disable=protected-access - return layer - - def _build_from_signature(self, query, value, key=None): - """Builds layers and variables. - - Once the method is called, self._built_from_signature will be set to True. - - Args: - query: Query tensor or TensorShape. - value: Value tensor or TensorShape. - key: Key tensor or TensorShape. 
- """ - self._built_from_signature = True - if hasattr(query, "shape"): - self._query_shape = tf.TensorShape(query.shape) - else: - self._query_shape = tf.TensorShape(query) - if hasattr(value, "shape"): - self._value_shape = tf.TensorShape(value.shape) - else: - self._value_shape = tf.TensorShape(value) - if key is None: - self._key_shape = self._value_shape - elif hasattr(key, "shape"): - self._key_shape = tf.TensorShape(key.shape) - else: - self._key_shape = tf.TensorShape(key) - - # Any setup work performed only once should happen in an `init_scope` - # to avoid creating symbolic Tensors that will later pollute any eager - # operations. - with tf_utils.maybe_init_scope(self): - free_dims = self._query_shape.rank - 1 - einsum_equation, bias_axes, output_rank = _build_proj_equation( - free_dims, bound_dims=1, output_dims=2) - self._query_dense = core.EinsumDense( - einsum_equation, - output_shape=_get_output_shape(output_rank - 1, - [self._num_heads, self._key_dim]), - bias_axes=bias_axes if self._use_bias else None, - name="query", - **self._get_common_kwargs_for_sublayer()) - einsum_equation, bias_axes, output_rank = _build_proj_equation( - self._key_shape.rank - 1, bound_dims=1, output_dims=2) - self._key_dense = core.EinsumDense( - einsum_equation, - output_shape=_get_output_shape(output_rank - 1, - [self._num_heads, self._key_dim]), - bias_axes=bias_axes if self._use_bias else None, - name="key", - **self._get_common_kwargs_for_sublayer()) - einsum_equation, bias_axes, output_rank = _build_proj_equation( - self._value_shape.rank - 1, bound_dims=1, output_dims=2) - self._value_dense = core.EinsumDense( - einsum_equation, - output_shape=_get_output_shape(output_rank - 1, - [self._num_heads, self._value_dim]), - bias_axes=bias_axes if self._use_bias else None, - name="value", - **self._get_common_kwargs_for_sublayer()) - - # Builds the attention computations for multi-head dot product attention. - # These computations could be wrapped into the keras attention layer once - # it supports mult-head einsum computations. - self._build_attention(output_rank) - self._output_dense = self._make_output_dense( - free_dims, self._get_common_kwargs_for_sublayer(), - "attention_output") - - def _get_common_kwargs_for_sublayer(self): - common_kwargs = dict( - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activity_regularizer=self._activity_regularizer, - kernel_constraint=self._kernel_constraint, - bias_constraint=self._bias_constraint) - # Create new clone of kernel/bias initializer, so that we don't reuse the - # initializer instance, which could lead to same init value since - # initializer is stateless. - kernel_initializer = self._kernel_initializer.__class__.from_config( - self._kernel_initializer.get_config()) - bias_initializer = self._bias_initializer.__class__.from_config( - self._bias_initializer.get_config()) - common_kwargs['kernel_initializer'] = kernel_initializer - common_kwargs['bias_initializer'] = bias_initializer - return common_kwargs - - def _make_output_dense(self, free_dims, common_kwargs, name=None): - """Builds the output projection matrix. + """MultiHeadAttention layer. + + This is an implementation of multi-headed attention as described in the + paper "Attention is all you Need" (Vaswani et al., 2017). + If `query`, `key,` `value` are the same, then + this is self-attention. Each timestep in `query` attends to the + corresponding sequence in `key`, and returns a fixed-width vector. 
+
+    This layer first projects `query`, `key` and `value`. These are
+    (effectively) a list of tensors of length `num_attention_heads`, where the
+    corresponding shapes are `(batch_size, <query dimensions>, key_dim)`,
+    `(batch_size, <key/value dimensions>, key_dim)`,
+    `(batch_size, <key/value dimensions>, value_dim)`.
+
+    Then, the query and key tensors are dot-producted and scaled. These are
+    softmaxed to obtain attention probabilities. The value tensors are then
+    interpolated by these probabilities, then concatenated back to a single
+    tensor.
+
+    Finally, the result tensor with the last dimension as value_dim can take a
+    linear projection and return.
+
+    When using `MultiHeadAttention` inside a custom layer, the custom layer must
+    implement its own `build()` method and call `MultiHeadAttention`'s
+    `_build_from_signature()` there.
+    This enables weights to be restored correctly when the model is loaded.
+
+    Examples:
+
+    Performs 1D cross-attention over two sequence inputs with an attention mask.
+    Returns the additional attention weights over heads.
+
+    >>> layer = MultiHeadAttention(num_heads=2, key_dim=2)
+    >>> target = tf.keras.Input(shape=[8, 16])
+    >>> source = tf.keras.Input(shape=[4, 16])
+    >>> output_tensor, weights = layer(target, source,
+    ...                                return_attention_scores=True)
+    >>> print(output_tensor.shape)
+    (None, 8, 16)
+    >>> print(weights.shape)
+    (None, 2, 8, 4)
+
+    Performs 2D self-attention over a 5D input tensor on axes 2 and 3.
+
+    >>> layer = MultiHeadAttention(
+    ...     num_heads=2, key_dim=2, attention_axes=(2, 3))
+    >>> input_tensor = tf.keras.Input(shape=[5, 3, 4, 16])
+    >>> output_tensor = layer(input_tensor, input_tensor)
+    >>> print(output_tensor.shape)
+    (None, 5, 3, 4, 16)
 
     Args:
-      free_dims: Number of free dimensions for einsum equation building.
-      common_kwargs: Common keyword arguments for einsum layer.
-      name: Name for the projection layer.
+        num_heads: Number of attention heads.
+        key_dim: Size of each attention head for query and key.
+        value_dim: Size of each attention head for value.
+        dropout: Dropout probability.
+        use_bias: Boolean, whether the dense layers use bias vectors/matrices.
+        output_shape: The expected shape of an output tensor, besides the batch
+            and sequence dims. If not specified, projects back to the query
+            feature dim (the query input's last dimension).
+        attention_axes: axes over which the attention is applied. `None` means
+            attention over all axes, but batch, heads, and features.
+        kernel_initializer: Initializer for dense layer kernels.
+        bias_initializer: Initializer for dense layer biases.
+        kernel_regularizer: Regularizer for dense layer kernels.
+        bias_regularizer: Regularizer for dense layer biases.
+        activity_regularizer: Regularizer for dense layer activity.
+        kernel_constraint: Constraint for dense layer kernels.
+        bias_constraint: Constraint for dense layer biases.
+
+    Call arguments:
+        query: Query `Tensor` of shape `(B, T, dim)`.
+        value: Value `Tensor` of shape `(B, S, dim)`.
+        key: Optional key `Tensor` of shape `(B, S, dim)`. If not given, will
+            use `value` for both `key` and `value`, which is the most common
+            case.
+        attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
+            attention to certain positions. The boolean mask specifies which
+            query elements can attend to which key elements, 1 indicates
+            attention and 0 indicates no attention. Broadcasting can happen for
+            the missing batch dimensions and the head dimension.
+        return_attention_scores: A boolean to indicate whether the output
+            should be `(attention_output, attention_scores)` if `True`, or
+            `attention_output` if `False`. Defaults to `False`.
+        training: Python boolean indicating whether the layer should behave in
+            training mode (adding dropout) or in inference mode (no dropout).
+            Defaults to using the training mode of the parent layer/model, or
+            False (inference) if there is no parent layer.
+        use_causal_mask: A boolean to indicate whether to apply a causal mask to
+            prevent tokens from attending to future tokens (e.g., used in a
+            decoder Transformer).
 
     Returns:
-      Projection layer.
+        attention_output: The result of the computation, of shape `(B, T, E)`,
+            where `T` is for target sequence shapes and `E` is the query input
+            last dimension if `output_shape` is `None`. Otherwise, the
+            multi-head outputs are projected to the shape specified by
+            `output_shape`.
+        attention_scores: [Optional] multi-head attention coefficients over
+            attention axes.
     """
-    if self._output_shape:
-      if not isinstance(self._output_shape, collections.abc.Sized):
-        output_shape = [self._output_shape]
-      else:
-        output_shape = self._output_shape
-    else:
-      output_shape = [self._query_shape[-1]]
-    einsum_equation, bias_axes, output_rank = _build_proj_equation(
-        free_dims, bound_dims=2, output_dims=len(output_shape))
-    return core.EinsumDense(
-        einsum_equation,
-        output_shape=_get_output_shape(output_rank - 1, output_shape),
-        bias_axes=bias_axes if self._use_bias else None,
-        name=name,
-        **common_kwargs)
-
-  def _build_attention(self, rank):
-    """Builds multi-head dot-product attention computations.
-
-    This function builds attributes necessary for `_compute_attention` to
-    costomize attention computation to replace the default dot-product
-    attention.
-    Args:
-      rank: the rank of query, key, value tensors.
-    """
-    if self._attention_axes is None:
-      self._attention_axes = tuple(range(1, rank - 2))
-    else:
-      self._attention_axes = tuple(self._attention_axes)
-    self._dot_product_equation, self._combine_equation, attn_scores_rank = (
-        _build_attention_equation(rank, attn_axes=self._attention_axes))
-    norm_axes = tuple(
-        range(attn_scores_rank - len(self._attention_axes), attn_scores_rank))
-    self._softmax = activation.Softmax(axis=norm_axes)
-    self._dropout_layer = regularization.Dropout(rate=self._dropout)
-
-  def _masked_softmax(self, attention_scores, attention_mask=None):
-    # Normalize the attention scores to probabilities.
-    # `attention_scores` = [B, N, T, S]
-    if attention_mask is not None:
-      # The expand dim happens starting from the `num_heads` dimension,
-      # (<batch_dims>, num_heads, <query_attention_dims, key_attention_dims>)
-      mask_expansion_axis = -len(self._attention_axes) * 2 - 1
-      for _ in range(len(attention_scores.shape) - len(attention_mask.shape)):
-        attention_mask = tf.expand_dims(
-            attention_mask, axis=mask_expansion_axis)
-    return self._softmax(attention_scores, attention_mask)
-
-  def _compute_attention(self,
-                         query,
-                         key,
-                         value,
-                         attention_mask=None,
-                         training=None):
-    """Applies Dot-product attention with query, key, value tensors.
-
-    This function defines the computation inside `call` with projected
-    multi-head Q, K, V inputs. Users can override this function for customized
-    attention implementation.
-
-    Args:
-      query: Projected query `Tensor` of shape `(B, T, N, key_dim)`.
-      key: Projected key `Tensor` of shape `(B, T, N, key_dim)`.
-      value: Projected value `Tensor` of shape `(B, T, N, value_dim)`.
- attention_mask: a boolean mask of shape `(B, T, S)`, that prevents - attention to certain positions. - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (doing nothing). - - Returns: - attention_output: Multi-headed outputs of attention computation. - attention_scores: Multi-headed attention weights. - """ - # Note: Applying scalar multiply at the smaller end of einsum improves - # XLA performance, but may introduce slight numeric differences in - # the Transformer attention head. - query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) - - # Take the dot product between "query" and "key" to get the raw - # attention scores. - attention_scores = tf.einsum(self._dot_product_equation, key, query) - - attention_scores = self._masked_softmax(attention_scores, attention_mask) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_scores_dropout = self._dropout_layer( - attention_scores, training=training) - - # `context_layer` = [B, T, N, H] - attention_output = tf.einsum(self._combine_equation, - attention_scores_dropout, value) - return attention_output, attention_scores - - def call(self, - query, - value, - key=None, - attention_mask=None, - return_attention_scores=False, - training=None): - if not self._built_from_signature: - self._build_from_signature(query=query, value=value, key=key) - if key is None: - key = value - - query_is_ragged = isinstance(query, tf.RaggedTensor) - if query_is_ragged: - query_lengths = query.nested_row_lengths() - query = query.to_tensor() - - key_is_ragged = isinstance(key, tf.RaggedTensor) - value_is_ragged = isinstance(value, tf.RaggedTensor) - if key_is_ragged and value_is_ragged: - # Ensure they have the same shape. 
- bounding_shape = tf.math.maximum( - key.bounding_shape(), value.bounding_shape()) - key = key.to_tensor(shape=bounding_shape) - value = value.to_tensor(shape=bounding_shape) - elif key_is_ragged: - key = key.to_tensor(shape=tf.shape(value)) - elif value_is_ragged: - value = value.to_tensor(shape=tf.shape(key)) - - # N = `num_attention_heads` - # H = `size_per_head` - # `query` = [B, T, N ,H] - query = self._query_dense(query) - - # `key` = [B, S, N, H] - key = self._key_dense(key) - - # `value` = [B, S, N, H] - value = self._value_dense(value) - - attention_output, attention_scores = self._compute_attention( - query, key, value, attention_mask, training) - attention_output = self._output_dense(attention_output) - - if query_is_ragged: - attention_output = tf.RaggedTensor.from_tensor( - attention_output, lengths=query_lengths) - - if return_attention_scores: - return attention_output, attention_scores - return attention_output + def __init__( + self, + num_heads, + key_dim, + value_dim=None, + dropout=0.0, + use_bias=True, + output_shape=None, + attention_axes=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs, + ): + super().__init__(**kwargs) + self.supports_masking = True + self._num_heads = num_heads + self._key_dim = key_dim + self._value_dim = value_dim if value_dim else key_dim + self._dropout = dropout + self._use_bias = use_bias + self._output_shape = output_shape + self._kernel_initializer = initializers.get(kernel_initializer) + self._bias_initializer = initializers.get(bias_initializer) + self._kernel_regularizer = regularizers.get(kernel_regularizer) + self._bias_regularizer = regularizers.get(bias_regularizer) + self._activity_regularizer = regularizers.get(activity_regularizer) + self._kernel_constraint = constraints.get(kernel_constraint) + self._bias_constraint = constraints.get(bias_constraint) + if attention_axes is not None and not isinstance( + attention_axes, collections.abc.Sized + ): + self._attention_axes = (attention_axes,) + else: + self._attention_axes = attention_axes + self._built_from_signature = False + self._query_shape, self._key_shape, self._value_shape = None, None, None + + def get_config(self): + config = { + "num_heads": self._num_heads, + "key_dim": self._key_dim, + "value_dim": self._value_dim, + "dropout": self._dropout, + "use_bias": self._use_bias, + "output_shape": self._output_shape, + "attention_axes": self._attention_axes, + "kernel_initializer": initializers.serialize( + self._kernel_initializer + ), + "bias_initializer": initializers.serialize(self._bias_initializer), + "kernel_regularizer": regularizers.serialize( + self._kernel_regularizer + ), + "bias_regularizer": regularizers.serialize(self._bias_regularizer), + "activity_regularizer": regularizers.serialize( + self._activity_regularizer + ), + "kernel_constraint": constraints.serialize(self._kernel_constraint), + "bias_constraint": constraints.serialize(self._bias_constraint), + "query_shape": self._query_shape, + "key_shape": self._key_shape, + "value_shape": self._value_shape, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + # If the layer has a different build() function from the Keras default, + # we need to trigger the customized build to create weights. 
+        query_shape = config.pop("query_shape")
+        key_shape = config.pop("key_shape")
+        value_shape = config.pop("value_shape")
+        layer = cls(**config)
+        if None in [query_shape, key_shape, value_shape]:
+            logging.warning(
+                "One of dimensions of the input shape is missing. It "
+                "should have been memorized when the layer was serialized. "
+                "%s is created without weights.",
+                str(cls),
+            )
+        else:
+            layer._build_from_signature(query_shape, value_shape, key_shape)
+        return layer
+
+    def _build_from_signature(self, query, value, key=None):
+        """Builds layers and variables.
+
+        Once the method is called, self._built_from_signature will be set to
+        True.
+
+        Args:
+            query: Query tensor or TensorShape.
+            value: Value tensor or TensorShape.
+            key: Key tensor or TensorShape.
+        """
+        self._built_from_signature = True
+        if hasattr(query, "shape"):
+            self._query_shape = tf.TensorShape(query.shape)
+        else:
+            self._query_shape = tf.TensorShape(query)
+        if hasattr(value, "shape"):
+            self._value_shape = tf.TensorShape(value.shape)
+        else:
+            self._value_shape = tf.TensorShape(value)
+        if key is None:
+            self._key_shape = self._value_shape
+        elif hasattr(key, "shape"):
+            self._key_shape = tf.TensorShape(key.shape)
+        else:
+            self._key_shape = tf.TensorShape(key)
+
+        # Any setup work performed only once should happen in an `init_scope`
+        # to avoid creating symbolic Tensors that will later pollute any eager
+        # operations.
+        with tf_utils.maybe_init_scope(self):
+            free_dims = self._query_shape.rank - 1
+            einsum_equation, bias_axes, output_rank = _build_proj_equation(
+                free_dims, bound_dims=1, output_dims=2
+            )
+            self._query_dense = core.EinsumDense(
+                einsum_equation,
+                output_shape=_get_output_shape(
+                    output_rank - 1, [self._num_heads, self._key_dim]
+                ),
+                bias_axes=bias_axes if self._use_bias else None,
+                name="query",
+                **self._get_common_kwargs_for_sublayer(),
+            )
+            einsum_equation, bias_axes, output_rank = _build_proj_equation(
+                self._key_shape.rank - 1, bound_dims=1, output_dims=2
+            )
+            self._key_dense = core.EinsumDense(
+                einsum_equation,
+                output_shape=_get_output_shape(
+                    output_rank - 1, [self._num_heads, self._key_dim]
+                ),
+                bias_axes=bias_axes if self._use_bias else None,
+                name="key",
+                **self._get_common_kwargs_for_sublayer(),
+            )
+            einsum_equation, bias_axes, output_rank = _build_proj_equation(
+                self._value_shape.rank - 1, bound_dims=1, output_dims=2
+            )
+            self._value_dense = core.EinsumDense(
+                einsum_equation,
+                output_shape=_get_output_shape(
+                    output_rank - 1, [self._num_heads, self._value_dim]
+                ),
+                bias_axes=bias_axes if self._use_bias else None,
+                name="value",
+                **self._get_common_kwargs_for_sublayer(),
+            )
+
+            # Builds the attention computations for multi-head dot product
+            # attention. These computations could be wrapped into the keras
+            # attention layer once it supports multi-head einsum computations.
+            self._build_attention(output_rank)
+            self._output_dense = self._make_output_dense(
+                free_dims,
+                self._get_common_kwargs_for_sublayer(),
+                "attention_output",
+            )
+
+    def _get_common_kwargs_for_sublayer(self):
+        common_kwargs = dict(
+            kernel_regularizer=self._kernel_regularizer,
+            bias_regularizer=self._bias_regularizer,
+            activity_regularizer=self._activity_regularizer,
+            kernel_constraint=self._kernel_constraint,
+            bias_constraint=self._bias_constraint,
+            dtype=self._dtype_policy,
+        )
+        # Create new clone of kernel/bias initializer, so that we don't reuse
+        # the initializer instance, which could lead to same init value since
+        # initializer is stateless.
+        kernel_initializer = self._kernel_initializer.__class__.from_config(
+            self._kernel_initializer.get_config()
+        )
+        bias_initializer = self._bias_initializer.__class__.from_config(
+            self._bias_initializer.get_config()
+        )
+        common_kwargs["kernel_initializer"] = kernel_initializer
+        common_kwargs["bias_initializer"] = bias_initializer
+        return common_kwargs
+
+    def _make_output_dense(self, free_dims, common_kwargs, name=None):
+        """Builds the output projection matrix.
+
+        Args:
+            free_dims: Number of free dimensions for einsum equation building.
+            common_kwargs: Common keyword arguments for einsum layer.
+            name: Name for the projection layer.
+
+        Returns:
+            Projection layer.
+        """
+        if self._output_shape:
+            if not isinstance(self._output_shape, collections.abc.Sized):
+                output_shape = [self._output_shape]
+            else:
+                output_shape = self._output_shape
+        else:
+            output_shape = [self._query_shape[-1]]
+        einsum_equation, bias_axes, output_rank = _build_proj_equation(
+            free_dims, bound_dims=2, output_dims=len(output_shape)
+        )
+        return core.EinsumDense(
+            einsum_equation,
+            output_shape=_get_output_shape(output_rank - 1, output_shape),
+            bias_axes=bias_axes if self._use_bias else None,
+            name=name,
+            **common_kwargs,
+        )
+
+    def _build_attention(self, rank):
+        """Builds multi-head dot-product attention computations.
+
+        This function builds attributes necessary for `_compute_attention` to
+        customize attention computation to replace the default dot-product
+        attention.
+
+        Args:
+            rank: the rank of query, key, value tensors.
+        """
+        if self._attention_axes is None:
+            self._attention_axes = tuple(range(1, rank - 2))
+        else:
+            self._attention_axes = tuple(self._attention_axes)
+        (
+            self._dot_product_equation,
+            self._combine_equation,
+            attn_scores_rank,
+        ) = _build_attention_equation(rank, attn_axes=self._attention_axes)
+        norm_axes = tuple(
+            range(
+                attn_scores_rank - len(self._attention_axes), attn_scores_rank
+            )
+        )
+        self._softmax = activation.Softmax(
+            axis=norm_axes, dtype=self._dtype_policy
+        )
+        self._dropout_layer = regularization.Dropout(
+            rate=self._dropout, dtype=self._dtype_policy
+        )
+
+    def _masked_softmax(self, attention_scores, attention_mask=None):
+        # Normalize the attention scores to probabilities.
+        # `attention_scores` = [B, N, T, S]
+        if attention_mask is not None:
+            # The expand dim happens starting from the `num_heads` dimension,
+            # (<batch_dims>, num_heads, <query_attention_dims,
+            # key_attention_dims>)
+            mask_expansion_axis = -len(self._attention_axes) * 2 - 1
+            for _ in range(
+                len(attention_scores.shape) - len(attention_mask.shape)
+            ):
+                attention_mask = tf.expand_dims(
+                    attention_mask, axis=mask_expansion_axis
+                )
+        return self._softmax(attention_scores, attention_mask)
+
+    def _compute_attention(
+        self, query, key, value, attention_mask=None, training=None
+    ):
+        """Applies Dot-product attention with query, key, value tensors.
+
+        This function defines the computation inside `call` with projected
+        multi-head Q, K, V inputs. Users can override this function for
+        customized attention implementation.
+
+        Args:
+            query: Projected query `Tensor` of shape `(B, T, N, key_dim)`.
+            key: Projected key `Tensor` of shape `(B, S, N, key_dim)`.
+            value: Projected value `Tensor` of shape `(B, S, N, value_dim)`.
+            attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
+                attention to certain positions. It is generally not needed if
+                the `query` and `value` (and/or `key`) are masked.
+ training: Python boolean indicating whether the layer should behave + in training mode (adding dropout) or in inference mode (doing + nothing). + + Returns: + attention_output: Multi-headed outputs of attention computation. + attention_scores: Multi-headed attention weights. + """ + # Note: Applying scalar multiply at the smaller end of einsum improves + # XLA performance, but may introduce slight numeric differences in + # the Transformer attention head. + query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + attention_scores = tf.einsum(self._dot_product_equation, key, query) + attention_scores = self._masked_softmax( + attention_scores, attention_mask + ) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_scores_dropout = self._dropout_layer( + attention_scores, training=training + ) + # `context_layer` = [B, T, N, H] + attention_output = tf.einsum( + self._combine_equation, attention_scores_dropout, value + ) + return attention_output, attention_scores + + def call( + self, + query, + value, + key=None, + attention_mask=None, + return_attention_scores=False, + training=None, + use_causal_mask=False, + ): + if not self._built_from_signature: + self._build_from_signature(query=query, value=value, key=key) + if key is None: + key = value + + # Convert RaggedTensor to Tensor. + query_is_ragged = isinstance(query, tf.RaggedTensor) + if query_is_ragged: + query_lengths = query.nested_row_lengths() + query = query.to_tensor() + key_is_ragged = isinstance(key, tf.RaggedTensor) + value_is_ragged = isinstance(value, tf.RaggedTensor) + if key_is_ragged and value_is_ragged: + # Ensure they have the same shape. + bounding_shape = tf.math.maximum( + key.bounding_shape(), value.bounding_shape() + ) + key = key.to_tensor(shape=bounding_shape) + value = value.to_tensor(shape=bounding_shape) + elif key_is_ragged: + key = key.to_tensor(shape=tf.shape(value)) + elif value_is_ragged: + value = value.to_tensor(shape=tf.shape(key)) + + attention_mask = self._compute_attention_mask( + query, + value, + key=key, + attention_mask=attention_mask, + use_causal_mask=use_causal_mask, + ) + + # N = `num_attention_heads` + # H = `size_per_head` + # `query` = [B, T, N ,H] + query = self._query_dense(query) + + # `key` = [B, S, N, H] + key = self._key_dense(key) + + # `value` = [B, S, N, H] + value = self._value_dense(value) + + attention_output, attention_scores = self._compute_attention( + query, key, value, attention_mask, training + ) + attention_output = self._output_dense(attention_output) + + if query_is_ragged: + attention_output = tf.RaggedTensor.from_tensor( + attention_output, lengths=query_lengths + ) + + if return_attention_scores: + return attention_output, attention_scores + return attention_output + + def _compute_attention_mask( + self, query, value, key=None, attention_mask=None, use_causal_mask=False + ): + """Computes the attention mask, using the Keras masks of the inputs. + + * The `query`'s mask is reshaped from [B, T] to [B, T, 1]. + * The `value`'s mask is reshaped from [B, S] to [B, 1, S]. + * The `key`'s mask is reshaped from [B, S] to [B, 1, S]. The `key`'s + mask is ignored if `key` is `None` or if `key is value`. + * If `use_causal_mask=True`, then the causal mask is computed. Its shape + is [1, T, S]. + + All defined masks are merged using a logical AND operation (`&`). 
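A tiny illustration of the reshape-and-AND merge described above, together with the lower-triangular causal mask that `_compute_causal_mask` (below) builds via `tf.linalg.band_part`. This snippet is editorial, not part of the patch; the mask values are made up:

```python
import tensorflow as tf

# query mask [B, T] -> [B, T, 1]; value mask [B, S] -> [B, 1, S];
# a broadcasted logical AND then yields the [B, T, S] attention mask.
query_mask = tf.constant([[True, True, False]])  # [B=1, T=3]
value_mask = tf.constant([[True, False]])        # [B=1, S=2]
auto_mask = query_mask[:, :, tf.newaxis] & value_mask[:, tf.newaxis, :]
print(auto_mask.numpy())
# [[[ True False]
#   [ True False]
#   [False False]]]

# The causal mask is lower triangular: position i attends only to j <= i.
causal = tf.linalg.band_part(tf.ones((1, 3, 3), tf.bool), -1, 0)
print(causal.numpy())
# [[[ True False False]
#   [ True  True False]
#   [ True  True  True]]]
```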
+
+        In general, if the `query` and `value` are masked, then there is no
+        need to define the `attention_mask`.
+
+        Args:
+            query: Query `Tensor` of shape `(B, T, dim)`.
+            key: Key `Tensor` of shape `(B, S, dim)`.
+            value: Value `Tensor` of shape `(B, S, dim)`.
+            attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
+                attention to certain positions.
+            use_causal_mask: A boolean to indicate whether to apply a causal
+                mask to prevent tokens from attending to future tokens (e.g.,
+                used in a decoder Transformer).
+
+        Returns:
+            attention_mask: a boolean mask of shape `(B, T, S)`, that prevents
+                attention to certain positions, based on the Keras masks of the
+                `query`, `key`, `value`, and `attention_mask` tensors, and the
+                causal mask if `use_causal_mask=True`.
+        """
+        query_mask = getattr(query, "_keras_mask", None)
+        value_mask = getattr(value, "_keras_mask", None)
+        key_mask = getattr(key, "_keras_mask", None)
+        auto_mask = None
+        if query_mask is not None:
+            query_mask = tf.cast(query_mask, tf.bool)  # defensive casting
+            # B = batch size, T = max query length
+            auto_mask = query_mask[:, :, tf.newaxis]  # shape is [B, T, 1]
+        if value_mask is not None:
+            value_mask = tf.cast(value_mask, tf.bool)  # defensive casting
+            # B = batch size, S == max value length
+            mask = value_mask[:, tf.newaxis, :]  # shape is [B, 1, S]
+            auto_mask = mask if auto_mask is None else auto_mask & mask
+        if key_mask is not None:
+            key_mask = tf.cast(key_mask, tf.bool)  # defensive casting
+            # B == batch size, S == max key length == max value length
+            mask = key_mask[:, tf.newaxis, :]  # shape is [B, 1, S]
+            auto_mask = mask if auto_mask is None else auto_mask & mask
+        if use_causal_mask:
+            # the shape of the causal mask is [1, T, S]
+            mask = self._compute_causal_mask(query, value)
+            auto_mask = mask if auto_mask is None else auto_mask & mask
+        if auto_mask is not None:
+            # merge attention_mask & automatic mask, to shape [B, T, S]
+            attention_mask = (
+                auto_mask
+                if attention_mask is None
+                else tf.cast(attention_mask, bool) & auto_mask
+            )
+        return attention_mask
+
+    def _compute_causal_mask(self, query, value=None):
+        """Computes a causal mask (e.g., for masked self-attention layers).
+
+        For example, if query and value both contain sequences of length 4,
+        this function returns a boolean `Tensor` equal to:
+
+        ```
+        [[[True, False, False, False],
+          [True, True, False, False],
+          [True, True, True, False],
+          [True, True, True, True]]]
+        ```
+
+        Args:
+            query: query `Tensor` of shape `(B, T, ...)`.
+            value: value `Tensor` of shape `(B, S, ...)` (optional, defaults to
+                query).
+
+        Returns:
+            mask: a boolean `Tensor` of shape [1, T, S] containing a lower
+                triangular matrix of shape [T, S].
+        """
+        q_seq_length = tf.shape(query)[1]
+        v_seq_length = q_seq_length if value is None else tf.shape(value)[1]
+        return tf.linalg.band_part(  # creates a lower triangular matrix
+            tf.ones((1, q_seq_length, v_seq_length), tf.bool), -1, 0
+        )
+
+    def compute_output_shape(self, query_shape, value_shape, key_shape=None):
+        if key_shape is None:
+            key_shape = value_shape
+
+        query_shape = tf.TensorShape(query_shape)
+        value_shape = tf.TensorShape(value_shape)
+        key_shape = tf.TensorShape(key_shape)
+
+        if query_shape[-1] != value_shape[-1]:
+            raise ValueError(
+                "The last dimension of `query_shape` and `value_shape` "
+                f"must be equal, but are {query_shape[-1]}, {value_shape[-1]}. "
+                f"Received: query_shape={query_shape}, value_shape={value_shape}"
+            )
+
+        if value_shape[1:-1] != key_shape[1:-1]:
+            raise ValueError(
+                "All dimensions of `value` and `key`, except the last one, "
+                f"must be equal. Received {value_shape} and "
+                f"{key_shape}"
+            )
+
+        if self._output_shape:
+            return query_shape[:-1].concatenate(self._output_shape)
+
+        return query_shape
diff --git a/keras/layers/attention/multi_head_attention_test.py b/keras/layers/attention/multi_head_attention_test.py
index fcd73cd4d194..aa4d15aed6f5 100644
--- a/keras/layers/attention/multi_head_attention_test.py
+++ b/keras/layers/attention/multi_head_attention_test.py
@@ -14,358 +14,611 @@
 # ==============================================================================
 """Tests for the MultiHeadAttention layer."""
 
+import numpy as np
+import tensorflow.compat.v2 as tf
 from absl.testing import parameterized
+
 import keras
+from keras.saving import object_registration
 from keras.testing_infra import test_combinations
-import numpy as np
-import tensorflow.compat.v2 as tf
+from keras.testing_infra import test_utils
 
 
 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
 # guarantees forward compatibility of this code for the V2 switchover.
 @test_combinations.run_all_keras_modes
 class MultiHeadAttentionTest(test_combinations.TestCase):
-
-  @parameterized.named_parameters(
-      ("key_value_same_proj", None, None, [40, 80]),
-      ("key_value_different_proj", 32, 60, [40, 60]),
-  )
-  def test_non_masked_attention(self, value_dim, output_shape, output_dims):
-    """Test that the attention layer can be created without a mask tensor."""
-    test_layer = keras.layers.MultiHeadAttention(
-        num_heads=12,
-        key_dim=64,
-        value_dim=value_dim,
-        output_shape=output_shape)
-    # Create a 3-dimensional input (the first dimension is implicit).
-    query = keras.Input(shape=(40, 80))
-    value = keras.Input(shape=(20, 80))
-    output = test_layer(query=query, value=value)
-    self.assertEqual(output.shape.as_list(), [None] + output_dims)
-
-  def test_non_masked_self_attention(self):
-    """Test with one input (self-attenntion) and no mask tensor."""
-    test_layer = keras.layers.MultiHeadAttention(
-        num_heads=12, key_dim=64)
-    # Create a 3-dimensional input (the first dimension is implicit).
-    query = keras.Input(shape=(40, 80))
-    output = test_layer(query, query)
-    self.assertEqual(output.shape.as_list(), [None, 40, 80])
-
-  def test_attention_scores(self):
-    """Test attention outputs with coefficients."""
-    test_layer = keras.layers.MultiHeadAttention(
-        num_heads=12, key_dim=64)
-    # Create a 3-dimensional input (the first dimension is implicit).
-    query = keras.Input(shape=(40, 80))
-    output, coef = test_layer(query, query, return_attention_scores=True)
-    self.assertEqual(output.shape.as_list(), [None, 40, 80])
-    self.assertEqual(coef.shape.as_list(), [None, 12, 40, 40])
-
-  def test_attention_scores_with_values(self):
-    """Test attention outputs with coefficients."""
-    test_layer = keras.layers.MultiHeadAttention(
-        num_heads=12, key_dim=64)
-    # Create a 3-dimensional input (the first dimension is implicit).
- query = keras.Input(shape=(40, 80)) - value = keras.Input(shape=(60, 80)) - output, coef = test_layer(query, value, return_attention_scores=True) - self.assertEqual(output.shape.as_list(), [None, 40, 80]) - self.assertEqual(coef.shape.as_list(), [None, 12, 40, 60]) - - @parameterized.named_parameters(("with_bias", True), ("no_bias", False)) - def test_masked_attention(self, use_bias): - """Test with a mask tensor.""" - test_layer = keras.layers.MultiHeadAttention( - num_heads=2, key_dim=2, use_bias=use_bias) - # Create a 3-dimensional input (the first dimension is implicit). - batch_size = 3 - query = keras.Input(shape=(4, 8)) - value = keras.Input(shape=(2, 8)) - mask_tensor = keras.Input(shape=(4, 2)) - output = test_layer(query=query, value=value, attention_mask=mask_tensor) - - # Create a model containing the test layer. - model = keras.Model([query, value, mask_tensor], output) - - # Generate data for the input (non-mask) tensors. - from_data = 10 * np.random.random_sample((batch_size, 4, 8)) - to_data = 10 * np.random.random_sample((batch_size, 2, 8)) - - # Invoke the data with a random set of mask data. This should mask at least - # one element. - mask_data = np.random.randint(2, size=(batch_size, 4, 2)) - masked_output_data = model.predict([from_data, to_data, mask_data]) - - # Invoke the same data, but with a null mask (where no elements are masked). - null_mask_data = np.ones((batch_size, 4, 2)) - unmasked_output_data = model.predict([from_data, to_data, null_mask_data]) - - # Because one data is masked and one is not, the outputs should not be the - # same. - self.assertNotAllClose(masked_output_data, unmasked_output_data) - - # Tests the layer with three inputs: Q, K, V. - key = keras.Input(shape=(2, 8)) - output = test_layer(query, value=value, key=key, attention_mask=mask_tensor) - model = keras.Model([query, value, key, mask_tensor], output) - - masked_output_data = model.predict([from_data, to_data, to_data, mask_data]) - unmasked_output_data = model.predict( - [from_data, to_data, to_data, null_mask_data]) - # Because one data is masked and one is not, the outputs should not be the - # same. - self.assertNotAllClose(masked_output_data, unmasked_output_data) - - if use_bias: - self.assertLen(test_layer._query_dense.trainable_variables, 2) - self.assertLen(test_layer._output_dense.trainable_variables, 2) - else: - self.assertLen(test_layer._query_dense.trainable_variables, 1) - self.assertLen(test_layer._output_dense.trainable_variables, 1) - - def test_initializer(self): - """Test with a specified initializer.""" - test_layer = keras.layers.MultiHeadAttention( - num_heads=12, - key_dim=64, - kernel_initializer=keras.initializers.TruncatedNormal(stddev=0.02)) - # Create a 3-dimensional input (the first dimension is implicit). - query = keras.Input(shape=(40, 80)) - output = test_layer(query, query) - self.assertEqual(output.shape.as_list(), [None, 40, 80]) - - # Make sure the sub layers have different kernel init value, and not reusing - # the initializers. 
- self.assertNotAllClose(keras.backend.eval(test_layer._query_dense.kernel), - keras.backend.eval(test_layer._key_dense.kernel)) - self.assertNotAllClose(keras.backend.eval(test_layer._query_dense.kernel), - keras.backend.eval(test_layer._value_dense.kernel)) - self.assertNotAllClose(keras.backend.eval(test_layer._query_dense.kernel), - keras.backend.eval(test_layer._output_dense.kernel)) - - def test_masked_attention_with_scores(self): - """Test with a mask tensor.""" - test_layer = keras.layers.MultiHeadAttention( - num_heads=2, key_dim=2) - # Create a 3-dimensional input (the first dimension is implicit). - batch_size = 3 - query = keras.Input(shape=(4, 8)) - value = keras.Input(shape=(2, 8)) - mask_tensor = keras.Input(shape=(4, 2)) - output = test_layer(query=query, value=value, attention_mask=mask_tensor) - - # Create a model containing the test layer. - model = keras.Model([query, value, mask_tensor], output) - - # Generate data for the input (non-mask) tensors. - from_data = 10 * np.random.random_sample((batch_size, 4, 8)) - to_data = 10 * np.random.random_sample((batch_size, 2, 8)) - - # Invoke the data with a random set of mask data. This should mask at least - # one element. - mask_data = np.random.randint(2, size=(batch_size, 4, 2)) - masked_output_data = model.predict([from_data, to_data, mask_data]) - - # Invoke the same data, but with a null mask (where no elements are masked). - null_mask_data = np.ones((batch_size, 4, 2)) - unmasked_output_data = model.predict([from_data, to_data, null_mask_data]) - - # Because one data is masked and one is not, the outputs should not be the - # same. - self.assertNotAllClose(masked_output_data, unmasked_output_data) - - # Create a model containing attention scores. - output, scores = test_layer( - query=query, value=value, attention_mask=mask_tensor, - return_attention_scores=True) - model = keras.Model([query, value, mask_tensor], [output, scores]) - masked_output_data_score, masked_score = model.predict( - [from_data, to_data, mask_data]) - unmasked_output_data_score, unmasked_score = model.predict( - [from_data, to_data, null_mask_data]) - self.assertNotAllClose(masked_output_data_score, unmasked_output_data_score) - self.assertAllClose(masked_output_data, masked_output_data_score) - self.assertAllClose(unmasked_output_data, unmasked_output_data_score) - self.assertNotAllClose(masked_score, unmasked_score) - - @parameterized.named_parameters( - ("4d_inputs_1freebatch_mask2", [3, 4], [3, 2], [4, 2], - (2,)), ("4d_inputs_1freebatch_mask3", [3, 4], [3, 2], [3, 4, 2], (2,)), - ("4d_inputs_1freebatch_mask4", [3, 4], [3, 2], [3, 2, 4, 2], - (2,)), ("4D_inputs_2D_attention", [3, 4], [3, 2], [3, 4, 3, 2], (1, 2)), - ("5D_inputs_2D_attention", [5, 3, 4], [5, 3, 2], [3, 4, 3, 2], (2, 3)), - ("5D_inputs_2D_attention_fullmask", [5, 3, 4], [5, 3, 2], [5, 3, 4, 3, 2], - (2, 3))) - def test_high_dim_attention(self, q_dims, v_dims, mask_dims, attention_axes): - """Test with a mask tensor.""" - test_layer = keras.layers.MultiHeadAttention( - num_heads=2, key_dim=2, attention_axes=attention_axes) - batch_size, hidden_size = 3, 8 - # Generate data for the input (non-mask) tensors. - query_shape = [batch_size] + q_dims + [hidden_size] - value_shape = [batch_size] + v_dims + [hidden_size] - mask_shape = [batch_size] + mask_dims - query = 10 * np.random.random_sample(query_shape) - value = 10 * np.random.random_sample(value_shape) - - # Invoke the data with a random set of mask data. This should mask at least - # one element. 
-    mask_data = np.random.randint(2, size=mask_shape).astype("bool")
-    # Invoke the same data, but with a null mask (where no elements are masked).
-    null_mask_data = np.ones(mask_shape)
-    # Because one data is masked and one is not, the outputs should not be the
-    # same.
-    query_tensor = keras.Input(query_shape[1:], name="query")
-    value_tensor = keras.Input(value_shape[1:], name="value")
-    mask_tensor = keras.Input(mask_shape[1:], name="mask")
-    output = test_layer(query=query_tensor, value=value_tensor,
-                        attention_mask=mask_tensor)
-    model = keras.Model([query_tensor, value_tensor, mask_tensor], output)
-
-    self.assertNotAllClose(
-        model.predict([query, value, mask_data]),
-        model.predict([query, value, null_mask_data]))
-
-  def test_dropout(self):
-    test_layer = keras.layers.MultiHeadAttention(
-        num_heads=2, key_dim=2, dropout=0.5)
-
-    # Generate data for the input (non-mask) tensors.
-    from_data = keras.backend.ones(shape=(32, 4, 8))
-    to_data = keras.backend.ones(shape=(32, 2, 8))
-    train_out = test_layer(from_data, to_data, None, None, None, True)
-    test_out = test_layer(from_data, to_data, None, None, None, False)
-
-    # Output should be close when not in training mode,
-    # and should not be close when enabling dropout in training mode.
-    self.assertNotAllClose(
-        keras.backend.eval(train_out),
-        keras.backend.eval(test_out))
-
-  @test_combinations.generate(test_combinations.combine(
-      ragged_query=[True, False],
-      ragged_value=[True, False],
-      ragged_key=[True, False]))
-  def test_ragged_tensor(self, ragged_query, ragged_value, ragged_key):
-    if ragged_query:
-      query = tf.ragged.constant(
-          [[[3., 1.], [4., 1.]], [[5., 9.], [2., 6.], [3., 1.]], [[1., 2.]]],
-          inner_shape=(2,))
-    else:
-      query = keras.backend.ones(shape=(3, 2, 2))
-
-    if ragged_value:
-      value = tf.ragged.constant(
-          [[[3., 1.], [4., 1.]], [[5., 9.]], [[1., 2.]]], inner_shape=(2,))
-    else:
-      value = keras.backend.ones(shape=(3, 4, 2))
-
-    if ragged_key:
-      key = tf.ragged.constant(
-          [[[3., 1.], [4., 1.]],
-           [[5., 9.], [2., 6.], [3., 1.], [1., 5.]],
-           [[1., 2.]]],
-          inner_shape=(2,))
-    else:
-      key = keras.backend.ones(shape=(3, 4, 2))
-
-    test_layer = keras.layers.MultiHeadAttention(num_heads=5, key_dim=2)
-    results = test_layer(query, value, key)
-    self.assertAllEqual(results.shape.as_list(), query.shape.as_list())
+    @parameterized.named_parameters(
+        ("key_value_same_proj", None, None, [40, 80]),
+        ("key_value_different_proj", 32, 60, [40, 60]),
+    )
+    def test_non_masked_attention(self, value_dim, output_shape, output_dims):
+        """Test that the attention layer can be created without a mask
+        tensor."""
+        test_layer = keras.layers.MultiHeadAttention(
+            num_heads=12,
+            key_dim=64,
+            value_dim=value_dim,
+            output_shape=output_shape,
+        )
+        # Create a 3-dimensional input (the first dimension is implicit).
+        query = keras.Input(shape=(40, 80))
+        value = keras.Input(shape=(20, 80))
+        output = test_layer(query=query, value=value)
+        self.assertEqual(output.shape.as_list(), [None] + output_dims)
+
+    def test_non_masked_self_attention(self):
+        """Test with one input (self-attention) and no mask tensor."""
+        test_layer = keras.layers.MultiHeadAttention(num_heads=12, key_dim=64)
+        # Create a 3-dimensional input (the first dimension is implicit).
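+        # Self-attention: the same tensor is passed as both `query` and
+        # `value`.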
+ query = keras.Input(shape=(40, 80)) + output = test_layer(query, query) + self.assertEqual(output.shape.as_list(), [None, 40, 80]) + + def test_attention_scores(self): + """Test attention outputs with coefficients.""" + test_layer = keras.layers.MultiHeadAttention(num_heads=12, key_dim=64) + # Create a 3-dimensional input (the first dimension is implicit). + query = keras.Input(shape=(40, 80)) + output, coef = test_layer(query, query, return_attention_scores=True) + self.assertEqual(output.shape.as_list(), [None, 40, 80]) + self.assertEqual(coef.shape.as_list(), [None, 12, 40, 40]) + + def test_attention_scores_with_values(self): + """Test attention outputs with coefficients.""" + test_layer = keras.layers.MultiHeadAttention(num_heads=12, key_dim=64) + # Create a 3-dimensional input (the first dimension is implicit). + query = keras.Input(shape=(40, 80)) + value = keras.Input(shape=(60, 80)) + output, coef = test_layer(query, value, return_attention_scores=True) + self.assertEqual(output.shape.as_list(), [None, 40, 80]) + self.assertEqual(coef.shape.as_list(), [None, 12, 40, 60]) + + @parameterized.named_parameters(("with_bias", True), ("no_bias", False)) + def test_masked_attention(self, use_bias): + """Test with a mask tensor.""" + test_layer = keras.layers.MultiHeadAttention( + num_heads=2, key_dim=2, use_bias=use_bias + ) + # Create a 3-dimensional input (the first dimension is implicit). + batch_size = 3 + query = keras.Input(shape=(4, 8)) + value = keras.Input(shape=(2, 8)) + mask_tensor = keras.Input(shape=(4, 2)) + output = test_layer( + query=query, value=value, attention_mask=mask_tensor + ) + + # Create a model containing the test layer. + model = keras.Model([query, value, mask_tensor], output) + + # Generate data for the input (non-mask) tensors. + from_data = 10 * np.random.random_sample((batch_size, 4, 8)) + to_data = 10 * np.random.random_sample((batch_size, 2, 8)) + + # Invoke the data with a random set of mask data. This should mask at + # least one element. + mask_data = np.random.randint(2, size=(batch_size, 4, 2)) + masked_output_data = model.predict([from_data, to_data, mask_data]) + + # Invoke the same data, but with a null mask (where no elements are + # masked). + null_mask_data = np.ones((batch_size, 4, 2)) + unmasked_output_data = model.predict( + [from_data, to_data, null_mask_data] + ) + + # Because one data is masked and one is not, the outputs should not be + # the same. + self.assertNotAllClose(masked_output_data, unmasked_output_data) + + # Tests the layer with three inputs: Q, K, V. + key = keras.Input(shape=(2, 8)) + output = test_layer( + query, value=value, key=key, attention_mask=mask_tensor + ) + model = keras.Model([query, value, key, mask_tensor], output) + + masked_output_data = model.predict( + [from_data, to_data, to_data, mask_data] + ) + unmasked_output_data = model.predict( + [from_data, to_data, to_data, null_mask_data] + ) + # Because one data is masked and one is not, the outputs should not be + # the same. 
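+        # Masked positions receive (near-)zero attention weight, so the
+        # softmax renormalizes over the remaining positions and the outputs
+        # differ.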
+        self.assertNotAllClose(masked_output_data, unmasked_output_data)
+
+        if use_bias:
+            self.assertLen(test_layer._query_dense.trainable_variables, 2)
+            self.assertLen(test_layer._output_dense.trainable_variables, 2)
+        else:
+            self.assertLen(test_layer._query_dense.trainable_variables, 1)
+            self.assertLen(test_layer._output_dense.trainable_variables, 1)
+
+    def test_initializer(self):
+        """Test with a specified initializer."""
+        test_layer = keras.layers.MultiHeadAttention(
+            num_heads=12,
+            key_dim=64,
+            kernel_initializer=keras.initializers.TruncatedNormal(stddev=0.02),
+        )
+        # Create a 3-dimensional input (the first dimension is implicit).
+        query = keras.Input(shape=(40, 80))
+        output = test_layer(query, query)
+        self.assertEqual(output.shape.as_list(), [None, 40, 80])
+
+        # Make sure the sub layers have different kernel init values, and are
+        # not reusing the initializers.
+        self.assertNotAllClose(
+            keras.backend.eval(test_layer._query_dense.kernel),
+            keras.backend.eval(test_layer._key_dense.kernel),
+        )
+        self.assertNotAllClose(
+            keras.backend.eval(test_layer._query_dense.kernel),
+            keras.backend.eval(test_layer._value_dense.kernel),
+        )
+        self.assertNotAllClose(
+            keras.backend.eval(test_layer._query_dense.kernel),
+            keras.backend.eval(test_layer._output_dense.kernel),
+        )
+
+    @parameterized.named_parameters(
+        ("bfloat16", tf.bfloat16),
+        ("float16", tf.float16),
+        ("float32", tf.float32),
+        ("float64", tf.float64),
+    )
+    def test_sublayer_dtypes(self, dtype):
+        test_layer = keras.layers.MultiHeadAttention(
+            num_heads=12, key_dim=64, dtype=dtype
+        )
+
+        query = keras.Input(shape=(40, 80), dtype=dtype)
+        # Build the layer.
+        test_layer(query=query, value=query)
+
+        self.assertEqual(test_layer._query_dense.dtype, dtype)
+        self.assertEqual(test_layer._key_dense.dtype, dtype)
+        self.assertEqual(test_layer._value_dense.dtype, dtype)
+        self.assertEqual(test_layer._output_dense.dtype, dtype)
+
+    def test_masked_attention_with_scores(self):
+        """Test with a mask tensor."""
+        test_layer = keras.layers.MultiHeadAttention(num_heads=2, key_dim=2)
+        # Create a 3-dimensional input (the first dimension is implicit).
+        batch_size = 3
+        query = keras.Input(shape=(4, 8))
+        value = keras.Input(shape=(2, 8))
+        mask_tensor = keras.Input(shape=(4, 2))
+        output = test_layer(
+            query=query, value=value, attention_mask=mask_tensor
+        )
+
+        # Create a model containing the test layer.
+        model = keras.Model([query, value, mask_tensor], output)
+
+        # Generate data for the input (non-mask) tensors.
+        from_data = 10 * np.random.random_sample((batch_size, 4, 8))
+        to_data = 10 * np.random.random_sample((batch_size, 2, 8))
+
+        # Invoke the data with a random set of mask data. This should mask at
+        # least one element.
+        mask_data = np.random.randint(2, size=(batch_size, 4, 2))
+        masked_output_data = model.predict([from_data, to_data, mask_data])
+
+        # Invoke the same data, but with a null mask (where no elements are
+        # masked).
+        null_mask_data = np.ones((batch_size, 4, 2))
+        unmasked_output_data = model.predict(
+            [from_data, to_data, null_mask_data]
+        )
+
+        # Because one data is masked and one is not, the outputs should not be
+        # the same.
+        self.assertNotAllClose(masked_output_data, unmasked_output_data)
+
+        # Create a model containing attention scores.
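+        # With `return_attention_scores=True`, the layer returns a tuple of
+        # (attention_output, attention_scores).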
+ output, scores = test_layer( + query=query, + value=value, + attention_mask=mask_tensor, + return_attention_scores=True, + ) + model = keras.Model([query, value, mask_tensor], [output, scores]) + masked_output_data_score, masked_score = model.predict( + [from_data, to_data, mask_data] + ) + unmasked_output_data_score, unmasked_score = model.predict( + [from_data, to_data, null_mask_data] + ) + self.assertNotAllClose( + masked_output_data_score, unmasked_output_data_score + ) + self.assertAllClose(masked_output_data, masked_output_data_score) + self.assertAllClose(unmasked_output_data, unmasked_output_data_score) + self.assertNotAllClose(masked_score, unmasked_score) + + @parameterized.named_parameters( + ("4d_inputs_1freebatch_mask2", [3, 4], [3, 2], [4, 2], (2,)), + ("4d_inputs_1freebatch_mask3", [3, 4], [3, 2], [3, 4, 2], (2,)), + ("4d_inputs_1freebatch_mask4", [3, 4], [3, 2], [3, 2, 4, 2], (2,)), + ("4D_inputs_2D_attention", [3, 4], [3, 2], [3, 4, 3, 2], (1, 2)), + ("5D_inputs_2D_attention", [5, 3, 4], [5, 3, 2], [3, 4, 3, 2], (2, 3)), + ( + "5D_inputs_2D_attention_fullmask", + [5, 3, 4], + [5, 3, 2], + [5, 3, 4, 3, 2], + (2, 3), + ), + ) + def test_high_dim_attention( + self, q_dims, v_dims, mask_dims, attention_axes + ): + """Test with a mask tensor.""" + test_layer = keras.layers.MultiHeadAttention( + num_heads=2, key_dim=2, attention_axes=attention_axes + ) + batch_size, hidden_size = 3, 8 + # Generate data for the input (non-mask) tensors. + query_shape = [batch_size] + q_dims + [hidden_size] + value_shape = [batch_size] + v_dims + [hidden_size] + mask_shape = [batch_size] + mask_dims + query = 10 * np.random.random_sample(query_shape) + value = 10 * np.random.random_sample(value_shape) + + # Invoke the data with a random set of mask data. This should mask at + # least one element. + mask_data = np.random.randint(2, size=mask_shape).astype("bool") + # Invoke the same data, but with a null mask (where no elements are + # masked). + null_mask_data = np.ones(mask_shape) + # Because one data is masked and one is not, the outputs should not be + # the same. + query_tensor = keras.Input(query_shape[1:], name="query") + value_tensor = keras.Input(value_shape[1:], name="value") + mask_tensor = keras.Input(mask_shape[1:], name="mask") + output = test_layer( + query=query_tensor, value=value_tensor, attention_mask=mask_tensor + ) + model = keras.Model([query_tensor, value_tensor, mask_tensor], output) + + self.assertNotAllClose( + model.predict([query, value, mask_data]), + model.predict([query, value, null_mask_data]), + ) + + def test_dropout(self): + test_layer = keras.layers.MultiHeadAttention( + num_heads=2, key_dim=2, dropout=0.5 + ) + + # Generate data for the input (non-mask) tensors. + from_data = keras.backend.ones(shape=(32, 4, 8)) + to_data = keras.backend.ones(shape=(32, 2, 8)) + train_out = test_layer(from_data, to_data, None, None, None, True) + test_out = test_layer(from_data, to_data, None, None, None, False) + + # Output should be close when not in training mode, + # and should not be close when enabling dropout in training mode. 
+        self.assertNotAllClose(
+            keras.backend.eval(train_out), keras.backend.eval(test_out)
+        )
+
+    @test_combinations.generate(
+        test_combinations.combine(
+            ragged_query=[True, False],
+            ragged_value=[True, False],
+            ragged_key=[True, False],
+        )
+    )
+    def test_ragged_tensor(self, ragged_query, ragged_value, ragged_key):
+        if ragged_query:
+            query = tf.ragged.constant(
+                [
+                    [[3.0, 1.0], [4.0, 1.0]],
+                    [[5.0, 9.0], [2.0, 6.0], [3.0, 1.0]],
+                    [[1.0, 2.0]],
+                ],
+                inner_shape=(2,),
+            )
+        else:
+            query = keras.backend.ones(shape=(3, 2, 2))
+
+        if ragged_value:
+            value = tf.ragged.constant(
+                [[[3.0, 1.0], [4.0, 1.0]], [[5.0, 9.0]], [[1.0, 2.0]]],
+                inner_shape=(2,),
+            )
+        else:
+            value = keras.backend.ones(shape=(3, 4, 2))
+
+        if ragged_key:
+            key = tf.ragged.constant(
+                [
+                    [[3.0, 1.0], [4.0, 1.0]],
+                    [[5.0, 9.0], [2.0, 6.0], [3.0, 1.0], [1.0, 5.0]],
+                    [[1.0, 2.0]],
+                ],
+                inner_shape=(2,),
+            )
+        else:
+            key = keras.backend.ones(shape=(3, 4, 2))
+
+        test_layer = keras.layers.MultiHeadAttention(num_heads=5, key_dim=2)
+        results = test_layer(query, value, key)
+        self.assertAllEqual(results.shape.as_list(), query.shape.as_list())
+
+    def test_ragged_tensor_with_causal_mask_no_error(self):
+        ragged_tensor = tf.ragged.constant(
+            [
+                [[3.0, 1.0], [4.0, 1.0]],
+                [[5.0, 9.0], [2.0, 6.0], [3.0, 1.0]],
+                [[1.0, 2.0]],
+            ],
+            inner_shape=(2,),
+        )
+        test_layer = keras.layers.MultiHeadAttention(num_heads=5, key_dim=2)
+        results = test_layer(
+            ragged_tensor, ragged_tensor, ragged_tensor, use_causal_mask=True
+        )
+        self.assertAllEqual(
+            results.shape.as_list(), ragged_tensor.shape.as_list()
+        )
+
+    def test_query_mask_propagation(self):
+        """Test automatic propagation of the query's mask."""
+        test_layer = keras.layers.MultiHeadAttention(num_heads=2, key_dim=2)
+        self.assertTrue(test_layer.supports_masking)
+        query = tf.constant([[1, 2, 3, 0, 0], [3, 3, 1, 1, 2], [1, 0, 0, 0, 0]])
+        masked_query = keras.layers.Embedding(4, 8, mask_zero=True)(query)
+        value = tf.random.normal((3, 3, 8))
+        output = test_layer(query=masked_query, value=value)
+        self.assertTrue(hasattr(output, "_keras_mask"))
+        self.assertAllEqual(masked_query._keras_mask, output._keras_mask)
+
+    @parameterized.named_parameters(("causal", True), ("not_causal", False))
+    @test_utils.run_v2_only
+    def test_value_mask(self, use_causal_mask):
+        """Test that the value and causal masks are taken into account."""
+        test_layer = keras.layers.MultiHeadAttention(num_heads=2, key_dim=2)
+        query = tf.constant([[1, 2, 3, 0, 0], [3, 3, 1, 1, 2], [1, 0, 0, 0, 0]])
+        masked_query = keras.layers.Embedding(4, 8, mask_zero=True)(query)
+        value = tf.constant([[5, 4, 0], [3, 0, 0], [2, 1, 1]])
+        masked_value = keras.layers.Embedding(6, 8, mask_zero=True)(value)
+        output = test_layer(
+            query=masked_query,
+            value=masked_value,
+            use_causal_mask=use_causal_mask,
+        )
+        mask = tf.constant(
+            [[[True, True, False]] * 3 + [[False, False, False]] * 2]
+            + [[[True, False, False]] * 5]
+            + [[[True, True, True]] + [[False, False, False]] * 4]
+        )
+        if use_causal_mask:
+            mask = mask & tf.constant(
+                [
+                    [[True, False, False], [True, True, False]]
+                    + [[True, True, True]] * 3
+                ]
+            )
+        del masked_query._keras_mask
+        del masked_value._keras_mask
+        output_with_manual_mask = test_layer(
+            query=masked_query, value=masked_value, attention_mask=mask
+        )
+        self.assertAllClose(output, output_with_manual_mask)
+
+    def test_masks_are_cast_to_bool(self):
+        """Test that the implicit and explicit masks are cast to bool."""
+        test_layer = keras.layers.MultiHeadAttention(num_heads=2, key_dim=2)
+        query = np.array([[1, 2, 3, 0, 0], [3, 3, 1, 1, 2], [1, 0, 0, 0, 0]])
+        masked_query = keras.layers.Embedding(4, 8, mask_zero=True)(query)
+        masked_query._keras_mask = tf.cast(masked_query._keras_mask, tf.float32)
+        value = np.array([[5, 4, 0], [3, 0, 0], [2, 1, 1]])
+        masked_value = keras.layers.Embedding(6, 8, mask_zero=True)(value)
+        masked_value._keras_mask = tf.cast(masked_value._keras_mask, tf.float32)
+        float_mask = tf.constant([[[1.0]]])
+        # If all goes well, the following should not raise any exception:
+        _ = test_layer(
+            query=masked_query,
+            value=masked_value,
+            use_causal_mask=True,
+            attention_mask=float_mask,
+        )
+
+    @parameterized.named_parameters(
+        ("without_key_same_proj", [40, 80], [20, 80], None, None),
+        ("with_key_same_proj", [40, 80], [20, 80], [20, 30], None),
+        ("without_key_different_proj", [40, 80], [20, 80], None, [30, 40]),
+        ("with_key_different_proj", [40, 80], [20, 80], [20, 30], [15, 50]),
+        (
+            "high_dim_same_proj",
+            [40, 20, 30, 80],
+            [10, 10, 50, 80],
+            [10, 10, 50, 20],
+            None,
+        ),
+        (
+            "high_dim_different_proj",
+            [40, 20, 30, 80],
+            [10, 10, 50, 80],
+            [10, 10, 50, 20],
+            [30, 20],
+        ),
+    )
+    def test_compute_output_shape(
+        self, query_dims, value_dims, key_dims, output_shape
+    ):
+        """Test computed shape is equal to the layer output's shape."""
+        test_layer = keras.layers.MultiHeadAttention(
+            num_heads=2,
+            key_dim=2,
+            value_dim=2,
+            output_shape=output_shape,
+        )
+        batch_size = None
+        query_shape = [batch_size] + query_dims
+        value_shape = [batch_size] + value_dims
+
+        if key_dims:
+            key_shape = [batch_size] + key_dims
+        else:
+            key_shape = None
+
+        query = keras.Input(query_shape[1:])
+        value = keras.Input(value_shape[1:])
+        if key_shape:
+            key = keras.Input(key_shape[1:])
+        else:
+            key = None
+        output = test_layer(query=query, value=value, key=key)
+        comp_output_shape = test_layer.compute_output_shape(
+            query_shape, value_shape, key_shape
+        )
+        self.assertListEqual(
+            output.shape.as_list(), comp_output_shape.as_list()
+        )
+
+    @parameterized.named_parameters(
+        ("query_value_dim_mismatch", (None, 40, 80), (None, 20, 70), None),
+        (
+            "key_value_dim_mismatch",
+            (None, 40, 80),
+            (None, 20, 80),
+            (None, 10, 70),
+        ),
+        (
+            "key_value_dim_mismatch_high_dim",
+            (None, 40, 20, 30, 80),
+            (None, 10, 10, 50, 80),
+            (None, 10, 15, 50, 20),
+        ),
+    )
+    def test_compute_output_shape_raises_error(
+        self, query_shape, value_shape, key_shape
+    ):
+        """Test dimension mismatches."""
+        test_layer = keras.layers.MultiHeadAttention(
+            num_heads=4,
+            key_dim=2,
+            value_dim=2,
+        )
+        with self.assertRaisesRegex(ValueError, r"must be equal"):
+            test_layer.compute_output_shape(query_shape, value_shape, key_shape)
 
 
 class SubclassAttention(keras.layers.MultiHeadAttention):
+    def _build_attention(self, qkv_rank):
+        pass
 
-  def _build_attention(self, qkv_rank):
-    pass
-
-  def _compute_attention(self,
-                         query_tensor,
-                         key_tensor,
-                         value_tensor,
-                         attention_mask=None,
-                         training=None):
-    return value_tensor, None
+    def _compute_attention(
+        self,
+        query_tensor,
+        key_tensor,
+        value_tensor,
+        attention_mask=None,
+        training=None,
+    ):
+        return value_tensor, None
 
 
 @test_combinations.run_all_keras_modes
 class AttentionSubclassTest(test_combinations.TestCase):
-
-  def test_initializer(self):
-    """Test with a specified initializer."""
-    test_layer = SubclassAttention(num_heads=12, key_dim=64)
-    # Create a 3-dimensional input (the first dimension is implicit).
- query = keras.Input(shape=(40, 80)) - output = test_layer(query, query) - self.assertEqual(output.shape.as_list(), [None, 40, 80]) + def test_initializer(self): + """Test with a specified initializer.""" + test_layer = SubclassAttention(num_heads=12, key_dim=64) + # Create a 3-dimensional input (the first dimension is implicit). + query = keras.Input(shape=(40, 80)) + output = test_layer(query, query) + self.assertEqual(output.shape.as_list(), [None, 40, 80]) +@object_registration.register_keras_serializable() class TestModel(keras.Model): + def __init__(self): + super().__init__() + self.attention = keras.layers.MultiHeadAttention( + num_heads=3, + key_dim=4, + value_dim=4, + use_bias=True, + dropout=0.0, + output_shape=[12], + ) - def __init__(self): - super().__init__() - self.attention = keras.layers.MultiHeadAttention( - num_heads=3, - key_dim=4, - value_dim=4, - use_bias=True, - dropout=0.0, - output_shape=[12]) + @classmethod + def from_config(cls, config): + return cls(**config) - @classmethod - def from_config(cls, config): - return cls(**config) + def get_config(self): + return {} - def get_config(self): - return {} - - def call(self, x, training=False): - return self.attention(x, x, training=training) + def call(self, x, training=False): + return self.attention(x, x, training=training) @test_combinations.run_all_keras_modes(always_skip_v1=True) class KerasModelSavingTest(test_combinations.TestCase): - - def test_keras_saving_subclass(self): - model = TestModel() - query = keras.Input(shape=(40, 80)) - _ = model(query) - model_path = self.get_temp_dir() + "/tmp_model" - keras.models.save_model(model, model_path, save_format="tf") - reloaded_model = keras.models.load_model(model_path) - self.assertEqual( - len(model.trainable_variables), len(reloaded_model.trainable_variables)) - for src_v, loaded_v in zip(model.trainable_variables, - reloaded_model.trainable_variables): - self.assertAllEqual(src_v, loaded_v) - - @parameterized.parameters("h5", "tf") - def test_keras_saving_functional(self, save_format): - model = TestModel() - query = keras.Input(shape=(40, 80)) - output = keras.layers.MultiHeadAttention( - num_heads=3, - key_dim=4, - value_dim=4, - use_bias=True, - dropout=0.0)(query, query) - model = keras.Model(inputs=query, outputs=output) - model_path = self.get_temp_dir() + "/tmp_model" - keras.models.save_model(model, model_path, save_format=save_format) - reloaded_model = keras.models.load_model(model_path) - self.assertEqual( - len(model.trainable_variables), len(reloaded_model.trainable_variables)) - for src_v, loaded_v in zip(model.trainable_variables, - reloaded_model.trainable_variables): - self.assertAllEqual(src_v, loaded_v) - - def test_create_without_build(self): - not_initialized_layer = keras.layers.MultiHeadAttention( - num_heads=3, key_dim=4, value_dim=4) - keras.layers.MultiHeadAttention.from_config( - not_initialized_layer.get_config()) + @parameterized.parameters("tf", "keras_v3") + def test_keras_saving_subclass(self, save_format): + model = TestModel() + query = keras.Input(shape=(40, 80)) + _ = model(query) + model_path = self.get_temp_dir() + "/tmp_model" + if save_format == "keras_v3": + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving." 
+ ) + model_path += ".keras" + keras.models.save_model(model, model_path, save_format=save_format) + reloaded_model = keras.models.load_model(model_path) + self.assertEqual( + len(model.trainable_variables), + len(reloaded_model.trainable_variables), + ) + for src_v, loaded_v in zip( + model.trainable_variables, reloaded_model.trainable_variables + ): + self.assertAllEqual(src_v, loaded_v) + + @parameterized.parameters("h5", "tf", "keras_v3") + def test_keras_saving_functional(self, save_format): + model = TestModel() + query = keras.Input(shape=(40, 80)) + output = keras.layers.MultiHeadAttention( + num_heads=3, key_dim=4, value_dim=4, use_bias=True, dropout=0.0 + )(query, query) + model = keras.Model(inputs=query, outputs=output) + model_path = self.get_temp_dir() + "/tmp_model" + if save_format == "keras_v3": + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving." + ) + model_path += ".keras" + keras.models.save_model(model, model_path, save_format=save_format) + reloaded_model = keras.models.load_model(model_path) + self.assertEqual( + len(model.trainable_variables), + len(reloaded_model.trainable_variables), + ) + for src_v, loaded_v in zip( + model.trainable_variables, reloaded_model.trainable_variables + ): + self.assertAllEqual(src_v, loaded_v) + + def test_create_without_build(self): + not_initialized_layer = keras.layers.MultiHeadAttention( + num_heads=3, key_dim=4, value_dim=4 + ) + keras.layers.MultiHeadAttention.from_config( + not_initialized_layer.get_config() + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/convolutional/BUILD b/keras/layers/convolutional/BUILD index 974ff9154627..60560697c35a 100644 --- a/keras/layers/convolutional/BUILD +++ b/keras/layers/convolutional/BUILD @@ -1,15 +1,17 @@ # Description: # Contains the Keras convolution layers. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:__subpackages__", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", "//third_party/tensorflow/python/keras:__subpackages__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", ], diff --git a/keras/layers/convolutional/__init__.py b/keras/layers/convolutional/__init__.py index 99cbf4e7b904..6b3d3d14cad3 100644 --- a/keras/layers/convolutional/__init__.py +++ b/keras/layers/convolutional/__init__.py @@ -13,28 +13,27 @@ # limitations under the License. # ============================================================================== """Keras convolution layers.""" -# pylint: disable=g-bad-import-order + +# Convolution layer aliases. # Convolution layers. 
from keras.layers.convolutional.conv1d import Conv1D -from keras.layers.convolutional.conv2d import Conv2D -from keras.layers.convolutional.conv3d import Conv3D +from keras.layers.convolutional.conv1d import Convolution1D from keras.layers.convolutional.conv1d_transpose import Conv1DTranspose +from keras.layers.convolutional.conv1d_transpose import Convolution1DTranspose +from keras.layers.convolutional.conv2d import Conv2D +from keras.layers.convolutional.conv2d import Convolution2D from keras.layers.convolutional.conv2d_transpose import Conv2DTranspose +from keras.layers.convolutional.conv2d_transpose import Convolution2DTranspose +from keras.layers.convolutional.conv3d import Conv3D +from keras.layers.convolutional.conv3d import Convolution3D from keras.layers.convolutional.conv3d_transpose import Conv3DTranspose +from keras.layers.convolutional.conv3d_transpose import Convolution3DTranspose from keras.layers.convolutional.depthwise_conv1d import DepthwiseConv1D from keras.layers.convolutional.depthwise_conv2d import DepthwiseConv2D from keras.layers.convolutional.separable_conv1d import SeparableConv1D -from keras.layers.convolutional.separable_conv2d import SeparableConv2D - -# Convolution layer aliases. -from keras.layers.convolutional.conv1d import Convolution1D -from keras.layers.convolutional.conv2d import Convolution2D -from keras.layers.convolutional.conv3d import Convolution3D -from keras.layers.convolutional.conv1d_transpose import Convolution1DTranspose -from keras.layers.convolutional.conv2d_transpose import Convolution2DTranspose -from keras.layers.convolutional.conv3d_transpose import Convolution3DTranspose from keras.layers.convolutional.separable_conv1d import SeparableConvolution1D +from keras.layers.convolutional.separable_conv2d import SeparableConv2D from keras.layers.convolutional.separable_conv2d import SeparableConvolution2D # Pooling layers imported for backwards namespace compatibility. diff --git a/keras/layers/convolutional/base_conv.py b/keras/layers/convolutional/base_conv.py index 21dfb8e80a4b..da5613cd650e 100644 --- a/keras/layers/convolutional/base_conv.py +++ b/keras/layers/convolutional/base_conv.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Keras base class for convolution layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras import activations from keras import constraints @@ -22,370 +24,408 @@ from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class Conv(Layer): - """Abstract N-D convolution layer (private, used as implementation base). - - This layer creates a convolution kernel that is convolved - (actually cross-correlated) with the layer input to produce a tensor of - outputs. If `use_bias` is True (and a `bias_initializer` is provided), - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - Note: layer attributes cannot be modified after the layer has been called - once (except the `trainable` attribute). - - Args: - rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). Could be "None", eg in the case of - depth wise convolution. 
- kernel_size: An integer or tuple/list of n integers, specifying the - length of the convolution window. - strides: An integer or tuple/list of n integers, - specifying the stride length of the convolution. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros - evenly to the left/right or up/down of the input such that output has the - same height/width dimension as the input. `"causal"` results in causal - (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, ..., channels)` while `channels_first` corresponds to - inputs with shape `(batch_size, channels, ...)`. - dilation_rate: An integer or tuple/list of n integers, specifying - the dilation rate to use for dilated convolution. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any `strides` value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved - separately with `filters / groups` filters. The output is the - concatenation of all the `groups` results along the channel axis. - Input channels and `filters` must both be divisible by `groups`. - activation: Activation function to use. - If you don't specify anything, no activation is applied. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. If None, the - default initializer (glorot_uniform) will be used. - bias_initializer: An initializer for the bias vector. If None, the default - initializer (zeros) will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - """ - - def __init__(self, - rank, - filters, - kernel_size, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1, - groups=1, - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - conv_op=None, - **kwargs): - super().__init__( - trainable=trainable, - name=name, - activity_regularizer=regularizers.get(activity_regularizer), - **kwargs) - self.rank = rank - - if isinstance(filters, float): - filters = int(filters) - if filters is not None and filters <= 0: - raise ValueError('Invalid value for argument `filters`. ' - 'Expected a strictly positive value. 
' - f'Received filters={filters}.') - self.filters = filters - self.groups = groups or 1 - self.kernel_size = conv_utils.normalize_tuple( - kernel_size, rank, 'kernel_size') - self.strides = conv_utils.normalize_tuple( - strides, rank, 'strides', allow_zero=True) - self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) - self.dilation_rate = conv_utils.normalize_tuple( - dilation_rate, rank, 'dilation_rate') - - self.activation = activations.get(activation) - self.use_bias = use_bias - - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.bias_constraint = constraints.get(bias_constraint) - self.input_spec = InputSpec(min_ndim=self.rank + 2) - - self._validate_init() - self._is_causal = self.padding == 'causal' - self._channels_first = self.data_format == 'channels_first' - self._tf_data_format = conv_utils.convert_data_format( - self.data_format, self.rank + 2) - - def _validate_init(self): - if self.filters is not None and self.filters % self.groups != 0: - raise ValueError( - 'The number of filters must be evenly divisible by the number of ' - 'groups. Received: groups={}, filters={}'.format( - self.groups, self.filters)) - - if not all(self.kernel_size): - raise ValueError('The argument `kernel_size` cannot contain 0(s). ' - 'Received: %s' % (self.kernel_size,)) - - if not all(self.strides): - raise ValueError('The argument `strides` cannot contains 0(s). ' - 'Received: %s' % (self.strides,)) - - if self.padding == 'causal': - # pylint: disable=g-import-not-at-top - from keras.layers.convolutional.conv1d import Conv1D - from keras.layers.convolutional.separable_conv1d import SeparableConv1D - # pylint: enable=g-import-not-at-top - if not isinstance(self, (Conv1D, SeparableConv1D)): - raise ValueError('Causal padding is only supported for `Conv1D`' - 'and `SeparableConv1D`.') - - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - input_channel = self._get_input_channel(input_shape) - if input_channel % self.groups != 0: - raise ValueError( - 'The number of input channels must be evenly divisible by the number ' - 'of groups. Received groups={}, but the input has {} channels ' - '(full input shape is {}).'.format(self.groups, input_channel, - input_shape)) - kernel_shape = self.kernel_size + (input_channel // self.groups, - self.filters) - - # compute_output_shape contains some validation logic for the input shape, - # and make sure the output shape has all positive dimensions. - self.compute_output_shape(input_shape) - - self.kernel = self.add_weight( - name='kernel', - shape=kernel_shape, - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, + """Abstract N-D convolution layer (private, used as implementation base). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Note: layer attributes cannot be modified after the layer has been called + once (except the `trainable` attribute). 
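+
+    Concrete layers such as `Conv1D`, `Conv2D` and `Conv3D` subclass this
+    base and fix the `rank` argument accordingly.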
+
+    Args:
+      rank: An integer, the rank of the convolution, e.g. "2" for 2D
+        convolution.
+      filters: Integer, the dimensionality of the output space (i.e. the number
+        of filters in the convolution). Could be "None", e.g. in the case of
+        depthwise convolution.
+      kernel_size: An integer or tuple/list of n integers, specifying the
+        length of the convolution window.
+      strides: An integer or tuple/list of n integers,
+        specifying the stride length of the convolution.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+      padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive).
+        `"valid"` means no padding. `"same"` results in padding with zeros
+        evenly to the left/right or up/down of the input such that output has
+        the same height/width dimension as the input. `"causal"` results in
+        causal (dilated) convolutions, e.g. `output[t]` does not depend on
+        `input[t+1:]`.
+      data_format: A string, one of `channels_last` (default) or
+        `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch_size, ..., channels)` while `channels_first` corresponds to
+        inputs with shape `(batch_size, channels, ...)`.
+      dilation_rate: An integer or tuple/list of n integers, specifying
+        the dilation rate to use for dilated convolution.
+        Currently, specifying any `dilation_rate` value != 1 is
+        incompatible with specifying any `strides` value != 1.
+      groups: A positive integer specifying the number of groups in which the
+        input is split along the channel axis. Each group is convolved
+        separately with `filters / groups` filters. The output is the
+        concatenation of all the `groups` results along the channel axis.
+        Input channels and `filters` must both be divisible by `groups`.
+      activation: Activation function to use.
+        If you don't specify anything, no activation is applied.
+      use_bias: Boolean, whether the layer uses a bias.
+      kernel_initializer: An initializer for the convolution kernel. If None,
+        the default initializer (glorot_uniform) will be used.
+      bias_initializer: An initializer for the bias vector. If None, the default
+        initializer (zeros) will be used.
+      kernel_regularizer: Optional regularizer for the convolution kernel.
+      bias_regularizer: Optional regularizer for the bias vector.
+      activity_regularizer: Optional regularizer function for the output.
+      kernel_constraint: Optional projection function to be applied to the
+        kernel after being updated by an `Optimizer` (e.g. used to implement
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+      bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+ """ + + def __init__( + self, + rank, + filters, + kernel_size, + strides=1, + padding="valid", + data_format=None, + dilation_rate=1, + groups=1, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, trainable=True, - dtype=self.dtype) - if self.use_bias: - self.bias = self.add_weight( - name='bias', - shape=(self.filters,), - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - trainable=True, - dtype=self.dtype) - else: - self.bias = None - channel_axis = self._get_channel_axis() - self.input_spec = InputSpec(min_ndim=self.rank + 2, - axes={channel_axis: input_channel}) - self.built = True - - def convolution_op(self, inputs, kernel): - if self.padding == 'causal': - tf_padding = 'VALID' # Causal padding handled in `call`. - elif isinstance(self.padding, str): - tf_padding = self.padding.upper() - else: - tf_padding = self.padding - - return tf.nn.convolution( - inputs, - kernel, - strides=list(self.strides), - padding=tf_padding, - dilations=list(self.dilation_rate), - data_format=self._tf_data_format, - name=self.__class__.__name__) - - # TODO(b/213173659): remove this when grouped convolutions are fully supported - # on the CPU for compiled functions. For now, we need this as a workaround for - # CPU support. - @tf.function(jit_compile=True) - def _jit_compiled_convolution_op(self, inputs, kernel): - return self.convolution_op(inputs, kernel) - - def call(self, inputs): - input_shape = inputs.shape - - if self._is_causal: # Apply causal padding to inputs for Conv1D. - inputs = tf.pad(inputs, self._compute_causal_padding(inputs)) - - if self.groups > 1: - outputs = self._jit_compiled_convolution_op(inputs, self.kernel) - else: - outputs = self.convolution_op(inputs, self.kernel) - - if self.use_bias: - output_rank = outputs.shape.rank - if self.rank == 1 and self._channels_first: - # nn.bias_add does not accept a 1D input tensor. - bias = tf.reshape(self.bias, (1, self.filters, 1)) - outputs += bias - else: - # Handle multiple batch dimensions. 
- if output_rank is not None and output_rank > 2 + self.rank: - - def _apply_fn(o): - return tf.nn.bias_add( - o, self.bias, data_format=self._tf_data_format) - - outputs = conv_utils.squeeze_batch_dims( - outputs, _apply_fn, inner_rank=self.rank + 1) - else: - outputs = tf.nn.bias_add( - outputs, self.bias, data_format=self._tf_data_format) - - if not tf.executing_eagerly(): - # Infer the static output shape: - out_shape = self.compute_output_shape(input_shape) - outputs.set_shape(out_shape) - - if self.activation is not None: - return self.activation(outputs) - return outputs - - def _spatial_output_shape(self, spatial_input_shape): - return [ - conv_utils.conv_output_length( # pylint: disable=g-complex-comprehension - length, - self.kernel_size[i], - padding=self.padding, - stride=self.strides[i], - dilation=self.dilation_rate[i]) - for i, length in enumerate(spatial_input_shape) - ] - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - batch_rank = len(input_shape) - self.rank - 1 - try: - if self.data_format == 'channels_last': - return tf.TensorShape( - input_shape[:batch_rank] + - self._spatial_output_shape(input_shape[batch_rank:-1]) + - [self.filters]) - else: - return tf.TensorShape( - input_shape[:batch_rank] + [self.filters] + - self._spatial_output_shape(input_shape[batch_rank + 1:])) - - except ValueError: - raise ValueError( - f'One of the dimensions in the output is <= 0 ' - f'due to downsampling in {self.name}. Consider ' - f'increasing the input size. ' - f'Received input shape {input_shape} which would produce ' - f'output shape with a zero or negative value in a ' - f'dimension.') - - def _recreate_conv_op(self, inputs): # pylint: disable=unused-argument - return False - - def get_config(self): - config = { - 'filters': + name=None, + conv_op=None, + **kwargs, + ): + super().__init__( + trainable=trainable, + name=name, + activity_regularizer=regularizers.get(activity_regularizer), + **kwargs, + ) + self.rank = rank + + if isinstance(filters, float): + filters = int(filters) + if filters is not None and filters <= 0: + raise ValueError( + "Invalid value for argument `filters`. " + "Expected a strictly positive value. " + f"Received filters={filters}." 
+            )
+        self.filters = filters
+        self.groups = groups or 1
+        self.kernel_size = conv_utils.normalize_tuple(
+            kernel_size, rank, "kernel_size"
+        )
+        self.strides = conv_utils.normalize_tuple(
+            strides, rank, "strides", allow_zero=True
+        )
+        self.padding = conv_utils.normalize_padding(padding)
+        self.data_format = conv_utils.normalize_data_format(data_format)
+        self.dilation_rate = conv_utils.normalize_tuple(
+            dilation_rate, rank, "dilation_rate"
+        )
+
+        self.activation = activations.get(activation)
+        self.use_bias = use_bias
+
+        self.kernel_initializer = initializers.get(kernel_initializer)
+        self.bias_initializer = initializers.get(bias_initializer)
+        self.kernel_regularizer = regularizers.get(kernel_regularizer)
+        self.bias_regularizer = regularizers.get(bias_regularizer)
+        self.kernel_constraint = constraints.get(kernel_constraint)
+        self.bias_constraint = constraints.get(bias_constraint)
+        self.input_spec = InputSpec(min_ndim=self.rank + 2)
+
+        self._validate_init()
+        self._is_causal = self.padding == "causal"
+        self._channels_first = self.data_format == "channels_first"
+        self._tf_data_format = conv_utils.convert_data_format(
+            self.data_format, self.rank + 2
+        )
+
+    def _validate_init(self):
+        if self.filters is not None and self.filters % self.groups != 0:
+            raise ValueError(
+                "The number of filters must be evenly divisible by the "
+                "number of groups. Received: groups={}, filters={}".format(
+                    self.groups, self.filters
+                )
+            )
+
+        if not all(self.kernel_size):
+            raise ValueError(
+                "The argument `kernel_size` cannot contain 0(s). Received: %s"
+                % (self.kernel_size,)
+            )
+
+        if not all(self.strides):
+            raise ValueError(
+                "The argument `strides` cannot contain 0(s). Received: %s"
+                % (self.strides,)
+            )
+
+        if self.padding == "causal":
+
+            from keras.layers.convolutional.conv1d import Conv1D
+            from keras.layers.convolutional.separable_conv1d import (
+                SeparableConv1D,
+            )
+
+            if not isinstance(self, (Conv1D, SeparableConv1D)):
+                raise ValueError(
+                    "Causal padding is only supported for `Conv1D` "
+                    "and `SeparableConv1D`."
+                )
+
+        if max(self.strides) > 1 and max(self.dilation_rate) > 1:
+            raise ValueError(
+                "`strides > 1` not supported in conjunction with "
+                f"`dilation_rate > 1`. Received: strides={self.strides} and "
+                f"dilation_rate={self.dilation_rate}"
+            )
+
+    def build(self, input_shape):
+        input_shape = tf.TensorShape(input_shape)
+        input_channel = self._get_input_channel(input_shape)
+        if input_channel % self.groups != 0:
+            raise ValueError(
+                "The number of input channels must be evenly divisible by "
+                "the number of groups. Received groups={}, but the input "
+                "has {} channels (full input shape is {}).".format(
+                    self.groups, input_channel, input_shape
+                )
+            )
+        kernel_shape = self.kernel_size + (
+            input_channel // self.groups,
             self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'dilation_rate':
-            self.dilation_rate,
-        'groups':
-            self.groups,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  def _compute_causal_padding(self, inputs):
-    """Calculates padding for 'causal' option for 1-d conv layers."""
-    left_pad = self.dilation_rate[0] * (self.kernel_size[0] - 1)
-    if getattr(inputs.shape, 'ndims', None) is None:
-      batch_rank = 1
-    else:
-      batch_rank = len(inputs.shape) - 2
-    if self.data_format == 'channels_last':
-      causal_padding = [[0, 0]] * batch_rank + [[left_pad, 0], [0, 0]]
-    else:
-      causal_padding = [[0, 0]] * batch_rank + [[0, 0], [left_pad, 0]]
-    return causal_padding
-
-  def _get_channel_axis(self):
-    if self.data_format == 'channels_first':
-      return -1 - self.rank
-    else:
-      return -1
-
-  def _get_input_channel(self, input_shape):
-    channel_axis = self._get_channel_axis()
-    if input_shape.dims[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs should be defined. '
-                       f'The input_shape received is {input_shape}, '
-                       f'where axis {channel_axis} (0-based) '
-                       'is the channel dimension, which found to be `None`.')
-    return int(input_shape[channel_axis])
-
-  def _get_padding_op(self):
-    if self.padding == 'causal':
-      op_padding = 'valid'
-    else:
-      op_padding = self.padding
-    if not isinstance(op_padding, (list, tuple)):
-      op_padding = op_padding.upper()
-    return op_padding
+        )
+
+        # compute_output_shape contains some validation logic for the input
+        # shape, and makes sure the output shape has all positive dimensions.
+        self.compute_output_shape(input_shape)
+
+        self.kernel = self.add_weight(
+            name="kernel",
+            shape=kernel_shape,
+            initializer=self.kernel_initializer,
+            regularizer=self.kernel_regularizer,
+            constraint=self.kernel_constraint,
+            trainable=True,
+            dtype=self.dtype,
+        )
+        if self.use_bias:
+            self.bias = self.add_weight(
+                name="bias",
+                shape=(self.filters,),
+                initializer=self.bias_initializer,
+                regularizer=self.bias_regularizer,
+                constraint=self.bias_constraint,
+                trainable=True,
+                dtype=self.dtype,
+            )
+        else:
+            self.bias = None
+        channel_axis = self._get_channel_axis()
+        self.input_spec = InputSpec(
+            min_ndim=self.rank + 2, axes={channel_axis: input_channel}
+        )
+        self.built = True
+
+    def convolution_op(self, inputs, kernel):
+        if self.padding == "causal":
+            tf_padding = "VALID"  # Causal padding handled in `call`.
+ elif isinstance(self.padding, str): + tf_padding = self.padding.upper() + else: + tf_padding = self.padding + + return tf.nn.convolution( + inputs, + kernel, + strides=list(self.strides), + padding=tf_padding, + dilations=list(self.dilation_rate), + data_format=self._tf_data_format, + name=self.__class__.__name__, + ) + + # TODO(b/213173659): remove this when grouped convolutions are fully + # supported on the CPU for compiled functions. For now, we need this as a + # workaround for CPU support. + @tf.function(jit_compile=True) + def _jit_compiled_convolution_op(self, inputs, kernel): + return self.convolution_op(inputs, kernel) + + def call(self, inputs): + input_shape = inputs.shape + + if self._is_causal: # Apply causal padding to inputs for Conv1D. + inputs = tf.pad(inputs, self._compute_causal_padding(inputs)) + + if self.groups > 1: + outputs = self._jit_compiled_convolution_op( + inputs, tf.convert_to_tensor(self.kernel) + ) + else: + outputs = self.convolution_op(inputs, self.kernel) + + if self.use_bias: + output_rank = outputs.shape.rank + if self.rank == 1 and self._channels_first: + # nn.bias_add does not accept a 1D input tensor. + bias = tf.reshape(self.bias, (1, self.filters, 1)) + outputs += bias + else: + # Handle multiple batch dimensions. + if output_rank is not None and output_rank > 2 + self.rank: + + def _apply_fn(o): + return tf.nn.bias_add( + o, self.bias, data_format=self._tf_data_format + ) + + outputs = conv_utils.squeeze_batch_dims( + outputs, _apply_fn, inner_rank=self.rank + 1 + ) + else: + outputs = tf.nn.bias_add( + outputs, self.bias, data_format=self._tf_data_format + ) + + if not tf.executing_eagerly() and input_shape.rank: + # Infer the static output shape: + out_shape = self.compute_output_shape(input_shape) + outputs.set_shape(out_shape) + + if self.activation is not None: + return self.activation(outputs) + return outputs + + def _spatial_output_shape(self, spatial_input_shape): + return [ + conv_utils.conv_output_length( + length, + self.kernel_size[i], + padding=self.padding, + stride=self.strides[i], + dilation=self.dilation_rate[i], + ) + for i, length in enumerate(spatial_input_shape) + ] + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + batch_rank = len(input_shape) - self.rank - 1 + try: + if self.data_format == "channels_last": + return tf.TensorShape( + input_shape[:batch_rank] + + self._spatial_output_shape(input_shape[batch_rank:-1]) + + [self.filters] + ) + else: + return tf.TensorShape( + input_shape[:batch_rank] + + [self.filters] + + self._spatial_output_shape(input_shape[batch_rank + 1 :]) + ) + + except ValueError: + raise ValueError( + "One of the dimensions in the output is <= 0 " + f"due to downsampling in {self.name}. Consider " + "increasing the input size. " + f"Received input shape {input_shape} which would produce " + "output shape with a zero or negative value in a " + "dimension." 
+            )
+
+    def _recreate_conv_op(self, inputs):
+        return False
+
+    def get_config(self):
+        config = {
+            "filters": self.filters,
+            "kernel_size": self.kernel_size,
+            "strides": self.strides,
+            "padding": self.padding,
+            "data_format": self.data_format,
+            "dilation_rate": self.dilation_rate,
+            "groups": self.groups,
+            "activation": activations.serialize(self.activation),
+            "use_bias": self.use_bias,
+            "kernel_initializer": initializers.serialize(
+                self.kernel_initializer
+            ),
+            "bias_initializer": initializers.serialize(self.bias_initializer),
+            "kernel_regularizer": regularizers.serialize(
+                self.kernel_regularizer
+            ),
+            "bias_regularizer": regularizers.serialize(self.bias_regularizer),
+            "activity_regularizer": regularizers.serialize(
+                self.activity_regularizer
+            ),
+            "kernel_constraint": constraints.serialize(self.kernel_constraint),
+            "bias_constraint": constraints.serialize(self.bias_constraint),
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def _compute_causal_padding(self, inputs):
+        """Calculates padding for 'causal' option for 1-d conv layers."""
+        left_pad = self.dilation_rate[0] * (self.kernel_size[0] - 1)
+        if getattr(inputs.shape, "ndims", None) is None:
+            batch_rank = 1
+        else:
+            batch_rank = len(inputs.shape) - 2
+        if self.data_format == "channels_last":
+            causal_padding = [[0, 0]] * batch_rank + [[left_pad, 0], [0, 0]]
+        else:
+            causal_padding = [[0, 0]] * batch_rank + [[0, 0], [left_pad, 0]]
+        return causal_padding
+
+    def _get_channel_axis(self):
+        if self.data_format == "channels_first":
+            return -1 - self.rank
+        else:
+            return -1
+
+    def _get_input_channel(self, input_shape):
+        channel_axis = self._get_channel_axis()
+        if input_shape.dims[channel_axis].value is None:
+            raise ValueError(
+                "The channel dimension of the inputs should be defined. "
+                f"The input_shape received is {input_shape}, "
+                f"where axis {channel_axis} (0-based) "
+                "is the channel dimension, which was found to be `None`."
+            )
+        return int(input_shape[channel_axis])
+
+    def _get_padding_op(self):
+        if self.padding == "causal":
+            op_padding = "valid"
+        else:
+            op_padding = self.padding
+        if not isinstance(op_padding, (list, tuple)):
+            op_padding = op_padding.upper()
+        return op_padding
diff --git a/keras/layers/convolutional/base_depthwise_conv.py b/keras/layers/convolutional/base_depthwise_conv.py
index e2e89de2f2bc..f18c25ee89f7 100644
--- a/keras/layers/convolutional/base_depthwise_conv.py
+++ b/keras/layers/convolutional/base_depthwise_conv.py
@@ -13,196 +13,214 @@
 # limitations under the License.
 # ==============================================================================
 """Keras abstract base for depthwise convolutions."""
-# pylint: disable=g-classes-have-attributes
+
+
+import tensorflow.compat.v2 as tf
 
 from keras import constraints
 from keras import initializers
 from keras import regularizers
 from keras.engine.input_spec import InputSpec
 from keras.layers.convolutional.base_conv import Conv
-import tensorflow.compat.v2 as tf
 
 
 class DepthwiseConv(Conv):
-  """Depthwise convolution.
-
-  Depthwise convolution is a type of convolution in which each input channel is
-  convolved with a different kernel (called a depthwise kernel). You
-  can understand depthwise convolution as the first step in a depthwise
-  separable convolution.
-
-  It is implemented via the following steps:
-
-  - Split the input into individual channels.
-  - Convolve each channel with an individual depthwise kernel with
-    `depth_multiplier` output channels.
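The causal branch above works because `call` left-pads the input by `dilation_rate[0] * (kernel_size[0] - 1)` steps before running a plain `VALID` convolution. A minimal sketch of that equivalence, assuming TensorFlow 2.x and channels-last data:

import tensorflow as tf

# Sketch, assuming TF 2.x: causal padding = left-pad by
# dilation_rate * (kernel_size - 1), then a VALID convolution.
kernel_size, dilation_rate = 3, 2
left_pad = dilation_rate * (kernel_size - 1)  # 4 steps of left padding

x = tf.random.normal((1, 10, 8))  # (batch, steps, channels)
x_padded = tf.pad(x, [[0, 0], [left_pad, 0], [0, 0]])

kernel = tf.random.normal((kernel_size, 8, 16))
y = tf.nn.convolution(
    x_padded, kernel, padding="VALID", dilations=[dilation_rate]
)
print(y.shape)  # (1, 10, 16): output steps match the unpadded input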
- - Concatenate the convolved outputs along the channels axis. - - Unlike a regular convolution, depthwise convolution does not mix - information across different input channels. - - The `depth_multiplier` argument determines how many filter are applied to one - input channel. As such, it controls the amount of output channels that are - generated per input channel in the depthwise step. - - Args: - kernel_size: A tuple or list of integers specifying the spatial dimensions - of the filters. Can be a single integer to specify the same value for all - spatial dimensions. - strides: A tuple or list of integers specifying the strides of the - convolution. Can be a single integer to specify the same value for all - spatial dimensions. Specifying any `stride` value != 1 is incompatible - with specifying any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means no - padding. `"same"` results in padding with zeros evenly to the left/right - or up/down of the input such that output has the same height/width - dimension as the input. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `filters_in * depth_multiplier`. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. `channels_last` corresponds - to inputs with shape `(batch_size, height, width, channels)` while - `channels_first` corresponds to inputs with shape `(batch_size, channels, - height, width)`. It defaults to the `image_data_format` value found in - your Keras config file at `~/.keras/keras.json`. If you never set it, then - it will be 'channels_last'. - dilation_rate: An integer or tuple/list of 2 integers, specifying the - dilation rate to use for dilated convolution. Currently, specifying any - `dilation_rate` value != 1 is incompatible with specifying any `strides` - value != 1. - activation: Activation function to use. If you don't specify anything, no - activation is applied (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - depthwise_initializer: Initializer for the depthwise kernel matrix (see - `keras.initializers`). If None, the default initializer - ('glorot_uniform') will be used. - bias_initializer: Initializer for the bias vector (see - `keras.initializers`). If None, the default initializer ('zeros') will be - used. - depthwise_regularizer: Regularizer function applied to the depthwise kernel - matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector (see - `keras.regularizers`). - activity_regularizer: Regularizer function applied to the output of the - layer (its 'activation') (see `keras.regularizers`). - depthwise_constraint: Constraint function applied to the depthwise kernel - matrix (see `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector (see - `keras.constraints`). - - Input shape: - 4D tensor with shape: `[batch_size, channels, rows, cols]` if - data_format='channels_first' - or 4D tensor with shape: `[batch_size, rows, cols, channels]` if - data_format='channels_last'. - - Output shape: - 4D tensor with shape: `[batch_size, channels * depth_multiplier, new_rows, - new_cols]` if `data_format='channels_first'` - or 4D tensor with shape: `[batch_size, - new_rows, new_cols, channels * depth_multiplier]` if - `data_format='channels_last'`. 
`rows` and `cols` values might have changed
-    due to padding.
-
-  Returns:
-    A tensor of rank 4 representing
-    `activation(depthwiseconv2d(inputs, kernel) + bias)`.
-
-  Raises:
-    ValueError: if `padding` is "causal".
-    ValueError: when both `strides` > 1 and `dilation_rate` > 1.
-  """
-
-  def __init__(self,
-               rank,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               depth_multiplier=1,
-               data_format=None,
-               dilation_rate=1,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               depthwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               bias_constraint=None,
-               **kwargs):
-    super().__init__(
+    """Depthwise convolution.
+
+    Depthwise convolution is a type of convolution in which each input channel
+    is convolved with a different kernel (called a depthwise kernel). You can
+    understand depthwise convolution as the first step in a depthwise separable
+    convolution.
+
+    It is implemented via the following steps:
+
+    - Split the input into individual channels.
+    - Convolve each channel with an individual depthwise kernel with
+      `depth_multiplier` output channels.
+    - Concatenate the convolved outputs along the channels axis.
+
+    Unlike a regular convolution, depthwise convolution does not mix
+    information across different input channels.
+
+    The `depth_multiplier` argument determines how many filters are applied to
+    one input channel. As such, it controls the number of output channels that
+    are generated per input channel in the depthwise step.
+
+    Args:
+      kernel_size: A tuple or list of integers specifying the spatial dimensions
+        of the filters. Can be a single integer to specify the same value for
+        all spatial dimensions.
+      strides: A tuple or list of integers specifying the strides of the
+        convolution. Can be a single integer to specify the same value for all
+        spatial dimensions. Specifying any `stride` value != 1 is incompatible
+        with specifying any `dilation_rate` value != 1.
+      padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means
+        no padding. `"same"` results in padding with zeros evenly to the
+        left/right or up/down of the input such that output has the same
+        height/width dimension as the input.
+      depth_multiplier: The number of depthwise convolution output channels for
+        each input channel. The total number of depthwise convolution output
+        channels will be equal to `filters_in * depth_multiplier`.
+      data_format: A string, one of `channels_last` (default) or
+        `channels_first`. The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape `(batch_size, height,
+        width, channels)` while `channels_first` corresponds to inputs with
+        shape `(batch_size, channels, height, width)`. If left unspecified,
+        uses `image_data_format` value found in your Keras config file at
+        `~/.keras/keras.json` (if it exists) else 'channels_last'.
+        Defaults to 'channels_last'.
+      dilation_rate: An integer or tuple/list of 2 integers, specifying the
+        dilation rate to use for dilated convolution. Currently, specifying any
+        `dilation_rate` value != 1 is incompatible with specifying any `strides`
+        value != 1.
+      activation: Activation function to use. If you don't specify anything, no
+        activation is applied (see `keras.activations`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      depthwise_initializer: Initializer for the depthwise kernel matrix (see
+        `keras.initializers`).
If None, the default initializer + ('glorot_uniform') will be used. + bias_initializer: Initializer for the bias vector (see + `keras.initializers`). If None, the default initializer ('zeros') will + be used. + depthwise_regularizer: Regularizer function applied to the depthwise + kernel matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector (see + `keras.regularizers`). + activity_regularizer: Regularizer function applied to the output of the + layer (its 'activation') (see `keras.regularizers`). + depthwise_constraint: Constraint function applied to the depthwise kernel + matrix (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector (see + `keras.constraints`). + + Input shape: + 4D tensor with shape: `[batch_size, channels, rows, cols]` if + data_format='channels_first' + or 4D tensor with shape: `[batch_size, rows, cols, channels]` if + data_format='channels_last'. + + Output shape: + 4D tensor with shape: `[batch_size, channels * depth_multiplier, new_rows, + new_cols]` if `data_format='channels_first'` + or 4D tensor with shape: `[batch_size, + new_rows, new_cols, channels * depth_multiplier]` if + `data_format='channels_last'`. `rows` and `cols` values might have + changed due to padding. + + Returns: + A tensor of rank 4 representing + `activation(depthwiseconv2d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. + """ + + def __init__( + self, rank, - filters=None, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - use_bias=use_bias, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - bias_constraint=bias_constraint, - **kwargs) - self.depth_multiplier = depth_multiplier - self.depthwise_initializer = initializers.get(depthwise_initializer) - self.depthwise_regularizer = regularizers.get(depthwise_regularizer) - self.depthwise_constraint = constraints.get(depthwise_constraint) - self.bias_initializer = initializers.get(bias_initializer) - - def build(self, input_shape): - if len(input_shape) != self.rank + 2: - raise ValueError('Inputs to `DepthwiseConv` should have ' - f'rank {self.rank + 2}. ' - f'Received input_shape={input_shape}.') - input_shape = tf.TensorShape(input_shape) - channel_axis = self._get_channel_axis() - if input_shape.dims[channel_axis].value is None: - raise ValueError('The channel dimension of the inputs to `DepthwiseConv` ' - 'should be defined. ' - f'The input_shape received is {input_shape}, ' - f'where axis {channel_axis} (0-based) ' - 'is the channel dimension, which found to be `None`.') - input_dim = int(input_shape[channel_axis]) - depthwise_kernel_shape = self.kernel_size + (input_dim, - self.depth_multiplier) - - self.depthwise_kernel = self.add_weight( - shape=depthwise_kernel_shape, - initializer=self.depthwise_initializer, - name='depthwise_kernel', - regularizer=self.depthwise_regularizer, - constraint=self.depthwise_constraint) - - if self.use_bias: - self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,), - initializer=self.bias_initializer, - name='bias', - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - else: - self.bias = None - # Set input spec. 
-    self.input_spec = InputSpec(
-        min_ndim=self.rank + 2, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs):
-    raise NotImplementedError
-
-  def get_config(self):
-    config = super().get_config()
-    config.pop('filters')
-    config.pop('kernel_initializer')
-    config.pop('kernel_regularizer')
-    config.pop('kernel_constraint')
-    config['depth_multiplier'] = self.depth_multiplier
-    config['depthwise_initializer'] = initializers.serialize(
-        self.depthwise_initializer)
-    config['depthwise_regularizer'] = regularizers.serialize(
-        self.depthwise_regularizer)
-    config['depthwise_constraint'] = constraints.serialize(
-        self.depthwise_constraint)
-    return config
+        kernel_size,
+        strides=1,
+        padding="valid",
+        depth_multiplier=1,
+        data_format=None,
+        dilation_rate=1,
+        activation=None,
+        use_bias=True,
+        depthwise_initializer="glorot_uniform",
+        bias_initializer="zeros",
+        depthwise_regularizer=None,
+        bias_regularizer=None,
+        activity_regularizer=None,
+        depthwise_constraint=None,
+        bias_constraint=None,
+        **kwargs,
+    ):
+        super().__init__(
+            rank,
+            filters=None,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            dilation_rate=dilation_rate,
+            activation=activation,
+            use_bias=use_bias,
+            bias_regularizer=bias_regularizer,
+            activity_regularizer=activity_regularizer,
+            bias_constraint=bias_constraint,
+            **kwargs,
+        )
+        self.depth_multiplier = depth_multiplier
+        self.depthwise_initializer = initializers.get(depthwise_initializer)
+        self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+        self.depthwise_constraint = constraints.get(depthwise_constraint)
+        self.bias_initializer = initializers.get(bias_initializer)
+
+    def build(self, input_shape):
+        if len(input_shape) != self.rank + 2:
+            raise ValueError(
+                "Inputs to `DepthwiseConv` should have "
+                f"rank {self.rank + 2}. "
+                f"Received input_shape={input_shape}."
+            )
+        input_shape = tf.TensorShape(input_shape)
+        channel_axis = self._get_channel_axis()
+        if input_shape.dims[channel_axis].value is None:
+            raise ValueError(
+                "The channel dimension of the inputs to `DepthwiseConv` "
+                "should be defined. "
+                f"The input_shape received is {input_shape}, "
+                f"where axis {channel_axis} (0-based) "
+                "is the channel dimension, which was found to be `None`."
+            )
+        input_dim = int(input_shape[channel_axis])
+        depthwise_kernel_shape = self.kernel_size + (
+            input_dim,
+            self.depth_multiplier,
+        )
+
+        self.depthwise_kernel = self.add_weight(
+            shape=depthwise_kernel_shape,
+            initializer=self.depthwise_initializer,
+            name="depthwise_kernel",
+            regularizer=self.depthwise_regularizer,
+            constraint=self.depthwise_constraint,
+        )
+
+        if self.use_bias:
+            self.bias = self.add_weight(
+                shape=(input_dim * self.depth_multiplier,),
+                initializer=self.bias_initializer,
+                name="bias",
+                regularizer=self.bias_regularizer,
+                constraint=self.bias_constraint,
+            )
+        else:
+            self.bias = None
+        # Set input spec.
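As a quick shape check for the `build` above (a sketch assuming TensorFlow 2.x): the depthwise kernel has shape `kernel_size + (input_channels, depth_multiplier)`, and the layer produces `input_channels * depth_multiplier` output channels.

import tensorflow as tf

# Sketch, assuming TF 2.x.
layer = tf.keras.layers.DepthwiseConv2D(kernel_size=3, depth_multiplier=2)
y = layer(tf.random.normal((1, 32, 32, 4)))  # 4 input channels

print(layer.depthwise_kernel.shape)  # (3, 3, 4, 2)
print(y.shape)  # (1, 30, 30, 8): 4 channels * depth_multiplier of 2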
+ self.input_spec = InputSpec( + min_ndim=self.rank + 2, axes={channel_axis: input_dim} + ) + self.built = True + + def call(self, inputs): + raise NotImplementedError + + def get_config(self): + config = super().get_config() + config.pop("filters") + config.pop("kernel_initializer") + config.pop("kernel_regularizer") + config.pop("kernel_constraint") + config["depth_multiplier"] = self.depth_multiplier + config["depthwise_initializer"] = initializers.serialize( + self.depthwise_initializer + ) + config["depthwise_regularizer"] = regularizers.serialize( + self.depthwise_regularizer + ) + config["depthwise_constraint"] = constraints.serialize( + self.depthwise_constraint + ) + return config diff --git a/keras/layers/convolutional/base_separable_conv.py b/keras/layers/convolutional/base_separable_conv.py index 8a491daffd8d..6afb161039ca 100644 --- a/keras/layers/convolutional/base_separable_conv.py +++ b/keras/layers/convolutional/base_separable_conv.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Keras abstract base layer for separable nD convolution.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras import activations from keras import constraints @@ -21,217 +23,226 @@ from keras import regularizers from keras.engine.input_spec import InputSpec from keras.layers.convolutional.base_conv import Conv -import tensorflow.compat.v2 as tf class SeparableConv(Conv): - """Abstract base layer for separable nD convolution. + """Abstract base layer for separable nD convolution. - This layer performs a depthwise convolution that acts separately on - channels, followed by a pointwise convolution that mixes channels. - If `use_bias` is True and a bias initializer is provided, - it adds a bias vector to the output. - It then optionally applies an activation function to produce the final output. + This layer performs a depthwise convolution that acts separately on + channels, followed by a pointwise convolution that mixes channels. + If `use_bias` is True and a bias initializer is provided, + it adds a bias vector to the output. + It then optionally applies an activation function to produce the final + output. - Args: - rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A tuple or list of integers specifying the spatial - dimensions of the filters. Can be a single integer to specify the same - value for all spatial dimensions. - strides: A tuple or list of integers specifying the strides - of the convolution. Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any `stride` value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, ..., channels)` while `channels_first` corresponds to - inputs with shape `(batch_size, channels, ...)`. 
- dilation_rate: An integer or tuple/list of 2 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias. - depthwise_initializer: An initializer for the depthwise convolution kernel - (see `keras.initializers`). If None, then the default initializer - ('glorot_uniform') will be used. - pointwise_initializer: An initializer for the pointwise convolution kernel - (see `keras.initializers`). If None, then the default initializer - ('glorot_uniform') will be used. - bias_initializer: An initializer for the bias vector. If None, the default - initializer ('zeros') will be used (see `keras.initializers`). - depthwise_regularizer: Optional regularizer for the depthwise - convolution kernel. - pointwise_regularizer: Optional regularizer for the pointwise - convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - depthwise_constraint: Optional projection function to be applied to the - depthwise kernel after being updated by an `Optimizer` (e.g. used for - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - pointwise_constraint: Optional projection function to be applied to the - pointwise kernel after being updated by an `Optimizer`. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` the weights of this layer will be marked as - trainable (and listed in `layer.trainable_weights`). - """ + Args: + rank: An integer, the rank of the convolution, e.g. "2" for 2D + convolution. + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A tuple or list of integers specifying the spatial + dimensions of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: A tuple or list of integers specifying the strides + of the convolution. Can be a single integer to specify the same value + for all spatial dimensions. + Specifying any `stride` value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding with zeros + evenly to the left/right or up/down of the input such that output has + the same height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, ..., channels)` while `channels_first` corresponds to + inputs with shape `(batch_size, channels, ...)`. 
+ dilation_rate: An integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `num_filters_in * depth_multiplier`. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias. + depthwise_initializer: An initializer for the depthwise convolution kernel + (see `keras.initializers`). If None, then the default initializer + ('glorot_uniform') will be used. + pointwise_initializer: An initializer for the pointwise convolution kernel + (see `keras.initializers`). If None, then the default initializer + ('glorot_uniform') will be used. + bias_initializer: An initializer for the bias vector. If None, the default + initializer ('zeros') will be used (see `keras.initializers`). + depthwise_regularizer: Optional regularizer for the depthwise + convolution kernel. + pointwise_regularizer: Optional regularizer for the pointwise + convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + depthwise_constraint: Optional projection function to be applied to the + depthwise kernel after being updated by an `Optimizer` (e.g. used for + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + pointwise_constraint: Optional projection function to be applied to the + pointwise kernel after being updated by an `Optimizer`. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` the weights of this layer will be marked as + trainable (and listed in `layer.trainable_weights`). 
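Concretely, the factorization described above amounts to one depthwise kernel of shape `kernel_size + (input_channels, depth_multiplier)` followed by a pointwise kernel of shape `(1,) * rank + (depth_multiplier * input_channels, filters)`. A minimal sketch, assuming TensorFlow 2.x:

import tensorflow as tf

# Sketch, assuming TF 2.x: the two kernels behind SeparableConv2D.
layer = tf.keras.layers.SeparableConv2D(
    filters=16, kernel_size=3, depth_multiplier=2
)
layer.build(input_shape=(None, 32, 32, 4))  # 4 input channels

print(layer.depthwise_kernel.shape)  # (3, 3, 4, 2)
print(layer.pointwise_kernel.shape)  # (1, 1, 8, 16): 8 = 2 * 4
print(layer.bias.shape)              # (16,)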
+ """ - def __init__(self, - rank, - filters, - kernel_size, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1, - depth_multiplier=1, - activation=None, - use_bias=True, - depthwise_initializer='glorot_uniform', - pointwise_initializer='glorot_uniform', - bias_initializer='zeros', - depthwise_regularizer=None, - pointwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - pointwise_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( - rank=rank, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activations.get(activation), - use_bias=use_bias, - bias_initializer=initializers.get(bias_initializer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - **kwargs) - self.depth_multiplier = depth_multiplier - self.depthwise_initializer = initializers.get(depthwise_initializer) - self.pointwise_initializer = initializers.get(pointwise_initializer) - self.depthwise_regularizer = regularizers.get(depthwise_regularizer) - self.pointwise_regularizer = regularizers.get(pointwise_regularizer) - self.depthwise_constraint = constraints.get(depthwise_constraint) - self.pointwise_constraint = constraints.get(pointwise_constraint) + def __init__( + self, + rank, + filters, + kernel_size, + strides=1, + padding="valid", + data_format=None, + dilation_rate=1, + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer="glorot_uniform", + pointwise_initializer="glorot_uniform", + bias_initializer="zeros", + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + pointwise_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs, + ): + super().__init__( + rank=rank, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activations.get(activation), + use_bias=use_bias, + bias_initializer=initializers.get(bias_initializer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs, + ) + self.depth_multiplier = depth_multiplier + self.depthwise_initializer = initializers.get(depthwise_initializer) + self.pointwise_initializer = initializers.get(pointwise_initializer) + self.depthwise_regularizer = regularizers.get(depthwise_regularizer) + self.pointwise_regularizer = regularizers.get(pointwise_regularizer) + self.depthwise_constraint = constraints.get(depthwise_constraint) + self.pointwise_constraint = constraints.get(pointwise_constraint) - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - channel_axis = self._get_channel_axis() - if input_shape.dims[channel_axis].value is None: - raise ValueError('The channel dimension of the inputs should be defined. 
'
-                       f'The input_shape received is {input_shape}, '
-                       f'where axis {channel_axis} (0-based) '
-                       'is the channel dimension, which found to be `None`.')
-    input_dim = int(input_shape[channel_axis])
-    self.input_spec = InputSpec(ndim=self.rank + 2,
-                                axes={channel_axis: input_dim})
-    depthwise_kernel_shape = self.kernel_size + (input_dim,
-                                                 self.depth_multiplier)
-    pointwise_kernel_shape = (
-        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
+    def build(self, input_shape):
+        input_shape = tf.TensorShape(input_shape)
+        channel_axis = self._get_channel_axis()
+        if input_shape.dims[channel_axis].value is None:
+            raise ValueError(
+                "The channel dimension of the inputs should be defined. "
+                f"The input_shape received is {input_shape}, "
+                f"where axis {channel_axis} (0-based) "
+                "is the channel dimension, which was found to be `None`."
+            )
+        input_dim = int(input_shape[channel_axis])
+        self.input_spec = InputSpec(
+            ndim=self.rank + 2, axes={channel_axis: input_dim}
+        )
+        depthwise_kernel_shape = self.kernel_size + (
+            input_dim,
+            self.depth_multiplier,
+        )
+        pointwise_kernel_shape = (1,) * self.rank + (
+            self.depth_multiplier * input_dim,
+            self.filters,
+        )
 
-    self.depthwise_kernel = self.add_weight(
-        name='depthwise_kernel',
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    self.pointwise_kernel = self.add_weight(
-        name='pointwise_kernel',
-        shape=pointwise_kernel_shape,
-        initializer=self.pointwise_initializer,
-        regularizer=self.pointwise_regularizer,
-        constraint=self.pointwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_weight(
-          name='bias',
-          shape=(self.filters,),
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint,
-          trainable=True,
-          dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
+        self.depthwise_kernel = self.add_weight(
+            name="depthwise_kernel",
+            shape=depthwise_kernel_shape,
+            initializer=self.depthwise_initializer,
+            regularizer=self.depthwise_regularizer,
+            constraint=self.depthwise_constraint,
+            trainable=True,
+            dtype=self.dtype,
+        )
+        self.pointwise_kernel = self.add_weight(
+            name="pointwise_kernel",
+            shape=pointwise_kernel_shape,
+            initializer=self.pointwise_initializer,
+            regularizer=self.pointwise_regularizer,
+            constraint=self.pointwise_constraint,
+            trainable=True,
+            dtype=self.dtype,
+        )
+        if self.use_bias:
+            self.bias = self.add_weight(
+                name="bias",
+                shape=(self.filters,),
+                initializer=self.bias_initializer,
+                regularizer=self.bias_regularizer,
+                constraint=self.bias_constraint,
+                trainable=True,
+                dtype=self.dtype,
+            )
+        else:
+            self.bias = None
+        self.built = True
 
-  def call(self, inputs):
-    raise NotImplementedError
+    def call(self, inputs):
+        raise NotImplementedError
 
-  def get_config(self):
-    config = {
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'depth_multiplier':
-            self.depth_multiplier,
-        'dilation_rate':
-            self.dilation_rate,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-
'depthwise_regularizer': - regularizers.serialize(self.depthwise_regularizer), - 'pointwise_regularizer': - regularizers.serialize(self.pointwise_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'depthwise_constraint': - constraints.serialize(self.depthwise_constraint), - 'pointwise_constraint': - constraints.serialize(self.pointwise_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint) - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = { + "filters": self.filters, + "kernel_size": self.kernel_size, + "strides": self.strides, + "padding": self.padding, + "data_format": self.data_format, + "depth_multiplier": self.depth_multiplier, + "dilation_rate": self.dilation_rate, + "activation": activations.serialize(self.activation), + "use_bias": self.use_bias, + "depthwise_initializer": initializers.serialize( + self.depthwise_initializer + ), + "pointwise_initializer": initializers.serialize( + self.pointwise_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "depthwise_regularizer": regularizers.serialize( + self.depthwise_regularizer + ), + "pointwise_regularizer": regularizers.serialize( + self.pointwise_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "depthwise_constraint": constraints.serialize( + self.depthwise_constraint + ), + "pointwise_constraint": constraints.serialize( + self.pointwise_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/convolutional/conv1d.py b/keras/layers/convolutional/conv1d.py index 9ddad5f3fa22..5577fca943de 100644 --- a/keras/layers/convolutional/conv1d.py +++ b/keras/layers/convolutional/conv1d.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Keras 1D convolution layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import activations from keras import constraints @@ -22,149 +22,158 @@ from keras.dtensor import utils from keras.layers.convolutional.base_conv import Conv +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Conv1D', 'keras.layers.Convolution1D') +@keras_export("keras.layers.Conv1D", "keras.layers.Convolution1D") class Conv1D(Conv): - """1D convolution layer (e.g. temporal convolution). - - This layer creates a convolution kernel that is convolved - with the layer input over a single spatial (or temporal) dimension - to produce a tensor of outputs. - If `use_bias` is True, a bias vector is created and added to the outputs. - Finally, if `activation` is not `None`, - it is applied to the outputs as well. - - When using this layer as the first layer in a model, - provide an `input_shape` argument - (tuple of integers or `None`, e.g. - `(10, 128)` for sequences of 10 vectors of 128-dimensional vectors, - or `(None, 128)` for variable-length sequences of 128-dimensional vectors. - - Examples: - - >>> # The inputs are 128-length vectors with 10 timesteps, and the batch size - >>> # is 4. 
- >>> input_shape = (4, 10, 128) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv1D( - ... 32, 3, activation='relu',input_shape=input_shape[1:])(x) - >>> print(y.shape) - (4, 8, 32) - - >>> # With extended batch shape [4, 7] (e.g. weather data where batch - >>> # dimensions correspond to spatial location and the third dimension - >>> # corresponds to time.) - >>> input_shape = (4, 7, 10, 128) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv1D( - ... 32, 3, activation='relu', input_shape=input_shape[2:])(x) - >>> print(y.shape) - (4, 7, 8, 32) - - Args: - filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). - kernel_size: An integer or tuple/list of a single integer, - specifying the length of the 1D convolution window. - strides: An integer or tuple/list of a single integer, - specifying the stride length of the convolution. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"`, `"same"` or `"causal"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. - `"causal"` results in causal (dilated) convolutions, e.g. `output[t]` - does not depend on `input[t+1:]`. Useful when modeling temporal data - where the model should not violate the temporal order. - See [WaveNet: A Generative Model for Raw Audio, section - 2.1](https://arxiv.org/abs/1609.03499). - data_format: A string, - one of `channels_last` (default) or `channels_first`. - dilation_rate: an integer or tuple/list of a single integer, specifying - the dilation rate to use for dilated convolution. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any `strides` value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved - separately with `filters / groups` filters. The output is the - concatenation of all the `groups` results along the channel axis. - Input channels and `filters` must both be divisible by `groups`. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix - (see `keras.initializers`). Defaults to 'glorot_uniform'. - bias_initializer: Initializer for the bias vector - (see `keras.initializers`). Defaults to 'zeros'. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector - (see `keras.regularizers`). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation") - (see `keras.regularizers`). - kernel_constraint: Constraint function applied to the kernel matrix - (see `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector - (see `keras.constraints`). - - Input shape: - 3+D tensor with shape: `batch_shape + (steps, input_dim)` - - Output shape: - 3+D tensor with shape: `batch_shape + (new_steps, filters)` - `steps` value might have changed due to padding or strides. - - Returns: - A tensor of rank 3 representing - `activation(conv1d(inputs, kernel) + bias)`. 
-
-  Raises:
-    ValueError: when both `strides > 1` and `dilation_rate > 1`.
-  """
-
-  @utils.allow_initializer_layout
-  def __init__(self,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               groups=1,
-               activation=None,
-               use_bias=True,
-               kernel_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               kernel_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               bias_constraint=None,
-               **kwargs):
-    super().__init__(
-        rank=1,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        groups=groups,
-        activation=activations.get(activation),
-        use_bias=use_bias,
-        kernel_initializer=initializers.get(kernel_initializer),
-        bias_initializer=initializers.get(bias_initializer),
-        kernel_regularizer=regularizers.get(kernel_regularizer),
-        bias_regularizer=regularizers.get(bias_regularizer),
-        activity_regularizer=regularizers.get(activity_regularizer),
-        kernel_constraint=constraints.get(kernel_constraint),
-        bias_constraint=constraints.get(bias_constraint),
-        **kwargs)
+    """1D convolution layer (e.g. temporal convolution).
+
+    This layer creates a convolution kernel that is convolved
+    with the layer input over a single spatial (or temporal) dimension
+    to produce a tensor of outputs.
+    If `use_bias` is True, a bias vector is created and added to the outputs.
+    Finally, if `activation` is not `None`,
+    it is applied to the outputs as well.
+
+    When using this layer as the first layer in a model,
+    provide an `input_shape` argument
+    (tuple of integers or `None`, e.g.
+    `(10, 128)` for sequences of 10 vectors of 128-dimensional vectors,
+    or `(None, 128)` for variable-length sequences of 128-dimensional
+    vectors).
+
+    Examples:
+
+    >>> # The inputs are 128-length vectors with 10 timesteps, and the
+    >>> # batch size is 4.
+    >>> input_shape = (4, 10, 128)
+    >>> x = tf.random.normal(input_shape)
+    >>> y = tf.keras.layers.Conv1D(
+    ...     32, 3, activation='relu', input_shape=input_shape[1:])(x)
+    >>> print(y.shape)
+    (4, 8, 32)
+
+    >>> # With extended batch shape [4, 7] (e.g. weather data where batch
+    >>> # dimensions correspond to spatial location and the third dimension
+    >>> # corresponds to time.)
+    >>> input_shape = (4, 7, 10, 128)
+    >>> x = tf.random.normal(input_shape)
+    >>> y = tf.keras.layers.Conv1D(
+    ...     32, 3, activation='relu', input_shape=input_shape[2:])(x)
+    >>> print(y.shape)
+    (4, 7, 8, 32)
+
+    Args:
+      filters: Integer, the dimensionality of the output space
+        (i.e. the number of output filters in the convolution).
+      kernel_size: An integer or tuple/list of a single integer,
+        specifying the length of the 1D convolution window.
+      strides: An integer or tuple/list of a single integer,
+        specifying the stride length of the convolution.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+      padding: One of `"valid"`, `"same"` or `"causal"` (case-insensitive).
+        `"valid"` means no padding. `"same"` results in padding with zeros
+        evenly to the left/right or up/down of the input such that output has
+        the same height/width dimension as the input.
+        `"causal"` results in causal (dilated) convolutions, e.g. `output[t]`
+        does not depend on `input[t+1:]`. Useful when modeling temporal data
+        where the model should not violate the temporal order.
+        See [WaveNet: A Generative Model for Raw Audio, section
+        2.1](https://arxiv.org/abs/1609.03499).
+ data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch_size, width, + channels)` while `channels_first` corresponds to inputs with shape + `(batch_size, channels, width)`. Note that the `channels_first` format + is currently not supported by TensorFlow on CPU. + dilation_rate: an integer or tuple/list of a single integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any `strides` value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix + (see `keras.initializers`). Defaults to 'glorot_uniform'. + bias_initializer: Initializer for the bias vector + (see `keras.initializers`). Defaults to 'zeros'. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector + (see `keras.regularizers`). + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation") + (see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix + (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector + (see `keras.constraints`). + + Input shape: + 3+D tensor with shape: `batch_shape + (steps, input_dim)` + + Output shape: + 3+D tensor with shape: `batch_shape + (new_steps, filters)` + `steps` value might have changed due to padding or strides. + + Returns: + A tensor of rank 3 representing + `activation(conv1d(inputs, kernel) + bias)`. + + Raises: + ValueError: when both `strides > 1` and `dilation_rate > 1`. 
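A short usage sketch of the `'causal'` padding documented above, assuming TensorFlow 2.x; the number of steps is preserved because `dilation_rate * (kernel_size - 1)` zeros are added on the left only, so `output[t]` never sees `input[t+1:]`:

import tensorflow as tf

# Sketch, assuming TF 2.x.
x = tf.random.normal((4, 10, 128))  # (batch, steps, channels)
y = tf.keras.layers.Conv1D(32, 3, padding="causal", dilation_rate=2)(x)
print(y.shape)  # (4, 10, 32): steps preserved, unlike "valid" padding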
+ """ + + @utils.allow_initializer_layout + def __init__( + self, + filters, + kernel_size, + strides=1, + padding="valid", + data_format="channels_last", + dilation_rate=1, + groups=1, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs + ): + super().__init__( + rank=1, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + groups=groups, + activation=activations.get(activation), + use_bias=use_bias, + kernel_initializer=initializers.get(kernel_initializer), + bias_initializer=initializers.get(bias_initializer), + kernel_regularizer=regularizers.get(kernel_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + kernel_constraint=constraints.get(kernel_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs + ) + # Alias diff --git a/keras/layers/convolutional/conv1d_transpose.py b/keras/layers/convolutional/conv1d_transpose.py index 20c30aa44f5e..e74cff0332c6 100644 --- a/keras/layers/convolutional/conv1d_transpose.py +++ b/keras/layers/convolutional/conv1d_transpose.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Keras 1D transposed convolution layer (sometimes called deconvolution).""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import activations from keras import constraints @@ -23,260 +25,279 @@ from keras.engine.input_spec import InputSpec from keras.layers.convolutional.conv1d import Conv1D from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Conv1DTranspose', - 'keras.layers.Convolution1DTranspose') +@keras_export( + "keras.layers.Conv1DTranspose", "keras.layers.Convolution1DTranspose" +) class Conv1DTranspose(Conv1D): - """Transposed convolution layer (sometimes called Deconvolution). + """Transposed convolution layer (sometimes called Deconvolution). + + The need for transposed convolutions generally arises + from the desire to use a transformation going in the opposite direction + of a normal convolution, i.e., from something that has the shape of the + output of some convolution to something that has the shape of its input + while maintaining a connectivity pattern that is compatible with + said convolution. - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. + When using this layer as the first layer in a model, + provide the keyword argument `input_shape` + (tuple of integers or `None`, does not include the sample axis), + e.g. `input_shape=(128, 3)` for data with 128 time steps and 3 channels. - When using this layer as the first layer in a model, - provide the keyword argument `input_shape` - (tuple of integers or `None`, does not include the sample axis), - e.g. 
`input_shape=(128, 3)` for data with 128 time steps and 3 channels. + Args: + filters: Integer, the dimensionality of the output space + (i.e. the number of output filters in the convolution). + kernel_size: An integer length of the 1D convolution window. + strides: An integer specifying the stride of the convolution along the + time dimension. Specifying a stride value != 1 is incompatible with + specifying a `dilation_rate` value != 1. Defaults to `1`. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding with zeros + evenly to the left/right or up/down of the input such that output has + the same height/width dimension as the input. + output_padding: An integer specifying the amount of padding along + the time dimension of the output tensor. + The amount of output padding must be lower than the stride. + If set to `None` (default), the output shape is inferred. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch_size, channels, length)`. + dilation_rate: an integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying a `dilation_rate` value != 1 is + incompatible with specifying a stride value != 1. + Also dilation rate larger than 1 is not currently supported. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix + (see `keras.initializers`). Defaults to 'glorot_uniform'. + bias_initializer: Initializer for the bias vector + (see `keras.initializers`). Defaults to 'zeros'. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector + (see `keras.regularizers`). + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation") (see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix + (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector + (see `keras.constraints`). - Args: - filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). - kernel_size: An integer length of the 1D convolution window. - strides: An integer specifying the stride of the convolution along the - time dimension. Specifying a stride value != 1 is incompatible with - specifying a `dilation_rate` value != 1. Defaults to 1. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. - output_padding: An integer specifying the amount of padding along - the time dimension of the output tensor. - The amount of output padding must be lower than the stride. - If set to `None` (default), the output shape is inferred. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. 
- `channels_last` corresponds to inputs with shape - `(batch_size, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch_size, channels, length)`. - dilation_rate: an integer, specifying - the dilation rate to use for dilated convolution. - Currently, specifying a `dilation_rate` value != 1 is - incompatible with specifying a stride value != 1. - Also dilation rate larger than 1 is not currently supported. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix - (see `keras.initializers`). Defaults to 'glorot_uniform'. - bias_initializer: Initializer for the bias vector - (see `keras.initializers`). Defaults to 'zeros'. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector - (see `keras.regularizers`). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation") (see `keras.regularizers`). - kernel_constraint: Constraint function applied to the kernel matrix - (see `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector - (see `keras.constraints`). + Input shape: + 3D tensor with shape: + `(batch_size, steps, channels)` - Input shape: - 3D tensor with shape: - `(batch_size, steps, channels)` + Output shape: + 3D tensor with shape: + `(batch_size, new_steps, filters)` + If `output_padding` is specified: + ``` + new_timesteps = ((timesteps - 1) * strides + kernel_size - + 2 * padding + output_padding) + ``` - Output shape: - 3D tensor with shape: - `(batch_size, new_steps, filters)` - If `output_padding` is specified: - ``` - new_timesteps = ((timesteps - 1) * strides + kernel_size - - 2 * padding + output_padding) - ``` + Returns: + A tensor of rank 3 representing + `activation(conv1dtranspose(inputs, kernel) + bias)`. - Returns: - A tensor of rank 3 representing - `activation(conv1dtranspose(inputs, kernel) + bias)`. + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. - Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides` > 1 and `dilation_rate` > 1. 
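[Editor's note: the `new_timesteps` formula quoted in the reformatted docstring above can be sanity-checked numerically. A small sketch with assumed shapes; note that `padding` in the formula is the implicit padding amount, which is 0 for `"valid"`:]

```python
import tensorflow as tf

# new_timesteps = (timesteps - 1) * strides + kernel_size - 2 * padding
#                 + output_padding
#               = (10 - 1) * 2 + 3 - 2 * 0 + 1 = 22
layer = tf.keras.layers.Conv1DTranspose(
    filters=5, kernel_size=3, strides=2, padding="valid", output_padding=1)
x = tf.random.normal((2, 10, 3))  # (batch_size, timesteps, channels)
print(layer(x).shape)  # (2, 22, 5)
```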
+ References: + - [A guide to convolution arithmetic for deep learning]( + https://arxiv.org/abs/1603.07285v1) + - [Deconvolutional Networks]( + https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) + """ - References: - - [A guide to convolution arithmetic for deep learning]( - https://arxiv.org/abs/1603.07285v1) - - [Deconvolutional Networks]( - https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) - """ + @utils.allow_initializer_layout + def __init__( + self, + filters, + kernel_size, + strides=1, + padding="valid", + output_padding=None, + data_format=None, + dilation_rate=1, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs, + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activations.get(activation), + use_bias=use_bias, + kernel_initializer=initializers.get(kernel_initializer), + bias_initializer=initializers.get(bias_initializer), + kernel_regularizer=regularizers.get(kernel_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + kernel_constraint=constraints.get(kernel_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs, + ) - @utils.allow_initializer_layout - def __init__(self, - filters, - kernel_size, - strides=1, - padding='valid', - output_padding=None, - data_format=None, - dilation_rate=1, - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activations.get(activation), - use_bias=use_bias, - kernel_initializer=initializers.get(kernel_initializer), - bias_initializer=initializers.get(bias_initializer), - kernel_regularizer=regularizers.get(kernel_regularizer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - kernel_constraint=constraints.get(kernel_constraint), - bias_constraint=constraints.get(bias_constraint), - **kwargs) + self.output_padding = output_padding + if self.output_padding is not None: + self.output_padding = conv_utils.normalize_tuple( + self.output_padding, 1, "output_padding", allow_zero=True + ) + for stride, out_pad in zip(self.strides, self.output_padding): + if out_pad >= stride: + raise ValueError( + "Strides must be greater than output padding. " + f"Received strides={self.strides}, " + f"output_padding={self.output_padding}." + ) - self.output_padding = output_padding - if self.output_padding is not None: - self.output_padding = conv_utils.normalize_tuple( - self.output_padding, 1, 'output_padding', allow_zero=True) - for stride, out_pad in zip(self.strides, self.output_padding): - if out_pad >= stride: - raise ValueError('Strides must be greater than output padding. 
' - f'Received strides={self.strides}, ' - f'output_padding={self.output_padding}.') + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape) + if len(input_shape) != 3: + raise ValueError( + "Inputs should have rank 3. " + f"Received input_shape={input_shape}." + ) + channel_axis = self._get_channel_axis() + if input_shape.dims[channel_axis].value is None: + raise ValueError( + "The channel dimension of the inputs " + "to `Conv1DTranspose` should be defined. " + f"The input_shape received is {input_shape}, " + f"where axis {channel_axis} (0-based) " + "is the channel dimension, which found to be `None`." + ) + input_dim = int(input_shape[channel_axis]) + self.input_spec = InputSpec(ndim=3, axes={channel_axis: input_dim}) + kernel_shape = self.kernel_size + (self.filters, input_dim) - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - if len(input_shape) != 3: - raise ValueError('Inputs should have rank 3. ' - f'Received input_shape={input_shape}.') - channel_axis = self._get_channel_axis() - if input_shape.dims[channel_axis].value is None: - raise ValueError('The channel dimension of the inputs ' - 'to `Conv1DTranspose` should be defined. ' - f'The input_shape received is {input_shape}, ' - f'where axis {channel_axis} (0-based) ' - 'is the channel dimension, which found to be `None`.') - input_dim = int(input_shape[channel_axis]) - self.input_spec = InputSpec(ndim=3, axes={channel_axis: input_dim}) - kernel_shape = self.kernel_size + (self.filters, input_dim) + self.kernel = self.add_weight( + name="kernel", + shape=kernel_shape, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, + dtype=self.dtype, + ) + if self.use_bias: + self.bias = self.add_weight( + name="bias", + shape=(self.filters,), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + trainable=True, + dtype=self.dtype, + ) + else: + self.bias = None + self.built = True - self.kernel = self.add_weight( - name='kernel', - shape=kernel_shape, - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - trainable=True, - dtype=self.dtype) - if self.use_bias: - self.bias = self.add_weight( - name='bias', - shape=(self.filters,), - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - trainable=True, - dtype=self.dtype) - else: - self.bias = None - self.built = True + def call(self, inputs): + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + if self.data_format == "channels_first": + t_axis = 2 + else: + t_axis = 1 - def call(self, inputs): - inputs_shape = tf.shape(inputs) - batch_size = inputs_shape[0] - if self.data_format == 'channels_first': - t_axis = 2 - else: - t_axis = 1 + length = inputs_shape[t_axis] + if self.output_padding is None: + output_padding = None + else: + output_padding = self.output_padding[0] - length = inputs_shape[t_axis] - if self.output_padding is None: - output_padding = None - else: - output_padding = self.output_padding[0] + # Infer the dynamic output shape: + out_length = conv_utils.deconv_output_length( + length, + self.kernel_size[0], + padding=self.padding, + output_padding=output_padding, + stride=self.strides[0], + dilation=self.dilation_rate[0], + ) + if self.data_format == "channels_first": + output_shape = (batch_size, self.filters, out_length) + else: + output_shape = (batch_size, 
out_length, self.filters) + data_format = conv_utils.convert_data_format(self.data_format, ndim=3) - # Infer the dynamic output shape: - out_length = conv_utils.deconv_output_length( - length, self.kernel_size[0], padding=self.padding, - output_padding=output_padding, stride=self.strides[0], - dilation=self.dilation_rate[0]) - if self.data_format == 'channels_first': - output_shape = (batch_size, self.filters, out_length) - else: - output_shape = (batch_size, out_length, self.filters) - data_format = conv_utils.convert_data_format(self.data_format, ndim=3) + output_shape_tensor = tf.stack(output_shape) + outputs = tf.nn.conv1d_transpose( + inputs, + self.kernel, + output_shape_tensor, + strides=self.strides, + padding=self.padding.upper(), + data_format=data_format, + dilations=self.dilation_rate, + ) - output_shape_tensor = tf.stack(output_shape) - outputs = tf.nn.conv1d_transpose( - inputs, - self.kernel, - output_shape_tensor, - strides=self.strides, - padding=self.padding.upper(), - data_format=data_format, - dilations=self.dilation_rate) + if not tf.executing_eagerly() and inputs.shape.rank: + # Infer the static output shape: + out_shape = self.compute_output_shape(inputs.shape) + outputs.set_shape(out_shape) - if not tf.executing_eagerly(): - # Infer the static output shape: - out_shape = self.compute_output_shape(inputs.shape) - outputs.set_shape(out_shape) + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, self.bias, data_format=data_format + ) - if self.use_bias: - outputs = tf.nn.bias_add( - outputs, - self.bias, - data_format=data_format) + if self.activation is not None: + return self.activation(outputs) + return outputs - if self.activation is not None: - return self.activation(outputs) - return outputs + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + output_shape = list(input_shape) + if self.data_format == "channels_first": + c_axis, t_axis = 1, 2 + else: + c_axis, t_axis = 2, 1 - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - output_shape = list(input_shape) - if self.data_format == 'channels_first': - c_axis, t_axis = 1, 2 - else: - c_axis, t_axis = 2, 1 + if self.output_padding is None: + output_padding = None + else: + output_padding = self.output_padding[0] + output_shape[c_axis] = self.filters + output_shape[t_axis] = conv_utils.deconv_output_length( + output_shape[t_axis], + self.kernel_size[0], + padding=self.padding, + output_padding=output_padding, + stride=self.strides[0], + dilation=self.dilation_rate[0], + ) + return tf.TensorShape(output_shape) - if self.output_padding is None: - output_padding = None - else: - output_padding = self.output_padding[0] - output_shape[c_axis] = self.filters - output_shape[t_axis] = conv_utils.deconv_output_length( - output_shape[t_axis], - self.kernel_size[0], - padding=self.padding, - output_padding=output_padding, - stride=self.strides[0], - dilation=self.dilation_rate[0]) - return tf.TensorShape(output_shape) + def get_config(self): + config = super().get_config() + config["output_padding"] = self.output_padding + return config - def get_config(self): - config = super().get_config() - config['output_padding'] = self.output_padding - return config # Alias diff --git a/keras/layers/convolutional/conv2d.py b/keras/layers/convolutional/conv2d.py index 257a729790bc..6a6c3aae0f41 100644 --- a/keras/layers/convolutional/conv2d.py +++ b/keras/layers/convolutional/conv2d.py @@ -13,7 +13,7 @@ # limitations under the 
License. # ============================================================================== """Keras 2D convolution layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import activations from keras import constraints @@ -22,170 +22,181 @@ from keras.dtensor import utils from keras.layers.convolutional.base_conv import Conv +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Conv2D', 'keras.layers.Convolution2D') +@keras_export("keras.layers.Conv2D", "keras.layers.Convolution2D") class Conv2D(Conv): - """2D convolution layer (e.g. spatial convolution over images). - - This layer creates a convolution kernel that is convolved - with the layer input to produce a tensor of - outputs. If `use_bias` is True, - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - When using this layer as the first layer in a model, - provide the keyword argument `input_shape` - (tuple of integers or `None`, does not include the sample axis), - e.g. `input_shape=(128, 128, 3)` for 128x128 RGB pictures - in `data_format="channels_last"`. You can use `None` when - a dimension has variable size. - - Examples: - - >>> # The inputs are 28x28 RGB images with `channels_last` and the batch - >>> # size is 4. - >>> input_shape = (4, 28, 28, 3) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv2D( - ... 2, 3, activation='relu', input_shape=input_shape[1:])(x) - >>> print(y.shape) - (4, 26, 26, 2) - - >>> # With `dilation_rate` as 2. - >>> input_shape = (4, 28, 28, 3) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv2D( - ... 2, 3, activation='relu', dilation_rate=2, input_shape=input_shape[1:])(x) - >>> print(y.shape) - (4, 24, 24, 2) - - >>> # With `padding` as "same". - >>> input_shape = (4, 28, 28, 3) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv2D( - ... 2, 3, activation='relu', padding="same", input_shape=input_shape[1:])(x) - >>> print(y.shape) - (4, 28, 28, 2) - - >>> # With extended batch shape [4, 7]: - >>> input_shape = (4, 7, 28, 28, 3) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv2D( - ... 2, 3, activation='relu', input_shape=input_shape[2:])(x) - >>> print(y.shape) - (4, 7, 26, 26, 2) - - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number of - output filters in the convolution). - kernel_size: An integer or tuple/list of 2 integers, specifying the height - and width of the 2D convolution window. Can be a single integer to specify - the same value for all spatial dimensions. - strides: An integer or tuple/list of 2 integers, specifying the strides of - the convolution along the height and width. Can be a single integer to - specify the same value for all spatial dimensions. Specifying any stride - value != 1 is incompatible with specifying any `dilation_rate` value != 1. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input. When `padding="same"` and - `strides=1`, the output has the same size as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. 
`channels_last` corresponds - to inputs with shape `(batch_size, height, width, channels)` while - `channels_first` corresponds to inputs with shape `(batch_size, channels, - height, width)`. It defaults to the `image_data_format` value found in - your Keras config file at `~/.keras/keras.json`. If you never set it, then - it will be `channels_last`. - dilation_rate: an integer or tuple/list of 2 integers, specifying the - dilation rate to use for dilated convolution. Can be a single integer to - specify the same value for all spatial dimensions. Currently, specifying - any `dilation_rate` value != 1 is incompatible with specifying any stride - value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved separately - with `filters / groups` filters. The output is the concatenation of all - the `groups` results along the channel axis. Input channels and `filters` - must both be divisible by `groups`. - activation: Activation function to use. If you don't specify anything, no - activation is applied (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix (see - `keras.initializers`). Defaults to 'glorot_uniform'. - bias_initializer: Initializer for the bias vector (see - `keras.initializers`). Defaults to 'zeros'. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector (see - `keras.regularizers`). - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation") (see `keras.regularizers`). - kernel_constraint: Constraint function applied to the kernel matrix (see - `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector (see - `keras.constraints`). - - Input shape: - 4+D tensor with shape: `batch_shape + (channels, rows, cols)` if - `data_format='channels_first'` - or 4+D tensor with shape: `batch_shape + (rows, cols, channels)` if - `data_format='channels_last'`. - - Output shape: - 4+D tensor with shape: `batch_shape + (filters, new_rows, new_cols)` if - `data_format='channels_first'` or 4+D tensor with shape: `batch_shape + - (new_rows, new_cols, filters)` if `data_format='channels_last'`. `rows` - and `cols` values might have changed due to padding. - - Returns: - A tensor of rank 4+ representing - `activation(conv2d(inputs, kernel) + bias)`. - - Raises: - ValueError: if `padding` is `"causal"`. - ValueError: when both `strides > 1` and `dilation_rate > 1`. 
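[Editor's note: both versions of the docstring say that `rows` and `cols` "might have changed due to padding". The helper below is a simplified re-derivation of the rule for illustration only, not the actual `keras.utils.conv_utils.conv_output_length` implementation:]

```python
import math

def conv_output_length(input_length, kernel_size, stride, padding, dilation=1):
    """Output length along one spatial dimension (simplified rule)."""
    effective_kernel = kernel_size + (kernel_size - 1) * (dilation - 1)
    if padding == "valid":
        return (input_length - effective_kernel) // stride + 1
    # "same": the output covers every input position.
    return math.ceil(input_length / stride)

# Matches the doctest shapes in this file:
print(conv_output_length(28, 3, 1, "valid"))              # 26
print(conv_output_length(28, 3, 1, "valid", dilation=2))  # 24
print(conv_output_length(28, 3, 1, "same"))               # 28
```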
- """ - - @utils.allow_initializer_layout - def __init__(self, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1), - groups=1, - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - rank=2, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - groups=groups, - activation=activations.get(activation), - use_bias=use_bias, - kernel_initializer=initializers.get(kernel_initializer), - bias_initializer=initializers.get(bias_initializer), - kernel_regularizer=regularizers.get(kernel_regularizer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - kernel_constraint=constraints.get(kernel_constraint), - bias_constraint=constraints.get(bias_constraint), - **kwargs) + """2D convolution layer (e.g. spatial convolution over images). + + This layer creates a convolution kernel that is convolved + with the layer input to produce a tensor of + outputs. If `use_bias` is True, + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + When using this layer as the first layer in a model, + provide the keyword argument `input_shape` + (tuple of integers or `None`, does not include the sample axis), + e.g. `input_shape=(128, 128, 3)` for 128x128 RGB pictures + in `data_format="channels_last"`. You can use `None` when + a dimension has variable size. + + Examples: + + >>> # The inputs are 28x28 RGB images with `channels_last` and the batch + >>> # size is 4. + >>> input_shape = (4, 28, 28, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv2D( + ... 2, 3, activation='relu', input_shape=input_shape[1:])(x) + >>> print(y.shape) + (4, 26, 26, 2) + + >>> # With `dilation_rate` as 2. + >>> input_shape = (4, 28, 28, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv2D( + ... 2, 3, + ... activation='relu', + ... dilation_rate=2, + ... input_shape=input_shape[1:])(x) + >>> print(y.shape) + (4, 24, 24, 2) + + >>> # With `padding` as "same". + >>> input_shape = (4, 28, 28, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv2D( + ... 2, 3, activation='relu', padding="same", input_shape=input_shape[1:])(x) + >>> print(y.shape) + (4, 28, 28, 2) + + >>> # With extended batch shape [4, 7]: + >>> input_shape = (4, 7, 28, 28, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv2D( + ... 2, 3, activation='relu', input_shape=input_shape[2:])(x) + >>> print(y.shape) + (4, 7, 26, 26, 2) + + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of output filters in the convolution). + kernel_size: An integer or tuple/list of 2 integers, specifying the height + and width of the 2D convolution window. Can be a single integer to + specify the same value for all spatial dimensions. + strides: An integer or tuple/list of 2 integers, specifying the strides of + the convolution along the height and width. Can be a single integer to + specify the same value for all spatial dimensions. Specifying any stride + value != 1 is incompatible with specifying any `dilation_rate` value != + 1. 
+ padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding with zeros + evenly to the left/right or up/down of the input. When `padding="same"` + and `strides=1`, the output has the same size as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch_size, height, + width, channels)` while `channels_first` corresponds to inputs with + shape `(batch_size, channels, height, width)`. If left unspecified, it + uses the `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Note that the `channels_first` format is currently not + supported by TensorFlow on CPU. Defaults to 'channels_last'. + dilation_rate: an integer or tuple/list of 2 integers, specifying the + dilation rate to use for dilated convolution. Can be a single integer to + specify the same value for all spatial dimensions. Currently, specifying + any `dilation_rate` value != 1 is incompatible with specifying any + stride value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. Input + channels and `filters` must both be divisible by `groups`. + activation: Activation function to use. If you don't specify anything, no + activation is applied (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix (see + `keras.initializers`). Defaults to 'glorot_uniform'. + bias_initializer: Initializer for the bias vector (see + `keras.initializers`). Defaults to 'zeros'. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector (see + `keras.regularizers`). + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation") (see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix (see + `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector (see + `keras.constraints`). + + Input shape: + 4+D tensor with shape: `batch_shape + (channels, rows, cols)` if + `data_format='channels_first'` + or 4+D tensor with shape: `batch_shape + (rows, cols, channels)` if + `data_format='channels_last'`. + + Output shape: + 4+D tensor with shape: `batch_shape + (filters, new_rows, new_cols)` if + `data_format='channels_first'` or 4+D tensor with shape: `batch_shape + + (new_rows, new_cols, filters)` if `data_format='channels_last'`. `rows` + and `cols` values might have changed due to padding. + + Returns: + A tensor of rank 4+ representing + `activation(conv2d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is `"causal"`. + ValueError: when both `strides > 1` and `dilation_rate > 1`. 
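[Editor's note: as in the 1D case, `groups` changes the kernel's input-channel dimension. A hedged sketch with illustrative shapes, using `build()` so no convolution is actually executed:]

```python
import tensorflow as tf

grouped = tf.keras.layers.Conv2D(filters=6, kernel_size=3, groups=3)
dense = tf.keras.layers.Conv2D(filters=6, kernel_size=3)
grouped.build((None, 28, 28, 6))
dense.build((None, 28, 28, 6))

# Each grouped filter sees only 6 / 3 = 2 input channels:
print(grouped.kernel.shape)  # (3, 3, 2, 6)
print(dense.kernel.shape)    # (3, 3, 6, 6)
```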
+ """ + + @utils.allow_initializer_layout + def __init__( + self, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1), + groups=1, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs + ): + super().__init__( + rank=2, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + groups=groups, + activation=activations.get(activation), + use_bias=use_bias, + kernel_initializer=initializers.get(kernel_initializer), + bias_initializer=initializers.get(bias_initializer), + kernel_regularizer=regularizers.get(kernel_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + kernel_constraint=constraints.get(kernel_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs + ) + # Alias diff --git a/keras/layers/convolutional/conv2d_transpose.py b/keras/layers/convolutional/conv2d_transpose.py index ae419a5cb59a..772b761e95d8 100644 --- a/keras/layers/convolutional/conv2d_transpose.py +++ b/keras/layers/convolutional/conv2d_transpose.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Keras 2D transposed convolution layer (sometimes called deconvolution).""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import activations from keras import backend @@ -24,315 +26,341 @@ from keras.engine.input_spec import InputSpec from keras.layers.convolutional.conv2d import Conv2D from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Conv2DTranspose', - 'keras.layers.Convolution2DTranspose') +@keras_export( + "keras.layers.Conv2DTranspose", "keras.layers.Convolution2DTranspose" +) class Conv2DTranspose(Conv2D): - """Transposed convolution layer (sometimes called Deconvolution). - - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. - - When using this layer as the first layer in a model, - provide the keyword argument `input_shape` - (tuple of integers or `None`, does not include the sample axis), - e.g. `input_shape=(128, 128, 3)` for 128x128 RGB pictures - in `data_format="channels_last"`. - - Args: - filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). - kernel_size: An integer or tuple/list of 2 integers, specifying the - height and width of the 2D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the convolution along the height and width. - Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. 
- padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. - output_padding: An integer or tuple/list of 2 integers, - specifying the amount of padding along the height and width - of the output tensor. - Can be a single integer to specify the same value for all - spatial dimensions. - The amount of output padding along a given dimension must be - lower than the stride along that same dimension. - If set to `None` (default), the output shape is inferred. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - dilation_rate: an integer, specifying the dilation rate for all spatial - dimensions for dilated convolution. Specifying different dilation rates - for different dimensions is not supported. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix - (see `keras.initializers`). Defaults to 'glorot_uniform'. - bias_initializer: Initializer for the bias vector - (see `keras.initializers`). Defaults to 'zeros'. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector - (see `keras.regularizers`). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation") (see `keras.regularizers`). - kernel_constraint: Constraint function applied to the kernel matrix - (see `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector - (see `keras.constraints`). - - Input shape: - 4D tensor with shape: - `(batch_size, channels, rows, cols)` if data_format='channels_first' - or 4D tensor with shape: - `(batch_size, rows, cols, channels)` if data_format='channels_last'. - - Output shape: - 4D tensor with shape: - `(batch_size, filters, new_rows, new_cols)` if data_format='channels_first' - or 4D tensor with shape: - `(batch_size, new_rows, new_cols, filters)` if data_format='channels_last'. - `rows` and `cols` values might have changed due to padding. - If `output_padding` is specified: - ``` - new_rows = ((rows - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + - output_padding[0]) - new_cols = ((cols - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + - output_padding[1]) - ``` - - Returns: - A tensor of rank 4 representing - `activation(conv2dtranspose(inputs, kernel) + bias)`. - - Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides` > 1 and `dilation_rate` > 1. 
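[Editor's note: `output_padding` exists because the forward convolution is many-to-one on shapes. With `strides=2` and `padding="same"`, a forward `Conv2D` maps both 7x7 and 8x8 inputs to 4x4 outputs, so the transpose cannot infer which size to restore. A minimal sketch:]

```python
import tensorflow as tf

x = tf.zeros((1, 4, 4, 1))
for out_pad in (0, 1):
    up = tf.keras.layers.Conv2DTranspose(
        filters=1, kernel_size=3, strides=2, padding="same",
        output_padding=out_pad)
    print(up(x).shape)
# (1, 7, 7, 1) with output_padding=0
# (1, 8, 8, 1) with output_padding=1
```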
- - References: - - [A guide to convolution arithmetic for deep - learning](https://arxiv.org/abs/1603.07285v1) - - [Deconvolutional - Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) - """ - - @utils.allow_initializer_layout - def __init__(self, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - output_padding=None, - data_format=None, - dilation_rate=(1, 1), - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activations.get(activation), - use_bias=use_bias, - kernel_initializer=initializers.get(kernel_initializer), - bias_initializer=initializers.get(bias_initializer), - kernel_regularizer=regularizers.get(kernel_regularizer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - kernel_constraint=constraints.get(kernel_constraint), - bias_constraint=constraints.get(bias_constraint), - **kwargs) - - self.output_padding = output_padding - if self.output_padding is not None: - self.output_padding = conv_utils.normalize_tuple( - self.output_padding, 2, 'output_padding', allow_zero=True) - for stride, out_pad in zip(self.strides, self.output_padding): - if out_pad >= stride: - raise ValueError('Strides must be greater than output padding. ' - f'Received strides={self.strides}, ' - f'output_padding={self.output_padding}.') - - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - if len(input_shape) != 4: - raise ValueError('Inputs should have rank 4. ' - f'Received input_shape={input_shape}.') - channel_axis = self._get_channel_axis() - if input_shape.dims[channel_axis].value is None: - raise ValueError('The channel dimension of the inputs ' - 'to `Conv2DTranspose` should be defined. ' - f'The input_shape received is {input_shape}, ' - f'where axis {channel_axis} (0-based) ' - 'is the channel dimension, which found to be `None`.') - input_dim = int(input_shape[channel_axis]) - self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim}) - kernel_shape = self.kernel_size + (self.filters, input_dim) - - self.kernel = self.add_weight( - name='kernel', - shape=kernel_shape, - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - trainable=True, - dtype=self.dtype) - if self.use_bias: - self.bias = self.add_weight( - name='bias', - shape=(self.filters,), - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - trainable=True, - dtype=self.dtype) - else: - self.bias = None - self.built = True - - def call(self, inputs): - inputs_shape = tf.shape(inputs) - batch_size = inputs_shape[0] - if self.data_format == 'channels_first': - h_axis, w_axis = 2, 3 - else: - h_axis, w_axis = 1, 2 - - # Use the constant height and weight when possible. - # TODO(scottzhu): Extract this into a utility function that can be applied - # to all convolutional layers, which currently lost the static shape - # information due to tf.shape(). 
- height, width = None, None - if inputs.shape.rank is not None: - dims = inputs.shape.as_list() - height = dims[h_axis] - width = dims[w_axis] - height = height if height is not None else inputs_shape[h_axis] - width = width if width is not None else inputs_shape[w_axis] - - kernel_h, kernel_w = self.kernel_size - stride_h, stride_w = self.strides - - if self.output_padding is None: - out_pad_h = out_pad_w = None - else: - out_pad_h, out_pad_w = self.output_padding - - # Infer the dynamic output shape: - out_height = conv_utils.deconv_output_length(height, - kernel_h, - padding=self.padding, - output_padding=out_pad_h, - stride=stride_h, - dilation=self.dilation_rate[0]) - out_width = conv_utils.deconv_output_length(width, - kernel_w, - padding=self.padding, - output_padding=out_pad_w, - stride=stride_w, - dilation=self.dilation_rate[1]) - if self.data_format == 'channels_first': - output_shape = (batch_size, self.filters, out_height, out_width) - else: - output_shape = (batch_size, out_height, out_width, self.filters) - - output_shape_tensor = tf.stack(output_shape) - outputs = backend.conv2d_transpose( - inputs, - self.kernel, - output_shape_tensor, - strides=self.strides, - padding=self.padding, - data_format=self.data_format, - dilation_rate=self.dilation_rate) - - if not tf.executing_eagerly(): - # Infer the static output shape: - out_shape = self.compute_output_shape(inputs.shape) - outputs.set_shape(out_shape) - - if self.use_bias: - outputs = tf.nn.bias_add( - outputs, - self.bias, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - if self.activation is not None: - return self.activation(outputs) - return outputs - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - output_shape = list(input_shape) - if self.data_format == 'channels_first': - c_axis, h_axis, w_axis = 1, 2, 3 - else: - c_axis, h_axis, w_axis = 3, 1, 2 - - kernel_h, kernel_w = self.kernel_size - stride_h, stride_w = self.strides - - if self.output_padding is None: - out_pad_h = out_pad_w = None - else: - out_pad_h, out_pad_w = self.output_padding - - output_shape[c_axis] = self.filters - output_shape[h_axis] = conv_utils.deconv_output_length( - output_shape[h_axis], - kernel_h, - padding=self.padding, - output_padding=out_pad_h, - stride=stride_h, - dilation=self.dilation_rate[0]) - output_shape[w_axis] = conv_utils.deconv_output_length( - output_shape[w_axis], - kernel_w, - padding=self.padding, - output_padding=out_pad_w, - stride=stride_w, - dilation=self.dilation_rate[1]) - return tf.TensorShape(output_shape) - - def get_config(self): - config = super().get_config() - config['output_padding'] = self.output_padding - return config + """Transposed convolution layer (sometimes called Deconvolution). + + The need for transposed convolutions generally arises + from the desire to use a transformation going in the opposite direction + of a normal convolution, i.e., from something that has the shape of the + output of some convolution to something that has the shape of its input + while maintaining a connectivity pattern that is compatible with + said convolution. + + When using this layer as the first layer in a model, + provide the keyword argument `input_shape` + (tuple of integers or `None`, does not include the sample axis), + e.g. `input_shape=(128, 128, 3)` for 128x128 RGB pictures + in `data_format="channels_last"`. + + Args: + filters: Integer, the dimensionality of the output space + (i.e. 
the number of output filters in the convolution). + kernel_size: An integer or tuple/list of 2 integers, specifying the + height and width of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the convolution along the height and width. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding with zeros + evenly to the left/right or up/down of the input such that output has + the same height/width dimension as the input. + output_padding: An integer or tuple/list of 2 integers, + specifying the amount of padding along the height and width + of the output tensor. + Can be a single integer to specify the same value for all + spatial dimensions. + The amount of output padding along a given dimension must be + lower than the stride along that same dimension. + If set to `None` (default), the output shape is inferred. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch_size, channels, height, width)`. + When unspecified, uses `image_data_format` value found in your Keras + config file at `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to "channels_last". + dilation_rate: an integer, specifying the dilation rate for all spatial + dimensions for dilated convolution. Specifying different dilation rates + for different dimensions is not supported. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix + (see `keras.initializers`). Defaults to 'glorot_uniform'. + bias_initializer: Initializer for the bias vector + (see `keras.initializers`). Defaults to 'zeros'. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector + (see `keras.regularizers`). + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation") (see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix + (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector + (see `keras.constraints`). + + Input shape: + 4D tensor with shape: + `(batch_size, channels, rows, cols)` if data_format='channels_first' + or 4D tensor with shape: + `(batch_size, rows, cols, channels)` if data_format='channels_last'. + + Output shape: + 4D tensor with shape: + `(batch_size, filters, new_rows, new_cols)` if + data_format='channels_first' + or 4D tensor with shape: + `(batch_size, new_rows, new_cols, filters)` if + data_format='channels_last'. `rows` and `cols` values might have changed + due to padding. 
+ If `output_padding` is specified: + ``` + new_rows = ((rows - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + + output_padding[0]) + new_cols = ((cols - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + + output_padding[1]) + ``` + + Returns: + A tensor of rank 4 representing + `activation(conv2dtranspose(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. + + References: + - [A guide to convolution arithmetic for deep + learning](https://arxiv.org/abs/1603.07285v1) + - [Deconvolutional + Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) + """ + + @utils.allow_initializer_layout + def __init__( + self, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + output_padding=None, + data_format=None, + dilation_rate=(1, 1), + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs, + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activations.get(activation), + use_bias=use_bias, + kernel_initializer=initializers.get(kernel_initializer), + bias_initializer=initializers.get(bias_initializer), + kernel_regularizer=regularizers.get(kernel_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + kernel_constraint=constraints.get(kernel_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs, + ) + + self.output_padding = output_padding + if self.output_padding is not None: + self.output_padding = conv_utils.normalize_tuple( + self.output_padding, 2, "output_padding", allow_zero=True + ) + for stride, out_pad in zip(self.strides, self.output_padding): + if out_pad >= stride: + raise ValueError( + "Strides must be greater than output padding. " + f"Received strides={self.strides}, " + f"output_padding={self.output_padding}." + ) + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape) + if len(input_shape) != 4: + raise ValueError( + "Inputs should have rank 4. " + f"Received input_shape={input_shape}." + ) + channel_axis = self._get_channel_axis() + if input_shape.dims[channel_axis].value is None: + raise ValueError( + "The channel dimension of the inputs " + "to `Conv2DTranspose` should be defined. " + f"The input_shape received is {input_shape}, " + f"where axis {channel_axis} (0-based) " + "is the channel dimension, which found to be `None`." 
+ ) + input_dim = int(input_shape[channel_axis]) + self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim}) + kernel_shape = self.kernel_size + (self.filters, input_dim) + + self.kernel = self.add_weight( + name="kernel", + shape=kernel_shape, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, + dtype=self.dtype, + ) + if self.use_bias: + self.bias = self.add_weight( + name="bias", + shape=(self.filters,), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + trainable=True, + dtype=self.dtype, + ) + else: + self.bias = None + self.built = True + + def call(self, inputs): + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + if self.data_format == "channels_first": + h_axis, w_axis = 2, 3 + else: + h_axis, w_axis = 1, 2 + + # Use the constant height and weight when possible. + # TODO(scottzhu): Extract this into a utility function that can be + # applied to all convolutional layers, which currently lost the static + # shape information due to tf.shape(). + height, width = None, None + if inputs.shape.rank is not None: + dims = inputs.shape.as_list() + height = dims[h_axis] + width = dims[w_axis] + height = height if height is not None else inputs_shape[h_axis] + width = width if width is not None else inputs_shape[w_axis] + + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.strides + + if self.output_padding is None: + out_pad_h = out_pad_w = None + else: + out_pad_h, out_pad_w = self.output_padding + + # Infer the dynamic output shape: + out_height = conv_utils.deconv_output_length( + height, + kernel_h, + padding=self.padding, + output_padding=out_pad_h, + stride=stride_h, + dilation=self.dilation_rate[0], + ) + out_width = conv_utils.deconv_output_length( + width, + kernel_w, + padding=self.padding, + output_padding=out_pad_w, + stride=stride_w, + dilation=self.dilation_rate[1], + ) + if self.data_format == "channels_first": + output_shape = (batch_size, self.filters, out_height, out_width) + else: + output_shape = (batch_size, out_height, out_width, self.filters) + + output_shape_tensor = tf.stack(output_shape) + outputs = backend.conv2d_transpose( + inputs, + self.kernel, + output_shape_tensor, + strides=self.strides, + padding=self.padding, + data_format=self.data_format, + dilation_rate=self.dilation_rate, + ) + + if not tf.executing_eagerly() and inputs.shape.rank: + # Infer the static output shape: + out_shape = self.compute_output_shape(inputs.shape) + outputs.set_shape(out_shape) + + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, + self.bias, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + if self.activation is not None: + return self.activation(outputs) + return outputs + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + output_shape = list(input_shape) + if self.data_format == "channels_first": + c_axis, h_axis, w_axis = 1, 2, 3 + else: + c_axis, h_axis, w_axis = 3, 1, 2 + + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.strides + + if self.output_padding is None: + out_pad_h = out_pad_w = None + else: + out_pad_h, out_pad_w = self.output_padding + + output_shape[c_axis] = self.filters + output_shape[h_axis] = conv_utils.deconv_output_length( + output_shape[h_axis], + kernel_h, + padding=self.padding, + output_padding=out_pad_h, + stride=stride_h, + dilation=self.dilation_rate[0], + 
) + output_shape[w_axis] = conv_utils.deconv_output_length( + output_shape[w_axis], + kernel_w, + padding=self.padding, + output_padding=out_pad_w, + stride=stride_w, + dilation=self.dilation_rate[1], + ) + return tf.TensorShape(output_shape) + + def get_config(self): + config = super().get_config() + config["output_padding"] = self.output_padding + return config + # Alias diff --git a/keras/layers/convolutional/conv3d.py b/keras/layers/convolutional/conv3d.py index aeee2067f024..bfcfcf5012e2 100644 --- a/keras/layers/convolutional/conv3d.py +++ b/keras/layers/convolutional/conv3d.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Keras 3D convolution layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import activations from keras import constraints @@ -22,157 +22,165 @@ from keras.dtensor import utils from keras.layers.convolutional.base_conv import Conv +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Conv3D', 'keras.layers.Convolution3D') +@keras_export("keras.layers.Conv3D", "keras.layers.Convolution3D") class Conv3D(Conv): - """3D convolution layer (e.g. spatial convolution over volumes). - - This layer creates a convolution kernel that is convolved - with the layer input to produce a tensor of - outputs. If `use_bias` is True, - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - When using this layer as the first layer in a model, - provide the keyword argument `input_shape` - (tuple of integers or `None`, does not include the sample axis), - e.g. `input_shape=(128, 128, 128, 1)` for 128x128x128 volumes - with a single channel, - in `data_format="channels_last"`. - - Examples: - - >>> # The inputs are 28x28x28 volumes with a single channel, and the - >>> # batch size is 4 - >>> input_shape =(4, 28, 28, 28, 1) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv3D( - ... 2, 3, activation='relu', input_shape=input_shape[1:])(x) - >>> print(y.shape) - (4, 26, 26, 26, 2) - - >>> # With extended batch shape [4, 7], e.g. a batch of 4 videos of 3D frames, - >>> # with 7 frames per video. - >>> input_shape = (4, 7, 28, 28, 28, 1) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Conv3D( - ... 2, 3, activation='relu', input_shape=input_shape[2:])(x) - >>> print(y.shape) - (4, 7, 26, 26, 26, 2) - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number of - output filters in the convolution). - kernel_size: An integer or tuple/list of 3 integers, specifying the depth, - height and width of the 3D convolution window. Can be a single integer to - specify the same value for all spatial dimensions. - strides: An integer or tuple/list of 3 integers, specifying the strides of - the convolution along each spatial dimension. Can be a single integer to - specify the same value for all spatial dimensions. Specifying any stride - value != 1 is incompatible with specifying any `dilation_rate` value != 1. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. 
`channels_last` corresponds - to inputs with shape `batch_shape + (spatial_dim1, spatial_dim2, - spatial_dim3, channels)` while `channels_first` corresponds to inputs with - shape `batch_shape + (channels, spatial_dim1, spatial_dim2, - spatial_dim3)`. It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. If you never set it, then it - will be "channels_last". - dilation_rate: an integer or tuple/list of 3 integers, specifying the - dilation rate to use for dilated convolution. Can be a single integer to - specify the same value for all spatial dimensions. Currently, specifying - any `dilation_rate` value != 1 is incompatible with specifying any stride - value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved separately - with `filters / groups` filters. The output is the concatenation of all - the `groups` results along the channel axis. Input channels and `filters` - must both be divisible by `groups`. - activation: Activation function to use. If you don't specify anything, no - activation is applied (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix (see - `keras.initializers`). Defaults to 'glorot_uniform'. - bias_initializer: Initializer for the bias vector (see - `keras.initializers`). Defaults to 'zeros'. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector (see - `keras.regularizers`). - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation") (see `keras.regularizers`). - kernel_constraint: Constraint function applied to the kernel matrix (see - `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector (see - `keras.constraints`). - - Input shape: - 5+D tensor with shape: `batch_shape + (channels, conv_dim1, conv_dim2, - conv_dim3)` if data_format='channels_first' - or 5+D tensor with shape: `batch_shape + (conv_dim1, conv_dim2, conv_dim3, - channels)` if data_format='channels_last'. - - Output shape: - 5+D tensor with shape: `batch_shape + (filters, new_conv_dim1, - new_conv_dim2, new_conv_dim3)` if data_format='channels_first' - or 5+D tensor with shape: `batch_shape + (new_conv_dim1, new_conv_dim2, - new_conv_dim3, filters)` if data_format='channels_last'. `new_conv_dim1`, - `new_conv_dim2` and `new_conv_dim3` values might have changed due to - padding. - - Returns: - A tensor of rank 5+ representing - `activation(conv3d(inputs, kernel) + bias)`. - - Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides > 1` and `dilation_rate > 1`. 
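[Editor's note: alongside the doctests in the reformatted docstring below, the weight shapes of a small `Conv3D` can be checked without running any data through it. A sketch with assumed shapes:]

```python
import tensorflow as tf

layer = tf.keras.layers.Conv3D(filters=2, kernel_size=3)
layer.build((None, 28, 28, 28, 1))  # single-channel volumes

print(layer.kernel.shape)    # (3, 3, 3, 1, 2)
print(layer.count_params())  # 3*3*3*1*2 kernel weights + 2 biases = 56
```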
- """ - - @utils.allow_initializer_layout - def __init__(self, - filters, - kernel_size, - strides=(1, 1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1, 1), - groups=1, - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - rank=3, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - groups=groups, - activation=activations.get(activation), - use_bias=use_bias, - kernel_initializer=initializers.get(kernel_initializer), - bias_initializer=initializers.get(bias_initializer), - kernel_regularizer=regularizers.get(kernel_regularizer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - kernel_constraint=constraints.get(kernel_constraint), - bias_constraint=constraints.get(bias_constraint), - **kwargs) + """3D convolution layer (e.g. spatial convolution over volumes). + + This layer creates a convolution kernel that is convolved + with the layer input to produce a tensor of + outputs. If `use_bias` is True, + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + When using this layer as the first layer in a model, + provide the keyword argument `input_shape` + (tuple of integers or `None`, does not include the sample axis), + e.g. `input_shape=(128, 128, 128, 1)` for 128x128x128 volumes + with a single channel, + in `data_format="channels_last"`. + + Examples: + + >>> # The inputs are 28x28x28 volumes with a single channel, and the + >>> # batch size is 4 + >>> input_shape =(4, 28, 28, 28, 1) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv3D( + ... 2, 3, activation='relu', input_shape=input_shape[1:])(x) + >>> print(y.shape) + (4, 26, 26, 26, 2) + + >>> # With extended batch shape [4, 7], e.g. a batch of 4 videos of + >>> # 3D frames, with 7 frames per video. + >>> input_shape = (4, 7, 28, 28, 28, 1) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv3D( + ... 2, 3, activation='relu', input_shape=input_shape[2:])(x) + >>> print(y.shape) + (4, 7, 26, 26, 26, 2) + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of output filters in the convolution). + kernel_size: An integer or tuple/list of 3 integers, specifying the depth, + height and width of the 3D convolution window. Can be a single integer + to specify the same value for all spatial dimensions. + strides: An integer or tuple/list of 3 integers, specifying the strides of + the convolution along each spatial dimension. Can be a single integer to + specify the same value for all spatial dimensions. Specifying any stride + value != 1 is incompatible with specifying any `dilation_rate` value != + 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding with zeros + evenly to the left/right or up/down of the input such that output has + the same height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. 
+ `channels_last` corresponds to inputs with shape `batch_shape + + (spatial_dim1, spatial_dim2, spatial_dim3, channels)` while + `channels_first` corresponds to inputs with shape `batch_shape + + (channels, spatial_dim1, spatial_dim2, spatial_dim3)`. When unspecified, + uses `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. Note that the + `channels_first` format is currently not supported by TensorFlow on CPU. + Defaults to 'channels_last'. + dilation_rate: an integer or tuple/list of 3 integers, specifying the + dilation rate to use for dilated convolution. Can be a single integer to + specify the same value for all spatial dimensions. Currently, specifying + any `dilation_rate` value != 1 is incompatible with specifying any + stride value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. Input + channels and `filters` must both be divisible by `groups`. + activation: Activation function to use. If you don't specify anything, no + activation is applied (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix (see + `keras.initializers`). Defaults to 'glorot_uniform'. + bias_initializer: Initializer for the bias vector (see + `keras.initializers`). Defaults to 'zeros'. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector (see + `keras.regularizers`). + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation") (see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix (see + `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector (see + `keras.constraints`). + + Input shape: + 5+D tensor with shape: `batch_shape + (channels, conv_dim1, conv_dim2, + conv_dim3)` if data_format='channels_first' + or 5+D tensor with shape: `batch_shape + (conv_dim1, conv_dim2, conv_dim3, + channels)` if data_format='channels_last'. + + Output shape: + 5+D tensor with shape: `batch_shape + (filters, new_conv_dim1, + new_conv_dim2, new_conv_dim3)` if data_format='channels_first' + or 5+D tensor with shape: `batch_shape + (new_conv_dim1, new_conv_dim2, + new_conv_dim3, filters)` if data_format='channels_last'. + `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have + changed due to padding. + + Returns: + A tensor of rank 5+ representing + `activation(conv3d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides > 1` and `dilation_rate > 1`. 
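The `groups` argument described above is equivalent to running `groups` smaller convolutions side by side and concatenating the results, which is what `conv_test.py` later in this diff verifies. A minimal sketch of that equivalence (illustrative only; like the test, it may require a CUDA GPU, since grouped convolutions are not implemented in all CPU builds):

```python
import tensorflow as tf

inputs = tf.random.uniform((2, 8, 8, 8, 4))  # 4 input channels
layer = tf.keras.layers.Conv3D(8, 3, groups=2, use_bias=False)
outputs = layer(inputs)  # also builds the layer, creating layer.kernel

# Each group convolves 4 / 2 = 2 input channels with 8 / 2 = 4 filters;
# the kernel's last axis holds the filters, so both splits use axis=-1.
in_splits = tf.split(inputs, 2, axis=-1)
w_splits = tf.split(layer.kernel, 2, axis=-1)
manual = tf.concat(
    [tf.nn.convolution(x, w) for x, w in zip(in_splits, w_splits)],
    axis=-1,
)
tf.debugging.assert_near(outputs, manual, atol=1e-4)
```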
+ """ + + @utils.allow_initializer_layout + def __init__( + self, + filters, + kernel_size, + strides=(1, 1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1, 1), + groups=1, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs + ): + super().__init__( + rank=3, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + groups=groups, + activation=activations.get(activation), + use_bias=use_bias, + kernel_initializer=initializers.get(kernel_initializer), + bias_initializer=initializers.get(bias_initializer), + kernel_regularizer=regularizers.get(kernel_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + kernel_constraint=constraints.get(kernel_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs + ) + # Alias diff --git a/keras/layers/convolutional/conv3d_transpose.py b/keras/layers/convolutional/conv3d_transpose.py index 8e5359617517..dcb9b54a6665 100644 --- a/keras/layers/convolutional/conv3d_transpose.py +++ b/keras/layers/convolutional/conv3d_transpose.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Keras 3D transposed convolution layer (sometimes called deconvolution).""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import activations from keras import constraints @@ -23,326 +25,367 @@ from keras.engine.input_spec import InputSpec from keras.layers.convolutional.conv3d import Conv3D from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Conv3DTranspose', - 'keras.layers.Convolution3DTranspose') +@keras_export( + "keras.layers.Conv3DTranspose", "keras.layers.Convolution3DTranspose" +) class Conv3DTranspose(Conv3D): - """Transposed convolution layer (sometimes called Deconvolution). - - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. - - When using this layer as the first layer in a model, - provide the keyword argument `input_shape` - (tuple of integers or `None`, does not include the sample axis), - e.g. `input_shape=(128, 128, 128, 3)` for a 128x128x128 volume with 3 channels - if `data_format="channels_last"`. - - Args: - filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). - kernel_size: An integer or tuple/list of 3 integers, specifying the - depth, height and width of the 3D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 3 integers, - specifying the strides of the convolution along the depth, height - and width. - Can be a single integer to specify the same value for - all spatial dimensions. 
- Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. - output_padding: An integer or tuple/list of 3 integers, - specifying the amount of padding along the depth, height, and - width. - Can be a single integer to specify the same value for all - spatial dimensions. - The amount of output padding along a given dimension must be - lower than the stride along that same dimension. - If set to `None` (default), the output shape is inferred. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch_size, channels, depth, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - dilation_rate: an integer or tuple/list of 3 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix - (see `keras.initializers`). Defaults to 'glorot_uniform'. - bias_initializer: Initializer for the bias vector - (see `keras.initializers`). Defaults to 'zeros'. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector - (see `keras.regularizers`). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation") - (see `keras.regularizers`). - kernel_constraint: Constraint function applied to the kernel matrix - (see `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector - (see `keras.constraints`). - - Input shape: - 5D tensor with shape: - `(batch_size, channels, depth, rows, cols)` if data_format='channels_first' - or 5D tensor with shape: - `(batch_size, depth, rows, cols, channels)` if data_format='channels_last'. - - Output shape: - 5D tensor with shape: - `(batch_size, filters, new_depth, new_rows, new_cols)` if + """Transposed convolution layer (sometimes called Deconvolution). + + The need for transposed convolutions generally arises + from the desire to use a transformation going in the opposite direction + of a normal convolution, i.e., from something that has the shape of the + output of some convolution to something that has the shape of its input + while maintaining a connectivity pattern that is compatible with + said convolution. + + When using this layer as the first layer in a model, + provide the keyword argument `input_shape` + (tuple of integers or `None`, does not include the sample axis), + e.g. `input_shape=(128, 128, 128, 3)` for a 128x128x128 volume with 3 + channels if `data_format="channels_last"`. 
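Before the argument reference below, it may help to see how the layer maps an input length back to an output length. The sketch that follows is a simplified re-derivation of `keras.utils.conv_utils.deconv_output_length` (the helper that `call()` and `compute_output_shape()` use further down); it is not the library function itself, and it ignores dilation and the `"full"` padding mode:

```python
def deconv_output_length(length, kernel_size, padding="valid",
                         output_padding=None, stride=1):
    """Simplified stand-in for conv_utils.deconv_output_length."""
    if output_padding is None:
        # Inferred shape: invert the forward convolution's length formula.
        if padding == "valid":
            return length * stride + max(kernel_size - stride, 0)
        return length * stride  # "same"
    # Explicit output_padding: "valid" implies zero implicit padding per
    # side and "same" implies kernel_size // 2, matching the docstring
    # formula (length - 1) * stride + kernel_size - 2 * pad + output_padding.
    pad = 0 if padding == "valid" else kernel_size // 2
    return (length - 1) * stride + kernel_size - 2 * pad + output_padding

assert deconv_output_length(5, 3, stride=2) == 11                   # inferred
assert deconv_output_length(5, 3, stride=2, output_padding=1) == 12
```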
+ + Args: + filters: Integer, the dimensionality of the output space + (i.e. the number of output filters in the convolution). + kernel_size: An integer or tuple/list of 3 integers, specifying the + depth, height and width of the 3D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the convolution along the depth, height + and width. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding with zeros + evenly to the left/right or up/down of the input such that output has + the same height/width dimension as the input. + output_padding: An integer or tuple/list of 3 integers, + specifying the amount of padding along the depth, height, and + width. + Can be a single integer to specify the same value for all + spatial dimensions. + The amount of output padding along a given dimension must be + lower than the stride along that same dimension. + If set to `None` (default), the output shape is inferred. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch_size, channels, depth, height, width)`. + When unspecified, uses `image_data_format` value found in your Keras + config file at `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: an integer or tuple/list of 3 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix + (see `keras.initializers`). Defaults to 'glorot_uniform'. + bias_initializer: Initializer for the bias vector + (see `keras.initializers`). Defaults to 'zeros'. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix + (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector + (see `keras.regularizers`). + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation") + (see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix + (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector + (see `keras.constraints`). + + Input shape: + 5D tensor with shape: + `(batch_size, channels, depth, rows, cols)` if data_format='channels_first' - or 5D tensor with shape: - `(batch_size, new_depth, new_rows, new_cols, filters)` if + or 5D tensor with shape: + `(batch_size, depth, rows, cols, channels)` if data_format='channels_last'. - `depth` and `rows` and `cols` values might have changed due to padding. 
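A short usage sketch tying these shape rules together (illustrative, not taken from the patch): with `"valid"` padding, stride 2, and kernel 3, each spatial axis grows from 5 to (5 - 1) * 2 + 3 = 11, and `output_padding=1` (which must stay below the stride, per the check in `__init__` below) adds one more step per axis:

```python
import tensorflow as tf

x = tf.random.normal((1, 5, 5, 5, 3))

y = tf.keras.layers.Conv3DTranspose(2, 3, strides=2, padding="valid")(x)
print(y.shape)  # (1, 11, 11, 11, 2)

y2 = tf.keras.layers.Conv3DTranspose(
    2, 3, strides=2, padding="valid", output_padding=1
)(x)
print(y2.shape)  # (1, 12, 12, 12, 2)
```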
- If `output_padding` is specified:: - ``` - new_depth = ((depth - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + - output_padding[0]) - new_rows = ((rows - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + - output_padding[1]) - new_cols = ((cols - 1) * strides[2] + kernel_size[2] - 2 * padding[2] + - output_padding[2]) - ``` - - Returns: - A tensor of rank 5 representing - `activation(conv3dtranspose(inputs, kernel) + bias)`. - - Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides` > 1 and `dilation_rate` > 1. - - References: - - [A guide to convolution arithmetic for deep - learning](https://arxiv.org/abs/1603.07285v1) - - [Deconvolutional - Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) - """ - - @utils.allow_initializer_layout - def __init__(self, - filters, - kernel_size, - strides=(1, 1, 1), - padding='valid', - output_padding=None, - data_format=None, - dilation_rate=(1, 1, 1), - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activations.get(activation), - use_bias=use_bias, - kernel_initializer=initializers.get(kernel_initializer), - bias_initializer=initializers.get(bias_initializer), - kernel_regularizer=regularizers.get(kernel_regularizer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - kernel_constraint=constraints.get(kernel_constraint), - bias_constraint=constraints.get(bias_constraint), - **kwargs) - - self.output_padding = output_padding - if self.output_padding is not None: - self.output_padding = conv_utils.normalize_tuple( - self.output_padding, 3, 'output_padding', allow_zero=True) - for stride, out_pad in zip(self.strides, self.output_padding): - if out_pad >= stride: - raise ValueError('Strides must be greater than output padding. ' - f'Received strides={self.strides}, ' - f'output_padding={self.output_padding}.') - - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - if len(input_shape) != 5: - raise ValueError('Inputs should have rank 5. ' - f'Received input_shape={input_shape}.') - channel_axis = self._get_channel_axis() - if input_shape.dims[channel_axis].value is None: - raise ValueError('The channel dimension of the inputs ' - 'to `Conv3DTranspose` should be defined. 
' - f'The input_shape received is {input_shape}, ' - f'where axis {channel_axis} (0-based) ' - 'is the channel dimension, which found to be `None`.') - input_dim = int(input_shape[channel_axis]) - kernel_shape = self.kernel_size + (self.filters, input_dim) - self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim}) - - self.kernel = self.add_weight( - 'kernel', - shape=kernel_shape, - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - trainable=True, - dtype=self.dtype) - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=(self.filters,), - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - trainable=True, - dtype=self.dtype) - else: - self.bias = None - self.built = True - - def call(self, inputs): - inputs_shape = tf.shape(inputs) - batch_size = inputs_shape[0] - if self.data_format == 'channels_first': - d_axis, h_axis, w_axis = 2, 3, 4 - else: - d_axis, h_axis, w_axis = 1, 2, 3 - - depth = inputs_shape[d_axis] - height = inputs_shape[h_axis] - width = inputs_shape[w_axis] - - kernel_d, kernel_h, kernel_w = self.kernel_size - stride_d, stride_h, stride_w = self.strides - - if self.output_padding is None: - out_pad_d = out_pad_h = out_pad_w = None - else: - out_pad_d, out_pad_h, out_pad_w = self.output_padding - - # Infer the dynamic output shape: - out_depth = conv_utils.deconv_output_length(depth, - kernel_d, - padding=self.padding, - output_padding=out_pad_d, - stride=stride_d) - out_height = conv_utils.deconv_output_length(height, - kernel_h, - padding=self.padding, - output_padding=out_pad_h, - stride=stride_h) - out_width = conv_utils.deconv_output_length(width, - kernel_w, - padding=self.padding, - output_padding=out_pad_w, - stride=stride_w) - if self.data_format == 'channels_first': - output_shape = (batch_size, self.filters, out_depth, out_height, - out_width) - strides = (1, 1, stride_d, stride_h, stride_w) - else: - output_shape = (batch_size, out_depth, out_height, out_width, - self.filters) - strides = (1, stride_d, stride_h, stride_w, 1) - - output_shape_tensor = tf.stack(output_shape) - outputs = tf.nn.conv3d_transpose( - inputs, - self.kernel, - output_shape_tensor, - strides, - data_format=conv_utils.convert_data_format(self.data_format, ndim=5), - padding=self.padding.upper()) - - if not tf.executing_eagerly(): - # Infer the static output shape: - out_shape = self.compute_output_shape(inputs.shape) - outputs.set_shape(out_shape) - - if self.use_bias: - outputs = tf.nn.bias_add( - outputs, - self.bias, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - if self.activation is not None: - return self.activation(outputs) - return outputs - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - output_shape = list(input_shape) - if self.data_format == 'channels_first': - c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4 - else: - c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3 - - kernel_d, kernel_h, kernel_w = self.kernel_size - stride_d, stride_h, stride_w = self.strides - - if self.output_padding is None: - out_pad_d = out_pad_h = out_pad_w = None - else: - out_pad_d, out_pad_h, out_pad_w = self.output_padding - - output_shape[c_axis] = self.filters - output_shape[d_axis] = conv_utils.deconv_output_length( - output_shape[d_axis], - kernel_d, - padding=self.padding, - output_padding=out_pad_d, - stride=stride_d) - output_shape[h_axis] = 
conv_utils.deconv_output_length( - output_shape[h_axis], - kernel_h, - padding=self.padding, - output_padding=out_pad_h, - stride=stride_h) - output_shape[w_axis] = conv_utils.deconv_output_length( - output_shape[w_axis], - kernel_w, - padding=self.padding, - output_padding=out_pad_w, - stride=stride_w) - return tf.TensorShape(output_shape) - - def get_config(self): - config = super().get_config() - config.pop('dilation_rate') - config['output_padding'] = self.output_padding - return config + + Output shape: + 5D tensor with shape: + `(batch_size, filters, new_depth, new_rows, new_cols)` if + data_format='channels_first' + or 5D tensor with shape: + `(batch_size, new_depth, new_rows, new_cols, filters)` if + data_format='channels_last'. + `depth` and `rows` and `cols` values might have changed due to padding. + If `output_padding` is specified:: + ``` + new_depth = ((depth - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + + output_padding[0]) + new_rows = ((rows - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + + output_padding[1]) + new_cols = ((cols - 1) * strides[2] + kernel_size[2] - 2 * padding[2] + + output_padding[2]) + ``` + + Returns: + A tensor of rank 5 representing + `activation(conv3dtranspose(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. + + References: + - [A guide to convolution arithmetic for deep + learning](https://arxiv.org/abs/1603.07285v1) + - [Deconvolutional + Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) + """ + + @utils.allow_initializer_layout + def __init__( + self, + filters, + kernel_size, + strides=(1, 1, 1), + padding="valid", + output_padding=None, + data_format=None, + dilation_rate=(1, 1, 1), + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs, + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activations.get(activation), + use_bias=use_bias, + kernel_initializer=initializers.get(kernel_initializer), + bias_initializer=initializers.get(bias_initializer), + kernel_regularizer=regularizers.get(kernel_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + kernel_constraint=constraints.get(kernel_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs, + ) + + self.output_padding = output_padding + if self.output_padding is not None: + self.output_padding = conv_utils.normalize_tuple( + self.output_padding, 3, "output_padding", allow_zero=True + ) + for stride, out_pad in zip(self.strides, self.output_padding): + if out_pad >= stride: + raise ValueError( + "Strides must be greater than output padding. " + f"Received strides={self.strides}, " + f"output_padding={self.output_padding}." + ) + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape) + if len(input_shape) != 5: + raise ValueError( + "Inputs should have rank 5. " + f"Received input_shape={input_shape}." + ) + channel_axis = self._get_channel_axis() + if input_shape.dims[channel_axis].value is None: + raise ValueError( + "The channel dimension of the inputs " + "to `Conv3DTranspose` should be defined. 
" + f"The input_shape received is {input_shape}, " + f"where axis {channel_axis} (0-based) " + "is the channel dimension, which found to be `None`." + ) + input_dim = int(input_shape[channel_axis]) + kernel_shape = self.kernel_size + (self.filters, input_dim) + self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim}) + + self.kernel = self.add_weight( + "kernel", + shape=kernel_shape, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, + dtype=self.dtype, + ) + if self.use_bias: + self.bias = self.add_weight( + "bias", + shape=(self.filters,), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + trainable=True, + dtype=self.dtype, + ) + else: + self.bias = None + self.built = True + + def call(self, inputs): + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + if self.data_format == "channels_first": + d_axis, h_axis, w_axis = 2, 3, 4 + else: + d_axis, h_axis, w_axis = 1, 2, 3 + + depth = inputs_shape[d_axis] + height = inputs_shape[h_axis] + width = inputs_shape[w_axis] + + kernel_d, kernel_h, kernel_w = self.kernel_size + stride_d, stride_h, stride_w = self.strides + + if self.output_padding is None: + out_pad_d = out_pad_h = out_pad_w = None + else: + out_pad_d, out_pad_h, out_pad_w = self.output_padding + + # Infer the dynamic output shape: + out_depth = conv_utils.deconv_output_length( + depth, + kernel_d, + padding=self.padding, + output_padding=out_pad_d, + stride=stride_d, + ) + out_height = conv_utils.deconv_output_length( + height, + kernel_h, + padding=self.padding, + output_padding=out_pad_h, + stride=stride_h, + ) + out_width = conv_utils.deconv_output_length( + width, + kernel_w, + padding=self.padding, + output_padding=out_pad_w, + stride=stride_w, + ) + if self.data_format == "channels_first": + output_shape = ( + batch_size, + self.filters, + out_depth, + out_height, + out_width, + ) + strides = (1, 1, stride_d, stride_h, stride_w) + else: + output_shape = ( + batch_size, + out_depth, + out_height, + out_width, + self.filters, + ) + strides = (1, stride_d, stride_h, stride_w, 1) + + output_shape_tensor = tf.stack(output_shape) + outputs = tf.nn.conv3d_transpose( + inputs, + self.kernel, + output_shape_tensor, + strides, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=5 + ), + padding=self.padding.upper(), + ) + + if not tf.executing_eagerly() and inputs.shape.rank: + # Infer the static output shape: + out_shape = self.compute_output_shape(inputs.shape) + outputs.set_shape(out_shape) + + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, + self.bias, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + if self.activation is not None: + return self.activation(outputs) + return outputs + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + output_shape = list(input_shape) + if self.data_format == "channels_first": + c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4 + else: + c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3 + + kernel_d, kernel_h, kernel_w = self.kernel_size + stride_d, stride_h, stride_w = self.strides + + if self.output_padding is None: + out_pad_d = out_pad_h = out_pad_w = None + else: + out_pad_d, out_pad_h, out_pad_w = self.output_padding + + output_shape[c_axis] = self.filters + output_shape[d_axis] = conv_utils.deconv_output_length( + output_shape[d_axis], + kernel_d, + padding=self.padding, + 
output_padding=out_pad_d, + stride=stride_d, + ) + output_shape[h_axis] = conv_utils.deconv_output_length( + output_shape[h_axis], + kernel_h, + padding=self.padding, + output_padding=out_pad_h, + stride=stride_h, + ) + output_shape[w_axis] = conv_utils.deconv_output_length( + output_shape[w_axis], + kernel_w, + padding=self.padding, + output_padding=out_pad_w, + stride=stride_w, + ) + return tf.TensorShape(output_shape) + + def get_config(self): + config = super().get_config() + config.pop("dilation_rate") + config["output_padding"] = self.output_padding + return config + # Alias diff --git a/keras/layers/convolutional/conv_test.py b/keras/layers/convolutional/conv_test.py index 86aaf8eff75a..859a45cfbeb4 100644 --- a/keras/layers/convolutional/conv_test.py +++ b/keras/layers/convolutional/conv_test.py @@ -15,544 +15,666 @@ """Tests for convolutional layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) @test_combinations.run_all_keras_modes class Conv1DTest(test_combinations.TestCase): - - def _run_test(self, kwargs, expected_output_shape): - num_samples = 2 - stack_size = 3 - length = 7 - - with self.cached_session(): - test_utils.layer_test( - keras.layers.Conv1D, - kwargs=kwargs, - input_shape=(num_samples, length, stack_size), - expected_output_shape=expected_output_shape) - - def _run_test_extra_batch_dim(self, kwargs, expected_output_shape): - batch_shape = (2, 11) - stack_size = 3 - length = 7 - - with self.cached_session(): - if expected_output_shape is not None: - expected_output_shape = (None,) + expected_output_shape - - test_utils.layer_test( - keras.layers.Conv1D, - kwargs=kwargs, - input_shape=batch_shape + (length, stack_size), - expected_output_shape=expected_output_shape) - - @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }, (None, 5, 2)), - ('padding_same', { - 'padding': 'same' - }, (None, 7, 2)), - ('padding_same_dilation_2', { - 'padding': 'same', - 'dilation_rate': 2 - }, (None, 7, 2)), - ('padding_same_dilation_3', { - 'padding': 'same', - 'dilation_rate': 3 - }, (None, 7, 2)), - ('padding_causal', { - 'padding': 'causal' - }, (None, 7, 2)), - ('strides', { - 'strides': 2 - }, (None, 3, 2)), - ('dilation_rate', { - 'dilation_rate': 2 - }, (None, 3, 2)), - ('group', { - 'groups': 3, - 'filters': 6 - }, (None, 5, 6)), - ) - def test_conv1d(self, kwargs, expected_output_shape): - kwargs['filters'] = kwargs.get('filters', 2) - kwargs['kernel_size'] = 3 - self._run_test(kwargs, expected_output_shape) - self._run_test_extra_batch_dim(kwargs, expected_output_shape) - - def test_conv1d_regularizers(self): - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv1D(**kwargs) - layer.build((None, 5, 2)) - self.assertEqual(len(layer.losses), 2) - layer(keras.backend.variable(np.ones((1, 5, 2)))) - self.assertEqual(len(layer.losses), 3) - - def test_conv1d_constraints(self): - k_constraint = lambda x: x - b_constraint = lambda x: x - - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 
'kernel_constraint': k_constraint, - 'bias_constraint': b_constraint, - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv1D(**kwargs) - layer.build((None, 5, 2)) - self.assertEqual(layer.kernel.constraint, k_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) - - def test_conv1d_recreate_conv(self): - with self.cached_session(): - layer = keras.layers.Conv1D(filters=1, - kernel_size=3, - strides=1, - dilation_rate=2, - padding='causal') - inpt1 = np.random.normal(size=[1, 2, 1]) - inpt2 = np.random.normal(size=[1, 1, 1]) - outp1_shape = layer(inpt1).shape - _ = layer(inpt2).shape - self.assertEqual(outp1_shape, layer(inpt1).shape) - - def test_conv1d_recreate_conv_unknown_dims(self): - with self.cached_session(): - layer = keras.layers.Conv1D(filters=1, - kernel_size=3, - strides=1, - dilation_rate=2, - padding='causal') - - inpt1 = np.random.normal(size=[1, 9, 1]).astype(np.float32) - inpt2 = np.random.normal(size=[1, 2, 1]).astype(np.float32) - outp1_shape = layer(inpt1).shape - - @tf.function(input_signature=[ - tf.TensorSpec([1, None, 1])]) - def fn(inpt): - return layer(inpt) - - fn(inpt2) - self.assertEqual(outp1_shape, layer(inpt1).shape) - - def test_conv1d_invalid_output_shapes(self): - kwargs = {'filters': 2, 'kernel_size': 20} - with self.assertRaisesRegex( - ValueError, r"""One of the dimensions in the output is <= 0"""): - layer = keras.layers.Conv1D(**kwargs) - layer.build((None, 5, 2)) + def _run_test(self, kwargs, expected_output_shape): + num_samples = 2 + stack_size = 3 + length = 7 + + with self.cached_session(): + test_utils.layer_test( + keras.layers.Conv1D, + kwargs=kwargs, + input_shape=(num_samples, length, stack_size), + expected_output_shape=expected_output_shape, + ) + + def _run_test_extra_batch_dim(self, kwargs, expected_output_shape): + batch_shape = (2, 11) + stack_size = 3 + length = 7 + + with self.cached_session(): + if expected_output_shape is not None: + expected_output_shape = (None,) + expected_output_shape + + test_utils.layer_test( + keras.layers.Conv1D, + kwargs=kwargs, + input_shape=batch_shape + (length, stack_size), + expected_output_shape=expected_output_shape, + ) + + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}, (None, 5, 2)), + ("padding_same", {"padding": "same"}, (None, 7, 2)), + ( + "padding_same_dilation_2", + {"padding": "same", "dilation_rate": 2}, + (None, 7, 2), + ), + ( + "padding_same_dilation_3", + {"padding": "same", "dilation_rate": 3}, + (None, 7, 2), + ), + ("padding_causal", {"padding": "causal"}, (None, 7, 2)), + ("strides", {"strides": 2}, (None, 3, 2)), + ("dilation_rate", {"dilation_rate": 2}, (None, 3, 2)), + ("group", {"groups": 3, "filters": 6}, (None, 5, 6)), + ) + def test_conv1d(self, kwargs, expected_output_shape): + kwargs["filters"] = kwargs.get("filters", 2) + kwargs["kernel_size"] = 3 + self._run_test(kwargs, expected_output_shape) + self._run_test_extra_batch_dim(kwargs, expected_output_shape) + + def test_conv1d_regularizers(self): + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv1D(**kwargs) + layer.build((None, 5, 2)) + self.assertEqual(len(layer.losses), 2) + layer(keras.backend.variable(np.ones((1, 5, 2)))) + self.assertEqual(len(layer.losses), 3) + + def test_conv1d_constraints(self): + k_constraint = lambda x: x + b_constraint = lambda x: x + + 
kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_constraint": k_constraint, + "bias_constraint": b_constraint, + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv1D(**kwargs) + layer.build((None, 5, 2)) + self.assertEqual(layer.kernel.constraint, k_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) + + def test_conv1d_recreate_conv(self): + with self.cached_session(): + layer = keras.layers.Conv1D( + filters=1, + kernel_size=3, + strides=1, + dilation_rate=2, + padding="causal", + ) + inpt1 = np.random.normal(size=[1, 2, 1]) + inpt2 = np.random.normal(size=[1, 1, 1]) + outp1_shape = layer(inpt1).shape + _ = layer(inpt2).shape + self.assertEqual(outp1_shape, layer(inpt1).shape) + + def test_conv1d_recreate_conv_unknown_dims(self): + with self.cached_session(): + layer = keras.layers.Conv1D( + filters=1, + kernel_size=3, + strides=1, + dilation_rate=2, + padding="causal", + ) + + inpt1 = np.random.normal(size=[1, 9, 1]).astype(np.float32) + inpt2 = np.random.normal(size=[1, 2, 1]).astype(np.float32) + outp1_shape = layer(inpt1).shape + + @tf.function(input_signature=[tf.TensorSpec([1, None, 1])]) + def fn(inpt): + return layer(inpt) + + fn(inpt2) + self.assertEqual(outp1_shape, layer(inpt1).shape) + + def test_conv1d_invalid_output_shapes(self): + kwargs = {"filters": 2, "kernel_size": 20} + with self.assertRaisesRegex( + ValueError, r"""One of the dimensions in the output is <= 0""" + ): + layer = keras.layers.Conv1D(**kwargs) + layer.build((None, 5, 2)) + + def test_conv1d_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": 2, "dilation_rate": 2} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.Conv1D(filters=1, kernel_size=2, **kwargs) @test_combinations.run_all_keras_modes class Conv2DTest(test_combinations.TestCase): - - def _run_test(self, kwargs, expected_output_shape, spatial_shape=(7, 6)): - num_samples = 2 - stack_size = 3 - num_row, num_col = spatial_shape - input_data = None - # Generate valid input data. - if None in spatial_shape: - input_data_shape = (num_samples, num_row or 7, num_col or 6, stack_size) - input_data = 10 * np.random.random(input_data_shape).astype(np.float32) - - with self.cached_session(): - test_utils.layer_test( - keras.layers.Conv2D, - kwargs=kwargs, - input_shape=(num_samples, num_row, num_col, stack_size), - input_data=input_data, - expected_output_shape=expected_output_shape) - - def _run_test_extra_batch_dim(self, - kwargs, - expected_output_shape, - spatial_shape=(7, 6)): - batch_shape = (2, 11) - stack_size = 3 - num_row, num_col = spatial_shape - input_data = None - # Generate valid input data. 
- if None in spatial_shape: - input_data_shape = batch_shape + (num_row or 7, num_col or 6, stack_size) - input_data = 10 * np.random.random(input_data_shape).astype(np.float32) - - with self.cached_session(): - if expected_output_shape is not None: - expected_output_shape = (None,) + expected_output_shape - test_utils.layer_test( - keras.layers.Conv2D, - kwargs=kwargs, - input_shape=batch_shape + (num_row, num_col, stack_size), - input_data=input_data, - expected_output_shape=expected_output_shape) - - @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }, (None, 5, 4, 2)), - ('padding_same', { - 'padding': 'same' - }, (None, 7, 6, 2)), - ('padding_same_dilation_2', { - 'padding': 'same', - 'dilation_rate': 2 - }, (None, 7, 6, 2)), - ('strides', { - 'strides': (2, 2) - }, (None, 3, 2, 2)), - ('dilation_rate', { - 'dilation_rate': (2, 2) - }, (None, 3, 2, 2)), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. - ('data_format', { - 'data_format': 'channels_first' - }, None, True), - ('group', { - 'groups': 3, - 'filters': 6 - }, (None, 5, 4, 6), False), - ('dilation_2_unknown_width', { - 'dilation_rate': (2, 2) - }, (None, None, 2, 2), False, (None, 6)), - ('dilation_2_unknown_height', { - 'dilation_rate': (2, 2) - }, (None, 3, None, 2), False, (7, None)), - ) - def test_conv2d(self, - kwargs, - expected_output_shape=None, - requires_gpu=False, - spatial_shape=(7, 6)): - kwargs['filters'] = kwargs.get('filters', 2) - kwargs['kernel_size'] = (3, 3) - if not requires_gpu or tf.test.is_gpu_available(cuda_only=True): - self._run_test(kwargs, expected_output_shape, spatial_shape) - self._run_test_extra_batch_dim(kwargs, expected_output_shape, - spatial_shape) - - def test_conv2d_regularizers(self): - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv2D(**kwargs) - layer.build((None, 5, 5, 2)) - self.assertEqual(len(layer.losses), 2) - layer(keras.backend.variable(np.ones((1, 5, 5, 2)))) - self.assertEqual(len(layer.losses), 3) - - def test_conv2d_constraints(self): - k_constraint = lambda x: x - b_constraint = lambda x: x - - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_constraint': k_constraint, - 'bias_constraint': b_constraint, - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv2D(**kwargs) - layer.build((None, 5, 5, 2)) - self.assertEqual(layer.kernel.constraint, k_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) - - def test_conv2d_zero_kernel_size(self): - kwargs = {'filters': 2, 'kernel_size': 0} - with self.assertRaises(ValueError): - keras.layers.Conv2D(**kwargs) - - def test_conv2d_invalid_output_shapes(self): - kwargs = {'filters': 2, 'kernel_size': 20} - with self.assertRaisesRegex( - ValueError, r"""One of the dimensions in the output is <= 0"""): - layer = keras.layers.Conv2D(**kwargs) - layer.build((None, 5, 5, 2)) + def _run_test(self, kwargs, expected_output_shape, spatial_shape=(7, 6)): + num_samples = 2 + stack_size = 3 + num_row, num_col = spatial_shape + input_data = None + # Generate valid input data. 
+ if None in spatial_shape: + input_data_shape = ( + num_samples, + num_row or 7, + num_col or 6, + stack_size, + ) + input_data = 10 * np.random.random(input_data_shape).astype( + np.float32 + ) + + with self.cached_session(): + test_utils.layer_test( + keras.layers.Conv2D, + kwargs=kwargs, + input_shape=(num_samples, num_row, num_col, stack_size), + input_data=input_data, + expected_output_shape=expected_output_shape, + ) + + def _run_test_extra_batch_dim( + self, kwargs, expected_output_shape, spatial_shape=(7, 6) + ): + batch_shape = (2, 11) + stack_size = 3 + num_row, num_col = spatial_shape + input_data = None + # Generate valid input data. + if None in spatial_shape: + input_data_shape = batch_shape + ( + num_row or 7, + num_col or 6, + stack_size, + ) + input_data = 10 * np.random.random(input_data_shape).astype( + np.float32 + ) + + with self.cached_session(): + if expected_output_shape is not None: + expected_output_shape = (None,) + expected_output_shape + test_utils.layer_test( + keras.layers.Conv2D, + kwargs=kwargs, + input_shape=batch_shape + (num_row, num_col, stack_size), + input_data=input_data, + expected_output_shape=expected_output_shape, + ) + + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}, (None, 5, 4, 2)), + ("padding_same", {"padding": "same"}, (None, 7, 6, 2)), + ( + "padding_same_dilation_2", + {"padding": "same", "dilation_rate": 2}, + (None, 7, 6, 2), + ), + ("strides", {"strides": (2, 2)}, (None, 3, 2, 2)), + ("dilation_rate", {"dilation_rate": (2, 2)}, (None, 3, 2, 2)), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + ("data_format", {"data_format": "channels_first"}, None, True), + ("group", {"groups": 3, "filters": 6}, (None, 5, 4, 6), False), + ( + "dilation_2_unknown_width", + {"dilation_rate": (2, 2)}, + (None, None, 2, 2), + False, + (None, 6), + ), + ( + "dilation_2_unknown_height", + {"dilation_rate": (2, 2)}, + (None, 3, None, 2), + False, + (7, None), + ), + ) + def test_conv2d( + self, + kwargs, + expected_output_shape=None, + requires_gpu=False, + spatial_shape=(7, 6), + ): + kwargs["filters"] = kwargs.get("filters", 2) + kwargs["kernel_size"] = (3, 3) + if not requires_gpu or tf.test.is_gpu_available(cuda_only=True): + self._run_test(kwargs, expected_output_shape, spatial_shape) + self._run_test_extra_batch_dim( + kwargs, expected_output_shape, spatial_shape + ) + + def test_conv2d_regularizers(self): + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv2D(**kwargs) + layer.build((None, 5, 5, 2)) + self.assertEqual(len(layer.losses), 2) + layer(keras.backend.variable(np.ones((1, 5, 5, 2)))) + self.assertEqual(len(layer.losses), 3) + + def test_conv2d_constraints(self): + k_constraint = lambda x: x + b_constraint = lambda x: x + + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_constraint": k_constraint, + "bias_constraint": b_constraint, + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv2D(**kwargs) + layer.build((None, 5, 5, 2)) + self.assertEqual(layer.kernel.constraint, k_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) + + def test_conv2d_zero_kernel_size(self): + kwargs = {"filters": 2, "kernel_size": 0} + with self.assertRaises(ValueError): + keras.layers.Conv2D(**kwargs) + + def 
test_conv2d_invalid_output_shapes(self): + kwargs = {"filters": 2, "kernel_size": 20} + with self.assertRaisesRegex( + ValueError, r"""One of the dimensions in the output is <= 0""" + ): + layer = keras.layers.Conv2D(**kwargs) + layer.build((None, 5, 5, 2)) + + def test_conv2d_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": [1, 2], "dilation_rate": [2, 1]} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.Conv2D(filters=1, kernel_size=2, **kwargs) @test_combinations.run_all_keras_modes class Conv3DTest(test_combinations.TestCase): - - def _run_test(self, kwargs, expected_output_shape, validate_training=True): - num_samples = 2 - stack_size = 3 - num_row = 7 - num_col = 6 - depth = 5 - - with self.cached_session(): - test_utils.layer_test( - keras.layers.Conv3D, - kwargs=kwargs, - input_shape=(num_samples, depth, num_row, num_col, stack_size), - expected_output_shape=expected_output_shape, - validate_training=validate_training) - - def _run_test_extra_batch_dim(self, - kwargs, - expected_output_shape, - validate_training=True): - batch_shape = (2, 11) - stack_size = 3 - num_row = 7 - num_col = 6 - depth = 5 - - with self.cached_session(): - if expected_output_shape is not None: - expected_output_shape = (None,) + expected_output_shape - - test_utils.layer_test( - keras.layers.Conv3D, - kwargs=kwargs, - input_shape=batch_shape + (depth, num_row, num_col, stack_size), - expected_output_shape=expected_output_shape, - validate_training=validate_training) - - @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }, (None, 3, 5, 4, 2)), - ('padding_same', { - 'padding': 'same' - }, (None, 5, 7, 6, 2)), - ('strides', { - 'strides': (2, 2, 2) - }, (None, 2, 3, 2, 2)), - ('dilation_rate', { - 'dilation_rate': (2, 2, 2) - }, (None, 1, 3, 2, 2)), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. 
- ('data_format', { - 'data_format': 'channels_first' - }, None, True), - ('group', { - 'groups': 3, - 'filters': 6 - }, (None, 3, 5, 4, 6)), - ) - def test_conv3d(self, kwargs, expected_output_shape=None, requires_gpu=False): - kwargs['filters'] = kwargs.get('filters', 2) - kwargs['kernel_size'] = (3, 3, 3) - # train_on_batch currently fails with XLA enabled on GPUs - test_training = 'groups' not in kwargs or not tf_test_utils.is_xla_enabled() - if not requires_gpu or tf.test.is_gpu_available(cuda_only=True): - self._run_test(kwargs, expected_output_shape, test_training) - self._run_test_extra_batch_dim(kwargs, expected_output_shape, - test_training) - - def test_conv3d_regularizers(self): - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv3D(**kwargs) - layer.build((None, 5, 5, 5, 2)) - self.assertEqual(len(layer.losses), 2) - self.assertEqual(len(layer.losses), 2) - layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2)))) - self.assertEqual(len(layer.losses), 3) - - def test_conv3d_constraints(self): - k_constraint = lambda x: x - b_constraint = lambda x: x - - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_constraint': k_constraint, - 'bias_constraint': b_constraint, - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv3D(**kwargs) - layer.build((None, 5, 5, 5, 2)) - self.assertEqual(layer.kernel.constraint, k_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) - - def test_conv3d_dynamic_shape(self): - input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32) - with self.cached_session(): - # Won't raise error here. 
- test_utils.layer_test( - keras.layers.Conv3D, - kwargs={ - 'data_format': 'channels_last', - 'filters': 3, - 'kernel_size': 3 - }, - input_shape=(None, None, None, None, 3), - input_data=input_data) - if tf.test.is_gpu_available(cuda_only=True): - test_utils.layer_test( - keras.layers.Conv3D, - kwargs={ - 'data_format': 'channels_first', - 'filters': 3, - 'kernel_size': 3 - }, - input_shape=(None, 3, None, None, None), - input_data=input_data) - - def test_conv3d_invalid_output_shapes(self): - kwargs = {'filters': 2, 'kernel_size': 20} - with self.assertRaisesRegex( - ValueError, r"""One of the dimensions in the output is <= 0"""): - layer = keras.layers.Conv3D(**kwargs) - layer.build((None, 5, 5, 5, 2)) + def _run_test(self, kwargs, expected_output_shape, validate_training=True): + num_samples = 2 + stack_size = 3 + num_row = 7 + num_col = 6 + depth = 5 + + with self.cached_session(): + test_utils.layer_test( + keras.layers.Conv3D, + kwargs=kwargs, + input_shape=(num_samples, depth, num_row, num_col, stack_size), + expected_output_shape=expected_output_shape, + validate_training=validate_training, + ) + + def _run_test_extra_batch_dim( + self, kwargs, expected_output_shape, validate_training=True + ): + batch_shape = (2, 11) + stack_size = 3 + num_row = 7 + num_col = 6 + depth = 5 + + with self.cached_session(): + if expected_output_shape is not None: + expected_output_shape = (None,) + expected_output_shape + + test_utils.layer_test( + keras.layers.Conv3D, + kwargs=kwargs, + input_shape=batch_shape + (depth, num_row, num_col, stack_size), + expected_output_shape=expected_output_shape, + validate_training=validate_training, + ) + + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}, (None, 3, 5, 4, 2)), + ("padding_same", {"padding": "same"}, (None, 5, 7, 6, 2)), + ("strides", {"strides": (2, 2, 2)}, (None, 2, 3, 2, 2)), + ("dilation_rate", {"dilation_rate": (2, 2, 2)}, (None, 1, 3, 2, 2)), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. 
+ ("data_format", {"data_format": "channels_first"}, None, True), + ("group", {"groups": 3, "filters": 6}, (None, 3, 5, 4, 6)), + ) + def test_conv3d( + self, kwargs, expected_output_shape=None, requires_gpu=False + ): + kwargs["filters"] = kwargs.get("filters", 2) + kwargs["kernel_size"] = (3, 3, 3) + # train_on_batch currently fails with XLA enabled on GPUs + test_training = ( + "groups" not in kwargs or not tf_test_utils.is_xla_enabled() + ) + if not requires_gpu or tf.test.is_gpu_available(cuda_only=True): + self._run_test(kwargs, expected_output_shape, test_training) + self._run_test_extra_batch_dim( + kwargs, expected_output_shape, test_training + ) + + def test_conv3d_regularizers(self): + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv3D(**kwargs) + layer.build((None, 5, 5, 5, 2)) + self.assertEqual(len(layer.losses), 2) + self.assertEqual(len(layer.losses), 2) + layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2)))) + self.assertEqual(len(layer.losses), 3) + + def test_conv3d_constraints(self): + k_constraint = lambda x: x + b_constraint = lambda x: x + + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_constraint": k_constraint, + "bias_constraint": b_constraint, + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv3D(**kwargs) + layer.build((None, 5, 5, 5, 2)) + self.assertEqual(layer.kernel.constraint, k_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) + + def test_conv3d_dynamic_shape(self): + input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32) + with self.cached_session(): + # Won't raise error here. 
+ test_utils.layer_test( + keras.layers.Conv3D, + kwargs={ + "data_format": "channels_last", + "filters": 3, + "kernel_size": 3, + }, + input_shape=(None, None, None, None, 3), + input_data=input_data, + ) + if tf.test.is_gpu_available(cuda_only=True): + test_utils.layer_test( + keras.layers.Conv3D, + kwargs={ + "data_format": "channels_first", + "filters": 3, + "kernel_size": 3, + }, + input_shape=(None, 3, None, None, None), + input_data=input_data, + ) + + def test_conv3d_invalid_output_shapes(self): + kwargs = {"filters": 2, "kernel_size": 20} + with self.assertRaisesRegex( + ValueError, r"""One of the dimensions in the output is <= 0""" + ): + layer = keras.layers.Conv3D(**kwargs) + layer.build((None, 5, 5, 5, 2)) + + def test_conv3d_zero_dim_output(self): + conv = keras.layers.Convolution3DTranspose(2, [3, 3, 3], padding="same") + x = tf.random.uniform([1, 32, 32, 0, 3], dtype=tf.float32) + # The layer doesn't crash with 0 dim input + _ = conv(x) + + def test_conv3d_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": [1, 1, 2], "dilation_rate": [1, 2, 1]} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.Conv3D(filters=1, kernel_size=2, **kwargs) @test_combinations.run_all_keras_modes(always_skip_v1=True) class GroupedConvTest(test_combinations.TestCase): - - @parameterized.named_parameters( - ('Conv1D', keras.layers.Conv1D), - ('Conv2D', keras.layers.Conv2D), - ('Conv3D', keras.layers.Conv3D), - ) - def test_group_conv_incorrect_use(self, layer): - with self.assertRaisesRegex(ValueError, 'The number of filters'): - layer(16, 3, groups=3) - with self.assertRaisesRegex(ValueError, 'The number of input channels'): - layer(16, 3, groups=4).build((32, 12, 12, 3)) - - @parameterized.named_parameters( - ('Conv1D', keras.layers.Conv1D, (32, 12, 32)), - ('Conv2D', keras.layers.Conv2D, (32, 12, 12, 32)), - ('Conv3D', keras.layers.Conv3D, (32, 12, 12, 12, 32)), - ) - def test_group_conv(self, layer_cls, input_shape): - if tf.test.is_gpu_available(cuda_only=True): - with test_utils.use_gpu(): - inputs = tf.random.uniform(shape=input_shape) - - layer = layer_cls(16, 3, groups=4, use_bias=False) - layer.build(input_shape) - - input_slices = tf.split(inputs, 4, axis=-1) - weight_slices = tf.split(layer.kernel, 4, axis=-1) - expected_outputs = tf.concat([ - tf.nn.convolution(inputs, weights) - for inputs, weights in zip(input_slices, weight_slices) - ], - axis=-1) - self.assertAllClose( - layer(inputs), expected_outputs, rtol=3e-5, atol=3e-5) - - def test_group_conv_depthwise(self): - if tf.test.is_gpu_available(cuda_only=True): - with test_utils.use_gpu(): - inputs = tf.random.uniform(shape=(3, 27, 27, 32)) - - layer = keras.layers.Conv2D(32, 3, groups=32, use_bias=False) - layer.build((3, 27, 27, 32)) - - weights_dw = tf.reshape(layer.kernel, [3, 3, 32, 1]) - expected_outputs = tf.compat.v1.nn.depthwise_conv2d( - inputs, weights_dw, strides=[1, 1, 1, 1], padding='VALID') - - self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) + @parameterized.named_parameters( + ("Conv1D", keras.layers.Conv1D), + ("Conv2D", keras.layers.Conv2D), + ("Conv3D", keras.layers.Conv3D), + ) + def test_group_conv_incorrect_use(self, layer): + with self.assertRaisesRegex(ValueError, "The number of filters"): + layer(16, 3, groups=3) + with self.assertRaisesRegex(ValueError, "The number of input channels"): + layer(16, 3, groups=4).build((32, 12, 12, 3)) + + @parameterized.named_parameters( + ("Conv1D", keras.layers.Conv1D, (32, 12, 
32)), + ("Conv2D", keras.layers.Conv2D, (32, 12, 12, 32)), + ("Conv3D", keras.layers.Conv3D, (32, 12, 12, 12, 32)), + ) + def test_group_conv(self, layer_cls, input_shape): + if tf.test.is_gpu_available(cuda_only=True): + with test_utils.use_gpu(): + inputs = tf.random.uniform(shape=input_shape) + + layer = layer_cls(16, 3, groups=4, use_bias=False) + layer.build(input_shape) + + input_slices = tf.split(inputs, 4, axis=-1) + weight_slices = tf.split(layer.kernel, 4, axis=-1) + expected_outputs = tf.concat( + [ + tf.nn.convolution(inputs, weights) + for inputs, weights in zip(input_slices, weight_slices) + ], + axis=-1, + ) + self.assertAllClose( + layer(inputs), expected_outputs, rtol=3e-5, atol=3e-5 + ) + + def test_group_conv_depthwise(self): + if tf.test.is_gpu_available(cuda_only=True): + with test_utils.use_gpu(): + inputs = tf.random.uniform(shape=(3, 27, 27, 32)) + + layer = keras.layers.Conv2D(32, 3, groups=32, use_bias=False) + layer.build((3, 27, 27, 32)) + + weights_dw = tf.reshape(layer.kernel, [3, 3, 32, 1]) + expected_outputs = tf.compat.v1.nn.depthwise_conv2d( + inputs, weights_dw, strides=[1, 1, 1, 1], padding="VALID" + ) + + self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) @test_combinations.run_all_keras_modes class ConvSequentialTest(test_combinations.TestCase): - - def _run_test(self, conv_layer_cls, kwargs, input_shape1, input_shape2, - expected_output_shape1, expected_output_shape2): - kwargs['filters'] = 1 - kwargs['kernel_size'] = 3 - kwargs['dilation_rate'] = 2 - with self.cached_session(): - layer = conv_layer_cls(**kwargs) - output1 = layer(np.zeros(input_shape1)) - self.assertEqual(output1.shape, expected_output_shape1) - output2 = layer(np.zeros(input_shape2)) - self.assertEqual(output2.shape, expected_output_shape2) - - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}, - (1, 8, 2), (1, 5, 2), (1, 4, 1), (1, 1, 1)), - ('padding_same', {'padding': 'same'}, - (1, 8, 2), (1, 5, 2), (1, 8, 1), (1, 5, 1)), - ('padding_causal', {'padding': 'causal'}, - (1, 8, 2), (1, 5, 2), (1, 8, 1), (1, 5, 1)), - ) - def test_conv1d(self, kwargs, input_shape1, input_shape2, - expected_output_shape1, expected_output_shape2): - self._run_test(keras.layers.Conv1D, kwargs, input_shape1, input_shape2, - expected_output_shape1, expected_output_shape2) - - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}, - (1, 7, 6, 2), (1, 6, 5, 2), (1, 3, 2, 1), (1, 2, 1, 1)), - ('padding_same', {'padding': 'same'}, - (1, 7, 6, 2), (1, 6, 5, 2), (1, 7, 6, 1), (1, 6, 5, 1)), - ) - def test_conv2d(self, kwargs, input_shape1, input_shape2, - expected_output_shape1, expected_output_shape2): - self._run_test(keras.layers.Conv2D, kwargs, input_shape1, input_shape2, - expected_output_shape1, expected_output_shape2) - - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}, - (1, 5, 7, 6, 2), (1, 8, 6, 5, 2), (1, 1, 3, 2, 1), (1, 4, 2, 1, 1)), - ('padding_same', {'padding': 'same'}, - (1, 5, 7, 6, 2), (1, 8, 6, 5, 2), (1, 5, 7, 6, 1), (1, 8, 6, 5, 1)), - ) - def test_conv3d(self, kwargs, input_shape1, input_shape2, - expected_output_shape1, expected_output_shape2): - self._run_test(keras.layers.Conv3D, kwargs, input_shape1, input_shape2, - expected_output_shape1, expected_output_shape2) - - def test_dynamic_shape(self): - with self.cached_session(): - layer = keras.layers.Conv3D(2, 3) - input_shape = (5, None, None, 2) - inputs = keras.Input(shape=input_shape) - x = layer(inputs) - # Won't raise error here with None values in 
input shape (b/144282043). - layer(x) - -if __name__ == '__main__': - tf.test.main() + def _run_test( + self, + conv_layer_cls, + kwargs, + input_shape1, + input_shape2, + expected_output_shape1, + expected_output_shape2, + ): + kwargs["filters"] = 1 + kwargs["kernel_size"] = 3 + kwargs["dilation_rate"] = 2 + with self.cached_session(): + layer = conv_layer_cls(**kwargs) + output1 = layer(np.zeros(input_shape1)) + self.assertEqual(output1.shape, expected_output_shape1) + output2 = layer(np.zeros(input_shape2)) + self.assertEqual(output2.shape, expected_output_shape2) + + @parameterized.named_parameters( + ( + "padding_valid", + {"padding": "valid"}, + (1, 8, 2), + (1, 5, 2), + (1, 4, 1), + (1, 1, 1), + ), + ( + "padding_same", + {"padding": "same"}, + (1, 8, 2), + (1, 5, 2), + (1, 8, 1), + (1, 5, 1), + ), + ( + "padding_causal", + {"padding": "causal"}, + (1, 8, 2), + (1, 5, 2), + (1, 8, 1), + (1, 5, 1), + ), + ) + def test_conv1d( + self, + kwargs, + input_shape1, + input_shape2, + expected_output_shape1, + expected_output_shape2, + ): + self._run_test( + keras.layers.Conv1D, + kwargs, + input_shape1, + input_shape2, + expected_output_shape1, + expected_output_shape2, + ) + + @parameterized.named_parameters( + ( + "padding_valid", + {"padding": "valid"}, + (1, 7, 6, 2), + (1, 6, 5, 2), + (1, 3, 2, 1), + (1, 2, 1, 1), + ), + ( + "padding_same", + {"padding": "same"}, + (1, 7, 6, 2), + (1, 6, 5, 2), + (1, 7, 6, 1), + (1, 6, 5, 1), + ), + ) + def test_conv2d( + self, + kwargs, + input_shape1, + input_shape2, + expected_output_shape1, + expected_output_shape2, + ): + self._run_test( + keras.layers.Conv2D, + kwargs, + input_shape1, + input_shape2, + expected_output_shape1, + expected_output_shape2, + ) + + @parameterized.named_parameters( + ( + "padding_valid", + {"padding": "valid"}, + (1, 5, 7, 6, 2), + (1, 8, 6, 5, 2), + (1, 1, 3, 2, 1), + (1, 4, 2, 1, 1), + ), + ( + "padding_same", + {"padding": "same"}, + (1, 5, 7, 6, 2), + (1, 8, 6, 5, 2), + (1, 5, 7, 6, 1), + (1, 8, 6, 5, 1), + ), + ) + def test_conv3d( + self, + kwargs, + input_shape1, + input_shape2, + expected_output_shape1, + expected_output_shape2, + ): + self._run_test( + keras.layers.Conv3D, + kwargs, + input_shape1, + input_shape2, + expected_output_shape1, + expected_output_shape2, + ) + + def test_dynamic_shape(self): + with self.cached_session(): + layer = keras.layers.Conv3D(2, 3) + input_shape = (5, None, None, 2) + inputs = keras.Input(shape=input_shape) + x = layer(inputs) + # Won't raise error here with None values in input shape + # (b/144282043). 
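
For context on test_dynamic_shape above: in the functional API, `None` entries in `Input(shape=...)` stand for dimensions that are unknown until runtime, and conv layers must build and infer output shapes without choking on them (the b/144282043 reference above tracked a crash in exactly this case). A minimal sketch of the same pattern (assuming TF 2.x; not part of the diff):

import tensorflow as tf

# Only the depth (5) and channel (2) dimensions are known up front;
# height and width stay undefined until runtime.
inputs = tf.keras.Input(shape=(5, None, None, 2))
layer = tf.keras.layers.Conv3D(2, 3)
x = layer(inputs)
# Unknown dimensions propagate as None in the inferred shape.
print(x.shape)  # (None, 3, None, None, 2)
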
+ layer(x) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/convolutional/conv_transpose_test.py b/keras/layers/convolutional/conv_transpose_test.py index 48823996fb45..6747773371ed 100644 --- a/keras/layers/convolutional/conv_transpose_test.py +++ b/keras/layers/convolutional/conv_transpose_test.py @@ -14,245 +14,278 @@ # ============================================================================== """Tests for convolutional transpose layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class Conv1DTransposeTest(test_combinations.TestCase): + def _run_test(self, kwargs, expected_output_shape): + num_samples = 2 + stack_size = 3 + num_col = 6 - def _run_test(self, kwargs, expected_output_shape): - num_samples = 2 - stack_size = 3 - num_col = 6 + with test_utils.use_gpu(): + test_utils.layer_test( + keras.layers.Conv1DTranspose, + kwargs=kwargs, + input_shape=(num_samples, num_col, stack_size), + expected_output_shape=expected_output_shape, + ) - with test_utils.use_gpu(): - test_utils.layer_test( - keras.layers.Conv1DTranspose, - kwargs=kwargs, - input_shape=(num_samples, num_col, stack_size), - expected_output_shape=expected_output_shape) + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}, (None, 8, 2)), + ("padding_same", {"padding": "same"}, (None, 6, 2)), + ("strides", {"strides": 2}, (None, 13, 2)), + # Only runs on GPU with CUDA, dilation_rate>1 is not supported on CPU. + ("dilation_rate", {"dilation_rate": 2}, (None, 10, 2)), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + ("data_format", {"data_format": "channels_first"}), + ) + def test_conv1d_transpose(self, kwargs, expected_output_shape=None): + kwargs["filters"] = 2 + kwargs["kernel_size"] = 3 + if ( + "data_format" not in kwargs and "dilation_rate" not in kwargs + ) or tf.test.is_gpu_available(cuda_only=True): + self._run_test(kwargs, expected_output_shape) - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}, (None, 8, 2)), - ('padding_same', {'padding': 'same'}, (None, 6, 2)), - ('strides', {'strides': 2}, (None, 13, 2)), - # Only runs on GPU with CUDA, dilation_rate>1 is not supported on CPU. - ('dilation_rate', {'dilation_rate': 2}, (None, 10, 2)), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. 
- ('data_format', {'data_format': 'channels_first'}), - ) - def test_conv1d_transpose(self, kwargs, expected_output_shape=None): - kwargs['filters'] = 2 - kwargs['kernel_size'] = 3 - if (('data_format' not in kwargs and 'dilation_rate' not in kwargs) or - tf.test.is_gpu_available(cuda_only=True)): - self._run_test(kwargs, expected_output_shape) + def test_conv1d_transpose_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": 2, "dilation_rate": 2} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.Conv1DTranspose(filters=1, kernel_size=2, **kwargs) @test_combinations.run_all_keras_modes class Conv2DTransposeTest(test_combinations.TestCase): + def _run_test(self, kwargs): + num_samples = 2 + stack_size = 3 + num_row = 7 + num_col = 6 + + with self.cached_session(): + test_utils.layer_test( + keras.layers.Conv2DTranspose, + kwargs=kwargs, + input_shape=(num_samples, num_row, num_col, stack_size), + ) - def _run_test(self, kwargs): - num_samples = 2 - stack_size = 3 - num_row = 7 - num_col = 6 + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}), + ("padding_same", {"padding": "same"}), + ("strides", {"strides": (2, 2)}), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + ("data_format", {"data_format": "channels_first"}), + ( + "strides_output_padding", + {"strides": (2, 2), "output_padding": (1, 1)}, + ), + ) + def test_conv2d_transpose(self, kwargs): + kwargs["filters"] = 2 + kwargs["kernel_size"] = (3, 3) + if "data_format" not in kwargs or tf.test.is_gpu_available( + cuda_only=True + ): + self._run_test(kwargs) - with self.cached_session(): - test_utils.layer_test( - keras.layers.Conv2DTranspose, - kwargs=kwargs, - input_shape=(num_samples, num_row, num_col, stack_size)) + def test_conv2d_transpose_regularizers(self): + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv2DTranspose(**kwargs) + layer.build((None, 5, 5, 2)) + self.assertEqual(len(layer.losses), 2) + layer(keras.backend.variable(np.ones((1, 5, 5, 2)))) + self.assertEqual(len(layer.losses), 3) - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}), - ('padding_same', {'padding': 'same'}), - ('strides', {'strides': (2, 2)}), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. 
- ('data_format', {'data_format': 'channels_first'}), - ('strides_output_padding', {'strides': (2, 2), 'output_padding': (1, 1)}), - ) - def test_conv2d_transpose(self, kwargs): - kwargs['filters'] = 2 - kwargs['kernel_size'] = (3, 3) - if 'data_format' not in kwargs or tf.test.is_gpu_available(cuda_only=True): - self._run_test(kwargs) + def test_conv2d_transpose_constraints(self): + k_constraint = lambda x: x + b_constraint = lambda x: x - def test_conv2d_transpose_regularizers(self): - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv2DTranspose(**kwargs) - layer.build((None, 5, 5, 2)) - self.assertEqual(len(layer.losses), 2) - layer(keras.backend.variable(np.ones((1, 5, 5, 2)))) - self.assertEqual(len(layer.losses), 3) + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_constraint": k_constraint, + "bias_constraint": b_constraint, + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv2DTranspose(**kwargs) + layer.build((None, 5, 5, 2)) + self.assertEqual(layer.kernel.constraint, k_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) - def test_conv2d_transpose_constraints(self): - k_constraint = lambda x: x - b_constraint = lambda x: x + def test_conv2d_transpose_dilation(self): + test_utils.layer_test( + keras.layers.Conv2DTranspose, + kwargs={ + "filters": 2, + "kernel_size": 3, + "padding": "same", + "data_format": "channels_last", + "dilation_rate": (2, 2), + }, + input_shape=(2, 5, 6, 3), + ) - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_constraint': k_constraint, - 'bias_constraint': b_constraint, - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv2DTranspose(**kwargs) - layer.build((None, 5, 5, 2)) - self.assertEqual(layer.kernel.constraint, k_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) + input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32) - def test_conv2d_transpose_dilation(self): - test_utils.layer_test( - keras.layers.Conv2DTranspose, - kwargs={'filters': 2, - 'kernel_size': 3, - 'padding': 'same', - 'data_format': 'channels_last', - 'dilation_rate': (2, 2)}, - input_shape=(2, 5, 6, 3)) + expected_output = np.float32( + [ + [192, 228, 192, 228], + [336, 372, 336, 372], + [192, 228, 192, 228], + [336, 372, 336, 372], + ] + ).reshape((1, 4, 4, 1)) + test_utils.layer_test( + keras.layers.Conv2DTranspose, + input_data=input_data, + kwargs={ + "filters": 1, + "kernel_size": 3, + "padding": "same", + "data_format": "channels_last", + "dilation_rate": (2, 2), + "kernel_initializer": "ones", + }, + expected_output=expected_output, + ) - input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32) - # pylint: disable=too-many-function-args - expected_output = np.float32([ - [192, 228, 192, 228], - [336, 372, 336, 372], - [192, 228, 192, 228], - [336, 372, 336, 372] - ]).reshape((1, 4, 4, 1)) - test_utils.layer_test(keras.layers.Conv2DTranspose, - input_data=input_data, - kwargs={'filters': 1, - 'kernel_size': 3, - 'padding': 'same', - 'data_format': 'channels_last', - 'dilation_rate': (2, 2), - 'kernel_initializer': 'ones'}, - expected_output=expected_output) + def test_conv2d_transpose_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": [2, 1], "dilation_rate": [2, 1]} + with self.assertRaisesRegex( + ValueError, 
r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.Conv2DTranspose(filters=1, kernel_size=2, **kwargs) @test_combinations.run_all_keras_modes class Conv3DTransposeTest(test_combinations.TestCase): + def _run_test(self, kwargs, expected_output_shape): + num_samples = 2 + stack_size = 3 + num_row = 7 + num_col = 6 + depth = 5 - def _run_test(self, kwargs, expected_output_shape): - num_samples = 2 - stack_size = 3 - num_row = 7 - num_col = 6 - depth = 5 + with test_utils.use_gpu(): + test_utils.layer_test( + keras.layers.Conv3DTranspose, + kwargs=kwargs, + input_shape=(num_samples, depth, num_row, num_col, stack_size), + expected_output_shape=expected_output_shape, + ) - with test_utils.use_gpu(): - test_utils.layer_test( - keras.layers.Conv3DTranspose, - kwargs=kwargs, - input_shape=(num_samples, depth, num_row, num_col, stack_size), - expected_output_shape=expected_output_shape) + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}, (None, 7, 9, 8, 2)), + ("padding_same", {"padding": "same"}, (None, 5, 7, 6, 2)), + ("strides", {"strides": (2, 2, 2)}, (None, 11, 15, 13, 2)), + ("dilation_rate", {"dilation_rate": (2, 2, 2)}, (None, 7, 9, 8, 2)), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + ("data_format", {"data_format": "channels_first"}), + ( + "strides_output_padding", + {"strides": (2, 2, 2), "output_padding": (1, 1, 1)}, + (None, 12, 16, 14, 2), + ), + ) + def test_conv3d_transpose(self, kwargs, expected_output_shape=None): + kwargs["filters"] = 2 + kwargs["kernel_size"] = (3, 3, 3) + if "data_format" not in kwargs or tf.test.is_gpu_available( + cuda_only=True + ): + self._run_test(kwargs, expected_output_shape) - @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }, (None, 7, 9, 8, 2)), - ('padding_same', { - 'padding': 'same' - }, (None, 5, 7, 6, 2)), - ('strides', { - 'strides': (2, 2, 2) - }, (None, 11, 15, 13, 2)), - ('dilation_rate', { - 'dilation_rate': (2, 2, 2) - }, (None, 7, 9, 8, 2)), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. 
- ('data_format', { - 'data_format': 'channels_first' - }), - ('strides_output_padding', { - 'strides': (2, 2, 2), - 'output_padding': (1, 1, 1) - }, (None, 12, 16, 14, 2)), - ) - def test_conv3d_transpose(self, kwargs, expected_output_shape=None): - kwargs['filters'] = 2 - kwargs['kernel_size'] = (3, 3, 3) - if 'data_format' not in kwargs or tf.test.is_gpu_available(cuda_only=True): - self._run_test(kwargs, expected_output_shape) + def test_conv3d_transpose_regularizers(self): + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv3DTranspose(**kwargs) + layer.build((None, 5, 5, 5, 2)) + self.assertEqual(len(layer.losses), 2) + layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2)))) + self.assertEqual(len(layer.losses), 3) - def test_conv3d_transpose_regularizers(self): - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv3DTranspose(**kwargs) - layer.build((None, 5, 5, 5, 2)) - self.assertEqual(len(layer.losses), 2) - layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2)))) - self.assertEqual(len(layer.losses), 3) + def test_conv3d_transpose_constraints(self): + k_constraint = lambda x: x + b_constraint = lambda x: x - def test_conv3d_transpose_constraints(self): - k_constraint = lambda x: x - b_constraint = lambda x: x + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "kernel_constraint": k_constraint, + "bias_constraint": b_constraint, + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.Conv3DTranspose(**kwargs) + layer.build((None, 5, 5, 5, 2)) + self.assertEqual(layer.kernel.constraint, k_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'kernel_constraint': k_constraint, - 'bias_constraint': b_constraint, - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.Conv3DTranspose(**kwargs) - layer.build((None, 5, 5, 5, 2)) - self.assertEqual(layer.kernel.constraint, k_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) + def test_conv3d_transpose_dynamic_shape(self): + input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32) + with self.cached_session(): + # Won't raise error here. + test_utils.layer_test( + keras.layers.Conv3DTranspose, + kwargs={ + "data_format": "channels_last", + "filters": 3, + "kernel_size": 3, + }, + input_shape=(None, None, None, None, 3), + input_data=input_data, + ) + if tf.test.is_gpu_available(cuda_only=True): + test_utils.layer_test( + keras.layers.Conv3DTranspose, + kwargs={ + "data_format": "channels_first", + "filters": 3, + "kernel_size": 3, + }, + input_shape=(None, 3, None, None, None), + input_data=input_data, + ) + + def test_conv3d_transpose_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": [2, 2, 1], "dilation_rate": [2, 2, 1]} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.Conv3DTranspose(filters=1, kernel_size=2, **kwargs) - def test_conv3d_transpose_dynamic_shape(self): - input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32) - with self.cached_session(): - # Won't raise error here. 
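
For context: the expected shapes in the Conv3DTranspose parameter lists above follow the usual transposed-convolution length rule, `(input - 1) * stride + kernel` for `valid` padding and `input * stride` for `same`, plus any explicit `output_padding`. A small sketch of that arithmetic (a hypothetical helper, not the Keras implementation; it covers only the non-dilated cases exercised above):

def deconv_length(input_len, stride, kernel, padding, output_padding=0):
    # Transposed-convolution output length (non-dilated, kernel >= stride).
    if padding == "same":
        return input_len * stride
    return (input_len - 1) * stride + kernel + output_padding

# Input spatial dims (5, 7, 6) with kernel 3, as in the test:
assert [deconv_length(d, 1, 3, "valid") for d in (5, 7, 6)] == [7, 9, 8]
assert [deconv_length(d, 1, 3, "same") for d in (5, 7, 6)] == [5, 7, 6]
assert [deconv_length(d, 2, 3, "valid") for d in (5, 7, 6)] == [11, 15, 13]
assert [
    deconv_length(d, 2, 3, "valid", output_padding=1) for d in (5, 7, 6)
] == [12, 16, 14]

The invalid-strides tests nearby pin down a related constraint: the layers reject `strides > 1` combined with `dilation_rate > 1`, since the underlying convolution ops do not support both at once.
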
- test_utils.layer_test( - keras.layers.Conv3DTranspose, - kwargs={ - 'data_format': 'channels_last', - 'filters': 3, - 'kernel_size': 3 - }, - input_shape=(None, None, None, None, 3), - input_data=input_data) - if tf.test.is_gpu_available(cuda_only=True): - test_utils.layer_test( - keras.layers.Conv3DTranspose, - kwargs={ - 'data_format': 'channels_first', - 'filters': 3, - 'kernel_size': 3 - }, - input_shape=(None, 3, None, None, None), - input_data=input_data) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/convolutional/depthwise_conv1d.py b/keras/layers/convolutional/depthwise_conv1d.py index 8c9a1581c58e..b1cca7a37353 100644 --- a/keras/layers/convolutional/depthwise_conv1d.py +++ b/keras/layers/convolutional/depthwise_conv1d.py @@ -13,188 +13,205 @@ # limitations under the License. # ============================================================================== """Keras depthwise 1D convolution.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras.layers.convolutional.base_depthwise_conv import DepthwiseConv from keras.utils import conv_utils from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.DepthwiseConv1D') +@keras_export("keras.layers.DepthwiseConv1D") class DepthwiseConv1D(DepthwiseConv): - """Depthwise 1D convolution. - - Depthwise convolution is a type of convolution in which each input channel is - convolved with a different kernel (called a depthwise kernel). You - can understand depthwise convolution as the first step in a depthwise - separable convolution. - - It is implemented via the following steps: - - - Split the input into individual channels. - - Convolve each channel with an individual depthwise kernel with - `depth_multiplier` output channels. - - Concatenate the convolved outputs along the channels axis. - - Unlike a regular 1D convolution, depthwise convolution does not mix - information across different input channels. - - The `depth_multiplier` argument determines how many filter are applied to one - input channel. As such, it controls the amount of output channels that are - generated per input channel in the depthwise step. - - Args: - kernel_size: An integer, specifying the height and width of the 1D - convolution window. Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer, specifying the strides of the convolution along the - height and width. Can be a single integer to specify the same value for - all spatial dimensions. Specifying any stride value != 1 is incompatible - with specifying any `dilation_rate` value != 1. - padding: one of `'valid'` or `'same'` (case-insensitive). `"valid"` means no - padding. `"same"` results in padding with zeros evenly to the left/right - or up/down of the input such that output has the same height/width - dimension as the input. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `filters_in * depth_multiplier`. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. 
`channels_last` corresponds - to inputs with shape `(batch_size, height, width, channels)` while - `channels_first` corresponds to inputs with shape `(batch_size, channels, - height, width)`. It defaults to the `image_data_format` value found in - your Keras config file at `~/.keras/keras.json`. If you never set it, then - it will be 'channels_last'. - dilation_rate: A single integer, specifying the dilation rate to use for - dilated convolution. Currently, specifying any `dilation_rate` value != 1 - is incompatible with specifying any stride value != 1. - activation: Activation function to use. If you don't specify anything, no - activation is applied (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - depthwise_initializer: Initializer for the depthwise kernel matrix (see - `keras.initializers`). If None, the default initializer - ('glorot_uniform') will be used. - bias_initializer: Initializer for the bias vector (see - `keras.initializers`). If None, the default initializer ('zeros') will be - used. - depthwise_regularizer: Regularizer function applied to the depthwise kernel - matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector (see - `keras.regularizers`). - activity_regularizer: Regularizer function applied to the output of the - layer (its 'activation') (see `keras.regularizers`). - depthwise_constraint: Constraint function applied to the depthwise kernel - matrix (see `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector (see - `keras.constraints`). - - Input shape: - 4D tensor with shape: `[batch_size, channels, rows, cols]` if - data_format='channels_first' - or 4D tensor with shape: `[batch_size, rows, cols, channels]` if - data_format='channels_last'. - - Output shape: - 4D tensor with shape: `[batch_size, channels * depth_multiplier, new_rows, - new_cols]` if `data_format='channels_first'` - or 4D tensor with shape: `[batch_size, - new_rows, new_cols, channels * depth_multiplier]` if - `data_format='channels_last'`. `rows` and `cols` values might have changed - due to padding. - - Returns: - A tensor of rank 4 representing - `activation(depthwiseconv2d(inputs, kernel) + bias)`. - - Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides` > 1 and `dilation_rate` > 1. 
- """ - - def __init__(self, - kernel_size, - strides=1, - padding='valid', - depth_multiplier=1, - data_format=None, - dilation_rate=1, - activation=None, - use_bias=True, - depthwise_initializer='glorot_uniform', - bias_initializer='zeros', - depthwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - 1, - kernel_size=kernel_size, - strides=strides, - padding=padding, - depth_multiplier=depth_multiplier, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - use_bias=use_bias, - depthwise_initializer=depthwise_initializer, - bias_initializer=bias_initializer, - depthwise_regularizer=depthwise_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - depthwise_constraint=depthwise_constraint, - bias_constraint=bias_constraint, - **kwargs) - - def call(self, inputs): - if self.data_format == 'channels_last': - strides = (1,) + self.strides * 2 + (1,) - spatial_start_dim = 1 - else: - strides = (1, 1) + self.strides * 2 - spatial_start_dim = 2 - inputs = tf.expand_dims(inputs, spatial_start_dim) - depthwise_kernel = tf.expand_dims(self.depthwise_kernel, axis=0) - dilation_rate = (1,) + self.dilation_rate - - outputs = tf.nn.depthwise_conv2d( - inputs, - depthwise_kernel, - strides=strides, - padding=self.padding.upper(), - dilations=dilation_rate, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - if self.use_bias: - outputs = tf.nn.bias_add( - outputs, - self.bias, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - outputs = tf.squeeze(outputs, [spatial_start_dim]) - - if self.activation is not None: - return self.activation(outputs) - - return outputs - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - rows = input_shape[2] - out_filters = input_shape[1] * self.depth_multiplier - elif self.data_format == 'channels_last': - rows = input_shape[1] - out_filters = input_shape[2] * self.depth_multiplier - - rows = conv_utils.conv_output_length(rows, self.kernel_size[0], - self.padding, self.strides[0], - self.dilation_rate[0]) - if self.data_format == 'channels_first': - return (input_shape[0], out_filters, rows) - elif self.data_format == 'channels_last': - return (input_shape[0], rows, out_filters) + """Depthwise 1D convolution. + + Depthwise convolution is a type of convolution in which each input channel + is convolved with a different kernel (called a depthwise kernel). You can + understand depthwise convolution as the first step in a depthwise separable + convolution. + + It is implemented via the following steps: + + - Split the input into individual channels. + - Convolve each channel with an individual depthwise kernel with + `depth_multiplier` output channels. + - Concatenate the convolved outputs along the channels axis. + + Unlike a regular 1D convolution, depthwise convolution does not mix + information across different input channels. + + The `depth_multiplier` argument determines how many filter are applied to + one input channel. As such, it controls the amount of output channels that + are generated per input channel in the depthwise step. + + Args: + kernel_size: An integer, specifying the height and width of the 1D + convolution window. Can be a single integer to specify the same value + for all spatial dimensions. 
+ strides: An integer, specifying the stride length of the convolution + along the single spatial dimension. Specifying any stride value != 1 is incompatible + with specifying any `dilation_rate` value != 1. + padding: one of `'valid'` or `'same'` (case-insensitive). `"valid"` means + no padding. `"same"` results in padding with zeros evenly to the + left/right of the input such that output has the same + length as the input. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `filters_in * depth_multiplier`. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch_size, length, + channels)` while `channels_first` corresponds to inputs with + shape `(batch_size, channels, length)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: A single integer, specifying the dilation rate to use for + dilated convolution. Currently, specifying any `dilation_rate` + value != 1 is incompatible with specifying any stride value != 1. + activation: Activation function to use. If you don't specify anything, no + activation is applied (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + depthwise_initializer: Initializer for the depthwise kernel matrix (see + `keras.initializers`). If None, the default initializer + ('glorot_uniform') will be used. + bias_initializer: Initializer for the bias vector (see + `keras.initializers`). If None, the default initializer ('zeros') will + be used. + depthwise_regularizer: Regularizer function applied to the depthwise + kernel matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector (see + `keras.regularizers`). + activity_regularizer: Regularizer function applied to the output of the + layer (its 'activation') (see `keras.regularizers`). + depthwise_constraint: Constraint function applied to the depthwise kernel + matrix (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector (see + `keras.constraints`). + + Input shape: + 3D tensor with shape: `[batch_size, channels, input_dim]` if + data_format='channels_first' + or 3D tensor with shape: `[batch_size, input_dim, channels]` if + data_format='channels_last'. + + Output shape: + 3D tensor with shape: + `[batch_size, channels * depth_multiplier, new_dims]` + if `data_format='channels_first'` + or 3D tensor with shape: `[batch_size, + new_dims, channels * depth_multiplier]` if + `data_format='channels_last'`. `new_dims` values might have + changed due to padding. + + Returns: + A tensor of rank 3 representing + `activation(depthwiseconv1d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1.
+ """ + + def __init__( + self, + kernel_size, + strides=1, + padding="valid", + depth_multiplier=1, + data_format=None, + dilation_rate=1, + activation=None, + use_bias=True, + depthwise_initializer="glorot_uniform", + bias_initializer="zeros", + depthwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + bias_constraint=None, + **kwargs + ): + super().__init__( + 1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + depth_multiplier=depth_multiplier, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + depthwise_initializer=depthwise_initializer, + bias_initializer=bias_initializer, + depthwise_regularizer=depthwise_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + depthwise_constraint=depthwise_constraint, + bias_constraint=bias_constraint, + **kwargs + ) + + def call(self, inputs): + if self.data_format == "channels_last": + strides = (1,) + self.strides * 2 + (1,) + spatial_start_dim = 1 + else: + strides = (1, 1) + self.strides * 2 + spatial_start_dim = 2 + inputs = tf.expand_dims(inputs, spatial_start_dim) + depthwise_kernel = tf.expand_dims(self.depthwise_kernel, axis=0) + dilation_rate = (1,) + self.dilation_rate + + outputs = tf.nn.depthwise_conv2d( + inputs, + depthwise_kernel, + strides=strides, + padding=self.padding.upper(), + dilations=dilation_rate, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, + self.bias, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + outputs = tf.squeeze(outputs, [spatial_start_dim]) + + if self.activation is not None: + return self.activation(outputs) + + return outputs + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if self.data_format == "channels_first": + input_dim = input_shape[2] + out_filters = input_shape[1] * self.depth_multiplier + elif self.data_format == "channels_last": + input_dim = input_shape[1] + out_filters = input_shape[2] * self.depth_multiplier + + input_dim = conv_utils.conv_output_length( + input_dim, + self.kernel_size[0], + self.padding, + self.strides[0], + self.dilation_rate[0], + ) + if self.data_format == "channels_first": + return (input_shape[0], out_filters, input_dim) + elif self.data_format == "channels_last": + return (input_shape[0], input_dim, out_filters) diff --git a/keras/layers/convolutional/depthwise_conv2d.py b/keras/layers/convolutional/depthwise_conv2d.py index 202eeeae1c8d..24edea729669 100644 --- a/keras/layers/convolutional/depthwise_conv2d.py +++ b/keras/layers/convolutional/depthwise_conv2d.py @@ -13,184 +13,197 @@ # limitations under the License. # ============================================================================== """Keras depthwise 2D convolution.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.layers.convolutional.base_depthwise_conv import DepthwiseConv from keras.utils import conv_utils from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.DepthwiseConv2D') +@keras_export("keras.layers.DepthwiseConv2D") class DepthwiseConv2D(DepthwiseConv): - """Depthwise 2D convolution. 
- - Depthwise convolution is a type of convolution in which each input channel is - convolved with a different kernel (called a depthwise kernel). You - can understand depthwise convolution as the first step in a depthwise - separable convolution. - - It is implemented via the following steps: - - - Split the input into individual channels. - - Convolve each channel with an individual depthwise kernel with - `depth_multiplier` output channels. - - Concatenate the convolved outputs along the channels axis. - - Unlike a regular 2D convolution, depthwise convolution does not mix - information across different input channels. - - The `depth_multiplier` argument determines how many filter are applied to one - input channel. As such, it controls the amount of output channels that are - generated per input channel in the depthwise step. - - Args: - kernel_size: An integer or tuple/list of 2 integers, specifying the height - and width of the 2D convolution window. Can be a single integer to specify - the same value for all spatial dimensions. - strides: An integer or tuple/list of 2 integers, specifying the strides of - the convolution along the height and width. Can be a single integer to - specify the same value for all spatial dimensions. Specifying any stride - value != 1 is incompatible with specifying any `dilation_rate` value != 1. - padding: one of `'valid'` or `'same'` (case-insensitive). `"valid"` means no - padding. `"same"` results in padding with zeros evenly to the left/right - or up/down of the input such that output has the same height/width - dimension as the input. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `filters_in * depth_multiplier`. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. `channels_last` corresponds - to inputs with shape `(batch_size, height, width, channels)` while - `channels_first` corresponds to inputs with shape `(batch_size, channels, - height, width)`. It defaults to the `image_data_format` value found in - your Keras config file at `~/.keras/keras.json`. If you never set it, then - it will be 'channels_last'. - dilation_rate: An integer or tuple/list of 2 integers, specifying the - dilation rate to use for dilated convolution. Currently, specifying any - `dilation_rate` value != 1 is incompatible with specifying any `strides` - value != 1. - activation: Activation function to use. If you don't specify anything, no - activation is applied (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - depthwise_initializer: Initializer for the depthwise kernel matrix (see - `keras.initializers`). If None, the default initializer - ('glorot_uniform') will be used. - bias_initializer: Initializer for the bias vector (see - `keras.initializers`). If None, the default initializer ('zeros') will be - used. - depthwise_regularizer: Regularizer function applied to the depthwise kernel - matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector (see - `keras.regularizers`). - activity_regularizer: Regularizer function applied to the output of the - layer (its 'activation') (see `keras.regularizers`). - depthwise_constraint: Constraint function applied to the depthwise kernel - matrix (see `keras.constraints`). 
- bias_constraint: Constraint function applied to the bias vector (see - `keras.constraints`). - - Input shape: - 4D tensor with shape: `[batch_size, channels, rows, cols]` if - data_format='channels_first' - or 4D tensor with shape: `[batch_size, rows, cols, channels]` if - data_format='channels_last'. - - Output shape: - 4D tensor with shape: `[batch_size, channels * depth_multiplier, new_rows, - new_cols]` if `data_format='channels_first'` - or 4D tensor with shape: `[batch_size, - new_rows, new_cols, channels * depth_multiplier]` if - `data_format='channels_last'`. `rows` and `cols` values might have changed - due to padding. - - Returns: - A tensor of rank 4 representing - `activation(depthwiseconv2d(inputs, kernel) + bias)`. - - Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides` > 1 and `dilation_rate` > 1. - """ - - def __init__(self, - kernel_size, - strides=(1, 1), - padding='valid', - depth_multiplier=1, - data_format=None, - dilation_rate=(1, 1), - activation=None, - use_bias=True, - depthwise_initializer='glorot_uniform', - bias_initializer='zeros', - depthwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - 2, - kernel_size=kernel_size, - strides=strides, - padding=padding, - depth_multiplier=depth_multiplier, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - use_bias=use_bias, - depthwise_initializer=depthwise_initializer, - bias_initializer=bias_initializer, - depthwise_regularizer=depthwise_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - depthwise_constraint=depthwise_constraint, - bias_constraint=bias_constraint, - **kwargs) - - def call(self, inputs): - outputs = backend.depthwise_conv2d( - inputs, - self.depthwise_kernel, - strides=self.strides, - padding=self.padding, - dilation_rate=self.dilation_rate, - data_format=self.data_format) - - if self.use_bias: - outputs = backend.bias_add( - outputs, - self.bias, - data_format=self.data_format) - - if self.activation is not None: - return self.activation(outputs) - - return outputs - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - rows = input_shape[2] - cols = input_shape[3] - out_filters = input_shape[1] * self.depth_multiplier - elif self.data_format == 'channels_last': - rows = input_shape[1] - cols = input_shape[2] - out_filters = input_shape[3] * self.depth_multiplier - - rows = conv_utils.conv_output_length(rows, self.kernel_size[0], - self.padding, - self.strides[0], - self.dilation_rate[0]) - cols = conv_utils.conv_output_length(cols, self.kernel_size[1], - self.padding, - self.strides[1], - self.dilation_rate[1]) - if self.data_format == 'channels_first': - return (input_shape[0], out_filters, rows, cols) - elif self.data_format == 'channels_last': - return (input_shape[0], rows, cols, out_filters) + """Depthwise 2D convolution. + + Depthwise convolution is a type of convolution in which each input channel + is convolved with a different kernel (called a depthwise kernel). You can + understand depthwise convolution as the first step in a depthwise separable + convolution. + + It is implemented via the following steps: + + - Split the input into individual channels. + - Convolve each channel with an individual depthwise kernel with + `depth_multiplier` output channels. 
+ - Concatenate the convolved outputs along the channels axis. + + Unlike a regular 2D convolution, depthwise convolution does not mix + information across different input channels. + + The `depth_multiplier` argument determines how many filters are applied to + one input channel. As such, it controls the number of output channels that + are generated per input channel in the depthwise step. + + Args: + kernel_size: An integer or tuple/list of 2 integers, specifying the height + and width of the 2D convolution window. Can be a single integer to + specify the same value for all spatial dimensions. + strides: An integer or tuple/list of 2 integers, specifying the strides of + the convolution along the height and width. Can be a single integer to + specify the same value for all spatial dimensions. Current + implementation only supports equal length strides in row and + column dimensions. Specifying any stride value != 1 is incompatible + with specifying any `dilation_rate` value != 1. + padding: one of `'valid'` or `'same'` (case-insensitive). `"valid"` means + no padding. `"same"` results in padding with zeros evenly to the + left/right or up/down of the input such that output has the same + height/width dimension as the input. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `filters_in * depth_multiplier`. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch_size, height, + width, channels)` while `channels_first` corresponds to inputs with + shape `(batch_size, channels, height, width)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: An integer or tuple/list of 2 integers, specifying the + dilation rate to use for dilated convolution. Currently, specifying any + `dilation_rate` value != 1 is incompatible with specifying any `strides` + value != 1. + activation: Activation function to use. If you don't specify anything, no + activation is applied (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + depthwise_initializer: Initializer for the depthwise kernel matrix (see + `keras.initializers`). If None, the default initializer + ('glorot_uniform') will be used. + bias_initializer: Initializer for the bias vector (see + `keras.initializers`). If None, the default initializer ('zeros') will + be used. + depthwise_regularizer: Regularizer function applied to the depthwise + kernel matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector (see + `keras.regularizers`). + activity_regularizer: Regularizer function applied to the output of the + layer (its 'activation') (see `keras.regularizers`). + depthwise_constraint: Constraint function applied to the depthwise kernel + matrix (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector (see + `keras.constraints`). + + Input shape: + 4D tensor with shape: `[batch_size, channels, rows, cols]` if + data_format='channels_first' + or 4D tensor with shape: `[batch_size, rows, cols, channels]` if + data_format='channels_last'.
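
For context on `depth_multiplier`: each input channel is filtered by its own `depth_multiplier` kernels, so the channel axis of the output grows to `channels_in * depth_multiplier`, exactly as the `compute_output_shape` method below computes. A quick sketch (assuming TF 2.x; not part of the diff):

import tensorflow as tf

layer = tf.keras.layers.DepthwiseConv2D(kernel_size=3, depth_multiplier=2)
y = layer(tf.zeros((1, 7, 6, 3)))
# Rows/cols shrink by kernel_size - 1 under the default "valid" padding;
# channels grow from 3 to 3 * depth_multiplier = 6.
print(y.shape)  # (1, 5, 4, 6)
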
+ + Output shape: + 4D tensor with shape: `[batch_size, channels * depth_multiplier, new_rows, + new_cols]` if `data_format='channels_first'` + or 4D tensor with shape: `[batch_size, + new_rows, new_cols, channels * depth_multiplier]` if + `data_format='channels_last'`. `rows` and `cols` values might have + changed due to padding. + + Returns: + A tensor of rank 4 representing + `activation(depthwiseconv2d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. + """ + + def __init__( + self, + kernel_size, + strides=(1, 1), + padding="valid", + depth_multiplier=1, + data_format=None, + dilation_rate=(1, 1), + activation=None, + use_bias=True, + depthwise_initializer="glorot_uniform", + bias_initializer="zeros", + depthwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + bias_constraint=None, + **kwargs + ): + super().__init__( + 2, + kernel_size=kernel_size, + strides=strides, + padding=padding, + depth_multiplier=depth_multiplier, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + depthwise_initializer=depthwise_initializer, + bias_initializer=bias_initializer, + depthwise_regularizer=depthwise_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + depthwise_constraint=depthwise_constraint, + bias_constraint=bias_constraint, + **kwargs + ) + + def call(self, inputs): + outputs = backend.depthwise_conv2d( + inputs, + self.depthwise_kernel, + strides=self.strides, + padding=self.padding, + dilation_rate=self.dilation_rate, + data_format=self.data_format, + ) + + if self.use_bias: + outputs = backend.bias_add( + outputs, self.bias, data_format=self.data_format + ) + + if self.activation is not None: + return self.activation(outputs) + + return outputs + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if self.data_format == "channels_first": + rows = input_shape[2] + cols = input_shape[3] + out_filters = input_shape[1] * self.depth_multiplier + elif self.data_format == "channels_last": + rows = input_shape[1] + cols = input_shape[2] + out_filters = input_shape[3] * self.depth_multiplier + + rows = conv_utils.conv_output_length( + rows, + self.kernel_size[0], + self.padding, + self.strides[0], + self.dilation_rate[0], + ) + cols = conv_utils.conv_output_length( + cols, + self.kernel_size[1], + self.padding, + self.strides[1], + self.dilation_rate[1], + ) + if self.data_format == "channels_first": + return (input_shape[0], out_filters, rows, cols) + elif self.data_format == "channels_last": + return (input_shape[0], rows, cols, out_filters) diff --git a/keras/layers/convolutional/depthwise_conv_test.py b/keras/layers/convolutional/depthwise_conv_test.py index e324ec40be20..dd8e58584970 100644 --- a/keras/layers/convolutional/depthwise_conv_test.py +++ b/keras/layers/convolutional/depthwise_conv_test.py @@ -14,124 +14,130 @@ # ============================================================================== """Tests for depthwise convolutional layers.""" +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class DepthwiseConv1DTest(test_combinations.TestCase): + def _run_test(self, kwargs, expected_output_shape=None): + num_samples 
= 2 + stack_size = 3 + num_row = 7 + + with self.cached_session(): + test_utils.layer_test( + keras.layers.DepthwiseConv1D, + kwargs=kwargs, + input_shape=(num_samples, num_row, stack_size), + expected_output_shape=expected_output_shape, + ) + + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}), + ("padding_same", {"padding": "same"}), + ("strides", {"strides": 2}), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + ("data_format", {"data_format": "channels_first"}), + ("depth_multiplier_1", {"depth_multiplier": 1}), + ("depth_multiplier_2", {"depth_multiplier": 2}), + ("dilation_rate", {"dilation_rate": 2}, (None, 3, 3)), + ) + def test_depthwise_conv1d(self, kwargs, expected_output_shape=None): + kwargs["kernel_size"] = 3 + if "data_format" not in kwargs or tf.test.is_gpu_available( + cuda_only=True + ): + self._run_test(kwargs, expected_output_shape) + + def test_depthwise_conv1d_full(self): + kwargs = { + "kernel_size": 3, + "padding": "valid", + "data_format": "channels_last", + "dilation_rate": 1, + "activation": None, + "depthwise_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "depthwise_constraint": "unit_norm", + "use_bias": True, + "strides": 2, + "depth_multiplier": 1, + } + self._run_test(kwargs) - def _run_test(self, kwargs, expected_output_shape=None): - num_samples = 2 - stack_size = 3 - num_row = 7 - - with self.cached_session(): - test_utils.layer_test( - keras.layers.DepthwiseConv1D, - kwargs=kwargs, - input_shape=(num_samples, num_row, stack_size), - expected_output_shape=expected_output_shape) - - @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }), - ('padding_same', { - 'padding': 'same' - }), - ('strides', { - 'strides': 2 - }), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. 
- ('data_format', { - 'data_format': 'channels_first' - }), - ('depth_multiplier_1', { - 'depth_multiplier': 1 - }), - ('depth_multiplier_2', { - 'depth_multiplier': 2 - }), - ('dilation_rate', { - 'dilation_rate': 2 - }, (None, 3, 3)), - ) - def test_depthwise_conv1d(self, kwargs, expected_output_shape=None): - kwargs['kernel_size'] = 3 - if 'data_format' not in kwargs or tf.test.is_gpu_available(cuda_only=True): - self._run_test(kwargs, expected_output_shape) - - def test_depthwise_conv1d_full(self): - kwargs = { - 'kernel_size': 3, - 'padding': 'valid', - 'data_format': 'channels_last', - 'dilation_rate': 1, - 'activation': None, - 'depthwise_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'depthwise_constraint': 'unit_norm', - 'use_bias': True, - 'strides': 2, - 'depth_multiplier': 1, - } - self._run_test(kwargs) + def test_depthwise_conv1d_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": 2, "dilation_rate": 2} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.DepthwiseConv1D(kernel_size=2, **kwargs) @test_combinations.run_all_keras_modes class DepthwiseConv2DTest(test_combinations.TestCase): + def _run_test(self, kwargs, expected_output_shape=None): + num_samples = 2 + stack_size = 3 + num_row = 7 + num_col = 6 + + with self.cached_session(): + test_utils.layer_test( + keras.layers.DepthwiseConv2D, + kwargs=kwargs, + input_shape=(num_samples, num_row, num_col, stack_size), + expected_output_shape=expected_output_shape, + ) + + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}), + ("padding_same", {"padding": "same"}), + ("strides", {"strides": (2, 2)}), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. 
+ ("data_format", {"data_format": "channels_first"}), + ("depth_multiplier_1", {"depth_multiplier": 1}), + ("depth_multiplier_2", {"depth_multiplier": 2}), + ("dilation_rate", {"dilation_rate": (2, 2)}, (None, 3, 2, 3)), + ) + def test_depthwise_conv2d(self, kwargs, expected_output_shape=None): + kwargs["kernel_size"] = (3, 3) + if "data_format" not in kwargs or tf.test.is_gpu_available( + cuda_only=True + ): + self._run_test(kwargs, expected_output_shape) + + def test_depthwise_conv2d_full(self): + kwargs = { + "kernel_size": 3, + "padding": "valid", + "data_format": "channels_last", + "dilation_rate": (1, 1), + "activation": None, + "depthwise_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "depthwise_constraint": "unit_norm", + "use_bias": True, + "strides": (2, 2), + "depth_multiplier": 1, + } + self._run_test(kwargs) + + def test_depthwise_conv2d_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": [2, 1], "dilation_rate": [2, 1]} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.DepthwiseConv2D(kernel_size=2, **kwargs) + - def _run_test(self, kwargs, expected_output_shape=None): - num_samples = 2 - stack_size = 3 - num_row = 7 - num_col = 6 - - with self.cached_session(): - test_utils.layer_test( - keras.layers.DepthwiseConv2D, - kwargs=kwargs, - input_shape=(num_samples, num_row, num_col, stack_size), - expected_output_shape=expected_output_shape) - - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}), - ('padding_same', {'padding': 'same'}), - ('strides', {'strides': (2, 2)}), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. - ('data_format', {'data_format': 'channels_first'}), - ('depth_multiplier_1', {'depth_multiplier': 1}), - ('depth_multiplier_2', {'depth_multiplier': 2}), - ('dilation_rate', {'dilation_rate': (2, 2)}, (None, 3, 2, 3)), - ) - def test_depthwise_conv2d(self, kwargs, expected_output_shape=None): - kwargs['kernel_size'] = (3, 3) - if 'data_format' not in kwargs or tf.test.is_gpu_available(cuda_only=True): - self._run_test(kwargs, expected_output_shape) - - def test_depthwise_conv2d_full(self): - kwargs = { - 'kernel_size': 3, - 'padding': 'valid', - 'data_format': 'channels_last', - 'dilation_rate': (1, 1), - 'activation': None, - 'depthwise_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'depthwise_constraint': 'unit_norm', - 'use_bias': True, - 'strides': (2, 2), - 'depth_multiplier': 1, - } - self._run_test(kwargs) - -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/convolutional/separable_conv1d.py b/keras/layers/convolutional/separable_conv1d.py index 2f070a3f54ad..46ade298d0ff 100644 --- a/keras/layers/convolutional/separable_conv1d.py +++ b/keras/layers/convolutional/separable_conv1d.py @@ -13,7 +13,9 @@ # limitations under the License. 
# ============================================================================== """Keras depthwise separable 1D convolution.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import activations from keras import constraints @@ -21,185 +23,199 @@ from keras import regularizers from keras.layers.convolutional.base_separable_conv import SeparableConv from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.SeparableConv1D', - 'keras.layers.SeparableConvolution1D') +@keras_export( + "keras.layers.SeparableConv1D", "keras.layers.SeparableConvolution1D" +) class SeparableConv1D(SeparableConv): - """Depthwise separable 1D convolution. - - This layer performs a depthwise convolution that acts separately on - channels, followed by a pointwise convolution that mixes channels. - If `use_bias` is True and a bias initializer is provided, - it adds a bias vector to the output. - It then optionally applies an activation function to produce the final output. - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A single integer specifying the spatial - dimensions of the filters. - strides: A single integer specifying the strides - of the convolution. - Specifying any `stride` value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. `"causal"` results in causal - (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch_size, channels, length)`. - dilation_rate: A single integer, specifying - the dilation rate to use for dilated convolution. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias. - depthwise_initializer: An initializer for the depthwise convolution kernel - (see `keras.initializers`). If None, then the default initializer - ('glorot_uniform') will be used. - pointwise_initializer: An initializer for the pointwise convolution kernel - (see `keras.initializers`). If None, then the default initializer - ('glorot_uniform') will be used. - bias_initializer: An initializer for the bias vector. If None, the default - initializer ('zeros') will be used (see `keras.initializers`). - depthwise_regularizer: Optional regularizer for the depthwise - convolution kernel (see `keras.regularizers`). - pointwise_regularizer: Optional regularizer for the pointwise - convolution kernel (see `keras.regularizers`). - bias_regularizer: Optional regularizer for the bias vector - (see `keras.regularizers`). 
- activity_regularizer: Optional regularizer function for the output - (see `keras.regularizers`). - depthwise_constraint: Optional projection function to be applied to the - depthwise kernel after being updated by an `Optimizer` (e.g. used for - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training - (see `keras.constraints`). - pointwise_constraint: Optional projection function to be applied to the - pointwise kernel after being updated by an `Optimizer` - (see `keras.constraints`). - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer` - (see `keras.constraints`). - trainable: Boolean, if `True` the weights of this layer will be marked as - trainable (and listed in `layer.trainable_weights`). - - Input shape: - 3D tensor with shape: - `(batch_size, channels, steps)` if data_format='channels_first' - or 3D tensor with shape: - `(batch_size, steps, channels)` if data_format='channels_last'. - - Output shape: - 3D tensor with shape: - `(batch_size, filters, new_steps)` if data_format='channels_first' - or 3D tensor with shape: - `(batch_size, new_steps, filters)` if data_format='channels_last'. - `new_steps` value might have changed due to padding or strides. - - Returns: - A tensor of rank 3 representing - `activation(separableconv1d(inputs, kernel) + bias)`. - """ - - def __init__(self, - filters, - kernel_size, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1, - depth_multiplier=1, - activation=None, - use_bias=True, - depthwise_initializer='glorot_uniform', - pointwise_initializer='glorot_uniform', - bias_initializer='zeros', - depthwise_regularizer=None, - pointwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - pointwise_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - rank=1, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - depth_multiplier=depth_multiplier, - activation=activations.get(activation), - use_bias=use_bias, - depthwise_initializer=initializers.get(depthwise_initializer), - pointwise_initializer=initializers.get(pointwise_initializer), - bias_initializer=initializers.get(bias_initializer), - depthwise_regularizer=regularizers.get(depthwise_regularizer), - pointwise_regularizer=regularizers.get(pointwise_regularizer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - depthwise_constraint=constraints.get(depthwise_constraint), - pointwise_constraint=constraints.get(pointwise_constraint), - bias_constraint=constraints.get(bias_constraint), - **kwargs) - - def call(self, inputs): - if self.padding == 'causal': - inputs = tf.pad(inputs, self._compute_causal_padding(inputs)) - if self.data_format == 'channels_last': - strides = (1,) + self.strides * 2 + (1,) - spatial_start_dim = 1 - else: - strides = (1, 1) + self.strides * 2 - spatial_start_dim = 2 - - # Explicitly broadcast inputs and kernels to 4D. - # TODO(fchollet): refactor when a native separable_conv1d op is available. 
- inputs = tf.expand_dims(inputs, spatial_start_dim) - depthwise_kernel = tf.expand_dims(self.depthwise_kernel, 0) - pointwise_kernel = tf.expand_dims(self.pointwise_kernel, 0) - dilation_rate = (1,) + self.dilation_rate - - if self.padding == 'causal': - op_padding = 'valid' - else: - op_padding = self.padding - outputs = tf.compat.v1.nn.separable_conv2d( - inputs, - depthwise_kernel, - pointwise_kernel, - strides=strides, - padding=op_padding.upper(), - rate=dilation_rate, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - if self.use_bias: - outputs = tf.nn.bias_add( - outputs, - self.bias, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - outputs = tf.squeeze(outputs, [spatial_start_dim]) - - if self.activation is not None: - return self.activation(outputs) - return outputs + """Depthwise separable 1D convolution. + + This layer performs a depthwise convolution that acts separately on + channels, followed by a pointwise convolution that mixes channels. + If `use_bias` is True and a bias initializer is provided, + it adds a bias vector to the output. + It then optionally applies an activation function to produce the final + output. + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A single integer specifying the spatial + dimensions of the filters. + strides: A single integer specifying the strides + of the convolution. + Specifying any `stride` value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding with zeros + evenly to the left/right or up/down of the input such that output has + the same height/width dimension as the input. `"causal"` results in + causal (dilated) convolutions, e.g. `output[t]` does not depend on + `input[t+1:]`. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch_size, channels, length)`. + dilation_rate: A single integer, specifying + the dilation rate to use for dilated convolution. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `num_filters_in * depth_multiplier`. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias. + depthwise_initializer: An initializer for the depthwise convolution kernel + (see `keras.initializers`). If None, then the default initializer + ('glorot_uniform') will be used. + pointwise_initializer: An initializer for the pointwise convolution kernel + (see `keras.initializers`). If None, then the default initializer + ('glorot_uniform') will be used. + bias_initializer: An initializer for the bias vector. If None, the default + initializer ('zeros') will be used (see `keras.initializers`). + depthwise_regularizer: Optional regularizer for the depthwise + convolution kernel (see `keras.regularizers`). + pointwise_regularizer: Optional regularizer for the pointwise + convolution kernel (see `keras.regularizers`). 
+ bias_regularizer: Optional regularizer for the bias vector + (see `keras.regularizers`). + activity_regularizer: Optional regularizer function for the output + (see `keras.regularizers`). + depthwise_constraint: Optional projection function to be applied to the + depthwise kernel after being updated by an `Optimizer` (e.g. used for + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training + (see `keras.constraints`). + pointwise_constraint: Optional projection function to be applied to the + pointwise kernel after being updated by an `Optimizer` + (see `keras.constraints`). + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer` + (see `keras.constraints`). + trainable: Boolean, if `True` the weights of this layer will be marked as + trainable (and listed in `layer.trainable_weights`). + + Input shape: + 3D tensor with shape: + `(batch_size, channels, steps)` if data_format='channels_first' + or 3D tensor with shape: + `(batch_size, steps, channels)` if data_format='channels_last'. + + Output shape: + 3D tensor with shape: + `(batch_size, filters, new_steps)` if data_format='channels_first' + or 3D tensor with shape: + `(batch_size, new_steps, filters)` if data_format='channels_last'. + `new_steps` value might have changed due to padding or strides. + + Returns: + A tensor of rank 3 representing + `activation(separableconv1d(inputs, kernel) + bias)`. + """ + + def __init__( + self, + filters, + kernel_size, + strides=1, + padding="valid", + data_format=None, + dilation_rate=1, + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer="glorot_uniform", + pointwise_initializer="glorot_uniform", + bias_initializer="zeros", + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + pointwise_constraint=None, + bias_constraint=None, + **kwargs + ): + super().__init__( + rank=1, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + depth_multiplier=depth_multiplier, + activation=activations.get(activation), + use_bias=use_bias, + depthwise_initializer=initializers.get(depthwise_initializer), + pointwise_initializer=initializers.get(pointwise_initializer), + bias_initializer=initializers.get(bias_initializer), + depthwise_regularizer=regularizers.get(depthwise_regularizer), + pointwise_regularizer=regularizers.get(pointwise_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + depthwise_constraint=constraints.get(depthwise_constraint), + pointwise_constraint=constraints.get(pointwise_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs + ) + + def call(self, inputs): + if self.padding == "causal": + inputs = tf.pad(inputs, self._compute_causal_padding(inputs)) + if self.data_format == "channels_last": + strides = (1,) + self.strides * 2 + (1,) + spatial_start_dim = 1 + else: + strides = (1, 1) + self.strides * 2 + spatial_start_dim = 2 + + # Explicitly broadcast inputs and kernels to 4D. + # TODO(fchollet): refactor when a native separable_conv1d op is + # available. 
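The broadcast below is the crux of `SeparableConv1D.call`: TensorFlow ships no native 1D separable-conv op, so the layer inserts a dummy height dimension and reuses `tf.nn.separable_conv2d` over a height-1 image. A standalone sketch of the same trick, with illustrative shapes and `channels_last` data:

    import tensorflow as tf

    batch, steps, in_ch, mult, filters, k = 2, 9, 3, 1, 4, 3
    x = tf.random.normal([batch, steps, in_ch])        # 1D input
    dw = tf.random.normal([k, in_ch, mult])            # 1D depthwise kernel
    pw = tf.random.normal([1, in_ch * mult, filters])  # 1D pointwise kernel

    # Broadcast everything to 4D and run the 2D op over a height-1 image.
    y = tf.nn.separable_conv2d(
        tf.expand_dims(x, 1),       # (batch, 1, steps, in_ch)
        tf.expand_dims(dw, 0),      # (1, k, in_ch, mult)
        tf.expand_dims(pw, 0),      # (1, 1, in_ch * mult, filters)
        strides=[1, 1, 1, 1],
        padding="VALID",
    )
    y = tf.squeeze(y, [1])          # back to (batch, new_steps, filters)
    print(y.shape)                  # (2, 7, 4)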
+ inputs = tf.expand_dims(inputs, spatial_start_dim) + depthwise_kernel = tf.expand_dims(self.depthwise_kernel, 0) + pointwise_kernel = tf.expand_dims(self.pointwise_kernel, 0) + dilation_rate = (1,) + self.dilation_rate + + if self.padding == "causal": + op_padding = "valid" + else: + op_padding = self.padding + outputs = tf.compat.v1.nn.separable_conv2d( + inputs, + depthwise_kernel, + pointwise_kernel, + strides=strides, + padding=op_padding.upper(), + rate=dilation_rate, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, + self.bias, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + outputs = tf.squeeze(outputs, [spatial_start_dim]) + + if self.activation is not None: + return self.activation(outputs) + return outputs + # Alias diff --git a/keras/layers/convolutional/separable_conv2d.py b/keras/layers/convolutional/separable_conv2d.py index 9f484d918a6d..18e9ad49555c 100644 --- a/keras/layers/convolutional/separable_conv2d.py +++ b/keras/layers/convolutional/separable_conv2d.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Keras depthwise separable 2D convolution.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import activations from keras import constraints @@ -21,180 +23,193 @@ from keras import regularizers from keras.layers.convolutional.base_separable_conv import SeparableConv from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.SeparableConv2D', - 'keras.layers.SeparableConvolution2D') +@keras_export( + "keras.layers.SeparableConv2D", "keras.layers.SeparableConvolution2D" +) class SeparableConv2D(SeparableConv): - """Depthwise separable 2D convolution. - - Separable convolutions consist of first performing - a depthwise spatial convolution - (which acts on each input channel separately) - followed by a pointwise convolution which mixes the resulting - output channels. The `depth_multiplier` argument controls how many - output channels are generated per input channel in the depthwise step. - - Intuitively, separable convolutions can be understood as - a way to factorize a convolution kernel into two smaller kernels, - or as an extreme version of an Inception block. - - Args: - filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). - kernel_size: An integer or tuple/list of 2 integers, specifying the - height and width of the 2D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the convolution along the height and width. - Can be a single integer to specify the same value for - all spatial dimensions. Current implementation only supports equal - length strides in the row and column dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding with zeros evenly - to the left/right or up/down of the input such that output has the same - height/width dimension as the input. 
- data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - dilation_rate: An integer or tuple/list of 2 integers, specifying - the dilation rate to use for dilated convolution. - depth_multiplier: The number of depthwise convolution output channels - for each input channel. - The total number of depthwise convolution output - channels will be equal to `filters_in * depth_multiplier`. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (see `keras.activations`). - use_bias: Boolean, whether the layer uses a bias vector. - depthwise_initializer: An initializer for the depthwise convolution kernel - (see `keras.initializers`). If None, then the default initializer - ('glorot_uniform') will be used. - pointwise_initializer: An initializer for the pointwise convolution kernel - (see `keras.initializers`). If None, then the default initializer - ('glorot_uniform') will be used. - bias_initializer: An initializer for the bias vector. If None, the default - initializer ('zeros') will be used (see `keras.initializers`). - depthwise_regularizer: Regularizer function applied to - the depthwise kernel matrix (see `keras.regularizers`). - pointwise_regularizer: Regularizer function applied to - the pointwise kernel matrix (see `keras.regularizers`). - bias_regularizer: Regularizer function applied to the bias vector - (see `keras.regularizers`). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation") - (see `keras.regularizers`). - depthwise_constraint: Constraint function applied to - the depthwise kernel matrix - (see `keras.constraints`). - pointwise_constraint: Constraint function applied to - the pointwise kernel matrix - (see `keras.constraints`). - bias_constraint: Constraint function applied to the bias vector - (see `keras.constraints`). - - Input shape: - 4D tensor with shape: - `(batch_size, channels, rows, cols)` if data_format='channels_first' - or 4D tensor with shape: - `(batch_size, rows, cols, channels)` if data_format='channels_last'. - - Output shape: - 4D tensor with shape: - `(batch_size, filters, new_rows, new_cols)` if data_format='channels_first' - or 4D tensor with shape: - `(batch_size, new_rows, new_cols, filters)` if data_format='channels_last'. - `rows` and `cols` values might have changed due to padding. - - Returns: - A tensor of rank 4 representing - `activation(separableconv2d(inputs, kernel) + bias)`. - - Raises: - ValueError: if `padding` is "causal". 
- """ - - def __init__(self, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1), - depth_multiplier=1, - activation=None, - use_bias=True, - depthwise_initializer='glorot_uniform', - pointwise_initializer='glorot_uniform', - bias_initializer='zeros', - depthwise_regularizer=None, - pointwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - pointwise_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - rank=2, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - depth_multiplier=depth_multiplier, - activation=activations.get(activation), - use_bias=use_bias, - depthwise_initializer=initializers.get(depthwise_initializer), - pointwise_initializer=initializers.get(pointwise_initializer), - bias_initializer=initializers.get(bias_initializer), - depthwise_regularizer=regularizers.get(depthwise_regularizer), - pointwise_regularizer=regularizers.get(pointwise_regularizer), - bias_regularizer=regularizers.get(bias_regularizer), - activity_regularizer=regularizers.get(activity_regularizer), - depthwise_constraint=constraints.get(depthwise_constraint), - pointwise_constraint=constraints.get(pointwise_constraint), - bias_constraint=constraints.get(bias_constraint), - **kwargs) - - def call(self, inputs): - # Apply the actual ops. - if self.data_format == 'channels_last': - strides = (1,) + self.strides + (1,) - else: - strides = (1, 1) + self.strides - outputs = tf.compat.v1.nn.separable_conv2d( - inputs, - self.depthwise_kernel, - self.pointwise_kernel, - strides=strides, - padding=self.padding.upper(), - rate=self.dilation_rate, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - if self.use_bias: - outputs = tf.nn.bias_add( - outputs, - self.bias, - data_format=conv_utils.convert_data_format(self.data_format, ndim=4)) - - if self.activation is not None: - return self.activation(outputs) - return outputs + """Depthwise separable 2D convolution. + + Separable convolutions consist of first performing + a depthwise spatial convolution + (which acts on each input channel separately) + followed by a pointwise convolution which mixes the resulting + output channels. The `depth_multiplier` argument controls how many + output channels are generated per input channel in the depthwise step. + + Intuitively, separable convolutions can be understood as + a way to factorize a convolution kernel into two smaller kernels, + or as an extreme version of an Inception block. + + Args: + filters: Integer, the dimensionality of the output space + (i.e. the number of output filters in the convolution). + kernel_size: An integer or tuple/list of 2 integers, specifying the + height and width of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the convolution along the height and width. + Can be a single integer to specify the same value for + all spatial dimensions. Current implementation only supports equal + length strides in the row and column dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. 
`"same"` results in padding with zeros + evenly to the left/right or up/down of the input such that output has + the same height/width dimension as the input. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch_size, channels, height, width)`. + When unspecified, uses `image_data_format` value found in your Keras + config file at `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: An integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + depth_multiplier: The number of depthwise convolution output channels + for each input channel. + The total number of depthwise convolution output + channels will be equal to `filters_in * depth_multiplier`. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (see `keras.activations`). + use_bias: Boolean, whether the layer uses a bias vector. + depthwise_initializer: An initializer for the depthwise convolution kernel + (see `keras.initializers`). If None, then the default initializer + ('glorot_uniform') will be used. + pointwise_initializer: An initializer for the pointwise convolution kernel + (see `keras.initializers`). If None, then the default initializer + ('glorot_uniform') will be used. + bias_initializer: An initializer for the bias vector. If None, the default + initializer ('zeros') will be used (see `keras.initializers`). + depthwise_regularizer: Regularizer function applied to + the depthwise kernel matrix (see `keras.regularizers`). + pointwise_regularizer: Regularizer function applied to + the pointwise kernel matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector + (see `keras.regularizers`). + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation") + (see `keras.regularizers`). + depthwise_constraint: Constraint function applied to + the depthwise kernel matrix + (see `keras.constraints`). + pointwise_constraint: Constraint function applied to + the pointwise kernel matrix + (see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector + (see `keras.constraints`). + + Input shape: + 4D tensor with shape: + `(batch_size, channels, rows, cols)` if data_format='channels_first' + or 4D tensor with shape: + `(batch_size, rows, cols, channels)` if data_format='channels_last'. + + Output shape: + 4D tensor with shape: + `(batch_size, filters, new_rows, new_cols)` if + data_format='channels_first' + or 4D tensor with shape: + `(batch_size, new_rows, new_cols, filters)` if + data_format='channels_last'. `rows` and `cols` values might have changed + due to padding. + + Returns: + A tensor of rank 4 representing + `activation(separableconv2d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". 
+ """ + + def __init__( + self, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1), + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer="glorot_uniform", + pointwise_initializer="glorot_uniform", + bias_initializer="zeros", + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + pointwise_constraint=None, + bias_constraint=None, + **kwargs + ): + super().__init__( + rank=2, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + depth_multiplier=depth_multiplier, + activation=activations.get(activation), + use_bias=use_bias, + depthwise_initializer=initializers.get(depthwise_initializer), + pointwise_initializer=initializers.get(pointwise_initializer), + bias_initializer=initializers.get(bias_initializer), + depthwise_regularizer=regularizers.get(depthwise_regularizer), + pointwise_regularizer=regularizers.get(pointwise_regularizer), + bias_regularizer=regularizers.get(bias_regularizer), + activity_regularizer=regularizers.get(activity_regularizer), + depthwise_constraint=constraints.get(depthwise_constraint), + pointwise_constraint=constraints.get(pointwise_constraint), + bias_constraint=constraints.get(bias_constraint), + **kwargs + ) + + def call(self, inputs): + # Apply the actual ops. + if self.data_format == "channels_last": + strides = (1,) + self.strides + (1,) + else: + strides = (1, 1) + self.strides + outputs = tf.nn.separable_conv2d( + inputs, + self.depthwise_kernel, + self.pointwise_kernel, + strides=strides, + padding=self.padding.upper(), + dilations=self.dilation_rate, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, + self.bias, + data_format=conv_utils.convert_data_format( + self.data_format, ndim=4 + ), + ) + + if self.activation is not None: + return self.activation(outputs) + return outputs + # Alias diff --git a/keras/layers/convolutional/separable_conv_test.py b/keras/layers/convolutional/separable_conv_test.py index 4f3340853d54..b4abfc1016bc 100644 --- a/keras/layers/convolutional/separable_conv_test.py +++ b/keras/layers/convolutional/separable_conv_test.py @@ -14,152 +14,170 @@ # ============================================================================== """Tests for separable convolutional layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class SeparableConv1DTest(test_combinations.TestCase): - - def _run_test(self, kwargs): - num_samples = 2 - stack_size = 3 - length = 7 - - with self.cached_session(): - test_utils.layer_test( - keras.layers.SeparableConv1D, - kwargs=kwargs, - input_shape=(num_samples, length, stack_size)) - - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}), - ('padding_same', {'padding': 'same'}), - ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}), - ('padding_causal', {'padding': 'causal'}), - ('strides', {'strides': 2}), - ('dilation_rate', {'dilation_rate': 2}), - ('depth_multiplier', {'depth_multiplier': 2}), - ) - def test_separable_conv1d(self, kwargs): - kwargs['filters'] = 2 - 
kwargs['kernel_size'] = 3 - self._run_test(kwargs) - - def test_separable_conv1d_regularizers(self): - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'depthwise_regularizer': 'l2', - 'pointwise_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.SeparableConv1D(**kwargs) - layer.build((None, 5, 2)) - self.assertEqual(len(layer.losses), 3) - layer(keras.backend.variable(np.ones((1, 5, 2)))) - self.assertEqual(len(layer.losses), 4) - - def test_separable_conv1d_constraints(self): - d_constraint = lambda x: x - p_constraint = lambda x: x - b_constraint = lambda x: x - - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'pointwise_constraint': p_constraint, - 'depthwise_constraint': d_constraint, - 'bias_constraint': b_constraint, - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.SeparableConv1D(**kwargs) - layer.build((None, 5, 2)) - self.assertEqual(layer.depthwise_kernel.constraint, d_constraint) - self.assertEqual(layer.pointwise_kernel.constraint, p_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) + def _run_test(self, kwargs): + num_samples = 2 + stack_size = 3 + length = 7 + + with self.cached_session(): + test_utils.layer_test( + keras.layers.SeparableConv1D, + kwargs=kwargs, + input_shape=(num_samples, length, stack_size), + ) + + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}), + ("padding_same", {"padding": "same"}), + ("padding_same_dilation_2", {"padding": "same", "dilation_rate": 2}), + ("padding_causal", {"padding": "causal"}), + ("strides", {"strides": 2}), + ("dilation_rate", {"dilation_rate": 2}), + ("depth_multiplier", {"depth_multiplier": 2}), + ) + def test_separable_conv1d(self, kwargs): + kwargs["filters"] = 2 + kwargs["kernel_size"] = 3 + self._run_test(kwargs) + + def test_separable_conv1d_regularizers(self): + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "depthwise_regularizer": "l2", + "pointwise_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.SeparableConv1D(**kwargs) + layer.build((None, 5, 2)) + self.assertEqual(len(layer.losses), 3) + layer(keras.backend.variable(np.ones((1, 5, 2)))) + self.assertEqual(len(layer.losses), 4) + + def test_separable_conv1d_constraints(self): + d_constraint = lambda x: x + p_constraint = lambda x: x + b_constraint = lambda x: x + + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "pointwise_constraint": p_constraint, + "depthwise_constraint": d_constraint, + "bias_constraint": b_constraint, + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.SeparableConv1D(**kwargs) + layer.build((None, 5, 2)) + self.assertEqual(layer.depthwise_kernel.constraint, d_constraint) + self.assertEqual(layer.pointwise_kernel.constraint, p_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) + + def test_separable_conv1d_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": 2, "dilation_rate": 2} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.SeparableConv1D(filters=1, kernel_size=2, **kwargs) @test_combinations.run_all_keras_modes class SeparableConv2DTest(test_combinations.TestCase): - - def _run_test(self, kwargs): - num_samples = 2 - stack_size = 3 - num_row = 7 - num_col = 6 - - 
with self.cached_session(): - test_utils.layer_test( - keras.layers.SeparableConv2D, - kwargs=kwargs, - input_shape=(num_samples, num_row, num_col, stack_size)) - - @parameterized.named_parameters( - ('padding_valid', {'padding': 'valid'}), - ('padding_same', {'padding': 'same'}), - ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}), - ('strides', {'strides': 2}), - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. - ('data_format', {'data_format': 'channels_first'}), - ('dilation_rate', {'dilation_rate': 2}), - ('depth_multiplier', {'depth_multiplier': 2}), - ) - def test_separable_conv2d(self, kwargs): - kwargs['filters'] = 2 - kwargs['kernel_size'] = 3 - if 'data_format' not in kwargs or tf.test.is_gpu_available(cuda_only=True): - self._run_test(kwargs) - - def test_separable_conv2d_regularizers(self): - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'depthwise_regularizer': 'l2', - 'pointwise_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.SeparableConv2D(**kwargs) - layer.build((None, 5, 5, 2)) - self.assertEqual(len(layer.losses), 3) - layer(keras.backend.variable(np.ones((1, 5, 5, 2)))) - self.assertEqual(len(layer.losses), 4) - - def test_separable_conv2d_constraints(self): - d_constraint = lambda x: x - p_constraint = lambda x: x - b_constraint = lambda x: x - - kwargs = { - 'filters': 3, - 'kernel_size': 3, - 'padding': 'valid', - 'pointwise_constraint': p_constraint, - 'depthwise_constraint': d_constraint, - 'bias_constraint': b_constraint, - 'strides': 1 - } - with self.cached_session(): - layer = keras.layers.SeparableConv2D(**kwargs) - layer.build((None, 5, 5, 2)) - self.assertEqual(layer.depthwise_kernel.constraint, d_constraint) - self.assertEqual(layer.pointwise_kernel.constraint, p_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) - -if __name__ == '__main__': - tf.test.main() + def _run_test(self, kwargs): + num_samples = 2 + stack_size = 3 + num_row = 7 + num_col = 6 + + with self.cached_session(): + test_utils.layer_test( + keras.layers.SeparableConv2D, + kwargs=kwargs, + input_shape=(num_samples, num_row, num_col, stack_size), + ) + + @parameterized.named_parameters( + ("padding_valid", {"padding": "valid"}), + ("padding_same", {"padding": "same"}), + ("padding_same_dilation_2", {"padding": "same", "dilation_rate": 2}), + ("strides", {"strides": 2}), + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. 
+ ("data_format", {"data_format": "channels_first"}), + ("dilation_rate", {"dilation_rate": 2}), + ("depth_multiplier", {"depth_multiplier": 2}), + ) + def test_separable_conv2d(self, kwargs): + kwargs["filters"] = 2 + kwargs["kernel_size"] = 3 + if "data_format" not in kwargs or tf.test.is_gpu_available( + cuda_only=True + ): + self._run_test(kwargs) + + def test_separable_conv2d_regularizers(self): + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "depthwise_regularizer": "l2", + "pointwise_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.SeparableConv2D(**kwargs) + layer.build((None, 5, 5, 2)) + self.assertEqual(len(layer.losses), 3) + layer(keras.backend.variable(np.ones((1, 5, 5, 2)))) + self.assertEqual(len(layer.losses), 4) + + def test_separable_conv2d_constraints(self): + d_constraint = lambda x: x + p_constraint = lambda x: x + b_constraint = lambda x: x + + kwargs = { + "filters": 3, + "kernel_size": 3, + "padding": "valid", + "pointwise_constraint": p_constraint, + "depthwise_constraint": d_constraint, + "bias_constraint": b_constraint, + "strides": 1, + } + with self.cached_session(): + layer = keras.layers.SeparableConv2D(**kwargs) + layer.build((None, 5, 5, 2)) + self.assertEqual(layer.depthwise_kernel.constraint, d_constraint) + self.assertEqual(layer.pointwise_kernel.constraint, p_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) + + def test_separable_conv2d_invalid_strides_and_dilation_rate(self): + kwargs = {"strides": [2, 1], "dilation_rate": [2, 1]} + with self.assertRaisesRegex( + ValueError, r"""`strides > 1` not supported in conjunction""" + ): + keras.layers.SeparableConv2D(filters=1, kernel_size=2, **kwargs) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/core/BUILD b/keras/layers/core/BUILD index 4439c2f6710a..2148cac8fe47 100644 --- a/keras/layers/core/BUILD +++ b/keras/layers/core/BUILD @@ -1,3 +1,5 @@ +# Placeholder: load unaliased py_library + # Description: # Contains the Keras core layers. 
load("@org_keras//keras:keras.bzl", "cuda_py_test") @@ -6,13 +8,14 @@ load("@org_keras//keras:keras.bzl", "cuda_py_test") load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/py/tensorflow_gnn:__subpackages__", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", "//third_party/tensorflow/python/keras:__subpackages__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", ], @@ -30,6 +33,7 @@ py_library( ":dense", ":einsum_dense", ":embedding", + ":identity", ":lambda", ":masking", ":tf_op_layer", @@ -128,6 +132,16 @@ py_library( ], ) +py_library( + name = "identity", + srcs = ["identity.py"], + srcs_version = "PY3", + deps = [ + "//:expect_tensorflow_installed", + "//keras/engine:base_layer", + ], +) + tf_py_test( name = "core_test", size = "medium", diff --git a/keras/layers/core/__init__.py b/keras/layers/core/__init__.py index 89d9a7eb5272..21d3c6ab52db 100644 --- a/keras/layers/core/__init__.py +++ b/keras/layers/core/__init__.py @@ -18,20 +18,23 @@ from keras.layers.core.dense import Dense from keras.layers.core.einsum_dense import EinsumDense from keras.layers.core.embedding import Embedding +from keras.layers.core.identity import Identity from keras.layers.core.lambda_layer import Lambda from keras.layers.core.masking import Masking -# Required by third_party/py/tensorflow_gnn/graph/keras/keras_tensors.py -from keras.layers.core.tf_op_layer import _delegate_method -from keras.layers.core.tf_op_layer import _delegate_property + +# Required by third_party/py/tensorflow_gnn/keras/keras_tensors.py from keras.layers.core.tf_op_layer import ClassMethod from keras.layers.core.tf_op_layer import InstanceMethod from keras.layers.core.tf_op_layer import InstanceProperty - from keras.layers.core.tf_op_layer import SlicingOpLambda from keras.layers.core.tf_op_layer import TFOpLambda +from keras.layers.core.tf_op_layer import _delegate_method +from keras.layers.core.tf_op_layer import _delegate_property # Regularization layers imported for backwards namespace compatibility -from keras.layers.regularization.activity_regularization import ActivityRegularization +from keras.layers.regularization.activity_regularization import ( + ActivityRegularization, +) from keras.layers.regularization.dropout import Dropout from keras.layers.regularization.spatial_dropout1d import SpatialDropout1D from keras.layers.regularization.spatial_dropout2d import SpatialDropout2D diff --git a/keras/layers/core/activation.py b/keras/layers/core/activation.py index d953e208a4f7..9cfaade39a33 100644 --- a/keras/layers/core/activation.py +++ b/keras/layers/core/activation.py @@ -13,54 +13,55 @@ # limitations under the License. # ============================================================================== """Contains the Activation layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import activations from keras.engine.base_layer import Layer + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Activation') +@keras_export("keras.layers.Activation") class Activation(Layer): - """Applies an activation function to an output. 
- - Args: - activation: Activation function, such as `tf.nn.relu`, or string name of - built-in activation function, such as "relu". + """Applies an activation function to an output. - Usage: + Args: + activation: Activation function, such as `tf.nn.relu`, or string name of + built-in activation function, such as "relu". - >>> layer = tf.keras.layers.Activation('relu') - >>> output = layer([-3.0, -1.0, 0.0, 2.0]) - >>> list(output.numpy()) - [0.0, 0.0, 0.0, 2.0] - >>> layer = tf.keras.layers.Activation(tf.nn.relu) - >>> output = layer([-3.0, -1.0, 0.0, 2.0]) - >>> list(output.numpy()) - [0.0, 0.0, 0.0, 2.0] + Usage: - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the batch axis) - when using this layer as the first layer in a model. + >>> layer = tf.keras.layers.Activation('relu') + >>> output = layer([-3.0, -1.0, 0.0, 2.0]) + >>> list(output.numpy()) + [0.0, 0.0, 0.0, 2.0] + >>> layer = tf.keras.layers.Activation(tf.nn.relu) + >>> output = layer([-3.0, -1.0, 0.0, 2.0]) + >>> list(output.numpy()) + [0.0, 0.0, 0.0, 2.0] - Output shape: - Same shape as input. - """ + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the batch axis) + when using this layer as the first layer in a model. - def __init__(self, activation, **kwargs): - super().__init__(**kwargs) - self.supports_masking = True - self.activation = activations.get(activation) + Output shape: + Same shape as input. + """ - def call(self, inputs): - return self.activation(inputs) + def __init__(self, activation, **kwargs): + super().__init__(**kwargs) + self.supports_masking = True + self.activation = activations.get(activation) - def compute_output_shape(self, input_shape): - return input_shape + def call(self, inputs): + return self.activation(inputs) - def get_config(self): - config = {'activation': activations.serialize(self.activation)} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def compute_output_shape(self, input_shape): + return input_shape + def get_config(self): + config = {"activation": activations.serialize(self.activation)} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/core/core_test.py b/keras/layers/core/core_test.py index 0f04bd7f28bf..345eb9e33c20 100644 --- a/keras/layers/core/core_test.py +++ b/keras/layers/core/core_test.py @@ -17,629 +17,697 @@ import os import textwrap +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras import initializers from keras.layers import core from keras.mixed_precision import policy +from keras.saving.serialization_lib import SafeModeScope from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np - -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class DropoutLayersTest(test_combinations.TestCase): - - def test_dropout(self): - test_utils.layer_test( - keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2)) - - test_utils.layer_test( - keras.layers.Dropout, - kwargs={ - 'rate': 0.5, - 'noise_shape': [3, 1] - }, - input_shape=(3, 2)) - - def test_dropout_supports_masking(self): - dropout = keras.layers.Dropout(0.5) - self.assertEqual(True, dropout.supports_masking) - - def test_spatial_dropout_1d(self): - test_utils.layer_test( - keras.layers.SpatialDropout1D, - kwargs={'rate': 0.5}, - input_shape=(2, 3, 4)) - - def 
test_spatial_dropout_2d(self): - test_utils.layer_test( - keras.layers.SpatialDropout2D, - kwargs={'rate': 0.5}, - input_shape=(2, 3, 4, 5)) - - test_utils.layer_test( - keras.layers.SpatialDropout2D, - kwargs={ - 'rate': 0.5, - 'data_format': 'channels_first' - }, - input_shape=(2, 3, 4, 5)) - - def test_spatial_dropout_3d(self): - test_utils.layer_test( - keras.layers.SpatialDropout3D, - kwargs={'rate': 0.5}, - input_shape=(2, 3, 4, 4, 5)) - - test_utils.layer_test( - keras.layers.SpatialDropout3D, - kwargs={ - 'rate': 0.5, - 'data_format': 'channels_first' - }, - input_shape=(2, 3, 4, 4, 5)) - - def test_dropout_partial_noise_shape(self): - inputs = keras.Input(shape=(5, 10)) - layer = keras.layers.Dropout(0.5, noise_shape=(None, 1, None)) - outputs = layer(inputs) - model = keras.Model(inputs, outputs) - out = model(np.ones((20, 5, 10)), training=True) - out_np = keras.backend.get_value(out) - # Test that dropout mask is shared across second dim. - self.assertAllClose(out_np[:, 0, :], out_np[:, 1, :]) - - def test_dropout_with_savemodel(self): - inputs = keras.Input(shape=(5, 10)) - layer = keras.layers.Dropout(0.5, force_generator=True) - outputs = layer(inputs) - model = keras.Model(inputs, outputs) - train = model(np.ones((20, 5, 10)), training=True) - predict = model(np.ones((20, 5, 10))) - # Make sure the weights from tf.random.Generator is not present in the model - # which will cause weight loading issue for existing application models if - # it contains dropout layer. - self.assertEmpty(layer.get_weights()) - self.assertEmpty(model.get_weights()) - - # Make sure the layer does dropout value when training - self.assertNotAllClose(train, predict) - - model.save(os.path.join(self.get_temp_dir(), 'savedmodel'), - save_format='tf') - loaded_model = keras.models.load_model( - os.path.join(self.get_temp_dir(), 'savedmodel')) - predict2 = loaded_model(np.ones((20, 5, 10))) - - self.assertAllClose(predict, predict2) - # Make sure the model dropout different value after loading - train2 = loaded_model(np.ones((20, 5, 10)), training=True) - self.assertNotAllClose(train, train2) - self.assertIsNotNone(loaded_model.layers[1]._random_generator) - - # Also make sure the checkpoint doesn't contain any variable from the - # dropout layer, to keep the backward compatibility. 
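The comment above (and the `checkpoint` subtest that replaces this code further down) relies on `tf.train.list_variables` to prove the dropout layer leaves no state behind in a checkpoint. The same scan as a standalone sketch; the model and directory name are illustrative:

    import os
    import tensorflow as tf
    import keras

    # Build any model containing a Dropout layer, checkpoint it, then scan
    # the stored variable names for dropout state.
    model = keras.Sequential(
        [keras.layers.Dense(4), keras.layers.Dropout(0.5)]
    )
    model.build((None, 8))

    os.makedirs("tmp", exist_ok=True)
    ckpt = tf.train.Checkpoint(model)
    path = ckpt.save(os.path.join("tmp", "ckpt"))
    names = [name for name, _ in tf.train.list_variables(path)]
    assert not any("dropout" in name for name in names), names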
-    checkpoint = tf.train.Checkpoint(model)
-    save_path = checkpoint.save(os.path.join(self.get_temp_dir(), 'checkpoint'))
-    checkpoint_var_names = [name_value_tuple[0] for name_value_tuple in
-                            tf.train.list_variables(save_path)]
-    for name in checkpoint_var_names:
-      self.assertNotIn('dropout', name)
+    def test_dropout(self):
+        test_utils.layer_test(
+            keras.layers.Dropout, kwargs={"rate": 0.5}, input_shape=(3, 2)
+        )
+
+        test_utils.layer_test(
+            keras.layers.Dropout,
+            kwargs={"rate": 0.5, "noise_shape": [3, 1]},
+            input_shape=(3, 2),
+        )
+
+    def test_dropout_supports_masking(self):
+        dropout = keras.layers.Dropout(0.5)
+        self.assertEqual(True, dropout.supports_masking)
+
+    def test_spatial_dropout_1d(self):
+        test_utils.layer_test(
+            keras.layers.SpatialDropout1D,
+            kwargs={"rate": 0.5},
+            input_shape=(2, 3, 4),
+        )
+
+    def test_spatial_dropout_2d(self):
+        test_utils.layer_test(
+            keras.layers.SpatialDropout2D,
+            kwargs={"rate": 0.5},
+            input_shape=(2, 3, 4, 5),
+        )
+
+        test_utils.layer_test(
+            keras.layers.SpatialDropout2D,
+            kwargs={"rate": 0.5, "data_format": "channels_first"},
+            input_shape=(2, 3, 4, 5),
+        )
+
+    def test_spatial_dropout_3d(self):
+        test_utils.layer_test(
+            keras.layers.SpatialDropout3D,
+            kwargs={"rate": 0.5},
+            input_shape=(2, 3, 4, 4, 5),
+        )
+
+        test_utils.layer_test(
+            keras.layers.SpatialDropout3D,
+            kwargs={"rate": 0.5, "data_format": "channels_first"},
+            input_shape=(2, 3, 4, 4, 5),
+        )
+
+    def test_dropout_partial_noise_shape(self):
+        inputs = keras.Input(shape=(5, 10))
+        layer = keras.layers.Dropout(0.5, noise_shape=(None, 1, None))
+        outputs = layer(inputs)
+        model = keras.Model(inputs, outputs)
+        out = model(np.ones((20, 5, 10)), training=True)
+        out_np = keras.backend.get_value(out)
+        # Test that dropout mask is shared across second dim.
+        self.assertAllClose(out_np[:, 0, :], out_np[:, 1, :])
+
+    def test_dropout_with_saving(self):
+        inputs = keras.Input(shape=(5, 10))
+        layer = keras.layers.Dropout(0.5, force_generator=True)
+        outputs = layer(inputs)
+        model = keras.Model(inputs, outputs)
+        train = model(np.ones((20, 5, 10)), training=True)
+        predict = model(np.ones((20, 5, 10)))
+        # Make sure the weights from tf.random.Generator are not present in
+        # the model, which would cause weight-loading issues for existing
+        # application models if they contain a dropout layer.
+        self.assertEmpty(layer.get_weights())
+        self.assertEmpty(model.get_weights())
+
+        # Make sure the layer applies dropout when training.
+        self.assertNotAllClose(train, predict)
+
+        with self.subTest("savedmodel"):
+            model.save(
+                os.path.join(self.get_temp_dir(), "savedmodel"),
+                save_format="tf",
+            )
+            loaded_model = keras.models.load_model(
+                os.path.join(self.get_temp_dir(), "savedmodel")
+            )
+            predict2 = loaded_model(np.ones((20, 5, 10)))
+
+            self.assertAllClose(predict, predict2)
+            # Make sure the model applies different dropout values after
+            # loading.
+            train2 = loaded_model(np.ones((20, 5, 10)), training=True)
+            self.assertNotAllClose(train, train2)
+            self.assertIsNotNone(loaded_model.layers[1]._random_generator)
+
+        with self.subTest("keras_v3"):
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+ ) + model.save(os.path.join(self.get_temp_dir(), "model.keras")) + loaded_model = keras.models.load_model( + os.path.join(self.get_temp_dir(), "model.keras") + ) + predict2 = loaded_model(np.ones((20, 5, 10))) + + self.assertAllClose(predict, predict2) + # Make sure the model dropout different value after loading + train2 = loaded_model(np.ones((20, 5, 10)), training=True) + self.assertNotAllClose(train, train2) + self.assertIsNotNone(loaded_model.layers[1]._random_generator) + + with self.subTest("checkpoint"): + # Also make sure the checkpoint doesn't contain any variable from + # the dropout layer, to keep the backward compatibility. + checkpoint = tf.train.Checkpoint(model) + save_path = checkpoint.save( + os.path.join(self.get_temp_dir(), "checkpoint") + ) + checkpoint_var_names = [ + name_value_tuple[0] + for name_value_tuple in tf.train.list_variables(save_path) + ] + for name in checkpoint_var_names: + self.assertNotIn("dropout", name) @test_combinations.run_all_keras_modes class LambdaLayerTest(test_combinations.TestCase): - - def test_lambda(self): - test_utils.layer_test( - keras.layers.Lambda, - kwargs={'function': lambda x: x + 1}, - input_shape=(3, 2)) - - test_utils.layer_test( - keras.layers.Lambda, - kwargs={ - 'function': lambda x, a, b: x * a + b, - 'arguments': { - 'a': 0.6, - 'b': 0.4 - } - }, - input_shape=(3, 2)) - - # test serialization with function - def f(x): - return x + 1 - - ld = keras.layers.Lambda(f) - config = ld.get_config() - ld = keras.layers.deserialize({'class_name': 'Lambda', 'config': config}) - self.assertEqual(ld.function(3), 4) - - # test with lambda - ld = keras.layers.Lambda( - lambda x: keras.backend.concatenate([tf.square(x), x])) - config = ld.get_config() - ld = keras.layers.Lambda.from_config(config) - self.assertAllEqual(self.evaluate(ld.function([3])), [9, 3]) - - def test_lambda_multiple_inputs(self): - ld = keras.layers.Lambda(lambda x: x[0], output_shape=lambda x: x[0]) - x1 = np.ones([3, 2], np.float32) - x2 = np.ones([3, 5], np.float32) - out = ld([x1, x2]) - self.assertAllEqual(out.shape, [3, 2]) - - def test_lambda_output_shape(self): - l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1)) - l(keras.backend.variable(np.ones((1, 1)))) - self.assertEqual((1, 1), l.get_config()['output_shape']) - - def test_lambda_output_shape_function(self): - - def get_output_shape(input_shape): - return 1 * input_shape - - l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape) - l(keras.backend.variable(np.ones((1, 1)))) - self.assertEqual('lambda', l.get_config()['output_shape_type']) - - def test_lambda_output_shape_autocalculate_multiple_inputs(self): - - def lambda_fn(x): - return tf.matmul(x[0], x[1]) - - l = keras.layers.Lambda(lambda_fn, dtype=tf.float64) - output_shape = l.compute_output_shape([(10, 10), (10, 20)]) - self.assertAllEqual((10, 20), output_shape) - output_signature = l.compute_output_signature([ - tf.TensorSpec(dtype=tf.float64, shape=(10, 10)), - tf.TensorSpec(dtype=tf.float64, shape=(10, 20)) - ]) - self.assertAllEqual((10, 20), output_signature.shape) - self.assertAllEqual(tf.float64, output_signature.dtype) - - def test_lambda_output_shape_list_multiple_outputs(self): - - def lambda_fn(x): - return x - - l = keras.layers.Lambda(lambda_fn, output_shape=[(10,), (20,)]) - output_shape = l.compute_output_shape([(10, 10), (10, 20)]) - self.assertAllEqual([(10, 10), (10, 20)], output_shape) - - def test_lambda_output_shape_tuple_with_none(self): - - def lambda_fn(x): - return x - - l = 
keras.layers.Lambda(lambda_fn, output_shape=(None, 10)) - output_shape = l.compute_output_shape((5, 10, 20)) - self.assertAllEqual([5, None, 10], output_shape.as_list()) - - def test_lambda_output_shape_function_multiple_outputs(self): - - def lambda_fn(x): - return x - - def output_shape_fn(input_shape): - return input_shape - - l = keras.layers.Lambda(lambda_fn, output_shape=output_shape_fn) - output_shape = l.compute_output_shape([(10, 10), (10, 20)]) - self.assertAllEqual([(10, 10), (10, 20)], output_shape) - - def test_lambda_output_shape_nested(self): - - def lambda_fn(inputs): - return (inputs[1]['a'], {'b': inputs[0]}) - - l = keras.layers.Lambda(lambda_fn) - output_shape = l.compute_output_shape(((10, 20), {'a': (10, 5)})) - self.assertAllEqual(((10, 5), {'b': (10, 20)}), output_shape) - - def test_lambda_config_serialization(self): - # Test serialization with output_shape and output_shape_type - layer = keras.layers.Lambda( - lambda x: x + 1, output_shape=(1, 1), mask=lambda i, m: m) - layer(keras.backend.variable(np.ones((1, 1)))) - config = layer.get_config() - - layer = keras.layers.deserialize({'class_name': 'Lambda', 'config': config}) - self.assertAllEqual(layer.function(1), 2) - self.assertAllEqual(layer._output_shape, (1, 1)) - self.assertAllEqual(layer.mask(1, True), True) - - layer = keras.layers.Lambda.from_config(config) - self.assertAllEqual(layer.function(1), 2) - self.assertAllEqual(layer._output_shape, (1, 1)) - self.assertAllEqual(layer.mask(1, True), True) - - def test_lambda_with_training_arg(self): - - def fn(x, training=True): - return keras.backend.in_train_phase(x, 2 * x, training=training) - - layer = keras.layers.Lambda(fn) - x = keras.backend.ones(()) - train_out = layer(x, training=True) - eval_out = layer(x, training=False) - - self.assertEqual(keras.backend.get_value(train_out), 1.) - self.assertEqual(keras.backend.get_value(eval_out), 2.) 
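The training-arg test above relies on `Lambda` forwarding `training` to any wrapped function that accepts it, with `backend.in_train_phase` selecting between the two branches. A minimal sketch of that pattern outside the test harness (values mirror the test's expectations):

    import numpy as np
    import keras

    # A Lambda whose function branches on `training`: identity when
    # training, doubled otherwise.
    layer = keras.layers.Lambda(
        lambda x, training=True: keras.backend.in_train_phase(
            x, 2 * x, training=training
        )
    )
    x = np.ones((1,), dtype="float32")
    print(layer(x, training=True))   # [1.]
    print(layer(x, training=False))  # [2.]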
- - def test_lambda_with_mask(self): - - def add_one(inputs): - return inputs + 1.0 - - def mask(unused_inputs, previous_mask): - return previous_mask - - layer = keras.layers.Lambda(add_one, mask=mask) - x = np.ones([5, 4, 3]) - x[:, -1, :] = 0 - masking = keras.layers.Masking() - out = layer(masking(x)) - - expected_out = np.full([5, 4, 3], 2.0) - expected_out[:, -1, :] = 1.0 - expected_mask = np.ones([5, 4]) - expected_mask[:, -1] = 0.0 - - self.assertAllClose(self.evaluate(out), expected_out) - self.assertIsNotNone(out._keras_mask) - self.assertAllClose(self.evaluate(out._keras_mask), expected_mask) - - def test_lambda_with_ragged_input(self): - - def add_one(inputs): - return inputs + 1.0 - - layer = keras.layers.Lambda(add_one) - - ragged_input = tf.ragged.constant([[1.0], [2.0, 3.0]]) - out = layer(ragged_input) - expected_out = tf.ragged.constant([[2.0], [3.0, 4.0]]) - self.assertAllClose(out, expected_out) - - def test_lambda_deserialization_does_not_pollute_core(self): - layer = keras.layers.Lambda(lambda x: x + 1) - config = layer.get_config() - keras.layers.Lambda.from_config(config) - self.assertNotIn(self.__class__.__name__, dir(core)) + def test_lambda(self): + with SafeModeScope(safe_mode=False): + test_utils.layer_test( + keras.layers.Lambda, + kwargs={"function": lambda x: x + 1}, + input_shape=(3, 2), + ) + + test_utils.layer_test( + keras.layers.Lambda, + kwargs={ + "function": lambda x, a, b: x * a + b, + "arguments": {"a": 0.6, "b": 0.4}, + }, + input_shape=(3, 2), + ) + + # test serialization with function + def f(x): + return x + 1 + + ld = keras.layers.Lambda(f) + config = ld.get_config() + with SafeModeScope(safe_mode=False): + ld = keras.layers.deserialize( + {"class_name": "Lambda", "config": config} + ) + self.assertEqual(ld.function(3), 4) + + # test with lambda + ld = keras.layers.Lambda( + lambda x: keras.backend.concatenate([tf.square(x), x]) + ) + config = ld.get_config() + ld = keras.layers.Lambda.from_config(config) + self.assertAllEqual(self.evaluate(ld.function([3])), [9, 3]) + + def test_lambda_multiple_inputs(self): + ld = keras.layers.Lambda(lambda x: x[0], output_shape=lambda x: x[0]) + x1 = np.ones([3, 2], np.float32) + x2 = np.ones([3, 5], np.float32) + out = ld([x1, x2]) + self.assertAllEqual(out.shape, [3, 2]) + + def test_lambda_output_shape(self): + l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1)) + l(keras.backend.variable(np.ones((1, 1)))) + self.assertEqual((1, 1), l.get_config()["output_shape"]) + + def test_lambda_output_shape_function(self): + def get_output_shape(input_shape): + return 1 * input_shape + + l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape) + l(keras.backend.variable(np.ones((1, 1)))) + self.assertEqual("lambda", l.get_config()["output_shape_type"]) + + def test_lambda_output_shape_autocalculate_multiple_inputs(self): + def lambda_fn(x): + return tf.matmul(x[0], x[1]) + + l = keras.layers.Lambda(lambda_fn, dtype=tf.float64) + output_shape = l.compute_output_shape([(10, 10), (10, 20)]) + self.assertAllEqual((10, 20), output_shape) + output_signature = l.compute_output_signature( + [ + tf.TensorSpec(dtype=tf.float64, shape=(10, 10)), + tf.TensorSpec(dtype=tf.float64, shape=(10, 20)), + ] + ) + self.assertAllEqual((10, 20), output_signature.shape) + self.assertAllEqual(tf.float64, output_signature.dtype) + + def test_lambda_output_shape_list_multiple_outputs(self): + def lambda_fn(x): + return x + + l = keras.layers.Lambda(lambda_fn, output_shape=[(10,), (20,)]) + output_shape = 
l.compute_output_shape([(10, 10), (10, 20)]) + self.assertAllEqual([(10, 10), (10, 20)], output_shape) + + def test_lambda_output_shape_tuple_with_none(self): + def lambda_fn(x): + return x + + l = keras.layers.Lambda(lambda_fn, output_shape=(None, 10)) + output_shape = l.compute_output_shape((5, 10, 20)) + self.assertAllEqual([5, None, 10], output_shape.as_list()) + + def test_lambda_output_shape_function_multiple_outputs(self): + def lambda_fn(x): + return x + + def output_shape_fn(input_shape): + return input_shape + + l = keras.layers.Lambda(lambda_fn, output_shape=output_shape_fn) + output_shape = l.compute_output_shape([(10, 10), (10, 20)]) + self.assertAllEqual([(10, 10), (10, 20)], output_shape) + + def test_lambda_output_shape_nested(self): + def lambda_fn(inputs): + return (inputs[1]["a"], {"b": inputs[0]}) + + l = keras.layers.Lambda(lambda_fn) + output_shape = l.compute_output_shape(((10, 20), {"a": (10, 5)})) + self.assertAllEqual(((10, 5), {"b": (10, 20)}), output_shape) + + def test_lambda_config_serialization(self): + # Test serialization with output_shape and output_shape_type + layer = keras.layers.Lambda( + lambda x: x + 1, output_shape=(1, 1), mask=lambda i, m: m + ) + layer(keras.backend.variable(np.ones((1, 1)))) + config = layer.get_config() + + with SafeModeScope(safe_mode=False): + layer = keras.layers.deserialize( + {"class_name": "Lambda", "config": config} + ) + self.assertAllEqual(layer.function(1), 2) + self.assertAllEqual(layer._output_shape, (1, 1)) + self.assertAllEqual(layer.mask(1, True), True) + + layer = keras.layers.Lambda.from_config(config) + self.assertAllEqual(layer.function(1), 2) + self.assertAllEqual(layer._output_shape, (1, 1)) + self.assertAllEqual(layer.mask(1, True), True) + + def test_lambda_with_training_arg(self): + def fn(x, training=True): + return keras.backend.in_train_phase(x, 2 * x, training=training) + + layer = keras.layers.Lambda(fn) + x = keras.backend.ones(()) + train_out = layer(x, training=True) + eval_out = layer(x, training=False) + + self.assertEqual(keras.backend.get_value(train_out), 1.0) + self.assertEqual(keras.backend.get_value(eval_out), 2.0) + + def test_lambda_with_mask(self): + def add_one(inputs): + return inputs + 1.0 + + def mask(unused_inputs, previous_mask): + return previous_mask + + layer = keras.layers.Lambda(add_one, mask=mask) + x = np.ones([5, 4, 3]) + x[:, -1, :] = 0 + masking = keras.layers.Masking() + out = layer(masking(x)) + + expected_out = np.full([5, 4, 3], 2.0) + expected_out[:, -1, :] = 1.0 + expected_mask = np.ones([5, 4]) + expected_mask[:, -1] = 0.0 + + self.assertAllClose(self.evaluate(out), expected_out) + self.assertIsNotNone(out._keras_mask) + self.assertAllClose(self.evaluate(out._keras_mask), expected_mask) + + def test_lambda_with_ragged_input(self): + def add_one(inputs): + return inputs + 1.0 + + layer = keras.layers.Lambda(add_one) + + ragged_input = tf.ragged.constant([[1.0], [2.0, 3.0]]) + out = layer(ragged_input) + expected_out = tf.ragged.constant([[2.0], [3.0, 4.0]]) + self.assertAllClose(out, expected_out) + + def test_lambda_deserialization_does_not_pollute_core(self): + layer = keras.layers.Lambda(lambda x: x + 1) + config = layer.get_config() + keras.layers.Lambda.from_config(config) + self.assertNotIn(self.__class__.__name__, dir(core)) class TestStatefulLambda(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_lambda_with_variable_in_model(self): - v = tf.Variable(1., trainable=True) - - def 
lambda_fn(x, v): - return x * v - - # While it is generally not advised to mix Variables with Lambda layers, if - # the variables are explicitly set as attributes then they are still - # tracked. This is consistent with the base Layer behavior. - layer = keras.layers.Lambda(lambda_fn, arguments={'v': v}) - self.assertLen(layer.trainable_weights, 0) - layer.v = v - self.assertLen(layer.trainable_weights, 1) - - model = test_utils.get_model_from_layers([layer], input_shape=(10,)) - model.compile( - keras.optimizers.optimizer_v2.gradient_descent.SGD(0.1), - 'mae', - run_eagerly=test_utils.should_run_eagerly()) - x, y = np.ones((10, 10), 'float32'), 2 * np.ones((10, 10), 'float32') - model.fit(x, y, batch_size=2, epochs=2, validation_data=(x, y)) - self.assertLen(model.trainable_weights, 1) - self.assertAllClose(keras.backend.get_value(model.trainable_weights[0]), 2.) - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_creation_inside_lambda(self): - - def lambda_fn(x): - scale = tf.Variable(1., trainable=True, name='scale') - shift = tf.Variable(1., trainable=True, name='shift') - return x * scale + shift - - expected_error = textwrap.dedent(r""" - ( )?The following Variables were created within a Lambda layer \(shift_and_scale\) - ( )?but are not tracked by said layer: - ( )? >> # Create a `Sequential` model and add a Dense layer as the first layer. - >>> model = tf.keras.models.Sequential() - >>> model.add(tf.keras.Input(shape=(16,))) - >>> model.add(tf.keras.layers.Dense(32, activation='relu')) - >>> # Now the model will take as input arrays of shape (None, 16) - >>> # and output arrays of shape (None, 32). - >>> # Note that after the first layer, you don't need to specify - >>> # the size of the input anymore: - >>> model.add(tf.keras.layers.Dense(32)) - >>> model.output_shape - (None, 32) + >>> # Create a `Sequential` model and add a Dense layer as the first layer. + >>> model = tf.keras.models.Sequential() + >>> model.add(tf.keras.Input(shape=(16,))) + >>> model.add(tf.keras.layers.Dense(32, activation='relu')) + >>> # Now the model will take as input arrays of shape (None, 16) + >>> # and output arrays of shape (None, 32). + >>> # Note that after the first layer, you don't need to specify + >>> # the size of the input anymore: + >>> model.add(tf.keras.layers.Dense(32)) + >>> model.output_shape + (None, 32) - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation"). - kernel_constraint: Constraint function applied to - the `kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. 
+ bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation"). + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. - Input shape: - N-D tensor with shape: `(batch_size, ..., input_dim)`. - The most common situation would be - a 2D input with shape `(batch_size, input_dim)`. + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. - Output shape: - N-D tensor with shape: `(batch_size, ..., units)`. - For instance, for a 2D input with shape `(batch_size, input_dim)`, - the output would have shape `(batch_size, units)`. - """ + Output shape: + N-D tensor with shape: `(batch_size, ..., units)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, units)`. + """ - @utils.allow_initializer_layout - def __init__(self, - units, - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__( - activity_regularizer=activity_regularizer, **kwargs) + @utils.allow_initializer_layout + def __init__( + self, + units, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs, + ): + super().__init__(activity_regularizer=activity_regularizer, **kwargs) - self.units = int(units) if not isinstance(units, int) else units - if self.units < 0: - raise ValueError(f'Received an invalid value for `units`, expected ' - f'a positive integer. Received: units={units}') - self.activation = activations.get(activation) - self.use_bias = use_bias - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.bias_constraint = constraints.get(bias_constraint) + self.units = int(units) if not isinstance(units, int) else units + if self.units < 0: + raise ValueError( + "Received an invalid value for `units`, expected " + f"a positive integer. 
Received: units={units}" + ) + self.activation = activations.get(activation) + self.use_bias = use_bias + self.kernel_initializer = initializers.get(kernel_initializer) + self.bias_initializer = initializers.get(bias_initializer) + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + self.kernel_constraint = constraints.get(kernel_constraint) + self.bias_constraint = constraints.get(bias_constraint) - self.input_spec = InputSpec(min_ndim=2) - self.supports_masking = True + self.input_spec = InputSpec(min_ndim=2) + self.supports_masking = True - def build(self, input_shape): - dtype = tf.as_dtype(self.dtype or backend.floatx()) - if not (dtype.is_floating or dtype.is_complex): - raise TypeError('A Dense layer can only be built with a floating-point ' - f'dtype. Received: dtype={dtype}') + def build(self, input_shape): + dtype = tf.as_dtype(self.dtype or backend.floatx()) + if not (dtype.is_floating or dtype.is_complex): + raise TypeError( + "A Dense layer can only be built with a floating-point " + f"dtype. Received: dtype={dtype}" + ) - input_shape = tf.TensorShape(input_shape) - last_dim = tf.compat.dimension_value(input_shape[-1]) - if last_dim is None: - raise ValueError('The last dimension of the inputs to a Dense layer ' - 'should be defined. Found None. ' - f'Full input shape received: {input_shape}') - self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) - self.kernel = self.add_weight( - 'kernel', - shape=[last_dim, self.units], - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - dtype=self.dtype, - trainable=True) - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=[self.units,], - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - self.built = True + input_shape = tf.TensorShape(input_shape) + last_dim = tf.compat.dimension_value(input_shape[-1]) + if last_dim is None: + raise ValueError( + "The last dimension of the inputs to a Dense layer " + "should be defined. Found None. " + f"Full input shape received: {input_shape}" + ) + self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) + self.kernel = self.add_weight( + "kernel", + shape=[last_dim, self.units], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + dtype=self.dtype, + trainable=True, + ) + if self.use_bias: + self.bias = self.add_weight( + "bias", + shape=[ + self.units, + ], + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + dtype=self.dtype, + trainable=True, + ) + else: + self.bias = None + self.built = True - def call(self, inputs): - if inputs.dtype.base_dtype != self._compute_dtype_object.base_dtype: - inputs = tf.cast(inputs, dtype=self._compute_dtype_object) + def call(self, inputs): + if inputs.dtype.base_dtype != self._compute_dtype_object.base_dtype: + inputs = tf.cast(inputs, dtype=self._compute_dtype_object) - is_ragged = isinstance(inputs, tf.RaggedTensor) - if is_ragged: - # In case we encounter a RaggedTensor with a fixed last dimension (last - # dimension not ragged), we can flatten the input and restore the ragged - # dimensions at the end. 
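# A small sketch (assuming TF 2.x eager mode) of the two non-dense input
# paths handled in the code below: RaggedTensor inputs with a uniform
# innermost dimension are flattened, transformed, and re-wrapped, and
# rank-2 SparseTensor inputs take the embedding-lookup matmul path, which
# agrees with the ordinary dense computation.
import tensorflow as tf

dense = tf.keras.layers.Dense(3, kernel_initializer="ones", use_bias=False)

ragged = tf.ragged.constant(
    [[[1.0, 2.0]], [[3.0, 4.0], [5.0, 6.0]]], ragged_rank=1
)
print(dense(ragged).shape)  # (2, None, 3): ragged structure is preserved

sparse = tf.sparse.from_dense(tf.constant([[0.0, 2.0], [1.0, 0.0]]))
same = tf.reduce_all(dense(sparse) == dense(tf.sparse.to_dense(sparse)))
print(same.numpy())  # True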
- if tf.compat.dimension_value(inputs.shape[-1]) is None: - raise ValueError('Dense layer only supports RaggedTensors when the ' - 'innermost dimension is non-ragged. Received: ' - f'inputs.shape={inputs.shape}.') - original_inputs = inputs - if inputs.flat_values.shape.rank > 1: - inputs = inputs.flat_values - else: - # Innermost partition is encoded using uniform_row_length. - # (This is unusual, but we can handle it.) - if inputs.shape.rank == 2: - inputs = inputs.to_tensor() - is_ragged = False - else: - for _ in range(original_inputs.ragged_rank - 1): - inputs = inputs.values - inputs = inputs.to_tensor() - original_inputs = tf.RaggedTensor.from_nested_row_splits( - inputs, original_inputs.nested_row_splits[:-1]) + is_ragged = isinstance(inputs, tf.RaggedTensor) + if is_ragged: + # In case we encounter a RaggedTensor with a fixed last dimension + # (last dimension not ragged), we can flatten the input and restore + # the ragged dimensions at the end. + if tf.compat.dimension_value(inputs.shape[-1]) is None: + raise ValueError( + "Dense layer only supports RaggedTensors when the " + "innermost dimension is non-ragged. Received: " + f"inputs.shape={inputs.shape}." + ) + original_inputs = inputs + if inputs.flat_values.shape.rank > 1: + inputs = inputs.flat_values + else: + # Innermost partition is encoded using uniform_row_length. + # (This is unusual, but we can handle it.) + if inputs.shape.rank == 2: + inputs = inputs.to_tensor() + is_ragged = False + else: + for _ in range(original_inputs.ragged_rank - 1): + inputs = inputs.values + inputs = inputs.to_tensor() + original_inputs = tf.RaggedTensor.from_nested_row_splits( + inputs, original_inputs.nested_row_splits[:-1] + ) - rank = inputs.shape.rank - if rank == 2 or rank is None: - # We use embedding_lookup_sparse as a more efficient matmul operation for - # large sparse input tensors. The op will result in a sparse gradient, as - # opposed to sparse_ops.sparse_tensor_dense_matmul which results in dense - # gradients. This can lead to sigfinicant speedups, see b/171762937. - if isinstance(inputs, tf.SparseTensor): - # We need to fill empty rows, as the op assumes at least one id per row. - inputs, _ = tf.sparse.fill_empty_rows(inputs, 0) - # We need to do some munging of our input to use the embedding lookup as - # a matrix multiply. We split our input matrix into separate ids and - # weights tensors. The values of the ids tensor should be the column - # indices of our input matrix and the values of the weights tensor - # can continue to the actual matrix weights. - # The column arrangement of ids and weights - # will be summed over and does not matter. See the documentation for - # sparse_ops.sparse_tensor_dense_matmul a more detailed explanation - # of the inputs to both ops. - ids = tf.SparseTensor( - indices=inputs.indices, - values=inputs.indices[:, 1], - dense_shape=inputs.dense_shape) - weights = inputs - outputs = tf.nn.embedding_lookup_sparse( - self.kernel, ids, weights, combiner='sum') - else: - outputs = tf.matmul(a=inputs, b=self.kernel) - # Broadcast kernel to inputs. - else: - outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]]) - # Reshape the output back to the original ndim of the input. - if not tf.executing_eagerly(): - shape = inputs.shape.as_list() - output_shape = shape[:-1] + [self.kernel.shape[-1]] - outputs.set_shape(output_shape) + rank = inputs.shape.rank + if rank == 2 or rank is None: + # We use embedding_lookup_sparse as a more efficient matmul + # operation for large sparse input tensors. 
The op will result in a + sparse gradient, as opposed to + sparse_ops.sparse_tensor_dense_matmul which results in dense + gradients. This can lead to significant speedups, see b/171762937. + if isinstance(inputs, tf.SparseTensor): + # We need to fill empty rows, as the op assumes at least one id + # per row. + inputs, _ = tf.sparse.fill_empty_rows(inputs, 0) + # We need to do some munging of our input to use the embedding + # lookup as a matrix multiply. We split our input matrix into + # separate ids and weights tensors. The values of the ids tensor + # should be the column indices of our input matrix and the + # values of the weights tensor can continue to be the actual matrix + # weights. The column arrangement of ids and weights will be + # summed over and does not matter. See the documentation for + # sparse_ops.sparse_tensor_dense_matmul for a more detailed + # explanation of the inputs to both ops. + ids = tf.SparseTensor( + indices=inputs.indices, + values=inputs.indices[:, 1], + dense_shape=inputs.dense_shape, + ) + weights = inputs + outputs = tf.nn.embedding_lookup_sparse( + self.kernel, ids, weights, combiner="sum" + ) + else: + outputs = tf.matmul(a=inputs, b=self.kernel) + # Broadcast kernel to inputs. + else: + outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]]) + # Reshape the output back to the original ndim of the input. + if not tf.executing_eagerly(): + shape = inputs.shape.as_list() + output_shape = shape[:-1] + [self.kernel.shape[-1]] + outputs.set_shape(output_shape) - if self.use_bias: - outputs = tf.nn.bias_add(outputs, self.bias) + if self.use_bias: + outputs = tf.nn.bias_add(outputs, self.bias) - if self.activation is not None: - outputs = self.activation(outputs) + if self.activation is not None: + outputs = self.activation(outputs) - if is_ragged: - outputs = original_inputs.with_flat_values(outputs) + if is_ragged: + outputs = original_inputs.with_flat_values(outputs) - return outputs + return outputs - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape) - input_shape = input_shape.with_rank_at_least(2) - if tf.compat.dimension_value(input_shape[-1]) is None: - raise ValueError('The last dimension of the input shape of a Dense layer ' - 'should be defined. Found None. ' - f'Received: input_shape={input_shape}') - return input_shape[:-1].concatenate(self.units) + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape) + input_shape = input_shape.with_rank_at_least(2) + if tf.compat.dimension_value(input_shape[-1]) is None: + raise ValueError( + "The last dimension of the input shape of a Dense layer " + "should be defined. Found None. 
" + f"Received: input_shape={input_shape}" + ) + return input_shape[:-1].concatenate(self.units) - def get_config(self): - config = super().get_config() - config.update({ - 'units': self.units, - 'activation': activations.serialize(self.activation), - 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint) - }) - return config + def get_config(self): + config = super().get_config() + config.update( + { + "units": self.units, + "activation": activations.serialize(self.activation), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "bias_initializer": initializers.serialize( + self.bias_initializer + ), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "bias_regularizer": regularizers.serialize( + self.bias_regularizer + ), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize( + self.kernel_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + } + ) + return config diff --git a/keras/layers/core/einsum_dense.py b/keras/layers/core/einsum_dense.py index f46d1581a45e..e1d3ca334c00 100644 --- a/keras/layers/core/einsum_dense.py +++ b/keras/layers/core/einsum_dense.py @@ -13,317 +13,349 @@ # limitations under the License. # ============================================================================== """Keras-based einsum dense layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import re +import tensorflow.compat.v2 as tf + from keras import activations from keras import constraints from keras import initializers from keras import regularizers from keras.engine.base_layer import Layer -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export("keras.layers.EinsumDense", - "keras.layers.experimental.EinsumDense") +@keras_export( + "keras.layers.EinsumDense", "keras.layers.experimental.EinsumDense" +) class EinsumDense(Layer): - """A layer that uses `tf.einsum` as the backing computation. - - This layer can perform einsum calculations of arbitrary dimensionality. - - Args: - equation: An equation describing the einsum to perform. This equation must - be a valid einsum string of the form `ab,bc->ac`, `...ab,bc->...ac`, or - `ab...,bc->ac...` where 'ab', 'bc', and 'ac' can be any valid einsum axis - expression sequence. - output_shape: The expected shape of the output tensor (excluding the batch - dimension and any dimensions represented by ellipses). You can specify - None for any dimension that is unknown or can be inferred from the input - shape. - activation: Activation function to use. If you don't specify anything, no - activation is applied (that is, a "linear" activation: `a(x) = x`). - bias_axes: A string containing the output dimension(s) to apply a bias to. - Each character in the `bias_axes` string should correspond to a character - in the output portion of the `equation` string. 
- kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation"). - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. - bias_constraint: Constraint function applied to the bias vector. - - Examples: - - **Biased dense layer with einsums** - - This example shows how to instantiate a standard Keras dense layer using - einsum operations. This example is equivalent to - `tf.keras.layers.Dense(64, use_bias=True)`. - - >>> layer = tf.keras.layers.EinsumDense("ab,bc->ac", - ... output_shape=64, - ... bias_axes="c") - >>> input_tensor = tf.keras.Input(shape=[32]) - >>> output_tensor = layer(input_tensor) - >>> output_tensor - <... shape=(None, 64) dtype=...> - - **Applying a dense layer to a sequence** - - This example shows how to instantiate a layer that applies the same dense - operation to every element in a sequence. Here, the `output_shape` has two - values (since there are two non-batch dimensions in the output); the first - dimension in the `output_shape` is `None`, because the sequence dimension `b` - has an unknown shape. - - >>> layer = tf.keras.layers.EinsumDense("abc,cd->abd", - ... output_shape=(None, 64), - ... bias_axes="d") - >>> input_tensor = tf.keras.Input(shape=[32, 128]) - >>> output_tensor = layer(input_tensor) - >>> output_tensor - <... shape=(None, 32, 64) dtype=...> - - **Applying a dense layer to a sequence using ellipses** - - This example shows how to instantiate a layer that applies the same dense - operation to every element in a sequence, but uses the ellipsis notation - instead of specifying the batch and sequence dimensions. - - Because we are using ellipsis notation and have specified only one axis, the - `output_shape` arg is a single value. When instantiated in this way, the layer - can handle any number of sequence dimensions - including the case where no - sequence dimension exists. - - >>> layer = tf.keras.layers.EinsumDense("...x,xy->...y", - ... output_shape=64, - ... bias_axes="y") - >>> input_tensor = tf.keras.Input(shape=[32, 128]) - >>> output_tensor = layer(input_tensor) - >>> output_tensor - <... 
shape=(None, 32, 64) dtype=...> - """ - - def __init__(self, - equation, - output_shape, - activation=None, - bias_axes=None, - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - **kwargs): - super().__init__(**kwargs) - self.equation = equation - if isinstance(output_shape, int): - self.partial_output_shape = [output_shape] - else: - self.partial_output_shape = list(output_shape) - self.bias_axes = bias_axes - self.activation = activations.get(activation) - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - shape_data = _analyze_einsum_string(self.equation, - self.bias_axes, - input_shape, - self.partial_output_shape) - kernel_shape, bias_shape, self.full_output_shape = shape_data - self.kernel = self.add_weight( - "kernel", - shape=kernel_shape, - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - dtype=self.dtype, - trainable=True) - - if bias_shape is not None: - self.bias = self.add_weight( - "bias", - shape=bias_shape, - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - super().build(input_shape) - - def compute_output_shape(self, _): - return tf.TensorShape(self.full_output_shape) - - def get_config(self): - config = { - "output_shape": self.partial_output_shape, - "equation": self.equation, - "activation": activations.serialize(self.activation), - "bias_axes": self.bias_axes, - "kernel_initializer": initializers.serialize(self.kernel_initializer), - "bias_initializer": initializers.serialize(self.bias_initializer), - "kernel_regularizer": regularizers.serialize(self.kernel_regularizer), - "bias_regularizer": regularizers.serialize(self.bias_regularizer), - "activity_regularizer": - regularizers.serialize(self.activity_regularizer), - "kernel_constraint": constraints.serialize(self.kernel_constraint), - "bias_constraint": constraints.serialize(self.bias_constraint), - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs): - ret = tf.einsum(self.equation, inputs, self.kernel) - if self.bias is not None: - ret += self.bias - if self.activation is not None: - ret = self.activation(ret) - return ret + """A layer that uses `tf.einsum` as the backing computation. + + This layer can perform einsum calculations of arbitrary dimensionality. + + Args: + equation: An equation describing the einsum to perform. This equation must + be a valid einsum string of the form `ab,bc->ac`, `...ab,bc->...ac`, or + `ab...,bc->ac...` where 'ab', 'bc', and 'ac' can be any valid einsum + axis expression sequence. + output_shape: The expected shape of the output tensor (excluding the batch + dimension and any dimensions represented by ellipses). You can specify + None for any dimension that is unknown or can be inferred from the input + shape. + activation: Activation function to use. 
If you don't specify anything, no + activation is applied (that is, a "linear" activation: `a(x) = x`). + bias_axes: A string containing the output dimension(s) to apply a bias to. + Each character in the `bias_axes` string should correspond to a + character in the output portion of the `equation` string. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. + bias_constraint: Constraint function applied to the bias vector. + + Examples: + + **Biased dense layer with einsums** + + This example shows how to instantiate a standard Keras dense layer using + einsum operations. This example is equivalent to + `tf.keras.layers.Dense(64, use_bias=True)`. + + >>> layer = tf.keras.layers.EinsumDense("ab,bc->ac", + ... output_shape=64, + ... bias_axes="c") + >>> input_tensor = tf.keras.Input(shape=[32]) + >>> output_tensor = layer(input_tensor) + >>> output_tensor + <... shape=(None, 64) dtype=...> + + **Applying a dense layer to a sequence** + + This example shows how to instantiate a layer that applies the same dense + operation to every element in a sequence. Here, the `output_shape` has two + values (since there are two non-batch dimensions in the output); the first + dimension in the `output_shape` is `None`, because the sequence dimension + `b` has an unknown shape. + + >>> layer = tf.keras.layers.EinsumDense("abc,cd->abd", + ... output_shape=(None, 64), + ... bias_axes="d") + >>> input_tensor = tf.keras.Input(shape=[32, 128]) + >>> output_tensor = layer(input_tensor) + >>> output_tensor + <... shape=(None, 32, 64) dtype=...> + + **Applying a dense layer to a sequence using ellipses** + + This example shows how to instantiate a layer that applies the same dense + operation to every element in a sequence, but uses the ellipsis notation + instead of specifying the batch and sequence dimensions. + + Because we are using ellipsis notation and have specified only one axis, the + `output_shape` arg is a single value. When instantiated in this way, the + layer can handle any number of sequence dimensions - including the case + where no sequence dimension exists. + + >>> layer = tf.keras.layers.EinsumDense("...x,xy->...y", + ... output_shape=64, + ... bias_axes="y") + >>> input_tensor = tf.keras.Input(shape=[32, 128]) + >>> output_tensor = layer(input_tensor) + >>> output_tensor + <... 
shape=(None, 32, 64) dtype=...> + """ + + def __init__( + self, + equation, + output_shape, + activation=None, + bias_axes=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs, + ): + super().__init__(**kwargs) + self.equation = equation + if isinstance(output_shape, int): + self.partial_output_shape = [output_shape] + else: + self.partial_output_shape = list(output_shape) + self.bias_axes = bias_axes + self.activation = activations.get(activation) + self.kernel_initializer = initializers.get(kernel_initializer) + self.bias_initializer = initializers.get(bias_initializer) + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + self.kernel_constraint = constraints.get(kernel_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape) + shape_data = _analyze_einsum_string( + self.equation, + self.bias_axes, + input_shape, + self.partial_output_shape, + ) + kernel_shape, bias_shape, self.full_output_shape = shape_data + self.kernel = self.add_weight( + "kernel", + shape=kernel_shape, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + dtype=self.dtype, + trainable=True, + ) + + if bias_shape is not None: + self.bias = self.add_weight( + "bias", + shape=bias_shape, + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + dtype=self.dtype, + trainable=True, + ) + else: + self.bias = None + super().build(input_shape) + + def compute_output_shape(self, _): + return tf.TensorShape(self.full_output_shape) + + def get_config(self): + config = { + "output_shape": self.partial_output_shape, + "equation": self.equation, + "activation": activations.serialize(self.activation), + "bias_axes": self.bias_axes, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "bias_constraint": constraints.serialize(self.bias_constraint), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs): + ret = tf.einsum(self.equation, inputs, self.kernel) + if self.bias is not None: + ret += self.bias + if self.activation is not None: + ret = self.activation(ret) + return ret def _analyze_einsum_string(equation, bias_axes, input_shape, output_shape): - """Analyzes an einsum string to determine the required weight shape.""" - - dot_replaced_string = re.sub(r"\.\.\.", "0", equation) - - # This is the case where no ellipses are present in the string. - split_string = re.match("([a-zA-Z]+),([a-zA-Z]+)->([a-zA-Z]+)", - dot_replaced_string) - if split_string: - return _analyze_split_string(split_string, bias_axes, input_shape, - output_shape) - - # This is the case where ellipses are present on the left. 
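# What the helper below computes, sketched with the private API the unit
# tests also call (illustrative only; the concrete input shape here is an
# assumption chosen to match the specs): for the BERT-style equation
# "abc,cde->abde" with a bias on "e", the kernel shape is assembled from
# the input/output dims, and output axes before the first bias axis
# broadcast implicitly.
from keras.layers.core import einsum_dense

weight, bias, full_out = einsum_dense._analyze_einsum_string(
    equation="abc,cde->abde",
    bias_axes="e",
    input_shape=(None, 1, 2),  # a=batch, b=1, c=2
    output_shape=(1, 3, 4),    # b=1, d=3, e=4 (batch dim excluded)
)
print(weight)    # [2, 3, 4]       -> dims c, d, e
print(bias)      # [4]             -> only "e" kept; b and d broadcast
print(full_out)  # [None, 1, 3, 4]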
- split_string = re.match("0([a-zA-Z]+),([a-zA-Z]+)->0([a-zA-Z]+)", - dot_replaced_string) - if split_string: - return _analyze_split_string( - split_string, bias_axes, input_shape, output_shape, left_elided=True) - - # This is the case where ellipses are present on the right. - split_string = re.match("([a-zA-Z]{2,})0,([a-zA-Z]+)->([a-zA-Z]+)0", - dot_replaced_string) - if split_string: - return _analyze_split_string(split_string, bias_axes, input_shape, - output_shape) - - raise ValueError( - f"Invalid einsum equation '{equation}'. Equations must be in the form " - "[X],[Y]->[Z], ...[X],[Y]->...[Z], or [X]...,[Y]->[Z]....") - - -def _analyze_split_string(split_string, - bias_axes, - input_shape, - output_shape, - left_elided=False): - """Analyze an pre-split einsum string to find the weight shape.""" - input_spec = split_string.group(1) - weight_spec = split_string.group(2) - output_spec = split_string.group(3) - elided = len(input_shape) - len(input_spec) - - if isinstance(output_shape, int): - output_shape = [output_shape] - else: - output_shape = list(output_shape) - - output_shape.insert(0, input_shape[0]) - - if elided > 0 and left_elided: - for i in range(1, elided): - # We already inserted the 0th input dimension at dim 0, so we need to - # start at location 1 here. - output_shape.insert(1, input_shape[i]) - elif elided > 0 and not left_elided: - for i in range(len(input_shape) - elided, len(input_shape)): - output_shape.append(input_shape[i]) - - if left_elided: - # If we have beginning dimensions elided, we need to use negative indexing - # to determine where in the input dimension our values are. - input_dim_map = { - dim: (i + elided) - len(input_shape) for i, dim in enumerate(input_spec) - } - # Because we've constructed the full output shape already, we don't need - # to do negative indexing. - output_dim_map = {dim: (i + elided) for i, dim in enumerate(output_spec)} - else: - input_dim_map = {dim: i for i, dim in enumerate(input_spec)} - output_dim_map = {dim: i for i, dim in enumerate(output_spec)} - - for dim in input_spec: - input_shape_at_dim = input_shape[input_dim_map[dim]] - if dim in output_dim_map: - output_shape_at_dim = output_shape[output_dim_map[dim]] - if (output_shape_at_dim is not None and - output_shape_at_dim != input_shape_at_dim): - raise ValueError( - "Input shape and output shape do not match at shared " - f"dimension '{dim}'. Input shape is {input_shape_at_dim}, " - "and output shape " - f"is {output_shape[output_dim_map[dim]]}.") - - for dim in output_spec: - if dim not in input_spec and dim not in weight_spec: - raise ValueError( - f"Dimension '{dim}' was specified in the output '{output_spec}' but " - f"has no corresponding dim in the input spec '{input_spec}' or " - f"weight spec '{output_spec}'") - - weight_shape = [] - for dim in weight_spec: - if dim in input_dim_map: - weight_shape.append(input_shape[input_dim_map[dim]]) - elif dim in output_dim_map: - weight_shape.append(output_shape[output_dim_map[dim]]) + """Analyzes an einsum string to determine the required weight shape.""" + + dot_replaced_string = re.sub(r"\.\.\.", "0", equation) + + # This is the case where no ellipses are present in the string. + split_string = re.match( + "([a-zA-Z]+),([a-zA-Z]+)->([a-zA-Z]+)", dot_replaced_string + ) + if split_string: + return _analyze_split_string( + split_string, bias_axes, input_shape, output_shape + ) + + # This is the case where ellipses are present on the left. 
+ split_string = re.match( + "0([a-zA-Z]+),([a-zA-Z]+)->0([a-zA-Z]+)", dot_replaced_string + ) + if split_string: + return _analyze_split_string( + split_string, bias_axes, input_shape, output_shape, left_elided=True + ) + + # This is the case where ellipses are present on the right. + split_string = re.match( + "([a-zA-Z]{2,})0,([a-zA-Z]+)->([a-zA-Z]+)0", dot_replaced_string + ) + if split_string: + return _analyze_split_string( + split_string, bias_axes, input_shape, output_shape + ) + + raise ValueError( + f"Invalid einsum equation '{equation}'. Equations must be in the form " + "[X],[Y]->[Z], ...[X],[Y]->...[Z], or [X]...,[Y]->[Z]...." + ) + + +def _analyze_split_string( + split_string, bias_axes, input_shape, output_shape, left_elided=False +): + """Analyze a pre-split einsum string to find the weight shape.""" + input_spec = split_string.group(1) + weight_spec = split_string.group(2) + output_spec = split_string.group(3) + elided = len(input_shape) - len(input_spec) + + if isinstance(output_shape, int): + output_shape = [output_shape] else: - raise ValueError( - f"Weight dimension '{dim}' did not have a match in either " - f"the input spec '{input_spec}' or the output spec '{output_spec}'. " - "For this layer, the weight must be fully specified.") - - if bias_axes is not None: - num_left_elided = elided if left_elided else 0 - idx_map = { - char: output_shape[i + num_left_elided] - for i, char in enumerate(output_spec) - } - - for char in bias_axes: - if char not in output_spec: - raise ValueError( - f"Bias dimension '{char}' was requested, but is not part " - f"of the output spec '{output_spec}'") - - first_bias_location = min([output_spec.find(char) for char in bias_axes]) - bias_output_spec = output_spec[first_bias_location:] - - bias_shape = [ - idx_map[char] if char in bias_axes else 1 for char in bias_output_spec - ] - - if not left_elided: - for _ in range(elided): - bias_shape.append(1) - else: - bias_shape = None - - return weight_shape, bias_shape, output_shape + output_shape = list(output_shape) + + output_shape.insert(0, input_shape[0]) + + if elided > 0 and left_elided: + for i in range(1, elided): + # We already inserted the 0th input dimension at dim 0, so we need + # to start at location 1 here. + output_shape.insert(1, input_shape[i]) + elif elided > 0 and not left_elided: + for i in range(len(input_shape) - elided, len(input_shape)): + output_shape.append(input_shape[i]) + + if left_elided: + # If we have beginning dimensions elided, we need to use negative + # indexing to determine where in the input dimension our values are. + input_dim_map = { + dim: (i + elided) - len(input_shape) + for i, dim in enumerate(input_spec) + } + # Because we've constructed the full output shape already, we don't need + # to do negative indexing. + output_dim_map = { + dim: (i + elided) for i, dim in enumerate(output_spec) + } + else: + input_dim_map = {dim: i for i, dim in enumerate(input_spec)} + output_dim_map = {dim: i for i, dim in enumerate(output_spec)} + + for dim in input_spec: + input_shape_at_dim = input_shape[input_dim_map[dim]] + if dim in output_dim_map: + output_shape_at_dim = output_shape[output_dim_map[dim]] + if ( + output_shape_at_dim is not None + and output_shape_at_dim != input_shape_at_dim + ): + raise ValueError( + "Input shape and output shape do not match at shared " + f"dimension '{dim}'. Input shape is {input_shape_at_dim}, " + "and output shape " + f"is {output_shape[output_dim_map[dim]]}." 
+ ) + + for dim in output_spec: + if dim not in input_spec and dim not in weight_spec: + raise ValueError( + f"Dimension '{dim}' was specified in the output " + f"'{output_spec}' but has no corresponding dim in the input " + f"spec '{input_spec}' or weight spec '{output_spec}'" + ) + + weight_shape = [] + for dim in weight_spec: + if dim in input_dim_map: + weight_shape.append(input_shape[input_dim_map[dim]]) + elif dim in output_dim_map: + weight_shape.append(output_shape[output_dim_map[dim]]) + else: + raise ValueError( + f"Weight dimension '{dim}' did not have a match in either " + f"the input spec '{input_spec}' or the output " + f"spec '{output_spec}'. For this layer, the weight must " + "be fully specified." + ) + + if bias_axes is not None: + num_left_elided = elided if left_elided else 0 + idx_map = { + char: output_shape[i + num_left_elided] + for i, char in enumerate(output_spec) + } + + for char in bias_axes: + if char not in output_spec: + raise ValueError( + f"Bias dimension '{char}' was requested, but is not part " + f"of the output spec '{output_spec}'" + ) + + first_bias_location = min( + [output_spec.find(char) for char in bias_axes] + ) + bias_output_spec = output_spec[first_bias_location:] + + bias_shape = [ + idx_map[char] if char in bias_axes else 1 + for char in bias_output_spec + ] + + if not left_elided: + for _ in range(elided): + bias_shape.append(1) + else: + bias_shape = None + + return weight_shape, bias_shape, output_shape diff --git a/keras/layers/core/einsum_dense_test.py b/keras/layers/core/einsum_dense_test.py index 3561ff4dce58..f2cb24457dfc 100644 --- a/keras/layers/core/einsum_dense_test.py +++ b/keras/layers/core/einsum_dense_test.py @@ -15,13 +15,14 @@ """Tests for Keras-based einsum dense layer.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers.core import einsum_dense from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes @@ -34,8 +35,9 @@ "output_shape": [], "expected_weight_shape": [32], "expected_bias_shape": None, - "expected_output_shape": (None,) - }, { + "expected_output_shape": (None,), + }, + { "testcase_name": "_2d_middle_weight", "equation": "ab,bc->ac", "bias_axes": None, @@ -43,8 +45,9 @@ "output_shape": (64), "expected_weight_shape": [32, 64], "expected_bias_shape": None, - "expected_output_shape": (None, 64) - }, { + "expected_output_shape": (None, 64), + }, + { "testcase_name": "_3d_bert", "equation": "abc,cde->abde", "bias_axes": None, @@ -52,8 +55,9 @@ "output_shape": (1, 3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": None, - "expected_output_shape": (None, 1, 3, 4) - }, { + "expected_output_shape": (None, 1, 3, 4), + }, + { "testcase_name": "_3d_3_bias", "equation": "abc,cde->abde", "bias_axes": "e", @@ -61,8 +65,9 @@ "output_shape": (1, 3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": [4], - "expected_output_shape": (None, 1, 3, 4) - }, { + "expected_output_shape": (None, 1, 3, 4), + }, + { "testcase_name": "_3d_2_bias", "equation": "abc,cde->abde", "bias_axes": "d", @@ -70,8 +75,9 @@ "output_shape": (1, 3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": [3, 1], - "expected_output_shape": (None, 1, 3, 4) - }, { + "expected_output_shape": (None, 1, 3, 4), + }, + { "testcase_name": "_3d_1_3_bias", "equation": "abc,cde->abde", "bias_axes": "be", @@ -79,8 +85,9 @@ 
"output_shape": (7, 3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": [7, 1, 4], - "expected_output_shape": (None, 7, 3, 4) - }, { + "expected_output_shape": (None, 7, 3, 4), + }, + { "testcase_name": "_3d_bert_projection", "equation": "BFNH,NHD->BFD", "bias_axes": None, @@ -88,8 +95,9 @@ "output_shape": (1, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": None, - "expected_output_shape": (None, 1, 4) - }, { + "expected_output_shape": (None, 1, 4), + }, + { "testcase_name": "_2d_bert", "equation": "abc,cd->abd", "bias_axes": None, @@ -97,8 +105,9 @@ "output_shape": (1, 4), "expected_weight_shape": [2, 4], "expected_bias_shape": None, - "expected_output_shape": (None, 1, 4) - }, { + "expected_output_shape": (None, 1, 4), + }, + { "testcase_name": "_embedding_1d", "equation": "i,d->id", "bias_axes": None, @@ -106,8 +115,9 @@ "output_shape": (2), "expected_weight_shape": [2], "expected_bias_shape": None, - "expected_output_shape": (None, 2) - }, { + "expected_output_shape": (None, 2), + }, + { "testcase_name": "_xlnet_lm", "equation": "ibd,nd->ibn", "bias_axes": None, @@ -115,8 +125,9 @@ "output_shape": (None, 2), "expected_weight_shape": [2, 1], "expected_bias_shape": None, - "expected_output_shape": (None, None, 2) - }, { + "expected_output_shape": (None, None, 2), + }, + { "testcase_name": "_2d_precast", "equation": "...b,bc->...c", "bias_axes": None, @@ -124,8 +135,9 @@ "output_shape": (64), "expected_weight_shape": [32, 64], "expected_bias_shape": None, - "expected_output_shape": (None, 64) - }, { + "expected_output_shape": (None, 64), + }, + { "testcase_name": "_2d_precast_elided_input_used_in_output", "equation": "...bc,bc->...b", "bias_axes": None, @@ -133,8 +145,9 @@ "output_shape": (32), "expected_weight_shape": [32, 64], "expected_bias_shape": None, - "expected_output_shape": (None, 32) - }, { + "expected_output_shape": (None, 32), + }, + { "testcase_name": "_2d_precast_multiple_elided_dims", "equation": "...b,bc->...c", "bias_axes": None, @@ -142,8 +155,9 @@ "output_shape": (64), "expected_weight_shape": [32, 64], "expected_bias_shape": None, - "expected_output_shape": (None, None, 64) - }, { + "expected_output_shape": (None, None, 64), + }, + { "testcase_name": "_3d_precast", "equation": "...c,cde->...de", "bias_axes": None, @@ -151,8 +165,9 @@ "output_shape": (3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": None, - "expected_output_shape": (None, 1, 3, 4) - }, { + "expected_output_shape": (None, 1, 3, 4), + }, + { "testcase_name": "_3d_precast_3_bias", "equation": "...c,cde->...de", "bias_axes": "e", @@ -160,8 +175,9 @@ "output_shape": (3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": [4], - "expected_output_shape": (None, 1, 3, 4) - }, { + "expected_output_shape": (None, 1, 3, 4), + }, + { "testcase_name": "_3d_precast_2_bias", "equation": "...c,cde->...de", "bias_axes": "d", @@ -169,8 +185,9 @@ "output_shape": (3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": [3, 1], - "expected_output_shape": (None, 1, 3, 4) - }, { + "expected_output_shape": (None, 1, 3, 4), + }, + { "testcase_name": "_3d_precast_2_3_bias", "equation": "...c,cde->...de", "bias_axes": "de", @@ -178,8 +195,9 @@ "output_shape": (3, 4), "expected_weight_shape": [2, 3, 4], "expected_bias_shape": [3, 4], - "expected_output_shape": (None, 1, 3, 4) - }, { + "expected_output_shape": (None, 1, 3, 4), + }, + { "testcase_name": "_2d_postcast", "equation": "bc...,cd->bd...", "bias_axes": None, @@ -187,8 +205,9 @@ "output_shape": (4), 
"expected_weight_shape": [1, 4], "expected_bias_shape": None, - "expected_output_shape": (None, 4, 2, 3) - }, { + "expected_output_shape": (None, 4, 2, 3), + }, + { "testcase_name": "_3d_postcast", "equation": "bc...,cde->bde...", "bias_axes": None, @@ -196,8 +215,9 @@ "output_shape": (3, 4), "expected_weight_shape": [1, 3, 4], "expected_bias_shape": None, - "expected_output_shape": (None, 3, 4, 2) - }, { + "expected_output_shape": (None, 3, 4, 2), + }, + { "testcase_name": "_3d_postcast_1_bias", "equation": "bc...,cde->bde...", "bias_axes": "d", @@ -205,8 +225,9 @@ "output_shape": (3, 4), "expected_weight_shape": [1, 3, 4], "expected_bias_shape": [3, 1, 1], - "expected_output_shape": (None, 3, 4, 2) - }, { + "expected_output_shape": (None, 3, 4, 2), + }, + { "testcase_name": "_3d_postcast_2_bias", "equation": "bc...,cde->bde...", "bias_axes": "e", @@ -214,8 +235,9 @@ "output_shape": (3, 4), "expected_weight_shape": [1, 3, 4], "expected_bias_shape": [4, 1], - "expected_output_shape": (None, 3, 4, 2) - }, { + "expected_output_shape": (None, 3, 4, 2), + }, + { "testcase_name": "_3d_postcast_1_2_bias", "equation": "bc...,cde->bde...", "bias_axes": "de", @@ -223,96 +245,124 @@ "output_shape": (3, 4), "expected_weight_shape": [1, 3, 4], "expected_bias_shape": [3, 4, 1], - "expected_output_shape": (None, 3, 4, 2) - }) + "expected_output_shape": (None, 3, 4, 2), + }, +) class TestEinsumDenseLayer(test_combinations.TestCase): + def test_weight_shapes( + self, + equation, + bias_axes, + input_shape, + output_shape, + expected_weight_shape, + expected_bias_shape, + expected_output_shape, + ): + del expected_output_shape # Not used in this test. - def test_weight_shapes(self, equation, bias_axes, input_shape, output_shape, - expected_weight_shape, expected_bias_shape, - expected_output_shape): - del expected_output_shape # Not used in this test. + weight_shape, bias_shape, _ = einsum_dense._analyze_einsum_string( + equation, bias_axes, input_shape, output_shape + ) - weight_shape, bias_shape, _ = einsum_dense._analyze_einsum_string( - equation, bias_axes, input_shape, output_shape) + self.assertAllEqual(expected_weight_shape, weight_shape) + self.assertAllEqual(expected_bias_shape, bias_shape) - self.assertAllEqual(expected_weight_shape, weight_shape) - self.assertAllEqual(expected_bias_shape, bias_shape) + def test_layer_creation( + self, + equation, + bias_axes, + input_shape, + output_shape, + expected_weight_shape, + expected_bias_shape, + expected_output_shape, + ): + # Keras elides the 0-dimension of the input shape when constructing + # inputs. + non_batch_input_shape = list(input_shape)[1:] - def test_layer_creation(self, equation, bias_axes, input_shape, output_shape, - expected_weight_shape, expected_bias_shape, - expected_output_shape): - # Keras elides the 0-dimension of the input shape when constructing inputs. 
- non_batch_input_shape = list(input_shape)[1:] + input_tensor = keras.Input(shape=non_batch_input_shape) + layer = einsum_dense.EinsumDense( + equation=equation, output_shape=output_shape, bias_axes=bias_axes + ) + output_tensor = layer(input_tensor) - input_tensor = keras.Input(shape=non_batch_input_shape) - layer = einsum_dense.EinsumDense( - equation=equation, output_shape=output_shape, bias_axes=bias_axes) - output_tensor = layer(input_tensor) - - self.assertAllEqual(expected_weight_shape, layer.kernel.shape.as_list()) - if expected_bias_shape is None: - self.assertIsNone(layer.bias) - else: - self.assertAllEqual(expected_bias_shape, layer.bias.shape.as_list()) - self.assertAllEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertAllEqual(expected_weight_shape, layer.kernel.shape.as_list()) + if expected_bias_shape is None: + self.assertIsNone(layer.bias) + else: + self.assertAllEqual(expected_bias_shape, layer.bias.shape.as_list()) + self.assertAllEqual( + expected_output_shape, output_tensor.shape.as_list() + ) @test_combinations.run_all_keras_modes class TestEinsumLayerAPI(test_combinations.TestCase): + def test_layer_api(self): + input_data = np.array([[1.0, 2.0], [3.0, 4.0]]) + kwargs = { + "equation": "...b,bc->...c", + "bias_axes": "c", + "output_shape": 4, + "bias_initializer": keras.initializers.constant(0.03), + "kernel_initializer": keras.initializers.constant(0.5), + "dtype": input_data.dtype, + } + expected_output = np.array( + [[1.53, 1.53, 1.53, 1.53], [3.53, 3.53, 3.53, 3.53]] + ) - def test_layer_api(self): - input_data = np.array([[1.0, 2.0], [3.0, 4.0]]) - kwargs = { - "equation": "...b,bc->...c", - "bias_axes": "c", - "output_shape": 4, - "bias_initializer": keras.initializers.constant(0.03), - "kernel_initializer": keras.initializers.constant(0.5), - "dtype": input_data.dtype - } - expected_output = np.array([[1.53, 1.53, 1.53, 1.53], - [3.53, 3.53, 3.53, 3.53]]) - - output_data = test_utils.layer_test( - einsum_dense.EinsumDense, - kwargs=kwargs, - input_shape=(None, 2), - input_data=input_data) + output_data = test_utils.layer_test( + einsum_dense.EinsumDense, + kwargs=kwargs, + input_shape=(None, 2), + input_data=input_data, + ) - self.assertAllClose(expected_output, output_data) + self.assertAllClose(expected_output, output_data) - def test_unspecified_bias_dim_fails(self): - input_tensor = keras.Input(shape=(32,)) - layer = einsum_dense.EinsumDense( - equation="ab,bc->ac", output_shape=64, bias_axes="y") - with self.assertRaisesRegex( - ValueError, ".*is not part of the output spec.*"): - _ = layer(input_tensor) + def test_unspecified_bias_dim_fails(self): + input_tensor = keras.Input(shape=(32,)) + layer = einsum_dense.EinsumDense( + equation="ab,bc->ac", output_shape=64, bias_axes="y" + ) + with self.assertRaisesRegex( + ValueError, ".*is not part of the output spec.*" + ): + _ = layer(input_tensor) - def test_incompatible_input_output_shape_fails(self): - input_tensor = keras.Input(shape=(32, 64)) - layer = einsum_dense.EinsumDense( - equation="abc,cd->abd", output_shape=(10, 96)) - with self.assertRaisesRegex( - ValueError, ".*Input shape and output shape do not match at shared " - "dimension 'b'.*"): - _ = layer(input_tensor) + def test_incompatible_input_output_shape_fails(self): + input_tensor = keras.Input(shape=(32, 64)) + layer = einsum_dense.EinsumDense( + equation="abc,cd->abd", output_shape=(10, 96) + ) + with self.assertRaisesRegex( + ValueError, + ".*Input shape and output shape do not match at shared " + "dimension 'b'.*", + 
): + _ = layer(input_tensor) - def test_unspecified_output_dim_fails(self): - input_tensor = keras.Input(shape=(32,)) - layer = einsum_dense.EinsumDense(equation="ab,bc->cd", output_shape=64) - with self.assertRaisesRegex( - ValueError, ".*Dimension 'd' was specified in the output 'cd' but has " - "no corresponding dim.*"): - _ = layer(input_tensor) + def test_unspecified_output_dim_fails(self): + input_tensor = keras.Input(shape=(32,)) + layer = einsum_dense.EinsumDense(equation="ab,bc->cd", output_shape=64) + with self.assertRaisesRegex( + ValueError, + ".*Dimension 'd' was specified in the output 'cd' but has " + "no corresponding dim.*", + ): + _ = layer(input_tensor) - def test_unspecified_weight_dim_fails(self): - input_tensor = keras.Input(shape=(32,)) - layer = einsum_dense.EinsumDense(equation="ab,zd->ad", output_shape=64) - with self.assertRaisesRegex(ValueError, - ".*Weight dimension 'z' did not have a match "): - _ = layer(input_tensor) + def test_unspecified_weight_dim_fails(self): + input_tensor = keras.Input(shape=(32,)) + layer = einsum_dense.EinsumDense(equation="ab,zd->ad", output_shape=64) + with self.assertRaisesRegex( + ValueError, ".*Weight dimension 'z' did not have a match " + ): + _ = layer(input_tensor) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/core/embedding.py b/keras/layers/core/embedding.py index 7af8bd18e002..cd75001b1247 100644 --- a/keras/layers/core/embedding.py +++ b/keras/layers/core/embedding.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Embedding layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras import constraints @@ -23,200 +25,282 @@ from keras.engine import base_layer_utils from keras.engine.base_layer import Layer from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Embedding') +@keras_export("keras.layers.Embedding") class Embedding(Layer): - """Turns positive integers (indexes) into dense vectors of fixed size. - - e.g. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]` - - This layer can only be used on positive integer inputs of a fixed range. The - `tf.keras.layers.TextVectorization`, `tf.keras.layers.StringLookup`, - and `tf.keras.layers.IntegerLookup` preprocessing layers can help prepare - inputs for an `Embedding` layer. - - This layer accepts `tf.Tensor` and `tf.RaggedTensor` inputs. It cannot be - called with `tf.SparseTensor` input. - - Example: - - >>> model = tf.keras.Sequential() - >>> model.add(tf.keras.layers.Embedding(1000, 64, input_length=10)) - >>> # The model will take as input an integer matrix of size (batch, - >>> # input_length), and the largest integer (i.e. word index) in the input - >>> # should be no larger than 999 (vocabulary size). - >>> # Now model.output_shape is (None, 10, 64), where `None` is the batch - >>> # dimension. - >>> input_array = np.random.randint(1000, size=(32, 10)) - >>> model.compile('rmsprop', 'mse') - >>> output_array = model.predict(input_array) - >>> print(output_array.shape) - (32, 10, 64) - - Args: - input_dim: Integer. Size of the vocabulary, - i.e. maximum integer index + 1. - output_dim: Integer. Dimension of the dense embedding. - embeddings_initializer: Initializer for the `embeddings` - matrix (see `keras.initializers`). 
- embeddings_regularizer: Regularizer function applied to - the `embeddings` matrix (see `keras.regularizers`). - embeddings_constraint: Constraint function applied to - the `embeddings` matrix (see `keras.constraints`). - mask_zero: Boolean, whether or not the input value 0 is a special "padding" - value that should be masked out. - This is useful when using recurrent layers - which may take variable length input. - If this is `True`, then all subsequent layers - in the model need to support masking or an exception will be raised. - If mask_zero is set to True, as a consequence, index 0 cannot be - used in the vocabulary (input_dim should equal size of - vocabulary + 1). - input_length: Length of input sequences, when it is constant. - This argument is required if you are going to connect - `Flatten` then `Dense` layers upstream - (without it, the shape of the dense outputs cannot be computed). - - Input shape: - 2D tensor with shape: `(batch_size, input_length)`. - - Output shape: - 3D tensor with shape: `(batch_size, input_length, output_dim)`. - - **Note on variable placement:** - By default, if a GPU is available, the embedding matrix will be placed on - the GPU. This achieves the best performance, but it might cause issues: - - - You may be using an optimizer that does not support sparse GPU kernels. - In this case you will see an error upon training your model. - - Your embedding matrix may be too large to fit on your GPU. In this case - you will see an Out Of Memory (OOM) error. - - In such cases, you should place the embedding matrix on the CPU memory. - You can do so with a device scope, as such: - - ```python - with tf.device('cpu:0'): - embedding_layer = Embedding(...) - embedding_layer.build() - ``` - - The pre-built `embedding_layer` instance can then be added to a `Sequential` - model (e.g. `model.add(embedding_layer)`), called in a Functional model - (e.g. `x = embedding_layer(x)`), or used in a subclassed model. - """ - - @utils.allow_initializer_layout - def __init__(self, - input_dim, - output_dim, - embeddings_initializer='uniform', - embeddings_regularizer=None, - activity_regularizer=None, - embeddings_constraint=None, - mask_zero=False, - input_length=None, - **kwargs): - if 'input_shape' not in kwargs: - if input_length: - kwargs['input_shape'] = (input_length,) - else: - kwargs['input_shape'] = (None,) - if input_dim <= 0 or output_dim <= 0: - raise ValueError( - 'Both `input_dim` and `output_dim` should be positive, ' - f'Received input_dim = {input_dim} and output_dim = {output_dim}') - if (not base_layer_utils.v2_dtype_behavior_enabled() and - 'dtype' not in kwargs): - # In TF1, the dtype defaults to the input dtype which is typically int32, - # so explicitly set it to floatx - kwargs['dtype'] = backend.floatx() - # We set autocast to False, as we do not want to cast floating-point inputs - # to self.dtype. In call(), we cast to int32, and casting to self.dtype - # before casting to int32 might cause the int32 values to be different due - # to a loss of precision.
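The precision hazard described in the comment above is easy to demonstrate in isolation. A minimal sketch, with 2049 chosen only because it exceeds the largest integer that float16 represents exactly:

```python
import tensorflow as tf

# Ids routed through a half-precision cast (as autocast would do under a
# mixed_float16 policy) can silently change value before the int32 cast.
ids = tf.constant([2049.0], dtype=tf.float32)
direct = tf.cast(ids, tf.int32)                        # -> [2049]
via_f16 = tf.cast(tf.cast(ids, tf.float16), tf.int32)  # -> [2048]
print(direct.numpy(), via_f16.numpy())
```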
- kwargs['autocast'] = False - super().__init__(**kwargs) - - self.input_dim = input_dim - self.output_dim = output_dim - self.embeddings_initializer = initializers.get(embeddings_initializer) - self.embeddings_regularizer = regularizers.get(embeddings_regularizer) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.embeddings_constraint = constraints.get(embeddings_constraint) - self.mask_zero = mask_zero - self.supports_masking = mask_zero - self.input_length = input_length - - @tf_utils.shape_type_conversion - def build(self, input_shape=None): - self.embeddings = self.add_weight( - shape=(self.input_dim, self.output_dim), - initializer=self.embeddings_initializer, - name='embeddings', - regularizer=self.embeddings_regularizer, - constraint=self.embeddings_constraint, - experimental_autocast=False) - self.built = True - - def compute_mask(self, inputs, mask=None): - if not self.mask_zero: - return None - return tf.not_equal(inputs, 0) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if self.input_length is None: - return input_shape + (self.output_dim,) - else: - # input_length can be tuple if input is 3D or higher - if isinstance(self.input_length, (list, tuple)): - in_lens = list(self.input_length) - else: - in_lens = [self.input_length] - if len(in_lens) != len(input_shape) - 1: - raise ValueError( - f'"input_length" is {self.input_length}, but received input has ' - f'shape {input_shape}') - else: - for i, (s1, s2) in enumerate(zip(in_lens, input_shape[1:])): - if s1 is not None and s2 is not None and s1 != s2: + """Turns positive integers (indexes) into dense vectors of fixed size. + + e.g. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]` + + This layer can only be used on positive integer inputs of a fixed range. The + `tf.keras.layers.TextVectorization`, `tf.keras.layers.StringLookup`, + and `tf.keras.layers.IntegerLookup` preprocessing layers can help prepare + inputs for an `Embedding` layer. + + This layer accepts `tf.Tensor`, `tf.RaggedTensor` and `tf.SparseTensor` + input. + + Example: + + >>> model = tf.keras.Sequential() + >>> model.add(tf.keras.layers.Embedding(1000, 64, input_length=10)) + >>> # The model will take as input an integer matrix of size (batch, + >>> # input_length), and the largest integer (i.e. word index) in the input + >>> # should be no larger than 999 (vocabulary size). + >>> # Now model.output_shape is (None, 10, 64), where `None` is the batch + >>> # dimension. + >>> input_array = np.random.randint(1000, size=(32, 10)) + >>> model.compile('rmsprop', 'mse') + >>> output_array = model.predict(input_array) + >>> print(output_array.shape) + (32, 10, 64) + + Args: + input_dim: Integer. Size of the vocabulary, + i.e. maximum integer index + 1. + output_dim: Integer. Dimension of the dense embedding. + embeddings_initializer: Initializer for the `embeddings` + matrix (see `keras.initializers`). + embeddings_regularizer: Regularizer function applied to + the `embeddings` matrix (see `keras.regularizers`). + embeddings_constraint: Constraint function applied to + the `embeddings` matrix (see `keras.constraints`). + mask_zero: Boolean, whether or not the input value 0 is a special + "padding" value that should be masked out. This is useful when using + recurrent layers which may take variable length input. If this is + `True`, then all subsequent layers in the model need to support masking + or an exception will be raised. 
If mask_zero is set to True, as a + consequence, index 0 cannot be used in the vocabulary (input_dim should + equal size of vocabulary + 1). + input_length: Length of input sequences, when it is constant. + This argument is required if you are going to connect + `Flatten` then `Dense` layers upstream + (without it, the shape of the dense outputs cannot be computed). + sparse: If True, calling this layer returns a `tf.SparseTensor`. If False, + the layer returns a dense `tf.Tensor`. For an entry with no features in + a sparse tensor (entry with value 0), the embedding vector of index 0 is + returned by default. + + Input shape: + 2D tensor with shape: `(batch_size, input_length)`. + + Output shape: + 3D tensor with shape: `(batch_size, input_length, output_dim)`. + + **Note on variable placement:** + By default, if a GPU is available, the embedding matrix will be placed on + the GPU. This achieves the best performance, but it might cause issues: + + - You may be using an optimizer that does not support sparse GPU kernels. + In this case you will see an error upon training your model. + - Your embedding matrix may be too large to fit on your GPU. In this case + you will see an Out Of Memory (OOM) error. + + In such cases, you should place the embedding matrix on the CPU memory. + You can do so with a device scope, as such: + + ```python + with tf.device('cpu:0'): + embedding_layer = Embedding(...) + embedding_layer.build() + ``` + + The pre-built `embedding_layer` instance can then be added to a `Sequential` + model (e.g. `model.add(embedding_layer)`), called in a Functional model + (e.g. `x = embedding_layer(x)`), or used in a subclassed model. + """ + + @utils.allow_initializer_layout + def __init__( + self, + input_dim, + output_dim, + embeddings_initializer="uniform", + embeddings_regularizer=None, + activity_regularizer=None, + embeddings_constraint=None, + mask_zero=False, + input_length=None, + sparse=False, + **kwargs, + ): + if "input_shape" not in kwargs: + if input_length: + kwargs["input_shape"] = (input_length,) + else: + kwargs["input_shape"] = (None,) + if input_dim <= 0 or output_dim <= 0: raise ValueError( - f'"input_length" is {self.input_length}, but received input ' - f'has shape {input_shape}') - elif s1 is None: - in_lens[i] = s2 - return (input_shape[0],) + tuple(in_lens) + (self.output_dim,) - - def call(self, inputs): - dtype = backend.dtype(inputs) - if dtype != 'int32' and dtype != 'int64': - inputs = tf.cast(inputs, 'int32') - out = tf.nn.embedding_lookup(self.embeddings, inputs) - if self._dtype_policy.compute_dtype != self._dtype_policy.variable_dtype: - # Instead of casting the variable as in most layers, cast the output, as - # this is mathematically equivalent but is faster. 
- out = tf.cast(out, self._dtype_policy.compute_dtype) - return out - - def get_config(self): - config = { - 'input_dim': self.input_dim, - 'output_dim': self.output_dim, - 'embeddings_initializer': - initializers.serialize(self.embeddings_initializer), - 'embeddings_regularizer': - regularizers.serialize(self.embeddings_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'embeddings_constraint': - constraints.serialize(self.embeddings_constraint), - 'mask_zero': self.mask_zero, - 'input_length': self.input_length - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + "Both `input_dim` and `output_dim` should be positive. " + f"Received input_dim = {input_dim} " + f"and output_dim = {output_dim}" + ) + if ( + not base_layer_utils.v2_dtype_behavior_enabled() + and "dtype" not in kwargs + ): + # In TF1, the dtype defaults to the input dtype which is typically + # int32, so explicitly set it to floatx + kwargs["dtype"] = backend.floatx() + # We set autocast to False, as we do not want to cast floating-point + # inputs to self.dtype. In call(), we cast to int32, and casting to + # self.dtype before casting to int32 might cause the int32 values to be + # different due to a loss of precision. + kwargs["autocast"] = False + use_one_hot_matmul = kwargs.pop("use_one_hot_matmul", False) + super().__init__(**kwargs) + + self.input_dim = input_dim + self.output_dim = output_dim + self.embeddings_initializer = initializers.get(embeddings_initializer) + self.embeddings_regularizer = regularizers.get(embeddings_regularizer) + self.activity_regularizer = regularizers.get(activity_regularizer) + self.embeddings_constraint = constraints.get(embeddings_constraint) + self.mask_zero = mask_zero + self.supports_masking = mask_zero + self.input_length = input_length + self.sparse = sparse + if self.sparse and self.mask_zero: + raise ValueError( + "`mask_zero` cannot be enabled when " + "`tf.keras.layers.Embedding` is used with `tf.SparseTensor` " + "input." + ) + # Make this flag private and do not serialize it for now. + # It will be part of the public API after further testing.
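The `use_one_hot_matmul` flag popped above selects an alternative lookup strategy implemented in `call()` below. The equivalence it relies on can be checked standalone; the shapes and ids here are arbitrary illustrative values:

```python
import tensorflow as tf

embeddings = tf.random.normal([10, 4])     # (input_dim, output_dim)
ids = tf.constant([[1, 3, 3], [0, 2, 9]])  # a batch of integer token ids

gathered = tf.nn.embedding_lookup(embeddings, ids)
# The one-hot tensor must share the weights' dtype for the matmul.
one_hot = tf.one_hot(ids, depth=10, dtype=embeddings.dtype)
via_matmul = tf.matmul(one_hot, embeddings)

tf.debugging.assert_near(gathered, via_matmul)  # same result either way
```

A matmul over a one-hot expansion costs more memory than a gather, but it is typically friendlier to accelerators that penalize gather ops, which is presumably the motivation for the private flag.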
+ self._use_one_hot_matmul = use_one_hot_matmul + + @tf_utils.shape_type_conversion + def build(self, input_shape=None): + self.embeddings = self.add_weight( + shape=(self.input_dim, self.output_dim), + initializer=self.embeddings_initializer, + name="embeddings", + regularizer=self.embeddings_regularizer, + constraint=self.embeddings_constraint, + experimental_autocast=False, + ) + self.built = True + + def compute_mask(self, inputs, mask=None): + if not self.mask_zero: + return None + return tf.not_equal(inputs, 0) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if self.input_length is None: + return input_shape + (self.output_dim,) + else: + # input_length can be tuple if input is 3D or higher + if isinstance(self.input_length, (list, tuple)): + in_lens = list(self.input_length) + else: + in_lens = [self.input_length] + if len(in_lens) != len(input_shape) - 1: + raise ValueError( + f'"input_length" is {self.input_length}, but received ' + f"input has shape {input_shape}" + ) + else: + for i, (s1, s2) in enumerate(zip(in_lens, input_shape[1:])): + if s1 is not None and s2 is not None and s1 != s2: + raise ValueError( + f'"input_length" is {self.input_length}, but ' + f"received input has shape {input_shape}" + ) + elif s1 is None: + in_lens[i] = s2 + return (input_shape[0],) + tuple(in_lens) + (self.output_dim,) + + def call(self, inputs): + dtype = backend.dtype(inputs) + if dtype != "int32" and dtype != "int64": + inputs = tf.cast(inputs, "int32") + if isinstance(inputs, tf.sparse.SparseTensor): + if self.sparse: + # get sparse embedding values + embedding_values = tf.nn.embedding_lookup( + params=self.embeddings, ids=inputs.values + ) + embedding_values = tf.reshape(embedding_values, [-1]) + # get sparse embedding indices + indices_values_embed_axis = tf.range(self.output_dim) + repeat_times = [inputs.indices.shape[0]] + indices_values_embed_axis = tf.expand_dims( + tf.tile(indices_values_embed_axis, repeat_times), -1 + ) + indices_values_embed_axis = tf.cast( + indices_values_embed_axis, dtype=tf.int64 + ) + current_indices = tf.repeat( + inputs.indices, [self.output_dim], axis=0 + ) + new_indices = tf.concat( + [current_indices, indices_values_embed_axis], 1 + ) + new_shape = tf.concat( + [tf.cast(inputs.shape, dtype=tf.int64), [self.output_dim]], + axis=-1, + ) + out = tf.SparseTensor( + indices=new_indices, + values=embedding_values, + dense_shape=new_shape, + ) + else: + sparse_inputs_expanded = tf.sparse.expand_dims(inputs, axis=-1) + out = tf.nn.safe_embedding_lookup_sparse( + embedding_weights=self.embeddings, + sparse_ids=sparse_inputs_expanded, + default_id=0, + ) + elif self._use_one_hot_matmul: + # Note that we change the dtype of the one_hot to be same as the + # weight tensor, since the input data are usually ints, and weights + # are floats. The nn.embedding_lookup support ids as ints, but + # the one_hot matmul need both inputs and weights to be same dtype. + one_hot_data = tf.one_hot( + inputs, depth=self.input_dim, dtype=self.dtype + ) + out = tf.matmul(one_hot_data, self.embeddings) + else: + out = tf.nn.embedding_lookup(self.embeddings, inputs) + + if self.sparse and not isinstance(out, tf.SparseTensor): + out = tf.sparse.from_dense(out) + + if ( + self._dtype_policy.compute_dtype + != self._dtype_policy.variable_dtype + ): + # Instead of casting the variable as in most layers, cast the + # output, as this is mathematically equivalent but is faster. 
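End to end, the sparse branch above gives the behavior sketched here, mirroring the new unit tests; the indices and values are arbitrary, and the `sparse` argument only exists once this patch is applied:

```python
import tensorflow as tf

layer = tf.keras.layers.Embedding(input_dim=3, output_dim=2, sparse=True)
ids = tf.SparseTensor(
    indices=[[0, 1], [1, 2]], values=[1, 2], dense_shape=[3, 3]
)
out = layer(ids)
# Every stored id fans out into `output_dim` sparse entries, so the
# result is a tf.SparseTensor with dense_shape [3, 3, 2].
print(out.dense_shape.numpy())  # [3 3 2]
```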
+ out = tf.cast(out, self._dtype_policy.compute_dtype) + return out + + def get_config(self): + config = { + "input_dim": self.input_dim, + "output_dim": self.output_dim, + "embeddings_initializer": initializers.serialize( + self.embeddings_initializer + ), + "embeddings_regularizer": regularizers.serialize( + self.embeddings_regularizer + ), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "embeddings_constraint": constraints.serialize( + self.embeddings_constraint + ), + "mask_zero": self.mask_zero, + "input_length": self.input_length, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/core/embedding_test.py b/keras/layers/core/embedding_test.py index 29c891d4157f..0994f208f87d 100644 --- a/keras/layers/core/embedding_test.py +++ b/keras/layers/core/embedding_test.py @@ -14,123 +14,245 @@ # ============================================================================== """Tests for embedding layer.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.mixed_precision import policy from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf class EmbeddingTest(test_combinations.TestCase): + @test_combinations.run_all_keras_modes + def test_embedding(self): + if tf.test.is_gpu_available(): + self.skipTest("Only test embedding on CPU.") + + test_utils.layer_test( + keras.layers.Embedding, + kwargs={"output_dim": 4, "input_dim": 10, "input_length": 2}, + input_shape=(3, 2), + input_dtype="int32", + expected_output_dtype="float32", + ) + + test_utils.layer_test( + keras.layers.Embedding, + kwargs={"output_dim": 4, "input_dim": 10, "mask_zero": True}, + input_shape=(3, 2), + input_dtype="int32", + expected_output_dtype="float32", + ) + + test_utils.layer_test( + keras.layers.Embedding, + kwargs={"output_dim": 4, "input_dim": 10, "mask_zero": True}, + input_shape=(3, 4, 2), + input_dtype="int32", + expected_output_dtype="float32", + ) + + test_utils.layer_test( + keras.layers.Embedding, + kwargs={ + "output_dim": 4, + "input_dim": 10, + "mask_zero": True, + "input_length": (None, 2), + }, + input_shape=(3, 4, 2), + input_dtype="int32", + expected_output_dtype="float32", + ) + + @test_combinations.run_all_keras_modes + def test_embedding_correctness(self): + layer = keras.layers.Embedding(output_dim=2, input_dim=2) + model = keras.models.Sequential([layer]) + + layer.set_weights([np.array([[1, 1], [2, 2]])]) + model.run_eagerly = test_utils.should_run_eagerly() + outputs = model.predict(np.array([[0, 1, 0]], dtype="int32")) + self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]]) + + def test_embedding_incorrect_dimension(self): + with self.assertRaises(ValueError): + keras.layers.Embedding(input_dim=0, output_dim=1) + + with self.assertRaises(ValueError): + keras.layers.Embedding(input_dim=1, output_dim=0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_eager_gpu_cpu(self): + l = keras.layers.Embedding(output_dim=2, input_dim=2) + l.build((None, 2)) + inputs = keras.backend.constant([[0, 1, 0]], dtype="int32") + with tf.GradientTape() as tape: + output = l(inputs) + gs = tape.gradient(output, l.weights) + opt = tf.compat.v1.train.AdagradOptimizer(0.1) + opt.apply_gradients(zip(gs, l.weights)) + self.assertAllEqual(len(gs), 1) + + @test_combinations.run_all_keras_modes + def 
test_embedding_with_ragged_input(self): + layer = keras.layers.Embedding( + input_dim=3, + output_dim=2, + weights=[np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])], + ) + inputs = keras.layers.Input( + shape=(None,), dtype=tf.float32, ragged=True + ) + + outputs = keras.layers.Lambda( + lambda args: keras.backend.identity(args) + )(inputs) + + outputs = layer(outputs) + + model = keras.Model(inputs, outputs) + model.run_eagerly = test_utils.should_run_eagerly() + outputs = model.predict( + tf.ragged.constant( + [[1.0, 2.0, 2.0], [0.0], [1.0, 2.0]], ragged_rank=1 + ) + ) + self.assertAllClose( + outputs, + tf.ragged.constant( + [ + [[1.0, 1.0], [2.0, 2.0], [2.0, 2.0]], + [[0.0, 0.0]], + [[1.0, 1.0], [2.0, 2.0]], + ], + ragged_rank=1, + ), + ) + + @test_utils.enable_v2_dtype_behavior + def test_mixed_precision_embedding(self): + try: + policy.set_global_policy("mixed_float16") + layer = keras.layers.Embedding(input_dim=5, output_dim=2) + self.assertEqual(layer._dtype_policy.name, "mixed_float16") + outputs = layer(np.array([0, 1, 2])) + self.assertEqual(outputs.dtype, "float16") + finally: + policy.set_global_policy("float32") + + @test_combinations.run_all_keras_modes + def test_embedding_with_sparse_input_sparse_output(self): + layer = keras.layers.Embedding( + input_dim=3, + output_dim=2, + weights=[np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])], + sparse=True, + ) + input = tf.SparseTensor( + indices=[[0, 1], [1, 2]], values=[1, 2], dense_shape=[3, 3] + ) + output = layer(input) + expected_output = tf.SparseTensor( + indices=[[0, 1, 0], [0, 1, 1], [1, 2, 0], [1, 2, 1]], + values=[1.0, 1.0, 2.0, 2.0], + dense_shape=[3, 3, 2], + ) + self.assertAllClose(output.indices, expected_output.indices) + self.assertAllClose(output.values, expected_output.values) + self.assertAllClose(output.dense_shape, expected_output.dense_shape) + + @test_combinations.run_all_keras_modes + def test_embedding_with_sparse_input_dense_output(self): + layer = keras.layers.Embedding( + input_dim=3, + output_dim=2, + weights=[np.array([[0.1, 0.1], [1.0, 1.0], [2.0, 2.0]])], + sparse=False, + ) + input = tf.SparseTensor( + indices=[[0, 1], [1, 2]], values=[1, 2], dense_shape=[3, 3] + ) + output = layer(input) + expected_output = tf.constant( + [ + [[0.1, 0.1], [1.0, 1.0], [0.1, 0.1]], + [[0.1, 0.1], [0.1, 0.1], [2.0, 2.0]], + [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1]], + ] + ) + self.assertAllClose(output, expected_output) + + @test_combinations.run_all_keras_modes + def test_error_message_for_mask_zero_enabled_with_sparse_tensor(self): + with self.assertRaisesRegex( + ValueError, + "`mask_zero` cannot be enabled when " + "`tf.keras.layers.Embedding` is used with `tf.SparseTensor` " + "input.", + ): + layer = keras.layers.Embedding( + input_dim=3, + output_dim=2, + weights=[np.array([[0.1, 0.1], [1.0, 1.0], [2.0, 2.0]])], + sparse=True, + mask_zero=True, + ) + inputs = tf.SparseTensor( + indices=[[0, 1], [1, 2]], values=[1, 2], dense_shape=[3, 3] + ) + layer(inputs) + + @test_combinations.run_all_keras_modes + def test_embedding_with_dense_input_sprase_output(self): + layer = keras.layers.Embedding( + input_dim=3, + output_dim=2, + weights=[np.array([[0, 0], [1.0, 1.0], [2.0, 2.0]])], + sparse=True, + mask_zero=False, + ) + inputs = tf.constant([0, 0, 0, 2, 1]) + output = layer(inputs) + expected_output = tf.SparseTensor( + indices=[[3, 0], [3, 1], [4, 0], [4, 1]], + values=[2.0, 2.0, 1.0, 1.0], + dense_shape=[5, 2], + ) + self.assertAllClose(output.indices, expected_output.indices) + self.assertAllClose(output.values, 
expected_output.values) + self.assertAllClose(output.dense_shape, expected_output.dense_shape) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_use_one_hot(self): + batch = 8 + input_length = 10 + layer = keras.layers.Embedding(input_dim=100, output_dim=16) + self.assertFalse(layer._use_one_hot_matmul) + + inputs = tf.random.uniform( + shape=[batch, input_length], minval=0, maxval=9, dtype=tf.int64 + ) + output_1 = layer(inputs) + + layer._use_one_hot_matmul = True + output_2 = layer(inputs) + + self.assertAllClose(output_1, output_2) + self.assertEqual(output_1.dtype, output_2.dtype) + + # Make sure the layer can be created with hidden kwargs, and not + # serialize it into config (for now). + layer = keras.layers.Embedding( + input_dim=100, output_dim=16, use_one_hot_matmul=True + ) + self.assertTrue(layer._use_one_hot_matmul) + + self.assertNotIn("use_one_hot_matmul", layer.get_config()) + - @test_combinations.run_all_keras_modes - def test_embedding(self): - if tf.test.is_gpu_available(): - self.skipTest('Only test embedding on CPU.') - - test_utils.layer_test( - keras.layers.Embedding, - kwargs={'output_dim': 4, - 'input_dim': 10, - 'input_length': 2}, - input_shape=(3, 2), - input_dtype='int32', - expected_output_dtype='float32') - - test_utils.layer_test( - keras.layers.Embedding, - kwargs={'output_dim': 4, - 'input_dim': 10, - 'mask_zero': True}, - input_shape=(3, 2), - input_dtype='int32', - expected_output_dtype='float32') - - test_utils.layer_test( - keras.layers.Embedding, - kwargs={'output_dim': 4, - 'input_dim': 10, - 'mask_zero': True}, - input_shape=(3, 4, 2), - input_dtype='int32', - expected_output_dtype='float32') - - test_utils.layer_test( - keras.layers.Embedding, - kwargs={'output_dim': 4, - 'input_dim': 10, - 'mask_zero': True, - 'input_length': (None, 2)}, - input_shape=(3, 4, 2), - input_dtype='int32', - expected_output_dtype='float32') - - @test_combinations.run_all_keras_modes - def test_embedding_correctness(self): - layer = keras.layers.Embedding(output_dim=2, input_dim=2) - model = keras.models.Sequential([layer]) - - layer.set_weights([np.array([[1, 1], [2, 2]])]) - model.run_eagerly = test_utils.should_run_eagerly() - outputs = model.predict(np.array([[0, 1, 0]], dtype='int32')) - self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]]) - - def test_embedding_incorrect_dimension(self): - with self.assertRaises(ValueError): - keras.layers.Embedding(input_dim=0, output_dim=1) - - with self.assertRaises(ValueError): - keras.layers.Embedding(input_dim=1, output_dim=0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_eager_gpu_cpu(self): - l = keras.layers.Embedding(output_dim=2, input_dim=2) - l.build((None, 2)) - inputs = keras.backend.constant([[0, 1, 0]], dtype='int32') - with tf.GradientTape() as tape: - output = l(inputs) - gs = tape.gradient(output, l.weights) - opt = tf.compat.v1.train.AdagradOptimizer(0.1) - opt.apply_gradients(zip(gs, l.weights)) - self.assertAllEqual(len(gs), 1) - - @test_combinations.run_all_keras_modes - def test_embedding_with_ragged_input(self): - layer = keras.layers.Embedding( - input_dim=3, - output_dim=2, - weights=[np.array([[0., 0.], [1., 1.], [2., 2.]])]) - inputs = keras.layers.Input( - shape=(None,), dtype=tf.float32, ragged=True) - # pylint: disable=unnecessary-lambda - outputs = keras.layers.Lambda(lambda args: keras.backend.identity(args))( - inputs) - # pylint: enable=unnecessary-lambda - outputs = layer(outputs) - - model = keras.Model(inputs, 
outputs) - model.run_eagerly = test_utils.should_run_eagerly() - outputs = model.predict( - tf.ragged.constant([[1., 2., 2.], [0.], [1., 2.]], ragged_rank=1)) - self.assertAllClose( - outputs, - tf.ragged.constant( - [[[1., 1.], [2., 2.], [2., 2.]], [[0., 0.]], [[1., 1.], [2., 2.]]], - ragged_rank=1)) - - @test_utils.enable_v2_dtype_behavior - def test_mixed_precision_embedding(self): - try: - policy.set_global_policy('mixed_float16') - layer = keras.layers.Embedding(input_dim=5, output_dim=2) - self.assertEqual(layer._dtype_policy.name, 'mixed_float16') - outputs = layer(np.array([0, 1, 2])) - self.assertEqual(outputs.dtype, 'float16') - finally: - policy.set_global_policy('float32') - - -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/optimizer.py b/keras/layers/core/identity.py similarity index 58% rename from keras/optimizers/legacy/optimizer.py rename to keras/layers/core/identity.py index 925a97024508..2b5c0cff76ee 100644 --- a/keras/optimizers/legacy/optimizer.py +++ b/keras/layers/core/identity.py @@ -12,13 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Legacy Adam optimizer implementation.""" +"""Contains the Identity layer.""" -from keras.optimizers.optimizer_v2 import optimizer_v2 +import tensorflow.compat.v2 as tf +from keras.engine.base_layer import Layer + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.Optimizer') -class Optimizer(optimizer_v2.OptimizerV2): - pass +@keras_export("keras.layers.Identity") +class Identity(Layer): + """Identity layer. + + This layer should be used as a placeholder when no operation is to be + performed. The layer is argument insensitive, and returns its `inputs` + argument as output. + + Args: + name: Optional name for the layer instance. + """ + + def call(self, inputs): + return tf.nest.map_structure(tf.identity, inputs) diff --git a/keras/layers/core/lambda_layer.py b/keras/layers/core/lambda_layer.py index 3be1ba108017..1a8c2142d343 100644 --- a/keras/layers/core/lambda_layer.py +++ b/keras/layers/core/lambda_layer.py @@ -13,210 +13,228 @@ # limitations under the License. # ============================================================================== """Contains the Lambda layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import sys import textwrap import types as python_types import warnings + +import numpy as np +import tensorflow.compat.v2 as tf + from keras.engine.base_layer import Layer +from keras.saving import serialization_lib from keras.utils import generic_utils from keras.utils import tf_inspect from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.platform import tf_logging from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Lambda') +@keras_export("keras.layers.Lambda") class Lambda(Layer): - """Wraps arbitrary expressions as a `Layer` object. - - The `Lambda` layer exists so that arbitrary expressions can be used - as a `Layer` when constructing `Sequential` - and Functional API models. `Lambda` layers are best suited for simple - operations or quick experimentation. 
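As a quick sketch of the `Identity` layer added above (available once this patch lands; the model shape here is arbitrary): it simply passes its input through, which is handy wherever an API expects a layer object but no transformation is wanted:

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(4,))
# A no-op stand-in, e.g. for an optional block that is disabled
# in a given configuration.
outputs = tf.keras.layers.Identity()(inputs)
model = tf.keras.Model(inputs, outputs)

x = tf.ones((2, 4))
tf.debugging.assert_equal(model(x), x)  # returned unchanged
```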
For more advanced use cases, follow - [this guide](https://www.tensorflow.org/guide/keras/custom_layers_and_models) - for subclassing `tf.keras.layers.Layer`. - - WARNING: `tf.keras.layers.Lambda` layers have (de)serialization limitations! - - The main reason to subclass `tf.keras.layers.Layer` instead of using a - `Lambda` layer is saving and inspecting a Model. `Lambda` layers - are saved by serializing the Python bytecode, which is fundamentally - non-portable. They should only be loaded in the same environment where - they were saved. Subclassed layers can be saved in a more portable way - by overriding their `get_config` method. Models that rely on - subclassed Layers are also often easier to visualize and reason about. - - Examples: - - ```python - # add a x -> x^2 layer - model.add(Lambda(lambda x: x ** 2)) - ``` - ```python - # add a layer that returns the concatenation - # of the positive part of the input and - # the opposite of the negative part - - def antirectifier(x): - x -= K.mean(x, axis=1, keepdims=True) - x = K.l2_normalize(x, axis=1) - pos = K.relu(x) - neg = K.relu(-x) - return K.concatenate([pos, neg], axis=1) - - model.add(Lambda(antirectifier)) - ``` - - Variables: - While it is possible to use Variables with Lambda layers, this practice is - discouraged as it can easily lead to bugs. For instance, consider the - following layer: - - ```python + """Wraps arbitrary expressions as a `Layer` object. + + The `Lambda` layer exists so that arbitrary expressions can be used + as a `Layer` when constructing Sequential + and Functional API models. `Lambda` layers are best suited for simple + operations or quick experimentation. For more advanced use cases, follow + [this guide]( + https://www.tensorflow.org/guide/keras/custom_layers_and_models) + for subclassing `tf.keras.layers.Layer`. + + WARNING: `tf.keras.layers.Lambda` layers have (de)serialization limitations! + + The main reason to subclass `tf.keras.layers.Layer` instead of using a + `Lambda` layer is saving and inspecting a Model. `Lambda` layers + are saved by serializing the Python bytecode, which is fundamentally + non-portable. They should only be loaded in the same environment where + they were saved. Subclassed layers can be saved in a more portable way + by overriding their `get_config()` method. Models that rely on + subclassed Layers are also often easier to visualize and reason about. + + Examples: + + ```python + # add a x -> x^2 layer + model.add(Lambda(lambda x: x ** 2)) + ``` + + ```python + # add a layer that returns the concatenation + # of the positive part of the input and + # the opposite of the negative part + + def antirectifier(x): + x -= K.mean(x, axis=1, keepdims=True) + x = K.l2_normalize(x, axis=1) + pos = K.relu(x) + neg = K.relu(-x) + return K.concatenate([pos, neg], axis=1) + + model.add(Lambda(antirectifier)) + ``` + + **Note on Variables:** + + While it is possible to use Variables with Lambda layers, + this practice is discouraged as it can easily lead to bugs. + For instance, consider the following layer: + + ```python scale = tf.Variable(1.) scale_layer = tf.keras.layers.Lambda(lambda x: x * scale) - ``` + ``` - Because scale_layer does not directly track the `scale` variable, it will + Because `scale_layer` does not directly track the `scale` variable, it will not appear in `scale_layer.trainable_weights` and will therefore not be trained if `scale_layer` is used in a Model. 
A better pattern is to write a subclassed Layer: - ```python + ```python class ScaleLayer(tf.keras.layers.Layer): - def __init__(self): - super(ScaleLayer, self).__init__() - self.scale = tf.Variable(1.) + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.scale = tf.Variable(1.) - def call(self, inputs): - return inputs * self.scale - ``` + def call(self, inputs): + return inputs * self.scale + ``` - In general, Lambda layers can be convenient for simple stateless + In general, `Lambda` layers can be convenient for simple stateless computation, but anything more complex should use a subclass Layer instead. - Args: - function: The function to be evaluated. Takes input tensor as first - argument. - output_shape: Expected output shape from function. This argument can be - inferred if not explicitly provided. Can be a tuple or function. If a - tuple, it only specifies the first dimension onward; - sample dimension is assumed either the same as the input: `output_shape = - (input_shape[0], ) + output_shape` or, the input is `None` and - the sample dimension is also `None`: `output_shape = (None, ) + - output_shape` If a function, it specifies the entire shape as a function - of the - input shape: `output_shape = f(input_shape)` - mask: Either None (indicating no masking) or a callable with the same - signature as the `compute_mask` layer method, or a tensor that will be - returned as output mask regardless of what the input is. - arguments: Optional dictionary of keyword arguments to be passed to the - function. - Input shape: Arbitrary. Use the keyword argument input_shape (tuple of - integers, does not include the samples axis) when using this layer as the - first layer in a model. - Output shape: Specified by `output_shape` argument - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, - function, - output_shape=None, - mask=None, - arguments=None, - **kwargs): - super().__init__(**kwargs) - - self.arguments = arguments or {} - self.function = function - - if mask is not None: - self.supports_masking = True - self.mask = mask - self._output_shape = output_shape - - # Warning on every invocation will be quite irksome in Eager mode. - self._already_warned = False - - function_args = tf_inspect.getfullargspec(function).args - self._fn_expects_training_arg = 'training' in function_args - self._fn_expects_mask_arg = 'mask' in function_args - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if self._output_shape is None: - # Make use of existing autocomputation but provide Lambda-specific - # error message. This is always safe to run even when the outer context - # is Graph mode because Lambda layers don't have side effects such as - # `add_loss`. - with tf.__internal__.eager_context.eager_mode(): - try: - return super().compute_output_shape(input_shape) - except NotImplementedError: - raise NotImplementedError( - 'We could not automatically infer the shape of the Lambda\'s ' - 'output. Please specify `output_shape` for this Lambda.') - - if callable(self._output_shape): - output_shapes = self._output_shape(input_shape) - return tf_utils.convert_shapes(output_shapes, to_tuples=False) - - # Output shapes are passed directly and don't include batch dimension. 
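The two `output_shape` flavors that this comment and the rewritten docstring below describe can be summarized in a short sketch; the shapes and functions are arbitrary examples:

```python
import tensorflow as tf

# Tuple form: the batch dimension is omitted and prepended automatically.
square = tf.keras.layers.Lambda(lambda x: x ** 2, output_shape=(8,))

# Callable form: receives and returns the full shape, batch included.
halve = tf.keras.layers.Lambda(
    lambda x: x[:, ::2], output_shape=lambda s: (s[0], s[1] // 2)
)
# In eager terms: halve(tf.ones((2, 8))).shape == (2, 4)
```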
- input_tensor_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) - batch_size = tf.nest.flatten( - input_tensor_shape)[0][0] if input_shape else None - - def _add_batch(shape): - return tf.TensorShape([batch_size] + shape.as_list()) - - output_shapes = tf_utils.convert_shapes(self._output_shape, to_tuples=False) - return tf.nest.map_structure(_add_batch, output_shapes) - - def call(self, inputs, mask=None, training=None): - # We must copy for thread safety, but it only needs to be a shallow copy. - kwargs = {k: v for k, v in self.arguments.items()} - if self._fn_expects_mask_arg: - kwargs['mask'] = mask - if self._fn_expects_training_arg: - kwargs['training'] = training - - created_variables = [] - - def _variable_creator(next_creator, **kwargs): - var = next_creator(**kwargs) - created_variables.append(var) - return var - - with tf.GradientTape(watch_accessed_variables=True) as tape,\ - tf.variable_creator_scope(_variable_creator): - result = self.function(inputs, **kwargs) - self._check_variables(created_variables, tape.watched_variables()) - return result - - def _check_variables(self, created_variables, accessed_variables): - if not created_variables and not accessed_variables: - # In the common case that a Lambda layer does not touch a Variable, we - # don't want to incur the runtime cost of assembling any state used for - # checking only to immediately discard it. - return - - # Filter out the state variable in the tf.random.Generator, which is - # commonly used for initializer or droput. The variable is intentionally - # not tracked and it is not a trainable variable. - created_variables = [v for v in created_variables - if 'StateVar' not in v.name] - - tracked_weights = set(v.ref() for v in self.weights) - untracked_new_vars = [ - v for v in created_variables if v.ref() not in tracked_weights - ] - if untracked_new_vars: - variable_str = '\n'.join(' {}'.format(i) for i in untracked_new_vars) - error_str = textwrap.dedent(""" + Args: + function: The function to be evaluated. Takes input tensor as first + argument. + output_shape: Expected output shape from function. This argument can be + inferred if not explicitly provided. Can be a tuple or function. If a + tuple, it only specifies the first dimension onward; + sample dimension is assumed either the same as the input: + `output_shape = (input_shape[0], ) + output_shape` or, the input is + `None` and the sample dimension is also `None`: + `output_shape = (None, ) + output_shape` If a function, it specifies the + entire shape as a function of the input shape: + `output_shape = f(input_shape)` + mask: Either None (indicating no masking) or a callable with the same + signature as the `compute_mask` layer method, or a tensor that will be + returned as output mask regardless of what the input is. + arguments: Optional dictionary of keyword arguments to be passed to the + function. + + Input shape: Arbitrary. Use the keyword argument input_shape (tuple of + integers, does not include the samples axis) when using this layer as the + first layer in a model. 
+ + Output shape: Specified by `output_shape` argument + """ + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__( + self, function, output_shape=None, mask=None, arguments=None, **kwargs + ): + super().__init__(**kwargs) + + self.arguments = arguments or {} + self.function = function + + if mask is not None: + self.supports_masking = True + self.mask = mask + self._output_shape = output_shape + + # Warning on every invocation will be quite irksome in Eager mode. + self._already_warned = False + + function_args = tf_inspect.getfullargspec(function).args + self._fn_expects_training_arg = "training" in function_args + self._fn_expects_mask_arg = "mask" in function_args + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if self._output_shape is None: + # Make use of existing autocomputation but provide Lambda-specific + # error message. This is always safe to run even when the outer + # context is Graph mode because Lambda layers don't have side + # effects such as `add_loss`. + with tf.__internal__.eager_context.eager_mode(): + try: + return super().compute_output_shape(input_shape) + except NotImplementedError: + raise NotImplementedError( + "We could not automatically infer the shape of " + "the Lambda's output. Please specify `output_shape` " + "for this Lambda." + ) + + if callable(self._output_shape): + output_shapes = self._output_shape(input_shape) + return tf_utils.convert_shapes(output_shapes, to_tuples=False) + + # Output shapes are passed directly and don't include batch dimension. + input_tensor_shape = tf_utils.convert_shapes( + input_shape, to_tuples=False + ) + batch_size = ( + tf.nest.flatten(input_tensor_shape)[0][0] if input_shape else None + ) + + def _add_batch(shape): + return tf.TensorShape([batch_size] + shape.as_list()) + + output_shapes = tf_utils.convert_shapes( + self._output_shape, to_tuples=False + ) + return tf.nest.map_structure(_add_batch, output_shapes) + + def call(self, inputs, mask=None, training=None): + # We must copy for thread safety, but it only needs to be a shallow + # copy. + kwargs = {k: v for k, v in self.arguments.items()} + if self._fn_expects_mask_arg: + kwargs["mask"] = mask + if self._fn_expects_training_arg: + kwargs["training"] = training + + created_variables = [] + + def _variable_creator(next_creator, **kwargs): + var = next_creator(**kwargs) + created_variables.append(var) + return var + + with tf.GradientTape( + watch_accessed_variables=True + ) as tape, tf.variable_creator_scope(_variable_creator): + result = self.function(inputs, **kwargs) + self._check_variables(created_variables, tape.watched_variables()) + return result + + def _check_variables(self, created_variables, accessed_variables): + if not created_variables and not accessed_variables: + # In the common case that a Lambda layer does not touch a Variable, + # we don't want to incur the runtime cost of assembling any state + # used for checking only to immediately discard it. + return + + # Filter out the state variable in the tf.random.Generator, which is + # commonly used for initializer or dropout. The variable is intentionally + # not tracked and it is not a trainable variable.
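Before the checks below, it may help to see the failure mode they guard against: a function that creates a fresh variable on every call trips the untracked-variable error assembled in this method. A sketch, assuming eager execution:

```python
import tensorflow as tf

# The lambda creates a brand-new tf.Variable each call; the layer
# cannot track it, so _check_variables raises a ValueError.
layer = tf.keras.layers.Lambda(lambda x: x * tf.Variable(2.0))
try:
    layer(tf.ones((1, 3)))
except ValueError as err:
    print("rejected untracked variable:", type(err).__name__)
```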
+ created_variables = [ + v for v in created_variables if "StateVar" not in v.name + ] + + tracked_weights = set(v.ref() for v in self.weights) + untracked_new_vars = [ + v for v in created_variables if v.ref() not in tracked_weights + ] + if untracked_new_vars: + variable_str = "\n".join(f" {i}" for i in untracked_new_vars) + error_str = textwrap.dedent( + """ The following Variables were created within a Lambda layer ({name}) but are not tracked by said layer: {variable_str} @@ -224,143 +242,175 @@ def _check_variables(self, created_variables, accessed_variables): calls, and consequently this behavior is disallowed for safety. Lambda layers are not well suited to stateful computation; instead, writing a subclassed Layer is the recommended way to define layers with - Variables.""").format( - name=self.name, variable_str=variable_str) - raise ValueError(error_str) - - untracked_used_vars = [ - v for v in accessed_variables if v.ref() not in tracked_weights - ] - if untracked_used_vars and not self._already_warned: - variable_str = '\n'.join(' {}'.format(i) for i in untracked_used_vars) - self._warn( - textwrap.dedent(""" + Variables.""" + ).format(name=self.name, variable_str=variable_str) + raise ValueError(error_str) + + untracked_used_vars = [ + v for v in accessed_variables if v.ref() not in tracked_weights + ] + if untracked_used_vars and not self._already_warned: + variable_str = "\n".join(f" {i}" for i in untracked_used_vars) + self._warn( + textwrap.dedent( + """ The following Variables were used in a Lambda layer's call ({name}), but are not present in its tracked objects: {variable_str} It is possible that this is intended behavior, but it is more likely an omission. This is a strong indication that this layer should be - formulated as a subclassed Layer rather than a Lambda layer.""") - .format(name=self.name, variable_str=variable_str)) - self._already_warned = True - - def _warn(self, msg): - # This method will be overridden in a unit test to raise an error, because - # self.assertWarns is not universally implemented.
- return tf_logging.warning(msg) - - def compute_mask(self, inputs, mask=None): - if callable(self.mask): - return self.mask(inputs, mask) - return self.mask - - def get_config(self): - function_config = self._serialize_function_to_config(self.function) - output_shape_config = self._serialize_function_to_config( - self._output_shape, allow_raw=True) - config = { - 'function': function_config[0], - 'function_type': function_config[1], - 'module': function_config[2], - 'output_shape': output_shape_config[0], - 'output_shape_type': output_shape_config[1], - 'output_shape_module': output_shape_config[2], - } - if self.mask is not None: - mask_config = self._serialize_function_to_config(self.mask) - config.update({ - 'mask': mask_config[0], - 'mask_type': mask_config[1], - 'mask_module': mask_config[2] - }) - config['arguments'] = self.arguments - - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def _serialize_function_to_config(self, inputs, allow_raw=False): - if isinstance(inputs, python_types.LambdaType): - output = generic_utils.func_dump(inputs) - output_type = 'lambda' - module = inputs.__module__ - elif callable(inputs): - output = inputs.__name__ - output_type = 'function' - module = inputs.__module__ - elif allow_raw: - output = inputs - output_type = 'raw' - module = None - else: - raise ValueError('Invalid input for serialization, type: %s ' % - type(inputs)) - - return output, output_type, module - - @classmethod - def from_config(cls, config, custom_objects=None): - config = config.copy() - function = cls._parse_function_from_config(config, custom_objects, - 'function', 'module', - 'function_type') - - output_shape = cls._parse_function_from_config(config, custom_objects, - 'output_shape', - 'output_shape_module', - 'output_shape_type') - if 'mask' in config: - mask = cls._parse_function_from_config(config, custom_objects, 'mask', - 'mask_module', 'mask_type') - else: - mask = None - - config['function'] = function - config['output_shape'] = output_shape - config['mask'] = mask - - # If arguments were numpy array, they have been saved as - # list. We need to recover the ndarray - if 'arguments' in config: - for key in config['arguments']: - if isinstance(config['arguments'][key], dict): - arg_dict = config['arguments'][key] - if 'type' in arg_dict and arg_dict['type'] == 'ndarray': - # Overwrite the argument with its numpy translation - config['arguments'][key] = np.array(arg_dict['value']) - - return cls(**config) - - @classmethod - def _parse_function_from_config(cls, config, custom_objects, func_attr_name, - module_attr_name, func_type_attr_name): - globs = globals().copy() - module = config.pop(module_attr_name, None) - if module in sys.modules: - globs.update(sys.modules[module].__dict__) - elif module is not None: - # Note: we don't know the name of the function if it's a lambda. - warnings.warn( - '{} is not loaded, but a Lambda layer uses it. 
' - 'It may cause errors.'.format(module), - UserWarning, - stacklevel=2) - if custom_objects: - globs.update(custom_objects) - function_type = config.pop(func_type_attr_name) - if function_type == 'function': - # Simple lookup in custom objects - function = generic_utils.deserialize_keras_object( - config[func_attr_name], - custom_objects=custom_objects, - printable_module_name='function in Lambda layer') - elif function_type == 'lambda': - # Unsafe deserialization from bytecode - function = generic_utils.func_load(config[func_attr_name], globs=globs) - elif function_type == 'raw': - function = config[func_attr_name] - else: - supported_types = ['function', 'lambda', 'raw'] - raise TypeError( - f'Unsupported value for `function_type` argument. Received: ' - f'function_type={function_type}. Expected one of {supported_types}') - return function + formulated as a subclassed Layer rather than a Lambda layer.""" + ).format(name=self.name, variable_str=variable_str) + ) + self._already_warned = True + + def _warn(self, msg): + # This method will be overridden in a unit test to raise an error, + # because self.assertWarns is not universally implemented. + return tf_logging.warning(msg) + + def compute_mask(self, inputs, mask=None): + if callable(self.mask): + return self.mask(inputs, mask) + return self.mask + + def get_config(self): + function_config = self._serialize_function_to_config(self.function) + output_shape_config = self._serialize_function_to_config( + self._output_shape, allow_raw=True + ) + config = { + "function": function_config[0], + "function_type": function_config[1], + "module": function_config[2], + "output_shape": output_shape_config[0], + "output_shape_type": output_shape_config[1], + "output_shape_module": output_shape_config[2], + } + if self.mask is not None: + mask_config = self._serialize_function_to_config(self.mask) + config.update( + { + "mask": mask_config[0], + "mask_type": mask_config[1], + "mask_module": mask_config[2], + } + ) + config["arguments"] = self.arguments + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _serialize_function_to_config(self, inputs, allow_raw=False): + if isinstance(inputs, python_types.LambdaType): + output = generic_utils.func_dump(inputs) + output_type = "lambda" + module = inputs.__module__ + elif callable(inputs): + output = inputs.__name__ + output_type = "function" + module = inputs.__module__ + elif allow_raw: + output = inputs + output_type = "raw" + module = None + else: + raise ValueError( + f"Invalid input for serialization, type: {type(inputs)} " + ) + + return output, output_type, module + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() + function = cls._parse_function_from_config( + config, custom_objects, "function", "module", "function_type" + ) + + output_shape = cls._parse_function_from_config( + config, + custom_objects, + "output_shape", + "output_shape_module", + "output_shape_type", + ) + if "mask" in config: + mask = cls._parse_function_from_config( + config, custom_objects, "mask", "mask_module", "mask_type" + ) + else: + mask = None + + config["function"] = function + config["output_shape"] = output_shape + config["mask"] = mask + + # If arguments were numpy array, they have been saved as + # list. 
We need to recover the ndarray + if "arguments" in config: + for key in config["arguments"]: + if isinstance(config["arguments"][key], dict): + arg_dict = config["arguments"][key] + if "type" in arg_dict and arg_dict["type"] == "ndarray": + # Overwrite the argument with its numpy translation + config["arguments"][key] = np.array(arg_dict["value"]) + + return cls(**config) + + @classmethod + def _parse_function_from_config( + cls, + config, + custom_objects, + func_attr_name, + module_attr_name, + func_type_attr_name, + ): + globs = globals().copy() + module = config.pop(module_attr_name, None) + if module in sys.modules: + globs.update(sys.modules[module].__dict__) + elif module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn( + "{} is not loaded, but a Lambda layer uses it. " + "It may cause errors.".format(module), + UserWarning, + stacklevel=2, + ) + if custom_objects: + globs.update(custom_objects) + function_type = config.pop(func_type_attr_name) + if function_type == "function": + # Simple lookup in custom objects + function = serialization_lib.deserialize_keras_object( + config[func_attr_name], + custom_objects=custom_objects, + printable_module_name="function in Lambda layer", + ) + elif function_type == "lambda": + if serialization_lib.in_safe_mode(): + raise ValueError( + "Requested the deserialization of a Lambda layer with a " + "Python `lambda` inside it. " + "This carries a potential risk of arbitrary code execution " + "and thus it is disallowed by default. If you trust the " + "source of the saved model, you can pass `safe_mode=False` " + "to the loading function in order to allow " + "Lambda layer loading." + ) + # /!\ Unsafe deserialization from bytecode! Danger! /!\ + function = generic_utils.func_load( + config[func_attr_name], globs=globs + ) + elif function_type == "raw": + function = config[func_attr_name] + else: + supported_types = ["function", "lambda", "raw"] + raise TypeError( + "Unsupported value for `function_type` argument. Received: " + f"function_type={function_type}. " + f"Expected one of {supported_types}" + ) + return function diff --git a/keras/layers/core/masking.py b/keras/layers/core/masking.py index 2faf2d022222..c710bf34731a 100644 --- a/keras/layers/core/masking.py +++ b/keras/layers/core/masking.py @@ -13,75 +13,79 @@ # limitations under the License. # ============================================================================== """Contains the Masking layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import -from keras.engine.base_layer import Layer + import tensorflow.compat.v2 as tf + +from keras.engine.base_layer import Layer + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Masking') +@keras_export("keras.layers.Masking") class Masking(Layer): - """Masks a sequence by using a mask value to skip timesteps. - - For each timestep in the input tensor (dimension #1 in the tensor), - if all values in the input tensor at that timestep - are equal to `mask_value`, then the timestep will be masked (skipped) - in all downstream layers (as long as they support masking). - - If any downstream layer does not support masking yet receives such - an input mask, an exception will be raised. - - Example: - - Consider a Numpy data array `x` of shape `(samples, timesteps, features)`, - to be fed to an LSTM layer. You want to mask timestep #3 and #5 because you - lack data for these timesteps. 
You can: - - - Set `x[:, 3, :] = 0.` and `x[:, 5, :] = 0.` - - Insert a `Masking` layer with `mask_value=0.` before the LSTM layer: - - ```python - samples, timesteps, features = 32, 10, 8 - inputs = np.random.random([samples, timesteps, features]).astype(np.float32) - inputs[:, 3, :] = 0. - inputs[:, 5, :] = 0. - - model = tf.keras.models.Sequential() - model.add(tf.keras.layers.Masking(mask_value=0., - input_shape=(timesteps, features))) - model.add(tf.keras.layers.LSTM(32)) - - output = model(inputs) - # The time step 3 and 5 will be skipped from LSTM calculation. - ``` - - See [the masking and padding guide]( - https://www.tensorflow.org/guide/keras/masking_and_padding) - for more details. - """ - - def __init__(self, mask_value=0., **kwargs): - super().__init__(**kwargs) - self.supports_masking = True - self.mask_value = mask_value - self._compute_output_and_mask_jointly = True - - def compute_mask(self, inputs, mask=None): - return tf.reduce_any(tf.not_equal(inputs, self.mask_value), axis=-1) - - def call(self, inputs): - boolean_mask = tf.reduce_any( - tf.not_equal(inputs, self.mask_value), axis=-1, keepdims=True) - outputs = inputs * tf.cast(boolean_mask, inputs.dtype) - # Compute the mask and outputs simultaneously. - outputs._keras_mask = tf.squeeze(boolean_mask, axis=-1) # pylint: disable=protected-access - return outputs - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = {'mask_value': self.mask_value} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Masks a sequence by using a mask value to skip timesteps. + + For each timestep in the input tensor (dimension #1 in the tensor), + if all values in the input tensor at that timestep + are equal to `mask_value`, then the timestep will be masked (skipped) + in all downstream layers (as long as they support masking). + + If any downstream layer does not support masking yet receives such + an input mask, an exception will be raised. + + Example: + + Consider a Numpy data array `x` of shape `(samples, timesteps, features)`, + to be fed to an LSTM layer. You want to mask timestep #3 and #5 because you + lack data for these timesteps. You can: + + - Set `x[:, 3, :] = 0.` and `x[:, 5, :] = 0.` + - Insert a `Masking` layer with `mask_value=0.` before the LSTM layer: + + ```python + samples, timesteps, features = 32, 10, 8 + inputs = np.random.random([samples, timesteps, features]).astype(np.float32) + inputs[:, 3, :] = 0. + inputs[:, 5, :] = 0. + + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Masking(mask_value=0., + input_shape=(timesteps, features))) + model.add(tf.keras.layers.LSTM(32)) + + output = model(inputs) + # The time step 3 and 5 will be skipped from LSTM calculation. + ``` + + See [the masking and padding guide]( + https://www.tensorflow.org/guide/keras/masking_and_padding) + for more details. + """ + + def __init__(self, mask_value=0.0, **kwargs): + super().__init__(**kwargs) + self.supports_masking = True + self.mask_value = mask_value + self._compute_output_and_mask_jointly = True + + def compute_mask(self, inputs, mask=None): + return tf.reduce_any(tf.not_equal(inputs, self.mask_value), axis=-1) + + def call(self, inputs): + boolean_mask = tf.reduce_any( + tf.not_equal(inputs, self.mask_value), axis=-1, keepdims=True + ) + outputs = inputs * tf.cast(boolean_mask, inputs.dtype) + # Compute the mask and outputs simultaneously. 
+ outputs._keras_mask = tf.squeeze(boolean_mask, axis=-1) + return outputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = {"mask_value": self.mask_value} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/core/tf_op_layer.py b/keras/layers/core/tf_op_layer.py index 1972de5c2f90..41f3ae93b799 100644 --- a/keras/layers/core/tf_op_layer.py +++ b/keras/layers/core/tf_op_layer.py @@ -13,362 +13,389 @@ # limitations under the License. # ============================================================================== """Contains the TFOpLambda layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import,g-bad-import-order import tensorflow.compat.v2 as tf -# pylint: enable=g-bad-import-order from keras import backend from keras.engine import keras_tensor from keras.engine.base_layer import Layer +# isort: off from tensorflow.python.platform import tf_logging -from tensorflow.python.util.tf_export import get_canonical_name_for_symbol -from tensorflow.python.util.tf_export import get_symbol_from_name +from tensorflow.python.util.tf_export import ( + get_canonical_name_for_symbol, +) +from tensorflow.python.util.tf_export import ( + get_symbol_from_name, +) class ClassMethod(Layer): - """Wraps a TF API Class's class method in a `Layer` object. - - It is inserted by the Functional API construction whenever users call - a supported TF Class's class method on KerasTensors. - - This is useful in the case where users do something like: - x = keras.Input(...) - y = keras.Input(...) - out = tf.RaggedTensor.from_row_splits(x, y) - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, cls_ref, method_name, **kwargs): - self.cls_ref = cls_ref - self.method_name = method_name - self.cls_symbol = ( - get_canonical_name_for_symbol( - self.cls_ref, add_prefix_to_v1_names=True) or - get_canonical_name_for_symbol( - self.cls_ref, api_name='keras', add_prefix_to_v1_names=True)) - if 'name' not in kwargs: - kwargs['name'] = backend.unique_object_name( - 'tf.' + self.cls_symbol + '.' + self.method_name, - zero_based=True, - avoid_observed_names=True) - kwargs['autocast'] = False - - # Do not individually trace op layers in the SavedModel. - self._must_restore_from_config = True - - super().__init__(**kwargs) - - # Preserve all argument data structures when saving/loading a config - # (e.g., don't unnest lists that contain one element) - self._preserve_input_structure_in_config = True - - self._call_spec.expects_training_arg = False - self._call_spec.expects_mask_arg = False - - def call(self, args, kwargs): - return getattr(self.cls_ref, self.method_name)(*args, **kwargs) - - def get_config(self): - if not self.cls_symbol: - raise ValueError( - 'This Keras class method conversion tried to convert ' - f'a method belonging to class {self.cls_symbol}, a class ' - 'that is not publicly exposed in the TensorFlow API. 
' - 'To ensure cross-version compatibility of Keras models ' - 'that use op layers, only op layers produced from ' - 'public TensorFlow API symbols can be serialized.') - - config = {'cls_symbol': self.cls_symbol, 'method_name': self.method_name} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - config = config.copy() - symbol_name = config.pop('cls_symbol') - cls_ref = get_symbol_from_name(symbol_name) - if not cls_ref: - raise ValueError(f'TensorFlow symbol `{symbol_name}` could not be found.') - - config['cls_ref'] = cls_ref - - return cls(**config) + """Wraps a TF API Class's class method in a `Layer` object. + + It is inserted by the Functional API construction whenever users call + a supported TF Class's class method on KerasTensors. + + This is useful in the case where users do something like: + x = keras.Input(...) + y = keras.Input(...) + out = tf.RaggedTensor.from_row_splits(x, y) + """ + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__(self, cls_ref, method_name, **kwargs): + self.cls_ref = cls_ref + self.method_name = method_name + self.cls_symbol = get_canonical_name_for_symbol( + self.cls_ref, add_prefix_to_v1_names=True + ) or get_canonical_name_for_symbol( + self.cls_ref, api_name="keras", add_prefix_to_v1_names=True + ) + if "name" not in kwargs: + kwargs["name"] = backend.unique_object_name( + "tf." + self.cls_symbol + "." + self.method_name, + zero_based=True, + avoid_observed_names=True, + ) + kwargs["autocast"] = False + + # Do not individually trace op layers in the SavedModel. + self._must_restore_from_config = True + + super().__init__(**kwargs) + + # Preserve all argument data structures when saving/loading a config + # (e.g., don't unnest lists that contain one element) + self._preserve_input_structure_in_config = True + + self._call_spec.expects_training_arg = False + self._call_spec.expects_mask_arg = False + + def call(self, args, kwargs): + return getattr(self.cls_ref, self.method_name)(*args, **kwargs) + + def get_config(self): + if not self.cls_symbol: + raise ValueError( + "This Keras class method conversion tried to convert " + f"a method belonging to class {self.cls_symbol}, a class " + "that is not publicly exposed in the TensorFlow API. " + "To ensure cross-version compatibility of Keras models " + "that use op layers, only op layers produced from " + "public TensorFlow API symbols can be serialized." + ) + + config = { + "cls_symbol": self.cls_symbol, + "method_name": self.method_name, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() + symbol_name = config.pop("cls_symbol") + cls_ref = get_symbol_from_name(symbol_name) + if not cls_ref: + raise ValueError( + f"TensorFlow symbol `{symbol_name}` could not be found." 
+ ) + + config["cls_ref"] = cls_ref + + return cls(**config) class KerasOpDispatcher(tf.__internal__.dispatch.GlobalOpDispatcher): - """A global dispatcher that allows building a functional model with TF Ops.""" + """A global dispatcher that allows building a functional model with TF + Ops.""" - def handle(self, op, args, kwargs): - """Handle the specified operation with the specified arguments.""" - if any( - isinstance(x, keras_tensor.KerasTensor) - for x in tf.nest.flatten([args, kwargs])): - return TFOpLambda(op)(*args, **kwargs) - else: - return self.NOT_SUPPORTED + def handle(self, op, args, kwargs): + """Handle the specified operation with the specified arguments.""" + if any( + isinstance(x, keras_tensor.KerasTensor) + for x in tf.nest.flatten([args, kwargs]) + ): + return TFOpLambda(op)(*args, **kwargs) + else: + return self.NOT_SUPPORTED KerasOpDispatcher().register() class InstanceProperty(Layer): - """Wraps an instance property access (e.g. + """Wraps an instance property access (e.g. - `x.foo`) in a Keras Layer. + `x.foo`) in a Keras Layer. - This layer takes an attribute name `attr_name` in the constructor and, - when called on input tensor `obj` returns `obj.attr_name`. + This layer takes an attribute name `attr_name` in the constructor and, + when called on input tensor `obj` returns `obj.attr_name`. - KerasTensors specialized for specific extension types use it to - represent instance property accesses on the represented object in the - case where the property needs to be dynamically accessed as opposed to - being statically computed from the typespec, e.g. + KerasTensors specialized for specific extension types use it to + represent instance property accesses on the represented object in the + case where the property needs to be dynamically accessed as opposed to + being statically computed from the typespec, e.g. - x = keras.Input(..., ragged=True) - out = x.flat_values - """ + x = keras.Input(..., ragged=True) + out = x.flat_values + """ - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, attr_name, **kwargs): - self.attr_name = attr_name + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__(self, attr_name, **kwargs): + self.attr_name = attr_name - if 'name' not in kwargs: - kwargs['name'] = backend.unique_object_name( - 'input.' + self.attr_name, zero_based=True, avoid_observed_names=True) - kwargs['autocast'] = False + if "name" not in kwargs: + kwargs["name"] = backend.unique_object_name( + "input." + self.attr_name, + zero_based=True, + avoid_observed_names=True, + ) + kwargs["autocast"] = False - # Do not individually trace op layers in the SavedModel. - self._must_restore_from_config = True + # Do not individually trace op layers in the SavedModel. 
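+        # (Such layers are instead re-created from their `get_config` output
+        # when the SavedModel is loaded back.)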
+ self._must_restore_from_config = True - super().__init__(**kwargs) + super().__init__(**kwargs) - # Preserve all argument data structures when saving/loading a config - # (e.g., don't unnest lists that contain one element) - self._preserve_input_structure_in_config = True + # Preserve all argument data structures when saving/loading a config + # (e.g., don't unnest lists that contain one element) + self._preserve_input_structure_in_config = True - def call(self, obj): - return getattr(obj, self.attr_name) + def call(self, obj): + return getattr(obj, self.attr_name) - def get_config(self): - config = {'attr_name': self.attr_name} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"attr_name": self.attr_name} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) class InstanceMethod(InstanceProperty): - """Wraps an instance method access (e.g. `x.foo(arg)` in a Keras Layer. + """Wraps an instance method access (e.g. `x.foo(arg)` in a Keras Layer. - This layer takes an attribute name `attr_name` in the constructor and, - when called on input tensor `obj` with additional arguments `args` and - `kwargs` returns `obj.attr_name(*args, **kwargs)`. + This layer takes an attribute name `attr_name` in the constructor and, + when called on input tensor `obj` with additional arguments `args` and + `kwargs` returns `obj.attr_name(*args, **kwargs)`. - KerasTensors specialized for specific extension types use it to - represent dynamic instance method calls on the represented object, e.g. + KerasTensors specialized for specific extension types use it to + represent dynamic instance method calls on the represented object, e.g. - x = keras.Input(..., ragged=True) - new_values = keras.Input(...) - out = x.with_values(new_values) - """ + x = keras.Input(..., ragged=True) + new_values = keras.Input(...) + out = x.with_values(new_values) + """ - def call(self, obj, args, kwargs): - method = getattr(obj, self.attr_name) - return method(*args, **kwargs) + def call(self, obj, args, kwargs): + method = getattr(obj, self.attr_name) + return method(*args, **kwargs) class TFOpLambda(Layer): - """Wraps TF API symbols in a `Layer` object. - - It is inserted by the Functional API construction whenever users call - a supported TF symbol on KerasTensors. - - Like Lambda layers, this layer tries to raise warnings when it detects users - explicitly use variables in the call. (To let them know - that the layer will not capture the variables). - - This is useful in the case where users do something like: - x = keras.Input(...) - y = tf.Variable(...) - out = x * tf_variable - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, function, **kwargs): - self.function = function - self.symbol = ( - get_canonical_name_for_symbol( - self.function, add_prefix_to_v1_names=True) or - get_canonical_name_for_symbol( - self.function, api_name='keras', add_prefix_to_v1_names=True)) - if 'name' not in kwargs: - # Generate a name. - # TFOpLambda layers avoid already-observed names, - # because users cannot easily control the generated names. - # Without this avoidance, users would be more likely to run - # into unavoidable duplicate layer name collisions. 
- # (For standard layers users could just set `name` when creating the - # layer to work around a collision, but they can't do that for - # auto-generated layers) - if self.symbol: - name = 'tf.' + self.symbol - else: - name = self.function.__name__ - kwargs['name'] = backend.unique_object_name( - name, zero_based=True, avoid_observed_names=True) - kwargs['autocast'] = False - - # Decorate the function to produce this layer's call method - def _call_wrapper(*args, **kwargs): - return self._call_wrapper(*args, **kwargs) - - self.call = tf.__internal__.decorator.make_decorator( - function, _call_wrapper) - - # Do not individually trace op layers in the SavedModel. - self._must_restore_from_config = True - - super().__init__(**kwargs) - - # Preserve all argument data structures when saving/loading a config - # (e.g., don't unnest lists that contain one element) - self._preserve_input_structure_in_config = True - - # Warning on every invocation will be quite irksome in Eager mode. - self._already_warned = False - - self._call_spec.expects_training_arg = False - self._call_spec.expects_mask_arg = False - - def _call_wrapper(self, *args, **kwargs): - created_variables = [] - - def _variable_creator(next_creator, **creator_kwargs): - var = next_creator(**creator_kwargs) - created_variables.append(var) - return var - - with tf.GradientTape(watch_accessed_variables=True) as tape, \ - tf.variable_creator_scope(_variable_creator): - # We explicitly drop `name` arguments here, - # to guard against the case where an op explicitly has a - # `name` passed (which is susceptible to producing - # multiple ops w/ the same name when the layer is reused) - kwargs.pop('name', None) - result = self.function(*args, **kwargs) - self._check_variables(created_variables, tape.watched_variables()) - return result - - def _check_variables(self, created_variables, accessed_variables): - if not created_variables and not accessed_variables: - # In the common case that a Lambda layer does not touch a Variable, we - # don't want to incur the runtime cost of assembling any state used for - # checking only to immediately discard it. - return - - tracked_weights = set(v.ref() for v in self.weights) - untracked_new_vars = [ - v for v in created_variables if v.ref() not in tracked_weights - ] - if untracked_new_vars: - variable_str = '\n'.join(' {}'.format(i) for i in untracked_new_vars) - raise ValueError( - 'The following Variables were created within a Lambda layer ' - f'({self.name}) but are not tracked by said layer: {variable_str}\n' - 'The layer cannot safely ensure proper Variable reuse ' - 'across multiple calls, and consequently this behavior is disallowed ' - 'for safety reasons. Lambda layers are not well suited for stateful ' - 'computation; instead, writing a subclassed Layer is the recommend ' - 'way to define layers with Variables.') - - untracked_used_vars = [ - v for v in accessed_variables if v.ref() not in tracked_weights - ] - if untracked_used_vars and not self._already_warned: - variable_str = '\n'.join(' {}'.format(i) for i in untracked_used_vars) - self._warn( - 'The following Variables were used in a Lambda layer\'s call ' - f'({self.name}), but are not present in its tracked objects: ' - f'{variable_str}. This is a strong indication that the Lambda layer ' - 'should be rewritten as a subclassed Layer.') - self._already_warned = True - - def _warn(self, msg): - # This method will be overridden in a unit test to raise an error, because - # self.assertWarns is not universally implemented. 
- return tf_logging.warning(msg) - - def get_config(self): - if not self.symbol: - raise ValueError( - f'This Keras op layer was generated from {self.function}, a method ' - 'that is not publicly exposed in the TensorFlow API. This ' - 'may have happened if the method was explicitly ' - 'decorated to add dispatching support, and it was used ' - 'during Functional model construction. ' - 'To ensure cross-version compatibility of Keras models ' - 'that use op layers, only op layers produced from ' - 'public TensorFlow API symbols can be serialized.') - config = {'function': self.symbol} - - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - config = config.copy() - symbol_name = config['function'] - function = get_symbol_from_name(symbol_name) - if not function: - raise ValueError(f'TF symbol `{symbol_name}` could not be found.') - - config['function'] = function - - return cls(**config) - - -def _delegate_property(keras_tensor_cls, property_name): # pylint: disable=invalid-name - """Register property on a KerasTensor class. - - Calling this multiple times with the same arguments should be a no-op. - - This method exposes a property on the KerasTensor class that will use an - `InstanceProperty` layer to access the property on the represented - intermediate values in the model. - - Args: - keras_tensor_cls: The KerasTensor subclass that should expose the property. - property_name: The name of the property to expose and delegate to the - represented (Composite)Tensor. - """ - # We use a lambda because we can't create a Keras layer at import time - # due to dynamic layer class versioning. - property_access = property(lambda self: InstanceProperty(property_name)(self)) # pylint: disable=unnecessary-lambda - setattr(keras_tensor_cls, property_name, property_access) - - -def _delegate_method(keras_tensor_cls, method_name): # pylint: disable=invalid-name - """Register method on a KerasTensor class. - - Calling this function times with the same arguments should be a no-op. - - This method exposes an instance method on the KerasTensor class that will use - an `InstanceMethod` layer to run the desired method on the represented - intermediate values in the model. - - Args: - keras_tensor_cls: The KerasTensor subclass that should expose the property. - method_name: The name of the method to expose and delegate to the - represented (Composite)Tensor. - """ - - def delegate(self, *args, **kwargs): - return InstanceMethod(method_name)(self, args, kwargs) - - setattr(keras_tensor_cls, method_name, delegate) + """Wraps TF API symbols in a `Layer` object. + + It is inserted by the Functional API construction whenever users call + a supported TF symbol on KerasTensors. + + Like Lambda layers, this layer tries to raise warnings when it detects users + explicitly use variables in the call. (To let them know + that the layer will not capture the variables). + + This is useful in the case where users do something like: + x = keras.Input(...) + y = tf.Variable(...) + out = x * tf_variable + """ + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__(self, function, **kwargs): + self.function = function + self.symbol = get_canonical_name_for_symbol( + self.function, add_prefix_to_v1_names=True + ) or get_canonical_name_for_symbol( + self.function, api_name="keras", add_prefix_to_v1_names=True + ) + if "name" not in kwargs: + # Generate a name. 
+            # TFOpLambda layers avoid already-observed names,
+            # because users cannot easily control the generated names.
+            # Without this avoidance, users would be more likely to run
+            # into unavoidable duplicate layer name collisions.
+            # (For standard layers users could just set `name` when creating the
+            # layer to work around a collision, but they can't do that for
+            # auto-generated layers)
+            if self.symbol:
+                name = "tf." + self.symbol
+            else:
+                name = self.function.__name__
+            kwargs["name"] = backend.unique_object_name(
+                name, zero_based=True, avoid_observed_names=True
+            )
+        kwargs["autocast"] = False
+
+        # Decorate the function to produce this layer's call method
+        def _call_wrapper(*args, **kwargs):
+            return self._call_wrapper(*args, **kwargs)
+
+        self.call = tf.__internal__.decorator.make_decorator(
+            function, _call_wrapper
+        )
+
+        # Do not individually trace op layers in the SavedModel.
+        self._must_restore_from_config = True
+
+        super().__init__(**kwargs)
+
+        # Preserve all argument data structures when saving/loading a config
+        # (e.g., don't unnest lists that contain one element)
+        self._preserve_input_structure_in_config = True
+
+        # Warning on every invocation will be quite irksome in Eager mode.
+        self._already_warned = False
+
+        self._call_spec.expects_training_arg = False
+        self._call_spec.expects_mask_arg = False
+
+    def _call_wrapper(self, *args, **kwargs):
+        created_variables = []
+
+        def _variable_creator(next_creator, **creator_kwargs):
+            var = next_creator(**creator_kwargs)
+            created_variables.append(var)
+            return var
+
+        with tf.GradientTape(
+            watch_accessed_variables=True
+        ) as tape, tf.variable_creator_scope(_variable_creator):
+            # We explicitly drop `name` arguments here,
+            # to guard against the case where an op explicitly has a
+            # `name` passed (which is susceptible to producing
+            # multiple ops w/ the same name when the layer is reused)
+            kwargs.pop("name", None)
+            result = self.function(*args, **kwargs)
+        self._check_variables(created_variables, tape.watched_variables())
+        return result
+
+    def _check_variables(self, created_variables, accessed_variables):
+        if not created_variables and not accessed_variables:
+            # In the common case that a Lambda layer does not touch a Variable,
+            # we don't want to incur the runtime cost of assembling any state
+            # used for checking only to immediately discard it.
+            return
+
+        tracked_weights = set(v.ref() for v in self.weights)
+        untracked_new_vars = [
+            v for v in created_variables if v.ref() not in tracked_weights
+        ]
+        if untracked_new_vars:
+            variable_str = "\n".join(f"  {i}" for i in untracked_new_vars)
+            raise ValueError(
+                "The following Variables were created within a Lambda layer "
+                f"({self.name}) but are not tracked by said layer: "
+                f"{variable_str}\n"
+                "The layer cannot safely ensure proper Variable reuse "
+                "across multiple calls, and consequently this behavior "
+                "is disallowed for safety reasons. Lambda layers are "
+                "not well suited for stateful computation; instead, "
+                "writing a subclassed Layer is the recommended "
+                "way to define layers with Variables."
+            )
+
+        untracked_used_vars = [
+            v for v in accessed_variables if v.ref() not in tracked_weights
+        ]
+        if untracked_used_vars and not self._already_warned:
+            variable_str = "\n".join(f"  {i}" for i in untracked_used_vars)
+            self._warn(
+                "The following Variables were used in a Lambda layer's call "
+                f"({self.name}), but are not present in its tracked objects: "
+                f"{variable_str}. This is a strong indication that the Lambda "
+                "layer should be rewritten as a subclassed Layer."
+            )
+            self._already_warned = True
+
+    def _warn(self, msg):
+        # This method will be overridden in a unit test to raise an error,
+        # because self.assertWarns is not universally implemented.
+        return tf_logging.warning(msg)
+
+    def get_config(self):
+        if not self.symbol:
+            raise ValueError(
+                f"This Keras op layer was generated from {self.function}, a "
+                "method that is not publicly exposed in the TensorFlow API. "
+                "This may have happened if the method was explicitly "
+                "decorated to add dispatching support, and it was used "
+                "during Functional model construction. "
+                "To ensure cross-version compatibility of Keras models "
+                "that use op layers, only op layers produced from "
+                "public TensorFlow API symbols can be serialized."
+            )
+        config = {"function": self.symbol}
+
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @classmethod
+    def from_config(cls, config, custom_objects=None):
+        config = config.copy()
+        symbol_name = config["function"]
+        function = get_symbol_from_name(symbol_name)
+        if not function:
+            raise ValueError(f"TF symbol `{symbol_name}` could not be found.")
+
+        config["function"] = function
+
+        return cls(**config)
+
+
+def _delegate_property(keras_tensor_cls, property_name):
+    """Register property on a KerasTensor class.
+
+    Calling this multiple times with the same arguments should be a no-op.
+
+    This method exposes a property on the KerasTensor class that will use an
+    `InstanceProperty` layer to access the property on the represented
+    intermediate values in the model.
+
+    Args:
+        keras_tensor_cls: The KerasTensor subclass that should expose the
+            property.
+        property_name: The name of the property to expose and delegate to the
+            represented (Composite)Tensor.
+    """
+    # We use a lambda because we can't create a Keras layer at import time
+    # due to dynamic layer class versioning.
+    property_access = property(
+        lambda self: InstanceProperty(property_name)(self)
+    )
+    setattr(keras_tensor_cls, property_name, property_access)
+
+
+def _delegate_method(keras_tensor_cls, method_name):
+    """Register method on a KerasTensor class.
+
+    Calling this function multiple times with the same arguments should be a
+    no-op.
+
+    This method exposes an instance method on the KerasTensor class that will
+    use an `InstanceMethod` layer to run the desired method on the represented
+    intermediate values in the model.
+
+    Args:
+        keras_tensor_cls: The KerasTensor subclass that should expose the
+            method.
+        method_name: The name of the method to expose and delegate to the
+            represented (Composite)Tensor.
+    """
+
+    def delegate(self, *args, **kwargs):
+        return InstanceMethod(method_name)(self, args, kwargs)
+
+    setattr(keras_tensor_cls, method_name, delegate)
 
 
 # We do not support the `uniform_row_length` property because it
@@ -378,168 +405,177 @@ def delegate(self, *args, **kwargs):
 # never equal `None`, breaking code that expects it to be partially-static
 # in unpredictable ways.
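The `_delegate_property` / `_delegate_method` helpers above are the entire mechanism by which composite-tensor attributes become usable on symbolic KerasTensors. A minimal, self-contained sketch of the same pattern on a hypothetical `Wrapper` class (all names below are illustrative, not part of Keras):

```python
import numpy as np


class Wrapper:
    """Hypothetical stand-in for a KerasTensor subclass."""

    def __init__(self, value):
        self.value = value


def delegate_property(cls, property_name):
    # Resolve the attribute on the wrapped value at access time, just as
    # `_delegate_property` defers to an `InstanceProperty` layer at call time.
    accessor = property(lambda self: getattr(self.value, property_name))
    setattr(cls, property_name, accessor)


def delegate_method(cls, method_name):
    def delegate(self, *args, **kwargs):
        return getattr(self.value, method_name)(*args, **kwargs)

    setattr(cls, method_name, delegate)


delegate_property(Wrapper, "shape")
delegate_method(Wrapper, "sum")

x = Wrapper(np.ones((2, 3)))
print(x.shape)        # (2, 3), resolved dynamically through the delegate
print(x.sum(axis=0))  # [2. 2. 2.]
```

The `for` loops that follow apply exactly this registration to the ragged and sparse KerasTensor classes.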
for ragged_property in [ - 'values', 'flat_values', 'row_splits', 'nested_row_splits' + "values", + "flat_values", + "row_splits", + "nested_row_splits", ]: - _delegate_property(keras_tensor.RaggedKerasTensor, ragged_property) + _delegate_property(keras_tensor.RaggedKerasTensor, ragged_property) for ragged_method_name in [ - 'value_rowids', - 'nested_value_rowids', - 'nrows', - 'row_starts', - 'row_limits', - 'row_lengths', - 'nested_row_lengths', - 'bounding_shape', - 'with_values', - 'with_flat_values', - 'with_row_splits_dtype', - 'merge_dims', - 'to_tensor', - 'to_sparse', + "value_rowids", + "nested_value_rowids", + "nrows", + "row_starts", + "row_limits", + "row_lengths", + "nested_row_lengths", + "bounding_shape", + "with_values", + "with_flat_values", + "with_row_splits_dtype", + "merge_dims", + "to_tensor", + "to_sparse", ]: - _delegate_method(keras_tensor.RaggedKerasTensor, ragged_method_name) + _delegate_method(keras_tensor.RaggedKerasTensor, ragged_method_name) for sparse_property in [ - 'indices', - 'values', - 'dense_shape', + "indices", + "values", + "dense_shape", ]: - _delegate_property(keras_tensor.SparseKerasTensor, sparse_property) + _delegate_property(keras_tensor.SparseKerasTensor, sparse_property) for sparse_method in [ - 'with_values', + "with_values", ]: - _delegate_method(keras_tensor.SparseKerasTensor, sparse_method) + _delegate_method(keras_tensor.SparseKerasTensor, sparse_method) class TFClassMethodDispatcher(tf.__internal__.dispatch.OpDispatcher): - """A class method dispatcher that allows building a functional model with TF class methods.""" + """A class method dispatcher that allows building a functional model with TF + class methods.""" - def __init__(self, cls, method_name): - self.cls = cls - self.method_name = method_name + def __init__(self, cls, method_name): + self.cls = cls + self.method_name = method_name - def handle(self, args, kwargs): - """Handle the specified operation with the specified arguments.""" - if any( - isinstance(x, keras_tensor.KerasTensor) - for x in tf.nest.flatten([args, kwargs])): - return ClassMethod(self.cls, self.method_name)(args[1:], kwargs) - else: - return self.NOT_SUPPORTED + def handle(self, args, kwargs): + """Handle the specified operation with the specified arguments.""" + if any( + isinstance(x, keras_tensor.KerasTensor) + for x in tf.nest.flatten([args, kwargs]) + ): + return ClassMethod(self.cls, self.method_name)(args[1:], kwargs) + else: + return self.NOT_SUPPORTED for ragged_class_method in [ - 'from_value_rowids', - 'from_row_splits', - 'from_row_lengths', - 'from_row_starts', - 'from_row_limits', - 'from_uniform_row_length', - 'from_nested_value_rowids', - 'from_nested_row_splits', - 'from_nested_row_lengths', - 'from_tensor', - 'from_sparse', + "from_value_rowids", + "from_row_splits", + "from_row_lengths", + "from_row_starts", + "from_row_limits", + "from_uniform_row_length", + "from_nested_value_rowids", + "from_nested_row_splits", + "from_nested_row_lengths", + "from_tensor", + "from_sparse", ]: - TFClassMethodDispatcher(tf.RaggedTensor, ragged_class_method).register( - getattr(tf.RaggedTensor, ragged_class_method)) + TFClassMethodDispatcher(tf.RaggedTensor, ragged_class_method).register( + getattr(tf.RaggedTensor, ragged_class_method) + ) class SlicingOpLambda(TFOpLambda): - """Wraps TF API symbols in a `Layer` object. - - It is inserted by the Functional API construction whenever users call - a supported TF symbol on KerasTensors. 
- - Like Lambda layers, this layer tries to raise warnings when it detects users - explicitly use variables in the call. (To let them know - that the layer will not capture the variables). - - This is useful in the case where users do something like: - x = keras.Input(...) - y = tf.Variable(...) - out = x * tf_variable - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, function, **kwargs): - super().__init__(function, **kwargs) - - original_call = self.call - - # Decorate the function to produce this layer's call method - def _call_wrapper(*args, **kwargs): - # Turn any slice dicts in the args back into `slice` objects. - # This conversion cannot use nest.flatten/map_structure, - # because dicts are flattened by nest while slices aren't. - # So, map_structure would only see the individual elements in the - # dict. - # This can't use map_structure_up_to either because the 'shallowness' of - # the shallow tree would have to vary depending on if only one dim or - # multiple are being sliced. - new_args = [] - for arg in args: - arg = _dict_to_slice(arg) - if isinstance(arg, (list, tuple)): - new_arg = [] - for sub_arg in arg: - new_arg.append(_dict_to_slice(sub_arg)) - arg = new_arg - new_args.append(arg) - - # Handle the kwargs too. - new_kwargs = {} - for key, value in kwargs.items(): - value = _dict_to_slice(value) - if isinstance(value, (list, tuple)): - new_value = [] - for v in value: - new_value.append(_dict_to_slice(v)) - value = new_value - new_kwargs[key] = value - - return original_call(*new_args, **new_kwargs) - - self.call = tf.__internal__.decorator.make_decorator( - original_call, _call_wrapper) + """Wraps TF API symbols in a `Layer` object. + + It is inserted by the Functional API construction whenever users call + a supported TF symbol on KerasTensors. + + Like Lambda layers, this layer tries to raise warnings when it detects users + explicitly use variables in the call. (To let them know + that the layer will not capture the variables). + + This is useful in the case where users do something like: + x = keras.Input(...) + y = tf.Variable(...) + out = x * tf_variable + """ + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__(self, function, **kwargs): + super().__init__(function, **kwargs) + + original_call = self.call + + # Decorate the function to produce this layer's call method + def _call_wrapper(*args, **kwargs): + # Turn any slice dicts in the args back into `slice` objects. + # This conversion cannot use nest.flatten/map_structure, + # because dicts are flattened by nest while slices aren't. + # So, map_structure would only see the individual elements in the + # dict. + # This can't use map_structure_up_to either because the + # 'shallowness' of the shallow tree would have to vary depending on + # if only one dim or multiple are being sliced. + new_args = [] + for arg in args: + arg = _dict_to_slice(arg) + if isinstance(arg, (list, tuple)): + new_arg = [] + for sub_arg in arg: + new_arg.append(_dict_to_slice(sub_arg)) + arg = new_arg + new_args.append(arg) + + # Handle the kwargs too. 
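+            # (`TFSlicingOpDispatcher` below dict-encodes slices appearing in
+            # kwargs as well as in positional args, so both sides need the
+            # reverse conversion.)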
+ new_kwargs = {} + for key, value in kwargs.items(): + value = _dict_to_slice(value) + if isinstance(value, (list, tuple)): + new_value = [] + for v in value: + new_value.append(_dict_to_slice(v)) + value = new_value + new_kwargs[key] = value + + return original_call(*new_args, **new_kwargs) + + self.call = tf.__internal__.decorator.make_decorator( + original_call, _call_wrapper + ) def _slice_to_dict(x): - if isinstance(x, slice): - return {'start': x.start, 'stop': x.stop, 'step': x.step} - return x + if isinstance(x, slice): + return {"start": x.start, "stop": x.stop, "step": x.step} + return x def _dict_to_slice(x): - if isinstance(x, dict): - return slice(x['start'], x['stop'], x['step']) - return x + if isinstance(x, dict): + return slice(x["start"], x["stop"], x["step"]) + return x class TFSlicingOpDispatcher(tf.__internal__.dispatch.OpDispatcher): - """A global dispatcher that allows building a functional model with TF Ops.""" + """A global dispatcher that allows building a functional model with TF + Ops.""" - def __init__(self, op): - self.op = op + def __init__(self, op): + self.op = op - def handle(self, args, kwargs): - """Handle the specified operation with the specified arguments.""" - args = tf.nest.map_structure(_slice_to_dict, args) - kwargs = tf.nest.map_structure(_slice_to_dict, kwargs) - if any( - isinstance(x, keras_tensor.KerasTensor) - for x in tf.nest.flatten([args, kwargs])): - return SlicingOpLambda(self.op)(*args, **kwargs) - else: - return self.NOT_SUPPORTED + def handle(self, args, kwargs): + """Handle the specified operation with the specified arguments.""" + args = tf.nest.map_structure(_slice_to_dict, args) + kwargs = tf.nest.map_structure(_slice_to_dict, kwargs) + if any( + isinstance(x, keras_tensor.KerasTensor) + for x in tf.nest.flatten([args, kwargs]) + ): + return SlicingOpLambda(self.op)(*args, **kwargs) + else: + return self.NOT_SUPPORTED for slicing_op in [ - tf.__operators__.getitem, # pylint: disable=protected-access + tf.__operators__.getitem, tf.compat.v1.boolean_mask, tf.boolean_mask, - tf.__operators__.ragged_getitem + tf.__operators__.ragged_getitem, ]: - TFSlicingOpDispatcher(slicing_op).register(slicing_op) + TFSlicingOpDispatcher(slicing_op).register(slicing_op) diff --git a/keras/layers/kernelized.py b/keras/layers/kernelized.py index 5f3b64a0c905..f8114bbb7c74 100644 --- a/keras/layers/kernelized.py +++ b/keras/layers/kernelized.py @@ -12,254 +12,275 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-classes-have-attributes + """Keras layers that implement explicit (approximate) kernel feature maps.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np from keras import initializers from keras.engine import base_layer from keras.engine import input_spec + +# isort: off from tensorflow.python.util.tf_export import keras_export -_SUPPORTED_RBF_KERNEL_TYPES = ['gaussian', 'laplacian'] +_SUPPORTED_RBF_KERNEL_TYPES = ["gaussian", "laplacian"] -@keras_export('keras.layers.experimental.RandomFourierFeatures') +@keras_export("keras.layers.experimental.RandomFourierFeatures") class RandomFourierFeatures(base_layer.Layer): - r"""Layer that projects its inputs into a random feature space. - - This layer implements a mapping from input space to a space with `output_dim` - dimensions, which approximates shift-invariant kernels. 
A kernel function - `K(x, y)` is shift-invariant if `K(x, y) == k(x - y)` for some function `k`. - Many popular Radial Basis Functions (RBF), including Gaussian and - Laplacian kernels, are shift-invariant. - - The implementation of this layer is based on the following paper: - ["Random Features for Large-Scale Kernel Machines"]( - https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf) - by Ali Rahimi and Ben Recht. - - The distribution from which the parameters of the random features map (layer) - are sampled determines which shift-invariant kernel the layer approximates - (see paper for more details). You can use the distribution of your - choice. The layer supports out-of-the-box - approximations of the following two RBF kernels: - - - Gaussian: `K(x, y) == exp(- square(x - y) / (2 * square(scale)))` - - Laplacian: `K(x, y) = exp(-abs(x - y) / scale))` - - **Note:** Unlike what is described in the paper and unlike what is used in - the Scikit-Learn implementation, the output of this layer does not apply - the `sqrt(2 / D)` normalization factor. - - **Usage:** Typically, this layer is used to "kernelize" linear models by - applying a non-linear transformation (this layer) to the input features and - then training a linear model on top of the transformed features. Depending on - the loss function of the linear model, the composition of this layer and the - linear model results to models that are equivalent (up to approximation) to - kernel SVMs (for hinge loss), kernel logistic regression (for logistic loss), - kernel linear regression (for squared loss), etc. - - Examples: - - A kernel multinomial logistic regression model with Gaussian kernel for MNIST: - - ```python - model = keras.Sequential([ - keras.Input(shape=(784,)), - RandomFourierFeatures( - output_dim=4096, - scale=10., - kernel_initializer='gaussian'), - layers.Dense(units=10, activation='softmax'), - ]) - model.compile( - optimizer='adam', - loss='categorical_crossentropy', - metrics=['categorical_accuracy'] - ) - ``` - - A quasi-SVM classifier for MNIST: - - ```python - model = keras.Sequential([ - keras.Input(shape=(784,)), - RandomFourierFeatures( - output_dim=4096, - scale=10., - kernel_initializer='gaussian'), - layers.Dense(units=10), - ]) - model.compile( - optimizer='adam', - loss='hinge', - metrics=['categorical_accuracy'] - ) - ``` - - To use another kernel, just replace the layer creation line with: - - ```python - random_features_layer = RandomFourierFeatures( - output_dim=500, - kernel_initializer=, - scale=..., - ...) - ``` - - Args: - output_dim: Positive integer, the dimension of the layer's output, i.e., the - number of random features used to approximate the kernel. - kernel_initializer: Determines the distribution of the parameters of the - random features map (and therefore the kernel approximated by the layer). - It can be either a string identifier or a Keras `Initializer` instance. - Currently only 'gaussian' and 'laplacian' are supported string - identifiers (case insensitive). Note that the kernel matrix is not - trainable. - scale: For Gaussian and Laplacian kernels, this corresponds to a scaling - factor of the corresponding kernel approximated by the layer (see concrete - definitions above). When provided, it should be a positive float. If None, - a default value is used: if the kernel initializer is set to "gaussian", - `scale` defaults to `sqrt(input_dim / 2)`, otherwise, it defaults to 1.0. 
- Both the approximation error of the kernel and the classification quality - are sensitive to this parameter. If `trainable` is set to `True`, this - parameter is learned end-to-end during training and the provided value - serves as the initial value. - **Note:** When features from this layer are fed to a linear model, - by making `scale` trainable, the resulting optimization problem is - no longer convex (even if the loss function used by the linear model - is convex). - trainable: Whether the scaling parameter of the layer should be trainable. - Defaults to `False`. - name: String, name to use for this layer. - """ - - def __init__(self, - output_dim, - kernel_initializer='gaussian', - scale=None, - trainable=False, - name=None, - **kwargs): - if output_dim <= 0: - raise ValueError( - f'`output_dim` should be a positive integer. Received: {output_dim}') - if isinstance(kernel_initializer, str): - if kernel_initializer.lower() not in _SUPPORTED_RBF_KERNEL_TYPES: - raise ValueError( - f'Unsupported `kernel_initializer`: {kernel_initializer} ' - f'Expected one of: {_SUPPORTED_RBF_KERNEL_TYPES}') - if scale is not None and scale <= 0.0: - raise ValueError('When provided, `scale` should be a positive float. ' - f'Received: {scale}') - super().__init__( - trainable=trainable, name=name, **kwargs) - self.output_dim = output_dim - self.kernel_initializer = kernel_initializer - self.scale = scale - - def build(self, input_shape): - input_shape = tf.TensorShape(input_shape) - # TODO(pmol): Allow higher dimension inputs. Currently the input is expected - # to have shape [batch_size, dimension]. - if input_shape.rank != 2: - raise ValueError( - 'The rank of the input tensor should be 2. ' - f'Received input with rank {input_shape.ndims} instead. ' - f'Full input shape received: {input_shape}') - if input_shape.dims[1].value is None: - raise ValueError( - 'The last dimension of the input tensor should be defined. ' - f'Found `None`. Full input shape received: {input_shape}') - self.input_spec = input_spec.InputSpec( - ndim=2, axes={1: input_shape.dims[1].value}) - input_dim = input_shape.dims[1].value - - kernel_initializer = _get_random_features_initializer( - self.kernel_initializer, shape=(input_dim, self.output_dim)) - - self.unscaled_kernel = self.add_weight( - name='unscaled_kernel', - shape=(input_dim, self.output_dim), - dtype=tf.float32, - initializer=kernel_initializer, - trainable=False) - - self.bias = self.add_weight( - name='bias', - shape=(self.output_dim,), - dtype=tf.float32, - initializer=initializers.RandomUniform(minval=0.0, maxval=2 * np.pi), - trainable=False) - - if self.scale is None: - self.scale = _get_default_scale(self.kernel_initializer, input_dim) - self.kernel_scale = self.add_weight( - name='kernel_scale', - shape=(1,), - dtype=tf.float32, - initializer=tf.compat.v1.constant_initializer(self.scale), - trainable=True, - constraint='NonNeg') - super().build(input_shape) - - def call(self, inputs): - inputs = tf.convert_to_tensor(inputs, dtype=self.dtype) - inputs = tf.cast(inputs, tf.float32) - kernel = (1.0 / self.kernel_scale) * self.unscaled_kernel - outputs = tf.matmul(a=inputs, b=kernel) - outputs = tf.nn.bias_add(outputs, self.bias) - return tf.cos(outputs) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape) - input_shape = input_shape.with_rank(2) - if input_shape.dims[-1].value is None: - raise ValueError( - 'The last dimension of the input tensor should be defined. ' - f'Found `None`. 
Full input shape received: {input_shape}')
-    return input_shape[:-1].concatenate(self.output_dim)
-
-  def get_config(self):
-    kernel_initializer = self.kernel_initializer
-    if not isinstance(kernel_initializer, str):
-      kernel_initializer = initializers.serialize(kernel_initializer)
-    config = {
-        'output_dim': self.output_dim,
-        'kernel_initializer': kernel_initializer,
-        'scale': self.scale,
-    }
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    r"""Layer that projects its inputs into a random feature space.
+
+    This layer implements a mapping from input space to a space with
+    `output_dim` dimensions, which approximates shift-invariant kernels. A
+    kernel function `K(x, y)` is shift-invariant if `K(x, y) == k(x - y)` for
+    some function `k`. Many popular Radial Basis Functions (RBF), including
+    Gaussian and Laplacian kernels, are shift-invariant.
+
+    The implementation of this layer is based on the following paper:
+    ["Random Features for Large-Scale Kernel Machines"](
+    https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
+    by Ali Rahimi and Ben Recht.
+
+    The distribution from which the parameters of the random features map
+    (layer) are sampled determines which shift-invariant kernel the layer
+    approximates (see paper for more details). You can use the distribution of
+    your choice. The layer supports out-of-the-box approximations of the
+    following two RBF kernels:
+
+    - Gaussian: `K(x, y) == exp(- square(x - y) / (2 * square(scale)))`
+    - Laplacian: `K(x, y) == exp(-abs(x - y) / scale)`
+
+    **Note:** Unlike what is described in the paper and unlike what is used in
+    the Scikit-Learn implementation, the output of this layer does not apply
+    the `sqrt(2 / D)` normalization factor.
+
+    **Usage:** Typically, this layer is used to "kernelize" linear models by
+    applying a non-linear transformation (this layer) to the input features
+    and then training a linear model on top of the transformed features.
+    Depending on the loss function of the linear model, the composition of
+    this layer and the linear model results in models that are equivalent (up
+    to approximation) to kernel SVMs (for hinge loss), kernel logistic
+    regression (for logistic loss), kernel linear regression (for squared
+    loss), etc.
+
+    Examples:
+
+    A kernel multinomial logistic regression model with Gaussian kernel for
+    MNIST:
+
+    ```python
+    model = keras.Sequential([
+        keras.Input(shape=(784,)),
+        RandomFourierFeatures(
+            output_dim=4096,
+            scale=10.,
+            kernel_initializer='gaussian'),
+        layers.Dense(units=10, activation='softmax'),
+    ])
+    model.compile(
+        optimizer='adam',
+        loss='categorical_crossentropy',
+        metrics=['categorical_accuracy']
+    )
+    ```
+
+    A quasi-SVM classifier for MNIST:
+
+    ```python
+    model = keras.Sequential([
+        keras.Input(shape=(784,)),
+        RandomFourierFeatures(
+            output_dim=4096,
+            scale=10.,
+            kernel_initializer='gaussian'),
+        layers.Dense(units=10),
+    ])
+    model.compile(
+        optimizer='adam',
+        loss='hinge',
+        metrics=['categorical_accuracy']
+    )
+    ```
+
+    To use another kernel, just replace the layer creation line with (filling
+    in your own initializer and scale):
+
+    ```python
+    random_features_layer = RandomFourierFeatures(
+        output_dim=500,
+        kernel_initializer=...,
+        scale=...,
+        ...)
+    ```
+
+    Args:
+        output_dim: Positive integer, the dimension of the layer's output, i.e.,
+            the number of random features used to approximate the kernel. 
+ kernel_initializer: Determines the distribution of the parameters of the + random features map (and therefore the kernel approximated by the + layer). It can be either a string identifier or a Keras `Initializer` + instance. Currently only 'gaussian' and 'laplacian' are supported + string identifiers (case insensitive). Note that the kernel matrix is + not trainable. + scale: For Gaussian and Laplacian kernels, this corresponds to a scaling + factor of the corresponding kernel approximated by the layer (see + concrete definitions above). When provided, it should be a positive + float. If None, a default value is used: if the kernel initializer is + set to "gaussian", `scale` becomes `sqrt(input_dim / 2)`, otherwise, + it becomes 1.0. Both the approximation error of the kernel and the + classification quality are sensitive to this parameter. If `trainable` + is set to `True`, this parameter is learned end-to-end during training + and the provided value serves as the initial value. + **Note:** When features from this layer are fed to a linear model, + by making `scale` trainable, the resulting optimization problem is + no longer convex (even if the loss function used by the linear model + is convex). + Defaults to `None`. + trainable: Whether the scaling parameter of the layer should be trainable. + Defaults to `False`. + name: String, name to use for this layer. + """ + + def __init__( + self, + output_dim, + kernel_initializer="gaussian", + scale=None, + trainable=False, + name=None, + **kwargs, + ): + if output_dim <= 0: + raise ValueError( + "`output_dim` should be a positive integer. " + f"Received: {output_dim}" + ) + if isinstance(kernel_initializer, str): + if kernel_initializer.lower() not in _SUPPORTED_RBF_KERNEL_TYPES: + raise ValueError( + f"Unsupported `kernel_initializer`: {kernel_initializer} " + f"Expected one of: {_SUPPORTED_RBF_KERNEL_TYPES}" + ) + if scale is not None and scale <= 0.0: + raise ValueError( + "When provided, `scale` should be a positive float. " + f"Received: {scale}" + ) + super().__init__(trainable=trainable, name=name, **kwargs) + self.output_dim = output_dim + self.kernel_initializer = kernel_initializer + self.scale = scale + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape) + # TODO(pmol): Allow higher dimension inputs. Currently the input is + # expected to have shape [batch_size, dimension]. + if input_shape.rank != 2: + raise ValueError( + "The rank of the input tensor should be 2. " + f"Received input with rank {input_shape.ndims} instead. " + f"Full input shape received: {input_shape}" + ) + if input_shape.dims[1].value is None: + raise ValueError( + "The last dimension of the input tensor should be defined. " + f"Found `None`. 
Full input shape received: {input_shape}" + ) + self.input_spec = input_spec.InputSpec( + ndim=2, axes={1: input_shape.dims[1].value} + ) + input_dim = input_shape.dims[1].value + + kernel_initializer = _get_random_features_initializer( + self.kernel_initializer, shape=(input_dim, self.output_dim) + ) + + self.unscaled_kernel = self.add_weight( + name="unscaled_kernel", + shape=(input_dim, self.output_dim), + dtype=tf.float32, + initializer=kernel_initializer, + trainable=False, + ) + + self.bias = self.add_weight( + name="bias", + shape=(self.output_dim,), + dtype=tf.float32, + initializer=initializers.RandomUniform( + minval=0.0, maxval=2 * np.pi + ), + trainable=False, + ) + + if self.scale is None: + self.scale = _get_default_scale(self.kernel_initializer, input_dim) + self.kernel_scale = self.add_weight( + name="kernel_scale", + shape=(1,), + dtype=tf.float32, + initializer=tf.compat.v1.constant_initializer(self.scale), + trainable=True, + constraint="NonNeg", + ) + super().build(input_shape) + + def call(self, inputs): + inputs = tf.convert_to_tensor(inputs, dtype=self.dtype) + inputs = tf.cast(inputs, tf.float32) + kernel = (1.0 / self.kernel_scale) * self.unscaled_kernel + outputs = tf.matmul(a=inputs, b=kernel) + outputs = tf.nn.bias_add(outputs, self.bias) + return tf.cos(outputs) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape) + input_shape = input_shape.with_rank(2) + if input_shape.dims[-1].value is None: + raise ValueError( + "The last dimension of the input tensor should be defined. " + f"Found `None`. Full input shape received: {input_shape}" + ) + return input_shape[:-1].concatenate(self.output_dim) + + def get_config(self): + kernel_initializer = self.kernel_initializer + if not isinstance(kernel_initializer, str): + kernel_initializer = initializers.serialize(kernel_initializer) + config = { + "output_dim": self.output_dim, + "kernel_initializer": kernel_initializer, + "scale": self.scale, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) def _get_random_features_initializer(initializer, shape): - """Returns Initializer object for random features.""" + """Returns Initializer object for random features.""" - def _get_cauchy_samples(loc, scale, shape): - probs = np.random.uniform(low=0., high=1., size=shape) - return loc + scale * np.tan(np.pi * (probs - 0.5)) + def _get_cauchy_samples(loc, scale, shape): + probs = np.random.uniform(low=0.0, high=1.0, size=shape) + return loc + scale * np.tan(np.pi * (probs - 0.5)) - random_features_initializer = initializer - if isinstance(initializer, str): - if initializer.lower() == 'gaussian': - random_features_initializer = initializers.RandomNormal(stddev=1.0) - elif initializer.lower() == 'laplacian': - random_features_initializer = initializers.Constant( - _get_cauchy_samples(loc=0.0, scale=1.0, shape=shape)) + random_features_initializer = initializer + if isinstance(initializer, str): + if initializer.lower() == "gaussian": + random_features_initializer = initializers.RandomNormal(stddev=1.0) + elif initializer.lower() == "laplacian": + random_features_initializer = initializers.Constant( + _get_cauchy_samples(loc=0.0, scale=1.0, shape=shape) + ) - else: - raise ValueError( - f'Unsupported `kernel_initializer`: "{initializer}" ' - f'Expected one of: {_SUPPORTED_RBF_KERNEL_TYPES}') - return random_features_initializer + else: + raise ValueError( + f'Unsupported `kernel_initializer`: "{initializer}" ' + f"Expected one of: 
{_SUPPORTED_RBF_KERNEL_TYPES}" + ) + return random_features_initializer def _get_default_scale(initializer, input_dim): - if (isinstance(initializer, str) and - initializer.lower() == 'gaussian'): - return np.sqrt(input_dim / 2.0) - return 1.0 + if isinstance(initializer, str) and initializer.lower() == "gaussian": + return np.sqrt(input_dim / 2.0) + return 1.0 diff --git a/keras/layers/kernelized_test.py b/keras/layers/kernelized_test.py index 5f48d9864f75..33835ccd5faf 100644 --- a/keras/layers/kernelized_test.py +++ b/keras/layers/kernelized_test.py @@ -14,372 +14,440 @@ # ============================================================================== """Tests for kernelized.py.""" -import tensorflow.compat.v2 as tf - import functools import math import os import shutil -from absl.testing import parameterized import numpy as np -from tensorflow.python.framework import test_util as tf_test_utils +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras import backend as keras_backend -from keras.testing_infra import test_combinations from keras import initializers -from keras.testing_infra import test_utils from keras.engine import base_layer_utils from keras.engine import input_layer from keras.engine import training from keras.layers import kernelized as kernel_layers -from keras.saving import save +from keras.saving.legacy import save +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils from keras.utils import kernelized_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + def _exact_gaussian(stddev): - return functools.partial( - kernelized_utils.exact_gaussian_kernel, stddev=stddev) + return functools.partial( + kernelized_utils.exact_gaussian_kernel, stddev=stddev + ) def _exact_laplacian(stddev): - return functools.partial( - kernelized_utils.exact_laplacian_kernel, stddev=stddev) + return functools.partial( + kernelized_utils.exact_laplacian_kernel, stddev=stddev + ) -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class RandomFourierFeaturesTest(tf.test.TestCase, parameterized.TestCase): - - def _assert_all_close(self, expected, actual, atol=0.001): - if not tf.executing_eagerly(): - with self.cached_session() as sess: - keras_backend._initialize_variables(sess) - self.assertAllClose(expected, actual, atol=atol) - else: - self.assertAllClose(expected, actual, atol=atol) - - @test_utils.run_v2_only - def test_state_saving_and_loading(self): - with self.cached_session(): - input_data = np.random.random((1, 2)) - rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0) - inputs = input_layer.Input((2,)) - outputs = rff_layer(inputs) - model = training.Model(inputs, outputs) - output_data = model.predict(input_data) - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir) - saved_model_dir = os.path.join(temp_dir, 'rff_model') - model.save(saved_model_dir) - new_model = save.load_model(saved_model_dir) - new_output_data = new_model.predict(input_data) - self.assertAllClose(output_data, new_output_data, atol=1e-4) - - def test_invalid_output_dim(self): - with self.assertRaisesRegex( - ValueError, '`output_dim` should be a positive integer'): - _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0) - - def test_unsupported_kernel_type(self): - with self.assertRaisesRegex( - ValueError, 'Unsupported 
`kernel_initializer`'): - _ = kernel_layers.RandomFourierFeatures( - 3, 'unsupported_kernel', stddev=2.0) - - def test_invalid_scale(self): - with self.assertRaisesRegex( - ValueError, - 'When provided, `scale` should be a positive float'): - _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0) - - def test_invalid_input_shape(self): - inputs = tf.random.uniform((3, 2, 4), seed=1) - rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0) - with self.assertRaisesRegex( - ValueError, - 'The rank of the input tensor should be 2'): - _ = rff_layer(inputs) - - @parameterized.named_parameters( - ('gaussian', 'gaussian', 10.0, False), - ('random', tf.compat.v1.random_uniform_initializer, 1.0, True)) - def test_random_features_properties(self, initializer, scale, trainable): - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=10, - kernel_initializer=initializer, - scale=scale, - trainable=trainable) - self.assertEqual(rff_layer.output_dim, 10) - self.assertEqual(rff_layer.kernel_initializer, initializer) - self.assertEqual(rff_layer.scale, scale) - self.assertEqual(rff_layer.trainable, trainable) - - @parameterized.named_parameters(('gaussian', 'gaussian', False), - ('laplacian', 'laplacian', True), - ('other', tf.compat.v1.ones_initializer, True)) - def test_call(self, initializer, trainable): - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=10, - kernel_initializer=initializer, - scale=1.0, - trainable=trainable, - name='random_fourier_features') - inputs = tf.random.uniform((3, 2), seed=1) - outputs = rff_layer(inputs) - self.assertListEqual([3, 10], outputs.shape.as_list()) - num_trainable_vars = 1 if trainable else 0 - self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars) - - @tf_test_utils.assert_no_new_pyobjects_executing_eagerly - def test_no_eager_Leak(self): - # Tests that repeatedly constructing and building a Layer does not leak - # Python objects. 
- inputs = tf.random.uniform((5, 4), seed=1) - kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs) - kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs) - - def test_output_shape(self): - inputs = tf.random.uniform((3, 2), seed=1) - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=7, name='random_fourier_features', trainable=True) - outputs = rff_layer(inputs) - self.assertEqual([3, 7], outputs.shape.as_list()) - - @parameterized.named_parameters( - ('gaussian', 'gaussian'), ('laplacian', 'laplacian'), - ('other', tf.compat.v1.random_uniform_initializer)) - def test_call_on_placeholder(self, initializer): - with tf.Graph().as_default(): - inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, None]) - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=5, - kernel_initializer=initializer, - name='random_fourier_features') - with self.assertRaisesRegex( - ValueError, - 'The last dimension of the input tensor should be defined'): - rff_layer(inputs) - - inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[2, None]) - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=5, - kernel_initializer=initializer, - name='random_fourier_features') - with self.assertRaisesRegex( - ValueError, - 'The last dimension of the input tensor should be defined'): - rff_layer(inputs) - - inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 3]) - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=5, name='random_fourier_features') - rff_layer(inputs) - - @parameterized.named_parameters(('gaussian', 10, 'gaussian', 2.0), - ('laplacian', 5, 'laplacian', None), - ('other', 10, tf.compat.v1.ones_initializer, 1.0)) - def test_compute_output_shape(self, output_dim, initializer, scale): - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim, initializer, scale=scale, name='rff') - with self.assertRaises(ValueError): - rff_layer.compute_output_shape(tf.TensorShape(None)) - with self.assertRaises(ValueError): - rff_layer.compute_output_shape(tf.TensorShape([])) - with self.assertRaises(ValueError): - rff_layer.compute_output_shape(tf.TensorShape([3])) - with self.assertRaises(ValueError): - rff_layer.compute_output_shape(tf.TensorShape([3, 2, 3])) - - with self.assertRaisesRegex( - ValueError, 'The last dimension of the input tensor should be defined'): - rff_layer.compute_output_shape(tf.TensorShape([3, None])) - - self.assertEqual([None, output_dim], - rff_layer.compute_output_shape((None, 3)).as_list()) - self.assertEqual([None, output_dim], - rff_layer.compute_output_shape( - tf.TensorShape([None, 2])).as_list()) - self.assertEqual([4, output_dim], - rff_layer.compute_output_shape((4, 1)).as_list()) - - @parameterized.named_parameters( - ('gaussian', 10, 'gaussian', 3.0, False), - ('laplacian', 5, 'laplacian', 5.5, True), - ('other', 7, tf.compat.v1.random_uniform_initializer(), None, True)) - def test_get_config(self, output_dim, initializer, scale, trainable): - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim, - initializer, - scale=scale, - trainable=trainable, - name='random_fourier_features', + def _assert_all_close(self, expected, actual, atol=0.001): + if not tf.executing_eagerly(): + with self.cached_session() as sess: + keras_backend._initialize_variables(sess) + self.assertAllClose(expected, actual, atol=atol) + else: + self.assertAllClose(expected, actual, atol=atol) + + @test_utils.run_v2_only + def test_state_saving_and_loading(self): + with self.cached_session(): + input_data 
= np.random.random((1, 2)) + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=10, scale=3.0 + ) + inputs = input_layer.Input((2,)) + outputs = rff_layer(inputs) + model = training.Model(inputs, outputs) + output_data = model.predict(input_data) + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir) + saved_model_dir = os.path.join(temp_dir, "rff_model") + model.save(saved_model_dir) + new_model = save.load_model(saved_model_dir) + new_output_data = new_model.predict(input_data) + self.assertAllClose(output_data, new_output_data, atol=1e-4) + + def test_invalid_output_dim(self): + with self.assertRaisesRegex( + ValueError, "`output_dim` should be a positive integer" + ): + _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0) + + def test_unsupported_kernel_type(self): + with self.assertRaisesRegex( + ValueError, "Unsupported `kernel_initializer`" + ): + _ = kernel_layers.RandomFourierFeatures( + 3, "unsupported_kernel", stddev=2.0 + ) + + def test_invalid_scale(self): + with self.assertRaisesRegex( + ValueError, "When provided, `scale` should be a positive float" + ): + _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0) + + def test_invalid_input_shape(self): + inputs = tf.random.uniform((3, 2, 4), seed=1) + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=10, scale=3.0 + ) + with self.assertRaisesRegex( + ValueError, "The rank of the input tensor should be 2" + ): + _ = rff_layer(inputs) + + @parameterized.named_parameters( + ("gaussian", "gaussian", 10.0, False), + ("random", tf.compat.v1.random_uniform_initializer, 1.0, True), + ) + def test_random_features_properties(self, initializer, scale, trainable): + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=10, + kernel_initializer=initializer, + scale=scale, + trainable=trainable, + ) + self.assertEqual(rff_layer.output_dim, 10) + self.assertEqual(rff_layer.kernel_initializer, initializer) + self.assertEqual(rff_layer.scale, scale) + self.assertEqual(rff_layer.trainable, trainable) + + @parameterized.named_parameters( + ("gaussian", "gaussian", False), + ("laplacian", "laplacian", True), + ("other", tf.compat.v1.ones_initializer, True), + ) + def test_call(self, initializer, trainable): + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=10, + kernel_initializer=initializer, + scale=1.0, + trainable=trainable, + name="random_fourier_features", + ) + inputs = tf.random.uniform((3, 2), seed=1) + outputs = rff_layer(inputs) + self.assertListEqual([3, 10], outputs.shape.as_list()) + num_trainable_vars = 1 if trainable else 0 + self.assertLen( + rff_layer.non_trainable_variables, 3 - num_trainable_vars + ) + + @tf_test_utils.assert_no_new_pyobjects_executing_eagerly + def test_no_eager_Leak(self): + # Tests that repeatedly constructing and building a Layer does not leak + # Python objects. 
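The `assert_no_new_pyobjects_executing_eagerly` decorator above enforces that the construct-and-call loop in this test does not grow the Python heap. The same property can be probed by hand with `weakref`; a minimal sketch under eager execution, using the public `tf.keras.layers.experimental.RandomFourierFeatures` export rather than the in-repo module (whether the final check passes can depend on TF version and global tracking state):

```python
import gc
import weakref

import tensorflow as tf

# Build and call a layer, drop the last reference, and check that
# nothing keeps it alive (the intent behind the no-leak assertion).
layer = tf.keras.layers.experimental.RandomFourierFeatures(output_dim=4)
layer(tf.random.uniform((5, 4), seed=1))  # triggers build() and weights
ref = weakref.ref(layer)
del layer
gc.collect()
print(ref() is None)  # True if the layer was garbage-collected
```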
+ inputs = tf.random.uniform((5, 4), seed=1) + kernel_layers.RandomFourierFeatures(output_dim=4, name="rff")(inputs) + kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs) + + def test_output_shape(self): + inputs = tf.random.uniform((3, 2), seed=1) + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=7, name="random_fourier_features", trainable=True + ) + outputs = rff_layer(inputs) + self.assertEqual([3, 7], outputs.shape.as_list()) + + @parameterized.named_parameters( + ("gaussian", "gaussian"), + ("laplacian", "laplacian"), + ("other", tf.compat.v1.random_uniform_initializer), + ) + def test_call_on_placeholder(self, initializer): + with tf.Graph().as_default(): + inputs = tf.compat.v1.placeholder( + dtype=tf.float32, shape=[None, None] + ) + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=5, + kernel_initializer=initializer, + name="random_fourier_features", + ) + with self.assertRaisesRegex( + ValueError, + "The last dimension of the input tensor should be defined", + ): + rff_layer(inputs) + + inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[2, None]) + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=5, + kernel_initializer=initializer, + name="random_fourier_features", + ) + with self.assertRaisesRegex( + ValueError, + "The last dimension of the input tensor should be defined", + ): + rff_layer(inputs) + + inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 3]) + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=5, name="random_fourier_features" + ) + rff_layer(inputs) + + @parameterized.named_parameters( + ("gaussian", 10, "gaussian", 2.0), + ("laplacian", 5, "laplacian", None), + ("other", 10, tf.compat.v1.ones_initializer, 1.0), + ) + def test_compute_output_shape(self, output_dim, initializer, scale): + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim, initializer, scale=scale, name="rff" + ) + with self.assertRaises(ValueError): + rff_layer.compute_output_shape(tf.TensorShape(None)) + with self.assertRaises(ValueError): + rff_layer.compute_output_shape(tf.TensorShape([])) + with self.assertRaises(ValueError): + rff_layer.compute_output_shape(tf.TensorShape([3])) + with self.assertRaises(ValueError): + rff_layer.compute_output_shape(tf.TensorShape([3, 2, 3])) + + with self.assertRaisesRegex( + ValueError, + "The last dimension of the input tensor should be defined", + ): + rff_layer.compute_output_shape(tf.TensorShape([3, None])) + + self.assertEqual( + [None, output_dim], + rff_layer.compute_output_shape((None, 3)).as_list(), + ) + self.assertEqual( + [None, output_dim], + rff_layer.compute_output_shape(tf.TensorShape([None, 2])).as_list(), + ) + self.assertEqual( + [4, output_dim], rff_layer.compute_output_shape((4, 1)).as_list() + ) + + @parameterized.named_parameters( + ("gaussian", 10, "gaussian", 3.0, False), + ("laplacian", 5, "laplacian", 5.5, True), + ("other", 7, tf.compat.v1.random_uniform_initializer(), None, True), + ) + def test_get_config(self, output_dim, initializer, scale, trainable): + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim, + initializer, + scale=scale, + trainable=trainable, + name="random_fourier_features", + ) + expected_initializer = initializer + if not isinstance(initializer, str): + expected_initializer = initializers.serialize(initializer) + + expected_dtype = ( + "float32" if base_layer_utils.v2_dtype_behavior_enabled() else None + ) + expected_config = { + "output_dim": output_dim, + "kernel_initializer": 
expected_initializer, + "scale": scale, + "name": "random_fourier_features", + "trainable": trainable, + "dtype": expected_dtype, + } + self.assertLen(expected_config, len(rff_layer.get_config())) + self.assertSameElements( + list(expected_config.items()), list(rff_layer.get_config().items()) + ) + + @parameterized.named_parameters( + ("gaussian", 5, "gaussian", None, True), + ("laplacian", 5, "laplacian", 5.5, False), + ("other", 7, tf.compat.v1.ones_initializer(), 2.0, True), + ) + def test_from_config(self, output_dim, initializer, scale, trainable): + model_config = { + "output_dim": output_dim, + "kernel_initializer": initializer, + "scale": scale, + "trainable": trainable, + "name": "random_fourier_features", + } + rff_layer = kernel_layers.RandomFourierFeatures.from_config( + model_config + ) + self.assertEqual(rff_layer.output_dim, output_dim) + self.assertEqual(rff_layer.kernel_initializer, initializer) + self.assertEqual(rff_layer.scale, scale) + self.assertEqual(rff_layer.trainable, trainable) + + inputs = tf.random.uniform((3, 2), seed=1) + outputs = rff_layer(inputs) + self.assertListEqual([3, output_dim], outputs.shape.as_list()) + num_trainable_vars = 1 if trainable else 0 + self.assertLen(rff_layer.trainable_variables, num_trainable_vars) + if trainable: + self.assertEqual( + "random_fourier_features/kernel_scale:0", + rff_layer.trainable_variables[0].name, + ) + self.assertLen( + rff_layer.non_trainable_variables, 3 - num_trainable_vars + ) + + @parameterized.named_parameters( + ("gaussian", 10, "gaussian", 3.0, True), + ("laplacian", 5, "laplacian", 5.5, False), + ("other", 10, tf.compat.v1.random_uniform_initializer(), None, True), + ) + def test_same_random_features_params_reused( + self, output_dim, initializer, scale, trainable + ): + """Applying the layer on the same input twice gives the same output.""" + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=output_dim, + kernel_initializer=initializer, + scale=scale, + trainable=trainable, + name="random_fourier_features", + ) + inputs = tf.constant(np.random.uniform(low=-1.0, high=1.0, size=(2, 4))) + output1 = rff_layer(inputs) + output2 = rff_layer(inputs) + self._assert_all_close(output1, output2) + + @parameterized.named_parameters( + ("gaussian", "gaussian", 5.0), + ("laplacian", "laplacian", 3.0), + ("other", tf.compat.v1.random_uniform_initializer(), 5.0), + ) + def test_different_params_similar_approximation(self, initializer, scale): + tf.compat.v1.set_random_seed(12345) + rff_layer1 = kernel_layers.RandomFourierFeatures( + output_dim=3000, + kernel_initializer=initializer, + scale=scale, + name="rff1", + ) + rff_layer2 = kernel_layers.RandomFourierFeatures( + output_dim=2000, + kernel_initializer=initializer, + scale=scale, + name="rff2", + ) + # Two distinct inputs. + x = tf.constant([[1.0, -1.0, 0.5]]) + y = tf.constant([[-1.0, 1.0, 1.0]]) + + # Apply both layers to both inputs. + output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1(x) + output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1(y) + output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2(x) + output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2(y) + + # Compute the inner products of the outputs (on inputs x and y) for both + # layers. For any fixed random features layer rff_layer, and inputs x, + # y, rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization + # factor. 
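The comment above states the contract this whole test file leans on: for a fixed draw of random features, `rff_layer(x)^T * rff_layer(y)` estimates `K(x, y)` once rescaled by `2 / output_dim`. A self-contained sketch of that identity, assuming the public `tf.keras.layers.experimental.RandomFourierFeatures` export for illustration:

```python
import math

import numpy as np
import tensorflow as tf

stddev = 5.0
D = 4000  # output_dim; larger D gives a tighter approximation

x = np.array([[1.0, -1.0, 0.5]], dtype=np.float32)
y = np.array([[-1.0, 1.0, 1.0]], dtype=np.float32)

rff = tf.keras.layers.experimental.RandomFourierFeatures(
    output_dim=D, kernel_initializer="gaussian", scale=stddev
)

# phi(x)^T phi(y), with each vector rescaled by sqrt(2/D), carries a
# total factor of 2/D and estimates K(x, y).
phi_x = math.sqrt(2.0 / D) * rff(x)
phi_y = math.sqrt(2.0 / D) * rff(y)
approx = float(tf.reduce_sum(phi_x * phi_y))

# Exact RBF kernel: K(x, y) = exp(-||x - y||^2 / (2 * stddev^2)).
exact = float(np.exp(-np.sum((x - y) ** 2) / (2.0 * stddev**2)))
print(approx, exact)  # typically agree to within a few percent at D=4000
```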
+ approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1) + approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2) + self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08) + + @parameterized.named_parameters( + ("gaussian", "gaussian", 5.0, _exact_gaussian(stddev=5.0)), + ("laplacian", "laplacian", 20.0, _exact_laplacian(stddev=20.0)), + ) + def test_bad_kernel_approximation( + self, initializer, scale, exact_kernel_fn + ): + """Approximation is bad when output dimension is small.""" + # Two distinct inputs. + x = tf.constant([[1.0, -1.0, 0.5]]) + y = tf.constant([[-1.0, 1.0, 1.0]]) + + small_output_dim = 10 + tf.compat.v1.set_random_seed(1234) + # Initialize layer. + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=small_output_dim, + kernel_initializer=initializer, + scale=scale, + name="random_fourier_features", + ) + + # Apply layer to both inputs. + output_x = math.sqrt(2.0 / small_output_dim) * rff_layer(x) + output_y = math.sqrt(2.0 / small_output_dim) * rff_layer(y) + + # The inner products of the outputs (on inputs x and y) approximates the + # real value of the RBF kernel but poorly since the output dimension of + # the layer is small. + exact_kernel_value = exact_kernel_fn(x, y) + approx_kernel_value = kernelized_utils.inner_product(output_x, output_y) + abs_error = tf.abs(exact_kernel_value - approx_kernel_value) + if not tf.executing_eagerly(): + with self.cached_session() as sess: + keras_backend._initialize_variables(sess) + abs_error_eval = sess.run([abs_error]) + self.assertGreater(abs_error_eval[0][0], 0.01) + self.assertLess(abs_error_eval[0][0], 0.5) + else: + self.assertGreater(abs_error, 0.01) + self.assertLess(abs_error, 0.5) + + @parameterized.named_parameters( + ("gaussian", "gaussian", 5.0, _exact_gaussian(stddev=5.0)), + ("laplacian", "laplacian", 10.0, _exact_laplacian(stddev=10.0)), ) - expected_initializer = initializer - if not isinstance(initializer, str): - expected_initializer = initializers.serialize(initializer) - - expected_dtype = ( - 'float32' if base_layer_utils.v2_dtype_behavior_enabled() else None) - expected_config = { - 'output_dim': output_dim, - 'kernel_initializer': expected_initializer, - 'scale': scale, - 'name': 'random_fourier_features', - 'trainable': trainable, - 'dtype': expected_dtype, - } - self.assertLen(expected_config, len(rff_layer.get_config())) - self.assertSameElements( - list(expected_config.items()), list(rff_layer.get_config().items())) - - @parameterized.named_parameters( - ('gaussian', 5, 'gaussian', None, True), - ('laplacian', 5, 'laplacian', 5.5, False), - ('other', 7, tf.compat.v1.ones_initializer(), 2.0, True)) - def test_from_config(self, output_dim, initializer, scale, trainable): - model_config = { - 'output_dim': output_dim, - 'kernel_initializer': initializer, - 'scale': scale, - 'trainable': trainable, - 'name': 'random_fourier_features', - } - rff_layer = kernel_layers.RandomFourierFeatures.from_config(model_config) - self.assertEqual(rff_layer.output_dim, output_dim) - self.assertEqual(rff_layer.kernel_initializer, initializer) - self.assertEqual(rff_layer.scale, scale) - self.assertEqual(rff_layer.trainable, trainable) - - inputs = tf.random.uniform((3, 2), seed=1) - outputs = rff_layer(inputs) - self.assertListEqual([3, output_dim], outputs.shape.as_list()) - num_trainable_vars = 1 if trainable else 0 - self.assertLen(rff_layer.trainable_variables, num_trainable_vars) - if trainable: - self.assertEqual('random_fourier_features/kernel_scale:0', - 
rff_layer.trainable_variables[0].name) - self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars) - - @parameterized.named_parameters( - ('gaussian', 10, 'gaussian', 3.0, True), - ('laplacian', 5, 'laplacian', 5.5, False), - ('other', 10, tf.compat.v1.random_uniform_initializer(), None, True)) - def test_same_random_features_params_reused(self, output_dim, initializer, - scale, trainable): - """Applying the layer on the same input twice gives the same output.""" - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=output_dim, - kernel_initializer=initializer, - scale=scale, - trainable=trainable, - name='random_fourier_features') - inputs = tf.constant( - np.random.uniform(low=-1.0, high=1.0, size=(2, 4))) - output1 = rff_layer(inputs) - output2 = rff_layer(inputs) - self._assert_all_close(output1, output2) - - @parameterized.named_parameters( - ('gaussian', 'gaussian', 5.0), ('laplacian', 'laplacian', 3.0), - ('other', tf.compat.v1.random_uniform_initializer(), 5.0)) - def test_different_params_similar_approximation(self, initializer, scale): - tf.compat.v1.set_random_seed(12345) - rff_layer1 = kernel_layers.RandomFourierFeatures( - output_dim=3000, - kernel_initializer=initializer, - scale=scale, - name='rff1') - rff_layer2 = kernel_layers.RandomFourierFeatures( - output_dim=2000, - kernel_initializer=initializer, - scale=scale, - name='rff2') - # Two distinct inputs. - x = tf.constant([[1.0, -1.0, 0.5]]) - y = tf.constant([[-1.0, 1.0, 1.0]]) - - # Apply both layers to both inputs. - output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1(x) - output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1(y) - output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2(x) - output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2(y) - - # Compute the inner products of the outputs (on inputs x and y) for both - # layers. For any fixed random features layer rff_layer, and inputs x, y, - # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor. - approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1) - approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2) - self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08) - - @parameterized.named_parameters( - ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)), - ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0))) - def test_bad_kernel_approximation(self, initializer, scale, exact_kernel_fn): - """Approximation is bad when output dimension is small.""" - # Two distinct inputs. - x = tf.constant([[1.0, -1.0, 0.5]]) - y = tf.constant([[-1.0, 1.0, 1.0]]) - - small_output_dim = 10 - tf.compat.v1.set_random_seed(1234) - # Initialize layer. - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=small_output_dim, - kernel_initializer=initializer, - scale=scale, - name='random_fourier_features') - - # Apply layer to both inputs. - output_x = math.sqrt(2.0 / small_output_dim) * rff_layer(x) - output_y = math.sqrt(2.0 / small_output_dim) * rff_layer(y) - - # The inner products of the outputs (on inputs x and y) approximates the - # real value of the RBF kernel but poorly since the output dimension of the - # layer is small. 
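The comment above gives the reason the approximation degrades at small output dimension: each random feature is one Monte Carlo sample of Bochner's integral for the RBF kernel, so the estimate's error shrinks only like `1/sqrt(output_dim)`. A minimal NumPy-only sketch of that rate:

```python
import numpy as np

rng = np.random.default_rng(0)
stddev = 5.0
x = np.array([1.0, -1.0, 0.5])
y = np.array([-1.0, 1.0, 1.0])
exact = np.exp(-np.sum((x - y) ** 2) / (2.0 * stddev**2))

for d in (10, 100, 1000, 10000):
    # One random feature = one Monte Carlo sample: w is drawn from the
    # Gaussian kernel's spectral measure N(0, I / stddev^2).
    w = rng.normal(scale=1.0 / stddev, size=(d, 3))
    b = rng.uniform(0.0, 2.0 * np.pi, size=d)
    phi_x = np.sqrt(2.0 / d) * np.cos(w @ x + b)
    phi_y = np.sqrt(2.0 / d) * np.cos(w @ y + b)
    print(d, abs(phi_x @ phi_y - exact))  # shrinks roughly as 1/sqrt(d)
```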
- exact_kernel_value = exact_kernel_fn(x, y) - approx_kernel_value = kernelized_utils.inner_product(output_x, output_y) - abs_error = tf.abs(exact_kernel_value - approx_kernel_value) - if not tf.executing_eagerly(): - with self.cached_session() as sess: - keras_backend._initialize_variables(sess) - abs_error_eval = sess.run([abs_error]) - self.assertGreater(abs_error_eval[0][0], 0.01) - self.assertLess(abs_error_eval[0][0], 0.5) - else: - self.assertGreater(abs_error, 0.01) - self.assertLess(abs_error, 0.5) - - @parameterized.named_parameters( - ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)), - ('laplacian', 'laplacian', 10.0, _exact_laplacian(stddev=10.0))) - def test_good_kernel_approximation_multiple_inputs(self, initializer, scale, - exact_kernel_fn): - # Parameters. - input_dim = 5 - output_dim = 2000 - x_rows = 20 - y_rows = 30 - - x = tf.constant( - np.random.uniform(size=(x_rows, input_dim)), dtype=tf.float32) - y = tf.constant( - np.random.uniform(size=(y_rows, input_dim)), dtype=tf.float32) - - tf.compat.v1.set_random_seed(1234) - rff_layer = kernel_layers.RandomFourierFeatures( - output_dim=output_dim, - kernel_initializer=initializer, - scale=scale, - name='random_fourier_features') - - # The shapes of output_x and output_y are (x_rows, output_dim) and - # (y_rows, output_dim) respectively. - output_x = math.sqrt(2.0 / output_dim) * rff_layer(x) - output_y = math.sqrt(2.0 / output_dim) * rff_layer(y) - - approx_kernel_matrix = kernelized_utils.inner_product(output_x, output_y) - exact_kernel_matrix = exact_kernel_fn(x, y) - self._assert_all_close(approx_kernel_matrix, exact_kernel_matrix, atol=0.05) - - -if __name__ == '__main__': - tf.test.main() + def test_good_kernel_approximation_multiple_inputs( + self, initializer, scale, exact_kernel_fn + ): + # Parameters. + input_dim = 5 + output_dim = 2000 + x_rows = 20 + y_rows = 30 + + x = tf.constant( + np.random.uniform(size=(x_rows, input_dim)), dtype=tf.float32 + ) + y = tf.constant( + np.random.uniform(size=(y_rows, input_dim)), dtype=tf.float32 + ) + + tf.compat.v1.set_random_seed(1234) + rff_layer = kernel_layers.RandomFourierFeatures( + output_dim=output_dim, + kernel_initializer=initializer, + scale=scale, + name="random_fourier_features", + ) + + # The shapes of output_x and output_y are (x_rows, output_dim) and + # (y_rows, output_dim) respectively. + output_x = math.sqrt(2.0 / output_dim) * rff_layer(x) + output_y = math.sqrt(2.0 / output_dim) * rff_layer(y) + + approx_kernel_matrix = kernelized_utils.inner_product( + output_x, output_y + ) + exact_kernel_matrix = exact_kernel_fn(x, y) + self._assert_all_close( + approx_kernel_matrix, exact_kernel_matrix, atol=0.05 + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/layers_test.py b/keras/layers/layers_test.py index b618925a0894..1072f5948994 100644 --- a/keras/layers/layers_test.py +++ b/keras/layers/layers_test.py @@ -12,24 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -# pylint: disable=g-classes-have-attributes + """Tests for layers.__init__.""" -from keras import layers import tensorflow.compat.v2 as tf +from keras import layers -class LayersTest(tf.test.TestCase): - def test_keras_private_symbol(self): - normalization_parent = layers.BatchNormalization.__module__.split('.')[-1] - if tf.__internal__.tf2.enabled(): - self.assertEqual('batch_normalization', normalization_parent) - self.assertTrue(layers.BatchNormalization._USE_V2_BEHAVIOR) - else: - self.assertEqual('batch_normalization_v1', normalization_parent) - self.assertFalse(layers.BatchNormalization._USE_V2_BEHAVIOR) +class LayersTest(tf.test.TestCase): + def test_keras_private_symbol(self): + normalization_parent = layers.BatchNormalization.__module__.split(".")[ + -1 + ] + if tf.__internal__.tf2.enabled(): + self.assertEqual("batch_normalization", normalization_parent) + self.assertTrue(layers.BatchNormalization._USE_V2_BEHAVIOR) + else: + self.assertEqual("batch_normalization_v1", normalization_parent) + self.assertFalse(layers.BatchNormalization._USE_V2_BEHAVIOR) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/locally_connected/BUILD b/keras/layers/locally_connected/BUILD index c93785b661ed..e6ee324c60eb 100644 --- a/keras/layers/locally_connected/BUILD +++ b/keras/layers/locally_connected/BUILD @@ -1,9 +1,11 @@ # Description: # Contains the Keras locally-connected layers. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], @@ -82,7 +84,7 @@ tf_py_test( "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", "//keras/testing_infra:test_utils", ], diff --git a/keras/layers/locally_connected/__init__.py b/keras/layers/locally_connected/__init__.py index 6d424d65c177..9dbd20b3522b 100644 --- a/keras/layers/locally_connected/__init__.py +++ b/keras/layers/locally_connected/__init__.py @@ -14,5 +14,9 @@ # ============================================================================== """Keras locally-connected layers.""" -from keras.layers.locally_connected.locally_connected1d import LocallyConnected1D -from keras.layers.locally_connected.locally_connected2d import LocallyConnected2D +from keras.layers.locally_connected.locally_connected1d import ( + LocallyConnected1D, +) +from keras.layers.locally_connected.locally_connected2d import ( + LocallyConnected2D, +) diff --git a/keras/layers/locally_connected/locally_connected1d.py b/keras/layers/locally_connected/locally_connected1d.py index ddc651e6eca6..32fe80fee560 100644 --- a/keras/layers/locally_connected/locally_connected1d.py +++ b/keras/layers/locally_connected/locally_connected1d.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + """Locally-connected layer for 1D input.""" from keras import activations @@ -26,308 +26,346 @@ from keras.utils import conv_utils from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.LocallyConnected1D') +@keras_export("keras.layers.LocallyConnected1D") class LocallyConnected1D(Layer): - """Locally-connected layer for 1D inputs. - - The `LocallyConnected1D` layer works similarly to - the `Conv1D` layer, except that weights are unshared, - that is, a different set of filters is applied at each different patch - of the input. - - Note: layer attributes cannot be modified after the layer has been called - once (except the `trainable` attribute). - - Example: - ```python - # apply a unshared weight convolution 1d of length 3 to a sequence with - # 10 timesteps, with 64 output filters - model = Sequential() - model.add(LocallyConnected1D(64, 3, input_shape=(10, 32))) - # now model.output_shape == (None, 8, 64) - # add a new conv1d on top - model.add(LocallyConnected1D(32, 3)) - # now model.output_shape == (None, 6, 32) - ``` - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of output filters in the convolution). - kernel_size: An integer or tuple/list of a single integer, specifying the - length of the 1D convolution window. - strides: An integer or tuple/list of a single integer, specifying the - stride length of the convolution. - padding: Currently only supports `"valid"` (case-insensitive). `"same"` - may be supported in the future. `"valid"` means no padding. - data_format: A string, one of `channels_last` (default) or - `channels_first`. The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape `(batch, length, - channels)` while `channels_first` corresponds to inputs with shape - `(batch, channels, length)`. It defaults to the `image_data_format` - value found in your Keras config file at `~/.keras/keras.json`. If you - never set it, then it will be "channels_last". - activation: Activation function to use. If you don't specify anything, no - activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation").. - kernel_constraint: Constraint function applied to the kernel matrix. - bias_constraint: Constraint function applied to the bias vector. - implementation: implementation mode, either `1`, `2`, or `3`. `1` loops - over input spatial locations to perform the forward pass. It is - memory-efficient but performs a lot of (small) ops. `2` stores layer - weights in a dense but sparsely-populated 2D matrix and implements the - forward pass as a single matrix-multiply. It uses a lot of RAM but - performs few (large) ops. `3` stores layer weights in a sparse tensor - and implements the forward pass as a single sparse matrix-multiply. 
- How to choose: - `1`: large, dense models, - `2`: small models, - `3`: large, sparse models, where "large" stands for large - input/output activations (i.e. many `filters`, `input_filters`, - large `input_size`, `output_size`), and "sparse" stands for few - connections between inputs and outputs, i.e. small ratio `filters * - input_filters * kernel_size / (input_size * strides)`, where inputs - to and outputs of the layer are assumed to have shapes `(input_size, - input_filters)`, `(output_size, filters)` respectively. It is - recommended to benchmark each in the setting of interest to pick the - most efficient one (in terms of speed and memory usage). Correct - choice of implementation can lead to dramatic speed improvements - (e.g. 50X), potentially at the expense of RAM. Also, only - `padding="valid"` is supported by `implementation=1`. - Input shape: - 3D tensor with shape: `(batch_size, steps, input_dim)` - Output shape: - 3D tensor with shape: `(batch_size, new_steps, filters)` `steps` value - might have changed due to padding or strides. - """ - - def __init__(self, - filters, - kernel_size, - strides=1, - padding='valid', - data_format=None, - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - implementation=1, - **kwargs): - super().__init__(**kwargs) - self.filters = filters - self.kernel_size = conv_utils.normalize_tuple(kernel_size, 1, 'kernel_size') - self.strides = conv_utils.normalize_tuple( - strides, 1, 'strides', allow_zero=True) - self.padding = conv_utils.normalize_padding(padding) - if self.padding != 'valid' and implementation == 1: - raise ValueError('Invalid border mode for LocallyConnected1D ' - '(only "valid" is supported if implementation is 1): ' + - padding) - self.data_format = conv_utils.normalize_data_format(data_format) - self.activation = activations.get(activation) - self.use_bias = use_bias - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.bias_constraint = constraints.get(bias_constraint) - self.implementation = implementation - self.input_spec = InputSpec(ndim=3) - - @property - def _use_input_spec_as_call_signature(self): - return False - - @tf_utils.shape_type_conversion - def build(self, input_shape): - if self.data_format == 'channels_first': - input_dim, input_length = input_shape[1], input_shape[2] - else: - input_dim, input_length = input_shape[2], input_shape[1] - - if input_dim is None: - raise ValueError( - 'Axis 2 of input should be fully-defined. ' - 'Found shape:', input_shape) - self.output_length = conv_utils.conv_output_length(input_length, - self.kernel_size[0], - self.padding, - self.strides[0]) - - if self.output_length <= 0: - raise ValueError( - f'One of the dimensions in the output is <= 0 ' - f'due to downsampling in {self.name}. Consider ' - f'increasing the input size. 
' - f'Received input shape {input_shape} which would produce ' - f'output shape with a zero or negative value in a ' - f'dimension.') - - if self.implementation == 1: - self.kernel_shape = (self.output_length, self.kernel_size[0] * input_dim, - self.filters) - - self.kernel = self.add_weight( - shape=self.kernel_shape, - initializer=self.kernel_initializer, - name='kernel', - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - elif self.implementation == 2: - if self.data_format == 'channels_first': - self.kernel_shape = (input_dim, input_length, self.filters, - self.output_length) - else: - self.kernel_shape = (input_length, input_dim, self.output_length, - self.filters) - - self.kernel = self.add_weight( - shape=self.kernel_shape, - initializer=self.kernel_initializer, - name='kernel', - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - self.kernel_mask = locally_connected_utils.get_locallyconnected_mask( - input_shape=(input_length,), - kernel_shape=self.kernel_size, - strides=self.strides, - padding=self.padding, - data_format=self.data_format, - ) - - elif self.implementation == 3: - self.kernel_shape = (self.output_length * self.filters, - input_length * input_dim) - - self.kernel_idxs = sorted( - conv_utils.conv_kernel_idxs( - input_shape=(input_length,), - kernel_shape=self.kernel_size, - strides=self.strides, - padding=self.padding, - filters_in=input_dim, - filters_out=self.filters, - data_format=self.data_format)) - - self.kernel = self.add_weight( - shape=(len(self.kernel_idxs),), - initializer=self.kernel_initializer, - name='kernel', - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - else: - raise ValueError('Unrecognized implementation mode: %d.' % - self.implementation) - - if self.use_bias: - self.bias = self.add_weight( - shape=(self.output_length, self.filters), - initializer=self.bias_initializer, - name='bias', - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - else: - self.bias = None - - if self.data_format == 'channels_first': - self.input_spec = InputSpec(ndim=3, axes={1: input_dim}) - else: - self.input_spec = InputSpec(ndim=3, axes={-1: input_dim}) - self.built = True - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - input_length = input_shape[2] - else: - input_length = input_shape[1] - - length = conv_utils.conv_output_length(input_length, self.kernel_size[0], - self.padding, self.strides[0]) - - if self.data_format == 'channels_first': - return (input_shape[0], self.filters, length) - elif self.data_format == 'channels_last': - return (input_shape[0], length, self.filters) - - def call(self, inputs): - if self.implementation == 1: - output = backend.local_conv( - inputs, self.kernel, self.kernel_size, self.strides, - (self.output_length,), self.data_format) - - elif self.implementation == 2: - output = locally_connected_utils.local_conv_matmul( - inputs, self.kernel, self.kernel_mask, - self.compute_output_shape(inputs.shape)) - - elif self.implementation == 3: - output = locally_connected_utils.local_conv_sparse_matmul( - inputs, self.kernel, self.kernel_idxs, self.kernel_shape, - self.compute_output_shape(inputs.shape)) - - else: - raise ValueError('Unrecognized implementation mode: %d.' 
% - self.implementation) - - if self.use_bias: - output = backend.bias_add(output, self.bias, data_format=self.data_format) - - output = self.activation(output) - return output - - def get_config(self): - config = { - 'filters': - self.filters, - 'kernel_size': - self.kernel_size, - 'strides': - self.strides, - 'padding': - self.padding, - 'data_format': - self.data_format, - 'activation': - activations.serialize(self.activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'implementation': - self.implementation - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Locally-connected layer for 1D inputs. + + The `LocallyConnected1D` layer works similarly to + the `Conv1D` layer, except that weights are unshared, + that is, a different set of filters is applied at each different patch + of the input. + + Note: layer attributes cannot be modified after the layer has been called + once (except the `trainable` attribute). + + Example: + ```python + # apply a unshared weight convolution 1d of length 3 to a sequence with + # 10 timesteps, with 64 output filters + model = Sequential() + model.add(LocallyConnected1D(64, 3, input_shape=(10, 32))) + # now model.output_shape == (None, 8, 64) + # add a new conv1d on top + model.add(LocallyConnected1D(32, 3)) + # now model.output_shape == (None, 6, 32) + ``` + + Args: + filters: Integer, the dimensionality of the output space (i.e. the + number of output filters in the convolution). + kernel_size: An integer or tuple/list of a single integer, specifying + the length of the 1D convolution window. + strides: An integer or tuple/list of a single integer, specifying the + stride length of the convolution. + padding: Currently only supports `"valid"` (case-insensitive). `"same"` + may be supported in the future. `"valid"` means no padding. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch, length, + channels)` while `channels_first` corresponds to inputs with shape + `(batch, channels, length)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + activation: Activation function to use. If you don't specify anything, + no activation is applied (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation").. + kernel_constraint: Constraint function applied to the kernel matrix. + bias_constraint: Constraint function applied to the bias vector. 
+ implementation: implementation mode, either `1`, `2`, or `3`. `1` loops + over input spatial locations to perform the forward pass. It is + memory-efficient but performs a lot of (small) ops. `2` stores layer + weights in a dense but sparsely-populated 2D matrix and implements the + forward pass as a single matrix-multiply. It uses a lot of RAM but + performs few (large) ops. `3` stores layer weights in a sparse tensor + and implements the forward pass as a single sparse matrix-multiply. + How to choose: + `1`: large, dense models, + `2`: small models, + `3`: large, sparse models, where "large" stands for large + input/output activations (i.e. many `filters`, `input_filters`, + large `input_size`, `output_size`), and "sparse" stands for few + connections between inputs and outputs, i.e. small ratio + `filters * input_filters * kernel_size / (input_size * strides)`, + where inputs to and outputs of the layer are assumed to have + shapes `(input_size, input_filters)`, `(output_size, filters)` + respectively. It is recommended to benchmark each in the setting + of interest to pick the most efficient one (in terms of speed and + memory usage). Correct choice of implementation can lead to + dramatic speed improvements (e.g. 50X), potentially at the expense + of RAM. Also, only `padding="valid"` is supported by + `implementation=1`. + Input shape: + 3D tensor with shape: `(batch_size, steps, input_dim)` + Output shape: + 3D tensor with shape: `(batch_size, new_steps, filters)` `steps` value + might have changed due to padding or strides. + """ + + def __init__( + self, + filters, + kernel_size, + strides=1, + padding="valid", + data_format=None, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + implementation=1, + **kwargs, + ): + super().__init__(**kwargs) + self.filters = filters + self.kernel_size = conv_utils.normalize_tuple( + kernel_size, 1, "kernel_size" + ) + self.strides = conv_utils.normalize_tuple( + strides, 1, "strides", allow_zero=True + ) + self.padding = conv_utils.normalize_padding(padding) + if self.padding != "valid" and implementation == 1: + raise ValueError( + "Invalid border mode for LocallyConnected1D " + '(only "valid" is supported if implementation is 1): ' + padding + ) + self.data_format = conv_utils.normalize_data_format(data_format) + self.activation = activations.get(activation) + self.use_bias = use_bias + self.kernel_initializer = initializers.get(kernel_initializer) + self.bias_initializer = initializers.get(bias_initializer) + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + self.activity_regularizer = regularizers.get(activity_regularizer) + self.kernel_constraint = constraints.get(kernel_constraint) + self.bias_constraint = constraints.get(bias_constraint) + self.implementation = implementation + self.input_spec = InputSpec(ndim=3) + + @property + def _use_input_spec_as_call_signature(self): + return False + + @tf_utils.shape_type_conversion + def build(self, input_shape): + if self.data_format == "channels_first": + input_dim, input_length = input_shape[1], input_shape[2] + else: + input_dim, input_length = input_shape[2], input_shape[1] + + if input_dim is None: + raise ValueError( + "Axis 2 of input should be fully-defined. 
Found shape:", + input_shape, + ) + self.output_length = conv_utils.conv_output_length( + input_length, self.kernel_size[0], self.padding, self.strides[0] + ) + + if self.output_length <= 0: + raise ValueError( + "One of the dimensions in the output is <= 0 " + f"due to downsampling in {self.name}. Consider " + "increasing the input size. " + f"Received input shape {input_shape} which would produce " + "output shape with a zero or negative value in a " + "dimension." + ) + + if self.implementation == 1: + self.kernel_shape = ( + self.output_length, + self.kernel_size[0] * input_dim, + self.filters, + ) + + self.kernel = self.add_weight( + shape=self.kernel_shape, + initializer=self.kernel_initializer, + name="kernel", + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + elif self.implementation == 2: + if self.data_format == "channels_first": + self.kernel_shape = ( + input_dim, + input_length, + self.filters, + self.output_length, + ) + else: + self.kernel_shape = ( + input_length, + input_dim, + self.output_length, + self.filters, + ) + + self.kernel = self.add_weight( + shape=self.kernel_shape, + initializer=self.kernel_initializer, + name="kernel", + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + self.kernel_mask = ( + locally_connected_utils.get_locallyconnected_mask( + input_shape=(input_length,), + kernel_shape=self.kernel_size, + strides=self.strides, + padding=self.padding, + data_format=self.data_format, + ) + ) + + elif self.implementation == 3: + self.kernel_shape = ( + self.output_length * self.filters, + input_length * input_dim, + ) + + self.kernel_idxs = sorted( + conv_utils.conv_kernel_idxs( + input_shape=(input_length,), + kernel_shape=self.kernel_size, + strides=self.strides, + padding=self.padding, + filters_in=input_dim, + filters_out=self.filters, + data_format=self.data_format, + ) + ) + + self.kernel = self.add_weight( + shape=(len(self.kernel_idxs),), + initializer=self.kernel_initializer, + name="kernel", + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + else: + raise ValueError( + "Unrecognized implementation mode: %d." 
% self.implementation + ) + + if self.use_bias: + self.bias = self.add_weight( + shape=(self.output_length, self.filters), + initializer=self.bias_initializer, + name="bias", + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + ) + else: + self.bias = None + + if self.data_format == "channels_first": + self.input_spec = InputSpec(ndim=3, axes={1: input_dim}) + else: + self.input_spec = InputSpec(ndim=3, axes={-1: input_dim}) + self.built = True + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if self.data_format == "channels_first": + input_length = input_shape[2] + else: + input_length = input_shape[1] + + length = conv_utils.conv_output_length( + input_length, self.kernel_size[0], self.padding, self.strides[0] + ) + + if self.data_format == "channels_first": + return (input_shape[0], self.filters, length) + elif self.data_format == "channels_last": + return (input_shape[0], length, self.filters) + + def call(self, inputs): + if self.implementation == 1: + output = backend.local_conv( + inputs, + self.kernel, + self.kernel_size, + self.strides, + (self.output_length,), + self.data_format, + ) + + elif self.implementation == 2: + output = locally_connected_utils.local_conv_matmul( + inputs, + self.kernel, + self.kernel_mask, + self.compute_output_shape(inputs.shape), + ) + + elif self.implementation == 3: + output = locally_connected_utils.local_conv_sparse_matmul( + inputs, + self.kernel, + self.kernel_idxs, + self.kernel_shape, + self.compute_output_shape(inputs.shape), + ) + + else: + raise ValueError( + "Unrecognized implementation mode: %d." % self.implementation + ) + + if self.use_bias: + output = backend.bias_add( + output, self.bias, data_format=self.data_format + ) + + output = self.activation(output) + return output + + def get_config(self): + config = { + "filters": self.filters, + "kernel_size": self.kernel_size, + "strides": self.strides, + "padding": self.padding, + "data_format": self.data_format, + "activation": activations.serialize(self.activation), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "bias_constraint": constraints.serialize(self.bias_constraint), + "implementation": self.implementation, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/locally_connected/locally_connected2d.py b/keras/layers/locally_connected/locally_connected2d.py index b67aba34795e..fce8c32e2ce4 100644 --- a/keras/layers/locally_connected/locally_connected2d.py +++ b/keras/layers/locally_connected/locally_connected2d.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
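The `get_config` implementation that closes the file above returns only plain-Python values (initializers, regularizers, and constraints are stored in serialized form), which is what lets `from_config` and model saving round-trip cleanly. A minimal sketch, assuming the public `tf.keras.layers.LocallyConnected1D` export:

```python
import tensorflow as tf

layer = tf.keras.layers.LocallyConnected1D(32, 3)
config = layer.get_config()  # a plain, serializer-friendly dict

# Because every entry is already deserializable, an equivalent layer
# can be rebuilt from the dict alone:
clone = tf.keras.layers.LocallyConnected1D.from_config(config)
assert clone.filters == layer.filters
assert clone.kernel_size == layer.kernel_size == (3,)
assert clone.padding == "valid"
```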
# ============================================================================== -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + """Locally-connected layer for 2D input.""" from keras import activations @@ -26,330 +26,375 @@ from keras.utils import conv_utils from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.LocallyConnected2D') +@keras_export("keras.layers.LocallyConnected2D") class LocallyConnected2D(Layer): - """Locally-connected layer for 2D inputs. - - The `LocallyConnected2D` layer works similarly - to the `Conv2D` layer, except that weights are unshared, - that is, a different set of filters is applied at each - different patch of the input. - - Note: layer attributes cannot be modified after the layer has been called - once (except the `trainable` attribute). - - Examples: - ```python - # apply a 3x3 unshared weights convolution with 64 output filters on a - 32x32 image - # with `data_format="channels_last"`: - model = Sequential() - model.add(LocallyConnected2D(64, (3, 3), input_shape=(32, 32, 3))) - # now model.output_shape == (None, 30, 30, 64) - # notice that this layer will consume (30*30)*(3*3*3*64) + (30*30)*64 - parameters - - # add a 3x3 unshared weights convolution on top, with 32 output filters: - model.add(LocallyConnected2D(32, (3, 3))) - # now model.output_shape == (None, 28, 28, 32) - ``` - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of output filters in the convolution). - kernel_size: An integer or tuple/list of 2 integers, specifying the width - and height of the 2D convolution window. Can be a single integer to - specify the same value for all spatial dimensions. - strides: An integer or tuple/list of 2 integers, specifying the strides of - the convolution along the width and height. Can be a single integer to - specify the same value for all spatial dimensions. - padding: Currently only support `"valid"` (case-insensitive). `"same"` - will be supported in future. `"valid"` means no padding. - data_format: A string, one of `channels_last` (default) or - `channels_first`. The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape `(batch, height, width, - channels)` while `channels_first` corresponds to inputs with shape - `(batch, channels, height, width)`. It defaults to the - `image_data_format` value found in your Keras config file at - `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". - activation: Activation function to use. If you don't specify anything, no - activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation"). - kernel_constraint: Constraint function applied to the kernel matrix. - bias_constraint: Constraint function applied to the bias vector. - implementation: implementation mode, either `1`, `2`, or `3`. `1` loops - over input spatial locations to perform the forward pass. It is - memory-efficient but performs a lot of (small) ops. 
`2` stores layer - weights in a dense but sparsely-populated 2D matrix and implements the - forward pass as a single matrix-multiply. It uses a lot of RAM but - performs few (large) ops. `3` stores layer weights in a sparse tensor - and implements the forward pass as a single sparse matrix-multiply. - How to choose: - `1`: large, dense models, - `2`: small models, - `3`: large, sparse models, where "large" stands for large - input/output activations (i.e. many `filters`, `input_filters`, - large `np.prod(input_size)`, `np.prod(output_size)`), and "sparse" - stands for few connections between inputs and outputs, i.e. small - ratio `filters * input_filters * np.prod(kernel_size) / - (np.prod(input_size) * np.prod(strides))`, where inputs to and - outputs of the layer are assumed to have shapes `input_size + - (input_filters,)`, `output_size + (filters,)` respectively. It is - recommended to benchmark each in the setting of interest to pick the - most efficient one (in terms of speed and memory usage). Correct - choice of implementation can lead to dramatic speed improvements - (e.g. 50X), potentially at the expense of RAM. Also, only - `padding="valid"` is supported by `implementation=1`. - Input shape: - 4D tensor with shape: `(samples, channels, rows, cols)` if - data_format='channels_first' - or 4D tensor with shape: `(samples, rows, cols, channels)` if - data_format='channels_last'. - Output shape: - 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if - data_format='channels_first' - or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if - data_format='channels_last'. `rows` and `cols` values might have changed - due to padding. - """ - - def __init__(self, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format=None, - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - implementation=1, - **kwargs): - super().__init__(**kwargs) - self.filters = filters - self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size') - self.strides = conv_utils.normalize_tuple( - strides, 2, 'strides', allow_zero=True) - self.padding = conv_utils.normalize_padding(padding) - if self.padding != 'valid' and implementation == 1: - raise ValueError('Invalid border mode for LocallyConnected2D ' - '(only "valid" is supported if implementation is 1): ' + - padding) - self.data_format = conv_utils.normalize_data_format(data_format) - self.activation = activations.get(activation) - self.use_bias = use_bias - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.bias_constraint = constraints.get(bias_constraint) - self.implementation = implementation - self.input_spec = InputSpec(ndim=4) - - @property - def _use_input_spec_as_call_signature(self): - return False - - @tf_utils.shape_type_conversion - def build(self, input_shape): - if self.data_format == 'channels_last': - input_row, input_col = input_shape[1:-1] - input_filter = input_shape[3] - else: - input_row, input_col = input_shape[2:] - input_filter = input_shape[1] - if input_row is None 
or input_col is None: - raise ValueError('The spatial dimensions of the inputs to ' - ' a LocallyConnected2D layer ' - 'should be fully-defined, but layer received ' - 'the inputs shape ' + str(input_shape)) - output_row = conv_utils.conv_output_length(input_row, self.kernel_size[0], - self.padding, self.strides[0]) - output_col = conv_utils.conv_output_length(input_col, self.kernel_size[1], - self.padding, self.strides[1]) - self.output_row = output_row - self.output_col = output_col - - if self.output_row <= 0 or self.output_col <= 0: - raise ValueError( - f'One of the dimensions in the output is <= 0 ' - f'due to downsampling in {self.name}. Consider ' - f'increasing the input size. ' - f'Received input shape {input_shape} which would produce ' - f'output shape with a zero or negative value in a ' - f'dimension.') - - if self.implementation == 1: - self.kernel_shape = (output_row * output_col, self.kernel_size[0] * - self.kernel_size[1] * input_filter, self.filters) - - self.kernel = self.add_weight( - shape=self.kernel_shape, - initializer=self.kernel_initializer, - name='kernel', - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - elif self.implementation == 2: - if self.data_format == 'channels_first': - self.kernel_shape = (input_filter, input_row, input_col, self.filters, - self.output_row, self.output_col) - else: - self.kernel_shape = (input_row, input_col, input_filter, - self.output_row, self.output_col, self.filters) - - self.kernel = self.add_weight( - shape=self.kernel_shape, - initializer=self.kernel_initializer, - name='kernel', - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - self.kernel_mask = locally_connected_utils.get_locallyconnected_mask( - input_shape=(input_row, input_col), - kernel_shape=self.kernel_size, - strides=self.strides, - padding=self.padding, - data_format=self.data_format, - ) - - elif self.implementation == 3: - self.kernel_shape = (self.output_row * self.output_col * self.filters, - input_row * input_col * input_filter) - - self.kernel_idxs = sorted( - conv_utils.conv_kernel_idxs( - input_shape=(input_row, input_col), - kernel_shape=self.kernel_size, - strides=self.strides, - padding=self.padding, - filters_in=input_filter, - filters_out=self.filters, - data_format=self.data_format)) - - self.kernel = self.add_weight( - shape=(len(self.kernel_idxs),), - initializer=self.kernel_initializer, - name='kernel', - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - else: - raise ValueError('Unrecognized implementation mode: %d.' 
% - self.implementation) - - if self.use_bias: - self.bias = self.add_weight( - shape=(output_row, output_col, self.filters), - initializer=self.bias_initializer, - name='bias', - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - else: - self.bias = None - if self.data_format == 'channels_first': - self.input_spec = InputSpec(ndim=4, axes={1: input_filter}) - else: - self.input_spec = InputSpec(ndim=4, axes={-1: input_filter}) - self.built = True - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - rows = input_shape[2] - cols = input_shape[3] - elif self.data_format == 'channels_last': - rows = input_shape[1] - cols = input_shape[2] - - rows = conv_utils.conv_output_length(rows, self.kernel_size[0], - self.padding, self.strides[0]) - cols = conv_utils.conv_output_length(cols, self.kernel_size[1], - self.padding, self.strides[1]) - - if self.data_format == 'channels_first': - return (input_shape[0], self.filters, rows, cols) - elif self.data_format == 'channels_last': - return (input_shape[0], rows, cols, self.filters) - - def call(self, inputs): - if self.implementation == 1: - output = backend.local_conv( - inputs, self.kernel, self.kernel_size, self.strides, - (self.output_row, self.output_col), - self.data_format) - - elif self.implementation == 2: - output = locally_connected_utils.local_conv_matmul( - inputs, self.kernel, self.kernel_mask, - self.compute_output_shape(inputs.shape)) - - elif self.implementation == 3: - output = locally_connected_utils.local_conv_sparse_matmul( - inputs, self.kernel, self.kernel_idxs, self.kernel_shape, - self.compute_output_shape(inputs.shape)) - - else: - raise ValueError('Unrecognized implementation mode: %d.' % - self.implementation) - - if self.use_bias: - output = backend.bias_add(output, self.bias, data_format=self.data_format) - - output = self.activation(output) - return output - - def get_config(self): - config = { - 'filters': - self.filters, - 'kernel_size': - self.kernel_size, - 'strides': - self.strides, - 'padding': - self.padding, - 'data_format': - self.data_format, - 'activation': - activations.serialize(self.activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'implementation': - self.implementation - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Locally-connected layer for 2D inputs. + + The `LocallyConnected2D` layer works similarly + to the `Conv2D` layer, except that weights are unshared, + that is, a different set of filters is applied at each + different patch of the input. + + Note: layer attributes cannot be modified after the layer has been called + once (except the `trainable` attribute). 
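Since a built layer cannot be reconfigured in place, the usual workaround is to rebuild it from its config. A minimal sketch (the sizes and the `filters` change below are hypothetical, purely for illustration):

```python
from keras.layers import LocallyConnected2D

layer = LocallyConnected2D(64, (3, 3))
layer.build((None, 32, 32, 3))  # attributes are now effectively frozen

# Derive a modified copy from the config instead of mutating `layer`.
config = layer.get_config()
config["filters"] = 32  # hypothetical: the one attribute to change
new_layer = LocallyConnected2D.from_config(config)  # fresh, unbuilt layer
```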
+
+    Examples:
+    ```python
+        # apply a 3x3 unshared weights convolution with 64 output filters
+        # on a 32x32 image with `data_format="channels_last"`:
+        model = Sequential()
+        model.add(LocallyConnected2D(64, (3, 3), input_shape=(32, 32, 3)))
+        # now model.output_shape == (None, 30, 30, 64)
+        # notice that this layer will consume (30*30)*(3*3*3*64) + (30*30)*64
+        # parameters
+
+        # add a 3x3 unshared weights convolution on top, with 32 output
+        # filters:
+        model.add(LocallyConnected2D(32, (3, 3)))
+        # now model.output_shape == (None, 28, 28, 32)
+    ```
+
+    Args:
+        filters: Integer, the dimensionality of the output space (i.e. the
+            number of output filters in the convolution).
+        kernel_size: An integer or tuple/list of 2 integers, specifying the
+            width and height of the 2D convolution window. Can be a single
+            integer to specify the same value for all spatial dimensions.
+        strides: An integer or tuple/list of 2 integers, specifying the
+            strides of the convolution along the width and height. Can be a
+            single integer to specify the same value for all spatial
+            dimensions.
+        padding: Currently only supports `"valid"` (case-insensitive).
+            `"same"` will be supported in the future. `"valid"` means no
+            padding.
+        data_format: A string, one of `channels_last` (default) or
+            `channels_first`. The ordering of the dimensions in the inputs.
+            `channels_last` corresponds to inputs with shape `(batch, height,
+            width, channels)` while `channels_first` corresponds to inputs
+            with shape `(batch, channels, height, width)`. When unspecified,
+            uses the `image_data_format` value found in your Keras config
+            file at `~/.keras/keras.json` (if it exists), else
+            'channels_last'. Defaults to 'channels_last'.
+        activation: Activation function to use. If you don't specify
+            anything, no activation is applied (i.e. "linear" activation:
+            `a(x) = x`).
+        use_bias: Boolean, whether the layer uses a bias vector.
+        kernel_initializer: Initializer for the `kernel` weights matrix.
+        bias_initializer: Initializer for the bias vector.
+        kernel_regularizer: Regularizer function applied to the `kernel`
+            weights matrix.
+        bias_regularizer: Regularizer function applied to the bias vector.
+        activity_regularizer: Regularizer function applied to the output of
+            the layer (its "activation").
+        kernel_constraint: Constraint function applied to the kernel matrix.
+        bias_constraint: Constraint function applied to the bias vector.
+        implementation: implementation mode, either `1`, `2`, or `3`. `1`
+            loops over input spatial locations to perform the forward pass.
+            It is memory-efficient but performs a lot of (small) ops. `2`
+            stores layer weights in a dense but sparsely-populated 2D matrix
+            and implements the forward pass as a single matrix-multiply. It
+            uses a lot of RAM but performs few (large) ops. `3` stores layer
+            weights in a sparse tensor and implements the forward pass as a
+            single sparse matrix-multiply.
+            How to choose:
+            `1`: large, dense models,
+            `2`: small models,
+            `3`: large, sparse models, where "large" stands for large
+                input/output activations (i.e. many `filters`,
+                `input_filters`, large `np.prod(input_size)`,
+                `np.prod(output_size)`), and "sparse" stands for few
+                connections between inputs and outputs, i.e. a small ratio
+                `filters * input_filters * np.prod(kernel_size) /
+                (np.prod(input_size) * np.prod(strides))`, where inputs to
+                and outputs of the layer are assumed to have shapes
+                `input_size + (input_filters,)`, `output_size + (filters,)`
+                respectively.
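The ratio heuristic above is cheap to evaluate up front. Below is a small sketch (all sizes hypothetical) that computes it for one candidate configuration; benchmarking, as recommended next, remains the deciding test:

```python
import numpy as np

# Hypothetical layer configuration, not defaults of the layer.
input_size = (32, 32)  # spatial shape of the input
input_filters = 3      # channels going in
filters = 64           # channels coming out
kernel_size = (3, 3)
strides = (1, 1)

# A small ratio means sparse connectivity, favoring `implementation=3`
# per the guidance above; large, dense settings favor `implementation=1`.
ratio = (filters * input_filters * np.prod(kernel_size)) / (
    np.prod(input_size) * np.prod(strides)
)
print(f"connectivity ratio: {ratio}")  # 1.6875 for these numbers
```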
+            It is recommended to benchmark each in the setting of interest
+            to pick the most efficient one (in terms of speed and memory
+            usage). Correct choice of implementation can lead to dramatic
+            speed improvements (e.g. 50X), potentially at the expense of
+            RAM. Also, only `padding="valid"` is supported by
+            `implementation=1`.
+    Input shape:
+        4D tensor with shape: `(samples, channels, rows, cols)` if
+            data_format='channels_first'
+        or 4D tensor with shape: `(samples, rows, cols, channels)` if
+            data_format='channels_last'.
+    Output shape:
+        4D tensor with shape: `(samples, filters, new_rows, new_cols)` if
+            data_format='channels_first'
+        or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if
+            data_format='channels_last'. `rows` and `cols` values might have
+            changed due to padding.
+    """
+
+    def __init__(
+        self,
+        filters,
+        kernel_size,
+        strides=(1, 1),
+        padding="valid",
+        data_format=None,
+        activation=None,
+        use_bias=True,
+        kernel_initializer="glorot_uniform",
+        bias_initializer="zeros",
+        kernel_regularizer=None,
+        bias_regularizer=None,
+        activity_regularizer=None,
+        kernel_constraint=None,
+        bias_constraint=None,
+        implementation=1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.filters = filters
+        self.kernel_size = conv_utils.normalize_tuple(
+            kernel_size, 2, "kernel_size"
+        )
+        self.strides = conv_utils.normalize_tuple(
+            strides, 2, "strides", allow_zero=True
+        )
+        self.padding = conv_utils.normalize_padding(padding)
+        if self.padding != "valid" and implementation == 1:
+            raise ValueError(
+                "Invalid border mode for LocallyConnected2D "
+                '(only "valid" is supported if implementation is 1): '
+                + padding
+            )
+        self.data_format = conv_utils.normalize_data_format(data_format)
+        self.activation = activations.get(activation)
+        self.use_bias = use_bias
+        self.kernel_initializer = initializers.get(kernel_initializer)
+        self.bias_initializer = initializers.get(bias_initializer)
+        self.kernel_regularizer = regularizers.get(kernel_regularizer)
+        self.bias_regularizer = regularizers.get(bias_regularizer)
+        self.activity_regularizer = regularizers.get(activity_regularizer)
+        self.kernel_constraint = constraints.get(kernel_constraint)
+        self.bias_constraint = constraints.get(bias_constraint)
+        self.implementation = implementation
+        self.input_spec = InputSpec(ndim=4)
+
+    @property
+    def _use_input_spec_as_call_signature(self):
+        return False
+
+    @tf_utils.shape_type_conversion
+    def build(self, input_shape):
+        if self.data_format == "channels_last":
+            input_row, input_col = input_shape[1:-1]
+            input_filter = input_shape[3]
+        else:
+            input_row, input_col = input_shape[2:]
+            input_filter = input_shape[1]
+        if input_row is None or input_col is None:
+            raise ValueError(
+                "The spatial dimensions of the inputs to "
+                "a LocallyConnected2D layer "
+                "should be fully-defined, but layer received "
+                "the inputs shape " + str(input_shape)
+            )
+        output_row = conv_utils.conv_output_length(
+            input_row, self.kernel_size[0], self.padding, self.strides[0]
+        )
+        output_col = conv_utils.conv_output_length(
+            input_col, self.kernel_size[1], self.padding, self.strides[1]
+        )
+        self.output_row = output_row
+        self.output_col = output_col
+
+        if self.output_row <= 0 or self.output_col <= 0:
+            raise ValueError(
+                "One of the dimensions in the output is <= 0 "
+                f"due to downsampling in {self.name}. Consider "
+                "increasing the input size. "
+                f"Received input shape {input_shape} which would produce "
+                "output shape with a zero or negative value in a "
+                "dimension."
+ ) + + if self.implementation == 1: + self.kernel_shape = ( + output_row * output_col, + self.kernel_size[0] * self.kernel_size[1] * input_filter, + self.filters, + ) + + self.kernel = self.add_weight( + shape=self.kernel_shape, + initializer=self.kernel_initializer, + name="kernel", + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + elif self.implementation == 2: + if self.data_format == "channels_first": + self.kernel_shape = ( + input_filter, + input_row, + input_col, + self.filters, + self.output_row, + self.output_col, + ) + else: + self.kernel_shape = ( + input_row, + input_col, + input_filter, + self.output_row, + self.output_col, + self.filters, + ) + + self.kernel = self.add_weight( + shape=self.kernel_shape, + initializer=self.kernel_initializer, + name="kernel", + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + self.kernel_mask = ( + locally_connected_utils.get_locallyconnected_mask( + input_shape=(input_row, input_col), + kernel_shape=self.kernel_size, + strides=self.strides, + padding=self.padding, + data_format=self.data_format, + ) + ) + + elif self.implementation == 3: + self.kernel_shape = ( + self.output_row * self.output_col * self.filters, + input_row * input_col * input_filter, + ) + + self.kernel_idxs = sorted( + conv_utils.conv_kernel_idxs( + input_shape=(input_row, input_col), + kernel_shape=self.kernel_size, + strides=self.strides, + padding=self.padding, + filters_in=input_filter, + filters_out=self.filters, + data_format=self.data_format, + ) + ) + + self.kernel = self.add_weight( + shape=(len(self.kernel_idxs),), + initializer=self.kernel_initializer, + name="kernel", + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + else: + raise ValueError( + "Unrecognized implementation mode: %d." 
% self.implementation + ) + + if self.use_bias: + self.bias = self.add_weight( + shape=(output_row, output_col, self.filters), + initializer=self.bias_initializer, + name="bias", + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + ) + else: + self.bias = None + if self.data_format == "channels_first": + self.input_spec = InputSpec(ndim=4, axes={1: input_filter}) + else: + self.input_spec = InputSpec(ndim=4, axes={-1: input_filter}) + self.built = True + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if self.data_format == "channels_first": + rows = input_shape[2] + cols = input_shape[3] + elif self.data_format == "channels_last": + rows = input_shape[1] + cols = input_shape[2] + + rows = conv_utils.conv_output_length( + rows, self.kernel_size[0], self.padding, self.strides[0] + ) + cols = conv_utils.conv_output_length( + cols, self.kernel_size[1], self.padding, self.strides[1] + ) + + if self.data_format == "channels_first": + return (input_shape[0], self.filters, rows, cols) + elif self.data_format == "channels_last": + return (input_shape[0], rows, cols, self.filters) + + def call(self, inputs): + if self.implementation == 1: + output = backend.local_conv( + inputs, + self.kernel, + self.kernel_size, + self.strides, + (self.output_row, self.output_col), + self.data_format, + ) + + elif self.implementation == 2: + output = locally_connected_utils.local_conv_matmul( + inputs, + self.kernel, + self.kernel_mask, + self.compute_output_shape(inputs.shape), + ) + + elif self.implementation == 3: + output = locally_connected_utils.local_conv_sparse_matmul( + inputs, + self.kernel, + self.kernel_idxs, + self.kernel_shape, + self.compute_output_shape(inputs.shape), + ) + + else: + raise ValueError( + "Unrecognized implementation mode: %d." 
% self.implementation + ) + + if self.use_bias: + output = backend.bias_add( + output, self.bias, data_format=self.data_format + ) + + output = self.activation(output) + return output + + def get_config(self): + config = { + "filters": self.filters, + "kernel_size": self.kernel_size, + "strides": self.strides, + "padding": self.padding, + "data_format": self.data_format, + "activation": activations.serialize(self.activation), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "bias_constraint": constraints.serialize(self.bias_constraint), + "implementation": self.implementation, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/locally_connected/locally_connected_test.py b/keras/layers/locally_connected/locally_connected_test.py index 9bc2bcdbd111..bb85dee7410b 100644 --- a/keras/layers/locally_connected/locally_connected_test.py +++ b/keras/layers/locally_connected/locally_connected_test.py @@ -17,708 +17,734 @@ import os +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers.locally_connected import locally_connected_utils -from keras.optimizers.optimizer_v2 import rmsprop +from keras.optimizers.legacy import rmsprop from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_util -from tensorflow.python.training.rmsprop import RMSPropOptimizer - - -_DATA_FORMAT_PADDING_IMPLEMENTATION = [{ - 'data_format': 'channels_first', - 'padding': 'valid', - 'implementation': 1 -}, { - 'data_format': 'channels_first', - 'padding': 'same', - 'implementation': 1 -}, { - 'data_format': 'channels_last', - 'padding': 'valid', - 'implementation': 1 -}, { - 'data_format': 'channels_last', - 'padding': 'same', - 'implementation': 1 -}, { - 'data_format': 'channels_first', - 'padding': 'valid', - 'implementation': 2 -}, { - 'data_format': 'channels_first', - 'padding': 'same', - 'implementation': 2 -}, { - 'data_format': 'channels_last', - 'padding': 'valid', - 'implementation': 2 -}, { - 'data_format': 'channels_last', - 'padding': 'same', - 'implementation': 2 -}, { - 'data_format': 'channels_first', - 'padding': 'valid', - 'implementation': 3 -}, { - 'data_format': 'channels_first', - 'padding': 'same', - 'implementation': 3 -}, { - 'data_format': 'channels_last', - 'padding': 'valid', - 'implementation': 3 -}, { - 'data_format': 'channels_last', - 'padding': 'same', - 'implementation': 3 -}] - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_util, +) +from tensorflow.python.training.rmsprop import ( + RMSPropOptimizer, +) + +_DATA_FORMAT_PADDING_IMPLEMENTATION = [ + {"data_format": "channels_first", "padding": "valid", "implementation": 1}, + {"data_format": "channels_first", "padding": "same", "implementation": 1}, + {"data_format": "channels_last", "padding": "valid", 
"implementation": 1}, + {"data_format": "channels_last", "padding": "same", "implementation": 1}, + {"data_format": "channels_first", "padding": "valid", "implementation": 2}, + {"data_format": "channels_first", "padding": "same", "implementation": 2}, + {"data_format": "channels_last", "padding": "valid", "implementation": 2}, + {"data_format": "channels_last", "padding": "same", "implementation": 2}, + {"data_format": "channels_first", "padding": "valid", "implementation": 3}, + {"data_format": "channels_first", "padding": "same", "implementation": 3}, + {"data_format": "channels_last", "padding": "valid", "implementation": 3}, + {"data_format": "channels_last", "padding": "same", "implementation": 3}, +] + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LocallyConnected1DLayersTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) - def test_locallyconnected_1d(self, data_format, padding, implementation): - with self.cached_session(): - num_samples = 2 - num_steps = 8 - input_dim = 5 - filter_length = 3 - filters = 4 - - for strides in [1]: - if padding == 'same' and strides != 1: - continue + @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) + def test_locallyconnected_1d(self, data_format, padding, implementation): + with self.cached_session(): + num_samples = 2 + num_steps = 8 + input_dim = 5 + filter_length = 3 + filters = 4 + + for strides in [1]: + if padding == "same" and strides != 1: + continue + kwargs = { + "filters": filters, + "kernel_size": filter_length, + "padding": padding, + "strides": strides, + "data_format": data_format, + "implementation": implementation, + } + + if padding == "same" and implementation == 1: + self.assertRaises( + ValueError, keras.layers.LocallyConnected1D, **kwargs + ) + else: + test_utils.layer_test( + keras.layers.LocallyConnected1D, + kwargs=kwargs, + input_shape=(num_samples, num_steps, input_dim), + ) + + @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) + def test_locallyconnected_1d_regularization( + self, data_format, padding, implementation + ): + num_samples = 2 + num_steps = 8 + input_dim = 5 + filter_length = 3 + filters = 4 kwargs = { - 'filters': filters, - 'kernel_size': filter_length, - 'padding': padding, - 'strides': strides, - 'data_format': data_format, - 'implementation': implementation + "filters": filters, + "kernel_size": filter_length, + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "data_format": data_format, + "implementation": implementation, + "padding": padding, } - if padding == 'same' and implementation == 1: - self.assertRaises(ValueError, keras.layers.LocallyConnected1D, - **kwargs) + if padding == "same" and implementation == 1: + self.assertRaises( + ValueError, keras.layers.LocallyConnected1D, **kwargs + ) else: - test_utils.layer_test( - keras.layers.LocallyConnected1D, - kwargs=kwargs, - input_shape=(num_samples, num_steps, input_dim)) - - @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) - def test_locallyconnected_1d_regularization(self, data_format, padding, - implementation): - num_samples = 2 - num_steps = 8 - input_dim = 5 - filter_length = 3 - filters = 4 - kwargs = { - 'filters': filters, - 'kernel_size': filter_length, - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'data_format': data_format, - 'implementation': implementation, - 'padding': padding - } - - if padding == 
'same' and implementation == 1: - self.assertRaises(ValueError, keras.layers.LocallyConnected1D, **kwargs) - else: - with self.cached_session(): - layer = keras.layers.LocallyConnected1D(**kwargs) - layer.build((num_samples, num_steps, input_dim)) - self.assertLen(layer.losses, 2) - layer( - keras.backend.variable( - np.ones((num_samples, num_steps, input_dim)))) - self.assertLen(layer.losses, 3) - - k_constraint = keras.constraints.max_norm(0.01) - b_constraint = keras.constraints.max_norm(0.01) - kwargs = { - 'filters': filters, - 'kernel_size': filter_length, - 'kernel_constraint': k_constraint, - 'bias_constraint': b_constraint, - } - with self.cached_session(): - layer = keras.layers.LocallyConnected1D(**kwargs) - layer.build((num_samples, num_steps, input_dim)) - self.assertEqual(layer.kernel.constraint, k_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) - - def test_locallyconnected1d_invalid_output_shapes(self): - kwargs = {'filters': 2, 'kernel_size': 10} - with self.assertRaisesRegex( - ValueError, r"""One of the dimensions in the output is <= 0 """): - layer = keras.layers.LocallyConnected1D(**kwargs) - layer.build((None, 5, 2)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + with self.cached_session(): + layer = keras.layers.LocallyConnected1D(**kwargs) + layer.build((num_samples, num_steps, input_dim)) + self.assertLen(layer.losses, 2) + layer( + keras.backend.variable( + np.ones((num_samples, num_steps, input_dim)) + ) + ) + self.assertLen(layer.losses, 3) + + k_constraint = keras.constraints.max_norm(0.01) + b_constraint = keras.constraints.max_norm(0.01) + kwargs = { + "filters": filters, + "kernel_size": filter_length, + "kernel_constraint": k_constraint, + "bias_constraint": b_constraint, + } + with self.cached_session(): + layer = keras.layers.LocallyConnected1D(**kwargs) + layer.build((num_samples, num_steps, input_dim)) + self.assertEqual(layer.kernel.constraint, k_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) + + def test_locallyconnected1d_invalid_output_shapes(self): + kwargs = {"filters": 2, "kernel_size": 10} + with self.assertRaisesRegex( + ValueError, r"""One of the dimensions in the output is <= 0 """ + ): + layer = keras.layers.LocallyConnected1D(**kwargs) + layer.build((None, 5, 2)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LocallyConnected2DLayersTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) - def test_locallyconnected_2d(self, data_format, padding, implementation): - with self.cached_session(): - num_samples = 8 - filters = 3 - stack_size = 4 - num_row = 6 - num_col = 10 - - for strides in [(1, 1), (2, 2)]: - if padding == 'same' and strides != (1, 1): - continue - + @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) + def test_locallyconnected_2d(self, data_format, padding, implementation): + with self.cached_session(): + num_samples = 8 + filters = 3 + stack_size = 4 + num_row = 6 + num_col = 10 + + for strides in [(1, 1), (2, 2)]: + if padding == "same" and strides != (1, 1): + continue + + kwargs = { + "filters": filters, + "kernel_size": 3, + "padding": padding, + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "strides": strides, + "data_format": data_format, + "implementation": implementation, + } + + if padding == "same" and implementation == 1: + self.assertRaises( + ValueError, keras.layers.LocallyConnected2D, **kwargs + ) + else: 
+ test_utils.layer_test( + keras.layers.LocallyConnected2D, + kwargs=kwargs, + input_shape=(num_samples, num_row, num_col, stack_size), + ) + + @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) + def test_locallyconnected_2d_channels_first( + self, data_format, padding, implementation + ): + with self.cached_session(): + num_samples = 8 + filters = 3 + stack_size = 4 + num_row = 6 + num_col = 10 + kwargs = { + "filters": filters, + "kernel_size": 3, + "data_format": data_format, + "implementation": implementation, + "padding": padding, + } + + if padding == "same" and implementation == 1: + self.assertRaises( + ValueError, keras.layers.LocallyConnected2D, **kwargs + ) + else: + test_utils.layer_test( + keras.layers.LocallyConnected2D, + kwargs=kwargs, + input_shape=(num_samples, num_row, num_col, stack_size), + ) + + @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) + def test_locallyconnected_2d_regularization( + self, data_format, padding, implementation + ): + num_samples = 2 + filters = 3 + stack_size = 4 + num_row = 6 + num_col = 7 kwargs = { - 'filters': filters, - 'kernel_size': 3, - 'padding': padding, - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'strides': strides, - 'data_format': data_format, - 'implementation': implementation + "filters": filters, + "kernel_size": 3, + "kernel_regularizer": "l2", + "bias_regularizer": "l2", + "activity_regularizer": "l2", + "implementation": implementation, + "padding": padding, + "data_format": data_format, } - if padding == 'same' and implementation == 1: - self.assertRaises(ValueError, keras.layers.LocallyConnected2D, - **kwargs) + if padding == "same" and implementation == 1: + self.assertRaises( + ValueError, keras.layers.LocallyConnected2D, **kwargs + ) else: - test_utils.layer_test( - keras.layers.LocallyConnected2D, - kwargs=kwargs, - input_shape=(num_samples, num_row, num_col, stack_size)) - - @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) - def test_locallyconnected_2d_channels_first(self, data_format, padding, - implementation): - with self.cached_session(): - num_samples = 8 - filters = 3 - stack_size = 4 - num_row = 6 - num_col = 10 - kwargs = { - 'filters': filters, - 'kernel_size': 3, - 'data_format': data_format, - 'implementation': implementation, - 'padding': padding - } - - if padding == 'same' and implementation == 1: - self.assertRaises(ValueError, keras.layers.LocallyConnected2D, **kwargs) - else: - test_utils.layer_test( - keras.layers.LocallyConnected2D, - kwargs=kwargs, - input_shape=(num_samples, num_row, num_col, stack_size)) - - @parameterized.parameters(_DATA_FORMAT_PADDING_IMPLEMENTATION) - def test_locallyconnected_2d_regularization(self, data_format, padding, - implementation): - num_samples = 2 - filters = 3 - stack_size = 4 - num_row = 6 - num_col = 7 - kwargs = { - 'filters': filters, - 'kernel_size': 3, - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'implementation': implementation, - 'padding': padding, - 'data_format': data_format - } - - if padding == 'same' and implementation == 1: - self.assertRaises(ValueError, keras.layers.LocallyConnected2D, **kwargs) - else: - with self.cached_session(): - layer = keras.layers.LocallyConnected2D(**kwargs) - layer.build((num_samples, num_row, num_col, stack_size)) - self.assertLen(layer.losses, 2) - layer( - keras.backend.variable( - np.ones((num_samples, num_row, num_col, stack_size)))) - self.assertLen(layer.losses, 3) - - k_constraint = 
keras.constraints.max_norm(0.01) - b_constraint = keras.constraints.max_norm(0.01) - kwargs = { - 'filters': filters, - 'kernel_size': 3, - 'kernel_constraint': k_constraint, - 'bias_constraint': b_constraint, - } - with self.cached_session(): - layer = keras.layers.LocallyConnected2D(**kwargs) - layer.build((num_samples, num_row, num_col, stack_size)) - self.assertEqual(layer.kernel.constraint, k_constraint) - self.assertEqual(layer.bias.constraint, b_constraint) - - def test_locallyconnected2d_invalid_output_shapes(self): - kwargs = {'filters': 2, 'kernel_size': 10} - with self.assertRaisesRegex( - ValueError, r"""One of the dimensions in the output is <= 0 """): - layer = keras.layers.LocallyConnected2D(**kwargs) - layer.build((None, 5, 5, 2)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class LocallyConnectedImplementationModeTest(tf.test.TestCase, - parameterized.TestCase): - - @parameterized.parameters([ - {'width': 1, 'data_format': 'channels_first'}, - {'width': 1, 'data_format': 'channels_last'}, - {'width': 6, 'data_format': 'channels_first'}, - {'width': 6, 'data_format': 'channels_last'}, - ]) - def test_locallyconnected_implementation(self, width, data_format): - with self.cached_session(): - num_samples = 4 - num_classes = 3 - num_epochs = 2 - - np.random.seed(1) - tf_test_util.random_seed.set_seed(1) - # Following code generates sparse targets and converts them - # to one-hot encoded vectors - # Create sparse targets eg. [0,1,2] - sparse_targets = np.random.randint(0, num_classes, (num_samples,)) - - # Convert to one-hot encoding - # Final targets: - # [[ 1. 0. 0. ] - # [ 0. 1. 0. ] - # [ 0. 0. 1. ]] - - targets = np.zeros((sparse_targets.size, num_classes)) - targets[np.arange(sparse_targets.size), sparse_targets] = 1 - height = 7 - filters = 2 - inputs = get_inputs(data_format, filters, height, num_samples, width) - - kernel_x = (3,) - kernel_y = () if width == 1 else (2,) - stride_x = (1,) - stride_y = () if width == 1 else (3,) - layers = 2 - - kwargs = { - 'layers': layers, - 'filters': filters, - 'kernel_size': kernel_x + kernel_y, - 'strides': stride_x + stride_y, - 'data_format': data_format, - 'num_classes': num_classes - } - - model_1 = get_model(implementation=1, **kwargs) - model_2 = get_model(implementation=2, **kwargs) - model_3 = get_model(implementation=3, **kwargs) - - # Build models. - model_1.train_on_batch(inputs, targets) - model_2.train_on_batch(inputs, targets) - model_3.train_on_batch(inputs, targets) - - # Copy weights. - copy_model_weights(model_from=model_2, model_to=model_1) - copy_model_weights(model_from=model_2, model_to=model_3) - - # Compare outputs at initialization. - out_1 = model_1(inputs) - out_2 = model_2(inputs) - out_3 = model_3(inputs) - - self.assertAllCloseAccordingToType( - out_2, out_1, rtol=1e-5, atol=1e-5) - self.assertAllCloseAccordingToType( - out_2, out_3, rtol=1e-5, atol=1e-5) - self.assertAllCloseAccordingToType( - out_1, out_3, rtol=1e-5, atol=1e-5) - - # Train. - model_1.fit( - x=inputs, - y=targets, - epochs=num_epochs, - batch_size=num_samples, - shuffle=False) - model_2.fit( - x=inputs, - y=targets, - epochs=num_epochs, - batch_size=num_samples, - shuffle=False) - model_3.fit( - x=inputs, - y=targets, - epochs=num_epochs, - batch_size=num_samples, - shuffle=False) - - # Compare outputs after a few training steps. 
- out_1 = model_1(inputs) - out_2 = model_2(inputs) - out_3 = model_3(inputs) - - self.assertAllCloseAccordingToType( - out_2, out_1, atol=2e-4) - self.assertAllCloseAccordingToType( - out_2, out_3, atol=2e-4) - self.assertAllCloseAccordingToType( - out_1, out_3, atol=2e-4) - - @parameterized.parameters([ - { - 'width': 1, - 'data_format': 'channels_first' - }, - { - 'width': 1, - 'data_format': 'channels_last' - }, - { - 'width': 6, - 'data_format': 'channels_first' - }, - { - 'width': 6, - 'data_format': 'channels_last' - }, - ]) - def test_locallyconnected_save(self, width, data_format): - with self.cached_session(): - num_samples = 4 - num_classes = 3 - num_epochs = 2 - - np.random.seed(1) - tf_test_util.random_seed.set_seed(1) - # Following code generates sparse targets and converts them - # to one-hot encoded vectors - # Create sparse targets eg. [0,1,2] - sparse_targets = np.random.randint(0, num_classes, (num_samples,)) - - # Convert to one-hot encoding - # Final targets: - # [[ 1. 0. 0. ] - # [ 0. 1. 0. ] - # [ 0. 0. 1. ]] - - targets = np.zeros((sparse_targets.size, num_classes)) - targets[np.arange(sparse_targets.size), sparse_targets] = 1 - - height = 7 - filters = 2 - inputs = get_inputs(data_format, filters, height, num_samples, width) - - kernel_x = (3,) - kernel_y = () if width == 1 else (2,) - stride_x = (1,) - stride_y = () if width == 1 else (3,) - layers = 2 - - kwargs = { - 'layers': layers, - 'filters': filters, - 'kernel_size': kernel_x + kernel_y, - 'strides': stride_x + stride_y, - 'data_format': data_format, - 'num_classes': num_classes - } - - model_1 = get_model_saveable(implementation=1, **kwargs) - model_2 = get_model_saveable(implementation=2, **kwargs) - model_3 = get_model_saveable(implementation=3, **kwargs) - - # Train. 
- model_1.fit( - x=inputs, - y=targets, - epochs=num_epochs, - batch_size=num_samples, - shuffle=False) - model_2.fit( - x=inputs, - y=targets, - epochs=num_epochs, - batch_size=num_samples, - shuffle=False) - model_3.fit( - x=inputs, - y=targets, - epochs=num_epochs, - batch_size=num_samples, - shuffle=False) - - out_1_before = model_1(inputs) - out_2_before = model_2(inputs) - out_3_before = model_3(inputs) - - path_1 = os.path.join(self.get_temp_dir(), 'model_1_path') - model_1.save(path_1) - model_1 = keras.models.load_model(path_1, custom_objects={'xent': xent}) - path_2 = os.path.join(self.get_temp_dir(), 'model_2_path') - model_2.save(path_2) - model_2 = keras.models.load_model(path_2, custom_objects={'xent': xent}) - path_3 = os.path.join(self.get_temp_dir(), 'model_3_path') - model_3.save(path_3) - model_3 = keras.models.load_model(path_3, custom_objects={'xent': xent}) - - out_1_after = model_1(inputs) - out_2_after = model_2(inputs) - out_3_after = model_3(inputs) - - self.assertAllCloseAccordingToType(out_1_before, out_1_after, atol=2e-4) - self.assertAllCloseAccordingToType(out_2_before, out_2_after, atol=2e-4) - self.assertAllCloseAccordingToType(out_3_before, out_3_after, atol=2e-4) - - def test_make_2d(self): - input_shapes = [ - (0,), - (0, 0), - (1,), - (2,), - (3,), - (1, 0), - (0, 3), - (1, 1), - (1, 2), - (3, 1), - (2, 2), - (3, 3), - (1, 0, 1), - (5, 2, 3), - (3, 5, 6, 7, 0), - (3, 2, 2, 4, 4), - (1, 2, 3, 4, 7, 2), - ] - np.random.seed(1) - - for input_shape in input_shapes: - inputs = np.random.normal(0, 1, input_shape) - inputs_tf = keras.backend.variable(inputs) - - split_dim = np.random.randint(0, inputs.ndim + 1) - shape_2d = (int(np.prod(inputs.shape[:split_dim])), - int(np.prod(inputs.shape[split_dim:]))) - inputs_2d = np.reshape(inputs, shape_2d) - - inputs_2d_tf = locally_connected_utils.make_2d(inputs_tf, split_dim) - inputs_2d_tf = keras.backend.get_value(inputs_2d_tf) - - self.assertAllCloseAccordingToType(inputs_2d, inputs_2d_tf) + with self.cached_session(): + layer = keras.layers.LocallyConnected2D(**kwargs) + layer.build((num_samples, num_row, num_col, stack_size)) + self.assertLen(layer.losses, 2) + layer( + keras.backend.variable( + np.ones((num_samples, num_row, num_col, stack_size)) + ) + ) + self.assertLen(layer.losses, 3) + + k_constraint = keras.constraints.max_norm(0.01) + b_constraint = keras.constraints.max_norm(0.01) + kwargs = { + "filters": filters, + "kernel_size": 3, + "kernel_constraint": k_constraint, + "bias_constraint": b_constraint, + } + with self.cached_session(): + layer = keras.layers.LocallyConnected2D(**kwargs) + layer.build((num_samples, num_row, num_col, stack_size)) + self.assertEqual(layer.kernel.constraint, k_constraint) + self.assertEqual(layer.bias.constraint, b_constraint) + + def test_locallyconnected2d_invalid_output_shapes(self): + kwargs = {"filters": 2, "kernel_size": 10} + with self.assertRaisesRegex( + ValueError, r"""One of the dimensions in the output is <= 0 """ + ): + layer = keras.layers.LocallyConnected2D(**kwargs) + layer.build((None, 5, 5, 2)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class LocallyConnectedImplementationModeTest( + tf.test.TestCase, parameterized.TestCase +): + @parameterized.parameters( + [ + {"width": 1, "data_format": "channels_first"}, + {"width": 1, "data_format": "channels_last"}, + {"width": 6, "data_format": "channels_first"}, + {"width": 6, "data_format": "channels_last"}, + ] + ) + def test_locallyconnected_implementation(self, 
width, data_format):
+        with self.cached_session():
+            num_samples = 4
+            num_classes = 3
+            num_epochs = 2
+
+            np.random.seed(1)
+            tf_test_util.random_seed.set_seed(1)
+            # The following code generates sparse targets and converts them
+            # to one-hot encoded vectors.
+            # Create sparse targets, e.g. [0, 1, 2]
+            sparse_targets = np.random.randint(0, num_classes, (num_samples,))
+
+            # Convert to one-hot encoding
+            # Final targets:
+            # [[ 1. 0. 0. ]
+            # [ 0. 1. 0. ]
+            # [ 0. 0. 1. ]]
+
+            targets = np.zeros((sparse_targets.size, num_classes))
+            targets[np.arange(sparse_targets.size), sparse_targets] = 1
+            height = 7
+            filters = 2
+            inputs = get_inputs(
+                data_format, filters, height, num_samples, width
+            )
+
+            kernel_x = (3,)
+            kernel_y = () if width == 1 else (2,)
+            stride_x = (1,)
+            stride_y = () if width == 1 else (3,)
+            layers = 2
+
+            kwargs = {
+                "layers": layers,
+                "filters": filters,
+                "kernel_size": kernel_x + kernel_y,
+                "strides": stride_x + stride_y,
+                "data_format": data_format,
+                "num_classes": num_classes,
+            }
+
+            model_1 = get_model(implementation=1, **kwargs)
+            model_2 = get_model(implementation=2, **kwargs)
+            model_3 = get_model(implementation=3, **kwargs)
+
+            # Build models.
+            model_1.train_on_batch(inputs, targets)
+            model_2.train_on_batch(inputs, targets)
+            model_3.train_on_batch(inputs, targets)
+
+            # Copy weights.
+            copy_model_weights(model_from=model_2, model_to=model_1)
+            copy_model_weights(model_from=model_2, model_to=model_3)
+
+            # Compare outputs at initialization.
+            out_1 = model_1(inputs)
+            out_2 = model_2(inputs)
+            out_3 = model_3(inputs)
+
+            self.assertAllCloseAccordingToType(
+                out_2, out_1, rtol=1e-5, atol=1e-5
+            )
+            self.assertAllCloseAccordingToType(
+                out_2, out_3, rtol=1e-5, atol=1e-5
+            )
+            self.assertAllCloseAccordingToType(
+                out_1, out_3, rtol=1e-5, atol=1e-5
+            )
+
+            # Train.
+            model_1.fit(
+                x=inputs,
+                y=targets,
+                epochs=num_epochs,
+                batch_size=num_samples,
+                shuffle=False,
+            )
+            model_2.fit(
+                x=inputs,
+                y=targets,
+                epochs=num_epochs,
+                batch_size=num_samples,
+                shuffle=False,
+            )
+            model_3.fit(
+                x=inputs,
+                y=targets,
+                epochs=num_epochs,
+                batch_size=num_samples,
+                shuffle=False,
+            )
+
+            # Compare outputs after a few training steps.
+            out_1 = model_1(inputs)
+            out_2 = model_2(inputs)
+            out_3 = model_3(inputs)
+
+            self.assertAllCloseAccordingToType(out_2, out_1, atol=2e-4)
+            self.assertAllCloseAccordingToType(out_2, out_3, atol=2e-4)
+            self.assertAllCloseAccordingToType(out_1, out_3, atol=2e-4)
+
+    @parameterized.parameters(
+        [
+            {"width": 1, "data_format": "channels_first"},
+            {"width": 1, "data_format": "channels_last"},
+            {"width": 6, "data_format": "channels_first"},
+            {"width": 6, "data_format": "channels_last"},
+        ]
+    )
+    def test_locallyconnected_save(self, width, data_format):
+        with self.cached_session():
+            num_samples = 4
+            num_classes = 3
+            num_epochs = 2
+
+            np.random.seed(1)
+            tf_test_util.random_seed.set_seed(1)
+            # The following code generates sparse targets and converts them
+            # to one-hot encoded vectors.
+            # Create sparse targets, e.g. [0, 1, 2]
+            sparse_targets = np.random.randint(0, num_classes, (num_samples,))
+
+            # Convert to one-hot encoding
+            # Final targets:
+            # [[ 1. 0. 0. ]
+            # [ 0. 1. 0. ]
+            # [ 0. 0. 1.
]] + + targets = np.zeros((sparse_targets.size, num_classes)) + targets[np.arange(sparse_targets.size), sparse_targets] = 1 + + height = 7 + filters = 2 + inputs = get_inputs( + data_format, filters, height, num_samples, width + ) + + kernel_x = (3,) + kernel_y = () if width == 1 else (2,) + stride_x = (1,) + stride_y = () if width == 1 else (3,) + layers = 2 + + kwargs = { + "layers": layers, + "filters": filters, + "kernel_size": kernel_x + kernel_y, + "strides": stride_x + stride_y, + "data_format": data_format, + "num_classes": num_classes, + } + + model_1 = get_model_saveable(implementation=1, **kwargs) + model_2 = get_model_saveable(implementation=2, **kwargs) + model_3 = get_model_saveable(implementation=3, **kwargs) + + # Train. + model_1.fit( + x=inputs, + y=targets, + epochs=num_epochs, + batch_size=num_samples, + shuffle=False, + ) + model_2.fit( + x=inputs, + y=targets, + epochs=num_epochs, + batch_size=num_samples, + shuffle=False, + ) + model_3.fit( + x=inputs, + y=targets, + epochs=num_epochs, + batch_size=num_samples, + shuffle=False, + ) + + out_1_before = model_1(inputs) + out_2_before = model_2(inputs) + out_3_before = model_3(inputs) + + path_1 = os.path.join(self.get_temp_dir(), "model_1_path") + model_1.save(path_1) + model_1 = keras.models.load_model( + path_1, custom_objects={"xent": xent} + ) + path_2 = os.path.join(self.get_temp_dir(), "model_2_path") + model_2.save(path_2) + model_2 = keras.models.load_model( + path_2, custom_objects={"xent": xent} + ) + path_3 = os.path.join(self.get_temp_dir(), "model_3_path") + model_3.save(path_3) + model_3 = keras.models.load_model( + path_3, custom_objects={"xent": xent} + ) + + out_1_after = model_1(inputs) + out_2_after = model_2(inputs) + out_3_after = model_3(inputs) + + self.assertAllCloseAccordingToType( + out_1_before, out_1_after, atol=2e-4 + ) + self.assertAllCloseAccordingToType( + out_2_before, out_2_after, atol=2e-4 + ) + self.assertAllCloseAccordingToType( + out_3_before, out_3_after, atol=2e-4 + ) + + def test_make_2d(self): + input_shapes = [ + (0,), + (0, 0), + (1,), + (2,), + (3,), + (1, 0), + (0, 3), + (1, 1), + (1, 2), + (3, 1), + (2, 2), + (3, 3), + (1, 0, 1), + (5, 2, 3), + (3, 5, 6, 7, 0), + (3, 2, 2, 4, 4), + (1, 2, 3, 4, 7, 2), + ] + np.random.seed(1) + + for input_shape in input_shapes: + inputs = np.random.normal(0, 1, input_shape) + inputs_tf = keras.backend.variable(inputs) + + split_dim = np.random.randint(0, inputs.ndim + 1) + shape_2d = ( + int(np.prod(inputs.shape[:split_dim])), + int(np.prod(inputs.shape[split_dim:])), + ) + inputs_2d = np.reshape(inputs, shape_2d) + + inputs_2d_tf = locally_connected_utils.make_2d(inputs_tf, split_dim) + inputs_2d_tf = keras.backend.get_value(inputs_2d_tf) + + self.assertAllCloseAccordingToType(inputs_2d, inputs_2d_tf) def get_inputs(data_format, filters, height, num_samples, width): - if data_format == 'channels_first': - if width == 1: - input_shape = (filters, height) - else: - input_shape = (filters, height, width) + if data_format == "channels_first": + if width == 1: + input_shape = (filters, height) + else: + input_shape = (filters, height, width) - elif data_format == 'channels_last': - if width == 1: - input_shape = (height, filters) - else: - input_shape = (height, width, filters) + elif data_format == "channels_last": + if width == 1: + input_shape = (height, filters) + else: + input_shape = (height, width, filters) - else: - raise NotImplementedError(data_format) + else: + raise NotImplementedError(data_format) - inputs = np.random.normal(0, 1, 
- (num_samples,) + input_shape).astype(np.float32) - return inputs + inputs = np.random.normal(0, 1, (num_samples,) + input_shape).astype( + np.float32 + ) + return inputs def xent(y_true, y_pred): - y_true = keras.backend.cast( - keras.backend.reshape(y_true, (-1,)), - tf.int32) - - return tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits( - labels=y_true, - logits=y_pred) - - -def get_model(implementation, - filters, - kernel_size, - strides, - layers, - num_classes, - data_format): - model = keras.Sequential() - - if len(kernel_size) == 1: - lc_layer = keras.layers.LocallyConnected1D - elif len(kernel_size) == 2: - lc_layer = keras.layers.LocallyConnected2D - else: - raise NotImplementedError(kernel_size) - - for _ in range(layers): - model.add(lc_layer( - padding='valid', - kernel_initializer=keras.initializers.random_normal(), - bias_initializer=keras.initializers.random_normal(), - filters=filters, - strides=strides, - kernel_size=kernel_size, - activation=keras.activations.relu, - data_format=data_format, - implementation=implementation)) - - model.add(keras.layers.Flatten()) - model.add(keras.layers.Dense(num_classes)) - model.compile( - optimizer=RMSPropOptimizer(0.01), - metrics=[keras.metrics.categorical_accuracy], - loss=keras.losses.CategoricalCrossentropy(from_logits=True)) - return model - - -def get_model_saveable(implementation, filters, kernel_size, strides, layers, - num_classes, data_format): - model = keras.Sequential() - - if len(kernel_size) == 1: - lc_layer = keras.layers.LocallyConnected1D - elif len(kernel_size) == 2: - lc_layer = keras.layers.LocallyConnected2D - else: - raise NotImplementedError(kernel_size) - - for _ in range(layers): - model.add( - lc_layer( - padding='valid', - kernel_initializer=keras.initializers.random_normal(), - bias_initializer=keras.initializers.random_normal(), - filters=filters, - strides=strides, - kernel_size=kernel_size, - activation=keras.activations.relu, - data_format=data_format, - implementation=implementation)) - - model.add(keras.layers.Flatten()) - model.add(keras.layers.Dense(num_classes)) - model.compile( - optimizer=rmsprop.RMSProp(learning_rate=0.01), - metrics=[keras.metrics.categorical_accuracy], - loss=keras.losses.CategoricalCrossentropy(from_logits=True)) - return model + y_true = keras.backend.cast(keras.backend.reshape(y_true, (-1,)), tf.int32) + + return tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits( + labels=y_true, logits=y_pred + ) + + +def get_model( + implementation, + filters, + kernel_size, + strides, + layers, + num_classes, + data_format, +): + model = keras.Sequential() + + if len(kernel_size) == 1: + lc_layer = keras.layers.LocallyConnected1D + elif len(kernel_size) == 2: + lc_layer = keras.layers.LocallyConnected2D + else: + raise NotImplementedError(kernel_size) + + for _ in range(layers): + model.add( + lc_layer( + padding="valid", + kernel_initializer=keras.initializers.random_normal(), + bias_initializer=keras.initializers.random_normal(), + filters=filters, + strides=strides, + kernel_size=kernel_size, + activation=keras.activations.relu, + data_format=data_format, + implementation=implementation, + ) + ) + + model.add(keras.layers.Flatten()) + model.add(keras.layers.Dense(num_classes)) + model.compile( + optimizer=RMSPropOptimizer(0.01), + metrics=[keras.metrics.categorical_accuracy], + loss=keras.losses.CategoricalCrossentropy(from_logits=True), + ) + return model + + +def get_model_saveable( + implementation, + filters, + kernel_size, + strides, + layers, + num_classes, + 
data_format, +): + model = keras.Sequential() + + if len(kernel_size) == 1: + lc_layer = keras.layers.LocallyConnected1D + elif len(kernel_size) == 2: + lc_layer = keras.layers.LocallyConnected2D + else: + raise NotImplementedError(kernel_size) + + for _ in range(layers): + model.add( + lc_layer( + padding="valid", + kernel_initializer=keras.initializers.random_normal(), + bias_initializer=keras.initializers.random_normal(), + filters=filters, + strides=strides, + kernel_size=kernel_size, + activation=keras.activations.relu, + data_format=data_format, + implementation=implementation, + ) + ) + + model.add(keras.layers.Flatten()) + model.add(keras.layers.Dense(num_classes)) + model.compile( + optimizer=rmsprop.RMSProp(learning_rate=0.01), + metrics=[keras.metrics.categorical_accuracy], + loss=keras.losses.CategoricalCrossentropy(from_logits=True), + ) + return model def copy_lc_weights_2_to_1(lc_layer_2_from, lc_layer_1_to): - lc_2_kernel, lc_2_bias = lc_layer_2_from.weights - lc_2_kernel_masked = lc_2_kernel * lc_layer_2_from.kernel_mask + lc_2_kernel, lc_2_bias = lc_layer_2_from.weights + lc_2_kernel_masked = lc_2_kernel * lc_layer_2_from.kernel_mask - data_format = lc_layer_2_from.data_format + data_format = lc_layer_2_from.data_format - if data_format == 'channels_first': - if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D): - permutation = (3, 0, 1, 2) - elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D): - permutation = (4, 5, 0, 1, 2, 3) - else: - raise NotImplementedError(lc_layer_2_from) + if data_format == "channels_first": + if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D): + permutation = (3, 0, 1, 2) + elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D): + permutation = (4, 5, 0, 1, 2, 3) + else: + raise NotImplementedError(lc_layer_2_from) - elif data_format == 'channels_last': - if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D): - permutation = (2, 0, 1, 3) - elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D): - permutation = (3, 4, 0, 1, 2, 5) - else: - raise NotImplementedError(lc_layer_2_from) + elif data_format == "channels_last": + if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D): + permutation = (2, 0, 1, 3) + elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D): + permutation = (3, 4, 0, 1, 2, 5) + else: + raise NotImplementedError(lc_layer_2_from) - else: - raise NotImplementedError(data_format) + else: + raise NotImplementedError(data_format) - lc_2_kernel_masked = keras.backend.permute_dimensions( - lc_2_kernel_masked, permutation) + lc_2_kernel_masked = keras.backend.permute_dimensions( + lc_2_kernel_masked, permutation + ) - lc_2_kernel_mask = tf.not_equal( - lc_2_kernel_masked, 0) - lc_2_kernel_flat = tf.compat.v1.boolean_mask( - lc_2_kernel_masked, lc_2_kernel_mask) - lc_2_kernel_reshaped = keras.backend.reshape(lc_2_kernel_flat, - lc_layer_1_to.kernel.shape) + lc_2_kernel_mask = tf.not_equal(lc_2_kernel_masked, 0) + lc_2_kernel_flat = tf.compat.v1.boolean_mask( + lc_2_kernel_masked, lc_2_kernel_mask + ) + lc_2_kernel_reshaped = keras.backend.reshape( + lc_2_kernel_flat, lc_layer_1_to.kernel.shape + ) - lc_2_kernel_reshaped = keras.backend.get_value(lc_2_kernel_reshaped) - lc_2_bias = keras.backend.get_value(lc_2_bias) + lc_2_kernel_reshaped = keras.backend.get_value(lc_2_kernel_reshaped) + lc_2_bias = keras.backend.get_value(lc_2_bias) - lc_layer_1_to.set_weights([lc_2_kernel_reshaped, lc_2_bias]) + lc_layer_1_to.set_weights([lc_2_kernel_reshaped, 
lc_2_bias]) def copy_lc_weights_2_to_3(lc_layer_2_from, lc_layer_3_to): - lc_2_kernel, lc_2_bias = lc_layer_2_from.weights - lc_2_kernel_masked = lc_2_kernel * lc_layer_2_from.kernel_mask + lc_2_kernel, lc_2_bias = lc_layer_2_from.weights + lc_2_kernel_masked = lc_2_kernel * lc_layer_2_from.kernel_mask - lc_2_kernel_masked = locally_connected_utils.make_2d( - lc_2_kernel_masked, split_dim=keras.backend.ndim(lc_2_kernel_masked) // 2) - lc_2_kernel_masked = keras.backend.transpose(lc_2_kernel_masked) - lc_2_kernel_mask = tf.not_equal(lc_2_kernel_masked, 0) - lc_2_kernel_flat = tf.compat.v1.boolean_mask( - lc_2_kernel_masked, lc_2_kernel_mask) + lc_2_kernel_masked = locally_connected_utils.make_2d( + lc_2_kernel_masked, + split_dim=keras.backend.ndim(lc_2_kernel_masked) // 2, + ) + lc_2_kernel_masked = keras.backend.transpose(lc_2_kernel_masked) + lc_2_kernel_mask = tf.not_equal(lc_2_kernel_masked, 0) + lc_2_kernel_flat = tf.compat.v1.boolean_mask( + lc_2_kernel_masked, lc_2_kernel_mask + ) - lc_2_kernel_flat = keras.backend.get_value(lc_2_kernel_flat) - lc_2_bias = keras.backend.get_value(lc_2_bias) + lc_2_kernel_flat = keras.backend.get_value(lc_2_kernel_flat) + lc_2_bias = keras.backend.get_value(lc_2_bias) - lc_layer_3_to.set_weights([lc_2_kernel_flat, lc_2_bias]) + lc_layer_3_to.set_weights([lc_2_kernel_flat, lc_2_bias]) def copy_model_weights(model_from, model_to): - for l in range(len(model_from.layers)): - layer_from = model_from.layers[l] - layer_to = model_to.layers[l] - - if (isinstance( - layer_from, - (keras.layers.LocallyConnected2D, keras.layers.LocallyConnected1D)) and - isinstance(layer_to, (keras.layers.LocallyConnected2D, - keras.layers.LocallyConnected1D))): - if layer_from.implementation == 2: - if layer_to.implementation == 1: - copy_lc_weights_2_to_1(layer_from, layer_to) - elif layer_to.implementation == 3: - copy_lc_weights_2_to_3(layer_from, layer_to) - else: - raise NotImplementedError - - else: - raise NotImplementedError + for l in range(len(model_from.layers)): + layer_from = model_from.layers[l] + layer_to = model_to.layers[l] + + if isinstance( + layer_from, + (keras.layers.LocallyConnected2D, keras.layers.LocallyConnected1D), + ) and isinstance( + layer_to, + (keras.layers.LocallyConnected2D, keras.layers.LocallyConnected1D), + ): + if layer_from.implementation == 2: + if layer_to.implementation == 1: + copy_lc_weights_2_to_1(layer_from, layer_to) + elif layer_to.implementation == 3: + copy_lc_weights_2_to_3(layer_from, layer_to) + else: + raise NotImplementedError + + else: + raise NotImplementedError + + elif isinstance(layer_from, keras.layers.Dense): + weights_2, bias_2 = layer_from.weights + weights_2 = keras.backend.get_value(weights_2) + bias_2 = keras.backend.get_value(bias_2) + layer_to.set_weights([weights_2, bias_2]) - elif isinstance(layer_from, keras.layers.Dense): - weights_2, bias_2 = layer_from.weights - weights_2 = keras.backend.get_value(weights_2) - bias_2 = keras.backend.get_value(bias_2) - layer_to.set_weights([weights_2, bias_2]) - - else: - continue + else: + continue -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/locally_connected/locally_connected_utils.py b/keras/layers/locally_connected/locally_connected_utils.py index 435758e7e023..26695a506753 100644 --- a/keras/layers/locally_connected/locally_connected_utils.py +++ b/keras/layers/locally_connected/locally_connected_utils.py @@ -14,180 +14,193 @@ # 
============================================================================== """Private utilities for locally-connected layers.""" -from keras import backend -from keras.utils import conv_utils import numpy as np import tensorflow.compat.v2 as tf +from keras import backend +from keras.utils import conv_utils + -def get_locallyconnected_mask(input_shape, kernel_shape, strides, padding, - data_format): - """Return a mask representing connectivity of a locally-connected operation. - - This method returns a masking numpy array of 0s and 1s (of type `np.float32`) - that, when element-wise multiplied with a fully-connected weight tensor, masks - out the weights between disconnected input-output pairs and thus implements - local connectivity through a sparse fully-connected weight tensor. - - Assume an unshared convolution with given parameters is applied to an input - having N spatial dimensions with `input_shape = (d_in1, ..., d_inN)` - to produce an output with spatial shape `(d_out1, ..., d_outN)` (determined - by layer parameters such as `strides`). - - This method returns a mask which can be broadcast-multiplied (element-wise) - with a 2*(N+1)-D weight matrix (equivalent to a fully-connected layer between - (N+1)-D activations (N spatial + 1 channel dimensions for input and output) - to make it perform an unshared convolution with given `kernel_shape`, - `strides`, `padding` and `data_format`. - - Args: - input_shape: tuple of size N: `(d_in1, ..., d_inN)` spatial shape of the - input. - kernel_shape: tuple of size N, spatial shape of the convolutional kernel / - receptive field. - strides: tuple of size N, strides along each spatial dimension. - padding: type of padding, string `"same"` or `"valid"`. - data_format: a string, `"channels_first"` or `"channels_last"`. - - Returns: - a `np.float32`-type `np.ndarray` of shape - `(1, d_in1, ..., d_inN, 1, d_out1, ..., d_outN)` - if `data_format == `"channels_first"`, or - `(d_in1, ..., d_inN, 1, d_out1, ..., d_outN, 1)` - if `data_format == "channels_last"`. - - Raises: - ValueError: if `data_format` is neither `"channels_first"` nor - `"channels_last"`. - """ - mask = conv_utils.conv_kernel_mask( - input_shape=input_shape, - kernel_shape=kernel_shape, - strides=strides, - padding=padding) - - ndims = int(mask.ndim / 2) - - if data_format == 'channels_first': - mask = np.expand_dims(mask, 0) - mask = np.expand_dims(mask, -ndims - 1) - - elif data_format == 'channels_last': - mask = np.expand_dims(mask, ndims) - mask = np.expand_dims(mask, -1) - - else: - raise ValueError('Unrecognized data_format: ' + str(data_format)) - - return mask +def get_locallyconnected_mask( + input_shape, kernel_shape, strides, padding, data_format +): + """Return a mask representing connectivity of a locally-connected operation. + + This method returns a masking numpy array of 0s and 1s (of type + `np.float32`) that, when element-wise multiplied with a fully-connected + weight tensor, masks out the weights between disconnected input-output pairs + and thus implements local connectivity through a sparse fully-connected + weight tensor. + + Assume an unshared convolution with given parameters is applied to an input + having N spatial dimensions with `input_shape = (d_in1, ..., d_inN)` + to produce an output with spatial shape `(d_out1, ..., d_outN)` (determined + by layer parameters such as `strides`). 
+ + This method returns a mask which can be broadcast-multiplied (element-wise) + with a 2*(N+1)-D weight matrix (equivalent to a fully-connected layer + between (N+1)-D activations (N spatial + 1 channel dimensions for input and + output) to make it perform an unshared convolution with given + `kernel_shape`, `strides`, `padding` and `data_format`. + + Args: + input_shape: tuple of size N: `(d_in1, ..., d_inN)` spatial shape of the + input. + kernel_shape: tuple of size N, spatial shape of the convolutional kernel / + receptive field. + strides: tuple of size N, strides along each spatial dimension. + padding: type of padding, string `"same"` or `"valid"`. + data_format: a string, `"channels_first"` or `"channels_last"`. + + Returns: + a `np.float32`-type `np.ndarray` of shape + `(1, d_in1, ..., d_inN, 1, d_out1, ..., d_outN)` + if `data_format == `"channels_first"`, or + `(d_in1, ..., d_inN, 1, d_out1, ..., d_outN, 1)` + if `data_format == "channels_last"`. + + Raises: + ValueError: if `data_format` is neither `"channels_first"` nor + `"channels_last"`. + """ + mask = conv_utils.conv_kernel_mask( + input_shape=input_shape, + kernel_shape=kernel_shape, + strides=strides, + padding=padding, + ) + + ndims = int(mask.ndim / 2) + + if data_format == "channels_first": + mask = np.expand_dims(mask, 0) + mask = np.expand_dims(mask, -ndims - 1) + + elif data_format == "channels_last": + mask = np.expand_dims(mask, ndims) + mask = np.expand_dims(mask, -1) + + else: + raise ValueError("Unrecognized data_format: " + str(data_format)) + + return mask def local_conv_matmul(inputs, kernel, kernel_mask, output_shape): - """Apply N-D convolution with un-shared weights using a single matmul call. - - This method outputs `inputs . (kernel * kernel_mask)` - (with `.` standing for matrix-multiply and `*` for element-wise multiply) - and requires a precomputed `kernel_mask` to zero-out weights in `kernel` and - hence perform the same operation as a convolution with un-shared - (the remaining entries in `kernel`) weights. It also does the necessary - reshapes to make `inputs` and `kernel` 2-D and `output` (N+2)-D. - - Args: - inputs: (N+2)-D tensor with shape `(batch_size, channels_in, d_in1, ..., - d_inN)` or `(batch_size, d_in1, ..., d_inN, channels_in)`. - kernel: the unshared weights for N-D convolution, - an (N+2)-D tensor of shape: `(d_in1, ..., d_inN, channels_in, d_out2, - ..., d_outN, channels_out)` or `(channels_in, d_in1, ..., d_inN, - channels_out, d_out2, ..., d_outN)`, with the ordering of channels - and spatial dimensions matching that of the input. Each entry is the - weight between a particular input and output location, similarly to - a fully-connected weight matrix. - kernel_mask: a float 0/1 mask tensor of shape: `(d_in1, ..., d_inN, 1, - d_out2, ..., d_outN, 1)` or `(1, d_in1, ..., d_inN, 1, d_out2, ..., - d_outN)`, with the ordering of singleton and spatial dimensions matching - that of the input. Mask represents the connectivity pattern of the layer - and is - precomputed elsewhere based on layer parameters: stride, padding, and - the receptive field shape. - output_shape: a tuple of (N+2) elements representing the output shape: - `(batch_size, channels_out, d_out1, ..., d_outN)` or `(batch_size, - d_out1, ..., d_outN, channels_out)`, with the ordering of channels and - spatial dimensions matching that of the input. - - Returns: - Output (N+2)-D tensor with shape `output_shape`. 
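# Quick shape check for get_locallyconnected_mask (illustrative): a 1-D
# input of length 4 with kernel size 2, stride 1 and "valid" padding yields
# 3 output positions, so under "channels_last" the mask has shape
# (d_in1, 1, d_out1, 1) = (4, 1, 3, 1).
from keras.layers.locally_connected import locally_connected_utils

mask = locally_connected_utils.get_locallyconnected_mask(
    input_shape=(4,),
    kernel_shape=(2,),
    strides=(1,),
    padding="valid",
    data_format="channels_last",
)
print(mask.shape)  # (4, 1, 3, 1)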
- """ - inputs_flat = backend.reshape(inputs, (backend.shape(inputs)[0], -1)) - - kernel = kernel_mask * kernel - kernel = make_2d(kernel, split_dim=backend.ndim(kernel) // 2) - - output_flat = tf.matmul(inputs_flat, kernel, b_is_sparse=True) - output = backend.reshape(output_flat, [ - backend.shape(output_flat)[0], - ] + output_shape.as_list()[1:]) - return output - - -def local_conv_sparse_matmul(inputs, kernel, kernel_idxs, kernel_shape, - output_shape): - """Apply N-D convolution with un-shared weights using a single sparse matmul. - - This method outputs `inputs . tf.sparse.SparseTensor(indices=kernel_idxs, - values=kernel, dense_shape=kernel_shape)`, with `.` standing for - matrix-multiply. It also reshapes `inputs` to 2-D and `output` to (N+2)-D. - - Args: - inputs: (N+2)-D tensor with shape `(batch_size, channels_in, d_in1, ..., - d_inN)` or `(batch_size, d_in1, ..., d_inN, channels_in)`. - kernel: a 1-D tensor with shape `(len(kernel_idxs),)` containing all the - weights of the layer. - kernel_idxs: a list of integer tuples representing indices in a sparse - matrix performing the un-shared convolution as a matrix-multiply. - kernel_shape: a tuple `(input_size, output_size)`, where `input_size = - channels_in * d_in1 * ... * d_inN` and `output_size = channels_out * - d_out1 * ... * d_outN`. - output_shape: a tuple of (N+2) elements representing the output shape: - `(batch_size, channels_out, d_out1, ..., d_outN)` or `(batch_size, - d_out1, ..., d_outN, channels_out)`, with the ordering of channels and - spatial dimensions matching that of the input. - - Returns: - Output (N+2)-D dense tensor with shape `output_shape`. - """ - inputs_flat = backend.reshape(inputs, (backend.shape(inputs)[0], -1)) - output_flat = tf.sparse.sparse_dense_matmul( - sp_a=tf.SparseTensor(kernel_idxs, kernel, kernel_shape), - b=inputs_flat, - adjoint_b=True) - output_flat_transpose = backend.transpose(output_flat) - - output_reshaped = backend.reshape(output_flat_transpose, [ - backend.shape(output_flat_transpose)[0], - ] + output_shape.as_list()[1:]) - return output_reshaped + """Apply N-D convolution with un-shared weights using a single matmul call. + + This method outputs `inputs . (kernel * kernel_mask)` + (with `.` standing for matrix-multiply and `*` for element-wise multiply) + and requires a precomputed `kernel_mask` to zero-out weights in `kernel` and + hence perform the same operation as a convolution with un-shared + (the remaining entries in `kernel`) weights. It also does the necessary + reshapes to make `inputs` and `kernel` 2-D and `output` (N+2)-D. + + Args: + inputs: (N+2)-D tensor with shape `(batch_size, channels_in, d_in1, ..., + d_inN)` or `(batch_size, d_in1, ..., d_inN, channels_in)`. + kernel: the unshared weights for N-D convolution, + an (N+2)-D tensor of shape: `(d_in1, ..., d_inN, channels_in, + d_out2, ..., d_outN, channels_out)` or `(channels_in, d_in1, ..., + d_inN, channels_out, d_out2, ..., d_outN)`, with the ordering of + channels and spatial dimensions matching that of the input. Each + entry is the weight between a particular input and output location, + similarly to a fully-connected weight matrix. + kernel_mask: a float 0/1 mask tensor of shape: `(d_in1, ..., d_inN, 1, + d_out2, ..., d_outN, 1)` or `(1, d_in1, ..., d_inN, 1, d_out2, ..., + d_outN)`, with the ordering of singleton and spatial dimensions + matching that of the input. 
Mask represents the connectivity pattern + of the layer and is precomputed elsewhere based on layer parameters: + stride, padding, and the receptive field shape. + output_shape: a tuple of (N+2) elements representing the output shape: + `(batch_size, channels_out, d_out1, ..., d_outN)` or `(batch_size, + d_out1, ..., d_outN, channels_out)`, with the ordering of channels and + spatial dimensions matching that of the input. + + Returns: + Output (N+2)-D tensor with shape `output_shape`. + """ + inputs_flat = backend.reshape(inputs, (backend.shape(inputs)[0], -1)) + + kernel = kernel_mask * kernel + kernel = make_2d(kernel, split_dim=backend.ndim(kernel) // 2) + + output_flat = tf.matmul(inputs_flat, kernel, b_is_sparse=True) + output = backend.reshape( + output_flat, + [ + backend.shape(output_flat)[0], + ] + + output_shape.as_list()[1:], + ) + return output + + +def local_conv_sparse_matmul( + inputs, kernel, kernel_idxs, kernel_shape, output_shape +): + """Apply N-D convolution with unshared weights using a single sparse matmul. + + This method outputs `inputs . tf.sparse.SparseTensor(indices=kernel_idxs, + values=kernel, dense_shape=kernel_shape)`, with `.` standing for + matrix-multiply. It also reshapes `inputs` to 2-D and `output` to (N+2)-D. + + Args: + inputs: (N+2)-D tensor with shape `(batch_size, channels_in, d_in1, ..., + d_inN)` or `(batch_size, d_in1, ..., d_inN, channels_in)`. + kernel: a 1-D tensor with shape `(len(kernel_idxs),)` containing all the + weights of the layer. + kernel_idxs: a list of integer tuples representing indices in a sparse + matrix performing the un-shared convolution as a matrix-multiply. + kernel_shape: a tuple `(input_size, output_size)`, where `input_size = + channels_in * d_in1 * ... * d_inN` and `output_size = channels_out * + d_out1 * ... * d_outN`. + output_shape: a tuple of (N+2) elements representing the output shape: + `(batch_size, channels_out, d_out1, ..., d_outN)` or `(batch_size, + d_out1, ..., d_outN, channels_out)`, with the ordering of channels and + spatial dimensions matching that of the input. + + Returns: + Output (N+2)-D dense tensor with shape `output_shape`. + """ + inputs_flat = backend.reshape(inputs, (backend.shape(inputs)[0], -1)) + output_flat = tf.sparse.sparse_dense_matmul( + sp_a=tf.SparseTensor(kernel_idxs, kernel, kernel_shape), + b=inputs_flat, + adjoint_b=True, + ) + output_flat_transpose = backend.transpose(output_flat) + + output_reshaped = backend.reshape( + output_flat_transpose, + [ + backend.shape(output_flat_transpose)[0], + ] + + output_shape.as_list()[1:], + ) + return output_reshaped def make_2d(tensor, split_dim): - """Reshapes an N-dimensional tensor into a 2D tensor. + """Reshapes an N-dimensional tensor into a 2D tensor. - Dimensions before (excluding) and after (including) `split_dim` are grouped - together. + Dimensions before (excluding) and after (including) `split_dim` are grouped + together. - Args: - tensor: a tensor of shape `(d0, ..., d(N-1))`. - split_dim: an integer from 1 to N-1, index of the dimension to group - dimensions before (excluding) and after (including). + Args: + tensor: a tensor of shape `(d0, ..., d(N-1))`. + split_dim: an integer from 1 to N-1, index of the dimension to group + dimensions before (excluding) and after (including). - Returns: - Tensor of shape - `(d0 * ... * d(split_dim-1), d(split_dim) * ... * d(N-1))`. - """ - shape = tf.shape(tensor) - in_dims = shape[:split_dim] - out_dims = shape[split_dim:] + Returns: + Tensor of shape + `(d0 * ... 
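# The sparse path in miniature (illustrative): the same local convolution
# expressed as tf.sparse.sparse_dense_matmul over explicit (row, col)
# indices. In this toy example the sparse operand is laid out
# (output_size, input_size) so that the adjoint matmul conforms; all
# values are arbitrary example numbers.
import tensorflow.compat.v2 as tf

kernel_idxs = [(0, 0), (1, 0), (1, 1)]  # toy connectivity pattern
kernel = tf.constant([0.5, -1.0, 2.0])  # one weight per index
kernel_shape = (2, 3)  # (output_size, input_size)
inputs_flat = tf.random.normal((4, 3))  # (batch, input_size)
output_flat = tf.sparse.sparse_dense_matmul(
    sp_a=tf.SparseTensor(kernel_idxs, kernel, kernel_shape),
    b=inputs_flat,
    adjoint_b=True,
)  # (output_size, batch); the real code transposes this back
print(output_flat.shape)  # (2, 4)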
* d(split_dim-1), d(split_dim) * ... * d(N-1))`. + """ + shape = tf.shape(tensor) + in_dims = shape[:split_dim] + out_dims = shape[split_dim:] - in_size = tf.reduce_prod(in_dims) - out_size = tf.reduce_prod(out_dims) + in_size = tf.reduce_prod(in_dims) + out_size = tf.reduce_prod(out_dims) - return tf.reshape(tensor, (in_size, out_size)) + return tf.reshape(tensor, (in_size, out_size)) diff --git a/keras/layers/merging/BUILD b/keras/layers/merging/BUILD index 357606ec0f92..7de776ca2a18 100644 --- a/keras/layers/merging/BUILD +++ b/keras/layers/merging/BUILD @@ -1,15 +1,17 @@ # Description: # Contains the Keras merging layers. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/py/tensorflow_gnn:__subpackages__", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", ], diff --git a/keras/layers/merging/__init__.py b/keras/layers/merging/__init__.py index 406c6afbd8ac..beb834f31c73 100644 --- a/keras/layers/merging/__init__.py +++ b/keras/layers/merging/__init__.py @@ -13,24 +13,23 @@ # limitations under the License. # ============================================================================== """Keras merging layers.""" -# pylint: disable=g-bad-import-order + +# Merging functions. # Merging layers. from keras.layers.merging.add import Add -from keras.layers.merging.subtract import Subtract -from keras.layers.merging.multiply import Multiply +from keras.layers.merging.add import add from keras.layers.merging.average import Average -from keras.layers.merging.maximum import Maximum -from keras.layers.merging.minimum import Minimum +from keras.layers.merging.average import average from keras.layers.merging.concatenate import Concatenate +from keras.layers.merging.concatenate import concatenate from keras.layers.merging.dot import Dot - -# Merging functions. -from keras.layers.merging.add import add -from keras.layers.merging.subtract import subtract -from keras.layers.merging.multiply import multiply -from keras.layers.merging.average import average +from keras.layers.merging.dot import dot +from keras.layers.merging.maximum import Maximum from keras.layers.merging.maximum import maximum +from keras.layers.merging.minimum import Minimum from keras.layers.merging.minimum import minimum -from keras.layers.merging.concatenate import concatenate -from keras.layers.merging.dot import dot +from keras.layers.merging.multiply import Multiply +from keras.layers.merging.multiply import multiply +from keras.layers.merging.subtract import Subtract +from keras.layers.merging.subtract import subtract diff --git a/keras/layers/merging/add.py b/keras/layers/merging/add.py index 8e4997ecceb9..3df77c3efc9f 100644 --- a/keras/layers/merging/add.py +++ b/keras/layers/merging/add.py @@ -17,75 +17,76 @@ from keras.layers.merging.base_merge import _Merge +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Add') +@keras_export("keras.layers.Add") class Add(_Merge): - """Layer that adds a list of inputs. + """Layer that adds a list of inputs. 
- It takes as input a list of tensors, - all of the same shape, and returns - a single tensor (also of the same shape). + It takes as input a list of tensors, + all of the same shape, and returns + a single tensor (also of the same shape). - Examples: + Examples: - >>> input_shape = (2, 3, 4) - >>> x1 = tf.random.normal(input_shape) - >>> x2 = tf.random.normal(input_shape) - >>> y = tf.keras.layers.Add()([x1, x2]) - >>> print(y.shape) - (2, 3, 4) + >>> input_shape = (2, 3, 4) + >>> x1 = tf.random.normal(input_shape) + >>> x2 = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Add()([x1, x2]) + >>> print(y.shape) + (2, 3, 4) - Used in a functional model: + Used in a functional model: - >>> input1 = tf.keras.layers.Input(shape=(16,)) - >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) - >>> input2 = tf.keras.layers.Input(shape=(32,)) - >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) - >>> # equivalent to `added = tf.keras.layers.add([x1, x2])` - >>> added = tf.keras.layers.Add()([x1, x2]) - >>> out = tf.keras.layers.Dense(4)(added) - >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) + >>> input1 = tf.keras.layers.Input(shape=(16,)) + >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) + >>> input2 = tf.keras.layers.Input(shape=(32,)) + >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) + >>> # equivalent to `added = tf.keras.layers.add([x1, x2])` + >>> added = tf.keras.layers.Add()([x1, x2]) + >>> out = tf.keras.layers.Dense(4)(added) + >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) - """ + """ - def _merge_function(self, inputs): - output = inputs[0] - for i in range(1, len(inputs)): - output += inputs[i] - return output + def _merge_function(self, inputs): + output = inputs[0] + for i in range(1, len(inputs)): + output += inputs[i] + return output -@keras_export('keras.layers.add') +@keras_export("keras.layers.add") def add(inputs, **kwargs): - """Functional interface to the `tf.keras.layers.Add` layer. + """Functional interface to the `tf.keras.layers.Add` layer. - Args: - inputs: A list of input tensors with the same shape. - **kwargs: Standard layer keyword arguments. + Args: + inputs: A list of input tensors with the same shape. + **kwargs: Standard layer keyword arguments. - Returns: - A tensor as the sum of the inputs. It has the same shape as the inputs. + Returns: + A tensor as the sum of the inputs. It has the same shape as the inputs. 
- Examples: + Examples: - >>> input_shape = (2, 3, 4) - >>> x1 = tf.random.normal(input_shape) - >>> x2 = tf.random.normal(input_shape) - >>> y = tf.keras.layers.add([x1, x2]) - >>> print(y.shape) - (2, 3, 4) + >>> input_shape = (2, 3, 4) + >>> x1 = tf.random.normal(input_shape) + >>> x2 = tf.random.normal(input_shape) + >>> y = tf.keras.layers.add([x1, x2]) + >>> print(y.shape) + (2, 3, 4) - Used in a functional model: + Used in a functional model: - >>> input1 = tf.keras.layers.Input(shape=(16,)) - >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) - >>> input2 = tf.keras.layers.Input(shape=(32,)) - >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) - >>> added = tf.keras.layers.add([x1, x2]) - >>> out = tf.keras.layers.Dense(4)(added) - >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) + >>> input1 = tf.keras.layers.Input(shape=(16,)) + >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) + >>> input2 = tf.keras.layers.Input(shape=(32,)) + >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) + >>> added = tf.keras.layers.add([x1, x2]) + >>> out = tf.keras.layers.Dense(4)(added) + >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) - """ - return Add(**kwargs)(inputs) + """ + return Add(**kwargs)(inputs) diff --git a/keras/layers/merging/average.py b/keras/layers/merging/average.py index e019b6bb37e6..87261c167099 100644 --- a/keras/layers/merging/average.py +++ b/keras/layers/merging/average.py @@ -17,77 +17,78 @@ from keras.layers.merging.base_merge import _Merge +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Average') +@keras_export("keras.layers.Average") class Average(_Merge): - """Layer that averages a list of inputs element-wise. + """Layer that averages a list of inputs element-wise. - It takes as input a list of tensors, all of the same shape, and returns - a single tensor (also of the same shape). + It takes as input a list of tensors, all of the same shape, and returns + a single tensor (also of the same shape). - Example: + Example: - >>> x1 = np.ones((2, 2)) - >>> x2 = np.zeros((2, 2)) - >>> y = tf.keras.layers.Average()([x1, x2]) - >>> y.numpy().tolist() - [[0.5, 0.5], [0.5, 0.5]] + >>> x1 = np.ones((2, 2)) + >>> x2 = np.zeros((2, 2)) + >>> y = tf.keras.layers.Average()([x1, x2]) + >>> y.numpy().tolist() + [[0.5, 0.5], [0.5, 0.5]] - Usage in a functional model: + Usage in a functional model: - >>> input1 = tf.keras.layers.Input(shape=(16,)) - >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) - >>> input2 = tf.keras.layers.Input(shape=(32,)) - >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) - >>> avg = tf.keras.layers.Average()([x1, x2]) - >>> out = tf.keras.layers.Dense(4)(avg) - >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) + >>> input1 = tf.keras.layers.Input(shape=(16,)) + >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) + >>> input2 = tf.keras.layers.Input(shape=(32,)) + >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) + >>> avg = tf.keras.layers.Average()([x1, x2]) + >>> out = tf.keras.layers.Dense(4)(avg) + >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) - Raises: - ValueError: If there is a shape mismatch between the inputs and the shapes - cannot be broadcasted to match. - """ + Raises: + ValueError: If there is a shape mismatch between the inputs and the shapes + cannot be broadcasted to match. 
+ """ - def _merge_function(self, inputs): - output = inputs[0] - for i in range(1, len(inputs)): - output += inputs[i] - return output / len(inputs) + def _merge_function(self, inputs): + output = inputs[0] + for i in range(1, len(inputs)): + output += inputs[i] + return output / len(inputs) -@keras_export('keras.layers.average') +@keras_export("keras.layers.average") def average(inputs, **kwargs): - """Functional interface to the `tf.keras.layers.Average` layer. - - Example: - - >>> x1 = np.ones((2, 2)) - >>> x2 = np.zeros((2, 2)) - >>> y = tf.keras.layers.Average()([x1, x2]) - >>> y.numpy().tolist() - [[0.5, 0.5], [0.5, 0.5]] - - Usage in a functional model: - - >>> input1 = tf.keras.layers.Input(shape=(16,)) - >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) - >>> input2 = tf.keras.layers.Input(shape=(32,)) - >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) - >>> avg = tf.keras.layers.Average()([x1, x2]) - >>> out = tf.keras.layers.Dense(4)(avg) - >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) - - Args: - inputs: A list of input tensors. - **kwargs: Standard layer keyword arguments. - - Returns: - A tensor, the average of the inputs. - - Raises: - ValueError: If there is a shape mismatch between the inputs and the shapes - cannot be broadcasted to match. - """ - return Average(**kwargs)(inputs) + """Functional interface to the `tf.keras.layers.Average` layer. + + Example: + + >>> x1 = np.ones((2, 2)) + >>> x2 = np.zeros((2, 2)) + >>> y = tf.keras.layers.Average()([x1, x2]) + >>> y.numpy().tolist() + [[0.5, 0.5], [0.5, 0.5]] + + Usage in a functional model: + + >>> input1 = tf.keras.layers.Input(shape=(16,)) + >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) + >>> input2 = tf.keras.layers.Input(shape=(32,)) + >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) + >>> avg = tf.keras.layers.Average()([x1, x2]) + >>> out = tf.keras.layers.Dense(4)(avg) + >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) + + Args: + inputs: A list of input tensors. + **kwargs: Standard layer keyword arguments. + + Returns: + A tensor, the average of the inputs. + + Raises: + ValueError: If there is a shape mismatch between the inputs and the shapes + cannot be broadcasted to match. + """ + return Average(**kwargs)(inputs) diff --git a/keras/layers/merging/base_merge.py b/keras/layers/merging/base_merge.py index a73db401984d..058de0a0eb21 100644 --- a/keras/layers/merging/base_merge.py +++ b/keras/layers/merging/base_merge.py @@ -14,205 +14,229 @@ # ============================================================================== """Private base class for layers that can merge several inputs into one.""" +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine.base_layer import Layer from keras.utils import tf_utils -import tensorflow.compat.v2 as tf class _Merge(Layer): - """Generic merge layer for elementwise merge functions. - - Used to implement `Sum`, `Average`, etc. - """ - - def __init__(self, **kwargs): - """Initializes a Merge layer. + """Generic merge layer for elementwise merge functions. - Args: - **kwargs: standard layer keyword arguments. + Used to implement `Sum`, `Average`, etc. """ - super().__init__(**kwargs) - self.supports_masking = True - def _merge_function(self, inputs): - raise NotImplementedError - - def _compute_elemwise_op_output_shape(self, shape1, shape2): - """Computes the shape of the resultant of an elementwise operation. - - Args: - shape1: tuple or None. 
Shape of the first tensor - shape2: tuple or None. Shape of the second tensor - - Returns: - expected output shape when an element-wise operation is - carried out on 2 tensors with shapes shape1 and shape2. - tuple or None. - - Raises: - ValueError: if shape1 and shape2 are not compatible for - element-wise operations. - """ - if None in [shape1, shape2]: - return None - elif len(shape1) < len(shape2): - return self._compute_elemwise_op_output_shape(shape2, shape1) # pylint: disable=arguments-out-of-order - elif not shape2: - return shape1 - output_shape = list(shape1[:-len(shape2)]) - for i, j in zip(shape1[-len(shape2):], shape2): - if i is None or j is None: - output_shape.append(None) - elif i == 1: - output_shape.append(j) - elif j == 1: - output_shape.append(i) - else: - if i != j: - raise ValueError( - 'Inputs have incompatible shapes. ' - f'Received shapes {shape1} and {shape2}') - output_shape.append(i) - return tuple(output_shape) - - @tf_utils.shape_type_conversion - def build(self, input_shape): - # Used purely for shape validation. - if not isinstance(input_shape[0], tuple): - raise ValueError( - 'A merge layer should be called on a list of inputs. ' - f'Received: input_shape={input_shape} (not a list of shapes)') - if len(input_shape) < 1: - raise ValueError('A merge layer should be called ' - 'on a list of at least 1 input. ' - f'Got {len(input_shape)} inputs. ' - f'Full input_shape received: {input_shape}') - batch_sizes = {s[0] for s in input_shape if s} - {None} - if len(batch_sizes) > 1: - raise ValueError( - 'Cannot merge tensors with different batch sizes. ' - f'Got tensors with shapes {input_shape}') - if input_shape[0] is None: - output_shape = None - else: - output_shape = input_shape[0][1:] - for i in range(1, len(input_shape)): - if input_shape[i] is None: - shape = None - else: - shape = input_shape[i][1:] - output_shape = self._compute_elemwise_op_output_shape(output_shape, shape) - # If the inputs have different ranks, we have to reshape them - # to make them broadcastable. - if None not in input_shape and len(set(map(len, input_shape))) == 1: - self._reshape_required = False - else: - self._reshape_required = True - - def call(self, inputs): - if not isinstance(inputs, (list, tuple)): - raise ValueError( - 'A merge layer should be called on a list of inputs. ' - f'Received: inputs={inputs} (not a list of tensors)') - if self._reshape_required: - reshaped_inputs = [] - input_ndims = list(map(backend.ndim, inputs)) - if None not in input_ndims: - # If ranks of all inputs are available, - # we simply expand each of them at axis=1 - # until all of them have the same rank. - max_ndim = max(input_ndims) - for x in inputs: - x_ndim = backend.ndim(x) - for _ in range(max_ndim - x_ndim): - x = tf.expand_dims(x, axis=1) - reshaped_inputs.append(x) - return self._merge_function(reshaped_inputs) - else: - # Transpose all inputs so that batch size is the last dimension. - # (batch_size, dim1, dim2, ... ) -> (dim1, dim2, ... 
, batch_size) - transposed = False - for x in inputs: - x_ndim = backend.ndim(x) - if x_ndim is None: - x_shape = tf.shape(x) - batch_size = x_shape[0] - new_shape = backend.concatenate( - [x_shape[1:], - tf.expand_dims(batch_size, axis=-1)]) - x_transposed = tf.reshape( - x, - tf.stack( - [batch_size, tf.reduce_prod(x_shape[1:])], axis=0)) - x_transposed = tf.transpose(x_transposed, perm=(1, 0)) - x_transposed = tf.reshape(x_transposed, new_shape) - reshaped_inputs.append(x_transposed) - transposed = True - elif x_ndim > 1: - dims = list(range(1, x_ndim)) + [0] - reshaped_inputs.append(tf.transpose(x, perm=dims)) - transposed = True - else: - # We don't transpose inputs if they are 1D vectors or scalars. - reshaped_inputs.append(x) - y = self._merge_function(reshaped_inputs) - y_ndim = backend.ndim(y) - if transposed: - # If inputs have been transposed, we have to transpose the output too. - if y_ndim is None: - y_shape = tf.shape(y) - y_ndim = tf.shape(y_shape)[0] - batch_size = y_shape[y_ndim - 1] - new_shape = backend.concatenate([ - tf.expand_dims(batch_size, axis=-1), y_shape[:y_ndim - 1] - ]) - y = tf.reshape(y, (-1, batch_size)) - y = tf.transpose(y, perm=(1, 0)) - y = tf.reshape(y, new_shape) - elif y_ndim > 1: - dims = [y_ndim - 1] + list(range(y_ndim - 1)) - y = tf.transpose(y, perm=dims) - return y - else: - return self._merge_function(inputs) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if input_shape[0] is None: - output_shape = None - else: - output_shape = input_shape[0][1:] - for i in range(1, len(input_shape)): - if input_shape[i] is None: - shape = None - else: - shape = input_shape[i][1:] - output_shape = self._compute_elemwise_op_output_shape(output_shape, shape) - batch_sizes = {s[0] for s in input_shape if s is not None} - {None} - if len(batch_sizes) == 1: - output_shape = (list(batch_sizes)[0],) + output_shape - else: - output_shape = (None,) + output_shape - return output_shape - - def compute_mask(self, inputs, mask=None): - if mask is None: - return None - if not isinstance(mask, (tuple, list)): - raise ValueError(f'`mask` should be a list. Received: mask={mask}') - if not isinstance(inputs, (tuple, list)): - raise ValueError(f'`inputs` should be a list. Received: inputs={inputs}') - if len(mask) != len(inputs): - raise ValueError( - 'The lists `inputs` and `mask` should have the same length. ' - f'Received: inputs={inputs} of length {len(inputs)}, and ' - f'mask={mask} of length {len(mask)}') - if all(m is None for m in mask): - return None - masks = [tf.expand_dims(m, axis=0) for m in mask if m is not None] - return backend.all( - backend.concatenate(masks, axis=0), axis=0, keepdims=False) - - def get_config(self): # pylint: disable=useless-super-delegation - return super().get_config() + def __init__(self, **kwargs): + """Initializes a Merge layer. + + Args: + **kwargs: standard layer keyword arguments. + """ + super().__init__(**kwargs) + self.supports_masking = True + + def _merge_function(self, inputs): + raise NotImplementedError + + def _compute_elemwise_op_output_shape(self, shape1, shape2): + """Computes the shape of the resultant of an elementwise operation. + + Args: + shape1: tuple or None. Shape of the first tensor + shape2: tuple or None. Shape of the second tensor + + Returns: + expected output shape when an element-wise operation is + carried out on 2 tensors with shapes shape1 and shape2. + tuple or None. + + Raises: + ValueError: if shape1 and shape2 are not compatible for + element-wise operations. 
+ """ + if None in [shape1, shape2]: + return None + elif len(shape1) < len(shape2): + return self._compute_elemwise_op_output_shape(shape2, shape1) + elif not shape2: + return shape1 + output_shape = list(shape1[: -len(shape2)]) + for i, j in zip(shape1[-len(shape2) :], shape2): + if i is None or j is None: + output_shape.append(None) + elif i == 1: + output_shape.append(j) + elif j == 1: + output_shape.append(i) + else: + if i != j: + raise ValueError( + "Inputs have incompatible shapes. " + f"Received shapes {shape1} and {shape2}" + ) + output_shape.append(i) + return tuple(output_shape) + + @tf_utils.shape_type_conversion + def build(self, input_shape): + # Used purely for shape validation. + if not isinstance(input_shape[0], tuple): + raise ValueError( + "A merge layer should be called on a list of inputs. " + f"Received: input_shape={input_shape} (not a list of shapes)" + ) + if len(input_shape) < 1: + raise ValueError( + "A merge layer should be called " + "on a list of at least 1 input. " + f"Got {len(input_shape)} inputs. " + f"Full input_shape received: {input_shape}" + ) + batch_sizes = {s[0] for s in input_shape if s} - {None} + if len(batch_sizes) > 1: + raise ValueError( + "Cannot merge tensors with different batch sizes. " + f"Got tensors with shapes {input_shape}" + ) + if input_shape[0] is None: + output_shape = None + else: + output_shape = input_shape[0][1:] + for i in range(1, len(input_shape)): + if input_shape[i] is None: + shape = None + else: + shape = input_shape[i][1:] + output_shape = self._compute_elemwise_op_output_shape( + output_shape, shape + ) + # If the inputs have different ranks, we have to reshape them + # to make them broadcastable. + if None not in input_shape and len(set(map(len, input_shape))) == 1: + self._reshape_required = False + else: + self._reshape_required = True + + def call(self, inputs): + if not isinstance(inputs, (list, tuple)): + raise ValueError( + "A merge layer should be called on a list of inputs. " + f"Received: inputs={inputs} (not a list of tensors)" + ) + if self._reshape_required: + reshaped_inputs = [] + input_ndims = list(map(backend.ndim, inputs)) + if None not in input_ndims: + # If ranks of all inputs are available, + # we simply expand each of them at axis=1 + # until all of them have the same rank. + max_ndim = max(input_ndims) + for x in inputs: + x_ndim = backend.ndim(x) + for _ in range(max_ndim - x_ndim): + x = tf.expand_dims(x, axis=1) + reshaped_inputs.append(x) + return self._merge_function(reshaped_inputs) + else: + # Transpose all inputs so that batch size is the last dimension. + # (batch_size, dim1, dim2, ... ) -> (dim1, dim2, ... , + # batch_size) + transposed = False + for x in inputs: + x_ndim = backend.ndim(x) + if x_ndim is None: + x_shape = tf.shape(x) + batch_size = x_shape[0] + new_shape = backend.concatenate( + [x_shape[1:], tf.expand_dims(batch_size, axis=-1)] + ) + x_transposed = tf.reshape( + x, + tf.stack( + [batch_size, tf.reduce_prod(x_shape[1:])], + axis=0, + ), + ) + x_transposed = tf.transpose(x_transposed, perm=(1, 0)) + x_transposed = tf.reshape(x_transposed, new_shape) + reshaped_inputs.append(x_transposed) + transposed = True + elif x_ndim > 1: + dims = list(range(1, x_ndim)) + [0] + reshaped_inputs.append(tf.transpose(x, perm=dims)) + transposed = True + else: + # We don't transpose inputs if they are 1D vectors or + # scalars. 
+ reshaped_inputs.append(x) + y = self._merge_function(reshaped_inputs) + y_ndim = backend.ndim(y) + if transposed: + # If inputs have been transposed, we have to transpose the + # output too. + if y_ndim is None: + y_shape = tf.shape(y) + y_ndim = tf.shape(y_shape)[0] + batch_size = y_shape[y_ndim - 1] + new_shape = backend.concatenate( + [ + tf.expand_dims(batch_size, axis=-1), + y_shape[: y_ndim - 1], + ] + ) + y = tf.reshape(y, (-1, batch_size)) + y = tf.transpose(y, perm=(1, 0)) + y = tf.reshape(y, new_shape) + elif y_ndim > 1: + dims = [y_ndim - 1] + list(range(y_ndim - 1)) + y = tf.transpose(y, perm=dims) + return y + else: + return self._merge_function(inputs) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if input_shape[0] is None: + output_shape = None + else: + output_shape = input_shape[0][1:] + for i in range(1, len(input_shape)): + if input_shape[i] is None: + shape = None + else: + shape = input_shape[i][1:] + output_shape = self._compute_elemwise_op_output_shape( + output_shape, shape + ) + batch_sizes = {s[0] for s in input_shape if s is not None} - {None} + if len(batch_sizes) == 1: + output_shape = (list(batch_sizes)[0],) + output_shape + else: + output_shape = (None,) + output_shape + return output_shape + + def compute_mask(self, inputs, mask=None): + if mask is None: + return None + if not isinstance(mask, (tuple, list)): + raise ValueError(f"`mask` should be a list. Received: mask={mask}") + if not isinstance(inputs, (tuple, list)): + raise ValueError( + f"`inputs` should be a list. Received: inputs={inputs}" + ) + if len(mask) != len(inputs): + raise ValueError( + "The lists `inputs` and `mask` should have the same length. " + f"Received: inputs={inputs} of length {len(inputs)}, and " + f"mask={mask} of length {len(mask)}" + ) + if all(m is None for m in mask): + return None + masks = [tf.expand_dims(m, axis=0) for m in mask if m is not None] + return backend.all( + backend.concatenate(masks, axis=0), axis=0, keepdims=False + ) + + def get_config(self): + return super().get_config() diff --git a/keras/layers/merging/concatenate.py b/keras/layers/merging/concatenate.py index 79dff736940a..3818e332d60c 100644 --- a/keras/layers/merging/concatenate.py +++ b/keras/layers/merging/concatenate.py @@ -15,51 +15,23 @@ """Layer that concatenates several inputs.""" +import tensorflow.compat.v2 as tf + from keras import backend from keras.layers.merging.base_merge import _Merge from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Concatenate') +@keras_export("keras.layers.Concatenate") class Concatenate(_Merge): - """Layer that concatenates a list of inputs. - - It takes as input a list of tensors, all of the same shape except - for the concatenation axis, and returns a single tensor that is the - concatenation of all inputs. - - >>> x = np.arange(20).reshape(2, 2, 5) - >>> print(x) - [[[ 0 1 2 3 4] - [ 5 6 7 8 9]] - [[10 11 12 13 14] - [15 16 17 18 19]]] - >>> y = np.arange(20, 30).reshape(2, 1, 5) - >>> print(y) - [[[20 21 22 23 24]] - [[25 26 27 28 29]]] - >>> tf.keras.layers.Concatenate(axis=1)([x, y]) - - - >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2)) - >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2)) - >>> concatted = tf.keras.layers.Concatenate()([x1, x2]) - >>> concatted.shape - TensorShape([5, 16]) + """Layer that concatenates a list of inputs. 
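# Runtime illustration of the rank-broadcasting branch of call() above:
# merging a (batch, 4, 5) tensor with a rank-1, one-scalar-per-sample
# tensor expands the lower-rank input along axis=1 until the ranks match.
import numpy as np
import tensorflow.compat.v2 as tf

import keras

x = tf.constant(np.random.random((2, 4, 5)), dtype=tf.float32)
s = tf.constant([1.0, 2.0])  # rank 1: one scalar per sample
y = keras.layers.Add()([x, s])
print(y.shape)  # (2, 4, 5)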
- """ - - def __init__(self, axis=-1, **kwargs): - """Instantiates a Concatenate layer. + It takes as input a list of tensors, all of the same shape except + for the concatenation axis, and returns a single tensor that is the + concatenation of all inputs. >>> x = np.arange(20).reshape(2, 2, 5) >>> print(x) @@ -80,138 +52,180 @@ def __init__(self, axis=-1, **kwargs): [15, 16, 17, 18, 19], [25, 26, 27, 28, 29]]])> - Args: - axis: Axis along which to concatenate. - **kwargs: standard layer keyword arguments. + >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2)) + >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2)) + >>> concatted = tf.keras.layers.Concatenate()([x1, x2]) + >>> concatted.shape + TensorShape([5, 16]) + """ - super().__init__(**kwargs) - self.axis = axis - self.supports_masking = True - self._reshape_required = False - - @tf_utils.shape_type_conversion - def build(self, input_shape): - # Used purely for shape validation. - if len(input_shape) < 1 or not isinstance(input_shape[0], tuple): - raise ValueError( - 'A `Concatenate` layer should be called on a list of ' - f'at least 1 input. Received: input_shape={input_shape}') - if all(shape is None for shape in input_shape): - return - reduced_inputs_shapes = [list(shape) for shape in input_shape] - shape_set = set() - for i in range(len(reduced_inputs_shapes)): - del reduced_inputs_shapes[i][self.axis] - shape_set.add(tuple(reduced_inputs_shapes[i])) - - if len(shape_set) != 1: - err_msg = ('A `Concatenate` layer requires inputs with matching shapes ' - 'except for the concatenation axis. ' - f'Received: input_shape={input_shape}') - # Make sure all the shapes have same ranks. - ranks = set(len(shape) for shape in shape_set) - if len(ranks) != 1: - raise ValueError(err_msg) - # Get the only rank for the set. - (rank,) = ranks - for axis in range(rank): - # Skip the Nones in the shape since they are dynamic, also the axis for - # concat has been removed above. - unique_dims = set( - shape[axis] for shape in shape_set if shape[axis] is not None) - if len(unique_dims) > 1: - raise ValueError(err_msg) - - def _merge_function(self, inputs): - return backend.concatenate(inputs, axis=self.axis) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if ((not isinstance(input_shape, (tuple, list))) or - (not isinstance(input_shape[0], (tuple, list)))): - # The tf_utils.shape_type_conversion decorator turns tensorshapes - # into tuples, so we need to verify that `input_shape` is a list/tuple, - # *and* that the individual elements are themselves shape tuples. - raise ValueError( - 'A `Concatenate` layer should be called on a list of inputs. ' - f'Received: input_shape={input_shape}') - input_shapes = input_shape - output_shape = list(input_shapes[0]) - for shape in input_shapes[1:]: - if output_shape[self.axis] is None or shape[self.axis] is None: - output_shape[self.axis] = None - break - output_shape[self.axis] += shape[self.axis] - return tuple(output_shape) - - def compute_mask(self, inputs, mask=None): - if mask is None: - return None - if not isinstance(mask, (tuple, list)): - raise ValueError(f'`mask` should be a list. Received mask={mask}') - if not isinstance(inputs, (tuple, list)): - raise ValueError(f'`inputs` should be a list. Received: inputs={inputs}') - if len(mask) != len(inputs): - raise ValueError( - 'The lists `inputs` and `mask` should have the same length. 
' - f'Received: inputs={inputs} of length {len(inputs)}, and ' - f'mask={mask} of length {len(mask)}') - if all(m is None for m in mask): - return None - # Make a list of masks while making sure - # the dimensionality of each mask - # is the same as the corresponding input. - masks = [] - for input_i, mask_i in zip(inputs, mask): - if mask_i is None: - # Input is unmasked. Append all 1s to masks, - masks.append(tf.ones_like(input_i, dtype='bool')) - elif backend.ndim(mask_i) < backend.ndim(input_i): - # Mask is smaller than the input, expand it - masks.append(tf.expand_dims(mask_i, axis=-1)) - else: - masks.append(mask_i) - concatenated = backend.concatenate(masks, axis=self.axis) - return backend.all(concatenated, axis=-1, keepdims=False) - - def get_config(self): - config = { - 'axis': self.axis, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.concatenate') + + def __init__(self, axis=-1, **kwargs): + """Instantiates a Concatenate layer. + + >>> x = np.arange(20).reshape(2, 2, 5) + >>> print(x) + [[[ 0 1 2 3 4] + [ 5 6 7 8 9]] + [[10 11 12 13 14] + [15 16 17 18 19]]] + >>> y = np.arange(20, 30).reshape(2, 1, 5) + >>> print(y) + [[[20 21 22 23 24]] + [[25 26 27 28 29]]] + >>> tf.keras.layers.Concatenate(axis=1)([x, y]) + + + Args: + axis: Axis along which to concatenate. + **kwargs: standard layer keyword arguments. + """ + super().__init__(**kwargs) + self.axis = axis + self.supports_masking = True + self._reshape_required = False + + @tf_utils.shape_type_conversion + def build(self, input_shape): + # Used purely for shape validation. + if len(input_shape) < 1 or not isinstance(input_shape[0], tuple): + raise ValueError( + "A `Concatenate` layer should be called on a list of " + f"at least 1 input. Received: input_shape={input_shape}" + ) + if all(shape is None for shape in input_shape): + return + reduced_inputs_shapes = [list(shape) for shape in input_shape] + shape_set = set() + for i in range(len(reduced_inputs_shapes)): + del reduced_inputs_shapes[i][self.axis] + shape_set.add(tuple(reduced_inputs_shapes[i])) + + if len(shape_set) != 1: + err_msg = ( + "A `Concatenate` layer requires inputs with matching shapes " + "except for the concatenation axis. " + f"Received: input_shape={input_shape}" + ) + # Make sure all the shapes have same ranks. + ranks = set(len(shape) for shape in shape_set) + if len(ranks) != 1: + raise ValueError(err_msg) + # Get the only rank for the set. + (rank,) = ranks + for axis in range(rank): + # Skip the Nones in the shape since they are dynamic, also the + # axis for concat has been removed above. + unique_dims = set( + shape[axis] + for shape in shape_set + if shape[axis] is not None + ) + if len(unique_dims) > 1: + raise ValueError(err_msg) + + def _merge_function(self, inputs): + return backend.concatenate(inputs, axis=self.axis) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if (not isinstance(input_shape, (tuple, list))) or ( + not isinstance(input_shape[0], (tuple, list)) + ): + # The tf_utils.shape_type_conversion decorator turns tensorshapes + # into tuples, so we need to verify that `input_shape` is a + # list/tuple, *and* that the individual elements are themselves + # shape tuples. + raise ValueError( + "A `Concatenate` layer should be called on a list of inputs. 
" + f"Received: input_shape={input_shape}" + ) + input_shapes = input_shape + output_shape = list(input_shapes[0]) + for shape in input_shapes[1:]: + if output_shape[self.axis] is None or shape[self.axis] is None: + output_shape[self.axis] = None + break + output_shape[self.axis] += shape[self.axis] + return tuple(output_shape) + + def compute_mask(self, inputs, mask=None): + if mask is None: + return None + if not isinstance(mask, (tuple, list)): + raise ValueError(f"`mask` should be a list. Received mask={mask}") + if not isinstance(inputs, (tuple, list)): + raise ValueError( + f"`inputs` should be a list. Received: inputs={inputs}" + ) + if len(mask) != len(inputs): + raise ValueError( + "The lists `inputs` and `mask` should have the same length. " + f"Received: inputs={inputs} of length {len(inputs)}, and " + f"mask={mask} of length {len(mask)}" + ) + if all(m is None for m in mask): + return None + # Make a list of masks while making sure + # the dimensionality of each mask + # is the same as the corresponding input. + masks = [] + for input_i, mask_i in zip(inputs, mask): + if mask_i is None: + # Input is unmasked. Append all 1s to masks, + masks.append(tf.ones_like(input_i, dtype="bool")) + elif backend.ndim(mask_i) < backend.ndim(input_i): + # Mask is smaller than the input, expand it + masks.append(tf.expand_dims(mask_i, axis=-1)) + else: + masks.append(mask_i) + concatenated = backend.concatenate(masks, axis=self.axis) + return backend.all(concatenated, axis=-1, keepdims=False) + + def get_config(self): + config = { + "axis": self.axis, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.layers.concatenate") def concatenate(inputs, axis=-1, **kwargs): - """Functional interface to the `Concatenate` layer. - - >>> x = np.arange(20).reshape(2, 2, 5) - >>> print(x) - [[[ 0 1 2 3 4] - [ 5 6 7 8 9]] - [[10 11 12 13 14] - [15 16 17 18 19]]] - >>> y = np.arange(20, 30).reshape(2, 1, 5) - >>> print(y) - [[[20 21 22 23 24]] - [[25 26 27 28 29]]] - >>> tf.keras.layers.concatenate([x, y], - ... axis=1) - - - Args: - inputs: A list of input tensors. - axis: Concatenation axis. - **kwargs: Standard layer keyword arguments. - - Returns: - A tensor, the concatenation of the inputs alongside axis `axis`. - """ - return Concatenate(axis=axis, **kwargs)(inputs) + """Functional interface to the `Concatenate` layer. + + >>> x = np.arange(20).reshape(2, 2, 5) + >>> print(x) + [[[ 0 1 2 3 4] + [ 5 6 7 8 9]] + [[10 11 12 13 14] + [15 16 17 18 19]]] + >>> y = np.arange(20, 30).reshape(2, 1, 5) + >>> print(y) + [[[20 21 22 23 24]] + [[25 26 27 28 29]]] + >>> tf.keras.layers.concatenate([x, y], + ... axis=1) + + + Args: + inputs: A list of input tensors. + axis: Concatenation axis. + **kwargs: Standard layer keyword arguments. + + Returns: + A tensor, the concatenation of the inputs alongside axis `axis`. 
+ """ + return Concatenate(axis=axis, **kwargs)(inputs) diff --git a/keras/layers/merging/dot.py b/keras/layers/merging/dot.py index 249457c3a22d..27fb48350925 100644 --- a/keras/layers/merging/dot.py +++ b/keras/layers/merging/dot.py @@ -15,200 +15,212 @@ """Layer that computes the dot product between two inputs.""" +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine import base_layer_utils from keras.layers.merging.base_merge import _Merge from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Dot') +@keras_export("keras.layers.Dot") class Dot(_Merge): - """Layer that computes a dot product between samples in two tensors. - - E.g. if applied to a list of two tensors `a` and `b` of shape - `(batch_size, n)`, the output will be a tensor of shape `(batch_size, 1)` - where each entry `i` will be the dot product between - `a[i]` and `b[i]`. - - >>> x = np.arange(10).reshape(1, 5, 2) - >>> print(x) - [[[0 1] - [2 3] - [4 5] - [6 7] - [8 9]]] - >>> y = np.arange(10, 20).reshape(1, 2, 5) - >>> print(y) - [[[10 11 12 13 14] - [15 16 17 18 19]]] - >>> tf.keras.layers.Dot(axes=(1, 2))([x, y]) - - - >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2)) - >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2)) - >>> dotted = tf.keras.layers.Dot(axes=1)([x1, x2]) - >>> dotted.shape - TensorShape([5, 1]) - - - """ - - def __init__(self, axes, normalize=False, **kwargs): - """Initializes a layer that computes the element-wise dot product. - - >>> x = np.arange(10).reshape(1, 5, 2) - >>> print(x) - [[[0 1] - [2 3] - [4 5] - [6 7] - [8 9]]] - >>> y = np.arange(10, 20).reshape(1, 2, 5) - >>> print(y) - [[[10 11 12 13 14] - [15 16 17 18 19]]] - >>> tf.keras.layers.Dot(axes=(1, 2))([x, y]) - + """Layer that computes a dot product between samples in two tensors. + + E.g. if applied to a list of two tensors `a` and `b` of shape + `(batch_size, n)`, the output will be a tensor of shape `(batch_size, 1)` + where each entry `i` will be the dot product between + `a[i]` and `b[i]`. + + >>> x = np.arange(10).reshape(1, 5, 2) + >>> print(x) + [[[0 1] + [2 3] + [4 5] + [6 7] + [8 9]]] + >>> y = np.arange(10, 20).reshape(1, 2, 5) + >>> print(y) + [[[10 11 12 13 14] + [15 16 17 18 19]]] + >>> tf.keras.layers.Dot(axes=(1, 2))([x, y]) + + + >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2)) + >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2)) + >>> dotted = tf.keras.layers.Dot(axes=1)([x1, x2]) + >>> dotted.shape + TensorShape([5, 1]) + - Args: - axes: Integer or tuple of integers, - axis or axes along which to take the dot product. If a tuple, should - be two integers corresponding to the desired axis from the first input - and the desired axis from the second input, respectively. Note that the - size of the two selected axes must match. - normalize: Whether to L2-normalize samples along the - dot product axis before taking the dot product. - If set to True, then the output of the dot product - is the cosine proximity between the two samples. - **kwargs: Standard layer keyword arguments. """ - super().__init__(**kwargs) - if not isinstance(axes, int): - if not isinstance(axes, (list, tuple)): - raise TypeError( - 'Invalid type for argument `axes`: it should be ' - f'a list or an int. Received: axes={axes}') - if len(axes) != 2: - raise ValueError( - 'Invalid format for argument `axes`: it should contain two ' - f'elements. 
Received: axes={axes}') - if not isinstance(axes[0], int) or not isinstance(axes[1], int): - raise ValueError( - 'Invalid format for argument `axes`: list elements should be ' - f'integers. Received: axes={axes}') - self.axes = axes - self.normalize = normalize - self.supports_masking = True - self._reshape_required = False - - @tf_utils.shape_type_conversion - def build(self, input_shape): - # Used purely for shape validation. - if not isinstance(input_shape[0], tuple) or len(input_shape) != 2: - raise ValueError( - 'A `Dot` layer should be called on a list of 2 inputs. ' - f'Received: input_shape={input_shape}') - shape1 = input_shape[0] - shape2 = input_shape[1] - if shape1 is None or shape2 is None: - return - if isinstance(self.axes, int): - if self.axes < 0: - axes = [self.axes % len(shape1), self.axes % len(shape2)] - else: - axes = [self.axes] * 2 - else: - axes = self.axes - if shape1[axes[0]] != shape2[axes[1]]: - raise ValueError( - 'Incompatible input shapes: ' - f'axis values {shape1[axes[0]]} (at axis {axes[0]}) != ' - f'{shape2[axes[1]]} (at axis {axes[1]}). ' - f'Full input shapes: {shape1}, {shape2}') - - def _merge_function(self, inputs): - base_layer_utils.no_ragged_support(inputs, self.name) - if len(inputs) != 2: - raise ValueError( - 'A `Dot` layer should be called on exactly 2 inputs. ' - f'Received: inputs={inputs}') - x1 = inputs[0] - x2 = inputs[1] - if isinstance(self.axes, int): - if self.axes < 0: - axes = [self.axes % backend.ndim(x1), self.axes % backend.ndim(x2)] - else: - axes = [self.axes] * 2 - else: - axes = [] - for i in range(len(self.axes)): - if self.axes[i] < 0: - axes.append(self.axes[i] % backend.ndim(inputs[i])) + + def __init__(self, axes, normalize=False, **kwargs): + """Initializes a layer that computes the element-wise dot product. + + >>> x = np.arange(10).reshape(1, 5, 2) + >>> print(x) + [[[0 1] + [2 3] + [4 5] + [6 7] + [8 9]]] + >>> y = np.arange(10, 20).reshape(1, 2, 5) + >>> print(y) + [[[10 11 12 13 14] + [15 16 17 18 19]]] + >>> tf.keras.layers.Dot(axes=(1, 2))([x, y]) + + + Args: + axes: Integer or tuple of integers, + axis or axes along which to take the dot product. If a tuple, should + be two integers corresponding to the desired axis from the first + input and the desired axis from the second input, respectively. Note + that the size of the two selected axes must match. + normalize: Whether to L2-normalize samples along the + dot product axis before taking the dot product. + If set to True, then the output of the dot product + is the cosine proximity between the two samples. + **kwargs: Standard layer keyword arguments. + """ + super().__init__(**kwargs) + if not isinstance(axes, int): + if not isinstance(axes, (list, tuple)): + raise TypeError( + "Invalid type for argument `axes`: it should be " + f"a list or an int. Received: axes={axes}" + ) + if len(axes) != 2: + raise ValueError( + "Invalid format for argument `axes`: it should contain two " + f"elements. Received: axes={axes}" + ) + if not isinstance(axes[0], int) or not isinstance(axes[1], int): + raise ValueError( + "Invalid format for argument `axes`: list elements should " + f"be integers. Received: axes={axes}" + ) + self.axes = axes + self.normalize = normalize + self.supports_masking = True + self._reshape_required = False + + @tf_utils.shape_type_conversion + def build(self, input_shape): + # Used purely for shape validation. 
+ if not isinstance(input_shape[0], tuple) or len(input_shape) != 2: + raise ValueError( + "A `Dot` layer should be called on a list of 2 inputs. " + f"Received: input_shape={input_shape}" + ) + shape1 = input_shape[0] + shape2 = input_shape[1] + if shape1 is None or shape2 is None: + return + if isinstance(self.axes, int): + if self.axes < 0: + axes = [self.axes % len(shape1), self.axes % len(shape2)] + else: + axes = [self.axes] * 2 else: - axes.append(self.axes[i]) - if self.normalize: - x1 = tf.linalg.l2_normalize(x1, axis=axes[0]) - x2 = tf.linalg.l2_normalize(x2, axis=axes[1]) - output = backend.batch_dot(x1, x2, axes) - return output - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if not isinstance(input_shape, (tuple, list)) or len(input_shape) != 2: - raise ValueError( - 'A `Dot` layer should be called on a list of 2 inputs. ' - f'Received: input_shape={input_shape}') - shape1 = list(input_shape[0]) - shape2 = list(input_shape[1]) - if isinstance(self.axes, int): - if self.axes < 0: - axes = [self.axes % len(shape1), self.axes % len(shape2)] - else: - axes = [self.axes] * 2 - else: - axes = self.axes - shape1.pop(axes[0]) - shape2.pop(axes[1]) - shape2.pop(0) - output_shape = shape1 + shape2 - if len(output_shape) == 1: - output_shape += [1] - return tuple(output_shape) - - def compute_mask(self, inputs, mask=None): - return None - - def get_config(self): - config = { - 'axes': self.axes, - 'normalize': self.normalize, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.dot') + axes = self.axes + if shape1[axes[0]] != shape2[axes[1]]: + raise ValueError( + "Incompatible input shapes: " + f"axis values {shape1[axes[0]]} (at axis {axes[0]}) != " + f"{shape2[axes[1]]} (at axis {axes[1]}). " + f"Full input shapes: {shape1}, {shape2}" + ) + + def _merge_function(self, inputs): + base_layer_utils.no_ragged_support(inputs, self.name) + if len(inputs) != 2: + raise ValueError( + "A `Dot` layer should be called on exactly 2 inputs. " + f"Received: inputs={inputs}" + ) + x1 = inputs[0] + x2 = inputs[1] + if isinstance(self.axes, int): + if self.axes < 0: + axes = [ + self.axes % backend.ndim(x1), + self.axes % backend.ndim(x2), + ] + else: + axes = [self.axes] * 2 + else: + axes = [] + for i in range(len(self.axes)): + if self.axes[i] < 0: + axes.append(self.axes[i] % backend.ndim(inputs[i])) + else: + axes.append(self.axes[i]) + if self.normalize: + x1 = tf.linalg.l2_normalize(x1, axis=axes[0]) + x2 = tf.linalg.l2_normalize(x2, axis=axes[1]) + output = backend.batch_dot(x1, x2, axes) + return output + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if not isinstance(input_shape, (tuple, list)) or len(input_shape) != 2: + raise ValueError( + "A `Dot` layer should be called on a list of 2 inputs. 
" + f"Received: input_shape={input_shape}" + ) + shape1 = list(input_shape[0]) + shape2 = list(input_shape[1]) + if isinstance(self.axes, int): + if self.axes < 0: + axes = [self.axes % len(shape1), self.axes % len(shape2)] + else: + axes = [self.axes] * 2 + else: + axes = self.axes + shape1.pop(axes[0]) + shape2.pop(axes[1]) + shape2.pop(0) + output_shape = shape1 + shape2 + if len(output_shape) == 1: + output_shape += [1] + return tuple(output_shape) + + def compute_mask(self, inputs, mask=None): + return None + + def get_config(self): + config = { + "axes": self.axes, + "normalize": self.normalize, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.layers.dot") def dot(inputs, axes, normalize=False, **kwargs): - """Functional interface to the `Dot` layer. - - Args: - inputs: A list of input tensors (at least 2). - axes: Integer or tuple of integers, - axis or axes along which to take the dot product. - normalize: Whether to L2-normalize samples along the - dot product axis before taking the dot product. - If set to True, then the output of the dot product - is the cosine proximity between the two samples. - **kwargs: Standard layer keyword arguments. - - Returns: - A tensor, the dot product of the samples from the inputs. - """ - return Dot(axes=axes, normalize=normalize, **kwargs)(inputs) + """Functional interface to the `Dot` layer. + + Args: + inputs: A list of input tensors (at least 2). + axes: Integer or tuple of integers, + axis or axes along which to take the dot product. + normalize: Whether to L2-normalize samples along the + dot product axis before taking the dot product. + If set to True, then the output of the dot product + is the cosine proximity between the two samples. + **kwargs: Standard layer keyword arguments. + + Returns: + A tensor, the dot product of the samples from the inputs. + """ + return Dot(axes=axes, normalize=normalize, **kwargs)(inputs) diff --git a/keras/layers/merging/maximum.py b/keras/layers/merging/maximum.py index 413536220b0f..de939d2856cc 100644 --- a/keras/layers/merging/maximum.py +++ b/keras/layers/merging/maximum.py @@ -15,69 +15,71 @@ """Layer that computes the maximum (element-wise) of several inputs.""" -from keras.layers.merging.base_merge import _Merge import tensorflow.compat.v2 as tf +from keras.layers.merging.base_merge import _Merge + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Maximum') +@keras_export("keras.layers.Maximum") class Maximum(_Merge): - """Layer that computes the maximum (element-wise) a list of inputs. - - It takes as input a list of tensors, all of the same shape, and returns - a single tensor (also of the same shape). - - >>> tf.keras.layers.Maximum()([np.arange(5).reshape(5, 1), - ... np.arange(5, 10).reshape(5, 1)]) - - - >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2)) - >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2)) - >>> maxed = tf.keras.layers.Maximum()([x1, x2]) - >>> maxed.shape - TensorShape([5, 8]) - """ - - def _merge_function(self, inputs): - output = inputs[0] - for i in range(1, len(inputs)): - output = tf.maximum(output, inputs[i]) - return output - - -@keras_export('keras.layers.maximum') + """Layer that computes the maximum (element-wise) a list of inputs. + + It takes as input a list of tensors, all of the same shape, and returns + a single tensor (also of the same shape). 
+
+ >>> tf.keras.layers.Maximum()([np.arange(5).reshape(5, 1),
+ ... np.arange(5, 10).reshape(5, 1)])
+ <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
+ array([[5],
+ [6],
+ [7],
+ [8],
+ [9]])>
+
+ >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2))
+ >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2))
+ >>> maxed = tf.keras.layers.Maximum()([x1, x2])
+ >>> maxed.shape
+ TensorShape([5, 8])
+ """
+
+ def _merge_function(self, inputs):
+ output = inputs[0]
+ for i in range(1, len(inputs)):
+ output = tf.maximum(output, inputs[i])
+ return output
+
+
+@keras_export("keras.layers.maximum")
def maximum(inputs, **kwargs):
- """Functional interface to compute maximum (element-wise) list of `inputs`.
-
- This is equivalent to the `tf.keras.layers.Maximum` layer.
-
- For example:
-
- ```python
- input1 = tf.keras.layers.Input(shape=(16,))
- x1 = tf.keras.layers.Dense(8, activation='relu')(input1) #shape=(None, 8)
- input2 = tf.keras.layers.Input(shape=(32,))
- x2 = tf.keras.layers.Dense(8, activation='relu')(input2) #shape=(None, 8)
- max_inp=tf.keras.layers.maximum([x1,x2]) #shape=(None, 8)
- out = tf.keras.layers.Dense(4)(max_inp)
- model = tf.keras.models.Model(inputs=[input1, input2], outputs=out)
- ```
-
- Args:
- inputs: A list of input tensors of same shape.
- **kwargs: Standard layer keyword arguments.
-
- Returns:
- A tensor (of same shape as input tensor) with the element-wise
- maximum of the inputs.
-
- Raises:
- ValueError: If input tensors are of different shape.
- """
- return Maximum(**kwargs)(inputs)
+ """Functional interface to compute the maximum (element-wise) of `inputs`.
+
+ This is equivalent to the `tf.keras.layers.Maximum` layer.
+
+ For example:
+
+ ```python
+ input1 = tf.keras.layers.Input(shape=(16,))
+ x1 = tf.keras.layers.Dense(8, activation='relu')(input1) #shape=(None, 8)
+ input2 = tf.keras.layers.Input(shape=(32,))
+ x2 = tf.keras.layers.Dense(8, activation='relu')(input2) #shape=(None, 8)
+ max_inp=tf.keras.layers.maximum([x1,x2]) #shape=(None, 8)
+ out = tf.keras.layers.Dense(4)(max_inp)
+ model = tf.keras.models.Model(inputs=[input1, input2], outputs=out)
+ ```
+
+ Args:
+ inputs: A list of input tensors of the same shape.
+ **kwargs: Standard layer keyword arguments.
+
+ Returns:
+ A tensor (of the same shape as the input tensors) with the element-wise
+ maximum of the inputs.
+
+ Raises:
+ ValueError: If input tensors are of different shape.
+ """ + return Maximum(**kwargs)(inputs) diff --git a/keras/layers/merging/merging_test.py b/keras/layers/merging/merging_test.py index f81c54e825a2..1f3b597467e6 100644 --- a/keras/layers/merging/merging_test.py +++ b/keras/layers/merging/merging_test.py @@ -14,437 +14,488 @@ # ============================================================================== """Tests for merging layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras import backend from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import tf_inspect -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class MergingLayersTest(test_combinations.TestCase): - - def test_add(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - i3 = keras.layers.Input(shape=(4, 5)) - - add_layer = keras.layers.Add() - o = add_layer([i1, i2, i3]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - model = keras.models.Model([i1, i2, i3], o) - model.run_eagerly = test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - x2 = np.random.random((2, 4, 5)) - x3 = np.random.random((2, 4, 5)) - out = model.predict([x1, x2, x3]) - self.assertEqual(out.shape, (2, 4, 5)) - self.assertAllClose(out, x1 + x2 + x3, atol=1e-4) - - self.assertIsNone(add_layer.compute_mask([i1, i2, i3], [None, None, None])) - self.assertTrue( - np.all( - backend.eval( - add_layer.compute_mask( - [i1, i2], [backend.variable(x1), backend.variable(x2)])))) - - with self.assertRaisesRegex(ValueError, '`mask` should be a list.'): - add_layer.compute_mask([i1, i2, i3], x1) - with self.assertRaisesRegex(ValueError, '`inputs` should be a list.'): - add_layer.compute_mask(i1, [None, None, None]) - with self.assertRaisesRegex(ValueError, ' should have the same length.'): - add_layer.compute_mask([i1, i2, i3], [None, None]) - - def test_subtract(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - i3 = keras.layers.Input(shape=(4, 5)) - - subtract_layer = keras.layers.Subtract() - o = subtract_layer([i1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - model = keras.models.Model([i1, i2], o) - model.run_eagerly = test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - x2 = np.random.random((2, 4, 5)) - out = model.predict([x1, x2]) - self.assertEqual(out.shape, (2, 4, 5)) - self.assertAllClose(out, x1 - x2, atol=1e-4) - - self.assertIsNone(subtract_layer.compute_mask([i1, i2], [None, None])) - self.assertTrue( - np.all( - backend.eval( - subtract_layer.compute_mask( - [i1, i2], [backend.variable(x1), backend.variable(x2)])))) - - with self.assertRaisesRegex(ValueError, '`mask` should be a list.'): - subtract_layer.compute_mask([i1, i2], x1) - with self.assertRaisesRegex(ValueError, '`inputs` should be a list.'): - subtract_layer.compute_mask(i1, [None, None]) - with self.assertRaisesRegex(ValueError, - 'layer should be called on exactly 2 inputs'): - subtract_layer([i1, i2, i3]) - with self.assertRaisesRegex(ValueError, - 'layer should be called on exactly 2 inputs'): - subtract_layer([i1]) - - def test_multiply(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - i3 = keras.layers.Input(shape=(4, 5)) - o = keras.layers.multiply([i1, i2, i3]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - model = keras.models.Model([i1, i2, i3], o) - model.run_eagerly = 
test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - x2 = np.random.random((2, 4, 5)) - x3 = np.random.random((2, 4, 5)) - out = model.predict([x1, x2, x3]) - self.assertEqual(out.shape, (2, 4, 5)) - self.assertAllClose(out, x1 * x2 * x3, atol=1e-4) - - def test_average(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - o = keras.layers.average([i1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - model = keras.models.Model([i1, i2], o) - model.run_eagerly = test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - x2 = np.random.random((2, 4, 5)) - out = model.predict([x1, x2]) - self.assertEqual(out.shape, (2, 4, 5)) - self.assertAllClose(out, 0.5 * (x1 + x2), atol=1e-4) - - def test_maximum(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - o = keras.layers.maximum([i1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - model = keras.models.Model([i1, i2], o) - model.run_eagerly = test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - x2 = np.random.random((2, 4, 5)) - out = model.predict([x1, x2]) - self.assertEqual(out.shape, (2, 4, 5)) - self.assertAllClose(out, np.maximum(x1, x2), atol=1e-4) - - def test_minimum(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - o = keras.layers.minimum([i1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - model = keras.models.Model([i1, i2], o) - model.run_eagerly = test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - x2 = np.random.random((2, 4, 5)) - out = model.predict([x1, x2]) - self.assertEqual(out.shape, (2, 4, 5)) - self.assertAllClose(out, np.minimum(x1, x2), atol=1e-4) - - def test_concatenate(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - concat_layer = keras.layers.Concatenate(axis=1) - o = concat_layer([i1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 8, 5]) - model = keras.models.Model([i1, i2], o) - model.run_eagerly = test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - x2 = np.random.random((2, 4, 5)) - out = model.predict([x1, x2]) - self.assertEqual(out.shape, (2, 8, 5)) - self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4) - - self.assertIsNone(concat_layer.compute_mask([i1, i2], [None, None])) - self.assertTrue( - np.all( - backend.eval( - concat_layer.compute_mask( - [i1, i2], [backend.variable(x1), backend.variable(x2)])))) - - # Should work with unit-length input. 
- unit_length_o = concat_layer([i1]) - self.assertListEqual(unit_length_o.shape.as_list(), i1.shape.as_list()) - - with self.assertRaisesRegex(ValueError, '`mask` should be a list.'): - concat_layer.compute_mask([i1, i2], x1) - with self.assertRaisesRegex(ValueError, '`inputs` should be a list.'): - concat_layer.compute_mask(i1, [None, None]) - with self.assertRaisesRegex(ValueError, 'should have the same length'): - concat_layer.compute_mask([i1, i2], [None]) - with self.assertRaisesRegex(ValueError, - 'layer should be called on a list of inputs'): - concat_layer(i1) - - def test_concatenate_numpy_inputs(self): - if tf.executing_eagerly(): - layer = keras.layers.Concatenate() - x, y = np.ones((10, 10)), np.ones((10, 10)) - self.assertAllEqual(np.ones((10, 20)), layer([x, y])) - - def test_dot(self): - i1 = keras.layers.Input(shape=(4,)) - i2 = keras.layers.Input(shape=(4,)) - o = keras.layers.dot([i1, i2], axes=1) - self.assertListEqual(o.shape.as_list(), [None, 1]) - model = keras.models.Model([i1, i2], o) - model.run_eagerly = test_utils.should_run_eagerly() - _ = keras.layers.Dot(axes=1).get_config() - - x1 = np.random.random((2, 4)) - x2 = np.random.random((2, 4)) - out = model.predict([x1, x2]) - self.assertEqual(out.shape, (2, 1)) - expected = np.zeros((2, 1)) - expected[0, 0] = np.dot(x1[0], x2[0]) - expected[1, 0] = np.dot(x1[1], x2[1]) - self.assertAllClose(out, expected, atol=1e-4) - - # Test with negative tuple of axes. - o = keras.layers.dot([i1, i2], axes=(-1, -1)) - self.assertListEqual(o.shape.as_list(), [None, 1]) - model = keras.models.Model([i1, i2], o) - model.run_eagerly = test_utils.should_run_eagerly() - out = model.predict([x1, x2]) - self.assertEqual(out.shape, (2, 1)) - self.assertAllClose(out, expected, atol=1e-4) - - # test compute_output_shape - layer = keras.layers.Dot(axes=-1) - self.assertEqual(layer.compute_output_shape([(4, 5), (4, 5)]), (4, 1)) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - layer=[keras.layers.Add, keras.layers.Subtract, - keras.layers.Multiply, keras.layers.Minimum, - keras.layers.Maximum, keras.layers.Average])) - def test_merging_with_ragged_input(self, layer): - ragged_data = tf.ragged.constant( - [[1., 1., 1.], [1., 1.], [1., 1., 1., 1.]], ragged_rank=1) - dense_data = ragged_data.to_tensor() - input1 = keras.Input(shape=(None,), ragged=True) - input2 = keras.Input(shape=(None,), ragged=True) - out = layer()([input1, input2]) - model = keras.models.Model(inputs=[input1, input2], outputs=out) - out_ragged = model.predict([ragged_data, ragged_data], steps=1) - out_ragged = convert_ragged_tensor_value(out_ragged).to_tensor() - - input1 = keras.Input(shape=(None,)) - input2 = keras.Input(shape=(None,)) - out = layer()([input1, input2]) - model = keras.models.Model(inputs=[input1, input2], outputs=out) - out_dense = model.predict([dense_data, dense_data], steps=1) - - self.assertAllEqual(out_dense, out_ragged) - - def test_concatenate_with_ragged_input(self): - ragged1 = tf.ragged.constant([[1., 1.], [1.], [1., 1., 1.]], ragged_rank=1) - ragged2 = tf.ragged.constant([[2., 2., 2.], [2.], [2., 2.]], ragged_rank=1) - expected_concatenated_ragged = tf.ragged.constant( - [[1., 1., 2., 2., 2.], [1., 2.], [1., 1., 1., 2., 2.]], ragged_rank=1) - input1 = keras.Input(shape=(None,), ragged=True) - input2 = keras.Input(shape=(None,), ragged=True) - out = keras.layers.Concatenate(axis=1)([input1, input2]) - model = keras.models.Model(inputs=[input1, input2], outputs=out) - out_ragged = 
model.predict([ragged1, ragged2], steps=1) - self.assertAllEqual(out_ragged, expected_concatenated_ragged) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - layer=[keras.layers.Add, keras.layers.Subtract, - keras.layers.Multiply, keras.layers.Minimum, - keras.layers.Maximum, keras.layers.Average])) - def test_merging_with_scalar_input(self, layer): - x1 = np.array((1)) - x2 = np.array((2)) - out = layer()([x1, x2]) - self.assertEqual(out.shape, ()) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name(layer=[ - keras.layers.Add, keras.layers.add, keras.layers.Average, keras.layers - .average, keras.layers.Concatenate, keras.layers.concatenate, - keras.layers.Maximum, keras.layers.maximum, keras.layers.Minimum, - keras.layers.minimum, keras.layers.Multiply, keras.layers.multiply - ])) - def test_single_element(self, layer): - # Instantiate the Layer subclasses - if tf_inspect.isclass(layer) and issubclass(layer, keras.layers.Layer): - layer = layer() - - # Processing a single element list should behave as identity. - i1 = keras.layers.Input(shape=(4, 5)) - o = layer([i1]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - model = keras.models.Model(i1, o) - model.run_eagerly = test_utils.should_run_eagerly() - - x1 = np.random.random((2, 4, 5)) - out = model.predict(x1) - self.assertEqual(out.shape, (2, 4, 5)) - self.assertAllClose(out, x1) - - # A single element must be passed as a list, not by itself. - with self.assertRaisesRegex(ValueError, 'called on a list'): - layer(i1) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_add(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + i3 = keras.layers.Input(shape=(4, 5)) + + add_layer = keras.layers.Add() + o = add_layer([i1, i2, i3]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + model = keras.models.Model([i1, i2, i3], o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + x2 = np.random.random((2, 4, 5)) + x3 = np.random.random((2, 4, 5)) + out = model.predict([x1, x2, x3]) + self.assertEqual(out.shape, (2, 4, 5)) + self.assertAllClose(out, x1 + x2 + x3, atol=1e-4) + + self.assertIsNone( + add_layer.compute_mask([i1, i2, i3], [None, None, None]) + ) + self.assertTrue( + np.all( + backend.eval( + add_layer.compute_mask( + [i1, i2], [backend.variable(x1), backend.variable(x2)] + ) + ) + ) + ) + + with self.assertRaisesRegex(ValueError, "`mask` should be a list."): + add_layer.compute_mask([i1, i2, i3], x1) + with self.assertRaisesRegex(ValueError, "`inputs` should be a list."): + add_layer.compute_mask(i1, [None, None, None]) + with self.assertRaisesRegex( + ValueError, " should have the same length." 
+ ): + add_layer.compute_mask([i1, i2, i3], [None, None]) + + def test_subtract(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + i3 = keras.layers.Input(shape=(4, 5)) + + subtract_layer = keras.layers.Subtract() + o = subtract_layer([i1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + model = keras.models.Model([i1, i2], o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + x2 = np.random.random((2, 4, 5)) + out = model.predict([x1, x2]) + self.assertEqual(out.shape, (2, 4, 5)) + self.assertAllClose(out, x1 - x2, atol=1e-4) + + self.assertIsNone(subtract_layer.compute_mask([i1, i2], [None, None])) + self.assertTrue( + np.all( + backend.eval( + subtract_layer.compute_mask( + [i1, i2], [backend.variable(x1), backend.variable(x2)] + ) + ) + ) + ) + + with self.assertRaisesRegex(ValueError, "`mask` should be a list."): + subtract_layer.compute_mask([i1, i2], x1) + with self.assertRaisesRegex(ValueError, "`inputs` should be a list."): + subtract_layer.compute_mask(i1, [None, None]) + with self.assertRaisesRegex( + ValueError, "layer should be called on exactly 2 inputs" + ): + subtract_layer([i1, i2, i3]) + with self.assertRaisesRegex( + ValueError, "layer should be called on exactly 2 inputs" + ): + subtract_layer([i1]) + + def test_multiply(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + i3 = keras.layers.Input(shape=(4, 5)) + o = keras.layers.multiply([i1, i2, i3]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + model = keras.models.Model([i1, i2, i3], o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + x2 = np.random.random((2, 4, 5)) + x3 = np.random.random((2, 4, 5)) + out = model.predict([x1, x2, x3]) + self.assertEqual(out.shape, (2, 4, 5)) + self.assertAllClose(out, x1 * x2 * x3, atol=1e-4) + + def test_average(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + o = keras.layers.average([i1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + model = keras.models.Model([i1, i2], o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + x2 = np.random.random((2, 4, 5)) + out = model.predict([x1, x2]) + self.assertEqual(out.shape, (2, 4, 5)) + self.assertAllClose(out, 0.5 * (x1 + x2), atol=1e-4) + + def test_maximum(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + o = keras.layers.maximum([i1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + model = keras.models.Model([i1, i2], o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + x2 = np.random.random((2, 4, 5)) + out = model.predict([x1, x2]) + self.assertEqual(out.shape, (2, 4, 5)) + self.assertAllClose(out, np.maximum(x1, x2), atol=1e-4) + + def test_minimum(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + o = keras.layers.minimum([i1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + model = keras.models.Model([i1, i2], o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + x2 = np.random.random((2, 4, 5)) + out = model.predict([x1, x2]) + self.assertEqual(out.shape, (2, 4, 5)) + self.assertAllClose(out, np.minimum(x1, x2), atol=1e-4) + + def test_concatenate(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + concat_layer = 
keras.layers.Concatenate(axis=1) + o = concat_layer([i1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 8, 5]) + model = keras.models.Model([i1, i2], o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + x2 = np.random.random((2, 4, 5)) + out = model.predict([x1, x2]) + self.assertEqual(out.shape, (2, 8, 5)) + self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4) + + self.assertIsNone(concat_layer.compute_mask([i1, i2], [None, None])) + self.assertTrue( + np.all( + backend.eval( + concat_layer.compute_mask( + [i1, i2], [backend.variable(x1), backend.variable(x2)] + ) + ) + ) + ) + + # Should work with unit-length input. + unit_length_o = concat_layer([i1]) + self.assertListEqual(unit_length_o.shape.as_list(), i1.shape.as_list()) + + with self.assertRaisesRegex(ValueError, "`mask` should be a list."): + concat_layer.compute_mask([i1, i2], x1) + with self.assertRaisesRegex(ValueError, "`inputs` should be a list."): + concat_layer.compute_mask(i1, [None, None]) + with self.assertRaisesRegex(ValueError, "should have the same length"): + concat_layer.compute_mask([i1, i2], [None]) + with self.assertRaisesRegex( + ValueError, "layer should be called on a list of inputs" + ): + concat_layer(i1) + + def test_concatenate_numpy_inputs(self): + if tf.executing_eagerly(): + layer = keras.layers.Concatenate() + x, y = np.ones((10, 10)), np.ones((10, 10)) + self.assertAllEqual(np.ones((10, 20)), layer([x, y])) + + def test_dot(self): + i1 = keras.layers.Input(shape=(4,)) + i2 = keras.layers.Input(shape=(4,)) + o = keras.layers.dot([i1, i2], axes=1) + self.assertListEqual(o.shape.as_list(), [None, 1]) + model = keras.models.Model([i1, i2], o) + model.run_eagerly = test_utils.should_run_eagerly() + _ = keras.layers.Dot(axes=1).get_config() + + x1 = np.random.random((2, 4)) + x2 = np.random.random((2, 4)) + out = model.predict([x1, x2]) + self.assertEqual(out.shape, (2, 1)) + expected = np.zeros((2, 1)) + expected[0, 0] = np.dot(x1[0], x2[0]) + expected[1, 0] = np.dot(x1[1], x2[1]) + self.assertAllClose(out, expected, atol=1e-4) + + # Test with negative tuple of axes. 
+ o = keras.layers.dot([i1, i2], axes=(-1, -1)) + self.assertListEqual(o.shape.as_list(), [None, 1]) + model = keras.models.Model([i1, i2], o) + model.run_eagerly = test_utils.should_run_eagerly() + out = model.predict([x1, x2]) + self.assertEqual(out.shape, (2, 1)) + self.assertAllClose(out, expected, atol=1e-4) + + # test compute_output_shape + layer = keras.layers.Dot(axes=-1) + self.assertEqual(layer.compute_output_shape([(4, 5), (4, 5)]), (4, 1)) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer=[ + keras.layers.Add, + keras.layers.Subtract, + keras.layers.Multiply, + keras.layers.Minimum, + keras.layers.Maximum, + keras.layers.Average, + ] + ) + ) + def test_merging_with_ragged_input(self, layer): + ragged_data = tf.ragged.constant( + [[1.0, 1.0, 1.0], [1.0, 1.0], [1.0, 1.0, 1.0, 1.0]], ragged_rank=1 + ) + dense_data = ragged_data.to_tensor() + input1 = keras.Input(shape=(None,), ragged=True) + input2 = keras.Input(shape=(None,), ragged=True) + out = layer()([input1, input2]) + model = keras.models.Model(inputs=[input1, input2], outputs=out) + out_ragged = model.predict([ragged_data, ragged_data], steps=1) + out_ragged = convert_ragged_tensor_value(out_ragged).to_tensor() + + input1 = keras.Input(shape=(None,)) + input2 = keras.Input(shape=(None,)) + out = layer()([input1, input2]) + model = keras.models.Model(inputs=[input1, input2], outputs=out) + out_dense = model.predict([dense_data, dense_data], steps=1) + + self.assertAllEqual(out_dense, out_ragged) + + def test_concatenate_with_ragged_input(self): + ragged1 = tf.ragged.constant( + [[1.0, 1.0], [1.0], [1.0, 1.0, 1.0]], ragged_rank=1 + ) + ragged2 = tf.ragged.constant( + [[2.0, 2.0, 2.0], [2.0], [2.0, 2.0]], ragged_rank=1 + ) + expected_concatenated_ragged = tf.ragged.constant( + [[1.0, 1.0, 2.0, 2.0, 2.0], [1.0, 2.0], [1.0, 1.0, 1.0, 2.0, 2.0]], + ragged_rank=1, + ) + input1 = keras.Input(shape=(None,), ragged=True) + input2 = keras.Input(shape=(None,), ragged=True) + out = keras.layers.Concatenate(axis=1)([input1, input2]) + model = keras.models.Model(inputs=[input1, input2], outputs=out) + out_ragged = model.predict([ragged1, ragged2], steps=1) + self.assertAllEqual(out_ragged, expected_concatenated_ragged) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer=[ + keras.layers.Add, + keras.layers.Subtract, + keras.layers.Multiply, + keras.layers.Minimum, + keras.layers.Maximum, + keras.layers.Average, + ] + ) + ) + def test_merging_with_scalar_input(self, layer): + x1 = np.array((1)) + x2 = np.array((2)) + out = layer()([x1, x2]) + self.assertEqual(out.shape, ()) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer=[ + keras.layers.Add, + keras.layers.add, + keras.layers.Average, + keras.layers.average, + keras.layers.Concatenate, + keras.layers.concatenate, + keras.layers.Maximum, + keras.layers.maximum, + keras.layers.Minimum, + keras.layers.minimum, + keras.layers.Multiply, + keras.layers.multiply, + ] + ) + ) + def test_single_element(self, layer): + # Instantiate the Layer subclasses + if tf_inspect.isclass(layer) and issubclass(layer, keras.layers.Layer): + layer = layer() + + # Processing a single element list should behave as identity. 
+ i1 = keras.layers.Input(shape=(4, 5)) + o = layer([i1]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + model = keras.models.Model(i1, o) + model.run_eagerly = test_utils.should_run_eagerly() + + x1 = np.random.random((2, 4, 5)) + out = model.predict(x1) + self.assertEqual(out.shape, (2, 4, 5)) + self.assertAllClose(out, x1) + + # A single element must be passed as a list, not by itself. + with self.assertRaisesRegex(ValueError, "called on a list"): + layer(i1) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class MergingLayersTestNoExecution(tf.test.TestCase): - - def test_add_elementwise_errors(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 6)) - with self.assertRaises(ValueError): - keras.layers.add([i1, i2]) - with self.assertRaises(ValueError): - keras.layers.add(i1) - - def test_concatenate_errors(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(3, 5)) - with self.assertRaisesRegex(ValueError, 'inputs with matching shapes'): - keras.layers.concatenate([i1, i2], axis=-1) - with self.assertRaisesRegex(ValueError, 'called on a list'): - keras.layers.concatenate(i1, axis=-1) - - def test_concatenate_with_partial_shape(self): - i1 = keras.layers.Input(shape=(5,), batch_size=32) - i2 = keras.layers.Input(shape=(5,)) - i3 = keras.layers.Input(shape=(4, 5), batch_size=32) - i4 = keras.layers.Input(shape=(None,), batch_size=64) - i5 = keras.layers.Input(shape=(7,)) - - # Valid case since the i2 has a dynamic batch size. - keras.layers.concatenate([i1, i2], axis=-1) - - # Different rank - with self.assertRaisesRegex(ValueError, 'inputs with matching shapes'): - keras.layers.concatenate([i1, i3], axis=-1) - - # Valid case with partial dimension information - keras.layers.concatenate([i1, i4], axis=0) - keras.layers.concatenate([i2, i4], axis=0) - keras.layers.concatenate([i2, i4], axis=1) - keras.layers.concatenate([i1, i2, i4], axis=0) - keras.layers.concatenate([i1, i5], axis=1) - - # Mismatch in batch dimension. 
- with self.assertRaisesRegex(ValueError, 'inputs with matching shapes'): - keras.layers.concatenate([i1, i4], axis=-1) - - with self.assertRaisesRegex(ValueError, 'inputs with matching shapes'): - keras.layers.concatenate([i1, i2, i4], axis=-1) - - def test_dot_errors(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 6)) - i3 = keras.layers.Input(shape=(4, 6)) - with self.assertRaises(ValueError): - keras.layers.dot([i1, i2], axes=-1) - with self.assertRaises(ValueError): - keras.layers.dot(i1, axes=-1) - with self.assertRaises(ValueError): - keras.layers.dot([i1], axes=-1) - with self.assertRaises(ValueError): - keras.layers.dot([i1, i2, i3], axes=-1) - with self.assertRaises(ValueError): - dot = keras.layers.Dot(1) - dot.compute_output_shape(1) - - def test_subtract(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - y = keras.layers.subtract([i1, i2]) - self.assertEqual(y.shape.as_list(), [None, 4, 5]) - - # Test invalid use cases - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(3, 5)) - with self.assertRaises(ValueError): - keras.layers.subtract([i1, i2]) - with self.assertRaises(ValueError): - keras.layers.subtract([i1, i1, i1]) - - def test_add_masking(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - m1 = keras.layers.Masking()(i1) - layer = keras.layers.Add() - o = layer([m1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 4, 5]) - mask = layer.output_mask - self.assertListEqual(mask.shape.as_list(), [None, 4]) - - def test_add_dynamic_shape(self): - i1 = keras.Input(batch_shape=(4, None), dtype='float32') - i2 = keras.Input(batch_shape=(4, 5), dtype='float32') - layer = keras.layers.Add() - o = layer([i1, i2]) - self.assertListEqual(o.shape.as_list(), [4, 5]) - - def test_concatenate_masking(self): - i1 = keras.layers.Input(shape=(4, 5)) - i2 = keras.layers.Input(shape=(4, 5)) - m1 = keras.layers.Masking()(i1) - layer = keras.layers.Concatenate() - o = layer([m1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 4, 10]) - mask = layer.output_mask - self.assertListEqual(mask.shape.as_list(), [None, 4]) - - def test_concatenate_sparse_shape(self): - i1 = keras.layers.Input(shape=(1,), batch_size=2, sparse=True) - i2 = keras.layers.Input(shape=(2,), batch_size=2, sparse=True) - layer = keras.layers.Concatenate(axis=1) - o = layer([i1, i2]) - self.assertListEqual(o.shape.as_list(), [2, 3]) - - # Make sure it also respect None as the batch size - i1 = keras.layers.Input(shape=(1,), sparse=True) - i2 = keras.layers.Input(shape=(2,), sparse=True) - layer = keras.layers.Concatenate(axis=1) - o = layer([i1, i2]) - self.assertListEqual(o.shape.as_list(), [None, 3]) - - def test_concatenate_user_changes_to_input_structure(self): - a = keras.layers.Input(shape=(4, 5)) - struct = [a, a] - concat1 = keras.layers.Concatenate(1) - b = concat1(struct) - struct.append(b) - concat2 = keras.layers.Concatenate(1) - c = concat2(struct) - - # Checks that the append to `struct` doesn't affect `concat1`s - # node data. - self.assertLen(concat1.inbound_nodes[0].input_tensors, 2) - self.assertLen(concat2.inbound_nodes[0].input_tensors, 3) - - keras.Model(a, c) # Ensure model can be built. 
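The error paths these tests pin down are easy to reproduce outside the test harness. As a minimal sketch, assuming TensorFlow 2.x in eager mode (illustrative only, not part of this patch):

```python
# Minimal sketch (not part of the patch): the validation behavior the
# merging-layer tests above exercise. Assumes TensorFlow 2.x, eager mode.
import numpy as np
import tensorflow as tf

x1 = np.random.random((2, 4)).astype("float32")
x2 = np.random.random((2, 4)).astype("float32")

# Dot(normalize=True) L2-normalizes both inputs along the dot axis first,
# so each output entry is the cosine proximity of a sample pair.
cos = tf.keras.layers.Dot(axes=1, normalize=True)([x1, x2])
print(cos.shape)  # (2, 1)

# Subtract is defined for exactly 2 inputs; a third input raises.
try:
    tf.keras.layers.Subtract()([x1, x2, x1])
except ValueError as err:
    print(type(err).__name__)  # ValueError
```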
+ def test_add_elementwise_errors(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 6)) + with self.assertRaises(ValueError): + keras.layers.add([i1, i2]) + with self.assertRaises(ValueError): + keras.layers.add(i1) + + def test_concatenate_errors(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(3, 5)) + with self.assertRaisesRegex(ValueError, "inputs with matching shapes"): + keras.layers.concatenate([i1, i2], axis=-1) + with self.assertRaisesRegex(ValueError, "called on a list"): + keras.layers.concatenate(i1, axis=-1) + + def test_concatenate_with_partial_shape(self): + i1 = keras.layers.Input(shape=(5,), batch_size=32) + i2 = keras.layers.Input(shape=(5,)) + i3 = keras.layers.Input(shape=(4, 5), batch_size=32) + i4 = keras.layers.Input(shape=(None,), batch_size=64) + i5 = keras.layers.Input(shape=(7,)) + + # Valid case since the i2 has a dynamic batch size. + keras.layers.concatenate([i1, i2], axis=-1) + + # Different rank + with self.assertRaisesRegex(ValueError, "inputs with matching shapes"): + keras.layers.concatenate([i1, i3], axis=-1) + + # Valid case with partial dimension information + keras.layers.concatenate([i1, i4], axis=0) + keras.layers.concatenate([i2, i4], axis=0) + keras.layers.concatenate([i2, i4], axis=1) + keras.layers.concatenate([i1, i2, i4], axis=0) + keras.layers.concatenate([i1, i5], axis=1) + + # Mismatch in batch dimension. + with self.assertRaisesRegex(ValueError, "inputs with matching shapes"): + keras.layers.concatenate([i1, i4], axis=-1) + + with self.assertRaisesRegex(ValueError, "inputs with matching shapes"): + keras.layers.concatenate([i1, i2, i4], axis=-1) + + def test_dot_errors(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 6)) + i3 = keras.layers.Input(shape=(4, 6)) + with self.assertRaises(ValueError): + keras.layers.dot([i1, i2], axes=-1) + with self.assertRaises(ValueError): + keras.layers.dot(i1, axes=-1) + with self.assertRaises(ValueError): + keras.layers.dot([i1], axes=-1) + with self.assertRaises(ValueError): + keras.layers.dot([i1, i2, i3], axes=-1) + with self.assertRaises(ValueError): + dot = keras.layers.Dot(1) + dot.compute_output_shape(1) + + def test_subtract(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + y = keras.layers.subtract([i1, i2]) + self.assertEqual(y.shape.as_list(), [None, 4, 5]) + + # Test invalid use cases + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(3, 5)) + with self.assertRaises(ValueError): + keras.layers.subtract([i1, i2]) + with self.assertRaises(ValueError): + keras.layers.subtract([i1, i1, i1]) + + def test_add_masking(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + m1 = keras.layers.Masking()(i1) + layer = keras.layers.Add() + o = layer([m1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 4, 5]) + mask = layer.output_mask + self.assertListEqual(mask.shape.as_list(), [None, 4]) + + def test_add_dynamic_shape(self): + i1 = keras.Input(batch_shape=(4, None), dtype="float32") + i2 = keras.Input(batch_shape=(4, 5), dtype="float32") + layer = keras.layers.Add() + o = layer([i1, i2]) + self.assertListEqual(o.shape.as_list(), [4, 5]) + + def test_concatenate_masking(self): + i1 = keras.layers.Input(shape=(4, 5)) + i2 = keras.layers.Input(shape=(4, 5)) + m1 = keras.layers.Masking()(i1) + layer = keras.layers.Concatenate() + o = layer([m1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 4, 10]) + mask 
= layer.output_mask + self.assertListEqual(mask.shape.as_list(), [None, 4]) + + def test_concatenate_sparse_shape(self): + i1 = keras.layers.Input(shape=(1,), batch_size=2, sparse=True) + i2 = keras.layers.Input(shape=(2,), batch_size=2, sparse=True) + layer = keras.layers.Concatenate(axis=1) + o = layer([i1, i2]) + self.assertListEqual(o.shape.as_list(), [2, 3]) + + # Make sure it also respect None as the batch size + i1 = keras.layers.Input(shape=(1,), sparse=True) + i2 = keras.layers.Input(shape=(2,), sparse=True) + layer = keras.layers.Concatenate(axis=1) + o = layer([i1, i2]) + self.assertListEqual(o.shape.as_list(), [None, 3]) + + def test_concatenate_user_changes_to_input_structure(self): + a = keras.layers.Input(shape=(4, 5)) + struct = [a, a] + concat1 = keras.layers.Concatenate(1) + b = concat1(struct) + struct.append(b) + concat2 = keras.layers.Concatenate(1) + c = concat2(struct) + + # Checks that the append to `struct` doesn't affect `concat1`s + # node data. + self.assertLen(concat1.inbound_nodes[0].input_tensors, 2) + self.assertLen(concat2.inbound_nodes[0].input_tensors, 3) + + keras.Model(a, c) # Ensure model can be built. def convert_ragged_tensor_value(inputs): - if isinstance(inputs, tf.compat.v1.ragged.RaggedTensorValue): - flat_values = tf.convert_to_tensor( - value=inputs.flat_values, - name='flat_values') - return tf.RaggedTensor.from_nested_row_splits( - flat_values, inputs.nested_row_splits, validate=False) - return inputs - - -if __name__ == '__main__': - tf.test.main() + if isinstance(inputs, tf.compat.v1.ragged.RaggedTensorValue): + flat_values = tf.convert_to_tensor( + value=inputs.flat_values, name="flat_values" + ) + return tf.RaggedTensor.from_nested_row_splits( + flat_values, inputs.nested_row_splits, validate=False + ) + return inputs + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/merging/minimum.py b/keras/layers/merging/minimum.py index e3fe3fbea100..4bfbd784e771 100644 --- a/keras/layers/merging/minimum.py +++ b/keras/layers/merging/minimum.py @@ -15,51 +15,53 @@ """Layer that computes the minimum (element-wise) of several inputs.""" -from keras.layers.merging.base_merge import _Merge import tensorflow.compat.v2 as tf +from keras.layers.merging.base_merge import _Merge + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Minimum') +@keras_export("keras.layers.Minimum") class Minimum(_Merge): - """Layer that computes the minimum (element-wise) a list of inputs. + """Layer that computes the minimum (element-wise) a list of inputs. - It takes as input a list of tensors, all of the same shape, and returns - a single tensor (also of the same shape). + It takes as input a list of tensors, all of the same shape, and returns + a single tensor (also of the same shape). - >>> tf.keras.layers.Minimum()([np.arange(5).reshape(5, 1), - ... np.arange(5, 10).reshape(5, 1)]) - + >>> tf.keras.layers.Minimum()([np.arange(5).reshape(5, 1), + ... 
np.arange(5, 10).reshape(5, 1)])
+ <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
+ array([[0],
+ [1],
+ [2],
+ [3],
+ [4]])>
-
- >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2))
- >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2))
- >>> minned = tf.keras.layers.Minimum()([x1, x2])
- >>> minned.shape
- TensorShape([5, 8])
- """
+ >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2))
+ >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2))
+ >>> minned = tf.keras.layers.Minimum()([x1, x2])
+ >>> minned.shape
+ TensorShape([5, 8])
+ """
- def _merge_function(self, inputs):
- output = inputs[0]
- for i in range(1, len(inputs)):
- output = tf.minimum(output, inputs[i])
- return output
+ def _merge_function(self, inputs):
+ output = inputs[0]
+ for i in range(1, len(inputs)):
+ output = tf.minimum(output, inputs[i])
+ return output
-@keras_export('keras.layers.minimum')
+@keras_export("keras.layers.minimum")
def minimum(inputs, **kwargs):
- """Functional interface to the `Minimum` layer.
+ """Functional interface to the `Minimum` layer.
- Args:
- inputs: A list of input tensors.
- **kwargs: Standard layer keyword arguments.
+ Args:
+ inputs: A list of input tensors.
+ **kwargs: Standard layer keyword arguments.
- Returns:
- A tensor, the element-wise minimum of the inputs.
- """
- return Minimum(**kwargs)(inputs)
+ Returns:
+ A tensor, the element-wise minimum of the inputs.
+ """
+ return Minimum(**kwargs)(inputs)
diff --git a/keras/layers/merging/multiply.py b/keras/layers/merging/multiply.py
index 2c016894814d..caae29c7907b 100644
--- a/keras/layers/merging/multiply.py
+++ b/keras/layers/merging/multiply.py
@@ -17,65 +17,68 @@
 from keras.layers.merging.base_merge import _Merge
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
-@keras_export('keras.layers.Multiply')
+@keras_export("keras.layers.Multiply")
 class Multiply(_Merge):
- """Layer that multiplies (element-wise) a list of inputs.
-
- It takes as input a list of tensors, all of the same shape, and returns
- a single tensor (also of the same shape).
-
- >>> tf.keras.layers.Multiply()([np.arange(5).reshape(5, 1),
- ... np.arange(5, 10).reshape(5, 1)])
- <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
- array([[ 0],
- [ 6],
- [14],
- [24],
- [36]])>
-
- >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2))
- >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2))
- >>> multiplied = tf.keras.layers.Multiply()([x1, x2])
- >>> multiplied.shape
- TensorShape([5, 8])
- """
-
- def _merge_function(self, inputs):
- output = inputs[0]
- for i in range(1, len(inputs)):
- output = output * inputs[i]
- return output
-
-
-@keras_export('keras.layers.multiply')
+ """Layer that multiplies (element-wise) a list of inputs.
+
+ It takes as input a list of tensors, all of the same shape, and returns
+ a single tensor (also of the same shape).
+
+ >>> tf.keras.layers.Multiply()([np.arange(5).reshape(5, 1),
+ ... np.arange(5, 10).reshape(5, 1)])
+ <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
+ array([[ 0],
+ [ 6],
+ [14],
+ [24],
+ [36]])>
+
+ >>> x1 = tf.keras.layers.Dense(8)(np.arange(10).reshape(5, 2))
+ >>> x2 = tf.keras.layers.Dense(8)(np.arange(10, 20).reshape(5, 2))
+ >>> multiplied = tf.keras.layers.Multiply()([x1, x2])
+ >>> multiplied.shape
+ TensorShape([5, 8])
+ """
+
+ def _merge_function(self, inputs):
+ output = inputs[0]
+ for i in range(1, len(inputs)):
+ output = output * inputs[i]
+ return output
+
+
+@keras_export("keras.layers.multiply")
def multiply(inputs, **kwargs):
- """Functional interface to the `Multiply` layer.
-
- Example:
-
- >>> x1 = np.arange(3.0)
- >>> x2 = np.arange(3.0)
- >>> tf.keras.layers.multiply([x1, x2])
- <tf.Tensor: shape=(3,), dtype=float32, numpy=array([0., 1., 4.], dtype=float32)>
-
- Usage in a functional model:
-
- >>> input1 = tf.keras.layers.Input(shape=(16,))
- >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) #shape=(None, 8)
- >>> input2 = tf.keras.layers.Input(shape=(32,))
- >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) #shape=(None, 8)
- >>> out = tf.keras.layers.multiply([x1,x2]) #shape=(None, 8)
- >>> out = tf.keras.layers.Dense(4)(out)
- >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out)
-
- Args:
- inputs: A list of input tensors.
- **kwargs: Standard layer keyword arguments.
-
- Returns:
- A tensor, the element-wise product of the inputs.
- """
- return Multiply(**kwargs)(inputs)
+ """Functional interface to the `Multiply` layer.
+
+ Example:
+
+ >>> x1 = np.arange(3.0)
+ >>> x2 = np.arange(3.0)
+ >>> tf.keras.layers.multiply([x1, x2])
+ <tf.Tensor: shape=(3,), dtype=float32, numpy=array([0., 1., 4.], dtype=float32)>
+
+ Usage in a functional model:
+
+ >>> input1 = tf.keras.layers.Input(shape=(16,))
+ >>> x1 = tf.keras.layers.Dense(
+ ... 8, activation='relu')(input1) #shape=(None, 8)
+ >>> input2 = tf.keras.layers.Input(shape=(32,))
+ >>> x2 = tf.keras.layers.Dense(
+ ... 8, activation='relu')(input2) #shape=(None, 8)
+ >>> out = tf.keras.layers.multiply([x1,x2]) #shape=(None, 8)
+ >>> out = tf.keras.layers.Dense(4)(out)
+ >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out)
+
+ Args:
+ inputs: A list of input tensors.
+ **kwargs: Standard layer keyword arguments.
+
+ Returns:
+ A tensor, the element-wise product of the inputs.
+ """
+ return Multiply(**kwargs)(inputs)
diff --git a/keras/layers/merging/subtract.py b/keras/layers/merging/subtract.py
index 8d2b5ce659b9..de55fa516eaa 100644
--- a/keras/layers/merging/subtract.py
+++ b/keras/layers/merging/subtract.py
@@ -18,74 +18,76 @@
 from keras.layers.merging.base_merge import _Merge
 from keras.utils import tf_utils
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
-@keras_export('keras.layers.Subtract')
+@keras_export("keras.layers.Subtract")
 class Subtract(_Merge):
- """Layer that subtracts two inputs.
-
- It takes as input a list of tensors of size 2,
- both of the same shape, and returns a single tensor, (inputs[0] - inputs[1]),
- also of the same shape.
-
- Examples:
-
- ```python
- import keras
-
- input1 = keras.layers.Input(shape=(16,))
- x1 = keras.layers.Dense(8, activation='relu')(input1)
- input2 = keras.layers.Input(shape=(32,))
- x2 = keras.layers.Dense(8, activation='relu')(input2)
- # Equivalent to subtracted = keras.layers.subtract([x1, x2])
- subtracted = keras.layers.Subtract()([x1, x2])
-
- out = keras.layers.Dense(4)(subtracted)
- model = keras.models.Model(inputs=[input1, input2], outputs=out)
- ```
- """
-
- @tf_utils.shape_type_conversion
- def build(self, input_shape):
- super().build(input_shape)
- if len(input_shape) != 2:
- raise ValueError(
- 'A `Subtract` layer should be called on exactly 2 inputs. '
- f'Received: input_shape={input_shape}')
-
- def _merge_function(self, inputs):
- if len(inputs) != 2:
- raise ValueError(
- 'A `Subtract` layer should be called on exactly 2 inputs. '
- f'Received: inputs={inputs}')
- return inputs[0] - inputs[1]
-
-
-@keras_export('keras.layers.subtract')
+ """Layer that subtracts two inputs.
+
+ It takes as input a list of tensors of size 2, both of the same shape, and
+ returns a single tensor, (inputs[0] - inputs[1]), also of the same shape.
+ + Examples: + + ```python + import keras + + input1 = keras.layers.Input(shape=(16,)) + x1 = keras.layers.Dense(8, activation='relu')(input1) + input2 = keras.layers.Input(shape=(32,)) + x2 = keras.layers.Dense(8, activation='relu')(input2) + # Equivalent to subtracted = keras.layers.subtract([x1, x2]) + subtracted = keras.layers.Subtract()([x1, x2]) + + out = keras.layers.Dense(4)(subtracted) + model = keras.models.Model(inputs=[input1, input2], outputs=out) + ``` + """ + + @tf_utils.shape_type_conversion + def build(self, input_shape): + super().build(input_shape) + if len(input_shape) != 2: + raise ValueError( + "A `Subtract` layer should be called on exactly 2 inputs. " + f"Received: input_shape={input_shape}" + ) + + def _merge_function(self, inputs): + if len(inputs) != 2: + raise ValueError( + "A `Subtract` layer should be called on exactly 2 inputs. " + f"Received: inputs={inputs}" + ) + return inputs[0] - inputs[1] + + +@keras_export("keras.layers.subtract") def subtract(inputs, **kwargs): - """Functional interface to the `Subtract` layer. + """Functional interface to the `Subtract` layer. - Args: - inputs: A list of input tensors (exactly 2). - **kwargs: Standard layer keyword arguments. + Args: + inputs: A list of input tensors (exactly 2). + **kwargs: Standard layer keyword arguments. - Returns: - A tensor, the difference of the inputs. + Returns: + A tensor, the difference of the inputs. - Examples: + Examples: - ```python - import keras + ```python + import keras - input1 = keras.layers.Input(shape=(16,)) - x1 = keras.layers.Dense(8, activation='relu')(input1) - input2 = keras.layers.Input(shape=(32,)) - x2 = keras.layers.Dense(8, activation='relu')(input2) - subtracted = keras.layers.subtract([x1, x2]) + input1 = keras.layers.Input(shape=(16,)) + x1 = keras.layers.Dense(8, activation='relu')(input1) + input2 = keras.layers.Input(shape=(32,)) + x2 = keras.layers.Dense(8, activation='relu')(input2) + subtracted = keras.layers.subtract([x1, x2]) - out = keras.layers.Dense(4)(subtracted) - model = keras.models.Model(inputs=[input1, input2], outputs=out) - ``` - """ - return Subtract(**kwargs)(inputs) + out = keras.layers.Dense(4)(subtracted) + model = keras.models.Model(inputs=[input1, input2], outputs=out) + ``` + """ + return Subtract(**kwargs)(inputs) diff --git a/keras/layers/noise.py b/keras/layers/noise.py index 62f113a0dc5a..7e479a435fd1 100644 --- a/keras/layers/noise.py +++ b/keras/layers/noise.py @@ -13,9 +13,14 @@ # limitations under the License. 
# ============================================================================== """Layers that operate regularization via the addition of noise.""" -# pylint: disable=g-bad-import-order,unused-import + + +from keras.layers.regularization.alpha_dropout import AlphaDropout # noqa: F401 # Regularization layers imported for backwards namespace compatibility -from keras.layers.regularization.gaussian_dropout import GaussianDropout -from keras.layers.regularization.gaussian_noise import GaussianNoise -from keras.layers.regularization.alpha_dropout import AlphaDropout +from keras.layers.regularization.gaussian_dropout import ( # noqa: F401,E501 + GaussianDropout, +) +from keras.layers.regularization.gaussian_noise import ( # noqa: F401,E501 + GaussianNoise, +) diff --git a/keras/layers/normalization/BUILD b/keras/layers/normalization/BUILD index 0266b9dabbd2..fffb798587da 100644 --- a/keras/layers/normalization/BUILD +++ b/keras/layers/normalization/BUILD @@ -1,18 +1,18 @@ # Description: # Contains the Keras normalization layers (internal TensorFlow version). +# Placeholder: load unaliased py_library + +# buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "cuda_py_test") +# buildifier: disable=same-origin-load +load("@org_keras//keras:keras.bzl", "tf_py_test") + package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], # TODO(scottzhu): Remove non-keras deps from TF. - default_visibility = [ - "//keras:friends", - "//third_party/tensorflow/python/distribute:__pkg__", - "//third_party/tensorflow/python/feature_column:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", - "//third_party/tensorflow/tools/pip_package:__pkg__", - "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", - ], + default_visibility = ["//keras:friends"], licenses = ["notice"], ) @@ -25,7 +25,9 @@ py_library( deps = [ ":batch_normalization", ":batch_normalization_v1", + ":group_normalization", ":layer_normalization", + ":spectral_normalization", ":unit_normalization", ], ) @@ -57,6 +59,20 @@ py_library( ], ) +py_library( + name = "group_normalization", + srcs = ["group_normalization.py"], + srcs_version = "PY3", + deps = [ + "//:expect_tensorflow_installed", + "//keras:constraints", + "//keras:regularizers", + "//keras/dtensor:utils", + "//keras/engine:base_layer", + "//keras/initializers", + ], +) + py_library( name = "layer_normalization", srcs = ["layer_normalization.py"], @@ -81,6 +97,40 @@ py_library( ], ) +py_library( + name = "spectral_normalization", + srcs = ["spectral_normalization.py"], + srcs_version = "PY3", + deps = [ + "//:expect_tensorflow_installed", + "//keras/engine:base_layer", + ], +) + +cuda_py_test( + name = "group_normalization_test", + size = "medium", + srcs = ["group_normalization_test.py"], + python_version = "PY3", + shard_count = 4, + tags = [ + "notsan", + ], + xla_tags = [ + "no_cuda_asan", # times out + ], + deps = [ + ":group_normalization", + "//:expect_absl_installed", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/layers", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) + cuda_py_test( name = "batch_normalization_test", size = "medium", @@ -102,6 +152,22 @@ cuda_py_test( ], ) +tf_py_test( + name = "batch_normalization_dtensor_test", + srcs = ["batch_normalization_dtensor_test.py"], + shard_count = 2, + tags = ["no_oss"], + deps = [ + ":batch_normalization", + "//:expect_numpy_installed", + 
"//:expect_tensorflow_installed", + "//keras", + "//keras/dtensor:test_util", + "//keras/testing_infra:test_utils", + "//third_party/tensorflow/python/distribute/experimental:mirrored_strategy", + ], +) + cuda_py_test( name = "layer_normalization_test", size = "medium", @@ -133,3 +199,17 @@ cuda_py_test( "//keras/testing_infra:test_combinations", ], ) + +cuda_py_test( + name = "spectral_normalization_test", + size = "small", + srcs = ["spectral_normalization_test.py"], + python_version = "PY3", + deps = [ + "//:expect_absl_installed", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/testing_infra:test_combinations", + ], +) diff --git a/keras/layers/normalization/batch_normalization.py b/keras/layers/normalization/batch_normalization.py index 84a6138a6b62..759b0486a735 100644 --- a/keras/layers/normalization/batch_normalization.py +++ b/keras/layers/normalization/batch_normalization.py @@ -14,7 +14,10 @@ # ============================================================================== """The V2 implementation of Normalization layers.""" +import warnings + import tensorflow.compat.v2 as tf + from keras import backend from keras import constraints from keras import initializers @@ -24,1226 +27,1570 @@ from keras.engine.input_spec import InputSpec from keras.utils import control_flow_util from keras.utils import tf_utils -from tensorflow.python.ops.control_flow_ops import get_enclosing_xla_context + +# isort: off +from tensorflow.python.ops.control_flow_ops import ( + get_enclosing_xla_context, +) from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import keras_export class BatchNormalizationBase(Layer): - r"""Layer that normalizes its inputs. - - Batch normalization applies a transformation that maintains the mean output - close to 0 and the output standard deviation close to 1. - - Importantly, batch normalization works differently during training and - during inference. - - **During training** (i.e. when using `fit()` or when calling the layer/model - with the argument `training=True`), the layer normalizes its output using - the mean and standard deviation of the current batch of inputs. That is to - say, for each channel being normalized, the layer returns - `gamma * (batch - mean(batch)) / sqrt(var(batch) + epsilon) + beta`, where: - - - `epsilon` is small constant (configurable as part of the constructor - arguments) - - `gamma` is a learned scaling factor (initialized as 1), which - can be disabled by passing `scale=False` to the constructor. - - `beta` is a learned offset factor (initialized as 0), which - can be disabled by passing `center=False` to the constructor. - - **During inference** (i.e. when using `evaluate()` or `predict()`) or when - calling the layer/model with the argument `training=False` (which is the - default), the layer normalizes its output using a moving average of the - mean and standard deviation of the batches it has seen during training. That - is to say, it returns - `gamma * (batch - self.moving_mean) / sqrt(self.moving_var + epsilon) + beta`. 
- - `self.moving_mean` and `self.moving_var` are non-trainable variables that - are updated each time the layer in called in training mode, as such: - - - `moving_mean = moving_mean * momentum + mean(batch) * (1 - momentum)` - - `moving_var = moving_var * momentum + var(batch) * (1 - momentum)` - - As such, the layer will only normalize its inputs during inference - *after having been trained on data that has similar statistics as the - inference data*. - - Args: - axis: Integer or a list of integers, the axis that should be normalized - (typically the features axis). For instance, after a `Conv2D` layer with - `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. - momentum: Momentum for the moving average. - epsilon: Small float added to variance to avoid dividing by zero. - center: If True, add offset of `beta` to normalized tensor. If False, `beta` - is ignored. - scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the - next layer is linear (also e.g. `nn.relu`), this can be disabled since the - scaling will be done by the next layer. - beta_initializer: Initializer for the beta weight. - gamma_initializer: Initializer for the gamma weight. - moving_mean_initializer: Initializer for the moving mean. - moving_variance_initializer: Initializer for the moving variance. - beta_regularizer: Optional regularizer for the beta weight. - gamma_regularizer: Optional regularizer for the gamma weight. - beta_constraint: Optional constraint for the beta weight. - gamma_constraint: Optional constraint for the gamma weight. - renorm: Whether to use [Batch Renormalization]( - https://arxiv.org/abs/1702.03275). This adds extra variables during - training. The inference is the same for either value of this parameter. - renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to - scalar `Tensors` used to clip the renorm correction. The correction `(r, - d)` is used as `corrected_value = normalized_value * r + d`, with `r` - clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, - dmax are set to inf, 0, inf, respectively. - renorm_momentum: Momentum used to update the moving means and standard - deviations with renorm. Unlike `momentum`, this affects training and - should be neither too small (which would add noise) nor too large (which - would give stale estimates). Note that `momentum` is still applied to get - the means and variances for inference. - fused: if `True`, use a faster, fused implementation, or raise a ValueError - if the fused implementation cannot be used. If `None`, use the faster - implementation if possible. If False, do not used the fused - implementation. - Note that in TensorFlow 1.x, the meaning of `fused=True` is different: if - `False`, the layer uses the system-recommended implementation. - trainable: Boolean, if `True` the variables will be marked as trainable. - virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`, - which means batch normalization is performed across the whole batch. When - `virtual_batch_size` is not `None`, instead perform "Ghost Batch - Normalization", which creates virtual sub-batches which are each - normalized separately (with shared gamma, beta, and moving statistics). - Must divide the actual batch size during execution. - adjustment: A function taking the `Tensor` containing the (dynamic) shape of - the input tensor and returning a pair (scale, bias) to apply to the - normalized values (before gamma and beta), only during training. 
For - example, if `axis=-1`, - `adjustment = lambda shape: ( - tf.random.uniform(shape[-1:], 0.93, 1.07), - tf.random.uniform(shape[-1:], -0.1, 0.1))` will scale the normalized - value by up to 7% up or down, then shift the result by up to 0.1 - (with independent scaling and bias for each feature but shared - across all examples), and finally apply gamma and/or beta. If - `None`, no adjustment is applied. Cannot be specified if - virtual_batch_size is specified. - - Call arguments: - inputs: Input tensor (of any rank). - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. - - `training=True`: The layer will normalize its inputs using the mean and - variance of the current batch of inputs. - - `training=False`: The layer will normalize its inputs using the mean and - variance of its moving statistics, learned during training. - - Input shape: Arbitrary. Use the keyword argument `input_shape` (tuple of - integers, does not include the samples axis) when using this layer as the - first layer in a model. - - Output shape: Same shape as input. - - Reference: - - [Ioffe and Szegedy, 2015](https://arxiv.org/abs/1502.03167). - """ - - # By default, the base class uses V2 behavior. The BatchNormalization V1 - # subclass sets this to False to use the V1 behavior. - _USE_V2_BEHAVIOR = True - - def __init__(self, - axis=-1, - momentum=0.99, - epsilon=1e-3, - center=True, - scale=True, - beta_initializer='zeros', - gamma_initializer='ones', - moving_mean_initializer='zeros', - moving_variance_initializer='ones', - beta_regularizer=None, - gamma_regularizer=None, - beta_constraint=None, - gamma_constraint=None, - renorm=False, - renorm_clipping=None, - renorm_momentum=0.99, - fused=None, - trainable=True, - virtual_batch_size=None, - adjustment=None, - name=None, - **kwargs): - super().__init__(name=name, **kwargs) - if isinstance(axis, (list, tuple)): - self.axis = axis[:] - elif isinstance(axis, int): - self.axis = axis - else: - raise TypeError('Expected an int or a list/tuple of ints for the ' - 'argument \'axis\', but received: %r' % axis) - self.momentum = momentum - self.epsilon = epsilon - self.center = center - self.scale = scale - self.beta_initializer = initializers.get(beta_initializer) - self.gamma_initializer = initializers.get(gamma_initializer) - self.moving_mean_initializer = initializers.get(moving_mean_initializer) - self.moving_variance_initializer = initializers.get( - moving_variance_initializer) - self.beta_regularizer = regularizers.get(beta_regularizer) - self.gamma_regularizer = regularizers.get(gamma_regularizer) - self.beta_constraint = constraints.get(beta_constraint) - self.gamma_constraint = constraints.get(gamma_constraint) - self.renorm = renorm - self.virtual_batch_size = virtual_batch_size - self.adjustment = adjustment - if self._USE_V2_BEHAVIOR: - if fused: - self._raise_if_fused_cannot_be_used() - # We leave fused as None if self._fused_can_be_used()==True, since we - # still may set it to False in self.build() if the input rank is not 4. - elif fused is None and not self._fused_can_be_used(): - fused = False - elif fused is None: - fused = True - self.supports_masking = True - - self.fused = fused - self._bessels_correction_test_only = True - self.trainable = trainable - - if renorm: - renorm_clipping = renorm_clipping or {} - keys = ['rmax', 'rmin', 'dmax'] - if set(renorm_clipping) - set(keys): - raise ValueError( - f'Received invalid keys for `renorm_clipping` argument: ' - f'{renorm_clipping}. 
Supported values: {keys}.')
-      self.renorm_clipping = renorm_clipping
-      self.renorm_momentum = renorm_momentum
-
-  def _raise_if_fused_cannot_be_used(self):
-    """Raises a ValueError if fused implementation cannot be used.
-
-    In addition to the checks done in this function, the input tensors rank must
-    be 4 or 5. The input rank check can only be done once the input shape is
-    known.
+    r"""Layer that normalizes its inputs.
+
+    Batch normalization applies a transformation that maintains the mean output
+    close to 0 and the output standard deviation close to 1.
+
+    Importantly, batch normalization works differently during training and
+    during inference.
+
+    **During training** (i.e. when using `fit()` or when calling the layer/model
+    with the argument `training=True`), the layer normalizes its output using
+    the mean and standard deviation of the current batch of inputs. That is to
+    say, for each channel being normalized, the layer returns
+    `gamma * (batch - mean(batch)) / sqrt(var(batch) + epsilon) + beta`, where:
+
+    - `epsilon` is a small constant (configurable as part of the constructor
+    arguments)
+    - `gamma` is a learned scaling factor (initialized as 1), which
+    can be disabled by passing `scale=False` to the constructor.
+    - `beta` is a learned offset factor (initialized as 0), which
+    can be disabled by passing `center=False` to the constructor.
+
+    **During inference** (i.e. when using `evaluate()` or `predict()`) or when
+    calling the layer/model with the argument `training=False` (which is the
+    default), the layer normalizes its output using a moving average of the
+    mean and standard deviation of the batches it has seen during training. That
+    is to say, it returns
+    `gamma * (batch - self.moving_mean) / sqrt(self.moving_var+epsilon) + beta`.
+
+    `self.moving_mean` and `self.moving_var` are non-trainable variables that
+    are updated each time the layer is called in training mode, as such:
+
+    - `moving_mean = moving_mean * momentum + mean(batch) * (1 - momentum)`
+    - `moving_var = moving_var * momentum + var(batch) * (1 - momentum)`
+
+    As such, the layer will only normalize its inputs during inference
+    *after having been trained on data that has similar statistics to the
+    inference data*.
+
+    Args:
+      axis: Integer or a list of integers, the axis that should be normalized
+        (typically the features axis). For instance, after a `Conv2D` layer with
+        `data_format="channels_first"`, set `axis=1` in `BatchNormalization`.
+      momentum: Momentum for the moving average.
+      epsilon: Small float added to variance to avoid dividing by zero.
+      center: If True, add offset of `beta` to normalized tensor. If False,
+        `beta` is ignored.
+      scale: If True, multiply by `gamma`. If False, `gamma` is not used. When
+        the next layer is linear (also e.g. `nn.relu`), this can be disabled
+        since the scaling will be done by the next layer.
+      beta_initializer: Initializer for the beta weight.
+      gamma_initializer: Initializer for the gamma weight.
+      moving_mean_initializer: Initializer for the moving mean.
+      moving_variance_initializer: Initializer for the moving variance.
+      beta_regularizer: Optional regularizer for the beta weight.
+      gamma_regularizer: Optional regularizer for the gamma weight.
+      beta_constraint: Optional constraint for the beta weight.
+      gamma_constraint: Optional constraint for the gamma weight.
+      renorm: Whether to use [Batch Renormalization](
+        https://arxiv.org/abs/1702.03275). This adds extra variables during
+        training. The inference is the same for either value of this
+        parameter.
+      renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+        scalar `Tensors` used to clip the renorm correction. The correction `(r,
+        d)` is used as `corrected_value = normalized_value * r + d`, with `r`
+        clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+        dmax are set to inf, 0, inf, respectively.
+      renorm_momentum: Momentum used to update the moving means and standard
+        deviations with renorm. Unlike `momentum`, this affects training and
+        should be neither too small (which would add noise) nor too large (which
+        would give stale estimates). Note that `momentum` is still applied to
+        get the means and variances for inference.
+      fused: If `True`, use a faster, fused implementation, or raise a
+        ValueError if the fused implementation cannot be used. If `None`, use
+        the faster implementation if possible. If `False`, do not use the fused
+        implementation. Note that in TensorFlow 1.x, the meaning of
+        `fused=True` is different: if `False`, the layer uses the
+        system-recommended implementation. You cannot use `fused=True` if a
+        mask is passed in the `call()` method.
+      trainable: Boolean, if `True` the variables will be marked as trainable.
+      virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
+        which means batch normalization is performed across the whole batch.
+        When `virtual_batch_size` is not `None`, instead perform "Ghost Batch
+        Normalization", which creates virtual sub-batches which are each
+        normalized separately (with shared gamma, beta, and moving statistics).
+        Must divide the actual batch size during execution.
+      adjustment: A function taking the `Tensor` containing the (dynamic) shape
+        of the input tensor and returning a pair (scale, bias) to apply to the
+        normalized values (before gamma and beta), only during training. For
+        example, if `axis=-1`,
+        `adjustment = lambda shape: (
+          tf.random.uniform(shape[-1:], 0.93, 1.07),
+          tf.random.uniform(shape[-1:], -0.1, 0.1))` will scale the normalized
+        value by up to 7% up or down, then shift the result by up to 0.1
+        (with independent scaling and bias for each feature but shared
+        across all examples), and finally apply gamma and/or beta. If
+        `None`, no adjustment is applied. Cannot be specified if
+        `virtual_batch_size` is specified.
+      synchronized: If True, synchronizes the global batch statistics (mean and
+        variance) for the layer across all devices at each training step in a
+        distributed training strategy. If False, each replica uses its own
+        local batch statistics. Only relevant when used inside a
+        `tf.distribute` strategy.
+
+    Call arguments:
+      inputs: Input tensor (of any rank).
+      training: Python boolean indicating whether the layer should behave in
+        training mode or in inference mode.
+        - `training=True`: The layer will normalize its inputs using the mean
+          and variance of the current batch of inputs.
+        - `training=False`: The layer will normalize its inputs using the mean
+          and variance of its moving statistics, learned during training.
+      mask: Binary tensor of shape broadcastable to `inputs` tensor, indicating
+        the positions for which the mean and variance should be computed.
+
+    Input shape: Arbitrary. Use the keyword argument `input_shape` (tuple of
+      integers, does not include the samples axis) when using this layer as the
+      first layer in a model.
+
+    Output shape: Same shape as input.
+
+    Reference:
+      - [Ioffe and Szegedy, 2015](https://arxiv.org/abs/1502.03167).
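+
+    Example (an illustrative sketch of the behavior described above; the
+    exact values depend on the input statistics):
+
+    ```python
+    import numpy as np
+    import tensorflow as tf
+
+    layer = tf.keras.layers.BatchNormalization(momentum=0.9)
+    data = np.random.normal(3.0, 2.0, size=(8, 4)).astype("float32")
+
+    # Training step: normalize with the batch statistics, then nudge the
+    # moving statistics: moving_mean <- 0.9 * moving_mean + 0.1 * mean(batch).
+    layer(data, training=True)
+
+    # Inference (the default): normalize with the moving statistics instead.
+    layer(data, training=False)
+    ```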
""" - # Note the ValueErrors in this function are caught and not reraised in - # _fused_can_be_used(). No other exception besides ValueError should be - # raised here. - - # Currently fused batch norm doesn't support renorm. It also only supports a - # channel dimension on axis 1 or 3 (rank=4) / 1 or 4 (rank5), when no - # virtual batch size or adjustment is used. - if self.renorm: - raise ValueError('Passing both `fused=True` and `renorm=True` is ' - 'not supported') - axis = [self.axis] if isinstance(self.axis, int) else self.axis - # Axis -3 is equivalent to 1, and axis -1 is equivalent to 3, when the - # input rank is 4. Similarly, the valid axis is -4, -1, 1, 4 when the rank - # is 5. The combination of ranks and axes will be checked later. - if len(axis) > 1 or axis[0] not in (-4, -3, -1, 1, 3, 4): - raise ValueError('Passing `fused=True` is only supported when axis is 1 ' - 'or 3 for input rank = 4 or 1 or 4 for input rank = 5. ' - 'Got axis %s' % (axis,)) - if self.virtual_batch_size is not None: - raise ValueError('Passing `fused=True` is not supported when ' - '`virtual_batch_size` is specified.') - if self.adjustment is not None: - raise ValueError('Passing `fused=True` is not supported when ' - '`adjustment` is specified.') - # TODO(reedwm): Support fp64 in FusedBatchNorm then remove this check. - if self._compute_dtype not in ('float16', 'bfloat16', 'float32', None): - raise ValueError( - 'Passing `fused=True` is only supported when the compute ' - 'dtype is float16, bfloat16, or float32. Got dtype: %s' % - (self._compute_dtype,)) - - def _fused_can_be_used(self): - try: - self._raise_if_fused_cannot_be_used() - return True - except ValueError: - return False - - @property - def trainable(self): - return self._trainable - - @trainable.setter - def trainable(self, value): - self._trainable = value - - @property - def _param_dtype(self): - # Raise parameters of fp16 batch norm to fp32 - if self.dtype == tf.float16 or self.dtype == tf.bfloat16: - return tf.float32 - else: - return self.dtype or tf.float32 - - def _support_zero_size_input(self): - if not tf.distribute.has_strategy(): - return False - strategy = tf.distribute.get_strategy() - # TODO(b/195085185): remove experimental_enable_get_next_as_optional after - # migrating all users. - return getattr( - strategy.extended, 'enable_partial_batch_handling', - getattr(strategy.extended, 'experimental_enable_get_next_as_optional', - False)) - - def build(self, input_shape): - self.axis = tf_utils.validate_axis(self.axis, input_shape) - input_shape = tf.TensorShape(input_shape) - rank = input_shape.rank - - if self.virtual_batch_size is not None: - if self.virtual_batch_size <= 0: - raise ValueError( - f'`virtual_batch_size` must be a positive integer that divides the ' - f'true batch size of the input tensor. Received: ' - f'virtual_batch_size={self.virtual_batch_size}') - # If using virtual batches, the first dimension must be the batch - # dimension and cannot be the batch norm axis - if 0 in self.axis: - raise ValueError('When using `virtual_batch_size`, the batch dimension ' - 'must be 0 and thus axis cannot include 0. ' - f'Received axis={self.axis}') - if self.adjustment is not None: - raise ValueError('When using `virtual_batch_size`, adjustment cannot ' - 'be specified') - - if self.fused in (None, True): - # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the - # output back to its original shape accordingly. 
- if self._USE_V2_BEHAVIOR: - if self.fused is None: - self.fused = rank in (4, 5) - elif self.fused and rank not in (4, 5): - raise ValueError('Batch normalization layers with `fused=True` only ' - 'support 4D or 5D input tensors. ' - f'Received tensor with shape: {tuple(input_shape)}') - else: - assert self.fused is not None - self.fused = (rank in (4, 5) and self._fused_can_be_used()) - # TODO(chrisying): fused batch norm is currently not supported for - # multi-axis batch norm and by extension virtual batches. In some cases, - # it might be possible to use fused batch norm but would require reshaping - # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is - # particularly tricky. A compromise might be to just support the most - # common use case (turning 5D w/ virtual batch to NCHW) - - if self.fused: - if self.axis == [1] and rank == 4: - self._data_format = 'NCHW' - elif self.axis == [1] and rank == 5: - self._data_format = 'NCDHW' - elif self.axis == [3] and rank == 4: - self._data_format = 'NHWC' - elif self.axis == [4] and rank == 5: - self._data_format = 'NDHWC' - elif rank == 5: - # 5D tensors that can be passed in but should not use fused batch norm - # due to unsupported axis. - self.fused = False - else: - if rank == 4: - raise ValueError( - 'Unsupported axis. The use of `fused=True` is only possible with ' - '`axis=1` or `axis=3` for 4D input tensors. Received: ' - f'axis={tuple(self.axis)}') + + # By default, the base class uses V2 behavior. The BatchNormalization V1 + # subclass sets this to False to use the V1 behavior. + _USE_V2_BEHAVIOR = True + + def __init__( + self, + axis=-1, + momentum=0.99, + epsilon=1e-3, + center=True, + scale=True, + beta_initializer="zeros", + gamma_initializer="ones", + moving_mean_initializer="zeros", + moving_variance_initializer="ones", + beta_regularizer=None, + gamma_regularizer=None, + beta_constraint=None, + gamma_constraint=None, + renorm=False, + renorm_clipping=None, + renorm_momentum=0.99, + fused=None, + trainable=True, + virtual_batch_size=None, + adjustment=None, + name=None, + synchronized=False, + **kwargs, + ): + super().__init__(name=name, **kwargs) + if isinstance(axis, (list, tuple)): + self.axis = axis[:] + elif isinstance(axis, int): + self.axis = axis else: - raise ValueError( - 'Unsupported axis. The use of `fused=True` is only possible with ' - '`axis=1` or `axis=4` for 5D input tensors. Received: ' - f'axis={tuple(self.axis)}') - - axis_to_dim = {x: input_shape.dims[x].value for x in self.axis} - for x in axis_to_dim: - if axis_to_dim[x] is None: - raise ValueError('Input has undefined `axis` dimension. 
Received input ' - f'with shape {tuple(input_shape)} ' - f'and axis={tuple(self.axis)}') - self.input_spec = InputSpec(ndim=rank, axes=axis_to_dim) - - if len(axis_to_dim) == 1 and self.virtual_batch_size is None: - # Single axis batch norm (most common/default use-case) - param_shape = (list(axis_to_dim.values())[0],) - else: - # Parameter shape is the original shape but with 1 in all non-axis dims - param_shape = [ - axis_to_dim[i] if i in axis_to_dim else 1 for i in range(rank) - ] - if self.virtual_batch_size is not None: - # When using virtual batches, add an extra dim at index 1 - param_shape.insert(1, 1) - for idx, x in enumerate(self.axis): - self.axis[idx] = x + 1 # Account for added dimension - - if self.scale: - self.gamma = self.add_weight( - name='gamma', - shape=param_shape, - dtype=self._param_dtype, - initializer=self.gamma_initializer, - regularizer=self.gamma_regularizer, - constraint=self.gamma_constraint, - trainable=True, - experimental_autocast=False) - else: - self.gamma = None - if self.fused: - self._gamma_const = backend.constant( - 1.0, dtype=self._param_dtype, shape=param_shape) - - if self.center: - self.beta = self.add_weight( - name='beta', - shape=param_shape, - dtype=self._param_dtype, - initializer=self.beta_initializer, - regularizer=self.beta_regularizer, - constraint=self.beta_constraint, - trainable=True, - experimental_autocast=False) - else: - self.beta = None - if self.fused: - self._beta_const = backend.constant( - 0.0, dtype=self._param_dtype, shape=param_shape) - - try: - # Disable variable partitioning when creating the moving mean and variance - if hasattr(self, '_scope') and self._scope: - partitioner = self._scope.partitioner - self._scope.set_partitioner(None) - else: - partitioner = None - self.moving_mean = self.add_weight( - name='moving_mean', - shape=param_shape, - dtype=self._param_dtype, - initializer=self.moving_mean_initializer, - synchronization=tf.VariableSynchronization.ON_READ, - trainable=False, - aggregation=tf.VariableAggregation.MEAN, - experimental_autocast=False) - - self.moving_variance = self.add_weight( - name='moving_variance', - shape=param_shape, - dtype=self._param_dtype, - initializer=self.moving_variance_initializer, - synchronization=tf.VariableSynchronization.ON_READ, - trainable=False, - aggregation=tf.VariableAggregation.MEAN, - experimental_autocast=False) - - if self.renorm: - # In batch renormalization we track the inference moving stddev instead - # of the moving variance to more closely align with the paper. - def moving_stddev_initializer(*args, **kwargs): - return tf.sqrt( - self.moving_variance_initializer(*args, **kwargs)) - - with tf.distribute.get_strategy( - ).extended.colocate_vars_with(self.moving_variance): - self.moving_stddev = self.add_weight( - name='moving_stddev', - shape=param_shape, - dtype=self._param_dtype, - initializer=moving_stddev_initializer, - synchronization=tf.VariableSynchronization.ON_READ, - trainable=False, - aggregation=tf.VariableAggregation.MEAN, - experimental_autocast=False) - - # Create variables to maintain the moving mean and standard deviation. - # These are used in training and thus are different from the moving - # averages above. The renorm variables are colocated with moving_mean - # and moving_stddev. - # NOTE: below, the outer `with device` block causes the current device - # stack to be cleared. The nested ones use a `lambda` to set the desired - # device and ignore any devices that may be set by the custom getter. 
- def _renorm_variable(name, - shape, - initializer='zeros'): - """Create a renorm variable.""" - var = self.add_weight( - name=name, - shape=shape, - dtype=self._param_dtype, - initializer=initializer, - synchronization=tf.VariableSynchronization.ON_READ, - trainable=False, - aggregation=tf.VariableAggregation.MEAN, - experimental_autocast=False) - return var - - with tf.distribute.get_strategy( - ).extended.colocate_vars_with(self.moving_mean): - self.renorm_mean = _renorm_variable('renorm_mean', param_shape, - self.moving_mean_initializer) - with tf.distribute.get_strategy( - ).extended.colocate_vars_with(self.moving_stddev): - self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape, - moving_stddev_initializer) - finally: - if partitioner: - self._scope.set_partitioner(partitioner) - self.built = True - - def _assign_moving_average(self, variable, value, momentum, inputs_size): - - def calculate_update_delta(): - decay = tf.convert_to_tensor( - 1.0 - momentum, name='decay') - if decay.dtype != variable.dtype.base_dtype: - decay = tf.cast(decay, variable.dtype.base_dtype) - update_delta = (variable - tf.cast(value, variable.dtype)) * decay - if inputs_size is not None: - update_delta = tf.where(inputs_size > 0, update_delta, - backend.zeros_like(update_delta)) - return update_delta - - with backend.name_scope('AssignMovingAvg') as scope: - if tf.compat.v1.executing_eagerly_outside_functions(): - return variable.assign_sub(calculate_update_delta(), name=scope) - else: - with tf.compat.v1.colocate_with(variable): # pylint: disable=protected-access - return tf.compat.v1.assign_sub( - variable, calculate_update_delta(), name=scope) - - def _assign_new_value(self, variable, value): - with backend.name_scope('AssignNewValue') as scope: - if tf.compat.v1.executing_eagerly_outside_functions(): - return variable.assign(value, name=scope) - else: - with tf.compat.v1.colocate_with(variable): # pylint: disable=protected-access - return tf.compat.v1.assign(variable, value, name=scope) - - def _fused_batch_norm(self, inputs, training): - """Returns the output of fused batch norm.""" - beta = self.beta if self.center else self._beta_const - gamma = self.gamma if self.scale else self._gamma_const - - # TODO(b/129279393): Support zero batch input in non DistributionStrategy - # code as well. - if self._support_zero_size_input(): - # Keras assumes that batch dimension is the first dimension for Batch - # Normalization. - input_batch_size = tf.shape(inputs)[0] - else: - input_batch_size = None - - # TODO(rmlarsen): Support using fused avg updates for non-eager execution - # after fixing graph pattern matching and enabling fused_batch_norm to - # take exponential_avg_factor as a tensor input. - use_fused_avg_updates = ( - tf.compat.v1.executing_eagerly_outside_functions() and - isinstance(self.momentum, - (float, int)) and get_enclosing_xla_context() is None) - if use_fused_avg_updates: - exponential_avg_factor = 1.0 - self.momentum - else: - exponential_avg_factor = None - - def _maybe_add_or_remove_bessels_correction(variance, remove=True): - r"""Add or remove Bessel's correction.""" - # Removes Bessel's correction if remove == True, adds it otherwise. - # This is to be consistent with non-fused batch norm. Note that the - # variance computed by fused batch norm is with Bessel's correction. - # This is only used in legacy V1 batch norm tests. 
- if self._bessels_correction_test_only: - return variance - sample_size = tf.cast( - tf.size(inputs) / tf.size(variance), variance.dtype) - if remove: - factor = (sample_size - - tf.cast(1.0, variance.dtype)) / sample_size - else: - factor = sample_size / ( - sample_size - tf.cast(1.0, variance.dtype)) - return variance * factor - - def _fused_batch_norm_training(): - return tf.compat.v1.nn.fused_batch_norm( - inputs, - gamma, - beta, - mean=self.moving_mean, - variance=_maybe_add_or_remove_bessels_correction( - self.moving_variance, remove=False), - epsilon=self.epsilon, - is_training=True, - data_format=self._data_format, - exponential_avg_factor=exponential_avg_factor) - - def _fused_batch_norm_inference(): - return tf.compat.v1.nn.fused_batch_norm( - inputs, - gamma, - beta, - mean=self.moving_mean, - variance=self.moving_variance, - epsilon=self.epsilon, - is_training=False, - data_format=self._data_format) - - output, mean, variance = control_flow_util.smart_cond( - training, _fused_batch_norm_training, _fused_batch_norm_inference) - variance = _maybe_add_or_remove_bessels_correction(variance, remove=True) - - training_value = control_flow_util.constant_value(training) - if training_value or training_value is None: - if not use_fused_avg_updates: - if training_value is None: - momentum = control_flow_util.smart_cond(training, - lambda: self.momentum, - lambda: 1.0) + raise TypeError( + "Expected an int or a list/tuple of ints for the " + "argument 'axis', but received: %r" % axis + ) + if synchronized and fused: + raise ValueError( + "`fused=True` is not supported when `synchronized=True`." + ) + self.synchronized = synchronized + if self.synchronized: + fused = False + + self.momentum = momentum + self.epsilon = epsilon + self.center = center + self.scale = scale + self.beta_initializer = initializers.get(beta_initializer) + self.gamma_initializer = initializers.get(gamma_initializer) + self.moving_mean_initializer = initializers.get(moving_mean_initializer) + self.moving_variance_initializer = initializers.get( + moving_variance_initializer + ) + self.beta_regularizer = regularizers.get(beta_regularizer) + self.gamma_regularizer = regularizers.get(gamma_regularizer) + self.beta_constraint = constraints.get(beta_constraint) + self.gamma_constraint = constraints.get(gamma_constraint) + self.renorm = renorm + self.virtual_batch_size = virtual_batch_size + self.adjustment = adjustment + if self._USE_V2_BEHAVIOR: + if fused: + self._raise_if_fused_cannot_be_used() + # We leave fused as None if self._fused_can_be_used()==True, since + # we still may set it to False in self.build() if the input rank is + # not 4. + elif fused is None and not self._fused_can_be_used(): + fused = False + elif fused is None: + fused = True + self.supports_masking = True + + self.fused = fused + self._bessels_correction_test_only = True + self.trainable = trainable + + if renorm: + renorm_clipping = renorm_clipping or {} + keys = ["rmax", "rmin", "dmax"] + if set(renorm_clipping) - set(keys): + raise ValueError( + "Received invalid keys for `renorm_clipping` argument: " + f"{renorm_clipping}. Supported values: {keys}." + ) + self.renorm_clipping = renorm_clipping + self.renorm_momentum = renorm_momentum + + def _raise_if_fused_cannot_be_used(self): + """Raises a ValueError if fused implementation cannot be used. + + In addition to the checks done in this function, the input tensors rank + must be 4 or 5. The input rank check can only be done once the input + shape is known. 
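+
+        Specifically, `fused=True` cannot be used when `renorm=True`, when
+        the normalization axis is anything other than a single channel axis
+        in (1, 3, -1, -3) for rank-4 inputs or (1, 4, -1, -4) for rank-5
+        inputs, when `virtual_batch_size` or `adjustment` is set, or when
+        the compute dtype is not float16, bfloat16, or float32.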
+ """ + # Note the ValueErrors in this function are caught and not reraised in + # _fused_can_be_used(). No other exception besides ValueError should be + # raised here. + + # Currently fused batch norm doesn't support renorm. It also only + # supports a channel dimension on axis 1 or 3 (rank=4) / 1 or 4 (rank5), + # when no virtual batch size or adjustment is used. + if self.renorm: + raise ValueError( + "Passing both `fused=True` and `renorm=True` is not supported" + ) + axis = [self.axis] if isinstance(self.axis, int) else self.axis + # Axis -3 is equivalent to 1, and axis -1 is equivalent to 3, when the + # input rank is 4. Similarly, the valid axis is -4, -1, 1, 4 when the + # rank is 5. The combination of ranks and axes will be checked later. + if len(axis) > 1 or axis[0] not in (-4, -3, -1, 1, 3, 4): + raise ValueError( + "Passing `fused=True` is only supported when axis is 1 " + "or 3 for input rank = 4 or 1 or 4 for input rank = 5. " + "Got axis %s" % (axis,) + ) + if self.virtual_batch_size is not None: + raise ValueError( + "Passing `fused=True` is not supported when " + "`virtual_batch_size` is specified." + ) + if self.adjustment is not None: + raise ValueError( + "Passing `fused=True` is not supported when " + "`adjustment` is specified." + ) + # TODO(reedwm): Support fp64 in FusedBatchNorm then remove this check. + if self._compute_dtype not in ("float16", "bfloat16", "float32", None): + raise ValueError( + "Passing `fused=True` is only supported when the compute " + "dtype is float16, bfloat16, or float32. Got dtype: %s" + % (self._compute_dtype,) + ) + + def _fused_can_be_used(self): + try: + self._raise_if_fused_cannot_be_used() + return True + except ValueError: + return False + + @property + def trainable(self): + return self._trainable + + @trainable.setter + def trainable(self, value): + self._trainable = value + + @property + def _param_dtype(self): + # Raise parameters of fp16 batch norm to fp32 + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: + return tf.float32 else: - momentum = tf.convert_to_tensor(self.momentum) - - def mean_update(): - """Update self.moving_mean with the most recent data point.""" - if use_fused_avg_updates: - if input_batch_size is not None: - new_mean = control_flow_util.smart_cond( - input_batch_size > 0, lambda: mean, lambda: self.moving_mean) - else: - new_mean = mean - return self._assign_new_value(self.moving_mean, new_mean) + return self.dtype or tf.float32 + + def build(self, input_shape): + self.axis = tf_utils.validate_axis(self.axis, input_shape) + input_shape = tf.TensorShape(input_shape) + rank = input_shape.rank + + if self.virtual_batch_size is not None: + if self.virtual_batch_size <= 0: + raise ValueError( + "`virtual_batch_size` must be a positive integer that " + "divides the true batch size of the input tensor. " + f"Received: virtual_batch_size={self.virtual_batch_size}" + ) + # If using virtual batches, the first dimension must be the batch + # dimension and cannot be the batch norm axis + if 0 in self.axis: + raise ValueError( + "When using `virtual_batch_size`, the batch dimension " + "must be 0 and thus axis cannot include 0. " + f"Received axis={self.axis}" + ) + if self.adjustment is not None: + raise ValueError( + "When using `virtual_batch_size`, adjustment cannot " + "be specified" + ) + + if self.fused in (None, True): + # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape + # the output back to its original shape accordingly. 
+ if self._USE_V2_BEHAVIOR: + if self.fused is None: + self.fused = rank in (4, 5) + elif self.fused and rank not in (4, 5): + raise ValueError( + "Batch normalization layers with `fused=True` only " + "support 4D or 5D input tensors. " + f"Received tensor with shape: {tuple(input_shape)}" + ) + else: + assert self.fused is not None + self.fused = rank in (4, 5) and self._fused_can_be_used() + # TODO(chrisying): fused batch norm is currently not supported for + # multi-axis batch norm and by extension virtual batches. In some + # cases, it might be possible to use fused batch norm but would + # require reshaping the Tensor to 4D with the axis in 1 or 3 + # (preferred 1) which is particularly tricky. A compromise might be + # to just support the most common use case (turning 5D w/ virtual + # batch to NCHW) + + if self.fused: + if self.axis == [1] and rank == 4: + self._data_format = "NCHW" + elif self.axis == [1] and rank == 5: + self._data_format = "NCDHW" + elif self.axis == [3] and rank == 4: + self._data_format = "NHWC" + elif self.axis == [4] and rank == 5: + self._data_format = "NDHWC" + elif rank == 5: + # 5D tensors that can be passed in but should not use fused + # batch norm due to unsupported axis. + self.fused = False + else: + if rank == 4: + raise ValueError( + "Unsupported axis. The use of `fused=True` is only " + "possible with `axis=1` or `axis=3` for 4D input " + f"tensors. Received: axis={tuple(self.axis)}" + ) + else: + raise ValueError( + "Unsupported axis. The use of `fused=True` is only " + "possible with `axis=1` or `axis=4` for 5D input " + f"tensors. Received: axis={tuple(self.axis)}" + ) + + axis_to_dim = {x: input_shape.dims[x].value for x in self.axis} + for x in axis_to_dim: + if axis_to_dim[x] is None: + raise ValueError( + "Input has undefined `axis` dimension. 
Received input " + f"with shape {tuple(input_shape)} " + f"and axis={tuple(self.axis)}" + ) + self.input_spec = InputSpec(ndim=rank, axes=axis_to_dim) + + if len(axis_to_dim) == 1 and self.virtual_batch_size is None: + # Single axis batch norm (most common/default use-case) + param_shape = (list(axis_to_dim.values())[0],) else: - return self._assign_moving_average(self.moving_mean, mean, momentum, - input_batch_size) - - def variance_update(): - """Update self.moving_variance with the most recent data point.""" - if use_fused_avg_updates: - if input_batch_size is not None: - new_variance = control_flow_util.smart_cond( - input_batch_size > 0, lambda: variance, - lambda: self.moving_variance) - else: - new_variance = variance - return self._assign_new_value(self.moving_variance, new_variance) + # Parameter shape is the original shape but with 1 in all non-axis + # dims + param_shape = [ + axis_to_dim[i] if i in axis_to_dim else 1 for i in range(rank) + ] + if self.virtual_batch_size is not None: + # When using virtual batches, add an extra dim at index 1 + param_shape.insert(1, 1) + for idx, x in enumerate(self.axis): + self.axis[idx] = x + 1 # Account for added dimension + self._param_shape = param_shape + if self.scale: + self.gamma = self.add_weight( + name="gamma", + shape=param_shape, + dtype=self._param_dtype, + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + constraint=self.gamma_constraint, + trainable=True, + experimental_autocast=False, + ) + else: + self.gamma = None + + if self.center: + self.beta = self.add_weight( + name="beta", + shape=param_shape, + dtype=self._param_dtype, + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + constraint=self.beta_constraint, + trainable=True, + experimental_autocast=False, + ) + else: + self.beta = None + + try: + # Disable variable partitioning when creating the moving mean and + # variance + if hasattr(self, "_scope") and self._scope: + partitioner = self._scope.partitioner + self._scope.set_partitioner(None) + else: + partitioner = None + self.moving_mean = self.add_weight( + name="moving_mean", + shape=param_shape, + dtype=self._param_dtype, + initializer=self.moving_mean_initializer, + synchronization=tf.VariableSynchronization.ON_READ, + trainable=False, + aggregation=tf.VariableAggregation.MEAN, + experimental_autocast=False, + ) + + self.moving_variance = self.add_weight( + name="moving_variance", + shape=param_shape, + dtype=self._param_dtype, + initializer=self.moving_variance_initializer, + synchronization=tf.VariableSynchronization.ON_READ, + trainable=False, + aggregation=tf.VariableAggregation.MEAN, + experimental_autocast=False, + ) + + if self.renorm: + # In batch renormalization we track the inference moving stddev + # instead of the moving variance to more closely align with the + # paper. + def moving_stddev_initializer(*args, **kwargs): + return tf.sqrt( + self.moving_variance_initializer(*args, **kwargs) + ) + + with tf.distribute.get_strategy().extended.colocate_vars_with( + self.moving_variance + ): + self.moving_stddev = self.add_weight( + name="moving_stddev", + shape=param_shape, + dtype=self._param_dtype, + initializer=moving_stddev_initializer, + synchronization=tf.VariableSynchronization.ON_READ, + trainable=False, + aggregation=tf.VariableAggregation.MEAN, + experimental_autocast=False, + ) + + # Create variables to maintain the moving mean and standard + # deviation. These are used in training and thus are different + # from the moving averages above. 
The renorm variables are + # colocated with moving_mean and moving_stddev. + # NOTE: below, the outer `with device` block causes the current + # device stack to be cleared. The nested ones use a `lambda` to + # set the desired device and ignore any devices that may be set + # by the custom getter. + def _renorm_variable(name, shape, initializer="zeros"): + """Create a renorm variable.""" + var = self.add_weight( + name=name, + shape=shape, + dtype=self._param_dtype, + initializer=initializer, + synchronization=tf.VariableSynchronization.ON_READ, + trainable=False, + aggregation=tf.VariableAggregation.MEAN, + experimental_autocast=False, + ) + return var + + with tf.distribute.get_strategy().extended.colocate_vars_with( + self.moving_mean + ): + self.renorm_mean = _renorm_variable( + "renorm_mean", param_shape, self.moving_mean_initializer + ) + with tf.distribute.get_strategy().extended.colocate_vars_with( + self.moving_stddev + ): + self.renorm_stddev = _renorm_variable( + "renorm_stddev", param_shape, moving_stddev_initializer + ) + finally: + if partitioner: + self._scope.set_partitioner(partitioner) + self.built = True + + def call(self, inputs, training=None, mask=None): + inputs = tf.cast(inputs, self.compute_dtype) + training = self._get_training_value(training) + # Determine a boolean value for `training`: could be True, False, or + # None. + training_value = control_flow_util.constant_value(training) + _raise_for_non_sync_bn_with_renorm_and_dtensor_strategy( + synchronized=self.synchronized, + training=training, + renorm=self.renorm, + ) + + if self.virtual_batch_size is not None: + # Virtual batches (aka ghost batches) can be simulated by reshaping + # the Tensor and reusing the existing batch norm implementation + original_shape = tf.shape(inputs) + original_shape = tf.concat( + [tf.constant([-1]), original_shape[1:]], axis=0 + ) + + if tf.__internal__.tf2.enabled(): + expanded_shape = ( + [self.virtual_batch_size, -1] if training_value else [-1, 1] + ) + expanded_shape = tf.concat( + [ + tf.constant(expanded_shape), + original_shape[1:], + ], + axis=0, + ) + else: + # Preserve incorrect legacy behavior for backwards compatibility + expanded_shape = tf.concat( + [ + tf.constant([self.virtual_batch_size, -1]), + original_shape[1:], + ], + axis=0, + ) + + # Will cause errors if virtual_batch_size does not divide the batch + # size + inputs = tf.reshape(inputs, expanded_shape) + + def undo_virtual_batching(outputs): + outputs = tf.reshape(outputs, original_shape) + return outputs + + if self.fused: + outputs = self._fused_batch_norm( + inputs, mask=mask, training=training + ) + if self.virtual_batch_size is not None: + # Currently never reaches here since fused_batch_norm does not + # support virtual batching + outputs = undo_virtual_batching(outputs) + return outputs + + inputs_dtype = inputs.dtype.base_dtype + if inputs_dtype in (tf.float16, tf.bfloat16): + # Do all math in float32 if given 16-bit inputs for numeric + # stability. In particular, it's very easy for variance to overflow + # in float16 and for safety we also choose to cast bfloat16 to + # float32. 
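+            # (float16 tops out around 65504, so the squared terms that feed
+            # the variance overflow easily.)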
+ inputs = tf.cast(inputs, tf.float32) + + # Compute the axes along which to reduce the mean / variance + input_shape = inputs.shape + ndims = len(input_shape) + reduction_axes = [i for i in range(ndims) if i not in self.axis] + if self.virtual_batch_size is not None: + del reduction_axes[1] # Do not reduce along virtual batch dim + + # Broadcasting only necessary for single-axis batch norm where the axis + # is not the last dimension + broadcast_shape = [1] * ndims + broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value + + def _broadcast(v): + if ( + v is not None + and len(v.shape) != ndims + and reduction_axes != list(range(ndims - 1)) + ): + return tf.reshape(v, broadcast_shape) + return v + + scale, offset = _broadcast(self.gamma), _broadcast(self.beta) + + def _compose_transforms(scale, offset, then_scale, then_offset): + if then_scale is not None: + scale *= then_scale + offset *= then_scale + if then_offset is not None: + offset += then_offset + return (scale, offset) + + if training_value == False: # noqa: E712 + mean, variance = self.moving_mean, self.moving_variance else: - return self._assign_moving_average(self.moving_variance, variance, - momentum, input_batch_size) - - self.add_update(mean_update) - self.add_update(variance_update) - - return output - - def _renorm_correction_and_moments(self, mean, variance, training, - inputs_size): - """Returns the correction and update values for renorm.""" - stddev = tf.sqrt(variance + self.epsilon) - # Compute the average mean and standard deviation, as if they were - # initialized with this batch's moments. - renorm_mean = self.renorm_mean - # Avoid divide by zero early on in training. - renorm_stddev = tf.maximum(self.renorm_stddev, tf.sqrt(self.epsilon)) - # Compute the corrections for batch renorm. - r = stddev / renorm_stddev - d = (mean - renorm_mean) / renorm_stddev - # Ensure the corrections use pre-update moving averages. - with tf.control_dependencies([r, d]): - mean = tf.identity(mean) - stddev = tf.identity(stddev) - rmin, rmax, dmax = [ - self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax'] - ] - if rmin is not None: - r = tf.maximum(r, rmin) - if rmax is not None: - r = tf.minimum(r, rmax) - if dmax is not None: - d = tf.maximum(d, -dmax) - d = tf.minimum(d, dmax) - # When not training, use r=1, d=0. - r = control_flow_util.smart_cond(training, lambda: r, - lambda: tf.ones_like(r)) - d = control_flow_util.smart_cond(training, lambda: d, - lambda: tf.zeros_like(d)) - - def _update_renorm_variable(var, value, inputs_size): - """Updates a moving average and weight, returns the unbiased value.""" - value = tf.identity(value) - - def _do_update(): - """Updates the var, returns the updated value.""" - new_var = self._assign_moving_average(var, value, self.renorm_momentum, - inputs_size) - return new_var - - def _fake_update(): - return tf.identity(var) - - return control_flow_util.smart_cond(training, _do_update, _fake_update) - - # TODO(yuefengz): colocate the operations - update_new_mean = _update_renorm_variable(self.renorm_mean, mean, - inputs_size) - update_new_stddev = _update_renorm_variable(self.renorm_stddev, stddev, - inputs_size) - - # Update the inference mode moving averages with the batch value. 
-      with tf.control_dependencies([update_new_mean, update_new_stddev]):
-        out_mean = tf.identity(mean)
-        out_variance = tf.identity(variance)
-
-    return (r, d, out_mean, out_variance)
-
-  def _calculate_mean_and_var(self, inputs, reduction_axes, keep_dims):
-    return tf.nn.moments(inputs, reduction_axes, keepdims=keep_dims)
-
-  def _moments(self, inputs, reduction_axes, keep_dims):
-    mean, variance = self._calculate_mean_and_var(inputs, reduction_axes,
-                                                  keep_dims)
-    # TODO(b/129279393): Support zero batch input in non DistributionStrategy
-    # code as well.
-    if self._support_zero_size_input():
-      input_batch_size = tf.shape(inputs)[0]
-      mean = tf.where(input_batch_size > 0, mean, backend.zeros_like(mean))
-      variance = tf.where(input_batch_size > 0, variance,
-                          backend.zeros_like(variance))
-    return mean, variance
-
-  def _get_training_value(self, training=None):
-    if training is None:
-      training = backend.learning_phase()
-    if self._USE_V2_BEHAVIOR:
-      if isinstance(training, int):
-        training = bool(training)
-      if not self.trainable:
-        # When the layer is not trainable, it overrides the value passed from
-        # model.
-        training = False
-    return training
-
-  def call(self, inputs, training=None):
-    inputs = tf.cast(inputs, self.compute_dtype)
-    training = self._get_training_value(training)
-
-    if self.virtual_batch_size is not None:
-      # Virtual batches (aka ghost batches) can be simulated by reshaping the
-      # Tensor and reusing the existing batch norm implementation
-      original_shape = tf.shape(inputs)
-      original_shape = tf.concat(
-          [tf.constant([-1]), original_shape[1:]], axis=0)
-      expanded_shape = tf.concat([
-          tf.constant([self.virtual_batch_size, -1]),
-          original_shape[1:]
-      ], axis=0)
-
-      # Will cause errors if virtual_batch_size does not divide the batch size
-      inputs = tf.reshape(inputs, expanded_shape)
-
-      def undo_virtual_batching(outputs):
-        outputs = tf.reshape(outputs, original_shape)
+            # The following long block handles the mean/variance updates
+            # during the training stage across the various supported settings.
+            if self.adjustment:
+                adj_scale, adj_bias = self.adjustment(tf.shape(inputs))
+                # Adjust only during training.
+                adj_scale = control_flow_util.smart_cond(
+                    training, lambda: adj_scale, lambda: tf.ones_like(adj_scale)
+                )
+                adj_bias = control_flow_util.smart_cond(
+                    training, lambda: adj_bias, lambda: tf.zeros_like(adj_bias)
+                )
+                scale, offset = _compose_transforms(
+                    adj_scale, adj_bias, scale, offset
+                )
+
+            # Some of the computations here are not necessary when
+            # training==False but `training` is not a constant. However, this
+            # makes the code simpler.
+            keep_dims = (
+                self.virtual_batch_size is not None or len(self.axis) > 1
+            )
+            mean, variance = self._moments(
+                tf.cast(inputs, self._param_dtype),
+                reduction_axes,
+                keep_dims=keep_dims,
+                mask=mask,
+            )
+
+            moving_mean = self.moving_mean
+            moving_variance = self.moving_variance
+
+            mean = control_flow_util.smart_cond(
+                training,
+                lambda: mean,
+                lambda: tf.convert_to_tensor(moving_mean),
+            )
+            variance = control_flow_util.smart_cond(
+                training,
+                lambda: variance,
+                lambda: tf.convert_to_tensor(moving_variance),
+            )
+
+            if self.virtual_batch_size is not None:
+                # This isn't strictly correct since in ghost batch norm, you are
+                # supposed to sequentially update the moving_mean and
+                # moving_variance with each sub-batch. However, since the moving
+                # statistics are only used during evaluation, it is more
+                # efficient to just update in one step and should not make a
+                # significant difference in the result.
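+                # `mean` and `variance` hold one entry per virtual sub-batch
+                # along axis 1 (which is excluded from `reduction_axes`);
+                # averaging over that axis collapses them into the single
+                # statistic that the layer tracks.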
+ new_mean = tf.reduce_mean(mean, axis=1, keepdims=True) + new_variance = tf.reduce_mean(variance, axis=1, keepdims=True) + else: + if ( + utils.running_with_dtensor_strategy() + and not self.synchronized + ): + new_mean = tf.math.reduce_mean(mean, axis=reduction_axes) + new_variance = tf.math.reduce_mean( + variance, axis=reduction_axes + ) + else: + new_mean, new_variance = mean, variance + + if self._support_zero_size_input(): + # Keras assumes that batch dimension is the first dimension for + # Batch Normalization. + input_batch_size = tf.shape(inputs)[0] + else: + input_batch_size = None + + if self.renorm: + ( + r, + d, + new_mean, + new_variance, + ) = self._renorm_correction_and_moments( + new_mean, new_variance, training, input_batch_size + ) + # When training, the normalized values (say, x) will be + # transformed as x * gamma + beta without renorm, and (x * r + + # d) * gamma + beta = x * (r * gamma) + (d * gamma + beta) with + # renorm. + r = _broadcast(tf.stop_gradient(r, name="renorm_r")) + d = _broadcast(tf.stop_gradient(d, name="renorm_d")) + scale, offset = _compose_transforms(r, d, scale, offset) + + def _do_update(var, value): + """Compute the updates for mean and variance.""" + return self._assign_moving_average( + var, value, self.momentum, input_batch_size + ) + + def mean_update(): + true_branch = lambda: _do_update(self.moving_mean, new_mean) + false_branch = lambda: self.moving_mean + return control_flow_util.smart_cond( + training, true_branch, false_branch + ) + + def variance_update(): + """Update the moving variance.""" + + def true_branch_renorm(): + # We apply epsilon as part of the moving_stddev to mirror + # the training code path. + moving_stddev = _do_update( + self.moving_stddev, tf.sqrt(new_variance + self.epsilon) + ) + return self._assign_new_value( + self.moving_variance, + # Apply relu in case floating point rounding causes it + # to go negative. + backend.relu( + moving_stddev * moving_stddev - self.epsilon + ), + ) + + if self.renorm: + true_branch = true_branch_renorm + else: + true_branch = lambda: _do_update( + self.moving_variance, new_variance + ) + + false_branch = lambda: self.moving_variance + return control_flow_util.smart_cond( + training, true_branch, false_branch + ) + + self.add_update(mean_update) + self.add_update(variance_update) + # End of handling mean/variance calculation and update. + + mean = tf.cast(mean, inputs.dtype) + variance = tf.cast(variance, inputs.dtype) + if offset is not None: + offset = tf.cast(offset, inputs.dtype) + if scale is not None: + scale = tf.cast(scale, inputs.dtype) + outputs = tf.nn.batch_normalization( + inputs, + _broadcast(mean), + _broadcast(variance), + offset, + scale, + self.epsilon, + ) + if inputs_dtype in (tf.float16, tf.bfloat16): + outputs = tf.cast(outputs, inputs_dtype) + + # If some components of the shape got lost due to adjustments, fix that. + outputs.set_shape(input_shape) + + if self.virtual_batch_size is not None: + outputs = undo_virtual_batching(outputs) return outputs - if self.fused: - outputs = self._fused_batch_norm(inputs, training=training) - if self.virtual_batch_size is not None: - # Currently never reaches here since fused_batch_norm does not support - # virtual batching - outputs = undo_virtual_batching(outputs) - return outputs - - inputs_dtype = inputs.dtype.base_dtype - if inputs_dtype in (tf.float16, tf.bfloat16): - # Do all math in float32 if given 16-bit inputs for numeric stability. 
- # In particular, it's very easy for variance to overflow in float16 and - # for safety we also choose to cast bfloat16 to float32. - inputs = tf.cast(inputs, tf.float32) - - # Compute the axes along which to reduce the mean / variance - input_shape = inputs.shape - ndims = len(input_shape) - reduction_axes = [i for i in range(ndims) if i not in self.axis] - if self.virtual_batch_size is not None: - del reduction_axes[1] # Do not reduce along virtual batch dim - - # Broadcasting only necessary for single-axis batch norm where the axis is - # not the last dimension - broadcast_shape = [1] * ndims - broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value - - def _broadcast(v): - if (v is not None and len(v.shape) != ndims and - reduction_axes != list(range(ndims - 1))): - return tf.reshape(v, broadcast_shape) - return v - - scale, offset = _broadcast(self.gamma), _broadcast(self.beta) - - def _compose_transforms(scale, offset, then_scale, then_offset): - if then_scale is not None: - scale *= then_scale - offset *= then_scale - if then_offset is not None: - offset += then_offset - return (scale, offset) - - # Determine a boolean value for `training`: could be True, False, or None. - training_value = control_flow_util.constant_value(training) - if training_value == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison - mean, variance = self.moving_mean, self.moving_variance - else: - if self.adjustment: - adj_scale, adj_bias = self.adjustment(tf.shape(inputs)) - # Adjust only during training. - adj_scale = control_flow_util.smart_cond( - training, lambda: adj_scale, lambda: tf.ones_like(adj_scale)) - adj_bias = control_flow_util.smart_cond( - training, lambda: adj_bias, lambda: tf.zeros_like(adj_bias)) - scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset) - - # Some of the computations here are not necessary when training==False - # but not a constant. However, this makes the code simpler. - keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1 - mean, variance = self._moments( - tf.cast(inputs, self._param_dtype), - reduction_axes, - keep_dims=keep_dims) - - moving_mean = self.moving_mean - moving_variance = self.moving_variance - - mean = control_flow_util.smart_cond( - training, lambda: mean, - lambda: tf.convert_to_tensor(moving_mean)) - variance = control_flow_util.smart_cond( - training, lambda: variance, - lambda: tf.convert_to_tensor(moving_variance)) - - if self.virtual_batch_size is not None: - # This isn't strictly correct since in ghost batch norm, you are - # supposed to sequentially update the moving_mean and moving_variance - # with each sub-batch. However, since the moving statistics are only - # used during evaluation, it is more efficient to just update in one - # step and should not make a significant difference in the result. - new_mean = tf.reduce_mean(mean, axis=1, keepdims=True) - new_variance = tf.reduce_mean(variance, axis=1, keepdims=True) - else: - new_mean, new_variance = mean, variance - - if self._support_zero_size_input(): - # Keras assumes that batch dimension is the first dimension for Batch - # Normalization. 
- input_batch_size = tf.shape(inputs)[0] - else: - input_batch_size = None - - if self.renorm: - r, d, new_mean, new_variance = self._renorm_correction_and_moments( - new_mean, new_variance, training, input_batch_size) - # When training, the normalized values (say, x) will be transformed as - # x * gamma + beta without renorm, and (x * r + d) * gamma + beta - # = x * (r * gamma) + (d * gamma + beta) with renorm. - r = _broadcast(tf.stop_gradient(r, name='renorm_r')) - d = _broadcast(tf.stop_gradient(d, name='renorm_d')) - scale, offset = _compose_transforms(r, d, scale, offset) - - def _do_update(var, value): - """Compute the updates for mean and variance.""" - return self._assign_moving_average(var, value, self.momentum, - input_batch_size) - - def mean_update(): - true_branch = lambda: _do_update(self.moving_mean, new_mean) - false_branch = lambda: self.moving_mean - return control_flow_util.smart_cond(training, true_branch, false_branch) - - def variance_update(): - """Update the moving variance.""" - - def true_branch_renorm(): - # We apply epsilon as part of the moving_stddev to mirror the training - # code path. - moving_stddev = _do_update(self.moving_stddev, - tf.sqrt(new_variance + self.epsilon)) - return self._assign_new_value( - self.moving_variance, - # Apply relu in case floating point rounding causes it to go - # negative. - backend.relu(moving_stddev * moving_stddev - self.epsilon)) - + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "axis": self.axis, + "momentum": self.momentum, + "epsilon": self.epsilon, + "center": self.center, + "scale": self.scale, + "beta_initializer": initializers.serialize(self.beta_initializer), + "gamma_initializer": initializers.serialize(self.gamma_initializer), + "moving_mean_initializer": initializers.serialize( + self.moving_mean_initializer + ), + "moving_variance_initializer": initializers.serialize( + self.moving_variance_initializer + ), + "beta_regularizer": regularizers.serialize(self.beta_regularizer), + "gamma_regularizer": regularizers.serialize(self.gamma_regularizer), + "beta_constraint": constraints.serialize(self.beta_constraint), + "gamma_constraint": constraints.serialize(self.gamma_constraint), + } + # Only add TensorFlow-specific parameters if they are set, so as to + # preserve model compatibility with external Keras. if self.renorm: - true_branch = true_branch_renorm + config["renorm"] = True + config["renorm_clipping"] = self.renorm_clipping + config["renorm_momentum"] = self.renorm_momentum + if self.virtual_batch_size is not None: + config["virtual_batch_size"] = self.virtual_batch_size + # Note: adjustment is not serializable. + if self.adjustment is not None: + logging.warning( + "The `adjustment` function of this `BatchNormalization` " + "layer cannot be serialized and has been omitted from " + "the layer config. It will not be included when " + "re-creating the layer from the saved config." + ) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + ######################## Start of private methods ########################## + def _support_zero_size_input(self): + if not tf.distribute.has_strategy(): + return False + strategy = tf.distribute.get_strategy() + # TODO(b/195085185): remove experimental_enable_get_next_as_optional + # after migrating all users. 
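+        # Prefer the newer `enable_partial_batch_handling` flag, fall back to
+        # the experimental flag, and default to False when neither is set on
+        # the strategy.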
+ return getattr( + strategy.extended, + "enable_partial_batch_handling", + getattr( + strategy.extended, + "experimental_enable_get_next_as_optional", + False, + ), + ) + + def _assign_moving_average(self, variable, value, momentum, inputs_size): + def calculate_update_delta(): + decay = tf.convert_to_tensor(1.0 - momentum, name="decay") + if decay.dtype != variable.dtype.base_dtype: + decay = tf.cast(decay, variable.dtype.base_dtype) + update_delta = (variable - tf.cast(value, variable.dtype)) * decay + if inputs_size is not None: + update_delta = tf.where( + inputs_size > 0, + update_delta, + backend.zeros_like(update_delta), + ) + return update_delta + + with backend.name_scope("AssignMovingAvg") as scope: + if tf.compat.v1.executing_eagerly_outside_functions(): + return variable.assign_sub(calculate_update_delta(), name=scope) + else: + with tf.compat.v1.colocate_with(variable): + return tf.compat.v1.assign_sub( + variable, calculate_update_delta(), name=scope + ) + + def _assign_new_value(self, variable, value): + with backend.name_scope("AssignNewValue") as scope: + if tf.compat.v1.executing_eagerly_outside_functions(): + return variable.assign(value, name=scope) + else: + with tf.compat.v1.colocate_with(variable): + return tf.compat.v1.assign(variable, value, name=scope) + + def _fused_batch_norm(self, inputs, mask, training): + """Returns the output of fused batch norm.""" + if mask is not None: + warnings.warn( + "Masking is not supported with `fused=True`. " + "You should either turn off fusing " + "(`fused=False`) or you should not pass a `mask` " + "argument when calling the layer. " + "For the moment `mask` will be ignored for the " + "normalization." + ) + if self.center: + beta = self.beta else: - true_branch = lambda: _do_update(self.moving_variance, new_variance) - - false_branch = lambda: self.moving_variance - return control_flow_util.smart_cond(training, true_branch, false_branch) - - self.add_update(mean_update) - self.add_update(variance_update) - - mean = tf.cast(mean, inputs.dtype) - variance = tf.cast(variance, inputs.dtype) - if offset is not None: - offset = tf.cast(offset, inputs.dtype) - if scale is not None: - scale = tf.cast(scale, inputs.dtype) - outputs = tf.nn.batch_normalization(inputs, _broadcast(mean), - _broadcast(variance), offset, scale, - self.epsilon) - if inputs_dtype in (tf.float16, tf.bfloat16): - outputs = tf.cast(outputs, inputs_dtype) - - # If some components of the shape got lost due to adjustments, fix that. 
- outputs.set_shape(input_shape) - - if self.virtual_batch_size is not None: - outputs = undo_virtual_batching(outputs) - return outputs - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'axis': self.axis, - 'momentum': self.momentum, - 'epsilon': self.epsilon, - 'center': self.center, - 'scale': self.scale, - 'beta_initializer': initializers.serialize(self.beta_initializer), - 'gamma_initializer': initializers.serialize(self.gamma_initializer), - 'moving_mean_initializer': - initializers.serialize(self.moving_mean_initializer), - 'moving_variance_initializer': - initializers.serialize(self.moving_variance_initializer), - 'beta_regularizer': regularizers.serialize(self.beta_regularizer), - 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), - 'beta_constraint': constraints.serialize(self.beta_constraint), - 'gamma_constraint': constraints.serialize(self.gamma_constraint) - } - # Only add TensorFlow-specific parameters if they are set, so as to preserve - # model compatibility with external Keras. - if self.renorm: - config['renorm'] = True - config['renorm_clipping'] = self.renorm_clipping - config['renorm_momentum'] = self.renorm_momentum - if self.virtual_batch_size is not None: - config['virtual_batch_size'] = self.virtual_batch_size - # Note: adjustment is not serializable. - if self.adjustment is not None: - logging.warning('The `adjustment` function of this `BatchNormalization` ' - 'layer cannot be serialized and has been omitted from ' - 'the layer config. It will not be included when ' - 're-creating the layer from the saved config.') - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -# pylint: disable=g-classes-have-attributes -@keras_export('keras.layers.experimental.SyncBatchNormalization', v1=[]) -class SyncBatchNormalization(BatchNormalizationBase): - r"""Normalize and scale inputs or activations synchronously across replicas. - - Applies batch normalization to activations of the previous layer at each batch - by synchronizing the global batch statistics across all devices that are - training the model. For specific details about batch normalization please - refer to the `tf.keras.layers.BatchNormalization` layer docs. - - If this layer is used when using tf.distribute strategy to train models - across devices/workers, there will be an allreduce call to aggregate batch - statistics across all replicas at every training step. Without tf.distribute - strategy, this layer behaves as a regular `tf.keras.layers.BatchNormalization` - layer. - - Example usage: - - ```python - strategy = tf.distribute.MirroredStrategy() - - with strategy.scope(): - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(16)) - model.add(tf.keras.layers.experimental.SyncBatchNormalization()) - ``` - - Args: - axis: Integer, the axis that should be normalized - (typically the features axis). - For instance, after a `Conv2D` layer with - `data_format="channels_first"`, - set `axis=1` in `BatchNormalization`. - momentum: Momentum for the moving average. - epsilon: Small float added to variance to avoid dividing by zero. - center: If True, add offset of `beta` to normalized tensor. - If False, `beta` is ignored. - scale: If True, multiply by `gamma`. - If False, `gamma` is not used. - When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling - will be done by the next layer. - beta_initializer: Initializer for the beta weight. 
- gamma_initializer: Initializer for the gamma weight. - moving_mean_initializer: Initializer for the moving mean. - moving_variance_initializer: Initializer for the moving variance. - beta_regularizer: Optional regularizer for the beta weight. - gamma_regularizer: Optional regularizer for the gamma weight. - beta_constraint: Optional constraint for the beta weight. - gamma_constraint: Optional constraint for the gamma weight. - - Call arguments: - inputs: Input tensor (of any rank). - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. - - `training=True`: The layer will normalize its inputs using the - mean and variance of the current batch of inputs. - - `training=False`: The layer will normalize its inputs using the - mean and variance of its moving statistics, learned during training. - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. - - Output shape: - Same shape as input. - - """ - - def __init__(self, - axis=-1, - momentum=0.99, - epsilon=1e-3, - center=True, - scale=True, - beta_initializer='zeros', - gamma_initializer='ones', - moving_mean_initializer='zeros', - moving_variance_initializer='ones', - beta_regularizer=None, - gamma_regularizer=None, - beta_constraint=None, - gamma_constraint=None, - **kwargs): - if kwargs.pop('fused', None): - raise ValueError( - '`fused` argument cannot be True for SyncBatchNormalization.') - - # Currently we only support aggregating over the global batch size. - super().__init__( - axis=axis, - momentum=momentum, - epsilon=epsilon, - center=center, - scale=scale, - beta_initializer=beta_initializer, - gamma_initializer=gamma_initializer, - moving_mean_initializer=moving_mean_initializer, - moving_variance_initializer=moving_variance_initializer, - beta_regularizer=beta_regularizer, - gamma_regularizer=gamma_regularizer, - beta_constraint=beta_constraint, - gamma_constraint=gamma_constraint, - fused=False, - **kwargs) - - def _calculate_mean_and_var(self, x, axes, keep_dims): - - with backend.name_scope('moments'): - # The dynamic range of fp16 is too limited to support the collection of - # sufficient statistics. As a workaround we simply perform the operations - # on 32-bit floats before converting the mean and variance back to fp16 - y = tf.cast(x, tf.float32) if x.dtype == tf.float16 else x - replica_ctx = tf.distribute.get_replica_context() - if replica_ctx: - local_sum = tf.reduce_sum(y, axis=axes, keepdims=True) - local_squared_sum = tf.reduce_sum(tf.square(y), axis=axes, - keepdims=True) - batch_size = tf.cast(tf.shape(y)[axes[0]], - tf.float32) - # TODO(b/163099951): batch the all-reduces once we sort out the ordering - # issue for NCCL. We don't have a mechanism to launch NCCL in the same - # order in each replica nowadays, so we limit NCCL to batch all-reduces. 
- y_sum = replica_ctx.all_reduce(tf.distribute.ReduceOp.SUM, local_sum) - y_squared_sum = replica_ctx.all_reduce(tf.distribute.ReduceOp.SUM, - local_squared_sum) - global_batch_size = replica_ctx.all_reduce(tf.distribute.ReduceOp.SUM, - batch_size) - - axes_vals = [(tf.shape(y))[axes[i]] - for i in range(1, len(axes))] - multiplier = tf.cast(tf.reduce_prod(axes_vals), - tf.float32) - multiplier = multiplier * global_batch_size - - mean = y_sum / multiplier - y_squared_mean = y_squared_sum / multiplier - # var = E(x^2) - E(x)^2 - variance = y_squared_mean - tf.square(mean) - else: - # Compute true mean while keeping the dims for proper broadcasting. - mean = tf.reduce_mean(y, axes, keepdims=True, name='mean') - # sample variance, not unbiased variance - # Note: stop_gradient does not change the gradient that gets - # backpropagated to the mean from the variance calculation, - # because that gradient is zero - variance = tf.reduce_mean( - tf.math.squared_difference(y, tf.stop_gradient(mean)), - axes, - keepdims=True, - name='variance') - if not keep_dims: - mean = tf.squeeze(mean, axes) - variance = tf.squeeze(variance, axes) - if x.dtype == tf.float16: - return (tf.cast(mean, tf.float16), - tf.cast(variance, tf.float16)) - else: - return (mean, variance) - - -@keras_export('keras.layers.BatchNormalization', v1=[]) + beta = backend.constant( + 0.0, dtype=self._param_dtype, shape=self._param_shape + ) + if self.scale: + gamma = self.gamma + else: + gamma = backend.constant( + 1.0, dtype=self._param_dtype, shape=self._param_shape + ) + + # TODO(b/129279393): Support zero batch input in non + # DistributionStrategy code as well. + if self._support_zero_size_input(): + # Keras assumes that batch dimension is the first dimension for + # Batch Normalization. + input_batch_size = tf.shape(inputs)[0] + else: + input_batch_size = None + + # TODO(rmlarsen): Support using fused avg updates for non-eager + # execution after fixing graph pattern matching and enabling + # fused_batch_norm to take exponential_avg_factor as a tensor input. + use_fused_avg_updates = ( + tf.compat.v1.executing_eagerly_outside_functions() + and isinstance(self.momentum, (float, int)) + and get_enclosing_xla_context() is None + ) + if use_fused_avg_updates: + exponential_avg_factor = 1.0 - self.momentum + else: + exponential_avg_factor = None + + def _maybe_add_or_remove_bessels_correction(variance, remove=True): + r"""Add or remove Bessel's correction.""" + # Removes Bessel's correction if remove == True, adds it otherwise. + # This is to be consistent with non-fused batch norm. Note that the + # variance computed by fused batch norm is with Bessel's correction. + # This is only used in legacy V1 batch norm tests. 
+ if self._bessels_correction_test_only: + return variance + sample_size = tf.cast( + tf.size(inputs) / tf.size(variance), variance.dtype + ) + if remove: + factor = ( + sample_size - tf.cast(1.0, variance.dtype) + ) / sample_size + else: + factor = sample_size / ( + sample_size - tf.cast(1.0, variance.dtype) + ) + return variance * factor + + def _fused_batch_norm_training(): + return tf.compat.v1.nn.fused_batch_norm( + inputs, + gamma, + beta, + mean=self.moving_mean, + variance=_maybe_add_or_remove_bessels_correction( + self.moving_variance, remove=False + ), + epsilon=self.epsilon, + is_training=True, + data_format=self._data_format, + exponential_avg_factor=exponential_avg_factor, + ) + + def _fused_batch_norm_inference(): + return tf.compat.v1.nn.fused_batch_norm( + inputs, + gamma, + beta, + mean=self.moving_mean, + variance=self.moving_variance, + epsilon=self.epsilon, + is_training=False, + data_format=self._data_format, + ) + + output, mean, variance = control_flow_util.smart_cond( + training, _fused_batch_norm_training, _fused_batch_norm_inference + ) + variance = _maybe_add_or_remove_bessels_correction( + variance, remove=True + ) + + training_value = control_flow_util.constant_value(training) + if training_value or training_value is None: + if not use_fused_avg_updates: + if training_value is None: + momentum = control_flow_util.smart_cond( + training, lambda: self.momentum, lambda: 1.0 + ) + else: + momentum = tf.convert_to_tensor(self.momentum) + + def mean_update(): + """Update self.moving_mean with the most recent data point.""" + if use_fused_avg_updates: + if input_batch_size is not None: + new_mean = control_flow_util.smart_cond( + input_batch_size > 0, + lambda: mean, + lambda: self.moving_mean, + ) + else: + new_mean = mean + return self._assign_new_value(self.moving_mean, new_mean) + else: + return self._assign_moving_average( + self.moving_mean, mean, momentum, input_batch_size + ) + + def variance_update(): + """Update self.moving_variance with the most recent data + point.""" + if use_fused_avg_updates: + if input_batch_size is not None: + new_variance = control_flow_util.smart_cond( + input_batch_size > 0, + lambda: variance, + lambda: self.moving_variance, + ) + else: + new_variance = variance + return self._assign_new_value( + self.moving_variance, new_variance + ) + else: + return self._assign_moving_average( + self.moving_variance, + variance, + momentum, + input_batch_size, + ) + + self.add_update(mean_update) + self.add_update(variance_update) + + return output + + def _renorm_correction_and_moments( + self, mean, variance, training, inputs_size + ): + """Returns the correction and update values for renorm.""" + stddev = tf.sqrt(variance + self.epsilon) + # Compute the average mean and standard deviation, as if they were + # initialized with this batch's moments. + renorm_mean = self.renorm_mean + # Avoid divide by zero early on in training. + renorm_stddev = tf.maximum(self.renorm_stddev, tf.sqrt(self.epsilon)) + # Compute the corrections for batch renorm. + r = stddev / renorm_stddev + d = (mean - renorm_mean) / renorm_stddev + # Ensure the corrections use pre-update moving averages. 
+ with tf.control_dependencies([r, d]): + mean = tf.identity(mean) + stddev = tf.identity(stddev) + rmin, rmax, dmax = [ + self.renorm_clipping.get(key) for key in ["rmin", "rmax", "dmax"] + ] + if rmin is not None: + r = tf.maximum(r, rmin) + if rmax is not None: + r = tf.minimum(r, rmax) + if dmax is not None: + d = tf.maximum(d, -dmax) + d = tf.minimum(d, dmax) + # When not training, use r=1, d=0. + r = control_flow_util.smart_cond( + training, lambda: r, lambda: tf.ones_like(r) + ) + d = control_flow_util.smart_cond( + training, lambda: d, lambda: tf.zeros_like(d) + ) + + def _update_renorm_variable(var, value, inputs_size): + """Updates a moving average and weight, returns the unbiased + value.""" + value = tf.identity(value) + + def _do_update(): + """Updates the var, returns the updated value.""" + new_var = self._assign_moving_average( + var, value, self.renorm_momentum, inputs_size + ) + return new_var + + def _fake_update(): + return tf.identity(var) + + return control_flow_util.smart_cond( + training, _do_update, _fake_update + ) + + # TODO(yuefengz): colocate the operations + update_new_mean = _update_renorm_variable( + self.renorm_mean, mean, inputs_size + ) + update_new_stddev = _update_renorm_variable( + self.renorm_stddev, stddev, inputs_size + ) + + # Update the inference mode moving averages with the batch value. + with tf.control_dependencies([update_new_mean, update_new_stddev]): + out_mean = tf.identity(mean) + out_variance = tf.identity(variance) + + return (r, d, out_mean, out_variance) + + def _calculate_mean_and_var( + self, inputs, reduction_axes, keep_dims, mask=None + ): + if self.synchronized: + return self._sync_calculate_mean_and_var( + inputs, reduction_axes, keep_dims, mask=mask + ) + return self._no_sync_calculate_mean_and_var( + inputs, reduction_axes, keep_dims, mask=mask + ) + + def _no_sync_calculate_mean_and_var( + self, inputs, reduction_axes, keep_dims, mask=None + ): + if mask is None: + return tf.nn.moments(inputs, reduction_axes, keepdims=keep_dims) + else: + mask_weights = tf.cast( + mask, self.compute_dtype, name="mask_weights" + ) + mask_weights = tf.expand_dims( + mask_weights, axis=-1, name="mask_weights_broadcasted" + ) + return tf.nn.weighted_moments( + inputs, + axes=reduction_axes, + frequency_weights=mask_weights, + keepdims=keep_dims, + ) + + def _sync_calculate_mean_and_var( + self, x, reduction_axes, keep_dims, mask=None + ): + with backend.name_scope("moments"): + # The dynamic range of fp16 is too limited to support the collection + # of sufficient statistics. 
As a workaround we simply perform the + # operations on 32-bit floats before converting the mean and + # variance back to fp16 + y = tf.cast(x, tf.float32) if x.dtype == tf.float16 else x + replica_ctx = tf.distribute.get_replica_context() + + if not replica_ctx: + return self._no_sync_calculate_mean_and_var( + x, reduction_axes, keep_dims, mask=mask + ) + + if mask is not None: + mask_weights = tf.cast(mask, y.dtype, name="mask_weights") + mask_weights = tf.expand_dims( + mask_weights, axis=-1, name="mask_weights_broadcasted" + ) + y *= mask_weights + local_count = tf.broadcast_to( + mask_weights, tf.shape(y), name="count" + ) + else: + local_count = tf.ones_like(y, name="count") + + local_sum = tf.reduce_sum(y, axis=reduction_axes, keepdims=True) + local_squared_sum = tf.reduce_sum( + tf.square(y), axis=reduction_axes, keepdims=True + ) + local_count = tf.reduce_sum( + local_count, axis=reduction_axes, keepdims=True + ) + + # TODO(b/163099951): batch the all-reduces once we sort out the + # ordering issue for NCCL. We don't have a mechanism to launch + # NCCL in the same order in each replica nowadays, so we limit + # NCCL to batch all-reduces. + y_sum = replica_ctx.all_reduce( + tf.distribute.ReduceOp.SUM, local_sum + ) + y_squared_sum = replica_ctx.all_reduce( + tf.distribute.ReduceOp.SUM, local_squared_sum + ) + count_sum = replica_ctx.all_reduce( + tf.distribute.ReduceOp.SUM, local_count + ) + + mean = y_sum / count_sum + y_squared_mean = y_squared_sum / count_sum + # var = E(x^2) - E(x)^2 + variance = y_squared_mean - tf.square(mean) + if not keep_dims: + mean = tf.squeeze(mean, reduction_axes) + variance = tf.squeeze(variance, reduction_axes) + if x.dtype == tf.float16: + return ( + tf.cast(mean, tf.float16), + tf.cast(variance, tf.float16), + ) + else: + return (mean, variance) + + def _dtensor_calculate_mean_and_var( + self, inputs, reduction_axes, keep_dims, mask=None + ): + if self.synchronized: + return self._dtensor_sync_calculate_mean_and_var( + inputs, reduction_axes, keep_dims, mask=mask + ) + return self._dtensor_no_sync_calculate_mean_and_var( + inputs, reduction_axes, keep_dims, mask=mask + ) + + def _dtensor_no_sync_calculate_mean_and_var( + self, inputs, reduction_axes, keep_dims, mask=None + ): + replica_tensor = _expand_tensor_with_local_replica_group(inputs) + local_batch_size = tf.shape(replica_tensor)[1] + + # Since we added a new axis in the beginning, all the value in + # reduction_axes need to be incremented by 1. + updated_reduction_axes = [n + 1 for n in reduction_axes] + + if mask is None: + mean, var = tf.nn.moments( + replica_tensor, updated_reduction_axes, keepdims=keep_dims + ) + else: + mask_weights = tf.cast( + mask, self.compute_dtype, name="mask_weights" + ) + mask_weights = tf.expand_dims( + mask_weights, axis=-1, name="mask_weights_broadcasted" + ) + mask_weights = _expand_tensor_with_local_replica_group(mask_weights) + mean, var = tf.nn.weighted_moments( + replica_tensor, + axes=updated_reduction_axes, + frequency_weights=mask_weights, + keepdims=keep_dims, + ) + # Also note that the mean/var we have here will have an extra dim in + # axis 0, which is represented for num local replica. Down the + # stream, the mean/var will be used to update the moving_mean/var + # and also normalize the inputs. To make the shape match, we will + # expand the tensor shape from [num_replica, x, y] to + # [batch_size, x, y] so that it can be properly used for + # normalization. 
When it reaches the mean/var update, a separate + # logic will be there to reduce_mean the value based on the batch + # dim. + mean = tf.repeat(mean, local_batch_size, axis=0) + var = tf.repeat(var, local_batch_size, axis=0) + if not keep_dims: + # We need to fill the reduced dims so that the mean/var can be + # properly broadcast to the input shapes. In the example above, + # the original reduction_axes is [0, 1]. We ignore the first 0 + # (batch dim) here since we already expand and use it as num_replica + for dim in reduction_axes[1:]: + mean = tf.expand_dims(mean, axis=dim) + var = tf.expand_dims(var, axis=dim) + return mean, var + + def _dtensor_sync_calculate_mean_and_var( + self, inputs, reduction_axes, keep_dims, mask=None + ): + # In the DTensor sync BN, since the input tensor is already in global + # context, we just need to use the normal moments/weighted_moments + # to calculate mean/var, which is same as the non-sync BN in the normal + # mode. + return self._no_sync_calculate_mean_and_var( + inputs, reduction_axes, keep_dims, mask + ) + + def _moments(self, inputs, reduction_axes, keep_dims, mask=None): + if utils.running_with_dtensor_strategy(): + mean, variance = self._dtensor_calculate_mean_and_var( + inputs, reduction_axes, keep_dims, mask=mask + ) + else: + mean, variance = self._calculate_mean_and_var( + inputs, reduction_axes, keep_dims, mask=mask + ) + # TODO(b/129279393): Support zero batch input in non + # DistributionStrategy code as well. + if self._support_zero_size_input(): + input_batch_size = tf.shape(inputs)[0] + mean = tf.where( + input_batch_size > 0, mean, backend.zeros_like(mean) + ) + variance = tf.where( + input_batch_size > 0, variance, backend.zeros_like(variance) + ) + return mean, variance + + def _get_training_value(self, training=None): + if training is None: + training = backend.learning_phase() + if self._USE_V2_BEHAVIOR: + if isinstance(training, int): + training = bool(training) + if not self.trainable: + # When the layer is not trainable, it overrides the value passed + # from model. + training = False + return training + + +@keras_export("keras.layers.BatchNormalization", v1=[]) class BatchNormalization(BatchNormalizationBase): - """Layer that normalizes its inputs. - - Batch normalization applies a transformation that maintains the mean output - close to 0 and the output standard deviation close to 1. - - Importantly, batch normalization works differently during training and - during inference. - - **During training** (i.e. when using `fit()` or when calling the layer/model - with the argument `training=True`), the layer normalizes its output using - the mean and standard deviation of the current batch of inputs. That is to - say, for each channel being normalized, the layer returns - `gamma * (batch - mean(batch)) / sqrt(var(batch) + epsilon) + beta`, where: - - - `epsilon` is small constant (configurable as part of the constructor - arguments) - - `gamma` is a learned scaling factor (initialized as 1), which - can be disabled by passing `scale=False` to the constructor. - - `beta` is a learned offset factor (initialized as 0), which - can be disabled by passing `center=False` to the constructor. - - **During inference** (i.e. when using `evaluate()` or `predict()` or when - calling the layer/model with the argument `training=False` (which is the - default), the layer normalizes its output using a moving average of the - mean and standard deviation of the batches it has seen during training. 
That - is to say, it returns - `gamma * (batch - self.moving_mean) / sqrt(self.moving_var + epsilon) + beta`. - - `self.moving_mean` and `self.moving_var` are non-trainable variables that - are updated each time the layer in called in training mode, as such: - - - `moving_mean = moving_mean * momentum + mean(batch) * (1 - momentum)` - - `moving_var = moving_var * momentum + var(batch) * (1 - momentum)` - - As such, the layer will only normalize its inputs during inference - *after having been trained on data that has similar statistics as the - inference data*. - - Args: - axis: Integer, the axis that should be normalized (typically the features - axis). For instance, after a `Conv2D` layer with - `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. - momentum: Momentum for the moving average. - epsilon: Small float added to variance to avoid dividing by zero. - center: If True, add offset of `beta` to normalized tensor. If False, `beta` - is ignored. - scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the - next layer is linear (also e.g. `nn.relu`), this can be disabled since the - scaling will be done by the next layer. - beta_initializer: Initializer for the beta weight. - gamma_initializer: Initializer for the gamma weight. - moving_mean_initializer: Initializer for the moving mean. - moving_variance_initializer: Initializer for the moving variance. - beta_regularizer: Optional regularizer for the beta weight. - gamma_regularizer: Optional regularizer for the gamma weight. - beta_constraint: Optional constraint for the beta weight. - gamma_constraint: Optional constraint for the gamma weight. - - Call arguments: - inputs: Input tensor (of any rank). - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. - - `training=True`: The layer will normalize its inputs using the mean and - variance of the current batch of inputs. - - `training=False`: The layer will normalize its inputs using the mean and - variance of its moving statistics, learned during training. - - Input shape: - Arbitrary. Use the keyword argument `input_shape` (tuple of - integers, does not include the samples axis) when using this layer as the - first layer in a model. - - Output shape: - Same shape as input. - - Reference: - - [Ioffe and Szegedy, 2015](https://arxiv.org/abs/1502.03167). - - **About setting `layer.trainable = False` on a `BatchNormalization` layer:** - - The meaning of setting `layer.trainable = False` is to freeze the layer, - i.e. its internal state will not change during training: - its trainable weights will not be updated - during `fit()` or `train_on_batch()`, and its state updates will not be run. - - Usually, this does not necessarily mean that the layer is run in inference - mode (which is normally controlled by the `training` argument that can - be passed when calling a layer). "Frozen state" and "inference mode" - are two separate concepts. - - However, in the case of the `BatchNormalization` layer, **setting - `trainable = False` on the layer means that the layer will be - subsequently run in inference mode** (meaning that it will use - the moving mean and the moving variance to normalize the current batch, - rather than using the mean and variance of the current batch). - - This behavior has been introduced in TensorFlow 2.0, in order - to enable `layer.trainable = False` to produce the most commonly - expected behavior in the convnet fine-tuning use case. 
-
-  Note that:
-  - Setting `trainable` on an model containing other layers will
-    recursively set the `trainable` value of all inner layers.
-  - If the value of the `trainable`
-    attribute is changed after calling `compile()` on a model,
-    the new value doesn't take effect for this model
-    until `compile()` is called again.
-  """
-  _USE_V2_BEHAVIOR = True
-
-  @utils.allow_initializer_layout
-  def __init__(self,
-               axis=-1,
-               momentum=0.99,
-               epsilon=1e-3,
-               center=True,
-               scale=True,
-               beta_initializer='zeros',
-               gamma_initializer='ones',
-               moving_mean_initializer='zeros',
-               moving_variance_initializer='ones',
-               beta_regularizer=None,
-               gamma_regularizer=None,
-               beta_constraint=None,
-               gamma_constraint=None,
-               **kwargs):
-    super().__init__(
-        axis=axis,
-        momentum=momentum,
-        epsilon=epsilon,
-        center=center,
-        scale=scale,
-        beta_initializer=beta_initializer,
-        gamma_initializer=gamma_initializer,
-        moving_mean_initializer=moving_mean_initializer,
-        moving_variance_initializer=moving_variance_initializer,
-        beta_regularizer=beta_regularizer,
-        gamma_regularizer=gamma_regularizer,
-        beta_constraint=beta_constraint,
-        gamma_constraint=gamma_constraint,
-        **kwargs)
+    """Layer that normalizes its inputs.
+
+    Batch normalization applies a transformation that maintains the mean output
+    close to 0 and the output standard deviation close to 1.
+
+    Importantly, batch normalization works differently during training and
+    during inference.
+
+    **During training** (i.e. when using `fit()` or when calling the layer/model
+    with the argument `training=True`), the layer normalizes its output using
+    the mean and standard deviation of the current batch of inputs. That is to
+    say, for each channel being normalized, the layer returns
+    `gamma * (batch - mean(batch)) / sqrt(var(batch) + epsilon) + beta`, where:
+
+    - `epsilon` is a small constant (configurable as part of the constructor
+      arguments)
+    - `gamma` is a learned scaling factor (initialized as 1), which
+      can be disabled by passing `scale=False` to the constructor.
+    - `beta` is a learned offset factor (initialized as 0), which
+      can be disabled by passing `center=False` to the constructor.
+
+    **During inference** (i.e. when using `evaluate()` or `predict()` or when
+    calling the layer/model with the argument `training=False`, which is the
+    default), the layer normalizes its output using a moving average of the
+    mean and standard deviation of the batches it has seen during training. That
+    is to say, it returns
+    `gamma * (batch - self.moving_mean) / sqrt(self.moving_var+epsilon) + beta`.
+
+    `self.moving_mean` and `self.moving_var` are non-trainable variables that
+    are updated each time the layer is called in training mode, as follows:
+
+    - `moving_mean = moving_mean * momentum + mean(batch) * (1 - momentum)`
+    - `moving_var = moving_var * momentum + var(batch) * (1 - momentum)`
+
+    As such, the layer will only normalize its inputs during inference
+    *after having been trained on data that has similar statistics as the
+    inference data*.
+
+    When `synchronized=True` is set and if this layer is used within a
+    `tf.distribute` strategy, there will be an `allreduce` call
+    to aggregate batch statistics across all replicas at every
+    training step. Setting `synchronized` has no impact when the model is
+    trained without specifying any distribution strategy.
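[Editor's note] The `moving_*` update rule quoted above is implemented earlier in this diff by `_assign_moving_average`, which applies the algebraically identical subtractive form `variable -= (variable - value) * (1 - momentum)`. A quick NumPy sanity check of that equivalence, illustrative only and not part of the patch:

```python
import numpy as np

momentum = 0.99
moving_mean = np.array([0.5, -1.0])
batch_mean = np.array([1.0, 2.0])

# Form used in the docstring above:
#   moving = moving * momentum + batch * (1 - momentum)
docstring_form = moving_mean * momentum + batch_mean * (1 - momentum)

# Form used by _assign_moving_average:
#   moving = moving - (moving - batch) * (1 - momentum)
delta_form = moving_mean - (moving_mean - batch_mean) * (1 - momentum)

np.testing.assert_allclose(docstring_form, delta_form)
```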
+ + Example usage: + + ```python + strategy = tf.distribute.MirroredStrategy() + + with strategy.scope(): + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(16)) + model.add(tf.keras.layers.BatchNormalization(synchronized=True)) + ``` + + Args: + axis: Integer, the axis that should be normalized (typically the features + axis). For instance, after a `Conv2D` layer with + `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. + momentum: Momentum for the moving average. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, add offset of `beta` to normalized tensor. If False, + `beta` is ignored. + scale: If True, multiply by `gamma`. If False, `gamma` is not used. When + the next layer is linear (also e.g. `nn.relu`), this can be disabled + since the scaling will be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + moving_mean_initializer: Initializer for the moving mean. + moving_variance_initializer: Initializer for the moving variance. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + beta_constraint: Optional constraint for the beta weight. + gamma_constraint: Optional constraint for the gamma weight. + synchronized: If True, synchronizes the global batch statistics (mean and + variance) for the layer across all devices at each training step in a + distributed training strategy. If False, each replica uses its own + local batch statistics. Only relevant when used inside a + `tf.distribute` strategy. + + Call arguments: + inputs: Input tensor (of any rank). + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. + - `training=True`: The layer will normalize its inputs using the mean + and variance of the current batch of inputs. + - `training=False`: The layer will normalize its inputs using the mean + and variance of its moving statistics, learned during training. + + Input shape: + Arbitrary. Use the keyword argument `input_shape` (tuple of + integers, does not include the samples axis) when using this layer as the + first layer in a model. + + Output shape: + Same shape as input. + + Reference: + - [Ioffe and Szegedy, 2015](https://arxiv.org/abs/1502.03167). + + **About setting `layer.trainable = False` on a `BatchNormalization` layer:** + + The meaning of setting `layer.trainable = False` is to freeze the layer, + i.e. its internal state will not change during training: + its trainable weights will not be updated + during `fit()` or `train_on_batch()`, and its state updates will not be run. + + Usually, this does not necessarily mean that the layer is run in inference + mode (which is normally controlled by the `training` argument that can + be passed when calling a layer). "Frozen state" and "inference mode" + are two separate concepts. + + However, in the case of the `BatchNormalization` layer, **setting + `trainable = False` on the layer means that the layer will be + subsequently run in inference mode** (meaning that it will use + the moving mean and the moving variance to normalize the current batch, + rather than using the mean and variance of the current batch). + + This behavior has been introduced in TensorFlow 2.0, in order + to enable `layer.trainable = False` to produce the most commonly + expected behavior in the convnet fine-tuning use case. 
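[Editor's note] A minimal sketch of the freezing behavior described above (grounded in `_get_training_value`, which forces `training=False` for a non-trainable layer), assuming a TF 2.x eager environment; illustrative only, not part of the patch:

```python
import numpy as np
import tensorflow as tf

bn = tf.keras.layers.BatchNormalization()
x = np.random.normal(loc=5.0, scale=2.0, size=(32, 4)).astype("float32")
bn(x, training=True)  # builds the layer and updates the moving statistics

bn.trainable = False
frozen_mean = bn.moving_mean.numpy()
# With trainable=False the layer runs in inference mode even though
# training=True is passed, so the moving statistics stay frozen.
bn(x, training=True)
np.testing.assert_allclose(frozen_mean, bn.moving_mean.numpy())
```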
+
+    Note that:
+    - Setting `trainable` on a model containing other layers will
+      recursively set the `trainable` value of all inner layers.
+    - If the value of the `trainable`
+      attribute is changed after calling `compile()` on a model,
+      the new value doesn't take effect for this model
+      until `compile()` is called again.
+    """
+
+    _USE_V2_BEHAVIOR = True
+
+    @utils.allow_initializer_layout
+    def __init__(
+        self,
+        axis=-1,
+        momentum=0.99,
+        epsilon=1e-3,
+        center=True,
+        scale=True,
+        beta_initializer="zeros",
+        gamma_initializer="ones",
+        moving_mean_initializer="zeros",
+        moving_variance_initializer="ones",
+        beta_regularizer=None,
+        gamma_regularizer=None,
+        beta_constraint=None,
+        gamma_constraint=None,
+        synchronized=False,
+        **kwargs,
+    ):
+        # Currently we only support aggregating over the global batch size.
+        super().__init__(
+            axis=axis,
+            momentum=momentum,
+            epsilon=epsilon,
+            center=center,
+            scale=scale,
+            beta_initializer=beta_initializer,
+            gamma_initializer=gamma_initializer,
+            moving_mean_initializer=moving_mean_initializer,
+            moving_variance_initializer=moving_variance_initializer,
+            beta_regularizer=beta_regularizer,
+            gamma_regularizer=gamma_regularizer,
+            beta_constraint=beta_constraint,
+            gamma_constraint=gamma_constraint,
+            synchronized=synchronized,
+            **kwargs,
+        )
+
+
+@keras_export("keras.layers.experimental.SyncBatchNormalization", v1=[])
+@deprecation.deprecated_endpoints(
+    "keras.layers.experimental.SyncBatchNormalization"
+)
+class SyncBatchNormalization(BatchNormalizationBase):
+    """Deprecated. Please use `tf.keras.layers.BatchNormalization` instead.
+
+    Caution: the `tf.keras.layers.experimental.SyncBatchNormalization`
+    endpoint is deprecated and will be removed in a future release. Please use
+    `tf.keras.layers.BatchNormalization` with the parameter `synchronized`
+    set to True.
+    """
+
+    def __init__(
+        self,
+        axis=-1,
+        momentum=0.99,
+        epsilon=1e-3,
+        center=True,
+        scale=True,
+        beta_initializer="zeros",
+        gamma_initializer="ones",
+        moving_mean_initializer="zeros",
+        moving_variance_initializer="ones",
+        beta_regularizer=None,
+        gamma_regularizer=None,
+        beta_constraint=None,
+        gamma_constraint=None,
+        **kwargs,
+    ):
+        warning = (
+            "`tf.keras.layers.experimental.SyncBatchNormalization` endpoint is "
+            "deprecated and will be removed in a future release. Please use "
+            "`tf.keras.layers.BatchNormalization` with parameter "
+            "`synchronized` set to True."
+        )
+        logging.log_first_n(logging.WARN, warning, 1)
+        super().__init__(
+            axis=axis,
+            momentum=momentum,
+            epsilon=epsilon,
+            center=center,
+            scale=scale,
+            beta_initializer=beta_initializer,
+            gamma_initializer=gamma_initializer,
+            moving_mean_initializer=moving_mean_initializer,
+            moving_variance_initializer=moving_variance_initializer,
+            beta_regularizer=beta_regularizer,
+            gamma_regularizer=gamma_regularizer,
+            beta_constraint=beta_constraint,
+            gamma_constraint=gamma_constraint,
+            synchronized=True,
+            **kwargs,
+        )
+
+
+def _expand_tensor_with_local_replica_group(inputs):
+    """Reshape the input tensor to have an extra dimension of replica group.
+
+    Under DTensor usage, batch norm still needs to operate on a per-replica
+    (local) batch, which means we can't directly compute mean/var on a global
+    tensor. To compute a local mean/var, we have to add a new dimension to
+    the tensor, so that the ops will not cross the replica boundary.
+    E.g., a global tensor with shape [8, x, y] that has 2 local replicas
+    becomes [2, 4, x, y], where the first dim is the number of replicas and
+    the second dim is the local batch size. The following ops can then reduce
+    along the local batch dimension.
+
+    Note that this function should only be used under a DTensor-based
+    strategy, and it will use the current strategy in the context to get the
+    number of replicas.
+
+    Args:
+      inputs: Tensor with shape [global_batch_size, ...]
+
+    Returns:
+      Tensor with shape [num_replica, local_batch_size, ...]
+    """
+    # TODO(b/272382109): Implement this as an Op.
+    input_shape = tf.shape(inputs)
+    global_batch_size = input_shape[0]
+    num_replica = tf.distribute.get_strategy().num_replicas_in_sync
+    local_batch_size = global_batch_size // num_replica
+    replica_shape = tf.stack([num_replica, local_batch_size])
+    replica_shape = tf.concat([replica_shape, input_shape[1:]], axis=0)
+    return tf.reshape(inputs, replica_shape)
+
+
+def _raise_for_non_sync_bn_with_renorm_and_dtensor_strategy(
+    synchronized, training, renorm
+):
+    if (
+        utils.running_with_dtensor_strategy()
+        and not synchronized
+        and training == True
+        and renorm
+    ):
+        raise NotImplementedError(
+            "Renorm for BatchNormalization under DTensor based distribution "
+            "strategy is not supported at the moment. Please file a feature "
+            "request if this is blocking your adoption."
+        )
diff --git a/keras/layers/normalization/batch_normalization_dtensor_test.py b/keras/layers/normalization/batch_normalization_dtensor_test.py
new file mode 100644
index 000000000000..fffc914a672d
--- /dev/null
+++ b/keras/layers/normalization/batch_normalization_dtensor_test.py
@@ -0,0 +1,157 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for normalization layers under DTensor context."""
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
+
+from keras.dtensor import test_util
+from keras.dtensor import utils
+from keras.layers.normalization import batch_normalization
+from keras.testing_infra import test_utils
+
+# isort: off
+# Import the MirroredStrategy that is backed by DTensor.
+# It is not a public API yet, so we do a private symbol import for now.
+from tensorflow.python.distribute.experimental import ( + mirrored_strategy as dtensor_mirrored_strategy, +) + + +@test_utils.run_v2_only +class BatchNormalizationDTensorTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + + global_ids = test_util.create_device_ids_array((2,)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": tf.experimental.dtensor.Mesh( + ["batch"], + global_ids, + local_device_ids, + test_util.create_device_list((2,), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + + def test_strategy_backed_by_dtensor(self): + strategy = dtensor_mirrored_strategy.MirroredStrategy(mesh=self.mesh) + + with strategy.scope(): + self.assertTrue(utils.running_with_dtensor_strategy()) + + self.assertFalse(utils.running_with_dtensor_strategy()) + + normal_mirrored_strategy = tf.distribute.MirroredStrategy( + ["CPU:0", "CPU:1"] + ) + self.assertFalse(utils.running_with_dtensor_strategy()) + with normal_mirrored_strategy.scope(): + self.assertFalse(utils.running_with_dtensor_strategy()) + + @parameterized.product( + training=[True, False], + synchronized=[True, False], + renorm=[True, False], + use_mask=[True, False], + ) + def test_batch_normalization_with_dtensor_strategy( + self, training, synchronized, renorm, use_mask + ): + num_replica = 2 + local_batch_size = 4 + global_batch_size = num_replica * local_batch_size + feature_shape = [3, 5] + global_inputs = tf.random.uniform( + shape=[global_batch_size, *feature_shape], dtype=tf.float32 + ) + replica_inputs = tf.reshape( + global_inputs, [num_replica, local_batch_size, *feature_shape] + ) + if use_mask: + mask = tf.concat( + [ + tf.ones(shape=[global_batch_size, 2]), + tf.zeros(shape=[global_batch_size, 1]), + ], + axis=-1, + ) + mask = tf.cast(mask, tf.bool) + mask = tf.reshape(mask, [num_replica, local_batch_size, 3]) + + def value_fn(value_context): + return { + "inputs": replica_inputs[ + value_context.replica_id_in_sync_group + ], + "mask": mask[value_context.replica_id_in_sync_group], + } + + else: + + def value_fn(value_context): + return replica_inputs[value_context.replica_id_in_sync_group] + + normal_strategy = tf.distribute.MirroredStrategy(["CPU:0", "CPU:1"]) + dtensor_strategy = dtensor_mirrored_strategy.MirroredStrategy( + mesh=self.mesh + ) + init_kwargs = {"synchronized": synchronized, "renorm": renorm} + bn_layer_0 = batch_normalization.BatchNormalization(**init_kwargs) + bn_layer_1 = batch_normalization.BatchNormalization(**init_kwargs) + run_kwargs = {"training": training} + + normal_strategy_result = self._run_bn_training_with_strategy( + normal_strategy, value_fn, bn_layer_0, run_kwargs + ) + if training and not synchronized and renorm: + # This is an unsupported case at the moment. 
+ with self.assertRaisesRegexp(NotImplementedError, "not supported"): + self._run_bn_training_with_strategy( + dtensor_strategy, value_fn, bn_layer_1, run_kwargs + ) + return + else: + dtensor_strategy_result = self._run_bn_training_with_strategy( + dtensor_strategy, value_fn, bn_layer_1, run_kwargs + ) + self.assertAllClose( + normal_strategy_result.values, dtensor_strategy_result.values + ) + self.assertAllClose(bn_layer_0.moving_mean, bn_layer_1.moving_mean) + self.assertAllClose( + bn_layer_0.moving_variance, bn_layer_1.moving_variance + ) + + def _run_bn_training_with_strategy( + self, strategy, value_fn, bn_layer, run_kwargs + ): + @tf.function + def run_fn(inputs): + if isinstance(inputs, dict): + return bn_layer(**inputs, **run_kwargs) + return bn_layer(inputs, **run_kwargs) + + distributed_inputs = ( + strategy.experimental_distribute_values_from_function(value_fn) + ) + + return strategy.run(run_fn, args=(distributed_inputs,)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/normalization/batch_normalization_test.py b/keras/layers/normalization/batch_normalization_test.py index 885e9f30afbc..80ea097ca421 100644 --- a/keras/layers/normalization/batch_normalization_test.py +++ b/keras/layers/normalization/batch_normalization_test.py @@ -14,10 +14,9 @@ # ============================================================================== """Tests for normalization layers.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras from keras.layers.normalization import batch_normalization @@ -27,511 +26,648 @@ class BatchNormalizationTest(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes - def test_basic_batchnorm(self): - test_utils.layer_test( - keras.layers.BatchNormalization, - kwargs={ - 'momentum': 0.9, - 'epsilon': 0.1, - 'gamma_regularizer': keras.regularizers.l2(0.01), - 'beta_regularizer': keras.regularizers.l2(0.01) - }, - input_shape=(3, 4, 2)) - test_utils.layer_test( - keras.layers.BatchNormalization, - kwargs={ - 'gamma_initializer': 'ones', - 'beta_initializer': 'ones', - 'moving_mean_initializer': 'zeros', - 'moving_variance_initializer': 'ones' - }, - input_shape=(3, 4, 2)) - test_utils.layer_test( - keras.layers.BatchNormalization, - kwargs={'scale': False, - 'center': False}, - input_shape=(3, 3)) - test_utils.layer_test( - keras.layers.BatchNormalization, - kwargs={ - 'gamma_initializer': 'ones', - 'beta_initializer': 'ones', - 'moving_mean_initializer': 'zeros', - 'moving_variance_initializer': 'ones' - }, - input_shape=(3, 2, 4, 2)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_batchnorm_weights(self): - layer = keras.layers.BatchNormalization(scale=False, center=False) - layer.build((None, 3, 4)) - self.assertEqual(len(layer.trainable_weights), 0) - self.assertEqual(len(layer.weights), 2) - - layer = keras.layers.BatchNormalization() - layer.build((None, 3, 4)) - self.assertEqual(len(layer.trainable_weights), 2) - self.assertEqual(len(layer.weights), 4) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_batchnorm_regularization(self): - layer = keras.layers.BatchNormalization( - gamma_regularizer='l1', beta_regularizer='l1') - layer.build((None, 3, 4)) - self.assertEqual(len(layer.losses), 2) - max_norm = keras.constraints.max_norm - layer = keras.layers.BatchNormalization( - gamma_constraint=max_norm, beta_constraint=max_norm) - layer.build((None, 3, 
4)) - self.assertEqual(layer.gamma.constraint, max_norm) - self.assertEqual(layer.beta.constraint, max_norm) - - @test_combinations.run_all_keras_modes - def test_batchnorm_convnet(self): - if tf.test.is_gpu_available(cuda_only=True): - with self.session(): + @test_combinations.run_all_keras_modes + def test_basic_batchnorm(self): + test_utils.layer_test( + keras.layers.BatchNormalization, + kwargs={ + "momentum": 0.9, + "epsilon": 0.1, + "gamma_regularizer": keras.regularizers.l2(0.01), + "beta_regularizer": keras.regularizers.l2(0.01), + }, + input_shape=(3, 4, 2), + ) + test_utils.layer_test( + keras.layers.BatchNormalization, + kwargs={ + "gamma_initializer": "ones", + "beta_initializer": "ones", + "moving_mean_initializer": "zeros", + "moving_variance_initializer": "ones", + }, + input_shape=(3, 4, 2), + ) + test_utils.layer_test( + keras.layers.BatchNormalization, + kwargs={"scale": False, "center": False}, + input_shape=(3, 3), + ) + test_utils.layer_test( + keras.layers.BatchNormalization, + kwargs={ + "gamma_initializer": "ones", + "beta_initializer": "ones", + "moving_mean_initializer": "zeros", + "moving_variance_initializer": "ones", + }, + input_shape=(3, 2, 4, 2), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_batchnorm_weights(self): + layer = keras.layers.BatchNormalization(scale=False, center=False) + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 0) + self.assertEqual(len(layer.weights), 2) + + layer = keras.layers.BatchNormalization() + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 2) + self.assertEqual(len(layer.weights), 4) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_batchnorm_regularization(self): + layer = keras.layers.BatchNormalization( + gamma_regularizer="l1", beta_regularizer="l1" + ) + layer.build((None, 3, 4)) + self.assertEqual(len(layer.losses), 2) + max_norm = keras.constraints.max_norm + layer = keras.layers.BatchNormalization( + gamma_constraint=max_norm, beta_constraint=max_norm + ) + layer.build((None, 3, 4)) + self.assertEqual(layer.gamma.constraint, max_norm) + self.assertEqual(layer.beta.constraint, max_norm) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_batchnorm_sync_fused_error(self): + with self.assertRaises(ValueError): + _ = batch_normalization.BatchNormalization( + synchronized=True, fused=True + ) + + def _test_batchnorm_convnet(self, synchronized=False): + if tf.test.is_gpu_available(cuda_only=True): + with self.session(): + model = keras.models.Sequential() + norm = keras.layers.BatchNormalization( + axis=1, + input_shape=(3, 4, 4), + momentum=0.8, + synchronized=synchronized, + ) + model.add(norm) + model.compile( + loss="mse", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) + + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4)) + model.fit(x, x, epochs=4, verbose=0) + out = model.predict(x) + out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1)) + out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1)) + + np.testing.assert_allclose( + np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1 + ) + np.testing.assert_allclose( + np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1 + ) + + @test_combinations.run_all_keras_modes + def test_batchnorm_convnet(self): + self._test_batchnorm_convnet(synchronized=False) + + 
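[Editor's note] The new `test_batchnorm_sync_fused_error` above pins down that synchronized statistics are incompatible with the fused kernel. A standalone repro, assuming a TF build whose public `BatchNormalization` already accepts `synchronized` (2.12+); illustrative only, not part of the patch:

```python
import tensorflow as tf

# Synchronized batch statistics require the non-fused implementation,
# so requesting both should fail fast with a ValueError.
try:
    tf.keras.layers.BatchNormalization(synchronized=True, fused=True)
except ValueError as err:
    print(f"Raised as expected: {err}")
```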
@test_combinations.run_all_keras_modes + def test_batchnorm_convnet_synchronized(self): + self._test_batchnorm_convnet(synchronized=True) + + @test_combinations.run_all_keras_modes + def test_batchnorm_convnet_channel_last(self): model = keras.models.Sequential() norm = keras.layers.BatchNormalization( - axis=1, input_shape=(3, 4, 4), momentum=0.8) + axis=-1, input_shape=(4, 4, 3), momentum=0.8 + ) model.add(norm) model.compile( - loss='mse', + loss="mse", optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) + run_eagerly=test_utils.should_run_eagerly(), + ) # centered on 5.0, variance 10.0 - x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4)) + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3)) model.fit(x, x, epochs=4, verbose=0) out = model.predict(x) - out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1)) - out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1)) + out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3)) + out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3)) + + np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1) + np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1) + + @test_combinations.run_all_keras_modes + def test_batchnorm_correctness(self): + _run_batchnorm_correctness_test( + batch_normalization_v1.BatchNormalization, dtype="float32" + ) + _run_batchnorm_correctness_test( + batch_normalization.BatchNormalization, dtype="float32" + ) + _run_batchnorm_correctness_test( + batch_normalization.BatchNormalization, + dtype="float32", + synchronized=True, + ) + + @test_combinations.run_all_keras_modes + def test_batchnorm_float16(self): + _run_batchnorm_correctness_test( + batch_normalization_v1.BatchNormalization, dtype="float16" + ) + _run_batchnorm_correctness_test( + batch_normalization.BatchNormalization, dtype="float16" + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + @test_utils.enable_v2_dtype_behavior + def test_batchnorm_mixed_precision(self): + norm = keras.layers.BatchNormalization( + axis=-1, momentum=0.8, dtype="mixed_float16" + ) + x = np.random.normal(size=(10, 4, 4, 3)) + y = norm(x) + self.assertEqual(y.dtype, "float16") + self.assertEqual(norm.beta.dtype.base_dtype, "float32") + self.assertEqual(norm.gamma.dtype.base_dtype, "float32") + + x = np.arange(10 * 4 * 4 * 3).reshape((10, 4, 4, 3)) + y = norm(x) + self.assertEqual(y.dtype, "float16") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"], fused=[True, False]) + ) + @test_utils.enable_v2_dtype_behavior + def test_batchnorm_mixed_precision_does_not_overflow(self, fused): + norm = keras.layers.BatchNormalization( + axis=-1, input_shape=(1, 1, 1), fused=fused, dtype="mixed_float16" + ) + x = np.array([-1000.0, 1000.0]).reshape((2, 1, 1, 1)) + y = norm(x, training=True) + expected_y = np.array([-1.0, 1.0]).reshape((2, 1, 1, 1)) + self.assertAllClose(keras.backend.eval(y), expected_y) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_batchnorm_non_trainable_with_fit(self): + # We use the same data shape for all the data we use in this test. + # This will prevent any used tf.functions from retracing. + # This helps us verify that changing trainable and recompiling really + # does update the training loop, rather than a different data shape + # triggering a retrace. 
+ data_shape = (100, 3) + + inputs = keras.Input((3,)) + bn = batch_normalization.BatchNormalization() + outputs = bn(inputs) + model = keras.Model(inputs, outputs) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + model.fit(np.random.random(data_shape), np.random.random(data_shape)) - np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1) - np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1) + test_data = np.random.random(data_shape) + test_targets = np.random.random(data_shape) + test_loss = model.evaluate(test_data, test_targets) - @test_combinations.run_all_keras_modes - def test_batchnorm_convnet_channel_last(self): - model = keras.models.Sequential() - norm = keras.layers.BatchNormalization( - axis=-1, input_shape=(4, 4, 3), momentum=0.8) - model.add(norm) - model.compile( - loss='mse', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) + bn.trainable = False + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + train_loss = model.train_on_batch(test_data, test_targets) + self.assertAlmostEqual(test_loss, train_loss) + + @test_combinations.run_all_keras_modes + def test_batchnorm_ignore_masked_values(self): + padded_data = np.array( + [[[1, 5], [2, 5], [0, 0], [0, 0]] for _ in range(10)], + dtype="float32", + ) # Pad value of 0 + + inputs = keras.layers.Input((None, 2)) + masked = keras.layers.Masking()(inputs) + normed = keras.layers.BatchNormalization(momentum=0.0)(masked) + model = keras.models.Model(inputs, normed) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + model.fit(x=padded_data, y=padded_data, batch_size=10, epochs=5) + + self.assertAllEqual(model.layers[2].moving_mean, [1.5, 5.0]) + self.assertAllEqual(model.layers[2].moving_variance, [0.25, 0.0]) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_sync_batchnorm_with_mask(self): + padded_data = np.array( + [[[1, 5], [2, 5], [0, 0], [0, 0]] for _ in range(10)], + dtype="float32", + ) # Pad value of 0 + strategy = tf.distribute.MirroredStrategy(["CPU:0"]) + distributed_data = strategy.distribute_datasets_from_function( + dataset_fn=lambda _: tf.data.Dataset.from_tensors( + (padded_data, padded_data) + ).repeat(), + options=None, + ) + with strategy.scope(): + inputs = keras.layers.Input((None, 2)) + masked = keras.layers.Masking()(inputs) + normed = keras.layers.BatchNormalization( + momentum=0.0, synchronized=True + )(masked) + model = keras.models.Model(inputs, normed) + # MirroredStrategy will be very slow when run eagerly. 
+ model.compile("rmsprop", "mse", run_eagerly=False) + model.fit(distributed_data, steps_per_epoch=1, epochs=5) + + self.assertAllEqual(model.layers[2].moving_mean, [1.5, 5.0]) + self.assertAllEqual(model.layers[2].moving_variance, [0.25, 0.0]) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_eager_batchnorm_in_custom_model_call_with_tf_function(self): + class MyModel(keras.Model): + def __init__(self): + super().__init__() + self.bn = keras.layers.BatchNormalization() + + @tf.function() + def call(self, x, training): + return self.bn(x, training=training) + + model = MyModel() + + for _ in range(10): + x = tf.constant(0.5, shape=[1, 1]) + model(x, training=True) + + # Make sure the moving mean and variance have been updated + self.assertAllClose(model.bn.moving_mean.numpy(), [0.047], atol=3e-3) + self.assertAllClose(model.bn.moving_variance.numpy(), [0.9], atol=3e-2) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_bessels_correction(self): + # Bessel's correction is currently only used in the fused case. In the + # future, it may be used in the nonfused case as well. + + x = tf.constant([0.0, 2.0], shape=[2, 1, 1, 1]) + layer = batch_normalization.BatchNormalization( + momentum=0.5, moving_variance_initializer="zeros" + ) + layer(x, training=True) + self.assertTrue(layer.fused) + # Since fused is used, Bessel's correction is used. The variance of [0, + # 2] is 2 with Bessel's correction. Since the momentum is 0.5, the + # variance is 2 * 0.5 == 1. + self.assertAllEqual(self.evaluate(layer.moving_variance), [1.0]) + + x = tf.constant([0.0, 2.0], shape=[2, 1, 1, 1, 1]) + layer = batch_normalization.BatchNormalization( + momentum=0.5, moving_variance_initializer="zeros" + ) + layer(x, training=True) + self.assertTrue(layer.fused) + # Since fused is used, Bessel's correction is used. The variance of [0, + # 2] is 2 with Bessel's correction. Since the momentum is 0.5, the + # variance is 2 * 0.5 == 1. 
+ self.assertAllEqual(self.evaluate(layer.moving_variance), [1.0]) + + @test_combinations.run_all_keras_modes + def test_can_be_used_in_multiple_graphs(self): + norm = keras.layers.BatchNormalization( + scale=False, center=False, fused=True + ) - # centered on 5.0, variance 10.0 - x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3)) - model.fit(x, x, epochs=4, verbose=0) - out = model.predict(x) - out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3)) - out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3)) - - np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1) - np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1) - - @test_combinations.run_all_keras_modes - def test_batchnorm_correctness(self): - _run_batchnorm_correctness_test( - batch_normalization_v1.BatchNormalization, dtype='float32') - _run_batchnorm_correctness_test( - batch_normalization.BatchNormalization, dtype='float32') - - @test_combinations.run_all_keras_modes - def test_batchnorm_float16(self): - _run_batchnorm_correctness_test( - batch_normalization_v1.BatchNormalization, dtype='float16') - _run_batchnorm_correctness_test( - batch_normalization.BatchNormalization, dtype='float16') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - @test_utils.enable_v2_dtype_behavior - def test_batchnorm_mixed_precision(self): - norm = keras.layers.BatchNormalization( - axis=-1, - momentum=0.8, - dtype='mixed_float16') - x = np.random.normal(size=(10, 4, 4, 3)) - y = norm(x) - self.assertEqual(y.dtype, 'float16') - self.assertEqual(norm.beta.dtype.base_dtype, 'float32') - self.assertEqual(norm.gamma.dtype.base_dtype, 'float32') - - x = np.arange(10 * 4 * 4 * 3).reshape((10, 4, 4, 3)) - y = norm(x) - self.assertEqual(y.dtype, 'float16') - - @test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'], - fused=[True, False])) - @test_utils.enable_v2_dtype_behavior - def test_batchnorm_mixed_precision_does_not_overflow(self, fused): - norm = keras.layers.BatchNormalization( - axis=-1, - input_shape=(1, 1, 1), - fused=fused, - dtype='mixed_float16') - x = np.array([-1000., 1000.]).reshape((2, 1, 1, 1)) - y = norm(x, training=True) - expected_y = np.array([-1.0, 1.0]).reshape((2, 1, 1, 1)) - self.assertAllClose(keras.backend.eval(y), expected_y) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_batchnorm_non_trainable_with_fit(self): - # We use the same data shape for all the data we use in this test. - # This will prevent any used tf.functions from retracing. - # This helps us verify that changing trainable and recompiling really - # does update the training loop, rather than a different data shape - # triggering a retrace. 
- data_shape = (100, 3) - - inputs = keras.Input((3,)) - bn = batch_normalization.BatchNormalization() - outputs = bn(inputs) - model = keras.Model(inputs, outputs) - model.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(np.random.random(data_shape), np.random.random(data_shape)) + @tf.function + def fn1(x): + return norm(x, training=True) - test_data = np.random.random(data_shape) - test_targets = np.random.random(data_shape) - test_loss = model.evaluate(test_data, test_targets) + @tf.function + def fn2(x): + return norm(x, training=True) - bn.trainable = False - model.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - train_loss = model.train_on_batch(test_data, test_targets) - self.assertAlmostEqual(test_loss, train_loss) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_eager_batchnorm_in_custom_model_call_with_tf_function(self): - - class MyModel(keras.Model): - - def __init__(self): - super().__init__() - self.bn = keras.layers.BatchNormalization() - - @tf.function() - def call(self, x, training): - return self.bn(x, training=training) - - model = MyModel() - - for _ in range(10): - x = tf.constant(0.5, shape=[1, 1]) - model(x, training=True) - - # Make sure the moving mean and variance have been updated - self.assertAllClose(model.bn.moving_mean.numpy(), [0.047], atol=3e-3) - self.assertAllClose(model.bn.moving_variance.numpy(), [0.9], atol=3e-2) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_bessels_correction(self): - # Bessel's correction is currently only used in the fused case. In the - # future, it may be used in the nonfused case as well. - - x = tf.constant([0., 2.], shape=[2, 1, 1, 1]) - layer = batch_normalization.BatchNormalization( - momentum=0.5, moving_variance_initializer='zeros') - layer(x, training=True) - self.assertTrue(layer.fused) - # Since fused is used, Bessel's correction is used. The variance of [0, 2] - # is 2 with Bessel's correction. Since the momentum is 0.5, the variance is - # 2 * 0.5 == 1. - self.assertAllEqual(self.evaluate(layer.moving_variance), [1.]) - - x = tf.constant([0., 2.], shape=[2, 1, 1, 1, 1]) - layer = batch_normalization.BatchNormalization( - momentum=0.5, moving_variance_initializer='zeros') - layer(x, training=True) - self.assertTrue(layer.fused) - # Since fused is used, Bessel's correction is used. The variance of [0, 2] - # is 2 with Bessel's correction. Since the momentum is 0.5, the variance is - # 2 * 0.5 == 1. 
- self.assertAllEqual(self.evaluate(layer.moving_variance), [1.]) + x = np.array([-1000.0, 1000.0]).reshape((2, 1, 1, 1)) + y = norm(fn2(fn1(x)), training=True) + expected_y = np.array([-0.9995, 0.9995]).reshape((2, 1, 1, 1)) + self.assertAllClose(keras.backend.eval(y), expected_y) class BatchNormalizationV1Test(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_v1_fused_attribute(self): - norm = batch_normalization_v1.BatchNormalization() - inp = keras.layers.Input((4, 4, 4)) - norm(inp) - self.assertEqual(norm.fused, True) - - norm = batch_normalization_v1.BatchNormalization(fused=False) - self.assertEqual(norm.fused, False) - inp = keras.layers.Input(shape=(4, 4, 4)) - norm(inp) - self.assertEqual(norm.fused, False) - - norm = batch_normalization_v1.BatchNormalization(virtual_batch_size=2) - self.assertEqual(norm.fused, True) - inp = keras.layers.Input(shape=(2, 2, 2)) - norm(inp) - self.assertEqual(norm.fused, False) + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_v1_fused_attribute(self): + norm = batch_normalization_v1.BatchNormalization() + inp = keras.layers.Input((4, 4, 4)) + norm(inp) + self.assertEqual(norm.fused, True) + + norm = batch_normalization_v1.BatchNormalization(fused=False) + self.assertEqual(norm.fused, False) + inp = keras.layers.Input(shape=(4, 4, 4)) + norm(inp) + self.assertEqual(norm.fused, False) + + norm = batch_normalization_v1.BatchNormalization(virtual_batch_size=2) + self.assertEqual(norm.fused, True) + inp = keras.layers.Input(shape=(2, 2, 2)) + norm(inp) + self.assertEqual(norm.fused, False) class BatchNormalizationV2Test(test_combinations.TestCase): + @test_combinations.run_all_keras_modes + def test_basic_batchnorm_v2(self): + test_utils.layer_test( + batch_normalization.BatchNormalization, + kwargs={"fused": True}, + input_shape=(3, 3, 3, 3), + ) + test_utils.layer_test( + batch_normalization.BatchNormalization, + kwargs={"fused": None}, + input_shape=(3, 3, 3), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_v2_fused_attribute(self): + norm = batch_normalization.BatchNormalization() + self.assertIsNone(norm.fused) + inp = keras.layers.Input(shape=(4, 4, 4)) + norm(inp) + self.assertEqual(norm.fused, True) + + norm = batch_normalization.BatchNormalization() + self.assertIsNone(norm.fused) + inp = keras.layers.Input(shape=(4, 4)) + norm(inp) + self.assertEqual(norm.fused, False) + + norm = batch_normalization.BatchNormalization() + self.assertIsNone(norm.fused) + inp = keras.layers.Input(shape=(4, 4, 4, 4)) + norm(inp) + self.assertEqual(norm.fused, True) + + norm = batch_normalization.BatchNormalization(virtual_batch_size=2) + self.assertEqual(norm.fused, False) + inp = keras.layers.Input(shape=(4, 4, 4)) + norm(inp) + self.assertEqual(norm.fused, False) + + norm = batch_normalization.BatchNormalization(fused=False) + self.assertEqual(norm.fused, False) + inp = keras.layers.Input(shape=(4, 4, 4)) + norm(inp) + self.assertEqual(norm.fused, False) + + norm = batch_normalization.BatchNormalization(fused=True, axis=[3]) + self.assertEqual(norm.fused, True) + inp = keras.layers.Input(shape=(4, 4, 4)) + norm(inp) + self.assertEqual(norm.fused, True) + + with self.assertRaisesRegex(ValueError, "fused.*renorm"): + batch_normalization.BatchNormalization(fused=True, renorm=True) + + with self.assertRaisesRegex(ValueError, "fused.*when axis is 1 or 3"): + 
batch_normalization.BatchNormalization(fused=True, axis=2) + + with self.assertRaisesRegex(ValueError, "fused.*when axis is 1 or 3"): + batch_normalization.BatchNormalization(fused=True, axis=[1, 3]) + + with self.assertRaisesRegex(ValueError, "fused.*virtual_batch_size"): + batch_normalization.BatchNormalization( + fused=True, virtual_batch_size=2 + ) + + with self.assertRaisesRegex(ValueError, "fused.*adjustment"): + batch_normalization.BatchNormalization( + fused=True, adjustment=lambda _: (1, 0) + ) + + norm = batch_normalization.BatchNormalization(fused=True) + self.assertEqual(norm.fused, True) + inp = keras.layers.Input(shape=(4, 4)) + with self.assertRaisesRegex(ValueError, "4D or 5D input tensors"): + norm(inp) + + def test_updates_in_wrap_function(self): + def my_func(): + layer = batch_normalization_v1.BatchNormalization() + x = tf.ones((10, 1)) + y = layer(x, training=True) + # Updates should be tracked in a `wrap_function`. + self.assertLen(layer.updates, 2) + return y + + wrapped_fn = tf.compat.v1.wrap_function(my_func, []) + wrapped_fn() + + @test_combinations.run_all_keras_modes + @test_utils.run_v2_only + def test_basic_batchnorm_v2_input_shape_and_virtual_batch_size(self): + # Test case for GitHub issue #32380 + norm = batch_normalization.BatchNormalization(virtual_batch_size=8) + inp = keras.layers.Input(shape=(None, None, 3)) + _ = norm(inp) + + # Test case for https://github.com/tensorflow/tensorflow/issues/23050 + norm = batch_normalization.BatchNormalization(virtual_batch_size=8) + _ = norm(np.ones((1, 28, 28))) + + with self.assertRaisesRegex(Exception, "Reshape"): + norm = batch_normalization.BatchNormalization(virtual_batch_size=8) + _ = norm(np.ones((1, 28, 28)), training=True) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_fused_batchnorm_empty_batch(self): + # Test case for https://github.com/tensorflow/tensorflow/issues/52986 + # Create a simple strategy with the enable_partial_batch_handling flag + # turned on, to trigger the empty-batch code path in fused batchnorm. + strategy = tf.distribute.OneDeviceStrategy("/cpu:0") + strategy.extended.enable_partial_batch_handling = True + with strategy.scope(): + layer = batch_normalization.BatchNormalization() + + def fn(): + with tf.GradientTape() as tape: + x = tf.ones((0, 2, 2, 2)) + layer(x, training=True) + return tape + + tape = strategy.run(fn) + + self.assertTrue(layer.fused) + + self.assertIsNotNone(layer.moving_mean) + self.assertIsNotNone(layer.moving_variance) + + tape_vars = tape.watched_variables() + self.assertAllEqual(layer.gamma, tape_vars[0]) + self.assertAllEqual(layer.beta, tape_vars[1]) + + +def _run_batchnorm_correctness_test( + layer, dtype="float32", fused=False, synchronized=False +): + model = keras.models.Sequential() + model.add(keras.Input(shape=(2, 2, 2), dtype=dtype)) + norm = layer(momentum=0.8, fused=fused, synchronized=synchronized) + model.add(norm) + if dtype == "float16": + # Keras models require float32 losses.
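      # (The Lambda below is what satisfies that requirement: with float16
      # activations, computing the mse loss in float16 could lose precision,
      # so the model's output is upcast to float32 first.)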
+ model.add( + keras.layers.Lambda(lambda x: keras.backend.cast(x, "float32")) + ) + model.compile( + loss="mse", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + run_eagerly=test_utils.should_run_eagerly(), + ) - @test_combinations.run_all_keras_modes - def test_basic_batchnorm_v2(self): - test_utils.layer_test( - batch_normalization.BatchNormalization, - kwargs={'fused': True}, - input_shape=(3, 3, 3, 3)) - test_utils.layer_test( - batch_normalization.BatchNormalization, - kwargs={'fused': None}, - input_shape=(3, 3, 3)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_v2_fused_attribute(self): - norm = batch_normalization.BatchNormalization() - self.assertIsNone(norm.fused) - inp = keras.layers.Input(shape=(4, 4, 4)) - norm(inp) - self.assertEqual(norm.fused, True) - - norm = batch_normalization.BatchNormalization() - self.assertIsNone(norm.fused) - inp = keras.layers.Input(shape=(4, 4)) - norm(inp) - self.assertEqual(norm.fused, False) - - norm = batch_normalization.BatchNormalization() - self.assertIsNone(norm.fused) - inp = keras.layers.Input(shape=(4, 4, 4, 4)) - norm(inp) - self.assertEqual(norm.fused, True) - - norm = batch_normalization.BatchNormalization(virtual_batch_size=2) - self.assertEqual(norm.fused, False) - inp = keras.layers.Input(shape=(4, 4, 4)) - norm(inp) - self.assertEqual(norm.fused, False) - - norm = batch_normalization.BatchNormalization(fused=False) - self.assertEqual(norm.fused, False) - inp = keras.layers.Input(shape=(4, 4, 4)) - norm(inp) - self.assertEqual(norm.fused, False) - - norm = batch_normalization.BatchNormalization(fused=True, axis=[3]) - self.assertEqual(norm.fused, True) - inp = keras.layers.Input(shape=(4, 4, 4)) - norm(inp) - self.assertEqual(norm.fused, True) - - with self.assertRaisesRegex(ValueError, 'fused.*renorm'): - batch_normalization.BatchNormalization(fused=True, renorm=True) - - with self.assertRaisesRegex(ValueError, 'fused.*when axis is 1 or 3'): - batch_normalization.BatchNormalization(fused=True, axis=2) - - with self.assertRaisesRegex(ValueError, 'fused.*when axis is 1 or 3'): - batch_normalization.BatchNormalization(fused=True, axis=[1, 3]) - - with self.assertRaisesRegex(ValueError, 'fused.*virtual_batch_size'): - batch_normalization.BatchNormalization(fused=True, virtual_batch_size=2) - - with self.assertRaisesRegex(ValueError, 'fused.*adjustment'): - batch_normalization.BatchNormalization( - fused=True, adjustment=lambda _: (1, 0)) - - norm = batch_normalization.BatchNormalization(fused=True) - self.assertEqual(norm.fused, True) - inp = keras.layers.Input(shape=(4, 4)) - with self.assertRaisesRegex(ValueError, '4D or 5D input tensors'): - norm(inp) - - def test_updates_in_wrap_function(self): - - def my_func(): - layer = batch_normalization_v1.BatchNormalization() - x = tf.ones((10, 1)) - y = layer(x, training=True) - # Updates should be tracked in a `wrap_function`. 
- self.assertLen(layer.updates, 2) - return y - - wrapped_fn = tf.compat.v1.wrap_function(my_func, []) - wrapped_fn() - - @test_combinations.run_all_keras_modes - def test_basic_batchnorm_v2_none_shape_and_virtual_batch_size(self): - # Test case for GitHub issue for 32380 - norm = batch_normalization.BatchNormalization(virtual_batch_size=8) - inp = keras.layers.Input(shape=(None, None, 3)) - _ = norm(inp) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_fused_batchnorm_empty_batch(self): - # Test case for https://github.com/tensorflow/tensorflow/issues/52986 - # create a simple strategy with the enable_partial_batch_handling flag - # turned on, to trigger the empty batch code path in fused batchnorm - strategy = tf.distribute.OneDeviceStrategy('/cpu:0') - strategy.extended.enable_partial_batch_handling = True - with strategy.scope(): - layer = batch_normalization.BatchNormalization() - - def fn(): - with tf.GradientTape() as tape: - x = tf.ones((0, 2, 2, 2)) - layer(x, training=True) - return tape - - tape = strategy.run(fn) - - self.assertTrue(layer.fused) - - self.assertIsNotNone(layer.moving_mean) - self.assertIsNotNone(layer.moving_variance) - - tape_vars = tape.watched_variables() - self.assertAllEqual(layer.gamma, tape_vars[0]) - self.assertAllEqual(layer.beta, tape_vars[1]) - - -def _run_batchnorm_correctness_test(layer, dtype='float32', fused=False): - model = keras.models.Sequential() - model.add(keras.Input(shape=(2, 2, 2), dtype=dtype)) - norm = layer(momentum=0.8, fused=fused) - model.add(norm) - if dtype == 'float16': - # Keras models require float32 losses. - model.add(keras.layers.Lambda(lambda x: keras.backend.cast(x, 'float32'))) - model.compile( - loss='mse', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) - - # centered on 5.0, variance 10.0 - x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2)) - .astype(dtype)) - model.fit(x, x, epochs=4, verbose=0) - out = model.predict(x) - out -= keras.backend.eval(norm.beta) - out /= keras.backend.eval(norm.gamma) + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2)).astype( + dtype + ) + model.fit(x, x, epochs=4, verbose=0) + out = model.predict(x) + out -= keras.backend.eval(norm.beta) + out /= keras.backend.eval(norm.gamma) - np.testing.assert_allclose(out.mean(), 0.0, atol=2e-1) - np.testing.assert_allclose(out.std(), 1.0, atol=2e-1) + np.testing.assert_allclose(out.mean(), 0.0, atol=2e-1) + np.testing.assert_allclose(out.std(), 1.0, atol=2e-1) -@parameterized.parameters([ - batch_normalization_v1.BatchNormalization, - batch_normalization.BatchNormalization -]) +@parameterized.parameters( + [ + batch_normalization_v1.BatchNormalization, + batch_normalization.BatchNormalization, + ] +) class NormalizationLayersGraphModeOnlyTest( - tf.test.TestCase, parameterized.TestCase): - - def test_shared_batchnorm(self, layer): - """Test that a BN layer can be shared across different data streams.""" - with self.cached_session(): - # Test single layer reuse - bn = layer() - x1 = keras.layers.Input(shape=(10,)) - _ = bn(x1) - - x2 = keras.layers.Input(shape=(10,)) - y2 = bn(x2) - - x = np.random.normal(loc=5.0, scale=10.0, size=(2, 10)) - model = keras.models.Model(x2, y2) - - model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.01), 'mse') - model.train_on_batch(x, x) - - # Test model-level reuse - x3 = keras.layers.Input(shape=(10,)) - y3 = model(x3) - new_model = 
keras.models.Model(x3, y3, name='new_model') - - new_model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.01), 'mse') - new_model.train_on_batch(x, x) - - def test_that_trainable_disables_updates(self, layer): - with self.cached_session(): - val_a = np.random.random((10, 4)) - val_out = np.random.random((10, 4)) - - a = keras.layers.Input(shape=(4,)) - layer = layer(input_shape=(4,)) - b = layer(a) - model = keras.models.Model(a, b) - - model.trainable = False - model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.01), 'mse') - - x1 = model.predict(val_a) - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - self.assertAllClose(x1, x2, atol=1e-7) - - model.trainable = True - model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.01), 'mse') - - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - assert np.abs(np.sum(x1 - x2)) > 1e-5 - - layer.trainable = False - model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.01), 'mse') - - x1 = model.predict(val_a) - model.train_on_batch(val_a, val_out) - x2 = model.predict(val_a) - self.assertAllClose(x1, x2, atol=1e-7) - - def test_batchnorm_trainable(self, layer): - """Tests that batchnorm layer is trainable when learning phase is enabled. - - Computes mean and std for current inputs then - applies batch normalization using them. - - Args: - layer: Either V1 or V2 of BatchNormalization layer. - """ - # TODO(fchollet): enable in all execution modes when issue with - # learning phase setting is resolved. - with tf.Graph().as_default(), self.cached_session(): - bn_mean = 0.5 - bn_std = 10. - val_a = np.expand_dims(np.arange(10.), axis=1) - - def get_model(bn_mean, bn_std): - inp = keras.layers.Input(shape=(1,)) - x = layer()(inp) - model1 = keras.models.Model(inp, x) - model1.set_weights([ - np.array([1.]), - np.array([0.]), - np.array([bn_mean]), - np.array([bn_std**2]) - ]) - return model1 - - # Simulates training-mode with trainable layer. - # Should use mini-batch statistics. 
- with keras.backend.learning_phase_scope(1): - model = get_model(bn_mean, bn_std) - model.compile(loss='mse', optimizer='rmsprop') - out = model.predict(val_a) - self.assertAllClose( - (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3) - - -if __name__ == '__main__': - tf.test.main() + tf.test.TestCase, parameterized.TestCase +): + def test_shared_batchnorm(self, layer): + """Test that a BN layer can be shared across different data streams.""" + with self.cached_session(): + # Test single layer reuse + bn = layer() + x1 = keras.layers.Input(shape=(10,)) + _ = bn(x1) + + x2 = keras.layers.Input(shape=(10,)) + y2 = bn(x2) + + x = np.random.normal(loc=5.0, scale=10.0, size=(2, 10)) + model = keras.models.Model(x2, y2) + + model.compile( + tf.compat.v1.train.GradientDescentOptimizer(0.01), "mse" + ) + model.train_on_batch(x, x) + + # Test model-level reuse + x3 = keras.layers.Input(shape=(10,)) + y3 = model(x3) + new_model = keras.models.Model(x3, y3, name="new_model") + + new_model.compile( + tf.compat.v1.train.GradientDescentOptimizer(0.01), "mse" + ) + new_model.train_on_batch(x, x) + + def test_that_trainable_disables_updates(self, layer): + with self.cached_session(): + val_a = np.random.random((10, 4)) + val_out = np.random.random((10, 4)) + + a = keras.layers.Input(shape=(4,)) + layer = layer(input_shape=(4,)) + b = layer(a) + model = keras.models.Model(a, b) + + model.trainable = False + model.compile( + tf.compat.v1.train.GradientDescentOptimizer(0.01), "mse" + ) + + x1 = model.predict(val_a) + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + self.assertAllClose(x1, x2, atol=1e-7) + + model.trainable = True + model.compile( + tf.compat.v1.train.GradientDescentOptimizer(0.01), "mse" + ) + + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + assert np.abs(np.sum(x1 - x2)) > 1e-5 + + layer.trainable = False + model.compile( + tf.compat.v1.train.GradientDescentOptimizer(0.01), "mse" + ) + + x1 = model.predict(val_a) + model.train_on_batch(val_a, val_out) + x2 = model.predict(val_a) + self.assertAllClose(x1, x2, atol=1e-7) + + def test_batchnorm_trainable(self, layer): + """Tests that batchnorm layer is trainable when learning phase enabled. + + Computes mean and std for current inputs then + applies batch normalization using them. + + Args: + layer: Either V1 or V2 of BatchNormalization layer. + """ + # TODO(fchollet): enable in all execution modes when issue with + # learning phase setting is resolved. + with tf.Graph().as_default(), self.cached_session(): + bn_mean = 0.5 + bn_std = 10.0 + val_a = np.expand_dims(np.arange(10.0), axis=1) + + def get_model(bn_mean, bn_std): + inp = keras.layers.Input(shape=(1,)) + x = layer()(inp) + model1 = keras.models.Model(inp, x) + model1.set_weights( + [ + np.array([1.0]), + np.array([0.0]), + np.array([bn_mean]), + np.array([bn_std**2]), + ] + ) + return model1 + + # Simulates training-mode with trainable layer. + # Should use mini-batch statistics. 
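            # Restating what the assertion below relies on: the four weights
            # set above follow BatchNormalization's weight order
            # [gamma, beta, moving_mean, moving_variance], and in training
            # mode the layer normalizes with the current batch statistics,
            # so the output should match (val_a - mean(val_a)) / std(val_a)
            # regardless of the stored bn_mean and bn_std.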
+ with keras.backend.learning_phase_scope(1): + model = get_model(bn_mean, bn_std) + model.compile(loss="mse", optimizer="rmsprop") + out = model.predict(val_a) + self.assertAllClose( + (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3 + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/normalization/batch_normalization_v1.py b/keras/layers/normalization/batch_normalization_v1.py index c6d3fb2d6d00..4d9feb311da2 100644 --- a/keras/layers/normalization/batch_normalization_v1.py +++ b/keras/layers/normalization/batch_normalization_v1.py @@ -13,13 +13,19 @@ # limitations under the License. # ============================================================================== """Batch Normalization V1 layer.""" -# pylint: disable=g-classes-have-attributes + from keras.layers.normalization import batch_normalization + +# isort: off from tensorflow.python.util.tf_export import keras_export -# pylint: disable=missing-docstring -@keras_export(v1=['keras.layers.BatchNormalization']) +@keras_export(v1=["keras.layers.BatchNormalization"]) class BatchNormalization(batch_normalization.BatchNormalizationBase): - _USE_V2_BEHAVIOR = False + _USE_V2_BEHAVIOR = False + + def __init__(self, *args, **kwargs): + # synchronized not implemented in V1 + kwargs.pop("synchronized", None) + super().__init__(*args, **kwargs) diff --git a/keras/layers/normalization/group_normalization.py b/keras/layers/normalization/group_normalization.py new file mode 100644 index 000000000000..a0a39bc105bb --- /dev/null +++ b/keras/layers/normalization/group_normalization.py @@ -0,0 +1,269 @@ +# Copyright 2022 The Keras Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Group normalization layer""" + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras import constraints +from keras import initializers +from keras import regularizers +from keras.layers import InputSpec +from keras.layers import Layer +from keras.utils import tf_utils + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@keras_export("keras.layers.GroupNormalization", v1=[]) +class GroupNormalization(Layer): + """Group normalization layer. + + Group Normalization divides the channels into groups and computes + within each group the mean and variance for normalization. + Empirically, its accuracy is more stable than batch norm in a wide + range of small batch sizes, if learning rate is adjusted linearly + with batch sizes. + + Relation to Layer Normalization: + If the number of groups is set to 1, then this operation becomes nearly + identical to Layer Normalization (see Layer Normalization docs for details). + + Relation to Instance Normalization: + If the number of groups is set to the input dimension (number of groups is + equal to number of channels), then this operation becomes identical to + Instance Normalization. 
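    A minimal usage sketch (the shapes here are illustrative, and the layer is
    assumed to be importable as `tf.keras.layers.GroupNormalization` via the
    export above; the only hard requirement is that the channel count is
    divisible by `groups`):

    ```python
    import tensorflow as tf

    x = tf.random.normal((2, 8, 8, 16))  # NHWC input with 16 channels
    layer = tf.keras.layers.GroupNormalization(groups=4)  # 4 groups of 4
    y = layer(x)  # same shape as `x`; stats are per sample and per group
    ```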
+ + Args: + groups: Integer, the number of groups for Group Normalization. Can be in + the range [1, N] where N is the input dimension. The input dimension + must be divisible by the number of groups. Defaults to `32`. + axis: Integer or List/Tuple. The axis or axes to normalize across. + Typically, this is the features axis/axes. The left-out axes are + typically the batch axis/axes. `-1` is the last dimension in the + input. Defaults to `-1`. + epsilon: Small float added to variance to avoid dividing by zero. Defaults + to `1e-3`. + center: If True, add offset of `beta` to normalized tensor. If False, + `beta` is ignored. Defaults to `True`. + scale: If True, multiply by `gamma`. If False, `gamma` is not used. + When the next layer is linear (also e.g. `nn.relu`), this can be + disabled since the scaling will be done by the next layer. + Defaults to `True`. + beta_initializer: Initializer for the beta weight. Defaults to zeros. + gamma_initializer: Initializer for the gamma weight. Defaults to ones. + beta_regularizer: Optional regularizer for the beta weight. None by + default. + gamma_regularizer: Optional regularizer for the gamma weight. None by + default. + beta_constraint: Optional constraint for the beta weight. None by default. + gamma_constraint: Optional constraint for the gamma weight. None by + default. + + Input shape: + Arbitrary. Use the keyword argument `input_shape` (tuple of integers, + does not include the samples axis) when using this layer as the first + layer in a model. + + Output shape: + Same shape as input. + + Call arguments: + inputs: Input tensor (of any rank). + mask: The mask parameter is a tensor that indicates the weight for each + position in the input tensor when computing the mean and variance. + + Reference: + - [Yuxin Wu & Kaiming He, 2018](https://arxiv.org/abs/1803.08494) + """ + + def __init__( + self, + groups=32, + axis=-1, + epsilon=1e-3, + center=True, + scale=True, + beta_initializer="zeros", + gamma_initializer="ones", + beta_regularizer=None, + gamma_regularizer=None, + beta_constraint=None, + gamma_constraint=None, + **kwargs, + ): + super().__init__(**kwargs) + self.supports_masking = True + self.groups = groups + self.axis = axis + self.epsilon = epsilon + self.center = center + self.scale = scale + self.beta_initializer = initializers.get(beta_initializer) + self.gamma_initializer = initializers.get(gamma_initializer) + self.beta_regularizer = regularizers.get(beta_regularizer) + self.gamma_regularizer = regularizers.get(gamma_regularizer) + self.beta_constraint = constraints.get(beta_constraint) + self.gamma_constraint = constraints.get(gamma_constraint) + + def build(self, input_shape): + tf_utils.validate_axis(self.axis, input_shape) + + dim = input_shape[self.axis] + if dim is None: + raise ValueError( + f"Axis {self.axis} of input tensor should have a defined " + "dimension but the layer received an input with shape " + f"{input_shape}." + ) + + if self.groups == -1: + self.groups = dim + + if dim < self.groups: + raise ValueError( + f"Number of groups ({self.groups}) cannot be more than the " + f"number of channels ({dim})." + ) + + if dim % self.groups != 0: + raise ValueError( + f"Number of groups ({self.groups}) must be a multiple " + f"of the number of channels ({dim})."
+ ) + + self.input_spec = InputSpec( + ndim=len(input_shape), axes={self.axis: dim} + ) + + if self.scale: + self.gamma = self.add_weight( + shape=(dim,), + name="gamma", + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + constraint=self.gamma_constraint, + ) + else: + self.gamma = None + + if self.center: + self.beta = self.add_weight( + shape=(dim,), + name="beta", + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + constraint=self.beta_constraint, + ) + else: + self.beta = None + + super().build(input_shape) + + def call(self, inputs, mask=None): + input_shape = tf.shape(inputs) + + if mask is None: + mask = tf.ones_like(inputs) + else: + # We broadcast before we group in case the mask does not have the + # same shape as the input. + mask = tf.broadcast_to(mask, input_shape) + + reshaped_inputs = self._reshape_into_groups(inputs) + reshaped_mask = self._reshape_into_groups(mask) + + normalized_inputs = self._apply_normalization( + reshaped_inputs=reshaped_inputs, + input_shape=input_shape, + reshaped_mask=reshaped_mask, + ) + + return tf.reshape(normalized_inputs, input_shape) + + def _reshape_into_groups(self, inputs): + input_shape = tf.shape(inputs) + group_shape = [input_shape[i] for i in range(inputs.shape.rank)] + + group_shape[self.axis] = input_shape[self.axis] // self.groups + group_shape.insert(self.axis, self.groups) + group_shape = tf.stack(group_shape) + reshaped_inputs = tf.reshape(inputs, group_shape) + return reshaped_inputs + + def _apply_normalization( + self, + *, + reshaped_inputs, + reshaped_mask, + input_shape, + ): + group_reduction_axes = list(range(1, reshaped_inputs.shape.rank)) + + axis = self.axis - 1 + group_reduction_axes.pop(axis) + + mask_weights = tf.cast(reshaped_mask, reshaped_inputs.dtype) + + mean, variance = tf.nn.weighted_moments( + reshaped_inputs, + axes=group_reduction_axes, + frequency_weights=mask_weights, + keepdims=True, + ) + + gamma, beta = self._get_reshaped_weights(input_shape) + normalized_inputs = tf.nn.batch_normalization( + reshaped_inputs, + mean=mean, + variance=variance, + scale=gamma, + offset=beta, + variance_epsilon=self.epsilon, + ) + return normalized_inputs + + def _get_reshaped_weights(self, input_shape): + broadcast_shape = self._create_broadcast_shape(input_shape) + gamma = None + beta = None + if self.scale: + gamma = tf.reshape(self.gamma, broadcast_shape) + + if self.center: + beta = tf.reshape(self.beta, broadcast_shape) + return gamma, beta + + def _create_broadcast_shape(self, input_shape): + broadcast_shape = [1] * backend.int_shape(input_shape)[0] + + broadcast_shape[self.axis] = input_shape[self.axis] // self.groups + broadcast_shape.insert(self.axis, self.groups) + + return broadcast_shape + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "groups": self.groups, + "axis": self.axis, + "epsilon": self.epsilon, + "center": self.center, + "scale": self.scale, + "beta_initializer": initializers.serialize(self.beta_initializer), + "gamma_initializer": initializers.serialize(self.gamma_initializer), + "beta_regularizer": regularizers.serialize(self.beta_regularizer), + "gamma_regularizer": regularizers.serialize(self.gamma_regularizer), + "beta_constraint": constraints.serialize(self.beta_constraint), + "gamma_constraint": constraints.serialize(self.gamma_constraint), + } + base_config = super().get_config() + return {**base_config, **config} diff --git a/keras/layers/normalization/group_normalization_test.py 
b/keras/layers/normalization/group_normalization_test.py new file mode 100644 index 000000000000..d73455cd4fc9 --- /dev/null +++ b/keras/layers/normalization/group_normalization_test.py @@ -0,0 +1,382 @@ +# Copyright 2022 The Keras Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import tensorflow.compat.v2 as tf + +import keras +from keras.initializers import Constant +from keras.layers import GroupNormalization +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + + +def _build_group_normalization_model(norm): + model = keras.models.Sequential() + model.add(norm) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + + return model + + +@test_utils.run_v2_only +class GroupNormalizationTest(test_combinations.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_trainable_weights(self): + # Check if weights get initialized correctly + layer = GroupNormalization(groups=1, scale=False, center=False) + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 0) + self.assertEqual(len(layer.weights), 0) + + # Check if weights get initialized correctly + layer = GroupNormalization(groups=1, scale=True, center=True) + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 2) + self.assertEqual(len(layer.weights), 2) + + @test_combinations.run_all_keras_modes + def test_groupnorm(self): + test_utils.layer_test( + GroupNormalization, + kwargs={ + "gamma_regularizer": keras.regularizers.l2(0.01), + "beta_regularizer": keras.regularizers.l2(0.01), + }, + input_shape=(3, 4, 32), + ) + + test_utils.layer_test( + GroupNormalization, + kwargs={ + "groups": 4, + "gamma_constraint": keras.constraints.UnitNorm(), + "beta_constraint": keras.constraints.UnitNorm(), + }, + input_shape=(3, 4, 4), + ) + + @test_combinations.run_all_keras_modes + def test_correctness_1d(self): + layer_with_1_group = GroupNormalization( + groups=1, axis=-1, input_shape=(8,), scale=False, center=False + ) + layer_with_2_groups = GroupNormalization( + groups=2, axis=1, input_shape=(8,), scale=False, center=False + ) + + inputs = tf.constant( + [-1.0, -1.0, 1.0, 1.0, 2.0, 2.0, 0, -2.0], shape=(1, 8) + ) + + expected_output_1_group = tf.constant( + [-0.898, -0.898, 0.539, 0.539, 1.257, 1.257, -0.180, -1.616], + shape=(1, 8), + ) + self.assertAllClose( + _build_group_normalization_model(layer_with_1_group)(inputs), + expected_output_1_group, + atol=1e-3, + ) + + expected_output_2_groups = tf.constant( + [-1.0, -1.0, 1.0, 1.0, 0.904, 0.904, -0.301, -1.507], shape=(1, 8) + ) + self.assertAllClose( + _build_group_normalization_model(layer_with_2_groups)(inputs), + expected_output_2_groups, + atol=1e-3, + ) + + @test_combinations.run_all_keras_modes + def test_correctness_1d_with_mask(self): + layer_with_1_group = GroupNormalization( + groups=1, 
axis=-1, input_shape=(8,), scale=False, center=False + ) + layer_with_2_groups = GroupNormalization( + groups=2, axis=1, input_shape=(8,), scale=False, center=False + ) + + inputs = tf.constant( + [-1.0, -1.0, 1.0, 1.0, 2.0, 2.0, 0, -2.0], shape=(1, 8) + ) + + mask1 = tf.constant( + [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=(1, 8) + ) + mask2 = tf.constant( + [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0], shape=(1, 8) + ) + + expected_output_1_group = tf.constant( + [-0.706, -0.706, 1.413, 1.413, 2.473, 2.473, 0.353, -1.766], + shape=(1, 8), + ) + self.assertAllClose( + _build_group_normalization_model(layer_with_1_group)( + inputs, mask=mask1 + ), + expected_output_1_group, + atol=1e-3, + ) + + expected_output_2_groups = tf.constant( + [-1.0, -1.0, 1.0, 1.0, 0.999, 0.999, 0.0, -0.999], shape=(1, 8) + ) + self.assertAllClose( + _build_group_normalization_model(layer_with_2_groups)( + inputs, mask=mask2 + ), + expected_output_2_groups, + atol=1e-3, + ) + + @test_combinations.run_all_keras_modes + def test_correctness_1d_with_non_binary_mask(self): + norm = GroupNormalization( + groups=1, axis=-1, input_shape=(8,), scale=False, center=False + ) + inputs = tf.constant( + [-1.0, -1.0, 1.0, 1.0, 2.0, 2.0, 0, -2.0], shape=(1, 8) + ) + + mask = tf.constant( + [0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=(1, 8) + ) + + expected_output = tf.constant( + [-0.999, -0.999, 0.999, 0.999, 1.999, 1.999, 0.0, -1.999], + shape=(1, 8), + ) + self.assertAllClose( + _build_group_normalization_model(norm)(inputs, mask=mask), + expected_output, + atol=1e-3, + ) + + @test_combinations.run_all_keras_modes + def test_correctness_2d(self): + layer_with_1_group = GroupNormalization( + groups=1, axis=-1, input_shape=(2, 4), scale=False, center=False + ) + layer_with_2_groups = GroupNormalization( + groups=2, axis=2, input_shape=(2, 4), scale=False, center=False + ) + + inputs = tf.constant( + [[-1.0, -1.0, 2.0, 2.0], [1.0, 1.0, 0, -2.0]], shape=(1, 2, 4) + ) + + expected_output_1_group = tf.constant( + [[-0.898, -0.898, 1.257, 1.257], [0.539, 0.539, -0.180, -1.616]], + shape=(1, 2, 4), + ) + self.assertAllClose( + _build_group_normalization_model(layer_with_1_group)(inputs), + expected_output_1_group, + atol=1e-3, + ) + + expected_output_2_groups = tf.constant( + [[-1.0, -1.0, 0.904, 0.904], [1.0, 1.0, -0.301, -1.507]], + shape=(1, 2, 4), + ) + self.assertAllClose( + _build_group_normalization_model(layer_with_2_groups)(inputs), + expected_output_2_groups, + atol=1e-3, + ) + + @test_combinations.run_all_keras_modes + def test_correctness_2d_with_mask(self): + layer_with_1_group = GroupNormalization( + groups=1, axis=-1, input_shape=(2, 4), scale=False, center=False + ) + layer_with_2_groups = GroupNormalization( + groups=2, axis=2, input_shape=(2, 4), scale=False, center=False + ) + + inputs = tf.constant( + [[-1.0, -1.0, 2.0, 2.0], [1.0, 1.0, 0, -2.0]], shape=(1, 2, 4) + ) + + mask1 = tf.constant( + [ + [ + 1.0, + 1.0, + 0.0, + 0.0, + ], + [1.0, 0.0, 0.0, 0.0], + ], + shape=(1, 2, 4), + ) + mask2 = tf.constant( + [ + [ + 1.0, + 1.0, + 0.0, + 1.0, + ], + [1.0, 1.0, 0.0, 1.0], + ], + shape=(1, 2, 4), + ) + + expected_output_1_group = tf.constant( + [[-0.706, -0.706, 2.473, 2.473], [1.413, 1.413, 0.353, -1.766]], + shape=(1, 2, 4), + ) + self.assertAllClose( + _build_group_normalization_model(layer_with_1_group)( + inputs, mask=mask1 + ), + expected_output_1_group, + atol=1e-3, + ) + + expected_output_2_groups = tf.constant( + [[-1.0, -1.0, 0.999, 0.999], [1.0, 1.0, 0.0, -0.999]], + shape=(1, 2, 4), + ) + 
self.assertAllClose( + _build_group_normalization_model(layer_with_2_groups)( + inputs, mask=mask2 + ), + expected_output_2_groups, + atol=1e-3, + ) + + @test_combinations.run_all_keras_modes + def test_mask_broadcasting(self): + images = tf.ones((1, 2, 4, 3)) # NHWC + mask = tf.random.uniform((1, 2, 4, 1)) < 0.5 # NHWC + + norm = GroupNormalization( + groups=3, axis=-1, input_shape=(2, 4, 9), scale=False, center=False + ) + output = norm(images, mask=mask) + + self.assertEqual(output.shape, (1, 2, 4, 3)) + + @test_combinations.run_all_keras_modes + def test_correctness_instance_norm(self): + instance_norm_layer = GroupNormalization( + groups=4, axis=-1, input_shape=(2, 4), scale=False, center=False + ) + + inputs = tf.constant( + [[-1.0, 1.0, 0, 2.0], [1.0, 3.0, -4, -2.0]], shape=(1, 2, 4) + ) + + expected_instance_norm_output = tf.constant( + [[-1.0, -1.0, 1.0, 1.0], [1.0, 1.0, -1.0, -1.0]], shape=(1, 2, 4) + ) + self.assertAllClose( + _build_group_normalization_model(instance_norm_layer)(inputs), + expected_instance_norm_output, + atol=1e-3, + ) + + @test_combinations.run_all_keras_modes + def test_correctness_with_centering(self): + normalization_layer = GroupNormalization( + groups=2, + axis=-1, + input_shape=(8,), + scale=False, + center=True, + beta_initializer=Constant(10), + ) + + inputs = tf.constant( + [-1.0, -1.0, 1.0, 1.0, 2.0, 2.0, 0, -2.0], shape=(1, 8) + ) + + expected_output = tf.constant( + [9.0, 9.0, 11.0, 11.0, 10.904, 10.904, 9.699, 8.493], shape=(1, 8) + ) + self.assertAllClose( + _build_group_normalization_model(normalization_layer)(inputs), + expected_output, + atol=1e-3, + ) + + @test_combinations.run_all_keras_modes + def test_correctness_with_scaling(self): + normalization_layer = GroupNormalization( + groups=2, + axis=-1, + input_shape=(8,), + scale=True, + center=False, + gamma_initializer=Constant(2), + ) + + inputs = tf.constant( + [-1.0, -1.0, 1.0, 1.0, 2.0, 2.0, 0, -2.0], shape=(1, 8) + ) + + expected_output = tf.constant( + [-2.0, -2.0, 2.0, 2.0, 1.809, 1.808, -0.602, -3.014], shape=(1, 8) + ) + self.assertAllClose( + _build_group_normalization_model(normalization_layer)(inputs), + expected_output, + atol=1e-3, + ) + + def test_validates_groups_against_channels(self): + with self.assertRaisesRegex( + ValueError, r"must be a multiple of the number of channels" + ): + norm = GroupNormalization(groups=3, axis=-1) + norm.build(input_shape=(2, 10)) + + with self.assertRaisesRegex( + ValueError, r"cannot be more than the number of channels" + ): + norm = GroupNormalization(groups=32, axis=-1) + norm.build(input_shape=(2, 8)) + + def test_validates_known_number_of_channels(self): + with self.assertRaisesRegex( + ValueError, r"tensor should have a defined dimension" + ): + norm = GroupNormalization(axis=-1) + norm.build(input_shape=(1, 32, None)) + + def test_rejects_invalid_axis(self): + with self.assertRaisesRegex( + ValueError, r"Invalid value for `axis` argument" + ): + norm = GroupNormalization(axis=-4) + norm.build(input_shape=(64, 32, 32)) + with self.assertRaisesRegex( + ValueError, r"Invalid value for `axis` argument" + ): + norm = GroupNormalization(axis=3) + norm.build(input_shape=(64, 32, 32)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/normalization/layer_normalization.py b/keras/layers/normalization/layer_normalization.py index 2da0e9405f0c..42bcc08d1ea6 100644 --- a/keras/layers/normalization/layer_normalization.py +++ b/keras/layers/normalization/layer_normalization.py @@ -15,7 +15,6 @@ """Layer Normalization 
layer.""" import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes from keras import constraints from keras import initializers @@ -24,332 +23,347 @@ from keras.engine.base_layer import Layer from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.LayerNormalization') +@keras_export("keras.layers.LayerNormalization") class LayerNormalization(Layer): - """Layer normalization layer (Ba et al., 2016). - - Normalize the activations of the previous layer for each given example in a - batch independently, rather than across a batch like Batch Normalization. - i.e. applies a transformation that maintains the mean activation within each - example close to 0 and the activation standard deviation close to 1. - - Given a tensor `inputs`, moments are calculated and normalization - is performed across the axes specified in `axis`. - - Example: - - >>> data = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - >>> print(data) - tf.Tensor( - [[ 0. 10.] - [20. 30.] - [40. 50.] - [60. 70.] - [80. 90.]], shape=(5, 2), dtype=float32) - - >>> layer = tf.keras.layers.LayerNormalization(axis=1) - >>> output = layer(data) - >>> print(output) - tf.Tensor( - [[-1. 1.] - [-1. 1.] - [-1. 1.] - [-1. 1.] - [-1. 1.]], shape=(5, 2), dtype=float32) - - Notice that with Layer Normalization the normalization happens across the - axes *within* each example, rather than across different examples in the - batch. - - If `scale` or `center` are enabled, the layer will scale the normalized - outputs by broadcasting them with a trainable variable `gamma`, and center - the outputs by broadcasting with a trainable variable `beta`. `gamma` will - default to a ones tensor and `beta` will default to a zeros tensor, so that - centering and scaling are no-ops before training has begun. - - So, with scaling and centering enabled the normalization equations - are as follows: - - Let the intermediate activations for a mini-batch to be the `inputs`. - - For each sample `x_i` in `inputs` with `k` features, we compute the mean and - variance of the sample: - - ```python - mean_i = sum(x_i[j] for j in range(k)) / k - var_i = sum((x_i[j] - mean_i) ** 2 for j in range(k)) / k - ``` - - and then compute a normalized `x_i_normalized`, including a small factor - `epsilon` for numerical stability. - - ```python - x_i_normalized = (x_i - mean_i) / sqrt(var_i + epsilon) - ``` - - And finally `x_i_normalized ` is linearly transformed by `gamma` and `beta`, - which are learned parameters: - - ```python - output_i = x_i_normalized * gamma + beta - ``` - - `gamma` and `beta` will span the axes of `inputs` specified in `axis`, and - this part of the inputs' shape must be fully defined. - - For example: - - >>> layer = tf.keras.layers.LayerNormalization(axis=[1, 2, 3]) - >>> layer.build([5, 20, 30, 40]) - >>> print(layer.beta.shape) - (20, 30, 40) - >>> print(layer.gamma.shape) - (20, 30, 40) - - Note that other implementations of layer normalization may choose to define - `gamma` and `beta` over a separate set of axes from the axes being - normalized across. For example, Group Normalization - ([Wu et al. 2018](https://arxiv.org/abs/1803.08494)) with group size of 1 - corresponds to a Layer Normalization that normalizes across height, width, - and channel and has `gamma` and `beta` span only the channel dimension. - So, this Layer Normalization implementation will not match a Group - Normalization layer with group size set to 1. 
- - Args: - axis: Integer or List/Tuple. The axis or axes to normalize across. Typically - this is the features axis/axes. The left-out axes are typically the batch - axis/axes. This argument defaults to `-1`, the last dimension in the - input. - epsilon: Small float added to variance to avoid dividing by zero. Defaults - to 1e-3 - center: If True, add offset of `beta` to normalized tensor. If False, `beta` - is ignored. Defaults to True. - scale: If True, multiply by `gamma`. If False, `gamma` is not used. Defaults - to True. When the next layer is linear (also e.g. `nn.relu`), this can be - disabled since the scaling will be done by the next layer. - beta_initializer: Initializer for the beta weight. Defaults to zeros. - gamma_initializer: Initializer for the gamma weight. Defaults to ones. - beta_regularizer: Optional regularizer for the beta weight. None by default. - gamma_regularizer: Optional regularizer for the gamma weight. None by - default. - beta_constraint: Optional constraint for the beta weight. None by default. - gamma_constraint: Optional constraint for the gamma weight. None by default. - - Input shape: - Arbitrary. Use the keyword argument `input_shape` (tuple of - integers, does not include the samples axis) when using this layer as the - first layer in a model. - - Output shape: - Same shape as input. - - Reference: - - [Lei Ba et al., 2016](https://arxiv.org/abs/1607.06450). - """ - - @utils.allow_initializer_layout - def __init__(self, - axis=-1, - epsilon=1e-3, - center=True, - scale=True, - beta_initializer='zeros', - gamma_initializer='ones', - beta_regularizer=None, - gamma_regularizer=None, - beta_constraint=None, - gamma_constraint=None, - **kwargs): - super().__init__(**kwargs) - if isinstance(axis, (list, tuple)): - self.axis = list(axis) - elif isinstance(axis, int): - self.axis = axis - else: - raise TypeError('Expected an int or a list/tuple of ints for the ' - 'argument \'axis\', but received: %r' % axis) - - self.epsilon = epsilon - self.center = center - self.scale = scale - self.beta_initializer = initializers.get(beta_initializer) - self.gamma_initializer = initializers.get(gamma_initializer) - self.beta_regularizer = regularizers.get(beta_regularizer) - self.gamma_regularizer = regularizers.get(gamma_regularizer) - self.beta_constraint = constraints.get(beta_constraint) - self.gamma_constraint = constraints.get(gamma_constraint) - - self.supports_masking = True - - # Indicates whether a faster fused implementation can be used. This will be - # set to True or False in build()" - self._fused = None - - def _fused_can_be_used(self, ndims): - """Returns false if fused implementation cannot be used. - - Check if the axis is contiguous and can be collapsed into the last axis. - The self.axis is assumed to have no duplicates. + """Layer normalization layer (Ba et al., 2016). + + Normalize the activations of the previous layer for each given example in a + batch independently, rather than across a batch like Batch Normalization. + i.e. applies a transformation that maintains the mean activation within each + example close to 0 and the activation standard deviation close to 1. + + Given a tensor `inputs`, moments are calculated and normalization + is performed across the axes specified in `axis`. + + Example: + + >>> data = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) + >>> print(data) + tf.Tensor( + [[ 0. 10.] + [20. 30.] + [40. 50.] + [60. 70.] + [80. 
90.]], shape=(5, 2), dtype=float32) + + >>> layer = tf.keras.layers.LayerNormalization(axis=1) + >>> output = layer(data) + >>> print(output) + tf.Tensor( + [[-1. 1.] + [-1. 1.] + [-1. 1.] + [-1. 1.] + [-1. 1.]], shape=(5, 2), dtype=float32) + + Notice that with Layer Normalization the normalization happens across the + axes *within* each example, rather than across different examples in the + batch. + + If `scale` or `center` are enabled, the layer will scale the normalized + outputs by broadcasting them with a trainable variable `gamma`, and center + the outputs by broadcasting with a trainable variable `beta`. `gamma` will + default to a ones tensor and `beta` will default to a zeros tensor, so that + centering and scaling are no-ops before training has begun. + + So, with scaling and centering enabled, the normalization equations + are as follows: + + Let the intermediate activations for a mini-batch be the `inputs`. + + For each sample `x_i` in `inputs` with `k` features, we compute the mean and + variance of the sample: + + ```python + mean_i = sum(x_i[j] for j in range(k)) / k + var_i = sum((x_i[j] - mean_i) ** 2 for j in range(k)) / k + ``` + + and then compute a normalized `x_i_normalized`, including a small factor + `epsilon` for numerical stability. + + ```python + x_i_normalized = (x_i - mean_i) / sqrt(var_i + epsilon) + ``` + + And finally `x_i_normalized` is linearly transformed by `gamma` and `beta`, + which are learned parameters: + + ```python + output_i = x_i_normalized * gamma + beta + ``` + + `gamma` and `beta` will span the axes of `inputs` specified in `axis`, and + this part of the inputs' shape must be fully defined. + + For example: + + >>> layer = tf.keras.layers.LayerNormalization(axis=[1, 2, 3]) + >>> layer.build([5, 20, 30, 40]) + >>> print(layer.beta.shape) + (20, 30, 40) + >>> print(layer.gamma.shape) + (20, 30, 40) + + Note that other implementations of layer normalization may choose to define + `gamma` and `beta` over a separate set of axes from the axes being + normalized across. For example, Group Normalization + ([Wu et al. 2018](https://arxiv.org/abs/1803.08494)) with group size of 1 + corresponds to a Layer Normalization that normalizes across height, width, + and channel and has `gamma` and `beta` span only the channel dimension. + So, this Layer Normalization implementation will not match a Group + Normalization layer with group size set to 1. + + Args: + axis: Integer or List/Tuple. The axis or axes to normalize across. + Typically, this is the features axis/axes. The left-out axes are + typically the batch axis/axes. `-1` is the last dimension in the + input. Defaults to `-1`. + epsilon: Small float added to variance to avoid dividing by zero. Defaults + to `1e-3`. + center: If True, add offset of `beta` to normalized tensor. If False, + `beta` is ignored. Defaults to `True`. + scale: If True, multiply by `gamma`. If False, `gamma` is not used. + When the next layer is linear (also e.g. `nn.relu`), this can be + disabled since the scaling will be done by the next layer. + Defaults to `True`. + beta_initializer: Initializer for the beta weight. Defaults to zeros. + gamma_initializer: Initializer for the gamma weight. Defaults to ones. + beta_regularizer: Optional regularizer for the beta weight. None by + default. + gamma_regularizer: Optional regularizer for the gamma weight. None by + default. + beta_constraint: Optional constraint for the beta weight. None by default. + gamma_constraint: Optional constraint for the gamma weight.
None by + default. + + Input shape: + Arbitrary. Use the keyword argument `input_shape` (tuple of + integers, does not include the samples axis) when using this layer as the + first layer in a model. + + Output shape: + Same shape as input. + + Reference: + - [Lei Ba et al., 2016](https://arxiv.org/abs/1607.06450). """ - axis = sorted(self.axis) - can_use_fused = False - - if axis[-1] == ndims - 1 and axis[-1] - axis[0] == len(axis) - 1: - can_use_fused = True - - # fused_batch_norm will silently raise epsilon to be at least 1.001e-5, so - # we cannot used the fused version if epsilon is below that value. Also, the - # variable dtype must be float32, as fused_batch_norm only supports float32 - # variables. - if self.epsilon < 1.001e-5 or self.dtype != 'float32': - can_use_fused = False - - return can_use_fused - - def build(self, input_shape): - self.axis = tf_utils.validate_axis(self.axis, input_shape) - input_shape = tf.TensorShape(input_shape) - rank = input_shape.rank - - param_shape = [input_shape[dim] for dim in self.axis] - if self.scale: - self.gamma = self.add_weight( - name='gamma', - shape=param_shape, - initializer=self.gamma_initializer, - regularizer=self.gamma_regularizer, - constraint=self.gamma_constraint, - trainable=True, - experimental_autocast=False) - else: - self.gamma = None - - if self.center: - self.beta = self.add_weight( - name='beta', - shape=param_shape, - initializer=self.beta_initializer, - regularizer=self.beta_regularizer, - constraint=self.beta_constraint, - trainable=True, - experimental_autocast=False) - else: - self.beta = None - - self._fused = self._fused_can_be_used(rank) - self.built = True - - def call(self, inputs): - # TODO(b/229545225): Remove the RaggedTensor check. - is_ragged = isinstance(inputs, tf.RaggedTensor) - if is_ragged: - inputs_lengths = inputs.nested_row_lengths() - inputs = inputs.to_tensor() - inputs = tf.cast(inputs, self.compute_dtype) - # Compute the axes along which to reduce the mean / variance - input_shape = inputs.shape - ndims = len(input_shape) - - # Broadcasting only necessary for norm when the axis is not just - # the last dimension - broadcast_shape = [1] * ndims - for dim in self.axis: - broadcast_shape[dim] = input_shape.dims[dim].value - - def _broadcast(v): - if (v is not None and len(v.shape) != ndims and self.axis != [ndims - 1]): - return tf.reshape(v, broadcast_shape) - return v - - if not self._fused: - input_dtype = inputs.dtype - if input_dtype in ('float16', 'bfloat16') and self.dtype == 'float32': - # If mixed precision is used, cast inputs to float32 so that this is at - # least as numerically stable as the fused version. - inputs = tf.cast(inputs, 'float32') - - # Calculate the moments on the last axis (layer activations). - mean, variance = tf.nn.moments(inputs, self.axis, keepdims=True) - - scale, offset = _broadcast(self.gamma), _broadcast(self.beta) - - # Compute layer normalization using the batch_normalization function. 
- outputs = tf.nn.batch_normalization( - inputs, - mean, - variance, - offset=offset, - scale=scale, - variance_epsilon=self.epsilon) - outputs = tf.cast(outputs, input_dtype) - else: - # Collapse dims before self.axis, and dims in self.axis - pre_dim, in_dim = (1, 1) - axis = sorted(self.axis) - tensor_shape = tf.shape(inputs) - for dim in range(0, ndims): - dim_tensor = tensor_shape[dim] - if dim < axis[0]: - pre_dim = pre_dim * dim_tensor + + @utils.allow_initializer_layout + def __init__( + self, + axis=-1, + epsilon=1e-3, + center=True, + scale=True, + beta_initializer="zeros", + gamma_initializer="ones", + beta_regularizer=None, + gamma_regularizer=None, + beta_constraint=None, + gamma_constraint=None, + **kwargs + ): + super().__init__(**kwargs) + if isinstance(axis, (list, tuple)): + self.axis = list(axis) + elif isinstance(axis, int): + self.axis = axis + else: + raise TypeError( + "Expected an int or a list/tuple of ints for the " + "argument 'axis', but received: %r" % axis + ) + + self.epsilon = epsilon + self.center = center + self.scale = scale + self.beta_initializer = initializers.get(beta_initializer) + self.gamma_initializer = initializers.get(gamma_initializer) + self.beta_regularizer = regularizers.get(beta_regularizer) + self.gamma_regularizer = regularizers.get(gamma_regularizer) + self.beta_constraint = constraints.get(beta_constraint) + self.gamma_constraint = constraints.get(gamma_constraint) + + self.supports_masking = True + + # Indicates whether a faster fused implementation can be used. This will + # be set to True or False in build()" + self._fused = None + + def _fused_can_be_used(self, ndims): + """Returns false if fused implementation cannot be used. + + Check if the axis is contiguous and can be collapsed into the last axis. + The self.axis is assumed to have no duplicates. + """ + axis = sorted(self.axis) + can_use_fused = False + + if axis[-1] == ndims - 1 and axis[-1] - axis[0] == len(axis) - 1: + can_use_fused = True + + # fused_batch_norm will silently raise epsilon to be at least 1.001e-5, + # so we cannot used the fused version if epsilon is below that value. + # Also, the variable dtype must be float32, as fused_batch_norm only + # supports float32 variables. + if self.epsilon < 1.001e-5 or self.dtype != "float32": + can_use_fused = False + + return can_use_fused + + def build(self, input_shape): + self.axis = tf_utils.validate_axis(self.axis, input_shape) + input_shape = tf.TensorShape(input_shape) + rank = input_shape.rank + + param_shape = [input_shape[dim] for dim in self.axis] + if self.scale: + self.gamma = self.add_weight( + name="gamma", + shape=param_shape, + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + constraint=self.gamma_constraint, + trainable=True, + experimental_autocast=False, + ) + else: + self.gamma = None + + if self.center: + self.beta = self.add_weight( + name="beta", + shape=param_shape, + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + constraint=self.beta_constraint, + trainable=True, + experimental_autocast=False, + ) + else: + self.beta = None + + self._fused = self._fused_can_be_used(rank) + self.built = True + + def call(self, inputs): + # TODO(b/229545225): Remove the RaggedTensor check. 
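        # (Ragged inputs are densified with `to_tensor()` below;
        # `nested_row_lengths()` is captured first so that the padded result
        # can be converted back to a RaggedTensor at the end of `call`.)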
+ is_ragged = isinstance(inputs, tf.RaggedTensor) + if is_ragged: + inputs_lengths = inputs.nested_row_lengths() + inputs = inputs.to_tensor() + inputs = tf.cast(inputs, self.compute_dtype) + # Compute the axes along which to reduce the mean / variance + input_shape = inputs.shape + ndims = len(input_shape) + + # Broadcasting only necessary for norm when the axis is not just + # the last dimension + broadcast_shape = [1] * ndims + for dim in self.axis: + broadcast_shape[dim] = input_shape.dims[dim].value + + def _broadcast(v): + if ( + v is not None + and len(v.shape) != ndims + and self.axis != [ndims - 1] + ): + return tf.reshape(v, broadcast_shape) + return v + + if not self._fused: + input_dtype = inputs.dtype + if ( + input_dtype in ("float16", "bfloat16") + and self.dtype == "float32" + ): + # If mixed precision is used, cast inputs to float32 so that + # this is at least as numerically stable as the fused version. + inputs = tf.cast(inputs, "float32") + + # Calculate the moments on the last axis (layer activations). + mean, variance = tf.nn.moments(inputs, self.axis, keepdims=True) + + scale, offset = _broadcast(self.gamma), _broadcast(self.beta) + + # Compute layer normalization using the batch_normalization + # function. + outputs = tf.nn.batch_normalization( + inputs, + mean, + variance, + offset=offset, + scale=scale, + variance_epsilon=self.epsilon, + ) + outputs = tf.cast(outputs, input_dtype) else: - assert dim in axis - in_dim = in_dim * dim_tensor - - squeezed_shape = [1, pre_dim, in_dim, 1] - # This fused operation requires reshaped inputs to be NCHW. - data_format = 'NCHW' - - inputs = tf.reshape(inputs, squeezed_shape) - - # self.gamma and self.beta have the wrong shape for fused_batch_norm, so - # we cannot pass them as the scale and offset parameters. Therefore, we - # create two constant tensors in correct shapes for fused_batch_norm and - # later construct a separate calculation on the scale and offset. - scale = tf.ones([pre_dim], dtype=self.dtype) - offset = tf.zeros([pre_dim], dtype=self.dtype) - - # Compute layer normalization using the fused_batch_norm function. - outputs, _, _ = tf.compat.v1.nn.fused_batch_norm( - inputs, - scale=scale, - offset=offset, - epsilon=self.epsilon, - data_format=data_format) - - outputs = tf.reshape(outputs, tensor_shape) - - scale, offset = _broadcast(self.gamma), _broadcast(self.beta) - - if scale is not None: - outputs = outputs * tf.cast(scale, outputs.dtype) - if offset is not None: - outputs = outputs + tf.cast(offset, outputs.dtype) - - # If some components of the shape got lost due to adjustments, fix that. 
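            # A worked example of the collapse performed in this fused branch,
            # assuming axis=[1, 2, 3] and an input of shape (2, 20, 30, 40):
            # pre_dim = 2 and in_dim = 20 * 30 * 40 = 24000, so the input is
            # viewed as an NCHW tensor of shape (1, 2, 24000, 1) and
            # fused_batch_norm normalizes each of the pre_dim "channels" over
            # its 24000 elements.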
- outputs.set_shape(input_shape) - - if is_ragged: - outputs = tf.RaggedTensor.from_tensor(outputs, inputs_lengths) - return outputs - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'axis': self.axis, - 'epsilon': self.epsilon, - 'center': self.center, - 'scale': self.scale, - 'beta_initializer': initializers.serialize(self.beta_initializer), - 'gamma_initializer': initializers.serialize(self.gamma_initializer), - 'beta_regularizer': regularizers.serialize(self.beta_regularizer), - 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), - 'beta_constraint': constraints.serialize(self.beta_constraint), - 'gamma_constraint': constraints.serialize(self.gamma_constraint) - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + # Collapse dims before self.axis, and dims in self.axis + + axis = sorted(self.axis) + tensor_shape = tf.shape(inputs) + pre_dim = tf.reduce_prod(tensor_shape[: axis[0]]) + in_dim = tf.reduce_prod(tensor_shape[axis[0] :]) + squeezed_shape = [1, pre_dim, in_dim, 1] + # This fused operation requires reshaped inputs to be NCHW. + data_format = "NCHW" + + inputs = tf.reshape(inputs, squeezed_shape) + + # self.gamma and self.beta have the wrong shape for + # fused_batch_norm, so we cannot pass them as the scale and offset + # parameters. Therefore, we create two constant tensors in correct + # shapes for fused_batch_norm and later construct a separate + # calculation on the scale and offset. + scale = tf.ones([pre_dim], dtype=self.dtype) + offset = tf.zeros([pre_dim], dtype=self.dtype) + + # Compute layer normalization using the fused_batch_norm function. + outputs, _, _ = tf.compat.v1.nn.fused_batch_norm( + inputs, + scale=scale, + offset=offset, + epsilon=self.epsilon, + data_format=data_format, + ) + + outputs = tf.reshape(outputs, tensor_shape) + + scale, offset = _broadcast(self.gamma), _broadcast(self.beta) + + if scale is not None: + outputs = outputs * tf.cast(scale, outputs.dtype) + if offset is not None: + outputs = outputs + tf.cast(offset, outputs.dtype) + + # If some components of the shape got lost due to adjustments, fix that. 
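# A small sketch (assumed shapes, not part of the patch) of the collapsing
# arithmetic used in the fused branch above: the dims before the first
# normalized axis are folded into one "channel" dimension, the normalized
# dims into one "spatial" dimension, producing the rank-4 NCHW input that
# fused_batch_norm expects.
import tensorflow as tf

inputs = tf.random.normal([2, 3, 4, 5])
axis = [2, 3]  # normalize over the last two dimensions
tensor_shape = tf.shape(inputs)
pre_dim = tf.reduce_prod(tensor_shape[: axis[0]])  # 2 * 3 = 6
in_dim = tf.reduce_prod(tensor_shape[axis[0] :])   # 4 * 5 = 20
squeezed = tf.reshape(inputs, [1, pre_dim, in_dim, 1])
print(squeezed.shape)  # (1, 6, 20, 1)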
+ outputs.set_shape(input_shape) + + if is_ragged: + outputs = tf.RaggedTensor.from_tensor(outputs, inputs_lengths) + return outputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "axis": self.axis, + "epsilon": self.epsilon, + "center": self.center, + "scale": self.scale, + "beta_initializer": initializers.serialize(self.beta_initializer), + "gamma_initializer": initializers.serialize(self.gamma_initializer), + "beta_regularizer": regularizers.serialize(self.beta_regularizer), + "gamma_regularizer": regularizers.serialize(self.gamma_regularizer), + "beta_constraint": constraints.serialize(self.beta_constraint), + "gamma_constraint": constraints.serialize(self.gamma_constraint), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/normalization/layer_normalization_test.py b/keras/layers/normalization/layer_normalization_test.py index e2b2eea650ee..c3531d83fdb7 100644 --- a/keras/layers/normalization/layer_normalization_test.py +++ b/keras/layers/normalization/layer_normalization_test.py @@ -14,335 +14,402 @@ # ============================================================================== """Tests for normalization layers.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras +from keras.layers.normalization import layer_normalization from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.layers.normalization import layer_normalization - - -def _run_layernorm_correctness_test(layer, dtype='float32'): - model = keras.models.Sequential() - model.add(keras.layers.Lambda(lambda x: tf.cast(x, dtype='float16'))) - norm = layer(input_shape=(2, 2, 2), dtype=dtype) - model.add(norm) - model.compile( - loss='mse', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) - - # centered on 5.0, variance 10.0 - x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2)) - .astype(dtype)) - model.fit(x, x, epochs=4, verbose=0) - out = model.predict(x) - out -= keras.backend.eval(norm.beta) - out /= keras.backend.eval(norm.gamma) - np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) - np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) - -class LayerNormalizationTest(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes - def test_basic_layernorm(self): - test_utils.layer_test( - keras.layers.LayerNormalization, - kwargs={ - 'gamma_regularizer': keras.regularizers.l2(0.01), - 'beta_regularizer': keras.regularizers.l2(0.01) - }, - input_shape=(3, 4, 2)) - test_utils.layer_test( - keras.layers.LayerNormalization, - kwargs={ - 'gamma_initializer': 'ones', - 'beta_initializer': 'ones', - }, - input_shape=(3, 4, 2)) - test_utils.layer_test( - keras.layers.LayerNormalization, - kwargs={'scale': False, - 'center': False}, - input_shape=(3, 3)) - test_utils.layer_test( - keras.layers.LayerNormalization, - kwargs={'axis': (-3, -2, -1)}, - input_shape=(2, 8, 8, 3)) - test_utils.layer_test( - keras.layers.LayerNormalization, - input_shape=(1, 0, 10)) - - @test_combinations.run_all_keras_modes - def test_non_fused_layernorm(self): - test_utils.layer_test( - keras.layers.LayerNormalization, - kwargs={'axis': -2}, - input_shape=(3, 4, 2)) - test_utils.layer_test( - keras.layers.LayerNormalization, - kwargs={'axis': (-3, -2)}, - input_shape=(2, 8, 8, 3)) - test_utils.layer_test( - 
keras.layers.LayerNormalization, - kwargs={'axis': (-3, -1)}, - input_shape=(2, 8, 8, 3)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_layernorm_weights(self): - layer = keras.layers.LayerNormalization(scale=False, center=False) - layer.build((None, 3, 4)) - self.assertEqual(len(layer.trainable_weights), 0) - self.assertEqual(len(layer.weights), 0) - - layer = keras.layers.LayerNormalization() - layer.build((None, 3, 4)) - self.assertEqual(len(layer.trainable_weights), 2) - self.assertEqual(len(layer.weights), 2) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_layernorm_regularization(self): - layer = keras.layers.LayerNormalization( - gamma_regularizer='l1', beta_regularizer='l1') - layer.build((None, 3, 4)) - self.assertEqual(len(layer.losses), 2) - max_norm = keras.constraints.max_norm - layer = keras.layers.LayerNormalization( - gamma_constraint=max_norm, beta_constraint=max_norm) - layer.build((None, 3, 4)) - self.assertEqual(layer.gamma.constraint, max_norm) - self.assertEqual(layer.beta.constraint, max_norm) - - @test_combinations.run_all_keras_modes - def test_layernorm_convnet_channel_last(self): +def _run_layernorm_correctness_test(layer, dtype="float32"): model = keras.models.Sequential() - norm = keras.layers.LayerNormalization(input_shape=(4, 4, 3)) + model.add(keras.layers.Lambda(lambda x: tf.cast(x, dtype="float16"))) + norm = layer(input_shape=(2, 2, 2), dtype=dtype) model.add(norm) model.compile( - loss='mse', + loss="mse", optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - run_eagerly=test_utils.should_run_eagerly()) + run_eagerly=test_utils.should_run_eagerly(), + ) # centered on 5.0, variance 10.0 - x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3)) + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2)).astype( + dtype + ) model.fit(x, x, epochs=4, verbose=0) out = model.predict(x) - out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3)) - out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3)) - - np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1) - np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1) - - @test_combinations.run_all_keras_modes - def test_layernorm_ragged_tensor(self): - x = tf.ragged.constant( - [[[3., 1., 1.], [4., 1., 1.]], - [[5., 9., 1.]], - [[1., 2., 1.]]], - inner_shape=(3,)) - layer = keras.layers.LayerNormalization() - self.assertEqual(layer(x).shape, (3, None, 3)) - - @test_combinations.run_all_keras_modes - def test_layernorm_correctness(self): - _run_layernorm_correctness_test( - layer_normalization.LayerNormalization, dtype='float32') - - @test_combinations.run_all_keras_modes - def test_layernorm_mixed_precision(self): - _run_layernorm_correctness_test( - layer_normalization.LayerNormalization, dtype='float16') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testIncorrectAxisType(self): - with self.assertRaisesRegex(TypeError, - r'Expected an int or a list/tuple of ints'): - _ = layer_normalization.LayerNormalization(axis={'axis': -1}) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInvalidAxis(self): - with self.assertRaisesRegex( - ValueError, - r'Invalid value for `axis` argument. 
Expected 0 <= axis < inputs.rank'): - layer_norm = layer_normalization.LayerNormalization(axis=3) - layer_norm.build(input_shape=(2, 2, 2)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testDuplicateAxis(self): - with self.assertRaisesRegex(ValueError, r'Duplicate axis:'): - layer_norm = layer_normalization.LayerNormalization(axis=[-1, -1]) - layer_norm.build(input_shape=(2, 2, 2)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testFusedAttr(self): - layer_norm = layer_normalization.LayerNormalization(axis=[-2, -1]) - layer_norm.build(input_shape=(2, 2, 2)) - self.assertEqual(layer_norm._fused, True) + out -= keras.backend.eval(norm.beta) + out /= keras.backend.eval(norm.gamma) + + np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) + np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) + + +class LayerNormalizationTest(test_combinations.TestCase): + @test_combinations.run_all_keras_modes + def test_basic_layernorm(self): + test_utils.layer_test( + keras.layers.LayerNormalization, + kwargs={ + "gamma_regularizer": keras.regularizers.l2(0.01), + "beta_regularizer": keras.regularizers.l2(0.01), + }, + input_shape=(3, 4, 2), + ) + test_utils.layer_test( + keras.layers.LayerNormalization, + kwargs={ + "gamma_initializer": "ones", + "beta_initializer": "ones", + }, + input_shape=(3, 4, 2), + ) + test_utils.layer_test( + keras.layers.LayerNormalization, + kwargs={"scale": False, "center": False}, + input_shape=(3, 3), + ) + test_utils.layer_test( + keras.layers.LayerNormalization, + kwargs={"axis": (-3, -2, -1)}, + input_shape=(2, 8, 8, 3), + ) + test_utils.layer_test( + keras.layers.LayerNormalization, input_shape=(1, 0, 10) + ) + + @test_combinations.run_all_keras_modes + def test_non_fused_layernorm(self): + test_utils.layer_test( + keras.layers.LayerNormalization, + kwargs={"axis": -2}, + input_shape=(3, 4, 2), + ) + test_utils.layer_test( + keras.layers.LayerNormalization, + kwargs={"axis": (-3, -2)}, + input_shape=(2, 8, 8, 3), + ) + test_utils.layer_test( + keras.layers.LayerNormalization, + kwargs={"axis": (-3, -1)}, + input_shape=(2, 8, 8, 3), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_layernorm_weights(self): + layer = keras.layers.LayerNormalization(scale=False, center=False) + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 0) + self.assertEqual(len(layer.weights), 0) + + layer = keras.layers.LayerNormalization() + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 2) + self.assertEqual(len(layer.weights), 2) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_layernorm_regularization(self): + layer = keras.layers.LayerNormalization( + gamma_regularizer="l1", beta_regularizer="l1" + ) + layer.build((None, 3, 4)) + self.assertEqual(len(layer.losses), 2) + max_norm = keras.constraints.max_norm + layer = keras.layers.LayerNormalization( + gamma_constraint=max_norm, beta_constraint=max_norm + ) + layer.build((None, 3, 4)) + self.assertEqual(layer.gamma.constraint, max_norm) + self.assertEqual(layer.beta.constraint, max_norm) + + @test_combinations.run_all_keras_modes + def test_layernorm_convnet_channel_last(self): + model = keras.models.Sequential() + norm = keras.layers.LayerNormalization(input_shape=(4, 4, 3)) + model.add(norm) + model.compile( + loss="mse", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + 
run_eagerly=test_utils.should_run_eagerly(), + ) + + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3)) + model.fit(x, x, epochs=4, verbose=0) + out = model.predict(x) + out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3)) + out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3)) + + np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1) + np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1) + + @test_combinations.run_all_keras_modes + def test_layernorm_ragged_tensor(self): + x = tf.ragged.constant( + [ + [[3.0, 1.0, 1.0], [4.0, 1.0, 1.0]], + [[5.0, 9.0, 1.0]], + [[1.0, 2.0, 1.0]], + ], + inner_shape=(3,), + ) + layer = keras.layers.LayerNormalization() + self.assertEqual(layer(x).shape, (3, None, 3)) + + @test_combinations.run_all_keras_modes + def test_layernorm_correctness(self): + _run_layernorm_correctness_test( + layer_normalization.LayerNormalization, dtype="float32" + ) + + @test_combinations.run_all_keras_modes + def test_layernorm_mixed_precision(self): + _run_layernorm_correctness_test( + layer_normalization.LayerNormalization, dtype="float16" + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testIncorrectAxisType(self): + with self.assertRaisesRegex( + TypeError, r"Expected an int or a list/tuple of ints" + ): + _ = layer_normalization.LayerNormalization(axis={"axis": -1}) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInvalidAxis(self): + with self.assertRaisesRegex( + ValueError, + r"Invalid value for `axis` argument. " + r"Expected 0 <= axis < inputs.rank", + ): + layer_norm = layer_normalization.LayerNormalization(axis=3) + layer_norm.build(input_shape=(2, 2, 2)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDuplicateAxis(self): + with self.assertRaisesRegex(ValueError, r"Duplicate axis:"): + layer_norm = layer_normalization.LayerNormalization(axis=[-1, -1]) + layer_norm.build(input_shape=(2, 2, 2)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testFusedAttr(self): + layer_norm = layer_normalization.LayerNormalization(axis=[-2, -1]) + layer_norm.build(input_shape=(2, 2, 2)) + self.assertEqual(layer_norm._fused, True) class LayerNormalizationNumericsTest(test_combinations.TestCase): - """Tests LayerNormalization has correct and numerically stable outputs.""" - - def _expected_layer_norm(self, x, beta, gamma, batch_input_shape, axis, - epsilon): - """Returns the layer norm, which is computed using NumPy.""" - broadcast_shape = [batch_input_shape[i] if i in axis else 1 - for i in range(len(batch_input_shape))] - mean = np.mean(x, axis=axis, keepdims=True) - var = np.var(x, axis=axis, keepdims=True) - expected = (x - mean) / np.sqrt(var + epsilon) - expected *= np.reshape(gamma, broadcast_shape) - expected += np.reshape(beta, broadcast_shape) - return expected - - def _test_forward_pass(self, batch_input_shape, axis, fp64_tol=1e-14, - fp32_tol=1e-6, fp16_tol=1e-2): - """Tests the forward pass of layer layer_normalization. - - Args: - batch_input_shape: The input shape that will be used to test, including - the batch dimension. - axis: A list of axes to normalize. Will be passed to the `axis` argument - of Layerlayer_normalization. - fp64_tol: The relative and absolute tolerance for float64. - fp32_tol: The relative and absolute tolerance for float32. 
- fp16_tol: The relative and absolute tolerance for float16. - """ - param_shape = [batch_input_shape[i] for i in axis] - param_elems = 1 - for dim in param_shape: - param_elems *= dim - beta = np.arange(param_elems, dtype='float64').reshape(param_shape) - gamma = np.arange(1, param_elems + 1, dtype='float64').reshape(param_shape) - x = np.random.normal(size=batch_input_shape) - - for epsilon in 1e-12, 1e-3: - expected = self._expected_layer_norm(x, beta, gamma, batch_input_shape, - axis, epsilon) - for dtype in 'float64', 'float32', 'float16': - norm = layer_normalization.LayerNormalization( - axis=axis, dtype=dtype, batch_input_shape=batch_input_shape, - epsilon=epsilon, beta_initializer=keras.initializers.constant(beta), - gamma_initializer=keras.initializers.constant(gamma)) - y = norm(keras.backend.cast(x, dtype)) - actual = keras.backend.eval(y) - - if dtype == 'float64': - tol = fp64_tol - elif dtype == 'float32': - tol = fp32_tol - else: - assert dtype == 'float16' - tol = fp16_tol - - # We use absolute tolerances in addition to relative tolerances, because - # some of the values are very close to zero. - self.assertAllClose(expected, actual, rtol=tol, atol=tol) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_forward(self): - # For numeric stability, we ensure the axis's dimension(s) have at least 4 - # elements. - self._test_forward_pass((4, 3), (0,)) - self._test_forward_pass((3, 4), (1,)) - self._test_forward_pass((4, 3, 2), (0,)) - self._test_forward_pass((2, 4, 2), (1,)) - self._test_forward_pass((2, 3, 4), (2,), fp16_tol=5e-2) - self._test_forward_pass((2, 3, 2), (0, 2)) - self._test_forward_pass((2, 2, 2, 2), (1, 3)) - self._test_forward_pass((2, 2, 2, 2), (2, 3)) - self._test_forward_pass((2, 3, 4, 5), (3,)) - - def _test_backward_pass(self, batch_input_shape, axis, fp64_tol=1e-5, - fp32_tol=1e-5, fp16_tol=2e-2): - """Tests the backwards pass of layer layer_normalization. - - Args: - batch_input_shape: The input shape that will be used to test, including - the batch dimension. - axis: A list of axes to normalize. Will be passed to the `axis` argument - of Layerlayer_normalization. - fp64_tol: The relative and absolute tolerance for float64. - fp32_tol: The relative and absolute tolerance for float32. - fp16_tol: The relative and absolute tolerance for float16. - """ - param_shape = [batch_input_shape[i] for i in axis] - param_elems = 1 - for dim in param_shape: - param_elems *= dim - beta = np.arange(param_elems, dtype='float64').reshape(param_shape) - gamma = np.arange(1, param_elems + 1, dtype='float64').reshape(param_shape) - x = np.random.normal(size=batch_input_shape) - - for epsilon in 1e-12, 1e-3: - # Float64 must come first in this list, as we use the float64 numerical - # gradients to compare to the float32 and float16 symbolic gradients as - # well. Computing float32/float16 numerical gradients is too numerically - # unstable. - for dtype in 'float64', 'float32', 'float16': - norm = layer_normalization.LayerNormalization( - axis=axis, dtype=dtype, batch_input_shape=batch_input_shape, - epsilon=epsilon, beta_initializer=keras.initializers.constant(beta), - gamma_initializer=keras.initializers.constant(gamma)) - norm.build(x.shape) - - # pylint: disable=cell-var-from-loop - def forward_fn(x, beta, gamma): - # We must monkey-patch the attributes of `norm` with the function - # arguments, so that the gradient checker will properly compute their - # gradients. 
The gradient checker computes gradients with respect to
-        # the input arguments of `f`.
-        with tf.compat.v1.test.mock.patch.object(norm, 'beta', beta):
-          with tf.compat.v1.test.mock.patch.object(norm, 'gamma', gamma):
-            return norm(x)
-        # pylint: enable=cell-var-from-loop
-        results = tf.test.compute_gradient(
-            forward_fn, [keras.backend.cast(x, dtype), norm.beta, norm.gamma])
-        ([x_grad_t, beta_grad_t, gamma_grad_t],
-         [x_grad_n, beta_grad_n, gamma_grad_n]) = results
-
-        if dtype == 'float64':
-          # We use the float64 numeric gradients as the reference, to compare
-          # against the symbolic gradients for all dtypes.
-          x_grad_ref = x_grad_n
-          beta_grad_ref = beta_grad_n
-          gamma_grad_ref = gamma_grad_n
-          tol = fp64_tol
-        elif dtype == 'float32':
-          tol = fp32_tol
-        else:
-          assert dtype == 'float16'
-          tol = fp16_tol
-
-        # We use absolute tolerances in addition to relative tolerances, because
-        # some of the values are very close to zero.
-        self.assertAllClose(x_grad_t, x_grad_ref, rtol=tol, atol=tol)
-        self.assertAllClose(beta_grad_t, beta_grad_ref, rtol=tol, atol=tol)
-        self.assertAllClose(gamma_grad_t, gamma_grad_ref, rtol=tol, atol=tol)
-
-  # The gradient_checker_v2 does not work properly with LayerNorm in graph mode.
-  @test_utils.run_v2_only
-  def test_backward(self):
-    # For numeric stability, we ensure the axis's dimension(s) have at least 4
-    # elements.
-    self._test_backward_pass((4, 3), (0,))
-    self._test_backward_pass((2, 4, 2), (1,))
-    self._test_backward_pass((2, 3, 4), (2,))
-    self._test_backward_pass((2, 3, 2), (0, 2), fp64_tol=5e-4, fp32_tol=5e-4)
-    self._test_backward_pass((2, 2, 2, 2), (1, 3))
-    self._test_backward_pass((2, 2, 2, 2), (2, 3))
-
-
-if __name__ == '__main__':
-  tf.test.main()
+    """Tests LayerNormalization has correct and numerically stable outputs."""
+
+    def _expected_layer_norm(
+        self, x, beta, gamma, batch_input_shape, axis, epsilon
+    ):
+        """Returns the layer norm, which is computed using NumPy."""
+        broadcast_shape = [
+            batch_input_shape[i] if i in axis else 1
+            for i in range(len(batch_input_shape))
+        ]
+        mean = np.mean(x, axis=axis, keepdims=True)
+        var = np.var(x, axis=axis, keepdims=True)
+        expected = (x - mean) / np.sqrt(var + epsilon)
+        expected *= np.reshape(gamma, broadcast_shape)
+        expected += np.reshape(beta, broadcast_shape)
+        return expected
+
+    def _test_forward_pass(
+        self,
+        batch_input_shape,
+        axis,
+        fp64_tol=1e-14,
+        fp32_tol=1e-6,
+        fp16_tol=1e-2,
+    ):
+        """Tests the forward pass of the LayerNormalization layer.
+
+        Args:
+          batch_input_shape: The input shape that will be used to test,
+            including the batch dimension.
+          axis: A list of axes to normalize. Will be passed to the `axis`
+            argument of LayerNormalization.
+          fp64_tol: The relative and absolute tolerance for float64.
+          fp32_tol: The relative and absolute tolerance for float32.
+          fp16_tol: The relative and absolute tolerance for float16.
+ """ + param_shape = [batch_input_shape[i] for i in axis] + param_elems = 1 + for dim in param_shape: + param_elems *= dim + beta = np.arange(param_elems, dtype="float64").reshape(param_shape) + gamma = np.arange(1, param_elems + 1, dtype="float64").reshape( + param_shape + ) + x = np.random.normal(size=batch_input_shape) + + for epsilon in 1e-12, 1e-3: + expected = self._expected_layer_norm( + x, beta, gamma, batch_input_shape, axis, epsilon + ) + for dtype in "float64", "float32", "float16": + norm = layer_normalization.LayerNormalization( + axis=axis, + dtype=dtype, + batch_input_shape=batch_input_shape, + epsilon=epsilon, + beta_initializer=keras.initializers.constant(beta), + gamma_initializer=keras.initializers.constant(gamma), + ) + y = norm(keras.backend.cast(x, dtype)) + actual = keras.backend.eval(y) + + if dtype == "float64": + tol = fp64_tol + elif dtype == "float32": + tol = fp32_tol + else: + assert dtype == "float16" + tol = fp16_tol + + # We use absolute tolerances in addition to relative tolerances, + # because some of the values are very close to zero. + self.assertAllClose(expected, actual, rtol=tol, atol=tol) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_forward(self): + # For numeric stability, we ensure the axis's dimension(s) have at least + # 4 elements. + self._test_forward_pass((4, 3), (0,)) + self._test_forward_pass((3, 4), (1,)) + self._test_forward_pass((4, 3, 2), (0,)) + self._test_forward_pass((2, 4, 2), (1,)) + self._test_forward_pass((2, 3, 4), (2,), fp16_tol=5e-2) + self._test_forward_pass((2, 3, 2), (0, 2)) + self._test_forward_pass((2, 2, 2, 2), (1, 3)) + self._test_forward_pass((2, 2, 2, 2), (2, 3)) + self._test_forward_pass((2, 3, 4, 5), (3,)) + + def _test_backward_pass( + self, + batch_input_shape, + axis, + fp64_tol=1e-5, + fp32_tol=1e-5, + fp16_tol=2e-2, + ): + """Tests the backwards pass of layer layer_normalization. + + Args: + batch_input_shape: The input shape that will be used to test, + including the batch dimension. + axis: A list of axes to normalize. Will be passed to the `axis` + argument of Layerlayer_normalization. + fp64_tol: The relative and absolute tolerance for float64. + fp32_tol: The relative and absolute tolerance for float32. + fp16_tol: The relative and absolute tolerance for float16. + """ + param_shape = [batch_input_shape[i] for i in axis] + param_elems = 1 + for dim in param_shape: + param_elems *= dim + beta = np.arange(param_elems, dtype="float64").reshape(param_shape) + gamma = np.arange(1, param_elems + 1, dtype="float64").reshape( + param_shape + ) + x = np.random.normal(size=batch_input_shape) + + for epsilon in 1e-12, 1e-3: + # Float64 must come first in this list, as we use the float64 + # numerical gradients to compare to the float32 and float16 symbolic + # gradients as well. Computing float32/float16 numerical gradients + # is too numerically unstable. + for dtype in "float64", "float32", "float16": + norm = layer_normalization.LayerNormalization( + axis=axis, + dtype=dtype, + batch_input_shape=batch_input_shape, + epsilon=epsilon, + beta_initializer=keras.initializers.constant(beta), + gamma_initializer=keras.initializers.constant(gamma), + ) + norm.build(x.shape) + + def forward_fn(x, beta, gamma): + # We must monkey-patch the attributes of `norm` with the + # function arguments, so that the gradient checker will + # properly compute their gradients. The gradient checker + # computes gradients with respect to the input arguments of + # `f`. 
+ with tf.compat.v1.test.mock.patch.object( + norm, "beta", beta + ): + with tf.compat.v1.test.mock.patch.object( + norm, "gamma", gamma + ): + return norm(x) + + results = tf.test.compute_gradient( + forward_fn, + [keras.backend.cast(x, dtype), norm.beta, norm.gamma], + ) + ( + [x_grad_t, beta_grad_t, gamma_grad_t], + [x_grad_n, beta_grad_n, gamma_grad_n], + ) = results + + if dtype == "float64": + # We use the float64 numeric gradients as the reference, to + # compare against the symbolic gradients for all dtypes. + x_grad_ref = x_grad_n + beta_grad_ref = beta_grad_n + gamma_grad_ref = gamma_grad_n + tol = fp64_tol + elif dtype == "float32": + tol = fp32_tol + else: + assert dtype == "float16" + tol = fp16_tol + + # We use absolute tolerances in addition to relative tolerances, + # because some of the values are very close to zero. + self.assertAllClose(x_grad_t, x_grad_ref, rtol=tol, atol=tol) + self.assertAllClose( + beta_grad_t, beta_grad_ref, rtol=tol, atol=tol + ) + self.assertAllClose( + gamma_grad_t, gamma_grad_ref, rtol=tol, atol=tol + ) + + # The gradient_checker_v2 does not work properly with LayerNorm in graph + # mode. + @test_utils.run_v2_only + def test_backward(self): + # For numeric stability, we ensure the axis's dimension(s) have at least + # 4 elements. + self._test_backward_pass((4, 3), (0,)) + self._test_backward_pass((2, 4, 2), (1,)) + self._test_backward_pass((2, 3, 4), (2,)) + self._test_backward_pass( + (2, 3, 2), (0, 2), fp64_tol=5e-4, fp32_tol=5e-4 + ) + self._test_backward_pass((2, 2, 2, 2), (1, 3)) + self._test_backward_pass((2, 2, 2, 2), (2, 3)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/normalization/spectral_normalization.py b/keras/layers/normalization/spectral_normalization.py new file mode 100644 index 000000000000..c958cd4a79ac --- /dev/null +++ b/keras/layers/normalization/spectral_normalization.py @@ -0,0 +1,141 @@ +# Copyright 2023 The Keras Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow.compat.v2 as tf + +from keras.initializers import TruncatedNormal +from keras.layers.rnn import Wrapper + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +# Adapted from TF-Addons implementation +@keras_export("keras.layers.SpectralNormalization", v1=[]) +class SpectralNormalization(Wrapper): + """Performs spectral normalization on the weights of a target layer. + + This wrapper controls the Lipschitz constant of the weights of a layer by + constraining their spectral norm, which can stabilize the training of GANs. + + Args: + layer: A `keras.layers.Layer` instance that + has either a `kernel` (e.g. `Conv2D`, `Dense`...) + or an `embeddings` attribute (`Embedding` layer). + power_iterations: int, the number of iterations during normalization. 
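# Illustrative NumPy sketch (not part of the patch) of the power iteration
# that `normalize_weights` below implements: u and v converge to the top
# singular vectors of the reshaped kernel W, sigma = v @ W @ u.T converges to
# its largest singular value, and W / sigma then has spectral norm ~1.
import numpy as np

rng = np.random.default_rng(0)
w = rng.normal(size=(8, 4))  # kernel reshaped to [-1, output_dim]
u = rng.normal(size=(1, 4))  # persistent estimate, like `vector_u`
for _ in range(50):
    v = u @ w.T
    v /= np.linalg.norm(v)
    u = v @ w
    u /= np.linalg.norm(u)
sigma = (v @ w @ u.T).item()
print(sigma, np.linalg.svd(w, compute_uv=False)[0])  # nearly identical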
+
+    Examples:
+
+    Wrap `keras.layers.Conv2D`:
+    >>> x = np.random.rand(1, 10, 10, 1)
+    >>> conv2d = SpectralNormalization(tf.keras.layers.Conv2D(2, 2))
+    >>> y = conv2d(x)
+    >>> y.shape
+    TensorShape([1, 9, 9, 2])
+
+    Wrap `keras.layers.Dense`:
+    >>> x = np.random.rand(1, 10, 10, 1)
+    >>> dense = SpectralNormalization(tf.keras.layers.Dense(10))
+    >>> y = dense(x)
+    >>> y.shape
+    TensorShape([1, 10, 10, 10])
+
+    Reference:
+
+    - [Spectral Normalization for GAN](https://arxiv.org/abs/1802.05957).
+    """
+
+    def __init__(self, layer, power_iterations=1, **kwargs):
+        super().__init__(layer, **kwargs)
+        if power_iterations <= 0:
+            raise ValueError(
+                "`power_iterations` should be greater than zero. Received: "
+                f"`power_iterations={power_iterations}`"
+            )
+        self.power_iterations = power_iterations
+
+    def build(self, input_shape):
+        super().build(input_shape)
+        input_shape = tf.TensorShape(input_shape)
+        self.input_spec = tf.keras.layers.InputSpec(
+            shape=[None] + input_shape[1:]
+        )
+
+        if hasattr(self.layer, "kernel"):
+            self.kernel = self.layer.kernel
+        elif hasattr(self.layer, "embeddings"):
+            self.kernel = self.layer.embeddings
+        else:
+            raise ValueError(
+                f"{type(self.layer).__name__} object has no attribute 'kernel' "
+                "nor 'embeddings'"
+            )
+
+        self.kernel_shape = self.kernel.shape.as_list()
+
+        self.vector_u = self.add_weight(
+            shape=(1, self.kernel_shape[-1]),
+            initializer=TruncatedNormal(stddev=0.02),
+            trainable=False,
+            name="vector_u",
+            dtype=self.kernel.dtype,
+        )
+
+    def call(self, inputs, training=False):
+        if training:
+            self.normalize_weights()
+
+        output = self.layer(inputs)
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return tf.TensorShape(
+            self.layer.compute_output_shape(input_shape).as_list()
+        )
+
+    def normalize_weights(self):
+        """Generate spectral normalized weights.
+
+        This method will update the value of `self.kernel` with the
+        spectral normalized value, so that the layer is ready for `call()`.
+        """
+
+        weights = tf.reshape(self.kernel, [-1, self.kernel_shape[-1]])
+        vector_u = self.vector_u
+
+        # Skip the update when the kernel is all zeros, since sigma would be
+        # zero and the division below undefined.
+        if not tf.reduce_all(tf.equal(weights, 0.0)):
+            for _ in range(self.power_iterations):
+                vector_v = tf.math.l2_normalize(
+                    tf.matmul(vector_u, weights, transpose_b=True)
+                )
+                vector_u = tf.math.l2_normalize(tf.matmul(vector_v, weights))
+            vector_u = tf.stop_gradient(vector_u)
+            vector_v = tf.stop_gradient(vector_v)
+            sigma = tf.matmul(
+                tf.matmul(vector_v, weights), vector_u, transpose_b=True
+            )
+            self.vector_u.assign(tf.cast(vector_u, self.vector_u.dtype))
+            self.kernel.assign(
+                tf.cast(
+                    tf.reshape(self.kernel / sigma, self.kernel_shape),
+                    self.kernel.dtype,
+                )
+            )
+
+    def get_config(self):
+        config = {"power_iterations": self.power_iterations}
+        base_config = super().get_config()
+        return {**base_config, **config}
diff --git a/keras/layers/normalization/spectral_normalization_test.py b/keras/layers/normalization/spectral_normalization_test.py
new file mode 100644
index 000000000000..555850291af3
--- /dev/null
+++ b/keras/layers/normalization/spectral_normalization_test.py
@@ -0,0 +1,184 @@
+# Copyright 2023 The Keras Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf +from absl.testing import parameterized + +import keras +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + + +class SpectralNormalizationTest(test_combinations.TestCase): + @test_combinations.run_all_keras_modes + def test_basic_spectralnorm(self): + test_utils.layer_test( + keras.layers.SpectralNormalization, + kwargs={"layer": keras.layers.Dense(2), "input_shape": (3, 4)}, + input_data=tf.random.uniform((10, 3, 4)), + ) + + @test_combinations.run_all_keras_modes + def test_from_to_config(self): + base_layer = keras.layers.Dense(1) + sn = keras.layers.SpectralNormalization(base_layer) + config = sn.get_config() + + new_sn = keras.layers.SpectralNormalization.from_config(config) + self.assertEqual(sn.power_iterations, new_sn.power_iterations) + + @test_combinations.run_all_keras_modes + def test_save_load_model(self): + base_layer = keras.layers.Dense(1) + input_shape = [1] + + inputs = keras.layers.Input(shape=input_shape) + sn_layer = keras.layers.SpectralNormalization(base_layer) + model = keras.models.Sequential(layers=[inputs, sn_layer]) + + # initialize model + model.predict(tf.random.uniform((2, 1))) + + with self.subTest("h5"): + model.save("test.h5") + new_model = keras.models.load_model("test.h5") + + self.assertEqual( + model.layers[0].get_config(), new_model.layers[0].get_config() + ) + with self.subTest("savedmodel"): + model.save("test") + new_model = keras.models.load_model("test") + + self.assertEqual( + model.layers[0].get_config(), new_model.layers[0].get_config() + ) + with self.subTest("keras_v3"): + model.save("test.keras") + new_model = keras.models.load_model("test.keras") + + self.assertEqual( + model.layers[0].get_config(), new_model.layers[0].get_config() + ) + + @test_combinations.run_all_keras_modes + def test_normalization(self): + inputs = keras.layers.Input(shape=[2, 2, 1]) + + base_layer = keras.layers.Conv2D( + 1, (2, 2), kernel_initializer=tf.constant_initializer(value=2) + ) + sn_layer = keras.layers.SpectralNormalization(base_layer) + model = keras.models.Sequential(layers=[inputs, sn_layer]) + + weights = tf.squeeze(model.layers[0].w.numpy()) + # This wrapper normalizes weights by the maximum eigen value + eigen_val, _ = tf.linalg.eig(weights) + weights_normalized = weights / tf.reduce_max(eigen_val) + + for training in [False, True]: + _ = model( + tf.constant(tf.ones((1, 2, 2, 1), dtype=tf.float32)), + training=training, + ) + if training: + w = weights_normalized + else: + w = weights + self.assertAllClose(w, tf.squeeze(model.layers[0].w.numpy())) + + @test_combinations.run_all_keras_modes + def test_apply_layer(self): + images = tf.ones((1, 2, 2, 1)) + sn_wrapper = keras.layers.SpectralNormalization( + keras.layers.Conv2D( + 1, [2, 2], kernel_initializer=tf.constant_initializer(value=1) + ), + input_shape=(2, 2, 1), + ) + + result = sn_wrapper(images, training=False) + result_train = sn_wrapper(images, training=True) + expected_output = tf.constant([[[[4.0]]]], dtype=tf.float32) + + 
self.assertAllClose(result, expected_output) + # max eigen value of 2x2 matrix of ones is 2 + self.assertAllClose(result_train, expected_output / 2) + self.assertTrue(hasattr(sn_wrapper, "u")) + + @test_combinations.run_all_keras_modes + def test_no_layer(self): + images = tf.random.uniform((2, 4, 43)) + with self.assertRaises(AssertionError): + keras.layers.SpectralNormalization(images) + + @test_combinations.run_all_keras_modes + def test_no_kernel(self): + with self.assertRaises(AttributeError): + keras.layers.SpectralNormalization( + keras.layers.MaxPooling2D(2, 2) + ).build((2, 2)) + + @parameterized.parameters( + [ + (lambda: keras.layers.Dense(2), [3, 2]), + ( + lambda: keras.layers.Conv2D(3, (2, 2), padding="same"), + [4, 4, 3], + ), + (lambda: keras.layers.Embedding(2, 10), [2]), + ], + ) + @test_combinations.run_all_keras_modes + def test_model_build(self, base_layer_fn, input_shape): + inputs = keras.layers.Input(shape=input_shape) + base_layer = base_layer_fn() + sn_layer = keras.layers.SpectralNormalization(base_layer) + model = keras.models.Sequential(layers=[inputs, sn_layer]) + model.build() + self.assertTrue(hasattr(model.layers[0], "vector_u")) + + @parameterized.parameters( + [ + (lambda: keras.layers.Dense(2), [3, 2], [3, 2]), + ( + lambda: keras.layers.Conv2D(3, (2, 2), padding="same"), + [4, 4, 3], + [4, 4, 3], + ), + (lambda: keras.layers.Embedding(2, 10), [2], [2, 10]), + ], + ) + @test_combinations.run_all_keras_modes + def test_model_fit(self, base_layer_fn, input_shape, output_shape): + inputs = keras.layers.Input(shape=input_shape) + base_layer = base_layer_fn() + + sn_layer = keras.layers.SpectralNormalization(base_layer) + model = keras.models.Sequential(layers=[inputs, sn_layer]) + model.add(keras.layers.Activation("relu")) + + model.compile( + optimizer=keras.optimizers.RMSprop(learning_rate=0.001), + loss="mse", + ) + model.fit( + tf.random.uniform((2, *input_shape)), + tf.random.uniform((2, *output_shape)), + epochs=3, + batch_size=10, + verbose=0, + ) + self.assertTrue(hasattr(model.layers[0], "vector_u")) diff --git a/keras/layers/normalization/unit_normalization.py b/keras/layers/normalization/unit_normalization.py index f8f7cd1421f2..eb1746fdde15 100644 --- a/keras/layers/normalization/unit_normalization.py +++ b/keras/layers/normalization/unit_normalization.py @@ -13,65 +13,63 @@ # limitations under the License. # ============================================================================== """Unit Normalization layer.""" -# pylint: disable=g-bad-import-order -# pylint: disable=g-classes-have-attributes import tensorflow.compat.v2 as tf from keras.engine import base_layer from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.UnitNormalization', v1=[]) +@keras_export("keras.layers.UnitNormalization", v1=[]) class UnitNormalization(base_layer.Layer): - """Unit normalization layer. + """Unit normalization layer. - Normalize a batch of inputs so that each input in the batch has a L2 norm - equal to 1 (across the axes specified in `axis`). + Normalize a batch of inputs so that each input in the batch has a L2 norm + equal to 1 (across the axes specified in `axis`). 
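# Quick equivalence sketch (illustrative): unit-normalizing over `axis` is
# x / sqrt(sum(x**2, axis, keepdims=True)), up to the small epsilon guard
# inside tf.linalg.l2_normalize.
import numpy as np
import tensorflow as tf

x = np.arange(1, 7, dtype="float32").reshape(2, 3)
manual = x / np.sqrt((x**2).sum(axis=-1, keepdims=True))
layer_out = tf.keras.layers.UnitNormalization(axis=-1)(x).numpy()
np.testing.assert_allclose(layer_out, manual, rtol=1e-6)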
- Example: + Example: - >>> data = tf.constant(np.arange(6).reshape(2, 3), dtype=tf.float32) - >>> normalized_data = tf.keras.layers.UnitNormalization()(data) - >>> print(tf.reduce_sum(normalized_data[0, :] ** 2).numpy()) - 1.0 + >>> data = tf.constant(np.arange(6).reshape(2, 3), dtype=tf.float32) + >>> normalized_data = tf.keras.layers.UnitNormalization()(data) + >>> print(tf.reduce_sum(normalized_data[0, :] ** 2).numpy()) + 1.0 - Args: - axis: Integer or list/tuple. The axis or axes to normalize across. Typically - this is the features axis or axes. The left-out axes are typically the - batch axis or axes. Defaults to `-1`, the last dimension in - the input. - """ + Args: + axis: Integer or list/tuple. The axis or axes to normalize across. + Typically, this is the features axis or axes. The left-out axes are + typically the batch axis or axes. `-1` is the last dimension + in the input. Defaults to `-1`. + """ - def __init__(self, - axis=-1, - **kwargs): - super().__init__(**kwargs) - if isinstance(axis, (list, tuple)): - self.axis = list(axis) - elif isinstance(axis, int): - self.axis = axis - else: - raise TypeError( - 'Invalid value for `axis` argument: ' - 'expected an int or a list/tuple of ints. ' - f'Received: axis={axis}') - self.supports_masking = True + def __init__(self, axis=-1, **kwargs): + super().__init__(**kwargs) + if isinstance(axis, (list, tuple)): + self.axis = list(axis) + elif isinstance(axis, int): + self.axis = axis + else: + raise TypeError( + "Invalid value for `axis` argument: " + "expected an int or a list/tuple of ints. " + f"Received: axis={axis}" + ) + self.supports_masking = True - def build(self, input_shape): - self.axis = tf_utils.validate_axis(self.axis, input_shape) + def build(self, input_shape): + self.axis = tf_utils.validate_axis(self.axis, input_shape) - def call(self, inputs): - inputs = tf.cast(inputs, self.compute_dtype) - return tf.linalg.l2_normalize(inputs, axis=self.axis) + def call(self, inputs): + inputs = tf.cast(inputs, self.compute_dtype) + return tf.linalg.l2_normalize(inputs, axis=self.axis) - def compute_output_shape(self, input_shape): - return input_shape + def compute_output_shape(self, input_shape): + return input_shape - def get_config(self): - config = super().get_config() - config.update({'axis': self.axis}) - return config + def get_config(self): + config = super().get_config() + config.update({"axis": self.axis}) + return config diff --git a/keras/layers/normalization/unit_normalization_test.py b/keras/layers/normalization/unit_normalization_test.py index 4edc375e1280..386d5a043d03 100644 --- a/keras/layers/normalization/unit_normalization_test.py +++ b/keras/layers/normalization/unit_normalization_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================== """Tests for Unit Normalization layer.""" -# pylint: disable=g-bad-import-order + import tensorflow.compat.v2 as tf @@ -23,56 +23,57 @@ def squared_l2_norm(x): - return tf.reduce_sum(x ** 2) + return tf.reduce_sum(x**2) @test_utils.run_v2_only class UnitNormalizationTest(test_combinations.TestCase): + @test_combinations.run_all_keras_modes + def test_basics(self): + test_utils.layer_test( + keras.layers.UnitNormalization, + kwargs={"axis": -1}, + input_shape=(2, 3), + ) + test_utils.layer_test( + keras.layers.UnitNormalization, + kwargs={"axis": (1, 2)}, + input_shape=(1, 3, 3), + ) - @test_combinations.run_all_keras_modes - def test_basics(self): - test_utils.layer_test( - keras.layers.UnitNormalization, - kwargs={'axis': -1}, - input_shape=(2, 3)) - test_utils.layer_test( - keras.layers.UnitNormalization, - kwargs={'axis': (1, 2)}, - input_shape=(1, 3, 3)) - - def test_correctness(self): - layer = keras.layers.UnitNormalization(axis=-1) - inputs = tf.random.normal(shape=(2, 3)) - outputs = layer(inputs).numpy() - self.assertAllClose(squared_l2_norm(outputs[0, :]), 1.) - self.assertAllClose(squared_l2_norm(outputs[1, :]), 1.) + def test_correctness(self): + layer = keras.layers.UnitNormalization(axis=-1) + inputs = tf.random.normal(shape=(2, 3)) + outputs = layer(inputs).numpy() + self.assertAllClose(squared_l2_norm(outputs[0, :]), 1.0) + self.assertAllClose(squared_l2_norm(outputs[1, :]), 1.0) - layer = keras.layers.UnitNormalization(axis=(1, 2)) - inputs = tf.random.normal(shape=(2, 3, 3)) - outputs = layer(inputs).numpy() - self.assertAllClose(squared_l2_norm(outputs[0, :, :]), 1.) - self.assertAllClose(squared_l2_norm(outputs[1, :, :]), 1.) + layer = keras.layers.UnitNormalization(axis=(1, 2)) + inputs = tf.random.normal(shape=(2, 3, 3)) + outputs = layer(inputs).numpy() + self.assertAllClose(squared_l2_norm(outputs[0, :, :]), 1.0) + self.assertAllClose(squared_l2_norm(outputs[1, :, :]), 1.0) - layer = keras.layers.UnitNormalization(axis=1) - inputs = tf.random.normal(shape=(2, 3, 2)) - outputs = layer(inputs).numpy() - self.assertAllClose(squared_l2_norm(outputs[0, :, 0]), 1.) - self.assertAllClose(squared_l2_norm(outputs[1, :, 0]), 1.) - self.assertAllClose(squared_l2_norm(outputs[0, :, 1]), 1.) - self.assertAllClose(squared_l2_norm(outputs[1, :, 1]), 1.) 
+ layer = keras.layers.UnitNormalization(axis=1) + inputs = tf.random.normal(shape=(2, 3, 2)) + outputs = layer(inputs).numpy() + self.assertAllClose(squared_l2_norm(outputs[0, :, 0]), 1.0) + self.assertAllClose(squared_l2_norm(outputs[1, :, 0]), 1.0) + self.assertAllClose(squared_l2_norm(outputs[0, :, 1]), 1.0) + self.assertAllClose(squared_l2_norm(outputs[1, :, 1]), 1.0) - def testInvalidAxis(self): - with self.assertRaisesRegex( - TypeError, - r'Invalid value for `axis` argument'): - layer = keras.layers.UnitNormalization(axis=None) + def testInvalidAxis(self): + with self.assertRaisesRegex( + TypeError, r"Invalid value for `axis` argument" + ): + layer = keras.layers.UnitNormalization(axis=None) - with self.assertRaisesRegex( - ValueError, - r'Invalid value for `axis` argument'): - layer = keras.layers.UnitNormalization(axis=3) - layer.build(input_shape=(2, 2, 2)) + with self.assertRaisesRegex( + ValueError, r"Invalid value for `axis` argument" + ): + layer = keras.layers.UnitNormalization(axis=3) + layer.build(input_shape=(2, 2, 2)) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/pooling/BUILD b/keras/layers/pooling/BUILD index 7aac954fe715..d622f7138420 100644 --- a/keras/layers/pooling/BUILD +++ b/keras/layers/pooling/BUILD @@ -1,15 +1,17 @@ # Description: # Contains the Keras pooling layers. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/py/tensorflow_gnn:__subpackages__", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", ], diff --git a/keras/layers/pooling/__init__.py b/keras/layers/pooling/__init__.py index f69751662192..d70383f39eb2 100644 --- a/keras/layers/pooling/__init__.py +++ b/keras/layers/pooling/__init__.py @@ -13,32 +13,31 @@ # limitations under the License. # ============================================================================== """Keras Pooling layers.""" -# pylint: disable=g-bad-import-order + +# Pooling layer aliases. # Pooling layers. 
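# The aliases grouped into the imports below (AvgPool1D for AveragePooling1D,
# GlobalMaxPool2D for GlobalMaxPooling2D, and so on) are the same class
# objects exported under two names, so this sanity check should hold:
import tensorflow as tf

assert tf.keras.layers.AvgPool1D is tf.keras.layers.AveragePooling1D
assert tf.keras.layers.GlobalMaxPool2D is tf.keras.layers.GlobalMaxPooling2D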
from keras.layers.pooling.average_pooling1d import AveragePooling1D +from keras.layers.pooling.average_pooling1d import AvgPool1D from keras.layers.pooling.average_pooling2d import AveragePooling2D +from keras.layers.pooling.average_pooling2d import AvgPool2D from keras.layers.pooling.average_pooling3d import AveragePooling3D -from keras.layers.pooling.max_pooling1d import MaxPooling1D -from keras.layers.pooling.max_pooling2d import MaxPooling2D -from keras.layers.pooling.max_pooling3d import MaxPooling3D +from keras.layers.pooling.average_pooling3d import AvgPool3D from keras.layers.pooling.global_average_pooling1d import GlobalAveragePooling1D +from keras.layers.pooling.global_average_pooling1d import GlobalAvgPool1D from keras.layers.pooling.global_average_pooling2d import GlobalAveragePooling2D +from keras.layers.pooling.global_average_pooling2d import GlobalAvgPool2D from keras.layers.pooling.global_average_pooling3d import GlobalAveragePooling3D +from keras.layers.pooling.global_average_pooling3d import GlobalAvgPool3D +from keras.layers.pooling.global_max_pooling1d import GlobalMaxPool1D from keras.layers.pooling.global_max_pooling1d import GlobalMaxPooling1D +from keras.layers.pooling.global_max_pooling2d import GlobalMaxPool2D from keras.layers.pooling.global_max_pooling2d import GlobalMaxPooling2D +from keras.layers.pooling.global_max_pooling3d import GlobalMaxPool3D from keras.layers.pooling.global_max_pooling3d import GlobalMaxPooling3D - -# Pooling layer aliases. -from keras.layers.pooling.average_pooling1d import AvgPool1D -from keras.layers.pooling.average_pooling2d import AvgPool2D -from keras.layers.pooling.average_pooling3d import AvgPool3D from keras.layers.pooling.max_pooling1d import MaxPool1D +from keras.layers.pooling.max_pooling1d import MaxPooling1D from keras.layers.pooling.max_pooling2d import MaxPool2D +from keras.layers.pooling.max_pooling2d import MaxPooling2D from keras.layers.pooling.max_pooling3d import MaxPool3D -from keras.layers.pooling.global_average_pooling1d import GlobalAvgPool1D -from keras.layers.pooling.global_average_pooling2d import GlobalAvgPool2D -from keras.layers.pooling.global_average_pooling3d import GlobalAvgPool3D -from keras.layers.pooling.global_max_pooling1d import GlobalMaxPool1D -from keras.layers.pooling.global_max_pooling2d import GlobalMaxPool2D -from keras.layers.pooling.global_max_pooling3d import GlobalMaxPool3D +from keras.layers.pooling.max_pooling3d import MaxPooling3D diff --git a/keras/layers/pooling/average_pooling1d.py b/keras/layers/pooling/average_pooling1d.py index 7c4a762d62ba..a4b3a9c6d22c 100644 --- a/keras/layers/pooling/average_pooling1d.py +++ b/keras/layers/pooling/average_pooling1d.py @@ -13,126 +13,134 @@ # limitations under the License. # ============================================================================== """Average pooling 1D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import functools from keras import backend from keras.layers.pooling.base_pooling1d import Pooling1D +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D') +@keras_export("keras.layers.AveragePooling1D", "keras.layers.AvgPool1D") class AveragePooling1D(Pooling1D): - """Average pooling for temporal data. - - Downsamples the input representation by taking the average value over the - window defined by `pool_size`. The window is shifted by `strides`. 
The - resulting output when using "valid" padding option has a shape of: - `output_shape = (input_shape - pool_size + 1) / strides)` - - The resulting output shape when using the "same" padding option is: - `output_shape = input_shape / strides` - - For example, for strides=1 and padding="valid": - - >>> x = tf.constant([1., 2., 3., 4., 5.]) - >>> x = tf.reshape(x, [1, 5, 1]) - >>> x - - >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, - ... strides=1, padding='valid') - >>> avg_pool_1d(x) - - - For example, for strides=2 and padding="valid": - - >>> x = tf.constant([1., 2., 3., 4., 5.]) - >>> x = tf.reshape(x, [1, 5, 1]) - >>> x - - >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, - ... strides=2, padding='valid') - >>> avg_pool_1d(x) - - - For example, for strides=1 and padding="same": - - >>> x = tf.constant([1., 2., 3., 4., 5.]) - >>> x = tf.reshape(x, [1, 5, 1]) - >>> x - - >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, - ... strides=1, padding='same') - >>> avg_pool_1d(x) - - - Args: - pool_size: Integer, size of the average pooling windows. - strides: Integer, or None. Factor by which to downscale. - E.g. 2 will halve the input. - If None, it will default to `pool_size`. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, steps, features)` while `channels_first` - corresponds to inputs with shape - `(batch, features, steps)`. - - Input shape: - - If `data_format='channels_last'`: - 3D tensor with shape `(batch_size, steps, features)`. - - If `data_format='channels_first'`: - 3D tensor with shape `(batch_size, features, steps)`. - - Output shape: - - If `data_format='channels_last'`: - 3D tensor with shape `(batch_size, downsampled_steps, features)`. - - If `data_format='channels_first'`: - 3D tensor with shape `(batch_size, features, downsampled_steps)`. - """ - - def __init__(self, pool_size=2, strides=None, - padding='valid', data_format='channels_last', **kwargs): - super().__init__( - functools.partial(backend.pool2d, pool_mode='avg'), - pool_size=pool_size, - strides=strides, - padding=padding, - data_format=data_format, - **kwargs) + """Average pooling for temporal data. + + Downsamples the input representation by taking the average value over the + window defined by `pool_size`. The window is shifted by `strides`. The + resulting output when using "valid" padding option has a shape of: + `output_shape = (input_shape - pool_size + 1) / strides)` + + The resulting output shape when using the "same" padding option is: + `output_shape = input_shape / strides` + + For example, for strides=1 and padding="valid": + + >>> x = tf.constant([1., 2., 3., 4., 5.]) + >>> x = tf.reshape(x, [1, 5, 1]) + >>> x + + >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, + ... strides=1, padding='valid') + >>> avg_pool_1d(x) + + + For example, for strides=2 and padding="valid": + + >>> x = tf.constant([1., 2., 3., 4., 5.]) + >>> x = tf.reshape(x, [1, 5, 1]) + >>> x + + >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, + ... 
strides=2, padding='valid') + >>> avg_pool_1d(x) + + + For example, for strides=1 and padding="same": + + >>> x = tf.constant([1., 2., 3., 4., 5.]) + >>> x = tf.reshape(x, [1, 5, 1]) + >>> x + + >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, + ... strides=1, padding='same') + >>> avg_pool_1d(x) + + + Args: + pool_size: Integer, size of the average pooling windows. + strides: Integer, or None. Factor by which to downscale. + E.g. 2 will halve the input. + If None, it will default to `pool_size`. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, steps, features)` while `channels_first` + corresponds to inputs with shape + `(batch, features, steps)`. + + Input shape: + - If `data_format='channels_last'`: + 3D tensor with shape `(batch_size, steps, features)`. + - If `data_format='channels_first'`: + 3D tensor with shape `(batch_size, features, steps)`. + + Output shape: + - If `data_format='channels_last'`: + 3D tensor with shape `(batch_size, downsampled_steps, features)`. + - If `data_format='channels_first'`: + 3D tensor with shape `(batch_size, features, downsampled_steps)`. + """ + + def __init__( + self, + pool_size=2, + strides=None, + padding="valid", + data_format="channels_last", + **kwargs + ): + super().__init__( + functools.partial(backend.pool2d, pool_mode="avg"), + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + **kwargs + ) # Alias diff --git a/keras/layers/pooling/average_pooling2d.py b/keras/layers/pooling/average_pooling2d.py index 9c8375cdf8ca..662ec99016e6 100644 --- a/keras/layers/pooling/average_pooling2d.py +++ b/keras/layers/pooling/average_pooling2d.py @@ -13,126 +13,135 @@ # limitations under the License. # ============================================================================== """Average pooling 2D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import -from keras.layers.pooling.base_pooling2d import Pooling2D + import tensorflow.compat.v2 as tf +from keras.layers.pooling.base_pooling2d import Pooling2D + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D') +@keras_export("keras.layers.AveragePooling2D", "keras.layers.AvgPool2D") class AveragePooling2D(Pooling2D): - """Average pooling operation for spatial data. - - Downsamples the input along its spatial dimensions (height and width) - by taking the average value over an input window - (of size defined by `pool_size`) for each channel of the input. - The window is shifted by `strides` along each dimension. - - The resulting output when using `"valid"` padding option has a shape - (number of rows or columns) of: - `output_shape = math.floor((input_shape - pool_size) / strides) + 1` - (when `input_shape >= pool_size`) - - The resulting output shape when using the `"same"` padding option is: - `output_shape = math.floor((input_shape - 1) / strides) + 1` - - For example, for `strides=(1, 1)` and `padding="valid"`: - - >>> x = tf.constant([[1., 2., 3.], - ... [4., 5., 6.], - ... 
[7., 8., 9.]]) - >>> x = tf.reshape(x, [1, 3, 3, 1]) - >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), - ... strides=(1, 1), padding='valid') - >>> avg_pool_2d(x) - - - For example, for `stride=(2, 2)` and `padding="valid"`: - - >>> x = tf.constant([[1., 2., 3., 4.], - ... [5., 6., 7., 8.], - ... [9., 10., 11., 12.]]) - >>> x = tf.reshape(x, [1, 3, 4, 1]) - >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), - ... strides=(2, 2), padding='valid') - >>> avg_pool_2d(x) - - - For example, for `strides=(1, 1)` and `padding="same"`: - - >>> x = tf.constant([[1., 2., 3.], - ... [4., 5., 6.], - ... [7., 8., 9.]]) - >>> x = tf.reshape(x, [1, 3, 3, 1]) - >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), - ... strides=(1, 1), padding='same') - >>> avg_pool_2d(x) - - - Args: - pool_size: integer or tuple of 2 integers, - factors by which to downscale (vertical, horizontal). - `(2, 2)` will halve the input in both spatial dimension. - If only one integer is specified, the same window length - will be used for both dimensions. - strides: Integer, tuple of 2 integers, or None. - Strides values. - If None, it will default to `pool_size`. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - - Input shape: - - If `data_format='channels_last'`: - 4D tensor with shape `(batch_size, rows, cols, channels)`. - - If `data_format='channels_first'`: - 4D tensor with shape `(batch_size, channels, rows, cols)`. - - Output shape: - - If `data_format='channels_last'`: - 4D tensor with shape `(batch_size, pooled_rows, pooled_cols, channels)`. - - If `data_format='channels_first'`: - 4D tensor with shape `(batch_size, channels, pooled_rows, pooled_cols)`. - """ - - def __init__(self, - pool_size=(2, 2), - strides=None, - padding='valid', - data_format=None, - **kwargs): - super().__init__( - tf.nn.avg_pool, - pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, **kwargs) + """Average pooling operation for spatial data. + + Downsamples the input along its spatial dimensions (height and width) + by taking the average value over an input window + (of size defined by `pool_size`) for each channel of the input. + The window is shifted by `strides` along each dimension. + + The resulting output when using `"valid"` padding option has a shape + (number of rows or columns) of: + `output_shape = math.floor((input_shape - pool_size) / strides) + 1` + (when `input_shape >= pool_size`) + + The resulting output shape when using the `"same"` padding option is: + `output_shape = math.floor((input_shape - 1) / strides) + 1` + + For example, for `strides=(1, 1)` and `padding="valid"`: + + >>> x = tf.constant([[1., 2., 3.], + ... [4., 5., 6.], + ... [7., 8., 9.]]) + >>> x = tf.reshape(x, [1, 3, 3, 1]) + >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), + ... 
strides=(1, 1), padding='valid') + >>> avg_pool_2d(x) + + + For example, for `strides=(2, 2)` and `padding="valid"`: + + >>> x = tf.constant([[1., 2., 3., 4.], + ... [5., 6., 7., 8.], + ... [9., 10., 11., 12.]]) + >>> x = tf.reshape(x, [1, 3, 4, 1]) + >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), + ... strides=(2, 2), padding='valid') + >>> avg_pool_2d(x) + + + For example, for `strides=(1, 1)` and `padding="same"`: + + >>> x = tf.constant([[1., 2., 3.], + ... [4., 5., 6.], + ... [7., 8., 9.]]) + >>> x = tf.reshape(x, [1, 3, 3, 1]) + >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), + ... strides=(1, 1), padding='same') + >>> avg_pool_2d(x) + + + Args: + pool_size: integer or tuple of 2 integers, + factors by which to downscale (vertical, horizontal). + `(2, 2)` will halve the input in both spatial dimensions. + If only one integer is specified, the same window length + will be used for both dimensions. + strides: Integer, tuple of 2 integers, or None. + Strides values. + If None, it will default to `pool_size`. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, height, width)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + + Input shape: + - If `data_format='channels_last'`: + 4D tensor with shape `(batch_size, rows, cols, channels)`. + - If `data_format='channels_first'`: + 4D tensor with shape `(batch_size, channels, rows, cols)`. + + Output shape: + - If `data_format='channels_last'`: + 4D tensor with shape `(batch_size, pooled_rows, pooled_cols, channels)`. + - If `data_format='channels_first'`: + 4D tensor with shape `(batch_size, channels, pooled_rows, pooled_cols)`. + """ + + def __init__( + self, + pool_size=(2, 2), + strides=None, + padding="valid", + data_format=None, + **kwargs + ): + super().__init__( + tf.nn.avg_pool, + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + **kwargs + ) # Alias diff --git a/keras/layers/pooling/average_pooling3d.py b/keras/layers/pooling/average_pooling3d.py index 56b7d4a9d585..9d1177e6c68d 100644 --- a/keras/layers/pooling/average_pooling3d.py +++ b/keras/layers/pooling/average_pooling3d.py @@ -13,83 +13,92 @@ # limitations under the License. # ============================================================================== """Average pooling 3D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import -from keras.layers.pooling.base_pooling3d import Pooling3D + import tensorflow.compat.v2 as tf +from keras.layers.pooling.base_pooling3d import Pooling3D + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D') +@keras_export("keras.layers.AveragePooling3D", "keras.layers.AvgPool3D") class AveragePooling3D(Pooling3D): - """Average pooling operation for 3D data (spatial or spatio-temporal).
- - Downsamples the input along its spatial dimensions (depth, height, and width) - by taking the average value over an input window - (of size defined by `pool_size`) for each channel of the input. - The window is shifted by `strides` along each dimension. - - Args: - pool_size: tuple of 3 integers, - factors by which to downscale (dim1, dim2, dim3). - `(2, 2, 2)` will halve the size of the 3D input in each dimension. - strides: tuple of 3 integers, or None. Strides values. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - while `channels_first` corresponds to inputs with shape - `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - - Input shape: - - If `data_format='channels_last'`: - 5D tensor with shape: - `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - - If `data_format='channels_first'`: - 5D tensor with shape: - `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` - - Output shape: - - If `data_format='channels_last'`: - 5D tensor with shape: - `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)` - - If `data_format='channels_first'`: - 5D tensor with shape: - `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)` - - Example: - - ```python - depth = 30 - height = 30 - width = 30 - input_channels = 3 - - inputs = tf.keras.Input(shape=(depth, height, width, input_channels)) - layer = tf.keras.layers.AveragePooling3D(pool_size=3) - outputs = layer(inputs) # Shape: (batch_size, 10, 10, 10, 3) - ``` - """ - - def __init__(self, - pool_size=(2, 2, 2), - strides=None, - padding='valid', - data_format=None, - **kwargs): - super().__init__( - tf.nn.avg_pool3d, - pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, **kwargs) + """Average pooling operation for 3D data (spatial or spatio-temporal). + + Downsamples the input along its spatial dimensions (depth, height, and + width) by taking the average value over an input window + (of size defined by `pool_size`) for each channel of the input. + The window is shifted by `strides` along each dimension. + + Args: + pool_size: tuple of 3 integers, + factors by which to downscale (dim1, dim2, dim3). + `(2, 2, 2)` will halve the size of the 3D input in each dimension. + strides: tuple of 3 integers, or None. Strides values. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` + while `channels_first` corresponds to inputs with shape + `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. 
+ When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + + Input shape: + - If `data_format='channels_last'`: + 5D tensor with shape: + `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` + - If `data_format='channels_first'`: + 5D tensor with shape: + `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` + + Output shape: + - If `data_format='channels_last'`: + 5D tensor with shape: + `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)` + - If `data_format='channels_first'`: + 5D tensor with shape: + `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)` + + Example: + + ```python + depth = 30 + height = 30 + width = 30 + input_channels = 3 + + inputs = tf.keras.Input(shape=(depth, height, width, input_channels)) + layer = tf.keras.layers.AveragePooling3D(pool_size=3) + outputs = layer(inputs) # Shape: (batch_size, 10, 10, 10, 3) + ``` + """ + + def __init__( + self, + pool_size=(2, 2, 2), + strides=None, + padding="valid", + data_format=None, + **kwargs + ): + super().__init__( + tf.nn.avg_pool3d, + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + **kwargs + ) # Alias diff --git a/keras/layers/pooling/average_pooling_test.py b/keras/layers/pooling/average_pooling_test.py index 21a7fba93cd6..cd7f5ffed9ad 100644 --- a/keras/layers/pooling/average_pooling_test.py +++ b/keras/layers/pooling/average_pooling_test.py @@ -14,85 +14,79 @@ # ============================================================================== """Tests for average pooling layers.""" +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class AveragePoolingTest(tf.test.TestCase, parameterized.TestCase): + def test_average_pooling_1d(self): + for padding in ["valid", "same"]: + for stride in [1, 2]: + test_utils.layer_test( + keras.layers.AveragePooling1D, + kwargs={"strides": stride, "padding": padding}, + input_shape=(3, 5, 4), + ) - def test_average_pooling_1d(self): - for padding in ['valid', 'same']: - for stride in [1, 2]: test_utils.layer_test( keras.layers.AveragePooling1D, - kwargs={ - 'strides': stride, - 'padding': padding - }, - input_shape=(3, 5, 4)) + kwargs={"data_format": "channels_first"}, + input_shape=(3, 2, 6), + ) - test_utils.layer_test( - keras.layers.AveragePooling1D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 2, 6)) + def test_average_pooling_2d(self): + test_utils.layer_test( + keras.layers.AveragePooling2D, + kwargs={"strides": (2, 2), "padding": "same", "pool_size": (2, 2)}, + input_shape=(3, 5, 6, 4), + ) + test_utils.layer_test( + keras.layers.AveragePooling2D, + kwargs={"strides": (2, 2), "padding": "valid", "pool_size": (3, 3)}, + input_shape=(3, 5, 6, 4), + ) - def test_average_pooling_2d(self): - test_utils.layer_test( - keras.layers.AveragePooling2D, - kwargs={ - 'strides': (2, 2), - 'padding': 'same', - 'pool_size': (2, 2) - }, - input_shape=(3, 5, 6, 4)) - test_utils.layer_test( - keras.layers.AveragePooling2D, - kwargs={ - 'strides': (2, 2), - 'padding': 'valid', - 'pool_size': (3, 3) - }, - input_shape=(3, 5, 6, 4)) + # This part of the test 
can only run on GPU but doesn't appear + # to be properly assigned to a GPU when running in eager mode. + if not tf.executing_eagerly(): + # Only runs on GPU with CUDA, channels_first is not supported on + # CPU. + # TODO(b/62340061): Support channels_first on CPU. + if tf.test.is_gpu_available(cuda_only=True): + test_utils.layer_test( + keras.layers.AveragePooling2D, + kwargs={ + "strides": (1, 1), + "padding": "valid", + "pool_size": (2, 2), + "data_format": "channels_first", + }, + input_shape=(3, 4, 5, 6), + ) - # This part of the test can only run on GPU but doesn't appear - # to be properly assigned to a GPU when running in eager mode. - if not tf.executing_eagerly(): - # Only runs on GPU with CUDA, channels_first is not supported on CPU. - # TODO(b/62340061): Support channels_first on CPU. - if tf.test.is_gpu_available(cuda_only=True): + def test_average_pooling_3d(self): + pool_size = (3, 3, 3) test_utils.layer_test( - keras.layers.AveragePooling2D, + keras.layers.AveragePooling3D, + kwargs={"strides": 2, "padding": "valid", "pool_size": pool_size}, + input_shape=(3, 11, 12, 10, 4), + ) + test_utils.layer_test( + keras.layers.AveragePooling3D, kwargs={ - 'strides': (1, 1), - 'padding': 'valid', - 'pool_size': (2, 2), - 'data_format': 'channels_first' + "strides": 3, + "padding": "valid", + "data_format": "channels_first", + "pool_size": pool_size, }, - input_shape=(3, 4, 5, 6)) + input_shape=(3, 4, 11, 12, 10), + ) - def test_average_pooling_3d(self): - pool_size = (3, 3, 3) - test_utils.layer_test( - keras.layers.AveragePooling3D, - kwargs={ - 'strides': 2, - 'padding': 'valid', - 'pool_size': pool_size - }, - input_shape=(3, 11, 12, 10, 4)) - test_utils.layer_test( - keras.layers.AveragePooling3D, - kwargs={ - 'strides': 3, - 'padding': 'valid', - 'data_format': 'channels_first', - 'pool_size': pool_size - }, - input_shape=(3, 4, 11, 12, 10)) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/pooling/base_global_pooling1d.py b/keras/layers/pooling/base_global_pooling1d.py index 073f3d8cb3ee..fbf2465109be 100644 --- a/keras/layers/pooling/base_global_pooling1d.py +++ b/keras/layers/pooling/base_global_pooling1d.py @@ -13,41 +13,56 @@ # limitations under the License. 
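The tests above pin down the shape behavior through `test_utils.layer_test`; the docstring formulas from earlier in this diff ("valid" yields `(input_shape - pool_size + 1) / strides` steps rounded up, "same" yields `input_shape / strides` rounded up) can also be checked by hand outside the harness. A minimal sketch, assuming TensorFlow 2.x with eager execution:

```python
import tensorflow as tf

x = tf.reshape(tf.constant([1.0, 2.0, 3.0, 4.0, 5.0]), [1, 5, 1])  # (batch, steps, features)

valid = tf.keras.layers.AveragePooling1D(pool_size=2, strides=1, padding="valid")
print(valid(x).shape)                # (1, 4, 1): (5 - 2 + 1) / 1 = 4 steps
print(tf.squeeze(valid(x)).numpy())  # [1.5 2.5 3.5 4.5]

same = tf.keras.layers.AveragePooling1D(pool_size=2, strides=1, padding="same")
print(same(x).shape)                 # (1, 5, 1): 5 / 1 = 5 steps
print(tf.squeeze(same(x)).numpy())   # [1.5 2.5 3.5 4.5 5. ]  (padded cells are excluded from the mean)
```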
# ============================================================================== """Private base class for global pooling 1D layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class GlobalPooling1D(Layer): - """Abstract class for different global pooling 1D layers.""" - - def __init__(self, data_format='channels_last', keepdims=False, **kwargs): - super().__init__(**kwargs) - self.input_spec = InputSpec(ndim=3) - self.data_format = conv_utils.normalize_data_format(data_format) - self.keepdims = keepdims - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - if self.keepdims: - return tf.TensorShape([input_shape[0], input_shape[1], 1]) - else: - return tf.TensorShape([input_shape[0], input_shape[1]]) - else: - if self.keepdims: - return tf.TensorShape([input_shape[0], 1, input_shape[2]]) - else: - return tf.TensorShape([input_shape[0], input_shape[2]]) - - def call(self, inputs): - raise NotImplementedError - - def get_config(self): - config = {'data_format': self.data_format, 'keepdims': self.keepdims} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Abstract class for different global pooling 1D layers.""" + + def __init__(self, data_format="channels_last", keepdims=False, **kwargs): + super().__init__(**kwargs) + self.input_spec = InputSpec(ndim=3) + self.data_format = conv_utils.normalize_data_format(data_format) + self.keepdims = keepdims + + def _validate_reduction_axis(self, input_shape, axes): + for axis in axes: + if input_shape[axis] == 0: + raise ValueError( + f"Incorrect input shape {input_shape} " + f"with dimension 0 at reduction axis {axis}." + ) + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + self._validate_reduction_axis(input_shape, [1]) + else: + self._validate_reduction_axis(input_shape, [2]) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + if self.keepdims: + return tf.TensorShape([input_shape[0], input_shape[1], 1]) + else: + return tf.TensorShape([input_shape[0], input_shape[1]]) + else: + if self.keepdims: + return tf.TensorShape([input_shape[0], 1, input_shape[2]]) + else: + return tf.TensorShape([input_shape[0], input_shape[2]]) + + def call(self, inputs): + raise NotImplementedError + def get_config(self): + config = {"data_format": self.data_format, "keepdims": self.keepdims} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/pooling/base_global_pooling2d.py b/keras/layers/pooling/base_global_pooling2d.py index f1c22279cf6b..7fe7a28e890c 100644 --- a/keras/layers/pooling/base_global_pooling2d.py +++ b/keras/layers/pooling/base_global_pooling2d.py @@ -13,40 +13,56 @@ # limitations under the License. 
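The `build` override and `_validate_reduction_axis` helper are the substantive change in this file: a zero-length reduction axis now fails fast with a clear error instead of computing a mean over zero elements. A hedged illustration, assuming eager execution and this change applied (the error text is copied from the helper above):

```python
import tensorflow as tf

layer = tf.keras.layers.GlobalAveragePooling1D()  # channels_last reduces axis 1 (steps)
try:
    layer(tf.zeros([2, 0, 8]))  # zero-length steps axis
except ValueError as e:
    print(e)  # Incorrect input shape [2, 0, 8] with dimension 0 at reduction axis 1.
```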
# ============================================================================== """Private base class for global pooling 2D layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class GlobalPooling2D(Layer): - """Abstract class for different global pooling 2D layers.""" - - def __init__(self, data_format=None, keepdims=False, **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - self.input_spec = InputSpec(ndim=4) - self.keepdims = keepdims - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_last': - if self.keepdims: - return tf.TensorShape([input_shape[0], 1, 1, input_shape[3]]) - else: - return tf.TensorShape([input_shape[0], input_shape[3]]) - else: - if self.keepdims: - return tf.TensorShape([input_shape[0], input_shape[1], 1, 1]) - else: - return tf.TensorShape([input_shape[0], input_shape[1]]) - - def call(self, inputs): - raise NotImplementedError - - def get_config(self): - config = {'data_format': self.data_format, 'keepdims': self.keepdims} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Abstract class for different global pooling 2D layers.""" + + def __init__(self, data_format=None, keepdims=False, **kwargs): + super().__init__(**kwargs) + self.data_format = conv_utils.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=4) + self.keepdims = keepdims + + def _validate_reduction_axis(self, input_shape, axes): + for axis in axes: + if input_shape[axis] == 0: + raise ValueError( + f"Incorrect input shape {input_shape} " + f"with dimension 0 at reduction axis {axis}." + ) + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + self._validate_reduction_axis(input_shape, [1, 2]) + else: + self._validate_reduction_axis(input_shape, [2, 3]) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + if self.keepdims: + return tf.TensorShape([input_shape[0], 1, 1, input_shape[3]]) + else: + return tf.TensorShape([input_shape[0], input_shape[3]]) + else: + if self.keepdims: + return tf.TensorShape([input_shape[0], input_shape[1], 1, 1]) + else: + return tf.TensorShape([input_shape[0], input_shape[1]]) + + def call(self, inputs): + raise NotImplementedError + + def get_config(self): + config = {"data_format": self.data_format, "keepdims": self.keepdims} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/pooling/base_global_pooling3d.py b/keras/layers/pooling/base_global_pooling3d.py index 40ccf92bf849..749475ac857b 100644 --- a/keras/layers/pooling/base_global_pooling3d.py +++ b/keras/layers/pooling/base_global_pooling3d.py @@ -13,42 +13,56 @@ # limitations under the License. 
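The 2D base class gains the same zero-axis check; the rest is reformatting. The `keepdims` branching in `compute_output_shape` above is easiest to see from a public subclass. A short sketch, assuming `GlobalAveragePooling2D`:

```python
import tensorflow as tf

x = tf.random.normal([2, 4, 5, 3])  # (batch, rows, cols, channels)

print(tf.keras.layers.GlobalAveragePooling2D()(x).shape)               # (2, 3)
print(tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(x).shape)  # (2, 1, 1, 3)

# Both options round-trip through get_config(), as serialized above.
layer = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)
print(layer.get_config()["keepdims"])  # True
```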
# ============================================================================== """Private base class for global pooling 3D layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class GlobalPooling3D(Layer): - """Abstract class for different global pooling 3D layers.""" - - def __init__(self, data_format=None, keepdims=False, **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - self.input_spec = InputSpec(ndim=5) - self.keepdims = keepdims - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_last': - if self.keepdims: - return tf.TensorShape( - [input_shape[0], 1, 1, 1, input_shape[4]]) - else: - return tf.TensorShape([input_shape[0], input_shape[4]]) - else: - if self.keepdims: - return tf.TensorShape( - [input_shape[0], input_shape[1], 1, 1, 1]) - else: - return tf.TensorShape([input_shape[0], input_shape[1]]) - - def call(self, inputs): - raise NotImplementedError - - def get_config(self): - config = {'data_format': self.data_format, 'keepdims': self.keepdims} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Abstract class for different global pooling 3D layers.""" + + def __init__(self, data_format=None, keepdims=False, **kwargs): + super().__init__(**kwargs) + self.data_format = conv_utils.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=5) + self.keepdims = keepdims + + def _validate_reduction_axis(self, input_shape, axes): + for axis in axes: + if input_shape[axis] == 0: + raise ValueError( + f"Incorrect input shape {input_shape} " + f"with dimension 0 at reduction axis {axis}." + ) + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + self._validate_reduction_axis(input_shape, [1, 2, 3]) + else: + self._validate_reduction_axis(input_shape, [2, 3, 4]) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + if self.keepdims: + return tf.TensorShape([input_shape[0], 1, 1, 1, input_shape[4]]) + else: + return tf.TensorShape([input_shape[0], input_shape[4]]) + else: + if self.keepdims: + return tf.TensorShape([input_shape[0], input_shape[1], 1, 1, 1]) + else: + return tf.TensorShape([input_shape[0], input_shape[1]]) + + def call(self, inputs): + raise NotImplementedError + + def get_config(self): + config = {"data_format": self.data_format, "keepdims": self.keepdims} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/pooling/base_pooling1d.py b/keras/layers/pooling/base_pooling1d.py index 2176b9d3ed17..397196d51e55 100644 --- a/keras/layers/pooling/base_pooling1d.py +++ b/keras/layers/pooling/base_pooling1d.py @@ -13,88 +13,97 @@ # limitations under the License. 
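The 3D base class follows the same pattern. Since global pooling is just a mean reduction over the axes selected by `data_format`, the `channels_first` path is equivalent to transposing a `channels_last` input (and, unlike the strided pooling ops, it runs fine on CPU). A sketch of that equivalence, up to float summation order:

```python
import tensorflow as tf

x_last = tf.random.normal([2, 3, 4, 5, 6])       # (batch, d1, d2, d3, channels)
x_first = tf.transpose(x_last, [0, 4, 1, 2, 3])  # (batch, channels, d1, d2, d3)

y_last = tf.keras.layers.GlobalAveragePooling3D(data_format="channels_last")(x_last)
y_first = tf.keras.layers.GlobalAveragePooling3D(data_format="channels_first")(x_first)

print(tf.reduce_max(tf.abs(y_last - y_first)).numpy())  # ~0.0: both are (batch, channels)
```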
# ============================================================================== """Private base class for pooling 1D layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class Pooling1D(Layer): - """Pooling layer for arbitrary pooling functions, for 1D inputs. + """Pooling layer for arbitrary pooling functions, for 1D inputs. - This class only exists for code reuse. It will never be an exposed API. + This class only exists for code reuse. It will never be an exposed API. - Args: - pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`. - pool_size: An integer or tuple/list of a single integer, - representing the size of the pooling window. - strides: An integer or tuple/list of a single integer, specifying the - strides of the pooling operation. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, steps, features)` while `channels_first` - corresponds to inputs with shape - `(batch, features, steps)`. - name: A string, the name of the layer. - """ + Args: + pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`. + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, steps, features)` while `channels_first` + corresponds to inputs with shape + `(batch, features, steps)`. + name: A string, the name of the layer. 
+ """ - def __init__(self, pool_function, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - super().__init__(name=name, **kwargs) - if data_format is None: - data_format = backend.image_data_format() - if strides is None: - strides = pool_size - self.pool_function = pool_function - self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size') - self.strides = conv_utils.normalize_tuple( - strides, 1, 'strides', allow_zero=True) - self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) - self.input_spec = InputSpec(ndim=3) + def __init__( + self, + pool_function, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + super().__init__(name=name, **kwargs) + if data_format is None: + data_format = backend.image_data_format() + if strides is None: + strides = pool_size + self.pool_function = pool_function + self.pool_size = conv_utils.normalize_tuple(pool_size, 1, "pool_size") + self.strides = conv_utils.normalize_tuple( + strides, 1, "strides", allow_zero=True + ) + self.padding = conv_utils.normalize_padding(padding) + self.data_format = conv_utils.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=3) - def call(self, inputs): - pad_axis = 2 if self.data_format == 'channels_last' else 3 - inputs = tf.expand_dims(inputs, pad_axis) - outputs = self.pool_function( - inputs, - self.pool_size + (1,), - strides=self.strides + (1,), - padding=self.padding, - data_format=self.data_format) - return tf.squeeze(outputs, pad_axis) + def call(self, inputs): + pad_axis = 2 if self.data_format == "channels_last" else 3 + inputs = tf.expand_dims(inputs, pad_axis) + outputs = self.pool_function( + inputs, + self.pool_size + (1,), + strides=self.strides + (1,), + padding=self.padding, + data_format=self.data_format, + ) + return tf.squeeze(outputs, pad_axis) - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - steps = input_shape[2] - features = input_shape[1] - else: - steps = input_shape[1] - features = input_shape[2] - length = conv_utils.conv_output_length(steps, - self.pool_size[0], - self.padding, - self.strides[0]) - if self.data_format == 'channels_first': - return tf.TensorShape([input_shape[0], features, length]) - else: - return tf.TensorShape([input_shape[0], length, features]) + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + steps = input_shape[2] + features = input_shape[1] + else: + steps = input_shape[1] + features = input_shape[2] + length = conv_utils.conv_output_length( + steps, self.pool_size[0], self.padding, self.strides[0] + ) + if self.data_format == "channels_first": + return tf.TensorShape([input_shape[0], features, length]) + else: + return tf.TensorShape([input_shape[0], length, features]) - def get_config(self): - config = { - 'strides': self.strides, - 'pool_size': self.pool_size, - 'padding': self.padding, - 'data_format': self.data_format, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = { + "strides": self.strides, + "pool_size": self.pool_size, + "padding": self.padding, + "data_format": self.data_format, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff 
--git a/keras/layers/pooling/base_pooling2d.py b/keras/layers/pooling/base_pooling2d.py index e783d4220d05..3aaa080700bd 100644 --- a/keras/layers/pooling/base_pooling2d.py +++ b/keras/layers/pooling/base_pooling2d.py @@ -13,96 +13,108 @@ # limitations under the License. # ============================================================================== """Private base class for pooling 2D layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class Pooling2D(Layer): - """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images). + """Pooling layer for arbitrary pooling functions, for 2D data (e.g. images). - This class only exists for code reuse. It will never be an exposed API. + This class only exists for code reuse. It will never be an exposed API. - Args: - pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`. - pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - name: A string, the name of the layer. - """ + Args: + pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`. + pool_size: An integer or tuple/list of 2 integers: + (pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or + `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + name: A string, the name of the layer. 
+ """ - def __init__(self, pool_function, pool_size, strides, - padding='valid', data_format=None, - name=None, **kwargs): - super().__init__(name=name, **kwargs) - if data_format is None: - data_format = backend.image_data_format() - if strides is None: - strides = pool_size - self.pool_function = pool_function - self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size') - self.strides = conv_utils.normalize_tuple( - strides, 2, 'strides', allow_zero=True) - self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) - self.input_spec = InputSpec(ndim=4) + def __init__( + self, + pool_function, + pool_size, + strides, + padding="valid", + data_format=None, + name=None, + **kwargs + ): + super().__init__(name=name, **kwargs) + if data_format is None: + data_format = backend.image_data_format() + if strides is None: + strides = pool_size + self.pool_function = pool_function + self.pool_size = conv_utils.normalize_tuple(pool_size, 2, "pool_size") + self.strides = conv_utils.normalize_tuple( + strides, 2, "strides", allow_zero=True + ) + self.padding = conv_utils.normalize_padding(padding) + self.data_format = conv_utils.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=4) - def call(self, inputs): - if self.data_format == 'channels_last': - pool_shape = (1,) + self.pool_size + (1,) - strides = (1,) + self.strides + (1,) - else: - pool_shape = (1, 1) + self.pool_size - strides = (1, 1) + self.strides - outputs = self.pool_function( - inputs, - ksize=pool_shape, - strides=strides, - padding=self.padding.upper(), - data_format=conv_utils.convert_data_format(self.data_format, 4)) - return outputs + def call(self, inputs): + if self.data_format == "channels_last": + pool_shape = (1,) + self.pool_size + (1,) + strides = (1,) + self.strides + (1,) + else: + pool_shape = (1, 1) + self.pool_size + strides = (1, 1) + self.strides + outputs = self.pool_function( + inputs, + ksize=pool_shape, + strides=strides, + padding=self.padding.upper(), + data_format=conv_utils.convert_data_format(self.data_format, 4), + ) + return outputs - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - rows = input_shape[2] - cols = input_shape[3] - else: - rows = input_shape[1] - cols = input_shape[2] - rows = conv_utils.conv_output_length(rows, self.pool_size[0], self.padding, - self.strides[0]) - cols = conv_utils.conv_output_length(cols, self.pool_size[1], self.padding, - self.strides[1]) - if self.data_format == 'channels_first': - return tf.TensorShape( - [input_shape[0], input_shape[1], rows, cols]) - else: - return tf.TensorShape( - [input_shape[0], rows, cols, input_shape[3]]) + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + rows = input_shape[2] + cols = input_shape[3] + else: + rows = input_shape[1] + cols = input_shape[2] + rows = conv_utils.conv_output_length( + rows, self.pool_size[0], self.padding, self.strides[0] + ) + cols = conv_utils.conv_output_length( + cols, self.pool_size[1], self.padding, self.strides[1] + ) + if self.data_format == "channels_first": + return tf.TensorShape([input_shape[0], input_shape[1], rows, cols]) + else: + return tf.TensorShape([input_shape[0], rows, cols, input_shape[3]]) - def get_config(self): - config = { - 'pool_size': self.pool_size, - 'padding': self.padding, - 'strides': self.strides, - 
'data_format': self.data_format - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = { + "pool_size": self.pool_size, + "padding": self.padding, + "strides": self.strides, + "data_format": self.data_format, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/pooling/base_pooling3d.py b/keras/layers/pooling/base_pooling3d.py index ad75cc32f002..bc4d5b7bde1c 100644 --- a/keras/layers/pooling/base_pooling3d.py +++ b/keras/layers/pooling/base_pooling3d.py @@ -13,107 +13,123 @@ # limitations under the License. # ============================================================================== """Private base class for pooling 3D layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class Pooling3D(Layer): - """Pooling layer for arbitrary pooling functions, for 3D inputs. + """Pooling layer for arbitrary pooling functions, for 3D inputs. - This class only exists for code reuse. It will never be an exposed API. + This class only exists for code reuse. It will never be an exposed API. - Args: - pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`. - pool_size: An integer or tuple/list of 3 integers: - (pool_depth, pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 3 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` - while `channels_first` corresponds to - inputs with shape `(batch, channels, depth, height, width)`. - name: A string, the name of the layer. - """ + Args: + pool_function: The pooling function to apply, e.g. `tf.nn.max_pool2d`. + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or + `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` + while `channels_first` corresponds to + inputs with shape `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. 
+ """ - def __init__(self, pool_function, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - super().__init__(name=name, **kwargs) - if data_format is None: - data_format = backend.image_data_format() - if strides is None: - strides = pool_size - self.pool_function = pool_function - self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size') - self.strides = conv_utils.normalize_tuple( - strides, 3, 'strides', allow_zero=True) - self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) - self.input_spec = InputSpec(ndim=5) + def __init__( + self, + pool_function, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + super().__init__(name=name, **kwargs) + if data_format is None: + data_format = backend.image_data_format() + if strides is None: + strides = pool_size + self.pool_function = pool_function + self.pool_size = conv_utils.normalize_tuple(pool_size, 3, "pool_size") + self.strides = conv_utils.normalize_tuple( + strides, 3, "strides", allow_zero=True + ) + self.padding = conv_utils.normalize_padding(padding) + self.data_format = conv_utils.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=5) - def call(self, inputs): - pool_shape = (1,) + self.pool_size + (1,) - strides = (1,) + self.strides + (1,) + def call(self, inputs): + pool_shape = (1,) + self.pool_size + (1,) + strides = (1,) + self.strides + (1,) - if self.data_format == 'channels_first': - # TF does not support `channels_first` with 3D pooling operations, - # so we must handle this case manually. - # TODO(fchollet): remove this when TF pooling is feature-complete. - inputs = tf.transpose(inputs, (0, 2, 3, 4, 1)) + if self.data_format == "channels_first": + # TF does not support `channels_first` with 3D pooling operations, + # so we must handle this case manually. + # TODO(fchollet): remove this when TF pooling is feature-complete. 
+ inputs = tf.transpose(inputs, (0, 2, 3, 4, 1)) - outputs = self.pool_function( - inputs, - ksize=pool_shape, - strides=strides, - padding=self.padding.upper()) + outputs = self.pool_function( + inputs, + ksize=pool_shape, + strides=strides, + padding=self.padding.upper(), + ) - if self.data_format == 'channels_first': - outputs = tf.transpose(outputs, (0, 4, 1, 2, 3)) - return outputs + if self.data_format == "channels_first": + outputs = tf.transpose(outputs, (0, 4, 1, 2, 3)) + return outputs - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - len_dim1 = input_shape[2] - len_dim2 = input_shape[3] - len_dim3 = input_shape[4] - else: - len_dim1 = input_shape[1] - len_dim2 = input_shape[2] - len_dim3 = input_shape[3] - len_dim1 = conv_utils.conv_output_length(len_dim1, self.pool_size[0], - self.padding, self.strides[0]) - len_dim2 = conv_utils.conv_output_length(len_dim2, self.pool_size[1], - self.padding, self.strides[1]) - len_dim3 = conv_utils.conv_output_length(len_dim3, self.pool_size[2], - self.padding, self.strides[2]) - if self.data_format == 'channels_first': - return tf.TensorShape( - [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3]) - else: - return tf.TensorShape( - [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]]) + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + len_dim1 = input_shape[2] + len_dim2 = input_shape[3] + len_dim3 = input_shape[4] + else: + len_dim1 = input_shape[1] + len_dim2 = input_shape[2] + len_dim3 = input_shape[3] + len_dim1 = conv_utils.conv_output_length( + len_dim1, self.pool_size[0], self.padding, self.strides[0] + ) + len_dim2 = conv_utils.conv_output_length( + len_dim2, self.pool_size[1], self.padding, self.strides[1] + ) + len_dim3 = conv_utils.conv_output_length( + len_dim3, self.pool_size[2], self.padding, self.strides[2] + ) + if self.data_format == "channels_first": + return tf.TensorShape( + [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3] + ) + else: + return tf.TensorShape( + [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]] + ) - def get_config(self): - config = { - 'pool_size': self.pool_size, - 'padding': self.padding, - 'strides': self.strides, - 'data_format': self.data_format - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = { + "pool_size": self.pool_size, + "padding": self.padding, + "strides": self.strides, + "data_format": self.data_format, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/pooling/global_average_pooling1d.py b/keras/layers/pooling/global_average_pooling1d.py index 4ec277e591df..0a81e9f98b1d 100644 --- a/keras/layers/pooling/global_average_pooling1d.py +++ b/keras/layers/pooling/global_average_pooling1d.py @@ -13,87 +13,89 @@ # limitations under the License. 
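Two details of `Pooling3D` above are worth seeing end to end: `call` works around the NDHWC-only 3D pooling kernels by transposing `channels_first` inputs and transposing back (which is also why `channels_first` 3D pooling works on CPU), and `compute_output_shape` applies `conv_utils.conv_output_length`, i.e. `floor((L - pool) / stride) + 1` for `"valid"` padding, per spatial axis. A hedged sketch using the public `AveragePooling3D`:

```python
import tensorflow as tf

x = tf.random.normal([3, 4, 11, 12, 10])  # (batch, channels, d1, d2, d3)
layer = tf.keras.layers.AveragePooling3D(
    pool_size=3, strides=2, padding="valid", data_format="channels_first"
)

# floor((11 - 3) / 2) + 1 = 5, floor((12 - 3) / 2) + 1 = 5, floor((10 - 3) / 2) + 1 = 4
print(layer(x).shape)                                   # (3, 4, 5, 5, 4)
print(layer.compute_output_shape([3, 4, 11, 12, 10]))   # matches the eager result
```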
# ============================================================================== """Global average pooling 1D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.layers.pooling.base_global_pooling1d import GlobalPooling1D -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.GlobalAveragePooling1D', - 'keras.layers.GlobalAvgPool1D') +@keras_export( + "keras.layers.GlobalAveragePooling1D", "keras.layers.GlobalAvgPool1D" +) class GlobalAveragePooling1D(GlobalPooling1D): - """Global average pooling operation for temporal data. - - Examples: - - >>> input_shape = (2, 3, 4) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.GlobalAveragePooling1D()(x) - >>> print(y.shape) - (2, 4) - - Args: - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, steps, features)` while `channels_first` - corresponds to inputs with shape - `(batch, features, steps)`. - keepdims: A boolean, whether to keep the temporal dimension or not. - If `keepdims` is `False` (default), the rank of the tensor is reduced - for spatial dimensions. - If `keepdims` is `True`, the temporal dimension are retained with - length 1. - The behavior is the same as for `tf.reduce_mean` or `np.mean`. - - Call arguments: - inputs: A 3D tensor. - mask: Binary tensor of shape `(batch_size, steps)` indicating whether - a given step should be masked (excluded from the average). - - Input shape: - - If `data_format='channels_last'`: - 3D tensor with shape: - `(batch_size, steps, features)` - - If `data_format='channels_first'`: - 3D tensor with shape: - `(batch_size, features, steps)` - - Output shape: - - If `keepdims`=False: - 2D tensor with shape `(batch_size, features)`. - - If `keepdims`=True: + """Global average pooling operation for temporal data. + + Examples: + + >>> input_shape = (2, 3, 4) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.GlobalAveragePooling1D()(x) + >>> print(y.shape) + (2, 4) + + Args: + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, steps, features)` while `channels_first` + corresponds to inputs with shape + `(batch, features, steps)`. + keepdims: A boolean, whether to keep the temporal dimension or not. + If `keepdims` is `False` (default), the rank of the tensor is reduced + for spatial dimensions. + If `keepdims` is `True`, the temporal dimension is retained with + length 1. + The behavior is the same as for `tf.reduce_mean` or `np.mean`. + + Call arguments: + inputs: A 3D tensor. + mask: Binary tensor of shape `(batch_size, steps)` indicating whether + a given step should be masked (excluded from the average).
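The `mask` call argument documented above is why this layer overrides `call` rather than delegating to a plain mean: masked steps are zeroed out and the sum is divided by the count of surviving steps, as the implementation just below shows. A small sketch of that behavior:

```python
import tensorflow as tf

x = tf.constant([[[1.0], [2.0], [300.0]]])  # (batch=1, steps=3, features=1)
mask = tf.constant([[True, True, False]])   # drop the last step

layer = tf.keras.layers.GlobalAveragePooling1D()
print(layer(x, mask=mask).numpy())  # [[1.5]]  -- mean of the unmasked steps only
print(layer(x).numpy())             # [[101.]] -- plain mean over all three steps
```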
+ + Input shape: - If `data_format='channels_last'`: - 3D tensor with shape `(batch_size, 1, features)` + 3D tensor with shape: + `(batch_size, steps, features)` - If `data_format='channels_first'`: - 3D tensor with shape `(batch_size, features, 1)` - """ - - def __init__(self, data_format='channels_last', **kwargs): - super().__init__(data_format=data_format, - **kwargs) - self.supports_masking = True - - def call(self, inputs, mask=None): - steps_axis = 1 if self.data_format == 'channels_last' else 2 - if mask is not None: - mask = tf.cast(mask, inputs[0].dtype) - mask = tf.expand_dims( - mask, 2 if self.data_format == 'channels_last' else 1) - inputs *= mask - return backend.sum( - inputs, axis=steps_axis, - keepdims=self.keepdims) / tf.reduce_sum( - mask, axis=steps_axis, keepdims=self.keepdims) - else: - return backend.mean(inputs, axis=steps_axis, keepdims=self.keepdims) - - def compute_mask(self, inputs, mask=None): - return None + 3D tensor with shape: + `(batch_size, features, steps)` + + Output shape: + - If `keepdims`=False: + 2D tensor with shape `(batch_size, features)`. + - If `keepdims`=True: + - If `data_format='channels_last'`: + 3D tensor with shape `(batch_size, 1, features)` + - If `data_format='channels_first'`: + 3D tensor with shape `(batch_size, features, 1)` + """ + + def __init__(self, data_format="channels_last", **kwargs): + super().__init__(data_format=data_format, **kwargs) + self.supports_masking = True + + def call(self, inputs, mask=None): + steps_axis = 1 if self.data_format == "channels_last" else 2 + if mask is not None: + mask = tf.cast(mask, inputs[0].dtype) + mask = tf.expand_dims( + mask, 2 if self.data_format == "channels_last" else 1 + ) + inputs *= mask + return backend.sum( + inputs, axis=steps_axis, keepdims=self.keepdims + ) / tf.reduce_sum(mask, axis=steps_axis, keepdims=self.keepdims) + else: + return backend.mean(inputs, axis=steps_axis, keepdims=self.keepdims) + + def compute_mask(self, inputs, mask=None): + return None # Alias diff --git a/keras/layers/pooling/global_average_pooling2d.py b/keras/layers/pooling/global_average_pooling2d.py index 54dab87a6680..e219e2414081 100644 --- a/keras/layers/pooling/global_average_pooling2d.py +++ b/keras/layers/pooling/global_average_pooling2d.py @@ -13,66 +13,68 @@ # limitations under the License. # ============================================================================== """Global average pooling 2D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.layers.pooling.base_global_pooling2d import GlobalPooling2D +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.GlobalAveragePooling2D', - 'keras.layers.GlobalAvgPool2D') +@keras_export( + "keras.layers.GlobalAveragePooling2D", "keras.layers.GlobalAvgPool2D" +) class GlobalAveragePooling2D(GlobalPooling2D): - """Global average pooling operation for spatial data. - - Examples: + """Global average pooling operation for spatial data. - >>> input_shape = (2, 4, 5, 3) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.GlobalAveragePooling2D()(x) - >>> print(y.shape) - (2, 3) + Examples: - Args: - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, height, width)`. 
- It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - keepdims: A boolean, whether to keep the spatial dimensions or not. - If `keepdims` is `False` (default), the rank of the tensor is reduced - for spatial dimensions. - If `keepdims` is `True`, the spatial dimensions are retained with - length 1. - The behavior is the same as for `tf.reduce_mean` or `np.mean`. + >>> input_shape = (2, 4, 5, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.GlobalAveragePooling2D()(x) + >>> print(y.shape) + (2, 3) - Input shape: - - If `data_format='channels_last'`: - 4D tensor with shape `(batch_size, rows, cols, channels)`. - - If `data_format='channels_first'`: - 4D tensor with shape `(batch_size, channels, rows, cols)`. + Args: + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, height, width)`. + When unspecified, uses `image_data_format` value found + in your Keras config file at `~/.keras/keras.json` + (if exists) else 'channels_last'. Defaults to 'channels_last'. + keepdims: A boolean, whether to keep the spatial dimensions or not. + If `keepdims` is `False` (default), the rank of the tensor is reduced + for spatial dimensions. + If `keepdims` is `True`, the spatial dimensions are retained with + length 1. + The behavior is the same as for `tf.reduce_mean` or `np.mean`. - Output shape: - - If `keepdims`=False: - 2D tensor with shape `(batch_size, channels)`. - - If `keepdims`=True: + Input shape: - If `data_format='channels_last'`: - 4D tensor with shape `(batch_size, 1, 1, channels)` + 4D tensor with shape `(batch_size, rows, cols, channels)`. - If `data_format='channels_first'`: - 4D tensor with shape `(batch_size, channels, 1, 1)` - """ + 4D tensor with shape `(batch_size, channels, rows, cols)`. + + Output shape: + - If `keepdims`=False: + 2D tensor with shape `(batch_size, channels)`. + - If `keepdims`=True: + - If `data_format='channels_last'`: + 4D tensor with shape `(batch_size, 1, 1, channels)` + - If `data_format='channels_first'`: + 4D tensor with shape `(batch_size, channels, 1, 1)` + """ - def call(self, inputs): - if self.data_format == 'channels_last': - return backend.mean(inputs, axis=[1, 2], keepdims=self.keepdims) - else: - return backend.mean(inputs, axis=[2, 3], keepdims=self.keepdims) + def call(self, inputs): + if self.data_format == "channels_last": + return backend.mean(inputs, axis=[1, 2], keepdims=self.keepdims) + else: + return backend.mean(inputs, axis=[2, 3], keepdims=self.keepdims) # Alias diff --git a/keras/layers/pooling/global_average_pooling3d.py b/keras/layers/pooling/global_average_pooling3d.py index 2130e5294eb2..04b95667ed8e 100644 --- a/keras/layers/pooling/global_average_pooling3d.py +++ b/keras/layers/pooling/global_average_pooling3d.py @@ -13,60 +13,63 @@ # limitations under the License. 
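`GlobalAveragePooling2D.call` above reduces over the row/column axes, so the layer accepts any spatial size, which is why it is the usual replacement for `Flatten` at the top of a convolutional backbone. A hedged usage sketch (layer sizes are arbitrary):

```python
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.Input(shape=(None, None, 3)),     # any image size
    tf.keras.layers.Conv2D(16, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),  # -> (batch, 16) regardless of spatial size
    tf.keras.layers.Dense(10),
])
print(model.output_shape)  # (None, 10)
```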
# ============================================================================== """Global average pooling 3D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.layers.pooling.base_global_pooling3d import GlobalPooling3D +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.GlobalAveragePooling3D', - 'keras.layers.GlobalAvgPool3D') +@keras_export( + "keras.layers.GlobalAveragePooling3D", "keras.layers.GlobalAvgPool3D" +) class GlobalAveragePooling3D(GlobalPooling3D): - """Global Average pooling operation for 3D data. - - Args: - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - while `channels_first` corresponds to inputs with shape - `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - keepdims: A boolean, whether to keep the spatial dimensions or not. - If `keepdims` is `False` (default), the rank of the tensor is reduced - for spatial dimensions. - If `keepdims` is `True`, the spatial dimensions are retained with - length 1. - The behavior is the same as for `tf.reduce_mean` or `np.mean`. + """Global Average pooling operation for 3D data. - Input shape: - - If `data_format='channels_last'`: - 5D tensor with shape: - `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - - If `data_format='channels_first'`: - 5D tensor with shape: - `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` + Args: + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` + while `channels_first` corresponds to inputs with shape + `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + keepdims: A boolean, whether to keep the spatial dimensions or not. + If `keepdims` is `False` (default), the rank of the tensor is reduced + for spatial dimensions. + If `keepdims` is `True`, the spatial dimensions are retained with + length 1. + The behavior is the same as for `tf.reduce_mean` or `np.mean`. - Output shape: - - If `keepdims`=False: - 2D tensor with shape `(batch_size, channels)`. - - If `keepdims`=True: + Input shape: - If `data_format='channels_last'`: - 5D tensor with shape `(batch_size, 1, 1, 1, channels)` + 5D tensor with shape: + `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - If `data_format='channels_first'`: - 5D tensor with shape `(batch_size, channels, 1, 1, 1)` - """ + 5D tensor with shape: + `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` + + Output shape: + - If `keepdims`=False: + 2D tensor with shape `(batch_size, channels)`. 
+ - If `keepdims`=True: + - If `data_format='channels_last'`: + 5D tensor with shape `(batch_size, 1, 1, 1, channels)` + - If `data_format='channels_first'`: + 5D tensor with shape `(batch_size, channels, 1, 1, 1)` + """ - def call(self, inputs): - if self.data_format == 'channels_last': - return backend.mean(inputs, axis=[1, 2, 3], keepdims=self.keepdims) - else: - return backend.mean(inputs, axis=[2, 3, 4], keepdims=self.keepdims) + def call(self, inputs): + if self.data_format == "channels_last": + return backend.mean(inputs, axis=[1, 2, 3], keepdims=self.keepdims) + else: + return backend.mean(inputs, axis=[2, 3, 4], keepdims=self.keepdims) # Alias diff --git a/keras/layers/pooling/global_average_pooling_test.py b/keras/layers/pooling/global_average_pooling_test.py index f38a5a46dcc5..ed33f7c44767 100644 --- a/keras/layers/pooling/global_average_pooling_test.py +++ b/keras/layers/pooling/global_average_pooling_test.py @@ -14,131 +14,157 @@ # ============================================================================== """Tests for global average pooling layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.mixed_precision import policy from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class GlobalAveragePoolingTest(tf.test.TestCase, parameterized.TestCase): - - @test_utils.enable_v2_dtype_behavior - def test_mixed_float16_policy(self): - with policy.policy_scope('mixed_float16'): - inputs1 = keras.Input(shape=(36, 512), dtype='float16') - inputs2 = keras.Input(shape=(36,), dtype='bool') - average_layer = keras.layers.GlobalAveragePooling1D() - _ = average_layer(inputs1, inputs2) - - def test_global_average_pooling_1d(self): - test_utils.layer_test( - keras.layers.GlobalAveragePooling1D, input_shape=(3, 4, 5)) - test_utils.layer_test( - keras.layers.GlobalAveragePooling1D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 5)) - - def test_global_average_pooling_1d_masking_support(self): - model = keras.Sequential() - model.add(keras.layers.Masking(mask_value=0., input_shape=(None, 4))) - model.add(keras.layers.GlobalAveragePooling1D()) - model.compile(loss='mae', optimizer='rmsprop') - - model_input = np.random.random((2, 3, 4)) - model_input[0, 1:, :] = 0 - output = model.predict(model_input) - self.assertAllClose(output[0], model_input[0, 0, :]) - - def test_global_average_pooling_1d_with_ragged(self): - ragged_data = tf.ragged.constant( - [[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]], [[1.0, 1.0], [2.0, 2.0]]], - ragged_rank=1) - dense_data = ragged_data.to_tensor() - - inputs = keras.Input(shape=(None, 2), dtype='float32', ragged=True) - out = keras.layers.GlobalAveragePooling1D()(inputs) - model = keras.models.Model(inputs=inputs, outputs=out) - output_ragged = model.predict(ragged_data, steps=1) - - inputs = keras.Input(shape=(None, 2), dtype='float32') - masking = keras.layers.Masking(mask_value=0., input_shape=(3, 2))(inputs) - out = keras.layers.GlobalAveragePooling1D()(masking) - model = keras.models.Model(inputs=inputs, outputs=out) - output_dense = model.predict(dense_data, steps=1) - - self.assertAllEqual(output_ragged, output_dense) - - def test_global_average_pooling_2d(self): - test_utils.layer_test( - 
keras.layers.GlobalAveragePooling2D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 5, 6)) - test_utils.layer_test( - keras.layers.GlobalAveragePooling2D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 5, 6, 4)) - - def test_global_average_pooling_3d(self): - test_utils.layer_test( - keras.layers.GlobalAveragePooling3D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 3, 4, 3)) - test_utils.layer_test( - keras.layers.GlobalAveragePooling3D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 4, 3, 4, 3)) - - def test_global_average_pooling_1d_keepdims(self): - test_utils.layer_test( - keras.layers.GlobalAveragePooling1D, - kwargs={'keepdims': True}, - input_shape=(3, 4, 5), - expected_output_shape=(None, 1, 5)) - test_utils.layer_test( - keras.layers.GlobalAveragePooling1D, - kwargs={'data_format': 'channels_first', 'keepdims': True}, - input_shape=(3, 4, 5), - expected_output_shape=(None, 4, 1)) - - def test_global_average_pooling_2d_keepdims(self): - test_utils.layer_test( - keras.layers.GlobalAveragePooling2D, - kwargs={'data_format': 'channels_first', 'keepdims': True}, - input_shape=(3, 4, 5, 6), - expected_output_shape=(None, 4, 1, 1)) - test_utils.layer_test( - keras.layers.GlobalAveragePooling2D, - kwargs={'data_format': 'channels_last', 'keepdims': True}, - input_shape=(3, 4, 5, 6), - expected_output_shape=(None, 1, 1, 6)) - - def test_global_average_pooling_3d_keepdims(self): - test_utils.layer_test( - keras.layers.GlobalAveragePooling3D, - kwargs={'data_format': 'channels_first', 'keepdims': True}, - input_shape=(3, 4, 3, 4, 3), - expected_output_shape=(None, 4, 1, 1, 1)) - test_utils.layer_test( - keras.layers.GlobalAveragePooling3D, - kwargs={'data_format': 'channels_last', 'keepdims': True}, - input_shape=(3, 4, 3, 4, 3), - expected_output_shape=(None, 1, 1, 1, 3)) - - def test_global_average_pooling_1d_keepdims_masking_support(self): - model = keras.Sequential() - model.add(keras.layers.Masking(mask_value=0., input_shape=(None, 4))) - model.add(keras.layers.GlobalAveragePooling1D(keepdims=True)) - model.compile(loss='mae', optimizer='rmsprop') - - model_input = np.random.random((2, 3, 4)) - model_input[0, 1:, :] = 0 - output = model.predict(model_input) - self.assertAllEqual((2, 1, 4), output.shape) - self.assertAllClose(output[0, 0], model_input[0, 0, :]) - -if __name__ == '__main__': - tf.test.main() + @test_utils.enable_v2_dtype_behavior + def test_mixed_float16_policy(self): + with policy.policy_scope("mixed_float16"): + inputs1 = keras.Input(shape=(36, 512), dtype="float16") + inputs2 = keras.Input(shape=(36,), dtype="bool") + average_layer = keras.layers.GlobalAveragePooling1D() + _ = average_layer(inputs1, inputs2) + + def test_global_average_pooling_1d(self): + test_utils.layer_test( + keras.layers.GlobalAveragePooling1D, input_shape=(3, 4, 5) + ) + test_utils.layer_test( + keras.layers.GlobalAveragePooling1D, + kwargs={"data_format": "channels_first"}, + input_shape=(3, 4, 5), + ) + + def test_global_average_pooling_1d_masking_support(self): + model = keras.Sequential() + model.add(keras.layers.Masking(mask_value=0.0, input_shape=(None, 4))) + model.add(keras.layers.GlobalAveragePooling1D()) + model.compile(loss="mae", optimizer="rmsprop") + + model_input = np.random.random((2, 3, 4)) + model_input[0, 1:, :] = 0 + output = model.predict(model_input) + self.assertAllClose(output[0], model_input[0, 0, :]) + + def test_global_average_pooling_1d_with_ragged(self): + ragged_data = tf.ragged.constant( + [[[1.0, 
1.0], [2.0, 2.0], [3.0, 3.0]], [[1.0, 1.0], [2.0, 2.0]]], + ragged_rank=1, + ) + dense_data = ragged_data.to_tensor() + + inputs = keras.Input(shape=(None, 2), dtype="float32", ragged=True) + out = keras.layers.GlobalAveragePooling1D()(inputs) + model = keras.models.Model(inputs=inputs, outputs=out) + output_ragged = model.predict(ragged_data, steps=1) + + inputs = keras.Input(shape=(None, 2), dtype="float32") + masking = keras.layers.Masking(mask_value=0.0, input_shape=(3, 2))( + inputs + ) + out = keras.layers.GlobalAveragePooling1D()(masking) + model = keras.models.Model(inputs=inputs, outputs=out) + output_dense = model.predict(dense_data, steps=1) + + self.assertAllEqual(output_ragged, output_dense) + + def test_global_average_pooling_2d(self): + test_utils.layer_test( + keras.layers.GlobalAveragePooling2D, + kwargs={"data_format": "channels_first"}, + input_shape=(3, 4, 5, 6), + ) + test_utils.layer_test( + keras.layers.GlobalAveragePooling2D, + kwargs={"data_format": "channels_last"}, + input_shape=(3, 5, 6, 4), + ) + + def test_global_average_pooling_3d(self): + test_utils.layer_test( + keras.layers.GlobalAveragePooling3D, + kwargs={"data_format": "channels_first"}, + input_shape=(3, 4, 3, 4, 3), + ) + test_utils.layer_test( + keras.layers.GlobalAveragePooling3D, + kwargs={"data_format": "channels_last"}, + input_shape=(3, 4, 3, 4, 3), + ) + + def test_global_average_pooling_1d_keepdims(self): + test_utils.layer_test( + keras.layers.GlobalAveragePooling1D, + kwargs={"keepdims": True}, + input_shape=(3, 4, 5), + expected_output_shape=(None, 1, 5), + ) + test_utils.layer_test( + keras.layers.GlobalAveragePooling1D, + kwargs={"data_format": "channels_first", "keepdims": True}, + input_shape=(3, 4, 5), + expected_output_shape=(None, 4, 1), + ) + + def test_global_average_pooling_2d_keepdims(self): + test_utils.layer_test( + keras.layers.GlobalAveragePooling2D, + kwargs={"data_format": "channels_first", "keepdims": True}, + input_shape=(3, 4, 5, 6), + expected_output_shape=(None, 4, 1, 1), + ) + test_utils.layer_test( + keras.layers.GlobalAveragePooling2D, + kwargs={"data_format": "channels_last", "keepdims": True}, + input_shape=(3, 4, 5, 6), + expected_output_shape=(None, 1, 1, 6), + ) + + def test_global_average_pooling_3d_keepdims(self): + test_utils.layer_test( + keras.layers.GlobalAveragePooling3D, + kwargs={"data_format": "channels_first", "keepdims": True}, + input_shape=(3, 4, 3, 4, 3), + expected_output_shape=(None, 4, 1, 1, 1), + ) + test_utils.layer_test( + keras.layers.GlobalAveragePooling3D, + kwargs={"data_format": "channels_last", "keepdims": True}, + input_shape=(3, 4, 3, 4, 3), + expected_output_shape=(None, 1, 1, 1, 3), + ) + + def test_global_average_pooling_1d_keepdims_masking_support(self): + model = keras.Sequential() + model.add(keras.layers.Masking(mask_value=0.0, input_shape=(None, 4))) + model.add(keras.layers.GlobalAveragePooling1D(keepdims=True)) + model.compile(loss="mae", optimizer="rmsprop") + + model_input = np.random.random((2, 3, 4)) + model_input[0, 1:, :] = 0 + output = model.predict(model_input) + self.assertAllEqual((2, 1, 4), output.shape) + self.assertAllClose(output[0, 0], model_input[0, 0, :]) + + def test_global_average_pooling_1d_invalid_input_dimension(self): + with self.assertRaisesRegex(ValueError, r"""Incorrect input shape"""): + layer = keras.layers.GlobalAveragePooling1D() + layer.build((None, 0, 2)) + + def test_global_average_pooling_3d_invalid_input_dimension(self): + with self.assertRaisesRegex(ValueError, r"""Incorrect input 
shape"""):
+            layer = keras.layers.GlobalAveragePooling3D(keepdims=True)
+            layer.build((None, 0, 16, 16, 3))
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/layers/pooling/global_max_pooling1d.py b/keras/layers/pooling/global_max_pooling1d.py
index 4bcaa6869e4f..db84f22eb53a 100644
--- a/keras/layers/pooling/global_max_pooling1d.py
+++ b/keras/layers/pooling/global_max_pooling1d.py
@@ -13,73 +13,74 @@
 # limitations under the License.
 # ==============================================================================
 """Global max pooling 1D layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
 from keras import backend
 from keras.layers.pooling.base_global_pooling1d import GlobalPooling1D
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
-@keras_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
+@keras_export("keras.layers.GlobalMaxPooling1D", "keras.layers.GlobalMaxPool1D")
 class GlobalMaxPooling1D(GlobalPooling1D):
-  """Global max pooling operation for 1D temporal data.
-
-  Downsamples the input representation by taking the maximum value over
-  the time dimension.
+    """Global max pooling operation for 1D temporal data.
-  For example:
+    Downsamples the input representation by taking the maximum value over
+    the time dimension.
-  >>> x = tf.constant([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
-  >>> x = tf.reshape(x, [3, 3, 1])
-  >>> x
-  <tf.Tensor: shape=(3, 3, 1), dtype=float32, numpy=
-  array([[[1.],
-          [2.],
-          [3.]],
-         [[4.],
-          [5.],
-          [6.]],
-         [[7.],
-          [8.],
-          [9.]]], dtype=float32)>
-  >>> max_pool_1d = tf.keras.layers.GlobalMaxPooling1D()
-  >>> max_pool_1d(x)
-  <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
-  array([[3.],
-         [6.],
-         [9.]], dtype=float32)>
+    For example:
-  Args:
-    data_format: A string,
-      one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, steps, features)` while `channels_first`
-      corresponds to inputs with shape
-      `(batch, features, steps)`.
-    keepdims: A boolean, whether to keep the temporal dimension or not.
-      If `keepdims` is `False` (default), the rank of the tensor is reduced
-      for spatial dimensions.
-      If `keepdims` is `True`, the temporal dimension are retained with
-      length 1.
-      The behavior is the same as for `tf.reduce_max` or `np.max`.
+    >>> x = tf.constant([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
+    >>> x = tf.reshape(x, [3, 3, 1])
+    >>> x
+    <tf.Tensor: shape=(3, 3, 1), dtype=float32, numpy=
+    array([[[1.],
+            [2.],
+            [3.]],
+           [[4.],
+            [5.],
+            [6.]],
+           [[7.],
+            [8.],
+            [9.]]], dtype=float32)>
+    >>> max_pool_1d = tf.keras.layers.GlobalMaxPooling1D()
+    >>> max_pool_1d(x)
+    <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
+    array([[3.],
+           [6.],
+           [9.]], dtype=float32)>
-  Input shape:
-    - If `data_format='channels_last'`:
-      3D tensor with shape:
-      `(batch_size, steps, features)`
-    - If `data_format='channels_first'`:
-      3D tensor with shape:
-      `(batch_size, features, steps)`
+    Args:
+      data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, steps, features)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, features, steps)`.
+      keepdims: A boolean, whether to keep the temporal dimension or not.
+        If `keepdims` is `False` (default), the rank of the tensor is reduced
+        for the temporal dimension.
+        If `keepdims` is `True`, the temporal dimension is retained with
+        length 1.
+        The behavior is the same as for `tf.reduce_max` or `np.max`.
-  Output shape:
-    - If `keepdims`=False:
-      2D tensor with shape `(batch_size, features)`.
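A short sketch (not from the patch) of the 1D global max pooling shapes documented here, plus the zero-length-axis validation that the newly added tests assert:

```python
import tensorflow as tf

x = tf.random.normal((3, 7, 5))  # (batch, steps, features)
print(tf.keras.layers.GlobalMaxPooling1D()(x).shape)               # (3, 5)
print(tf.keras.layers.GlobalMaxPooling1D(keepdims=True)(x).shape)  # (3, 1, 5)

# Building on a zero-length pooling axis is rejected, per the new tests.
try:
    tf.keras.layers.GlobalMaxPooling1D().build((None, 0, 2))
except ValueError as e:
    print(e)  # mentions "Incorrect input shape"
```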
- - If `keepdims`=True: + Input shape: - If `data_format='channels_last'`: - 3D tensor with shape `(batch_size, 1, features)` + 3D tensor with shape: + `(batch_size, steps, features)` - If `data_format='channels_first'`: - 3D tensor with shape `(batch_size, features, 1)` - """ + 3D tensor with shape: + `(batch_size, features, steps)` + + Output shape: + - If `keepdims`=False: + 2D tensor with shape `(batch_size, features)`. + - If `keepdims`=True: + - If `data_format='channels_last'`: + 3D tensor with shape `(batch_size, 1, features)` + - If `data_format='channels_first'`: + 3D tensor with shape `(batch_size, features, 1)` + """ - def call(self, inputs): - steps_axis = 1 if self.data_format == 'channels_last' else 2 - return backend.max(inputs, axis=steps_axis, keepdims=self.keepdims) + def call(self, inputs): + steps_axis = 1 if self.data_format == "channels_last" else 2 + return backend.max(inputs, axis=steps_axis, keepdims=self.keepdims) # Alias diff --git a/keras/layers/pooling/global_max_pooling2d.py b/keras/layers/pooling/global_max_pooling2d.py index dee0a258a060..77ef11b3abdd 100644 --- a/keras/layers/pooling/global_max_pooling2d.py +++ b/keras/layers/pooling/global_max_pooling2d.py @@ -13,65 +13,67 @@ # limitations under the License. # ============================================================================== """Global max pooling 2D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.layers.pooling.base_global_pooling2d import GlobalPooling2D +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D') +@keras_export("keras.layers.GlobalMaxPooling2D", "keras.layers.GlobalMaxPool2D") class GlobalMaxPooling2D(GlobalPooling2D): - """Global max pooling operation for spatial data. - - Examples: + """Global max pooling operation for spatial data. - >>> input_shape = (2, 4, 5, 3) - >>> x = tf.random.normal(input_shape) - >>> y = tf.keras.layers.GlobalMaxPool2D()(x) - >>> print(y.shape) - (2, 3) + Examples: - Args: - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - keepdims: A boolean, whether to keep the spatial dimensions or not. - If `keepdims` is `False` (default), the rank of the tensor is reduced - for spatial dimensions. - If `keepdims` is `True`, the spatial dimensions are retained with - length 1. - The behavior is the same as for `tf.reduce_max` or `np.max`. + >>> input_shape = (2, 4, 5, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.GlobalMaxPooling2D()(x) + >>> print(y.shape) + (2, 3) - Input shape: - - If `data_format='channels_last'`: - 4D tensor with shape `(batch_size, rows, cols, channels)`. - - If `data_format='channels_first'`: - 4D tensor with shape `(batch_size, channels, rows, cols)`. + Args: + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. 
+ `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, height, width)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + keepdims: A boolean, whether to keep the spatial dimensions or not. + If `keepdims` is `False` (default), the rank of the tensor is reduced + for spatial dimensions. + If `keepdims` is `True`, the spatial dimensions are retained with + length 1. + The behavior is the same as for `tf.reduce_max` or `np.max`. - Output shape: - - If `keepdims`=False: - 2D tensor with shape `(batch_size, channels)`. - - If `keepdims`=True: + Input shape: - If `data_format='channels_last'`: - 4D tensor with shape `(batch_size, 1, 1, channels)` + 4D tensor with shape `(batch_size, rows, cols, channels)`. - If `data_format='channels_first'`: - 4D tensor with shape `(batch_size, channels, 1, 1)` - """ + 4D tensor with shape `(batch_size, channels, rows, cols)`. + + Output shape: + - If `keepdims`=False: + 2D tensor with shape `(batch_size, channels)`. + - If `keepdims`=True: + - If `data_format='channels_last'`: + 4D tensor with shape `(batch_size, 1, 1, channels)` + - If `data_format='channels_first'`: + 4D tensor with shape `(batch_size, channels, 1, 1)` + """ - def call(self, inputs): - if self.data_format == 'channels_last': - return backend.max(inputs, axis=[1, 2], keepdims=self.keepdims) - else: - return backend.max(inputs, axis=[2, 3], keepdims=self.keepdims) + def call(self, inputs): + if self.data_format == "channels_last": + return backend.max(inputs, axis=[1, 2], keepdims=self.keepdims) + else: + return backend.max(inputs, axis=[2, 3], keepdims=self.keepdims) # Alias diff --git a/keras/layers/pooling/global_max_pooling3d.py b/keras/layers/pooling/global_max_pooling3d.py index 7df93d13df93..f5385fc9b414 100644 --- a/keras/layers/pooling/global_max_pooling3d.py +++ b/keras/layers/pooling/global_max_pooling3d.py @@ -13,59 +13,61 @@ # limitations under the License. # ============================================================================== """Global max pooling 3D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import backend from keras.layers.pooling.base_global_pooling3d import GlobalPooling3D +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D') +@keras_export("keras.layers.GlobalMaxPooling3D", "keras.layers.GlobalMaxPool3D") class GlobalMaxPooling3D(GlobalPooling3D): - """Global Max pooling operation for 3D data. - - Args: - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - while `channels_first` corresponds to inputs with shape - `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - keepdims: A boolean, whether to keep the spatial dimensions or not. - If `keepdims` is `False` (default), the rank of the tensor is reduced - for spatial dimensions. - If `keepdims` is `True`, the spatial dimensions are retained with - length 1. 
- The behavior is the same as for `tf.reduce_max` or `np.max`. + """Global Max pooling operation for 3D data. - Input shape: - - If `data_format='channels_last'`: - 5D tensor with shape: - `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - - If `data_format='channels_first'`: - 5D tensor with shape: - `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` + Args: + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` + while `channels_first` corresponds to inputs with shape + `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + keepdims: A boolean, whether to keep the spatial dimensions or not. + If `keepdims` is `False` (default), the rank of the tensor is reduced + for spatial dimensions. + If `keepdims` is `True`, the spatial dimensions are retained with + length 1. + The behavior is the same as for `tf.reduce_max` or `np.max`. - Output shape: - - If `keepdims`=False: - 2D tensor with shape `(batch_size, channels)`. - - If `keepdims`=True: + Input shape: - If `data_format='channels_last'`: - 5D tensor with shape `(batch_size, 1, 1, 1, channels)` + 5D tensor with shape: + `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - If `data_format='channels_first'`: - 5D tensor with shape `(batch_size, channels, 1, 1, 1)` - """ + 5D tensor with shape: + `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` + + Output shape: + - If `keepdims`=False: + 2D tensor with shape `(batch_size, channels)`. 
+ - If `keepdims`=True: + - If `data_format='channels_last'`: + 5D tensor with shape `(batch_size, 1, 1, 1, channels)` + - If `data_format='channels_first'`: + 5D tensor with shape `(batch_size, channels, 1, 1, 1)` + """ - def call(self, inputs): - if self.data_format == 'channels_last': - return backend.max(inputs, axis=[1, 2, 3], keepdims=self.keepdims) - else: - return backend.max(inputs, axis=[2, 3, 4], keepdims=self.keepdims) + def call(self, inputs): + if self.data_format == "channels_last": + return backend.max(inputs, axis=[1, 2, 3], keepdims=self.keepdims) + else: + return backend.max(inputs, axis=[2, 3, 4], keepdims=self.keepdims) # Alias diff --git a/keras/layers/pooling/global_max_pooling_test.py b/keras/layers/pooling/global_max_pooling_test.py index f8f4dcd1db1e..ccb59703a3c2 100644 --- a/keras/layers/pooling/global_max_pooling_test.py +++ b/keras/layers/pooling/global_max_pooling_test.py @@ -14,98 +14,124 @@ # ============================================================================== """Tests for global max pooling layers.""" +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class GlobalMaxPoolingTest(tf.test.TestCase, parameterized.TestCase): + def test_global_max_pooling_1d(self): + test_utils.layer_test( + keras.layers.GlobalMaxPooling1D, input_shape=(3, 4, 5) + ) + test_utils.layer_test( + keras.layers.GlobalMaxPooling1D, + kwargs={"data_format": "channels_first"}, + input_shape=(3, 4, 5), + ) + + def test_global_max_pooling_2d_with_ragged(self): + ragged_data = tf.ragged.constant( + [ + [[[1.0], [1.0]], [[2.0], [2.0]], [[3.0], [3.0]]], + [[[1.0], [1.0]], [[2.0], [2.0]]], + ], + ragged_rank=1, + ) + dense_data = ragged_data.to_tensor() + + inputs = keras.Input(shape=(None, 2, 1), dtype="float32", ragged=True) + out = keras.layers.GlobalMaxPooling2D()(inputs) + model = keras.models.Model(inputs=inputs, outputs=out) + output_ragged = model.predict(ragged_data, steps=1) + + inputs = keras.Input(shape=(None, 2, 1), dtype="float32") + out = keras.layers.GlobalMaxPooling2D()(inputs) + model = keras.models.Model(inputs=inputs, outputs=out) + output_dense = model.predict(dense_data, steps=1) + + self.assertAllEqual(output_ragged, output_dense) + + def test_global_max_pooling_2d(self): + test_utils.layer_test( + keras.layers.GlobalMaxPooling2D, + kwargs={"data_format": "channels_first"}, + input_shape=(3, 4, 5, 6), + ) + test_utils.layer_test( + keras.layers.GlobalMaxPooling2D, + kwargs={"data_format": "channels_last"}, + input_shape=(3, 5, 6, 4), + ) + + def test_global_maxpooling_3d(self): + test_utils.layer_test( + keras.layers.GlobalMaxPooling3D, + kwargs={"data_format": "channels_first"}, + input_shape=(3, 4, 3, 4, 3), + ) + test_utils.layer_test( + keras.layers.GlobalMaxPooling3D, + kwargs={"data_format": "channels_last"}, + input_shape=(3, 4, 3, 4, 3), + ) + + def test_global_max_pooling_1d_keepdims(self): + test_utils.layer_test( + keras.layers.GlobalMaxPooling1D, + kwargs={"keepdims": True}, + input_shape=(3, 4, 5), + expected_output_shape=(None, 1, 5), + ) + test_utils.layer_test( + keras.layers.GlobalMaxPooling1D, + kwargs={"data_format": "channels_first", "keepdims": True}, + input_shape=(3, 4, 5), + expected_output_shape=(None, 4, 1), + ) + 
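The ragged test above leans on the fact that, for non-negative data, the zeros introduced by `to_tensor()` padding can never win the max, so the dense and ragged paths agree. A sketch of that comparison (illustrative only):

```python
import tensorflow as tf

ragged_data = tf.ragged.constant(
    [
        [[[1.0], [1.0]], [[2.0], [2.0]], [[3.0], [3.0]]],
        [[[1.0], [1.0]], [[2.0], [2.0]]],
    ],
    ragged_rank=1,
)

inputs = tf.keras.Input(shape=(None, 2, 1), dtype="float32", ragged=True)
model = tf.keras.Model(inputs, tf.keras.layers.GlobalMaxPooling2D()(inputs))
output_ragged = model.predict(ragged_data, steps=1)

# Dense path: padded zeros are <= every real (positive) value here.
output_dense = tf.keras.layers.GlobalMaxPooling2D()(ragged_data.to_tensor())
print(output_ragged)         # [[3.], [2.]]
print(output_dense.numpy())  # same values
```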
+ def test_global_max_pooling_2d_keepdims(self): + test_utils.layer_test( + keras.layers.GlobalMaxPooling2D, + kwargs={"data_format": "channels_first", "keepdims": True}, + input_shape=(3, 4, 5, 6), + expected_output_shape=(None, 4, 1, 1), + ) + test_utils.layer_test( + keras.layers.GlobalMaxPooling2D, + kwargs={"data_format": "channels_last", "keepdims": True}, + input_shape=(3, 4, 5, 6), + expected_output_shape=(None, 1, 1, 6), + ) + + def test_global_max_pooling_3d_keepdims(self): + test_utils.layer_test( + keras.layers.GlobalMaxPooling3D, + kwargs={"data_format": "channels_first", "keepdims": True}, + input_shape=(3, 4, 3, 4, 3), + expected_output_shape=(None, 4, 1, 1, 1), + ) + test_utils.layer_test( + keras.layers.GlobalMaxPooling3D, + kwargs={"data_format": "channels_last", "keepdims": True}, + input_shape=(3, 4, 3, 4, 3), + expected_output_shape=(None, 1, 1, 1, 3), + ) + + def test_global_max_pooling_1d_invalid_input_dimension(self): + with self.assertRaisesRegex(ValueError, r"""Incorrect input shape"""): + layer = keras.layers.GlobalMaxPooling1D() + layer.build((None, 0, 2)) + + def test_global_max_pooling_3d_invalid_input_dimension(self): + with self.assertRaisesRegex(ValueError, r"""Incorrect input shape"""): + layer = keras.layers.GlobalMaxPooling3D(keepdims=True) + layer.build((None, 0, 16, 16, 3)) + - def test_global_max_pooling_1d(self): - test_utils.layer_test( - keras.layers.GlobalMaxPooling1D, input_shape=(3, 4, 5)) - test_utils.layer_test( - keras.layers.GlobalMaxPooling1D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 5)) - - def test_global_max_pooling_2d_with_ragged(self): - ragged_data = tf.ragged.constant( - [[[[1.0], [1.0]], [[2.0], [2.0]], [[3.0], [3.0]]], - [[[1.0], [1.0]], [[2.0], [2.0]]]], - ragged_rank=1) - dense_data = ragged_data.to_tensor() - - inputs = keras.Input(shape=(None, 2, 1), dtype='float32', ragged=True) - out = keras.layers.GlobalMaxPooling2D()(inputs) - model = keras.models.Model(inputs=inputs, outputs=out) - output_ragged = model.predict(ragged_data, steps=1) - - inputs = keras.Input(shape=(None, 2, 1), dtype='float32') - out = keras.layers.GlobalMaxPooling2D()(inputs) - model = keras.models.Model(inputs=inputs, outputs=out) - output_dense = model.predict(dense_data, steps=1) - - self.assertAllEqual(output_ragged, output_dense) - - def test_global_max_pooling_2d(self): - test_utils.layer_test( - keras.layers.GlobalMaxPooling2D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 5, 6)) - test_utils.layer_test( - keras.layers.GlobalMaxPooling2D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 5, 6, 4)) - - def test_global_maxpooling_3d(self): - test_utils.layer_test( - keras.layers.GlobalMaxPooling3D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 3, 4, 3)) - test_utils.layer_test( - keras.layers.GlobalMaxPooling3D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 4, 3, 4, 3)) - - def test_global_max_pooling_1d_keepdims(self): - test_utils.layer_test( - keras.layers.GlobalMaxPooling1D, - kwargs={'keepdims': True}, - input_shape=(3, 4, 5), - expected_output_shape=(None, 1, 5)) - test_utils.layer_test( - keras.layers.GlobalMaxPooling1D, - kwargs={'data_format': 'channels_first', 'keepdims': True}, - input_shape=(3, 4, 5), - expected_output_shape=(None, 4, 1)) - - def test_global_max_pooling_2d_keepdims(self): - test_utils.layer_test( - keras.layers.GlobalMaxPooling2D, - kwargs={'data_format': 'channels_first', 'keepdims': True}, - input_shape=(3, 4, 5, 6), - 
expected_output_shape=(None, 4, 1, 1))
-    test_utils.layer_test(
-        keras.layers.GlobalMaxPooling2D,
-        kwargs={'data_format': 'channels_last', 'keepdims': True},
-        input_shape=(3, 4, 5, 6),
-        expected_output_shape=(None, 1, 1, 6))
-
-  def test_global_max_pooling_3d_keepdims(self):
-    test_utils.layer_test(
-        keras.layers.GlobalMaxPooling3D,
-        kwargs={'data_format': 'channels_first', 'keepdims': True},
-        input_shape=(3, 4, 3, 4, 3),
-        expected_output_shape=(None, 4, 1, 1, 1))
-    test_utils.layer_test(
-        keras.layers.GlobalMaxPooling3D,
-        kwargs={'data_format': 'channels_last', 'keepdims': True},
-        input_shape=(3, 4, 3, 4, 3),
-        expected_output_shape=(None, 1, 1, 1, 3))
-
-if __name__ == '__main__':
-  tf.test.main()
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/layers/pooling/max_pooling1d.py b/keras/layers/pooling/max_pooling1d.py
index ff090941d5cd..67e915d4b79c 100644
--- a/keras/layers/pooling/max_pooling1d.py
+++ b/keras/layers/pooling/max_pooling1d.py
@@ -13,106 +13,114 @@
 # limitations under the License.
 # ==============================================================================
 """Max pooling 1D layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
 import functools
 from keras import backend
 from keras.layers.pooling.base_pooling1d import Pooling1D
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
-@keras_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
+@keras_export("keras.layers.MaxPooling1D", "keras.layers.MaxPool1D")
 class MaxPooling1D(Pooling1D):
-  """Max pooling operation for 1D temporal data.
-
-  Downsamples the input representation by taking the maximum value over a
-  spatial window of size `pool_size`. The window is shifted by `strides`. The
-  resulting output, when using the `"valid"` padding option, has a shape of:
-  `output_shape = (input_shape - pool_size + 1) / strides)`
-
-  The resulting output shape when using the `"same"` padding option is:
-  `output_shape = input_shape / strides`
-
-  For example, for `strides=1` and `padding="valid"`:
-
-  >>> x = tf.constant([1., 2., 3., 4., 5.])
-  >>> x = tf.reshape(x, [1, 5, 1])
-  >>> max_pool_1d = tf.keras.layers.MaxPooling1D(pool_size=2,
-  ...    strides=1, padding='valid')
-  >>> max_pool_1d(x)
-  <tf.Tensor: shape=(1, 4, 1), dtype=float32, numpy=
-  array([[[2.],
-          [3.],
-          [4.],
-          [5.]]], dtype=float32)>
-
-  For example, for `strides=2` and `padding="valid"`:
-
-  >>> x = tf.constant([1., 2., 3., 4., 5.])
-  >>> x = tf.reshape(x, [1, 5, 1])
-  >>> max_pool_1d = tf.keras.layers.MaxPooling1D(pool_size=2,
-  ...    strides=2, padding='valid')
-  >>> max_pool_1d(x)
-  <tf.Tensor: shape=(1, 2, 1), dtype=float32, numpy=
-  array([[[2.],
-          [4.]]], dtype=float32)>
-
-  For example, for `strides=1` and `padding="same"`:
-
-  >>> x = tf.constant([1., 2., 3., 4., 5.])
-  >>> x = tf.reshape(x, [1, 5, 1])
-  >>> max_pool_1d = tf.keras.layers.MaxPooling1D(pool_size=2,
-  ...    strides=1, padding='same')
-  >>> max_pool_1d(x)
-  <tf.Tensor: shape=(1, 5, 1), dtype=float32, numpy=
-  array([[[2.],
-          [3.],
-          [4.],
-          [5.],
-          [5.]]], dtype=float32)>
-
-  Args:
-    pool_size: Integer, size of the max pooling window.
-    strides: Integer, or None. Specifies how much the pooling window moves
-      for each pooling step.
-      If None, it will default to `pool_size`.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-      `"valid"` means no padding. `"same"` results in padding evenly to
-      the left/right or up/down of the input such that output has the same
-      height/width dimension as the input.
-    data_format: A string,
-      one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, steps, features)` while `channels_first`
-      corresponds to inputs with shape
-      `(batch, features, steps)`.
-
-  Input shape:
-    - If `data_format='channels_last'`:
-      3D tensor with shape `(batch_size, steps, features)`.
-    - If `data_format='channels_first'`:
-      3D tensor with shape `(batch_size, features, steps)`.
-
-  Output shape:
-    - If `data_format='channels_last'`:
-      3D tensor with shape `(batch_size, downsampled_steps, features)`.
-    - If `data_format='channels_first'`:
-      3D tensor with shape `(batch_size, features, downsampled_steps)`.
-  """
-
-  def __init__(self, pool_size=2, strides=None,
-               padding='valid', data_format='channels_last', **kwargs):
-
-    super().__init__(
-        functools.partial(backend.pool2d, pool_mode='max'),
-        pool_size=pool_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        **kwargs)
+    """Max pooling operation for 1D temporal data.
+
+    Downsamples the input representation by taking the maximum value over a
+    spatial window of size `pool_size`. The window is shifted by `strides`. The
+    resulting output, when using the `"valid"` padding option, has a shape of:
+    `output_shape = (input_shape - pool_size + 1) / strides`
+
+    The resulting output shape when using the `"same"` padding option is:
+    `output_shape = input_shape / strides`
+
+    For example, for `strides=1` and `padding="valid"`:
+
+    >>> x = tf.constant([1., 2., 3., 4., 5.])
+    >>> x = tf.reshape(x, [1, 5, 1])
+    >>> max_pool_1d = tf.keras.layers.MaxPooling1D(pool_size=2,
+    ...    strides=1, padding='valid')
+    >>> max_pool_1d(x)
+    <tf.Tensor: shape=(1, 4, 1), dtype=float32, numpy=
+    array([[[2.],
+            [3.],
+            [4.],
+            [5.]]], dtype=float32)>
+
+    For example, for `strides=2` and `padding="valid"`:
+
+    >>> x = tf.constant([1., 2., 3., 4., 5.])
+    >>> x = tf.reshape(x, [1, 5, 1])
+    >>> max_pool_1d = tf.keras.layers.MaxPooling1D(pool_size=2,
+    ...    strides=2, padding='valid')
+    >>> max_pool_1d(x)
+    <tf.Tensor: shape=(1, 2, 1), dtype=float32, numpy=
+    array([[[2.],
+            [4.]]], dtype=float32)>
+
+    For example, for `strides=1` and `padding="same"`:
+
+    >>> x = tf.constant([1., 2., 3., 4., 5.])
+    >>> x = tf.reshape(x, [1, 5, 1])
+    >>> max_pool_1d = tf.keras.layers.MaxPooling1D(pool_size=2,
+    ...    strides=1, padding='same')
+    >>> max_pool_1d(x)
+    <tf.Tensor: shape=(1, 5, 1), dtype=float32, numpy=
+    array([[[2.],
+            [3.],
+            [4.],
+            [5.],
+            [5.]]], dtype=float32)>
+
+    Args:
+      pool_size: Integer, size of the max pooling window.
+      strides: Integer, or None. Specifies how much the pooling window moves
+        for each pooling step.
+        If None, it will default to `pool_size`.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+        `"valid"` means no padding. `"same"` results in padding evenly to
+        the left/right or up/down of the input such that output has the same
+        height/width dimension as the input.
+      data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, steps, features)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, features, steps)`.
+
+    Input shape:
+      - If `data_format='channels_last'`:
+        3D tensor with shape `(batch_size, steps, features)`.
+      - If `data_format='channels_first'`:
+        3D tensor with shape `(batch_size, features, steps)`.
+
+    Output shape:
+      - If `data_format='channels_last'`:
+        3D tensor with shape `(batch_size, downsampled_steps, features)`.
+      - If `data_format='channels_first'`:
+        3D tensor with shape `(batch_size, features, downsampled_steps)`.
+    """
+
+    def __init__(
+        self,
+        pool_size=2,
+        strides=None,
+        padding="valid",
+        data_format="channels_last",
+        **kwargs
+    ):
+
+        super().__init__(
+            functools.partial(backend.pool2d, pool_mode="max"),
+            pool_size=pool_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            **kwargs
+        )
 # Alias
diff --git a/keras/layers/pooling/max_pooling2d.py b/keras/layers/pooling/max_pooling2d.py
index 1ac40cd41acf..f21ab07f2142 100644
--- a/keras/layers/pooling/max_pooling2d.py
+++ b/keras/layers/pooling/max_pooling2d.py
@@ -13,149 +13,158 @@
 # limitations under the License.
 # ==============================================================================
 """Max pooling 2D layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
-from keras.layers.pooling.base_pooling2d import Pooling2D
+
 import tensorflow.compat.v2 as tf
+from keras.layers.pooling.base_pooling2d import Pooling2D
+
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
-@keras_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
+@keras_export("keras.layers.MaxPooling2D", "keras.layers.MaxPool2D")
 class MaxPooling2D(Pooling2D):
-  """Max pooling operation for 2D spatial data.
-
-  Downsamples the input along its spatial dimensions (height and width)
-  by taking the maximum value over an input window
-  (of size defined by `pool_size`) for each channel of the input.
-  The window is shifted by `strides` along each dimension.
-
-  The resulting output,
-  when using the `"valid"` padding option, has a spatial shape
-  (number of rows or columns) of:
-  `output_shape = math.floor((input_shape - pool_size) / strides) + 1`
-  (when `input_shape >= pool_size`)
-
-  The resulting output shape when using the `"same"` padding option is:
-  `output_shape = math.floor((input_shape - 1) / strides) + 1`
-
-  For example, for `strides=(1, 1)` and `padding="valid"`:
-
-  >>> x = tf.constant([[1., 2., 3.],
-  ...                  [4., 5., 6.],
-  ...                  [7., 8., 9.]])
-  >>> x = tf.reshape(x, [1, 3, 3, 1])
-  >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
-  ...    strides=(1, 1), padding='valid')
-  >>> max_pool_2d(x)
-  <tf.Tensor: shape=(1, 2, 2, 1), dtype=float32, numpy=
-  array([[[[5.],
-           [6.]],
-          [[8.],
-           [9.]]]], dtype=float32)>
-
-  For example, for `strides=(2, 2)` and `padding="valid"`:
-
-  >>> x = tf.constant([[1., 2., 3., 4.],
-  ...                  [5., 6., 7., 8.],
-  ...                  [9., 10., 11., 12.]])
-  >>> x = tf.reshape(x, [1, 3, 4, 1])
-  >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
-  ...    strides=(2, 2), padding='valid')
-  >>> max_pool_2d(x)
-  <tf.Tensor: shape=(1, 1, 2, 1), dtype=float32, numpy=
-  array([[[[6.],
-           [8.]]]], dtype=float32)>
-
-  Usage Example:
-
-  >>> input_image = tf.constant([[[[1.], [1.], [2.], [4.]],
-  ...                            [[2.], [2.], [3.], [2.]],
-  ...                            [[4.], [1.], [1.], [1.]],
-  ...                            [[2.], [2.], [1.], [4.]]]])
-  >>> output = tf.constant([[[[1], [0]],
-  ...                       [[0], [1]]]])
-  >>> model = tf.keras.models.Sequential()
-  >>> model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
-  ...    input_shape=(4, 4, 1)))
-  >>> model.compile('adam', 'mean_squared_error')
-  >>> model.predict(input_image, steps=1)
-  array([[[[2.],
-           [4.]],
-          [[4.],
-           [4.]]]], dtype=float32)
-
-  For example, for stride=(1, 1) and padding="same":
-
-  >>> x = tf.constant([[1., 2., 3.],
-  ...                  [4., 5., 6.],
-  ...                  [7., 8., 9.]])
-  >>> x = tf.reshape(x, [1, 3, 3, 1])
-  >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
-  ...    strides=(1, 1), padding='same')
-  >>> max_pool_2d(x)
-  <tf.Tensor: shape=(1, 3, 3, 1), dtype=float32, numpy=
-  array([[[[5.],
-           [6.],
-           [6.]],
-          [[8.],
-           [9.],
-           [9.]],
-          [[8.],
-           [9.],
-           [9.]]]], dtype=float32)>
-
-  Args:
-    pool_size: integer or tuple of 2 integers,
-      window size over which to take the maximum.
-      `(2, 2)` will take the max value over a 2x2 pooling window.
-      If only one integer is specified, the same window length
-      will be used for both dimensions.
-    strides: Integer, tuple of 2 integers, or None.
-      Strides values. Specifies how far the pooling window moves
-      for each pooling step. If None, it will default to `pool_size`.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-      `"valid"` means no padding. `"same"` results in padding evenly to
-      the left/right or up/down of the input such that output has the same
-      height/width dimension as the input.
-    data_format: A string,
-      one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first`
-      corresponds to inputs with shape
-      `(batch, channels, height, width)`.
-      It defaults to the `image_data_format` value found in your
-      Keras config file at `~/.keras/keras.json`.
-      If you never set it, then it will be "channels_last".
-
-  Input shape:
-    - If `data_format='channels_last'`:
-      4D tensor with shape `(batch_size, rows, cols, channels)`.
-    - If `data_format='channels_first'`:
-      4D tensor with shape `(batch_size, channels, rows, cols)`.
-
-  Output shape:
-    - If `data_format='channels_last'`:
-      4D tensor with shape `(batch_size, pooled_rows, pooled_cols, channels)`.
-    - If `data_format='channels_first'`:
-      4D tensor with shape `(batch_size, channels, pooled_rows, pooled_cols)`.
-
-  Returns:
-    A tensor of rank 4 representing the maximum pooled values. See above for
-    output shape.
-  """
-
-  def __init__(self,
-               pool_size=(2, 2),
-               strides=None,
-               padding='valid',
-               data_format=None,
-               **kwargs):
-    super().__init__(
-        tf.compat.v1.nn.max_pool,
-        pool_size=pool_size, strides=strides,
-        padding=padding, data_format=data_format, **kwargs)
+    """Max pooling operation for 2D spatial data.
+
+    Downsamples the input along its spatial dimensions (height and width)
+    by taking the maximum value over an input window
+    (of size defined by `pool_size`) for each channel of the input.
+    The window is shifted by `strides` along each dimension.
+
+    The resulting output,
+    when using the `"valid"` padding option, has a spatial shape
+    (number of rows or columns) of:
+    `output_shape = math.floor((input_shape - pool_size) / strides) + 1`
+    (when `input_shape >= pool_size`)
+
+    The resulting output shape when using the `"same"` padding option is:
+    `output_shape = math.floor((input_shape - 1) / strides) + 1`
+
+    For example, for `strides=(1, 1)` and `padding="valid"`:
+
+    >>> x = tf.constant([[1., 2., 3.],
+    ...                  [4., 5., 6.],
+    ...                  [7., 8., 9.]])
+    >>> x = tf.reshape(x, [1, 3, 3, 1])
+    >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
+    ...    strides=(1, 1), padding='valid')
+    >>> max_pool_2d(x)
+    <tf.Tensor: shape=(1, 2, 2, 1), dtype=float32, numpy=
+    array([[[[5.],
+             [6.]],
+            [[8.],
+             [9.]]]], dtype=float32)>
+
+    For example, for `strides=(2, 2)` and `padding="valid"`:
+
+    >>> x = tf.constant([[1., 2., 3., 4.],
+    ...                  [5., 6., 7., 8.],
+    ...                  [9., 10., 11., 12.]])
+    >>> x = tf.reshape(x, [1, 3, 4, 1])
+    >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
+    ...    strides=(2, 2), padding='valid')
+    >>> max_pool_2d(x)
+    <tf.Tensor: shape=(1, 1, 2, 1), dtype=float32, numpy=
+    array([[[[6.],
+             [8.]]]], dtype=float32)>
+
+    Usage Example:
+
+    >>> input_image = tf.constant([[[[1.], [1.], [2.], [4.]],
+    ...                            [[2.], [2.], [3.], [2.]],
+    ...                            [[4.], [1.], [1.], [1.]],
+    ...                            [[2.], [2.], [1.], [4.]]]])
+    >>> output = tf.constant([[[[1], [0]],
+    ...                       [[0], [1]]]])
+    >>> model = tf.keras.models.Sequential()
+    >>> model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
+    ...    input_shape=(4, 4, 1)))
+    >>> model.compile('adam', 'mean_squared_error')
+    >>> model.predict(input_image, steps=1)
+    array([[[[2.],
+             [4.]],
+            [[4.],
+             [4.]]]], dtype=float32)
+
+    For example, for stride=(1, 1) and padding="same":
+
+    >>> x = tf.constant([[1., 2., 3.],
+    ...                  [4., 5., 6.],
+    ...                  [7., 8., 9.]])
+    >>> x = tf.reshape(x, [1, 3, 3, 1])
+    >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
+    ...    strides=(1, 1), padding='same')
+    >>> max_pool_2d(x)
+    <tf.Tensor: shape=(1, 3, 3, 1), dtype=float32, numpy=
+    array([[[[5.],
+             [6.],
+             [6.]],
+            [[8.],
+             [9.],
+             [9.]],
+            [[8.],
+             [9.],
+             [9.]]]], dtype=float32)>
+
+    Args:
+      pool_size: integer or tuple of 2 integers,
+        window size over which to take the maximum.
+        `(2, 2)` will take the max value over a 2x2 pooling window.
+        If only one integer is specified, the same window length
+        will be used for both dimensions.
+      strides: Integer, tuple of 2 integers, or None.
+        Strides values. Specifies how far the pooling window moves
+        for each pooling step. If None, it will default to `pool_size`.
+      padding: One of `"valid"` or `"same"` (case-insensitive).
+        `"valid"` means no padding. `"same"` results in padding evenly to
+        the left/right or up/down of the input such that output has the same
+        height/width dimension as the input.
+      data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, height, width, channels)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, channels, height, width)`.
+        When unspecified, uses
+        `image_data_format` value found in your Keras config file at
+        `~/.keras/keras.json` (if exists) else 'channels_last'.
+        Defaults to 'channels_last'.
+
+    Input shape:
+      - If `data_format='channels_last'`:
+        4D tensor with shape `(batch_size, rows, cols, channels)`.
+      - If `data_format='channels_first'`:
+        4D tensor with shape `(batch_size, channels, rows, cols)`.
+
+    Output shape:
+      - If `data_format='channels_last'`:
+        4D tensor with shape `(batch_size, pooled_rows, pooled_cols, channels)`.
+      - If `data_format='channels_first'`:
+        4D tensor with shape `(batch_size, channels, pooled_rows, pooled_cols)`.
+
+    Returns:
+      A tensor of rank 4 representing the maximum pooled values. See above for
+      output shape.
+    """
+
+    def __init__(
+        self,
+        pool_size=(2, 2),
+        strides=None,
+        padding="valid",
+        data_format=None,
+        **kwargs
+    ):
+        super().__init__(
+            tf.compat.v1.nn.max_pool,
+            pool_size=pool_size,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            **kwargs
+        )
 # Alias
diff --git a/keras/layers/pooling/max_pooling3d.py b/keras/layers/pooling/max_pooling3d.py
index fc31276ceb44..64b2575732eb 100644
--- a/keras/layers/pooling/max_pooling3d.py
+++ b/keras/layers/pooling/max_pooling3d.py
@@ -13,83 +13,92 @@
 # limitations under the License.
 # ==============================================================================
 """Max pooling 3D layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
-from keras.layers.pooling.base_pooling3d import Pooling3D
+
 import tensorflow.compat.v2 as tf
+from keras.layers.pooling.base_pooling3d import Pooling3D
+
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
-@keras_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
+@keras_export("keras.layers.MaxPooling3D", "keras.layers.MaxPool3D")
 class MaxPooling3D(Pooling3D):
-  """Max pooling operation for 3D data (spatial or spatio-temporal).
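The two `MaxPooling2D` output-shape formulas above are easy to sanity-check numerically; a sketch (the 9x9 case is where `"valid"` and `"same"` diverge):

```python
import tensorflow as tf

for size in (8, 9):
    x = tf.random.normal((1, size, size, 3))
    valid = tf.keras.layers.MaxPooling2D(2, strides=2, padding="valid")(x)
    same = tf.keras.layers.MaxPooling2D(2, strides=2, padding="same")(x)
    # valid: floor((size - 2) / 2) + 1;  same: floor((size - 1) / 2) + 1
    print(size, valid.shape[1], same.shape[1])  # 8 -> 4, 4;  9 -> 4, 5
```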
- - Downsamples the input along its spatial dimensions (depth, height, and width) - by taking the maximum value over an input window - (of size defined by `pool_size`) for each channel of the input. - The window is shifted by `strides` along each dimension. - - Args: - pool_size: Tuple of 3 integers, - factors by which to downscale (dim1, dim2, dim3). - `(2, 2, 2)` will halve the size of the 3D input in each dimension. - strides: tuple of 3 integers, or None. Strides values. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - while `channels_first` corresponds to inputs with shape - `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - - Input shape: - - If `data_format='channels_last'`: - 5D tensor with shape: - `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - - If `data_format='channels_first'`: - 5D tensor with shape: - `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` - - Output shape: - - If `data_format='channels_last'`: - 5D tensor with shape: - `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)` - - If `data_format='channels_first'`: - 5D tensor with shape: - `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)` - - Example: - - ```python - depth = 30 - height = 30 - width = 30 - input_channels = 3 - - inputs = tf.keras.Input(shape=(depth, height, width, input_channels)) - layer = tf.keras.layers.MaxPooling3D(pool_size=3) - outputs = layer(inputs) # Shape: (batch_size, 10, 10, 10, 3) - ``` - """ - - def __init__(self, - pool_size=(2, 2, 2), - strides=None, - padding='valid', - data_format=None, - **kwargs): - super().__init__( - tf.nn.max_pool3d, - pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, **kwargs) + """Max pooling operation for 3D data (spatial or spatio-temporal). + + Downsamples the input along its spatial dimensions (depth, height, and + width) by taking the maximum value over an input window (of size defined by + `pool_size`) for each channel of the input. The window is shifted by + `strides` along each dimension. + + Args: + pool_size: Tuple of 3 integers, + factors by which to downscale (dim1, dim2, dim3). + `(2, 2, 2)` will halve the size of the 3D input in each dimension. + strides: tuple of 3 integers, or None. Strides values. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)` + while `channels_first` corresponds to inputs with shape + `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. 
+ When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + + Input shape: + - If `data_format='channels_last'`: + 5D tensor with shape: + `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` + - If `data_format='channels_first'`: + 5D tensor with shape: + `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)` + + Output shape: + - If `data_format='channels_last'`: + 5D tensor with shape: + `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)` + - If `data_format='channels_first'`: + 5D tensor with shape: + `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)` + + Example: + + ```python + depth = 30 + height = 30 + width = 30 + input_channels = 3 + + inputs = tf.keras.Input(shape=(depth, height, width, input_channels)) + layer = tf.keras.layers.MaxPooling3D(pool_size=3) + outputs = layer(inputs) # Shape: (batch_size, 10, 10, 10, 3) + ``` + """ + + def __init__( + self, + pool_size=(2, 2, 2), + strides=None, + padding="valid", + data_format=None, + **kwargs + ): + super().__init__( + tf.nn.max_pool3d, + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + **kwargs + ) # Alias diff --git a/keras/layers/pooling/max_pooling_test.py b/keras/layers/pooling/max_pooling_test.py index 70fc151674c5..e1e0bc568ba2 100644 --- a/keras/layers/pooling/max_pooling_test.py +++ b/keras/layers/pooling/max_pooling_test.py @@ -14,62 +14,61 @@ # ============================================================================== """Tests for max pooling layers.""" +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class MaxPoolingTest(tf.test.TestCase, parameterized.TestCase): - - def test_max_pooling_1d(self): - for padding in ['valid', 'same']: - for stride in [1, 2]: + def test_max_pooling_1d(self): + for padding in ["valid", "same"]: + for stride in [1, 2]: + test_utils.layer_test( + keras.layers.MaxPooling1D, + kwargs={"strides": stride, "padding": padding}, + input_shape=(3, 5, 4), + ) test_utils.layer_test( keras.layers.MaxPooling1D, + kwargs={"data_format": "channels_first"}, + input_shape=(3, 2, 6), + ) + + def test_max_pooling_2d(self): + pool_size = (3, 3) + for strides in [(1, 1), (2, 2)]: + test_utils.layer_test( + keras.layers.MaxPooling2D, + kwargs={ + "strides": strides, + "padding": "valid", + "pool_size": pool_size, + }, + input_shape=(3, 5, 6, 4), + ) + + def test_max_pooling_3d(self): + pool_size = (3, 3, 3) + test_utils.layer_test( + keras.layers.MaxPooling3D, + kwargs={"strides": 2, "padding": "valid", "pool_size": pool_size}, + input_shape=(3, 11, 12, 10, 4), + ) + test_utils.layer_test( + keras.layers.MaxPooling3D, kwargs={ - 'strides': stride, - 'padding': padding + "strides": 3, + "padding": "valid", + "data_format": "channels_first", + "pool_size": pool_size, }, - input_shape=(3, 5, 4)) - test_utils.layer_test( - keras.layers.MaxPooling1D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 2, 6)) - - def test_max_pooling_2d(self): - pool_size = (3, 3) - for strides in [(1, 1), (2, 2)]: - test_utils.layer_test( - keras.layers.MaxPooling2D, - kwargs={ - 'strides': 
strides, - 'padding': 'valid', - 'pool_size': pool_size - }, - input_shape=(3, 5, 6, 4)) + input_shape=(3, 4, 11, 12, 10), + ) - def test_max_pooling_3d(self): - pool_size = (3, 3, 3) - test_utils.layer_test( - keras.layers.MaxPooling3D, - kwargs={ - 'strides': 2, - 'padding': 'valid', - 'pool_size': pool_size - }, - input_shape=(3, 11, 12, 10, 4)) - test_utils.layer_test( - keras.layers.MaxPooling3D, - kwargs={ - 'strides': 3, - 'padding': 'valid', - 'data_format': 'channels_first', - 'pool_size': pool_size - }, - input_shape=(3, 4, 11, 12, 10)) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/preprocessing/BUILD b/keras/layers/preprocessing/BUILD index ca9cd75ca4af..17acbcd0aa3f 100644 --- a/keras/layers/preprocessing/BUILD +++ b/keras/layers/preprocessing/BUILD @@ -1,6 +1,7 @@ # Description: # Contains the Keras preprocess layers (internal TensorFlow version). +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") # buildifier: disable=same-origin-load @@ -8,6 +9,7 @@ load("@org_keras//keras:keras.bzl", "cuda_py_test") load("@org_keras//keras:keras.bzl", "distribute_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/tools/pip_package:__pkg__", @@ -256,6 +258,9 @@ distribute_py_test( name = "category_encoding_distribution_test", srcs = ["category_encoding_distribution_test.py"], disable_mlir_bridge = False, + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "category_encoding_distribution_test.py", python_version = "PY3", shard_count = 4, @@ -282,6 +287,9 @@ distribute_py_test( distribute_py_test( name = "image_preprocessing_distribution_test", srcs = ["image_preprocessing_distribution_test.py"], + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "image_preprocessing_distribution_test.py", python_version = "PY3", shard_count = 4, @@ -323,6 +331,9 @@ tf_py_test( distribute_py_test( name = "discretization_distribution_test", srcs = ["discretization_distribution_test.py"], + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "discretization_distribution_test.py", python_version = "PY3", shard_count = 4, @@ -362,6 +373,9 @@ distribute_py_test( name = "hashing_distribution_test", srcs = ["hashing_distribution_test.py"], disable_mlir_bridge = False, + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "hashing_distribution_test.py", python_version = "PY3", shard_count = 4, @@ -415,6 +429,9 @@ distribute_py_test( name = "index_lookup_distribution_test", srcs = ["index_lookup_distribution_test.py"], disable_mlir_bridge = False, + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "index_lookup_distribution_test.py", python_version = "PY3", shard_count = 4, @@ -490,6 +507,9 @@ tf_py_test( distribute_py_test( name = "normalization_distribution_test", srcs = ["normalization_distribution_test.py"], + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "normalization_distribution_test.py", python_version = "PY3", shard_count = 8, @@ -527,9 +547,12 @@ distribute_py_test( name = "text_vectorization_distribution_test", srcs = ["text_vectorization_distribution_test.py"], disable_mlir_bridge = False, + env = { + "CUDA_MODULE_LOADING": "LAZY", + }, main = "text_vectorization_distribution_test.py", python_version = "PY3", - shard_count = 4, + shard_count = 8, tags = [ "multi_and_single_gpu", "nomultivm", # TODO(b/170502145) diff --git a/keras/layers/preprocessing/benchmarks/BUILD 
b/keras/layers/preprocessing/benchmarks/BUILD index 4a6a4d15109b..66d4bf22a6b5 100644 --- a/keras/layers/preprocessing/benchmarks/BUILD +++ b/keras/layers/preprocessing/benchmarks/BUILD @@ -1,3 +1,5 @@ +# Placeholder: load unaliased py_library + # Benchmarks for Keras preprocessing layers. load("@org_keras//keras:keras.bzl", "cuda_py_test") @@ -5,6 +7,7 @@ load("@org_keras//keras:keras.bzl", "cuda_py_test") load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/tools/pip_package:__pkg__", diff --git a/keras/layers/preprocessing/benchmarks/__init__.py b/keras/layers/preprocessing/benchmarks/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py index ff2dbd5693c4..e12ec7ae8013 100644 --- a/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py @@ -14,14 +14,19 @@ # ============================================================================== """Benchmark for KPL implementation of bucketized columns with dense inputs.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import discretization -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 # The number of times to run each benchmark. BATCH_SIZES = [32, 256] @@ -29,46 +34,51 @@ ### KPL AND FC IMPLEMENTATION BENCHMARKS ### def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. - max_value = 25.0 - bins = np.arange(1.0, max_value) - data = fc_bm.create_data( - max_length, batch_size * NUM_REPEATS, 100000, dtype=float) - - # Keras implementation - model = keras.Sequential() - model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.float32)) - model.add(discretization.Discretization(bins)) - - # FC implementation - fc = tf.feature_column.bucketized_column( - tf.feature_column.numeric_column("data"), boundaries=list(bins)) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data.to_tensor(default_value=0.0)} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_tensor(default_value=0.0)} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. 
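This benchmark times the same bucketing logic through two code paths: the Keras `Discretization` layer and `tf.feature_column.bucketized_column`, driven by identical boundaries. A minimal sketch of the shared bucketing behavior, assuming a current `tf.keras` where the constructor argument is named `bin_boundaries` (the benchmark's older `Discretization(bins)` call passes the same boundaries positionally):

```python
import tensorflow as tf

# Values map to the index of the bucket they fall into:
# x < 1.0 -> 0, 1.0 <= x < 2.0 -> 1, 2.0 <= x < 3.0 -> 2, x >= 3.0 -> 3.
layer = tf.keras.layers.Discretization(bin_boundaries=[1.0, 2.0, 3.0])
print(layer(tf.constant([[0.5, 1.0, 2.7, 10.0]])))  # [[0 1 2 3]]
```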
+ max_value = 25.0 + bins = np.arange(1.0, max_value) + data = fc_bm.create_data( + max_length, batch_size * NUM_REPEATS, 100000, dtype=float + ) + + # Keras implementation + model = keras.Sequential() + model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.float32)) + model.add(discretization.Discretization(bins)) + + # FC implementation + fc = tf.feature_column.bucketized_column( + tf.feature_column.numeric_column("data"), boundaries=list(bins) + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = {"data": data.to_tensor(default_value=0.0)} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_tensor(default_value=0.0)} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "bucketized|dense|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"bucketized|dense|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py b/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py index e44804626a22..15e2545c7791 100644 --- a/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py @@ -14,62 +14,70 @@ # ============================================================================== """Benchmark for Keras category_encoding preprocessing layer.""" -import tensorflow.compat.v2 as tf - import time import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.layers.preprocessing import category_encoding class BenchmarkLayer(tf.test.Benchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def run_dataset_implementation(self, output_mode, batch_size, sequence_length, - max_tokens): - input_t = keras.Input(shape=(sequence_length,), dtype=tf.int32) - layer = category_encoding.CategoryEncoding( - max_tokens=max_tokens, output_mode=output_mode) - _ = layer(input_t) + def run_dataset_implementation( + self, output_mode, batch_size, sequence_length, max_tokens + ): + input_t = keras.Input(shape=(sequence_length,), dtype=tf.int32) + layer = category_encoding.CategoryEncoding( + max_tokens=max_tokens, output_mode=output_mode + ) + _ = layer(input_t) - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.from_tensor_slices( - tf.random.uniform([batch_size * 10, sequence_length], - minval=0, - maxval=max_tokens - 1, - dtype=tf.int32)) - ds = ds.shuffle(batch_size * 100) - ds = ds.batch(batch_size) - num_batches = 5 - ds = ds.take(num_batches) - ds = ds.prefetch(num_batches) - starts.append(time.time()) - # Benchmarked code begins here. - for i in ds: - _ = layer(i) - # Benchmarked code ends here. 
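The category-encoding benchmark in the next hunk drives a `CategoryEncoding` layer in "count" mode over batches of random ids. A hedged sketch of what that layer computes per sample, written against the current `num_tokens` argument name (the benchmark file itself still passes the older `max_tokens` keyword):

```python
import tensorflow as tf

# "count" mode returns, for each sample, how often each id appears.
layer = tf.keras.layers.CategoryEncoding(num_tokens=3, output_mode="count")
print(layer(tf.constant([[0, 1, 1], [2, 2, 2]])))
# [[1. 2. 0.]
#  [0. 0. 3.]]
```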
- ends.append(time.time()) + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.from_tensor_slices( + tf.random.uniform( + [batch_size * 10, sequence_length], + minval=0, + maxval=max_tokens - 1, + dtype=tf.int32, + ) + ) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + num_batches = 5 + ds = ds.take(num_batches) + ds = ds.prefetch(num_batches) + starts.append(time.time()) + # Benchmarked code begins here. + for i in ds: + _ = layer(i) + # Benchmarked code ends here. + ends.append(time.time()) - avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches - name = "category_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( - batch_size, sequence_length, max_tokens) - self.report_benchmark(iters=num_repeats, wall_time=avg_time, name=name) + avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches + name = "category_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( + batch_size, + sequence_length, + max_tokens, + ) + self.report_benchmark(iters=num_repeats, wall_time=avg_time, name=name) - def benchmark_vocab_size_by_batch(self): - for batch in [32, 256, 2048]: - for sequence_length in [10, 1000]: - for num_tokens in [100, 1000, 20000]: - self.run_dataset_implementation( - output_mode="count", - batch_size=batch, - sequence_length=sequence_length, - max_tokens=num_tokens) + def benchmark_vocab_size_by_batch(self): + for batch in [32, 256, 2048]: + for sequence_length in [10, 1000]: + for num_tokens in [100, 1000, 20000]: + self.run_dataset_implementation( + output_mode="count", + batch_size=batch, + sequence_length=sequence_length, + max_tokens=num_tokens, + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py index 2a50b01dcf2d..f4953cc1842b 100644 --- a/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py @@ -12,64 +12,77 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of categorical hash columns with dense inputs.""" +"""Benchmark for KPL implementation of categorical hash columns with dense +inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import hashing -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. 
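The hashing benchmark below compares the Keras `Hashing` layer against `sequence_categorical_column_with_hash_bucket`. The layer side is stateless, which is the point of the comparison; a sketch assuming the public `tf.keras.layers.Hashing` API, where the argument is `num_bins` (the benchmark passes its `num_buckets` constant positionally):

```python
import tensorflow as tf

# Hashing needs no vocabulary: each string is deterministically hashed
# into one of num_bins buckets.
layer = tf.keras.layers.Hashing(num_bins=10000)
ids = layer(tf.constant([["cat", "dog", "fish"]]))
print(ids.shape)  # (1, 3); values are bucket ids in [0, 10000)
```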
- - num_buckets = 10000 - vocab = fc_bm.create_vocabulary(32768) - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0) - - # Keras implementation - model = keras.Sequential() - model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.string)) - model.add(hashing.Hashing(num_buckets)) - - # FC implementation - fc = tf.feature_column.sequence_categorical_column_with_hash_bucket("data", num_buckets) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = { - "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) - } - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = { - "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) - } - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. + + num_buckets = 10000 + vocab = fc_bm.create_vocabulary(32768) + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0 + ) + + # Keras implementation + model = keras.Sequential() + model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.string)) + model.add(hashing.Hashing(num_buckets)) + + # FC implementation + fc = tf.feature_column.sequence_categorical_column_with_hash_bucket( + "data", num_buckets + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = { + "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) + } + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = { + "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) + } + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "hash|dense|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"hash|dense|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py index 07cd1d463b3b..a43f42a2c013 100644 --- a/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py @@ -12,62 +12,77 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Benchmark for KPL implementation of categorical hash columns with varying-length inputs.""" +"""Benchmark for KPL implementation of categorical hash columns with +varying-length inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import hashing -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. - - num_buckets = 10000 - vocab = fc_bm.create_vocabulary(32768) - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0) - - # Keras implementation - model = keras.Sequential() - model.add( - keras.Input( - shape=(max_length,), name="data", ragged=True, dtype=tf.string)) - model.add(hashing.Hashing(num_buckets)) - - # FC implementation - fc = tf.feature_column.categorical_column_with_hash_bucket("data", num_buckets) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_sparse()} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. 
+ + num_buckets = 10000 + vocab = fc_bm.create_vocabulary(32768) + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0 + ) + + # Keras implementation + model = keras.Sequential() + model.add( + keras.Input( + shape=(max_length,), name="data", ragged=True, dtype=tf.string + ) + ) + model.add(hashing.Hashing(num_buckets)) + + # FC implementation + fc = tf.feature_column.categorical_column_with_hash_bucket( + "data", num_buckets + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = {"data": data} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_sparse()} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "hash|varlen|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"hash|varlen|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_file_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_file_dense_benchmark.py index 26d4adb940ff..ae43734f5699 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_file_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_file_dense_benchmark.py @@ -12,78 +12,98 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of vocabulary columns from files with dense inputs.""" - -import tensorflow.compat.v2 as tf +"""Benchmark for KPL implementation of vocabulary columns from files with dense +inputs.""" import os +import tensorflow.compat.v2 as tf + import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import string_lookup -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] class BenchmarkLayer(tf.test.TestCase, fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - def embedding_varlen(self, batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. 
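The vocabulary-file benchmarks build a `StringLookup` directly from a file path, one token per line, in the format `_write_to_temp_file` produces. A small sketch of that lookup contract (the file path and tokens here are illustrative, not from the benchmark):

```python
import tensorflow as tf

# One vocabulary token per line, matching _write_to_temp_file's format.
with open("/tmp/vocab.txt", "w") as f:
    f.write("apple\nbanana\ncherry\n")

layer = tf.keras.layers.StringLookup(
    vocabulary="/tmp/vocab.txt", mask_token=None
)
print(layer(tf.constant([["banana", "durian"]])))  # [[2 0]]
# Known tokens index from 1; index 0 is the single default OOV bucket.
```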
- vocab = fc_bm.create_vocabulary(32768) - - path = self._write_to_temp_file("tmp", vocab) - - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15) - - # Keras implementation - model = keras.Sequential() - model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.string)) - model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None)) - - # FC implementation - fc = tf.feature_column.categorical_column_with_vocabulary_list( - key="data", vocabulary_list=vocab, num_oov_buckets=1) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = { - "data": data.to_tensor( - default_value="", shape=(batch_size, max_length)) - } - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = { - "data": data.to_tensor( - default_value="", shape=(batch_size, max_length)) - } - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time - - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "vocab_list|dense|batch_%s" % batch - k_time, f_time = self.embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + """Benchmark the layer forward pass.""" + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def embedding_varlen(self, batch_size, max_length): + """Benchmark a variable-length embedding.""" + # Data and constants. 
+ vocab = fc_bm.create_vocabulary(32768) + + path = self._write_to_temp_file("tmp", vocab) + + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15 + ) + + # Keras implementation + model = keras.Sequential() + model.add( + keras.Input(shape=(max_length,), name="data", dtype=tf.string) + ) + model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None)) + + # FC implementation + fc = tf.feature_column.categorical_column_with_vocabulary_list( + key="data", vocabulary_list=vocab, num_oov_buckets=1 + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache( + tensors + ), + None, + ) + + # Benchmark runs + keras_data = { + "data": data.to_tensor( + default_value="", shape=(batch_size, max_length) + ) + } + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = { + "data": data.to_tensor( + default_value="", shape=(batch_size, max_length) + ) + } + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time + + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"vocab_list|dense|batch_{batch}" + k_time, f_time = self.embedding_varlen( + batch_size=batch, max_length=256 + ) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_file_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_file_varlen_benchmark.py index b5e38e0eabb6..26c6f4861ed9 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_file_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_file_varlen_benchmark.py @@ -12,73 +12,91 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of vocabulary columns from files with varying-length inputs.""" - -import tensorflow.compat.v2 as tf +"""Benchmark for KPL implementation of vocabulary columns from files with +varying-length inputs.""" import os +import tensorflow.compat.v2 as tf + import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import string_lookup -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] class BenchmarkLayer(tf.test.TestCase, fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - def embedding_varlen(self, batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. 
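The varying-length variants feed `tf.RaggedTensor` inputs through the same lookup via `keras.Input(ragged=True)`. To the best of my understanding the layer also accepts ragged tensors directly in eager mode; a sketch with illustrative tokens:

```python
import tensorflow as tf

layer = tf.keras.layers.StringLookup(
    vocabulary=["apple", "banana", "cherry"], mask_token=None
)
# Variable-length rows stay ragged through the lookup; OOV maps to 0.
ragged = tf.ragged.constant([["apple"], ["banana", "mango", "cherry"]])
print(layer(ragged))  # <tf.RaggedTensor [[1], [2, 0, 3]]>
```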
- vocab = fc_bm.create_vocabulary(32768) - path = self._write_to_temp_file("tmp", vocab) - - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15) - - # Keras implementation - model = keras.Sequential() - model.add( - keras.Input( - shape=(max_length,), name="data", ragged=True, dtype=tf.string)) - model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None)) - - # FC implementation - fc = tf.feature_column.sequence_categorical_column_with_vocabulary_list( - key="data", vocabulary_list=vocab, num_oov_buckets=1) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_sparse()} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time - - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "vocab_list|varlen|batch_%s" % batch - k_time, f_time = self.embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + """Benchmark the layer forward pass.""" + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def embedding_varlen(self, batch_size, max_length): + """Benchmark a variable-length embedding.""" + # Data and constants. + vocab = fc_bm.create_vocabulary(32768) + path = self._write_to_temp_file("tmp", vocab) + + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15 + ) + + # Keras implementation + model = keras.Sequential() + model.add( + keras.Input( + shape=(max_length,), name="data", ragged=True, dtype=tf.string + ) + ) + model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None)) + + # FC implementation + fc = tf.feature_column.sequence_categorical_column_with_vocabulary_list( + key="data", vocabulary_list=vocab, num_oov_buckets=1 + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache( + tensors + ), + None, + ) + + # Benchmark runs + keras_data = {"data": data} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_sparse()} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time + + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"vocab_list|varlen|batch_{batch}" + k_time, f_time = self.embedding_varlen( + batch_size=batch, max_length=256 + ) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py index a04b30271d69..eb455a8e52bc 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py @@ -12,63 +12,75 @@ # See the License for the specific language governing 
permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of vocabulary columns from lists with dense inputs.""" +"""Benchmark for KPL implementation of vocabulary columns from lists with dense +inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import string_lookup -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. - vocab = fc_bm.create_vocabulary(32768) - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15) - - # Keras implementation - model = keras.Sequential() - model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.string)) - model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) - - # FC implementation - fc = tf.feature_column.categorical_column_with_vocabulary_list( - key="data", vocabulary_list=vocab, num_oov_buckets=1) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = { - "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) - } - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = { - "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) - } - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. 
+ vocab = fc_bm.create_vocabulary(32768) + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15 + ) + + # Keras implementation + model = keras.Sequential() + model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.string)) + model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) + + # FC implementation + fc = tf.feature_column.categorical_column_with_vocabulary_list( + key="data", vocabulary_list=vocab, num_oov_buckets=1 + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = { + "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) + } + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = { + "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) + } + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "vocab_list|dense|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"vocab_list|dense|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py index be23aa79adc8..b2aa0d687a0c 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py @@ -12,69 +12,84 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of vocabulary columns + indicator from lists with dense inputs.""" +"""Benchmark for KPL implementation of vocabulary columns + indicator from lists +with dense inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import category_encoding from keras.layers.preprocessing import string_lookup -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. 
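The indicator benchmarks chain `StringLookup` into `CategoryEncoding` so the KPL side matches `indicator_column` semantics; the `num_tokens=vocab_size + 1` used in these files reserves one extra output slot for the lookup's OOV index 0. A sketch with a toy vocabulary:

```python
import tensorflow as tf

vocab = ["apple", "banana", "cherry"]
lookup = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token=None)
# len(vocab) + 1 slots: the OOV index 0 plus one per vocabulary term.
encode = tf.keras.layers.CategoryEncoding(
    num_tokens=len(vocab) + 1, output_mode="count"
)
print(encode(lookup(tf.constant([["apple", "apple", "mango"]]))))
# [[1. 2. 0. 0.]]  (one OOV hit, two "apple" hits, no "banana"/"cherry")
```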
- vocab_size = 32768 - vocab = fc_bm.create_vocabulary(vocab_size) - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15) - - # Keras implementation - model = keras.Sequential() - model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.string)) - model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) - model.add( - category_encoding.CategoryEncoding( - num_tokens=vocab_size + 1, output_mode="count")) - - # FC implementation - fc = tf.feature_column.indicator_column( - tf.feature_column.categorical_column_with_vocabulary_list( - key="data", vocabulary_list=vocab, num_oov_buckets=1)) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = { - "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) - } - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = { - "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) - } - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. + vocab_size = 32768 + vocab = fc_bm.create_vocabulary(vocab_size) + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15 + ) + + # Keras implementation + model = keras.Sequential() + model.add(keras.Input(shape=(max_length,), name="data", dtype=tf.string)) + model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) + model.add( + category_encoding.CategoryEncoding( + num_tokens=vocab_size + 1, output_mode="count" + ) + ) + + # FC implementation + fc = tf.feature_column.indicator_column( + tf.feature_column.categorical_column_with_vocabulary_list( + key="data", vocabulary_list=vocab, num_oov_buckets=1 + ) + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = { + "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) + } + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = { + "data": data.to_tensor(default_value="", shape=(batch_size, max_length)) + } + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "vocab_list_indicator|dense|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"vocab_list_indicator|dense|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py index cede6b70a912..b46b01ebbb18 100644 --- 
a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py @@ -12,67 +12,84 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of vocabulary columns + indicator from lists with varying-length inputs.""" +"""Benchmark for KPL implementation of vocabulary columns + indicator from lists +with varying-length inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import category_encoding from keras.layers.preprocessing import string_lookup -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. - vocab_size = 32768 - vocab = fc_bm.create_vocabulary(vocab_size) - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15) - - # Keras implementation - model = keras.Sequential() - model.add( - keras.Input( - shape=(max_length,), name="data", ragged=True, dtype=tf.string)) - model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) - model.add( - category_encoding.CategoryEncoding( - num_tokens=vocab_size + 1, output_mode="count")) - - # FC implementation - fc = tf.feature_column.indicator_column( - tf.feature_column.sequence_categorical_column_with_vocabulary_list( - key="data", vocabulary_list=vocab, num_oov_buckets=1)) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_sparse()} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. 
+ vocab_size = 32768 + vocab = fc_bm.create_vocabulary(vocab_size) + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15 + ) + + # Keras implementation + model = keras.Sequential() + model.add( + keras.Input( + shape=(max_length,), name="data", ragged=True, dtype=tf.string + ) + ) + model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) + model.add( + category_encoding.CategoryEncoding( + num_tokens=vocab_size + 1, output_mode="count" + ) + ) + + # FC implementation + fc = tf.feature_column.indicator_column( + tf.feature_column.sequence_categorical_column_with_vocabulary_list( + key="data", vocabulary_list=vocab, num_oov_buckets=1 + ) + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = {"data": data} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_sparse()} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "vocab_list_indicator|varlen|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"vocab_list_indicator|varlen|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py index 85d9a515bd37..6b1455c5ec4a 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py @@ -12,61 +12,75 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of vocabulary columns from lists with varying-length inputs.""" +"""Benchmark for KPL implementation of vocabulary columns from lists with +varying-length inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function from keras.layers.preprocessing import string_lookup -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. 
- vocab = fc_bm.create_vocabulary(32768) - data = fc_bm.create_string_data( - max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15) - - # Keras implementation - model = keras.Sequential() - model.add( - keras.Input( - shape=(max_length,), name="data", ragged=True, dtype=tf.string)) - model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) - - # FC implementation - fc = tf.feature_column.sequence_categorical_column_with_vocabulary_list( - key="data", vocabulary_list=vocab, num_oov_buckets=1) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_sparse()} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. + vocab = fc_bm.create_vocabulary(32768) + data = fc_bm.create_string_data( + max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15 + ) + + # Keras implementation + model = keras.Sequential() + model.add( + keras.Input( + shape=(max_length,), name="data", ragged=True, dtype=tf.string + ) + ) + model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None)) + + # FC implementation + fc = tf.feature_column.sequence_categorical_column_with_vocabulary_list( + key="data", vocabulary_list=vocab, num_oov_buckets=1 + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = {"data": data} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_sparse()} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "vocab_list|varlen|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"vocab_list|varlen|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/discretization_adapt_benchmark.py b/keras/layers/preprocessing/benchmarks/discretization_adapt_benchmark.py index 4f5ba20c2517..86af3a6583e0 100644 --- a/keras/layers/preprocessing/benchmarks/discretization_adapt_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/discretization_adapt_benchmark.py @@ -14,11 +14,10 @@ # ============================================================================== """Benchmark for Keras discretization preprocessing layer's adapt method.""" -import tensorflow.compat.v2 as tf - import time import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.layers.preprocessing import discretization @@ -27,82 +26,83 @@ def reduce_fn(state, values, epsilon=EPSILON): - """tf.data.Dataset-friendly implementation of mean and 
variance.""" + """tf.data.Dataset-friendly implementation of mean and variance.""" - state_, = state - summary = discretization.summarize(values, epsilon) - if np.sum(state_[:, 0]) == 0: - return (summary,) - return (discretization.merge_summaries(state_, summary, epsilon),) + (state_,) = state + summary = discretization.summarize(values, epsilon) + if np.sum(state_[:, 0]) == 0: + return (summary,) + return (discretization.merge_summaries(state_, summary, epsilon),) class BenchmarkAdapt(tf.test.Benchmark): - """Benchmark adapt.""" - - def run_dataset_implementation(self, num_elements, batch_size): - input_t = keras.Input(shape=(1,)) - layer = discretization.Discretization() - _ = layer(input_t) - - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.range(num_elements) - ds = ds.map( - lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) - ds = ds.batch(batch_size) - - starts.append(time.time()) - # Benchmarked code begins here. - state = ds.reduce((np.zeros((1, 2)),), reduce_fn) - - bins = discretization.get_bucket_boundaries(state, 100) - layer.set_weights([bins]) - # Benchmarked code ends here. - ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) - return avg_time - - def bm_adapt_implementation(self, num_elements, batch_size): - """Test the KPL adapt implementation.""" - input_t = keras.Input(shape=(1,), dtype=tf.float32) - layer = discretization.Discretization() - _ = layer(input_t) - - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.range(num_elements) - ds = ds.map( - lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) - ds = ds.batch(batch_size) - - starts.append(time.time()) - # Benchmarked code begins here. - layer.adapt(ds) - # Benchmarked code ends here. - ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) - name = "discretization_adapt|%s_elements|batch_%s" % (num_elements, - batch_size) - baseline = self.run_dataset_implementation(num_elements, batch_size) - extras = { - "tf.data implementation baseline": baseline, - "delta seconds": (baseline - avg_time), - "delta percent": ((baseline - avg_time) / baseline) * 100 - } - self.report_benchmark( - iters=num_repeats, wall_time=avg_time, extras=extras, name=name) - - def benchmark_vocab_size_by_batch(self): - for vocab_size in [100, 1000, 10000, 100000, 1000000]: - for batch in [64 * 2048]: - self.bm_adapt_implementation(vocab_size, batch) + """Benchmark adapt.""" + + def run_dataset_implementation(self, num_elements, batch_size): + input_t = keras.Input(shape=(1,)) + layer = discretization.Discretization() + _ = layer(input_t) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.range(num_elements) + ds = ds.map(lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) + ds = ds.batch(batch_size) + + starts.append(time.time()) + # Benchmarked code begins here. + state = ds.reduce((np.zeros((1, 2)),), reduce_fn) + + bins = discretization.get_bucket_boundaries(state, 100) + layer.set_weights([bins]) + # Benchmarked code ends here. 
+ ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) + return avg_time + + def bm_adapt_implementation(self, num_elements, batch_size): + """Test the KPL adapt implementation.""" + input_t = keras.Input(shape=(1,), dtype=tf.float32) + layer = discretization.Discretization() + _ = layer(input_t) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.range(num_elements) + ds = ds.map(lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) + ds = ds.batch(batch_size) + + starts.append(time.time()) + # Benchmarked code begins here. + layer.adapt(ds) + # Benchmarked code ends here. + ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) + name = "discretization_adapt|%s_elements|batch_%s" % ( + num_elements, + batch_size, + ) + baseline = self.run_dataset_implementation(num_elements, batch_size) + extras = { + "tf.data implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100, + } + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name + ) + + def benchmark_vocab_size_by_batch(self): + for vocab_size in [100, 1000, 10000, 100000, 1000000]: + for batch in [64 * 2048]: + self.bm_adapt_implementation(vocab_size, batch) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py index 9f8a70e80d9a..bbe64c2c8d8e 100644 --- a/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py @@ -17,8 +17,14 @@ import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] @@ -26,48 +32,54 @@ ### KPL AND FC IMPLEMENTATION BENCHMARKS ### def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. - embedding_size = 32768 - data = fc_bm.create_data( - max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int) - - # Keras implementation - model = keras.Sequential() - model.add(keras.Input(shape=(None,), name="data", dtype=tf.int64)) - model.add(keras.layers.Embedding(embedding_size, 256)) - model.add(keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=-1))) - - # FC implementation - fc = tf.feature_column.embedding_column( - tf.feature_column.categorical_column_with_identity( - "data", num_buckets=embedding_size - 1), - dimension=256) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data.to_tensor(default_value=0)} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_tensor(default_value=0)} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. 
+ embedding_size = 32768 + data = fc_bm.create_data( + max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int + ) + + # Keras implementation + model = keras.Sequential() + model.add(keras.Input(shape=(None,), name="data", dtype=tf.int64)) + model.add(keras.layers.Embedding(embedding_size, 256)) + model.add(keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=-1))) + + # FC implementation + fc = tf.feature_column.embedding_column( + tf.feature_column.categorical_column_with_identity( + "data", num_buckets=embedding_size - 1 + ), + dimension=256, + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = {"data": data.to_tensor(default_value=0)} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_tensor(default_value=0)} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "embedding|dense|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"embedding|dense|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py index c1538a4c9c81..f7ddbcc3a571 100644 --- a/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py @@ -12,13 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for KPL implementation of embedding column with varying-length inputs.""" +"""Benchmark for KPL implementation of embedding column with varying-length +inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] @@ -26,49 +33,56 @@ ### KPL AND FC IMPLEMENTATION BENCHMARKS ### def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. 
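For the embedding benchmarks, the shape contract is the interesting part: `Embedding` turns each integer id into a 256-dimensional vector, and the `Lambda(tf.reduce_mean(..., axis=-1))` in these models then averages over that embedding axis, collapsing the output back to `(batch, sequence)`. A sketch of the dense case (the varlen variant feeds a `tf.RaggedTensor` via `keras.Input(ragged=True)` instead):

```python
import tensorflow as tf

emb = tf.keras.layers.Embedding(input_dim=32768, output_dim=256)
ids = tf.constant([[1, 2, 3], [4, 5, 6]])  # (batch=2, seq=3)
vectors = emb(ids)                         # (2, 3, 256)
pooled = tf.reduce_mean(vectors, axis=-1)  # (2, 3)
print(vectors.shape, pooled.shape)
```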
- embedding_size = 32768 - data = fc_bm.create_data( - max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int) - - # Keras implementation - model = keras.Sequential() - model.add( - keras.Input(shape=(None,), ragged=True, name="data", dtype=tf.int64)) - model.add(keras.layers.Embedding(embedding_size, 256)) - model.add(keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=-1))) - - # FC implementation - fc = tf.feature_column.embedding_column( - tf.feature_column.categorical_column_with_identity( - "data", num_buckets=embedding_size - 1), - dimension=256) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_sparse()} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. + embedding_size = 32768 + data = fc_bm.create_data( + max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int + ) + + # Keras implementation + model = keras.Sequential() + model.add( + keras.Input(shape=(None,), ragged=True, name="data", dtype=tf.int64) + ) + model.add(keras.layers.Embedding(embedding_size, 256)) + model.add(keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=-1))) + + # FC implementation + fc = tf.feature_column.embedding_column( + tf.feature_column.categorical_column_with_identity( + "data", num_buckets=embedding_size - 1 + ), + dimension=256, + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = {"data": data} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_sparse()} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "embedding|varlen|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"embedding|varlen|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/feature_column_benchmark.py b/keras/layers/preprocessing/benchmarks/feature_column_benchmark.py index 572e6c823786..cb14279fc2dc 100644 --- a/keras/layers/preprocessing/benchmarks/feature_column_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/feature_column_benchmark.py @@ -14,7 +14,6 @@ # ============================================================================== """Benchmark suite for KPL and feature column implementations.""" -import tensorflow.compat.v2 as tf import itertools import math import random @@ -22,123 +21,134 @@ import time import numpy as np +import tensorflow.compat.v2 as tf import keras class LayerBenchmark(tf.test.Benchmark): - 
"""Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def report(self, name, keras_time, fc_time, iters): - """Calculate and report benchmark statistics.""" - extras = { - "fc_avg_time": fc_time, - "fc_vs_keras_sec": fc_time - keras_time, - "fc_vs_keras_pct": ((fc_time - keras_time) / fc_time) * 100, - "keras_faster_ratio": fc_time / keras_time - } - self.report_benchmark( - iters=iters, wall_time=keras_time, extras=extras, name=name) + def report(self, name, keras_time, fc_time, iters): + """Calculate and report benchmark statistics.""" + extras = { + "fc_avg_time": fc_time, + "fc_vs_keras_sec": fc_time - keras_time, + "fc_vs_keras_pct": ((fc_time - keras_time) / fc_time) * 100, + "keras_faster_ratio": fc_time / keras_time, + } + self.report_benchmark( + iters=iters, wall_time=keras_time, extras=extras, name=name + ) class StepTimingCallback(keras.callbacks.Callback): - """A callback that times non-warmup steps of a Keras predict call.""" + """A callback that times non-warmup steps of a Keras predict call.""" - def __init__(self): - self.t0 = None - self.steps = 0 + def __init__(self): + self.t0 = None + self.steps = 0 - def on_predict_batch_begin(self, batch_index, _): - if batch_index == 2: - self.t0 = time.time() - elif batch_index > 2: - self.steps += 1 + def on_predict_batch_begin(self, batch_index, _): + if batch_index == 2: + self.t0 = time.time() + elif batch_index > 2: + self.steps += 1 - def on_predict_end(self, _): - self.tn = time.time() - self.t_avg = (self.tn - self.t0) / self.steps + def on_predict_end(self, _): + self.tn = time.time() + self.t_avg = (self.tn - self.t0) / self.steps def create_data(length, num_entries, max_value, dtype): - """Create a ragged tensor with random data entries.""" - lengths = (np.random.random(size=num_entries) * length).astype(int) - total_length = np.sum(lengths) - values = (np.random.random(size=total_length) * max_value).astype(dtype) - return tf.RaggedTensor.from_row_lengths(values, lengths) - - -def create_string_data(length, - num_entries, - vocabulary, - pct_oov, - oov_string="__OOV__"): - """Create a ragged tensor with random data entries.""" - lengths = (np.random.random(size=num_entries) * length).astype(int) - total_length = np.sum(lengths) - num_oovs = int(pct_oov * total_length) - values = [] - for _ in range(total_length): - values.append(random.choice(vocabulary)) - - if pct_oov > 0: - oov_cadence = int(total_length / num_oovs) - idx = 0 - for _ in range(num_oovs): - if idx < total_length: - values[idx] = oov_string - idx += oov_cadence - - return tf.RaggedTensor.from_row_lengths(values, lengths) + """Create a ragged tensor with random data entries.""" + lengths = (np.random.random(size=num_entries) * length).astype(int) + total_length = np.sum(lengths) + values = (np.random.random(size=total_length) * max_value).astype(dtype) + return tf.RaggedTensor.from_row_lengths(values, lengths) + + +def create_string_data( + length, num_entries, vocabulary, pct_oov, oov_string="__OOV__" +): + """Create a ragged tensor with random data entries.""" + lengths = (np.random.random(size=num_entries) * length).astype(int) + total_length = np.sum(lengths) + num_oovs = int(pct_oov * total_length) + values = [] + for _ in range(total_length): + values.append(random.choice(vocabulary)) + + if pct_oov > 0: + oov_cadence = int(total_length / num_oovs) + idx = 0 + for _ in range(num_oovs): + if idx < total_length: + values[idx] = oov_string + idx += oov_cadence + + return tf.RaggedTensor.from_row_lengths(values, lengths) 
def create_vocabulary(vocab_size): - base = len(string.ascii_letters) - n = math.ceil(math.log(vocab_size, base)) - vocab = [] - for i in range(1, n + 1): - for item in itertools.product(string.ascii_letters, repeat=i): - if len(vocab) >= vocab_size: - break - vocab.append("".join(item)) - return vocab + base = len(string.ascii_letters) + n = math.ceil(math.log(vocab_size, base)) + vocab = [] + for i in range(1, n + 1): + for item in itertools.product(string.ascii_letters, repeat=i): + if len(vocab) >= vocab_size: + break + vocab.append("".join(item)) + return vocab def run_keras(data, model, batch_size, num_runs, steps_per_repeat=100): - """Benchmark a Keras model.""" - ds = tf.data.Dataset.from_tensor_slices(data).repeat().prefetch( - tf.data.AUTOTUNE).batch(batch_size).cache() - steps = 0 - times = [] - for _ in range(num_runs): - steps += steps_per_repeat - timer = StepTimingCallback() - # Benchmarked code begins here. - model.predict(ds, steps=steps, callbacks=[timer]) - # Benchmarked code ends here. - times.append(timer.t_avg) - avg_time = np.mean(times) - return avg_time + """Benchmark a Keras model.""" + ds = ( + tf.data.Dataset.from_tensor_slices(data) + .repeat() + .prefetch(tf.data.AUTOTUNE) + .batch(batch_size) + .cache() + ) + steps = 0 + times = [] + for _ in range(num_runs): + steps += steps_per_repeat + timer = StepTimingCallback() + # Benchmarked code begins here. + model.predict(ds, steps=steps, callbacks=[timer]) + # Benchmarked code ends here. + times.append(timer.t_avg) + avg_time = np.mean(times) + return avg_time def run_fc(data, fc_fn, batch_size, num_runs, steps_per_repeat=100): - """Benchmark a Feature Column.""" - - ds = tf.data.Dataset.from_tensor_slices(data).repeat().prefetch( - tf.data.AUTOTUNE).batch(batch_size).cache() - - # Trace the fc_fn - ds_iter = ds.__iter__() - fc_fn(next(ds_iter)) - fc_starts = [] - fc_ends = [] - for _ in range(num_runs): - fc_starts.append(time.time()) - # Benchmarked code begins here. - for _ in range(steps_per_repeat): - _ = fc_fn(next(ds_iter)) - # Benchmarked code ends here. - fc_ends.append(time.time()) - avg_per_step_time = (np.array(fc_ends) - - np.array(fc_starts)) / steps_per_repeat - avg_time = np.mean(avg_per_step_time) - return avg_time + """Benchmark a Feature Column.""" + + ds = ( + tf.data.Dataset.from_tensor_slices(data) + .repeat() + .prefetch(tf.data.AUTOTUNE) + .batch(batch_size) + .cache() + ) + + # Trace the fc_fn + ds_iter = ds.__iter__() + fc_fn(next(ds_iter)) + fc_starts = [] + fc_ends = [] + for _ in range(num_runs): + fc_starts.append(time.time()) + # Benchmarked code begins here. + for _ in range(steps_per_repeat): + _ = fc_fn(next(ds_iter)) + # Benchmarked code ends here. + fc_ends.append(time.time()) + avg_per_step_time = ( + np.array(fc_ends) - np.array(fc_starts) + ) / steps_per_repeat + avg_time = np.mean(avg_per_step_time) + return avg_time diff --git a/keras/layers/preprocessing/benchmarks/hashed_crossing_benchmark.py b/keras/layers/preprocessing/benchmarks/hashed_crossing_benchmark.py index 3dd74662fc84..9b0fad90f2c0 100644 --- a/keras/layers/preprocessing/benchmarks/hashed_crossing_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/hashed_crossing_benchmark.py @@ -12,69 +12,78 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Benchmark for KPL implementation of categorical cross hash columns with dense inputs.""" +"""Benchmark for KPL implementation of categorical cross hash columns with dense +inputs.""" +import tensorflow.compat.v2 as tf + import keras from keras.layers.preprocessing import hashed_crossing -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm -import tensorflow.compat.v2 as tf -from tensorflow.python.eager.def_function import function as tf_function +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] def embedding_varlen(batch_size): - """Benchmark a variable-length embedding.""" - # Data and constants. - num_buckets = 10000 - data_a = tf.random.uniform(shape=(batch_size * NUM_REPEATS, 1), - maxval=32768, - dtype=tf.int64) - data_b = tf.strings.as_string(data_a) - - # Keras implementation - input_1 = keras.Input(shape=(1,), name="data_a", dtype=tf.int64) - input_2 = keras.Input(shape=(1,), name="data_b", dtype=tf.string) - outputs = hashed_crossing.HashedCrossing(num_buckets)([input_1, input_2]) - model = keras.Model([input_1, input_2], outputs) - - # FC implementation - fc = tf.feature_column.crossed_column(["data_a", "data_b"], num_buckets) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature( - tf.__internal__.feature_column.FeatureTransformationCache(tensors), - None) - - # Benchmark runs - keras_data = { - "data_a": data_a, - "data_b": data_b, - } - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = { - "data_a": data_a, - "data_b": data_b, - } - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. 
+ num_buckets = 10000 + data_a = tf.random.uniform( + shape=(batch_size * NUM_REPEATS, 1), maxval=32768, dtype=tf.int64 + ) + data_b = tf.strings.as_string(data_a) + + # Keras implementation + input_1 = keras.Input(shape=(1,), name="data_a", dtype=tf.int64) + input_2 = keras.Input(shape=(1,), name="data_b", dtype=tf.string) + outputs = hashed_crossing.HashedCrossing(num_buckets)([input_1, input_2]) + model = keras.Model([input_1, input_2], outputs) + + # FC implementation + fc = tf.feature_column.crossed_column(["data_a", "data_b"], num_buckets) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = { + "data_a": data_a, + "data_b": data_b, + } + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = { + "data_a": data_a, + "data_b": data_b, + } + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "hashed_cross|dense|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"hashed_cross|dense|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/hashing_benchmark.py b/keras/layers/preprocessing/benchmarks/hashing_benchmark.py index 0bd10f4eed64..0d0d5b0f8a86 100644 --- a/keras/layers/preprocessing/benchmarks/hashing_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/hashing_benchmark.py @@ -14,14 +14,13 @@ # ============================================================================== """Benchmark for Keras hashing preprocessing layer.""" -import tensorflow.compat.v2 as tf - import itertools import random import string import time import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.layers.preprocessing import hashing @@ -30,73 +29,76 @@ # word_gen creates random sequences of ASCII letters (both lowercase and upper). # The number of unique strings is ~2,700. def word_gen(): - for _ in itertools.count(1): - yield "".join(random.choice(string.ascii_letters) for i in range(2)) + for _ in itertools.count(1): + yield "".join(random.choice(string.ascii_letters) for i in range(2)) class BenchmarkLayer(tf.test.Benchmark): - """Benchmark the layer forward pass.""" - - def run_dataset_implementation(self, batch_size): - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.from_generator(word_gen, tf.string, - tf.TensorShape([])) - ds = ds.shuffle(batch_size * 100) - ds = ds.batch(batch_size) - num_batches = 5 - ds = ds.take(num_batches) - ds = ds.prefetch(num_batches) - starts.append(time.time()) - # Benchmarked code begins here. - for i in ds: - _ = tf.strings.to_hash_bucket(i, num_buckets=2) - # Benchmarked code ends here. 
- ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches - return avg_time - - def bm_layer_implementation(self, batch_size): - input_1 = keras.Input(shape=(None,), dtype=tf.string, name="word") - layer = hashing.Hashing(num_bins=2) - _ = layer(input_1) - - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.from_generator(word_gen, tf.string, - tf.TensorShape([])) - ds = ds.shuffle(batch_size * 100) - ds = ds.batch(batch_size) - num_batches = 5 - ds = ds.take(num_batches) - ds = ds.prefetch(num_batches) - starts.append(time.time()) - # Benchmarked code begins here. - for i in ds: - _ = layer(i) - # Benchmarked code ends here. - ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches - name = "hashing|batch_%s" % batch_size - baseline = self.run_dataset_implementation(batch_size) - extras = { - "dataset implementation baseline": baseline, - "delta seconds": (baseline - avg_time), - "delta percent": ((baseline - avg_time) / baseline) * 100 - } - self.report_benchmark( - iters=num_repeats, wall_time=avg_time, extras=extras, name=name) - - def benchmark_vocab_size_by_batch(self): - for batch in [32, 64, 256]: - self.bm_layer_implementation(batch_size=batch) + """Benchmark the layer forward pass.""" + + def run_dataset_implementation(self, batch_size): + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.from_generator( + word_gen, tf.string, tf.TensorShape([]) + ) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + num_batches = 5 + ds = ds.take(num_batches) + ds = ds.prefetch(num_batches) + starts.append(time.time()) + # Benchmarked code begins here. + for i in ds: + _ = tf.strings.to_hash_bucket(i, num_buckets=2) + # Benchmarked code ends here. + ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches + return avg_time + + def bm_layer_implementation(self, batch_size): + input_1 = keras.Input(shape=(None,), dtype=tf.string, name="word") + layer = hashing.Hashing(num_bins=2) + _ = layer(input_1) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.from_generator( + word_gen, tf.string, tf.TensorShape([]) + ) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + num_batches = 5 + ds = ds.take(num_batches) + ds = ds.prefetch(num_batches) + starts.append(time.time()) + # Benchmarked code begins here. + for i in ds: + _ = layer(i) + # Benchmarked code ends here. 
+ ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches + name = f"hashing|batch_{batch_size}" + baseline = self.run_dataset_implementation(batch_size) + extras = { + "dataset implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100, + } + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name + ) + + def benchmark_vocab_size_by_batch(self): + for batch in [32, 64, 256]: + self.bm_layer_implementation(batch_size=batch) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/image_preproc_benchmark.py b/keras/layers/preprocessing/benchmarks/image_preproc_benchmark.py index 9fc4eac16ecb..895232f22a85 100644 --- a/keras/layers/preprocessing/benchmarks/image_preproc_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/image_preproc_benchmark.py @@ -14,134 +14,145 @@ # ============================================================================== """Benchmark for Keras image preprocessing layer.""" -import tensorflow.compat.v2 as tf - import functools import time import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.layers.preprocessing import image_preprocessing -LOWER = .2 -UPPER = .4 +LOWER = 0.2 +UPPER = 0.4 BATCH_SIZE = 32 def rotate(inputs): - """rotate image.""" - inputs_shape = tf.shape(inputs) - batch_size = inputs_shape[0] - img_hd = tf.cast(inputs_shape[1], tf.float32) - img_wd = tf.cast(inputs_shape[2], tf.float32) - min_angle = LOWER * 2. * np.pi - max_angle = UPPER * 2. * np.pi - angles = tf.random.uniform( - shape=[batch_size], minval=min_angle, maxval=max_angle) - return image_preprocessing.transform( - inputs, image_preprocessing.get_rotation_matrix(angles, img_hd, img_wd)) + """rotate image.""" + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + img_hd = tf.cast(inputs_shape[1], tf.float32) + img_wd = tf.cast(inputs_shape[2], tf.float32) + min_angle = LOWER * 2.0 * np.pi + max_angle = UPPER * 2.0 * np.pi + angles = tf.random.uniform( + shape=[batch_size], minval=min_angle, maxval=max_angle + ) + return image_preprocessing.transform( + inputs, image_preprocessing.get_rotation_matrix(angles, img_hd, img_wd) + ) def zoom(inputs): - """zoom image.""" - inputs_shape = tf.shape(inputs) - batch_size = inputs_shape[0] - img_hd = tf.cast(inputs_shape[1], tf.float32) - img_wd = tf.cast(inputs_shape[2], tf.float32) - height_zoom = tf.random.uniform( - shape=[batch_size, 1], minval=1. + LOWER, maxval=1. + UPPER) - width_zoom = tf.random.uniform( - shape=[batch_size, 1], minval=1. + LOWER, maxval=1. 
+ UPPER) - zooms = tf.cast( - tf.concat([width_zoom, height_zoom], axis=1), dtype=tf.float32) - return image_preprocessing.transform( - inputs, image_preprocessing.get_zoom_matrix(zooms, img_hd, img_wd)) + """zoom image.""" + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + img_hd = tf.cast(inputs_shape[1], tf.float32) + img_wd = tf.cast(inputs_shape[2], tf.float32) + height_zoom = tf.random.uniform( + shape=[batch_size, 1], minval=1.0 + LOWER, maxval=1.0 + UPPER + ) + width_zoom = tf.random.uniform( + shape=[batch_size, 1], minval=1.0 + LOWER, maxval=1.0 + UPPER + ) + zooms = tf.cast( + tf.concat([width_zoom, height_zoom], axis=1), dtype=tf.float32 + ) + return image_preprocessing.transform( + inputs, image_preprocessing.get_zoom_matrix(zooms, img_hd, img_wd) + ) def image_augmentation(inputs, batch_size): - """image augmentation.""" - img = inputs - img = tf.image.resize(img, size=[224, 224]) - img = tf.image.random_crop(img, size=[batch_size, 224, 224, 3]) - img = rotate(img) - img = zoom(img) - return img + """image augmentation.""" + img = inputs + img = tf.image.resize(img, size=[224, 224]) + img = tf.image.random_crop(img, size=[batch_size, 224, 224, 3]) + img = rotate(img) + img = zoom(img) + return img class BenchmarkLayer(tf.test.Benchmark): - """Benchmark the layer forward pass.""" - - def run_dataset_implementation(self, batch_size): - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.from_tensor_slices( - np.random.random((batch_size, 256, 256, 3))) - ds = ds.shuffle(batch_size * 100) - ds = ds.batch(batch_size) - ds = ds.prefetch(batch_size) - img_augmentation = functools.partial( - image_augmentation, batch_size=batch_size) - ds = ds.map(img_augmentation, num_parallel_calls=8) - starts.append(time.time()) - count = 0 - # Benchmarked code begins here. - for i in ds: - _ = i - count += 1 - # Benchmarked code ends here. - ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) / count - return avg_time - - def bm_layer_implementation(self, batch_size): - with tf.device("/gpu:0"): - img = keras.Input(shape=(256, 256, 3), dtype=tf.float32) - preprocessor = keras.Sequential([ - image_preprocessing.Resizing(224, 224), - image_preprocessing.RandomCrop(height=224, width=224), - image_preprocessing.RandomRotation(factor=(.2, .4)), - image_preprocessing.RandomFlip(mode="horizontal"), - image_preprocessing.RandomZoom(.2, .2) - ]) - _ = preprocessor(img) - - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.from_tensor_slices( - np.random.random((batch_size, 256, 256, 3))) - ds = ds.shuffle(batch_size * 100) - ds = ds.batch(batch_size) - ds = ds.prefetch(batch_size) - starts.append(time.time()) - count = 0 - # Benchmarked code begins here. - for i in ds: - _ = preprocessor(i) - count += 1 - # Benchmarked code ends here. 
- ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) / count - name = "image_preprocessing|batch_%s" % batch_size - baseline = self.run_dataset_implementation(batch_size) - extras = { - "dataset implementation baseline": baseline, - "delta seconds": (baseline - avg_time), - "delta percent": ((baseline - avg_time) / baseline) * 100 - } - self.report_benchmark( - iters=num_repeats, wall_time=avg_time, extras=extras, name=name) - - def benchmark_vocab_size_by_batch(self): - for batch in [32, 64, 256]: - self.bm_layer_implementation(batch_size=batch) + """Benchmark the layer forward pass.""" + + def run_dataset_implementation(self, batch_size): + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.from_tensor_slices( + np.random.random((batch_size, 256, 256, 3)) + ) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + ds = ds.prefetch(batch_size) + img_augmentation = functools.partial( + image_augmentation, batch_size=batch_size + ) + ds = ds.map(img_augmentation, num_parallel_calls=8) + starts.append(time.time()) + count = 0 + # Benchmarked code begins here. + for i in ds: + _ = i + count += 1 + # Benchmarked code ends here. + ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / count + return avg_time + + def bm_layer_implementation(self, batch_size): + with tf.device("/gpu:0"): + img = keras.Input(shape=(256, 256, 3), dtype=tf.float32) + preprocessor = keras.Sequential( + [ + image_preprocessing.Resizing(224, 224), + image_preprocessing.RandomCrop(height=224, width=224), + image_preprocessing.RandomRotation(factor=(0.2, 0.4)), + image_preprocessing.RandomFlip(mode="horizontal"), + image_preprocessing.RandomZoom(0.2, 0.2), + ] + ) + _ = preprocessor(img) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.from_tensor_slices( + np.random.random((batch_size, 256, 256, 3)) + ) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + ds = ds.prefetch(batch_size) + starts.append(time.time()) + count = 0 + # Benchmarked code begins here. + for i in ds: + _ = preprocessor(i) + count += 1 + # Benchmarked code ends here. 
+ ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / count + name = f"image_preprocessing|batch_{batch_size}" + baseline = self.run_dataset_implementation(batch_size) + extras = { + "dataset implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100, + } + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name + ) + + def benchmark_vocab_size_by_batch(self): + for batch in [32, 64, 256]: + self.bm_layer_implementation(batch_size=batch) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/index_lookup_adapt_benchmark.py b/keras/layers/preprocessing/benchmarks/index_lookup_adapt_benchmark.py index 85493722cb59..589f9ab2dea7 100644 --- a/keras/layers/preprocessing/benchmarks/index_lookup_adapt_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/index_lookup_adapt_benchmark.py @@ -14,8 +14,6 @@ # ============================================================================== """Benchmark for Keras text vectorization preprocessing layer's adapt method.""" -import tensorflow.compat.v2 as tf - import collections import itertools import random @@ -23,6 +21,7 @@ import time import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.layers.preprocessing import index_lookup @@ -33,90 +32,102 @@ # word_gen creates random sequences of ASCII letters (both lowercase and upper). # The number of unique strings is ~2,700. def word_gen(): - for _ in itertools.count(1): - yield "".join(random.choice(string.ascii_letters) for i in range(2)) + for _ in itertools.count(1): + yield "".join(random.choice(string.ascii_letters) for i in range(2)) def get_top_k(dataset, k): - """Python implementation of vocabulary building using a defaultdict.""" - counts = collections.defaultdict(int) - for tensor in dataset: - data = tensor.numpy() - for element in data: - counts[element] += 1 - sorted_vocab = [ - k for k, _ in sorted( - counts.items(), key=lambda item: item[1], reverse=True) - ] - if len(sorted_vocab) > k: - sorted_vocab = sorted_vocab[:k] - return sorted_vocab + """Python implementation of vocabulary building using a defaultdict.""" + counts = collections.defaultdict(int) + for tensor in dataset: + data = tensor.numpy() + for element in data: + counts[element] += 1 + sorted_vocab = [ + k + for k, _ in sorted( + counts.items(), key=lambda item: item[1], reverse=True + ) + ] + if len(sorted_vocab) > k: + sorted_vocab = sorted_vocab[:k] + return sorted_vocab class BenchmarkAdapt(tf.test.Benchmark): - """Benchmark adapt.""" - - def run_numpy_implementation(self, num_elements, batch_size, k): - """Test the python implementation.""" - ds = tf.data.Dataset.from_generator(word_gen, tf.string, - tf.TensorShape([])) - batched_ds = ds.take(num_elements).batch(batch_size) - input_t = keras.Input(shape=(), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=k, - num_oov_indices=0, - mask_token=None, - oov_token="OOV", - dtype=tf.string) - _ = layer(input_t) - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - starts.append(time.time()) - vocab = get_top_k(batched_ds, k) - layer.set_vocabulary(vocab) - ends.append(time.time()) - avg_time = np.mean(np.array(ends) - np.array(starts)) - return avg_time - - def bm_adapt_implementation(self, num_elements, batch_size, k): - """Test the KPL adapt implementation.""" - ds = tf.data.Dataset.from_generator(word_gen, 
tf.string, - tf.TensorShape([])) - batched_ds = ds.take(num_elements).batch(batch_size) - input_t = keras.Input(shape=(), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=k, - num_oov_indices=0, - mask_token=None, - oov_token="OOV", - dtype=tf.string) - _ = layer(input_t) - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - starts.append(time.time()) - layer.adapt(batched_ds) - ends.append(time.time()) - avg_time = np.mean(np.array(ends) - np.array(starts)) - name = "index_lookup_adapt|%s_elements|vocab_size_%s|batch_%s" % ( - num_elements, k, batch_size) - baseline = self.run_numpy_implementation(num_elements, batch_size, k) - extras = { - "numpy implementation baseline": baseline, - "delta seconds": (baseline - avg_time), - "delta percent": ((baseline - avg_time) / baseline) * 100 - } - self.report_benchmark( - iters=num_repeats, wall_time=avg_time, extras=extras, name=name) - - def benchmark_vocab_size_by_batch(self): - for vocab_size in [100, 1000, 10000, 100000, 1000000]: - for batch in [1, 16, 2048]: - self.bm_adapt_implementation(vocab_size, batch, int(vocab_size / 10)) + """Benchmark adapt.""" + + def run_numpy_implementation(self, num_elements, batch_size, k): + """Test the python implementation.""" + ds = tf.data.Dataset.from_generator( + word_gen, tf.string, tf.TensorShape([]) + ) + batched_ds = ds.take(num_elements).batch(batch_size) + input_t = keras.Input(shape=(), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=k, + num_oov_indices=0, + mask_token=None, + oov_token="OOV", + dtype=tf.string, + ) + _ = layer(input_t) + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + starts.append(time.time()) + vocab = get_top_k(batched_ds, k) + layer.set_vocabulary(vocab) + ends.append(time.time()) + avg_time = np.mean(np.array(ends) - np.array(starts)) + return avg_time + + def bm_adapt_implementation(self, num_elements, batch_size, k): + """Test the KPL adapt implementation.""" + ds = tf.data.Dataset.from_generator( + word_gen, tf.string, tf.TensorShape([]) + ) + batched_ds = ds.take(num_elements).batch(batch_size) + input_t = keras.Input(shape=(), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=k, + num_oov_indices=0, + mask_token=None, + oov_token="OOV", + dtype=tf.string, + ) + _ = layer(input_t) + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + starts.append(time.time()) + layer.adapt(batched_ds) + ends.append(time.time()) + avg_time = np.mean(np.array(ends) - np.array(starts)) + name = "index_lookup_adapt|%s_elements|vocab_size_%s|batch_%s" % ( + num_elements, + k, + batch_size, + ) + baseline = self.run_numpy_implementation(num_elements, batch_size, k) + extras = { + "numpy implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100, + } + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name + ) + + def benchmark_vocab_size_by_batch(self): + for vocab_size in [100, 1000, 10000, 100000, 1000000]: + for batch in [1, 16, 2048]: + self.bm_adapt_implementation( + vocab_size, batch, int(vocab_size / 10) + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/index_lookup_forward_benchmark.py b/keras/layers/preprocessing/benchmarks/index_lookup_forward_benchmark.py index d7f6868ddbdb..659d65569403 100644 --- a/keras/layers/preprocessing/benchmarks/index_lookup_forward_benchmark.py +++ 
b/keras/layers/preprocessing/benchmarks/index_lookup_forward_benchmark.py @@ -14,14 +14,13 @@ # ============================================================================== """Benchmark for Keras text vectorization preprocessing layer's adapt method.""" -import tensorflow.compat.v2 as tf - import os import random import string import time import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.layers.preprocessing import index_lookup @@ -30,107 +29,114 @@ # word_gen creates random sequences of ASCII letters (both lowercase and upper). # The number of unique strings is ~2,700. def tensor_gen(batch, num_elements): - data = [] - for _ in range(batch): - batch_element = [] - for _ in range(num_elements - 1): - tok = "".join(random.choice(string.ascii_letters) for i in range(2)) - batch_element.append(tok) - batch_element.append("") # Explicitly test the empty string. - data.append(batch_element) - return tf.constant(data) + data = [] + for _ in range(batch): + batch_element = [] + for _ in range(num_elements - 1): + tok = "".join(random.choice(string.ascii_letters) for i in range(2)) + batch_element.append(tok) + batch_element.append("") # Explicitly test the empty string. + data.append(batch_element) + return tf.constant(data) def get_vocab(): - vocab = list( - set([a + b for a in string.ascii_letters for b in string.ascii_letters])) # pylint:disable=g-complex-comprehension - vocab.sort() - return vocab + vocab = list( + set([a + b for a in string.ascii_letters for b in string.ascii_letters]) + ) + vocab.sort() + return vocab # This class uses TestCase for get_temp_dir(). class BenchmarkLookup(tf.test.Benchmark): - """Benchmark the index lookup layer's forward pass.""" - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - def run_numpy_implementation(self, data, vocab): - """Test the python implementation.""" - input_t = keras.Input(shape=(), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="OOV", - dtype=tf.string) - out_t = layer(input_t) - model = keras.Model(input_t, out_t) - num_repeats = 5 - starts = [] - ends = [] - _ = model(data) - for _ in range(num_repeats): - starts.append(time.time()) - out = model(data) - ends.append(time.time()) - avg_time = np.mean(np.array(ends) - np.array(starts)) - return avg_time, out - - def bm_adapt_implementation(self, num_elements, batch_size): - """Test the KPL adapt implementation.""" - vocab = get_vocab() - vocab_file = self._write_to_temp_file("vocab", vocab) - vocabulary_initializer = tf.lookup.TextFileInitializer( - filename=vocab_file, - key_dtype=tf.string, - key_index=tf.lookup.TextFileIndex.WHOLE_LINE, - value_dtype=tf.int64, - value_index=tf.lookup.TextFileIndex.LINE_NUMBER, - value_index_offset=2) - input_t = keras.Input(shape=(), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocabulary_initializer, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="OOV", - dtype=tf.string) - out_t = layer(input_t) - model = keras.Model(input_t, out_t) - num_repeats = 5 - starts = [] - ends = [] - data = tensor_gen(batch_size, num_elements) - _ = model(data) - for _ in range(num_repeats): - starts.append(time.time()) - _ = model(data) - ends.append(time.time()) - 
avg_time = np.mean(np.array(ends) - np.array(starts)) - baseline, _ = self.run_numpy_implementation(data, vocab) - extras = { - "numpy implementation baseline": baseline, - "delta seconds": (baseline - avg_time), - "delta percent": ((baseline - avg_time) / baseline) * 100 - } - name = "index_lookup_forward|%s_elements|batch_%s" % (num_elements, - batch_size) - self.report_benchmark( - iters=num_repeats, wall_time=avg_time, extras=extras, name=name) - - def benchmark_vocab_size_by_batch(self): - for tensor_size in [100, 1000, 10000]: - for batch in [1, 16, 2048]: - self.bm_adapt_implementation(tensor_size, batch) + """Benchmark the index lookup layer's forward pass.""" + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def run_numpy_implementation(self, data, vocab): + """Test the python implementation.""" + input_t = keras.Input(shape=(), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocab, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="OOV", + dtype=tf.string, + ) + out_t = layer(input_t) + model = keras.Model(input_t, out_t) + num_repeats = 5 + starts = [] + ends = [] + _ = model(data) + for _ in range(num_repeats): + starts.append(time.time()) + out = model(data) + ends.append(time.time()) + avg_time = np.mean(np.array(ends) - np.array(starts)) + return avg_time, out + + def bm_adapt_implementation(self, num_elements, batch_size): + """Test the KPL adapt implementation.""" + vocab = get_vocab() + vocab_file = self._write_to_temp_file("vocab", vocab) + vocabulary_initializer = tf.lookup.TextFileInitializer( + filename=vocab_file, + key_dtype=tf.string, + key_index=tf.lookup.TextFileIndex.WHOLE_LINE, + value_dtype=tf.int64, + value_index=tf.lookup.TextFileIndex.LINE_NUMBER, + value_index_offset=2, + ) + input_t = keras.Input(shape=(), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocabulary_initializer, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="OOV", + dtype=tf.string, + ) + out_t = layer(input_t) + model = keras.Model(input_t, out_t) + num_repeats = 5 + starts = [] + ends = [] + data = tensor_gen(batch_size, num_elements) + _ = model(data) + for _ in range(num_repeats): + starts.append(time.time()) + _ = model(data) + ends.append(time.time()) + avg_time = np.mean(np.array(ends) - np.array(starts)) + baseline, _ = self.run_numpy_implementation(data, vocab) + extras = { + "numpy implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100, + } + name = "index_lookup_forward|%s_elements|batch_%s" % ( + num_elements, + batch_size, + ) + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name + ) + + def benchmark_vocab_size_by_batch(self): + for tensor_size in [100, 1000, 10000]: + for batch in [1, 16, 2048]: + self.bm_adapt_implementation(tensor_size, batch) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/normalization_adapt_benchmark.py b/keras/layers/preprocessing/benchmarks/normalization_adapt_benchmark.py index 491216f3cff4..6d8c50b1a125 100644 --- a/keras/layers/preprocessing/benchmarks/normalization_adapt_benchmark.py +++ 
b/keras/layers/preprocessing/benchmarks/normalization_adapt_benchmark.py @@ -14,106 +14,109 @@ # ============================================================================== """Benchmark for Keras text vectorization preprocessing layer's adapt method.""" -import tensorflow.compat.v2 as tf - import time import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.layers.preprocessing import normalization def reduce_fn(state, values): - """tf.data.Dataset-friendly implementation of mean and variance.""" - k, n, ex, ex2 = state - # If this is the first iteration, we pick the first value to be 'k', - # which helps with precision - we assume that k is close to an average - # value and calculate mean and variance with respect to that. - k = tf.cond(tf.equal(n, 0), lambda: values[0], lambda: k) - - sum_v = tf.reduce_sum(values, axis=0) - sum_v2 = tf.reduce_sum(tf.square(values), axis=0) - ones = tf.ones_like(values, dtype=tf.int32) - batch_size = tf.reduce_sum(ones, axis=0) - batch_size_f = tf.cast(batch_size, tf.float32) - - ex = 0 + sum_v - tf.multiply(batch_size_f, k) - ex2 = 0 + sum_v2 + tf.multiply( - batch_size_f, (tf.square(k) - - tf.multiply(tf.multiply(2.0, k), sum_v))) - - return (k, n + batch_size, ex, ex2) + """tf.data.Dataset-friendly implementation of mean and variance.""" + k, n, ex, ex2 = state + # If this is the first iteration, we pick the first value to be 'k', + # which helps with precision - we assume that k is close to an average + # value and calculate mean and variance with respect to that. + k = tf.cond(tf.equal(n, 0), lambda: values[0], lambda: k) + + sum_v = tf.reduce_sum(values, axis=0) + sum_v2 = tf.reduce_sum(tf.square(values), axis=0) + ones = tf.ones_like(values, dtype=tf.int32) + batch_size = tf.reduce_sum(ones, axis=0) + batch_size_f = tf.cast(batch_size, tf.float32) + + ex = 0 + sum_v - tf.multiply(batch_size_f, k) + ex2 = ( + 0 + + sum_v2 + + tf.multiply( + batch_size_f, + (tf.square(k) - tf.multiply(tf.multiply(2.0, k), sum_v)), + ) + ) + + return (k, n + batch_size, ex, ex2) class BenchmarkAdapt(tf.test.Benchmark): - """Benchmark adapt.""" - - def run_dataset_implementation(self, num_elements, batch_size): - input_t = keras.Input(shape=(1,)) - layer = normalization.Normalization() - _ = layer(input_t) - - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.range(num_elements) - ds = ds.map( - lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) - ds = ds.batch(batch_size) - - starts.append(time.time()) - # Benchmarked code begins here. - k, n, ex, ex2 = ds.reduce((0.0, 0, 0.0, 0.0), reduce_fn) - mean = k.numpy() + ex.numpy() / n.numpy() - var = (ex2.numpy() - (ex.numpy() * ex.numpy()) / n.numpy()) / ( - n.numpy() - 1) - layer.set_weights([mean, var]) - # Benchmarked code ends here. - ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) - return avg_time - - def bm_adapt_implementation(self, num_elements, batch_size): - """Test the KPL adapt implementation.""" - input_t = keras.Input(shape=(1,), dtype=tf.float32) - layer = normalization.Normalization() - _ = layer(input_t) - - num_repeats = 5 - starts = [] - ends = [] - for _ in range(num_repeats): - ds = tf.data.Dataset.range(num_elements) - ds = ds.map( - lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) - ds = ds.batch(batch_size) - - starts.append(time.time()) - # Benchmarked code begins here. - layer.adapt(ds) - # Benchmarked code ends here. 
- ends.append(time.time()) - - avg_time = np.mean(np.array(ends) - np.array(starts)) - name = "normalization_adapt|%s_elements|batch_%s" % (num_elements, - batch_size) - baseline = self.run_dataset_implementation(num_elements, batch_size) - extras = { - "tf.data implementation baseline": baseline, - "delta seconds": (baseline - avg_time), - "delta percent": ((baseline - avg_time) / baseline) * 100 - } - self.report_benchmark( - iters=num_repeats, wall_time=avg_time, extras=extras, name=name) - - def benchmark_vocab_size_by_batch(self): - for vocab_size in [100, 1000, 10000, 100000, 1000000]: - for batch in [1, 16, 2048]: - self.bm_adapt_implementation(vocab_size, batch) + """Benchmark adapt.""" + + def run_dataset_implementation(self, num_elements, batch_size): + input_t = keras.Input(shape=(1,)) + layer = normalization.Normalization() + _ = layer(input_t) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.range(num_elements) + ds = ds.map(lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) + ds = ds.batch(batch_size) + + starts.append(time.time()) + # Benchmarked code begins here. + k, n, ex, ex2 = ds.reduce((0.0, 0, 0.0, 0.0), reduce_fn) + mean = k.numpy() + ex.numpy() / n.numpy() + var = (ex2.numpy() - (ex.numpy() * ex.numpy()) / n.numpy()) / ( + n.numpy() - 1 + ) + layer.set_weights([mean, var]) + # Benchmarked code ends here. + ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) + return avg_time + + def bm_adapt_implementation(self, num_elements, batch_size): + """Test the KPL adapt implementation.""" + input_t = keras.Input(shape=(1,), dtype=tf.float32) + layer = normalization.Normalization() + _ = layer(input_t) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = tf.data.Dataset.range(num_elements) + ds = ds.map(lambda x: tf.expand_dims(tf.cast(x, tf.float32), -1)) + ds = ds.batch(batch_size) + + starts.append(time.time()) + # Benchmarked code begins here. + layer.adapt(ds) + # Benchmarked code ends here. + ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) + name = f"normalization_adapt|{num_elements}_elements|batch_{batch_size}" + baseline = self.run_dataset_implementation(num_elements, batch_size) + extras = { + "tf.data implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100, + } + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name + ) + + def benchmark_vocab_size_by_batch(self): + for vocab_size in [100, 1000, 10000, 100000, 1000000]: + for batch in [1, 16, 2048]: + self.bm_adapt_implementation(vocab_size, batch) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py index 40a64d1e4e76..6213761e34dd 100644 --- a/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py @@ -12,13 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Benchmark for KPL implementation of weighted embedding column with varying-length inputs.""" +"""Benchmark for KPL implementation of weighted embedding column with +varying-length inputs.""" import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.def_function import function as tf_function -from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm +from keras.layers.preprocessing.benchmarks import ( + feature_column_benchmark as fc_bm, +) + +# isort: off +from tensorflow.python.eager.def_function import ( + function as tf_function, +) NUM_REPEATS = 10 BATCH_SIZES = [32, 256] @@ -26,56 +33,66 @@ ### KPL AND FC IMPLEMENTATION BENCHMARKS ### def embedding_varlen(batch_size, max_length): - """Benchmark a variable-length embedding.""" - # Data and constants. - embedding_size = 32768 - data = fc_bm.create_data( - max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int) - weight = tf.ones_like(data, dtype=tf.float32) - - # Keras implementation - data_input = keras.Input( - shape=(None,), ragged=True, name="data", dtype=tf.int64) - weight_input = keras.Input( - shape=(None,), ragged=True, name="weight", dtype=tf.float32) - embedded_data = keras.layers.Embedding(embedding_size, 256)(data_input) - weighted_embedding = tf.multiply( - embedded_data, tf.expand_dims(weight_input, -1)) - reduced_embedding = tf.reduce_sum(weighted_embedding, axis=1) - model = keras.Model([data_input, weight_input], reduced_embedding) - - # FC implementation - fc = tf.feature_column.embedding_column( - tf.feature_column.weighted_categorical_column( - tf.feature_column.categorical_column_with_identity( - "data", num_buckets=embedding_size - 1), - weight_feature_key="weight"), - dimension=256) - - # Wrap the FC implementation in a tf.function for a fair comparison - @tf_function() - def fc_fn(tensors): - fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) - - # Benchmark runs - keras_data = {"data": data, "weight": weight} - k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) - - fc_data = {"data": data.to_sparse(), "weight": weight.to_sparse()} - fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) - - return k_avg_time, fc_avg_time + """Benchmark a variable-length embedding.""" + # Data and constants. 
+ embedding_size = 32768 + data = fc_bm.create_data( + max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int + ) + weight = tf.ones_like(data, dtype=tf.float32) + + # Keras implementation + data_input = keras.Input( + shape=(None,), ragged=True, name="data", dtype=tf.int64 + ) + weight_input = keras.Input( + shape=(None,), ragged=True, name="weight", dtype=tf.float32 + ) + embedded_data = keras.layers.Embedding(embedding_size, 256)(data_input) + weighted_embedding = tf.multiply( + embedded_data, tf.expand_dims(weight_input, -1) + ) + reduced_embedding = tf.reduce_sum(weighted_embedding, axis=1) + model = keras.Model([data_input, weight_input], reduced_embedding) + + # FC implementation + fc = tf.feature_column.embedding_column( + tf.feature_column.weighted_categorical_column( + tf.feature_column.categorical_column_with_identity( + "data", num_buckets=embedding_size - 1 + ), + weight_feature_key="weight", + ), + dimension=256, + ) + + # Wrap the FC implementation in a tf.function for a fair comparison + @tf_function() + def fc_fn(tensors): + fc.transform_feature( + tf.__internal__.feature_column.FeatureTransformationCache(tensors), + None, + ) + + # Benchmark runs + keras_data = {"data": data, "weight": weight} + k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS) + + fc_data = {"data": data.to_sparse(), "weight": weight.to_sparse()} + fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS) + + return k_avg_time, fc_avg_time class BenchmarkLayer(fc_bm.LayerBenchmark): - """Benchmark the layer forward pass.""" + """Benchmark the layer forward pass.""" - def benchmark_layer(self): - for batch in BATCH_SIZES: - name = "weighted_embedding|varlen|batch_%s" % batch - k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) - self.report(name, k_time, f_time, NUM_REPEATS) + def benchmark_layer(self): + for batch in BATCH_SIZES: + name = f"weighted_embedding|varlen|batch_{batch}" + k_time, f_time = embedding_varlen(batch_size=batch, max_length=256) + self.report(name, k_time, f_time, NUM_REPEATS) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/category_encoding.py b/keras/layers/preprocessing/category_encoding.py index 8f41de191d95..5b606616f02e 100644 --- a/keras/layers/preprocessing/category_encoding.py +++ b/keras/layers/preprocessing/category_encoding.py @@ -14,15 +14,16 @@ # ============================================================================== """Keras CategoryEncoding preprocessing layer.""" -# pylint: disable=g-classes-have-attributes +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_layer from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import preprocessing_utils as utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export @@ -32,184 +33,199 @@ COUNT = utils.COUNT -@keras_export("keras.layers.CategoryEncoding", - "keras.layers.experimental.preprocessing.CategoryEncoding") +@keras_export( + "keras.layers.CategoryEncoding", + "keras.layers.experimental.preprocessing.CategoryEncoding", +) class CategoryEncoding(base_layer.Layer): - """A preprocessing layer which encodes integer features. - - This layer provides options for condensing data into a categorical encoding - when the total number of tokens are known in advance. 
It accepts integer - values as inputs, and it outputs a dense or sparse representation of those - inputs. For integer inputs where the total number of tokens is not known, use - `tf.keras.layers.IntegerLookup` instead. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Examples: - - **One-hot encoding data** - - >>> layer = tf.keras.layers.CategoryEncoding( - ... num_tokens=4, output_mode="one_hot") - >>> layer([3, 2, 0, 1]) - - - **Multi-hot encoding data** - - >>> layer = tf.keras.layers.CategoryEncoding( - ... num_tokens=4, output_mode="multi_hot") - >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]]) - - - **Using weighted inputs in `"count"` mode** - - >>> layer = tf.keras.layers.CategoryEncoding( - ... num_tokens=4, output_mode="count") - >>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]]) - >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights) - - - Args: - num_tokens: The total number of tokens the layer should support. All inputs - to the layer must integers in the range `0 <= value < num_tokens`, or an - error will be thrown. - output_mode: Specification for the output of the layer. - Defaults to `"multi_hot"`. Values can be `"one_hot"`, `"multi_hot"` or - `"count"`, configuring the layer as follows: - - `"one_hot"`: Encodes each individual element in the input into an - array of `num_tokens` size, containing a 1 at the element index. If - the last dimension is size 1, will encode on that dimension. If the - last dimension is not size 1, will append a new dimension for the - encoded output. - - `"multi_hot"`: Encodes each sample in the input into a single array - of `num_tokens` size, containing a 1 for each vocabulary term present - in the sample. Treats the last dimension as the sample dimension, if - input shape is `(..., sample_length)`, output shape will be - `(..., num_tokens)`. - - `"count"`: Like `"multi_hot"`, but the int array contains a count of - the number of times the token at that index appeared in the sample. - For all output modes, currently only output up to rank 2 is supported. - sparse: Boolean. If true, returns a `SparseTensor` instead of a dense - `Tensor`. Defaults to `False`. - - Call arguments: - inputs: A 1D or 2D tensor of integer inputs. - count_weights: A tensor in the same shape as `inputs` indicating the - weight for each sample value when summing up in `count` mode. Not used in - `"multi_hot"` or `"one_hot"` modes. - """ - - def __init__(self, - num_tokens=None, - output_mode="multi_hot", - sparse=False, - **kwargs): - # max_tokens is an old name for the num_tokens arg we continue to support - # because of usage. - if "max_tokens" in kwargs: - logging.warning( - "max_tokens is deprecated, please use num_tokens instead.") - num_tokens = kwargs["max_tokens"] - del kwargs["max_tokens"] - - # By default, output floats. This is already default for TF2, but in TF1 - # dtype is inferred from inputs, and would default to int. - if "dtype" not in kwargs: - kwargs["dtype"] = backend.floatx() - - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell("CategoryEncoding").set( - True) - - # Support deprecated names for output_modes. 
-    if output_mode == "binary":
-      output_mode = MULTI_HOT
-    # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
-    layer_utils.validate_string_arg(
-        output_mode,
-        allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
-        layer_name="CategoryEncoding",
-        arg_name="output_mode")
-
-    if num_tokens is None:
-      raise ValueError("num_tokens must be set to use this layer. If the "
-                       "number of tokens is not known beforehand, use the "
-                       "IntegerLookup layer instead.")
-    if num_tokens < 1:
-      raise ValueError(
-          f"`num_tokens` must be >= 1. Received: num_tokens={num_tokens}.")
-
-    self.num_tokens = num_tokens
-    self.output_mode = output_mode
-    self.sparse = sparse
-
-  def compute_output_shape(self, input_shape):
-    if not input_shape:
-      return tf.TensorShape([self.num_tokens])
-    if self.output_mode == ONE_HOT and input_shape[-1] != 1:
-      return tf.TensorShape(input_shape + [self.num_tokens])
-    else:
-      return tf.TensorShape(input_shape[:-1] + [self.num_tokens])
-
-  def compute_output_signature(self, input_spec):
-    output_shape = self.compute_output_shape(input_spec.shape.as_list())
-    if self.sparse:
-      return tf.SparseTensorSpec(
-          shape=output_shape, dtype=tf.int64)
-    else:
-      return tf.TensorSpec(shape=output_shape, dtype=tf.int64)
-
-  def get_config(self):
-    config = {
-        "num_tokens": self.num_tokens,
-        "output_mode": self.output_mode,
-        "sparse": self.sparse,
-    }
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  def call(self, inputs, count_weights=None):
-    inputs = utils.ensure_tensor(inputs)
-
-    if count_weights is not None:
-      if self.output_mode != COUNT:
-        raise ValueError(
-            "`count_weights` is not used when `output_mode` is not `'count'`. "
-            "Received `count_weights={}`.".format(count_weights))
-      count_weights = utils.ensure_tensor(count_weights, self.compute_dtype)
-
-    depth = self.num_tokens
-    if isinstance(inputs, tf.SparseTensor):
-      max_value = tf.reduce_max(inputs.values)
-      min_value = tf.reduce_min(inputs.values)
-    else:
-      max_value = tf.reduce_max(inputs)
-      min_value = tf.reduce_min(inputs)
-    condition = tf.logical_and(
-        tf.greater(tf.cast(depth, max_value.dtype), max_value),
-        tf.greater_equal(min_value, tf.cast(0, min_value.dtype)))
-    assertion = tf.Assert(condition, [
-        "Input values must be in the range 0 <= values < num_tokens"
-        " with num_tokens={}".format(depth)
-    ])
-    with tf.control_dependencies([assertion]):
-      return utils.encode_categorical_inputs(
-          inputs,
-          output_mode=self.output_mode,
-          depth=depth,
-          dtype=self.compute_dtype,
-          sparse=self.sparse,
-          count_weights=count_weights)
+    """A preprocessing layer which encodes integer features.
+
+    This layer provides options for condensing data into a categorical encoding
+    when the total number of tokens is known in advance. It accepts integer
+    values as inputs, and it outputs a dense or sparse representation of those
+    inputs. For integer inputs where the total number of tokens is not known,
+    use `tf.keras.layers.IntegerLookup` instead.
+
+    For an overview and full list of preprocessing layers, see the preprocessing
+    [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).
+
+    Examples:
+
+    **One-hot encoding data**
+
+    >>> layer = tf.keras.layers.CategoryEncoding(
+    ...           num_tokens=4, output_mode="one_hot")
+    >>> layer([3, 2, 0, 1])
+
+
+    **Multi-hot encoding data**
+
+    >>> layer = tf.keras.layers.CategoryEncoding(
+    ...
num_tokens=4, output_mode="multi_hot")
+    >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]])
+
+
+    **Using weighted inputs in `"count"` mode**
+
+    >>> layer = tf.keras.layers.CategoryEncoding(
+    ...           num_tokens=4, output_mode="count")
+    >>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
+    >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)
+
+
+    Args:
+      num_tokens: The total number of tokens the layer should support. All
+        inputs to the layer must be integers in the range `0 <= value <
+        num_tokens`, or an error will be thrown.
+      output_mode: Specification for the output of the layer.
+        Values can be `"one_hot"`, `"multi_hot"` or
+        `"count"`, configuring the layer as follows:
+          - `"one_hot"`: Encodes each individual element in the input into an
+            array of `num_tokens` size, containing a 1 at the element index. If
+            the last dimension is size 1, will encode on that dimension. If the
+            last dimension is not size 1, will append a new dimension for the
+            encoded output.
+          - `"multi_hot"`: Encodes each sample in the input into a single array
+            of `num_tokens` size, containing a 1 for each vocabulary term
+            present in the sample. Treats the last dimension as the sample
+            dimension, if input shape is `(..., sample_length)`, output shape
+            will be `(..., num_tokens)`.
+          - `"count"`: Like `"multi_hot"`, but the int array contains a count of
+            the number of times the token at that index appeared in the sample.
+        For all output modes, currently only output up to rank 2 is supported.
+        Defaults to `"multi_hot"`.
+      sparse: Boolean. If true, returns a `SparseTensor` instead of a dense
+        `Tensor`. Defaults to `False`.
+
+    Call arguments:
+      inputs: A 1D or 2D tensor of integer inputs.
+      count_weights: A tensor in the same shape as `inputs` indicating the
+        weight for each sample value when summing up in `count` mode. Not used
+        in `"multi_hot"` or `"one_hot"` modes.
+    """
+
+    def __init__(
+        self, num_tokens=None, output_mode="multi_hot", sparse=False, **kwargs
+    ):
+        # max_tokens is an old name for the num_tokens arg we continue to
+        # support because of usage.
+        if "max_tokens" in kwargs:
+            logging.warning(
+                "max_tokens is deprecated, please use num_tokens instead."
+            )
+            num_tokens = kwargs["max_tokens"]
+            del kwargs["max_tokens"]
+
+        # By default, output floats. This is already the default for TF2, but
+        # in TF1 dtype is inferred from inputs, and would default to int.
+        if "dtype" not in kwargs:
+            kwargs["dtype"] = backend.floatx()
+
+        super().__init__(**kwargs)
+        base_preprocessing_layer.keras_kpl_gauge.get_cell(
+            "CategoryEncoding"
+        ).set(True)
+
+        # Support deprecated names for output_modes.
+        if output_mode == "binary":
+            output_mode = MULTI_HOT
+        # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
+        layer_utils.validate_string_arg(
+            output_mode,
+            allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
+            layer_name="CategoryEncoding",
+            arg_name="output_mode",
+        )
+
+        if num_tokens is None:
+            raise ValueError(
+                "num_tokens must be set to use this layer. If the "
+                "number of tokens is not known beforehand, use the "
+                "IntegerLookup layer instead."
+            )
+        if num_tokens < 1:
+            raise ValueError(
+                f"`num_tokens` must be >= 1. Received: num_tokens={num_tokens}."
+
+    def __init__(
+        self, num_tokens=None, output_mode="multi_hot", sparse=False, **kwargs
+    ):
+        # max_tokens is an old name for the num_tokens arg we continue to
+        # support because of usage.
+        if "max_tokens" in kwargs:
+            logging.warning(
+                "max_tokens is deprecated, please use num_tokens instead."
+            )
+            num_tokens = kwargs["max_tokens"]
+            del kwargs["max_tokens"]
+
+        # By default, output floats. This is already default for TF2, but in TF1
+        # dtype is inferred from inputs, and would default to int.
+        if "dtype" not in kwargs:
+            kwargs["dtype"] = backend.floatx()
+
+        super().__init__(**kwargs)
+        base_preprocessing_layer.keras_kpl_gauge.get_cell(
+            "CategoryEncoding"
+        ).set(True)
+
+        # Support deprecated names for output_modes.
+        if output_mode == "binary":
+            output_mode = MULTI_HOT
+        # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
+        layer_utils.validate_string_arg(
+            output_mode,
+            allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
+            layer_name="CategoryEncoding",
+            arg_name="output_mode",
+        )
+
+        if num_tokens is None:
+            raise ValueError(
+                "num_tokens must be set to use this layer. If the "
+                "number of tokens is not known beforehand, use the "
+                "IntegerLookup layer instead."
+            )
+        if num_tokens < 1:
+            raise ValueError(
+                f"`num_tokens` must be >= 1. Received: num_tokens={num_tokens}."
+            )
+
+        self.num_tokens = num_tokens
+        self.output_mode = output_mode
+        self.sparse = sparse
+
+    def compute_output_shape(self, input_shape):
+        input_shape = list(input_shape)
+        if not input_shape:
+            return tf.TensorShape([self.num_tokens])
+        if self.output_mode == ONE_HOT and input_shape[-1] != 1:
+            return tf.TensorShape(input_shape + [self.num_tokens])
+        else:
+            return tf.TensorShape(input_shape[:-1] + [self.num_tokens])
+
+    def compute_output_signature(self, input_spec):
+        output_shape = self.compute_output_shape(input_spec.shape.as_list())
+        if self.sparse:
+            return tf.SparseTensorSpec(shape=output_shape, dtype=tf.int64)
+        else:
+            return tf.TensorSpec(shape=output_shape, dtype=tf.int64)
+
+    def get_config(self):
+        config = {
+            "num_tokens": self.num_tokens,
+            "output_mode": self.output_mode,
+            "sparse": self.sparse,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def call(self, inputs, count_weights=None):
+        inputs = utils.ensure_tensor(inputs)
+
+        if count_weights is not None:
+            if self.output_mode != COUNT:
+                raise ValueError(
+                    "`count_weights` is not used when `output_mode` is not "
+                    f"`'count'`. Received `count_weights={count_weights}`."
+                )
+            count_weights = utils.ensure_tensor(
+                count_weights, self.compute_dtype
+            )
+
+        depth = self.num_tokens
+        if isinstance(inputs, tf.SparseTensor):
+            max_value = tf.reduce_max(inputs.values)
+            min_value = tf.reduce_min(inputs.values)
+        else:
+            max_value = tf.reduce_max(inputs)
+            min_value = tf.reduce_min(inputs)
+        condition = tf.logical_and(
+            tf.greater(tf.cast(depth, max_value.dtype), max_value),
+            tf.greater_equal(min_value, tf.cast(0, min_value.dtype)),
+        )
+        assertion = tf.Assert(
+            condition,
+            [
+                "Input values must be in the range 0 <= values < num_tokens"
+                " with num_tokens={}".format(depth)
+            ],
+        )
+        with tf.control_dependencies([assertion]):
+            return utils.encode_categorical_inputs(
+                inputs,
+                output_mode=self.output_mode,
+                depth=depth,
+                dtype=self.compute_dtype,
+                sparse=self.sparse,
+                count_weights=count_weights,
+            )
diff --git a/keras/layers/preprocessing/category_encoding_distribution_test.py b/keras/layers/preprocessing/category_encoding_distribution_test.py
index 5f8d5a72b9bf..8be4b5cc5abf 100644
--- a/keras/layers/preprocessing/category_encoding_distribution_test.py
+++ b/keras/layers/preprocessing/category_encoding_distribution_test.py
@@ -15,6 +15,8 @@
 """Distribution tests for keras.layers.preprocessing.category_encoding."""
 
+import numpy as np
+import tensorflow.compat.v2 as tf
 
 import keras
 from keras import backend
@@ -23,59 +25,64 @@
 from keras.layers.preprocessing import preprocessing_test_utils
 from keras.testing_infra import test_combinations
 from keras.testing_infra import test_utils
-import numpy as np
-import tensorflow.compat.v2 as tf
-from tensorflow.python.framework import test_util as tf_test_utils
+
+# isort: off
+from tensorflow.python.framework import (
+    test_util as tf_test_utils,
+)
 
 
 def batch_wrapper(dataset, batch_size, strategy, repeat=None):
-  if repeat:
-    dataset = dataset.repeat(repeat)
-  # TPUs currently require fully defined input shapes, drop_remainder ensures
-  # the input will have fully defined shapes.
-  if backend.is_tpu_strategy(strategy):
-    return dataset.batch(batch_size, drop_remainder=True)
-  else:
-    return dataset.batch(batch_size)
+    if repeat:
+        dataset = dataset.repeat(repeat)
+    # TPUs currently require fully defined input shapes, drop_remainder ensures
+    # the input will have fully defined shapes.
+ if backend.is_tpu_strategy(strategy): + return dataset.batch(batch_size, drop_remainder=True) + else: + return dataset.batch(batch_size) @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies + - strategy_combinations.parameter_server_strategies_single_worker + - strategy_combinations.parameter_server_strategies_multi_worker, - mode=["eager"])) + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies + + strategy_combinations.parameter_server_strategies_single_worker + + strategy_combinations.parameter_server_strategies_multi_worker, + mode=["eager"], + ) +) class CategoryEncodingDistributionTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_strategy(self, strategy): - if (backend.is_tpu_strategy(strategy) and - not tf_test_utils.is_mlir_bridge_enabled()): - self.skipTest("TPU tests require MLIR bridge") + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_strategy(self, strategy): + if ( + backend.is_tpu_strategy(strategy) + and not tf_test_utils.is_mlir_bridge_enabled() + ): + self.skipTest("TPU tests require MLIR bridge") - input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) - inp_dataset = tf.data.Dataset.from_tensor_slices(input_array) - inp_dataset = batch_wrapper(inp_dataset, 2, strategy) + input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) + inp_dataset = tf.data.Dataset.from_tensor_slices(input_array) + inp_dataset = batch_wrapper(inp_dataset, 2, strategy) - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0, 0], - [1, 1, 0, 1, 0, 0]] - # pyformat: enable - num_tokens = 6 - tf.config.set_soft_device_placement(True) + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0]] + # pyformat: enable + num_tokens = 6 + tf.config.set_soft_device_placement(True) - with strategy.scope(): - input_data = keras.Input(shape=(4,), dtype=tf.int32) - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(inp_dataset) - self.assertAllEqual(expected_output, output_dataset) + with strategy.scope(): + input_data = keras.Input(shape=(4,), dtype=tf.int32) + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(inp_dataset) + self.assertAllEqual(expected_output, output_dataset) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/layers/preprocessing/category_encoding_test.py b/keras/layers/preprocessing/category_encoding_test.py index 70677ea3b1a5..ed02ecc7652f 100644 --- a/keras/layers/preprocessing/category_encoding_test.py +++ b/keras/layers/preprocessing/category_encoding_test.py @@ -15,513 +15,577 @@ """Tests for Keras text category_encoding preprocessing layer.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras import backend from keras.layers import core from keras.layers.preprocessing import category_encoding from 
keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes(always_skip_v1=True) -class CategoryEncodingInputTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): - - @parameterized.named_parameters( - ("list", list), - ("tuple", tuple), - ("numpy", np.array), - ("array_like", preprocessing_test_utils.ArrayLike), - ) - def test_tensor_like_inputs(self, data_fn): - category_data = data_fn([1, 2, 3, 3, 0]) - weight_data = data_fn([1, 2, 3, 1, 7]) - expected_output = [7, 1, 2, 4, 0, 0] - - layer = category_encoding.CategoryEncoding( - num_tokens=6, output_mode=category_encoding.COUNT) - output_data = layer(category_data, count_weights=weight_data) - self.assertAllEqual(output_data, expected_output) - - def test_dense_input_sparse_output(self): - input_array = tf.constant([[1, 2, 3], [3, 3, 0]]) - - # The expected output should be (X for missing value): - # [[X, 1, 1, 1, X, X] - # [1, X, X, 2, X, X]] - expected_indices = [[0, 1], [0, 2], [0, 3], [1, 0], [1, 3]] - expected_values = [1, 1, 1, 1, 2] - num_tokens = 6 - - input_data = keras.Input(shape=(None,), dtype=tf.int32) - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True) - int_data = layer(input_data) - - model = keras.Model(inputs=input_data, outputs=int_data) - sp_output_dataset = model.predict(input_array, steps=1) - self.assertAllEqual(expected_values, sp_output_dataset.values) - self.assertAllEqual(expected_indices, sp_output_dataset.indices) - - # Assert sparse output is same as dense output. - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, - output_mode=category_encoding.COUNT, - sparse=False) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array, steps=1) - self.assertAllEqual( - tf.sparse.to_dense(sp_output_dataset, default_value=0), - output_dataset) - - def test_sparse_input(self): - input_array = np.array([[1, 2, 3, 0], [0, 3, 1, 0]], dtype=np.int64) - sparse_tensor_data = tf.sparse.from_dense(input_array) - - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0, 0], - [0, 1, 0, 1, 0, 0]] - # pyformat: enable - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(sparse_tensor_data, steps=1) - self.assertAllEqual(expected_output, output_dataset) - - def test_sparse_input_with_weights(self): - input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 4]], dtype=np.int64) - weights_array = np.array([[.1, .2, .3, .4], [.2, .1, .4, .3]]) - sparse_tensor_data = tf.sparse.from_dense(input_array) - sparse_weight_data = tf.sparse.from_dense(weights_array) - - # pyformat: disable - expected_output = [[0, .1, .2, .3, .4, 0], - [0, .4, 0, .1, .5, 0]] - # pyformat: enable - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - weight_data = keras.Input(shape=(None,), dtype=tf.float32, sparse=True) - - layer = category_encoding.CategoryEncoding( 
- num_tokens=num_tokens, output_mode=category_encoding.COUNT) - int_data = layer(input_data, count_weights=weight_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=[input_data, weight_data], outputs=int_data) - output_dataset = model.predict([sparse_tensor_data, sparse_weight_data], - steps=1) - self.assertAllClose(expected_output, output_dataset) - - def test_sparse_input_sparse_output(self): - sp_inp = tf.SparseTensor( - indices=[[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]], - values=[0, 2, 1, 1, 0], - dense_shape=[4, 2]) - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - - # The expected output should be (X for missing value): - # [[1, X, X, X] - # [X, X, 1, X] - # [X, 2, X, X] - # [1, X, X, X]] - expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]] - expected_values = [1, 1, 2, 1] - num_tokens = 6 - - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True) - int_data = layer(input_data) - - model = keras.Model(inputs=input_data, outputs=int_data) - sp_output_dataset = model.predict(sp_inp, steps=1) - self.assertAllEqual(expected_values, sp_output_dataset.values) - self.assertAllEqual(expected_indices, sp_output_dataset.indices) - - # Assert sparse output is same as dense output. - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, - output_mode=category_encoding.COUNT, - sparse=False) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(sp_inp, steps=1) - self.assertAllEqual( - tf.sparse.to_dense(sp_output_dataset, default_value=0), - output_dataset) - - def test_sparse_input_sparse_output_with_weights(self): - indices = [[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]] - sp_inp = tf.SparseTensor( - indices=indices, values=[0, 2, 1, 1, 0], dense_shape=[4, 2]) - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - sp_weight = tf.SparseTensor( - indices=indices, values=[.1, .2, .4, .3, .2], dense_shape=[4, 2]) - weight_data = keras.Input(shape=(None,), dtype=tf.float32, sparse=True) - - # The expected output should be (X for missing value): - # [[1, X, X, X] - # [X, X, 1, X] - # [X, 2, X, X] - # [1, X, X, X]] - expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]] - expected_values = [.1, .2, .7, .2] - num_tokens = 6 - - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True) - int_data = layer(input_data, count_weights=weight_data) - - model = keras.Model(inputs=[input_data, weight_data], outputs=int_data) - sp_output_dataset = model.predict([sp_inp, sp_weight], steps=1) - self.assertAllClose(expected_values, sp_output_dataset.values) - self.assertAllEqual(expected_indices, sp_output_dataset.indices) - - def test_ragged_input(self): - input_array = tf.ragged.constant([[1, 2, 3], [3, 1]]) - - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0, 0], - [0, 1, 0, 1, 0, 0]] - # pyformat: enable - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) - - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT) - int_data = layer(input_data) - - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array, steps=1) - self.assertAllEqual(expected_output, 
output_dataset) - - def test_ragged_input_sparse_output(self): - input_array = tf.ragged.constant([[1, 2, 3], [3, 3]]) - - # The expected output should be (X for missing value): - # [[X, 1, 1, 1] - # [X, X, X, 2]] - expected_indices = [[0, 1], [0, 2], [0, 3], [1, 3]] - expected_values = [1, 1, 1, 2] - num_tokens = 6 - - input_data = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True) - int_data = layer(input_data) - - model = keras.Model(inputs=input_data, outputs=int_data) - sp_output_dataset = model.predict(input_array, steps=1) - self.assertAllEqual(expected_values, sp_output_dataset.values) - self.assertAllEqual(expected_indices, sp_output_dataset.indices) - - # Assert sparse output is same as dense output. - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, - output_mode=category_encoding.COUNT, - sparse=False) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array, steps=1) - self.assertAllEqual( - tf.sparse.to_dense(sp_output_dataset, default_value=0), - output_dataset) - - def test_sparse_output_and_dense_layer(self): - input_array = tf.constant([[1, 2, 3], [3, 3, 0]]) - - num_tokens = 4 - - input_data = keras.Input(shape=(None,), dtype=tf.int32) - encoding_layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True) - int_data = encoding_layer(input_data) - dense_layer = keras.layers.Dense(units=1) - output_data = dense_layer(int_data) - - model = keras.Model(inputs=input_data, outputs=output_data) - _ = model.predict(input_array, steps=1) - - def test_dense_oov_input(self): - valid_array = tf.constant([[0, 1, 2], [0, 1, 2]]) - invalid_array = tf.constant([[0, 1, 2], [2, 3, 1]]) - num_tokens = 3 - expected_output_shape = [None, num_tokens] - encoder_layer = category_encoding.CategoryEncoding(num_tokens) - input_data = keras.Input(shape=(3,), dtype=tf.int32) - int_data = encoder_layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - model = keras.Model(inputs=input_data, outputs=int_data) - # Call predict once on valid input to compile a graph and test control flow. - _ = model.predict(valid_array, steps=1) - with self.assertRaisesRegex( - tf.errors.InvalidArgumentError, - ".*must be in the range 0 <= values < num_tokens.*"): - _ = model.predict(invalid_array, steps=1) - - def test_dense_negative(self): - valid_array = tf.constant([[0, 1, 2], [0, 1, 2]]) - invalid_array = tf.constant([[1, 2, 0], [2, 2, -1]]) - num_tokens = 3 - expected_output_shape = [None, num_tokens] - encoder_layer = category_encoding.CategoryEncoding(num_tokens) - input_data = keras.Input(shape=(3,), dtype=tf.int32) - int_data = encoder_layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - model = keras.Model(inputs=input_data, outputs=int_data) - # Call predict once on valid input to compile a graph and test control flow. 
- _ = model.predict(valid_array, steps=1) - with self.assertRaisesRegex( - tf.errors.InvalidArgumentError, - ".*must be in the range 0 <= values < num_tokens.*"): - _ = model.predict(invalid_array, steps=1) - - def test_legacy_max_tokens_arg(self): - input_array = np.array([[1, 2, 3, 1]]) - expected_output = [[0, 1, 1, 1, 0, 0]] - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.int32) - layer = category_encoding.CategoryEncoding( - max_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) +class CategoryEncodingInputTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters( + ("list", list), + ("tuple", tuple), + ("numpy", np.array), + ("array_like", preprocessing_test_utils.ArrayLike), + ) + def test_tensor_like_inputs(self, data_fn): + category_data = data_fn([1, 2, 3, 3, 0]) + weight_data = data_fn([1, 2, 3, 1, 7]) + expected_output = [7, 1, 2, 4, 0, 0] + + layer = category_encoding.CategoryEncoding( + num_tokens=6, output_mode=category_encoding.COUNT + ) + output_data = layer(category_data, count_weights=weight_data) + self.assertAllEqual(output_data, expected_output) + + def test_compute_output_shape(self): + layer = category_encoding.CategoryEncoding(5) + output_shape = layer.compute_output_shape((None, 1)) + self.assertListEqual(output_shape.as_list(), [None, 5]) + output_shape = layer.compute_output_shape([None, 1]) + self.assertListEqual(output_shape.as_list(), [None, 5]) + + def test_dense_input_sparse_output(self): + input_array = tf.constant([[1, 2, 3], [3, 3, 0]]) + + # The expected output should be (X for missing value): + # [[X, 1, 1, 1, X, X] + # [1, X, X, 2, X, X]] + expected_indices = [[0, 1], [0, 2], [0, 3], [1, 0], [1, 3]] + expected_values = [1, 1, 1, 1, 2] + num_tokens = 6 + + input_data = keras.Input(shape=(None,), dtype=tf.int32) + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=True, + ) + int_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=int_data) + sp_output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + + # Assert sparse output is same as dense output. 
+ layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=False, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual( + tf.sparse.to_dense(sp_output_dataset, default_value=0), + output_dataset, + ) + + def test_sparse_input(self): + input_array = np.array([[1, 2, 3, 0], [0, 3, 1, 0]], dtype=np.int64) + sparse_tensor_data = tf.sparse.from_dense(input_array) + + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0, 0], [0, 1, 0, 1, 0, 0]] + # pyformat: enable + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT + ) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(sparse_tensor_data, steps=1) + self.assertAllEqual(expected_output, output_dataset) + + def test_sparse_input_with_weights(self): + input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 4]], dtype=np.int64) + weights_array = np.array([[0.1, 0.2, 0.3, 0.4], [0.2, 0.1, 0.4, 0.3]]) + sparse_tensor_data = tf.sparse.from_dense(input_array) + sparse_weight_data = tf.sparse.from_dense(weights_array) + + # pyformat: disable + expected_output = [[0, 0.1, 0.2, 0.3, 0.4, 0], [0, 0.4, 0, 0.1, 0.5, 0]] + # pyformat: enable + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + weight_data = keras.Input(shape=(None,), dtype=tf.float32, sparse=True) + + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.COUNT + ) + int_data = layer(input_data, count_weights=weight_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=[input_data, weight_data], outputs=int_data) + output_dataset = model.predict( + [sparse_tensor_data, sparse_weight_data], steps=1 + ) + self.assertAllClose(expected_output, output_dataset) + + def test_sparse_input_sparse_output(self): + sp_inp = tf.SparseTensor( + indices=[[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]], + values=[0, 2, 1, 1, 0], + dense_shape=[4, 2], + ) + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + + # The expected output should be (X for missing value): + # [[1, X, X, X] + # [X, X, 1, X] + # [X, 2, X, X] + # [1, X, X, X]] + expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]] + expected_values = [1, 1, 2, 1] + num_tokens = 6 + + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=True, + ) + int_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=int_data) + sp_output_dataset = model.predict(sp_inp, steps=1) + self.assertAllEqual(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + + # Assert sparse output is same as dense output. 
+ layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=False, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(sp_inp, steps=1) + self.assertAllEqual( + tf.sparse.to_dense(sp_output_dataset, default_value=0), + output_dataset, + ) + + def test_sparse_input_sparse_output_with_weights(self): + indices = [[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]] + sp_inp = tf.SparseTensor( + indices=indices, values=[0, 2, 1, 1, 0], dense_shape=[4, 2] + ) + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + sp_weight = tf.SparseTensor( + indices=indices, + values=[0.1, 0.2, 0.4, 0.3, 0.2], + dense_shape=[4, 2], + ) + weight_data = keras.Input(shape=(None,), dtype=tf.float32, sparse=True) + + # The expected output should be (X for missing value): + # [[1, X, X, X] + # [X, X, 1, X] + # [X, 2, X, X] + # [1, X, X, X]] + expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]] + expected_values = [0.1, 0.2, 0.7, 0.2] + num_tokens = 6 + + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=True, + ) + int_data = layer(input_data, count_weights=weight_data) + + model = keras.Model(inputs=[input_data, weight_data], outputs=int_data) + sp_output_dataset = model.predict([sp_inp, sp_weight], steps=1) + self.assertAllClose(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + + def test_ragged_input(self): + input_array = tf.ragged.constant([[1, 2, 3], [3, 1]]) + + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0, 0], [0, 1, 0, 1, 0, 0]] + # pyformat: enable + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) + + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT + ) + int_data = layer(input_data) + + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual(expected_output, output_dataset) + + def test_ragged_input_sparse_output(self): + input_array = tf.ragged.constant([[1, 2, 3], [3, 3]]) + + # The expected output should be (X for missing value): + # [[X, 1, 1, 1] + # [X, X, X, 2]] + expected_indices = [[0, 1], [0, 2], [0, 3], [1, 3]] + expected_values = [1, 1, 1, 2] + num_tokens = 6 + + input_data = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=True, + ) + int_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=int_data) + sp_output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + + # Assert sparse output is same as dense output. 
+ layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=False, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual( + tf.sparse.to_dense(sp_output_dataset, default_value=0), + output_dataset, + ) + + def test_sparse_output_and_dense_layer(self): + input_array = tf.constant([[1, 2, 3], [3, 3, 0]]) + + num_tokens = 4 + + input_data = keras.Input(shape=(None,), dtype=tf.int32) + encoding_layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, + output_mode=category_encoding.COUNT, + sparse=True, + ) + int_data = encoding_layer(input_data) + dense_layer = keras.layers.Dense(units=1) + output_data = dense_layer(int_data) + + model = keras.Model(inputs=input_data, outputs=output_data) + _ = model.predict(input_array, steps=1) + + def test_dense_oov_input(self): + valid_array = tf.constant([[0, 1, 2], [0, 1, 2]]) + invalid_array = tf.constant([[0, 1, 2], [2, 3, 1]]) + num_tokens = 3 + expected_output_shape = [None, num_tokens] + encoder_layer = category_encoding.CategoryEncoding(num_tokens) + input_data = keras.Input(shape=(3,), dtype=tf.int32) + int_data = encoder_layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + model = keras.Model(inputs=input_data, outputs=int_data) + # Call predict once on valid input to compile a graph and test control + # flow. + _ = model.predict(valid_array, steps=1) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, + ".*must be in the range 0 <= values < num_tokens.*", + ): + _ = model.predict(invalid_array, steps=1) + + def test_dense_negative(self): + valid_array = tf.constant([[0, 1, 2], [0, 1, 2]]) + invalid_array = tf.constant([[1, 2, 0], [2, 2, -1]]) + num_tokens = 3 + expected_output_shape = [None, num_tokens] + encoder_layer = category_encoding.CategoryEncoding(num_tokens) + input_data = keras.Input(shape=(3,), dtype=tf.int32) + int_data = encoder_layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + model = keras.Model(inputs=input_data, outputs=int_data) + # Call predict once on valid input to compile a graph and test control + # flow. 
+ _ = model.predict(valid_array, steps=1) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, + ".*must be in the range 0 <= values < num_tokens.*", + ): + _ = model.predict(invalid_array, steps=1) + + def test_legacy_max_tokens_arg(self): + input_array = np.array([[1, 2, 3, 1]]) + expected_output = [[0, 1, 1, 1, 0, 0]] + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.int32) + layer = category_encoding.CategoryEncoding( + max_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT + ) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_combinations.run_all_keras_modes -class CategoryEncodingOutputTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): - - @parameterized.named_parameters( - ("float32", tf.float32), - ("float64", tf.float64), - ) - def test_output_dtype(self, dtype): - inputs = keras.Input(shape=(1,), dtype=tf.int32) - layer = category_encoding.CategoryEncoding( - num_tokens=4, - output_mode=category_encoding.ONE_HOT, - dtype=dtype) - outputs = layer(inputs) - self.assertAllEqual(outputs.dtype, dtype) - - def test_one_hot_output(self): - input_data = np.array([[3], [2], [0], [1]]) - expected_output = [ - [0, 0, 0, 1], - [0, 0, 1, 0], - [1, 0, 0, 0], - [0, 1, 0, 0], - ] - num_tokens = 4 - expected_output_shape = [None, num_tokens] - - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT) - inputs = keras.Input(shape=(1,), dtype=tf.int32) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_dataset = model(input_data) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - self.assertAllEqual(expected_output, output_dataset) - - def test_one_hot_output_rank_one_input(self): - input_data = np.array([3, 2, 0, 1]) - expected_output = [ - [0, 0, 0, 1], - [0, 0, 1, 0], - [1, 0, 0, 0], - [0, 1, 0, 0], - ] - num_tokens = 4 - expected_output_shape = [None, num_tokens] - - # Test call on layer directly. - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT) - output_data = layer(input_data) - self.assertAllEqual(expected_output, output_data) - - # Test call on model. - inputs = keras.Input(shape=(1,), dtype=tf.int32) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model(input_data) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - self.assertAllEqual(expected_output, output_data) - - def test_one_hot_output_rank_zero_input(self): - input_data = np.array(3) - expected_output = [0, 0, 0, 1] - num_tokens = 4 - expected_output_shape = [None, num_tokens] - - # Test call on layer directly. - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT) - output_data = layer(input_data) - self.assertAllEqual(expected_output, output_data) - - # Test call on model. 
- inputs = keras.Input(shape=(1,), dtype=tf.int32) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model(input_data) - - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - self.assertAllEqual(expected_output, output_data) - - def test_one_hot_rank_3_output_fails(self): - layer = category_encoding.CategoryEncoding( - num_tokens=4, output_mode=category_encoding.ONE_HOT) - with self.assertRaisesRegex(ValueError, "maximum supported output rank"): - _ = layer(keras.Input(shape=(4,), dtype=tf.int32)) - with self.assertRaisesRegex(ValueError, "maximum supported output rank"): - _ = layer(np.array([[3, 2, 0, 1], [3, 2, 0, 1]])) - - def test_multi_hot_output(self): - input_data = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) - expected_output = [ - [0, 1, 1, 1, 0, 0], - [1, 1, 0, 1, 0, 0], - ] - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT) - inputs = keras.Input(shape=(None,), dtype=tf.int32) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model.predict(input_data) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - self.assertAllEqual(expected_output, output_data) - - def test_multi_hot_output_rank_one_input(self): - input_data = np.array([3, 2, 0, 1]) - expected_output = [1, 1, 1, 1, 0, 0] - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - # Test call on layer directly. - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT) - output_data = layer(input_data) - self.assertAllEqual(expected_output, output_data) - - # Test call on model. - inputs = keras.Input(shape=(4,), dtype=tf.int32) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model(input_data) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - self.assertAllEqual(expected_output, output_data) - - def test_multi_hot_output_rank_zero_input(self): - input_data = np.array(3) - expected_output = [0, 0, 0, 1, 0, 0] - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - # Test call on layer directly. - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT) - output_data = layer(input_data) - self.assertAllEqual(expected_output, output_data) - - # Test call on model. 
- inputs = keras.Input(shape=(4,), dtype=tf.int32) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model(input_data) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - self.assertAllEqual(expected_output, output_data) - - def test_multi_hot_rank_3_output_fails(self): - layer = category_encoding.CategoryEncoding( - num_tokens=4, output_mode=category_encoding.ONE_HOT) - with self.assertRaisesRegex(ValueError, "maximum supported output rank"): - _ = layer(keras.Input(shape=(3, 4,), dtype=tf.int32)) - with self.assertRaisesRegex(ValueError, "maximum supported output rank"): - _ = layer(np.array([[[3, 2, 0, 1], [3, 2, 0, 1]]])) - - def test_count_output(self): - input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) - - # pyformat: disable - expected_output = [[0, 2, 1, 1, 0, 0], - [2, 1, 0, 1, 0, 0]] - # pyformat: enable - num_tokens = 6 - expected_output_shape = [None, num_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.int32) - layer = category_encoding.CategoryEncoding( - num_tokens=6, output_mode=category_encoding.COUNT) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) +class CategoryEncodingOutputTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters( + ("float32", tf.float32), + ("float64", tf.float64), + ) + def test_output_dtype(self, dtype): + inputs = keras.Input(shape=(1,), dtype=tf.int32) + layer = category_encoding.CategoryEncoding( + num_tokens=4, output_mode=category_encoding.ONE_HOT, dtype=dtype + ) + outputs = layer(inputs) + self.assertAllEqual(outputs.dtype, dtype) + + def test_one_hot_output(self): + input_data = np.array([[3], [2], [0], [1]]) + expected_output = [ + [0, 0, 0, 1], + [0, 0, 1, 0], + [1, 0, 0, 0], + [0, 1, 0, 0], + ] + num_tokens = 4 + expected_output_shape = [None, num_tokens] + + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT + ) + inputs = keras.Input(shape=(1,), dtype=tf.int32) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_dataset = model(input_data) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + self.assertAllEqual(expected_output, output_dataset) + + def test_one_hot_output_rank_one_input(self): + input_data = np.array([3, 2, 0, 1]) + expected_output = [ + [0, 0, 0, 1], + [0, 0, 1, 0], + [1, 0, 0, 0], + [0, 1, 0, 0], + ] + num_tokens = 4 + expected_output_shape = [None, num_tokens] + + # Test call on layer directly. + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT + ) + output_data = layer(input_data) + self.assertAllEqual(expected_output, output_data) + + # Test call on model. + inputs = keras.Input(shape=(1,), dtype=tf.int32) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model(input_data) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + self.assertAllEqual(expected_output, output_data) + + def test_one_hot_output_rank_zero_input(self): + input_data = np.array(3) + expected_output = [0, 0, 0, 1] + num_tokens = 4 + expected_output_shape = [None, num_tokens] + + # Test call on layer directly. 
+ layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT + ) + output_data = layer(input_data) + self.assertAllEqual(expected_output, output_data) + + # Test call on model. + inputs = keras.Input(shape=(1,), dtype=tf.int32) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model(input_data) + + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + self.assertAllEqual(expected_output, output_data) + + def test_one_hot_rank_3_output_fails(self): + layer = category_encoding.CategoryEncoding( + num_tokens=4, output_mode=category_encoding.ONE_HOT + ) + with self.assertRaisesRegex( + ValueError, "maximum supported output rank" + ): + _ = layer(keras.Input(shape=(4,), dtype=tf.int32)) + with self.assertRaisesRegex( + ValueError, "maximum supported output rank" + ): + _ = layer(np.array([[3, 2, 0, 1], [3, 2, 0, 1]])) + + def test_multi_hot_output(self): + input_data = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) + expected_output = [ + [0, 1, 1, 1, 0, 0], + [1, 1, 0, 1, 0, 0], + ] + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT + ) + inputs = keras.Input(shape=(None,), dtype=tf.int32) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model.predict(input_data) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + self.assertAllEqual(expected_output, output_data) + + def test_multi_hot_output_rank_one_input(self): + input_data = np.array([3, 2, 0, 1]) + expected_output = [1, 1, 1, 1, 0, 0] + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + # Test call on layer directly. + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT + ) + output_data = layer(input_data) + self.assertAllEqual(expected_output, output_data) + + # Test call on model. + inputs = keras.Input(shape=(4,), dtype=tf.int32) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model(input_data) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + self.assertAllEqual(expected_output, output_data) + + def test_multi_hot_output_rank_zero_input(self): + input_data = np.array(3) + expected_output = [0, 0, 0, 1, 0, 0] + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + # Test call on layer directly. + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT + ) + output_data = layer(input_data) + self.assertAllEqual(expected_output, output_data) + + # Test call on model. 
+ inputs = keras.Input(shape=(4,), dtype=tf.int32) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model(input_data) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + self.assertAllEqual(expected_output, output_data) + + def test_multi_hot_rank_3_output_fails(self): + layer = category_encoding.CategoryEncoding( + num_tokens=4, output_mode=category_encoding.ONE_HOT + ) + with self.assertRaisesRegex( + ValueError, "maximum supported output rank" + ): + _ = layer( + keras.Input( + shape=( + 3, + 4, + ), + dtype=tf.int32, + ) + ) + with self.assertRaisesRegex( + ValueError, "maximum supported output rank" + ): + _ = layer(np.array([[[3, 2, 0, 1], [3, 2, 0, 1]]])) + + def test_count_output(self): + input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) + + # pyformat: disable + expected_output = [[0, 2, 1, 1, 0, 0], [2, 1, 0, 1, 0, 0]] + # pyformat: enable + num_tokens = 6 + expected_output_shape = [None, num_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.int32) + layer = category_encoding.CategoryEncoding( + num_tokens=6, output_mode=category_encoding.COUNT + ) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) class CategoryEncodingModelBuildingTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - @parameterized.named_parameters( - { - "testcase_name": "count_output", - "num_tokens": 5, - "output_mode": category_encoding.COUNT - }, { - "testcase_name": "multi_hot_output", - "num_tokens": 5, - "output_mode": category_encoding.MULTI_HOT - }) - def test_end_to_end_bagged_modeling(self, output_mode, num_tokens): - input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) - - input_data = keras.Input(shape=(None,), dtype=tf.int32) - layer = category_encoding.CategoryEncoding( - num_tokens=num_tokens, output_mode=output_mode) - - weights = [] - if num_tokens is None: - layer.set_num_elements(5) - layer.set_weights(weights) - - int_data = layer(input_data) - float_data = backend.cast(int_data, dtype="float32") - output_data = core.Dense(64)(float_data) - model = keras.Model(inputs=input_data, outputs=output_data) - _ = model.predict(input_array) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters( + { + "testcase_name": "count_output", + "num_tokens": 5, + "output_mode": category_encoding.COUNT, + }, + { + "testcase_name": "multi_hot_output", + "num_tokens": 5, + "output_mode": category_encoding.MULTI_HOT, + }, + ) + def test_end_to_end_bagged_modeling(self, output_mode, num_tokens): + input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) + + input_data = keras.Input(shape=(None,), dtype=tf.int32) + layer = category_encoding.CategoryEncoding( + num_tokens=num_tokens, output_mode=output_mode + ) + + weights = [] + if num_tokens is None: + layer.set_num_elements(5) + layer.set_weights(weights) + + int_data = layer(input_data) + float_data = backend.cast(int_data, dtype="float32") + output_data = core.Dense(64)(float_data) + model = keras.Model(inputs=input_data, outputs=output_data) + _ = model.predict(input_array) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/discretization.py b/keras/layers/preprocessing/discretization.py index d83c02853a60..72ae53c4e0ac 100644 
--- a/keras/layers/preprocessing/discretization.py
+++ b/keras/layers/preprocessing/discretization.py
@@ -14,16 +14,17 @@
 # ==============================================================================
 """Keras discretization preprocessing layer."""
 
-# pylint: disable=g-classes-have-attributes
+import numpy as np
+import tensorflow.compat.v2 as tf
 
 from keras import backend
 from keras.engine import base_preprocessing_layer
 from keras.layers.preprocessing import preprocessing_utils as utils
 from keras.utils import layer_utils
 from keras.utils import tf_utils
-import numpy as np
-import tensorflow.compat.v2 as tf
+
+# isort: off
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import keras_export
 
@@ -34,360 +35,410 @@
 
 
 def summarize(values, epsilon):
-  """Reduce a 1D sequence of values to a summary.
-
-  This algorithm is based on numpy.quantiles but modified to allow for
-  intermediate steps between multiple data sets. It first finds the target
-  number of bins as the reciprocal of epsilon and then takes the individual
-  values spaced at appropriate intervals to arrive at that target.
-  The final step is to return the corresponding counts between those values
-  If the target num_bins is larger than the size of values, the whole array is
-  returned (with weights of 1).
-
-  Args:
-    values: 1D `np.ndarray` to be summarized.
-    epsilon: A `'float32'` that determines the approximate desired precision.
-
-  Returns:
-    A 2D `np.ndarray` that is a summary of the inputs. First column is the
-    interpolated partition values, the second is the weights (counts).
-  """
-
-  values = tf.reshape(values, [-1])
-  values = tf.sort(values)
-  elements = tf.cast(tf.size(values), tf.float32)
-  num_buckets = 1. / epsilon
-  increment = tf.cast(elements / num_buckets, tf.int32)
-  start = increment
-  step = tf.maximum(increment, 1)
-  boundaries = values[start::step]
-  weights = tf.ones_like(boundaries)
-  weights = weights * tf.cast(step, tf.float32)
-  return tf.stack([boundaries, weights])
-
-
-def compress(summary, epsilon):
-  """Compress a summary to within `epsilon` accuracy.
+    """Reduce a 1D sequence of values to a summary.
+
+    This algorithm is based on numpy.quantiles but modified to allow for
+    intermediate steps between multiple data sets. It first finds the target
+    number of bins as the reciprocal of epsilon and then takes the individual
+    values spaced at appropriate intervals to arrive at that target.
+    The final step is to return the corresponding counts between those values.
+    If the target num_bins is larger than the size of values, the whole array is
+    returned (with weights of 1).
+
+    Args:
+      values: 1D `np.ndarray` to be summarized.
+      epsilon: A `'float32'` that determines the approximate desired
+        precision.
+
+    Returns:
+      A 2D `np.ndarray` that is a summary of the inputs. First column is the
+      interpolated partition values, the second is the weights (counts).
+    """
-  The compression step is needed to keep the summary sizes small after merging,
-  and also used to return the final target boundaries. It finds the new bins
-  based on interpolating cumulative weight percentages from the large summary.
-  Taking the difference of the cumulative weights from the previous bin's
-  cumulative weight will give the new weight for that bin.
+    values = tf.reshape(values, [-1])
+    values = tf.sort(values)
+    elements = tf.cast(tf.size(values), tf.float32)
+    num_buckets = 1.0 / epsilon
+    increment = tf.cast(elements / num_buckets, tf.int32)
+    start = increment
+    step = tf.maximum(increment, 1)
+    boundaries = values[start::step]
+    weights = tf.ones_like(boundaries)
+    weights = weights * tf.cast(step, tf.float32)
+    return tf.stack([boundaries, weights])
-  Args:
-    summary: 2D `np.ndarray` summary to be compressed.
-    epsilon: A `'float32'` that determines the approxmiate desired precision.
-
-  Returns:
-    A 2D `np.ndarray` that is a compressed summary. First column is the
-    interpolated partition values, the second is the weights (counts).
-  """
-  # TODO(b/184863356): remove the numpy escape hatch here.
-  return tf.numpy_function(
-      lambda s: _compress_summary_numpy(s, epsilon), [summary], tf.float32)
+
+def compress(summary, epsilon):
+    """Compress a summary to within `epsilon` accuracy.
+
+    The compression step is needed to keep the summary sizes small after
+    merging, and also used to return the final target boundaries. It finds the
+    new bins based on interpolating cumulative weight percentages from the large
+    summary. Taking the difference of the cumulative weights from the previous
+    bin's cumulative weight will give the new weight for that bin.
+
+    Args:
+      summary: 2D `np.ndarray` summary to be compressed.
+      epsilon: A `'float32'` that determines the approximate desired
+        precision.
+
+    Returns:
+      A 2D `np.ndarray` that is a compressed summary. First column is the
+      interpolated partition values, the second is the weights (counts).
+    """
+    # TODO(b/184863356): remove the numpy escape hatch here.
+    return tf.numpy_function(
+        lambda s: _compress_summary_numpy(s, epsilon), [summary], tf.float32
+    )
 
 
 def _compress_summary_numpy(summary, epsilon):
-  """Compress a summary with numpy."""
-  if summary.shape[1] * epsilon < 1:
-    return summary
-
-  percents = epsilon + np.arange(0.0, 1.0, epsilon)
-  cum_weights = summary[1].cumsum()
-  cum_weight_percents = cum_weights / cum_weights[-1]
-  new_bins = np.interp(percents, cum_weight_percents, summary[0])
-  cum_weights = np.interp(percents, cum_weight_percents, cum_weights)
-  new_weights = cum_weights - np.concatenate((np.array([0]), cum_weights[:-1]))
-  summary = np.stack((new_bins, new_weights))
-  return summary.astype(np.float32)
+    """Compress a summary with numpy."""
+    if summary.shape[1] * epsilon < 1:
+        return summary
+
+    percents = epsilon + np.arange(0.0, 1.0, epsilon)
+    cum_weights = summary[1].cumsum()
+    cum_weight_percents = cum_weights / cum_weights[-1]
+    new_bins = np.interp(percents, cum_weight_percents, summary[0])
+    cum_weights = np.interp(percents, cum_weight_percents, cum_weights)
+    new_weights = cum_weights - np.concatenate(
+        (np.array([0]), cum_weights[:-1])
+    )
+    summary = np.stack((new_bins, new_weights))
+    return summary.astype(np.float32)
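To make the summary format concrete: it is a 2 x N array with candidate
boundaries in row 0 and counts in row 1. A small NumPy-only sketch of what
`summarize` computes on toy data (illustrative; it mirrors the TF code above
without requiring a TF runtime):

    import numpy as np

    values = np.arange(100.0)  # 100 evenly spaced samples
    epsilon = 0.1              # target roughly 1 / epsilon = 10 bins

    # Keep every `step`-th sorted value as a candidate boundary, each
    # standing in for `step` original samples.
    step = max(int(values.size * epsilon), 1)
    boundaries = np.sort(values)[step::step]
    weights = np.full(boundaries.shape, float(step))
    summary = np.stack([boundaries, weights])
    print(summary.shape)  # (2, 9) -- 9 boundaries, each with weight 10.0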
 
 
 def merge_summaries(prev_summary, next_summary, epsilon):
-  """Weighted merge sort of summaries.
+    """Weighted merge sort of summaries.
 
-  Given two summaries of distinct data, this function merges (and compresses)
-  them to stay within `epsilon` error tolerance.
+    Given two summaries of distinct data, this function merges (and compresses)
+    them to stay within `epsilon` error tolerance.
 
-  Args:
-    prev_summary: 2D `np.ndarray` summary to be merged with `next_summary`.
-    next_summary: 2D `np.ndarray` summary to be merged with `prev_summary`.
-    epsilon: A float that determines the approxmiate desired precision.
+    Args:
+      prev_summary: 2D `np.ndarray` summary to be merged with `next_summary`.
+      next_summary: 2D `np.ndarray` summary to be merged with `prev_summary`.
+      epsilon: A float that determines the approximate desired precision.
 
-  Returns:
-    A 2-D `np.ndarray` that is a merged summary. First column is the
-    interpolated partition values, the second is the weights (counts).
-  """
-  merged = tf.concat((prev_summary, next_summary), axis=1)
-  merged = tf.gather(merged, tf.argsort(merged[0]), axis=1)
-  return compress(merged, epsilon)
+    Returns:
+      A 2-D `np.ndarray` that is a merged summary. First column is the
+      interpolated partition values, the second is the weights (counts).
+    """
+    merged = tf.concat((prev_summary, next_summary), axis=1)
+    merged = tf.gather(merged, tf.argsort(merged[0]), axis=1)
+    return compress(merged, epsilon)
 
 
 def get_bin_boundaries(summary, num_bins):
-  return compress(summary, 1.0 / num_bins)[0, :-1]
+    return compress(summary, 1.0 / num_bins)[0, :-1]
 
 
-@keras_export("keras.layers.Discretization",
-              "keras.layers.experimental.preprocessing.Discretization")
+@keras_export(
+    "keras.layers.Discretization",
+    "keras.layers.experimental.preprocessing.Discretization",
+)
 class Discretization(base_preprocessing_layer.PreprocessingLayer):
-  """A preprocessing layer which buckets continuous features by ranges.
-
-  This layer will place each element of its input data into one of several
-  contiguous ranges and output an integer index indicating which range each
-  element was placed in.
-
-  For an overview and full list of preprocessing layers, see the preprocessing
-  [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).
-
-  Input shape:
-    Any `tf.Tensor` or `tf.RaggedTensor` of dimension 2 or higher.
-
-  Output shape:
-    Same as input shape.
-
-  Arguments:
-    bin_boundaries: A list of bin boundaries. The leftmost and rightmost bins
-      will always extend to `-inf` and `inf`, so `bin_boundaries=[0., 1., 2.]`
-      generates bins `(-inf, 0.)`, `[0., 1.)`, `[1., 2.)`, and `[2., +inf)`. If
-      this option is set, `adapt()` should not be called.
-    num_bins: The integer number of bins to compute. If this option is set,
-      `adapt()` should be called to learn the bin boundaries.
-    epsilon: Error tolerance, typically a small fraction close to zero (e.g.
-      0.01). Higher values of epsilon increase the quantile approximation, and
-      hence result in more unequal buckets, but could improve performance
-      and resource consumption.
-    output_mode: Specification for the output of the layer. Defaults to `"int"`.
-      Values can be `"int"`, `"one_hot"`, `"multi_hot"`, or `"count"`
-      configuring the layer as follows:
-        - `"int"`: Return the discritized bin indices directly.
-        - `"one_hot"`: Encodes each individual element in the input into an
-          array the same size as `num_bins`, containing a 1 at the input's bin
-          index. If the last dimension is size 1, will encode on that dimension.
-          If the last dimension is not size 1, will append a new dimension for
-          the encoded output.
-        - `"multi_hot"`: Encodes each sample in the input into a single array
-          the same size as `num_bins`, containing a 1 for each bin index
-          index present in the sample. Treats the last dimension as the sample
-          dimension, if input shape is `(..., sample_length)`, output shape will
-          be `(..., num_tokens)`.
-        - `"count"`: As `"multi_hot"`, but the int array contains a count of the
-          number of times the bin index appeared in the sample.
-    sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`,
-      and `"count"` output modes.
If True, returns a `SparseTensor` instead of - a dense `Tensor`. Defaults to False. - - Examples: - - Bucketize float values based on provided buckets. - >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) - >>> layer = tf.keras.layers.Discretization(bin_boundaries=[0., 1., 2.]) - >>> layer(input) - - - Bucketize float values based on a number of buckets to compute. - >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) - >>> layer = tf.keras.layers.Discretization(num_bins=4, epsilon=0.01) - >>> layer.adapt(input) - >>> layer(input) - - """ - - def __init__(self, - bin_boundaries=None, - num_bins=None, - epsilon=0.01, - output_mode="int", - sparse=False, - **kwargs): - # bins is a deprecated arg for setting bin_boundaries or num_bins that still - # has some usage. - if "bins" in kwargs: - logging.warning( - "bins is deprecated, please use bin_boundaries or num_bins instead.") - if isinstance(kwargs["bins"], int) and num_bins is None: - num_bins = kwargs["bins"] - elif bin_boundaries is None: - bin_boundaries = kwargs["bins"] - del kwargs["bins"] - - # By default, output int64 when output_mode='int' and floats otherwise. - if "dtype" not in kwargs or kwargs["dtype"] is None: - kwargs["dtype"] = tf.int64 if output_mode == INT else backend.floatx() - elif output_mode == "int" and not tf.as_dtype(kwargs["dtype"]).is_integer: - # Compat for when dtype was always floating and ignored by the layer. - kwargs["dtype"] = tf.int64 - - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell("Discretization").set( - True) - - # Check dtype only after base layer parses it; dtype parsing is complex. - if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer: - input_dtype = kwargs["dtype"] - raise ValueError("When `output_mode='int'`, `dtype` should be an integer " - f"type. Received: dtype={input_dtype}") - - # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT) - layer_utils.validate_string_arg( - output_mode, - allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT), - layer_name=self.__class__.__name__, - arg_name="output_mode") - - if sparse and output_mode == INT: - raise ValueError(f"`sparse` may only be true if `output_mode` is " - f"`'one_hot'`, `'multi_hot'`, or `'count'`. " - f"Received: sparse={sparse} and " - f"output_mode={output_mode}") - - if num_bins is not None and num_bins < 0: - raise ValueError("`num_bins` must be greater than or equal to 0. " - "You passed `num_bins={}`".format(num_bins)) - if num_bins is not None and bin_boundaries is not None: - raise ValueError("Both `num_bins` and `bin_boundaries` should not be " - "set. You passed `num_bins={}` and " - "`bin_boundaries={}`".format(num_bins, bin_boundaries)) - bin_boundaries = utils.listify_tensors(bin_boundaries) - self.input_bin_boundaries = bin_boundaries - self.bin_boundaries = bin_boundaries if bin_boundaries is not None else [] - self.num_bins = num_bins - self.epsilon = epsilon - self.output_mode = output_mode - self.sparse = sparse - - def build(self, input_shape): - super().build(input_shape) - - if self.input_bin_boundaries is not None: - return - - # Summary contains two equal length vectors of bins at index 0 and weights - # at index 1. - self.summary = self.add_weight( - name="summary", - shape=(2, None), - dtype=tf.float32, - initializer=lambda shape, dtype: [[], []], # pylint: disable=unused-arguments - trainable=False) - - # We override this method solely to generate a docstring. 
- def adapt(self, data, batch_size=None, steps=None): - """Computes bin boundaries from quantiles in a input dataset. - - Calling `adapt()` on a `Discretization` layer is an alternative to passing - in a `bin_boundaries` argument during construction. A `Discretization` layer - should always be either adapted over a dataset or passed `bin_boundaries`. - - During `adapt()`, the layer will estimate the quantile boundaries of the - input dataset. The number of quantiles can be controlled via the `num_bins` - argument, and the error tolerance for quantile boundaries can be controlled - via the `epsilon` argument. - - In order to make `Discretization` efficient in any distribution context, the - computed boundaries are kept static with respect to any compiled `tf.Graph`s - that call the layer. As a consequence, if the layer is adapted a second - time, any models using the layer should be re-compiled. For more information - see `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. - - `adapt()` is meant only as a single machine utility to compute layer state. - To analyze a dataset that cannot fit on a single machine, see - [Tensorflow Transform](https://www.tensorflow.org/tfx/transform/get_started) - for a multi-machine, map-reduce solution. + """A preprocessing layer which buckets continuous features by ranges. + + This layer will place each element of its input data into one of several + contiguous ranges and output an integer index indicating which range each + element was placed in. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Input shape: + Any `tf.Tensor` or `tf.RaggedTensor` of dimension 2 or higher. + + Output shape: + Same as input shape. Arguments: - data: The data to train on. It can be passed either as a - `tf.data.Dataset`, or as a numpy array. - batch_size: Integer or `None`. - Number of samples per state update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - steps: Integer or `None`. - Total number of steps (batches of samples) - When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps' is None, the epoch will run until - the input dataset is exhausted. When passing an infinitely - repeating dataset, you must specify the `steps` argument. This - argument is not supported with array inputs. + bin_boundaries: A list of bin boundaries. The leftmost and rightmost bins + will always extend to `-inf` and `inf`, so `bin_boundaries=[0., 1., 2.]` + generates bins `(-inf, 0.)`, `[0., 1.)`, `[1., 2.)`, and `[2., +inf)`. + If this option is set, `adapt()` should not be called. + num_bins: The integer number of bins to compute. If this option is set, + `adapt()` should be called to learn the bin boundaries. + epsilon: Error tolerance, typically a small fraction close to zero (e.g. + 0.01). Higher values of epsilon increase the quantile approximation error, and + hence result in more unequal buckets, but could improve performance + and reduce resource consumption. + output_mode: Specification for the output of the layer.
Values can be + `"int"`, `"one_hot"`, `"multi_hot"`, or + `"count"` configuring the layer as follows: + - `"int"`: Return the discretized bin indices directly. + - `"one_hot"`: Encodes each individual element in the input into an + array the same size as `num_bins`, containing a 1 at the input's bin + index. If the last dimension is size 1, will encode on that + dimension. If the last dimension is not size 1, will append a new + dimension for the encoded output. + - `"multi_hot"`: Encodes each sample in the input into a single array + the same size as `num_bins`, containing a 1 for each bin index + present in the sample. Treats the last dimension as the sample + dimension; if input shape is `(..., sample_length)`, output shape + will be `(..., num_tokens)`. + - `"count"`: As `"multi_hot"`, but the int array contains a count of + the number of times the bin index appeared in the sample. + Defaults to `"int"`. + sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, + and `"count"` output modes. If True, returns a `SparseTensor` instead of + a dense `Tensor`. Defaults to `False`. + + Examples: + + Bucketize float values based on provided buckets. + >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) + >>> layer = tf.keras.layers.Discretization(bin_boundaries=[0., 1., 2.]) + >>> layer(input) + + + Bucketize float values based on a number of buckets to compute. + >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) + >>> layer = tf.keras.layers.Discretization(num_bins=4, epsilon=0.01) + >>> layer.adapt(input) + >>> layer(input) + """ - super().adapt(data, batch_size=batch_size, steps=steps) - - def update_state(self, data): - if self.input_bin_boundaries is not None: - raise ValueError( - "Cannot adapt a Discretization layer that has been initialized with " - "`bin_boundaries`, use `num_bins` instead. You passed " - "`bin_boundaries={}`.".format(self.input_bin_boundaries)) - - if not self.built: - raise RuntimeError("`build` must be called before `update_state`.") - - data = tf.convert_to_tensor(data) - if data.dtype != tf.float32: - data = tf.cast(data, tf.float32) - summary = summarize(data, self.epsilon) - self.summary.assign(merge_summaries(summary, self.summary, self.epsilon)) - - def finalize_state(self): - if self.input_bin_boundaries is not None or not self.built: - return - - # The bucketize op only support list boundaries.
- self.bin_boundaries = utils.listify_tensors( - get_bin_boundaries(self.summary, self.num_bins)) - - def reset_state(self): # pylint: disable=method-hidden - if self.input_bin_boundaries is not None or not self.built: - return - - self.summary.assign([[], []]) - - def get_config(self): - config = super().get_config() - config.update({ - "bin_boundaries": self.input_bin_boundaries, - "num_bins": self.num_bins, - "epsilon": self.epsilon, - "output_mode": self.output_mode, - "sparse": self.sparse, - }) - return config - - def compute_output_shape(self, input_shape): - return input_shape - - def compute_output_signature(self, input_spec): - output_shape = self.compute_output_shape(input_spec.shape.as_list()) - if isinstance(input_spec, tf.SparseTensorSpec): - return tf.SparseTensorSpec( - shape=output_shape, dtype=self.compute_dtype) - return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype) - - def call(self, inputs): - def bucketize(inputs): - return tf.raw_ops.Bucketize(input=inputs, boundaries=self.bin_boundaries) - - if tf_utils.is_ragged(inputs): - indices = tf.ragged.map_flat_values(bucketize, inputs) - elif tf_utils.is_sparse(inputs): - indices = tf.SparseTensor( - indices=tf.identity(inputs.indices), - values=bucketize(inputs.values), - dense_shape=tf.identity(inputs.dense_shape)) - else: - indices = bucketize(inputs) - - return utils.encode_categorical_inputs( - indices, - output_mode=self.output_mode, - depth=len(self.bin_boundaries) + 1, - sparse=self.sparse, - dtype=self.compute_dtype) + + def __init__( + self, + bin_boundaries=None, + num_bins=None, + epsilon=0.01, + output_mode="int", + sparse=False, + **kwargs, + ): + # bins is a deprecated arg for setting bin_boundaries or num_bins that + # still has some usage. + if "bins" in kwargs: + logging.warning( + "bins is deprecated, " + "please use bin_boundaries or num_bins instead." + ) + if isinstance(kwargs["bins"], int) and num_bins is None: + num_bins = kwargs["bins"] + elif bin_boundaries is None: + bin_boundaries = kwargs["bins"] + del kwargs["bins"] + + # By default, output int64 when output_mode='int' and floats otherwise. + if "dtype" not in kwargs or kwargs["dtype"] is None: + kwargs["dtype"] = ( + tf.int64 if output_mode == INT else backend.floatx() + ) + elif ( + output_mode == "int" and not tf.as_dtype(kwargs["dtype"]).is_integer + ): + # Compat for when dtype was always floating and ignored by the + # layer. + kwargs["dtype"] = tf.int64 + + super().__init__(**kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell("Discretization").set( + True + ) + + # Check dtype only after base layer parses it; dtype parsing is complex. + if ( + output_mode == INT + and not tf.as_dtype(self.compute_dtype).is_integer + ): + input_dtype = kwargs["dtype"] + raise ValueError( + "When `output_mode='int'`, `dtype` should be an integer " + f"type. Received: dtype={input_dtype}" + ) + + # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT) + layer_utils.validate_string_arg( + output_mode, + allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT), + layer_name=self.__class__.__name__, + arg_name="output_mode", + ) + + if sparse and output_mode == INT: + raise ValueError( + "`sparse` may only be true if `output_mode` is " + "`'one_hot'`, `'multi_hot'`, or `'count'`. " + f"Received: sparse={sparse} and " + f"output_mode={output_mode}" + ) + + if num_bins is not None and num_bins < 0: + raise ValueError( + "`num_bins` must be greater than or equal to 0. 
" + "You passed `num_bins={}`".format(num_bins) + ) + if num_bins is not None and bin_boundaries is not None: + raise ValueError( + "Both `num_bins` and `bin_boundaries` should not be " + "set. You passed `num_bins={}` and " + "`bin_boundaries={}`".format(num_bins, bin_boundaries) + ) + bin_boundaries = utils.listify_tensors(bin_boundaries) + self.input_bin_boundaries = bin_boundaries + self.bin_boundaries = ( + bin_boundaries if bin_boundaries is not None else [] + ) + self.num_bins = num_bins + self.epsilon = epsilon + self.output_mode = output_mode + self.sparse = sparse + + def build(self, input_shape): + super().build(input_shape) + + if self.input_bin_boundaries is not None: + return + + # Summary contains two equal length vectors of bins at index 0 and + # weights at index 1. + self.summary = self.add_weight( + name="summary", + shape=(2, None), + dtype=tf.float32, + initializer=lambda shape, dtype: [ + [], + [], + ], + trainable=False, + ) + + # We override this method solely to generate a docstring. + def adapt(self, data, batch_size=None, steps=None): + """Computes bin boundaries from quantiles in a input dataset. + + Calling `adapt()` on a `Discretization` layer is an alternative to + passing in a `bin_boundaries` argument during construction. A + `Discretization` layer should always be either adapted over a dataset or + passed `bin_boundaries`. + + During `adapt()`, the layer will estimate the quantile boundaries of the + input dataset. The number of quantiles can be controlled via the + `num_bins` argument, and the error tolerance for quantile boundaries can + be controlled via the `epsilon` argument. + + In order to make `Discretization` efficient in any distribution context, + the computed boundaries are kept static with respect to any compiled + `tf.Graph`s that call the layer. As a consequence, if the layer is + adapted a second time, any models using the layer should be re-compiled. + For more information see + `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. + + `adapt()` is meant only as a single machine utility to compute layer + state. To analyze a dataset that cannot fit on a single machine, see + [Tensorflow Transform]( + https://www.tensorflow.org/tfx/transform/get_started) for a + multi-machine, map-reduce solution. + + Arguments: + data: The data to train on. It can be passed either as a + `tf.data.Dataset`, or as a numpy array. + batch_size: Integer or `None`. + Number of samples per state update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` instances + (since they generate batches). + steps: Integer or `None`. + Total number of steps (batches of samples) + When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps' is None, the epoch will run until + the input dataset is exhausted. When passing an infinitely + repeating dataset, you must specify the `steps` argument. This + argument is not supported with array inputs. + """ + super().adapt(data, batch_size=batch_size, steps=steps) + + def update_state(self, data): + if self.input_bin_boundaries is not None: + raise ValueError( + "Cannot adapt a Discretization layer that has been initialized " + "with `bin_boundaries`, use `num_bins` instead. 
You passed " + "`bin_boundaries={}`.".format(self.input_bin_boundaries) + ) + + if not self.built: + raise RuntimeError("`build` must be called before `update_state`.") + + data = tf.convert_to_tensor(data) + if data.dtype != tf.float32: + data = tf.cast(data, tf.float32) + summary = summarize(data, self.epsilon) + self.summary.assign( + merge_summaries(summary, self.summary, self.epsilon) + ) + + def finalize_state(self): + if self.input_bin_boundaries is not None or not self.built: + return + + # The bucketize op only support list boundaries. + self.bin_boundaries = utils.listify_tensors( + get_bin_boundaries(self.summary, self.num_bins) + ) + + def reset_state(self): + if self.input_bin_boundaries is not None or not self.built: + return + + self.summary.assign([[], []]) + + def get_config(self): + config = super().get_config() + config.update( + { + "bin_boundaries": self.input_bin_boundaries, + "num_bins": self.num_bins, + "epsilon": self.epsilon, + "output_mode": self.output_mode, + "sparse": self.sparse, + } + ) + return config + + def compute_output_shape(self, input_shape): + return input_shape + + def compute_output_signature(self, input_spec): + output_shape = self.compute_output_shape(input_spec.shape.as_list()) + if isinstance(input_spec, tf.SparseTensorSpec): + return tf.SparseTensorSpec( + shape=output_shape, dtype=self.compute_dtype + ) + return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype) + + def call(self, inputs): + def bucketize(inputs): + return tf.raw_ops.Bucketize( + input=inputs, boundaries=self.bin_boundaries + ) + + if tf_utils.is_ragged(inputs): + indices = tf.ragged.map_flat_values(bucketize, inputs) + elif tf_utils.is_sparse(inputs): + indices = tf.SparseTensor( + indices=tf.identity(inputs.indices), + values=bucketize(inputs.values), + dense_shape=tf.identity(inputs.dense_shape), + ) + else: + indices = bucketize(inputs) + + return utils.encode_categorical_inputs( + indices, + output_mode=self.output_mode, + depth=len(self.bin_boundaries) + 1, + sparse=self.sparse, + dtype=self.compute_dtype, + ) diff --git a/keras/layers/preprocessing/discretization_distribution_test.py b/keras/layers/preprocessing/discretization_distribution_test.py index 562d71fb6dac..ff2d962fe71a 100644 --- a/keras/layers/preprocessing/discretization_distribution_test.py +++ b/keras/layers/preprocessing/discretization_distribution_test.py @@ -15,6 +15,8 @@ """Distribution tests for keras.layers.preprocessing.discretization.""" +import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.distribute import strategy_combinations @@ -22,40 +24,43 @@ from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies + - strategy_combinations.parameter_server_strategies_single_worker + - strategy_combinations.parameter_server_strategies_multi_worker, - mode=["eager"])) + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies + + strategy_combinations.parameter_server_strategies_single_worker + + strategy_combinations.parameter_server_strategies_multi_worker, + mode=["eager"], + ) +) class DiscretizationDistributionTest( - test_combinations.TestCase, - 
preprocessing_test_utils.PreprocessingLayerTest): - - def test_strategy(self, strategy): - input_array = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_strategy(self, strategy): + input_array = np.array([[-1.5, 1.0, 3.4, 0.5], [0.0, 3.0, 1.3, 0.0]]) - expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]] - expected_output_shape = [None, 4] + expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]] + expected_output_shape = [None, 4] - tf.config.set_soft_device_placement(True) + tf.config.set_soft_device_placement(True) - with strategy.scope(): - input_data = keras.Input(shape=(4,)) - layer = discretization.Discretization(bin_boundaries=[0., 1., 2.]) - bucket_data = layer(input_data) - self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) + with strategy.scope(): + input_data = keras.Input(shape=(4,)) + layer = discretization.Discretization( + bin_boundaries=[0.0, 1.0, 2.0] + ) + bucket_data = layer(input_data) + self.assertAllEqual( + expected_output_shape, bucket_data.shape.as_list() + ) - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) + model = keras.Model(inputs=input_data, outputs=bucket_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/layers/preprocessing/discretization_test.py b/keras/layers/preprocessing/discretization_test.py index 38dad27dc312..0b4b5e78b1df 100644 --- a/keras/layers/preprocessing/discretization_test.py +++ b/keras/layers/preprocessing/discretization_test.py @@ -16,403 +16,451 @@ import os +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers.preprocessing import discretization from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes -class DiscretizationTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_bucketize_with_explicit_buckets_integer(self): - input_array = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) - - expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]] - expected_output_shape = [None, 4] - - input_data = keras.Input(shape=(4,)) - layer = discretization.Discretization(bin_boundaries=[0., 1., 2.]) - bucket_data = layer(input_data) - self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_bucketize_with_explicit_buckets_int_input(self): - input_array = np.array([[-1, 1, 3, 0], [0, 3, 1, 0]], dtype=np.int64) - - expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]] - expected_output_shape = [None, 4] - - input_data = keras.Input(shape=(4,), dtype=tf.int64) - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5]) - bucket_data = layer(input_data) - self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = 
model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_bucketize_with_explicit_buckets_sparse_float_input(self): - indices = [[0, 1], [0, 2], [1, 1]] - input_array = tf.SparseTensor( - indices=indices, values=[-1.5, 1.0, 3.4], dense_shape=[2, 3]) - expected_output = [0, 2, 3] - input_data = keras.Input(shape=(3,), dtype=tf.float32, sparse=True) - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5]) - bucket_data = layer(input_data) - - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = model.predict(input_array, steps=1) - self.assertAllEqual(indices, output_dataset.indices) - self.assertAllEqual(expected_output, output_dataset.values) - - def test_bucketize_with_explicit_buckets_ragged_float_input(self): - input_array = tf.ragged.constant([[-1.5, 1.0, 3.4, .5], - [0.0, 3.0, 1.3]]) - - expected_output = [[0, 2, 3, 1], [1, 3, 2]] - expected_output_shape = [None, None] - - input_data = keras.Input(shape=(None,), ragged=True) - layer = discretization.Discretization(bin_boundaries=[0., 1., 2.]) - bucket_data = layer(input_data) - self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_bucketize_with_explicit_buckets_ragged_int_input(self): - input_array = tf.ragged.constant([[-1, 1, 3, 0], [0, 3, 1]], - dtype=tf.int64) - - expected_output = [[0, 2, 3, 1], [1, 3, 2]] - expected_output_shape = [None, None] - - input_data = keras.Input(shape=(None,), ragged=True, dtype=tf.int64) - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5]) - bucket_data = layer(input_data) - self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_bucketize_with_explicit_buckets_sparse_int_input(self): - indices = [[0, 1], [0, 2], [1, 1]] - input_array = tf.SparseTensor( - indices=indices, values=[-1, 1, 3], dense_shape=[2, 3]) - expected_output = [0, 2, 3] - input_data = keras.Input(shape=(3,), dtype=tf.int32, sparse=True) - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5]) - bucket_data = layer(input_data) - - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = model.predict(input_array, steps=1) - self.assertAllEqual(indices, output_dataset.indices) - self.assertAllEqual(expected_output, output_dataset.values) - - def test_one_hot_output(self): - input_data = np.array([-1.5, 1.0, 3.4, 3.5]) - - expected_output = [[1., 0., 0., 0.], - [0., 0., 1., 0.], - [0., 0., 0., 1.], - [0., 0., 0., 1.]] - expected_output_shape = [None, 4] - - inputs = keras.Input(shape=(1,)) - layer = discretization.Discretization(bin_boundaries=[0., 1., 2.], - output_mode="one_hot") - outputs = layer(inputs) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - - model = keras.Model(inputs, outputs) - output_data = model(input_data) - self.assertAllEqual(expected_output, output_data) - - def test_multi_hot_output(self): - input_data = np.array([-1.5, 1.0, 3.4, 3.5]) - - expected_output = [1., 0., 1., 1.] 
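The expected encodings in the tests here follow mechanically from the bucketing rule in the layer docstring (`boundaries[i-1] <= x < boundaries[i]`). As a cross-check, NumPy's `digitize` uses the same convention as `tf.raw_ops.Bucketize`, so the multi-hot and count expectations can be reproduced in a few lines; the only assumption is padding to `len(boundaries) + 1` bins via `minlength`:

import numpy as np

x = np.array([-1.5, 1.0, 3.4, 3.5])
bins = np.digitize(x, [0.0, 1.0, 2.0])  # -> [0 2 3 3], the bin indices
counts = np.bincount(bins, minlength=4).astype(float)  # -> [1. 0. 1. 2.] ("count")
multi_hot = (counts > 0).astype(float)  # -> [1. 0. 1. 1.] ("multi_hot")
print(bins, counts, multi_hot)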
- expected_output_shape = [None, 4] - - inputs = keras.Input(shape=(4,)) - layer = discretization.Discretization(bin_boundaries=[0., 1., 2.], - output_mode="multi_hot") - outputs = layer(inputs) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - - model = keras.Model(inputs, outputs) - output_data = model(input_data) - self.assertAllEqual(expected_output, output_data) - - def test_count_output(self): - input_data = np.array([-1.5, 1.0, 3.4, 3.5]) - - expected_output = [1., 0., 1., 2.] - expected_output_shape = [None, 4] - - inputs = keras.Input(shape=(4,)) - layer = discretization.Discretization(bin_boundaries=[0., 1., 2.], - output_mode="count") - outputs = layer(inputs) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - - model = keras.Model(inputs, outputs) - output_data = model(input_data) - self.assertAllEqual(expected_output, output_data) - - def test_output_shape(self): - inputs = keras.Input(batch_size=16, shape=(4,), dtype=tf.int64) - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5]) - outputs = layer(inputs) - self.assertAllEqual(outputs.shape.as_list(), [16, 4]) - - @parameterized.named_parameters( - ("int32", tf.int32), - ("int64", tf.int64), - ) - def test_output_dtype(self, dtype): - inputs = keras.Input(batch_size=16, shape=(4,), dtype="float32") - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5], - dtype=dtype) - outputs = layer(inputs) - self.assertAllEqual(outputs.dtype, dtype) - - def test_legacy_dtype_compat(self): - inputs = keras.Input(batch_size=16, shape=(4,), dtype="float32") - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5], - dtype="float32") - outputs = layer(inputs) - self.assertAllEqual(outputs.dtype, tf.int64) - # In TF1 we sometimes face an explicit dtype=None in the config. 
- layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5], - dtype=None) - outputs = layer(inputs) - self.assertAllEqual(outputs.dtype, tf.int64) - - @parameterized.named_parameters( - ("float32", tf.float32), - ("float64", tf.float64), - ) - def test_one_hot_output_dtype(self, dtype): - inputs = keras.Input(batch_size=16, shape=(1,), dtype="float32") - layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5], - output_mode="one_hot", - dtype=dtype) - outputs = layer(inputs) - self.assertAllEqual(outputs.dtype, dtype) - - def test_num_bins_negative_fails(self): - with self.assertRaisesRegex(ValueError, "`num_bins` must be.*num_bins=-7"): - _ = discretization.Discretization(num_bins=-7) - - def test_num_bins_and_bins_set_fails(self): - with self.assertRaisesRegex( - ValueError, - r"`num_bins` and `bin_boundaries` should not be set.*5.*\[1, 2\]"): - _ = discretization.Discretization(num_bins=5, bins=[1, 2]) +class DiscretizationTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_bucketize_with_explicit_buckets_integer(self): + input_array = np.array([[-1.5, 1.0, 3.4, 0.5], [0.0, 3.0, 1.3, 0.0]]) + + expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]] + expected_output_shape = [None, 4] + + input_data = keras.Input(shape=(4,)) + layer = discretization.Discretization(bin_boundaries=[0.0, 1.0, 2.0]) + bucket_data = layer(input_data) + self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=bucket_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_bucketize_with_explicit_buckets_int_input(self): + input_array = np.array([[-1, 1, 3, 0], [0, 3, 1, 0]], dtype=np.int64) + + expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]] + expected_output_shape = [None, 4] + + input_data = keras.Input(shape=(4,), dtype=tf.int64) + layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5]) + bucket_data = layer(input_data) + self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=bucket_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_bucketize_with_explicit_buckets_sparse_float_input(self): + indices = [[0, 1], [0, 2], [1, 1]] + input_array = tf.SparseTensor( + indices=indices, values=[-1.5, 1.0, 3.4], dense_shape=[2, 3] + ) + expected_output = [0, 2, 3] + input_data = keras.Input(shape=(3,), dtype=tf.float32, sparse=True) + layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5]) + bucket_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=bucket_data) + output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual(indices, output_dataset.indices) + self.assertAllEqual(expected_output, output_dataset.values) + + def test_bucketize_with_explicit_buckets_ragged_float_input(self): + input_array = tf.ragged.constant( + [[-1.5, 1.0, 3.4, 0.5], [0.0, 3.0, 1.3]] + ) + + expected_output = [[0, 2, 3, 1], [1, 3, 2]] + expected_output_shape = [None, None] + + input_data = keras.Input(shape=(None,), ragged=True) + layer = discretization.Discretization(bin_boundaries=[0.0, 1.0, 2.0]) + bucket_data = layer(input_data) + self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=bucket_data) + output_dataset = model.predict(input_array) + 
self.assertAllEqual(expected_output, output_dataset) + + def test_bucketize_with_explicit_buckets_ragged_int_input(self): + input_array = tf.ragged.constant( + [[-1, 1, 3, 0], [0, 3, 1]], dtype=tf.int64 + ) + + expected_output = [[0, 2, 3, 1], [1, 3, 2]] + expected_output_shape = [None, None] + + input_data = keras.Input(shape=(None,), ragged=True, dtype=tf.int64) + layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5]) + bucket_data = layer(input_data) + self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) + model = keras.Model(inputs=input_data, outputs=bucket_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_bucketize_with_explicit_buckets_sparse_int_input(self): + indices = [[0, 1], [0, 2], [1, 1]] + input_array = tf.SparseTensor( + indices=indices, values=[-1, 1, 3], dense_shape=[2, 3] + ) + expected_output = [0, 2, 3] + input_data = keras.Input(shape=(3,), dtype=tf.int32, sparse=True) + layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5]) + bucket_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=bucket_data) + output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual(indices, output_dataset.indices) + self.assertAllEqual(expected_output, output_dataset.values) + + def test_one_hot_output(self): + input_data = np.array([-1.5, 1.0, 3.4, 3.5]) + + expected_output = [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ] + expected_output_shape = [None, 4] + + inputs = keras.Input(shape=(1,)) + layer = discretization.Discretization( + bin_boundaries=[0.0, 1.0, 2.0], output_mode="one_hot" + ) + outputs = layer(inputs) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + + model = keras.Model(inputs, outputs) + output_data = model(input_data) + self.assertAllEqual(expected_output, output_data) + + def test_multi_hot_output(self): + input_data = np.array([-1.5, 1.0, 3.4, 3.5]) + + expected_output = [1.0, 0.0, 1.0, 1.0] + expected_output_shape = [None, 4] + + inputs = keras.Input(shape=(4,)) + layer = discretization.Discretization( + bin_boundaries=[0.0, 1.0, 2.0], output_mode="multi_hot" + ) + outputs = layer(inputs) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + + model = keras.Model(inputs, outputs) + output_data = model(input_data) + self.assertAllEqual(expected_output, output_data) + + def test_count_output(self): + input_data = np.array([-1.5, 1.0, 3.4, 3.5]) + + expected_output = [1.0, 0.0, 1.0, 2.0] + expected_output_shape = [None, 4] + + inputs = keras.Input(shape=(4,)) + layer = discretization.Discretization( + bin_boundaries=[0.0, 1.0, 2.0], output_mode="count" + ) + outputs = layer(inputs) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + + model = keras.Model(inputs, outputs) + output_data = model(input_data) + self.assertAllEqual(expected_output, output_data) + + def test_output_shape(self): + inputs = keras.Input(batch_size=16, shape=(4,), dtype=tf.int64) + layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5]) + outputs = layer(inputs) + self.assertAllEqual(outputs.shape.as_list(), [16, 4]) + + @parameterized.named_parameters( + ("int32", tf.int32), + ("int64", tf.int64), + ) + def test_output_dtype(self, dtype): + inputs = keras.Input(batch_size=16, shape=(4,), dtype="float32") + layer = discretization.Discretization( + bin_boundaries=[-0.5, 0.5, 1.5], dtype=dtype + ) + outputs = 
layer(inputs) + self.assertAllEqual(outputs.dtype, dtype) + + def test_legacy_dtype_compat(self): + inputs = keras.Input(batch_size=16, shape=(4,), dtype="float32") + layer = discretization.Discretization( + bin_boundaries=[-0.5, 0.5, 1.5], dtype="float32" + ) + outputs = layer(inputs) + self.assertAllEqual(outputs.dtype, tf.int64) + # In TF1 we sometimes face an explicit dtype=None in the config. + layer = discretization.Discretization( + bin_boundaries=[-0.5, 0.5, 1.5], dtype=None + ) + outputs = layer(inputs) + self.assertAllEqual(outputs.dtype, tf.int64) + + @parameterized.named_parameters( + ("float32", tf.float32), + ("float64", tf.float64), + ) + def test_one_hot_output_dtype(self, dtype): + inputs = keras.Input(batch_size=16, shape=(1,), dtype="float32") + layer = discretization.Discretization( + bin_boundaries=[-0.5, 0.5, 1.5], output_mode="one_hot", dtype=dtype + ) + outputs = layer(inputs) + self.assertAllEqual(outputs.dtype, dtype) + + def test_num_bins_negative_fails(self): + with self.assertRaisesRegex( + ValueError, "`num_bins` must be.*num_bins=-7" + ): + _ = discretization.Discretization(num_bins=-7) + + def test_num_bins_and_bins_set_fails(self): + with self.assertRaisesRegex( + ValueError, + r"`num_bins` and `bin_boundaries` should not be set.*5.*\[1, 2\]", + ): + _ = discretization.Discretization(num_bins=5, bins=[1, 2]) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class DiscretizationAdaptTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - @parameterized.named_parameters([ - { - "testcase_name": "2d_single_element", - "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]]), - "test_data": np.array([[1.], [2.], [3.]]), - "use_dataset": True, - "expected": np.array([[1], [2], [3]]), - "num_bins": 5, - "epsilon": 0.01 - }, { - "testcase_name": "2d_multi_element", - "adapt_data": np.array([[1., 6.], [2., 7.], [3., 8.], [4., 9.], - [5., 10.]]), - "test_data": np.array([[1., 10.], [2., 6.], [3., 8.]]), - "use_dataset": True, - "expected": np.array([[0, 4], [1, 3], [1, 4]]), - "num_bins": 5, - "epsilon": 0.01 - }, { - "testcase_name": "1d_single_element", - "adapt_data": np.array([3., 2., 1., 5., 4.]), - "test_data": np.array([1., 2., 3.]), - "use_dataset": True, - "expected": np.array([1, 2, 3]), - "num_bins": 5, - "epsilon": 0.01 - }, { - "testcase_name": "300_batch_1d_single_element_1", - "adapt_data": np.arange(300), - "test_data": np.arange(300), - "use_dataset": True, - "expected": - np.concatenate([np.zeros(101), np.ones(99), 2 * np.ones(100)]), - "num_bins": 3, - "epsilon": 0.01 - }, { - "testcase_name": "300_batch_1d_single_element_2", - "adapt_data": np.arange(300) ** 2, - "test_data": np.arange(300) ** 2, - "use_dataset": True, - "expected": - np.concatenate([np.zeros(101), np.ones(99), 2 * np.ones(100)]), - "num_bins": 3, - "epsilon": 0.01 - }, { - "testcase_name": "300_batch_1d_single_element_large_epsilon", - "adapt_data": np.arange(300), - "test_data": np.arange(300), - "use_dataset": True, - "expected": np.concatenate([np.zeros(136), np.ones(164)]), - "num_bins": 2, - "epsilon": 0.1 - }]) - def test_layer_computation(self, adapt_data, test_data, use_dataset, - expected, num_bins=5, epsilon=0.01): - - input_shape = tuple(list(test_data.shape)[1:]) - np.random.shuffle(adapt_data) - if use_dataset: - # Keras APIs expect batched datasets - adapt_data = tf.data.Dataset.from_tensor_slices(adapt_data).batch( - test_data.shape[0] // 2) - test_data = tf.data.Dataset.from_tensor_slices(test_data).batch( - 
test_data.shape[0] // 2) - - layer = discretization.Discretization(epsilon=epsilon, num_bins=num_bins) - layer.adapt(adapt_data) - - input_data = keras.Input(shape=input_shape) - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() - output_data = model.predict(test_data) - self.assertAllClose(expected, output_data) - - def test_multiple_adapts(self): - first_adapt = [[1], [2], [3]] - second_adapt = [[4], [5], [6]] - predict_input = [[2], [2]] - expected_first_output = [[2], [2]] - expected_second_output = [[0], [0]] - - inputs = keras.Input(shape=(1,), dtype=tf.int32) - layer = discretization.Discretization(num_bins=3) - layer.adapt(first_adapt) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - actual_output = model.predict(predict_input) - self.assertAllClose(actual_output, expected_first_output) - - # Re-adapt the layer on new inputs. - layer.adapt(second_adapt) - # Re-compile the model. - model.compile() - # `predict` should now use the new model state. - actual_output = model.predict(predict_input) - self.assertAllClose(actual_output, expected_second_output) - - def test_saved_model_tf(self): - input_data = [[1], [2], [3]] - predict_data = [[0.5], [1.5], [2.5]] - expected_output = [[0], [1], [2]] - - inputs = keras.Input(shape=(1,), dtype=tf.float32) - layer = discretization.Discretization(num_bins=3) - layer.adapt(input_data) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - output_data = model.predict(predict_data) - self.assertAllClose(output_data, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_saved_model") - tf.saved_model.save(model, output_path) - loaded_model = tf.saved_model.load(output_path) - f = loaded_model.signatures["serving_default"] - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_data = f(tf.constant(predict_data))["discretization"] - self.assertAllClose(new_output_data, expected_output) - - @parameterized.product( - save_format=["tf", "h5"], - adapt=[True, False], - ) - def test_saved_model_keras(self, save_format, adapt): - input_data = [[1], [2], [3]] - predict_data = [[0.5], [1.5], [2.5]] - expected_output = [[0], [1], [2]] - - cls = discretization.Discretization - inputs = keras.Input(shape=(1,), dtype=tf.float32) - if adapt: - layer = cls(num_bins=3) - layer.adapt(input_data) - else: - layer = cls(bin_boundaries=[1.0, 2.0]) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - output_data = model.predict(predict_data) - self.assertAllClose(output_data, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format=save_format) - loaded_model = keras.models.load_model( - output_path, custom_objects={"Discretization": cls}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. 
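For orientation while reading these adapt tests, a minimal end-to-end sketch of the workflow they exercise; the data and expected indices are taken from the `2d_single_element` case above, while the exact boundary values depend on the quantile approximation and are indicative only:

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.Discretization(num_bins=5, epsilon=0.01)
layer.adapt(np.array([[1.0], [2.0], [3.0], [4.0], [5.0]]))
print(layer.bin_boundaries)  # learned quantile boundaries (typically num_bins - 1 values)
print(layer(np.array([[1.0], [2.0], [3.0]])))  # bin indices [[1], [2], [3]]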
- new_output_data = loaded_model.predict(predict_data) - self.assertAllClose(new_output_data, expected_output) - - def test_saved_weights_keras(self): - input_data = [[1], [2], [3]] - predict_data = [[0.5], [1.5], [2.5]] - expected_output = [[0], [1], [2]] - - cls = discretization.Discretization - inputs = keras.Input(shape=(1,), dtype=tf.float32) - layer = cls(num_bins=3) - layer.adapt(input_data) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - output_data = model.predict(predict_data) - self.assertAllClose(output_data, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_weights") - model.save_weights(output_path, save_format="tf") - new_model = keras.Model.from_config( - model.get_config(), custom_objects={"Discretization": cls}) - new_model.load_weights(output_path) - - # Validate correctness of the new model. - new_output_data = new_model.predict(predict_data) - self.assertAllClose(new_output_data, expected_output) +class DiscretizationAdaptTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters( + [ + { + "testcase_name": "2d_single_element", + "adapt_data": np.array([[1.0], [2.0], [3.0], [4.0], [5.0]]), + "test_data": np.array([[1.0], [2.0], [3.0]]), + "use_dataset": True, + "expected": np.array([[1], [2], [3]]), + "num_bins": 5, + "epsilon": 0.01, + }, + { + "testcase_name": "2d_multi_element", + "adapt_data": np.array( + [ + [1.0, 6.0], + [2.0, 7.0], + [3.0, 8.0], + [4.0, 9.0], + [5.0, 10.0], + ] + ), + "test_data": np.array([[1.0, 10.0], [2.0, 6.0], [3.0, 8.0]]), + "use_dataset": True, + "expected": np.array([[0, 4], [1, 3], [1, 4]]), + "num_bins": 5, + "epsilon": 0.01, + }, + { + "testcase_name": "1d_single_element", + "adapt_data": np.array([3.0, 2.0, 1.0, 5.0, 4.0]), + "test_data": np.array([1.0, 2.0, 3.0]), + "use_dataset": True, + "expected": np.array([1, 2, 3]), + "num_bins": 5, + "epsilon": 0.01, + }, + { + "testcase_name": "300_batch_1d_single_element_1", + "adapt_data": np.arange(300), + "test_data": np.arange(300), + "use_dataset": True, + "expected": np.concatenate( + [np.zeros(101), np.ones(99), 2 * np.ones(100)] + ), + "num_bins": 3, + "epsilon": 0.01, + }, + { + "testcase_name": "300_batch_1d_single_element_2", + "adapt_data": np.arange(300) ** 2, + "test_data": np.arange(300) ** 2, + "use_dataset": True, + "expected": np.concatenate( + [np.zeros(101), np.ones(99), 2 * np.ones(100)] + ), + "num_bins": 3, + "epsilon": 0.01, + }, + { + "testcase_name": "300_batch_1d_single_element_large_epsilon", + "adapt_data": np.arange(300), + "test_data": np.arange(300), + "use_dataset": True, + "expected": np.concatenate([np.zeros(136), np.ones(164)]), + "num_bins": 2, + "epsilon": 0.1, + }, + ] + ) + def test_layer_computation( + self, + adapt_data, + test_data, + use_dataset, + expected, + num_bins=5, + epsilon=0.01, + ): + + input_shape = tuple(list(test_data.shape)[1:]) + np.random.shuffle(adapt_data) + if use_dataset: + # Keras APIs expect batched datasets + adapt_data = tf.data.Dataset.from_tensor_slices(adapt_data).batch( + test_data.shape[0] // 2 + ) + test_data = tf.data.Dataset.from_tensor_slices(test_data).batch( + test_data.shape[0] // 2 + ) + + layer = discretization.Discretization( + epsilon=epsilon, num_bins=num_bins + ) + layer.adapt(adapt_data) + + input_data = keras.Input(shape=input_shape) + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = 
test_utils.should_run_eagerly() + output_data = model.predict(test_data) + self.assertAllClose(expected, output_data) + + def test_multiple_adapts(self): + first_adapt = [[1], [2], [3]] + second_adapt = [[4], [5], [6]] + predict_input = [[2], [2]] + expected_first_output = [[2], [2]] + expected_second_output = [[0], [0]] + + inputs = keras.Input(shape=(1,), dtype=tf.int32) + layer = discretization.Discretization(num_bins=3) + layer.adapt(first_adapt) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + actual_output = model.predict(predict_input) + self.assertAllClose(actual_output, expected_first_output) + + # Re-adapt the layer on new inputs. + layer.adapt(second_adapt) + # Re-compile the model. + model.compile() + # `predict` should now use the new model state. + actual_output = model.predict(predict_input) + self.assertAllClose(actual_output, expected_second_output) + + def test_saved_model_tf(self): + input_data = [[1], [2], [3]] + predict_data = [[0.5], [1.5], [2.5]] + expected_output = [[0], [1], [2]] + + inputs = keras.Input(shape=(1,), dtype=tf.float32) + layer = discretization.Discretization(num_bins=3) + layer.adapt(input_data) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + output_data = model.predict(predict_data) + self.assertAllClose(output_data, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_saved_model") + tf.saved_model.save(model, output_path) + loaded_model = tf.saved_model.load(output_path) + f = loaded_model.signatures["serving_default"] + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_data = f(tf.constant(predict_data))["discretization"] + self.assertAllClose(new_output_data, expected_output) + + @parameterized.product( + save_format=["tf", "h5"], + adapt=[True, False], + ) + def test_saved_model_keras(self, save_format, adapt): + input_data = [[1], [2], [3]] + predict_data = [[0.5], [1.5], [2.5]] + expected_output = [[0], [1], [2]] + + cls = discretization.Discretization + inputs = keras.Input(shape=(1,), dtype=tf.float32) + if adapt: + layer = cls(num_bins=3) + layer.adapt(input_data) + else: + layer = cls(bin_boundaries=[1.0, 2.0]) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + output_data = model.predict(predict_data) + self.assertAllClose(output_data, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format=save_format) + loaded_model = keras.models.load_model( + output_path, custom_objects={"Discretization": cls} + ) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_data = loaded_model.predict(predict_data) + self.assertAllClose(new_output_data, expected_output) + + def test_saved_weights_keras(self): + input_data = [[1], [2], [3]] + predict_data = [[0.5], [1.5], [2.5]] + expected_output = [[0], [1], [2]] + + cls = discretization.Discretization + inputs = keras.Input(shape=(1,), dtype=tf.float32) + layer = cls(num_bins=3) + layer.adapt(input_data) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + output_data = model.predict(predict_data) + self.assertAllClose(output_data, expected_output) + + # Save the model to disk. 
+ output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_weights" + ) + model.save_weights(output_path, save_format="tf") + new_model = keras.Model.from_config( + model.get_config(), custom_objects={"Discretization": cls} + ) + new_model.load_weights(output_path) + + # Validate correctness of the new model. + new_output_data = new_model.predict(predict_data) + self.assertAllClose(new_output_data, expected_output) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/hashed_crossing.py b/keras/layers/preprocessing/hashed_crossing.py index 240281b2f343..02fa326d3999 100644 --- a/keras/layers/preprocessing/hashed_crossing.py +++ b/keras/layers/preprocessing/hashed_crossing.py @@ -14,185 +14,214 @@ # ============================================================================== """Keras hashed crossing preprocessing layer.""" -# pylint: disable=g-classes-have-attributes +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_layer from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import preprocessing_utils as utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.util.tf_export import keras_export INT = utils.INT ONE_HOT = utils.ONE_HOT -@keras_export("keras.layers.experimental.preprocessing.HashedCrossing") +@keras_export( + "keras.layers.HashedCrossing", + "keras.layers.experimental.preprocessing.HashedCrossing", + v1=[], +) class HashedCrossing(base_layer.Layer): - """A preprocessing layer which crosses features using the "hashing trick". - - This layer performs crosses of categorical features using the "hasing trick". - Conceptually, the transformation can be thought of as: - hash(concatenation of features) % `num_bins`. - - This layer currently only performs crosses of scalar inputs and batches of - scalar inputs. Valid input shapes are `(batch_size, 1)`, `(batch_size,)` and - `()`. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - num_bins: Number of hash bins. - output_mode: Specification for the output of the layer. Defaults to `"int"`. - Values can be `"int"`, or `"one_hot"` configuring the layer as follows: - - `"int"`: Return the integer bin indices directly. - - `"one_hot"`: Encodes each individual element in the input into an - array the same size as `num_bins`, containing a 1 at the input's bin - index. - sparse: Boolean. Only applicable to `"one_hot"` mode. If True, returns a - `SparseTensor` instead of a dense `Tensor`. Defaults to False. - **kwargs: Keyword arguments to construct a layer. - - Examples: - - **Crossing two scalar features.** - - >>> layer = tf.keras.layers.experimental.preprocessing.HashedCrossing( - ... num_bins=5) - >>> feat1 = tf.constant(['A', 'B', 'A', 'B', 'A']) - >>> feat2 = tf.constant([101, 101, 101, 102, 102]) - >>> layer((feat1, feat2)) - - - **Crossing and one-hotting two scalar features.** - - >>> layer = tf.keras.layers.experimental.preprocessing.HashedCrossing( - ... num_bins=5, output_mode='one_hot') - >>> feat1 = tf.constant(['A', 'B', 'A', 'B', 'A']) - >>> feat2 = tf.constant([101, 101, 101, 102, 102]) - >>> layer((feat1, feat2)) - - """ - - def __init__(self, - num_bins, - output_mode="int", - sparse=False, - **kwargs): - # By default, output int64 when output_mode="int" and floats otherwise. 
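The dtype defaulting noted in the comment above (and preserved in the reflowed `__init__` below) is observable from the public API. A quick sketch using the pre-existing experimental export, with expected dtypes taken from `test_cross_output_dtype` later in this diff:

import tensorflow as tf

HashedCrossing = tf.keras.layers.experimental.preprocessing.HashedCrossing
print(HashedCrossing(num_bins=2)(([1], [1])).dtype)  # int64 in "int" mode
print(HashedCrossing(num_bins=2, output_mode="one_hot")(([1], [1])).dtype)  # float32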
- if "dtype" not in kwargs or kwargs["dtype"] is None: - kwargs["dtype"] = tf.int64 if output_mode == INT else backend.floatx() - - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell( - "HashedCrossing").set(True) - - # Check dtype only after base layer parses it; dtype parsing is complex. - if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer: - input_dtype = kwargs["dtype"] - raise ValueError("When `output_mode='int'`, `dtype` should be an integer " - f"type. Received: dtype={input_dtype}") - - # "output_mode" must be one of (INT, ONE_HOT) - layer_utils.validate_string_arg( - output_mode, - allowable_strings=(INT, ONE_HOT), - layer_name=self.__class__.__name__, - arg_name="output_mode") - - self.num_bins = num_bins - self.output_mode = output_mode - self.sparse = sparse - - def call(self, inputs): - # Convert all inputs to tensors and check shape. This layer only supports - # sclars and batches of scalars for the initial version. - self._check_at_least_two_inputs(inputs) - inputs = [utils.ensure_tensor(x) for x in inputs] - self._check_input_shape_and_type(inputs) - - # Uprank to rank 2 for the cross_hashed op. - rank = inputs[0].shape.rank - if rank < 2: - inputs = [utils.expand_dims(x, -1) for x in inputs] - if rank < 1: - inputs = [utils.expand_dims(x, -1) for x in inputs] - - # Perform the cross and convert to dense - outputs = tf.sparse.cross_hashed(inputs, self.num_bins) - outputs = tf.sparse.to_dense(outputs) - - # Fix output shape and downrank to match input rank. - if rank == 2: - # tf.sparse.cross_hashed output shape will always be None on the last - # dimension. Given our input shape restrictions, we want to force shape 1 - # instead. - outputs = tf.reshape(outputs, [-1, 1]) - elif rank == 1: - outputs = tf.reshape(outputs, [-1]) - elif rank == 0: - outputs = tf.reshape(outputs, []) - - # Encode outputs. - return utils.encode_categorical_inputs( - outputs, - output_mode=self.output_mode, - depth=self.num_bins, - sparse=self.sparse, - dtype=self.compute_dtype) - - def compute_output_shape(self, input_shapes): - self._check_at_least_two_inputs(input_shapes) - return utils.compute_shape_for_encode_categorical(input_shapes[0]) - - def compute_output_signature(self, input_specs): - input_shapes = [x.shape.as_list() for x in input_specs] - output_shape = self.compute_output_shape(input_shapes) - if self.sparse or any( - isinstance(x, tf.SparseTensorSpec) for x in input_specs): - return tf.SparseTensorSpec(shape=output_shape, dtype=self.compute_dtype) - return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype) - - def get_config(self): - config = super().get_config() - config.update({ - "num_bins": self.num_bins, - "output_mode": self.output_mode, - "sparse": self.sparse, - }) - return config - - def _check_at_least_two_inputs(self, inputs): - if not isinstance(inputs, (list, tuple)): - raise ValueError( - "`HashedCrossing` should be called on a list or tuple of inputs. " - f"Received: inputs={inputs}") - if len(inputs) < 2: - raise ValueError( - "`HashedCrossing` should be called on at least two inputs. " - f"Received: inputs={inputs}") - - def _check_input_shape_and_type(self, inputs): - first_shape = inputs[0].shape.as_list() - rank = len(first_shape) - if rank > 2 or (rank == 2 and first_shape[-1] != 1): - raise ValueError( - "All `HashedCrossing` inputs should have shape `[]`, `[batch_size]` " - f"or `[batch_size, 1]`. 
Received: inputs={inputs}") - if not all(x.shape.as_list() == first_shape for x in inputs[1:]): - raise ValueError("All `HashedCrossing` inputs should have equal shape. " - f"Received: inputs={inputs}") - if any(isinstance(x, (tf.RaggedTensor, tf.SparseTensor)) for x in inputs): - raise ValueError("All `HashedCrossing` inputs should be dense tensors. " - f"Received: inputs={inputs}") - if not all(x.dtype.is_integer or x.dtype == tf.string for x in inputs): - raise ValueError("All `HashedCrossing` inputs should have an integer or " - f"string dtype. Received: inputs={inputs}") + """A preprocessing layer which crosses features using the "hashing trick". + + This layer performs crosses of categorical features using the "hashing + trick". Conceptually, the transformation can be thought of as: + `hash(concatenate(features)) % num_bins`. + + This layer currently only performs crosses of scalar inputs and batches of + scalar inputs. Valid input shapes are `(batch_size, 1)`, `(batch_size,)` and + `()`. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Args: + num_bins: Number of hash bins. + output_mode: Specification for the output of the layer. Values can be + `"int"`, or `"one_hot"` configuring the layer as follows: + - `"int"`: Return the integer bin indices directly. + - `"one_hot"`: Encodes each individual element in the input into an + array the same size as `num_bins`, containing a 1 at the input's + bin index. Defaults to `"int"`. + sparse: Boolean. Only applicable to `"one_hot"` mode. If `True`, + returns a `SparseTensor` instead of a dense `Tensor`. + Defaults to `False`. + **kwargs: Keyword arguments to construct a layer. + + Examples: + + **Crossing two scalar features.** + + >>> layer = tf.keras.layers.HashedCrossing( + ... num_bins=5) + >>> feat1 = tf.constant(['A', 'B', 'A', 'B', 'A']) + >>> feat2 = tf.constant([101, 101, 101, 102, 102]) + >>> layer((feat1, feat2)) + + + **Crossing and one-hotting two scalar features.** + + >>> layer = tf.keras.layers.HashedCrossing( + ... num_bins=5, output_mode='one_hot') + >>> feat1 = tf.constant(['A', 'B', 'A', 'B', 'A']) + >>> feat2 = tf.constant([101, 101, 101, 102, 102]) + >>> layer((feat1, feat2)) + + """ + + def __init__(self, num_bins, output_mode="int", sparse=False, **kwargs): + # By default, output int64 when output_mode="int" and floats otherwise. + if "dtype" not in kwargs or kwargs["dtype"] is None: + kwargs["dtype"] = ( + tf.int64 if output_mode == INT else backend.floatx() + ) + + super().__init__(**kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell("HashedCrossing").set( + True + ) + + # Check dtype only after base layer parses it; dtype parsing is complex. + if ( + output_mode == INT + and not tf.as_dtype(self.compute_dtype).is_integer + ): + input_dtype = kwargs["dtype"] + raise ValueError( + "When `output_mode='int'`, `dtype` should be an integer " + f"type. Received: dtype={input_dtype}" + ) + + # "output_mode" must be one of (INT, ONE_HOT) + layer_utils.validate_string_arg( + output_mode, + allowable_strings=(INT, ONE_HOT), + layer_name=self.__class__.__name__, + arg_name="output_mode", + ) + + self.num_bins = num_bins + self.output_mode = output_mode + self.sparse = sparse + + def call(self, inputs): + # Convert all inputs to tensors and check shape. This layer only + # supports sclars and batches of scalars for the initial version. 
+ self._check_at_least_two_inputs(inputs) + inputs = [utils.ensure_tensor(x) for x in inputs] + self._check_input_shape_and_type(inputs) + + # Uprank to rank 2 for the cross_hashed op. + rank = inputs[0].shape.rank + if rank < 2: + inputs = [utils.expand_dims(x, -1) for x in inputs] + if rank < 1: + inputs = [utils.expand_dims(x, -1) for x in inputs] + + # Perform the cross and convert to dense + outputs = tf.sparse.cross_hashed(inputs, self.num_bins) + outputs = tf.sparse.to_dense(outputs) + + # Fix output shape and downrank to match input rank. + if rank == 2: + # tf.sparse.cross_hashed output shape will always be None on the + # last dimension. Given our input shape restrictions, we want to + # force shape 1 instead. + outputs = tf.reshape(outputs, [-1, 1]) + elif rank == 1: + outputs = tf.reshape(outputs, [-1]) + elif rank == 0: + outputs = tf.reshape(outputs, []) + + # Encode outputs. + return utils.encode_categorical_inputs( + outputs, + output_mode=self.output_mode, + depth=self.num_bins, + sparse=self.sparse, + dtype=self.compute_dtype, + ) + + def compute_output_shape(self, input_shapes): + self._check_at_least_two_inputs(input_shapes) + return utils.compute_shape_for_encode_categorical(input_shapes[0]) + + def compute_output_signature(self, input_specs): + input_shapes = [x.shape.as_list() for x in input_specs] + output_shape = self.compute_output_shape(input_shapes) + if self.sparse or any( + isinstance(x, tf.SparseTensorSpec) for x in input_specs + ): + return tf.SparseTensorSpec( + shape=output_shape, dtype=self.compute_dtype + ) + return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype) + + def get_config(self): + config = super().get_config() + config.update( + { + "num_bins": self.num_bins, + "output_mode": self.output_mode, + "sparse": self.sparse, + } + ) + return config + + def _check_at_least_two_inputs(self, inputs): + if not isinstance(inputs, (list, tuple)): + raise ValueError( + "`HashedCrossing` should be called on a list or tuple of " + f"inputs. Received: inputs={inputs}" + ) + if len(inputs) < 2: + raise ValueError( + "`HashedCrossing` should be called on at least two inputs. " + f"Received: inputs={inputs}" + ) + + def _check_input_shape_and_type(self, inputs): + first_shape = inputs[0].shape.as_list() + rank = len(first_shape) + if rank > 2 or (rank == 2 and first_shape[-1] != 1): + raise ValueError( + "All `HashedCrossing` inputs should have shape `[]`, " + "`[batch_size]` or `[batch_size, 1]`. " + f"Received: inputs={inputs}" + ) + if not all(x.shape.as_list() == first_shape for x in inputs[1:]): + raise ValueError( + "All `HashedCrossing` inputs should have equal shape. " + f"Received: inputs={inputs}" + ) + if any( + isinstance(x, (tf.RaggedTensor, tf.SparseTensor)) for x in inputs + ): + raise ValueError( + "All `HashedCrossing` inputs should be dense tensors. " + f"Received: inputs={inputs}" + ) + if not all(x.dtype.is_integer or x.dtype == tf.string for x in inputs): + raise ValueError( + "All `HashedCrossing` inputs should have an integer or " + f"string dtype. 
Received: inputs={inputs}" + ) diff --git a/keras/layers/preprocessing/hashed_crossing_test.py b/keras/layers/preprocessing/hashed_crossing_test.py index 529673d791a8..6fa5163fb784 100644 --- a/keras/layers/preprocessing/hashed_crossing_test.py +++ b/keras/layers/preprocessing/hashed_crossing_test.py @@ -15,153 +15,192 @@ """Tests for hashed crossing layer.""" import os + +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized import keras from keras.layers.preprocessing import hashed_crossing from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes(always_skip_v1=True) class HashedCrossingTest(test_combinations.TestCase): - - @parameterized.named_parameters( - ('python_value', lambda x: x), - ('dense', tf.constant), - ) - def test_cross_scalars(self, data_fn): - layer = hashed_crossing.HashedCrossing(num_bins=10) - feat1 = data_fn('A') - feat2 = data_fn(101) - outputs = layer((feat1, feat2)) - self.assertAllClose(outputs, 1) - self.assertAllEqual(outputs.shape.as_list(), []) - - @parameterized.named_parameters( - ('tuple', tuple), - ('list', list), - ('numpy', np.array), - ('array_like', preprocessing_test_utils.ArrayLike), - ('dense', tf.constant), - ) - def test_cross_batch_of_scalars_1d(self, data_fn): - layer = hashed_crossing.HashedCrossing(num_bins=10) - feat1 = data_fn(['A', 'B', 'A', 'B', 'A']) - feat2 = data_fn([101, 101, 101, 102, 102]) - outputs = layer((feat1, feat2)) - self.assertAllClose(outputs, [1, 4, 1, 6, 3]) - self.assertAllEqual(outputs.shape.as_list(), [5]) - - @parameterized.named_parameters( - ('tuple', tuple), - ('list', list), - ('numpy', np.array), - ('array_like', preprocessing_test_utils.ArrayLike), - ('dense', tf.constant), - ) - def test_cross_batch_of_scalars_2d(self, data_fn): - layer = hashed_crossing.HashedCrossing(num_bins=10) - feat1 = data_fn([['A'], ['B'], ['A'], ['B'], ['A']]) - feat2 = data_fn([[101], [101], [101], [102], [102]]) - outputs = layer((feat1, feat2)) - self.assertAllClose(outputs, [[1], [4], [1], [6], [3]]) - self.assertAllEqual(outputs.shape.as_list(), [5, 1]) - - @parameterized.named_parameters( - ('sparse', True), - ('dense', False), - ) - def test_cross_one_hot_output(self, sparse): - layer = hashed_crossing.HashedCrossing( - num_bins=5, output_mode='one_hot', sparse=sparse) - feat1 = tf.constant([['A'], ['B'], ['A'], ['B'], ['A']]) - feat2 = tf.constant([[101], [101], [101], [102], [102]]) - outputs = layer((feat1, feat2)) - if sparse: - outputs = tf.sparse.to_dense(outputs) - self.assertAllClose(outputs, [ - [0, 1, 0, 0, 0], - [0, 0, 0, 0, 1], - [0, 1, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1, 0], - ]) - self.assertAllEqual(outputs.shape.as_list(), [5, 5]) - - def test_cross_output_dtype(self): - layer = hashed_crossing.HashedCrossing(num_bins=2) - self.assertAllEqual(layer(([1], [1])).dtype, tf.int64) - layer = hashed_crossing.HashedCrossing(num_bins=2, dtype=tf.int32) - self.assertAllEqual(layer(([1], [1])).dtype, tf.int32) - layer = hashed_crossing.HashedCrossing(num_bins=2, output_mode='one_hot') - self.assertAllEqual(layer(([1], [1])).dtype, tf.float32) - layer = hashed_crossing.HashedCrossing( - num_bins=2, output_mode='one_hot', dtype=tf.float64) - self.assertAllEqual(layer(([1], [1])).dtype, tf.float64) - - def test_non_list_input_fails(self): - with self.assertRaisesRegex(ValueError, 'should be called on a list'): - 
hashed_crossing.HashedCrossing(num_bins=10)(tf.constant(1)) - - def test_single_input_fails(self): - with self.assertRaisesRegex(ValueError, 'at least two inputs'): - hashed_crossing.HashedCrossing(num_bins=10)([tf.constant(1)]) - - def test_sparse_input_fails(self): - with self.assertRaisesRegex(ValueError, 'inputs should be dense tensors'): - sparse_in = tf.sparse.from_dense(tf.constant([1])) - hashed_crossing.HashedCrossing(num_bins=10)((sparse_in, sparse_in)) - - def test_float_input_fails(self): - with self.assertRaisesRegex(ValueError, 'should have an integer or string'): - hashed_crossing.HashedCrossing(num_bins=10)( - (tf.constant([1.]), tf.constant([1.]))) - - def test_upsupported_shape_input_fails(self): - with self.assertRaisesRegex(ValueError, 'inputs should have shape'): - hashed_crossing.HashedCrossing(num_bins=10)( - (tf.constant([[[1.]]]), tf.constant([[[1.]]]))) - - def test_from_config(self): - layer = hashed_crossing.HashedCrossing( - num_bins=5, output_mode='one_hot', sparse=True) - cloned_layer = hashed_crossing.HashedCrossing.from_config( - layer.get_config()) - feat1 = tf.constant([['A'], ['B'], ['A'], ['B'], ['A']]) - feat2 = tf.constant([[101], [101], [101], [102], [102]]) - original_outputs = layer((feat1, feat2)) - cloned_outputs = cloned_layer((feat1, feat2)) - self.assertAllEqual( - tf.sparse.to_dense(cloned_outputs), - tf.sparse.to_dense(original_outputs)) - - def test_saved_model_keras(self): - string_in = keras.Input(shape=(1,), dtype=tf.string) - int_in = keras.Input(shape=(1,), dtype=tf.int64) - out = hashed_crossing.HashedCrossing(num_bins=10)((string_in, int_in)) - model = keras.Model(inputs=(string_in, int_in), outputs=out) - - string_data = tf.constant([['A'], ['B'], ['A'], ['B'], ['A']]) - int_data = tf.constant([[101], [101], [101], [102], [102]]) - expected_output = [[1], [4], [1], [6], [3]] - - output_data = model((string_data, int_data)) - self.assertAllClose(output_data, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), 'saved_model') - model.save(output_path, save_format='tf') - loaded_model = keras.models.load_model( - output_path, - custom_objects={'HashedCrossing': hashed_crossing.HashedCrossing}) - - # Validate correctness of the new model. 
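Taken together, the failure tests in this file (old and reformatted alike) pin down the layer's input contract. A compact sketch of which calls pass and which raise, assuming eager TF 2.x with `HashedCrossing` available:

    import tensorflow as tf

    layer = tf.keras.layers.HashedCrossing(num_bins=10)

    # Accepted: scalars, 1-D batches, and (batch_size, 1) columns of integer
    # or string dtype.
    layer((tf.constant("A"), tf.constant(101)))                        # shape ()
    layer((tf.constant(["A", "B"]), tf.constant([101, 102])))          # shape (2,)
    layer((tf.constant([["A"], ["B"]]), tf.constant([[101], [102]])))  # shape (2, 1)

    # Rejected: a single input, float dtypes, sparse or ragged tensors, and
    # rank > 2 inputs all raise ValueError, as these tests verify.
    try:
        layer((tf.constant([1.0]), tf.constant([2.0])))
    except ValueError as err:
        print(err)  # ...should have an integer or string dtype...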
- new_output_data = loaded_model((string_data, int_data)) - self.assertAllClose(new_output_data, expected_output) - - -if __name__ == '__main__': - tf.test.main() + @parameterized.named_parameters( + ("python_value", lambda x: x), + ("dense", tf.constant), + ) + def test_cross_scalars(self, data_fn): + layer = hashed_crossing.HashedCrossing(num_bins=10) + feat1 = data_fn("A") + feat2 = data_fn(101) + outputs = layer((feat1, feat2)) + self.assertAllClose(outputs, 1) + self.assertAllEqual(outputs.shape.as_list(), []) + + @parameterized.named_parameters( + ("tuple", tuple), + ("list", list), + ("numpy", np.array), + ("array_like", preprocessing_test_utils.ArrayLike), + ("dense", tf.constant), + ) + def test_cross_batch_of_scalars_1d(self, data_fn): + layer = hashed_crossing.HashedCrossing(num_bins=10) + feat1 = data_fn(["A", "B", "A", "B", "A"]) + feat2 = data_fn([101, 101, 101, 102, 102]) + outputs = layer((feat1, feat2)) + self.assertAllClose(outputs, [1, 4, 1, 6, 3]) + self.assertAllEqual(outputs.shape.as_list(), [5]) + + @parameterized.named_parameters( + ("tuple", tuple), + ("list", list), + ("numpy", np.array), + ("array_like", preprocessing_test_utils.ArrayLike), + ("dense", tf.constant), + ) + def test_cross_batch_of_scalars_2d(self, data_fn): + layer = hashed_crossing.HashedCrossing(num_bins=10) + feat1 = data_fn([["A"], ["B"], ["A"], ["B"], ["A"]]) + feat2 = data_fn([[101], [101], [101], [102], [102]]) + outputs = layer((feat1, feat2)) + self.assertAllClose(outputs, [[1], [4], [1], [6], [3]]) + self.assertAllEqual(outputs.shape.as_list(), [5, 1]) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), + ) + def test_cross_one_hot_output(self, sparse): + layer = hashed_crossing.HashedCrossing( + num_bins=5, output_mode="one_hot", sparse=sparse + ) + feat1 = tf.constant([["A"], ["B"], ["A"], ["B"], ["A"]]) + feat2 = tf.constant([[101], [101], [101], [102], [102]]) + outputs = layer((feat1, feat2)) + if sparse: + outputs = tf.sparse.to_dense(outputs) + self.assertAllClose( + outputs, + [ + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 1], + [0, 1, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 1, 0], + ], + ) + self.assertAllEqual(outputs.shape.as_list(), [5, 5]) + + def test_cross_output_dtype(self): + layer = hashed_crossing.HashedCrossing(num_bins=2) + self.assertAllEqual(layer(([1], [1])).dtype, tf.int64) + layer = hashed_crossing.HashedCrossing(num_bins=2, dtype=tf.int32) + self.assertAllEqual(layer(([1], [1])).dtype, tf.int32) + layer = hashed_crossing.HashedCrossing( + num_bins=2, output_mode="one_hot" + ) + self.assertAllEqual(layer(([1], [1])).dtype, tf.float32) + layer = hashed_crossing.HashedCrossing( + num_bins=2, output_mode="one_hot", dtype=tf.float64 + ) + self.assertAllEqual(layer(([1], [1])).dtype, tf.float64) + + def test_non_list_input_fails(self): + with self.assertRaisesRegex(ValueError, "should be called on a list"): + hashed_crossing.HashedCrossing(num_bins=10)(tf.constant(1)) + + def test_single_input_fails(self): + with self.assertRaisesRegex(ValueError, "at least two inputs"): + hashed_crossing.HashedCrossing(num_bins=10)([tf.constant(1)]) + + def test_sparse_input_fails(self): + with self.assertRaisesRegex( + ValueError, "inputs should be dense tensors" + ): + sparse_in = tf.sparse.from_dense(tf.constant([1])) + hashed_crossing.HashedCrossing(num_bins=10)((sparse_in, sparse_in)) + + def test_float_input_fails(self): + with self.assertRaisesRegex( + ValueError, "should have an integer or string" + ): + hashed_crossing.HashedCrossing(num_bins=10)( + 
(tf.constant([1.0]), tf.constant([1.0])) + ) + + def test_unsupported_shape_input_fails(self): + with self.assertRaisesRegex(ValueError, "inputs should have shape"): + hashed_crossing.HashedCrossing(num_bins=10)( + (tf.constant([[[1.0]]]), tf.constant([[[1.0]]])) + ) + + def test_from_config(self): + layer = hashed_crossing.HashedCrossing( + num_bins=5, output_mode="one_hot", sparse=True + ) + cloned_layer = hashed_crossing.HashedCrossing.from_config( + layer.get_config() + ) + feat1 = tf.constant([["A"], ["B"], ["A"], ["B"], ["A"]]) + feat2 = tf.constant([[101], [101], [101], [102], [102]]) + original_outputs = layer((feat1, feat2)) + cloned_outputs = cloned_layer((feat1, feat2)) + self.assertAllEqual( + tf.sparse.to_dense(cloned_outputs), + tf.sparse.to_dense(original_outputs), + ) + + def test_saving_keras(self): + string_in = keras.Input(shape=(1,), dtype=tf.string) + int_in = keras.Input(shape=(1,), dtype=tf.int64) + out = hashed_crossing.HashedCrossing(num_bins=10)((string_in, int_in)) + model = keras.Model(inputs=(string_in, int_in), outputs=out) + + string_data = tf.constant([["A"], ["B"], ["A"], ["B"], ["A"]]) + int_data = tf.constant([[101], [101], [101], [102], [102]]) + expected_output = [[1], [4], [1], [6], [3]] + + output_data = model((string_data, int_data)) + self.assertAllClose(output_data, expected_output) + + with self.subTest("savedmodel"): + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "saved_model") + model.save(output_path, save_format="tf") + loaded_model = keras.models.load_model( + output_path, + custom_objects={ + "HashedCrossing": hashed_crossing.HashedCrossing + }, + ) + + # Validate correctness of the new model. + new_output_data = loaded_model((string_data, int_data)) + self.assertAllClose(new_output_data, expected_output) + + with self.subTest("keras_v3"): + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving." + ) + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(output_path, save_format="keras_v3") + loaded_model = keras.models.load_model( + output_path, + custom_objects={ + "HashedCrossing": hashed_crossing.HashedCrossing + }, + ) + + # Validate correctness of the new model. 
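Outside the test harness, the round trip exercised by the two subtests looks roughly like the sketch below. The paths are illustrative; `save_format="keras_v3"` assumes a Keras build that already ships the new `.keras` format, and `custom_objects` may be unnecessary when using the public `tf.keras.layers.HashedCrossing` export.

    import tensorflow as tf

    string_in = tf.keras.Input(shape=(1,), dtype=tf.string)
    int_in = tf.keras.Input(shape=(1,), dtype=tf.int64)
    out = tf.keras.layers.HashedCrossing(num_bins=10)((string_in, int_in))
    model = tf.keras.Model(inputs=(string_in, int_in), outputs=out)

    # Legacy SavedModel round trip.
    model.save("/tmp/crossing_savedmodel", save_format="tf")
    restored = tf.keras.models.load_model("/tmp/crossing_savedmodel")

    # New-style `.keras` round trip.
    model.save("/tmp/crossing.keras", save_format="keras_v3")
    restored_v3 = tf.keras.models.load_model("/tmp/crossing.keras")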
+ new_output_data = loaded_model((string_data, int_data)) + self.assertAllClose(new_output_data, expected_output) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/preprocessing/hashing.py b/keras/layers/preprocessing/hashing.py index 1dd13d585a69..77adfee68d0e 100644 --- a/keras/layers/preprocessing/hashing.py +++ b/keras/layers/preprocessing/hashing.py @@ -14,15 +14,16 @@ # ============================================================================== """Keras hashing preprocessing layer.""" -# pylint: disable=g-classes-have-attributes +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_layer from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import preprocessing_utils as utils from keras.utils import layer_utils -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.util.tf_export import keras_export INT = utils.INT @@ -31,238 +32,267 @@ COUNT = utils.COUNT -@keras_export('keras.layers.Hashing', - 'keras.layers.experimental.preprocessing.Hashing') +@keras_export( + "keras.layers.Hashing", "keras.layers.experimental.preprocessing.Hashing" +) class Hashing(base_layer.Layer): - """A preprocessing layer which hashes and bins categorical features. - - This layer transforms categorical inputs to hashed output. It element-wise - converts a ints or strings to ints in a fixed range. The stable hash - function uses `tensorflow::ops::Fingerprint` to produce the same output - consistently across all platforms. - - This layer uses [FarmHash64](https://github.com/google/farmhash) by default, - which provides a consistent hashed output across different platforms and is - stable across invocations, regardless of device and context, by mixing the - input bits thoroughly. - - If you want to obfuscate the hashed output, you can also pass a random `salt` - argument in the constructor. In that case, the layer will use the - [SipHash64](https://github.com/google/highwayhash) hash function, with - the `salt` value serving as additional input to the hash function. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - **Example (FarmHash64)** - - >>> layer = tf.keras.layers.Hashing(num_bins=3) - >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']] - >>> layer(inp) - - - **Example (FarmHash64) with a mask value** - - >>> layer = tf.keras.layers.Hashing(num_bins=3, mask_value='') - >>> inp = [['A'], ['B'], [''], ['C'], ['D']] - >>> layer(inp) - - - **Example (SipHash64)** - - >>> layer = tf.keras.layers.Hashing(num_bins=3, salt=[133, 137]) - >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']] - >>> layer(inp) - - - **Example (Siphash64 with a single integer, same as `salt=[133, 133]`)** - - >>> layer = tf.keras.layers.Hashing(num_bins=3, salt=133) - >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']] - >>> layer(inp) - - - Args: - num_bins: Number of hash bins. Note that this includes the `mask_value` bin, - so the effective number of bins is `(num_bins - 1)` if `mask_value` is - set. - mask_value: A value that represents masked inputs, which are mapped to - index 0. Defaults to None, meaning no mask term will be added and the - hashing will start at index 0. - salt: A single unsigned integer or None. - If passed, the hash function used will be SipHash64, with these values - used as an additional input (known as a "salt" in cryptography). - These should be non-zero. 
Defaults to `None` (in that - case, the FarmHash64 hash function is used). It also supports - tuple/list of 2 unsigned integer numbers, see reference paper for details. - output_mode: Specification for the output of the layer. Defaults to `"int"`. - Values can be `"int"`, `"one_hot"`, `"multi_hot"`, or `"count"` - configuring the layer as follows: - - `"int"`: Return the integer bin indices directly. - - `"one_hot"`: Encodes each individual element in the input into an - array the same size as `num_bins`, containing a 1 at the input's bin - index. If the last dimension is size 1, will encode on that dimension. - If the last dimension is not size 1, will append a new dimension for - the encoded output. - - `"multi_hot"`: Encodes each sample in the input into a single array - the same size as `num_bins`, containing a 1 for each bin index - index present in the sample. Treats the last dimension as the sample - dimension, if input shape is `(..., sample_length)`, output shape will - be `(..., num_tokens)`. - - `"count"`: As `"multi_hot"`, but the int array contains a count of the - number of times the bin index appeared in the sample. - sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, - and `"count"` output modes. If True, returns a `SparseTensor` instead of - a dense `Tensor`. Defaults to False. - **kwargs: Keyword arguments to construct a layer. - - Input shape: - A single or list of string, int32 or int64 `Tensor`, - `SparseTensor` or `RaggedTensor` of shape `(batch_size, ...,)` - - Output shape: - An int64 `Tensor`, `SparseTensor` or `RaggedTensor` of shape - `(batch_size, ...)`. If any input is `RaggedTensor` then output is - `RaggedTensor`, otherwise if any input is `SparseTensor` then output is - `SparseTensor`, otherwise the output is `Tensor`. - - Reference: - - [SipHash with salt](https://www.131002.net/siphash/siphash.pdf) - - """ - - def __init__(self, - num_bins, - mask_value=None, - salt=None, - output_mode='int', - sparse=False, - **kwargs): - if num_bins is None or num_bins <= 0: - raise ValueError( - f'The `num_bins` for `Hashing` cannot be `None` or non-positive ' - f'values. Received: num_bins={num_bins}.') - - # By default, output int64 when output_mode='int' and floats otherwise. - if 'dtype' not in kwargs or kwargs['dtype'] is None: - kwargs['dtype'] = tf.int64 if output_mode == INT else backend.floatx() - elif output_mode == 'int' and not tf.as_dtype(kwargs['dtype']).is_integer: - # Compat for when dtype was always floating and ignored by the layer. - kwargs['dtype'] = tf.int64 - - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell('Hashing').set(True) - - # Check dtype only after base layer parses it; dtype parsing is complex. - if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer: - input_dtype = kwargs['dtype'] - raise ValueError('When `output_mode="int"`, `dtype` should be an integer ' - f'type. Received: dtype={input_dtype}') - - # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT) - layer_utils.validate_string_arg( - output_mode, - allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT), - layer_name=self.__class__.__name__, - arg_name='output_mode') - - if sparse and output_mode == INT: - raise ValueError(f'`sparse` may only be true if `output_mode` is ' - f'`"one_hot"`, `"multi_hot"`, or `"count"`. 
' - f'Received: sparse={sparse} and ' - f'output_mode={output_mode}') - - self.num_bins = num_bins - self.mask_value = mask_value - self.strong_hash = True if salt is not None else False - self.output_mode = output_mode - self.sparse = sparse - self.salt = None - if salt is not None: - if isinstance(salt, (tuple, list)) and len(salt) == 2: - self.salt = salt - elif isinstance(salt, int): - self.salt = [salt, salt] - else: - raise ValueError( - f'The `salt` argument for `Hashing` can only be a tuple of size 2 ' - f'integers, or a single integer. Received: salt={salt}.') - - def call(self, inputs): - inputs = utils.ensure_tensor(inputs) - if isinstance(inputs, tf.SparseTensor): - indices = tf.SparseTensor( - indices=inputs.indices, - values=self._hash_values_to_bins(inputs.values), - dense_shape=inputs.dense_shape) - else: - indices = self._hash_values_to_bins(inputs) - return utils.encode_categorical_inputs( - indices, - output_mode=self.output_mode, - depth=self.num_bins, - sparse=self.sparse, - dtype=self.compute_dtype) - - def _hash_values_to_bins(self, values): - """Converts a non-sparse tensor of values to bin indices.""" - hash_bins = self.num_bins - mask = None - # If mask_value is set, the zeroth bin is reserved for it. - if self.mask_value is not None and hash_bins > 1: - hash_bins -= 1 - mask = tf.equal(values, self.mask_value) - # Convert all values to strings before hashing. - if values.dtype.is_integer: - values = tf.as_string(values) - # Hash the strings. - if self.strong_hash: - values = tf.strings.to_hash_bucket_strong( - values, hash_bins, name='hash', key=self.salt) - else: - values = tf.strings.to_hash_bucket_fast(values, hash_bins, name='hash') - if mask is not None: - values = tf.add(values, tf.ones_like(values)) - values = tf.where(mask, tf.zeros_like(values), values) - return values - - def compute_output_shape(self, input_shape): - return input_shape - - def compute_output_signature(self, input_spec): - output_shape = self.compute_output_shape(input_spec.shape) - if isinstance(input_spec, tf.SparseTensorSpec): - return tf.SparseTensorSpec(shape=output_shape, dtype=self.compute_dtype) - else: - return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype) - - def get_config(self): - config = super().get_config() - config.update({ - 'num_bins': self.num_bins, - 'salt': self.salt, - 'mask_value': self.mask_value, - 'output_mode': self.output_mode, - 'sparse': self.sparse, - }) - return config + """A preprocessing layer which hashes and bins categorical features. + + This layer transforms categorical inputs to hashed output. It element-wise + converts ints or strings to ints in a fixed range. The stable hash + function uses `tensorflow::ops::Fingerprint` to produce the same output + consistently across all platforms. + + This layer uses [FarmHash64](https://github.com/google/farmhash) by default, + which provides a consistent hashed output across different platforms and is + stable across invocations, regardless of device and context, by mixing the + input bits thoroughly. + + If you want to obfuscate the hashed output, you can also pass a random + `salt` argument in the constructor. In that case, the layer will use the + [SipHash64](https://github.com/google/highwayhash) hash function, with + the `salt` value serving as additional input to the hash function. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). 
+ + **Example (FarmHash64)** + + >>> layer = tf.keras.layers.Hashing(num_bins=3) + >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']] + >>> layer(inp) + + + **Example (FarmHash64) with a mask value** + + >>> layer = tf.keras.layers.Hashing(num_bins=3, mask_value='') + >>> inp = [['A'], ['B'], [''], ['C'], ['D']] + >>> layer(inp) + + + **Example (SipHash64)** + + >>> layer = tf.keras.layers.Hashing(num_bins=3, salt=[133, 137]) + >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']] + >>> layer(inp) + + + **Example (Siphash64 with a single integer, same as `salt=[133, 133]`)** + + >>> layer = tf.keras.layers.Hashing(num_bins=3, salt=133) + >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']] + >>> layer(inp) + + + Args: + num_bins: Number of hash bins. Note that this includes the `mask_value` + bin, so the effective number of bins is `(num_bins - 1)` if `mask_value` + is set. + mask_value: A value that represents masked inputs, which are mapped to + index 0. `None` means no mask term will be added and the + hashing will start at index 0. Defaults to `None`. + salt: A single unsigned integer or None. + If passed, the hash function used will be SipHash64, with these values + used as an additional input (known as a "salt" in cryptography). + These should be non-zero. If `None`, uses the FarmHash64 hash function. + It also supports a tuple/list of 2 unsigned integers; see the + reference paper for details. Defaults to `None`. + output_mode: Specification for the output of the layer. Values can be + `"int"`, `"one_hot"`, `"multi_hot"`, or + `"count"` configuring the layer as follows: + - `"int"`: Return the integer bin indices directly. + - `"one_hot"`: Encodes each individual element in the input into an + array the same size as `num_bins`, containing a 1 at the input's bin + index. If the last dimension is size 1, will encode on that + dimension. If the last dimension is not size 1, will append a new + dimension for the encoded output. + - `"multi_hot"`: Encodes each sample in the input into a single array + the same size as `num_bins`, containing a 1 for each bin index + present in the sample. Treats the last dimension as the sample + dimension; if input shape is `(..., sample_length)`, output shape + will be `(..., num_tokens)`. + - `"count"`: As `"multi_hot"`, but the int array contains a count of + the number of times the bin index appeared in the sample. + Defaults to `"int"`. + sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, + and `"count"` output modes. If `True`, returns a `SparseTensor` instead of + a dense `Tensor`. Defaults to `False`. + **kwargs: Keyword arguments to construct a layer. + + Input shape: + A single or list of string, int32 or int64 `Tensor`, + `SparseTensor` or `RaggedTensor` of shape `(batch_size, ...,)` + + Output shape: + An int64 `Tensor`, `SparseTensor` or `RaggedTensor` of shape + `(batch_size, ...)`. If any input is `RaggedTensor` then output is + `RaggedTensor`, otherwise if any input is `SparseTensor` then output is + `SparseTensor`, otherwise the output is `Tensor`. + + Reference: + - [SipHash with salt](https://www.131002.net/siphash/siphash.pdf) + + """ + + def __init__( + self, + num_bins, + mask_value=None, + salt=None, + output_mode="int", + sparse=False, + **kwargs, + ): + if num_bins is None or num_bins <= 0: + raise ValueError( + "The `num_bins` for `Hashing` cannot be `None` or " + f"non-positive values. Received: num_bins={num_bins}." + ) + + # By default, output int64 when output_mode='int' and floats otherwise. 
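The four `output_mode` values documented above differ only in how the integer bin indices are post-processed. A minimal sketch of the equivalent raw-TF encodings for a toy sample, assuming `num_bins=3`; the layer itself routes this through `utils.encode_categorical_inputs`:

    import tensorflow as tf

    num_bins = 3
    bins = tf.constant([[1, 0, 1]])  # "int" mode: bin indices for one sample

    per_element = tf.one_hot(bins, depth=num_bins)   # "one_hot": shape (1, 3, 3)
    multi_hot = tf.reduce_max(per_element, axis=-2)  # "multi_hot": [[1., 1., 0.]]
    count = tf.reduce_sum(per_element, axis=-2)      # "count":     [[1., 2., 0.]]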
+ if "dtype" not in kwargs or kwargs["dtype"] is None: + kwargs["dtype"] = ( + tf.int64 if output_mode == INT else backend.floatx() + ) + elif ( + output_mode == "int" and not tf.as_dtype(kwargs["dtype"]).is_integer + ): + # Compat for when dtype was always floating and ignored by the + # layer. + kwargs["dtype"] = tf.int64 + + super().__init__(**kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell("Hashing").set(True) + + # Check dtype only after base layer parses it; dtype parsing is complex. + if ( + output_mode == INT + and not tf.as_dtype(self.compute_dtype).is_integer + ): + input_dtype = kwargs["dtype"] + raise ValueError( + 'When `output_mode="int"`, `dtype` should be an integer ' + f"type. Received: dtype={input_dtype}" + ) + + # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT) + layer_utils.validate_string_arg( + output_mode, + allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT), + layer_name=self.__class__.__name__, + arg_name="output_mode", + ) + + if sparse and output_mode == INT: + raise ValueError( + "`sparse` may only be true if `output_mode` is " + '`"one_hot"`, `"multi_hot"`, or `"count"`. ' + f"Received: sparse={sparse} and " + f"output_mode={output_mode}" + ) + + self.num_bins = num_bins + self.mask_value = mask_value + self.strong_hash = True if salt is not None else False + self.output_mode = output_mode + self.sparse = sparse + self.salt = None + if salt is not None: + if isinstance(salt, (tuple, list)) and len(salt) == 2: + self.salt = salt + elif isinstance(salt, int): + self.salt = [salt, salt] + else: + raise ValueError( + "The `salt` argument for `Hashing` can only be a tuple of " + "size 2 integers, or a single integer. " + f"Received: salt={salt}." + ) + + def call(self, inputs): + inputs = utils.ensure_tensor(inputs) + if isinstance(inputs, tf.SparseTensor): + indices = tf.SparseTensor( + indices=inputs.indices, + values=self._hash_values_to_bins(inputs.values), + dense_shape=inputs.dense_shape, + ) + else: + indices = self._hash_values_to_bins(inputs) + return utils.encode_categorical_inputs( + indices, + output_mode=self.output_mode, + depth=self.num_bins, + sparse=self.sparse, + dtype=self.compute_dtype, + ) + + def _hash_values_to_bins(self, values): + """Converts a non-sparse tensor of values to bin indices.""" + hash_bins = self.num_bins + mask = None + # If mask_value is set, the zeroth bin is reserved for it. + if self.mask_value is not None and hash_bins > 1: + hash_bins -= 1 + mask = tf.equal(values, self.mask_value) + # Convert all values to strings before hashing. + if values.dtype.is_integer: + values = tf.as_string(values) + # Hash the strings. 
+ if self.strong_hash: + values = tf.strings.to_hash_bucket_strong( + values, hash_bins, name="hash", key=self.salt + ) + else: + values = tf.strings.to_hash_bucket_fast( + values, hash_bins, name="hash" + ) + if mask is not None: + values = tf.add(values, tf.ones_like(values)) + values = tf.where(mask, tf.zeros_like(values), values) + return values + + def compute_output_shape(self, input_shape): + return input_shape + + def compute_output_signature(self, input_spec): + output_shape = self.compute_output_shape(input_spec.shape) + if isinstance(input_spec, tf.SparseTensorSpec): + return tf.SparseTensorSpec( + shape=output_shape, dtype=self.compute_dtype + ) + else: + return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype) + + def get_config(self): + config = super().get_config() + config.update( + { + "num_bins": self.num_bins, + "salt": self.salt, + "mask_value": self.mask_value, + "output_mode": self.output_mode, + "sparse": self.sparse, + } + ) + return config diff --git a/keras/layers/preprocessing/hashing_distribution_test.py b/keras/layers/preprocessing/hashing_distribution_test.py index 9814b1d38f83..af6a1fab4c29 100644 --- a/keras/layers/preprocessing/hashing_distribution_test.py +++ b/keras/layers/preprocessing/hashing_distribution_test.py @@ -15,6 +15,8 @@ """Tests for keras.layers.preprocessing.hashing.""" +import numpy as np +import tensorflow.compat.v2 as tf import keras from keras import backend @@ -23,42 +25,49 @@ from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies + - strategy_combinations.parameter_server_strategies_single_worker + - strategy_combinations.parameter_server_strategies_multi_worker, - mode=["eager"])) -class HashingDistributionTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_strategy(self, strategy): - if (backend.is_tpu_strategy(strategy) and - not tf_test_utils.is_mlir_bridge_enabled()): - self.skipTest("TPU tests require MLIR bridge") + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies + + strategy_combinations.parameter_server_strategies_single_worker + + strategy_combinations.parameter_server_strategies_multi_worker, + mode=["eager"], + ) +) +class HashingDistributionTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_strategy(self, strategy): + if ( + backend.is_tpu_strategy(strategy) + and not tf_test_utils.is_mlir_bridge_enabled() + ): + self.skipTest("TPU tests require MLIR bridge") - input_data = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]]) - input_dataset = tf.data.Dataset.from_tensor_slices(input_data).batch( - 2, drop_remainder=True) - expected_output = [[0], [0], [1], [0]] + input_data = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]]) + input_dataset = tf.data.Dataset.from_tensor_slices(input_data).batch( + 2, drop_remainder=True + ) + expected_output = [[0], [0], [1], [0]] - tf.config.set_soft_device_placement(True) + 
tf.config.set_soft_device_placement(True) - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = hashing.Hashing(num_bins=2) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_dataset) - self.assertAllEqual(expected_output, output_dataset) + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = hashing.Hashing(num_bins=2) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_dataset) + self.assertAllEqual(expected_output, output_dataset) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/layers/preprocessing/hashing_test.py b/keras/layers/preprocessing/hashing_test.py index f7d018a4571e..7bb20dc1eab8 100644 --- a/keras/layers/preprocessing/hashing_test.py +++ b/keras/layers/preprocessing/hashing_test.py @@ -15,6 +15,9 @@ """Tests for hashing layer.""" import os + +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized import keras @@ -24,393 +27,444 @@ from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes(always_skip_v1=True) class HashingTest(test_combinations.TestCase): - - @parameterized.named_parameters( - ('list', list), - ('tuple', tuple), - ('numpy', np.array), - ('array_like', preprocessing_test_utils.ArrayLike), - ) - def test_tensor_like_inputs(self, data_fn): - input_data = data_fn([0, 1, 2, 3, 4]) - expected_output = [1, 0, 1, 0, 2] - - layer = hashing.Hashing(num_bins=3) - output_data = layer(input_data) - self.assertAllEqual(output_data, expected_output) - - def test_hash_single_bin(self): - layer = hashing.Hashing(num_bins=1) - inp = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']]) - output = layer(inp) - self.assertAllClose([[0], [0], [0], [0], [0]], output) - - def test_hash_dense_input_farmhash(self): - layer = hashing.Hashing(num_bins=2) - inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'], - ['skywalker']]) - output = layer(inp) - # Assert equal for hashed output that should be true on all platforms. - self.assertAllClose([[0], [0], [1], [0], [0]], output) - - def test_hash_dense_input_mask_value_farmhash(self): - empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='') - omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar') - inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'], - ['skywalker']]) - empty_mask_output = empty_mask_layer(inp) - omar_mask_output = omar_mask_layer(inp) - # Outputs should be one more than test_hash_dense_input_farmhash (the zeroth - # bin is now reserved for masks). - self.assertAllClose([[1], [1], [2], [1], [1]], empty_mask_output) - # 'omar' should map to 0. - self.assertAllClose([[0], [1], [2], [1], [1]], omar_mask_output) - - def test_hash_dense_list_input_farmhash(self): - layer = hashing.Hashing(num_bins=2) - inp = [['omar'], ['stringer'], ['marlo'], ['wire'], ['skywalker']] - output = layer(inp) - # Assert equal for hashed output that should be true on all platforms. 
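The distribution test above follows the usual pattern for running a preprocessing layer under `tf.distribute`: build the model inside the strategy scope, then call `predict` on a batched dataset. A sketch with `MirroredStrategy`, one of the strategies in the generated combinations:

    import numpy as np
    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        inputs = tf.keras.Input(shape=(None,), dtype=tf.string)
        outputs = tf.keras.layers.Hashing(num_bins=2)(inputs)
        model = tf.keras.Model(inputs, outputs)

    data = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]])
    dataset = tf.data.Dataset.from_tensor_slices(data).batch(2, drop_remainder=True)
    print(model.predict(dataset))  # [[0], [0], [1], [0]] per the test above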
- self.assertAllClose([[0], [0], [1], [0], [0]], output) - - inp = ['omar', 'stringer', 'marlo', 'wire', 'skywalker'] - output = layer(inp) - # Assert equal for hashed output that should be true on all platforms. - self.assertAllClose([0, 0, 1, 0, 0], output) - - def test_hash_dense_int_input_farmhash(self): - layer = hashing.Hashing(num_bins=3) - inp = np.asarray([[0], [1], [2], [3], [4]]) - output = layer(inp) - # Assert equal for hashed output that should be true on all platforms. - self.assertAllClose([[1], [0], [1], [0], [2]], output) - - def test_hash_dense_input_siphash(self): - layer = hashing.Hashing(num_bins=2, salt=[133, 137]) - inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'], - ['skywalker']]) - output = layer(inp) - # Assert equal for hashed output that should be true on all platforms. - # Note the result is different from FarmHash. - self.assertAllClose([[0], [1], [0], [1], [0]], output) - - layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137]) - output_2 = layer_2(inp) - # Note the result is different from (133, 137). - self.assertAllClose([[1], [0], [1], [0], [1]], output_2) - - def test_hash_dense_int_input_siphash(self): - layer = hashing.Hashing(num_bins=3, salt=[133, 137]) - inp = np.asarray([[0], [1], [2], [3], [4]]) - output = layer(inp) - # Assert equal for hashed output that should be true on all platforms. - self.assertAllClose([[1], [1], [2], [0], [1]], output) - - def test_hash_sparse_input_farmhash(self): - layer = hashing.Hashing(num_bins=2) - indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] - inp = tf.SparseTensor( - indices=indices, - values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'], - dense_shape=[3, 2]) - output = layer(inp) - self.assertAllClose(indices, output.indices) - self.assertAllClose([0, 0, 1, 0, 0], output.values) - - def test_hash_sparse_input_mask_value_farmhash(self): - empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='') - omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar') - indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] - inp = tf.SparseTensor( - indices=indices, - values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'], - dense_shape=[3, 2]) - empty_mask_output = empty_mask_layer(inp) - omar_mask_output = omar_mask_layer(inp) - self.assertAllClose(indices, omar_mask_output.indices) - self.assertAllClose(indices, empty_mask_output.indices) - # Outputs should be one more than test_hash_sparse_input_farmhash (the - # zeroth bin is now reserved for masks). - self.assertAllClose([1, 1, 2, 1, 1], empty_mask_output.values) - # 'omar' should map to 0. - self.assertAllClose([0, 1, 2, 1, 1], omar_mask_output.values) - - def test_hash_sparse_int_input_farmhash(self): - layer = hashing.Hashing(num_bins=3) - indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] - inp = tf.SparseTensor( - indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2]) - output = layer(inp) - self.assertAllClose(indices, output.indices) - self.assertAllClose([1, 0, 1, 0, 2], output.values) - - def test_hash_sparse_input_siphash(self): - layer = hashing.Hashing(num_bins=2, salt=[133, 137]) - indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] - inp = tf.SparseTensor( - indices=indices, - values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'], - dense_shape=[3, 2]) - output = layer(inp) - self.assertAllClose(output.indices, indices) - # The result should be same with test_hash_dense_input_siphash. 
- self.assertAllClose([0, 1, 0, 1, 0], output.values) - - layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137]) - output = layer_2(inp) - # The result should be same with test_hash_dense_input_siphash. - self.assertAllClose([1, 0, 1, 0, 1], output.values) - - def test_hash_sparse_int_input_siphash(self): - layer = hashing.Hashing(num_bins=3, salt=[133, 137]) - indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] - inp = tf.SparseTensor( - indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2]) - output = layer(inp) - self.assertAllClose(indices, output.indices) - self.assertAllClose([1, 1, 2, 0, 1], output.values) - - def test_hash_ragged_string_input_farmhash(self): - layer = hashing.Hashing(num_bins=2) - inp_data = tf.ragged.constant( - [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']], - dtype=tf.string) - out_data = layer(inp_data) - # Same hashed output as test_hash_sparse_input_farmhash - expected_output = [[0, 0, 1, 0], [1, 0, 0]] - self.assertAllEqual(expected_output, out_data) - - inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.string) - out_t = layer(inp_t) - model = training.Model(inputs=inp_t, outputs=out_t) - self.assertAllClose(out_data, model.predict(inp_data)) - - def test_hash_ragged_input_mask_value(self): - empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='') - omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar') - inp_data = tf.ragged.constant( - [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']], - dtype=tf.string) - empty_mask_output = empty_mask_layer(inp_data) - omar_mask_output = omar_mask_layer(inp_data) - # Outputs should be one more than test_hash_ragged_string_input_farmhash - # (the zeroth bin is now reserved for masks). - expected_output = [[1, 1, 2, 1], [2, 1, 1]] - self.assertAllClose(expected_output, empty_mask_output) - # 'omar' should map to 0. 
- expected_output = [[0, 1, 2, 1], [2, 1, 1]] - self.assertAllClose(expected_output, omar_mask_output) - - def test_hash_ragged_int_input_farmhash(self): - layer = hashing.Hashing(num_bins=3) - inp_data = tf.ragged.constant([[0, 1, 3, 4], [2, 1, 0]], dtype=tf.int64) - out_data = layer(inp_data) - # Same hashed output as test_hash_sparse_input_farmhash - expected_output = [[1, 0, 0, 2], [1, 0, 1]] - self.assertAllEqual(expected_output, out_data) - - inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.int64) - out_t = layer(inp_t) - model = training.Model(inputs=inp_t, outputs=out_t) - self.assertAllClose(out_data, model.predict(inp_data)) - - def test_hash_ragged_string_input_siphash(self): - layer = hashing.Hashing(num_bins=2, salt=[133, 137]) - inp_data = tf.ragged.constant( - [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']], - dtype=tf.string) - out_data = layer(inp_data) - # Same hashed output as test_hash_dense_input_siphash - expected_output = [[0, 1, 0, 1], [0, 0, 1]] - self.assertAllEqual(expected_output, out_data) - - inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.string) - out_t = layer(inp_t) - model = training.Model(inputs=inp_t, outputs=out_t) - self.assertAllClose(out_data, model.predict(inp_data)) - - layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137]) - out_data = layer_2(inp_data) - expected_output = [[1, 0, 1, 0], [1, 1, 0]] - self.assertAllEqual(expected_output, out_data) - - out_t = layer_2(inp_t) - model = training.Model(inputs=inp_t, outputs=out_t) - self.assertAllClose(out_data, model.predict(inp_data)) - - def test_hash_ragged_int_input_siphash(self): - layer = hashing.Hashing(num_bins=3, salt=[133, 137]) - inp_data = tf.ragged.constant([[0, 1, 3, 4], [2, 1, 0]], dtype=tf.int64) - out_data = layer(inp_data) - # Same hashed output as test_hash_sparse_input_farmhash - expected_output = [[1, 1, 0, 1], [2, 1, 1]] - self.assertAllEqual(expected_output, out_data) - - inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.int64) - out_t = layer(inp_t) - model = training.Model(inputs=inp_t, outputs=out_t) - self.assertAllClose(out_data, model.predict(inp_data)) - - def test_invalid_inputs(self): - with self.assertRaisesRegex(ValueError, 'cannot be `None`'): - _ = hashing.Hashing(num_bins=None) - with self.assertRaisesRegex(ValueError, 'cannot be `None`'): - _ = hashing.Hashing(num_bins=-1) - with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'): - _ = hashing.Hashing(num_bins=2, salt='string') - with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'): - _ = hashing.Hashing(num_bins=2, salt=[1]) - with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'): - _ = hashing.Hashing(num_bins=1, salt=tf.constant([133, 137])) - - def test_one_hot_output(self): - input_array = np.array([0, 1, 2, 3, 4]) - - expected_output = [[0., 1., 0.], - [1., 0., 0.], - [0., 1., 0.], - [1., 0., 0.], - [0., 0., 1.]] - expected_output_shape = [None, 3] - - inputs = keras.Input(shape=(1,), dtype='int32') - layer = hashing.Hashing(num_bins=3, output_mode='one_hot') - outputs = layer(inputs) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - - model = keras.Model(inputs, outputs) - output_data = model(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_multi_hot_output(self): - input_array = np.array([0, 1, 2, 3, 4]) - - expected_output = [1., 1., 1.] 
- expected_output_shape = [None, 3] - - inputs = keras.Input(shape=(3,), dtype='int32') - layer = hashing.Hashing(num_bins=3, output_mode='multi_hot') - outputs = layer(inputs) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - - model = keras.Model(inputs, outputs) - output_data = model(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_count_output(self): - input_array = np.array([0, 1, 2, 3, 4]) - - expected_output = [2., 2., 1.] - expected_output_shape = [None, 3] - - inputs = keras.Input(shape=(3,), dtype='int32') - layer = hashing.Hashing(num_bins=3, output_mode='count') - outputs = layer(inputs) - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - - model = keras.Model(inputs, outputs) - output_data = model(input_array) - self.assertAllEqual(expected_output, output_data) - - @parameterized.named_parameters( - ('int32', tf.int32), - ('int64', tf.int64), - ) - def test_output_dtype(self, dtype): - input_data = keras.Input(batch_size=16, shape=(4,), dtype='string') - layer = hashing.Hashing(num_bins=3, dtype=dtype) - output = layer(input_data) - self.assertAllEqual(output.dtype, dtype) - - def test_legacy_dtype_compat(self): - inputs = keras.Input(batch_size=16, shape=(4,), dtype='string') - layer = hashing.Hashing(num_bins=3, dtype='float32') - outputs = layer(inputs) - self.assertAllEqual(outputs.dtype, tf.int64) - # In TF1 we sometimes face an explicit dtype=None in the config. - layer = hashing.Hashing(num_bins=3, dtype=None) - outputs = layer(inputs) - self.assertAllEqual(outputs.dtype, tf.int64) - - @parameterized.named_parameters( - ('float32', tf.float32), - ('float64', tf.float64), - ) - def test_one_hot_output_dtype(self, dtype): - input_data = keras.Input(batch_size=16, shape=(1,), dtype='string') - layer = hashing.Hashing(num_bins=3, output_mode='one_hot', dtype=dtype) - output = layer(input_data) - self.assertAllEqual(output.dtype, dtype) - - def test_hash_compute_output_signature(self): - input_shape = tf.TensorShape([2, 3]) - input_spec = tf.TensorSpec(input_shape, tf.string) - layer = hashing.Hashing(num_bins=2) - output_spec = layer.compute_output_signature(input_spec) - self.assertEqual(output_spec.shape.dims, input_shape.dims) - self.assertEqual(output_spec.dtype, tf.int64) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = hashing.Hashing(num_bins=2, name='hashing') - config = layer.get_config() - layer_1 = hashing.Hashing.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_saved_model(self): - input_data = np.array(['omar', 'stringer', 'marlo', 'wire', 'skywalker']) - - inputs = keras.Input(shape=(None,), dtype=tf.string) - outputs = hashing.Hashing(num_bins=100)(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - original_output_data = model(input_data) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), 'tf_keras_saved_model') - model.save(output_path, save_format='tf') - loaded_model = keras.models.load_model(output_path) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. 
- new_output_data = loaded_model(input_data) - self.assertAllClose(new_output_data, original_output_data) - - @parameterized.named_parameters( - ( - 'list_input', - [1, 2, 3], - [1, 1, 1], - ), - ( - 'list_input_2d', - [[1], [2], [3]], - [[1], [1], [1]], - ), - ( - 'list_input_2d_multiple', - [[1, 2], [2, 3], [3, 4]], - [[1, 1], [1, 1], [1, 1]], - ), - ( - 'list_input_3d', - [[[1], [2]], [[2], [3]], [[3], [4]]], - [[[1], [1]], [[1], [1]], [[1], [1]]], - ), - ) - def test_hash_list_input(self, input_data, expected): - layer = hashing.Hashing(num_bins=2) - out_data = layer(input_data) - self.assertAllEqual(expected, out_data.numpy().tolist()) - - -if __name__ == '__main__': - tf.test.main() + @parameterized.named_parameters( + ("list", list), + ("tuple", tuple), + ("numpy", np.array), + ("array_like", preprocessing_test_utils.ArrayLike), + ) + def test_tensor_like_inputs(self, data_fn): + input_data = data_fn([0, 1, 2, 3, 4]) + expected_output = [1, 0, 1, 0, 2] + + layer = hashing.Hashing(num_bins=3) + output_data = layer(input_data) + self.assertAllEqual(output_data, expected_output) + + def test_hash_single_bin(self): + layer = hashing.Hashing(num_bins=1) + inp = np.asarray([["A"], ["B"], ["C"], ["D"], ["E"]]) + output = layer(inp) + self.assertAllClose([[0], [0], [0], [0], [0]], output) + + def test_hash_dense_input_farmhash(self): + layer = hashing.Hashing(num_bins=2) + inp = np.asarray( + [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]] + ) + output = layer(inp) + # Assert equal for hashed output that should be true on all platforms. + self.assertAllClose([[0], [0], [1], [0], [0]], output) + + def test_hash_dense_input_mask_value_farmhash(self): + empty_mask_layer = hashing.Hashing(num_bins=3, mask_value="") + omar_mask_layer = hashing.Hashing(num_bins=3, mask_value="omar") + inp = np.asarray( + [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]] + ) + empty_mask_output = empty_mask_layer(inp) + omar_mask_output = omar_mask_layer(inp) + # Outputs should be one more than test_hash_dense_input_farmhash (the + # zeroth bin is now reserved for masks). + self.assertAllClose([[1], [1], [2], [1], [1]], empty_mask_output) + # 'omar' should map to 0. + self.assertAllClose([[0], [1], [2], [1], [1]], omar_mask_output) + + def test_hash_dense_list_input_farmhash(self): + layer = hashing.Hashing(num_bins=2) + inp = [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]] + output = layer(inp) + # Assert equal for hashed output that should be true on all platforms. + self.assertAllClose([[0], [0], [1], [0], [0]], output) + + inp = ["omar", "stringer", "marlo", "wire", "skywalker"] + output = layer(inp) + # Assert equal for hashed output that should be true on all platforms. + self.assertAllClose([0, 0, 1, 0, 0], output) + + def test_hash_dense_int_input_farmhash(self): + layer = hashing.Hashing(num_bins=3) + inp = np.asarray([[0], [1], [2], [3], [4]]) + output = layer(inp) + # Assert equal for hashed output that should be true on all platforms. + self.assertAllClose([[1], [0], [1], [0], [2]], output) + + def test_hash_dense_input_siphash(self): + layer = hashing.Hashing(num_bins=2, salt=[133, 137]) + inp = np.asarray( + [["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]] + ) + output = layer(inp) + # Assert equal for hashed output that should be true on all platforms. + # Note the result is different from FarmHash. 
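The SipHash assertions in this block pin down a useful property: with a fixed salt the bin assignments are deterministic, while different salts permute them. Sketched directly from the expected values in these tests:

    import tensorflow as tf

    inp = tf.constant([["omar"], ["stringer"], ["marlo"], ["wire"], ["skywalker"]])

    layer_a = tf.keras.layers.Hashing(num_bins=2, salt=[133, 137])
    layer_b = tf.keras.layers.Hashing(num_bins=2, salt=[211, 137])

    # Same inputs, different salts: per the tests, layer_a yields
    # [[0], [1], [0], [1], [0]] and layer_b yields [[1], [0], [1], [0], [1]].
    print(layer_a(inp).numpy().ravel(), layer_b(inp).numpy().ravel())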
+ self.assertAllClose([[0], [1], [0], [1], [0]], output) + + layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137]) + output_2 = layer_2(inp) + # Note the result is different from (133, 137). + self.assertAllClose([[1], [0], [1], [0], [1]], output_2) + + def test_hash_dense_int_input_siphash(self): + layer = hashing.Hashing(num_bins=3, salt=[133, 137]) + inp = np.asarray([[0], [1], [2], [3], [4]]) + output = layer(inp) + # Assert equal for hashed output that should be true on all platforms. + self.assertAllClose([[1], [1], [2], [0], [1]], output) + + def test_hash_sparse_input_farmhash(self): + layer = hashing.Hashing(num_bins=2) + indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] + inp = tf.SparseTensor( + indices=indices, + values=["omar", "stringer", "marlo", "wire", "skywalker"], + dense_shape=[3, 2], + ) + output = layer(inp) + self.assertAllClose(indices, output.indices) + self.assertAllClose([0, 0, 1, 0, 0], output.values) + + def test_hash_sparse_input_mask_value_farmhash(self): + empty_mask_layer = hashing.Hashing(num_bins=3, mask_value="") + omar_mask_layer = hashing.Hashing(num_bins=3, mask_value="omar") + indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] + inp = tf.SparseTensor( + indices=indices, + values=["omar", "stringer", "marlo", "wire", "skywalker"], + dense_shape=[3, 2], + ) + empty_mask_output = empty_mask_layer(inp) + omar_mask_output = omar_mask_layer(inp) + self.assertAllClose(indices, omar_mask_output.indices) + self.assertAllClose(indices, empty_mask_output.indices) + # Outputs should be one more than test_hash_sparse_input_farmhash (the + # zeroth bin is now reserved for masks). + self.assertAllClose([1, 1, 2, 1, 1], empty_mask_output.values) + # 'omar' should map to 0. + self.assertAllClose([0, 1, 2, 1, 1], omar_mask_output.values) + + def test_hash_sparse_int_input_farmhash(self): + layer = hashing.Hashing(num_bins=3) + indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] + inp = tf.SparseTensor( + indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2] + ) + output = layer(inp) + self.assertAllClose(indices, output.indices) + self.assertAllClose([1, 0, 1, 0, 2], output.values) + + def test_hash_sparse_input_siphash(self): + layer = hashing.Hashing(num_bins=2, salt=[133, 137]) + indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] + inp = tf.SparseTensor( + indices=indices, + values=["omar", "stringer", "marlo", "wire", "skywalker"], + dense_shape=[3, 2], + ) + output = layer(inp) + self.assertAllClose(output.indices, indices) + # The result should be the same as test_hash_dense_input_siphash. + self.assertAllClose([0, 1, 0, 1, 0], output.values) + + layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137]) + output = layer_2(inp) + # The result should be the same as test_hash_dense_input_siphash. 
+ self.assertAllClose([1, 0, 1, 0, 1], output.values) + + def test_hash_sparse_int_input_siphash(self): + layer = hashing.Hashing(num_bins=3, salt=[133, 137]) + indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]] + inp = tf.SparseTensor( + indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2] + ) + output = layer(inp) + self.assertAllClose(indices, output.indices) + self.assertAllClose([1, 1, 2, 0, 1], output.values) + + def test_hash_ragged_string_input_farmhash(self): + layer = hashing.Hashing(num_bins=2) + inp_data = tf.ragged.constant( + [ + ["omar", "stringer", "marlo", "wire"], + ["marlo", "skywalker", "wire"], + ], + dtype=tf.string, + ) + out_data = layer(inp_data) + # Same hashed output as test_hash_sparse_input_farmhash + expected_output = [[0, 0, 1, 0], [1, 0, 0]] + self.assertAllEqual(expected_output, out_data) + + inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.string) + out_t = layer(inp_t) + model = training.Model(inputs=inp_t, outputs=out_t) + self.assertAllClose(out_data, model.predict(inp_data)) + + def test_hash_ragged_input_mask_value(self): + empty_mask_layer = hashing.Hashing(num_bins=3, mask_value="") + omar_mask_layer = hashing.Hashing(num_bins=3, mask_value="omar") + inp_data = tf.ragged.constant( + [ + ["omar", "stringer", "marlo", "wire"], + ["marlo", "skywalker", "wire"], + ], + dtype=tf.string, + ) + empty_mask_output = empty_mask_layer(inp_data) + omar_mask_output = omar_mask_layer(inp_data) + # Outputs should be one more than test_hash_ragged_string_input_farmhash + # (the zeroth bin is now reserved for masks). + expected_output = [[1, 1, 2, 1], [2, 1, 1]] + self.assertAllClose(expected_output, empty_mask_output) + # 'omar' should map to 0. + expected_output = [[0, 1, 2, 1], [2, 1, 1]] + self.assertAllClose(expected_output, omar_mask_output) + + def test_hash_ragged_int_input_farmhash(self): + layer = hashing.Hashing(num_bins=3) + inp_data = tf.ragged.constant([[0, 1, 3, 4], [2, 1, 0]], dtype=tf.int64) + out_data = layer(inp_data) + # Same hashed output as test_hash_sparse_int_input_farmhash + expected_output = [[1, 0, 0, 2], [1, 0, 1]] + self.assertAllEqual(expected_output, out_data) + + inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.int64) + out_t = layer(inp_t) + model = training.Model(inputs=inp_t, outputs=out_t) + self.assertAllClose(out_data, model.predict(inp_data)) + + def test_hash_ragged_string_input_siphash(self): + layer = hashing.Hashing(num_bins=2, salt=[133, 137]) + inp_data = tf.ragged.constant( + [ + ["omar", "stringer", "marlo", "wire"], + ["marlo", "skywalker", "wire"], + ], + dtype=tf.string, + ) + out_data = layer(inp_data) + # Same hashed output as test_hash_dense_input_siphash + expected_output = [[0, 1, 0, 1], [0, 0, 1]] + self.assertAllEqual(expected_output, out_data) + + inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.string) + out_t = layer(inp_t) + model = training.Model(inputs=inp_t, outputs=out_t) + self.assertAllClose(out_data, model.predict(inp_data)) + + layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137]) + out_data = layer_2(inp_data) + expected_output = [[1, 0, 1, 0], [1, 1, 0]] + self.assertAllEqual(expected_output, out_data) + + out_t = layer_2(inp_t) + model = training.Model(inputs=inp_t, outputs=out_t) + self.assertAllClose(out_data, model.predict(inp_data)) + + def test_hash_ragged_int_input_siphash(self): + layer = hashing.Hashing(num_bins=3, salt=[133, 137]) + inp_data = tf.ragged.constant([[0, 1, 3, 4], [2, 1, 0]], dtype=tf.int64) + out_data = layer(inp_data) + # Same 
hashed output as test_hash_sparse_int_input_siphash + expected_output = [[1, 1, 0, 1], [2, 1, 1]] + self.assertAllEqual(expected_output, out_data) + + inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=tf.int64) + out_t = layer(inp_t) + model = training.Model(inputs=inp_t, outputs=out_t) + self.assertAllClose(out_data, model.predict(inp_data)) + + def test_invalid_inputs(self): + with self.assertRaisesRegex(ValueError, "cannot be `None`"): + _ = hashing.Hashing(num_bins=None) + with self.assertRaisesRegex(ValueError, "cannot be `None`"): + _ = hashing.Hashing(num_bins=-1) + with self.assertRaisesRegex( + ValueError, "can only be a tuple of size 2" + ): + _ = hashing.Hashing(num_bins=2, salt="string") + with self.assertRaisesRegex( + ValueError, "can only be a tuple of size 2" + ): + _ = hashing.Hashing(num_bins=2, salt=[1]) + with self.assertRaisesRegex( + ValueError, "can only be a tuple of size 2" + ): + _ = hashing.Hashing(num_bins=1, salt=tf.constant([133, 137])) + + def test_one_hot_output(self): + input_array = np.array([0, 1, 2, 3, 4]) + + expected_output = [ + [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 1.0], + ] + expected_output_shape = [None, 3] + + inputs = keras.Input(shape=(1,), dtype="int32") + layer = hashing.Hashing(num_bins=3, output_mode="one_hot") + outputs = layer(inputs) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + + model = keras.Model(inputs, outputs) + output_data = model(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_multi_hot_output(self): + input_array = np.array([0, 1, 2, 3, 4]) + + expected_output = [1.0, 1.0, 1.0] + expected_output_shape = [None, 3] + + inputs = keras.Input(shape=(3,), dtype="int32") + layer = hashing.Hashing(num_bins=3, output_mode="multi_hot") + outputs = layer(inputs) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + + model = keras.Model(inputs, outputs) + output_data = model(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_count_output(self): + input_array = np.array([0, 1, 2, 3, 4]) + + expected_output = [2.0, 2.0, 1.0] + expected_output_shape = [None, 3] + + inputs = keras.Input(shape=(3,), dtype="int32") + layer = hashing.Hashing(num_bins=3, output_mode="count") + outputs = layer(inputs) + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + + model = keras.Model(inputs, outputs) + output_data = model(input_array) + self.assertAllEqual(expected_output, output_data) + + @parameterized.named_parameters( + ("int32", tf.int32), + ("int64", tf.int64), + ) + def test_output_dtype(self, dtype): + input_data = keras.Input(batch_size=16, shape=(4,), dtype="string") + layer = hashing.Hashing(num_bins=3, dtype=dtype) + output = layer(input_data) + self.assertAllEqual(output.dtype, dtype) + + def test_legacy_dtype_compat(self): + inputs = keras.Input(batch_size=16, shape=(4,), dtype="string") + layer = hashing.Hashing(num_bins=3, dtype="float32") + outputs = layer(inputs) + self.assertAllEqual(outputs.dtype, tf.int64) + # In TF1 we sometimes face an explicit dtype=None in the config. 
+ layer = hashing.Hashing(num_bins=3, dtype=None) + outputs = layer(inputs) + self.assertAllEqual(outputs.dtype, tf.int64) + + @parameterized.named_parameters( + ("float32", tf.float32), + ("float64", tf.float64), + ) + def test_one_hot_output_dtype(self, dtype): + input_data = keras.Input(batch_size=16, shape=(1,), dtype="string") + layer = hashing.Hashing(num_bins=3, output_mode="one_hot", dtype=dtype) + output = layer(input_data) + self.assertAllEqual(output.dtype, dtype) + + def test_hash_compute_output_signature(self): + input_shape = tf.TensorShape([2, 3]) + input_spec = tf.TensorSpec(input_shape, tf.string) + layer = hashing.Hashing(num_bins=2) + output_spec = layer.compute_output_signature(input_spec) + self.assertEqual(output_spec.shape.dims, input_shape.dims) + self.assertEqual(output_spec.dtype, tf.int64) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): + layer = hashing.Hashing(num_bins=2, name="hashing") + config = layer.get_config() + layer_1 = hashing.Hashing.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_saved_model(self): + input_data = np.array( + ["omar", "stringer", "marlo", "wire", "skywalker"] + ) + + inputs = keras.Input(shape=(None,), dtype=tf.string) + outputs = hashing.Hashing(num_bins=100)(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + original_output_data = model(input_data) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + loaded_model = keras.models.load_model(output_path) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_data = loaded_model(input_data) + self.assertAllClose(new_output_data, original_output_data) + + @test_utils.run_v2_only + def test_save_keras_v3(self): + input_data = np.array( + ["omar", "stringer", "marlo", "wire", "skywalker"] + ) + + inputs = keras.Input(shape=(None,), dtype=tf.string) + outputs = hashing.Hashing(num_bins=100)(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + original_output_data = model(input_data) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_model.keras") + model.save(output_path, save_format="keras_v3") + loaded_model = keras.models.load_model(output_path) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. 
+ new_output_data = loaded_model(input_data) + self.assertAllClose(new_output_data, original_output_data) + + @parameterized.named_parameters( + ( + "list_input", + [1, 2, 3], + [1, 1, 1], + ), + ( + "list_input_2d", + [[1], [2], [3]], + [[1], [1], [1]], + ), + ( + "list_input_2d_multiple", + [[1, 2], [2, 3], [3, 4]], + [[1, 1], [1, 1], [1, 1]], + ), + ( + "list_input_3d", + [[[1], [2]], [[2], [3]], [[3], [4]]], + [[[1], [1]], [[1], [1]], [[1], [1]]], + ), + ) + def test_hash_list_input(self, input_data, expected): + layer = hashing.Hashing(num_bins=2) + out_data = layer(input_data) + self.assertAllEqual(expected, out_data.numpy().tolist()) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/preprocessing/image_preprocessing.py b/keras/layers/preprocessing/image_preprocessing.py index cf8416c5ec18..b2c74b9f65eb 100644 --- a/keras/layers/preprocessing/image_preprocessing.py +++ b/keras/layers/preprocessing/image_preprocessing.py @@ -14,8 +14,9 @@ # ============================================================================== """Keras image preprocessing layers.""" -# pylint: disable=g-classes-have-attributes - +import numpy as np +import tensorflow.compat.v2 as tf +from tensorflow.python.util.tf_export import keras_export from keras import backend from keras.engine import base_layer @@ -23,1903 +24,1742 @@ from keras.layers.preprocessing import preprocessing_utils as utils from keras.utils import image_utils from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.python.ops import stateless_random_ops -from tensorflow.python.util.tf_export import keras_export -from tensorflow.tools.docs import doc_controls H_AXIS = -3 W_AXIS = -2 -IMAGES = 'images' -LABELS = 'labels' -TARGETS = 'targets' -BOUNDING_BOXES = 'bounding_boxes' - def check_fill_mode_and_interpolation(fill_mode, interpolation): - if fill_mode not in {'reflect', 'wrap', 'constant', 'nearest'}: - raise NotImplementedError( - 'Unknown `fill_mode` {}. Only `reflect`, `wrap`, ' - '`constant` and `nearest` are supported.'.format(fill_mode)) - if interpolation not in {'nearest', 'bilinear'}: - raise NotImplementedError('Unknown `interpolation` {}. Only `nearest` and ' - '`bilinear` are supported.'.format(interpolation)) + if fill_mode not in {"reflect", "wrap", "constant", "nearest"}: + raise NotImplementedError( + f"Unknown `fill_mode` {fill_mode}. Only `reflect`, `wrap`, " + "`constant` and `nearest` are supported." + ) + if interpolation not in {"nearest", "bilinear"}: + raise NotImplementedError( + f"Unknown `interpolation` {interpolation}. Only `nearest` and " + "`bilinear` are supported." + ) + + +@keras_export( + "keras.layers.Resizing", "keras.layers.experimental.preprocessing.Resizing" +) +class Resizing(base_layer.Layer): + """A preprocessing layer which resizes images. + This layer resizes an image input to a target height and width. The input + should be a 4D (batched) or 3D (unbatched) tensor in `"channels_last"` + format. Input pixel values can be of any range + (e.g. `[0., 1.)` or `[0, 255]`) and of integer or floating point dtype. + By default, the layer will output floats. -@keras_export('keras.layers.Resizing', - 'keras.layers.experimental.preprocessing.Resizing') -class Resizing(base_layer.Layer): - """A preprocessing layer which resizes images. - - This layer resizes an image input to a target height and width. The input - should be a 4D (batched) or 3D (unbatched) tensor in `"channels_last"` format. 
- Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and of - interger or floating point dtype. By default, the layer will output floats. - - This layer can be called on tf.RaggedTensor batches of input images of - distinct sizes, and will resize the outputs to dense tensors of uniform size. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - height: Integer, the height of the output shape. - width: Integer, the width of the output shape. - interpolation: String, the interpolation method. Defaults to `"bilinear"`. - Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`, `"lanczos3"`, - `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`. - crop_to_aspect_ratio: If True, resize the images without aspect - ratio distortion. When the original aspect ratio differs from the target - aspect ratio, the output image will be cropped so as to return the largest - possible window in the image (of size `(height, width)`) that matches - the target aspect ratio. By default (`crop_to_aspect_ratio=False`), - aspect ratio may not be preserved. - """ - - def __init__(self, - height, - width, - interpolation='bilinear', - crop_to_aspect_ratio=False, - **kwargs): - self.height = height - self.width = width - self.interpolation = interpolation - self.crop_to_aspect_ratio = crop_to_aspect_ratio - self._interpolation_method = image_utils.get_interpolation(interpolation) - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell('Resizing').set(True) - - def call(self, inputs): - # tf.image.resize will always output float32 and operate more efficiently on - # float32 unless interpolation is nearest, in which case ouput type matches - # input type. - if self.interpolation == 'nearest': - input_dtype = self.compute_dtype - else: - input_dtype = tf.float32 - inputs = utils.ensure_tensor(inputs, dtype=input_dtype) - size = [self.height, self.width] - if self.crop_to_aspect_ratio: - def resize_to_aspect(x): - if tf_utils.is_ragged(inputs): - x = x.to_tensor() - return image_utils.smart_resize( - x, - size=size, - interpolation=self._interpolation_method) - - if tf_utils.is_ragged(inputs): - size_as_shape = tf.TensorShape(size) - shape = size_as_shape + inputs.shape[-1:] - spec = tf.TensorSpec(shape, input_dtype) - outputs = tf.map_fn(resize_to_aspect, inputs, fn_output_signature=spec) - else: - outputs = resize_to_aspect(inputs) - else: - outputs = tf.image.resize( - inputs, - size=size, - method=self._interpolation_method) - return tf.cast(outputs, self.compute_dtype) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - input_shape[H_AXIS] = self.height - input_shape[W_AXIS] = self.width - return tf.TensorShape(input_shape) - - def get_config(self): - config = { - 'height': self.height, - 'width': self.width, - 'interpolation': self.interpolation, - 'crop_to_aspect_ratio': self.crop_to_aspect_ratio, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.CenterCrop', - 'keras.layers.experimental.preprocessing.CenterCrop') + This layer can be called on tf.RaggedTensor batches of input images of + distinct sizes, and will resize the outputs to dense tensors of uniform + size. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). 
+ + Args: + height: Integer, the height of the output shape. + width: Integer, the width of the output shape. + interpolation: String, the interpolation method. + Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`, + `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`. + Defaults to `"bilinear"`. + crop_to_aspect_ratio: If True, resize the images without aspect + ratio distortion. When the original aspect ratio differs + from the target aspect ratio, the output image will be + cropped so as to return the + largest possible window in the image (of size `(height, width)`) + that matches the target aspect ratio. By default + (`crop_to_aspect_ratio=False`), aspect ratio may not be preserved. + """ + + def __init__( + self, + height, + width, + interpolation="bilinear", + crop_to_aspect_ratio=False, + **kwargs, + ): + self.height = height + self.width = width + self.interpolation = interpolation + self.crop_to_aspect_ratio = crop_to_aspect_ratio + self._interpolation_method = image_utils.get_interpolation( + interpolation + ) + super().__init__(**kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell("Resizing").set(True) + + def call(self, inputs): + # tf.image.resize will always output float32 + # and operate more efficiently on float32 + # unless interpolation is nearest, in which case output type matches + # input type. + if self.interpolation == "nearest": + input_dtype = self.compute_dtype + else: + input_dtype = tf.float32 + inputs = convert_inputs(inputs, dtype=input_dtype) + size = [self.height, self.width] + if self.crop_to_aspect_ratio: + + def resize_to_aspect(x): + if tf_utils.is_ragged(inputs): + x = x.to_tensor() + return image_utils.smart_resize( + x, size=size, interpolation=self._interpolation_method + ) + + if tf_utils.is_ragged(inputs): + size_as_shape = tf.TensorShape(size) + shape = size_as_shape + inputs.shape[-1:] + spec = tf.TensorSpec(shape, input_dtype) + outputs = tf.map_fn( + resize_to_aspect, inputs, fn_output_signature=spec + ) + else: + outputs = resize_to_aspect(inputs) + else: + outputs = tf.image.resize( + inputs, size=size, method=self._interpolation_method + ) + return tf.cast(outputs, self.compute_dtype) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + input_shape[H_AXIS] = self.height + input_shape[W_AXIS] = self.width + return tf.TensorShape(input_shape) + + def get_config(self): + config = { + "height": self.height, + "width": self.width, + "interpolation": self.interpolation, + "crop_to_aspect_ratio": self.crop_to_aspect_ratio, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export( + "keras.layers.CenterCrop", + "keras.layers.experimental.preprocessing.CenterCrop", +) class CenterCrop(base_layer.Layer): - """A preprocessing layer which crops images. - - This layers crops the central portion of the images to a target size. If an - image is smaller than the target size, it will be resized and cropped so as to - return the largest possible window in the image that matches the target aspect - ratio. - - Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and - of interger or floating point dtype. By default, the layer will output floats. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).
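A minimal sketch of the reformatted `Resizing` layer above, for reviewers (the input shape and the 224x224 target are assumptions for illustration):

```python
import numpy as np
import tensorflow as tf

batch = np.random.random((4, 300, 200, 3)).astype("float32")

# Plain resize: may distort the aspect ratio.
resize = tf.keras.layers.Resizing(height=224, width=224)
print(resize(batch).shape)  # (4, 224, 224, 3)

# crop_to_aspect_ratio=True first crops the largest window matching the
# target aspect ratio, then resizes, so the image is not distorted.
resize_crop = tf.keras.layers.Resizing(224, 224, crop_to_aspect_ratio=True)
print(resize_crop(batch).shape)  # (4, 224, 224, 3)
```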
- - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., target_height, target_width, channels)`. - - If the input height/width is even and the target height/width is odd (or - inversely), the input image is left-padded by 1 pixel. - - Args: - height: Integer, the height of the output shape. - width: Integer, the width of the output shape. - """ - - def __init__(self, height, width, **kwargs): - self.height = height - self.width = width - super().__init__(**kwargs, autocast=False) - base_preprocessing_layer.keras_kpl_gauge.get_cell('CenterCrop').set(True) - - def call(self, inputs): - inputs = utils.ensure_tensor(inputs, self.compute_dtype) - input_shape = tf.shape(inputs) - h_diff = input_shape[H_AXIS] - self.height - w_diff = input_shape[W_AXIS] - self.width - - def center_crop(): - h_start = tf.cast(h_diff / 2, tf.int32) - w_start = tf.cast(w_diff / 2, tf.int32) - return tf.image.crop_to_bounding_box(inputs, h_start, w_start, - self.height, self.width) - - def upsize(): - outputs = image_utils.smart_resize(inputs, [self.height, self.width]) - # smart_resize will always output float32, so we need to re-cast. - return tf.cast(outputs, self.compute_dtype) - - return tf.cond( - tf.reduce_all((h_diff >= 0, w_diff >= 0)), center_crop, upsize) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - input_shape[H_AXIS] = self.height - input_shape[W_AXIS] = self.width - return tf.TensorShape(input_shape) - - def get_config(self): - config = { - 'height': self.height, - 'width': self.width, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.__internal__.layers.BaseImageAugmentationLayer') -class BaseImageAugmentationLayer(base_layer.BaseRandomLayer): - """Abstract base layer for image augmentaion. - - This layer contains base functionalities for preprocessing layers which - augment image related data, eg. image and in future, label and bounding boxes. - The subclasses could avoid making certain mistakes and reduce code - duplications. - - This layer requires you to implement one method: `augment_image()`, which - augments one single image during the training. There are a few additional - methods that you can implement for added functionality on the layer: - - `augment_label()`, which handles label augmentation if the layer supports - that. - - `augment_bounding_boxes()`, which handles the bounding box augmentation, if the - layer supports that. - - `get_random_transformation()`, which should produce a random transformation - setting. The tranformation object, which could be any type, will be passed to - `augment_image`, `augment_label` and `augment_bounding_boxes`, to coodinate - the randomness behavior, eg, in the RandomFlip layer, the image and - bounding_boxes should be changed in the same way. - - The `call()` method support two formats of inputs: - 1. Single image tensor with 3D (HWC) or 4D (NHWC) format. - 2. A dict of tensors with stable keys. The supported keys are: - `"images"`, `"labels"` and `"bounding_boxes"` at the moment. We might add - more keys in future when we support more types of augmentation. - - The output of the `call()` will be in two formats, which will be the same - structure as the inputs. 
- - The `call()` will handle the logic detecting the training/inference - mode, unpack the inputs, forward to the correct function, and pack the output - back to the same structure as the inputs. - - By default the `call()` method leverages the `tf.vectorized_map()` function. - Auto-vectorization can be disabled by setting `self.auto_vectorize = False` - in your `__init__()` method. When disabled, `call()` instead relies - on `tf.map_fn()`. For example: - - ```python - class SubclassLayer(BaseImageAugmentationLayer): - def __init__(self): - super().__init__() - self.auto_vectorize = False - ``` - - Example: - - ```python - class RandomContrast(BaseImageAugmentationLayer): - - def __init__(self, factor=(0.5, 1.5), **kwargs): - super().__init__(**kwargs) - self._factor = factor - - def augment_image(self, image, transformation): - random_factor = tf.random.uniform([], self._factor[0], self._factor[1]) - mean = tf.math.reduced_mean(inputs, axis=-1, keep_dim=True) - return (inputs - mean) * random_factor + mean - ``` - - Note that since the randomness is also a common functionnality, this layer - also includes a tf.keras.backend.RandomGenerator, which can be used to produce - the random numbers. The random number generator is stored in the - `self._random_generator` attribute. - """ - - def __init__(self, rate=1.0, seed=None, **kwargs): - super().__init__(seed=seed, **kwargs) - self.rate = rate - - @property - def auto_vectorize(self): - """Control whether automatic vectorization occurs. - - By default the `call()` method leverages the `tf.vectorized_map()` function. - Auto-vectorization can be disabled by setting `self.auto_vectorize = False` - in your `__init__()` method. When disabled, `call()` instead relies - on `tf.map_fn()`. For example: + """A preprocessing layer which crops images. - ```python - class SubclassLayer(BaseImageAugmentationLayer): - def __init__(self): - super().__init__() - self.auto_vectorize = False - ``` + This layer crops the central portion of the images to a target size. If an + image is smaller than the target size, it will be resized and cropped + so as to return the largest possible window in the image that matches + the target aspect ratio. + + Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and + of integer or floating point dtype. + By default, the layer will output floats. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., target_height, target_width, channels)`. + + If the input height/width is even and the target height/width is odd (or + inversely), the input image is left-padded by 1 pixel. + + Args: + height: Integer, the height of the output shape. + width: Integer, the width of the output shape.
""" - return getattr(self, '_auto_vectorize', True) - @auto_vectorize.setter - def auto_vectorize(self, auto_vectorize): - self._auto_vectorize = auto_vectorize + def __init__(self, height, width, **kwargs): + self.height = height + self.width = width + super().__init__(**kwargs, autocast=False) + base_preprocessing_layer.keras_kpl_gauge.get_cell("CenterCrop").set( + True + ) + + def call(self, inputs): + inputs = convert_inputs(inputs, self.compute_dtype) + input_shape = tf.shape(inputs) + h_diff = input_shape[H_AXIS] - self.height + w_diff = input_shape[W_AXIS] - self.width + + def center_crop(): + h_start = tf.cast(h_diff / 2, tf.int32) + w_start = tf.cast(w_diff / 2, tf.int32) + return tf.image.crop_to_bounding_box( + inputs, h_start, w_start, self.height, self.width + ) + + def upsize(): + outputs = image_utils.smart_resize( + inputs, [self.height, self.width] + ) + # smart_resize will always output float32, so we need to re-cast. + return tf.cast(outputs, self.compute_dtype) + + return tf.cond( + tf.reduce_all((h_diff >= 0, w_diff >= 0)), center_crop, upsize + ) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + input_shape[H_AXIS] = self.height + input_shape[W_AXIS] = self.width + return tf.TensorShape(input_shape) + + def get_config(self): + config = { + "height": self.height, + "width": self.width, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export( + "keras.layers.RandomCrop", + "keras.layers.experimental.preprocessing.RandomCrop", + v1=[], +) +class RandomCrop(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly crops images during training. + + During training, this layer will randomly choose a location to crop images + down to a target size. The layer will crop all the images in the same batch + to the same cropping location. + + At inference time, and during training if an input image is smaller than the + target size, the input will be resized and cropped so as to return the + largest possible window in the image that matches the target aspect ratio. + If you need to apply random cropping at inference time, set `training` to + True when calling the layer. + + Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and + of integer or floating point dtype. By default, the layer will output + floats. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., target_height, target_width, channels)`. - @property - def _map_fn(self): - if self.auto_vectorize: - return tf.vectorized_map - else: - return tf.map_fn + Args: + height: Integer, the height of the output shape. + width: Integer, the width of the output shape. + seed: Integer. Used to create a random seed. 
+ """ + + def __init__(self, height, width, seed=None, **kwargs): + base_preprocessing_layer.keras_kpl_gauge.get_cell("RandomCrop").set( + True + ) + super().__init__( + **kwargs, autocast=False, seed=seed, force_generator=True + ) + self.height = height + self.width = width + self.seed = seed + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs, dtype=self.compute_dtype) + input_shape = tf.shape(inputs) + h_diff = input_shape[H_AXIS] - self.height + w_diff = input_shape[W_AXIS] - self.width + + def random_crop(): + dtype = input_shape.dtype + rands = self._random_generator.random_uniform( + [2], 0, dtype.max, dtype + ) + h_start = rands[0] % (h_diff + 1) + w_start = rands[1] % (w_diff + 1) + return tf.image.crop_to_bounding_box( + inputs, h_start, w_start, self.height, self.width + ) + + def resize(): + outputs = image_utils.smart_resize( + inputs, [self.height, self.width] + ) + # smart_resize will always output float32, so we need to re-cast. + return tf.cast(outputs, self.compute_dtype) + + return tf.cond( + tf.reduce_all((training, h_diff >= 0, w_diff >= 0)), + random_crop, + resize, + ) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + input_shape[H_AXIS] = self.height + input_shape[W_AXIS] = self.width + return tf.TensorShape(input_shape) + + def get_config(self): + config = { + "height": self.height, + "width": self.width, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export( + "keras.layers.Rescaling", + "keras.layers.experimental.preprocessing.Rescaling", +) +class Rescaling(base_layer.Layer): + """A preprocessing layer which rescales input values to a new range. + + This layer rescales every value of an input (often an image) by multiplying + by `scale` and adding `offset`. + + For instance: + + 1. To rescale an input in the `[0, 255]` range + to be in the `[0, 1]` range, you would pass `scale=1./255`. + + 2. To rescale an input in the `[0, 255]` range to be in the `[-1, 1]` range, + you would pass `scale=1./127.5, offset=-1`. - @doc_controls.for_subclass_implementers - def augment_image(self, image, transformation): - """Augment a single image during training. + The rescaling is applied both during training and inference. Inputs can be + of integer or floating point dtype, and by default the layer will output + floats. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Input shape: + Arbitrary. + + Output shape: + Same as input. Args: - image: 3D image input tensor to the layer. Forwarded from `layer.call()`. - transformation: The transformation object produced by - `get_random_transformation`. Used to coordinate the randomness between - image, label and bounding box. + scale: Float, the scale to apply to the inputs. + offset: Float, the offset to apply to the inputs. + """ - Returns: - output 3D tensor, which will be forward to `layer.call()`. 
+ def __init__(self, scale, offset=0.0, **kwargs): + self.scale = scale + self.offset = offset + super().__init__(**kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell("Rescaling").set(True) + + def call(self, inputs): + dtype = self.compute_dtype + inputs = convert_inputs(inputs, dtype=dtype) + scale = tf.cast(self.scale, dtype) + offset = tf.cast(self.offset, dtype) + return tf.cast(inputs, dtype) * scale + offset + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "scale": self.scale, + "offset": self.offset, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +HORIZONTAL = "horizontal" +VERTICAL = "vertical" +HORIZONTAL_AND_VERTICAL = "horizontal_and_vertical" + + +@keras_export( + "keras.layers.RandomFlip", + "keras.layers.experimental.preprocessing.RandomFlip", + v1=[], +) +class RandomFlip(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly flips images during training. + + This layer will flip the images horizontally and/or vertically based on the + `mode` attribute. During inference time, the output will be identical to + input. Call the layer with `training=True` to flip the input. + + Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and + of integer or floating point dtype. + By default, the layer will output floats. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Args: + mode: String indicating which flip mode to use. Can be `"horizontal"`, + `"vertical"`, or `"horizontal_and_vertical"`. `"horizontal"` is a + left-right flip and `"vertical"` is a top-bottom flip. Defaults to + `"horizontal_and_vertical"`. + seed: Integer. Used to create a random seed. """ - raise NotImplementedError() - - @doc_controls.for_subclass_implementers - def augment_label(self, label, transformation): - """Augment a single label during training.
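A quick sketch of the `RandomFlip` contract just documented, i.e. identity at inference and random flips in training (shapes and seed are illustrative):

```python
import numpy as np
import tensorflow as tf

images = np.random.random((2, 4, 4, 3)).astype("float32")
flip = tf.keras.layers.RandomFlip(mode="horizontal", seed=1)

augmented = flip(images, training=True)     # possibly left-right flipped
passthrough = flip(images, training=False)  # inference: identical to input
np.testing.assert_allclose(passthrough.numpy(), images)
```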
+ def __init__(self, mode=HORIZONTAL_AND_VERTICAL, seed=None, **kwargs): + super().__init__(seed=seed, force_generator=True, **kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell("RandomFlip").set( + True + ) + self.mode = mode + if mode == HORIZONTAL: + self.horizontal = True + self.vertical = False + elif mode == VERTICAL: + self.horizontal = False + self.vertical = True + elif mode == HORIZONTAL_AND_VERTICAL: + self.horizontal = True + self.vertical = True + else: + raise ValueError( + f"RandomFlip layer {self.name} received an unknown mode " + f"argument {mode}" + ) + self.seed = seed + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs, self.compute_dtype) + + def random_flipped_inputs(inputs): + flipped_outputs = inputs + if self.horizontal: + seed = self._random_generator.make_seed_for_stateless_op() + if seed is not None: + flipped_outputs = tf.image.stateless_random_flip_left_right( + flipped_outputs, seed=seed + ) + else: + flipped_outputs = tf.image.random_flip_left_right( + flipped_outputs, + self._random_generator.make_legacy_seed(), + ) + if self.vertical: + seed = self._random_generator.make_seed_for_stateless_op() + if seed is not None: + flipped_outputs = tf.image.stateless_random_flip_up_down( + flipped_outputs, seed=seed + ) + else: + flipped_outputs = tf.image.random_flip_up_down( + flipped_outputs, + self._random_generator.make_legacy_seed(), + ) + flipped_outputs.set_shape(inputs.shape) + return flipped_outputs + + if training: + return random_flipped_inputs(inputs) + else: + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "mode": self.mode, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +# TODO(tanzheny): Add examples, here and everywhere. +@keras_export( + "keras.layers.RandomTranslation", + "keras.layers.experimental.preprocessing.RandomTranslation", + v1=[], +) +class RandomTranslation(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly translates images during training. + + This layer will apply random translations to each image during training, + filling empty space according to `fill_mode`. + + Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and + of integer or floating point dtype. By default, the layer will output + floats. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Args: + height_factor: a float represented as fraction of value, or a tuple of + size 2 representing lower and upper bound for shifting vertically. A + negative value means shifting image up, while a positive value means + shifting image down. When represented as a single positive float, this + value is used for both the upper and lower bound. For instance, + `height_factor=(-0.2, 0.3)` results in an output shifted by a random + amount in the range `[-20%, +30%]`. `height_factor=0.2` results in an + output height shifted by a random amount in the range `[-20%, +20%]`. + width_factor: a float represented as fraction of value, or a tuple of size + 2 representing lower and upper bound for shifting horizontally. A + negative value means shifting image left, while a positive value means + shifting image right. When represented as a single positive float, + this value is used for both the upper and lower bound. 
For instance, + `width_factor=(-0.2, 0.3)` results in an output shifted left by 20%, + and shifted right by 30%. `width_factor=0.2` results + in an output shifted left or right by 20%. + fill_mode: Points outside the boundaries of the input are filled according + to the given mode + (one of `{"constant", "reflect", "wrap", "nearest"}`). + - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by + reflecting about the edge of the last pixel. + - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by + filling all values beyond the edge with the same constant value + k = 0. + - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by + wrapping around to the opposite edge. + - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by + the nearest pixel. + interpolation: Interpolation mode. Supported values: `"nearest"`, + `"bilinear"`. + seed: Integer. Used to create a random seed. + fill_value: a float represents the value to be filled outside the + boundaries when `fill_mode="constant"`. + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + """ + + def __init__( + self, + height_factor, + width_factor, + fill_mode="reflect", + interpolation="bilinear", + seed=None, + fill_value=0.0, + **kwargs, + ): + base_preprocessing_layer.keras_kpl_gauge.get_cell( + "RandomTranslation" + ).set(True) + super().__init__(seed=seed, force_generator=True, **kwargs) + self.height_factor = height_factor + if isinstance(height_factor, (tuple, list)): + self.height_lower = height_factor[0] + self.height_upper = height_factor[1] + else: + self.height_lower = -height_factor + self.height_upper = height_factor + if self.height_upper < self.height_lower: + raise ValueError( + "`height_factor` cannot have upper bound less than " + f"lower bound, got {height_factor}" + ) + if abs(self.height_lower) > 1.0 or abs(self.height_upper) > 1.0: + raise ValueError( + "`height_factor` argument must have values between [-1, 1]. " + f"Received: height_factor={height_factor}" + ) + + self.width_factor = width_factor + if isinstance(width_factor, (tuple, list)): + self.width_lower = width_factor[0] + self.width_upper = width_factor[1] + else: + self.width_lower = -width_factor + self.width_upper = width_factor + if self.width_upper < self.width_lower: + raise ValueError( + "`width_factor` cannot have upper bound less than " + f"lower bound, got {width_factor}" + ) + if abs(self.width_lower) > 1.0 or abs(self.width_upper) > 1.0: + raise ValueError( + "`width_factor` must have values between [-1, 1], " + f"got {width_factor}" + ) + + check_fill_mode_and_interpolation(fill_mode, interpolation) + + self.fill_mode = fill_mode + self.fill_value = fill_value + self.interpolation = interpolation + self.seed = seed + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs, self.compute_dtype) + + def random_translated_inputs(inputs): + """Translated inputs with random ops.""" + # The transform op only accepts rank 4 inputs, + # so if we have an unbatched image, + # we need to temporarily expand dims to a batch.
+ original_shape = inputs.shape + unbatched = inputs.shape.rank == 3 + if unbatched: + inputs = tf.expand_dims(inputs, 0) + + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + img_hd = tf.cast(inputs_shape[H_AXIS], tf.float32) + img_wd = tf.cast(inputs_shape[W_AXIS], tf.float32) + height_translate = self._random_generator.random_uniform( + shape=[batch_size, 1], + minval=self.height_lower, + maxval=self.height_upper, + dtype=tf.float32, + ) + height_translate = height_translate * img_hd + width_translate = self._random_generator.random_uniform( + shape=[batch_size, 1], + minval=self.width_lower, + maxval=self.width_upper, + dtype=tf.float32, + ) + width_translate = width_translate * img_wd + translations = tf.cast( + tf.concat([width_translate, height_translate], axis=1), + dtype=tf.float32, + ) + output = transform( + inputs, + get_translation_matrix(translations), + interpolation=self.interpolation, + fill_mode=self.fill_mode, + fill_value=self.fill_value, + ) + if unbatched: + output = tf.squeeze(output, 0) + output.set_shape(original_shape) + return output + + if training: + return random_translated_inputs(inputs) + else: + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "height_factor": self.height_factor, + "width_factor": self.width_factor, + "fill_mode": self.fill_mode, + "fill_value": self.fill_value, + "interpolation": self.interpolation, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def get_translation_matrix(translations, name=None): + """Returns projective transform(s) for the given translation(s). Args: - label: 1D label to the layer. Forwarded from `layer.call()`. - transformation: The transformation object produced by - `get_random_transformation`. Used to coordinate the randomness between - image, label and bounding box. + translations: A matrix of 2-element lists representing `[dx, dy]` + to translate for each image (for a batch of images). + name: The name of the op. Returns: - output 1D tensor, which will be forward to `layer.call()`. + A tensor of shape `(num_images, 8)` projective transforms + which can be given to `transform`. """ - raise NotImplementedError() - - @doc_controls.for_subclass_implementers - def augment_target(self, target, transformation): - """Augment a single target during training. + with backend.name_scope(name or "translation_matrix"): + num_translations = tf.shape(translations)[0] + # The translation matrix looks like: + # [[1 0 -dx] + # [0 1 -dy] + # [0 0 1]] + # where the last entry is implicit. + # Translation matrices are always float32. + return tf.concat( + values=[ + tf.ones((num_translations, 1), tf.float32), + tf.zeros((num_translations, 1), tf.float32), + -translations[:, 0, None], + tf.zeros((num_translations, 1), tf.float32), + tf.ones((num_translations, 1), tf.float32), + -translations[:, 1, None], + tf.zeros((num_translations, 2), tf.float32), + ], + axis=1, + ) + + +def transform( + images, + transforms, + fill_mode="reflect", + fill_value=0.0, + interpolation="bilinear", + output_shape=None, + name=None, +): + """Applies the given transform(s) to the image(s). Args: - target: 1D label to the layer. Forwarded from `layer.call()`. - transformation: The transformation object produced by - `get_random_transformation`. Used to coordinate the randomness between - image, label and bounding box. 
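The flattened 8-parameter layout that `get_translation_matrix` documents above can be checked by hand; a sketch (the `keras.layers.preprocessing.image_preprocessing` import path is the very module this hunk edits, so this only runs inside the source tree):

```python
import tensorflow as tf
from keras.layers.preprocessing import image_preprocessing as imp

# One image translated by dx=2, dy=3. The 3x3 matrix
# [[1, 0, -dx], [0, 1, -dy], [0, 0, 1]] is flattened (dropping the implicit
# last entry) into [a0, a1, a2, b0, b1, b2, c0, c1]; dx/dy are negated
# because transforms map *output* coordinates back to *input* coordinates.
matrix = imp.get_translation_matrix(tf.constant([[2.0, 3.0]]))
print(matrix.numpy())  # [[ 1.  0. -2.  0.  1. -3.  0.  0.]]
```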
+ images: A tensor of shape + `(num_images, num_rows, num_columns, num_channels)` (NHWC). + The rank must be statically known + (the shape is not `TensorShape(None)`). + transforms: Projective transform matrix/matrices. + A vector of length 8 or tensor of size N x 8. + If one row of transforms is [a0, a1, a2, b0, b1, b2, + c0, c1], then it maps the *output* point `(x, y)` + to a transformed *input* point + `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where + `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to the + transform mapping input points to output points. + Note that gradients are not backpropagated + into transformation parameters. + fill_mode: Points outside the boundaries of the input are filled + according to the given mode + (one of `{"constant", "reflect", "wrap", "nearest"}`). + fill_value: a float represents the value to be filled outside + the boundaries when `fill_mode="constant"`. + interpolation: Interpolation mode. Supported values: `"nearest"`, + `"bilinear"`. + output_shape: Output dimension after the transform, `[height, width]`. + If `None`, output is the same size as input image. + name: The name of the op. + + Fill mode behavior for each valid value is as follows: + + - `"reflect"`: `(d c b a | a b c d | d c b a)` + The input is extended by reflecting about the edge of the last pixel. + + - `"constant"`: `(k k k k | a b c d | k k k k)` + The input is extended by filling all + values beyond the edge with the same constant value k = 0. + + - `"wrap"`: `(a b c d | a b c d | a b c d)` + The input is extended by wrapping around to the opposite edge. + + - `"nearest"`: `(a a a a | a b c d | d d d d)` + The input is extended by the nearest pixel. + + Input shape: + 4D tensor with shape: `(samples, height, width, channels)`, + in `"channels_last"` format. + + Output shape: + 4D tensor with shape: `(samples, height, width, channels)`, + in `"channels_last"` format. Returns: - output 1D tensor, which will be forward to `layer.call()`. + Image(s) with the same type and shape as `images`, with the given + transform(s) applied. Transformed coordinates outside of the input image + will be filled with zeros. """ - return self.augment_label(target, transformation) + with backend.name_scope(name or "transform"): + if output_shape is None: + output_shape = tf.shape(images)[1:3] + if not tf.executing_eagerly(): + output_shape_value = tf.get_static_value(output_shape) + if output_shape_value is not None: + output_shape = output_shape_value + + output_shape = tf.convert_to_tensor( + output_shape, tf.int32, name="output_shape" + ) + + if not output_shape.get_shape().is_compatible_with([2]): + raise ValueError( + "output_shape must be a 1-D Tensor of 2 elements: " + "new_height, new_width, instead got " + f"output_shape={output_shape}" + ) + + fill_value = tf.convert_to_tensor( + fill_value, tf.float32, name="fill_value" + ) + + return tf.raw_ops.ImageProjectiveTransformV3( + images=images, + output_shape=output_shape, + fill_value=fill_value, + transforms=transforms, + fill_mode=fill_mode.upper(), + interpolation=interpolation.upper(), + ) - @doc_controls.for_subclass_implementers - def augment_bounding_boxes(self, image, bounding_boxes, transformation=None): - """Augment bounding boxes for one image during training. + +def get_rotation_matrix(angles, image_height, image_width, name=None): + """Returns projective transform(s) for the given angle(s). Args: - image: 3D image input tensor to the layer. Forwarded from `layer.call()`. 
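And a sketch of driving `transform` directly with such an 8-vector, under the same in-tree import assumption (a single 1 x 8 transform row applies to every image in the batch):

```python
import tensorflow as tf
from keras.layers.preprocessing import image_preprocessing as imp

images = tf.random.uniform((2, 32, 32, 3))
# a2 = -dx, b2 = -dy per the inverse-mapping convention documented above:
# shift content 10 pixels right and 5 pixels down, zero-filling the border.
transforms = tf.constant([[1.0, 0.0, -10.0, 0.0, 1.0, -5.0, 0.0, 0.0]])
shifted = imp.transform(
    images, transforms, fill_mode="constant", fill_value=0.0
)
print(shifted.shape)  # (2, 32, 32, 3)
```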
- bounding_boxes: 2D bounding boxes to the layer. Forwarded from `call()`. - transformation: The transformation object produced by - `get_random_transformation`. Used to coordinate the randomness between - image, label and bounding box. + angles: A scalar angle to rotate all images by, + or (for batches of images) a vector with an angle to + rotate each image in the batch. The rank must be + statically known (the shape is not `TensorShape(None)`). + image_height: Height of the image(s) to be transformed. + image_width: Width of the image(s) to be transformed. + name: The name of the op. Returns: - output 2D tensor, which will be forward to `layer.call()`. + A tensor of shape (num_images, 8). + Projective transforms which can be given + to operation `image_projective_transform_v2`. + If one row of transforms is + [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point + `(x, y)` to a transformed *input* point + `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, + where `k = c0 x + c1 y + 1`. + """ + with backend.name_scope(name or "rotation_matrix"): + x_offset = ( + (image_width - 1) + - ( + tf.cos(angles) * (image_width - 1) + - tf.sin(angles) * (image_height - 1) + ) + ) / 2.0 + y_offset = ( + (image_height - 1) + - ( + tf.sin(angles) * (image_width - 1) + + tf.cos(angles) * (image_height - 1) + ) + ) / 2.0 + num_angles = tf.shape(angles)[0] + return tf.concat( + values=[ + tf.cos(angles)[:, None], + -tf.sin(angles)[:, None], + x_offset[:, None], + tf.sin(angles)[:, None], + tf.cos(angles)[:, None], + y_offset[:, None], + tf.zeros((num_angles, 2), tf.float32), + ], + axis=1, + ) + + +@keras_export( + "keras.layers.RandomRotation", + "keras.layers.experimental.preprocessing.RandomRotation", + v1=[], +) +class RandomRotation(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly rotates images during training. + + This layer will apply random rotations to each image, filling empty space + according to `fill_mode`. + + By default, random rotations are only applied during training. + At inference time, the layer does nothing. If you need to apply random + rotations at inference time, set `training` to True when calling the layer. + + Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and + of integer or floating point dtype. + By default, the layer will output floats. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format + + Args: + factor: a float represented as fraction of 2 Pi, or a tuple of size 2 + representing lower and upper bound for rotating clockwise and + counter-clockwise. A positive value means rotating + counter clock-wise, + while a negative value means clock-wise. + When represented as a single + float, this value is used for both the upper and lower bound. + For instance, `factor=(-0.2, 0.3)` + results in an output rotation by a random + amount in the range `[-20% * 2pi, 30% * 2pi]`. + `factor=0.2` results in an + output rotating by a random amount + in the range `[-20% * 2pi, 20% * 2pi]`. + fill_mode: Points outside the boundaries of the input are filled + according to the given mode + (one of `{"constant", "reflect", "wrap", "nearest"}`).
+ - *reflect*: `(d c b a | a b c d | d c b a)` + The input is extended by reflecting about + the edge of the last pixel. + - *constant*: `(k k k k | a b c d | k k k k)` + The input is extended by + filling all values beyond the edge with + the same constant value k = 0. + - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by + wrapping around to the opposite edge. + - *nearest*: `(a a a a | a b c d | d d d d)` + The input is extended by the nearest pixel. + interpolation: Interpolation mode. Supported values: `"nearest"`, + `"bilinear"`. + seed: Integer. Used to create a random seed. + fill_value: a float represents the value to be filled outside + the boundaries when `fill_mode="constant"`. """ - raise NotImplementedError() - @doc_controls.for_subclass_implementers - def get_random_transformation( - self, image=None, label=None, bounding_box=None): - """Produce random transformation config for one single input. + def __init__( + self, + factor, + fill_mode="reflect", + interpolation="bilinear", + seed=None, + fill_value=0.0, + **kwargs, + ): + base_preprocessing_layer.keras_kpl_gauge.get_cell("RandomRotation").set( + True + ) + super().__init__(seed=seed, force_generator=True, **kwargs) + self.factor = factor + if isinstance(factor, (tuple, list)): + self.lower = factor[0] + self.upper = factor[1] + else: + self.lower = -factor + self.upper = factor + if self.upper < self.lower: + raise ValueError( + "`factor` argument cannot have a negative value. " + f"Received: factor={factor}" + ) + check_fill_mode_and_interpolation(fill_mode, interpolation) + self.fill_mode = fill_mode + self.fill_value = fill_value + self.interpolation = interpolation + self.seed = seed + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs, self.compute_dtype) + + def random_rotated_inputs(inputs): + """Rotated inputs with random ops.""" + original_shape = inputs.shape + unbatched = inputs.shape.rank == 3 + # The transform op only accepts rank 4 inputs, + # so if we have an unbatched image, + # we need to temporarily expand dims to a batch. + if unbatched: + inputs = tf.expand_dims(inputs, 0) + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + img_hd = tf.cast(inputs_shape[H_AXIS], tf.float32) + img_wd = tf.cast(inputs_shape[W_AXIS], tf.float32) + min_angle = self.lower * 2.0 * np.pi + max_angle = self.upper * 2.0 * np.pi + angles = self._random_generator.random_uniform( + shape=[batch_size], minval=min_angle, maxval=max_angle + ) + output = transform( + inputs, + get_rotation_matrix(angles, img_hd, img_wd), + fill_mode=self.fill_mode, + fill_value=self.fill_value, + interpolation=self.interpolation, + ) + if unbatched: + output = tf.squeeze(output, 0) + output.set_shape(original_shape) + return output + + if training: + return random_rotated_inputs(inputs) + else: + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "factor": self.factor, + "fill_mode": self.fill_mode, + "fill_value": self.fill_value, + "interpolation": self.interpolation, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export( + "keras.layers.RandomZoom", + "keras.layers.experimental.preprocessing.RandomZoom", + v1=[], +) +class RandomZoom(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly zooms images during training. 
+ + This layer will randomly zoom in or out on each axis of an image + independently, filling empty space according to `fill_mode`. + + Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and + of integer or floating point dtype. + By default, the layer will output floats. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - This is used to produce same randomness between image/label/bounding_box. + Args: + height_factor: a float represented as fraction of value, + or a tuple of size 2 representing lower and upper bound + for zooming vertically. When represented as a single float, + this value is used for both the upper and + lower bound. A positive value means zooming out, + while a negative value + means zooming in. For instance, `height_factor=(0.2, 0.3)` + results in an output zoomed out by a random amount + in the range `[+20%, +30%]`. + `height_factor=(-0.3, -0.2)` results in an output zoomed + in by a random amount in the range `[+20%, +30%]`. + width_factor: a float represented as fraction of value, + or a tuple of size 2 representing lower and upper bound + for zooming horizontally. When + represented as a single float, this value is used + for both the upper and + lower bound. For instance, `width_factor=(0.2, 0.3)` + results in an output + zooming out between 20% and 30%. + `width_factor=(-0.3, -0.2)` results in an + output zooming in between 20% and 30%. `None` means + zooming vertical and horizontal directions + by the same amount, preserving the aspect ratio. + Defaults to `None`. + fill_mode: Points outside the boundaries of the input are + filled according to the given mode + (one of `{"constant", "reflect", "wrap", "nearest"}`). + - *reflect*: `(d c b a | a b c d | d c b a)` + The input is extended by reflecting about + the edge of the last pixel. + - *constant*: `(k k k k | a b c d | k k k k)` + The input is extended by filling all values beyond + the edge with the same constant value k = 0. + - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by + wrapping around to the opposite edge. + - *nearest*: `(a a a a | a b c d | d d d d)` + The input is extended by the nearest pixel. + interpolation: Interpolation mode. Supported values: `"nearest"`, + `"bilinear"`. + seed: Integer. Used to create a random seed. + fill_value: a float represents the value to be filled outside + the boundaries when `fill_mode="constant"`. + + Example: + + >>> input_img = np.random.random((32, 224, 224, 3)) + >>> layer = tf.keras.layers.RandomZoom(.5, .2) + >>> out_img = layer(input_img) + >>> out_img.shape + TensorShape([32, 224, 224, 3]) + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format.
+ """ + + def __init__( + self, + height_factor, + width_factor=None, + fill_mode="reflect", + interpolation="bilinear", + seed=None, + fill_value=0.0, + **kwargs, + ): + base_preprocessing_layer.keras_kpl_gauge.get_cell("RandomZoom").set( + True + ) + super().__init__(seed=seed, force_generator=True, **kwargs) + self.height_factor = height_factor + if isinstance(height_factor, (tuple, list)): + self.height_lower = height_factor[0] + self.height_upper = height_factor[1] + else: + self.height_lower = -height_factor + self.height_upper = height_factor + + if abs(self.height_lower) > 1.0 or abs(self.height_upper) > 1.0: + raise ValueError( + "`height_factor` argument must have values between [-1, 1]. " + f"Received: height_factor={height_factor}" + ) + + self.width_factor = width_factor + if width_factor is not None: + if isinstance(width_factor, (tuple, list)): + self.width_lower = width_factor[0] + self.width_upper = width_factor[1] + else: + self.width_lower = -width_factor + self.width_upper = width_factor + + if self.width_lower < -1.0 or self.width_upper < -1.0: + raise ValueError( + "`width_factor` argument must have values larger than -1. " + f"Received: width_factor={width_factor}" + ) + + check_fill_mode_and_interpolation(fill_mode, interpolation) + + self.fill_mode = fill_mode + self.fill_value = fill_value + self.interpolation = interpolation + self.seed = seed + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs, self.compute_dtype) + + def random_zoomed_inputs(inputs): + """Zoomed inputs with random ops.""" + original_shape = inputs.shape + unbatched = inputs.shape.rank == 3 + # The transform op only accepts rank 4 inputs, + # so if we have an unbatched image, + # we need to temporarily expand dims to a batch. + if unbatched: + inputs = tf.expand_dims(inputs, 0) + inputs_shape = tf.shape(inputs) + batch_size = inputs_shape[0] + img_hd = tf.cast(inputs_shape[H_AXIS], tf.float32) + img_wd = tf.cast(inputs_shape[W_AXIS], tf.float32) + height_zoom = self._random_generator.random_uniform( + shape=[batch_size, 1], + minval=1.0 + self.height_lower, + maxval=1.0 + self.height_upper, + ) + if self.width_factor is not None: + width_zoom = self._random_generator.random_uniform( + shape=[batch_size, 1], + minval=1.0 + self.width_lower, + maxval=1.0 + self.width_upper, + ) + else: + width_zoom = height_zoom + zooms = tf.cast( + tf.concat([width_zoom, height_zoom], axis=1), dtype=tf.float32 + ) + output = transform( + inputs, + get_zoom_matrix(zooms, img_hd, img_wd), + fill_mode=self.fill_mode, + fill_value=self.fill_value, + interpolation=self.interpolation, + ) + if unbatched: + output = tf.squeeze(output, 0) + output.set_shape(original_shape) + return output + + if training: + return random_zoomed_inputs(inputs) + else: + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "height_factor": self.height_factor, + "width_factor": self.width_factor, + "fill_mode": self.fill_mode, + "fill_value": self.fill_value, + "interpolation": self.interpolation, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def get_zoom_matrix(zooms, image_height, image_width, name=None): + """Returns projective transform(s) for the given zoom(s). Args: - image: 3D image tensor from inputs. - label: optional 1D label tensor from inputs. - bounding_box: optional 2D bounding boxes tensor from inputs. 
+ zooms: A matrix of 2-element lists representing `[zx, zy]` + to zoom for each image (for a batch of images). + image_height: Height of the image(s) to be transformed. + image_width: Width of the image(s) to be transformed. + name: The name of the op. Returns: - Any type of object, which will be forwarded to `augment_image`, - `augment_label` and `augment_bounding_box` as the `transformation` - parameter. + A tensor of shape `(num_images, 8)`. Projective transforms which can be + given to operation `image_projective_transform_v2`. + If one row of transforms is + `[a0, a1, a2, b0, b1, b2, c0, c1]`, then it maps the *output* point + `(x, y)` to a transformed *input* point + `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, + where `k = c0 x + c1 y + 1`. """ - return None - - def call(self, inputs, training=True): - inputs = self._ensure_inputs_are_compute_dtype(inputs) - if training: - inputs, is_dict, use_targets = self._format_inputs(inputs) - images = inputs[IMAGES] - if images.shape.rank == 3: - return self._format_output(self._augment(inputs), is_dict, use_targets) - elif images.shape.rank == 4: - return self._format_output(self._batch_augment(inputs), is_dict, use_targets) - else: - raise ValueError('Image augmentation layers are expecting inputs to be ' - 'rank 3 (HWC) or 4D (NHWC) tensors. Got shape: ' - f'{images.shape}') - else: - return inputs - - def _augment(self, inputs): - image = inputs.get(IMAGES, None) - label = inputs.get(LABELS, None) - bounding_box = inputs.get(BOUNDING_BOXES, None) - transformation = self.get_random_transformation( - image=image, label=label, bounding_box=bounding_box) # pylint: disable=assignment-from-none - image = self.augment_image(image, transformation=transformation) - result = {IMAGES: image} - if label is not None: - label = self.augment_target(label, transformation=transformation) - result[LABELS] = label - if bounding_box is not None: - bounding_box = self.augment_bounding_boxes( - image, bounding_box, transformation=transformation) - result[BOUNDING_BOXES] = bounding_box - return result - - def _batch_augment(self, inputs): - return self._map_fn(self._augment, inputs) - - def _format_inputs(self, inputs): - if tf.is_tensor(inputs): - # single image input tensor - return {IMAGES: inputs}, False, False - elif isinstance(inputs, dict) and TARGETS in inputs: - # TODO(scottzhu): Check if it only contains the valid keys - inputs[LABELS] = inputs[TARGETS] - del inputs[TARGETS] - return inputs, True, True - elif isinstance(inputs, dict): - return inputs, True, False - else: - raise ValueError( - f'Expect the inputs to be image tensor or dict. Got {inputs}') - - def _format_output(self, output, is_dict, use_targets): - if not is_dict: - return output[IMAGES] - elif use_targets: - output[TARGETS] = output[LABELS] - del output[LABELS] - return output - else: - return output - - def _ensure_inputs_are_compute_dtype(self, inputs): - if isinstance(inputs, dict): - inputs[IMAGES] = utils.ensure_tensor(inputs[IMAGES], - self.compute_dtype) - else: - inputs = utils.ensure_tensor(inputs, self.compute_dtype) - return inputs + with backend.name_scope(name or "zoom_matrix"): + num_zooms = tf.shape(zooms)[0] + # The zoom matrix looks like: + # [[zx 0 0] + # [0 zy 0] + # [0 0 1]] + # where the last entry is implicit. + # Zoom matrices are always float32. 
+ x_offset = ((image_width - 1.0) / 2.0) * (1.0 - zooms[:, 0, None]) + y_offset = ((image_height - 1.0) / 2.0) * (1.0 - zooms[:, 1, None]) + return tf.concat( + values=[ + zooms[:, 0, None], + tf.zeros((num_zooms, 1), tf.float32), + x_offset, + tf.zeros((num_zooms, 1), tf.float32), + zooms[:, 1, None], + y_offset, + tf.zeros((num_zooms, 2), tf.float32), + ], + axis=1, + ) + + +@keras_export( + "keras.layers.RandomContrast", + "keras.layers.experimental.preprocessing.RandomContrast", + v1=[], +) +class RandomContrast(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly adjusts contrast during training. + + This layer will randomly adjust the contrast of an image or images + by a random factor. Contrast is adjusted independently + for each channel of each image during training. + + For each channel, this layer computes the mean of the image pixels in the + channel and then adjusts each component `x` of each pixel to + `(x - mean) * contrast_factor + mean`. + + Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and + in integer or floating point dtype. + By default, the layer will output floats. + The output value will be clipped to the range `[0, 255]`, the valid + range of RGB colors. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + Args: + factor: a positive float represented as fraction of value, or a tuple of + size 2 representing lower and upper bound. + When represented as a single float, lower = upper. + The contrast factor will be randomly picked between + `[1.0 - lower, 1.0 + upper]`. For any pixel x in the channel, + the output will be `(x - mean) * factor + mean` + where `mean` is the mean value of the channel. + seed: Integer. Used to create a random seed. + """ -@keras_export('keras.layers.RandomCrop', - 'keras.layers.experimental.preprocessing.RandomCrop', - v1=[]) -class RandomCrop(BaseImageAugmentationLayer): - """A preprocessing layer which randomly crops images during training. - - During training, this layer will randomly choose a location to crop images - down to a target size. The layer will crop all the images in the same batch to - the same cropping location. - - At inference time, and during training if an input image is smaller than the - target size, the input will be resized and cropped so as to return the largest - possible window in the image that matches the target aspect ratio. If you need - to apply random cropping at inference time, set `training` to True when - calling the layer. - - Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and - of interger or floating point dtype. By default, the layer will output floats. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., target_height, target_width, channels)`. - - Args: - height: Integer, the height of the output shape. - width: Integer, the width of the output shape. 
- seed: Integer. Used to create a random seed. - """ - - def __init__(self, height, width, seed=None, **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomCrop').set(True) - super().__init__(**kwargs, autocast=False, seed=seed, - force_generator=True) - self.height = height - self.width = width - self.seed = seed - - def call(self, inputs, training=True): - - if training: - return super().call(inputs, training) - else: - inputs = self._ensure_inputs_are_compute_dtype(inputs) - inputs, is_dict, targets = self._format_inputs(inputs) - output = inputs - # self._resize() returns valid results for both batched and unbatched - output['images'] = self._resize(inputs['images']) - return self._format_output(output, is_dict, targets) - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - input_shape = tf.shape(image) - h_diff = input_shape[H_AXIS] - self.height - w_diff = input_shape[W_AXIS] - self.width - dtype = input_shape.dtype - rands = self._random_generator.random_uniform([2], 0, dtype.max, dtype) - h_start = rands[0] % (h_diff + 1) - w_start = rands[1] % (w_diff + 1) - return {'top': h_start, 'left': w_start} - - def augment_image(self, image, transformation): - input_shape = tf.shape(image) - h_diff = input_shape[H_AXIS] - self.height - w_diff = input_shape[W_AXIS] - self.width - return tf.cond( - tf.reduce_all((h_diff >= 0, w_diff >= 0)), - lambda: self._crop(image, transformation), lambda: self._resize(image)) - - def _crop(self, image, transformation): - top = transformation['top'] - left = transformation['left'] - return tf.image.crop_to_bounding_box(image, top, left, self.height, - self.width) - - def _resize(self, image): - outputs = image_utils.smart_resize(image, [self.height, self.width]) - # smart_resize will always output float32, so we need to re-cast. - return tf.cast(outputs, self.compute_dtype) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - input_shape[H_AXIS] = self.height - input_shape[W_AXIS] = self.width - return tf.TensorShape(input_shape) - - def get_config(self): - config = { - 'height': self.height, - 'width': self.width, - 'seed': self.seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.Rescaling', - 'keras.layers.experimental.preprocessing.Rescaling') -class Rescaling(base_layer.Layer): - """A preprocessing layer which rescales input values to a new range. - - This layer rescales every value of an input (often an image) by multiplying by - `scale` and adding `offset`. - - For instance: - - 1. To rescale an input in the ``[0, 255]`` range - to be in the `[0, 1]` range, you would pass `scale=1./255`. - - 2. To rescale an input in the ``[0, 255]`` range to be in the `[-1, 1]` range, - you would pass `scale=1./127.5, offset=-1`. - - The rescaling is applied both during training and inference. Inputs can be - of integer or floating point dtype, and by default the layer will output - floats. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Input shape: - Arbitrary. - - Output shape: - Same as input. - - Args: - scale: Float, the scale to apply to the inputs. - offset: Float, the offset to apply to the inputs. 
- """ - - def __init__(self, scale, offset=0., **kwargs): - self.scale = scale - self.offset = offset - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell('Rescaling').set(True) - - def call(self, inputs): - dtype = self.compute_dtype - scale = tf.cast(self.scale, dtype) - offset = tf.cast(self.offset, dtype) - return tf.cast(inputs, dtype) * scale + offset - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'scale': self.scale, - 'offset': self.offset, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -HORIZONTAL = 'horizontal' -VERTICAL = 'vertical' -HORIZONTAL_AND_VERTICAL = 'horizontal_and_vertical' - - -@keras_export('keras.layers.RandomFlip', - 'keras.layers.experimental.preprocessing.RandomFlip', - v1=[]) -class RandomFlip(BaseImageAugmentationLayer): - """A preprocessing layer which randomly flips images during training. - - This layer will flip the images horizontally and or vertically based on the - `mode` attribute. During inference time, the output will be identical to - input. Call the layer with `training=True` to flip the input. - - Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and - of interger or floating point dtype. By default, the layer will output floats. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Arguments: - mode: String indicating which flip mode to use. Can be `"horizontal"`, - `"vertical"`, or `"horizontal_and_vertical"`. Defaults to - `"horizontal_and_vertical"`. `"horizontal"` is a left-right flip and - `"vertical"` is a top-bottom flip. - seed: Integer. Used to create a random seed. 
- """ - - def __init__(self, - mode=HORIZONTAL_AND_VERTICAL, - seed=None, - **kwargs): - super().__init__(seed=seed, force_generator=True, **kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomFlip').set(True) - self.mode = mode - if mode == HORIZONTAL: - self.horizontal = True - self.vertical = False - elif mode == VERTICAL: - self.horizontal = False - self.vertical = True - elif mode == HORIZONTAL_AND_VERTICAL: - self.horizontal = True - self.vertical = True - else: - raise ValueError('RandomFlip layer {name} received an unknown mode ' - 'argument {arg}'.format(name=self.name, arg=mode)) - self.auto_vectorize = False - - def augment_label(self, label, transformation): - return label - - def augment_image(self, image, transformation): - flipped_outputs = image - if self.horizontal and transformation['flip_horizontal']: - flipped_outputs = tf.image.flip_left_right(flipped_outputs) - if self.vertical and transformation['flip_vertical']: - flipped_outputs = tf.image.flip_up_down(flipped_outputs) - flipped_outputs.set_shape(image.shape) - return flipped_outputs - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - flip_horizontal = False - flip_vertical = False - if self.horizontal: - flip_horizontal = np.random.choice([True, False]) - if self.vertical: - flip_vertical = np.random.choice([True, False]) - return {'flip_horizontal': flip_horizontal, 'flip_vertical': flip_vertical} - - def augment_bounding_boxes(self, image, bounding_boxes, transformation=None): - transformation = transformation or self.get_random_transformation() - image = tf.expand_dims(image, 0) - image_shape = tf.shape(image) - h = image_shape[H_AXIS] - w = image_shape[W_AXIS] - bboxes_out = tf.identity(bounding_boxes) - if transformation['flip_horizontal']: - bboxes_out = tf.stack([ - w - bboxes_out[:, 2], bboxes_out[:, 1], w - bboxes_out[:, 0], - bboxes_out[:, 3] - ], - axis=-1) - if transformation['flip_vertical']: - bboxes_out = tf.stack([ - bboxes_out[:, 0], h - bboxes_out[:, 3], bboxes_out[:, 2], - h - bboxes_out[:, 1] - ], - axis=-1) - return bboxes_out - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'mode': self.mode, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def __init__(self, factor, seed=None, **kwargs): + base_preprocessing_layer.keras_kpl_gauge.get_cell("RandomContrast").set( + True + ) + super().__init__(seed=seed, force_generator=True, **kwargs) + self.factor = factor + if isinstance(factor, (tuple, list)): + self.lower = factor[0] + self.upper = factor[1] + else: + self.lower = self.upper = factor + if self.lower < 0.0 or self.upper < 0.0 or self.lower > 1.0: + raise ValueError( + "`factor` argument cannot have negative values or values " + "greater than 1." 
+ f"Received: factor={factor}" + ) + self.seed = seed + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs, self.compute_dtype) + + def random_contrasted_inputs(inputs): + seed = self._random_generator.make_seed_for_stateless_op() + if seed is not None: + output = tf.image.stateless_random_contrast( + inputs, 1.0 - self.lower, 1.0 + self.upper, seed=seed + ) + else: + output = tf.image.random_contrast( + inputs, + 1.0 - self.lower, + 1.0 + self.upper, + seed=self._random_generator.make_legacy_seed(), + ) + output = tf.clip_by_value(output, 0, 255) + output.set_shape(inputs.shape) + return output + + if training: + return random_contrasted_inputs(inputs) + else: + return inputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "factor": self.factor, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.layers.RandomBrightness", v1=[]) +class RandomBrightness(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly adjusts brightness during training. + + This layer will randomly increase/reduce the brightness for the input RGB + images. At inference time, the output will be identical to the input. + Call the layer with `training=True` to adjust the brightness of the input. + + Note that different brightness adjustment factors + will be apply to each the images in the batch. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + Args: + factor: Float or a list/tuple of 2 floats between -1.0 and 1.0. The + factor is used to determine the lower bound and upper bound of the + brightness adjustment. A float value will be chosen randomly between + the limits. When -1.0 is chosen, the output image will be black, and + when 1.0 is chosen, the image will be fully white. + When only one float is provided, eg, 0.2, + then -0.2 will be used for lower bound and 0.2 + will be used for upper bound. + value_range: Optional list/tuple of 2 floats + for the lower and upper limit + of the values of the input data. + To make no change, use [0.0, 1.0], e.g., if the image input + has been scaled before this layer. Defaults to [0.0, 255.0]. + The brightness adjustment will be scaled to this range, and the + output values will be clipped to this range. + seed: optional integer, for fixed RNG behavior. + + Inputs: 3D (HWC) or 4D (NHWC) tensor, with float or int dtype. Input pixel + values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) + + Output: 3D (HWC) or 4D (NHWC) tensor with brightness adjusted based on the + `factor`. By default, the layer will output floats. + The output value will be clipped to the range `[0, 255]`, + the valid range of RGB colors, and + rescaled based on the `value_range` if needed. + + Sample usage: -# TODO(tanzheny): Add examples, here and everywhere. -@keras_export('keras.layers.RandomTranslation', - 'keras.layers.experimental.preprocessing.RandomTranslation', - v1=[]) -class RandomTranslation(BaseImageAugmentationLayer): - """A preprocessing layer which randomly translates images during training. - - This layer will apply random translations to each image during training, - filling empty space according to `fill_mode`. - - Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and - of interger or floating point dtype. By default, the layer will output floats. 
- - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - height_factor: a float represented as fraction of value, or a tuple of size - 2 representing lower and upper bound for shifting vertically. A negative - value means shifting image up, while a positive value means shifting image - down. When represented as a single positive float, this value is used for - both the upper and lower bound. For instance, `height_factor=(-0.2, 0.3)` - results in an output shifted by a random amount in the range - `[-20%, +30%]`. - `height_factor=0.2` results in an output height shifted by a random amount - in the range `[-20%, +20%]`. - width_factor: a float represented as fraction of value, or a tuple of size 2 - representing lower and upper bound for shifting horizontally. A negative - value means shifting image left, while a positive value means shifting - image right. When represented as a single positive float, this value is - used for both the upper and lower bound. For instance, - `width_factor=(-0.2, 0.3)` results in an output shifted left by 20%, and - shifted right by 30%. `width_factor=0.2` results in an output height - shifted left or right by 20%. - fill_mode: Points outside the boundaries of the input are filled according - to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`). - - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by - reflecting about the edge of the last pixel. - - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by - filling all values beyond the edge with the same constant value k = 0. - - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by - wrapping around to the opposite edge. - - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by the - nearest pixel. - interpolation: Interpolation mode. Supported values: `"nearest"`, - `"bilinear"`. - seed: Integer. Used to create a random seed. - fill_value: a float represents the value to be filled outside the boundaries - when `fill_mode="constant"`. - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - """ - - def __init__(self, - height_factor, - width_factor, - fill_mode='reflect', - interpolation='bilinear', - seed=None, - fill_value=0.0, - **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomTranslation').set( - True) - super().__init__(seed=seed, force_generator=True, - **kwargs) - self.height_factor = height_factor - if isinstance(height_factor, (tuple, list)): - self.height_lower = height_factor[0] - self.height_upper = height_factor[1] - else: - self.height_lower = -height_factor - self.height_upper = height_factor - if self.height_upper < self.height_lower: - raise ValueError('`height_factor` cannot have upper bound less than ' - 'lower bound, got {}'.format(height_factor)) - if abs(self.height_lower) > 1. 
or abs(self.height_upper) > 1.: - raise ValueError('`height_factor` must have values between [-1, 1], ' - 'got {}'.format(height_factor)) - - self.width_factor = width_factor - if isinstance(width_factor, (tuple, list)): - self.width_lower = width_factor[0] - self.width_upper = width_factor[1] - else: - self.width_lower = -width_factor - self.width_upper = width_factor - if self.width_upper < self.width_lower: - raise ValueError('`width_factor` cannot have upper bound less than ' - 'lower bound, got {}'.format(width_factor)) - if abs(self.width_lower) > 1. or abs(self.width_upper) > 1.: - raise ValueError('`width_factor` must have values between [-1, 1], ' - 'got {}'.format(width_factor)) - - check_fill_mode_and_interpolation(fill_mode, interpolation) - - self.fill_mode = fill_mode - self.fill_value = fill_value - self.interpolation = interpolation - self.seed = seed - - @tf.function - def augment_image(self, image, transformation): - """Translated inputs with random ops.""" - # The transform op only accepts rank 4 inputs, so if we have an unbatched - # image, we need to temporarily expand dims to a batch. - original_shape = image.shape - inputs = tf.expand_dims(image, 0) - - inputs_shape = tf.shape(inputs) - img_hd = tf.cast(inputs_shape[H_AXIS], tf.float32) - img_wd = tf.cast(inputs_shape[W_AXIS], tf.float32) - height_translation = transformation['height_translation'] - width_translation = transformation['width_translation'] - height_translation = height_translation * img_hd - width_translation = width_translation * img_wd - translations = tf.cast( - tf.concat([width_translation, height_translation], axis=1), - dtype=tf.float32) - output = transform( - inputs, - get_translation_matrix(translations), - interpolation=self.interpolation, - fill_mode=self.fill_mode, - fill_value=self.fill_value) - - output = tf.squeeze(output, 0) - output.set_shape(original_shape) - return output - - def get_random_transformation( - self, image=None, label=None, bounding_box=None): - del image, label, bounding_box - batch_size = 1 - height_translation = self._random_generator.random_uniform( - shape=[batch_size, 1], - minval=self.height_lower, - maxval=self.height_upper, - dtype=tf.float32) - width_translation = self._random_generator.random_uniform( - shape=[batch_size, 1], - minval=self.width_lower, - maxval=self.width_upper, - dtype=tf.float32) - return {'height_translation': height_translation, - 'width_translation': width_translation} - - def _batch_augment(self, inputs): - # Change to vectorized_map for better performance, as well as work around - # issue for different tensorspec between inputs and outputs. - return tf.vectorized_map(self._augment, inputs) - - def augment_label(self, label, transformation): - return label - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'height_factor': self.height_factor, - 'width_factor': self.width_factor, - 'fill_mode': self.fill_mode, - 'fill_value': self.fill_value, - 'interpolation': self.interpolation, - 'seed': self.seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + ```python + random_bright = tf.keras.layers.RandomBrightness(factor=0.2) + # An image with shape [2, 2, 3] + image = [[[1, 2, 3], [4 ,5 ,6]], [[7, 8, 9], [10, 11, 12]]] -def get_translation_matrix(translations, name=None): - """Returns projective transform(s) for the given translation(s). 
- - Args: - translations: A matrix of 2-element lists representing `[dx, dy]` - to translate for each image (for a batch of images). - name: The name of the op. - - Returns: - A tensor of shape `(num_images, 8)` projective transforms which can be given - to `transform`. - """ - with backend.name_scope(name or 'translation_matrix'): - num_translations = tf.shape(translations)[0] - # The translation matrix looks like: - # [[1 0 -dx] - # [0 1 -dy] - # [0 0 1]] - # where the last entry is implicit. - # Translation matrices are always float32. - return tf.concat( - values=[ - tf.ones((num_translations, 1), tf.float32), - tf.zeros((num_translations, 1), tf.float32), - -translations[:, 0, None], - tf.zeros((num_translations, 1), tf.float32), - tf.ones((num_translations, 1), tf.float32), - -translations[:, 1, None], - tf.zeros((num_translations, 2), tf.float32), - ], - axis=1) - - -def transform(images, - transforms, - fill_mode='reflect', - fill_value=0.0, - interpolation='bilinear', - output_shape=None, - name=None): - """Applies the given transform(s) to the image(s). - - Args: - images: A tensor of shape - `(num_images, num_rows, num_columns, num_channels)` (NHWC). The rank must - be statically known (the shape is not `TensorShape(None)`). - transforms: Projective transform matrix/matrices. A vector of length 8 or - tensor of size N x 8. If one row of transforms is [a0, a1, a2, b0, b1, b2, - c0, c1], then it maps the *output* point `(x, y)` to a transformed *input* - point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where - `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to the - transform mapping input points to output points. Note that gradients are - not backpropagated into transformation parameters. - fill_mode: Points outside the boundaries of the input are filled according - to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`). - fill_value: a float represents the value to be filled outside the boundaries - when `fill_mode="constant"`. - interpolation: Interpolation mode. Supported values: `"nearest"`, - `"bilinear"`. - output_shape: Output dimension after the transform, `[height, width]`. - If `None`, output is the same size as input image. - name: The name of the op. - - Fill mode behavior for each valid value is as follows: - - - reflect (d c b a | a b c d | d c b a) - The input is extended by reflecting about the edge of the last pixel. - - - constant (k k k k | a b c d | k k k k) - The input is extended by filling all - values beyond the edge with the same constant value k = 0. - - - wrap (a b c d | a b c d | a b c d) - The input is extended by wrapping around to the opposite edge. - - - nearest (a a a a | a b c d | d d d d) - The input is extended by the nearest pixel. - - Input shape: - 4D tensor with shape: `(samples, height, width, channels)`, - in `"channels_last"` format. - - Output shape: - 4D tensor with shape: `(samples, height, width, channels)`, - in `"channels_last"` format. - - Returns: - Image(s) with the same type and shape as `images`, with the given - transform(s) applied. Transformed coordinates outside of the input image - will be filled with zeros. - - Raises: - TypeError: If `image` is an invalid type. - ValueError: If output shape is not 1-D int32 Tensor. 
- """ - with backend.name_scope(name or 'transform'): - if output_shape is None: - output_shape = tf.shape(images)[1:3] - if not tf.executing_eagerly(): - output_shape_value = tf.get_static_value(output_shape) - if output_shape_value is not None: - output_shape = output_shape_value - - output_shape = tf.convert_to_tensor( - output_shape, tf.int32, name='output_shape') - - if not output_shape.get_shape().is_compatible_with([2]): - raise ValueError('output_shape must be a 1-D Tensor of 2 elements: ' - 'new_height, new_width, instead got ' - '{}'.format(output_shape)) - - fill_value = tf.convert_to_tensor( - fill_value, tf.float32, name='fill_value') - - return tf.raw_ops.ImageProjectiveTransformV3( - images=images, - output_shape=output_shape, - fill_value=fill_value, - transforms=transforms, - fill_mode=fill_mode.upper(), - interpolation=interpolation.upper()) + # Assume we randomly select the factor to be 0.1, then it will apply + # 0.1 * 255 to all the channel + output = random_bright(image, training=True) + # output will be int64 with 25.5 added to each channel and round down. + tf.Tensor([[[26.5, 27.5, 28.5] + [29.5, 30.5, 31.5]] + [[32.5, 33.5, 34.5] + [35.5, 36.5, 37.5]]], + shape=(2, 2, 3), dtype=int64) + ``` + """ -def get_rotation_matrix(angles, image_height, image_width, name=None): - """Returns projective transform(s) for the given angle(s). - - Args: - angles: A scalar angle to rotate all images by, or (for batches of images) a - vector with an angle to rotate each image in the batch. The rank must be - statically known (the shape is not `TensorShape(None)`). - image_height: Height of the image(s) to be transformed. - image_width: Width of the image(s) to be transformed. - name: The name of the op. - - Returns: - A tensor of shape (num_images, 8). Projective transforms which can be given - to operation `image_projective_transform_v2`. If one row of transforms is - [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point - `(x, y)` to a transformed *input* point - `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, - where `k = c0 x + c1 y + 1`. - """ - with backend.name_scope(name or 'rotation_matrix'): - x_offset = ((image_width - 1) - (tf.cos(angles) * - (image_width - 1) - tf.sin(angles) * - (image_height - 1))) / 2.0 - y_offset = ((image_height - 1) - (tf.sin(angles) * - (image_width - 1) + tf.cos(angles) * - (image_height - 1))) / 2.0 - num_angles = tf.shape(angles)[0] - return tf.concat( - values=[ - tf.cos(angles)[:, None], - -tf.sin(angles)[:, None], - x_offset[:, None], - tf.sin(angles)[:, None], - tf.cos(angles)[:, None], - y_offset[:, None], - tf.zeros((num_angles, 2), tf.float32), - ], - axis=1) - - -@keras_export('keras.layers.RandomRotation', - 'keras.layers.experimental.preprocessing.RandomRotation', - v1=[]) -class RandomRotation(BaseImageAugmentationLayer): - """A preprocessing layer which randomly rotates images during training. - - This layer will apply random rotations to each image, filling empty space - according to `fill_mode`. - - By default, random rotations are only applied during training. - At inference time, the layer does nothing. If you need to apply random - rotations at inference time, set `training` to True when calling the layer. - - Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and - of interger or floating point dtype. By default, the layer will output floats. 
- - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format - - Arguments: - factor: a float represented as fraction of 2 Pi, or a tuple of size 2 - representing lower and upper bound for rotating clockwise and - counter-clockwise. A positive values means rotating counter clock-wise, - while a negative value means clock-wise. When represented as a single - float, this value is used for both the upper and lower bound. For - instance, `factor=(-0.2, 0.3)` results in an output rotation by a random - amount in the range `[-20% * 2pi, 30% * 2pi]`. `factor=0.2` results in an - output rotating by a random amount in the range `[-20% * 2pi, 20% * 2pi]`. - fill_mode: Points outside the boundaries of the input are filled according - to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`). - - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by - reflecting about the edge of the last pixel. - - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by - filling all values beyond the edge with the same constant value k = 0. - - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by - wrapping around to the opposite edge. - - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by the - nearest pixel. - interpolation: Interpolation mode. Supported values: `"nearest"`, - `"bilinear"`. - seed: Integer. Used to create a random seed. - fill_value: a float represents the value to be filled outside the boundaries - when `fill_mode="constant"`. - """ - - def __init__(self, - factor, - fill_mode='reflect', - interpolation='bilinear', - seed=None, - fill_value=0.0, - **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomRotation').set( - True) - super().__init__(seed=seed, force_generator=True, - **kwargs) - self.factor = factor - if isinstance(factor, (tuple, list)): - self.lower = factor[0] - self.upper = factor[1] - else: - self.lower = -factor - self.upper = factor - if self.upper < self.lower: - raise ValueError('Factor cannot have negative values, ' - 'got {}'.format(factor)) - check_fill_mode_and_interpolation(fill_mode, interpolation) - self.fill_mode = fill_mode - self.fill_value = fill_value - self.interpolation = interpolation - self.seed = seed - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - min_angle = self.lower * 2. * np.pi - max_angle = self.upper * 2. 
* np.pi - angle = self._random_generator.random_uniform( - shape=[1], minval=min_angle, maxval=max_angle) - return {'angle': angle} - - def augment_image(self, image, transformation): - image = utils.ensure_tensor(image, self.compute_dtype) - original_shape = image.shape - image = tf.expand_dims(image, 0) - image_shape = tf.shape(image) - img_hd = tf.cast(image_shape[H_AXIS], tf.float32) - img_wd = tf.cast(image_shape[W_AXIS], tf.float32) - angle = transformation['angle'] - output = transform( - image, - get_rotation_matrix(angle, img_hd, img_wd), - fill_mode=self.fill_mode, - fill_value=self.fill_value, - interpolation=self.interpolation) - output = tf.squeeze(output, 0) - output.set_shape(original_shape) - return output - - def augment_bounding_boxes(self, image, bounding_boxes, transformation): - image = tf.expand_dims(image, 0) - image_shape = tf.shape(image) - h = image_shape[H_AXIS] - w = image_shape[W_AXIS] - bbox_dtype = bounding_boxes.dtype - # origin coordinates, all the points on the image are rotated around this - # point - origin_x, origin_y = int(h / 2), int(w / 2) - angle = transformation['angle'] - angle = -angle - # calculate coordinates of all four corners of the bounding box - point = tf.stack([ - tf.stack([bounding_boxes[:, 0], bounding_boxes[:, 1]], axis=1), - tf.stack([bounding_boxes[:, 2], bounding_boxes[:, 1]], axis=1), - tf.stack([bounding_boxes[:, 2], bounding_boxes[:, 3]], axis=1), - tf.stack([bounding_boxes[:, 0], bounding_boxes[:, 3]], axis=1)], axis=1) - # point_x : x coordinates of all corners of the bounding box - point_x = tf.gather(point, [0], axis=2) - # point_y : y cordinates of all corners of the bounding box - point_y = tf.gather(point, [1], axis=2) - # rotated bbox coordinates - # new_x : new position of x coordinates of corners of bounding box - new_x = origin_x + tf.multiply(tf.cos(angle), tf.cast( - (point_x - origin_x), dtype=tf.float32)) - tf.multiply( - tf.sin(angle), tf.cast((point_y - origin_y), dtype=tf.float32)) - # new_y : new position of y coordinates of corners of bounding box - new_y = origin_y + tf.multiply(tf.sin(angle), tf.cast( - (point_x - origin_x), dtype=tf.float32)) + tf.multiply( - tf.cos(angle), tf.cast((point_y - origin_y), dtype=tf.float32)) - # rotated bbox coordinates - out = tf.concat([new_x, new_y], axis=2) - # find readjusted coordinates of bounding box to represent it in corners - # format - min_cordinates = tf.math.reduce_min(out, axis=1) - max_cordinates = tf.math.reduce_max(out, axis=1) - bboxes_out = tf.concat([min_cordinates, max_cordinates], axis=1) - # cordinates cannot be float values, it is casted to int32 - bboxes_out = tf.cast(bboxes_out, bbox_dtype) - return bboxes_out - - def augment_label(self, label, transformation): - return label - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'factor': self.factor, - 'fill_mode': self.fill_mode, - 'fill_value': self.fill_value, - 'interpolation': self.interpolation, - 'seed': self.seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.RandomZoom', - 'keras.layers.experimental.preprocessing.RandomZoom', - v1=[]) -class RandomZoom(BaseImageAugmentationLayer): - """A preprocessing layer which randomly zooms images during training. - - This layer will randomly zoom in or out on each axis of an image - independently, filling empty space according to `fill_mode`. - - Input pixel values can be of any range (e.g. 
`[0., 1.)` or `[0, 255]`) and - of interger or floating point dtype. By default, the layer will output floats. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - height_factor: a float represented as fraction of value, or a tuple of size - 2 representing lower and upper bound for zooming vertically. When - represented as a single float, this value is used for both the upper and - lower bound. A positive value means zooming out, while a negative value - means zooming in. For instance, `height_factor=(0.2, 0.3)` result in an - output zoomed out by a random amount in the range `[+20%, +30%]`. - `height_factor=(-0.3, -0.2)` result in an output zoomed in by a random - amount in the range `[+20%, +30%]`. - width_factor: a float represented as fraction of value, or a tuple of size 2 - representing lower and upper bound for zooming horizontally. When - represented as a single float, this value is used for both the upper and - lower bound. For instance, `width_factor=(0.2, 0.3)` result in an output - zooming out between 20% to 30%. `width_factor=(-0.3, -0.2)` result in an - output zooming in between 20% to 30%. Defaults to `None`, i.e., zooming - vertical and horizontal directions by preserving the aspect ratio. - fill_mode: Points outside the boundaries of the input are filled according - to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`). - - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by - reflecting about the edge of the last pixel. - - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by - filling all values beyond the edge with the same constant value k = 0. - - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by - wrapping around to the opposite edge. - - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by the - nearest pixel. - interpolation: Interpolation mode. Supported values: `"nearest"`, - `"bilinear"`. - seed: Integer. Used to create a random seed. - fill_value: a float represents the value to be filled outside the boundaries - when `fill_mode="constant"`. - - Example: - - >>> input_img = np.random.random((32, 224, 224, 3)) - >>> layer = tf.keras.layers.RandomZoom(.5, .2) - >>> out_img = layer(input_img) - >>> out_img.shape - TensorShape([32, 224, 224, 3]) - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - """ - - def __init__(self, - height_factor, - width_factor=None, - fill_mode='reflect', - interpolation='bilinear', - seed=None, - fill_value=0.0, - **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomZoom').set(True) - super().__init__(seed=seed, force_generator=True, **kwargs) - self.height_factor = height_factor - if isinstance(height_factor, (tuple, list)): - self.height_lower = height_factor[0] - self.height_upper = height_factor[1] - else: - self.height_lower = -height_factor - self.height_upper = height_factor - - if abs(self.height_lower) > 1. 
or abs(self.height_upper) > 1.: - raise ValueError('`height_factor` must have values between [-1, 1], ' - 'got {}'.format(height_factor)) - - self.width_factor = width_factor - if width_factor is not None: - if isinstance(width_factor, (tuple, list)): - self.width_lower = width_factor[0] - self.width_upper = width_factor[1] - else: - self.width_lower = -width_factor # pylint: disable=invalid-unary-operand-type - self.width_upper = width_factor - - if self.width_lower < -1. or self.width_upper < -1.: - raise ValueError('`width_factor` must have values larger than -1, ' - 'got {}'.format(width_factor)) - - check_fill_mode_and_interpolation(fill_mode, interpolation) - - self.fill_mode = fill_mode - self.fill_value = fill_value - self.interpolation = interpolation - self.seed = seed - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - height_zoom = self._random_generator.random_uniform( - shape=[1, 1], - minval=1. + self.height_lower, - maxval=1. + self.height_upper) - if self.width_factor is not None: - width_zoom = self._random_generator.random_uniform( - shape=[1, 1], - minval=1. + self.width_lower, - maxval=1. + self.width_upper) - else: - width_zoom = height_zoom - - return {'height_zoom': height_zoom, 'width_zoom': width_zoom} - - def augment_image(self, image, transformation): - image = utils.ensure_tensor(image, self.compute_dtype) - original_shape = image.shape - image = tf.expand_dims(image, 0) - image_shape = tf.shape(image) - img_hd = tf.cast(image_shape[H_AXIS], tf.float32) - img_wd = tf.cast(image_shape[W_AXIS], tf.float32) - width_zoom = transformation['width_zoom'] - height_zoom = transformation['height_zoom'] - zooms = tf.cast( - tf.concat([width_zoom, height_zoom], axis=1), - dtype=tf.float32) - output = transform( - image, - get_zoom_matrix(zooms, img_hd, img_wd), - fill_mode=self.fill_mode, - fill_value=self.fill_value, - interpolation=self.interpolation) - output = tf.squeeze(output, 0) - output.set_shape(original_shape) - return output - - def augment_label(self, label, transformation): - return label - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'height_factor': self.height_factor, - 'width_factor': self.width_factor, - 'fill_mode': self.fill_mode, - 'fill_value': self.fill_value, - 'interpolation': self.interpolation, - 'seed': self.seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + _FACTOR_VALIDATION_ERROR = ( + "The `factor` argument should be a number (or a list of two numbers) " + "in the range [-1.0, 1.0]. " + ) + _VALUE_RANGE_VALIDATION_ERROR = ( + "The `value_range` argument should be a list of two numbers. 
" + ) + def __init__(self, factor, value_range=(0, 255), seed=None, **kwargs): + base_preprocessing_layer.keras_kpl_gauge.get_cell( + "RandomBrightness" + ).set(True) + super().__init__(seed=seed, force_generator=True, **kwargs) + self._set_factor(factor) + self._set_value_range(value_range) + self._seed = seed + + def _set_value_range(self, value_range): + if not isinstance(value_range, (tuple, list)): + raise ValueError( + self._VALUE_RANGE_VALIDATION_ERROR + f"Got {value_range}" + ) + if len(value_range) != 2: + raise ValueError( + self._VALUE_RANGE_VALIDATION_ERROR + f"Got {value_range}" + ) + self._value_range = sorted(value_range) + + def _set_factor(self, factor): + if isinstance(factor, (tuple, list)): + if len(factor) != 2: + raise ValueError( + self._FACTOR_VALIDATION_ERROR + f"Got {factor}" + ) + self._check_factor_range(factor[0]) + self._check_factor_range(factor[1]) + self._factor = sorted(factor) + elif isinstance(factor, (int, float)): + self._check_factor_range(factor) + factor = abs(factor) + self._factor = [-factor, factor] + else: + raise ValueError(self._FACTOR_VALIDATION_ERROR + f"Got {factor}") + + def _check_factor_range(self, input_number): + if input_number > 1.0 or input_number < -1.0: + raise ValueError( + self._FACTOR_VALIDATION_ERROR + f"Got {input_number}" + ) + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs, dtype=self.compute_dtype) + if training: + return self._brightness_adjust(inputs) + else: + return inputs + + def _brightness_adjust(self, images): + rank = images.shape.rank + if rank == 3: + rgb_delta_shape = (1, 1, 1) + elif rank == 4: + # Keep only the batch dim. This will ensure to have same adjustment + # with in one image, but different across the images. + rgb_delta_shape = [tf.shape(images)[0], 1, 1, 1] + else: + raise ValueError( + "Expected the input image to be rank 3 or 4. Got " + f"inputs.shape = {images.shape}" + ) + rgb_delta = self._random_generator.random_uniform( + shape=rgb_delta_shape, + minval=self._factor[0], + maxval=self._factor[1], + ) + rgb_delta = rgb_delta * (self._value_range[1] - self._value_range[0]) + rgb_delta = tf.cast(rgb_delta, images.dtype) + images += rgb_delta + return tf.clip_by_value( + images, self._value_range[0], self._value_range[1] + ) + + def get_config(self): + config = { + "factor": self._factor, + "value_range": self._value_range, + "seed": self._seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export( + "keras.layers.RandomHeight", + "keras.layers.experimental.preprocessing.RandomHeight", + v1=[], +) +class RandomHeight(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly varies image height during training. + + This layer adjusts the height of a batch of images by a random factor. + The input should be a 3D (unbatched) or 4D (batched) tensor in the + `"channels_last"` image data format. Input pixel values can be of any range + (e.g. `[0., 1.)` or `[0, 255]`) and of integer or floating point dtype. By + default, the layer will output floats. + + + By default, this layer is inactive during inference. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). -def get_zoom_matrix(zooms, image_height, image_width, name=None): - """Returns projective transform(s) for the given zoom(s). 
- - Args: - zooms: A matrix of 2-element lists representing `[zx, zy]` to zoom for each - image (for a batch of images). - image_height: Height of the image(s) to be transformed. - image_width: Width of the image(s) to be transformed. - name: The name of the op. - - Returns: - A tensor of shape `(num_images, 8)`. Projective transforms which can be - given to operation `image_projective_transform_v2`. - If one row of transforms is - `[a0, a1, a2, b0, b1, b2, c0, c1]`, then it maps the *output* point - `(x, y)` to a transformed *input* point - `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, - where `k = c0 x + c1 y + 1`. - """ - with backend.name_scope(name or 'zoom_matrix'): - num_zooms = tf.shape(zooms)[0] - # The zoom matrix looks like: - # [[zx 0 0] - # [0 zy 0] - # [0 0 1]] - # where the last entry is implicit. - # Zoom matrices are always float32. - x_offset = ((image_width - 1.) / 2.0) * (1.0 - zooms[:, 0, None]) - y_offset = ((image_height - 1.) / 2.0) * (1.0 - zooms[:, 1, None]) - return tf.concat( - values=[ - zooms[:, 0, None], - tf.zeros((num_zooms, 1), tf.float32), - x_offset, - tf.zeros((num_zooms, 1), tf.float32), - zooms[:, 1, None], - y_offset, - tf.zeros((num_zooms, 2), tf.float32), - ], - axis=1) - - -@keras_export('keras.layers.RandomContrast', - 'keras.layers.experimental.preprocessing.RandomContrast', - v1=[]) -class RandomContrast(BaseImageAugmentationLayer): - """A preprocessing layer which randomly adjusts contrast during training. - - This layer will randomly adjust the contrast of an image or images by a random - factor. Contrast is adjusted independently for each channel of each image - during training. - - For each channel, this layer computes the mean of the image pixels in the - channel and then adjusts each component `x` of each pixel to - `(x - mean) * contrast_factor + mean`. - - Input pixel values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and - in integer or floating point dtype. By default, the layer will output floats. - The output value will be clipped to the range `[0, 255]`, the valid - range of RGB colors. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Arguments: - factor: a positive float represented as fraction of value, or a tuple of - size 2 representing lower and upper bound. When represented as a single - float, lower = upper. The contrast factor will be randomly picked between - `[1.0 - lower, 1.0 + upper]`. For any pixel x in the channel, the output - will be `(x - mean) * factor + mean` where `mean` is the mean value of the - channel. - seed: Integer. Used to create a random seed. - """ - - def __init__(self, factor, seed=None, **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomContrast').set( - True) - super().__init__(seed=seed, force_generator=True, - **kwargs) - self.factor = factor - if isinstance(factor, (tuple, list)): - self.lower = factor[0] - self.upper = factor[1] - else: - self.lower = self.upper = factor - if self.lower < 0. or self.upper < 0. 
or self.lower > 1.: - raise ValueError('Factor cannot have negative values or greater than 1.0,' - ' got {}'.format(factor)) - self.seed = seed - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - lower = 1. - self.lower - upper = 1. + self.upper - random_seed = self._random_generator.make_seed_for_stateless_op() - contrast_factor = stateless_random_ops.stateless_random_uniform( - shape=[], minval=lower, maxval=upper, seed=random_seed) - return {'contrast_factor': contrast_factor} - - def augment_image(self, image, transformation): - contrast_factor = transformation['contrast_factor'] - output = tf.image.adjust_contrast(image, contrast_factor=contrast_factor) - output = tf.clip_by_value(output, 0, 255) - output.set_shape(image.shape) - return output - - def augment_label(self, label, transformation): - return label - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'factor': self.factor, - 'seed': self.seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.RandomBrightness', v1=[]) -class RandomBrightness(BaseImageAugmentationLayer): - """A preprocessing layer which randomly adjusts brightness during training. - - This layer will randomly increase/reduce the brightness for the input RGB - images. At inference time, the output will be identical to the input. - Call the layer with `training=True` to adjust the brightness of the input. - - Note that different brightness adjustment factors - will be apply to each the images in the batch. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - factor: Float or a list/tuple of 2 floats between -1.0 and 1.0. The - factor is used to determine the lower bound and upper bound of the - brightness adjustment. A float value will be chosen randomly between - the limits. When -1.0 is chosen, the output image will be black, and - when 1.0 is chosen, the image will be fully white. When only one float - is provided, eg, 0.2, then -0.2 will be used for lower bound and 0.2 - will be used for upper bound. - value_range: Optional list/tuple of 2 floats for the lower and upper limit - of the values of the input data. Defaults to [0.0, 255.0]. Can be changed - to e.g. [0.0, 1.0] if the image input has been scaled before this layer. - The brightness adjustment will be scaled to this range, and the - output values will be clipped to this range. - seed: optional integer, for fixed RNG behavior. - - Inputs: 3D (HWC) or 4D (NHWC) tensor, with float or int dtype. Input pixel - values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) - - Output: 3D (HWC) or 4D (NHWC) tensor with brightness adjusted based on the - `factor`. By default, the layer will output floats. The output value will - be clipped to the range `[0, 255]`, the valid range of RGB colors, and - rescaled based on the `value_range` if needed. - - Sample usage: - - ```python - random_bright = tf.keras.layers.RandomBrightness(factor=0.2) - - # An image with shape [2, 2, 3] - image = [[[1, 2, 3], [4 ,5 ,6]], [[7, 8, 9], [10, 11, 12]]] - - # Assume we randomly select the factor to be 0.1, then it will apply - # 0.1 * 255 to all the channel - output = random_bright(image, training=True) - - # output will be int64 with 25.5 added to each channel and round down. 
- tf.Tensor([[[26.5, 27.5, 28.5] - [29.5, 30.5, 31.5]] - [[32.5, 33.5, 34.5] - [35.5, 36.5, 37.5]]], - shape=(2, 2, 3), dtype=int64) - ``` - """ - _FACTOR_VALIDATION_ERROR = ( - 'The `factor` argument should be a number (or a list of two numbers) ' - 'in the range [-1.0, 1.0]. ') - _VALUE_RANGE_VALIDATION_ERROR = ( - 'The `value_range` argument should be a list of two numbers. ') - - def __init__(self, factor, value_range=(0, 255), seed=None, **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomBrightness').set( - True) - super().__init__(seed=seed, force_generator=True, **kwargs) - self._set_factor(factor) - self._set_value_range(value_range) - self._seed = seed - - def augment_image(self, image, transformation): - return self._brightness_adjust(image, transformation['rgb_delta']) - - def augment_label(self, label, transformation): - return label - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - rgb_delta_shape = (1, 1, 1) - random_rgb_delta = self._random_generator.random_uniform( - shape=rgb_delta_shape, - minval=self._factor[0], - maxval=self._factor[1], - ) - random_rgb_delta = random_rgb_delta * ( - self._value_range[1] - self._value_range[0]) - return {'rgb_delta': random_rgb_delta} - - def _set_value_range(self, value_range): - if not isinstance(value_range, (tuple, list)): - raise ValueError( - self._VALUE_RANGE_VALIDATION_ERROR + f'Got {value_range}') - if len(value_range) != 2: - raise ValueError( - self._VALUE_RANGE_VALIDATION_ERROR + f'Got {value_range}') - self._value_range = sorted(value_range) - - def _set_factor(self, factor): - if isinstance(factor, (tuple, list)): - if len(factor) != 2: - raise ValueError(self._FACTOR_VALIDATION_ERROR + f'Got {factor}') - self._check_factor_range(factor[0]) - self._check_factor_range(factor[1]) - self._factor = sorted(factor) - elif isinstance(factor, (int, float)): - self._check_factor_range(factor) - factor = abs(factor) - self._factor = [-factor, factor] - else: - raise ValueError(self._FACTOR_VALIDATION_ERROR + f'Got {factor}') - - def _check_factor_range(self, input_number): - if input_number > 1.0 or input_number < -1.0: - raise ValueError(self._FACTOR_VALIDATION_ERROR + f'Got {input_number}') - - def _brightness_adjust(self, image, rgb_delta): - image = utils.ensure_tensor(image, self.compute_dtype) - rank = image.shape.rank - if rank != 3: - raise ValueError( - 'Expected the input image to be rank 3. Got ' - f'inputs.shape = {image.shape}') - rgb_delta = tf.cast(rgb_delta, image.dtype) - image += rgb_delta - return tf.clip_by_value( - image, self._value_range[0], self._value_range[1]) - - def get_config(self): - config = { - 'factor': self._factor, - 'value_range': self._value_range, - 'seed': self._seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.RandomHeight', - 'keras.layers.experimental.preprocessing.RandomHeight', - v1=[]) -class RandomHeight(BaseImageAugmentationLayer): - """A preprocessing layer which randomly varies image height during training. - - This layer adjusts the height of a batch of images by a random factor. - The input should be a 3D (unbatched) or 4D (batched) tensor in the - `"channels_last"` image data format. Input pixel values can be of any range - (e.g. `[0., 1.)` or `[0, 255]`) and of interger or floating point dtype. By - default, the layer will output floats. - - - By default, this layer is inactive during inference. 
- - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - factor: A positive float (fraction of original height), or a tuple of size 2 - representing lower and upper bound for resizing vertically. When - represented as a single float, this value is used for both the upper and - lower bound. For instance, `factor=(0.2, 0.3)` results in an output with - height changed by a random amount in the range `[20%, 30%]`. - `factor=(-0.2, 0.3)` results in an output with height changed by a random - amount in the range `[-20%, +30%]`. `factor=0.2` results in an output with - height changed by a random amount in the range `[-20%, +20%]`. - interpolation: String, the interpolation method. Defaults to `"bilinear"`. - Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`, - `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`. - seed: Integer. Used to create a random seed. - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., random_height, width, channels)`. - """ - - def __init__(self, - factor, - interpolation='bilinear', - seed=None, - **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomHeight').set(True) - super().__init__(seed=seed, force_generator=True, - **kwargs) - self.factor = factor - if isinstance(factor, (tuple, list)): - self.height_lower = factor[0] - self.height_upper = factor[1] - else: - self.height_lower = -factor - self.height_upper = factor - - if self.height_upper < self.height_lower: - raise ValueError('`factor` cannot have upper bound less than ' - 'lower bound, got {}'.format(factor)) - if self.height_lower < -1. or self.height_upper < -1.: - raise ValueError('`factor` must have values larger than -1, ' - 'got {}'.format(factor)) - self.interpolation = interpolation - self._interpolation_method = image_utils.get_interpolation(interpolation) - self.seed = seed - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - height_factor = self._random_generator.random_uniform( - shape=[], - minval=(1.0 + self.height_lower), - maxval=(1.0 + self.height_upper)) - inputs_shape = tf.shape(image) - img_hd = tf.cast(inputs_shape[H_AXIS], tf.float32) - adjusted_height = tf.cast(height_factor * img_hd, tf.int32) - return {'height': adjusted_height} - - def _batch_augment(self, inputs): - images = self.augment_image( - inputs[IMAGES], - transformation=self.get_random_transformation(image=inputs[IMAGES])) - result = {IMAGES: images} - # to-do augment bbox to clip bbox to resized height value - return result - - def augment_image(self, image, transformation): - # The batch dimension of the input=image is not modified. The output would - # be accurate for both unbatched and batched input - inputs_shape = tf.shape(image) - img_wd = inputs_shape[W_AXIS] - adjusted_height = transformation['height'] - adjusted_size = tf.stack([adjusted_height, img_wd]) - output = tf.image.resize( - images=image, size=adjusted_size, method=self._interpolation_method) - # tf.resize will output float32 in many cases regardless of input type. 
- output = tf.cast(output, self.compute_dtype) - output_shape = list(image.shape) - output_shape[H_AXIS] = None - output.set_shape(output_shape) - return output - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - input_shape[H_AXIS] = None - return tf.TensorShape(input_shape) - - def get_config(self): - config = { - 'factor': self.factor, - 'interpolation': self.interpolation, - 'seed': self.seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.RandomWidth', - 'keras.layers.experimental.preprocessing.RandomWidth', - v1=[]) -class RandomWidth(BaseImageAugmentationLayer): - """A preprocessing layer which randomly varies image width during training. - - This layer will randomly adjusts the width of a batch of images of a - batch of images by a random factor. The input should be a 3D (unbatched) or - 4D (batched) tensor in the `"channels_last"` image data format. Input pixel - values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and of interger or - floating point dtype. By default, the layer will output floats. - - By default, this layer is inactive during inference. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - factor: A positive float (fraction of original width), or a tuple of size 2 - representing lower and upper bound for resizing vertically. When - represented as a single float, this value is used for both the upper and - lower bound. For instance, `factor=(0.2, 0.3)` results in an output with - width changed by a random amount in the range `[20%, 30%]`. `factor=(-0.2, - 0.3)` results in an output with width changed by a random amount in the - range `[-20%, +30%]`. `factor=0.2` results in an output with width changed - by a random amount in the range `[-20%, +20%]`. - interpolation: String, the interpolation method. Defaults to `bilinear`. - Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`, `"lanczos3"`, - `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`. - seed: Integer. Used to create a random seed. - - Input shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, width, channels)`, in `"channels_last"` format. - - Output shape: - 3D (unbatched) or 4D (batched) tensor with shape: - `(..., height, random_width, channels)`. - """ - - def __init__(self, - factor, - interpolation='bilinear', - seed=None, - **kwargs): - base_preprocessing_layer.keras_kpl_gauge.get_cell('RandomWidth').set(True) - super().__init__(seed=seed, force_generator=True, **kwargs) - self.factor = factor - if isinstance(factor, (tuple, list)): - self.width_lower = factor[0] - self.width_upper = factor[1] - else: - self.width_lower = -factor - self.width_upper = factor - if self.width_upper < self.width_lower: - raise ValueError('`factor` cannot have upper bound less than ' - 'lower bound, got {}'.format(factor)) - if self.width_lower < -1. 
or self.width_upper < -1.: - raise ValueError('`factor` must have values larger than -1, ' - 'got {}'.format(factor)) - self.interpolation = interpolation - self._interpolation_method = image_utils.get_interpolation(interpolation) - self.seed = seed - self.auto_vectorize = False - - def _batch_augment(self, inputs): - images = self.augment_image( - inputs[IMAGES], - transformation=self.get_random_transformation(image=inputs[IMAGES])) - result = {IMAGES: images} - # to-do augment bbox to clip bbox to resized width value - return result - - def augment_image(self, image, transformation): - # The batch dimension of the input=image is not modified. The output would - # be accurate for both unbatched and batched input - inputs = utils.ensure_tensor(image) - inputs_shape = tf.shape(inputs) - img_hd = inputs_shape[H_AXIS] - adjusted_width = transformation['width'] - adjusted_size = tf.stack([img_hd, adjusted_width]) - output = tf.image.resize( - images=inputs, size=adjusted_size, method=self._interpolation_method) - # tf.resize will output float32 in many cases regardless of input type. - output = tf.cast(output, self.compute_dtype) - output_shape = inputs.shape.as_list() - output_shape[W_AXIS] = None - output.set_shape(output_shape) - return output - - def get_random_transformation(self, - image=None, - label=None, - bounding_box=None): - inputs_shape = tf.shape(image) - img_wd = tf.cast(inputs_shape[W_AXIS], tf.float32) - width_factor = self._random_generator.random_uniform( - shape=[], - minval=(1.0 + self.width_lower), - maxval=(1.0 + self.width_upper)) - adjusted_width = tf.cast(width_factor * img_wd, tf.int32) - return {'width': adjusted_width} - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - input_shape[W_AXIS] = None - return tf.TensorShape(input_shape) - - def get_config(self): - config = { - 'factor': self.factor, - 'interpolation': self.interpolation, - 'seed': self.seed, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + Args: + factor: A positive float (fraction of original height), + or a tuple of size 2 representing lower and upper bound + for resizing vertically. When represented as a single float, + this value is used for both the upper and + lower bound. For instance, `factor=(0.2, 0.3)` results + in an output with + height changed by a random amount in the range `[20%, 30%]`. + `factor=(-0.2, 0.3)` results in an output with height + changed by a random amount in the range `[-20%, +30%]`. + `factor=0.2` results in an output with + height changed by a random amount in the range `[-20%, +20%]`. + interpolation: String, the interpolation method. + Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`, + `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`. + Defaults to `"bilinear"`. + seed: Integer. Used to create a random seed. + + Input shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., height, width, channels)`, in `"channels_last"` format. + + Output shape: + 3D (unbatched) or 4D (batched) tensor with shape: + `(..., random_height, width, channels)`. 
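To make the `factor` semantics above concrete, a short usage sketch (shapes and seed are arbitrary; `training=True` forces the augmentation, which is otherwise skipped at inference):

```python
import tensorflow as tf

images = tf.random.uniform((2, 8, 8, 3))  # batch of two 8x8 RGB images

# factor=0.2: output height is drawn uniformly from [80%, 120%] of 8.
layer = tf.keras.layers.RandomHeight(factor=0.2, seed=42)
out = layer(images, training=True)
print(out.shape)  # (2, h, 8, 3) with h varying per call; width is untouched
```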
+ """ + + def __init__(self, factor, interpolation="bilinear", seed=None, **kwargs): + base_preprocessing_layer.keras_kpl_gauge.get_cell("RandomHeight").set( + True + ) + super().__init__(seed=seed, force_generator=True, **kwargs) + self.factor = factor + if isinstance(factor, (tuple, list)): + self.height_lower = factor[0] + self.height_upper = factor[1] + else: + self.height_lower = -factor + self.height_upper = factor + + if self.height_upper < self.height_lower: + raise ValueError( + "`factor` argument cannot have an upper bound lesser than the " + f"lower bound. Received: factor={factor}" + ) + if self.height_lower < -1.0 or self.height_upper < -1.0: + raise ValueError( + "`factor` argument must have values larger than -1. " + f"Received: factor={factor}" + ) + self.interpolation = interpolation + self._interpolation_method = image_utils.get_interpolation( + interpolation + ) + self.seed = seed + + def call(self, inputs, training=True): + inputs = convert_inputs(inputs) + + def random_height_inputs(inputs): + """Inputs height-adjusted with random ops.""" + inputs_shape = tf.shape(inputs) + img_hd = tf.cast(inputs_shape[H_AXIS], tf.float32) + img_wd = inputs_shape[W_AXIS] + height_factor = self._random_generator.random_uniform( + shape=[], + minval=(1.0 + self.height_lower), + maxval=(1.0 + self.height_upper), + ) + adjusted_height = tf.cast(height_factor * img_hd, tf.int32) + adjusted_size = tf.stack([adjusted_height, img_wd]) + output = tf.image.resize( + images=inputs, + size=adjusted_size, + method=self._interpolation_method, + ) + # tf.resize will output float32 regardless of input type. + output = tf.cast(output, self.compute_dtype) + output_shape = inputs.shape.as_list() + output_shape[H_AXIS] = None + output.set_shape(output_shape) + return output + + if training: + return random_height_inputs(inputs) + else: + return inputs + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + input_shape[H_AXIS] = None + return tf.TensorShape(input_shape) + + def get_config(self): + config = { + "factor": self.factor, + "interpolation": self.interpolation, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export( + "keras.layers.RandomWidth", + "keras.layers.experimental.preprocessing.RandomWidth", + v1=[], +) +class RandomWidth(base_layer.BaseRandomLayer): + """A preprocessing layer which randomly varies image width during training. + + This layer will randomly adjusts the width of a batch of images of a + batch of images by a random factor. The input should be a 3D (unbatched) or + 4D (batched) tensor in the `"channels_last"` image data format. Input pixel + values can be of any range (e.g. `[0., 1.)` or `[0, 255]`) and of integer or + floating point dtype. By default, the layer will output floats. + + By default, this layer is inactive during inference. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Args: + factor: A positive float (fraction of original width), + or a tuple of size 2 representing lower and upper bound + for resizing horizontally. When represented as a single float, + this value is used for both the upper and + lower bound. For instance, `factor=(0.2, 0.3)` + results in an output with + width changed by a random amount in the range `[20%, 30%]`. 
+            `factor=(-0.2, 0.3)` results in an output with width changed +            by a random amount in the range `[-20%, +30%]`. +            `factor=0.2` results in an output with width changed +            by a random amount in the range `[-20%, +20%]`. +        interpolation: String, the interpolation method. +            Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`, +            `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`. +            Defaults to `"bilinear"`. +        seed: Integer. Used to create a random seed. + +    Input shape: +        3D (unbatched) or 4D (batched) tensor with shape: +        `(..., height, width, channels)`, in `"channels_last"` format. + +    Output shape: +        3D (unbatched) or 4D (batched) tensor with shape: +        `(..., height, random_width, channels)`. +    """ + +    def __init__(self, factor, interpolation="bilinear", seed=None, **kwargs): +        base_preprocessing_layer.keras_kpl_gauge.get_cell("RandomWidth").set( +            True +        ) +        super().__init__(seed=seed, force_generator=True, **kwargs) +        self.factor = factor +        if isinstance(factor, (tuple, list)): +            self.width_lower = factor[0] +            self.width_upper = factor[1] +        else: +            self.width_lower = -factor +            self.width_upper = factor +        if self.width_upper < self.width_lower: +            raise ValueError( +                "`factor` argument cannot have an upper bound less than the " +                f"lower bound. Received: factor={factor}" +            ) +        if self.width_lower < -1.0 or self.width_upper < -1.0: +            raise ValueError( +                "`factor` argument must have values larger than -1. " +                f"Received: factor={factor}" +            ) +        self.interpolation = interpolation +        self._interpolation_method = image_utils.get_interpolation( +            interpolation +        ) +        self.seed = seed + +    def call(self, inputs, training=True): +        inputs = convert_inputs(inputs) + +        def random_width_inputs(inputs): +            """Inputs width-adjusted with random ops.""" +            inputs_shape = tf.shape(inputs) +            img_hd = inputs_shape[H_AXIS] +            img_wd = tf.cast(inputs_shape[W_AXIS], tf.float32) +            width_factor = self._random_generator.random_uniform( +                shape=[], +                minval=(1.0 + self.width_lower), +                maxval=(1.0 + self.width_upper), +            ) +            adjusted_width = tf.cast(width_factor * img_wd, tf.int32) +            adjusted_size = tf.stack([img_hd, adjusted_width]) +            output = tf.image.resize( +                images=inputs, +                size=adjusted_size, +                method=self._interpolation_method, +            ) +            # tf.resize will output float32 regardless of input type. +            output = tf.cast(output, self.compute_dtype) +            output_shape = inputs.shape.as_list() +            output_shape[W_AXIS] = None +            output.set_shape(output_shape) +            return output + +        if training: +            return random_width_inputs(inputs) +        else: +            return inputs + +    def compute_output_shape(self, input_shape): +        input_shape = tf.TensorShape(input_shape).as_list() +        input_shape[W_AXIS] = None +        return tf.TensorShape(input_shape) + +    def get_config(self): +        config = { +            "factor": self.factor, +            "interpolation": self.interpolation, +            "seed": self.seed, +        } +        base_config = super().get_config() +        return dict(list(base_config.items()) + list(config.items())) + + +def convert_inputs(inputs, dtype=None): +    if isinstance(inputs, dict): +        raise ValueError( +            "This layer can only process a tensor representing an image or " +            f"a batch of images. Received: type(inputs)={type(inputs)}. " +            "If you need to pass a dict containing " +            "images, labels, and bounding boxes, you should " +            "instead use the preprocessing and augmentation layers " +            "from `keras_cv.layers`.
See docs at " + "https://keras.io/api/keras_cv/layers/" + ) + inputs = utils.ensure_tensor(inputs, dtype=dtype) + return inputs diff --git a/keras/layers/preprocessing/image_preprocessing_distribution_test.py b/keras/layers/preprocessing/image_preprocessing_distribution_test.py index 1a71b8ce5a2d..9383de95e0e7 100644 --- a/keras/layers/preprocessing/image_preprocessing_distribution_test.py +++ b/keras/layers/preprocessing/image_preprocessing_distribution_test.py @@ -14,52 +14,60 @@ # ============================================================================== """Distribution tests for keras.layers.preprocessing.image_preprocessing.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.distribute import strategy_combinations from keras.layers.preprocessing import image_preprocessing from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies, - mode=["eager"])) + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies, + mode=["eager"], + ) +) class ImagePreprocessingDistributionTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_distribution(self, strategy): - if "CentralStorage" in type(strategy).__name__: - self.skipTest("Does not work with CentralStorageStrategy yet.") - # TODO(b/159738418): large image input causes OOM in ubuntu multi gpu. - np_images = np.random.random((32, 32, 32, 3)).astype(np.float32) - image_dataset = tf.data.Dataset.from_tensor_slices(np_images).batch( - 16, drop_remainder=True) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_distribution(self, strategy): + if "CentralStorage" in type(strategy).__name__: + self.skipTest("Does not work with CentralStorageStrategy yet.") + # TODO(b/159738418): large image input causes OOM in ubuntu multi gpu. 
+ np_images = np.random.random((32, 32, 32, 3)).astype(np.float32) + image_dataset = tf.data.Dataset.from_tensor_slices(np_images).batch( + 16, drop_remainder=True + ) - with strategy.scope(): - input_data = keras.Input(shape=(32, 32, 3), dtype=tf.float32) - image_preprocessor = keras.Sequential([ - image_preprocessing.Resizing(height=256, width=256), - image_preprocessing.RandomCrop(height=224, width=224), - image_preprocessing.RandomTranslation(.1, .1), - image_preprocessing.RandomBrightness(.1, value_range=(0, 1)), - image_preprocessing.RandomRotation(.2), - image_preprocessing.RandomFlip(), - image_preprocessing.RandomZoom(.2, .2)]) - preprocessed_image = image_preprocessor(input_data) - flatten_layer = keras.layers.Flatten(data_format="channels_last") - output = flatten_layer(preprocessed_image) - cls_layer = keras.layers.Dense(units=1, activation="sigmoid") - output = cls_layer(output) - model = keras.Model(inputs=input_data, outputs=output) - _ = model.predict(image_dataset) + with strategy.scope(): + input_data = keras.Input(shape=(32, 32, 3), dtype=tf.float32) + image_preprocessor = keras.Sequential( + [ + image_preprocessing.Resizing(height=256, width=256), + image_preprocessing.RandomCrop(height=224, width=224), + image_preprocessing.RandomTranslation(0.1, 0.1), + image_preprocessing.RandomBrightness( + 0.1, value_range=(0, 1) + ), + image_preprocessing.RandomRotation(0.2), + image_preprocessing.RandomFlip(), + image_preprocessing.RandomZoom(0.2, 0.2), + ] + ) + preprocessed_image = image_preprocessor(input_data) + flatten_layer = keras.layers.Flatten(data_format="channels_last") + output = flatten_layer(preprocessed_image) + cls_layer = keras.layers.Dense(units=1, activation="sigmoid") + output = cls_layer(output) + model = keras.Model(inputs=input_data, outputs=output) + _ = model.predict(image_dataset) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/layers/preprocessing/image_preprocessing_test.py b/keras/layers/preprocessing/image_preprocessing_test.py index 413bb43cd6f8..8385e6cdace2 100644 --- a/keras/layers/preprocessing/image_preprocessing_test.py +++ b/keras/layers/preprocessing/image_preprocessing_test.py @@ -15,6 +15,9 @@ """Tests for image preprocessing layers.""" import functools + +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized import keras @@ -22,2229 +25,2284 @@ from keras.layers.preprocessing import image_preprocessing from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.ops import stateless_random_ops @test_combinations.run_all_keras_modes(always_skip_v1=True) class ResizingTest(test_combinations.TestCase): - - def _run_test(self, kwargs, expected_height, expected_width): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - kwargs.update({'height': expected_height, 'width': expected_width}) - with test_utils.use_gpu(): - test_utils.layer_test( - image_preprocessing.Resizing, - kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - expected_output_shape=(None, expected_height, expected_width, - channels)) - - @parameterized.named_parameters(('down_sample_bilinear_2_by_2', { - 'interpolation': 'bilinear' - }, 2, 2), ('down_sample_bilinear_3_by_2', { - 'interpolation': 'bilinear' - }, 3, 2), 
('down_sample_nearest_2_by_2', { - 'interpolation': 'nearest' - }, 2, 2), ('down_sample_nearest_3_by_2', { - 'interpolation': 'nearest' - }, 3, 2), ('down_sample_area_2_by_2', { - 'interpolation': 'area' - }, 2, 2), ('down_sample_area_3_by_2', { - 'interpolation': 'area' - }, 3, 2), ('down_sample_crop_to_aspect_ratio_3_by_2', { - 'interpolation': 'bilinear', - 'crop_to_aspect_ratio': True, - }, 3, 2)) - def test_down_sampling(self, kwargs, expected_height, expected_width): - self._run_test(kwargs, expected_height, expected_width) - - @parameterized.named_parameters(('up_sample_bilinear_10_by_12', { - 'interpolation': 'bilinear' - }, 10, 12), ('up_sample_bilinear_12_by_12', { - 'interpolation': 'bilinear' - }, 12, 12), ('up_sample_nearest_10_by_12', { - 'interpolation': 'nearest' - }, 10, 12), ('up_sample_nearest_12_by_12', { - 'interpolation': 'nearest' - }, 12, 12), ('up_sample_area_10_by_12', { - 'interpolation': 'area' - }, 10, 12), ('up_sample_area_12_by_12', { - 'interpolation': 'area' - }, 12, 12), ('up_sample_crop_to_aspect_ratio_12_by_14', { - 'interpolation': 'bilinear', - 'crop_to_aspect_ratio': True, - }, 12, 14)) - def test_up_sampling(self, kwargs, expected_height, expected_width): - self._run_test(kwargs, expected_height, expected_width) - - def test_down_sampling_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 16), (1, 4, 4, 1)).astype(dtype) - layer = image_preprocessing.Resizing( - height=2, width=2, interpolation='nearest') - output_image = layer(input_image) - # pyformat: disable - expected_output = np.asarray([ - [5, 7], - [13, 15] - ]).astype(dtype) - # pyformat: enable - expected_output = np.reshape(expected_output, (1, 2, 2, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_up_sampling_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 4), (1, 2, 2, 1)).astype(dtype) + def _run_test(self, kwargs, expected_height, expected_width): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + kwargs.update({"height": expected_height, "width": expected_width}) + with test_utils.use_gpu(): + test_utils.layer_test( + image_preprocessing.Resizing, + kwargs=kwargs, + input_shape=(num_samples, orig_height, orig_width, channels), + expected_output_shape=( + None, + expected_height, + expected_width, + channels, + ), + ) + + @parameterized.named_parameters( + ("down_sample_bilinear_2_by_2", {"interpolation": "bilinear"}, 2, 2), + ("down_sample_bilinear_3_by_2", {"interpolation": "bilinear"}, 3, 2), + ("down_sample_nearest_2_by_2", {"interpolation": "nearest"}, 2, 2), + ("down_sample_nearest_3_by_2", {"interpolation": "nearest"}, 3, 2), + ("down_sample_area_2_by_2", {"interpolation": "area"}, 2, 2), + ("down_sample_area_3_by_2", {"interpolation": "area"}, 3, 2), + ( + "down_sample_crop_to_aspect_ratio_3_by_2", + { + "interpolation": "bilinear", + "crop_to_aspect_ratio": True, + }, + 3, + 2, + ), + ) + def test_down_sampling(self, kwargs, expected_height, expected_width): + self._run_test(kwargs, expected_height, expected_width) + + @parameterized.named_parameters( + ("up_sample_bilinear_10_by_12", {"interpolation": "bilinear"}, 10, 12), + ("up_sample_bilinear_12_by_12", {"interpolation": "bilinear"}, 12, 12), + ("up_sample_nearest_10_by_12", {"interpolation": "nearest"}, 10, 12), + ("up_sample_nearest_12_by_12", {"interpolation": "nearest"}, 12, 12), + ("up_sample_area_10_by_12", 
{"interpolation": "area"}, 10, 12), + ("up_sample_area_12_by_12", {"interpolation": "area"}, 12, 12), + ( + "up_sample_crop_to_aspect_ratio_12_by_14", + { + "interpolation": "bilinear", + "crop_to_aspect_ratio": True, + }, + 12, + 14, + ), + ) + def test_up_sampling(self, kwargs, expected_height, expected_width): + self._run_test(kwargs, expected_height, expected_width) + + def test_down_sampling_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 16), (1, 4, 4, 1)).astype( + dtype + ) + layer = image_preprocessing.Resizing( + height=2, width=2, interpolation="nearest" + ) + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([[5, 7], [13, 15]]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 2, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_up_sampling_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 4), (1, 2, 2, 1)).astype( + dtype + ) + layer = image_preprocessing.Resizing( + height=4, width=4, interpolation="nearest" + ) + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray( + [[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 3, 3], [2, 2, 3, 3]] + ).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 4, 4, 1)) + self.assertAllEqual(expected_output, output_image) + + @parameterized.named_parameters( + ("reshape_bilinear_10_by_4", {"interpolation": "bilinear"}, 10, 4) + ) + def test_reshaping(self, kwargs, expected_height, expected_width): + self._run_test(kwargs, expected_height, expected_width) + + def test_invalid_interpolation(self): + with self.assertRaises(NotImplementedError): + image_preprocessing.Resizing(5, 5, "invalid_interpolation") + + def test_config_with_custom_name(self): + layer = image_preprocessing.Resizing(5, 5, name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.Resizing.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_crop_to_aspect_ratio(self): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 16), (1, 4, 4, 1)).astype( + "float32" + ) + layer = image_preprocessing.Resizing( + 4, 2, crop_to_aspect_ratio=True + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [1, 2], + [5, 6], + [9, 10], + [13, 14], + ] + ).astype("float32") + expected_output = np.reshape(expected_output, (1, 4, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_unbatched_image(self): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 16), (4, 4, 1)).astype( + "float32" + ) + layer = image_preprocessing.Resizing(2, 2, interpolation="nearest") + output_image = layer(input_image) + expected_output = np.asarray( + [ + [5, 7], + [13, 15], + ] + ).astype("float32") + expected_output = np.reshape(expected_output, (2, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + @parameterized.named_parameters( + ("crop_to_aspect_ratio_false", False), + ("crop_to_aspect_ratio_true", True), + ) + def test_ragged_image(self, crop_to_aspect_ratio): + with test_utils.use_gpu(): + inputs = tf.ragged.constant( + [ + np.ones((8, 8, 1)), + np.ones((8, 4, 1)), + np.ones((4, 8, 1)), + np.ones((2, 2, 1)), + ], + dtype="float32", + ) + layer = image_preprocessing.Resizing( + 2, + 2, + interpolation="nearest", + crop_to_aspect_ratio=crop_to_aspect_ratio, + ) + outputs = 
layer(inputs) + expected_output = [ + [[[1.0], [1.0]], [[1.0], [1.0]]], + [[[1.0], [1.0]], [[1.0], [1.0]]], + [[[1.0], [1.0]], [[1.0], [1.0]]], + [[[1.0], [1.0]], [[1.0], [1.0]]], + ] + self.assertIsInstance(outputs, tf.Tensor) + self.assertNotIsInstance(outputs, tf.RaggedTensor) + self.assertAllEqual(expected_output, outputs) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.Resizing(2, 2) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.Resizing(2, 2, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") + + @parameterized.named_parameters( + ("batch_crop_to_aspect_ratio", True, True), + ("batch_dont_crop_to_aspect_ratio", False, True), + ("single_sample_crop_to_aspect_ratio", True, False), + ("single_sample_dont_crop_to_aspect_ratio", False, False), + ) + def test_static_shape_inference(self, crop_to_aspect_ratio, batch): + channels = 3 + input_height = 8 + input_width = 8 + target_height = 4 + target_width = 6 layer = image_preprocessing.Resizing( - height=4, width=4, interpolation='nearest') - output_image = layer(input_image) - # pyformat: disable - expected_output = np.asarray([ - [0, 0, 1, 1], - [0, 0, 1, 1], - [2, 2, 3, 3], - [2, 2, 3, 3] - ]).astype(dtype) - # pyformat: enable - expected_output = np.reshape(expected_output, (1, 4, 4, 1)) - self.assertAllEqual(expected_output, output_image) - - @parameterized.named_parameters(('reshape_bilinear_10_by_4', { - 'interpolation': 'bilinear' - }, 10, 4)) - def test_reshaping(self, kwargs, expected_height, expected_width): - self._run_test(kwargs, expected_height, expected_width) - - def test_invalid_interpolation(self): - with self.assertRaises(NotImplementedError): - image_preprocessing.Resizing(5, 5, 'invalid_interpolation') - - def test_config_with_custom_name(self): - layer = image_preprocessing.Resizing(5, 5, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.Resizing.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_crop_to_aspect_ratio(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 16), (1, 4, 4, 1)).astype('float32') - layer = image_preprocessing.Resizing(4, 2, crop_to_aspect_ratio=True) - output_image = layer(input_image) - expected_output = np.asarray([ - [1, 2], - [5, 6], - [9, 10], - [13, 14], - ]).astype('float32') - expected_output = np.reshape(expected_output, (1, 4, 2, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_unbatched_image(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 16), (4, 4, 1)).astype('float32') - layer = image_preprocessing.Resizing(2, 2, interpolation='nearest') - output_image = layer(input_image) - expected_output = np.asarray([ - [5, 7], - [13, 15], - ]).astype('float32') - expected_output = np.reshape(expected_output, (2, 2, 1)) - self.assertAllEqual(expected_output, output_image) - - @parameterized.named_parameters(('crop_to_aspect_ratio_false', False), - ('crop_to_aspect_ratio_true', True)) - def test_ragged_image(self, crop_to_aspect_ratio): - with test_utils.use_gpu(): - inputs = tf.ragged.constant([ - np.ones((8, 8, 1)), - np.ones((8, 4, 1)), - np.ones((4, 8, 1)), - np.ones((2, 2, 1)), - ], dtype='float32') - layer = image_preprocessing.Resizing( - 2, - 2, - interpolation='nearest', - crop_to_aspect_ratio=crop_to_aspect_ratio) - outputs = layer(inputs) - expected_output = [[[[1.], [1.]], [[1.], [1.]]], - 
[[[1.], [1.]], [[1.], [1.]]], - [[[1.], [1.]], [[1.], [1.]]], - [[[1.], [1.]], [[1.], [1.]]]] - self.assertIsInstance(outputs, tf.Tensor) - self.assertNotIsInstance(outputs, tf.RaggedTensor) - self.assertAllEqual(expected_output, outputs) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.Resizing(2, 2) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.Resizing(2, 2, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') - - @parameterized.named_parameters( - ('batch_crop_to_aspect_ratio', True, True), - ('batch_dont_crop_to_aspect_ratio', False, True), - ('single_sample_crop_to_aspect_ratio', True, False), - ('single_sample_dont_crop_to_aspect_ratio', False, False), - ) - def test_static_shape_inference(self, crop_to_aspect_ratio, batch): - channels = 3 - input_height = 8 - input_width = 8 - target_height = 4 - target_width = 6 - layer = image_preprocessing.Resizing( - target_height, target_width, crop_to_aspect_ratio=crop_to_aspect_ratio) - unit_test = self - - @tf.function - def tf_function(img): - unit_test.assertListEqual([input_height, input_width, channels], - img.shape.as_list()[-3:]) - img = layer(img) - unit_test.assertListEqual([target_height, target_width, channels], - img.shape.as_list()[-3:]) - return img - - with test_utils.use_gpu(): - if batch: - input_shape = (2, input_height, input_width, channels) - else: - input_shape = (input_height, input_width, channels) - img_data = np.random.random(size=input_shape).astype('float32') - tf_function(img_data) + target_height, + target_width, + crop_to_aspect_ratio=crop_to_aspect_ratio, + ) + unit_test = self + + @tf.function + def tf_function(img): + unit_test.assertListEqual( + [input_height, input_width, channels], img.shape.as_list()[-3:] + ) + img = layer(img) + unit_test.assertListEqual( + [target_height, target_width, channels], + img.shape.as_list()[-3:], + ) + return img + + with test_utils.use_gpu(): + if batch: + input_shape = (2, input_height, input_width, channels) + else: + input_shape = (input_height, input_width, channels) + img_data = np.random.random(size=input_shape).astype("float32") + tf_function(img_data) def get_numpy_center_crop(images, expected_height, expected_width): - orig_height = images.shape[1] - orig_width = images.shape[2] - height_start = int((orig_height - expected_height) / 2) - width_start = int((orig_width - expected_width) / 2) - height_end = height_start + expected_height - width_end = width_start + expected_width - return images[:, height_start:height_end, width_start:width_end, :] + orig_height = images.shape[1] + orig_width = images.shape[2] + height_start = int((orig_height - expected_height) / 2) + width_start = int((orig_width - expected_width) / 2) + height_end = height_start + expected_height + width_end = width_start + expected_width + return images[:, height_start:height_end, width_start:width_end, :] @test_combinations.run_all_keras_modes(always_skip_v1=True) class CenterCropTest(test_combinations.TestCase): - - def _run_test(self, expected_height, expected_width): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - kwargs = {'height': expected_height, 'width': expected_width} - input_images = np.random.random( - (num_samples, orig_height, orig_width, channels)).astype(np.float32) - expected_output = get_numpy_center_crop(input_images, expected_height, - expected_width) - with test_utils.use_gpu(): 
- test_utils.layer_test( - image_preprocessing.CenterCrop, - kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - input_data=input_images, - expected_output=expected_output, - expected_output_shape=(None, expected_height, expected_width, - channels)) - - @parameterized.named_parameters(('center_crop_3_by_4', 3, 4), - ('center_crop_3_by_2', 3, 2)) - def test_center_crop_aligned(self, expected_height, expected_width): - self._run_test(expected_height, expected_width) - - @parameterized.named_parameters(('center_crop_4_by_5', 4, 5), - ('center_crop_4_by_3', 4, 3)) - def test_center_crop_mis_aligned(self, expected_height, expected_width): - self._run_test(expected_height, expected_width) - - @parameterized.named_parameters(('center_crop_4_by_6', 4, 6), - ('center_crop_3_by_2', 3, 2)) - def test_center_crop_half_mis_aligned(self, expected_height, expected_width): - self._run_test(expected_height, expected_width) - - def test_input_smaller_than_crop_box(self): - np.random.seed(1337) - height, width = 10, 8 - inp = np.random.random((12, 3, 3, 3)) - with test_utils.use_gpu(): - layer = image_preprocessing.CenterCrop(height, width) - actual_output = layer(inp) - # In this case, output should equal resizing with crop_to_aspect ratio. - resize_layer = image_preprocessing.Resizing( - height, width, crop_to_aspect_ratio=True) - expected_output = resize_layer(inp) - self.assertAllEqual(expected_output, actual_output) - - def test_config_with_custom_name(self): - layer = image_preprocessing.CenterCrop(5, 5, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.CenterCrop.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_unbatched_image(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 16), (4, 4, 1)).astype('float32') - layer = image_preprocessing.CenterCrop(2, 2) - output_image = layer(input_image) - expected_output = np.asarray([ - [5, 6], - [9, 10], - ]).astype('float32') - expected_output = np.reshape(expected_output, (2, 2, 1)) - self.assertAllEqual(expected_output, output_image) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.CenterCrop(2, 2) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.CenterCrop(2, 2, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') + def _run_test(self, expected_height, expected_width): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + kwargs = {"height": expected_height, "width": expected_width} + input_images = np.random.random( + (num_samples, orig_height, orig_width, channels) + ).astype(np.float32) + expected_output = get_numpy_center_crop( + input_images, expected_height, expected_width + ) + with test_utils.use_gpu(): + test_utils.layer_test( + image_preprocessing.CenterCrop, + kwargs=kwargs, + input_shape=(num_samples, orig_height, orig_width, channels), + input_data=input_images, + expected_output=expected_output, + expected_output_shape=( + None, + expected_height, + expected_width, + channels, + ), + ) + + @parameterized.named_parameters( + ("center_crop_3_by_4", 3, 4), ("center_crop_3_by_2", 3, 2) + ) + def test_center_crop_aligned(self, expected_height, expected_width): + self._run_test(expected_height, expected_width) + + @parameterized.named_parameters( + ("center_crop_4_by_5", 4, 5), ("center_crop_4_by_3", 4, 3) + ) + def 
test_center_crop_mis_aligned(self, expected_height, expected_width): + self._run_test(expected_height, expected_width) + + @parameterized.named_parameters( + ("center_crop_4_by_6", 4, 6), ("center_crop_3_by_2", 3, 2) + ) + def test_center_crop_half_mis_aligned( + self, expected_height, expected_width + ): + self._run_test(expected_height, expected_width) + + def test_input_smaller_than_crop_box(self): + np.random.seed(1337) + height, width = 10, 8 + inp = np.random.random((12, 3, 3, 3)) + with test_utils.use_gpu(): + layer = image_preprocessing.CenterCrop(height, width) + actual_output = layer(inp) + # In this case, output should equal resizing + # with crop_to_aspect ratio. + resize_layer = image_preprocessing.Resizing( + height, width, crop_to_aspect_ratio=True + ) + expected_output = resize_layer(inp) + self.assertAllEqual(expected_output, actual_output) + + def test_config_with_custom_name(self): + layer = image_preprocessing.CenterCrop(5, 5, name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.CenterCrop.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_unbatched_image(self): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 16), (4, 4, 1)).astype( + "float32" + ) + layer = image_preprocessing.CenterCrop(2, 2) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [5, 6], + [9, 10], + ] + ).astype("float32") + expected_output = np.reshape(expected_output, (2, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.CenterCrop(2, 2) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.CenterCrop(2, 2, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") @test_combinations.run_all_keras_modes(always_skip_v1=True) class RandomCropTest(test_combinations.TestCase): - - def _run_test(self, expected_height, expected_width): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - kwargs = {'height': expected_height, 'width': expected_width} - with test_utils.use_gpu(): - test_utils.layer_test( - image_preprocessing.RandomCrop, - kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - expected_output_shape=(None, expected_height, expected_width, - channels)) - - def test_input_smaller_than_crop_box(self): - np.random.seed(1337) - height, width = 10, 8 - inp = np.random.random((12, 3, 3, 3)) - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(height, width) - actual_output = layer(inp) - # In this case, output should equal resizing with crop_to_aspect ratio. 
- resize_layer = image_preprocessing.Resizing( - height, width, crop_to_aspect_ratio=True) - expected_output = resize_layer(inp) - self.assertAllEqual(expected_output, actual_output) - - def test_training_with_mock(self): - np.random.seed(1337) - height, width = 3, 4 - height_offset = np.random.randint(low=0, high=3) - width_offset = np.random.randint(low=0, high=5) - mock_offset = [height_offset, width_offset] - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(height, width) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_offset): - inp = np.random.random((12, 5, 8, 3)) - actual_output = layer(inp, training=True) - expected_output = inp[:, height_offset:(height_offset + height), - width_offset:(width_offset + width), :] - self.assertAllClose(expected_output, actual_output) - - @parameterized.named_parameters(('random_crop_4_by_6', 4, 6), - ('random_crop_3_by_2', 3, 2)) - def test_random_crop_output_shape(self, expected_height, expected_width): - self._run_test(expected_height, expected_width) - - def test_random_crop_full_height(self): - self._run_test(5, 2) - - def test_random_crop_full_width(self): - self._run_test(3, 8) - - def test_random_crop_full(self): - np.random.seed(1337) - height, width = 8, 16 - inp = np.random.random((12, 8, 16, 3)) - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(height, width) - actual_output = layer(inp, training=False) - self.assertAllClose(inp, actual_output) - - def test_predicting_with_mock_longer_height(self): - np.random.seed(1337) - height, width = 3, 3 - inp = np.random.random((12, 10, 6, 3)) - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(height, width) - actual_output = layer(inp, training=False) - resized_inp = tf.image.resize(inp, size=[5, 3]) - expected_output = resized_inp[:, 1:4, :, :] - self.assertAllClose(expected_output, actual_output) - - def test_predicting_with_mock_longer_width(self): - np.random.seed(1337) - height, width = 4, 6 - inp = np.random.random((12, 8, 16, 3)) - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(height, width) - actual_output = layer(inp, training=False) - resized_inp = tf.image.resize(inp, size=[4, 8]) - expected_output = resized_inp[:, :, 1:7, :] - self.assertAllClose(expected_output, actual_output) - - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomCrop(5, 5, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomCrop.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_unbatched_image(self): - np.random.seed(1337) - inp = np.random.random((16, 16, 3)) - mock_offset = [2, 2] - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(8, 8) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, - 'random_uniform', - return_value=mock_offset): - actual_output = layer(inp, training=True) - self.assertAllClose(inp[2:10, 2:10, :], actual_output) - - def test_batched_input(self): - np.random.seed(1337) - inp = np.random.random((20, 16, 16, 3)) - mock_offset = [2, 2] - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(8, 8) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_offset): - actual_output = layer(inp, training=True) - self.assertAllClose(inp[:, 2:10, 2:10, :], actual_output) - - def test_augment_image(self): - np.random.seed(1337) - inp = np.random.random((16, 16, 3)) - mock_offset 
= [2, 2] - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(8, 8) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_offset): - actual_output = layer.augment_image( - inp, transformation=layer.get_random_transformation(image=inp)) - self.assertAllClose(inp[2:10, 2:10, :], actual_output) - - def test_training_false(self): - np.random.seed(1337) - height, width = 4, 6 - inp = np.random.random((12, 8, 16, 3)) - inp_dict = {'images': inp} - with test_utils.use_gpu(): - layer = image_preprocessing.RandomCrop(height, width) - # test wih tensor input - actual_output = layer(inp, training=False) - resized_inp = tf.image.resize(inp, size=[4, 8]) - expected_output = resized_inp[:, :, 1:7, :] - self.assertAllClose(expected_output, actual_output) - # test with dictionary input - actual_output = layer(inp_dict, training=False) - resized_inp = tf.image.resize(inp, size=[4, 8]) - expected_output = resized_inp[:, :, 1:7, :] - self.assertAllClose(expected_output, actual_output['images']) - - @test_utils.run_v2_only - def test_uint8_input(self): - inputs = keras.Input((128, 128, 3), batch_size=2, dtype=tf.uint8) - layer = image_preprocessing.RandomCrop(64, 64) - self.assertAllEqual(layer(inputs).dtype, 'float32') - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomCrop(2, 2) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomCrop(2, 2, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') + def _run_test(self, expected_height, expected_width): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + kwargs = {"height": expected_height, "width": expected_width} + with test_utils.use_gpu(): + test_utils.layer_test( + image_preprocessing.RandomCrop, + kwargs=kwargs, + input_shape=(num_samples, orig_height, orig_width, channels), + expected_output_shape=( + None, + expected_height, + expected_width, + channels, + ), + ) + + def test_input_smaller_than_crop_box(self): + np.random.seed(1337) + height, width = 10, 8 + inp = np.random.random((12, 3, 3, 3)) + with test_utils.use_gpu(): + layer = image_preprocessing.RandomCrop(height, width) + actual_output = layer(inp) + # In this case, output should equal resizing + # with crop_to_aspect ratio. 
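The comment above is worth unpacking: when the input is smaller than the crop box there is nothing to crop randomly, so `RandomCrop` falls back to an aspect-preserving resize. A standalone sketch of the equality the test asserts (sizes arbitrary):

```python
import numpy as np
import tensorflow as tf

small = np.random.random((2, 3, 3, 3)).astype("float32")  # below the crop size
cropped = tf.keras.layers.RandomCrop(10, 8)(small)
resized = tf.keras.layers.Resizing(10, 8, crop_to_aspect_ratio=True)(small)

# Upsampling leaves nothing to randomize, so the two paths agree exactly.
np.testing.assert_allclose(cropped.numpy(), resized.numpy())
```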
+ resize_layer = image_preprocessing.Resizing( + height, width, crop_to_aspect_ratio=True + ) + expected_output = resize_layer(inp) + self.assertAllEqual(expected_output, actual_output) + + def test_training_with_mock(self): + np.random.seed(1337) + height, width = 3, 4 + height_offset = np.random.randint(low=0, high=3) + width_offset = np.random.randint(low=0, high=5) + mock_offset = [height_offset, width_offset] + with test_utils.use_gpu(): + layer = image_preprocessing.RandomCrop(height, width) + with tf.compat.v1.test.mock.patch.object( + layer._random_generator, + "random_uniform", + return_value=mock_offset, + ): + inp = np.random.random((12, 5, 8, 3)) + actual_output = layer(inp, training=True) + expected_output = inp[ + :, + height_offset : (height_offset + height), + width_offset : (width_offset + width), + :, + ] + self.assertAllClose(expected_output, actual_output) + + @parameterized.named_parameters( + ("random_crop_4_by_6", 4, 6), ("random_crop_3_by_2", 3, 2) + ) + def test_random_crop_output_shape(self, expected_height, expected_width): + self._run_test(expected_height, expected_width) + + def test_random_crop_full_height(self): + self._run_test(5, 2) + + def test_random_crop_full_width(self): + self._run_test(3, 8) + + def test_random_crop_full(self): + np.random.seed(1337) + height, width = 8, 16 + inp = np.random.random((12, 8, 16, 3)) + with test_utils.use_gpu(): + layer = image_preprocessing.RandomCrop(height, width) + actual_output = layer(inp, training=False) + self.assertAllClose(inp, actual_output) + + def test_predicting_with_mock_longer_height(self): + np.random.seed(1337) + height, width = 3, 3 + inp = np.random.random((12, 10, 6, 3)) + with test_utils.use_gpu(): + layer = image_preprocessing.RandomCrop(height, width) + actual_output = layer(inp, training=False) + resized_inp = tf.image.resize(inp, size=[5, 3]) + expected_output = resized_inp[:, 1:4, :, :] + self.assertAllClose(expected_output, actual_output) + + def test_predicting_with_mock_longer_width(self): + np.random.seed(1337) + height, width = 4, 6 + inp = np.random.random((12, 8, 16, 3)) + with test_utils.use_gpu(): + layer = image_preprocessing.RandomCrop(height, width) + actual_output = layer(inp, training=False) + resized_inp = tf.image.resize(inp, size=[4, 8]) + expected_output = resized_inp[:, :, 1:7, :] + self.assertAllClose(expected_output, actual_output) + + def test_config_with_custom_name(self): + layer = image_preprocessing.RandomCrop(5, 5, name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.RandomCrop.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_unbatched_image(self): + np.random.seed(1337) + inp = np.random.random((16, 16, 3)) + mock_offset = [2, 2] + with test_utils.use_gpu(): + layer = image_preprocessing.RandomCrop(8, 8) + with tf.compat.v1.test.mock.patch.object( + layer._random_generator, + "random_uniform", + return_value=mock_offset, + ): + actual_output = layer(inp, training=True) + self.assertAllClose(inp[2:10, 2:10, :], actual_output) + + @test_utils.run_v2_only + def test_uint8_input(self): + inputs = keras.Input((128, 128, 3), batch_size=2, dtype=tf.uint8) + layer = image_preprocessing.RandomCrop(64, 64) + self.assertAllEqual(layer(inputs).dtype, "float32") + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.RandomCrop(2, 2) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = 
image_preprocessing.RandomCrop(2, 2, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") class RescalingTest(test_combinations.TestCase): - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_rescaling_base(self): - kwargs = {'scale': 1. / 127.5, 'offset': -1.} - test_utils.layer_test( - image_preprocessing.Rescaling, - kwargs=kwargs, - input_shape=(2, 5, 6, 3), - expected_output_shape=(None, 5, 6, 3)) - - @test_utils.run_v2_only - def test_rescaling_correctness_float(self): - layer = image_preprocessing.Rescaling(scale=1. / 127.5, offset=-1.) - inputs = tf.random.uniform((2, 4, 5, 3)) - outputs = layer(inputs) - self.assertAllClose(outputs.numpy(), inputs.numpy() * (1. / 127.5) - 1) - - @test_utils.run_v2_only - def test_rescaling_correctness_int(self): - layer = image_preprocessing.Rescaling(scale=1. / 127.5, offset=-1) - inputs = tf.random.uniform((2, 4, 5, 3), 0, 100, dtype='int32') - outputs = layer(inputs) - self.assertEqual(outputs.dtype.name, 'float32') - self.assertAllClose(outputs.numpy(), inputs.numpy() * (1. / 127.5) - 1) - - def test_config_with_custom_name(self): - layer = image_preprocessing.Rescaling(0.5, name='rescaling') - config = layer.get_config() - layer_1 = image_preprocessing.Rescaling.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_unbatched_image(self): - layer = image_preprocessing.Rescaling(scale=1. / 127.5, offset=-1) - inputs = tf.random.uniform((4, 5, 3)) - outputs = layer(inputs) - self.assertAllClose(outputs.numpy(), inputs.numpy() * (1. / 127.5) - 1) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.Rescaling(0.5) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.Rescaling(0.5, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_rescaling_base(self): + kwargs = {"scale": 1.0 / 127.5, "offset": -1.0} + test_utils.layer_test( + image_preprocessing.Rescaling, + kwargs=kwargs, + input_shape=(2, 5, 6, 3), + expected_output_shape=(None, 5, 6, 3), + ) + + @test_utils.run_v2_only + def test_rescaling_correctness_float(self): + layer = image_preprocessing.Rescaling(scale=1.0 / 127.5, offset=-1.0) + inputs = tf.random.uniform((2, 4, 5, 3)) + outputs = layer(inputs) + self.assertAllClose(outputs.numpy(), inputs.numpy() * (1.0 / 127.5) - 1) + + @test_utils.run_v2_only + def test_rescaling_correctness_int(self): + layer = image_preprocessing.Rescaling(scale=1.0 / 127.5, offset=-1) + inputs = tf.random.uniform((2, 4, 5, 3), 0, 100, dtype="int32") + outputs = layer(inputs) + self.assertEqual(outputs.dtype.name, "float32") + self.assertAllClose(outputs.numpy(), inputs.numpy() * (1.0 / 127.5) - 1) + + def test_config_with_custom_name(self): + layer = image_preprocessing.Rescaling(0.5, name="rescaling") + config = layer.get_config() + layer_1 = image_preprocessing.Rescaling.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_unbatched_image(self): + layer = image_preprocessing.Rescaling(scale=1.0 / 127.5, offset=-1) + inputs = tf.random.uniform((4, 5, 3)) + outputs = layer(inputs) + self.assertAllClose(outputs.numpy(), inputs.numpy() * (1.0 / 127.5) - 1) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], 
[2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.Rescaling(0.5) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.Rescaling(0.5, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") @test_combinations.run_all_keras_modes(always_skip_v1=True) class RandomFlipTest(test_combinations.TestCase): - - def _run_test(self, mode, expected_output=None, mock_random=None): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - if mock_random is None: - mock_random = [True for _ in range(num_samples)] - if mode == 'horizontal_and_vertical': - mock_random *= 2 - inp = np.random.random((num_samples, orig_height, orig_width, channels)) - if expected_output is None: - expected_output = inp - if mode == 'horizontal' or mode == 'horizontal_and_vertical': - expected_output = np.flip(expected_output, axis=2) - if mode == 'vertical' or mode == 'horizontal_and_vertical': - expected_output = np.flip(expected_output, axis=1) - with tf.compat.v1.test.mock.patch.object( - np.random, - 'choice', - side_effect=mock_random, - ): - with test_utils.use_gpu(): - layer = image_preprocessing.RandomFlip(mode) - actual_output = layer(inp, training=True) - self.assertAllClose(expected_output, actual_output) - - @parameterized.named_parameters( - ('random_flip_horizontal', 'horizontal'), - ('random_flip_vertical', 'vertical'), - ('random_flip_both', 'horizontal_and_vertical')) - def test_random_flip(self, mode): - self._run_test(mode) - - def test_random_flip_horizontal_half(self): - np.random.seed(1337) - mock_random = [True, False] - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images.copy() - expected_output[0, :, :, :] = np.flip(input_images[0, :, :, :], axis=1) - self._run_test('horizontal', expected_output, mock_random) - - def test_random_flip_vertical_half(self): - np.random.seed(1337) - mock_random = [True, False] - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images.copy() - expected_output[0, :, :, :] = np.flip(input_images[0, :, :, :], axis=0) - self._run_test('vertical', expected_output, mock_random) - - def test_random_flip_inference(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images - with test_utils.use_gpu(): - layer = image_preprocessing.RandomFlip() - actual_output = layer(input_images, training=False) - self.assertAllClose(expected_output, actual_output) - - def test_random_flip_default(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = np.flip(np.flip(input_images, axis=1), axis=2) - mock_random = [True, True, True, True] - with tf.compat.v1.test.mock.patch.object( - np.random, - 'choice', - side_effect=mock_random, - ): - with self.cached_session(): + def _run_test(self, mode, expected_output=None, mock_random=None): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + if mock_random is None: + mock_random = [1 for _ in range(num_samples)] + mock_random = np.reshape(mock_random, [2, 1, 1, 1]) + inp = np.random.random((num_samples, orig_height, orig_width, channels)) + if expected_output is None: + expected_output = inp + if mode == "horizontal" or mode == "horizontal_and_vertical": + expected_output = np.flip(expected_output, axis=2) + if mode == "vertical" or mode == "horizontal_and_vertical": + expected_output = np.flip(expected_output, axis=1) + with 
tf.compat.v1.test.mock.patch.object( + stateless_random_ops, + "stateless_random_uniform", + return_value=mock_random, + ): + with test_utils.use_gpu(): + layer = image_preprocessing.RandomFlip(mode) + actual_output = layer(inp, training=True) + self.assertAllClose(expected_output, actual_output) + + @parameterized.named_parameters( + ("random_flip_horizontal", "horizontal"), + ("random_flip_vertical", "vertical"), + ("random_flip_both", "horizontal_and_vertical"), + ) + def test_random_flip(self, mode): + self._run_test(mode) + + def test_random_flip_horizontal_half(self): + np.random.seed(1337) + mock_random = [1, 0] + mock_random = np.reshape(mock_random, [2, 1, 1, 1]) + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images.copy() + expected_output[0, :, :, :] = np.flip(input_images[0, :, :, :], axis=1) + self._run_test("horizontal", expected_output, mock_random) + + def test_random_flip_vertical_half(self): + np.random.seed(1337) + mock_random = [1, 0] + mock_random = np.reshape(mock_random, [2, 1, 1, 1]) + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images.copy() + expected_output[0, :, :, :] = np.flip(input_images[0, :, :, :], axis=0) + self._run_test("vertical", expected_output, mock_random) + + def test_random_flip_inference(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images + with test_utils.use_gpu(): + layer = image_preprocessing.RandomFlip() + actual_output = layer(input_images, training=False) + self.assertAllClose(expected_output, actual_output) + + def test_random_flip_default(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = np.flip(np.flip(input_images, axis=1), axis=2) + mock_random = [1, 1] + mock_random = np.reshape(mock_random, [2, 1, 1, 1]) + with tf.compat.v1.test.mock.patch.object( + stateless_random_ops, + "stateless_random_uniform", + return_value=mock_random, + ): + with self.cached_session(): + layer = image_preprocessing.RandomFlip() + actual_output = layer(input_images, training=True) + self.assertAllClose(expected_output, actual_output) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): + layer = image_preprocessing.RandomFlip(name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.RandomFlip.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_random_flip_unbatched_image(self): + input_image = np.random.random((4, 4, 1)).astype(np.float32) + expected_output = np.flip(input_image, axis=0) + # mock_random = np.reshape([0.], [1, 1, 1]) + with tf.compat.v1.test.mock.patch.object( + stateless_random_ops, + "stateless_random_uniform", + return_value=0.0, + ): + with self.cached_session(): + layer = image_preprocessing.RandomFlip("vertical") + actual_output = layer(input_image, training=True) + self.assertAllClose(expected_output, actual_output) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") layer = image_preprocessing.RandomFlip() - actual_output = layer(input_images, training=True) - self.assertAllClose(expected_output, actual_output) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomFlip(name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomFlip.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def 
test_random_flip_unbatched_image(self): - input_image = np.random.random((4, 4, 1)).astype(np.float32) - expected_output = np.flip(input_image, axis=0) - mock_random = [True, True, True, True] - with tf.compat.v1.test.mock.patch.object( - np.random, - 'choice', - side_effect=mock_random, - ): - with self.cached_session(): - layer = image_preprocessing.RandomFlip('vertical') - actual_output = layer(input_image, training=True) - self.assertAllClose(expected_output, actual_output) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomFlip() - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomFlip(dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') - - @test_utils.run_v2_only - def test_augment_bbox_horizontal(self): - image = tf.zeros([1, 20, 20, 3]) - bboxes = np.array([[0, 0, 10, 10], [4, 4, 12, 12]], dtype='int32') - layer = image_preprocessing.RandomFlip() - output = layer.augment_bounding_boxes( - image, - bboxes, - transformation={ - 'flip_horizontal': True, - 'flip_vertical': False - }) - expected_output = [[10, 0, 20, 10], [8, 4, 16, 12]] - self.assertAllClose(expected_output, output) - - @test_utils.run_v2_only - def test_augment_bbox_vertical(self): - image = tf.zeros([1, 20, 20, 3]) - bboxes = np.array([[0, 0, 10, 10], [4, 4, 12, 12]], dtype='int32') - layer = image_preprocessing.RandomFlip() - output = layer.augment_bounding_boxes( - image, - bboxes, - transformation={ - 'flip_horizontal': False, - 'flip_vertical': True - }) - expected_output = [[0, 10, 10, 20], [4, 8, 12, 16]] - self.assertAllClose(expected_output, output) - - @test_utils.run_v2_only - def test_augment_bbox_both(self): - image = tf.zeros([1, 20, 20, 3]) - bboxes = np.array([[0, 0, 10, 10], [4, 4, 12, 12]], dtype='int32') - layer = image_preprocessing.RandomFlip() - output = layer.augment_bounding_boxes( - image, - bboxes, - transformation={ - 'flip_horizontal': True, - 'flip_vertical': True - }) - expected_output = [[10, 10, 20, 20], [8, 8, 16, 16]] - self.assertAllClose(expected_output, output) - - @test_utils.run_v2_only - def test_augment_bbox_batched_input(self): - image = tf.zeros([20, 20, 3]) - bboxes = np.array( - [[[0, 0, 10, 10], [4, 4, 12, 12]], [[0, 0, 10, 10], [4, 4, 12, 12]]], - dtype='int32') - input = {'images': [image, image], 'bounding_boxes': bboxes} - mock_random = [True, True, True, True] - with tf.compat.v1.test.mock.patch.object( - np.random, - 'choice', - side_effect=mock_random, - ): - layer = image_preprocessing.RandomFlip() - output = layer(input, training=True) - expected_output = [[[10, 10, 20, 20], [8, 8, 16, 16]], - [[10, 10, 20, 20], [8, 8, 16, 16]]] - self.assertAllClose(expected_output, output['bounding_boxes']) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.RandomFlip(dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") + @test_combinations.run_all_keras_modes(always_skip_v1=True) class RandomContrastTest(test_combinations.TestCase): - - def _run_test(self, lower, upper, expected_output=None, mock_random=None): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - if mock_random is None: - mock_random = 0.2 - inp = np.random.random((num_samples, orig_height, orig_width, channels)) - if expected_output is None: - # reduce mean on height. - inp_mean = np.mean(inp, axis=1, keepdims=True) - # reduce mean on width. 
- inp_mean = np.mean(inp_mean, axis=2, keepdims=True) - expected_output = (inp - inp_mean) * mock_random + inp_mean - with tf.compat.v1.test.mock.patch.object( - stateless_random_ops, - 'stateless_random_uniform', - return_value=mock_random, - ): - with test_utils.use_gpu(): - layer = image_preprocessing.RandomContrast((lower, upper)) - actual_output = layer(inp, training=True) - self.assertAllClose(expected_output, actual_output) - - @parameterized.named_parameters(('random_contrast_2_by_5', 0.2, 0.5), - ('random_contrast_2_by_13', 0.2, 1.3), - ('random_contrast_5_by_2', 0.5, 0.2), - ('random_contrast_10_by_10', 1.0, 1.0)) - def test_random_contrast(self, lower, upper): - self._run_test(lower, upper) - - @parameterized.named_parameters(('random_contrast_amplitude_2', 0.2), - ('random_contrast_amplitude_5', 0.5)) - def test_random_contrast_amplitude(self, amplitude): - input_images = np.random.random((2, 5, 8, 3)) - with test_utils.use_gpu(): - layer = image_preprocessing.RandomContrast(amplitude) - layer(input_images) - - def test_random_contrast_inference(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images - with test_utils.use_gpu(): - layer = image_preprocessing.RandomContrast((0.1, 0.2)) - actual_output = layer(input_images, training=False) - self.assertAllClose(expected_output, actual_output) - - def test_random_contrast_int_dtype(self): - input_images = np.random.randint(low=0, high=255, size=(2, 5, 8, 3)) - with test_utils.use_gpu(): - layer = image_preprocessing.RandomContrast((0.1, 0.2)) - layer(input_images) - - def test_random_contrast_invalid_bounds(self): - with self.assertRaises(ValueError): - image_preprocessing.RandomContrast((-0.1, .5)) - - with self.assertRaises(ValueError): - image_preprocessing.RandomContrast((1.1, .5)) - - with self.assertRaises(ValueError): - image_preprocessing.RandomContrast((0.1, -0.2)) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomContrast((.5, .6), name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomContrast.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_output_value_clip(self): - input_images = np.random.random((5, 8, 3)).astype(np.float32) * 255.0 - # Give a factor range [1.0, 11.0] so that it will produce large contrast. 
- layer = image_preprocessing.RandomContrast((0.0, 10.0)) - output = layer(input_images) - self.assertLessEqual(tf.reduce_max(output), 255.0) - self.assertGreaterEqual(tf.reduce_min(output), 0.0) - - def test_unbatched_image(self): - np.random.seed(1337) - mock_random = 0.2 - inp = np.random.random((4, 4, 1)) - inp_mean = np.mean(inp, axis=0, keepdims=True) - inp_mean = np.mean(inp_mean, axis=1, keepdims=True) - expected_output = (inp - inp_mean) * mock_random + inp_mean - with tf.compat.v1.test.mock.patch.object( - stateless_random_ops, - 'stateless_random_uniform', - return_value=mock_random, - ): - with test_utils.use_gpu(): - layer = image_preprocessing.RandomContrast((0.2, 0.5)) - actual_output = layer(inp, training=True) - self.assertAllClose(expected_output, actual_output) - - def test_augment_image(self): - np.random.seed(1337) - mock_random = 0.2 - inp = np.random.random((4, 4, 1)) - inp_mean = np.mean(inp, axis=0, keepdims=True) - inp_mean = np.mean(inp_mean, axis=1, keepdims=True) - expected_output = (inp - inp_mean) * mock_random + inp_mean - with tf.compat.v1.test.mock.patch.object( - stateless_random_ops, - 'stateless_random_uniform', - return_value=mock_random, - ): - with test_utils.use_gpu(): - layer = image_preprocessing.RandomContrast((0.2, 0.5)) - actual_output = layer.augment_image( - inp, transformation=layer.get_random_transformation()) - self.assertAllClose(expected_output, actual_output) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomContrast((.5, .6)) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomContrast((.5, .6), dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') + def _run_test(self, lower, upper, expected_output=None, mock_random=None): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + if mock_random is None: + mock_random = 0.2 + inp = np.random.random((num_samples, orig_height, orig_width, channels)) + if expected_output is None: + # reduce mean on height. + inp_mean = np.mean(inp, axis=1, keepdims=True) + # reduce mean on width. 
+ inp_mean = np.mean(inp_mean, axis=2, keepdims=True) + expected_output = (inp - inp_mean) * mock_random + inp_mean + with tf.compat.v1.test.mock.patch.object( + stateless_random_ops, + "stateless_random_uniform", + return_value=mock_random, + ): + with test_utils.use_gpu(): + layer = image_preprocessing.RandomContrast((lower, upper)) + actual_output = layer(inp, training=True) + self.assertAllClose(expected_output, actual_output) + + @parameterized.named_parameters( + ("random_contrast_2_by_5", 0.2, 0.5), + ("random_contrast_2_by_13", 0.2, 1.3), + ("random_contrast_5_by_2", 0.5, 0.2), + ("random_contrast_10_by_10", 1.0, 1.0), + ) + def test_random_contrast(self, lower, upper): + self._run_test(lower, upper) + + @parameterized.named_parameters( + ("random_contrast_amplitude_2", 0.2), + ("random_contrast_amplitude_5", 0.5), + ) + def test_random_contrast_amplitude(self, amplitude): + input_images = np.random.random((2, 5, 8, 3)) + with test_utils.use_gpu(): + layer = image_preprocessing.RandomContrast(amplitude) + layer(input_images) + + def test_random_contrast_inference(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images + with test_utils.use_gpu(): + layer = image_preprocessing.RandomContrast((0.1, 0.2)) + actual_output = layer(input_images, training=False) + self.assertAllClose(expected_output, actual_output) + + def test_random_contrast_int_dtype(self): + input_images = np.random.randint(low=0, high=255, size=(2, 5, 8, 3)) + with test_utils.use_gpu(): + layer = image_preprocessing.RandomContrast((0.1, 0.2)) + layer(input_images) + + def test_random_contrast_invalid_bounds(self): + with self.assertRaises(ValueError): + image_preprocessing.RandomContrast((-0.1, 0.5)) + + with self.assertRaises(ValueError): + image_preprocessing.RandomContrast((1.1, 0.5)) + + with self.assertRaises(ValueError): + image_preprocessing.RandomContrast((0.1, -0.2)) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): + layer = image_preprocessing.RandomContrast( + (0.5, 0.6), name="image_preproc" + ) + config = layer.get_config() + layer_1 = image_preprocessing.RandomContrast.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_output_value_clip(self): + input_images = np.random.random((5, 8, 3)).astype(np.float32) * 255.0 + # Give a factor range [1.0, 11.0] so that + # it will produce large contrast. 
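# Reference for the contrast math these tests encode (editorial sketch,
# not part of the test file; `adjust_contrast_reference` is an
# illustrative name): the layer pulls each pixel toward, or pushes it
# away from, the per-image spatial mean -- the same
# `(inp - inp_mean) * factor + inp_mean` computation used in `_run_test`
# above.
def adjust_contrast_reference(image, factor):
    # Mean over height and width, per channel, for an unbatched HWC image.
    mean = image.mean(axis=(0, 1), keepdims=True)
    return (image - mean) * factor + mean
# With factors up to 11.0 the raw result can leave [0, 255], which is
# why the layer constructed next is expected to clip its output.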
+ layer = image_preprocessing.RandomContrast((0.0, 10.0)) + output = layer(input_images) + self.assertLessEqual(tf.reduce_max(output), 255.0) + self.assertGreaterEqual(tf.reduce_min(output), 0.0) + + def test_unbatched_image(self): + np.random.seed(1337) + mock_random = 0.2 + inp = np.random.random((4, 4, 1)) + inp_mean = np.mean(inp, axis=0, keepdims=True) + inp_mean = np.mean(inp_mean, axis=1, keepdims=True) + expected_output = (inp - inp_mean) * mock_random + inp_mean + with tf.compat.v1.test.mock.patch.object( + stateless_random_ops, + "stateless_random_uniform", + return_value=mock_random, + ): + with test_utils.use_gpu(): + layer = image_preprocessing.RandomContrast((0.2, 0.5)) + actual_output = layer(inp, training=True) + self.assertAllClose(expected_output, actual_output) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.RandomContrast((0.5, 0.6)) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.RandomContrast((0.5, 0.6), dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") @test_combinations.run_all_keras_modes(always_skip_v1=True) class RandomBrightnessTest(test_combinations.TestCase): + def test_factor_input_validation(self): + with self.assertRaisesRegex(ValueError, r"in the range \[-1.0, 1.0\]"): + image_preprocessing.RandomBrightness(2.0) + + with self.assertRaisesRegex(ValueError, "list of two numbers"): + image_preprocessing.RandomBrightness([1.0]) + + with self.assertRaisesRegex(ValueError, "should be a number"): + image_preprocessing.RandomBrightness("one") + + def test_factor_normalize(self): + layer = image_preprocessing.RandomBrightness(1.0) + self.assertEqual(layer._factor, [-1.0, 1.0]) + + layer = image_preprocessing.RandomBrightness((0.5, 0.3)) + self.assertEqual(layer._factor, [0.3, 0.5]) + + layer = image_preprocessing.RandomBrightness(-0.2) + self.assertEqual(layer._factor, [-0.2, 0.2]) + + @test_utils.run_v2_only + def test_output_value_range(self): + # Always scale up to 255 + layer = image_preprocessing.RandomBrightness([1.0, 1.0]) + inputs = np.random.randint(0, 255, size=(224, 224, 3)) + output = layer(inputs) + output_min = tf.math.reduce_min(output) + output_max = tf.math.reduce_max(output) + self.assertEqual(output_min, 255) + self.assertEqual(output_max, 255) + + # Always scale down to 0 + layer = image_preprocessing.RandomBrightness([-1.0, -1.0]) + inputs = np.random.randint(0, 255, size=(224, 224, 3)) + output = layer(inputs) + output_min = tf.math.reduce_min(output) + output_max = tf.math.reduce_max(output) + self.assertEqual(output_min, 0) + self.assertEqual(output_max, 0) + + def test_output(self): + # Always scale up, but randomly between 0 ~ 255 + layer = image_preprocessing.RandomBrightness([0, 1.0]) + inputs = np.random.randint(0, 255, size=(224, 224, 3)) + output = layer(inputs) + diff = output - inputs + self.assertGreaterEqual(tf.math.reduce_min(diff), 0) + self.assertGreater(tf.math.reduce_mean(diff), 0) + + # Always scale down, but randomly between 0 ~ 255 + layer = image_preprocessing.RandomBrightness([-1.0, 0.0]) + inputs = np.random.randint(0, 255, size=(224, 224, 3)) + output = layer(inputs) + diff = output - inputs + self.assertLessEqual(tf.math.reduce_max(diff), 0) + self.assertLess(tf.math.reduce_mean(diff), 0) + + @test_utils.run_v2_only + def test_scale_output(self): + layer = image_preprocessing.RandomBrightness([0, 1.0], seed=1337) + inputs = np.random.randint(0, 255, size=(224, 
224, 3)) + output = layer(inputs) + + # Create a new layer with same seed but different value range + layer2 = image_preprocessing.RandomBrightness( + [0, 1.0], value_range=[0, 1], seed=1337 + ) + inputs2 = inputs / 255.0 + output2 = layer2(inputs2) + # Make sure the outputs are the same, but just scaled by 255 + self.assertAllClose(output, output2 * 255.0) + + def test_different_adjustment_within_batch(self): + layer = image_preprocessing.RandomBrightness([0.2, 0.3]) + inputs = np.zeros(shape=(2, 10, 10, 3)) # 2 images with all zeros + output = layer(inputs) + diff = output - inputs + # Make sure the two images get different adjustments + self.assertNotAllClose(diff[0], diff[1]) + # Make sure all pixels within one image get the same adjustment + image1 = output[0] + # The mean pixel value, reduced over width and height, equals + # any individual pixel in the image. + self.assertAllClose( + tf.reduce_mean(image1), image1[0, 0, 0], rtol=1e-5, atol=1e-5 + ) + + def test_inference(self): + layer = image_preprocessing.RandomBrightness([0, 1.0]) + inputs = np.random.randint(0, 255, size=(224, 224, 3)) + output = layer(inputs, training=False) + self.assertAllClose(inputs, output) + + @test_utils.run_v2_only + def test_dtype(self): + layer = image_preprocessing.RandomBrightness([0, 1.0]) + inputs = np.random.randint(0, 255, size=(224, 224, 3)) + output = layer(inputs) + self.assertEqual(output.dtype, tf.float32) + + layer = image_preprocessing.RandomBrightness([0, 1.0], dtype="uint8") + output = layer(inputs) + self.assertEqual(output.dtype, tf.uint8) + + def test_seed(self): + layer = image_preprocessing.RandomBrightness([0, 1.0], seed=1337) + inputs = np.random.randint(0, 255, size=(224, 224, 3)) + output_1 = layer(inputs) + + layer2 = image_preprocessing.RandomBrightness([0, 1.0], seed=1337) + output_2 = layer2(inputs) + + self.assertAllClose(output_1, output_2) + + def test_config(self): + layer = image_preprocessing.RandomBrightness( + [0, 1.0], value_range=[0.0, 1.0], seed=1337 + ) + config = layer.get_config() + self.assertEqual(config["factor"], [0.0, 1.0]) + self.assertEqual(config["value_range"], [0.0, 1.0]) + self.assertEqual(config["seed"], 1337) + + reconstructed_layer = image_preprocessing.RandomBrightness.from_config( + config + ) + self.assertEqual(reconstructed_layer._factor, layer._factor) + self.assertEqual(reconstructed_layer._value_range, layer._value_range) + self.assertEqual(reconstructed_layer._seed, layer._seed) + - def test_factor_input_validation(self): - with self.assertRaisesRegex(ValueError, r'in the range \[-1.0, 1.0\]'): - image_preprocessing.RandomBrightness(2.0) - - with self.assertRaisesRegex(ValueError, 'list of two numbers'): - image_preprocessing.RandomBrightness([1.0]) - - with self.assertRaisesRegex(ValueError, 'should be a number'): - image_preprocessing.RandomBrightness('one') - - def test_factor_normalize(self): - layer = image_preprocessing.RandomBrightness(1.0) - self.assertEqual(layer._factor, [-1.0, 1.0]) - - layer = image_preprocessing.RandomBrightness((0.5, 0.3)) - self.assertEqual(layer._factor, [0.3, 0.5]) - - layer = image_preprocessing.RandomBrightness(-0.2) - self.assertEqual(layer._factor, [-0.2, 0.2]) - - @test_utils.run_v2_only - def test_output_value_range(self): - # Always scale up to 255 - layer = image_preprocessing.RandomBrightness([1.0, 1.0]) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer(inputs) - output_min = tf.math.reduce_min(output) - output_max = tf.math.reduce_max(output) -
self.assertEqual(output_min, 255) - self.assertEqual(output_max, 255) - - # Always scale down to 0 - layer = image_preprocessing.RandomBrightness([-1.0, -1.0]) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer(inputs) - output_min = tf.math.reduce_min(output) - output_max = tf.math.reduce_max(output) - self.assertEqual(output_min, 0) - self.assertEqual(output_max, 0) - - def test_output(self): - # Always scale up, but randomly between 0 ~ 255 - layer = image_preprocessing.RandomBrightness([0, 1.0]) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer(inputs) - diff = output - inputs - self.assertGreaterEqual(tf.math.reduce_min(diff), 0) - self.assertGreater(tf.math.reduce_mean(diff), 0) - - # Always scale down, but randomly between 0 ~ 255 - layer = image_preprocessing.RandomBrightness([-1.0, 0.0]) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer(inputs) - diff = output - inputs - self.assertLessEqual(tf.math.reduce_max(diff), 0) - self.assertLess(tf.math.reduce_mean(diff), 0) - - def test_augment_image(self): - # Always scale up, but randomly between 0 ~ 255 - layer = image_preprocessing.RandomBrightness([0, 1.0]) - image = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer.augment_image( - image, transformation=layer.get_random_transformation()) - diff = output - image - self.assertGreaterEqual(tf.math.reduce_min(diff), 0) - self.assertGreater(tf.math.reduce_mean(diff), 0) - - # Always scale down, but randomly between 0 ~ 255 - layer = image_preprocessing.RandomBrightness([-1.0, 0.0]) - image = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer.augment_image( - image, transformation=layer.get_random_transformation()) - diff = output - image - self.assertLessEqual(tf.math.reduce_max(diff), 0) - self.assertLess(tf.math.reduce_mean(diff), 0) - - @test_utils.run_v2_only - def test_scale_output(self): - layer = image_preprocessing.RandomBrightness([0, 1.0], seed=1337) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer(inputs) - - # Create a new layer with same seed but different value range - layer2 = image_preprocessing.RandomBrightness( - [0, 1.0], value_range=[0, 1], seed=1337) - inputs2 = inputs / 255.0 - output2 = layer2(inputs2) - # Make sure the outputs are the same, but just scaled with 255 - self.assertAllClose(output, output2 * 255.0) - - def test_different_adjustment_within_batch(self): - layer = image_preprocessing.RandomBrightness([0.2, 0.3]) - inputs = np.zeros(shape=(2, 10, 10, 3)) # 2 images with all zeros - output = layer(inputs) - diff = output - inputs - # Make sure two images gets the different adjustment - self.assertNotAllClose(diff[0], diff[1]) - # Make sure all the pixel are the same with the same image - image1 = output[0] - # The reduced mean pixel value among width and height are the same as - # any of the pixel in the image. 
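# A plausible reading of the behavior pinned down here and in the
# reformatted tests above (hedged sketch inferred from the saturation,
# scaling, and per-image assertions -- not the layer's actual code):
# brightness adds one random offset per image, proportional to the
# value range, then clips.
def adjust_brightness_reference(image, factor, value_range=(0, 255)):
    lo, hi = value_range
    # A single offset for the whole image, so every pixel of a given
    # image receives the identical adjustment asserted below.
    return np.clip(image + factor * (hi - lo), lo, hi)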
- self.assertAllClose( - tf.reduce_mean(image1), image1[0, 0, 0], rtol=1e-5, atol=1e-5) - - def test_inference(self): - layer = image_preprocessing.RandomBrightness([0, 1.0]) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer(inputs, training=False) - self.assertAllClose(inputs, output) - - @test_utils.run_v2_only - def test_dtype(self): - layer = image_preprocessing.RandomBrightness([0, 1.0]) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output = layer(inputs) - self.assertEqual(output.dtype, tf.float32) - - layer = image_preprocessing.RandomBrightness([0, 1.0], dtype='uint8') - output = layer(inputs) - self.assertEqual(output.dtype, tf.uint8) - - def test_seed(self): - layer = image_preprocessing.RandomBrightness([0, 1.0], seed=1337) - inputs = np.random.randint(0, 255, size=(224, 224, 3)) - output_1 = layer(inputs) - - layer2 = image_preprocessing.RandomBrightness([0, 1.0], seed=1337) - output_2 = layer2(inputs) - - self.assertAllClose(output_1, output_2) - - def test_config(self): - layer = image_preprocessing.RandomBrightness( - [0, 1.0], value_range=[0.0, 1.0], seed=1337) - config = layer.get_config() - self.assertEqual(config['factor'], [0.0, 1.0]) - self.assertEqual(config['value_range'], [0.0, 1.0]) - self.assertEqual(config['seed'], 1337) - - reconstructed_layer = image_preprocessing.RandomBrightness.from_config( - config) - self.assertEqual(reconstructed_layer._factor, layer._factor) - self.assertEqual(reconstructed_layer._value_range, layer._value_range) - self.assertEqual(reconstructed_layer._seed, layer._seed) - - -@test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) class RandomTranslationTest(test_combinations.TestCase): - - def _run_test(self, height_factor, width_factor): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - kwargs = {'height_factor': height_factor, 'width_factor': width_factor} - with test_utils.use_gpu(): - test_utils.layer_test( - image_preprocessing.RandomTranslation, - kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - expected_output_shape=(None, orig_height, orig_width, channels)) - - @parameterized.named_parameters( - ('random_translate_4_by_6', .4, .6), ('random_translate_3_by_2', .3, .2), - ('random_translate_tuple_factor', (-.5, .4), (.2, .3))) - def test_random_translation(self, height_factor, width_factor): - self._run_test(height_factor, width_factor) - - def test_random_translation_up_numeric_reflect(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) - # Shifting by -.2 * 5 = 1 pixel. - layer = image_preprocessing.RandomTranslation( - height_factor=(-.2, -.2), width_factor=0.) - output_image = layer(input_image) - expected_output = np.asarray([ - [5, 6, 7, 8, 9], - [10, 11, 12, 13, 14], - [15, 16, 17, 18, 19], - [20, 21, 22, 23, 24], - [20, 21, 22, 23, 24], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_translation_up_numeric_constant(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) - # Shifting by -.2 * 5 = 1 pixel. 
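# The translation factors are fractions of the image dimension: with
# height 5 and height_factor pinned to (-0.2, -0.2), the content always
# moves up by 0.2 * 5 = 1 pixel. Under the default reflect fill the
# vacated bottom row repeats the last input row, so the expected grid
# can be derived directly (illustrative helper, not from the file):
def expected_shift_up_reflect(img):
    # Drop the first row, then repeat the last row to fill the gap.
    return np.concatenate([img[1:], img[-1:]], axis=0)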
- layer = image_preprocessing.RandomTranslation( - height_factor=(-.2, -.2), width_factor=0., fill_mode='constant') - output_image = layer(input_image) - expected_output = np.asarray([ - [5, 6, 7, 8, 9], - [10, 11, 12, 13, 14], - [15, 16, 17, 18, 19], - [20, 21, 22, 23, 24], - [0, 0, 0, 0, 0], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_translation_down_numeric_reflect(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) - # Shifting by .2 * 5 = 1 pixel. - layer = image_preprocessing.RandomTranslation( - height_factor=(.2, .2), width_factor=0.) - output_image = layer(input_image) - expected_output = np.asarray([ - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [5, 6, 7, 8, 9], - [10, 11, 12, 13, 14], - [15, 16, 17, 18, 19], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_translation_asymmetric_size_numeric_reflect(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 16), (1, 8, 2, 1)).astype(dtype) - # Shifting by .5 * 8 = 1 pixel. + def _run_test(self, height_factor, width_factor): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + kwargs = {"height_factor": height_factor, "width_factor": width_factor} + with test_utils.use_gpu(): + test_utils.layer_test( + image_preprocessing.RandomTranslation, + kwargs=kwargs, + input_shape=(num_samples, orig_height, orig_width, channels), + expected_output_shape=(None, orig_height, orig_width, channels), + ) + + @parameterized.named_parameters( + ("random_translate_4_by_6", 0.4, 0.6), + ("random_translate_3_by_2", 0.3, 0.2), + ("random_translate_tuple_factor", (-0.5, 0.4), (0.2, 0.3)), + ) + def test_random_translation(self, height_factor, width_factor): + self._run_test(height_factor, width_factor) + + def test_random_translation_up_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype( + dtype + ) + # Shifting by -.2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=(-0.2, -0.2), width_factor=0.0 + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + [20, 21, 22, 23, 24], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_up_numeric_constant(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype( + dtype + ) + # Shifting by -.2 * 5 = 1 pixel. 
+ layer = image_preprocessing.RandomTranslation( + height_factor=(-0.2, -0.2), + width_factor=0.0, + fill_mode="constant", + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + [0, 0, 0, 0, 0], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_down_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype( + dtype + ) + # Shifting by .2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=(0.2, 0.2), width_factor=0.0 + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_asymmetric_size_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 16), (1, 8, 2, 1)).astype( + dtype + ) + # Shifting by .5 * 8 = 4 pixels. + layer = image_preprocessing.RandomTranslation( + height_factor=(0.5, 0.5), width_factor=0.0 + ) + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray( + [ + [6, 7], + [4, 5], + [2, 3], + [0, 1], + [0, 1], + [2, 3], + [4, 5], + [6, 7], + ] + ).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 8, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_down_numeric_constant(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype( + dtype + ) + # Shifting by .2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=(0.2, 0.2), + width_factor=0.0, + fill_mode="constant", + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [0, 0, 0, 0, 0], + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_left_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype( + dtype + ) + # Shifting by .2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=0.0, width_factor=(-0.2, -0.2) + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [1, 2, 3, 4, 4], + [6, 7, 8, 9, 9], + [11, 12, 13, 14, 14], + [16, 17, 18, 19, 19], + [21, 22, 23, 24, 24], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_left_numeric_constant(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype( + dtype + ) + # Shifting by -.2 * 5 = 1 pixel.
+ layer = image_preprocessing.RandomTranslation( + height_factor=0.0, + width_factor=(-0.2, -0.2), + fill_mode="constant", + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [1, 2, 3, 4, 0], + [6, 7, 8, 9, 0], + [11, 12, 13, 14, 0], + [16, 17, 18, 19, 0], + [21, 22, 23, 24, 0], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_inference(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images + with test_utils.use_gpu(): + layer = image_preprocessing.RandomTranslation(0.5, 0.5) + actual_output = layer(input_images, training=False) + self.assertAllClose(expected_output, actual_output) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): layer = image_preprocessing.RandomTranslation( - height_factor=(.5, .5), width_factor=0.) - output_image = layer(input_image) - # pyformat: disable - expected_output = np.asarray([ - [6, 7], - [4, 5], - [2, 3], - [0, 1], - [0, 1], - [2, 3], - [4, 5], - [6, 7], - ]).astype(dtype) - # pyformat: enable - expected_output = np.reshape(expected_output, (1, 8, 2, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_translation_down_numeric_constant(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) - # Shifting by -.2 * 5 = 1 pixel. - layer = image_preprocessing.RandomTranslation( - height_factor=(.2, .2), width_factor=0., fill_mode='constant') - output_image = layer(input_image) - expected_output = np.asarray([ - [0, 0, 0, 0, 0], - [0, 1, 2, 3, 4], - [5, 6, 7, 8, 9], - [10, 11, 12, 13, 14], - [15, 16, 17, 18, 19], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_translation_left_numeric_reflect(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) - # Shifting by .2 * 5 = 1 pixel. - layer = image_preprocessing.RandomTranslation( - height_factor=0., width_factor=(-.2, -.2)) - output_image = layer(input_image) - expected_output = np.asarray([ - [1, 2, 3, 4, 4], - [6, 7, 8, 9, 9], - [11, 12, 13, 14, 14], - [16, 17, 18, 19, 19], - [21, 22, 23, 24, 24], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_translation_left_numeric_constant(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) - # Shifting by -.2 * 5 = 1 pixel. 
- layer = image_preprocessing.RandomTranslation( - height_factor=0., width_factor=(-.2, -.2), fill_mode='constant') - output_image = layer(input_image) - expected_output = np.asarray([ - [1, 2, 3, 4, 0], - [6, 7, 8, 9, 0], - [11, 12, 13, 14, 0], - [16, 17, 18, 19, 0], - [21, 22, 23, 24, 0], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_translation_inference(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images - with test_utils.use_gpu(): - layer = image_preprocessing.RandomTranslation(.5, .5) - actual_output = layer(input_images, training=False) - self.assertAllClose(expected_output, actual_output) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomTranslation(.5, .6, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomTranslation.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_unbatched_image(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(np.int64) - # Shifting by -.2 * 5 = 1 pixel. - layer = image_preprocessing.RandomTranslation( - height_factor=(-.2, -.2), width_factor=0.) - output_image = layer(input_image) - expected_output = np.asarray([ - [5, 6, 7, 8, 9], - [10, 11, 12, 13, 14], - [15, 16, 17, 18, 19], - [20, 21, 22, 23, 24], - [20, 21, 22, 23, 24], - ]).astype(np.int64) - expected_output = np.reshape(expected_output, (5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomTranslation(.5, .6) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomTranslation(.5, .6, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') + 0.5, 0.6, name="image_preproc" + ) + config = layer.get_config() + layer_1 = image_preprocessing.RandomTranslation.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_unbatched_image(self): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype( + np.int64 + ) + # Shifting by -.2 * 5 = 1 pixel. 
+ layer = image_preprocessing.RandomTranslation( + height_factor=(-0.2, -0.2), width_factor=0.0 + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + [20, 21, 22, 23, 24], + ] + ).astype(np.int64) + expected_output = np.reshape(expected_output, (5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.RandomTranslation(0.5, 0.6) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.RandomTranslation(0.5, 0.6, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") @test_combinations.run_all_keras_modes(always_skip_v1=True) class RandomTransformTest(test_combinations.TestCase): - - def _run_random_transform_with_mock(self, - transform_matrix, - expected_output, - mode, - fill_value=0.0, - interpolation='bilinear'): - inp = np.arange(15).reshape((1, 5, 3, 1)).astype(np.float32) - with self.cached_session(): - output = image_preprocessing.transform( - inp, - transform_matrix, - fill_mode=mode, - fill_value=fill_value, - interpolation=interpolation) - self.assertAllClose(expected_output, output) - - def test_random_translation_reflect(self): - # reflected output is (dcba|abcd|dcba) - - # Test down shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 1., 2.], - [0., 1., 2.], - [3., 4., 5.], - [6., 7., 8], - [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'reflect') - - # Test up shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[3., 4., 5.], - [6., 7., 8], - [9., 10., 11.], - [12., 13., 14.], - [12., 13., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'reflect') - - # Test left shift by 1. - # reflected output is (dcba|abcd|dcba) - # pyformat: disable - expected_output = np.asarray( - [[1., 2., 2.], - [4., 5., 5.], - [7., 8., 8.], - [10., 11., 11.], - [13., 14., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'reflect') - - # Test right shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 0., 1.], - [3., 3., 4], - [6., 6., 7.], - [9., 9., 10.], - [12., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'reflect') - - def test_random_translation_wrap(self): - # warpped output is (abcd|abcd|abcd) - - # Test down shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[12., 13., 14.], - [0., 1., 2.], - [3., 4., 5.], - [6., 7., 8], - [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'wrap') - - # Test up shift by 1. 
- # pyformat: disable - expected_output = np.asarray( - [[3., 4., 5.], - [6., 7., 8], - [9., 10., 11.], - [12., 13., 14.], - [0., 1., 2.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'wrap') - - # Test left shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[1., 2., 0.], - [4., 5., 3.], - [7., 8., 6.], - [10., 11., 9.], - [13., 14., 12.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'wrap') - - # Test right shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[2., 0., 1.], - [5., 3., 4], - [8., 6., 7.], - [11., 9., 10.], - [14., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'wrap') - - def test_random_translation_nearest(self): - # nearest output is (aaaa|abcd|dddd) - - # Test down shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 1., 2.], - [0., 1., 2.], - [3., 4., 5.], - [6., 7., 8], - [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'nearest') - - # Test up shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[3., 4., 5.], - [6., 7., 8], - [9., 10., 11.], - [12., 13., 14.], - [12., 13., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'nearest') - - # Test left shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[1., 2., 2.], - [4., 5., 5.], - [7., 8., 8.], - [10., 11., 11.], - [13., 14., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'nearest') - - # Test right shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 0., 1.], - [3., 3., 4], - [6., 6., 7.], - [9., 9., 10.], - [12., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'nearest') - - def test_random_translation_constant_0(self): - # constant output is (0000|abcd|0000) - - # Test down shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 0., 0.], - [0., 1., 2.], - [3., 4., 5.], - [6., 7., 8], - [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'constant') - - # Test up shift by 1. 
- # pyformat: disable - expected_output = np.asarray( - [[3., 4., 5.], - [6., 7., 8], - [9., 10., 11.], - [12., 13., 14.], - [0., 0., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'constant') - - # Test left shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[1., 2., 0.], - [4., 5., 0.], - [7., 8., 0.], - [10., 11., 0.], - [13., 14., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'constant') - - # Test right shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 0., 1.], - [0., 3., 4], - [0., 6., 7.], - [0., 9., 10.], - [0., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock(transform_matrix, expected_output, - 'constant') - - def test_random_translation_constant_1(self): - with tf.compat.forward_compatibility_horizon(2020, 8, 6): - # constant output is (1111|abcd|1111) - - # Test down shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[1., 1., 1.], - [0., 1., 2.], - [3., 4., 5.], - [6., 7., 8], - [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]]) - self._run_random_transform_with_mock( - transform_matrix, expected_output, 'constant', fill_value=1.0) - - # Test up shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[3., 4., 5.], - [6., 7., 8], - [9., 10., 11.], - [12., 13., 14.], - [1., 1., 1.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]]) - self._run_random_transform_with_mock( - transform_matrix, expected_output, 'constant', fill_value=1.0) - - # Test left shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[1., 2., 1.], - [4., 5., 1.], - [7., 8., 1.], - [10., 11., 1.], - [13., 14., 1.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock( - transform_matrix, expected_output, 'constant', fill_value=1.0) - - # Test right shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[1., 0., 1.], - [1., 3., 4], - [1., 6., 7.], - [1., 9., 10.], - [1., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock( - transform_matrix, expected_output, 'constant', fill_value=1.0) - - def test_random_translation_nearest_interpolation(self): - # nearest output is (aaaa|abcd|dddd) - - # Test down shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 0., 0.], - [0., 1., 2.], - [3., 4., 5.], - [6., 7., 8], - [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]]) - self._run_random_transform_with_mock( - transform_matrix, - expected_output, - mode='constant', - interpolation='nearest') - - # Test up shift by 1. 
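# Fill-mode cheat sheet, collected from the mnemonics in this class,
# for a row "abcd" extended past its edges (k = fill_value):
#   reflect  -> dcba | abcd | dcba
#   wrap     -> abcd | abcd | abcd
#   nearest  -> aaaa | abcd | dddd
#   constant -> kkkk | abcd | kkkk
# The constant_1 cases use fill_value=1.0, so a down shift by one row
# should produce a leading row of ones (illustrative helper):
def expected_down_shift_constant(img, fill_value=1.0):
    pad = np.full((1,) + img.shape[1:], fill_value, dtype=img.dtype)
    return np.concatenate([pad, img[:-1]], axis=0)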
- # pyformat: disable - expected_output = np.asarray( - [[3., 4., 5.], - [6., 7., 8], - [9., 10., 11.], - [12., 13., 14.], - [0., 0., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]]) - self._run_random_transform_with_mock( + def _run_random_transform_with_mock( + self, transform_matrix, expected_output, - mode='constant', - interpolation='nearest') - - # Test left shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[1., 2., 0.], - [4., 5., 0.], - [7., 8., 0.], - [10., 11., 0.], - [13., 14., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock( - transform_matrix, - expected_output, - mode='constant', - interpolation='nearest') - - # Test right shift by 1. - # pyformat: disable - expected_output = np.asarray( - [[0., 0., 1.], - [0., 3., 4], - [0., 6., 7.], - [0., 9., 10.], - [0., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32) - # pyformat: enable - transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]]) - self._run_random_transform_with_mock( - transform_matrix, - expected_output, - mode='constant', - interpolation='nearest') - - -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class RandomRotationTest(test_combinations.TestCase): - - def _run_test(self, factor): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - kwargs = {'factor': factor} - with test_utils.use_gpu(): - test_utils.layer_test( - image_preprocessing.RandomRotation, - kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - expected_output_shape=(None, orig_height, orig_width, channels)) - - @parameterized.named_parameters(('random_rotate_4', .4), - ('random_rotate_3', .3), - ('random_rotate_tuple_factor', (-.5, .4))) - def test_random_rotation(self, factor): - self._run_test(factor) - - def test_random_rotation_inference(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images - with test_utils.use_gpu(): - layer = image_preprocessing.RandomRotation(.5) - actual_output = layer(input_images, training=False) - self.assertAllClose(expected_output, actual_output) - - def test_distribution_strategy(self): - """Tests that RandomRotation can be created within distribution strategies.""" - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - with test_utils.use_gpu(): - strat = tf.distribute.MirroredStrategy(devices=['cpu', 'gpu']) - with strat.scope(): - layer = image_preprocessing.RandomRotation(.5) - output = strat.run(lambda: layer(input_images, training=True)) - values = output.values - self.assertAllEqual(2, len(values)) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomRotation(.5, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomRotation.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_unbatched_image(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(np.float32) - # 180 rotation. 
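# RandomRotation's factor is a fraction of 2*pi, so pinning it to
# (0.5, 0.5) forces exactly a 180-degree rotation. For that special
# case the expected grid is just the input flipped over both axes
# (illustrative helper, equivalent to np.rot90(img, k=2) for an HWC
# image):
def rotate_180(img):
    return np.flip(np.flip(img, axis=0), axis=1)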
- layer = image_preprocessing.RandomRotation(factor=(0.5, 0.5)) - output_image = layer(input_image) - expected_output = np.asarray([ - [24, 23, 22, 21, 20], - [19, 18, 17, 16, 15], - [14, 13, 12, 11, 10], - [9, 8, 7, 6, 5], - [4, 3, 2, 1, 0], - ]).astype(np.float32) - expected_output = np.reshape(expected_output, (5, 5, 1)) - self.assertAllClose(expected_output, output_image) - - def test_augment_image(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(np.float32) - # 180 rotation. - layer = image_preprocessing.RandomRotation(factor=(0.5, 0.5)) - output_image = layer.augment_image( - input_image, transformation=layer.get_random_transformation()) - expected_output = np.asarray([ - [24, 23, 22, 21, 20], - [19, 18, 17, 16, 15], - [14, 13, 12, 11, 10], - [9, 8, 7, 6, 5], - [4, 3, 2, 1, 0], - ]).astype(np.float32) - expected_output = np.reshape(expected_output, (5, 5, 1)) - self.assertAllClose(expected_output, output_image) - - def test_augment_bbox(self): - with test_utils.use_gpu(): - input_image = np.random.random((512, 512, 3)).astype(np.float32) - bboxes = tf.convert_to_tensor([[200,200,400,400],[100,100,300,300]]) - # 180 rotation. - layer = image_preprocessing.RandomRotation(factor=(0.5, 0.5)) - output_bbox = layer.augment_bounding_boxes( - input_image, bboxes, transformation=layer.get_random_transformation()) - expected_output = np.asarray([ - [111, 112, 312, 312], - [212, 211, 412, 412] - ]).astype(np.int32) - expected_output = np.reshape(expected_output, ( 2, 4)) - self.assertAllClose(expected_output, output_bbox) - - def test_augment_bbox_dict_input(self): - with test_utils.use_gpu(): - input_image = np.random.random((512, 512, 3)).astype(np.float32) - bboxes = tf.convert_to_tensor([[200,200,400,400],[100,100,300,300]]) - input = {'images':input_image, 'bounding_boxes':bboxes} - # 180 rotation. - layer = image_preprocessing.RandomRotation(factor=(0.0833, 0.0833)) - output_bbox = layer(input) - expected_output = np.asarray([ - [179, 135, 452, 408], - [42, 98, 316, 372] - ]).astype(np.int32) - expected_output = np.reshape(expected_output, ( 2, 4)) - self.assertAllClose(expected_output, output_bbox['bounding_boxes']) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomRotation(.5) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomRotation(.5, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') - - -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class RandomZoomTest(test_combinations.TestCase): + mode, + fill_value=0.0, + interpolation="bilinear", + ): + inp = np.arange(15).reshape((1, 5, 3, 1)).astype(np.float32) + with self.cached_session(): + output = image_preprocessing.transform( + inp, + transform_matrix, + fill_mode=mode, + fill_value=fill_value, + interpolation=interpolation, + ) + self.assertAllClose(expected_output, output) + + def test_random_translation_reflect(self): + # reflected output is (dcba|abcd|dcba) + + # Test down shift by 1. 
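# How to read the 8-element transforms used below: a vector
# [a0, a1, a2, b0, b1, b2, c0, c1] maps each *output* pixel (x, y) back
# to the input sample point
#   x_in = (a0*x + a1*y + a2) / k,  y_in = (b0*x + b1*y + b2) / k,
#   with k = c0*x + c1*y + 1,
# so [1, 0, 0, 0, 1, -1, 0, 0] samples one row above each output pixel,
# i.e. the content shifts down by one. A translation builder under that
# reading (illustrative, not part of the file):
def translation_transform(dx, dy):
    # Output-to-input mapping, hence the negated offsets.
    return np.asarray([[1.0, 0.0, -dx, 0.0, 1.0, -dy, 0.0, 0.0]])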
+ # pyformat: disable + expected_output = ( + np.asarray( + [ + [0.0, 1.0, 2.0], + [0.0, 1.0, 2.0], + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "reflect" + ) + + # Test up shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11.0], + [12.0, 13.0, 14.0], + [12.0, 13.0, 14.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "reflect" + ) + + # Test left shift by 1. + # reflected output is (dcba|abcd|dcba) + # pyformat: disable + expected_output = ( + np.asarray( + [ + [1.0, 2.0, 2.0], + [4.0, 5.0, 5.0], + [7.0, 8.0, 8.0], + [10.0, 11.0, 11.0], + [13.0, 14.0, 14.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "reflect" + ) + + # Test right shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [0.0, 0.0, 1.0], + [3.0, 3.0, 4], + [6.0, 6.0, 7.0], + [9.0, 9.0, 10.0], + [12.0, 12.0, 13.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "reflect" + ) - def _run_test(self, height_factor, width_factor): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - kwargs = {'height_factor': height_factor, 'width_factor': width_factor} - with test_utils.use_gpu(): - test_utils.layer_test( - image_preprocessing.RandomZoom, - kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - expected_output_shape=(None, orig_height, orig_width, channels)) - - @parameterized.named_parameters( - ('random_zoom_4_by_6', -.4, -.6), ('random_zoom_2_by_3', -.2, -.3), - ('random_zoom_tuple_factor', (-.4, -.5), (-.2, -.3))) - def test_random_zoom_in(self, height_factor, width_factor): - self._run_test(height_factor, width_factor) - - @parameterized.named_parameters( - ('random_zoom_4_by_6', .4, .6), ('random_zoom_2_by_3', .2, .3), - ('random_zoom_tuple_factor', (.4, .5), (.2, .3))) - def test_random_zoom_out(self, height_factor, width_factor): - self._run_test(height_factor, width_factor) - - def test_random_zoom_in_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) - layer = image_preprocessing.RandomZoom((-.5, -.5), (-.5, -.5), - interpolation='nearest') - output_image = layer(np.expand_dims(input_image, axis=0)) - expected_output = np.asarray([ - [6, 7, 7, 8, 8], - [11, 12, 12, 13, 13], - [11, 12, 12, 13, 13], - [16, 17, 17, 18, 18], - [16, 17, 17, 18, 18], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_zoom_out_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 
1)).astype(dtype) - layer = image_preprocessing.RandomZoom((.5, .5), (.8, .8), - fill_mode='constant', - interpolation='nearest') - output_image = layer(np.expand_dims(input_image, axis=0)) - expected_output = np.asarray([ - [0, 0, 0, 0, 0], - [0, 5, 7, 9, 0], - [0, 10, 12, 14, 0], - [0, 20, 22, 24, 0], - [0, 0, 0, 0, 0], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_zoom_out_numeric_preserve_aspect_ratio(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) - layer = image_preprocessing.RandomZoom((.5, .5), - fill_mode='constant', - interpolation='nearest') - output_image = layer(np.expand_dims(input_image, axis=0)) - expected_output = np.asarray([ - [0, 0, 0, 0, 0], - [0, 6, 7, 9, 0], - [0, 11, 12, 14, 0], - [0, 21, 22, 24, 0], - [0, 0, 0, 0, 0], - ]).astype(dtype) - expected_output = np.reshape(expected_output, (1, 5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_zoom_inference(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images - with test_utils.use_gpu(): - layer = image_preprocessing.RandomZoom(.5, .5) - actual_output = layer(input_images, training=False) - self.assertAllClose(expected_output, actual_output) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomZoom(.5, .6, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomZoom.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_unbatched_image(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(np.int64) - layer = image_preprocessing.RandomZoom((-.5, -.5), (-.5, -.5), - interpolation='nearest') - output_image = layer(input_image) - expected_output = np.asarray([ - [6, 7, 7, 8, 8], - [11, 12, 12, 13, 13], - [11, 12, 12, 13, 13], - [16, 17, 17, 18, 18], - [16, 17, 17, 18, 18], - ]).astype(np.int64) - expected_output = np.reshape(expected_output, (5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_augment_image(self): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(np.int64) - layer = image_preprocessing.RandomZoom((-.5, -.5), (-.5, -.5), - interpolation='nearest') - output_image = layer.augment_image( - input_image, transformation=layer.get_random_transformation()) - expected_output = np.asarray([ - [6, 7, 7, 8, 8], - [11, 12, 12, 13, 13], - [11, 12, 12, 13, 13], - [16, 17, 17, 18, 18], - [16, 17, 17, 18, 18], - ]).astype(np.int64) - expected_output = np.reshape(expected_output, (5, 5, 1)) - self.assertAllEqual(expected_output, output_image) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomZoom(.5, .5) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomZoom(.5, .5, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') + def test_random_translation_wrap(self): + # wrapped output is (abcd|abcd|abcd) + # Test down shift by 1.
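# With "wrap" fill every shift is circular, so each expected grid in
# this test is a plain np.roll of the input (illustrative check for the
# (1, 5, 3, 1) arange images used here):
def expected_wrap_shift(img, rows=0, cols=0):
    # Positive rows/cols move content down/right with wrap-around.
    return np.roll(img, shift=(rows, cols), axis=(1, 2))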
+ # pyformat: disable + expected_output = ( + np.asarray( + [ + [12.0, 13.0, 14.0], + [0.0, 1.0, 2.0], + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "wrap" + ) + + # Test up shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11.0], + [12.0, 13.0, 14.0], + [0.0, 1.0, 2.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "wrap" + ) + + # Test left shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [1.0, 2.0, 0.0], + [4.0, 5.0, 3.0], + [7.0, 8.0, 6.0], + [10.0, 11.0, 9.0], + [13.0, 14.0, 12.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "wrap" + ) + + # Test right shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [2.0, 0.0, 1.0], + [5.0, 3.0, 4], + [8.0, 6.0, 7.0], + [11.0, 9.0, 10.0], + [14.0, 12.0, 13.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "wrap" + ) -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class RandomHeightTest(test_combinations.TestCase): + def test_random_translation_nearest(self): + # nearest output is (aaaa|abcd|dddd) - def _run_test(self, factor): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - with test_utils.use_gpu(): - img = np.random.random((num_samples, orig_height, orig_width, channels)) - layer = image_preprocessing.RandomHeight(factor) - img_out = layer(img, training=True) - self.assertEqual(img_out.shape[0], 2) - self.assertEqual(img_out.shape[2], 8) - self.assertEqual(img_out.shape[3], 3) - - @parameterized.named_parameters(('random_height_4_by_6', (.4, .6)), - ('random_height_3_by_2', (-.3, .2)), - ('random_height_3', .3)) - def test_random_height_basic(self, factor): - self._run_test(factor) - - def test_valid_random_height(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - img = np.random.random((12, 5, 8, 3)) - layer = image_preprocessing.RandomHeight(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer(img, training=True) - self.assertEqual(img_out.shape[1], 3) - - def test_random_height_longer_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype(dtype) - layer = image_preprocessing.RandomHeight(factor=(1., 1.)) - # Return type of RandomHeight() is float32 if `interpolation` is not - # set to `ResizeMethod.NEAREST_NEIGHBOR`; cast `layer` to desired dtype. - output_image = tf.cast( - layer(np.expand_dims(input_image, axis=0)), dtype=dtype) + # Test down shift by 1. 
# pyformat: disable - expected_output = np.asarray([ - [0, 1, 2], - [0.75, 1.75, 2.75], - [2.25, 3.25, 4.25], - [3, 4, 5] - ]).astype(dtype) + expected_output = ( + np.asarray( + [ + [0.0, 1.0, 2.0], + [0.0, 1.0, 2.0], + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) # pyformat: enable - expected_output = np.reshape(expected_output, (1, 4, 3, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_height_shorter_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 8), (4, 2, 1)).astype(dtype) - layer = image_preprocessing.RandomHeight( - factor=(-.5, -.5), interpolation='nearest') - output_image = layer(np.expand_dims(input_image, axis=0)) + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "nearest" + ) + + # Test up shift by 1. # pyformat: disable - expected_output = np.asarray([ - [2, 3], - [6, 7] - ]).astype(dtype) + expected_output = ( + np.asarray( + [ + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11.0], + [12.0, 13.0, 14.0], + [12.0, 13.0, 14.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) # pyformat: enable - expected_output = np.reshape(expected_output, (1, 2, 2, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_height_invalid_factor(self): - with self.assertRaises(ValueError): - image_preprocessing.RandomHeight((-1.5, .4)) - - def test_random_height_inference(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images - with test_utils.use_gpu(): - layer = image_preprocessing.RandomHeight(.5) - actual_output = layer(input_images, training=False) - self.assertAllClose(expected_output, actual_output) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomHeight(.5, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomHeight.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_unbatched_image(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - img = np.random.random((5, 8, 3)) - layer = image_preprocessing.RandomHeight(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer(img, training=True) - self.assertEqual(img_out.shape[0], 3) - - @test_utils.run_v2_only - def test_batched_input(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - images = np.random.random((5, 5, 8, 3)) - layer = image_preprocessing.RandomHeight(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer(images, training=True) - self.assertEqual(img_out.shape[1], 3) - - @test_utils.run_v2_only - def test_augment_image(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - img = np.random.random((5, 8, 3)) - layer = image_preprocessing.RandomHeight(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer.augment_image( - img, transformation=layer.get_random_transformation(image=img)) - self.assertEqual(img_out.shape[0], 3) - - @test_utils.run_v2_only - def 
test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomHeight(.2) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomHeight(.2, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') - + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "nearest" + ) + + # Test left shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [1.0, 2.0, 2.0], + [4.0, 5.0, 5.0], + [7.0, 8.0, 8.0], + [10.0, 11.0, 11.0], + [13.0, 14.0, 14.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "nearest" + ) + + # Test right shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [0.0, 0.0, 1.0], + [3.0, 3.0, 4], + [6.0, 6.0, 7.0], + [9.0, 9.0, 10.0], + [12.0, 12.0, 13.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "nearest" + ) -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class RandomWidthTest(test_combinations.TestCase): + def test_random_translation_constant_0(self): + # constant output is (0000|abcd|0000) - def _run_test(self, factor): - np.random.seed(1337) - num_samples = 2 - orig_height = 5 - orig_width = 8 - channels = 3 - with test_utils.use_gpu(): - img = np.random.random((num_samples, orig_height, orig_width, channels)) - layer = image_preprocessing.RandomWidth(factor) - img_out = layer(img, training=True) - self.assertEqual(img_out.shape[0], 2) - self.assertEqual(img_out.shape[1], 5) - self.assertEqual(img_out.shape[3], 3) - - @parameterized.named_parameters(('random_width_4_by_6', (.4, .6)), - ('random_width_3_by_2', (-.3, .2)), - ('random_width_3', .3)) - def test_random_width_basic(self, factor): - self._run_test(factor) - - def test_valid_random_width(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - img = np.random.random((12, 8, 5, 3)) - layer = image_preprocessing.RandomWidth(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer(img, training=True) - self.assertEqual(img_out.shape[2], 3) - - def test_random_width_longer_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 6), (3, 2, 1)).astype(dtype) - layer = image_preprocessing.RandomWidth(factor=(1., 1.)) - # Return type of RandomWidth() is float32 if `interpolation` is not - # set to `ResizeMethod.NEAREST_NEIGHBOR`; cast `layer` to desired dtype. - output_image = tf.cast( - layer(np.expand_dims(input_image, axis=0)), dtype=dtype) + # Test down shift by 1. 
+ # pyformat: disable + expected_output = ( + np.asarray( + [ + [0.0, 0.0, 0.0], + [0.0, 1.0, 2.0], + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant" + ) + + # Test up shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11.0], + [12.0, 13.0, 14.0], + [0.0, 0.0, 0.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant" + ) + + # Test left shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [1.0, 2.0, 0.0], + [4.0, 5.0, 0.0], + [7.0, 8.0, 0.0], + [10.0, 11.0, 0.0], + [13.0, 14.0, 0.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant" + ) + + # Test right shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [0.0, 0.0, 1.0], + [0.0, 3.0, 4], + [0.0, 6.0, 7.0], + [0.0, 9.0, 10.0], + [0.0, 12.0, 13.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant" + ) + + def test_random_translation_constant_1(self): + with tf.compat.forward_compatibility_horizon(2020, 8, 6): + # constant output is (1111|abcd|1111) + + # Test down shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [1.0, 1.0, 1.0], + [0.0, 1.0, 2.0], + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant", fill_value=1.0 + ) + + # Test up shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11.0], + [12.0, 13.0, 14.0], + [1.0, 1.0, 1.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant", fill_value=1.0 + ) + + # Test left shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [1.0, 2.0, 1.0], + [4.0, 5.0, 1.0], + [7.0, 8.0, 1.0], + [10.0, 11.0, 1.0], + [13.0, 14.0, 1.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant", fill_value=1.0 + ) + + # Test right shift by 1. 
+ # pyformat: disable + expected_output = ( + np.asarray( + [ + [1.0, 0.0, 1.0], + [1.0, 3.0, 4], + [1.0, 6.0, 7.0], + [1.0, 9.0, 10.0], + [1.0, 12.0, 13.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, expected_output, "constant", fill_value=1.0 + ) + + def test_random_translation_nearest_interpolation(self): + # nearest output is (aaaa|abcd|dddd) + + # Test down shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [0.0, 0.0, 0.0], + [0.0, 1.0, 2.0], + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, + expected_output, + mode="constant", + interpolation="nearest", + ) + + # Test up shift by 1. + # pyformat: disable + expected_output = ( + np.asarray( + [ + [3.0, 4.0, 5.0], + [6.0, 7.0, 8], + [9.0, 10.0, 11.0], + [12.0, 13.0, 14.0], + [0.0, 0.0, 0.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) + # pyformat: enable + transform_matrix = np.asarray( + [[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, + expected_output, + mode="constant", + interpolation="nearest", + ) + + # Test left shift by 1. # pyformat: disable - expected_output = np.asarray([ - [0, 0.25, 0.75, 1], - [2, 2.25, 2.75, 3], - [4, 4.25, 4.75, 5] - ]).astype(dtype) + expected_output = ( + np.asarray( + [ + [1.0, 2.0, 0.0], + [4.0, 5.0, 0.0], + [7.0, 8.0, 0.0], + [10.0, 11.0, 0.0], + [13.0, 14.0, 0.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) # pyformat: enable - expected_output = np.reshape(expected_output, (1, 3, 4, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_width_shorter_numeric(self): - for dtype in (np.int64, np.float32): - with test_utils.use_gpu(): - input_image = np.reshape(np.arange(0, 8), (2, 4, 1)).astype(dtype) - layer = image_preprocessing.RandomWidth( - factor=(-.5, -.5), interpolation='nearest') - output_image = layer(np.expand_dims(input_image, axis=0)) + transform_matrix = np.asarray( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, + expected_output, + mode="constant", + interpolation="nearest", + ) + + # Test right shift by 1. 
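Each `transform_matrix` above is the flattened 8-parameter projective transform `[a0, a1, a2, b0, b1, b2, c0, c1]` consumed by TF's image transform op: output pixel `(x, y)` samples the input at `((a0*x + a1*y + a2)/k, (b0*x + b1*y + b2)/k)` with `k = c0*x + c1*y + 1`, which is why `a2 = 1` shifts content left by one column and `b2 = -1` shifts it down by one row. A rough pure-numpy rendering of that convention for intuition (nearest sampling, constant fill; a sketch, not the op's actual kernel):

```python
import numpy as np

def project(image, t, fill_value=0.0):
    """Apply an 8-parameter projective transform to a 2-D image."""
    a0, a1, a2, b0, b1, b2, c0, c1 = t
    h, w = image.shape
    out = np.full((h, w), fill_value, dtype=image.dtype)
    for y in range(h):
        for x in range(w):
            k = c0 * x + c1 * y + 1.0
            sx = int(round((a0 * x + a1 * y + a2) / k))  # source column
            sy = int(round((b0 * x + b1 * y + b2) / k))  # source row
            if 0 <= sx < w and 0 <= sy < h:
                out[y, x] = image[sy, sx]
    return out

img = np.arange(15, dtype=np.float32).reshape(5, 3)
# b2 = -1 samples from y - 1, so the content shifts down by one row and the
# vacated top row takes the constant fill value, as in the tests above:
print(project(img, (1, 0, 0, 0, 1, -1, 0, 0)))
```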
# pyformat: disable - expected_output = np.asarray([ - [1, 3], - [5, 7] - ]).astype(dtype) + expected_output = ( + np.asarray( + [ + [0.0, 0.0, 1.0], + [0.0, 3.0, 4], + [0.0, 6.0, 7.0], + [0.0, 9.0, 10.0], + [0.0, 12.0, 13.0], + ] + ) + .reshape((1, 5, 3, 1)) + .astype(np.float32) + ) # pyformat: enable - expected_output = np.reshape(expected_output, (1, 2, 2, 1)) - self.assertAllEqual(expected_output, output_image) - - def test_random_width_invalid_factor(self): - with self.assertRaises(ValueError): - image_preprocessing.RandomWidth((-1.5, .4)) - - def test_random_width_inference(self): - input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) - expected_output = input_images - with test_utils.use_gpu(): - layer = image_preprocessing.RandomWidth(.5) - actual_output = layer(input_images, training=False) - self.assertAllClose(expected_output, actual_output) - - @test_utils.run_v2_only - def test_config_with_custom_name(self): - layer = image_preprocessing.RandomWidth(.5, name='image_preproc') - config = layer.get_config() - layer_1 = image_preprocessing.RandomWidth.from_config(config) - self.assertEqual(layer_1.name, layer.name) - - def test_unbatched_image(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - img = np.random.random((8, 5, 3)) - layer = image_preprocessing.RandomWidth(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer(img, training=True) - self.assertEqual(img_out.shape[1], 3) - - @test_utils.run_v2_only - def test_batched_input(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - img = np.random.random((12, 8, 5, 3)) - layer = image_preprocessing.RandomWidth(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer(img, training=True) - self.assertEqual(img_out.shape[2], 3) - - @test_utils.run_v2_only - def test_augment_image(self): - # need (maxval - minval) * rnd + minval = 0.6 - mock_factor = 0.6 - with test_utils.use_gpu(): - img = np.random.random((8, 5, 3)) - layer = image_preprocessing.RandomWidth(.4) - with tf.compat.v1.test.mock.patch.object( - layer._random_generator, 'random_uniform', return_value=mock_factor): - img_out = layer.augment_image( - img, transformation=layer.get_random_transformation(image=img)) - self.assertEqual(img_out.shape[1], 3) - - @test_utils.run_v2_only - def test_output_dtypes(self): - inputs = np.array([[[1], [2]], [[3], [4]]], dtype='float64') - layer = image_preprocessing.RandomWidth(.2) - self.assertAllEqual(layer(inputs).dtype, 'float32') - layer = image_preprocessing.RandomWidth(.2, dtype='uint8') - self.assertAllEqual(layer(inputs).dtype, 'uint8') + transform_matrix = np.asarray( + [[1.0, 0.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0]] + ) + self._run_random_transform_with_mock( + transform_matrix, + expected_output, + mode="constant", + interpolation="nearest", + ) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class WithLabelsTest(test_combinations.TestCase): - - @parameterized.named_parameters( - ('RandomZoom', image_preprocessing.RandomZoom, { - 'height_factor': 0.1 - }), - ('RandomBrightness', image_preprocessing.RandomBrightness, { - 'factor': 0.5 - }), - ('RandomContrast', image_preprocessing.RandomContrast, { - 'factor': 0.5 - }), - ('RandomRotation', image_preprocessing.RandomRotation, { - 'factor': 0.2 - }), - ) - def 
test_layer_with_labels(self, layer_cls, init_args): - layer = layer_cls(**init_args) - - img = tf.random.uniform( - shape=(3, 512, 512, 3), minval=0, maxval=1, dtype=tf.float32) - labels = tf.constant(([[1, 0, 0], [0, 0, 1], [0, 1, 0]]), dtype=tf.float32) - - inputs = {'images': img, 'labels': labels} - outputs = layer(inputs) - self.assertAllClose(labels, outputs["labels"]) +class RandomRotationTest(test_combinations.TestCase): + def _run_test(self, factor): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + kwargs = {"factor": factor} + with test_utils.use_gpu(): + test_utils.layer_test( + image_preprocessing.RandomRotation, + kwargs=kwargs, + input_shape=(num_samples, orig_height, orig_width, channels), + expected_output_shape=(None, orig_height, orig_width, channels), + ) + + @parameterized.named_parameters( + ("random_rotate_4", 0.4), + ("random_rotate_3", 0.3), + ("random_rotate_tuple_factor", (-0.5, 0.4)), + ) + def test_random_rotation(self, factor): + self._run_test(factor) + + def test_random_rotation_inference(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images + with test_utils.use_gpu(): + layer = image_preprocessing.RandomRotation(0.5) + actual_output = layer(input_images, training=False) + self.assertAllClose(expected_output, actual_output) + + def test_distribution_strategy(self): + """Tests that RandomRotation can be created within DistStrats.""" + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + with test_utils.use_gpu(): + strat = tf.distribute.MirroredStrategy(devices=["cpu", "gpu"]) + with strat.scope(): + layer = image_preprocessing.RandomRotation(0.5) + output = strat.run(lambda: layer(input_images, training=True)) + values = output.values + self.assertAllEqual(2, len(values)) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): + layer = image_preprocessing.RandomRotation(0.5, name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.RandomRotation.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_unbatched_image(self): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype( + np.float32 + ) + # 180 rotation. 
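`RandomRotation`'s `factor` is a fraction of 2*pi, so pinning it with `factor=(0.5, 0.5)` forces a rotation of exactly pi (180 degrees); for a square grid that amounts to reversing both axes, which is where the expected grid below comes from. A quick numpy check of that arithmetic:

```python
import numpy as np

img = np.arange(25, dtype=np.float32).reshape(5, 5)
# A 180-degree rotation reverses both axes, so the first output row is the
# last input row reversed: [24, 23, 22, 21, 20].
print(img[::-1, ::-1])
```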
+ layer = image_preprocessing.RandomRotation(factor=(0.5, 0.5)) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [24, 23, 22, 21, 20], + [19, 18, 17, 16, 15], + [14, 13, 12, 11, 10], + [9, 8, 7, 6, 5], + [4, 3, 2, 1, 0], + ] + ).astype(np.float32) + expected_output = np.reshape(expected_output, (5, 5, 1)) + self.assertAllClose(expected_output, output_image) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.RandomRotation(0.5) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.RandomRotation(0.5, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") @test_combinations.run_all_keras_modes(always_skip_v1=True) -class LearningPhaseTest(test_combinations.TestCase): - - def test_plain_call(self): - layer = image_preprocessing.RandomWidth(.5, seed=123) - shape = (12, 12, 3) - img = np.random.random((12,) + shape) - out = layer(img) # Default to training=True - self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - - out = layer(img, training=True) - self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - - out = layer(img, training=False) - self.assertEqual(tuple(int(i) for i in out.shape[1:]), shape) - - def test_call_in_container(self): - layer1 = image_preprocessing.RandomWidth(.5, seed=123) - layer2 = image_preprocessing.RandomHeight(.5, seed=123) - seq = sequential.Sequential([layer1, layer2]) - - shape = (12, 12, 3) - img = np.random.random((12,) + shape) - out = seq(img) # Default to training=True - self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - - out = seq(img, training=True) - self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - - out = seq(img, training=False) - self.assertEqual(tuple(int(i) for i in out.shape[1:]), shape) +class RandomZoomTest(test_combinations.TestCase): + def _run_test(self, height_factor, width_factor): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + kwargs = {"height_factor": height_factor, "width_factor": width_factor} + with test_utils.use_gpu(): + test_utils.layer_test( + image_preprocessing.RandomZoom, + kwargs=kwargs, + input_shape=(num_samples, orig_height, orig_width, channels), + expected_output_shape=(None, orig_height, orig_width, channels), + ) + + @parameterized.named_parameters( + ("random_zoom_4_by_6", -0.4, -0.6), + ("random_zoom_2_by_3", -0.2, -0.3), + ("random_zoom_tuple_factor", (-0.4, -0.5), (-0.2, -0.3)), + ) + def test_random_zoom_in(self, height_factor, width_factor): + self._run_test(height_factor, width_factor) + + @parameterized.named_parameters( + ("random_zoom_4_by_6", 0.4, 0.6), + ("random_zoom_2_by_3", 0.2, 0.3), + ("random_zoom_tuple_factor", (0.4, 0.5), (0.2, 0.3)), + ) + def test_random_zoom_out(self, height_factor, width_factor): + self._run_test(height_factor, width_factor) + + def test_random_zoom_in_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype( + dtype + ) + layer = image_preprocessing.RandomZoom( + (-0.5, -0.5), (-0.5, -0.5), interpolation="nearest" + ) + output_image = layer(np.expand_dims(input_image, axis=0)) + expected_output = np.asarray( + [ + [6, 7, 7, 8, 8], + [11, 12, 12, 13, 13], + [11, 12, 12, 13, 13], + [16, 17, 17, 18, 18], + [16, 17, 17, 18, 18], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + 
self.assertAllEqual(expected_output, output_image) + + def test_random_zoom_out_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype( + dtype + ) + layer = image_preprocessing.RandomZoom( + (0.5, 0.5), + (0.8, 0.8), + fill_mode="constant", + interpolation="nearest", + ) + output_image = layer(np.expand_dims(input_image, axis=0)) + expected_output = np.asarray( + [ + [0, 0, 0, 0, 0], + [0, 5, 7, 9, 0], + [0, 10, 12, 14, 0], + [0, 20, 22, 24, 0], + [0, 0, 0, 0, 0], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_zoom_out_numeric_preserve_aspect_ratio(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype( + dtype + ) + layer = image_preprocessing.RandomZoom( + (0.5, 0.5), fill_mode="constant", interpolation="nearest" + ) + output_image = layer(np.expand_dims(input_image, axis=0)) + expected_output = np.asarray( + [ + [0, 0, 0, 0, 0], + [0, 6, 7, 9, 0], + [0, 11, 12, 14, 0], + [0, 21, 22, 24, 0], + [0, 0, 0, 0, 0], + ] + ).astype(dtype) + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_zoom_inference(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images + with test_utils.use_gpu(): + layer = image_preprocessing.RandomZoom(0.5, 0.5) + actual_output = layer(input_images, training=False) + self.assertAllClose(expected_output, actual_output) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): + layer = image_preprocessing.RandomZoom(0.5, 0.6, name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.RandomZoom.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_unbatched_image(self): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype( + np.int64 + ) + layer = image_preprocessing.RandomZoom( + (-0.5, -0.5), (-0.5, -0.5), interpolation="nearest" + ) + output_image = layer(input_image) + expected_output = np.asarray( + [ + [6, 7, 7, 8, 8], + [11, 12, 12, 13, 13], + [11, 12, 12, 13, 13], + [16, 17, 17, 18, 18], + [16, 17, 17, 18, 18], + ] + ).astype(np.int64) + expected_output = np.reshape(expected_output, (5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.RandomZoom(0.5, 0.5) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.RandomZoom(0.5, 0.5, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") @test_combinations.run_all_keras_modes(always_skip_v1=True) -class DeterminismTest(test_combinations.TestCase): - - @parameterized.named_parameters( - ('random_contrast', - functools.partial(image_preprocessing.RandomContrast, factor=1.)), - ('random_crop', - functools.partial(image_preprocessing.RandomCrop, height=2, width=2)), - ('random_translation', - functools.partial(image_preprocessing.RandomTranslation, 0.3, 0.2)), - ('random_rotation', - functools.partial(image_preprocessing.RandomRotation, 0.5)), - ('random_zoom', functools.partial(image_preprocessing.RandomZoom, 0.2)), - ('random_height', functools.partial(image_preprocessing.RandomHeight, - 0.4)), - 
('random_width', functools.partial(image_preprocessing.RandomWidth, 0.3)), - ) - def test_seed_constructor_arg(self, layer_cls): - input_image = np.random.random((2, 5, 8, 3)).astype(np.float32) - - layer1 = layer_cls(seed=0.) - layer2 = layer_cls(seed=0.) - layer1_output = layer1(input_image) - layer2_output = layer2(input_image) - - self.assertAllClose(layer1_output.numpy().tolist(), - layer2_output.numpy().tolist()) - - -class RandomAddLayer(image_preprocessing.BaseImageAugmentationLayer): - - def __init__(self, value_range=(0., 1.0), fixed_value=None, **kwargs): - super().__init__(**kwargs) - self.value_range = value_range - self.fixed_value = fixed_value - - def get_random_transformation( - self, image=None, label=None, bounding_box=None): - if self.fixed_value: - return self.fixed_value - return self._random_generator.random_uniform( - [], minval=self.value_range[0], maxval=self.value_range[1]) - - def augment_image(self, image, transformation): - return image + transformation - - def augment_label(self, label, transformation): - return label + transformation - - -class VectorizeDisabledLayer(image_preprocessing.BaseImageAugmentationLayer): - - def __init__(self, **kwargs): - self.auto_vectorize = False - super().__init__(**kwargs) +class RandomHeightTest(test_combinations.TestCase): + def _run_test(self, factor): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + with test_utils.use_gpu(): + img = np.random.random( + (num_samples, orig_height, orig_width, channels) + ) + layer = image_preprocessing.RandomHeight(factor) + img_out = layer(img, training=True) + self.assertEqual(img_out.shape[0], 2) + self.assertEqual(img_out.shape[2], 8) + self.assertEqual(img_out.shape[3], 3) + + @parameterized.named_parameters( + ("random_height_4_by_6", (0.4, 0.6)), + ("random_height_3_by_2", (-0.3, 0.2)), + ("random_height_3", 0.3), + ) + def test_random_height_basic(self, factor): + self._run_test(factor) + + def test_valid_random_height(self): + # need (maxval - minval) * rnd + minval = 0.6 + mock_factor = 0.6 + with test_utils.use_gpu(): + img = np.random.random((12, 5, 8, 3)) + layer = image_preprocessing.RandomHeight(0.4) + with tf.compat.v1.test.mock.patch.object( + layer._random_generator, + "random_uniform", + return_value=mock_factor, + ): + img_out = layer(img, training=True) + self.assertEqual(img_out.shape[1], 3) + + def test_random_height_longer_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype( + dtype + ) + layer = image_preprocessing.RandomHeight(factor=(1.0, 1.0)) + # Return type of RandomHeight() is float32 + # if `interpolation` is not + # set to `ResizeMethod.NEAREST_NEIGHBOR`; + # cast `layer` to desired dtype. 
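The fractional rows in the expected output just below (`0.75`, `2.25`, ...) fall out of bilinear resizing with half-pixel centers when the 2-row input is stretched to 4 rows; a sketch of that arithmetic, assuming the half-pixel convention `tf.image.resize` uses by default:

```python
import numpy as np

src = np.arange(6, dtype=np.float32).reshape(2, 3)  # the 2x3 test input
out = np.empty((4, 3), dtype=np.float32)
for i in range(4):
    # Half-pixel-centers source coordinate for output row i, clipped into
    # range, then interpolated between the two nearest source rows.
    pos = float(np.clip((i + 0.5) * (2 / 4) - 0.5, 0.0, 1.0))
    lo = int(np.floor(pos))
    hi = min(lo + 1, 1)
    frac = pos - lo
    out[i] = (1 - frac) * src[lo] + frac * src[hi]
print(out)  # [[0, 1, 2], [0.75, 1.75, 2.75], [2.25, 3.25, 4.25], [3, 4, 5]]
```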
+ output_image = tf.cast( + layer(np.expand_dims(input_image, axis=0)), dtype=dtype + ) + # pyformat: disable + expected_output = np.asarray( + [ + [0, 1, 2], + [0.75, 1.75, 2.75], + [2.25, 3.25, 4.25], + [3, 4, 5], + ] + ).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 4, 3, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_height_shorter_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 8), (4, 2, 1)).astype( + dtype + ) + layer = image_preprocessing.RandomHeight( + factor=(-0.5, -0.5), interpolation="nearest" + ) + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([[2, 3], [6, 7]]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 2, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_height_invalid_factor(self): + with self.assertRaises(ValueError): + image_preprocessing.RandomHeight((-1.5, 0.4)) + + def test_random_height_inference(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images + with test_utils.use_gpu(): + layer = image_preprocessing.RandomHeight(0.5) + actual_output = layer(input_images, training=False) + self.assertAllClose(expected_output, actual_output) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): + layer = image_preprocessing.RandomHeight(0.5, name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.RandomHeight.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_unbatched_image(self): + # need (maxval - minval) * rnd + minval = 0.6 + mock_factor = 0.6 + with test_utils.use_gpu(): + img = np.random.random((5, 8, 3)) + layer = image_preprocessing.RandomHeight(0.4) + with tf.compat.v1.test.mock.patch.object( + layer._random_generator, + "random_uniform", + return_value=mock_factor, + ): + img_out = layer(img, training=True) + self.assertEqual(img_out.shape[0], 3) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.RandomHeight(0.2) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.RandomHeight(0.2, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") @test_combinations.run_all_keras_modes(always_skip_v1=True) -class BaseImageAugmentationLayerTest(test_combinations.TestCase): - - def test_augment_single_image(self): - add_layer = RandomAddLayer(fixed_value=2.0) - image = np.random.random(size=(8, 8, 3)).astype('float32') - output = add_layer(image) - - self.assertAllClose(image + 2.0, output) - - def test_augment_dict_return_type(self): - add_layer = RandomAddLayer(fixed_value=2.0) - image = np.random.random(size=(8, 8, 3)).astype('float32') - output = add_layer({'images': image}) - - self.assertIsInstance(output, dict) - - def test_auto_vectorize_disabled(self): - vectorize_disabled_layer = VectorizeDisabledLayer() - self.assertFalse(vectorize_disabled_layer.auto_vectorize) - self.assertEqual(vectorize_disabled_layer._map_fn, tf.map_fn) - - @test_utils.run_v2_only - def test_augment_casts_dtypes(self): - add_layer = RandomAddLayer(fixed_value=2.0) - images = tf.ones((2, 8, 8, 3), dtype='uint8') - output = add_layer(images) - - self.assertAllClose(tf.ones((2, 8, 8, 3), dtype='float32') * 3.0, output) +class 
RandomWidthTest(test_combinations.TestCase): + def _run_test(self, factor): + np.random.seed(1337) + num_samples = 2 + orig_height = 5 + orig_width = 8 + channels = 3 + with test_utils.use_gpu(): + img = np.random.random( + (num_samples, orig_height, orig_width, channels) + ) + layer = image_preprocessing.RandomWidth(factor) + img_out = layer(img, training=True) + self.assertEqual(img_out.shape[0], 2) + self.assertEqual(img_out.shape[1], 5) + self.assertEqual(img_out.shape[3], 3) + + @parameterized.named_parameters( + ("random_width_4_by_6", (0.4, 0.6)), + ("random_width_3_by_2", (-0.3, 0.2)), + ("random_width_3", 0.3), + ) + def test_random_width_basic(self, factor): + self._run_test(factor) + + def test_valid_random_width(self): + # need (maxval - minval) * rnd + minval = 0.6 + mock_factor = 0.6 + with test_utils.use_gpu(): + img = np.random.random((12, 8, 5, 3)) + layer = image_preprocessing.RandomWidth(0.4) + with tf.compat.v1.test.mock.patch.object( + layer._random_generator, + "random_uniform", + return_value=mock_factor, + ): + img_out = layer(img, training=True) + self.assertEqual(img_out.shape[2], 3) + + def test_random_width_longer_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 6), (3, 2, 1)).astype( + dtype + ) + layer = image_preprocessing.RandomWidth(factor=(1.0, 1.0)) + # Return type of RandomWidth() is float32 + # if `interpolation` is not + # set to `ResizeMethod.NEAREST_NEIGHBOR`; + # cast `layer` to desired dtype. + output_image = tf.cast( + layer(np.expand_dims(input_image, axis=0)), dtype=dtype + ) + # pyformat: disable + expected_output = np.asarray( + [[0, 0.25, 0.75, 1], [2, 2.25, 2.75, 3], [4, 4.25, 4.75, 5]] + ).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 3, 4, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_width_shorter_numeric(self): + for dtype in (np.int64, np.float32): + with test_utils.use_gpu(): + input_image = np.reshape(np.arange(0, 8), (2, 4, 1)).astype( + dtype + ) + layer = image_preprocessing.RandomWidth( + factor=(-0.5, -0.5), interpolation="nearest" + ) + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([[1, 3], [5, 7]]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 2, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_width_invalid_factor(self): + with self.assertRaises(ValueError): + image_preprocessing.RandomWidth((-1.5, 0.4)) + + def test_random_width_inference(self): + input_images = np.random.random((2, 5, 8, 3)).astype(np.float32) + expected_output = input_images + with test_utils.use_gpu(): + layer = image_preprocessing.RandomWidth(0.5) + actual_output = layer(input_images, training=False) + self.assertAllClose(expected_output, actual_output) + + @test_utils.run_v2_only + def test_config_with_custom_name(self): + layer = image_preprocessing.RandomWidth(0.5, name="image_preproc") + config = layer.get_config() + layer_1 = image_preprocessing.RandomWidth.from_config(config) + self.assertEqual(layer_1.name, layer.name) + + def test_unbatched_image(self): + # need (maxval - minval) * rnd + minval = 0.6 + mock_factor = 0.6 + with test_utils.use_gpu(): + img = np.random.random((8, 5, 3)) + layer = image_preprocessing.RandomWidth(0.4) + with tf.compat.v1.test.mock.patch.object( + layer._random_generator, + "random_uniform", + return_value=mock_factor, + ): + img_out = 
layer(img, training=True) + self.assertEqual(img_out.shape[1], 3) + + @test_utils.run_v2_only + def test_output_dtypes(self): + inputs = np.array([[[1], [2]], [[3], [4]]], dtype="float64") + layer = image_preprocessing.RandomWidth(0.2) + self.assertAllEqual(layer(inputs).dtype, "float32") + layer = image_preprocessing.RandomWidth(0.2, dtype="uint8") + self.assertAllEqual(layer(inputs).dtype, "uint8") - def test_augment_batch_images(self): - add_layer = RandomAddLayer() - images = np.random.random(size=(2, 8, 8, 3)).astype('float32') - output = add_layer(images) - diff = output - images - # Make sure the first image and second image get different augmentation - self.assertNotAllClose(diff[0], diff[1]) +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class LearningPhaseTest(test_combinations.TestCase): + def test_plain_call(self): + layer = image_preprocessing.RandomWidth(0.5, seed=123) + shape = (12, 12, 3) + img = np.random.random((12,) + shape) + out = layer(img) # Defaults to training=True + self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - def test_augment_image_and_label(self): - add_layer = RandomAddLayer(fixed_value=2.0) - image = np.random.random(size=(8, 8, 3)).astype('float32') - label = np.random.random(size=(1,)).astype('float32') + out = layer(img, training=True) + self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - output = add_layer({'images': image, 'labels': label}) - expected_output = {'images': image + 2.0, 'labels': label + 2.0} - self.assertAllClose(output, expected_output) + out = layer(img, training=False) + self.assertEqual(tuple(int(i) for i in out.shape[1:]), shape) - def test_augment_image_and_target(self): - add_layer = RandomAddLayer(fixed_value=2.0) - image = np.random.random(size=(8, 8, 3)).astype('float32') - label = np.random.random(size=(1,)).astype('float32') + def test_call_in_container(self): + layer1 = image_preprocessing.RandomWidth(0.5, seed=123) + layer2 = image_preprocessing.RandomHeight(0.5, seed=123) + seq = sequential.Sequential([layer1, layer2]) - output = add_layer({'images': image, 'targets': label}) - expected_output = {'images': image + 2.0, 'targets': label + 2.0} - self.assertAllClose(output, expected_output) + shape = (12, 12, 3) + img = np.random.random((12,) + shape) + out = seq(img) # Defaults to training=True + self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - def test_augment_batch_images_and_labels(self): - add_layer = RandomAddLayer() - images = np.random.random(size=(2, 8, 8, 3)).astype('float32') - labels = np.random.random(size=(2, 1)).astype('float32') - output = add_layer({'images': images, 'labels': labels}) + out = seq(img, training=True) + self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape) - image_diff = output['images'] - images - label_diff = output['labels'] - labels - # Make sure the first image and second image get different augmentation - self.assertNotAllClose(image_diff[0], image_diff[1]) - self.assertNotAllClose(label_diff[0], label_diff[1]) + out = seq(img, training=False) + self.assertEqual(tuple(int(i) for i in out.shape[1:]), shape) -if __name__ == '__main__': - tf.test.main() +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class DeterminismTest(test_combinations.TestCase): + @parameterized.named_parameters( + ("random_flip", image_preprocessing.RandomFlip), + ( + "random_contrast", + functools.partial(image_preprocessing.RandomContrast, factor=1.0), + ), + ( + "random_crop", + functools.partial( + 
image_preprocessing.RandomCrop, height=2, width=2 + ), + ), + ( + "random_translation", + functools.partial(image_preprocessing.RandomTranslation, 0.3, 0.2), + ), + ( + "random_rotation", + functools.partial(image_preprocessing.RandomRotation, 0.5), + ), + ("random_zoom", functools.partial(image_preprocessing.RandomZoom, 0.2)), + ( + "random_height", + functools.partial(image_preprocessing.RandomHeight, 0.4), + ), + ( + "random_width", + functools.partial(image_preprocessing.RandomWidth, 0.3), + ), + ) + def test_seed_constructor_arg(self, layer_cls): + input_image = np.random.random((2, 5, 8, 3)).astype(np.float32) + + layer1 = layer_cls(seed=0.0) + layer2 = layer_cls(seed=0.0) + layer1_output = layer1(input_image) + layer2_output = layer2(input_image) + + self.assertAllClose( + layer1_output.numpy().tolist(), layer2_output.numpy().tolist() + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/preprocessing/index_lookup.py b/keras/layers/preprocessing/index_lookup.py index 752f2c294bf6..4747b7ac206e 100644 --- a/keras/layers/preprocessing/index_lookup.py +++ b/keras/layers/preprocessing/index_lookup.py @@ -14,20 +14,21 @@ # ============================================================================== """Keras index lookup preprocessing layer.""" -# pylint: disable=g-classes-have-attributes - import collections +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine import base_layer_utils from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import preprocessing_utils as utils -from keras.saving.saved_model import layer_serialization +from keras.saving.legacy.saved_model import layer_serialization from keras.utils import layer_utils from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.platform import tf_logging as logging INT = utils.INT @@ -41,770 +42,958 @@ class NullInitializer(tf.lookup.KeyValueTensorInitializer): - """A placeholder initializer for restoring this layer from a SavedModel.""" + """A placeholder initializer for restoring this layer from a SavedModel.""" - def __init__(self, key_dtype, value_dtype): - """Construct a table initializer object. + def __init__(self, key_dtype, value_dtype): + """Construct a table initializer object. - Args: - key_dtype: Type of the table keys. - value_dtype: Type of the table values. - """ - self._key_dtype = key_dtype - self._value_dtype = value_dtype + Args: + key_dtype: Type of the table keys. + value_dtype: Type of the table values. 
+ """ + self._key_dtype = key_dtype + self._value_dtype = value_dtype - @property - def key_dtype(self): - """The expected table key dtype.""" - return self._key_dtype + @property + def key_dtype(self): + """The expected table key dtype.""" + return self._key_dtype - @property - def value_dtype(self): - """The expected table value dtype.""" - return self._value_dtype + @property + def value_dtype(self): + """The expected table value dtype.""" + return self._value_dtype - def initialize(self, table): - """Returns the table initialization op.""" - pass + def initialize(self, table): + """Returns the table initialization op.""" + pass class VocabWeightHandler(base_layer_utils.TrackableWeightHandler): - """Adds the vocabulary as a layer weight during serialization.""" + """Adds the vocabulary as a layer weight during serialization.""" - def __init__(self, lookup_layer): - self._layer = lookup_layer - self._dtype = lookup_layer.vocabulary_dtype - self._distribute_strategy = tf.distribute.get_strategy() + def __init__(self, lookup_layer): + # Note that this class doesn't call super().__init__() in order to + # have customized behavior. The fileds like '_dtype' and + # '_distribute_strategy' are required by the parent class, as well as + # tf.distribute. See `strategy.extended.variable_created_in_scope` + self._layer = lookup_layer + self._dtype = lookup_layer.vocabulary_dtype + self._distribute_strategy = tf.distribute.get_strategy() - @property - def num_tensors(self): - return 1 + @property + def num_tensors(self): + return 1 - def set_weights(self, weights): - tokens = tf.convert_to_tensor(weights[0], self._dtype) - self._layer.lookup_table = self._layer._lookup_table_from_tokens(tokens) # pylint: disable=protected-access + def set_weights(self, weights): + tokens = tf.convert_to_tensor(weights[0], self._dtype) + self._layer.lookup_table = self._layer._lookup_table_from_tokens(tokens) - def get_tensors(self): - # Just save the non-config part of the vocab (no special tokens). - tokens = self._layer.get_vocabulary(include_special_tokens=False) - tokens = tf.convert_to_tensor(tokens, self._dtype) - return [tokens] + def get_tensors(self): + # Just save the non-config part of the vocab (no special tokens). + tokens = self._layer.get_vocabulary(include_special_tokens=False) + tokens = tf.convert_to_tensor(tokens, self._dtype) + return [tokens] class IndexLookup(base_preprocessing_layer.PreprocessingLayer): - """Maps values from a vocabulary to integer indices. - - This layer translates a set of arbitrary hashables into an integer output via - a table-based lookup, with optional out-of-vocabulary handling. This is the - basis layer for both IntegerLookup and StringLookup; it holds the common - logic but is not intended to be exported as part of the Keras API. - - Args: - max_tokens: The maximum size of the vocabulary for this layer. If None, - there is no cap on the size of the vocabulary. Note that this size - includes the OOV and mask tokens. - num_oov_indices: The number of out-of-vocabulary tokens to use. If this - value is more than 1, OOV inputs are hashed to determine their OOV value. - If this value is 0, OOV inputs will cause an error when calling the layer. - mask_token: A token that represents masked inputs. When `output_mode` is - `"int"`, the token is included in vocabulary and mapped to index 0. In - other output modes, the token will not appear in the vocabulary and - instances of the mask token in the input will be dropped. If set to None, - no mask term will be added. 
- oov_token: Only used when `invert` is True. The token to return for OOV - indices. - vocabulary: Optional. Either an array or a string path to a text file. If - passing an array, can pass a tuple, list, 1D numpy array, or 1D tensor - containing the vocbulary terms. If passing a file path, the file should - contain one line per term in the vocabulary. If this argument is set, - there is no need to `adapt` the layer. - vocabulary_dtype: The dtype of the vocabulary terms. For example, `"int64"` - or `"string"`. - idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list, 1D - numpy array, or 1D tensor or the same length as the vocabulary, containing - the floating point inverse document frequency weights, which will be - multiplied by per sample term counts for the final `tf_idf` weight. If the - `vocabulary` argument is set, and `output_mode` is `"tf_idf"`, this - argument must be supplied. - invert: Only valid when `output_mode` is `"int"`. If True, this layer will - map indices to vocabulary items instead of mapping vocabulary items to - indices. Default to False. - output_mode: Specification for the output of the layer. Defaults to `"int"`. - Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or - `"tf_idf"` configuring the layer as follows: - - `"int"`: Return the raw integer indices of the input tokens. - - `"one_hot"`: Encodes each individual element in the input into an - array the same size as the vocabulary, containing a 1 at the element - index. If the last dimension is size 1, will encode on that dimension. - If the last dimension is not size 1, will append a new dimension for - the encoded output. - - `"multi_hot"`: Encodes each sample in the input into a single array - the same size as the vocabulary, containing a 1 for each vocabulary - term present in the sample. Treats the last dimension as the sample - dimension, if input shape is (..., sample_length), output shape will - be (..., num_tokens). - - `"count"`: As `"multi_hot"`, but the int array contains a count of the - number of times the token at that index appeared in the sample. - - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to - find the value in each token slot. - pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`, - `"count"`, or `"tf_idf"`. If True, the output will have its feature axis - padded to `max_tokens` even if the number of unique tokens in the - vocabulary is less than max_tokens, resulting in a tensor of shape - [batch_size, max_tokens] regardless of vocabulary size. Defaults to False. - sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, `"count"` - and `"tf-idf"` output modes. If True, returns a `SparseTensor` instead of - a dense `Tensor`. Defaults to False. - """ - - def __init__(self, - max_tokens, - num_oov_indices, - mask_token, - oov_token, - vocabulary_dtype, - vocabulary=None, - idf_weights=None, - invert=False, - output_mode="int", - sparse=False, - pad_to_max_tokens=False, - **kwargs): - # If max_tokens is set, the value must be greater than 1 - otherwise we - # are creating a 0-element vocab, which doesn't make sense. - if max_tokens is not None and max_tokens <= 1: - raise ValueError(f"If set, `max_tokens` must be greater than 1. " - f"Received: max_tokens={max_tokens}") - - if pad_to_max_tokens and max_tokens is None: - raise ValueError(f"If pad_to_max_tokens is True, must set `max_tokens`. 
" - f"Received: max_tokens={max_tokens}") - - if num_oov_indices < 0: - raise ValueError(f"`num_oov_indices` must be greater than or equal to 0. " - f"Received: num_oov_indices={num_oov_indices}") - - # Support deprecated names for output_modes. - if output_mode == "binary": - output_mode = MULTI_HOT - if output_mode == "tf-idf": - output_mode = TF_IDF - # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF) - layer_utils.validate_string_arg( - output_mode, - allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF), - layer_name=self.__class__.__name__, - arg_name="output_mode") - - if invert and output_mode != INT: - raise ValueError(f"`output_mode` must be `'int'` when `invert` is true. " - f"Received: output_mode={output_mode}") - - if sparse and output_mode == INT: - raise ValueError(f"`sparse` may only be true if `output_mode` is " - f"`'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. " - f"Received: sparse={sparse} and " - f"output_mode={output_mode}") - - if idf_weights is not None and output_mode != TF_IDF: - raise ValueError(f"`idf_weights` should only be set if `output_mode` is " - f"`'tf_idf'`. Received: idf_weights={idf_weights} and " - f"output_mode={output_mode}") - - self.invert = invert - self.max_tokens = max_tokens - self.num_oov_indices = num_oov_indices - self.mask_token = mask_token - self.oov_token = oov_token - self.output_mode = output_mode - self.sparse = sparse - self.pad_to_max_tokens = pad_to_max_tokens - self.vocabulary_dtype = vocabulary_dtype - self._frozen_vocab_size = None - - self.input_vocabulary = vocabulary - self.input_idf_weights = idf_weights - # VocabularySavedModelSaver will clear the config vocabulary to restore the - # lookup table ops directly. We persist this hidden option to persist the - # fact that we have have a non-adaptable layer with a manually set vocab. - self._has_input_vocabulary = kwargs.pop("has_input_vocabulary", - (vocabulary is not None)) - - # Drop deprecated config options. - kwargs.pop("vocabulary_size", None) - kwargs.pop("has_static_table", None) - - # By default, output int64 when output_mode='int' and floats otherwise. - if "dtype" not in kwargs: - kwargs["dtype"] = tf.int64 if output_mode == INT else backend.floatx() - - super().__init__(**kwargs) - - # Check dtype only after base layer parses it; dtype parsing is complex. - if output_mode == INT and not tf.as_dtype(self.compute_dtype).is_integer: - input_dtype = kwargs["dtype"] - raise ValueError("When `output_mode='int'`, `dtype` should be an integer " - f"type. Received: dtype={input_dtype}") - - if invert: - self._key_dtype = self.dtype if output_mode == INT else tf.int64 - self._value_dtype = tf.as_dtype(self.vocabulary_dtype) - mask_key = 0 - mask_value = mask_token - self._default_value = self.oov_token - else: - self._key_dtype = tf.as_dtype(self.vocabulary_dtype) - self._value_dtype = self.dtype if output_mode == INT else tf.int64 - mask_key = mask_token - # Masks should map to 0 for int output and be dropped otherwise. Max ints - # will be dropped from the bincount op. - mask_value = 0 if self.output_mode == INT else self._value_dtype.max - if self.num_oov_indices == 0: - # If there are no OOV indices, we map OOV tokens to -1 and error out - # during call if we find a negative index. - self._default_value = -1 - elif self.num_oov_indices == 1: - # If there is only one OOV index, we can set that index as the default - # value of the index_lookup table. 
- self._default_value = self._oov_start_index() - else: - # If we have multiple OOV values, we need to do a further hashing step; - # to make this easier, we set the OOV value to -1. (This lets us do a - # vectorized add and cast to boolean to determine locations where we - # need to do extra hashing.) - self._default_value = -1 - if self.mask_token is not None: - self._mask_key = tf.convert_to_tensor(mask_key, self._key_dtype) - self._mask_value = tf.convert_to_tensor(mask_value, self._value_dtype) - - if self.output_mode == TF_IDF: - self.idf_weights = tf.Variable( - [0] * self._token_start_index(), - shape=(None,), - dtype=self.compute_dtype, - trainable=False) - self.idf_weights_const = self.idf_weights.value() - - if vocabulary is not None: - self.set_vocabulary(vocabulary, idf_weights) - else: - # When restoring from a keras SavedModel, the loading code will expect to - # find and restore a lookup_table attribute on the layer. This table needs - # to be uninitialized as a StaticHashTable cannot be initialized twice. - self.lookup_table = self._uninitialized_lookup_table() - - # Only set up adapt state if we did not receive a vocab on construction. - if not self._has_input_vocabulary: - # Add a custom weight handler to return the layers vocab as it's weight. - self._add_trackable(VocabWeightHandler(self), False) - # Set adapt state. - self.token_counts = tf.lookup.experimental.MutableHashTable( - key_dtype=vocabulary_dtype, value_dtype=tf.int64, default_value=0) - if self.output_mode == TF_IDF: - self.token_document_counts = tf.lookup.experimental.MutableHashTable( - key_dtype=vocabulary_dtype, value_dtype=tf.int64, default_value=0) - self.num_documents = tf.Variable(0, dtype=tf.int64, trainable=False) - - def compute_output_shape(self, input_shape): - if self.output_mode == INT: - return input_shape - depth = ( - self.max_tokens if self.pad_to_max_tokens else self._frozen_vocab_size) - return tf.TensorShape([input_shape[0], depth]) - - def compute_output_signature(self, input_spec): - output_shape = self.compute_output_shape(input_spec.shape.as_list()) - output_dtype = self.vocabulary_dtype if self.invert else self.compute_dtype - return tf.TensorSpec(shape=output_shape, dtype=output_dtype) - - def get_vocabulary(self, include_special_tokens=True): - """Returns the current vocabulary of the layer. + """Maps values from a vocabulary to integer indices. - Args: - include_special_tokens: If True, the returned vocabulary will include mask - and OOV tokens, and a term's index in the vocabulary will equal the - term's index when calling the layer. If False, the returned vocabulary - will not include any mask or OOV tokens. - """ - # The lookup table data will not be sorted, so we will create a inverted - # lookup here, and use that to lookup a range of indices [0, vocab_size). - if self.lookup_table.size() == 0: - vocab, indices = [], [] - else: - keys, values = self.lookup_table.export() - vocab, indices = (values, keys) if self.invert else (keys, values) - vocab, indices = (self._tensor_vocab_to_numpy(vocab), indices.numpy()) - lookup = collections.defaultdict(lambda: self.oov_token, - zip(indices, vocab)) - vocab = [lookup[x] for x in range(self.vocabulary_size())] - if self.mask_token is not None and self.output_mode == INT: - vocab[0] = self.mask_token - if not include_special_tokens: - vocab = vocab[self._token_start_index():] - return vocab - - def vocabulary_size(self): - """Gets the current size of the layer's vocabulary. 
- - Returns: - The integer size of the voculary, including optional mask and oov indices. - """ - return int(self.lookup_table.size().numpy()) + self._token_start_index() - - def vocab_size(self): - logging.warning("vocab_size is deprecated, please use vocabulary_size.") - return self.vocabulary_size() - - def get_config(self): - config = { - "invert": self.invert, - "max_tokens": self.max_tokens, - "num_oov_indices": self.num_oov_indices, - "oov_token": self.oov_token, - "mask_token": self.mask_token, - "output_mode": self.output_mode, - "sparse": self.sparse, - "pad_to_max_tokens": self.pad_to_max_tokens, - "vocabulary": utils.listify_tensors(self.input_vocabulary), - "vocabulary_dtype": self.vocabulary_dtype, - "idf_weights": utils.listify_tensors(self.input_idf_weights), - } - - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def set_vocabulary(self, vocabulary, idf_weights=None): - """Sets vocabulary (and optionally document frequency) data for this layer. - - This method sets the vocabulary and idf weights for this layer directly, - instead of analyzing a dataset through `adapt`. It should be used whenever - the vocab (and optionally document frequency) information is already known. - If vocabulary data is already present in the layer, this method will replace - it. - - Args: - vocabulary: Either an array or a string path to a text file. If passing an - array, can pass a tuple, list, 1D numpy array, or 1D tensor containing - the vocbulary terms. If passing a file path, the file should contain one - line per term in the vocabulary. - idf_weights: A tuple, list, 1D numpy array, or 1D tensor of inverse - document frequency weights with equal length to vocabulary. Must be set - if `output_mode` is `"tf_idf"`. Should not be set otherwise. - - Raises: - ValueError: If there are too many inputs, the inputs do not match, or - input data is missing. - RuntimeError: If the vocabulary cannot be set when this function is - called. This happens when `"multi_hot"`, `"count"`, and `"tf_idf"` - modes, if `pad_to_max_tokens` is False and the layer itself has already - been called. - RuntimeError: If a tensor vocabulary is passed outside of eager execution. - """ - if self.output_mode != TF_IDF and idf_weights is not None: - raise ValueError(f"`idf_weights` should only be set if output_mode is " - f"`'tf_idf'`. Received: output_mode={self.output_mode} " - f"and idf_weights={idf_weights}") - - if isinstance(vocabulary, str): - if not tf.io.gfile.exists(vocabulary): - raise ValueError( - "Vocabulary file {} does not exist.".format(vocabulary)) - if self.output_mode == TF_IDF: - raise ValueError("output_mode `'tf_idf'` does not support loading a " - "vocabulary from file.") - self.lookup_table = self._lookup_table_from_file(vocabulary) - return - - if not tf.executing_eagerly() and (tf.is_tensor(vocabulary) or - tf.is_tensor(idf_weights)): - raise RuntimeError( - "Cannot set a tensor vocabulary on {} layer {} when not executing " - "eagerly. Create this layer or call `set_vocabulary` outside of " - "any `tf.function`s and with eager execution enabled.".format( - self.__class__.__name__, self.name)) - - # TODO(mattdangerw): for better performance we should rewrite this entire - # function to operate on tensors and convert vocabulary to a tensor here. 
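For callers, the simplest way to exercise the `set_vocabulary` path described above is through the public subclasses built on this base layer; a quick sketch with `StringLookup` and an invented vocabulary (index 0 is the default `"[UNK]"` OOV slot, since no mask token is set):

    import tensorflow as tf

    layer = tf.keras.layers.StringLookup(
        vocabulary=["earth", "wind", "and", "fire"]
    )
    print(layer(tf.constant(["fire", "and", "planet"])).numpy())  # [4 3 0]
    print(layer.get_vocabulary())
    # ['[UNK]', 'earth', 'wind', 'and', 'fire']
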
- if tf.is_tensor(vocabulary): - vocabulary = self._tensor_vocab_to_numpy(vocabulary) - elif isinstance(vocabulary, (list, tuple)): - vocabulary = np.array(vocabulary) - if tf.is_tensor(idf_weights): - idf_weights = idf_weights.numpy() - elif isinstance(idf_weights, (list, tuple)): - idf_weights = np.array(idf_weights) - - if vocabulary.size == 0: - raise ValueError( - "Cannot set an empty vocabulary, you passed {}.".format(vocabulary)) - - oov_start = self._oov_start_index() - token_start = self._token_start_index() - special_tokens = ( - [self.mask_token] * oov_start + [self.oov_token] * self.num_oov_indices) - found_special_tokens = np.array_equal( - special_tokens, vocabulary[:token_start]) - if found_special_tokens: - tokens = vocabulary[token_start:] - else: - tokens = vocabulary - - repeated_tokens = self._find_repeated_tokens(tokens) - if repeated_tokens: - raise ValueError("The passed vocabulary has at least one repeated " - "term. Please uniquify your dataset. The repeated terms " - "are {}".format(repeated_tokens)) - - if self.mask_token is not None and self.mask_token in tokens: - mask_index = np.argwhere(vocabulary == self.mask_token)[-1] - raise ValueError( - "Found reserved mask token at unexpected location in `vocabulary`. " - "Note that passed `vocabulary` does not need to include the OOV and " - "mask tokens. Either remove all mask and OOV tokens, or include them " - "only at the start of the vocabulary in precisely this order: " - f"{special_tokens}. Received: mask_token={self.mask_token} at " - f"vocabulary index {mask_index}") - # Only error out for oov_token when invert=True. When invert=False, - # oov_token is unused during lookup. - if self.oov_token is not None and self.invert and self.oov_token in tokens: - oov_index = np.argwhere(vocabulary == self.oov_token)[-1] - raise ValueError( - "Found reserved OOV token at unexpected location in `vocabulary`. " - "Note that passed `vocabulary` does not need to include the OOV and " - "mask tokens. Either remove all mask and OOV tokens, or include them " - "only at the start of the vocabulary in precisely this order: " - f"{special_tokens}. Received: oov_token={self.oov_token} at " - f"vocabulary index {oov_index}") - - new_vocab_size = token_start + len(tokens) - if self.max_tokens is not None and (new_vocab_size > self.max_tokens): - raise ValueError( - "Attempted to set a vocabulary larger than the maximum vocab size. " - "Passed vocab size is {}, max vocab size is {}.".format( - new_vocab_size, self.max_tokens)) - self.lookup_table = self._lookup_table_from_tokens(tokens) - - if self.output_mode == TF_IDF: - if idf_weights is None: - raise ValueError("`idf_weights` must be set if output_mode is TF_IDF") - if len(vocabulary) != len(idf_weights): - raise ValueError("`idf_weights` must be the same length as vocabulary. " - "len(idf_weights) is {}, len(vocabulary) is {}".format( - len(vocabulary), len(idf_weights))) - idf_weights = self._convert_to_ndarray(idf_weights) - if idf_weights.ndim != 1: - raise ValueError( - "TF-IDF data must be a 1-index array, but received {}".format( - type(idf_weights))) - - # If the passed vocabulary has no special tokens, we need to pad the front - # of idf_weights. We don't have real document frequencies for these tokens - # so we will use an average of all idf_weights passed in as a reasonable - # default. 
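A worked example of the padding arithmetic above, with invented sizes (`token_start=2`, `max_tokens=6`, `pad_to_max_tokens=True`): the mask/OOV slots get the mean weight as a stand-in, and the tail is zero-padded out to `max_tokens`.

    import numpy as np

    idf_weights = np.array([0.9, 0.7, 0.4])  # one weight per real token
    token_start = 2  # one mask slot + one OOV slot (invented)
    max_tokens = 6
    front_value = np.average(idf_weights)  # stand-in for mask/OOV weights
    back = max_tokens - token_start - len(idf_weights)
    padded = np.pad(idf_weights, (token_start, back), "constant",
                    constant_values=(front_value, 0))
    print(padded.round(3))  # [0.667 0.667 0.9   0.7   0.4   0.   ]
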
- if found_special_tokens: - front_padding = 0 - front_padding_value = 0 - else: - front_padding = token_start - front_padding_value = np.average(idf_weights) - # If pad_to_max_tokens is true, and max_tokens is greater than our total - # vocab size, we need to pad the back of idf_weights with zeros as well. - back_padding_value = 0 - if self.pad_to_max_tokens and self.max_tokens is not None: - back_padding = self.max_tokens - front_padding - len(idf_weights) - else: - back_padding = 0 - weights = np.pad( - idf_weights, (front_padding, back_padding), - "constant", - constant_values=(front_padding_value, back_padding_value)) - weights = tf.convert_to_tensor(weights, dtype=self.compute_dtype) - self.idf_weights.assign(weights) - self.idf_weights_const = self.idf_weights.value() - - def update_state(self, data): - if self._has_input_vocabulary: - raise ValueError( - "Cannot adapt {} layer after setting a static vocabulary via init " - "argument or `set_vocabulary`.".format(self.__class__.__name__)) - - data = utils.ensure_tensor(data, dtype=self.vocabulary_dtype) - if data.shape.rank == 0: - data = tf.expand_dims(data, 0) - if data.shape.rank == 1: - # Expand dims on axis 0 for tf-idf. A 1-d tensor is a single document. - data = tf.expand_dims(data, 0) - - tokens, counts = self._num_tokens(data) - self.token_counts.insert(tokens, counts + self.token_counts.lookup(tokens)) - - if self.output_mode == TF_IDF: - # Dedupe each row of our dataset. - deduped_doc_data = tf.map_fn(lambda x: tf.unique(x)[0], data) - # Flatten and count tokens. - tokens, doc_counts = self._num_tokens(deduped_doc_data) - self.token_document_counts.insert( - tokens, doc_counts + self.token_document_counts.lookup(tokens)) - if tf_utils.is_ragged(data): - self.num_documents.assign_add(data.nrows()) - else: - self.num_documents.assign_add(tf.shape(data, out_type=tf.int64)[0]) - - def finalize_state(self): - if self._has_input_vocabulary or tf.equal(self.token_counts.size(), 0): - # Finalize idf_weights to a const for call even if we don't need to - # compute a new vocabulary. - if self.output_mode == TF_IDF: - self.idf_weights_const = self.idf_weights.value() - return - - # Remove special tokens from our counts. - if self.mask_token is not None: - self.token_counts.remove( - tf.convert_to_tensor([self.mask_token], self.vocabulary_dtype)) - if self.oov_token is not None: - self.token_counts.remove( - tf.convert_to_tensor([self.oov_token], self.vocabulary_dtype)) - - tokens, counts = self.token_counts.export() - # To keep vocabs deterministic, we sort our tokens by count and break ties - # by sorting the tokens themselves. Tensorflow has no ops for sorting - # strings, so we need to use numpy for the sort. - sorted_indices = np.lexsort((tokens.numpy(), counts.numpy()))[::-1] - token_start = self._token_start_index() - if self.max_tokens: - max_learned_tokens = self.max_tokens - token_start - sorted_indices = sorted_indices[:max_learned_tokens] - tokens = tf.gather(tokens, sorted_indices) - self.lookup_table = self._lookup_table_from_tokens(tokens) - - if self.output_mode == TF_IDF: - token_document_counts = self.token_document_counts.lookup(tokens) - idf_weights = self._inverse_document_frequency(token_document_counts, - self.num_documents) - idf_weights = tf.cast(idf_weights, self.compute_dtype) - # Pad the front of idf_weights with the average idf weight for OOV tokens. - # We cannot compute the real idf weight of OOV in a single pass. 
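For reference, the "real" weights mentioned in the comment above come from the IDF formula implemented at the bottom of this class, log(1 + N / (1 + df)); a quick numeric check with invented document counts (N=100) shows rarer tokens getting larger weights. This is a sketch, not the layer's own code path.

    import tensorflow as tf

    num_documents = tf.constant(100.0, tf.float64)
    doc_counts = tf.constant([99.0, 9.0, 0.0], tf.float64)  # invented counts
    idf = tf.math.log(1 + num_documents / (1 + doc_counts))
    print(idf.numpy().round(3))  # [0.693 2.398 4.615]
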
- idf_weights = tf.pad( - idf_weights, [[self._token_start_index(), 0]], - constant_values=tf.reduce_mean(idf_weights)) - if self.pad_to_max_tokens and self.max_tokens is not None: - # Pad the back of idf_weights with zeros. - idf_weights = tf.pad( - idf_weights, [[0, self.max_tokens - tf.size(idf_weights)]], - constant_values=0) - self.idf_weights.assign(idf_weights) - self.idf_weights_const = self.idf_weights.value() - - # We call this here to save memory, now that we've built our vocabulary, we - # don't want to keep every token we've seen in separate lookup tables. - self.reset_state() - - def reset_state(self): # pylint: disable=method-hidden - if self._has_input_vocabulary: - return - - self.token_counts.remove(self.token_counts.export()[0]) - if self.output_mode == TF_IDF: - self.token_document_counts.remove(self.token_document_counts.export()[0]) - self.num_documents.assign(0) - - def call(self, inputs): - self._maybe_freeze_vocab_size() - - inputs = utils.ensure_tensor(inputs, dtype=self._key_dtype) - original_shape = inputs.shape - # Some ops will not handle scalar input, so uprank to rank 1. - if inputs.shape.rank == 0: - inputs = self._expand_dims(inputs, -1) - - if tf_utils.is_sparse(inputs): - lookups = tf.SparseTensor(inputs.indices, - self._lookup_dense(inputs.values), - inputs.dense_shape) - elif tf_utils.is_ragged(inputs): - lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs) - else: - lookups = self._lookup_dense(inputs) - - if self.output_mode == INT: - # If we received a scalar input, downrank back to a scalar. - if original_shape.rank == 0: - lookups = tf.squeeze(lookups, -1) - return lookups - - depth = ( - self.max_tokens if self.pad_to_max_tokens else self._frozen_vocab_size) - idf_weights = self.idf_weights_const if self.output_mode == TF_IDF else None - return utils.encode_categorical_inputs( - lookups, - output_mode=self.output_mode, - depth=depth, - dtype=self.compute_dtype, - sparse=self.sparse, - idf_weights=idf_weights) - - def _lookup_dense(self, inputs): - """Lookup table values for a dense Tensor, handling masking and OOV.""" - # When executing eagerly and tracing keras.Inputs, do not call lookup. This - # is critical for restoring SavedModel, which will first trace layer.call - # and then attempt to restore the table. We need the table to be - # uninitialized for the restore to work, but calling the table uninitialized - # would error. - if tf.executing_eagerly() and backend.is_keras_tensor(inputs): - lookups = tf.zeros_like(inputs, dtype=self._value_dtype) - else: - lookups = self.lookup_table.lookup(inputs) - - if self.mask_token is not None: - mask_locations = tf.equal(inputs, self._mask_key) - lookups = tf.where(mask_locations, self._mask_value, lookups) - - if self.invert: - return lookups - - lookup_checks = [] - - if self.num_oov_indices == 0: - # If we have zero oov indices, we need to check for oov inputs. - oov_indices = tf.where(tf.equal(lookups, -1)) - oov_inputs = tf.gather_nd(inputs, oov_indices) - msg = tf.strings.format( - "When `num_oov_indices=0` all inputs should be in vocabulary, " - "found OOV values {}, consider setting `num_oov_indices=1`.", - (oov_inputs,)) - assertion = tf.Assert(tf.equal(tf.size(oov_indices), 0), [msg]) - lookup_checks.append(assertion) - elif self.num_oov_indices > 1: - # If we have multiple oov indices, we need a further hashing step. 
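The hashing step named in the comment above (and implemented in the lines that follow) can be sketched standalone; the bucket values in the trailing comment are illustrative, since they depend on the fingerprint hash:

    import tensorflow as tf

    num_oov_indices = 2
    oov_start = 1  # assumes a mask token and output_mode="int"
    inputs = tf.constant(["michigan", "saturn"])
    # Deterministic, non-cryptographic hash into num_oov_indices buckets.
    buckets = tf.strings.to_hash_bucket_fast(
        inputs, num_buckets=num_oov_indices
    )
    print((buckets + oov_start).numpy())  # e.g. [2 1]
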
- if self._key_dtype.is_integer: - oov_indices = tf.math.floormod(inputs, self.num_oov_indices) - else: - oov_indices = tf.strings.to_hash_bucket_fast( - inputs, num_buckets=self.num_oov_indices) - oov_indices = oov_indices + self._oov_start_index() - oov_locations = tf.equal(lookups, self._default_value) - lookups = tf.where(oov_locations, oov_indices, lookups) - - with tf.control_dependencies(lookup_checks): - return tf.identity(lookups) - - def _uninitialized_lookup_table(self): - with tf.init_scope(): - initializer = NullInitializer(self._key_dtype, self._value_dtype) - return tf.lookup.StaticHashTable(initializer, self._default_value) - - def _lookup_table_from_tokens(self, tokens): - with tf.init_scope(): - token_start = self._token_start_index() - token_end = token_start + tf.size(tokens) - indices_dtype = self._key_dtype if self.invert else self._value_dtype - indices = tf.range(token_start, token_end, dtype=indices_dtype) - keys, values = (indices, tokens) if self.invert else (tokens, indices) - initializer = tf.lookup.KeyValueTensorInitializer(keys, values, - self._key_dtype, - self._value_dtype) - return tf.lookup.StaticHashTable(initializer, self._default_value) - - def _lookup_table_from_file(self, filename): - if self.invert: - key_index = tf.lookup.TextFileIndex.LINE_NUMBER - value_index = tf.lookup.TextFileIndex.WHOLE_LINE - else: - key_index = tf.lookup.TextFileIndex.WHOLE_LINE - value_index = tf.lookup.TextFileIndex.LINE_NUMBER - with tf.init_scope(): - initializer = tf.lookup.TextFileInitializer( - filename=filename, - key_dtype=self._key_dtype, - key_index=key_index, - value_dtype=self._value_dtype, - value_index=value_index, - value_index_offset=self._token_start_index()) - return tf.lookup.StaticHashTable(initializer, self._default_value) - - def _convert_to_ndarray(self, x): - return np.array(x) if isinstance(x, (list, tuple)) else x - - def _expand_dims(self, inputs, axis): - if tf_utils.is_sparse(inputs): - return tf.sparse.expand_dims(inputs, axis) - else: - return tf.expand_dims(inputs, axis) - - def _oov_start_index(self): - return 1 if self.mask_token is not None and self.output_mode == INT else 0 - - def _token_start_index(self): - return self._oov_start_index() + self.num_oov_indices - - def _maybe_freeze_vocab_size(self): - if self.output_mode == INT or self.pad_to_max_tokens: - return - with tf.init_scope(): - if not tf.executing_eagerly(): - raise RuntimeError( - "When using `output_mode={}` eager execution must be enabled." - .format(self.output_mode)) - new_vocab_size = self.vocabulary_size() - if new_vocab_size == self._token_start_index(): - raise RuntimeError( - "When using `output_mode={}` and `pad_to_max_tokens=False`, you " - "must set the layer's vocabulary before calling it. Either pass " - "a `vocabulary` argument to the layer, or call `adapt` with some " - "sample data.".format(self.output_mode)) - elif (self._frozen_vocab_size is not None and - new_vocab_size != self._frozen_vocab_size): - raise RuntimeError( - "When using `output_mode={}` and `pad_to_max_tokens=False`, the " - "vocabulary size cannot be changed after the layer is called. 
" - "Vocab size is {}, new vocab size is {}".format( - self.output_mode, self._frozen_vocab_size, new_vocab_size)) - self._frozen_vocab_size = new_vocab_size - - def _find_repeated_tokens(self, vocabulary): - """Return all repeated tokens in a vocabulary.""" - vocabulary_set = set(vocabulary) - if len(vocabulary) != len(vocabulary_set): - return [ - item for item, count in collections.Counter(vocabulary).items() - if count > 1 - ] - else: - return [] - - def _num_tokens(self, data): - """Count the number of tokens in a ragged, sparse or dense tensor.""" - if tf_utils.is_sparse(data): - flat_values = data.values - elif tf_utils.is_ragged(data): - flat_values = data.flat_values - else: - flat_values = tf.reshape(data, [-1]) - tokens, _, counts = tf.unique_with_counts(flat_values, out_idx=tf.int64) - return tokens, counts - - def _inverse_document_frequency(self, token_document_counts, num_documents): - """Computes the inverse-document-frequency (IDF) component of "tf_idf". - - Uses the default weighting scheme described in - https://en.wikipedia.org/wiki/Tf%E2%80%93idf. + This layer translates a set of arbitrary hashables into an integer output + via a table-based lookup, with optional out-of-vocabulary handling. This is + the basis layer for both IntegerLookup and StringLookup; it holds the common + logic but is not intended to be exported as part of the Keras API. Args: - token_document_counts: An array of the # of documents each token appears - in. - num_documents: An int representing the total number of documents - - Returns: - An array of "inverse document frequency" weights. + max_tokens: The maximum size of the vocabulary for this layer. If None, + there is no cap on the size of the vocabulary. Note that this size + includes the OOV and mask tokens. + num_oov_indices: The number of out-of-vocabulary tokens to use. If this + value is more than 1, OOV inputs are hashed to determine their OOV + value. If this value is 0, OOV inputs will cause an error when calling + the layer. + mask_token: A token that represents masked inputs. When `output_mode` is + `"int"`, the token is included in vocabulary and mapped to index 0. In + other output modes, the token will not appear in the vocabulary and + instances of the mask token in the input will be dropped. If set to + None, no mask term will be added. + oov_token: Only used when `invert` is True. The token to return for OOV + indices. + vocabulary: Optional. Either an array or a string path to a text file. If + passing an array, can pass a tuple, list, 1D numpy array, or 1D tensor + containing the vocbulary terms. If passing a file path, the file should + contain one line per term in the vocabulary. If this argument is set, + there is no need to `adapt` the layer. + vocabulary_dtype: The dtype of the vocabulary terms. For example, + `"int64"` or `"string"`. + idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list, + 1D numpy array, or 1D tensor or the same length as the vocabulary, + containing the floating point inverse document frequency weights, which + will be multiplied by per sample term counts for the final `tf_idf` + weight. If the `vocabulary` argument is set, and `output_mode` is + `"tf_idf"`, this argument must be supplied. + invert: Only valid when `output_mode` is `"int"`. If True, this layer will + map indices to vocabulary items instead of mapping vocabulary items to + indices. Defaults to `False`. + output_mode: Specification for the output of the layer. 
Values can be + `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` + configuring the layer as follows: + - `"int"`: Return the raw integer indices of the input tokens. + - `"one_hot"`: Encodes each individual element in the input into an + array the same size as the vocabulary, containing a 1 at the element + index. If the last dimension is size 1, will encode on that + dimension. If the last dimension is not size 1, will append a new + dimension for the encoded output. + - `"multi_hot"`: Encodes each sample in the input into a single array + the same size as the vocabulary, containing a 1 for each vocabulary + term present in the sample. Treats the last dimension as the sample + dimension, if input shape is (..., sample_length), output shape will + be (..., num_tokens). + - `"count"`: As `"multi_hot"`, but the int array contains a count of + the number of times the token at that index appeared in the sample. + - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to + find the value in each token slot. + Defaults to `"int"`. + pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`, + `"count"`, or `"tf_idf"`. If True, the output will have its feature axis + padded to `max_tokens` even if the number of unique tokens in the + vocabulary is less than max_tokens, resulting in a tensor of shape + [batch_size, max_tokens] regardless of vocabulary size. Defaults to + False. + sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, `"count"` + and `"tf-idf"` output modes. If True, returns a `SparseTensor` instead + of a dense `Tensor`. Defaults to `False`. """ - return tf.math.log(1 + num_documents / (1 + token_document_counts)) - - @property - def _trackable_saved_model_saver(self): - return layer_serialization.VocabularySavedModelSaver(self) - # Override points for IntegerLookup and StringLookup. - def _tensor_vocab_to_numpy(self, vocabulary): - """Converts a tensor vocabulary to a numpy vocabulary.""" - return vocabulary.numpy() + def __init__( + self, + max_tokens, + num_oov_indices, + mask_token, + oov_token, + vocabulary_dtype, + vocabulary=None, + idf_weights=None, + invert=False, + output_mode="int", + sparse=False, + pad_to_max_tokens=False, + **kwargs, + ): + # If max_tokens is set, the value must be greater than 1 - otherwise we + # are creating a 0-element vocab, which doesn't make sense. + if max_tokens is not None and max_tokens <= 1: + raise ValueError( + "If set, `max_tokens` must be greater than 1. " + f"Received: max_tokens={max_tokens}" + ) + + if pad_to_max_tokens and max_tokens is None: + raise ValueError( + "If pad_to_max_tokens is True, must set `max_tokens`. " + f"Received: max_tokens={max_tokens}" + ) + + if num_oov_indices < 0: + raise ValueError( + "`num_oov_indices` must be greater than or equal to 0. " + f"Received: num_oov_indices={num_oov_indices}" + ) + + # Support deprecated names for output_modes. + if output_mode == "binary": + output_mode = MULTI_HOT + if output_mode == "tf-idf": + output_mode = TF_IDF + # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF) + layer_utils.validate_string_arg( + output_mode, + allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF), + layer_name=self.__class__.__name__, + arg_name="output_mode", + ) + + if invert and output_mode != INT: + raise ValueError( + "`output_mode` must be `'int'` when `invert` is true. 
" + f"Received: output_mode={output_mode}" + ) + + if sparse and output_mode == INT: + raise ValueError( + "`sparse` may only be true if `output_mode` is " + "`'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. " + f"Received: sparse={sparse} and " + f"output_mode={output_mode}" + ) + + if idf_weights is not None and output_mode != TF_IDF: + raise ValueError( + "`idf_weights` should only be set if `output_mode` is " + f"`'tf_idf'`. Received: idf_weights={idf_weights} and " + f"output_mode={output_mode}" + ) + + self.invert = invert + self.max_tokens = max_tokens + self.num_oov_indices = num_oov_indices + self.mask_token = mask_token + self.oov_token = oov_token + self.output_mode = output_mode + self.sparse = sparse + self.pad_to_max_tokens = pad_to_max_tokens + self.vocabulary_dtype = vocabulary_dtype + self._frozen_vocab_size = kwargs.pop("vocabulary_size", None) + + self.input_vocabulary = vocabulary + self.input_idf_weights = idf_weights + # VocabularySavedModelSaver will clear the config vocabulary to restore + # the lookup table ops directly. We persist this hidden option to + # persist the fact that we have have a non-adaptable layer with a + # manually set vocab. + self._has_input_vocabulary = kwargs.pop( + "has_input_vocabulary", (vocabulary is not None) + ) + + # Drop deprecated config options. + kwargs.pop("has_static_table", None) + + # By default, output int64 when output_mode='int' and floats otherwise. + if "dtype" not in kwargs: + kwargs["dtype"] = ( + tf.int64 if output_mode == INT else backend.floatx() + ) + + super().__init__(**kwargs) + + # Check dtype only after base layer parses it; dtype parsing is complex. + if ( + output_mode == INT + and not tf.as_dtype(self.compute_dtype).is_integer + ): + input_dtype = kwargs["dtype"] + raise ValueError( + "When `output_mode='int'`, `dtype` should be an integer " + f"type. Received: dtype={input_dtype}" + ) + + if invert: + self._key_dtype = self.dtype if output_mode == INT else tf.int64 + self._value_dtype = tf.as_dtype(self.vocabulary_dtype) + mask_key = 0 + mask_value = mask_token + self._default_value = self.oov_token + else: + self._key_dtype = tf.as_dtype(self.vocabulary_dtype) + self._value_dtype = self.dtype if output_mode == INT else tf.int64 + mask_key = mask_token + # Masks should map to 0 for int output and be dropped otherwise. Max + # ints will be dropped from the bincount op. + mask_value = 0 if self.output_mode == INT else self._value_dtype.max + if self.num_oov_indices == 0: + # If there are no OOV indices, we map OOV tokens to -1 and error + # out during call if we find a negative index. + self._default_value = -1 + elif self.num_oov_indices == 1: + # If there is only one OOV index, we can set that index as the + # default value of the index_lookup table. + self._default_value = self._oov_start_index() + else: + # If we have multiple OOV values, we need to do a further + # hashing step; to make this easier, we set the OOV value to -1. + # (This lets us do a vectorized add and cast to boolean to + # determine locations where we need to do extra hashing.) 
+ self._default_value = -1 + if self.mask_token is not None: + self._mask_key = tf.convert_to_tensor(mask_key, self._key_dtype) + self._mask_value = tf.convert_to_tensor( + mask_value, self._value_dtype + ) + + if self.output_mode == TF_IDF: + self.idf_weights = tf.Variable( + [0] * self._token_start_index(), + shape=(None,), + dtype=self.compute_dtype, + trainable=False, + ) + self.idf_weights_const = self.idf_weights.value() + + if vocabulary is not None: + self.set_vocabulary(vocabulary, idf_weights) + else: + # When restoring from a keras SavedModel, the loading code will + # expect to find and restore a lookup_table attribute on the layer. + # This table needs to be uninitialized as a StaticHashTable cannot + # be initialized twice. + self.lookup_table = self._uninitialized_lookup_table() + + # Only set up adapt state if we did not receive a vocab on construction. + if not self._has_input_vocabulary: + # Add custom weight handler to return the layer's vocab as a weight. + self._add_trackable(VocabWeightHandler(self), False) + # Set adapt state. + self.token_counts = tf.lookup.experimental.MutableHashTable( + key_dtype=vocabulary_dtype, + value_dtype=tf.int64, + default_value=0, + ) + if self.output_mode == TF_IDF: + self.token_document_counts = ( + tf.lookup.experimental.MutableHashTable( + key_dtype=vocabulary_dtype, + value_dtype=tf.int64, + default_value=0, + ) + ) + self.num_documents = tf.Variable( + 0, dtype=tf.int64, trainable=False + ) + + def compute_output_shape(self, input_shape): + if self.output_mode == INT: + return input_shape + depth = ( + self.max_tokens + if self.pad_to_max_tokens + else self._frozen_vocab_size + ) + return tf.TensorShape([input_shape[0], depth]) + + def compute_output_signature(self, input_spec): + output_shape = self.compute_output_shape(input_spec.shape.as_list()) + output_dtype = ( + self.vocabulary_dtype if self.invert else self.compute_dtype + ) + return tf.TensorSpec(shape=output_shape, dtype=output_dtype) + + def get_vocabulary(self, include_special_tokens=True): + """Returns the current vocabulary of the layer. + + Args: + include_special_tokens: If True, the returned vocabulary will include + mask and OOV tokens, and a term's index in the vocabulary will equal + the term's index when calling the layer. If False, the returned + vocabulary will not include any mask or OOV tokens. + """ + # The lookup table data will not be sorted, so we will create a inverted + # lookup here, and use that to lookup a range of indices [0, + # vocab_size). + if self.lookup_table.size() == 0: + vocab, indices = [], [] + else: + keys, values = self.lookup_table.export() + vocab, indices = (values, keys) if self.invert else (keys, values) + vocab, indices = ( + self._tensor_vocab_to_numpy(vocab), + indices.numpy(), + ) + lookup = collections.defaultdict( + lambda: self.oov_token, zip(indices, vocab) + ) + vocab = [lookup[x] for x in range(self.vocabulary_size())] + if self.mask_token is not None and self.output_mode == INT: + vocab[0] = self.mask_token + if not include_special_tokens: + vocab = vocab[self._token_start_index() :] + return vocab + + def vocabulary_size(self): + """Gets the current size of the layer's vocabulary. + + Returns: + The integer size of the vocabulary, including optional mask and oov + indices. 
+ """ + if tf.executing_eagerly(): + return ( + int(self.lookup_table.size().numpy()) + + self._token_start_index() + ) + else: + return self.lookup_table.size() + self._token_start_index() + + def vocab_size(self): + logging.warning("vocab_size is deprecated, please use vocabulary_size.") + return self.vocabulary_size() + + def get_config(self): + config = { + "invert": self.invert, + "max_tokens": self.max_tokens, + "num_oov_indices": self.num_oov_indices, + "oov_token": self.oov_token, + "mask_token": self.mask_token, + "output_mode": self.output_mode, + "sparse": self.sparse, + "pad_to_max_tokens": self.pad_to_max_tokens, + "vocabulary_dtype": self.vocabulary_dtype, + "idf_weights": utils.listify_tensors(self.input_idf_weights), + "vocabulary": utils.listify_tensors(self.input_vocabulary), + "vocabulary_size": self._frozen_vocab_size, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _record_vocabulary_size(self): + self._ensure_vocab_size_unchanged() + with tf.init_scope(): + self._frozen_vocab_size = self.vocabulary_size() + + def set_vocabulary(self, vocabulary, idf_weights=None): + """Sets vocabulary (and optionally document frequency) for this layer. + + This method sets the vocabulary and idf weights for this layer directly, + instead of analyzing a dataset through `adapt`. It should be used + whenever the vocab (and optionally document frequency) information is + already known. If vocabulary data is already present in the layer, this + method will replace it. + + Args: + vocabulary: Either an array or a string path to a text file. If + passing an array, can pass a tuple, list, 1D numpy array, or 1D + tensor containing the vocbulary terms. If passing a file path, the + file should contain one line per term in the vocabulary. + idf_weights: A tuple, list, 1D numpy array, or 1D tensor of inverse + document frequency weights with equal length to vocabulary. Must be + set if `output_mode` is `"tf_idf"`. Should not be set otherwise. + + Raises: + ValueError: If there are too many inputs, the inputs do not match, or + input data is missing. + RuntimeError: If the vocabulary cannot be set when this function is + called. This happens when `"multi_hot"`, `"count"`, and `"tf_idf"` + modes, if `pad_to_max_tokens` is False and the layer itself has + already been called. + RuntimeError: If a tensor vocabulary is passed outside of eager + execution. + """ + if self.output_mode == TF_IDF: + if idf_weights is None: + raise ValueError( + "`idf_weights` must be set if output_mode is TF_IDF" + ) + elif idf_weights is not None: + raise ValueError( + "`idf_weights` should only be set if output_mode is " + f"`'tf_idf'`. Received: output_mode={self.output_mode} " + f"and idf_weights={idf_weights}" + ) + + if isinstance(vocabulary, str): + if not tf.io.gfile.exists(vocabulary): + raise ValueError( + f"Vocabulary file {vocabulary} does not exist." + ) + if self.output_mode == TF_IDF: + raise ValueError( + "output_mode `'tf_idf'` does not support loading a " + "vocabulary from file." + ) + self.lookup_table = self._lookup_table_from_file(vocabulary) + self._record_vocabulary_size() + return + + if not tf.executing_eagerly() and ( + tf.is_tensor(vocabulary) or tf.is_tensor(idf_weights) + ): + raise RuntimeError( + "Cannot set a tensor vocabulary on {} layer {} when not " + "executing eagerly. 
Create this layer or call `set_vocabulary` " + "outside of any `tf.function`s and with eager execution " + "enabled.".format(self.__class__.__name__, self.name) + ) + + # TODO(mattdangerw): for better performance we should rewrite this + # entire function to operate on tensors and convert vocabulary to a + # tensor here. + if tf.is_tensor(vocabulary): + vocabulary = self._tensor_vocab_to_numpy(vocabulary) + elif isinstance(vocabulary, (list, tuple)): + vocabulary = np.array(vocabulary) + if tf.is_tensor(idf_weights): + idf_weights = idf_weights.numpy() + elif isinstance(idf_weights, (list, tuple)): + idf_weights = np.array(idf_weights) + + if vocabulary.size == 0: + raise ValueError( + f"Cannot set an empty vocabulary, you passed {vocabulary}." + ) + + oov_start = self._oov_start_index() + token_start = self._token_start_index() + special_tokens = [self.mask_token] * oov_start + [ + self.oov_token + ] * self.num_oov_indices + found_special_tokens = np.array_equal( + special_tokens, vocabulary[:token_start] + ) + if found_special_tokens: + tokens = vocabulary[token_start:] + else: + tokens = vocabulary + + repeated_tokens = self._find_repeated_tokens(tokens) + if repeated_tokens: + raise ValueError( + "The passed vocabulary has at least one repeated " + "term. Please uniquify your dataset. The repeated terms " + "are {}".format(repeated_tokens) + ) + + if self.mask_token is not None and self.mask_token in tokens: + mask_index = np.argwhere(vocabulary == self.mask_token)[-1] + raise ValueError( + "Found reserved mask token at unexpected location in " + "`vocabulary`. Note that passed `vocabulary` does not need to " + "include the OOV and mask tokens. Either remove all mask and " + "OOV tokens, or include them only at the start of the " + f"vocabulary in precisely this order: {special_tokens}. " + f"Received: mask_token={self.mask_token} at " + f"vocabulary index {mask_index}" + ) + # Only error out for oov_token when invert=True. When invert=False, + # oov_token is unused during lookup. + if ( + self.oov_token is not None + and self.invert + and self.oov_token in tokens + ): + oov_index = np.argwhere(vocabulary == self.oov_token)[-1] + raise ValueError( + "Found reserved OOV token at unexpected location in " + "`vocabulary`. Note that passed `vocabulary` does not need to " + "include the OOV and mask tokens. Either remove all mask and " + "OOV tokens, or include them only at the start of the " + f"vocabulary in precisely this order: {special_tokens}. " + f"Received: oov_token={self.oov_token} at " + f"vocabulary index {oov_index}" + ) + + new_vocab_size = token_start + len(tokens) + if self.max_tokens is not None and (new_vocab_size > self.max_tokens): + raise ValueError( + "Attempted to set a vocabulary larger than the maximum vocab " + "size. Passed vocab size is {}, max vocab size is {}.".format( + new_vocab_size, self.max_tokens + ) + ) + self.lookup_table = self._lookup_table_from_tokens(tokens) + self._record_vocabulary_size() + + if self.output_mode == TF_IDF and idf_weights is not False: + if len(vocabulary) != len(idf_weights): + raise ValueError( + "`idf_weights` must be the same length as vocabulary. 
" + "len(idf_weights) is {}, len(vocabulary) is {}".format( + len(vocabulary), len(idf_weights) + ) + ) + idf_weights = self._convert_to_ndarray(idf_weights) + if idf_weights.ndim != 1: + raise ValueError( + "TF-IDF data must be a 1-index array, " + "but received {}".format(type(idf_weights)) + ) + + # If the passed vocabulary has no special tokens, we need to pad the + # front of idf_weights. We don't have real document frequencies for + # these tokens so we will use an average of all idf_weights passed + # in as a reasonable default. + if found_special_tokens: + front_padding = 0 + front_padding_value = 0 + else: + front_padding = token_start + front_padding_value = np.average(idf_weights) + # If pad_to_max_tokens is true, and max_tokens is greater than our + # total vocab size, we need to pad the back of idf_weights with + # zeros as well. + back_padding_value = 0 + if self.pad_to_max_tokens and self.max_tokens is not None: + back_padding = ( + self.max_tokens - front_padding - len(idf_weights) + ) + else: + back_padding = 0 + weights = np.pad( + idf_weights, + (front_padding, back_padding), + "constant", + constant_values=(front_padding_value, back_padding_value), + ) + weights = tf.convert_to_tensor(weights, dtype=self.compute_dtype) + self.idf_weights.assign(weights) + self.idf_weights_const = self.idf_weights.value() + + def update_state(self, data): + if self._has_input_vocabulary: + raise ValueError( + "Cannot adapt {} layer after setting a static vocabulary via " + "init argument " + "or `set_vocabulary`.".format(self.__class__.__name__) + ) + + data = utils.ensure_tensor(data, dtype=self.vocabulary_dtype) + if data.shape.rank == 0: + data = tf.expand_dims(data, 0) + if data.shape.rank == 1: + # Expand dims on axis 0 for tf-idf. A 1-d tensor is a single + # document. + data = tf.expand_dims(data, 0) + + tokens, counts = self._num_tokens(data) + self.token_counts.insert( + tokens, counts + self.token_counts.lookup(tokens) + ) + + if self.output_mode == TF_IDF: + # Dedupe each row of our dataset. + deduped_doc_data = tf.map_fn(lambda x: tf.unique(x)[0], data) + # Flatten and count tokens. + tokens, doc_counts = self._num_tokens(deduped_doc_data) + self.token_document_counts.insert( + tokens, doc_counts + self.token_document_counts.lookup(tokens) + ) + if tf_utils.is_ragged(data): + self.num_documents.assign_add(data.nrows()) + else: + self.num_documents.assign_add( + tf.shape(data, out_type=tf.int64)[0] + ) + + def finalize_state(self): + if self._has_input_vocabulary or tf.equal(self.token_counts.size(), 0): + # Finalize idf_weights to a const for call even if we don't need to + # compute a new vocabulary. + if self.output_mode == TF_IDF: + self.idf_weights_const = self.idf_weights.value() + self._record_vocabulary_size() + return + + # Remove special tokens from our counts. + if self.mask_token is not None: + self.token_counts.remove( + tf.convert_to_tensor([self.mask_token], self.vocabulary_dtype) + ) + if self.oov_token is not None: + self.token_counts.remove( + tf.convert_to_tensor([self.oov_token], self.vocabulary_dtype) + ) + + tokens, counts = self.token_counts.export() + # To keep vocabs deterministic, we sort our tokens by count and break + # ties by sorting the tokens themselves. Tensorflow has no ops for + # sorting strings, so we need to use numpy for the sort. 
+ sorted_indices = np.lexsort((tokens.numpy(), counts.numpy()))[::-1] + token_start = self._token_start_index() + if self.max_tokens: + max_learned_tokens = self.max_tokens - token_start + sorted_indices = sorted_indices[:max_learned_tokens] + tokens = tf.gather(tokens, sorted_indices) + self.lookup_table = self._lookup_table_from_tokens(tokens) + + if self.output_mode == TF_IDF: + token_document_counts = self.token_document_counts.lookup(tokens) + idf_weights = self._inverse_document_frequency( + token_document_counts, self.num_documents + ) + idf_weights = tf.cast(idf_weights, self.compute_dtype) + # Pad the front of idf_weights with the average idf weight for OOV + # tokens. We cannot compute the real idf weight of OOV in a single + # pass. + idf_weights = tf.pad( + idf_weights, + [[self._token_start_index(), 0]], + constant_values=tf.reduce_mean(idf_weights), + ) + if self.pad_to_max_tokens and self.max_tokens is not None: + # Pad the back of idf_weights with zeros. + idf_weights = tf.pad( + idf_weights, + [[0, self.max_tokens - tf.size(idf_weights)]], + constant_values=0, + ) + self.idf_weights.assign(idf_weights) + self.idf_weights_const = self.idf_weights.value() + + # We call this here to save memory, now that we've built our vocabulary, + # we don't want to keep every token we've seen in separate lookup + # tables. + self.reset_state() + self._record_vocabulary_size() + + def reset_state(self): + if self._has_input_vocabulary: + return + + self.token_counts.remove(self.token_counts.export()[0]) + if self.output_mode == TF_IDF: + self.token_document_counts.remove( + self.token_document_counts.export()[0] + ) + self.num_documents.assign(0) + + def call(self, inputs): + self._ensure_known_vocab_size() + + inputs = utils.ensure_tensor(inputs, dtype=self._key_dtype) + original_shape = inputs.shape + # Some ops will not handle scalar input, so uprank to rank 1. + if inputs.shape.rank == 0: + inputs = self._expand_dims(inputs, -1) + + if tf_utils.is_sparse(inputs): + lookups = tf.SparseTensor( + inputs.indices, + self._lookup_dense(inputs.values), + inputs.dense_shape, + ) + elif tf_utils.is_ragged(inputs): + lookups = tf.ragged.map_flat_values(self._lookup_dense, inputs) + else: + lookups = self._lookup_dense(inputs) + + if self.output_mode == INT: + # If we received a scalar input, downrank back to a scalar. + if original_shape.rank == 0: + lookups = tf.squeeze(lookups, -1) + return lookups + + depth = ( + self.max_tokens + if self.pad_to_max_tokens + else self._frozen_vocab_size + ) + idf_weights = ( + self.idf_weights_const if self.output_mode == TF_IDF else None + ) + return utils.encode_categorical_inputs( + lookups, + output_mode=self.output_mode, + depth=depth, + dtype=self.compute_dtype, + sparse=self.sparse, + idf_weights=idf_weights, + ) + + def _lookup_dense(self, inputs): + """Lookup table values for a dense Tensor, handling masking and OOV.""" + # When executing eagerly and tracing keras.Input objects, + # do not call lookup. + # This is critical for restoring SavedModel, which will first trace + # layer.call and then attempt to restore the table. We need the table to + # be uninitialized for the restore to work, but calling the table + # uninitialized would error. 
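The ragged branch of `call` above relies on tf.ragged.map_flat_values, which applies the dense lookup to the flat values and reuses the row partitions; a self-contained sketch with an invented two-token table:

    import tensorflow as tf

    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            tf.constant(["a", "b"]), tf.constant([2, 3], tf.int64)
        ),
        default_value=1,
    )
    ragged = tf.ragged.constant([["a", "b", "a"], ["b"]])
    print(tf.ragged.map_flat_values(table.lookup, ragged))
    # <tf.RaggedTensor [[2, 3, 2], [3]]>
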
+ if tf.executing_eagerly() and backend.is_keras_tensor(inputs): + lookups = tf.zeros_like(inputs, dtype=self._value_dtype) + else: + lookups = self.lookup_table.lookup(inputs) + + if self.mask_token is not None: + mask_locations = tf.equal(inputs, self._mask_key) + lookups = tf.where(mask_locations, self._mask_value, lookups) + + if self.invert: + return lookups + + lookup_checks = [] + + if self.num_oov_indices == 0: + # If we have zero oov indices, we need to check for oov inputs. + oov_indices = tf.where(tf.equal(lookups, -1)) + oov_inputs = tf.gather_nd(inputs, oov_indices) + msg = tf.strings.format( + "When `num_oov_indices=0` all inputs should be in vocabulary, " + "found OOV values {}, consider setting `num_oov_indices=1`.", + (oov_inputs,), + ) + assertion = tf.Assert(tf.equal(tf.size(oov_indices), 0), [msg]) + lookup_checks.append(assertion) + elif self.num_oov_indices > 1: + # If we have multiple oov indices, we need a further hashing step. + if self._key_dtype.is_integer: + oov_indices = tf.math.floormod(inputs, self.num_oov_indices) + else: + oov_indices = tf.strings.to_hash_bucket_fast( + inputs, num_buckets=self.num_oov_indices + ) + oov_indices = oov_indices + self._oov_start_index() + oov_locations = tf.equal(lookups, self._default_value) + lookups = tf.where(oov_locations, oov_indices, lookups) + + with tf.control_dependencies(lookup_checks): + return tf.identity(lookups) + + def save_own_variables(self, store): + if self.output_mode == TF_IDF: + store["idf_weights"] = self.idf_weights_const.numpy() + + def load_own_variables(self, store): + if self.output_mode == TF_IDF: + self.idf_weights.assign(store["idf_weights"]) + self.idf_weights_const = self.idf_weights.value() + + def save_assets(self, dir_path): + if self.input_vocabulary: + # Vocab saved in config. + # TODO: consider unifying both paths. + return + vocabulary = self.get_vocabulary(include_special_tokens=True) + vocabulary_filepath = tf.io.gfile.join(dir_path, "vocabulary.txt") + with open(vocabulary_filepath, "w") as f: + f.write("\n".join([str(w) for w in vocabulary])) + + def load_assets(self, dir_path): + if self.input_vocabulary: + # Vocab saved in config. + # TODO: consider unifying both paths. + return + vocabulary_filepath = tf.io.gfile.join(dir_path, "vocabulary.txt") + # TODO: fix bug with include_special_tokens and set reload from file. 
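The asset format written above is one term per line. Note the asymmetry: `save_assets` includes the special tokens because `load_assets` reloads through `set_vocabulary`, whereas a file passed directly as the `vocabulary` constructor argument should contain only real tokens, since `_lookup_table_from_file` below offsets line numbers by the special-token count. A hypothetical sketch of the constructor path (path and tokens invented):

    import tensorflow as tf

    vocab_path = "/tmp/vocab.txt"  # hypothetical location
    with open(vocab_path, "w") as f:
        f.write("earth\nwind")  # real tokens only, one per line

    layer = tf.keras.layers.StringLookup(
        mask_token="", oov_token="[OOV]", vocabulary=vocab_path
    )
    print(layer(tf.constant(["wind", "mars"])).numpy())  # [3 1]
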
+ with open(vocabulary_filepath, "r") as f: + lines = f.read().split("\n") + if tf.as_dtype(self.vocabulary_dtype) == tf.string: + values = [str(line) for line in lines] + else: + values = [int(line) for line in lines] + if self.output_mode == TF_IDF: + self.set_vocabulary(values, idf_weights=False) + else: + self.set_vocabulary(values) + + def _uninitialized_lookup_table(self): + with tf.init_scope(): + initializer = NullInitializer(self._key_dtype, self._value_dtype) + return tf.lookup.StaticHashTable(initializer, self._default_value) + + def _lookup_table_from_tokens(self, tokens): + with tf.init_scope(): + token_start = self._token_start_index() + token_end = token_start + tf.size(tokens) + indices_dtype = ( + self._key_dtype if self.invert else self._value_dtype + ) + indices = tf.range(token_start, token_end, dtype=indices_dtype) + keys, values = ( + (indices, tokens) if self.invert else (tokens, indices) + ) + initializer = tf.lookup.KeyValueTensorInitializer( + keys, values, self._key_dtype, self._value_dtype + ) + return tf.lookup.StaticHashTable(initializer, self._default_value) + + def _lookup_table_from_file(self, filename): + if self.invert: + key_index = tf.lookup.TextFileIndex.LINE_NUMBER + value_index = tf.lookup.TextFileIndex.WHOLE_LINE + else: + key_index = tf.lookup.TextFileIndex.WHOLE_LINE + value_index = tf.lookup.TextFileIndex.LINE_NUMBER + with tf.init_scope(): + initializer = tf.lookup.TextFileInitializer( + filename=filename, + key_dtype=self._key_dtype, + key_index=key_index, + value_dtype=self._value_dtype, + value_index=value_index, + value_index_offset=self._token_start_index(), + ) + return tf.lookup.StaticHashTable(initializer, self._default_value) + + def _convert_to_ndarray(self, x): + return np.array(x) if isinstance(x, (list, tuple)) else x + + def _expand_dims(self, inputs, axis): + if tf_utils.is_sparse(inputs): + return tf.sparse.expand_dims(inputs, axis) + else: + return tf.expand_dims(inputs, axis) + + def _oov_start_index(self): + return ( + 1 if self.mask_token is not None and self.output_mode == INT else 0 + ) + + def _token_start_index(self): + return self._oov_start_index() + self.num_oov_indices + + def _ensure_known_vocab_size(self): + if self.output_mode == INT or self.pad_to_max_tokens: + return + if self._frozen_vocab_size is None: + raise RuntimeError( + f"When using `output_mode={self.output_mode}` " + "and `pad_to_max_tokens=False`, " + "you must set the layer's vocabulary before calling it. Either " + "pass a `vocabulary` argument to the layer, or call `adapt` " + "with some sample data.".format(self.output_mode) + ) + + def _ensure_vocab_size_unchanged(self): + if self.output_mode == INT or self.pad_to_max_tokens: + return + + with tf.init_scope(): + new_vocab_size = self.vocabulary_size() + + if ( + self._frozen_vocab_size is not None + and new_vocab_size != self._frozen_vocab_size + ): + raise RuntimeError( + f"When using `output_mode={self.output_mode}` " + "and `pad_to_max_tokens=False`, " + "the vocabulary size cannot be changed after the layer is " + f"called. 
Old vocab size is {self._frozen_vocab_size}, " + f"new vocab size is {new_vocab_size}" + ) + + def _find_repeated_tokens(self, vocabulary): + """Return all repeated tokens in a vocabulary.""" + vocabulary_set = set(vocabulary) + if len(vocabulary) != len(vocabulary_set): + return [ + item + for item, count in collections.Counter(vocabulary).items() + if count > 1 + ] + else: + return [] + + def _num_tokens(self, data): + """Count the number of tokens in a ragged, sparse or dense tensor.""" + if tf_utils.is_sparse(data): + flat_values = data.values + elif tf_utils.is_ragged(data): + flat_values = data.flat_values + else: + flat_values = tf.reshape(data, [-1]) + tokens, _, counts = tf.unique_with_counts(flat_values, out_idx=tf.int64) + return tokens, counts + + def _inverse_document_frequency(self, token_document_counts, num_documents): + """Computes the inverse-document-frequency (IDF) component of "tf_idf". + + Uses the default weighting scheme described in + https://en.wikipedia.org/wiki/Tf%E2%80%93idf. + + Args: + token_document_counts: An array of the # of documents each token + appears in. + num_documents: An int representing the total number of documents + + Returns: + An array of "inverse document frequency" weights. + """ + return tf.math.log(1 + num_documents / (1 + token_document_counts)) + + @property + def _trackable_saved_model_saver(self): + return layer_serialization.VocabularySavedModelSaver(self) + + # Override points for IntegerLookup and StringLookup. + def _tensor_vocab_to_numpy(self, vocabulary): + """Converts a tensor vocabulary to a numpy vocabulary.""" + return vocabulary.numpy() diff --git a/keras/layers/preprocessing/index_lookup_distribution_test.py b/keras/layers/preprocessing/index_lookup_distribution_test.py index a7942b3dcc6e..eb9790b75734 100644 --- a/keras/layers/preprocessing/index_lookup_distribution_test.py +++ b/keras/layers/preprocessing/index_lookup_distribution_test.py @@ -15,9 +15,11 @@ """Distribution tests for keras.layers.preprocessing.index_lookup.""" - import os +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras import backend from keras.distribute import strategy_combinations @@ -25,128 +27,174 @@ from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies + - strategy_combinations.parameter_server_strategies_single_worker + - strategy_combinations.parameter_server_strategies_multi_worker, - mode=["eager"])) + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies + + strategy_combinations.parameter_server_strategies_single_worker + + strategy_combinations.parameter_server_strategies_multi_worker, + mode=["eager"], + ) +) class IndexLookupDistributionTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: 
- writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - def test_strategy(self, strategy): - if (backend.is_tpu_strategy(strategy) and - not tf_test_utils.is_mlir_bridge_enabled()): - self.skipTest("TPU tests require MLIR bridge") - - vocab_data = [[ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ]] - vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( - 2, drop_remainder=True) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - tf.config.set_soft_device_placement(True) - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.adapt(vocab_dataset) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - model.compile(loss="mse") - output_dataset = model.predict(input_dataset) - self.assertAllEqual(expected_output, output_dataset) - - def test_strategy_with_file(self, strategy): - if (backend.is_tpu_strategy(strategy) and - not tf_test_utils.is_mlir_bridge_enabled()): - self.skipTest("TPU tests require MLIR bridge") - - vocab_data = ["earth", "wind", "and", "fire"] - vocab_file = self._write_to_temp_file("temp", vocab_data) - - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( - 2, drop_remainder=True) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - tf.config.set_soft_device_placement(True) - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - model.compile(loss="mse") - output_dataset = model.predict(input_dataset) - self.assertAllEqual(expected_output, output_dataset) - - def test_tpu_with_multiple_oov(self, strategy): - # TODO(b/180614455): remove this check when MLIR bridge is always enabled. 
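Stripped of the distribution strategy, the adapt flow these tests exercise reduces to a few lines; the vocabulary contents are invented, and the resulting order follows the frequency-then-token sort described earlier:

    import tensorflow as tf

    layer = tf.keras.layers.StringLookup(mask_token="", oov_token="[OOV]")
    dataset = tf.data.Dataset.from_tensors([["earth", "earth", "wind", "and"]])
    layer.adapt(dataset)
    print(layer.get_vocabulary())
    # ['', '[OOV]', 'earth', 'wind', 'and']
    print(layer(tf.constant(["wind", "michigan"])).numpy())  # [3 1]
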
- if backend.is_tpu_strategy(strategy): - self.skipTest("This test needs MLIR bridge on TPU.") - - vocab_data = [[ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ]] - vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( - 2, drop_remainder=True) - expected_output = [[3, 4, 5, 6], [6, 5, 3, 1]] - - tf.config.set_soft_device_placement(True) - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=2, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.adapt(vocab_dataset) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_dataset) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_strategy(self, strategy): + if ( + backend.is_tpu_strategy(strategy) + and not tf_test_utils.is_mlir_bridge_enabled() + ): + self.skipTest("TPU tests require MLIR bridge") + + vocab_data = [ + [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + "and", + "and", + "fire", + ] + ] + vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( + 2, drop_remainder=True + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + tf.config.set_soft_device_placement(True) + + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.adapt(vocab_dataset) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + model.compile(loss="mse") + output_dataset = model.predict(input_dataset) + self.assertAllEqual(expected_output, output_dataset) + + def test_strategy_with_file(self, strategy): + if ( + backend.is_tpu_strategy(strategy) + and not tf_test_utils.is_mlir_bridge_enabled() + ): + self.skipTest("TPU tests require MLIR bridge") + + vocab_data = ["earth", "wind", "and", "fire"] + vocab_file = self._write_to_temp_file("temp", vocab_data) + + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( + 2, drop_remainder=True + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + tf.config.set_soft_device_placement(True) + + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + model.compile(loss="mse") + 
output_dataset = model.predict(input_dataset) + self.assertAllEqual(expected_output, output_dataset) + + def test_tpu_with_multiple_oov(self, strategy): + # TODO(b/180614455): remove this check when MLIR bridge is always + # enabled. + if backend.is_tpu_strategy(strategy): + self.skipTest("This test needs MLIR bridge on TPU.") + + vocab_data = [ + [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + "and", + "and", + "fire", + ] + ] + vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( + 2, drop_remainder=True + ) + expected_output = [[3, 4, 5, 6], [6, 5, 3, 1]] + + tf.config.set_soft_device_placement(True) + + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.adapt(vocab_dataset) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_dataset) + self.assertAllEqual(expected_output, output_dataset) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/layers/preprocessing/index_lookup_test.py b/keras/layers/preprocessing/index_lookup_test.py index 9b3ef9687d36..ca488eb4c54e 100644 --- a/keras/layers/preprocessing/index_lookup_test.py +++ b/keras/layers/preprocessing/index_lookup_test.py @@ -20,6 +20,8 @@ import random import string +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized import keras @@ -27,2201 +29,988 @@ from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.utils.generic_utils import CustomObjectScope -import numpy as np -import tensorflow.compat.v2 as tf +from keras.utils import CustomObjectScope def zip_and_sort(weight_values): - keys, values = weight_values - return sorted(zip(keys, values), key=lambda x: x[1]) + keys, values = weight_values + return sorted(zip(keys, values), key=lambda x: x[1]) def _get_end_to_end_test_cases(): - test_cases = ( - { - "testcase_name": - "test_strings_soft_vocab_cap", - # Create an array where 'earth' is the most frequent term, followed by - # 'wind', then 'and', then 'fire'. This ensures that the vocab - # accumulator is sorting by frequency. - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": None, - "num_oov_indices": 1, - "mask_token": "", - "oov_token": "[OOV]", - "vocabulary_dtype": tf.string, - }, - "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], - "input_dtype": - tf.string - }, - { - "testcase_name": - "test_inverse_strings_soft_vocab_cap", - # Create an array where 'earth' is the most frequent term, followed by - # 'wind', then 'and', then 'fire'. This ensures that the vocab - # accumulator is sorting by frequency. 
- "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([[2], [3], [4], [1], [1], [4], [2], [5]]), - "kwargs": { - "max_tokens": None, - "num_oov_indices": 1, - "mask_token": "", - "oov_token": "[OOV]", - "vocabulary_dtype": tf.string, - "invert": True - }, - "expected_output": - np.array([[b"earth"], [b"wind"], [b"and"], [b"[OOV]"], [b"[OOV]"], - [b"and"], [b"earth"], [b"fire"]]), - "input_dtype": - tf.int64 - }, - { - "testcase_name": - "test_strings_with_special_tokens", - # Mask and oov values in the vocab data should be dropped, and mapped - # to 0 and 1 respectively when calling the layer. - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - [""], [""], [""], ["[OOV]"], ["[OOV]"], ["[OOV]"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], [""], ["wind"], ["[OOV]"], ["and"], [""], - ["fire"], ["and"], ["[OOV]"], ["michigan"]]), - "kwargs": { - "max_tokens": None, - "num_oov_indices": 1, - "mask_token": "", - "oov_token": "[OOV]", - "vocabulary_dtype": tf.string, - }, - "expected_output": [[2], [0], [3], [1], [4], [0], [5], [4], [1], [1]], - "input_dtype": - tf.string - }, - { - "testcase_name": - "test_ints_soft_vocab_cap", - # Create an array where 1138 is the most frequent term, followed by - # 1729, then 725, then 42. This ensures that the vocab accumulator - # is sorting by frequency. - "vocab_data": - np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], - [1729], [725], [725]], - dtype=np.int64), - "input_data": - np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], - dtype=np.int64), - "kwargs": { - "max_tokens": None, - "num_oov_indices": 1, - "mask_token": 0, - "oov_token": -1, - "vocabulary_dtype": tf.int64, - }, - "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], - "input_dtype": - tf.int64 - }, - { - "testcase_name": - "test_ints_with_special_tokens", - # Mask and oov values in the vocab data should be dropped, and mapped - # to 0 and 1 respectively when calling the layer. - "vocab_data": - np.array([[42], [1138], [1138], [1138], [1138], [0], [0], [0], - [-1], [-1], [-1], [1729], [1729], [1729], [725], [725]], - dtype=np.int64), - "input_data": - np.array([[1138], [0], [1729], [-1], [725], [0], [42], [725], - [-1], [4]], - dtype=np.int64), - "kwargs": { - "max_tokens": None, - "num_oov_indices": 1, - "mask_token": 0, - "oov_token": -1, - "vocabulary_dtype": tf.int64, - }, - "expected_output": [[2], [0], [3], [1], [4], [0], [5], [4], [1], [1]], - "input_dtype": - tf.int64 - }, - { - "testcase_name": - "test_strings_hard_vocab_cap", - # Create an array where 'earth' is the most frequent term, followed by - # 'wind', then 'and', then 'fire'. This ensures that the vocab - # accumulator is sorting by frequency. - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": 5, - "num_oov_indices": 1, - "mask_token": "", - "oov_token": "[OOV]", - "vocabulary_dtype": tf.string, - }, - "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]], - "input_dtype": - tf.string - }, - { - "testcase_name": - "test_inverse_strings_hard_vocab_cap", - # Create an array where 'earth' is the most frequent term, followed by - # 'wind', then 'and', then 'fire'. 
This ensures that the vocab - # accumulator is sorting by frequency. - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([[2], [3], [4], [1], [1], [4], [2], [5]]), - "kwargs": { - "max_tokens": 5, - "num_oov_indices": 1, - "mask_token": "", - "oov_token": "[OOV]", - "vocabulary_dtype": tf.string, - "invert": True - }, - "expected_output": - np.array([[b"earth"], [b"wind"], [b"and"], [b"[OOV]"], [b"[OOV]"], - [b"and"], [b"earth"], [b"[OOV]"]]), - "input_dtype": - tf.int64 - }, - { - "testcase_name": - "test_ints_hard_vocab_cap", - # Create an array where 1138 is the most frequent term, followed by - # 1729, then 725, then 42. This ensures that the vocab accumulator - # is sorting by frequency. - "vocab_data": - np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], - [1729], [725], [725]], - dtype=np.int64), - "input_data": - np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], - dtype=np.int64), - "kwargs": { - "max_tokens": 5, - "num_oov_indices": 1, - "mask_token": 0, - "oov_token": -1, - "vocabulary_dtype": tf.int64, - }, - "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]], - "input_dtype": - tf.int64 - }, - { - "testcase_name": - "test_ints_tf_idf_output", - "vocab_data": - np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], - [1729], [725], [725]]), - "input_data": - np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]]), - "kwargs": { - "max_tokens": 5, - "pad_to_max_tokens": True, - "num_oov_indices": 1, - "mask_token": 0, - "oov_token": -1, - "output_mode": index_lookup.TF_IDF, - "vocabulary_dtype": tf.int64, - }, - "expected_output": [[0, 1.098612, 0, 0, 0], [0, 0, 1.252763, 0, 0], - [0, 0, 0, 1.466337, 0], [0, 0, 0, 0, 1.7917595], - [0, 0, 0, 0, 1.7917595], [0, 0, 0, 1.4663371, 0], - [0, 1.098612, 0, 0, 0], [1.402368, 0, 0, 0, 0]], - "input_dtype": - tf.int64 - }, - { - "testcase_name": - "test_strings_tf_idf_output", - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": 5, - "pad_to_max_tokens": True, - "num_oov_indices": 1, - "mask_token": "", - "oov_token": "[OOV]", - "output_mode": index_lookup.TF_IDF, - "vocabulary_dtype": tf.string, - }, - "expected_output": [[0, 1.098612, 0, 0, 0], [0, 0, 1.252763, 0, 0], - [0, 0, 0, 1.466337, 0], [0, 0, 0, 0, 1.7917595], - [0, 0, 0, 0, 1.7917595], [0, 0, 0, 1.4663371, 0], - [0, 1.098612, 0, 0, 0], [1.402368, 0, 0, 0, 0]], - "input_dtype": - tf.string - }, - ) - - crossed_test_cases = [] - # Cross above test cases with use_dataset in (True, False) - for use_dataset in (True, False): - for case in test_cases: - case = case.copy() - if use_dataset: - case["testcase_name"] = case["testcase_name"] + "_with_dataset" - case["use_dataset"] = use_dataset - crossed_test_cases.append(case) - - return crossed_test_cases + test_cases = ( + { + "testcase_name": "test_strings_soft_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed + # by 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. 
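# A minimal sketch (editorial annotation, not a line of this patch) of the
# index assignment the frequency-sorting comment above implies once adapt()
# has run on this data:
#   0 -> ""       (mask_token)
#   1 -> "[OOV]"  (oov_token)
#   2 -> "earth"  (4 occurrences)
#   3 -> "wind"   (3)
#   4 -> "and"    (2)
#   5 -> "fire"   (1)
# This is why expected_output below maps "earth" -> 2 and the unseen token
# "michigan" -> 1.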
+ "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": None, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "vocabulary_dtype": tf.string, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": tf.string, + }, + { + "testcase_name": "test_inverse_strings_soft_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed + # by 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array([[2], [3], [4], [1], [1], [4], [2], [5]]), + "kwargs": { + "max_tokens": None, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "vocabulary_dtype": tf.string, + "invert": True, + }, + "expected_output": np.array( + [ + [b"earth"], + [b"wind"], + [b"and"], + [b"[OOV]"], + [b"[OOV]"], + [b"and"], + [b"earth"], + [b"fire"], + ] + ), + "input_dtype": tf.int64, + }, + { + "testcase_name": "test_strings_with_special_tokens", + # Mask and oov values in the vocab data should be dropped, and + # mapped to 0 and 1 respectively when calling the layer. + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + [""], + [""], + [""], + ["[OOV]"], + ["[OOV]"], + ["[OOV]"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + [""], + ["wind"], + ["[OOV]"], + ["and"], + [""], + ["fire"], + ["and"], + ["[OOV]"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": None, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "vocabulary_dtype": tf.string, + }, + "expected_output": [ + [2], + [0], + [3], + [1], + [4], + [0], + [5], + [4], + [1], + [1], + ], + "input_dtype": tf.string, + }, + { + "testcase_name": "test_ints_soft_vocab_cap", + # Create an array where 1138 is the most frequent term, followed by + # 1729, then 725, then 42. This ensures that the vocab accumulator + # is sorting by frequency. + "vocab_data": np.array( + [ + [42], + [1138], + [1138], + [1138], + [1138], + [1729], + [1729], + [1729], + [725], + [725], + ], + dtype=np.int64, + ), + "input_data": np.array( + [[1138], [1729], [725], [42], [42], [725], [1138], [4]], + dtype=np.int64, + ), + "kwargs": { + "max_tokens": None, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, + "vocabulary_dtype": tf.int64, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": tf.int64, + }, + { + "testcase_name": "test_ints_with_special_tokens", + # Mask and oov values in the vocab data should be dropped, and + # mapped to 0 and 1 respectively when calling the layer. 
+ "vocab_data": np.array( + [ + [42], + [1138], + [1138], + [1138], + [1138], + [0], + [0], + [0], + [-1], + [-1], + [-1], + [1729], + [1729], + [1729], + [725], + [725], + ], + dtype=np.int64, + ), + "input_data": np.array( + [[1138], [0], [1729], [-1], [725], [0], [42], [725], [-1], [4]], + dtype=np.int64, + ), + "kwargs": { + "max_tokens": None, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, + "vocabulary_dtype": tf.int64, + }, + "expected_output": [ + [2], + [0], + [3], + [1], + [4], + [0], + [5], + [4], + [1], + [1], + ], + "input_dtype": tf.int64, + }, + { + "testcase_name": "test_strings_hard_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed + # by 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": 5, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "vocabulary_dtype": tf.string, + }, + "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]], + "input_dtype": tf.string, + }, + { + "testcase_name": "test_inverse_strings_hard_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed + # by 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array([[2], [3], [4], [1], [1], [4], [2], [5]]), + "kwargs": { + "max_tokens": 5, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "vocabulary_dtype": tf.string, + "invert": True, + }, + "expected_output": np.array( + [ + [b"earth"], + [b"wind"], + [b"and"], + [b"[OOV]"], + [b"[OOV]"], + [b"and"], + [b"earth"], + [b"[OOV]"], + ] + ), + "input_dtype": tf.int64, + }, + { + "testcase_name": "test_ints_hard_vocab_cap", + # Create an array where 1138 is the most frequent term, followed by + # 1729, then 725, then 42. This ensures that the vocab accumulator + # is sorting by frequency. 
+ "vocab_data": np.array( + [ + [42], + [1138], + [1138], + [1138], + [1138], + [1729], + [1729], + [1729], + [725], + [725], + ], + dtype=np.int64, + ), + "input_data": np.array( + [[1138], [1729], [725], [42], [42], [725], [1138], [4]], + dtype=np.int64, + ), + "kwargs": { + "max_tokens": 5, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, + "vocabulary_dtype": tf.int64, + }, + "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]], + "input_dtype": tf.int64, + }, + { + "testcase_name": "test_ints_tf_idf_output", + "vocab_data": np.array( + [ + [42], + [1138], + [1138], + [1138], + [1138], + [1729], + [1729], + [1729], + [725], + [725], + ] + ), + "input_data": np.array( + [[1138], [1729], [725], [42], [42], [725], [1138], [4]] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, + "output_mode": index_lookup.TF_IDF, + "vocabulary_dtype": tf.int64, + }, + "expected_output": [ + [0, 1.098612, 0, 0, 0], + [0, 0, 1.252763, 0, 0], + [0, 0, 0, 1.466337, 0], + [0, 0, 0, 0, 1.7917595], + [0, 0, 0, 0, 1.7917595], + [0, 0, 0, 1.4663371, 0], + [0, 1.098612, 0, 0, 0], + [1.402368, 0, 0, 0, 0], + ], + "input_dtype": tf.int64, + }, + { + "testcase_name": "test_strings_tf_idf_output", + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "output_mode": index_lookup.TF_IDF, + "vocabulary_dtype": tf.string, + }, + "expected_output": [ + [0, 1.098612, 0, 0, 0], + [0, 0, 1.252763, 0, 0], + [0, 0, 0, 1.466337, 0], + [0, 0, 0, 0, 1.7917595], + [0, 0, 0, 0, 1.7917595], + [0, 0, 0, 1.4663371, 0], + [0, 1.098612, 0, 0, 0], + [1.402368, 0, 0, 0, 0], + ], + "input_dtype": tf.string, + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases @test_combinations.run_all_keras_modes(always_skip_v1=True) -class IndexLookupLayerTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - @parameterized.named_parameters(*_get_end_to_end_test_cases()) - def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, - use_dataset, expected_output, - input_dtype): - cls = index_lookup.IndexLookup - if "invert" in kwargs and kwargs["invert"]: - expected_output_dtype = kwargs["vocabulary_dtype"] - elif "output_mode" in kwargs and kwargs["output_mode"] != index_lookup.INT: - expected_output_dtype = tf.float32 - else: - expected_output_dtype = tf.int64 - - input_shape = input_data.shape - - if use_dataset: - # Keras APIs expect batched datasets. - # TODO(rachelim): `model.predict` predicts the result on each - # dataset batch separately, then tries to concatenate the results - # together. When the results have different shapes on the non-concat - # axis (which can happen in the output_mode = INT case for - # IndexLookup), the concatenation fails. 
In real use cases, this may - # not be an issue because users are likely to pipe the preprocessing layer - # into other keras layers instead of predicting it directly. A workaround - # for these unit tests is to have the dataset only contain one batch, so - # no concatenation needs to happen with the result. For consistency with - # numpy input, we should make `predict` join differently shaped results - # together sensibly, with 0 padding. - input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( - input_shape[0]) - vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( - input_shape[0]) - - with CustomObjectScope({"IndexLookup": cls}): - output_data = test_utils.layer_test( - cls, - kwargs=kwargs, - input_shape=input_shape, - input_data=input_data, - input_dtype=input_dtype, - expected_output_dtype=expected_output_dtype, - validate_training=False, - adapt_data=vocab_data) - if "invert" in kwargs and kwargs["invert"]: - self.assertAllEqual(expected_output, output_data) - else: - self.assertAllClose(expected_output, output_data) +class IndexLookupLayerTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt( + self, + vocab_data, + input_data, + kwargs, + use_dataset, + expected_output, + input_dtype, + ): + cls = index_lookup.IndexLookup + if "invert" in kwargs and kwargs["invert"]: + expected_output_dtype = kwargs["vocabulary_dtype"] + elif ( + "output_mode" in kwargs + and kwargs["output_mode"] != index_lookup.INT + ): + expected_output_dtype = tf.float32 + else: + expected_output_dtype = tf.int64 + + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # IndexLookup), the concatenation fails. In real use cases, this may + # not be an issue because users are likely to pipe the preprocessing + # layer into other keras layers instead of predicting it directly. A + # workaround for these unit tests is to have the dataset only + # contain one batch, so no concatenation needs to happen with the + # result. For consistency with numpy input, we should make `predict` + # join differently shaped results together sensibly, with 0 padding. 
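# Illustration (editorial annotation with hypothetical shapes, not a line
# of this patch): in INT output mode two dataset batches can yield lookups
# of shape (2, 3) and (2, 4), which predict() cannot concatenate along the
# batch axis. Batching with the full length via .batch(input_shape[0]) just
# below produces a single batch, so no concatenation is ever attempted.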
+ input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( + input_shape[0] + ) + vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0] + ) + + with CustomObjectScope({"IndexLookup": cls}): + output_data = test_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data, + ) + if "invert" in kwargs and kwargs["invert"]: + self.assertAllEqual(expected_output, output_data) + else: + self.assertAllClose(expected_output, output_data) @test_combinations.run_all_keras_modes(always_skip_v1=True) class CategoricalEncodingInputTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_sparse_string_input(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = tf.SparseTensor( - indices=[[0, 0], [1, 2]], - values=["fire", "michigan"], - dense_shape=[3, 4]) - - expected_indices = [[0, 0], [1, 2]] - expected_values = [5, 1] - expected_dense_shape = [3, 4] - - input_data = keras.Input(shape=(None,), dtype=tf.string, sparse=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array, steps=1) - self.assertAllEqual(expected_indices, output_data.indices) - self.assertAllEqual(expected_values, output_data.values) - self.assertAllEqual(expected_dense_shape, output_data.dense_shape) - - def test_sparse_int_input(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.SparseTensor( - indices=[[0, 0], [1, 2]], - values=np.array([13, 32], dtype=np.int64), - dense_shape=[3, 4]) - - expected_indices = [[0, 0], [1, 2]] - expected_values = [5, 1] - expected_dense_shape = [3, 4] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - vocabulary_dtype=tf.int64, - num_oov_indices=1, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array, steps=1) - self.assertAllEqual(expected_indices, output_data.indices) - self.assertAllEqual(expected_values, output_data.values) - self.assertAllEqual(expected_dense_shape, output_data.dense_shape) - - def test_ragged_string_input(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = tf.ragged.constant( - [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_ragged_int_input(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 42]], - dtype=np.int64) - expected_output = [[2, 3, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), 
dtype=tf.int64, ragged=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - vocabulary_dtype=tf.int64, - num_oov_indices=1, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int32_input_with_int64_keys(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 42]], - dtype=np.int32) - expected_output = [[2, 3, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - vocabulary_dtype=tf.int64, - num_oov_indices=1, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_sparse_string_input(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=["fire", "michigan"], + dense_shape=[3, 4], + ) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=tf.string, sparse=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4], + ) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + vocabulary_dtype=tf.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_string_input(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = tf.ragged.constant( + [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]] + ) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = 
model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.ragged.constant( + [[10, 11, 13], [13, 12, 10, 42]], dtype=np.int64 + ) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + vocabulary_dtype=tf.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int32_input_with_int64_keys(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.ragged.constant( + [[10, 11, 13], [13, 12, 10, 42]], dtype=np.int32 + ) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.int32, ragged=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + vocabulary_dtype=tf.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_combinations.run_all_keras_modes(always_skip_v1=True) class CategoricalEncodingMultiOOVTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_sparse_string_input_multi_bucket(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = tf.SparseTensor( - indices=[[0, 0], [1, 2]], values=["fire", "ohio"], dense_shape=[3, 4]) - - expected_indices = [[0, 0], [1, 2]] - expected_values = [6, 2] - expected_dense_shape = [3, 4] - - input_data = keras.Input(shape=(None,), dtype=tf.string, sparse=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=2, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array, steps=1) - self.assertAllEqual(expected_indices, output_data.indices) - self.assertAllEqual(expected_values, output_data.values) - self.assertAllEqual(expected_dense_shape, output_data.dense_shape) - - def test_sparse_int_input_multi_bucket(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.SparseTensor( - indices=[[0, 0], [1, 2]], - values=np.array([13, 133], dtype=np.int64), - dense_shape=[3, 4]) - - expected_indices = [[0, 0], [1, 2]] - expected_values = [6, 2] - expected_dense_shape = [3, 4] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - vocabulary_dtype=tf.int64, - num_oov_indices=2, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array, steps=1) - self.assertAllEqual(expected_indices, output_data.indices) - self.assertAllEqual(expected_values, output_data.values) - self.assertAllEqual(expected_dense_shape, output_data.dense_shape) - - def test_ragged_string_input_multi_bucket(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = 
tf.ragged.constant([["earth", "wind", "fire"], - ["fire", "and", "earth", "ohio"]]) - expected_output = [[3, 4, 6], [6, 5, 3, 2]] - - input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=2, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_ragged_int_input_multi_bucket(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 133]], - dtype=np.int64) - expected_output = [[3, 4, 6], [6, 5, 3, 2]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - vocabulary_dtype=tf.int64, - num_oov_indices=2, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_sparse_string_input_multi_bucket(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=["fire", "ohio"], + dense_shape=[3, 4], + ) + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 2] + expected_dense_shape = [3, 4] -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class CategoricalEncodingAdaptTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_sparse_adapt(self): - vocab_data = tf.SparseTensor( - indices=[[0, 0], [0, 1], [1, 2]], - values=["michigan", "fire", "michigan"], - dense_shape=[3, 4]) - vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) - - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.adapt(vocab_dataset) - expected_vocabulary = ["", "[OOV]", "michigan", "fire"] - self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) - - def test_ragged_adapt(self): - vocab_data = tf.ragged.constant([["michigan"], - ["fire", "michigan"]]) - vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) - - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.adapt(vocab_dataset) - expected_vocabulary = ["", "[OOV]", "michigan", "fire"] - self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) - - def test_sparse_int_input(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.SparseTensor( - indices=[[0, 0], [1, 2]], - values=np.array([13, 32], dtype=np.int64), - dense_shape=[3, 4]) - - expected_indices = [[0, 0], [1, 2]] - expected_values = [5, 1] - expected_dense_shape = [3, 4] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - vocabulary_dtype=tf.int64, - num_oov_indices=1, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array, steps=1) - 
self.assertAllEqual(expected_indices, output_data.indices) - self.assertAllEqual(expected_values, output_data.values) - self.assertAllEqual(expected_dense_shape, output_data.dense_shape) - - def test_ragged_string_input(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = tf.ragged.constant( - [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_ragged_int_input(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 42]], - dtype=np.int64) - expected_output = [[2, 3, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) - layer = index_lookup.IndexLookup( - max_tokens=None, - vocabulary_dtype=tf.int64, - num_oov_indices=1, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_single_string_generator_dataset(self): - - def word_gen(): - for _ in itertools.count(1): - yield "".join(random.choice(string.ascii_letters) for i in range(2)) - - ds = tf.data.Dataset.from_generator(word_gen, tf.string, - tf.TensorShape([])) - batched_ds = ds.take(2) - input_t = keras.Input(shape=(), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=10, - num_oov_indices=0, - mask_token=None, - oov_token=None, - vocabulary_dtype=tf.string) - _ = layer(input_t) - layer.adapt(batched_ds) + input_data = keras.Input(shape=(None,), dtype=tf.string, sparse=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_sparse_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 133], dtype=np.int64), + dense_shape=[3, 4], + ) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 2] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + vocabulary_dtype=tf.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, 
output_data.dense_shape) + def test_ragged_string_input_multi_bucket(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = tf.ragged.constant( + [["earth", "wind", "fire"], ["fire", "and", "earth", "ohio"]] + ) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class IndexLookupOutputTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - @parameterized.product( - rank=[0, 1, 2], - # Check lists, numpy arrays, tensors, and objects convertable to tensor. - data_fn=[None, np.array, tf.constant, preprocessing_test_utils.ArrayLike] - ) - def test_input_types(self, rank, data_fn): - input_data = vocab = ["earth", "wind", "and", "fire"] - expected_output = [2, 3, 4, 5] - if rank == 0: - input_data = input_data[0] - expected_output = expected_output[0] - elif rank == 2: - input_data = [input_data] - expected_output = [expected_output] - if data_fn is not None: - input_data = data_fn(input_data) - input_shape = [] if rank == 0 else [None] - - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary=vocab, - vocabulary_dtype=tf.string) - output_data = layer(input_data) - self.assertAllEqual(expected_output, output_data) - - # Again in a keras.Model - inputs = keras.Input(shape=input_shape, dtype=tf.string) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model(tf.constant(input_data)) - self.assertAllEqual(expected_output, output_data) - - def test_int_output_shape(self): - input_data = keras.Input(batch_size=16, shape=(4,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=2, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - int_data = layer(input_data) - self.assertAllEqual(int_data.shape.as_list(), [16, 4]) - - @parameterized.named_parameters( - ("int32", tf.int32), - ("int64", tf.int64), - ) - def test_int_output_dtype(self, dtype): - input_data = keras.Input(batch_size=16, shape=(4,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=2, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - dtype=dtype) - int_data = layer(input_data) - self.assertAllEqual(int_data.dtype, dtype) - - def test_int_output_float_dtype_fails(self): - with self.assertRaisesRegex(ValueError, "`dtype` should be an integer"): - index_lookup.IndexLookup( - max_tokens=2, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - dtype=tf.float32) - - def test_int_output_no_reserved_zero(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token=None, - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - 
self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_no_oov(self): - vocab_data = ["earth", "wind", "and", "fire"] - valid_input = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - invalid_input = np.array([["earth", "wind", "and", "michigan"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=0, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(valid_input) - self.assertAllEqual(expected_output, output_data) - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - "found OOV values.*michigan"): - _ = model.predict(invalid_input) - - def test_int_output_no_oov_ragged(self): - vocab_data = ["earth", "wind", "and", "fire"] - valid_input = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - invalid_input = np.array([["earth", "wind", "and", "michigan"], - ["fire", "and", "earth", "michigan"]]) - valid_input = tf.RaggedTensor.from_tensor(valid_input) - invalid_input = tf.RaggedTensor.from_tensor(invalid_input) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=0, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(valid_input) - self.assertAllEqual(expected_output, output_data) - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - "found OOV values.*michigan"): - _ = model.predict(invalid_input) - - def test_int_output_no_oov_sparse(self): - vocab_data = ["earth", "wind", "and", "fire"] - valid_input = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - invalid_input = np.array([["earth", "wind", "and", "michigan"], - ["fire", "and", "earth", "michigan"]]) - valid_input = tf.sparse.from_dense(valid_input) - invalid_input = tf.sparse.from_dense(invalid_input) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=0, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(valid_input) - self.assertAllEqual(expected_output, - tf.sparse.to_dense(output_data)) - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - "found OOV values.*michigan"): - _ = model.predict(invalid_input) - - def test_int_output_explicit_vocab(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - 
output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_one_hot_output_hard_maximum(self): - """Check binary output when pad_to_max_tokens=True.""" - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array(["earth", "wind", "and", "fire", "michigan", ""]) - expected_output = [ - [0, 1, 0, 0, 0, 0], - [0, 0, 1, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 0, 0, 1, 0], - [1, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - ] - - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=6, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.ONE_HOT, - pad_to_max_tokens=True, - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - binary_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=binary_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_one_hot_output_soft_maximum(self): - """Check binary output when pad_to_max_tokens=False.""" - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array(["earth", "wind", "and", "fire", "michigan", ""]) - expected_output = [ - [0, 1, 0, 0, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1], - [1, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - ] - - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.ONE_HOT, - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - binary_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=binary_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_one_hot_output_rank_zero_no_oov(self): - """Check binary output when pad_to_max_tokens=False.""" - vocab_data = ["earth", "wind", "and", "fire"] - input_data = tf.constant("earth") - expected_output = [1, 0, 0, 0] - - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=0, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.ONE_HOT, - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - output_data = layer(input_data) - self.assertAllEqual(expected_output, output_data) - - def test_one_hot_output_shape(self): - inputs = keras.Input(batch_size=16, shape=(1,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=["earth"], - max_tokens=2, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.ONE_HOT, - vocabulary_dtype=tf.string) - outputs = layer(inputs) - self.assertAllEqual(outputs.shape.as_list(), [16, 2]) - - @parameterized.product( - sparse=[True, False], - adapt=[True, False], - pad_to_max=[True, False], - mode=["multi_hot", "count", "tf_idf"], - dtype=[tf.float32, tf.float64], - ) - def test_binned_output(self, sparse, adapt, pad_to_max, mode, dtype): - """Check "multi_hot", "count", and "tf_idf" output.""" - # Adapt breaks ties with sort order. - vocab_data = ["wind", "fire", "earth", "and"] - # IDF weight for a term in 1 out of 1 document is log(1 + 1/2). 
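# Worked check (editorial annotation, not a line of this patch): the
# constants in these tests are consistent with the weighting
#     idf = math.log(1 + num_docs / (1 + doc_freq))
# e.g. math.log(1 + 1 / (1 + 1)) == math.log(1.5) for idf_data below, and
# math.log(1 + 10 / (1 + 4)) == math.log(3) ~= 1.098612 in the earlier
# TF-IDF cases; the OOV weight 1.402368 there is the mean of the four
# in-vocab idf values.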
- idf_data = [math.log(1.5)] * 4 - input_data = np.array([["and", "earth", "fire", "and", ""], - ["michigan", "wind", "and", "ohio", ""]]) - - if mode == "count": - expected_output = np.array([ - [0, 0, 1, 1, 2], - [2, 1, 0, 0, 1], - ]) - elif mode == "tf_idf": - expected_output = np.array([ - [0, 0, 1, 1, 2], - [2, 1, 0, 0, 1], - ]) * math.log(1.5) - else: - expected_output = np.array([ - [0, 0, 1, 1, 1], - [1, 1, 0, 0, 1], - ]) - expected_output_shape = [None, 5] - if pad_to_max: - expected_output = np.concatenate((expected_output, [[0], [0]]), axis=1) - expected_output_shape = [None, 6] - - inputs = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=6, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=mode, - pad_to_max_tokens=pad_to_max, - vocabulary_dtype=tf.string, - sparse=sparse, - vocabulary=None if adapt else vocab_data, - idf_weights=None if adapt or mode != "tf_idf" else idf_data, - dtype=dtype) - if adapt: - layer.adapt(vocab_data) - outputs = layer(inputs) - model = keras.Model(inputs, outputs) - output_data = model.predict(input_data) - if sparse: - output_data = tf.sparse.to_dense(output_data) - # Check output data. - self.assertAllClose(expected_output, output_data) - # Check symbolic output shape. - self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) - # Check output dtype. - self.assertAllEqual(dtype, output_data.dtype) - - def test_multi_hot_output_no_oov(self): - """Check multi hot output when num_oov_indices=0.""" - vocab_data = ["earth", "wind", "and", "fire"] - valid_input = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - invalid_input = np.array([["earth", "wind", "and", "michigan"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [ - [1, 1, 1, 1, 0], - [1, 0, 1, 1, 0], - ] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=5, - num_oov_indices=0, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.MULTI_HOT, - pad_to_max_tokens=True, - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - binary_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=binary_data) - output_data = model.predict(valid_input) - self.assertAllEqual(expected_output, output_data) - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - "found OOV values.*michigan"): - _ = model.predict(invalid_input) - - def test_multi_hot_output_hard_maximum_multiple_adapts(self): - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - adapt_data = ["earth", "earth", "earth", "earth", "wind", "wind", "wind"] - first_expected_output = [ - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0], - ] - second_adapt_data = [ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ] - second_expected_output = [ - [0, 1, 1, 1, 0], - [1, 1, 0, 1, 0], - ] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=5, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.MULTI_HOT, - pad_to_max_tokens=True, - vocabulary_dtype=tf.string) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - # Test the first adapt - layer.adapt(adapt_data) - first_output = model.predict(input_array) - # Test the second adapt - layer.adapt(second_adapt_data) - # We need to recompile the model to retrace our call graph. 
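# Background (editorial annotation, not a line of this patch): predict()
# caches a traced function on first use, and the second adapt() call above
# changes the layer's lookup table, so the compile() just below resets that
# cache and forces the next predict() to retrace against the re-adapted
# vocabulary.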
- model.compile() - second_output = model.predict(input_array) - self.assertAllEqual(first_expected_output, first_output) - self.assertAllEqual(second_expected_output, second_output) - - def test_int_output_file_vocab(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 0, 2, 1]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_non_int_output_file_vocab_in_tf_function(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = tf.constant( - [["earth", "wind", "and", "fire", ""], - ["fire", "and", "earth", "michigan", ""]], - dtype=tf.string) - - expected_output = [ - [0, 1, 1, 1, 1], - [1, 1, 0, 1, 1], - ] - vocab_file = self._write_to_temp_file("temp", vocab_data) - - @tf.function - def compute(data): - layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.MULTI_HOT, - vocabulary_dtype=tf.string) - return layer(data) - - output_dataset = compute(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_file_vocab_and_list_vocab_identical_attrs(self): - vocab_data = ["earth", "wind", "and", "fire"] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - file_layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - - list_layer = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - - expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"] - self.assertAllEqual(expected_vocab, list_layer.get_vocabulary()) - expected_vocab_size = 6 - self.assertAllEqual(expected_vocab_size, list_layer.vocabulary_size()) - self.assertAllEqual(list_layer.get_vocabulary(), - file_layer.get_vocabulary()) - self.assertAllEqual(list_layer.vocabulary_size(), - file_layer.vocabulary_size()) - - def test_file_vocab_and_list_vocab_identical_attrs_multi_oov(self): - vocab_data = ["earth", "wind", "and", "fire"] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - file_layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - num_oov_indices=2, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - - list_layer = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=2, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - - expected_vocab = ["", "[OOV]", "[OOV]", "earth", "wind", "and", "fire"] - self.assertAllEqual(expected_vocab, list_layer.get_vocabulary()) - expected_vocab_size = 7 - self.assertAllEqual(expected_vocab_size, list_layer.vocabulary_size()) - self.assertAllEqual(list_layer.get_vocabulary(), - file_layer.get_vocabulary()) - self.assertAllEqual(list_layer.vocabulary_size(), - file_layer.vocabulary_size()) - - def 
test_file_vocab_and_list_vocab_identical_attrs_no_mask(self): - vocab_data = ["earth", "wind", "and", "fire"] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - file_layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - num_oov_indices=2, - mask_token=None, - oov_token="[OOV]", - vocabulary_dtype=tf.string) - - list_layer = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=2, - mask_token=None, - oov_token="[OOV]", - vocabulary_dtype=tf.string) - - expected_vocab = ["[OOV]", "[OOV]", "earth", "wind", "and", "fire"] - self.assertAllEqual(expected_vocab, list_layer.get_vocabulary()) - expected_vocab_size = 6 - self.assertAllEqual(expected_vocab_size, list_layer.vocabulary_size()) - self.assertAllEqual(list_layer.get_vocabulary(), - file_layer.get_vocabulary()) - self.assertAllEqual(list_layer.vocabulary_size(), - file_layer.vocabulary_size()) - - def test_int_output_file_vocab_no_mask(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "", "earth", "michigan"]]) - expected_output = [[1, 2, 3, 4], [4, 0, 1, 0]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - mask_token=None, - num_oov_indices=1, - oov_token="[OOV]", - vocabulary_dtype=tf.string) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_file_vocab_no_oov_or_mask(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "wind", "earth", "and"]]) - expected_output = [[0, 1, 2, 3], [3, 1, 0, 2]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - mask_token=None, - num_oov_indices=0, - oov_token=None, - vocabulary_dtype=tf.string) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_file_vocab_inversion(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([[1, 2, 3, 4], [4, 0, 1, 0]]) - expected_output = [["earth", "wind", "and", "fire"], - ["fire", "[OOV]", "earth", "[OOV]"]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - idata = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - mask_token=None, - num_oov_indices=1, - oov_token="[OOV]", - vocabulary_dtype=tf.string) - _ = layer(idata) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - - invert_layer = index_lookup.IndexLookup( - vocabulary=layer.get_vocabulary(), - max_tokens=None, - oov_token="[OOV]", - mask_token=None, - num_oov_indices=1, - invert=True, - vocabulary_dtype=tf.string) - int_data = invert_layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_int_file_vocab(self): - vocab_data = ["10", "20", "30", "40"] - input_array = np.array([[10, 20, 30, 40], [40, 0, 
10, 42]]) - expected_output = [[2, 3, 4, 5], [5, 0, 2, 1]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = index_lookup.IndexLookup( - vocabulary=vocab_file, - max_tokens=None, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_dataset_map_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=0, - mask_token=None, - oov_token="[OOV]", - vocabulary=vocab_data, - vocabulary_dtype=tf.string) - ds = tf.data.Dataset.from_tensor_slices([["earth"], ["wind"], ["and"]]) - ds = ds.map(layer) - self.assertAllEqual(list(ds.as_numpy_iterator()), [[0], [1], [2]]) - - def test_dataset_map_output_layer_created_in_function(self): - vocab_data = ["earth", "wind", "and", "fire"] - - def apply_lookup(data): - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=0, - mask_token=None, - oov_token="[OOV]", - vocabulary=vocab_data, - vocabulary_dtype=tf.string) - return layer(data) - - ds = tf.data.Dataset.from_tensor_slices([["earth"], ["wind"], ["and"]]) - ds = ds.map(apply_lookup) - self.assertAllEqual(list(ds.as_numpy_iterator()), [[0], [1], [2]]) + input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + def test_ragged_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.ragged.constant( + [[10, 11, 13], [13, 12, 10, 133]], dtype=np.int64 + ) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class IndexLookupVocabularyTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): - - def test_int_output_explicit_vocab(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_explicit_vocab_with_special_tokens(self): - vocab_data = ["", "[OOV]", "earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - int_data = layer(input_data) - model = 
keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_get_vocabulary_no_special_tokens(self): - vocab_data = ["", "[OOV]", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=5, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary(include_special_tokens=False) - self.assertAllEqual(returned_vocab, ["wind", "and", "fire"]) - self.assertAllEqual(layer.vocabulary_size(), 5) - - def test_vocab_multi_oov(self): - vocab_data = ["", "[OOV]", "[OOV]", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=2, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(returned_vocab, vocab_data) - - def test_vocab_multi_oov_not_present(self): - vocab_data = ["wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=10, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(returned_vocab, - [""] + ["[OOV]"] * 10 + ["wind", "and", "fire"]) - - def test_vocab_with_max_cap(self): - vocab_data = ["", "[OOV]", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=5, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - self.assertAllEqual(layer.vocabulary_size(), 5) - - def test_int_vocab_with_max_cap(self): - vocab_data = [0, -1, 42, 1276, 1138] - layer = index_lookup.IndexLookup( - max_tokens=5, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - self.assertAllEqual(layer.vocabulary_size(), 5) - - def test_vocab_with_multiple_oov_indices(self): - vocab_data = ["", "[OOV]", "[OOV]", "[OOV]", "wind"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=3, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - - def test_int_vocab_with_multiple_oov_indices(self): - vocab_data = [0, -1, -1, -1, 42] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=3, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - - def test_non_unique_vocab_fails(self): - vocab_data = ["earth", "wind", "and", "fire", "fire"] - with self.assertRaisesRegex(ValueError, "repeated term.*fire"): - _ = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - - def test_vocab_with_repeated_element_fails(self): - vocab_data = ["earth", "earth", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - with self.assertRaisesRegex(ValueError, "repeated 
term.*earth"): - layer.set_vocabulary(vocab_data) - - def test_vocab_with_reserved_oov_element_and_invert_true_fails(self): - vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - invert=True, - vocabulary_dtype=tf.string) - with self.assertRaisesRegex(ValueError, "reserved OOV"): - layer.set_vocabulary(vocab_data) - - def test_vocab_with_reserved_mask_element_fails(self): - vocab_data = ["earth", "mask_token", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="mask_token", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - with self.assertRaisesRegex(ValueError, "reserved mask"): - layer.set_vocabulary(vocab_data) - - def test_vocab_size_changed_pad_to_max_false_fails(self): - vocab_data = ["earth", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - pad_to_max_tokens=False, - output_mode=index_lookup.MULTI_HOT, - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - # Calling the layer should lock the vocabulary size. - _ = layer([["earth"]]) - layer.set_vocabulary(vocab_data[:2]) - with self.assertRaisesRegex(RuntimeError, - "vocabulary size cannot be changed"): - # Calling the layer again should cause an error. - _ = layer([["earth"]]) - - def test_vocab_with_idf_weights_non_tfidf_output_fails(self): - vocab_data = ["earth", "wind", "and", "fire"] - weight_data = [1, 1, 1, 1, 1] - with self.assertRaisesRegex(ValueError, - "`idf_weights` should only be set if"): - index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.MULTI_HOT, - vocabulary_dtype=tf.string, - vocabulary=vocab_data, - idf_weights=weight_data) - - def test_vocab_with_idf_weights_length_mismatch_fails(self): - vocab_data = ["earth", "wind", "and", "fire"] - weight_data = [1, 1, 1, 1, 1] # too long - with self.assertRaisesRegex( - ValueError, "`idf_weights` must be the same length as vocab"): - index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.TF_IDF, - vocabulary_dtype=tf.string, - vocabulary=vocab_data, - idf_weights=weight_data) - - def test_vocab_without_idf_weights_tfidf_output_fails(self): - vocab_data = ["earth", "wind", "and", "fire"] - with self.assertRaisesRegex( - ValueError, "`idf_weights` must be set if output_mode is TF_IDF"): - index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - output_mode=index_lookup.TF_IDF, - vocabulary_dtype=tf.string, - vocabulary=vocab_data) - - def test_non_unique_int_vocab_fails(self): - vocab_data = [12, 13, 14, 15, 15] - with self.assertRaisesRegex(ValueError, "repeated term.*15"): - _ = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64) - - def test_int_vocab_with_reserved_oov_element_and_invert_true_fails(self): - vocab_data = [14, 38, -1, 34, 3, 84] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - invert=True, - vocabulary_dtype=tf.int64) - with self.assertRaisesRegex(ValueError, "reserved OOV"): - layer.set_vocabulary(vocab_data) - - def test_int_vocab_with_reserved_mask_element_fails(self): - vocab_data = [125, 0, 3, 4, 94] - 
layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64) - with self.assertRaisesRegex(ValueError, "reserved mask"): - layer.set_vocabulary(vocab_data) - - def test_no_vocab_file_string_fails(self): - with self.assertRaisesRegex(ValueError, "non_existent_file"): - _ = index_lookup.IndexLookup( - vocabulary="non_existent_file", - max_tokens=None, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64) + input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + vocabulary_dtype=tf.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class IndexLookupInverseVocabularyTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_int_output_explicit_vocab(self): - vocab_data = ["", "[OOV]", "earth", "wind", "and", "fire"] - input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 1]]) - expected_output = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "[OOV]"]]) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - invert=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_with_max_cap(self): - vocab_data = ["", "[OOV]", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=5, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - invert=True) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - - def test_int_vocab_with_max_cap(self): - vocab_data = [0, -1, 42, 1276, 1138] - layer = index_lookup.IndexLookup( - max_tokens=5, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64, - invert=True) - layer.set_vocabulary(vocab_data) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - - def test_non_unique_vocab_fails(self): - vocab_data = ["earth", "wind", "and", "fire", "fire"] - with self.assertRaisesRegex(ValueError, "repeated term.*fire"): - _ = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - invert=True) - - def test_non_int_output_fails(self): - with self.assertRaisesRegex(ValueError, "`output_mode` must be `'int'`"): - _ = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - output_mode=index_lookup.COUNT, - invert=True) - - def test_vocab_with_repeated_element_fails(self): - vocab_data = ["earth", "earth", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - invert=True) - with self.assertRaisesRegex(ValueError, "repeated term.*earth"): - 
layer.set_vocabulary(vocab_data) - - def test_vocab_with_reserved_mask_element_fails(self): - vocab_data = ["earth", "mask_token", "wind", "and", "fire"] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="mask_token", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - invert=True) - with self.assertRaisesRegex(ValueError, "reserved mask"): - layer.set_vocabulary(vocab_data) - - def test_non_unique_int_vocab_fails(self): - vocab_data = [12, 13, 14, 15, 15] - with self.assertRaisesRegex(ValueError, "repeated term.*15"): - _ = index_lookup.IndexLookup( - vocabulary=vocab_data, - max_tokens=None, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64, - invert=True) - - def test_int_vocab_with_repeated_element_fails(self): - vocab_data = [11, 11, 34, 23, 124] - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token=0, - oov_token=-1, - vocabulary_dtype=tf.int64, - invert=True) - with self.assertRaisesRegex(ValueError, "repeated term.*11"): - layer.set_vocabulary(vocab_data) +class CategoricalEncodingAdaptTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_sparse_adapt(self): + vocab_data = tf.SparseTensor( + indices=[[0, 0], [0, 1], [1, 2]], + values=["michigan", "fire", "michigan"], + dense_shape=[3, 4], + ) + vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) + + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.adapt(vocab_dataset) + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + def test_ragged_adapt(self): + vocab_data = tf.ragged.constant([["michigan"], ["fire", "michigan"]]) + vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class IndexLookupErrorTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_too_long_vocab_fails_in_single_setting(self): - vocab_data = ["earth", "wind", "and", "fire"] - - layer = index_lookup.IndexLookup( - max_tokens=4, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab"): - layer.set_vocabulary(vocab_data) - - def test_zero_max_tokens_fails(self): - with self.assertRaisesRegex(ValueError, "max_tokens"): - _ = index_lookup.IndexLookup( - max_tokens=0, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.adapt(vocab_dataset) + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4], + ) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + vocabulary_dtype=tf.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + 
int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_string_input(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = tf.ragged.constant( + [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]] + ) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.ragged.constant( + [[10, 11, 13], [13, 12, 10, 42]], dtype=np.int64 + ) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) + layer = index_lookup.IndexLookup( + max_tokens=None, + vocabulary_dtype=tf.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_single_string_generator_dataset(self): + def word_gen(): + for _ in itertools.count(1): + yield "".join( + random.choice(string.ascii_letters) for i in range(2) + ) + + ds = tf.data.Dataset.from_generator( + word_gen, tf.string, tf.TensorShape([]) + ) + batched_ds = ds.take(2) + input_t = keras.Input(shape=(), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=10, + num_oov_indices=0, + mask_token=None, + oov_token=None, + vocabulary_dtype=tf.string, + ) + _ = layer(input_t) + layer.adapt(batched_ds) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class IndexLookupSavingTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - def test_vocabulary_persistence_across_saving(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Save the model to disk. 
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = keras.models.load_model( - output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = loaded_model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - def test_vocabulary_persistence_file_across_cloning(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - vocab_file = self._write_to_temp_file("temp", vocab_data) - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Clone the model and set weights. - new_model = keras.models.clone_model(model) - new_model.set_weights(model.get_weights()) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, new_model) - - # Validate correctness of the new model. - new_output_dataset = new_model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - def test_persistence_file_vocabs_tf_save_tf_load(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - tf.saved_model.save(obj=model, export_dir=output_path) - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = tf.saved_model.load(output_path) - f = loaded_model.signatures["serving_default"] - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. 
- new_output_dataset = f(tf.constant(input_array))["index_lookup"] - self.assertAllEqual(new_output_dataset, expected_output) - - def test_vocabulary_persistence_file_vocab_keras_save_tf_load(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = tf.saved_model.load(output_path) - f = loaded_model.signatures["serving_default"] - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = f(tf.constant(input_array))["index_lookup"] - self.assertAllEqual(new_output_dataset, expected_output) - - def test_persistence_file_vocab_keras_save_keras_load(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - tf.io.gfile.remove(vocab_file) - - loaded_model = keras.models.load_model( - output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = loaded_model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - # Try re-saving the layer. This simulates saving a layer contained at - # a hub Module. - input_data_2 = keras.Input(shape=(None,), dtype=tf.string) - output_2 = loaded_model(input_data_2) - model_2 = keras.Model(inputs=input_data_2, outputs=output_2) - new_output_dataset = model_2.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - # Save the model to disk. 
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model_2") - model_2.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = keras.models.load_model( - output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = loaded_model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - def test_persistence_file_vocab_keras_save_keras_load_tf_save_tf_load(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - tf.io.gfile.remove(vocab_file) - - loaded_model = keras.models.load_model( - output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = loaded_model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - # Try re-saving the layer. This simulates saving a layer contained at - # a hub Module. - input_data_2 = keras.Input(shape=(None,), dtype=tf.string) - output_2 = loaded_model(input_data_2) - model_2 = keras.Model(inputs=input_data_2, outputs=output_2) - new_output_dataset = model_2.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model_2") - tf.saved_model.save(model_2, output_path) - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = tf.saved_model.load(output_path) - f = loaded_model.signatures["serving_default"] - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. 
- new_output_dataset = f(tf.constant(input_array))["model"] - self.assertAllEqual(new_output_dataset, expected_output) - - def test_persistence_file_vocab_keras_save_keras_load_keras_save_keras_load( - self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_file = self._write_to_temp_file("temp", vocab_data) - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = index_lookup.IndexLookup( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - tf.io.gfile.remove(vocab_file) - - loaded_model = keras.models.load_model( - output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = loaded_model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - # Try re-saving the layer. This simulates saving a layer contained at - # a hub Module. - input_data_2 = keras.Input(shape=(None,), dtype=tf.string) - output_2 = loaded_model(input_data_2) - model_2 = keras.Model(inputs=input_data_2, outputs=output_2) - new_output_dataset = model_2.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model_2") - model_2.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = keras.models.load_model( - output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = model_2.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - def test_static_table_config_weight_data_transfer_succeeds(self): - vocab_data = ["earth", "wind", "and", "fire"] - vocab_file = self._write_to_temp_file("temp", vocab_data) - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - # Build and validate a golden model. 
- layer_cls = index_lookup.IndexLookup - layer = layer_cls( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_file) - config = layer.get_config() - weights = layer.get_weights() - - layer = layer_cls.from_config(config) - layer.set_weights(weights) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - output = layer(input_data) - model = keras.Model(inputs=input_data, outputs=output) - - new_output_dataset = model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) - - def test_sparse_output_across_saving(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - - expected_output = [[0., 1., 1., 1., 1.], [1., 1., 0., 1., 1.]] - - layer_cls = index_lookup.IndexLookup - layer = layer_cls( - max_tokens=None, - num_oov_indices=1, - mask_token="", - oov_token="[OOV]", - vocabulary_dtype=tf.string, - vocabulary=vocab_data, - output_mode="multi_hot", - sparse=True) - config = layer.get_config() - layer = layer_cls.from_config(config) - - output = layer(input_array) - self.assertIsInstance(output, tf.SparseTensor) - self.assertAllEqual(tf.sparse.to_dense(output), expected_output) - - -class EagerExecutionDisabled(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_lookup(self): - # We need this test for model_to_estimator followed by export_saved_model, - # which will call the layer in a legacy session. This could also happen - # directly if a user calls disable_v2_behavior or disable_eager_execution. - with tf.compat.v1.Session(): - with test_utils.run_eagerly_scope(False): +class IndexLookupOutputTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + @parameterized.product( + rank=[0, 1, 2], + # Check lists, numpy arrays, tensors, and objects convertible to tensor. 
+ data_fn=[ + None, + np.array, + tf.constant, + preprocessing_test_utils.ArrayLike, + ], + ) + def test_input_types(self, rank, data_fn): + input_data = vocab = ["earth", "wind", "and", "fire"] + expected_output = [2, 3, 4, 5] + if rank == 0: + input_data = input_data[0] + expected_output = expected_output[0] + elif rank == 2: + input_data = [input_data] + expected_output = [expected_output] + if data_fn is not None: + input_data = data_fn(input_data) + input_shape = [] if rank == 0 else [None] + + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary=vocab, + vocabulary_dtype=tf.string, + ) + output_data = layer(input_data) + self.assertAllEqual(expected_output, output_data) + + # Again in a keras.Model + inputs = keras.Input(shape=input_shape, dtype=tf.string) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model(tf.constant(input_data)) + self.assertAllEqual(expected_output, output_data) + + def test_int_output_shape(self): + input_data = keras.Input(batch_size=16, shape=(4,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=2, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + self.assertAllEqual(int_data.shape.as_list(), [16, 4]) + + @parameterized.named_parameters( + ("int32", tf.int32), + ("int64", tf.int64), + ) + def test_int_output_dtype(self, dtype): + input_data = keras.Input(batch_size=16, shape=(4,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=2, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + dtype=dtype, + ) + int_data = layer(input_data) + self.assertAllEqual(int_data.dtype, dtype) + + def test_int_output_float_dtype_fails(self): + with self.assertRaisesRegex(ValueError, "`dtype` should be an integer"): + index_lookup.IndexLookup( + max_tokens=2, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + dtype=tf.float32, + ) + + def test_int_output_no_reserved_zero(self): vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array(["earth", "wind", "and", "fire"]) - expected_output = [1, 2, 3, 4] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] input_data = keras.Input(shape=(None,), dtype=tf.string) layer = index_lookup.IndexLookup( @@ -2230,17 +1019,1771 @@ def test_lookup(self): mask_token=None, oov_token="[OOV]", vocabulary_dtype=tf.string, - vocabulary=vocab_data) + ) + layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) - # In a TF1 session the user will need to make sure all tables are - # initialized themselves. 
- tf.compat.v1.tables_initializer().run() - output_dataset = model(input_array) - self.assertAllEqual(output_dataset, expected_output) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_no_oov(self): + vocab_data = ["earth", "wind", "and", "fire"] + valid_input = np.array( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth", ""]] + ) + invalid_input = np.array( + [ + ["earth", "wind", "and", "michigan"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=0, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(valid_input) + self.assertAllEqual(expected_output, output_data) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "found OOV values.*michigan" + ): + _ = model.predict(invalid_input) + + def test_int_output_no_oov_ragged(self): + vocab_data = ["earth", "wind", "and", "fire"] + valid_input = np.array( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth", ""]] + ) + invalid_input = np.array( + [ + ["earth", "wind", "and", "michigan"], + ["fire", "and", "earth", "michigan"], + ] + ) + valid_input = tf.RaggedTensor.from_tensor(valid_input) + invalid_input = tf.RaggedTensor.from_tensor(invalid_input) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=0, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(valid_input) + self.assertAllEqual(expected_output, output_data) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "found OOV values.*michigan" + ): + _ = model.predict(invalid_input) + + def test_int_output_no_oov_sparse(self): + vocab_data = ["earth", "wind", "and", "fire"] + valid_input = np.array( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth", ""]] + ) + invalid_input = np.array( + [ + ["earth", "wind", "and", "michigan"], + ["fire", "and", "earth", "michigan"], + ] + ) + valid_input = tf.sparse.from_dense(valid_input) + invalid_input = tf.sparse.from_dense(invalid_input) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=0, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(valid_input) + self.assertAllEqual(expected_output, tf.sparse.to_dense(output_data)) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "found OOV values.*michigan" + ): + _ = model.predict(invalid_input) + + def test_int_output_explicit_vocab(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) 
+ layer = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_one_hot_output_hard_maximum(self): + """Check one-hot output when pad_to_max_tokens=True.""" + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array(["earth", "wind", "and", "fire", "michigan", ""]) + expected_output = [ + [0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + ] + + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=6, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.ONE_HOT, + pad_to_max_tokens=True, + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + binary_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=binary_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_one_hot_output_soft_maximum(self): + """Check one-hot output when pad_to_max_tokens=False.""" + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array(["earth", "wind", "and", "fire", "michigan", ""]) + expected_output = [ + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + [1, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + ] + + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.ONE_HOT, + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + binary_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=binary_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_one_hot_output_rank_zero_no_oov(self): + """Check one-hot output for a rank-0 input with no OOV indices.""" + vocab_data = ["earth", "wind", "and", "fire"] + input_data = tf.constant("earth") + expected_output = [1, 0, 0, 0] + + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=0, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.ONE_HOT, + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + output_data = layer(input_data) + self.assertAllEqual(expected_output, output_data) + + def test_one_hot_output_shape(self): + inputs = keras.Input(batch_size=16, shape=(1,), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=["earth"], + max_tokens=2, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.ONE_HOT, + vocabulary_dtype=tf.string, + ) + outputs = layer(inputs) + self.assertAllEqual(outputs.shape.as_list(), [16, 2]) + + @parameterized.product( + sparse=[True, False], + adapt=[True, False], + pad_to_max=[True, False], + mode=["multi_hot", "count", "tf_idf"], + dtype=[tf.float32, tf.float64], + ) + def test_binned_output(self, sparse, adapt, pad_to_max, mode, dtype): + """Check "multi_hot", "count", and "tf_idf" output.""" + # Adapt breaks ties with sort order. + vocab_data = ["wind", "fire", "earth", "and"] + # IDF weight for a term in 1 out of 1 document is log(1 + 1/2), + # i.e. log(1 + num_documents / (1 + doc_count)). 
+ idf_data = [math.log(1.5)] * 4 + input_data = np.array( + [ + ["and", "earth", "fire", "and", ""], + ["michigan", "wind", "and", "ohio", ""], + ] + ) + + if mode == "count": + expected_output = np.array( + [ + [0, 0, 1, 1, 2], + [2, 1, 0, 0, 1], + ] + ) + elif mode == "tf_idf": + expected_output = np.array( + [ + [0, 0, 1, 1, 2], + [2, 1, 0, 0, 1], + ] + ) * math.log(1.5) + else: + expected_output = np.array( + [ + [0, 0, 1, 1, 1], + [1, 1, 0, 0, 1], + ] + ) + expected_output_shape = [None, 5] + if pad_to_max: + expected_output = np.concatenate( + (expected_output, [[0], [0]]), axis=1 + ) + expected_output_shape = [None, 6] + + inputs = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=6, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=mode, + pad_to_max_tokens=pad_to_max, + vocabulary_dtype=tf.string, + sparse=sparse, + vocabulary=None if adapt else vocab_data, + idf_weights=None if adapt or mode != "tf_idf" else idf_data, + dtype=dtype, + ) + if adapt: + layer.adapt(vocab_data) + outputs = layer(inputs) + model = keras.Model(inputs, outputs) + output_data = model.predict(input_data) + if sparse: + output_data = tf.sparse.to_dense(output_data) + # Check output data. + self.assertAllClose(expected_output, output_data) + # Check symbolic output shape. + self.assertAllEqual(expected_output_shape, outputs.shape.as_list()) + # Check output dtype. + self.assertAllEqual(dtype, output_data.dtype) + + def test_multi_hot_output_no_oov(self): + """Check multi hot output when num_oov_indices=0.""" + vocab_data = ["earth", "wind", "and", "fire"] + valid_input = np.array( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth", ""]] + ) + invalid_input = np.array( + [ + ["earth", "wind", "and", "michigan"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [ + [1, 1, 1, 1, 0], + [1, 0, 1, 1, 0], + ] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=5, + num_oov_indices=0, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.MULTI_HOT, + pad_to_max_tokens=True, + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + binary_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=binary_data) + output_data = model.predict(valid_input) + self.assertAllEqual(expected_output, output_data) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "found OOV values.*michigan" + ): + _ = model.predict(invalid_input) + + def test_multi_hot_output_hard_maximum_multiple_adapts(self): + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + adapt_data = [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + ] + first_expected_output = [ + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0], + ] + second_adapt_data = [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + "and", + "and", + "fire", + ] + second_expected_output = [ + [0, 1, 1, 1, 0], + [1, 1, 0, 1, 0], + ] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=5, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.MULTI_HOT, + pad_to_max_tokens=True, + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + # Test the first adapt + layer.adapt(adapt_data) + first_output = model.predict(input_array) + # Test 
the second adapt + layer.adapt(second_adapt_data) + # We need to recompile the model to retrace our call graph. + model.compile() + second_output = model.predict(input_array) + self.assertAllEqual(first_expected_output, first_output) + self.assertAllEqual(second_expected_output, second_output) + + def test_int_output_file_vocab(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 0, 2, 1]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_int_output_file_vocab_in_tf_function(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = tf.constant( + [ + ["earth", "wind", "and", "fire", ""], + ["fire", "and", "earth", "michigan", ""], + ], + dtype=tf.string, + ) + + expected_output = [ + [0, 1, 1, 1, 1], + [1, 1, 0, 1, 1], + ] + vocab_file = self._write_to_temp_file("temp", vocab_data) + + @tf.function + def compute(data): + layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.MULTI_HOT, + vocabulary_dtype=tf.string, + ) + return layer(data) + + output_dataset = compute(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_file_vocab_and_list_vocab_identical_attrs(self): + vocab_data = ["earth", "wind", "and", "fire"] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + file_layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + list_layer = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"] + self.assertAllEqual(expected_vocab, list_layer.get_vocabulary()) + expected_vocab_size = 6 + self.assertAllEqual(expected_vocab_size, list_layer.vocabulary_size()) + self.assertAllEqual( + list_layer.get_vocabulary(), file_layer.get_vocabulary() + ) + self.assertAllEqual( + list_layer.vocabulary_size(), file_layer.vocabulary_size() + ) + + def test_file_vocab_and_list_vocab_identical_attrs_multi_oov(self): + vocab_data = ["earth", "wind", "and", "fire"] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + file_layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + list_layer = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + expected_vocab = ["", "[OOV]", "[OOV]", "earth", "wind", "and", "fire"] + self.assertAllEqual(expected_vocab, list_layer.get_vocabulary()) + expected_vocab_size = 7 + self.assertAllEqual(expected_vocab_size, list_layer.vocabulary_size()) + self.assertAllEqual( + 
list_layer.get_vocabulary(), file_layer.get_vocabulary() + ) + self.assertAllEqual( + list_layer.vocabulary_size(), file_layer.vocabulary_size() + ) + + def test_file_vocab_and_list_vocab_identical_attrs_no_mask(self): + vocab_data = ["earth", "wind", "and", "fire"] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + file_layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + num_oov_indices=2, + mask_token=None, + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + list_layer = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=2, + mask_token=None, + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + expected_vocab = ["[OOV]", "[OOV]", "earth", "wind", "and", "fire"] + self.assertAllEqual(expected_vocab, list_layer.get_vocabulary()) + expected_vocab_size = 6 + self.assertAllEqual(expected_vocab_size, list_layer.vocabulary_size()) + self.assertAllEqual( + list_layer.get_vocabulary(), file_layer.get_vocabulary() + ) + self.assertAllEqual( + list_layer.vocabulary_size(), file_layer.vocabulary_size() + ) + + def test_int_output_file_vocab_no_mask(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "", "earth", "michigan"], + ] + ) + expected_output = [[1, 2, 3, 4], [4, 0, 1, 0]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + mask_token=None, + num_oov_indices=1, + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_file_vocab_no_oov_or_mask(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [["earth", "wind", "and", "fire"], ["fire", "wind", "earth", "and"]] + ) + expected_output = [[0, 1, 2, 3], [3, 1, 0, 2]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + mask_token=None, + num_oov_indices=0, + oov_token=None, + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_file_vocab_inversion(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([[1, 2, 3, 4], [4, 0, 1, 0]]) + expected_output = [ + ["earth", "wind", "and", "fire"], + ["fire", "[OOV]", "earth", "[OOV]"], + ] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + idata = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + mask_token=None, + num_oov_indices=1, + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + _ = layer(idata) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + + invert_layer = index_lookup.IndexLookup( + vocabulary=layer.get_vocabulary(), + max_tokens=None, + oov_token="[OOV]", + mask_token=None, + num_oov_indices=1, + invert=True, + vocabulary_dtype=tf.string, + ) + int_data = invert_layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = 
model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_int_file_vocab(self): + vocab_data = ["10", "20", "30", "40"] + input_array = np.array([[10, 20, 30, 40], [40, 0, 10, 42]]) + expected_output = [[2, 3, 4, 5], [5, 0, 2, 1]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = index_lookup.IndexLookup( + vocabulary=vocab_file, + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_dataset_map_output(self): + vocab_data = ["earth", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=0, + mask_token=None, + oov_token="[OOV]", + vocabulary=vocab_data, + vocabulary_dtype=tf.string, + ) + ds = tf.data.Dataset.from_tensor_slices([["earth"], ["wind"], ["and"]]) + ds = ds.map(layer) + self.assertAllEqual(list(ds.as_numpy_iterator()), [[0], [1], [2]]) + + def test_dataset_map_output_layer_created_in_function(self): + vocab_data = ["earth", "wind", "and", "fire"] + + def apply_lookup(data): + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=0, + mask_token=None, + oov_token="[OOV]", + vocabulary=vocab_data, + vocabulary_dtype=tf.string, + ) + return layer(data) + + ds = tf.data.Dataset.from_tensor_slices([["earth"], ["wind"], ["and"]]) + ds = ds.map(apply_lookup) + self.assertAllEqual(list(ds.as_numpy_iterator()), [[0], [1], [2]]) + + +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class IndexLookupVocabularyTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_int_output_explicit_vocab(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_explicit_vocab_with_special_tokens(self): + vocab_data = ["", "[OOV]", "earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_get_vocabulary_no_special_tokens(self): + vocab_data = ["", "[OOV]", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=5, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + 
returned_vocab = layer.get_vocabulary(include_special_tokens=False) + self.assertAllEqual(returned_vocab, ["wind", "and", "fire"]) + self.assertAllEqual(layer.vocabulary_size(), 5) + + def test_vocab_multi_oov(self): + vocab_data = ["", "[OOV]", "[OOV]", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(returned_vocab, vocab_data) + + def test_vocab_multi_oov_not_present(self): + vocab_data = ["wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=10, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual( + returned_vocab, [""] + ["[OOV]"] * 10 + ["wind", "and", "fire"] + ) + + def test_vocab_with_max_cap(self): + vocab_data = ["", "[OOV]", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=5, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + self.assertAllEqual(layer.vocabulary_size(), 5) + + def test_int_vocab_with_max_cap(self): + vocab_data = [0, -1, 42, 1276, 1138] + layer = index_lookup.IndexLookup( + max_tokens=5, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + self.assertAllEqual(layer.vocabulary_size(), 5) + + def test_vocab_with_multiple_oov_indices(self): + vocab_data = ["", "[OOV]", "[OOV]", "[OOV]", "wind"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=3, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + + def test_int_vocab_with_multiple_oov_indices(self): + vocab_data = [0, -1, -1, -1, 42] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=3, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, "repeated term.*fire"): + _ = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + def test_vocab_with_repeated_element_fails(self): + vocab_data = ["earth", "earth", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + with self.assertRaisesRegex(ValueError, "repeated term.*earth"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_reserved_oov_element_and_invert_true_fails(self): + vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + invert=True, + vocabulary_dtype=tf.string, + ) + with self.assertRaisesRegex(ValueError, 
"reserved OOV"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_reserved_mask_element_fails(self): + vocab_data = ["earth", "mask_token", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="mask_token", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + with self.assertRaisesRegex(ValueError, "reserved mask"): + layer.set_vocabulary(vocab_data) + + def test_vocab_size_changed_pad_to_max_false_fails(self): + vocab_data = ["earth", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + pad_to_max_tokens=False, + output_mode=index_lookup.MULTI_HOT, + vocabulary_dtype=tf.string, + ) + layer.set_vocabulary(vocab_data) + # Calling the layer should lock the vocabulary size. + _ = layer([["earth"]]) + with self.assertRaisesRegex( + RuntimeError, "vocabulary size cannot be changed" + ): + layer.set_vocabulary(vocab_data[:2]) + + def test_vocab_with_idf_weights_non_tfidf_output_fails(self): + vocab_data = ["earth", "wind", "and", "fire"] + weight_data = [1, 1, 1, 1, 1] + with self.assertRaisesRegex( + ValueError, "`idf_weights` should only be set if" + ): + index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.MULTI_HOT, + vocabulary_dtype=tf.string, + vocabulary=vocab_data, + idf_weights=weight_data, + ) + + def test_vocab_with_idf_weights_length_mismatch_fails(self): + vocab_data = ["earth", "wind", "and", "fire"] + weight_data = [1, 1, 1, 1, 1] # too long + with self.assertRaisesRegex( + ValueError, "`idf_weights` must be the same length as vocab" + ): + index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.TF_IDF, + vocabulary_dtype=tf.string, + vocabulary=vocab_data, + idf_weights=weight_data, + ) + + def test_vocab_without_idf_weights_tfidf_output_fails(self): + vocab_data = ["earth", "wind", "and", "fire"] + with self.assertRaisesRegex( + ValueError, "`idf_weights` must be set if output_mode is TF_IDF" + ): + index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + output_mode=index_lookup.TF_IDF, + vocabulary_dtype=tf.string, + vocabulary=vocab_data, + ) + + def test_non_unique_int_vocab_fails(self): + vocab_data = [12, 13, 14, 15, 15] + with self.assertRaisesRegex(ValueError, "repeated term.*15"): + _ = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + ) + + def test_int_vocab_with_reserved_oov_element_and_invert_true_fails(self): + vocab_data = [14, 38, -1, 34, 3, 84] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + invert=True, + vocabulary_dtype=tf.int64, + ) + with self.assertRaisesRegex(ValueError, "reserved OOV"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_reserved_mask_element_fails(self): + vocab_data = [125, 0, 3, 4, 94] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + ) + with self.assertRaisesRegex(ValueError, "reserved mask"): + layer.set_vocabulary(vocab_data) + + def test_no_vocab_file_string_fails(self): + with self.assertRaisesRegex(ValueError, "non_existent_file"): + _ = index_lookup.IndexLookup( + vocabulary="non_existent_file", + 
max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + ) + + +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class IndexLookupInverseVocabularyTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_int_output_explicit_vocab(self): + vocab_data = ["", "[OOV]", "earth", "wind", "and", "fire"] + input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 1]]) + expected_output = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "[OOV]"], + ] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + invert=True, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_vocab_with_max_cap(self): + vocab_data = ["", "[OOV]", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=5, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + invert=True, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + + def test_int_vocab_with_max_cap(self): + vocab_data = [0, -1, 42, 1276, 1138] + layer = index_lookup.IndexLookup( + max_tokens=5, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + invert=True, + ) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, "repeated term.*fire"): + _ = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + invert=True, + ) + + def test_non_int_output_fails(self): + with self.assertRaisesRegex( + ValueError, "`output_mode` must be `'int'`" + ): + _ = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + output_mode=index_lookup.COUNT, + invert=True, + ) + + def test_vocab_with_repeated_element_fails(self): + vocab_data = ["earth", "earth", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + invert=True, + ) + with self.assertRaisesRegex(ValueError, "repeated term.*earth"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_reserved_mask_element_fails(self): + vocab_data = ["earth", "mask_token", "wind", "and", "fire"] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="mask_token", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + invert=True, + ) + with self.assertRaisesRegex(ValueError, "reserved mask"): + layer.set_vocabulary(vocab_data) + + def test_non_unique_int_vocab_fails(self): + vocab_data = [12, 13, 14, 15, 15] + with self.assertRaisesRegex(ValueError, "repeated term.*15"): + _ = index_lookup.IndexLookup( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + invert=True, + ) + + def 
test_int_vocab_with_repeated_element_fails(self): + vocab_data = [11, 11, 34, 23, 124] + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + vocabulary_dtype=tf.int64, + invert=True, + ) + with self.assertRaisesRegex(ValueError, "repeated term.*11"): + layer.set_vocabulary(vocab_data) + + +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class IndexLookupErrorTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_too_long_vocab_fails_in_single_setting(self): + vocab_data = ["earth", "wind", "and", "fire"] + + layer = index_lookup.IndexLookup( + max_tokens=4, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + with self.assertRaisesRegex( + ValueError, "vocabulary larger than the maximum vocab" + ): + layer.set_vocabulary(vocab_data) + + def test_zero_max_tokens_fails(self): + with self.assertRaisesRegex(ValueError, "max_tokens"): + _ = index_lookup.IndexLookup( + max_tokens=0, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + ) + + +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class IndexLookupSavingTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_vocabulary_persistence_across_saving(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + vocab_file = self._write_to_temp_file("temp", vocab_data) + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + with self.subTest("keras_v3"): + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_model.keras" + ) + model.save(output_path, save_format="keras_v3") + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + with self.subTest("savedmodel"): + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model" + ) + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. 
+ keras.backend.clear_session() + tf.io.gfile.remove(vocab_file) + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + def test_vocabulary_persistence_file_across_cloning(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + vocab_file = self._write_to_temp_file("temp", vocab_data) + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Clone the model and set weights. + new_model = keras.models.clone_model(model) + new_model.set_weights(model.get_weights()) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, new_model) + + # Validate correctness of the new model. + new_output_dataset = new_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + def test_persistence_file_vocabs_tf_save_tf_load(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + tf.saved_model.save(obj=model, export_dir=output_path) + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + + loaded_model = tf.saved_model.load(output_path) + f = loaded_model.signatures["serving_default"] + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = f(tf.constant(input_array))["index_lookup"] + self.assertAllEqual(new_output_dataset, expected_output) + + def test_vocabulary_persistence_file_vocab_keras_save_tf_load(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + # Build and validate a golden model. 
+ input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + + loaded_model = tf.saved_model.load(output_path) + f = loaded_model.signatures["serving_default"] + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = f(tf.constant(input_array))["index_lookup"] + self.assertAllEqual(new_output_dataset, expected_output) + + def test_persistence_file_vocab_keras_save_keras_load(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + with self.subTest("keras_v3"): + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_model.keras" + ) + model.save(output_path, save_format="keras_v3") + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Try re-saving the layer. This simulates saving a layer + # contained at a hub Module. + input_data_2 = keras.Input(shape=(None,), dtype=tf.string) + output_2 = loaded_model(input_data_2) + model_2 = keras.Model(inputs=input_data_2, outputs=output_2) + new_output_dataset = model_2.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_model_2.keras" + ) + model_2.save(output_path, save_format="keras_v3") + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + with self.subTest("saved_model"): + # Save the model to disk. 
+ output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model" + ) + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + tf.io.gfile.remove(vocab_file) + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Try re-saving the layer. This simulates saving a layer + # contained at a hub Module. + input_data_2 = keras.Input(shape=(None,), dtype=tf.string) + output_2 = loaded_model(input_data_2) + model_2 = keras.Model(inputs=input_data_2, outputs=output_2) + new_output_dataset = model_2.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model_2" + ) + model_2.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + def test_persistence_file_vocab_keras_save_keras_load_tf_save_tf_load(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + tf.io.gfile.remove(vocab_file) + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Try re-saving the layer. This simulates saving a layer contained at + # a hub Module. 
+ input_data_2 = keras.Input(shape=(None,), dtype=tf.string) + output_2 = loaded_model(input_data_2) + model_2 = keras.Model(inputs=input_data_2, outputs=output_2) + new_output_dataset = model_2.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model_2" + ) + tf.saved_model.save(model_2, output_path) + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + + loaded_model = tf.saved_model.load(output_path) + f = loaded_model.signatures["serving_default"] + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = f(tf.constant(input_array))["model"] + self.assertAllEqual(new_output_dataset, expected_output) + + def test_persistence_file_vocab_keras_save_keras_load_keras_save_keras_load( + self, + ): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_file = self._write_to_temp_file("temp", vocab_data) + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + tf.io.gfile.remove(vocab_file) + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Try re-saving the layer. This simulates saving a layer contained at + # a hub Module. + input_data_2 = keras.Input(shape=(None,), dtype=tf.string) + output_2 = loaded_model(input_data_2) + model_2 = keras.Model(inputs=input_data_2, outputs=output_2) + new_output_dataset = model_2.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model_2" + ) + model_2.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IndexLookup": index_lookup.IndexLookup}, + ) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. 
+ new_output_dataset = model_2.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + def test_static_table_config_weight_data_transfer_succeeds(self): + vocab_data = ["earth", "wind", "and", "fire"] + vocab_file = self._write_to_temp_file("temp", vocab_data) + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + # Build and validate a golden model. + layer_cls = index_lookup.IndexLookup + layer = layer_cls( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_file, + ) + config = layer.get_config() + weights = layer.get_weights() + + layer = layer_cls.from_config(config) + layer.set_weights(weights) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + output = layer(input_data) + model = keras.Model(inputs=input_data, outputs=output) + + new_output_dataset = model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + def test_sparse_output_across_saving(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + + expected_output = [[0.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 0.0, 1.0, 1.0]] + + layer_cls = index_lookup.IndexLookup + layer = layer_cls( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_data, + output_mode="multi_hot", + sparse=True, + ) + config = layer.get_config() + layer = layer_cls.from_config(config) + + output = layer(input_array) + self.assertIsInstance(output, tf.SparseTensor) + self.assertAllEqual(tf.sparse.to_dense(output), expected_output) + + +class EagerExecutionDisabled( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_lookup(self): + # We need this test for model_to_estimator followed by + # export_saved_model, which will call the layer in a legacy session. + # This could also happen directly if a user calls disable_v2_behavior or + # disable_eager_execution. + with tf.compat.v1.Session(): + with test_utils.run_eagerly_scope(False): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array(["earth", "wind", "and", "fire"]) + expected_output = [1, 2, 3, 4] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = index_lookup.IndexLookup( + max_tokens=None, + num_oov_indices=1, + mask_token=None, + oov_token="[OOV]", + vocabulary_dtype=tf.string, + vocabulary=vocab_data, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + # In a TF1 session the user will need to make sure all tables + # are initialized themselves. + tf.compat.v1.tables_initializer().run() + output_dataset = model(input_array) + self.assertAllEqual(output_dataset, expected_output) if __name__ == "__main__": - # IndexLookup is only exported as a TF2 API. - tf.compat.v1.enable_v2_behavior() - tf.test.main() + # IndexLookup is only exported as a TF2 API. 
+ tf.compat.v1.enable_v2_behavior() + tf.test.main() diff --git a/keras/layers/preprocessing/integer_lookup.py b/keras/layers/preprocessing/integer_lookup.py index b24c32daa78f..78601201f63f 100644 --- a/keras/layers/preprocessing/integer_lookup.py +++ b/keras/layers/preprocessing/integer_lookup.py @@ -14,13 +14,14 @@ # ============================================================================== """Keras string lookup preprocessing layer.""" -# pylint: disable=g-classes-have-attributes +import numpy as np +import tensorflow.compat.v2 as tf from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import index_lookup -import numpy as np -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export @@ -28,405 +29,435 @@ @keras_export( "keras.layers.IntegerLookup", "keras.layers.experimental.preprocessing.IntegerLookup", - v1=[]) + v1=[], +) class IntegerLookup(index_lookup.IndexLookup): - """A preprocessing layer which maps integer features to contiguous ranges. - - This layer maps a set of arbitrary integer input tokens into indexed - integer output via a table-based vocabulary lookup. The layer's output indices - will be contiguously arranged up to the maximum vocab size, even if the input - tokens are non-continguous or unbounded. The layer supports multiple options - for encoding the output via `output_mode`, and has optional support for - out-of-vocabulary (OOV) tokens and masking. - - The vocabulary for the layer must be either supplied on construction or - learned via `adapt()`. During `adapt()`, the layer will analyze a data set, - determine the frequency of individual integer tokens, and create a vocabulary - from them. If the vocabulary is capped in size, the most frequent tokens will - be used to create the vocabulary and all others will be treated as OOV. - - There are two possible output modes for the layer. - When `output_mode` is `"int"`, - input integers are converted to their index in the vocabulary (an integer). - When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`, input integers - are encoded into an array where each dimension corresponds to an element in - the vocabulary. - - The vocabulary can optionally contain a mask token as well as an OOV token - (which can optionally occupy multiple indices in the vocabulary, as set - by `num_oov_indices`). - The position of these tokens in the vocabulary is fixed. When `output_mode` is - `"int"`, the vocabulary will begin with the mask token at index 0, followed by - OOV indices, followed by the rest of the vocabulary. When `output_mode` is - `"multi_hot"`, `"count"`, or `"tf_idf"` the vocabulary will begin with OOV - indices and instances of the mask token will be dropped. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - max_tokens: Maximum size of the vocabulary for this layer. This should only - be specified when adapting the vocabulary or when setting - `pad_to_max_tokens=True`. If None, there is no cap on the size of the - vocabulary. Note that this size includes the OOV and mask tokens. Defaults - to None. - num_oov_indices: The number of out-of-vocabulary tokens to use. If this - value is more than 1, OOV inputs are modulated to determine their OOV - value. If this value is 0, OOV inputs will cause an error when calling the - layer. Defaults to 1. 
- mask_token: An integer token that represents masked inputs. When - `output_mode` is `"int"`, the token is included in vocabulary and mapped - to index 0. In other output modes, the token will not appear in the - vocabulary and instances of the mask token in the input will be dropped. - If set to None, no mask term will be added. Defaults to None. - oov_token: Only used when `invert` is True. The token to return for OOV - indices. Defaults to -1. - vocabulary: Optional. Either an array of integers or a string path to a text - file. If passing an array, can pass a tuple, list, 1D numpy array, or 1D - tensor containing the integer vocbulary terms. If passing a file path, the - file should contain one line per term in the vocabulary. If this argument - is set, there is no need to `adapt()` the layer. - vocabulary_dtype: The dtype of the vocabulary terms, for example - `"int64"` or `"int32"`. Defaults to `"int64"`. - idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list, 1D - numpy array, or 1D tensor or the same length as the vocabulary, containing - the floating point inverse document frequency weights, which will be - multiplied by per sample term counts for the final `tf_idf` weight. If the - `vocabulary` argument is set, and `output_mode` is `"tf_idf"`, this - argument must be supplied. - invert: Only valid when `output_mode` is `"int"`. If True, this layer will - map indices to vocabulary items instead of mapping vocabulary items to - indices. Default to False. - output_mode: Specification for the output of the layer. Defaults to `"int"`. - Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or - `"tf_idf"` configuring the layer as follows: - - `"int"`: Return the vocabulary indices of the input tokens. - - `"one_hot"`: Encodes each individual element in the input into an - array the same size as the vocabulary, containing a 1 at the element - index. If the last dimension is size 1, will encode on that dimension. - If the last dimension is not size 1, will append a new dimension for - the encoded output. - - `"multi_hot"`: Encodes each sample in the input into a single array - the same size as the vocabulary, containing a 1 for each vocabulary - term present in the sample. Treats the last dimension as the sample - dimension, if input shape is (..., sample_length), output shape will - be (..., num_tokens). - - `"count"`: As `"multi_hot"`, but the int array contains a count of the - number of times the token at that index appeared in the sample. - - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to - find the value in each token slot. - For `"int"` output, any shape of input and output is supported. For all - other output modes, currently only output up to rank 2 is supported. - pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`, - `"count"`, or `"tf_idf"`. If True, the output will have its feature axis - padded to `max_tokens` even if the number of unique tokens in the - vocabulary is less than max_tokens, resulting in a tensor of shape - [batch_size, max_tokens] regardless of vocabulary size. Defaults to False. - sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`, - `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a - dense `Tensor`. Defaults to False. - - Examples: - - **Creating a lookup layer with a known vocabulary** - - This example creates a lookup layer with a pre-existing vocabulary. 
- - >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) # Note OOV tokens - >>> layer = tf.keras.layers.IntegerLookup(vocabulary=vocab) - >>> layer(data) - - - **Creating a lookup layer with an adapted vocabulary** - - This example creates a lookup layer and generates the vocabulary by analyzing - the dataset. - - >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) - >>> layer = tf.keras.layers.IntegerLookup() - >>> layer.adapt(data) - >>> layer.get_vocabulary() - [-1, 42, 1138, 1000, 36, 12] - - Note that the OOV token -1 have been added to the vocabulary. The remaining - tokens are sorted by frequency (42, which has 2 occurrences, is first) then - by inverse sort order. - - >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) - >>> layer = tf.keras.layers.IntegerLookup() - >>> layer.adapt(data) - >>> layer(data) - - - - **Lookups with multiple OOV indices** - - This example demonstrates how to use a lookup layer with multiple OOV indices. - When a layer is created with more than one OOV index, any OOV tokens are - hashed into the number of OOV buckets, distributing OOV tokens in a - deterministic fashion across the set. - - >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42], [37, 1000, 36]]) - >>> layer = tf.keras.layers.IntegerLookup(vocabulary=vocab, num_oov_indices=2) - >>> layer(data) - - - Note that the output for OOV token 37 is 1, while the output for OOV token - 1000 is 0. The in-vocab terms have their output index increased by 1 from - earlier examples (12 maps to 2, etc) in order to make space for the extra OOV - token. - - **One-hot output** - - Configure the layer with `output_mode='one_hot'`. Note that the first - `num_oov_indices` dimensions in the ont_hot encoding represent OOV values. - - >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([12, 36, 1138, 42, 7]) # Note OOV tokens - >>> layer = tf.keras.layers.IntegerLookup( - ... vocabulary=vocab, output_mode='one_hot') - >>> layer(data) - - - **Multi-hot output** - - Configure the layer with `output_mode='multi_hot'`. Note that the first - `num_oov_indices` dimensions in the multi_hot encoding represent OOV tokens - - >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens - >>> layer = tf.keras.layers.IntegerLookup( - ... vocabulary=vocab, output_mode='multi_hot') - >>> layer(data) - - - **Token count output** - - Configure the layer with `output_mode='count'`. As with multi_hot output, the - first `num_oov_indices` dimensions in the output represent OOV tokens. - - >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens - >>> layer = tf.keras.layers.IntegerLookup( - ... vocabulary=vocab, output_mode='count') - >>> layer(data) - - - **TF-IDF output** - - Configure the layer with `output_mode='tf_idf'`. As with multi_hot output, the - first `num_oov_indices` dimensions in the output represent OOV tokens. - - Each token bin will output `token_count * idf_weight`, where the idf weights - are the inverse document frequency weights per token. These should be provided - along with the vocabulary. Note that the `idf_weight` for OOV tokens will - default to the average of all idf weights passed in. - - >>> vocab = [12, 36, 1138, 42] - >>> idf_weights = [0.25, 0.75, 0.6, 0.4] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens - >>> layer = tf.keras.layers.IntegerLookup( - ... 
output_mode='tf_idf', vocabulary=vocab, idf_weights=idf_weights) - >>> layer(data) - - - To specify the idf weights for oov tokens, you will need to pass the entire - vocabularly including the leading oov token. - - >>> vocab = [-1, 12, 36, 1138, 42] - >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens - >>> layer = tf.keras.layers.IntegerLookup( - ... output_mode='tf_idf', vocabulary=vocab, idf_weights=idf_weights) - >>> layer(data) - - - When adapting the layer in tf_idf mode, each input sample will be considered a - document, and idf weight per token will be calculated as - `log(1 + num_documents / (1 + token_document_count))`. - - **Inverse lookup** - - This example demonstrates how to map indices to tokens using this layer. (You - can also use `adapt()` with `inverse=True`, but for simplicity we'll pass the - vocab in this example.) - - >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[1, 3, 4], [4, 0, 2]]) - >>> layer = tf.keras.layers.IntegerLookup(vocabulary=vocab, invert=True) - >>> layer(data) - - - Note that the first index correspond to the oov token by default. - - - **Forward and inverse lookup pairs** - - This example demonstrates how to use the vocabulary of a standard lookup - layer to create an inverse lookup layer. - - >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) - >>> layer = tf.keras.layers.IntegerLookup(vocabulary=vocab) - >>> i_layer = tf.keras.layers.IntegerLookup( - ... vocabulary=layer.get_vocabulary(), invert=True) - >>> int_data = layer(data) - >>> i_layer(int_data) - - - In this example, the input token 1000 resulted in an output of -1, since - 1000 was not in the vocabulary - it got represented as an OOV, and all OOV - tokens are returned as -1 in the inverse layer. Also, note that for the - inverse to work, you must have already set the forward layer vocabulary - either directly or via `adapt()` before calling `get_vocabulary()`. - """ - - def __init__(self, - max_tokens=None, - num_oov_indices=1, - mask_token=None, - oov_token=-1, - vocabulary=None, - vocabulary_dtype="int64", - idf_weights=None, - invert=False, - output_mode="int", - sparse=False, - pad_to_max_tokens=False, - **kwargs): - if not tf.dtypes.as_dtype(vocabulary_dtype).is_integer: - raise ValueError("`vocabulary_dtype` must be an integer dtype. " - f"Received: {vocabulary_dtype}") - - # Legacy versions of the IntegerLookup layer set layer dtype to int64, - # instead of the output type. If we see this and output mode is not "int", - # clear the setting so we don't switch types for old SavedModels. - if output_mode != "int" and "dtype" in kwargs and ( - kwargs["dtype"] == tf.int64 or kwargs["dtype"] == "int64"): - del kwargs["dtype"] - - # Support deprecated args for this layer. - if "max_values" in kwargs: - logging.log_first_n(logging.WARN, - "max_values is deprecated, use max_tokens instead.", - 1) - max_tokens = kwargs["max_values"] - del kwargs["max_values"] - if "mask_value" in kwargs: - logging.log_first_n(logging.WARN, - "mask_value is deprecated, use mask_token instead.", - 1) - mask_token = kwargs["mask_value"] - del kwargs["mask_value"] - if "oov_value" in kwargs: - logging.log_first_n(logging.WARN, - "oov_value is deprecated, use oov_token instead.", 1) - oov_token = kwargs["oov_value"] - del kwargs["oov_value"] - - # If max_tokens is set, the token must be greater than 1 - otherwise we - # are creating a 0-element vocab, which doesn't make sense. 
- if max_tokens is not None and max_tokens <= 1: - raise ValueError( - f"If `max_tokens` is set for `IntegerLookup`, it must be " - f"greater than 1. Received: max_tokens={max_tokens}.") - - if num_oov_indices < 0: - raise ValueError( - f"The value of `num_oov_indices` argument for `IntegerLookup` " - f"must >= 0. Received num_oov_indices=" - f"{num_oov_indices}.") - - # Make sure mask and oov are of the dtype we want. - mask_token = None if mask_token is None else np.int64(mask_token) - oov_token = None if oov_token is None else np.int64(oov_token) - - super().__init__( - max_tokens=max_tokens, - num_oov_indices=num_oov_indices, - mask_token=mask_token, - oov_token=oov_token, - vocabulary=vocabulary, - vocabulary_dtype=vocabulary_dtype, - idf_weights=idf_weights, - invert=invert, - output_mode=output_mode, - sparse=sparse, - pad_to_max_tokens=pad_to_max_tokens, - **kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell("IntegerLookup").set(True) - - # We override this method solely to generate a docstring. - def adapt(self, data, batch_size=None, steps=None): - """Computes a vocabulary of interger terms from tokens in a dataset. - - Calling `adapt()` on an `IntegerLookup` layer is an alternative to passing - in a precomputed vocabulary on construction via the `vocabulary` argument. - An `IntegerLookup` layer should always be either adapted over a dataset or - supplied with a vocabulary. - - During `adapt()`, the layer will build a vocabulary of all integer tokens - seen in the dataset, sorted by occurance count, with ties broken by sort - order of the tokens (high to low). At the end of `adapt()`, if `max_tokens` - is set, the voculary wil be truncated to `max_tokens` size. For example, - adapting a layer with `max_tokens=1000` will compute the 1000 most frequent - tokens occurring in the input dataset. If `output_mode='tf-idf'`, `adapt()` - will also learn the document frequencies of each token in the input dataset. - - In order to make `StringLookup` efficient in any distribution context, the - vocabulary is kept static with respect to any compiled `tf.Graph`s that - call the layer. As a consequence, if the layer is adapted a second time, - any models using the layer should be re-compiled. For more information - see `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. - - `adapt()` is meant only as a single machine utility to compute layer state. - To analyze a dataset that cannot fit on a single machine, see - [Tensorflow Transform](https://www.tensorflow.org/tfx/transform/get_started) - for a multi-machine, map-reduce solution. - - Arguments: - data: The data to train on. It can be passed either as a - `tf.data.Dataset`, or as a numpy array. - batch_size: Integer or `None`. - Number of samples per state update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - steps: Integer or `None`. - Total number of steps (batches of samples) - When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps' is None, the epoch will run until - the input dataset is exhausted. When passing an infinitely - repeating dataset, you must specify the `steps` argument. This - argument is not supported with array inputs. 
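Reviewer note on the docstring text being reflowed below: both the old and new wording describe the same index layout for `output_mode="int"`, namely mask token at index 0 (when one is set), then `num_oov_indices` OOV slots, then the vocabulary. As a quick sanity check, here is a minimal sketch of that layout, assuming TensorFlow 2.x with the layer exported as `tf.keras.layers.IntegerLookup` (the vocabulary values are illustrative only):

import tensorflow as tf

# Mask token 0 should land at index 0, the two OOV buckets at indices
# 1-2, and the vocabulary [10, 20, 30, 40] at indices 3-6.
layer = tf.keras.layers.IntegerLookup(
    vocabulary=[10, 20, 30, 40],
    mask_token=0,
    num_oov_indices=2,
)
print(layer.get_vocabulary())  # expected: [0, -1, -1, 10, 20, 30, 40]

# OOV inputs (e.g. 99 and 123) are modulated across the two OOV buckets,
# so each maps deterministically to index 1 or 2.
print(layer(tf.constant([[10, 20, 99], [0, 40, 123]])))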
+ """A preprocessing layer which maps integer features to contiguous ranges. + + This layer maps a set of arbitrary integer input tokens into indexed integer + output via a table-based vocabulary lookup. The layer's output indices will + be contiguously arranged up to the maximum vocab size, even if the input + tokens are non-continguous or unbounded. The layer supports multiple options + for encoding the output via `output_mode`, and has optional support for + out-of-vocabulary (OOV) tokens and masking. + + The vocabulary for the layer must be either supplied on construction or + learned via `adapt()`. During `adapt()`, the layer will analyze a data set, + determine the frequency of individual integer tokens, and create a + vocabulary from them. If the vocabulary is capped in size, the most frequent + tokens will be used to create the vocabulary and all others will be treated + as OOV. + + There are two possible output modes for the layer. When `output_mode` is + `"int"`, input integers are converted to their index in the vocabulary (an + integer). When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`, + input integers are encoded into an array where each dimension corresponds to + an element in the vocabulary. + + The vocabulary can optionally contain a mask token as well as an OOV token + (which can optionally occupy multiple indices in the vocabulary, as set + by `num_oov_indices`). + The position of these tokens in the vocabulary is fixed. When `output_mode` + is `"int"`, the vocabulary will begin with the mask token at index 0, + followed by OOV indices, followed by the rest of the vocabulary. When + `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"` the vocabulary will + begin with OOV indices and instances of the mask token will be dropped. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Args: + max_tokens: Maximum size of the vocabulary for this layer. This should + only be specified when adapting the vocabulary or when setting + `pad_to_max_tokens=True`. If None, there is no cap on the size of the + vocabulary. Note that this size includes the OOV and mask tokens. + Defaults to `None`. + num_oov_indices: The number of out-of-vocabulary tokens to use. If this + value is more than 1, OOV inputs are modulated to determine their OOV + value. If this value is 0, OOV inputs will cause an error when calling + the layer. Defaults to `1`. + mask_token: An integer token that represents masked inputs. When + `output_mode` is `"int"`, the token is included in vocabulary and mapped + to index 0. In other output modes, the token will not appear in the + vocabulary and instances of the mask token in the input will be dropped. + If set to None, no mask term will be added. Defaults to `None`. + oov_token: Only used when `invert` is True. The token to return for OOV + indices. Defaults to `-1`. + vocabulary: Optional. Either an array of integers or a string path to a + text file. If passing an array, can pass a tuple, list, 1D numpy array, + or 1D tensor containing the integer vocbulary terms. If passing a file + path, the file should contain one line per term in the vocabulary. If + this argument is set, there is no need to `adapt()` the layer. + vocabulary_dtype: The dtype of the vocabulary terms, for example + `"int64"` or `"int32"`. Defaults to `"int64"`. + idf_weights: Only valid when `output_mode` is `"tf_idf"`. 
A tuple, list, + 1D numpy array, or 1D tensor of the same length as the vocabulary, + containing the floating point inverse document frequency weights, which + will be multiplied by per sample term counts for the final `tf_idf` + weight. If the `vocabulary` argument is set, and `output_mode` is + `"tf_idf"`, this argument must be supplied. + invert: Only valid when `output_mode` is `"int"`. If True, this layer will + map indices to vocabulary items instead of mapping vocabulary items to + indices. Defaults to `False`. + output_mode: Specification for the output of the layer. Values can be + `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` + configuring the layer as follows: + - `"int"`: Return the vocabulary indices of the input tokens. + - `"one_hot"`: Encodes each individual element in the input into an + array the same size as the vocabulary, containing a 1 at the element + index. If the last dimension is size 1, will encode on that + dimension. If the last dimension is not size 1, will append a new + dimension for the encoded output. + - `"multi_hot"`: Encodes each sample in the input into a single array + the same size as the vocabulary, containing a 1 for each vocabulary + term present in the sample. Treats the last dimension as the sample + dimension, if input shape is (..., sample_length), output shape will + be (..., num_tokens). + - `"count"`: As `"multi_hot"`, but the int array contains a count of + the number of times the token at that index appeared in the sample. + - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to + find the value in each token slot. + For `"int"` output, any shape of input and output is supported. For all + other output modes, currently only output up to rank 2 is supported. + Defaults to `"int"`. + pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`, + `"count"`, or `"tf_idf"`. If True, the output will have its feature axis + padded to `max_tokens` even if the number of unique tokens in the + vocabulary is less than max_tokens, resulting in a tensor of shape + [batch_size, max_tokens] regardless of vocabulary size. Defaults to + `False`. + sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`, + `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a + dense `Tensor`. Defaults to `False`. + + Examples: + + **Creating a lookup layer with a known vocabulary** + + This example creates a lookup layer with a pre-existing vocabulary. + + >>> vocab = [12, 36, 1138, 42] + >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) # Note OOV tokens + >>> layer = tf.keras.layers.IntegerLookup(vocabulary=vocab) + >>> layer(data) + <tf.Tensor: shape=(2, 3), dtype=int64, numpy= + array([[1, 3, 4], + [4, 0, 2]])> + + **Creating a lookup layer with an adapted vocabulary** + + This example creates a lookup layer and generates the vocabulary by + analyzing the dataset. + + >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) + >>> layer = tf.keras.layers.IntegerLookup() + >>> layer.adapt(data) + >>> layer.get_vocabulary() + [-1, 42, 1138, 1000, 36, 12] + + Note that the OOV token -1 has been added to the vocabulary. The remaining + tokens are sorted by frequency (42, which has 2 occurrences, is first) then + by inverse sort order. + + >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) + >>> layer = tf.keras.layers.IntegerLookup() + >>> layer.adapt(data) + >>> layer(data) + <tf.Tensor: shape=(2, 3), dtype=int64, numpy= + array([[5, 2, 1], + [1, 3, 4]])> + + **Lookups with multiple OOV + indices** + + This example demonstrates how to use a lookup layer with multiple OOV + indices.
When a layer is created with more than one OOV index, any OOV + tokens are hashed into the number of OOV buckets, distributing OOV tokens in + a deterministic fashion across the set. + + >>> vocab = [12, 36, 1138, 42] + >>> data = tf.constant([[12, 1138, 42], [37, 1000, 36]]) + >>> layer = tf.keras.layers.IntegerLookup( + ... vocabulary=vocab, num_oov_indices=2) + >>> layer(data) + <tf.Tensor: shape=(2, 3), dtype=int64, numpy= + array([[2, 4, 5], + [1, 0, 3]])> + + Note that the output for OOV token 37 is 1, while the output for OOV token + 1000 is 0. The in-vocab terms have their output index increased by 1 from + earlier examples (12 maps to 2, etc) in order to make space for the extra + OOV token. + + **One-hot output** + + Configure the layer with `output_mode='one_hot'`. Note that the first + `num_oov_indices` dimensions in the one_hot encoding represent OOV values. + + >>> vocab = [12, 36, 1138, 42] + >>> data = tf.constant([12, 36, 1138, 42, 7]) # Note OOV tokens + >>> layer = tf.keras.layers.IntegerLookup( + ... vocabulary=vocab, output_mode='one_hot') + >>> layer(data) + <tf.Tensor: shape=(5, 5), dtype=float32, numpy= + array([[0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.], + [0., 0., 0., 1., 0.], + [0., 0., 0., 0., 1.], + [1., 0., 0., 0., 0.]], dtype=float32)> + + **Multi-hot output** + + Configure the layer with `output_mode='multi_hot'`. Note that the first + `num_oov_indices` dimensions in the multi_hot encoding represent OOV tokens. + + >>> vocab = [12, 36, 1138, 42] + >>> data = tf.constant([[12, 1138, 42, 42], + ... [42, 7, 36, 7]]) # Note OOV tokens + >>> layer = tf.keras.layers.IntegerLookup( + ... vocabulary=vocab, output_mode='multi_hot') + >>> layer(data) + <tf.Tensor: shape=(2, 5), dtype=float32, numpy= + array([[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]], dtype=float32)> + + **Token count output** + + Configure the layer with `output_mode='count'`. As with multi_hot output, + the first `num_oov_indices` dimensions in the output represent OOV tokens. + + >>> vocab = [12, 36, 1138, 42] + >>> data = tf.constant([[12, 1138, 42, 42], + ... [42, 7, 36, 7]]) # Note OOV tokens + >>> layer = tf.keras.layers.IntegerLookup( + ... vocabulary=vocab, output_mode='count') + >>> layer(data) + <tf.Tensor: shape=(2, 5), dtype=float32, numpy= + array([[0., 1., 0., 1., 2.], + [2., 0., 1., 0., 1.]], dtype=float32)> + + **TF-IDF output** + + Configure the layer with `output_mode='tf_idf'`. As with multi_hot output, + the first `num_oov_indices` dimensions in the output represent OOV tokens. + + Each token bin will output `token_count * idf_weight`, where the idf weights + are the inverse document frequency weights per token. These should be + provided along with the vocabulary. Note that the `idf_weight` for OOV + tokens will default to the average of all idf weights passed in. + + >>> vocab = [12, 36, 1138, 42] + >>> idf_weights = [0.25, 0.75, 0.6, 0.4] + >>> data = tf.constant([[12, 1138, 42, 42], + ... [42, 7, 36, 7]]) # Note OOV tokens + >>> layer = tf.keras.layers.IntegerLookup( + ... output_mode='tf_idf', vocabulary=vocab, idf_weights=idf_weights) + >>> layer(data) + <tf.Tensor: shape=(2, 5), dtype=float32, numpy= + array([[0. , 0.25, 0. , 0.6 , 0.8 ], + [1. , 0. , 0.75, 0. , 0.4 ]], dtype=float32)> + + To specify the idf weights for oov tokens, you will need to pass the entire + vocabulary including the leading oov token. + + >>> vocab = [-1, 12, 36, 1138, 42] + >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4] + >>> data = tf.constant([[12, 1138, 42, 42], + ... [42, 7, 36, 7]]) # Note OOV tokens + >>> layer = tf.keras.layers.IntegerLookup( + ... output_mode='tf_idf', vocabulary=vocab, idf_weights=idf_weights) + >>> layer(data) + <tf.Tensor: shape=(2, 5), dtype=float32, numpy= + array([[0. , 0.25, 0. , 0.6 , 0.8 ], + [1.8 , 0. , 0.75, 0. , 0.4 ]], dtype=float32)> + + When adapting the layer in tf_idf mode, each input sample will be considered + a document, and idf weight per token will be calculated as + `log(1 + num_documents / (1 + token_document_count))`. + + **Inverse lookup** + + This example demonstrates how to map indices to tokens using this layer. + (You can also use `adapt()` with `invert=True`, but for simplicity we'll + pass the vocab in this example.)
+
+ >>> vocab = [12, 36, 1138, 42]
+ >>> data = tf.constant([[1, 3, 4], [4, 0, 2]])
+ >>> layer = tf.keras.layers.IntegerLookup(vocabulary=vocab, invert=True)
+ >>> layer(data)
+
+
+ Note that the first index corresponds to the OOV token by default.
+
+
+ **Forward and inverse lookup pairs**
+
+ This example demonstrates how to use the vocabulary of a standard lookup
+ layer to create an inverse lookup layer.
+
+ >>> vocab = [12, 36, 1138, 42]
+ >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]])
+ >>> layer = tf.keras.layers.IntegerLookup(vocabulary=vocab)
+ >>> i_layer = tf.keras.layers.IntegerLookup(
+ ... vocabulary=layer.get_vocabulary(), invert=True)
+ >>> int_data = layer(data)
+ >>> i_layer(int_data)
+
+
+ In this example, the input token 1000 resulted in an output of -1, since
+ 1000 was not in the vocabulary; it was represented as an OOV token, and all
+ OOV tokens are returned as -1 in the inverse layer. Also, note that for the
+ inverse to work, you must have already set the forward layer vocabulary
+ either directly or via `adapt()` before calling `get_vocabulary()`.
"""
- super().adapt(data, batch_size=batch_size, steps=steps)
+
+ def __init__(
+ self,
+ max_tokens=None,
+ num_oov_indices=1,
+ mask_token=None,
+ oov_token=-1,
+ vocabulary=None,
+ vocabulary_dtype="int64",
+ idf_weights=None,
+ invert=False,
+ output_mode="int",
+ sparse=False,
+ pad_to_max_tokens=False,
+ **kwargs,
+ ):
+ if not tf.dtypes.as_dtype(vocabulary_dtype).is_integer:
+ raise ValueError(
+ "`vocabulary_dtype` must be an integer dtype. "
+ f"Received: {vocabulary_dtype}"
+ )
+
+ # Legacy versions of the IntegerLookup layer set layer dtype to int64,
+ # instead of the output type. If we see this and output mode is not
+ # "int", clear the setting so we don't switch types for old SavedModels.
+ if (
+ output_mode != "int"
+ and "dtype" in kwargs
+ and (kwargs["dtype"] == tf.int64 or kwargs["dtype"] == "int64")
+ ):
+ del kwargs["dtype"]
+
+ # Support deprecated args for this layer.
+ if "max_values" in kwargs:
+ logging.log_first_n(
+ logging.WARN,
+ "max_values is deprecated, use max_tokens instead.",
+ 1,
+ )
+ max_tokens = kwargs["max_values"]
+ del kwargs["max_values"]
+ if "mask_value" in kwargs:
+ logging.log_first_n(
+ logging.WARN,
+ "mask_value is deprecated, use mask_token instead.",
+ 1,
+ )
+ mask_token = kwargs["mask_value"]
+ del kwargs["mask_value"]
+ if "oov_value" in kwargs:
+ logging.log_first_n(
+ logging.WARN,
+ "oov_value is deprecated, use oov_token instead.",
+ 1,
+ )
+ oov_token = kwargs["oov_value"]
+ del kwargs["oov_value"]
+
+ # If max_tokens is set, the value must be greater than 1; otherwise we
+ # are creating a 0-element vocab, which doesn't make sense.
+ if max_tokens is not None and max_tokens <= 1:
+ raise ValueError(
+ "If `max_tokens` is set for `IntegerLookup`, it must be "
+ f"greater than 1. Received: max_tokens={max_tokens}."
+ )
+
+ if num_oov_indices < 0:
+ raise ValueError(
+ "The value of the `num_oov_indices` argument for "
+ "`IntegerLookup` must be >= 0. Received num_oov_indices="
+ f"{num_oov_indices}."
+ )
+
+ # Make sure mask and oov are of the dtype we want.
+ mask_token = None if mask_token is None else np.int64(mask_token)
+ oov_token = None if oov_token is None else np.int64(oov_token)
+
+ super().__init__(
+ max_tokens=max_tokens,
+ num_oov_indices=num_oov_indices,
+ mask_token=mask_token,
+ oov_token=oov_token,
+ vocabulary=vocabulary,
+ vocabulary_dtype=vocabulary_dtype,
+ idf_weights=idf_weights,
+ invert=invert,
+ output_mode=output_mode,
+ sparse=sparse,
+ pad_to_max_tokens=pad_to_max_tokens,
+ **kwargs,
+ )
+ base_preprocessing_layer.keras_kpl_gauge.get_cell("IntegerLookup").set(
+ True
+ )
+
+ # We override this method solely to generate a docstring.
+ def adapt(self, data, batch_size=None, steps=None):
+ """Computes a vocabulary of integer terms from tokens in a dataset.
+
+ Calling `adapt()` on an `IntegerLookup` layer is an alternative to
+ passing in a precomputed vocabulary on construction via the
+ `vocabulary` argument. An `IntegerLookup` layer should always be either
+ adapted over a dataset or supplied with a vocabulary.
+
+ During `adapt()`, the layer will build a vocabulary of all integer
+ tokens seen in the dataset, sorted by occurrence count, with ties broken
+ by sort order of the tokens (high to low). At the end of `adapt()`, if
+ `max_tokens` is set, the vocabulary will be truncated to `max_tokens`
+ size. For example, adapting a layer with `max_tokens=1000` will compute
+ the 1000 most frequent tokens occurring in the input dataset. If
+ `output_mode='tf_idf'`, `adapt()` will also learn the document
+ frequencies of each token in the input dataset.
+
+ In order to make `IntegerLookup` efficient in any distribution context,
+ the vocabulary is kept static with respect to any compiled `tf.Graph`s
+ that call the layer. As a consequence, if the layer is adapted a second
+ time, any models using the layer should be re-compiled. For more
+ information see
+ `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`.
+
+ `adapt()` is meant only as a single-machine utility to compute layer
+ state. To analyze a dataset that cannot fit on a single machine, see
+ [Tensorflow Transform](
+ https://www.tensorflow.org/tfx/transform/get_started) for a
+ multi-machine, map-reduce solution.
+
+ Arguments:
+ data: The data to train on. It can be passed either as a
+ `tf.data.Dataset`, or as a numpy array.
+ batch_size: Integer or `None`.
+ Number of samples per state update.
+ If unspecified, `batch_size` will default to 32.
+ Do not specify the `batch_size` if your data is in the
+ form of datasets, generators, or `keras.utils.Sequence` instances
+ (since they generate batches).
+ steps: Integer or `None`.
+ Total number of steps (batches of samples).
+ When training with input tensors such as
+ TensorFlow data tensors, the default `None` is equal to
+ the number of samples in your dataset divided by
+ the batch size, or 1 if that cannot be determined. If `data` is a
+ `tf.data` dataset, and `steps` is None, the epoch will run until
+ the input dataset is exhausted. When passing an infinitely
+ repeating dataset, you must specify the `steps` argument. This
+ argument is not supported with array inputs.
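As a minimal sketch of the adapt-then-truncate behavior described above (not taken from the patch; `max_tokens=4` is an illustrative choice, and the expected vocabulary follows from the frequency-then-descending-value ordering):

import numpy as np
import tensorflow as tf

# 42 occurs twice; every other token occurs once.
data = np.array([[12, 1138, 42], [42, 1000, 36]], dtype=np.int64)
layer = tf.keras.layers.IntegerLookup(max_tokens=4)
layer.adapt(data)
# OOV token first, then 42 (most frequent), then ties broken by
# descending token value, truncated to max_tokens=4.
print(layer.get_vocabulary())  # expected: [-1, 42, 1138, 1000]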
+ """ + super().adapt(data, batch_size=batch_size, steps=steps) diff --git a/keras/layers/preprocessing/integer_lookup_test.py b/keras/layers/preprocessing/integer_lookup_test.py index 17f29b77a9bf..4a06475880cb 100644 --- a/keras/layers/preprocessing/integer_lookup_test.py +++ b/keras/layers/preprocessing/integer_lookup_test.py @@ -14,605 +14,673 @@ # ============================================================================== """Tests for Keras text vectorization preprocessing layer.""" -import tensorflow.compat.v2 as tf - import gc import itertools import os import random -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.layers.preprocessing import integer_lookup from keras.layers.preprocessing import preprocessing_test_utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils def _get_end_to_end_test_cases(): - test_cases = ( - { - "testcase_name": - "test_ints_soft_vocab_cap", - # Create an array where 1138 is the most frequent term, followed by - # 1729, then 725, then 42. This ensures that the vocab accumulator - # is sorting by frequency. - "vocab_data": - np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], - [1729], [725], [725]], - dtype=np.int64), - "input_data": - np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], - dtype=np.int64), - "kwargs": { - "max_tokens": None, - "dtype": tf.int64, - }, - "expected_output": [[1], [2], [3], [4], [4], [3], [1], [0]], - "input_dtype": - tf.int64 - },) - - crossed_test_cases = [] - # Cross above test cases with use_dataset in (True, False) - for use_dataset in (True, False): - for case in test_cases: - case = case.copy() - if use_dataset: - case["testcase_name"] = case["testcase_name"] + "_with_dataset" - case["use_dataset"] = use_dataset - crossed_test_cases.append(case) - - return crossed_test_cases + test_cases = ( + { + "testcase_name": "test_ints_soft_vocab_cap", + # Create an array where 1138 is the most frequent term, followed by + # 1729, then 725, then 42. This ensures that the vocab accumulator + # is sorting by frequency. 
+ "vocab_data": np.array( + [ + [42], + [1138], + [1138], + [1138], + [1138], + [1729], + [1729], + [1729], + [725], + [725], + ], + dtype=np.int64, + ), + "input_data": np.array( + [[1138], [1729], [725], [42], [42], [725], [1138], [4]], + dtype=np.int64, + ), + "kwargs": { + "max_tokens": None, + "dtype": tf.int64, + }, + "expected_output": [[1], [2], [3], [4], [4], [3], [1], [0]], + "input_dtype": tf.int64, + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases @test_combinations.run_all_keras_modes(always_skip_v1=True) -class IntegerLookupLayerTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - @parameterized.named_parameters(*_get_end_to_end_test_cases()) - def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, - use_dataset, expected_output, - input_dtype): - cls = integer_lookup.IntegerLookup - expected_output_dtype = tf.int64 - input_shape = input_data.shape - - if use_dataset: - # Keras APIs expect batched datasets. - # TODO(rachelim): `model.predict` predicts the result on each - # dataset batch separately, then tries to concatenate the results - # together. When the results have different shapes on the non-concat - # axis (which can happen in the output_mode = INT case for - # IntegerLookup), the concatenation fails. In real use cases, this may - # not be an issue because users are likely to pipe the preprocessing layer - # into other keras layers instead of predicting it directly. A workaround - # for these unit tests is to have the dataset only contain one batch, so - # no concatenation needs to happen with the result. For consistency with - # numpy input, we should make `predict` join differently shaped results - # together sensibly, with 0 padding. - input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( - input_shape[0]) - vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( - input_shape[0]) - - output_data = test_utils.layer_test( - cls, - kwargs=kwargs, - input_shape=input_shape, - input_data=input_data, - input_dtype=input_dtype, - expected_output_dtype=expected_output_dtype, - validate_training=False, - adapt_data=vocab_data) - self.assertAllClose(expected_output, output_data) - - def test_layer_with_list_input(self): - vocab = [12, 36, 1138, 42] - data = [[12, 1138, 42], [42, 1000, 36]] # Note OOV tokens - layer = integer_lookup.IntegerLookup(vocabulary=vocab) - output = layer(data) - expected_output = np.array([[1, 3, 4], [4, 0, 2]]) - self.assertEqual(output.numpy().tolist(), expected_output.tolist()) +class IntegerLookupLayerTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt( + self, + vocab_data, + input_data, + kwargs, + use_dataset, + expected_output, + input_dtype, + ): + cls = integer_lookup.IntegerLookup + expected_output_dtype = tf.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. 
When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # IntegerLookup), the concatenation fails. In real use cases, this + # may not be an issue because users are likely to pipe the + # preprocessing layer into other keras layers instead of predicting + # it directly. A workaround for these unit tests is to have the + # dataset only contain one batch, so no concatenation needs to + # happen with the result. For consistency with numpy input, we + # should make `predict` join differently shaped results together + # sensibly, with 0 padding. + input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( + input_shape[0] + ) + vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0] + ) + + output_data = test_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data, + ) + self.assertAllClose(expected_output, output_data) + + def test_layer_with_list_input(self): + vocab = [12, 36, 1138, 42] + data = [[12, 1138, 42], [42, 1000, 36]] # Note OOV tokens + layer = integer_lookup.IntegerLookup(vocabulary=vocab) + output = layer(data) + expected_output = np.array([[1, 3, 4], [4, 0, 2]]) + self.assertEqual(output.numpy().tolist(), expected_output.tolist()) @test_combinations.run_all_keras_modes(always_skip_v1=True) class CategoricalEncodingInputTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_sparse_int_input(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.SparseTensor( - indices=[[0, 0], [1, 2]], - values=np.array([13, 32], dtype=np.int64), - dense_shape=[3, 4]) - - expected_indices = [[0, 0], [1, 2]] - expected_values = [4, 0] - expected_dense_shape = [3, 4] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - layer = integer_lookup.IntegerLookup(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array, steps=1) - self.assertAllEqual(expected_indices, output_data.indices) - self.assertAllEqual(expected_values, output_data.values) - self.assertAllEqual(expected_dense_shape, output_data.dense_shape) - - def test_ragged_int_input(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 42]], - dtype=np.int64) - expected_output = [[1, 2, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) - layer = integer_lookup.IntegerLookup(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4], + ) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [4, 0] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + layer = integer_lookup.IntegerLookup(max_tokens=None) + 
layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.ragged.constant( + [[10, 11, 13], [13, 12, 10, 42]], dtype=np.int64 + ) + expected_output = [[1, 2, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) + layer = integer_lookup.IntegerLookup(max_tokens=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_combinations.run_all_keras_modes(always_skip_v1=True) class CategoricalEncodingMultiOOVTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_sparse_int_input_multi_bucket(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.SparseTensor( - indices=[[0, 0], [1, 2]], - values=np.array([13, 133], dtype=np.int64), - dense_shape=[3, 4]) - - expected_indices = [[0, 0], [1, 2]] - expected_values = [6, 2] - expected_dense_shape = [3, 4] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - layer = integer_lookup.IntegerLookup( - max_tokens=None, - dtype=tf.int64, - num_oov_indices=2, - mask_token=0, - oov_token=-1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array, steps=1) - self.assertAllEqual(expected_indices, output_data.indices) - self.assertAllEqual(expected_values, output_data.values) - self.assertAllEqual(expected_dense_shape, output_data.dense_shape) - - def test_ragged_int_input_multi_bucket(self): - vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = tf.ragged.constant([[10, 11, 13], [13, 12, 10, 133]], - dtype=np.int64) - expected_output = [[2, 3, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) - layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=2) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_sparse_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 133], dtype=np.int64), + dense_shape=[3, 4], + ) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 2] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) + layer = integer_lookup.IntegerLookup( + max_tokens=None, + dtype=tf.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + 
self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = tf.ragged.constant( + [[10, 11, 13], [13, 12, 10, 133]], dtype=np.int64 + ) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) + layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_combinations.run_all_keras_modes(always_skip_v1=True) class CategoricalEncodingAdaptTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_sparse_adapt(self): - vocab_data = tf.SparseTensor( - indices=[[0, 0], [0, 1], [1, 2]], - values=[203, 1729, 203], - dense_shape=[3, 4]) - vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) - - layer = integer_lookup.IntegerLookup() - layer.adapt(vocab_dataset) - expected_vocabulary = [-1, 203, 1729] - self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) - - def test_ragged_adapt(self): - vocab_data = tf.ragged.constant([[203], [1729, 203]]) - vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) - - layer = integer_lookup.IntegerLookup() - layer.adapt(vocab_dataset) - expected_vocabulary = [-1, 203, 1729] - self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) - - def test_single_int_generator_dataset(self): - - def word_gen(): - for _ in itertools.count(1): - yield random.randint(0, 100) - - ds = tf.data.Dataset.from_generator(word_gen, tf.int64, tf.TensorShape([])) - batched_ds = ds.take(2) - input_t = keras.Input(shape=(), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - max_tokens=10, num_oov_indices=0, mask_token=None, oov_token=None) - _ = layer(input_t) - layer.adapt(batched_ds) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_sparse_adapt(self): + vocab_data = tf.SparseTensor( + indices=[[0, 0], [0, 1], [1, 2]], + values=[203, 1729, 203], + dense_shape=[3, 4], + ) + vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) + + layer = integer_lookup.IntegerLookup() + layer.adapt(vocab_dataset) + expected_vocabulary = [-1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_ragged_adapt(self): + vocab_data = tf.ragged.constant([[203], [1729, 203]]) + vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) + + layer = integer_lookup.IntegerLookup() + layer.adapt(vocab_dataset) + expected_vocabulary = [-1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_single_int_generator_dataset(self): + def word_gen(): + for _ in itertools.count(1): + yield random.randint(0, 100) + + ds = tf.data.Dataset.from_generator( + word_gen, tf.int64, tf.TensorShape([]) + ) + batched_ds = ds.take(2) + input_t = keras.Input(shape=(), dtype=tf.int64) + layer = integer_lookup.IntegerLookup( + max_tokens=10, num_oov_indices=0, mask_token=None, oov_token=None + ) + _ = layer(input_t) + layer.adapt(batched_ds) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class IntegerLookupOutputTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_int_output(self): - vocab_data = [42, 
1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup() - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_output_shape(self): - input_data = keras.Input(shape=(4,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(max_tokens=2, num_oov_indices=1) - int_data = layer(input_data) - self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) - - def test_int_output_with_mask(self): - vocab_data = [42, 1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(max_tokens=None, mask_token=0) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_explicit_vocab(self): - vocab_data = [42, 1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, - max_tokens=None, - ) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_explicit_vocab_with_special_tokens(self): - vocab_data = [0, -1, 42, 1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, - max_tokens=None, - mask_token=0, - ) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_no_oov(self): - vocab_data = [42, 1138, 725, 1729] - valid_input = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 0]]) - invalid_input = np.array([[42, 1138, 725, 203], [1729, 725, 42, 203]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, mask_token=0, num_oov_indices=0) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(valid_input) - self.assertAllEqual(expected_output, output_data) - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - "found OOV values.*203"): - _ = model.predict(invalid_input) - - def test_inverse_output(self): - vocab_data = [-1, 42, 1138, 725, 1729] - input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]]) - expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]]) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(invert=True) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = 
keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_forward_backward_explicit_vocab(self): - vocab_data = [42, 1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]]) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(vocabulary=vocab_data) - inverse_layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, invert=True) - int_data = layer(input_data) - inverse_data = inverse_layer(int_data) - model = keras.Model(inputs=input_data, outputs=inverse_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_forward_backward_adapted_vocab(self): - adapt_data = [42, 1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]]) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup() - layer.adapt(adapt_data) - inverse_layer = integer_lookup.IntegerLookup( - vocabulary=layer.get_vocabulary(), invert=True) - int_data = layer(input_data) - inverse_data = inverse_layer(int_data) - model = keras.Model(inputs=input_data, outputs=inverse_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) +class IntegerLookupOutputTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_int_output(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup() + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_output_shape(self): + input_data = keras.Input(shape=(4,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup(max_tokens=2, num_oov_indices=1) + int_data = layer(input_data) + self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) + + def test_int_output_with_mask(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup(max_tokens=None, mask_token=0) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup( + vocabulary=vocab_data, + max_tokens=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def 
test_int_output_explicit_vocab_with_special_tokens(self): + vocab_data = [0, -1, 42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup( + vocabulary=vocab_data, + max_tokens=None, + mask_token=0, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_no_oov(self): + vocab_data = [42, 1138, 725, 1729] + valid_input = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 0]]) + invalid_input = np.array([[42, 1138, 725, 203], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup( + vocabulary=vocab_data, mask_token=0, num_oov_indices=0 + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(valid_input) + self.assertAllEqual(expected_output, output_data) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "found OOV values.*203" + ): + _ = model.predict(invalid_input) + + def test_inverse_output(self): + vocab_data = [-1, 42, 1138, 725, 1729] + input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]]) + expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]]) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup(invert=True) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_forward_backward_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]]) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup(vocabulary=vocab_data) + inverse_layer = integer_lookup.IntegerLookup( + vocabulary=vocab_data, invert=True + ) + int_data = layer(input_data) + inverse_data = inverse_layer(int_data) + model = keras.Model(inputs=input_data, outputs=inverse_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_forward_backward_adapted_vocab(self): + adapt_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]]) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup() + layer.adapt(adapt_data) + inverse_layer = integer_lookup.IntegerLookup( + vocabulary=layer.get_vocabulary(), invert=True + ) + int_data = layer(input_data) + inverse_data = inverse_layer(int_data) + model = keras.Model(inputs=input_data, outputs=inverse_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_combinations.run_all_keras_modes(always_skip_v1=True) class IntegerLookupVocabularyTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - 
with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(str(vocab) + "\n") - writer.flush() - writer.close() - return vocab_path - - def test_int_output_explicit_vocab(self): - vocab_data = [42, 1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(vocabulary=vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_no_vocab(self): - with self.assertRaisesRegex(RuntimeError, - "you must set the layer's vocabulary"): - layer = integer_lookup.IntegerLookup(output_mode="binary") - layer([[1]]) - - def test_one_hot_output(self): - vocab_data = [2, 3, 4, 5] - input_array = np.array([2, 3, 4, 5, 6]) - expected_output = [ - [0, 1, 0, 0, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1], - [1, 0, 0, 0, 0], - ] - - input_data = keras.Input(shape=(1,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, output_mode="one_hot") - res = layer(input_data) - model = keras.Model(inputs=input_data, outputs=res) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_multi_hot_output(self): - vocab_data = [2, 3, 4, 5] - input_array = np.array([[2, 2, 3, 4], [0, 1, 5, 2]]) - expected_output = [[0, 1, 1, 1, 0], [1, 1, 0, 0, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, output_mode="multi_hot") - res = layer(input_data) - model = keras.Model(inputs=input_data, outputs=res) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_count_output(self): - vocab_data = [2, 3, 4, 5] - input_array = np.array([[2, 2, 3, 4], [0, 1, 5, 6]]) - expected_output = [[0, 2, 1, 1, 0], [3, 0, 0, 0, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, output_mode="count") - res = layer(input_data) - model = keras.Model(inputs=input_data, outputs=res) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_sparse_output(self): - vocab_data = [2, 3, 4, 5] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_data, output_mode="multi_hot", sparse=True) - res = layer(input_data) - self.assertTrue(res.__class__.__name__, "SparseKerasTensor") - - def test_get_vocab_returns_int(self): - vocab_data = [42, 1138, 725, 1729] - expected_vocab = [-1, 42, 1138, 725, 1729] - layer = integer_lookup.IntegerLookup(vocabulary=vocab_data) - layer_vocab = layer.get_vocabulary() - self.assertAllEqual(expected_vocab, layer_vocab) - self.assertIsInstance(layer_vocab[0], np.int64) - - def test_int_output_explicit_vocab_from_file(self): - vocab_list = [42, 1138, 725, 1729] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(vocabulary=vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - 
output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_inverted_vocab_from_file(self): - vocab_list = [42, 1138, 725, 1729] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]]) - expected_output = [[42, 1138, 725, 1729], [1729, 725, 42, -1]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(vocabulary=vocab_path, invert=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_inverted_vocab_from_file_with_mask(self): - vocab_list = [42, 1138, 725, 1729] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]]) - expected_output = [[42, 1138, 725, 1729], [1729, 725, 42, -10]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup( - vocabulary=vocab_path, invert=True, mask_value=-10) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_explicit_vocab_from_file_via_setter(self): - vocab_list = [42, 1138, 725, 1729] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup() - layer.set_vocabulary(vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_non_unique_vocab_fails(self): - vocab_data = [42, 1138, 725, 1729, 1729] - with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"): - _ = integer_lookup.IntegerLookup(vocabulary=vocab_data) - - def test_non_unique_vocab_from_file_fails(self): - vocab_list = [42, 1138, 725, 1729, 42] - vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) - with self.assertRaisesRegex( - tf.errors.FailedPreconditionError, - ".*HashTable has different value for same key.*42.*"): - _ = integer_lookup.IntegerLookup(vocabulary=vocab_path) - - def test_tensor_vocab(self): - vocab_data = [-1, 42, 1138, 725, 1729] - vocab_tensor = tf.constant(vocab_data, tf.int64) - layer = integer_lookup.IntegerLookup(vocabulary=vocab_tensor) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - self.assertAllEqual(layer.vocabulary_size(), 5) - fn = tf.function(lambda: layer.set_vocabulary(vocab_tensor)) - with self.assertRaisesRegex(RuntimeError, "Cannot set a tensor vocabulary"): - fn() + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(str(vocab) + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output 
= [[1, 2, 3, 4], [4, 3, 1, 0]]
+
+ input_data = keras.Input(shape=(None,), dtype=tf.int64)
+ layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
+ int_data = layer(input_data)
+ model = keras.Model(inputs=input_data, outputs=int_data)
+ output_dataset = model.predict(input_array)
+ self.assertAllEqual(expected_output, output_dataset)
+
+ def test_no_vocab(self):
+ with self.assertRaisesRegex(
+ RuntimeError, "you must set the layer's vocabulary"
+ ):
+ layer = integer_lookup.IntegerLookup(output_mode="binary")
+ layer([[1]])
+
+ def test_one_hot_output(self):
+ vocab_data = [2, 3, 4, 5]
+ input_array = np.array([2, 3, 4, 5, 6])
+ expected_output = [
+ [0, 1, 0, 0, 0],
+ [0, 0, 1, 0, 0],
+ [0, 0, 0, 1, 0],
+ [0, 0, 0, 0, 1],
+ [1, 0, 0, 0, 0],
+ ]
+
+ input_data = keras.Input(shape=(1,), dtype=tf.int64)
+ layer = integer_lookup.IntegerLookup(
+ vocabulary=vocab_data, output_mode="one_hot"
+ )
+ res = layer(input_data)
+ model = keras.Model(inputs=input_data, outputs=res)
+ output_data = model.predict(input_array)
+ self.assertAllEqual(expected_output, output_data)
+
+ def test_multi_hot_output(self):
+ vocab_data = [2, 3, 4, 5]
+ input_array = np.array([[2, 2, 3, 4], [0, 1, 5, 2]])
+ expected_output = [[0, 1, 1, 1, 0], [1, 1, 0, 0, 1]]
+
+ input_data = keras.Input(shape=(None,), dtype=tf.int64)
+ layer = integer_lookup.IntegerLookup(
+ vocabulary=vocab_data, output_mode="multi_hot"
+ )
+ res = layer(input_data)
+ model = keras.Model(inputs=input_data, outputs=res)
+ output_data = model.predict(input_array)
+ self.assertAllEqual(expected_output, output_data)
+
+ def test_count_output(self):
+ vocab_data = [2, 3, 4, 5]
+ input_array = np.array([[2, 2, 3, 4], [0, 1, 5, 6]])
+ expected_output = [[0, 2, 1, 1, 0], [3, 0, 0, 0, 1]]
+
+ input_data = keras.Input(shape=(None,), dtype=tf.int64)
+ layer = integer_lookup.IntegerLookup(
+ vocabulary=vocab_data, output_mode="count"
+ )
+ res = layer(input_data)
+ model = keras.Model(inputs=input_data, outputs=res)
+ output_data = model.predict(input_array)
+ self.assertAllEqual(expected_output, output_data)
+
+ def test_sparse_output(self):
+ vocab_data = [2, 3, 4, 5]
+
+ input_data = keras.Input(shape=(None,), dtype=tf.int64)
+ layer = integer_lookup.IntegerLookup(
+ vocabulary=vocab_data, output_mode="multi_hot", sparse=True
+ )
+ res = layer(input_data)
+ self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
+
+ def test_get_vocab_returns_int(self):
+ vocab_data = [42, 1138, 725, 1729]
+ expected_vocab = [-1, 42, 1138, 725, 1729]
+ layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
+ layer_vocab = layer.get_vocabulary()
+ self.assertAllEqual(expected_vocab, layer_vocab)
+ self.assertIsInstance(layer_vocab[0], np.int64)
+
+ def test_int_output_explicit_vocab_from_file(self):
+ vocab_list = [42, 1138, 725, 1729]
+ vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
+
+ input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
+ expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
+
+ input_data = keras.Input(shape=(None,), dtype=tf.int64)
+ layer = integer_lookup.IntegerLookup(vocabulary=vocab_path)
+ int_data = layer(input_data)
+ model = keras.Model(inputs=input_data, outputs=int_data)
+ output_dataset = model.predict(input_array)
+ self.assertAllEqual(expected_output, output_dataset)
+
+ def test_int_output_inverted_vocab_from_file(self):
+ vocab_list = [42, 1138, 725, 1729]
+ vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
+
+ input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
+ 
expected_output = [[42, 1138, 725, 1729], [1729, 725, 42, -1]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup(vocabulary=vocab_path, invert=True) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_inverted_vocab_from_file_with_mask(self): + vocab_list = [42, 1138, 725, 1729] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]]) + expected_output = [[42, 1138, 725, 1729], [1729, 725, 42, -10]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup( + vocabulary=vocab_path, invert=True, mask_value=-10 + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_explicit_vocab_from_file_via_setter(self): + vocab_list = [42, 1138, 725, 1729] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup() + layer.set_vocabulary(vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_unique_vocab_fails(self): + vocab_data = [42, 1138, 725, 1729, 1729] + with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"): + _ = integer_lookup.IntegerLookup(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = [42, 1138, 725, 1729, 42] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with self.assertRaisesRegex( + tf.errors.FailedPreconditionError, + ".*HashTable has different value for same key.*42.*", + ): + _ = integer_lookup.IntegerLookup(vocabulary=vocab_path) + + def test_tensor_vocab(self): + vocab_data = [-1, 42, 1138, 725, 1729] + vocab_tensor = tf.constant(vocab_data, tf.int64) + layer = integer_lookup.IntegerLookup(vocabulary=vocab_tensor) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + self.assertAllEqual(layer.vocabulary_size(), 5) + fn = tf.function(lambda: layer.set_vocabulary(vocab_tensor)) + with self.assertRaisesRegex( + RuntimeError, "Cannot set a tensor vocabulary" + ): + fn() @test_combinations.run_all_keras_modes(always_skip_v1=True) -class IntegerLookupErrorTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_too_long_vocab_fails_in_single_setting(self): - vocab_data = [42, 1138, 725, 1729] +class IntegerLookupErrorTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_too_long_vocab_fails_in_single_setting(self): + vocab_data = [42, 1138, 725, 1729] - layer = integer_lookup.IntegerLookup(max_tokens=4, num_oov_indices=1) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data) + layer = integer_lookup.IntegerLookup(max_tokens=4, num_oov_indices=1) + with self.assertRaisesRegex( + ValueError, "vocabulary larger than the maximum vocab.*" + ): + 
layer.set_vocabulary(vocab_data) - def test_zero_max_tokens_fails(self): - with self.assertRaisesRegex(ValueError, ".*max_tokens.*"): - _ = integer_lookup.IntegerLookup(max_tokens=0, num_oov_indices=1) + def test_zero_max_tokens_fails(self): + with self.assertRaisesRegex(ValueError, ".*max_tokens.*"): + _ = integer_lookup.IntegerLookup(max_tokens=0, num_oov_indices=1) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class IntegerLookupSavingTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def tearDown(self): - keras.backend.clear_session() - gc.collect() - super(IntegerLookupSavingTest, self).tearDown() - - def test_vocabulary_persistence_across_saving(self): - vocab_data = [42, 1138, 725, 1729] - input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=1) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - # TODO(b/149526183): Can't clear session when TF2 is disabled. - if tf.__internal__.tf2.enabled(): - keras.backend.clear_session() - - loaded_model = keras.models.load_model( - output_path, - custom_objects={"IntegerLookup": integer_lookup.IntegerLookup}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_dataset = loaded_model.predict(input_array) - self.assertAllEqual(new_output_dataset, expected_output) +class IntegerLookupSavingTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def tearDown(self): + keras.backend.clear_session() + gc.collect() + super(IntegerLookupSavingTest, self).tearDown() + + def test_vocabulary_persistence_across_saving(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=1) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + with self.subTest("keras_v3"): + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving." + ) + + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_model.keras" + ) + model.save(output_path, save_format="keras_v3") + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IntegerLookup": integer_lookup.IntegerLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. 
+ new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + with self.subTest("savedmodel"): + # Save the model to disk. + output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_model" + ) + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + # TODO(b/149526183): Can't clear session when TF2 is disabled. + if tf.__internal__.tf2.enabled(): + keras.backend.clear_session() + + loaded_model = keras.models.load_model( + output_path, + custom_objects={"IntegerLookup": integer_lookup.IntegerLookup}, + ) + + # Ensure that the loaded model is unique + # (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/normalization.py b/keras/layers/preprocessing/normalization.py index 52b25ed56651..c105877d8d64 100644 --- a/keras/layers/preprocessing/normalization.py +++ b/keras/layers/preprocessing/normalization.py @@ -14,335 +14,380 @@ # ============================================================================== """Normalization preprocessing layer.""" -# pylint: disable=g-classes-have-attributes +import numpy as np +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import preprocessing_utils as utils -import numpy as np -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Normalization', - 'keras.layers.experimental.preprocessing.Normalization') +@keras_export( + "keras.layers.Normalization", + "keras.layers.experimental.preprocessing.Normalization", +) class Normalization(base_preprocessing_layer.PreprocessingLayer): - """A preprocessing layer which normalizes continuous features. - - This layer will shift and scale inputs into a distribution centered around - 0 with standard deviation 1. It accomplishes this by precomputing the mean and - variance of the data, and calling `(input - mean) / sqrt(var)` at runtime. - - The mean and variance values for the layer must be either supplied on - construction or learned via `adapt()`. `adapt()` will compute the mean and - variance of the data and store them as the layer's weights. `adapt()` should - be called before `fit()`, `evaluate()`, or `predict()`. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - axis: Integer, tuple of integers, or None. The axis or axes that should - have a separate mean and variance for each index in the shape. For - example, if shape is `(None, 5)` and `axis=1`, the layer will track 5 - separate mean and variance values for the last axis. If `axis` is set to - `None`, the layer will normalize all elements in the input by a scalar - mean and variance. Defaults to -1, where the last axis of the input is - assumed to be a feature dimension and is normalized per index. Note that - in the specific case of batched scalar inputs where the only axis is the - batch axis, the default will normalize each index in the batch - separately. In this case, consider passing `axis=None`. 
- mean: The mean value(s) to use during normalization. The passed value(s) - will be broadcast to the shape of the kept axes above; if the value(s) - cannot be broadcast, an error will be raised when this layer's `build()` - method is called. - variance: The variance value(s) to use during normalization. The passed - value(s) will be broadcast to the shape of the kept axes above; if the - value(s) cannot be broadcast, an error will be raised when this layer's - `build()` method is called. - invert: If True, this layer will apply the inverse transformation - to its inputs: it would turn a normalized input back into its - original form. - - Examples: - - Calculate a global mean and variance by analyzing the dataset in `adapt()`. - - >>> adapt_data = np.array([1., 2., 3., 4., 5.], dtype='float32') - >>> input_data = np.array([1., 2., 3.], dtype='float32') - >>> layer = tf.keras.layers.Normalization(axis=None) - >>> layer.adapt(adapt_data) - >>> layer(input_data) - - - Calculate a mean and variance for each index on the last axis. - - >>> adapt_data = np.array([[0., 7., 4.], - ... [2., 9., 6.], - ... [0., 7., 4.], - ... [2., 9., 6.]], dtype='float32') - >>> input_data = np.array([[0., 7., 4.]], dtype='float32') - >>> layer = tf.keras.layers.Normalization(axis=-1) - >>> layer.adapt(adapt_data) - >>> layer(input_data) - - - Pass the mean and variance directly. - - >>> input_data = np.array([[1.], [2.], [3.]], dtype='float32') - >>> layer = tf.keras.layers.Normalization(mean=3., variance=2.) - >>> layer(input_data) - - - Use the layer to de-normalize inputs (after adapting the layer). - - >>> adapt_data = np.array([[0., 7., 4.], - ... [2., 9., 6.], - ... [0., 7., 4.], - ... [2., 9., 6.]], dtype='float32') - >>> input_data = np.array([[1., 2., 3.]], dtype='float32') - >>> layer = tf.keras.layers.Normalization(axis=-1, invert=True) - >>> layer.adapt(adapt_data) - >>> layer(input_data) - - """ - - def __init__(self, axis=-1, mean=None, variance=None, invert=False, **kwargs): - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell('Normalization').set(True) - - # Standardize `axis` to a tuple. - if axis is None: - axis = () - elif isinstance(axis, int): - axis = (axis,) - else: - axis = tuple(axis) - self.axis = axis - - # Set `mean` and `variance` if passed. - if isinstance(mean, tf.Variable): - raise ValueError('Normalization does not support passing a Variable ' - 'for the `mean` init arg.') - if isinstance(variance, tf.Variable): - raise ValueError('Normalization does not support passing a Variable ' - 'for the `variance` init arg.') - if (mean is not None) != (variance is not None): - raise ValueError( - 'When setting values directly, both `mean` and `variance` ' - 'must be set. Got mean: {} and variance: {}'.format(mean, variance)) - self.input_mean = mean - self.input_variance = variance - self.invert = invert - - def build(self, input_shape): - super().build(input_shape) - - if (isinstance(input_shape, (list, tuple)) and - all(isinstance(shape, tf.TensorShape) for shape in input_shape)): - raise ValueError('Normalization only accepts a single input. If you are ' - 'passing a python list or tuple as a single input, ' - 'please convert to a numpy array or `tf.Tensor`.') - - input_shape = tf.TensorShape(input_shape).as_list() - ndim = len(input_shape) - - if any(a < -ndim or a >= ndim for a in self.axis): - raise ValueError('All `axis` values must be in the range [-ndim, ndim). 
' - 'Found ndim: `{}`, axis: {}'.format(ndim, self.axis)) - - # Axes to be kept, replacing negative values with positive equivalents. - # Sorted to avoid transposing axes. - self._keep_axis = sorted([d if d >= 0 else d + ndim for d in self.axis]) - # All axes to be kept should have known shape. - for d in self._keep_axis: - if input_shape[d] is None: - raise ValueError( - 'All `axis` values to be kept must have known shape. Got axis: {}, ' - 'input shape: {}, with unknown axis at index: {}'.format( - self.axis, input_shape, d)) - # Axes to be reduced. - self._reduce_axis = [d for d in range(ndim) if d not in self._keep_axis] - # 1 if an axis should be reduced, 0 otherwise. - self._reduce_axis_mask = [ - 0 if d in self._keep_axis else 1 for d in range(ndim) - ] - # Broadcast any reduced axes. - self._broadcast_shape = [ - input_shape[d] if d in self._keep_axis else 1 for d in range(ndim) - ] - mean_and_var_shape = tuple(input_shape[d] for d in self._keep_axis) - - if self.input_mean is None: - self.adapt_mean = self.add_weight( - name='mean', - shape=mean_and_var_shape, - dtype=self.compute_dtype, - initializer='zeros', - trainable=False) - self.adapt_variance = self.add_weight( - name='variance', - shape=mean_and_var_shape, - dtype=self.compute_dtype, - initializer='ones', - trainable=False) - self.count = self.add_weight( - name='count', - shape=(), - dtype=tf.int64, - initializer='zeros', - trainable=False) - self.finalize_state() - else: - # In the no adapt case, make constant tensors for mean and variance with - # proper broadcast shape for use during call. - mean = self.input_mean * np.ones(mean_and_var_shape) - variance = self.input_variance * np.ones(mean_and_var_shape) - mean = tf.reshape(mean, self._broadcast_shape) - variance = tf.reshape(variance, self._broadcast_shape) - self.mean = tf.cast(mean, self.compute_dtype) - self.variance = tf.cast(variance, self.compute_dtype) - - # We override this method solely to generate a docstring. - def adapt(self, data, batch_size=None, steps=None): - """Computes the mean and variance of values in a dataset. - - Calling `adapt()` on a `Normalization` layer is an alternative to passing in - `mean` and `variance` arguments during layer construction. A `Normalization` - layer should always either be adapted over a dataset or passed `mean` and - `variance`. - - During `adapt()`, the layer will compute a `mean` and `variance` separately - for each position in each axis specified by the `axis` argument. To - calculate a single `mean` and `variance` over the input data, simply pass - `axis=None`. - - In order to make `Normalization` efficient in any distribution context, the - computed mean and variance are kept static with respect to any compiled - `tf.Graph`s that call the layer. As a consequence, if the layer is adapted a - second time, any models using the layer should be re-compiled. For more - information see - `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. - - `adapt()` is meant only as a single machine utility to compute layer state. - To analyze a dataset that cannot fit on a single machine, see - [Tensorflow Transform](https://www.tensorflow.org/tfx/transform/get_started) - for a multi-machine, map-reduce solution. - - Arguments: - data: The data to train on. It can be passed either as a - `tf.data.Dataset`, or as a numpy array. - batch_size: Integer or `None`. - Number of samples per state update. - If unspecified, `batch_size` will default to 32. 
- Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - steps: Integer or `None`. - Total number of steps (batches of samples) - When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps' is None, the epoch will run until - the input dataset is exhausted. When passing an infinitely - repeating dataset, you must specify the `steps` argument. This - argument is not supported with array inputs. + """A preprocessing layer which normalizes continuous features. + + This layer will shift and scale inputs into a distribution centered around + 0 with standard deviation 1. It accomplishes this by precomputing the mean + and variance of the data, and calling `(input - mean) / sqrt(var)` at + runtime. + + The mean and variance values for the layer must be either supplied on + construction or learned via `adapt()`. `adapt()` will compute the mean and + variance of the data and store them as the layer's weights. `adapt()` should + be called before `fit()`, `evaluate()`, or `predict()`. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Args: + axis: Integer, tuple of integers, or None. The axis or axes that should + have a separate mean and variance for each index in the shape. For + example, if shape is `(None, 5)` and `axis=1`, the layer will track 5 + separate mean and variance values for the last axis. If `axis` is set + to `None`, the layer will normalize all elements in the input by a + scalar mean and variance. When `-1` the last axis of the + input is assumed to be a feature dimension and is normalized per + index. Note that in the specific case of batched scalar inputs where + the only axis is the batch axis, the default will normalize each index + in the batch separately. In this case, consider passing `axis=None`. + Defaults to `-1`. + mean: The mean value(s) to use during normalization. The passed value(s) + will be broadcast to the shape of the kept axes above; if the value(s) + cannot be broadcast, an error will be raised when this layer's + `build()` method is called. + variance: The variance value(s) to use during normalization. The passed + value(s) will be broadcast to the shape of the kept axes above; if the + value(s) cannot be broadcast, an error will be raised when this + layer's `build()` method is called. + invert: If True, this layer will apply the inverse transformation + to its inputs: it would turn a normalized input back into its + original form. + + Examples: + + Calculate a global mean and variance by analyzing the dataset in `adapt()`. + + >>> adapt_data = np.array([1., 2., 3., 4., 5.], dtype='float32') + >>> input_data = np.array([1., 2., 3.], dtype='float32') + >>> layer = tf.keras.layers.Normalization(axis=None) + >>> layer.adapt(adapt_data) + >>> layer(input_data) + + + Calculate a mean and variance for each index on the last axis. + + >>> adapt_data = np.array([[0., 7., 4.], + ... [2., 9., 6.], + ... [0., 7., 4.], + ... [2., 9., 6.]], dtype='float32') + >>> input_data = np.array([[0., 7., 4.]], dtype='float32') + >>> layer = tf.keras.layers.Normalization(axis=-1) + >>> layer.adapt(adapt_data) + >>> layer(input_data) + + + Pass the mean and variance directly. 
+ + >>> input_data = np.array([[1.], [2.], [3.]], dtype='float32') + >>> layer = tf.keras.layers.Normalization(mean=3., variance=2.) + >>> layer(input_data) + + + Use the layer to de-normalize inputs (after adapting the layer). + + >>> adapt_data = np.array([[0., 7., 4.], + ... [2., 9., 6.], + ... [0., 7., 4.], + ... [2., 9., 6.]], dtype='float32') + >>> input_data = np.array([[1., 2., 3.]], dtype='float32') + >>> layer = tf.keras.layers.Normalization(axis=-1, invert=True) + >>> layer.adapt(adapt_data) + >>> layer(input_data) + """ - super().adapt(data, batch_size=batch_size, steps=steps) - - def update_state(self, data): - if self.input_mean is not None: - raise ValueError( - 'Cannot `adapt` a Normalization layer that is initialized with ' - 'static `mean` and `variance`, you passed mean {} and variance {}.' - .format(self.input_mean, self.input_variance)) - - if not self.built: - raise RuntimeError('`build` must be called before `update_state`.') - - data = self._standardize_inputs(data) - data = tf.cast(data, self.adapt_mean.dtype) - batch_mean, batch_variance = tf.nn.moments(data, axes=self._reduce_axis) - batch_shape = tf.shape(data, out_type=self.count.dtype) - if self._reduce_axis: - batch_reduce_shape = tf.gather(batch_shape, self._reduce_axis) - batch_count = tf.reduce_prod(batch_reduce_shape) - else: - batch_count = 1 - - total_count = batch_count + self.count - batch_weight = ( - tf.cast(batch_count, dtype=self.compute_dtype) / - tf.cast(total_count, dtype=self.compute_dtype)) - existing_weight = 1. - batch_weight - - total_mean = self.adapt_mean * existing_weight + batch_mean * batch_weight - # The variance is computed using the lack-of-fit sum of squares - # formula (see https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares). - total_variance = ((self.adapt_variance + - (self.adapt_mean - total_mean)**2) * existing_weight + - (batch_variance + - (batch_mean - total_mean)**2) * batch_weight) - self.adapt_mean.assign(total_mean) - self.adapt_variance.assign(total_variance) - self.count.assign(total_count) - - def reset_state(self): # pylint: disable=method-hidden - if self.input_mean is not None or not self.built: - return - - self.adapt_mean.assign(tf.zeros_like(self.adapt_mean)) - self.adapt_variance.assign(tf.ones_like(self.adapt_variance)) - self.count.assign(tf.zeros_like(self.count)) - - def finalize_state(self): - if self.input_mean is not None or not self.built: - return - - # In the adapt case, we make constant tensors for mean and variance with - # proper broadcast shape and dtype each time `finalize_state` is called. 
- self.mean = tf.reshape(self.adapt_mean, self._broadcast_shape) - self.mean = tf.cast(self.mean, self.compute_dtype) - self.variance = tf.reshape(self.adapt_variance, self._broadcast_shape) - self.variance = tf.cast(self.variance, self.compute_dtype) - - def call(self, inputs): - inputs = self._standardize_inputs(inputs) - # The base layer automatically casts floating-point inputs, but we - # explicitly cast here to also allow integer inputs to be passed - inputs = tf.cast(inputs, self.compute_dtype) - if self.invert: - return ((inputs + self.mean) * - tf.maximum(tf.sqrt(self.variance), backend.epsilon())) - else: - return ((inputs - self.mean) / - tf.maximum(tf.sqrt(self.variance), backend.epsilon())) - - def compute_output_shape(self, input_shape): - return input_shape - - def compute_output_signature(self, input_spec): - return input_spec - - def get_config(self): - config = super().get_config() - config.update({ - 'axis': self.axis, - 'mean': utils.listify_tensors(self.input_mean), - 'variance': utils.listify_tensors(self.input_variance), - }) - return config - - def _standardize_inputs(self, inputs): - inputs = tf.convert_to_tensor(inputs) - if inputs.dtype != self.compute_dtype: - inputs = tf.cast(inputs, self.compute_dtype) - return inputs + + def __init__( + self, axis=-1, mean=None, variance=None, invert=False, **kwargs + ): + super().__init__(**kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell("Normalization").set( + True + ) + + # Standardize `axis` to a tuple. + if axis is None: + axis = () + elif isinstance(axis, int): + axis = (axis,) + else: + axis = tuple(axis) + self.axis = axis + + # Set `mean` and `variance` if passed. + if isinstance(mean, tf.Variable): + raise ValueError( + "Normalization does not support passing a Variable " + "for the `mean` init arg." + ) + if isinstance(variance, tf.Variable): + raise ValueError( + "Normalization does not support passing a Variable " + "for the `variance` init arg." + ) + if (mean is not None) != (variance is not None): + raise ValueError( + "When setting values directly, both `mean` and `variance` " + "must be set. Got mean: {} and variance: {}".format( + mean, variance + ) + ) + self.input_mean = mean + self.input_variance = variance + self.invert = invert + + def build(self, input_shape): + super().build(input_shape) + + if isinstance(input_shape, (list, tuple)) and all( + isinstance(shape, tf.TensorShape) for shape in input_shape + ): + raise ValueError( + "Normalization only accepts a single input. If you are " + "passing a python list or tuple as a single input, " + "please convert to a numpy array or `tf.Tensor`." + ) + + input_shape = tf.TensorShape(input_shape).as_list() + ndim = len(input_shape) + + if any(a < -ndim or a >= ndim for a in self.axis): + raise ValueError( + "All `axis` values must be in the range [-ndim, ndim). " + "Found ndim: `{}`, axis: {}".format(ndim, self.axis) + ) + + # Axes to be kept, replacing negative values with positive equivalents. + # Sorted to avoid transposing axes. + self._keep_axis = sorted([d if d >= 0 else d + ndim for d in self.axis]) + # All axes to be kept should have known shape. + for d in self._keep_axis: + if input_shape[d] is None: + raise ValueError( + "All `axis` values to be kept must have known shape. " + "Got axis: {}, " + "input shape: {}, with unknown axis at index: {}".format( + self.axis, input_shape, d + ) + ) + # Axes to be reduced. + self._reduce_axis = [d for d in range(ndim) if d not in self._keep_axis] + # 1 if an axis should be reduced, 0 otherwise. 
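To make the axis bookkeeping in `build()` concrete, here is a small standalone sketch (plain Python over a hypothetical `input_shape`; the variable names mirror the `_keep_axis`, `_reduce_axis`, and `_broadcast_shape` attributes above).

# Rank-3 input with an unknown batch dimension, normalizing over axis=(1, 2).
input_shape = [None, 2, 3]
axis = (1, 2)
ndim = len(input_shape)

# Negative axes become their positive equivalents, sorted to avoid transposes.
keep_axis = sorted(d if d >= 0 else d + ndim for d in axis)
# Statistics are reduced over every dimension that is not kept.
reduce_axis = [d for d in range(ndim) if d not in keep_axis]
# Kept dims keep their size; reduced dims broadcast as size 1.
broadcast_shape = [input_shape[d] if d in keep_axis else 1 for d in range(ndim)]

print(keep_axis)        # [1, 2]
print(reduce_axis)      # [0] -- statistics are averaged over the batch
print(broadcast_shape)  # [1, 2, 3] -- shape the stored mean/variance reshape to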
+ self._reduce_axis_mask = [ + 0 if d in self._keep_axis else 1 for d in range(ndim) + ] + # Broadcast any reduced axes. + self._broadcast_shape = [ + input_shape[d] if d in self._keep_axis else 1 for d in range(ndim) + ] + mean_and_var_shape = tuple(input_shape[d] for d in self._keep_axis) + + if self.input_mean is None: + self.adapt_mean = self.add_weight( + name="mean", + shape=mean_and_var_shape, + dtype=self.compute_dtype, + initializer="zeros", + trainable=False, + ) + self.adapt_variance = self.add_weight( + name="variance", + shape=mean_and_var_shape, + dtype=self.compute_dtype, + initializer="ones", + trainable=False, + ) + self.count = self.add_weight( + name="count", + shape=(), + dtype=tf.int64, + initializer="zeros", + trainable=False, + ) + self.finalize_state() + else: + # In the no adapt case, make constant tensors for mean and variance + # with proper broadcast shape for use during call. + mean = self.input_mean * np.ones(mean_and_var_shape) + variance = self.input_variance * np.ones(mean_and_var_shape) + mean = tf.reshape(mean, self._broadcast_shape) + variance = tf.reshape(variance, self._broadcast_shape) + self.mean = tf.cast(mean, self.compute_dtype) + self.variance = tf.cast(variance, self.compute_dtype) + + # We override this method solely to generate a docstring. + def adapt(self, data, batch_size=None, steps=None): + """Computes the mean and variance of values in a dataset. + + Calling `adapt()` on a `Normalization` layer is an alternative to + passing in `mean` and `variance` arguments during layer construction. A + `Normalization` layer should always either be adapted over a dataset or + passed `mean` and `variance`. + + During `adapt()`, the layer will compute a `mean` and `variance` + separately for each position in each axis specified by the `axis` + argument. To calculate a single `mean` and `variance` over the input + data, simply pass `axis=None`. + + In order to make `Normalization` efficient in any distribution context, + the computed mean and variance are kept static with respect to any + compiled `tf.Graph`s that call the layer. As a consequence, if the layer + is adapted a second time, any models using the layer should be + re-compiled. For more information see + `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. + + `adapt()` is meant only as a single machine utility to compute layer + state. To analyze a dataset that cannot fit on a single machine, see + [Tensorflow Transform]( + https://www.tensorflow.org/tfx/transform/get_started) + for a multi-machine, map-reduce solution. + + Arguments: + data: The data to train on. It can be passed either as a + `tf.data.Dataset`, or as a numpy array. + batch_size: Integer or `None`. + Number of samples per state update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` instances + (since they generate batches). + steps: Integer or `None`. + Total number of steps (batches of samples) + When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps' is None, the epoch will run until + the input dataset is exhausted. When passing an infinitely + repeating dataset, you must specify the `steps` argument. This + argument is not supported with array inputs. 
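As a usage sketch of the `adapt()` contract documented here (a minimal example with made-up values, standard TF 2.x assumed):

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.Normalization(axis=-1)

# adapt() accepts a batched tf.data.Dataset; statistics are accumulated
# batch by batch via the running update in update_state() below.
ds = tf.data.Dataset.from_tensor_slices(
    np.array([[1.0], [2.0], [3.0], [4.0]], dtype="float32")
).batch(2)
layer.adapt(ds)

# An infinitely repeating dataset requires an explicit `steps` argument.
layer.adapt(ds.repeat(), steps=2)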
+ """ + super().adapt(data, batch_size=batch_size, steps=steps) + + def update_state(self, data): + if self.input_mean is not None: + raise ValueError( + "Cannot `adapt` a Normalization layer that is initialized with " + "static `mean` and `variance`, " + "you passed mean {} and variance {}.".format( + self.input_mean, self.input_variance + ) + ) + + if not self.built: + raise RuntimeError("`build` must be called before `update_state`.") + + data = self._standardize_inputs(data) + data = tf.cast(data, self.adapt_mean.dtype) + batch_mean, batch_variance = tf.nn.moments(data, axes=self._reduce_axis) + batch_shape = tf.shape(data, out_type=self.count.dtype) + if self._reduce_axis: + batch_reduce_shape = tf.gather(batch_shape, self._reduce_axis) + batch_count = tf.reduce_prod(batch_reduce_shape) + else: + batch_count = 1 + + total_count = batch_count + self.count + batch_weight = tf.cast(batch_count, dtype=self.compute_dtype) / tf.cast( + total_count, dtype=self.compute_dtype + ) + existing_weight = 1.0 - batch_weight + + total_mean = ( + self.adapt_mean * existing_weight + batch_mean * batch_weight + ) + # The variance is computed using the lack-of-fit sum of squares + # formula (see + # https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares). + total_variance = ( + self.adapt_variance + (self.adapt_mean - total_mean) ** 2 + ) * existing_weight + ( + batch_variance + (batch_mean - total_mean) ** 2 + ) * batch_weight + self.adapt_mean.assign(total_mean) + self.adapt_variance.assign(total_variance) + self.count.assign(total_count) + + def reset_state(self): + if self.input_mean is not None or not self.built: + return + + self.adapt_mean.assign(tf.zeros_like(self.adapt_mean)) + self.adapt_variance.assign(tf.ones_like(self.adapt_variance)) + self.count.assign(tf.zeros_like(self.count)) + + def finalize_state(self): + if self.input_mean is not None or not self.built: + return + + # In the adapt case, we make constant tensors for mean and variance with + # proper broadcast shape and dtype each time `finalize_state` is called. + self.mean = tf.reshape(self.adapt_mean, self._broadcast_shape) + self.mean = tf.cast(self.mean, self.compute_dtype) + self.variance = tf.reshape(self.adapt_variance, self._broadcast_shape) + self.variance = tf.cast(self.variance, self.compute_dtype) + + def call(self, inputs): + inputs = self._standardize_inputs(inputs) + # The base layer automatically casts floating-point inputs, but we + # explicitly cast here to also allow integer inputs to be passed + inputs = tf.cast(inputs, self.compute_dtype) + if self.invert: + return self.mean + ( + inputs * tf.maximum(tf.sqrt(self.variance), backend.epsilon()) + ) + else: + return (inputs - self.mean) / tf.maximum( + tf.sqrt(self.variance), backend.epsilon() + ) + + def compute_output_shape(self, input_shape): + return input_shape + + def compute_output_signature(self, input_spec): + return input_spec + + def get_config(self): + config = super().get_config() + config.update( + { + "axis": self.axis, + "invert": self.invert, + "mean": utils.listify_tensors(self.input_mean), + "variance": utils.listify_tensors(self.input_variance), + } + ) + return config + + def _standardize_inputs(self, inputs): + inputs = tf.convert_to_tensor(inputs) + if inputs.dtype != self.compute_dtype: + inputs = tf.cast(inputs, self.compute_dtype) + return inputs + + def load_own_variables(self, store): + # Ensure that we call finalize_state after variable loading. 
+ super().load_own_variables(store) + self.finalize_state() diff --git a/keras/layers/preprocessing/normalization_distribution_test.py b/keras/layers/preprocessing/normalization_distribution_test.py index 3562aaba3e58..3d8e08aacf44 100644 --- a/keras/layers/preprocessing/normalization_distribution_test.py +++ b/keras/layers/preprocessing/normalization_distribution_test.py @@ -15,110 +15,145 @@ """Distribution tests for keras.layers.preprocessing.normalization.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.distribute import strategy_combinations from keras.layers.preprocessing import normalization from keras.layers.preprocessing import preprocessing_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf def _get_layer_computation_test_cases(): - test_cases = ({ - "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32), - "axis": -1, - "test_data": np.array([[1.], [2.], [3.]], np.float32), - "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), - "testcase_name": "2d_single_element" - }, { - "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32), - "axis": None, - "test_data": np.array([[1.], [2.], [3.]], np.float32), - "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), - "testcase_name": "2d_single_element_none_axis" - }, { - "adapt_data": np.array([[1., 2., 3., 4., 5.]], dtype=np.float32), - "axis": None, - "test_data": np.array([[1.], [2.], [3.]], np.float32), - "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), - "testcase_name": "2d_single_element_none_axis_flat_data" - }, { - "adapt_data": - np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]], - np.float32), - "axis": - 1, - "test_data": - np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]], - np.float32), - "expected": - np.array([[[-1.549193, -0.774597, 0.], [-1.549193, -0.774597, 0.]], - [[0., 0.774597, 1.549193], [0., 0.774597, 1.549193]]], - np.float32), - "testcase_name": - "3d_internal_axis" - }, { - "adapt_data": - np.array( - [[[1., 0., 3.], [2., 3., 4.]], [[3., -1., 5.], [4., 5., 8.]]], - np.float32), - "axis": (1, 2), - "test_data": - np.array( - [[[3., 1., -1.], [2., 5., 4.]], [[3., 0., 5.], [2., 5., 8.]]], - np.float32), - "expected": - np.array( - [[[1., 3., -5.], [-1., 1., -1.]], [[1., 1., 1.], [-1., 1., 1.]]], - np.float32), - "testcase_name": - "3d_multiple_axis" - }) + test_cases = ( + { + "adapt_data": np.array( + [[1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32 + ), + "axis": -1, + "test_data": np.array([[1.0], [2.0], [3.0]], np.float32), + "expected": np.array([[-1.414214], [-0.707107], [0]], np.float32), + "testcase_name": "2d_single_element", + }, + { + "adapt_data": np.array( + [[1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32 + ), + "axis": None, + "test_data": np.array([[1.0], [2.0], [3.0]], np.float32), + "expected": np.array([[-1.414214], [-0.707107], [0]], np.float32), + "testcase_name": "2d_single_element_none_axis", + }, + { + "adapt_data": np.array( + [[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float32 + ), + "axis": None, + "test_data": np.array([[1.0], [2.0], [3.0]], np.float32), + "expected": np.array([[-1.414214], [-0.707107], [0]], np.float32), + "testcase_name": "2d_single_element_none_axis_flat_data", + }, + { + "adapt_data": np.array( + [ + [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], + [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0]], + ], + np.float32, + ), + 
"axis": 1, + "test_data": np.array( + [ + [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], + [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0]], + ], + np.float32, + ), + "expected": np.array( + [ + [[-1.549193, -0.774597, 0.0], [-1.549193, -0.774597, 0.0]], + [[0.0, 0.774597, 1.549193], [0.0, 0.774597, 1.549193]], + ], + np.float32, + ), + "testcase_name": "3d_internal_axis", + }, + { + "adapt_data": np.array( + [ + [[1.0, 0.0, 3.0], [2.0, 3.0, 4.0]], + [[3.0, -1.0, 5.0], [4.0, 5.0, 8.0]], + ], + np.float32, + ), + "axis": (1, 2), + "test_data": np.array( + [ + [[3.0, 1.0, -1.0], [2.0, 5.0, 4.0]], + [[3.0, 0.0, 5.0], [2.0, 5.0, 8.0]], + ], + np.float32, + ), + "expected": np.array( + [ + [[1.0, 3.0, -5.0], [-1.0, 1.0, -1.0]], + [[1.0, 1.0, 1.0], [-1.0, 1.0, 1.0]], + ], + np.float32, + ), + "testcase_name": "3d_multiple_axis", + }, + ) - crossed_test_cases = [] - # Cross above test cases with use_dataset in (True, False) - for use_dataset in (True, False): - for case in test_cases: - case = case.copy() - if use_dataset: - case["testcase_name"] = case["testcase_name"] + "_with_dataset" - case["use_dataset"] = use_dataset - crossed_test_cases.append(case) + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) - return crossed_test_cases + return crossed_test_cases @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.times( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies + - strategy_combinations.parameter_server_strategies_single_worker + - strategy_combinations.parameter_server_strategies_multi_worker, - mode=["eager"]), _get_layer_computation_test_cases())) -class NormalizationTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_layer_computation(self, strategy, adapt_data, axis, test_data, - use_dataset, expected): - input_shape = tuple([None for _ in range(test_data.ndim - 1)]) - if use_dataset: - # Keras APIs expect batched datasets - adapt_data = tf.data.Dataset.from_tensor_slices(adapt_data).batch(2) - test_data = tf.data.Dataset.from_tensor_slices(test_data).batch(2) + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies + + strategy_combinations.parameter_server_strategies_single_worker + + strategy_combinations.parameter_server_strategies_multi_worker, + mode=["eager"], + ), + _get_layer_computation_test_cases(), + ) +) +class NormalizationTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_layer_computation( + self, strategy, adapt_data, axis, test_data, use_dataset, expected + ): + input_shape = tuple([None for _ in range(test_data.ndim - 1)]) + if use_dataset: + # Keras APIs expect batched datasets + adapt_data = tf.data.Dataset.from_tensor_slices(adapt_data).batch(2) + test_data = tf.data.Dataset.from_tensor_slices(test_data).batch(2) - with strategy.scope(): - input_data = keras.Input(shape=input_shape) - layer = normalization.Normalization(axis=axis) - layer.adapt(adapt_data) - output = layer(input_data) - model = keras.Model(input_data, output) - output_data = model.predict(test_data) - self.assertAllClose(expected, output_data) + with 
strategy.scope(): + input_data = keras.Input(shape=input_shape) + layer = normalization.Normalization(axis=axis) + layer.adapt(adapt_data) + output = layer(input_data) + model = keras.Model(input_data, output) + output_data = model.predict(test_data) + self.assertAllClose(expected, output_data) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/layers/preprocessing/normalization_test.py b/keras/layers/preprocessing/normalization_test.py index 856cb8959338..d948f34d38fa 100644 --- a/keras/layers/preprocessing/normalization_test.py +++ b/keras/layers/preprocessing/normalization_test.py @@ -14,420 +14,530 @@ # ============================================================================== """Tests for keras.layers.preprocessing.normalization.""" -import tensorflow.compat.v2 as tf - import os -from absl.testing import parameterized - import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.layers.preprocessing import normalization from keras.layers.preprocessing import preprocessing_test_utils from keras.mixed_precision import policy +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils def _get_layer_computation_test_cases(): - test_cases = ({ - "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32), - "axis": -1, - "test_data": np.array([[1.], [2.], [3.]], np.float32), - "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), - "testcase_name": "2d_single_element" - }, { - "adapt_data": np.array([[1], [2], [3], [4], [5]], dtype=np.int32), - "axis": -1, - "test_data": np.array([[1], [2], [3]], np.int32), - "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), - "testcase_name": "2d_int_data" - }, { - "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32), - "axis": None, - "test_data": np.array([[1.], [2.], [3.]], np.float32), - "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), - "testcase_name": "2d_single_element_none_axis" - }, { - "adapt_data": np.array([[1., 2., 3., 4., 5.]], dtype=np.float32), - "axis": None, - "test_data": np.array([[1.], [2.], [3.]], np.float32), - "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), - "testcase_name": "2d_single_element_none_axis_flat_data" - }, { - "adapt_data": - np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]], - np.float32), - "axis": - 1, - "test_data": - np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]], - np.float32), - "expected": - np.array([[[-1.549193, -0.774597, 0.], [-1.549193, -0.774597, 0.]], - [[0., 0.774597, 1.549193], [0., 0.774597, 1.549193]]], - np.float32), - "testcase_name": - "3d_internal_axis" - }, { - "adapt_data": - np.array( - [[[1., 0., 3.], [2., 3., 4.]], [[3., -1., 5.], [4., 5., 8.]]], - np.float32), - "axis": (1, 2), - "test_data": - np.array( - [[[3., 1., -1.], [2., 5., 4.]], [[3., 0., 5.], [2., 5., 8.]]], - np.float32), - "expected": - np.array( - [[[1., 3., -5.], [-1., 1., -1.]], [[1., 1., 1.], [-1., 1., 1.]]], - np.float32), - "testcase_name": - "3d_multiple_axis" - }, { - "adapt_data": - np.zeros((3, 4)), - "axis": -1, - "test_data": - np.zeros((3, 4)), - "expected": - np.zeros((3, 4)), - "testcase_name": - "zero_variance" - }) - - crossed_test_cases = [] - # Cross above test 
cases with use_dataset in (True, False) - for use_dataset in (True, False): - for case in test_cases: - case = case.copy() - if use_dataset: - case["testcase_name"] = case["testcase_name"] + "_with_dataset" - case["use_dataset"] = use_dataset - crossed_test_cases.append(case) - - return crossed_test_cases + test_cases = ( + { + "adapt_data": np.array( + [[1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32 + ), + "axis": -1, + "test_data": np.array([[1.0], [2.0], [3.0]], np.float32), + "expected": np.array([[-1.414214], [-0.707107], [0]], np.float32), + "testcase_name": "2d_single_element", + }, + { + "adapt_data": np.array([[1], [2], [3], [4], [5]], dtype=np.int32), + "axis": -1, + "test_data": np.array([[1], [2], [3]], np.int32), + "expected": np.array([[-1.414214], [-0.707107], [0]], np.float32), + "testcase_name": "2d_int_data", + }, + { + "adapt_data": np.array( + [[1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32 + ), + "axis": None, + "test_data": np.array([[1.0], [2.0], [3.0]], np.float32), + "expected": np.array([[-1.414214], [-0.707107], [0]], np.float32), + "testcase_name": "2d_single_element_none_axis", + }, + { + "adapt_data": np.array( + [[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float32 + ), + "axis": None, + "test_data": np.array([[1.0], [2.0], [3.0]], np.float32), + "expected": np.array([[-1.414214], [-0.707107], [0]], np.float32), + "testcase_name": "2d_single_element_none_axis_flat_data", + }, + { + "adapt_data": np.array( + [ + [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], + [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0]], + ], + np.float32, + ), + "axis": 1, + "test_data": np.array( + [ + [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], + [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0]], + ], + np.float32, + ), + "expected": np.array( + [ + [[-1.549193, -0.774597, 0.0], [-1.549193, -0.774597, 0.0]], + [[0.0, 0.774597, 1.549193], [0.0, 0.774597, 1.549193]], + ], + np.float32, + ), + "testcase_name": "3d_internal_axis", + }, + { + "adapt_data": np.array( + [ + [[1.0, 0.0, 3.0], [2.0, 3.0, 4.0]], + [[3.0, -1.0, 5.0], [4.0, 5.0, 8.0]], + ], + np.float32, + ), + "axis": (1, 2), + "test_data": np.array( + [ + [[3.0, 1.0, -1.0], [2.0, 5.0, 4.0]], + [[3.0, 0.0, 5.0], [2.0, 5.0, 8.0]], + ], + np.float32, + ), + "expected": np.array( + [ + [[1.0, 3.0, -5.0], [-1.0, 1.0, -1.0]], + [[1.0, 1.0, 1.0], [-1.0, 1.0, 1.0]], + ], + np.float32, + ), + "testcase_name": "3d_multiple_axis", + }, + { + "adapt_data": np.zeros((3, 4)), + "axis": -1, + "test_data": np.zeros((3, 4)), + "expected": np.zeros((3, 4)), + "testcase_name": "zero_variance", + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases @test_combinations.run_all_keras_modes -class NormalizationTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_broadcasting_during_direct_setting(self): - layer = normalization.Normalization(axis=-1, mean=[1.0], variance=[1.0]) - output = layer(np.array([[1., 2.]])) - expected_output = [[0., 1.]] - self.assertAllClose(output, expected_output) - self.assertAllClose(layer.get_weights(), []) - - def test_broadcasting_during_direct_setting_with_tensors(self): - if not tf.executing_eagerly(): - self.skipTest("Only supported in TF2.") - - layer = normalization.Normalization( - axis=-1, - 
mean=tf.constant([1.0]), - variance=tf.constant([1.0])) - output = layer(np.array([[1., 2.]])) - expected_output = [[0., 1.]] - self.assertAllClose(output, expected_output) - self.assertAllClose(layer.get_weights(), []) - - def test_1d_data(self): - data = np.array([0., 2., 0., 2.]) - layer = normalization.Normalization(mean=1.0, variance=1.0) - output = layer(data) - self.assertListEqual(output.shape.as_list(), [4]) - self.assertAllClose(output, [-1, 1, -1, 1]) - - def test_0d_data(self): - layer = normalization.Normalization(axis=None, mean=1.0, variance=1.0) - output = layer(0.) - self.assertListEqual(output.shape.as_list(), []) - self.assertAllClose(output, -1) - - def test_broadcasting_during_direct_setting_with_variables_fails(self): - with self.assertRaisesRegex(ValueError, "passing a Variable"): - _ = normalization.Normalization( - axis=-1, - mean=tf.Variable([1.0]), - variance=tf.Variable([2.0])) - - def test_keeping_an_unknown_axis_fails(self): - layer = normalization.Normalization(axis=-1) - with self.assertRaisesRegex(ValueError, "axis.*must have known shape"): - layer.build([None]) - - @parameterized.parameters( - # Out of bounds - {"axis": 3}, - {"axis": -4}, - # In a tuple - {"axis": (1, 3)}, - {"axis": (1, -4)}, - ) - def test_bad_axis_fail_build(self, axis): - layer = normalization.Normalization(axis=axis) - with self.assertRaisesRegex(ValueError, "in the range"): - layer.build([None, 2, 3]) - - def test_list_input(self): - with self.assertRaisesRegex( - ValueError, ("Normalization only accepts a single input. If you are " - "passing a python list or tuple as a single input, " - "please convert to a numpy array or `tf.Tensor`.")): - normalization.Normalization()([1, 2, 3]) - - def test_scalar_input(self): - with self.assertRaisesRegex(ValueError, - "axis.*values must be in the range"): - normalization.Normalization()(1) - - def test_output_dtype(self): - if not tf.__internal__.tf2.enabled(): - self.skipTest("set_global_policy only supported in TF2.") - # Output should respect an explicit dtype, and default to the global policy. 
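To see the dtype rule this test pins down outside the test harness, a short sketch (TF2 assumed; the global policy is restored at the end so the example is side-effect free):

import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("float64")
x = tf.keras.Input(batch_size=16, shape=(1,))

# An explicit dtype overrides the global policy...
layer16 = tf.keras.layers.Normalization(mean=1.0, variance=1.0, dtype="float16")
print(layer16(x).dtype)  # float16

# ...otherwise the layer computes in the global policy's dtype.
layer64 = tf.keras.layers.Normalization(mean=1.0, variance=1.0)
print(layer64(x).dtype)  # float64

tf.keras.mixed_precision.set_global_policy("float32")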
- policy.set_global_policy("float64") - input_data = keras.Input(batch_size=16, shape=(1,)) - layer = normalization.Normalization(mean=1.0, variance=1.0, dtype="float16") - output = layer(input_data) - self.assertAllEqual(output.dtype, tf.float16) - layer = normalization.Normalization(mean=1.0, variance=1.0) - output = layer(input_data) - self.assertAllEqual(output.dtype, tf.float64) - - def test_invert(self): - data = np.array([0., 2., 0., 2.]) - norm = normalization.Normalization(mean=1.0, variance=1.0) - inv_norm = normalization.Normalization(mean=1.0, variance=1.0, invert=True) - output = norm(data) - output2 = inv_norm(output) - self.assertListEqual(output2.shape.as_list(), [4]) - self.assertAllClose(output2, [0., 2., 0., 2.]) - - @test_utils.run_v2_only - def test_invert_adapt(self): - input_data = [[0.], [2.], [0.], [2.]] - norm = keras.layers.Normalization(axis=-1) - norm.adapt(input_data) - inv_norm = keras.layers.Normalization(axis=-1, invert=True) - inv_norm.adapt(input_data) - output = norm(input_data) - output2 = inv_norm(output) - self.assertAllClose(input_data, output2) +class NormalizationTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_broadcasting_during_direct_setting(self): + layer = normalization.Normalization(axis=-1, mean=[1.0], variance=[1.0]) + output = layer(np.array([[1.0, 2.0]])) + expected_output = [[0.0, 1.0]] + self.assertAllClose(output, expected_output) + self.assertAllClose(layer.get_weights(), []) + + def test_broadcasting_during_direct_setting_with_tensors(self): + if not tf.executing_eagerly(): + self.skipTest("Only supported in TF2.") + + layer = normalization.Normalization( + axis=-1, mean=tf.constant([1.0]), variance=tf.constant([1.0]) + ) + output = layer(np.array([[1.0, 2.0]])) + expected_output = [[0.0, 1.0]] + self.assertAllClose(output, expected_output) + self.assertAllClose(layer.get_weights(), []) + + def test_1d_data(self): + data = np.array([0.0, 2.0, 0.0, 2.0]) + layer = normalization.Normalization(mean=1.0, variance=1.0) + output = layer(data) + self.assertListEqual(output.shape.as_list(), [4]) + self.assertAllClose(output, [-1, 1, -1, 1]) + + def test_0d_data(self): + layer = normalization.Normalization(axis=None, mean=1.0, variance=1.0) + output = layer(0.0) + self.assertListEqual(output.shape.as_list(), []) + self.assertAllClose(output, -1) + + def test_broadcasting_during_direct_setting_with_variables_fails(self): + with self.assertRaisesRegex(ValueError, "passing a Variable"): + _ = normalization.Normalization( + axis=-1, mean=tf.Variable([1.0]), variance=tf.Variable([2.0]) + ) + + def test_keeping_an_unknown_axis_fails(self): + layer = normalization.Normalization(axis=-1) + with self.assertRaisesRegex(ValueError, "axis.*must have known shape"): + layer.build([None]) + + @parameterized.parameters( + # Out of bounds + {"axis": 3}, + {"axis": -4}, + # In a tuple + {"axis": (1, 3)}, + {"axis": (1, -4)}, + ) + def test_bad_axis_fail_build(self, axis): + layer = normalization.Normalization(axis=axis) + with self.assertRaisesRegex(ValueError, "in the range"): + layer.build([None, 2, 3]) + + def test_list_input(self): + with self.assertRaisesRegex( + ValueError, + "Normalization only accepts a single input. 
If you are " + "passing a python list or tuple as a single input, " + "please convert to a numpy array or `tf.Tensor`.", + ): + normalization.Normalization()([1, 2, 3]) + + def test_scalar_input(self): + with self.assertRaisesRegex( + ValueError, "axis.*values must be in the range" + ): + normalization.Normalization()(1) + + def test_output_dtype(self): + if not tf.__internal__.tf2.enabled(): + self.skipTest("set_global_policy only supported in TF2.") + # Output should respect an explicit dtype, and default to the global + # policy. + policy.set_global_policy("float64") + input_data = keras.Input(batch_size=16, shape=(1,)) + layer = normalization.Normalization( + mean=1.0, variance=1.0, dtype="float16" + ) + output = layer(input_data) + self.assertAllEqual(output.dtype, tf.float16) + layer = normalization.Normalization(mean=1.0, variance=1.0) + output = layer(input_data) + self.assertAllEqual(output.dtype, tf.float64) + + def test_invert(self): + input_data = np.array([0.0, 4.0, 0.0, 4.0]) + norm = normalization.Normalization(mean=2.0, variance=4.0) + inv_norm = normalization.Normalization( + mean=2.0, variance=4.0, invert=True + ) + output = norm(input_data) + output2 = inv_norm(output) + self.assertListEqual(output2.shape.as_list(), [4]) + self.assertAllClose(input_data, output2) + + @test_utils.run_v2_only + def test_invert_adapt(self): + input_data = [[0.0], [4.0], [0.0], [4.0]] + norm = keras.layers.Normalization(axis=-1) + norm.adapt(input_data) + inv_norm = keras.layers.Normalization(axis=-1, invert=True) + inv_norm.adapt(input_data) + output = norm(input_data) + output2 = inv_norm(output) + self.assertListEqual(output2.shape.as_list(), [4, 1]) + self.assertAllClose(input_data, output2) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class NormalizationAdaptTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_layer_api_compatibility(self): - cls = normalization.Normalization - output_data = test_utils.layer_test( - cls, - kwargs={"axis": -1}, - input_shape=(None, 3), - input_data=np.array([[3, 1, 2], [6, 5, 4]], dtype=np.float32), - validate_training=False, - adapt_data=np.array([[1, 2, 1], [2, 3, 4], [1, 2, 1], [2, 3, 4]])) - expected = np.array([[3., -3., -0.33333333], [9., 5., 1.]]) - self.assertAllClose(expected, output_data) - - @parameterized.named_parameters(*_get_layer_computation_test_cases()) - def test_layer_computation(self, adapt_data, axis, test_data, use_dataset, - expected): - input_shape = tuple([test_data.shape[i] for i in range(1, test_data.ndim)]) - if use_dataset: - # Keras APIs expect batched datasets - adapt_data = tf.data.Dataset.from_tensor_slices(adapt_data).batch( - test_data.shape[0] // 2) - test_data = tf.data.Dataset.from_tensor_slices(test_data).batch( - test_data.shape[0] // 2) - - layer = normalization.Normalization(axis=axis) - layer.adapt(adapt_data) - - input_data = keras.Input(shape=input_shape) - output = layer(input_data) - model = keras.Model(input_data, output) - model._run_eagerly = test_utils.should_run_eagerly() - output_data = model.predict(test_data) - self.assertAllClose(expected, output_data) - - def test_1d_unbatched_adapt(self): - ds = tf.data.Dataset.from_tensor_slices([ - [2., 0., 2., 0.], - [0., 2., 0., 2.], - ]) - layer = normalization.Normalization(axis=-1) - layer.adapt(ds) - output_ds = ds.map(layer) - self.assertAllClose( - list(output_ds.as_numpy_iterator()), [ - [1., -1., 1., -1.], - [-1., 1., -1., 1.], - ]) - - def test_0d_unbatched_adapt(self): - ds = 
tf.data.Dataset.from_tensor_slices([2., 0., 2., 0.]) - layer = normalization.Normalization(axis=None) - layer.adapt(ds) - output_ds = ds.map(layer) - self.assertAllClose(list(output_ds.as_numpy_iterator()), [1., -1., 1., -1.]) - - @parameterized.parameters( - # Results should be identical no matter how the axes are specified (3d). - {"axis": (1, 2)}, - {"axis": (2, 1)}, - {"axis": (1, -1)}, - {"axis": (-1, 1)}, - ) - def test_axis_permutations(self, axis): - layer = normalization.Normalization(axis=axis) - # data.shape = [2, 2, 3] - data = np.array([[[0., 1., 2.], [0., 2., 6.]], - [[2., 3., 4.], [3., 6., 10.]]]) - expect = np.array([[[-1., -1., -1.], [-1., -1., -1.]], - [[1., 1., 1.], [1., 1., 1.]]]) - layer.adapt(data) - self.assertAllClose(expect, layer(data)) - - def test_model_summary_after_layer_adapt(self): - data = np.array([[[0., 1., 2.], [0., 2., 6.]], - [[2., 3., 4.], [3., 6., 10.]]]) - layer = normalization.Normalization(axis=-1) - layer.adapt(data) - model = keras.Sequential( - [layer, - keras.layers.Dense(64, activation="relu"), - keras.layers.Dense(1)]) - model.summary() - - def test_multiple_adapts(self): - first_adapt = [[0], [2], [0], [2]] - second_adapt = [[2], [4], [2], [4]] - predict_input = [[2], [2]] - expected_first_output = [[1], [1]] - expected_second_output = [[-1], [-1]] - - inputs = keras.Input(shape=(1,), dtype=tf.int32) - layer = normalization.Normalization(axis=-1) - layer.adapt(first_adapt) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - actual_output = model.predict(predict_input) - self.assertAllClose(actual_output, expected_first_output) - - # Re-adapt the layer on new inputs. - layer.adapt(second_adapt) - # Re-compile the model. - model.compile() - # `predict` should now use the new model state. - actual_output = model.predict(predict_input) - self.assertAllClose(actual_output, expected_second_output) - - @parameterized.parameters( - {"adapted": True}, - {"adapted": False}, - ) - def test_saved_model_tf(self, adapted): - input_data = [[0.], [2.], [0.], [2.]] - expected_output = [[-1.], [1.], [-1.], [1.]] - - inputs = keras.Input(shape=(1,), dtype=tf.float32) - if adapted: - layer = normalization.Normalization(axis=-1) - layer.adapt(input_data) - else: - layer = normalization.Normalization(mean=1., variance=1.) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - output_data = model.predict(input_data) - self.assertAllClose(output_data, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_saved_model") - tf.saved_model.save(model, output_path) - loaded_model = tf.saved_model.load(output_path) - f = loaded_model.signatures["serving_default"] - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_data = f(tf.constant(input_data))["normalization"] - self.assertAllClose(new_output_data, expected_output) - - @parameterized.product( - save_format=["tf", "h5"], - adapt=[True, False], - ) - def test_saved_model_keras(self, save_format, adapt): - input_data = [[0.], [2.], [0.], [2.]] - expected_output = [[-1.], [1.], [-1.], [1.]] - - cls = normalization.Normalization - inputs = keras.Input(shape=(1,), dtype=tf.float32) - if adapt: - layer = cls(axis=-1) - layer.adapt(input_data) - else: - layer = cls(mean=1., variance=1.) 
- outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - output_data = model.predict(input_data) - self.assertAllClose(output_data, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format=format) - loaded_model = keras.models.load_model( - output_path, custom_objects={"Normalization": cls}) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. - new_output_data = loaded_model.predict(input_data) - self.assertAllClose(new_output_data, expected_output) - - @parameterized.parameters( - {"adapted": True}, - {"adapted": False}, - ) - def test_saved_weights_keras(self, adapted): - input_data = [[0.], [2.], [0.], [2.]] - expected_output = [[-1.], [1.], [-1.], [1.]] - - cls = normalization.Normalization - inputs = keras.Input(shape=(1,), dtype=tf.float32) - if adapted: - layer = cls(axis=-1) - layer.adapt(input_data) - else: - layer = cls(mean=1., variance=1.) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - - output_data = model.predict(input_data) - self.assertAllClose(output_data, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_weights") - model.save_weights(output_path, save_format="tf") - new_model = keras.Model.from_config( - model.get_config(), custom_objects={"Normalization": cls}) - new_model.load_weights(output_path) - - # Validate correctness of the new model. - new_output_data = new_model.predict(input_data) - self.assertAllClose(new_output_data, expected_output) +class NormalizationAdaptTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_layer_api_compatibility(self): + cls = normalization.Normalization + output_data = test_utils.layer_test( + cls, + kwargs={"axis": -1}, + input_shape=(None, 3), + input_data=np.array([[3, 1, 2], [6, 5, 4]], dtype=np.float32), + validate_training=False, + adapt_data=np.array([[1, 2, 1], [2, 3, 4], [1, 2, 1], [2, 3, 4]]), + ) + expected = np.array([[3.0, -3.0, -0.33333333], [9.0, 5.0, 1.0]]) + self.assertAllClose(expected, output_data) + + @parameterized.named_parameters(*_get_layer_computation_test_cases()) + def test_layer_computation( + self, adapt_data, axis, test_data, use_dataset, expected + ): + input_shape = tuple( + [test_data.shape[i] for i in range(1, test_data.ndim)] + ) + if use_dataset: + # Keras APIs expect batched datasets + adapt_data = tf.data.Dataset.from_tensor_slices(adapt_data).batch( + test_data.shape[0] // 2 + ) + test_data = tf.data.Dataset.from_tensor_slices(test_data).batch( + test_data.shape[0] // 2 + ) + + layer = normalization.Normalization(axis=axis) + layer.adapt(adapt_data) + + input_data = keras.Input(shape=input_shape) + output = layer(input_data) + model = keras.Model(input_data, output) + model._run_eagerly = test_utils.should_run_eagerly() + output_data = model.predict(test_data) + self.assertAllClose(expected, output_data) + + def test_1d_unbatched_adapt(self): + ds = tf.data.Dataset.from_tensor_slices( + [ + [2.0, 0.0, 2.0, 0.0], + [0.0, 2.0, 0.0, 2.0], + ] + ) + layer = normalization.Normalization(axis=-1) + layer.adapt(ds) + output_ds = ds.map(layer) + self.assertAllClose( + list(output_ds.as_numpy_iterator()), + [ + [1.0, -1.0, 1.0, -1.0], + [-1.0, 1.0, -1.0, 1.0], + ], + ) + + def test_0d_unbatched_adapt(self): + ds = 
tf.data.Dataset.from_tensor_slices([2.0, 0.0, 2.0, 0.0]) + layer = normalization.Normalization(axis=None) + layer.adapt(ds) + output_ds = ds.map(layer) + self.assertAllClose( + list(output_ds.as_numpy_iterator()), [1.0, -1.0, 1.0, -1.0] + ) + + @parameterized.parameters( + # Results should be identical no matter how the axes are specified (3d). + {"axis": (1, 2)}, + {"axis": (2, 1)}, + {"axis": (1, -1)}, + {"axis": (-1, 1)}, + ) + def test_axis_permutations(self, axis): + layer = normalization.Normalization(axis=axis) + # data.shape = [2, 2, 3] + data = np.array( + [ + [[0.0, 1.0, 2.0], [0.0, 2.0, 6.0]], + [[2.0, 3.0, 4.0], [3.0, 6.0, 10.0]], + ] + ) + expect = np.array( + [ + [[-1.0, -1.0, -1.0], [-1.0, -1.0, -1.0]], + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + ] + ) + layer.adapt(data) + self.assertAllClose(expect, layer(data)) + + def test_model_summary_after_layer_adapt(self): + data = np.array( + [ + [[0.0, 1.0, 2.0], [0.0, 2.0, 6.0]], + [[2.0, 3.0, 4.0], [3.0, 6.0, 10.0]], + ] + ) + layer = normalization.Normalization(axis=-1) + layer.adapt(data) + model = keras.Sequential( + [ + layer, + keras.layers.Dense(64, activation="relu"), + keras.layers.Dense(1), + ] + ) + model.summary() + + def test_multiple_adapts(self): + first_adapt = [[0], [2], [0], [2]] + second_adapt = [[2], [4], [2], [4]] + predict_input = [[2], [2]] + expected_first_output = [[1], [1]] + expected_second_output = [[-1], [-1]] + + inputs = keras.Input(shape=(1,), dtype=tf.int32) + layer = normalization.Normalization(axis=-1) + layer.adapt(first_adapt) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + actual_output = model.predict(predict_input) + self.assertAllClose(actual_output, expected_first_output) + + # Re-adapt the layer on new inputs. + layer.adapt(second_adapt) + # Re-compile the model. + model.compile() + # `predict` should now use the new model state. + actual_output = model.predict(predict_input) + self.assertAllClose(actual_output, expected_second_output) + + @parameterized.parameters( + {"adapted": True}, + {"adapted": False}, + ) + def test_saving_tf(self, adapted): + input_data = [[0.0], [2.0], [0.0], [2.0]] + expected_output = [[-1.0], [1.0], [-1.0], [1.0]] + + inputs = keras.Input(shape=(1,), dtype=tf.float32) + if adapted: + layer = normalization.Normalization(axis=-1) + layer.adapt(input_data) + else: + layer = normalization.Normalization(mean=1.0, variance=1.0) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + output_data = model.predict(input_data) + self.assertAllClose(output_data, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_saved_model") + tf.saved_model.save(model, output_path) + loaded_model = tf.saved_model.load(output_path) + f = loaded_model.signatures["serving_default"] + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. 
+ new_output_data = f(tf.constant(input_data))["normalization"] + self.assertAllClose(new_output_data, expected_output) + + @parameterized.product( + save_format=["tf", "h5", "keras_v3"], + adapt=[True, False], + ) + def test_saving_keras(self, save_format, adapt): + input_data = [[0.0], [2.0], [0.0], [2.0]] + expected_output = [[-1.0], [1.0], [-1.0], [1.0]] + + cls = normalization.Normalization + inputs = keras.Input(shape=(1,), dtype=tf.float32) + if adapt: + layer = cls(axis=-1) + layer.adapt(input_data) + else: + layer = cls(mean=1.0, variance=1.0) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + output_data = model.predict(input_data) + self.assertAllClose(output_data, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_model") + if save_format == "keras_v3": + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving." + ) + output_path += ".keras" + model.save(output_path, save_format=save_format) + loaded_model = keras.models.load_model( + output_path, custom_objects={"Normalization": cls} + ) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_data = loaded_model.predict(input_data) + self.assertAllClose(new_output_data, expected_output) + + @parameterized.product( + save_format=["tf", "h5", "keras_v3"], + adapt=[True, False], + ) + def test_saving_keras_invert(self, save_format, adapt): + expected_output = [[0.0], [2.0], [0.0], [2.0]] + input_data = [[-1.0], [1.0], [-1.0], [1.0]] + + cls = normalization.Normalization + inputs = keras.Input(shape=(1,), dtype=tf.float32) + if adapt: + layer = cls(axis=-1, invert=True) + layer.adapt(expected_output) + else: + layer = cls(mean=1.0, variance=1.0, invert=True) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + output_data = model.predict(input_data) + self.assertAllClose(output_data, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_model_invert") + if save_format == "keras_v3": + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "TF2 must be enabled to use the new `.keras` saving." + ) + output_path += ".keras" + model.save(output_path, save_format=save_format) + loaded_model = keras.models.load_model( + output_path, custom_objects={"Normalization": cls} + ) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_data = loaded_model.predict(input_data) + self.assertAllClose(new_output_data, expected_output) + + @parameterized.parameters( + {"adapted": True}, + {"adapted": False}, + ) + def test_saved_weights_keras(self, adapted): + input_data = [[0.0], [2.0], [0.0], [2.0]] + expected_output = [[-1.0], [1.0], [-1.0], [1.0]] + + cls = normalization.Normalization + inputs = keras.Input(shape=(1,), dtype=tf.float32) + if adapted: + layer = cls(axis=-1) + layer.adapt(input_data) + else: + layer = cls(mean=1.0, variance=1.0) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + + output_data = model.predict(input_data) + self.assertAllClose(output_data, expected_output) + + # Save the model to disk. 
+ output_path = os.path.join( + self.get_temp_dir(), "tf_keras_saved_weights" + ) + model.save_weights(output_path, save_format="tf") + new_model = keras.Model.from_config( + model.get_config(), custom_objects={"Normalization": cls} + ) + new_model.load_weights(output_path) + + # Validate correctness of the new model. + new_output_data = new_model.predict(input_data) + self.assertAllClose(new_output_data, expected_output) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/preprocessing_stage.py b/keras/layers/preprocessing/preprocessing_stage.py index 2247f13b7aa3..035f18c16b6f 100644 --- a/keras/layers/preprocessing/preprocessing_stage.py +++ b/keras/layers/preprocessing/preprocessing_stage.py @@ -14,10 +14,9 @@ # ============================================================================== """Preprocessing stage.""" +import numpy as np import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes -import numpy as np from keras.engine import base_preprocessing_layer from keras.engine import functional from keras.engine import sequential @@ -25,243 +24,259 @@ # Sequential methods should take precedence. -class PreprocessingStage(sequential.Sequential, - base_preprocessing_layer.PreprocessingLayer): - """A sequential preprocessing stage. - - This preprocessing stage wraps a list of preprocessing layers into a - Sequential-like object that enables you to `adapt()` the whole list via - a single `adapt()` call on the preprocessing stage. - - Args: - layers: List of layers. Can include layers that aren't preprocessing layers. - name: String. Optional name for the preprocessing stage object. - """ +class PreprocessingStage( + sequential.Sequential, base_preprocessing_layer.PreprocessingLayer +): + """A sequential preprocessing stage. - def adapt(self, data, reset_state=True): - """Adapt the state of the layers of the preprocessing stage to the data. + This preprocessing stage wraps a list of preprocessing layers into a + Sequential-like object that enables you to `adapt()` the whole list via + a single `adapt()` call on the preprocessing stage. Args: - data: A batched Dataset object, or a NumPy array, or an EagerTensor. - Data to be iterated over to adapt the state of the layers in this - preprocessing stage. - reset_state: Whether this call to `adapt` should reset the state of - the layers in this preprocessing stage. + layers: List of layers. Can include layers that aren't preprocessing + layers. + name: String. Optional name for the preprocessing stage object. """ - if not isinstance( - data, (tf.data.Dataset, np.ndarray, tf.__internal__.EagerTensor)): - raise ValueError( - f'`adapt()` requires a batched Dataset, an EagerTensor, or a Numpy ' - f'array as input. Received data={data}') - if isinstance(data, tf.data.Dataset): - # Validate the datasets to try and ensure we haven't been passed one with - # infinite size. That would cause an infinite loop here. - if tf_utils.dataset_is_infinite(data): - raise ValueError( - 'The dataset passed to `adapt()` has an infinite number of ' - 'elements. Please use dataset.take(...) to make the number ' - 'of elements finite.') - - for current_layer_index in range(0, len(self.layers)): - if not hasattr(self.layers[current_layer_index], 'adapt'): - # Skip any layer that does not need adapting. - continue - def map_fn(x): - """Maps `PreprocessingStage` inputs to inputs at `current_layer_index`. 
+ def adapt(self, data, reset_state=True): + """Adapt the state of the layers of the preprocessing stage to the data. Args: - x: Batch of inputs seen in entry of the `PreprocessingStage` instance. - - Returns: - Batch of inputs to be processed by layer - `self.layers[current_layer_index]` + data: A batched Dataset object, or a NumPy array, or an EagerTensor. + Data to be iterated over to adapt the state of the layers in this + preprocessing stage. + reset_state: Whether this call to `adapt` should reset the state of + the layers in this preprocessing stage. """ - if current_layer_index == 0: # pylint: disable=cell-var-from-loop - return x - for i in range(current_layer_index): # pylint: disable=cell-var-from-loop - x = self.layers[i](x) - return x - - if isinstance(data, tf.data.Dataset): - current_layer_data = data.map(map_fn) - else: - current_layer_data = map_fn(data) - self.layers[current_layer_index].adapt(current_layer_data, - reset_state=reset_state) + if not isinstance( + data, (tf.data.Dataset, np.ndarray, tf.__internal__.EagerTensor) + ): + raise ValueError( + "`adapt()` requires a batched Dataset, an EagerTensor, or a " + f"Numpy array as input. Received data={data}" + ) + if isinstance(data, tf.data.Dataset): + # Validate the datasets to try and ensure we haven't been passed one + # with infinite size. That would cause an infinite loop here. + if tf_utils.dataset_is_infinite(data): + raise ValueError( + "The dataset passed to `adapt()` has an infinite number of " + "elements. Please use dataset.take(...) to make the number " + "of elements finite." + ) + + for current_layer_index in range(0, len(self.layers)): + if not hasattr(self.layers[current_layer_index], "adapt"): + # Skip any layer that does not need adapting. + continue + + def map_fn(x): + """Maps this object's inputs to those at current_layer_index. + + Args: + x: Batch of inputs seen in entry of the `PreprocessingStage` + instance. + + Returns: + Batch of inputs to be processed by layer + `self.layers[current_layer_index]` + """ + if current_layer_index == 0: + return x + for i in range(current_layer_index): + x = self.layers[i](x) + return x + + if isinstance(data, tf.data.Dataset): + current_layer_data = data.map(map_fn) + else: + current_layer_data = map_fn(data) + self.layers[current_layer_index].adapt( + current_layer_data, reset_state=reset_state + ) # Functional methods should take precedence. -class FunctionalPreprocessingStage(functional.Functional, - base_preprocessing_layer.PreprocessingLayer): - """A functional preprocessing stage. - - This preprocessing stage wraps a graph of preprocessing layers into a - Functional-like object that enables you to `adapt()` the whole graph via - a single `adapt()` call on the preprocessing stage. - - Preprocessing stage is not a complete model, so it cannot be called with - `fit()`. However, it is possible to add regular layers that may be trainable - to a preprocessing stage. - - A functional preprocessing stage is created in the same way as `Functional` - models. A stage can be instantiated by passing two arguments to - `__init__`. The first argument is the `keras.Input` Tensors that represent - the inputs to the stage. The second argument specifies the output - tensors that represent the outputs of this stage. Both arguments can be a - nested structure of tensors. - - Example: - - >>> inputs = {'x2': tf.keras.Input(shape=(5,)), - ... 
'x1': tf.keras.Input(shape=(1,))} - >>> norm_layer = tf.keras.layers.experimental.preprocessing.Normalization() - >>> y = norm_layer(inputs['x2']) - >>> y, z = tf.keras.layers.Lambda(lambda x: (x, x))(inputs['x1']) - >>> outputs = [inputs['x1'], [y, z]] - >>> stage = FunctionalPreprocessingStage(inputs, outputs) - - Args: - inputs: An input tensor (must be created via `tf.keras.Input()`), or a list, - a dict, or a nested structure of input tensors. - outputs: An output tensor, or a list, a dict or a nested structure of output - tensors. - name: String, optional. Name of the preprocessing stage. - """ - - def fit(self, *args, **kwargs): - raise ValueError( - 'Preprocessing stage is not a complete model, and hence should not be ' - '`fit`. Instead, you may feed data to `adapt` the stage to set ' - 'appropriate states of the layers in the stage.') - - def adapt(self, data, reset_state=True): - """Adapt the state of the layers of the preprocessing stage to the data. +class FunctionalPreprocessingStage( + functional.Functional, base_preprocessing_layer.PreprocessingLayer +): + """A functional preprocessing stage. - Args: - data: A batched Dataset object, a NumPy array, an EagerTensor, or a list, - dict or nested structure of Numpy Arrays or EagerTensors. The elements - of Dataset object need to conform with inputs of the stage. The first - dimension of NumPy arrays or EagerTensors are understood to be batch - dimension. Data to be iterated over to adapt the state of the layers in - this preprocessing stage. - reset_state: Whether this call to `adapt` should reset the state of the - layers in this preprocessing stage. - - Examples: - - >>> # For a stage with dict input - >>> inputs = {'x2': tf.keras.Input(shape=(5,)), - ... 'x1': tf.keras.Input(shape=(1,))} - >>> outputs = [inputs['x1'], inputs['x2']] - >>> stage = FunctionalPreprocessingStage(inputs, outputs) - >>> ds = tf.data.Dataset.from_tensor_slices({'x1': tf.ones((4,5)), - ... 'x2': tf.ones((4,1))}) - >>> sorted(ds.element_spec.items()) # Check element_spec - [('x1', TensorSpec(shape=(5,), dtype=tf.float32, name=None)), - ('x2', TensorSpec(shape=(1,), dtype=tf.float32, name=None))] - >>> stage.adapt(ds) - >>> data_np = {'x1': np.ones((4, 5)), 'x2': np.ones((4, 1))} - >>> stage.adapt(data_np) + This preprocessing stage wraps a graph of preprocessing layers into a + Functional-like object that enables you to `adapt()` the whole graph via + a single `adapt()` call on the preprocessing stage. - """ - if not isinstance(data, tf.data.Dataset): - data = self._flatten_to_reference_inputs(data) - if any(not isinstance(datum, (np.ndarray, tf.__internal__.EagerTensor)) - for datum in data): - raise ValueError( - '`adapt()` requires a batched Dataset, a list of EagerTensors ' - 'or Numpy arrays as input, got {}'.format(type(data))) - ds_input = [ - tf.data.Dataset.from_tensor_slices(x).batch(1) for x in data - ] - - if isinstance(data, tf.data.Dataset): - # Validate the datasets to try and ensure we haven't been passed one with - # infinite size. That would cause an infinite loop here. - if tf_utils.dataset_is_infinite(data): - raise ValueError( - 'The dataset passed to `adapt()` has an infinite number of ' - 'elements. Please use dataset.take(...) to make the number ' - 'of elements finite.') - # Unzip dataset object to a list of single input dataset. - ds_input = _unzip_dataset(data) + Preprocessing stage is not a complete model, so it cannot be called with + `fit()`. 
However, it is possible to add regular layers that may be trainable
+    to a preprocessing stage.
-    # Dictionary mapping reference tensors to datasets
-    ds_dict = {}
-    tensor_usage_count = self._tensor_usage_count
-    for x, y in zip(self.inputs, ds_input):
-      x_id = str(id(x))
-      ds_dict[x_id] = [y] * tensor_usage_count[x_id]
+    A functional preprocessing stage is created in the same way as `Functional`
+    models. A stage can be instantiated by passing two arguments to
+    `__init__`. The first argument is the `keras.Input` Tensors that represent
+    the inputs to the stage. The second argument specifies the output
+    tensors that represent the outputs of this stage. Both arguments can be a
+    nested structure of tensors.
-    nodes_by_depth = self._nodes_by_depth
-    depth_keys = sorted(nodes_by_depth.keys(), reverse=True)
+    Example:
-    def build_map_fn(node, args, kwargs):
-      if not isinstance(args.element_spec, tuple):
-
-        def map_fn(*x):
-          return tf.nest.flatten(node.layer(*x, **kwargs))
-      else:
-
-        def map_fn(*x):
-          return tf.nest.flatten(node.layer(x, **kwargs))
-
-      return map_fn
-
-    for depth in depth_keys:
-      for node in nodes_by_depth[depth]:
-        # Input node
-        if node.is_input:
-          continue
+    >>> inputs = {'x2': tf.keras.Input(shape=(5,)),
+    ...           'x1': tf.keras.Input(shape=(1,))}
+    >>> norm_layer = tf.keras.layers.Normalization()
+    >>> y = norm_layer(inputs['x2'])
+    >>> y, z = tf.keras.layers.Lambda(lambda x: (x, x))(inputs['x1'])
+    >>> outputs = [inputs['x1'], [y, z]]
+    >>> stage = FunctionalPreprocessingStage(inputs, outputs)
-        # Node with input not computed yet
-        if any(t_id not in ds_dict for t_id in node.flat_input_ids):
-          continue
+    Args:
+      inputs: An input tensor (must be created via `tf.keras.Input()`), or a
+        list, a dict, or a nested structure of input tensors.
+      outputs: An output tensor, or a list, a dict or a nested structure of
+        output tensors.
+      name: String, optional. Name of the preprocessing stage.
+    """
-        args, kwargs = node.map_arguments(ds_dict)
-        args = tf.data.Dataset.zip(tf.__internal__.nest.list_to_tuple(*args))
+    def fit(self, *args, **kwargs):
+        raise ValueError(
+            "Preprocessing stage is not a complete model, and hence should not "
+            "be `fit`. Instead, you may feed data to `adapt` the stage to set "
+            "appropriate states of the layers in the stage."
+        )
-        if node.layer.stateful and hasattr(node.layer, 'adapt'):
-          node.layer.adapt(args, reset_state=reset_state)
+    def adapt(self, data, reset_state=True):
+        """Adapt the state of the layers of the preprocessing stage to the data.
-        map_fn = build_map_fn(node, args, kwargs)
-        outputs = args.map(map_fn)
-        outputs = _unzip_dataset(outputs)
+        Args:
+          data: A batched Dataset object, a NumPy array, an EagerTensor, or a
+            list, dict or nested structure of NumPy arrays or EagerTensors.
+            The elements of a Dataset object need to conform with the inputs
+            of the stage. The first dimension of NumPy arrays or EagerTensors
+            is understood to be the batch dimension. Data to be iterated over
+            to adapt the state of the layers in this preprocessing stage.
+          reset_state: Whether this call to `adapt` should reset the state of
+            the layers in this preprocessing stage.
+
+        Examples:
+
+        >>> # For a stage with dict input
+        >>> inputs = {'x2': tf.keras.Input(shape=(5,)),
+        ...           'x1': tf.keras.Input(shape=(1,))}
+        >>> outputs = [inputs['x1'], inputs['x2']]
+        >>> stage = FunctionalPreprocessingStage(inputs, outputs)
+        >>> ds = tf.data.Dataset.from_tensor_slices({'x1': tf.ones((4,5)),
+        ...
'x2': tf.ones((4,1))}) + >>> sorted(ds.element_spec.items()) # Check element_spec + [('x1', TensorSpec(shape=(5,), dtype=tf.float32, name=None)), + ('x2', TensorSpec(shape=(1,), dtype=tf.float32, name=None))] + >>> stage.adapt(ds) + >>> data_np = {'x1': np.ones((4, 5)), 'x2': np.ones((4, 1))} + >>> stage.adapt(data_np) - # Update ds_dict. - for x_id, y in zip(node.flat_output_ids, outputs): - ds_dict[x_id] = [y] * tensor_usage_count[x_id] + """ + if not isinstance(data, tf.data.Dataset): + data = self._flatten_to_reference_inputs(data) + if any( + not isinstance(datum, (np.ndarray, tf.__internal__.EagerTensor)) + for datum in data + ): + raise ValueError( + "`adapt()` requires a batched Dataset, a list of " + f"EagerTensors or Numpy arrays as input, got {type(data)}" + ) + ds_input = [ + tf.data.Dataset.from_tensor_slices(x).batch(1) for x in data + ] + + if isinstance(data, tf.data.Dataset): + # Validate the datasets to try and ensure we haven't been passed one + # with infinite size. That would cause an infinite loop here. + if tf_utils.dataset_is_infinite(data): + raise ValueError( + "The dataset passed to `adapt()` has an infinite number of " + "elements. Please use dataset.take(...) to make the number " + "of elements finite." + ) + # Unzip dataset object to a list of single input dataset. + ds_input = _unzip_dataset(data) + + # Dictionary mapping reference tensors to datasets + ds_dict = {} + tensor_usage_count = self._tensor_usage_count + for x, y in zip(self.inputs, ds_input): + x_id = str(id(x)) + ds_dict[x_id] = [y] * tensor_usage_count[x_id] + + nodes_by_depth = self._nodes_by_depth + depth_keys = sorted(nodes_by_depth.keys(), reverse=True) + + def build_map_fn(node, args, kwargs): + if not isinstance(args.element_spec, tuple): + + def map_fn(*x): + return tf.nest.flatten(node.layer(*x, **kwargs)) + + else: + + def map_fn(*x): + return tf.nest.flatten(node.layer(x, **kwargs)) + + return map_fn + + for depth in depth_keys: + for node in nodes_by_depth[depth]: + # Input node + if node.is_input: + continue + + # Node with input not computed yet + if any(t_id not in ds_dict for t_id in node.flat_input_ids): + continue + + args, kwargs = node.map_arguments(ds_dict) + args = tf.data.Dataset.zip( + tf.__internal__.nest.list_to_tuple(*args) + ) + + if node.layer.stateful and hasattr(node.layer, "adapt"): + node.layer.adapt(args, reset_state=reset_state) + + map_fn = build_map_fn(node, args, kwargs) + outputs = args.map(map_fn) + outputs = _unzip_dataset(outputs) + + # Update ds_dict. + for x_id, y in zip(node.flat_output_ids, outputs): + ds_dict[x_id] = [y] * tensor_usage_count[x_id] def _unzip_dataset(ds): - """Unzip dataset into a list of single element datasets. + """Unzip dataset into a list of single element datasets. - Args: - ds: A Dataset object. + Args: + ds: A Dataset object. - Returns: - A list of Dataset object, each correspond to one of the `element_spec` of - the input Dataset object. + Returns: + A list of Dataset object, each correspond to one of the `element_spec` of + the input Dataset object. 
- Example: + Example: - >>> ds1 = tf.data.Dataset.from_tensor_slices([1, 2, 3]) - >>> ds2 = tf.data.Dataset.from_tensor_slices([4, 5, 6]) - >>> ds_zipped_tuple = tf.data.Dataset.zip((ds1, ds2)) - >>> ds_unzipped_tuple = _unzip_dataset(ds_zipped_tuple) - >>> ds_zipped_dict = tf.data.Dataset.zip({'ds1': ds1, 'ds2': ds2}) - >>> ds_unzipped_dict = _unzip_dataset(ds_zipped_dict) + >>> ds1 = tf.data.Dataset.from_tensor_slices([1, 2, 3]) + >>> ds2 = tf.data.Dataset.from_tensor_slices([4, 5, 6]) + >>> ds_zipped_tuple = tf.data.Dataset.zip((ds1, ds2)) + >>> ds_unzipped_tuple = _unzip_dataset(ds_zipped_tuple) + >>> ds_zipped_dict = tf.data.Dataset.zip({'ds1': ds1, 'ds2': ds2}) + >>> ds_unzipped_dict = _unzip_dataset(ds_zipped_dict) - Then the two elements of `ds_unzipped_tuple` and `ds_unzipped_dict` are both - the same as `ds1` and `ds2`. - """ - element_count = len(tf.nest.flatten(ds.element_spec)) - ds_unzipped = [] - for i in range(element_count): + Then the two elements of `ds_unzipped_tuple` and `ds_unzipped_dict` are both + the same as `ds1` and `ds2`. + """ + element_count = len(tf.nest.flatten(ds.element_spec)) + ds_unzipped = [] + for i in range(element_count): - def map_fn(*x, j=i): - return tf.nest.flatten(x)[j] + def map_fn(*x, j=i): + return tf.nest.flatten(x)[j] - ds_unzipped.append(ds.map(map_fn)) - return ds_unzipped + ds_unzipped.append(ds.map(map_fn)) + return ds_unzipped diff --git a/keras/layers/preprocessing/preprocessing_stage_functional_test.py b/keras/layers/preprocessing/preprocessing_stage_functional_test.py index 12fd94b0c9b5..897c1d48ec64 100644 --- a/keras/layers/preprocessing/preprocessing_stage_functional_test.py +++ b/keras/layers/preprocessing/preprocessing_stage_functional_test.py @@ -14,12 +14,11 @@ # ============================================================================== """Functional preprocessing stage tests.""" -import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes - import time + import numpy as np -from keras.testing_infra import test_combinations +import tensorflow.compat.v2 as tf + from keras.engine import base_preprocessing_layer from keras.engine.input_layer import Input from keras.layers import convolutional @@ -29,411 +28,421 @@ from keras.layers.preprocessing import normalization from keras.layers.preprocessing import preprocessing_stage from keras.layers.preprocessing import preprocessing_test_utils +from keras.testing_infra import test_combinations class PL(base_preprocessing_layer.PreprocessingLayer): + def __init__(self, **kwargs): + self.adapt_time = None + self.adapt_count = 0 + super().__init__(**kwargs) - def __init__(self, **kwargs): - self.adapt_time = None - self.adapt_count = 0 - super().__init__(**kwargs) - - def adapt(self, data, reset_state=True): - self.adapt_time = time.time() - self.adapt_count += 1 + def adapt(self, data, reset_state=True): + self.adapt_time = time.time() + self.adapt_count += 1 - def call(self, inputs): - return inputs + 1 + def call(self, inputs): + return inputs + 1 class PLMerge(PL): - - def call(self, inputs): - return inputs[0] + inputs[1] + def call(self, inputs): + return inputs[0] + inputs[1] class PLSplit(PL): - - def call(self, inputs): - return inputs + 1, inputs - 1 + def call(self, inputs): + return inputs + 1, inputs - 1 @test_combinations.run_all_keras_modes(always_skip_v1=True) -class PreprocessingStageTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_adapt_preprocessing_stage_with_single_input_output(self): - - x = 
Input(shape=(3,)) - - l0 = PL() - y = l0(x) - - l1 = PL() - z = l1(y) - - stage = preprocessing_stage.FunctionalPreprocessingStage(x, z) - stage.compile() - - # Test with NumPy array - one_array = np.ones((4, 3), dtype='float32') - stage.adapt(one_array) - self.assertEqual(l0.adapt_count, 1) - self.assertEqual(l1.adapt_count, 1) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - - # Check call - z = stage(tf.ones((4, 3), dtype='float32')) - self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 2.) - - # Test with dataset - adapt_data = tf.data.Dataset.from_tensor_slices(one_array) - adapt_data = adapt_data.batch(2) # 5 batches of 2 samples - - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 2) - self.assertEqual(l1.adapt_count, 2) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - - # Test error with bad data - with self.assertRaisesRegex(ValueError, 'requires a '): - stage.adapt(None) - - # Disallow calling fit - with self.assertRaisesRegex(ValueError, 'Preprocessing stage'): - stage.fit(None) - - def test_adapt_preprocessing_stage_with_list_input(self): - - x0 = Input(shape=(3,)) - x1 = Input(shape=(3,)) - x2 = Input(shape=(3,)) - - l0 = PLMerge() - y = l0([x0, x1]) - - l1 = PLMerge() - y = l1([y, x2]) - - l2 = PLSplit() - z, y = l2(y) - - stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1, x2], - [y, z]) - stage.compile() - - # Test with NumPy array - one_array = np.ones((4, 3), dtype='float32') - stage.adapt([one_array, one_array, one_array]) - self.assertEqual(l0.adapt_count, 1) - self.assertEqual(l1.adapt_count, 1) - self.assertEqual(l2.adapt_count, 1) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Check call - y, z = stage([ - tf.ones((4, 3), dtype='float32'), - tf.ones((4, 3), dtype='float32'), - tf.ones((4, 3), dtype='float32') - ]) - self.assertAllClose(y, np.ones((4, 3), dtype='float32') + 1.) - self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 3.) - - # Test with dataset - adapt_data = tf.data.Dataset.from_tensor_slices( - (one_array, one_array, one_array)) - adapt_data = adapt_data.batch(2) # 5 batches of 2 samples - - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 2) - self.assertEqual(l1.adapt_count, 2) - self.assertEqual(l2.adapt_count, 2) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Test error with bad data - with self.assertRaisesRegex(ValueError, 'requires a '): - stage.adapt(None) - - def test_adapt_preprocessing_stage_with_dict_input(self): - x0 = Input(shape=(3,), name='x0') - x1 = Input(shape=(4,), name='x1') - x2 = Input(shape=(3, 5), name='x2') - - # dimension will mismatch if x1 incorrectly placed. 
- x1_sum = core.Lambda( - lambda x: tf.reduce_sum(x, axis=-1, keepdims=True))( - x1) - x2_sum = core.Lambda(lambda x: tf.reduce_sum(x, axis=-1))(x2) - - l0 = PLMerge() - y = l0([x0, x1_sum]) - - l1 = PLMerge() - y = l1([y, x2_sum]) - - l2 = PLSplit() - z, y = l2(y) - stage = preprocessing_stage.FunctionalPreprocessingStage( - { - 'x2': x2, - 'x0': x0, - 'x1': x1 - }, [y, z]) - stage.compile() - - # Test with dict of NumPy array - one_array0 = np.ones((4, 3), dtype='float32') - one_array1 = np.ones((4, 4), dtype='float32') - one_array2 = np.ones((4, 3, 5), dtype='float32') - adapt_data = {'x1': one_array1, 'x0': one_array0, 'x2': one_array2} - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 1) - self.assertEqual(l1.adapt_count, 1) - self.assertEqual(l2.adapt_count, 1) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Check call - y, z = stage({ - 'x1': tf.constant(one_array1), - 'x2': tf.constant(one_array2), - 'x0': tf.constant(one_array0) - }) - self.assertAllClose(y, np.zeros((4, 3), dtype='float32') + 9.) - self.assertAllClose(z, np.zeros((4, 3), dtype='float32') + 11.) - - # Test with list of NumPy array - adapt_data = [one_array0, one_array1, one_array2] - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 2) - self.assertEqual(l1.adapt_count, 2) - self.assertEqual(l2.adapt_count, 2) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Test with flattened dataset - adapt_data = tf.data.Dataset.from_tensor_slices( - (one_array0, one_array1, one_array2)) - adapt_data = adapt_data.batch(2) # 5 batches of 2 samples - - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 3) - self.assertEqual(l1.adapt_count, 3) - self.assertEqual(l2.adapt_count, 3) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Test with dataset in dict shape - adapt_data = tf.data.Dataset.from_tensor_slices({ - 'x0': one_array0, - 'x2': one_array2, - 'x1': one_array1 - }) - adapt_data = adapt_data.batch(2) # 5 batches of 2 samples - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 4) - self.assertEqual(l1.adapt_count, 4) - self.assertEqual(l2.adapt_count, 4) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Test error with bad data - with self.assertRaisesRegex(ValueError, 'requires a '): - stage.adapt(None) - - def test_adapt_preprocessing_stage_with_dict_output(self): - x = Input(shape=(3,), name='x') - - l0 = PLSplit() - y0, y1 = l0(x) - - l1 = PLSplit() - z0, z1 = l1(y0) - stage = preprocessing_stage.FunctionalPreprocessingStage({'x': x}, { - 'y1': y1, - 'z1': z1, - 'y0': y0, - 'z0': z0 - }) - stage.compile() - - # Test with NumPy array - one_array = np.ones((4, 3), dtype='float32') - adapt_data = {'x': one_array} - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 1) - self.assertEqual(l1.adapt_count, 1) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - - # Check call - outputs = stage({'x': tf.constant(one_array)}) - self.assertEqual(set(outputs.keys()), {'y0', 'y1', 'z0', 'z1'}) - self.assertAllClose(outputs['y0'], np.ones((4, 3), dtype='float32') + 1.) - self.assertAllClose(outputs['y1'], np.ones((4, 3), dtype='float32') - 1.) - self.assertAllClose(outputs['z0'], np.ones((4, 3), dtype='float32') + 2.) 
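The constants in the dict-output assertions here follow from `PLSplit.call`, which returns `(inputs + 1, inputs - 1)`: the second `PLSplit` is applied to the first output of the first one, so on an all-ones batch the four outputs are `x + 1`, `x - 1`, `x + 2`, and `x`. A minimal standalone NumPy sketch of that arithmetic (illustrative only; the array shape is taken from the test):

import numpy as np

x = np.ones((4, 3), dtype="float32")
y0, y1 = x + 1.0, x - 1.0    # l0 = PLSplit(): (x + 1, x - 1)
z0, z1 = y0 + 1.0, y0 - 1.0  # l1 = PLSplit() on y0: (x + 2, x)
assert (y0 == 2.0).all() and (y1 == 0.0).all()
assert (z0 == 3.0).all() and (z1 == 1.0).all()  # i.e. ones + 2 and ones

The same layer-by-layer arithmetic accounts for the `+ 9.` and `+ 11.` constants in the dict-input test above, where the summed `x1` and `x2` branches add 4 and 5 to the all-ones `x0` before the final `PLSplit` shifts the result by plus and minus one.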
- self.assertAllClose(outputs['z1'], np.ones((4, 3), dtype='float32')) - - def test_preprocessing_stage_with_nested_input(self): - # Test with NumPy array - x0 = Input(shape=(3,)) - x1 = Input(shape=(3,)) - x2 = Input(shape=(3,)) - - l0 = PLMerge() - y = l0([x0, x1]) - - l1 = PLMerge() - y = l1([y, x2]) - - l2 = PLSplit() - z, y = l2(y) - - stage = preprocessing_stage.FunctionalPreprocessingStage([x0, [x1, x2]], - [y, z]) - stage.compile() - one_array = np.ones((4, 3), dtype='float32') - stage.adapt([one_array, [one_array, one_array]]) - self.assertEqual(l0.adapt_count, 1) - self.assertEqual(l1.adapt_count, 1) - self.assertEqual(l2.adapt_count, 1) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Check call - y, z = stage([ - tf.ones((4, 3), dtype='float32'), - [ - tf.ones((4, 3), dtype='float32'), - tf.ones((4, 3), dtype='float32') +class PreprocessingStageTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_adapt_preprocessing_stage_with_single_input_output(self): + + x = Input(shape=(3,)) + + l0 = PL() + y = l0(x) + + l1 = PL() + z = l1(y) + + stage = preprocessing_stage.FunctionalPreprocessingStage(x, z) + stage.compile() + + # Test with NumPy array + one_array = np.ones((4, 3), dtype="float32") + stage.adapt(one_array) + self.assertEqual(l0.adapt_count, 1) + self.assertEqual(l1.adapt_count, 1) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + + # Check call + z = stage(tf.ones((4, 3), dtype="float32")) + self.assertAllClose(z, np.ones((4, 3), dtype="float32") + 2.0) + + # Test with dataset + adapt_data = tf.data.Dataset.from_tensor_slices(one_array) + adapt_data = adapt_data.batch(2) # 5 batches of 2 samples + + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 2) + self.assertEqual(l1.adapt_count, 2) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + + # Test error with bad data + with self.assertRaisesRegex(ValueError, "requires a "): + stage.adapt(None) + + # Disallow calling fit + with self.assertRaisesRegex(ValueError, "Preprocessing stage"): + stage.fit(None) + + def test_adapt_preprocessing_stage_with_list_input(self): + + x0 = Input(shape=(3,)) + x1 = Input(shape=(3,)) + x2 = Input(shape=(3,)) + + l0 = PLMerge() + y = l0([x0, x1]) + + l1 = PLMerge() + y = l1([y, x2]) + + l2 = PLSplit() + z, y = l2(y) + + stage = preprocessing_stage.FunctionalPreprocessingStage( + [x0, x1, x2], [y, z] + ) + stage.compile() + + # Test with NumPy array + one_array = np.ones((4, 3), dtype="float32") + stage.adapt([one_array, one_array, one_array]) + self.assertEqual(l0.adapt_count, 1) + self.assertEqual(l1.adapt_count, 1) + self.assertEqual(l2.adapt_count, 1) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Check call + y, z = stage( + [ + tf.ones((4, 3), dtype="float32"), + tf.ones((4, 3), dtype="float32"), + tf.ones((4, 3), dtype="float32"), + ] + ) + self.assertAllClose(y, np.ones((4, 3), dtype="float32") + 1.0) + self.assertAllClose(z, np.ones((4, 3), dtype="float32") + 3.0) + + # Test with dataset + adapt_data = tf.data.Dataset.from_tensor_slices( + (one_array, one_array, one_array) + ) + adapt_data = adapt_data.batch(2) # 5 batches of 2 samples + + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 2) + self.assertEqual(l1.adapt_count, 2) + self.assertEqual(l2.adapt_count, 2) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Test error 
with bad data + with self.assertRaisesRegex(ValueError, "requires a "): + stage.adapt(None) + + def test_adapt_preprocessing_stage_with_dict_input(self): + x0 = Input(shape=(3,), name="x0") + x1 = Input(shape=(4,), name="x1") + x2 = Input(shape=(3, 5), name="x2") + + # dimension will mismatch if x1 incorrectly placed. + x1_sum = core.Lambda( + lambda x: tf.reduce_sum(x, axis=-1, keepdims=True) + )(x1) + x2_sum = core.Lambda(lambda x: tf.reduce_sum(x, axis=-1))(x2) + + l0 = PLMerge() + y = l0([x0, x1_sum]) + + l1 = PLMerge() + y = l1([y, x2_sum]) + + l2 = PLSplit() + z, y = l2(y) + stage = preprocessing_stage.FunctionalPreprocessingStage( + {"x2": x2, "x0": x0, "x1": x1}, [y, z] + ) + stage.compile() + + # Test with dict of NumPy array + one_array0 = np.ones((4, 3), dtype="float32") + one_array1 = np.ones((4, 4), dtype="float32") + one_array2 = np.ones((4, 3, 5), dtype="float32") + adapt_data = {"x1": one_array1, "x0": one_array0, "x2": one_array2} + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 1) + self.assertEqual(l1.adapt_count, 1) + self.assertEqual(l2.adapt_count, 1) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Check call + y, z = stage( + { + "x1": tf.constant(one_array1), + "x2": tf.constant(one_array2), + "x0": tf.constant(one_array0), + } + ) + self.assertAllClose(y, np.zeros((4, 3), dtype="float32") + 9.0) + self.assertAllClose(z, np.zeros((4, 3), dtype="float32") + 11.0) + + # Test with list of NumPy array + adapt_data = [one_array0, one_array1, one_array2] + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 2) + self.assertEqual(l1.adapt_count, 2) + self.assertEqual(l2.adapt_count, 2) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Test with flattened dataset + adapt_data = tf.data.Dataset.from_tensor_slices( + (one_array0, one_array1, one_array2) + ) + adapt_data = adapt_data.batch(2) # 5 batches of 2 samples + + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 3) + self.assertEqual(l1.adapt_count, 3) + self.assertEqual(l2.adapt_count, 3) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Test with dataset in dict shape + adapt_data = tf.data.Dataset.from_tensor_slices( + {"x0": one_array0, "x2": one_array2, "x1": one_array1} + ) + adapt_data = adapt_data.batch(2) # 5 batches of 2 samples + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 4) + self.assertEqual(l1.adapt_count, 4) + self.assertEqual(l2.adapt_count, 4) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Test error with bad data + with self.assertRaisesRegex(ValueError, "requires a "): + stage.adapt(None) + + def test_adapt_preprocessing_stage_with_dict_output(self): + x = Input(shape=(3,), name="x") + + l0 = PLSplit() + y0, y1 = l0(x) + + l1 = PLSplit() + z0, z1 = l1(y0) + stage = preprocessing_stage.FunctionalPreprocessingStage( + {"x": x}, {"y1": y1, "z1": z1, "y0": y0, "z0": z0} + ) + stage.compile() + + # Test with NumPy array + one_array = np.ones((4, 3), dtype="float32") + adapt_data = {"x": one_array} + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 1) + self.assertEqual(l1.adapt_count, 1) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + + # Check call + outputs = stage({"x": tf.constant(one_array)}) + self.assertEqual(set(outputs.keys()), {"y0", "y1", "z0", "z1"}) + self.assertAllClose( + 
outputs["y0"], np.ones((4, 3), dtype="float32") + 1.0 + ) + self.assertAllClose( + outputs["y1"], np.ones((4, 3), dtype="float32") - 1.0 + ) + self.assertAllClose( + outputs["z0"], np.ones((4, 3), dtype="float32") + 2.0 + ) + self.assertAllClose(outputs["z1"], np.ones((4, 3), dtype="float32")) + + def test_preprocessing_stage_with_nested_input(self): + # Test with NumPy array + x0 = Input(shape=(3,)) + x1 = Input(shape=(3,)) + x2 = Input(shape=(3,)) + + l0 = PLMerge() + y = l0([x0, x1]) + + l1 = PLMerge() + y = l1([y, x2]) + + l2 = PLSplit() + z, y = l2(y) + + stage = preprocessing_stage.FunctionalPreprocessingStage( + [x0, [x1, x2]], [y, z] + ) + stage.compile() + one_array = np.ones((4, 3), dtype="float32") + stage.adapt([one_array, [one_array, one_array]]) + self.assertEqual(l0.adapt_count, 1) + self.assertEqual(l1.adapt_count, 1) + self.assertEqual(l2.adapt_count, 1) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Check call + y, z = stage( + [ + tf.ones((4, 3), dtype="float32"), + [ + tf.ones((4, 3), dtype="float32"), + tf.ones((4, 3), dtype="float32"), + ], + ] + ) + self.assertAllClose(y, np.ones((4, 3), dtype="float32") + 1.0) + self.assertAllClose(z, np.ones((4, 3), dtype="float32") + 3.0) + + # Test with dataset + adapt_data = tf.data.Dataset.from_tensor_slices( + (one_array, (one_array, one_array)) + ) + adapt_data = adapt_data.batch(2) # 5 batches of 2 samples + + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 2) + self.assertEqual(l1.adapt_count, 2) + self.assertEqual(l2.adapt_count, 2) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + self.assertLessEqual(l1.adapt_time, l2.adapt_time) + + # Test error with bad data + with self.assertRaisesRegex(ValueError, "requires a "): + stage.adapt(None) + + def test_include_layers_with_dict_input(self): + class PLMergeDict(PLMerge): + def call(self, inputs): + return inputs["a"] + inputs["b"] + + x0 = Input(shape=(3,)) + x1 = Input(shape=(3,)) + + l0 = PLMergeDict() + y = l0({"a": x0, "b": x1}) + + l1 = PLSplit() + z, y = l1(y) + + stage = preprocessing_stage.FunctionalPreprocessingStage( + [x0, x1], [y, z] + ) + stage.compile() + + one_array = np.ones((4, 3), dtype="float32") + adapt_data = tf.data.Dataset.from_tensor_slices((one_array, one_array)) + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 1) + self.assertEqual(l1.adapt_count, 1) + self.assertLessEqual(l0.adapt_time, l1.adapt_time) + + # Check call + y, z = stage( + [tf.ones((4, 3), dtype="float32"), tf.ones((4, 3), dtype="float32")] + ) + self.assertAllClose(y, np.ones((4, 3), dtype="float32")) + self.assertAllClose(z, np.ones((4, 3), dtype="float32") + 2.0) + + def test_include_layers_with_nested_input(self): + class PLMergeNest(PLMerge): + def call(self, inputs): + a = inputs[0] + b = inputs[1][0] + c = inputs[1][1] + return a + b + c + + x0 = Input(shape=(3,)) + x1 = Input(shape=(3,)) + x2 = Input(shape=(3,)) + + l0 = PLMergeNest() + y = l0([x0, [x1, x2]]) + + stage = preprocessing_stage.FunctionalPreprocessingStage( + [x0, x1, x2], y + ) + stage.compile() + + one_array = np.ones((4, 3), dtype="float32") + adapt_data = tf.data.Dataset.from_tensor_slices((one_array,) * 3) + stage.adapt(adapt_data) + self.assertEqual(l0.adapt_count, 1) + + # Check call + y = stage( + [ + tf.ones((4, 3), dtype="float32"), + tf.ones((4, 3), dtype="float32"), + tf.ones((4, 3), dtype="float32"), + ] + ) + self.assertAllClose(y, np.ones((4, 3), dtype="float32") + 2.0) + + def 
test_mixing_preprocessing_and_regular_layers(self): + x0 = Input(shape=(10, 10, 3)) + x1 = Input(shape=(10, 10, 3)) + x2 = Input(shape=(10, 10, 3)) + + y0 = merging.Add()([x0, x1]) + y1 = image_preprocessing.CenterCrop(8, 8)(x2) + y1 = convolutional.ZeroPadding2D(padding=1)(y1) + + z = merging.Add()([y0, y1]) + z = normalization.Normalization()(z) + z = convolutional.Conv2D(4, 3)(z) + + stage = preprocessing_stage.FunctionalPreprocessingStage( + [x0, x1, x2], z + ) + + data = [ + np.ones((12, 10, 10, 3), dtype="float32"), + np.ones((12, 10, 10, 3), dtype="float32"), + np.ones((12, 10, 10, 3), dtype="float32"), ] - ]) - self.assertAllClose(y, np.ones((4, 3), dtype='float32') + 1.) - self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 3.) - - # Test with dataset - adapt_data = tf.data.Dataset.from_tensor_slices( - (one_array, (one_array, one_array))) - adapt_data = adapt_data.batch(2) # 5 batches of 2 samples - - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 2) - self.assertEqual(l1.adapt_count, 2) - self.assertEqual(l2.adapt_count, 2) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - self.assertLessEqual(l1.adapt_time, l2.adapt_time) - - # Test error with bad data - with self.assertRaisesRegex(ValueError, 'requires a '): - stage.adapt(None) - - def test_include_layers_with_dict_input(self): - - class PLMergeDict(PLMerge): - - def call(self, inputs): - return inputs['a'] + inputs['b'] - - x0 = Input(shape=(3,)) - x1 = Input(shape=(3,)) - - l0 = PLMergeDict() - y = l0({'a': x0, 'b': x1}) - - l1 = PLSplit() - z, y = l1(y) - - stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1], [y, z]) - stage.compile() - - one_array = np.ones((4, 3), dtype='float32') - adapt_data = tf.data.Dataset.from_tensor_slices((one_array, one_array)) - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 1) - self.assertEqual(l1.adapt_count, 1) - self.assertLessEqual(l0.adapt_time, l1.adapt_time) - - # Check call - y, z = stage([ - tf.ones((4, 3), dtype='float32'), - tf.ones((4, 3), dtype='float32') - ]) - self.assertAllClose(y, np.ones((4, 3), dtype='float32')) - self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 2.) - - def test_include_layers_with_nested_input(self): - - class PLMergeNest(PLMerge): - - def call(self, inputs): - a = inputs[0] - b = inputs[1][0] - c = inputs[1][1] - return a + b + c - - x0 = Input(shape=(3,)) - x1 = Input(shape=(3,)) - x2 = Input(shape=(3,)) - - l0 = PLMergeNest() - y = l0([x0, [x1, x2]]) - - stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1, x2], y) - stage.compile() - - one_array = np.ones((4, 3), dtype='float32') - adapt_data = tf.data.Dataset.from_tensor_slices((one_array,) * 3) - stage.adapt(adapt_data) - self.assertEqual(l0.adapt_count, 1) - - # Check call - y = stage([ - tf.ones((4, 3), dtype='float32'), - tf.ones((4, 3), dtype='float32'), - tf.ones((4, 3), dtype='float32') - ]) - self.assertAllClose(y, np.ones((4, 3), dtype='float32') + 2.) 
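The target shape `(12, 8, 8, 4)` used in the `fit`/`evaluate` calls below follows from standard Keras shape rules: `CenterCrop(8, 8)` maps `(10, 10, 3)` to `(8, 8, 3)`, `ZeroPadding2D(padding=1)` restores `(10, 10, 3)` so the second `Add` lines up with `y0`, and `Conv2D(4, 3)` with its default `"valid"` padding yields `(8, 8, 4)`. A minimal standalone sketch of the same shape flow (using the public `tf.keras` layer names rather than this file's internal module aliases):

import tensorflow as tf

x = tf.keras.Input(shape=(10, 10, 3))
y = tf.keras.layers.CenterCrop(8, 8)(x)          # -> (None, 8, 8, 3)
y = tf.keras.layers.ZeroPadding2D(padding=1)(y)  # -> (None, 10, 10, 3)
y = tf.keras.layers.Normalization()(y)           # shape-preserving
y = tf.keras.layers.Conv2D(4, 3)(y)              # "valid" conv -> (None, 8, 8, 4)
print(y.shape)  # (None, 8, 8, 4)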
- - def test_mixing_preprocessing_and_regular_layers(self): - x0 = Input(shape=(10, 10, 3)) - x1 = Input(shape=(10, 10, 3)) - x2 = Input(shape=(10, 10, 3)) - - y0 = merging.Add()([x0, x1]) - y1 = image_preprocessing.CenterCrop(8, 8)(x2) - y1 = convolutional.ZeroPadding2D(padding=1)(y1) - - z = merging.Add()([y0, y1]) - z = normalization.Normalization()(z) - z = convolutional.Conv2D(4, 3)(z) - - stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1, x2], z) - - data = [ - np.ones((12, 10, 10, 3), dtype='float32'), - np.ones((12, 10, 10, 3), dtype='float32'), - np.ones((12, 10, 10, 3), dtype='float32') - ] - - stage.adapt(data) - _ = stage(data) - stage.compile('rmsprop', 'mse') - with self.assertRaisesRegex(ValueError, 'Preprocessing stage'): - stage.fit(data, np.ones((12, 8, 8, 4))) - - ds_x0 = tf.data.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3))) - ds_x1 = tf.data.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3))) - ds_x2 = tf.data.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3))) - ds_x = tf.data.Dataset.zip((ds_x0, ds_x1, ds_x2)) - ds_y = tf.data.Dataset.from_tensor_slices(np.ones((12, 8, 8, 4))) - dataset = tf.data.Dataset.zip((ds_x, ds_y)).batch(4) - - with self.assertRaisesRegex(ValueError, 'Preprocessing stage'): - stage.fit(dataset) - _ = stage.evaluate(data, np.ones((12, 8, 8, 4))) - _ = stage.predict(data) - - -if __name__ == '__main__': - tf.test.main() + + stage.adapt(data) + _ = stage(data) + stage.compile("rmsprop", "mse") + with self.assertRaisesRegex(ValueError, "Preprocessing stage"): + stage.fit(data, np.ones((12, 8, 8, 4))) + + ds_x0 = tf.data.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3))) + ds_x1 = tf.data.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3))) + ds_x2 = tf.data.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3))) + ds_x = tf.data.Dataset.zip((ds_x0, ds_x1, ds_x2)) + ds_y = tf.data.Dataset.from_tensor_slices(np.ones((12, 8, 8, 4))) + dataset = tf.data.Dataset.zip((ds_x, ds_y)).batch(4) + + with self.assertRaisesRegex(ValueError, "Preprocessing stage"): + stage.fit(dataset) + _ = stage.evaluate(data, np.ones((12, 8, 8, 4))) + _ = stage.predict(data) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/preprocessing/preprocessing_stage_test.py b/keras/layers/preprocessing/preprocessing_stage_test.py index b8bfe2692c59..5d183d841648 100644 --- a/keras/layers/preprocessing/preprocessing_stage_test.py +++ b/keras/layers/preprocessing/preprocessing_stage_test.py @@ -14,70 +14,73 @@ # ============================================================================== """Preprocessing stage tests.""" -import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes - import time + import numpy as np -from keras.testing_infra import test_combinations +import tensorflow.compat.v2 as tf + from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import preprocessing_stage from keras.layers.preprocessing import preprocessing_test_utils +from keras.testing_infra import test_combinations @test_combinations.run_all_keras_modes(always_skip_v1=True) class PreprocessingStageTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_adapt(self): - - class PL(base_preprocessing_layer.PreprocessingLayer): - - def __init__(self, **kwargs): - self.adapt_time = None - self.adapt_count = 0 - super().__init__(**kwargs) - - def adapt(self, data, reset_state=True): - self.adapt_time = time.time() - self.adapt_count += 1 - - def call(self, inputs): - return 
inputs + 1. - - # Test with NumPy array - stage = preprocessing_stage.PreprocessingStage([ - PL(), - PL(), - PL(), - ]) - stage.adapt(np.ones((3, 4))) - self.assertEqual(stage.layers[0].adapt_count, 1) - self.assertEqual(stage.layers[1].adapt_count, 1) - self.assertEqual(stage.layers[2].adapt_count, 1) - self.assertLessEqual(stage.layers[0].adapt_time, stage.layers[1].adapt_time) - self.assertLessEqual(stage.layers[1].adapt_time, stage.layers[2].adapt_time) - - # Check call - y = stage(tf.ones((3, 4))) - self.assertAllClose(y, np.ones((3, 4)) + 3.) - - # Test with dataset - adapt_data = tf.data.Dataset.from_tensor_slices(np.ones((3, 10))) - adapt_data = adapt_data.batch(2) # 5 batches of 2 samples - - stage.adapt(adapt_data) - self.assertEqual(stage.layers[0].adapt_count, 2) - self.assertEqual(stage.layers[1].adapt_count, 2) - self.assertEqual(stage.layers[2].adapt_count, 2) - self.assertLess(stage.layers[0].adapt_time, stage.layers[1].adapt_time) - self.assertLess(stage.layers[1].adapt_time, stage.layers[2].adapt_time) - - # Test error with bad data - with self.assertRaisesRegex(ValueError, 'requires a '): - stage.adapt(None) - - -if __name__ == '__main__': - tf.test.main() + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_adapt(self): + class PL(base_preprocessing_layer.PreprocessingLayer): + def __init__(self, **kwargs): + self.adapt_time = None + self.adapt_count = 0 + super().__init__(**kwargs) + + def adapt(self, data, reset_state=True): + self.adapt_time = time.time() + self.adapt_count += 1 + + def call(self, inputs): + return inputs + 1.0 + + # Test with NumPy array + stage = preprocessing_stage.PreprocessingStage( + [ + PL(), + PL(), + PL(), + ] + ) + stage.adapt(np.ones((3, 4))) + self.assertEqual(stage.layers[0].adapt_count, 1) + self.assertEqual(stage.layers[1].adapt_count, 1) + self.assertEqual(stage.layers[2].adapt_count, 1) + self.assertLessEqual( + stage.layers[0].adapt_time, stage.layers[1].adapt_time + ) + self.assertLessEqual( + stage.layers[1].adapt_time, stage.layers[2].adapt_time + ) + + # Check call + y = stage(tf.ones((3, 4))) + self.assertAllClose(y, np.ones((3, 4)) + 3.0) + + # Test with dataset + adapt_data = tf.data.Dataset.from_tensor_slices(np.ones((3, 10))) + adapt_data = adapt_data.batch(2) # 5 batches of 2 samples + + stage.adapt(adapt_data) + self.assertEqual(stage.layers[0].adapt_count, 2) + self.assertEqual(stage.layers[1].adapt_count, 2) + self.assertEqual(stage.layers[2].adapt_count, 2) + self.assertLess(stage.layers[0].adapt_time, stage.layers[1].adapt_time) + self.assertLess(stage.layers[1].adapt_time, stage.layers[2].adapt_time) + + # Test error with bad data + with self.assertRaisesRegex(ValueError, "requires a "): + stage.adapt(None) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/preprocessing/preprocessing_test_utils.py b/keras/layers/preprocessing/preprocessing_test_utils.py index ae5366c1a4ae..8862241e4f1b 100644 --- a/keras/layers/preprocessing/preprocessing_test_utils.py +++ b/keras/layers/preprocessing/preprocessing_test_utils.py @@ -15,156 +15,189 @@ """Tests utils for preprocessing layers.""" import collections + import numpy as np import tensorflow.compat.v2 as tf class ArrayLike: + def __init__(self, values): + self.values = values - def __init__(self, values): - self.values = values - - def __array__(self): - return np.array(self.values) + def __array__(self): + return np.array(self.values) class PreprocessingLayerTest(tf.test.TestCase): - """Base test class for 
preprocessing layer API validation.""" - # TODO(b/137303934): Consider incorporating something like this Close vs All - # behavior into core tf.test.TestCase. - - def assertAllCloseOrEqual(self, a, b, msg=None): - """Asserts that elements are close (if numeric) or equal (if string).""" - if a is None or b is None: - self.assertAllEqual(a, b, msg=msg) - elif isinstance(a, (list, tuple)): - self.assertEqual(len(a), len(b)) - for a_value, b_value in zip(a, b): - self.assertAllCloseOrEqual(a_value, b_value, msg=msg) - elif isinstance(a, collections.abc.Mapping): - self.assertEqual(len(a), len(b)) - for key, a_value in a.items(): - b_value = b[key] - error_message = "{} ({})".format(msg, key) if msg else None - self.assertAllCloseOrEqual(a_value, b_value, error_message) - elif (isinstance(a, float) or - hasattr(a, "dtype") and np.issubdtype(a.dtype, np.number)): - self.assertAllClose(a, b, msg=msg) - else: - self.assertAllEqual(a, b, msg=msg) - - def assert_extracted_output_equal(self, combiner, acc1, acc2, msg=None): - data_1 = combiner.extract(acc1) - data_2 = combiner.extract(acc2) - self.assertAllCloseOrEqual(data_1, data_2, msg=msg) - - # This is an injection seam so that tests like TextVectorizationTest can - # define their own methods for asserting that accumulators are equal. - compare_accumulators = assertAllCloseOrEqual - - def validate_accumulator_computation(self, combiner, data, expected): - """Validate that various combinations of compute and merge are identical.""" - if len(data) < 4: - raise AssertionError( - f"Data must have at least 4 elements. Received " - f"len(data)={len(data)}.") - data_0 = np.array([data[0]]) - data_1 = np.array([data[1]]) - data_2 = np.array(data[2:]) - - single_compute = combiner.compute(data) - - all_merge = combiner.merge([ - combiner.compute(data_0), - combiner.compute(data_1), - combiner.compute(data_2) - ]) - - self.compare_accumulators( - single_compute, - all_merge, - msg="Sharding data should not change the data output.") - - unordered_all_merge = combiner.merge([ - combiner.compute(data_1), - combiner.compute(data_2), - combiner.compute(data_0) - ]) - self.compare_accumulators( - all_merge, - unordered_all_merge, - msg="The order of merge arguments should not change the data " - "output.") - - hierarchical_merge = combiner.merge([ - combiner.compute(data_1), - combiner.merge([combiner.compute(data_2), - combiner.compute(data_0)]) - ]) - self.compare_accumulators( - all_merge, - hierarchical_merge, - msg="Nesting merge arguments should not change the data output.") - - nested_compute = combiner.compute( - data_0, combiner.compute(data_1, combiner.compute(data_2))) - self.compare_accumulators( - all_merge, - nested_compute, - msg="Nesting compute arguments should not change the data output.") - - mixed_compute = combiner.merge([ - combiner.compute(data_0), - combiner.compute(data_1, combiner.compute(data_2)) - ]) - self.compare_accumulators( - all_merge, - mixed_compute, - msg="Mixing merge and compute calls should not change the data " - "output.") - - single_merge = combiner.merge([ - combiner.merge([combiner.compute(data_0)]), - combiner.compute(data_1, combiner.compute(data_2)) - ]) - self.compare_accumulators( - all_merge, - single_merge, - msg="Calling merge with a data length of 1 should not change the data " - "output.") - - self.compare_accumulators( - expected, - all_merge, - msg="Calculated accumulators " - "did not match expected accumulator.") - - def validate_accumulator_extract(self, combiner, data, expected): - """Validate that 
the expected results of computing and extracting.""" - acc = combiner.compute(data) - extracted_data = combiner.extract(acc) - self.assertAllCloseOrEqual(expected, extracted_data) - - def validate_accumulator_extract_and_restore(self, combiner, data, expected): - """Validate that the extract<->restore loop loses no data.""" - acc = combiner.compute(data) - extracted_data = combiner.extract(acc) - restored_acc = combiner.restore(extracted_data) - self.assert_extracted_output_equal(combiner, acc, restored_acc) - self.assertAllCloseOrEqual(expected, combiner.extract(restored_acc)) - - def validate_accumulator_serialize_and_deserialize(self, combiner, data, - expected): - """Validate that the serialize<->deserialize loop loses no data.""" - acc = combiner.compute(data) - serialized_data = combiner.serialize(acc) - deserialized_data = combiner.deserialize(serialized_data) - self.compare_accumulators(acc, deserialized_data) - self.compare_accumulators(expected, deserialized_data) - - def validate_accumulator_uniqueness(self, combiner, data): - """Validate that every call to compute creates a unique accumulator.""" - acc = combiner.compute(data) - acc2 = combiner.compute(data) - self.assertIsNot(acc, acc2) - self.compare_accumulators(acc, acc2) + """Base test class for preprocessing layer API validation.""" + + # TODO(b/137303934): Consider incorporating something like this Close vs All + # behavior into core tf.test.TestCase. + + def assertAllCloseOrEqual(self, a, b, msg=None): + """Asserts that elements are close (if numeric) or equal (if string).""" + if a is None or b is None: + self.assertAllEqual(a, b, msg=msg) + elif isinstance(a, (list, tuple)): + self.assertEqual(len(a), len(b)) + for a_value, b_value in zip(a, b): + self.assertAllCloseOrEqual(a_value, b_value, msg=msg) + elif isinstance(a, collections.abc.Mapping): + self.assertEqual(len(a), len(b)) + for key, a_value in a.items(): + b_value = b[key] + error_message = f"{msg} ({key})" if msg else None + self.assertAllCloseOrEqual(a_value, b_value, error_message) + elif ( + isinstance(a, float) + or hasattr(a, "dtype") + and np.issubdtype(a.dtype, np.number) + ): + self.assertAllClose(a, b, msg=msg) + else: + self.assertAllEqual(a, b, msg=msg) + + def assert_extracted_output_equal(self, combiner, acc1, acc2, msg=None): + data_1 = combiner.extract(acc1) + data_2 = combiner.extract(acc2) + self.assertAllCloseOrEqual(data_1, data_2, msg=msg) + + # This is an injection seam so that tests like TextVectorizationTest can + # define their own methods for asserting that accumulators are equal. + compare_accumulators = assertAllCloseOrEqual + + def validate_accumulator_computation(self, combiner, data, expected): + """Validate that various combinations of compute and merge are + identical.""" + if len(data) < 4: + raise AssertionError( + "Data must have at least 4 elements. Received " + f"len(data)={len(data)}." 
+ ) + data_0 = np.array([data[0]]) + data_1 = np.array([data[1]]) + data_2 = np.array(data[2:]) + + single_compute = combiner.compute(data) + + all_merge = combiner.merge( + [ + combiner.compute(data_0), + combiner.compute(data_1), + combiner.compute(data_2), + ] + ) + + self.compare_accumulators( + single_compute, + all_merge, + msg="Sharding data should not change the data output.", + ) + + unordered_all_merge = combiner.merge( + [ + combiner.compute(data_1), + combiner.compute(data_2), + combiner.compute(data_0), + ] + ) + self.compare_accumulators( + all_merge, + unordered_all_merge, + msg=( + "The order of merge arguments should not change the data " + "output." + ), + ) + + hierarchical_merge = combiner.merge( + [ + combiner.compute(data_1), + combiner.merge( + [combiner.compute(data_2), combiner.compute(data_0)] + ), + ] + ) + self.compare_accumulators( + all_merge, + hierarchical_merge, + msg="Nesting merge arguments should not change the data output.", + ) + + nested_compute = combiner.compute( + data_0, combiner.compute(data_1, combiner.compute(data_2)) + ) + self.compare_accumulators( + all_merge, + nested_compute, + msg="Nesting compute arguments should not change the data output.", + ) + + mixed_compute = combiner.merge( + [ + combiner.compute(data_0), + combiner.compute(data_1, combiner.compute(data_2)), + ] + ) + self.compare_accumulators( + all_merge, + mixed_compute, + msg=( + "Mixing merge and compute calls should not change the data " + "output." + ), + ) + + single_merge = combiner.merge( + [ + combiner.merge([combiner.compute(data_0)]), + combiner.compute(data_1, combiner.compute(data_2)), + ] + ) + self.compare_accumulators( + all_merge, + single_merge, + msg=( + "Calling merge with a data length of 1 should not change " + "the data output." 
+            ),
+        )
+
+        self.compare_accumulators(
+            expected,
+            all_merge,
+            msg="Calculated accumulators did not match expected accumulator.",
+        )
+
+    def validate_accumulator_extract(self, combiner, data, expected):
+        """Validate the expected results of computing and extracting."""
+        acc = combiner.compute(data)
+        extracted_data = combiner.extract(acc)
+        self.assertAllCloseOrEqual(expected, extracted_data)
+
+    def validate_accumulator_extract_and_restore(
+        self, combiner, data, expected
+    ):
+        """Validate that the extract<->restore loop loses no data."""
+        acc = combiner.compute(data)
+        extracted_data = combiner.extract(acc)
+        restored_acc = combiner.restore(extracted_data)
+        self.assert_extracted_output_equal(combiner, acc, restored_acc)
+        self.assertAllCloseOrEqual(expected, combiner.extract(restored_acc))
+
+    def validate_accumulator_serialize_and_deserialize(
+        self, combiner, data, expected
+    ):
+        """Validate that the serialize<->deserialize loop loses no data."""
+        acc = combiner.compute(data)
+        serialized_data = combiner.serialize(acc)
+        deserialized_data = combiner.deserialize(serialized_data)
+        self.compare_accumulators(acc, deserialized_data)
+        self.compare_accumulators(expected, deserialized_data)
+
+    def validate_accumulator_uniqueness(self, combiner, data):
+        """Validate that every call to compute creates a unique accumulator."""
+        acc = combiner.compute(data)
+        acc2 = combiner.compute(data)
+        self.assertIsNot(acc, acc2)
+        self.compare_accumulators(acc, acc2)
diff --git a/keras/layers/preprocessing/preprocessing_utils.py b/keras/layers/preprocessing/preprocessing_utils.py
index 4c60721d7235..b0f7cc94555e 100644
--- a/keras/layers/preprocessing/preprocessing_utils.py
+++ b/keras/layers/preprocessing/preprocessing_utils.py
@@ -14,10 +14,11 @@
# ==============================================================================
"""Utils for preprocessing layers."""

-from keras.utils import tf_utils
import numpy as np
import tensorflow.compat.v2 as tf

+from keras.utils import tf_utils
+
INT = "int"
ONE_HOT = "one_hot"
MULTI_HOT = "multi_hot"
@@ -26,128 +27,140 @@


def ensure_tensor(inputs, dtype=None):
-  """Ensures the input is a Tensor, SparseTensor or RaggedTensor."""
-  if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor, tf.SparseTensor)):
-    inputs = tf.convert_to_tensor(inputs, dtype)
-  if dtype is not None and inputs.dtype != dtype:
-    inputs = tf.cast(inputs, dtype)
-  return inputs
+    """Ensures the input is a Tensor, SparseTensor or RaggedTensor."""
+    if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor, tf.SparseTensor)):
+        inputs = tf.convert_to_tensor(inputs, dtype)
+    if dtype is not None and inputs.dtype != dtype:
+        inputs = tf.cast(inputs, dtype)
+    return inputs


def listify_tensors(x):
-  """Convert any tensors or numpy arrays to lists for config serialization."""
-  if tf.is_tensor(x):
-    x = x.numpy()
-  if isinstance(x, np.ndarray):
-    x = x.tolist()
-  return x
+    """Convert any tensors or numpy arrays to lists for config serialization."""
+    if tf.is_tensor(x):
+        x = x.numpy()
+    if isinstance(x, np.ndarray):
+        x = x.tolist()
+    return x


def sparse_bincount(inputs, depth, binary_output, dtype, count_weights=None):
-  """Apply binary or count encoding to an input and return a sparse tensor."""
-  result = tf.sparse.bincount(
-      inputs,
-      weights=count_weights,
-      minlength=depth,
-      maxlength=depth,
-      axis=-1,
-      binary_output=binary_output)
-  result = tf.cast(result, dtype)
-  if inputs.shape.rank == 1:
-    output_shape = (depth,)
-  else:
-    batch_size = tf.shape(result)[0]
-    output_shape =
(batch_size, depth) - result = tf.SparseTensor( - indices=result.indices, values=result.values, dense_shape=output_shape) - return result + """Apply binary or count encoding to an input and return a sparse tensor.""" + result = tf.sparse.bincount( + inputs, + weights=count_weights, + minlength=depth, + maxlength=depth, + axis=-1, + binary_output=binary_output, + ) + result = tf.cast(result, dtype) + if inputs.shape.rank == 1: + output_shape = (depth,) + else: + batch_size = tf.shape(result)[0] + output_shape = (batch_size, depth) + result = tf.SparseTensor( + indices=result.indices, values=result.values, dense_shape=output_shape + ) + return result def dense_bincount(inputs, depth, binary_output, dtype, count_weights=None): - """Apply binary or count encoding to an input.""" - result = tf.math.bincount( - inputs, - weights=count_weights, - minlength=depth, - maxlength=depth, - dtype=dtype, - axis=-1, - binary_output=binary_output) - if inputs.shape.rank == 1: - result.set_shape(tf.TensorShape((depth,))) - else: - batch_size = inputs.shape.as_list()[0] - result.set_shape(tf.TensorShape((batch_size, depth))) - return result + """Apply binary or count encoding to an input.""" + result = tf.math.bincount( + inputs, + weights=count_weights, + minlength=depth, + maxlength=depth, + dtype=dtype, + axis=-1, + binary_output=binary_output, + ) + if inputs.shape.rank == 1: + result.set_shape(tf.TensorShape((depth,))) + else: + batch_size = inputs.shape.as_list()[0] + result.set_shape(tf.TensorShape((batch_size, depth))) + return result def expand_dims(inputs, axis): - """Expand dims on sparse, ragged, or dense tensors.""" - if tf_utils.is_sparse(inputs): - return tf.sparse.expand_dims(inputs, axis) - else: - return tf.expand_dims(inputs, axis) - - -def encode_categorical_inputs(inputs, - output_mode, - depth, - dtype="float32", - sparse=False, - count_weights=None, - idf_weights=None): - """Encodes categoical inputs according to output_mode.""" - if output_mode == INT: - return tf.identity(tf.cast(inputs, dtype)) - - original_shape = inputs.shape - # In all cases, we should uprank scalar input to a single sample. - if inputs.shape.rank == 0: - inputs = expand_dims(inputs, -1) - # One hot will unprank only if the final output dimension is not already 1. - if output_mode == ONE_HOT: - if inputs.shape[-1] != 1: - inputs = expand_dims(inputs, -1) - - # TODO(b/190445202): remove output rank restriction. - if inputs.shape.rank > 2: - raise ValueError( - f"When output_mode is not `'int'`, maximum supported output rank is 2. " - f"Received output_mode {output_mode} and input shape {original_shape}, " - f"which would result in output rank {inputs.shape.rank}.") - - binary_output = output_mode in (MULTI_HOT, ONE_HOT) - if sparse: - bincounts = sparse_bincount(inputs, depth, binary_output, dtype, - count_weights) - else: - bincounts = dense_bincount(inputs, depth, binary_output, dtype, - count_weights) - - if output_mode != TF_IDF: - return bincounts - - if idf_weights is None: - raise ValueError( - f"When output mode is `'tf_idf'`, idf_weights must be provided. 
" - f"Received: output_mode={output_mode} and idf_weights={idf_weights}") - - if sparse: - value_weights = tf.gather(idf_weights, bincounts.indices[:, -1]) - return tf.SparseTensor(bincounts.indices, - value_weights * bincounts.values, - bincounts.dense_shape) - else: - return tf.multiply(bincounts, idf_weights) + """Expand dims on sparse, ragged, or dense tensors.""" + if tf_utils.is_sparse(inputs): + return tf.sparse.expand_dims(inputs, axis) + else: + return tf.expand_dims(inputs, axis) + + +def encode_categorical_inputs( + inputs, + output_mode, + depth, + dtype="float32", + sparse=False, + count_weights=None, + idf_weights=None, +): + """Encodes categoical inputs according to output_mode.""" + if output_mode == INT: + return tf.identity(tf.cast(inputs, dtype)) + + original_shape = inputs.shape + # In all cases, we should uprank scalar input to a single sample. + if inputs.shape.rank == 0: + inputs = expand_dims(inputs, -1) + # One hot will unprank only if the final output dimension is not already 1. + if output_mode == ONE_HOT: + if inputs.shape[-1] != 1: + inputs = expand_dims(inputs, -1) + + # TODO(b/190445202): remove output rank restriction. + if inputs.shape.rank > 2: + raise ValueError( + "When output_mode is not `'int'`, maximum supported output rank " + f"is 2. Received output_mode {output_mode} and input shape " + f"{original_shape}, " + f"which would result in output rank {inputs.shape.rank}." + ) + + binary_output = output_mode in (MULTI_HOT, ONE_HOT) + if sparse: + bincounts = sparse_bincount( + inputs, depth, binary_output, dtype, count_weights + ) + else: + bincounts = dense_bincount( + inputs, depth, binary_output, dtype, count_weights + ) + + if output_mode != TF_IDF: + return bincounts + + if idf_weights is None: + raise ValueError( + "When output mode is `'tf_idf'`, idf_weights must be provided. 
" + f"Received: output_mode={output_mode} and idf_weights={idf_weights}" + ) + + if sparse: + value_weights = tf.gather(idf_weights, bincounts.indices[:, -1]) + return tf.SparseTensor( + bincounts.indices, + value_weights * bincounts.values, + bincounts.dense_shape, + ) + else: + return tf.multiply(bincounts, idf_weights) def compute_shape_for_encode_categorical(shape, output_mode, depth): - """Computes the output shape of `encode_categorical_inputs`.""" - if output_mode == INT: - return tf.TensorShape(shape) - if not shape: - return tf.TensorShape([depth]) - if output_mode == ONE_HOT and shape[-1] != 1: - return tf.TensorShape(shape + [depth]) - else: - return tf.TensorShape(shape[:-1] + [depth]) + """Computes the output shape of `encode_categorical_inputs`.""" + if output_mode == INT: + return tf.TensorShape(shape) + if not shape: + return tf.TensorShape([depth]) + if output_mode == ONE_HOT and shape[-1] != 1: + return tf.TensorShape(shape + [depth]) + else: + return tf.TensorShape(shape[:-1] + [depth]) diff --git a/keras/layers/preprocessing/preprocessing_utils_test.py b/keras/layers/preprocessing/preprocessing_utils_test.py index 2394f59d5169..5e48a0ca19ff 100644 --- a/keras/layers/preprocessing/preprocessing_utils_test.py +++ b/keras/layers/preprocessing/preprocessing_utils_test.py @@ -14,112 +14,121 @@ # ============================================================================== """Tests for preprocessing utils.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras.layers.preprocessing import preprocessing_utils from keras.testing_infra import test_combinations -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes(always_skip_v1=True) class ListifyTensorsTest(test_combinations.TestCase): + def test_tensor_input(self): + inputs = tf.constant([0, 1, 2, 3, 4]) + outputs = preprocessing_utils.listify_tensors(inputs) + self.assertAllEqual([0, 1, 2, 3, 4], outputs) + self.assertIsInstance(outputs, list) - def test_tensor_input(self): - inputs = tf.constant([0, 1, 2, 3, 4]) - outputs = preprocessing_utils.listify_tensors(inputs) - self.assertAllEqual([0, 1, 2, 3, 4], outputs) - self.assertIsInstance(outputs, list) - - def test_numpy_input(self): - inputs = np.array([0, 1, 2, 3, 4]) - outputs = preprocessing_utils.listify_tensors(inputs) - self.assertAllEqual([0, 1, 2, 3, 4], outputs) - self.assertIsInstance(outputs, list) + def test_numpy_input(self): + inputs = np.array([0, 1, 2, 3, 4]) + outputs = preprocessing_utils.listify_tensors(inputs) + self.assertAllEqual([0, 1, 2, 3, 4], outputs) + self.assertIsInstance(outputs, list) @test_combinations.run_all_keras_modes class EncodeCategoricalInputsTest(test_combinations.TestCase): - - def test_int_encoding(self): - inputs = tf.constant([0, 1, 2]) - outputs = preprocessing_utils.encode_categorical_inputs( - inputs, output_mode='int', depth=4) - self.assertAllEqual([0, 1, 2], outputs) - - @parameterized.named_parameters( - ('sparse', True), - ('dense', False), - ) - def test_one_hot_encoding(self, sparse): - inputs = tf.constant([0, 1, 2]) - outputs = preprocessing_utils.encode_categorical_inputs( - inputs, output_mode='one_hot', depth=4, sparse=sparse) - if sparse: - outputs = tf.sparse.to_dense(outputs) - self.assertAllEqual([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]], outputs) - - @parameterized.named_parameters( - ('sparse', True), - ('dense', False), - ) - def test_multi_hot_encoding(self, sparse): - inputs = tf.constant([0, 1, 2]) - outputs = 
preprocessing_utils.encode_categorical_inputs( - inputs, output_mode='multi_hot', depth=4, sparse=sparse) - if sparse: - outputs = tf.sparse.to_dense(outputs) - self.assertAllEqual([1, 1, 1, 0], outputs) - - @parameterized.named_parameters( - ('sparse', True), - ('dense', False), - ) - def test_count_encoding(self, sparse): - inputs = tf.constant([0, 1, 1, 2, 2, 2]) - outputs = preprocessing_utils.encode_categorical_inputs( - inputs, output_mode='count', depth=4, sparse=sparse) - if sparse: - outputs = tf.sparse.to_dense(outputs) - self.assertAllEqual([1, 2, 3, 0], outputs) - - @parameterized.named_parameters( - ('sparse', True), - ('dense', False), - ) - def test_tf_idf_encoding(self, sparse): - inputs = tf.constant([0, 1, 1, 2, 2, 2]) - outputs = preprocessing_utils.encode_categorical_inputs( - inputs, - output_mode='tf_idf', - depth=4, - sparse=sparse, - idf_weights=[0.1, 1.0, 10.0, 0]) - if sparse: - outputs = tf.sparse.to_dense(outputs) - self.assertAllClose([.1, 2, 30, 0], outputs) - - def test_output_dtype(self): - inputs = tf.constant([0, 1, 2], dtype=tf.dtypes.int32) - outputs = preprocessing_utils.encode_categorical_inputs( - inputs, output_mode='int', depth=4, dtype=tf.dtypes.int64) - self.assertAllEqual(outputs.dtype, tf.dtypes.int64) - outputs = preprocessing_utils.encode_categorical_inputs( - inputs, output_mode='one_hot', depth=4, dtype=tf.dtypes.float64) - self.assertAllEqual(outputs.dtype, tf.dtypes.float64) - - def test_rank_3_output_fails(self): - inputs = tf.constant([[[0]], [[1]], [[2]]]) - with self.assertRaisesRegex(ValueError, - 'maximum supported output rank is 2'): - preprocessing_utils.encode_categorical_inputs(inputs, 'multi_hot', 4, - 'float32') - - def test_tf_idf_output_with_no_weights_fails(self): - inputs = tf.constant([0, 1, 2]) - with self.assertRaisesRegex(ValueError, 'idf_weights must be provided'): - preprocessing_utils.encode_categorical_inputs(inputs, 'tf_idf', 4, - 'float32') - - -if __name__ == '__main__': - tf.test.main() + def test_int_encoding(self): + inputs = tf.constant([0, 1, 2]) + outputs = preprocessing_utils.encode_categorical_inputs( + inputs, output_mode="int", depth=4 + ) + self.assertAllEqual([0, 1, 2], outputs) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), + ) + def test_one_hot_encoding(self, sparse): + inputs = tf.constant([0, 1, 2]) + outputs = preprocessing_utils.encode_categorical_inputs( + inputs, output_mode="one_hot", depth=4, sparse=sparse + ) + if sparse: + outputs = tf.sparse.to_dense(outputs) + self.assertAllEqual([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]], outputs) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), + ) + def test_multi_hot_encoding(self, sparse): + inputs = tf.constant([0, 1, 2]) + outputs = preprocessing_utils.encode_categorical_inputs( + inputs, output_mode="multi_hot", depth=4, sparse=sparse + ) + if sparse: + outputs = tf.sparse.to_dense(outputs) + self.assertAllEqual([1, 1, 1, 0], outputs) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), + ) + def test_count_encoding(self, sparse): + inputs = tf.constant([0, 1, 1, 2, 2, 2]) + outputs = preprocessing_utils.encode_categorical_inputs( + inputs, output_mode="count", depth=4, sparse=sparse + ) + if sparse: + outputs = tf.sparse.to_dense(outputs) + self.assertAllEqual([1, 2, 3, 0], outputs) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), + ) + def test_tf_idf_encoding(self, sparse): + inputs = tf.constant([0, 1, 1, 2, 2, 2]) + outputs = 
preprocessing_utils.encode_categorical_inputs( + inputs, + output_mode="tf_idf", + depth=4, + sparse=sparse, + idf_weights=[0.1, 1.0, 10.0, 0], + ) + if sparse: + outputs = tf.sparse.to_dense(outputs) + self.assertAllClose([0.1, 2, 30, 0], outputs) + + def test_output_dtype(self): + inputs = tf.constant([0, 1, 2], dtype=tf.dtypes.int32) + outputs = preprocessing_utils.encode_categorical_inputs( + inputs, output_mode="int", depth=4, dtype=tf.dtypes.int64 + ) + self.assertAllEqual(outputs.dtype, tf.dtypes.int64) + outputs = preprocessing_utils.encode_categorical_inputs( + inputs, output_mode="one_hot", depth=4, dtype=tf.dtypes.float64 + ) + self.assertAllEqual(outputs.dtype, tf.dtypes.float64) + + def test_rank_3_output_fails(self): + inputs = tf.constant([[[0]], [[1]], [[2]]]) + with self.assertRaisesRegex( + ValueError, "maximum supported output rank is 2" + ): + preprocessing_utils.encode_categorical_inputs( + inputs, "multi_hot", 4, "float32" + ) + + def test_tf_idf_output_with_no_weights_fails(self): + inputs = tf.constant([0, 1, 2]) + with self.assertRaisesRegex(ValueError, "idf_weights must be provided"): + preprocessing_utils.encode_categorical_inputs( + inputs, "tf_idf", 4, "float32" + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/preprocessing/string_lookup.py b/keras/layers/preprocessing/string_lookup.py index b0fd1f01cc6c..5bf7389b8539 100644 --- a/keras/layers/preprocessing/string_lookup.py +++ b/keras/layers/preprocessing/string_lookup.py @@ -14,388 +14,404 @@ # ============================================================================== """Keras string lookup preprocessing layer.""" +import numpy as np import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes -import numpy as np from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import index_lookup + +# isort: off from tensorflow.python.util.tf_export import keras_export @keras_export( "keras.layers.StringLookup", "keras.layers.experimental.preprocessing.StringLookup", - v1=[]) + v1=[], +) class StringLookup(index_lookup.IndexLookup): - """A preprocessing layer which maps string features to integer indices. - - This layer translates a set of arbitrary strings into integer output via a - table-based vocabulary lookup. This layer will perform no splitting or - transformation of input strings. For a layer than can split and tokenize - natural language, see the `TextVectorization` layer. - - The vocabulary for the layer must be either supplied on construction or - learned via `adapt()`. During `adapt()`, the layer will analyze a data set, - determine the frequency of individual strings tokens, and create a vocabulary - from them. If the vocabulary is capped in size, the most frequent tokens will - be used to create the vocabulary and all others will be treated as - out-of-vocabulary (OOV). - - There are two possible output modes for the layer. - When `output_mode` is `"int"`, - input strings are converted to their index in the vocabulary (an integer). - When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`, input strings - are encoded into an array where each dimension corresponds to an element in - the vocabulary. - - The vocabulary can optionally contain a mask token as well as an OOV token - (which can optionally occupy multiple indices in the vocabulary, as set - by `num_oov_indices`). - The position of these tokens in the vocabulary is fixed. 
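# A minimal sketch of the fixed index layout described above, using
# hypothetical tokens and assuming a TF 2.x runtime: with output_mode="int",
# index 0 holds the mask token (if set), the next num_oov_indices slots hold
# OOV, and the supplied vocabulary follows.
import tensorflow as tf

layer = tf.keras.layers.StringLookup(
    vocabulary=["earth", "wind"], mask_token="", num_oov_indices=1
)
print(layer.get_vocabulary())  # ['', '[UNK]', 'earth', 'wind']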
When `output_mode` is - `"int"`, the vocabulary will begin with the mask token (if set), followed by - OOV indices, followed by the rest of the vocabulary. When `output_mode` is - `"multi_hot"`, `"count"`, or `"tf_idf"` the vocabulary will begin with OOV - indices and instances of the mask token will be dropped. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - max_tokens: Maximum size of the vocabulary for this layer. This should only - be specified when adapting the vocabulary or when setting - `pad_to_max_tokens=True`. If None, there is no cap on the size of the - vocabulary. Note that this size includes the OOV and mask tokens. Defaults - to None. - num_oov_indices: The number of out-of-vocabulary tokens to use. If this - value is more than 1, OOV inputs are hashed to determine their OOV value. - If this value is 0, OOV inputs will cause an error when calling the layer. - Defaults to 1. - mask_token: A token that represents masked inputs. When `output_mode` is - `"int"`, the token is included in vocabulary and mapped to index 0. In - other output modes, the token will not appear in the vocabulary and - instances of the mask token in the input will be dropped. If set to None, - no mask term will be added. Defaults to `None`. - oov_token: Only used when `invert` is True. The token to return for OOV - indices. Defaults to `"[UNK]"`. - vocabulary: Optional. Either an array of strings or a string path to a text - file. If passing an array, can pass a tuple, list, 1D numpy array, or 1D - tensor containing the string vocbulary terms. If passing a file path, the - file should contain one line per term in the vocabulary. If this argument - is set, there is no need to `adapt()` the layer. - idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list, 1D - numpy array, or 1D tensor or the same length as the vocabulary, containing - the floating point inverse document frequency weights, which will be - multiplied by per sample term counts for the final `tf_idf` weight. If the - `vocabulary` argument is set, and `output_mode` is `"tf_idf"`, this - argument must be supplied. - invert: Only valid when `output_mode` is `"int"`. If True, this layer will - map indices to vocabulary items instead of mapping vocabulary items to - indices. Default to False. - output_mode: Specification for the output of the layer. Defaults to `"int"`. - Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or - `"tf_idf"` configuring the layer as follows: - - `"int"`: Return the raw integer indices of the input tokens. - - `"one_hot"`: Encodes each individual element in the input into an - array the same size as the vocabulary, containing a 1 at the element - index. If the last dimension is size 1, will encode on that dimension. - If the last dimension is not size 1, will append a new dimension for - the encoded output. - - `"multi_hot"`: Encodes each sample in the input into a single array - the same size as the vocabulary, containing a 1 for each vocabulary - term present in the sample. Treats the last dimension as the sample - dimension, if input shape is (..., sample_length), output shape will - be (..., num_tokens). - - `"count"`: As `"multi_hot"`, but the int array contains a count of the - number of times the token at that index appeared in the sample. - - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to - find the value in each token slot. 
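# A short sketch tying the output modes above back to the bincount-based
# helper refactored earlier in this diff; token ids are hypothetical and the
# patched preprocessing_utils module is assumed importable.
import tensorflow.compat.v2 as tf
from keras.layers.preprocessing import preprocessing_utils

token_ids = tf.constant([0, 1, 1, 3])  # indices already produced by a lookup
for mode in ("multi_hot", "count"):
    out = preprocessing_utils.encode_categorical_inputs(
        token_ids, output_mode=mode, depth=4
    )
    print(mode, out.numpy())  # multi_hot: [1. 1. 0. 1.], count: [1. 2. 0. 1.]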
- For `"int"` output, any shape of input and output is supported. For all - other output modes, currently only output up to rank 2 is supported. - pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`, - `"count"`, or `"tf_idf"`. If True, the output will have its feature axis - padded to `max_tokens` even if the number of unique tokens in the - vocabulary is less than max_tokens, resulting in a tensor of shape - [batch_size, max_tokens] regardless of vocabulary size. Defaults to False. - sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`, - `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a - dense `Tensor`. Defaults to False. - - Examples: - - **Creating a lookup layer with a known vocabulary** - - This example creates a lookup layer with a pre-existing vocabulary. - - >>> vocab = ["a", "b", "c", "d"] - >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) - >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab) - >>> layer(data) - - - **Creating a lookup layer with an adapted vocabulary** - - This example creates a lookup layer and generates the vocabulary by analyzing - the dataset. - - >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) - >>> layer = tf.keras.layers.StringLookup() - >>> layer.adapt(data) - >>> layer.get_vocabulary() - ['[UNK]', 'd', 'z', 'c', 'b', 'a'] - - Note that the OOV token `"[UNK]"` has been added to the vocabulary. - The remaining tokens are sorted by frequency - (`"d"`, which has 2 occurrences, is first) then by inverse sort order. - - >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) - >>> layer = tf.keras.layers.StringLookup() - >>> layer.adapt(data) - >>> layer(data) - - - **Lookups with multiple OOV indices** - - This example demonstrates how to use a lookup layer with multiple OOV indices. - When a layer is created with more than one OOV index, any OOV values are - hashed into the number of OOV buckets, distributing OOV values in a - deterministic fashion across the set. - - >>> vocab = ["a", "b", "c", "d"] - >>> data = tf.constant([["a", "c", "d"], ["m", "z", "b"]]) - >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab, num_oov_indices=2) - >>> layer(data) - - - Note that the output for OOV value 'm' is 0, while the output for OOV value - 'z' is 1. The in-vocab terms have their output index increased by 1 from - earlier examples (a maps to 2, etc) in order to make space for the extra OOV - value. - - **One-hot output** - - Configure the layer with `output_mode='one_hot'`. Note that the first - `num_oov_indices` dimensions in the ont_hot encoding represent OOV values. - - >>> vocab = ["a", "b", "c", "d"] - >>> data = tf.constant(["a", "b", "c", "d", "z"]) - >>> layer = tf.keras.layers.StringLookup( - ... vocabulary=vocab, output_mode='one_hot') - >>> layer(data) - - - **Multi-hot output** - - Configure the layer with `output_mode='multi_hot'`. Note that the first - `num_oov_indices` dimensions in the multi_hot encoding represent OOV values. - - >>> vocab = ["a", "b", "c", "d"] - >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) - >>> layer = tf.keras.layers.StringLookup( - ... vocabulary=vocab, output_mode='multi_hot') - >>> layer(data) - - - **Token count output** - - Configure the layer with `output_mode='count'`. As with multi_hot output, the - first `num_oov_indices` dimensions in the output represent OOV values. 
- - >>> vocab = ["a", "b", "c", "d"] - >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) - >>> layer = tf.keras.layers.StringLookup( - ... vocabulary=vocab, output_mode='count') - >>> layer(data) - - - **TF-IDF output** - - Configure the layer with `output_mode="tf_idf"`. As with multi_hot output, the - first `num_oov_indices` dimensions in the output represent OOV values. - - Each token bin will output `token_count * idf_weight`, where the idf weights - are the inverse document frequency weights per token. These should be provided - along with the vocabulary. Note that the `idf_weight` for OOV values will - default to the average of all idf weights passed in. - - >>> vocab = ["a", "b", "c", "d"] - >>> idf_weights = [0.25, 0.75, 0.6, 0.4] - >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) - >>> layer = tf.keras.layers.StringLookup(output_mode="tf_idf") - >>> layer.set_vocabulary(vocab, idf_weights=idf_weights) - >>> layer(data) - - - To specify the idf weights for oov values, you will need to pass the entire - vocabularly including the leading oov token. - - >>> vocab = ["[UNK]", "a", "b", "c", "d"] - >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4] - >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) - >>> layer = tf.keras.layers.StringLookup(output_mode="tf_idf") - >>> layer.set_vocabulary(vocab, idf_weights=idf_weights) - >>> layer(data) - - - When adapting the layer in `"tf_idf"` mode, each input sample will be - considered a document, and IDF weight per token will be calculated as - `log(1 + num_documents / (1 + token_document_count))`. - - **Inverse lookup** - - This example demonstrates how to map indices to strings using this layer. (You - can also use `adapt()` with `inverse=True`, but for simplicity we'll pass the - vocab in this example.) - - >>> vocab = ["a", "b", "c", "d"] - >>> data = tf.constant([[1, 3, 4], [4, 0, 2]]) - >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True) - >>> layer(data) - - - Note that the first index correspond to the oov token by default. - - - **Forward and inverse lookup pairs** - - This example demonstrates how to use the vocabulary of a standard lookup - layer to create an inverse lookup layer. - - >>> vocab = ["a", "b", "c", "d"] - >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) - >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab) - >>> i_layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True) - >>> int_data = layer(data) - >>> i_layer(int_data) - - - In this example, the input value `"z"` resulted in an output of `"[UNK]"`, - since 1000 was not in the vocabulary - it got represented as an OOV, and all - OOV values are returned as `"[UNK]"` in the inverse layer. Also, note that - for the inverse to work, you must have already set the forward layer - vocabulary either directly or via `adapt()` before calling `get_vocabulary()`. - """ - - def __init__(self, - max_tokens=None, - num_oov_indices=1, - mask_token=None, - oov_token="[UNK]", - vocabulary=None, - idf_weights=None, - encoding=None, - invert=False, - output_mode="int", - sparse=False, - pad_to_max_tokens=False, - **kwargs): - # Legacy versions of the StringLookup layer set layer dtype to string, - # instead of the output type. If we see this, clear it. 
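# A hedged sketch of the legacy-dtype handling implemented just below: a
# dtype="string" kwarg left over from old saved configs is dropped, so the
# layer dtype reflects the output type. The printed value is indicative only.
import tensorflow as tf

layer = tf.keras.layers.StringLookup(vocabulary=["a"], dtype="string")
print(layer.dtype)  # the output dtype (e.g. "int64"), not "string"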
- if "dtype" in kwargs and (kwargs["dtype"] == tf.string or - kwargs["dtype"] == "string"): - del kwargs["dtype"] - - if encoding is None: - encoding = "utf-8" - - self.encoding = encoding - - super().__init__( - max_tokens=max_tokens, - num_oov_indices=num_oov_indices, - mask_token=mask_token, - oov_token=oov_token, - vocabulary=vocabulary, - vocabulary_dtype=tf.string, - idf_weights=idf_weights, - invert=invert, - output_mode=output_mode, - sparse=sparse, - pad_to_max_tokens=pad_to_max_tokens, - **kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell("StringLookup").set(True) - - def get_config(self): - config = {"encoding": self.encoding} - base_config = super().get_config() - # There is only one valid dtype for strings, so we don't expose this. - del base_config["vocabulary_dtype"] - return dict(list(base_config.items()) + list(config.items())) - - # We override this method solely to generate a docstring. - def adapt(self, data, batch_size=None, steps=None): - """Computes a vocabulary of string terms from tokens in a dataset. - - Calling `adapt()` on a `StringLookup` layer is an alternative to passing in - a precomputed vocabulary on construction via the `vocabulary` argument. A - `StringLookup` layer should always be either adapted over a dataset or - supplied with a vocabulary. - - During `adapt()`, the layer will build a vocabulary of all string tokens - seen in the dataset, sorted by occurance count, with ties broken by sort - order of the tokens (high to low). At the end of `adapt()`, if `max_tokens` - is set, the voculary wil be truncated to `max_tokens` size. For example, - adapting a layer with `max_tokens=1000` will compute the 1000 most frequent - tokens occurring in the input dataset. If `output_mode='tf-idf'`, `adapt()` - will also learn the document frequencies of each token in the input dataset. - - In order to make `StringLookup` efficient in any distribution context, the - vocabulary is kept static with respect to any compiled `tf.Graph`s that - call the layer. As a consequence, if the layer is adapted a second time, - any models using the layer should be re-compiled. For more information - see `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. - - `adapt()` is meant only as a single machine utility to compute layer state. - To analyze a dataset that cannot fit on a single machine, see - [Tensorflow Transform](https://www.tensorflow.org/tfx/transform/get_started) - for a multi-machine, map-reduce solution. - - Arguments: - data: The data to train on. It can be passed either as a - `tf.data.Dataset`, or as a numpy array. - batch_size: Integer or `None`. - Number of samples per state update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - steps: Integer or `None`. - Total number of steps (batches of samples) - When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps' is None, the epoch will run until - the input dataset is exhausted. When passing an infinitely - repeating dataset, you must specify the `steps` argument. This - argument is not supported with array inputs. + """A preprocessing layer which maps string features to integer indices. 
+ + This layer translates a set of arbitrary strings into integer output via a + table-based vocabulary lookup. This layer will perform no splitting or + transformation of input strings. For a layer that can split and tokenize + natural language, see the `tf.keras.layers.TextVectorization` layer. + + The vocabulary for the layer must be either supplied on construction or + learned via `adapt()`. During `adapt()`, the layer will analyze a data set, + determine the frequency of individual string tokens, and create a + vocabulary from them. If the vocabulary is capped in size, the most frequent + tokens will be used to create the vocabulary and all others will be treated + as out-of-vocabulary (OOV). + + There are two possible output modes for the layer. + When `output_mode` is `"int"`, + input strings are converted to their index in the vocabulary (an integer). + When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`, input strings + are encoded into an array where each dimension corresponds to an element in + the vocabulary. + + The vocabulary can optionally contain a mask token as well as an OOV token + (which can optionally occupy multiple indices in the vocabulary, as set + by `num_oov_indices`). + The position of these tokens in the vocabulary is fixed. When `output_mode` + is `"int"`, the vocabulary will begin with the mask token (if set), followed + by OOV indices, followed by the rest of the vocabulary. When `output_mode` + is `"multi_hot"`, `"count"`, or `"tf_idf"` the vocabulary will begin with + OOV indices and instances of the mask token will be dropped. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Args: + max_tokens: Maximum size of the vocabulary for this layer. This should + only be specified when adapting the vocabulary or when setting + `pad_to_max_tokens=True`. If None, there is no cap on the size of the + vocabulary. Note that this size includes the OOV and mask tokens. + Defaults to `None`. + num_oov_indices: The number of out-of-vocabulary tokens to use. If this + value is more than 1, OOV inputs are hashed to determine their OOV + value. If this value is 0, OOV inputs will cause an error when calling + the layer. Defaults to `1`. + mask_token: A token that represents masked inputs. When `output_mode` is + `"int"`, the token is included in the vocabulary and mapped to index 0. In + other output modes, the token will not appear in the vocabulary and + instances of the mask token in the input will be dropped. If set to + None, no mask term will be added. Defaults to `None`. + oov_token: Only used when `invert` is True. The token to return for OOV + indices. Defaults to `"[UNK]"`. + vocabulary: Optional. Either an array of strings or a string path to a + text file. If passing an array, can pass a tuple, list, 1D numpy array, + or 1D tensor containing the string vocabulary terms. If passing a file + path, the file should contain one line per term in the vocabulary. If + this argument is set, there is no need to `adapt()` the layer. + idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list, + 1D numpy array, or 1D tensor of the same length as the vocabulary, + containing the floating point inverse document frequency weights, which + will be multiplied by per sample term counts for the final `tf_idf` + weight. If the `vocabulary` argument is set, and `output_mode` is + `"tf_idf"`, this argument must be supplied. 
+ invert: Only valid when `output_mode` is `"int"`. If True, this layer will + map indices to vocabulary items instead of mapping vocabulary items to + indices. Defaults to `False`. + output_mode: Specification for the output of the layer. Values can be + `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"` + configuring the layer as follows: + - `"int"`: Return the raw integer indices of the input tokens. + - `"one_hot"`: Encodes each individual element in the input into an + array the same size as the vocabulary, containing a 1 at the element + index. If the last dimension is size 1, will encode on that + dimension. If the last dimension is not size 1, will append a new + dimension for the encoded output. + - `"multi_hot"`: Encodes each sample in the input into a single array + the same size as the vocabulary, containing a 1 for each vocabulary + term present in the sample. Treats the last dimension as the sample + dimension, if input shape is (..., sample_length), output shape will + be (..., num_tokens). + - `"count"`: As `"multi_hot"`, but the int array contains a count of + the number of times the token at that index appeared in the sample. + - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to + find the value in each token slot. + For `"int"` output, any shape of input and output is supported. For all + other output modes, currently only output up to rank 2 is supported. + Defaults to `"int"`. + pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`, + `"count"`, or `"tf_idf"`. If True, the output will have its feature axis + padded to `max_tokens` even if the number of unique tokens in the + vocabulary is less than max_tokens, resulting in a tensor of shape + [batch_size, max_tokens] regardless of vocabulary size. Defaults to + `False`. + sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`, + `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a + dense `Tensor`. Defaults to `False`. + encoding: Optional. The text encoding to use to interpret the input + strings. Defaults to `"utf-8"`. + + Examples: + + **Creating a lookup layer with a known vocabulary** + + This example creates a lookup layer with a pre-existing vocabulary. + + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) + >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab) + >>> layer(data) + + + **Creating a lookup layer with an adapted vocabulary** + + This example creates a lookup layer and generates the vocabulary by + analyzing the dataset. + + >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) + >>> layer = tf.keras.layers.StringLookup() + >>> layer.adapt(data) + >>> layer.get_vocabulary() + ['[UNK]', 'd', 'z', 'c', 'b', 'a'] + + Note that the OOV token `"[UNK]"` has been added to the vocabulary. + The remaining tokens are sorted by frequency + (`"d"`, which has 2 occurrences, is first) then by inverse sort order. + + >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) + >>> layer = tf.keras.layers.StringLookup() + >>> layer.adapt(data) + >>> layer(data) + + + **Lookups with multiple OOV indices** + + This example demonstrates how to use a lookup layer with multiple OOV + indices. When a layer is created with more than one OOV index, any OOV + values are hashed into the number of OOV buckets, distributing OOV values in + a deterministic fashion across the set. 
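# A small sketch of the `sparse` and `pad_to_max_tokens` arguments documented
# above, with hypothetical shapes: the feature axis is padded out to
# max_tokens and a tf.SparseTensor is returned instead of a dense Tensor.
import tensorflow as tf

layer = tf.keras.layers.StringLookup(
    vocabulary=["a", "b"],
    output_mode="multi_hot",
    sparse=True,
    pad_to_max_tokens=True,
    max_tokens=10,
)
out = layer(tf.constant([["a", "b"]]))
print(type(out).__name__, out.dense_shape.numpy())  # SparseTensor [ 1 10]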
+ + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant([["a", "c", "d"], ["m", "z", "b"]]) + >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab, + ... num_oov_indices=2) + >>> layer(data) + + + Note that the output for OOV value 'm' is 0, while the output for OOV value + 'z' is 1. The in-vocab terms have their output index increased by 1 from + earlier examples (a maps to 2, etc) in order to make space for the extra OOV + value. + + **One-hot output** + + Configure the layer with `output_mode='one_hot'`. Note that the first + `num_oov_indices` dimensions in the one_hot encoding represent OOV values. + + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant(["a", "b", "c", "d", "z"]) + >>> layer = tf.keras.layers.StringLookup( + ... vocabulary=vocab, output_mode='one_hot') + >>> layer(data) + + + **Multi-hot output** + + Configure the layer with `output_mode='multi_hot'`. Note that the first + `num_oov_indices` dimensions in the multi_hot encoding represent OOV values. + + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) + >>> layer = tf.keras.layers.StringLookup( + ... vocabulary=vocab, output_mode='multi_hot') + >>> layer(data) + + + **Token count output** + + Configure the layer with `output_mode='count'`. As with multi_hot output, + the first `num_oov_indices` dimensions in the output represent OOV values. + + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) + >>> layer = tf.keras.layers.StringLookup( + ... vocabulary=vocab, output_mode='count') + >>> layer(data) + + + **TF-IDF output** + + Configure the layer with `output_mode="tf_idf"`. As with multi_hot output, + the first `num_oov_indices` dimensions in the output represent OOV values. + + Each token bin will output `token_count * idf_weight`, where the idf weights + are the inverse document frequency weights per token. These should be + provided along with the vocabulary. Note that the `idf_weight` for OOV + values will default to the average of all idf weights passed in. + + >>> vocab = ["a", "b", "c", "d"] + >>> idf_weights = [0.25, 0.75, 0.6, 0.4] + >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) + >>> layer = tf.keras.layers.StringLookup(output_mode="tf_idf") + >>> layer.set_vocabulary(vocab, idf_weights=idf_weights) + >>> layer(data) + + + To specify the idf weights for oov values, you will need to pass the entire + vocabulary including the leading oov token. + + >>> vocab = ["[UNK]", "a", "b", "c", "d"] + >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4] + >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]]) + >>> layer = tf.keras.layers.StringLookup(output_mode="tf_idf") + >>> layer.set_vocabulary(vocab, idf_weights=idf_weights) + >>> layer(data) + + + When adapting the layer in `"tf_idf"` mode, each input sample will be + considered a document, and IDF weight per token will be calculated as + `log(1 + num_documents / (1 + token_document_count))`. + + **Inverse lookup** + + This example demonstrates how to map indices to strings using this layer. + (You can also use `adapt()` with `invert=True`, but for simplicity we'll + pass the vocab in this example.) + + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant([[1, 3, 4], [4, 0, 2]]) + >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True) + >>> layer(data) + + + Note that the first index corresponds to the oov token by default. 
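# The adapt()-time "tf_idf" weighting quoted above, worked by hand on a
# hypothetical two-document corpus; only the documented formula
# log(1 + num_documents / (1 + token_document_count)) is assumed.
import numpy as np

docs = [["a", "c"], ["a", "a", "b"]]  # each sample counts as one document
num_documents = len(docs)
doc_count = {t: sum(t in d for d in docs) for t in ("a", "b", "c")}
idf = {t: np.log(1 + num_documents / (1 + c)) for t, c in doc_count.items()}
print(idf)  # "a" is in both documents -> log(1 + 2/3); "b" and "c" -> log(2)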
+ + + **Forward and inverse lookup pairs** + + This example demonstrates how to use the vocabulary of a standard lookup + layer to create an inverse lookup layer. + + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) + >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab) + >>> i_layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True) + >>> int_data = layer(data) + >>> i_layer(int_data) + + + In this example, the input value `"z"` resulted in an output of `"[UNK]"`, + since `"z"` was not in the vocabulary - it got represented as an OOV, and all + OOV values are returned as `"[UNK]"` in the inverse layer. Also, note that + for the inverse to work, you must have already set the forward layer + vocabulary either directly or via `adapt()` before calling + `get_vocabulary()`. """ + + def __init__( + self, + max_tokens=None, + num_oov_indices=1, + mask_token=None, + oov_token="[UNK]", + vocabulary=None, + idf_weights=None, + encoding="utf-8", + invert=False, + output_mode="int", + sparse=False, + pad_to_max_tokens=False, + **kwargs + ): + # Legacy versions of the StringLookup layer set layer dtype to string, + # instead of the output type. If we see this, clear it. + if "dtype" in kwargs and ( + kwargs["dtype"] == tf.string or kwargs["dtype"] == "string" + ): + del kwargs["dtype"] + + self.encoding = encoding + + super().__init__( + max_tokens=max_tokens, + num_oov_indices=num_oov_indices, + mask_token=mask_token, + oov_token=oov_token, + vocabulary=vocabulary, + vocabulary_dtype=tf.string, + idf_weights=idf_weights, + invert=invert, + output_mode=output_mode, + sparse=sparse, + pad_to_max_tokens=pad_to_max_tokens, + **kwargs + ) + base_preprocessing_layer.keras_kpl_gauge.get_cell("StringLookup").set( + True + ) + + def get_config(self): + config = {"encoding": self.encoding} + base_config = super().get_config() + # There is only one valid dtype for strings, so we don't expose this. + del base_config["vocabulary_dtype"] + return dict(list(base_config.items()) + list(config.items())) + + # We override this method solely to generate a docstring. + def adapt(self, data, batch_size=None, steps=None): + """Computes a vocabulary of string terms from tokens in a dataset. + + Calling `adapt()` on a `StringLookup` layer is an alternative to passing + in a precomputed vocabulary on construction via the `vocabulary` + argument. A `StringLookup` layer should always be either adapted over a + dataset or supplied with a vocabulary. + + During `adapt()`, the layer will build a vocabulary of all string tokens + seen in the dataset, sorted by occurrence count, with ties broken by + sort order of the tokens (high to low). At the end of `adapt()`, if + `max_tokens` is set, the vocabulary will be truncated to `max_tokens` + size. For example, adapting a layer with `max_tokens=1000` will compute + the 1000 most frequent tokens occurring in the input dataset. If + `output_mode='tf_idf'`, `adapt()` will also learn the document + frequencies of each token in the input dataset. + + In order to make `StringLookup` efficient in any distribution context, + the vocabulary is kept static with respect to any compiled `tf.Graph`s + that call the layer. 
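# A minimal sketch of the static-vocabulary caveat just described, with
# hypothetical data: the vocabulary is computed once per adapt() call, so
# graphs compiled against the old table should be rebuilt after re-adapting.
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.StringLookup()
layer.adapt(np.array(["a", "b", "b"]))
print(layer.get_vocabulary())  # ['[UNK]', 'b', 'a']

layer.adapt(np.array(["c", "c", "a"]))  # second adapt: re-compile any users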
As a consequence, if the layer is adapted a second + time, any models using the layer should be re-compiled. For more + information see + `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. + + `adapt()` is meant only as a single machine utility to compute layer + state. To analyze a dataset that cannot fit on a single machine, see + [Tensorflow Transform]( + https://www.tensorflow.org/tfx/transform/get_started) for a + multi-machine, map-reduce solution. + + Arguments: + data: The data to train on. It can be passed either as a + `tf.data.Dataset`, or as a numpy array. + batch_size: Integer or `None`. + Number of samples per state update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` instances + (since they generate batches). + steps: Integer or `None`. + Total number of steps (batches of samples) + When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps' is None, the epoch will run until + the input dataset is exhausted. When passing an infinitely + repeating dataset, you must specify the `steps` argument. This + argument is not supported with array inputs. + """ + super().adapt(data, batch_size=batch_size, steps=steps) + + # Overridden methods from IndexLookup. + def _tensor_vocab_to_numpy(self, vocabulary): + vocabulary = vocabulary.numpy() + return np.array( + [tf.compat.as_text(x, self.encoding) for x in vocabulary] + ) diff --git a/keras/layers/preprocessing/string_lookup_test.py b/keras/layers/preprocessing/string_lookup_test.py index 17ead71db055..0fac8cf28f1d 100644 --- a/keras/layers/preprocessing/string_lookup_test.py +++ b/keras/layers/preprocessing/string_lookup_test.py @@ -14,377 +14,515 @@ # ============================================================================== """Tests for Keras text vectorization preprocessing layer.""" -import tensorflow.compat.v2 as tf - import os -from absl.testing import parameterized + import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.layers.preprocessing import preprocessing_test_utils from keras.layers.preprocessing import string_lookup +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils def _get_end_to_end_test_cases(): - test_cases = ( - { - "testcase_name": "test_strings_soft_vocab_cap", - # Create an array where 'earth' is the most frequent term, followed by - # 'wind', then 'and', then 'fire'. This ensures that the vocab - # accumulator is sorting by frequency. 
- "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": None, - }, - "expected_output": [[1], [2], [3], [4], [4], [3], [1], [0]], - "input_dtype": - tf.string - }, - ) - - crossed_test_cases = [] - # Cross above test cases with use_dataset in (True, False) - for use_dataset in (True, False): - for case in test_cases: - case = case.copy() - if use_dataset: - case["testcase_name"] = case["testcase_name"] + "_with_dataset" - case["use_dataset"] = use_dataset - crossed_test_cases.append(case) - - return crossed_test_cases + test_cases = ( + { + "testcase_name": "test_strings_soft_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed + # by 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": None, + }, + "expected_output": [[1], [2], [3], [4], [4], [3], [1], [0]], + "input_dtype": tf.string, + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases @test_combinations.run_all_keras_modes(always_skip_v1=True) -class StringLookupLayerTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - @parameterized.named_parameters(*_get_end_to_end_test_cases()) - def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, - use_dataset, expected_output, - input_dtype): - cls = string_lookup.StringLookup - expected_output_dtype = tf.int64 - input_shape = input_data.shape - - if use_dataset: - # Keras APIs expect batched datasets. - # TODO(rachelim): `model.predict` predicts the result on each - # dataset batch separately, then tries to concatenate the results - # together. When the results have different shapes on the non-concat - # axis (which can happen in the output_mode = INT case for - # StringLookup), the concatenation fails. In real use cases, this may - # not be an issue because users are likely to pipe the preprocessing layer - # into other keras layers instead of predicting it directly. A workaround - # for these unit tests is to have the dataset only contain one batch, so - # no concatenation needs to happen with the result. For consistency with - # numpy input, we should make `predict` join differently shaped results - # together sensibly, with 0 padding. 
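# The single-batch workaround from the TODO above, shown in isolation with
# hypothetical data: batching the whole array as one batch avoids
# concatenating differently shaped per-batch predictions.
import numpy as np
import tensorflow as tf

input_data = np.array([["a"], ["b"], ["c"]])
ds = tf.data.Dataset.from_tensor_slices(input_data).batch(input_data.shape[0])
print(next(iter(ds)).shape)  # (3, 1): the dataset yields exactly one batch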
- input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( - input_shape[0]) - vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( - input_shape[0]) - - output_data = test_utils.layer_test( - cls, - kwargs=kwargs, - input_shape=input_shape, - input_data=input_data, - input_dtype=input_dtype, - expected_output_dtype=expected_output_dtype, - validate_training=False, - adapt_data=vocab_data) - self.assertAllClose(expected_output, output_data) +class StringLookupLayerTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt( + self, + vocab_data, + input_data, + kwargs, + use_dataset, + expected_output, + input_dtype, + ): + cls = string_lookup.StringLookup + expected_output_dtype = tf.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # StringLookup), the concatenation fails. In real use cases, this + # may not be an issue because users are likely to pipe the + # preprocessing layer into other keras layers instead of predicting + # it directly. A workaround for these unit tests is to have the + # dataset only contain one batch, so no concatenation needs to + # happen with the result. For consistency with numpy input, we + # should make `predict` join differently shaped results together + # sensibly, with 0 padding. + input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( + input_shape[0] + ) + vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0] + ) + + output_data = test_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data, + ) + self.assertAllClose(expected_output, output_data) @test_combinations.run_all_keras_modes(always_skip_v1=True) -class StringLookupVocabularyTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - def test_int_output_explicit_vocab(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup(vocabulary=vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_int_output_explicit_vocab_with_special_tokens(self): - vocab_data = ["", "[UNK]", "earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = 
string_lookup.StringLookup(vocabulary=vocab_data, mask_token="") - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_int_output_no_oov(self): - vocab_data = ["earth", "wind", "and", "fire"] - valid_input = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - invalid_input = np.array([["earth", "wind", "and", "michigan"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup( - vocabulary=vocab_data, mask_token="", num_oov_indices=0) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(valid_input) - self.assertAllEqual(expected_output, output_data) - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - "found OOV values.*michigan"): - _ = model.predict(invalid_input) - - def test_no_vocab(self): - with self.assertRaisesRegex(RuntimeError, - "you must set the layer's vocabulary"): - layer = string_lookup.StringLookup(output_mode="binary") - layer([["a"]]) - - def test_one_hot_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array(["earth", "wind", "and", "fire", "michigan"]) - expected_output = [ - [0, 1, 0, 0, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1], - [1, 0, 0, 0, 0], - ] - - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = string_lookup.StringLookup( - vocabulary=vocab_data, output_mode="one_hot") - res = layer(input_data) - model = keras.Model(inputs=input_data, outputs=res) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_multi_hot_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[0, 1, 1, 1, 1], [1, 1, 0, 1, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup( - vocabulary=vocab_data, output_mode="multi_hot") - res = layer(input_data) - model = keras.Model(inputs=input_data, outputs=res) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_count_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "earth", "fire", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup( - vocabulary=vocab_data, output_mode="count") - res = layer(input_data) - model = keras.Model(inputs=input_data, outputs=res) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_sparse_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup( - vocabulary=vocab_data, output_mode="multi_hot", sparse=True) - res = layer(input_data) - self.assertTrue(res.__class__.__name__, "SparseKerasTensor") - - def test_get_vocab_returns_str(self): - vocab_data = ["earth", "wind", "and", "fire"] - expected_vocab = ["[UNK]", "earth", "wind", "and", "fire"] - layer = string_lookup.StringLookup(vocabulary=vocab_data) - layer_vocab = layer.get_vocabulary() - 
self.assertAllEqual(expected_vocab, layer_vocab) - self.assertIsInstance(layer_vocab[0], str) - - inverse_layer = string_lookup.StringLookup( - vocabulary=layer.get_vocabulary(), invert=True) - layer_vocab = inverse_layer.get_vocabulary() - self.assertAllEqual(expected_vocab, layer_vocab) - self.assertIsInstance(layer_vocab[0], str) - - def test_int_output_explicit_vocab_from_file(self): - vocab_list = ["earth", "wind", "and", "fire"] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup(vocabulary=vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_int_output_explicit_vocab_from_file_via_setter(self): - vocab_list = ["earth", "wind", "and", "fire"] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup() - layer.set_vocabulary(vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_non_unique_vocab_fails(self): - vocab_data = ["earth", "wind", "and", "fire", "fire"] - with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): - _ = string_lookup.StringLookup(vocabulary=vocab_data) - - def test_non_unique_vocab_from_file_fails(self): - vocab_list = ["earth", "wind", "and", "fire", "earth"] - vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) - with self.assertRaisesRegex( - tf.errors.FailedPreconditionError, - "HashTable has different value for same key.*earth"): - _ = string_lookup.StringLookup(vocabulary=vocab_path) - - def test_inverse_layer(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]]) - expected_output = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = string_lookup.StringLookup( - vocabulary=vocab_data, invert=True, mask_token="") - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_inverse_layer_from_file(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]]) - expected_output = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "[UNK]"]]) - vocab_path = self._write_to_temp_file("vocab_file", vocab_data) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = string_lookup.StringLookup(vocabulary=vocab_path, invert=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_inverse_layer_from_file_with_mask(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([[2, 3, 4, 5], [5, 
4, 2, 0]]) - expected_output = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "[M]"]]) - vocab_path = self._write_to_temp_file("vocab_file", vocab_data) - - input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = string_lookup.StringLookup( - vocabulary=vocab_path, invert=True, mask_token="[M]") - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_forward_backward_explicit_vocab(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "[UNK]"]]) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup(vocabulary=vocab_data) - invert_layer = string_lookup.StringLookup( - vocabulary=vocab_data, invert=True) - int_data = layer(input_data) - out_data = invert_layer(int_data) - model = keras.Model(inputs=input_data, outputs=out_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_forward_backward_adapted_vocab(self): - adapt_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "[UNK]"]]) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = string_lookup.StringLookup() - layer.adapt(adapt_data) - invert_layer = string_lookup.StringLookup( - vocabulary=layer.get_vocabulary(), invert=True) - int_data = layer(input_data) - out_data = invert_layer(int_data) - model = keras.Model(inputs=input_data, outputs=out_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_ragged_string_input_multi_bucket(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = tf.ragged.constant([["earth", "wind", "fire"], - ["fire", "and", "earth", "ohio"]]) - expected_output = [[2, 3, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) - layer = string_lookup.StringLookup(num_oov_indices=2) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_data = model.predict(input_array) - self.assertAllEqual(expected_output, output_data) - - def test_tensor_vocab(self): - vocab_data = ["[UNK]", "wind", "and", "fire"] - vocab_tensor = tf.constant(vocab_data) - layer = string_lookup.StringLookup(vocabulary=vocab_tensor) - returned_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, returned_vocab) - self.assertAllEqual(layer.vocabulary_size(), 4) - fn = tf.function(lambda: layer.set_vocabulary(vocab_tensor)) - with self.assertRaisesRegex(RuntimeError, "Cannot set a tensor vocabulary"): - fn() +class StringLookupVocabularyTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = ["earth", "wind", "and", 
"fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_int_output_explicit_vocab_with_special_tokens(self): + vocab_data = ["", "[UNK]", "earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup(vocabulary=vocab_data, mask_token="") + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_int_output_no_oov(self): + vocab_data = ["earth", "wind", "and", "fire"] + valid_input = np.array( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth", ""]] + ) + invalid_input = np.array( + [ + ["earth", "wind", "and", "michigan"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup( + vocabulary=vocab_data, mask_token="", num_oov_indices=0 + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(valid_input) + self.assertAllEqual(expected_output, output_data) + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, "found OOV values.*michigan" + ): + _ = model.predict(invalid_input) + + def test_no_vocab(self): + with self.assertRaisesRegex( + RuntimeError, "you must set the layer's vocabulary" + ): + layer = string_lookup.StringLookup(output_mode="binary") + layer([["a"]]) + + def test_one_hot_output(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array(["earth", "wind", "and", "fire", "michigan"]) + expected_output = [ + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + [1, 0, 0, 0, 0], + ] + + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = string_lookup.StringLookup( + vocabulary=vocab_data, output_mode="one_hot" + ) + res = layer(input_data) + model = keras.Model(inputs=input_data, outputs=res) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_multi_hot_output(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[0, 1, 1, 1, 1], [1, 1, 0, 1, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup( + vocabulary=vocab_data, output_mode="multi_hot" + ) + res = layer(input_data) + model = keras.Model(inputs=input_data, outputs=res) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_count_output(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "earth", "fire", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]] + + 
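To make the expected `count` outputs asserted above concrete, here is a minimal standalone sketch (using the public `tf.keras.layers.StringLookup` API rather than the test-internal imports) reproducing the arithmetic: column 0 is the OOV slot, and each remaining column counts occurrences of one vocabulary term.

```python
import numpy as np
import tensorflow as tf

vocab = ["earth", "wind", "and", "fire"]
layer = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="count")

batch = np.array(
    [["earth", "earth", "fire", "fire"], ["fire", "and", "earth", "michigan"]]
)
# Column 0 counts OOV hits ("michigan"); columns 1..4 count the vocab terms,
# giving [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]] as in the test above.
print(layer(batch).numpy())
```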
input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup( + vocabulary=vocab_data, output_mode="count" + ) + res = layer(input_data) + model = keras.Model(inputs=input_data, outputs=res) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_sparse_output(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup( + vocabulary=vocab_data, output_mode="multi_hot", sparse=True + ) + res = layer(input_data) + self.assertEqual(res.__class__.__name__, "SparseKerasTensor") + + def test_get_vocab_returns_str(self): + vocab_data = ["earth", "wind", "and", "fire"] + expected_vocab = ["[UNK]", "earth", "wind", "and", "fire"] + layer = string_lookup.StringLookup(vocabulary=vocab_data) + layer_vocab = layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], str) + + inverse_layer = string_lookup.StringLookup( + vocabulary=layer.get_vocabulary(), invert=True + ) + layer_vocab = inverse_layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], str) + + def test_int_output_explicit_vocab_from_file(self): + vocab_list = ["earth", "wind", "and", "fire"] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_int_output_explicit_vocab_from_file_via_setter(self): + vocab_list = ["earth", "wind", "and", "fire"] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup() + layer.set_vocabulary(vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): + _ = string_lookup.StringLookup(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = ["earth", "wind", "and", "fire", "earth"] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with self.assertRaisesRegex( + tf.errors.FailedPreconditionError, + "HashTable has different value for same key.*earth", + ): + _ = string_lookup.StringLookup(vocabulary=vocab_path) + + def test_inverse_layer(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]]) + expected_output = np.array( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth", ""]] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = string_lookup.StringLookup( + vocabulary=vocab_data, invert=True, mask_token="" + ) + int_data =
layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_inverse_layer_from_file(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]]) + expected_output = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "[UNK]"], + ] + ) + vocab_path = self._write_to_temp_file("vocab_file", vocab_data) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = string_lookup.StringLookup(vocabulary=vocab_path, invert=True) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_inverse_layer_from_file_with_mask(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]]) + expected_output = np.array( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth", "[M]"]] + ) + vocab_path = self._write_to_temp_file("vocab_file", vocab_data) + + input_data = keras.Input(shape=(None,), dtype=tf.int64) + layer = string_lookup.StringLookup( + vocabulary=vocab_path, invert=True, mask_token="[M]" + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_forward_backward_explicit_vocab(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "[UNK]"], + ] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup(vocabulary=vocab_data) + invert_layer = string_lookup.StringLookup( + vocabulary=vocab_data, invert=True + ) + int_data = layer(input_data) + out_data = invert_layer(int_data) + model = keras.Model(inputs=input_data, outputs=out_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_forward_backward_adapted_vocab(self): + adapt_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "[UNK]"], + ] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup() + layer.adapt(adapt_data) + invert_layer = string_lookup.StringLookup( + vocabulary=layer.get_vocabulary(), invert=True + ) + int_data = layer(input_data) + out_data = invert_layer(int_data) + model = keras.Model(inputs=input_data, outputs=out_data) + output_data = model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_ragged_string_input_multi_bucket(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = tf.ragged.constant( + [["earth", "wind", "fire"], ["fire", "and", "earth", "ohio"]] + ) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string, ragged=True) + layer = string_lookup.StringLookup(num_oov_indices=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = 
model.predict(input_array) + self.assertAllEqual(expected_output, output_data) + + def test_tensor_vocab(self): + vocab_data = ["[UNK]", "wind", "and", "fire"] + vocab_tensor = tf.constant(vocab_data) + layer = string_lookup.StringLookup(vocabulary=vocab_tensor) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) + self.assertAllEqual(layer.vocabulary_size(), 4) + fn = tf.function(lambda: layer.set_vocabulary(vocab_tensor)) + with self.assertRaisesRegex( + RuntimeError, "Cannot set a tensor vocabulary" + ): + fn() + + @test_utils.run_v2_only() + def test_saving_v3(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array(["earth", "wind", "and", "fire"]) + + # First, with a static vocabulary. + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup(vocabulary=vocab_data) + output = layer(input_data) + model = keras.Model(inputs=input_data, outputs=output) + ref_output = model.predict(input_array) + temp_dir = self.get_temp_dir() + model_path = os.path.join(temp_dir, "mymodel.keras") + model.save(model_path, save_format="keras_v3") + model = keras.models.load_model(model_path) + output = model.predict(input_array) + self.assertAllEqual(output, ref_output) + + # Second, with adapt(). + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup() + layer.adapt(vocab_data) + output = layer(input_data) + model = keras.Model(inputs=input_data, outputs=output) + ref_output = model.predict(input_array) + model.save(model_path, save_format="keras_v3", overwrite=True) + model = keras.models.load_model(model_path) + output = model.predict(input_array) + self.assertAllEqual(output, ref_output) + + # Test TF-IDF + adapt(). + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = string_lookup.StringLookup(output_mode="tf_idf") + layer.adapt(vocab_data) + output = layer(input_data) + model = keras.Model(inputs=input_data, outputs=output) + ref_output = model.predict(input_array) + model.save(model_path, save_format="keras_v3", overwrite=True) + model = keras.models.load_model(model_path) + output = model.predict(input_array) + self.assertAllEqual(output, ref_output) + if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/preprocessing/text_vectorization.py b/keras/layers/preprocessing/text_vectorization.py index 80c77fece698..cd65e21bec4b 100644 --- a/keras/layers/preprocessing/text_vectorization.py +++ b/keras/layers/preprocessing/text_vectorization.py @@ -14,18 +14,20 @@ # ============================================================================== """Keras text vectorization preprocessing layer.""" -# pylint: disable=g-classes-have-attributes +import numpy as np +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import preprocessing_utils as utils from keras.layers.preprocessing import string_lookup -from keras.saving.saved_model import layer_serialization +from keras.saving.legacy.saved_model import layer_serialization +from keras.saving.serialization_lib import deserialize_keras_object from keras.utils import layer_utils from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.util.tf_export import keras_export LOWER_AND_STRIP_PUNCTUATION = "lower_and_strip_punctuation" @@ -49,545 +51,633 @@ @keras_export( "keras.layers.TextVectorization", 
"keras.layers.experimental.preprocessing.TextVectorization", - v1=[]) + v1=[], +) class TextVectorization(base_preprocessing_layer.PreprocessingLayer): - """A preprocessing layer which maps text features to integer sequences. - - This layer has basic options for managing text in a Keras model. It transforms - a batch of strings (one example = one string) into either a list of token - indices (one example = 1D tensor of integer token indices) or a dense - representation (one example = 1D tensor of float values representing data - about the example's tokens). This layer is meant to handle natural language - inputs. To handle simple string inputs (categorical strings or pre-tokenized - strings) see `tf.keras.layers.StringLookup`. - - The vocabulary for the layer must be either supplied on construction or - learned via `adapt()`. When this layer is adapted, it will analyze the - dataset, determine the frequency of individual string values, and create a - vocabulary from them. This vocabulary can have unlimited size or be capped, - depending on the configuration options for this layer; if there are more - unique values in the input than the maximum vocabulary size, the most frequent - terms will be used to create the vocabulary. - - The processing of each example contains the following steps: - - 1. Standardize each example (usually lowercasing + punctuation stripping) - 2. Split each example into substrings (usually words) - 3. Recombine substrings into tokens (usually ngrams) - 4. Index tokens (associate a unique int value with each token) - 5. Transform each example using this index, either into a vector of ints or - a dense float vector. - - Some notes on passing callables to customize splitting and normalization for - this layer: - - 1. Any callable can be passed to this Layer, but if you want to serialize - this object you should only pass functions that are registered Keras - serializables (see `tf.keras.utils.register_keras_serializable` for more - details). - 2. When using a custom callable for `standardize`, the data received - by the callable will be exactly as passed to this layer. The callable - should return a tensor of the same shape as the input. - 3. When using a custom callable for `split`, the data received by the - callable will have the 1st dimension squeezed out - instead of - `[["string to split"], ["another string to split"]]`, the Callable will - see `["string to split", "another string to split"]`. The callable should - return a Tensor with the first dimension containing the split tokens - - in this example, we should see something like `[["string", "to", - "split"], ["another", "string", "to", "split"]]`. This makes the callable - site natively compatible with `tf.strings.split()`. - - For an overview and full list of preprocessing layers, see the preprocessing - [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - max_tokens: Maximum size of the vocabulary for this layer. This should only - be specified when adapting a vocabulary or when setting - `pad_to_max_tokens=True`. Note that this vocabulary - contains 1 OOV token, so the effective number of tokens is `(max_tokens - - 1 - (1 if output_mode == "int" else 0))`. - standardize: Optional specification for standardization to apply to the - input text. Values can be: - - `None`: No standardization. - - `"lower_and_strip_punctuation"`: Text will be lowercased and all - punctuation removed. - - `"lower"`: Text will be lowercased. - - `"strip_punctuation"`: All punctuation will be removed. 
- - Callable: Inputs will passed to the callable function, which should - standardized and returned. - split: Optional specification for splitting the input text. Values can be: - - `None`: No splitting. - - `"whitespace"`: Split on whitespace. - - `"character"`: Split on each unicode character. - - Callable: Standardized inputs will passed to the callable function, - which should split and returned. - ngrams: Optional specification for ngrams to create from the possibly-split - input text. Values can be None, an integer or tuple of integers; passing - an integer will create ngrams up to that integer, and passing a tuple of - integers will create ngrams for the specified values in the tuple. Passing - None means that no ngrams will be created. - output_mode: Optional specification for the output of the layer. Values can - be `"int"`, `"multi_hot"`, `"count"` or `"tf_idf"`, configuring the layer - as follows: - - `"int"`: Outputs integer indices, one integer index per split string - token. When `output_mode == "int"`, 0 is reserved for masked - locations; this reduces the vocab size to - `max_tokens - 2` instead of `max_tokens - 1`. - - `"multi_hot"`: Outputs a single int array per batch, of either - vocab_size or max_tokens size, containing 1s in all elements where the - token mapped to that index exists at least once in the batch item. - - `"count"`: Like `"multi_hot"`, but the int array contains a count of - the number of times the token at that index appeared in the - batch item. - - `"tf_idf"`: Like `"multi_hot"`, but the TF-IDF algorithm is applied to - find the value in each token slot. - For `"int"` output, any shape of input and output is supported. For all - other output modes, currently only rank 1 inputs (and rank 2 outputs after - splitting) are supported. - output_sequence_length: Only valid in INT mode. If set, the output will have - its time dimension padded or truncated to exactly `output_sequence_length` - values, resulting in a tensor of shape - `(batch_size, output_sequence_length)` regardless of how many tokens - resulted from the splitting step. Defaults to None. - pad_to_max_tokens: Only valid in `"multi_hot"`, `"count"`, and `"tf_idf"` - modes. If True, the output will have its feature axis padded to - `max_tokens` even if the number of unique tokens in the vocabulary is less - than max_tokens, resulting in a tensor of shape `(batch_size, max_tokens)` - regardless of vocabulary size. Defaults to False. - vocabulary: Optional. Either an array of strings or a string path to a text - file. If passing an array, can pass a tuple, list, 1D numpy array, or 1D - tensor containing the string vocbulary terms. If passing a file path, the - file should contain one line per term in the vocabulary. If this argument - is set, there is no need to `adapt()` the layer. - idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list, 1D - numpy array, or 1D tensor or the same length as the vocabulary, containing - the floating point inverse document frequency weights, which will be - multiplied by per sample term counts for the final `tf_idf` weight. If the - `vocabulary` argument is set, and `output_mode` is `"tf_idf"`, this - argument must be supplied. - ragged: Boolean. Only applicable to `"int"` output mode. If True, returns a - `RaggedTensor` instead of a dense `Tensor`, where each sequence may have a - different length after string splitting. Defaults to False. - sparse: Boolean. Only applicable to `"multi_hot"`, `"count"`, and - `"tf_idf"` output modes. 
If True, returns a `SparseTensor` instead of a - dense `Tensor`. Defaults to False. - - Example: - - This example instantiates a `TextVectorization` layer that lowercases text, - splits on whitespace, strips punctuation, and outputs integer vocab indices. - - >>> text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"]) - >>> max_features = 5000 # Maximum vocab size. - >>> max_len = 4 # Sequence length to pad the outputs to. - >>> - >>> # Create the layer. - >>> vectorize_layer = tf.keras.layers.TextVectorization( - ... max_tokens=max_features, - ... output_mode='int', - ... output_sequence_length=max_len) - >>> - >>> # Now that the vocab layer has been created, call `adapt` on the text-only - >>> # dataset to create the vocabulary. You don't have to batch, but for large - >>> # datasets this means we're not keeping spare copies of the dataset. - >>> vectorize_layer.adapt(text_dataset.batch(64)) - >>> - >>> # Create the model that uses the vectorize text layer - >>> model = tf.keras.models.Sequential() - >>> - >>> # Start by creating an explicit input layer. It needs to have a shape of - >>> # (1,) (because we need to guarantee that there is exactly one string - >>> # input per batch), and the dtype needs to be 'string'. - >>> model.add(tf.keras.Input(shape=(1,), dtype=tf.string)) - >>> - >>> # The first layer in our model is the vectorization layer. After this - >>> # layer, we have a tensor of shape (batch_size, max_len) containing vocab - >>> # indices. - >>> model.add(vectorize_layer) - >>> - >>> # Now, the model can map strings to integers, and you can add an embedding - >>> # layer to map these integers to learned embeddings. - >>> input_data = [["foo qux bar"], ["qux baz"]] - >>> model.predict(input_data) - array([[2, 1, 4, 0], - [1, 3, 0, 0]]) - - Example: - - This example instantiates a `TextVectorization` layer by passing a list - of vocabulary terms to the layer's `__init__()` method. - - >>> vocab_data = ["earth", "wind", "and", "fire"] - >>> max_len = 4 # Sequence length to pad the outputs to. - >>> - >>> # Create the layer, passing the vocab directly. You can also pass the - >>> # vocabulary arg a path to a file containing one vocabulary word per - >>> # line. - >>> vectorize_layer = tf.keras.layers.TextVectorization( - ... max_tokens=max_features, - ... output_mode='int', - ... output_sequence_length=max_len, - ... vocabulary=vocab_data) - >>> - >>> # Because we've passed the vocabulary directly, we don't need to adapt - >>> # the layer - the vocabulary is already set. The vocabulary contains the - >>> # padding token ('') and OOV token ('[UNK]') as well as the passed tokens. - >>> vectorize_layer.get_vocabulary() - ['', '[UNK]', 'earth', 'wind', 'and', 'fire'] - - """ - - def __init__(self, - max_tokens=None, - standardize="lower_and_strip_punctuation", - split="whitespace", - ngrams=None, - output_mode="int", - output_sequence_length=None, - pad_to_max_tokens=False, - vocabulary=None, - idf_weights=None, - sparse=False, - ragged=False, - **kwargs): - - # This layer only applies to string processing, and so should only have - # a dtype of 'string'. - if "dtype" in kwargs and kwargs["dtype"] != tf.string: - raise ValueError( - f"`TextVectorization` may only have a dtype of string. 
" - f"Received dtype: {kwargs['dtype']}.") - elif "dtype" not in kwargs: - kwargs["dtype"] = tf.string - - # 'standardize' must be one of - # (None, LOWER_AND_STRIP_PUNCTUATION, LOWER, STRIP_PUNCTUATION, callable) - layer_utils.validate_string_arg( - standardize, - allowable_strings=(LOWER_AND_STRIP_PUNCTUATION, LOWER, - STRIP_PUNCTUATION), - layer_name="TextVectorization", - arg_name="standardize", - allow_none=True, - allow_callables=True) - - # 'split' must be one of (None, WHITESPACE, CHARACTER, callable) - layer_utils.validate_string_arg( - split, - allowable_strings=(WHITESPACE, CHARACTER), - layer_name="TextVectorization", - arg_name="split", - allow_none=True, - allow_callables=True) - - # Support deprecated names for output_modes. - if output_mode == "binary": - output_mode = MULTI_HOT - if output_mode == "tf-idf": - output_mode = TF_IDF - # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF) - layer_utils.validate_string_arg( - output_mode, - allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF), - layer_name="TextVectorization", - arg_name="output_mode", - allow_none=True) - - # 'ngrams' must be one of (None, int, tuple(int)) - if not (ngrams is None or - isinstance(ngrams, int) or - isinstance(ngrams, tuple) and - all(isinstance(item, int) for item in ngrams)): - raise ValueError(f"`ngrams` must be None, an integer, or a tuple of " - f"integers. Received: ngrams={ngrams}") - - # 'output_sequence_length' must be one of (None, int) and is only - # set if output_mode is INT. - if (output_mode == INT and not (isinstance(output_sequence_length, int) or - (output_sequence_length is None))): - raise ValueError(f"`output_sequence_length` must be either None or an " - f"integer when `output_mode` is 'int'. Received: " - f"output_sequence_length={output_sequence_length}") - - if output_mode != INT and output_sequence_length is not None: - raise ValueError( - f"`output_sequence_length` must not be set if `output_mode` is not " - f"'int'. Received output_sequence_length={output_sequence_length}.") - - if ragged and output_mode != INT: - raise ValueError(f"`ragged` must not be true if `output_mode` is " - f"`'int'`. Received: ragged={ragged} and " - f"output_mode={output_mode}") - - if ragged and output_sequence_length is not None: - raise ValueError(f"`output_sequence_length` must not be set if ragged " - f"is True. Received: ragged={ragged} and " - f"output_sequence_length={output_sequence_length}") - - self._max_tokens = max_tokens - self._standardize = standardize - self._split = split - self._ngrams_arg = ngrams - if isinstance(ngrams, int): - self._ngrams = tuple(range(1, ngrams + 1)) - else: - self._ngrams = ngrams - self._ragged = ragged - - self._output_mode = output_mode - self._output_sequence_length = output_sequence_length - - # VocabularySavedModelSaver will clear the config vocabulary to restore the - # lookup table ops directly. We persist this hidden option to persist the - # fact that we have have a non-adaptable layer with a manually set vocab. - self._has_input_vocabulary = kwargs.pop("has_input_vocabulary", - (vocabulary is not None)) - - # Drop deprecated config options. 
- kwargs.pop("vocabulary_size", None) - - super().__init__(**kwargs) - base_preprocessing_layer.keras_kpl_gauge.get_cell("TextVectorization").set( - True) - - self._lookup_layer = string_lookup.StringLookup( - max_tokens=max_tokens, - vocabulary=vocabulary, - idf_weights=idf_weights, - pad_to_max_tokens=pad_to_max_tokens, - mask_token="", - output_mode=output_mode if output_mode is not None else INT, - sparse=sparse, - has_input_vocabulary=self._has_input_vocabulary) - - def compute_output_shape(self, input_shape): - if self._output_mode == INT: - return tf.TensorShape([input_shape[0], self._output_sequence_length]) - - if self._split is None: - if len(input_shape) <= 1: - input_shape = tuple(input_shape) + (1,) - else: - input_shape = tuple(input_shape) + (None,) - return self._lookup_layer.compute_output_shape(input_shape) - - def compute_output_signature(self, input_spec): - output_shape = self.compute_output_shape(input_spec.shape.as_list()) - output_dtype = (tf.int64 if self._output_mode == INT - else backend.floatx()) - return tf.TensorSpec(shape=output_shape, dtype=output_dtype) - - # We override this method solely to generate a docstring. - def adapt(self, data, batch_size=None, steps=None): - """Computes a vocabulary of string terms from tokens in a dataset. - - Calling `adapt()` on a `TextVectorization` layer is an alternative to - passing in a precomputed vocabulary on construction via the `vocabulary` - argument. A `TextVectorization` layer should always be either adapted over a - dataset or supplied with a vocabulary. - - During `adapt()`, the layer will build a vocabulary of all string tokens - seen in the dataset, sorted by occurance count, with ties broken by sort - order of the tokens (high to low). At the end of `adapt()`, if `max_tokens` - is set, the vocabulary wil be truncated to `max_tokens` size. For example, - adapting a layer with `max_tokens=1000` will compute the 1000 most frequent - tokens occurring in the input dataset. If `output_mode='tf-idf'`, `adapt()` - will also learn the document frequencies of each token in the input dataset. - - In order to make `TextVectorization` efficient in any distribution context, - the vocabulary is kept static with respect to any compiled `tf.Graph`s that - call the layer. As a consequence, if the layer is adapted a second time, - any models using the layer should be re-compiled. For more information - see `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. - - `adapt()` is meant only as a single machine utility to compute layer state. - To analyze a dataset that cannot fit on a single machine, see - [Tensorflow Transform](https://www.tensorflow.org/tfx/transform/get_started) - for a multi-machine, map-reduce solution. - - Arguments: - data: The data to train on. It can be passed either as a - `tf.data.Dataset`, or as a numpy array. - batch_size: Integer or `None`. - Number of samples per state update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` instances - (since they generate batches). - steps: Integer or `None`. - Total number of steps (batches of samples) - When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps' is None, the epoch will run until - the input dataset is exhausted. 
When passing an infinitely - repeating dataset, you must specify the `steps` argument. This - argument is not supported with array inputs. - """ - super().adapt(data, batch_size=batch_size, steps=steps) - - def update_state(self, data): - self._lookup_layer.update_state(self._preprocess(data)) - - def finalize_state(self): - self._lookup_layer.finalize_state() - - def reset_state(self): # pylint: disable=method-hidden - self._lookup_layer.reset_state() - - def get_vocabulary(self, include_special_tokens=True): - """Returns the current vocabulary of the layer. + """A preprocessing layer which maps text features to integer sequences. + + This layer has basic options for managing text in a Keras model. It + transforms a batch of strings (one example = one string) into either a list + of token indices (one example = 1D tensor of integer token indices) or a + dense representation (one example = 1D tensor of float values representing + data about the example's tokens). This layer is meant to handle natural + language inputs. To handle simple string inputs (categorical strings or + pre-tokenized strings) see `tf.keras.layers.StringLookup`. + + The vocabulary for the layer must be either supplied on construction or + learned via `adapt()`. When this layer is adapted, it will analyze the + dataset, determine the frequency of individual string values, and create a + vocabulary from them. This vocabulary can have unlimited size or be capped, + depending on the configuration options for this layer; if there are more + unique values in the input than the maximum vocabulary size, the most + frequent terms will be used to create the vocabulary. + + The processing of each example contains the following steps: + + 1. Standardize each example (usually lowercasing + punctuation stripping) + 2. Split each example into substrings (usually words) + 3. Recombine substrings into tokens (usually ngrams) + 4. Index tokens (associate a unique int value with each token) + 5. Transform each example using this index, either into a vector of ints or + a dense float vector. + + Some notes on passing callables to customize splitting and normalization for + this layer: + + 1. Any callable can be passed to this Layer, but if you want to serialize + this object you should only pass functions that are registered Keras + serializables (see `tf.keras.saving.register_keras_serializable` for more + details). + 2. When using a custom callable for `standardize`, the data received + by the callable will be exactly as passed to this layer. The callable + should return a tensor of the same shape as the input. + 3. When using a custom callable for `split`, the data received by the + callable will have the 1st dimension squeezed out - instead of + `[["string to split"], ["another string to split"]]`, the Callable will + see `["string to split", "another string to split"]`. The callable should + return a Tensor with the first dimension containing the split tokens - + in this example, we should see something like `[["string", "to", + "split"], ["another", "string", "to", "split"]]`. This makes the callable + site natively compatible with `tf.strings.split()`. + + For an overview and full list of preprocessing layers, see the preprocessing + [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers). Args: - include_special_tokens: If True, the returned vocabulary will include - the padding and OOV tokens, and a term's index in the vocabulary will - equal the term's index when calling the layer. 
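As a quick illustration of the special-token behavior documented here, this sketch against the public `tf.keras.layers.TextVectorization` API shows `get_vocabulary()` with and without the padding and OOV slots (the expected output matches the docstring example in this patch):

```python
import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    vocabulary=["earth", "wind", "and", "fire"]
)
# With special tokens, list positions match the indices the layer emits.
print(layer.get_vocabulary())
# ['', '[UNK]', 'earth', 'wind', 'and', 'fire']
print(layer.get_vocabulary(include_special_tokens=False))
# ['earth', 'wind', 'and', 'fire']
```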
If False, the returned - vocabulary will not include any padding or OOV tokens. - """ - return self._lookup_layer.get_vocabulary(include_special_tokens) - - def vocabulary_size(self): - """Gets the current size of the layer's vocabulary. + max_tokens: Maximum size of the vocabulary for this layer. This should + only be specified when adapting a vocabulary or when setting + `pad_to_max_tokens=True`. Note that this vocabulary + contains 1 OOV token, so the effective number of tokens is + `(max_tokens - 1 - (1 if output_mode == "int" else 0))`. + standardize: Optional specification for standardization to apply to the + input text. Values can be: + - `None`: No standardization. + - `"lower_and_strip_punctuation"`: Text will be lowercased and all + punctuation removed. + - `"lower"`: Text will be lowercased. + - `"strip_punctuation"`: All punctuation will be removed. + - Callable: Inputs will be passed to the callable function, which should + be standardized and returned. + split: Optional specification for splitting the input text. Values can be: + - `None`: No splitting. + - `"whitespace"`: Split on whitespace. + - `"character"`: Split on each unicode character. + - Callable: Standardized inputs will be passed to the callable function, + which should be split and returned. + ngrams: Optional specification for ngrams to create from the + possibly-split input text. Values can be None, an integer or tuple of + integers; passing an integer will create ngrams up to that integer, and + passing a tuple of integers will create ngrams for the specified values + in the tuple. Passing None means that no ngrams will be created. + output_mode: Optional specification for the output of the layer. Values + can be `"int"`, `"multi_hot"`, `"count"` or `"tf_idf"`, configuring the + layer as follows: + - `"int"`: Outputs integer indices, one integer index per split string + token. When `output_mode == "int"`, 0 is reserved for masked + locations; this reduces the vocab size to + `max_tokens - 2` instead of `max_tokens - 1`. + - `"multi_hot"`: Outputs a single int array per batch, of either + vocab_size or max_tokens size, containing 1s in all elements where + the token mapped to that index exists at least once in the batch + item. + - `"count"`: Like `"multi_hot"`, but the int array contains a count of + the number of times the token at that index appeared in the + batch item. + - `"tf_idf"`: Like `"multi_hot"`, but the TF-IDF algorithm is applied + to find the value in each token slot. + For `"int"` output, any shape of input and output is supported. For all + other output modes, currently only rank 1 inputs (and rank 2 outputs + after splitting) are supported. + output_sequence_length: Only valid in INT mode. If set, the output will + have its time dimension padded or truncated to exactly + `output_sequence_length` values, resulting in a tensor of shape + `(batch_size, output_sequence_length)` regardless of how many tokens + resulted from the splitting step. Defaults to `None`. + pad_to_max_tokens: Only valid in `"multi_hot"`, `"count"`, and `"tf_idf"` + modes. If True, the output will have its feature axis padded to + `max_tokens` even if the number of unique tokens in the vocabulary is + less than max_tokens, resulting in a tensor of shape `(batch_size, + max_tokens)` regardless of vocabulary size. Defaults to `False`. + vocabulary: Optional. Either an array of strings or a string path to a + text file.
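The callable contract for `standardize` and `split` described in these Args can be sketched as follows (hypothetical helper names, not part of the patch): a `standardize` callable sees the raw input and must return a tensor of the same shape, while a `split` callable sees a 1D batch of strings and returns ragged tokens, like `tf.strings.split`.

```python
import tensorflow as tf

def strip_html(x):
    # Same-shape in, same-shape out, as the `standardize` contract requires.
    return tf.strings.regex_replace(tf.strings.lower(x), "<[^>]*>", "")

def comma_split(x):
    # Receives a 1D tensor of strings; returns a RaggedTensor of tokens.
    return tf.strings.split(x, sep=",")

layer = tf.keras.layers.TextVectorization(
    standardize=strip_html, split=comma_split
)
layer.adapt(["foo,<b>bar</b>", "bar,baz"])
print(layer(["bar,foo"]))
```

Note that only registered Keras serializables survive saving, per the docstring's note above.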
If passing an array, can pass a tuple, list, 1D numpy array, + or 1D tensor containing the string vocabulary terms. If passing a file + path, the file should contain one line per term in the vocabulary. If + this argument is set, there is no need to `adapt()` the layer. + idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list, + 1D numpy array, or 1D tensor of the same length as the vocabulary, + containing the floating point inverse document frequency weights, which + will be multiplied by per sample term counts for the final `tf_idf` + weight. If the `vocabulary` argument is set, and `output_mode` is + `"tf_idf"`, this argument must be supplied. + ragged: Boolean. Only applicable to `"int"` output mode. If True, returns + a `RaggedTensor` instead of a dense `Tensor`, where each sequence may + have a different length after string splitting. Defaults to `False`. + sparse: Boolean. Only applicable to `"multi_hot"`, `"count"`, and + `"tf_idf"` output modes. If True, returns a `SparseTensor` instead of a + dense `Tensor`. Defaults to `False`. + encoding: Optional. The text encoding to use to interpret the input + strings. Defaults to `"utf-8"`. + + Example: + + This example instantiates a `TextVectorization` layer that lowercases text, + splits on whitespace, strips punctuation, and outputs integer vocab indices. + + >>> text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"]) + >>> max_features = 5000 # Maximum vocab size. + >>> max_len = 4 # Sequence length to pad the outputs to. + >>> + >>> # Create the layer. + >>> vectorize_layer = tf.keras.layers.TextVectorization( + ... max_tokens=max_features, + ... output_mode='int', + ... output_sequence_length=max_len) + >>> + >>> # Now that the vocab layer has been created, call `adapt` on the + >>> # text-only dataset to create the vocabulary. You don't have to batch, + >>> # but for large datasets this means we're not keeping spare copies of + >>> # the dataset. + >>> vectorize_layer.adapt(text_dataset.batch(64)) + >>> + >>> # Create the model that uses the vectorize text layer + >>> model = tf.keras.models.Sequential() + >>> + >>> # Start by creating an explicit input layer. It needs to have a shape of + >>> # (1,) (because we need to guarantee that there is exactly one string + >>> # input per batch), and the dtype needs to be 'string'. + >>> model.add(tf.keras.Input(shape=(1,), dtype=tf.string)) + >>> + >>> # The first layer in our model is the vectorization layer. After this + >>> # layer, we have a tensor of shape (batch_size, max_len) containing + >>> # vocab indices. + >>> model.add(vectorize_layer) + >>> + >>> # Now, the model can map strings to integers, and you can add an + >>> # embedding layer to map these integers to learned embeddings. + >>> input_data = [["foo qux bar"], ["qux baz"]] + >>> model.predict(input_data) + array([[2, 1, 4, 0], + [1, 3, 0, 0]]) + + Example: + + This example instantiates a `TextVectorization` layer by passing a list + of vocabulary terms to the layer's `__init__()` method. + + >>> vocab_data = ["earth", "wind", "and", "fire"] + >>> max_len = 4 # Sequence length to pad the outputs to. + >>> + >>> # Create the layer, passing the vocab directly. You can also pass the + >>> # vocabulary arg a path to a file containing one vocabulary word per + >>> # line. + >>> vectorize_layer = tf.keras.layers.TextVectorization( + ... max_tokens=max_features, + ... output_mode='int', + ... output_sequence_length=max_len, + ... 
vocabulary=vocab_data) + >>> + >>> # Because we've passed the vocabulary directly, we don't need to adapt + >>> # the layer - the vocabulary is already set. The vocabulary contains the + >>> # padding token ('') and OOV token ('[UNK]') as well as the passed + >>> # tokens. + >>> vectorize_layer.get_vocabulary() + ['', '[UNK]', 'earth', 'wind', 'and', 'fire'] - Returns: - The integer size of the vocabulary, including optional mask and - OOV indices. """ - return self._lookup_layer.vocabulary_size() - - def get_config(self): - vocab = self._lookup_layer.input_vocabulary - idf_weights = self._lookup_layer.input_idf_weights - config = { - "max_tokens": self._lookup_layer.max_tokens, - "standardize": self._standardize, - "split": self._split, - "ngrams": self._ngrams_arg, - "output_mode": self._output_mode, - "output_sequence_length": self._output_sequence_length, - "pad_to_max_tokens": self._lookup_layer.pad_to_max_tokens, - "sparse": self._lookup_layer.sparse, - "ragged": self._ragged, - "vocabulary": utils.listify_tensors(vocab), - "idf_weights": utils.listify_tensors(idf_weights), - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def set_vocabulary(self, vocabulary, idf_weights=None): - """Sets vocabulary (and optionally document frequency) data for this layer. - - This method sets the vocabulary and idf weights for this layer directly, - instead of analyzing a dataset through 'adapt'. It should be used whenever - the vocab (and optionally document frequency) information is already known. - If vocabulary data is already present in the layer, this method will replace - it. - Args: - vocabulary: Either an array or a string path to a text file. If passing an - array, can pass a tuple, list, 1D numpy array, or 1D tensor containing - the vocbulary terms. If passing a file path, the file should contain one - line per term in the vocabulary. - idf_weights: A tuple, list, 1D numpy array, or 1D tensor of inverse - document frequency weights with equal length to vocabulary. Must be set - if `output_mode` is `"tf_idf"`. Should not be set otherwise. - - Raises: - ValueError: If there are too many inputs, the inputs do not match, or - input data is missing. - RuntimeError: If the vocabulary cannot be set when this function is - called. This happens when `"multi_hot"`, `"count"`, and "tf_idf" modes, - if `pad_to_max_tokens` is False and the layer itself has already been - called. - """ - self._lookup_layer.set_vocabulary(vocabulary, idf_weights=idf_weights) - - def _preprocess(self, inputs): - inputs = utils.ensure_tensor(inputs, dtype=tf.string) - if self._standardize in (LOWER, LOWER_AND_STRIP_PUNCTUATION): - inputs = tf.strings.lower(inputs) - if self._standardize in (STRIP_PUNCTUATION, LOWER_AND_STRIP_PUNCTUATION): - inputs = tf.strings.regex_replace(inputs, DEFAULT_STRIP_REGEX, "") - if callable(self._standardize): - inputs = self._standardize(inputs) - - if self._split is not None: - # If we are splitting, we validate that the 1st axis is of dimension 1 and - # so can be squeezed out. We do this here instead of after splitting for - # performance reasons - it's more expensive to squeeze a ragged tensor. - if inputs.shape.rank > 1: - if inputs.shape[-1] != 1: - raise ValueError( - "When using `TextVectorization` to tokenize strings, the input " - "rank must be 1 or the last shape dimension must be 1. 
Received: " - f"inputs.shape={inputs.shape} with rank={inputs.shape.rank}") + def __init__( + self, + max_tokens=None, + standardize="lower_and_strip_punctuation", + split="whitespace", + ngrams=None, + output_mode="int", + output_sequence_length=None, + pad_to_max_tokens=False, + vocabulary=None, + idf_weights=None, + sparse=False, + ragged=False, + encoding="utf-8", + **kwargs, + ): + + # This layer only applies to string processing, and so should only have + # a dtype of 'string'. + if "dtype" in kwargs and kwargs["dtype"] != tf.string: + raise ValueError( + "`TextVectorization` may only have a dtype of string. " + f"Received dtype: {kwargs['dtype']}." + ) + elif "dtype" not in kwargs: + kwargs["dtype"] = tf.string + + # 'standardize' must be one of + # (None, LOWER_AND_STRIP_PUNCTUATION, LOWER, STRIP_PUNCTUATION, + # callable) + layer_utils.validate_string_arg( + standardize, + allowable_strings=( + LOWER_AND_STRIP_PUNCTUATION, + LOWER, + STRIP_PUNCTUATION, + ), + layer_name="TextVectorization", + arg_name="standardize", + allow_none=True, + allow_callables=True, + ) + + # 'split' must be one of (None, WHITESPACE, CHARACTER, callable) + layer_utils.validate_string_arg( + split, + allowable_strings=(WHITESPACE, CHARACTER), + layer_name="TextVectorization", + arg_name="split", + allow_none=True, + allow_callables=True, + ) + + # Support deprecated names for output_modes. + if output_mode == "binary": + output_mode = MULTI_HOT + if output_mode == "tf-idf": + output_mode = TF_IDF + # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF) + layer_utils.validate_string_arg( + output_mode, + allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF), + layer_name="TextVectorization", + arg_name="output_mode", + allow_none=True, + ) + + # 'ngrams' must be one of (None, int, tuple(int)) + if not ( + ngrams is None + or isinstance(ngrams, int) + or isinstance(ngrams, tuple) + and all(isinstance(item, int) for item in ngrams) + ): + raise ValueError( + "`ngrams` must be None, an integer, or a tuple of " + f"integers. Received: ngrams={ngrams}" + ) + + # 'output_sequence_length' must be one of (None, int) and is only + # set if output_mode is INT. + if output_mode == INT and not ( + isinstance(output_sequence_length, int) + or (output_sequence_length is None) + ): + raise ValueError( + "`output_sequence_length` must be either None or an " + "integer when `output_mode` is 'int'. Received: " + f"output_sequence_length={output_sequence_length}" + ) + + if output_mode != INT and output_sequence_length is not None: + raise ValueError( + "`output_sequence_length` must not be set if `output_mode` is " + "not 'int'. " + f"Received output_sequence_length={output_sequence_length}." + ) + + if ragged and output_mode != INT: + raise ValueError( + "`ragged` must not be true if `output_mode` is " + f"`'int'`. Received: ragged={ragged} and " + f"output_mode={output_mode}" + ) + + if ragged and output_sequence_length is not None: + raise ValueError( + "`output_sequence_length` must not be set if ragged " + f"is True. 
Received: ragged={ragged} and " + f"output_sequence_length={output_sequence_length}" + ) + + self._max_tokens = max_tokens + self._standardize = standardize + self._split = split + self._ngrams_arg = ngrams + if isinstance(ngrams, int): + self._ngrams = tuple(range(1, ngrams + 1)) + else: + self._ngrams = ngrams + self._ragged = ragged + + self._output_mode = output_mode + self._output_sequence_length = output_sequence_length + self._encoding = encoding + + # VocabularySavedModelSaver will clear the config vocabulary to restore + # the lookup table ops directly. We persist this hidden option to + # persist the fact that we have a non-adaptable layer with a + # manually set vocab. + self._has_input_vocabulary = kwargs.pop( + "has_input_vocabulary", (vocabulary is not None) + ) + + vocabulary_size = kwargs.pop("vocabulary_size", None) + + super().__init__(**kwargs) + base_preprocessing_layer.keras_kpl_gauge.get_cell( + "TextVectorization" + ).set(True) + + self._lookup_layer = string_lookup.StringLookup( + max_tokens=max_tokens, + vocabulary=vocabulary, + idf_weights=idf_weights, + pad_to_max_tokens=pad_to_max_tokens, + mask_token="", + output_mode=output_mode if output_mode is not None else INT, + sparse=sparse, + has_input_vocabulary=self._has_input_vocabulary, + encoding=encoding, + vocabulary_size=vocabulary_size, + ) + + def compute_output_shape(self, input_shape): + if self._output_mode == INT: + return tf.TensorShape( + [input_shape[0], self._output_sequence_length] + ) + + if self._split is None: + if len(input_shape) <= 1: + input_shape = tuple(input_shape) + (1,) else: - inputs = tf.squeeze(inputs, axis=-1) - if self._split == WHITESPACE: - # This treats multiple whitespaces as one whitespace, and strips leading - # and trailing whitespace. - inputs = tf.strings.split(inputs) - elif self._split == CHARACTER: - inputs = tf.strings.unicode_split(inputs, "UTF-8") - elif callable(self._split): - inputs = self._split(inputs) - else: - raise ValueError( - ("%s is not a supported splitting." - "TextVectorization supports the following options " - "for `split`: None, 'whitespace', or a Callable.") % self._split) - - # Note that 'inputs' here can be either ragged or dense depending on the - # configuration choices for this Layer. The strings.ngrams op, however, does - # support both ragged and dense inputs. - if self._ngrams is not None: - inputs = tf.strings.ngrams( - inputs, ngram_width=self._ngrams, separator=" ") - - return inputs - - def call(self, inputs): - if isinstance(inputs, (list, tuple, np.ndarray)): - inputs = tf.convert_to_tensor(inputs) - - inputs = self._preprocess(inputs) - - # If we're not doing any output processing, return right away. - if self._output_mode is None: - return inputs - - lookup_data = self._lookup_layer(inputs) - - # For any non-int output, we can return directly from the underlying layer. - if self._output_mode != INT: - return lookup_data - - if self._ragged: - return lookup_data - - # If we have a ragged tensor, we can pad during the conversion to dense. - if tf_utils.is_ragged(lookup_data): - shape = lookup_data.shape.as_list() - # If output sequence length is None, to_tensor will pad the last dimension - # to the bounding shape of the ragged dimension. - shape[-1] = self._output_sequence_length - return lookup_data.to_tensor(default_value=0, shape=shape) - - # If we have a dense tensor, we need to pad/trim directly. - if self._output_sequence_length is not None: - # Maybe trim the output.
- lookup_data = lookup_data[..., :self._output_sequence_length] - - # Maybe pad the output. We need to be careful to use dynamic shape here as - required_space_to_batch_paddings requires a fully known shape. - shape = tf.shape(lookup_data) - padded_shape = tf.concat((shape[:-1], [self._output_sequence_length]), 0) - padding, _ = tf.required_space_to_batch_paddings(shape, padded_shape) - return tf.pad(lookup_data, padding) - - return lookup_data - - @property - def _trackable_saved_model_saver(self): - return layer_serialization.VocabularySavedModelSaver(self) + input_shape = tuple(input_shape) + (None,) + return self._lookup_layer.compute_output_shape(input_shape) + + def compute_output_signature(self, input_spec): + output_shape = self.compute_output_shape(input_spec.shape.as_list()) + output_dtype = ( + tf.int64 if self._output_mode == INT else backend.floatx() + ) + return tf.TensorSpec(shape=output_shape, dtype=output_dtype) + + # We override this method solely to generate a docstring. + def adapt(self, data, batch_size=None, steps=None): + """Computes a vocabulary of string terms from tokens in a dataset. + + Calling `adapt()` on a `TextVectorization` layer is an alternative to + passing in a precomputed vocabulary on construction via the `vocabulary` + argument. A `TextVectorization` layer should always be either adapted + over a dataset or supplied with a vocabulary. + + During `adapt()`, the layer will build a vocabulary of all string tokens + seen in the dataset, sorted by occurrence count, with ties broken by + sort order of the tokens (high to low). At the end of `adapt()`, if + `max_tokens` is set, the vocabulary will be truncated to `max_tokens` + size. For example, adapting a layer with `max_tokens=1000` will compute + the 1000 most frequent tokens occurring in the input dataset. If + `output_mode='tf-idf'`, `adapt()` will also learn the document + frequencies of each token in the input dataset. + + In order to make `TextVectorization` efficient in any distribution + context, the vocabulary is kept static with respect to any compiled + `tf.Graph`s that call the layer. As a consequence, if the layer is + adapted a second time, any models using the layer should be re-compiled. + For more information see + `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`. + + `adapt()` is meant only as a single machine utility to compute layer + state. To analyze a dataset that cannot fit on a single machine, see + [Tensorflow Transform]( + https://www.tensorflow.org/tfx/transform/get_started) for a + multi-machine, map-reduce solution. + + Arguments: + data: The data to train on. It can be passed either as a + `tf.data.Dataset`, or as a numpy array. + batch_size: Integer or `None`. + Number of samples per state update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` instances + (since they generate batches). + steps: Integer or `None`. + Total number of steps (batches of samples) + When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps' is None, the epoch will run until + the input dataset is exhausted. When passing an infinitely + repeating dataset, you must specify the `steps` argument. This + argument is not supported with array inputs.
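A minimal sketch of the `adapt()` workflow this docstring describes, on a small in-memory dataset (batching is optional but recommended by the layer's own docs for large datasets):

```python
import tensorflow as tf

text_dataset = tf.data.Dataset.from_tensor_slices(
    ["earth wind and fire", "fire and earth", "wind wind wind"]
)
layer = tf.keras.layers.TextVectorization(max_tokens=1000)
layer.adapt(text_dataset.batch(2))
# Most frequent tokens come first, after the '' and '[UNK]' special slots.
print(layer.get_vocabulary())
```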
+ """ + super().adapt(data, batch_size=batch_size, steps=steps) + + def update_state(self, data): + self._lookup_layer.update_state(self._preprocess(data)) + + def finalize_state(self): + self._lookup_layer.finalize_state() + + def reset_state(self): + self._lookup_layer.reset_state() + + def get_vocabulary(self, include_special_tokens=True): + """Returns the current vocabulary of the layer. + + Args: + include_special_tokens: If True, the returned vocabulary will include + the padding and OOV tokens, and a term's index in the vocabulary + will equal the term's index when calling the layer. If False, the + returned vocabulary will not include any padding or OOV tokens. + """ + return self._lookup_layer.get_vocabulary(include_special_tokens) + + def vocabulary_size(self): + """Gets the current size of the layer's vocabulary. + + Returns: + The integer size of the vocabulary, including optional mask and + OOV indices. + """ + return self._lookup_layer.vocabulary_size() + + def get_config(self): + config = { + "max_tokens": self._lookup_layer.max_tokens, + "standardize": self._standardize, + "split": self._split, + "ngrams": self._ngrams_arg, + "output_mode": self._output_mode, + "output_sequence_length": self._output_sequence_length, + "pad_to_max_tokens": self._lookup_layer.pad_to_max_tokens, + "sparse": self._lookup_layer.sparse, + "ragged": self._ragged, + "vocabulary": utils.listify_tensors( + self._lookup_layer.input_vocabulary + ), + "idf_weights": utils.listify_tensors( + self._lookup_layer.input_idf_weights + ), + "encoding": self._encoding, + "vocabulary_size": self.vocabulary_size(), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + if config["standardize"] not in ( + LOWER_AND_STRIP_PUNCTUATION, + LOWER, + STRIP_PUNCTUATION, + ): + config["standardize"] = deserialize_keras_object( + config["standardize"] + ) + if config["split"] not in (WHITESPACE, CHARACTER): + config["split"] = deserialize_keras_object(config["split"]) + return cls(**config) + + def set_vocabulary(self, vocabulary, idf_weights=None): + """Sets vocabulary (and optionally document frequency) for this layer. + + This method sets the vocabulary and idf weights for this layer directly, + instead of analyzing a dataset through 'adapt'. It should be used + whenever the vocab (and optionally document frequency) information is + already known. If vocabulary data is already present in the layer, this + method will replace it. + + Args: + vocabulary: Either an array or a string path to a text file. If + passing an array, can pass a tuple, list, 1D numpy array, or 1D + tensor containing the vocbulary terms. If passing a file path, the + file should contain one line per term in the vocabulary. + idf_weights: A tuple, list, 1D numpy array, or 1D tensor of inverse + document frequency weights with equal length to vocabulary. Must be + set if `output_mode` is `"tf_idf"`. Should not be set otherwise. + + Raises: + ValueError: If there are too many inputs, the inputs do not match, or + input data is missing. + RuntimeError: If the vocabulary cannot be set when this function is + called. This happens when `"multi_hot"`, `"count"`, and "tf_idf" + modes, if `pad_to_max_tokens` is False and the layer itself has + already been called. 
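A sketch of the direct-setting path documented here, with made-up idf values (the weights must align one-to-one with the vocabulary terms when `output_mode="tf_idf"`):

```python
import tensorflow as tf

layer = tf.keras.layers.TextVectorization(output_mode="tf_idf")
layer.set_vocabulary(
    ["earth", "wind", "and", "fire"],
    idf_weights=[0.7, 0.5, 0.3, 0.2],  # illustrative values only
)
# Each output column is the per-sample term count scaled by its idf weight.
print(layer(["earth earth wind"]))
```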
+ """ + self._lookup_layer.set_vocabulary(vocabulary, idf_weights=idf_weights) + + def _preprocess(self, inputs): + inputs = utils.ensure_tensor(inputs, dtype=tf.string) + if self._standardize in (LOWER, LOWER_AND_STRIP_PUNCTUATION): + inputs = tf.strings.lower(inputs) + if self._standardize in ( + STRIP_PUNCTUATION, + LOWER_AND_STRIP_PUNCTUATION, + ): + inputs = tf.strings.regex_replace(inputs, DEFAULT_STRIP_REGEX, "") + if callable(self._standardize): + inputs = self._standardize(inputs) + + if self._split is not None: + # If we are splitting, we validate that the 1st axis is of dimension + # 1 and so can be squeezed out. We do this here instead of after + # splitting for performance reasons - it's more expensive to squeeze + # a ragged tensor. + if inputs.shape.rank > 1: + if inputs.shape[-1] != 1: + raise ValueError( + "When using `TextVectorization` to tokenize strings, " + "the input rank must be 1 or the last shape dimension " + f"must be 1. Received: inputs.shape={inputs.shape} " + f"with rank={inputs.shape.rank}" + ) + else: + inputs = tf.squeeze(inputs, axis=-1) + if self._split == WHITESPACE: + # This treats multiple whitespaces as one whitespace, and strips + # leading and trailing whitespace. + inputs = tf.strings.split(inputs) + elif self._split == CHARACTER: + inputs = tf.strings.unicode_split(inputs, "UTF-8") + elif callable(self._split): + inputs = self._split(inputs) + else: + raise ValueError( + "%s is not a supported splitting." + "TextVectorization supports the following options " + "for `split`: None, 'whitespace', or a Callable." + % self._split + ) + + # Note that 'inputs' here can be either ragged or dense depending on the + # configuration choices for this Layer. The strings.ngrams op, however, + # does support both ragged and dense inputs. + if self._ngrams is not None: + inputs = tf.strings.ngrams( + inputs, ngram_width=self._ngrams, separator=" " + ) + + return inputs + + def call(self, inputs): + if isinstance(inputs, (list, tuple, np.ndarray)): + inputs = tf.convert_to_tensor(inputs) + + inputs = self._preprocess(inputs) + + # If we're not doing any output processing, return right away. + if self._output_mode is None: + return inputs + + lookup_data = self._lookup_layer(inputs) + + # For any non-int output, we can return directly from the underlying + # layer. + if self._output_mode != INT: + return lookup_data + + if self._ragged: + return lookup_data + + # If we have a ragged tensor, we can pad during the conversion to dense. + if tf_utils.is_ragged(lookup_data): + shape = lookup_data.shape.as_list() + # If output sequence length is None, to_tensor will pad the last + # dimension to the bounding shape of the ragged dimension. + shape[-1] = self._output_sequence_length + return lookup_data.to_tensor(default_value=0, shape=shape) + + # If we have a dense tensor, we need to pad/trim directly. + if self._output_sequence_length is not None: + # Maybe trim the output. + lookup_data = lookup_data[..., : self._output_sequence_length] + + # Maybe pad the output. We need to be careful to use dynamic shape + # here as required_space_to_batch_paddings requires a fully known + # shape. 
+ shape = tf.shape(lookup_data) + padded_shape = tf.concat( + (shape[:-1], [self._output_sequence_length]), 0 + ) + padding, _ = tf.required_space_to_batch_paddings( + shape, padded_shape + ) + return tf.pad(lookup_data, padding) + + return lookup_data + + @property + def _trackable_saved_model_saver(self): + return layer_serialization.VocabularySavedModelSaver(self) + + def save_own_variables(self, store): + self._lookup_layer.save_own_variables(store) + + def load_own_variables(self, store): + self._lookup_layer.load_own_variables(store) + + def save_assets(self, dir_path): + self._lookup_layer.save_assets(dir_path) + + def load_assets(self, dir_path): + self._lookup_layer.load_assets(dir_path) diff --git a/keras/layers/preprocessing/text_vectorization_distribution_test.py b/keras/layers/preprocessing/text_vectorization_distribution_test.py index 30c171f1d5fb..94087acacbac 100644 --- a/keras/layers/preprocessing/text_vectorization_distribution_test.py +++ b/keras/layers/preprocessing/text_vectorization_distribution_test.py @@ -15,6 +15,8 @@ """Distribution tests for keras.layers.preprocessing.text_vectorization.""" +import numpy as np +import tensorflow.compat.v2 as tf import keras from keras import backend @@ -23,84 +25,113 @@ from keras.layers.preprocessing import text_vectorization from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) @test_utils.run_v2_only @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - strategy=strategy_combinations.all_strategies + - strategy_combinations.multi_worker_mirrored_strategies + - strategy_combinations.parameter_server_strategies_single_worker + - strategy_combinations.parameter_server_strategies_multi_worker, - mode=["eager"])) + strategy=strategy_combinations.all_strategies + + strategy_combinations.multi_worker_mirrored_strategies + + strategy_combinations.parameter_server_strategies_single_worker + + strategy_combinations.parameter_server_strategies_multi_worker, + mode=["eager"], + ) +) class TextVectorizationDistributionTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_distribution_strategy_output(self, strategy): - if (backend.is_tpu_strategy(strategy) and - not tf_test_utils.is_mlir_bridge_enabled()): - self.skipTest("TPU tests require MLIR bridge") - - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( - 2, drop_remainder=True) - - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - tf.config.set_soft_device_placement(True) - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT, - vocabulary=vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - output_dataset = model.predict(input_dataset) - self.assertAllEqual(expected_output, output_dataset) - - def test_distribution_strategy_output_with_adapt(self, strategy): - # TODO(b/180614455): remove this check when MLIR bridge is always enabled. 
- if backend.is_tpu_strategy(strategy): - self.skipTest("This test needs MLIR bridge on TPU.") - - vocab_data = [[ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ]] - vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( - 2, drop_remainder=True) - - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - tf.config.set_soft_device_placement(True) - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.adapt(vocab_dataset) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - output_dataset = model.predict(input_dataset) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_distribution_strategy_output(self, strategy): + if ( + backend.is_tpu_strategy(strategy) + and not tf_test_utils.is_mlir_bridge_enabled() + ): + self.skipTest("TPU tests require MLIR bridge") + + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( + 2, drop_remainder=True + ) + + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + tf.config.set_soft_device_placement(True) + + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + vocabulary=vocab_data, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_dataset) + self.assertAllEqual(expected_output, output_dataset) + + def test_distribution_strategy_output_with_adapt(self, strategy): + # TODO(b/180614455): remove this check when MLIR bridge is always + # enabled. 
+ if backend.is_tpu_strategy(strategy): + self.skipTest("This test needs MLIR bridge on TPU.") + + vocab_data = [ + [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + "and", + "and", + "fire", + ] + ] + vocab_dataset = tf.data.Dataset.from_tensors(vocab_data) + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + input_dataset = tf.data.Dataset.from_tensor_slices(input_array).batch( + 2, drop_remainder=True + ) + + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + tf.config.set_soft_device_placement(True) + + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + ) + layer.adapt(vocab_dataset) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_dataset) + self.assertAllEqual(expected_output, output_dataset) + if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/layers/preprocessing/text_vectorization_test.py b/keras/layers/preprocessing/text_vectorization_test.py index 9b615c9a0d25..9a4b85c16d6e 100644 --- a/keras/layers/preprocessing/text_vectorization_test.py +++ b/keras/layers/preprocessing/text_vectorization_test.py @@ -14,1903 +14,2491 @@ # ============================================================================== """Tests for Keras text vectorization preprocessing layer.""" -import tensorflow.compat.v2 as tf - import gc import os -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras from keras import backend -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.layers import convolutional from keras.layers import core from keras.layers.preprocessing import preprocessing_test_utils from keras.layers.preprocessing import text_vectorization -from keras.utils import generic_utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import register_keras_serializable def _get_end_to_end_test_cases(): - test_cases = ( - { - "testcase_name": - "test_simple_tokens_int_mode", - # Create an array where 'earth' is the most frequent term, followed by - # 'wind', then 'and', then 'fire'. This ensures that the vocab - # is sorting by frequency. - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": None, - "standardize": None, - "split": None, - "output_mode": text_vectorization.INT - }, - "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], - }, - { - "testcase_name": - "test_simple_tokens_int_mode_hard_cap", - # Create an array where 'earth' is the most frequent term, followed by - # 'wind', then 'and', then 'fire'. This ensures that the vocab - # is sorting by frequency. 
- "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": 6, - "standardize": None, - "split": None, - "output_mode": text_vectorization.INT - }, - "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], - }, - { - "testcase_name": - "test_special_tokens_int_mode", - # Mask tokens in the vocab data should be ignored, and mapped to 0 in - # from the input data. - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - [""], [""], [""], ["[UNK]"], ["[UNK]"], ["[UNK]"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], [""], ["wind"], ["[UNK]"], ["and"], [""], - ["fire"], ["and"], ["[UNK]"], ["michigan"]]), - "kwargs": { - "max_tokens": None, - "standardize": None, - "split": None, - "output_mode": text_vectorization.INT - }, - "expected_output": [[2], [0], [3], [1], [4], [0], [5], [4], [1], [1]], - }, - { - "testcase_name": - "test_documents_int_mode", - "vocab_data": - np.array([["fire earth earth"], ["earth earth"], ["wind wind"], - ["and wind and"]]), - "input_data": - np.array([["earth wind and"], ["fire fire"], ["and earth"], - ["michigan"]]), - "kwargs": { - "max_tokens": None, - "standardize": None, - "split": text_vectorization.WHITESPACE, - "output_mode": text_vectorization.INT - }, - "expected_output": [[2, 3, 4], [5, 5, 0], [4, 2, 0], [1, 0, 0]], - }, - { - "testcase_name": - "test_documents_1d_input_int_mode", - "vocab_data": - np.array([ - "fire earth earth", "earth earth", "wind wind", "and wind and" - ]), - "input_data": - np.array([["earth wind and"], ["fire fire"], ["and earth"], - ["michigan"]]), - "kwargs": { - "max_tokens": None, - "standardize": None, - "split": text_vectorization.WHITESPACE, - "output_mode": text_vectorization.INT - }, - "expected_output": [[2, 3, 4], [5, 5, 0], [4, 2, 0], [1, 0, 0]], - }, - { - "testcase_name": - "test_simple_tokens_binary_mode", - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": 5, - "pad_to_max_tokens": True, - "standardize": None, - "split": None, - "output_mode": text_vectorization.MULTI_HOT - }, - "expected_output": [[0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], - [0, 1, 0, 0, 0], [1, 0, 0, 0, 0]], - }, - { - "testcase_name": - "test_documents_binary_mode", - "vocab_data": - np.array([["fire earth earth"], ["earth earth"], ["wind wind"], - ["and wind and"]]), - "input_data": - np.array([["earth wind"], ["and"], ["fire fire"], - ["earth michigan"]]), - "kwargs": { - "max_tokens": 5, - "pad_to_max_tokens": True, - "standardize": None, - "split": text_vectorization.WHITESPACE, - "output_mode": text_vectorization.MULTI_HOT - }, - "expected_output": [[0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], - [1, 1, 0, 0, 0]], - }, - { - "testcase_name": - "test_simple_tokens_count_mode", - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": 5, - 
"pad_to_max_tokens": True, - "standardize": None, - "split": None, - "output_mode": text_vectorization.COUNT - }, - "expected_output": [[0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], - [0, 1, 0, 0, 0], [1, 0, 0, 0, 0]], - }, - { - "testcase_name": - "test_documents_count_mode", - "vocab_data": - np.array([["fire earth earth"], ["earth earth"], ["wind wind"], - ["and wind and"]]), - "input_data": - np.array([["earth wind"], ["and"], ["fire fire"], - ["earth michigan"]]), - "kwargs": { - "max_tokens": 5, - "pad_to_max_tokens": True, - "standardize": None, - "split": text_vectorization.WHITESPACE, - "output_mode": text_vectorization.COUNT - }, - "expected_output": [[0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 2], - [1, 1, 0, 0, 0]], - }, - { - "testcase_name": - "test_tokens_idf_mode", - "vocab_data": - np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], - ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": - np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], - ["and"], ["earth"], ["michigan"]]), - "kwargs": { - "max_tokens": 5, - "pad_to_max_tokens": True, - "standardize": None, - "split": None, - "output_mode": text_vectorization.TF_IDF - }, - "expected_output": [[0, 1.098612, 0, 0, 0], [0, 0, 1.252763, 0, 0], - [0, 0, 0, 1.466337, 0], [0, 0, 0, 0, 1.7917595], - [0, 0, 0, 0, 1.7917595], [0, 0, 0, 1.4663371, 0], - [0, 1.098612, 0, 0, 0], [1.402368, 0, 0, 0, 0]], - }, - { - "testcase_name": - "test_documents_idf_mode", - "vocab_data": - np.array([["fire earth earth"], ["earth earth"], ["wind wind"], - ["and wind and"]]), - "input_data": - np.array([["earth wind"], ["and"], ["fire fire"], - ["earth michigan"]]), - "kwargs": { - "max_tokens": 5, - "pad_to_max_tokens": True, - "standardize": None, - "split": text_vectorization.WHITESPACE, - "output_mode": text_vectorization.TF_IDF - }, - "expected_output": [[0., 0.847298, 0.847298, 0., 0.], - [0., 0., 0., 1.098612, 0.], - [0., 0., 0., 0., 2.197225], - [0.972955, 0.847298, 0., 0., 0.]], - }, - ) - - crossed_test_cases = [] - # Cross above test cases with use_dataset in (True, False) - for use_dataset in (True, False): - for case in test_cases: - case = case.copy() - if use_dataset: - case["testcase_name"] = case["testcase_name"] + "_with_dataset" - case["use_dataset"] = use_dataset - crossed_test_cases.append(case) - - return crossed_test_cases + test_cases = ( + { + "testcase_name": "test_simple_tokens_int_mode", + # Create an array where 'earth' is the most frequent term, followed + # by 'wind', then 'and', then 'fire'. This ensures that the vocab is + # sorting by frequency. + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": None, + "standardize": None, + "split": None, + "output_mode": text_vectorization.INT, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + }, + { + "testcase_name": "test_simple_tokens_int_mode_hard_cap", + # Create an array where 'earth' is the most frequent term, followed + # by 'wind', then 'and', then 'fire'. This ensures that the vocab is + # sorting by frequency. 
+ "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": 6, + "standardize": None, + "split": None, + "output_mode": text_vectorization.INT, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + }, + { + "testcase_name": "test_special_tokens_int_mode", + # Mask tokens in the vocab data should be ignored, and mapped to 0 + # in from the input data. + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + [""], + [""], + [""], + ["[UNK]"], + ["[UNK]"], + ["[UNK]"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + [""], + ["wind"], + ["[UNK]"], + ["and"], + [""], + ["fire"], + ["and"], + ["[UNK]"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": None, + "standardize": None, + "split": None, + "output_mode": text_vectorization.INT, + }, + "expected_output": [ + [2], + [0], + [3], + [1], + [4], + [0], + [5], + [4], + [1], + [1], + ], + }, + { + "testcase_name": "test_documents_int_mode", + "vocab_data": np.array( + [ + ["fire earth earth"], + ["earth earth"], + ["wind wind"], + ["and wind and"], + ] + ), + "input_data": np.array( + [["earth wind and"], ["fire fire"], ["and earth"], ["michigan"]] + ), + "kwargs": { + "max_tokens": None, + "standardize": None, + "split": text_vectorization.WHITESPACE, + "output_mode": text_vectorization.INT, + }, + "expected_output": [[2, 3, 4], [5, 5, 0], [4, 2, 0], [1, 0, 0]], + }, + { + "testcase_name": "test_documents_1d_input_int_mode", + "vocab_data": np.array( + ["fire earth earth", "earth earth", "wind wind", "and wind and"] + ), + "input_data": np.array( + [["earth wind and"], ["fire fire"], ["and earth"], ["michigan"]] + ), + "kwargs": { + "max_tokens": None, + "standardize": None, + "split": text_vectorization.WHITESPACE, + "output_mode": text_vectorization.INT, + }, + "expected_output": [[2, 3, 4], [5, 5, 0], [4, 2, 0], [1, 0, 0]], + }, + { + "testcase_name": "test_simple_tokens_binary_mode", + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "standardize": None, + "split": None, + "output_mode": text_vectorization.MULTI_HOT, + }, + "expected_output": [ + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + [0, 0, 0, 1, 0], + [0, 1, 0, 0, 0], + [1, 0, 0, 0, 0], + ], + }, + { + "testcase_name": "test_documents_binary_mode", + "vocab_data": np.array( + [ + ["fire earth earth"], + ["earth earth"], + ["wind wind"], + ["and wind and"], + ] + ), + "input_data": np.array( + [["earth wind"], ["and"], ["fire fire"], ["earth michigan"]] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "standardize": None, + "split": text_vectorization.WHITESPACE, + "output_mode": text_vectorization.MULTI_HOT, + }, + "expected_output": [ + [0, 1, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + [1, 1, 0, 0, 0], + ], + }, + { + "testcase_name": "test_simple_tokens_count_mode", + "vocab_data": np.array( + [ + ["fire"], + 
["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "standardize": None, + "split": None, + "output_mode": text_vectorization.COUNT, + }, + "expected_output": [ + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + [0, 0, 0, 1, 0], + [0, 1, 0, 0, 0], + [1, 0, 0, 0, 0], + ], + }, + { + "testcase_name": "test_documents_count_mode", + "vocab_data": np.array( + [ + ["fire earth earth"], + ["earth earth"], + ["wind wind"], + ["and wind and"], + ] + ), + "input_data": np.array( + [["earth wind"], ["and"], ["fire fire"], ["earth michigan"]] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "standardize": None, + "split": text_vectorization.WHITESPACE, + "output_mode": text_vectorization.COUNT, + }, + "expected_output": [ + [0, 1, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 2], + [1, 1, 0, 0, 0], + ], + }, + { + "testcase_name": "test_tokens_idf_mode", + "vocab_data": np.array( + [ + ["fire"], + ["earth"], + ["earth"], + ["earth"], + ["earth"], + ["wind"], + ["wind"], + ["wind"], + ["and"], + ["and"], + ] + ), + "input_data": np.array( + [ + ["earth"], + ["wind"], + ["and"], + ["fire"], + ["fire"], + ["and"], + ["earth"], + ["michigan"], + ] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "standardize": None, + "split": None, + "output_mode": text_vectorization.TF_IDF, + }, + "expected_output": [ + [0, 1.098612, 0, 0, 0], + [0, 0, 1.252763, 0, 0], + [0, 0, 0, 1.466337, 0], + [0, 0, 0, 0, 1.7917595], + [0, 0, 0, 0, 1.7917595], + [0, 0, 0, 1.4663371, 0], + [0, 1.098612, 0, 0, 0], + [1.402368, 0, 0, 0, 0], + ], + }, + { + "testcase_name": "test_documents_idf_mode", + "vocab_data": np.array( + [ + ["fire earth earth"], + ["earth earth"], + ["wind wind"], + ["and wind and"], + ] + ), + "input_data": np.array( + [["earth wind"], ["and"], ["fire fire"], ["earth michigan"]] + ), + "kwargs": { + "max_tokens": 5, + "pad_to_max_tokens": True, + "standardize": None, + "split": text_vectorization.WHITESPACE, + "output_mode": text_vectorization.TF_IDF, + }, + "expected_output": [ + [0.0, 0.847298, 0.847298, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.098612, 0.0], + [0.0, 0.0, 0.0, 0.0, 2.197225], + [0.972955, 0.847298, 0.0, 0.0, 0.0], + ], + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) -class TextVectorizationLayerTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): - - @parameterized.named_parameters(*_get_end_to_end_test_cases()) - def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, - use_dataset, expected_output): - cls = text_vectorization.TextVectorization - if kwargs.get("output_mode") == text_vectorization.INT: - expected_output_dtype = tf.int64 - else: - expected_output_dtype = tf.float32 - input_shape = input_data.shape - - if use_dataset: - # Keras APIs expect batched datasets. 
- # TODO(rachelim): `model.predict` predicts the result on each - # dataset batch separately, then tries to concatenate the results - # together. When the results have different shapes on the non-concat - # axis (which can happen in the output_mode = INT case for - # TextVectorization), the concatenation fails. In real use cases, this may - # not be an issue because users are likely to pipe the preprocessing layer - # into other keras layers instead of predicting it directly. A workaround - # for these unit tests is to have the dataset only contain one batch, so - # no concatenation needs to happen with the result. For consistency with - # numpy input, we should make `predict` join differently shaped results - # together sensibly, with 0 padding. - input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( - input_shape[0]) - vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( - input_shape[0]) - - output_data = test_utils.layer_test( - cls, - kwargs=kwargs, - input_shape=input_shape, - input_data=input_data, - input_dtype=tf.string, - expected_output_dtype=expected_output_dtype, - validate_training=False, - adapt_data=vocab_data) - self.assertAllClose(expected_output, output_data) - - @parameterized.product( - rank=[0, 1, 2], - # Check lists, numpy arrays, tensors, and objects convertable to tensor. - data_fn=[None, np.array, tf.constant, preprocessing_test_utils.ArrayLike] - ) - def test_input_types(self, rank, data_fn): - input_data = "earth wind and fire" - expected_output = [2, 3, 4, 5] - if rank == 1: - input_data = [input_data] - expected_output = [expected_output] - elif rank == 2: - input_data = [[input_data]] - expected_output = [expected_output] - if data_fn is not None: - input_data = data_fn(input_data) - input_shape = [] if rank == 0 else [1] - - layer = text_vectorization.TextVectorization( - vocabulary=["earth", "wind", "and", "fire"]) - output_data = layer(input_data) - self.assertAllEqual(output_data, expected_output) - - # Again in a keras.Model - inputs = keras.Input(shape=input_shape, dtype=tf.string) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model(tf.constant(input_data)) - self.assertAllEqual(output_data, expected_output) - - @parameterized.named_parameters([ - { - "testcase_name": "ragged_tensor1", - "input_data": [[["0 a b"], ["c d"]], [["e a"], ["b c d"]], [["f"]]], - "expected_output": [[[1, 2, 3], [4, 5]], [[6, 2], [3, 4, 5]], [[7]]], - }, - { - "testcase_name": "ragged_tensor2", - "input_data": [[["0 a b"], [""]], [], [["e a"], ["b c d"]], [["f"]]], - "expected_output": [[[1, 2, 3], []], [], [[6, 2], [3, 4, 5]], [[7]]], - }, - ]) - def test_ragged_input_and_ragged_output(self, input_data, expected_output): - input_data = tf.ragged.constant(input_data, inner_shape=(1,)) - layer = text_vectorization.TextVectorization( - vocabulary=["a", "b", "c", "d", "e", "f"], ragged=True) - output_data = layer(input_data) - self.assertAllEqual(output_data, expected_output) - - # Again in a keras.Model - inputs = keras.Input(shape=(1,), dtype=tf.string) - outputs = layer(inputs) - model = keras.Model(inputs=inputs, outputs=outputs) - output_data = model.predict(input_data) - self.assertAllEqual(output_data, expected_output) - - def test_scalar_input_int_mode_no_len_limit(self): - vocab_data = [ - "fire earth earth", "earth earth", "wind wind", "and wind and" - ] - input_data = "earth wind and fire fire and earth michigan" - layer = text_vectorization.TextVectorization() - layer.adapt(vocab_data) - out 
= layer(input_data) - self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1]) - layer.set_vocabulary(["earth", "wind", "and", "fire"]) - out = layer(input_data) - self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1]) - - def test_scalar_input_int_mode_trim_to_len_limit(self): - vocab_data = [ - "fire earth earth", "earth earth", "wind wind", "and wind and" - ] - input_data = "earth wind and fire fire and earth michigan" - layer = text_vectorization.TextVectorization(output_sequence_length=3) - layer.adapt(vocab_data) - out = layer(input_data) - self.assertAllClose(out.numpy(), [2, 3, 4]) - layer.set_vocabulary(["earth", "wind", "and", "fire"]) - out = layer(input_data) - self.assertAllClose(out.numpy(), [2, 3, 4]) - - def test_scalar_input_int_pad_to_len_limit(self): - vocab_data = [ - "fire earth earth", "earth earth", "wind wind", "and wind and" - ] - input_data = "earth wind and fire fire and earth michigan" - layer = text_vectorization.TextVectorization(output_sequence_length=10) - layer.adapt(vocab_data) - out = layer(input_data) - self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0]) - layer.set_vocabulary(["earth", "wind", "and", "fire"]) - out = layer(input_data) - self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0]) - - def test_dataset_of_single_strings(self): - vocab_data = ["two two two", "two three three", "three four four five"] - input_data = ["two three", "four five"] - vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data) # unbatched - input_ds = tf.data.Dataset.from_tensor_slices(input_data) # unbatched - layer = text_vectorization.TextVectorization() - layer.adapt(vocab_ds) - out = input_ds.map(layer) - self.assertAllClose(list(out.as_numpy_iterator()), [[2, 3], [4, 5]]) - - def test_dataset_of_single_strings_with_output_sequence(self): - vocab_data = ["two two two", "two three three", "three four four five"] - input_data = ["two three", "four five"] - vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data) # unbatched - input_ds = tf.data.Dataset.from_tensor_slices(input_data) # unbatched - layer = text_vectorization.TextVectorization(output_sequence_length=3) - layer.adapt(vocab_ds) - out = input_ds.map(layer) - self.assertAllClose(list(out.as_numpy_iterator()), [[2, 3, 0], [4, 5, 0]]) - - @parameterized.named_parameters( - { - "testcase_name": "1d", - "data": ["0", "a", "b", "c", "d", "e", "a", "b", "c", "d", "f"], - "expected": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1] - }, - { - "testcase_name": "2d", - "data": [["0", "a", "b", "c", "d"], ["e", "a", "b", "c", "d"], ["f"]], - "expected": [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 0, 0, 0, 0]] - }, - { - "testcase_name": - "3d", - "data": [[["0", "a", "b"], ["c", "d"]], [["e", "a"], ["b", "c", "d"]], - [["f"]]], - "expected": [[[1, 2, 3], [4, 5, 0]], [[1, 2, 0], [3, 4, 5]], - [[1, 0, 0], [0, 0, 0]]] - }, - ) - def test_layer_dimensionality_handling(self, data, expected): - vocab = ["a", "b", "c", "d"] - vectorization = text_vectorization.TextVectorization( - max_tokens=None, standardize=None, split=None, pad_to_max_tokens=False) - vectorization.set_vocabulary(vocab) - output = vectorization(tf.ragged.constant(data)) - self.assertAllEqual(expected, output) - - @parameterized.named_parameters( - { - "testcase_name": "1d", - "data": ["0 a b c d e a b c d f"], - "expected": [[1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1]] - }, - { - "testcase_name": - "3d", - "data": [[["0 a b"], ["c d"]], [["e a"], ["b c d"]], [["f"]]], - "expected": [[[1, 2, 3], [4, 5, 0]], [[1, 2, 0], [3, 4, 5]], - [[1, 0, 0], [0, 0, 0]]] - 
}, - ) - def test_layer_dimensionality_handling_with_split(self, data, expected): - vocab = ["a", "b", "c", "d"] - vectorization = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - pad_to_max_tokens=False) - vectorization.set_vocabulary(vocab) - output = vectorization(tf.ragged.constant(data, inner_shape=(1,))) - self.assertAllEqual(expected, output) +class TextVectorizationLayerTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt( + self, vocab_data, input_data, kwargs, use_dataset, expected_output + ): + cls = text_vectorization.TextVectorization + if kwargs.get("output_mode") == text_vectorization.INT: + expected_output_dtype = tf.int64 + else: + expected_output_dtype = tf.float32 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # TextVectorization), the concatenation fails. In real use cases, + # this may not be an issue because users are likely to pipe the + # preprocessing layer into other keras layers instead of predicting + # it directly. A workaround for these unit tests is to have the + # dataset only contain one batch, so no concatenation needs to + # happen with the result. For consistency with numpy input, we + # should make `predict` join differently shaped results together + # sensibly, with 0 padding. + input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( + input_shape[0] + ) + vocab_data = tf.data.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0] + ) + + output_data = test_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=tf.string, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data, + ) + self.assertAllClose(expected_output, output_data) + + @parameterized.product( + rank=[0, 1, 2], + # Check lists, numpy arrays, tensors, and objects convertible to tensor.
+ data_fn=[ + None, + np.array, + tf.constant, + preprocessing_test_utils.ArrayLike, + ], + ) + def test_input_types(self, rank, data_fn): + input_data = "earth wind and fire" + expected_output = [2, 3, 4, 5] + if rank == 1: + input_data = [input_data] + expected_output = [expected_output] + elif rank == 2: + input_data = [[input_data]] + expected_output = [expected_output] + if data_fn is not None: + input_data = data_fn(input_data) + input_shape = [] if rank == 0 else [1] + + layer = text_vectorization.TextVectorization( + vocabulary=["earth", "wind", "and", "fire"] + ) + output_data = layer(input_data) + self.assertAllEqual(output_data, expected_output) + + # Again in a keras.Model + inputs = keras.Input(shape=input_shape, dtype=tf.string) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model(tf.constant(input_data)) + self.assertAllEqual(output_data, expected_output) + + @parameterized.named_parameters( + [ + { + "testcase_name": "ragged_tensor1", + "input_data": [ + [["0 a b"], ["c d"]], + [["e a"], ["b c d"]], + [["f"]], + ], + "expected_output": [ + [[1, 2, 3], [4, 5]], + [[6, 2], [3, 4, 5]], + [[7]], + ], + }, + { + "testcase_name": "ragged_tensor2", + "input_data": [ + [["0 a b"], [""]], + [], + [["e a"], ["b c d"]], + [["f"]], + ], + "expected_output": [ + [[1, 2, 3], []], + [], + [[6, 2], [3, 4, 5]], + [[7]], + ], + }, + ] + ) + def test_ragged_input_and_ragged_output(self, input_data, expected_output): + input_data = tf.ragged.constant(input_data, inner_shape=(1,)) + layer = text_vectorization.TextVectorization( + vocabulary=["a", "b", "c", "d", "e", "f"], ragged=True + ) + output_data = layer(input_data) + self.assertAllEqual(output_data, expected_output) + + # Again in a keras.Model + inputs = keras.Input(shape=(1,), dtype=tf.string) + outputs = layer(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + output_data = model.predict(input_data) + self.assertAllEqual(output_data, expected_output) + + def test_scalar_input_int_mode_no_len_limit(self): + vocab_data = [ + "fire earth earth", + "earth earth", + "wind wind", + "and wind and", + ] + input_data = "earth wind and fire fire and earth michigan" + layer = text_vectorization.TextVectorization() + layer.adapt(vocab_data) + out = layer(input_data) + self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1]) + layer.set_vocabulary(["earth", "wind", "and", "fire"]) + out = layer(input_data) + self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1]) + + def test_scalar_input_int_mode_trim_to_len_limit(self): + vocab_data = [ + "fire earth earth", + "earth earth", + "wind wind", + "and wind and", + ] + input_data = "earth wind and fire fire and earth michigan" + layer = text_vectorization.TextVectorization(output_sequence_length=3) + layer.adapt(vocab_data) + out = layer(input_data) + self.assertAllClose(out.numpy(), [2, 3, 4]) + layer.set_vocabulary(["earth", "wind", "and", "fire"]) + out = layer(input_data) + self.assertAllClose(out.numpy(), [2, 3, 4]) + + def test_scalar_input_int_pad_to_len_limit(self): + vocab_data = [ + "fire earth earth", + "earth earth", + "wind wind", + "and wind and", + ] + input_data = "earth wind and fire fire and earth michigan" + layer = text_vectorization.TextVectorization(output_sequence_length=10) + layer.adapt(vocab_data) + out = layer(input_data) + self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0]) + layer.set_vocabulary(["earth", "wind", "and", "fire"]) + out = layer(input_data) + self.assertAllClose(out.numpy(), [2, 3, 
4, 5, 5, 4, 2, 1, 0, 0]) + + def test_dataset_of_single_strings(self): + vocab_data = ["two two two", "two three three", "three four four five"] + input_data = ["two three", "four five"] + vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data) # unbatched + input_ds = tf.data.Dataset.from_tensor_slices(input_data) # unbatched + layer = text_vectorization.TextVectorization() + layer.adapt(vocab_ds) + out = input_ds.map(layer) + self.assertAllClose(list(out.as_numpy_iterator()), [[2, 3], [4, 5]]) + + def test_dataset_of_single_strings_with_output_sequence(self): + vocab_data = ["two two two", "two three three", "three four four five"] + input_data = ["two three", "four five"] + vocab_ds = tf.data.Dataset.from_tensor_slices(vocab_data) # unbatched + input_ds = tf.data.Dataset.from_tensor_slices(input_data) # unbatched + layer = text_vectorization.TextVectorization(output_sequence_length=3) + layer.adapt(vocab_ds) + out = input_ds.map(layer) + self.assertAllClose( + list(out.as_numpy_iterator()), [[2, 3, 0], [4, 5, 0]] + ) + + @parameterized.named_parameters( + { + "testcase_name": "1d", + "data": ["0", "a", "b", "c", "d", "e", "a", "b", "c", "d", "f"], + "expected": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1], + }, + { + "testcase_name": "2d", + "data": [ + ["0", "a", "b", "c", "d"], + ["e", "a", "b", "c", "d"], + ["f"], + ], + "expected": [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 0, 0, 0, 0]], + }, + { + "testcase_name": "3d", + "data": [ + [["0", "a", "b"], ["c", "d"]], + [["e", "a"], ["b", "c", "d"]], + [["f"]], + ], + "expected": [ + [[1, 2, 3], [4, 5, 0]], + [[1, 2, 0], [3, 4, 5]], + [[1, 0, 0], [0, 0, 0]], + ], + }, + ) + def test_layer_dimensionality_handling(self, data, expected): + vocab = ["a", "b", "c", "d"] + vectorization = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + pad_to_max_tokens=False, + ) + vectorization.set_vocabulary(vocab) + output = vectorization(tf.ragged.constant(data)) + self.assertAllEqual(expected, output) + + @parameterized.named_parameters( + { + "testcase_name": "1d", + "data": ["0 a b c d e a b c d f"], + "expected": [[1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1]], + }, + { + "testcase_name": "3d", + "data": [[["0 a b"], ["c d"]], [["e a"], ["b c d"]], [["f"]]], + "expected": [ + [[1, 2, 3], [4, 5, 0]], + [[1, 2, 0], [3, 4, 5]], + [[1, 0, 0], [0, 0, 0]], + ], + }, + ) + def test_layer_dimensionality_handling_with_split(self, data, expected): + vocab = ["a", "b", "c", "d"] + vectorization = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + pad_to_max_tokens=False, + ) + vectorization.set_vocabulary(vocab) + output = vectorization(tf.ragged.constant(data, inner_shape=(1,))) + self.assertAllEqual(expected, output) @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) class TextVectorizationPreprocessingTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with tf.io.gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - - def test_summary_before_adapt(self): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=10, - pad_to_max_tokens=True, - standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION, - split=None, - 
ngrams=None, - output_mode=text_vectorization.TF_IDF) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - # We are testing that model.summary() can be called without erroring out. - # (b/145726907) - model.summary() - - @parameterized.parameters([list, np.array, tf.constant, tf.ragged.constant]) - def test_lower_and_strip_punctuation(self, data_fn): - input_array = data_fn([["Earth", "wInD", "aNd", "firE"], - ["fire|", "an<>d", "{earth}", "michigan@%$"]]) - expected_output = data_fn([[b"earth", b"wind", b"and", b"fire"], - [b"fire", b"and", b"earth", b"michigan"]]) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION, - split=None, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - @parameterized.parameters([list, np.array, tf.constant, tf.ragged.constant]) - def test_strip_punctuation(self, data_fn): - input_array = data_fn([["Earth", "wInD", "aNd", "firE"], - ["fire|", "an<>d", "{earth}", "michigan@%$"]]) - expected_output = data_fn([[b"Earth", b"wInD", b"aNd", b"firE"], - [b"fire", b"and", b"earth", b"michigan"]]) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=text_vectorization.STRIP_PUNCTUATION, - split=None, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - @parameterized.parameters([list, np.array, tf.constant, tf.ragged.constant]) - def test_lower(self, data_fn): - input_array = data_fn([["Earth", "wInD", "aNd", "firE"], - ["fire|", "an<>d", "{earth}", "michigan@$"]]) - expected_output = data_fn([[b"earth", b"wind", b"and", b"fire"], - [b"fire|", b"an<>d", b"{earth}", b"michigan@$"]]) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=text_vectorization.LOWER, - split=None, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_custom_normalization(self): - input_array = np.array([["Earth", "wInD", "aNd", "firE"], - ["fire|", "an<>d", "{earth}", "michigan@%$"]]) - expected_output = np.array( - [[b"earth", b"wind", b"and", b"fire"], - [b"fire|", b"an<>d", b"{earth}", b"michigan@%$"]]) - - custom_standardization = tf.strings.lower - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=custom_standardization, - split=None, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_whitespace_splitting(self): - input_array = np.array([["earth wind and fire"], - ["\tfire\tand\nearth michigan "]]) - expected_output = [[b"earth", b"wind", b"and", b"fire"], - [b"fire", b"and", b"earth", b"michigan"]] - - input_data = keras.Input(shape=(1,), 
dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_character_splitting(self): - input_array = np.array([["earthwind"], - ["and fire"]]) - expected_output = [[b"e", b"a", b"r", b"t", b"h", b"w", b"i", b"n", b"d"], - [b"a", b"n", b"d", b" ", b"f", b"i", b"r", b"e"]] - - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.CHARACTER, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_custom_string_splitting(self): - input_array = np.array([["earth>wind>and fire"], - ["\tfire>and\nearth>michigan"]]) - expected_output = [[b"earth", b"wind", b"and fire"], - [b"\tfire", b"and\nearth", b"michigan"]] - - custom_split = lambda x: tf.strings.split(x, sep=">") - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=custom_split, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_single_ngram_value_ragged_inputs(self): - input_array = tf.ragged.constant([["earth", "wind", "and", "fire"], - ["fire", "and", "earth"]]) - # pyformat: disable - expected_output = [[b"earth", b"wind", b"and", b"fire", - b"earth wind", b"wind and", b"and fire", - b"earth wind and", b"wind and fire"], - [b"fire", b"and", b"earth", - b"fire and", b"and earth", - b"fire and earth"]] - # pyformat: enable - - input_data = keras.Input(shape=(None,), ragged=True, dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - ngrams=3, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_single_ngram_value(self): - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - # pyformat: disable - expected_output = [[b"earth", b"wind", b"and", b"fire", - b"earth wind", b"wind and", b"and fire", - b"earth wind and", b"wind and fire"], - [b"fire", b"and", b"earth", b"michigan", - b"fire and", b"and earth", b"earth michigan", - b"fire and earth", b"and earth michigan"]] - # pyformat: enable - - input_data = keras.Input(shape=(4,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - ngrams=3, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_multiple_ngram_values(self): - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - # pyformat: disable - expected_output = [[b"earth wind", b"wind and", 
b"and fire", - b"earth wind and", b"wind and fire"], - [b"fire and", b"and earth", b"earth michigan", - b"fire and earth", b"and earth michigan"]] - # pyformat: enable - - input_data = keras.Input(shape=(4,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - ngrams=(2, 3), - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_string_multiple_preprocessing_steps(self): - input_array = np.array([["earth wInD and firE"], - ["\tfire\tand\nearth!! michig@n "]]) - expected_output = [[ - b"earth", - b"wind", - b"and", - b"fire", - b"earth wind", - b"wind and", - b"and fire", - ], - [ - b"fire", - b"and", - b"earth", - b"michign", - b"fire and", - b"and earth", - b"earth michign", - ]] - - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION, - split=text_vectorization.WHITESPACE, - ngrams=2, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_string_splitting_with_non_1d_array_fails(self): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - vocabulary=["a"], - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=None) - with self.assertRaisesRegex(ValueError, "last shape dimension must be 1"): - _ = layer(input_data) - - def test_string_splitting_with_non_1d_raggedarray_fails(self): - input_data = keras.Input(shape=(None,), ragged=True, dtype=tf.string) - layer = text_vectorization.TextVectorization( - vocabulary=["a"], - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=None) - with self.assertRaisesRegex(ValueError, "last shape dimension must be 1"): - _ = layer(input_data) - - def test_standardization_with_invalid_standardize_arg(self): - with self.assertRaisesRegex(ValueError, "Unkown value for `standardize`"): - text_vectorization.TextVectorization( - vocabulary=["a"], standardize="unsupported") - - def test_splitting_with_invalid_split_arg(self): - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization(vocabulary=["a"]) - layer._split = "unsupported" - with self.assertRaisesRegex(ValueError, ".*is not a supported splitting.*"): - _ = layer(input_data) - - def test_vocab_setting_via_init(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT, - vocabulary=vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_setting_via_init_file(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", 
"michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_path = self._write_to_temp_file("vocab_file", vocab_data) - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT, - vocabulary=vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_setting_via_setter(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_path = self._write_to_temp_file("vocab_file", vocab_data) - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_setting_with_oov_via_setter(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - vocab_path = self._write_to_temp_file("vocab_file", vocab_data) - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with tf.io.gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_summary_before_adapt(self): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=10, + pad_to_max_tokens=True, + standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION, + split=None, + ngrams=None, + output_mode=text_vectorization.TF_IDF, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + # We are testing that model.summary() can be called without erroring + # out. 
(b/145726907) + model.summary() + + @parameterized.parameters([list, np.array, tf.constant, tf.ragged.constant]) + def test_lower_and_strip_punctuation(self, data_fn): + input_array = data_fn( + [ + ["Earth", "wInD", "aNd", "firE"], + ["fire|", "an<>d", "{earth}", "michigan@%$"], + ] + ) + expected_output = data_fn( + [ + [b"earth", b"wind", b"and", b"fire"], + [b"fire", b"and", b"earth", b"michigan"], + ] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION, + split=None, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + @parameterized.parameters([list, np.array, tf.constant, tf.ragged.constant]) + def test_strip_punctuation(self, data_fn): + input_array = data_fn( + [ + ["Earth", "wInD", "aNd", "firE"], + ["fire|", "an<>d", "{earth}", "michigan@%$"], + ] + ) + expected_output = data_fn( + [ + [b"Earth", b"wInD", b"aNd", b"firE"], + [b"fire", b"and", b"earth", b"michigan"], + ] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=text_vectorization.STRIP_PUNCTUATION, + split=None, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + @parameterized.parameters([list, np.array, tf.constant, tf.ragged.constant]) + def test_lower(self, data_fn): + input_array = data_fn( + [ + ["Earth", "wInD", "aNd", "firE"], + ["fire|", "an<>d", "{earth}", "michigan@$"], + ] + ) + expected_output = data_fn( + [ + [b"earth", b"wind", b"and", b"fire"], + [b"fire|", b"an<>d", b"{earth}", b"michigan@$"], + ] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=text_vectorization.LOWER, + split=None, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_custom_normalization(self): + input_array = np.array( + [ + ["Earth", "wInD", "aNd", "firE"], + ["fire|", "an<>d", "{earth}", "michigan@%$"], + ] + ) + expected_output = np.array( + [ + [b"earth", b"wind", b"and", b"fire"], + [b"fire|", b"an<>d", b"{earth}", b"michigan@%$"], + ] + ) + + custom_standardization = tf.strings.lower + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=custom_standardization, + split=None, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_whitespace_splitting(self): + input_array = np.array( + [["earth wind and fire"], ["\tfire\tand\nearth michigan "]] + ) + expected_output = [ + [b"earth", b"wind", b"and", b"fire"], + [b"fire", b"and", b"earth", b"michigan"], + ] + + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + 
standardize=None, + split=text_vectorization.WHITESPACE, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_character_splitting(self): + input_array = np.array([["earthwind"], ["and fire"]]) + expected_output = [ + [b"e", b"a", b"r", b"t", b"h", b"w", b"i", b"n", b"d"], + [b"a", b"n", b"d", b" ", b"f", b"i", b"r", b"e"], + ] + + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.CHARACTER, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_custom_string_splitting(self): + input_array = np.array( + [["earth>wind>and fire"], ["\tfire>and\nearth>michigan"]] + ) + expected_output = [ + [b"earth", b"wind", b"and fire"], + [b"\tfire", b"and\nearth", b"michigan"], + ] + + custom_split = lambda x: tf.strings.split(x, sep=">") + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=custom_split, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_single_ngram_value_ragged_inputs(self): + input_array = tf.ragged.constant( + [["earth", "wind", "and", "fire"], ["fire", "and", "earth"]] + ) + # pyformat: disable + expected_output = [ + [ + b"earth", + b"wind", + b"and", + b"fire", + b"earth wind", + b"wind and", + b"and fire", + b"earth wind and", + b"wind and fire", + ], + [ + b"fire", + b"and", + b"earth", + b"fire and", + b"and earth", + b"fire and earth", + ], + ] + # pyformat: enable + + input_data = keras.Input(shape=(None,), ragged=True, dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + ngrams=3, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_single_ngram_value(self): + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + # pyformat: disable + expected_output = [ + [ + b"earth", + b"wind", + b"and", + b"fire", + b"earth wind", + b"wind and", + b"and fire", + b"earth wind and", + b"wind and fire", + ], + [ + b"fire", + b"and", + b"earth", + b"michigan", + b"fire and", + b"and earth", + b"earth michigan", + b"fire and earth", + b"and earth michigan", + ], + ] + # pyformat: enable + + input_data = keras.Input(shape=(4,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + ngrams=3, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_multiple_ngram_values(self): + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + # 
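The `split` modes covered by these tests likewise map roughly onto plain `tf.strings` ops: whitespace splitting, per-character splitting, and an arbitrary callable (the tests pass `lambda x: tf.strings.split(x, sep=">")`). A rough sketch, illustrative only:

import tensorflow as tf

print(tf.strings.split(tf.constant(["earth wind and fire"])))           # ~ "whitespace"
print(tf.strings.unicode_split(tf.constant(["and fire"]), "UTF-8"))     # ~ "character": keeps the space
print(tf.strings.split(tf.constant(["earth>wind>and fire"]), sep=">"))  # ~ custom callable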
pyformat: disable + expected_output = [ + [ + b"earth wind", + b"wind and", + b"and fire", + b"earth wind and", + b"wind and fire", + ], + [ + b"fire and", + b"and earth", + b"earth michigan", + b"fire and earth", + b"and earth michigan", + ], + ] + # pyformat: enable + + input_data = keras.Input(shape=(4,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + ngrams=(2, 3), + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_string_multiple_preprocessing_steps(self): + input_array = np.array( + [["earth wInD and firE"], ["\tfire\tand\nearth!! michig@n "]] + ) + expected_output = [ + [ + b"earth", + b"wind", + b"and", + b"fire", + b"earth wind", + b"wind and", + b"and fire", + ], + [ + b"fire", + b"and", + b"earth", + b"michign", + b"fire and", + b"and earth", + b"earth michign", + ], + ] + + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION, + split=text_vectorization.WHITESPACE, + ngrams=2, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_string_splitting_with_non_1d_array_fails(self): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + vocabulary=["a"], + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=None, + ) + with self.assertRaisesRegex( + ValueError, "last shape dimension must be 1" + ): + _ = layer(input_data) + + def test_string_splitting_with_non_1d_raggedarray_fails(self): + input_data = keras.Input(shape=(None,), ragged=True, dtype=tf.string) + layer = text_vectorization.TextVectorization( + vocabulary=["a"], + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=None, + ) + with self.assertRaisesRegex( + ValueError, "last shape dimension must be 1" + ): + _ = layer(input_data) + + def test_standardization_with_invalid_standardize_arg(self): + with self.assertRaisesRegex( + ValueError, "Unkown value for `standardize`" + ): + text_vectorization.TextVectorization( + vocabulary=["a"], standardize="unsupported" + ) + + def test_splitting_with_invalid_split_arg(self): + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization(vocabulary=["a"]) + layer._split = "unsupported" + with self.assertRaisesRegex( + ValueError, ".*is not a supported splitting.*" + ): + _ = layer(input_data) + + def test_vocab_setting_via_init(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + vocabulary=vocab_data, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def 
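The ngram expectations above encode a specific ordering: for each example, all 1-grams, then all 2-grams, then all 3-grams, with `ngrams=3` shorthand for orders (1, 2, 3) and `ngrams=(2, 3)` dropping the unigrams. A pure-Python sketch of that ordering (the layer computes it with ragged ops internally):

def ngram_expand(tokens, orders):
    """Mimics the output ordering asserted in the tests: grams grouped by order."""
    out = []
    for n in orders:
        out.extend(" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    return out

print(ngram_expand(["earth", "wind", "and", "fire"], (1, 2, 3)))
# ['earth', 'wind', 'and', 'fire', 'earth wind', 'wind and', 'and fire',
#  'earth wind and', 'wind and fire']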
test_vocab_setting_via_init_file(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_path = self._write_to_temp_file("vocab_file", vocab_data) + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + vocabulary=vocab_path, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_vocab_setting_via_setter(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_path = self._write_to_temp_file("vocab_file", vocab_data) + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + ) + layer.set_vocabulary(vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_vocab_setting_with_oov_via_setter(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + vocab_path = self._write_to_temp_file("vocab_file", vocab_data) + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + ) + layer.set_vocabulary(vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) class TextVectorizationDistributionTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_distribution_strategy_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - strategy = tf.distribute.OneDeviceStrategy("/cpu:0") - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_distribution_strategy_output(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 
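The expected indices in all of these vocab tests follow from one rule: in "int" mode, index 0 is the padding token "" and index 1 is the OOV token "[UNK]", so user tokens start at 2 and unknown inputs map to 1. A minimal sketch with the public layer:

import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    standardize=None, split=None, output_mode="int"
)
layer.set_vocabulary(["earth", "wind", "and", "fire"])
print(layer.get_vocabulary())  # ['', '[UNK]', 'earth', 'wind', 'and', 'fire']
print(layer(tf.constant([["fire", "and", "earth", "michigan"]])))  # [[5 4 2 1]], OOV -> 1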
1]] + + strategy = tf.distribute.OneDeviceStrategy("/cpu:0") + with strategy.scope(): + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) class TextVectorizationOutputTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_int_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_densifies_with_zeros(self): - vocab_data = ["earth", "wind", "and", "fire"] - # Create an input array that has 5 elements in the first example and 4 in - # the second. This should output a 2x5 tensor with a padding value in the - # second example. - input_array = np.array([["earth wind and also fire"], - ["fire and earth michigan"]]) - expected_output = [[2, 3, 4, 1, 5], [5, 4, 2, 1, 0]] - - # This test doesn't explicitly set an output shape, so the 2nd dimension - # should stay 'None'. - expected_output_shape = [None, None] - - # The input shape here is explicitly 1 because we're tokenizing. - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_ragged(self): - vocab_data = ["earth", "wind", "and", "fire"] - # Create an input array that has 5 elements in the first example and 4 in - # the second. - input_array = np.array([["earth wind and also fire"], - ["fire and earth michigan"]]) - expected_output = tf.ragged.constant([[2, 3, 4, 1, 5], [5, 4, 2, 1]]) - expected_output_shape = [None, None] - - # The input shape here is explicitly 1 because we're tokenizing. 
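The distribution test only checks that building and running the layer inside a strategy scope gives the same answer as the unscoped case. A condensed sketch of the same setup:

import tensorflow as tf

strategy = tf.distribute.OneDeviceStrategy("/cpu:0")
with strategy.scope():
    inputs = tf.keras.Input(shape=(None,), dtype=tf.string)
    layer = tf.keras.layers.TextVectorization(
        standardize=None, split=None, output_mode="int"
    )
    layer.set_vocabulary(["earth", "wind", "and", "fire"])
    model = tf.keras.Model(inputs, layer(inputs))
print(model.predict(tf.constant([["earth", "wind", "and", "fire"]])))  # [[2 3 4 5]]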
- input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=text_vectorization.INT, - ragged=True) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_densifies_with_zeros_and_pads(self): - vocab_data = ["earth", "wind", "and", "fire"] - # Create an input array that has 5 elements in the first example and 4 in - # the second. This should output a 2x6 tensor with a padding value in the - # second example, since output_sequence_length is set to 6. - input_array = np.array([["earth wind and also fire"], - ["fire and earth michigan"]]) - expected_output = [[2, 3, 4, 1, 5, 0], [5, 4, 2, 1, 0, 0]] - - output_sequence_length = 6 - expected_output_shape = [None, output_sequence_length] - - # The input shape here is explicitly 1 because we're tokenizing. - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=text_vectorization.INT, - output_sequence_length=output_sequence_length) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_densifies_with_zeros_and_strips(self): - vocab_data = ["earth", "wind", "and", "fire"] - # Create an input array that has 5 elements in the first example and 4 in - # the second. This should output a 2x3 tensor with a padding value in the - # second example, since output_sequence_length is set to 3. - input_array = np.array([["earth wind and also fire"], - ["fire and earth michigan"]]) - expected_output = [[2, 3, 4], [5, 4, 2]] - output_sequence_length = 3 - expected_output_shape = [None, output_sequence_length] - - # The input shape here is explicitly 1 because we're tokenizing. - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=text_vectorization.INT, - output_sequence_length=output_sequence_length) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_int_output_dynamically_strips_and_pads(self): - vocab_data = ["earth", "wind", "and", "fire"] - # Create an input array that has 5 elements in the first example and 4 in - # the second. This should output a 2x3 tensor with a padding value in the - # second example, since output_sequence_length is set to 3. - input_array = np.array([["earth wind and also fire"], - ["fire and earth michigan"]]) - expected_output = [[2, 3, 4], [5, 4, 2]] - output_sequence_length = 3 - expected_output_shape = [None, output_sequence_length] - - # The input shape here is explicitly 1 because we're tokenizing. 
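`ragged=True` is the one knob in these int-output tests that changes the container rather than the values: rows keep their own lengths instead of being densified with the padding index 0. A short sketch, assuming a TF version where the layer supports ragged output:

import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    standardize=None, split="whitespace", output_mode="int", ragged=True
)
layer.set_vocabulary(["earth", "wind", "and", "fire"])
out = layer(tf.constant([["earth wind and also fire"], ["fire and earth michigan"]]))
print(out)  # <tf.RaggedTensor [[2, 3, 4, 1, 5], [5, 4, 2, 1]]>, no trailing zeros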
- input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=text_vectorization.INT, - output_sequence_length=output_sequence_length) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - # Create an input array that has 1 element in the first example and 2 in - # the second. This should output a 2x3 tensor with a padding value in the - # second example, since output_sequence_length is set to 3. - input_array_2 = np.array([["wind"], ["fire and"]]) - expected_output_2 = [[3, 0, 0], [5, 4, 0]] - output_dataset = model.predict(input_array_2) - self.assertAllEqual(expected_output_2, output_dataset) - - @parameterized.parameters( - {"sparse": True}, - {"sparse": False}, - ) - def test_multi_hot_output_hard_maximum(self, sparse): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0, 0], - [1, 1, 0, 1, 0, 0]] - # pyformat: enable - max_tokens = 6 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=max_tokens, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - pad_to_max_tokens=True, - sparse=sparse) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - if sparse: - expected_output = tf.sparse.from_dense(tf.constant(expected_output)) - self.assertAllEqual(expected_output.indices, output_dataset.indices) - self.assertAllEqual(expected_output.values, output_dataset.values) - else: - self.assertAllEqual(expected_output, output_dataset) - - @parameterized.parameters( - {"sparse": True}, - {"sparse": False}, - ) - def test_multi_hot_output_soft_maximum(self, sparse): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0], - [1, 1, 0, 1, 0]] - # pyformat: enable - max_tokens = 5 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=10, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - pad_to_max_tokens=False, - sparse=sparse) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - if sparse: - expected_output = tf.sparse.from_dense(tf.constant(expected_output)) - self.assertAllEqual(expected_output.indices, output_dataset.indices) - self.assertAllEqual(expected_output.values, output_dataset.values) - else: - self.assertAllEqual(expected_output, output_dataset) - - def test_multi_hot_output_hard_maximum_set_vocabulary_after_build(self): - vocab_data = ["earth", 
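`output_sequence_length` makes the trim/pad behavior explicit: every row comes out exactly that long, longer token streams are truncated, shorter ones padded with 0, and this holds per batch at predict time. A sketch of both directions:

import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    standardize=None, split="whitespace", output_mode="int",
    output_sequence_length=3,
)
layer.set_vocabulary(["earth", "wind", "and", "fire"])
print(layer(tf.constant([["earth wind and also fire"]])))  # [[2 3 4]]: trimmed to 3
print(layer(tf.constant([["fire and"]])))                  # [[5 4 0]]: padded to 3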
"wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0], - [1, 1, 0, 1, 0]] - # pyformat: enable - max_tokens = 5 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=max_tokens, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - pad_to_max_tokens=True) - int_data = layer(input_data) - layer.set_vocabulary(vocab_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_multi_hot_output_hard_maximum_adapt_after_build(self): - vocab_data = np.array([ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ]) - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0], - [1, 1, 0, 1, 0]] - # pyformat: enable - max_tokens = 5 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=max_tokens, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - pad_to_max_tokens=True) - int_data = layer(input_data) - layer.adapt(vocab_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_multi_hot_output_hard_maximum_multiple_adapts(self): - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - adapt_data = ["earth", "earth", "earth", "earth", "wind", "wind", "wind"] - first_expected_output = [ - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0], - ] - second_adapt_data = [ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ] - second_expected_output = [ - [0, 1, 1, 1, 0], - [1, 1, 0, 1, 0], - ] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - pad_to_max_tokens=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - # Test the first adapt - layer.adapt(adapt_data) - first_output = model.predict(input_array) - # Test the second adapt - layer.adapt(second_adapt_data) - # We need to recompile the model to retrace our call graph. 
- model.compile() - second_output = model.predict(input_array) - self.assertAllEqual(first_expected_output, first_output) - self.assertAllEqual(second_expected_output, second_output) - - def test_multi_hot_output_soft_maximum_set_state_after_build(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[0, 1, 1, 1, 0], - [1, 1, 0, 1, 0]] - # pyformat: enable - max_tokens = 5 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=10, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - pad_to_max_tokens=False) - layer.build(input_data.shape) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_size_changed_pad_to_max_false_fails(self): - vocab_data = ["earth", "wind", "and", "fire"] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - pad_to_max_tokens=False) - layer.adapt(vocab_data) - _ = layer(input_data) - - layer.set_vocabulary(vocab_data[:2]) - with self.assertRaisesRegex(RuntimeError, - "vocabulary size cannot be changed"): - _ = layer(input_data) - - def test_count_output_hard_maximum(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[0, 2, 1, 1, 0, 0], - [2, 1, 0, 1, 0, 0]] - # pyformat: enable - max_tokens = 6 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=6, - standardize=None, - split=None, - output_mode=text_vectorization.COUNT, - pad_to_max_tokens=True) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_count_output_soft_maximum(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[0, 2, 1, 1, 0], - [2, 1, 0, 1, 0]] - # pyformat: enable - max_tokens = 5 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=10, - standardize=None, - split=None, - output_mode=text_vectorization.COUNT, - pad_to_max_tokens=False) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - @parameterized.named_parameters( - ("sparse", True), - ("dense", False), - ) - def 
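The hard/soft-maximum pairs in these tests reduce to how the multi-hot width is chosen: `pad_to_max_tokens=True` pins it to `max_tokens`, while `False` lets it equal the actual vocabulary size (the user tokens plus one OOV slot at index 0; the non-"int" output modes reserve no padding token). A sketch of the two widths:

import tensorflow as tf

vocab = ["earth", "wind", "and", "fire"]
doc = tf.constant([["ohio", "and", "earth", "michigan"]])  # two OOV tokens

soft = tf.keras.layers.TextVectorization(
    standardize=None, split=None, output_mode="multi_hot",
    max_tokens=10, pad_to_max_tokens=False,
)
soft.set_vocabulary(vocab)
print(soft(doc))  # [[1. 1. 0. 1. 0.]]: width 5 = 4 tokens + OOV slot

hard = tf.keras.layers.TextVectorization(
    standardize=None, split=None, output_mode="multi_hot",
    max_tokens=6, pad_to_max_tokens=True,
)
hard.set_vocabulary(vocab)
print(hard(doc))  # [[1. 1. 0. 1. 0. 0.]]: width pinned at max_tokens=6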
test_tfidf_output_hard_maximum(self, sparse): - vocab_data = ["earth", "wind", "and", "fire"] - # OOV idf weight (bucket 0) should 0.5, the average of passed weights. - idf_weights = [.4, .25, .75, .6] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, .8, .25, .75, 0, 0], - [ 1, .4, 0, 0, .6, 0]] - # pylint: enable=bad-whitespace - # pyformat: enable - max_tokens = 6 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=6, - standardize=None, - split=None, - output_mode=text_vectorization.TF_IDF, - pad_to_max_tokens=True, - sparse=sparse, - vocabulary=vocab_data, - idf_weights=idf_weights) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - if sparse: - output_dataset = tf.sparse.to_dense(output_dataset) - self.assertAllClose(expected_output, output_dataset) - - @parameterized.named_parameters( - ("sparse", True), - ("dense", False), - ) - def test_tfidf_output_soft_maximum(self, sparse): - vocab_data = ["earth", "wind", "and", "fire"] - # OOV idf weight (bucket 0) should 0.5, the average of passed weights. - idf_weights = [.4, .25, .75, .6] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, .8, .25, .75, 0], - [ 1, .4, 0, 0, .6]] - # pylint: enable=bad-whitespace - # pyformat: enable - max_tokens = 5 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=10, - standardize=None, - split=None, - output_mode=text_vectorization.TF_IDF, - pad_to_max_tokens=False, - sparse=sparse, - vocabulary=vocab_data, - idf_weights=idf_weights) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - if sparse: - output_dataset = tf.sparse.to_dense(output_dataset) - self.assertAllClose(expected_output, output_dataset) - - @parameterized.named_parameters( - ("sparse", True), - ("dense", False), - ) - def test_tfidf_output_set_oov_weight(self, sparse): - vocab_data = ["[UNK]", "earth", "wind", "and", "fire"] - idf_weights = [.1, .4, .25, .75, .6] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, .8, .25, .75, 0], - [ .2, .4, 0, 0, .6]] - # pylint: enable=bad-whitespace - # pyformat: enable - max_tokens = 5 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=10, - standardize=None, - split=None, - output_mode=text_vectorization.TF_IDF, - pad_to_max_tokens=False, - sparse=sparse, - vocabulary=vocab_data, - idf_weights=idf_weights) - int_data = layer(input_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - if sparse: - output_dataset 
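The TF-IDF expectations are easiest to verify by hand with the two rules these tests encode: each output slot is count(token) * idf_weight, and the OOV slot's weight defaults to the mean of the supplied `idf_weights` unless "[UNK]" is listed explicitly in the vocabulary (as in test_tfidf_output_set_oov_weight, where it is pinned to 0.1). Checking the expected row for ["ohio", "fire", "earth", "michigan"]:

idf = {"earth": 0.4, "wind": 0.25, "and": 0.75, "fire": 0.6}
oov_idf = sum(idf.values()) / len(idf)  # 0.5, the default OOV weight

doc = ["ohio", "fire", "earth", "michigan"]  # "ohio" and "michigan" are OOV
oov_count = sum(1 for t in doc if t not in idf)
print(oov_count * oov_idf)                # 1.0 -> bucket 0 of the expected output
print(doc.count("earth") * idf["earth"])  # 0.4
print(doc.count("fire") * idf["fire"])    # 0.6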
= tf.sparse.to_dense(output_dataset) - self.assertAllClose(expected_output, output_dataset) - - def test_accept_1D_input(self): - input_array = np.array(["earth wind and fire", - "fire and earth michigan"]) - layer = text_vectorization.TextVectorization( - standardize=None, split=None, output_mode="int") - layer.adapt(input_array) - _ = layer(input_array) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_int_output(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_densifies_with_zeros(self): + vocab_data = ["earth", "wind", "and", "fire"] + # Create an input array that has 5 elements in the first example and 4 + # in the second. This should output a 2x5 tensor with a padding value in + # the second example. + input_array = np.array( + [["earth wind and also fire"], ["fire and earth michigan"]] + ) + expected_output = [[2, 3, 4, 1, 5], [5, 4, 2, 1, 0]] + + # This test doesn't explicitly set an output shape, so the 2nd dimension + # should stay 'None'. + expected_output_shape = [None, None] + + # The input shape here is explicitly 1 because we're tokenizing. + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=text_vectorization.INT, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_ragged(self): + vocab_data = ["earth", "wind", "and", "fire"] + # Create an input array that has 5 elements in the first example and 4 + # in the second. + input_array = np.array( + [["earth wind and also fire"], ["fire and earth michigan"]] + ) + expected_output = tf.ragged.constant([[2, 3, 4, 1, 5], [5, 4, 2, 1]]) + expected_output_shape = [None, None] + + # The input shape here is explicitly 1 because we're tokenizing. + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=text_vectorization.INT, + ragged=True, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_densifies_with_zeros_and_pads(self): + vocab_data = ["earth", "wind", "and", "fire"] + # Create an input array that has 5 elements in the first example and 4 + # in the second. 
This should output a 2x6 tensor with a padding value in + # the second example, since output_sequence_length is set to 6. + input_array = np.array( + [["earth wind and also fire"], ["fire and earth michigan"]] + ) + expected_output = [[2, 3, 4, 1, 5, 0], [5, 4, 2, 1, 0, 0]] + + output_sequence_length = 6 + expected_output_shape = [None, output_sequence_length] + + # The input shape here is explicitly 1 because we're tokenizing. + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=text_vectorization.INT, + output_sequence_length=output_sequence_length, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_densifies_with_zeros_and_strips(self): + vocab_data = ["earth", "wind", "and", "fire"] + # Create an input array that has 5 elements in the first example and 4 + # in the second. This should output a 2x3 tensor with a padding value in + # the second example, since output_sequence_length is set to 3. + input_array = np.array( + [["earth wind and also fire"], ["fire and earth michigan"]] + ) + expected_output = [[2, 3, 4], [5, 4, 2]] + output_sequence_length = 3 + expected_output_shape = [None, output_sequence_length] + + # The input shape here is explicitly 1 because we're tokenizing. + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=text_vectorization.INT, + output_sequence_length=output_sequence_length, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_dynamically_strips_and_pads(self): + vocab_data = ["earth", "wind", "and", "fire"] + # Create an input array that has 5 elements in the first example and 4 + # in the second. This should output a 2x3 tensor with a padding value in + # the second example, since output_sequence_length is set to 3. + input_array = np.array( + [["earth wind and also fire"], ["fire and earth michigan"]] + ) + expected_output = [[2, 3, 4], [5, 4, 2]] + output_sequence_length = 3 + expected_output_shape = [None, output_sequence_length] + + # The input shape here is explicitly 1 because we're tokenizing. + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=text_vectorization.INT, + output_sequence_length=output_sequence_length, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + # Create an input array that has 1 element in the first example and 2 in + # the second. 
This should output a 2x3 tensor with a padding value in + # the second example, since output_sequence_length is set to 3. + input_array_2 = np.array([["wind"], ["fire and"]]) + expected_output_2 = [[3, 0, 0], [5, 4, 0]] + output_dataset = model.predict(input_array_2) + self.assertAllEqual(expected_output_2, output_dataset) + + @parameterized.parameters( + {"sparse": True}, + {"sparse": False}, + ) + def test_multi_hot_output_hard_maximum(self, sparse): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0]] + # pyformat: enable + max_tokens = 6 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=max_tokens, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + pad_to_max_tokens=True, + sparse=sparse, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + if sparse: + expected_output = tf.sparse.from_dense(tf.constant(expected_output)) + self.assertAllEqual(expected_output.indices, output_dataset.indices) + self.assertAllEqual(expected_output.values, output_dataset.values) + else: + self.assertAllEqual(expected_output, output_dataset) + + @parameterized.parameters( + {"sparse": True}, + {"sparse": False}, + ) + def test_multi_hot_output_soft_maximum(self, sparse): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0], [1, 1, 0, 1, 0]] + # pyformat: enable + max_tokens = 5 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=10, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + pad_to_max_tokens=False, + sparse=sparse, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + if sparse: + expected_output = tf.sparse.from_dense(tf.constant(expected_output)) + self.assertAllEqual(expected_output.indices, output_dataset.indices) + self.assertAllEqual(expected_output.values, output_dataset.values) + else: + self.assertAllEqual(expected_output, output_dataset) + + def test_multi_hot_output_hard_maximum_set_vocabulary_after_build(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0], [1, 1, 0, 1, 0]] + # pyformat: enable + max_tokens = 5 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=max_tokens, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + pad_to_max_tokens=True, + ) + int_data = layer(input_data) + layer.set_vocabulary(vocab_data) + 
self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_multi_hot_output_hard_maximum_adapt_after_build(self): + vocab_data = np.array( + [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + "and", + "and", + "fire", + ] + ) + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0], [1, 1, 0, 1, 0]] + # pyformat: enable + max_tokens = 5 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=max_tokens, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + pad_to_max_tokens=True, + ) + int_data = layer(input_data) + layer.adapt(vocab_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_multi_hot_output_hard_maximum_multiple_adapts(self): + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + adapt_data = [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + ] + first_expected_output = [ + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0], + ] + second_adapt_data = [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + "and", + "and", + "fire", + ] + second_expected_output = [ + [0, 1, 1, 1, 0], + [1, 1, 0, 1, 0], + ] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=5, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + pad_to_max_tokens=True, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + # Test the first adapt + layer.adapt(adapt_data) + first_output = model.predict(input_array) + # Test the second adapt + layer.adapt(second_adapt_data) + # We need to recompile the model to retrace our call graph. 
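Two behaviors worth noting from the adapt-based variants: `adapt` orders the learned vocabulary by descending token frequency, and because re-adapting swaps the lookup table, a model that has already traced `predict` must be recompiled so the call graph is retraced (hence the `model.compile()` between the two adapts in the test). A sketch of the frequency ordering:

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    standardize=None, split=None, output_mode="int"
)
layer.adapt(np.array(["earth"] * 4 + ["wind"] * 3 + ["and"] * 2 + ["fire"]))
# The most frequent token gets the lowest non-reserved index.
print(layer.get_vocabulary())  # ['', '[UNK]', 'earth', 'wind', 'and', 'fire']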
+ model.compile() + second_output = model.predict(input_array) + self.assertAllEqual(first_expected_output, first_output) + self.assertAllEqual(second_expected_output, second_output) + + def test_multi_hot_output_soft_maximum_set_state_after_build(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[0, 1, 1, 1, 0], [1, 1, 0, 1, 0]] + # pyformat: enable + max_tokens = 5 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=10, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + pad_to_max_tokens=False, + ) + layer.build(input_data.shape) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_vocab_size_changed_pad_to_max_false_fails(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + pad_to_max_tokens=False, + ) + layer.adapt(vocab_data) + _ = layer(input_data) + + with self.assertRaisesRegex( + RuntimeError, "vocabulary size cannot be changed" + ): + layer.set_vocabulary(vocab_data[:2]) + + def test_count_output_hard_maximum(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[0, 2, 1, 1, 0, 0], [2, 1, 0, 1, 0, 0]] + # pyformat: enable + max_tokens = 6 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=6, + standardize=None, + split=None, + output_mode=text_vectorization.COUNT, + pad_to_max_tokens=True, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_count_output_soft_maximum(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[0, 2, 1, 1, 0], [2, 1, 0, 1, 0]] + # pyformat: enable + max_tokens = 5 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=10, + standardize=None, + split=None, + output_mode=text_vectorization.COUNT, + pad_to_max_tokens=False, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), 
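"count" mode follows the same index layout as multi-hot but accumulates occurrences instead of clamping to 1, with OOV hits accumulating in slot 0. A minimal sketch:

import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    standardize=None, split=None, output_mode="count",
    max_tokens=10, pad_to_max_tokens=False,
)
layer.set_vocabulary(["earth", "wind", "and", "fire"])
# "earth" appears twice -> 2.0 in its slot; no OOV tokens -> 0.0 in slot 0.
print(layer(tf.constant([["earth", "wind", "and", "earth"]])))  # [[0. 2. 1. 1. 0.]]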
+ ) + def test_tfidf_output_hard_maximum(self, sparse): + vocab_data = ["earth", "wind", "and", "fire"] + # OOV idf weight (bucket 0) should be 0.5, the average of passed weights. + idf_weights = [0.4, 0.25, 0.75, 0.6] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "fire", "earth", "michigan"], + ] + ) + + # pyformat: disable + + expected_output = [[0, 0.8, 0.25, 0.75, 0, 0], [1, 0.4, 0, 0, 0.6, 0]] + + # pyformat: enable + max_tokens = 6 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=6, + standardize=None, + split=None, + output_mode=text_vectorization.TF_IDF, + pad_to_max_tokens=True, + sparse=sparse, + vocabulary=vocab_data, + idf_weights=idf_weights, + ) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + if sparse: + output_dataset = tf.sparse.to_dense(output_dataset) + self.assertAllClose(expected_output, output_dataset) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), + ) + def test_tfidf_output_soft_maximum(self, sparse): + vocab_data = ["earth", "wind", "and", "fire"] + # OOV idf weight (bucket 0) should be 0.5, the average of passed weights. + idf_weights = [0.4, 0.25, 0.75, 0.6] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "fire", "earth", "michigan"], + ] + ) + + # pyformat: disable + + expected_output = [[0, 0.8, 0.25, 0.75, 0], [1, 0.4, 0, 0, 0.6]] + + # pyformat: enable + max_tokens = 5 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=10, + standardize=None, + split=None, + output_mode=text_vectorization.TF_IDF, + pad_to_max_tokens=False, + sparse=sparse, + vocabulary=vocab_data, + idf_weights=idf_weights, + ) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + if sparse: + output_dataset = tf.sparse.to_dense(output_dataset) + self.assertAllClose(expected_output, output_dataset) + + @parameterized.named_parameters( + ("sparse", True), + ("dense", False), + ) + def test_tfidf_output_set_oov_weight(self, sparse): + vocab_data = ["[UNK]", "earth", "wind", "and", "fire"] + idf_weights = [0.1, 0.4, 0.25, 0.75, 0.6] + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "fire", "earth", "michigan"], + ] + ) + + # pyformat: disable + + expected_output = [[0, 0.8, 0.25, 0.75, 0], [0.2, 0.4, 0, 0, 0.6]] + + # pyformat: enable + max_tokens = 5 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=10, + standardize=None, + split=None, + output_mode=text_vectorization.TF_IDF, + pad_to_max_tokens=False, + sparse=sparse, + vocabulary=vocab_data, + idf_weights=idf_weights, + ) + int_data = layer(input_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + if sparse: + output_dataset = tf.sparse.to_dense(output_dataset) + self.assertAllClose(expected_output, output_dataset) + + def
test_accept_1D_input(self): + input_array = np.array( + ["earth wind and fire", "fire and earth michigan"] + ) + layer = text_vectorization.TextVectorization( + standardize=None, split=None, output_mode="int" + ) + layer.adapt(input_array) + _ = layer(input_array) @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) class TextVectorizationModelBuildingTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - @parameterized.named_parameters( - { - "testcase_name": "count_hard_max", - "pad_to_max_tokens": True, - "output_mode": text_vectorization.COUNT - }, { - "testcase_name": "count_soft_max", - "pad_to_max_tokens": False, - "output_mode": text_vectorization.COUNT - }, { - "testcase_name": "binary_hard_max", - "pad_to_max_tokens": True, - "output_mode": text_vectorization.MULTI_HOT - }, { - "testcase_name": "binary_soft_max", - "pad_to_max_tokens": False, - "output_mode": text_vectorization.MULTI_HOT - }, { - "testcase_name": "tfidf_hard_max", - "pad_to_max_tokens": True, - "output_mode": text_vectorization.TF_IDF - }, { - "testcase_name": "tfidf_soft_max", - "pad_to_max_tokens": False, - "output_mode": text_vectorization.TF_IDF - }) - def test_end_to_end_bagged_modeling(self, output_mode, pad_to_max_tokens): - vocab_data = ["earth", "wind", "and", "fire"] - if output_mode == text_vectorization.TF_IDF: - idf_weights = [.5, .25, .2, .125] - else: - idf_weights = None - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=10, - standardize=None, - split=None, - output_mode=output_mode, - pad_to_max_tokens=pad_to_max_tokens, - vocabulary=vocab_data, - idf_weights=idf_weights) - - int_data = layer(input_data) - float_data = backend.cast(int_data, dtype="float32") - output_data = core.Dense(64)(float_data) - model = keras.Model(inputs=input_data, outputs=output_data) - _ = model.predict(input_array) - - def test_end_to_end_vocab_modeling(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth wind and also fire"], - ["fire and earth michigan"]]) - output_sequence_length = 6 - max_tokens = 5 - - # The input shape here is explicitly 1 because we're tokenizing. 
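The model-building tests boil down to two wiring patterns: "int" output feeds sequence models via an Embedding, while the bag modes (multi_hot/count/tf_idf) can be cast to float and fed to Dense directly. A condensed end-to-end sketch of the sequence path (layer sizes follow the test; Embedding's `input_dim` must cover the vocabulary plus the two reserved indices):

import tensorflow as tf

vocab = ["earth", "wind", "and", "fire"]
inputs = tf.keras.Input(shape=(1,), dtype=tf.string)  # shape (1,): one string per example
ids = tf.keras.layers.TextVectorization(
    standardize=None, split="whitespace", output_mode="int",
    output_sequence_length=6, vocabulary=vocab,
)(inputs)
x = tf.keras.layers.Embedding(input_dim=len(vocab) + 2, output_dim=32)(ids)
outputs = tf.keras.layers.Conv1D(250, 3, padding="valid", activation="relu")(x)
model = tf.keras.Model(inputs, outputs)
print(model.predict(tf.constant([["earth wind and also fire"]])).shape)  # (1, 4, 250)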
- input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=text_vectorization.WHITESPACE, - output_mode=text_vectorization.INT, - output_sequence_length=output_sequence_length) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - embedded_data = core.Embedding( - input_dim=max_tokens + 1, output_dim=32)( - int_data) - output_data = convolutional.Conv1D( - 250, 3, padding="valid", activation="relu", strides=1)( - embedded_data) - - model = keras.Model(inputs=input_data, outputs=output_data) - _ = model.predict(input_array) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + @parameterized.named_parameters( + { + "testcase_name": "count_hard_max", + "pad_to_max_tokens": True, + "output_mode": text_vectorization.COUNT, + }, + { + "testcase_name": "count_soft_max", + "pad_to_max_tokens": False, + "output_mode": text_vectorization.COUNT, + }, + { + "testcase_name": "binary_hard_max", + "pad_to_max_tokens": True, + "output_mode": text_vectorization.MULTI_HOT, + }, + { + "testcase_name": "binary_soft_max", + "pad_to_max_tokens": False, + "output_mode": text_vectorization.MULTI_HOT, + }, + { + "testcase_name": "tfidf_hard_max", + "pad_to_max_tokens": True, + "output_mode": text_vectorization.TF_IDF, + }, + { + "testcase_name": "tfidf_soft_max", + "pad_to_max_tokens": False, + "output_mode": text_vectorization.TF_IDF, + }, + ) + def test_end_to_end_bagged_modeling(self, output_mode, pad_to_max_tokens): + vocab_data = ["earth", "wind", "and", "fire"] + if output_mode == text_vectorization.TF_IDF: + idf_weights = [0.5, 0.25, 0.2, 0.125] + else: + idf_weights = None + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=10, + standardize=None, + split=None, + output_mode=output_mode, + pad_to_max_tokens=pad_to_max_tokens, + vocabulary=vocab_data, + idf_weights=idf_weights, + ) + + int_data = layer(input_data) + float_data = backend.cast(int_data, dtype="float32") + output_data = core.Dense(64)(float_data) + model = keras.Model(inputs=input_data, outputs=output_data) + _ = model.predict(input_array) + + def test_end_to_end_vocab_modeling(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [["earth wind and also fire"], ["fire and earth michigan"]] + ) + output_sequence_length = 6 + max_tokens = 5 + + # The input shape here is explicitly 1 because we're tokenizing. 
+ input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=text_vectorization.WHITESPACE, + output_mode=text_vectorization.INT, + output_sequence_length=output_sequence_length, + ) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + embedded_data = core.Embedding(input_dim=max_tokens + 1, output_dim=32)( + int_data + ) + output_data = convolutional.Conv1D( + 250, 3, padding="valid", activation="relu", strides=1 + )(embedded_data) + + model = keras.Model(inputs=input_data, outputs=output_data) + _ = model.predict(input_array) @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) -class TextVectorizationVocbularyTest( +class TextVectorizationVocabularyTest( test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest, ): + def test_get_vocabulary(self): + vocab = ["earth", "wind", "and", "fire"] - def test_get_vocabulary(self): - vocab = ["earth", "wind", "and", "fire"] + layer = text_vectorization.TextVectorization(vocabulary=vocab) + self.assertAllEqual( + layer.get_vocabulary(), + ["", "[UNK]", "earth", "wind", "and", "fire"], + ) - layer = text_vectorization.TextVectorization(vocabulary=vocab) - self.assertAllEqual(layer.get_vocabulary(), - ["", "[UNK]", "earth", "wind", "and", "fire"]) + def test_get_vocabulary_adapt(self): + vocab = np.array( + [["earth earth earth earth wind wind wind and and fire"]] + ) - def test_get_vocabulary_adapt(self): - vocab = np.array([["earth earth earth earth wind wind wind and and fire"]]) + layer = text_vectorization.TextVectorization() + layer.adapt(vocab) + self.assertAllEqual( + layer.get_vocabulary(), + ["", "[UNK]", "earth", "wind", "and", "fire"], + ) - layer = text_vectorization.TextVectorization() - layer.adapt(vocab) - self.assertAllEqual(layer.get_vocabulary(), - ["", "[UNK]", "earth", "wind", "and", "fire"]) + def test_get_vocabulary_no_special_tokens(self): + vocab = ["earth", "wind", "and", "fire"] - def test_get_vocabulary_no_special_tokens(self): - vocab = ["earth", "wind", "and", "fire"] - - layer = text_vectorization.TextVectorization(vocabulary=vocab) - self.assertAllEqual( - layer.get_vocabulary(include_special_tokens=False), - ["earth", "wind", "and", "fire"]) + layer = text_vectorization.TextVectorization(vocabulary=vocab) + self.assertAllEqual( + layer.get_vocabulary(include_special_tokens=False), + ["earth", "wind", "and", "fire"], + ) @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) -class TextVectorizationErrorTest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): - - def test_too_long_vocab_fails_in_single_setting(self): - vocab_data = ["earth", "wind", "and", "fire"] - - layer = text_vectorization.TextVectorization( - max_tokens=4, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data) - - def test_setting_vocab_without_idf_weights_fails_in_tfidf_mode(self): - vocab_data = ["earth", "wind", "and", "fire"] - - with self.assertRaisesRegex( - ValueError, "`idf_weights` must be set if output_mode is TF_IDF"): - text_vectorization.TextVectorization( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TF_IDF, - vocabulary=vocab_data) - - def test_idf_weights_length_mismatch_fails(self): - vocab_data = ["earth", "wind", "and", "fire"] - 
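`get_vocabulary` is the one accessor the vocabulary tests above rely on: by default it includes the two special tokens in index order, and `include_special_tokens=False` returns just the user-supplied or learned tokens. In short:

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.TextVectorization()
layer.adapt(np.array([["earth earth earth earth wind wind wind and and fire"]]))
print(layer.get_vocabulary())
# ['', '[UNK]', 'earth', 'wind', 'and', 'fire']
print(layer.get_vocabulary(include_special_tokens=False))
# ['earth', 'wind', 'and', 'fire']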
idf_weights = [1, 2, 3] - with self.assertRaisesRegex( - ValueError, "`idf_weights` must be the same length as vocab"): - text_vectorization.TextVectorization( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TF_IDF, - vocabulary=vocab_data, - idf_weights=idf_weights) - - def test_set_tfidf_in_non_tfidf_fails(self): - vocab_data = ["earth", "wind", "and", "fire"] - idf_weights = [1, 2, 3, 4] - with self.assertRaisesRegex(ValueError, - "`idf_weights` should only be set if"): - text_vectorization.TextVectorization( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.MULTI_HOT, - vocabulary=vocab_data, - idf_weights=idf_weights) - - def test_zero_max_tokens_fails(self): - with self.assertRaisesRegex(ValueError, "max_tokens.*"): - _ = text_vectorization.TextVectorization(max_tokens=0) - - def test_non_string_dtype_fails(self): - with self.assertRaisesRegex(ValueError, "dtype of string.*"): - _ = text_vectorization.TextVectorization(dtype=tf.int64) - - def test_unknown_standardize_arg_fails(self): - with self.assertRaisesRegex(ValueError, - "`standardize` arg.*unsupported_value"): - _ = text_vectorization.TextVectorization(standardize="unsupported_value") - - def test_unknown_split_arg_fails(self): - with self.assertRaisesRegex(ValueError, "`split` arg.*unsupported_value"): - _ = text_vectorization.TextVectorization(split="unsupported_value") - - def test_unknown_output_mode_arg_fails(self): - with self.assertRaisesRegex(ValueError, - "`output_mode` arg.*unsupported_value"): - _ = text_vectorization.TextVectorization(output_mode="unsupported_value") - - def test_unknown_ngrams_arg_fails(self): - with self.assertRaisesRegex(ValueError, "ngrams.*unsupported_value"): - _ = text_vectorization.TextVectorization(ngrams="unsupported_value") - - def test_float_ngrams_arg_fails(self): - with self.assertRaisesRegex(ValueError, "ngrams.*2.9"): - _ = text_vectorization.TextVectorization(ngrams=2.9) - - def test_float_tuple_ngrams_arg_fails(self): - with self.assertRaisesRegex(ValueError, "ngrams.*(1.3, 2.9)"): - _ = text_vectorization.TextVectorization(ngrams=(1.3, 2.9)) - - def test_non_int_output_sequence_length_dtype_fails(self): - with self.assertRaisesRegex(ValueError, "output_sequence_length.*2.0"): - _ = text_vectorization.TextVectorization( - output_mode="int", output_sequence_length=2.0) - - def test_non_none_output_sequence_length_fails_if_output_mode_not_int(self): - with self.assertRaisesRegex(ValueError, - "`output_sequence_length` must not be set"): - _ = text_vectorization.TextVectorization( - output_mode="count", output_sequence_length=2) - - def test_non_none_output_sequence_length_fails_if_ragged_true(self): - with self.assertRaisesRegex(ValueError, - "`output_sequence_length` must not be set"): - _ = text_vectorization.TextVectorization( - ragged=True, output_sequence_length=2) - - def test_ragged_true_fails_if_output_mode_not_int(self): - with self.assertRaisesRegex(ValueError, "`ragged` must not be true if"): - _ = text_vectorization.TextVectorization( - ragged=True, output_mode=text_vectorization.MULTI_HOT) - - def test_sparse_true_fails_if_output_mode_is_int(self): - with self.assertRaisesRegex(ValueError, "`sparse` may only be true if"): - _ = text_vectorization.TextVectorization( - sparse=True, output_mode=text_vectorization.INT) +class TextVectorizationErrorTest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_too_long_vocab_fails_in_single_setting(self): + vocab_data = 
["earth", "wind", "and", "fire"] + + layer = text_vectorization.TextVectorization( + max_tokens=4, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + ) + with self.assertRaisesRegex( + ValueError, "vocabulary larger than the maximum vocab.*" + ): + layer.set_vocabulary(vocab_data) + + def test_setting_vocab_without_idf_weights_fails_in_tfidf_mode(self): + vocab_data = ["earth", "wind", "and", "fire"] + + with self.assertRaisesRegex( + ValueError, "`idf_weights` must be set if output_mode is TF_IDF" + ): + text_vectorization.TextVectorization( + max_tokens=5, + standardize=None, + split=None, + output_mode=text_vectorization.TF_IDF, + vocabulary=vocab_data, + ) + + def test_idf_weights_length_mismatch_fails(self): + vocab_data = ["earth", "wind", "and", "fire"] + idf_weights = [1, 2, 3] + with self.assertRaisesRegex( + ValueError, "`idf_weights` must be the same length as vocab" + ): + text_vectorization.TextVectorization( + max_tokens=5, + standardize=None, + split=None, + output_mode=text_vectorization.TF_IDF, + vocabulary=vocab_data, + idf_weights=idf_weights, + ) + + def test_set_tfidf_in_non_tfidf_fails(self): + vocab_data = ["earth", "wind", "and", "fire"] + idf_weights = [1, 2, 3, 4] + with self.assertRaisesRegex( + ValueError, "`idf_weights` should only be set if" + ): + text_vectorization.TextVectorization( + max_tokens=5, + standardize=None, + split=None, + output_mode=text_vectorization.MULTI_HOT, + vocabulary=vocab_data, + idf_weights=idf_weights, + ) + + def test_zero_max_tokens_fails(self): + with self.assertRaisesRegex(ValueError, "max_tokens.*"): + _ = text_vectorization.TextVectorization(max_tokens=0) + + def test_non_string_dtype_fails(self): + with self.assertRaisesRegex(ValueError, "dtype of string.*"): + _ = text_vectorization.TextVectorization(dtype=tf.int64) + + def test_unknown_standardize_arg_fails(self): + with self.assertRaisesRegex( + ValueError, "`standardize` arg.*unsupported_value" + ): + _ = text_vectorization.TextVectorization( + standardize="unsupported_value" + ) + + def test_unknown_split_arg_fails(self): + with self.assertRaisesRegex( + ValueError, "`split` arg.*unsupported_value" + ): + _ = text_vectorization.TextVectorization(split="unsupported_value") + + def test_unknown_output_mode_arg_fails(self): + with self.assertRaisesRegex( + ValueError, "`output_mode` arg.*unsupported_value" + ): + _ = text_vectorization.TextVectorization( + output_mode="unsupported_value" + ) + + def test_unknown_ngrams_arg_fails(self): + with self.assertRaisesRegex(ValueError, "ngrams.*unsupported_value"): + _ = text_vectorization.TextVectorization(ngrams="unsupported_value") + + def test_float_ngrams_arg_fails(self): + with self.assertRaisesRegex(ValueError, "ngrams.*2.9"): + _ = text_vectorization.TextVectorization(ngrams=2.9) + + def test_float_tuple_ngrams_arg_fails(self): + with self.assertRaisesRegex(ValueError, "ngrams.*(1.3, 2.9)"): + _ = text_vectorization.TextVectorization(ngrams=(1.3, 2.9)) + + def test_non_int_output_sequence_length_dtype_fails(self): + with self.assertRaisesRegex(ValueError, "output_sequence_length.*2.0"): + _ = text_vectorization.TextVectorization( + output_mode="int", output_sequence_length=2.0 + ) + + def test_non_none_output_sequence_length_fails_if_output_mode_not_int(self): + with self.assertRaisesRegex( + ValueError, "`output_sequence_length` must not be set" + ): + _ = text_vectorization.TextVectorization( + output_mode="count", output_sequence_length=2 + ) + + def 
test_non_none_output_sequence_length_fails_if_ragged_true(self): + with self.assertRaisesRegex( + ValueError, "`output_sequence_length` must not be set" + ): + _ = text_vectorization.TextVectorization( + ragged=True, output_sequence_length=2 + ) + + def test_ragged_true_fails_if_output_mode_not_int(self): + with self.assertRaisesRegex(ValueError, "`ragged` must not be true if"): + _ = text_vectorization.TextVectorization( + ragged=True, output_mode=text_vectorization.MULTI_HOT + ) + + def test_sparse_true_fails_if_output_mode_is_int(self): + with self.assertRaisesRegex(ValueError, "`sparse` may only be true if"): + _ = text_vectorization.TextVectorization( + sparse=True, output_mode=text_vectorization.INT + ) # Custom functions for the custom callable serialization test. Declared here # to avoid multiple registrations from run_all_keras_modes(). -@generic_utils.register_keras_serializable(package="Test") +@register_keras_serializable(package="Test") def custom_standardize_fn(x): - return tf.strings.lower(x) + return tf.strings.lower(x) -@generic_utils.register_keras_serializable(package="Test") +@register_keras_serializable(package="Test") def custom_split_fn(x): - return tf.strings.split(x, sep=">") + return tf.strings.split(x, sep=">") @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) class TextVectorizationSavingTest( - test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def tearDown(self): - keras.backend.clear_session() - gc.collect() - super(TextVectorizationSavingTest, self).tearDown() - - @parameterized.parameters( - {"init_vocab": True}, - {"init_vocab": False}, - ) - def test_saving(self, init_vocab): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - vocabulary = vocab_data if init_vocab else None - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT, - vocabulary=vocabulary) - if not init_vocab: - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = keras.models.load_model(output_path) - self.assertAllEqual(loaded_model.predict(input_array), expected_output) - - @parameterized.parameters( - {"init_vocab": True}, - {"init_vocab": False}, - ) - def test_saving_when_nested(self, init_vocab): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - # Build and validate a golden model. 
- input_data = keras.Input(shape=(None,), dtype=tf.string) - vocabulary = vocab_data if init_vocab else None - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT, - vocabulary=vocabulary) - if not init_vocab: - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - outer_input = keras.Input(shape=(None,), dtype=tf.string) - outer_output = model(outer_input) - outer_model = keras.Model(inputs=outer_input, outputs=outer_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - outer_model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = keras.models.load_model(output_path) - self.assertAllEqual(loaded_model.predict(input_array), expected_output) - - def test_saving_when_adapted(self): - adapt_data = [ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.adapt(adapt_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - - model.save(output_path, save_format="tf") - - # Delete the session and graph to ensure that the loaded model is generated - # from scratch. - keras.backend.clear_session() - - loaded_model = keras.models.load_model(output_path) - self.assertAllEqual(loaded_model.predict(input_array), expected_output) - - def test_saving_with_tfidf(self): - vocab_data = ["earth", "wind", "and", "fire"] - # OOV idf weight (bucket 0) should 0.5, the average of passed weights. - idf_weights = [.4, .25, .75, .6] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, .8, .25, .75, 0], - [ 1, .4, 0, 0, .6]] - vocab_data = ["earth", "wind", "and", "fire"] - # pylint: enable=bad-whitespace - # pyformat: enable - - # Build and validate a golden model. - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TF_IDF) - layer.set_vocabulary(vocab_data, idf_weights=idf_weights) - - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(output_dataset, expected_output) - - # Save the model to disk. - output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") - model.save(output_path, save_format="tf") - loaded_model = keras.models.load_model(output_path) - - # Ensure that the loaded model is unique (so that the save/load is real) - self.assertIsNot(model, loaded_model) - - # Validate correctness of the new model. 
- new_output_dataset = loaded_model.predict(input_array) - self.assertAllClose(new_output_dataset, expected_output) - - def test_serialization_with_custom_callables(self): - input_array = np.array([["earth>wind>and Fire"], - ["\tfire>And\nearth>michigan"]]) - expected_output = [[b"earth", b"wind", b"and fire"], - [b"\tfire", b"and\nearth", b"michigan"]] - - input_data = keras.Input(shape=(1,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=None, - standardize=custom_standardize_fn, - split=custom_split_fn, - ngrams=None, - output_mode=None) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - serialized_model_data = model.get_config() - new_model = keras.Model.from_config(serialized_model_data) - new_output_dataset = new_model.predict(input_array) - self.assertAllEqual(expected_output, new_output_dataset) + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def tearDown(self): + keras.backend.clear_session() + gc.collect() + super(TextVectorizationSavingTest, self).tearDown() + + @parameterized.parameters( + {"init_vocab": True}, + {"init_vocab": False}, + ) + def test_saving(self, init_vocab): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + vocabulary = vocab_data if init_vocab else None + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + vocabulary=vocabulary, + ) + if not init_vocab: + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is + # generated from scratch. + keras.backend.clear_session() + + loaded_model = keras.models.load_model(output_path) + self.assertAllEqual(loaded_model.predict(input_array), expected_output) + + @parameterized.parameters( + {"init_vocab": True}, + {"init_vocab": False}, + ) + def test_saving_when_nested(self, init_vocab): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array( + [ + ["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"], + ] + ) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=tf.string) + vocabulary = vocab_data if init_vocab else None + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + vocabulary=vocabulary, + ) + if not init_vocab: + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + outer_input = keras.Input(shape=(None,), dtype=tf.string) + outer_output = model(outer_input) + outer_model = keras.Model(inputs=outer_input, outputs=outer_output) + + # Save the model to disk. 
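(For context on what these saving tests guard: the layer's vocabulary is lookup-table state rather than a trainable weight, yet it must survive a SavedModel round trip. A minimal sketch, with an illustrative path:)

    import tensorflow as tf

    layer = tf.keras.layers.TextVectorization(
        vocabulary=["earth", "wind", "and", "fire"]
    )
    inputs = tf.keras.Input(shape=(1,), dtype=tf.string)
    model = tf.keras.Model(inputs, layer(inputs))
    model.save("/tmp/tv_saved_model", save_format="tf")

    # The reloaded layer should serve the same vocabulary without adapt().
    reloaded = tf.keras.models.load_model("/tmp/tv_saved_model")
    print(reloaded.layers[1].get_vocabulary())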
+ output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
+ outer_model.save(output_path, save_format="tf")
+
+ # Delete the session and graph to ensure that the loaded model is
+ # generated from scratch.
+ keras.backend.clear_session()
+
+ loaded_model = keras.models.load_model(output_path)
+ self.assertAllEqual(loaded_model.predict(input_array), expected_output)
+
+ def test_saving_when_adapted(self):
+ adapt_data = [
+ "earth",
+ "earth",
+ "earth",
+ "earth",
+ "wind",
+ "wind",
+ "wind",
+ "and",
+ "and",
+ "fire",
+ ]
+ input_array = np.array(
+ [
+ ["earth", "wind", "and", "fire"],
+ ["fire", "and", "earth", "michigan"],
+ ]
+ )
+ expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
+
+ # Build and validate a golden model.
+ input_data = keras.Input(shape=(None,), dtype=tf.string)
+ layer = text_vectorization.TextVectorization(
+ max_tokens=None,
+ standardize=None,
+ split=None,
+ output_mode=text_vectorization.INT,
+ )
+ layer.adapt(adapt_data)
+ int_data = layer(input_data)
+ model = keras.Model(inputs=input_data, outputs=int_data)
+
+ # Save the model to disk.
+ output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
+
+ model.save(output_path, save_format="tf")
+
+ # Delete the session and graph to ensure that the loaded model is
+ # generated from scratch.
+ keras.backend.clear_session()
+
+ loaded_model = keras.models.load_model(output_path)
+ self.assertAllEqual(loaded_model.predict(input_array), expected_output)
+
+ def test_saving_with_tfidf(self):
+ vocab_data = ["earth", "wind", "and", "fire"]
+ # OOV idf weight (bucket 0) should be 0.5, the average of the passed
+ # weights.
+ idf_weights = [0.4, 0.25, 0.75, 0.6]
+ input_array = np.array(
+ [
+ ["earth", "wind", "and", "earth"],
+ ["ohio", "fire", "earth", "michigan"],
+ ]
+ )
+
+ # pyformat: disable
+
+ expected_output = [[0, 0.8, 0.25, 0.75, 0], [1, 0.4, 0, 0, 0.6]]
+ vocab_data = ["earth", "wind", "and", "fire"]
+
+ # pyformat: enable
+
+ # Build and validate a golden model.
+ input_data = keras.Input(shape=(None,), dtype=tf.string)
+ layer = text_vectorization.TextVectorization(
+ max_tokens=5,
+ standardize=None,
+ split=None,
+ output_mode=text_vectorization.TF_IDF,
+ )
+ layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
+
+ int_data = layer(input_data)
+ model = keras.Model(inputs=input_data, outputs=int_data)
+ output_dataset = model.predict(input_array)
+ self.assertAllClose(output_dataset, expected_output)
+
+ # Save the model to disk.
+ output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
+ model.save(output_path, save_format="tf")
+ loaded_model = keras.models.load_model(output_path)
+
+ # Ensure that the loaded model is unique (so that the save/load is real).
+ self.assertIsNot(model, loaded_model)
+
+ # Validate correctness of the new model.
+ new_output_dataset = loaded_model.predict(input_array) + self.assertAllClose(new_output_dataset, expected_output) + + def test_serialization_with_custom_callables(self): + input_array = np.array( + [["earth>wind>and Fire"], ["\tfire>And\nearth>michigan"]] + ) + expected_output = [ + [b"earth", b"wind", b"and fire"], + [b"\tfire", b"and\nearth", b"michigan"], + ] + + input_data = keras.Input(shape=(1,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=None, + standardize=custom_standardize_fn, + split=custom_split_fn, + ngrams=None, + output_mode=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + serialized_model_data = model.get_config() + new_model = keras.Model.from_config(serialized_model_data) + new_output_dataset = new_model.predict(input_array) + self.assertAllEqual(expected_output, new_output_dataset) + + @test_utils.run_v2_only() + def test_saving_v3(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array(["earth, wind, and fire"]) + + # First, with a static vocabulary. + input_data = keras.Input(shape=(), dtype=tf.string) + layer = text_vectorization.TextVectorization(vocabulary=vocab_data) + output = layer(input_data) + model = keras.Model(inputs=input_data, outputs=output) + ref_output = model.predict(input_array) + temp_dir = self.get_temp_dir() + model_path = os.path.join(temp_dir, "mymodel.keras") + model.save(model_path, save_format="keras_v3") + model = keras.models.load_model(model_path) + output = model.predict(input_array) + self.assertAllEqual(output, ref_output) + + # Second, with adapt(). + input_data = keras.Input(shape=(), dtype=tf.string) + layer = text_vectorization.TextVectorization() + layer.adapt(vocab_data) + output = layer(input_data) + model = keras.Model(inputs=input_data, outputs=output) + ref_output = model.predict(input_array) + model.save(model_path, save_format="keras_v3", overwrite=True) + model = keras.models.load_model(model_path) + output = model.predict(input_array) + self.assertAllEqual(output, ref_output) + + # Test TF-IDF + adapt(). 
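(The registered custom callables used in the serialization test above are what make that config round trip possible; a condensed sketch of the mechanism with illustrative names, before the TF-IDF case continues below:)

    import tensorflow as tf
    from tensorflow import keras

    @keras.utils.register_keras_serializable(package="Demo")
    def lower_standardize(x):
        return tf.strings.lower(x)

    @keras.utils.register_keras_serializable(package="Demo")
    def gt_split(x):
        return tf.strings.split(x, sep=">")

    layer = keras.layers.TextVectorization(
        standardize=lower_standardize, split=gt_split
    )
    # get_config() records the callables under their registered names
    # ("Demo>lower_standardize"), so from_config() can resolve them
    # without a custom_objects dict.
    restored = keras.layers.TextVectorization.from_config(layer.get_config())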
+ input_data = keras.Input(shape=(), dtype=tf.string) + layer = text_vectorization.TextVectorization(output_mode="tf_idf") + layer.adapt(vocab_data) + output = layer(input_data) + model = keras.Model(inputs=input_data, outputs=output) + ref_output = model.predict(input_array) + model.save(model_path, save_format="keras_v3", overwrite=True) + model = keras.models.load_model(model_path) + output = model.predict(input_array) + self.assertAllEqual(output, ref_output) @test_utils.run_v2_only @test_combinations.run_all_keras_modes(always_skip_v1=True) -class TextVectorizationE2ETest(test_combinations.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_keras_vocab_trimming_example(self): - vocab_data = np.array([ - "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", - "and", "fire" - ]) - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "and", "earth", "michigan"]]) - - # pyformat: disable - expected_output = [[1, 2, 1], - [3, 1, 0]] - # pyformat: enable - max_tokens = 3 - expected_output_shape = [None, max_tokens] - - input_data = keras.Input(shape=(None,), dtype=tf.string) - layer = text_vectorization.TextVectorization( - max_tokens=max_tokens, - standardize=None, - split=None, - output_mode=text_vectorization.COUNT, - pad_to_max_tokens=True) - int_data = layer(input_data) - layer.adapt(vocab_data) - self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) - model = keras.Model(input_data, int_data) - output = model.predict(input_array) - self.assertAllEqual(expected_output, output) +class TextVectorizationE2ETest( + test_combinations.TestCase, preprocessing_test_utils.PreprocessingLayerTest +): + def test_keras_vocab_trimming_example(self): + vocab_data = np.array( + [ + "earth", + "earth", + "earth", + "earth", + "wind", + "wind", + "wind", + "and", + "and", + "fire", + ] + ) + input_array = np.array( + [ + ["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"], + ] + ) + + # pyformat: disable + expected_output = [[1, 2, 1], [3, 1, 0]] + # pyformat: enable + max_tokens = 3 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=tf.string) + layer = text_vectorization.TextVectorization( + max_tokens=max_tokens, + standardize=None, + split=None, + output_mode=text_vectorization.COUNT, + pad_to_max_tokens=True, + ) + int_data = layer(input_data) + layer.adapt(vocab_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + model = keras.Model(input_data, int_data) + output = model.predict(input_array) + self.assertAllEqual(expected_output, output) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/regularization/BUILD b/keras/layers/regularization/BUILD index c49cb80ed4b7..ac9a829414ae 100644 --- a/keras/layers/regularization/BUILD +++ b/keras/layers/regularization/BUILD @@ -1,15 +1,17 @@ # Description: # Contains the Keras regularization layers. 
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/py/tensorflow_gnn:__subpackages__", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", ], diff --git a/keras/layers/regularization/__init__.py b/keras/layers/regularization/__init__.py index 8718c8985ace..60e910e8ef62 100644 --- a/keras/layers/regularization/__init__.py +++ b/keras/layers/regularization/__init__.py @@ -13,14 +13,15 @@ # limitations under the License. # ============================================================================== """Keras regularization layers.""" -# pylint: disable=g-bad-import-order + +from keras.layers.regularization.activity_regularization import ( + ActivityRegularization, +) +from keras.layers.regularization.alpha_dropout import AlphaDropout from keras.layers.regularization.dropout import Dropout +from keras.layers.regularization.gaussian_dropout import GaussianDropout +from keras.layers.regularization.gaussian_noise import GaussianNoise from keras.layers.regularization.spatial_dropout1d import SpatialDropout1D from keras.layers.regularization.spatial_dropout2d import SpatialDropout2D from keras.layers.regularization.spatial_dropout3d import SpatialDropout3D -from keras.layers.regularization.gaussian_dropout import GaussianDropout -from keras.layers.regularization.gaussian_noise import GaussianNoise -from keras.layers.regularization.activity_regularization import ActivityRegularization -from keras.layers.regularization.alpha_dropout import AlphaDropout - diff --git a/keras/layers/regularization/activity_regularization.py b/keras/layers/regularization/activity_regularization.py index 520b526e4978..977b7d24e56c 100644 --- a/keras/layers/regularization/activity_regularization.py +++ b/keras/layers/regularization/activity_regularization.py @@ -13,41 +13,44 @@ # limitations under the License. # ============================================================================== """Contains the ActivityRegularization layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import regularizers from keras.engine.base_layer import Layer + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ActivityRegularization') +@keras_export("keras.layers.ActivityRegularization") class ActivityRegularization(Layer): - """Layer that applies an update to the cost function based input activity. - - Args: - l1: L1 regularization factor (positive float). - l2: L2 regularization factor (positive float). - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. - - Output shape: - Same shape as input. 
- """ - - def __init__(self, l1=0., l2=0., **kwargs): - super().__init__( - activity_regularizer=regularizers.L1L2(l1=l1, l2=l2), **kwargs) - self.supports_masking = True - self.l1 = l1 - self.l2 = l2 - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = {'l1': self.l1, 'l2': self.l2} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Layer that applies an update to the cost function based input activity. + + Args: + l1: L1 regularization factor (positive float). + l2: L2 regularization factor (positive float). + + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + + Output shape: + Same shape as input. + """ + + def __init__(self, l1=0.0, l2=0.0, **kwargs): + super().__init__( + activity_regularizer=regularizers.L1L2(l1=l1, l2=l2), **kwargs + ) + self.supports_masking = True + self.l1 = l1 + self.l2 = l2 + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = {"l1": self.l1, "l2": self.l2} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/regularization/activity_regularization_test.py b/keras/layers/regularization/activity_regularization_test.py index 47475ff70b57..a98d57cc0382 100644 --- a/keras/layers/regularization/activity_regularization_test.py +++ b/keras/layers/regularization/activity_regularization_test.py @@ -14,21 +14,22 @@ # ============================================================================== """Tests for activity regularization layer.""" -import keras -from keras.testing_infra import test_combinations import numpy as np import tensorflow.compat.v2 as tf +import keras +from keras.testing_infra import test_combinations + @test_combinations.run_all_keras_modes class ActivityRegularizationTest(test_combinations.TestCase): + def test_activity_regularization(self): + layer = keras.layers.ActivityRegularization(l1=0.1) + layer(keras.backend.variable(np.ones((2, 4)))) + self.assertEqual(1, len(layer.losses)) + config = layer.get_config() + self.assertEqual(config.pop("l1"), 0.1) - def test_activity_regularization(self): - layer = keras.layers.ActivityRegularization(l1=0.1) - layer(keras.backend.variable(np.ones((2, 4)))) - self.assertEqual(1, len(layer.losses)) - config = layer.get_config() - self.assertEqual(config.pop('l1'), 0.1) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/regularization/alpha_dropout.py b/keras/layers/regularization/alpha_dropout.py index f9d5287b5e6a..5c00ab347243 100644 --- a/keras/layers/regularization/alpha_dropout.py +++ b/keras/layers/regularization/alpha_dropout.py @@ -13,88 +13,92 @@ # limitations under the License. # ============================================================================== """Contains the AlphaDropout layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_layer from keras.utils import tf_utils -import tensorflow.compat.v2 as tf - +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.AlphaDropout') +@keras_export("keras.layers.AlphaDropout") class AlphaDropout(base_layer.BaseRandomLayer): - """Applies Alpha Dropout to the input. 
-
- Alpha Dropout is a `Dropout` that keeps mean and variance of inputs
- to their original values, in order to ensure the self-normalizing property
- even after this dropout.
- Alpha Dropout fits well to Scaled Exponential Linear Units
- by randomly setting activations to the negative saturation value.
-
- Args:
- rate: float, drop probability (as with `Dropout`).
- The multiplicative noise will have
- standard deviation `sqrt(rate / (1 - rate))`.
- seed: Integer, optional random seed to enable deterministic behavior.
-
- Call arguments:
- inputs: Input tensor (of any rank).
- training: Python boolean indicating whether the layer should behave in
- training mode (adding dropout) or in inference mode (doing nothing).
-
- Input shape:
- Arbitrary. Use the keyword argument `input_shape`
- (tuple of integers, does not include the samples axis)
- when using this layer as the first layer in a model.
-
- Output shape:
- Same shape as input.
- """
-
- def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
- super().__init__(seed=seed, **kwargs)
- self.rate = rate
- self.noise_shape = noise_shape
- self.seed = seed
- self.supports_masking = True
-
- def _get_noise_shape(self, inputs):
- return self.noise_shape if self.noise_shape else tf.shape(inputs)
-
- def call(self, inputs, training=None):
- if 0. < self.rate < 1.:
- noise_shape = self._get_noise_shape(inputs)
-
- def dropped_inputs(inputs=inputs, rate=self.rate): # pylint: disable=missing-docstring
- alpha = 1.6732632423543772848170429916717
- scale = 1.0507009873554804934193349852946
- alpha_p = -alpha * scale
-
- kept_idx = tf.greater_equal(
- self._random_generator.random_uniform(noise_shape), rate)
- kept_idx = tf.cast(kept_idx, inputs.dtype)
-
- # Get affine transformation params
- a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
- b = -a * alpha_p * rate
-
- # Apply mask
- x = inputs * kept_idx + alpha_p * (1 - kept_idx)
-
- # Do affine transformation
- return a * x + b
-
- return backend.in_train_phase(dropped_inputs, inputs, training=training)
- return inputs
-
- def get_config(self):
- config = {'rate': self.rate, 'seed': self.seed}
- base_config = super().get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- @tf_utils.shape_type_conversion
- def compute_output_shape(self, input_shape):
- return input_shape
+ """Applies Alpha Dropout to the input.
+
+ Alpha Dropout is a `Dropout` that keeps the mean and variance of its
+ inputs at their original values, in order to preserve the
+ self-normalizing property even after this dropout.
+ Alpha Dropout fits well with Scaled Exponential Linear Units
+ by randomly setting activations to the negative saturation value.
+
+ Args:
+ rate: float, drop probability (as with `Dropout`).
+ The multiplicative noise will have
+ standard deviation `sqrt(rate / (1 - rate))`.
+ seed: Integer, optional random seed to enable deterministic behavior.
+
+ Call arguments:
+ inputs: Input tensor (of any rank).
+ training: Python boolean indicating whether the layer should behave in
+ training mode (adding dropout) or in inference mode (doing nothing).
+
+ Input shape:
+ Arbitrary. Use the keyword argument `input_shape`
+ (tuple of integers, does not include the samples axis)
+ when using this layer as the first layer in a model.
+
+ Output shape:
+ Same shape as input.
+ """ + + def __init__(self, rate, noise_shape=None, seed=None, **kwargs): + super().__init__(seed=seed, **kwargs) + self.rate = rate + self.noise_shape = noise_shape + self.seed = seed + self.supports_masking = True + + def _get_noise_shape(self, inputs): + return self.noise_shape if self.noise_shape else tf.shape(inputs) + + def call(self, inputs, training=None): + if 0.0 < self.rate < 1.0: + noise_shape = self._get_noise_shape(inputs) + + def dropped_inputs(inputs=inputs, rate=self.rate): + alpha = 1.6732632423543772848170429916717 + scale = 1.0507009873554804934193349852946 + alpha_p = -alpha * scale + + kept_idx = tf.greater_equal( + self._random_generator.random_uniform(noise_shape), rate + ) + kept_idx = tf.cast(kept_idx, inputs.dtype) + + # Get affine transformation params + a = ((1 - rate) * (1 + rate * alpha_p**2)) ** -0.5 + b = -a * alpha_p * rate + + # Apply mask + x = inputs * kept_idx + alpha_p * (1 - kept_idx) + + # Do affine transformation + return a * x + b + + return backend.in_train_phase( + dropped_inputs, inputs, training=training + ) + return inputs + + def get_config(self): + config = {"rate": self.rate, "seed": self.seed} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/regularization/alpha_dropout_test.py b/keras/layers/regularization/alpha_dropout_test.py index d7d8c1230062..b466acf4fe86 100644 --- a/keras/layers/regularization/alpha_dropout_test.py +++ b/keras/layers/regularization/alpha_dropout_test.py @@ -14,41 +14,46 @@ # ============================================================================== """Tests for alpha dropout layer.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class AlphaDropoutTest(test_combinations.TestCase): - - def test_AlphaDropout(self): - test_utils.layer_test( - keras.layers.AlphaDropout, kwargs={'rate': 0.2}, input_shape=(3, 2, 3)) - - def _make_model(self, dtype): - assert dtype in (tf.float32, tf.float64) - model = keras.Sequential() - model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) - layer = keras.layers.AlphaDropout(0.5, dtype=dtype) - model.add(layer) - return model - - def _train_model(self, dtype): - model = self._make_model(dtype) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) - - def test_alpha_dropout_float32(self): - self._train_model(tf.float32) - - def test_alpha_dropout_float64(self): - self._train_model(tf.float64) - -if __name__ == '__main__': - tf.test.main() + def test_AlphaDropout(self): + test_utils.layer_test( + keras.layers.AlphaDropout, + kwargs={"rate": 0.2}, + input_shape=(3, 2, 3), + ) + + def _make_model(self, dtype): + assert dtype in (tf.float32, tf.float64) + model = keras.Sequential() + model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) + layer = keras.layers.AlphaDropout(0.5, dtype=dtype) + model.add(layer) + return model + + def _train_model(self, dtype): + model = self._make_model(dtype) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) + + def 
test_alpha_dropout_float32(self): + self._train_model(tf.float32) + + def test_alpha_dropout_float64(self): + self._train_model(tf.float64) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/regularization/dropout.py b/keras/layers/regularization/dropout.py index dbfa82d6fecd..17374afcdf3b 100644 --- a/keras/layers/regularization/dropout.py +++ b/keras/layers/regularization/dropout.py @@ -13,113 +13,123 @@ # limitations under the License. # ============================================================================== """Contains the Dropout layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + +import numbers + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_layer from keras.utils import control_flow_util -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Dropout') +@keras_export("keras.layers.Dropout") class Dropout(base_layer.BaseRandomLayer): - """Applies Dropout to the input. - - The Dropout layer randomly sets input units to 0 with a frequency of `rate` - at each step during training time, which helps prevent overfitting. - Inputs not set to 0 are scaled up by 1/(1 - rate) such that the sum over - all inputs is unchanged. - - Note that the Dropout layer only applies when `training` is set to True - such that no values are dropped during inference. When using `model.fit`, - `training` will be appropriately set to True automatically, and in other - contexts, you can set the kwarg explicitly to True when calling the layer. - - (This is in contrast to setting `trainable=False` for a Dropout layer. - `trainable` does not affect the layer's behavior, as Dropout does - not have any variables/weights that can be frozen during training.) - - >>> tf.random.set_seed(0) - >>> layer = tf.keras.layers.Dropout(.2, input_shape=(2,)) - >>> data = np.arange(10).reshape(5, 2).astype(np.float32) - >>> print(data) - [[0. 1.] - [2. 3.] - [4. 5.] - [6. 7.] - [8. 9.]] - >>> outputs = layer(data, training=True) - >>> print(outputs) - tf.Tensor( - [[ 0. 1.25] - [ 2.5 3.75] - [ 5. 6.25] - [ 7.5 8.75] - [10. 0. ]], shape=(5, 2), dtype=float32) - - Args: - rate: Float between 0 and 1. Fraction of the input units to drop. - noise_shape: 1D integer tensor representing the shape of the - binary dropout mask that will be multiplied with the input. - For instance, if your inputs have shape - `(batch_size, timesteps, features)` and - you want the dropout mask to be the same for all timesteps, - you can use `noise_shape=(batch_size, 1, features)`. - seed: A Python integer to use as random seed. - - Call arguments: - inputs: Input tensor (of any rank). - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (doing nothing). 
- """ - - def __init__(self, rate, noise_shape=None, seed=None, **kwargs): - super().__init__(seed=seed, **kwargs) - if isinstance(rate, (int, float)) and not 0 <= rate <= 1: - raise ValueError(f'Invalid value {rate} received for ' - f'`rate`, expected a value between 0 and 1.') - self.rate = rate - self.noise_shape = noise_shape - self.seed = seed - self.supports_masking = True - - def build(self, input_shape): - self._random_generator._maybe_init() # pylint: disable=protected-access - - def _get_noise_shape(self, inputs): - # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`, - # which will override `self.noise_shape`, and allows for custom noise - # shapes with dynamically sized inputs. - if self.noise_shape is None: - return None - - concrete_inputs_shape = tf.shape(inputs) - noise_shape = [] - for i, value in enumerate(self.noise_shape): - noise_shape.append(concrete_inputs_shape[i] if value is None else value) - return tf.convert_to_tensor(noise_shape) - - def call(self, inputs, training=None): - if training is None: - training = backend.learning_phase() - - def dropped_inputs(): - return self._random_generator.dropout( - inputs, self.rate, noise_shape=self._get_noise_shape(inputs)) - - output = control_flow_util.smart_cond(training, dropped_inputs, - lambda: tf.identity(inputs)) - return output - - def compute_output_shape(self, input_shape): - return input_shape - - def get_config(self): - config = { - 'rate': self.rate, - 'noise_shape': self.noise_shape, - 'seed': self.seed - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Applies Dropout to the input. + + The Dropout layer randomly sets input units to 0 with a frequency of `rate` + at each step during training time, which helps prevent overfitting. + Inputs not set to 0 are scaled up by 1/(1 - rate) such that the sum over + all inputs is unchanged. + + Note that the Dropout layer only applies when `training` is set to True + such that no values are dropped during inference. When using `model.fit`, + `training` will be appropriately set to True automatically, and in other + contexts, you can set the kwarg explicitly to True when calling the layer. + + (This is in contrast to setting `trainable=False` for a Dropout layer. + `trainable` does not affect the layer's behavior, as Dropout does + not have any variables/weights that can be frozen during training.) + + >>> tf.random.set_seed(0) + >>> layer = tf.keras.layers.Dropout(.2, input_shape=(2,)) + >>> data = np.arange(10).reshape(5, 2).astype(np.float32) + >>> print(data) + [[0. 1.] + [2. 3.] + [4. 5.] + [6. 7.] + [8. 9.]] + >>> outputs = layer(data, training=True) + >>> print(outputs) + tf.Tensor( + [[ 0. 1.25] + [ 2.5 3.75] + [ 5. 6.25] + [ 7.5 8.75] + [10. 0. ]], shape=(5, 2), dtype=float32) + + Args: + rate: Float between 0 and 1. Fraction of the input units to drop. + noise_shape: 1D integer tensor representing the shape of the + binary dropout mask that will be multiplied with the input. + For instance, if your inputs have shape + `(batch_size, timesteps, features)` and + you want the dropout mask to be the same for all timesteps, + you can use `noise_shape=(batch_size, 1, features)`. + seed: A Python integer to use as random seed. + + Call arguments: + inputs: Input tensor (of any rank). + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). 
+ """ + + def __init__(self, rate, noise_shape=None, seed=None, **kwargs): + super().__init__(seed=seed, **kwargs) + if isinstance(rate, (int, float)) and not 0 <= rate <= 1: + raise ValueError( + f"Invalid value {rate} received for " + "`rate`, expected a value between 0 and 1." + ) + self.rate = rate + self.noise_shape = noise_shape + self.seed = seed + self.supports_masking = True + + def _get_noise_shape(self, inputs): + # Subclasses of `Dropout` may implement `_get_noise_shape(self, + # inputs)`, which will override `self.noise_shape`, and allows for + # custom noise shapes with dynamically sized inputs. + if self.noise_shape is None: + return None + + concrete_inputs_shape = tf.shape(inputs) + noise_shape = [] + for i, value in enumerate(self.noise_shape): + noise_shape.append( + concrete_inputs_shape[i] if value is None else value + ) + return tf.convert_to_tensor(noise_shape) + + def call(self, inputs, training=None): + if isinstance(self.rate, numbers.Real) and self.rate == 0: + return tf.identity(inputs) + + if training is None: + training = backend.learning_phase() + + def dropped_inputs(): + return self._random_generator.dropout( + inputs, self.rate, noise_shape=self._get_noise_shape(inputs) + ) + + output = control_flow_util.smart_cond( + training, dropped_inputs, lambda: tf.identity(inputs) + ) + return output + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = { + "rate": self.rate, + "noise_shape": self.noise_shape, + "seed": self.seed, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/regularization/dropout_test.py b/keras/layers/regularization/dropout_test.py index 19fdb1e50ab4..2239338b8af4 100644 --- a/keras/layers/regularization/dropout_test.py +++ b/keras/layers/regularization/dropout_test.py @@ -16,78 +16,148 @@ import os +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class DropoutTest(test_combinations.TestCase): + def test_dropout(self): + test_utils.layer_test( + keras.layers.Dropout, kwargs={"rate": 0.5}, input_shape=(3, 2) + ) + + test_utils.layer_test( + keras.layers.Dropout, + kwargs={"rate": 0.5, "noise_shape": [3, 1]}, + input_shape=(3, 2), + ) + + def test_dropout_supports_masking(self): + dropout = keras.layers.Dropout(0.5) + self.assertEqual(True, dropout.supports_masking) + + def test_dropout_partial_noise_shape(self): + inputs = keras.Input(shape=(5, 10)) + layer = keras.layers.Dropout(0.5, noise_shape=(None, 1, None)) + outputs = layer(inputs) + model = keras.Model(inputs, outputs) + out = model(np.ones((20, 5, 10)), training=True) + out_np = keras.backend.get_value(out) + # Test that dropout mask is shared across second dim. + self.assertAllClose(out_np[:, 0, :], out_np[:, 1, :]) + + @test_utils.run_v2_only + def test_dropout_with_zero_rate(self): + inputs = np.ones((20, 5, 10)) + dropout = keras.layers.Dropout(0.0, force_generator=True) + dropout.build((20, 5, 10)) + # Make sure we don't use the RNG when the dropout rate is 0 + # (for performance). 
+ rng_state_var = tf.constant(
+ dropout._random_generator._generator._state_var
+ )
+ output = dropout(inputs, training=True)
+ self.assertAllClose(inputs, output)
+ self.assertAllClose(
+ rng_state_var, dropout._random_generator._generator._state_var
+ )
+
+ def test_dropout_with_saving(self):
+ inputs = keras.Input(shape=(5, 10))
+ layer = keras.layers.Dropout(0.5, force_generator=True)
+ outputs = layer(inputs)
+ model = keras.Model(inputs, outputs)
+ train = model(np.ones((20, 5, 10)), training=True)
+ predict = model(np.ones((20, 5, 10)))
+ # Make sure the weights from tf.random.Generator are not present in
+ # the model, which would cause weight-loading issues for existing
+ # application models if they contain a dropout layer.
+ self.assertEmpty(layer.get_weights())
+ self.assertEmpty(model.get_weights())
+
+ # Make sure the layer actually applies dropout when training
+ self.assertNotAllClose(train, predict)
+
+ with self.subTest("savedmodel"):
+ model.save(
+ os.path.join(self.get_temp_dir(), "savedmodel"),
+ save_format="tf",
+ )
+ loaded_model = keras.models.load_model(
+ os.path.join(self.get_temp_dir(), "savedmodel")
+ )
+ predict2 = loaded_model(np.ones((20, 5, 10)))
+
+ self.assertAllClose(predict, predict2)
+ # Make sure the model produces different dropout values after loading
+ train2 = loaded_model(np.ones((20, 5, 10)), training=True)
+ self.assertNotAllClose(train, train2)
+ self.assertIsNotNone(loaded_model.layers[1]._random_generator)
+
+ with self.subTest("keras_v3"):
+ if not tf.__internal__.tf2.enabled():
+ self.skipTest(
+ "TF2 must be enabled to use the new `.keras` saving."
+ )
+ model.save(
+ os.path.join(self.get_temp_dir(), "model.keras"),
+ save_format="keras_v3",
+ )
+ loaded_model = keras.models.load_model(
+ os.path.join(self.get_temp_dir(), "model.keras")
+ )
+ predict2 = loaded_model(np.ones((20, 5, 10)))
+
+ self.assertAllClose(predict, predict2)
+ # Make sure the model produces different dropout values after loading
+ train2 = loaded_model(np.ones((20, 5, 10)), training=True)
+ self.assertNotAllClose(train, train2)
+ self.assertIsNotNone(loaded_model.layers[1]._random_generator)
+
+ with self.subTest("checkpoint"):
+ # Also make sure the checkpoint doesn't contain any variable from
+ # the dropout layer, to preserve backward compatibility.
+ checkpoint = tf.train.Checkpoint(model)
+ save_path = checkpoint.save(
+ os.path.join(self.get_temp_dir(), "checkpoint")
+ )
+ checkpoint_var_names = [
+ name_value_tuple[0]
+ for name_value_tuple in tf.train.list_variables(save_path)
+ ]
+ for name in checkpoint_var_names:
+ self.assertNotIn("dropout", name)
+
+ # Make sure the checkpoint can be loaded
+ clone_model = keras.models.clone_model(model)
+ checkpoint = tf.train.Checkpoint(clone_model)
+ status = checkpoint.restore(
+ os.path.join(self.get_temp_dir(), "checkpoint-1")
+ )
+ self.assertTrue(status.assert_consumed())
+ self.assertTrue(status.assert_existing_objects_matched())
+ # Make sure the output is different from the original model, since
+ # the StateVar is not preserved.
+ train3 = clone_model(np.ones((20, 5, 10)), training=True) + self.assertNotAllClose(train3, train2) + + @test_utils.run_v2_only + def test_state_variable_name(self): + inputs = keras.Input(shape=(5, 10)) + layer = keras.layers.Dropout( + 0.5, force_generator=True, name="dropout_layer" + ) + layer(inputs) + self.assertEqual( + layer._random_generator._generator._state_var.name, + "dropout_layer/StateVar:0", + ) + - def test_dropout(self): - test_utils.layer_test( - keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2)) - - test_utils.layer_test( - keras.layers.Dropout, - kwargs={ - 'rate': 0.5, - 'noise_shape': [3, 1] - }, - input_shape=(3, 2)) - - def test_dropout_supports_masking(self): - dropout = keras.layers.Dropout(0.5) - self.assertEqual(True, dropout.supports_masking) - - def test_dropout_partial_noise_shape(self): - inputs = keras.Input(shape=(5, 10)) - layer = keras.layers.Dropout(0.5, noise_shape=(None, 1, None)) - outputs = layer(inputs) - model = keras.Model(inputs, outputs) - out = model(np.ones((20, 5, 10)), training=True) - out_np = keras.backend.get_value(out) - # Test that dropout mask is shared across second dim. - self.assertAllClose(out_np[:, 0, :], out_np[:, 1, :]) - - def test_dropout_with_savemodel(self): - inputs = keras.Input(shape=(5, 10)) - layer = keras.layers.Dropout(0.5, force_generator=True) - outputs = layer(inputs) - model = keras.Model(inputs, outputs) - train = model(np.ones((20, 5, 10)), training=True) - predict = model(np.ones((20, 5, 10))) - # Make sure the weights from tf.random.Generator is not present in the model - # which will cause weight loading issue for existing application models if - # it contains dropout layer. - self.assertEmpty(layer.get_weights()) - self.assertEmpty(model.get_weights()) - - # Make sure the layer does dropout value when training - self.assertNotAllClose(train, predict) - - model.save(os.path.join(self.get_temp_dir(), 'savedmodel'), - save_format='tf') - loaded_model = keras.models.load_model( - os.path.join(self.get_temp_dir(), 'savedmodel')) - predict2 = loaded_model(np.ones((20, 5, 10))) - - self.assertAllClose(predict, predict2) - # Make sure the model dropout different value after loading - train2 = loaded_model(np.ones((20, 5, 10)), training=True) - self.assertNotAllClose(train, train2) - self.assertIsNotNone(loaded_model.layers[1]._random_generator) - - # Also make sure the checkpoint doesn't contain any variable from the - # dropout layer, to keep the backward compatibility. - checkpoint = tf.train.Checkpoint(model) - save_path = checkpoint.save(os.path.join(self.get_temp_dir(), 'checkpoint')) - checkpoint_var_names = [name_value_tuple[0] for name_value_tuple in - tf.train.list_variables(save_path)] - for name in checkpoint_var_names: - self.assertNotIn('dropout', name) - -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/regularization/gaussian_dropout.py b/keras/layers/regularization/gaussian_dropout.py index 1ff92e8923c0..9e9d442bbe87 100644 --- a/keras/layers/regularization/gaussian_dropout.py +++ b/keras/layers/regularization/gaussian_dropout.py @@ -13,69 +13,71 @@ # limitations under the License. 
# ============================================================================== """Contains the GaussianDropout layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import -from keras import backend -from keras.engine import base_layer -from keras.utils import tf_utils import numpy as np import tensorflow.compat.v2 as tf +from keras import backend +from keras.engine import base_layer +from keras.utils import tf_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.GaussianDropout') +@keras_export("keras.layers.GaussianDropout") class GaussianDropout(base_layer.BaseRandomLayer): - """Apply multiplicative 1-centered Gaussian noise. - - As it is a regularization layer, it is only active at training time. - - Args: - rate: Float, drop probability (as with `Dropout`). - The multiplicative noise will have - standard deviation `sqrt(rate / (1 - rate))`. - seed: Integer, optional random seed to enable deterministic behavior. - - Call arguments: - inputs: Input tensor (of any rank). - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (doing nothing). - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. - - Output shape: - Same shape as input. - """ - - def __init__(self, rate, seed=None, **kwargs): - super().__init__(seed=seed, **kwargs) - self.supports_masking = True - self.rate = rate - self.seed = seed - - def call(self, inputs, training=None): - if 0 < self.rate < 1: - - def noised(): - stddev = np.sqrt(self.rate / (1.0 - self.rate)) - return inputs * self._random_generator.random_normal( - shape=tf.shape(inputs), - mean=1.0, - stddev=stddev, - dtype=inputs.dtype) - - return backend.in_train_phase(noised, inputs, training=training) - return inputs - - def get_config(self): - config = {'rate': self.rate, 'seed': self.seed} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - return input_shape + """Apply multiplicative 1-centered Gaussian noise. + + As it is a regularization layer, it is only active at training time. + + Args: + rate: Float, drop probability (as with `Dropout`). + The multiplicative noise will have + standard deviation `sqrt(rate / (1 - rate))`. + seed: Integer, optional random seed to enable deterministic behavior. + + Call arguments: + inputs: Input tensor (of any rank). + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + + Input shape: + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + + Output shape: + Same shape as input. 
+ """ + + def __init__(self, rate, seed=None, **kwargs): + super().__init__(seed=seed, **kwargs) + self.supports_masking = True + self.rate = rate + self.seed = seed + + def call(self, inputs, training=None): + if 0 < self.rate < 1: + + def noised(): + stddev = np.sqrt(self.rate / (1.0 - self.rate)) + return inputs * self._random_generator.random_normal( + shape=tf.shape(inputs), + mean=1.0, + stddev=stddev, + dtype=inputs.dtype, + ) + + return backend.in_train_phase(noised, inputs, training=training) + return inputs + + def get_config(self): + config = {"rate": self.rate, "seed": self.seed} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/regularization/gaussian_dropout_test.py b/keras/layers/regularization/gaussian_dropout_test.py index a961c838926a..b50d348e2548 100644 --- a/keras/layers/regularization/gaussian_dropout_test.py +++ b/keras/layers/regularization/gaussian_dropout_test.py @@ -14,43 +14,46 @@ # ============================================================================== """Tests for gaussian dropout layer.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class NoiseLayersTest(test_combinations.TestCase): - - def test_GaussianDropout(self): - test_utils.layer_test( - keras.layers.GaussianDropout, - kwargs={'rate': 0.5}, - input_shape=(3, 2, 3)) - - def _make_model(self, dtype): - assert dtype in (tf.float32, tf.float64) - model = keras.Sequential() - model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) - layer = keras.layers.GaussianDropout(0.1, dtype=dtype) - model.add(layer) - return model - - def _train_model(self, dtype): - model = self._make_model(dtype) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) - - def test_gaussian_dropout_float32(self): - self._train_model(tf.float32) - - def test_gaussian_dropout_float64(self): - self._train_model(tf.float64) - -if __name__ == '__main__': - tf.test.main() + def test_GaussianDropout(self): + test_utils.layer_test( + keras.layers.GaussianDropout, + kwargs={"rate": 0.5}, + input_shape=(3, 2, 3), + ) + + def _make_model(self, dtype): + assert dtype in (tf.float32, tf.float64) + model = keras.Sequential() + model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) + layer = keras.layers.GaussianDropout(0.1, dtype=dtype) + model.add(layer) + return model + + def _train_model(self, dtype): + model = self._make_model(dtype) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) + + def test_gaussian_dropout_float32(self): + self._train_model(tf.float32) + + def test_gaussian_dropout_float64(self): + self._train_model(tf.float64) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/regularization/gaussian_noise.py b/keras/layers/regularization/gaussian_noise.py index 32386ac09e21..f88e3a3c4a2d 100644 --- a/keras/layers/regularization/gaussian_noise.py +++ b/keras/layers/regularization/gaussian_noise.py @@ -13,68 +13,69 @@ # limitations under the License. 
# ==============================================================================
"""Contains the GaussianNoise layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
+
+import tensorflow.compat.v2 as tf

from keras import backend
from keras.engine import base_layer
from keras.utils import tf_utils
-import tensorflow.compat.v2 as tf
-
+# isort: off
from tensorflow.python.util.tf_export import keras_export


-@keras_export('keras.layers.GaussianNoise')
+@keras_export("keras.layers.GaussianNoise")
class GaussianNoise(base_layer.BaseRandomLayer):
-  """Apply additive zero-centered Gaussian noise.
-
-  This is useful to mitigate overfitting
-  (you could see it as a form of random data augmentation).
-  Gaussian Noise (GS) is a natural choice as corruption process
-  for real valued inputs.
-
-  As it is a regularization layer, it is only active at training time.
-
-  Args:
-    stddev: Float, standard deviation of the noise distribution.
-    seed: Integer, optional random seed to enable deterministic behavior.
-
-  Call arguments:
-    inputs: Input tensor (of any rank).
-    training: Python boolean indicating whether the layer should behave in
-      training mode (adding noise) or in inference mode (doing nothing).
-
-  Input shape:
-    Arbitrary. Use the keyword argument `input_shape`
-    (tuple of integers, does not include the samples axis)
-    when using this layer as the first layer in a model.
-
-  Output shape:
-    Same shape as input.
-  """
-
-  def __init__(self, stddev, seed=None, **kwargs):
-    super().__init__(seed=seed, **kwargs)
-    self.supports_masking = True
-    self.stddev = stddev
-    self.seed = seed
-
-  def call(self, inputs, training=None):
-
-    def noised():
-      return inputs + self._random_generator.random_normal(
-          shape=tf.shape(inputs),
-          mean=0.,
-          stddev=self.stddev,
-          dtype=inputs.dtype)
-
-    return backend.in_train_phase(noised, inputs, training=training)
-
-  def get_config(self):
-    config = {'stddev': self.stddev, 'seed': self.seed}
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  @tf_utils.shape_type_conversion
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    """Apply additive zero-centered Gaussian noise.
+
+    This is useful to mitigate overfitting
+    (you could see it as a form of random data augmentation).
+    Gaussian Noise (GN) is a natural choice as a corruption process
+    for real-valued inputs.
+
+    As it is a regularization layer, it is only active at training time.
+
+    Args:
+        stddev: Float, standard deviation of the noise distribution.
+        seed: Integer, optional random seed to enable deterministic behavior.
+
+    Call arguments:
+        inputs: Input tensor (of any rank).
+        training: Python boolean indicating whether the layer should behave in
+            training mode (adding noise) or in inference mode (doing nothing).
+
+    Input shape:
+        Arbitrary. Use the keyword argument `input_shape`
+        (tuple of integers, does not include the samples axis)
+        when using this layer as the first layer in a model.
+
+    Output shape:
+        Same shape as input.
+ """ + + def __init__(self, stddev, seed=None, **kwargs): + super().__init__(seed=seed, **kwargs) + self.supports_masking = True + self.stddev = stddev + self.seed = seed + + def call(self, inputs, training=None): + def noised(): + return inputs + self._random_generator.random_normal( + shape=tf.shape(inputs), + mean=0.0, + stddev=self.stddev, + dtype=inputs.dtype, + ) + + return backend.in_train_phase(noised, inputs, training=training) + + def get_config(self): + config = {"stddev": self.stddev, "seed": self.seed} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + return input_shape diff --git a/keras/layers/regularization/gaussian_noise_test.py b/keras/layers/regularization/gaussian_noise_test.py index 3ac051240cf4..b67084e053f2 100644 --- a/keras/layers/regularization/gaussian_noise_test.py +++ b/keras/layers/regularization/gaussian_noise_test.py @@ -14,43 +14,46 @@ # ============================================================================== """Tests for gaussian noise layer.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class NoiseLayersTest(test_combinations.TestCase): - - def test_GaussianNoise(self): - test_utils.layer_test( - keras.layers.GaussianNoise, - kwargs={'stddev': 1.}, - input_shape=(3, 2, 3)) - - def _make_model(self, dtype): - assert dtype in (tf.float32, tf.float64) - model = keras.Sequential() - model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) - layer = keras.layers.GaussianNoise(0.0003, dtype=dtype) - model.add(layer) - return model - - def _train_model(self, dtype): - model = self._make_model(dtype) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) - - def test_gaussian_noise_float32(self): - self._train_model(tf.float32) - - def test_gaussian_noise_float64(self): - self._train_model(tf.float64) - -if __name__ == '__main__': - tf.test.main() + def test_GaussianNoise(self): + test_utils.layer_test( + keras.layers.GaussianNoise, + kwargs={"stddev": 1.0}, + input_shape=(3, 2, 3), + ) + + def _make_model(self, dtype): + assert dtype in (tf.float32, tf.float64) + model = keras.Sequential() + model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) + layer = keras.layers.GaussianNoise(0.0003, dtype=dtype) + model.add(layer) + return model + + def _train_model(self, dtype): + model = self._make_model(dtype) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) + + def test_gaussian_noise_float32(self): + self._train_model(tf.float32) + + def test_gaussian_noise_float64(self): + self._train_model(tf.float64) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/regularization/spatial_dropout1d.py b/keras/layers/regularization/spatial_dropout1d.py index 29dabc95ac72..7a3672c9d295 100644 --- a/keras/layers/regularization/spatial_dropout1d.py +++ b/keras/layers/regularization/spatial_dropout1d.py @@ -13,45 +13,47 @@ # limitations under the License. 
# ============================================================================== """Contains the SpatialDropout1D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras.engine.input_spec import InputSpec from keras.layers.regularization.dropout import Dropout -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.SpatialDropout1D') +@keras_export("keras.layers.SpatialDropout1D") class SpatialDropout1D(Dropout): - """Spatial 1D version of Dropout. - - This version performs the same function as Dropout, however, it drops - entire 1D feature maps instead of individual elements. If adjacent frames - within feature maps are strongly correlated (as is normally the case in - early convolution layers) then regular dropout will not regularize the - activations and will otherwise just result in an effective learning rate - decrease. In this case, SpatialDropout1D will help promote independence - between feature maps and should be used instead. - - Args: - rate: Float between 0 and 1. Fraction of the input units to drop. - Call arguments: - inputs: A 3D tensor. - training: Python boolean indicating whether the layer should behave in - training mode (adding dropout) or in inference mode (doing nothing). - Input shape: - 3D tensor with shape: `(samples, timesteps, channels)` - Output shape: Same as input. - References: - [Efficient Object Localization Using Convolutional - Networks](https://arxiv.org/abs/1411.4280) - """ - - def __init__(self, rate, **kwargs): - super().__init__(rate, **kwargs) - self.input_spec = InputSpec(ndim=3) - - def _get_noise_shape(self, inputs): - input_shape = tf.shape(inputs) - noise_shape = (input_shape[0], 1, input_shape[2]) - return noise_shape + """Spatial 1D version of Dropout. + + This version performs the same function as Dropout, however, it drops + entire 1D feature maps instead of individual elements. If adjacent frames + within feature maps are strongly correlated (as is normally the case in + early convolution layers) then regular dropout will not regularize the + activations and will otherwise just result in an effective learning rate + decrease. In this case, SpatialDropout1D will help promote independence + between feature maps and should be used instead. + + Args: + rate: Float between 0 and 1. Fraction of the input units to drop. + Call arguments: + inputs: A 3D tensor. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + Input shape: + 3D tensor with shape: `(samples, timesteps, channels)` + Output shape: Same as input. + References: - [Efficient Object Localization Using Convolutional + Networks](https://arxiv.org/abs/1411.4280) + """ + + def __init__(self, rate, **kwargs): + super().__init__(rate, **kwargs) + self.input_spec = InputSpec(ndim=3) + + def _get_noise_shape(self, inputs): + input_shape = tf.shape(inputs) + noise_shape = (input_shape[0], 1, input_shape[2]) + return noise_shape diff --git a/keras/layers/regularization/spatial_dropout2d.py b/keras/layers/regularization/spatial_dropout2d.py index ec6b84806033..e913c132c682 100644 --- a/keras/layers/regularization/spatial_dropout2d.py +++ b/keras/layers/regularization/spatial_dropout2d.py @@ -13,63 +13,67 @@ # limitations under the License. 
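The `_get_noise_shape` override above is the whole mechanism: returning `(batch, 1, channels)` makes the dropout mask broadcast over timesteps, so a dropped channel is zeroed across the entire sequence. An illustrative check, assuming the public `tf.keras` API:

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.SpatialDropout1D(0.5)
out = layer(np.ones((2, 3, 4)), training=True).numpy()
# Each (sample, channel) column is either all zero or all kept (and rescaled).
for sample in out:
    for ch in range(sample.shape[-1]):
        col = sample[:, ch]
        assert (col == 0).all() or (col != 0).all()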
# ==============================================================================
"""Contains the SpatialDropout2D layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
+
+import tensorflow.compat.v2 as tf

from keras import backend
from keras.engine.input_spec import InputSpec
from keras.layers.regularization.dropout import Dropout
-import tensorflow.compat.v2 as tf

+# isort: off
from tensorflow.python.util.tf_export import keras_export


-@keras_export('keras.layers.SpatialDropout2D')
+@keras_export("keras.layers.SpatialDropout2D")
class SpatialDropout2D(Dropout):
-  """Spatial 2D version of Dropout.
+    """Spatial 2D version of Dropout.

-  This version performs the same function as Dropout, however, it drops
-  entire 2D feature maps instead of individual elements. If adjacent pixels
-  within feature maps are strongly correlated (as is normally the case in
-  early convolution layers) then regular dropout will not regularize the
-  activations and will otherwise just result in an effective learning rate
-  decrease. In this case, SpatialDropout2D will help promote independence
-  between feature maps and should be used instead.
+    This version performs the same function as Dropout, however, it drops
+    entire 2D feature maps instead of individual elements. If adjacent pixels
+    within feature maps are strongly correlated (as is normally the case in
+    early convolution layers) then regular dropout will not regularize the
+    activations and will otherwise just result in an effective learning rate
+    decrease. In this case, SpatialDropout2D will help promote independence
+    between feature maps and should be used instead.

-  Args:
-    rate: Float between 0 and 1. Fraction of the input units to drop.
-    data_format: 'channels_first' or 'channels_last'. In 'channels_first' mode,
-      the channels dimension (the depth) is at index 1, in 'channels_last' mode
-      is it at index 3. It defaults to the `image_data_format` value found in
-      your Keras config file at `~/.keras/keras.json`. If you never set it, then
-      it will be "channels_last".
-  Call arguments:
-    inputs: A 4D tensor.
-    training: Python boolean indicating whether the layer should behave in
-      training mode (adding dropout) or in inference mode (doing nothing).
-  Input shape:
-    4D tensor with shape: `(samples, channels, rows, cols)` if
-      data_format='channels_first'
-    or 4D tensor with shape: `(samples, rows, cols, channels)` if
-      data_format='channels_last'.
-  Output shape: Same as input.
-  References: - [Efficient Object Localization Using Convolutional
-      Networks](https://arxiv.org/abs/1411.4280)
-  """
+    Args:
+        rate: Float between 0 and 1. Fraction of the input units to drop.
+        data_format: 'channels_first' or 'channels_last'. In 'channels_first'
+            mode, the channels dimension (the depth) is at index 1, in
+            'channels_last' mode it is at index 3. When unspecified, uses
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json` (if exists) else 'channels_last'.
+            Defaults to 'channels_last'.
+    Call arguments:
+        inputs: A 4D tensor.
+        training: Python boolean indicating whether the layer should behave in
+            training mode (adding dropout) or in inference mode (doing nothing).
+    Input shape:
+        4D tensor with shape: `(samples, channels, rows, cols)` if
+            data_format='channels_first'
+        or 4D tensor with shape: `(samples, rows, cols, channels)` if
+            data_format='channels_last'.
+    Output shape: Same as input.
+ References: - [Efficient Object Localization Using Convolutional + Networks](https://arxiv.org/abs/1411.4280) + """ - def __init__(self, rate, data_format=None, **kwargs): - super().__init__(rate, **kwargs) - if data_format is None: - data_format = backend.image_data_format() - if data_format not in {'channels_last', 'channels_first'}: - raise ValueError( - f'`data_format` must be "channels_last" or "channels_first". ' - f'Received: data_format={data_format}.') - self.data_format = data_format - self.input_spec = InputSpec(ndim=4) + def __init__(self, rate, data_format=None, **kwargs): + super().__init__(rate, **kwargs) + if data_format is None: + data_format = backend.image_data_format() + if data_format not in {"channels_last", "channels_first"}: + raise ValueError( + '`data_format` must be "channels_last" or "channels_first". ' + f"Received: data_format={data_format}." + ) + self.data_format = data_format + self.input_spec = InputSpec(ndim=4) - def _get_noise_shape(self, inputs): - input_shape = tf.shape(inputs) - if self.data_format == 'channels_first': - return (input_shape[0], input_shape[1], 1, 1) - elif self.data_format == 'channels_last': - return (input_shape[0], 1, 1, input_shape[3]) + def _get_noise_shape(self, inputs): + input_shape = tf.shape(inputs) + if self.data_format == "channels_first": + return (input_shape[0], input_shape[1], 1, 1) + elif self.data_format == "channels_last": + return (input_shape[0], 1, 1, input_shape[3]) diff --git a/keras/layers/regularization/spatial_dropout3d.py b/keras/layers/regularization/spatial_dropout3d.py index 792a2c5b703b..d7dff8724e0b 100644 --- a/keras/layers/regularization/spatial_dropout3d.py +++ b/keras/layers/regularization/spatial_dropout3d.py @@ -13,63 +13,67 @@ # limitations under the License. # ============================================================================== """Contains the SpatialDropout3D layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.input_spec import InputSpec from keras.layers.regularization.dropout import Dropout -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.SpatialDropout3D') +@keras_export("keras.layers.SpatialDropout3D") class SpatialDropout3D(Dropout): - """Spatial 3D version of Dropout. + """Spatial 3D version of Dropout. - This version performs the same function as Dropout, however, it drops - entire 3D feature maps instead of individual elements. If adjacent voxels - within feature maps are strongly correlated (as is normally the case in - early convolution layers) then regular dropout will not regularize the - activations and will otherwise just result in an effective learning rate - decrease. In this case, SpatialDropout3D will help promote independence - between feature maps and should be used instead. + This version performs the same function as Dropout, however, it drops + entire 3D feature maps instead of individual elements. If adjacent voxels + within feature maps are strongly correlated (as is normally the case in + early convolution layers) then regular dropout will not regularize the + activations and will otherwise just result in an effective learning rate + decrease. In this case, SpatialDropout3D will help promote independence + between feature maps and should be used instead. - Args: - rate: Float between 0 and 1. Fraction of the input units to drop. 
-    data_format: 'channels_first' or 'channels_last'. In 'channels_first' mode,
-      the channels dimension (the depth) is at index 1, in 'channels_last' mode
-      is it at index 4. It defaults to the `image_data_format` value found in
-      your Keras config file at `~/.keras/keras.json`. If you never set it, then
-      it will be "channels_last".
-  Call arguments:
-    inputs: A 5D tensor.
-    training: Python boolean indicating whether the layer should behave in
-      training mode (adding dropout) or in inference mode (doing nothing).
-  Input shape:
-    5D tensor with shape: `(samples, channels, dim1, dim2, dim3)` if
-      data_format='channels_first'
-    or 5D tensor with shape: `(samples, dim1, dim2, dim3, channels)` if
-      data_format='channels_last'.
-  Output shape: Same as input.
-  References: - [Efficient Object Localization Using Convolutional
-      Networks](https://arxiv.org/abs/1411.4280)
-  """
+    Args:
+        rate: Float between 0 and 1. Fraction of the input units to drop.
+        data_format: 'channels_first' or 'channels_last'. In 'channels_first'
+            mode, the channels dimension (the depth) is at index 1, in
+            'channels_last' mode it is at index 4. When unspecified, uses
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json` (if exists) else 'channels_last'.
+            Defaults to 'channels_last'.
+    Call arguments:
+        inputs: A 5D tensor.
+        training: Python boolean indicating whether the layer should behave in
+            training mode (adding dropout) or in inference mode (doing nothing).
+    Input shape:
+        5D tensor with shape: `(samples, channels, dim1, dim2, dim3)` if
+            data_format='channels_first'
+        or 5D tensor with shape: `(samples, dim1, dim2, dim3, channels)` if
+            data_format='channels_last'.
+    Output shape: Same as input.
+    References: - [Efficient Object Localization Using Convolutional
+        Networks](https://arxiv.org/abs/1411.4280)
+    """

-  def __init__(self, rate, data_format=None, **kwargs):
-    super().__init__(rate, **kwargs)
-    if data_format is None:
-      data_format = backend.image_data_format()
-    if data_format not in {'channels_last', 'channels_first'}:
-      raise ValueError(
-          f'`data_format` must be "channels_last" or "channels_first". '
-          f'Received: data_format={data_format}.')
-    self.data_format = data_format
-    self.input_spec = InputSpec(ndim=5)
+    def __init__(self, rate, data_format=None, **kwargs):
+        super().__init__(rate, **kwargs)
+        if data_format is None:
+            data_format = backend.image_data_format()
+        if data_format not in {"channels_last", "channels_first"}:
+            raise ValueError(
+                '`data_format` must be "channels_last" or "channels_first". '
+                f"Received: data_format={data_format}."
+ ) + self.data_format = data_format + self.input_spec = InputSpec(ndim=5) - def _get_noise_shape(self, inputs): - input_shape = tf.shape(inputs) - if self.data_format == 'channels_first': - return (input_shape[0], input_shape[1], 1, 1, 1) - elif self.data_format == 'channels_last': - return (input_shape[0], 1, 1, 1, input_shape[4]) + def _get_noise_shape(self, inputs): + input_shape = tf.shape(inputs) + if self.data_format == "channels_first": + return (input_shape[0], input_shape[1], 1, 1, 1) + elif self.data_format == "channels_last": + return (input_shape[0], 1, 1, 1, input_shape[4]) diff --git a/keras/layers/regularization/spatial_dropout_test.py b/keras/layers/regularization/spatial_dropout_test.py index 1b4ec6f12c98..66ac40ec242d 100644 --- a/keras/layers/regularization/spatial_dropout_test.py +++ b/keras/layers/regularization/spatial_dropout_test.py @@ -14,48 +14,48 @@ # ============================================================================== """Tests for spatial dropout layers.""" +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class SpacialDropoutTest(test_combinations.TestCase): - - def test_spatial_dropout_1d(self): - test_utils.layer_test( - keras.layers.SpatialDropout1D, - kwargs={'rate': 0.5}, - input_shape=(2, 3, 4)) - - def test_spatial_dropout_2d(self): - test_utils.layer_test( - keras.layers.SpatialDropout2D, - kwargs={'rate': 0.5}, - input_shape=(2, 3, 4, 5)) - - test_utils.layer_test( - keras.layers.SpatialDropout2D, - kwargs={ - 'rate': 0.5, - 'data_format': 'channels_first' - }, - input_shape=(2, 3, 4, 5)) - - def test_spatial_dropout_3d(self): - test_utils.layer_test( - keras.layers.SpatialDropout3D, - kwargs={'rate': 0.5}, - input_shape=(2, 3, 4, 4, 5)) - - test_utils.layer_test( - keras.layers.SpatialDropout3D, - kwargs={ - 'rate': 0.5, - 'data_format': 'channels_first' - }, - input_shape=(2, 3, 4, 4, 5)) - -if __name__ == '__main__': - tf.test.main() + def test_spatial_dropout_1d(self): + test_utils.layer_test( + keras.layers.SpatialDropout1D, + kwargs={"rate": 0.5}, + input_shape=(2, 3, 4), + ) + + def test_spatial_dropout_2d(self): + test_utils.layer_test( + keras.layers.SpatialDropout2D, + kwargs={"rate": 0.5}, + input_shape=(2, 3, 4, 5), + ) + + test_utils.layer_test( + keras.layers.SpatialDropout2D, + kwargs={"rate": 0.5, "data_format": "channels_first"}, + input_shape=(2, 3, 4, 5), + ) + + def test_spatial_dropout_3d(self): + test_utils.layer_test( + keras.layers.SpatialDropout3D, + kwargs={"rate": 0.5}, + input_shape=(2, 3, 4, 4, 5), + ) + + test_utils.layer_test( + keras.layers.SpatialDropout3D, + kwargs={"rate": 0.5, "data_format": "channels_first"}, + input_shape=(2, 3, 4, 4, 5), + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/reshaping/BUILD b/keras/layers/reshaping/BUILD index 0fd9bdb8d927..2f7e2a73d8e6 100644 --- a/keras/layers/reshaping/BUILD +++ b/keras/layers/reshaping/BUILD @@ -1,18 +1,20 @@ # Description: # Contains the Keras reshaping layers. 
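The 2D and 3D spatial dropout layers above differ from the 1D case only in which axes the noise shape collapses, with the kept channel axis following `data_format`. A sketch of the channels_last 2D case, again assuming the public `tf.keras` API with illustrative shapes:

import numpy as np
import tensorflow as tf

# Noise shape (batch, 1, 1, channels): entire feature maps drop together.
layer = tf.keras.layers.SpatialDropout2D(0.5, data_format="channels_last")
out = layer(np.ones((2, 3, 4, 5)), training=True).numpy()
for sample in out:
    for ch in range(sample.shape[-1]):
        fmap = sample[:, :, ch]
        assert (fmap == 0).all() or (fmap != 0).all()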
+# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") # buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:__subpackages__", "//third_party/tensorflow/python/distribute:__pkg__", "//third_party/tensorflow/python/feature_column:__pkg__", "//third_party/tensorflow/python/keras:__subpackages__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", "//third_party/tensorflow/tools/pip_package:__pkg__", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", ], diff --git a/keras/layers/reshaping/cropping1d.py b/keras/layers/reshaping/cropping1d.py index 5c4068b892c7..2eb632e38d0a 100644 --- a/keras/layers/reshaping/cropping1d.py +++ b/keras/layers/reshaping/cropping1d.py @@ -13,77 +13,85 @@ # limitations under the License. # ============================================================================== """Keras cropping layer for 1D input.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Cropping1D') +@keras_export("keras.layers.Cropping1D") class Cropping1D(Layer): - """Cropping layer for 1D input (e.g. temporal sequence). - - It crops along the time dimension (axis 1). - - Examples: - - >>> input_shape = (2, 3, 2) - >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) - >>> print(x) - [[[ 0 1] - [ 2 3] - [ 4 5]] - [[ 6 7] - [ 8 9] - [10 11]]] - >>> y = tf.keras.layers.Cropping1D(cropping=1)(x) - >>> print(y) - tf.Tensor( - [[[2 3]] - [[8 9]]], shape=(2, 1, 2), dtype=int64) - - Args: - cropping: Int or tuple of int (length 2) - How many units should be trimmed off at the beginning and end of - the cropping dimension (axis 1). - If a single int is provided, the same value will be used for both. - - Input shape: - 3D tensor with shape `(batch_size, axis_to_crop, features)` - - Output shape: - 3D tensor with shape `(batch_size, cropped_axis, features)` - """ - - def __init__(self, cropping=(1, 1), **kwargs): - super().__init__(**kwargs) - self.cropping = conv_utils.normalize_tuple( - cropping, 2, 'cropping', allow_zero=True) - self.input_spec = InputSpec(ndim=3) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if input_shape[1] is not None: - length = input_shape[1] - self.cropping[0] - self.cropping[1] - else: - length = None - return tf.TensorShape([input_shape[0], length, input_shape[2]]) - - def call(self, inputs): - if inputs.shape[1] is not None and sum(self.cropping) >= inputs.shape[1]: - raise ValueError('cropping parameter of Cropping layer must be ' - 'greater than the input shape. Received: inputs.shape=' - f'{inputs.shape}, and cropping={self.cropping}') - if self.cropping[1] == 0: - return inputs[:, self.cropping[0]:, :] - else: - return inputs[:, self.cropping[0]:-self.cropping[1], :] - - def get_config(self): - config = {'cropping': self.cropping} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Cropping layer for 1D input (e.g. temporal sequence). 
+
+    It crops along the time dimension (axis 1).
+
+    Examples:
+
+    >>> input_shape = (2, 3, 2)
+    >>> x = np.arange(np.prod(input_shape)).reshape(input_shape)
+    >>> print(x)
+    [[[ 0  1]
+      [ 2  3]
+      [ 4  5]]
+     [[ 6  7]
+      [ 8  9]
+      [10 11]]]
+    >>> y = tf.keras.layers.Cropping1D(cropping=1)(x)
+    >>> print(y)
+    tf.Tensor(
+      [[[2 3]]
+       [[8 9]]], shape=(2, 1, 2), dtype=int64)
+
+    Args:
+        cropping: Int or tuple of int (length 2)
+            How many units should be trimmed off at the beginning and end of
+            the cropping dimension (axis 1).
+            If a single int is provided, the same value will be used for both.
+
+    Input shape:
+        3D tensor with shape `(batch_size, axis_to_crop, features)`
+
+    Output shape:
+        3D tensor with shape `(batch_size, cropped_axis, features)`
+    """
+
+    def __init__(self, cropping=(1, 1), **kwargs):
+        super().__init__(**kwargs)
+        self.cropping = conv_utils.normalize_tuple(
+            cropping, 2, "cropping", allow_zero=True
+        )
+        self.input_spec = InputSpec(ndim=3)
+
+    def compute_output_shape(self, input_shape):
+        input_shape = tf.TensorShape(input_shape).as_list()
+        if input_shape[1] is not None:
+            length = input_shape[1] - self.cropping[0] - self.cropping[1]
+        else:
+            length = None
+        return tf.TensorShape([input_shape[0], length, input_shape[2]])
+
+    def call(self, inputs):
+        if (
+            inputs.shape[1] is not None
+            and sum(self.cropping) >= inputs.shape[1]
+        ):
+            raise ValueError(
+                "cropping parameter of Cropping layer must be "
+                "less than the input shape. Received: inputs.shape="
+                f"{inputs.shape}, and cropping={self.cropping}"
+            )
+        if self.cropping[1] == 0:
+            return inputs[:, self.cropping[0] :, :]
+        else:
+            return inputs[:, self.cropping[0] : -self.cropping[1], :]
+
+    def get_config(self):
+        config = {"cropping": self.cropping}
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
diff --git a/keras/layers/reshaping/cropping2d.py b/keras/layers/reshaping/cropping2d.py
index 72cedb846936..118de07ee54e 100644
--- a/keras/layers/reshaping/cropping2d.py
+++ b/keras/layers/reshaping/cropping2d.py
@@ -13,152 +13,207 @@ # limitations under the License.
# ==============================================================================
"""Keras cropping layer for 2D input."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
+
+import tensorflow.compat.v2 as tf

from keras.engine.base_layer import Layer
from keras.engine.input_spec import InputSpec
from keras.utils import conv_utils
-import tensorflow.compat.v2 as tf

+# isort: off
from tensorflow.python.util.tf_export import keras_export


-@keras_export('keras.layers.Cropping2D')
+@keras_export("keras.layers.Cropping2D")
class Cropping2D(Layer):
-  """Cropping layer for 2D input (e.g. picture).
-
-  It crops along spatial dimensions, i.e. height and width.
-
-  Examples:
-
-  >>> input_shape = (2, 28, 28, 3)
-  >>> x = np.arange(np.prod(input_shape)).reshape(input_shape)
-  >>> y = tf.keras.layers.Cropping2D(cropping=((2, 2), (4, 4)))(x)
-  >>> print(y.shape)
-  (2, 24, 20, 3)
-
-  Args:
-    cropping: Int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
-      - If int: the same symmetric cropping
-        is applied to height and width.
-      - If tuple of 2 ints:
-        interpreted as two different
-        symmetric cropping values for height and width:
-        `(symmetric_height_crop, symmetric_width_crop)`.
-      - If tuple of 2 tuples of 2 ints:
-        interpreted as
-        `((top_crop, bottom_crop), (left_crop, right_crop))`
-    data_format: A string,
-      one of `channels_last` (default) or `channels_first`.
- The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - - Input shape: - 4D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, rows, cols, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, rows, cols)` - - Output shape: - 4D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, cropped_rows, cropped_cols, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, cropped_rows, cropped_cols)` - """ - - def __init__(self, cropping=((0, 0), (0, 0)), data_format=None, **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - if isinstance(cropping, int): - self.cropping = ((cropping, cropping), (cropping, cropping)) - elif hasattr(cropping, '__len__'): - if len(cropping) != 2: - raise ValueError('`cropping` should have two elements. ' - f'Received: {cropping}.') - height_cropping = conv_utils.normalize_tuple( - cropping[0], 2, '1st entry of cropping', allow_zero=True) - width_cropping = conv_utils.normalize_tuple( - cropping[1], 2, '2nd entry of cropping', allow_zero=True) - self.cropping = (height_cropping, width_cropping) - else: - raise ValueError('`cropping` should be either an int, ' - 'a tuple of 2 ints ' - '(symmetric_height_crop, symmetric_width_crop), ' - 'or a tuple of 2 tuples of 2 ints ' - '((top_crop, bottom_crop), (left_crop, right_crop)). ' - f'Received: {cropping}.') - self.input_spec = InputSpec(ndim=4) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - # pylint: disable=invalid-unary-operand-type - if self.data_format == 'channels_first': - return tf.TensorShape([ - input_shape[0], input_shape[1], - input_shape[2] - self.cropping[0][0] - self.cropping[0][1] - if input_shape[2] else None, - input_shape[3] - self.cropping[1][0] - self.cropping[1][1] - if input_shape[3] else None - ]) - else: - return tf.TensorShape([ - input_shape[0], - input_shape[1] - self.cropping[0][0] - self.cropping[0][1] - if input_shape[1] else None, - input_shape[2] - self.cropping[1][0] - self.cropping[1][1] - if input_shape[2] else None, input_shape[3] - ]) - # pylint: enable=invalid-unary-operand-type - - def call(self, inputs): - # pylint: disable=invalid-unary-operand-type - if self.data_format == 'channels_first': - if ((inputs.shape[2] is not None and - sum(self.cropping[0]) >= inputs.shape[2]) or - (inputs.shape[3] is not None and - sum(self.cropping[1]) >= inputs.shape[3])): - raise ValueError('Argument `cropping` must be ' - 'greater than the input shape. 
Received: inputs.shape=' - f'{inputs.shape}, and cropping={self.cropping}') - if self.cropping[0][1] == self.cropping[1][1] == 0: - return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:] - elif self.cropping[0][1] == 0: - return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]: - -self.cropping[1][1]] - elif self.cropping[1][1] == 0: - return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1], - self.cropping[1][0]:] - return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1], - self.cropping[1][0]:-self.cropping[1][1]] - else: - if ((inputs.shape[1] is not None and - sum(self.cropping[0]) >= inputs.shape[1]) or - (inputs.shape[2] is not None and - sum(self.cropping[1]) >= inputs.shape[2])): - raise ValueError('Argument `cropping` must be ' - 'greater than the input shape. Received: inputs.shape=' - f'{inputs.shape}, and cropping={self.cropping}') - if self.cropping[0][1] == self.cropping[1][1] == 0: - return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:, :] - elif self.cropping[0][1] == 0: - return inputs[:, self.cropping[0][0]:, self.cropping[1][0]: - -self.cropping[1][1], :] - elif self.cropping[1][1] == 0: - return inputs[:, self.cropping[0][0]:-self.cropping[0][1], - self.cropping[1][0]:, :] - return inputs[:, self.cropping[0][0]:-self.cropping[0][1], self.cropping[ - 1][0]:-self.cropping[1][1], :] # pylint: disable=invalid-unary-operand-type - # pylint: enable=invalid-unary-operand-type - - def get_config(self): - config = {'cropping': self.cropping, 'data_format': self.data_format} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Cropping layer for 2D input (e.g. picture). + + It crops along spatial dimensions, i.e. height and width. + + Examples: + + >>> input_shape = (2, 28, 28, 3) + >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) + >>> y = tf.keras.layers.Cropping2D(cropping=((2, 2), (4, 4)))(x) + >>> print(y.shape) + (2, 24, 20, 3) + + Args: + cropping: Int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints. + - If int: the same symmetric cropping + is applied to height and width. + - If tuple of 2 ints: + interpreted as two different + symmetric cropping values for height and width: + `(symmetric_height_crop, symmetric_width_crop)`. + - If tuple of 2 tuples of 2 ints: + interpreted as + `((top_crop, bottom_crop), (left_crop, right_crop))` + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch_size, channels, height, width)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. 
+
+    Input shape:
+        4D tensor with shape:
+        - If `data_format` is `"channels_last"`:
+            `(batch_size, rows, cols, channels)`
+        - If `data_format` is `"channels_first"`:
+            `(batch_size, channels, rows, cols)`
+
+    Output shape:
+        4D tensor with shape:
+        - If `data_format` is `"channels_last"`:
+            `(batch_size, cropped_rows, cropped_cols, channels)`
+        - If `data_format` is `"channels_first"`:
+            `(batch_size, channels, cropped_rows, cropped_cols)`
+    """
+
+    def __init__(self, cropping=((0, 0), (0, 0)), data_format=None, **kwargs):
+        super().__init__(**kwargs)
+        self.data_format = conv_utils.normalize_data_format(data_format)
+        if isinstance(cropping, int):
+            self.cropping = ((cropping, cropping), (cropping, cropping))
+        elif hasattr(cropping, "__len__"):
+            if len(cropping) != 2:
+                raise ValueError(
+                    "`cropping` should have two elements. "
+                    f"Received: {cropping}."
+                )
+            height_cropping = conv_utils.normalize_tuple(
+                cropping[0], 2, "1st entry of cropping", allow_zero=True
+            )
+            width_cropping = conv_utils.normalize_tuple(
+                cropping[1], 2, "2nd entry of cropping", allow_zero=True
+            )
+            self.cropping = (height_cropping, width_cropping)
+        else:
+            raise ValueError(
+                "`cropping` should be either an int, "
+                "a tuple of 2 ints "
+                "(symmetric_height_crop, symmetric_width_crop), "
+                "or a tuple of 2 tuples of 2 ints "
+                "((top_crop, bottom_crop), (left_crop, right_crop)). "
+                f"Received: {cropping}."
+            )
+        self.input_spec = InputSpec(ndim=4)
+
+    def compute_output_shape(self, input_shape):
+        input_shape = tf.TensorShape(input_shape).as_list()
+
+        if self.data_format == "channels_first":
+            return tf.TensorShape(
+                [
+                    input_shape[0],
+                    input_shape[1],
+                    input_shape[2] - self.cropping[0][0] - self.cropping[0][1]
+                    if input_shape[2]
+                    else None,
+                    input_shape[3] - self.cropping[1][0] - self.cropping[1][1]
+                    if input_shape[3]
+                    else None,
+                ]
+            )
+        else:
+            return tf.TensorShape(
+                [
+                    input_shape[0],
+                    input_shape[1] - self.cropping[0][0] - self.cropping[0][1]
+                    if input_shape[1]
+                    else None,
+                    input_shape[2] - self.cropping[1][0] - self.cropping[1][1]
+                    if input_shape[2]
+                    else None,
+                    input_shape[3],
+                ]
+            )
+
+    def call(self, inputs):
+
+        if self.data_format == "channels_first":
+            if (
+                inputs.shape[2] is not None
+                and sum(self.cropping[0]) >= inputs.shape[2]
+            ) or (
+                inputs.shape[3] is not None
+                and sum(self.cropping[1]) >= inputs.shape[3]
+            ):
+                raise ValueError(
+                    "Argument `cropping` must be "
+                    "less than the input shape. Received: inputs.shape="
+                    f"{inputs.shape}, and cropping={self.cropping}"
+                )
+            if self.cropping[0][1] == self.cropping[1][1] == 0:
+                return inputs[
+                    :, :, self.cropping[0][0] :, self.cropping[1][0] :
+                ]
+            elif self.cropping[0][1] == 0:
+                return inputs[
+                    :,
+                    :,
+                    self.cropping[0][0] :,
+                    self.cropping[1][0] : -self.cropping[1][1],
+                ]
+            elif self.cropping[1][1] == 0:
+                return inputs[
+                    :,
+                    :,
+                    self.cropping[0][0] : -self.cropping[0][1],
+                    self.cropping[1][0] :,
+                ]
+            return inputs[
+                :,
+                :,
+                self.cropping[0][0] : -self.cropping[0][1],
+                self.cropping[1][0] : -self.cropping[1][1],
+            ]
+        else:
+            if (
+                inputs.shape[1] is not None
+                and sum(self.cropping[0]) >= inputs.shape[1]
+            ) or (
+                inputs.shape[2] is not None
+                and sum(self.cropping[1]) >= inputs.shape[2]
+            ):
+                raise ValueError(
+                    "Argument `cropping` must be "
+                    "less than the input shape. 
Received: inputs.shape=" + f"{inputs.shape}, and cropping={self.cropping}" + ) + if self.cropping[0][1] == self.cropping[1][1] == 0: + return inputs[ + :, self.cropping[0][0] :, self.cropping[1][0] :, : + ] + elif self.cropping[0][1] == 0: + return inputs[ + :, + self.cropping[0][0] :, + self.cropping[1][0] : -self.cropping[1][1], + :, + ] + elif self.cropping[1][1] == 0: + return inputs[ + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] :, + :, + ] + return inputs[ + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] : -self.cropping[1][1], + :, + ] + + def get_config(self): + config = {"cropping": self.cropping, "data_format": self.data_format} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/cropping3d.py b/keras/layers/reshaping/cropping3d.py index 775c4a32f6a8..a7d1a933e7ca 100644 --- a/keras/layers/reshaping/cropping3d.py +++ b/keras/layers/reshaping/cropping3d.py @@ -13,193 +13,301 @@ # limitations under the License. # ============================================================================== """Keras cropping layer for 3D input.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Cropping3D') +@keras_export("keras.layers.Cropping3D") class Cropping3D(Layer): - """Cropping layer for 3D data (e.g. spatial or spatio-temporal). - - Examples: - - >>> input_shape = (2, 28, 28, 10, 3) - >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) - >>> y = tf.keras.layers.Cropping3D(cropping=(2, 4, 2))(x) - >>> print(y.shape) - (2, 24, 20, 6, 3) - - Args: - cropping: Int, or tuple of 3 ints, or tuple of 3 tuples of 2 ints. - - If int: the same symmetric cropping - is applied to depth, height, and width. - - If tuple of 3 ints: interpreted as two different - symmetric cropping values for depth, height, and width: - `(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop)`. - - If tuple of 3 tuples of 2 ints: interpreted as - `((left_dim1_crop, right_dim1_crop), (left_dim2_crop, - right_dim2_crop), (left_dim3_crop, right_dim3_crop))` - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - while `channels_first` corresponds to inputs with shape - `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". 
- - Input shape: - 5D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, first_axis_to_crop, second_axis_to_crop, third_axis_to_crop, - depth)` - - If `data_format` is `"channels_first"`: - `(batch_size, depth, first_axis_to_crop, second_axis_to_crop, - third_axis_to_crop)` - - Output shape: - 5D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, first_cropped_axis, second_cropped_axis, third_cropped_axis, - depth)` - - If `data_format` is `"channels_first"`: - `(batch_size, depth, first_cropped_axis, second_cropped_axis, - third_cropped_axis)` - """ - - def __init__(self, - cropping=((1, 1), (1, 1), (1, 1)), - data_format=None, - **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - if isinstance(cropping, int): - self.cropping = ((cropping, cropping), (cropping, cropping), (cropping, - cropping)) - elif hasattr(cropping, '__len__'): - if len(cropping) != 3: - raise ValueError('`cropping` should have 3 elements. ' - f'Received: {cropping}.') - dim1_cropping = conv_utils.normalize_tuple( - cropping[0], 2, '1st entry of cropping', allow_zero=True) - dim2_cropping = conv_utils.normalize_tuple( - cropping[1], 2, '2nd entry of cropping', allow_zero=True) - dim3_cropping = conv_utils.normalize_tuple( - cropping[2], 2, '3rd entry of cropping', allow_zero=True) - self.cropping = (dim1_cropping, dim2_cropping, dim3_cropping) - else: - raise ValueError( - '`cropping` should be either an int, ' - 'a tuple of 3 ints ' - '(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop), ' - 'or a tuple of 3 tuples of 2 ints ' - '((left_dim1_crop, right_dim1_crop),' - ' (left_dim2_crop, right_dim2_crop),' - ' (left_dim3_crop, right_dim2_crop)). ' - f'Received: {cropping}.') - self.input_spec = InputSpec(ndim=5) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - # pylint: disable=invalid-unary-operand-type - if self.data_format == 'channels_first': - if input_shape[2] is not None: - dim1 = input_shape[2] - self.cropping[0][0] - self.cropping[0][1] - else: - dim1 = None - if input_shape[3] is not None: - dim2 = input_shape[3] - self.cropping[1][0] - self.cropping[1][1] - else: - dim2 = None - if input_shape[4] is not None: - dim3 = input_shape[4] - self.cropping[2][0] - self.cropping[2][1] - else: - dim3 = None - return tf.TensorShape( - [input_shape[0], input_shape[1], dim1, dim2, dim3]) - elif self.data_format == 'channels_last': - if input_shape[1] is not None: - dim1 = input_shape[1] - self.cropping[0][0] - self.cropping[0][1] - else: - dim1 = None - if input_shape[2] is not None: - dim2 = input_shape[2] - self.cropping[1][0] - self.cropping[1][1] - else: - dim2 = None - if input_shape[3] is not None: - dim3 = input_shape[3] - self.cropping[2][0] - self.cropping[2][1] - else: - dim3 = None - return tf.TensorShape( - [input_shape[0], dim1, dim2, dim3, input_shape[4]]) - # pylint: enable=invalid-unary-operand-type - - def call(self, inputs): - # pylint: disable=invalid-unary-operand-type - if self.data_format == 'channels_first': - if self.cropping[0][1] == self.cropping[1][1] == self.cropping[2][1] == 0: - return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:, - self.cropping[2][0]:] - elif self.cropping[0][1] == self.cropping[1][1] == 0: - return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:, - self.cropping[2][0]:-self.cropping[2][1]] - elif self.cropping[1][1] == self.cropping[2][1] == 0: - return inputs[:, :, 
self.cropping[0][0]:-self.cropping[0][1],
-                      self.cropping[1][0]:, self.cropping[2][0]:]
-      elif self.cropping[0][1] == self.cropping[2][1] == 0:
-        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][0]:
-                      -self.cropping[1][1], self.cropping[2][0]:]
-      elif self.cropping[0][1] == 0:
-        return inputs[:, :, self.cropping[0][0]:, self.cropping[1][
-            0]:-self.cropping[1][1], self.cropping[2][0]:-self.cropping[2][1]]
-      elif self.cropping[1][1] == 0:
-        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1], self.
-                      cropping[1][0]:, self.cropping[2][0]:-self.cropping[2][1]]
-      elif self.cropping[2][1] == 0:
-        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1], self.
-                      cropping[1][0]:-self.cropping[1][1], self.cropping[2][0]:]
-      return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1],
-                    self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][
-                        0]:-self.cropping[2][1]]
-    else:
-      if self.cropping[0][1] == self.cropping[1][1] == self.cropping[2][1] == 0:
-        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:,
-                      self.cropping[2][0]:, :]
-      elif self.cropping[0][1] == self.cropping[1][1] == 0:
-        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:,
-                      self.cropping[2][0]:-self.cropping[2][1], :]
-      elif self.cropping[1][1] == self.cropping[2][1] == 0:
-        return inputs[:, self.cropping[0][0]:-self.cropping[0][1],
-                      self.cropping[1][0]:, self.cropping[2][0]:, :]
-      elif self.cropping[0][1] == self.cropping[2][1] == 0:
-        return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:
-                      -self.cropping[1][1], self.cropping[2][0]:, :]
-      elif self.cropping[0][1] == 0:
-        return inputs[:, self.cropping[0][0]:, self.cropping[1][
-            0]:-self.cropping[1][1], self.cropping[2][0]:
-                      -self.cropping[2][1], :]
-      elif self.cropping[1][1] == 0:
-        return inputs[:, self.cropping[0][
-            0]:-self.cropping[0][1], self.cropping[1][0]:, self.cropping[2][0]:
-                      -self.cropping[2][1], :]
-      elif self.cropping[2][1] == 0:
-        return inputs[:, self.cropping[0][0]:-self.cropping[0][1],
-                      self.cropping[1][0]:-self.cropping[1][1], self.cropping[
-                          2][0]:, :]
-      return inputs[:, self.cropping[0][0]:-self.cropping[0][1], self.cropping[
-          1][0]:-self.cropping[1][1], self.cropping[2][0]:  # pylint: disable=invalid-unary-operand-type
-                    -self.cropping[2][1], :]  # pylint: disable=invalid-unary-operand-type
-    # pylint: enable=invalid-unary-operand-type
-
-  def get_config(self):
-    config = {'cropping': self.cropping, 'data_format': self.data_format}
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    """Cropping layer for 3D data (e.g. spatial or spatio-temporal).
+
+    Examples:
+
+    >>> input_shape = (2, 28, 28, 10, 3)
+    >>> x = np.arange(np.prod(input_shape)).reshape(input_shape)
+    >>> y = tf.keras.layers.Cropping3D(cropping=(2, 4, 2))(x)
+    >>> print(y.shape)
+    (2, 24, 20, 6, 3)
+
+    Args:
+        cropping: Int, or tuple of 3 ints, or tuple of 3 tuples of 2 ints.
+        - If int: the same symmetric cropping
+            is applied to depth, height, and width.
+        - If tuple of 3 ints: interpreted as three different
+            symmetric cropping values for depth, height, and width:
+            `(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop)`.
+        - If tuple of 3 tuples of 2 ints: interpreted as
+            `((left_dim1_crop, right_dim1_crop), (left_dim2_crop,
+            right_dim2_crop), (left_dim3_crop, right_dim3_crop))`
+        data_format: A string,
+            one of `channels_last` (default) or `channels_first`.
+            The ordering of the dimensions in the inputs.
+            `channels_last` corresponds to inputs with shape
+            `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+            while `channels_first` corresponds to inputs with shape
+            `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+            When unspecified, uses
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json` (if exists) else 'channels_last'.
+            Defaults to 'channels_last'.
+
+    Input shape:
+        5D tensor with shape:
+        - If `data_format` is `"channels_last"`:
+            `(batch_size, first_axis_to_crop, second_axis_to_crop,
+            third_axis_to_crop, depth)`
+        - If `data_format` is `"channels_first"`:
+            `(batch_size, depth, first_axis_to_crop, second_axis_to_crop,
+            third_axis_to_crop)`
+
+    Output shape:
+        5D tensor with shape:
+        - If `data_format` is `"channels_last"`:
+            `(batch_size, first_cropped_axis, second_cropped_axis,
+            third_cropped_axis, depth)`
+        - If `data_format` is `"channels_first"`:
+            `(batch_size, depth, first_cropped_axis, second_cropped_axis,
+            third_cropped_axis)`
+    """
+
+    def __init__(
+        self, cropping=((1, 1), (1, 1), (1, 1)), data_format=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.data_format = conv_utils.normalize_data_format(data_format)
+        if isinstance(cropping, int):
+            self.cropping = (
+                (cropping, cropping),
+                (cropping, cropping),
+                (cropping, cropping),
+            )
+        elif hasattr(cropping, "__len__"):
+            if len(cropping) != 3:
+                raise ValueError(
+                    f"`cropping` should have 3 elements. Received: {cropping}."
+                )
+            dim1_cropping = conv_utils.normalize_tuple(
+                cropping[0], 2, "1st entry of cropping", allow_zero=True
+            )
+            dim2_cropping = conv_utils.normalize_tuple(
+                cropping[1], 2, "2nd entry of cropping", allow_zero=True
+            )
+            dim3_cropping = conv_utils.normalize_tuple(
+                cropping[2], 2, "3rd entry of cropping", allow_zero=True
+            )
+            self.cropping = (dim1_cropping, dim2_cropping, dim3_cropping)
+        else:
+            raise ValueError(
+                "`cropping` should be either an int, "
+                "a tuple of 3 ints "
+                "(symmetric_dim1_crop, symmetric_dim2_crop, "
+                "symmetric_dim3_crop), "
+                "or a tuple of 3 tuples of 2 ints "
+                "((left_dim1_crop, right_dim1_crop),"
+                " (left_dim2_crop, right_dim2_crop),"
+                " (left_dim3_crop, right_dim3_crop)). "
+                f"Received: {cropping}."
+ ) + self.input_spec = InputSpec(ndim=5) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + + if self.data_format == "channels_first": + if input_shape[2] is not None: + dim1 = ( + input_shape[2] - self.cropping[0][0] - self.cropping[0][1] + ) + else: + dim1 = None + if input_shape[3] is not None: + dim2 = ( + input_shape[3] - self.cropping[1][0] - self.cropping[1][1] + ) + else: + dim2 = None + if input_shape[4] is not None: + dim3 = ( + input_shape[4] - self.cropping[2][0] - self.cropping[2][1] + ) + else: + dim3 = None + return tf.TensorShape( + [input_shape[0], input_shape[1], dim1, dim2, dim3] + ) + elif self.data_format == "channels_last": + if input_shape[1] is not None: + dim1 = ( + input_shape[1] - self.cropping[0][0] - self.cropping[0][1] + ) + else: + dim1 = None + if input_shape[2] is not None: + dim2 = ( + input_shape[2] - self.cropping[1][0] - self.cropping[1][1] + ) + else: + dim2 = None + if input_shape[3] is not None: + dim3 = ( + input_shape[3] - self.cropping[2][0] - self.cropping[2][1] + ) + else: + dim3 = None + return tf.TensorShape( + [input_shape[0], dim1, dim2, dim3, input_shape[4]] + ) + + def call(self, inputs): + + if self.data_format == "channels_first": + if ( + self.cropping[0][1] + == self.cropping[1][1] + == self.cropping[2][1] + == 0 + ): + return inputs[ + :, + :, + self.cropping[0][0] :, + self.cropping[1][0] :, + self.cropping[2][0] :, + ] + elif self.cropping[0][1] == self.cropping[1][1] == 0: + return inputs[ + :, + :, + self.cropping[0][0] :, + self.cropping[1][0] :, + self.cropping[2][0] : -self.cropping[2][1], + ] + elif self.cropping[1][1] == self.cropping[2][1] == 0: + return inputs[ + :, + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] :, + self.cropping[2][0] :, + ] + elif self.cropping[0][1] == self.cropping[2][1] == 0: + return inputs[ + :, + :, + self.cropping[0][0] :, + self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] :, + ] + elif self.cropping[0][1] == 0: + return inputs[ + :, + :, + self.cropping[0][0] :, + self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] : -self.cropping[2][1], + ] + elif self.cropping[1][1] == 0: + return inputs[ + :, + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] :, + self.cropping[2][0] : -self.cropping[2][1], + ] + elif self.cropping[2][1] == 0: + return inputs[ + :, + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] :, + ] + return inputs[ + :, + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] : -self.cropping[2][1], + ] + else: + if ( + self.cropping[0][1] + == self.cropping[1][1] + == self.cropping[2][1] + == 0 + ): + return inputs[ + :, + self.cropping[0][0] :, + self.cropping[1][0] :, + self.cropping[2][0] :, + :, + ] + elif self.cropping[0][1] == self.cropping[1][1] == 0: + return inputs[ + :, + self.cropping[0][0] :, + self.cropping[1][0] :, + self.cropping[2][0] : -self.cropping[2][1], + :, + ] + elif self.cropping[1][1] == self.cropping[2][1] == 0: + return inputs[ + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] :, + self.cropping[2][0] :, + :, + ] + elif self.cropping[0][1] == self.cropping[2][1] == 0: + return inputs[ + :, + self.cropping[0][0] :, + self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] :, + :, + ] + elif self.cropping[0][1] == 0: + return inputs[ + :, + self.cropping[0][0] :, + 
self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] : -self.cropping[2][1], + :, + ] + elif self.cropping[1][1] == 0: + return inputs[ + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] :, + self.cropping[2][0] : -self.cropping[2][1], + :, + ] + elif self.cropping[2][1] == 0: + return inputs[ + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] :, + :, + ] + return inputs[ + :, + self.cropping[0][0] : -self.cropping[0][1], + self.cropping[1][0] : -self.cropping[1][1], + self.cropping[2][0] : -self.cropping[2][1], + :, + ] + + def get_config(self): + config = {"cropping": self.cropping, "data_format": self.data_format} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/cropping_test.py b/keras/layers/reshaping/cropping_test.py index 5b6a7d22b5da..69f7a28003d0 100644 --- a/keras/layers/reshaping/cropping_test.py +++ b/keras/layers/reshaping/cropping_test.py @@ -14,161 +14,199 @@ # ============================================================================== """Tests for cropping layers.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class CroppingTest(test_combinations.TestCase): + def test_cropping_1d(self): + num_samples = 2 + time_length = 4 + input_len_dim1 = 2 + inputs = np.random.rand(num_samples, time_length, input_len_dim1) - def test_cropping_1d(self): - num_samples = 2 - time_length = 4 - input_len_dim1 = 2 - inputs = np.random.rand(num_samples, time_length, input_len_dim1) - - with self.cached_session(): - test_utils.layer_test( - keras.layers.Cropping1D, - kwargs={'cropping': (1, 1)}, - input_shape=inputs.shape) - - # test incorrect use - with self.assertRaises(ValueError): - keras.layers.Cropping1D(cropping=(1, 1, 1)) - with self.assertRaises(ValueError): - keras.layers.Cropping1D(cropping=None) - with self.assertRaises(ValueError): - input_layer = keras.layers.Input( - shape=(num_samples, time_length, input_len_dim1)) - keras.layers.Cropping1D(cropping=(2, 3))(input_layer) - - def test_cropping_2d(self): - num_samples = 2 - stack_size = 2 - input_len_dim1 = 9 - input_len_dim2 = 9 - cropping = ((2, 2), (3, 3)) - - for data_format in ['channels_first', 'channels_last']: - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, stack_size, input_len_dim1, - input_len_dim2) - else: - inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2, - stack_size) - with self.cached_session(): - # basic test - test_utils.layer_test( - keras.layers.Cropping2D, - kwargs={'cropping': cropping, - 'data_format': data_format}, - input_shape=inputs.shape) - # correctness test - layer = keras.layers.Cropping2D( - cropping=cropping, data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = keras.backend.eval(output) - # compare with numpy - if data_format == 'channels_first': - expected_out = inputs[:, :, cropping[0][0]:-cropping[0][1], cropping[ - 1][0]:-cropping[1][1]] - else: - expected_out = inputs[:, cropping[0][0]:-cropping[0][1], cropping[1][ - 0]:-cropping[1][1], :] - np.testing.assert_allclose(np_output, expected_out) - - for data_format in 
['channels_first', 'channels_last']: - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, stack_size, input_len_dim1, - input_len_dim2) - else: - inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2, - stack_size) - # another correctness test (no cropping) - with self.cached_session(): - cropping = ((0, 0), (0, 0)) - layer = keras.layers.Cropping2D( - cropping=cropping, data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = keras.backend.eval(output) - # compare with input - np.testing.assert_allclose(np_output, inputs) - - # test incorrect use - with self.assertRaises(ValueError): - keras.layers.Cropping2D(cropping=(1, 1, 1)) - with self.assertRaises(ValueError): - keras.layers.Cropping2D(cropping=None) - with self.assertRaises(ValueError): - input_layer = keras.layers.Input( - shape=(num_samples, input_len_dim1, input_len_dim2, stack_size)) - keras.layers.Cropping2D(cropping=((5, 4), (3, 4)))(input_layer) - - def test_cropping_3d(self): - num_samples = 2 - stack_size = 2 - input_len_dim1 = 8 - input_len_dim2 = 8 - input_len_dim3 = 8 - croppings = [((2, 2), (1, 1), (2, 3)), 3, (0, 1, 1)] - - for cropping in croppings: - for data_format in ['channels_last', 'channels_first']: - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, stack_size, input_len_dim1, - input_len_dim2, input_len_dim3) - else: - inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2, - input_len_dim3, stack_size) - # basic test with self.cached_session(): - test_utils.layer_test( - keras.layers.Cropping3D, - kwargs={'cropping': cropping, - 'data_format': data_format}, - input_shape=inputs.shape) - - if len(croppings) == 3 and len(croppings[0]) == 2: - # correctness test - with self.cached_session(): - layer = keras.layers.Cropping3D( - cropping=cropping, data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() + test_utils.layer_test( + keras.layers.Cropping1D, + kwargs={"cropping": (1, 1)}, + input_shape=inputs.shape, + ) + + # test incorrect use + with self.assertRaises(ValueError): + keras.layers.Cropping1D(cropping=(1, 1, 1)) + with self.assertRaises(ValueError): + keras.layers.Cropping1D(cropping=None) + with self.assertRaises(ValueError): + input_layer = keras.layers.Input( + shape=(num_samples, time_length, input_len_dim1) + ) + keras.layers.Cropping1D(cropping=(2, 3))(input_layer) + + def test_cropping_2d(self): + num_samples = 2 + stack_size = 2 + input_len_dim1 = 9 + input_len_dim2 = 9 + cropping = ((2, 2), (3, 3)) + + for data_format in ["channels_first", "channels_last"]: + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, stack_size, input_len_dim1, input_len_dim2 + ) else: - np_output = keras.backend.eval(output) - # compare with numpy - if data_format == 'channels_first': - expected_out = inputs[:, :, - cropping[0][0]:-cropping[0][1], - cropping[1][0]:-cropping[1][1], - cropping[2][0]:-cropping[2][1]] + inputs = np.random.rand( + num_samples, input_len_dim1, input_len_dim2, stack_size + ) + with self.cached_session(): + # basic test + test_utils.layer_test( + keras.layers.Cropping2D, + kwargs={"cropping": cropping, "data_format": data_format}, + input_shape=inputs.shape, + ) + # correctness test + layer = keras.layers.Cropping2D( + cropping=cropping, data_format=data_format + ) 
+ layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + # compare with numpy + if data_format == "channels_first": + expected_out = inputs[ + :, + :, + cropping[0][0] : -cropping[0][1], + cropping[1][0] : -cropping[1][1], + ] + else: + expected_out = inputs[ + :, + cropping[0][0] : -cropping[0][1], + cropping[1][0] : -cropping[1][1], + :, + ] + np.testing.assert_allclose(np_output, expected_out) + + for data_format in ["channels_first", "channels_last"]: + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, stack_size, input_len_dim1, input_len_dim2 + ) else: - expected_out = inputs[:, - cropping[0][0]:-cropping[0][1], - cropping[1][0]:-cropping[1][1], - cropping[2][0]:-cropping[2][1], :] - np.testing.assert_allclose(np_output, expected_out) - - # test incorrect use - with self.assertRaises(ValueError): - keras.layers.Cropping3D(cropping=(1, 1)) - with self.assertRaises(ValueError): - keras.layers.Cropping3D(cropping=None) - -if __name__ == '__main__': - tf.test.main() + inputs = np.random.rand( + num_samples, input_len_dim1, input_len_dim2, stack_size + ) + # another correctness test (no cropping) + with self.cached_session(): + cropping = ((0, 0), (0, 0)) + layer = keras.layers.Cropping2D( + cropping=cropping, data_format=data_format + ) + layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + # compare with input + np.testing.assert_allclose(np_output, inputs) + + # test incorrect use + with self.assertRaises(ValueError): + keras.layers.Cropping2D(cropping=(1, 1, 1)) + with self.assertRaises(ValueError): + keras.layers.Cropping2D(cropping=None) + with self.assertRaises(ValueError): + input_layer = keras.layers.Input( + shape=(num_samples, input_len_dim1, input_len_dim2, stack_size) + ) + keras.layers.Cropping2D(cropping=((5, 4), (3, 4)))(input_layer) + + def test_cropping_3d(self): + num_samples = 2 + stack_size = 2 + input_len_dim1 = 8 + input_len_dim2 = 8 + input_len_dim3 = 8 + croppings = [((2, 2), (1, 1), (2, 3)), 3, (0, 1, 1)] + + for cropping in croppings: + for data_format in ["channels_last", "channels_first"]: + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, + stack_size, + input_len_dim1, + input_len_dim2, + input_len_dim3, + ) + else: + inputs = np.random.rand( + num_samples, + input_len_dim1, + input_len_dim2, + input_len_dim3, + stack_size, + ) + # basic test + with self.cached_session(): + test_utils.layer_test( + keras.layers.Cropping3D, + kwargs={ + "cropping": cropping, + "data_format": data_format, + }, + input_shape=inputs.shape, + ) + + if len(croppings) == 3 and len(croppings[0]) == 2: + # correctness test + with self.cached_session(): + layer = keras.layers.Cropping3D( + cropping=cropping, data_format=data_format + ) + layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + # compare with numpy + if data_format == "channels_first": + expected_out = inputs[ + :, + :, + cropping[0][0] : -cropping[0][1], + cropping[1][0] : -cropping[1][1], + cropping[2][0] : -cropping[2][1], + ] + else: + expected_out = inputs[ + :, + cropping[0][0] : -cropping[0][1], + cropping[1][0] : -cropping[1][1], + cropping[2][0] : -cropping[2][1], + :, + ] + 
np.testing.assert_allclose(np_output, expected_out) + + # test incorrect use + with self.assertRaises(ValueError): + keras.layers.Cropping3D(cropping=(1, 1)) + with self.assertRaises(ValueError): + keras.layers.Cropping3D(cropping=None) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/reshaping/flatten.py b/keras/layers/reshaping/flatten.py index 8dc1d246d68d..51d3a4fe2a49 100644 --- a/keras/layers/reshaping/flatten.py +++ b/keras/layers/reshaping/flatten.py @@ -13,103 +13,110 @@ # limitations under the License. # ============================================================================== """Contains the flatten layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import functools import operator +import numpy as np +import tensorflow.compat.v2 as tf + from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Flatten') +@keras_export("keras.layers.Flatten") class Flatten(Layer): - """Flattens the input. Does not affect the batch size. - - Note: If inputs are shaped `(batch,)` without a feature axis, then - flattening adds an extra channel dimension and output shape is `(batch, 1)`. - - Args: - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, ..., channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, ...)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - - Example: - - >>> model = tf.keras.Sequential() - >>> model.add(tf.keras.layers.Conv2D(64, 3, 3, input_shape=(3, 32, 32))) - >>> model.output_shape - (None, 1, 10, 64) - - >>> model.add(Flatten()) - >>> model.output_shape - (None, 640) - - """ - - def __init__(self, data_format=None, **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - self.input_spec = InputSpec(min_ndim=1) - self._channels_first = self.data_format == 'channels_first' - - def call(self, inputs): - if self._channels_first: - rank = inputs.shape.rank - if rank and rank > 1: - # Switch to channels-last format. - permutation = [0] - permutation.extend(range(2, rank)) - permutation.append(1) - inputs = tf.transpose(inputs, perm=permutation) - - if tf.executing_eagerly(): - # Full static shape is guaranteed to be available. - # Performance: Using `constant_op` is much faster than passing a list. - flattened_shape = tf.constant([inputs.shape[0], -1]) - return tf.reshape(inputs, flattened_shape) - else: - input_shape = inputs.shape - rank = input_shape.rank - if rank == 1: - return tf.expand_dims(inputs, axis=1) - else: - batch_dim = tf.compat.dimension_value(input_shape[0]) - non_batch_dims = input_shape[1:] - # Reshape in a way that preserves as much shape info as possible. - if non_batch_dims.is_fully_defined(): - last_dim = int(functools.reduce(operator.mul, non_batch_dims)) - flattened_shape = tf.constant([-1, last_dim]) - elif batch_dim is not None: - flattened_shape = tf.constant([int(batch_dim), -1]) + """Flattens the input. Does not affect the batch size. 
+ + Note: If inputs are shaped `(batch,)` without a feature axis, then + flattening adds an extra channel dimension and output shape is `(batch, 1)`. + + Args: + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, ..., channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, ...)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + + Example: + + >>> model = tf.keras.Sequential() + >>> model.add(tf.keras.layers.Conv2D(64, 3, 3, input_shape=(3, 32, 32))) + >>> model.output_shape + (None, 1, 10, 64) + + >>> model.add(Flatten()) + >>> model.output_shape + (None, 640) + + """ + + def __init__(self, data_format=None, **kwargs): + super().__init__(**kwargs) + self.data_format = conv_utils.normalize_data_format(data_format) + self.input_spec = InputSpec(min_ndim=1) + self._channels_first = self.data_format == "channels_first" + + def call(self, inputs): + if self._channels_first: + rank = inputs.shape.rank + if rank and rank > 1: + # Switch to channels-last format. + permutation = [0] + permutation.extend(range(2, rank)) + permutation.append(1) + inputs = tf.transpose(inputs, perm=permutation) + + if tf.executing_eagerly(): + # Full static shape is guaranteed to be available. + # Performance: Using `constant_op` is much faster than passing a + # list. + flattened_shape = tf.constant([inputs.shape[0], -1]) + return tf.reshape(inputs, flattened_shape) + else: + input_shape = inputs.shape + rank = input_shape.rank + if rank == 1: + return tf.expand_dims(inputs, axis=1) + else: + batch_dim = tf.compat.dimension_value(input_shape[0]) + non_batch_dims = input_shape[1:] + # Reshape in a way that preserves as much shape info as + # possible. 
+ if non_batch_dims.is_fully_defined(): + last_dim = int( + functools.reduce(operator.mul, non_batch_dims) + ) + flattened_shape = tf.constant([-1, last_dim]) + elif batch_dim is not None: + flattened_shape = tf.constant([int(batch_dim), -1]) + else: + flattened_shape = [tf.shape(inputs)[0], -1] + return tf.reshape(inputs, flattened_shape) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if not input_shape: + output_shape = tf.TensorShape([1]) else: - flattened_shape = [tf.shape(inputs)[0], -1] - return tf.reshape(inputs, flattened_shape) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if not input_shape: - output_shape = tf.TensorShape([1]) - else: - output_shape = [input_shape[0]] - if np.all(input_shape[1:]): - output_shape += [np.prod(input_shape[1:], dtype=int)] - else: - output_shape += [None] - return tf.TensorShape(output_shape) - - def get_config(self): - config = super().get_config() - config.update({'data_format': self.data_format}) - return config + output_shape = [input_shape[0]] + if np.all(input_shape[1:]): + output_shape += [np.prod(input_shape[1:], dtype=int)] + else: + output_shape += [None] + return tf.TensorShape(output_shape) + + def get_config(self): + config = super().get_config() + config.update({"data_format": self.data_format}) + return config diff --git a/keras/layers/reshaping/flatten_test.py b/keras/layers/reshaping/flatten_test.py index f6a343bcb798..92127afffe29 100644 --- a/keras/layers/reshaping/flatten_test.py +++ b/keras/layers/reshaping/flatten_test.py @@ -14,41 +14,46 @@ # ============================================================================== """Tests for flatten layer.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class FlattenTest(test_combinations.TestCase): - - def test_flatten(self): - test_utils.layer_test( - keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4)) - - # Test channels_first - inputs = np.random.random((10, 3, 5, 5)).astype('float32') - outputs = test_utils.layer_test( - keras.layers.Flatten, - kwargs={'data_format': 'channels_first'}, - input_data=inputs) - target_outputs = np.reshape( - np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3)) - self.assertAllClose(outputs, target_outputs) - - def test_flatten_scalar_channels(self): - test_utils.layer_test(keras.layers.Flatten, kwargs={}, input_shape=(3,)) - - # Test channels_first - inputs = np.random.random((10,)).astype('float32') - outputs = test_utils.layer_test( - keras.layers.Flatten, - kwargs={'data_format': 'channels_first'}, - input_data=inputs) - target_outputs = np.expand_dims(inputs, -1) - self.assertAllClose(outputs, target_outputs) - -if __name__ == '__main__': - tf.test.main() + def test_flatten(self): + test_utils.layer_test( + keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4) + ) + + # Test channels_first + inputs = np.random.random((10, 3, 5, 5)).astype("float32") + outputs = test_utils.layer_test( + keras.layers.Flatten, + kwargs={"data_format": "channels_first"}, + input_data=inputs, + ) + target_outputs = np.reshape( + np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3) + ) + self.assertAllClose(outputs, target_outputs) + + def test_flatten_scalar_channels(self): + test_utils.layer_test(keras.layers.Flatten, kwargs={}, 
input_shape=(3,)) + + # Test channels_first + inputs = np.random.random((10,)).astype("float32") + outputs = test_utils.layer_test( + keras.layers.Flatten, + kwargs={"data_format": "channels_first"}, + input_data=inputs, + ) + target_outputs = np.expand_dims(inputs, -1) + self.assertAllClose(outputs, target_outputs) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/reshaping/permute.py b/keras/layers/reshaping/permute.py index 96767a1a944d..590815e9a8e6 100644 --- a/keras/layers/reshaping/permute.py +++ b/keras/layers/reshaping/permute.py @@ -13,70 +13,73 @@ # limitations under the License. # ============================================================================== """Contains the Permute layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import copy +import tensorflow.compat.v2 as tf + from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Permute') +@keras_export("keras.layers.Permute") class Permute(Layer): - """Permutes the dimensions of the input according to a given pattern. - - Useful e.g. connecting RNNs and convnets. - - Example: - - ```python - model = Sequential() - model.add(Permute((2, 1), input_shape=(10, 64))) - # now: model.output_shape == (None, 64, 10) - # note: `None` is the batch dimension - ``` - - Args: - dims: Tuple of integers. Permutation pattern does not include the - samples dimension. Indexing starts at 1. - For instance, `(2, 1)` permutes the first and second dimensions - of the input. - - Input shape: - Arbitrary. Use the keyword argument `input_shape` - (tuple of integers, does not include the samples axis) - when using this layer as the first layer in a model. - - Output shape: - Same as the input shape, but with the dimensions re-ordered according - to the specified pattern. - """ - - def __init__(self, dims, **kwargs): - super().__init__(**kwargs) - self.dims = tuple(dims) - if sorted(dims) != list(range(1, len(dims) + 1)): - raise ValueError( - 'Invalid permutation argument `dims` for Permute Layer. ' - 'The set of indices in `dims` must be consecutive and start from 1. ' - f'Received dims={dims}') - self.input_spec = InputSpec(ndim=len(self.dims) + 1) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - output_shape = copy.copy(input_shape) - for i, dim in enumerate(self.dims): - target_dim = input_shape[dim] - output_shape[i + 1] = target_dim - return tf.TensorShape(output_shape) - - def call(self, inputs): - return tf.transpose(inputs, perm=(0,) + self.dims) - - def get_config(self): - config = {'dims': self.dims} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Permutes the dimensions of the input according to a given pattern. + + Useful e.g. connecting RNNs and convnets. + + Example: + + ```python + model = Sequential() + model.add(Permute((2, 1), input_shape=(10, 64))) + # now: model.output_shape == (None, 64, 10) + # note: `None` is the batch dimension + ``` + + Args: + dims: Tuple of integers. Permutation pattern does not include the + samples dimension. Indexing starts at 1. + For instance, `(2, 1)` permutes the first and second dimensions + of the input. + + Input shape: + Arbitrary. 
Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + + Output shape: + Same as the input shape, but with the dimensions re-ordered according + to the specified pattern. + """ + + def __init__(self, dims, **kwargs): + super().__init__(**kwargs) + self.dims = tuple(dims) + if sorted(dims) != list(range(1, len(dims) + 1)): + raise ValueError( + "Invalid permutation argument `dims` for Permute Layer. " + "The set of indices in `dims` must be consecutive and start " + f"from 1. Received dims={dims}" + ) + self.input_spec = InputSpec(ndim=len(self.dims) + 1) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + output_shape = copy.copy(input_shape) + for i, dim in enumerate(self.dims): + target_dim = input_shape[dim] + output_shape[i + 1] = target_dim + return tf.TensorShape(output_shape) + + def call(self, inputs): + return tf.transpose(inputs, perm=(0,) + self.dims) + + def get_config(self): + config = {"dims": self.dims} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/permute_test.py b/keras/layers/reshaping/permute_test.py index 4145a6439e3c..1a9e6564c8de 100644 --- a/keras/layers/reshaping/permute_test.py +++ b/keras/layers/reshaping/permute_test.py @@ -14,33 +14,40 @@ # ============================================================================== """Tests for Keras permute layer.""" +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf - @test_combinations.run_all_keras_modes class PermuteTest(test_combinations.TestCase): - - def test_permute(self): - test_utils.layer_test( - keras.layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4)) - - def test_permute_errors_on_invalid_starting_dims_index(self): - with self.assertRaisesRegex(ValueError, r'Invalid permutation .*dims.*'): - test_utils.layer_test( - keras.layers.Permute, - kwargs={'dims': (0, 1, 2)}, - input_shape=(3, 2, 4)) - - def test_permute_errors_on_invalid_set_of_dims_indices(self): - with self.assertRaisesRegex(ValueError, r'Invalid permutation .*dims.*'): - test_utils.layer_test( - keras.layers.Permute, - kwargs={'dims': (1, 4, 2)}, - input_shape=(3, 2, 4)) - -if __name__ == '__main__': - tf.test.main() + def test_permute(self): + test_utils.layer_test( + keras.layers.Permute, kwargs={"dims": (2, 1)}, input_shape=(3, 2, 4) + ) + + def test_permute_errors_on_invalid_starting_dims_index(self): + with self.assertRaisesRegex( + ValueError, r"Invalid permutation .*dims.*" + ): + test_utils.layer_test( + keras.layers.Permute, + kwargs={"dims": (0, 1, 2)}, + input_shape=(3, 2, 4), + ) + + def test_permute_errors_on_invalid_set_of_dims_indices(self): + with self.assertRaisesRegex( + ValueError, r"Invalid permutation .*dims.*" + ): + test_utils.layer_test( + keras.layers.Permute, + kwargs={"dims": (1, 4, 2)}, + input_shape=(3, 2, 4), + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/reshaping/repeat_vector.py b/keras/layers/reshaping/repeat_vector.py index db3e4cff7ace..46dcb89e1541 100644 --- a/keras/layers/reshaping/repeat_vector.py +++ b/keras/layers/reshaping/repeat_vector.py @@ -13,53 +13,57 @@ # limitations under the License. 
# ============================================================================== """Contains the RepeatVector layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.RepeatVector') +@keras_export("keras.layers.RepeatVector") class RepeatVector(Layer): - """Repeats the input n times. + """Repeats the input n times. - Example: + Example: - ```python - model = Sequential() - model.add(Dense(32, input_dim=32)) - # now: model.output_shape == (None, 32) - # note: `None` is the batch dimension + ```python + model = Sequential() + model.add(Dense(32, input_dim=32)) + # now: model.output_shape == (None, 32) + # note: `None` is the batch dimension - model.add(RepeatVector(3)) - # now: model.output_shape == (None, 3, 32) - ``` + model.add(RepeatVector(3)) + # now: model.output_shape == (None, 3, 32) + ``` - Args: - n: Integer, repetition factor. - Input shape: 2D tensor of shape `(num_samples, features)`. - Output shape: 3D tensor of shape `(num_samples, n, features)`. - """ + Args: + n: Integer, repetition factor. + Input shape: 2D tensor of shape `(num_samples, features)`. + Output shape: 3D tensor of shape `(num_samples, n, features)`. + """ - def __init__(self, n, **kwargs): - super().__init__(**kwargs) - self.n = n - if not isinstance(n, int): - raise TypeError(f'Expected an integer value for `n`, got {type(n)}.') - self.input_spec = InputSpec(ndim=2) + def __init__(self, n, **kwargs): + super().__init__(**kwargs) + self.n = n + if not isinstance(n, int): + raise TypeError( + f"Expected an integer value for `n`, got {type(n)}." 
+ ) + self.input_spec = InputSpec(ndim=2) - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - return tf.TensorShape([input_shape[0], self.n, input_shape[1]]) + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + return tf.TensorShape([input_shape[0], self.n, input_shape[1]]) - def call(self, inputs): - return backend.repeat(inputs, self.n) + def call(self, inputs): + return backend.repeat(inputs, self.n) - def get_config(self): - config = {'n': self.n} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"n": self.n} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/repeat_vector_test.py b/keras/layers/reshaping/repeat_vector_test.py index 62e567f6c478..f307f308f74c 100644 --- a/keras/layers/reshaping/repeat_vector_test.py +++ b/keras/layers/reshaping/repeat_vector_test.py @@ -14,26 +14,27 @@ # ============================================================================== """Tests for repeat vector layer.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np - -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class RepeatVectorTest(test_combinations.TestCase): + def test_repeat_vector(self): + test_utils.layer_test( + keras.layers.RepeatVector, kwargs={"n": 3}, input_shape=(3, 2) + ) - def test_repeat_vector(self): - test_utils.layer_test( - keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2)) + def test_numpy_inputs(self): + if tf.executing_eagerly(): + layer = keras.layers.RepeatVector(2) + x = np.ones((10, 10)) + self.assertAllEqual(np.ones((10, 2, 10)), layer(x)) - def test_numpy_inputs(self): - if tf.executing_eagerly(): - layer = keras.layers.RepeatVector(2) - x = np.ones((10, 10)) - self.assertAllEqual(np.ones((10, 2, 10)), layer(x)) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/reshaping/reshape.py b/keras/layers/reshaping/reshape.py index ba2636340dbb..83bfccf61a24 100644 --- a/keras/layers/reshaping/reshape.py +++ b/keras/layers/reshaping/reshape.py @@ -13,125 +13,136 @@ # limitations under the License. # ============================================================================== """Contains the Reshape layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import -from keras.engine.base_layer import Layer + import numpy as np import tensorflow.compat.v2 as tf +from keras.engine.base_layer import Layer + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Reshape') +@keras_export("keras.layers.Reshape") class Reshape(Layer): - """Layer that reshapes inputs into the given shape. - - Input shape: - Arbitrary, although all dimensions in the input shape must be known/fixed. - Use the keyword argument `input_shape` (tuple of integers, does not include - the samples/batch size axis) when using this layer as the first layer - in a model. - - Output shape: - `(batch_size,) + target_shape` - - Example: - - >>> # as first layer in a Sequential model - >>> model = tf.keras.Sequential() - >>> model.add(tf.keras.layers.Reshape((3, 4), input_shape=(12,))) - >>> # model.output_shape == (None, 3, 4), `None` is the batch size. 
- >>> model.output_shape - (None, 3, 4) - - >>> # as intermediate layer in a Sequential model - >>> model.add(tf.keras.layers.Reshape((6, 2))) - >>> model.output_shape - (None, 6, 2) - - >>> # also supports shape inference using `-1` as dimension - >>> model.add(tf.keras.layers.Reshape((-1, 2, 2))) - >>> model.output_shape - (None, 3, 2, 2) - """ - - def __init__(self, target_shape, **kwargs): - """Creates a `tf.keras.layers.Reshape` layer instance. - - Args: - target_shape: Target shape. Tuple of integers, does not include the - samples dimension (batch size). - **kwargs: Any additional layer keyword arguments. + """Layer that reshapes inputs into the given shape. + + Input shape: + Arbitrary, although all dimensions in the input shape must be known/fixed. + Use the keyword argument `input_shape` (tuple of integers, does not + include the samples/batch size axis) when using this layer as the first + layer in a model. + + Output shape: + `(batch_size,) + target_shape` + + Example: + + >>> # as first layer in a Sequential model + >>> model = tf.keras.Sequential() + >>> model.add(tf.keras.layers.Reshape((3, 4), input_shape=(12,))) + >>> # model.output_shape == (None, 3, 4), `None` is the batch size. + >>> model.output_shape + (None, 3, 4) + + >>> # as intermediate layer in a Sequential model + >>> model.add(tf.keras.layers.Reshape((6, 2))) + >>> model.output_shape + (None, 6, 2) + + >>> # also supports shape inference using `-1` as dimension + >>> model.add(tf.keras.layers.Reshape((-1, 2, 2))) + >>> model.output_shape + (None, 3, 2, 2) """ - super().__init__(**kwargs) - self.target_shape = tuple(target_shape) - - def _fix_unknown_dimension(self, input_shape, output_shape): - """Find and replace a missing dimension in an output shape. - This is a near direct port of the internal Numpy function - `_fix_unknown_dimension` in `numpy/core/src/multiarray/shape.c` - - Args: - input_shape: Shape of array being reshaped - output_shape: Desired shape of the array with at most a single -1 which - indicates a dimension that should be derived from the input shape. - - Returns: - The new output shape with a -1 replaced with its computed value. - - Raises: - ValueError: If the total array size of the output_shape is - different than the input_shape, or more than one unknown dimension - is specified. - """ - output_shape = list(output_shape) - msg = ('total size of new array must be unchanged, ' - 'input_shape = {}, output_shape = {}'.format(input_shape, - output_shape)) - - known, unknown = 1, None - for index, dim in enumerate(output_shape): - if dim < 0: - if unknown is None: - unknown = index + def __init__(self, target_shape, **kwargs): + """Creates a `tf.keras.layers.Reshape` layer instance. + + Args: + target_shape: Target shape. Tuple of integers, does not include the + samples dimension (batch size). + **kwargs: Any additional layer keyword arguments. + """ + super().__init__(**kwargs) + self.target_shape = tuple(target_shape) + + def _fix_unknown_dimension(self, input_shape, output_shape): + """Find and replace a missing dimension in an output shape. + + This is a near direct port of the internal Numpy function + `_fix_unknown_dimension` in `numpy/core/src/multiarray/shape.c` + + Args: + input_shape: Shape of array being reshaped + output_shape: Desired shape of the array with at most a single -1 + which indicates a dimension that should be derived from the input + shape. + + Returns: + The new output shape with a -1 replaced with its computed value. 
+
+        Raises:
+            ValueError: If the total array size of the output_shape is
+            different than the input_shape, or more than one unknown dimension
+            is specified.
+        """
+        output_shape = list(output_shape)
+        msg = (
+            "total size of new array must be unchanged, "
+            "input_shape = {}, output_shape = {}".format(
+                input_shape, output_shape
+            )
+        )
+
+        known, unknown = 1, None
+        for index, dim in enumerate(output_shape):
+            if dim < 0:
+                if unknown is None:
+                    unknown = index
+                else:
+                    raise ValueError(
+                        "There must be at most one unknown dimension in "
+                        f"output_shape. Received: output_shape={output_shape}."
+                    )
+            else:
+                known *= dim
+
+        original = np.prod(input_shape, dtype=int)
+        if unknown is not None:
+            if known == 0 or original % known != 0:
+                raise ValueError(msg)
+            output_shape[unknown] = original // known
+        elif original != known:
+            raise ValueError(msg)
+        return output_shape
+
+    def compute_output_shape(self, input_shape):
+        input_shape = tf.TensorShape(input_shape).as_list()
+        if None in input_shape[1:]:
+            output_shape = [input_shape[0]]
+            # input shape (partially) unknown? replace -1's with None's
+            output_shape += tuple(
+                s if s != -1 else None for s in self.target_shape
+            )
         else:
-          raise ValueError(
-              f'There must be at most one unknown dimension in output_shape. '
-              f'Received: output_shape={output_shape}.')
-      else:
-        known *= dim
-
-    original = np.prod(input_shape, dtype=int)
-    if unknown is not None:
-      if known == 0 or original % known != 0:
-        raise ValueError(msg)
-      output_shape[unknown] = original // known
-    elif original != known:
-      raise ValueError(msg)
-    return output_shape
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tf.TensorShape(input_shape).as_list()
-    if None in input_shape[1:]:
-      output_shape = [input_shape[0]]
-      # input shape (partially) unknown? replace -1's with None's
-      output_shape += tuple(s if s != -1 else None for s in self.target_shape)
-    else:
-      output_shape = [input_shape[0]]
-      output_shape += self._fix_unknown_dimension(input_shape[1:],
-                                                  self.target_shape)
-    return tf.TensorShape(output_shape)
-
-  def call(self, inputs):
-    result = tf.reshape(inputs, (tf.shape(inputs)[0],) + self.target_shape)
-    if not tf.executing_eagerly():
-      # Set the static shape for the result since it might lost during array_ops
-      # reshape, eg, some `None` dim in the result could be inferred.
-      result.set_shape(self.compute_output_shape(inputs.shape))
-    return result
-
-  def get_config(self):
-    config = {'target_shape': self.target_shape}
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+            output_shape = [input_shape[0]]
+            output_shape += self._fix_unknown_dimension(
+                input_shape[1:], self.target_shape
+            )
+        return tf.TensorShape(output_shape)
+
+    def call(self, inputs):
+        result = tf.reshape(inputs, (tf.shape(inputs)[0],) + self.target_shape)
+        if not tf.executing_eagerly():
+            # Set the static shape for the result since it might be lost
+            # during array_ops reshape, e.g., some `None` dim in the result
+            # could be inferred.
+            result.set_shape(self.compute_output_shape(inputs.shape))
+        return result
+
+    def get_config(self):
+        config = {"target_shape": self.target_shape}
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
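The `-1` handling in `_fix_unknown_dimension` above follows NumPy's reshape rule: the single unknown axis is inferred as `total_size // known`, and any remainder raises the "total size of new array must be unchanged" error. A minimal sketch of the layer-level behavior, assuming a standard TF 2.x install:

```python
import tensorflow as tf

x = tf.zeros((2, 12))  # batch of 2 samples, 12 features each

# The batch axis is excluded; -1 is solved from the remaining 12 elements.
y = tf.keras.layers.Reshape((3, -1))(x)
print(y.shape)  # (2, 3, 4), since 12 // 3 == 4

# Reshape((5, -1)) would raise instead, because 12 % 5 != 0.
```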
diff --git a/keras/layers/reshaping/reshape_test.py b/keras/layers/reshaping/reshape_test.py
index 8e66b4d3b948..0c9d89f737a2 100644
--- a/keras/layers/reshaping/reshape_test.py
+++ b/keras/layers/reshaping/reshape_test.py
@@ -14,42 +14,46 @@
 # ==============================================================================
 """Tests for reshape layer."""
 
+import tensorflow.compat.v2 as tf
+
 import keras
 from keras.testing_infra import test_combinations
 from keras.testing_infra import test_utils
-import tensorflow.compat.v2 as tf
-
 
 @test_combinations.run_all_keras_modes
 class ReshapeTest(test_combinations.TestCase):
-
-  def test_reshape(self):
-    test_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (8, 1)},
-        input_shape=(3, 2, 4))
-
-    test_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (-1, 1)},
-        input_shape=(3, 2, 4))
-
-    test_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (1, -1)},
-        input_shape=(3, 2, 4))
-
-    test_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (-1, 1)},
-        input_shape=(None, None, 2))
-
-  def test_reshape_set_static_shape(self):
-    input_layer = keras.Input(batch_shape=(1, None))
-    reshaped = keras.layers.Reshape((1, 100))(input_layer)
-    # Make sure the batch dim is not lost after array_ops.reshape.
-    self.assertEqual(reshaped.shape, [1, 1, 100])
-
-if __name__ == '__main__':
-  tf.test.main()
+    def test_reshape(self):
+        test_utils.layer_test(
+            keras.layers.Reshape,
+            kwargs={"target_shape": (8, 1)},
+            input_shape=(3, 2, 4),
+        )
+
+        test_utils.layer_test(
+            keras.layers.Reshape,
+            kwargs={"target_shape": (-1, 1)},
+            input_shape=(3, 2, 4),
+        )
+
+        test_utils.layer_test(
+            keras.layers.Reshape,
+            kwargs={"target_shape": (1, -1)},
+            input_shape=(3, 2, 4),
+        )
+
+        test_utils.layer_test(
+            keras.layers.Reshape,
+            kwargs={"target_shape": (-1, 1)},
+            input_shape=(None, None, 2),
+        )
+
+    def test_reshape_set_static_shape(self):
+        input_layer = keras.Input(batch_shape=(1, None))
+        reshaped = keras.layers.Reshape((1, 100))(input_layer)
+        # Make sure the batch dim is not lost after array_ops.reshape.
+        self.assertEqual(reshaped.shape, [1, 1, 100])
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/layers/reshaping/up_sampling1d.py b/keras/layers/reshaping/up_sampling1d.py
index b5853cc867c3..56b75ef23d2d 100644
--- a/keras/layers/reshaping/up_sampling1d.py
+++ b/keras/layers/reshaping/up_sampling1d.py
@@ -13,68 +13,72 @@
 # limitations under the License.
 # ==============================================================================
 """Keras upsampling layer for 1D inputs."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
+
+import tensorflow.compat.v2 as tf
 
 from keras import backend
 from keras.engine.base_layer import Layer
 from keras.engine.input_spec import InputSpec
-import tensorflow.compat.v2 as tf
 
+# isort: off
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.layers.UpSampling1D')
+@keras_export("keras.layers.UpSampling1D")
 class UpSampling1D(Layer):
-  """Upsampling layer for 1D inputs.
-
-  Repeats each temporal step `size` times along the time axis.
- - Examples: - - >>> input_shape = (2, 2, 3) - >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) - >>> print(x) - [[[ 0 1 2] - [ 3 4 5]] - [[ 6 7 8] - [ 9 10 11]]] - >>> y = tf.keras.layers.UpSampling1D(size=2)(x) - >>> print(y) - tf.Tensor( + """Upsampling layer for 1D inputs. + + Repeats each temporal step `size` times along the time axis. + + Examples: + + >>> input_shape = (2, 2, 3) + >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) + >>> print(x) [[[ 0 1 2] - [ 0 1 2] - [ 3 4 5] [ 3 4 5]] [[ 6 7 8] - [ 6 7 8] - [ 9 10 11] - [ 9 10 11]]], shape=(2, 4, 3), dtype=int64) - - Args: - size: Integer. Upsampling factor. - - Input shape: - 3D tensor with shape: `(batch_size, steps, features)`. - - Output shape: - 3D tensor with shape: `(batch_size, upsampled_steps, features)`. - """ - - def __init__(self, size=2, **kwargs): - super().__init__(**kwargs) - self.size = int(size) - self.input_spec = InputSpec(ndim=3) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - size = self.size * input_shape[1] if input_shape[1] is not None else None - return tf.TensorShape([input_shape[0], size, input_shape[2]]) - - def call(self, inputs): - output = backend.repeat_elements(inputs, self.size, axis=1) - return output - - def get_config(self): - config = {'size': self.size} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + [ 9 10 11]]] + >>> y = tf.keras.layers.UpSampling1D(size=2)(x) + >>> print(y) + tf.Tensor( + [[[ 0 1 2] + [ 0 1 2] + [ 3 4 5] + [ 3 4 5]] + [[ 6 7 8] + [ 6 7 8] + [ 9 10 11] + [ 9 10 11]]], shape=(2, 4, 3), dtype=int64) + + Args: + size: Integer. Upsampling factor. + + Input shape: + 3D tensor with shape: `(batch_size, steps, features)`. + + Output shape: + 3D tensor with shape: `(batch_size, upsampled_steps, features)`. + """ + + def __init__(self, size=2, **kwargs): + super().__init__(**kwargs) + self.size = int(size) + self.input_spec = InputSpec(ndim=3) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + size = ( + self.size * input_shape[1] if input_shape[1] is not None else None + ) + return tf.TensorShape([input_shape[0], size, input_shape[2]]) + + def call(self, inputs): + output = backend.repeat_elements(inputs, self.size, axis=1) + return output + + def get_config(self): + config = {"size": self.size} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/up_sampling2d.py b/keras/layers/reshaping/up_sampling2d.py index cf2513092931..ccfd2a6cff0f 100644 --- a/keras/layers/reshaping/up_sampling2d.py +++ b/keras/layers/reshaping/up_sampling2d.py @@ -13,128 +13,135 @@ # limitations under the License. # ============================================================================== """Keras upsampling layer for 2D inputs.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +from keras.utils import image_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.UpSampling2D') +@keras_export("keras.layers.UpSampling2D") class UpSampling2D(Layer): - """Upsampling layer for 2D inputs. 
- - Repeats the rows and columns of the data - by `size[0]` and `size[1]` respectively. - - Examples: - - >>> input_shape = (2, 2, 1, 3) - >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) - >>> print(x) - [[[[ 0 1 2]] - [[ 3 4 5]]] - [[[ 6 7 8]] - [[ 9 10 11]]]] - >>> y = tf.keras.layers.UpSampling2D(size=(1, 2))(x) - >>> print(y) - tf.Tensor( - [[[[ 0 1 2] - [ 0 1 2]] - [[ 3 4 5] - [ 3 4 5]]] - [[[ 6 7 8] - [ 6 7 8]] - [[ 9 10 11] - [ 9 10 11]]]], shape=(2, 2, 2, 3), dtype=int64) - - Args: - size: Int, or tuple of 2 integers. - The upsampling factors for rows and columns. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - interpolation: A string, one of `"area"`, `"bicubic"`, `"bilinear"`, - `"gaussian"`, `"lanczos3"`, `"lanczos5"`, `"mitchellcubic"`, `"nearest"`. - - Input shape: - 4D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, rows, cols, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, rows, cols)` - - Output shape: - 4D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, upsampled_rows, upsampled_cols, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, upsampled_rows, upsampled_cols)` - """ - - def __init__(self, - size=(2, 2), - data_format=None, - interpolation='nearest', - **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - self.size = conv_utils.normalize_tuple(size, 2, 'size') - interpolations = { - 'area': tf.image.ResizeMethod.AREA, - 'bicubic': tf.image.ResizeMethod.BICUBIC, - 'bilinear': tf.image.ResizeMethod.BILINEAR, - 'gaussian': tf.image.ResizeMethod.GAUSSIAN, - 'lanczos3': tf.image.ResizeMethod.LANCZOS3, - 'lanczos5': tf.image.ResizeMethod.LANCZOS5, - 'mitchellcubic': tf.image.ResizeMethod.MITCHELLCUBIC, - 'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR, - } - interploations_list = '"' + '", "'.join(interpolations.keys()) + '"' - if interpolation not in interpolations: - raise ValueError('`interpolation` argument should be one of: ' - f'{interploations_list}. 
Received: "{interpolation}".') - self.interpolation = interpolation - self.input_spec = InputSpec(ndim=4) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - height = self.size[0] * input_shape[ - 2] if input_shape[2] is not None else None - width = self.size[1] * input_shape[ - 3] if input_shape[3] is not None else None - return tf.TensorShape( - [input_shape[0], input_shape[1], height, width]) - else: - height = self.size[0] * input_shape[ - 1] if input_shape[1] is not None else None - width = self.size[1] * input_shape[ - 2] if input_shape[2] is not None else None - return tf.TensorShape( - [input_shape[0], height, width, input_shape[3]]) - - def call(self, inputs): - return backend.resize_images( - inputs, self.size[0], self.size[1], self.data_format, - interpolation=self.interpolation) - - def get_config(self): - config = { - 'size': self.size, - 'data_format': self.data_format, - 'interpolation': self.interpolation - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Upsampling layer for 2D inputs. + + Repeats the rows and columns of the data + by `size[0]` and `size[1]` respectively. + + Examples: + + >>> input_shape = (2, 2, 1, 3) + >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) + >>> print(x) + [[[[ 0 1 2]] + [[ 3 4 5]]] + [[[ 6 7 8]] + [[ 9 10 11]]]] + >>> y = tf.keras.layers.UpSampling2D(size=(1, 2))(x) + >>> print(y) + tf.Tensor( + [[[[ 0 1 2] + [ 0 1 2]] + [[ 3 4 5] + [ 3 4 5]]] + [[[ 6 7 8] + [ 6 7 8]] + [[ 9 10 11] + [ 9 10 11]]]], shape=(2, 2, 2, 3), dtype=int64) + + Args: + size: Int, or tuple of 2 integers. + The upsampling factors for rows and columns. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch_size, channels, height, width)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + interpolation: A string, one of `"area"`, `"bicubic"`, `"bilinear"`, + `"gaussian"`, `"lanczos3"`, `"lanczos5"`, `"mitchellcubic"`, + `"nearest"`. 
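Of the interpolation modes listed above, only `"nearest"` reproduces the pure pixel-repeat shown in the doctest; the other modes interpolate rather than repeat. A quick comparison sketch, assuming a standard TF 2.x install:

```python
import tensorflow as tf

x = tf.reshape(tf.constant([[1.0, 2.0], [3.0, 4.0]]), (1, 2, 2, 1))

nearest = tf.keras.layers.UpSampling2D(size=2)(x)  # default: "nearest"
bilinear = tf.keras.layers.UpSampling2D(size=2, interpolation="bilinear")(x)

print(tf.reshape(nearest, (4, 4)).numpy())   # only the original 4 values
print(tf.reshape(bilinear, (4, 4)).numpy())  # interpolated in-between values
```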
+ + Input shape: + 4D tensor with shape: + - If `data_format` is `"channels_last"`: + `(batch_size, rows, cols, channels)` + - If `data_format` is `"channels_first"`: + `(batch_size, channels, rows, cols)` + + Output shape: + 4D tensor with shape: + - If `data_format` is `"channels_last"`: + `(batch_size, upsampled_rows, upsampled_cols, channels)` + - If `data_format` is `"channels_first"`: + `(batch_size, channels, upsampled_rows, upsampled_cols)` + """ + + def __init__( + self, size=(2, 2), data_format=None, interpolation="nearest", **kwargs + ): + super().__init__(**kwargs) + self.data_format = conv_utils.normalize_data_format(data_format) + self.size = conv_utils.normalize_tuple(size, 2, "size") + self.interpolation = image_utils.get_interpolation(interpolation) + self.input_spec = InputSpec(ndim=4) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + height = ( + self.size[0] * input_shape[2] + if input_shape[2] is not None + else None + ) + width = ( + self.size[1] * input_shape[3] + if input_shape[3] is not None + else None + ) + return tf.TensorShape( + [input_shape[0], input_shape[1], height, width] + ) + else: + height = ( + self.size[0] * input_shape[1] + if input_shape[1] is not None + else None + ) + width = ( + self.size[1] * input_shape[2] + if input_shape[2] is not None + else None + ) + return tf.TensorShape( + [input_shape[0], height, width, input_shape[3]] + ) + + def call(self, inputs): + return backend.resize_images( + inputs, + self.size[0], + self.size[1], + self.data_format, + interpolation=self.interpolation, + ) + + def get_config(self): + config = { + "size": self.size, + "data_format": self.data_format, + "interpolation": self.interpolation, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/up_sampling3d.py b/keras/layers/reshaping/up_sampling3d.py index 72ed748c2dd8..9482ea1b530c 100644 --- a/keras/layers/reshaping/up_sampling3d.py +++ b/keras/layers/reshaping/up_sampling3d.py @@ -13,93 +13,119 @@ # limitations under the License. # ============================================================================== """Keras upsampling layer for 3D inputs.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.UpSampling3D') +@keras_export("keras.layers.UpSampling3D") class UpSampling3D(Layer): - """Upsampling layer for 3D inputs. - - Repeats the 1st, 2nd and 3rd dimensions - of the data by `size[0]`, `size[1]` and `size[2]` respectively. - - Examples: - - >>> input_shape = (2, 1, 2, 1, 3) - >>> x = tf.constant(1, shape=input_shape) - >>> y = tf.keras.layers.UpSampling3D(size=2)(x) - >>> print(y.shape) - (2, 2, 4, 2, 3) - - Args: - size: Int, or tuple of 3 integers. - The upsampling factors for dim1, dim2 and dim3. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. 
- `channels_last` corresponds to inputs with shape - `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` - while `channels_first` corresponds to inputs with shape - `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - - Input shape: - 5D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, dim1, dim2, dim3, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, dim1, dim2, dim3)` - - Output shape: - 5D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, upsampled_dim1, upsampled_dim2, upsampled_dim3, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, upsampled_dim1, upsampled_dim2, upsampled_dim3)` - """ - - def __init__(self, size=(2, 2, 2), data_format=None, **kwargs): - self.data_format = conv_utils.normalize_data_format(data_format) - self.size = conv_utils.normalize_tuple(size, 3, 'size') - self.input_spec = InputSpec(ndim=5) - super().__init__(**kwargs) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - dim1 = self.size[0] * input_shape[ - 2] if input_shape[2] is not None else None - dim2 = self.size[1] * input_shape[ - 3] if input_shape[3] is not None else None - dim3 = self.size[2] * input_shape[ - 4] if input_shape[4] is not None else None - return tf.TensorShape( - [input_shape[0], input_shape[1], dim1, dim2, dim3]) - else: - dim1 = self.size[0] * input_shape[ - 1] if input_shape[1] is not None else None - dim2 = self.size[1] * input_shape[ - 2] if input_shape[2] is not None else None - dim3 = self.size[2] * input_shape[ - 3] if input_shape[3] is not None else None - return tf.TensorShape( - [input_shape[0], dim1, dim2, dim3, input_shape[4]]) - - def call(self, inputs): - return backend.resize_volumes( - inputs, self.size[0], self.size[1], self.size[2], self.data_format) - - def get_config(self): - config = {'size': self.size, 'data_format': self.data_format} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Upsampling layer for 3D inputs. + + Repeats the 1st, 2nd and 3rd dimensions + of the data by `size[0]`, `size[1]` and `size[2]` respectively. + + Examples: + + >>> input_shape = (2, 1, 2, 1, 3) + >>> x = tf.constant(1, shape=input_shape) + >>> y = tf.keras.layers.UpSampling3D(size=2)(x) + >>> print(y.shape) + (2, 2, 4, 2, 3) + + Args: + size: Int, or tuple of 3 integers. + The upsampling factors for dim1, dim2 and dim3. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)` + while `channels_first` corresponds to inputs with shape + `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. 
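Each spatial axis is simply multiplied by its factor, which is the arithmetic `compute_output_shape` implements below; a one-line sanity check, assuming a standard TF 2.x install (shapes hypothetical):

```python
import tensorflow as tf

x = tf.zeros((2, 3, 4, 5, 16))  # channels_last: (batch, d1, d2, d3, channels)
y = tf.keras.layers.UpSampling3D(size=(2, 1, 3))(x)
print(y.shape)  # (2, 6, 4, 15, 16)
```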
+ + Input shape: + 5D tensor with shape: + - If `data_format` is `"channels_last"`: + `(batch_size, dim1, dim2, dim3, channels)` + - If `data_format` is `"channels_first"`: + `(batch_size, channels, dim1, dim2, dim3)` + + Output shape: + 5D tensor with shape: + - If `data_format` is `"channels_last"`: + `(batch_size, upsampled_dim1, upsampled_dim2, upsampled_dim3, + channels)` + - If `data_format` is `"channels_first"`: + `(batch_size, channels, upsampled_dim1, upsampled_dim2, + upsampled_dim3)` + """ + + def __init__(self, size=(2, 2, 2), data_format=None, **kwargs): + self.data_format = conv_utils.normalize_data_format(data_format) + self.size = conv_utils.normalize_tuple(size, 3, "size") + self.input_spec = InputSpec(ndim=5) + super().__init__(**kwargs) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + dim1 = ( + self.size[0] * input_shape[2] + if input_shape[2] is not None + else None + ) + dim2 = ( + self.size[1] * input_shape[3] + if input_shape[3] is not None + else None + ) + dim3 = ( + self.size[2] * input_shape[4] + if input_shape[4] is not None + else None + ) + return tf.TensorShape( + [input_shape[0], input_shape[1], dim1, dim2, dim3] + ) + else: + dim1 = ( + self.size[0] * input_shape[1] + if input_shape[1] is not None + else None + ) + dim2 = ( + self.size[1] * input_shape[2] + if input_shape[2] is not None + else None + ) + dim3 = ( + self.size[2] * input_shape[3] + if input_shape[3] is not None + else None + ) + return tf.TensorShape( + [input_shape[0], dim1, dim2, dim3, input_shape[4]] + ) + + def call(self, inputs): + return backend.resize_volumes( + inputs, self.size[0], self.size[1], self.size[2], self.data_format + ) + + def get_config(self): + config = {"size": self.size, "data_format": self.data_format} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/up_sampling_test.py b/keras/layers/reshaping/up_sampling_test.py index 2716f902c252..70ed79e6328e 100644 --- a/keras/layers/reshaping/up_sampling_test.py +++ b/keras/layers/reshaping/up_sampling_test.py @@ -15,164 +15,244 @@ """Tests for up-sampling layers.""" +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) -@tf_test_utils.for_all_test_methods(tf_test_utils.disable_xla, - 'align_corners=False not supported by XLA') +@tf_test_utils.for_all_test_methods( + tf_test_utils.disable_xla, "align_corners=False not supported by XLA" +) @test_combinations.run_all_keras_modes class UpSamplingTest(test_combinations.TestCase): + def test_upsampling_1d(self): + with self.cached_session(): + test_utils.layer_test( + keras.layers.UpSampling1D, + kwargs={"size": 2}, + input_shape=(3, 5, 4), + ) + + def test_upsampling_2d(self): + num_samples = 2 + stack_size = 2 + input_num_row = 11 + input_num_col = 12 + + for data_format in ["channels_first", "channels_last"]: + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, stack_size, input_num_row, input_num_col + ) + else: + inputs = np.random.rand( + num_samples, input_num_row, input_num_col, stack_size + ) + + # basic test + with self.cached_session(): + 
test_utils.layer_test( + keras.layers.UpSampling2D, + kwargs={"size": (2, 2), "data_format": data_format}, + input_shape=inputs.shape, + ) + + for length_row in [2]: + for length_col in [2, 3]: + layer = keras.layers.UpSampling2D( + size=(length_row, length_col), + data_format=data_format, + ) + layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + if data_format == "channels_first": + assert ( + np_output.shape[2] == length_row * input_num_row + ) + assert ( + np_output.shape[3] == length_col * input_num_col + ) + else: # tf + assert ( + np_output.shape[1] == length_row * input_num_row + ) + assert ( + np_output.shape[2] == length_col * input_num_col + ) + + # compare with numpy + if data_format == "channels_first": + expected_out = np.repeat(inputs, length_row, axis=2) + expected_out = np.repeat( + expected_out, length_col, axis=3 + ) + else: # tf + expected_out = np.repeat(inputs, length_row, axis=1) + expected_out = np.repeat( + expected_out, length_col, axis=2 + ) - def test_upsampling_1d(self): - with self.cached_session(): - test_utils.layer_test( - keras.layers.UpSampling1D, kwargs={'size': 2}, input_shape=(3, 5, 4)) - - def test_upsampling_2d(self): - num_samples = 2 - stack_size = 2 - input_num_row = 11 - input_num_col = 12 - - for data_format in ['channels_first', 'channels_last']: - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, stack_size, input_num_row, - input_num_col) - else: - inputs = np.random.rand(num_samples, input_num_row, input_num_col, - stack_size) - - # basic test - with self.cached_session(): - test_utils.layer_test( - keras.layers.UpSampling2D, - kwargs={'size': (2, 2), - 'data_format': data_format}, - input_shape=inputs.shape) - - for length_row in [2]: - for length_col in [2, 3]: - layer = keras.layers.UpSampling2D( - size=(length_row, length_col), data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() + np.testing.assert_allclose(np_output, expected_out) + + def test_upsampling_2d_bilinear(self): + num_samples = 2 + stack_size = 2 + input_num_row = 11 + input_num_col = 12 + for data_format in ["channels_first", "channels_last"]: + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, stack_size, input_num_row, input_num_col + ) else: - np_output = keras.backend.eval(output) - if data_format == 'channels_first': - assert np_output.shape[2] == length_row * input_num_row - assert np_output.shape[3] == length_col * input_num_col - else: # tf - assert np_output.shape[1] == length_row * input_num_row - assert np_output.shape[2] == length_col * input_num_col - - # compare with numpy - if data_format == 'channels_first': - expected_out = np.repeat(inputs, length_row, axis=2) - expected_out = np.repeat(expected_out, length_col, axis=3) - else: # tf - expected_out = np.repeat(inputs, length_row, axis=1) - expected_out = np.repeat(expected_out, length_col, axis=2) - - np.testing.assert_allclose(np_output, expected_out) - - def test_upsampling_2d_bilinear(self): - num_samples = 2 - stack_size = 2 - input_num_row = 11 - input_num_col = 12 - for data_format in ['channels_first', 'channels_last']: - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, stack_size, input_num_row, - input_num_col) - else: - inputs = np.random.rand(num_samples, input_num_row, input_num_col, 
- stack_size) - - test_utils.layer_test(keras.layers.UpSampling2D, - kwargs={'size': (2, 2), - 'data_format': data_format, - 'interpolation': 'bilinear'}, - input_shape=inputs.shape) - - if not tf.executing_eagerly(): - for length_row in [2]: - for length_col in [2, 3]: - layer = keras.layers.UpSampling2D( - size=(length_row, length_col), - data_format=data_format) - layer.build(inputs.shape) - outputs = layer(keras.backend.variable(inputs)) - np_output = keras.backend.eval(outputs) - if data_format == 'channels_first': - self.assertEqual(np_output.shape[2], length_row * input_num_row) - self.assertEqual(np_output.shape[3], length_col * input_num_col) + inputs = np.random.rand( + num_samples, input_num_row, input_num_col, stack_size + ) + + test_utils.layer_test( + keras.layers.UpSampling2D, + kwargs={ + "size": (2, 2), + "data_format": data_format, + "interpolation": "bilinear", + }, + input_shape=inputs.shape, + ) + + if not tf.executing_eagerly(): + for length_row in [2]: + for length_col in [2, 3]: + layer = keras.layers.UpSampling2D( + size=(length_row, length_col), + data_format=data_format, + ) + layer.build(inputs.shape) + outputs = layer(keras.backend.variable(inputs)) + np_output = keras.backend.eval(outputs) + if data_format == "channels_first": + self.assertEqual( + np_output.shape[2], length_row * input_num_row + ) + self.assertEqual( + np_output.shape[3], length_col * input_num_col + ) + else: + self.assertEqual( + np_output.shape[1], length_row * input_num_row + ) + self.assertEqual( + np_output.shape[2], length_col * input_num_col + ) + + def test_upsampling_3d(self): + num_samples = 2 + stack_size = 2 + input_len_dim1 = 10 + input_len_dim2 = 11 + input_len_dim3 = 12 + + for data_format in ["channels_first", "channels_last"]: + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, + stack_size, + input_len_dim1, + input_len_dim2, + input_len_dim3, + ) else: - self.assertEqual(np_output.shape[1], length_row * input_num_row) - self.assertEqual(np_output.shape[2], length_col * input_num_col) - - def test_upsampling_3d(self): - num_samples = 2 - stack_size = 2 - input_len_dim1 = 10 - input_len_dim2 = 11 - input_len_dim3 = 12 - - for data_format in ['channels_first', 'channels_last']: - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, stack_size, input_len_dim1, - input_len_dim2, input_len_dim3) - else: - inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2, - input_len_dim3, stack_size) - - # basic test - with self.cached_session(): - test_utils.layer_test( - keras.layers.UpSampling3D, - kwargs={'size': (2, 2, 2), - 'data_format': data_format}, - input_shape=inputs.shape) - - for length_dim1 in [2, 3]: - for length_dim2 in [2]: - for length_dim3 in [3]: - layer = keras.layers.UpSampling3D( - size=(length_dim1, length_dim2, length_dim3), - data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = keras.backend.eval(output) - if data_format == 'channels_first': - assert np_output.shape[2] == length_dim1 * input_len_dim1 - assert np_output.shape[3] == length_dim2 * input_len_dim2 - assert np_output.shape[4] == length_dim3 * input_len_dim3 - else: # tf - assert np_output.shape[1] == length_dim1 * input_len_dim1 - assert np_output.shape[2] == length_dim2 * input_len_dim2 - assert np_output.shape[3] == length_dim3 * input_len_dim3 - - # compare with numpy - if data_format == 'channels_first': - 
expected_out = np.repeat(inputs, length_dim1, axis=2) - expected_out = np.repeat(expected_out, length_dim2, axis=3) - expected_out = np.repeat(expected_out, length_dim3, axis=4) - else: # tf - expected_out = np.repeat(inputs, length_dim1, axis=1) - expected_out = np.repeat(expected_out, length_dim2, axis=2) - expected_out = np.repeat(expected_out, length_dim3, axis=3) - - np.testing.assert_allclose(np_output, expected_out) - -if __name__ == '__main__': - tf.test.main() + inputs = np.random.rand( + num_samples, + input_len_dim1, + input_len_dim2, + input_len_dim3, + stack_size, + ) + + # basic test + with self.cached_session(): + test_utils.layer_test( + keras.layers.UpSampling3D, + kwargs={"size": (2, 2, 2), "data_format": data_format}, + input_shape=inputs.shape, + ) + + for length_dim1 in [2, 3]: + for length_dim2 in [2]: + for length_dim3 in [3]: + layer = keras.layers.UpSampling3D( + size=(length_dim1, length_dim2, length_dim3), + data_format=data_format, + ) + layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + if data_format == "channels_first": + assert ( + np_output.shape[2] + == length_dim1 * input_len_dim1 + ) + assert ( + np_output.shape[3] + == length_dim2 * input_len_dim2 + ) + assert ( + np_output.shape[4] + == length_dim3 * input_len_dim3 + ) + else: # tf + assert ( + np_output.shape[1] + == length_dim1 * input_len_dim1 + ) + assert ( + np_output.shape[2] + == length_dim2 * input_len_dim2 + ) + assert ( + np_output.shape[3] + == length_dim3 * input_len_dim3 + ) + + # compare with numpy + if data_format == "channels_first": + expected_out = np.repeat( + inputs, length_dim1, axis=2 + ) + expected_out = np.repeat( + expected_out, length_dim2, axis=3 + ) + expected_out = np.repeat( + expected_out, length_dim3, axis=4 + ) + else: # tf + expected_out = np.repeat( + inputs, length_dim1, axis=1 + ) + expected_out = np.repeat( + expected_out, length_dim2, axis=2 + ) + expected_out = np.repeat( + expected_out, length_dim3, axis=3 + ) + + np.testing.assert_allclose(np_output, expected_out) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/reshaping/zero_padding1d.py b/keras/layers/reshaping/zero_padding1d.py index 68d11d994661..591e5d92172d 100644 --- a/keras/layers/reshaping/zero_padding1d.py +++ b/keras/layers/reshaping/zero_padding1d.py @@ -13,79 +13,82 @@ # limitations under the License. # ============================================================================== """Keras zero-padding layer for 1D input.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ZeroPadding1D') +@keras_export("keras.layers.ZeroPadding1D") class ZeroPadding1D(Layer): - """Zero-padding layer for 1D input (e.g. temporal sequence). + """Zero-padding layer for 1D input (e.g. temporal sequence). 
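The upsampling tests above all reduce to one invariant: integer-factor upsampling equals `np.repeat` along each spatial axis. A standalone sketch of that equivalence, assuming a channels_last input (this is not code from the test file):

```python
import numpy as np
import tensorflow as tf

x = np.random.rand(2, 3, 4, 5, 2).astype("float32")  # (batch, d1, d2, d3, ch)
y = tf.keras.layers.UpSampling3D(size=(2, 2, 2))(x).numpy()

# Repeating each spatial axis by its factor reproduces the layer exactly.
expected = np.repeat(x, 2, axis=1)
expected = np.repeat(expected, 2, axis=2)
expected = np.repeat(expected, 2, axis=3)
np.testing.assert_allclose(y, expected)
```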
- Examples: + Examples: - >>> input_shape = (2, 2, 3) - >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) - >>> print(x) - [[[ 0 1 2] - [ 3 4 5]] - [[ 6 7 8] - [ 9 10 11]]] - >>> y = tf.keras.layers.ZeroPadding1D(padding=2)(x) - >>> print(y) - tf.Tensor( - [[[ 0 0 0] - [ 0 0 0] - [ 0 1 2] - [ 3 4 5] - [ 0 0 0] - [ 0 0 0]] - [[ 0 0 0] - [ 0 0 0] - [ 6 7 8] - [ 9 10 11] - [ 0 0 0] - [ 0 0 0]]], shape=(2, 6, 3), dtype=int64) + >>> input_shape = (2, 2, 3) + >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) + >>> print(x) + [[[ 0 1 2] + [ 3 4 5]] + [[ 6 7 8] + [ 9 10 11]]] + >>> y = tf.keras.layers.ZeroPadding1D(padding=2)(x) + >>> print(y) + tf.Tensor( + [[[ 0 0 0] + [ 0 0 0] + [ 0 1 2] + [ 3 4 5] + [ 0 0 0] + [ 0 0 0]] + [[ 0 0 0] + [ 0 0 0] + [ 6 7 8] + [ 9 10 11] + [ 0 0 0] + [ 0 0 0]]], shape=(2, 6, 3), dtype=int64) - Args: - padding: Int, or tuple of int (length 2), or dictionary. - - If int: - How many zeros to add at the beginning and end of - the padding dimension (axis 1). - - If tuple of int (length 2): - How many zeros to add at the beginning and the end of - the padding dimension (`(left_pad, right_pad)`). + Args: + padding: Int, or tuple of int (length 2). + - If int: + How many zeros to add at the beginning and end of + the padding dimension (axis 1). + - If tuple of int (length 2): + How many zeros to add at the beginning and the end of + the padding dimension (`(left_pad, right_pad)`). - Input shape: - 3D tensor with shape `(batch_size, axis_to_pad, features)` + Input shape: + 3D tensor with shape `(batch_size, axis_to_pad, features)` - Output shape: - 3D tensor with shape `(batch_size, padded_axis, features)` - """ + Output shape: + 3D tensor with shape `(batch_size, padded_axis, features)` + """ - def __init__(self, padding=1, **kwargs): - super().__init__(**kwargs) - self.padding = conv_utils.normalize_tuple( - padding, 2, 'padding', allow_zero=True) - self.input_spec = InputSpec(ndim=3) + def __init__(self, padding=1, **kwargs): + super().__init__(**kwargs) + self.padding = conv_utils.normalize_tuple( + padding, 2, "padding", allow_zero=True + ) + self.input_spec = InputSpec(ndim=3) - def compute_output_shape(self, input_shape): - if input_shape[1] is not None: - length = input_shape[1] + self.padding[0] + self.padding[1] - else: - length = None - return tf.TensorShape([input_shape[0], length, input_shape[2]]) + def compute_output_shape(self, input_shape): + if input_shape[1] is not None: + length = input_shape[1] + self.padding[0] + self.padding[1] + else: + length = None + return tf.TensorShape([input_shape[0], length, input_shape[2]]) - def call(self, inputs): - return backend.temporal_padding(inputs, padding=self.padding) + def call(self, inputs): + return backend.temporal_padding(inputs, padding=self.padding) - def get_config(self): - config = {'padding': self.padding} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"padding": self.padding} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/zero_padding2d.py b/keras/layers/reshaping/zero_padding2d.py index 7b5584f0afc4..a4e4c3e6fb57 100644 --- a/keras/layers/reshaping/zero_padding2d.py +++ b/keras/layers/reshaping/zero_padding2d.py @@ -13,137 +13,144 @@ # limitations under the License. 
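Before the 2D variant below, the 1D layer just shown is easy to check end to end. A quick sketch with an asymmetric `(left_pad, right_pad)` tuple (values chosen arbitrarily):

```python
import numpy as np
import tensorflow as tf

x = np.ones((2, 5, 3))  # (batch, steps, features)
y = tf.keras.layers.ZeroPadding1D(padding=(1, 2))(x).numpy()

print(y.shape)           # (2, 8, 3): 1 step added before, 2 after
print(y[0, 0].tolist())  # [0.0, 0.0, 0.0]: left pad
print(y[0, -1].tolist()) # [0.0, 0.0, 0.0]: right pad
```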
# ============================================================================== """Keras zero-padding layer for 2D input.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ZeroPadding2D') +@keras_export("keras.layers.ZeroPadding2D") class ZeroPadding2D(Layer): - """Zero-padding layer for 2D input (e.g. picture). - - This layer can add rows and columns of zeros - at the top, bottom, left and right side of an image tensor. - - Examples: - - >>> input_shape = (1, 1, 2, 2) - >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) - >>> print(x) - [[[[0 1] - [2 3]]]] - >>> y = tf.keras.layers.ZeroPadding2D(padding=1)(x) - >>> print(y) - tf.Tensor( - [[[[0 0] - [0 0] - [0 0] - [0 0]] - [[0 0] - [0 1] - [2 3] - [0 0]] - [[0 0] - [0 0] - [0 0] - [0 0]]]], shape=(1, 3, 4, 2), dtype=int64) - - Args: - padding: Int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints. - - If int: the same symmetric padding - is applied to height and width. - - If tuple of 2 ints: - interpreted as two different - symmetric padding values for height and width: - `(symmetric_height_pad, symmetric_width_pad)`. - - If tuple of 2 tuples of 2 ints: - interpreted as - `((top_pad, bottom_pad), (left_pad, right_pad))` - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch_size, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch_size, channels, height, width)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - - Input shape: - 4D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, rows, cols, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, rows, cols)` - - Output shape: - 4D tensor with shape: - - If `data_format` is `"channels_last"`: - `(batch_size, padded_rows, padded_cols, channels)` - - If `data_format` is `"channels_first"`: - `(batch_size, channels, padded_rows, padded_cols)` - """ - - def __init__(self, padding=(1, 1), data_format=None, **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - if isinstance(padding, int): - self.padding = ((padding, padding), (padding, padding)) - elif hasattr(padding, '__len__'): - if len(padding) != 2: - raise ValueError('`padding` should have two elements. ' - f'Received: {padding}.') - height_padding = conv_utils.normalize_tuple( - padding[0], 2, '1st entry of padding', allow_zero=True) - width_padding = conv_utils.normalize_tuple( - padding[1], 2, '2nd entry of padding', allow_zero=True) - self.padding = (height_padding, width_padding) - else: - raise ValueError('`padding` should be either an int, ' - 'a tuple of 2 ints ' - '(symmetric_height_pad, symmetric_width_pad), ' - 'or a tuple of 2 tuples of 2 ints ' - '((top_pad, bottom_pad), (left_pad, right_pad)). 
' - f'Received: {padding}.') - self.input_spec = InputSpec(ndim=4) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - if input_shape[2] is not None: - rows = input_shape[2] + self.padding[0][0] + self.padding[0][1] - else: - rows = None - if input_shape[3] is not None: - cols = input_shape[3] + self.padding[1][0] + self.padding[1][1] - else: - cols = None - return tf.TensorShape( - [input_shape[0], input_shape[1], rows, cols]) - elif self.data_format == 'channels_last': - if input_shape[1] is not None: - rows = input_shape[1] + self.padding[0][0] + self.padding[0][1] - else: - rows = None - if input_shape[2] is not None: - cols = input_shape[2] + self.padding[1][0] + self.padding[1][1] - else: - cols = None - return tf.TensorShape( - [input_shape[0], rows, cols, input_shape[3]]) - - def call(self, inputs): - return backend.spatial_2d_padding( - inputs, padding=self.padding, data_format=self.data_format) - - def get_config(self): - config = {'padding': self.padding, 'data_format': self.data_format} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Zero-padding layer for 2D input (e.g. picture). + + This layer can add rows and columns of zeros + at the top, bottom, left and right side of an image tensor. + + Examples: + + >>> input_shape = (1, 1, 2, 2) + >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) + >>> print(x) + [[[[0 1] + [2 3]]]] + >>> y = tf.keras.layers.ZeroPadding2D(padding=1)(x) + >>> print(y) + tf.Tensor( + [[[[0 0] + [0 0] + [0 0] + [0 0]] + [[0 0] + [0 1] + [2 3] + [0 0]] + [[0 0] + [0 0] + [0 0] + [0 0]]]], shape=(1, 3, 4, 2), dtype=int64) + + Args: + padding: Int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints. + - If int: the same symmetric padding + is applied to height and width. + - If tuple of 2 ints: + interpreted as two different + symmetric padding values for height and width: + `(symmetric_height_pad, symmetric_width_pad)`. + - If tuple of 2 tuples of 2 ints: + interpreted as + `((top_pad, bottom_pad), (left_pad, right_pad))` + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch_size, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch_size, channels, height, width)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + + Input shape: + 4D tensor with shape: + - If `data_format` is `"channels_last"`: + `(batch_size, rows, cols, channels)` + - If `data_format` is `"channels_first"`: + `(batch_size, channels, rows, cols)` + + Output shape: + 4D tensor with shape: + - If `data_format` is `"channels_last"`: + `(batch_size, padded_rows, padded_cols, channels)` + - If `data_format` is `"channels_first"`: + `(batch_size, channels, padded_rows, padded_cols)` + """ + + def __init__(self, padding=(1, 1), data_format=None, **kwargs): + super().__init__(**kwargs) + self.data_format = conv_utils.normalize_data_format(data_format) + if isinstance(padding, int): + self.padding = ((padding, padding), (padding, padding)) + elif hasattr(padding, "__len__"): + if len(padding) != 2: + raise ValueError( + f"`padding` should have two elements. Received: {padding}." 
+ ) + height_padding = conv_utils.normalize_tuple( + padding[0], 2, "1st entry of padding", allow_zero=True + ) + width_padding = conv_utils.normalize_tuple( + padding[1], 2, "2nd entry of padding", allow_zero=True + ) + self.padding = (height_padding, width_padding) + else: + raise ValueError( + "`padding` should be either an int, " + "a tuple of 2 ints " + "(symmetric_height_pad, symmetric_width_pad), " + "or a tuple of 2 tuples of 2 ints " + "((top_pad, bottom_pad), (left_pad, right_pad)). " + f"Received: {padding}." + ) + self.input_spec = InputSpec(ndim=4) + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + if input_shape[2] is not None: + rows = input_shape[2] + self.padding[0][0] + self.padding[0][1] + else: + rows = None + if input_shape[3] is not None: + cols = input_shape[3] + self.padding[1][0] + self.padding[1][1] + else: + cols = None + return tf.TensorShape([input_shape[0], input_shape[1], rows, cols]) + elif self.data_format == "channels_last": + if input_shape[1] is not None: + rows = input_shape[1] + self.padding[0][0] + self.padding[0][1] + else: + rows = None + if input_shape[2] is not None: + cols = input_shape[2] + self.padding[1][0] + self.padding[1][1] + else: + cols = None + return tf.TensorShape([input_shape[0], rows, cols, input_shape[3]]) + + def call(self, inputs): + return backend.spatial_2d_padding( + inputs, padding=self.padding, data_format=self.data_format + ) + + def get_config(self): + config = {"padding": self.padding, "data_format": self.data_format} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/zero_padding3d.py b/keras/layers/reshaping/zero_padding3d.py index 214bf6355593..147118afd52e 100644 --- a/keras/layers/reshaping/zero_padding3d.py +++ b/keras/layers/reshaping/zero_padding3d.py @@ -13,138 +13,152 @@ # limitations under the License. # ============================================================================== """Keras zero-padding layer for 3D input.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.utils import conv_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ZeroPadding3D') +@keras_export("keras.layers.ZeroPadding3D") class ZeroPadding3D(Layer): - """Zero-padding layer for 3D data (spatial or spatio-temporal). + """Zero-padding layer for 3D data (spatial or spatio-temporal). - Examples: + Examples: - >>> input_shape = (1, 1, 2, 2, 3) - >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) - >>> y = tf.keras.layers.ZeroPadding3D(padding=2)(x) - >>> print(y.shape) - (1, 5, 6, 6, 3) + >>> input_shape = (1, 1, 2, 2, 3) + >>> x = np.arange(np.prod(input_shape)).reshape(input_shape) + >>> y = tf.keras.layers.ZeroPadding3D(padding=2)(x) + >>> print(y.shape) + (1, 5, 6, 6, 3) - Args: - padding: Int, or tuple of 3 ints, or tuple of 3 tuples of 2 ints. - - If int: the same symmetric padding - is applied to height and width. - - If tuple of 3 ints: - interpreted as two different - symmetric padding values for height and width: - `(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad)`. 
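For the 2D layer whose hunk ends above, the `((top_pad, bottom_pad), (left_pad, right_pad))` form is the one that trips people up; a short sketch of the resulting shape (public API only):

```python
import numpy as np
import tensorflow as tf

x = np.ones((1, 2, 2, 1))  # (batch, rows, cols, channels)
y = tf.keras.layers.ZeroPadding2D(padding=((1, 2), (3, 4)))(x)

# rows: 2 + 1 + 2 = 5, cols: 2 + 3 + 4 = 9
print(y.shape)  # (1, 5, 9, 1)
```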
-    - If tuple of 3 tuples of 2 ints:
-      interpreted as
-      `((left_dim1_pad, right_dim1_pad), (left_dim2_pad,
-        right_dim2_pad), (left_dim3_pad, right_dim3_pad))`
-    data_format: A string,
-      one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-      while `channels_first` corresponds to inputs with shape
-      `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-      It defaults to the `image_data_format` value found in your
-      Keras config file at `~/.keras/keras.json`.
-      If you never set it, then it will be "channels_last".
+    Args:
+        padding: Int, or tuple of 3 ints, or tuple of 3 tuples of 2 ints.
+            - If int: the same symmetric padding
+                is applied to all three spatial dimensions.
+            - If tuple of 3 ints:
+                interpreted as three different
+                symmetric padding values for the three spatial dimensions:
+                `(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad)`.
+            - If tuple of 3 tuples of 2 ints:
+                interpreted as
+                `((left_dim1_pad, right_dim1_pad), (left_dim2_pad,
+                right_dim2_pad), (left_dim3_pad, right_dim3_pad))`
+        data_format: A string,
+            one of `channels_last` (default) or `channels_first`.
+            The ordering of the dimensions in the inputs.
+            `channels_last` corresponds to inputs with shape
+            `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+            while `channels_first` corresponds to inputs with shape
+            `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+            When unspecified, uses
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json` (if exists) else 'channels_last'.
+            Defaults to 'channels_last'.

-  Input shape:
-    5D tensor with shape:
-    - If `data_format` is `"channels_last"`:
-      `(batch_size, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad,
-        depth)`
-    - If `data_format` is `"channels_first"`:
-      `(batch_size, depth, first_axis_to_pad, second_axis_to_pad,
+    Input shape:
+        5D tensor with shape:
+        - If `data_format` is `"channels_last"`:
+            `(batch_size, first_axis_to_pad, second_axis_to_pad,
+            third_axis_to_pad, depth)`
+        - If `data_format` is `"channels_first"`:
+            `(batch_size, depth, first_axis_to_pad, second_axis_to_pad,
          third_axis_to_pad)`

-  Output shape:
-    5D tensor with shape:
-    - If `data_format` is `"channels_last"`:
-      `(batch_size, first_padded_axis, second_padded_axis, third_axis_to_pad,
-        depth)`
-    - If `data_format` is `"channels_first"`:
-      `(batch_size, depth, first_padded_axis, second_padded_axis,
-        third_axis_to_pad)`
-  """
+    Output shape:
+        5D tensor with shape:
+        - If `data_format` is `"channels_last"`:
+            `(batch_size, first_padded_axis, second_padded_axis,
+            third_axis_to_pad, depth)`
+        - If `data_format` is `"channels_first"`:
+            `(batch_size, depth, first_padded_axis, second_padded_axis,
+            third_axis_to_pad)`
+    """

-  def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs):
-    super().__init__(**kwargs)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    if isinstance(padding, int):
-      self.padding = ((padding, padding), (padding, padding), (padding,
-                                                               padding))
-    elif hasattr(padding, '__len__'):
-      if len(padding) != 3:
-        raise ValueError('`padding` should have 3 elements. '
-                         f'Received: {padding}.')
-      dim1_padding = conv_utils.normalize_tuple(
-          padding[0], 2, '1st entry of padding', allow_zero=True)
-      dim2_padding = conv_utils.normalize_tuple(
-          padding[1], 2, '2nd entry of padding', allow_zero=True)
-      dim3_padding = conv_utils.normalize_tuple(
-          padding[2], 2, '3rd entry of padding', allow_zero=True)
-      self.padding = (dim1_padding, dim2_padding, dim3_padding)
-    else:
-      raise ValueError(
-          '`padding` should be either an int, '
-          'a tuple of 3 ints '
-          '(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad), '
-          'or a tuple of 3 tuples of 2 ints '
-          '((left_dim1_pad, right_dim1_pad),'
-          ' (left_dim2_pad, right_dim2_pad),'
-          ' (left_dim3_pad, right_dim2_pad)). '
-          f'Received: {padding}.')
-    self.input_spec = InputSpec(ndim=5)
+    def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs):
+        super().__init__(**kwargs)
+        self.data_format = conv_utils.normalize_data_format(data_format)
+        if isinstance(padding, int):
+            self.padding = (
+                (padding, padding),
+                (padding, padding),
+                (padding, padding),
+            )
+        elif hasattr(padding, "__len__"):
+            if len(padding) != 3:
+                raise ValueError(
+                    f"`padding` should have 3 elements. Received: {padding}."
+                )
+            dim1_padding = conv_utils.normalize_tuple(
+                padding[0], 2, "1st entry of padding", allow_zero=True
+            )
+            dim2_padding = conv_utils.normalize_tuple(
+                padding[1], 2, "2nd entry of padding", allow_zero=True
+            )
+            dim3_padding = conv_utils.normalize_tuple(
+                padding[2], 2, "3rd entry of padding", allow_zero=True
+            )
+            self.padding = (dim1_padding, dim2_padding, dim3_padding)
+        else:
+            raise ValueError(
+                "`padding` should be either an int, "
+                "a tuple of 3 ints "
+                "(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad), "
+                "or a tuple of 3 tuples of 2 ints "
+                "((left_dim1_pad, right_dim1_pad),"
+                " (left_dim2_pad, right_dim2_pad),"
+                " (left_dim3_pad, right_dim3_pad)). "
+                f"Received: {padding}."
+ ) + self.input_spec = InputSpec(ndim=5) - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - if input_shape[2] is not None: - dim1 = input_shape[2] + self.padding[0][0] + self.padding[0][1] - else: - dim1 = None - if input_shape[3] is not None: - dim2 = input_shape[3] + self.padding[1][0] + self.padding[1][1] - else: - dim2 = None - if input_shape[4] is not None: - dim3 = input_shape[4] + self.padding[2][0] + self.padding[2][1] - else: - dim3 = None - return tf.TensorShape( - [input_shape[0], input_shape[1], dim1, dim2, dim3]) - elif self.data_format == 'channels_last': - if input_shape[1] is not None: - dim1 = input_shape[1] + self.padding[0][0] + self.padding[0][1] - else: - dim1 = None - if input_shape[2] is not None: - dim2 = input_shape[2] + self.padding[1][0] + self.padding[1][1] - else: - dim2 = None - if input_shape[3] is not None: - dim3 = input_shape[3] + self.padding[2][0] + self.padding[2][1] - else: - dim3 = None - return tf.TensorShape( - [input_shape[0], dim1, dim2, dim3, input_shape[4]]) + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_first": + if input_shape[2] is not None: + dim1 = input_shape[2] + self.padding[0][0] + self.padding[0][1] + else: + dim1 = None + if input_shape[3] is not None: + dim2 = input_shape[3] + self.padding[1][0] + self.padding[1][1] + else: + dim2 = None + if input_shape[4] is not None: + dim3 = input_shape[4] + self.padding[2][0] + self.padding[2][1] + else: + dim3 = None + return tf.TensorShape( + [input_shape[0], input_shape[1], dim1, dim2, dim3] + ) + elif self.data_format == "channels_last": + if input_shape[1] is not None: + dim1 = input_shape[1] + self.padding[0][0] + self.padding[0][1] + else: + dim1 = None + if input_shape[2] is not None: + dim2 = input_shape[2] + self.padding[1][0] + self.padding[1][1] + else: + dim2 = None + if input_shape[3] is not None: + dim3 = input_shape[3] + self.padding[2][0] + self.padding[2][1] + else: + dim3 = None + return tf.TensorShape( + [input_shape[0], dim1, dim2, dim3, input_shape[4]] + ) - def call(self, inputs): - return backend.spatial_3d_padding( - inputs, padding=self.padding, data_format=self.data_format) + def call(self, inputs): + return backend.spatial_3d_padding( + inputs, padding=self.padding, data_format=self.data_format + ) - def get_config(self): - config = {'padding': self.padding, 'data_format': self.data_format} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"padding": self.padding, "data_format": self.data_format} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/reshaping/zero_padding_test.py b/keras/layers/reshaping/zero_padding_test.py index 0896cd01afa4..4e997658d791 100644 --- a/keras/layers/reshaping/zero_padding_test.py +++ b/keras/layers/reshaping/zero_padding_test.py @@ -14,246 +14,327 @@ # ============================================================================== """Tests for zero-padding layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class 
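To make the shape arithmetic in `compute_output_shape` above concrete: each spatial output dim is the input dim plus its two one-sided pads, and the batch and channel axes pass through. A quick sketch (public API; the padding values are arbitrary):

```python
import tensorflow as tf

layer = tf.keras.layers.ZeroPadding3D(padding=((1, 2), (3, 4), (0, 2)))
shape = layer.compute_output_shape((None, 4, 5, 3, 2))  # channels_last

# dim1: 4+1+2=7, dim2: 5+3+4=12, dim3: 3+0+2=5
print(shape)  # (None, 7, 12, 5, 2)
```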
ZeroPaddingTest(test_combinations.TestCase): + def test_zero_padding_1d(self): + num_samples = 2 + input_dim = 2 + num_steps = 5 + shape = (num_samples, num_steps, input_dim) + inputs = np.ones(shape) - def test_zero_padding_1d(self): - num_samples = 2 - input_dim = 2 - num_steps = 5 - shape = (num_samples, num_steps, input_dim) - inputs = np.ones(shape) + with self.cached_session(): + # basic test + test_utils.layer_test( + keras.layers.ZeroPadding1D, + kwargs={"padding": 2}, + input_shape=inputs.shape, + ) + test_utils.layer_test( + keras.layers.ZeroPadding1D, + kwargs={"padding": (1, 2)}, + input_shape=inputs.shape, + ) - with self.cached_session(): - # basic test - test_utils.layer_test( - keras.layers.ZeroPadding1D, - kwargs={'padding': 2}, - input_shape=inputs.shape) - test_utils.layer_test( - keras.layers.ZeroPadding1D, - kwargs={'padding': (1, 2)}, - input_shape=inputs.shape) + # correctness test + layer = keras.layers.ZeroPadding1D(padding=2) + layer.build(shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + for offset in [0, 1, -1, -2]: + np.testing.assert_allclose(np_output[:, offset, :], 0.0) + np.testing.assert_allclose(np_output[:, 2:-2, :], 1.0) - # correctness test - layer = keras.layers.ZeroPadding1D(padding=2) - layer.build(shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = keras.backend.eval(output) - for offset in [0, 1, -1, -2]: - np.testing.assert_allclose(np_output[:, offset, :], 0.) - np.testing.assert_allclose(np_output[:, 2:-2, :], 1.) + layer = keras.layers.ZeroPadding1D(padding=(1, 2)) + layer.build(shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + for left_offset in [0]: + np.testing.assert_allclose(np_output[:, left_offset, :], 0.0) + for right_offset in [-1, -2]: + np.testing.assert_allclose(np_output[:, right_offset, :], 0.0) + np.testing.assert_allclose(np_output[:, 1:-2, :], 1.0) + layer.get_config() - layer = keras.layers.ZeroPadding1D(padding=(1, 2)) - layer.build(shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = keras.backend.eval(output) - for left_offset in [0]: - np.testing.assert_allclose(np_output[:, left_offset, :], 0.) - for right_offset in [-1, -2]: - np.testing.assert_allclose(np_output[:, right_offset, :], 0.) - np.testing.assert_allclose(np_output[:, 1:-2, :], 1.) 
-      layer.get_config()
+            # test incorrect use
+            with self.assertRaises(ValueError):
+                keras.layers.ZeroPadding1D(padding=(1, 1, 1))
+            with self.assertRaises(ValueError):
+                keras.layers.ZeroPadding1D(padding=None)

-    # test incorrect use
-    with self.assertRaises(ValueError):
-      keras.layers.ZeroPadding1D(padding=(1, 1, 1))
-    with self.assertRaises(ValueError):
-      keras.layers.ZeroPadding1D(padding=None)
+    @parameterized.named_parameters(
+        ("channels_first", "channels_first"), ("channels_last", "channels_last")
+    )
+    def test_zero_padding_2d(self, data_format):
+        num_samples = 2
+        stack_size = 2
+        input_num_row = 4
+        input_num_col = 5
+        if data_format == "channels_first":
+            inputs = np.ones(
+                (num_samples, stack_size, input_num_row, input_num_col)
+            )
+        elif data_format == "channels_last":
+            inputs = np.ones(
+                (num_samples, input_num_row, input_num_col, stack_size)
+            )

-  @parameterized.named_parameters(('channels_first', 'channels_first'),
-                                  ('channels_last', 'channels_last'))
-  def test_zero_padding_2d(self, data_format):
-    num_samples = 2
-    stack_size = 2
-    input_num_row = 4
-    input_num_col = 5
-    if data_format == 'channels_first':
-      inputs = np.ones((num_samples, stack_size, input_num_row, input_num_col))
-    elif data_format == 'channels_last':
-      inputs = np.ones((num_samples, input_num_row, input_num_col, stack_size))
+        # basic test
+        with self.cached_session():
+            test_utils.layer_test(
+                keras.layers.ZeroPadding2D,
+                kwargs={"padding": (2, 2), "data_format": data_format},
+                input_shape=inputs.shape,
+            )
+            test_utils.layer_test(
+                keras.layers.ZeroPadding2D,
+                kwargs={
+                    "padding": ((1, 2), (3, 4)),
+                    "data_format": data_format,
+                },
+                input_shape=inputs.shape,
+            )

-    # basic test
-    with self.cached_session():
-      test_utils.layer_test(
-          keras.layers.ZeroPadding2D,
-          kwargs={
-              'padding': (2, 2),
-              'data_format': data_format
-          },
-          input_shape=inputs.shape)
-      test_utils.layer_test(
-          keras.layers.ZeroPadding2D,
-          kwargs={
-              'padding': ((1, 2), (3, 4)),
-              'data_format': data_format
-          },
-          input_shape=inputs.shape)
+        # correctness test
+        with self.cached_session():
+            layer = keras.layers.ZeroPadding2D(
+                padding=(2, 2), data_format=data_format
+            )
+            layer.build(inputs.shape)
+            output = layer(keras.backend.variable(inputs))
+            if tf.executing_eagerly():
+                np_output = output.numpy()
+            else:
+                np_output = keras.backend.eval(output)
+            if data_format == "channels_last":
+                for offset in [0, 1, -1, -2]:
+                    np.testing.assert_allclose(np_output[:, offset, :, :], 0.0)
+                    np.testing.assert_allclose(np_output[:, :, offset, :], 0.0)
+                np.testing.assert_allclose(np_output[:, 2:-2, 2:-2, :], 1.0)
+            elif data_format == "channels_first":
+                for offset in [0, 1, -1, -2]:
+                    np.testing.assert_allclose(np_output[:, :, offset, :], 0.0)
+                    np.testing.assert_allclose(np_output[:, :, :, offset], 0.0)
+                np.testing.assert_allclose(np_output[:, :, 2:-2, 2:-2], 1.0)

-    # correctness test
-    with self.cached_session():
-      layer = keras.layers.ZeroPadding2D(
-          padding=(2, 2), data_format=data_format)
-      layer.build(inputs.shape)
-      output = layer(keras.backend.variable(inputs))
-      if tf.executing_eagerly():
-        np_output = output.numpy()
-      else:
-        np_output = keras.backend.eval(output)
-      if data_format == 'channels_last':
-        for offset in [0, 1, -1, -2]:
-          np.testing.assert_allclose(np_output[:, offset, :, :], 0.)
-          np.testing.assert_allclose(np_output[:, :, offset, :], 0.)
-        np.testing.assert_allclose(np_output[:, 2:-2, 2:-2, :], 1.)
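A note on the channels_first interior check corrected above: with `data_format="channels_first"` the spatial axes are 2 and 3, so slicing `[:, 2:-2, 2:-2, :]` cuts into the size-2 channel axis and asserts over an empty array, which passes vacuously. A tiny NumPy sketch of the axis bookkeeping:

```python
import numpy as np

x = np.ones((2, 2, 4, 5))  # (batch, channels, rows, cols)
padded = np.pad(x, ((0, 0), (0, 0), (2, 2), (2, 2)))

print(padded.shape)                    # (2, 2, 8, 9)
print(padded[:, 2:-2, 2:-2, :].size)   # 0: empty slice, vacuous assert
print(padded[:, :, 2:-2, 2:-2].mean()) # 1.0: the actual interior
```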
- elif data_format == 'channels_first': - for offset in [0, 1, -1, -2]: - np.testing.assert_allclose(np_output[:, :, offset, :], 0.) - np.testing.assert_allclose(np_output[:, :, :, offset], 0.) - np.testing.assert_allclose(np_output[:, 2:-2, 2:-2, :], 1.) + layer = keras.layers.ZeroPadding2D( + padding=((1, 2), (3, 4)), data_format=data_format + ) + layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + if data_format == "channels_last": + for top_offset in [0]: + np.testing.assert_allclose( + np_output[:, top_offset, :, :], 0.0 + ) + for bottom_offset in [-1, -2]: + np.testing.assert_allclose( + np_output[:, bottom_offset, :, :], 0.0 + ) + for left_offset in [0, 1, 2]: + np.testing.assert_allclose( + np_output[:, :, left_offset, :], 0.0 + ) + for right_offset in [-1, -2, -3, -4]: + np.testing.assert_allclose( + np_output[:, :, right_offset, :], 0.0 + ) + np.testing.assert_allclose(np_output[:, 1:-2, 3:-4, :], 1.0) + elif data_format == "channels_first": + for top_offset in [0]: + np.testing.assert_allclose( + np_output[:, :, top_offset, :], 0.0 + ) + for bottom_offset in [-1, -2]: + np.testing.assert_allclose( + np_output[:, :, bottom_offset, :], 0.0 + ) + for left_offset in [0, 1, 2]: + np.testing.assert_allclose( + np_output[:, :, :, left_offset], 0.0 + ) + for right_offset in [-1, -2, -3, -4]: + np.testing.assert_allclose( + np_output[:, :, :, right_offset], 0.0 + ) + np.testing.assert_allclose(np_output[:, :, 1:-2, 3:-4], 1.0) - layer = keras.layers.ZeroPadding2D( - padding=((1, 2), (3, 4)), data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = keras.backend.eval(output) - if data_format == 'channels_last': - for top_offset in [0]: - np.testing.assert_allclose(np_output[:, top_offset, :, :], 0.) - for bottom_offset in [-1, -2]: - np.testing.assert_allclose(np_output[:, bottom_offset, :, :], 0.) - for left_offset in [0, 1, 2]: - np.testing.assert_allclose(np_output[:, :, left_offset, :], 0.) - for right_offset in [-1, -2, -3, -4]: - np.testing.assert_allclose(np_output[:, :, right_offset, :], 0.) - np.testing.assert_allclose(np_output[:, 1:-2, 3:-4, :], 1.) - elif data_format == 'channels_first': - for top_offset in [0]: - np.testing.assert_allclose(np_output[:, :, top_offset, :], 0.) - for bottom_offset in [-1, -2]: - np.testing.assert_allclose(np_output[:, :, bottom_offset, :], 0.) - for left_offset in [0, 1, 2]: - np.testing.assert_allclose(np_output[:, :, :, left_offset], 0.) - for right_offset in [-1, -2, -3, -4]: - np.testing.assert_allclose(np_output[:, :, :, right_offset], 0.) - np.testing.assert_allclose(np_output[:, :, 1:-2, 3:-4], 1.) 
+ # test incorrect use + with self.assertRaises(ValueError): + keras.layers.ZeroPadding2D(padding=(1, 1, 1)) + with self.assertRaises(ValueError): + keras.layers.ZeroPadding2D(padding=None) - # test incorrect use - with self.assertRaises(ValueError): - keras.layers.ZeroPadding2D(padding=(1, 1, 1)) - with self.assertRaises(ValueError): - keras.layers.ZeroPadding2D(padding=None) + @parameterized.named_parameters( + ("channels_first", "channels_first"), ("channels_last", "channels_last") + ) + def test_zero_padding_3d(self, data_format): + num_samples = 2 + stack_size = 2 + input_len_dim1 = 4 + input_len_dim2 = 5 + input_len_dim3 = 3 - @parameterized.named_parameters(('channels_first', 'channels_first'), - ('channels_last', 'channels_last')) - def test_zero_padding_3d(self, data_format): - num_samples = 2 - stack_size = 2 - input_len_dim1 = 4 - input_len_dim2 = 5 - input_len_dim3 = 3 + if data_format == "channels_first": + inputs = np.ones( + ( + num_samples, + stack_size, + input_len_dim1, + input_len_dim2, + input_len_dim3, + ) + ) + elif data_format == "channels_last": + inputs = np.ones( + ( + num_samples, + input_len_dim1, + input_len_dim2, + input_len_dim3, + stack_size, + ) + ) - if data_format == 'channels_first': - inputs = np.ones((num_samples, stack_size, input_len_dim1, input_len_dim2, - input_len_dim3)) - elif data_format == 'channels_last': - inputs = np.ones((num_samples, input_len_dim1, input_len_dim2, - input_len_dim3, stack_size)) + with self.cached_session(): + # basic test + test_utils.layer_test( + keras.layers.ZeroPadding3D, + kwargs={"padding": (2, 2, 2), "data_format": data_format}, + input_shape=inputs.shape, + ) + test_utils.layer_test( + keras.layers.ZeroPadding3D, + kwargs={ + "padding": ((1, 2), (3, 4), (0, 2)), + "data_format": data_format, + }, + input_shape=inputs.shape, + ) - with self.cached_session(): - # basic test - test_utils.layer_test( - keras.layers.ZeroPadding3D, - kwargs={ - 'padding': (2, 2, 2), - 'data_format': data_format - }, - input_shape=inputs.shape) - test_utils.layer_test( - keras.layers.ZeroPadding3D, - kwargs={ - 'padding': ((1, 2), (3, 4), (0, 2)), - 'data_format': data_format - }, - input_shape=inputs.shape) + with self.cached_session(): + # correctness test + layer = keras.layers.ZeroPadding3D( + padding=(2, 2, 2), data_format=data_format + ) + layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + if data_format == "channels_last": + for offset in [0, 1, -1, -2]: + np.testing.assert_allclose( + np_output[:, offset, :, :, :], 0.0 + ) + np.testing.assert_allclose( + np_output[:, :, offset, :, :], 0.0 + ) + np.testing.assert_allclose( + np_output[:, :, :, offset, :], 0.0 + ) + np.testing.assert_allclose( + np_output[:, 2:-2, 2:-2, 2:-2, :], 1.0 + ) + elif data_format == "channels_first": + for offset in [0, 1, -1, -2]: + np.testing.assert_allclose( + np_output[:, :, offset, :, :], 0.0 + ) + np.testing.assert_allclose( + np_output[:, :, :, offset, :], 0.0 + ) + np.testing.assert_allclose( + np_output[:, :, :, :, offset], 0.0 + ) + np.testing.assert_allclose( + np_output[:, :, 2:-2, 2:-2, 2:-2], 1.0 + ) - with self.cached_session(): - # correctness test - layer = keras.layers.ZeroPadding3D( - padding=(2, 2, 2), data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = 
keras.backend.eval(output) - if data_format == 'channels_last': - for offset in [0, 1, -1, -2]: - np.testing.assert_allclose(np_output[:, offset, :, :, :], 0.) - np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.) - np.testing.assert_allclose(np_output[:, :, :, offset, :], 0.) - np.testing.assert_allclose(np_output[:, 2:-2, 2:-2, 2:-2, :], 1.) - elif data_format == 'channels_first': - for offset in [0, 1, -1, -2]: - np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.) - np.testing.assert_allclose(np_output[:, :, :, offset, :], 0.) - np.testing.assert_allclose(np_output[:, :, :, :, offset], 0.) - np.testing.assert_allclose(np_output[:, :, 2:-2, 2:-2, 2:-2], 1.) + layer = keras.layers.ZeroPadding3D( + padding=((1, 2), (3, 4), (0, 2)), data_format=data_format + ) + layer.build(inputs.shape) + output = layer(keras.backend.variable(inputs)) + if tf.executing_eagerly(): + np_output = output.numpy() + else: + np_output = keras.backend.eval(output) + if data_format == "channels_last": + for offset in [0]: + np.testing.assert_allclose( + np_output[:, offset, :, :, :], 0.0 + ) + for offset in [-1, -2]: + np.testing.assert_allclose( + np_output[:, offset, :, :, :], 0.0 + ) + for offset in [0, 1, 2]: + np.testing.assert_allclose( + np_output[:, :, offset, :, :], 0.0 + ) + for offset in [-1, -2, -3, -4]: + np.testing.assert_allclose( + np_output[:, :, offset, :, :], 0.0 + ) + for offset in [-1, -2]: + np.testing.assert_allclose( + np_output[:, :, :, offset, :], 0.0 + ) + np.testing.assert_allclose( + np_output[:, 1:-2, 3:-4, 0:-2, :], 1.0 + ) + elif data_format == "channels_first": + for offset in [0]: + np.testing.assert_allclose( + np_output[:, :, offset, :, :], 0.0 + ) + for offset in [-1, -2]: + np.testing.assert_allclose( + np_output[:, :, offset, :, :], 0.0 + ) + for offset in [0, 1, 2]: + np.testing.assert_allclose( + np_output[:, :, :, offset, :], 0.0 + ) + for offset in [-1, -2, -3, -4]: + np.testing.assert_allclose( + np_output[:, :, :, offset, :], 0.0 + ) + for offset in [-1, -2]: + np.testing.assert_allclose( + np_output[:, :, :, :, offset], 0.0 + ) + np.testing.assert_allclose( + np_output[:, :, 1:-2, 3:-4, 0:-2], 1.0 + ) - layer = keras.layers.ZeroPadding3D( - padding=((1, 2), (3, 4), (0, 2)), data_format=data_format) - layer.build(inputs.shape) - output = layer(keras.backend.variable(inputs)) - if tf.executing_eagerly(): - np_output = output.numpy() - else: - np_output = keras.backend.eval(output) - if data_format == 'channels_last': - for offset in [0]: - np.testing.assert_allclose(np_output[:, offset, :, :, :], 0.) - for offset in [-1, -2]: - np.testing.assert_allclose(np_output[:, offset, :, :, :], 0.) - for offset in [0, 1, 2]: - np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.) - for offset in [-1, -2, -3, -4]: - np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.) - for offset in [-1, -2]: - np.testing.assert_allclose(np_output[:, :, :, offset, :], 0.) - np.testing.assert_allclose(np_output[:, 1:-2, 3:-4, 0:-2, :], 1.) - elif data_format == 'channels_first': - for offset in [0]: - np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.) - for offset in [-1, -2]: - np.testing.assert_allclose(np_output[:, :, offset, :, :], 0.) - for offset in [0, 1, 2]: - np.testing.assert_allclose(np_output[:, :, :, offset, :], 0.) - for offset in [-1, -2, -3, -4]: - np.testing.assert_allclose(np_output[:, :, :, offset, :], 0.) - for offset in [-1, -2]: - np.testing.assert_allclose(np_output[:, :, :, :, offset], 0.) 
- np.testing.assert_allclose(np_output[:, :, 1:-2, 3:-4, 0:-2], 1.) + # test incorrect use + with self.assertRaises(ValueError): + keras.layers.ZeroPadding3D(padding=(1, 1)) + with self.assertRaises(ValueError): + keras.layers.ZeroPadding3D(padding=None) - # test incorrect use - with self.assertRaises(ValueError): - keras.layers.ZeroPadding3D(padding=(1, 1)) - with self.assertRaises(ValueError): - keras.layers.ZeroPadding3D(padding=None) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/BUILD b/keras/layers/rnn/BUILD index ccbb9690a242..5b7ca0279f40 100644 --- a/keras/layers/rnn/BUILD +++ b/keras/layers/rnn/BUILD @@ -1,12 +1,14 @@ # Description: # Contains the Keras recurrent layers. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") # buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow_models/official/projects/residual_mobilenet/modeling/backbones:__pkg__", @@ -180,7 +182,7 @@ py_library( "//keras:backend", "//keras/engine:base_layer", "//keras/engine:input_spec", - "//keras/saving/saved_model", + "//keras/saving/legacy/saved_model", "//keras/utils:generic_utils", ], ) @@ -396,6 +398,9 @@ cuda_py_test( srcs = ["gru_lstm_test.py"], python_version = "PY3", shard_count = 2, + tags = [ + "no_oss", # TODO(b/277925387) + ], deps = [ ":gru", ":lstm", @@ -414,7 +419,9 @@ cuda_py_test( srcs = ["gru_test.py"], python_version = "PY3", shard_count = 12, - tags = ["no_rocm"], + tags = [ + "no_oss", # TODO(b/277925387) + ], deps = [ ":gru_lstm_utils", "//:expect_absl_installed", @@ -501,7 +508,6 @@ tf_py_test( python_version = "PY3", shard_count = 12, tags = [ - "no_rocm", "notsan", # TODO(b/170870794) ], deps = [ @@ -544,7 +550,6 @@ tf_py_test( srcs = ["conv_lstm_test.py"], python_version = "PY3", shard_count = 8, - tags = ["no_rocm"], deps = [ "//:expect_absl_installed", "//:expect_numpy_installed", @@ -562,6 +567,7 @@ cuda_py_test( python_version = "PY3", shard_count = 4, tags = [ + "no_oss", # TODO(b/277925387) "no_windows_gpu", ], deps = [ @@ -569,7 +575,7 @@ cuda_py_test( "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", "//keras/testing_infra:test_utils", ], diff --git a/keras/layers/rnn/__init__.py b/keras/layers/rnn/__init__.py index 3b6587d9edcd..a2438fc7d105 100644 --- a/keras/layers/rnn/__init__.py +++ b/keras/layers/rnn/__init__.py @@ -13,59 +13,61 @@ # limitations under the License. # ============================================================================== """Keras recurrent layers.""" -# pylint: disable=g-bad-import-order,g-direct-tensorflow-import,disable=g-import-not-at-top import tensorflow.compat.v2 as tf +from keras.layers.rnn.abstract_rnn_cell import AbstractRNNCell + # Recurrent layers. 
 from keras.layers.rnn.base_rnn import RNN
-from keras.layers.rnn.abstract_rnn_cell import AbstractRNNCell
-from keras.layers.rnn.stacked_rnn_cells import StackedRNNCells
-from keras.layers.rnn.simple_rnn import SimpleRNNCell
 from keras.layers.rnn.simple_rnn import SimpleRNN
+from keras.layers.rnn.simple_rnn import SimpleRNNCell
+from keras.layers.rnn.stacked_rnn_cells import StackedRNNCells

 if tf.__internal__.tf2.enabled():
-  from keras.layers.rnn.gru import GRU
-  from keras.layers.rnn.gru import GRUCell
-  from keras.layers.rnn.lstm import LSTM
-  from keras.layers.rnn.lstm import LSTMCell
-  from keras.layers.rnn.gru_v1 import GRU as GRUV1
-  from keras.layers.rnn.gru_v1 import GRUCell as GRUCellV1
-  from keras.layers.rnn.lstm_v1 import LSTM as LSTMV1
-  from keras.layers.rnn.lstm_v1 import LSTMCell as LSTMCellV1
-  GRUV2 = GRU
-  GRUCellV2 = GRUCell
-  LSTMV2 = LSTM
-  LSTMCellV2 = LSTMCell
+    from keras.layers.rnn.gru import GRU
+    from keras.layers.rnn.gru import GRUCell
+    from keras.layers.rnn.gru_v1 import GRU as GRUV1
+    from keras.layers.rnn.gru_v1 import GRUCell as GRUCellV1
+    from keras.layers.rnn.lstm import LSTM
+    from keras.layers.rnn.lstm import LSTMCell
+    from keras.layers.rnn.lstm_v1 import LSTM as LSTMV1
+    from keras.layers.rnn.lstm_v1 import LSTMCell as LSTMCellV1
+
+    GRUV2 = GRU
+    GRUCellV2 = GRUCell
+    LSTMV2 = LSTM
+    LSTMCellV2 = LSTMCell
 else:
-  from keras.layers.rnn.gru_v1 import GRU
-  from keras.layers.rnn.gru_v1 import GRUCell
-  from keras.layers.rnn.lstm_v1 import LSTM
-  from keras.layers.rnn.lstm_v1 import LSTMCell
-  from keras.layers.rnn.gru import GRU as GRUV2
-  from keras.layers.rnn.gru import GRUCell as GRUCellV2
-  from keras.layers.rnn.lstm import LSTM as LSTMV2
-  from keras.layers.rnn.lstm import LSTMCell as LSTMCellV2
-  GRUV1 = GRU
-  GRUCellV1 = GRUCell
-  LSTMV1 = LSTM
-  LSTMCellV1 = LSTMCell
+    from keras.layers.rnn.gru import GRU as GRUV2
+    from keras.layers.rnn.gru import GRUCell as GRUCellV2
+    from keras.layers.rnn.gru_v1 import GRU
+    from keras.layers.rnn.gru_v1 import GRUCell
+    from keras.layers.rnn.lstm import LSTM as LSTMV2
+    from keras.layers.rnn.lstm import LSTMCell as LSTMCellV2
+    from keras.layers.rnn.lstm_v1 import LSTM
+    from keras.layers.rnn.lstm_v1 import LSTMCell

-# Convolutional-recurrent layers.
-from keras.layers.rnn.conv_lstm1d import ConvLSTM1D
-from keras.layers.rnn.conv_lstm2d import ConvLSTM2D
-from keras.layers.rnn.conv_lstm3d import ConvLSTM3D
-
-# cuDNN recurrent layers.
-from keras.layers.rnn.cudnn_lstm import CuDNNLSTM
-from keras.layers.rnn.cudnn_gru import CuDNNGRU
+    GRUV1 = GRU
+    GRUCellV1 = GRUCell
+    LSTMV1 = LSTM
+    LSTMCellV1 = LSTMCell

 # Wrapper functions.
 from keras.layers.rnn.base_wrapper import Wrapper
 from keras.layers.rnn.bidirectional import Bidirectional
-from keras.layers.rnn.time_distributed import TimeDistributed

 # RNN Cell wrappers.
 from keras.layers.rnn.cell_wrappers import DeviceWrapper
 from keras.layers.rnn.cell_wrappers import DropoutWrapper
 from keras.layers.rnn.cell_wrappers import ResidualWrapper
+
+# Convolutional-recurrent layers.
+from keras.layers.rnn.conv_lstm1d import ConvLSTM1D
+from keras.layers.rnn.conv_lstm2d import ConvLSTM2D
+from keras.layers.rnn.conv_lstm3d import ConvLSTM3D
+
+# cuDNN recurrent layers.
+from keras.layers.rnn.cudnn_gru import CuDNNGRU
+from keras.layers.rnn.cudnn_lstm import CuDNNLSTM +from keras.layers.rnn.time_distributed import TimeDistributed diff --git a/keras/layers/rnn/abstract_rnn_cell.py b/keras/layers/rnn/abstract_rnn_cell.py index 0ae557fc40ec..d097947a21e5 100644 --- a/keras/layers/rnn/abstract_rnn_cell.py +++ b/keras/layers/rnn/abstract_rnn_cell.py @@ -13,101 +13,103 @@ # limitations under the License. # ============================================================================== """Base class for RNN cells.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras.engine import base_layer from keras.layers.rnn import rnn_utils +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.AbstractRNNCell') +@keras_export("keras.layers.AbstractRNNCell") class AbstractRNNCell(base_layer.Layer): - """Abstract object representing an RNN cell. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - This is the base class for implementing RNN cells with custom behavior. - - Every `RNNCell` must have the properties below and implement `call` with - the signature `(output, next_state) = call(input, state)`. - - Examples: - - ```python - class MinimalRNNCell(AbstractRNNCell): - - def __init__(self, units, **kwargs): - self.units = units - super(MinimalRNNCell, self).__init__(**kwargs) - - @property - def state_size(self): - return self.units - - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.built = True - - def call(self, inputs, states): - prev_output = states[0] - h = backend.dot(inputs, self.kernel) - output = h + backend.dot(prev_output, self.recurrent_kernel) - return output, output - ``` - - This definition of cell differs from the definition used in the literature. - In the literature, 'cell' refers to an object with a single scalar output. - This definition refers to a horizontal array of such units. - - An RNN cell, in the most abstract setting, is anything that has - a state and performs some operation that takes a matrix of inputs. - This operation results in an output matrix with `self.output_size` columns. - If `self.state_size` is an integer, this operation also results in a new - state matrix with `self.state_size` columns. If `self.state_size` is a - (possibly nested tuple of) TensorShape object(s), then it should return a - matching structure of Tensors having shape `[batch_size].concatenate(s)` - for each `s` in `self.batch_size`. - """ - - def call(self, inputs, states): - """The function that contains the logic for one RNN step calculation. - - Args: - inputs: the input tensor, which is a slide from the overall RNN input by - the time dimension (usually the second dimension). - states: the state tensor from previous step, which has the same shape - as `(batch, state_size)`. In the case of timestep 0, it will be the - initial state user specified, or zero filled tensor otherwise. - - Returns: - A tuple of two tensors: - 1. output tensor for the current timestep, with size `output_size`. - 2. state tensor for next step, which has the shape of `state_size`. + """Abstract object representing an RNN cell. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. 
+ + This is the base class for implementing RNN cells with custom behavior. + + Every `RNNCell` must have the properties below and implement `call` with + the signature `(output, next_state) = call(input, state)`. + + Examples: + + ```python + class MinimalRNNCell(AbstractRNNCell): + + def __init__(self, units, **kwargs): + self.units = units + super(MinimalRNNCell, self).__init__(**kwargs) + + @property + def state_size(self): + return self.units + + def build(self, input_shape): + self.kernel = self.add_weight(shape=(input_shape[-1], self.units), + initializer='uniform', + name='kernel') + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer='uniform', + name='recurrent_kernel') + self.built = True + + def call(self, inputs, states): + prev_output = states[0] + h = backend.dot(inputs, self.kernel) + output = h + backend.dot(prev_output, self.recurrent_kernel) + return output, output + ``` + + This definition of cell differs from the definition used in the literature. + In the literature, 'cell' refers to an object with a single scalar output. + This definition refers to a horizontal array of such units. + + An RNN cell, in the most abstract setting, is anything that has + a state and performs some operation that takes a matrix of inputs. + This operation results in an output matrix with `self.output_size` columns. + If `self.state_size` is an integer, this operation also results in a new + state matrix with `self.state_size` columns. If `self.state_size` is a + (possibly nested tuple of) TensorShape object(s), then it should return a + matching structure of Tensors having shape `[batch_size].concatenate(s)` + for each `s` in `self.state_size`. """ - raise NotImplementedError - - @property - def state_size(self): - """size(s) of state(s) used by this cell. - - It can be represented by an Integer, a TensorShape or a tuple of Integers - or TensorShapes. - """ - raise NotImplementedError - - @property - def output_size(self): - """Integer or TensorShape: size of outputs produced by this cell.""" - raise NotImplementedError - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - return rnn_utils.generate_zero_filled_state_for_cell( - self, inputs, batch_size, dtype) + def call(self, inputs, states): + """The function that contains the logic for one RNN step calculation. + + Args: + inputs: the input tensor, which is a slice from the overall RNN input + along the time dimension (usually the second dimension). + states: the state tensor from previous step, which has the same shape + as `(batch, state_size)`. In the case of timestep 0, it will be the + initial state user specified, or zero-filled tensor otherwise. + + Returns: + A tuple of two tensors: + 1. output tensor for the current timestep, with size `output_size`. + 2. state tensor for next step, which has the shape of `state_size`. + """ + raise NotImplementedError + + @property + def state_size(self): + """size(s) of state(s) used by this cell. + + It can be represented by an Integer, a TensorShape or a tuple of + Integers or TensorShapes.
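As a rough usage sketch of the cell contract documented above (an aside, not part of this patch; the stand-in cell class, unit count, and shapes are illustrative assumptions): a cell exposing `state_size`, `output_size`, and a `call` returning `(output, new_states)` is driven across timesteps by `keras.layers.RNN`, and the default `get_initial_state` supplies zero-filled states via `generate_zero_filled_state_for_cell`.

```python
import tensorflow as tf

# Any cell satisfying the documented contract works; SimpleRNNCell is used
# here purely as a stand-in.
cell = tf.keras.layers.SimpleRNNCell(4)
layer = tf.keras.layers.RNN(cell)      # drives cell.call() over timesteps
outputs = layer(tf.zeros((2, 3, 5)))   # (batch, time, features) -> (2, 4)

# Default zero-filled initial state(s), matching state_size.
init_state = cell.get_initial_state(batch_size=2, dtype=tf.float32)
```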
+ """ + raise NotImplementedError + + @property + def output_size(self): + """Integer or TensorShape: size of outputs produced by this cell.""" + raise NotImplementedError + + def get_initial_state(self, inputs=None, batch_size=None, dtype=None): + return rnn_utils.generate_zero_filled_state_for_cell( + self, inputs, batch_size, dtype + ) diff --git a/keras/layers/rnn/base_conv_lstm.py b/keras/layers/rnn/base_conv_lstm.py index ef753cc94acb..b3280d5ac63b 100644 --- a/keras/layers/rnn/base_conv_lstm.py +++ b/keras/layers/rnn/base_conv_lstm.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Base class for N-D convolutional LSTM layers.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras import activations from keras import backend @@ -24,577 +26,617 @@ from keras.layers.rnn.base_conv_rnn import ConvRNN from keras.layers.rnn.dropout_rnn_cell_mixin import DropoutRNNCellMixin from keras.utils import conv_utils -import tensorflow.compat.v2 as tf class ConvLSTMCell(DropoutRNNCellMixin, base_layer.BaseRandomLayer): - """Cell class for the ConvLSTM layer. - - Args: - rank: Integer, rank of the convolution, e.g. "2" for 2D convolutions. - filters: Integer, the dimensionality of the output space (i.e. the number of - output filters in the convolution). - kernel_size: An integer or tuple/list of n integers, specifying the - dimensions of the convolution window. - strides: An integer or tuple/list of n integers, specifying the strides of - the convolution. Specifying any stride value != 1 is incompatible with - specifying any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means no - padding. `"same"` results in padding evenly to the left/right or up/down - of the input such that output has the same height/width dimension as the - input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - It defaults to the `image_data_format` value found in your Keras config - file at `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". - dilation_rate: An integer or tuple/list of n integers, specifying the - dilation rate to use for dilated convolution. Currently, specifying any - `dilation_rate` value != 1 is incompatible with specifying any `strides` - value != 1. - activation: Activation function to use. If you don't specify anything, no - activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use for the recurrent step. - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at - initialization. Use in combination with `bias_initializer="zeros"`. This - is recommended in [Jozefowicz et al., 2015]( - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. 
- kernel_constraint: Constraint function applied to the `kernel` weights - matrix. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. - bias_constraint: Constraint function applied to the bias vector. - dropout: Float between 0 and 1. Fraction of the units to drop for the linear - transformation of the inputs. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. - Call arguments: - inputs: A (2+ `rank`)D tensor. - states: List of state tensors corresponding to the previous timestep. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. Only relevant when `dropout` or - `recurrent_dropout` is used. - """ - - def __init__(self, - rank, - filters, - kernel_size, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0.0, - recurrent_dropout=0.0, - **kwargs): - super().__init__(**kwargs) - self.rank = rank - if self.rank > 3: - raise ValueError(f'Rank {rank} convolutions are not currently ' - f'implemented. Received: rank={rank}') - self.filters = filters - self.kernel_size = conv_utils.normalize_tuple(kernel_size, self.rank, - 'kernel_size') - self.strides = conv_utils.normalize_tuple( - strides, self.rank, 'strides', allow_zero=True) - self.padding = conv_utils.normalize_padding(padding) - self.data_format = conv_utils.normalize_data_format(data_format) - self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, self.rank, - 'dilation_rate') - self.activation = activations.get(activation) - self.recurrent_activation = activations.get(recurrent_activation) - self.use_bias = use_bias - - self.kernel_initializer = initializers.get(kernel_initializer) - self.recurrent_initializer = initializers.get(recurrent_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.unit_forget_bias = unit_forget_bias - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.recurrent_regularizer = regularizers.get(recurrent_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.recurrent_constraint = constraints.get(recurrent_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - self.dropout = min(1.0, max(0.0, dropout)) - self.recurrent_dropout = min(1.0, max(0.0, recurrent_dropout)) - self.state_size = (self.filters, self.filters) - - def build(self, input_shape): - - if self.data_format == 'channels_first': - channel_axis = 1 - else: - channel_axis = -1 - if input_shape[channel_axis] is None: - raise ValueError( - 'The channel dimension of the inputs (last axis) should be defined. ' - f'Found None. 
Full input shape received: input_shape={input_shape}') - input_dim = input_shape[channel_axis] - self.kernel_shape = self.kernel_size + (input_dim, self.filters * 4) - recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4) - - self.kernel = self.add_weight( - shape=self.kernel_shape, - initializer=self.kernel_initializer, - name='kernel', - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - self.recurrent_kernel = self.add_weight( - shape=recurrent_kernel_shape, - initializer=self.recurrent_initializer, - name='recurrent_kernel', - regularizer=self.recurrent_regularizer, - constraint=self.recurrent_constraint) - - if self.use_bias: - if self.unit_forget_bias: - - def bias_initializer(_, *args, **kwargs): - return backend.concatenate([ - self.bias_initializer((self.filters,), *args, **kwargs), - initializers.get('ones')((self.filters,), *args, **kwargs), - self.bias_initializer((self.filters * 2,), *args, **kwargs), - ]) - else: - bias_initializer = self.bias_initializer - self.bias = self.add_weight( - shape=(self.filters * 4,), - name='bias', - initializer=bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - else: - self.bias = None - self.built = True - - def call(self, inputs, states, training=None): - h_tm1 = states[0] # previous memory state - c_tm1 = states[1] # previous carry state - - # dropout matrices for input units - dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4) - # dropout matrices for recurrent units - rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( - h_tm1, training, count=4) - - if 0 < self.dropout < 1.: - inputs_i = inputs * dp_mask[0] - inputs_f = inputs * dp_mask[1] - inputs_c = inputs * dp_mask[2] - inputs_o = inputs * dp_mask[3] - else: - inputs_i = inputs - inputs_f = inputs - inputs_c = inputs - inputs_o = inputs - - if 0 < self.recurrent_dropout < 1.: - h_tm1_i = h_tm1 * rec_dp_mask[0] - h_tm1_f = h_tm1 * rec_dp_mask[1] - h_tm1_c = h_tm1 * rec_dp_mask[2] - h_tm1_o = h_tm1 * rec_dp_mask[3] - else: - h_tm1_i = h_tm1 - h_tm1_f = h_tm1 - h_tm1_c = h_tm1 - h_tm1_o = h_tm1 - - (kernel_i, kernel_f, kernel_c, kernel_o) = tf.split( - self.kernel, 4, axis=self.rank + 1) - (recurrent_kernel_i, recurrent_kernel_f, recurrent_kernel_c, - recurrent_kernel_o) = tf.split( - self.recurrent_kernel, 4, axis=self.rank + 1) - - if self.use_bias: - bias_i, bias_f, bias_c, bias_o = tf.split(self.bias, 4) - else: - bias_i, bias_f, bias_c, bias_o = None, None, None, None - - x_i = self.input_conv(inputs_i, kernel_i, bias_i, padding=self.padding) - x_f = self.input_conv(inputs_f, kernel_f, bias_f, padding=self.padding) - x_c = self.input_conv(inputs_c, kernel_c, bias_c, padding=self.padding) - x_o = self.input_conv(inputs_o, kernel_o, bias_o, padding=self.padding) - h_i = self.recurrent_conv(h_tm1_i, recurrent_kernel_i) - h_f = self.recurrent_conv(h_tm1_f, recurrent_kernel_f) - h_c = self.recurrent_conv(h_tm1_c, recurrent_kernel_c) - h_o = self.recurrent_conv(h_tm1_o, recurrent_kernel_o) - - i = self.recurrent_activation(x_i + h_i) - f = self.recurrent_activation(x_f + h_f) - c = f * c_tm1 + i * self.activation(x_c + h_c) - o = self.recurrent_activation(x_o + h_o) - h = o * self.activation(c) - return h, [h, c] - - @property - def _conv_func(self): - if self.rank == 1: - return backend.conv1d - if self.rank == 2: - return backend.conv2d - if self.rank == 3: - return backend.conv3d - - def input_conv(self, x, w, b=None, padding='valid'): - conv_out = self._conv_func( - x, - w, - 
strides=self.strides, - padding=padding, - data_format=self.data_format, - dilation_rate=self.dilation_rate) - if b is not None: - conv_out = backend.bias_add(conv_out, b, data_format=self.data_format) - return conv_out - - def recurrent_conv(self, x, w): - strides = conv_utils.normalize_tuple( - 1, self.rank, 'strides', allow_zero=True) - conv_out = self._conv_func( - x, w, strides=strides, padding='same', data_format=self.data_format) - return conv_out - - def get_config(self): - config = { - 'filters': + """Cell class for the ConvLSTM layer. + + Args: + rank: Integer, rank of the convolution, e.g. "2" for 2D convolutions. + filters: Integer, the dimensionality of the output space (i.e. the number + of output filters in the convolution). + kernel_size: An integer or tuple/list of n integers, specifying the + dimensions of the convolution window. + strides: An integer or tuple/list of n integers, specifying the strides of + the convolution. Specifying any stride value != 1 is incompatible with + specifying any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means + no padding. `"same"` results in padding evenly to the left/right or + up/down of the input such that output has the same height/width + dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: An integer or tuple/list of n integers, specifying the + dilation rate to use for dilated convolution. Currently, specifying any + `dilation_rate` value != 1 is incompatible with specifying any `strides` + value != 1. + activation: Activation function to use. If you don't specify anything, no + activation is applied + (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use for the recurrent step. + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, used for + the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate + at initialization. Use in combination with `bias_initializer="zeros"`. + This is recommended in [Jozefowicz et al., 2015]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + dropout: Float between 0 and 1. Fraction of the units to drop for the + linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. Fraction of the units to drop + for the linear transformation of the recurrent state. + Call arguments: + inputs: A (2+ `rank`)D tensor. + states: List of state tensors corresponding to the previous timestep. 
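For orientation (an aside, not part of this patch; the filter count and shapes below are illustrative assumptions), the rank-2 public wrapper built on this cell consumes 5-D sequences shaped `(batch, time, rows, cols, channels)` under `channels_last`:

```python
import tensorflow as tf

x = tf.random.normal((4, 10, 32, 32, 3))   # (batch, time, rows, cols, ch)
layer = tf.keras.layers.ConvLSTM2D(
    filters=8, kernel_size=3, padding="same", return_sequences=True)
y = layer(x)   # -> (4, 10, 32, 32, 8); "same" padding preserves rows/cols
```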
+ training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. Only relevant when `dropout` or + `recurrent_dropout` is used. + """ + + def __init__( + self, + rank, + filters, + kernel_size, + strides=1, + padding="valid", + data_format=None, + dilation_rate=1, + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + self.rank = rank + if self.rank > 3: + raise ValueError( + f"Rank {rank} convolutions are not currently " + f"implemented. Received: rank={rank}" + ) + self.filters = filters + self.kernel_size = conv_utils.normalize_tuple( + kernel_size, self.rank, "kernel_size" + ) + self.strides = conv_utils.normalize_tuple( + strides, self.rank, "strides", allow_zero=True + ) + self.padding = conv_utils.normalize_padding(padding) + self.data_format = conv_utils.normalize_data_format(data_format) + self.dilation_rate = conv_utils.normalize_tuple( + dilation_rate, self.rank, "dilation_rate" + ) + self.activation = activations.get(activation) + self.recurrent_activation = activations.get(recurrent_activation) + self.use_bias = use_bias + + self.kernel_initializer = initializers.get(kernel_initializer) + self.recurrent_initializer = initializers.get(recurrent_initializer) + self.bias_initializer = initializers.get(bias_initializer) + self.unit_forget_bias = unit_forget_bias + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.recurrent_regularizer = regularizers.get(recurrent_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.recurrent_constraint = constraints.get(recurrent_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.dropout = min(1.0, max(0.0, dropout)) + self.recurrent_dropout = min(1.0, max(0.0, recurrent_dropout)) + self.state_size = (self.filters, self.filters) + + def build(self, input_shape): + super().build(input_shape) + if self.data_format == "channels_first": + channel_axis = 1 + else: + channel_axis = -1 + if input_shape[channel_axis] is None: + raise ValueError( + "The channel dimension of the inputs (last axis) should be " + "defined. Found None. 
Full input shape received: " + f"input_shape={input_shape}" + ) + input_dim = input_shape[channel_axis] + self.kernel_shape = self.kernel_size + (input_dim, self.filters * 4) + recurrent_kernel_shape = self.kernel_size + ( self.filters, - 'kernel_size': - self.kernel_size, - 'strides': - self.strides, - 'padding': - self.padding, - 'data_format': - self.data_format, - 'dilation_rate': - self.dilation_rate, - 'activation': - activations.serialize(self.activation), - 'recurrent_activation': - activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'unit_forget_bias': - self.unit_forget_bias, - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + self.filters * 4, + ) + + self.kernel = self.add_weight( + shape=self.kernel_shape, + initializer=self.kernel_initializer, + name="kernel", + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + self.recurrent_kernel = self.add_weight( + shape=recurrent_kernel_shape, + initializer=self.recurrent_initializer, + name="recurrent_kernel", + regularizer=self.recurrent_regularizer, + constraint=self.recurrent_constraint, + ) + + if self.use_bias: + if self.unit_forget_bias: + + def bias_initializer(_, *args, **kwargs): + return backend.concatenate( + [ + self.bias_initializer( + (self.filters,), *args, **kwargs + ), + initializers.get("ones")( + (self.filters,), *args, **kwargs + ), + self.bias_initializer( + (self.filters * 2,), *args, **kwargs + ), + ] + ) + + else: + bias_initializer = self.bias_initializer + self.bias = self.add_weight( + shape=(self.filters * 4,), + name="bias", + initializer=bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + ) + else: + self.bias = None + self.built = True + + def call(self, inputs, states, training=None): + h_tm1 = states[0] # previous memory state + c_tm1 = states[1] # previous carry state + + # dropout matrices for input units + dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4) + # dropout matrices for recurrent units + rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( + h_tm1, training, count=4 + ) + + if 0 < self.dropout < 1.0: + inputs_i = inputs * dp_mask[0] + inputs_f = inputs * dp_mask[1] + inputs_c = inputs * dp_mask[2] + inputs_o = inputs * dp_mask[3] + else: + inputs_i = inputs + inputs_f = inputs + inputs_c = inputs + inputs_o = inputs + + if 0 < self.recurrent_dropout < 1.0: + h_tm1_i = h_tm1 * rec_dp_mask[0] + h_tm1_f = h_tm1 * rec_dp_mask[1] + h_tm1_c = h_tm1 * rec_dp_mask[2] + h_tm1_o = h_tm1 * rec_dp_mask[3] + else: + h_tm1_i = h_tm1 + h_tm1_f = h_tm1 + h_tm1_c = h_tm1 + h_tm1_o = h_tm1 + + (kernel_i, kernel_f, kernel_c, kernel_o) = tf.split( + self.kernel, 4, axis=self.rank + 1 + ) + ( + 
recurrent_kernel_i, + recurrent_kernel_f, + recurrent_kernel_c, + recurrent_kernel_o, + ) = tf.split(self.recurrent_kernel, 4, axis=self.rank + 1) + + if self.use_bias: + bias_i, bias_f, bias_c, bias_o = tf.split(self.bias, 4) + else: + bias_i, bias_f, bias_c, bias_o = None, None, None, None + + x_i = self.input_conv(inputs_i, kernel_i, bias_i, padding=self.padding) + x_f = self.input_conv(inputs_f, kernel_f, bias_f, padding=self.padding) + x_c = self.input_conv(inputs_c, kernel_c, bias_c, padding=self.padding) + x_o = self.input_conv(inputs_o, kernel_o, bias_o, padding=self.padding) + h_i = self.recurrent_conv(h_tm1_i, recurrent_kernel_i) + h_f = self.recurrent_conv(h_tm1_f, recurrent_kernel_f) + h_c = self.recurrent_conv(h_tm1_c, recurrent_kernel_c) + h_o = self.recurrent_conv(h_tm1_o, recurrent_kernel_o) + + i = self.recurrent_activation(x_i + h_i) + f = self.recurrent_activation(x_f + h_f) + c = f * c_tm1 + i * self.activation(x_c + h_c) + o = self.recurrent_activation(x_o + h_o) + h = o * self.activation(c) + return h, [h, c] + + @property + def _conv_func(self): + if self.rank == 1: + return backend.conv1d + if self.rank == 2: + return backend.conv2d + if self.rank == 3: + return backend.conv3d + + def input_conv(self, x, w, b=None, padding="valid"): + conv_out = self._conv_func( + x, + w, + strides=self.strides, + padding=padding, + data_format=self.data_format, + dilation_rate=self.dilation_rate, + ) + if b is not None: + conv_out = backend.bias_add( + conv_out, b, data_format=self.data_format + ) + return conv_out + + def recurrent_conv(self, x, w): + strides = conv_utils.normalize_tuple( + 1, self.rank, "strides", allow_zero=True + ) + conv_out = self._conv_func( + x, w, strides=strides, padding="same", data_format=self.data_format + ) + return conv_out + + def get_config(self): + config = { + "filters": self.filters, + "kernel_size": self.kernel_size, + "strides": self.strides, + "padding": self.padding, + "data_format": self.data_format, + "dilation_rate": self.dilation_rate, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "unit_forget_bias": self.unit_forget_bias, + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) class ConvLSTM(ConvRNN): - """Abstract N-D Convolutional LSTM layer (used as implementation base). - - Similar to an LSTM layer, but the input transformations - and recurrent transformations are both convolutional. - - Args: - rank: Integer, rank of the convolution, e.g. "2" for 2D convolutions. - filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). 
- kernel_size: An integer or tuple/list of n integers, specifying the - dimensions of the convolution window. - strides: An integer or tuple/list of n integers, - specifying the strides of the convolution. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, - one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, time, ..., channels)` - while `channels_first` corresponds to - inputs with shape `(batch, time, channels, ...)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". - dilation_rate: An integer or tuple/list of n integers, specifying - the dilation rate to use for dilated convolution. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any `strides` value != 1. - activation: Activation function to use. - By default hyperbolic tangent activation function is applied - (`tanh(x)`). - recurrent_activation: Activation function to use - for the recurrent step. - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, - used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - unit_forget_bias: Boolean. - If True, add 1 to the bias of the forget gate at initialization. - Use in combination with `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al., 2015]( - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to. - kernel_constraint: Constraint function applied to - the `kernel` weights matrix. - recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. - return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. (default False) - return_state: Boolean Whether to return the last state - in addition to the output. (default False) - go_backwards: Boolean (default False). - If True, process the input sequence backwards. - stateful: Boolean (default False). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. 
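A small sketch of the `return_state` behavior described in this docstring (an aside, not part of this patch; shapes are illustrative assumptions): with `return_state=True` the layer returns the output plus the final hidden and carry states, mirroring the `return h, [h, c]` contract of the cell's `call` above.

```python
import tensorflow as tf

layer = tf.keras.layers.ConvLSTM2D(
    filters=8, kernel_size=3, padding="same", return_state=True)
out, state_h, state_c = layer(tf.zeros((2, 5, 16, 16, 1)))
# out, state_h and state_c each have shape (2, 16, 16, 8); with
# return_sequences=False, out is the same tensor as state_h.
```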
- """ - - def __init__(self, - rank, - filters, - kernel_size, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - dropout=0.0, - recurrent_dropout=0.0, - **kwargs): - cell = ConvLSTMCell( - rank=rank, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - unit_forget_bias=unit_forget_bias, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - dtype=kwargs.get('dtype')) - super().__init__( + """Abstract N-D Convolutional LSTM layer (used as implementation base). + + Similar to an LSTM layer, but the input transformations + and recurrent transformations are both convolutional. + + Args: + rank: Integer, rank of the convolution, e.g. "2" for 2D convolutions. + filters: Integer, the dimensionality of the output space + (i.e. the number of output filters in the convolution). + kernel_size: An integer or tuple/list of n integers, specifying the + dimensions of the convolution window. + strides: An integer or tuple/list of n integers, + specifying the strides of the convolution. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, time, ..., channels)` + while `channels_first` corresponds to + inputs with shape `(batch, time, channels, ...)`. + When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: An integer or tuple/list of n integers, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any `strides` value != 1. + activation: Activation function to use. + By default hyperbolic tangent activation function is applied + (`tanh(x)`). + recurrent_activation: Activation function to use + for the recurrent step. + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. 
+ recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, + used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + unit_forget_bias: Boolean. + If True, add 1 to the bias of the forget gate at initialization. + Use in combination with `bias_initializer="zeros"`. + This is recommended in [Jozefowicz et al., 2015]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + recurrent_regularizer: Regularizer function applied to + the `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of + the layer (its "activation"). + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + recurrent_constraint: Constraint function applied to + the `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + return_sequences: Boolean. Whether to return the last output + in the output sequence, or the full sequence. (default False) + return_state: Boolean. Whether to return the last state + in addition to the output. (default False) + go_backwards: Boolean (default False). + If True, process the input sequence backwards. + stateful: Boolean (default False). If True, the last state + for each sample at index i in a batch will be used as initial + state for the sample of index i in the following batch. + dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the recurrent state. + """ + + def __init__( + self, rank, - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - - def call(self, inputs, mask=None, training=None, initial_state=None): - return super().call( - inputs, mask=mask, training=training, initial_state=initial_state) - - @property - def filters(self): - return self.cell.filters - - @property - def kernel_size(self): - return self.cell.kernel_size - - @property - def strides(self): - return self.cell.strides - - @property - def padding(self): - return self.cell.padding - - @property - def data_format(self): - return self.cell.data_format - - @property - def dilation_rate(self): - return self.cell.dilation_rate - - @property - def activation(self): - return self.cell.activation - - @property - def recurrent_activation(self): - return self.cell.recurrent_activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def unit_forget_bias(self): - return self.cell.unit_forget_bias - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - -
@property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - def get_config(self): - config = {'filters': self.filters, - 'kernel_size': self.kernel_size, - 'strides': self.strides, - 'padding': self.padding, - 'data_format': self.data_format, - 'dilation_rate': self.dilation_rate, - 'activation': activations.serialize(self.activation), - 'recurrent_activation': activations.serialize( - self.recurrent_activation), - 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize( - self.kernel_initializer), - 'recurrent_initializer': initializers.serialize( - self.recurrent_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'unit_forget_bias': self.unit_forget_bias, - 'kernel_regularizer': regularizers.serialize( - self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize( - self.recurrent_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize( - self.activity_regularizer), - 'kernel_constraint': constraints.serialize( - self.kernel_constraint), - 'recurrent_constraint': constraints.serialize( - self.recurrent_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint), - 'dropout': self.dropout, - 'recurrent_dropout': self.recurrent_dropout} - base_config = super().get_config() - del base_config['cell'] - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - return cls(**config) + filters, + kernel_size, + strides=1, + padding="valid", + data_format=None, + dilation_rate=1, + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + dropout=0.0, + recurrent_dropout=0.0, + **kwargs, + ): + cell = ConvLSTMCell( + rank=rank, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + unit_forget_bias=unit_forget_bias, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + name="conv_lstm_cell", + dtype=kwargs.get("dtype"), + ) + super().__init__( + rank, + cell, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + **kwargs, + ) + self.activity_regularizer = regularizers.get(activity_regularizer) + + def call(self, inputs, mask=None, training=None, initial_state=None): + return super().call( + inputs, mask=mask, training=training, initial_state=initial_state + ) + + @property + def filters(self): + 
return self.cell.filters + + @property + def kernel_size(self): + return self.cell.kernel_size + + @property + def strides(self): + return self.cell.strides + + @property + def padding(self): + return self.cell.padding + + @property + def data_format(self): + return self.cell.data_format + + @property + def dilation_rate(self): + return self.cell.dilation_rate + + @property + def activation(self): + return self.cell.activation + + @property + def recurrent_activation(self): + return self.cell.recurrent_activation + + @property + def use_bias(self): + return self.cell.use_bias + + @property + def kernel_initializer(self): + return self.cell.kernel_initializer + + @property + def recurrent_initializer(self): + return self.cell.recurrent_initializer + + @property + def bias_initializer(self): + return self.cell.bias_initializer + + @property + def unit_forget_bias(self): + return self.cell.unit_forget_bias + + @property + def kernel_regularizer(self): + return self.cell.kernel_regularizer + + @property + def recurrent_regularizer(self): + return self.cell.recurrent_regularizer + + @property + def bias_regularizer(self): + return self.cell.bias_regularizer + + @property + def kernel_constraint(self): + return self.cell.kernel_constraint + + @property + def recurrent_constraint(self): + return self.cell.recurrent_constraint + + @property + def bias_constraint(self): + return self.cell.bias_constraint + + @property + def dropout(self): + return self.cell.dropout + + @property + def recurrent_dropout(self): + return self.cell.recurrent_dropout + + def get_config(self): + config = { + "filters": self.filters, + "kernel_size": self.kernel_size, + "strides": self.strides, + "padding": self.padding, + "data_format": self.data_format, + "dilation_rate": self.dilation_rate, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "unit_forget_bias": self.unit_forget_bias, + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + } + base_config = super().get_config() + del base_config["cell"] + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + return cls(**config) diff --git a/keras/layers/rnn/base_conv_rnn.py b/keras/layers/rnn/base_conv_rnn.py index 86f2babe950b..bdeef1155cd4 100644 --- a/keras/layers/rnn/base_conv_rnn.py +++ b/keras/layers/rnn/base_conv_rnn.py @@ -13,7 +13,10 @@ # limitations under the License. 
# ============================================================================== """Base class for convolutional-recurrent layers.""" -# pylint: disable=g-classes-have-attributes + + +import numpy as np +import tensorflow.compat.v2 as tf from keras import backend from keras.engine import base_layer @@ -22,369 +25,413 @@ from keras.utils import conv_utils from keras.utils import generic_utils from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf class ConvRNN(RNN): - """N-Dimensional Base class for convolutional-recurrent layers. - - Args: - rank: Integer, rank of the convolution, e.g. "2" for 2D convolutions. - cell: A RNN cell instance. A RNN cell is a class that has: - a - `call(input_at_t, states_at_t)` method, returning `(output_at_t, - states_at_t_plus_1)`. The call method of the cell can also take the - optional argument `constants`, see section "Note on passing external - constants" below. - a `state_size` attribute. This can be a single integer - (single state) in which case it is the number of channels of the recurrent - state (which should be the same as the number of channels of the cell - output). This can also be a list/tuple of integers (one size per state). - In this case, the first entry (`state_size[0]`) should be the same as the - size of the cell output. - return_sequences: Boolean. Whether to return the last output. in the output - sequence, or the full sequence. - return_state: Boolean. Whether to return the last state in addition to the - output. - go_backwards: Boolean (default False). If True, process the input sequence - backwards and return the reversed sequence. - stateful: Boolean (default False). If True, the last state for each sample - at index i in a batch will be used as initial state for the sample of - index i in the following batch. - input_shape: Use this argument to specify the shape of the input when this - layer is the first one in a model. - Call arguments: - inputs: A (2 + `rank`)D tensor. - mask: Binary tensor of shape `(samples, timesteps)` indicating whether a - given timestep should be masked. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is for use with cells that use dropout. - initial_state: List of initial state tensors to be passed to the first call - of the cell. - constants: List of constant tensors to be passed to the cell at each - timestep. - Input shape: - (3 + `rank`)D tensor with shape: `(samples, timesteps, channels, - img_dimensions...)` - if data_format='channels_first' or shape: `(samples, timesteps, - img_dimensions..., channels)` if data_format='channels_last'. - Output shape: - - If `return_state`: a list of tensors. The first tensor is the output. The - remaining tensors are the last states, - each (2 + `rank`)D tensor with shape: `(samples, filters, + """N-Dimensional Base class for convolutional-recurrent layers. + + Args: + rank: Integer, rank of the convolution, e.g. "2" for 2D convolutions. + cell: A RNN cell instance. A RNN cell is a class that has: - a + `call(input_at_t, states_at_t)` method, returning `(output_at_t, + states_at_t_plus_1)`. The call method of the cell can also take the + optional argument `constants`, see section "Note on passing external + constants" below. - a `state_size` attribute. 
This can be a single + integer (single state) in which case it is the number of channels of the + recurrent state (which should be the same as the number of channels of + the cell output). This can also be a list/tuple of integers (one size + per state). In this case, the first entry (`state_size[0]`) should be + the same as the size of the cell output. + return_sequences: Boolean. Whether to return the last output in the + output sequence, or the full sequence. + return_state: Boolean. Whether to return the last state in addition to the + output. + go_backwards: Boolean (default False). If True, process the input sequence + backwards and return the reversed sequence. + stateful: Boolean (default False). If True, the last state for each sample + at index i in a batch will be used as initial state for the sample of + index i in the following batch. + input_shape: Use this argument to specify the shape of the input when this + layer is the first one in a model. + Call arguments: + inputs: A (2 + `rank`)D tensor. + mask: Binary tensor of shape `(samples, timesteps)` indicating whether a + given timestep should be masked. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is for use with cells that use dropout. + initial_state: List of initial state tensors to be passed to the first + call of the cell. + constants: List of constant tensors to be passed to the cell at each + timestep. + Input shape: + (3 + `rank`)D tensor with shape: `(samples, timesteps, channels, + img_dimensions...)` + if data_format='channels_first' or shape: `(samples, timesteps, + img_dimensions..., channels)` if data_format='channels_last'. + Output shape: + - If `return_state`: a list of tensors. The first tensor is the output. + The remaining tensors are the last states, + each (2 + `rank`)D tensor with shape: `(samples, filters, + new_img_dimensions...)` if data_format='channels_first' + or shape: `(samples, new_img_dimensions..., filters)` if + data_format='channels_last'. img_dimension values might have changed + due to padding. + - If `return_sequences`: (3 + `rank`)D tensor with shape: `(samples, + timesteps, filters, new_img_dimensions...)` if + data_format='channels_first' + or shape: `(samples, timesteps, new_img_dimensions..., filters)` if + data_format='channels_last'. + - Else, (2 + `rank`)D tensor with shape: `(samples, filters, new_img_dimensions...)` if data_format='channels_first' - or shape: `(samples, new_img_dimensions..., filters)` if - data_format='channels_last'. img_dimension values might have changed due - to padding. - - If `return_sequences`: (3 + `rank`)D tensor with shape: `(samples, - timesteps, filters, new_img_dimensions...)` if - data_format='channels_first' - or shape: `(samples, timesteps, new_img_dimensions..., filters)` if - data_format='channels_last'. - - Else, (2 + `rank`)D tensor with shape: `(samples, filters, - new_img_dimensions...)` if data_format='channels_first' - or shape: `(samples, new_img_dimensions..., filters)` if - data_format='channels_last'. - Masking: This layer supports masking for input data with a variable number of - timesteps. - Note on using statefulness in RNNs: You can set RNN layers to be 'stateful', - which means that the states computed for the samples in one batch will be - reused as initial states for the samples in the next batch. This assumes a - one-to-one mapping between samples in different successive batches.
- To enable statefulness: - Specify `stateful=True` in the layer constructor. - - Specify a fixed batch size for your model, by passing - - If sequential model: `batch_input_shape=(...)` to the first layer in - your model. - - If functional model with 1 or more Input layers: `batch_shape=(...)` - to all the first layers in your model. This is the expected shape of - your inputs *including the batch size*. It should be a tuple of - integers, e.g. `(32, 10, 100, 100, 32)`. for rank 2 convolution Note - that the image dimensions should be specified too. - Specify - `shuffle=False` when calling fit(). To reset the states of your - model, call `.reset_states()` on either a specific layer, or on your - entire model. - Note on specifying the initial state of RNNs: You can specify the initial - state of RNN layers symbolically by calling them with the keyword argument - `initial_state`. The value of `initial_state` should be a tensor or list of - tensors representing the initial state of the RNN layer. You can specify the - initial state of RNN layers numerically by calling `reset_states` with the - keyword argument `states`. The value of `states` should be a numpy array or - list of numpy arrays representing the initial state of the RNN layer. - Note on passing external constants to RNNs: You can pass "external" constants - to the cell using the `constants` keyword argument of `RNN.__call__` (as - well as `RNN.call`) method. This requires that the `cell.call` method - accepts the same keyword argument `constants`. Such constants can be used to - condition the cell transformation on additional static inputs (not changing - over time), a.k.a. an attention mechanism. - """ - - def __init__(self, - rank, - cell, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - **kwargs): - if unroll: - raise TypeError( - 'Unrolling is not possible with convolutional RNNs. ' - f'Received: unroll={unroll}') - if isinstance(cell, (list, tuple)): - # The StackedConvRNN3DCells isn't implemented yet. - raise TypeError('It is not possible at the moment to' - 'stack convolutional cells. Only pass a single cell ' - 'instance as the `cell` argument. 
Received: ' - f'cell={cell}') - super().__init__(cell, return_sequences, return_state, - go_backwards, stateful, unroll, **kwargs) - self.rank = rank - self.input_spec = [InputSpec(ndim=rank + 3)] - self.states = None - self._num_constants = None - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - if isinstance(input_shape, list): - input_shape = input_shape[0] - - cell = self.cell - if cell.data_format == 'channels_first': - img_dims = input_shape[3:] - elif cell.data_format == 'channels_last': - img_dims = input_shape[2:-1] - - norm_img_dims = tuple([ - conv_utils.conv_output_length( # pylint: disable=g-complex-comprehension - img_dims[idx], - cell.kernel_size[idx], - padding=cell.padding, - stride=cell.strides[idx], - dilation=cell.dilation_rate[idx]) for idx in range(len(img_dims)) - ]) - - if cell.data_format == 'channels_first': - output_shape = input_shape[:2] + (cell.filters,) + norm_img_dims - elif cell.data_format == 'channels_last': - output_shape = input_shape[:2] + norm_img_dims + (cell.filters,) - - if not self.return_sequences: - output_shape = output_shape[:1] + output_shape[2:] - - if self.return_state: - output_shape = [output_shape] - if cell.data_format == 'channels_first': - output_shape += [ - (input_shape[0], cell.filters) + norm_img_dims for _ in range(2) - ] - elif cell.data_format == 'channels_last': - output_shape += [(input_shape[0],) + norm_img_dims + (cell.filters,) - for _ in range(2)] - return output_shape - - @tf_utils.shape_type_conversion - def build(self, input_shape): - # Note input_shape will be list of shapes of initial states and - # constants if these are passed in __call__. - if self._num_constants is not None: - constants_shape = input_shape[-self._num_constants:] # pylint: disable=invalid-unary-operand-type - else: - constants_shape = None - - if isinstance(input_shape, list): - input_shape = input_shape[0] - - batch_size = input_shape[0] if self.stateful else None - self.input_spec[0] = InputSpec( - shape=(batch_size, None) + input_shape[2:self.rank + 3]) - - # allow cell (if layer) to build before we set or validate state_spec - if isinstance(self.cell, base_layer.Layer): - step_input_shape = (input_shape[0],) + input_shape[2:] - if constants_shape is not None: - self.cell.build([step_input_shape] + constants_shape) - else: - self.cell.build(step_input_shape) - - # set or validate state_spec - if hasattr(self.cell.state_size, '__len__'): - state_size = list(self.cell.state_size) - else: - state_size = [self.cell.state_size] - - if self.state_spec is not None: - # initial_state was passed in call, check compatibility - if self.cell.data_format == 'channels_first': - ch_dim = 1 - elif self.cell.data_format == 'channels_last': - ch_dim = self.rank + 1 - if [spec.shape[ch_dim] for spec in self.state_spec] != state_size: - raise ValueError( - 'An `initial_state` was passed that is not compatible with ' - '`cell.state_size`. Received state shapes ' - f'{[spec.shape for spec in self.state_spec]}. 
' - f'However `cell.state_size` is {self.cell.state_size}') - else: - img_dims = tuple((None for _ in range(self.rank))) - if self.cell.data_format == 'channels_first': - self.state_spec = [ - InputSpec(shape=(None, dim) + img_dims) for dim in state_size - ] - elif self.cell.data_format == 'channels_last': - self.state_spec = [ - InputSpec(shape=(None,) + img_dims + (dim,)) for dim in state_size - ] - if self.stateful: - self.reset_states() - self.built = True - - def get_initial_state(self, inputs): - # (samples, timesteps, img_dims..., filters) - initial_state = backend.zeros_like(inputs) - # (samples, img_dims..., filters) - initial_state = backend.sum(initial_state, axis=1) - shape = list(self.cell.kernel_shape) - shape[-1] = self.cell.filters - initial_state = self.cell.input_conv(initial_state, - tf.zeros(tuple(shape), - initial_state.dtype), - padding=self.cell.padding) - - if hasattr(self.cell.state_size, '__len__'): - return [initial_state for _ in self.cell.state_size] - else: - return [initial_state] - - def call(self, - inputs, - mask=None, - training=None, - initial_state=None, - constants=None): - # note that the .build() method of subclasses MUST define - # self.input_spec and self.state_spec with complete input shapes. - inputs, initial_state, constants = self._process_inputs( - inputs, initial_state, constants) - - if isinstance(mask, list): - mask = mask[0] - timesteps = backend.int_shape(inputs)[1] - - kwargs = {} - if generic_utils.has_arg(self.cell.call, 'training'): - kwargs['training'] = training - - if constants: - if not generic_utils.has_arg(self.cell.call, 'constants'): - raise ValueError( - f'RNN cell {self.cell} does not support constants. ' - f'Received: constants={constants}') - - def step(inputs, states): - constants = states[-self._num_constants:] # pylint: disable=invalid-unary-operand-type - states = states[:-self._num_constants] # pylint: disable=invalid-unary-operand-type - return self.cell.call(inputs, states, constants=constants, **kwargs) - else: - def step(inputs, states): - return self.cell.call(inputs, states, **kwargs) - - last_output, outputs, states = backend.rnn( - step, + or shape: `(samples, new_img_dimensions..., filters)` if + data_format='channels_last'. + Masking: This layer supports masking for input data with a variable number + of timesteps. + Note on using statefulness in RNNs: You can set RNN layers to be 'stateful', + which means that the states computed for the samples in one batch will be + reused as initial states for the samples in the next batch. This assumes a + one-to-one mapping between samples in different successive batches. + To enable statefulness: - Specify `stateful=True` in the layer + constructor. + - Specify a fixed batch size for your model, by passing + - If sequential model: `batch_input_shape=(...)` to the first layer + in your model. + - If functional model with 1 or more Input layers: + `batch_shape=(...)` to all the first layers in your model. This is + the expected shape of your inputs *including the batch size*. It + should be a tuple of integers, e.g. `(32, 10, 100, 100, 32)` for + rank 2 convolution. Note that the image dimensions should be + specified too. - Specify `shuffle=False` when calling fit(). To + reset the states of your model, call `.reset_states()` on either a + specific layer, or on your entire model. + Note on specifying the initial state of RNNs: You can specify the initial + state of RNN layers symbolically by calling them with the keyword argument + `initial_state`. 
The value of `initial_state` should be a tensor or list + of tensors representing the initial state of the RNN layer. You can + specify the initial state of RNN layers numerically by calling + `reset_states` with the keyword argument `states`. The value of `states` + should be a numpy array or list of numpy arrays representing the initial + state of the RNN layer. + Note on passing external constants to RNNs: You can pass "external" + constants to the cell using the `constants` keyword argument of the + `RNN.__call__` (as well as `RNN.call`) method. This requires that the + `cell.call` method accepts the same keyword argument `constants`. Such + constants can be used to condition the cell transformation on additional + static inputs (not changing over time), a.k.a. an attention mechanism. + """ + + def __init__( + self, + rank, + cell, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + unroll=False, + **kwargs, + ): + if unroll: + raise TypeError( + "Unrolling is not possible with convolutional RNNs. " + f"Received: unroll={unroll}" + ) + if isinstance(cell, (list, tuple)): + # The StackedConvRNN3DCells isn't implemented yet. + raise TypeError( + "It is not possible at the moment to " + "stack convolutional cells. Only pass a single cell " + "instance as the `cell` argument. Received: " + f"cell={cell}" + ) + super().__init__( + cell, + return_sequences, + return_state, + go_backwards, + stateful, + unroll, + **kwargs, + ) + self.rank = rank + self.input_spec = [InputSpec(ndim=rank + 3)] + self.states = None + self._num_constants = None + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + if isinstance(input_shape, list): + input_shape = input_shape[0] + + cell = self.cell + if cell.data_format == "channels_first": + img_dims = input_shape[3:] + elif cell.data_format == "channels_last": + img_dims = input_shape[2:-1] + + norm_img_dims = tuple( + [ + conv_utils.conv_output_length( + img_dims[idx], + cell.kernel_size[idx], + padding=cell.padding, + stride=cell.strides[idx], + dilation=cell.dilation_rate[idx], + ) + for idx in range(len(img_dims)) + ] + ) + + if cell.data_format == "channels_first": + output_shape = input_shape[:2] + (cell.filters,) + norm_img_dims + elif cell.data_format == "channels_last": + output_shape = input_shape[:2] + norm_img_dims + (cell.filters,) + + if not self.return_sequences: + output_shape = output_shape[:1] + output_shape[2:] + + if self.return_state: + output_shape = [output_shape] + if cell.data_format == "channels_first": + output_shape += [ + (input_shape[0], cell.filters) + norm_img_dims + for _ in range(2) + ] + elif cell.data_format == "channels_last": + output_shape += [ + (input_shape[0],) + norm_img_dims + (cell.filters,) + for _ in range(2) + ] + return output_shape + + @tf_utils.shape_type_conversion + def build(self, input_shape): + # Note input_shape will be list of shapes of initial states and + # constants if these are passed in __call__. 
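To make the statefulness contract described in the docstring above concrete, here is a minimal sketch using the public `ConvLSTM2D` layer, which builds on this base class; the filter count, kernel size, and input shape are illustrative assumptions, not values taken from this diff.

```python
# A minimal sketch of the stateful workflow described in the docstring
# above; shapes and layer hyperparameters are illustrative assumptions.
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential(
    [
        tf.keras.layers.ConvLSTM2D(
            filters=8,
            kernel_size=3,
            stateful=True,
            # Stateful RNNs require a fixed batch size, passed via
            # `batch_input_shape` on the first layer.
            batch_input_shape=(4, 10, 32, 32, 1),  # (batch, time, h, w, ch)
        )
    ]
)

x = np.random.rand(4, 10, 32, 32, 1).astype("float32")
y = model(x)  # final states are kept and reused by the next call
print(y.shape)  # (4, 30, 30, 8) with the default 'valid' padding

model.reset_states()  # clear the state between independent sequences
```

`shuffle=False` matters only when training with `fit()`, since shuffling would break the one-to-one correspondence between samples in successive batches that statefulness assumes.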
+ if self._num_constants is not None: + constants_shape = input_shape[-self._num_constants :] + else: + constants_shape = None + + if isinstance(input_shape, list): + input_shape = input_shape[0] + + batch_size = input_shape[0] if self.stateful else None + self.input_spec[0] = InputSpec( + shape=(batch_size, None) + input_shape[2 : self.rank + 3] + ) + + # allow cell (if layer) to build before we set or validate state_spec + if isinstance(self.cell, base_layer.Layer): + step_input_shape = (input_shape[0],) + input_shape[2:] + if constants_shape is not None: + self.cell.build([step_input_shape] + constants_shape) + else: + self.cell.build(step_input_shape) + + # set or validate state_spec + if hasattr(self.cell.state_size, "__len__"): + state_size = list(self.cell.state_size) + else: + state_size = [self.cell.state_size] + + if self.state_spec is not None: + # initial_state was passed in call, check compatibility + if self.cell.data_format == "channels_first": + ch_dim = 1 + elif self.cell.data_format == "channels_last": + ch_dim = self.rank + 1 + if [spec.shape[ch_dim] for spec in self.state_spec] != state_size: + raise ValueError( + "An `initial_state` was passed that is not compatible with " + "`cell.state_size`. Received state shapes " + f"{[spec.shape for spec in self.state_spec]}. " + f"However `cell.state_size` is {self.cell.state_size}" + ) + else: + img_dims = tuple((None for _ in range(self.rank))) + if self.cell.data_format == "channels_first": + self.state_spec = [ + InputSpec(shape=(None, dim) + img_dims) + for dim in state_size + ] + elif self.cell.data_format == "channels_last": + self.state_spec = [ + InputSpec(shape=(None,) + img_dims + (dim,)) + for dim in state_size + ] + if self.stateful: + self.reset_states() + self.built = True + + def get_initial_state(self, inputs): + # (samples, timesteps, img_dims..., filters) + initial_state = backend.zeros_like(inputs) + # (samples, img_dims..., filters) + initial_state = backend.sum(initial_state, axis=1) + shape = list(self.cell.kernel_shape) + shape[-1] = self.cell.filters + initial_state = self.cell.input_conv( + initial_state, + tf.zeros(tuple(shape), initial_state.dtype), + padding=self.cell.padding, + ) + + if hasattr(self.cell.state_size, "__len__"): + return [initial_state for _ in self.cell.state_size] + else: + return [initial_state] + + def call( + self, inputs, - initial_state, - constants=constants, - go_backwards=self.go_backwards, - mask=mask, - input_length=timesteps, - return_all_outputs=self.return_sequences) - if self.stateful: - updates = [ - backend.update(self_state, state) - for self_state, state in zip(self.states, states) - ] - self.add_update(updates) - - if self.return_sequences: - output = outputs - else: - output = last_output - - if self.return_state: - if not isinstance(states, (list, tuple)): - states = [states] - else: - states = list(states) - return [output] + states - return output - - def reset_states(self, states=None): - if not self.stateful: - raise AttributeError('Layer must be stateful.') - input_shape = self.input_spec[0].shape - state_shape = self.compute_output_shape(input_shape) - if self.return_state: - state_shape = state_shape[0] - if self.return_sequences: - state_shape = state_shape[:1].concatenate(state_shape[2:]) - if None in state_shape: - raise ValueError('If a RNN is stateful, it needs to know ' - 'its batch size. 
Specify the batch size ' - 'of your input tensors: \n' - '- If using a Sequential model, ' - 'specify the batch size by passing ' - 'a `batch_input_shape` ' - 'argument to your first layer.\n' - '- If using the functional API, specify ' - 'the time dimension by passing a ' - '`batch_shape` argument to your Input layer.\n' - 'The same thing goes for the number of rows and ' - 'columns.') - - # helper function - def get_tuple_shape(nb_channels): - result = list(state_shape) - if self.cell.data_format == 'channels_first': - result[1] = nb_channels - elif self.cell.data_format == 'channels_last': - result[self.rank + 1] = nb_channels - else: - raise KeyError( - 'Cell data format must be one of ' - '{"channels_first", "channels_last"}. Received: ' - f'cell.data_format={self.cell.data_format}') - return tuple(result) - - # initialize state if None - if self.states[0] is None: - if hasattr(self.cell.state_size, '__len__'): - self.states = [backend.zeros(get_tuple_shape(dim)) - for dim in self.cell.state_size] - else: - self.states = [backend.zeros(get_tuple_shape(self.cell.state_size))] - elif states is None: - if hasattr(self.cell.state_size, '__len__'): - for state, dim in zip(self.states, self.cell.state_size): - backend.set_value(state, np.zeros(get_tuple_shape(dim))) - else: - backend.set_value(self.states[0], - np.zeros(get_tuple_shape(self.cell.state_size))) - else: - if not isinstance(states, (list, tuple)): - states = [states] - if len(states) != len(self.states): - raise ValueError( - f'Layer {self.name} expects {len(self.states)} states, ' - f'but it received {len(states)} state values. ' - f'States received: {states}') - for index, (value, state) in enumerate(zip(states, self.states)): - if hasattr(self.cell.state_size, '__len__'): - dim = self.cell.state_size[index] + mask=None, + training=None, + initial_state=None, + constants=None, + ): + # note that the .build() method of subclasses MUST define + # self.input_spec and self.state_spec with complete input shapes. + inputs, initial_state, constants = self._process_inputs( + inputs, initial_state, constants + ) + + if isinstance(mask, list): + mask = mask[0] + timesteps = backend.int_shape(inputs)[1] + + kwargs = {} + if generic_utils.has_arg(self.cell.call, "training"): + kwargs["training"] = training + + if constants: + if not generic_utils.has_arg(self.cell.call, "constants"): + raise ValueError( + f"RNN cell {self.cell} does not support constants. 
" + f"Received: constants={constants}" + ) + + def step(inputs, states): + constants = states[-self._num_constants :] + states = states[: -self._num_constants] + return self.cell.call( + inputs, states, constants=constants, **kwargs + ) + + else: + + def step(inputs, states): + return self.cell.call(inputs, states, **kwargs) + + last_output, outputs, states = backend.rnn( + step, + inputs, + initial_state, + constants=constants, + go_backwards=self.go_backwards, + mask=mask, + input_length=timesteps, + return_all_outputs=self.return_sequences, + ) + if self.stateful: + updates = [ + backend.update(self_state, state) + for self_state, state in zip(self.states, states) + ] + self.add_update(updates) + + if self.return_sequences: + output = outputs + else: + output = last_output + + if self.return_state: + if not isinstance(states, (list, tuple)): + states = [states] + else: + states = list(states) + return [output] + states + return output + + def reset_states(self, states=None): + if not self.stateful: + raise AttributeError("Layer must be stateful.") + input_shape = self.input_spec[0].shape + state_shape = self.compute_output_shape(input_shape) + if self.return_state: + state_shape = state_shape[0] + if self.return_sequences: + state_shape = state_shape[:1].concatenate(state_shape[2:]) + if None in state_shape: + raise ValueError( + "If a RNN is stateful, it needs to know " + "its batch size. Specify the batch size " + "of your input tensors: \n" + "- If using a Sequential model, " + "specify the batch size by passing " + "a `batch_input_shape` " + "argument to your first layer.\n" + "- If using the functional API, specify " + "the time dimension by passing a " + "`batch_shape` argument to your Input layer.\n" + "The same thing goes for the number of rows and " + "columns." + ) + + # helper function + def get_tuple_shape(nb_channels): + result = list(state_shape) + if self.cell.data_format == "channels_first": + result[1] = nb_channels + elif self.cell.data_format == "channels_last": + result[self.rank + 1] = nb_channels + else: + raise KeyError( + "Cell data format must be one of " + '{"channels_first", "channels_last"}. Received: ' + f"cell.data_format={self.cell.data_format}" + ) + return tuple(result) + + # initialize state if None + if self.states[0] is None: + if hasattr(self.cell.state_size, "__len__"): + self.states = [ + backend.zeros(get_tuple_shape(dim)) + for dim in self.cell.state_size + ] + else: + self.states = [ + backend.zeros(get_tuple_shape(self.cell.state_size)) + ] + elif states is None: + if hasattr(self.cell.state_size, "__len__"): + for state, dim in zip(self.states, self.cell.state_size): + backend.set_value(state, np.zeros(get_tuple_shape(dim))) + else: + backend.set_value( + self.states[0], + np.zeros(get_tuple_shape(self.cell.state_size)), + ) else: - dim = self.cell.state_size - if value.shape != get_tuple_shape(dim): - raise ValueError( - f'State {index} is incompatible with layer {self.name}: ' - f'expected shape={get_tuple_shape(dim)}, ' - f'found shape={value.shape}') - backend.set_value(state, value) + if not isinstance(states, (list, tuple)): + states = [states] + if len(states) != len(self.states): + raise ValueError( + f"Layer {self.name} expects {len(self.states)} states, " + f"but it received {len(states)} state values. 
" + f"States received: {states}" + ) + for index, (value, state) in enumerate(zip(states, self.states)): + if hasattr(self.cell.state_size, "__len__"): + dim = self.cell.state_size[index] + else: + dim = self.cell.state_size + if value.shape != get_tuple_shape(dim): + raise ValueError( + "State {index} is incompatible with layer " + f"{self.name}: expected shape={get_tuple_shape(dim)}, " + f"found shape={value.shape}" + ) + backend.set_value(state, value) diff --git a/keras/layers/rnn/base_cudnn_rnn.py b/keras/layers/rnn/base_cudnn_rnn.py index 197dfdae787e..96426fc72e2a 100644 --- a/keras/layers/rnn/base_cudnn_rnn.py +++ b/keras/layers/rnn/base_cudnn_rnn.py @@ -13,133 +13,138 @@ # limitations under the License. # ============================================================================== """Base class for recurrent layers backed by cuDNN.""" -# pylint: disable=g-classes-have-attributes + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.input_spec import InputSpec from keras.layers.rnn.base_rnn import RNN -import tensorflow.compat.v2 as tf class _CuDNNRNN(RNN): - """Private base class for CuDNNGRU and CuDNNLSTM layers. - - Args: - return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. - return_state: Boolean. Whether to return the last state - in addition to the output. - go_backwards: Boolean (default False). - If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default False). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - time_major: Boolean (default False). If true, the inputs and outputs will be - in shape `(timesteps, batch, ...)`, whereas in the False case, it will - be `(batch, timesteps, ...)`. - """ - - def __init__(self, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - time_major=False, - **kwargs): - # We invoke the base layer's initializer directly here because we do not - # want to create RNN cell instance. - super(RNN, self).__init__(**kwargs) # pylint: disable=bad-super-call - self.return_sequences = return_sequences - self.return_state = return_state - self.go_backwards = go_backwards - self.stateful = stateful - self.time_major = time_major - self.supports_masking = False - self.input_spec = [InputSpec(ndim=3)] - if hasattr(self.cell.state_size, '__len__'): - state_size = self.cell.state_size - else: - state_size = [self.cell.state_size] - self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size] - self.constants_spec = None - self._states = None - self._num_constants = 0 - self._vector_shape = tf.constant([-1]) - - def call(self, inputs, mask=None, training=None, initial_state=None): - if isinstance(mask, list): - mask = mask[0] - if mask is not None: - raise ValueError('Masking is not supported for CuDNN RNNs.') - - # input shape: `(samples, time (padded with zeros), input_dim)` - # note that the .build() method of subclasses MUST define - # self.input_spec and self.state_spec with complete input shapes. 
- if isinstance(inputs, list): - initial_state = inputs[1:] - inputs = inputs[0] - elif initial_state is not None: - pass - elif self.stateful: - initial_state = self.states - else: - initial_state = self.get_initial_state(inputs) - - if len(initial_state) != len(self.states): - raise ValueError('Layer has ' + str(len(self.states)) + - ' states but was passed ' + str(len(initial_state)) + - ' initial states.') - - if self.go_backwards: - # Reverse time axis. - inputs = backend.reverse(inputs, 1) - output, states = self._process_batch(inputs, initial_state) - - if self.stateful: - updates = [ - tf.compat.v1.assign(self_state, state) - for self_state, state in zip(self.states, states) - ] - self.add_update(updates) - - if self.return_state: - return [output] + states - else: - return output - - def get_config(self): - config = { - 'return_sequences': self.return_sequences, - 'return_state': self.return_state, - 'go_backwards': self.go_backwards, - 'stateful': self.stateful, - 'time_major': self.time_major, - } - base_config = super( # pylint: disable=bad-super-call - RNN, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - return cls(**config) - - @property - def trainable_weights(self): - if self.trainable and self.built: - return [self.kernel, self.recurrent_kernel, self.bias] - return [] - - @property - def non_trainable_weights(self): - if not self.trainable and self.built: - return [self.kernel, self.recurrent_kernel, self.bias] - return [] - - @property - def losses(self): - return super(RNN, self).losses # pylint: disable=bad-super-call - - def get_losses_for(self, inputs=None): - return super( # pylint: disable=bad-super-call - RNN, self).get_losses_for(inputs=inputs) + """Private base class for CuDNNGRU and CuDNNLSTM layers. + + Args: + return_sequences: Boolean. Whether to return the last output + in the output sequence, or the full sequence. + return_state: Boolean. Whether to return the last state + in addition to the output. + go_backwards: Boolean (default False). + If True, process the input sequence backwards and return the + reversed sequence. + stateful: Boolean (default False). If True, the last state + for each sample at index i in a batch will be used as initial + state for the sample of index i in the following batch. + time_major: Boolean (default False). If true, the inputs and outputs will + be in shape `(timesteps, batch, ...)`, whereas in the False case, it + will be `(batch, timesteps, ...)`. + """ + + def __init__( + self, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + time_major=False, + **kwargs + ): + # We invoke the base layer's initializer directly here because we do not + # want to create RNN cell instance. 
+ super(RNN, self).__init__(**kwargs) + self.return_sequences = return_sequences + self.return_state = return_state + self.go_backwards = go_backwards + self.stateful = stateful + self.time_major = time_major + self.supports_masking = False + self.input_spec = [InputSpec(ndim=3)] + if hasattr(self.cell.state_size, "__len__"): + state_size = self.cell.state_size + else: + state_size = [self.cell.state_size] + self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size] + self.constants_spec = None + self._states = None + self._num_constants = 0 + self._vector_shape = tf.constant([-1]) + + def call(self, inputs, mask=None, training=None, initial_state=None): + if isinstance(mask, list): + mask = mask[0] + if mask is not None: + raise ValueError("Masking is not supported for CuDNN RNNs.") + + # input shape: `(samples, time (padded with zeros), input_dim)` + # note that the .build() method of subclasses MUST define + # self.input_spec and self.state_spec with complete input shapes. + if isinstance(inputs, list): + initial_state = inputs[1:] + inputs = inputs[0] + elif initial_state is not None: + pass + elif self.stateful: + initial_state = self.states + else: + initial_state = self.get_initial_state(inputs) + + if len(initial_state) != len(self.states): + raise ValueError( + "Layer has " + + str(len(self.states)) + + " states but was passed " + + str(len(initial_state)) + + " initial states." + ) + + if self.go_backwards: + # Reverse time axis. + inputs = backend.reverse(inputs, 1) + output, states = self._process_batch(inputs, initial_state) + + if self.stateful: + updates = [ + tf.compat.v1.assign(self_state, state) + for self_state, state in zip(self.states, states) + ] + self.add_update(updates) + + if self.return_state: + return [output] + states + else: + return output + + def get_config(self): + config = { + "return_sequences": self.return_sequences, + "return_state": self.return_state, + "go_backwards": self.go_backwards, + "stateful": self.stateful, + "time_major": self.time_major, + } + base_config = super(RNN, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + return cls(**config) + + @property + def trainable_weights(self): + if self.trainable and self.built: + return [self.kernel, self.recurrent_kernel, self.bias] + return [] + + @property + def non_trainable_weights(self): + if not self.trainable and self.built: + return [self.kernel, self.recurrent_kernel, self.bias] + return [] + + @property + def losses(self): + return super(RNN, self).losses + + def get_losses_for(self, inputs=None): + return super(RNN, self).get_losses_for(inputs=inputs) diff --git a/keras/layers/rnn/base_rnn.py b/keras/layers/rnn/base_rnn.py index 36bab3653f57..350dcd1dd60e 100644 --- a/keras/layers/rnn/base_rnn.py +++ b/keras/layers/rnn/base_rnn.py @@ -13,850 +13,967 @@ # limitations under the License. 
# ============================================================================== """Base class for recurrent layers.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import collections +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine import base_layer from keras.engine.input_spec import InputSpec from keras.layers.rnn import rnn_utils from keras.layers.rnn.dropout_rnn_cell_mixin import DropoutRNNCellMixin from keras.layers.rnn.stacked_rnn_cells import StackedRNNCells -from keras.saving.saved_model import layer_serialization +from keras.saving import serialization_lib +from keras.saving.legacy.saved_model import layer_serialization from keras.utils import generic_utils -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls -@keras_export('keras.layers.RNN') +@keras_export("keras.layers.RNN") class RNN(base_layer.Layer): - """Base class for recurrent layers. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - Args: - cell: A RNN cell instance or a list of RNN cell instances. - A RNN cell is a class that has: - - A `call(input_at_t, states_at_t)` method, returning - `(output_at_t, states_at_t_plus_1)`. The call method of the - cell can also take the optional argument `constants`, see - section "Note on passing external constants" below. - - A `state_size` attribute. This can be a single integer - (single state) in which case it is the size of the recurrent - state. This can also be a list/tuple of integers (one size per state). - The `state_size` can also be TensorShape or tuple/list of - TensorShape, to represent high dimension state. - - A `output_size` attribute. This can be a single integer or a - TensorShape, which represent the shape of the output. For backward - compatible reason, if this attribute is not available for the - cell, the value will be inferred by the first element of the - `state_size`. - - A `get_initial_state(inputs=None, batch_size=None, dtype=None)` - method that creates a tensor meant to be fed to `call()` as the - initial state, if the user didn't specify any initial state via other - means. The returned initial state should have a shape of - [batch_size, cell.state_size]. The cell might choose to create a - tensor full of zeros, or full of other values based on the cell's - implementation. - `inputs` is the input tensor to the RNN layer, which should - contain the batch size as its shape[0], and also dtype. Note that - the shape[0] might be `None` during the graph construction. Either - the `inputs` or the pair of `batch_size` and `dtype` are provided. - `batch_size` is a scalar tensor that represents the batch size - of the inputs. `dtype` is `tf.DType` that represents the dtype of - the inputs. - For backward compatibility, if this method is not implemented - by the cell, the RNN layer will create a zero filled tensor with the - size of [batch_size, cell.state_size]. - In the case that `cell` is a list of RNN cell instances, the cells - will be stacked on top of each other in the RNN, resulting in an - efficient stacked RNN. - return_sequences: Boolean (default `False`). Whether to return the last - output in the output sequence, or the full sequence. - return_state: Boolean (default `False`). Whether to return the last state - in addition to the output. - go_backwards: Boolean (default `False`). 
- If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default `False`). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - unroll: Boolean (default `False`). - If True, the network will be unrolled, else a symbolic loop will be used. - Unrolling can speed-up a RNN, although it tends to be more - memory-intensive. Unrolling is only suitable for short sequences. - time_major: The shape format of the `inputs` and `outputs` tensors. - If True, the inputs and outputs will be in shape - `(timesteps, batch, ...)`, whereas in the False case, it will be - `(batch, timesteps, ...)`. Using `time_major = True` is a bit more - efficient because it avoids transposes at the beginning and end of the - RNN calculation. However, most TensorFlow data is batch-major, so by - default this function accepts input and emits output in batch-major - form. - zero_output_for_mask: Boolean (default `False`). - Whether the output should use zeros for the masked timesteps. Note that - this field is only used when `return_sequences` is True and mask is - provided. It can useful if you want to reuse the raw output sequence of - the RNN without interference from the masked timesteps, eg, merging - bidirectional RNNs. - - Call arguments: - inputs: Input tensor. - mask: Binary tensor of shape `[batch_size, timesteps]` indicating whether - a given timestep should be masked. An individual `True` entry indicates - that the corresponding timestep should be utilized, while a `False` - entry indicates that the corresponding timestep should be ignored. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is for use with cells that use dropout. - initial_state: List of initial state tensors to be passed to the first - call of the cell. - constants: List of constant tensors to be passed to the cell at each - timestep. - - Input shape: - N-D tensor with shape `[batch_size, timesteps, ...]` or - `[timesteps, batch_size, ...]` when time_major is True. - - Output shape: - - If `return_state`: a list of tensors. The first tensor is - the output. The remaining tensors are the last states, - each with shape `[batch_size, state_size]`, where `state_size` could - be a high dimension tensor shape. - - If `return_sequences`: N-D tensor with shape - `[batch_size, timesteps, output_size]`, where `output_size` could - be a high dimension tensor shape, or - `[timesteps, batch_size, output_size]` when `time_major` is True. - - Else, N-D tensor with shape `[batch_size, output_size]`, where - `output_size` could be a high dimension tensor shape. - - Masking: - This layer supports masking for input data with a variable number - of timesteps. To introduce masks to your data, - use an [tf.keras.layers.Embedding] layer with the `mask_zero` parameter - set to `True`. - - Note on using statefulness in RNNs: - You can set RNN layers to be 'stateful', which means that the states - computed for the samples in one batch will be reused as initial states - for the samples in the next batch. This assumes a one-to-one mapping - between samples in different successive batches. - - To enable statefulness: - - Specify `stateful=True` in the layer constructor. - - Specify a fixed batch size for your model, by passing - If sequential model: - `batch_input_shape=(...)` to the first layer in your model. 
- Else for functional model with 1 or more Input layers: - `batch_shape=(...)` to all the first layers in your model. - This is the expected shape of your inputs - *including the batch size*. - It should be a tuple of integers, e.g. `(32, 10, 100)`. - - Specify `shuffle=False` when calling `fit()`. - - To reset the states of your model, call `.reset_states()` on either - a specific layer, or on your entire model. - - Note on specifying the initial state of RNNs: - You can specify the initial state of RNN layers symbolically by - calling them with the keyword argument `initial_state`. The value of - `initial_state` should be a tensor or list of tensors representing - the initial state of the RNN layer. - - You can specify the initial state of RNN layers numerically by - calling `reset_states` with the keyword argument `states`. The value of - `states` should be a numpy array or list of numpy arrays representing - the initial state of the RNN layer. - - Note on passing external constants to RNNs: - You can pass "external" constants to the cell using the `constants` - keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This - requires that the `cell.call` method accepts the same keyword argument - `constants`. Such constants can be used to condition the cell - transformation on additional static inputs (not changing over time), - a.k.a. an attention mechanism. - - Examples: - - ```python - # First, let's define a RNN Cell, as a layer subclass. - - class MinimalRNNCell(keras.layers.Layer): - - def __init__(self, units, **kwargs): - self.units = units - self.state_size = units - super(MinimalRNNCell, self).__init__(**kwargs) - - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.built = True - - def call(self, inputs, states): - prev_output = states[0] - h = backend.dot(inputs, self.kernel) - output = h + backend.dot(prev_output, self.recurrent_kernel) - return output, [output] - - # Let's use this cell in a RNN layer: - - cell = MinimalRNNCell(32) - x = keras.Input((None, 5)) - layer = RNN(cell) - y = layer(x) - - # Here's how to use the cell to build a stacked RNN: - - cells = [MinimalRNNCell(32), MinimalRNNCell(64)] - x = keras.Input((None, 5)) - layer = RNN(cells) - y = layer(x) - ``` - """ - - def __init__(self, - cell, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - time_major=False, - **kwargs): - if isinstance(cell, (list, tuple)): - cell = StackedRNNCells(cell) - if 'call' not in dir(cell): - raise ValueError('Argument `cell` should have a `call` method. ' - f'The RNN was passed: cell={cell}') - if 'state_size' not in dir(cell): - raise ValueError('The RNN cell should have a `state_size` attribute ' - '(tuple of integers, one integer per RNN state). ' - f'Received: cell={cell}') - # If True, the output for masked timestep will be zeros, whereas in the - # False case, output from previous timestep is returned for masked timestep. 
- self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False) - - if 'input_shape' not in kwargs and ( - 'input_dim' in kwargs or 'input_length' in kwargs): - input_shape = (kwargs.pop('input_length', None), - kwargs.pop('input_dim', None)) - kwargs['input_shape'] = input_shape - - super().__init__(**kwargs) - self.cell = cell - self.return_sequences = return_sequences - self.return_state = return_state - self.go_backwards = go_backwards - self.stateful = stateful - self.unroll = unroll - self.time_major = time_major - - self.supports_masking = True - # The input shape is unknown yet, it could have nested tensor inputs, and - # the input spec will be the list of specs for nested inputs, the structure - # of the input_spec will be the same as the input. - self.input_spec = None - self.state_spec = None - self._states = None - self.constants_spec = None - self._num_constants = 0 - - if stateful: - if tf.distribute.has_strategy(): - raise ValueError('Stateful RNNs (created with `stateful=True`) ' - 'are not yet supported with tf.distribute.Strategy.') - - @property - def _use_input_spec_as_call_signature(self): - if self.unroll: - # When the RNN layer is unrolled, the time step shape cannot be unknown. - # The input spec does not define the time step (because this layer can be - # called with any time step value, as long as it is not None), so it - # cannot be used as the call function signature when saving to SavedModel. - return False - return super()._use_input_spec_as_call_signature - - @property - def states(self): - if self._states is None: - state = tf.nest.map_structure(lambda _: None, self.cell.state_size) - return state if tf.nest.is_nested(self.cell.state_size) else [state] - return self._states - - @states.setter - # Automatic tracking catches "self._states" which adds an extra weight and - # breaks HDF5 checkpoints. - @tf.__internal__.tracking.no_automatic_dependency_tracking - def states(self, states): - self._states = states - - def compute_output_shape(self, input_shape): - if isinstance(input_shape, list): - input_shape = input_shape[0] - # Check whether the input shape contains any nested shapes. It could be - # (tensor_shape(1, 2), tensor_shape(3, 4)) or (1, 2, 3) which is from numpy - # inputs. - try: - input_shape = tf.TensorShape(input_shape) - except (ValueError, TypeError): - # A nested tensor input - input_shape = tf.nest.flatten(input_shape)[0] - - batch = input_shape[0] - time_step = input_shape[1] - if self.time_major: - batch, time_step = time_step, batch - - if rnn_utils.is_multiple_state(self.cell.state_size): - state_size = self.cell.state_size - else: - state_size = [self.cell.state_size] - - def _get_output_shape(flat_output_size): - output_dim = tf.TensorShape(flat_output_size).as_list() - if self.return_sequences: + """Base class for recurrent layers. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. + + Args: + cell: A RNN cell instance or a list of RNN cell instances. + A RNN cell is a class that has: + - A `call(input_at_t, states_at_t)` method, returning + `(output_at_t, states_at_t_plus_1)`. The call method of the + cell can also take the optional argument `constants`, see + section "Note on passing external constants" below. + - A `state_size` attribute. This can be a single integer + (single state) in which case it is the size of the recurrent + state. This can also be a list/tuple of integers (one size per state). 
+ The `state_size` can also be TensorShape or tuple/list of + TensorShape, to represent high dimension state. + - An `output_size` attribute. This can be a single integer or a + TensorShape, which represents the shape of the output. For backward + compatibility reasons, if this attribute is not available for the + cell, the value will be inferred from the first element of the + `state_size`. + - A `get_initial_state(inputs=None, batch_size=None, dtype=None)` + method that creates a tensor meant to be fed to `call()` as the + initial state, if the user didn't specify any initial state via other + means. The returned initial state should have a shape of + [batch_size, cell.state_size]. The cell might choose to create a + tensor full of zeros, or full of other values based on the cell's + implementation. + `inputs` is the input tensor to the RNN layer, which should + contain the batch size as its shape[0], and also dtype. Note that + the shape[0] might be `None` during the graph construction. Either + the `inputs` or the pair of `batch_size` and `dtype` are provided. + `batch_size` is a scalar tensor that represents the batch size + of the inputs. `dtype` is `tf.DType` that represents the dtype of + the inputs. + For backward compatibility, if this method is not implemented + by the cell, the RNN layer will create a zero filled tensor with the + size of [batch_size, cell.state_size]. + In the case that `cell` is a list of RNN cell instances, the cells + will be stacked on top of each other in the RNN, resulting in an + efficient stacked RNN. + return_sequences: Boolean (default `False`). Whether to return the last + output in the output sequence, or the full sequence. + return_state: Boolean (default `False`). Whether to return the last state + in addition to the output. + go_backwards: Boolean (default `False`). + If True, process the input sequence backwards and return the + reversed sequence. + stateful: Boolean (default `False`). If True, the last state + for each sample at index i in a batch will be used as initial + state for the sample of index i in the following batch. + unroll: Boolean (default `False`). + If True, the network will be unrolled, else a symbolic loop will be + used. Unrolling can speed up a RNN, although it tends to be more + memory-intensive. Unrolling is only suitable for short sequences. + time_major: The shape format of the `inputs` and `outputs` tensors. + If True, the inputs and outputs will be in shape + `(timesteps, batch, ...)`, whereas in the False case, it will be + `(batch, timesteps, ...)`. Using `time_major = True` is a bit more + efficient because it avoids transposes at the beginning and end of the + RNN calculation. However, most TensorFlow data is batch-major, so by + default this function accepts input and emits output in batch-major + form. + zero_output_for_mask: Boolean (default `False`). + Whether the output should use zeros for the masked timesteps. Note that + this field is only used when `return_sequences` is True and mask is + provided. It can be useful if you want to reuse the raw output sequence + of the RNN without interference from the masked timesteps, e.g. + merging bidirectional RNNs. + + Call arguments: + inputs: Input tensor. + mask: Binary tensor of shape `[batch_size, timesteps]` indicating whether + a given timestep should be masked. An individual `True` entry indicates + that the corresponding timestep should be utilized, while a `False` + entry indicates that the corresponding timestep should be ignored. 
+ training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is for use with cells that use dropout. + initial_state: List of initial state tensors to be passed to the first + call of the cell. + constants: List of constant tensors to be passed to the cell at each + timestep. + + Input shape: + N-D tensor with shape `[batch_size, timesteps, ...]` or + `[timesteps, batch_size, ...]` when time_major is True. + + Output shape: + - If `return_state`: a list of tensors. The first tensor is + the output. The remaining tensors are the last states, + each with shape `[batch_size, state_size]`, where `state_size` could + be a high dimension tensor shape. + - If `return_sequences`: N-D tensor with shape + `[batch_size, timesteps, output_size]`, where `output_size` could + be a high dimension tensor shape, or + `[timesteps, batch_size, output_size]` when `time_major` is True. + - Else, N-D tensor with shape `[batch_size, output_size]`, where + `output_size` could be a high dimension tensor shape. + + Masking: + This layer supports masking for input data with a variable number + of timesteps. To introduce masks to your data, + use an [tf.keras.layers.Embedding] layer with the `mask_zero` parameter + set to `True`. + + Note on using statefulness in RNNs: + You can set RNN layers to be 'stateful', which means that the states + computed for the samples in one batch will be reused as initial states + for the samples in the next batch. This assumes a one-to-one mapping + between samples in different successive batches. + + To enable statefulness: + - Specify `stateful=True` in the layer constructor. + - Specify a fixed batch size for your model, by passing + If sequential model: + `batch_input_shape=(...)` to the first layer in your model. + Else for functional model with 1 or more Input layers: + `batch_shape=(...)` to all the first layers in your model. + This is the expected shape of your inputs + *including the batch size*. + It should be a tuple of integers, e.g. `(32, 10, 100)`. + - Specify `shuffle=False` when calling `fit()`. + + To reset the states of your model, call `.reset_states()` on either + a specific layer, or on your entire model. + + Note on specifying the initial state of RNNs: + You can specify the initial state of RNN layers symbolically by + calling them with the keyword argument `initial_state`. The value of + `initial_state` should be a tensor or list of tensors representing + the initial state of the RNN layer. + + You can specify the initial state of RNN layers numerically by + calling `reset_states` with the keyword argument `states`. The value of + `states` should be a numpy array or list of numpy arrays representing + the initial state of the RNN layer. + + Note on passing external constants to RNNs: + You can pass "external" constants to the cell using the `constants` + keyword argument of the `RNN.__call__` (as well as `RNN.call`) method. + This requires that the `cell.call` method accepts the same keyword + argument `constants`. Such constants can be used to condition the cell + transformation on additional static inputs (not changing over time), + a.k.a. an attention mechanism. + + Examples: + + ```python + import keras + from keras.layers import RNN + from keras import backend + + # First, let's define a RNN Cell, as a layer subclass. 
+ class MinimalRNNCell(keras.layers.Layer): + + def __init__(self, units, **kwargs): + self.units = units + self.state_size = units + super(MinimalRNNCell, self).__init__(**kwargs) + + def build(self, input_shape): + self.kernel = self.add_weight(shape=(input_shape[-1], self.units), + initializer='uniform', + name='kernel') + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer='uniform', + name='recurrent_kernel') + self.built = True + + def call(self, inputs, states): + prev_output = states[0] + h = backend.dot(inputs, self.kernel) + output = h + backend.dot(prev_output, self.recurrent_kernel) + return output, [output] + + # Let's use this cell in a RNN layer: + + cell = MinimalRNNCell(32) + x = keras.Input((None, 5)) + layer = RNN(cell) + y = layer(x) + + # Here's how to use the cell to build a stacked RNN: + + cells = [MinimalRNNCell(32), MinimalRNNCell(64)] + x = keras.Input((None, 5)) + layer = RNN(cells) + y = layer(x) + ``` + """ + + def __init__( + self, + cell, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + unroll=False, + time_major=False, + **kwargs, + ): + if isinstance(cell, (list, tuple)): + cell = StackedRNNCells(cell) + if "call" not in dir(cell): + raise ValueError( + "Argument `cell` should have a `call` method. " + f"The RNN was passed: cell={cell}" + ) + if "state_size" not in dir(cell): + raise ValueError( + "The RNN cell should have a `state_size` attribute " + "(tuple of integers, one integer per RNN state). " + f"Received: cell={cell}" + ) + # If True, the output for masked timestep will be zeros, whereas in the + # False case, output from previous timestep is returned for masked + # timestep. + self.zero_output_for_mask = kwargs.pop("zero_output_for_mask", False) + + if "input_shape" not in kwargs and ( + "input_dim" in kwargs or "input_length" in kwargs + ): + input_shape = ( + kwargs.pop("input_length", None), + kwargs.pop("input_dim", None), + ) + kwargs["input_shape"] = input_shape + + super().__init__(**kwargs) + self.cell = cell + self.return_sequences = return_sequences + self.return_state = return_state + self.go_backwards = go_backwards + self.stateful = stateful + self.unroll = unroll + self.time_major = time_major + + self.supports_masking = True + # The input shape is unknown yet, it could have nested tensor inputs, + # and the input spec will be the list of specs for nested inputs, the + # structure of the input_spec will be the same as the input. + self.input_spec = None + self.state_spec = None + self._states = None + self.constants_spec = None + self._num_constants = 0 + + if stateful: + if tf.distribute.has_strategy(): + raise ValueError( + "Stateful RNNs (created with `stateful=True`) " + "are not yet supported with tf.distribute.Strategy." + ) + + @property + def _use_input_spec_as_call_signature(self): + if self.unroll: + # When the RNN layer is unrolled, the time step shape cannot be + # unknown. The input spec does not define the time step (because + # this layer can be called with any time step value, as long as it + # is not None), so it cannot be used as the call function signature + # when saving to SavedModel. 
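A short sketch of the cell-wrapping behaviour in `__init__` above: when `cell` is a list or tuple, it is wrapped in `StackedRNNCells`, so a stacked RNN can be built by passing several cells directly. Cell sizes below are illustrative assumptions.

```python
# Sketch: a list of cells passed to RNN is wrapped in StackedRNNCells,
# per the isinstance check in __init__ above. Cell sizes are illustrative.
import tensorflow as tf

cells = [tf.keras.layers.LSTMCell(32), tf.keras.layers.LSTMCell(64)]
layer = tf.keras.layers.RNN(cells)  # same as RNN(StackedRNNCells(cells))

x = tf.random.normal((2, 10, 8))  # (batch, timesteps, features)
print(layer(x).shape)  # (2, 64): the output size of the last cell
```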
+ return False + return super()._use_input_spec_as_call_signature + + @property + def states(self): + if self._states is None: + state = tf.nest.map_structure(lambda _: None, self.cell.state_size) + return state if tf.nest.is_nested(self.cell.state_size) else [state] + return self._states + + @states.setter + # Automatic tracking catches "self._states" which adds an extra weight and + # breaks HDF5 checkpoints. + @tf.__internal__.tracking.no_automatic_dependency_tracking + def states(self, states): + self._states = states + + def compute_output_shape(self, input_shape): + if isinstance(input_shape, list): + input_shape = input_shape[0] + # Check whether the input shape contains any nested shapes. It could be + # (tensor_shape(1, 2), tensor_shape(3, 4)) or (1, 2, 3) which is from + # numpy inputs. + try: + input_shape = tf.TensorShape(input_shape) + except (ValueError, TypeError): + # A nested tensor input + input_shape = tf.nest.flatten(input_shape)[0] + + batch = input_shape[0] + time_step = input_shape[1] if self.time_major: - output_shape = tf.TensorShape( - [time_step, batch] + output_dim) + batch, time_step = time_step, batch + + if rnn_utils.is_multiple_state(self.cell.state_size): + state_size = self.cell.state_size + else: + state_size = [self.cell.state_size] + + def _get_output_shape(flat_output_size): + output_dim = tf.TensorShape(flat_output_size).as_list() + if self.return_sequences: + if self.time_major: + output_shape = tf.TensorShape( + [time_step, batch] + output_dim + ) + else: + output_shape = tf.TensorShape( + [batch, time_step] + output_dim + ) + else: + output_shape = tf.TensorShape([batch] + output_dim) + return output_shape + + if getattr(self.cell, "output_size", None) is not None: + # cell.output_size could be nested structure. + output_shape = tf.nest.flatten( + tf.nest.map_structure(_get_output_shape, self.cell.output_size) + ) + output_shape = ( + output_shape[0] if len(output_shape) == 1 else output_shape + ) else: - output_shape = tf.TensorShape( - [batch, time_step] + output_dim) - else: - output_shape = tf.TensorShape([batch] + output_dim) - return output_shape - - if getattr(self.cell, 'output_size', None) is not None: - # cell.output_size could be nested structure. - output_shape = tf.nest.flatten(tf.nest.map_structure( - _get_output_shape, self.cell.output_size)) - output_shape = output_shape[0] if len(output_shape) == 1 else output_shape - else: - # Note that state_size[0] could be a tensor_shape or int. - output_shape = _get_output_shape(state_size[0]) - - if self.return_state: - def _get_state_shape(flat_state): - state_shape = [batch] + tf.TensorShape(flat_state).as_list() - return tf.TensorShape(state_shape) - state_shape = tf.nest.map_structure(_get_state_shape, state_size) - return generic_utils.to_list(output_shape) + tf.nest.flatten(state_shape) - else: - return output_shape - - def compute_mask(self, inputs, mask): - # Time step masks must be the same for each input. - # This is because the mask for an RNN is of size [batch, time_steps, 1], - # and specifies which time steps should be skipped, and a time step - # must be skipped for all inputs. - # TODO(scottzhu): Should we accept multiple different masks? 
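To illustrate the shape rules that `compute_output_shape` above encodes (`return_sequences` keeps the time axis; `return_state` appends the final states), a brief sketch with illustrative sizes:

```python
# Sketch of the output-shape rules implemented above; sizes illustrative.
import tensorflow as tf

x = tf.random.normal((4, 7, 3))  # (batch, timesteps, features)

seq = tf.keras.layers.RNN(
    tf.keras.layers.SimpleRNNCell(5), return_sequences=True
)(x)
print(seq.shape)  # (4, 7, 5): one output per timestep

out, state = tf.keras.layers.RNN(
    tf.keras.layers.SimpleRNNCell(5), return_state=True
)(x)
print(out.shape, state.shape)  # (4, 5) (4, 5): last output plus final state
```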
- mask = tf.nest.flatten(mask)[0] - output_mask = mask if self.return_sequences else None - if self.return_state: - state_mask = [None for _ in self.states] - return [output_mask] + state_mask - else: - return output_mask - - def build(self, input_shape): - if isinstance(input_shape, list): - input_shape = input_shape[0] - # The input_shape here could be a nest structure. - - # do the tensor_shape to shapes here. The input could be single tensor, or a - # nested structure of tensors. - def get_input_spec(shape): - """Convert input shape to InputSpec.""" - if isinstance(shape, tf.TensorShape): - input_spec_shape = shape.as_list() - else: - input_spec_shape = list(shape) - batch_index, time_step_index = (1, 0) if self.time_major else (0, 1) - if not self.stateful: - input_spec_shape[batch_index] = None - input_spec_shape[time_step_index] = None - return InputSpec(shape=tuple(input_spec_shape)) - - def get_step_input_shape(shape): - if isinstance(shape, tf.TensorShape): - shape = tuple(shape.as_list()) - # remove the timestep from the input_shape - return shape[1:] if self.time_major else (shape[0],) + shape[2:] - - def get_state_spec(shape): - state_spec_shape = tf.TensorShape(shape).as_list() - # append batch dim - state_spec_shape = [None] + state_spec_shape - return InputSpec(shape=tuple(state_spec_shape)) - - # Check whether the input shape contains any nested shapes. It could be - # (tensor_shape(1, 2), tensor_shape(3, 4)) or (1, 2, 3) which is from numpy - # inputs. - try: - input_shape = tf.TensorShape(input_shape) - except (ValueError, TypeError): - # A nested tensor input - pass - - if not tf.nest.is_nested(input_shape): - # This indicates the there is only one input. - if self.input_spec is not None: - self.input_spec[0] = get_input_spec(input_shape) - else: - self.input_spec = [get_input_spec(input_shape)] - step_input_shape = get_step_input_shape(input_shape) - else: - if self.input_spec is not None: - self.input_spec[0] = tf.nest.map_structure(get_input_spec, input_shape) - else: - self.input_spec = generic_utils.to_list( - tf.nest.map_structure(get_input_spec, input_shape)) - step_input_shape = tf.nest.map_structure(get_step_input_shape, - input_shape) - - # allow cell (if layer) to build before we set or validate state_spec. - if isinstance(self.cell, base_layer.Layer) and not self.cell.built: - with backend.name_scope(self.cell.name): - self.cell.build(step_input_shape) - self.cell.built = True - - # set or validate state_spec - if rnn_utils.is_multiple_state(self.cell.state_size): - state_size = list(self.cell.state_size) - else: - state_size = [self.cell.state_size] - - if self.state_spec is not None: - # initial_state was passed in call, check compatibility - self._validate_state_spec(state_size, self.state_spec) - else: - if tf.nest.is_nested(state_size): - self.state_spec = tf.nest.map_structure(get_state_spec, state_size) - else: - self.state_spec = [ - InputSpec(shape=[None] + tf.TensorShape(dim).as_list()) - for dim in state_size - ] - # ensure the generated state_spec is correct. - self._validate_state_spec(state_size, self.state_spec) - if self.stateful: - self.reset_states() - self.built = True - - @staticmethod - def _validate_state_spec(cell_state_sizes, init_state_specs): - """Validate the state spec between the initial_state and the state_size. + # Note that state_size[0] could be a tensor_shape or int. + output_shape = _get_output_shape(state_size[0]) - Args: - cell_state_sizes: list, the `state_size` attribute from the cell. 
- init_state_specs: list, the `state_spec` from the initial_state that is - passed in `call()`. + if self.return_state: - Raises: - ValueError: When initial state spec is not compatible with the state size. - """ - validation_error = ValueError( - 'An `initial_state` was passed that is not compatible with ' - '`cell.state_size`. Received `state_spec`={}; ' - 'however `cell.state_size` is ' - '{}'.format(init_state_specs, cell_state_sizes)) - flat_cell_state_sizes = tf.nest.flatten(cell_state_sizes) - flat_state_specs = tf.nest.flatten(init_state_specs) - - if len(flat_cell_state_sizes) != len(flat_state_specs): - raise validation_error - for cell_state_spec, cell_state_size in zip(flat_state_specs, - flat_cell_state_sizes): - if not tf.TensorShape( - # Ignore the first axis for init_state which is for batch - cell_state_spec.shape[1:]).is_compatible_with( - tf.TensorShape(cell_state_size)): - raise validation_error - - @doc_controls.do_not_doc_inheritable - def get_initial_state(self, inputs): - get_initial_state_fn = getattr(self.cell, 'get_initial_state', None) - - if tf.nest.is_nested(inputs): - # The input are nested sequences. Use the first element in the seq to get - # batch size and dtype. - inputs = tf.nest.flatten(inputs)[0] - - input_shape = tf.shape(inputs) - batch_size = input_shape[1] if self.time_major else input_shape[0] - dtype = inputs.dtype - if get_initial_state_fn: - init_state = get_initial_state_fn( - inputs=None, batch_size=batch_size, dtype=dtype) - else: - init_state = rnn_utils.generate_zero_filled_state( - batch_size, self.cell.state_size, dtype) - # Keras RNN expect the states in a list, even if it's a single state tensor. - if not tf.nest.is_nested(init_state): - init_state = [init_state] - # Force the state to be a list in case it is a namedtuple eg LSTMStateTuple. - return list(init_state) - - def __call__(self, inputs, initial_state=None, constants=None, **kwargs): - inputs, initial_state, constants = rnn_utils.standardize_args( - inputs, initial_state, constants, self._num_constants) - - if initial_state is None and constants is None: - return super().__call__(inputs, **kwargs) - - # If any of `initial_state` or `constants` are specified and are Keras - # tensors, then add them to the inputs and temporarily modify the - # input_spec to include them. - - additional_inputs = [] - additional_specs = [] - if initial_state is not None: - additional_inputs += initial_state - self.state_spec = tf.nest.map_structure( - lambda s: InputSpec(shape=backend.int_shape(s)), initial_state) - additional_specs += self.state_spec - if constants is not None: - additional_inputs += constants - self.constants_spec = [ - InputSpec(shape=backend.int_shape(constant)) for constant in constants - ] - self._num_constants = len(constants) - additional_specs += self.constants_spec - # additional_inputs can be empty if initial_state or constants are provided - # but empty (e.g. the cell is stateless). - flat_additional_inputs = tf.nest.flatten(additional_inputs) - is_keras_tensor = backend.is_keras_tensor( - flat_additional_inputs[0]) if flat_additional_inputs else True - for tensor in flat_additional_inputs: - if backend.is_keras_tensor(tensor) != is_keras_tensor: - raise ValueError( - 'The initial state or constants of an RNN layer cannot be ' - 'specified via a mix of Keras tensors and non-Keras tensors ' - '(a "Keras tensor" is a tensor that was returned by a Keras layer ' - ' or by `Input` during Functional model construction). 
' - f'Received: initial_state={initial_state}, constants={constants}') - - if is_keras_tensor: - # Compute the full input spec, including state and constants - full_input = [inputs] + additional_inputs - if self.built: - # Keep the input_spec since it has been populated in build() method. - full_input_spec = self.input_spec + additional_specs - else: - # The original input_spec is None since there could be a nested tensor - # input. Update the input_spec to match the inputs. - full_input_spec = generic_utils.to_list( - tf.nest.map_structure(lambda _: None, inputs)) + additional_specs - # Perform the call with temporarily replaced input_spec - self.input_spec = full_input_spec - output = super().__call__(full_input, **kwargs) - # Remove the additional_specs from input spec and keep the rest. It is - # important to keep since the input spec was populated by build(), and - # will be reused in the stateful=True. - self.input_spec = self.input_spec[:-len(additional_specs)] - return output - else: - if initial_state is not None: - kwargs['initial_state'] = initial_state - if constants is not None: - kwargs['constants'] = constants - return super().__call__(inputs, **kwargs) - - def call(self, - inputs, - mask=None, - training=None, - initial_state=None, - constants=None): - # The input should be dense, padded with zeros. If a ragged input is fed - # into the layer, it is padded and the row lengths are used for masking. - inputs, row_lengths = backend.convert_inputs_if_ragged(inputs) - is_ragged_input = (row_lengths is not None) - self._validate_args_if_ragged(is_ragged_input, mask) - - inputs, initial_state, constants = self._process_inputs( - inputs, initial_state, constants) - - self._maybe_reset_cell_dropout_mask(self.cell) - if isinstance(self.cell, StackedRNNCells): - for cell in self.cell.cells: - self._maybe_reset_cell_dropout_mask(cell) - - if mask is not None: - # Time step masks must be the same for each input. - # TODO(scottzhu): Should we accept multiple different masks? - mask = tf.nest.flatten(mask)[0] - - if tf.nest.is_nested(inputs): - # In the case of nested input, use the first element for shape check. - input_shape = backend.int_shape(tf.nest.flatten(inputs)[0]) - else: - input_shape = backend.int_shape(inputs) - timesteps = input_shape[0] if self.time_major else input_shape[1] - if self.unroll and timesteps is None: - raise ValueError('Cannot unroll a RNN if the ' - 'time dimension is undefined. \n' - '- If using a Sequential model, ' - 'specify the time dimension by passing ' - 'an `input_shape` or `batch_input_shape` ' - 'argument to your first layer. If your ' - 'first layer is an Embedding, you can ' - 'also use the `input_length` argument.\n' - '- If using the functional API, specify ' - 'the time dimension by passing a `shape` ' - 'or `batch_shape` argument to your Input layer.') - - kwargs = {} - if generic_utils.has_arg(self.cell.call, 'training'): - kwargs['training'] = training - - # TF RNN cells expect single tensor as state instead of list wrapped tensor. - is_tf_rnn_cell = getattr(self.cell, '_is_tf_rnn_cell', None) is not None - # Use the __call__ function for callable objects, eg layers, so that it - # will have the proper name scopes for the ops, etc. - cell_call_fn = self.cell.__call__ if callable(self.cell) else self.cell.call - if constants: - if not generic_utils.has_arg(self.cell.call, 'constants'): - raise ValueError( - f'RNN cell {self.cell} does not support constants. 
' - f'Received: constants={constants}') - - def step(inputs, states): - constants = states[-self._num_constants:] # pylint: disable=invalid-unary-operand-type - states = states[:-self._num_constants] # pylint: disable=invalid-unary-operand-type - - states = states[0] if len(states) == 1 and is_tf_rnn_cell else states - output, new_states = cell_call_fn( - inputs, states, constants=constants, **kwargs) - if not tf.nest.is_nested(new_states): - new_states = [new_states] - return output, new_states - else: - - def step(inputs, states): - states = states[0] if len(states) == 1 and is_tf_rnn_cell else states - output, new_states = cell_call_fn(inputs, states, **kwargs) - if not tf.nest.is_nested(new_states): - new_states = [new_states] - return output, new_states - last_output, outputs, states = backend.rnn( - step, + def _get_state_shape(flat_state): + state_shape = [batch] + tf.TensorShape(flat_state).as_list() + return tf.TensorShape(state_shape) + + state_shape = tf.nest.map_structure(_get_state_shape, state_size) + return generic_utils.to_list(output_shape) + tf.nest.flatten( + state_shape + ) + else: + return output_shape + + def compute_mask(self, inputs, mask): + # Time step masks must be the same for each input. + # This is because the mask for an RNN is of size [batch, time_steps, 1], + # and specifies which time steps should be skipped, and a time step + # must be skipped for all inputs. + # TODO(scottzhu): Should we accept multiple different masks? + mask = tf.nest.flatten(mask)[0] + output_mask = mask if self.return_sequences else None + if self.return_state: + state_mask = [None for _ in self.states] + return [output_mask] + state_mask + else: + return output_mask + + def build(self, input_shape): + if isinstance(input_shape, list): + input_shape = input_shape[0] + # The input_shape here could be a nest structure. + + # do the tensor_shape to shapes here. The input could be single tensor, + # or a nested structure of tensors. + def get_input_spec(shape): + """Convert input shape to InputSpec.""" + if isinstance(shape, tf.TensorShape): + input_spec_shape = shape.as_list() + else: + input_spec_shape = list(shape) + batch_index, time_step_index = (1, 0) if self.time_major else (0, 1) + if not self.stateful: + input_spec_shape[batch_index] = None + input_spec_shape[time_step_index] = None + return InputSpec(shape=tuple(input_spec_shape)) + + def get_step_input_shape(shape): + if isinstance(shape, tf.TensorShape): + shape = tuple(shape.as_list()) + # remove the timestep from the input_shape + return shape[1:] if self.time_major else (shape[0],) + shape[2:] + + def get_state_spec(shape): + state_spec_shape = tf.TensorShape(shape).as_list() + # append batch dim + state_spec_shape = [None] + state_spec_shape + return InputSpec(shape=tuple(state_spec_shape)) + + # Check whether the input shape contains any nested shapes. It could be + # (tensor_shape(1, 2), tensor_shape(3, 4)) or (1, 2, 3) which is from + # numpy inputs. + try: + input_shape = tf.TensorShape(input_shape) + except (ValueError, TypeError): + # A nested tensor input + pass + + if not tf.nest.is_nested(input_shape): + # This indicates the there is only one input. 
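# --- Illustrative aside; not part of this patch. A minimal sketch, assuming
# only the public tf.keras API: the InputSpec that build() derives above
# leaves the batch and time axes free unless the layer is stateful.
import tensorflow as tf

layer = tf.keras.layers.SimpleRNN(3)
layer.build(tf.TensorShape([8, 10, 4]))  # (batch, time, features)
print(tuple(layer.input_spec[0].shape))  # (None, None, 4): batch/time free
# --- end aside ---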
+ if self.input_spec is not None: + self.input_spec[0] = get_input_spec(input_shape) + else: + self.input_spec = [get_input_spec(input_shape)] + step_input_shape = get_step_input_shape(input_shape) + else: + if self.input_spec is not None: + self.input_spec[0] = tf.nest.map_structure( + get_input_spec, input_shape + ) + else: + self.input_spec = generic_utils.to_list( + tf.nest.map_structure(get_input_spec, input_shape) + ) + step_input_shape = tf.nest.map_structure( + get_step_input_shape, input_shape + ) + + # allow cell (if layer) to build before we set or validate state_spec. + if isinstance(self.cell, base_layer.Layer) and not self.cell.built: + with backend.name_scope(self.cell.name): + self.cell.build(step_input_shape) + self.cell.built = True + + # set or validate state_spec + if rnn_utils.is_multiple_state(self.cell.state_size): + state_size = list(self.cell.state_size) + else: + state_size = [self.cell.state_size] + + if self.state_spec is not None: + # initial_state was passed in call, check compatibility + self._validate_state_spec(state_size, self.state_spec) + else: + if tf.nest.is_nested(state_size): + self.state_spec = tf.nest.map_structure( + get_state_spec, state_size + ) + else: + self.state_spec = [ + InputSpec(shape=[None] + tf.TensorShape(dim).as_list()) + for dim in state_size + ] + # ensure the generated state_spec is correct. + self._validate_state_spec(state_size, self.state_spec) + if self.stateful: + self.reset_states() + super().build(input_shape) + + @staticmethod + def _validate_state_spec(cell_state_sizes, init_state_specs): + """Validate the state spec between the initial_state and the state_size. + + Args: + cell_state_sizes: list, the `state_size` attribute from the cell. + init_state_specs: list, the `state_spec` from the initial_state that + is passed in `call()`. + + Raises: + ValueError: When initial state spec is not compatible with the state + size. + """ + validation_error = ValueError( + "An `initial_state` was passed that is not compatible with " + "`cell.state_size`. Received `state_spec`={}; " + "however `cell.state_size` is " + "{}".format(init_state_specs, cell_state_sizes) + ) + flat_cell_state_sizes = tf.nest.flatten(cell_state_sizes) + flat_state_specs = tf.nest.flatten(init_state_specs) + + if len(flat_cell_state_sizes) != len(flat_state_specs): + raise validation_error + for cell_state_spec, cell_state_size in zip( + flat_state_specs, flat_cell_state_sizes + ): + if not tf.TensorShape( + # Ignore the first axis for init_state which is for batch + cell_state_spec.shape[1:] + ).is_compatible_with(tf.TensorShape(cell_state_size)): + raise validation_error + + @doc_controls.do_not_doc_inheritable + def get_initial_state(self, inputs): + get_initial_state_fn = getattr(self.cell, "get_initial_state", None) + + if tf.nest.is_nested(inputs): + # The input are nested sequences. Use the first element in the seq + # to get batch size and dtype. + inputs = tf.nest.flatten(inputs)[0] + + input_shape = tf.shape(inputs) + batch_size = input_shape[1] if self.time_major else input_shape[0] + dtype = inputs.dtype + if get_initial_state_fn: + init_state = get_initial_state_fn( + inputs=None, batch_size=batch_size, dtype=dtype + ) + else: + init_state = rnn_utils.generate_zero_filled_state( + batch_size, self.cell.state_size, dtype + ) + # Keras RNN expect the states in a list, even if it's a single state + # tensor. 
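# --- Illustrative aside; not part of this patch. A minimal sketch, assuming
# only the public tf.keras API: the zero-filled default used when the cell
# has no custom initial state, and the single-tensor case that the
# list-wrapping above exists for.
import tensorflow as tf

cell = tf.keras.layers.SimpleRNNCell(5)
state = cell.get_initial_state(batch_size=3, dtype=tf.float32)
state = state if isinstance(state, (list, tuple)) else [state]
print([s.shape.as_list() for s in state])  # [[3, 5]]
# --- end aside ---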
+ if not tf.nest.is_nested(init_state): + init_state = [init_state] + # Force the state to be a list in case it is a namedtuple eg + # LSTMStateTuple. + return list(init_state) + + def __call__(self, inputs, initial_state=None, constants=None, **kwargs): + inputs, initial_state, constants = rnn_utils.standardize_args( + inputs, initial_state, constants, self._num_constants + ) + + if initial_state is None and constants is None: + return super().__call__(inputs, **kwargs) + + # If any of `initial_state` or `constants` are specified and are Keras + # tensors, then add them to the inputs and temporarily modify the + # input_spec to include them. + + additional_inputs = [] + additional_specs = [] + if initial_state is not None: + additional_inputs += initial_state + self.state_spec = tf.nest.map_structure( + lambda s: InputSpec(shape=backend.int_shape(s)), initial_state + ) + additional_specs += self.state_spec + if constants is not None: + additional_inputs += constants + self.constants_spec = [ + InputSpec(shape=backend.int_shape(constant)) + for constant in constants + ] + self._num_constants = len(constants) + additional_specs += self.constants_spec + # additional_inputs can be empty if initial_state or constants are + # provided but empty (e.g. the cell is stateless). + flat_additional_inputs = tf.nest.flatten(additional_inputs) + is_keras_tensor = ( + backend.is_keras_tensor(flat_additional_inputs[0]) + if flat_additional_inputs + else True + ) + for tensor in flat_additional_inputs: + if backend.is_keras_tensor(tensor) != is_keras_tensor: + raise ValueError( + "The initial state or constants of an RNN layer cannot be " + "specified via a mix of Keras tensors and non-Keras " + 'tensors (a "Keras tensor" is a tensor that was returned ' + "by a Keras layer or by `Input` during Functional " + "model construction). Received: " + f"initial_state={initial_state}, constants={constants}" + ) + + if is_keras_tensor: + # Compute the full input spec, including state and constants + full_input = [inputs] + additional_inputs + if self.built: + # Keep the input_spec since it has been populated in build() + # method. + full_input_spec = self.input_spec + additional_specs + else: + # The original input_spec is None since there could be a nested + # tensor input. Update the input_spec to match the inputs. + full_input_spec = ( + generic_utils.to_list( + tf.nest.map_structure(lambda _: None, inputs) + ) + + additional_specs + ) + # Perform the call with temporarily replaced input_spec + self.input_spec = full_input_spec + output = super().__call__(full_input, **kwargs) + # Remove the additional_specs from input spec and keep the rest. It + # is important to keep since the input spec was populated by + # build(), and will be reused in the stateful=True. 
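# --- Illustrative aside; not part of this patch. A minimal sketch, assuming
# only the public tf.keras API: a symbolic initial_state is folded into the
# layer's inputs by the __call__ logic above, so it becomes an ordinary
# model input.
import numpy as np
import tensorflow as tf

x = tf.keras.Input((None, 4))
s = tf.keras.Input((8,))
y = tf.keras.layers.RNN(tf.keras.layers.GRUCell(8))(x, initial_state=s)
model = tf.keras.Model([x, s], y)
print(model.predict([np.zeros((2, 5, 4)), np.ones((2, 8))]).shape)  # (2, 8)
# --- end aside ---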
+ self.input_spec = self.input_spec[: -len(additional_specs)] + return output + else: + if initial_state is not None: + kwargs["initial_state"] = initial_state + if constants is not None: + kwargs["constants"] = constants + return super().__call__(inputs, **kwargs) + + def call( + self, inputs, - initial_state, - constants=constants, - go_backwards=self.go_backwards, - mask=mask, - unroll=self.unroll, - input_length=row_lengths if row_lengths is not None else timesteps, - time_major=self.time_major, - zero_output_for_mask=self.zero_output_for_mask, - return_all_outputs=self.return_sequences) - - if self.stateful: - updates = [ - tf.compat.v1.assign(self_state, tf.cast(state, self_state.dtype)) - for self_state, state in zip( - tf.nest.flatten(self.states), tf.nest.flatten(states)) - ] - self.add_update(updates) - - if self.return_sequences: - output = backend.maybe_convert_to_ragged( - is_ragged_input, outputs, row_lengths, go_backwards=self.go_backwards) - else: - output = last_output - - if self.return_state: - if not isinstance(states, (list, tuple)): - states = [states] - else: - states = list(states) - return generic_utils.to_list(output) + states - else: - return output - - def _process_inputs(self, inputs, initial_state, constants): - # input shape: `(samples, time (padded with zeros), input_dim)` - # note that the .build() method of subclasses MUST define - # self.input_spec and self.state_spec with complete input shapes. - if (isinstance(inputs, collections.abc.Sequence) - and not isinstance(inputs, tuple)): - # get initial_state from full input spec - # as they could be copied to multiple GPU. - if not self._num_constants: - initial_state = inputs[1:] - else: - initial_state = inputs[1:-self._num_constants] - constants = inputs[-self._num_constants:] - if len(initial_state) == 0: - initial_state = None - inputs = inputs[0] - - if self.stateful: - if initial_state is not None: - # When layer is stateful and initial_state is provided, check if the - # recorded state is same as the default value (zeros). Use the recorded - # state if it is not same as the default. - non_zero_count = tf.add_n([tf.math.count_nonzero(s) - for s in tf.nest.flatten(self.states)]) - # Set strict = True to keep the original structure of the state. - initial_state = tf.compat.v1.cond(non_zero_count > 0, - true_fn=lambda: self.states, - false_fn=lambda: initial_state, - strict=True) - else: - initial_state = self.states - initial_state = tf.nest.map_structure( - # When the layer has a inferred dtype, use the dtype from the cell. - lambda v: tf.cast(v, self.compute_dtype or self.cell.compute_dtype), - initial_state - ) - elif initial_state is None: - initial_state = self.get_initial_state(inputs) - - if len(initial_state) != len(self.states): - raise ValueError(f'Layer has {len(self.states)} ' - f'states but was passed {len(initial_state)} initial ' - f'states. Received: initial_state={initial_state}') - return inputs, initial_state, constants - - def _validate_args_if_ragged(self, is_ragged_input, mask): - if not is_ragged_input: - return - - if mask is not None: - raise ValueError(f'The mask that was passed in was {mask}, which ' - 'cannot be applied to RaggedTensor inputs. Please ' - 'make sure that there is no mask injected by upstream ' - 'layers.') - if self.unroll: - raise ValueError('The input received contains RaggedTensors and does ' - 'not support unrolling. 
Disable unrolling by passing ' - '`unroll=False` in the RNN Layer constructor.') - - def _maybe_reset_cell_dropout_mask(self, cell): - if isinstance(cell, DropoutRNNCellMixin): - cell.reset_dropout_mask() - cell.reset_recurrent_dropout_mask() - - def reset_states(self, states=None): - """Reset the recorded states for the stateful RNN layer. - - Can only be used when RNN layer is constructed with `stateful` = `True`. - Args: - states: Numpy arrays that contains the value for the initial state, which - will be feed to cell at the first time step. When the value is None, - zero filled numpy array will be created based on the cell state size. - - Raises: - AttributeError: When the RNN layer is not stateful. - ValueError: When the batch size of the RNN layer is unknown. - ValueError: When the input numpy array is not compatible with the RNN - layer state, either size wise or dtype wise. - """ - if not self.stateful: - raise AttributeError('Layer must be stateful.') - spec_shape = None - if self.input_spec is not None: - spec_shape = tf.nest.flatten(self.input_spec[0])[0].shape - if spec_shape is None: - # It is possible to have spec shape to be None, eg when construct a RNN - # with a custom cell, or standard RNN layers (LSTM/GRU) which we only know - # it has 3 dim input, but not its full shape spec before build(). - batch_size = None - else: - batch_size = spec_shape[1] if self.time_major else spec_shape[0] - if not batch_size: - raise ValueError('If a RNN is stateful, it needs to know ' - 'its batch size. Specify the batch size ' - 'of your input tensors: \n' - '- If using a Sequential model, ' - 'specify the batch size by passing ' - 'a `batch_input_shape` ' - 'argument to your first layer.\n' - '- If using the functional API, specify ' - 'the batch size by passing a ' - '`batch_shape` argument to your Input layer.') - # initialize state if None - if tf.nest.flatten(self.states)[0] is None: - if getattr(self.cell, 'get_initial_state', None): - flat_init_state_values = tf.nest.flatten(self.cell.get_initial_state( - inputs=None, batch_size=batch_size, - # Use variable_dtype instead of compute_dtype, since the state is - # stored in a variable - dtype=self.variable_dtype or backend.floatx())) - else: - flat_init_state_values = tf.nest.flatten( - rnn_utils.generate_zero_filled_state( - batch_size, self.cell.state_size, self.variable_dtype or - backend.floatx())) - flat_states_variables = tf.nest.map_structure( - backend.variable, flat_init_state_values) - self.states = tf.nest.pack_sequence_as(self.cell.state_size, - flat_states_variables) - if not tf.nest.is_nested(self.states): - self.states = [self.states] - elif states is None: - for state, size in zip(tf.nest.flatten(self.states), - tf.nest.flatten(self.cell.state_size)): - backend.set_value( - state, - np.zeros([batch_size] + tf.TensorShape(size).as_list())) - else: - flat_states = tf.nest.flatten(self.states) - flat_input_states = tf.nest.flatten(states) - if len(flat_input_states) != len(flat_states): - raise ValueError(f'Layer {self.name} expects {len(flat_states)} ' - f'states, but it received {len(flat_input_states)} ' - f'state values. 
States received: {states}') - set_value_tuples = [] - for i, (value, state) in enumerate(zip(flat_input_states, - flat_states)): - if value.shape != state.shape: - raise ValueError( - f'State {i} is incompatible with layer {self.name}: ' - f'expected shape={(batch_size, state)} ' - f'but found shape={value.shape}') - set_value_tuples.append((state, value)) - backend.batch_set_value(set_value_tuples) - - def get_config(self): - config = { - 'return_sequences': self.return_sequences, - 'return_state': self.return_state, - 'go_backwards': self.go_backwards, - 'stateful': self.stateful, - 'unroll': self.unroll, - 'time_major': self.time_major - } - if self._num_constants: - config['num_constants'] = self._num_constants - if self.zero_output_for_mask: - config['zero_output_for_mask'] = self.zero_output_for_mask - - config['cell'] = generic_utils.serialize_keras_object(self.cell) - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - from keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top - cell = deserialize_layer(config.pop('cell'), custom_objects=custom_objects) - num_constants = config.pop('num_constants', 0) - layer = cls(cell, **config) - layer._num_constants = num_constants # pylint: disable=protected-access - return layer - - @property - def _trackable_saved_model_saver(self): - return layer_serialization.RNNSavedModelSaver(self) + mask=None, + training=None, + initial_state=None, + constants=None, + ): + # The input should be dense, padded with zeros. If a ragged input is fed + # into the layer, it is padded and the row lengths are used for masking. + inputs, row_lengths = backend.convert_inputs_if_ragged(inputs) + is_ragged_input = row_lengths is not None + self._validate_args_if_ragged(is_ragged_input, mask) + + inputs, initial_state, constants = self._process_inputs( + inputs, initial_state, constants + ) + + self._maybe_reset_cell_dropout_mask(self.cell) + if isinstance(self.cell, StackedRNNCells): + for cell in self.cell.cells: + self._maybe_reset_cell_dropout_mask(cell) + + if mask is not None: + # Time step masks must be the same for each input. + # TODO(scottzhu): Should we accept multiple different masks? + mask = tf.nest.flatten(mask)[0] + + if tf.nest.is_nested(inputs): + # In the case of nested input, use the first element for shape + # check. + input_shape = backend.int_shape(tf.nest.flatten(inputs)[0]) + else: + input_shape = backend.int_shape(inputs) + timesteps = input_shape[0] if self.time_major else input_shape[1] + if self.unroll and timesteps is None: + raise ValueError( + "Cannot unroll a RNN if the " + "time dimension is undefined. \n" + "- If using a Sequential model, " + "specify the time dimension by passing " + "an `input_shape` or `batch_input_shape` " + "argument to your first layer. If your " + "first layer is an Embedding, you can " + "also use the `input_length` argument.\n" + "- If using the functional API, specify " + "the time dimension by passing a `shape` " + "or `batch_shape` argument to your Input layer." + ) + + kwargs = {} + if generic_utils.has_arg(self.cell.call, "training"): + kwargs["training"] = training + + # TF RNN cells expect single tensor as state instead of list wrapped + # tensor. + is_tf_rnn_cell = getattr(self.cell, "_is_tf_rnn_cell", None) is not None + # Use the __call__ function for callable objects, eg layers, so that it + # will have the proper name scopes for the ops, etc. 
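# --- Illustrative aside; not part of this patch. A minimal sketch, assuming
# only the public tf.keras API: a RaggedTensor batch is densified and its
# row lengths become the mask, as convert_inputs_if_ragged arranges above;
# combining ragged inputs with unroll=True raises the ValueError shown.
import tensorflow as tf

ragged = tf.ragged.constant(
    [[[1.0], [2.0], [3.0]],  # length-3 sequence
     [[4.0]]],               # length-1 sequence, padded internally
    ragged_rank=1,
)
print(tf.keras.layers.SimpleRNN(2)(ragged).shape)  # (2, 2)
# --- end aside ---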
+ cell_call_fn = ( + self.cell.__call__ if callable(self.cell) else self.cell.call + ) + if constants: + if not generic_utils.has_arg(self.cell.call, "constants"): + raise ValueError( + f"RNN cell {self.cell} does not support constants. " + f"Received: constants={constants}" + ) + + def step(inputs, states): + constants = states[-self._num_constants :] + states = states[: -self._num_constants] + + states = ( + states[0] if len(states) == 1 and is_tf_rnn_cell else states + ) + output, new_states = cell_call_fn( + inputs, states, constants=constants, **kwargs + ) + if not tf.nest.is_nested(new_states): + new_states = [new_states] + return output, new_states + + else: + + def step(inputs, states): + states = ( + states[0] if len(states) == 1 and is_tf_rnn_cell else states + ) + output, new_states = cell_call_fn(inputs, states, **kwargs) + if not tf.nest.is_nested(new_states): + new_states = [new_states] + return output, new_states + + last_output, outputs, states = backend.rnn( + step, + inputs, + initial_state, + constants=constants, + go_backwards=self.go_backwards, + mask=mask, + unroll=self.unroll, + input_length=row_lengths if row_lengths is not None else timesteps, + time_major=self.time_major, + zero_output_for_mask=self.zero_output_for_mask, + return_all_outputs=self.return_sequences, + ) + + if self.stateful: + updates = [ + tf.compat.v1.assign( + self_state, tf.cast(state, self_state.dtype) + ) + for self_state, state in zip( + tf.nest.flatten(self.states), tf.nest.flatten(states) + ) + ] + self.add_update(updates) + + if self.return_sequences: + output = backend.maybe_convert_to_ragged( + is_ragged_input, + outputs, + row_lengths, + go_backwards=self.go_backwards, + ) + else: + output = last_output + + if self.return_state: + if not isinstance(states, (list, tuple)): + states = [states] + else: + states = list(states) + return generic_utils.to_list(output) + states + else: + return output + + def _process_inputs(self, inputs, initial_state, constants): + # input shape: `(samples, time (padded with zeros), input_dim)` + # note that the .build() method of subclasses MUST define + # self.input_spec and self.state_spec with complete input shapes. + if isinstance(inputs, collections.abc.Sequence) and not isinstance( + inputs, tuple + ): + # get initial_state from full input spec + # as they could be copied to multiple GPU. + if not self._num_constants: + initial_state = inputs[1:] + else: + initial_state = inputs[1 : -self._num_constants] + constants = inputs[-self._num_constants :] + if len(initial_state) == 0: + initial_state = None + inputs = inputs[0] + + if self.stateful: + if initial_state is not None: + # When layer is stateful and initial_state is provided, check if + # the recorded state is same as the default value (zeros). Use + # the recorded state if it is not same as the default. + non_zero_count = tf.add_n( + [ + tf.math.count_nonzero(s) + for s in tf.nest.flatten(self.states) + ] + ) + # Set strict = True to keep the original structure of the state. + initial_state = tf.compat.v1.cond( + non_zero_count > 0, + true_fn=lambda: self.states, + false_fn=lambda: initial_state, + strict=True, + ) + else: + initial_state = self.states + initial_state = tf.nest.map_structure( + # When the layer has a inferred dtype, use the dtype from the + # cell. 
+ lambda v: tf.cast( + v, self.compute_dtype or self.cell.compute_dtype + ), + initial_state, + ) + elif initial_state is None: + initial_state = self.get_initial_state(inputs) + + if len(initial_state) != len(self.states): + raise ValueError( + f"Layer has {len(self.states)} " + f"states but was passed {len(initial_state)} initial " + f"states. Received: initial_state={initial_state}" + ) + return inputs, initial_state, constants + + def _validate_args_if_ragged(self, is_ragged_input, mask): + if not is_ragged_input: + return + + if mask is not None: + raise ValueError( + f"The mask that was passed in was {mask}, which " + "cannot be applied to RaggedTensor inputs. Please " + "make sure that there is no mask injected by upstream " + "layers." + ) + if self.unroll: + raise ValueError( + "The input received contains RaggedTensors and does " + "not support unrolling. Disable unrolling by passing " + "`unroll=False` in the RNN Layer constructor." + ) + + def _maybe_reset_cell_dropout_mask(self, cell): + if isinstance(cell, DropoutRNNCellMixin): + cell.reset_dropout_mask() + cell.reset_recurrent_dropout_mask() + + def reset_states(self, states=None): + """Reset the recorded states for the stateful RNN layer. + + Can only be used when RNN layer is constructed with `stateful` = `True`. + Args: + states: Numpy arrays that contains the value for the initial state, + which will be feed to cell at the first time step. When the value is + None, zero filled numpy array will be created based on the cell + state size. + + Raises: + AttributeError: When the RNN layer is not stateful. + ValueError: When the batch size of the RNN layer is unknown. + ValueError: When the input numpy array is not compatible with the RNN + layer state, either size wise or dtype wise. + """ + if not self.stateful: + raise AttributeError("Layer must be stateful.") + spec_shape = None + if self.input_spec is not None: + spec_shape = tf.nest.flatten(self.input_spec[0])[0].shape + if spec_shape is None: + # It is possible to have spec shape to be None, eg when construct a + # RNN with a custom cell, or standard RNN layers (LSTM/GRU) which we + # only know it has 3 dim input, but not its full shape spec before + # build(). + batch_size = None + else: + batch_size = spec_shape[1] if self.time_major else spec_shape[0] + if not batch_size: + raise ValueError( + "If a RNN is stateful, it needs to know " + "its batch size. Specify the batch size " + "of your input tensors: \n" + "- If using a Sequential model, " + "specify the batch size by passing " + "a `batch_input_shape` " + "argument to your first layer.\n" + "- If using the functional API, specify " + "the batch size by passing a " + "`batch_shape` argument to your Input layer." 
+ ) + # initialize state if None + if tf.nest.flatten(self.states)[0] is None: + if getattr(self.cell, "get_initial_state", None): + flat_init_state_values = tf.nest.flatten( + self.cell.get_initial_state( + inputs=None, + batch_size=batch_size, + # Use variable_dtype instead of compute_dtype, since the + # state is stored in a variable + dtype=self.variable_dtype or backend.floatx(), + ) + ) + else: + flat_init_state_values = tf.nest.flatten( + rnn_utils.generate_zero_filled_state( + batch_size, + self.cell.state_size, + self.variable_dtype or backend.floatx(), + ) + ) + flat_states_variables = tf.nest.map_structure( + lambda v: backend.variable(v, v.dtype), flat_init_state_values + ) + self.states = tf.nest.pack_sequence_as( + self.cell.state_size, flat_states_variables + ) + if not tf.nest.is_nested(self.states): + self.states = [self.states] + elif states is None: + for state, size in zip( + tf.nest.flatten(self.states), + tf.nest.flatten(self.cell.state_size), + ): + backend.set_value( + state, + np.zeros([batch_size] + tf.TensorShape(size).as_list()), + ) + else: + flat_states = tf.nest.flatten(self.states) + flat_input_states = tf.nest.flatten(states) + if len(flat_input_states) != len(flat_states): + raise ValueError( + f"Layer {self.name} expects {len(flat_states)} " + f"states, but it received {len(flat_input_states)} " + f"state values. States received: {states}" + ) + set_value_tuples = [] + for i, (value, state) in enumerate( + zip(flat_input_states, flat_states) + ): + if value.shape != state.shape: + raise ValueError( + f"State {i} is incompatible with layer {self.name}: " + f"expected shape={(batch_size, state)} " + f"but found shape={value.shape}" + ) + set_value_tuples.append((state, value)) + backend.batch_set_value(set_value_tuples) + + def get_config(self): + config = { + "return_sequences": self.return_sequences, + "return_state": self.return_state, + "go_backwards": self.go_backwards, + "stateful": self.stateful, + "unroll": self.unroll, + "time_major": self.time_major, + } + if self._num_constants: + config["num_constants"] = self._num_constants + if self.zero_output_for_mask: + config["zero_output_for_mask"] = self.zero_output_for_mask + + config["cell"] = serialization_lib.serialize_keras_object(self.cell) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + from keras.layers import deserialize as deserialize_layer + + cell = deserialize_layer( + config.pop("cell"), custom_objects=custom_objects + ) + num_constants = config.pop("num_constants", 0) + layer = cls(cell, **config) + layer._num_constants = num_constants + return layer + + @property + def _trackable_saved_model_saver(self): + return layer_serialization.RNNSavedModelSaver(self) diff --git a/keras/layers/rnn/base_rnn_test.py b/keras/layers/rnn/base_rnn_test.py index a010879bb656..7b0182a15cb2 100644 --- a/keras/layers/rnn/base_rnn_test.py +++ b/keras/layers/rnn/base_rnn_test.py @@ -20,7 +20,10 @@ import collections +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.engine import base_layer_utils from keras.layers.rnn import gru @@ -29,1914 +32,2136 @@ from keras.layers.rnn import lstm_v1 from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.utils import generic_utils -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.python.training.tracking 
import util as trackable_util +# isort: off +from tensorflow.python.checkpoint import ( + checkpoint as trackable_util, +) # Used for nested input/output/state RNN test. -NestedInput = collections.namedtuple('NestedInput', ['t1', 't2']) -NestedState = collections.namedtuple('NestedState', ['s1', 's2']) +NestedInput = collections.namedtuple("NestedInput", ["t1", "t2"]) +NestedState = collections.namedtuple("NestedState", ["s1", "s2"]) @test_combinations.run_all_keras_modes class RNNTest(test_combinations.TestCase): + def test_minimal_rnn_cell_non_layer(self): + class MinimalRNNCell: + def __init__(self, units, input_dim): + self.units = units + self.state_size = units + self.kernel = keras.backend.variable( + np.random.random((input_dim, units)) + ) + + def call(self, inputs, states): + prev_output = states[0] + output = keras.backend.dot(inputs, self.kernel) + prev_output + return output, [output] + + # Basic test case. + cell = MinimalRNNCell(32, 5) + x = keras.Input((None, 5)) + layer = keras.layers.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacking. + cells = [ + MinimalRNNCell(8, 5), + MinimalRNNCell(32, 8), + MinimalRNNCell(32, 32), + ] + layer = keras.layers.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + def test_minimal_rnn_cell_non_layer_multiple_states(self): + class MinimalRNNCell: + def __init__(self, units, input_dim): + self.units = units + self.state_size = (units, units) + self.kernel = keras.backend.variable( + np.random.random((input_dim, units)) + ) + + def call(self, inputs, states): + prev_output_1 = states[0] + prev_output_2 = states[1] + output = keras.backend.dot(inputs, self.kernel) + output += prev_output_1 + output -= prev_output_2 + return output, [output * 2, output * 3] + + # Basic test case. + cell = MinimalRNNCell(32, 5) + x = keras.Input((None, 5)) + layer = keras.layers.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacking. 
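# --- Illustrative aside; not part of this test. A minimal sketch, assuming
# only the public tf.keras API: a list of cells passed to keras.layers.RNN
# is wrapped in StackedRNNCells, so each cell feeds the next and the last
# cell sets the output width.
import tensorflow as tf

stacked = tf.keras.layers.RNN(
    [tf.keras.layers.SimpleRNNCell(8), tf.keras.layers.SimpleRNNCell(32)]
)
assert isinstance(stacked.cell, tf.keras.layers.StackedRNNCells)
print(stacked.cell.output_size)  # 32
# --- end aside ---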
+ cells = [ + MinimalRNNCell(8, 5), + MinimalRNNCell(16, 8), + MinimalRNNCell(32, 16), + ] + layer = keras.layers.RNN(cells) + self.assertEqual(layer.cell.state_size, ((8, 8), (16, 16), (32, 32))) + self.assertEqual(layer.cell.output_size, 32) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + def test_minimal_rnn_cell_layer(self): + class MinimalRNNCell(keras.layers.Layer): + def __init__(self, units, **kwargs): + self.units = units + self.state_size = units + super().__init__(**kwargs) + + def build(self, input_shape): + self.kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer="uniform", + name="kernel", + ) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer="uniform", + name="recurrent_kernel", + ) + self.built = True + + def call(self, inputs, states): + prev_output = states[0] + h = keras.backend.dot(inputs, self.kernel) + output = h + keras.backend.dot( + prev_output, self.recurrent_kernel + ) + return output, [output] + + def get_config(self): + config = {"units": self.units} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + # Test basic case. + x = keras.Input((None, 5)) + cell = MinimalRNNCell(32) + layer = keras.layers.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test basic case serialization. + x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope({"MinimalRNNCell": MinimalRNNCell}): + layer = keras.layers.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + # Test stacking. + cells = [MinimalRNNCell(8), MinimalRNNCell(12), MinimalRNNCell(32)] + layer = keras.layers.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacked RNN serialization. 
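# --- Illustrative aside; not part of this test. A minimal sketch, assuming
# only the public tf.keras API: built-in cells round-trip through
# get_config/from_config with no CustomObjectScope; only custom cells such
# as MinimalRNNCell need their class registered for deserialization.
import tensorflow as tf

layer = tf.keras.layers.RNN(
    tf.keras.layers.SimpleRNNCell(7), go_backwards=True
)
clone = tf.keras.layers.RNN.from_config(layer.get_config())
print(clone.go_backwards, clone.cell.units)  # True 7
# --- end aside ---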
+ x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope({"MinimalRNNCell": MinimalRNNCell}): + layer = keras.layers.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + def test_minimal_rnn_cell_abstract_rnn_cell(self): + class MinimalRNNCell(keras.layers.AbstractRNNCell): + def __init__(self, units, **kwargs): + self.units = units + super().__init__(**kwargs) + + @property + def state_size(self): + return self.units + + def build(self, input_shape): + self.kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer="uniform", + name="kernel", + ) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer="uniform", + name="recurrent_kernel", + ) + self.built = True + + def call(self, inputs, states): + prev_output = states[0] + h = keras.backend.dot(inputs, self.kernel) + output = h + keras.backend.dot( + prev_output, self.recurrent_kernel + ) + return output, output + + @property + def output_size(self): + return self.units + + cell = MinimalRNNCell(32) + x = keras.Input((None, 5)) + layer = keras.layers.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacking. + cells = [MinimalRNNCell(8), MinimalRNNCell(16), MinimalRNNCell(32)] + layer = keras.layers.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + def test_rnn_with_time_major(self): + batch = 10 + time_step = 5 + embedding_dim = 4 + units = 3 + + # Test basic case. + x = keras.Input((time_step, embedding_dim)) + time_major_x = keras.layers.Lambda( + lambda t: tf.transpose(t, [1, 0, 2]) + )(x) + layer = keras.layers.SimpleRNN( + units, time_major=True, return_sequences=True + ) + self.assertEqual( + layer.compute_output_shape( + (time_step, None, embedding_dim) + ).as_list(), + [time_step, None, units], + ) + y = layer(time_major_x) + self.assertEqual(layer.output_shape, (time_step, None, units)) + + y = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))(y) + + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, time_step, embedding_dim)), + np.zeros((batch, time_step, units)), + ) + + # Test stacking. + x = keras.Input((time_step, embedding_dim)) + time_major_x = keras.layers.Lambda( + lambda t: tf.transpose(t, [1, 0, 2]) + )(x) + cell_units = [10, 8, 6] + cells = [keras.layers.SimpleRNNCell(cell_units[i]) for i in range(3)] + layer = keras.layers.RNN(cells, time_major=True, return_sequences=True) + y = layer(time_major_x) + self.assertEqual(layer.output_shape, (time_step, None, cell_units[-1])) + + y = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))(y) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, time_step, embedding_dim)), + np.zeros((batch, time_step, cell_units[-1])), + ) + + # Test masking. 
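# --- Illustrative aside; not part of this test. A minimal sketch, assuming
# only the public tf.keras API (plus the eager `_keras_mask` attribute that
# layer outputs carry): Masking flags all-zero timesteps, and the RNN's
# compute_mask propagates that mask so the flagged steps are skipped.
import numpy as np
import tensorflow as tf

x = np.array([[[1.0], [2.0], [0.0]]], dtype="float32")  # last step = padding
masked = tf.keras.layers.Masking(mask_value=0.0)(x)
print(masked._keras_mask.numpy())  # [[ True  True False]]
# --- end aside ---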
+ x = keras.Input((time_step, embedding_dim)) + time_major = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))( + x + ) + mask = keras.layers.Masking()(time_major) + rnn = keras.layers.SimpleRNN( + units, time_major=True, return_sequences=True + )(mask) + y = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))(rnn) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, time_step, embedding_dim)), + np.zeros((batch, time_step, units)), + ) + + # Test layer output + x = keras.Input((time_step, embedding_dim)) + rnn_1 = keras.layers.SimpleRNN(units, return_sequences=True) + y = rnn_1(x) + + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, time_step, embedding_dim)), + np.zeros((batch, time_step, units)), + ) + + x_np = np.random.random((batch, time_step, embedding_dim)) + y_np_1 = model.predict(x_np) + + time_major = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))( + x + ) + rnn_2 = keras.layers.SimpleRNN( + units, time_major=True, return_sequences=True + ) + y_2 = rnn_2(time_major) + y_2 = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))(y_2) + + model_2 = keras.models.Model(x, y_2) + rnn_2.set_weights(rnn_1.get_weights()) + + y_np_2 = model_2.predict(x_np) + self.assertAllClose(y_np_1, y_np_2, atol=1e-4) + + def test_rnn_cell_with_constants_layer(self): + # Test basic case. + x = keras.Input((None, 5)) + c = keras.Input((3,)) + cell = RNNCellWithConstants(32, constant_size=3) + layer = keras.layers.RNN(cell) + y = layer(x, constants=c) + + model = keras.models.Model([x, c], y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 3))], np.zeros((6, 32)) + ) + + # Test basic case serialization. + x_np = np.random.random((6, 5, 5)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + custom_objects = {"RNNCellWithConstants": RNNCellWithConstants} + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.RNN.from_config(config.copy()) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, c_np]) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + # test flat list inputs. + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.RNN.from_config(config.copy()) + y = layer([x, c]) + model = keras.models.Model([x, c], y) + model.set_weights(weights) + y_np_3 = model.predict([x_np, c_np]) + self.assertAllClose(y_np, y_np_3, atol=1e-4) + + # Test stacking. + cells = [ + gru.GRUCell(8), + RNNCellWithConstants(12, constant_size=3), + RNNCellWithConstants(32, constant_size=3), + ] + layer = keras.layers.RNN(cells) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 3))], np.zeros((6, 32)) + ) + + # Test GRUCell reset_after property. 
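# --- Illustrative aside; not part of this test. A minimal sketch, assuming
# only the public tf.keras API: reset_after=True applies the reset gate
# after the recurrent matmul and gives the cell separate input and
# recurrent biases (the cuDNN-compatible variant).
import tensorflow as tf

cell = tf.keras.layers.GRUCell(4, reset_after=True)
cell.build(tf.TensorShape([None, 3]))
print(cell.bias.shape)  # (2, 12): two bias sets of 3 * units each
# --- end aside ---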
+ x = keras.Input((None, 5)) + c = keras.Input((3,)) + cells = [gru.GRUCell(32, reset_after=True)] + layer = keras.layers.RNN(cells) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 3))], np.zeros((6, 32)) + ) + + # Test stacked RNN serialization + x_np = np.random.random((6, 5, 5)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.RNN.from_config(config.copy()) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, c_np]) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + def test_rnn_cell_with_non_keras_constants(self): + # Test basic case. + x = keras.Input((None, 5)) + c = tf.zeros([6, 3], dtype=tf.float32) + cell = RNNCellWithConstants(32, constant_size=3) + layer = keras.layers.RNN(cell) + y = layer(x, constants=c) + + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacking. + cells = [ + gru.GRUCell(8), + RNNCellWithConstants(12, constant_size=3), + RNNCellWithConstants(32, constant_size=3), + ] + layer = keras.layers.RNN(cells) + y = layer(x, constants=c) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + def test_rnn_cell_with_constants_layer_passing_initial_state(self): + # Test basic case. + x = keras.Input((None, 5)) + c = keras.Input((3,)) + s = keras.Input((32,)) + cell = RNNCellWithConstants(32, constant_size=3) + layer = keras.layers.RNN(cell) + y = layer(x, initial_state=s, constants=c) + model = keras.models.Model([x, s, c], y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))], + np.zeros((6, 32)), + ) + + # Test basic case serialization. 
+ x_np = np.random.random((6, 5, 5)) + s_np = np.random.random((6, 32)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, s_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + custom_objects = {"RNNCellWithConstants": RNNCellWithConstants} + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.RNN.from_config(config.copy()) + y = layer(x, initial_state=s, constants=c) + model = keras.models.Model([x, s, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, s_np, c_np]) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + # verify that state is used + y_np_2_different_s = model.predict([x_np, s_np + 10.0, c_np]) + with self.assertRaises(AssertionError): + self.assertAllClose(y_np, y_np_2_different_s, atol=1e-4) + + # test flat list inputs + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.RNN.from_config(config.copy()) + y = layer([x, s, c]) + model = keras.models.Model([x, s, c], y) + model.set_weights(weights) + y_np_3 = model.predict([x_np, s_np, c_np]) + self.assertAllClose(y_np, y_np_3, atol=1e-4) + + def test_rnn_cell_with_non_keras_constants_and_initial_state(self): + # Test basic case. + x = keras.Input((None, 5)) + c = tf.zeros([6, 3], dtype=tf.float32) + s = tf.zeros([6, 32], dtype=tf.float32) + cell = RNNCellWithConstants(32, constant_size=3) + layer = keras.layers.RNN(cell) + y = layer(x, initial_state=s, constants=c) + + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacking. + cells = [ + gru.GRUCell(8), + RNNCellWithConstants(12, constant_size=3), + RNNCellWithConstants(32, constant_size=3), + ] + layer = keras.layers.RNN(cells) + s = [ + tf.zeros([6, 8], dtype=tf.float32), + tf.zeros([6, 12], dtype=tf.float32), + tf.zeros([6, 32], dtype=tf.float32), + ] + y = layer(x, initial_state=s, constants=c) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + def test_stacked_rnn_attributes(self): + if tf.executing_eagerly(): + self.skipTest("reduce_sum is not available in eager mode.") + + cells = [keras.layers.LSTMCell(1), keras.layers.LSTMCell(1)] + layer = keras.layers.RNN(cells) + layer.build((None, None, 1)) + + # Test weights + self.assertEqual(len(layer.trainable_weights), 6) + cells[0].trainable = False + self.assertEqual(len(layer.trainable_weights), 3) + self.assertEqual(len(layer.non_trainable_weights), 3) + + # Test `get_losses_for` and `losses` + x = keras.Input((None, 1)) + loss_1 = tf.reduce_sum(x) + loss_2 = tf.reduce_sum(cells[0].kernel) + cells[0].add_loss(loss_1, inputs=x) + cells[0].add_loss(loss_2) + self.assertEqual(len(layer.losses), 2) + self.assertEqual(layer.get_losses_for(None), [loss_2]) + self.assertEqual(layer.get_losses_for(x), [loss_1]) + + # Test `updates` + cells = [keras.layers.LSTMCell(1), keras.layers.LSTMCell(1)] + layer = keras.layers.RNN(cells) + x = keras.Input((None, 1)) + _ = layer(x) + + update_1 = tf.compat.v1.assign_add( + cells[0].kernel, x[0, 0, 0] * cells[0].kernel + ) + update_2 = tf.compat.v1.assign_add( + cells[0].kernel, tf.ones_like(cells[0].kernel) + ) + # TODO(b/128682878): Remove when RNNCells are __call__'d. 
+ with base_layer_utils.call_context().enter(layer, x, True, None): + cells[0].add_update(update_1) + cells[0].add_update(update_2) + self.assertEqual(len(layer.updates), 2) + + def test_rnn_dynamic_trainability(self): + layer_class = keras.layers.SimpleRNN + embedding_dim = 4 + units = 3 + + layer = layer_class(units) + layer.build((None, None, embedding_dim)) + self.assertEqual(len(layer.weights), 3) + self.assertEqual(len(layer.trainable_weights), 3) + self.assertEqual(len(layer.non_trainable_weights), 0) + layer.trainable = False + self.assertEqual(len(layer.weights), 3) + self.assertEqual(len(layer.trainable_weights), 0) + self.assertEqual(len(layer.non_trainable_weights), 3) + layer.trainable = True + self.assertEqual(len(layer.weights), 3) + self.assertEqual(len(layer.trainable_weights), 3) + self.assertEqual(len(layer.non_trainable_weights), 0) + + @parameterized.parameters( + [keras.layers.SimpleRNN, keras.layers.GRU, keras.layers.LSTM] + ) + def test_rnn_cell_trainability(self, layer_cls): + # https://github.com/tensorflow/tensorflow/issues/32369. + layer = layer_cls(3, trainable=False) + self.assertFalse(layer.cell.trainable) + + layer.trainable = True + self.assertTrue(layer.cell.trainable) + + def test_state_reuse_with_dropout(self): + layer_class = keras.layers.SimpleRNN + embedding_dim = 4 + units = 3 + timesteps = 2 + num_samples = 2 + + input1 = keras.Input( + batch_shape=(num_samples, timesteps, embedding_dim) + ) + layer = layer_class( + units, return_state=True, return_sequences=True, dropout=0.2 + ) + state = layer(input1)[1:] + + input2 = keras.Input( + batch_shape=(num_samples, timesteps, embedding_dim) + ) + output = layer_class(units)(input2, initial_state=state) + model = keras.Model([input1, input2], output) + + inputs = [ + np.random.random((num_samples, timesteps, embedding_dim)), + np.random.random((num_samples, timesteps, embedding_dim)), + ] + model.predict(inputs) + + def test_builtin_and_custom_rnn_cell_serialization(self): + @keras.utils.register_keras_serializable(package="TestOnly") + class CustomRNNCell(keras.layers.Layer): + def __init__(self, units, **kwargs): + self.units = units + self.state_size = units + super().__init__(**kwargs) + + def build(self, input_shape): + self.kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer="uniform", + name="kernel", + ) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer="uniform", + name="recurrent_kernel", + ) + self.built = True + + def call(self, inputs, states): + prev_output = states[0] + h = keras.backend.dot(inputs, self.kernel) + output = h + keras.backend.dot( + prev_output, self.recurrent_kernel + ) + return output, [output] + + def get_config(self): + config = {"units": self.units} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + for cell_class in [ + keras.layers.SimpleRNNCell, + keras.layers.GRUCell, + keras.layers.LSTMCell, + CustomRNNCell, + ]: + # Test basic case. + x = keras.Input((None, 5)) + cell = cell_class(32) + layer = keras.layers.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + + # Test basic case serialization. 
+ x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + layer = keras.layers.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + # Test stacking. + cells = [cell_class(8), cell_class(12), cell_class(32)] + layer = keras.layers.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + + # Test stacked RNN serialization. + x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + layer = keras.layers.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer=[ + keras.layers.SimpleRNN, + gru_v1.GRU, + lstm_v1.LSTM, + gru.GRU, + lstm.LSTM, + ], + unroll=[True, False], + ) + ) + def test_rnn_dropout(self, layer, unroll): + rnn_layer = layer(3, dropout=0.1, recurrent_dropout=0.1, unroll=unroll) + if not unroll: + x = keras.Input((None, 5)) + else: + x = keras.Input((5, 5)) + y = rnn_layer(x) + model = keras.models.Model(x, y) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + x_np = np.random.random((6, 5, 5)) + y_np = np.random.random((6, 3)) + model.train_on_batch(x_np, y_np) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + cell=[ + keras.layers.SimpleRNNCell, + keras.layers.GRUCell, + keras.layers.LSTMCell, + ], + unroll=[True, False], + ) + ) + def test_stacked_rnn_dropout(self, cell, unroll): + cells = [ + cell(3, dropout=0.1, recurrent_dropout=0.1), + cell(3, dropout=0.1, recurrent_dropout=0.1), + ] + layer = keras.layers.RNN(cells, unroll=unroll) + + if not unroll: + x = keras.Input((None, 5)) + else: + x = keras.Input((5, 5)) + y = layer(x) + model = keras.models.Model(x, y) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + x_np = np.random.random((6, 5, 5)) + y_np = np.random.random((6, 3)) + model.train_on_batch(x_np, y_np) + + def test_dropout_mask_reuse(self): + # The layer is created with recurrent_initializer = zero, so that the + # the recurrent state won't affect the output. By doing this, we can + # verify the output and see if the same mask is applied to for each + # timestep. 
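# --- Illustrative aside; not part of this test. A minimal sketch, assuming
# the DropoutRNNCellMixin methods available on built-in cells: one dropout
# mask is cached per forward pass, so every timestep sees the same mask
# until the RNN layer resets it between calls, which is what the assertions
# below rely on.
import tensorflow as tf

cell = tf.keras.layers.SimpleRNNCell(3, dropout=0.5)
mask_a = cell.get_dropout_mask_for_cell(tf.ones((2, 4)), training=True)
mask_b = cell.get_dropout_mask_for_cell(tf.ones((2, 4)), training=True)
print(bool(tf.reduce_all(mask_a == mask_b)))  # True: cached mask is reused
cell.reset_dropout_mask()  # what _maybe_reset_cell_dropout_mask invokes
# --- end aside ---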
+ layer_1 = keras.layers.SimpleRNN( + 3, + dropout=0.5, + kernel_initializer="ones", + recurrent_initializer="zeros", + return_sequences=True, + unroll=True, + ) + layer_2 = keras.layers.RNN( + keras.layers.SimpleRNNCell( + 3, + dropout=0.5, + kernel_initializer="ones", + recurrent_initializer="zeros", + ), + return_sequences=True, + unroll=True, + ) + layer_3 = keras.layers.RNN( + [ + keras.layers.SimpleRNNCell( + 3, + dropout=0.5, + kernel_initializer="ones", + recurrent_initializer="zeros", + ), + keras.layers.SimpleRNNCell( + 3, + dropout=0.5, + kernel_initializer="ones", + recurrent_initializer="zeros", + ), + ], + return_sequences=True, + unroll=True, + ) + + def verify(rnn_layer): + inputs = tf.constant(1.0, shape=(6, 2, 5)) + out = rnn_layer(inputs, training=True) + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + batch_1 = self.evaluate(out) + batch_1_t0, batch_1_t1 = batch_1[:, 0, :], batch_1[:, 1, :] + self.assertAllClose(batch_1_t0, batch_1_t1) + + # This simulate the layer called with multiple batches in eager mode + if tf.executing_eagerly(): + out2 = rnn_layer(inputs, training=True) + else: + out2 = out + batch_2 = self.evaluate(out2) + batch_2_t0, batch_2_t1 = batch_2[:, 0, :], batch_2[:, 1, :] + self.assertAllClose(batch_2_t0, batch_2_t1) + + # Also validate that different dropout is used by between batches. + self.assertNotAllClose(batch_1_t0, batch_2_t0) + self.assertNotAllClose(batch_1_t1, batch_2_t1) + + for l in [layer_1, layer_2, layer_3]: + verify(l) + + def test_stacked_rnn_compute_output_shape(self): + cells = [keras.layers.LSTMCell(3), keras.layers.LSTMCell(6)] + embedding_dim = 4 + timesteps = 2 + layer = keras.layers.RNN( + cells, return_state=True, return_sequences=True + ) + output_shape = layer.compute_output_shape( + (None, timesteps, embedding_dim) + ) + expected_output_shape = [ + (None, timesteps, 6), + (None, 3), + (None, 3), + (None, 6), + (None, 6), + ] + self.assertEqual( + [tuple(o.as_list()) for o in output_shape], expected_output_shape + ) + + # Test reverse_state_order = True for stacked cell. 
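# --- Illustrative aside; not part of this test. A minimal sketch, assuming
# only the public tf.keras API: reverse_state_order=True makes
# StackedRNNCells report the last cell's states first, matching the
# reordered shapes expected below.
import tensorflow as tf

cells = [tf.keras.layers.LSTMCell(3), tf.keras.layers.LSTMCell(6)]
stacked = tf.keras.layers.StackedRNNCells(cells, reverse_state_order=True)
print(tf.nest.flatten(stacked.state_size))  # [6, 6, 3, 3]
# --- end aside ---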
+ stacked_cell = keras.layers.StackedRNNCells( + cells, reverse_state_order=True + ) + layer = keras.layers.RNN( + stacked_cell, return_state=True, return_sequences=True + ) + output_shape = layer.compute_output_shape( + (None, timesteps, embedding_dim) + ) + expected_output_shape = [ + (None, timesteps, 6), + (None, 6), + (None, 6), + (None, 3), + (None, 3), + ] + self.assertEqual( + [tuple(o.as_list()) for o in output_shape], expected_output_shape + ) + + def test_stacked_rnn_with_training_param(self): + # See https://github.com/tensorflow/tensorflow/issues/32586 + + class CellWrapper(keras.layers.AbstractRNNCell): + def __init__(self, cell): + super().__init__() + self.cell = cell + + @property + def state_size(self): + return self.cell.state_size + + @property + def output_size(self): + return self.cell.output_size + + def build(self, input_shape): + self.cell.build(input_shape) + self.built = True + + def get_initial_state( + self, inputs=None, batch_size=None, dtype=None + ): + return self.cell.get_initial_state( + inputs=inputs, batch_size=batch_size, dtype=dtype + ) + + def call(self, inputs, states, training=None, **kwargs): + assert training is not None + return self.cell(inputs, states=states, training=training) + + cell = keras.layers.LSTMCell(32) + cell = CellWrapper(cell) + cell = keras.layers.StackedRNNCells([cell]) + + rnn = keras.layers.RNN(cell) + inputs = np.ones((8, 4, 16), dtype=np.float32) + rnn(inputs, training=True) + + def test_stacked_rnn_with_nested_cell(self): + batch = 10 + t = 5 + i1, i2, i3 = 3, 4, 5 + o11, o12, o13 = 2, 3, 4 + o21, o22, o23 = 4, 5, 6 + + # test 1: use_tuple=False + cells = [NestedCell(o11, o12, o13), NestedCell(o21, o22, o23)] + rnn = keras.layers.RNN(cells, return_sequences=True, return_state=True) + + input_1 = keras.Input((t, i1)) + input_2 = keras.Input((t, i2, i3)) + + output1, output2, state1, state2 = rnn((input_1, input_2)) + s11, s12 = state1 + s21, s22 = state2 + + self.assertEqual(output1.shape.as_list(), [None, t, o21]) + self.assertEqual(output2.shape.as_list(), [None, t, o22, o23]) + self.assertEqual(s11.shape.as_list(), [None, o11]) + self.assertEqual(s12.shape.as_list(), [None, o12, o13]) + self.assertEqual(s21.shape.as_list(), [None, o21]) + self.assertEqual(s22.shape.as_list(), [None, o22, o23]) + + model = keras.models.Model([input_1, input_2], [output1, output2]) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))], + [np.zeros((batch, t, o21)), np.zeros((batch, t, o22, o23))], + ) + self.assertEqual( + model.output_shape, [(None, t, o21), (None, t, o22, o23)] + ) + + # test 2: use_tuple=True + cells = [ + NestedCell(o11, o12, o13, use_tuple=True), + NestedCell(o21, o22, o23), + ] + + rnn = keras.layers.RNN(cells, return_sequences=True, return_state=True) + + input_1 = keras.Input((t, i1)) + input_2 = keras.Input((t, i2, i3)) + + output1, output2, state1, state2 = rnn( + NestedInput(t1=input_1, t2=input_2) + ) + s11, s12 = state1 + s21, s22 = state2 + + self.assertEqual(output1.shape.as_list(), [None, t, o21]) + self.assertEqual(output2.shape.as_list(), [None, t, o22, o23]) + self.assertEqual(s11.shape.as_list(), [None, o11]) + self.assertEqual(s12.shape.as_list(), [None, o12, o13]) + self.assertEqual(s21.shape.as_list(), [None, o21]) + self.assertEqual(s22.shape.as_list(), [None, o22, o23]) + + model = keras.models.Model([input_1, input_2], [output1, output2]) + model.compile( + 
optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))], + [np.zeros((batch, t, o21)), np.zeros((batch, t, o22, o23))], + ) + self.assertEqual( + model.output_shape, [(None, t, o21), (None, t, o22, o23)] + ) + + def test_trackable_dependencies(self): + rnn = keras.layers.SimpleRNN + x = np.random.random((2, 2, 2)) + y = np.random.random((2, 2)) + model = keras.models.Sequential() + model.add(rnn(2)) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, batch_size=1) + + # check whether the model variables are present in the + # trackable list of objects + checkpointed_objects = { + id(o) for o in trackable_util.list_objects(model) + } + for v in model.variables: + self.assertIn(id(v), checkpointed_objects) + + def test_high_dimension_RNN(self): + # Basic test case. + unit_a = 10 + unit_b = 20 + input_a = 5 + input_b = 10 + batch = 32 + time_step = 4 + + cell = Minimal2DRNNCell(unit_a, unit_b) + x = keras.Input((None, input_a, input_b)) + layer = keras.layers.RNN(cell) + y = layer(x) + + self.assertEqual(cell.state_size.as_list(), [unit_a, unit_b]) + + if not tf.executing_eagerly(): + init_state = layer.get_initial_state(x) + self.assertEqual(len(init_state), 1) + self.assertEqual( + init_state[0].shape.as_list(), [None, unit_a, unit_b] + ) + + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, time_step, input_a, input_b)), + np.zeros((batch, unit_a, unit_b)), + ) + self.assertEqual(model.output_shape, (None, unit_a, unit_b)) + + # Test stacking. + cells = [ + Minimal2DRNNCell(unit_a, unit_b), + Minimal2DRNNCell(unit_a * 2, unit_b * 2), + Minimal2DRNNCell(unit_a * 4, unit_b * 4), + ] + layer = keras.layers.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, time_step, input_a, input_b)), + np.zeros((batch, unit_a * 4, unit_b * 4)), + ) + self.assertEqual(model.output_shape, (None, unit_a * 4, unit_b * 4)) + + def test_high_dimension_RNN_with_init_state(self): + unit_a = 10 + unit_b = 20 + input_a = 5 + input_b = 10 + batch = 32 + time_step = 4 + + # Basic test case. + cell = Minimal2DRNNCell(unit_a, unit_b) + x = keras.Input((None, input_a, input_b)) + s = keras.Input((unit_a, unit_b)) + layer = keras.layers.RNN(cell) + y = layer(x, initial_state=s) + + model = keras.models.Model([x, s], y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [ + np.zeros((batch, time_step, input_a, input_b)), + np.zeros((batch, unit_a, unit_b)), + ], + np.zeros((batch, unit_a, unit_b)), + ) + self.assertEqual(model.output_shape, (None, unit_a, unit_b)) + + # Bad init state shape. 
+ bad_shape_a = unit_a * 2 + bad_shape_b = unit_b * 2 + cell = Minimal2DRNNCell(unit_a, unit_b) + x = keras.Input((None, input_a, input_b)) + s = keras.Input((bad_shape_a, bad_shape_b)) + layer = keras.layers.RNN(cell) + with self.assertRaisesWithPredicateMatch( + ValueError, "however `cell.state_size` is" + ): + layer(x, initial_state=s) + + def test_inconsistent_output_state_size(self): + batch = 32 + time_step = 4 + state_size = 5 + input_size = 6 + cell = PlusOneRNNCell(state_size) + x = keras.Input((None, input_size)) + layer = keras.layers.RNN(cell) + y = layer(x) + + self.assertEqual(cell.state_size, state_size) + if not tf.executing_eagerly(): + init_state = layer.get_initial_state(x) + self.assertEqual(len(init_state), 1) + self.assertEqual(init_state[0].shape.as_list(), [None, state_size]) + + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, time_step, input_size)), + np.zeros((batch, input_size)), + ) + self.assertEqual(model.output_shape, (None, input_size)) + + def test_get_initial_state(self): + cell = keras.layers.SimpleRNNCell(5) + with self.assertRaisesRegex( + ValueError, "batch_size and dtype cannot be None" + ): + cell.get_initial_state(None, None, None) + + if not tf.executing_eagerly(): + inputs = keras.Input((None, 10)) + initial_state = cell.get_initial_state(inputs, None, None) + self.assertEqual(initial_state.shape.as_list(), [None, 5]) + self.assertEqual(initial_state.dtype, inputs.dtype) + + batch = tf.shape(inputs)[0] + dtype = inputs.dtype + initial_state = cell.get_initial_state(None, batch, dtype) + self.assertEqual(initial_state.shape.as_list(), [None, 5]) + self.assertEqual(initial_state.dtype, inputs.dtype) + else: + batch = 8 + inputs = np.random.random((batch, 10)) + initial_state = cell.get_initial_state(inputs, None, None) + self.assertEqual(initial_state.shape.as_list(), [8, 5]) + self.assertEqual(initial_state.dtype, inputs.dtype) + + dtype = inputs.dtype + initial_state = cell.get_initial_state(None, batch, dtype) + self.assertEqual(initial_state.shape.as_list(), [batch, 5]) + self.assertEqual(initial_state.dtype, inputs.dtype) + + @parameterized.parameters([True, False]) + def test_nested_input_output(self, stateful): + batch = 10 + t = 5 + i1, i2, i3 = 3, 4, 5 + o1, o2, o3 = 2, 3, 4 + + cell = NestedCell(o1, o2, o3) + rnn = keras.layers.RNN(cell, stateful=stateful) + + batch_size = batch if stateful else None + input_1 = keras.Input((t, i1), batch_size=batch_size) + input_2 = keras.Input((t, i2, i3), batch_size=batch_size) + + outputs = rnn((input_1, input_2)) + + self.assertEqual(len(outputs), 2) + self.assertEqual(outputs[0].shape.as_list(), [batch_size, o1]) + self.assertEqual(outputs[1].shape.as_list(), [batch_size, o2, o3]) + + model = keras.models.Model((input_1, input_2), outputs) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))], + [np.zeros((batch, o1)), np.zeros((batch, o2, o3))], + ) + self.assertEqual( + model.output_shape, [(batch_size, o1), (batch_size, o2, o3)] + ) + + cell = NestedCell(o1, o2, o3, use_tuple=True) + + rnn = keras.layers.RNN(cell, stateful=stateful) + + input_1 = keras.Input((t, i1), batch_size=batch_size) + input_2 = keras.Input((t, i2, i3), batch_size=batch_size) + + outputs = rnn(NestedInput(t1=input_1, t2=input_2)) + + 
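`NestedInput` and `NestedState`, used throughout these tests, are defined earlier in the test file (outside this hunk); they are presumably plain namedtuples, which `tf.nest` flattens structurally, so an RNN cell can consume them like any other nested input. A hypothetical equivalent definition:

    import collections

    # tf.nest.flatten(NestedInput(t1=a, t2=b)) -> [a, b]
    NestedInput = collections.namedtuple("NestedInput", ["t1", "t2"])
    NestedState = collections.namedtuple("NestedState", ["s1", "s2"])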
self.assertEqual(len(outputs), 2) + self.assertEqual(outputs[0].shape.as_list(), [batch_size, o1]) + self.assertEqual(outputs[1].shape.as_list(), [batch_size, o2, o3]) + + model = keras.models.Model([input_1, input_2], outputs) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))], + [np.zeros((batch, o1)), np.zeros((batch, o2, o3))], + ) + self.assertEqual( + model.output_shape, [(batch_size, o1), (batch_size, o2, o3)] + ) + + def test_nested_input_output_with_state(self): + batch = 10 + t = 5 + i1, i2, i3 = 3, 4, 5 + o1, o2, o3 = 2, 3, 4 + + cell = NestedCell(o1, o2, o3) + rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True) + + input_1 = keras.Input((t, i1)) + input_2 = keras.Input((t, i2, i3)) + + output1, output2, s1, s2 = rnn((input_1, input_2)) + + self.assertEqual(output1.shape.as_list(), [None, t, o1]) + self.assertEqual(output2.shape.as_list(), [None, t, o2, o3]) + self.assertEqual(s1.shape.as_list(), [None, o1]) + self.assertEqual(s2.shape.as_list(), [None, o2, o3]) + + model = keras.models.Model([input_1, input_2], [output1, output2]) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))], + [np.zeros((batch, t, o1)), np.zeros((batch, t, o2, o3))], + ) + self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)]) + + cell = NestedCell(o1, o2, o3, use_tuple=True) + + rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True) + + input_1 = keras.Input((t, i1)) + input_2 = keras.Input((t, i2, i3)) + + output1, output2, s1, s2 = rnn(NestedInput(t1=input_1, t2=input_2)) + + self.assertEqual(output1.shape.as_list(), [None, t, o1]) + self.assertEqual(output2.shape.as_list(), [None, t, o2, o3]) + self.assertEqual(s1.shape.as_list(), [None, o1]) + self.assertEqual(s2.shape.as_list(), [None, o2, o3]) + + model = keras.models.Model([input_1, input_2], [output1, output2]) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))], + [np.zeros((batch, t, o1)), np.zeros((batch, t, o2, o3))], + ) + self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)]) + + def test_nest_input_output_with_init_state(self): + batch = 10 + t = 5 + i1, i2, i3 = 3, 4, 5 + o1, o2, o3 = 2, 3, 4 + + cell = NestedCell(o1, o2, o3) + rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True) + + input_1 = keras.Input((t, i1)) + input_2 = keras.Input((t, i2, i3)) + init_s1 = keras.Input((o1,)) + init_s2 = keras.Input((o2, o3)) + + output1, output2, s1, s2 = rnn( + (input_1, input_2), initial_state=(init_s1, init_s2) + ) + + self.assertEqual(output1.shape.as_list(), [None, t, o1]) + self.assertEqual(output2.shape.as_list(), [None, t, o2, o3]) + self.assertEqual(s1.shape.as_list(), [None, o1]) + self.assertEqual(s2.shape.as_list(), [None, o2, o3]) + + model = keras.models.Model( + [input_1, input_2, init_s1, init_s2], [output1, output2] + ) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + [ + np.zeros((batch, t, i1)), + np.zeros((batch, t, i2, i3)), + np.zeros((batch, o1)), + np.zeros((batch, o2, o3)), + ], + [np.zeros((batch, t, o1)), np.zeros((batch, t, o2, o3))], + ) + 
+        self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)])
+
+        cell = NestedCell(o1, o2, o3, use_tuple=True)
+
+        rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True)
+
+        input_1 = keras.Input((t, i1))
+        input_2 = keras.Input((t, i2, i3))
+        init_s1 = keras.Input((o1,))
+        init_s2 = keras.Input((o2, o3))
+        init_state = NestedState(s1=init_s1, s2=init_s2)
+
+        output1, output2, s1, s2 = rnn(
+            NestedInput(t1=input_1, t2=input_2), initial_state=init_state
+        )
+
+        self.assertEqual(output1.shape.as_list(), [None, t, o1])
+        self.assertEqual(output2.shape.as_list(), [None, t, o2, o3])
+        self.assertEqual(s1.shape.as_list(), [None, o1])
+        self.assertEqual(s2.shape.as_list(), [None, o2, o3])
+
+        model = keras.models.Model(
+            [input_1, input_2, init_s1, init_s2], [output1, output2]
+        )
+        model.compile(
+            optimizer="rmsprop",
+            loss="mse",
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        model.train_on_batch(
+            [
+                np.zeros((batch, t, i1)),
+                np.zeros((batch, t, i2, i3)),
+                np.zeros((batch, o1)),
+                np.zeros((batch, o2, o3)),
+            ],
+            [np.zeros((batch, t, o1)), np.zeros((batch, t, o2, o3))],
+        )
+        self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)])
+
+    def test_masking_rnn_with_output_and_states(self):
+        class Cell(keras.layers.Layer):
+            def __init__(self):
+                self.state_size = None
+                self.output_size = None
+                super().__init__()
+
+            def build(self, input_shape):
+                self.state_size = input_shape[-1]
+                self.output_size = input_shape[-1]
+
+            def call(self, inputs, states):
+                return inputs, [s + 1 for s in states]
+
+        x = keras.Input((3, 1), name="x")
+        x_masked = keras.layers.Masking()(x)
+        s_0 = keras.Input((1,), name="s_0")
+        y, s = keras.layers.RNN(Cell(), return_state=True)(
+            x_masked, initial_state=s_0
+        )
+        model = keras.models.Model([x, s_0], [y, s])
+        model.compile(
+            optimizer="rmsprop",
+            loss="mse",
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+
+        # The last time step is masked.
+        x_np = np.array([[[1.0], [2.0], [0.0]]])
+        s_0_np = np.array([[10.0]])
+        y_np, s_np = model.predict([x_np, s_0_np])
+
+        # 1 is added to the initial state twice.
+        self.assertAllClose(s_np, s_0_np + 2)
+        # Expect the last output to be the same as the last output before
+        # masking.
+        self.assertAllClose(y_np, x_np[:, 1, :])
+
+    def test_zero_output_for_masking(self):
+        for unroll in [True, False]:
+            cell = keras.layers.SimpleRNNCell(5)
+            x = keras.Input((5, 5))
+            mask = keras.layers.Masking()
+            layer = keras.layers.RNN(
+                cell,
+                return_sequences=True,
+                zero_output_for_mask=True,
+                unroll=unroll,
+            )
+            masked_input = mask(x)
+            y = layer(masked_input)
+            model = keras.models.Model(x, y)
+            model.compile(
+                optimizer="rmsprop",
+                loss="mse",
+                run_eagerly=test_utils.should_run_eagerly(),
+            )
+
+            np_x = np.ones((6, 5, 5))
+            result_1 = model.predict(np_x)
+
+            # Set timesteps 4 and 5 of the last record to zero (masked).
+            np_x[5, 3:] = 0
+            result_2 = model.predict(np_x)
+
+            # Expect result_2 to have the same output, except at timesteps
+            # 4 and 5 of the last record.
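What `zero_output_for_mask=True` changes, sketched on a toy array (illustrative values, not from the test): without the flag, masked steps of a return_sequences output carry the last valid output forward; with it, masked steps are zeroed, which is exactly why result_1 is zeroed below before comparing:

    import numpy as np

    seq = np.array([[[0.1], [0.2], [0.2]]])  # default: step 3 repeats step 2
    mask = np.array([[True, True, False]])   # final timestep is masked
    zeroed = seq * mask[..., None]           # the zero_output_for_mask view
    # zeroed -> [[[0.1], [0.2], [0.0]]]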
+ result_1[5, 3:] = 0 + self.assertAllClose(result_1, result_2) + + def test_unroll_single_step(self): + """Even if the time dimension is only one, we should be able to + unroll.""" + cell = keras.layers.SimpleRNNCell(5) + x = keras.Input((1, 5)) + layer = keras.layers.RNN(cell, return_sequences=True, unroll=True) + y = layer(x) + model = keras.models.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + + np_x = np.ones((6, 1, 5)) + result = model.predict(np_x) + self.assertEqual((6, 1, 5), result.shape) + + def test_unroll_zero_step(self): + """If the time dimension is None, we should fail to unroll.""" + cell = keras.layers.SimpleRNNCell(5) + x = keras.Input((None, 5)) + layer = keras.layers.RNN(cell, return_sequences=True, unroll=True) + with self.assertRaisesRegex(ValueError, "Cannot unroll a RNN.*"): + layer(x) + + def test_full_input_spec(self): + # See https://github.com/tensorflow/tensorflow/issues/25985 + inputs = keras.layers.Input(batch_shape=(1, 1, 1)) + state_h = keras.layers.Input(batch_shape=(1, 1)) + state_c = keras.layers.Input(batch_shape=(1, 1)) + states = [state_h, state_c] + decoder_out = keras.layers.LSTM(1, stateful=True)( + inputs, initial_state=states + ) + model = keras.Model([inputs, state_h, state_c], decoder_out) + output1 = model.predict( + [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))] + ) + output2 = model.predict( + [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))] + ) + model.reset_states() + output3 = model.predict( + [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))] + ) + self.assertAllClose(output1, output3) + self.assertNotAllClose(output1, output2) + + def test_reset_states(self): + # See https://github.com/tensorflow/tensorflow/issues/25852 + with self.assertRaisesRegex( + ValueError, "it needs to know its batch size" + ): + simple_rnn = keras.layers.SimpleRNN(1, stateful=True) + simple_rnn.reset_states() + + with self.assertRaisesRegex( + ValueError, "it needs to know its batch size" + ): + cell = Minimal2DRNNCell(1, 2) + custom_rnn = keras.layers.RNN(cell, stateful=True) + custom_rnn.reset_states() + + @parameterized.parameters( + [ + keras.layers.SimpleRNNCell, + keras.layers.GRUCell, + keras.layers.LSTMCell, + ] + ) + def test_stateful_rnn_with_stacking(self, cell): + # See https://github.com/tensorflow/tensorflow/issues/28614. + batch = 12 + timesteps = 10 + input_dim = 8 + output_dim = 64 + cells = [cell(32), cell(64)] + x = keras.Input(batch_shape=(batch, None, input_dim)) + layer = keras.layers.RNN(cells, stateful=True) + y = layer(x) + + model = keras.Model(x, y) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, timesteps, input_dim)), + np.zeros((batch, output_dim)), + ) + model.predict(np.ones((batch, timesteps, input_dim))) + + model.reset_states() + model.predict(np.ones((batch, timesteps, input_dim))) + + new_states = tf.nest.map_structure( + lambda s: np.ones((batch, s)), layer.cell.state_size + ) + layer.reset_states(new_states) + model.predict(np.ones((batch, timesteps, input_dim))) + + def test_stateful_rnn_with_initial_state(self): + # See https://github.com/tensorflow/tensorflow/issues/32299. 
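The next test leans on standard stateful-RNN semantics: a stateful layer carries the final state of one predict() call into the next, until reset_states() zeroes it again. A minimal self-contained illustration (hypothetical shapes and units, not the test's):

    import numpy as np
    from tensorflow import keras

    inputs = keras.Input(batch_shape=(2, 3, 5))
    outputs = keras.layers.GRU(4, stateful=True, return_sequences=True)(inputs)
    model = keras.Model(inputs, outputs)

    x = np.ones((2, 3, 5), dtype="float32")
    a = model.predict(x)  # starts from an all-zeros state
    b = model.predict(x)  # starts from a's final state, so it differs
    model.reset_states()  # state back to zeros
    c = model.predict(x)  # matches the first call
    assert np.allclose(a, c) and not np.allclose(a, b)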
+        batch = 12
+        timesteps = 1
+        input_dim = 8
+        output_dim = 16
+
+        test_inputs = np.full((batch, timesteps, input_dim), 0.5)
+
+        def make_model(stateful=False, with_initial_state=False):
+            input_layer = keras.Input(shape=(None, input_dim), batch_size=batch)
+            if with_initial_state:
+                initial_states = keras.backend.constant(
+                    np.ones((batch, output_dim))
+                )
+            else:
+                initial_states = None
+            rnn_output = keras.layers.GRU(
+                units=output_dim, return_sequences=True, stateful=stateful
+            )(input_layer, initial_state=initial_states)
+            model = keras.Model(input_layer, rnn_output)
+            model.compile(
+                optimizer="rmsprop",
+                loss="mse",
+                run_eagerly=test_utils.should_run_eagerly(),
+            )
+            return model
+
+        # Define a model with a constant state initialization.
+        model = make_model(stateful=True, with_initial_state=True)
+        layer_weights = model.layers[1].get_weights()
+
+        model.reset_states()
+        predict_1 = model.predict(test_inputs)
+        predict_2 = model.predict(test_inputs)
+
+        model.reset_states()
+        predict_3 = model.predict(test_inputs)
+
+        # predict_1 and predict_2 should differ since batch 2 uses the
+        # state from batch 1 as its initial state.
+        self.assertNotAllClose(predict_1, predict_2)
+        self.assertAllClose(predict_1, predict_3)
+
+        # Create a new model with the same weights but without initial
+        # states. Make sure its predictions differ from those of the model
+        # with a non-zero initial state.
+        model_2 = make_model(stateful=True, with_initial_state=False)
+        model_2.layers[1].set_weights(layer_weights)
+
+        model_2.reset_states()
+        predict_4 = model_2.predict(test_inputs)
+        predict_5 = model_2.predict(test_inputs)
+        self.assertNotAllClose(predict_1, predict_4)
+        self.assertNotAllClose(predict_4, predict_5)
+
+        # Create models with stateful=False, and make sure they handle
+        # init state correctly.
+ model_3 = make_model(stateful=False, with_initial_state=True) + model_3.layers[1].set_weights(layer_weights) + + model_3.reset_states() + predict_6 = model_3.predict(test_inputs) + predict_7 = model_3.predict(test_inputs) + self.assertAllClose(predict_1, predict_6) + self.assertAllClose(predict_6, predict_7) + + def test_stateful_rnn_with_customized_get_initial_state(self): + class TestCell(keras.layers.AbstractRNNCell): + state_size = 1 + output_size = 2 + + def get_initial_state( + self, inputs=None, batch_size=None, dtype=None + ): + return np.ones((batch_size, 1), dtype=dtype) + + def call(self, inputs, states): + return inputs, states + + layer = keras.layers.RNN(TestCell(), stateful=True, return_state=True) + inputs = keras.Input(shape=(10, 2), batch_size=4) + model = keras.Model(inputs, layer(inputs)) + x = np.ones((4, 10, 2), dtype=np.float32) + output, state = model.predict(x) + self.assertAllClose(output, np.ones((4, 2))) + self.assertAllClose(state, np.ones((4, 1))) + + def test_stateful_rnn_with_customized_dtype(self): + class TestCell(keras.layers.AbstractRNNCell): + state_size = 1 + output_size = 2 + + def get_initial_state( + self, inputs=None, batch_size=None, dtype=None + ): + return np.ones((batch_size, 1), dtype=np.float16) + + def call(self, inputs, states): + return inputs, states + + layer = keras.layers.RNN(TestCell(), stateful=True, return_state=True) + inputs = keras.Input(shape=(10, 2), batch_size=4) + model = keras.Model(inputs, layer(inputs)) + x = np.ones((4, 10, 2), dtype=np.float16) + output, state = model.predict(x) + self.assertAllClose(output, np.ones((4, 2), dtype=np.float16)) + self.assertAllClose(state, np.ones((4, 1), dtype=np.float16)) + + def test_input_dim_length(self): + simple_rnn = keras.layers.SimpleRNN(5, input_length=10, input_dim=8) + self.assertEqual(simple_rnn._batch_input_shape, (None, 10, 8)) + + simple_rnn = keras.layers.SimpleRNN(5, input_dim=8) + self.assertEqual(simple_rnn._batch_input_shape, (None, None, 8)) + + simple_rnn = keras.layers.SimpleRNN(5, input_length=10) + self.assertEqual(simple_rnn._batch_input_shape, (None, 10, None)) + + @parameterized.parameters( + [ + keras.layers.SimpleRNNCell, + keras.layers.GRUCell, + keras.layers.LSTMCell, + ] + ) + def test_state_spec_with_stack_cell(self, cell): + # See https://github.com/tensorflow/tensorflow/issues/27817 for more + # detail. 
+ batch = 12 + timesteps = 10 + input_dim = 8 + output_dim = 8 + + def create_cell(): + return [cell(output_dim), cell(output_dim), cell(output_dim)] + + inputs = keras.Input((timesteps, input_dim)) + encoder_output = keras.layers.RNN(create_cell(), return_state=True)( + inputs + ) + + states = encoder_output[1:] + + decoder_output = keras.layers.RNN(create_cell())( + inputs, initial_state=states + ) + + model = keras.models.Model(inputs, decoder_output) + model.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch( + np.zeros((batch, timesteps, input_dim)), + np.zeros((batch, output_dim)), + ) + model.predict(np.ones((batch, timesteps, input_dim))) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer=[ + keras.layers.SimpleRNN, + gru_v1.GRU, + lstm_v1.LSTM, + gru.GRU, + lstm.LSTM, + ] + ) + ) + def test_rnn_with_ragged_input(self, layer): + ragged_data = tf.ragged.constant( + [ + [[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 1.0, 1.0]], + [[2.0, 4.0, 1.0, 3.0, 1.0]], + [ + [2.0, 3.0, 4.0, 1.0, 5.0], + [2.0, 3.0, 1.0, 1.0, 1.0], + [1.0, 2.0, 3.0, 4.0, 5.0], + ], + ], + ragged_rank=1, + ) + label_data = np.array([[1, 0, 1], [1, 1, 0], [0, 0, 1]]) + + # Test results in feed forward + np.random.seed(100) + rnn_layer = layer(4, activation="sigmoid") + + x_ragged = keras.Input(shape=(None, 5), ragged=True) + y_ragged = rnn_layer(x_ragged) + model = keras.models.Model(x_ragged, y_ragged) + output_ragged = model.predict(ragged_data, steps=1) + + x_dense = keras.Input(shape=(3, 5)) + masking = keras.layers.Masking()(x_dense) + y_dense = rnn_layer(masking) + model_2 = keras.models.Model(x_dense, y_dense) + dense_data = ragged_data.to_tensor() + output_dense = model_2.predict(dense_data, steps=1) + + self.assertAllClose(output_dense, output_ragged) + + # Test results with go backwards + np.random.seed(200) + back_rnn_layer = layer(8, go_backwards=True, activation="sigmoid") + + x_ragged = keras.Input(shape=(None, 5), ragged=True) + y_ragged = back_rnn_layer(x_ragged) + model = keras.models.Model(x_ragged, y_ragged) + output_ragged = model.predict(ragged_data, steps=1) + + x_dense = keras.Input(shape=(3, 5)) + masking = keras.layers.Masking()(x_dense) + y_dense = back_rnn_layer(masking) + model_2 = keras.models.Model(x_dense, y_dense) + dense_data = ragged_data.to_tensor() + output_dense = model_2.predict(dense_data, steps=1) + + self.assertAllClose(output_dense, output_ragged) + + # Test densification of the ragged input + dense_tensor, row_lengths = keras.backend.convert_inputs_if_ragged( + ragged_data + ) + self.assertAllClose(dense_data, dense_tensor) + + # Test optional params, all should work except unrolling + inputs = keras.Input(shape=(None, 5), dtype=tf.float32, ragged=True) + custom_rnn_layer = layer( + 3, zero_output_for_mask=True, dropout=0.1, use_bias=True + ) + outputs = custom_rnn_layer(inputs) + model = keras.models.Model(inputs, outputs) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch(ragged_data, label_data) + + # Test stateful and full shape specification + inputs = keras.Input( + shape=(None, 5), batch_size=3, dtype=tf.float32, ragged=True + ) + stateful_rnn_layer = layer(3, stateful=True) + outputs = stateful_rnn_layer(inputs) + model = keras.models.Model(inputs, outputs) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + 
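The ragged/dense comparisons in this test hinge on the to_tensor() / from_tensor(..., lengths=...) round trip: densification zero-pads each row to the longest sequence, and supplying the original row lengths recovers the ragged layout. A small standalone check (toy values, not the test's data):

    import tensorflow as tf

    rt = tf.ragged.constant([[1.0, 2.0], [3.0]])
    dense = rt.to_tensor()  # [[1., 2.], [3., 0.]] (zero-padded)
    back = tf.RaggedTensor.from_tensor(dense, lengths=rt.row_lengths())
    assert back.to_list() == rt.to_list()  # round-trips exactly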
+        model.train_on_batch(ragged_data, label_data)
+
+        # Must raise an error when unroll is set to True.
+        unroll_rnn_layer = layer(3, unroll=True)
+        with self.assertRaisesRegex(
+            ValueError, "The input received contains RaggedTensors *"
+        ):
+            unroll_rnn_layer(inputs)
+
+        # Check that return_sequences outputs are correct.
+        np.random.seed(100)
+        returning_rnn_layer = layer(4, return_sequences=True)
+
+        x_ragged = keras.Input(shape=(None, 5), ragged=True)
+        y_ragged = returning_rnn_layer(x_ragged)
+        model = keras.models.Model(x_ragged, y_ragged)
+        output_ragged = model.predict(ragged_data, steps=1)
+        self.assertAllClose(output_ragged.ragged_rank, ragged_data.ragged_rank)
+        self.assertAllClose(output_ragged.row_splits, ragged_data.row_splits)
+
+        x_dense = keras.Input(shape=(3, 5))
+        masking = keras.layers.Masking()(x_dense)
+        y_dense = returning_rnn_layer(masking)
+        model_2 = keras.models.Model(x_dense, y_dense)
+        dense_data = ragged_data.to_tensor()
+        output_dense = model_2.predict(dense_data, steps=1)
+        # Convert the output here to ragged for value comparison.
+        output_dense = tf.RaggedTensor.from_tensor(
+            output_dense, lengths=row_lengths
+        )
+        self.assertAllClose(output_ragged, output_dense)
+
+        # Check that return_sequences and go_backwards outputs are correct.
+        np.random.seed(100)
+        returning_rnn_layer = layer(4, go_backwards=True, return_sequences=True)
+
+        x_ragged = keras.Input(shape=(None, 5), ragged=True)
+        y_ragged = returning_rnn_layer(x_ragged)
+        model = keras.models.Model(x_ragged, y_ragged)
+        output_ragged = model.predict(ragged_data, steps=1)
+        self.assertAllClose(output_ragged.ragged_rank, ragged_data.ragged_rank)
+        self.assertAllClose(output_ragged.row_splits, ragged_data.row_splits)
+
+        x_dense = keras.Input(shape=(3, 5))
+        masking = keras.layers.Masking()(x_dense)
+        y_dense = returning_rnn_layer(masking)
+        model_2 = keras.models.Model(x_dense, y_dense)
+        dense_data = ragged_data.to_tensor()
+        output_dense = model_2.predict(dense_data, steps=1)
+
+        # Note that the raw outputs for dense and ragged input will differ
+        # when go_backwards=True. Consider the following input
+        # [[a, b, 0], [c, 0, 0], [d, e, f]] where the 0s are masked values.
+        # The dense output will be [[0, b, a], [0, 0, c], [f, e, d]] since
+        # it processes the whole sequence from the end.
+        # The ragged output will be [[b, a], [c], [f, e, d]] since it
+        # simply ignores the 0s. And if we densify the ragged output, 0s
+        # are by default appended at the end (rather than at the
+        # beginning), which makes the output [[b, a, 0], [c, 0, 0], [f, e, d]].
+        # With this, we need to verify that
+        # reverse(ragged_output.to_tensor()) == reverse(dense_output).
+        output_dense = keras.backend.reverse(output_dense, [1])
+        output_dense = tf.RaggedTensor.from_tensor(
+            output_dense, lengths=row_lengths
+        )
+
+        self.assertAllClose(
+            keras.backend.reverse(output_ragged, [1]), output_dense
+        )
+
+    def test_stateless_rnn_cell(self):
+        class StatelessCell(keras.layers.Layer):
+            def __init__(self):
+                self.state_size = ((), [], ())
+                self.output_size = None
+                super().__init__()
+
+            def build(self, input_shape):
+                self.output_size = input_shape[-1]
+
+            def call(self, inputs, states):
+                return inputs, states
+
+        x = keras.Input((None, 5))
+        cell = StatelessCell()
+        initial_state = tf.nest.map_structure(lambda t: None, cell.state_size)
+        layer = keras.layers.RNN(cell)
+        y = layer(x, initial_state=initial_state)
+        model = keras.models.Model(x, y)
+        model.compile(
+            optimizer="rmsprop",
+            loss="mse",
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 5)))
+
+    @parameterized.parameters(
+        [keras.layers.SimpleRNN, gru_v1.GRU, lstm_v1.LSTM, gru.GRU, lstm.LSTM]
+    )
+    def test_for_enable_caching_device_for_layer(self, layer_cls):
+        expected_caching_device = (
+            tf.compat.v1.executing_eagerly_outside_functions()
+        )
+        layer = layer_cls(1)
+        self.assertEqual(
+            layer.cell._enable_caching_device, expected_caching_device
+        )
+
+        # Make sure the config only appears when a non-default value is used.
+        config = layer.get_config()
+        self.assertNotIn("enable_caching_device", config)
+
+        non_default_value = not expected_caching_device
+        layer = layer_cls(1, enable_caching_device=non_default_value)
+        self.assertEqual(layer.cell._enable_caching_device, non_default_value)
+        config = layer.get_config()
+        self.assertEqual(config["enable_caching_device"], non_default_value)
+
+    @parameterized.parameters(
+        [
+            keras.layers.SimpleRNNCell,
+            gru_v1.GRUCell,
+            lstm_v1.LSTMCell,
+            gru.GRUCell,
+            lstm.LSTMCell,
+        ]
+    )
+    def test_for_enable_caching_device_for_cell(self, cell_cls):
+        expected_caching_device = (
+            tf.compat.v1.executing_eagerly_outside_functions()
+        )
+        cell = cell_cls(1)
+        self.assertEqual(cell._enable_caching_device, expected_caching_device)
-  def test_minimal_rnn_cell_non_layer(self):
-
-    class MinimalRNNCell:
+        # Make sure the config only appears when a non-default value is used.
+        config = cell.get_config()
+        self.assertNotIn("enable_caching_device", config)
-      def __init__(self, units, input_dim):
-        self.units = units
-        self.state_size = units
-        self.kernel = keras.backend.variable(
-            np.random.random((input_dim, units)))
+        non_default_value = not expected_caching_device
+        cell = cell_cls(1, enable_caching_device=non_default_value)
+        self.assertEqual(cell._enable_caching_device, non_default_value)
+        config = cell.get_config()
+        self.assertEqual(config["enable_caching_device"], non_default_value)
-      def call(self, inputs, states):
-        prev_output = states[0]
-        output = keras.backend.dot(inputs, self.kernel) + prev_output
-        return output, [output]
-    # Basic test case.
-    cell = MinimalRNNCell(32, 5)
-    x = keras.Input((None, 5))
-    layer = keras.layers.RNN(cell)
-    y = layer(x)
-    model = keras.models.Model(x, y)
-    model.compile(
-        optimizer='rmsprop',
-        loss='mse',
-        run_eagerly=test_utils.should_run_eagerly())
-    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
-    # Test stacking.
- cells = [MinimalRNNCell(8, 5), - MinimalRNNCell(32, 8), - MinimalRNNCell(32, 32)] - layer = keras.layers.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - def test_minimal_rnn_cell_non_layer_multiple_states(self): - - class MinimalRNNCell: - - def __init__(self, units, input_dim): - self.units = units - self.state_size = (units, units) - self.kernel = keras.backend.variable( - np.random.random((input_dim, units))) - - def call(self, inputs, states): - prev_output_1 = states[0] - prev_output_2 = states[1] - output = keras.backend.dot(inputs, self.kernel) - output += prev_output_1 - output -= prev_output_2 - return output, [output * 2, output * 3] - - # Basic test case. - cell = MinimalRNNCell(32, 5) - x = keras.Input((None, 5)) - layer = keras.layers.RNN(cell) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacking. - cells = [MinimalRNNCell(8, 5), - MinimalRNNCell(16, 8), - MinimalRNNCell(32, 16)] - layer = keras.layers.RNN(cells) - self.assertEqual(layer.cell.state_size, ((8, 8), (16, 16), (32, 32))) - self.assertEqual(layer.cell.output_size, 32) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - def test_minimal_rnn_cell_layer(self): - - class MinimalRNNCell(keras.layers.Layer): - - def __init__(self, units, **kwargs): +class RNNCellWithConstants(keras.layers.Layer): + def __init__(self, units, constant_size, **kwargs): self.units = units self.state_size = units + self.constant_size = constant_size super().__init__(**kwargs) - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') + def build(self, input_shape): + self.input_kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer="uniform", + name="kernel", + ) self.recurrent_kernel = self.add_weight( shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') + initializer="uniform", + name="recurrent_kernel", + ) + self.constant_kernel = self.add_weight( + shape=(self.constant_size, self.units), + initializer="uniform", + name="constant_kernel", + ) self.built = True - def call(self, inputs, states): - prev_output = states[0] - h = keras.backend.dot(inputs, self.kernel) - output = h + keras.backend.dot(prev_output, self.recurrent_kernel) + def call(self, inputs, states, constants): + [prev_output] = states + [constant] = constants + h_input = keras.backend.dot(inputs, self.input_kernel) + h_state = keras.backend.dot(prev_output, self.recurrent_kernel) + h_const = keras.backend.dot(constant, self.constant_kernel) + output = h_input + h_state + h_const return output, [output] - def get_config(self): - config = {'units': self.units} + def get_config(self): + config = {"units": self.units, "constant_size": self.constant_size} base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - # Test basic case. 
- x = keras.Input((None, 5)) - cell = MinimalRNNCell(32) - layer = keras.layers.RNN(cell) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test basic case serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - with generic_utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): - layer = keras.layers.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - # Test stacking. - cells = [MinimalRNNCell(8), - MinimalRNNCell(12), - MinimalRNNCell(32)] - layer = keras.layers.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacked RNN serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - with generic_utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): - layer = keras.layers.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - def test_minimal_rnn_cell_abstract_rnn_cell(self): - - class MinimalRNNCell(keras.layers.AbstractRNNCell): - - def __init__(self, units, **kwargs): - self.units = units - super().__init__(**kwargs) - - @property - def state_size(self): - return self.units - - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.built = True - - def call(self, inputs, states): - prev_output = states[0] - h = keras.backend.dot(inputs, self.kernel) - output = h + keras.backend.dot(prev_output, self.recurrent_kernel) - return output, output - - @property - def output_size(self): - return self.units - - cell = MinimalRNNCell(32) - x = keras.Input((None, 5)) - layer = keras.layers.RNN(cell) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacking. - cells = [MinimalRNNCell(8), - MinimalRNNCell(16), - MinimalRNNCell(32)] - layer = keras.layers.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - def test_rnn_with_time_major(self): - batch = 10 - time_step = 5 - embedding_dim = 4 - units = 3 - - # Test basic case. 
- x = keras.Input((time_step, embedding_dim)) - time_major_x = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(x) - layer = keras.layers.SimpleRNN( - units, time_major=True, return_sequences=True) - self.assertEqual( - layer.compute_output_shape((time_step, None, - embedding_dim)).as_list(), - [time_step, None, units]) - y = layer(time_major_x) - self.assertEqual(layer.output_shape, (time_step, None, units)) - - y = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))(y) - - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, time_step, embedding_dim)), - np.zeros((batch, time_step, units))) - - # Test stacking. - x = keras.Input((time_step, embedding_dim)) - time_major_x = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(x) - cell_units = [10, 8, 6] - cells = [keras.layers.SimpleRNNCell(cell_units[i]) for i in range(3)] - layer = keras.layers.RNN(cells, time_major=True, return_sequences=True) - y = layer(time_major_x) - self.assertEqual(layer.output_shape, (time_step, None, cell_units[-1])) - - y = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))(y) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, time_step, embedding_dim)), - np.zeros((batch, time_step, cell_units[-1]))) - - # Test masking. - x = keras.Input((time_step, embedding_dim)) - time_major = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(x) - mask = keras.layers.Masking()(time_major) - rnn = keras.layers.SimpleRNN( - units, time_major=True, return_sequences=True)(mask) - y = keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))(rnn) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, time_step, embedding_dim)), - np.zeros((batch, time_step, units))) - - # Test layer output - x = keras.Input((time_step, embedding_dim)) - rnn_1 = keras.layers.SimpleRNN(units, return_sequences=True) - y = rnn_1(x) - - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, time_step, embedding_dim)), - np.zeros((batch, time_step, units))) - - x_np = np.random.random((batch, time_step, embedding_dim)) - y_np_1 = model.predict(x_np) - - time_major = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(x) - rnn_2 = keras.layers.SimpleRNN( - units, time_major=True, return_sequences=True) - y_2 = rnn_2(time_major) - y_2 = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(y_2) - - model_2 = keras.models.Model(x, y_2) - rnn_2.set_weights(rnn_1.get_weights()) - - y_np_2 = model_2.predict(x_np) - self.assertAllClose(y_np_1, y_np_2, atol=1e-4) - - def test_rnn_cell_with_constants_layer(self): - # Test basic case. - x = keras.Input((None, 5)) - c = keras.Input((3,)) - cell = RNNCellWithConstants(32, constant_size=3) - layer = keras.layers.RNN(cell) - y = layer(x, constants=c) - - model = keras.models.Model([x, c], y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((6, 5, 5)), np.zeros((6, 3))], - np.zeros((6, 32)) - ) - # Test basic case serialization. 
- x_np = np.random.random((6, 5, 5)) - c_np = np.random.random((6, 3)) - y_np = model.predict([x_np, c_np]) - weights = model.get_weights() - config = layer.get_config() - custom_objects = {'RNNCellWithConstants': RNNCellWithConstants} - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.RNN.from_config(config.copy()) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) - model.set_weights(weights) - y_np_2 = model.predict([x_np, c_np]) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - # test flat list inputs. - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.RNN.from_config(config.copy()) - y = layer([x, c]) - model = keras.models.Model([x, c], y) - model.set_weights(weights) - y_np_3 = model.predict([x_np, c_np]) - self.assertAllClose(y_np, y_np_3, atol=1e-4) - - # Test stacking. - cells = [gru.GRUCell(8), - RNNCellWithConstants(12, constant_size=3), - RNNCellWithConstants(32, constant_size=3)] - layer = keras.layers.RNN(cells) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((6, 5, 5)), np.zeros((6, 3))], - np.zeros((6, 32)) - ) - - # Test GRUCell reset_after property. - x = keras.Input((None, 5)) - c = keras.Input((3,)) - cells = [gru.GRUCell(32, reset_after=True)] - layer = keras.layers.RNN(cells) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((6, 5, 5)), np.zeros((6, 3))], - np.zeros((6, 32)) - ) +class Minimal2DRNNCell(keras.layers.Layer): + """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell. - # Test stacked RNN serialization - x_np = np.random.random((6, 5, 5)) - c_np = np.random.random((6, 3)) - y_np = model.predict([x_np, c_np]) - weights = model.get_weights() - config = layer.get_config() - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.RNN.from_config(config.copy()) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) - model.set_weights(weights) - y_np_2 = model.predict([x_np, c_np]) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - def test_rnn_cell_with_non_keras_constants(self): - # Test basic case. - x = keras.Input((None, 5)) - c = tf.zeros([6, 3], dtype=tf.float32) - cell = RNNCellWithConstants(32, constant_size=3) - layer = keras.layers.RNN(cell) - y = layer(x, constants=c) - - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacking. - cells = [gru.GRUCell(8), - RNNCellWithConstants(12, constant_size=3), - RNNCellWithConstants(32, constant_size=3)] - layer = keras.layers.RNN(cells) - y = layer(x, constants=c) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - def test_rnn_cell_with_constants_layer_passing_initial_state(self): - # Test basic case. 
- x = keras.Input((None, 5)) - c = keras.Input((3,)) - s = keras.Input((32,)) - cell = RNNCellWithConstants(32, constant_size=3) - layer = keras.layers.RNN(cell) - y = layer(x, initial_state=s, constants=c) - model = keras.models.Model([x, s, c], y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))], - np.zeros((6, 32)) - ) + Both internal state and output have 2 dimensions and are orthogonal + between each other. + """ - # Test basic case serialization. - x_np = np.random.random((6, 5, 5)) - s_np = np.random.random((6, 32)) - c_np = np.random.random((6, 3)) - y_np = model.predict([x_np, s_np, c_np]) - weights = model.get_weights() - config = layer.get_config() - custom_objects = {'RNNCellWithConstants': RNNCellWithConstants} - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.RNN.from_config(config.copy()) - y = layer(x, initial_state=s, constants=c) - model = keras.models.Model([x, s, c], y) - model.set_weights(weights) - y_np_2 = model.predict([x_np, s_np, c_np]) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - # verify that state is used - y_np_2_different_s = model.predict([x_np, s_np + 10., c_np]) - with self.assertRaises(AssertionError): - self.assertAllClose(y_np, y_np_2_different_s, atol=1e-4) - - # test flat list inputs - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.RNN.from_config(config.copy()) - y = layer([x, s, c]) - model = keras.models.Model([x, s, c], y) - model.set_weights(weights) - y_np_3 = model.predict([x_np, s_np, c_np]) - self.assertAllClose(y_np, y_np_3, atol=1e-4) - - def test_rnn_cell_with_non_keras_constants_and_initial_state(self): - # Test basic case. - x = keras.Input((None, 5)) - c = tf.zeros([6, 3], dtype=tf.float32) - s = tf.zeros([6, 32], dtype=tf.float32) - cell = RNNCellWithConstants(32, constant_size=3) - layer = keras.layers.RNN(cell) - y = layer(x, initial_state=s, constants=c) - - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacking. 
- cells = [gru.GRUCell(8), - RNNCellWithConstants(12, constant_size=3), - RNNCellWithConstants(32, constant_size=3)] - layer = keras.layers.RNN(cells) - s = [tf.zeros([6, 8], dtype=tf.float32), - tf.zeros([6, 12], dtype=tf.float32), - tf.zeros([6, 32], dtype=tf.float32)] - y = layer(x, initial_state=s, constants=c) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - def test_stacked_rnn_attributes(self): - if tf.executing_eagerly(): - self.skipTest('reduce_sum is not available in eager mode.') - - cells = [keras.layers.LSTMCell(1), - keras.layers.LSTMCell(1)] - layer = keras.layers.RNN(cells) - layer.build((None, None, 1)) - - # Test weights - self.assertEqual(len(layer.trainable_weights), 6) - cells[0].trainable = False - self.assertEqual(len(layer.trainable_weights), 3) - self.assertEqual(len(layer.non_trainable_weights), 3) - - # Test `get_losses_for` and `losses` - x = keras.Input((None, 1)) - loss_1 = tf.reduce_sum(x) - loss_2 = tf.reduce_sum(cells[0].kernel) - cells[0].add_loss(loss_1, inputs=x) - cells[0].add_loss(loss_2) - self.assertEqual(len(layer.losses), 2) - self.assertEqual(layer.get_losses_for(None), [loss_2]) - self.assertEqual(layer.get_losses_for(x), [loss_1]) - - # Test `updates` - cells = [keras.layers.LSTMCell(1), - keras.layers.LSTMCell(1)] - layer = keras.layers.RNN(cells) - x = keras.Input((None, 1)) - _ = layer(x) - - update_1 = tf.compat.v1.assign_add(cells[0].kernel, - x[0, 0, 0] * cells[0].kernel) - update_2 = tf.compat.v1.assign_add(cells[0].kernel, - tf.ones_like(cells[0].kernel)) - # TODO(b/128682878): Remove when RNNCells are __call__'d. - with base_layer_utils.call_context().enter(layer, x, True, None): - cells[0].add_update(update_1) - cells[0].add_update(update_2) - self.assertEqual(len(layer.updates), 2) - - def test_rnn_dynamic_trainability(self): - layer_class = keras.layers.SimpleRNN - embedding_dim = 4 - units = 3 - - layer = layer_class(units) - layer.build((None, None, embedding_dim)) - self.assertEqual(len(layer.weights), 3) - self.assertEqual(len(layer.trainable_weights), 3) - self.assertEqual(len(layer.non_trainable_weights), 0) - layer.trainable = False - self.assertEqual(len(layer.weights), 3) - self.assertEqual(len(layer.trainable_weights), 0) - self.assertEqual(len(layer.non_trainable_weights), 3) - layer.trainable = True - self.assertEqual(len(layer.weights), 3) - self.assertEqual(len(layer.trainable_weights), 3) - self.assertEqual(len(layer.non_trainable_weights), 0) - - @parameterized.parameters( - [keras.layers.SimpleRNN, keras.layers.GRU, keras.layers.LSTM]) - def test_rnn_cell_trainability(self, layer_cls): - # https://github.com/tensorflow/tensorflow/issues/32369. 
- layer = layer_cls(3, trainable=False) - self.assertFalse(layer.cell.trainable) - - layer.trainable = True - self.assertTrue(layer.cell.trainable) - - def test_state_reuse_with_dropout(self): - layer_class = keras.layers.SimpleRNN - embedding_dim = 4 - units = 3 - timesteps = 2 - num_samples = 2 - - input1 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim)) - layer = layer_class(units, - return_state=True, - return_sequences=True, - dropout=0.2) - state = layer(input1)[1:] - - input2 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim)) - output = layer_class(units)(input2, initial_state=state) - model = keras.Model([input1, input2], output) - - inputs = [np.random.random((num_samples, timesteps, embedding_dim)), - np.random.random((num_samples, timesteps, embedding_dim))] - model.predict(inputs) - - def test_builtin_and_custom_rnn_cell_serialization(self): - - @keras.utils.generic_utils.register_keras_serializable(package='TestOnly') - class CustomRNNCell(keras.layers.Layer): - - def __init__(self, units, **kwargs): - self.units = units - self.state_size = units + def __init__(self, unit_a, unit_b, **kwargs): + self.unit_a = unit_a + self.unit_b = unit_b + self.state_size = tf.TensorShape([unit_a, unit_b]) + self.output_size = tf.TensorShape([unit_a, unit_b]) super().__init__(**kwargs) - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') + def build(self, input_shape): + input_a = input_shape[-2] + input_b = input_shape[-1] + self.kernel = self.add_weight( + shape=(input_a, input_b, self.unit_a, self.unit_b), + initializer="uniform", + name="kernel", + ) + self.recurring_kernel = self.add_weight( + shape=(self.unit_a, self.unit_b, self.unit_a, self.unit_b), + initializer="uniform", + name="recurring_kernel", + ) + self.bias = self.add_weight( + shape=(self.unit_a, self.unit_b), initializer="uniform", name="bias" + ) self.built = True - def call(self, inputs, states): + def call(self, inputs, states): prev_output = states[0] - h = keras.backend.dot(inputs, self.kernel) - output = h + keras.backend.dot(prev_output, self.recurrent_kernel) + h = tf.einsum("bij,ijkl->bkl", inputs, self.kernel) + h += tf.expand_dims(self.bias, axis=0) + output = h + tf.einsum( + "bij,ijkl->bkl", prev_output, self.recurring_kernel + ) return output, [output] - def get_config(self): - config = {'units': self.units} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - for cell_class in [keras.layers.SimpleRNNCell, - keras.layers.GRUCell, - keras.layers.LSTMCell, - CustomRNNCell]: - # Test basic case. - x = keras.Input((None, 5)) - cell = cell_class(32) - layer = keras.layers.RNN(cell) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - # Test basic case serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - layer = keras.layers.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - # Test stacking. 
- cells = [cell_class(8), - cell_class(12), - cell_class(32)] - layer = keras.layers.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - # Test stacked RNN serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - layer = keras.layers.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - layer=[ - keras.layers.SimpleRNN, gru_v1.GRU, lstm_v1.LSTM, gru.GRU, - lstm.LSTM - ], - unroll=[True, False])) - def test_rnn_dropout(self, layer, unroll): - rnn_layer = layer(3, dropout=0.1, recurrent_dropout=0.1, unroll=unroll) - if not unroll: - x = keras.Input((None, 5)) - else: - x = keras.Input((5, 5)) - y = rnn_layer(x) - model = keras.models.Model(x, y) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x_np = np.random.random((6, 5, 5)) - y_np = np.random.random((6, 3)) - model.train_on_batch(x_np, y_np) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - cell=[keras.layers.SimpleRNNCell, keras.layers.GRUCell, - keras.layers.LSTMCell], - unroll=[True, False])) - def test_stacked_rnn_dropout(self, cell, unroll): - cells = [cell(3, dropout=0.1, recurrent_dropout=0.1), - cell(3, dropout=0.1, recurrent_dropout=0.1)] - layer = keras.layers.RNN(cells, unroll=unroll) - - if not unroll: - x = keras.Input((None, 5)) - else: - x = keras.Input((5, 5)) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x_np = np.random.random((6, 5, 5)) - y_np = np.random.random((6, 3)) - model.train_on_batch(x_np, y_np) - - def test_dropout_mask_reuse(self): - # The layer is created with recurrent_initializer = zero, so that the - # the recurrent state won't affect the output. By doing this, we can verify - # the output and see if the same mask is applied to for each timestep. 
- layer_1 = keras.layers.SimpleRNN(3, - dropout=0.5, - kernel_initializer='ones', - recurrent_initializer='zeros', - return_sequences=True, - unroll=True) - layer_2 = keras.layers.RNN( - keras.layers.SimpleRNNCell(3, - dropout=0.5, - kernel_initializer='ones', - recurrent_initializer='zeros'), - return_sequences=True, - unroll=True) - layer_3 = keras.layers.RNN( - [keras.layers.SimpleRNNCell(3, - dropout=0.5, - kernel_initializer='ones', - recurrent_initializer='zeros'), - keras.layers.SimpleRNNCell(3, - dropout=0.5, - kernel_initializer='ones', - recurrent_initializer='zeros') - ], - return_sequences=True, - unroll=True) - - def verify(rnn_layer): - inputs = tf.constant(1.0, shape=(6, 2, 5)) - out = rnn_layer(inputs, training=True) - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - batch_1 = self.evaluate(out) - batch_1_t0, batch_1_t1 = batch_1[:, 0, :], batch_1[:, 1, :] - self.assertAllClose(batch_1_t0, batch_1_t1) - - # This simulates the layer being called with multiple batches in eager mode - if tf.executing_eagerly(): - out2 = rnn_layer(inputs, training=True) - else: - out2 = out - batch_2 = self.evaluate(out2) - batch_2_t0, batch_2_t1 = batch_2[:, 0, :], batch_2[:, 1, :] - self.assertAllClose(batch_2_t0, batch_2_t1) - - # Also validate that a different dropout mask is used between batches. - self.assertNotAllClose(batch_1_t0, batch_2_t0) - self.assertNotAllClose(batch_1_t1, batch_2_t1) - - for l in [layer_1, layer_2, layer_3]: - verify(l) - - def test_stacked_rnn_compute_output_shape(self): - cells = [keras.layers.LSTMCell(3), - keras.layers.LSTMCell(6)] - embedding_dim = 4 - timesteps = 2 - layer = keras.layers.RNN(cells, return_state=True, return_sequences=True) - output_shape = layer.compute_output_shape((None, timesteps, embedding_dim)) - expected_output_shape = [(None, timesteps, 6), - (None, 3), - (None, 3), - (None, 6), - (None, 6)] - self.assertEqual( - [tuple(o.as_list()) for o in output_shape], - expected_output_shape) - - # Test reverse_state_order = True for stacked cell.
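For context on the assertions that follow: `reverse_state_order=True` only changes the order in which `StackedRNNCells` reports its state tensors, not which cell produces the sequence output. A small sketch of the flipped state shapes, assuming eager TF 2.x:

```python
import tensorflow as tf

# With reverse_state_order=True, StackedRNNCells exposes the state of
# the last cell first, flipping the order of the returned state
# tensors (the test below asserts the corresponding shapes).
cells = [tf.keras.layers.LSTMCell(3), tf.keras.layers.LSTMCell(6)]
stacked = tf.keras.layers.StackedRNNCells(cells, reverse_state_order=True)
rnn = tf.keras.layers.RNN(stacked, return_state=True, return_sequences=True)
outputs = rnn(tf.zeros((2, 4, 5)))
# outputs[0] is the sequence output from the last cell; the remaining
# entries are states, ordered last-cell-first due to reverse_state_order.
print([o.shape for o in outputs])
```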
- stacked_cell = keras.layers.StackedRNNCells( - cells, reverse_state_order=True) - layer = keras.layers.RNN( - stacked_cell, return_state=True, return_sequences=True) - output_shape = layer.compute_output_shape((None, timesteps, embedding_dim)) - expected_output_shape = [(None, timesteps, 6), - (None, 6), - (None, 6), - (None, 3), - (None, 3)] - self.assertEqual( - [tuple(o.as_list()) for o in output_shape], - expected_output_shape) - - def test_stacked_rnn_with_training_param(self): - # See https://github.com/tensorflow/tensorflow/issues/32586 - - class CellWrapper(keras.layers.AbstractRNNCell): - - def __init__(self, cell): - super().__init__() - self.cell = cell - - @property - def state_size(self): - return self.cell.state_size - - @property - def output_size(self): - return self.cell.output_size - - def build(self, input_shape): - self.cell.build(input_shape) - self.built = True - - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - return self.cell.get_initial_state( - inputs=inputs, batch_size=batch_size, dtype=dtype) - - def call(self, inputs, states, training=None, **kwargs): - assert training is not None - return self.cell(inputs, states=states, training=training) - - cell = keras.layers.LSTMCell(32) - cell = CellWrapper(cell) - cell = keras.layers.StackedRNNCells([cell]) - - rnn = keras.layers.RNN(cell) - inputs = np.ones((8, 4, 16), dtype=np.float32) - rnn(inputs, training=True) - - def test_stacked_rnn_with_nested_cell(self): - batch = 10 - t = 5 - i1, i2, i3 = 3, 4, 5 - o11, o12, o13 = 2, 3, 4 - o21, o22, o23 = 4, 5, 6 - - # test 1: use_tuple=False - cells = [NestedCell(o11, o12, o13), NestedCell(o21, o22, o23)] - rnn = keras.layers.RNN(cells, return_sequences=True, return_state=True) - - input_1 = keras.Input((t, i1)) - input_2 = keras.Input((t, i2, i3)) - - output1, output2, state1, state2 = rnn((input_1, input_2)) - s11, s12 = state1 - s21, s22 = state2 - - self.assertEqual(output1.shape.as_list(), [None, t, o21]) - self.assertEqual(output2.shape.as_list(), [None, t, o22, o23]) - self.assertEqual(s11.shape.as_list(), [None, o11]) - self.assertEqual(s12.shape.as_list(), [None, o12, o13]) - self.assertEqual(s21.shape.as_list(), [None, o21]) - self.assertEqual(s22.shape.as_list(), [None, o22, o23]) - - model = keras.models.Model([input_1, input_2], [output1, output2]) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), - np.zeros((batch, t, i2, i3))], - [np.zeros((batch, t, o21)), - np.zeros((batch, t, o22, o23))]) - self.assertEqual(model.output_shape, [(None, t, o21), (None, t, o22, o23)]) - - # test 2: use_tuple=True - cells = [ - NestedCell(o11, o12, o13, use_tuple=True), - NestedCell(o21, o22, o23) - ] - - rnn = keras.layers.RNN(cells, return_sequences=True, return_state=True) - - input_1 = keras.Input((t, i1)) - input_2 = keras.Input((t, i2, i3)) - - output1, output2, state1, state2 = rnn(NestedInput(t1=input_1, t2=input_2)) - s11, s12 = state1 - s21, s22 = state2 - - self.assertEqual(output1.shape.as_list(), [None, t, o21]) - self.assertEqual(output2.shape.as_list(), [None, t, o22, o23]) - self.assertEqual(s11.shape.as_list(), [None, o11]) - self.assertEqual(s12.shape.as_list(), [None, o12, o13]) - self.assertEqual(s21.shape.as_list(), [None, o21]) - self.assertEqual(s22.shape.as_list(), [None, o22, o23]) - - model = keras.models.Model([input_1, input_2], [output1, output2]) - model.compile( - optimizer='rmsprop', - loss='mse', - 
run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), - np.zeros((batch, t, i2, i3))], - [np.zeros((batch, t, o21)), - np.zeros((batch, t, o22, o23))]) - self.assertEqual(model.output_shape, [(None, t, o21), (None, t, o22, o23)]) - - def test_trackable_dependencies(self): - rnn = keras.layers.SimpleRNN - x = np.random.random((2, 2, 2)) - y = np.random.random((2, 2)) - model = keras.models.Sequential() - model.add(rnn(2)) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, batch_size=1) - - # check whether the model variables are present in the - # trackable list of objects - checkpointed_objects = {id(o) for o in trackable_util.list_objects(model)} - for v in model.variables: - self.assertIn(id(v), checkpointed_objects) - - def test_high_dimension_RNN(self): - # Basic test case. - unit_a = 10 - unit_b = 20 - input_a = 5 - input_b = 10 - batch = 32 - time_step = 4 - - cell = Minimal2DRNNCell(unit_a, unit_b) - x = keras.Input((None, input_a, input_b)) - layer = keras.layers.RNN(cell) - y = layer(x) - - self.assertEqual(cell.state_size.as_list(), [unit_a, unit_b]) - - if not tf.executing_eagerly(): - init_state = layer.get_initial_state(x) - self.assertEqual(len(init_state), 1) - self.assertEqual(init_state[0].shape.as_list(), [None, unit_a, unit_b]) - - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, time_step, input_a, input_b)), - np.zeros((batch, unit_a, unit_b))) - self.assertEqual(model.output_shape, (None, unit_a, unit_b)) - - # Test stacking. - cells = [ - Minimal2DRNNCell(unit_a, unit_b), - Minimal2DRNNCell(unit_a * 2, unit_b * 2), - Minimal2DRNNCell(unit_a * 4, unit_b * 4) - ] - layer = keras.layers.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, time_step, input_a, input_b)), - np.zeros((batch, unit_a * 4, unit_b * 4))) - self.assertEqual(model.output_shape, (None, unit_a * 4, unit_b * 4)) - - def test_high_dimension_RNN_with_init_state(self): - unit_a = 10 - unit_b = 20 - input_a = 5 - input_b = 10 - batch = 32 - time_step = 4 - - # Basic test case. - cell = Minimal2DRNNCell(unit_a, unit_b) - x = keras.Input((None, input_a, input_b)) - s = keras.Input((unit_a, unit_b)) - layer = keras.layers.RNN(cell) - y = layer(x, initial_state=s) - - model = keras.models.Model([x, s], y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch([ - np.zeros((batch, time_step, input_a, input_b)), - np.zeros((batch, unit_a, unit_b)) - ], np.zeros((batch, unit_a, unit_b))) - self.assertEqual(model.output_shape, (None, unit_a, unit_b)) - - # Bad init state shape. 
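The "bad init state shape" case below leans on `RNN` validating `initial_state` against `cell.state_size` at call time. A sketch of the same failure with a built-in cell; the exact error wording is an assumption, as the test only matches a fragment of it:

```python
from tensorflow import keras

# An `initial_state` whose shape disagrees with `cell.state_size`
# should be rejected when the layer is called.
cell = keras.layers.SimpleRNNCell(5)  # state_size == 5
x = keras.Input((None, 8))
bad_state = keras.Input((7,))         # deliberately the wrong width
layer = keras.layers.RNN(cell)
try:
    layer(x, initial_state=bad_state)
except ValueError as e:
    print(e)  # expected to mention `cell.state_size`
```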
- bad_shape_a = unit_a * 2 - bad_shape_b = unit_b * 2 - cell = Minimal2DRNNCell(unit_a, unit_b) - x = keras.Input((None, input_a, input_b)) - s = keras.Input((bad_shape_a, bad_shape_b)) - layer = keras.layers.RNN(cell) - with self.assertRaisesWithPredicateMatch(ValueError, - 'however `cell.state_size` is'): - layer(x, initial_state=s) - - def test_inconsistent_output_state_size(self): - batch = 32 - time_step = 4 - state_size = 5 - input_size = 6 - cell = PlusOneRNNCell(state_size) - x = keras.Input((None, input_size)) - layer = keras.layers.RNN(cell) - y = layer(x) - - self.assertEqual(cell.state_size, state_size) - if not tf.executing_eagerly(): - init_state = layer.get_initial_state(x) - self.assertEqual(len(init_state), 1) - self.assertEqual(init_state[0].shape.as_list(), [None, state_size]) - - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, time_step, input_size)), - np.zeros((batch, input_size))) - self.assertEqual(model.output_shape, (None, input_size)) - - def test_get_initial_state(self): - cell = keras.layers.SimpleRNNCell(5) - with self.assertRaisesRegex(ValueError, - 'batch_size and dtype cannot be None'): - cell.get_initial_state(None, None, None) - - if not tf.executing_eagerly(): - inputs = keras.Input((None, 10)) - initial_state = cell.get_initial_state(inputs, None, None) - self.assertEqual(initial_state.shape.as_list(), [None, 5]) - self.assertEqual(initial_state.dtype, inputs.dtype) - - batch = tf.shape(inputs)[0] - dtype = inputs.dtype - initial_state = cell.get_initial_state(None, batch, dtype) - self.assertEqual(initial_state.shape.as_list(), [None, 5]) - self.assertEqual(initial_state.dtype, inputs.dtype) - else: - batch = 8 - inputs = np.random.random((batch, 10)) - initial_state = cell.get_initial_state(inputs, None, None) - self.assertEqual(initial_state.shape.as_list(), [8, 5]) - self.assertEqual(initial_state.dtype, inputs.dtype) - - dtype = inputs.dtype - initial_state = cell.get_initial_state(None, batch, dtype) - self.assertEqual(initial_state.shape.as_list(), [batch, 5]) - self.assertEqual(initial_state.dtype, inputs.dtype) - - @parameterized.parameters([True, False]) - def test_nested_input_output(self, stateful): - batch = 10 - t = 5 - i1, i2, i3 = 3, 4, 5 - o1, o2, o3 = 2, 3, 4 - - cell = NestedCell(o1, o2, o3) - rnn = keras.layers.RNN(cell, stateful=stateful) - - batch_size = batch if stateful else None - input_1 = keras.Input((t, i1), batch_size=batch_size) - input_2 = keras.Input((t, i2, i3), batch_size=batch_size) - - outputs = rnn((input_1, input_2)) - - self.assertEqual(len(outputs), 2) - self.assertEqual(outputs[0].shape.as_list(), [batch_size, o1]) - self.assertEqual(outputs[1].shape.as_list(), [batch_size, o2, o3]) - - model = keras.models.Model((input_1, input_2), outputs) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))], - [np.zeros((batch, o1)), np.zeros((batch, o2, o3))]) - self.assertEqual(model.output_shape, [(batch_size, o1), - (batch_size, o2, o3)]) - - cell = NestedCell(o1, o2, o3, use_tuple=True) - - rnn = keras.layers.RNN(cell, stateful=stateful) - - input_1 = keras.Input((t, i1), batch_size=batch_size) - input_2 = keras.Input((t, i2, i3), batch_size=batch_size) - - outputs = rnn(NestedInput(t1=input_1, t2=input_2)) - - self.assertEqual(len(outputs), 2) - 
self.assertEqual(outputs[0].shape.as_list(), [batch_size, o1]) - self.assertEqual(outputs[1].shape.as_list(), [batch_size, o2, o3]) - - model = keras.models.Model([input_1, input_2], outputs) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), - np.zeros((batch, t, i2, i3))], - [np.zeros((batch, o1)), np.zeros((batch, o2, o3))]) - self.assertEqual(model.output_shape, [(batch_size, o1), - (batch_size, o2, o3)]) - - def test_nested_input_output_with_state(self): - batch = 10 - t = 5 - i1, i2, i3 = 3, 4, 5 - o1, o2, o3 = 2, 3, 4 - - cell = NestedCell(o1, o2, o3) - rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True) - - input_1 = keras.Input((t, i1)) - input_2 = keras.Input((t, i2, i3)) - - output1, output2, s1, s2 = rnn((input_1, input_2)) - - self.assertEqual(output1.shape.as_list(), [None, t, o1]) - self.assertEqual(output2.shape.as_list(), [None, t, o2, o3]) - self.assertEqual(s1.shape.as_list(), [None, o1]) - self.assertEqual(s2.shape.as_list(), [None, o2, o3]) - - model = keras.models.Model([input_1, input_2], [output1, output2]) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), - np.zeros((batch, t, i2, i3))], - [np.zeros((batch, t, o1)), - np.zeros((batch, t, o2, o3))]) - self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)]) - - cell = NestedCell(o1, o2, o3, use_tuple=True) - - rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True) - - input_1 = keras.Input((t, i1)) - input_2 = keras.Input((t, i2, i3)) - - output1, output2, s1, s2 = rnn(NestedInput(t1=input_1, t2=input_2)) - - self.assertEqual(output1.shape.as_list(), [None, t, o1]) - self.assertEqual(output2.shape.as_list(), [None, t, o2, o3]) - self.assertEqual(s1.shape.as_list(), [None, o1]) - self.assertEqual(s2.shape.as_list(), [None, o2, o3]) - - model = keras.models.Model([input_1, input_2], [output1, output2]) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), - np.zeros((batch, t, i2, i3))], - [np.zeros((batch, t, o1)), - np.zeros((batch, t, o2, o3))]) - self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)]) - - def test_nest_input_output_with_init_state(self): - batch = 10 - t = 5 - i1, i2, i3 = 3, 4, 5 - o1, o2, o3 = 2, 3, 4 - - cell = NestedCell(o1, o2, o3) - rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True) - - input_1 = keras.Input((t, i1)) - input_2 = keras.Input((t, i2, i3)) - init_s1 = keras.Input((o1,)) - init_s2 = keras.Input((o2, o3)) - - output1, output2, s1, s2 = rnn((input_1, input_2), - initial_state=(init_s1, init_s2)) - - self.assertEqual(output1.shape.as_list(), [None, t, o1]) - self.assertEqual(output2.shape.as_list(), [None, t, o2, o3]) - self.assertEqual(s1.shape.as_list(), [None, o1]) - self.assertEqual(s2.shape.as_list(), [None, o2, o3]) - - model = keras.models.Model([input_1, input_2, init_s1, init_s2], - [output1, output2]) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), - np.zeros((batch, t, i2, i3)), - np.zeros((batch, o1)), - np.zeros((batch, o2, o3))], - [np.zeros((batch, t, o1)), - np.zeros((batch, t, o2, o3))]) - self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)]) - - cell = 
NestedCell(o1, o2, o3, use_tuple=True) - - rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True) - - input_1 = keras.Input((t, i1)) - input_2 = keras.Input((t, i2, i3)) - init_s1 = keras.Input((o1,)) - init_s2 = keras.Input((o2, o3)) - init_state = NestedState(s1=init_s1, s2=init_s2) - - output1, output2, s1, s2 = rnn(NestedInput(t1=input_1, t2=input_2), - initial_state=init_state) - - self.assertEqual(output1.shape.as_list(), [None, t, o1]) - self.assertEqual(output2.shape.as_list(), [None, t, o2, o3]) - self.assertEqual(s1.shape.as_list(), [None, o1]) - self.assertEqual(s2.shape.as_list(), [None, o2, o3]) - - model = keras.models.Model([input_1, input_2, init_s1, init_s2], - [output1, output2]) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - [np.zeros((batch, t, i1)), - np.zeros((batch, t, i2, i3)), - np.zeros((batch, o1)), - np.zeros((batch, o2, o3))], - [np.zeros((batch, t, o1)), - np.zeros((batch, t, o2, o3))]) - self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)]) - - def test_masking_rnn_with_output_and_states(self): - - class Cell(keras.layers.Layer): - - def __init__(self): - self.state_size = None - self.output_size = None - super().__init__() - - def build(self, input_shape): - self.state_size = input_shape[-1] - self.output_size = input_shape[-1] - - def call(self, inputs, states): - return inputs, [s + 1 for s in states] - - x = keras.Input((3, 1), name='x') - x_masked = keras.layers.Masking()(x) - s_0 = keras.Input((1,), name='s_0') - y, s = keras.layers.RNN( - Cell(), return_state=True)(x_masked, initial_state=s_0) - model = keras.models.Model([x, s_0], [y, s]) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - # the last time step is masked - x_np = np.array([[[1.], [2.], [0.]]]) - s_0_np = np.array([[10.]]) - y_np, s_np = model.predict([x_np, s_0_np]) - - # 1 is added to the initial state two times - self.assertAllClose(s_np, s_0_np + 2) - # Expect the last output to be the same as the last output before masking - self.assertAllClose(y_np, x_np[:, 1, :]) - - def test_zero_output_for_masking(self): - - for unroll in [True, False]: - cell = keras.layers.SimpleRNNCell(5) - x = keras.Input((5, 5)) - mask = keras.layers.Masking() - layer = keras.layers.RNN( - cell, return_sequences=True, zero_output_for_mask=True, unroll=unroll) - masked_input = mask(x) - y = layer(masked_input) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - np_x = np.ones((6, 5, 5)) - result_1 = model.predict(np_x) - - # set time steps 4 and 5 of the last record to zero (masked). - np_x[5, 3:] = 0 - result_2 = model.predict(np_x) - - # expect result_2 to have the same output, except at time steps 4 and 5 - # of the last record.
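A condensed sketch of what `zero_output_for_mask` buys, assuming tf.keras and the default `Masking` mask value of zero: fully masked timesteps come back as all-zero rows rather than repeating the last valid output.

```python
import numpy as np
import tensorflow as tf

# With a Masking layer in front and zero_output_for_mask=True,
# fully-masked timesteps yield all-zero output rows instead of
# carrying the last unmasked output forward.
inputs = tf.keras.Input((5, 5))
masked = tf.keras.layers.Masking()(inputs)
rnn = tf.keras.layers.RNN(
    tf.keras.layers.SimpleRNNCell(5),
    return_sequences=True,
    zero_output_for_mask=True,
)
model = tf.keras.Model(inputs, rnn(masked))

x = np.ones((1, 5, 5))
x[0, 3:] = 0.0  # mask out the last two timesteps
y = model.predict(x)
print(y[0, 3:])  # expected: all zeros for the masked steps
```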
- result_1[5, 3:] = 0 - self.assertAllClose(result_1, result_2) - - def test_unroll_single_step(self): - """Even if the time dimension is only one, we should be able to unroll.""" - cell = keras.layers.SimpleRNNCell(5) - x = keras.Input((1, 5)) - layer = keras.layers.RNN(cell, return_sequences=True, unroll=True) - y = layer(x) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - np_x = np.ones((6, 1, 5)) - result = model.predict(np_x) - self.assertEqual((6, 1, 5), result.shape) - - def test_unroll_zero_step(self): - """If the time dimension is None, we should fail to unroll.""" - cell = keras.layers.SimpleRNNCell(5) - x = keras.Input((None, 5)) - layer = keras.layers.RNN(cell, return_sequences=True, unroll=True) - with self.assertRaisesRegex(ValueError, 'Cannot unroll a RNN.*'): - layer(x) - - def test_full_input_spec(self): - # See https://github.com/tensorflow/tensorflow/issues/25985 - inputs = keras.layers.Input(batch_shape=(1, 1, 1)) - state_h = keras.layers.Input(batch_shape=(1, 1)) - state_c = keras.layers.Input(batch_shape=(1, 1)) - states = [state_h, state_c] - decoder_out = keras.layers.LSTM(1, stateful=True)( - inputs, - initial_state=states - ) - model = keras.Model([inputs, state_h, state_c], decoder_out) - output1 = model.predict( - [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))]) - output2 = model.predict( - [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))]) - model.reset_states() - output3 = model.predict( - [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))]) - self.assertAllClose(output1, output3) - self.assertNotAllClose(output1, output2) - - def test_reset_states(self): - # See https://github.com/tensorflow/tensorflow/issues/25852 - with self.assertRaisesRegex(ValueError, 'it needs to know its batch size'): - simple_rnn = keras.layers.SimpleRNN(1, stateful=True) - simple_rnn.reset_states() - - with self.assertRaisesRegex(ValueError, 'it needs to know its batch size'): - cell = Minimal2DRNNCell(1, 2) - custom_rnn = keras.layers.RNN(cell, stateful=True) - custom_rnn.reset_states() - - @parameterized.parameters( - [keras.layers.SimpleRNNCell, keras.layers.GRUCell, keras.layers.LSTMCell]) - def test_stateful_rnn_with_stacking(self, cell): - # See https://github.com/tensorflow/tensorflow/issues/28614. - batch = 12 - timesteps = 10 - input_dim = 8 - output_dim = 64 - cells = [cell(32), cell(64)] - x = keras.Input(batch_shape=(batch, None, input_dim)) - layer = keras.layers.RNN(cells, stateful=True) - y = layer(x) - - model = keras.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, timesteps, input_dim)), - np.zeros((batch, output_dim))) - model.predict(np.ones((batch, timesteps, input_dim))) - - model.reset_states() - model.predict(np.ones((batch, timesteps, input_dim))) - - new_states = tf.nest.map_structure(lambda s: np.ones((batch, s)), - layer.cell.state_size) - layer.reset_states(new_states) - model.predict(np.ones((batch, timesteps, input_dim))) - - def test_stateful_rnn_with_initial_state(self): - # See https://github.com/tensorflow/tensorflow/issues/32299. 
- batch = 12 - timesteps = 1 - input_dim = 8 - output_dim = 16 - - test_inputs = np.full((batch, timesteps, input_dim), 0.5) - - def make_model(stateful=False, with_initial_state=False): - input_layer = keras.Input(shape=(None, input_dim), batch_size=batch) - if with_initial_state: - initial_states = keras.backend.constant(np.ones((batch, output_dim))) - else: - initial_states = None - rnn_output = keras.layers.GRU( - units=output_dim, return_sequences=True, stateful=stateful)( - input_layer, initial_state=initial_states) - model = keras.Model(input_layer, rnn_output) - model.compile( - optimizer='rmsprop', loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - return model - - # Define a model with a constant state initialization - model = make_model(stateful=True, with_initial_state=True) - layer_weights = model.layers[1].get_weights() - - model.reset_states() - predict_1 = model.predict(test_inputs) - predict_2 = model.predict(test_inputs) - - model.reset_states() - predict_3 = model.predict(test_inputs) - - # predict 1 and 2 should be different since the batch 2 should use the state - # from batch 1 as the initial state. - self.assertNotAllClose(predict_1, predict_2) - self.assertAllClose(predict_1, predict_3) - - # Create a new model with same weights but without initial states. Make sure - # the predict value is different from the model with non-zero initial state. - model_2 = make_model(stateful=True, with_initial_state=False) - model_2.layers[1].set_weights(layer_weights) - - model_2.reset_states() - predict_4 = model_2.predict(test_inputs) - predict_5 = model_2.predict(test_inputs) - self.assertNotAllClose(predict_1, predict_4) - self.assertNotAllClose(predict_4, predict_5) - - # Create models with stateful=False, and make sure they handle init state - # correctly. - model_3 = make_model(stateful=False, with_initial_state=True) - model_3.layers[1].set_weights(layer_weights) - - model_3.reset_states() - predict_6 = model_3.predict(test_inputs) - predict_7 = model_3.predict(test_inputs) - self.assertAllClose(predict_1, predict_6) - self.assertAllClose(predict_6, predict_7) - - def test_stateful_rnn_with_customized_get_initial_state(self): - - class TestCell(keras.layers.AbstractRNNCell): - - state_size = 1 - output_size = 2 - - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - return np.ones((batch_size, 1), dtype=dtype) - - def call(self, inputs, states): - return inputs, states - - layer = keras.layers.RNN(TestCell(), stateful=True, return_state=True) - inputs = keras.Input(shape=(10, 2), batch_size=4) - model = keras.Model(inputs, layer(inputs)) - x = np.ones((4, 10, 2), dtype=np.float32) - output, state = model.predict(x) - self.assertAllClose(output, np.ones((4, 2))) - self.assertAllClose(state, np.ones((4, 1))) - - def test_input_dim_length(self): - simple_rnn = keras.layers.SimpleRNN(5, input_length=10, input_dim=8) - self.assertEqual(simple_rnn._batch_input_shape, (None, 10, 8)) - - simple_rnn = keras.layers.SimpleRNN(5, input_dim=8) - self.assertEqual(simple_rnn._batch_input_shape, (None, None, 8)) - - simple_rnn = keras.layers.SimpleRNN(5, input_length=10) - self.assertEqual(simple_rnn._batch_input_shape, (None, 10, None)) - - @parameterized.parameters( - [keras.layers.SimpleRNNCell, keras.layers.GRUCell, keras.layers.LSTMCell]) - def test_state_spec_with_stack_cell(self, cell): - # See https://github.com/tensorflow/tensorflow/issues/27817 for more detail. 
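The test below exercises the encoder/decoder pattern from that issue: the states returned by an RNN built on a stack of cells are fed as `initial_state` to a second, identically-shaped stacked RNN. A minimal sketch of the pattern, assuming tf.keras; the cell sizes here are arbitrary:

```python
import numpy as np
from tensorflow import keras

# One state tensor per stacked cell comes back from the encoder; the
# decoder's stack has matching state sizes, so the states can be
# passed straight through as its initial_state.
def make_cells():
    return [keras.layers.GRUCell(8) for _ in range(3)]

inputs = keras.Input((10, 8))
encoder_out = keras.layers.RNN(make_cells(), return_state=True)(inputs)
states = encoder_out[1:]  # one state tensor per stacked cell
decoded = keras.layers.RNN(make_cells())(inputs, initial_state=states)
model = keras.Model(inputs, decoded)
model.predict(np.ones((2, 10, 8)))
```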
- batch = 12 - timesteps = 10 - input_dim = 8 - output_dim = 8 - - def create_cell(): - return [cell(output_dim), - cell(output_dim), - cell(output_dim)] - - inputs = keras.Input((timesteps, input_dim)) - encoder_output = keras.layers.RNN(create_cell(), return_state=True)(inputs) - - states = encoder_output[1:] - - decoder_output = keras.layers.RNN( - create_cell())(inputs, initial_state=states) - - model = keras.models.Model(inputs, decoder_output) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, timesteps, input_dim)), - np.zeros((batch, output_dim))) - model.predict(np.ones((batch, timesteps, input_dim))) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name(layer=[ - keras.layers.SimpleRNN, gru_v1.GRU, lstm_v1.LSTM, gru.GRU, lstm.LSTM - ])) - def test_rnn_with_ragged_input(self, layer): - ragged_data = tf.ragged.constant( - [[[1., 1., 1., 1., 1.], [1., 2., 3., 1., 1.]], - [[2., 4., 1., 3., 1.]], - [[2., 3., 4., 1., 5.], [2., 3., 1., 1., 1.], [1., 2., 3., 4., 5.]]], - ragged_rank=1) - label_data = np.array([[1, 0, 1], [1, 1, 0], [0, 0, 1]]) - - # Test results in feed forward - np.random.seed(100) - rnn_layer = layer(4, activation='sigmoid') - - x_ragged = keras.Input(shape=(None, 5), ragged=True) - y_ragged = rnn_layer(x_ragged) - model = keras.models.Model(x_ragged, y_ragged) - output_ragged = model.predict(ragged_data, steps=1) - - x_dense = keras.Input(shape=(3, 5)) - masking = keras.layers.Masking()(x_dense) - y_dense = rnn_layer(masking) - model_2 = keras.models.Model(x_dense, y_dense) - dense_data = ragged_data.to_tensor() - output_dense = model_2.predict(dense_data, steps=1) - - self.assertAllClose(output_dense, output_ragged) - - # Test results with go backwards - np.random.seed(200) - back_rnn_layer = layer(8, go_backwards=True, activation='sigmoid') - - x_ragged = keras.Input(shape=(None, 5), ragged=True) - y_ragged = back_rnn_layer(x_ragged) - model = keras.models.Model(x_ragged, y_ragged) - output_ragged = model.predict(ragged_data, steps=1) - - x_dense = keras.Input(shape=(3, 5)) - masking = keras.layers.Masking()(x_dense) - y_dense = back_rnn_layer(masking) - model_2 = keras.models.Model(x_dense, y_dense) - dense_data = ragged_data.to_tensor() - output_dense = model_2.predict(dense_data, steps=1) - - self.assertAllClose(output_dense, output_ragged) - - # Test densification of the ragged input - dense_tensor, row_lengths = keras.backend.convert_inputs_if_ragged( - ragged_data) - self.assertAllClose(dense_data, dense_tensor) - - # Test optional params, all should work except unrolling - inputs = keras.Input(shape=(None, 5), dtype=tf.float32, ragged=True) - custom_rnn_layer = layer( - 3, zero_output_for_mask=True, dropout=0.1, use_bias=True) - outputs = custom_rnn_layer(inputs) - model = keras.models.Model(inputs, outputs) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(ragged_data, label_data) - - # Test stateful and full shape specification - inputs = keras.Input( - shape=(None, 5), batch_size=3, dtype=tf.float32, ragged=True) - stateful_rnn_layer = layer(3, stateful=True) - outputs = stateful_rnn_layer(inputs) - model = keras.models.Model(inputs, outputs) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(ragged_data, label_data) - - # Must raise error when unroll is set to True - unroll_rnn_layer = 
layer(3, unroll=True) - with self.assertRaisesRegex(ValueError, - 'The input received contains RaggedTensors *'): - unroll_rnn_layer(inputs) - - # Check that the return_sequences outputs are correct - np.random.seed(100) - returning_rnn_layer = layer(4, return_sequences=True) - - x_ragged = keras.Input(shape=(None, 5), ragged=True) - y_ragged = returning_rnn_layer(x_ragged) - model = keras.models.Model(x_ragged, y_ragged) - output_ragged = model.predict(ragged_data, steps=1) - self.assertAllClose(output_ragged.ragged_rank, ragged_data.ragged_rank) - self.assertAllClose(output_ragged.row_splits, ragged_data.row_splits) - - x_dense = keras.Input(shape=(3, 5)) - masking = keras.layers.Masking()(x_dense) - y_dense = returning_rnn_layer(masking) - model_2 = keras.models.Model(x_dense, y_dense) - dense_data = ragged_data.to_tensor() - output_dense = model_2.predict(dense_data, steps=1) - # Convert the output here to ragged for value comparison - output_dense = tf.RaggedTensor.from_tensor( - output_dense, lengths=row_lengths) - self.assertAllClose(output_ragged, output_dense) - - # Check that the return_sequences and go_backwards outputs are correct - np.random.seed(100) - returning_rnn_layer = layer(4, go_backwards=True, return_sequences=True) - - x_ragged = keras.Input(shape=(None, 5), ragged=True) - y_ragged = returning_rnn_layer(x_ragged) - model = keras.models.Model(x_ragged, y_ragged) - output_ragged = model.predict(ragged_data, steps=1) - self.assertAllClose(output_ragged.ragged_rank, ragged_data.ragged_rank) - self.assertAllClose(output_ragged.row_splits, ragged_data.row_splits) - - x_dense = keras.Input(shape=(3, 5)) - masking = keras.layers.Masking()(x_dense) - y_dense = returning_rnn_layer(masking) - model_2 = keras.models.Model(x_dense, y_dense) - dense_data = ragged_data.to_tensor() - output_dense = model_2.predict(dense_data, steps=1) - - # Note that the raw output for dense and ragged input when go_backwards=True - # will be different. Consider the following input - # [[a, b, 0], [c, 0, 0], [d, e, f]] where 0s are masked values. - # The dense output will be [[0, b, a], [0, 0, c], [f, e, d]] since it will - # process the whole sequence from the end. - # While the ragged output will be [[b, a], [c], [f, e, d]] since it just - # ignores the 0s. And if we densify the ragged output, it will by default - # insert 0s at the end (rather than at the beginning), which makes the - # output [[b, a, 0], [c, 0, 0], [f, e, d]].
With this, we need to verify that - # reverse(ragged_output.to_tensor()) == reverse(dense_output) - output_dense = keras.backend.reverse(output_dense, [1]) - output_dense = tf.RaggedTensor.from_tensor( - output_dense, lengths=row_lengths) - - self.assertAllClose(keras.backend.reverse(output_ragged, [1]), output_dense) - - def test_stateless_rnn_cell(self): - - class StatelessCell(keras.layers.Layer): - - def __init__(self): - self.state_size = ((), [], ()) - self.output_size = None - super().__init__() - - def build(self, input_shape): - self.output_size = input_shape[-1] - - def call(self, inputs, states): - return inputs, states - - x = keras.Input((None, 5)) - cell = StatelessCell() - initial_state = tf.nest.map_structure(lambda t: None, cell.state_size) - layer = keras.layers.RNN(cell) - y = layer(x, initial_state=initial_state) - model = keras.models.Model(x, y) - model.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 5))) - - @parameterized.parameters( - [keras.layers.SimpleRNN, gru_v1.GRU, lstm_v1.LSTM, gru.GRU, lstm.LSTM]) - def test_for_enable_caching_device_for_layer(self, layer_cls): - expected_caching_device = tf.compat.v1.executing_eagerly_outside_functions() - layer = layer_cls(1) - self.assertEqual(layer.cell._enable_caching_device, expected_caching_device) - - # Make sure the config only appears when a non-default value is used. - config = layer.get_config() - self.assertNotIn('enable_caching_device', config) - - non_default_value = not expected_caching_device - layer = layer_cls(1, enable_caching_device=non_default_value) - self.assertEqual(layer.cell._enable_caching_device, non_default_value) - config = layer.get_config() - self.assertEqual(config['enable_caching_device'], non_default_value) - - @parameterized.parameters( - [keras.layers.SimpleRNNCell, gru_v1.GRUCell, lstm_v1.LSTMCell, - gru.GRUCell, lstm.LSTMCell]) - def test_for_enable_caching_device_for_cell(self, cell_cls): - expected_caching_device = tf.compat.v1.executing_eagerly_outside_functions() - cell = cell_cls(1) - self.assertEqual(cell._enable_caching_device, expected_caching_device) - - # Make sure the config only appears when a non-default value is used.
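A compact restatement of what these two tests assert, assuming an eager TF 2.x script and layers that accept `enable_caching_device` (the RNN layers parameterized above do):

```python
import tensorflow as tf
from tensorflow import keras

# The `enable_caching_device` entry shows up in get_config() only
# when a non-default value was passed in.
default = tf.compat.v1.executing_eagerly_outside_functions()

layer = keras.layers.LSTM(1)
assert "enable_caching_device" not in layer.get_config()

layer = keras.layers.LSTM(1, enable_caching_device=not default)
assert layer.get_config()["enable_caching_device"] == (not default)
```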
- config = cell.get_config() - self.assertNotIn('enable_caching_device', config) - - non_default_value = not expected_caching_device - cell = cell_cls(1, enable_caching_device=non_default_value) - self.assertEqual(cell._enable_caching_device, non_default_value) - config = cell.get_config() - self.assertEqual(config['enable_caching_device'], non_default_value) - - -class RNNCellWithConstants(keras.layers.Layer): - - def __init__(self, units, constant_size, **kwargs): - self.units = units - self.state_size = units - self.constant_size = constant_size - super().__init__(**kwargs) - - def build(self, input_shape): - self.input_kernel = self.add_weight( - shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.constant_kernel = self.add_weight( - shape=(self.constant_size, self.units), - initializer='uniform', - name='constant_kernel') - self.built = True - - def call(self, inputs, states, constants): - [prev_output] = states - [constant] = constants - h_input = keras.backend.dot(inputs, self.input_kernel) - h_state = keras.backend.dot(prev_output, self.recurrent_kernel) - h_const = keras.backend.dot(constant, self.constant_kernel) - output = h_input + h_state + h_const - return output, [output] - - def get_config(self): - config = {'units': self.units, 'constant_size': self.constant_size} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class Minimal2DRNNCell(keras.layers.Layer): - """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell. - - Both internal state and output have 2 dimensions and are orthogonal - between each other. - """ - - def __init__(self, unit_a, unit_b, **kwargs): - self.unit_a = unit_a - self.unit_b = unit_b - self.state_size = tf.TensorShape([unit_a, unit_b]) - self.output_size = tf.TensorShape([unit_a, unit_b]) - super().__init__(**kwargs) - - def build(self, input_shape): - input_a = input_shape[-2] - input_b = input_shape[-1] - self.kernel = self.add_weight( - shape=(input_a, input_b, self.unit_a, self.unit_b), - initializer='uniform', - name='kernel') - self.recurring_kernel = self.add_weight( - shape=(self.unit_a, self.unit_b, self.unit_a, self.unit_b), - initializer='uniform', - name='recurring_kernel') - self.bias = self.add_weight( - shape=(self.unit_a, self.unit_b), initializer='uniform', name='bias') - self.built = True - - def call(self, inputs, states): - prev_output = states[0] - h = tf.einsum('bij,ijkl->bkl', inputs, self.kernel) - h += tf.expand_dims(self.bias, axis=0) - output = h + tf.einsum('bij,ijkl->bkl', prev_output, self.recurring_kernel) - return output, [output] - class PlusOneRNNCell(keras.layers.Layer): - """Add one to the input and state. + """Add one to the input and state. - This cell is used for testing state_size and output_size. - """ + This cell is used for testing state_size and output_size. 
+ """ - def __init__(self, num_unit, **kwargs): - self.state_size = num_unit - super().__init__(**kwargs) + def __init__(self, num_unit, **kwargs): + self.state_size = num_unit + super().__init__(**kwargs) - def build(self, input_shape): - self.output_size = input_shape[-1] + def build(self, input_shape): + self.output_size = input_shape[-1] - def call(self, inputs, states): - return inputs + 1, [states[0] + 1] + def call(self, inputs, states): + return inputs + 1, [states[0] + 1] class NestedCell(keras.layers.Layer): - - def __init__(self, unit_1, unit_2, unit_3, use_tuple=False, **kwargs): - self.unit_1 = unit_1 - self.unit_2 = unit_2 - self.unit_3 = unit_3 - self.use_tuple = use_tuple - super().__init__(**kwargs) - # A nested state. - if use_tuple: - self.state_size = NestedState( - s1=unit_1, s2=tf.TensorShape([unit_2, unit_3])) - else: - self.state_size = (unit_1, tf.TensorShape([unit_2, unit_3])) - self.output_size = (unit_1, tf.TensorShape([unit_2, unit_3])) - - def build(self, inputs_shape): - # expect input_shape to contain 2 items, [(batch, i1), (batch, i2, i3)] - if self.use_tuple: - input_1 = inputs_shape.t1[1] - input_2, input_3 = inputs_shape.t2[1:] - else: - input_1 = inputs_shape[0][1] - input_2, input_3 = inputs_shape[1][1:] - - self.kernel_1 = self.add_weight( - shape=(input_1, self.unit_1), initializer='uniform', name='kernel_1') - self.kernel_2_3 = self.add_weight( - shape=(input_2, input_3, self.unit_2, self.unit_3), - initializer='uniform', - name='kernel_2_3') - - def call(self, inputs, states): - # inputs should be in [(batch, input_1), (batch, input_2, input_3)] - # state should be in shape [(batch, unit_1), (batch, unit_2, unit_3)] - flatten_inputs = tf.nest.flatten(inputs) - s1, s2 = states - - output_1 = tf.matmul(flatten_inputs[0], self.kernel_1) - output_2_3 = tf.einsum('bij,ijkl->bkl', flatten_inputs[1], self.kernel_2_3) - state_1 = s1 + output_1 - state_2_3 = s2 + output_2_3 - - output = [output_1, output_2_3] - new_states = NestedState(s1=state_1, s2=state_2_3) - - return output, new_states - - -if __name__ == '__main__': - tf.test.main() + def __init__(self, unit_1, unit_2, unit_3, use_tuple=False, **kwargs): + self.unit_1 = unit_1 + self.unit_2 = unit_2 + self.unit_3 = unit_3 + self.use_tuple = use_tuple + super().__init__(**kwargs) + # A nested state. 
+ if use_tuple: + self.state_size = NestedState( + s1=unit_1, s2=tf.TensorShape([unit_2, unit_3]) + ) + else: + self.state_size = (unit_1, tf.TensorShape([unit_2, unit_3])) + self.output_size = (unit_1, tf.TensorShape([unit_2, unit_3])) + + def build(self, inputs_shape): + # expect input_shape to contain 2 items, [(batch, i1), (batch, i2, i3)] + if self.use_tuple: + input_1 = inputs_shape.t1[1] + input_2, input_3 = inputs_shape.t2[1:] + else: + input_1 = inputs_shape[0][1] + input_2, input_3 = inputs_shape[1][1:] + + self.kernel_1 = self.add_weight( + shape=(input_1, self.unit_1), initializer="uniform", name="kernel_1" + ) + self.kernel_2_3 = self.add_weight( + shape=(input_2, input_3, self.unit_2, self.unit_3), + initializer="uniform", + name="kernel_2_3", + ) + + def call(self, inputs, states): + # inputs should be in [(batch, input_1), (batch, input_2, input_3)] + # state should be in shape [(batch, unit_1), (batch, unit_2, unit_3)] + flatten_inputs = tf.nest.flatten(inputs) + s1, s2 = states + + output_1 = tf.matmul(flatten_inputs[0], self.kernel_1) + output_2_3 = tf.einsum( + "bij,ijkl->bkl", flatten_inputs[1], self.kernel_2_3 + ) + state_1 = s1 + output_1 + state_2_3 = s2 + output_2_3 + + output = [output_1, output_2_3] + new_states = NestedState(s1=state_1, s2=state_2_3) + + return output, new_states + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/base_wrapper.py b/keras/layers/rnn/base_wrapper.py index 24c40007f76c..6058d85fa59b 100644 --- a/keras/layers/rnn/base_wrapper.py +++ b/keras/layers/rnn/base_wrapper.py @@ -16,56 +16,77 @@ Wrappers are layers that augment the functionality of another layer. """ -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import copy from keras.engine.base_layer import Layer -from keras.utils import generic_utils +from keras.saving import serialization_lib +from keras.saving.legacy import serialization as legacy_serialization +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Wrapper') +@keras_export("keras.layers.Wrapper") class Wrapper(Layer): - """Abstract wrapper base class. + """Abstract wrapper base class. + + Wrappers take another layer and augment it in various ways. + Do not use this class as a layer, it is only an abstract base class. + Two usable wrappers are the `TimeDistributed` and `Bidirectional` wrappers. - Wrappers take another layer and augment it in various ways. - Do not use this class as a layer, it is only an abstract base class. - Two usable wrappers are the `TimeDistributed` and `Bidirectional` wrappers. + Args: + layer: The layer to be wrapped. + """ - Args: - layer: The layer to be wrapped. - """ + def __init__(self, layer, **kwargs): + try: + assert isinstance(layer, Layer) + except Exception: + raise ValueError( + f"Layer {layer} supplied to wrapper is" + " not a supported layer type. Please" + " ensure wrapped layer is a valid Keras layer." 
+ ) + self.layer = layer + super().__init__(**kwargs) - def __init__(self, layer, **kwargs): - assert isinstance(layer, Layer) - self.layer = layer - super().__init__(**kwargs) + def build(self, input_shape=None): + if not self.layer.built: + self.layer.build(input_shape) + self.layer.built = True + self.built = True - def build(self, input_shape=None): - if not self.layer.built: - self.layer.build(input_shape) - self.layer.built = True - self.built = True + @property + def activity_regularizer(self): + if hasattr(self.layer, "activity_regularizer"): + return self.layer.activity_regularizer + else: + return None - @property - def activity_regularizer(self): - if hasattr(self.layer, 'activity_regularizer'): - return self.layer.activity_regularizer - else: - return None + def get_config(self): + try: + config = { + "layer": serialization_lib.serialize_keras_object(self.layer) + } + except TypeError: # Case of incompatible custom wrappers + config = { + "layer": legacy_serialization.serialize_keras_object(self.layer) + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - def get_config(self): - config = {'layer': generic_utils.serialize_keras_object(self.layer)} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + @classmethod + def from_config(cls, config, custom_objects=None): + from keras.layers import deserialize as deserialize_layer - @classmethod - def from_config(cls, config, custom_objects=None): - from keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top - # Avoid mutating the input dict - config = copy.deepcopy(config) - layer = deserialize_layer( - config.pop('layer'), custom_objects=custom_objects) - return cls(layer, **config) + # Avoid mutating the input dict + config = copy.deepcopy(config) + use_legacy_format = "module" not in config + layer = deserialize_layer( + config.pop("layer"), + custom_objects=custom_objects, + use_legacy_format=use_legacy_format, + ) + return cls(layer, **config) diff --git a/keras/layers/rnn/base_wrapper_test.py b/keras/layers/rnn/base_wrapper_test.py index d7d5cbf2f4aa..cd019a5f77a0 100644 --- a/keras/layers/rnn/base_wrapper_test.py +++ b/keras/layers/rnn/base_wrapper_test.py @@ -14,31 +14,31 @@ # ============================================================================== """Tests for the Wrapper base class.""" +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras -import tensorflow.compat.v2 as tf class ExampleWrapper(keras.layers.Wrapper): - """Simple Wrapper subclass.""" + """Simple Wrapper subclass.""" - def call(self, inputs, *args, **kwargs): - return self.layer(inputs, *args, **kwargs) + def call(self, inputs, *args, **kwargs): + return self.layer(inputs, *args, **kwargs) class WrapperTest(parameterized.TestCase): + def test_wrapper_from_config_no_mutation(self): + wrapper = ExampleWrapper(keras.layers.Dense(1)) + config = wrapper.get_config() + config_copy = config.copy() + self.assertEqual(config, config_copy) - def test_wrapper_from_config_no_mutation(self): - wrapper = ExampleWrapper(keras.layers.Dense(1)) - config = wrapper.get_config() - config_copy = config.copy() - self.assertEqual(config, config_copy) - - wrapper_from_config = ExampleWrapper.from_config(config) - new_config = wrapper_from_config.get_config() - self.assertEqual(new_config, config) - self.assertEqual(new_config, config_copy) + wrapper_from_config = ExampleWrapper.from_config(config) + 
new_config = wrapper_from_config.get_config() + self.assertEqual(new_config, config) + self.assertEqual(new_config, config_copy) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/bidirectional.py b/keras/layers/rnn/bidirectional.py index 70c32d2e0692..3a2d30536b42 100644 --- a/keras/layers/rnn/bidirectional.py +++ b/keras/layers/rnn/bidirectional.py @@ -13,455 +13,521 @@ # limitations under the License. # ============================================================================== """Bidirectional wrapper for RNNs.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import copy +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine.base_layer import Layer from keras.engine.input_spec import InputSpec from keras.layers.rnn import rnn_utils from keras.layers.rnn.base_wrapper import Wrapper +from keras.saving import serialization_lib from keras.utils import generic_utils from keras.utils import tf_inspect from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.Bidirectional') +@keras_export("keras.layers.Bidirectional") class Bidirectional(Wrapper): - """Bidirectional wrapper for RNNs. - - Args: - layer: `keras.layers.RNN` instance, such as `keras.layers.LSTM` or - `keras.layers.GRU`. It could also be a `keras.layers.Layer` instance - that meets the following criteria: - 1. Be a sequence-processing layer (accepts 3D+ inputs). - 2. Have a `go_backwards`, `return_sequences` and `return_state` - attribute (with the same semantics as for the `RNN` class). - 3. Have an `input_spec` attribute. - 4. Implement serialization via `get_config()` and `from_config()`. - Note that the recommended way to create new RNN layers is to write a - custom RNN cell and use it with `keras.layers.RNN`, instead of - subclassing `keras.layers.Layer` directly. - - When the `returns_sequences` is true, the output of the masked timestep - will be zero regardless of the layer's original `zero_output_for_mask` - value. - merge_mode: Mode by which outputs of the forward and backward RNNs will be - combined. One of {'sum', 'mul', 'concat', 'ave', None}. If None, the - outputs will not be combined, they will be returned as a list. Default - value is 'concat'. - backward_layer: Optional `keras.layers.RNN`, or `keras.layers.Layer` - instance to be used to handle backwards input processing. - If `backward_layer` is not provided, the layer instance passed as the - `layer` argument will be used to generate the backward layer - automatically. - Note that the provided `backward_layer` layer should have properties - matching those of the `layer` argument, in particular it should have the - same values for `stateful`, `return_states`, `return_sequences`, etc. - In addition, `backward_layer` and `layer` should have different - `go_backwards` argument values. - A `ValueError` will be raised if these requirements are not met. - - Call arguments: - The call arguments for this layer are the same as those of the wrapped RNN - layer. - Beware that when passing the `initial_state` argument during the call of - this layer, the first half in the list of elements in the `initial_state` - list will be passed to the forward RNN call and the last half in the list - of elements will be passed to the backward RNN call. - - Raises: - ValueError: - 1. If `layer` or `backward_layer` is not a `Layer` instance. - 2. 
In case of invalid `merge_mode` argument. - 3. If `backward_layer` has mismatched properties compared to `layer`. - - Examples: - - ```python - model = Sequential() - model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5, 10))) - model.add(Bidirectional(LSTM(10))) - model.add(Dense(5)) - model.add(Activation('softmax')) - model.compile(loss='categorical_crossentropy', optimizer='rmsprop') - - # With custom backward layer - model = Sequential() - forward_layer = LSTM(10, return_sequences=True) - backward_layer = LSTM(10, activation='relu', return_sequences=True, - go_backwards=True) - model.add(Bidirectional(forward_layer, backward_layer=backward_layer, - input_shape=(5, 10))) - model.add(Dense(5)) - model.add(Activation('softmax')) - model.compile(loss='categorical_crossentropy', optimizer='rmsprop') - ``` - """ - - def __init__(self, - layer, - merge_mode='concat', - weights=None, - backward_layer=None, - **kwargs): - if not isinstance(layer, Layer): - raise ValueError( - 'Please initialize `Bidirectional` layer with a ' - f'`tf.keras.layers.Layer` instance. Received: {layer}') - if backward_layer is not None and not isinstance(backward_layer, Layer): - raise ValueError( - '`backward_layer` need to be a `tf.keras.layers.Layer` instance. ' - f'Received: {backward_layer}') - if merge_mode not in ['sum', 'mul', 'ave', 'concat', None]: - raise ValueError(f'Invalid merge mode. Received: {merge_mode}. ' - 'Merge mode should be one of ' - '{"sum", "mul", "ave", "concat", None}') - # We don't want to track `layer` since we're already tracking the two copies - # of it we actually run. - self._setattr_tracking = False - super().__init__(layer, **kwargs) - self._setattr_tracking = True - - # Recreate the forward layer from the original layer config, so that it will - # not carry over any state from the layer. - self.forward_layer = self._recreate_layer_from_config(layer) - - if backward_layer is None: - self.backward_layer = self._recreate_layer_from_config( - layer, go_backwards=True) - else: - self.backward_layer = backward_layer - # Keep the custom backward layer config, so that we can save it later. The - # layer's name might be updated below with prefix 'backward_', and we want - # to preserve the original config. - self._backward_layer_config = generic_utils.serialize_keras_object( - backward_layer) - - self.forward_layer._name = 'forward_' + self.forward_layer.name - self.backward_layer._name = 'backward_' + self.backward_layer.name - - self._verify_layer_config() - - def force_zero_output_for_mask(layer): - # Force the zero_output_for_mask to be True if returning sequences. 
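The helper above pins `zero_output_for_mask` to `return_sequences` on both inner copies, so masked timesteps contribute zeros from both directions and the merge stays well defined. A quick sketch of the resulting attribute state, assuming tf.keras:

```python
from tensorflow import keras

# After wrapping, both inner copies of a sequence-returning RNN have
# zero_output_for_mask forced to True (it tracks return_sequences).
bidi = keras.layers.Bidirectional(keras.layers.LSTM(4, return_sequences=True))
print(bidi.forward_layer.zero_output_for_mask)   # True
print(bidi.backward_layer.zero_output_for_mask)  # True
```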
- if getattr(layer, 'zero_output_for_mask', None) is not None: - layer.zero_output_for_mask = layer.return_sequences - - force_zero_output_for_mask(self.forward_layer) - force_zero_output_for_mask(self.backward_layer) - - self.merge_mode = merge_mode - if weights: - nw = len(weights) - self.forward_layer.initial_weights = weights[:nw // 2] - self.backward_layer.initial_weights = weights[nw // 2:] - self.stateful = layer.stateful - self.return_sequences = layer.return_sequences - self.return_state = layer.return_state - self.supports_masking = True - self._trainable = True - self._num_constants = 0 - self.input_spec = layer.input_spec - - @property - def _use_input_spec_as_call_signature(self): - return self.layer._use_input_spec_as_call_signature # pylint: disable=protected-access - - def _verify_layer_config(self): - """Ensure the forward and backward layers have valid common property.""" - if self.forward_layer.go_backwards == self.backward_layer.go_backwards: - raise ValueError( - 'Forward layer and backward layer should have different ' - '`go_backwards` value.' - f'forward_layer.go_backwards = {self.forward_layer.go_backwards},' - f'backward_layer.go_backwards = {self.backward_layer.go_backwards}') - - common_attributes = ('stateful', 'return_sequences', 'return_state') - for a in common_attributes: - forward_value = getattr(self.forward_layer, a) - backward_value = getattr(self.backward_layer, a) - if forward_value != backward_value: - raise ValueError( - 'Forward layer and backward layer are expected to have the same ' - f'value for attribute "{a}", got "{forward_value}" for forward ' - f'layer and "{backward_value}" for backward layer') - - def _recreate_layer_from_config(self, layer, go_backwards=False): - # When recreating the layer from its config, it is possible that the layer - # is a RNN layer that contains custom cells. In this case we inspect the - # layer and pass the custom cell class as part of the `custom_objects` - # argument when calling `from_config`. - # See https://github.com/tensorflow/tensorflow/issues/26581 for more detail. 
- config = layer.get_config() - if go_backwards: - config['go_backwards'] = not config['go_backwards'] - if 'custom_objects' in tf_inspect.getfullargspec( - layer.__class__.from_config).args: - custom_objects = {} - cell = getattr(layer, 'cell', None) - if cell is not None: - custom_objects[cell.__class__.__name__] = cell.__class__ - # For StackedRNNCells - stacked_cells = getattr(cell, 'cells', []) - for c in stacked_cells: - custom_objects[c.__class__.__name__] = c.__class__ - return layer.__class__.from_config(config, custom_objects=custom_objects) - else: - return layer.__class__.from_config(config) - - @tf_utils.shape_type_conversion - def compute_output_shape(self, input_shape): - output_shape = self.forward_layer.compute_output_shape(input_shape) - if self.return_state: - state_shape = tf_utils.convert_shapes(output_shape[1:], to_tuples=False) - output_shape = tf_utils.convert_shapes(output_shape[0], to_tuples=False) - else: - output_shape = tf_utils.convert_shapes(output_shape, to_tuples=False) - - if self.merge_mode == 'concat': - output_shape = output_shape.as_list() - output_shape[-1] *= 2 - output_shape = tf.TensorShape(output_shape) - elif self.merge_mode is None: - output_shape = [output_shape, copy.copy(output_shape)] - - if self.return_state: - if self.merge_mode is None: - return output_shape + state_shape + copy.copy(state_shape) - return [output_shape] + state_shape + copy.copy(state_shape) - return output_shape - - def __call__(self, inputs, initial_state=None, constants=None, **kwargs): - """`Bidirectional.__call__` implements the same API as the wrapped `RNN`.""" - inputs, initial_state, constants = rnn_utils.standardize_args( - inputs, initial_state, constants, self._num_constants) - - if isinstance(inputs, list): - if len(inputs) > 1: - initial_state = inputs[1:] - inputs = inputs[0] - - if initial_state is None and constants is None: - return super().__call__(inputs, **kwargs) - - # Applies the same workaround as in `RNN.__call__` - additional_inputs = [] - additional_specs = [] - if initial_state is not None: - # Check if `initial_state` can be split into half - num_states = len(initial_state) - if num_states % 2 > 0: - raise ValueError( - 'When passing `initial_state` to a Bidirectional RNN, ' - 'the state should be a list containing the states of ' - 'the underlying RNNs. 
' - f'Received: {initial_state}') - - kwargs['initial_state'] = initial_state - additional_inputs += initial_state - state_specs = tf.nest.map_structure( - lambda state: InputSpec(shape=backend.int_shape(state)), - initial_state) - self.forward_layer.state_spec = state_specs[:num_states // 2] - self.backward_layer.state_spec = state_specs[num_states // 2:] - additional_specs += state_specs - if constants is not None: - kwargs['constants'] = constants - additional_inputs += constants - constants_spec = [InputSpec(shape=backend.int_shape(constant)) - for constant in constants] - self.forward_layer.constants_spec = constants_spec - self.backward_layer.constants_spec = constants_spec - additional_specs += constants_spec - - self._num_constants = len(constants) - self.forward_layer._num_constants = self._num_constants - self.backward_layer._num_constants = self._num_constants - - is_keras_tensor = backend.is_keras_tensor( - tf.nest.flatten(additional_inputs)[0]) - for tensor in tf.nest.flatten(additional_inputs): - if backend.is_keras_tensor(tensor) != is_keras_tensor: - raise ValueError('The initial state of a Bidirectional' - ' layer cannot be specified with a mix of' - ' Keras tensors and non-Keras tensors' - ' (a "Keras tensor" is a tensor that was' - ' returned by a Keras layer, or by `Input`)') - - if is_keras_tensor: - # Compute the full input spec, including state - full_input = [inputs] + additional_inputs - # The original input_spec is None since there could be a nested tensor - # input. Update the input_spec to match the inputs. - full_input_spec = [None for _ in range(len(tf.nest.flatten(inputs))) - ] + additional_specs - # Removing kwargs since the value are passed with input list. - kwargs['initial_state'] = None - kwargs['constants'] = None - - # Perform the call with temporarily replaced input_spec - original_input_spec = self.input_spec - self.input_spec = full_input_spec - output = super().__call__(full_input, **kwargs) - self.input_spec = original_input_spec - return output - else: - return super().__call__(inputs, **kwargs) - - def call(self, - inputs, - training=None, - mask=None, - initial_state=None, - constants=None): - """`Bidirectional.call` implements the same API as the wrapped `RNN`.""" - kwargs = {} - if generic_utils.has_arg(self.layer.call, 'training'): - kwargs['training'] = training - if generic_utils.has_arg(self.layer.call, 'mask'): - kwargs['mask'] = mask - if generic_utils.has_arg(self.layer.call, 'constants'): - kwargs['constants'] = constants - - if generic_utils.has_arg(self.layer.call, 'initial_state'): - if isinstance(inputs, list) and len(inputs) > 1: - # initial_states are keras tensors, which means they are passed in - # together with inputs as list. The initial_states need to be split into - # forward and backward section, and be feed to layers accordingly. - forward_inputs = [inputs[0]] - backward_inputs = [inputs[0]] - pivot = (len(inputs) - self._num_constants) // 2 + 1 - # add forward initial state - forward_inputs += inputs[1:pivot] - if not self._num_constants: - # add backward initial state - backward_inputs += inputs[pivot:] + """Bidirectional wrapper for RNNs. + + Args: + layer: `keras.layers.RNN` instance, such as `keras.layers.LSTM` or + `keras.layers.GRU`. It could also be a `keras.layers.Layer` instance + that meets the following criteria: + 1. Be a sequence-processing layer (accepts 3D+ inputs). + 2. Have a `go_backwards`, `return_sequences` and `return_state` + attribute (with the same semantics as for the `RNN` class). + 3. 
Have an `input_spec` attribute.
+        4. Implement serialization via `get_config()` and `from_config()`.
+        Note that the recommended way to create new RNN layers is to write a
+        custom RNN cell and use it with `keras.layers.RNN`, instead of
+        subclassing `keras.layers.Layer` directly.
+        - When `return_sequences` is true, the output of the masked
+        timestep will be zero regardless of the layer's original
+        `zero_output_for_mask` value.
+      merge_mode: Mode by which outputs of the forward and backward RNNs will be
+        combined. One of {'sum', 'mul', 'concat', 'ave', None}. If None, the
+        outputs will not be combined, they will be returned as a list. Default
+        value is 'concat'.
+      backward_layer: Optional `keras.layers.RNN`, or `keras.layers.Layer`
+        instance to be used to handle backwards input processing.
+        If `backward_layer` is not provided, the layer instance passed as the
+        `layer` argument will be used to generate the backward layer
+        automatically.
+        Note that the provided `backward_layer` should have properties
+        matching those of the `layer` argument, in particular it should have the
+        same values for `stateful`, `return_state`, `return_sequences`, etc.
+        In addition, `backward_layer` and `layer` should have different
+        `go_backwards` argument values.
+        A `ValueError` will be raised if these requirements are not met.
+
+    Call arguments:
+      The call arguments for this layer are the same as those of the wrapped RNN
+      layer.
+      Beware that when passing the `initial_state` argument during the call of
+      this layer, the first half of the elements in the `initial_state` list
+      will be passed to the forward RNN call and the last half will be passed
+      to the backward RNN call.
+
+    Raises:
+      ValueError:
+        1. If `layer` or `backward_layer` is not a `Layer` instance.
+        2. In case of invalid `merge_mode` argument.
+        3. If `backward_layer` has mismatched properties compared to `layer`.
+
+    Examples:
+
+    ```python
+    model = Sequential()
+    model.add(Bidirectional(LSTM(10, return_sequences=True),
+                            input_shape=(5, 10)))
+    model.add(Bidirectional(LSTM(10)))
+    model.add(Dense(5))
+    model.add(Activation('softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+
+    # With custom backward layer
+    model = Sequential()
+    forward_layer = LSTM(10, return_sequences=True)
+    backward_layer = LSTM(10, activation='relu', return_sequences=True,
+                          go_backwards=True)
+    model.add(Bidirectional(forward_layer, backward_layer=backward_layer,
+                            input_shape=(5, 10)))
+    model.add(Dense(5))
+    model.add(Activation('softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+    ```
+    """
+
+    def __init__(
+        self,
+        layer,
+        merge_mode="concat",
+        weights=None,
+        backward_layer=None,
+        **kwargs,
+    ):
+        if not isinstance(layer, Layer):
+            raise ValueError(
+                "Please initialize `Bidirectional` layer with a "
+                f"`tf.keras.layers.Layer` instance. Received: {layer}"
+            )
+        if backward_layer is not None and not isinstance(backward_layer, Layer):
+            raise ValueError(
+                "`backward_layer` needs to be a `tf.keras.layers.Layer` "
+                f"instance. Received: {backward_layer}"
+            )
+        if merge_mode not in ["sum", "mul", "ave", "concat", None]:
+            raise ValueError(
+                f"Invalid merge mode. Received: {merge_mode}. "
+                "Merge mode should be one of "
+                '{"sum", "mul", "ave", "concat", None}'
+            )
+        # We don't want to track `layer` since we're already tracking the two
+        # copies of it we actually run.
+        self._setattr_tracking = False
+        super().__init__(layer, **kwargs)
+        self._setattr_tracking = True
+
+        # Recreate the forward layer from the original layer config, so that it
+        # will not carry over any state from the layer.
+        self.forward_layer = self._recreate_layer_from_config(layer)
+
+        if backward_layer is None:
+            self.backward_layer = self._recreate_layer_from_config(
+                layer, go_backwards=True
+            )
+        else:
+            self.backward_layer = backward_layer
+
+            # Keep the custom backward layer config, so that we can save it
+            # later. The layer's name might be updated below with prefix
+            # 'backward_', and we want to preserve the original config.
+            self._backward_layer_config = (
+                serialization_lib.serialize_keras_object(backward_layer)
+            )
+
+        self.forward_layer._name = "forward_" + self.forward_layer.name
+        self.backward_layer._name = "backward_" + self.backward_layer.name
+
+        self._verify_layer_config()
+
+        def force_zero_output_for_mask(layer):
+            # Force the zero_output_for_mask to be True if returning sequences.
+            if getattr(layer, "zero_output_for_mask", None) is not None:
+                layer.zero_output_for_mask = layer.return_sequences
+
+        force_zero_output_for_mask(self.forward_layer)
+        force_zero_output_for_mask(self.backward_layer)
+
+        self.merge_mode = merge_mode
+        if weights:
+            nw = len(weights)
+            self.forward_layer.initial_weights = weights[: nw // 2]
+            self.backward_layer.initial_weights = weights[nw // 2 :]
+        self.stateful = layer.stateful
+        self.return_sequences = layer.return_sequences
+        self.return_state = layer.return_state
+        self.supports_masking = True
+        self._trainable = kwargs.get("trainable", layer.trainable)
+        self._num_constants = 0
+        self.input_spec = layer.input_spec
+
+    @property
+    def _use_input_spec_as_call_signature(self):
+        return self.layer._use_input_spec_as_call_signature
+
+    def _verify_layer_config(self):
+        """Ensure the forward and backward layers have valid common properties."""
+        if self.forward_layer.go_backwards == self.backward_layer.go_backwards:
+            raise ValueError(
+                "Forward layer and backward layer should have different "
+                "`go_backwards` value. "
+                "forward_layer.go_backwards = "
+                f"{self.forward_layer.go_backwards}, "
+                "backward_layer.go_backwards = "
+                f"{self.backward_layer.go_backwards}"
+            )
+
+        common_attributes = ("stateful", "return_sequences", "return_state")
+        for a in common_attributes:
+            forward_value = getattr(self.forward_layer, a)
+            backward_value = getattr(self.backward_layer, a)
+            if forward_value != backward_value:
+                raise ValueError(
+                    "Forward layer and backward layer are expected to have "
+                    f'the same value for attribute "{a}", got '
+                    f'"{forward_value}" for forward layer and '
+                    f'"{backward_value}" for backward layer'
+                )
+
+    def _recreate_layer_from_config(self, layer, go_backwards=False):
+        # When recreating the layer from its config, it is possible that the
+        # layer is an RNN layer that contains custom cells. In this case we
+        # inspect the layer and pass the custom cell class as part of the
+        # `custom_objects` argument when calling `from_config`. See
+        # https://github.com/tensorflow/tensorflow/issues/26581 for more detail.
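As an aside, the pattern the comment above describes can be sketched outside the wrapper. This is a minimal illustration, assuming a hypothetical `MinimalCell` subclass that is not part of this change:

```python
from tensorflow import keras


class MinimalCell(keras.layers.SimpleRNNCell):
    """Hypothetical custom cell, used only for this illustration."""


layer = keras.layers.RNN(MinimalCell(3), go_backwards=False)
config = layer.get_config()
config["go_backwards"] = not config["go_backwards"]

# `from_config` cannot resolve `MinimalCell` on its own, so the class is
# supplied via `custom_objects`, mirroring `_recreate_layer_from_config`.
clone = keras.layers.RNN.from_config(
    config, custom_objects={"MinimalCell": MinimalCell}
)
assert clone.go_backwards
```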
+ config = layer.get_config() + if go_backwards: + config["go_backwards"] = not config["go_backwards"] + if ( + "custom_objects" + in tf_inspect.getfullargspec(layer.__class__.from_config).args + ): + custom_objects = {} + cell = getattr(layer, "cell", None) + if cell is not None: + custom_objects[cell.__class__.__name__] = cell.__class__ + # For StackedRNNCells + stacked_cells = getattr(cell, "cells", []) + for c in stacked_cells: + custom_objects[c.__class__.__name__] = c.__class__ + return layer.__class__.from_config( + config, custom_objects=custom_objects + ) + else: + return layer.__class__.from_config(config) + + @tf_utils.shape_type_conversion + def compute_output_shape(self, input_shape): + output_shape = self.forward_layer.compute_output_shape(input_shape) + if self.return_state: + state_shape = tf_utils.convert_shapes( + output_shape[1:], to_tuples=False + ) + output_shape = tf_utils.convert_shapes( + output_shape[0], to_tuples=False + ) + else: + output_shape = tf_utils.convert_shapes( + output_shape, to_tuples=False + ) + + if self.merge_mode == "concat": + output_shape = output_shape.as_list() + output_shape[-1] *= 2 + output_shape = tf.TensorShape(output_shape) + elif self.merge_mode is None: + output_shape = [output_shape, copy.copy(output_shape)] + + if self.return_state: + if self.merge_mode is None: + return output_shape + state_shape + copy.copy(state_shape) + return [output_shape] + state_shape + copy.copy(state_shape) + return output_shape + + def __call__(self, inputs, initial_state=None, constants=None, **kwargs): + """`Bidirectional.__call__` implements the same API as the wrapped + `RNN`.""" + inputs, initial_state, constants = rnn_utils.standardize_args( + inputs, initial_state, constants, self._num_constants + ) + + if isinstance(inputs, list): + if len(inputs) > 1: + initial_state = inputs[1:] + inputs = inputs[0] + + if initial_state is None and constants is None: + return super().__call__(inputs, **kwargs) + + # Applies the same workaround as in `RNN.__call__` + additional_inputs = [] + additional_specs = [] + if initial_state is not None: + # Check if `initial_state` can be split into half + num_states = len(initial_state) + if num_states % 2 > 0: + raise ValueError( + "When passing `initial_state` to a Bidirectional RNN, " + "the state should be a list containing the states of " + "the underlying RNNs. 
" + f"Received: {initial_state}" + ) + + kwargs["initial_state"] = initial_state + additional_inputs += initial_state + state_specs = tf.nest.map_structure( + lambda state: InputSpec(shape=backend.int_shape(state)), + initial_state, + ) + self.forward_layer.state_spec = state_specs[: num_states // 2] + self.backward_layer.state_spec = state_specs[num_states // 2 :] + additional_specs += state_specs + if constants is not None: + kwargs["constants"] = constants + additional_inputs += constants + constants_spec = [ + InputSpec(shape=backend.int_shape(constant)) + for constant in constants + ] + self.forward_layer.constants_spec = constants_spec + self.backward_layer.constants_spec = constants_spec + additional_specs += constants_spec + + self._num_constants = len(constants) + self.forward_layer._num_constants = self._num_constants + self.backward_layer._num_constants = self._num_constants + + is_keras_tensor = backend.is_keras_tensor( + tf.nest.flatten(additional_inputs)[0] + ) + for tensor in tf.nest.flatten(additional_inputs): + if backend.is_keras_tensor(tensor) != is_keras_tensor: + raise ValueError( + "The initial state of a Bidirectional" + " layer cannot be specified with a mix of" + " Keras tensors and non-Keras tensors" + ' (a "Keras tensor" is a tensor that was' + " returned by a Keras layer, or by `Input`)" + ) + + if is_keras_tensor: + # Compute the full input spec, including state + full_input = [inputs] + additional_inputs + # The original input_spec is None since there could be a nested + # tensor input. Update the input_spec to match the inputs. + full_input_spec = [ + None for _ in range(len(tf.nest.flatten(inputs))) + ] + additional_specs + # Removing kwargs since the value are passed with input list. + kwargs["initial_state"] = None + kwargs["constants"] = None + + # Perform the call with temporarily replaced input_spec + original_input_spec = self.input_spec + self.input_spec = full_input_spec + output = super().__call__(full_input, **kwargs) + self.input_spec = original_input_spec + return output + else: + return super().__call__(inputs, **kwargs) + + def call( + self, + inputs, + training=None, + mask=None, + initial_state=None, + constants=None, + ): + """`Bidirectional.call` implements the same API as the wrapped `RNN`.""" + kwargs = {} + if generic_utils.has_arg(self.layer.call, "training"): + kwargs["training"] = training + if generic_utils.has_arg(self.layer.call, "mask"): + kwargs["mask"] = mask + if generic_utils.has_arg(self.layer.call, "constants"): + kwargs["constants"] = constants + + if generic_utils.has_arg(self.layer.call, "initial_state"): + if isinstance(inputs, list) and len(inputs) > 1: + # initial_states are keras tensors, which means they are passed + # in together with inputs as list. The initial_states need to be + # split into forward and backward section, and be feed to layers + # accordingly. 
+ forward_inputs = [inputs[0]] + backward_inputs = [inputs[0]] + pivot = (len(inputs) - self._num_constants) // 2 + 1 + # add forward initial state + forward_inputs += inputs[1:pivot] + if not self._num_constants: + # add backward initial state + backward_inputs += inputs[pivot:] + else: + # add backward initial state + backward_inputs += inputs[pivot : -self._num_constants] + # add constants for forward and backward layers + forward_inputs += inputs[-self._num_constants :] + backward_inputs += inputs[-self._num_constants :] + forward_state, backward_state = None, None + if "constants" in kwargs: + kwargs["constants"] = None + elif initial_state is not None: + # initial_states are not keras tensors, eg eager tensor from np + # array. They are only passed in from kwarg initial_state, and + # should be passed to forward/backward layer via kwarg + # initial_state as well. + forward_inputs, backward_inputs = inputs, inputs + half = len(initial_state) // 2 + forward_state = initial_state[:half] + backward_state = initial_state[half:] + else: + forward_inputs, backward_inputs = inputs, inputs + forward_state, backward_state = None, None + + y = self.forward_layer( + forward_inputs, initial_state=forward_state, **kwargs + ) + y_rev = self.backward_layer( + backward_inputs, initial_state=backward_state, **kwargs + ) + else: + y = self.forward_layer(inputs, **kwargs) + y_rev = self.backward_layer(inputs, **kwargs) + + if self.return_state: + states = y[1:] + y_rev[1:] + y = y[0] + y_rev = y_rev[0] + + if self.return_sequences: + time_dim = ( + 0 if getattr(self.forward_layer, "time_major", False) else 1 + ) + y_rev = backend.reverse(y_rev, time_dim) + if self.merge_mode == "concat": + output = backend.concatenate([y, y_rev]) + elif self.merge_mode == "sum": + output = y + y_rev + elif self.merge_mode == "ave": + output = (y + y_rev) / 2 + elif self.merge_mode == "mul": + output = y * y_rev + elif self.merge_mode is None: + output = [y, y_rev] + else: + raise ValueError( + "Unrecognized value for `merge_mode`. " + f"Received: {self.merge_mode}" + 'Expected values are ["concat", "sum", "ave", "mul"]' + ) + + if self.return_state: + if self.merge_mode is None: + return output + states + return [output] + states + return output + + def reset_states(self, states=None): + if not self.stateful: + raise AttributeError("Layer must be stateful.") + + if states is None: + self.forward_layer.reset_states() + self.backward_layer.reset_states() + else: + if not isinstance(states, (list, tuple)): + raise ValueError( + "Unrecognized value for `states`. " + "Expected `states` to be list or tuple. 
" + f"Received: {states}" + ) + + half = len(states) // 2 + self.forward_layer.reset_states(states[:half]) + self.backward_layer.reset_states(states[half:]) + + def build(self, input_shape): + with backend.name_scope(self.forward_layer.name): + self.forward_layer.build(input_shape) + with backend.name_scope(self.backward_layer.name): + self.backward_layer.build(input_shape) + self.built = True + + def compute_mask(self, inputs, mask): + if isinstance(mask, list): + mask = mask[0] + if self.return_sequences: + if not self.merge_mode: + output_mask = [mask, mask] + else: + output_mask = mask else: - # add backward initial state - backward_inputs += inputs[pivot:-self._num_constants] - # add constants for forward and backward layers - forward_inputs += inputs[-self._num_constants:] - backward_inputs += inputs[-self._num_constants:] - forward_state, backward_state = None, None - if 'constants' in kwargs: - kwargs['constants'] = None - elif initial_state is not None: - # initial_states are not keras tensors, eg eager tensor from np array. - # They are only passed in from kwarg initial_state, and should be passed - # to forward/backward layer via kwarg initial_state as well. - forward_inputs, backward_inputs = inputs, inputs - half = len(initial_state) // 2 - forward_state = initial_state[:half] - backward_state = initial_state[half:] - else: - forward_inputs, backward_inputs = inputs, inputs - forward_state, backward_state = None, None - - y = self.forward_layer(forward_inputs, - initial_state=forward_state, **kwargs) - y_rev = self.backward_layer(backward_inputs, - initial_state=backward_state, **kwargs) - else: - y = self.forward_layer(inputs, **kwargs) - y_rev = self.backward_layer(inputs, **kwargs) - - if self.return_state: - states = y[1:] + y_rev[1:] - y = y[0] - y_rev = y_rev[0] - - if self.return_sequences: - time_dim = 0 if getattr(self.forward_layer, 'time_major', False) else 1 - y_rev = backend.reverse(y_rev, time_dim) - if self.merge_mode == 'concat': - output = backend.concatenate([y, y_rev]) - elif self.merge_mode == 'sum': - output = y + y_rev - elif self.merge_mode == 'ave': - output = (y + y_rev) / 2 - elif self.merge_mode == 'mul': - output = y * y_rev - elif self.merge_mode is None: - output = [y, y_rev] - else: - raise ValueError( - f'Unrecognized value for `merge_mode`. 
Received: {self.merge_mode}' - 'Expected values are ["concat", "sum", "ave", "mul"]') - - if self.return_state: - if self.merge_mode is None: - return output + states - return [output] + states - return output - - def reset_states(self): - self.forward_layer.reset_states() - self.backward_layer.reset_states() - - def build(self, input_shape): - with backend.name_scope(self.forward_layer.name): - self.forward_layer.build(input_shape) - with backend.name_scope(self.backward_layer.name): - self.backward_layer.build(input_shape) - self.built = True - - def compute_mask(self, inputs, mask): - if isinstance(mask, list): - mask = mask[0] - if self.return_sequences: - if not self.merge_mode: - output_mask = [mask, mask] - else: - output_mask = mask - else: - output_mask = [None, None] if not self.merge_mode else None - - if self.return_state: - states = self.forward_layer.states - state_mask = [None for _ in states] - if isinstance(output_mask, list): - return output_mask + state_mask * 2 - return [output_mask] + state_mask * 2 - return output_mask - - @property - def constraints(self): - constraints = {} - if hasattr(self.forward_layer, 'constraints'): - constraints.update(self.forward_layer.constraints) - constraints.update(self.backward_layer.constraints) - return constraints - - def get_config(self): - config = {'merge_mode': self.merge_mode} - if self._num_constants: - config['num_constants'] = self._num_constants - - if hasattr(self, '_backward_layer_config'): - config['backward_layer'] = self._backward_layer_config - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - # Instead of updating the input, create a copy and use that. - config = copy.deepcopy(config) - num_constants = config.pop('num_constants', 0) - # Handle forward layer instantiation (as would parent class). - from keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top - config['layer'] = deserialize_layer( - config['layer'], custom_objects=custom_objects) - # Handle (optional) backward layer instantiation. - backward_layer_config = config.pop('backward_layer', None) - if backward_layer_config is not None: - backward_layer = deserialize_layer( - backward_layer_config, custom_objects=custom_objects) - config['backward_layer'] = backward_layer - # Instantiate the wrapper, adjust it and return it. 
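For reference, the `get_config`/`from_config` pair shown here supports a plain round trip through the public API. A minimal sketch (names and sizes are arbitrary):

```python
from tensorflow import keras

# Round-trip a Bidirectional layer through its config, as `get_config`
# and `from_config` are designed to support.
layer = keras.layers.Bidirectional(keras.layers.LSTM(4), merge_mode="sum")
config = layer.get_config()
clone = keras.layers.Bidirectional.from_config(config)

assert clone.merge_mode == "sum"
assert clone.forward_layer.name.startswith("forward_")
```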
- layer = cls(**config) - layer._num_constants = num_constants # pylint: disable=protected-access - return layer + output_mask = [None, None] if not self.merge_mode else None + + if self.return_state: + states = self.forward_layer.states + state_mask = [None for _ in states] + if isinstance(output_mask, list): + return output_mask + state_mask * 2 + return [output_mask] + state_mask * 2 + return output_mask + + @property + def constraints(self): + constraints = {} + if hasattr(self.forward_layer, "constraints"): + constraints.update(self.forward_layer.constraints) + constraints.update(self.backward_layer.constraints) + return constraints + + def get_config(self): + config = {"merge_mode": self.merge_mode} + if self._num_constants: + config["num_constants"] = self._num_constants + + if hasattr(self, "_backward_layer_config"): + config["backward_layer"] = self._backward_layer_config + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + # Instead of updating the input, create a copy and use that. + config = copy.deepcopy(config) + num_constants = config.pop("num_constants", 0) + # Handle forward layer instantiation (as would parent class). + from keras.layers import deserialize as deserialize_layer + + config["layer"] = deserialize_layer( + config["layer"], custom_objects=custom_objects + ) + # Handle (optional) backward layer instantiation. + backward_layer_config = config.pop("backward_layer", None) + if backward_layer_config is not None: + backward_layer = deserialize_layer( + backward_layer_config, custom_objects=custom_objects + ) + config["backward_layer"] = backward_layer + # Instantiate the wrapper, adjust it and return it. + layer = cls(**config) + layer._num_constants = num_constants + return layer diff --git a/keras/layers/rnn/bidirectional_test.py b/keras/layers/rnn/bidirectional_test.py index 29df473f3fe2..cc97f2c1b91f 100644 --- a/keras/layers/rnn/bidirectional_test.py +++ b/keras/layers/rnn/bidirectional_test.py @@ -17,922 +17,1117 @@ import copy +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.engine import base_layer_utils from keras.layers import core from keras.layers.rnn.cell_wrappers import ResidualWrapper from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.utils import generic_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_util -from tensorflow.python.training.tracking import util as trackable_util +# isort: off +from tensorflow.python.checkpoint import ( + checkpoint as trackable_util, +) +from tensorflow.python.framework import ( + test_util as tf_test_util, +) class _RNNCellWithConstants(keras.layers.Layer): - - def __init__(self, units, constant_size, **kwargs): - self.units = units - self.state_size = units - self.constant_size = constant_size - super().__init__(**kwargs) - - def build(self, input_shape): - self.input_kernel = self.add_weight( - shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.constant_kernel = self.add_weight( - shape=(self.constant_size, self.units), - initializer='uniform', - name='constant_kernel') - self.built = True - - def call(self, inputs, states, constants): - 
[prev_output] = states - [constant] = constants - h_input = keras.backend.dot(inputs, self.input_kernel) - h_state = keras.backend.dot(prev_output, self.recurrent_kernel) - h_const = keras.backend.dot(constant, self.constant_kernel) - output = h_input + h_state + h_const - return output, [output] - - def get_config(self): - config = {'units': self.units, 'constant_size': self.constant_size} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def __init__(self, units, constant_size, **kwargs): + self.units = units + self.state_size = units + self.constant_size = constant_size + super().__init__(**kwargs) + + def build(self, input_shape): + self.input_kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer="uniform", + name="kernel", + ) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer="uniform", + name="recurrent_kernel", + ) + self.constant_kernel = self.add_weight( + shape=(self.constant_size, self.units), + initializer="uniform", + name="constant_kernel", + ) + self.built = True + + def call(self, inputs, states, constants): + [prev_output] = states + [constant] = constants + h_input = keras.backend.dot(inputs, self.input_kernel) + h_state = keras.backend.dot(prev_output, self.recurrent_kernel) + h_const = keras.backend.dot(constant, self.constant_kernel) + output = h_input + h_state + h_const + return output, [output] + + def get_config(self): + config = {"units": self.units, "constant_size": self.constant_size} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) class _ResidualLSTMCell(keras.layers.LSTMCell): - - def call(self, inputs, states, training=None): - output, states = super().call(inputs, states) - return output + inputs, states + def call(self, inputs, states, training=None): + output, states = super().call(inputs, states) + return output + inputs, states class _AddOneCell(keras.layers.AbstractRNNCell): - """Increments inputs and state by one on each call.""" + """Increments inputs and state by one on each call.""" - @property - def state_size(self): - return 1 + @property + def state_size(self): + return 1 - @property - def output_size(self): - return 1 + @property + def output_size(self): + return 1 - def call(self, inputs, state): - inputs = tf.reduce_mean(inputs, axis=1, keepdims=True) - outputs = inputs + 1.0 - state = tf.nest.map_structure(lambda t: t + 1.0, state) - return outputs, state + def call(self, inputs, state): + inputs = tf.reduce_mean(inputs, axis=1, keepdims=True) + outputs = inputs + 1.0 + state = tf.nest.map_structure(lambda t: t + 1.0, state) + return outputs, state -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class BidirectionalTest(tf.test.TestCase, parameterized.TestCase): + @parameterized.parameters(["sum", "concat", "ave", "mul"]) + def test_bidirectional(self, mode): + rnn = keras.layers.SimpleRNN + samples = 2 + dim = 2 + timesteps = 2 + output_dim = 2 + with self.cached_session(): + x = np.random.random((samples, timesteps, dim)) + target_dim = 2 * output_dim if mode == "concat" else output_dim + y = np.random.random((samples, target_dim)) + + # test with Sequential model + model = keras.models.Sequential() + model.add( + keras.layers.Bidirectional( + rnn(output_dim), + merge_mode=mode, + input_shape=(timesteps, dim), + ) + ) + model.compile(optimizer="rmsprop", 
loss="mse") + model.fit(x, y, epochs=1, batch_size=1) + + # check whether the model variables are present in the + # trackable list of objects + checkpointed_object_ids = { + id(o) for o in trackable_util.list_objects(model) + } + for v in model.variables: + self.assertIn(id(v), checkpointed_object_ids) + + # test compute output shape + ref_shape = model.layers[-1].output.shape + shape = model.layers[-1].compute_output_shape( + (None, timesteps, dim) + ) + self.assertListEqual(shape.as_list(), ref_shape.as_list()) + + # test config + model.get_config() + model = keras.models.model_from_json(model.to_json()) + model.summary() + + def test_bidirectional_invalid_init(self): + x = tf.constant(np.zeros((1, 1)).astype("float32")) + with self.assertRaisesRegex( + ValueError, + "Please initialize `Bidirectional` layer with a " + "`tf.keras.layers.Layer` instance.", + ): + keras.layers.Bidirectional(x) + + def test_bidirectional_weight_loading(self): + rnn = keras.layers.SimpleRNN + samples = 2 + dim = 2 + timesteps = 2 + output_dim = 2 + with self.cached_session(): + x = np.random.random((samples, timesteps, dim)) + model = keras.models.Sequential() + model.add( + keras.layers.Bidirectional( + rnn(output_dim), input_shape=(timesteps, dim) + ) + ) + y_ref = model.predict(x) + weights = model.layers[-1].get_weights() + model.layers[-1].set_weights(weights) + y = model.predict(x) + self.assertAllClose(y, y_ref) + + def test_bidirectional_stacked(self): + # test stacked bidirectional layers + rnn = keras.layers.SimpleRNN + samples = 2 + dim = 2 + timesteps = 2 + output_dim = 2 + mode = "sum" + + with self.cached_session(): + x = np.random.random((samples, timesteps, dim)) + target_dim = 2 * output_dim if mode == "concat" else output_dim + y = np.random.random((samples, target_dim)) + + model = keras.models.Sequential() + model.add( + keras.layers.Bidirectional( + rnn(output_dim, return_sequences=True), + merge_mode=mode, + input_shape=(timesteps, dim), + ) + ) + model.add( + keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode) + ) + model.compile(loss="mse", optimizer="sgd") + model.fit(x, y, epochs=1, batch_size=1) + + # test with functional API + inputs = keras.layers.Input((timesteps, dim)) + output = keras.layers.Bidirectional( + rnn(output_dim), merge_mode=mode + )(inputs) + model = keras.models.Model(inputs, output) + model.compile(loss="mse", optimizer="sgd") + model.fit(x, y, epochs=1, batch_size=1) + + def test_bidirectional_statefulness(self): + # Bidirectional and stateful + def run_test(): + rnn = keras.layers.SimpleRNN + samples = 2 + dim = 2 + timesteps = 2 + output_dim = 2 + mode = "sum" + + with self.cached_session(): + x = np.random.random((samples, timesteps, dim)) + target_dim = 2 * output_dim if mode == "concat" else output_dim + y = np.random.random((samples, target_dim)) + + inputs = keras.layers.Input(batch_shape=(1, timesteps, dim)) + bidi_rnn = keras.layers.Bidirectional( + rnn(output_dim, stateful=True), merge_mode=mode + ) + self.assertTrue(bidi_rnn.stateful) + output = bidi_rnn(inputs) + model = keras.models.Model(inputs, output) + + y_1 = model.predict(x, batch_size=1) + model.reset_states() + y_2 = model.predict(x, batch_size=1) + + self.assertAllClose(y_1, y_2) + + model.compile(loss="mse", optimizer="sgd") + model.fit(x, y, epochs=1, batch_size=1) + + if tf.executing_eagerly(): + run_test() + else: + tf_test_util.enable_output_all_intermediates(run_test)() + + @parameterized.parameters(["sum", "mul", "ave", "concat", None]) + def 
test_Bidirectional_merged_value(self, merge_mode):
+        rnn = keras.layers.LSTM
+        samples = 2
+        dim = 5
+        timesteps = 3
+        units = 3
+        x = [np.random.rand(samples, timesteps, dim)]
+
+        with self.cached_session():
+            if merge_mode == "sum":
+                merge_func = lambda y, y_rev: y + y_rev
+            elif merge_mode == "mul":
+                merge_func = lambda y, y_rev: y * y_rev
+            elif merge_mode == "ave":
+                merge_func = lambda y, y_rev: (y + y_rev) / 2
+            elif merge_mode == "concat":
+                merge_func = lambda y, y_rev: np.concatenate(
+                    (y, y_rev), axis=-1
+                )
+            else:
+                merge_func = lambda y, y_rev: [y, y_rev]
+
+            # basic case
+            inputs = keras.Input((timesteps, dim))
+            layer = keras.layers.Bidirectional(
+                rnn(units, return_sequences=True), merge_mode=merge_mode
+            )
+            f_merged = keras.backend.function([inputs], _to_list(layer(inputs)))
+            f_forward = keras.backend.function(
+                [inputs], [layer.forward_layer(inputs)]
+            )
+            f_backward = keras.backend.function(
+                [inputs],
+                [keras.backend.reverse(layer.backward_layer(inputs), 1)],
+            )
+
+            y_merged = f_merged(x)
+            y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0]))
+            assert len(y_merged) == len(y_expected)
+            for x1, x2 in zip(y_merged, y_expected):
+                self.assertAllClose(x1, x2, atol=1e-5)
+
+            # test return_state
+            inputs = keras.Input((timesteps, dim))
+            layer = keras.layers.Bidirectional(
+                rnn(units, return_state=True), merge_mode=merge_mode
+            )
+            f_merged = keras.backend.function([inputs], layer(inputs))
+            f_forward = keras.backend.function(
+                [inputs], layer.forward_layer(inputs)
+            )
+            f_backward = keras.backend.function(
+                [inputs], layer.backward_layer(inputs)
+            )
+            n_states = len(layer.layer.states)
+
+            y_merged = f_merged(x)
+            y_forward = f_forward(x)
+            y_backward = f_backward(x)
+            y_expected = _to_list(merge_func(y_forward[0], y_backward[0]))
+            assert len(y_merged) == len(y_expected) + n_states * 2
+            for x1, x2 in zip(y_merged, y_expected):
+                self.assertAllClose(x1, x2, atol=1e-5)
+
+            y_merged = y_merged[-n_states * 2 :]
+            y_forward = y_forward[-n_states:]
+            y_backward = y_backward[-n_states:]
+            for state_birnn, state_inner in zip(
+                y_merged, y_forward + y_backward
+            ):
+                self.assertAllClose(state_birnn, state_inner, atol=1e-5)
+
+    @parameterized.parameters([True, False])
+    def test_Bidirectional_with_time_major_input(self, time_major):
+        batch_size, time, input_dim = 2, 3, 1
+        inputs = tf.zeros((batch_size, time, input_dim))
+        # length is [1 2]. Within the batch, the first element has 1 step, and
+        # the second element has 2 steps.
+        lengths = tf.range(1, 1 + batch_size)
+        mask = tf.sequence_mask(lengths, maxlen=time, dtype=tf.float32)
+
+        forward_cell = _AddOneCell(name="forward")
+        backward_cell = _AddOneCell(name="backward")
+
+        layer = keras.layers.Bidirectional(
+            layer=keras.layers.RNN(
+                forward_cell, time_major=time_major, return_sequences=True
+            ),
+            backward_layer=keras.layers.RNN(
+                backward_cell,
+                time_major=time_major,
+                return_sequences=True,
+                go_backwards=True,
+            ),
+        )
+
+        # Switch to time-major.
+        if time_major:
+            inputs = tf.transpose(inputs, [1, 0, 2])
+            mask = tf.transpose(mask, [1, 0])
+
+        keras_outputs = layer(inputs, mask=mask)
+        if time_major:
+            keras_outputs = tf.transpose(keras_outputs, [1, 0, 2])
+
+        # expect the first element in the batch to have 1 step and the second
+        # element to have 2 steps.
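The `tf.sequence_mask` call above turns the per-sequence lengths into a step mask. A standalone sketch of the same construction, assuming eager execution:

```python
import tensorflow as tf

# lengths [1, 2] mark one valid step for the first sequence and two for
# the second, matching the comment above.
lengths = tf.range(1, 3)
mask = tf.sequence_mask(lengths, maxlen=3, dtype=tf.float32)
print(mask.numpy())
# [[1. 0. 0.]
#  [1. 1. 0.]]
```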
+ expected_result = np.array( + [ + [[1.0, 1.0], [0.0, 0.0], [0.0, 0.0]], + [[1.0, 1.0], [1.0, 1.0], [0.0, 0.0]], + ] + ) + self.assertAllClose(expected_result, keras_outputs) + + def test_Bidirectional_dropout(self): + rnn = keras.layers.LSTM + samples = 2 + dim = 5 + timesteps = 3 + units = 3 + merge_mode = "sum" + x = [np.random.rand(samples, timesteps, dim)] + + with self.cached_session(): + inputs = keras.Input((timesteps, dim)) + wrapped = keras.layers.Bidirectional( + rnn(units, dropout=0.2, recurrent_dropout=0.2), + merge_mode=merge_mode, + ) + outputs = _to_list(wrapped(inputs, training=True)) + + inputs = keras.Input((timesteps, dim)) + wrapped = keras.layers.Bidirectional( + rnn(units, dropout=0.2, return_state=True), + merge_mode=merge_mode, + ) + outputs = _to_list(wrapped(inputs)) + + model = keras.Model(inputs, outputs) + y1 = _to_list(model.predict(x)) + y2 = _to_list(model.predict(x)) + for x1, x2 in zip(y1, y2): + self.assertAllClose(x1, x2, atol=1e-5) + + def test_Bidirectional_state_reuse(self): + rnn = keras.layers.LSTM + samples = 2 + dim = 5 + timesteps = 3 + units = 3 + + with self.cached_session(): + input1 = keras.layers.Input((timesteps, dim)) + layer = keras.layers.Bidirectional( + rnn(units, return_state=True, return_sequences=True) + ) + state = layer(input1)[1:] + + # test passing invalid initial_state: passing a tensor + input2 = keras.layers.Input((timesteps, dim)) + with self.assertRaises(ValueError): + keras.layers.Bidirectional(rnn(units))( + input2, initial_state=state[0] + ) + + # test valid usage: passing a list + output = keras.layers.Bidirectional(rnn(units))( + input2, initial_state=state + ) + model = keras.models.Model([input1, input2], output) + assert len(model.layers) == 4 + assert isinstance(model.layers[-1].input, list) + inputs = [ + np.random.rand(samples, timesteps, dim), + np.random.rand(samples, timesteps, dim), + ] + model.predict(inputs) + + def test_Bidirectional_state_reuse_with_np_input(self): + # See https://github.com/tensorflow/tensorflow/issues/28761 for more + # detail. + rnn = keras.layers.LSTM + samples = 2 + dim = 5 + timesteps = 3 + units = 3 + + with self.cached_session(): + input1 = np.random.rand(samples, timesteps, dim).astype(np.float32) + layer = keras.layers.Bidirectional( + rnn(units, return_state=True, return_sequences=True) + ) + state = layer(input1)[1:] + + input2 = np.random.rand(samples, timesteps, dim).astype(np.float32) + keras.layers.Bidirectional(rnn(units))(input2, initial_state=state) + + def test_Bidirectional_trainable(self): + # test layers that need learning_phase to be set + with self.cached_session(): + x = keras.layers.Input(shape=(3, 2)) + layer = keras.layers.Bidirectional(keras.layers.SimpleRNN(3)) + _ = layer(x) + assert len(layer.trainable_weights) == 6 + layer.trainable = False + assert not layer.trainable_weights + layer.trainable = True + assert len(layer.trainable_weights) == 6 + + def test_Bidirectional_updates(self): + if tf.executing_eagerly(): + self.skipTest("layer.updates is only available in graph mode.") + + with self.cached_session(): + x = keras.layers.Input(shape=(3, 2)) + x_reachable_update = x * x + layer = keras.layers.Bidirectional(keras.layers.SimpleRNN(3)) + _ = layer(x) + assert not layer.updates + # TODO(b/128684069): Remove when Wrapper sublayers are __call__'d. 
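The state-reuse pattern exercised by `test_Bidirectional_state_reuse` above can be shown in isolation. A minimal sketch with arbitrary shapes: the first half of the returned state list seeds the forward RNN, the second half the backward one.

```python
import keras

inp1 = keras.Input((3, 5))
inp2 = keras.Input((3, 5))

layer = keras.layers.Bidirectional(
    keras.layers.LSTM(3, return_state=True, return_sequences=True)
)
# Output is [sequences, fwd_h, fwd_c, bwd_h, bwd_c]; keep the 4 states.
states = layer(inp1)[1:]

# The first half ([fwd_h, fwd_c]) seeds the forward LSTM, the second
# half the backward one.
output = keras.layers.Bidirectional(keras.layers.LSTM(3))(
    inp2, initial_state=states
)
model = keras.Model([inp1, inp2], output)
```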
+ with base_layer_utils.call_context().enter(layer, x, True, None): + layer.forward_layer.add_update(x_reachable_update) + layer.forward_layer.add_update(1) + layer.backward_layer.add_update(x_reachable_update) + layer.backward_layer.add_update(1) + assert len(layer.updates) == 4 + + def test_Bidirectional_losses(self): + x = keras.layers.Input(shape=(3, 2)) + layer = keras.layers.Bidirectional( + keras.layers.SimpleRNN( + 3, + kernel_regularizer="l1", + bias_regularizer="l1", + activity_regularizer="l1", + ) + ) + _ = layer(x) + assert len(layer.losses) == 6 + + loss = x * x + layer.forward_layer.add_loss(loss) + layer.backward_layer.add_loss(loss) + assert len(layer.losses) == 8 + + def test_Bidirectional_with_constants(self): + with self.cached_session(): + # Test basic case. + x = keras.Input((5, 5)) + c = keras.Input((3,)) + cell = _RNNCellWithConstants(32, 3) + custom_objects = {"_RNNCellWithConstants": _RNNCellWithConstants} + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.Bidirectional(keras.layers.RNN(cell)) + y = layer(x, constants=c) + model = keras.Model([x, c], y) + model.compile(optimizer="rmsprop", loss="mse") + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 3))], np.zeros((6, 64)) + ) + + # Test basic case serialization. + x_np = np.random.random((6, 5, 5)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.Bidirectional.from_config( + copy.deepcopy(config) + ) + y = layer(x, constants=c) + model = keras.Model([x, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, c_np]) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + # Test flat list inputs + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.Bidirectional.from_config( + copy.deepcopy(config) + ) + y = layer([x, c]) + model = keras.Model([x, c], y) + model.set_weights(weights) + y_np_3 = model.predict([x_np, c_np]) + self.assertAllClose(y_np, y_np_3, atol=1e-4) + + def test_Bidirectional_with_constants_layer_passing_initial_state(self): + with self.cached_session(): + # Test basic case. + x = keras.Input((5, 5)) + c = keras.Input((3,)) + s_for = keras.Input((32,)) + s_bac = keras.Input((32,)) + cell = _RNNCellWithConstants(32, 3) + custom_objects = {"_RNNCellWithConstants": _RNNCellWithConstants} + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.Bidirectional(keras.layers.RNN(cell)) + y = layer(x, initial_state=[s_for, s_bac], constants=c) + model = keras.Model([x, s_for, s_bac, c], y) + model.compile(optimizer="rmsprop", loss="mse") + model.train_on_batch( + [ + np.zeros((6, 5, 5)), + np.zeros((6, 32)), + np.zeros((6, 32)), + np.zeros((6, 3)), + ], + np.zeros((6, 64)), + ) + + # Test basic case serialization. 
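For context, the `constants` kwarg exercised in this test routes an extra tensor unchanged into every step of both directions. A minimal functional-API sketch, reusing the `_RNNCellWithConstants` helper defined earlier in this file:

```python
import numpy as np
import keras

# Assumes the `_RNNCellWithConstants` cell defined at the top of this file.
x = keras.Input((5, 5))
c = keras.Input((3,))
layer = keras.layers.Bidirectional(
    keras.layers.RNN(_RNNCellWithConstants(32, 3))
)
y = layer(x, constants=c)  # `c` is fed to both forward and backward RNNs

model = keras.Model([x, c], y)
out = model.predict([np.zeros((6, 5, 5)), np.zeros((6, 3))])
print(out.shape)  # (6, 64) with the default "concat" merge
```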
+ x_np = np.random.random((6, 5, 5)) + s_fw_np = np.random.random((6, 32)) + s_bk_np = np.random.random((6, 32)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, s_fw_np, s_bk_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.Bidirectional.from_config( + copy.deepcopy(config) + ) + y = layer(x, initial_state=[s_for, s_bac], constants=c) + model = keras.Model([x, s_for, s_bac, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, s_fw_np, s_bk_np, c_np]) + self.assertAllClose(y_np, y_np_2, atol=1e-4) + + # Verify that state is used + y_np_2_different_s = model.predict( + [x_np, s_fw_np + 10.0, s_bk_np + 10.0, c_np] + ) + assert np.mean(y_np - y_np_2_different_s) != 0 + + # Test flat list inputs + with keras.utils.CustomObjectScope(custom_objects): + layer = keras.layers.Bidirectional.from_config( + copy.deepcopy(config) + ) + y = layer([x, s_for, s_bac, c]) + model = keras.Model([x, s_for, s_bac, c], y) + model.set_weights(weights) + y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np]) + self.assertAllClose(y_np, y_np_3, atol=1e-4) + + @parameterized.parameters([keras.layers.LSTM, keras.layers.GRU]) + def test_Bidirectional_output_shape(self, rnn): + input_shape = [None, 2, 1] + num_state = 4 if rnn == keras.layers.LSTM else 2 + + wrapper = keras.layers.Bidirectional(rnn(3)) + output_shape = wrapper.compute_output_shape(input_shape) + self.assertEqual(output_shape.as_list(), [None, 6]) + + wrapper = keras.layers.Bidirectional(rnn(3, return_state=True)) + output_shape = wrapper.compute_output_shape(input_shape) + # 1 for output and the rest for forward and backward states + self.assertLen(output_shape, 1 + num_state) + self.assertEqual(output_shape[0].as_list(), [None, 6]) + for shape in output_shape[1:]: + self.assertEqual(shape.as_list(), [None, 3]) + + wrapper = keras.layers.Bidirectional( + rnn(3, return_state=True), merge_mode=None + ) + output_shape = wrapper.compute_output_shape(input_shape) + # 1 for forward output and 1 for backward output, and the rest for + # states + self.assertLen(output_shape, 2 + num_state) + for shape in output_shape: + self.assertEqual(shape.as_list(), [None, 3]) + + def test_Bidirectional_output_shape_return_types(self): + class TestLayer(keras.layers.SimpleRNN): + def call(self, inputs): + return tf.concat([inputs, inputs], axis=-1) + + def compute_output_shape(self, input_shape): + output_shape = tf.TensorShape(input_shape).as_list() + output_shape[-1] = output_shape[-1] * 2 + return tf.TensorShape(output_shape) + + class TestListLayer(TestLayer): + def compute_output_shape(self, input_shape): + shape = super().compute_output_shape(input_shape) + return shape.as_list() + + class TestTupleLayer(TestLayer): + def compute_output_shape(self, input_shape): + shape = super().compute_output_shape(input_shape) + return tuple(shape.as_list()) + + # Layers can specify output shape as list/tuple/TensorShape + test_layers = [TestLayer, TestListLayer, TestTupleLayer] + for layer in test_layers: + input_layer = keras.layers.Bidirectional(layer(1)) + inputs = keras.backend.placeholder(shape=(None, 2, 4)) + output = input_layer(inputs) + self.assertEqual(output.shape.as_list(), [None, 2, 16]) + self.assertEqual( + input_layer.compute_output_shape([None, 2, 4]).as_list(), + [None, 2, 16], + ) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+ ), + ) + def test_Bidirectional_last_output_with_masking(self): + rnn = keras.layers.LSTM + samples = 2 + dim = 5 + timesteps = 3 + units = 3 + merge_mode = "concat" + x = np.random.rand(samples, timesteps, dim) + # clear the first record's timestep 2. Last output should be same as + # state, not zeroed. + x[0, 2] = 0 + + with self.cached_session(): + inputs = keras.Input((timesteps, dim)) + masked_inputs = keras.layers.Masking()(inputs) + wrapped = keras.layers.Bidirectional( + rnn(units, return_state=True), merge_mode=merge_mode + ) + outputs = _to_list(wrapped(masked_inputs, training=True)) + self.assertLen(outputs, 5) + self.assertEqual(outputs[0].shape.as_list(), [None, units * 2]) + + model = keras.Model(inputs, outputs) + y = _to_list(model.predict(x)) + self.assertLen(y, 5) + self.assertAllClose(y[0], np.concatenate([y[1], y[3]], axis=1)) + + @parameterized.parameters([keras.layers.LSTM, keras.layers.GRU]) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." + ), + ) + def test_Bidirectional_sequence_output_with_masking(self, rnn): + samples = 2 + dim = 5 + timesteps = 3 + units = 3 + merge_mode = "concat" + x = np.random.rand(samples, timesteps, dim) + # clear the first record's timestep 2, and expect the output of timestep + # 2 is also 0s. + x[0, 2] = 0 + + with self.cached_session(): + inputs = keras.Input((timesteps, dim)) + masked_inputs = keras.layers.Masking()(inputs) + wrapped = keras.layers.Bidirectional( + rnn(units, return_sequences=True), merge_mode=merge_mode + ) + outputs = _to_list(wrapped(masked_inputs, training=True)) + self.assertLen(outputs, 1) + self.assertEqual( + outputs[0].shape.as_list(), [None, timesteps, units * 2] + ) + + model = keras.Model(inputs, outputs) + y = _to_list(model.predict(x)) + self.assertLen(y, 1) + self.assertAllClose(y[0][0, 2], np.zeros(units * 2)) + + @parameterized.parameters(["sum", "concat"]) + def test_custom_backward_layer(self, mode): + rnn = keras.layers.SimpleRNN + samples = 2 + dim = 2 + timesteps = 2 + output_dim = 2 - @parameterized.parameters(['sum', 'concat', 'ave', 'mul']) - def test_bidirectional(self, mode): - rnn = keras.layers.SimpleRNN - samples = 2 - dim = 2 - timesteps = 2 - output_dim = 2 - with self.cached_session(): - x = np.random.random((samples, timesteps, dim)) - target_dim = 2 * output_dim if mode == 'concat' else output_dim - y = np.random.random((samples, target_dim)) - - # test with Sequential model - model = keras.models.Sequential() - model.add( - keras.layers.Bidirectional( - rnn(output_dim), merge_mode=mode, input_shape=(timesteps, dim))) - model.compile(optimizer='rmsprop', loss='mse') - model.fit(x, y, epochs=1, batch_size=1) - - # check whether the model variables are present in the - # trackable list of objects - checkpointed_object_ids = { - id(o) for o in trackable_util.list_objects(model) - } - for v in model.variables: - self.assertIn(id(v), checkpointed_object_ids) - - # test compute output shape - ref_shape = model.layers[-1].output.shape - shape = model.layers[-1].compute_output_shape( - (None, timesteps, dim)) - self.assertListEqual(shape.as_list(), ref_shape.as_list()) - - # test config - model.get_config() - model = keras.models.model_from_json(model.to_json()) - model.summary() - - def test_bidirectional_invalid_init(self): - x = tf.constant(np.zeros((1, 1)).astype('float32')) - with self.assertRaisesRegex( - ValueError, - 'Please initialize `Bidirectional` layer with a ' - 
'`tf.keras.layers.Layer` instance.'): - keras.layers.Bidirectional(x) - - def test_bidirectional_weight_loading(self): - rnn = keras.layers.SimpleRNN - samples = 2 - dim = 2 - timesteps = 2 - output_dim = 2 - with self.cached_session(): - x = np.random.random((samples, timesteps, dim)) - model = keras.models.Sequential() - model.add( - keras.layers.Bidirectional( - rnn(output_dim), input_shape=(timesteps, dim))) - y_ref = model.predict(x) - weights = model.layers[-1].get_weights() - model.layers[-1].set_weights(weights) - y = model.predict(x) - self.assertAllClose(y, y_ref) - - def test_bidirectional_stacked(self): - # test stacked bidirectional layers - rnn = keras.layers.SimpleRNN - samples = 2 - dim = 2 - timesteps = 2 - output_dim = 2 - mode = 'sum' - - with self.cached_session(): - x = np.random.random((samples, timesteps, dim)) - target_dim = 2 * output_dim if mode == 'concat' else output_dim - y = np.random.random((samples, target_dim)) - - model = keras.models.Sequential() - model.add( - keras.layers.Bidirectional( - rnn(output_dim, return_sequences=True), - merge_mode=mode, - input_shape=(timesteps, dim))) - model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode)) - model.compile(loss='mse', optimizer='sgd') - model.fit(x, y, epochs=1, batch_size=1) - - # test with functional API - inputs = keras.layers.Input((timesteps, dim)) - output = keras.layers.Bidirectional( - rnn(output_dim), merge_mode=mode)(inputs) - model = keras.models.Model(inputs, output) - model.compile(loss='mse', optimizer='sgd') - model.fit(x, y, epochs=1, batch_size=1) - - def test_bidirectional_statefulness(self): - # Bidirectional and stateful - def run_test(): - rnn = keras.layers.SimpleRNN - samples = 2 - dim = 2 - timesteps = 2 - output_dim = 2 - mode = 'sum' - - with self.cached_session(): x = np.random.random((samples, timesteps, dim)) - target_dim = 2 * output_dim if mode == 'concat' else output_dim + target_dim = 2 * output_dim if mode == "concat" else output_dim y = np.random.random((samples, target_dim)) + forward_layer = rnn(output_dim) + backward_layer = rnn(output_dim, go_backwards=True) + + # test with Sequential model + model = keras.models.Sequential() + model.add( + keras.layers.Bidirectional( + forward_layer, + merge_mode=mode, + backward_layer=backward_layer, + input_shape=(timesteps, dim), + ) + ) + model.compile(optimizer="rmsprop", loss="mse") + model.fit(x, y, epochs=1, batch_size=1) - inputs = keras.layers.Input(batch_shape=(1, timesteps, dim)) - bidi_rnn = keras.layers.Bidirectional( - rnn(output_dim, stateful=True), merge_mode=mode) - self.assertTrue(bidi_rnn.stateful) - output = bidi_rnn(inputs) - model = keras.models.Model(inputs, output) - - y_1 = model.predict(x, batch_size=1) + # check whether the model variables are present in the + # trackable list of objects + checkpointed_object_ids = { + id(o) for o in trackable_util.list_objects(model) + } + for v in model.variables: + self.assertIn(id(v), checkpointed_object_ids) + + # test compute output shape + ref_shape = model.layers[-1].output.shape + shape = model.layers[-1].compute_output_shape((None, timesteps, dim)) + self.assertListEqual(shape.as_list(), ref_shape.as_list()) + + # test config + model.get_config() + model = keras.models.model_from_json(model.to_json()) + model.summary() + + def test_custom_backward_layer_error_check(self): + rnn = keras.layers.LSTM + units = 2 + + forward_layer = rnn(units) + backward_layer = rnn(units) + + with self.assertRaisesRegex( + ValueError, "should have different 
`go_backwards` value." + ): + keras.layers.Bidirectional( + forward_layer, + merge_mode="concat", + backward_layer=backward_layer, + ) + + for attr in ("stateful", "return_sequences", "return_state"): + kwargs = {attr: True} + backward_layer = rnn(units, go_backwards=True, **kwargs) + with self.assertRaisesRegex( + ValueError, + 'expected to have the same value for attribute "' + attr, + ): + keras.layers.Bidirectional( + forward_layer, + merge_mode="concat", + backward_layer=backward_layer, + ) + + def test_custom_backward_layer_serialization(self): + rnn = keras.layers.LSTM + units = 2 + + forward_layer = rnn(units) + backward_layer = rnn(units, go_backwards=True) + layer = keras.layers.Bidirectional( + forward_layer, merge_mode="concat", backward_layer=backward_layer + ) + config = layer.get_config() + layer_from_config = keras.layers.Bidirectional.from_config(config) + new_config = layer_from_config.get_config() + self.assertDictEqual(config, new_config) + + def test_rnn_layer_name(self): + rnn = keras.layers.LSTM + units = 2 + + layer = keras.layers.Bidirectional(rnn(units, name="rnn")) + config = layer.get_config() + + self.assertEqual(config["layer"]["config"]["name"], "rnn") + + layer_from_config = keras.layers.Bidirectional.from_config(config) + self.assertEqual(layer_from_config.forward_layer.name, "forward_rnn") + self.assertEqual(layer_from_config.backward_layer.name, "backward_rnn") + + def test_custom_backward_rnn_layer_name(self): + rnn = keras.layers.LSTM + units = 2 + + forward_layer = rnn(units) + backward_layer = rnn(units, go_backwards=True) + layer = keras.layers.Bidirectional( + forward_layer, merge_mode="concat", backward_layer=backward_layer + ) + config = layer.get_config() + + self.assertEqual(config["layer"]["config"]["name"], "lstm") + self.assertEqual(config["backward_layer"]["config"]["name"], "lstm_1") + + layer_from_config = keras.layers.Bidirectional.from_config(config) + self.assertEqual(layer_from_config.forward_layer.name, "forward_lstm") + self.assertEqual( + layer_from_config.backward_layer.name, "backward_lstm_1" + ) + + def test_rnn_with_customized_cell(self): + batch = 20 + dim = 5 + timesteps = 3 + units = 5 + merge_mode = "sum" + + cell = _ResidualLSTMCell(units) + forward_layer = keras.layers.RNN(cell) + inputs = keras.Input((timesteps, dim)) + bidirectional_rnn = keras.layers.Bidirectional( + forward_layer, merge_mode=merge_mode + ) + outputs = _to_list(bidirectional_rnn(inputs)) + + model = keras.Model(inputs, outputs) + model.compile(optimizer="rmsprop", loss="mse") + model.fit( + np.random.random((batch, timesteps, dim)), + np.random.random((batch, units)), + epochs=1, + batch_size=10, + ) + + def test_rnn_with_customized_cell_stacking(self): + batch = 20 + dim = 5 + timesteps = 3 + units = 5 + merge_mode = "sum" + + cell = [_ResidualLSTMCell(units), _ResidualLSTMCell(units)] + forward_layer = keras.layers.RNN(cell) + inputs = keras.Input((timesteps, dim)) + bidirectional_rnn = keras.layers.Bidirectional( + forward_layer, merge_mode=merge_mode + ) + outputs = _to_list(bidirectional_rnn(inputs)) + + model = keras.Model(inputs, outputs) + model.compile(optimizer="rmsprop", loss="mse") + model.fit( + np.random.random((batch, timesteps, dim)), + np.random.random((batch, units)), + epochs=1, + batch_size=10, + ) + + @test_utils.run_v2_only + def test_wrapped_rnn_cell(self): + # See https://github.com/tensorflow/tensorflow/issues/26581. 
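A quick sketch of the naming behavior that `test_rnn_layer_name` above asserts: the wrapper recreates the inner layer and prefixes the copies' names.

```python
import keras

bidi = keras.layers.Bidirectional(keras.layers.LSTM(2, name="rnn"))
print(bidi.forward_layer.name)   # forward_rnn
print(bidi.backward_layer.name)  # backward_rnn
```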
+ batch = 20 + dim = 5 + timesteps = 3 + units = 5 + merge_mode = "sum" + + cell = keras.layers.LSTMCell(units) + cell = ResidualWrapper(cell) + rnn = keras.layers.RNN(cell) + + inputs = keras.Input((timesteps, dim)) + wrapped = keras.layers.Bidirectional(rnn, merge_mode=merge_mode) + outputs = _to_list(wrapped(inputs)) + + model = keras.Model(inputs, outputs) + model.compile(optimizer="rmsprop", loss="mse") + model.fit( + np.random.random((batch, timesteps, dim)), + np.random.random((batch, units)), + epochs=1, + batch_size=10, + ) + + @parameterized.parameters(["ave", "concat", "mul"]) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm RNN does not support ragged tensors yet." + ), + ) + def test_Bidirectional_ragged_input(self, merge_mode): + np.random.seed(100) + rnn = keras.layers.LSTM + units = 3 + x = tf.ragged.constant( + [ + [[1, 1, 1], [1, 1, 1]], + [[1, 1, 1]], + [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]], + [[1, 1, 1], [1, 1, 1], [1, 1, 1]], + ], + ragged_rank=1, + ) + x = tf.cast(x, "float32") + + with self.cached_session(): + if merge_mode == "ave": + merge_func = lambda y, y_rev: (y + y_rev) / 2 + elif merge_mode == "concat": + merge_func = lambda y, y_rev: tf.concat((y, y_rev), axis=-1) + elif merge_mode == "mul": + merge_func = lambda y, y_rev: (y * y_rev) + + inputs = keras.Input( + shape=(None, 3), batch_size=4, dtype="float32", ragged=True + ) + layer = keras.layers.Bidirectional( + rnn(units, return_sequences=True), merge_mode=merge_mode + ) + f_merged = keras.backend.function([inputs], layer(inputs)) + f_forward = keras.backend.function( + [inputs], layer.forward_layer(inputs) + ) + + # TODO(kaftan): after KerasTensor refactor TF op layers should work + # with many composite tensors, and this shouldn't need to be a + # lambda layer. 
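The ragged-input test above builds its batch with `tf.ragged.constant` and `ragged_rank=1`. In isolation, assuming eager execution:

```python
import tensorflow as tf

# A batch of 2 sequences with 1 and 2 timesteps of 3 features each.
x = tf.ragged.constant(
    [[[1, 1, 1]], [[1, 1, 1], [1, 1, 1]]], ragged_rank=1
)
x = tf.cast(x, "float32")
print(x.shape)  # (2, None, 3)
```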
+ reverse_layer = core.Lambda(tf.reverse, arguments=dict(axis=[1])) + f_backward = keras.backend.function( + [inputs], reverse_layer(layer.backward_layer(inputs)) + ) + + y_merged = f_merged(x) + y_expected = merge_func( + convert_ragged_tensor_value(f_forward(x)), + convert_ragged_tensor_value(f_backward(x)), + ) + + y_merged = convert_ragged_tensor_value(y_merged) + self.assertAllClose(y_merged.flat_values, y_expected.flat_values) + + def test_Bidirectional_nested_state_reuse(self): + if not tf.executing_eagerly(): + self.skipTest("Only test eager mode.") + x = tf.random.normal([4, 8, 16]) + layer = keras.layers.Bidirectional( + keras.layers.RNN( + [keras.layers.LSTMCell(5), keras.layers.LSTMCell(5)], + return_sequences=True, + return_state=True, + ) + ) + y = layer(x) + self.assertAllClose(layer([x] + y[1:]), layer(x, initial_state=y[1:])) + + def test_full_input_spec(self): + # See https://github.com/tensorflow/tensorflow/issues/38403 + inputs = keras.layers.Input(batch_shape=(1, 1, 1)) + fw_state = keras.layers.Input(batch_shape=(1, 1)) + bw_state = keras.layers.Input(batch_shape=(1, 1)) + states = [fw_state, bw_state] + bidirectional_rnn = keras.layers.Bidirectional( + keras.layers.SimpleRNN(1, stateful=True) + ) + + rnn_output = bidirectional_rnn(inputs, initial_state=states) + model = keras.Model([inputs, fw_state, bw_state], rnn_output) + output1 = model.predict( + [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))] + ) + output2 = model.predict( + [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))] + ) model.reset_states() - y_2 = model.predict(x, batch_size=1) + output3 = model.predict( + [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))] + ) + self.assertAllClose(output1, output3) + self.assertNotAllClose(output1, output2) - self.assertAllClose(y_1, y_2) + def test_reset_states(self): + ref_state = np.random.rand(1, 3).astype(np.float32) - model.compile(loss='mse', optimizer='sgd') - model.fit(x, y, epochs=1, batch_size=1) + # build model + inp = keras.Input(batch_shape=[1, 2, 3]) - if tf.executing_eagerly(): - run_test() - else: - tf_test_util.enable_output_all_intermediates(run_test)() - - @parameterized.parameters(['sum', 'mul', 'ave', 'concat', None]) - def test_Bidirectional_merged_value(self, merge_mode): - rnn = keras.layers.LSTM - samples = 2 - dim = 5 - timesteps = 3 - units = 3 - x = [np.random.rand(samples, timesteps, dim)] - - with self.cached_session(): - if merge_mode == 'sum': - merge_func = lambda y, y_rev: y + y_rev - elif merge_mode == 'mul': - merge_func = lambda y, y_rev: y * y_rev - elif merge_mode == 'ave': - merge_func = lambda y, y_rev: (y + y_rev) / 2 - elif merge_mode == 'concat': - merge_func = lambda y, y_rev: np.concatenate((y, y_rev), axis=-1) - else: - merge_func = lambda y, y_rev: [y, y_rev] - - # basic case - inputs = keras.Input((timesteps, dim)) - layer = keras.layers.Bidirectional( - rnn(units, return_sequences=True), merge_mode=merge_mode) - f_merged = keras.backend.function([inputs], _to_list(layer(inputs))) - f_forward = keras.backend.function([inputs], - [layer.forward_layer(inputs)]) - f_backward = keras.backend.function( - [inputs], - [keras.backend.reverse(layer.backward_layer(inputs), 1)]) - - y_merged = f_merged(x) - y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0])) - assert len(y_merged) == len(y_expected) - for x1, x2 in zip(y_merged, y_expected): - self.assertAllClose(x1, x2, atol=1e-5) - - # test return_state - inputs = keras.Input((timesteps, dim)) - layer = keras.layers.Bidirectional( - rnn(units, 
return_state=True), merge_mode=merge_mode) - f_merged = keras.backend.function([inputs], layer(inputs)) - f_forward = keras.backend.function([inputs], - layer.forward_layer(inputs)) - f_backward = keras.backend.function([inputs], - layer.backward_layer(inputs)) - n_states = len(layer.layer.states) - - y_merged = f_merged(x) - y_forward = f_forward(x) - y_backward = f_backward(x) - y_expected = _to_list(merge_func(y_forward[0], y_backward[0])) - assert len(y_merged) == len(y_expected) + n_states * 2 - for x1, x2 in zip(y_merged, y_expected): - self.assertAllClose(x1, x2, atol=1e-5) - - y_merged = y_merged[-n_states * 2:] - y_forward = y_forward[-n_states:] - y_backward = y_backward[-n_states:] - for state_birnn, state_inner in zip(y_merged, y_forward + y_backward): - self.assertAllClose(state_birnn, state_inner, atol=1e-5) - - @parameterized.parameters([True, False]) - def test_Bidirectional_with_time_major_input(self, time_major): - batch_size, time, input_dim = 2, 3, 1 - inputs = tf.zeros((batch_size, time, input_dim)) - # length is [1 2]. Within the batch, the first element has 1 step, and the - # second element as 2 steps. - lengths = tf.range(1, 1 + batch_size) - mask = tf.sequence_mask(lengths, maxlen=time, dtype=tf.float32) - - forward_cell = _AddOneCell(name='forward') - backward_cell = _AddOneCell(name='backward') - - layer = keras.layers.Bidirectional( - layer=keras.layers.RNN( - forward_cell, time_major=time_major, return_sequences=True), - backward_layer=keras.layers.RNN( - backward_cell, time_major=time_major, return_sequences=True, - go_backwards=True)) - - # Switch to time-major. - if time_major: - inputs = tf.transpose(inputs, [1, 0, 2]) - mask = tf.transpose(mask, [1, 0]) - - keras_outputs = layer(inputs, mask=mask) - if time_major: - keras_outputs = tf.transpose(keras_outputs, [1, 0, 2]) - - # expect the first element in batch has 1 step and second element in batch - # has 2 steps. 
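The expected array below follows directly from this mask: `_AddOneCell` turns each valid (zero) input step into 1.0, and masked steps stay 0. A standalone sketch of the mask itself, using the same lengths:

import tensorflow as tf

# lengths [1, 2] over maxlen=3: the first batch element has one valid
# step, the second has two.
mask = tf.sequence_mask(tf.constant([1, 2]), maxlen=3, dtype=tf.float32)
# [[1., 0., 0.],
#  [1., 1., 0.]]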
- expected_result = np.array([[[1., 1.], [0., 0.], [0., 0.]], - [[1., 1.], [1., 1.], [0., 0.]]]) - self.assertAllClose(expected_result, keras_outputs) - - def test_Bidirectional_dropout(self): - rnn = keras.layers.LSTM - samples = 2 - dim = 5 - timesteps = 3 - units = 3 - merge_mode = 'sum' - x = [np.random.rand(samples, timesteps, dim)] - - with self.cached_session(): - inputs = keras.Input((timesteps, dim)) - wrapped = keras.layers.Bidirectional( - rnn(units, dropout=0.2, recurrent_dropout=0.2), merge_mode=merge_mode) - outputs = _to_list(wrapped(inputs, training=True)) - - inputs = keras.Input((timesteps, dim)) - wrapped = keras.layers.Bidirectional( - rnn(units, dropout=0.2, return_state=True), merge_mode=merge_mode) - outputs = _to_list(wrapped(inputs)) - - model = keras.Model(inputs, outputs) - y1 = _to_list(model.predict(x)) - y2 = _to_list(model.predict(x)) - for x1, x2 in zip(y1, y2): - self.assertAllClose(x1, x2, atol=1e-5) - - def test_Bidirectional_state_reuse(self): - rnn = keras.layers.LSTM - samples = 2 - dim = 5 - timesteps = 3 - units = 3 - - with self.cached_session(): - input1 = keras.layers.Input((timesteps, dim)) - layer = keras.layers.Bidirectional( - rnn(units, return_state=True, return_sequences=True)) - state = layer(input1)[1:] - - # test passing invalid initial_state: passing a tensor - input2 = keras.layers.Input((timesteps, dim)) - with self.assertRaises(ValueError): - keras.layers.Bidirectional(rnn(units))(input2, initial_state=state[0]) - - # test valid usage: passing a list - output = keras.layers.Bidirectional(rnn(units))(input2, - initial_state=state) - model = keras.models.Model([input1, input2], output) - assert len(model.layers) == 4 - assert isinstance(model.layers[-1].input, list) - inputs = [np.random.rand(samples, timesteps, dim), - np.random.rand(samples, timesteps, dim)] - model.predict(inputs) - - def test_Bidirectional_state_reuse_with_np_input(self): - # See https://github.com/tensorflow/tensorflow/issues/28761 for more detail. - rnn = keras.layers.LSTM - samples = 2 - dim = 5 - timesteps = 3 - units = 3 - - with self.cached_session(): - input1 = np.random.rand(samples, timesteps, dim).astype(np.float32) - layer = keras.layers.Bidirectional( - rnn(units, return_state=True, return_sequences=True)) - state = layer(input1)[1:] - - input2 = np.random.rand(samples, timesteps, dim).astype(np.float32) - keras.layers.Bidirectional(rnn(units))(input2, initial_state=state) - - def test_Bidirectional_trainable(self): - # test layers that need learning_phase to be set - with self.cached_session(): - x = keras.layers.Input(shape=(3, 2)) - layer = keras.layers.Bidirectional(keras.layers.SimpleRNN(3)) - _ = layer(x) - assert len(layer.trainable_weights) == 6 - layer.trainable = False - assert not layer.trainable_weights - layer.trainable = True - assert len(layer.trainable_weights) == 6 - - def test_Bidirectional_updates(self): - if tf.executing_eagerly(): - self.skipTest('layer.updates is only available in graph mode.') - - with self.cached_session(): - x = keras.layers.Input(shape=(3, 2)) - x_reachable_update = x * x - layer = keras.layers.Bidirectional(keras.layers.SimpleRNN(3)) - _ = layer(x) - assert not layer.updates - # TODO(b/128684069): Remove when Wrapper sublayers are __call__'d. 
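On the trainable assertions above: a `SimpleRNN` owns a kernel, a recurrent kernel, and a bias, and `Bidirectional` keeps independent forward and backward copies, hence six trainable weights; flipping `trainable` on the wrapper reaches both copies. A standalone restatement (sketch):

from tensorflow import keras

x = keras.Input((3, 2))
layer = keras.layers.Bidirectional(keras.layers.SimpleRNN(3))
_ = layer(x)
assert len(layer.trainable_weights) == 6  # 3 weights x 2 directions
layer.trainable = False
assert not layer.trainable_weights  # the flag propagates to both inner layers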
- with base_layer_utils.call_context().enter(layer, x, True, None): - layer.forward_layer.add_update(x_reachable_update) - layer.forward_layer.add_update(1) - layer.backward_layer.add_update(x_reachable_update) - layer.backward_layer.add_update(1) - assert len(layer.updates) == 4 - - def test_Bidirectional_losses(self): - x = keras.layers.Input(shape=(3, 2)) - layer = keras.layers.Bidirectional( - keras.layers.SimpleRNN( - 3, - kernel_regularizer='l1', - bias_regularizer='l1', - activity_regularizer='l1')) - _ = layer(x) - assert len(layer.losses) == 6 - - loss = x * x - layer.forward_layer.add_loss(loss) - layer.backward_layer.add_loss(loss) - assert len(layer.losses) == 8 - - def test_Bidirectional_with_constants(self): - with self.cached_session(): - # Test basic case. - x = keras.Input((5, 5)) - c = keras.Input((3,)) - cell = _RNNCellWithConstants(32, 3) - custom_objects = {'_RNNCellWithConstants': _RNNCellWithConstants} - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.Bidirectional(keras.layers.RNN(cell)) - y = layer(x, constants=c) - model = keras.Model([x, c], y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch( - [np.zeros((6, 5, 5)), np.zeros((6, 3))], - np.zeros((6, 64)) - ) - - # Test basic case serialization. - x_np = np.random.random((6, 5, 5)) - c_np = np.random.random((6, 3)) - y_np = model.predict([x_np, c_np]) - weights = model.get_weights() - config = layer.get_config() - - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config)) - y = layer(x, constants=c) - model = keras.Model([x, c], y) - model.set_weights(weights) - y_np_2 = model.predict([x_np, c_np]) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - # Test flat list inputs - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config)) - y = layer([x, c]) - model = keras.Model([x, c], y) - model.set_weights(weights) - y_np_3 = model.predict([x_np, c_np]) - self.assertAllClose(y_np, y_np_3, atol=1e-4) - - def test_Bidirectional_with_constants_layer_passing_initial_state(self): - with self.cached_session(): - # Test basic case. - x = keras.Input((5, 5)) - c = keras.Input((3,)) - s_for = keras.Input((32,)) - s_bac = keras.Input((32,)) - cell = _RNNCellWithConstants(32, 3) - custom_objects = {'_RNNCellWithConstants': _RNNCellWithConstants} - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.Bidirectional(keras.layers.RNN(cell)) - y = layer(x, initial_state=[s_for, s_bac], constants=c) - model = keras.Model([x, s_for, s_bac, c], y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch( - [np.zeros((6, 5, 5)), - np.zeros((6, 32)), - np.zeros((6, 32)), - np.zeros((6, 3))], - np.zeros((6, 64)) - ) - - # Test basic case serialization. 
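The serialization passage that follows uses the recipe this file repeats: capture `get_config()` and the weights, rebuild via `from_config` (under a `CustomObjectScope`, since `_RNNCellWithConstants` is a custom cell), restore the weights through the enclosing model, and compare predictions. A stripped-down round-trip with a stock cell (sketch; shapes arbitrary):

import copy
from tensorflow import keras

layer = keras.layers.Bidirectional(keras.layers.RNN(keras.layers.LSTMCell(3)))
config = layer.get_config()
clone = keras.layers.Bidirectional.from_config(copy.deepcopy(config))
# Weights live in the enclosing model: round-trips restore them with
# model.get_weights() / model.set_weights(), as the test below does.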
- x_np = np.random.random((6, 5, 5)) - s_fw_np = np.random.random((6, 32)) - s_bk_np = np.random.random((6, 32)) - c_np = np.random.random((6, 3)) - y_np = model.predict([x_np, s_fw_np, s_bk_np, c_np]) - weights = model.get_weights() - config = layer.get_config() - - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config)) - y = layer(x, initial_state=[s_for, s_bac], constants=c) - model = keras.Model([x, s_for, s_bac, c], y) - model.set_weights(weights) - y_np_2 = model.predict([x_np, s_fw_np, s_bk_np, c_np]) - self.assertAllClose(y_np, y_np_2, atol=1e-4) - - # Verify that state is used - y_np_2_different_s = model.predict( - [x_np, s_fw_np + 10., s_bk_np + 10., c_np]) - assert np.mean(y_np - y_np_2_different_s) != 0 - - # Test flat list inputs - with generic_utils.CustomObjectScope(custom_objects): - layer = keras.layers.Bidirectional.from_config(copy.deepcopy(config)) - y = layer([x, s_for, s_bac, c]) - model = keras.Model([x, s_for, s_bac, c], y) - model.set_weights(weights) - y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np]) - self.assertAllClose(y_np, y_np_3, atol=1e-4) - - @parameterized.parameters([keras.layers.LSTM, keras.layers.GRU]) - def test_Bidirectional_output_shape(self, rnn): - input_shape = [None, 2, 1] - num_state = 4 if rnn == keras.layers.LSTM else 2 - - wrapper = keras.layers.Bidirectional(rnn(3)) - output_shape = wrapper.compute_output_shape(input_shape) - self.assertEqual(output_shape.as_list(), [None, 6]) - - wrapper = keras.layers.Bidirectional(rnn(3, return_state=True)) - output_shape = wrapper.compute_output_shape(input_shape) - # 1 for output and the rest for forward and backward states - self.assertLen(output_shape, 1 + num_state) - self.assertEqual(output_shape[0].as_list(), [None, 6]) - for shape in output_shape[1:]: - self.assertEqual(shape.as_list(), [None, 3]) - - wrapper = keras.layers.Bidirectional(rnn(3, return_state=True), - merge_mode=None) - output_shape = wrapper.compute_output_shape(input_shape) - # 1 for forward output and 1 for backward output, and the rest for states - self.assertLen(output_shape, 2 + num_state) - for shape in output_shape: - self.assertEqual(shape.as_list(), [None, 3]) - - def test_Bidirectional_output_shape_return_types(self): - - class TestLayer(keras.layers.SimpleRNN): - - def call(self, inputs): - return tf.concat([inputs, inputs], axis=-1) - - def compute_output_shape(self, input_shape): - output_shape = tf.TensorShape(input_shape).as_list() - output_shape[-1] = output_shape[-1] * 2 - return tf.TensorShape(output_shape) - - class TestListLayer(TestLayer): - - def compute_output_shape(self, input_shape): - shape = super().compute_output_shape(input_shape) - return shape.as_list() - - class TestTupleLayer(TestLayer): - - def compute_output_shape(self, input_shape): - shape = super().compute_output_shape(input_shape) - return tuple(shape.as_list()) - - # Layers can specify output shape as list/tuple/TensorShape - test_layers = [TestLayer, TestListLayer, TestTupleLayer] - for layer in test_layers: - input_layer = keras.layers.Bidirectional(layer(1)) - inputs = keras.backend.placeholder(shape=(None, 2, 4)) - output = input_layer(inputs) - self.assertEqual(output.shape.as_list(), [None, 2, 16]) - self.assertEqual( - input_layer.compute_output_shape([None, 2, 4]).as_list(), - [None, 2, 16]) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def 
test_Bidirectional_last_output_with_masking(self): - rnn = keras.layers.LSTM - samples = 2 - dim = 5 - timesteps = 3 - units = 3 - merge_mode = 'concat' - x = np.random.rand(samples, timesteps, dim) - # clear the first record's timestep 2. Last output should be same as state, - # not zeroed. - x[0, 2] = 0 - - with self.cached_session(): - inputs = keras.Input((timesteps, dim)) - masked_inputs = keras.layers.Masking()(inputs) - wrapped = keras.layers.Bidirectional( - rnn(units, return_state=True), merge_mode=merge_mode) - outputs = _to_list(wrapped(masked_inputs, training=True)) - self.assertLen(outputs, 5) - self.assertEqual(outputs[0].shape.as_list(), [None, units * 2]) - - model = keras.Model(inputs, outputs) - y = _to_list(model.predict(x)) - self.assertLen(y, 5) - self.assertAllClose(y[0], np.concatenate([y[1], y[3]], axis=1)) - - @parameterized.parameters([keras.layers.LSTM, keras.layers.GRU]) - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_Bidirectional_sequence_output_with_masking(self, rnn): - samples = 2 - dim = 5 - timesteps = 3 - units = 3 - merge_mode = 'concat' - x = np.random.rand(samples, timesteps, dim) - # clear the first record's timestep 2, and expect the output of timestep 2 - # is also 0s. - x[0, 2] = 0 - - with self.cached_session(): - inputs = keras.Input((timesteps, dim)) - masked_inputs = keras.layers.Masking()(inputs) - wrapped = keras.layers.Bidirectional( - rnn(units, return_sequences=True), - merge_mode=merge_mode) - outputs = _to_list(wrapped(masked_inputs, training=True)) - self.assertLen(outputs, 1) - self.assertEqual(outputs[0].shape.as_list(), [None, timesteps, units * 2]) - - model = keras.Model(inputs, outputs) - y = _to_list(model.predict(x)) - self.assertLen(y, 1) - self.assertAllClose(y[0][0, 2], np.zeros(units * 2)) - - @parameterized.parameters(['sum', 'concat']) - def test_custom_backward_layer(self, mode): - rnn = keras.layers.SimpleRNN - samples = 2 - dim = 2 - timesteps = 2 - output_dim = 2 - - x = np.random.random((samples, timesteps, dim)) - target_dim = 2 * output_dim if mode == 'concat' else output_dim - y = np.random.random((samples, target_dim)) - forward_layer = rnn(output_dim) - backward_layer = rnn(output_dim, go_backwards=True) - - # test with Sequential model - model = keras.models.Sequential() - model.add( - keras.layers.Bidirectional( - forward_layer, - merge_mode=mode, - backward_layer=backward_layer, - input_shape=(timesteps, dim))) - model.compile(optimizer='rmsprop', loss='mse') - model.fit(x, y, epochs=1, batch_size=1) - - # check whether the model variables are present in the - # trackable list of objects - checkpointed_object_ids = { - id(o) for o in trackable_util.list_objects(model) - } - for v in model.variables: - self.assertIn(id(v), checkpointed_object_ids) - - # test compute output shape - ref_shape = model.layers[-1].output.shape - shape = model.layers[-1].compute_output_shape((None, timesteps, dim)) - self.assertListEqual(shape.as_list(), ref_shape.as_list()) - - # test config - model.get_config() - model = keras.models.model_from_json(model.to_json()) - model.summary() - - def test_custom_backward_layer_error_check(self): - rnn = keras.layers.LSTM - units = 2 - - forward_layer = rnn(units) - backward_layer = rnn(units) - - with self.assertRaisesRegex(ValueError, - 'should have different `go_backwards` value.'): - keras.layers.Bidirectional( - forward_layer, merge_mode='concat', backward_layer=backward_layer) 
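For contrast with the `ValueError` asserted above, and with the attribute-mismatch checks in the loop that follows, a valid pairing only needs `go_backwards=True` on the backward layer plus agreement on `stateful`, `return_sequences`, and `return_state` (sketch):

from tensorflow import keras

forward = keras.layers.LSTM(2)
backward = keras.layers.LSTM(2, go_backwards=True)
layer = keras.layers.Bidirectional(
    forward, merge_mode="concat", backward_layer=backward
)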
- - for attr in ('stateful', 'return_sequences', 'return_state'): - kwargs = {attr: True} - backward_layer = rnn(units, go_backwards=True, **kwargs) - with self.assertRaisesRegex( - ValueError, 'expected to have the same value for attribute "' + attr): - keras.layers.Bidirectional( - forward_layer, merge_mode='concat', backward_layer=backward_layer) - - def test_custom_backward_layer_serialization(self): - rnn = keras.layers.LSTM - units = 2 - - forward_layer = rnn(units) - backward_layer = rnn(units, go_backwards=True) - layer = keras.layers.Bidirectional( - forward_layer, merge_mode='concat', backward_layer=backward_layer) - config = layer.get_config() - layer_from_config = keras.layers.Bidirectional.from_config(config) - new_config = layer_from_config.get_config() - self.assertDictEqual(config, new_config) - - def test_rnn_layer_name(self): - rnn = keras.layers.LSTM - units = 2 - - layer = keras.layers.Bidirectional(rnn(units, name='rnn')) - config = layer.get_config() - - self.assertEqual(config['layer']['config']['name'], 'rnn') - - layer_from_config = keras.layers.Bidirectional.from_config(config) - self.assertEqual(layer_from_config.forward_layer.name, 'forward_rnn') - self.assertEqual(layer_from_config.backward_layer.name, 'backward_rnn') - - def test_custom_backward_rnn_layer_name(self): - rnn = keras.layers.LSTM - units = 2 - - forward_layer = rnn(units) - backward_layer = rnn(units, go_backwards=True) - layer = keras.layers.Bidirectional( - forward_layer, merge_mode='concat', backward_layer=backward_layer) - config = layer.get_config() - - self.assertEqual(config['layer']['config']['name'], 'lstm') - self.assertEqual(config['backward_layer']['config']['name'], 'lstm_1') - - layer_from_config = keras.layers.Bidirectional.from_config(config) - self.assertEqual(layer_from_config.forward_layer.name, 'forward_lstm') - self.assertEqual(layer_from_config.backward_layer.name, 'backward_lstm_1') - - def test_rnn_with_customized_cell(self): - batch = 20 - dim = 5 - timesteps = 3 - units = 5 - merge_mode = 'sum' - - cell = _ResidualLSTMCell(units) - forward_layer = keras.layers.RNN(cell) - inputs = keras.Input((timesteps, dim)) - bidirectional_rnn = keras.layers.Bidirectional( - forward_layer, merge_mode=merge_mode) - outputs = _to_list(bidirectional_rnn(inputs)) - - model = keras.Model(inputs, outputs) - model.compile(optimizer='rmsprop', loss='mse') - model.fit( - np.random.random((batch, timesteps, dim)), - np.random.random((batch, units)), - epochs=1, - batch_size=10) - - def test_rnn_with_customized_cell_stacking(self): - batch = 20 - dim = 5 - timesteps = 3 - units = 5 - merge_mode = 'sum' - - cell = [_ResidualLSTMCell(units), _ResidualLSTMCell(units)] - forward_layer = keras.layers.RNN(cell) - inputs = keras.Input((timesteps, dim)) - bidirectional_rnn = keras.layers.Bidirectional( - forward_layer, merge_mode=merge_mode) - outputs = _to_list(bidirectional_rnn(inputs)) - - model = keras.Model(inputs, outputs) - model.compile(optimizer='rmsprop', loss='mse') - model.fit( - np.random.random((batch, timesteps, dim)), - np.random.random((batch, units)), - epochs=1, - batch_size=10) - - @test_utils.run_v2_only - def test_wrapped_rnn_cell(self): - # See https://github.com/tensorflow/tensorflow/issues/26581. 
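The naming tests above pin down a detail worth restating: when a `Bidirectional` layer is rebuilt from its config, the inner layers come back with `forward_` / `backward_` prefixes on the configured name. Sketch:

from tensorflow import keras

layer = keras.layers.Bidirectional(keras.layers.LSTM(2, name="rnn"))
rebuilt = keras.layers.Bidirectional.from_config(layer.get_config())
assert rebuilt.forward_layer.name == "forward_rnn"
assert rebuilt.backward_layer.name == "backward_rnn"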
- batch = 20 - dim = 5 - timesteps = 3 - units = 5 - merge_mode = 'sum' - - cell = keras.layers.LSTMCell(units) - cell = ResidualWrapper(cell) - rnn = keras.layers.RNN(cell) - - inputs = keras.Input((timesteps, dim)) - wrapped = keras.layers.Bidirectional(rnn, merge_mode=merge_mode) - outputs = _to_list(wrapped(inputs)) - - model = keras.Model(inputs, outputs) - model.compile(optimizer='rmsprop', loss='mse') - model.fit( - np.random.random((batch, timesteps, dim)), - np.random.random((batch, units)), - epochs=1, - batch_size=10) - - @parameterized.parameters(['ave', 'concat', 'mul']) - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm RNN does not support ragged tensors yet.') - def test_Bidirectional_ragged_input(self, merge_mode): - np.random.seed(100) - rnn = keras.layers.LSTM - units = 3 - x = tf.ragged.constant( - [[[1, 1, 1], [1, 1, 1]], [[1, 1, 1]], - [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]], - [[1, 1, 1], [1, 1, 1], [1, 1, 1]]], - ragged_rank=1) - x = tf.cast(x, 'float32') - - # pylint: disable=g-long-lambda - with self.cached_session(): - if merge_mode == 'ave': - merge_func = lambda y, y_rev: (y + y_rev) / 2 - elif merge_mode == 'concat': - merge_func = lambda y, y_rev: tf.concat( - (y, y_rev), axis=-1) - elif merge_mode == 'mul': - merge_func = lambda y, y_rev: (y * y_rev) - # pylint: enable=g-long-lambda - - inputs = keras.Input( - shape=(None, 3), batch_size=4, dtype='float32', ragged=True) - layer = keras.layers.Bidirectional( - rnn(units, return_sequences=True), merge_mode=merge_mode) - f_merged = keras.backend.function([inputs], layer(inputs)) - f_forward = keras.backend.function([inputs], - layer.forward_layer(inputs)) - - # TODO(kaftan): after KerasTensor refactor TF op layers should work - # with many composite tensors, and this shouldn't need to be a lambda - # layer. 
- reverse_layer = core.Lambda(tf.reverse, arguments=dict(axis=[1])) - f_backward = keras.backend.function( - [inputs], - reverse_layer(layer.backward_layer(inputs))) - - y_merged = f_merged(x) - y_expected = merge_func( - convert_ragged_tensor_value(f_forward(x)), - convert_ragged_tensor_value(f_backward(x))) - - y_merged = convert_ragged_tensor_value(y_merged) - self.assertAllClose(y_merged.flat_values, y_expected.flat_values) - - def test_Bidirectional_nested_state_reuse(self): - if not tf.executing_eagerly(): - self.skipTest('Only test eager mode.') - x = tf.random.normal([4, 8, 16]) - layer = keras.layers.Bidirectional( - keras.layers.RNN([keras.layers.LSTMCell(5), - keras.layers.LSTMCell(5)], - return_sequences=True, - return_state=True)) - y = layer(x) - self.assertAllClose(layer([x] + y[1:]), layer(x, initial_state=y[1:])) - - def test_full_input_spec(self): - # See https://github.com/tensorflow/tensorflow/issues/38403 - inputs = keras.layers.Input(batch_shape=(1, 1, 1)) - fw_state = keras.layers.Input(batch_shape=(1, 1)) - bw_state = keras.layers.Input(batch_shape=(1, 1)) - states = [fw_state, bw_state] - bidirectional_rnn = keras.layers.Bidirectional( - keras.layers.SimpleRNN(1, stateful=True)) - - rnn_output = bidirectional_rnn(inputs, initial_state=states) - model = keras.Model([inputs, fw_state, bw_state], rnn_output) - output1 = model.predict( - [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))]) - output2 = model.predict( - [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))]) - model.reset_states() - output3 = model.predict( - [np.ones((1, 1, 1)), np.ones((1, 1)), np.ones((1, 1))]) - self.assertAllClose(output1, output3) - self.assertNotAllClose(output1, output2) + stateful = keras.layers.SimpleRNN(units=3, stateful=True) + stateless = keras.layers.SimpleRNN(units=3, stateful=False) + bid_stateless = keras.layers.Bidirectional(stateless) + bid_stateful = keras.layers.Bidirectional(stateful) -def _to_list(ls): - if isinstance(ls, list): - return ls - else: - return [ls] + # required to correctly initialize the state in the layers + _ = keras.Model( + inp, + [ + bid_stateless(inp), + bid_stateful(inp), + ], + ) + with self.assertRaisesRegex( + AttributeError, + "Layer must be stateful.", + ): + bid_stateless.reset_states() -def convert_ragged_tensor_value(inputs): - if isinstance(inputs, tf.compat.v1.ragged.RaggedTensorValue): - flat_values = tf.convert_to_tensor( - value=inputs.flat_values, - name='flat_values') - return tf.RaggedTensor.from_nested_row_splits( - flat_values, inputs.nested_row_splits, validate=False) - return inputs + with self.assertRaisesRegex(AttributeError, "Layer must be stateful."): + bid_stateless.reset_states([]) + + bid_stateful.reset_states() + bid_stateful.reset_states([ref_state, ref_state]) + + with self.assertRaisesRegex( + ValueError, + "Unrecognized value for `states`. 
Expected `states` " + "to be list or tuple", + ): + bid_stateful.reset_states({}) + + def test_trainable_parameter_argument(self): + inp = keras.layers.Input([None, 3]) + + def test(fwd, bwd, **kwargs): + def _remove_from_dict(d, remove_key): + if isinstance(d, dict): + d.pop(remove_key, None) + for key in list(d.keys()): + _remove_from_dict(d[key], remove_key) + bid = keras.layers.Bidirectional(fwd, backward_layer=bwd, **kwargs) -if __name__ == '__main__': - tf.test.main() + model = keras.Model(inp, bid(inp)) + clone = keras.models.clone_model(model) + + # Comparison should exclude `build_config` + clone_config = _remove_from_dict(clone.get_config(), "build_config") + model_config = _remove_from_dict(model.get_config(), "build_config") + self.assertEqual(clone_config, model_config) + + # test fetching trainable from `layer` + fwd = keras.layers.SimpleRNN(units=3) + bwd = keras.layers.SimpleRNN(units=3, go_backwards=True) + + fwd.trainable = True + test(fwd, None) + + fwd.trainable = False + test(fwd, None) + + fwd.trainable = True + bwd.trainable = False + test(fwd, bwd) + + fwd.trainable = False + bwd.trainable = True + test(fwd, bwd) + + fwd.trainable = True + bwd.trainable = True + test(fwd, bwd) + + fwd.trainable = False + bwd.trainable = False + test(fwd, bwd) + + # test fetching trainable from `kwargs` + test(fwd, None, trainable=True) + test(fwd, None, trainable=False) + + +def _to_list(ls): + if isinstance(ls, list): + return ls + else: + return [ls] + + +def convert_ragged_tensor_value(inputs): + if isinstance(inputs, tf.compat.v1.ragged.RaggedTensorValue): + flat_values = tf.convert_to_tensor( + value=inputs.flat_values, name="flat_values" + ) + return tf.RaggedTensor.from_nested_row_splits( + flat_values, inputs.nested_row_splits, validate=False + ) + return inputs + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/cell_wrappers.py b/keras/layers/rnn/cell_wrappers.py index 61e97b9b85fc..596c5e16ae71 100644 --- a/keras/layers/rnn/cell_wrappers.py +++ b/keras/layers/rnn/cell_wrappers.py @@ -27,557 +27,675 @@ import types as python_types import warnings +import tensorflow.compat.v2 as tf + from keras.layers.rnn import lstm from keras.layers.rnn.abstract_rnn_cell import AbstractRNNCell +from keras.saving import serialization_lib from keras.utils import generic_utils from keras.utils import tf_inspect -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import tf_export +from tensorflow.python.util.deprecation import deprecated class _RNNCellWrapper(AbstractRNNCell): - """Base class for cells wrappers V2 compatibility. - - This class along with `rnn_cell_impl._RNNCellWrapperV1` allows to define - wrappers that are compatible with V1 and V2, and defines helper methods for - this purpose. - """ - - def __init__(self, cell, *args, **kwargs): - super().__init__(*args, **kwargs) - self.cell = cell - cell_call_spec = tf_inspect.getfullargspec(cell.call) - self._call_spec.expects_training_arg = (("training" - in cell_call_spec.args) or - (cell_call_spec.varkw is not None)) - - def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): - """Calls the wrapped cell and performs the wrapping logic. - - This method is called from the wrapper's `call` or `__call__` methods. - - Args: - inputs: A tensor with wrapped cell's input. - state: A tensor or tuple of tensors with wrapped cell's state. - cell_call_fn: Wrapped cell's method to use for step computation (cell's - `__call__` or 'call' method). 
- **kwargs: Additional arguments. - - Returns: - A pair containing: - - Output: A tensor with cell's output. - - New state: A tensor or tuple of tensors with new wrapped cell's state. - """ - raise NotImplementedError - - def call(self, inputs, state, **kwargs): - """Runs the RNN cell step computation. - - When `call` is being used, we assume that the wrapper object has been built, - and therefore the wrapped cells has been built via its `build` method and - its `call` method can be used directly. + """Base class for cells wrappers V2 compatibility. - This allows to use the wrapped cell and the non-wrapped cell equivalently - when using `call` and `build`. - - Args: - inputs: A tensor with wrapped cell's input. - state: A tensor or tuple of tensors with wrapped cell's state. - **kwargs: Additional arguments passed to the wrapped cell's `call`. - - Returns: - A pair containing: - - - Output: A tensor with cell's output. - - New state: A tensor or tuple of tensors with new wrapped cell's state. + This class along with `rnn_cell_impl._RNNCellWrapperV1` allows to define + wrappers that are compatible with V1 and V2, and defines helper methods for + this purpose. """ - return self._call_wrapped_cell( - inputs, state, cell_call_fn=self.cell.call, **kwargs) - - def build(self, inputs_shape): - """Builds the wrapped cell.""" - self.cell.build(inputs_shape) - self.built = True - - @property - def wrapped_cell(self): - return self.cell - - @property - def state_size(self): - return self.cell.state_size - - @property - def output_size(self): - return self.cell.output_size - - def zero_state(self, batch_size, dtype): - with tf.name_scope(type(self).__name__ + "ZeroState"): - return self.cell.zero_state(batch_size, dtype) - - def get_config(self): - config = { - "cell": { - "class_name": self.cell.__class__.__name__, - "config": self.cell.get_config() - }, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - config = config.copy() - from keras.layers.serialization import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top - cell = deserialize_layer(config.pop("cell"), custom_objects=custom_objects) - return cls(cell, **config) - + def __init__(self, cell, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cell = cell + cell_call_spec = tf_inspect.getfullargspec(cell.call) + self._call_spec.expects_training_arg = ( + "training" in cell_call_spec.args + ) or (cell_call_spec.varkw is not None) + + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): + """Calls the wrapped cell and performs the wrapping logic. + + This method is called from the wrapper's `call` or `__call__` methods. + + Args: + inputs: A tensor with wrapped cell's input. + state: A tensor or tuple of tensors with wrapped cell's state. + cell_call_fn: Wrapped cell's method to use for step computation + (cell's `__call__` or 'call' method). + **kwargs: Additional arguments. + + Returns: + A pair containing: + - Output: A tensor with cell's output. + - New state: A tensor or tuple of tensors with new wrapped cell's + state. + """ + raise NotImplementedError + + def call(self, inputs, state, **kwargs): + """Runs the RNN cell step computation. + + When `call` is being used, we assume that the wrapper object has been + built, and therefore the wrapped cells has been built via its `build` + method and its `call` method can be used directly. 
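A usage sketch of the delegation just described: because `build` and `call` forward to the wrapped cell, a wrapper drops into `keras.layers.RNN` exactly like the bare cell. `ResidualWrapper` is shown here; the residual add requires the input feature size to match the cell's units:

import tensorflow as tf
from tensorflow import keras

cell = keras.layers.SimpleRNNCell(4)
wrapped = tf.nn.RNNCellResidualWrapper(cell)  # output = cell output + input
layer = keras.layers.RNN(wrapped)
y = layer(tf.zeros([2, 3, 4]))  # feature size 4 == units, so the add is valid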
+ + This allows to use the wrapped cell and the non-wrapped cell + equivalently when using `call` and `build`. + + Args: + inputs: A tensor with wrapped cell's input. + state: A tensor or tuple of tensors with wrapped cell's state. + **kwargs: Additional arguments passed to the wrapped cell's `call`. + + Returns: + A pair containing: + + - Output: A tensor with cell's output. + - New state: A tensor or tuple of tensors with new wrapped cell's + state. + """ + return self._call_wrapped_cell( + inputs, state, cell_call_fn=self.cell.call, **kwargs + ) + + def build(self, inputs_shape): + """Builds the wrapped cell.""" + self.cell.build(inputs_shape) + self.built = True + + @property + def wrapped_cell(self): + return self.cell + + @property + def state_size(self): + return self.cell.state_size + + @property + def output_size(self): + return self.cell.output_size + + def zero_state(self, batch_size, dtype): + with tf.name_scope(type(self).__name__ + "ZeroState"): + return self.cell.zero_state(batch_size, dtype) + + def get_config(self): + config = { + "cell": { + "class_name": self.cell.__class__.__name__, + "config": self.cell.get_config(), + }, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() + from keras.layers.serialization import deserialize as deserialize_layer + + cell = deserialize_layer( + config.pop("cell"), custom_objects=custom_objects + ) + return cls(cell, **config) + + +@deprecated(None, "Please use tf.keras.layers.RNN instead.") @tf_export("nn.RNNCellDropoutWrapper", v1=[]) class DropoutWrapper(_RNNCellWrapper): - """Operator adding dropout to inputs and outputs of the given cell.""" - - def __init__(self, - cell, - input_keep_prob=1.0, - output_keep_prob=1.0, - state_keep_prob=1.0, - variational_recurrent=False, - input_size=None, - dtype=None, - seed=None, - dropout_state_filter_visitor=None, - **kwargs): - """Create a cell with added input, state, and/or output dropout. - - If `variational_recurrent` is set to `True` (**NOT** the default behavior), - then the same dropout mask is applied at every step, as described in: - [A Theoretically Grounded Application of Dropout in Recurrent - Neural Networks. Y. Gal, Z. Ghahramani](https://arxiv.org/abs/1512.05287). - - Otherwise a different dropout mask is applied at every time step. - - Note, by default (unless a custom `dropout_state_filter` is provided), - the memory state (`c` component of any `LSTMStateTuple`) passing through - a `DropoutWrapper` is never modified. This behavior is described in the - above article. - - Args: - cell: an RNNCell, a projection to output_size is added to it. - input_keep_prob: unit Tensor or float between 0 and 1, input keep - probability; if it is constant and 1, no input dropout will be added. - output_keep_prob: unit Tensor or float between 0 and 1, output keep - probability; if it is constant and 1, no output dropout will be added. - state_keep_prob: unit Tensor or float between 0 and 1, output keep - probability; if it is constant and 1, no output dropout will be added. - State dropout is performed on the outgoing states of the cell. **Note** - the state components to which dropout is applied when `state_keep_prob` - is in `(0, 1)` are also determined by the argument - `dropout_state_filter_visitor` (e.g. by default dropout is never applied - to the `c` component of an `LSTMStateTuple`). - variational_recurrent: Python bool. 
If `True`, then the same dropout - pattern is applied across all time steps per run call. If this parameter - is set, `input_size` **must** be provided. - input_size: (optional) (possibly nested tuple of) `TensorShape` objects - containing the depth(s) of the input tensors expected to be passed in to - the `DropoutWrapper`. Required and used **iff** `variational_recurrent - = True` and `input_keep_prob < 1`. - dtype: (optional) The `dtype` of the input, state, and output tensors. - Required and used **iff** `variational_recurrent = True`. - seed: (optional) integer, the randomness seed. - dropout_state_filter_visitor: (optional), default: (see below). Function - that takes any hierarchical level of the state and returns a scalar or - depth=1 structure of Python booleans describing which terms in the state - should be dropped out. In addition, if the function returns `True`, - dropout is applied across this sublevel. If the function returns - `False`, dropout is not applied across this entire sublevel. - Default behavior: perform dropout on all terms except the memory (`c`) - state of `LSTMCellState` objects, and don't try to apply dropout to - `TensorArray` objects: ``` - def dropout_state_filter_visitor(s): - if isinstance(s, LSTMCellState): # Never perform dropout on the c - state. return LSTMCellState(c=False, h=True) - elif isinstance(s, TensorArray): return False return True ``` - **kwargs: dict of keyword arguments for base layer. - - Raises: - TypeError: if `cell` is not an `RNNCell`, or `keep_state_fn` is provided - but not `callable`. - ValueError: if any of the keep_probs are not between 0 and 1. - """ - if isinstance(cell, lstm.LSTMCell): - raise ValueError("keras LSTM cell does not work with DropoutWrapper. " - "Please use LSTMCell(dropout=x, recurrent_dropout=y) " - "instead.") - super().__init__(cell, dtype=dtype, **kwargs) - - if (dropout_state_filter_visitor is not None and - not callable(dropout_state_filter_visitor)): - raise TypeError("dropout_state_filter_visitor must be callable. " - f"Received: {dropout_state_filter_visitor}") - self._dropout_state_filter = ( - dropout_state_filter_visitor or _default_dropout_state_filter_visitor) - with tf.name_scope("DropoutWrapperInit"): - - def tensor_and_const_value(v): - tensor_value = tf.convert_to_tensor(v) - const_value = tf.get_static_value(tensor_value) - return (tensor_value, const_value) - - for prob, attr in [(input_keep_prob, "input_keep_prob"), - (state_keep_prob, "state_keep_prob"), - (output_keep_prob, "output_keep_prob")]: - tensor_prob, const_prob = tensor_and_const_value(prob) - if const_prob is not None: - if const_prob < 0 or const_prob > 1: - raise ValueError(f"Parameter {attr} must be between 0 and 1. " - f"Received {const_prob}") - setattr(self, "_%s" % attr, float(const_prob)) + """Operator adding dropout to inputs and outputs of the given cell.""" + + def __init__( + self, + cell, + input_keep_prob=1.0, + output_keep_prob=1.0, + state_keep_prob=1.0, + variational_recurrent=False, + input_size=None, + dtype=None, + seed=None, + dropout_state_filter_visitor=None, + **kwargs, + ): + """Create a cell with added input, state, and/or output dropout. + + If `variational_recurrent` is set to `True` (**NOT** the default + behavior), then the same dropout mask is applied at every step, as + described in: [A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks. Y. Gal, Z. + Ghahramani](https://arxiv.org/abs/1512.05287). + + Otherwise a different dropout mask is applied at every time step. 
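A hedged construction sketch for the variational mode described above: because the masks are pre-sampled once per run, `dtype` must be supplied, and `input_size` as well whenever `input_keep_prob < 1` (state dropout is left at its default here):

import tensorflow as tf
from tensorflow import keras

wrapped = tf.nn.RNNCellDropoutWrapper(
    keras.layers.SimpleRNNCell(4),
    input_keep_prob=0.9,
    output_keep_prob=0.9,
    variational_recurrent=True,  # reuse one dropout mask across all steps
    input_size=tf.TensorShape([4]),  # required since input_keep_prob < 1
    dtype=tf.float32,  # required when variational_recurrent=True
    seed=7,
)
layer = keras.layers.RNN(wrapped)
y = layer(tf.zeros([2, 3, 4]))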
+ + Note, by default (unless a custom `dropout_state_filter` is provided), + the memory state (`c` component of any `LSTMStateTuple`) passing through + a `DropoutWrapper` is never modified. This behavior is described in the + above article. + + Args: + cell: an RNNCell, a projection to output_size is added to it. + input_keep_prob: unit Tensor or float between 0 and 1, input keep + probability; if it is constant and 1, no input dropout will be + added. + output_keep_prob: unit Tensor or float between 0 and 1, output keep + probability; if it is constant and 1, no output dropout will be + added. + state_keep_prob: unit Tensor or float between 0 and 1, output keep + probability; if it is constant and 1, no output dropout will be + added. State dropout is performed on the outgoing states of the + cell. **Note** the state components to which dropout is applied when + `state_keep_prob` is in `(0, 1)` are also determined by the argument + `dropout_state_filter_visitor` (e.g. by default dropout is never + applied to the `c` component of an `LSTMStateTuple`). + variational_recurrent: Python bool. If `True`, then the same dropout + pattern is applied across all time steps per run call. If this + parameter is set, `input_size` **must** be provided. + input_size: (optional) (possibly nested tuple of) `TensorShape` + objects containing the depth(s) of the input tensors expected to be + passed in to the `DropoutWrapper`. Required and used **iff** + `variational_recurrent = True` and `input_keep_prob < 1`. + dtype: (optional) The `dtype` of the input, state, and output tensors. + Required and used **iff** `variational_recurrent = True`. + seed: (optional) integer, the randomness seed. + dropout_state_filter_visitor: (optional), default: (see below). + Function that takes any hierarchical level of the state and returns + a scalar or depth=1 structure of Python booleans describing which + terms in the state should be dropped out. In addition, if the + function returns `True`, dropout is applied across this sublevel. + If the function returns `False`, dropout is not applied across this + entire sublevel. Default behavior: perform dropout on all terms + except the memory (`c`) state of `LSTMCellState` objects, and don't + try to apply dropout to + `TensorArray` objects: + ``` + def dropout_state_filter_visitor(s): + # Never perform dropout on the c state. + if isinstance(s, LSTMCellState): + return LSTMCellState(c=False, h=True) + elif isinstance(s, TensorArray): + return False + return True + ``` + **kwargs: dict of keyword arguments for base layer. + + Raises: + TypeError: if `cell` is not an `RNNCell`, or `keep_state_fn` is + provided but not `callable`. + ValueError: if any of the keep_probs are not between 0 and 1. + """ + if isinstance(cell, lstm.LSTMCell): + raise ValueError( + "keras LSTM cell does not work with DropoutWrapper. " + "Please use LSTMCell(dropout=x, recurrent_dropout=y) " + "instead." + ) + super().__init__(cell, dtype=dtype, **kwargs) + + if dropout_state_filter_visitor is not None and not callable( + dropout_state_filter_visitor + ): + raise TypeError( + "dropout_state_filter_visitor must be callable. 
" + f"Received: {dropout_state_filter_visitor}" + ) + self._dropout_state_filter = ( + dropout_state_filter_visitor + or _default_dropout_state_filter_visitor + ) + with tf.name_scope("DropoutWrapperInit"): + + def tensor_and_const_value(v): + tensor_value = tf.convert_to_tensor(v) + const_value = tf.get_static_value(tensor_value) + return (tensor_value, const_value) + + for prob, attr in [ + (input_keep_prob, "input_keep_prob"), + (state_keep_prob, "state_keep_prob"), + (output_keep_prob, "output_keep_prob"), + ]: + tensor_prob, const_prob = tensor_and_const_value(prob) + if const_prob is not None: + if const_prob < 0 or const_prob > 1: + raise ValueError( + f"Parameter {attr} must be between 0 and 1. " + f"Received {const_prob}" + ) + setattr(self, f"_{attr}", float(const_prob)) + else: + setattr(self, f"_{attr}", tensor_prob) + + # Set variational_recurrent, seed before running the code below + self._variational_recurrent = variational_recurrent + self._input_size = input_size + self._seed = seed + + self._recurrent_input_noise = None + self._recurrent_state_noise = None + self._recurrent_output_noise = None + + if variational_recurrent: + if dtype is None: + raise ValueError( + "When variational_recurrent=True, dtype must be provided" + ) + + def convert_to_batch_shape(s): + # Prepend a 1 for the batch dimension; for recurrent + # variational dropout we use the same dropout mask for all + # batch elements. + return tf.concat(([1], tf.TensorShape(s).as_list()), 0) + + def batch_noise(s, inner_seed): + shape = convert_to_batch_shape(s) + return tf.random.uniform(shape, seed=inner_seed, dtype=dtype) + + if ( + not isinstance(self._input_keep_prob, numbers.Real) + or self._input_keep_prob < 1.0 + ): + if input_size is None: + raise ValueError( + "When variational_recurrent=True and input_keep_prob < " + "1.0 or is unknown, input_size must be provided" + ) + self._recurrent_input_noise = _enumerated_map_structure_up_to( + input_size, + lambda i, s: batch_noise( + s, inner_seed=self._gen_seed("input", i) + ), + input_size, + ) + self._recurrent_state_noise = _enumerated_map_structure_up_to( + cell.state_size, + lambda i, s: batch_noise( + s, inner_seed=self._gen_seed("state", i) + ), + cell.state_size, + ) + self._recurrent_output_noise = _enumerated_map_structure_up_to( + cell.output_size, + lambda i, s: batch_noise( + s, inner_seed=self._gen_seed("output", i) + ), + cell.output_size, + ) + + def _gen_seed(self, salt_prefix, index): + if self._seed is None: + return None + salt = "%s_%d" % (salt_prefix, index) + string = (str(self._seed) + salt).encode("utf-8") + return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF + + def _variational_recurrent_dropout_value( + self, unused_index, value, noise, keep_prob + ): + """Performs dropout given the pre-calculated noise tensor.""" + # uniform [keep_prob, 1.0 + keep_prob) + random_tensor = keep_prob + noise + + # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) + binary_tensor = tf.floor(random_tensor) + ret = tf.divide(value, keep_prob) * binary_tensor + ret.set_shape(value.get_shape()) + return ret + + def _dropout( + self, + values, + salt_prefix, + recurrent_noise, + keep_prob, + shallow_filtered_substructure=None, + ): + """Decides whether to perform standard dropout or recurrent dropout.""" + + if shallow_filtered_substructure is None: + # Put something so we traverse the entire structure; inside the + # dropout function we check to see if leafs of this are bool or not. 
+ shallow_filtered_substructure = values + + if not self._variational_recurrent: + + def dropout(i, do_dropout, v): + if not isinstance(do_dropout, bool) or do_dropout: + return tf.nn.dropout( + v, + rate=1.0 - keep_prob, + seed=self._gen_seed(salt_prefix, i), + ) + else: + return v + + return _enumerated_map_structure_up_to( + shallow_filtered_substructure, + dropout, + *[shallow_filtered_substructure, values], + ) else: - setattr(self, "_%s" % attr, tensor_prob) - - # Set variational_recurrent, seed before running the code below - self._variational_recurrent = variational_recurrent - self._input_size = input_size - self._seed = seed - - self._recurrent_input_noise = None - self._recurrent_state_noise = None - self._recurrent_output_noise = None - - if variational_recurrent: - if dtype is None: - raise ValueError( - "When variational_recurrent=True, dtype must be provided") - - def convert_to_batch_shape(s): - # Prepend a 1 for the batch dimension; for recurrent - # variational dropout we use the same dropout mask for all - # batch elements. - return tf.concat(([1], tf.TensorShape(s).as_list()), 0) - - def batch_noise(s, inner_seed): - shape = convert_to_batch_shape(s) - return tf.random.uniform(shape, seed=inner_seed, dtype=dtype) - - if (not isinstance(self._input_keep_prob, numbers.Real) or - self._input_keep_prob < 1.0): - if input_size is None: - raise ValueError( - "When variational_recurrent=True and input_keep_prob < 1.0 or " - "is unknown, input_size must be provided") - self._recurrent_input_noise = _enumerated_map_structure_up_to( - input_size, - lambda i, s: batch_noise(s, inner_seed=self._gen_seed("input", i)), - input_size) - self._recurrent_state_noise = _enumerated_map_structure_up_to( - cell.state_size, - lambda i, s: batch_noise(s, inner_seed=self._gen_seed("state", i)), - cell.state_size) - self._recurrent_output_noise = _enumerated_map_structure_up_to( - cell.output_size, - lambda i, s: batch_noise(s, inner_seed=self._gen_seed("output", i)), - cell.output_size) - - def _gen_seed(self, salt_prefix, index): - if self._seed is None: - return None - salt = "%s_%d" % (salt_prefix, index) - string = (str(self._seed) + salt).encode("utf-8") - return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF - - def _variational_recurrent_dropout_value( - self, unused_index, value, noise, keep_prob): - """Performs dropout given the pre-calculated noise tensor.""" - # uniform [keep_prob, 1.0 + keep_prob) - random_tensor = keep_prob + noise - - # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) - binary_tensor = tf.floor(random_tensor) - ret = tf.divide(value, keep_prob) * binary_tensor - ret.set_shape(value.get_shape()) - return ret - - def _dropout(self, - values, - salt_prefix, - recurrent_noise, - keep_prob, - shallow_filtered_substructure=None): - """Decides whether to perform standard dropout or recurrent dropout.""" - - if shallow_filtered_substructure is None: - # Put something so we traverse the entire structure; inside the - # dropout function we check to see if leafs of this are bool or not. - shallow_filtered_substructure = values - - if not self._variational_recurrent: - - def dropout(i, do_dropout, v): - if not isinstance(do_dropout, bool) or do_dropout: - return tf.nn.dropout( - v, rate=1. 
- keep_prob, seed=self._gen_seed(salt_prefix, i)) - else: - return v - - return _enumerated_map_structure_up_to( - shallow_filtered_substructure, dropout, - *[shallow_filtered_substructure, values]) - else: - - def dropout(i, do_dropout, v, n): - if not isinstance(do_dropout, bool) or do_dropout: - return self._variational_recurrent_dropout_value(i, v, n, keep_prob) - else: - return v - - return _enumerated_map_structure_up_to( - shallow_filtered_substructure, dropout, - *[shallow_filtered_substructure, values, recurrent_noise]) - - def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): - """Runs the wrapped cell and applies dropout. - - Args: - inputs: A tensor with wrapped cell's input. - state: A tensor or tuple of tensors with wrapped cell's state. - cell_call_fn: Wrapped cell's method to use for step computation (cell's - `__call__` or 'call' method). - **kwargs: Additional arguments. - - Returns: - A pair containing: - - - Output: A tensor with cell's output. - - New state: A tensor or tuple of tensors with new wrapped cell's state. - """ - - def _should_dropout(p): - return (not isinstance(p, float)) or p < 1 - - if _should_dropout(self._input_keep_prob): - inputs = self._dropout(inputs, "input", self._recurrent_input_noise, - self._input_keep_prob) - output, new_state = cell_call_fn(inputs, state, **kwargs) - if _should_dropout(self._state_keep_prob): - # Identify which subsets of the state to perform dropout on and - # which ones to keep. - shallow_filtered_substructure = tf.__internal__.nest.get_traverse_shallow_structure( - self._dropout_state_filter, new_state) - new_state = self._dropout(new_state, "state", self._recurrent_state_noise, - self._state_keep_prob, - shallow_filtered_substructure) - if _should_dropout(self._output_keep_prob): - output = self._dropout(output, "output", self._recurrent_output_noise, - self._output_keep_prob) - return output, new_state - - def get_config(self): - """Returns the config of the dropout wrapper.""" - config = { - "input_keep_prob": self._input_keep_prob, - "output_keep_prob": self._output_keep_prob, - "state_keep_prob": self._state_keep_prob, - "variational_recurrent": self._variational_recurrent, - "input_size": self._input_size, - "seed": self._seed, - } - if self._dropout_state_filter != _default_dropout_state_filter_visitor: # pylint: disable=comparison-with-callable - function, function_type, function_module = _serialize_function_to_config( - self._dropout_state_filter) - config.update({"dropout_fn": function, - "dropout_fn_type": function_type, - "dropout_fn_module": function_module}) - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - if "dropout_fn" in config: - config = config.copy() - dropout_state_filter = _parse_config_to_function( - config, custom_objects, "dropout_fn", "dropout_fn_type", - "dropout_fn_module") - config.pop("dropout_fn") - config["dropout_state_filter_visitor"] = dropout_state_filter - return super(DropoutWrapper, cls).from_config( - config, custom_objects=custom_objects) - + def dropout(i, do_dropout, v, n): + if not isinstance(do_dropout, bool) or do_dropout: + return self._variational_recurrent_dropout_value( + i, v, n, keep_prob + ) + else: + return v + + return _enumerated_map_structure_up_to( + shallow_filtered_substructure, + dropout, + *[shallow_filtered_substructure, values, recurrent_noise], + ) + + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): 
+ """Runs the wrapped cell and applies dropout. + + Args: + inputs: A tensor with wrapped cell's input. + state: A tensor or tuple of tensors with wrapped cell's state. + cell_call_fn: Wrapped cell's method to use for step computation + (cell's `__call__` or 'call' method). + **kwargs: Additional arguments. + + Returns: + A pair containing: + + - Output: A tensor with cell's output. + - New state: A tensor or tuple of tensors with new wrapped cell's + state. + """ + + def _should_dropout(p): + return (not isinstance(p, float)) or p < 1 + + if _should_dropout(self._input_keep_prob): + inputs = self._dropout( + inputs, + "input", + self._recurrent_input_noise, + self._input_keep_prob, + ) + output, new_state = cell_call_fn(inputs, state, **kwargs) + if _should_dropout(self._state_keep_prob): + # Identify which subsets of the state to perform dropout on and + # which ones to keep. + shallow_filtered_substructure = ( + tf.__internal__.nest.get_traverse_shallow_structure( + self._dropout_state_filter, new_state + ) + ) + new_state = self._dropout( + new_state, + "state", + self._recurrent_state_noise, + self._state_keep_prob, + shallow_filtered_substructure, + ) + if _should_dropout(self._output_keep_prob): + output = self._dropout( + output, + "output", + self._recurrent_output_noise, + self._output_keep_prob, + ) + return output, new_state + + def get_config(self): + """Returns the config of the dropout wrapper.""" + config = { + "input_keep_prob": self._input_keep_prob, + "output_keep_prob": self._output_keep_prob, + "state_keep_prob": self._state_keep_prob, + "variational_recurrent": self._variational_recurrent, + "input_size": self._input_size, + "seed": self._seed, + } + if self._dropout_state_filter != _default_dropout_state_filter_visitor: + ( + function, + function_type, + function_module, + ) = _serialize_function_to_config(self._dropout_state_filter) + config.update( + { + "dropout_fn": function, + "dropout_fn_type": function_type, + "dropout_fn_module": function_module, + } + ) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + if "dropout_fn" in config: + config = config.copy() + dropout_state_filter = _parse_config_to_function( + config, + custom_objects, + "dropout_fn", + "dropout_fn_type", + "dropout_fn_module", + ) + config.pop("dropout_fn") + config["dropout_state_filter_visitor"] = dropout_state_filter + return super(DropoutWrapper, cls).from_config( + config, custom_objects=custom_objects + ) + + +@deprecated(None, "Please use tf.keras.layers.RNN instead.") @tf_export("nn.RNNCellResidualWrapper", v1=[]) class ResidualWrapper(_RNNCellWrapper): - """RNNCell wrapper that ensures cell inputs are added to the outputs.""" - - def __init__(self, cell, residual_fn=None, **kwargs): - """Constructs a `ResidualWrapper` for `cell`. - - Args: - cell: An instance of `RNNCell`. - residual_fn: (Optional) The function to map raw cell inputs and raw cell - outputs to the actual cell outputs of the residual network. - Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs - and outputs. - **kwargs: dict of keyword arguments for base layer. - """ - super().__init__(cell, **kwargs) - self._residual_fn = residual_fn - - def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): - """Run the cell and then apply the residual_fn on its inputs to its outputs. - - Args: - inputs: cell inputs. - state: cell state. 
- cell_call_fn: Wrapped cell's method to use for step computation (cell's - `__call__` or 'call' method). - **kwargs: Additional arguments passed to the wrapped cell's `call`. - - Returns: - Tuple of cell outputs and new state. - - Raises: - TypeError: If cell inputs and outputs have different structure (type). - ValueError: If cell inputs and outputs have different structure (value). - """ - outputs, new_state = cell_call_fn(inputs, state, **kwargs) - - # Ensure shapes match - def assert_shape_match(inp, out): - inp.get_shape().assert_is_compatible_with(out.get_shape()) - - def default_residual_fn(inputs, outputs): - tf.nest.assert_same_structure(inputs, outputs) - tf.nest.map_structure(assert_shape_match, inputs, outputs) - return tf.nest.map_structure(lambda inp, out: inp + out, inputs, outputs) - - res_outputs = (self._residual_fn or default_residual_fn)(inputs, outputs) - return (res_outputs, new_state) - - def get_config(self): - """Returns the config of the residual wrapper.""" - if self._residual_fn is not None: - function, function_type, function_module = _serialize_function_to_config( - self._residual_fn) - config = { - "residual_fn": function, - "residual_fn_type": function_type, - "residual_fn_module": function_module - } - else: - config = {} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - if "residual_fn" in config: - config = config.copy() - residual_function = _parse_config_to_function(config, custom_objects, - "residual_fn", - "residual_fn_type", - "residual_fn_module") - config["residual_fn"] = residual_function - return super(ResidualWrapper, cls).from_config( - config, custom_objects=custom_objects) - - + """RNNCell wrapper that ensures cell inputs are added to the outputs.""" + + def __init__(self, cell, residual_fn=None, **kwargs): + """Constructs a `ResidualWrapper` for `cell`. + + Args: + cell: An instance of `RNNCell`. + residual_fn: (Optional) The function to map raw cell inputs and raw + cell outputs to the actual cell outputs of the residual network. + Defaults to calling nest.map_structure on (lambda i, o: i + o), + inputs and outputs. + **kwargs: dict of keyword arguments for base layer. + """ + super().__init__(cell, **kwargs) + self._residual_fn = residual_fn + + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): + """Run the cell and apply the residual_fn. + + Args: + inputs: cell inputs. + state: cell state. + cell_call_fn: Wrapped cell's method to use for step computation + (cell's `__call__` or 'call' method). + **kwargs: Additional arguments passed to the wrapped cell's `call`. + + Returns: + Tuple of cell outputs and new state. + + Raises: + TypeError: If cell inputs and outputs have different structure (type). + ValueError: If cell inputs and outputs have different structure + (value). 
+ """ + outputs, new_state = cell_call_fn(inputs, state, **kwargs) + + # Ensure shapes match + def assert_shape_match(inp, out): + inp.get_shape().assert_is_compatible_with(out.get_shape()) + + def default_residual_fn(inputs, outputs): + tf.nest.assert_same_structure(inputs, outputs) + tf.nest.map_structure(assert_shape_match, inputs, outputs) + return tf.nest.map_structure( + lambda inp, out: inp + out, inputs, outputs + ) + + res_outputs = (self._residual_fn or default_residual_fn)( + inputs, outputs + ) + return (res_outputs, new_state) + + def get_config(self): + """Returns the config of the residual wrapper.""" + if self._residual_fn is not None: + ( + function, + function_type, + function_module, + ) = _serialize_function_to_config(self._residual_fn) + config = { + "residual_fn": function, + "residual_fn_type": function_type, + "residual_fn_module": function_module, + } + else: + config = {} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + if "residual_fn" in config: + config = config.copy() + residual_function = _parse_config_to_function( + config, + custom_objects, + "residual_fn", + "residual_fn_type", + "residual_fn_module", + ) + config["residual_fn"] = residual_function + return super(ResidualWrapper, cls).from_config( + config, custom_objects=custom_objects + ) + + +@deprecated(None, "Please use tf.keras.layers.RNN instead.") @tf_export("nn.RNNCellDeviceWrapper", v1=[]) class DeviceWrapper(_RNNCellWrapper): - """Operator that ensures an RNNCell runs on a particular device.""" + """Operator that ensures an RNNCell runs on a particular device.""" - def __init__(self, cell, device, **kwargs): - """Construct a `DeviceWrapper` for `cell` with device `device`. + def __init__(self, cell, device, **kwargs): + """Construct a `DeviceWrapper` for `cell` with device `device`. - Ensures the wrapped `cell` is called with `tf.device(device)`. + Ensures the wrapped `cell` is called with `tf.device(device)`. - Args: - cell: An instance of `RNNCell`. - device: A device string or function, for passing to `tf.device`. - **kwargs: dict of keyword arguments for base layer. - """ - super().__init__(cell, **kwargs) - self._device = device + Args: + cell: An instance of `RNNCell`. + device: A device string or function, for passing to `tf.device`. + **kwargs: dict of keyword arguments for base layer. 
+ """ + super().__init__(cell, **kwargs) + self._device = device - def zero_state(self, batch_size, dtype): - with tf.name_scope(type(self).__name__ + "ZeroState"): - with tf.compat.v1.device(self._device): - return self.cell.zero_state(batch_size, dtype) + def zero_state(self, batch_size, dtype): + with tf.name_scope(type(self).__name__ + "ZeroState"): + with tf.compat.v1.device(self._device): + return self.cell.zero_state(batch_size, dtype) - def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): - """Run the cell on specified device.""" - with tf.compat.v1.device(self._device): - return cell_call_fn(inputs, state, **kwargs) + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): + """Run the cell on specified device.""" + with tf.compat.v1.device(self._device): + return cell_call_fn(inputs, state, **kwargs) - def get_config(self): - config = {"device": self._device} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"device": self._device} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) def _serialize_function_to_config(function): - """Serialize the function for get_config().""" - if isinstance(function, python_types.LambdaType): - output = generic_utils.func_dump(function) - output_type = "lambda" - module = function.__module__ - elif callable(function): - output = function.__name__ - output_type = "function" - module = function.__module__ - else: - raise ValueError( - f"Unrecognized function type for input: {type(function)}") - - return output, output_type, module - - -def _parse_config_to_function(config, custom_objects, func_attr_name, - func_type_attr_name, module_attr_name): - """Reconstruct the function from the config.""" - globs = globals() - module = config.pop(module_attr_name, None) - if module in sys.modules: - globs.update(sys.modules[module].__dict__) - elif module is not None: - # Note: we don't know the name of the function if it's a lambda. - warnings.warn( - "{} is not loaded, but a layer uses it. " - "It may cause errors.".format(module), - UserWarning, - stacklevel=2) - if custom_objects: - globs.update(custom_objects) - function_type = config.pop(func_type_attr_name) - if function_type == "function": - # Simple lookup in custom objects - function = generic_utils.deserialize_keras_object( - config[func_attr_name], - custom_objects=custom_objects, - printable_module_name="function in wrapper") - elif function_type == "lambda": - # Unsafe deserialization from bytecode - function = generic_utils.func_load( - config[func_attr_name], globs=globs) - else: - raise TypeError( - f"Unknown function type received: {function_type}. 
" - "Expected types are ['function', 'lambda']") - return function + """Serialize the function for get_config().""" + if isinstance(function, python_types.LambdaType): + output = generic_utils.func_dump(function) + output_type = "lambda" + module = function.__module__ + elif callable(function): + output = function.__name__ + output_type = "function" + module = function.__module__ + else: + raise ValueError( + f"Unrecognized function type for input: {type(function)}" + ) + + return output, output_type, module + + +def _parse_config_to_function( + config, + custom_objects, + func_attr_name, + func_type_attr_name, + module_attr_name, +): + """Reconstruct the function from the config.""" + globs = globals() + module = config.pop(module_attr_name, None) + if module in sys.modules: + globs.update(sys.modules[module].__dict__) + elif module is not None: + # Note: we don't know the name of the function if it's a lambda. + warnings.warn( + "{} is not loaded, but a layer uses it. " + "It may cause errors.".format(module), + UserWarning, + stacklevel=2, + ) + if custom_objects: + globs.update(custom_objects) + function_type = config.pop(func_type_attr_name) + if function_type == "function": + # Simple lookup in custom objects + function = serialization_lib.deserialize_keras_object( + config[func_attr_name], + custom_objects=custom_objects, + printable_module_name="function in wrapper", + ) + elif function_type == "lambda": + if serialization_lib.in_safe_mode(): + raise ValueError( + "Requested the deserialization of a layer with a " + "Python `lambda` inside it. " + "This carries a potential risk of arbitrary code execution " + "and thus it is disallowed by default. If you trust the " + "source of the saved model, you can pass `safe_mode=False` to " + "the loading function in order to allow " + "`lambda` loading." + ) + # Unsafe deserialization from bytecode + function = generic_utils.func_load(config[func_attr_name], globs=globs) + else: + raise TypeError( + f"Unknown function type received: {function_type}. 
" + "Expected types are ['function', 'lambda']" + ) + return function def _default_dropout_state_filter_visitor(substate): - return not isinstance(substate, tf.TensorArray) + return not isinstance(substate, tf.TensorArray) def _enumerated_map_structure_up_to(shallow_structure, map_fn, *args, **kwargs): - ix = [0] + ix = [0] - def enumerated_fn(*inner_args, **inner_kwargs): - r = map_fn(ix[0], *inner_args, **inner_kwargs) - ix[0] += 1 - return r + def enumerated_fn(*inner_args, **inner_kwargs): + r = map_fn(ix[0], *inner_args, **inner_kwargs) + ix[0] += 1 + return r - return tf.__internal__.nest.map_structure_up_to(shallow_structure, - enumerated_fn, *args, - **kwargs) + return tf.__internal__.nest.map_structure_up_to( + shallow_structure, enumerated_fn, *args, **kwargs + ) diff --git a/keras/layers/rnn/cell_wrappers_test.py b/keras/layers/rnn/cell_wrappers_test.py index e5f3caa30438..e8683a7f2040 100644 --- a/keras/layers/rnn/cell_wrappers_test.py +++ b/keras/layers/rnn/cell_wrappers_test.py @@ -14,208 +14,223 @@ # ============================================================================== """Tests for RNN cell wrappers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import layers from keras.layers.rnn import cell_wrappers from keras.layers.rnn import legacy_cells from keras.legacy_tf_layers import base as legacy_base_layer from keras.testing_infra import test_combinations from keras.utils import generic_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class RNNCellWrapperTest(tf.test.TestCase, parameterized.TestCase): - - def testResidualWrapper(self): - wrapper_type = cell_wrappers.ResidualWrapper - x = tf.convert_to_tensor( - np.array([[1., 1., 1.]]), dtype="float32") - m = tf.convert_to_tensor( - np.array([[0.1, 0.1, 0.1]]), dtype="float32") - base_cell = legacy_cells.GRUCell( - 3, kernel_initializer=tf.compat.v1.constant_initializer(0.5), - bias_initializer=tf.compat.v1.constant_initializer(0.5)) - g, m_new = base_cell(x, m) - wrapper_object = wrapper_type(base_cell) - self.assertDictEqual({"cell": base_cell}, - wrapper_object._trackable_children()) - wrapper_object.get_config() # Should not throw an error - - g_res, m_new_res = wrapper_object(x, m) - self.evaluate([tf.compat.v1.global_variables_initializer()]) - res = self.evaluate([g, g_res, m_new, m_new_res]) - # Residual connections - self.assertAllClose(res[1], res[0] + [1., 1., 1.]) - # States are left untouched - self.assertAllClose(res[2], res[3]) - - def testResidualWrapperWithSlice(self): - wrapper_type = cell_wrappers.ResidualWrapper - x = tf.convert_to_tensor( - np.array([[1., 1., 1., 1., 1.]]), dtype="float32") - m = tf.convert_to_tensor( - np.array([[0.1, 0.1, 0.1]]), dtype="float32") - base_cell = legacy_cells.GRUCell( - 3, kernel_initializer=tf.compat.v1.constant_initializer(0.5), - bias_initializer=tf.compat.v1.constant_initializer(0.5)) - g, m_new = base_cell(x, m) - - def residual_with_slice_fn(inp, out): - inp_sliced = tf.slice(inp, [0, 0], [-1, 3]) - return inp_sliced + out - - g_res, m_new_res = wrapper_type( - base_cell, residual_with_slice_fn)(x, m) - self.evaluate([tf.compat.v1.global_variables_initializer()]) - res_g, res_g_res, res_m_new, res_m_new_res = self.evaluate( - [g, g_res, m_new, m_new_res]) - # Residual connections - self.assertAllClose(res_g_res, res_g + [1., 1., 1.]) - # States are left untouched - self.assertAllClose(res_m_new, 
res_m_new_res) - - def testDeviceWrapper(self): - wrapper_type = cell_wrappers.DeviceWrapper - x = tf.zeros([1, 3]) - m = tf.zeros([1, 3]) - cell = legacy_cells.GRUCell(3) - wrapped_cell = wrapper_type(cell, "/cpu:0") - self.assertDictEqual({"cell": cell}, - wrapped_cell._trackable_children()) - wrapped_cell.get_config() # Should not throw an error - - outputs, _ = wrapped_cell(x, m) - self.assertIn("cpu:0", outputs.device.lower()) - - @parameterized.parameters( - [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper]) - def testWrapperKerasStyle(self, wrapper): - """Tests if wrapper cell is instantiated in keras style scope.""" - wrapped_cell = wrapper(legacy_cells.BasicRNNCell(1)) - self.assertIsNone(getattr(wrapped_cell, "_keras_style", None)) - - @parameterized.parameters( - [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper]) - def testWrapperWeights(self, wrapper): - """Tests that wrapper weights contain wrapped cells weights.""" - base_cell = layers.SimpleRNNCell(1, name="basic_rnn_cell") - rnn_cell = wrapper(base_cell) - rnn_layer = layers.RNN(rnn_cell) - inputs = tf.convert_to_tensor([[[1]]], dtype=tf.float32) - rnn_layer(inputs) - - wrapper_name = generic_utils.to_snake_case(wrapper.__name__) - expected_weights = ["rnn/" + wrapper_name + "/" + var for var in - ("kernel:0", "recurrent_kernel:0", "bias:0")] - self.assertLen(rnn_cell.weights, 3) - self.assertCountEqual([v.name for v in rnn_cell.weights], expected_weights) - self.assertCountEqual([v.name for v in rnn_cell.trainable_variables], - expected_weights) - self.assertCountEqual([v.name for v in rnn_cell.non_trainable_variables], - []) - self.assertCountEqual([v.name for v in rnn_cell.cell.weights], - expected_weights) - - @parameterized.parameters( - [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper]) - def testWrapperV2Caller(self, wrapper): - """Tests that wrapper V2 is using the LayerRNNCell's caller.""" - - with legacy_base_layer.keras_style_scope(): - base_cell = legacy_cells.MultiRNNCell( - [legacy_cells.BasicRNNCell(1) for _ in range(2)]) - rnn_cell = wrapper(base_cell) - inputs = tf.convert_to_tensor([[1]], dtype=tf.float32) - state = tf.convert_to_tensor([[1]], dtype=tf.float32) - _ = rnn_cell(inputs, [state, state]) - weights = base_cell._cells[0].weights - self.assertLen(weights, expected_len=2) - self.assertTrue(all("_wrapper" in v.name for v in weights)) - - @parameterized.parameters( - [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper]) - def testWrapperV2Build(self, wrapper): - cell = legacy_cells.LSTMCell(10) - wrapper = wrapper(cell) - wrapper.build((1,)) - self.assertTrue(cell.built) - - def testDeviceWrapperSerialization(self): - wrapper_cls = cell_wrappers.DeviceWrapper - cell = layers.LSTMCell(10) - wrapper = wrapper_cls(cell, "/cpu:0") - config = wrapper.get_config() - - reconstructed_wrapper = wrapper_cls.from_config(config) - self.assertDictEqual(config, reconstructed_wrapper.get_config()) - self.assertIsInstance(reconstructed_wrapper, wrapper_cls) - - def testResidualWrapperSerialization(self): - wrapper_cls = cell_wrappers.ResidualWrapper - cell = layers.LSTMCell(10) - wrapper = wrapper_cls(cell) - config = wrapper.get_config() - - reconstructed_wrapper = wrapper_cls.from_config(config) - self.assertDictEqual(config, reconstructed_wrapper.get_config()) - self.assertIsInstance(reconstructed_wrapper, wrapper_cls) - - wrapper = wrapper_cls(cell, residual_fn=lambda i, o: i + i + o) - config = wrapper.get_config() - - reconstructed_wrapper = 
wrapper_cls.from_config(config) - # Assert the reconstructed function will perform the math correctly. - self.assertEqual(reconstructed_wrapper._residual_fn(1, 2), 4) - - def residual_fn(inputs, outputs): - return inputs * 3 + outputs - - wrapper = wrapper_cls(cell, residual_fn=residual_fn) - config = wrapper.get_config() - - reconstructed_wrapper = wrapper_cls.from_config(config) - # Assert the reconstructed function will perform the math correctly. - self.assertEqual(reconstructed_wrapper._residual_fn(1, 2), 5) - - def testDropoutWrapperSerialization(self): - wrapper_cls = cell_wrappers.DropoutWrapper - cell = layers.GRUCell(10) - wrapper = wrapper_cls(cell) - config = wrapper.get_config() - - reconstructed_wrapper = wrapper_cls.from_config(config) - self.assertDictEqual(config, reconstructed_wrapper.get_config()) - self.assertIsInstance(reconstructed_wrapper, wrapper_cls) - - wrapper = wrapper_cls(cell, dropout_state_filter_visitor=lambda s: True) - config = wrapper.get_config() - - reconstructed_wrapper = wrapper_cls.from_config(config) - self.assertTrue(reconstructed_wrapper._dropout_state_filter(None)) - - def dropout_state_filter_visitor(unused_state): - return False - - wrapper = wrapper_cls( - cell, dropout_state_filter_visitor=dropout_state_filter_visitor) - config = wrapper.get_config() - - reconstructed_wrapper = wrapper_cls.from_config(config) - self.assertFalse(reconstructed_wrapper._dropout_state_filter(None)) - - def testDropoutWrapperWithKerasLSTMCell(self): - wrapper_cls = cell_wrappers.DropoutWrapper - cell = layers.LSTMCell(10) - - with self.assertRaisesRegex(ValueError, "does not work with "): - wrapper_cls(cell) - - cell = layers.LSTMCellV2(10) - with self.assertRaisesRegex(ValueError, "does not work with "): - wrapper_cls(cell) + def testResidualWrapper(self): + wrapper_type = cell_wrappers.ResidualWrapper + x = tf.convert_to_tensor(np.array([[1.0, 1.0, 1.0]]), dtype="float32") + m = tf.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]), dtype="float32") + base_cell = legacy_cells.GRUCell( + 3, + kernel_initializer=tf.compat.v1.constant_initializer(0.5), + bias_initializer=tf.compat.v1.constant_initializer(0.5), + ) + g, m_new = base_cell(x, m) + wrapper_object = wrapper_type(base_cell) + self.assertDictEqual( + {"cell": base_cell}, wrapper_object._trackable_children() + ) + wrapper_object.get_config() # Should not throw an error + + g_res, m_new_res = wrapper_object(x, m) + self.evaluate([tf.compat.v1.global_variables_initializer()]) + res = self.evaluate([g, g_res, m_new, m_new_res]) + # Residual connections + self.assertAllClose(res[1], res[0] + [1.0, 1.0, 1.0]) + # States are left untouched + self.assertAllClose(res[2], res[3]) + + def testResidualWrapperWithSlice(self): + wrapper_type = cell_wrappers.ResidualWrapper + x = tf.convert_to_tensor( + np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]), dtype="float32" + ) + m = tf.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]), dtype="float32") + base_cell = legacy_cells.GRUCell( + 3, + kernel_initializer=tf.compat.v1.constant_initializer(0.5), + bias_initializer=tf.compat.v1.constant_initializer(0.5), + ) + g, m_new = base_cell(x, m) + + def residual_with_slice_fn(inp, out): + inp_sliced = tf.slice(inp, [0, 0], [-1, 3]) + return inp_sliced + out + + g_res, m_new_res = wrapper_type(base_cell, residual_with_slice_fn)(x, m) + self.evaluate([tf.compat.v1.global_variables_initializer()]) + res_g, res_g_res, res_m_new, res_m_new_res = self.evaluate( + [g, g_res, m_new, m_new_res] + ) + # Residual connections + self.assertAllClose(res_g_res, 
res_g + [1.0, 1.0, 1.0]) + # States are left untouched + self.assertAllClose(res_m_new, res_m_new_res) + + def testDeviceWrapper(self): + wrapper_type = cell_wrappers.DeviceWrapper + x = tf.zeros([1, 3]) + m = tf.zeros([1, 3]) + cell = legacy_cells.GRUCell(3) + wrapped_cell = wrapper_type(cell, "/cpu:0") + self.assertDictEqual({"cell": cell}, wrapped_cell._trackable_children()) + wrapped_cell.get_config() # Should not throw an error + + outputs, _ = wrapped_cell(x, m) + self.assertIn("cpu:0", outputs.device.lower()) + + @parameterized.parameters( + [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper] + ) + def testWrapperKerasStyle(self, wrapper): + """Tests if wrapper cell is instantiated in keras style scope.""" + wrapped_cell = wrapper(legacy_cells.BasicRNNCell(1)) + self.assertIsNone(getattr(wrapped_cell, "_keras_style", None)) + + @parameterized.parameters( + [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper] + ) + def testWrapperWeights(self, wrapper): + """Tests that wrapper weights contain wrapped cells weights.""" + base_cell = layers.SimpleRNNCell(1, name="basic_rnn_cell") + rnn_cell = wrapper(base_cell) + rnn_layer = layers.RNN(rnn_cell) + inputs = tf.convert_to_tensor([[[1]]], dtype=tf.float32) + rnn_layer(inputs) + + wrapper_name = generic_utils.to_snake_case(wrapper.__name__) + expected_weights = [ + "rnn/" + wrapper_name + "/" + var + for var in ("kernel:0", "recurrent_kernel:0", "bias:0") + ] + self.assertLen(rnn_cell.weights, 3) + self.assertCountEqual( + [v.name for v in rnn_cell.weights], expected_weights + ) + self.assertCountEqual( + [v.name for v in rnn_cell.trainable_variables], expected_weights + ) + self.assertCountEqual( + [v.name for v in rnn_cell.non_trainable_variables], [] + ) + self.assertCountEqual( + [v.name for v in rnn_cell.cell.weights], expected_weights + ) + + @parameterized.parameters( + [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper] + ) + def testWrapperV2Caller(self, wrapper): + """Tests that wrapper V2 is using the LayerRNNCell's caller.""" + + with legacy_base_layer.keras_style_scope(): + base_cell = legacy_cells.MultiRNNCell( + [legacy_cells.BasicRNNCell(1) for _ in range(2)] + ) + rnn_cell = wrapper(base_cell) + inputs = tf.convert_to_tensor([[1]], dtype=tf.float32) + state = tf.convert_to_tensor([[1]], dtype=tf.float32) + _ = rnn_cell(inputs, [state, state]) + weights = base_cell._cells[0].weights + self.assertLen(weights, expected_len=2) + self.assertTrue(all("_wrapper" in v.name for v in weights)) + + @parameterized.parameters( + [cell_wrappers.DropoutWrapper, cell_wrappers.ResidualWrapper] + ) + def testWrapperV2Build(self, wrapper): + cell = legacy_cells.LSTMCell(10) + wrapper = wrapper(cell) + wrapper.build((1,)) + self.assertTrue(cell.built) + + def testDeviceWrapperSerialization(self): + wrapper_cls = cell_wrappers.DeviceWrapper + cell = layers.LSTMCell(10) + wrapper = wrapper_cls(cell, "/cpu:0") + config = wrapper.get_config() + + reconstructed_wrapper = wrapper_cls.from_config(config) + self.assertDictEqual(config, reconstructed_wrapper.get_config()) + self.assertIsInstance(reconstructed_wrapper, wrapper_cls) + + def testResidualWrapperSerialization(self): + wrapper_cls = cell_wrappers.ResidualWrapper + cell = layers.LSTMCell(10) + wrapper = wrapper_cls(cell) + config = wrapper.get_config() + + reconstructed_wrapper = wrapper_cls.from_config(config) + self.assertDictEqual(config, reconstructed_wrapper.get_config()) + self.assertIsInstance(reconstructed_wrapper, wrapper_cls) + + wrapper = 
wrapper_cls(cell, residual_fn=lambda i, o: i + i + o) + config = wrapper.get_config() + + reconstructed_wrapper = wrapper_cls.from_config(config) + # Assert the reconstructed function will perform the math correctly. + self.assertEqual(reconstructed_wrapper._residual_fn(1, 2), 4) + + def residual_fn(inputs, outputs): + return inputs * 3 + outputs + + wrapper = wrapper_cls(cell, residual_fn=residual_fn) + config = wrapper.get_config() + + reconstructed_wrapper = wrapper_cls.from_config(config) + # Assert the reconstructed function will perform the math correctly. + self.assertEqual(reconstructed_wrapper._residual_fn(1, 2), 5) + + def testDropoutWrapperSerialization(self): + wrapper_cls = cell_wrappers.DropoutWrapper + cell = layers.GRUCell(10) + wrapper = wrapper_cls(cell) + config = wrapper.get_config() + + reconstructed_wrapper = wrapper_cls.from_config(config) + self.assertDictEqual(config, reconstructed_wrapper.get_config()) + self.assertIsInstance(reconstructed_wrapper, wrapper_cls) + + wrapper = wrapper_cls(cell, dropout_state_filter_visitor=lambda s: True) + config = wrapper.get_config() + + reconstructed_wrapper = wrapper_cls.from_config(config) + self.assertTrue(reconstructed_wrapper._dropout_state_filter(None)) + + def dropout_state_filter_visitor(unused_state): + return False + + wrapper = wrapper_cls( + cell, dropout_state_filter_visitor=dropout_state_filter_visitor + ) + config = wrapper.get_config() + + reconstructed_wrapper = wrapper_cls.from_config(config) + self.assertFalse(reconstructed_wrapper._dropout_state_filter(None)) + + def testDropoutWrapperWithKerasLSTMCell(self): + wrapper_cls = cell_wrappers.DropoutWrapper + cell = layers.LSTMCell(10) + + with self.assertRaisesRegex(ValueError, "does not work with "): + wrapper_cls(cell) + + cell = layers.LSTMCellV2(10) + with self.assertRaisesRegex(ValueError, "does not work with "): + wrapper_cls(cell) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/rnn/conv_lstm1d.py b/keras/layers/rnn/conv_lstm1d.py index b86eb9a4c1b7..96d3c2837416 100644 --- a/keras/layers/rnn/conv_lstm1d.py +++ b/keras/layers/rnn/conv_lstm1d.py @@ -13,172 +13,177 @@ # limitations under the License. # ============================================================================== """1D Convolutional LSTM layer.""" -# pylint: disable=g-classes-have-attributes,disable=g-direct-tensorflow-import + from keras.layers.rnn.base_conv_lstm import ConvLSTM +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ConvLSTM1D') +@keras_export("keras.layers.ConvLSTM1D") class ConvLSTM1D(ConvLSTM): - """1D Convolutional LSTM. + """1D Convolutional LSTM. - Similar to an LSTM layer, but the input transformations - and recurrent transformations are both convolutional. + Similar to an LSTM layer, but the input transformations + and recurrent transformations are both convolutional. - Args: - filters: Integer, the dimensionality of the output space (i.e. the number of - output filters in the convolution). - kernel_size: An integer or tuple/list of n integers, specifying the - dimensions of the convolution window. - strides: An integer or tuple/list of n integers, specifying the strides of - the convolution. Specifying any stride value != 1 is incompatible with - specifying any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means no - padding. 
`"same"` results in padding evenly to the left/right or up/down - of the input such that output has the same height/width dimension as the - input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. `channels_last` corresponds - to inputs with shape `(batch, time, ..., channels)` while `channels_first` - corresponds to inputs with shape `(batch, time, channels, ...)`. It - defaults to the `image_data_format` value found in your Keras config file - at `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". - dilation_rate: An integer or tuple/list of n integers, specifying the - dilation rate to use for dilated convolution. Currently, specifying any - `dilation_rate` value != 1 is incompatible with specifying any `strides` - value != 1. - activation: Activation function to use. By default hyperbolic tangent - activation function is applied (`tanh(x)`). - recurrent_activation: Activation function to use for the recurrent step. - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at - initialization. Use in combination with `bias_initializer="zeros"`. This - is recommended in [Jozefowicz et al., 2015]( + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of output filters in the convolution). + kernel_size: An integer or tuple/list of n integers, specifying the + dimensions of the convolution window. + strides: An integer or tuple/list of n integers, specifying the strides of + the convolution. Specifying any stride value != 1 is incompatible with + specifying any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means + no padding. `"same"` results in padding evenly to the left/right or + up/down of the input such that output has the same height/width + dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch, time, ..., + channels)` while `channels_first` corresponds to inputs with shape + `(batch, time, channels, ...)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: An integer or tuple/list of n integers, specifying the + dilation rate to use for dilated convolution. Currently, specifying any + `dilation_rate` value != 1 is incompatible with specifying any `strides` + value != 1. + activation: Activation function to use. By default hyperbolic tangent + activation function is applied (`tanh(x)`). + recurrent_activation: Activation function to use for the recurrent step. + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, used for + the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. 
+    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate
+      at initialization. Use in combination with `bias_initializer="zeros"`.
+      This is recommended in [Jozefowicz et al., 2015](
        http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
-    kernel_regularizer: Regularizer function applied to the `kernel` weights
-      matrix.
-    recurrent_regularizer: Regularizer function applied to the
-      `recurrent_kernel` weights matrix.
-    bias_regularizer: Regularizer function applied to the bias vector.
-    activity_regularizer: Regularizer function applied to.
-    kernel_constraint: Constraint function applied to the `kernel` weights
-      matrix.
-    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
-      weights matrix.
-    bias_constraint: Constraint function applied to the bias vector.
-    return_sequences: Boolean. Whether to return the last output in the output
-      sequence, or the full sequence. (default False)
-    return_state: Boolean Whether to return the last state in addition to the
-      output. (default False)
-    go_backwards: Boolean (default False). If True, process the input sequence
-      backwards.
-    stateful: Boolean (default False). If True, the last state for each sample
-      at index i in a batch will be used as initial state for the sample of
-      index i in the following batch.
-    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
-      transformation of the inputs.
-    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
-      the linear transformation of the recurrent state.
-  Call arguments:
-    inputs: A 4D tensor.
-    mask: Binary tensor of shape `(samples, timesteps)` indicating whether a
-      given timestep should be masked.
-    training: Python boolean indicating whether the layer should behave in
-      training mode or in inference mode. This argument is passed to the cell
-      when calling it. This is only relevant if `dropout` or `recurrent_dropout`
-      are set.
-    initial_state: List of initial state tensors to be passed to the first call
-      of the cell.
-  Input shape: - If data_format='channels_first'
-      4D tensor with shape: `(samples, time, channels, rows)` - If
-      data_format='channels_last'
-      4D tensor with shape: `(samples, time, rows, channels)`
-  Output shape:
-    - If `return_state`: a list of tensors. The first tensor is the output. The
-      remaining tensors are the last states,
-      each 3D tensor with shape: `(samples, filters, new_rows)` if
+    kernel_regularizer: Regularizer function applied to the `kernel` weights
+      matrix.
+    recurrent_regularizer: Regularizer function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to the output of the
+      layer (its "activation").
+    kernel_constraint: Constraint function applied to the `kernel` weights
+      matrix.
+    recurrent_constraint: Constraint function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    return_sequences: Boolean. Whether to return the last output in the output
+      sequence, or the full sequence. (default False)
+    return_state: Boolean. Whether to return the last state in addition to the
+      output. (default False)
+    go_backwards: Boolean (default False). If True, process the input sequence
+      backwards.
+    stateful: Boolean (default False). If True, the last state for each sample
+      at index i in a batch will be used as initial state for the sample of
+      index i in the following batch.
+    dropout: Float between 0 and 1. Fraction of the units to drop for the
+      linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop
+      for the linear transformation of the recurrent state.
+  Call arguments:
+    inputs: A 4D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether a
+      given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` are set.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
+  Input shape:
+    - If data_format='channels_first': 4D tensor with shape
+      `(samples, time, channels, rows)`
+    - If data_format='channels_last': 4D tensor with shape
+      `(samples, time, rows, channels)`
+  Output shape:
+    - If `return_state`: a list of tensors. The first tensor is the output.
+      The remaining tensors are the last states,
+      each 3D tensor with shape: `(samples, filters, new_rows)` if
+      data_format='channels_first'
+      or shape: `(samples, new_rows, filters)` if data_format='channels_last'.
+      `rows` values might have changed due to padding.
+    - If `return_sequences`: 4D tensor with shape: `(samples, timesteps,
+      filters, new_rows)` if data_format='channels_first'
+      or shape: `(samples, timesteps, new_rows, filters)` if
+      data_format='channels_last'.
+    - Else, 3D tensor with shape: `(samples, filters, new_rows)` if
      data_format='channels_first'
-      or shape: `(samples, new_rows, filters)` if data_format='channels_last'.
-      `rows` values might have changed due to padding.
-    - If `return_sequences`: 4D tensor with shape: `(samples, timesteps,
-      filters, new_rows)` if data_format='channels_first'
-      or shape: `(samples, timesteps, new_rows, filters)` if
-      data_format='channels_last'.
-    - Else, 3D tensor with shape: `(samples, filters, new_rows)` if
-      data_format='channels_first'
-      or shape: `(samples, new_rows, filters)` if data_format='channels_last'.
+      or shape: `(samples, new_rows, filters)` if data_format='channels_last'.
-  Raises:
-    ValueError: in case of invalid constructor arguments.
+  Raises:
+    ValueError: in case of invalid constructor arguments.
-  References:
-    - [Shi et al., 2015](http://arxiv.org/abs/1506.04214v1)
-    (the current implementation does not include the feedback loop on the
-    cells output).
-  """
+  References:
+    - [Shi et al., 2015](http://arxiv.org/abs/1506.04214v1)
+    (the current implementation does not include the feedback loop on the
+    cells output).
+ """ - def __init__(self, - filters, - kernel_size, - strides=1, - padding='valid', - data_format=None, - dilation_rate=1, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - dropout=0.0, - recurrent_dropout=0.0, - **kwargs): - super().__init__( - rank=1, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - unit_forget_bias=unit_forget_bias, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - **kwargs) + def __init__( + self, + filters, + kernel_size, + strides=1, + padding="valid", + data_format=None, + dilation_rate=1, + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + dropout=0.0, + recurrent_dropout=0.0, + **kwargs + ): + super().__init__( + rank=1, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + unit_forget_bias=unit_forget_bias, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + **kwargs + ) diff --git a/keras/layers/rnn/conv_lstm2d.py b/keras/layers/rnn/conv_lstm2d.py index e559097dda4b..668c9da5e4a9 100644 --- a/keras/layers/rnn/conv_lstm2d.py +++ b/keras/layers/rnn/conv_lstm2d.py @@ -13,174 +13,179 @@ # limitations under the License. 
# ============================================================================== """2D Convolutional LSTM layer.""" -# pylint: disable=g-classes-have-attributes,disable=g-direct-tensorflow-import + from keras.layers.rnn.base_conv_lstm import ConvLSTM +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ConvLSTM2D') +@keras_export("keras.layers.ConvLSTM2D") class ConvLSTM2D(ConvLSTM): - """2D Convolutional LSTM. + """2D Convolutional LSTM. - Similar to an LSTM layer, but the input transformations - and recurrent transformations are both convolutional. + Similar to an LSTM layer, but the input transformations + and recurrent transformations are both convolutional. - Args: - filters: Integer, the dimensionality of the output space (i.e. the number of - output filters in the convolution). - kernel_size: An integer or tuple/list of n integers, specifying the - dimensions of the convolution window. - strides: An integer or tuple/list of n integers, specifying the strides of - the convolution. Specifying any stride value != 1 is incompatible with - specifying any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means no - padding. `"same"` results in padding evenly to the left/right or up/down - of the input such that output has the same height/width dimension as the - input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. `channels_last` corresponds - to inputs with shape `(batch, time, ..., channels)` while `channels_first` - corresponds to inputs with shape `(batch, time, channels, ...)`. It - defaults to the `image_data_format` value found in your Keras config file - at `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". - dilation_rate: An integer or tuple/list of n integers, specifying the - dilation rate to use for dilated convolution. Currently, specifying any - `dilation_rate` value != 1 is incompatible with specifying any `strides` - value != 1. - activation: Activation function to use. By default hyperbolic tangent - activation function is applied (`tanh(x)`). - recurrent_activation: Activation function to use for the recurrent step. - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at - initialization. Use in combination with `bias_initializer="zeros"`. This - is recommended in [Jozefowicz et al., 2015]( + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of output filters in the convolution). + kernel_size: An integer or tuple/list of n integers, specifying the + dimensions of the convolution window. + strides: An integer or tuple/list of n integers, specifying the strides of + the convolution. Specifying any stride value != 1 is incompatible with + specifying any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means + no padding. `"same"` results in padding evenly to the left/right or + up/down of the input such that output has the same height/width + dimension as the input. 
+ data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch, time, ..., + channels)` while `channels_first` corresponds to inputs with shape + `(batch, time, channels, ...)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: An integer or tuple/list of n integers, specifying the + dilation rate to use for dilated convolution. Currently, specifying any + `dilation_rate` value != 1 is incompatible with specifying any `strides` + value != 1. + activation: Activation function to use. By default hyperbolic tangent + activation function is applied (`tanh(x)`). + recurrent_activation: Activation function to use for the recurrent step. + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, used for + the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate + at initialization. Use in combination with `bias_initializer="zeros"`. + This is recommended in [Jozefowicz et al., 2015]( http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. - bias_constraint: Constraint function applied to the bias vector. - return_sequences: Boolean. Whether to return the last output in the output - sequence, or the full sequence. (default False) - return_state: Boolean Whether to return the last state in addition to the - output. (default False) - go_backwards: Boolean (default False). If True, process the input sequence - backwards. - stateful: Boolean (default False). If True, the last state for each sample - at index i in a batch will be used as initial state for the sample of - index i in the following batch. - dropout: Float between 0 and 1. Fraction of the units to drop for the linear - transformation of the inputs. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. - Call arguments: - inputs: A 5D tensor. - mask: Binary tensor of shape `(samples, timesteps)` indicating whether a - given timestep should be masked. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is only relevant if `dropout` or `recurrent_dropout` - are set. - initial_state: List of initial state tensors to be passed to the first call - of the cell. 
-  Input shape: - If data_format='channels_first'
-      5D tensor with shape: `(samples, time, channels, rows, cols)` - If
-      data_format='channels_last'
-      5D tensor with shape: `(samples, time, rows, cols, channels)`
-  Output shape:
-    - If `return_state`: a list of tensors. The first tensor is the output. The
-      remaining tensors are the last states,
-      each 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if
+    kernel_regularizer: Regularizer function applied to the `kernel` weights
+      matrix.
+    recurrent_regularizer: Regularizer function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to the output of the
+      layer (its "activation").
+    kernel_constraint: Constraint function applied to the `kernel` weights
+      matrix.
+    recurrent_constraint: Constraint function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    return_sequences: Boolean. Whether to return the last output in the output
+      sequence, or the full sequence. (default False)
+    return_state: Boolean. Whether to return the last state in addition to the
+      output. (default False)
+    go_backwards: Boolean (default False). If True, process the input sequence
+      backwards.
+    stateful: Boolean (default False). If True, the last state for each sample
+      at index i in a batch will be used as initial state for the sample of
+      index i in the following batch.
+    dropout: Float between 0 and 1. Fraction of the units to drop for the
+      linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop
+      for the linear transformation of the recurrent state.
+  Call arguments:
+    inputs: A 5D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether a
+      given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` are set.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
+  Input shape:
+    - If data_format='channels_first': 5D tensor with shape
+      `(samples, time, channels, rows, cols)`
+    - If data_format='channels_last': 5D tensor with shape
+      `(samples, time, rows, cols, channels)`
+  Output shape:
+    - If `return_state`: a list of tensors. The first tensor is the output.
+      The remaining tensors are the last states,
+      each 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if
+      data_format='channels_first'
+      or shape: `(samples, new_rows, new_cols, filters)` if
+      data_format='channels_last'. `rows` and `cols` values might have
+      changed due to padding.
+    - If `return_sequences`: 5D tensor with shape: `(samples, timesteps,
+      filters, new_rows, new_cols)` if data_format='channels_first'
+      or shape: `(samples, timesteps, new_rows, new_cols, filters)` if
+      data_format='channels_last'.
+    - Else, 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if
      data_format='channels_first'
- - Else, 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if - data_format='channels_first' - or shape: `(samples, new_rows, new_cols, filters)` if - data_format='channels_last'. + or shape: `(samples, new_rows, new_cols, filters)` if + data_format='channels_last'. - Raises: - ValueError: in case of invalid constructor arguments. + Raises: + ValueError: in case of invalid constructor arguments. - References: - - [Shi et al., 2015](http://arxiv.org/abs/1506.04214v1) - (the current implementation does not include the feedback loop on the - cells output). - """ + References: + - [Shi et al., 2015](http://arxiv.org/abs/1506.04214v1) + (the current implementation does not include the feedback loop on the + cells output). + """ - def __init__(self, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1), - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - dropout=0.0, - recurrent_dropout=0.0, - **kwargs): - super().__init__( - rank=2, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - unit_forget_bias=unit_forget_bias, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - **kwargs) + def __init__( + self, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1), + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + dropout=0.0, + recurrent_dropout=0.0, + **kwargs + ): + super().__init__( + rank=2, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + unit_forget_bias=unit_forget_bias, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + 
activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + **kwargs + ) diff --git a/keras/layers/rnn/conv_lstm3d.py b/keras/layers/rnn/conv_lstm3d.py index 76e490dbc74b..1488faae72c5 100644 --- a/keras/layers/rnn/conv_lstm3d.py +++ b/keras/layers/rnn/conv_lstm3d.py @@ -13,174 +13,179 @@ # limitations under the License. # ============================================================================== """3D Convolutional LSTM layer.""" -# pylint: disable=g-classes-have-attributes,disable=g-direct-tensorflow-import + from keras.layers.rnn.base_conv_lstm import ConvLSTM +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.ConvLSTM3D') +@keras_export("keras.layers.ConvLSTM3D") class ConvLSTM3D(ConvLSTM): - """3D Convolutional LSTM. + """3D Convolutional LSTM. - Similar to an LSTM layer, but the input transformations - and recurrent transformations are both convolutional. + Similar to an LSTM layer, but the input transformations + and recurrent transformations are both convolutional. - Args: - filters: Integer, the dimensionality of the output space (i.e. the number of - output filters in the convolution). - kernel_size: An integer or tuple/list of n integers, specifying the - dimensions of the convolution window. - strides: An integer or tuple/list of n integers, specifying the strides of - the convolution. Specifying any stride value != 1 is incompatible with - specifying any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means no - padding. `"same"` results in padding evenly to the left/right or up/down - of the input such that output has the same height/width dimension as the - input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. `channels_last` corresponds - to inputs with shape `(batch, time, ..., channels)` while `channels_first` - corresponds to inputs with shape `(batch, time, channels, ...)`. It - defaults to the `image_data_format` value found in your Keras config file - at `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". - dilation_rate: An integer or tuple/list of n integers, specifying the - dilation rate to use for dilated convolution. Currently, specifying any - `dilation_rate` value != 1 is incompatible with specifying any `strides` - value != 1. - activation: Activation function to use. By default hyperbolic tangent - activation function is applied (`tanh(x)`). - recurrent_activation: Activation function to use for the recurrent step. - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at - initialization. Use in combination with `bias_initializer="zeros"`. This - is recommended in [Jozefowicz et al., 2015]( + Args: + filters: Integer, the dimensionality of the output space (i.e. 
the number + of output filters in the convolution). + kernel_size: An integer or tuple/list of n integers, specifying the + dimensions of the convolution window. + strides: An integer or tuple/list of n integers, specifying the strides of + the convolution. Specifying any stride value != 1 is incompatible with + specifying any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). `"valid"` means + no padding. `"same"` results in padding evenly to the left/right or + up/down of the input such that output has the same height/width + dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape `(batch, time, ..., + channels)` while `channels_first` corresponds to inputs with shape + `(batch, time, channels, ...)`. When unspecified, uses + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json` (if exists) else 'channels_last'. + Defaults to 'channels_last'. + dilation_rate: An integer or tuple/list of n integers, specifying the + dilation rate to use for dilated convolution. Currently, specifying any + `dilation_rate` value != 1 is incompatible with specifying any `strides` + value != 1. + activation: Activation function to use. By default hyperbolic tangent + activation function is applied (`tanh(x)`). + recurrent_activation: Activation function to use for the recurrent step. + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, used for + the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate + at initialization. Use in combination with `bias_initializer="zeros"`. + This is recommended in [Jozefowicz et al., 2015]( http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. - bias_constraint: Constraint function applied to the bias vector. - return_sequences: Boolean. Whether to return the last output in the output - sequence, or the full sequence. (default False) - return_state: Boolean Whether to return the last state in addition to the - output. (default False) - go_backwards: Boolean (default False). If True, process the input sequence - backwards. - stateful: Boolean (default False). If True, the last state for each sample - at index i in a batch will be used as initial state for the sample of - index i in the following batch. - dropout: Float between 0 and 1. Fraction of the units to drop for the linear - transformation of the inputs. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. - Call arguments: - inputs: A 6D tensor. 
- mask: Binary tensor of shape `(samples, timesteps)` indicating whether a - given timestep should be masked. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is only relevant if `dropout` or `recurrent_dropout` - are set. - initial_state: List of initial state tensors to be passed to the first call - of the cell. - Input shape: - If data_format='channels_first' - 6D tensor with shape: `(samples, time, channels, rows, cols, depth)` - - If data_format='channels_last' - 5D tensor with shape: `(samples, time, rows, cols, depth, channels)` - Output shape: - - If `return_state`: a list of tensors. The first tensor is the output. The - remaining tensors are the last states, - each 5D tensor with shape: `(samples, filters, new_rows, new_cols, + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + return_sequences: Boolean. Whether to return the last output in the output + sequence, or the full sequence. (default False) + return_state: Boolean. Whether to return the last state in addition to the + output. (default False) + go_backwards: Boolean (default False). If True, process the input sequence + backwards. + stateful: Boolean (default False). If True, the last state for each sample + at index i in a batch will be used as initial state for the sample of + index i in the following batch. + dropout: Float between 0 and 1. Fraction of the units to drop for the + linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. Fraction of the units to drop + for the linear transformation of the recurrent state. + Call arguments: + inputs: A 6D tensor. + mask: Binary tensor of shape `(samples, timesteps)` indicating whether a + given timestep should be masked. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is only relevant if `dropout` or + `recurrent_dropout` are set. + initial_state: List of initial state tensors to be passed to the first + call of the cell. + Input shape: + - If data_format='channels_first' + 6D tensor with shape: `(samples, time, channels, rows, cols, depth)` + - If data_format='channels_last' + 6D tensor with shape: `(samples, time, rows, cols, depth, channels)` + Output shape: + - If `return_state`: a list of tensors. The first tensor is the output. + The remaining tensors are the last states, + each 5D tensor with shape: `(samples, filters, new_rows, new_cols, + new_depth)` if data_format='channels_first' + or shape: `(samples, new_rows, new_cols, new_depth, filters)` if + data_format='channels_last'. `rows`, `cols`, and `depth` values might + have changed due to padding. + - If `return_sequences`: 6D tensor with shape: `(samples, timesteps, + filters, new_rows, new_cols, new_depth)` if data_format='channels_first' + or shape: `(samples, timesteps, new_rows, new_cols, new_depth, filters)` + if data_format='channels_last'.
+ - Else, 5D tensor with shape: `(samples, filters, new_rows, new_cols, new_depth)` if data_format='channels_first' - or shape: `(samples, new_rows, new_cols, new_depth, filters)` if - data_format='channels_last'. `rows`, `cols`, and `depth` values might - have changed due to padding. - - If `return_sequences`: 6D tensor with shape: `(samples, timesteps, - filters, new_rows, new_cols, new_depth)` if data_format='channels_first' - or shape: `(samples, timesteps, new_rows, new_cols, new_depth, filters)` - if data_format='channels_last'. - - Else, 5D tensor with shape: `(samples, filters, new_rows, new_cols, - new_depth)` if data_format='channels_first' - or shape: `(samples, new_rows, new_cols, new_depth, filters)` if - data_format='channels_last'. + or shape: `(samples, new_rows, new_cols, new_depth, filters)` if + data_format='channels_last'. - Raises: - ValueError: in case of invalid constructor arguments. + Raises: + ValueError: in case of invalid constructor arguments. - References: - - [Shi et al., 2015](http://arxiv.org/abs/1506.04214v1) - (the current implementation does not include the feedback loop on the - cells output). - """ + References: + - [Shi et al., 2015](http://arxiv.org/abs/1506.04214v1) + (the current implementation does not include the feedback loop on the + cell's output). + """ - def __init__(self, - filters, - kernel_size, - strides=(1, 1, 1), - padding='valid', - data_format=None, - dilation_rate=(1, 1, 1), - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - dropout=0.0, - recurrent_dropout=0.0, - **kwargs): - super().__init__( - rank=3, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - unit_forget_bias=unit_forget_bias, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - **kwargs) + def __init__( + self, + filters, + kernel_size, + strides=(1, 1, 1), + padding="valid", + data_format=None, + dilation_rate=(1, 1, 1), + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + dropout=0.0, + recurrent_dropout=0.0,
+ **kwargs + ): + super().__init__( + rank=3, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + unit_forget_bias=unit_forget_bias, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + **kwargs + ) diff --git a/keras/layers/rnn/conv_lstm_test.py b/keras/layers/rnn/conv_lstm_test.py index 707d4b8b3b22..d8dfdeda2bfe 100644 --- a/keras/layers/rnn/conv_lstm_test.py +++ b/keras/layers/rnn/conv_lstm_test.py @@ -14,334 +14,406 @@ # ============================================================================== """Tests for convolutional recurrent layers.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class ConvLSTM1DTest(test_combinations.TestCase): - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - data_format=['channels_first', 'channels_last'], - return_sequences=[True, False])) - def test_conv_lstm(self, data_format, return_sequences): - num_row = 3 - filters = 3 - num_samples = 1 - input_channel = 2 - input_num_row = 5 - sequence_len = 2 - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, sequence_len, input_channel, - input_num_row) - else: - inputs = np.random.rand(num_samples, sequence_len, input_num_row, - input_channel) - - # test for return state: - x = keras.Input(batch_shape=inputs.shape) - kwargs = { - 'data_format': data_format, - 'return_sequences': return_sequences, - 'return_state': True, - 'stateful': True, - 'filters': filters, - 'kernel_size': num_row, - 'padding': 'valid', - } - layer = keras.layers.ConvLSTM1D(**kwargs) - layer.build(inputs.shape) - outputs = layer(x) - _, states = outputs[0], outputs[1:] - self.assertEqual(len(states), 2) - model = keras.models.Model(x, states[0]) - - state = model.predict(inputs) - - self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4) - - # test for output shape: - test_utils.layer_test( - keras.layers.ConvLSTM1D, - kwargs={ - 'data_format': data_format, - 'return_sequences': return_sequences, - 'filters': filters, - 'kernel_size': num_row, - 'padding': 'valid' - }, - input_shape=inputs.shape) + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + data_format=["channels_first", "channels_last"], + return_sequences=[True, False], + ) + ) + def test_conv_lstm(self, data_format, return_sequences): + num_row = 3 + filters = 3 + num_samples = 1 + input_channel = 2 + input_num_row = 5 + sequence_len = 2 + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, sequence_len, input_channel, input_num_row + ) + else: + inputs = np.random.rand( + 
num_samples, sequence_len, input_num_row, input_channel + ) + + # test for return state: + x = keras.Input(batch_shape=inputs.shape) + kwargs = { + "data_format": data_format, + "return_sequences": return_sequences, + "return_state": True, + "stateful": True, + "filters": filters, + "kernel_size": num_row, + "padding": "valid", + } + layer = keras.layers.ConvLSTM1D(**kwargs) + layer.build(inputs.shape) + outputs = layer(x) + _, states = outputs[0], outputs[1:] + self.assertEqual(len(states), 2) + model = keras.models.Model(x, states[0]) + + state = model.predict(inputs) + + self.assertAllClose( + keras.backend.eval(layer.states[0]), state, atol=1e-4 + ) + + # test for output shape: + test_utils.layer_test( + keras.layers.ConvLSTM1D, + kwargs={ + "data_format": data_format, + "return_sequences": return_sequences, + "filters": filters, + "kernel_size": num_row, + "padding": "valid", + }, + input_shape=inputs.shape, + ) @test_combinations.run_all_keras_modes class ConvLSTM2DTest(test_combinations.TestCase): - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - data_format=['channels_first', 'channels_last'], - return_sequences=[True, False])) - def test_conv_lstm(self, data_format, return_sequences): - num_row = 3 - num_col = 3 - filters = 2 - num_samples = 1 - input_channel = 2 - input_num_row = 5 - input_num_col = 5 - sequence_len = 2 - if data_format == 'channels_first': - inputs = np.random.rand(num_samples, sequence_len, - input_channel, - input_num_row, input_num_col) - else: - inputs = np.random.rand(num_samples, sequence_len, - input_num_row, input_num_col, - input_channel) - - # test for return state: - x = keras.Input(batch_shape=inputs.shape) - kwargs = {'data_format': data_format, - 'return_sequences': return_sequences, - 'return_state': True, - 'stateful': True, - 'filters': filters, - 'kernel_size': (num_row, num_col), - 'padding': 'valid'} - layer = keras.layers.ConvLSTM2D(**kwargs) - layer.build(inputs.shape) - outputs = layer(x) - _, states = outputs[0], outputs[1:] - self.assertEqual(len(states), 2) - model = keras.models.Model(x, states[0]) - state = model.predict(inputs) - - self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4) - - # test for output shape: - test_utils.layer_test( - keras.layers.ConvLSTM2D, - kwargs={'data_format': data_format, - 'return_sequences': return_sequences, - 'filters': filters, - 'kernel_size': (num_row, num_col), - 'padding': 'valid'}, - input_shape=inputs.shape) - - def test_conv_lstm_statefulness(self): - # Tests for statefulness - num_row = 3 - num_col = 3 - filters = 2 - num_samples = 1 - input_channel = 2 - input_num_row = 5 - input_num_col = 5 - sequence_len = 2 - inputs = np.random.rand(num_samples, sequence_len, - input_num_row, input_num_col, - input_channel) - - with self.cached_session(): - model = keras.models.Sequential() - kwargs = {'data_format': 'channels_last', - 'return_sequences': False, - 'filters': filters, - 'kernel_size': (num_row, num_col), - 'stateful': True, - 'batch_input_shape': inputs.shape, - 'padding': 'same'} - layer = keras.layers.ConvLSTM2D(**kwargs) - - model.add(layer) - model.compile(optimizer='sgd', loss='mse') - out1 = model.predict(np.ones_like(inputs)) - - # train once so that the states change - model.train_on_batch(np.ones_like(inputs), np.random.random(out1.shape)) - out2 = model.predict(np.ones_like(inputs)) - - # if the state is not reset, output should be different - self.assertNotEqual(out1.max(), out2.max()) - - # check that output 
changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones_like(inputs)) - self.assertNotEqual(out3.max(), out2.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones_like(inputs)) - self.assertAllClose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones_like(inputs)) - self.assertNotEqual(out4.max(), out5.max()) - - def test_conv_lstm_regularizers(self): - # check regularizers - num_row = 3 - num_col = 3 - filters = 2 - num_samples = 1 - input_channel = 2 - input_num_row = 5 - input_num_col = 5 - sequence_len = 2 - inputs = np.random.rand(num_samples, sequence_len, - input_num_row, input_num_col, - input_channel) - - with self.cached_session(): - kwargs = {'data_format': 'channels_last', - 'return_sequences': False, - 'kernel_size': (num_row, num_col), - 'stateful': True, - 'filters': filters, - 'batch_input_shape': inputs.shape, - 'kernel_regularizer': keras.regularizers.L1L2(l1=0.01), - 'recurrent_regularizer': keras.regularizers.L1L2(l1=0.01), - 'activity_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'kernel_constraint': 'max_norm', - 'recurrent_constraint': 'max_norm', - 'bias_constraint': 'max_norm', - 'padding': 'same'} - - layer = keras.layers.ConvLSTM2D(**kwargs) - layer.build(inputs.shape) - self.assertEqual(len(layer.losses), 3) - layer(keras.backend.variable(np.ones(inputs.shape))) - self.assertEqual(len(layer.losses), 4) - - def test_conv_lstm_dropout(self): - # check dropout - with self.cached_session(): - test_utils.layer_test( - keras.layers.ConvLSTM2D, - kwargs={'data_format': 'channels_last', - 'return_sequences': False, - 'filters': 2, - 'kernel_size': (3, 3), - 'padding': 'same', - 'dropout': 0.1, - 'recurrent_dropout': 0.1}, - input_shape=(1, 2, 5, 5, 2)) - - def test_conv_lstm_cloning(self): - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.ConvLSTM2D(5, 3, input_shape=(None, 5, 5, 3))) - - test_inputs = np.random.random((2, 4, 5, 5, 3)) - reference_outputs = model.predict(test_inputs) - weights = model.get_weights() - - # Use a new graph to clone the model - with self.cached_session(): - clone = keras.models.clone_model(model) - clone.set_weights(weights) - - outputs = clone.predict(test_inputs) - self.assertAllClose(reference_outputs, outputs, atol=1e-5) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping the test as OOM occurred with 1 GB budget.') - def test_conv_lstm_with_initial_state(self): - num_samples = 32 - sequence_len = 5 - encoder_inputs = keras.layers.Input((None, 32, 32, 3)) - encoder = keras.layers.ConvLSTM2D( - filters=32, kernel_size=(3, 3), padding='same', - return_sequences=False, return_state=True) - _, state_h, state_c = encoder(encoder_inputs) - encoder_states = [state_h, state_c] - - decoder_inputs = keras.layers.Input((None, 32, 32, 4)) - decoder_lstm = keras.layers.ConvLSTM2D( - filters=32, kernel_size=(3, 3), padding='same', - return_sequences=False, return_state=False) - decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states) - output = keras.layers.Conv2D( - 1, (3, 3), padding='same', activation='relu')(decoder_outputs) - model = keras.Model([encoder_inputs, decoder_inputs], output) - - model.compile( - optimizer='sgd', loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - x_1 = np.random.rand(num_samples, sequence_len, 32, 32, 3) 
- x_2 = np.random.rand(num_samples, sequence_len, 32, 32, 4) - y = np.random.rand(num_samples, 32, 32, 1) - model.fit([x_1, x_2], y) - - model.predict([x_1, x_2]) + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + data_format=["channels_first", "channels_last"], + return_sequences=[True, False], + ) + ) + def test_conv_lstm(self, data_format, return_sequences): + num_row = 3 + num_col = 3 + filters = 2 + num_samples = 1 + input_channel = 2 + input_num_row = 5 + input_num_col = 5 + sequence_len = 2 + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, + sequence_len, + input_channel, + input_num_row, + input_num_col, + ) + else: + inputs = np.random.rand( + num_samples, + sequence_len, + input_num_row, + input_num_col, + input_channel, + ) + + # test for return state: + x = keras.Input(batch_shape=inputs.shape) + kwargs = { + "data_format": data_format, + "return_sequences": return_sequences, + "return_state": True, + "stateful": True, + "filters": filters, + "kernel_size": (num_row, num_col), + "padding": "valid", + } + layer = keras.layers.ConvLSTM2D(**kwargs) + layer.build(inputs.shape) + outputs = layer(x) + _, states = outputs[0], outputs[1:] + self.assertEqual(len(states), 2) + model = keras.models.Model(x, states[0]) + state = model.predict(inputs) + + self.assertAllClose( + keras.backend.eval(layer.states[0]), state, atol=1e-4 + ) + + # test for output shape: + test_utils.layer_test( + keras.layers.ConvLSTM2D, + kwargs={ + "data_format": data_format, + "return_sequences": return_sequences, + "filters": filters, + "kernel_size": (num_row, num_col), + "padding": "valid", + }, + input_shape=inputs.shape, + ) + + def test_conv_lstm_statefulness(self): + # Tests for statefulness + num_row = 3 + num_col = 3 + filters = 2 + num_samples = 1 + input_channel = 2 + input_num_row = 5 + input_num_col = 5 + sequence_len = 2 + inputs = np.random.rand( + num_samples, + sequence_len, + input_num_row, + input_num_col, + input_channel, + ) + + with self.cached_session(): + model = keras.models.Sequential() + kwargs = { + "data_format": "channels_last", + "return_sequences": False, + "filters": filters, + "kernel_size": (num_row, num_col), + "stateful": True, + "batch_input_shape": inputs.shape, + "padding": "same", + } + layer = keras.layers.ConvLSTM2D(**kwargs) + + model.add(layer) + model.compile(optimizer="sgd", loss="mse") + out1 = model.predict(np.ones_like(inputs)) + + # train once so that the states change + model.train_on_batch( + np.ones_like(inputs), np.random.random(out1.shape) + ) + out2 = model.predict(np.ones_like(inputs)) + + # if the state is not reset, output should be different + self.assertNotEqual(out1.max(), out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones_like(inputs)) + self.assertNotEqual(out3.max(), out2.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones_like(inputs)) + self.assertAllClose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones_like(inputs)) + self.assertNotEqual(out4.max(), out5.max()) + + def test_conv_lstm_regularizers(self): + # check regularizers + num_row = 3 + num_col = 3 + filters = 2 + num_samples = 1 + input_channel = 2 + input_num_row = 5 + input_num_col = 5 + sequence_len = 2 + inputs = np.random.rand( + num_samples, + sequence_len, + 
input_num_row, + input_num_col, + input_channel, + ) + + with self.cached_session(): + kwargs = { + "data_format": "channels_last", + "return_sequences": False, + "kernel_size": (num_row, num_col), + "stateful": True, + "filters": filters, + "batch_input_shape": inputs.shape, + "kernel_regularizer": keras.regularizers.L1L2(l1=0.01), + "recurrent_regularizer": keras.regularizers.L1L2(l1=0.01), + "activity_regularizer": "l2", + "bias_regularizer": "l2", + "kernel_constraint": "max_norm", + "recurrent_constraint": "max_norm", + "bias_constraint": "max_norm", + "padding": "same", + } + + layer = keras.layers.ConvLSTM2D(**kwargs) + layer.build(inputs.shape) + self.assertEqual(len(layer.losses), 3) + layer(keras.backend.variable(np.ones(inputs.shape))) + self.assertEqual(len(layer.losses), 4) + + def test_conv_lstm_dropout(self): + # check dropout + with self.cached_session(): + test_utils.layer_test( + keras.layers.ConvLSTM2D, + kwargs={ + "data_format": "channels_last", + "return_sequences": False, + "filters": 2, + "kernel_size": (3, 3), + "padding": "same", + "dropout": 0.1, + "recurrent_dropout": 0.1, + }, + input_shape=(1, 2, 5, 5, 2), + ) + + def test_conv_lstm_cloning(self): + with self.cached_session(): + model = keras.models.Sequential() + model.add( + keras.layers.ConvLSTM2D(5, 3, input_shape=(None, 5, 5, 3)) + ) + + test_inputs = np.random.random((2, 4, 5, 5, 3)) + reference_outputs = model.predict(test_inputs) + weights = model.get_weights() + + # Use a new graph to clone the model + with self.cached_session(): + clone = keras.models.clone_model(model) + clone.set_weights(weights) + + outputs = clone.predict(test_inputs) + self.assertAllClose(reference_outputs, outputs, atol=1e-5) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="Skipping the test as OOM occurred with 1 GB budget.", + ) + def test_conv_lstm_with_initial_state(self): + num_samples = 32 + sequence_len = 5 + encoder_inputs = keras.layers.Input((None, 32, 32, 3)) + encoder = keras.layers.ConvLSTM2D( + filters=32, + kernel_size=(3, 3), + padding="same", + return_sequences=False, + return_state=True, + ) + _, state_h, state_c = encoder(encoder_inputs) + encoder_states = [state_h, state_c] + + decoder_inputs = keras.layers.Input((None, 32, 32, 4)) + decoder_lstm = keras.layers.ConvLSTM2D( + filters=32, + kernel_size=(3, 3), + padding="same", + return_sequences=False, + return_state=False, + ) + decoder_outputs = decoder_lstm( + decoder_inputs, initial_state=encoder_states + ) + output = keras.layers.Conv2D( + 1, (3, 3), padding="same", activation="relu" + )(decoder_outputs) + model = keras.Model([encoder_inputs, decoder_inputs], output) + + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + x_1 = np.random.rand(num_samples, sequence_len, 32, 32, 3) + x_2 = np.random.rand(num_samples, sequence_len, 32, 32, 4) + y = np.random.rand(num_samples, 32, 32, 1) + model.fit([x_1, x_2], y) + + model.predict([x_1, x_2]) @test_combinations.run_all_keras_modes class ConvLSTM3DTest(test_combinations.TestCase): - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - data_format=['channels_first', 'channels_last'], - return_sequences=[True, False])) - def test_conv_lstm(self, data_format, return_sequences): - num_height = 3 - num_width = 3 - num_depth = 3 - filters = 3 - num_samples = 1 - input_channel = 2 - input_height = 5 - input_width = 5 - input_depth = 5 - sequence_len = 2 - if data_format == 
'channels_first': - inputs = np.random.rand(num_samples, sequence_len, input_channel, - input_height, input_width, input_depth) - else: - inputs = np.random.rand(num_samples, sequence_len, input_height, - input_width, input_depth, input_channel) - - # test for return state: - x = keras.Input(batch_shape=inputs.shape) - kwargs = { - 'data_format': data_format, - 'return_sequences': return_sequences, - 'return_state': True, - 'stateful': True, - 'filters': filters, - 'kernel_size': (num_height, num_width, num_depth), - 'padding': 'same' - } - layer = keras.layers.ConvLSTM3D(**kwargs) - layer.build(inputs.shape) - outputs = layer(x) - _, states = outputs[0], outputs[1:] - self.assertEqual(len(states), 2) - model = keras.models.Model(x, states[0]) - - state = model.predict(inputs) - - self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4) - - # test for output shape: - test_utils.layer_test( - keras.layers.ConvLSTM3D, - kwargs={ - 'data_format': data_format, - 'return_sequences': return_sequences, - 'filters': filters, - 'kernel_size': (num_height, num_width, num_depth), - 'padding': 'valid' - }, - input_shape=inputs.shape) - - -if __name__ == '__main__': - tf.test.main() + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + data_format=["channels_first", "channels_last"], + return_sequences=[True, False], + ) + ) + def test_conv_lstm(self, data_format, return_sequences): + num_height = 3 + num_width = 3 + num_depth = 3 + filters = 3 + num_samples = 1 + input_channel = 2 + input_height = 5 + input_width = 5 + input_depth = 5 + sequence_len = 2 + if data_format == "channels_first": + inputs = np.random.rand( + num_samples, + sequence_len, + input_channel, + input_height, + input_width, + input_depth, + ) + else: + inputs = np.random.rand( + num_samples, + sequence_len, + input_height, + input_width, + input_depth, + input_channel, + ) + + # test for return state: + x = keras.Input(batch_shape=inputs.shape) + kwargs = { + "data_format": data_format, + "return_sequences": return_sequences, + "return_state": True, + "stateful": True, + "filters": filters, + "kernel_size": (num_height, num_width, num_depth), + "padding": "same", + } + layer = keras.layers.ConvLSTM3D(**kwargs) + layer.build(inputs.shape) + outputs = layer(x) + _, states = outputs[0], outputs[1:] + self.assertEqual(len(states), 2) + model = keras.models.Model(x, states[0]) + + state = model.predict(inputs) + + self.assertAllClose( + keras.backend.eval(layer.states[0]), state, atol=1e-4 + ) + + # test for output shape: + test_utils.layer_test( + keras.layers.ConvLSTM3D, + kwargs={ + "data_format": data_format, + "return_sequences": return_sequences, + "filters": filters, + "kernel_size": (num_height, num_width, num_depth), + "padding": "valid", + }, + input_shape=inputs.shape, + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/cudnn_gru.py b/keras/layers/rnn/cudnn_gru.py index ead4431c3d64..45c7c91d53e3 100644 --- a/keras/layers/rnn/cudnn_gru.py +++ b/keras/layers/rnn/cudnn_gru.py @@ -13,195 +13,212 @@ # limitations under the License. 
# ============================================================================== """Fast GRU layer backed by cuDNN.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import collections +import tensorflow.compat.v2 as tf + from keras import constraints from keras import initializers from keras import regularizers from keras.layers.rnn import gru_lstm_utils from keras.layers.rnn.base_cudnn_rnn import _CuDNNRNN -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export(v1=['keras.layers.CuDNNGRU']) +@keras_export(v1=["keras.layers.CuDNNGRU"]) class CuDNNGRU(_CuDNNRNN): - """Fast GRU implementation backed by cuDNN. - - More information about cuDNN can be found on the [NVIDIA - developer website](https://developer.nvidia.com/cudnn). - Can only be run on GPU. - - Args: - units: Positive integer, dimensionality of the output space. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation"). - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. - recurrent_constraint: Constraint function applied to the - `recurrent_kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. - return_sequences: Boolean. Whether to return the last output in the output - sequence, or the full sequence. - return_state: Boolean. Whether to return the last state in addition to the - output. - go_backwards: Boolean (default False). If True, process the input sequence - backwards and return the reversed sequence. - stateful: Boolean (default False). If True, the last state for each sample - at index i in a batch will be used as initial state for the sample of - index i in the following batch. 
- """ - - def __init__(self, - units, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - **kwargs): - self.units = units - cell_spec = collections.namedtuple('cell', 'state_size') - self._cell = cell_spec(state_size=self.units) - super().__init__( - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - **kwargs) - - self.kernel_initializer = initializers.get(kernel_initializer) - self.recurrent_initializer = initializers.get(recurrent_initializer) - self.bias_initializer = initializers.get(bias_initializer) - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.recurrent_regularizer = regularizers.get(recurrent_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.activity_regularizer = regularizers.get(activity_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.recurrent_constraint = constraints.get(recurrent_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - @property - def cell(self): - return self._cell - - def build(self, input_shape): - super().build(input_shape) - if isinstance(input_shape, list): - input_shape = input_shape[0] - input_dim = int(input_shape[-1]) - - self.kernel = self.add_weight( - shape=(input_dim, self.units * 3), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units * 3), - name='recurrent_kernel', - initializer=self.recurrent_initializer, - regularizer=self.recurrent_regularizer, - constraint=self.recurrent_constraint) - - self.bias = self.add_weight( - shape=(self.units * 6,), - name='bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - - self.built = True - - def _process_batch(self, inputs, initial_state): - if not self.time_major: - inputs = tf.transpose(inputs, perm=(1, 0, 2)) - input_h = initial_state[0] - input_h = tf.expand_dims(input_h, axis=0) - - params = gru_lstm_utils.canonical_to_params( - weights=[ - self.kernel[:, self.units:self.units * 2], - self.kernel[:, :self.units], - self.kernel[:, self.units * 2:], - self.recurrent_kernel[:, self.units:self.units * 2], - self.recurrent_kernel[:, :self.units], - self.recurrent_kernel[:, self.units * 2:], - ], - biases=[ - self.bias[self.units:self.units * 2], - self.bias[:self.units], - self.bias[self.units * 2:self.units * 3], - self.bias[self.units * 4:self.units * 5], - self.bias[self.units * 3:self.units * 4], - self.bias[self.units * 5:], - ], - shape=self._vector_shape) - - args = { - 'input': inputs, - 'input_h': input_h, - 'input_c': 0, - 'params': params, - 'is_training': True, - 'rnn_mode': 'gru', - } - - outputs, h, _, _, _ = tf.raw_ops.CudnnRNNV2(**args) - - if self.stateful or self.return_state: - h = h[0] - if self.return_sequences: - if self.time_major: - output = outputs - else: - output = tf.transpose(outputs, perm=(1, 0, 2)) - else: - output = outputs[-1] - return output, [h] - - def get_config(self): - config = { - 'units': self.units, - 'kernel_initializer': 
initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint) - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Fast GRU implementation backed by cuDNN. + + More information about cuDNN can be found on the [NVIDIA + developer website](https://developer.nvidia.com/cudnn). + Can only be run on GPU. + + Args: + units: Positive integer, dimensionality of the output space. + kernel_initializer: Initializer for the `kernel` weights matrix, used + for the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + return_sequences: Boolean. Whether to return the last output in the + output sequence, or the full sequence. + return_state: Boolean. Whether to return the last state in addition to + the output. + go_backwards: Boolean (default False). If True, process the input + sequence backwards and return the reversed sequence. + stateful: Boolean (default False). If True, the last state for each + sample at index i in a batch will be used as initial state for the + sample of index i in the following batch. 
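+ + Example (a minimal usage sketch, assuming a CUDA-enabled GPU and the + TF1 compat export path `tf.compat.v1.keras.layers.CuDNNGRU`; the layer + cannot run on CPU-only machines): + + ```python + import numpy as np + import tensorflow.compat.v1 as tf + + # 32 samples of 10 timesteps with 8 features each. + inputs = tf.keras.Input((10, 8)) + # Returns only the last output by default: shape (batch, 4). + outputs = tf.keras.layers.CuDNNGRU(4)(inputs) + model = tf.keras.Model(inputs, outputs) + model.predict(np.random.random((32, 10, 8))) + ```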
+ """ + + def __init__( + self, + units, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + **kwargs + ): + self.units = units + cell_spec = collections.namedtuple("cell", "state_size") + self._cell = cell_spec(state_size=self.units) + super().__init__( + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + **kwargs + ) + + self.kernel_initializer = initializers.get(kernel_initializer) + self.recurrent_initializer = initializers.get(recurrent_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.recurrent_regularizer = regularizers.get(recurrent_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + self.activity_regularizer = regularizers.get(activity_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.recurrent_constraint = constraints.get(recurrent_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + @property + def cell(self): + return self._cell + + def build(self, input_shape): + super().build(input_shape) + if isinstance(input_shape, list): + input_shape = input_shape[0] + input_dim = int(input_shape[-1]) + + self.kernel = self.add_weight( + shape=(input_dim, self.units * 3), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units * 3), + name="recurrent_kernel", + initializer=self.recurrent_initializer, + regularizer=self.recurrent_regularizer, + constraint=self.recurrent_constraint, + ) + + self.bias = self.add_weight( + shape=(self.units * 6,), + name="bias", + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + ) + + self.built = True + + def _process_batch(self, inputs, initial_state): + if not self.time_major: + inputs = tf.transpose(inputs, perm=(1, 0, 2)) + input_h = initial_state[0] + input_h = tf.expand_dims(input_h, axis=0) + + params = gru_lstm_utils.canonical_to_params( + weights=[ + self.kernel[:, self.units : self.units * 2], + self.kernel[:, : self.units], + self.kernel[:, self.units * 2 :], + self.recurrent_kernel[:, self.units : self.units * 2], + self.recurrent_kernel[:, : self.units], + self.recurrent_kernel[:, self.units * 2 :], + ], + biases=[ + self.bias[self.units : self.units * 2], + self.bias[: self.units], + self.bias[self.units * 2 : self.units * 3], + self.bias[self.units * 4 : self.units * 5], + self.bias[self.units * 3 : self.units * 4], + self.bias[self.units * 5 :], + ], + shape=self._vector_shape, + ) + + args = { + "input": inputs, + "input_h": input_h, + "input_c": 0, + "params": params, + "is_training": True, + "rnn_mode": "gru", + } + + outputs, h, _, _, _ = tf.raw_ops.CudnnRNNV2(**args) + + if self.stateful or self.return_state: + h = h[0] + if self.return_sequences: + if self.time_major: + output = outputs + else: + output = tf.transpose(outputs, perm=(1, 0, 2)) + else: + output = outputs[-1] + return output, [h] + + def get_config(self): + config = { + "units": self.units, + 
"kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/rnn/cudnn_lstm.py b/keras/layers/rnn/cudnn_lstm.py index dd37f357ff9b..69ae8e96af6b 100644 --- a/keras/layers/rnn/cudnn_lstm.py +++ b/keras/layers/rnn/cudnn_lstm.py @@ -13,218 +13,245 @@ # limitations under the License. # ============================================================================== """Fast LSTM layer backed by cuDNN.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import collections +import tensorflow.compat.v2 as tf + from keras import constraints from keras import initializers from keras import regularizers from keras.layers.rnn import gru_lstm_utils from keras.layers.rnn.base_cudnn_rnn import _CuDNNRNN -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export(v1=['keras.layers.CuDNNLSTM']) +@keras_export(v1=["keras.layers.CuDNNLSTM"]) class CuDNNLSTM(_CuDNNRNN): - """Fast LSTM implementation backed by cuDNN. - - More information about cuDNN can be found on the [NVIDIA - developer website](https://developer.nvidia.com/cudnn). - Can only be run on GPU. - - Args: - units: Positive integer, dimensionality of the output space. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. - unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate - at initialization. Setting it to true will also force - `bias_initializer="zeros"`. This is recommended in [Jozefowicz et - al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation"). - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. - recurrent_constraint: Constraint function applied to the - `recurrent_kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. - return_sequences: Boolean. Whether to return the last output. in the - output sequence, or the full sequence. - return_state: Boolean. Whether to return the last state in addition to the - output. - go_backwards: Boolean (default False). If True, process the input sequence - backwards and return the reversed sequence. 
- stateful: Boolean (default False). If True, the last state for each sample - at index i in a batch will be used as initial state for the sample of - index i in the following batch. - """ - - def __init__(self, - units, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - **kwargs): - self.units = units - cell_spec = collections.namedtuple('cell', 'state_size') - self._cell = cell_spec(state_size=(self.units, self.units)) - super().__init__( - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - **kwargs) - - self.kernel_initializer = initializers.get(kernel_initializer) - self.recurrent_initializer = initializers.get(recurrent_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.unit_forget_bias = unit_forget_bias - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.recurrent_regularizer = regularizers.get(recurrent_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - self.activity_regularizer = regularizers.get(activity_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.recurrent_constraint = constraints.get(recurrent_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - @property - def cell(self): - return self._cell - - def build(self, input_shape): - super().build(input_shape) - if isinstance(input_shape, list): - input_shape = input_shape[0] - input_dim = int(input_shape[-1]) - - self.kernel = self.add_weight( - shape=(input_dim, self.units * 4), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units * 4), - name='recurrent_kernel', - initializer=self.recurrent_initializer, - regularizer=self.recurrent_regularizer, - constraint=self.recurrent_constraint) - - if self.unit_forget_bias: - - def bias_initializer(_, *args, **kwargs): - return tf.concat([ - self.bias_initializer((self.units * 5,), *args, **kwargs), - tf.compat.v1.ones_initializer()((self.units,), *args, **kwargs), - self.bias_initializer((self.units * 2,), *args, **kwargs), - ], axis=0) - else: - bias_initializer = self.bias_initializer - self.bias = self.add_weight( - shape=(self.units * 8,), - name='bias', - initializer=bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - - self.built = True - - def _process_batch(self, inputs, initial_state): - if not self.time_major: - inputs = tf.transpose(inputs, perm=(1, 0, 2)) - input_h = initial_state[0] - input_c = initial_state[1] - input_h = tf.expand_dims(input_h, axis=0) - input_c = tf.expand_dims(input_c, axis=0) - - params = gru_lstm_utils.canonical_to_params( - weights=[ - self.kernel[:, :self.units], - self.kernel[:, self.units:self.units * 2], - self.kernel[:, self.units * 2:self.units * 3], - self.kernel[:, self.units * 3:], - self.recurrent_kernel[:, :self.units], - self.recurrent_kernel[:, self.units:self.units * 2], - self.recurrent_kernel[:, self.units * 2:self.units * 3], - self.recurrent_kernel[:, self.units * 3:], - ], - biases=[ - 
self.bias[:self.units], - self.bias[self.units:self.units * 2], - self.bias[self.units * 2:self.units * 3], - self.bias[self.units * 3:self.units * 4], - self.bias[self.units * 4:self.units * 5], - self.bias[self.units * 5:self.units * 6], - self.bias[self.units * 6:self.units * 7], - self.bias[self.units * 7:], - ], - shape=self._vector_shape) - - args = { - 'input': inputs, - 'input_h': input_h, - 'input_c': input_c, - 'params': params, - 'is_training': True, - } - - outputs, h, c, _, _ = tf.raw_ops.CudnnRNNV2(**args) - - if self.stateful or self.return_state: - h = h[0] - c = c[0] - if self.return_sequences: - if self.time_major: - output = outputs - else: - output = tf.transpose(outputs, perm=(1, 0, 2)) - else: - output = outputs[-1] - return output, [h, c] - - def get_config(self): - config = { - 'units': self.units, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'unit_forget_bias': self.unit_forget_bias, - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint) - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Fast LSTM implementation backed by cuDNN. + + More information about cuDNN can be found on the [NVIDIA + developer website](https://developer.nvidia.com/cudnn). + Can only be run on GPU. + + Args: + units: Positive integer, dimensionality of the output space. + kernel_initializer: Initializer for the `kernel` weights matrix, used + for the linear transformation of the inputs. + unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate + at initialization. Setting it to True will also force + `bias_initializer="zeros"`. This is recommended in [Jozefowicz et + al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + return_sequences: Boolean. Whether to return the last output in the + output sequence, or the full sequence. + return_state: Boolean. Whether to return the last state in addition to + the output. + go_backwards: Boolean (default False). If True, process the input + sequence backwards and return the reversed sequence. + stateful: Boolean (default False).
If True, the last state for each + sample at index i in a batch will be used as initial state for the + sample of index i in the following batch. + """ + + def __init__( + self, + units, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + **kwargs + ): + self.units = units + cell_spec = collections.namedtuple("cell", "state_size") + self._cell = cell_spec(state_size=(self.units, self.units)) + super().__init__( + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + **kwargs + ) + + self.kernel_initializer = initializers.get(kernel_initializer) + self.recurrent_initializer = initializers.get(recurrent_initializer) + self.bias_initializer = initializers.get(bias_initializer) + self.unit_forget_bias = unit_forget_bias + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.recurrent_regularizer = regularizers.get(recurrent_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + self.activity_regularizer = regularizers.get(activity_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.recurrent_constraint = constraints.get(recurrent_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + @property + def cell(self): + return self._cell + + def build(self, input_shape): + super().build(input_shape) + if isinstance(input_shape, list): + input_shape = input_shape[0] + input_dim = int(input_shape[-1]) + + self.kernel = self.add_weight( + shape=(input_dim, self.units * 4), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) + + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units * 4), + name="recurrent_kernel", + initializer=self.recurrent_initializer, + regularizer=self.recurrent_regularizer, + constraint=self.recurrent_constraint, + ) + + if self.unit_forget_bias: + + def bias_initializer(_, *args, **kwargs): + return tf.concat( + [ + self.bias_initializer( + (self.units * 5,), *args, **kwargs + ), + tf.compat.v1.ones_initializer()( + (self.units,), *args, **kwargs + ), + self.bias_initializer( + (self.units * 2,), *args, **kwargs + ), + ], + axis=0, + ) + + else: + bias_initializer = self.bias_initializer + self.bias = self.add_weight( + shape=(self.units * 8,), + name="bias", + initializer=bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + ) + + self.built = True + + def _process_batch(self, inputs, initial_state): + if not self.time_major: + inputs = tf.transpose(inputs, perm=(1, 0, 2)) + input_h = initial_state[0] + input_c = initial_state[1] + input_h = tf.expand_dims(input_h, axis=0) + input_c = tf.expand_dims(input_c, axis=0) + + params = gru_lstm_utils.canonical_to_params( + weights=[ + self.kernel[:, : self.units], + self.kernel[:, self.units : self.units * 2], + self.kernel[:, self.units * 2 : self.units * 3], + self.kernel[:, self.units * 3 :], + self.recurrent_kernel[:, : self.units], + self.recurrent_kernel[:, self.units : self.units * 2], + self.recurrent_kernel[:, self.units * 2 : self.units * 3], + self.recurrent_kernel[:, self.units * 3 :], 
+ ], + biases=[ + self.bias[: self.units], + self.bias[self.units : self.units * 2], + self.bias[self.units * 2 : self.units * 3], + self.bias[self.units * 3 : self.units * 4], + self.bias[self.units * 4 : self.units * 5], + self.bias[self.units * 5 : self.units * 6], + self.bias[self.units * 6 : self.units * 7], + self.bias[self.units * 7 :], + ], + shape=self._vector_shape, + ) + + args = { + "input": inputs, + "input_h": input_h, + "input_c": input_c, + "params": params, + "is_training": True, + } + + outputs, h, c, _, _ = tf.raw_ops.CudnnRNNV2(**args) + + if self.stateful or self.return_state: + h = h[0] + c = c[0] + if self.return_sequences: + if self.time_major: + output = outputs + else: + output = tf.transpose(outputs, perm=(1, 0, 2)) + else: + output = outputs[-1] + return output, [h, c] + + def get_config(self): + config = { + "units": self.units, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "unit_forget_bias": self.unit_forget_bias, + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/rnn/cudnn_test.py b/keras/layers/rnn/cudnn_test.py index 8aac19766715..8e4a67c1e64e 100644 --- a/keras/layers/rnn/cudnn_test.py +++ b/keras/layers/rnn/cudnn_test.py @@ -14,479 +14,530 @@ # ============================================================================== """Tests for cudnn recurrent layers.""" -import tensorflow.compat.v2 as tf - import os import tempfile -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from tensorflow.python.framework import test_util as tf_test_utils +from keras.optimizers.legacy.rmsprop import RMSprop from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.optimizers.optimizer_v2.rmsprop import RMSprop + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) @test_combinations.run_all_keras_modes class CuDNNTest(test_combinations.TestCase): - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - layer_class=[keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM], - return_sequences=[True, False])) - @tf_test_utils.run_gpu_only - def test_cudnn_rnn_return_sequence(self, layer_class, return_sequences): - input_size = 10 - timesteps = 6 - units = 2 - num_samples = 32 - test_utils.layer_test( - layer_class, - kwargs={'units': units, - 'return_sequences': return_sequences}, - input_shape=(num_samples, timesteps, input_size)) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - layer_class=[keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM], - go_backwards=[True, False])) - @tf_test_utils.run_gpu_only - def 
test_cudnn_rnn_go_backward(self, layer_class, go_backwards): - input_size = 10 - timesteps = 6 - units = 2 - num_samples = 32 - test_utils.layer_test( - layer_class, - kwargs={'units': units, - 'go_backwards': go_backwards}, - input_shape=(num_samples, timesteps, input_size)) - - @parameterized.named_parameters( - ('cudnngru', keras.layers.CuDNNGRU), - ('cudnnlstm', keras.layers.CuDNNLSTM), - ) - @tf_test_utils.run_gpu_only - def test_return_state(self, layer_class): - input_size = 10 - timesteps = 6 - units = 2 - num_samples = 32 - num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1 - - inputs = keras.Input(batch_shape=(num_samples, timesteps, input_size)) - layer = layer_class(units, return_state=True, stateful=True) - outputs = layer(inputs) - _, state = outputs[0], outputs[1:] - self.assertEqual(len(state), num_states) - model = keras.models.Model(inputs, state[0]) - model.run_eagerly = test_utils.should_run_eagerly() - - inputs = np.random.random((num_samples, timesteps, input_size)) - state = model.predict(inputs) - np.testing.assert_allclose( - keras.backend.eval(layer.states[0]), state, atol=1e-4) - - @parameterized.named_parameters( - ('cudnngru', keras.layers.CuDNNGRU), - ('cudnnlstm', keras.layers.CuDNNLSTM), - ) - @tf_test_utils.run_gpu_only - def test_time_major_input(self, layer_class): - input_size = 10 - timesteps = 6 - units = 2 - num_samples = 32 - - model = keras.models.Sequential() - model.add( - keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))) - layer = layer_class(units, time_major=True, return_sequences=True) - model.add(layer) - model.add( - keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))) - model.compile(loss='categorical_crossentropy', - optimizer=RMSprop(learning_rate=0.001)) - model.fit( - np.ones((num_samples, timesteps, input_size)), - np.ones((num_samples, timesteps, units))) - out = model.predict(np.ones((num_samples, timesteps, input_size))) - self.assertEqual(out.shape, (num_samples, timesteps, units)) - - @parameterized.named_parameters( - ('cudnngru', keras.layers.CuDNNGRU), - ('cudnnlstm', keras.layers.CuDNNLSTM), - ) - @tf_test_utils.run_gpu_only - def test_specify_initial_state_keras_tensor(self, layer_class): - input_size = 10 - timesteps = 6 - units = 2 - num_samples = 32 - num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1 - - inputs = keras.Input((timesteps, input_size)) - initial_state = [keras.Input((units,)) for _ in range(num_states)] - layer = layer_class(units) - if len(initial_state) == 1: - output = layer(inputs, initial_state=initial_state[0]) - else: - output = layer(inputs, initial_state=initial_state) - self.assertTrue( - any(initial_state[0] is t - for t in layer._inbound_nodes[0].input_tensors)) - - model = keras.models.Model([inputs] + initial_state, output) - model.compile( - loss='categorical_crossentropy', - optimizer=RMSprop(learning_rate=0.001), - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.random.random((num_samples, timesteps, input_size)) - initial_state = [ - np.random.random((num_samples, units)) for _ in range(num_states) - ] - targets = np.random.random((num_samples, units)) - model.fit([inputs] + initial_state, targets) + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer_class=[keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM], + return_sequences=[True, False], + ) + ) + @tf_test_utils.run_gpu_only + def test_cudnn_rnn_return_sequence(self, layer_class, return_sequences): + input_size = 10 + timesteps = 6 + units = 2 
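A quick CPU-friendly illustration of the two output shapes this parameterized test exercises, using the plain GRU layer as a stand-in for the GPU-only CuDNN layers (an assumption for illustration):

import numpy as np
import keras

x = np.ones((32, 6, 10), dtype="float32")
seq = keras.layers.GRU(2, return_sequences=True)(x)   # per-timestep outputs
last = keras.layers.GRU(2, return_sequences=False)(x)  # last output only
print(seq.shape, last.shape)  # (32, 6, 2) (32, 2)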
+ num_samples = 32 + test_utils.layer_test( + layer_class, + kwargs={"units": units, "return_sequences": return_sequences}, + input_shape=(num_samples, timesteps, input_size), + ) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer_class=[keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM], + go_backwards=[True, False], + ) + ) + @tf_test_utils.run_gpu_only + def test_cudnn_rnn_go_backward(self, layer_class, go_backwards): + input_size = 10 + timesteps = 6 + units = 2 + num_samples = 32 + test_utils.layer_test( + layer_class, + kwargs={"units": units, "go_backwards": go_backwards}, + input_shape=(num_samples, timesteps, input_size), + ) + + @parameterized.named_parameters( + ("cudnngru", keras.layers.CuDNNGRU), + ("cudnnlstm", keras.layers.CuDNNLSTM), + ) + @tf_test_utils.run_gpu_only + def test_return_state(self, layer_class): + input_size = 10 + timesteps = 6 + units = 2 + num_samples = 32 + num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1 + + inputs = keras.Input(batch_shape=(num_samples, timesteps, input_size)) + layer = layer_class(units, return_state=True, stateful=True) + outputs = layer(inputs) + _, state = outputs[0], outputs[1:] + self.assertEqual(len(state), num_states) + model = keras.models.Model(inputs, state[0]) + model.run_eagerly = test_utils.should_run_eagerly() + + inputs = np.random.random((num_samples, timesteps, input_size)) + state = model.predict(inputs) + np.testing.assert_allclose( + keras.backend.eval(layer.states[0]), state, atol=1e-4 + ) + + @parameterized.named_parameters( + ("cudnngru", keras.layers.CuDNNGRU), + ("cudnnlstm", keras.layers.CuDNNLSTM), + ) + @tf_test_utils.run_gpu_only + def test_time_major_input(self, layer_class): + input_size = 10 + timesteps = 6 + units = 2 + num_samples = 32 + + model = keras.models.Sequential() + model.add(keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))) + layer = layer_class(units, time_major=True, return_sequences=True) + model.add(layer) + model.add(keras.layers.Lambda(lambda t: tf.transpose(t, [1, 0, 2]))) + model.compile( + loss="categorical_crossentropy", + optimizer=RMSprop(learning_rate=0.001), + ) + model.fit( + np.ones((num_samples, timesteps, input_size)), + np.ones((num_samples, timesteps, units)), + ) + out = model.predict(np.ones((num_samples, timesteps, input_size))) + self.assertEqual(out.shape, (num_samples, timesteps, units)) + + @parameterized.named_parameters( + ("cudnngru", keras.layers.CuDNNGRU), + ("cudnnlstm", keras.layers.CuDNNLSTM), + ) + @tf_test_utils.run_gpu_only + def test_specify_initial_state_keras_tensor(self, layer_class): + input_size = 10 + timesteps = 6 + units = 2 + num_samples = 32 + num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1 + + inputs = keras.Input((timesteps, input_size)) + initial_state = [keras.Input((units,)) for _ in range(num_states)] + layer = layer_class(units) + if len(initial_state) == 1: + output = layer(inputs, initial_state=initial_state[0]) + else: + output = layer(inputs, initial_state=initial_state) + self.assertTrue( + any( + initial_state[0] is t + for t in layer._inbound_nodes[0].input_tensors + ) + ) + + model = keras.models.Model([inputs] + initial_state, output) + model.compile( + loss="categorical_crossentropy", + optimizer=RMSprop(learning_rate=0.001), + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.random.random((num_samples, timesteps, input_size)) + initial_state = [ + np.random.random((num_samples, units)) for _ in range(num_states) + ] + targets = 
np.random.random((num_samples, units)) + model.fit([inputs] + initial_state, targets) class CuDNNGraphOnlyTest(test_combinations.TestCase): - - @parameterized.named_parameters( - ('cudnngru', keras.layers.CuDNNGRU), - ('cudnnlstm', keras.layers.CuDNNLSTM), - ) - @tf_test_utils.run_gpu_only - def test_regularizer(self, layer_class): - input_size = 10 - timesteps = 6 - units = 2 - num_samples = 32 - with tf.Graph().as_default(): - layer = layer_class( - units, - return_sequences=False, - input_shape=(timesteps, input_size), - kernel_regularizer=keras.regularizers.l1(0.01), - recurrent_regularizer=keras.regularizers.l1(0.01), - bias_regularizer='l2') - layer.build((None, None, input_size)) - self.assertEqual(len(layer.losses), 3) - - layer = layer_class( - units, - return_sequences=False, - input_shape=(timesteps, input_size), - activity_regularizer='l2') - self.assertTrue(layer.activity_regularizer) - x = keras.backend.variable( - np.ones((num_samples, timesteps, input_size))) - layer(x) - self.assertEqual(len(layer.get_losses_for(x)), 1) - - @parameterized.named_parameters( - ('cudnngru', keras.layers.CuDNNGRU), - ('cudnnlstm', keras.layers.CuDNNLSTM), - ) - @tf_test_utils.run_gpu_only - @tf_test_utils.run_v1_only('b/120941292') - def test_statefulness(self, layer_class): - input_size = 10 - timesteps = 6 - units = 2 - num_samples = 32 - - with self.cached_session(): - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - 10, - input_size, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = layer_class( - units, return_sequences=False, stateful=True, weights=None) - model.add(layer) - model.compile(optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - loss='mse') - out1 = model.predict(np.ones((num_samples, timesteps))) - self.assertEqual(out1.shape, (num_samples, units)) - - # train once so that the states change - model.train_on_batch( - np.ones((num_samples, timesteps)), np.ones((num_samples, units))) - out2 = model.predict(np.ones((num_samples, timesteps))) - - # if the state is not reset, output should be different - self.assertNotEqual(out1.max(), out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out2.max(), out3.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones((num_samples, timesteps))) - self.assertAllClose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out4.max(), out5.max()) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + @parameterized.named_parameters( + ("cudnngru", keras.layers.CuDNNGRU), + ("cudnnlstm", keras.layers.CuDNNLSTM), + ) + @tf_test_utils.run_gpu_only + def test_regularizer(self, layer_class): + input_size = 10 + timesteps = 6 + units = 2 + num_samples = 32 + with tf.Graph().as_default(): + layer = layer_class( + units, + return_sequences=False, + input_shape=(timesteps, input_size), + kernel_regularizer=keras.regularizers.l1(0.01), + recurrent_regularizer=keras.regularizers.l1(0.01), + bias_regularizer="l2", + ) + layer.build((None, None, input_size)) + self.assertEqual(len(layer.losses), 3) + + layer = layer_class( + units, + return_sequences=False, + input_shape=(timesteps, input_size), + activity_regularizer="l2", + ) + 
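The regularizer bookkeeping asserted in this test can be reproduced without a GPU; a sketch using the plain LSTM layer as a stand-in (an assumption, since CuDNNLSTM requires CUDA):

import keras

layer = keras.layers.LSTM(
    2,
    kernel_regularizer=keras.regularizers.l1(0.01),
    recurrent_regularizer=keras.regularizers.l1(0.01),
    bias_regularizer="l2",
)
layer.build((None, None, 10))
print(len(layer.losses))  # 3: one penalty per regularized weight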
self.assertTrue(layer.activity_regularizer) + x = keras.backend.variable( + np.ones((num_samples, timesteps, input_size)) + ) + layer(x) + self.assertEqual(len(layer.get_losses_for(x)), 1) + + @parameterized.named_parameters( + ("cudnngru", keras.layers.CuDNNGRU), + ("cudnnlstm", keras.layers.CuDNNLSTM), + ) + @tf_test_utils.run_gpu_only + @tf_test_utils.run_v1_only("b/120941292") + def test_statefulness(self, layer_class): + input_size = 10 + timesteps = 6 + units = 2 + num_samples = 32 + + with self.cached_session(): + model = keras.models.Sequential() + model.add( + keras.layers.Embedding( + 10, + input_size, + input_length=timesteps, + batch_input_shape=(num_samples, timesteps), + ) + ) + layer = layer_class( + units, return_sequences=False, stateful=True, weights=None + ) + model.add(layer) + model.compile( + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + loss="mse", + ) + out1 = model.predict(np.ones((num_samples, timesteps))) + self.assertEqual(out1.shape, (num_samples, units)) + + # train once so that the states change + model.train_on_batch( + np.ones((num_samples, timesteps)), np.ones((num_samples, units)) + ) + out2 = model.predict(np.ones((num_samples, timesteps))) + + # if the state is not reset, output should be different + self.assertNotEqual(out1.max(), out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out2.max(), out3.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones((num_samples, timesteps))) + self.assertAllClose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out4.max(), out5.max()) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CuDNNV1OnlyTest(test_combinations.TestCase): - - @tf_test_utils.run_gpu_only - def test_trainability(self): - input_size = 10 - units = 2 - for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]: - layer = layer_class(units) - layer.build((None, None, input_size)) - self.assertEqual(len(layer.weights), 3) - self.assertEqual(len(layer.trainable_weights), 3) - self.assertEqual(len(layer.non_trainable_weights), 0) - layer.trainable = False - self.assertEqual(len(layer.weights), 3) - self.assertEqual(len(layer.non_trainable_weights), 3) - self.assertEqual(len(layer.trainable_weights), 0) - layer.trainable = True - self.assertEqual(len(layer.weights), 3) - self.assertEqual(len(layer.trainable_weights), 3) - self.assertEqual(len(layer.non_trainable_weights), 0) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False], - bidirectional=[True, False], implementation=[1, 2], - model_nest_level=[1, 2], model_type=['seq', 'func'])) - @tf_test_utils.run_v1_only('b/120911602, b/112083752') - @tf_test_utils.run_gpu_only - def test_load_weights_between_noncudnn_rnn(self, rnn_type, to_cudnn, - bidirectional, implementation, - model_nest_level, model_type): - input_size = 10 - timesteps = 6 - input_shape = (timesteps, input_size) - units = 2 - num_samples = 32 - inputs = np.random.random((num_samples, timesteps, input_size)) - - rnn_layer_kwargs = { - 'recurrent_activation': 'sigmoid', - # ensure biases are non-zero and properly converted - 'bias_initializer': 
'random_uniform', - 'implementation': implementation - } - if rnn_type == 'LSTM': - rnn_layer_class = keras.layers.LSTM - cudnn_rnn_layer_class = keras.layers.CuDNNLSTM - else: - rnn_layer_class = keras.layers.GRU - cudnn_rnn_layer_class = keras.layers.CuDNNGRU - rnn_layer_kwargs['reset_after'] = True - - layer = rnn_layer_class(units, **rnn_layer_kwargs) - if bidirectional: - layer = keras.layers.Bidirectional(layer) - - cudnn_layer = cudnn_rnn_layer_class(units) - if bidirectional: - cudnn_layer = keras.layers.Bidirectional(cudnn_layer) - - model = self._make_nested_model(input_shape, layer, model_nest_level, - model_type) - cudnn_model = self._make_nested_model(input_shape, cudnn_layer, - model_nest_level, model_type) - - if to_cudnn: - self._convert_model_weights(model, cudnn_model) - else: - self._convert_model_weights(cudnn_model, model) - - self.assertAllClose(model.predict(inputs), cudnn_model.predict(inputs), - atol=1e-4) - - def _make_nested_model(self, input_shape, layer, level=1, model_type='func'): - # example: make_nested_seq_model((1,), Dense(10), level=2).summary() - def make_nested_seq_model(input_shape, layer, level=1): - model = layer - for i in range(1, level + 1): - layers = [keras.layers.InputLayer(input_shape), - model] if (i == 1) else [model] - model = keras.models.Sequential(layers) - if i > 1: - model.build((None,) + input_shape) - return model - - # example: make_nested_func_model((1,), Dense(10), level=2).summary() - def make_nested_func_model(input_shape, layer, level=1): - model_input = keras.layers.Input(input_shape) - model = layer - for _ in range(level): - model = keras.models.Model(model_input, model(model_input)) - return model - - if model_type == 'func': - return make_nested_func_model(input_shape, layer, level) - elif model_type == 'seq': - return make_nested_seq_model(input_shape, layer, level) - - def _convert_model_weights(self, source_model, target_model): - _, fname = tempfile.mkstemp('.h5') - source_model.save_weights(fname) - target_model.load_weights(fname) - os.remove(fname) - - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False])) - @tf_test_utils.run_v1_only('b/120911602') - @tf_test_utils.run_gpu_only - def test_load_weights_between_noncudnn_rnn_time_distributed(self, rnn_type, - to_cudnn): - # Similar test as test_load_weights_between_noncudnn_rnn() but has different - # rank of input due to usage of TimeDistributed. Issue: #10356. 
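Both weight-conversion tests here rely on the same mechanism as `_convert_model_weights`: save to HDF5, then load into the other layer type, which converts the weights on load. A sketch of that round trip under those assumptions (the CuDNN target is commented out because it needs a GPU):

import os
import tempfile

import keras

source = keras.Sequential(
    [keras.layers.LSTM(2, recurrent_activation="sigmoid", input_shape=(6, 10))]
)
# target = keras.Sequential([keras.layers.CuDNNLSTM(2, input_shape=(6, 10))])
_, fname = tempfile.mkstemp(".h5")
source.save_weights(fname)
# target.load_weights(fname)  # weights are reordered/converted on load
os.remove(fname)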
- input_size = 10 - steps = 6 - timesteps = 6 - input_shape = (timesteps, steps, input_size) - units = 2 - num_samples = 32 - inputs = np.random.random((num_samples, timesteps, steps, input_size)) - - rnn_layer_kwargs = { - 'recurrent_activation': 'sigmoid', - # ensure biases are non-zero and properly converted - 'bias_initializer': 'random_uniform', - } - if rnn_type == 'LSTM': - rnn_layer_class = keras.layers.LSTM - cudnn_rnn_layer_class = keras.layers.CuDNNLSTM - else: - rnn_layer_class = keras.layers.GRU - cudnn_rnn_layer_class = keras.layers.CuDNNGRU - rnn_layer_kwargs['reset_after'] = True - - layer = rnn_layer_class(units, **rnn_layer_kwargs) - layer = keras.layers.TimeDistributed(layer) - - cudnn_layer = cudnn_rnn_layer_class(units) - cudnn_layer = keras.layers.TimeDistributed(cudnn_layer) - - model = self._make_nested_model(input_shape, layer) - cudnn_model = self._make_nested_model(input_shape, cudnn_layer) - - if to_cudnn: - self._convert_model_weights(model, cudnn_model) - else: - self._convert_model_weights(cudnn_model, model) - - self.assertAllClose(model.predict(inputs), cudnn_model.predict(inputs), - atol=1e-4) - - @tf_test_utils.run_gpu_only - def test_cudnnrnn_bidirectional(self): - rnn = keras.layers.CuDNNGRU - samples = 2 - dim = 2 - timesteps = 2 - output_dim = 2 - mode = 'concat' - - x = np.random.random((samples, timesteps, dim)) - target_dim = 2 * output_dim if mode == 'concat' else output_dim - y = np.random.random((samples, target_dim)) - - # test with Sequential model - model = keras.Sequential() - model.add( - keras.layers.Bidirectional( - rnn(output_dim), merge_mode=mode, input_shape=(None, dim))) - model.compile(loss='mse', optimizer='rmsprop') - model.fit(x, y, epochs=1, batch_size=1) - - # test config - model.get_config() - model = keras.models.model_from_json(model.to_json()) - model.summary() - - # test stacked bidirectional layers - model = keras.Sequential() - model.add( - keras.layers.Bidirectional( - rnn(output_dim, return_sequences=True), - merge_mode=mode, - input_shape=(None, dim))) - model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode)) - model.compile(loss='mse', optimizer=R'rmsprop') - model.fit(x, y, epochs=1, batch_size=1) - - # test with functional API - inputs = keras.Input((timesteps, dim)) - outputs = keras.layers.Bidirectional( - rnn(output_dim), merge_mode=mode)( - inputs) - model = keras.Model(inputs, outputs) - model.compile(loss='mse', optimizer=R'rmsprop') - model.fit(x, y, epochs=1, batch_size=1) - - # Bidirectional and stateful - inputs = keras.Input(batch_shape=(1, timesteps, dim)) - outputs = keras.layers.Bidirectional( - rnn(output_dim, stateful=True), merge_mode=mode)( - inputs) - model = keras.Model(inputs, outputs) - model.compile(loss='mse', optimizer='rmsprop') - model.fit(x, y, epochs=1, batch_size=1) - - @tf_test_utils.run_gpu_only - def test_preprocess_weights_for_loading_gru_incompatible(self): - """Test loading weights between incompatible layers. - - Should fail fast with an exception. 
- """ - input_shape = (3, 5) - - def gru(cudnn=False, **kwargs): - layer_class = keras.layers.CuDNNGRU if cudnn else keras.layers.GRUV1 - return layer_class(2, input_shape=input_shape, **kwargs) - - def get_layer_weights(layer): - layer.build(input_shape=input_shape) - return layer.get_weights() - - def assert_not_compatible(src, dest, message): - with self.assertRaises(ValueError) as ex: - keras.saving.hdf5_format.preprocess_weights_for_loading( - dest, - get_layer_weights(src)) - self.assertIn(message, str(ex.exception)) - - assert_not_compatible( - gru(), - gru(cudnn=True), - 'GRU(reset_after=False) is not compatible with CuDNNGRU') - assert_not_compatible( - gru(cudnn=True), - gru(), - 'CuDNNGRU is not compatible with GRU(reset_after=False)') - assert_not_compatible( - gru(), - gru(reset_after=True), - 'GRU(reset_after=False) is not compatible with ' - 'GRU(reset_after=True)') - assert_not_compatible( - gru(reset_after=True), - gru(), - 'GRU(reset_after=True) is not compatible with ' - 'GRU(reset_after=False)') - - -if __name__ == '__main__': - tf.test.main() + @tf_test_utils.run_gpu_only + def test_trainability(self): + input_size = 10 + units = 2 + for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]: + layer = layer_class(units) + layer.build((None, None, input_size)) + self.assertEqual(len(layer.weights), 3) + self.assertEqual(len(layer.trainable_weights), 3) + self.assertEqual(len(layer.non_trainable_weights), 0) + layer.trainable = False + self.assertEqual(len(layer.weights), 3) + self.assertEqual(len(layer.non_trainable_weights), 3) + self.assertEqual(len(layer.trainable_weights), 0) + layer.trainable = True + self.assertEqual(len(layer.weights), 3) + self.assertEqual(len(layer.trainable_weights), 3) + self.assertEqual(len(layer.non_trainable_weights), 0) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + rnn_type=["LSTM", "GRU"], + to_cudnn=[True, False], + bidirectional=[True, False], + implementation=[1, 2], + model_nest_level=[1, 2], + model_type=["seq", "func"], + ) + ) + @tf_test_utils.run_v1_only("b/120911602, b/112083752") + @tf_test_utils.run_gpu_only + def test_load_weights_between_noncudnn_rnn( + self, + rnn_type, + to_cudnn, + bidirectional, + implementation, + model_nest_level, + model_type, + ): + input_size = 10 + timesteps = 6 + input_shape = (timesteps, input_size) + units = 2 + num_samples = 32 + inputs = np.random.random((num_samples, timesteps, input_size)) + + rnn_layer_kwargs = { + "recurrent_activation": "sigmoid", + # ensure biases are non-zero and properly converted + "bias_initializer": "random_uniform", + "implementation": implementation, + } + if rnn_type == "LSTM": + rnn_layer_class = keras.layers.LSTM + cudnn_rnn_layer_class = keras.layers.CuDNNLSTM + else: + rnn_layer_class = keras.layers.GRU + cudnn_rnn_layer_class = keras.layers.CuDNNGRU + rnn_layer_kwargs["reset_after"] = True + + layer = rnn_layer_class(units, **rnn_layer_kwargs) + if bidirectional: + layer = keras.layers.Bidirectional(layer) + + cudnn_layer = cudnn_rnn_layer_class(units) + if bidirectional: + cudnn_layer = keras.layers.Bidirectional(cudnn_layer) + + model = self._make_nested_model( + input_shape, layer, model_nest_level, model_type + ) + cudnn_model = self._make_nested_model( + input_shape, cudnn_layer, model_nest_level, model_type + ) + + if to_cudnn: + self._convert_model_weights(model, cudnn_model) + else: + self._convert_model_weights(cudnn_model, model) + + self.assertAllClose( + model.predict(inputs), 
cudnn_model.predict(inputs), atol=1e-4 + ) + + def _make_nested_model( + self, input_shape, layer, level=1, model_type="func" + ): + # example: make_nested_seq_model((1,), Dense(10), level=2).summary() + def make_nested_seq_model(input_shape, layer, level=1): + model = layer + for i in range(1, level + 1): + layers = ( + [keras.layers.InputLayer(input_shape), model] + if (i == 1) + else [model] + ) + model = keras.models.Sequential(layers) + if i > 1: + model.build((None,) + input_shape) + return model + + # example: make_nested_func_model((1,), Dense(10), level=2).summary() + def make_nested_func_model(input_shape, layer, level=1): + model_input = keras.layers.Input(input_shape) + model = layer + for _ in range(level): + model = keras.models.Model(model_input, model(model_input)) + return model + + if model_type == "func": + return make_nested_func_model(input_shape, layer, level) + elif model_type == "seq": + return make_nested_seq_model(input_shape, layer, level) + + def _convert_model_weights(self, source_model, target_model): + _, fname = tempfile.mkstemp(".h5") + source_model.save_weights(fname) + target_model.load_weights(fname) + os.remove(fname) + + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + rnn_type=["LSTM", "GRU"], to_cudnn=[True, False] + ) + ) + @tf_test_utils.run_v1_only("b/120911602") + @tf_test_utils.run_gpu_only + def test_load_weights_between_noncudnn_rnn_time_distributed( + self, rnn_type, to_cudnn + ): + # Similar test as test_load_weights_between_noncudnn_rnn() but has + # different rank of input due to usage of TimeDistributed. Issue: + # #10356. + input_size = 10 + steps = 6 + timesteps = 6 + input_shape = (timesteps, steps, input_size) + units = 2 + num_samples = 32 + inputs = np.random.random((num_samples, timesteps, steps, input_size)) + + rnn_layer_kwargs = { + "recurrent_activation": "sigmoid", + # ensure biases are non-zero and properly converted + "bias_initializer": "random_uniform", + } + if rnn_type == "LSTM": + rnn_layer_class = keras.layers.LSTM + cudnn_rnn_layer_class = keras.layers.CuDNNLSTM + else: + rnn_layer_class = keras.layers.GRU + cudnn_rnn_layer_class = keras.layers.CuDNNGRU + rnn_layer_kwargs["reset_after"] = True + + layer = rnn_layer_class(units, **rnn_layer_kwargs) + layer = keras.layers.TimeDistributed(layer) + + cudnn_layer = cudnn_rnn_layer_class(units) + cudnn_layer = keras.layers.TimeDistributed(cudnn_layer) + + model = self._make_nested_model(input_shape, layer) + cudnn_model = self._make_nested_model(input_shape, cudnn_layer) + + if to_cudnn: + self._convert_model_weights(model, cudnn_model) + else: + self._convert_model_weights(cudnn_model, model) + + self.assertAllClose( + model.predict(inputs), cudnn_model.predict(inputs), atol=1e-4 + ) + + @tf_test_utils.run_gpu_only + def test_cudnnrnn_bidirectional(self): + rnn = keras.layers.CuDNNGRU + samples = 2 + dim = 2 + timesteps = 2 + output_dim = 2 + mode = "concat" + + x = np.random.random((samples, timesteps, dim)) + target_dim = 2 * output_dim if mode == "concat" else output_dim + y = np.random.random((samples, target_dim)) + + # test with Sequential model + model = keras.Sequential() + model.add( + keras.layers.Bidirectional( + rnn(output_dim), merge_mode=mode, input_shape=(None, dim) + ) + ) + model.compile(loss="mse", optimizer="rmsprop") + model.fit(x, y, epochs=1, batch_size=1) + + # test config + model.get_config() + model = keras.models.model_from_json(model.to_json()) + model.summary() + + # test stacked bidirectional layers + 
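A small sketch of the merge_mode="concat" doubling that `target_dim = 2 * output_dim` accounts for in the bidirectional test, with GRU standing in for CuDNNGRU (an assumption):

import numpy as np
import keras

model = keras.Sequential(
    [keras.layers.Bidirectional(keras.layers.GRU(2), input_shape=(None, 3))]
)
out = model.predict(np.ones((1, 4, 3), dtype="float32"))
print(out.shape)  # (1, 4): forward and backward outputs concatenated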
model = keras.Sequential() + model.add( + keras.layers.Bidirectional( + rnn(output_dim, return_sequences=True), + merge_mode=mode, + input_shape=(None, dim), + ) + ) + model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode)) + model.compile(loss="mse", optimizer=R"rmsprop") + model.fit(x, y, epochs=1, batch_size=1) + + # test with functional API + inputs = keras.Input((timesteps, dim)) + outputs = keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode)( + inputs + ) + model = keras.Model(inputs, outputs) + model.compile(loss="mse", optimizer=R"rmsprop") + model.fit(x, y, epochs=1, batch_size=1) + + # Bidirectional and stateful + inputs = keras.Input(batch_shape=(1, timesteps, dim)) + outputs = keras.layers.Bidirectional( + rnn(output_dim, stateful=True), merge_mode=mode + )(inputs) + model = keras.Model(inputs, outputs) + model.compile(loss="mse", optimizer="rmsprop") + model.fit(x, y, epochs=1, batch_size=1) + + @tf_test_utils.run_gpu_only + def test_preprocess_weights_for_loading_gru_incompatible(self): + """Test loading weights between incompatible layers. + + Should fail fast with an exception. + """ + input_shape = (3, 5) + + def gru(cudnn=False, **kwargs): + layer_class = keras.layers.CuDNNGRU if cudnn else keras.layers.GRUV1 + return layer_class(2, input_shape=input_shape, **kwargs) + + def get_layer_weights(layer): + layer.build(input_shape=input_shape) + return layer.get_weights() + + def assert_not_compatible(src, dest, message): + with self.assertRaises(ValueError) as ex: + keras.saving.legacy.hdf5_format.preprocess_weights_for_loading( + dest, get_layer_weights(src) + ) + self.assertIn(message, str(ex.exception)) + + assert_not_compatible( + gru(), + gru(cudnn=True), + "GRU(reset_after=False) is not compatible with CuDNNGRU", + ) + assert_not_compatible( + gru(cudnn=True), + gru(), + "CuDNNGRU is not compatible with GRU(reset_after=False)", + ) + assert_not_compatible( + gru(), + gru(reset_after=True), + "GRU(reset_after=False) is not compatible with " + "GRU(reset_after=True)", + ) + assert_not_compatible( + gru(reset_after=True), + gru(), + "GRU(reset_after=True) is not compatible with " + "GRU(reset_after=False)", + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/dropout_rnn_cell_mixin.py b/keras/layers/rnn/dropout_rnn_cell_mixin.py index 43c85271b479..d2ee109fc9ad 100644 --- a/keras/layers/rnn/dropout_rnn_cell_mixin.py +++ b/keras/layers/rnn/dropout_rnn_cell_mixin.py @@ -15,159 +15,165 @@ """Mixin holding dropout fields for RNN cells.""" -from keras import backend import tensorflow.compat.v2 as tf - from tensorflow.tools.docs import doc_controls +from keras import backend + @doc_controls.do_not_generate_docs class DropoutRNNCellMixin: - """Object that hold dropout related fields for RNN Cell. - - This class is not a standalone RNN cell. It suppose to be used with a RNN cell - by multiple inheritance. Any cell that mix with class should have following - fields: - dropout: a float number within range [0, 1). The ratio that the input - tensor need to dropout. - recurrent_dropout: a float number within range [0, 1). The ratio that the - recurrent state weights need to dropout. - _random_generator: A backend.RandomGenerator instance, which will be used - to produce outputs based on the inputs and dropout rate. - This object will create and cache created dropout masks, and reuse them for - the incoming data, so that the same mask is used for every batch input. 
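The caching contract this docstring describes can be observed on any cell that uses the mixin; a sketch with `LSTMCell` (chosen as an assumption; any dropout-capable cell works), run eagerly:

import tensorflow.compat.v2 as tf
import keras

cell = keras.layers.LSTMCell(4, dropout=0.5)
x = tf.ones((2, 8))
m1 = cell.get_dropout_mask_for_cell(x, training=True)
m2 = cell.get_dropout_mask_for_cell(x, training=True)
print(m1 is m2)  # True: the cached mask is reused across timesteps
cell.reset_dropout_mask()
print(cell.get_dropout_mask_for_cell(x, training=True) is m1)  # False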
- """ - - def __init__(self, *args, **kwargs): - self._create_non_trackable_mask_cache() - super().__init__(*args, **kwargs) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _create_non_trackable_mask_cache(self): - """Create the cache for dropout and recurrent dropout mask. - - Note that the following two masks will be used in "graph function" mode, - e.g. these masks are symbolic tensors. In eager mode, the `eager_*_mask` - tensors will be generated differently than in the "graph function" case, - and they will be cached. - - Also note that in graph mode, we still cache those masks only because the - RNN could be created with `unroll=True`. In that case, the `cell.call()` - function will be invoked multiple times, and we want to ensure same mask - is used every time. - - Also the caches are created without tracking. Since they are not picklable - by python when deepcopy, we don't want `layer._obj_reference_counts_dict` - to track it by default. - """ - self._dropout_mask_cache = backend.ContextValueCache( - self._create_dropout_mask) - self._recurrent_dropout_mask_cache = backend.ContextValueCache( - self._create_recurrent_dropout_mask) - - def reset_dropout_mask(self): - """Reset the cached dropout masks if any. - - This is important for the RNN layer to invoke this in it `call()` method so - that the cached mask is cleared before calling the `cell.call()`. The mask - should be cached across the timestep within the same batch, but shouldn't - be cached between batches. Otherwise it will introduce unreasonable bias - against certain index of data within the batch. + """Object that hold dropout related fields for RNN Cell. + + This class is not a standalone RNN cell. It suppose to be used with a RNN + cell by multiple inheritance. Any cell that mix with class should have + following fields: + dropout: a float number within range [0, 1). The ratio that the input + tensor need to dropout. + recurrent_dropout: a float number within range [0, 1). The ratio that the + recurrent state weights need to dropout. + _random_generator: A backend.RandomGenerator instance, which will be used + to produce outputs based on the inputs and dropout rate. + This object will create and cache created dropout masks, and reuse them for + the incoming data, so that the same mask is used for every batch input. """ - self._dropout_mask_cache.clear() - def reset_recurrent_dropout_mask(self): - """Reset the cached recurrent dropout masks if any. - - This is important for the RNN layer to invoke this in it call() method so - that the cached mask is cleared before calling the cell.call(). The mask - should be cached across the timestep within the same batch, but shouldn't - be cached between batches. Otherwise it will introduce unreasonable bias - against certain index of data within the batch. - """ - self._recurrent_dropout_mask_cache.clear() - - def _create_dropout_mask(self, inputs, training, count=1): - return _generate_dropout_mask( - self._random_generator, - tf.ones_like(inputs), - self.dropout, - training=training, - count=count) - - def _create_recurrent_dropout_mask(self, inputs, training, count=1): - return _generate_dropout_mask( - self._random_generator, - tf.ones_like(inputs), - self.recurrent_dropout, - training=training, - count=count) - - def get_dropout_mask_for_cell(self, inputs, training, count=1): - """Get the dropout mask for RNN cell's input. - - It will create mask based on context if there isn't any existing cached - mask. 
If a new mask is generated, it will update the cache in the cell. - - Args: - inputs: The input tensor whose shape will be used to generate dropout - mask. - training: Boolean tensor, whether its in training mode, dropout will be - ignored in non-training mode. - count: Int, how many dropout mask will be generated. It is useful for cell - that has internal weights fused together. - Returns: - List of mask tensor, generated or cached mask based on context. - """ - if self.dropout == 0: - return None - init_kwargs = dict(inputs=inputs, training=training, count=count) - return self._dropout_mask_cache.setdefault(kwargs=init_kwargs) - - def get_recurrent_dropout_mask_for_cell(self, inputs, training, count=1): - """Get the recurrent dropout mask for RNN cell. - - It will create mask based on context if there isn't any existing cached - mask. If a new mask is generated, it will update the cache in the cell. - - Args: - inputs: The input tensor whose shape will be used to generate dropout - mask. - training: Boolean tensor, whether its in training mode, dropout will be - ignored in non-training mode. - count: Int, how many dropout mask will be generated. It is useful for cell - that has internal weights fused together. - Returns: - List of mask tensor, generated or cached mask based on context. - """ - if self.recurrent_dropout == 0: - return None - init_kwargs = dict(inputs=inputs, training=training, count=count) - return self._recurrent_dropout_mask_cache.setdefault(kwargs=init_kwargs) - - def __getstate__(self): - # Used for deepcopy. The caching can't be pickled by python, since it will - # contain tensor and graph. - state = super().__getstate__() - state.pop('_dropout_mask_cache', None) - state.pop('_recurrent_dropout_mask_cache', None) - return state - - def __setstate__(self, state): - state['_dropout_mask_cache'] = backend.ContextValueCache( - self._create_dropout_mask) - state['_recurrent_dropout_mask_cache'] = backend.ContextValueCache( - self._create_recurrent_dropout_mask) - super().__setstate__(state) + def __init__(self, *args, **kwargs): + self._create_non_trackable_mask_cache() + super().__init__(*args, **kwargs) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _create_non_trackable_mask_cache(self): + """Create the cache for dropout and recurrent dropout mask. + + Note that the following two masks will be used in "graph function" mode, + e.g. these masks are symbolic tensors. In eager mode, the `eager_*_mask` + tensors will be generated differently than in the "graph function" case, + and they will be cached. + + Also note that in graph mode, we still cache those masks only because + the RNN could be created with `unroll=True`. In that case, the + `cell.call()` function will be invoked multiple times, and we want to + ensure same mask is used every time. + + Also the caches are created without tracking. Since they are not + pickleable by python when deepcopy, we don't want + `layer._obj_reference_counts_dict` to track it by default. + """ + self._dropout_mask_cache = backend.ContextValueCache( + self._create_dropout_mask + ) + self._recurrent_dropout_mask_cache = backend.ContextValueCache( + self._create_recurrent_dropout_mask + ) + + def reset_dropout_mask(self): + """Reset the cached dropout masks if any. + + This is important for the RNN layer to invoke this in it `call()` method + so that the cached mask is cleared before calling the `cell.call()`. 
The + mask should be cached across the timestep within the same batch, but + shouldn't be cached between batches. Otherwise it will introduce + unreasonable bias against certain index of data within the batch. + """ + self._dropout_mask_cache.clear() + + def reset_recurrent_dropout_mask(self): + """Reset the cached recurrent dropout masks if any. + + This is important for the RNN layer to invoke this in it call() method + so that the cached mask is cleared before calling the cell.call(). The + mask should be cached across the timestep within the same batch, but + shouldn't be cached between batches. Otherwise it will introduce + unreasonable bias against certain index of data within the batch. + """ + self._recurrent_dropout_mask_cache.clear() + + def _create_dropout_mask(self, inputs, training, count=1): + return _generate_dropout_mask( + self._random_generator, + tf.ones_like(inputs), + self.dropout, + training=training, + count=count, + ) + + def _create_recurrent_dropout_mask(self, inputs, training, count=1): + return _generate_dropout_mask( + self._random_generator, + tf.ones_like(inputs), + self.recurrent_dropout, + training=training, + count=count, + ) + + def get_dropout_mask_for_cell(self, inputs, training, count=1): + """Get the dropout mask for RNN cell's input. + + It will create mask based on context if there isn't any existing cached + mask. If a new mask is generated, it will update the cache in the cell. + + Args: + inputs: The input tensor whose shape will be used to generate dropout + mask. + training: Boolean tensor, whether its in training mode, dropout will + be ignored in non-training mode. + count: Int, how many dropout mask will be generated. It is useful for + cell that has internal weights fused together. + Returns: + List of mask tensor, generated or cached mask based on context. + """ + if self.dropout == 0: + return None + init_kwargs = dict(inputs=inputs, training=training, count=count) + return self._dropout_mask_cache.setdefault(kwargs=init_kwargs) + + def get_recurrent_dropout_mask_for_cell(self, inputs, training, count=1): + """Get the recurrent dropout mask for RNN cell. + + It will create mask based on context if there isn't any existing cached + mask. If a new mask is generated, it will update the cache in the cell. + + Args: + inputs: The input tensor whose shape will be used to generate dropout + mask. + training: Boolean tensor, whether its in training mode, dropout will + be ignored in non-training mode. + count: Int, how many dropout mask will be generated. It is useful for + cell that has internal weights fused together. + Returns: + List of mask tensor, generated or cached mask based on context. + """ + if self.recurrent_dropout == 0: + return None + init_kwargs = dict(inputs=inputs, training=training, count=count) + return self._recurrent_dropout_mask_cache.setdefault(kwargs=init_kwargs) + + def __getstate__(self): + # Used for deepcopy. The caching can't be pickled by python, since it + # will contain tensor and graph. 
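This `__getstate__`/`__setstate__` pair is what lets dropout-capable cells survive `copy.deepcopy`: the un-picklable caches are dropped on the way out and rebuilt on the way in. A quick check (LSTMCell as an assumed example; exact deepcopy support may vary by version):

import copy

import keras

cell = keras.layers.LSTMCell(3, dropout=0.2)
clone = copy.deepcopy(cell)  # would fail if cached graph tensors were pickled
print(clone.dropout)  # 0.2; the mask caches were recreated, not copied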
+ state = super().__getstate__() + state.pop("_dropout_mask_cache", None) + state.pop("_recurrent_dropout_mask_cache", None) + return state + + def __setstate__(self, state): + state["_dropout_mask_cache"] = backend.ContextValueCache( + self._create_dropout_mask + ) + state["_recurrent_dropout_mask_cache"] = backend.ContextValueCache( + self._create_recurrent_dropout_mask + ) + super().__setstate__(state) def _generate_dropout_mask(generator, ones, rate, training=None, count=1): - def dropped_inputs(): - return generator.dropout(ones, rate) - - if count > 1: - return [ - backend.in_train_phase(dropped_inputs, ones, training=training) - for _ in range(count) - ] - return backend.in_train_phase(dropped_inputs, ones, training=training) + def dropped_inputs(): + return generator.dropout(ones, rate) + + if count > 1: + return [ + backend.in_train_phase(dropped_inputs, ones, training=training) + for _ in range(count) + ] + return backend.in_train_phase(dropped_inputs, ones, training=training) diff --git a/keras/layers/rnn/gru.py b/keras/layers/rnn/gru.py index 99a172c9bc9f..855b2561c29a 100644 --- a/keras/layers/rnn/gru.py +++ b/keras/layers/rnn/gru.py @@ -13,10 +13,12 @@ # limitations under the License. # ============================================================================== """Gated Recurrent Unit layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import uuid +import tensorflow.compat.v2 as tf + from keras import activations from keras import backend from keras import constraints @@ -29,1121 +31,1270 @@ from keras.layers.rnn.base_rnn import RNN from keras.layers.rnn.dropout_rnn_cell_mixin import DropoutRNNCellMixin from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export - RECURRENT_DROPOUT_WARNING_MSG = ( - 'RNN `implementation=2` is not supported when `recurrent_dropout` is set. ' - 'Using `implementation=1`.') + "RNN `implementation=2` is not supported when `recurrent_dropout` is set. " + "Using `implementation=1`." +) -@keras_export('keras.layers.GRUCell', v1=[]) +@keras_export("keras.layers.GRUCell", v1=[]) class GRUCell(DropoutRNNCellMixin, base_layer.BaseRandomLayer): - """Cell class for the GRU layer. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - This class processes one step within the whole time sequence input, whereas - `tf.keras.layer.GRU` processes the whole sequence. - - For example: - - >>> inputs = tf.random.normal([32, 10, 8]) - >>> rnn = tf.keras.layers.RNN(tf.keras.layers.GRUCell(4)) - >>> output = rnn(inputs) - >>> print(output.shape) - (32, 4) - >>> rnn = tf.keras.layers.RNN( - ... tf.keras.layers.GRUCell(4), - ... return_sequences=True, - ... return_state=True) - >>> whole_sequence_output, final_state = rnn(inputs) - >>> print(whole_sequence_output.shape) - (32, 10, 4) - >>> print(final_state.shape) - (32, 4) - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. Default: hyperbolic tangent - (`tanh`). If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use for the recurrent step. - Default: sigmoid (`sigmoid`). If you pass `None`, no activation is - applied (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, (default `True`), whether the layer uses a bias vector. 
- kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. Default: - `glorot_uniform`. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, used for the linear transformation of the recurrent state. - Default: `orthogonal`. - bias_initializer: Initializer for the bias vector. Default: `zeros`. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. Default: `None`. - bias_regularizer: Regularizer function applied to the bias vector. Default: - `None`. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. Default: `None`. - bias_constraint: Constraint function applied to the bias vector. Default: - `None`. - dropout: Float between 0 and 1. Fraction of the units to drop for the - linear transformation of the inputs. Default: 0. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. Default: 0. - reset_after: GRU convention (whether to apply reset gate after or - before matrix multiplication). False = "before", - True = "after" (default and cuDNN compatible). - - Call arguments: - inputs: A 2D tensor, with shape of `[batch, feature]`. - states: A 2D tensor with shape of `[batch, units]`, which is the state from - the previous time step. For timestep 0, the initial state provided by user - will be feed to cell. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. Only relevant when `dropout` or - `recurrent_dropout` is used. - """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - reset_after=True, - **kwargs): - if units < 0: - raise ValueError(f'Received an invalid value for argument `units`, ' - f'expected a positive integer, got {units}.') - # By default use cached variable under v2 mode, see b/143699808. 
- if tf.compat.v1.executing_eagerly_outside_functions(): - self._enable_caching_device = kwargs.pop('enable_caching_device', True) - else: - self._enable_caching_device = kwargs.pop('enable_caching_device', False) - super().__init__(**kwargs) - self.units = units - self.activation = activations.get(activation) - self.recurrent_activation = activations.get(recurrent_activation) - self.use_bias = use_bias - - self.kernel_initializer = initializers.get(kernel_initializer) - self.recurrent_initializer = initializers.get(recurrent_initializer) - self.bias_initializer = initializers.get(bias_initializer) - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.recurrent_regularizer = regularizers.get(recurrent_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.recurrent_constraint = constraints.get(recurrent_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - self.dropout = min(1., max(0., dropout)) - self.recurrent_dropout = min(1., max(0., recurrent_dropout)) - - implementation = kwargs.pop('implementation', 2) - if self.recurrent_dropout != 0 and implementation != 1: - logging.debug(RECURRENT_DROPOUT_WARNING_MSG) - self.implementation = 1 - else: - self.implementation = implementation - self.reset_after = reset_after - self.state_size = self.units - self.output_size = self.units - - @tf_utils.shape_type_conversion - def build(self, input_shape): - input_dim = input_shape[-1] - default_caching_device = rnn_utils.caching_device(self) - self.kernel = self.add_weight( - shape=(input_dim, self.units * 3), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - caching_device=default_caching_device) - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units * 3), - name='recurrent_kernel', - initializer=self.recurrent_initializer, - regularizer=self.recurrent_regularizer, - constraint=self.recurrent_constraint, - caching_device=default_caching_device) - - if self.use_bias: - if not self.reset_after: - bias_shape = (3 * self.units,) - else: - # separate biases for input and recurrent kernels - # Note: the shape is intentionally different from CuDNNGRU biases - # `(2 * 3 * self.units,)`, so that we can distinguish the classes - # when loading and converting saved weights. - bias_shape = (2, 3 * self.units) - self.bias = self.add_weight(shape=bias_shape, - name='bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - caching_device=default_caching_device) - else: - self.bias = None - self.built = True - - def call(self, inputs, states, training=None): - h_tm1 = states[0] if tf.nest.is_nested( - states) else states # previous memory - - dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=3) - rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( - h_tm1, training, count=3) - - if self.use_bias: - if not self.reset_after: - input_bias, recurrent_bias = self.bias, None - else: - input_bias, recurrent_bias = tf.unstack(self.bias) - - if self.implementation == 1: - if 0. 
< self.dropout < 1.: - inputs_z = inputs * dp_mask[0] - inputs_r = inputs * dp_mask[1] - inputs_h = inputs * dp_mask[2] - else: - inputs_z = inputs - inputs_r = inputs - inputs_h = inputs - - x_z = backend.dot(inputs_z, self.kernel[:, :self.units]) - x_r = backend.dot(inputs_r, self.kernel[:, self.units:self.units * 2]) - x_h = backend.dot(inputs_h, self.kernel[:, self.units * 2:]) - - if self.use_bias: - x_z = backend.bias_add(x_z, input_bias[:self.units]) - x_r = backend.bias_add(x_r, input_bias[self.units: self.units * 2]) - x_h = backend.bias_add(x_h, input_bias[self.units * 2:]) - - if 0. < self.recurrent_dropout < 1.: - h_tm1_z = h_tm1 * rec_dp_mask[0] - h_tm1_r = h_tm1 * rec_dp_mask[1] - h_tm1_h = h_tm1 * rec_dp_mask[2] - else: - h_tm1_z = h_tm1 - h_tm1_r = h_tm1 - h_tm1_h = h_tm1 - - recurrent_z = backend.dot(h_tm1_z, self.recurrent_kernel[:, :self.units]) - recurrent_r = backend.dot( - h_tm1_r, self.recurrent_kernel[:, self.units:self.units * 2]) - if self.reset_after and self.use_bias: - recurrent_z = backend.bias_add(recurrent_z, recurrent_bias[:self.units]) - recurrent_r = backend.bias_add( - recurrent_r, recurrent_bias[self.units:self.units * 2]) - - z = self.recurrent_activation(x_z + recurrent_z) - r = self.recurrent_activation(x_r + recurrent_r) - - # reset gate applied after/before matrix multiplication - if self.reset_after: - recurrent_h = backend.dot( - h_tm1_h, self.recurrent_kernel[:, self.units * 2:]) - if self.use_bias: - recurrent_h = backend.bias_add( - recurrent_h, recurrent_bias[self.units * 2:]) - recurrent_h = r * recurrent_h - else: - recurrent_h = backend.dot( - r * h_tm1_h, self.recurrent_kernel[:, self.units * 2:]) - - hh = self.activation(x_h + recurrent_h) - else: - if 0. < self.dropout < 1.: - inputs = inputs * dp_mask[0] - - # inputs projected by all gate matrices at once - matrix_x = backend.dot(inputs, self.kernel) - if self.use_bias: - # biases: bias_z_i, bias_r_i, bias_h_i - matrix_x = backend.bias_add(matrix_x, input_bias) - - x_z, x_r, x_h = tf.split(matrix_x, 3, axis=-1) + """Cell class for the GRU layer. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. + + This class processes one step within the whole time sequence input, whereas + `tf.keras.layer.GRU` processes the whole sequence. + + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> rnn = tf.keras.layers.RNN(tf.keras.layers.GRUCell(4)) + >>> output = rnn(inputs) + >>> print(output.shape) + (32, 4) + >>> rnn = tf.keras.layers.RNN( + ... tf.keras.layers.GRUCell(4), + ... return_sequences=True, + ... return_state=True) + >>> whole_sequence_output, final_state = rnn(inputs) + >>> print(whole_sequence_output.shape) + (32, 10, 4) + >>> print(final_state.shape) + (32, 4) + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. Default: hyperbolic tangent + (`tanh`). If you pass None, no activation is applied + (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use for the recurrent step. + Default: sigmoid (`sigmoid`). If you pass `None`, no activation is + applied (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, (default `True`), whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. Default: + `glorot_uniform`. 
+ recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, used for the linear transformation of the recurrent + state. Default: `orthogonal`. + bias_initializer: Initializer for the bias vector. Default: `zeros`. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_regularizer: Regularizer function applied to the bias vector. + Default: `None`. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_constraint: Constraint function applied to the bias vector. Default: + `None`. + dropout: Float between 0 and 1. Fraction of the units to drop for the + linear transformation of the inputs. Default: 0. + recurrent_dropout: Float between 0 and 1. Fraction of the units to drop + for the linear transformation of the recurrent state. Default: 0. + reset_after: GRU convention (whether to apply reset gate after or + before matrix multiplication). False = "before", + True = "after" (default and cuDNN compatible). + + Call arguments: + inputs: A 2D tensor, with shape of `[batch, feature]`. + states: A 2D tensor with shape of `[batch, units]`, which is the state + from the previous time step. For timestep 0, the initial state provided + by user will be feed to cell. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. Only relevant when `dropout` or + `recurrent_dropout` is used. + """ + + def __init__( + self, + units, + activation="tanh", + recurrent_activation="sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + reset_after=True, + **kwargs, + ): + if units <= 0: + raise ValueError( + "Received an invalid value for argument `units`, " + f"expected a positive integer, got {units}." + ) + # By default use cached variable under v2 mode, see b/143699808. 
+ if tf.compat.v1.executing_eagerly_outside_functions(): + self._enable_caching_device = kwargs.pop( + "enable_caching_device", True + ) + else: + self._enable_caching_device = kwargs.pop( + "enable_caching_device", False + ) + super().__init__(**kwargs) + self.units = units + self.activation = activations.get(activation) + self.recurrent_activation = activations.get(recurrent_activation) + self.use_bias = use_bias + + self.kernel_initializer = initializers.get(kernel_initializer) + self.recurrent_initializer = initializers.get(recurrent_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.recurrent_regularizer = regularizers.get(recurrent_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.recurrent_constraint = constraints.get(recurrent_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.dropout = min(1.0, max(0.0, dropout)) + self.recurrent_dropout = min(1.0, max(0.0, recurrent_dropout)) + + implementation = kwargs.pop("implementation", 2) + if self.recurrent_dropout != 0 and implementation != 1: + logging.debug(RECURRENT_DROPOUT_WARNING_MSG) + self.implementation = 1 + else: + self.implementation = implementation + self.reset_after = reset_after + self.state_size = self.units + self.output_size = self.units + + @tf_utils.shape_type_conversion + def build(self, input_shape): + super().build(input_shape) + input_dim = input_shape[-1] + default_caching_device = rnn_utils.caching_device(self) + self.kernel = self.add_weight( + shape=(input_dim, self.units * 3), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + caching_device=default_caching_device, + ) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units * 3), + name="recurrent_kernel", + initializer=self.recurrent_initializer, + regularizer=self.recurrent_regularizer, + constraint=self.recurrent_constraint, + caching_device=default_caching_device, + ) - if self.reset_after: - # hidden state projected by all gate matrices at once - matrix_inner = backend.dot(h_tm1, self.recurrent_kernel) if self.use_bias: - matrix_inner = backend.bias_add(matrix_inner, recurrent_bias) - else: - # hidden state projected separately for update/reset and new - matrix_inner = backend.dot( - h_tm1, self.recurrent_kernel[:, :2 * self.units]) - - recurrent_z, recurrent_r, recurrent_h = tf.split( - matrix_inner, [self.units, self.units, -1], axis=-1) - - z = self.recurrent_activation(x_z + recurrent_z) - r = self.recurrent_activation(x_r + recurrent_r) - - if self.reset_after: - recurrent_h = r * recurrent_h - else: - recurrent_h = backend.dot( - r * h_tm1, self.recurrent_kernel[:, 2 * self.units:]) - - hh = self.activation(x_h + recurrent_h) - # previous and candidate state mixed by update gate - h = z * h_tm1 + (1 - z) * hh - new_state = [h] if tf.nest.is_nested(states) else h - return h, new_state - - def get_config(self): - config = { - 'units': self.units, - 'activation': activations.serialize(self.activation), - 'recurrent_activation': - activations.serialize(self.recurrent_activation), - 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 
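The constructor above silently downgrades `implementation=2` whenever `recurrent_dropout` is non-zero, logging RECURRENT_DROPOUT_WARNING_MSG at debug level; a quick check of that fallback:

import keras

cell = keras.layers.GRUCell(4, recurrent_dropout=0.1, implementation=2)
print(cell.implementation)  # 1: forced by the non-zero recurrent_dropout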
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint), - 'dropout': self.dropout, - 'recurrent_dropout': self.recurrent_dropout, - 'implementation': self.implementation, - 'reset_after': self.reset_after - } - config.update(rnn_utils.config_for_enable_caching_device(self)) - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + if not self.reset_after: + bias_shape = (3 * self.units,) + else: + # separate biases for input and recurrent kernels + # Note: the shape is intentionally different from CuDNNGRU + # biases `(2 * 3 * self.units,)`, so that we can distinguish the + # classes when loading and converting saved weights. + bias_shape = (2, 3 * self.units) + self.bias = self.add_weight( + shape=bias_shape, + name="bias", + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + caching_device=default_caching_device, + ) + else: + self.bias = None + self.built = True - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - return rnn_utils.generate_zero_filled_state_for_cell( - self, inputs, batch_size, dtype) + def call(self, inputs, states, training=None): + h_tm1 = ( + states[0] if tf.nest.is_nested(states) else states + ) # previous memory + dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=3) + rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( + h_tm1, training, count=3 + ) -@keras_export('keras.layers.GRU', v1=[]) + if self.use_bias: + if not self.reset_after: + input_bias, recurrent_bias = self.bias, None + else: + input_bias, recurrent_bias = tf.unstack(self.bias) + + if self.implementation == 1: + if 0.0 < self.dropout < 1.0: + inputs_z = inputs * dp_mask[0] + inputs_r = inputs * dp_mask[1] + inputs_h = inputs * dp_mask[2] + else: + inputs_z = inputs + inputs_r = inputs + inputs_h = inputs + + x_z = backend.dot(inputs_z, self.kernel[:, : self.units]) + x_r = backend.dot( + inputs_r, self.kernel[:, self.units : self.units * 2] + ) + x_h = backend.dot(inputs_h, self.kernel[:, self.units * 2 :]) + + if self.use_bias: + x_z = backend.bias_add(x_z, input_bias[: self.units]) + x_r = backend.bias_add( + x_r, input_bias[self.units : self.units * 2] + ) + x_h = backend.bias_add(x_h, input_bias[self.units * 2 :]) + + if 0.0 < self.recurrent_dropout < 1.0: + h_tm1_z = h_tm1 * rec_dp_mask[0] + h_tm1_r = h_tm1 * rec_dp_mask[1] + h_tm1_h = h_tm1 * rec_dp_mask[2] + else: + h_tm1_z = h_tm1 + h_tm1_r = h_tm1 + h_tm1_h = h_tm1 + + recurrent_z = backend.dot( + h_tm1_z, self.recurrent_kernel[:, : self.units] + ) + recurrent_r = backend.dot( + h_tm1_r, self.recurrent_kernel[:, self.units : self.units * 2] + ) + if self.reset_after and self.use_bias: + recurrent_z = backend.bias_add( + recurrent_z, recurrent_bias[: self.units] + ) + recurrent_r = backend.bias_add( + recurrent_r, recurrent_bias[self.units : self.units * 2] + ) + + z = self.recurrent_activation(x_z + recurrent_z) + r = self.recurrent_activation(x_r + recurrent_r) + + # reset gate applied after/before matrix multiplication + if self.reset_after: + recurrent_h = backend.dot( + h_tm1_h, self.recurrent_kernel[:, self.units * 2 :] + 
) + if self.use_bias: + recurrent_h = backend.bias_add( + recurrent_h, recurrent_bias[self.units * 2 :] + ) + recurrent_h = r * recurrent_h + else: + recurrent_h = backend.dot( + r * h_tm1_h, self.recurrent_kernel[:, self.units * 2 :] + ) + + hh = self.activation(x_h + recurrent_h) + else: + if 0.0 < self.dropout < 1.0: + inputs = inputs * dp_mask[0] + + # inputs projected by all gate matrices at once + matrix_x = backend.dot(inputs, self.kernel) + if self.use_bias: + # biases: bias_z_i, bias_r_i, bias_h_i + matrix_x = backend.bias_add(matrix_x, input_bias) + + x_z, x_r, x_h = tf.split(matrix_x, 3, axis=-1) + + if self.reset_after: + # hidden state projected by all gate matrices at once + matrix_inner = backend.dot(h_tm1, self.recurrent_kernel) + if self.use_bias: + matrix_inner = backend.bias_add( + matrix_inner, recurrent_bias + ) + else: + # hidden state projected separately for update/reset and new + matrix_inner = backend.dot( + h_tm1, self.recurrent_kernel[:, : 2 * self.units] + ) + + recurrent_z, recurrent_r, recurrent_h = tf.split( + matrix_inner, [self.units, self.units, -1], axis=-1 + ) + + z = self.recurrent_activation(x_z + recurrent_z) + r = self.recurrent_activation(x_r + recurrent_r) + + if self.reset_after: + recurrent_h = r * recurrent_h + else: + recurrent_h = backend.dot( + r * h_tm1, self.recurrent_kernel[:, 2 * self.units :] + ) + + hh = self.activation(x_h + recurrent_h) + # previous and candidate state mixed by update gate + h = z * h_tm1 + (1 - z) * hh + new_state = [h] if tf.nest.is_nested(states) else h + return h, new_state + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + "implementation": self.implementation, + "reset_after": self.reset_after, + } + config.update(rnn_utils.config_for_enable_caching_device(self)) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def get_initial_state(self, inputs=None, batch_size=None, dtype=None): + return rnn_utils.generate_zero_filled_state_for_cell( + self, inputs, batch_size, dtype + ) + + +@keras_export("keras.layers.GRU", v1=[]) class GRU(DropoutRNNCellMixin, RNN, base_layer.BaseRandomLayer): - """Gated Recurrent Unit - Cho et al. 2014. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - Based on available runtime hardware and constraints, this layer - will choose different implementations (cuDNN-based or pure-TensorFlow) - to maximize the performance. 
If a GPU is available and all - the arguments to the layer meet the requirement of the cuDNN kernel - (see below for details), the layer will use a fast cuDNN implementation. - - The requirements to use the cuDNN implementation are: - - 1. `activation` == `tanh` - 2. `recurrent_activation` == `sigmoid` - 3. `recurrent_dropout` == 0 - 4. `unroll` is `False` - 5. `use_bias` is `True` - 6. `reset_after` is `True` - 7. Inputs, if use masking, are strictly right-padded. - 8. Eager execution is enabled in the outermost context. - - There are two variants of the GRU implementation. The default one is based on - [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to hidden - state before matrix multiplication. The other one is based on - [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed. - - The second variant is compatible with CuDNNGRU (GPU-only) and allows - inference on CPU. Thus it has separate biases for `kernel` and - `recurrent_kernel`. To use this variant, set `reset_after=True` and - `recurrent_activation='sigmoid'`. - - For example: - - >>> inputs = tf.random.normal([32, 10, 8]) - >>> gru = tf.keras.layers.GRU(4) - >>> output = gru(inputs) - >>> print(output.shape) - (32, 4) - >>> gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True) - >>> whole_sequence_output, final_state = gru(inputs) - >>> print(whole_sequence_output.shape) - (32, 10, 4) - >>> print(final_state.shape) - (32, 4) - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use - for the recurrent step. - Default: sigmoid (`sigmoid`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, (default `True`), whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. Default: - `glorot_uniform`. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, used for the linear transformation of the recurrent - state. Default: `orthogonal`. - bias_initializer: Initializer for the bias vector. Default: `zeros`. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. Default: `None`. - bias_regularizer: Regularizer function applied to the bias vector. Default: - `None`. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation"). Default: `None`. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. Default: `None`. - bias_constraint: Constraint function applied to the bias vector. Default: - `None`. - dropout: Float between 0 and 1. Fraction of the units to drop for the linear - transformation of the inputs. Default: 0. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. Default: 0. - return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. Default: `False`. - return_state: Boolean. 
Whether to return the last state in addition to the - output. Default: `False`. - go_backwards: Boolean (default `False`). - If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default False). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - unroll: Boolean (default False). - If True, the network will be unrolled, - else a symbolic loop will be used. - Unrolling can speed-up a RNN, - although it tends to be more memory-intensive. - Unrolling is only suitable for short sequences. - time_major: The shape format of the `inputs` and `outputs` tensors. - If True, the inputs and outputs will be in shape - `[timesteps, batch, feature]`, whereas in the False case, it will be - `[batch, timesteps, feature]`. Using `time_major = True` is a bit more - efficient because it avoids transposes at the beginning and end of the - RNN calculation. However, most TensorFlow data is batch-major, so by - default this function accepts input and emits output in batch-major - form. - reset_after: GRU convention (whether to apply reset gate after or - before matrix multiplication). False = "before", - True = "after" (default and cuDNN compatible). - - Call arguments: - inputs: A 3D tensor, with shape `[batch, timesteps, feature]`. - mask: Binary tensor of shape `[samples, timesteps]` indicating whether - a given timestep should be masked (optional, defaults to `None`). - An individual `True` entry indicates that the corresponding timestep - should be utilized, while a `False` entry indicates that the - corresponding timestep should be ignored. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is only relevant if `dropout` or - `recurrent_dropout` is used (optional, defaults to `None`). - initial_state: List of initial state tensors to be passed to the first - call of the cell (optional, defaults to `None` which causes creation - of zero-filled initial state tensors). - """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - time_major=False, - reset_after=True, - **kwargs): - # return_runtime is a flag for testing, which shows the real backend - # implementation chosen by grappler in graph mode. - self._return_runtime = kwargs.pop('return_runtime', False) - implementation = kwargs.pop('implementation', 2) - if implementation == 0: - logging.warning('`implementation=0` has been deprecated, ' - 'and now defaults to `implementation=2`.' - 'Please update your layer call.') - if 'enable_caching_device' in kwargs: - cell_kwargs = {'enable_caching_device': - kwargs.pop('enable_caching_device')} - else: - cell_kwargs = {} - cell = GRUCell( + """Gated Recurrent Unit - Cho et al. 2014. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. 
+ + Based on available runtime hardware and constraints, this layer + will choose different implementations (cuDNN-based or pure-TensorFlow) + to maximize the performance. If a GPU is available and all + the arguments to the layer meet the requirement of the cuDNN kernel + (see below for details), the layer will use a fast cuDNN implementation. + + The requirements to use the cuDNN implementation are: + + 1. `activation` == `tanh` + 2. `recurrent_activation` == `sigmoid` + 3. `recurrent_dropout` == 0 + 4. `unroll` is `False` + 5. `use_bias` is `True` + 6. `reset_after` is `True` + 7. Inputs, if use masking, are strictly right-padded. + 8. Eager execution is enabled in the outermost context. + + There are two variants of the GRU implementation. The default one is based + on [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to + hidden state before matrix multiplication. The other one is based on + [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed. + + The second variant is compatible with CuDNNGRU (GPU-only) and allows + inference on CPU. Thus it has separate biases for `kernel` and + `recurrent_kernel`. To use this variant, set `reset_after=True` and + `recurrent_activation='sigmoid'`. + + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> gru = tf.keras.layers.GRU(4) + >>> output = gru(inputs) + >>> print(output.shape) + (32, 4) + >>> gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True) + >>> whole_sequence_output, final_state = gru(inputs) + >>> print(whole_sequence_output.shape) + (32, 10, 4) + >>> print(final_state.shape) + (32, 4) + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use + for the recurrent step. + Default: sigmoid (`sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, (default `True`), whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. Default: + `glorot_uniform`. + recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, used for the linear transformation of the recurrent + state. Default: `orthogonal`. + bias_initializer: Initializer for the bias vector. Default: `zeros`. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_regularizer: Regularizer function applied to the bias vector. + Default: `None`. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). Default: `None`. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_constraint: Constraint function applied to the bias vector. Default: + `None`. + dropout: Float between 0 and 1. Fraction of the units to drop for the + linear transformation of the inputs. Default: 0. + recurrent_dropout: Float between 0 and 1. Fraction of the units to drop + for the linear transformation of the recurrent state. Default: 0. + return_sequences: Boolean. 
Whether to return the last output + in the output sequence, or the full sequence. Default: `False`. + return_state: Boolean. Whether to return the last state in addition to the + output. Default: `False`. + go_backwards: Boolean (default `False`). + If True, process the input sequence backwards and return the + reversed sequence. + stateful: Boolean (default False). If True, the last state + for each sample at index i in a batch will be used as initial + state for the sample of index i in the following batch. + unroll: Boolean (default False). + If True, the network will be unrolled, + else a symbolic loop will be used. + Unrolling can speed-up a RNN, + although it tends to be more memory-intensive. + Unrolling is only suitable for short sequences. + time_major: The shape format of the `inputs` and `outputs` tensors. + If True, the inputs and outputs will be in shape + `[timesteps, batch, feature]`, whereas in the False case, it will be + `[batch, timesteps, feature]`. Using `time_major = True` is a bit more + efficient because it avoids transposes at the beginning and end of the + RNN calculation. However, most TensorFlow data is batch-major, so by + default this function accepts input and emits output in batch-major + form. + reset_after: GRU convention (whether to apply reset gate after or + before matrix multiplication). False = "before", + True = "after" (default and cuDNN compatible). + + Call arguments: + inputs: A 3D tensor, with shape `[batch, timesteps, feature]`. + mask: Binary tensor of shape `[samples, timesteps]` indicating whether + a given timestep should be masked (optional). + An individual `True` entry indicates that the corresponding timestep + should be utilized, while a `False` entry indicates that the + corresponding timestep should be ignored. Defaults to `None`. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is only relevant if `dropout` or + `recurrent_dropout` is used (optional). Defaults to `None`. + initial_state: List of initial state tensors to be passed to the first + call of the cell (optional, `None` causes creation + of zero-filled initial state tensors). Defaults to `None`. + """ + + def __init__( + self, units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=implementation, - reset_after=reset_after, - dtype=kwargs.get('dtype'), - trainable=kwargs.get('trainable', True), - **cell_kwargs) - super().__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - unroll=unroll, - time_major=time_major, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.input_spec = [InputSpec(ndim=3)] - - # GPU kernel uses following setting by default and not configurable. 
- self._could_use_gpu_kernel = ( - self.activation in (activations.tanh, tf.tanh) and - self.recurrent_activation in (activations.sigmoid, tf.sigmoid) and - recurrent_dropout == 0 and not unroll and use_bias and - reset_after and tf.compat.v1.executing_eagerly_outside_functions()) - if tf.config.list_logical_devices('GPU'): - # Only show the message when there is GPU available, user will not care - # about the cuDNN if there isn't any GPU. - if self._could_use_gpu_kernel: - logging.debug(gru_lstm_utils.CUDNN_AVAILABLE_MSG % self.name) - else: - logging.warning(gru_lstm_utils.CUDNN_NOT_AVAILABLE_MSG % self.name) - - if gru_lstm_utils.use_new_gru_lstm_impl(): - self._defun_wrapper = gru_lstm_utils.DefunWrapper( - time_major, go_backwards, 'gru') + activation="tanh", + recurrent_activation="sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + unroll=False, + time_major=False, + reset_after=True, + **kwargs, + ): + # return_runtime is a flag for testing, which shows the real backend + # implementation chosen by grappler in graph mode. + self._return_runtime = kwargs.pop("return_runtime", False) + implementation = kwargs.pop("implementation", 2) + if implementation == 0: + logging.warning( + "`implementation=0` has been deprecated, " + "and now defaults to `implementation=2`." + "Please update your layer call." + ) + if "enable_caching_device" in kwargs: + cell_kwargs = { + "enable_caching_device": kwargs.pop("enable_caching_device") + } + else: + cell_kwargs = {} + cell = GRUCell( + units, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + implementation=implementation, + reset_after=reset_after, + dtype=kwargs.get("dtype"), + trainable=kwargs.get("trainable", True), + name="gru_cell", + **cell_kwargs, + ) + super().__init__( + cell, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + unroll=unroll, + time_major=time_major, + **kwargs, + ) + self.activity_regularizer = regularizers.get(activity_regularizer) + self.input_spec = [InputSpec(ndim=3)] + + # GPU kernel uses following setting by default and not configurable. + self._could_use_gpu_kernel = ( + self.activation in (activations.tanh, tf.tanh) + and self.recurrent_activation in (activations.sigmoid, tf.sigmoid) + and recurrent_dropout == 0 + and not unroll + and use_bias + and reset_after + and tf.compat.v1.executing_eagerly_outside_functions() + ) + if tf.config.list_logical_devices("GPU"): + # Only show the message when there is GPU available, user will not + # care about the cuDNN if there isn't any GPU. 
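+            # The log line below is informational only; the actual kernel is
+            # chosen per call, at run time, from the device placement and the
+            # inputs.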
+ if self._could_use_gpu_kernel: + logging.debug(gru_lstm_utils.CUDNN_AVAILABLE_MSG % self.name) + else: + logging.warning( + gru_lstm_utils.CUDNN_NOT_AVAILABLE_MSG % self.name + ) + + if gru_lstm_utils.use_new_gru_lstm_impl(): + self._defun_wrapper = gru_lstm_utils.DefunWrapper( + time_major, go_backwards, "gru" + ) + + def call(self, inputs, mask=None, training=None, initial_state=None): + # The input should be dense, padded with zeros. If a ragged input is fed + # into the layer, it is padded and the row lengths are used for masking. + inputs, row_lengths = backend.convert_inputs_if_ragged(inputs) + is_ragged_input = row_lengths is not None + self._validate_args_if_ragged(is_ragged_input, mask) + + # GRU does not support constants. Ignore it during process. + inputs, initial_state, _ = self._process_inputs( + inputs, initial_state, None + ) + + if isinstance(mask, list): + mask = mask[0] + + input_shape = backend.int_shape(inputs) + timesteps = input_shape[0] if self.time_major else input_shape[1] + + if not self._could_use_gpu_kernel: + kwargs = {"training": training} + self._maybe_reset_cell_dropout_mask(self.cell) + + def step(cell_inputs, cell_states): + return self.cell(cell_inputs, cell_states, **kwargs) + + last_output, outputs, states = backend.rnn( + step, + inputs, + initial_state, + constants=None, + go_backwards=self.go_backwards, + mask=mask, + unroll=self.unroll, + input_length=row_lengths + if row_lengths is not None + else timesteps, + time_major=self.time_major, + zero_output_for_mask=self.zero_output_for_mask, + return_all_outputs=self.return_sequences, + ) + # This is a dummy tensor for testing purpose. + runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN) + else: + last_output, outputs, runtime, states = self._defun_gru_call( + inputs, initial_state, training, mask, row_lengths + ) + + if self.stateful: + updates = [ + tf.compat.v1.assign( + self.states[0], tf.cast(states[0], self.states[0].dtype) + ) + ] + self.add_update(updates) + + if self.return_sequences: + output = backend.maybe_convert_to_ragged( + is_ragged_input, + outputs, + row_lengths, + go_backwards=self.go_backwards, + ) + else: + output = last_output - def call(self, inputs, mask=None, training=None, initial_state=None): - # The input should be dense, padded with zeros. If a ragged input is fed - # into the layer, it is padded and the row lengths are used for masking. 
- inputs, row_lengths = backend.convert_inputs_if_ragged(inputs) - is_ragged_input = (row_lengths is not None) - self._validate_args_if_ragged(is_ragged_input, mask) + if self.return_state: + return [output] + list(states) + elif self._return_runtime: + return output, runtime + else: + return output + + @property + def units(self): + return self.cell.units + + @property + def activation(self): + return self.cell.activation + + @property + def recurrent_activation(self): + return self.cell.recurrent_activation + + @property + def use_bias(self): + return self.cell.use_bias + + @property + def kernel_initializer(self): + return self.cell.kernel_initializer + + @property + def recurrent_initializer(self): + return self.cell.recurrent_initializer + + @property + def bias_initializer(self): + return self.cell.bias_initializer + + @property + def kernel_regularizer(self): + return self.cell.kernel_regularizer + + @property + def recurrent_regularizer(self): + return self.cell.recurrent_regularizer + + @property + def bias_regularizer(self): + return self.cell.bias_regularizer + + @property + def kernel_constraint(self): + return self.cell.kernel_constraint + + @property + def recurrent_constraint(self): + return self.cell.recurrent_constraint + + @property + def bias_constraint(self): + return self.cell.bias_constraint + + @property + def dropout(self): + return self.cell.dropout + + @property + def recurrent_dropout(self): + return self.cell.recurrent_dropout + + @property + def implementation(self): + return self.cell.implementation + + @property + def reset_after(self): + return self.cell.reset_after + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + "implementation": self.implementation, + "reset_after": self.reset_after, + } + config.update(rnn_utils.config_for_enable_caching_device(self.cell)) + base_config = super().get_config() + del base_config["cell"] + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + if "implementation" in config and config["implementation"] == 0: + config["implementation"] = 1 + return cls(**config) + + def _defun_gru_call( + self, inputs, initial_state, training, mask, sequence_lengths + ): + # Use the new defun approach for backend implementation swap. + # Note that different implementations need to have same function + # signature, eg, the tensor parameters need to have same shape and + # dtypes. 
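+        # Dropout is pre-applied to the inputs below because the cuDNN kernel
+        # has no dropout support of its own; only the first of the three
+        # generated masks is needed, since a single dropped-out copy of the
+        # inputs feeds all three gates.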
+
+        self.reset_dropout_mask()
+        dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
+        if dropout_mask is not None:
+            inputs = inputs * dropout_mask[0]
+
+        if gru_lstm_utils.use_new_gru_lstm_impl():
+            gru_kwargs = {
+                "inputs": inputs,
+                "init_h": gru_lstm_utils.read_variable_value(initial_state[0]),
+                "kernel": gru_lstm_utils.read_variable_value(self.cell.kernel),
+                "recurrent_kernel": gru_lstm_utils.read_variable_value(
+                    self.cell.recurrent_kernel
+                ),
+                "bias": gru_lstm_utils.read_variable_value(self.cell.bias),
+                "mask": mask,
+                "time_major": self.time_major,
+                "go_backwards": self.go_backwards,
+                "sequence_lengths": sequence_lengths,
+                "zero_output_for_mask": self.zero_output_for_mask,
+            }
+            (
+                last_output,
+                outputs,
+                new_h,
+                runtime,
+            ) = self._defun_wrapper.defun_layer(**gru_kwargs)
+        else:
+            gpu_gru_kwargs = {
+                "inputs": inputs,
+                "init_h": gru_lstm_utils.read_variable_value(initial_state[0]),
+                "kernel": gru_lstm_utils.read_variable_value(self.cell.kernel),
+                "recurrent_kernel": gru_lstm_utils.read_variable_value(
+                    self.cell.recurrent_kernel
+                ),
+                "bias": gru_lstm_utils.read_variable_value(self.cell.bias),
+                "mask": mask,
+                "time_major": self.time_major,
+                "go_backwards": self.go_backwards,
+                "sequence_lengths": sequence_lengths,
+                "return_sequences": self.return_sequences,
+            }
+            normal_gru_kwargs = gpu_gru_kwargs.copy()
+            normal_gru_kwargs.update(
+                {
+                    "zero_output_for_mask": self.zero_output_for_mask,
+                }
+            )
+
+            if tf.executing_eagerly():
+                device_type = gru_lstm_utils.get_context_device_type()
+                can_use_gpu = (
+                    # Either user specified GPU or unspecified but GPU is
+                    # available.
+                    (
+                        device_type == gru_lstm_utils.GPU_DEVICE_NAME
+                        or (
+                            device_type is None
+                            and tf.config.list_logical_devices("GPU")
+                        )
+                    )
+                    and (
+                        gru_lstm_utils.is_cudnn_supported_inputs(
+                            mask, self.time_major, sequence_lengths
+                        )
+                    )
+                )
+                # Under eager context, check the device placement and prefer
+                # the GPU implementation when it is available.
+                if can_use_gpu:
+                    last_output, outputs, new_h, runtime = gpu_gru(
+                        **gpu_gru_kwargs
+                    )
+                else:
+                    last_output, outputs, new_h, runtime = standard_gru(
+                        **normal_gru_kwargs
+                    )
+            else:
+                (
+                    last_output,
+                    outputs,
+                    new_h,
+                    runtime,
+                ) = gru_with_backend_selection(**normal_gru_kwargs)
+
+        states = [new_h]
+        return last_output, outputs, runtime, states
+
+
+def standard_gru(
+    inputs,
+    init_h,
+    kernel,
+    recurrent_kernel,
+    bias,
+    mask,
+    time_major,
+    go_backwards,
+    sequence_lengths,
+    zero_output_for_mask,
+    return_sequences,
+):
+    """GRU with standard kernel implementation.
+
+    This implementation can be run on all types of hardware.
+
+    This implementation lifts out all the layer weights and makes them
+    function parameters. It has the same number of tensor input params as the
+    cuDNN counterpart. The RNN step logic has been simplified, e.g. dropout
+    and masking are removed since the cuDNN implementation does not support
+    them.
+
+    Args:
+      inputs: Input tensor of GRU layer.
+      init_h: Initial state tensor for the cell output.
+      kernel: Weights for cell kernel.
+      recurrent_kernel: Weights for cell recurrent kernel.
+      bias: Weights for cell kernel bias and recurrent bias. The bias contains
+        the combined input_bias and recurrent_bias.
+      mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+        a given timestep should be masked. An individual `True` entry
+        indicates that the corresponding timestep should be utilized, while a
+        `False` entry indicates that the corresponding timestep should be
+        ignored.
+      time_major: Boolean, whether the inputs are in the format of
+        [time, batch, feature] or [batch, time, feature].
+      go_backwards: Boolean (default False). If True, process the input
+        sequence backwards and return the reversed sequence.
+      sequence_lengths: The lengths of all sequences coming from a variable
+        length input, such as ragged tensors. If the input has a fixed
+        timestep size, this should be None.
+      zero_output_for_mask: Boolean, whether to output zero for masked
+        timesteps.
+      return_sequences: Boolean. If True, return the recurrent outputs for all
+        timesteps in the sequence. If False, only return the output for the
+        last timestep (which consumes less memory).
+
+    Returns:
+      last_output: output tensor for the last timestep, which has shape
+        [batch, units].
+      outputs:
+        - If `return_sequences=True`: output tensor for all timesteps,
+          which has shape [batch, time, units].
+        - Else, a tensor equal to `last_output` with shape [batch, 1, units]
+      state_0: the cell output, which has the same shape as init_h.
+      runtime: constant string tensor which indicates the real runtime
+        hardware. This value is for testing purposes and should not be used
+        by the user.
+    """
+    input_shape = backend.int_shape(inputs)
+    timesteps = input_shape[0] if time_major else input_shape[1]
-
-    # GRU does not support constants. Ignore it during process.
-    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+    input_bias, recurrent_bias = tf.unstack(bias)
-
-    if isinstance(mask, list):
-      mask = mask[0]
+    def step(cell_inputs, cell_states):
+        """Step function that will be used by Keras RNN backend."""
+        h_tm1 = cell_states[0]
-
-    input_shape = backend.int_shape(inputs)
-    timesteps = input_shape[0] if self.time_major else input_shape[1]
-
-    if not self._could_use_gpu_kernel:
-      kwargs = {'training': training}
-      self._maybe_reset_cell_dropout_mask(self.cell)
-
-      def step(cell_inputs, cell_states):
-        return self.cell(cell_inputs, cell_states, **kwargs)
-
-      last_output, outputs, states = backend.rnn(
-          step,
-          inputs,
-          initial_state,
-          constants=None,
-          go_backwards=self.go_backwards,
-          mask=mask,
-          unroll=self.unroll,
-          input_length=row_lengths if row_lengths is not None else timesteps,
-          time_major=self.time_major,
-          zero_output_for_mask=self.zero_output_for_mask,
-          return_all_outputs=self.return_sequences)
-      # This is a dummy tensor for testing purpose.
-      runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN)
-    else:
-      last_output, outputs, runtime, states = self._defun_gru_call(
-          inputs, initial_state, training, mask, row_lengths)
+        # inputs projected by all gate matrices at once
+        matrix_x = backend.dot(cell_inputs, kernel)
+        matrix_x = backend.bias_add(matrix_x, input_bias)
-    if self.stateful:
-      updates = [tf.compat.v1.assign(self.states[0],
-                                     tf.cast(states[0], self.states[0].dtype))]
-      self.add_update(updates)
+        x_z, x_r, x_h = tf.split(matrix_x, 3, axis=1)
-    if self.return_sequences:
-      output = backend.maybe_convert_to_ragged(
-          is_ragged_input, outputs, row_lengths, go_backwards=self.go_backwards)
+        # hidden state projected by all gate matrices at once
+        matrix_inner = backend.dot(h_tm1, recurrent_kernel)
+        matrix_inner = backend.bias_add(matrix_inner, recurrent_bias)
+
+        recurrent_z, recurrent_r, recurrent_h = tf.split(
+            matrix_inner, 3, axis=1
+        )
+        z = tf.sigmoid(x_z + recurrent_z)
+        r = tf.sigmoid(x_r + recurrent_r)
+        hh = tf.tanh(x_h + r * recurrent_h)
+
+        # previous and candidate state mixed by update gate
+        h = z * h_tm1 + (1 - z) * hh
+        return h, [h]
+
+    last_output, outputs, new_states = backend.rnn(
+        step,
+        inputs,
+        [init_h],
+        constants=None,
+        unroll=False,
+        time_major=time_major,
+        mask=mask,
+        go_backwards=go_backwards,
+        input_length=sequence_lengths
+        if sequence_lengths is not None
+        else timesteps,
+        zero_output_for_mask=zero_output_for_mask,
+        return_all_outputs=return_sequences,
+    )
+    return (
+        last_output,
+        outputs,
+        new_states[0],
+        gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_CPU),
+    )
+
+
+def gpu_gru(
+    inputs,
+    init_h,
+    kernel,
+    recurrent_kernel,
+    bias,
+    mask,
+    time_major,
+    go_backwards,
+    sequence_lengths,
+    return_sequences,
+):
+    """GRU with cuDNN implementation which is only available for GPU."""
+    if mask is not None:
+        sequence_lengths = gru_lstm_utils.calculate_sequence_by_mask(
+            mask, time_major
+        )
+
+    if not time_major and sequence_lengths is None:
+        inputs = tf.transpose(inputs, perm=(1, 0, 2))
+        seq_axis, batch_axis = (0, 1)
     else:
-      output = last_output
-
-    if self.return_state:
-      return [output] + list(states)
-    elif self._return_runtime:
-      return output, runtime
+        seq_axis, batch_axis = (0, 1) if time_major else (1, 0)
+    # For init_h, cuDNN expects one more dim of num_layers before or after
+    # batch dim for time major or batch major inputs respectively
+    init_h = tf.expand_dims(init_h, axis=seq_axis)
+
+    weights = tf.split(kernel, 3, axis=1)
+    weights += tf.split(recurrent_kernel, 3, axis=1)
+    # Note that the bias was initialized as shape (2, 3 * units); flatten it
+    # into (6 * units).
+    bias = tf.split(backend.flatten(bias), 6)
+
+    if tf.sysconfig.get_build_info()["is_cuda_build"]:
+        # Note that the gate order for cuDNN is different from the canonical
+        # format: the canonical format is [z, r, h], whereas cuDNN's is
+        # [r, z, h]. The swaps need to be done for kernel, recurrent_kernel,
+        # input_bias, and recurrent_bias.
+        # z is update gate weights.
+        # r is reset gate weights.
+        # h is candidate ("new") state weights; a GRU has no output gate.
+        weights[0], weights[1] = weights[1], weights[0]
+        weights[3], weights[4] = weights[4], weights[3]
+        bias[0], bias[1] = bias[1], bias[0]
+        bias[3], bias[4] = bias[4], bias[3]
+
+    params = gru_lstm_utils.canonical_to_params(
+        weights=weights,
+        biases=bias,
+        shape=tf.constant([-1]),
+        transpose_weights=True,
+    )
+
+    if sequence_lengths is not None:
+        if go_backwards:
+            # Three reversals are required.
E.g., + # normal input = [1, 2, 3, 0, 0] # where 0 need to be masked + # reversed_input_to_cudnn = [3, 2, 1, 0, 0] + # output_from_cudnn = [6, 5, 4, 0, 0] + # expected_output = [0, 0, 6, 5 ,4] + inputs = tf.reverse_sequence( + inputs, + sequence_lengths, + seq_axis=seq_axis, + batch_axis=batch_axis, + ) + outputs, h, _, _, _ = tf.raw_ops.CudnnRNNV3( + input=inputs, + input_h=init_h, + input_c=0, + params=params, + is_training=True, + rnn_mode="gru", + sequence_lengths=sequence_lengths, + time_major=time_major, + ) + if go_backwards: + outputs = tf.reverse_sequence( + outputs, + sequence_lengths, + seq_axis=seq_axis, + batch_axis=batch_axis, + ) + outputs = tf.reverse(outputs, axis=[seq_axis]) else: - return output - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def recurrent_activation(self): - return self.cell.recurrent_activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - - @property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - @property - def implementation(self): - return self.cell.implementation - - @property - def reset_after(self): - return self.cell.reset_after - - def get_config(self): - config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'recurrent_activation': - activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout, - 'implementation': - self.implementation, - 'reset_after': - self.reset_after + if go_backwards: + # Reverse axis 0 since the input is already convert to time major. 
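+            # A single reversal suffices here: without padding there is
+            # nothing to keep right-aligned, and running cuDNN on the
+            # reversed inputs already yields the reversed output sequence
+            # that `go_backwards` promises.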
+            inputs = tf.reverse(inputs, axis=[0])
+        outputs, h, _, _ = tf.raw_ops.CudnnRNN(
+            input=inputs,
+            input_h=init_h,
+            input_c=0,
+            params=params,
+            is_training=True,
+            rnn_mode="gru",
+        )
+
+    last_output = outputs[-1]
+    if not time_major and sequence_lengths is None and return_sequences:
+        outputs = tf.transpose(outputs, perm=[1, 0, 2])
+    h = tf.squeeze(h, axis=seq_axis)
+
+    # In the case of variable-length input, the cuDNN kernel fills zeros for
+    # the output, whereas the default Keras behavior is to carry over the
+    # output from t-1, so that in the `return_sequences=False` case the user
+    # gets the effective final output instead of just zeros at the last
+    # timestep. In order to mimic the default Keras behavior, we copy the
+    # final h state as the last_output, since it is numerically the same as
+    # the output.
+    if sequence_lengths is not None:
+        last_output = h
+
+    # Match CPU return format
+    if not return_sequences:
+        outputs = tf.expand_dims(last_output, axis=0 if time_major else 1)
+
+    return (
+        last_output,
+        outputs,
+        h,
+        gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_GPU),
+    )
+
+
+def gru_with_backend_selection(
+    inputs,
+    init_h,
+    kernel,
+    recurrent_kernel,
+    bias,
+    mask,
+    time_major,
+    go_backwards,
+    sequence_lengths,
+    zero_output_for_mask,
+    return_sequences,
+):
+    """Call the GRU with optimized backend kernel selection.
+
+    Under the hood, this function creates two TF functions: one with the most
+    generic kernel, which can run on all devices, and a second one with the
+    cuDNN-specific kernel, which can only run on GPU.
+
+    The first function will be called with the generic GRU params, while the
+    second function is not called, but only registered in the graph. Grappler
+    will do the proper graph rewrite and swap in the optimized TF function
+    based on the device placement.
+
+    Args:
+      inputs: Input tensor of GRU layer.
+      init_h: Initial state tensor for the cell output.
+      kernel: Weights for cell kernel.
+      recurrent_kernel: Weights for cell recurrent kernel.
+      bias: Weights for cell kernel bias and recurrent bias. Only recurrent
+        bias is used in this case.
+      mask: Boolean tensor for masking out steps within the sequence.
+        An individual `True` entry indicates that the corresponding timestep
+        should be utilized, while a `False` entry indicates that the
+        corresponding timestep should be ignored.
+      time_major: Boolean, whether the inputs are in the format of
+        [time, batch, feature] or [batch, time, feature].
+      go_backwards: Boolean (default False). If True, process the input
+        sequence backwards and return the reversed sequence.
+      sequence_lengths: The lengths of all sequences coming from a variable
+        length input, such as ragged tensors. If the input has a fixed
+        timestep size, this should be None.
+      zero_output_for_mask: Boolean, whether to output zero for masked
+        timesteps.
+      return_sequences: Boolean. If True, return the recurrent outputs for
+        all timesteps in the sequence. If False, only return the output for
+        the last timestep (which consumes less memory).
+
+    Returns:
+      List of output tensors, same as standard_gru.
+ """ + params = { + "inputs": inputs, + "init_h": init_h, + "kernel": kernel, + "recurrent_kernel": recurrent_kernel, + "bias": bias, + "mask": mask, + "time_major": time_major, + "go_backwards": go_backwards, + "sequence_lengths": sequence_lengths, + "zero_output_for_mask": zero_output_for_mask, + "return_sequences": return_sequences, } - config.update(rnn_utils.config_for_enable_caching_device(self.cell)) - base_config = super().get_config() - del base_config['cell'] - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - if 'implementation' in config and config['implementation'] == 0: - config['implementation'] = 1 - return cls(**config) - - def _defun_gru_call(self, inputs, initial_state, training, mask, - sequence_lengths): - # Use the new defun approach for backend implementation swap. - # Note that different implementations need to have same function - # signature, eg, the tensor parameters need to have same shape and dtypes. - - self.reset_dropout_mask() - dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=3) - if dropout_mask is not None: - inputs = inputs * dropout_mask[0] + + def gpu_gru_with_fallback( + inputs, + init_h, + kernel, + recurrent_kernel, + bias, + mask, + time_major, + go_backwards, + sequence_lengths, + zero_output_for_mask, + return_sequences, + ): + """Use cuDNN kernel when mask is none or strictly right padded.""" + + def cudnn_gru_fn(): + return gpu_gru( + inputs=inputs, + init_h=init_h, + kernel=kernel, + recurrent_kernel=recurrent_kernel, + bias=bias, + mask=mask, + time_major=time_major, + go_backwards=go_backwards, + sequence_lengths=sequence_lengths, + return_sequences=return_sequences, + ) + + def standard_gru_fn(): + return standard_gru( + inputs=inputs, + init_h=init_h, + kernel=kernel, + recurrent_kernel=recurrent_kernel, + bias=bias, + mask=mask, + time_major=time_major, + go_backwards=go_backwards, + sequence_lengths=sequence_lengths, + zero_output_for_mask=zero_output_for_mask, + return_sequences=return_sequences, + ) + + return tf.__internal__.smart_cond.smart_cond( + gru_lstm_utils.is_cudnn_supported_inputs( + mask, time_major, sequence_lengths + ), + true_fn=cudnn_gru_fn, + false_fn=standard_gru_fn, + ) if gru_lstm_utils.use_new_gru_lstm_impl(): - gru_kwargs = { - 'inputs': - inputs, - 'init_h': - gru_lstm_utils.read_variable_value(initial_state[0]), - 'kernel': - gru_lstm_utils.read_variable_value(self.cell.kernel), - 'recurrent_kernel': - gru_lstm_utils.read_variable_value(self.cell.recurrent_kernel), - 'bias': - gru_lstm_utils.read_variable_value(self.cell.bias), - 'mask': - mask, - 'time_major': - self.time_major, - 'go_backwards': - self.go_backwards, - 'sequence_lengths': - sequence_lengths, - 'zero_output_for_mask': - self.zero_output_for_mask - } - (last_output, outputs, new_h, - runtime) = self._defun_wrapper.defun_layer(**gru_kwargs) + # Chooses the implementation dynamically based on the running device. 
+ ( + last_output, + outputs, + new_h, + runtime, + ) = tf.__internal__.execute_fn_for_device( + { + gru_lstm_utils.CPU_DEVICE_NAME: lambda: standard_gru(**params), + gru_lstm_utils.GPU_DEVICE_NAME: lambda: gpu_gru_with_fallback( + **params + ), + }, + lambda: standard_gru(**params), + ) else: - gpu_gru_kwargs = { - 'inputs': - inputs, - 'init_h': - gru_lstm_utils.read_variable_value(initial_state[0]), - 'kernel': - gru_lstm_utils.read_variable_value(self.cell.kernel), - 'recurrent_kernel': - gru_lstm_utils.read_variable_value(self.cell.recurrent_kernel), - 'bias': - gru_lstm_utils.read_variable_value(self.cell.bias), - 'mask': - mask, - 'time_major': - self.time_major, - 'go_backwards': - self.go_backwards, - 'sequence_lengths': - sequence_lengths, - 'return_sequences': - self.return_sequences - } - normal_gru_kwargs = gpu_gru_kwargs.copy() - normal_gru_kwargs.update({ - 'zero_output_for_mask': self.zero_output_for_mask, - }) - - if tf.executing_eagerly(): - device_type = gru_lstm_utils.get_context_device_type() - can_use_gpu = ( - # Either user specified GPU or unspecified but GPU is available. - (device_type == gru_lstm_utils.GPU_DEVICE_NAME or - (device_type is None and tf.config.list_logical_devices('GPU'))) - and - (mask is None or - gru_lstm_utils.is_cudnn_supported_inputs(mask, self.time_major))) - # Under eager context, check the device placement and prefer the - if can_use_gpu: - last_output, outputs, new_h, runtime = gpu_gru(**gpu_gru_kwargs) - else: - last_output, outputs, new_h, runtime = standard_gru( - **normal_gru_kwargs) - else: - last_output, outputs, new_h, runtime = gru_with_backend_selection( - **normal_gru_kwargs) - - states = [new_h] - return last_output, outputs, runtime, states - - -def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, - time_major, go_backwards, sequence_lengths, - zero_output_for_mask, return_sequences): - """GRU with standard kernel implementation. - - This implementation can be run on all types of hardware. - - This implementation lifts out all the layer weights and make them function - parameters. It has same number of tensor input params as the cuDNN - counterpart. The RNN step logic has been simplified, eg dropout and mask is - removed since cuDNN implementation does not support that. - - Args: - inputs: Input tensor of GRU layer. - init_h: Initial state tensor for the cell output. - kernel: Weights for cell kernel. - recurrent_kernel: Weights for cell recurrent kernel. - bias: Weights for cell kernel bias and recurrent bias. The bias contains the - combined input_bias and recurrent_bias. - mask: Binary tensor of shape `(samples, timesteps)` indicating whether - a given timestep should be masked. An individual `True` entry indicates - that the corresponding timestep should be utilized, while a `False` entry - indicates that the corresponding timestep should be ignored. - time_major: Boolean, whether the inputs are in the format of - [time, batch, feature] or [batch, time, feature]. - go_backwards: Boolean (default False). If True, process the input sequence - backwards and return the reversed sequence. - sequence_lengths: The lengths of all sequences coming from a variable length - input, such as ragged tensors. If the input has a fixed timestep size, - this should be None. - zero_output_for_mask: Boolean, whether to output zero for masked timestep. - return_sequences: Boolean. If True, return the recurrent outputs for all - timesteps in the sequence. 
If False, only return the output for the - last timestep (which consumes less memory). - - Returns: - last_output: output tensor for the last timestep, which has shape - [batch, units]. - outputs: - - If `return_sequences=True`: output tensor for all timesteps, - which has shape [batch, time, units]. - - Else, a tensor equal to `last_output` with shape [batch, 1, units] - state_0: the cell output, which has same shape as init_h. - runtime: constant string tensor which indicate real runtime hardware. This - value is for testing purpose and should be used by user. - """ - input_shape = backend.int_shape(inputs) - timesteps = input_shape[0] if time_major else input_shape[1] - - input_bias, recurrent_bias = tf.unstack(bias) - - def step(cell_inputs, cell_states): - """Step function that will be used by Keras RNN backend.""" - h_tm1 = cell_states[0] - - # inputs projected by all gate matrices at once - matrix_x = backend.dot(cell_inputs, kernel) - matrix_x = backend.bias_add(matrix_x, input_bias) - - x_z, x_r, x_h = tf.split(matrix_x, 3, axis=1) - - # hidden state projected by all gate matrices at once - matrix_inner = backend.dot(h_tm1, recurrent_kernel) - matrix_inner = backend.bias_add(matrix_inner, recurrent_bias) - - recurrent_z, recurrent_r, recurrent_h = tf.split(matrix_inner, 3, axis=1) - z = tf.sigmoid(x_z + recurrent_z) - r = tf.sigmoid(x_r + recurrent_r) - hh = tf.tanh(x_h + r * recurrent_h) - - # previous and candidate state mixed by update gate - h = z * h_tm1 + (1 - z) * hh - return h, [h] - - last_output, outputs, new_states = backend.rnn( - step, - inputs, [init_h], - constants=None, - unroll=False, - time_major=time_major, - mask=mask, - go_backwards=go_backwards, - input_length=sequence_lengths - if sequence_lengths is not None else timesteps, - zero_output_for_mask=zero_output_for_mask, - return_all_outputs=return_sequences) - return last_output, outputs, new_states[0], gru_lstm_utils.runtime( - gru_lstm_utils.RUNTIME_CPU) - - -def gpu_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, - go_backwards, sequence_lengths, return_sequences): - """GRU with cuDNN implementation which is only available for GPU.""" - if mask is not None: - sequence_lengths = gru_lstm_utils.calculate_sequence_by_mask( - mask, time_major) - - if not time_major and sequence_lengths is None: - inputs = tf.transpose(inputs, perm=(1, 0, 2)) - seq_axis, batch_axis = (0, 1) - else: - seq_axis, batch_axis = (0, 1) if time_major else (1, 0) - # For init_h, cuDNN expects one more dim of num_layers before or after batch - # dim for time major or batch major inputs respectively - init_h = tf.expand_dims(init_h, axis=seq_axis) - - weights = tf.split(kernel, 3, axis=1) - weights += tf.split(recurrent_kernel, 3, axis=1) - # Note that the bias was initialized as shape (2, 3 * units), flat it into - # (6 * units) - bias = tf.split(backend.flatten(bias), 6) - - if tf.sysconfig.get_build_info()['is_cuda_build']: - # Note that the gate order for cuDNN is different from the canonical format. - # canonical format is [z, r, h], whereas cuDNN is [r, z, h]. The swap need - # to be done for kernel, recurrent_kernel, input_bias, recurrent_bias. - # z is update gate weights. - # r is reset gate weights. - # h is output gate weights. 
- weights[0], weights[1] = weights[1], weights[0] - weights[3], weights[4] = weights[4], weights[3] - bias[0], bias[1] = bias[1], bias[0] - bias[3], bias[4] = bias[4], bias[3] - - params = gru_lstm_utils.canonical_to_params( - weights=weights, - biases=bias, - shape=tf.constant([-1]), - transpose_weights=True) - - if sequence_lengths is not None: - if go_backwards: - # Three reversals are required. E.g., - # normal input = [1, 2, 3, 0, 0] # where 0 need to be masked - # reversed_input_to_cudnn = [3, 2, 1, 0, 0] - # output_from_cudnn = [6, 5, 4, 0, 0] - # expected_output = [0, 0, 6, 5 ,4] - inputs = tf.reverse_sequence( - inputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis) - outputs, h, _, _, _ = tf.raw_ops.CudnnRNNV3( - input=inputs, - input_h=init_h, - input_c=0, - params=params, - is_training=True, - rnn_mode='gru', - sequence_lengths=sequence_lengths, - time_major=time_major) - if go_backwards: - outputs = tf.reverse_sequence( - outputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis) - outputs = tf.reverse(outputs, axis=[seq_axis]) - else: - if go_backwards: - # Reverse axis 0 since the input is already convert to time major. - inputs = tf.reverse(inputs, axis=[0]) - outputs, h, _, _ = tf.raw_ops.CudnnRNN( - input=inputs, input_h=init_h, input_c=0, params=params, - is_training=True, rnn_mode='gru') - - last_output = outputs[-1] - if not time_major and sequence_lengths is None and return_sequences: - outputs = tf.transpose(outputs, perm=[1, 0, 2]) - h = tf.squeeze(h, axis=seq_axis) - - # In the case of variable length input, the cudnn kernel will fill zeros for - # the output, whereas the default keras behavior is to bring over the previous - # output for t-1, so that in the return_sequence=False case, user can quickly - # get the final effect output instead just 0s at the last timestep. - # In order to mimic the default keras behavior, we copy the final h state as - # the last_output, since it is numerically same as the output. - if sequence_lengths is not None: - last_output = h - - # Match CPU return format - if not return_sequences: - outputs = tf.expand_dims(last_output, axis=0 if time_major else 1) - - return last_output, outputs, h, gru_lstm_utils.runtime( - gru_lstm_utils.RUNTIME_GPU) - - -def gru_with_backend_selection(inputs, init_h, kernel, recurrent_kernel, bias, - mask, time_major, go_backwards, sequence_lengths, - zero_output_for_mask, return_sequences): - """Call the GRU with optimized backend kernel selection. - - Under the hood, this function will create two TF function, one with the most - generic kernel and can run on all device condition, and the second one with - cuDNN specific kernel, which can only run on GPU. - - The first function will be called with normal_lstm_params, while the second - function is not called, but only registered in the graph. The Grappler will - do the proper graph rewrite and swap the optimized TF function based on the - device placement. - - Args: - inputs: Input tensor of GRU layer. - init_h: Initial state tensor for the cell output. - kernel: Weights for cell kernel. - recurrent_kernel: Weights for cell recurrent kernel. - bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias - is used in this case. - mask: Boolean tensor for mask out the steps within sequence. - An individual `True` entry indicates that the corresponding timestep - should be utilized, while a `False` entry indicates that the corresponding - timestep should be ignored. 
- time_major: Boolean, whether the inputs are in the format of - [time, batch, feature] or [batch, time, feature]. - go_backwards: Boolean (default False). If True, process the input sequence - backwards and return the reversed sequence. - sequence_lengths: The lengths of all sequences coming from a variable length - input, such as ragged tensors. If the input has a fixed timestep size, - this should be None. - zero_output_for_mask: Boolean, whether to output zero for masked timestep. - return_sequences: Boolean. If True, return the recurrent outputs for all - timesteps in the sequence. If False, only return the output for the - last timestep (which consumes less memory). - - Returns: - List of output tensors, same as standard_gru. - """ - params = { - 'inputs': inputs, - 'init_h': init_h, - 'kernel': kernel, - 'recurrent_kernel': recurrent_kernel, - 'bias': bias, - 'mask': mask, - 'time_major': time_major, - 'go_backwards': go_backwards, - 'sequence_lengths': sequence_lengths, - 'zero_output_for_mask': zero_output_for_mask, - 'return_sequences': return_sequences, - } - - def gpu_gru_with_fallback(inputs, init_h, kernel, recurrent_kernel, bias, - mask, time_major, go_backwards, sequence_lengths, - zero_output_for_mask, return_sequences): - """Use cuDNN kernel when mask is none or strictly right padded.""" - if mask is None: - return gpu_gru( - inputs=inputs, - init_h=init_h, - kernel=kernel, - recurrent_kernel=recurrent_kernel, - bias=bias, - mask=mask, - time_major=time_major, - go_backwards=go_backwards, - sequence_lengths=sequence_lengths, - return_sequences=return_sequences) - - def cudnn_gru_fn(): - return gpu_gru( - inputs=inputs, - init_h=init_h, - kernel=kernel, - recurrent_kernel=recurrent_kernel, - bias=bias, - mask=mask, - time_major=time_major, - go_backwards=go_backwards, - sequence_lengths=sequence_lengths, - return_sequences=return_sequences) - - def standard_gru_fn(): - return standard_gru( - inputs=inputs, - init_h=init_h, - kernel=kernel, - recurrent_kernel=recurrent_kernel, - bias=bias, - mask=mask, - time_major=time_major, - go_backwards=go_backwards, - sequence_lengths=sequence_lengths, - zero_output_for_mask=zero_output_for_mask, - return_sequences=return_sequences) - - return tf.cond( - gru_lstm_utils.is_cudnn_supported_inputs(mask, time_major), - true_fn=cudnn_gru_fn, - false_fn=standard_gru_fn) - - if gru_lstm_utils.use_new_gru_lstm_impl(): - # Chooses the implementation dynamically based on the running device. - (last_output, outputs, new_h, - runtime) = tf.__internal__.execute_fn_for_device( - { - gru_lstm_utils.CPU_DEVICE_NAME: - lambda: standard_gru(**params), - gru_lstm_utils.GPU_DEVICE_NAME: - lambda: gpu_gru_with_fallback(**params) - }, lambda: standard_gru(**params)) - else: - # Each time a `tf.function` is called, we will give it a unique - # identifiable API name, so that Grappler won't get confused when it - # sees multiple GRU layers added into same graph, and it will be able - # to pair up the different implementations across them. - api_name = 'gru_' + str(uuid.uuid4()) - supportive_attribute = { - 'time_major': time_major, - 'go_backwards': go_backwards, - } - defun_standard_gru = gru_lstm_utils.generate_defun_backend( - api_name, gru_lstm_utils.CPU_DEVICE_NAME, standard_gru, - supportive_attribute) - defun_gpu_gru = gru_lstm_utils.generate_defun_backend( - api_name, gru_lstm_utils.GPU_DEVICE_NAME, gpu_gru_with_fallback, - supportive_attribute) - - # Call the normal GRU impl and register the cuDNN impl function. 
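One design constraint behind `gpu_gru_with_fallback` is worth noting: both branches of a `tf.cond` must return tensors with matching structure and dtypes, which is one reason `standard_gru` and `gpu_gru` share the exact `(last_output, outputs, state, runtime)` signature, runtime constant included. A stripped-down sketch of the same dispatch shape, with stand-in kernels:

```python
import tensorflow as tf

RUNTIME_CPU, RUNTIME_GPU = 1.0, 2.0  # mirrors the gru_lstm_utils constants

def fast_kernel(x):     # stand-in for the cuDNN path
    return tf.reduce_sum(x), tf.constant(RUNTIME_GPU)

def generic_kernel(x):  # stand-in for the generic path
    return tf.reduce_sum(x), tf.constant(RUNTIME_CPU)

x = tf.ones([2, 3])
ok = tf.constant(True)  # e.g. is_cudnn_supported_inputs(...)
# Both branches return the same (value, runtime) structure, so the kernel
# choice can be deferred until the condition is known at execution time.
value, runtime = tf.cond(ok, lambda: fast_kernel(x), lambda: generic_kernel(x))
```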
The - # grappler will kick in during session execution to optimize the graph. - last_output, outputs, new_h, runtime = defun_standard_gru(**params) - gru_lstm_utils.function_register(defun_gpu_gru, **params) - - return last_output, outputs, new_h, runtime + # Each time a `tf.function` is called, we will give it a unique + # identifiable API name, so that Grappler won't get confused when it + # sees multiple GRU layers added into same graph, and it will be able + # to pair up the different implementations across them. + api_name = "gru_" + str(uuid.uuid4()) + supportive_attribute = { + "time_major": time_major, + "go_backwards": go_backwards, + } + defun_standard_gru = gru_lstm_utils.generate_defun_backend( + api_name, + gru_lstm_utils.CPU_DEVICE_NAME, + standard_gru, + supportive_attribute, + ) + defun_gpu_gru = gru_lstm_utils.generate_defun_backend( + api_name, + gru_lstm_utils.GPU_DEVICE_NAME, + gpu_gru_with_fallback, + supportive_attribute, + ) + + # Call the normal GRU impl and register the cuDNN impl function. The + # grappler will kick in during session execution to optimize the graph. + last_output, outputs, new_h, runtime = defun_standard_gru(**params) + gru_lstm_utils.function_register(defun_gpu_gru, **params) + + return last_output, outputs, new_h, runtime diff --git a/keras/layers/rnn/gru_lstm_test.py b/keras/layers/rnn/gru_lstm_test.py index 33ed001f7de4..0c09541e605c 100644 --- a/keras/layers/rnn/gru_lstm_test.py +++ b/keras/layers/rnn/gru_lstm_test.py @@ -19,131 +19,161 @@ import os +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers.rnn import gru from keras.layers.rnn import lstm from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class RNNV2Test(test_combinations.TestCase): - - @parameterized.parameters([lstm.LSTM, gru.GRU]) - def test_device_placement(self, layer): - if not tf.test.is_gpu_available(): - self.skipTest('Need GPU for testing.') - vocab_size = 20 - embedding_dim = 10 - batch_size = 8 - timestep = 12 - units = 5 - x = np.random.randint(0, vocab_size, size=(batch_size, timestep)) - y = np.random.randint(0, vocab_size, size=(batch_size, timestep)) - - # Test when GPU is available but not used, the graph should be properly - # created with CPU ops. 
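The uuid-based `api_name` is what lets Grappler pair the two registered functions: both carry the same `api_implements` attribute and differ only in `api_preferred_device`. Roughly what `generate_defun_backend` produces, sketched with the public `tf.function` API used elsewhere in this diff (the attribute keys match the `_FUNCTION_*_ATTRIBUTE` constants; the bodies are placeholders):

```python
import uuid
import tensorflow as tf

api_name = "gru_" + str(uuid.uuid4())  # unique per layer instance

def cpu_impl(x):
    return x * 2.0  # stand-in for standard_gru

def gpu_impl(x):
    return x + x    # stand-in for gpu_gru_with_fallback

common = {"time_major": False, "go_backwards": False}
defun_cpu = tf.function(
    cpu_impl, autograph=False,
    experimental_attributes={"api_implements": api_name,
                             "api_preferred_device": "CPU", **common})
defun_gpu = tf.function(
    gpu_impl, autograph=False,
    experimental_attributes={"api_implements": api_name,
                             "api_preferred_device": "GPU", **common})
```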
- with test_utils.device(should_use_gpu=False): - model = keras.Sequential([ - keras.layers.Embedding(vocab_size, embedding_dim, - batch_input_shape=[batch_size, timestep]), - layer(units, return_sequences=True, stateful=True), - keras.layers.Dense(vocab_size) - ]) - model.compile( - optimizer='adam', - loss='sparse_categorical_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, shuffle=False) - - @parameterized.parameters([lstm.LSTM, gru.GRU]) - def test_reset_dropout_mask_between_batch(self, layer): - # See https://github.com/tensorflow/tensorflow/issues/29187 for more details - batch_size = 8 - timestep = 12 - embedding_dim = 10 - units = 5 - layer = layer(units, dropout=0.5, recurrent_dropout=0.5) - - inputs = np.random.random((batch_size, timestep, embedding_dim)).astype( - np.float32) - previous_dropout, previous_recurrent_dropout = None, None - - for _ in range(5): - layer(inputs, training=True) - dropout = layer.cell.get_dropout_mask_for_cell(inputs, training=True) - recurrent_dropout = layer.cell.get_recurrent_dropout_mask_for_cell( - inputs, training=True) - if previous_dropout is not None: - self.assertNotAllClose(self.evaluate(previous_dropout), - self.evaluate(dropout)) - previous_dropout = dropout - if previous_recurrent_dropout is not None: - self.assertNotAllClose(self.evaluate(previous_recurrent_dropout), - self.evaluate(recurrent_dropout)) - previous_recurrent_dropout = recurrent_dropout - - @parameterized.parameters([lstm.LSTM, gru.GRU]) - def test_recurrent_dropout_with_stateful_RNN(self, layer): - # See https://github.com/tensorflow/tensorflow/issues/27829 for details. - # The issue was caused by using inplace mul for a variable, which was a - # warning for RefVariable, but an error for ResourceVariable in 2.0 - keras.models.Sequential([ - layer(128, stateful=True, return_sequences=True, dropout=0.2, - batch_input_shape=[32, None, 5], recurrent_dropout=0.2) - ]) - - @parameterized.parameters([lstm.LSTM, gru.GRU]) - def test_recurrent_dropout_saved_model(self, layer): - if not tf.executing_eagerly(): - self.skipTest('v2-only test') - inputs = keras.Input(shape=(784, 3), name='digits') - x = layer(64, activation='relu', name='RNN', dropout=0.1)(inputs) - x = keras.layers.Dense(64, activation='relu', name='dense')(x) - outputs = keras.layers.Dense( - 10, activation='softmax', name='predictions')( - x) - model = keras.Model(inputs=inputs, outputs=outputs, name='3_layer') - model.save(os.path.join(self.get_temp_dir(), 'model'), save_format='tf') - - @parameterized.parameters([lstm.LSTM, gru.GRU]) - def test_ragged(self, layer): - vocab_size = 100 - inputs = tf.ragged.constant( - np.random.RandomState(0).randint(0, vocab_size, [128, 25])) - embedder = keras.layers.Embedding(input_dim=vocab_size, output_dim=16) - embedded_inputs = embedder(inputs) - layer = layer(32) - layer(embedded_inputs) - - @parameterized.parameters([lstm.LSTM, gru.GRU]) - @test_utils.run_v2_only - def test_compare_ragged_with_masks(self, layer): - vocab_size = 100 - timestep = 20 - units = 32 - embedder = keras.layers.Embedding(input_dim=vocab_size, output_dim=units) - layer = layer(units, return_sequences=True) - data = tf.constant( - np.random.RandomState(0).randint(0, vocab_size, [timestep, timestep])) - mask = tf.sequence_mask(tf.range(1, timestep + 1)) - data_ragged = tf.ragged.boolean_mask(data, mask) - - outputs = [] - devices = [test_utils.device(should_use_gpu=False)] - if tf.test.is_gpu_available(): - devices.append(test_utils.device(should_use_gpu=True)) 
- for device in devices: - with device: - outputs.append(tf.boolean_mask(layer(embedder(data), mask=mask), mask)) - outputs.append(layer(embedder(data_ragged)).values) - - for i in range(len(outputs) - 1): - self.assertAllClose(outputs[i], outputs[i + 1], atol=1e-4) - - -if __name__ == '__main__': - tf.test.main() + @parameterized.parameters([lstm.LSTM, gru.GRU]) + def test_device_placement(self, layer): + if not tf.test.is_gpu_available(): + self.skipTest("Need GPU for testing.") + vocab_size = 20 + embedding_dim = 10 + batch_size = 8 + timestep = 12 + units = 5 + x = np.random.randint(0, vocab_size, size=(batch_size, timestep)) + y = np.random.randint(0, vocab_size, size=(batch_size, timestep)) + + # Test when GPU is available but not used, the graph should be properly + # created with CPU ops. + with test_utils.device(should_use_gpu=False): + model = keras.Sequential( + [ + keras.layers.Embedding( + vocab_size, + embedding_dim, + batch_input_shape=[batch_size, timestep], + ), + layer(units, return_sequences=True, stateful=True), + keras.layers.Dense(vocab_size), + ] + ) + model.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, shuffle=False) + + @parameterized.parameters([lstm.LSTM, gru.GRU]) + def test_reset_dropout_mask_between_batch(self, layer): + # See https://github.com/tensorflow/tensorflow/issues/29187 for more + # details + batch_size = 8 + timestep = 12 + embedding_dim = 10 + units = 5 + layer = layer(units, dropout=0.5, recurrent_dropout=0.5) + + inputs = np.random.random((batch_size, timestep, embedding_dim)).astype( + np.float32 + ) + previous_dropout, previous_recurrent_dropout = None, None + + for _ in range(5): + layer(inputs, training=True) + dropout = layer.cell.get_dropout_mask_for_cell( + inputs, training=True + ) + recurrent_dropout = layer.cell.get_recurrent_dropout_mask_for_cell( + inputs, training=True + ) + if previous_dropout is not None: + self.assertNotAllClose( + self.evaluate(previous_dropout), self.evaluate(dropout) + ) + previous_dropout = dropout + if previous_recurrent_dropout is not None: + self.assertNotAllClose( + self.evaluate(previous_recurrent_dropout), + self.evaluate(recurrent_dropout), + ) + previous_recurrent_dropout = recurrent_dropout + + @parameterized.parameters([lstm.LSTM, gru.GRU]) + def test_recurrent_dropout_with_stateful_RNN(self, layer): + # See https://github.com/tensorflow/tensorflow/issues/27829 for details. 
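`test_compare_ragged_with_masks` leans on the fact that a dense batch plus a strictly right-padded mask encodes the same data as a ragged tensor. That equivalence is easy to see in isolation; the values below are made up:

```python
import numpy as np
import tensorflow as tf

data = tf.constant(np.arange(12).reshape(3, 4))
# Rows have lengths 1, 2, 3 -> a strictly right-padded mask.
mask = tf.sequence_mask([1, 2, 3], maxlen=4)

ragged = tf.ragged.boolean_mask(data, mask)
print(ragged)  # <tf.RaggedTensor [[0], [4, 5], [8, 9, 10]]>
# Selecting the valid positions from the dense batch yields the same values
# as the flat storage of the ragged tensor, which is what the test asserts
# (up to numerical tolerance) for the RNN outputs.
assert np.array_equal(tf.boolean_mask(data, mask).numpy(), ragged.values.numpy())
```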
+ # The issue was caused by using inplace mul for a variable, which was a + # warning for RefVariable, but an error for ResourceVariable in 2.0 + keras.models.Sequential( + [ + layer( + 128, + stateful=True, + return_sequences=True, + dropout=0.2, + batch_input_shape=[32, None, 5], + recurrent_dropout=0.2, + ) + ] + ) + + @parameterized.parameters([lstm.LSTM, gru.GRU]) + def test_recurrent_dropout_saved_model(self, layer): + if not tf.executing_eagerly(): + self.skipTest("v2-only test") + inputs = keras.Input(shape=(784, 3), name="digits") + x = layer(64, activation="relu", name="RNN", dropout=0.1)(inputs) + x = keras.layers.Dense(64, activation="relu", name="dense")(x) + outputs = keras.layers.Dense( + 10, activation="softmax", name="predictions" + )(x) + model = keras.Model(inputs=inputs, outputs=outputs, name="3_layer") + model.save(os.path.join(self.get_temp_dir(), "model"), save_format="tf") + + @parameterized.parameters([lstm.LSTM, gru.GRU]) + def test_ragged(self, layer): + vocab_size = 100 + inputs = tf.ragged.constant( + np.random.RandomState(0).randint(0, vocab_size, [128, 25]) + ) + embedder = keras.layers.Embedding(input_dim=vocab_size, output_dim=16) + embedded_inputs = embedder(inputs) + layer = layer(32) + layer(embedded_inputs) + + @parameterized.parameters([lstm.LSTM, gru.GRU]) + @test_utils.run_v2_only + def test_compare_ragged_with_masks(self, layer): + vocab_size = 100 + timestep = 20 + units = 32 + embedder = keras.layers.Embedding( + input_dim=vocab_size, output_dim=units + ) + layer = layer(units, return_sequences=True) + data = tf.constant( + np.random.RandomState(0).randint( + 0, vocab_size, [timestep, timestep] + ) + ) + mask = tf.sequence_mask(tf.range(1, timestep + 1)) + data_ragged = tf.ragged.boolean_mask(data, mask) + + outputs = [] + devices = [test_utils.device(should_use_gpu=False)] + if tf.test.is_gpu_available(): + devices.append(test_utils.device(should_use_gpu=True)) + for device in devices: + with device: + outputs.append( + tf.boolean_mask(layer(embedder(data), mask=mask), mask) + ) + outputs.append(layer(embedder(data_ragged)).values) + + for i in range(len(outputs) - 1): + self.assertAllClose(outputs[i], outputs[i + 1], atol=1e-4) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/gru_lstm_utils.py b/keras/layers/rnn/gru_lstm_utils.py index 1ddde291b219..d0f3208134e7 100644 --- a/keras/layers/rnn/gru_lstm_utils.py +++ b/keras/layers/rnn/gru_lstm_utils.py @@ -19,15 +19,15 @@ import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.eager.context import get_device_name - # The following string constants are used by Defun approach for unified backend # of LSTM and GRU. -_FUNCTION_API_NAME_ATTRIBUTE = 'api_implements' -_FUNCTION_DEVICE_ATTRIBUTE = 'api_preferred_device' -CPU_DEVICE_NAME = 'CPU' -GPU_DEVICE_NAME = 'GPU' +_FUNCTION_API_NAME_ATTRIBUTE = "api_implements" +_FUNCTION_DEVICE_ATTRIBUTE = "api_preferred_device" +CPU_DEVICE_NAME = "CPU" +GPU_DEVICE_NAME = "GPU" # The following number constants are used to represent the runtime of the defun # backend function. Since the CPU/GPU implementation are mathematically same, we @@ -37,211 +37,239 @@ RUNTIME_CPU = 1 RUNTIME_GPU = 2 -CUDNN_AVAILABLE_MSG = 'Layer %s will use cuDNN kernels when running on GPU.' -CUDNN_NOT_AVAILABLE_MSG = ('Layer %s will not use cuDNN kernels since it ' - 'doesn\'t meet the criteria. 
It will ' - 'use a generic GPU kernel as fallback when running ' - 'on GPU.') +CUDNN_AVAILABLE_MSG = "Layer %s will use cuDNN kernels when running on GPU." +CUDNN_NOT_AVAILABLE_MSG = ( + "Layer %s will not use cuDNN kernels since it " + "doesn't meet the criteria. It will " + "use a generic GPU kernel as fallback when running " + "on GPU." +) def use_new_gru_lstm_impl(): - return False + return False # TODO(b/169707691): The wrapper can be removed if TFLite doesn't need to rely # on supportive attributes from LSTM/GRU. class DefunWrapper: - """A wrapper with no deep copy of the Defun in LSTM/GRU layer.""" - - def __init__(self, time_major, go_backwards, layer_name): - self.time_major = time_major - self.go_backwards = go_backwards - self.layer_name = layer_name - if self.layer_name not in ['lstm', 'gru']: - raise ValueError('Defun wrapper only applies to LSTM and GRU layer, ' - 'but given {}'.format(self.layer_name)) - # The first two attributes are added to support TFLite use case. - supportive_attributes = { - 'time_major': self.time_major, - 'go_backwards': self.go_backwards, - _FUNCTION_API_NAME_ATTRIBUTE: self.layer_name + '_' + str(uuid.uuid4()) - } - if self.layer_name == 'lstm': - from keras.layers.rnn import lstm # pylint: disable=g-import-not-at-top - layer_func = lstm.lstm_with_backend_selection - else: - from keras.layers.rnn import gru # pylint: disable=g-import-not-at-top - layer_func = gru.gru_with_backend_selection - - self.defun_layer = tf.__internal__.function.defun_with_attributes( - layer_func, - attributes=supportive_attributes, - autograph=False) - - def __deepcopy__(self, memo): - new_wrapper = type(self)( - self.time_major, self.go_backwards, self.layer_name) - memo[id(self)] = new_wrapper - return new_wrapper + """A wrapper with no deep copy of the Defun in LSTM/GRU layer.""" + + def __init__(self, time_major, go_backwards, layer_name): + self.time_major = time_major + self.go_backwards = go_backwards + self.layer_name = layer_name + if self.layer_name not in ["lstm", "gru"]: + raise ValueError( + "Defun wrapper only applies to LSTM and GRU layer, " + "but given {}".format(self.layer_name) + ) + # The first two attributes are added to support TFLite use case. + supportive_attributes = { + "time_major": self.time_major, + "go_backwards": self.go_backwards, + _FUNCTION_API_NAME_ATTRIBUTE: self.layer_name + + "_" + + str(uuid.uuid4()), + } + if self.layer_name == "lstm": + from keras.layers.rnn import ( + lstm, + ) + + layer_func = lstm.lstm_with_backend_selection + else: + from keras.layers.rnn import ( + gru, + ) + + layer_func = gru.gru_with_backend_selection + + self.defun_layer = tf.function( + layer_func, + autograph=False, + experimental_attributes=supportive_attributes, + ) + + def __deepcopy__(self, memo): + new_wrapper = type(self)( + self.time_major, self.go_backwards, self.layer_name + ) + memo[id(self)] = new_wrapper + return new_wrapper def canonical_to_params(weights, biases, shape, transpose_weights=False): - """Utility function convert variable to cuDNN compatible parameter. + """Utility function convert variable to cuDNN compatible parameter. + + Note that Keras weights for kernels are different from the cuDNN format. + Eg.: - Note that Keras weights for kernels are different from the cuDNN format. 
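One detail worth calling out in `DefunWrapper` above: `__deepcopy__` never copies the traced function itself. It rebuilds a fresh wrapper from the three plain attributes and records it in `memo`, since a traced `tf.function` handle does not survive a deep copy. The same pattern in isolation, with a hypothetical wrapper class:

```python
import copy

class TracedOpWrapper:
    """Holds a non-copyable handle (e.g. a traced tf.function)."""

    def __init__(self, config):
        self.config = config
        self.handle = self._expensive_trace(config)  # not safely deep-copyable

    def _expensive_trace(self, config):
        return object()  # stand-in for a ConcreteFunction / tf.function

    def __deepcopy__(self, memo):
        # Rebuild from plain config instead of copying the traced handle;
        # memo[id(self)] keeps shared references consistent in the copy.
        new = type(self)(copy.deepcopy(self.config, memo))
        memo[id(self)] = new
        return new

layer = {"rnn": TracedOpWrapper({"time_major": False})}
clone = copy.deepcopy(layer)
assert clone["rnn"] is not layer["rnn"]
```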
Eg.: + ``` + Keras cuDNN + [[0, 1, 2], <---> [[0, 2, 4], + [3, 4, 5]] [1, 3, 5]] + ``` - ``` - Keras cuDNN - [[0, 1, 2], <---> [[0, 2, 4], - [3, 4, 5]] [1, 3, 5]] - ``` + If the input weights need to be in a unified format, then set + `transpose_weights=True` to convert the weights. - If the input weights need to be in a unified format, then set - `transpose_weights=True` to convert the weights. + Args: + weights: list of weights for the individual kernels and recurrent kernels. + biases: list of biases for individual gate. + shape: the shape for the converted variables that will be feed to cuDNN. + transpose_weights: boolean, whether to transpose the weights. - Args: - weights: list of weights for the individual kernels and recurrent kernels. - biases: list of biases for individual gate. - shape: the shape for the converted variables that will be feed to cuDNN. - transpose_weights: boolean, whether to transpose the weights. + Returns: + The converted weights that can be feed to cuDNN ops as param. + """ - Returns: - The converted weights that can be feed to cuDNN ops as param. - """ - def convert(w): - return tf.transpose(w) if transpose_weights else w + def convert(w): + return tf.transpose(w) if transpose_weights else w - weights = [tf.reshape(convert(x), shape) for x in weights] - biases = [tf.reshape(x, shape) for x in biases] - return tf.concat(weights + biases, axis=0) + weights = [tf.reshape(convert(x), shape) for x in weights] + biases = [tf.reshape(x, shape) for x in biases] + return tf.concat(weights + biases, axis=0) def is_sequence_right_padded(mask): - """Check the mask tensor and see if it right padded. + """Check the mask tensor and see if it right padded. - For cuDNN kernel, it uses the sequence length param to skip the tailing - timestep. If the data is left padded, or not a strict right padding (has - masked value in the middle of the sequence), then cuDNN kernel won't be work - properly in those cases. + For cuDNN kernel, it uses the sequence length param to skip the tailing + timestep. If the data is left padded, or not a strict right padding (has + masked value in the middle of the sequence), then cuDNN kernel won't be work + properly in those cases. - Left padded data: [[False, False, True, True, True]]. - Right padded data: [[True, True, True, False, False]]. - Mixture of mask/unmasked data: [[True, False, True, False, False]]. + Left padded data: [[False, False, True, True, True]]. + Right padded data: [[True, True, True, False, False]]. + Mixture of mask/unmasked data: [[True, False, True, False, False]]. - Note that for the mixed data example above, the actually data RNN should see - are those 2 Trues (index 0 and 2), the index 1 False should be ignored and not - pollute the internal states. + Note that for the mixed data example above, the actually data RNN should see + are those 2 Trues (index 0 and 2), the index 1 False should be ignored and + not pollute the internal states. - Args: - mask: the Boolean tensor with shape [batch, timestep] + Args: + mask: the Boolean tensor with shape [batch, timestep] - Returns: - boolean scalar tensor, whether the mask is strictly right padded. - """ - max_seq_length = tf.shape(mask)[1] - count_of_true = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1) - right_padded_mask = tf.sequence_mask( - count_of_true, maxlen=max_seq_length) - return tf.reduce_all(tf.equal(mask, right_padded_mask)) + Returns: + boolean scalar tensor, whether the mask is strictly right padded. 
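The check documented above (and implemented just below) reconstructs the mask that an ideally right-padded batch of the same lengths would have, then compares element-wise. Applying that logic to the docstring's three examples:

```python
import tensorflow as tf

def is_right_padded(mask):
    # Count the True entries per row, rebuild the ideal right-padded mask
    # of those lengths, and require an exact match.
    lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
    ideal = tf.sequence_mask(lengths, maxlen=tf.shape(mask)[1])
    return tf.reduce_all(tf.equal(mask, ideal))

print(is_right_padded(tf.constant([[True, True, True, False, False]])))   # True
print(is_right_padded(tf.constant([[False, False, True, True, True]])))   # False: left padded
print(is_right_padded(tf.constant([[True, False, True, False, False]])))  # False: hole in the middle
```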
+ """ + max_seq_length = tf.shape(mask)[1] + count_of_true = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1) + right_padded_mask = tf.sequence_mask(count_of_true, maxlen=max_seq_length) + return tf.reduce_all(tf.equal(mask, right_padded_mask)) def has_fully_masked_sequence(mask): - # See https://github.com/tensorflow/tensorflow/issues/33148 for more details. - # Cudnn kernel will error out if the input sequence contains any fully masked - # data. We walk around this issue by rerouting the computation to standard - # kernel, until the issue on cudnn side has been fixed. - # For a fully masked sequence, it will contain all Falses. To make it easy to - # check, we inverse the boolean, check if any of the sequence has all True. - return tf.reduce_any( - tf.reduce_all( - tf.logical_not(mask), - axis=1)) - - -def is_cudnn_supported_inputs(mask, time_major): - if time_major: - mask = tf.transpose(mask) - - return tf.logical_and( - is_sequence_right_padded(mask), - tf.logical_not(has_fully_masked_sequence(mask))) + # See https://github.com/tensorflow/tensorflow/issues/33148 for more + # details. Cudnn kernel will error out if the input sequence contains any + # fully masked data. We walk around this issue by rerouting the computation + # to standard kernel, until the issue on cudnn side has been fixed. For a + # fully masked sequence, it will contain all Falses. To make it easy to + # check, we inverse the boolean, check if any of the sequence has all True. + return tf.reduce_any(tf.reduce_all(tf.logical_not(mask), axis=1)) + + +def is_cudnn_supported_inputs(mask, time_major, sequence_lengths): + if tf.sysconfig.get_build_info()["is_rocm_build"]: + if (not time_major) and (sequence_lengths is not None): + return False + if mask is not None: + return tf.reduce_all(mask) + elif sequence_lengths is not None: + return tf.math.equal( + tf.reduce_min(sequence_lengths), tf.reduce_max(sequence_lengths) + ) + else: + return True + if mask is None: + return True + if time_major: + mask = tf.transpose(mask) + + return tf.logical_and( + is_sequence_right_padded(mask), + tf.logical_not(has_fully_masked_sequence(mask)), + ) def calculate_sequence_by_mask(mask, time_major): - """Calculate the sequence length tensor (1-D) based on the masking tensor. - - The masking tensor is a 2D boolean tensor with shape [batch, timestep]. For - any timestep that should be masked, the corresponding field will be False. - Consider the following example: - a = [[True, True, False, False], - [True, True, True, False]] - It is a (2, 4) tensor, and the corresponding sequence length result should be - 1D tensor with value [2, 3]. Note that the masking tensor must be right - padded that could be checked by, e.g., `is_sequence_right_padded()`. - - Args: - mask: Boolean tensor with shape [batch, timestep] or [timestep, batch] if - time_major=True. - time_major: Boolean, which indicates whether the mask is time major or batch - major. - Returns: - sequence_length: 1D int32 tensor. 
- """ - timestep_index = 0 if time_major else 1 - return tf.reduce_sum(tf.cast(mask, tf.int32), axis=timestep_index) - - -def generate_defun_backend(unique_api_name, preferred_device, func, - supportive_attributes): - function_attributes = { - _FUNCTION_API_NAME_ATTRIBUTE: unique_api_name, - _FUNCTION_DEVICE_ATTRIBUTE: preferred_device, - } - function_attributes.update(supportive_attributes) - return tf.__internal__.function.defun_with_attributes( - func=func, attributes=function_attributes, autograph=False) + """Calculate the sequence length tensor (1-D) based on the masking tensor. + + The masking tensor is a 2D boolean tensor with shape [batch, timestep]. For + any timestep that should be masked, the corresponding field will be False. + Consider the following example: + a = [[True, True, False, False], + [True, True, True, False]] + It is a (2, 4) tensor, and the corresponding sequence length result should + be 1D tensor with value [2, 3]. Note that the masking tensor must be right + padded that could be checked by, e.g., `is_sequence_right_padded()`. + + Args: + mask: Boolean tensor with shape [batch, timestep] or [timestep, batch] if + time_major=True. + time_major: Boolean, which indicates whether the mask is time major or + batch major. + Returns: + sequence_length: 1D int32 tensor. + """ + timestep_index = 0 if time_major else 1 + return tf.reduce_sum(tf.cast(mask, tf.int32), axis=timestep_index) + + +def generate_defun_backend( + unique_api_name, preferred_device, func, supportive_attributes +): + function_attributes = { + _FUNCTION_API_NAME_ATTRIBUTE: unique_api_name, + _FUNCTION_DEVICE_ATTRIBUTE: preferred_device, + } + function_attributes.update(supportive_attributes) + return tf.function( + func, autograph=False, experimental_attributes=function_attributes + ) def get_context_device_type(): - """Parse the current context and return the device type, eg CPU/GPU.""" - current_device = get_device_name() - if current_device is None: - return None - return tf.compat.v1.DeviceSpec.from_string(current_device).device_type + """Parse the current context and return the device type, eg CPU/GPU.""" + current_device = get_device_name() + if current_device is None: + return None + return tf.compat.v1.DeviceSpec.from_string(current_device).device_type def runtime(runtime_name): - with tf.device('/cpu:0'): - return tf.constant( - runtime_name, dtype=tf.float32, name='runtime') + with tf.device("/cpu:0"): + return tf.constant(runtime_name, dtype=tf.float32, name="runtime") def read_variable_value(v): - """Read the value of a variable if it is variable.""" - if isinstance(v, tf.Variable): - return v.read_value() - return v + """Read the value of a variable if it is variable.""" + if isinstance(v, tf.Variable): + return v.read_value() + return v def function_register(func, *args, **kwargs): - """Register a specialization of a `Function` into the graph. - - This won't actually call the function with the inputs, and only put the - function definition into graph. Register function with different input param - will result into multiple version of functions registered in graph. - - Args: - func: the `Function` instance that generated by a @defun - *args: input arguments for the Python function. - **kwargs: input keyword arguments for the Python function. - - Returns: - a `ConcreteFunction` object specialized to inputs and execution context. - - Raises: - ValueError: When the input function is not a defun wrapped python function. 
- """ - concrete_func = func.get_concrete_function(*args, **kwargs) - concrete_func.add_to_graph() - concrete_func.add_gradient_functions_to_graph() - return concrete_func + """Register a specialization of a `Function` into the graph. + + This won't actually call the function with the inputs, and only put the + function definition into graph. Register function with different input param + will result into multiple version of functions registered in graph. + + Args: + func: the `Function` instance that generated by a @defun + *args: input arguments for the Python function. + **kwargs: input keyword arguments for the Python function. + + Returns: + a `ConcreteFunction` object specialized to inputs and execution context. + + Raises: + ValueError: When the input function is not a defun wrapped python + function. + """ + concrete_func = func.get_concrete_function(*args, **kwargs) + concrete_func.add_to_graph() + concrete_func.add_gradient_functions_to_graph() + return concrete_func diff --git a/keras/layers/rnn/gru_test.py b/keras/layers/rnn/gru_test.py index 22ab1c98c1cb..241ad2c3181f 100644 --- a/keras/layers/rnn/gru_test.py +++ b/keras/layers/rnn/gru_test.py @@ -19,18 +19,21 @@ import os import shutil +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers.rnn import gru_lstm_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.core.protobuf import rewriter_config_pb2 -from tensorflow.python.framework import test_util as tf_test_util - +from tensorflow.python.framework import ( + test_util as tf_test_util, +) # Global config for grappler setting that is used for graph mode test. 
_rewrites = rewriter_config_pb2.RewriterConfig() @@ -40,903 +43,1015 @@ _config = tf.compat.v1.ConfigProto(graph_options=_graph_options) -@test_utils.run_all_without_tensor_float_32('RNN GRU can use TF32 on GPU') +@test_utils.run_all_without_tensor_float_32("RNN GRU can use TF32 on GPU") @test_combinations.run_all_keras_modes(config=_config) class GRUGraphRewriteTest(test_combinations.TestCase): - input_shape = 10 - output_shape = 8 - rnn_state_size = 8 - timestep = 4 - batch = 100 - epoch = 1 - - @parameterized.named_parameters( - ('non_tan_activation', 'relu', 'sigmoid', 0, False, True, True), - ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True, True), - ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True, True), - ('unroll', 'tanh', 'sigmoid', 0, True, True, True), - ('not_use_bias', 'tanh', 'sigmoid', 0, False, False, True), - ('not_reset_after', 'tanh', 'sigmoid', 0, False, True, False) - ) - @test_utils.run_v2_only - def test_could_use_defun_backend(self, activation, recurrent_activation, - recurrent_dropout, unroll, use_bias, - reset_after): - layer = keras.layers.GRU( - 1, - activation=activation, - recurrent_activation=recurrent_activation, - recurrent_dropout=recurrent_dropout, - unroll=unroll, - use_bias=use_bias, - reset_after=reset_after) - self.assertFalse(layer._could_use_gpu_kernel) - - @test_utils.run_v2_only - def test_use_on_default_activation_with_gpu_kernel(self): - layer = keras.layers.GRU(1, activation=tf.tanh) - self.assertTrue(layer._could_use_gpu_kernel) - - layer = keras.layers.GRU(1, recurrent_activation=tf.sigmoid) - self.assertTrue(layer._could_use_gpu_kernel) - - def test_keras_model_with_gru(self): - epoch = 10 - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=self.batch, - test_samples=0, - input_shape=(self.timestep, self.input_shape), - num_classes=self.output_shape) - y_train = np_utils.to_categorical(y_train, self.output_shape) - - layer = keras.layers.GRU(self.rnn_state_size) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - - outputs = layer(inputs) - model = keras.models.Model(inputs, outputs) - model.compile('rmsprop', loss='mse') - model.fit(x_train, y_train, epochs=epoch) - model.evaluate(x_train, y_train) - model.predict(x_train) - - def test_dynamic_behavior_GRU(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer = keras.layers.GRU(units, input_shape=(None, embedding_dim)) - model = keras.models.Sequential() - model.add(layer) - model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.001), 'mse') - x = np.random.random((num_samples, timesteps, embedding_dim)) - y = np.random.random((num_samples, units)) - model.train_on_batch(x, y) - - def test_stacking_GRU(self): - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.GRU(10, return_sequences=True, unroll=False)) - model.add(keras.layers.GRU(5, return_sequences=True, unroll=False)) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - def test_from_config_GRU(self): - layer_class = keras.layers.GRU - for stateful in (False, True): - l1 = layer_class(units=1, stateful=stateful) - l2 = layer_class.from_config(l1.get_config()) - assert l1.get_config() == l2.get_config() - - @parameterized.named_parameters( - # 
test_name, use_bias, bias_initializer, activation - ('normal', True, 'zeros'), - ('no_bias', False, 'zeros'), - ('random_bias', True, 'random_uniform'), - ) - def test_gru_v2_model_save_load(self, use_bias, bias_initializer): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir) - h5_path = os.path.join(temp_dir, 'test.h5') - - batch = 10 - timestep = 3 - input_dim = 5 - units = 2 - - x = np.random.random((batch, timestep, input_dim)) - - def build_model(): - inputs = keras.layers.Input( - shape=[timestep, input_dim], dtype=tf.float32) - layer = keras.layers.GRU( - units, - use_bias=use_bias, - bias_initializer=bias_initializer) - output = layer(inputs) - return keras.models.Model(inputs, output), layer - - model, layer = build_model() - y_ref = model.predict(x) - model.save_weights(h5_path) - - cloned_model, new_layer = build_model() - cloned_model.load_weights(h5_path) - y = cloned_model.predict(x) - - self.assertAllClose(y, y_ref) - self.assertAllClose(layer.get_weights(), new_layer.get_weights()) - - def test_gru_v2_output_on_multiple_kernel(self): - x_train = np.random.random((self.batch, self.timestep, self.input_shape)) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - with test_utils.device(should_use_gpu=False): - layer = keras.layers.GRU(self.rnn_state_size) - output = layer(inputs) - cpu_model = keras.models.Model(inputs, output) - weights = cpu_model.get_weights() - y_1 = cpu_model.predict(x_train) - - with test_utils.device(should_use_gpu=True): - layer = keras.layers.GRU(self.rnn_state_size) - output = layer(inputs) - gpu_model = keras.models.Model(inputs, output) - gpu_model.set_weights(weights) - y_2 = gpu_model.predict(x_train) - - self.assertAllClose(y_1, y_2, rtol=1e-5, atol=1e-5) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_with_masking_layer_GRU(self): - layer_class = keras.layers.GRU - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - model.add(layer_class(units=5, return_sequences=True, unroll=False)) - model.compile(loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.001)) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_masking_with_stacking_GRU(self): - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - model.add(keras.layers.GRU(10, return_sequences=True, unroll=False)) - model.add(keras.layers.GRU(5, return_sequences=True, unroll=False)) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - def test_return_sequences_GRU(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'return_sequences': True}, - input_shape=(num_samples, timesteps, embedding_dim)) - - @tf.test.disable_with_predicate( - 
pred=tf.test.is_built_with_rocm, - skip_message='Double type is not yet supported in ROCm') - @test_utils.run_v2_only - def test_float64_GRU(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'return_sequences': True, - 'dtype': 'float64'}, - input_shape=(num_samples, timesteps, embedding_dim), - input_dtype='float64') - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_return_states_GRU(self): - layer_class = keras.layers.GRU - x = np.random.random((2, 3, 4)) - y = np.abs(np.random.random((2, 5))) - s = np.abs(np.random.random((2, 5))) - inputs = keras.layers.Input( - shape=[3, 4], dtype=tf.float32) - masked = keras.layers.Masking()(inputs) - outputs, states = layer_class(units=5, return_state=True)(masked) - - model = keras.models.Model(inputs, [outputs, states]) - model.compile(loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.001)) - model.fit(x, [y, s], epochs=1, batch_size=2, verbose=1) - - def test_dropout_GRU(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'dropout': 0.1, - 'recurrent_dropout': 0.1}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_constraints_GRU(self): - embedding_dim = 4 - layer_class = keras.layers.GRU - k_constraint = keras.constraints.max_norm(0.01) - r_constraint = keras.constraints.max_norm(0.01) - b_constraint = keras.constraints.max_norm(0.01) - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_constraint=k_constraint, - recurrent_constraint=r_constraint, - bias_constraint=b_constraint) - layer.build((None, None, embedding_dim)) - self.assertEqual(layer.cell.kernel.constraint, k_constraint) - self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) - self.assertEqual(layer.cell.bias.constraint, b_constraint) - - @parameterized.parameters([0, 1, 2]) - def test_implementation_mode_GRU(self, implementation_mode): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'implementation': implementation_mode}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_regularizers_GRU(self): - embedding_dim = 4 - layer_class = keras.layers.GRU - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_regularizer=keras.regularizers.l1(0.01), - recurrent_regularizer=keras.regularizers.l1(0.01), - bias_regularizer='l2', - activity_regularizer='l1') - layer.build((None, None, 2)) - self.assertEqual(len(layer.losses), 3) - - x = keras.backend.variable(np.ones((2, 3, 2))) - layer(x) - if tf.executing_eagerly(): - self.assertEqual(len(layer.losses), 4) - else: - self.assertEqual(len(layer.get_losses_for(x)), 1) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_statefulness_GRU(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer_class = keras.layers.GRU - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - 4, - embedding_dim, - mask_zero=True, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = 
layer_class( - units, return_sequences=False, stateful=True, weights=None) - model.add(layer) - model.compile( - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - out1 = model.predict(np.ones((num_samples, timesteps))) - self.assertEqual(out1.shape, (num_samples, units)) - - # train once so that the states change - model.train_on_batch( - np.ones((num_samples, timesteps)), np.ones((num_samples, units))) - out2 = model.predict(np.ones((num_samples, timesteps))) - - # if the state is not reset, output should be different - self.assertNotEqual(out1.max(), out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out2.max(), out3.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones((num_samples, timesteps))) - np.testing.assert_allclose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out4.max(), out5.max()) - - # Check masking - layer.reset_states() - - left_padded_input = np.ones((num_samples, timesteps)) - left_padded_input[0, :1] = 0 - left_padded_input[1, :2] = 0 - out6 = model.predict(left_padded_input) - - layer.reset_states() - - right_padded_input = np.ones((num_samples, timesteps)) - right_padded_input[0, -1:] = 0 - right_padded_input[1, -2:] = 0 - out7 = model.predict(right_padded_input) - - layer.reset_states() - - mix_padded_input = np.ones((num_samples, timesteps)) - mix_padded_input[0, 1] = 0 - mix_padded_input[1, 0] = 0 - mix_padded_input[1, 2] = 0 - out8 = model.predict(mix_padded_input) - - self.assertAllClose(out7, out6, atol=1e-5) - self.assertAllClose(out8, out7, atol=1e-5) - - def test_stateful_GRU_training(self): - # See b/123587692 for more context. 
- vocab_size = 20 - embedding_dim = 10 - batch_size = 8 - timestep = 12 - units = 5 - x = np.random.randint(0, vocab_size, size=(batch_size, timestep)) - y = np.random.randint(0, vocab_size, size=(batch_size, timestep)) - - model = keras.Sequential([ - keras.layers.Embedding(vocab_size, embedding_dim, - batch_input_shape=[batch_size, timestep]), - keras.layers.GRU(units, return_sequences=True, stateful=True), - keras.layers.Dense(vocab_size) - ]) - model.compile( - optimizer='adam', - loss='sparse_categorical_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, shuffle=False) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_explicit_device_with_go_backward_and_mask(self): - batch_size = 8 - timestep = 7 - masksteps = 5 - units = 4 - - inputs = np.random.randn(batch_size, timestep, units).astype(np.float32) - mask = np.ones((batch_size, timestep)).astype(np.bool) - mask[:, masksteps:] = 0 - - gru_layer = keras.layers.GRU( - units, return_sequences=True, go_backwards=True) - with test_utils.device(should_use_gpu=True): - outputs_masked = gru_layer(inputs, mask=tf.constant(mask)) - outputs_trimmed = gru_layer(inputs[:, :masksteps]) - self.assertAllClose(outputs_masked[:, -masksteps:], outputs_trimmed) - - @tf_test_util.enable_output_all_intermediates - def test_v1_session_behavior(self): - with tf.compat.v1.get_default_graph().as_default(): - # See b/139132348 for more details. - x = np.random.uniform(size=(100, 4, 8)) - y = np.random.uniform(size=(100, 1)) - dataset = tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(100).batch(32) - - inp = keras.layers.Input(shape=(4, 8)) - layer = keras.layers.GRU(1)(inp) - layer = keras.layers.Dense(1)(layer) - - model = keras.models.Model(inp, layer) - - model.compile(loss='mse', optimizer='sgd') - model.fit(dataset) - - def test_with_fully_masked_inputs(self): - num_samples = 8 - timestep = 5 - embedding_dim = 4 - vocab_size = 20 - units = 2 - - inputs = np.random.randint(0, vocab_size, size=(num_samples, timestep)) - # Set the first inputs to be fully zero. - inputs[0, :] = 0.0 - - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - vocab_size, - embedding_dim, - mask_zero=True, - input_length=timestep, - batch_input_shape=(num_samples, timestep))) - layer = keras.layers.GRU(units) - model.add(layer) - model.compile( - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - # Make sure it doesn't crash with cudnn kernel. - model.predict(inputs) - - # TODO (b/169895267): test with xla_gpu is disabled. - def test_deepcopy(self): - if not tf.executing_eagerly(): - self.skipTest('v2-only test') - original_layer = keras.layers.GRU(5) - copied_layer = copy.deepcopy(original_layer) - self.assertEqual(copied_layer.units, 5) - self.assertEqual(original_layer.get_config(), original_layer.get_config()) - - # Copy layer before layer call on inputs without weight initialization. - inputs = np.random.normal(size=[32, 10, 8]).astype(np.float32) - original_layer = keras.layers.GRU(4) - copied_layer = copy.deepcopy(original_layer) - outputs = original_layer(inputs) - copied_outputs = copied_layer(inputs) - self.assertNotAllClose( - self.evaluate(outputs), self.evaluate(copied_outputs)) - - # Copy layer after layer call on inputs with weight initialization. 
- original_layer = keras.layers.GRU(4) - outputs = original_layer(inputs) - copied_layer = copy.deepcopy(original_layer) - copied_outputs = copied_layer(inputs) - self.assertAllClose(self.evaluate(outputs), self.evaluate(copied_outputs)) - - def _test_runtime_with_model(self, model): - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=self.batch, - test_samples=0, - input_shape=(self.timestep, self.input_shape), - num_classes=self.output_shape) - y_train = np_utils.to_categorical(y_train, self.output_shape) - - model.compile( - optimizer='sgd', - loss=['categorical_crossentropy', None]) - - existing_loss = 0 - for _ in range(self.epoch): - history = model.fit(x_train, y_train) - loss_value = history.history['loss'][0] - - self.assertNotEqual(existing_loss, loss_value) - existing_loss = loss_value - - _, runtime_value = model.predict(x_train) - if tf.test.is_gpu_available(): - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) - else: - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - @test_utils.run_v2_only - def test_GRU_runtime(self): - layer = keras.layers.GRU(self.rnn_state_size, return_runtime=True) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - - outputs, runtime = layer(inputs) - # Expand the runtime so that it is a 1D tensor instead of scalar. - # TF model does not work with scalar model output, specially during - # aggregation. - runtime = keras.layers.Lambda( - lambda x: tf.expand_dims(x, axis=-1))(runtime) - model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) - self._test_runtime_with_model(model) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_GRU_runtime_with_mask(self): - # Masking will affect which backend is selected based on whether the mask - # is strictly right padded. - layer = keras.layers.GRU(self.rnn_state_size, return_runtime=True) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - masked_inputs = keras.layers.Masking()(inputs) - - outputs, runtime = layer(masked_inputs) - # Expand the runtime so that it is a 1D tensor instead of scalar. - # TF model does not work with scalar model output, specially during - # aggregation. - runtime = keras.layers.Lambda( - lambda x: tf.expand_dims(x, axis=-1))(runtime) - model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=self.batch, - test_samples=0, - input_shape=(self.timestep, self.input_shape), - num_classes=self.output_shape) - y_train = np_utils.to_categorical(y_train, self.output_shape) - - model.compile( - optimizer='sgd', - loss=['categorical_crossentropy', None], - run_eagerly=test_utils.should_run_eagerly()) - - model.fit(x_train, y_train) - - # Verify unpadded data. 
- _, runtime_value = model.predict(x_train) - if tf.test.is_gpu_available(): - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) - else: - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - # Update x/y to be right padded by setting the last timestep to 0 - x_train[:, -1, :] = 0 - y_train[:, -1] = 0 - _, runtime_value = model.predict(x_train) - if tf.test.is_gpu_available(): - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) - else: - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - # Further update x/y to be mix padded (masks in the middle), and verify - # only cpu kernel can be selected. - x_train[:, -3, :] = 0 - y_train[:, -3] = 0 - _, runtime_value = model.predict(x_train) - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - @test_utils.run_v2_only - def test_GRU_runtime_with_cond(self): - # This test is to demonstrate the graph rewrite of grappler plugin under - # the condition that the function returns different number of internal - # states. - layer = keras.layers.GRU(self.rnn_state_size, return_runtime=True) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - - zeros = tf.zeros([self.batch, self.output_shape]) - dummy_runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN) - a = tf.constant(0) - b = tf.constant(1) - # Will always run the GRU layer. - outputs, runtime = tf.cond( - tf.less(a, b), - lambda: layer(inputs), - lambda: (zeros, dummy_runtime)) - - # Expand the runtime so that it is a 1D tensor instead of scalar. - # TF model does not work with scalar model output, specially during - # aggregation. - runtime = keras.layers.Lambda( - lambda x: tf.expand_dims(x, axis=-1))(runtime) - model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) - self._test_runtime_with_model(model) - - -@test_utils.run_all_without_tensor_float_32('RNN GRU can use TF32 on GPU') + input_shape = 10 + output_shape = 8 + rnn_state_size = 8 + timestep = 4 + batch = 100 + epoch = 1 + + @parameterized.named_parameters( + ("non_tan_activation", "relu", "sigmoid", 0, False, True, True), + ("non_sigmoid_recur_activation", "tanh", "relu", 0, False, True, True), + ("use_recurrent_dropout", "tanh", "sigmoid", 0.1, False, True, True), + ("unroll", "tanh", "sigmoid", 0, True, True, True), + ("not_use_bias", "tanh", "sigmoid", 0, False, False, True), + ("not_reset_after", "tanh", "sigmoid", 0, False, True, False), + ) + @test_utils.run_v2_only + def test_could_use_defun_backend( + self, + activation, + recurrent_activation, + recurrent_dropout, + unroll, + use_bias, + reset_after, + ): + layer = keras.layers.GRU( + 1, + activation=activation, + recurrent_activation=recurrent_activation, + recurrent_dropout=recurrent_dropout, + unroll=unroll, + use_bias=use_bias, + reset_after=reset_after, + ) + self.assertFalse(layer._could_use_gpu_kernel) + + @test_utils.run_v2_only + def test_use_on_default_activation_with_gpu_kernel(self): + layer = keras.layers.GRU(1, activation=tf.tanh) + self.assertTrue(layer._could_use_gpu_kernel) + + layer = keras.layers.GRU(1, recurrent_activation=tf.sigmoid) + self.assertTrue(layer._could_use_gpu_kernel) + + def test_keras_model_with_gru(self): + epoch = 10 + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=self.batch, + test_samples=0, + input_shape=(self.timestep, self.input_shape), + num_classes=self.output_shape, + ) + y_train = np_utils.to_categorical(y_train, self.output_shape) + + layer = 
keras.layers.GRU(self.rnn_state_size) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=tf.float32 + ) + + outputs = layer(inputs) + model = keras.models.Model(inputs, outputs) + model.compile("rmsprop", loss="mse") + model.fit(x_train, y_train, epochs=epoch) + model.evaluate(x_train, y_train) + model.predict(x_train) + + def test_dynamic_behavior_GRU(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer = keras.layers.GRU(units, input_shape=(None, embedding_dim)) + model = keras.models.Sequential() + model.add(layer) + model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.001), "mse") + x = np.random.random((num_samples, timesteps, embedding_dim)) + y = np.random.random((num_samples, units)) + model.train_on_batch(x, y) + + def test_stacking_GRU(self): + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.GRU(10, return_sequences=True, unroll=False)) + model.add(keras.layers.GRU(5, return_sequences=True, unroll=False)) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + def test_from_config_GRU(self): + layer_class = keras.layers.GRU + for stateful in (False, True): + l1 = layer_class(units=1, stateful=stateful) + l2 = layer_class.from_config(l1.get_config()) + assert l1.get_config() == l2.get_config() + + @parameterized.named_parameters( + # test_name, use_bias, bias_initializer, activation + ("normal", True, "zeros"), + ("no_bias", False, "zeros"), + ("random_bias", True, "random_uniform"), + ) + def test_gru_v2_model_save_load(self, use_bias, bias_initializer): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir) + h5_path = os.path.join(temp_dir, "test.h5") + + batch = 10 + timestep = 3 + input_dim = 5 + units = 2 + + x = np.random.random((batch, timestep, input_dim)) + + def build_model(): + inputs = keras.layers.Input( + shape=[timestep, input_dim], dtype=tf.float32 + ) + layer = keras.layers.GRU( + units, use_bias=use_bias, bias_initializer=bias_initializer + ) + output = layer(inputs) + return keras.models.Model(inputs, output), layer + + model, layer = build_model() + y_ref = model.predict(x) + model.save_weights(h5_path) + + cloned_model, new_layer = build_model() + cloned_model.load_weights(h5_path) + y = cloned_model.predict(x) + + self.assertAllClose(y, y_ref) + self.assertAllClose(layer.get_weights(), new_layer.get_weights()) + + def test_gru_v2_output_on_multiple_kernel(self): + x_train = np.random.random( + (self.batch, self.timestep, self.input_shape) + ) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=tf.float32 + ) + with test_utils.device(should_use_gpu=False): + layer = keras.layers.GRU(self.rnn_state_size) + output = layer(inputs) + cpu_model = keras.models.Model(inputs, output) + weights = cpu_model.get_weights() + y_1 = cpu_model.predict(x_train) + + with test_utils.device(should_use_gpu=True): + layer = keras.layers.GRU(self.rnn_state_size) + output = layer(inputs) + gpu_model = keras.models.Model(inputs, output) + gpu_model.set_weights(weights) + y_2 = gpu_model.predict(x_train) + + self.assertAllClose(y_1, y_2, rtol=1e-5, atol=1e-5) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded 
input yet." + ), + ) + def test_with_masking_layer_GRU(self): + layer_class = keras.layers.GRU + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + model.add(layer_class(units=5, return_sequences=True, unroll=False)) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.001), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." + ), + ) + def test_masking_with_stacking_GRU(self): + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + model.add(keras.layers.GRU(10, return_sequences=True, unroll=False)) + model.add(keras.layers.GRU(5, return_sequences=True, unroll=False)) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + def test_return_sequences_GRU(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={"units": units, "return_sequences": True}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="Double type is not yet supported in ROCm", + ) + @test_utils.run_v2_only + def test_float64_GRU(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={ + "units": units, + "return_sequences": True, + "dtype": "float64", + }, + input_shape=(num_samples, timesteps, embedding_dim), + input_dtype="float64", + ) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+ ), + ) + def test_return_states_GRU(self): + layer_class = keras.layers.GRU + x = np.random.random((2, 3, 4)) + y = np.abs(np.random.random((2, 5))) + s = np.abs(np.random.random((2, 5))) + inputs = keras.layers.Input(shape=[3, 4], dtype=tf.float32) + masked = keras.layers.Masking()(inputs) + outputs, states = layer_class(units=5, return_state=True)(masked) + + model = keras.models.Model(inputs, [outputs, states]) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.001), + ) + model.fit(x, [y, s], epochs=1, batch_size=2, verbose=1) + + def test_dropout_GRU(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={"units": units, "dropout": 0.1, "recurrent_dropout": 0.1}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_constraints_GRU(self): + embedding_dim = 4 + layer_class = keras.layers.GRU + k_constraint = keras.constraints.max_norm(0.01) + r_constraint = keras.constraints.max_norm(0.01) + b_constraint = keras.constraints.max_norm(0.01) + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_constraint=k_constraint, + recurrent_constraint=r_constraint, + bias_constraint=b_constraint, + ) + layer.build((None, None, embedding_dim)) + self.assertEqual(layer.cell.kernel.constraint, k_constraint) + self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) + self.assertEqual(layer.cell.bias.constraint, b_constraint) + + @parameterized.parameters([0, 1, 2]) + def test_implementation_mode_GRU(self, implementation_mode): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={"units": units, "implementation": implementation_mode}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_regularizers_GRU(self): + embedding_dim = 4 + layer_class = keras.layers.GRU + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_regularizer=keras.regularizers.l1(0.01), + recurrent_regularizer=keras.regularizers.l1(0.01), + bias_regularizer="l2", + activity_regularizer="l1", + ) + layer.build((None, None, 2)) + self.assertEqual(len(layer.losses), 3) + + x = keras.backend.variable(np.ones((2, 3, 2))) + layer(x) + if tf.executing_eagerly(): + self.assertEqual(len(layer.losses), 4) + else: + self.assertEqual(len(layer.get_losses_for(x)), 1) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+        ),
+    )
+    def test_statefulness_GRU(self):
+        num_samples = 2
+        timesteps = 3
+        embedding_dim = 4
+        units = 2
+        layer_class = keras.layers.GRU
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Embedding(
+                4,
+                embedding_dim,
+                mask_zero=True,
+                input_length=timesteps,
+                batch_input_shape=(num_samples, timesteps),
+            )
+        )
+        layer = layer_class(
+            units, return_sequences=False, stateful=True, weights=None
+        )
+        model.add(layer)
+        model.compile(
+            optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01),
+            loss="mse",
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        out1 = model.predict(np.ones((num_samples, timesteps)))
+        self.assertEqual(out1.shape, (num_samples, units))
+
+        # train once so that the states change
+        model.train_on_batch(
+            np.ones((num_samples, timesteps)), np.ones((num_samples, units))
+        )
+        out2 = model.predict(np.ones((num_samples, timesteps)))
+
+        # if the state is not reset, output should be different
+        self.assertNotEqual(out1.max(), out2.max())
+
+        # check that output changes after states are reset
+        # (even though the model itself didn't change)
+        layer.reset_states()
+        out3 = model.predict(np.ones((num_samples, timesteps)))
+        self.assertNotEqual(out2.max(), out3.max())
+
+        # check that container-level reset_states() works
+        model.reset_states()
+        out4 = model.predict(np.ones((num_samples, timesteps)))
+        np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+        # check that the call to `predict` updated the states
+        out5 = model.predict(np.ones((num_samples, timesteps)))
+        self.assertNotEqual(out4.max(), out5.max())
+
+        # Check masking
+        layer.reset_states()
+
+        left_padded_input = np.ones((num_samples, timesteps))
+        left_padded_input[0, :1] = 0
+        left_padded_input[1, :2] = 0
+        out6 = model.predict(left_padded_input)
+
+        layer.reset_states()
+
+        right_padded_input = np.ones((num_samples, timesteps))
+        right_padded_input[0, -1:] = 0
+        right_padded_input[1, -2:] = 0
+        out7 = model.predict(right_padded_input)
+
+        layer.reset_states()
+
+        mix_padded_input = np.ones((num_samples, timesteps))
+        mix_padded_input[0, 1] = 0
+        mix_padded_input[1, 0] = 0
+        mix_padded_input[1, 2] = 0
+        out8 = model.predict(mix_padded_input)
+
+        self.assertAllClose(out7, out6, atol=1e-5)
+        self.assertAllClose(out8, out7, atol=1e-5)
+
+    def test_stateful_GRU_training(self):
+        # See b/123587692 for more context.
+        vocab_size = 20
+        embedding_dim = 10
+        batch_size = 8
+        timestep = 12
+        units = 5
+        x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+        y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+        model = keras.Sequential(
+            [
+                keras.layers.Embedding(
+                    vocab_size,
+                    embedding_dim,
+                    batch_input_shape=[batch_size, timestep],
+                ),
+                keras.layers.GRU(units, return_sequences=True, stateful=True),
+                keras.layers.Dense(vocab_size),
+            ]
+        )
+        model.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        model.fit(x, y, epochs=1, shuffle=False)
+
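The two stateful tests above lean on the documented contract of `stateful=True` RNNs: each `predict` or `train_on_batch` call starts from the states left behind by the previous call, until `reset_states()` zeroes them. A minimal standalone sketch of that contract, with hypothetical sizes (illustrative only, not part of the patch):

import numpy as np
from tensorflow import keras  # assumes the TF-bundled Keras

batch, timesteps, features, units = 4, 3, 2, 5
model = keras.Sequential(
    [
        keras.layers.GRU(
            units,
            stateful=True,
            batch_input_shape=(batch, timesteps, features),
        )
    ]
)

x = np.ones((batch, timesteps, features), dtype="float32")
out1 = model.predict(x)  # starts from zero states and updates them
out2 = model.predict(x)  # continues from the states the first call left
# out1 and out2 differ (almost surely) because the carried-over states
# are nonzero for randomly initialized weights.

model.reset_states()  # states back to zero
out3 = model.predict(x)
np.testing.assert_allclose(out1, out3, atol=1e-5)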
+    @tf.test.disable_with_predicate(
+        pred=tf.test.is_built_with_rocm,
+        skip_message=(
+            "Skipping as ROCm MIOpen does not support padded input yet."
+        ),
+    )
+    @test_utils.run_v2_only
+    def test_explicit_device_with_go_backward_and_mask(self):
+        batch_size = 8
+        timestep = 7
+        masksteps = 5
+        units = 4
+
+        inputs = np.random.randn(batch_size, timestep, units).astype(
+            np.float32
+        )
+        mask = np.ones((batch_size, timestep)).astype(bool)
+        mask[:, masksteps:] = 0
+
+        gru_layer = keras.layers.GRU(
+            units, return_sequences=True, go_backwards=True
+        )
+        with test_utils.device(should_use_gpu=True):
+            outputs_masked = gru_layer(inputs, mask=tf.constant(mask))
+            outputs_trimmed = gru_layer(inputs[:, :masksteps])
+        self.assertAllClose(outputs_masked[:, -masksteps:], outputs_trimmed)
+
+    @tf_test_util.enable_output_all_intermediates
+    def test_v1_session_behavior(self):
+        with tf.compat.v1.get_default_graph().as_default():
+            # See b/139132348 for more details.
+            x = np.random.uniform(size=(100, 4, 8))
+            y = np.random.uniform(size=(100, 1))
+            dataset = (
+                tf.data.Dataset.from_tensor_slices((x, y))
+                .shuffle(100)
+                .batch(32)
+            )
+
+            inp = keras.layers.Input(shape=(4, 8))
+            layer = keras.layers.GRU(1)(inp)
+            layer = keras.layers.Dense(1)(layer)
+
+            model = keras.models.Model(inp, layer)
+
+            model.compile(loss="mse", optimizer="sgd")
+            model.fit(dataset)
+
+    def test_with_fully_masked_inputs(self):
+        num_samples = 8
+        timestep = 5
+        embedding_dim = 4
+        vocab_size = 20
+        units = 2
+
+        inputs = np.random.randint(0, vocab_size, size=(num_samples, timestep))
+        # Set the first inputs to be fully zero.
+        inputs[0, :] = 0.0
+
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Embedding(
+                vocab_size,
+                embedding_dim,
+                mask_zero=True,
+                input_length=timestep,
+                batch_input_shape=(num_samples, timestep),
+            )
+        )
+        layer = keras.layers.GRU(units)
+        model.add(layer)
+        model.compile(
+            optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01),
+            loss="mse",
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+        # Make sure it doesn't crash with cudnn kernel.
+        model.predict(inputs)
+
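For context on `test_with_fully_masked_inputs`: `Embedding(mask_zero=True)` treats token id 0 as padding, so the all-zero first row above yields an all-False mask and the GRU receives a fully masked sequence. A small sketch of how that mask is derived (hypothetical vocabulary and sizes, not part of the patch):

import numpy as np
from tensorflow import keras  # assumes the TF-bundled Keras

emb = keras.layers.Embedding(20, 4, mask_zero=True)
token_ids = np.array([[0, 0, 0], [7, 3, 0]])  # first row fully padded
print(emb.compute_mask(token_ids).numpy())
# [[False False False]
#  [ True  True False]]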
+    # TODO (b/169895267): test with xla_gpu is disabled.
+    def test_deepcopy(self):
+        if not tf.executing_eagerly():
+            self.skipTest("v2-only test")
+        original_layer = keras.layers.GRU(5)
+        copied_layer = copy.deepcopy(original_layer)
+        self.assertEqual(copied_layer.units, 5)
+        self.assertEqual(
+            original_layer.get_config(), copied_layer.get_config()
+        )
+
+        # Copy layer before layer call on inputs without weight initialization.
+        inputs = np.random.normal(size=[32, 10, 8]).astype(np.float32)
+        original_layer = keras.layers.GRU(4)
+        copied_layer = copy.deepcopy(original_layer)
+        outputs = original_layer(inputs)
+        copied_outputs = copied_layer(inputs)
+        self.assertNotAllClose(
+            self.evaluate(outputs), self.evaluate(copied_outputs)
+        )
+
+        # Copy layer after layer call on inputs with weight initialization.
+        original_layer = keras.layers.GRU(4)
+        outputs = original_layer(inputs)
+        copied_layer = copy.deepcopy(original_layer)
+        copied_outputs = copied_layer(inputs)
+        self.assertAllClose(
+            self.evaluate(outputs), self.evaluate(copied_outputs)
+        )
+
+    def _test_runtime_with_model(self, model):
+        (x_train, y_train), _ = test_utils.get_test_data(
+            train_samples=self.batch,
+            test_samples=0,
+            input_shape=(self.timestep, self.input_shape),
+            num_classes=self.output_shape,
+        )
+        y_train = np_utils.to_categorical(y_train, self.output_shape)
+
+        model.compile(optimizer="sgd", loss=["categorical_crossentropy", None])
+
+        existing_loss = 0
+        for _ in range(self.epoch):
+            history = model.fit(x_train, y_train)
+            loss_value = history.history["loss"][0]
+
+            self.assertNotEqual(existing_loss, loss_value)
+            existing_loss = loss_value
+
+        _, runtime_value = model.predict(x_train)
+        if not tf.sysconfig.get_build_info()["is_rocm_build"]:
+            if tf.test.is_gpu_available():
+                self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU)
+            else:
+                self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU)
+
+    @test_utils.run_v2_only
+    def test_GRU_runtime(self):
+        layer = keras.layers.GRU(self.rnn_state_size, return_runtime=True)
+
+        inputs = keras.layers.Input(
+            shape=[self.timestep, self.input_shape], dtype=tf.float32
+        )
+
+        outputs, runtime = layer(inputs)
+        # Expand the runtime so that it is a 1D tensor instead of scalar.
+        # TF model does not work with scalar model output, especially during
+        # aggregation.
+        runtime = keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(
+            runtime
+        )
+        model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime])
+        self._test_runtime_with_model(model)
+
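The runtime probe wired up in `test_GRU_runtime` can also be read outside the test harness. A hedged sketch that mirrors the wiring above (`return_runtime=True` and the `gru_lstm_utils` constants are internal details of this codebase, and the sizes are hypothetical):

import numpy as np
import tensorflow.compat.v2 as tf

import keras
from keras.layers.rnn import gru_lstm_utils

inputs = keras.layers.Input(shape=[4, 10], dtype=tf.float32)
outputs, runtime = keras.layers.GRU(8, return_runtime=True)(inputs)
# Expand the scalar runtime so it can ride along as a regular model output.
runtime = keras.layers.Lambda(lambda t: tf.expand_dims(t, axis=-1))(runtime)
model = keras.models.Model(inputs, [outputs, runtime])

_, runtime_value = model.predict(np.zeros((2, 4, 10), dtype="float32"))
if runtime_value[0] == gru_lstm_utils.RUNTIME_GPU:
    print("cuDNN kernel was selected")
else:
    print("generic (CPU-compatible) kernel was selected")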
+    @tf.test.disable_with_predicate(
+        pred=tf.test.is_built_with_rocm,
+        skip_message=(
+            "Skipping as ROCm MIOpen does not support padded input yet."
+        ),
+    )
+    @test_utils.run_v2_only
+    def test_GRU_runtime_with_mask(self):
+        # Masking will affect which backend is selected based on whether the
+        # mask is strictly right padded.
+        layer = keras.layers.GRU(self.rnn_state_size, return_runtime=True)
+
+        inputs = keras.layers.Input(
+            shape=[self.timestep, self.input_shape], dtype=tf.float32
+        )
+        masked_inputs = keras.layers.Masking()(inputs)
+
+        outputs, runtime = layer(masked_inputs)
+        # Expand the runtime so that it is a 1D tensor instead of scalar.
+        # TF model does not work with scalar model output, especially during
+        # aggregation.
+        runtime = keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(
+            runtime
+        )
+        model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime])
+
+        (x_train, y_train), _ = test_utils.get_test_data(
+            train_samples=self.batch,
+            test_samples=0,
+            input_shape=(self.timestep, self.input_shape),
+            num_classes=self.output_shape,
+        )
+        y_train = np_utils.to_categorical(y_train, self.output_shape)
+
+        model.compile(
+            optimizer="sgd",
+            loss=["categorical_crossentropy", None],
+            run_eagerly=test_utils.should_run_eagerly(),
+        )
+
+        model.fit(x_train, y_train)
+
+        # Verify unpadded data.
+        _, runtime_value = model.predict(x_train)
+        if tf.test.is_gpu_available():
+            self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU)
+        else:
+            self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU)
+
+        # Update x/y to be right padded by setting the last timestep to 0.
+        x_train[:, -1, :] = 0
+        y_train[:, -1] = 0
+        _, runtime_value = model.predict(x_train)
+        if tf.test.is_gpu_available():
+            self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU)
+        else:
+            self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU)
+
+        # Further update x/y to be mix padded (masks in the middle), and
+        # verify that only the cpu kernel can be selected.
+        x_train[:, -3, :] = 0
+        y_train[:, -3] = 0
+        _, runtime_value = model.predict(x_train)
+        self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU)
+
+    @test_utils.run_v2_only
+    def test_GRU_runtime_with_cond(self):
+        # This test demonstrates the grappler plugin's graph rewrite under
+        # the condition that the function returns a different number of
+        # internal states.
+        layer = keras.layers.GRU(self.rnn_state_size, return_runtime=True)
+
+        inputs = keras.layers.Input(
+            shape=[self.timestep, self.input_shape], dtype=tf.float32
+        )
+
+        zeros = tf.zeros([self.batch, self.output_shape])
+        dummy_runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN)
+        a = tf.constant(0)
+        b = tf.constant(1)
+        # Will always run the GRU layer.
+        outputs, runtime = tf.cond(
+            tf.less(a, b), lambda: layer(inputs), lambda: (zeros, dummy_runtime)
+        )
+
+        # Expand the runtime so that it is a 1D tensor instead of scalar.
+        # TF model does not work with scalar model output, especially during
+        # aggregation.
+        runtime = keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(
+            runtime
+        )
+        model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime])
+        self._test_runtime_with_model(model)
+
+
+@test_utils.run_all_without_tensor_float_32("RNN GRU can use TF32 on GPU")
class GRULayerGradientTapeTest(test_combinations.TestCase):
+    @test_combinations.generate(test_combinations.combine(mode=["eager"]))
+    def test_in_tape(self):
+        with self.test_session(config=_config):
+            time_steps = 10
+            embedding_size = 11
+            gru_unit_size = 12
-  @test_combinations.generate(test_combinations.combine(mode=['eager']))
-  def test_in_tape(self):
-    with self.test_session(config=_config):
-      time_steps = 10
-      embedding_size = 11
-      gru_unit_size = 12
-
-      gru_layer = keras.layers.GRU(
-          gru_unit_size,
-          return_sequences=True,
-          return_state=True,
-          recurrent_activation='sigmoid',
-          recurrent_initializer='glorot_uniform')
+            gru_layer = keras.layers.GRU(
+                gru_unit_size,
+                return_sequences=True,
+                return_state=True,
+                recurrent_activation="sigmoid",
+                recurrent_initializer="glorot_uniform",
+            )
-      x = tf.random.uniform([1, time_steps, embedding_size])
-      y = tf.random.uniform([1, gru_unit_size])
+            x = tf.random.uniform([1, time_steps, embedding_size])
+            y = tf.random.uniform([1, gru_unit_size])
-      with tf.GradientTape() as tape:
-        hidden_state = tf.zeros([1, gru_unit_size], dtype=tf.float32)
-        _, state = gru_layer(x, initial_state=hidden_state)
+            with tf.GradientTape() as tape:
+                hidden_state = tf.zeros([1, gru_unit_size], dtype=tf.float32)
+                _, state = gru_layer(x, initial_state=hidden_state)
-        loss = tf.reduce_mean(tf.square(state - y))
+                loss = tf.reduce_mean(tf.square(state - y))
-      tape.gradient(loss, gru_layer.variables)
+            tape.gradient(loss, gru_layer.variables)


@test_combinations.run_all_keras_modes
class GRULayerTest(test_combinations.TestCase):
-
-  def
test_return_sequences_gru(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'return_sequences': True}, - input_shape=(num_samples, timesteps, embedding_dim)) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Double type is not yet supported in ROCm') - @test_utils.run_v2_only - def test_float64_gru(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'return_sequences': True, - 'dtype': 'float64'}, - input_shape=(num_samples, timesteps, embedding_dim), - input_dtype='float64') - - def test_dynamic_behavior_gru(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer = keras.layers.GRU(units, input_shape=(None, embedding_dim)) - model = keras.models.Sequential() - model.add(layer) - model.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - x = np.random.random((num_samples, timesteps, embedding_dim)) - y = np.random.random((num_samples, units)) - model.train_on_batch(x, y) - - def test_dropout_gru(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'dropout': 0.1, - 'recurrent_dropout': 0.1}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_recurrent_dropout_with_implementation_restriction(self): - layer = keras.layers.GRU(2, recurrent_dropout=0.1, implementation=2) - # The implementation is force to 1 due to the limit of recurrent_dropout. - self.assertEqual(layer.implementation, 1) - - @parameterized.parameters([0, 1, 2]) - def test_implementation_mode_gru(self, implementation_mode): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.GRU, - kwargs={'units': units, - 'implementation': implementation_mode}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_reset_after_gru(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=num_samples, - test_samples=0, - input_shape=(timesteps, embedding_dim), - num_classes=units) - y_train = np_utils.to_categorical(y_train, units) - - inputs = keras.layers.Input(shape=[timesteps, embedding_dim]) - gru_layer = keras.layers.GRU(units, - reset_after=True) - output = gru_layer(inputs) - gru_model = keras.models.Model(inputs, output) - gru_model.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - gru_model.fit(x_train, y_train) - gru_model.predict(x_train) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='MIOpen only supports packed input output') - def test_with_masking_layer_gru(self): - layer_class = keras.layers.GRU - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - model.add(layer_class(units=5, return_sequences=True, unroll=False)) - model.compile( - loss='categorical_crossentropy', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='MIOpen only supports packed input output') - def 
test_statefulness_gru(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer_class = keras.layers.GRU - - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - 4, - embedding_dim, - mask_zero=True, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = layer_class( - units, return_sequences=False, stateful=True, weights=None) - model.add(layer) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - out1 = model.predict(np.ones((num_samples, timesteps))) - self.assertEqual(out1.shape, (num_samples, units)) - - # train once so that the states change - model.train_on_batch( - np.ones((num_samples, timesteps)), np.ones((num_samples, units))) - out2 = model.predict(np.ones((num_samples, timesteps))) - - # if the state is not reset, output should be different - self.assertNotEqual(out1.max(), out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out2.max(), out3.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones((num_samples, timesteps))) - np.testing.assert_allclose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out4.max(), out5.max()) - - # Check masking - layer.reset_states() - - left_padded_input = np.ones((num_samples, timesteps)) - left_padded_input[0, :1] = 0 - left_padded_input[1, :2] = 0 - out6 = model.predict(left_padded_input) - - layer.reset_states() - - right_padded_input = np.ones((num_samples, timesteps)) - right_padded_input[0, -1:] = 0 - right_padded_input[1, -2:] = 0 - out7 = model.predict(right_padded_input) - - np.testing.assert_allclose(out7, out6, atol=1e-5) - - def test_get_initial_states(self): - batch_size = 4 - cell = keras.layers.GRUCell(20) - initial_state = cell.get_initial_state( - batch_size=batch_size, dtype=tf.float32) - _, state = cell(np.ones((batch_size, 20), dtype=np.float32), initial_state) - self.assertEqual(state.shape, initial_state.shape) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_return_sequences_gru(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={"units": units, "return_sequences": True}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="Double type is not yet supported in ROCm", + ) + @test_utils.run_v2_only + def test_float64_gru(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={ + "units": units, + "return_sequences": True, + "dtype": "float64", + }, + input_shape=(num_samples, timesteps, embedding_dim), + input_dtype="float64", + ) + + def test_dynamic_behavior_gru(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer = keras.layers.GRU(units, input_shape=(None, embedding_dim)) + model = keras.models.Sequential() + model.add(layer) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + x = np.random.random((num_samples, timesteps, embedding_dim)) + y = np.random.random((num_samples, units)) + model.train_on_batch(x, y) + + def 
test_dropout_gru(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={"units": units, "dropout": 0.1, "recurrent_dropout": 0.1}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_recurrent_dropout_with_implementation_restriction(self): + layer = keras.layers.GRU(2, recurrent_dropout=0.1, implementation=2) + # The implementation is force to 1 due to the limit of + # recurrent_dropout. + self.assertEqual(layer.implementation, 1) + + @test_utils.run_v2_only + def test_dropout_variable_name(self): + layer = keras.layers.RNN( + keras.layers.GRUCell(2, dropout=0.1, force_generator=True) + ) + layer(np.random.random((2, 3, 4))) + self.assertEqual( + layer.cell._random_generator._generator._state_var.name, + "rnn/gru_cell/StateVar:0", + ) + + layer = keras.layers.GRU(2, dropout=0.1, force_generator=True) + layer(np.random.random((2, 3, 4))) + self.assertEqual( + layer._random_generator._generator._state_var.name, + "gru/StateVar:0", + ) + + @parameterized.parameters([0, 1, 2]) + def test_implementation_mode_gru(self, implementation_mode): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.GRU, + kwargs={"units": units, "implementation": implementation_mode}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_reset_after_gru(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=num_samples, + test_samples=0, + input_shape=(timesteps, embedding_dim), + num_classes=units, + ) + y_train = np_utils.to_categorical(y_train, units) + + inputs = keras.layers.Input(shape=[timesteps, embedding_dim]) + gru_layer = keras.layers.GRU(units, reset_after=True) + output = gru_layer(inputs) + gru_model = keras.models.Model(inputs, output) + gru_model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + gru_model.fit(x_train, y_train) + gru_model.predict(x_train) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="MIOpen only supports packed input output", + ) + def test_with_masking_layer_gru(self): + layer_class = keras.layers.GRU + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + model.add(layer_class(units=5, return_sequences=True, unroll=False)) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="MIOpen only supports packed input output", + ) + def test_statefulness_gru(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer_class = keras.layers.GRU + + model = keras.models.Sequential() + model.add( + keras.layers.Embedding( + 4, + embedding_dim, + mask_zero=True, + input_length=timesteps, + batch_input_shape=(num_samples, timesteps), + ) + ) + layer = layer_class( + units, return_sequences=False, stateful=True, weights=None + ) + model.add(layer) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + out1 = model.predict(np.ones((num_samples, timesteps))) + self.assertEqual(out1.shape, (num_samples, units)) + + # train once so 
that the states change + model.train_on_batch( + np.ones((num_samples, timesteps)), np.ones((num_samples, units)) + ) + out2 = model.predict(np.ones((num_samples, timesteps))) + + # if the state is not reset, output should be different + self.assertNotEqual(out1.max(), out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out2.max(), out3.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones((num_samples, timesteps))) + np.testing.assert_allclose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out4.max(), out5.max()) + + # Check masking + layer.reset_states() + + left_padded_input = np.ones((num_samples, timesteps)) + left_padded_input[0, :1] = 0 + left_padded_input[1, :2] = 0 + out6 = model.predict(left_padded_input) + + layer.reset_states() + + right_padded_input = np.ones((num_samples, timesteps)) + right_padded_input[0, -1:] = 0 + right_padded_input[1, -2:] = 0 + out7 = model.predict(right_padded_input) + + np.testing.assert_allclose(out7, out6, atol=1e-5) + + def test_get_initial_states(self): + batch_size = 4 + cell = keras.layers.GRUCell(20) + initial_state = cell.get_initial_state( + batch_size=batch_size, dtype=tf.float32 + ) + _, state = cell( + np.ones((batch_size, 20), dtype=np.float32), initial_state + ) + self.assertEqual(state.shape, initial_state.shape) + + @test_utils.run_v2_only + def test_cloned_weight_names(self): + inp = keras.Input([None, 3]) + rnn = keras.layers.GRU(units=3) + model = keras.Model(inp, rnn(inp)) + clone = keras.models.clone_model(model) + + model_names = [x.name for x in model.weights] + clone_names = [x.name for x in clone.weights] + self.assertEqual(model_names, clone_names) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class GRULayerGenericTest(tf.test.TestCase): - - def test_constraints_gru(self): - embedding_dim = 4 - layer_class = keras.layers.GRU - k_constraint = keras.constraints.max_norm(0.01) - r_constraint = keras.constraints.max_norm(0.01) - b_constraint = keras.constraints.max_norm(0.01) - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_constraint=k_constraint, - recurrent_constraint=r_constraint, - bias_constraint=b_constraint) - layer.build((None, None, embedding_dim)) - self.assertEqual(layer.cell.kernel.constraint, k_constraint) - self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) - self.assertEqual(layer.cell.bias.constraint, b_constraint) - - def test_from_config_gru(self): - layer_class = keras.layers.GRU - for stateful in (False, True): - l1 = layer_class(units=1, stateful=stateful) - l2 = layer_class.from_config(l1.get_config()) - assert l1.get_config() == l2.get_config() - - def test_deep_copy_gru(self): - cell = keras.layers.GRUCell(5) - copied_cell = copy.deepcopy(cell) - self.assertEqual(copied_cell.units, 5) - self.assertEqual(cell.get_config(), copied_cell.get_config()) - - def test_regularizers_gru(self): - embedding_dim = 4 - layer_class = keras.layers.GRU - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_regularizer=keras.regularizers.l1(0.01), - recurrent_regularizer=keras.regularizers.l1(0.01), - 
bias_regularizer='l2', - activity_regularizer='l1') - layer.build((None, None, 2)) - self.assertLen(layer.losses, 3) - - x = keras.backend.variable(np.ones((2, 3, 2))) - layer(x) - if tf.executing_eagerly(): - self.assertLen(layer.losses, 4) - else: - self.assertLen(layer.get_losses_for(x), 1) - - -if __name__ == '__main__': - tf.test.main() + def test_constraints_gru(self): + embedding_dim = 4 + layer_class = keras.layers.GRU + k_constraint = keras.constraints.max_norm(0.01) + r_constraint = keras.constraints.max_norm(0.01) + b_constraint = keras.constraints.max_norm(0.01) + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_constraint=k_constraint, + recurrent_constraint=r_constraint, + bias_constraint=b_constraint, + ) + layer.build((None, None, embedding_dim)) + self.assertEqual(layer.cell.kernel.constraint, k_constraint) + self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) + self.assertEqual(layer.cell.bias.constraint, b_constraint) + + def test_from_config_gru(self): + layer_class = keras.layers.GRU + for stateful in (False, True): + l1 = layer_class(units=1, stateful=stateful) + l2 = layer_class.from_config(l1.get_config()) + assert l1.get_config() == l2.get_config() + + def test_deep_copy_gru(self): + cell = keras.layers.GRUCell(5) + copied_cell = copy.deepcopy(cell) + self.assertEqual(copied_cell.units, 5) + self.assertEqual(cell.get_config(), copied_cell.get_config()) + + def test_regularizers_gru(self): + embedding_dim = 4 + layer_class = keras.layers.GRU + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_regularizer=keras.regularizers.l1(0.01), + recurrent_regularizer=keras.regularizers.l1(0.01), + bias_regularizer="l2", + activity_regularizer="l1", + ) + layer.build((None, None, 2)) + self.assertLen(layer.losses, 3) + + x = keras.backend.variable(np.ones((2, 3, 2))) + layer(x) + if tf.executing_eagerly(): + self.assertLen(layer.losses, 4) + else: + self.assertLen(layer.get_losses_for(x), 1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/gru_v1.py b/keras/layers/rnn/gru_v1.py index eba9493c2f6f..f6b458c6f8f1 100644 --- a/keras/layers/rnn/gru_v1.py +++ b/keras/layers/rnn/gru_v1.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Gated Recurrent Unit V1 layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import activations from keras import constraints @@ -24,372 +24,381 @@ from keras.layers.rnn import rnn_utils from keras.layers.rnn.base_rnn import RNN +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -@keras_export(v1=['keras.layers.GRUCell']) +@keras_export(v1=["keras.layers.GRUCell"]) class GRUCell(gru.GRUCell): - """Cell class for the GRU layer. - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). - If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use - for the recurrent step. - Default: hard sigmoid (`hard_sigmoid`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. 
- kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, - used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - kernel_constraint: Constraint function applied to - the `kernel` weights matrix. - recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. - dropout: Float between 0 and 1. - Fraction of the units to drop for the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. - reset_after: GRU convention (whether to apply reset gate after or - before matrix multiplication). False = "before" (default), - True = "after" (cuDNN compatible). - - Call arguments: - inputs: A 2D tensor. - states: List of state tensors corresponding to the previous timestep. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. Only relevant when `dropout` or - `recurrent_dropout` is used. - """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - reset_after=False, - **kwargs): - super().__init__( + """Cell class for the GRU layer. + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). + If you pass None, no activation is applied + (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use + for the recurrent step. + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, + used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + recurrent_regularizer: Regularizer function applied to + the `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + recurrent_constraint: Constraint function applied to + the `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + dropout: Float between 0 and 1. Fraction of the units to drop for the + linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. 
+ Fraction of the units to drop for + the linear transformation of the recurrent state. + reset_after: GRU convention (whether to apply reset gate after or + before matrix multiplication). False = "before" (default), + True = "after" (cuDNN compatible). + + Call arguments: + inputs: A 2D tensor. + states: List of state tensors corresponding to the previous timestep. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. Only relevant when `dropout` or + `recurrent_dropout` is used. + """ + + def __init__( + self, units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=kwargs.pop('implementation', 1), - reset_after=reset_after, - **kwargs) - - -@keras_export(v1=['keras.layers.GRU']) + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + reset_after=False, + **kwargs + ): + super().__init__( + units, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + implementation=kwargs.pop("implementation", 1), + reset_after=reset_after, + **kwargs + ) + + +@keras_export(v1=["keras.layers.GRU"]) class GRU(RNN): - """Gated Recurrent Unit - Cho et al. 2014. - - There are two variants. The default one is based on 1406.1078v3 and - has reset gate applied to hidden state before matrix multiplication. The - other one is based on original 1406.1078v1 and has the order reversed. - - The second variant is compatible with CuDNNGRU (GPU-only) and allows - inference on CPU. Thus it has separate biases for `kernel` and - `recurrent_kernel`. Use `'reset_after'=True` and - `recurrent_activation='sigmoid'`. - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use - for the recurrent step. - Default: hard sigmoid (`hard_sigmoid`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. 
- recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation").. - kernel_constraint: Constraint function applied to - the `kernel` weights matrix. - recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. - dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. - return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. - return_state: Boolean. Whether to return the last state - in addition to the output. - go_backwards: Boolean (default False). - If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default False). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - unroll: Boolean (default False). - If True, the network will be unrolled, - else a symbolic loop will be used. - Unrolling can speed-up a RNN, - although it tends to be more memory-intensive. - Unrolling is only suitable for short sequences. - time_major: The shape format of the `inputs` and `outputs` tensors. - If True, the inputs and outputs will be in shape - `(timesteps, batch, ...)`, whereas in the False case, it will be - `(batch, timesteps, ...)`. Using `time_major = True` is a bit more - efficient because it avoids transposes at the beginning and end of the - RNN calculation. However, most TensorFlow data is batch-major, so by - default this function accepts input and emits output in batch-major - form. - reset_after: GRU convention (whether to apply reset gate after or - before matrix multiplication). False = "before" (default), - True = "after" (cuDNN compatible). - - Call arguments: - inputs: A 3D tensor. - mask: Binary tensor of shape `(samples, timesteps)` indicating whether - a given timestep should be masked. An individual `True` entry indicates - that the corresponding timestep should be utilized, while a `False` - entry indicates that the corresponding timestep should be ignored. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is only relevant if `dropout` or - `recurrent_dropout` is used. - initial_state: List of initial state tensors to be passed to the first - call of the cell. 
- """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - reset_after=False, - **kwargs): - implementation = kwargs.pop('implementation', 1) - if implementation == 0: - logging.warning('`implementation=0` has been deprecated, ' - 'and now defaults to `implementation=1`.' - 'Please update your layer call.') - if 'enable_caching_device' in kwargs: - cell_kwargs = {'enable_caching_device': - kwargs.pop('enable_caching_device')} - else: - cell_kwargs = {} - cell = GRUCell( + """Gated Recurrent Unit - Cho et al. 2014. + + There are two variants. The default one is based on 1406.1078v3 and + has reset gate applied to hidden state before matrix multiplication. The + other one is based on original 1406.1078v1 and has the order reversed. + + The second variant is compatible with CuDNNGRU (GPU-only) and allows + inference on CPU. Thus it has separate biases for `kernel` and + `recurrent_kernel`. Use `'reset_after'=True` and + `recurrent_activation='sigmoid'`. + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use + for the recurrent step. + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + recurrent_regularizer: Regularizer function applied to + the `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation").. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + recurrent_constraint: Constraint function applied to + the `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the recurrent state. + return_sequences: Boolean. Whether to return the last output + in the output sequence, or the full sequence. + return_state: Boolean. Whether to return the last state + in addition to the output. + go_backwards: Boolean (default False). + If True, process the input sequence backwards and return the + reversed sequence. + stateful: Boolean (default False). 
If True, the last state + for each sample at index i in a batch will be used as initial + state for the sample of index i in the following batch. + unroll: Boolean (default False). + If True, the network will be unrolled, + else a symbolic loop will be used. + Unrolling can speed-up a RNN, + although it tends to be more memory-intensive. + Unrolling is only suitable for short sequences. + time_major: The shape format of the `inputs` and `outputs` tensors. + If True, the inputs and outputs will be in shape + `(timesteps, batch, ...)`, whereas in the False case, it will be + `(batch, timesteps, ...)`. Using `time_major = True` is a bit more + efficient because it avoids transposes at the beginning and end of the + RNN calculation. However, most TensorFlow data is batch-major, so by + default this function accepts input and emits output in batch-major + form. + reset_after: GRU convention (whether to apply reset gate after or + before matrix multiplication). False = "before" (default), + True = "after" (cuDNN compatible). + + Call arguments: + inputs: A 3D tensor. + mask: Binary tensor of shape `(samples, timesteps)` indicating whether + a given timestep should be masked. An individual `True` entry indicates + that the corresponding timestep should be utilized, while a `False` + entry indicates that the corresponding timestep should be ignored. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is only relevant if `dropout` or + `recurrent_dropout` is used. + initial_state: List of initial state tensors to be passed to the first + call of the cell. + """ + + def __init__( + self, units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=implementation, - reset_after=reset_after, - dtype=kwargs.get('dtype'), - trainable=kwargs.get('trainable', True), - **cell_kwargs) - super().__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - unroll=unroll, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.input_spec = [InputSpec(ndim=3)] - - def call(self, inputs, mask=None, training=None, initial_state=None): - return super().call( - inputs, mask=mask, training=training, initial_state=initial_state) - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def recurrent_activation(self): - return self.cell.recurrent_activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - 
- @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - - @property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - @property - def implementation(self): - return self.cell.implementation - - @property - def reset_after(self): - return self.cell.reset_after - - def get_config(self): - config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'recurrent_activation': - activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout, - 'implementation': - self.implementation, - 'reset_after': - self.reset_after - } - config.update(rnn_utils.config_for_enable_caching_device(self.cell)) - base_config = super().get_config() - del base_config['cell'] - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - if 'implementation' in config and config['implementation'] == 0: - config['implementation'] = 1 - return cls(**config) + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + unroll=False, + reset_after=False, + **kwargs + ): + implementation = kwargs.pop("implementation", 1) + if implementation == 0: + logging.warning( + "`implementation=0` has been deprecated, " + "and now defaults to `implementation=1`." + "Please update your layer call." 
+ ) + if "enable_caching_device" in kwargs: + cell_kwargs = { + "enable_caching_device": kwargs.pop("enable_caching_device") + } + else: + cell_kwargs = {} + cell = GRUCell( + units, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + implementation=implementation, + reset_after=reset_after, + dtype=kwargs.get("dtype"), + trainable=kwargs.get("trainable", True), + name="gru_cell", + **cell_kwargs + ) + super().__init__( + cell, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + unroll=unroll, + **kwargs + ) + self.activity_regularizer = regularizers.get(activity_regularizer) + self.input_spec = [InputSpec(ndim=3)] + + def call(self, inputs, mask=None, training=None, initial_state=None): + return super().call( + inputs, mask=mask, training=training, initial_state=initial_state + ) + + @property + def units(self): + return self.cell.units + + @property + def activation(self): + return self.cell.activation + + @property + def recurrent_activation(self): + return self.cell.recurrent_activation + + @property + def use_bias(self): + return self.cell.use_bias + + @property + def kernel_initializer(self): + return self.cell.kernel_initializer + + @property + def recurrent_initializer(self): + return self.cell.recurrent_initializer + + @property + def bias_initializer(self): + return self.cell.bias_initializer + + @property + def kernel_regularizer(self): + return self.cell.kernel_regularizer + + @property + def recurrent_regularizer(self): + return self.cell.recurrent_regularizer + + @property + def bias_regularizer(self): + return self.cell.bias_regularizer + + @property + def kernel_constraint(self): + return self.cell.kernel_constraint + + @property + def recurrent_constraint(self): + return self.cell.recurrent_constraint + + @property + def bias_constraint(self): + return self.cell.bias_constraint + + @property + def dropout(self): + return self.cell.dropout + + @property + def recurrent_dropout(self): + return self.cell.recurrent_dropout + + @property + def implementation(self): + return self.cell.implementation + + @property + def reset_after(self): + return self.cell.reset_after + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + 
self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + "implementation": self.implementation, + "reset_after": self.reset_after, + } + config.update(rnn_utils.config_for_enable_caching_device(self.cell)) + base_config = super().get_config() + del base_config["cell"] + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + if "implementation" in config and config["implementation"] == 0: + config["implementation"] = 1 + return cls(**config) diff --git a/keras/layers/rnn/gru_v1_test.py b/keras/layers/rnn/gru_v1_test.py index 88df22c88a1b..84f6e375f859 100644 --- a/keras/layers/rnn/gru_v1_test.py +++ b/keras/layers/rnn/gru_v1_test.py @@ -15,18 +15,17 @@ """Tests for GRU V1 layer.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized +from tensorflow.core.protobuf import rewriter_config_pb2 + import keras from keras.layers.rnn import gru from keras.layers.rnn import gru_v1 from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.core.protobuf import rewriter_config_pb2 - # Global config for grappler setting that is used for graph mode test. _rewrites = rewriter_config_pb2.RewriterConfig() @@ -36,125 +35,136 @@ _config = tf.compat.v1.ConfigProto(graph_options=_graph_options) -@test_utils.run_all_without_tensor_float_32('RNN GRU can use TF32 on GPU') +@test_utils.run_all_without_tensor_float_32("RNN GRU can use TF32 on GPU") @test_combinations.run_all_keras_modes(config=_config) class GRUGraphRewriteTest(test_combinations.TestCase): - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_gru_feature_parity_v1_v2(self): - input_shape = 10 - rnn_state_size = 8 - timestep = 4 - batch = 20 - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=rnn_state_size, - random_seed=87654321) - y_train = np_utils.to_categorical(y_train, rnn_state_size) - # For the last batch item of the test data, we filter out the last - # timestep to simulate the variable length sequence and masking test. 
- x_train[-2:, -1, :] = 0.0 - y_train[-2:] = 0 - - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - masked_input = keras.layers.Masking()(inputs) - gru_layer = gru_v1.GRU(rnn_state_size, - recurrent_activation='sigmoid', - reset_after=True) - output = gru_layer(masked_input) - gru_model = keras.models.Model(inputs, output) - weights = gru_model.get_weights() - y_1 = gru_model.predict(x_train) - gru_model.compile('rmsprop', 'mse') - gru_model.fit(x_train, y_train) - y_2 = gru_model.predict(x_train) - - with test_utils.device(should_use_gpu=True): - cudnn_layer = gru.GRU(rnn_state_size, - recurrent_activation='sigmoid', - reset_after=True) - cudnn_model = keras.models.Model(inputs, cudnn_layer(masked_input)) - cudnn_model.set_weights(weights) - y_3 = cudnn_model.predict(x_train) - cudnn_model.compile('rmsprop', 'mse') - cudnn_model.fit(x_train, y_train) - y_4 = cudnn_model.predict(x_train) - - self.assertAllClose(y_1, y_3, rtol=2e-5, atol=2e-5) - self.assertAllClose(y_2, y_4, rtol=2e-5, atol=2e-5) - - @parameterized.named_parameters( - # test_name, time_major, go_backwards - ('normal', False, False), - ('time_major', True, False), - ('go_backwards', False, True), - ('both', True, True), - ) - def test_time_major_and_go_backward_v1_v2(self, time_major, go_backwards): - input_shape = 10 - rnn_state_size = 8 - timestep = 4 - batch = 100 - - x_train = np.random.random((batch, timestep, input_shape)) - - def build_model(layer_cls): - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - layer = layer_cls(rnn_state_size, - recurrent_activation='sigmoid', - time_major=time_major, - return_sequences=True, - go_backwards=go_backwards, - reset_after=True) - if time_major: - converted_input = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(inputs) - outputs = layer(converted_input) - outputs = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(outputs) - else: - outputs = layer(inputs) - return keras.models.Model(inputs, outputs) - - gru_model = build_model(gru_v1.GRU) - y_ref = gru_model.predict(x_train) - weights = gru_model.get_weights() - - gru_v2_model = build_model(gru.GRU) - gru_v2_model.set_weights(weights) - y = gru_v2_model.predict(x_train) - - self.assertAllClose(y, y_ref) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_explicit_device_with_go_backward_and_mask_v1(self): - batch_size = 8 - timestep = 7 - masksteps = 5 - units = 4 - - inputs = np.random.randn(batch_size, timestep, units).astype(np.float32) - mask = np.ones((batch_size, timestep)).astype(np.bool) - mask[:, masksteps:] = 0 - - gru_layer = gru_v1.GRU( - units, return_sequences=True, go_backwards=True) - with test_utils.device(should_use_gpu=True): - outputs_masked = gru_layer(inputs, mask=tf.constant(mask)) - outputs_trimmed = gru_layer(inputs[:, :masksteps]) - self.assertAllClose(outputs_masked[:, -masksteps:], outputs_trimmed) - - -if __name__ == '__main__': - tf.test.main() + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+ ), + ) + @test_utils.run_v2_only + def test_gru_feature_parity_v1_v2(self): + input_shape = 10 + rnn_state_size = 8 + timestep = 4 + batch = 20 + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=rnn_state_size, + random_seed=87654321, + ) + y_train = np_utils.to_categorical(y_train, rnn_state_size) + # For the last batch item of the test data, we filter out the last + # timestep to simulate the variable length sequence and masking test. + x_train[-2:, -1, :] = 0.0 + y_train[-2:] = 0 + + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) + masked_input = keras.layers.Masking()(inputs) + gru_layer = gru_v1.GRU( + rnn_state_size, recurrent_activation="sigmoid", reset_after=True + ) + output = gru_layer(masked_input) + gru_model = keras.models.Model(inputs, output) + weights = gru_model.get_weights() + y_1 = gru_model.predict(x_train) + gru_model.compile("rmsprop", "mse") + gru_model.fit(x_train, y_train) + y_2 = gru_model.predict(x_train) + + with test_utils.device(should_use_gpu=True): + cudnn_layer = gru.GRU( + rnn_state_size, recurrent_activation="sigmoid", reset_after=True + ) + cudnn_model = keras.models.Model(inputs, cudnn_layer(masked_input)) + cudnn_model.set_weights(weights) + y_3 = cudnn_model.predict(x_train) + cudnn_model.compile("rmsprop", "mse") + cudnn_model.fit(x_train, y_train) + y_4 = cudnn_model.predict(x_train) + + self.assertAllClose(y_1, y_3, rtol=2e-5, atol=2e-5) + self.assertAllClose(y_2, y_4, rtol=2e-5, atol=2e-5) + + @parameterized.named_parameters( + # test_name, time_major, go_backwards + ("normal", False, False), + ("time_major", True, False), + ("go_backwards", False, True), + ("both", True, True), + ) + def test_time_major_and_go_backward_v1_v2(self, time_major, go_backwards): + input_shape = 10 + rnn_state_size = 8 + timestep = 4 + batch = 100 + + x_train = np.random.random((batch, timestep, input_shape)) + + def build_model(layer_cls): + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) + layer = layer_cls( + rnn_state_size, + recurrent_activation="sigmoid", + time_major=time_major, + return_sequences=True, + go_backwards=go_backwards, + reset_after=True, + ) + if time_major: + converted_input = keras.layers.Lambda( + lambda t: tf.transpose(t, [1, 0, 2]) + )(inputs) + outputs = layer(converted_input) + outputs = keras.layers.Lambda( + lambda t: tf.transpose(t, [1, 0, 2]) + )(outputs) + else: + outputs = layer(inputs) + return keras.models.Model(inputs, outputs) + + gru_model = build_model(gru_v1.GRU) + y_ref = gru_model.predict(x_train) + weights = gru_model.get_weights() + + gru_v2_model = build_model(gru.GRU) + gru_v2_model.set_weights(weights) + y = gru_v2_model.predict(x_train) + + self.assertAllClose(y, y_ref) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+ ), + ) + @test_utils.run_v2_only + def test_explicit_device_with_go_backward_and_mask_v1(self): + batch_size = 8 + timestep = 7 + masksteps = 5 + units = 4 + + inputs = np.random.randn(batch_size, timestep, units).astype(np.float32) + mask = np.ones((batch_size, timestep)).astype(bool) + mask[:, masksteps:] = 0 + + gru_layer = gru_v1.GRU(units, return_sequences=True, go_backwards=True) + with test_utils.device(should_use_gpu=True): + outputs_masked = gru_layer(inputs, mask=tf.constant(mask)) + outputs_trimmed = gru_layer(inputs[:, :masksteps]) + self.assertAllClose(outputs_masked[:, -masksteps:], outputs_trimmed) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/legacy_cell_wrappers.py b/keras/layers/rnn/legacy_cell_wrappers.py index 4847c73e1887..ebdbd399c63a 100644 --- a/keras/layers/rnn/legacy_cell_wrappers.py +++ b/keras/layers/rnn/legacy_cell_wrappers.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Module implementing the V1 version of RNN cell wrappers.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from __future__ import absolute_import from __future__ import division @@ -22,549 +22,647 @@ import hashlib import numbers +import tensorflow.compat.v2 as tf + from keras.layers.rnn.cell_wrappers import _enumerated_map_structure_up_to from keras.layers.rnn.cell_wrappers import _parse_config_to_function from keras.layers.rnn.cell_wrappers import _serialize_function_to_config from keras.layers.rnn.legacy_cells import RNNCell -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export from tensorflow.python.util.tf_export import tf_export - # This can be used with self.assertRaisesRegexp for assert_like_rnncell. ASSERT_LIKE_RNNCELL_ERROR_REGEXP = "is not an RNNCell" def _hasattr(obj, attr_name): - try: - getattr(obj, attr_name) - except AttributeError: - return False - else: - return True + try: + getattr(obj, attr_name) + except AttributeError: + return False + else: + return True def assert_like_rnncell(cell_name, cell): - """Raises a TypeError if cell is not like an RNNCell. - - NOTE: Do not rely on the error message (in particular in tests) which can be - subject to change to increase readability. Use - ASSERT_LIKE_RNNCELL_ERROR_REGEXP. - - Args: - cell_name: A string to give a meaningful error referencing to the name of - the functionargument. - cell: The object which should behave like an RNNCell. - - Raises: - TypeError: A human-friendly exception. - """ - conditions = [ - _hasattr(cell, "output_size"), - _hasattr(cell, "state_size"), - _hasattr(cell, "get_initial_state") or _hasattr(cell, "zero_state"), - callable(cell), - ] - errors = [ - "'output_size' property is missing", "'state_size' property is missing", - "either 'zero_state' or 'get_initial_state' method is required", - "is not callable" - ] - - if not all(conditions): - - errors = [error for error, cond in zip(errors, conditions) if not cond] - raise TypeError("The argument {!r} ({}) is not an RNNCell: {}.".format( - cell_name, cell, ", ".join(errors))) - - -class _RNNCellWrapperV1(RNNCell): - """Base class for cells wrappers V1 compatibility. - - This class along with `_RNNCellWrapperV2` allows to define cells wrappers that - are compatible with V1 and V2, and defines helper methods for this purpose. 
-  """
-
-  def __init__(self, cell, *args, **kwargs):
-    super().__init__(*args, **kwargs)
-    assert_like_rnncell("cell", cell)
-    self.cell = cell
-    if isinstance(cell, tf.__internal__.tracking.Trackable):
-      self._track_trackable(self.cell, name="cell")
-
-  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
-    """Calls the wrapped cell and performs the wrapping logic.
+    """Raises a TypeError if cell is not like an RNNCell.

-    This method is called from the wrapper's `call` or `__call__` methods.
+    NOTE: Do not rely on the error message (in particular in tests) which can be
+    subject to change to increase readability. Use
+    ASSERT_LIKE_RNNCELL_ERROR_REGEXP.

     Args:
-      inputs: A tensor with wrapped cell's input.
-      state: A tensor or tuple of tensors with wrapped cell's state.
-      cell_call_fn: Wrapped cell's method to use for step computation (cell's
-        `__call__` or 'call' method).
-      **kwargs: Additional arguments.
-
-    Returns:
-      A pair containing:
-      - Output: A tensor with cell's output.
-      - New state: A tensor or tuple of tensors with new wrapped cell's state.
-    """
-    raise NotImplementedError
-
-  def __call__(self, inputs, state, scope=None):
-    """Runs the RNN cell step computation.
+      cell_name: A string to give a meaningful error referencing the name of
+        the function argument.
+      cell: The object which should behave like an RNNCell.

-    We assume that the wrapped RNNCell is being built within its `__call__`
-    method. We directly use the wrapped cell's `__call__` in the overridden
-    wrapper `__call__` method.
-
-    This allows to use the wrapped cell and the non-wrapped cell equivalently
-    when using `__call__`.
+    Raises:
+      TypeError: A human-friendly exception.
+    """
+    conditions = [
+        _hasattr(cell, "output_size"),
+        _hasattr(cell, "state_size"),
+        _hasattr(cell, "get_initial_state") or _hasattr(cell, "zero_state"),
+        callable(cell),
+    ]
+    errors = [
+        "'output_size' property is missing",
+        "'state_size' property is missing",
+        "either 'zero_state' or 'get_initial_state' method is required",
+        "is not callable",
+    ]
+
+    if not all(conditions):
+
+        errors = [error for error, cond in zip(errors, conditions) if not cond]
+        raise TypeError(
+            "The argument {!r} ({}) is not an RNNCell: {}.".format(
+                cell_name, cell, ", ".join(errors)
+            )
+        )

-    Args:
-      inputs: A tensor with wrapped cell's input.
-      state: A tensor or tuple of tensors with wrapped cell's state.
-      scope: VariableScope for the subgraph created in the wrapped cells'
-        `__call__`.
-    Returns:
-      A pair containing:
+class _RNNCellWrapperV1(RNNCell):
+    """Base class for cell wrappers, for V1 compatibility.

-    - Output: A tensor with cell's output.
-    - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    This class along with `_RNNCellWrapperV2` allows defining cell wrappers
+    that are compatible with V1 and V2, and defines helper methods for this
+    purpose.
""" - return self._call_wrapped_cell( - inputs, state, cell_call_fn=self.cell.__call__, scope=scope) - - @property - def state_size(self): - return self.cell.state_size - - @property - def output_size(self): - return self.cell.output_size - - def zero_state(self, batch_size, dtype): - with tf.name_scope(type(self).__name__ + "ZeroState"): - return self.cell.zero_state(batch_size, dtype) - - def get_config(self): - config = { - "cell": { - "class_name": self.cell.__class__.__name__, - "config": self.cell.get_config() - }, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - config = config.copy() - cell = config.pop("cell") - try: - assert_like_rnncell("cell", cell) - return cls(cell, **config) - except TypeError: - raise ValueError("RNNCellWrapper cannot reconstruct the wrapped cell. " - "Please overwrite the cell in the config with a RNNCell " - "instance.") + + def __init__(self, cell, *args, **kwargs): + super().__init__(*args, **kwargs) + assert_like_rnncell("cell", cell) + self.cell = cell + if isinstance(cell, tf.__internal__.tracking.Trackable): + self._track_trackable(self.cell, name="cell") + + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): + """Calls the wrapped cell and performs the wrapping logic. + + This method is called from the wrapper's `call` or `__call__` methods. + + Args: + inputs: A tensor with wrapped cell's input. + state: A tensor or tuple of tensors with wrapped cell's state. + cell_call_fn: Wrapped cell's method to use for step computation + (cell's `__call__` or 'call' method). + **kwargs: Additional arguments. + + Returns: + A pair containing: + - Output: A tensor with cell's output. + - New state: A tensor or tuple of tensors with new wrapped cell's + state. + """ + raise NotImplementedError + + def __call__(self, inputs, state, scope=None): + """Runs the RNN cell step computation. + + We assume that the wrapped RNNCell is being built within its `__call__` + method. We directly use the wrapped cell's `__call__` in the overridden + wrapper `__call__` method. + + This allows to use the wrapped cell and the non-wrapped cell + equivalently when using `__call__`. + + Args: + inputs: A tensor with wrapped cell's input. + state: A tensor or tuple of tensors with wrapped cell's state. + scope: VariableScope for the subgraph created in the wrapped cells' + `__call__`. + + Returns: + A pair containing: + + - Output: A tensor with cell's output. + - New state: A tensor or tuple of tensors with new wrapped cell's + state. 
+ """ + return self._call_wrapped_cell( + inputs, state, cell_call_fn=self.cell.__call__, scope=scope + ) + + @property + def state_size(self): + return self.cell.state_size + + @property + def output_size(self): + return self.cell.output_size + + def zero_state(self, batch_size, dtype): + with tf.name_scope(type(self).__name__ + "ZeroState"): + return self.cell.zero_state(batch_size, dtype) + + def get_config(self): + config = { + "cell": { + "class_name": self.cell.__class__.__name__, + "config": self.cell.get_config(), + }, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() + cell = config.pop("cell") + try: + assert_like_rnncell("cell", cell) + return cls(cell, **config) + except TypeError: + raise ValueError( + "RNNCellWrapper cannot reconstruct the wrapped cell. " + "Please overwrite the cell in the config with a RNNCell " + "instance." + ) @keras_export(v1=["keras.__internal__.legacy.rnn_cell.DropoutWrapper"]) @tf_export(v1=["nn.rnn_cell.DropoutWrapper"]) class DropoutWrapper(_RNNCellWrapperV1): - """Operator adding dropout to inputs and outputs of the given cell.""" - - def __init__(self, - cell, - input_keep_prob=1.0, - output_keep_prob=1.0, - state_keep_prob=1.0, - variational_recurrent=False, - input_size=None, - dtype=None, - seed=None, - dropout_state_filter_visitor=None, - **kwargs): - """Create a cell with added input, state, and/or output dropout. - - If `variational_recurrent` is set to `True` (**NOT** the default behavior), - then the same dropout mask is applied at every step, as described in: - [A Theoretically Grounded Application of Dropout in Recurrent - Neural Networks. Y. Gal, Z. Ghahramani](https://arxiv.org/abs/1512.05287). - - Otherwise a different dropout mask is applied at every time step. - - Note, by default (unless a custom `dropout_state_filter` is provided), - the memory state (`c` component of any `LSTMStateTuple`) passing through - a `DropoutWrapper` is never modified. This behavior is described in the - above article. - - Args: - cell: an RNNCell, a projection to output_size is added to it. - input_keep_prob: unit Tensor or float between 0 and 1, input keep - probability; if it is constant and 1, no input dropout will be added. - output_keep_prob: unit Tensor or float between 0 and 1, output keep - probability; if it is constant and 1, no output dropout will be added. - state_keep_prob: unit Tensor or float between 0 and 1, output keep - probability; if it is constant and 1, no output dropout will be added. - State dropout is performed on the outgoing states of the cell. **Note** - the state components to which dropout is applied when `state_keep_prob` - is in `(0, 1)` are also determined by the argument - `dropout_state_filter_visitor` (e.g. by default dropout is never applied - to the `c` component of an `LSTMStateTuple`). - variational_recurrent: Python bool. If `True`, then the same dropout - pattern is applied across all time steps per run call. If this parameter - is set, `input_size` **must** be provided. - input_size: (optional) (possibly nested tuple of) `TensorShape` objects - containing the depth(s) of the input tensors expected to be passed in to - the `DropoutWrapper`. Required and used **iff** `variational_recurrent - = True` and `input_keep_prob < 1`. - dtype: (optional) The `dtype` of the input, state, and output tensors. - Required and used **iff** `variational_recurrent = True`. 
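What `variational_recurrent=True` buys, sketched in NumPy (illustrative only, not the wrapper's actual tensor code): one dropout mask is sampled per call and reused at every timestep, instead of a fresh mask per step.

import numpy as np

rng = np.random.default_rng(seed=0)
keep_prob, timesteps, features = 0.5, 3, 4

# Variational: sample once, reuse at every step (scaled by 1/keep_prob).
mask = (rng.random(features) < keep_prob) / keep_prob
variational = [mask for _ in range(timesteps)]

# Standard: an independent mask at every step.
standard = [
    (rng.random(features) < keep_prob) / keep_prob for _ in range(timesteps)
]

assert all((m == variational[0]).all() for m in variational)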
- seed: (optional) integer, the randomness seed. - dropout_state_filter_visitor: (optional), default: (see below). Function - that takes any hierarchical level of the state and returns a scalar or - depth=1 structure of Python booleans describing which terms in the state - should be dropped out. In addition, if the function returns `True`, - dropout is applied across this sublevel. If the function returns - `False`, dropout is not applied across this entire sublevel. - Default behavior: perform dropout on all terms except the memory (`c`) - state of `LSTMCellState` objects, and don't try to apply dropout to - `TensorArray` objects: ``` - def dropout_state_filter_visitor(s): - if isinstance(s, LSTMCellState): # Never perform dropout on the c - state. return LSTMCellState(c=False, h=True) - elif isinstance(s, TensorArray): return False return True ``` - **kwargs: dict of keyword arguments for base layer. - - Raises: - TypeError: if `cell` is not an `RNNCell`, or `keep_state_fn` is provided - but not `callable`. - ValueError: if any of the keep_probs are not between 0 and 1. - """ - super().__init__(cell, dtype=dtype, **kwargs) - - if (dropout_state_filter_visitor is not None and - not callable(dropout_state_filter_visitor)): - raise TypeError("dropout_state_filter_visitor must be callable. " - f"Received: {dropout_state_filter_visitor}") - self._dropout_state_filter = ( - dropout_state_filter_visitor or _default_dropout_state_filter_visitor) - with tf.name_scope("DropoutWrapperInit"): - - def tensor_and_const_value(v): - tensor_value = tf.convert_to_tensor(v) - const_value = tf.get_static_value(tensor_value) - return (tensor_value, const_value) - - for prob, attr in [(input_keep_prob, "input_keep_prob"), - (state_keep_prob, "state_keep_prob"), - (output_keep_prob, "output_keep_prob")]: - tensor_prob, const_prob = tensor_and_const_value(prob) - if const_prob is not None: - if const_prob < 0 or const_prob > 1: - raise ValueError(f"Parameter {attr} must be between 0 and 1. " - f"Received {const_prob}") - setattr(self, "_%s" % attr, float(const_prob)) + """Operator adding dropout to inputs and outputs of the given cell.""" + + def __init__( + self, + cell, + input_keep_prob=1.0, + output_keep_prob=1.0, + state_keep_prob=1.0, + variational_recurrent=False, + input_size=None, + dtype=None, + seed=None, + dropout_state_filter_visitor=None, + **kwargs, + ): + """Create a cell with added input, state, and/or output dropout. + + If `variational_recurrent` is set to `True` (**NOT** the default + behavior), then the same dropout mask is applied at every step, as + described in: [A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks. Y. Gal, Z. + Ghahramani](https://arxiv.org/abs/1512.05287). + + Otherwise a different dropout mask is applied at every time step. + + Note, by default (unless a custom `dropout_state_filter` is provided), + the memory state (`c` component of any `LSTMStateTuple`) passing through + a `DropoutWrapper` is never modified. This behavior is described in the + above article. + + Args: + cell: an RNNCell, a projection to output_size is added to it. + input_keep_prob: unit Tensor or float between 0 and 1, input keep + probability; if it is constant and 1, no input dropout will be + added. + output_keep_prob: unit Tensor or float between 0 and 1, output keep + probability; if it is constant and 1, no output dropout will be + added. 
+          state_keep_prob: unit Tensor or float between 0 and 1, state keep
+            probability; if it is constant and 1, no state dropout will be
+            added. State dropout is performed on the outgoing states of the
+            cell. **Note** the state components to which dropout is applied when
+            `state_keep_prob` is in `(0, 1)` are also determined by the argument
+            `dropout_state_filter_visitor` (e.g. by default dropout is never
+            applied to the `c` component of an `LSTMStateTuple`).
+          variational_recurrent: Python bool. If `True`, then the same dropout
+            pattern is applied across all time steps per run call. If this
+            parameter is set, `input_size` **must** be provided.
+          input_size: (optional) (possibly nested tuple of) `TensorShape`
+            objects containing the depth(s) of the input tensors expected to be
+            passed in to the `DropoutWrapper`. Required and used **iff**
+            `variational_recurrent = True` and `input_keep_prob < 1`.
+          dtype: (optional) The `dtype` of the input, state, and output tensors.
+            Required and used **iff** `variational_recurrent = True`.
+          seed: (optional) integer, the randomness seed.
+          dropout_state_filter_visitor: (optional), default: (see below).
+            Function that takes any hierarchical level of the state and returns
+            a scalar or depth=1 structure of Python booleans describing which
+            terms in the state should be dropped out. In addition, if the
+            function returns `True`, dropout is applied across this sublevel.
+            If the function returns `False`, dropout is not applied across this
+            entire sublevel. Default behavior: perform dropout on all terms
+            except the memory (`c`) state of `LSTMCellState` objects, and don't
+            try to apply dropout to `TensorArray` objects:
+            ```
+            def dropout_state_filter_visitor(s):
+                # Never perform dropout on the c state.
+                if isinstance(s, LSTMCellState):
+                    return LSTMCellState(c=False, h=True)
+                elif isinstance(s, TensorArray):
+                    return False
+                return True
+            ```
+          **kwargs: dict of keyword arguments for base layer.
+
+        Raises:
+          TypeError: if `cell` is not an `RNNCell`, or `keep_state_fn` is
+            provided but not `callable`.
+          ValueError: if any of the keep_probs are not between 0 and 1.
+        """
+        super().__init__(cell, dtype=dtype, **kwargs)
+
+        if dropout_state_filter_visitor is not None and not callable(
+            dropout_state_filter_visitor
+        ):
+            raise TypeError(
+                "dropout_state_filter_visitor must be callable. "
+                f"Received: {dropout_state_filter_visitor}"
+            )
+        self._dropout_state_filter = (
+            dropout_state_filter_visitor
+            or _default_dropout_state_filter_visitor
+        )
+        with tf.name_scope("DropoutWrapperInit"):
+
+            def tensor_and_const_value(v):
+                tensor_value = tf.convert_to_tensor(v)
+                const_value = tf.get_static_value(tensor_value)
+                return (tensor_value, const_value)
+
+            for prob, attr in [
+                (input_keep_prob, "input_keep_prob"),
+                (state_keep_prob, "state_keep_prob"),
+                (output_keep_prob, "output_keep_prob"),
+            ]:
+                tensor_prob, const_prob = tensor_and_const_value(prob)
+                if const_prob is not None:
+                    if const_prob < 0 or const_prob > 1:
+                        raise ValueError(
+                            f"Parameter {attr} must be between 0 and 1. 
" + f"Received {const_prob}" + ) + setattr(self, f"_{attr}", float(const_prob)) + else: + setattr(self, f"_{attr}", tensor_prob) + + # Set variational_recurrent, seed before running the code below + self._variational_recurrent = variational_recurrent + self._input_size = input_size + self._seed = seed + + self._recurrent_input_noise = None + self._recurrent_state_noise = None + self._recurrent_output_noise = None + + if variational_recurrent: + if dtype is None: + raise ValueError( + "When variational_recurrent=True, dtype must be provided" + ) + + def convert_to_batch_shape(s): + # Prepend a 1 for the batch dimension; for recurrent + # variational dropout we use the same dropout mask for all + # batch elements. + return tf.concat(([1], tf.TensorShape(s).as_list()), 0) + + def batch_noise(s, inner_seed): + shape = convert_to_batch_shape(s) + return tf.random.uniform(shape, seed=inner_seed, dtype=dtype) + + if ( + not isinstance(self._input_keep_prob, numbers.Real) + or self._input_keep_prob < 1.0 + ): + if input_size is None: + raise ValueError( + "When variational_recurrent=True and input_keep_prob " + "< 1.0 or is unknown, input_size must be provided" + ) + self._recurrent_input_noise = _enumerated_map_structure_up_to( + input_size, + lambda i, s: batch_noise( + s, inner_seed=self._gen_seed("input", i) + ), + input_size, + ) + self._recurrent_state_noise = _enumerated_map_structure_up_to( + cell.state_size, + lambda i, s: batch_noise( + s, inner_seed=self._gen_seed("state", i) + ), + cell.state_size, + ) + self._recurrent_output_noise = _enumerated_map_structure_up_to( + cell.output_size, + lambda i, s: batch_noise( + s, inner_seed=self._gen_seed("output", i) + ), + cell.output_size, + ) + + def _gen_seed(self, salt_prefix, index): + if self._seed is None: + return None + salt = "%s_%d" % (salt_prefix, index) + string = (str(self._seed) + salt).encode("utf-8") + return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF + + @property + def wrapped_cell(self): + return self.cell + + def build(self, inputs_shape): + self.cell.build(inputs_shape) + self.built = True + + def _variational_recurrent_dropout_value( + self, unused_index, value, noise, keep_prob + ): + """Performs dropout given the pre-calculated noise tensor.""" + # uniform [keep_prob, 1.0 + keep_prob) + random_tensor = keep_prob + noise + + # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) + binary_tensor = tf.floor(random_tensor) + ret = tf.divide(value, keep_prob) * binary_tensor + ret.set_shape(value.get_shape()) + return ret + + def _dropout( + self, + values, + salt_prefix, + recurrent_noise, + keep_prob, + shallow_filtered_substructure=None, + ): + """Decides whether to perform standard dropout or recurrent dropout.""" + + if shallow_filtered_substructure is None: + # Put something so we traverse the entire structure; inside the + # dropout function we check to see if leafs of this are bool or not. 
+ shallow_filtered_substructure = values + + if not self._variational_recurrent: + + def dropout(i, do_dropout, v): + if not isinstance(do_dropout, bool) or do_dropout: + return tf.nn.dropout( + v, + rate=1.0 - keep_prob, + seed=self._gen_seed(salt_prefix, i), + ) + else: + return v + + return _enumerated_map_structure_up_to( + shallow_filtered_substructure, + dropout, + *[shallow_filtered_substructure, values], + ) else: - setattr(self, "_%s" % attr, tensor_prob) - - # Set variational_recurrent, seed before running the code below - self._variational_recurrent = variational_recurrent - self._input_size = input_size - self._seed = seed - - self._recurrent_input_noise = None - self._recurrent_state_noise = None - self._recurrent_output_noise = None - - if variational_recurrent: - if dtype is None: - raise ValueError( - "When variational_recurrent=True, dtype must be provided") - - def convert_to_batch_shape(s): - # Prepend a 1 for the batch dimension; for recurrent - # variational dropout we use the same dropout mask for all - # batch elements. - return tf.concat(([1], tf.TensorShape(s).as_list()), 0) - - def batch_noise(s, inner_seed): - shape = convert_to_batch_shape(s) - return tf.random.uniform(shape, seed=inner_seed, dtype=dtype) - - if (not isinstance(self._input_keep_prob, numbers.Real) or - self._input_keep_prob < 1.0): - if input_size is None: - raise ValueError( - "When variational_recurrent=True and input_keep_prob < 1.0 or " - "is unknown, input_size must be provided") - self._recurrent_input_noise = _enumerated_map_structure_up_to( - input_size, - lambda i, s: batch_noise(s, inner_seed=self._gen_seed("input", i)), - input_size) - self._recurrent_state_noise = _enumerated_map_structure_up_to( - cell.state_size, - lambda i, s: batch_noise(s, inner_seed=self._gen_seed("state", i)), - cell.state_size) - self._recurrent_output_noise = _enumerated_map_structure_up_to( - cell.output_size, - lambda i, s: batch_noise(s, inner_seed=self._gen_seed("output", i)), - cell.output_size) - - def _gen_seed(self, salt_prefix, index): - if self._seed is None: - return None - salt = "%s_%d" % (salt_prefix, index) - string = (str(self._seed) + salt).encode("utf-8") - return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF - - @property - def wrapped_cell(self): - return self.cell - - def build(self, inputs_shape): - self.cell.build(inputs_shape) - self.built = True - - def _variational_recurrent_dropout_value( - self, unused_index, value, noise, keep_prob): - """Performs dropout given the pre-calculated noise tensor.""" - # uniform [keep_prob, 1.0 + keep_prob) - random_tensor = keep_prob + noise - - # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) - binary_tensor = tf.floor(random_tensor) - ret = tf.divide(value, keep_prob) * binary_tensor - ret.set_shape(value.get_shape()) - return ret - - def _dropout(self, - values, - salt_prefix, - recurrent_noise, - keep_prob, - shallow_filtered_substructure=None): - """Decides whether to perform standard dropout or recurrent dropout.""" - - if shallow_filtered_substructure is None: - # Put something so we traverse the entire structure; inside the - # dropout function we check to see if leafs of this are bool or not. - shallow_filtered_substructure = values - - if not self._variational_recurrent: - - def dropout(i, do_dropout, v): - if not isinstance(do_dropout, bool) or do_dropout: - return tf.nn.dropout( - v, rate=1. 
- keep_prob, seed=self._gen_seed(salt_prefix, i)) - else: - return v - - return _enumerated_map_structure_up_to( - shallow_filtered_substructure, dropout, - *[shallow_filtered_substructure, values]) - else: - - def dropout(i, do_dropout, v, n): - if not isinstance(do_dropout, bool) or do_dropout: - return self._variational_recurrent_dropout_value(i, v, n, keep_prob) - else: - return v - - return _enumerated_map_structure_up_to( - shallow_filtered_substructure, dropout, - *[shallow_filtered_substructure, values, recurrent_noise]) - - def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): - """Runs the wrapped cell and applies dropout. - Args: - inputs: A tensor with wrapped cell's input. - state: A tensor or tuple of tensors with wrapped cell's state. - cell_call_fn: Wrapped cell's method to use for step computation (cell's - `__call__` or 'call' method). - **kwargs: Additional arguments. - - Returns: - A pair containing: - - - Output: A tensor with cell's output. - - New state: A tensor or tuple of tensors with new wrapped cell's state. - """ - - def _should_dropout(p): - return (not isinstance(p, float)) or p < 1 - - if _should_dropout(self._input_keep_prob): - inputs = self._dropout(inputs, "input", self._recurrent_input_noise, - self._input_keep_prob) - output, new_state = cell_call_fn(inputs, state, **kwargs) - if _should_dropout(self._state_keep_prob): - # Identify which subsets of the state to perform dropout on and - # which ones to keep. - shallow_filtered_substructure = tf.__internal__.nest.get_traverse_shallow_structure( - self._dropout_state_filter, new_state) - new_state = self._dropout(new_state, "state", self._recurrent_state_noise, - self._state_keep_prob, - shallow_filtered_substructure) - if _should_dropout(self._output_keep_prob): - output = self._dropout(output, "output", self._recurrent_output_noise, - self._output_keep_prob) - return output, new_state - - def get_config(self): - """Returns the config of the dropout wrapper.""" - config = { - "input_keep_prob": self._input_keep_prob, - "output_keep_prob": self._output_keep_prob, - "state_keep_prob": self._state_keep_prob, - "variational_recurrent": self._variational_recurrent, - "input_size": self._input_size, - "seed": self._seed, - } - if self._dropout_state_filter != _default_dropout_state_filter_visitor: # pylint: disable=comparison-with-callable - function, function_type, function_module = _serialize_function_to_config( - self._dropout_state_filter) - config.update({"dropout_fn": function, - "dropout_fn_type": function_type, - "dropout_fn_module": function_module}) - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - if "dropout_fn" in config: - config = config.copy() - dropout_state_filter = _parse_config_to_function( - config, custom_objects, "dropout_fn", "dropout_fn_type", - "dropout_fn_module") - config.pop("dropout_fn") - config["dropout_state_filter_visitor"] = dropout_state_filter - return super(DropoutWrapper, cls).from_config( - config, custom_objects=custom_objects) + def dropout(i, do_dropout, v, n): + if not isinstance(do_dropout, bool) or do_dropout: + return self._variational_recurrent_dropout_value( + i, v, n, keep_prob + ) + else: + return v + + return _enumerated_map_structure_up_to( + shallow_filtered_substructure, + dropout, + *[shallow_filtered_substructure, values, recurrent_noise], + ) + + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): + 
"""Runs the wrapped cell and applies dropout. + + Args: + inputs: A tensor with wrapped cell's input. + state: A tensor or tuple of tensors with wrapped cell's state. + cell_call_fn: Wrapped cell's method to use for step computation + (cell's `__call__` or 'call' method). + **kwargs: Additional arguments. + + Returns: + A pair containing: + + - Output: A tensor with cell's output. + - New state: A tensor or tuple of tensors with new wrapped cell's + state. + """ + + def _should_dropout(p): + return (not isinstance(p, float)) or p < 1 + + if _should_dropout(self._input_keep_prob): + inputs = self._dropout( + inputs, + "input", + self._recurrent_input_noise, + self._input_keep_prob, + ) + output, new_state = cell_call_fn(inputs, state, **kwargs) + if _should_dropout(self._state_keep_prob): + # Identify which subsets of the state to perform dropout on and + # which ones to keep. + shallow_filtered_substructure = ( + tf.__internal__.nest.get_traverse_shallow_structure( + self._dropout_state_filter, new_state + ) + ) + new_state = self._dropout( + new_state, + "state", + self._recurrent_state_noise, + self._state_keep_prob, + shallow_filtered_substructure, + ) + if _should_dropout(self._output_keep_prob): + output = self._dropout( + output, + "output", + self._recurrent_output_noise, + self._output_keep_prob, + ) + return output, new_state + + def get_config(self): + """Returns the config of the dropout wrapper.""" + config = { + "input_keep_prob": self._input_keep_prob, + "output_keep_prob": self._output_keep_prob, + "state_keep_prob": self._state_keep_prob, + "variational_recurrent": self._variational_recurrent, + "input_size": self._input_size, + "seed": self._seed, + } + if self._dropout_state_filter != _default_dropout_state_filter_visitor: + ( + function, + function_type, + function_module, + ) = _serialize_function_to_config(self._dropout_state_filter) + config.update( + { + "dropout_fn": function, + "dropout_fn_type": function_type, + "dropout_fn_module": function_module, + } + ) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + if "dropout_fn" in config: + config = config.copy() + dropout_state_filter = _parse_config_to_function( + config, + custom_objects, + "dropout_fn", + "dropout_fn_type", + "dropout_fn_module", + ) + config.pop("dropout_fn") + config["dropout_state_filter_visitor"] = dropout_state_filter + return super(DropoutWrapper, cls).from_config( + config, custom_objects=custom_objects + ) @keras_export(v1=["keras.__internal__.legacy.rnn_cell.ResidualWrapper"]) @tf_export(v1=["nn.rnn_cell.ResidualWrapper"]) class ResidualWrapper(_RNNCellWrapperV1): - """RNNCell wrapper that ensures cell inputs are added to the outputs.""" - - def __init__(self, cell, residual_fn=None, **kwargs): - """Constructs a `ResidualWrapper` for `cell`. - - Args: - cell: An instance of `RNNCell`. - residual_fn: (Optional) The function to map raw cell inputs and raw cell - outputs to the actual cell outputs of the residual network. - Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs - and outputs. - **kwargs: dict of keyword arguments for base layer. - """ - super().__init__(cell, **kwargs) - self._residual_fn = residual_fn - - def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): - """Run the cell and then apply the residual_fn on its inputs to its outputs. - - Args: - inputs: cell inputs. - state: cell state. 
- cell_call_fn: Wrapped cell's method to use for step computation (cell's - `__call__` or 'call' method). - **kwargs: Additional arguments passed to the wrapped cell's `call`. - - Returns: - Tuple of cell outputs and new state. - - Raises: - TypeError: If cell inputs and outputs have different structure (type). - ValueError: If cell inputs and outputs have different structure (value). - """ - outputs, new_state = cell_call_fn(inputs, state, **kwargs) - - # Ensure shapes match - def assert_shape_match(inp, out): - inp.get_shape().assert_is_compatible_with(out.get_shape()) - - def default_residual_fn(inputs, outputs): - tf.nest.assert_same_structure(inputs, outputs) - tf.nest.map_structure(assert_shape_match, inputs, outputs) - return tf.nest.map_structure(lambda inp, out: inp + out, inputs, outputs) - - res_outputs = (self._residual_fn or default_residual_fn)(inputs, outputs) - return (res_outputs, new_state) - - def get_config(self): - """Returns the config of the residual wrapper.""" - if self._residual_fn is not None: - function, function_type, function_module = _serialize_function_to_config( - self._residual_fn) - config = { - "residual_fn": function, - "residual_fn_type": function_type, - "residual_fn_module": function_module - } - else: - config = {} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - if "residual_fn" in config: - config = config.copy() - residual_function = _parse_config_to_function(config, custom_objects, - "residual_fn", - "residual_fn_type", - "residual_fn_module") - config["residual_fn"] = residual_function - return super(ResidualWrapper, cls).from_config( - config, custom_objects=custom_objects) + """RNNCell wrapper that ensures cell inputs are added to the outputs.""" + + def __init__(self, cell, residual_fn=None, **kwargs): + """Constructs a `ResidualWrapper` for `cell`. + + Args: + cell: An instance of `RNNCell`. + residual_fn: (Optional) The function to map raw cell inputs and raw + cell outputs to the actual cell outputs of the residual network. + Defaults to calling nest.map_structure on (lambda i, o: i + o), + inputs and outputs. + **kwargs: dict of keyword arguments for base layer. + """ + super().__init__(cell, **kwargs) + self._residual_fn = residual_fn + + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): + """Run the cell and apply the residual_fn. + + Args: + inputs: cell inputs. + state: cell state. + cell_call_fn: Wrapped cell's method to use for step computation + (cell's `__call__` or 'call' method). + **kwargs: Additional arguments passed to the wrapped cell's `call`. + + Returns: + Tuple of cell outputs and new state. + + Raises: + TypeError: If cell inputs and outputs have different structure (type). + ValueError: If cell inputs and outputs have different structure + (value). 
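The default residual combination used when `residual_fn` is None, reduced to plain Python lists (a sketch of the behavior, not the actual tensor implementation):

def default_residual(inputs, outputs):
    # Mirrors nest.map_structure(lambda i, o: i + o, ...): shapes must match.
    assert len(inputs) == len(outputs)
    return [i + o for i, o in zip(inputs, outputs)]

assert default_residual([1.0, 2.0], [0.5, -0.5]) == [1.5, 1.5]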
+ """ + outputs, new_state = cell_call_fn(inputs, state, **kwargs) + + # Ensure shapes match + def assert_shape_match(inp, out): + inp.get_shape().assert_is_compatible_with(out.get_shape()) + + def default_residual_fn(inputs, outputs): + tf.nest.assert_same_structure(inputs, outputs) + tf.nest.map_structure(assert_shape_match, inputs, outputs) + return tf.nest.map_structure( + lambda inp, out: inp + out, inputs, outputs + ) + + res_outputs = (self._residual_fn or default_residual_fn)( + inputs, outputs + ) + return (res_outputs, new_state) + + def get_config(self): + """Returns the config of the residual wrapper.""" + if self._residual_fn is not None: + ( + function, + function_type, + function_module, + ) = _serialize_function_to_config(self._residual_fn) + config = { + "residual_fn": function, + "residual_fn_type": function_type, + "residual_fn_module": function_module, + } + else: + config = {} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + if "residual_fn" in config: + config = config.copy() + residual_function = _parse_config_to_function( + config, + custom_objects, + "residual_fn", + "residual_fn_type", + "residual_fn_module", + ) + config["residual_fn"] = residual_function + return super(ResidualWrapper, cls).from_config( + config, custom_objects=custom_objects + ) @keras_export(v1=["keras.__internal__.legacy.rnn_cell.DeviceWrapper"]) @tf_export(v1=["nn.rnn_cell.DeviceWrapper"]) class DeviceWrapper(_RNNCellWrapperV1): - """Operator that ensures an RNNCell runs on a particular device.""" + """Operator that ensures an RNNCell runs on a particular device.""" - def __init__(self, cell, device, **kwargs): - """Construct a `DeviceWrapper` for `cell` with device `device`. + def __init__(self, cell, device, **kwargs): + """Construct a `DeviceWrapper` for `cell` with device `device`. - Ensures the wrapped `cell` is called with `tf.device(device)`. + Ensures the wrapped `cell` is called with `tf.device(device)`. - Args: - cell: An instance of `RNNCell`. - device: A device string or function, for passing to `tf.device`. - **kwargs: dict of keyword arguments for base layer. - """ - super().__init__(cell, **kwargs) - self._device = device + Args: + cell: An instance of `RNNCell`. + device: A device string or function, for passing to `tf.device`. + **kwargs: dict of keyword arguments for base layer. 
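`DeviceWrapper`'s entire job is the `tf.device` pin around the wrapped cell's step and `zero_state`. The effect, assuming a TensorFlow 2 install running eagerly with a CPU device available:

import tensorflow.compat.v2 as tf

with tf.device("/CPU:0"):
    state = tf.zeros([8, 4])

# Eagerly-created tensors record where they were placed.
assert "CPU" in state.device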
+ """ + super().__init__(cell, **kwargs) + self._device = device - def zero_state(self, batch_size, dtype): - with tf.name_scope(type(self).__name__ + "ZeroState"): - with tf.compat.v1.device(self._device): - return self.cell.zero_state(batch_size, dtype) + def zero_state(self, batch_size, dtype): + with tf.name_scope(type(self).__name__ + "ZeroState"): + with tf.compat.v1.device(self._device): + return self.cell.zero_state(batch_size, dtype) - def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): - """Run the cell on specified device.""" - with tf.compat.v1.device(self._device): - return cell_call_fn(inputs, state, **kwargs) + def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs): + """Run the cell on specified device.""" + with tf.compat.v1.device(self._device): + return cell_call_fn(inputs, state, **kwargs) - def get_config(self): - config = {"device": self._device} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = {"device": self._device} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) def _default_dropout_state_filter_visitor(substate): - from keras.layers.rnn.legacy_cells import LSTMStateTuple # pylint: disable=g-import-not-at-top - if isinstance(substate, LSTMStateTuple): - # Do not perform dropout on the memory state. - return LSTMStateTuple(c=False, h=True) - elif isinstance(substate, tf.TensorArray): - return False - return True + from keras.layers.rnn.legacy_cells import ( + LSTMStateTuple, + ) + + if isinstance(substate, LSTMStateTuple): + # Do not perform dropout on the memory state. + return LSTMStateTuple(c=False, h=True) + elif isinstance(substate, tf.TensorArray): + return False + return True diff --git a/keras/layers/rnn/legacy_cell_wrappers_test.py b/keras/layers/rnn/legacy_cell_wrappers_test.py index 8e04fad275fe..f9bf3040e70b 100644 --- a/keras/layers/rnn/legacy_cell_wrappers_test.py +++ b/keras/layers/rnn/legacy_cell_wrappers_test.py @@ -14,24 +14,27 @@ # ============================================================================== """Tests for RNN cell wrappers v1 implementation.""" +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras.layers.rnn import legacy_cell_wrappers from keras.layers.rnn import legacy_cells from keras.testing_infra import test_combinations -import tensorflow.compat.v2 as tf @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class RNNCellWrapperV1Test(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters([ - legacy_cell_wrappers.DropoutWrapper, legacy_cell_wrappers.ResidualWrapper - ]) - def testWrapperKerasStyle(self, wrapper): - """Tests if wrapper cell is instantiated in keras style scope.""" - wrapped_cell = wrapper(legacy_cells.BasicRNNCell(1)) - self.assertFalse(wrapped_cell._keras_style) + @parameterized.parameters( + [ + legacy_cell_wrappers.DropoutWrapper, + legacy_cell_wrappers.ResidualWrapper, + ] + ) + def testWrapperKerasStyle(self, wrapper): + """Tests if wrapper cell is instantiated in keras style scope.""" + wrapped_cell = wrapper(legacy_cells.BasicRNNCell(1)) + self.assertFalse(wrapped_cell._keras_style) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/layers/rnn/legacy_cells.py b/keras/layers/rnn/legacy_cells.py index 05c601c460d6..ca2431cb67a9 100644 --- a/keras/layers/rnn/legacy_cells.py +++ b/keras/layers/rnn/legacy_cells.py @@ -20,7 +20,7 @@ 
Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by calling the `rnn` ops several times. """ -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from __future__ import absolute_import from __future__ import division @@ -29,6 +29,8 @@ import collections import warnings +import tensorflow.compat.v2 as tf + from keras import activations from keras import backend from keras import initializers @@ -36,555 +38,605 @@ from keras.engine import input_spec from keras.legacy_tf_layers import base as base_layer from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export from tensorflow.python.util.tf_export import tf_export - _BIAS_VARIABLE_NAME = "bias" _WEIGHTS_VARIABLE_NAME = "kernel" def _hasattr(obj, attr_name): - try: - getattr(obj, attr_name) - except AttributeError: - return False - else: - return True + try: + getattr(obj, attr_name) + except AttributeError: + return False + else: + return True def _concat(prefix, suffix, static=False): - """Concat that enables int, Tensor, or TensorShape values. - - This function takes a size specification, which can be an integer, a - TensorShape, or a Tensor, and converts it into a concatenated Tensor - (if static = False) or a list of integers (if static = True). - - Args: - prefix: The prefix; usually the batch size (and/or time step size). - (TensorShape, int, or Tensor.) - suffix: TensorShape, int, or Tensor. - static: If `True`, return a python list with possibly unknown dimensions. - Otherwise return a `Tensor`. - - Returns: - shape: the concatenation of prefix and suffix. - - Raises: - ValueError: if `suffix` is not a scalar or vector (or TensorShape). - ValueError: if prefix or suffix was `None` and asked for dynamic - Tensors out. - """ - if isinstance(prefix, tf.Tensor): - p = prefix - p_static = tf.get_static_value(prefix) - if p.shape.ndims == 0: - p = tf.compat.v1.expand_dims(p, 0) - elif p.shape.ndims != 1: - raise ValueError( - "Prefix tensor must be either a scalar or vector, " - f"but received tensor: {p}") - else: - p = tf.TensorShape(prefix) - p_static = p.as_list() if p.ndims is not None else None - p = ( - tf.constant(p.as_list(), dtype=tf.int32) - if p.is_fully_defined() else None) - if isinstance(suffix, tf.Tensor): - s = suffix - s_static = tf.get_static_value(suffix) - if s.shape.ndims == 0: - s = tf.compat.v1.expand_dims(s, 0) - elif s.shape.ndims != 1: - raise ValueError("suffix tensor must be either a scalar or vector, " - f"but received tensor: {s}") - else: - s = tf.TensorShape(suffix) - s_static = s.as_list() if s.ndims is not None else None - s = ( - tf.constant(s.as_list(), dtype=tf.int32) - if s.is_fully_defined() else None) - - if static: - shape = tf.TensorShape(p_static).concatenate(s_static) - shape = shape.as_list() if shape.ndims is not None else None - else: - if p is None or s is None: - raise ValueError( - "Prefix or suffix can't be None. " - f"Received prefix = {prefix} and suffix = {suffix}") - shape = tf.concat((p, s), 0) - return shape + """Concat that enables int, Tensor, or TensorShape values. + + This function takes a size specification, which can be an integer, a + TensorShape, or a Tensor, and converts it into a concatenated Tensor + (if static = False) or a list of integers (if static = True). + + Args: + prefix: The prefix; usually the batch size (and/or time step size). + (TensorShape, int, or Tensor.) 
+ suffix: TensorShape, int, or Tensor. + static: If `True`, return a python list with possibly unknown dimensions. + Otherwise return a `Tensor`. + + Returns: + shape: the concatenation of prefix and suffix. + + Raises: + ValueError: if `suffix` is not a scalar or vector (or TensorShape). + ValueError: if prefix or suffix was `None` and asked for dynamic + Tensors out. + """ + if isinstance(prefix, tf.Tensor): + p = prefix + p_static = tf.get_static_value(prefix) + if p.shape.ndims == 0: + p = tf.compat.v1.expand_dims(p, 0) + elif p.shape.ndims != 1: + raise ValueError( + "Prefix tensor must be either a scalar or vector, " + f"but received tensor: {p}" + ) + else: + p = tf.TensorShape(prefix) + p_static = p.as_list() if p.ndims is not None else None + p = ( + tf.constant(p.as_list(), dtype=tf.int32) + if p.is_fully_defined() + else None + ) + if isinstance(suffix, tf.Tensor): + s = suffix + s_static = tf.get_static_value(suffix) + if s.shape.ndims == 0: + s = tf.compat.v1.expand_dims(s, 0) + elif s.shape.ndims != 1: + raise ValueError( + "suffix tensor must be either a scalar or vector, " + f"but received tensor: {s}" + ) + else: + s = tf.TensorShape(suffix) + s_static = s.as_list() if s.ndims is not None else None + s = ( + tf.constant(s.as_list(), dtype=tf.int32) + if s.is_fully_defined() + else None + ) + + if static: + shape = tf.TensorShape(p_static).concatenate(s_static) + shape = shape.as_list() if shape.ndims is not None else None + else: + if p is None or s is None: + raise ValueError( + "Prefix or suffix can't be None. " + f"Received prefix = {prefix} and suffix = {suffix}" + ) + shape = tf.concat((p, s), 0) + return shape def _zero_state_tensors(state_size, batch_size, dtype): - """Create tensors of zeros based on state_size, batch_size, and dtype.""" + """Create tensors of zeros based on state_size, batch_size, and dtype.""" - def get_state_shape(s): - """Combine s with batch_size to get a proper tensor shape.""" - c = _concat(batch_size, s) - size = tf.zeros(c, dtype=dtype) - if not tf.executing_eagerly(): - c_static = _concat(batch_size, s, static=True) - size.set_shape(c_static) - return size + def get_state_shape(s): + """Combine s with batch_size to get a proper tensor shape.""" + c = _concat(batch_size, s) + size = tf.zeros(c, dtype=dtype) + if not tf.executing_eagerly(): + c_static = _concat(batch_size, s, static=True) + size.set_shape(c_static) + return size - return tf.nest.map_structure(get_state_shape, state_size) + return tf.nest.map_structure(get_state_shape, state_size) @keras_export(v1=["keras.__internal__.legacy.rnn_cell.RNNCell"]) @tf_export(v1=["nn.rnn_cell.RNNCell"]) class RNNCell(base_layer.Layer): - """Abstract object representing an RNN cell. - - Every `RNNCell` must have the properties below and implement `call` with - the signature `(output, next_state) = call(input, state)`. The optional - third input argument, `scope`, is allowed for backwards compatibility - purposes; but should be left off for new subclasses. - - This definition of cell differs from the definition used in the literature. - In the literature, 'cell' refers to an object with a single scalar output. - This definition refers to a horizontal array of such units. - - An RNN cell, in the most abstract setting, is anything that has - a state and performs some operation that takes a matrix of inputs. - This operation results in an output matrix with `self.output_size` columns. 
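`_concat`, reformatted above, merges a batch prefix with a state suffix, either statically (a Python list) or dynamically (a `tf.concat` of int32 vectors). The static path boils down to a `TensorShape` concatenation, assuming TensorFlow is available:

import tensorflow.compat.v2 as tf

prefix = tf.TensorShape([32])      # batch dimension
suffix = tf.TensorShape([8])       # per-sample state size
shape = prefix.concatenate(suffix)
assert shape.as_list() == [32, 8]  # what zero_state allocates per state entry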
- If `self.state_size` is an integer, this operation also results in a new - state matrix with `self.state_size` columns. If `self.state_size` is a - (possibly nested tuple of) TensorShape object(s), then it should return a - matching structure of Tensors having shape `[batch_size].concatenate(s)` - for each `s` in `self.batch_size`. - """ - - def __init__(self, trainable=True, name=None, dtype=None, **kwargs): - super().__init__( - trainable=trainable, name=name, dtype=dtype, **kwargs) - # Attribute that indicates whether the cell is a TF RNN cell, due the slight - # difference between TF and Keras RNN cell. Notably the state is not wrapped - # in a list for TF cell where they are single tensor state, whereas keras - # cell will wrap the state into a list, and call() will have to unwrap them. - self._is_tf_rnn_cell = True - - def __call__(self, inputs, state, scope=None): - """Run this RNN cell on inputs, starting from the given state. + """Abstract object representing an RNN cell. + + Every `RNNCell` must have the properties below and implement `call` with + the signature `(output, next_state) = call(input, state)`. The optional + third input argument, `scope`, is allowed for backwards compatibility + purposes; but should be left off for new subclasses. + + This definition of cell differs from the definition used in the literature. + In the literature, 'cell' refers to an object with a single scalar output. + This definition refers to a horizontal array of such units. + + An RNN cell, in the most abstract setting, is anything that has + a state and performs some operation that takes a matrix of inputs. + This operation results in an output matrix with `self.output_size` columns. + If `self.state_size` is an integer, this operation also results in a new + state matrix with `self.state_size` columns. If `self.state_size` is a + (possibly nested tuple of) TensorShape object(s), then it should return a + matching structure of Tensors having shape `[batch_size].concatenate(s)` + for each `s` in `self.batch_size`. + """ - Args: - inputs: `2-D` tensor with shape `[batch_size, input_size]`. - state: if `self.state_size` is an integer, this should be a `2-D Tensor` - with shape `[batch_size, self.state_size]`. Otherwise, if - `self.state_size` is a tuple of integers, this should be a tuple with - shapes `[batch_size, s] for s in self.state_size`. - scope: VariableScope for the created subgraph; defaults to class name. + def __init__(self, trainable=True, name=None, dtype=None, **kwargs): + super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) + # Attribute that indicates whether the cell is a TF RNN cell, due the + # slight difference between TF and Keras RNN cell. Notably the state is + # not wrapped in a list for TF cell where they are single tensor state, + # whereas keras cell will wrap the state into a list, and call() will + # have to unwrap them. + self._is_tf_rnn_cell = True + + def __call__(self, inputs, state, scope=None): + """Run this RNN cell on inputs, starting from the given state. + + Args: + inputs: `2-D` tensor with shape `[batch_size, input_size]`. + state: if `self.state_size` is an integer, this should be a + `2-D Tensor` with shape `[batch_size, self.state_size]`. Otherwise, + if `self.state_size` is a tuple of integers, this should be a tuple + with shapes `[batch_size, s] for s in self.state_size`. + scope: VariableScope for the created subgraph; None uses class name. + Defaults to `None`. 
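The contract documented above: every cell maps `(input, state)` to `(output, next_state)`. A toy cell honoring it (illustrative only, not a class from the diff):

class AccumulatorCell:
    state_size = 1
    output_size = 1

    def __call__(self, inputs, state):
        next_state = [s + x for s, x in zip(state, inputs)]
        return next_state, next_state  # output is the running sum

output, state = AccumulatorCell()([2.0], [1.0])
assert output == [3.0] and state == [3.0]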
+ + Returns: + A pair containing: + + - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. + - New state: Either a single `2-D` tensor, or a tuple of tensors + matching the arity and shapes of `state`. + """ + if scope is not None: + with tf.compat.v1.variable_scope( + scope, custom_getter=self._rnn_get_variable + ) as scope: + return super().__call__(inputs, state, scope=scope) + else: + scope_attrname = "rnncell_scope" + scope = getattr(self, scope_attrname, None) + if scope is None: + scope = tf.compat.v1.variable_scope( + tf.compat.v1.get_variable_scope(), + custom_getter=self._rnn_get_variable, + ) + setattr(self, scope_attrname, scope) + with scope: + return super().__call__(inputs, state) + + def _rnn_get_variable(self, getter, *args, **kwargs): + variable = getter(*args, **kwargs) + if tf.compat.v1.executing_eagerly_outside_functions(): + trainable = variable.trainable + else: + trainable = variable in tf.compat.v1.trainable_variables() or ( + base_layer_utils.is_split_variable(variable) + and list(variable)[0] in tf.compat.v1.trainable_variables() + ) + if trainable and all( + variable is not v for v in self._trainable_weights + ): + self._trainable_weights.append(variable) + elif not trainable and all( + variable is not v for v in self._non_trainable_weights + ): + self._non_trainable_weights.append(variable) + return variable + + @property + def state_size(self): + """size(s) of state(s) used by this cell. + + It can be represented by an Integer, a TensorShape or a tuple of + Integers or TensorShapes. + """ + raise NotImplementedError("Abstract method") + + @property + def output_size(self): + """Integer or TensorShape: size of outputs produced by this cell.""" + raise NotImplementedError("Abstract method") + + def build(self, _): + # This tells the parent Layer object that it's OK to call + # self.add_weight() inside the call() method. + pass + + def get_initial_state(self, inputs=None, batch_size=None, dtype=None): + if inputs is not None: + # Validate the given batch_size and dtype against inputs if + # provided. + inputs = tf.convert_to_tensor(inputs, name="inputs") + if batch_size is not None: + if tf.is_tensor(batch_size): + static_batch_size = tf.get_static_value( + batch_size, partial=True + ) + else: + static_batch_size = batch_size + if inputs.shape.dims[0].value != static_batch_size: + raise ValueError( + "batch size from input tensor is different from the " + "input param. Input tensor batch: " + f"{inputs.shape.dims[0].value}, " + f"batch_size: {batch_size}" + ) + + if dtype is not None and inputs.dtype != dtype: + raise ValueError( + "dtype from input tensor is different from the " + f"input param. Input tensor dtype: {inputs.dtype}, " + f"dtype: {dtype}" + ) + + batch_size = ( + inputs.shape.dims[0].value or tf.compat.v1.shape(inputs)[0] + ) + dtype = inputs.dtype + if batch_size is None or dtype is None: + raise ValueError( + "batch_size and dtype cannot be None while constructing " + f"initial state: batch_size={batch_size}, dtype={dtype}" + ) + return self.zero_state(batch_size, dtype) + + def zero_state(self, batch_size, dtype): + """Return zero-filled state tensor(s). + + Args: + batch_size: int, float, or unit Tensor representing the batch size. + dtype: the data type to use for the state. + + Returns: + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. 
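For instance (a sketch assuming the compat-v1 endpoint): with an integer `state_size`, `zero_state` returns a single zero tensor, and `get_initial_state` infers `batch_size` and `dtype` from `inputs` before delegating to it:

```python
import tensorflow as tf

cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(64)        # state_size == 64
state = cell.zero_state(batch_size=8, dtype=tf.float32)
print(state.shape)                                      # (8, 64)

# Equivalent, inferring batch size and dtype from an input batch:
state = cell.get_initial_state(inputs=tf.zeros([8, 32]))
```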
+ + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with + the shapes `[batch_size, s]` for each s in `state_size`. + """ + # Try to use the last cached zero_state. This is done to avoid + # recreating zeros, especially when eager execution is enabled. + state_size = self.state_size + is_eager = tf.executing_eagerly() + if is_eager and _hasattr(self, "_last_zero_state"): + ( + last_state_size, + last_batch_size, + last_dtype, + last_output, + ) = getattr(self, "_last_zero_state") + if ( + last_batch_size == batch_size + and last_dtype == dtype + and last_state_size == state_size + ): + return last_output + with backend.name_scope(type(self).__name__ + "ZeroState"): + output = _zero_state_tensors(state_size, batch_size, dtype) + if is_eager: + self._last_zero_state = (state_size, batch_size, dtype, output) + return output + + def get_config(self): + return super().get_config() + + @property + def _use_input_spec_as_call_signature(self): + # We do not store the shape information for the state argument in the + # call function for legacy RNN cells, so do not generate an input + # signature. + return False - Returns: - A pair containing: - - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. - - New state: Either a single `2-D` tensor, or a tuple of tensors matching - the arity and shapes of `state`. - """ - if scope is not None: - with tf.compat.v1.variable_scope( - scope, custom_getter=self._rnn_get_variable) as scope: - return super().__call__(inputs, state, scope=scope) - else: - scope_attrname = "rnncell_scope" - scope = getattr(self, scope_attrname, None) - if scope is None: - scope = tf.compat.v1.variable_scope( - tf.compat.v1.get_variable_scope(), - custom_getter=self._rnn_get_variable) - setattr(self, scope_attrname, scope) - with scope: - return super().__call__(inputs, state) - - def _rnn_get_variable(self, getter, *args, **kwargs): - variable = getter(*args, **kwargs) - if tf.compat.v1.executing_eagerly_outside_functions(): - trainable = variable.trainable - else: - trainable = ( - variable in tf.compat.v1.trainable_variables() or - (base_layer_utils.is_split_variable(variable) and - list(variable)[0] in tf.compat.v1.trainable_variables())) - if trainable and all(variable is not v for v in self._trainable_weights): - self._trainable_weights.append(variable) - elif not trainable and all( - variable is not v for v in self._non_trainable_weights): - self._non_trainable_weights.append(variable) - return variable - - @property - def state_size(self): - """size(s) of state(s) used by this cell. - - It can be represented by an Integer, a TensorShape or a tuple of Integers - or TensorShapes. +class LayerRNNCell(RNNCell): + """Subclass of RNNCells that act like proper `tf.Layer` objects. + + For backwards compatibility purposes, most `RNNCell` instances allow their + `call` methods to instantiate variables via `tf.compat.v1.get_variable`. + The underlying variable scope thus keeps track of any variables, and + returning cached versions. This is atypical of `tf.layer` objects, which + separate this part of layer building into a `build` method that is only + called once. + + Here we provide a subclass for `RNNCell` objects that act exactly as + `Layer` objects do. They must provide a `build` method and their + `call` methods do not access Variables `tf.compat.v1.get_variable`. 
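A hypothetical toy subclass illustrating the contract described above; a concrete cell only needs `state_size`, `output_size`, and `call` (this `AccumulatorCell` is illustrative, not part of the module):

```python
import tensorflow as tf

class AccumulatorCell(tf.compat.v1.nn.rnn_cell.RNNCell):
    """Toy cell whose output and next state are the running input sum."""

    def __init__(self, num_units):
        super().__init__()
        self._num_units = num_units

    @property
    def state_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    def call(self, inputs, state):
        new_state = state + inputs   # both [batch_size, num_units]
        return new_state, new_state
```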
""" - raise NotImplementedError("Abstract method") - - @property - def output_size(self): - """Integer or TensorShape: size of outputs produced by this cell.""" - raise NotImplementedError("Abstract method") - - def build(self, _): - # This tells the parent Layer object that it's OK to call - # self.add_weight() inside the call() method. - pass - - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - if inputs is not None: - # Validate the given batch_size and dtype against inputs if provided. - inputs = tf.convert_to_tensor(inputs, name="inputs") - if batch_size is not None: - if tf.is_tensor(batch_size): - static_batch_size = tf.get_static_value( - batch_size, partial=True) - else: - static_batch_size = batch_size - if inputs.shape.dims[0].value != static_batch_size: - raise ValueError( - "batch size from input tensor is different from the " - f"input param. Input tensor batch: {inputs.shape.dims[0].value}, " - f"batch_size: {batch_size}") - - if dtype is not None and inputs.dtype != dtype: - raise ValueError( - "dtype from input tensor is different from the " - f"input param. Input tensor dtype: {inputs.dtype}, dtype: {dtype}") - batch_size = inputs.shape.dims[0].value or tf.compat.v1.shape(inputs)[0] - dtype = inputs.dtype - if batch_size is None or dtype is None: - raise ValueError( - "batch_size and dtype cannot be None while constructing initial " - f"state: batch_size={batch_size}, dtype={dtype}") - return self.zero_state(batch_size, dtype) + def __call__(self, inputs, state, scope=None, *args, **kwargs): + """Run this RNN cell on inputs, starting from the given state. + + Args: + inputs: `2-D` tensor with shape `[batch_size, input_size]`. + state: if `self.state_size` is an integer, this should be a `2-D + Tensor` with shape `[batch_size, self.state_size]`. Otherwise, if + `self.state_size` is a tuple of integers, this should be a tuple + with shapes `[batch_size, s] for s in self.state_size`. + scope: optional cell scope. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + + Returns: + A pair containing: + + - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. + - New state: Either a single `2-D` tensor, or a tuple of tensors + matching the arity and shapes of `state`. + """ + # Bypass RNNCell's variable capturing semantics for LayerRNNCell. + # Instead, it is up to subclasses to provide a proper build + # method. See the class docstring for more details. + return base_layer.Layer.__call__( + self, inputs, state, scope=scope, *args, **kwargs + ) - def zero_state(self, batch_size, dtype): - """Return zero-filled state tensor(s). - Args: - batch_size: int, float, or unit Tensor representing the batch size. - dtype: the data type to use for the state. +@keras_export(v1=["keras.__internal__.legacy.rnn_cell.BasicRNNCell"]) +@tf_export(v1=["nn.rnn_cell.BasicRNNCell"]) +class BasicRNNCell(LayerRNNCell): + """The most basic RNN cell. - Returns: - If `state_size` is an int or TensorShape, then the return value is a - `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. + Note that this cell is not optimized for performance. - If `state_size` is a nested list or tuple, then the return value is - a nested list or tuple (of the same structure) of `2-D` tensors with - the shapes `[batch_size, s]` for each s in `state_size`. + Args: + num_units: int, The number of units in the RNN cell. + activation: Nonlinearity to use. Default: `tanh`. 
It could also be string + that is within Keras activation function names. + reuse: (optional) Python boolean describing whether to reuse variables in + an existing scope. If not `True`, and the existing scope already has the + given variables, an error is raised. + name: String, the name of the layer. Layers with the same name will share + weights, but to avoid mistakes we require reuse=True in such cases. + dtype: Default dtype of the layer (default of `None` means use the type of + the first input). Required when `build` is called before `call`. + **kwargs: Dict, keyword named properties for common layer attributes, like + `trainable` etc when constructing the cell from configs of get_config(). """ - # Try to use the last cached zero_state. This is done to avoid recreating - # zeros, especially when eager execution is enabled. - state_size = self.state_size - is_eager = tf.executing_eagerly() - if is_eager and _hasattr(self, "_last_zero_state"): - (last_state_size, last_batch_size, last_dtype, - last_output) = getattr(self, "_last_zero_state") - if (last_batch_size == batch_size and last_dtype == dtype and - last_state_size == state_size): - return last_output - with backend.name_scope(type(self).__name__ + "ZeroState"): - output = _zero_state_tensors(state_size, batch_size, dtype) - if is_eager: - self._last_zero_state = (state_size, batch_size, dtype, output) - return output - - # TODO(b/134773139): Remove when contrib RNN cells implement `get_config` - def get_config(self): # pylint: disable=useless-super-delegation - return super().get_config() - - @property - def _use_input_spec_as_call_signature(self): - # We do not store the shape information for the state argument in the call - # function for legacy RNN cells, so do not generate an input signature. - return False + def __init__( + self, + num_units, + activation=None, + reuse=None, + name=None, + dtype=None, + **kwargs, + ): + warnings.warn( + "`tf.nn.rnn_cell.BasicRNNCell` is deprecated and will be " + "removed in a future version. This class " + "is equivalent as `tf.keras.layers.SimpleRNNCell`, " + "and will be replaced by that in Tensorflow 2.0.", + stacklevel=2, + ) + super().__init__(_reuse=reuse, name=name, dtype=dtype, **kwargs) + _check_supported_dtypes(self.dtype) + if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): + logging.warning( + "%s: Note that this cell is not optimized for performance.", + self, + ) + + # Inputs must be 2-dimensional. + self.input_spec = input_spec.InputSpec(ndim=2) + + self._num_units = num_units + if activation: + self._activation = activations.get(activation) + else: + self._activation = tf.tanh -class LayerRNNCell(RNNCell): - """Subclass of RNNCells that act like proper `tf.Layer` objects. + @property + def state_size(self): + return self._num_units - For backwards compatibility purposes, most `RNNCell` instances allow their - `call` methods to instantiate variables via `tf.compat.v1.get_variable`. The - underlying - variable scope thus keeps track of any variables, and returning cached - versions. This is atypical of `tf.layer` objects, which separate this - part of layer building into a `build` method that is only called once. + @property + def output_size(self): + return self._num_units - Here we provide a subclass for `RNNCell` objects that act exactly as - `Layer` objects do. They must provide a `build` method and their - `call` methods do not access Variables `tf.compat.v1.get_variable`. 
- """ + @tf_utils.shape_type_conversion + def build(self, inputs_shape): + if inputs_shape[-1] is None: + raise ValueError( + "Expected inputs.shape[-1] to be known, " + f"received shape: {inputs_shape}" + ) + _check_supported_dtypes(self.dtype) + + input_depth = inputs_shape[-1] + self._kernel = self.add_weight( + _WEIGHTS_VARIABLE_NAME, + shape=[input_depth + self._num_units, self._num_units], + ) + self._bias = self.add_weight( + _BIAS_VARIABLE_NAME, + shape=[self._num_units], + initializer=tf.compat.v1.zeros_initializer(dtype=self.dtype), + ) + + self.built = True + + def call(self, inputs, state): + """Most basic RNN: output = new_state = act(W * input + U * state + + B).""" + _check_rnn_cell_input_dtypes([inputs, state]) + gate_inputs = tf.matmul(tf.concat([inputs, state], 1), self._kernel) + gate_inputs = tf.nn.bias_add(gate_inputs, self._bias) + output = self._activation(gate_inputs) + return output, output + + def get_config(self): + config = { + "num_units": self._num_units, + "activation": activations.serialize(self._activation), + "reuse": self._reuse, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - def __call__(self, inputs, state, scope=None, *args, **kwargs): - """Run this RNN cell on inputs, starting from the given state. - Args: - inputs: `2-D` tensor with shape `[batch_size, input_size]`. - state: if `self.state_size` is an integer, this should be a `2-D Tensor` - with shape `[batch_size, self.state_size]`. Otherwise, if - `self.state_size` is a tuple of integers, this should be a tuple with - shapes `[batch_size, s] for s in self.state_size`. - scope: optional cell scope. - *args: Additional positional arguments. - **kwargs: Additional keyword arguments. +@keras_export(v1=["keras.__internal__.legacy.rnn_cell.GRUCell"]) +@tf_export(v1=["nn.rnn_cell.GRUCell"]) +class GRUCell(LayerRNNCell): + """Gated Recurrent Unit cell. - Returns: - A pair containing: + Note that this cell is not optimized for performance. Please use + `tf.compat.v1.keras.layers.CuDNNGRU` for better performance on GPU, or + `tf.raw_ops.GRUBlockCell` for better performance on CPU. - - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. - - New state: Either a single `2-D` tensor, or a tuple of tensors matching - the arity and shapes of `state`. + Args: + num_units: int, The number of units in the GRU cell. + activation: Nonlinearity to use. Default: `tanh`. + reuse: (optional) Python boolean describing whether to reuse variables in + an existing scope. If not `True`, and the existing scope already has + the given variables, an error is raised. + kernel_initializer: (optional) The initializer to use for the weight and + projection matrices. + bias_initializer: (optional) The initializer to use for the bias. + name: String, the name of the layer. Layers with the same name will share + weights, but to avoid mistakes we require reuse=True in such cases. + dtype: Default dtype of the layer (default of `None` means use the type of + the first input). Required when `build` is called before `call`. + **kwargs: Dict, keyword named properties for common layer attributes, like + `trainable` etc when constructing the cell from configs of get_config(). 
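The one-step recurrence that `BasicRNNCell.call` above implements is `output = new_state = act(W @ [x; h] + b)`; a hypothetical NumPy sketch (all names illustrative):

```python
import numpy as np

def basic_rnn_step(x, h, kernel, bias, act=np.tanh):
    # kernel: [input_depth + num_units, num_units]; bias: [num_units]
    gate_inputs = np.concatenate([x, h], axis=1) @ kernel + bias
    output = act(gate_inputs)
    return output, output   # the output doubles as the next state
```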
+ References: Learning Phrase Representations using RNN Encoder Decoder + for Statistical Machine Translation: [Cho et al., 2014] + (https://aclanthology.coli.uni-saarland.de/papers/D14-1179/d14-1179) + ([pdf](http://emnlp2014.org/papers/pdf/EMNLP2014179.pdf)) """ - # Bypass RNNCell's variable capturing semantics for LayerRNNCell. - # Instead, it is up to subclasses to provide a proper build - # method. See the class docstring for more details. - return base_layer.Layer.__call__( - self, inputs, state, scope=scope, *args, **kwargs) + def __init__( + self, + num_units, + activation=None, + reuse=None, + kernel_initializer=None, + bias_initializer=None, + name=None, + dtype=None, + **kwargs, + ): + warnings.warn( + "`tf.nn.rnn_cell.GRUCell` is deprecated and will be removed " + "in a future version. This class " + "is equivalent as `tf.keras.layers.GRUCell`, " + "and will be replaced by that in Tensorflow 2.0.", + stacklevel=2, + ) + super().__init__(_reuse=reuse, name=name, dtype=dtype, **kwargs) + _check_supported_dtypes(self.dtype) + + if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): + logging.warning( + "%s: Note that this cell is not optimized for performance. " + "Please use tf.compat.v1.keras.layers.CuDNNGRU for better " + "performance on GPU.", + self, + ) + # Inputs must be 2-dimensional. + self.input_spec = input_spec.InputSpec(ndim=2) + + self._num_units = num_units + if activation: + self._activation = activations.get(activation) + else: + self._activation = tf.tanh + self._kernel_initializer = initializers.get(kernel_initializer) + self._bias_initializer = initializers.get(bias_initializer) -@keras_export(v1=["keras.__internal__.legacy.rnn_cell.BasicRNNCell"]) -@tf_export(v1=["nn.rnn_cell.BasicRNNCell"]) -class BasicRNNCell(LayerRNNCell): - """The most basic RNN cell. - - Note that this cell is not optimized for performance. Please use - `tf.contrib.cudnn_rnn.CudnnRNNTanh` for better performance on GPU. - - Args: - num_units: int, The number of units in the RNN cell. - activation: Nonlinearity to use. Default: `tanh`. It could also be string - that is within Keras activation function names. - reuse: (optional) Python boolean describing whether to reuse variables in an - existing scope. If not `True`, and the existing scope already has the - given variables, an error is raised. - name: String, the name of the layer. Layers with the same name will share - weights, but to avoid mistakes we require reuse=True in such cases. - dtype: Default dtype of the layer (default of `None` means use the type of - the first input). Required when `build` is called before `call`. - **kwargs: Dict, keyword named properties for common layer attributes, like - `trainable` etc when constructing the cell from configs of get_config(). - """ - - def __init__(self, - num_units, - activation=None, - reuse=None, - name=None, - dtype=None, - **kwargs): - warnings.warn( - "`tf.nn.rnn_cell.BasicRNNCell` is deprecated and will be " - "removed in a future version. This class " - "is equivalent as `tf.keras.layers.SimpleRNNCell`, " - "and will be replaced by that in Tensorflow 2.0.", - stacklevel=2) - super().__init__( - _reuse=reuse, name=name, dtype=dtype, **kwargs) - _check_supported_dtypes(self.dtype) - if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): - logging.warning( - "%s: Note that this cell is not optimized for performance. " - "Please use tf.contrib.cudnn_rnn.CudnnRNNTanh for better " - "performance on GPU.", self) - - # Inputs must be 2-dimensional. 
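A construction sketch for the GRU cell above (assuming the compat-v1 endpoint; per the deprecation warning, new code should use `tf.keras.layers.GRUCell`):

```python
import tensorflow as tf

cell = tf.compat.v1.nn.rnn_cell.GRUCell(num_units=64)
h0 = cell.zero_state(batch_size=8, dtype=tf.float32)
out, h1 = cell(tf.zeros([8, 32]), h0)  # out and h1 are the same [8, 64] tensor
```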
- self.input_spec = input_spec.InputSpec(ndim=2) - - self._num_units = num_units - if activation: - self._activation = activations.get(activation) - else: - self._activation = tf.tanh - - @property - def state_size(self): - return self._num_units - - @property - def output_size(self): - return self._num_units - - @tf_utils.shape_type_conversion - def build(self, inputs_shape): - if inputs_shape[-1] is None: - raise ValueError( - "Expected inputs.shape[-1] to be known, " - f"received shape: {inputs_shape}") - _check_supported_dtypes(self.dtype) - - input_depth = inputs_shape[-1] - self._kernel = self.add_weight( - _WEIGHTS_VARIABLE_NAME, - shape=[input_depth + self._num_units, self._num_units]) - self._bias = self.add_weight( - _BIAS_VARIABLE_NAME, - shape=[self._num_units], - initializer=tf.compat.v1.zeros_initializer(dtype=self.dtype)) - - self.built = True - - def call(self, inputs, state): - """Most basic RNN: output = new_state = act(W * input + U * state + B).""" - _check_rnn_cell_input_dtypes([inputs, state]) - gate_inputs = tf.matmul( - tf.concat([inputs, state], 1), self._kernel) - gate_inputs = tf.nn.bias_add(gate_inputs, self._bias) - output = self._activation(gate_inputs) - return output, output - - def get_config(self): - config = { - "num_units": self._num_units, - "activation": activations.serialize(self._activation), - "reuse": self._reuse, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + @property + def state_size(self): + return self._num_units + @property + def output_size(self): + return self._num_units -@keras_export(v1=["keras.__internal__.legacy.rnn_cell.GRUCell"]) -@tf_export(v1=["nn.rnn_cell.GRUCell"]) -class GRUCell(LayerRNNCell): - """Gated Recurrent Unit cell. - - Note that this cell is not optimized for performance. Please use - `tf.contrib.cudnn_rnn.CudnnGRU` for better performance on GPU, or - `tf.contrib.rnn.GRUBlockCellV2` for better performance on CPU. - - Args: - num_units: int, The number of units in the GRU cell. - activation: Nonlinearity to use. Default: `tanh`. - reuse: (optional) Python boolean describing whether to reuse variables in an - existing scope. If not `True`, and the existing scope already has the - given variables, an error is raised. - kernel_initializer: (optional) The initializer to use for the weight and - projection matrices. - bias_initializer: (optional) The initializer to use for the bias. - name: String, the name of the layer. Layers with the same name will share - weights, but to avoid mistakes we require reuse=True in such cases. - dtype: Default dtype of the layer (default of `None` means use the type of - the first input). Required when `build` is called before `call`. - **kwargs: Dict, keyword named properties for common layer attributes, like - `trainable` etc when constructing the cell from configs of get_config(). - References: Learning Phrase Representations using RNN Encoder Decoder for - Statistical - Machine Translation: [Cho et al., 2014] - (https://aclanthology.coli.uni-saarland.de/papers/D14-1179/d14-1179) - ([pdf](http://emnlp2014.org/papers/pdf/EMNLP2014179.pdf)) - """ - - def __init__(self, - num_units, - activation=None, - reuse=None, - kernel_initializer=None, - bias_initializer=None, - name=None, - dtype=None, - **kwargs): - warnings.warn( - "`tf.nn.rnn_cell.GRUCell` is deprecated and will be removed " - "in a future version. 
This class " - "is equivalent as `tf.keras.layers.GRUCell`, " - "and will be replaced by that in Tensorflow 2.0.", - stacklevel=2) - super().__init__( - _reuse=reuse, name=name, dtype=dtype, **kwargs) - _check_supported_dtypes(self.dtype) - - if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): - logging.warning( - "%s: Note that this cell is not optimized for performance. " - "Please use tf.contrib.cudnn_rnn.CudnnGRU for better " - "performance on GPU.", self) - # Inputs must be 2-dimensional. - self.input_spec = input_spec.InputSpec(ndim=2) - - self._num_units = num_units - if activation: - self._activation = activations.get(activation) - else: - self._activation = tf.tanh - self._kernel_initializer = initializers.get(kernel_initializer) - self._bias_initializer = initializers.get(bias_initializer) - - @property - def state_size(self): - return self._num_units - - @property - def output_size(self): - return self._num_units - - @tf_utils.shape_type_conversion - def build(self, inputs_shape): - if inputs_shape[-1] is None: - raise ValueError( - "Expected inputs.shape[-1] to be known, " - f"received shape: {inputs_shape}") - _check_supported_dtypes(self.dtype) - input_depth = inputs_shape[-1] - self._gate_kernel = self.add_weight( - "gates/%s" % _WEIGHTS_VARIABLE_NAME, - shape=[input_depth + self._num_units, 2 * self._num_units], - initializer=self._kernel_initializer) - self._gate_bias = self.add_weight( - "gates/%s" % _BIAS_VARIABLE_NAME, - shape=[2 * self._num_units], - initializer=(self._bias_initializer - if self._bias_initializer is not None else - tf.compat.v1.constant_initializer(1.0, dtype=self.dtype))) - self._candidate_kernel = self.add_weight( - "candidate/%s" % _WEIGHTS_VARIABLE_NAME, - shape=[input_depth + self._num_units, self._num_units], - initializer=self._kernel_initializer) - self._candidate_bias = self.add_weight( - "candidate/%s" % _BIAS_VARIABLE_NAME, - shape=[self._num_units], - initializer=(self._bias_initializer - if self._bias_initializer is not None else - tf.compat.v1.zeros_initializer(dtype=self.dtype))) - - self.built = True - - def call(self, inputs, state): - """Gated recurrent unit (GRU) with nunits cells.""" - _check_rnn_cell_input_dtypes([inputs, state]) - - gate_inputs = tf.matmul( - tf.concat([inputs, state], 1), self._gate_kernel) - gate_inputs = tf.nn.bias_add(gate_inputs, self._gate_bias) - - value = tf.sigmoid(gate_inputs) - r, u = tf.split(value=value, num_or_size_splits=2, axis=1) - - r_state = r * state - - candidate = tf.matmul( - tf.concat([inputs, r_state], 1), self._candidate_kernel) - candidate = tf.nn.bias_add(candidate, self._candidate_bias) - - c = self._activation(candidate) - new_h = u * state + (1 - u) * c - return new_h, new_h - - def get_config(self): - config = { - "num_units": self._num_units, - "kernel_initializer": initializers.serialize(self._kernel_initializer), - "bias_initializer": initializers.serialize(self._bias_initializer), - "activation": activations.serialize(self._activation), - "reuse": self._reuse, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + @tf_utils.shape_type_conversion + def build(self, inputs_shape): + if inputs_shape[-1] is None: + raise ValueError( + "Expected inputs.shape[-1] to be known, " + f"received shape: {inputs_shape}" + ) + _check_supported_dtypes(self.dtype) + input_depth = inputs_shape[-1] + self._gate_kernel = self.add_weight( + f"gates/{_WEIGHTS_VARIABLE_NAME}", + shape=[input_depth + self._num_units, 2 * 
self._num_units], + initializer=self._kernel_initializer, + ) + self._gate_bias = self.add_weight( + f"gates/{_BIAS_VARIABLE_NAME}", + shape=[2 * self._num_units], + initializer=( + self._bias_initializer + if self._bias_initializer is not None + else tf.compat.v1.constant_initializer(1.0, dtype=self.dtype) + ), + ) + self._candidate_kernel = self.add_weight( + f"candidate/{_WEIGHTS_VARIABLE_NAME}", + shape=[input_depth + self._num_units, self._num_units], + initializer=self._kernel_initializer, + ) + self._candidate_bias = self.add_weight( + f"candidate/{_BIAS_VARIABLE_NAME}", + shape=[self._num_units], + initializer=( + self._bias_initializer + if self._bias_initializer is not None + else tf.compat.v1.zeros_initializer(dtype=self.dtype) + ), + ) + + self.built = True + + def call(self, inputs, state): + """Gated recurrent unit (GRU) with nunits cells.""" + _check_rnn_cell_input_dtypes([inputs, state]) + + gate_inputs = tf.matmul( + tf.concat([inputs, state], 1), self._gate_kernel + ) + gate_inputs = tf.nn.bias_add(gate_inputs, self._gate_bias) + + value = tf.sigmoid(gate_inputs) + r, u = tf.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + + candidate = tf.matmul( + tf.concat([inputs, r_state], 1), self._candidate_kernel + ) + candidate = tf.nn.bias_add(candidate, self._candidate_bias) + + c = self._activation(candidate) + new_h = u * state + (1 - u) * c + return new_h, new_h + + def get_config(self): + config = { + "num_units": self._num_units, + "kernel_initializer": initializers.serialize( + self._kernel_initializer + ), + "bias_initializer": initializers.serialize(self._bias_initializer), + "activation": activations.serialize(self._activation), + "reuse": self._reuse, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) _LSTMStateTuple = collections.namedtuple("LSTMStateTuple", ("c", "h")) @@ -593,638 +645,714 @@ def get_config(self): @keras_export(v1=["keras.__internal__.legacy.rnn_cell.LSTMStateTuple"]) @tf_export(v1=["nn.rnn_cell.LSTMStateTuple"]) class LSTMStateTuple(_LSTMStateTuple): - """Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state. + """Tuple used by LSTM Cells for `state_size`, `zero_state`, & output state. + + Stores two elements: `(c, h)`, in that order. Where `c` is the hidden state + and `h` is the output. - Stores two elements: `(c, h)`, in that order. Where `c` is the hidden state - and `h` is the output. + Only used when `state_is_tuple=True`. + """ - Only used when `state_is_tuple=True`. - """ - __slots__ = () + __slots__ = () - @property - def dtype(self): - (c, h) = self - if c.dtype != h.dtype: - raise TypeError("Inconsistent dtypes for internal state: " - f"{c.dtype} vs {h.dtype}") - return c.dtype + @property + def dtype(self): + (c, h) = self + if c.dtype != h.dtype: + raise TypeError( + "Inconsistent dtypes for internal state: " + f"{c.dtype} vs {h.dtype}" + ) + return c.dtype @keras_export(v1=["keras.__internal__.legacy.rnn_cell.BasicLSTMCell"]) @tf_export(v1=["nn.rnn_cell.BasicLSTMCell"]) class BasicLSTMCell(LayerRNNCell): - """DEPRECATED: Please use `tf.compat.v1.nn.rnn_cell.LSTMCell` instead. + """DEPRECATED: Please use `tf.compat.v1.nn.rnn_cell.LSTMCell` instead. - Basic LSTM recurrent network cell. + Basic LSTM recurrent network cell. - The implementation is based on + The implementation is based on - We add forget_bias (default: 1) to the biases of the forget gate in order to - reduce the scale of forgetting in the beginning of the training. 
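The gate arithmetic in `GRUCell.call` above is the standard GRU update; a hypothetical NumPy sketch of a single step (all names illustrative):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h, gate_kernel, gate_bias, cand_kernel, cand_bias, act=np.tanh):
    gates = sigmoid(np.concatenate([x, h], axis=1) @ gate_kernel + gate_bias)
    r, u = np.split(gates, 2, axis=1)              # reset and update gates
    c = act(np.concatenate([x, r * h], axis=1) @ cand_kernel + cand_bias)
    new_h = u * h + (1.0 - u) * c                  # blend old state and candidate
    return new_h, new_h
```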
+ We add forget_bias (default: 1) to the biases of the forget gate in order to + reduce the scale of forgetting in the beginning of the training. - It does not allow cell clipping, a projection layer, and does not - use peep-hole connections: it is the basic baseline. + It does not allow cell clipping, a projection layer, and does not + use peep-hole connections: it is the basic baseline. - For advanced models, please use the full `tf.compat.v1.nn.rnn_cell.LSTMCell` - that follows. + For advanced models, please use the full `tf.compat.v1.nn.rnn_cell.LSTMCell` + that follows. - Note that this cell is not optimized for performance. Please use - `tf.contrib.cudnn_rnn.CudnnLSTM` for better performance on GPU, or - `tf.contrib.rnn.LSTMBlockCell` and `tf.contrib.rnn.LSTMBlockFusedCell` for - better performance on CPU. - """ + Note that this cell is not optimized for performance. Please use + `tf.compat.v1.keras.layers.CuDNNLSTM` for better performance on GPU, or + `tf.raw_ops.LSTMBlockCell` for better performance on CPU. + """ - def __init__(self, - num_units, - forget_bias=1.0, - state_is_tuple=True, - activation=None, - reuse=None, - name=None, - dtype=None, - **kwargs): - """Initialize the basic LSTM cell. + def __init__( + self, + num_units, + forget_bias=1.0, + state_is_tuple=True, + activation=None, + reuse=None, + name=None, + dtype=None, + **kwargs, + ): + """Initialize the basic LSTM cell. + + Args: + num_units: int, The number of units in the LSTM cell. + forget_bias: float, The bias added to forget gates (see above). Must + set to `0.0` manually when restoring from CudnnLSTM-trained + checkpoints. + state_is_tuple: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. If False, they are concatenated along + the column axis. The latter behavior will soon be deprecated. + activation: Activation function of the inner states. Default: `tanh`. + It could also be string that is within Keras activation function + names. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already + has the given variables, an error is raised. + name: String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require reuse=True in such + cases. + dtype: Default dtype of the layer (default of `None` means use the + type of the first input). Required when `build` is called before + `call`. + **kwargs: Dict, keyword named properties for common layer attributes, + like `trainable` etc when constructing the cell from configs of + get_config(). When restoring from CudnnLSTM-trained checkpoints, + must use `CudnnCompatibleLSTMCell` instead. + """ + warnings.warn( + "`tf.nn.rnn_cell.BasicLSTMCell` is deprecated and will be " + "removed in a future version. This class " + "is equivalent as `tf.keras.layers.LSTMCell`, " + "and will be replaced by that in Tensorflow 2.0.", + stacklevel=2, + ) + super().__init__(_reuse=reuse, name=name, dtype=dtype, **kwargs) + _check_supported_dtypes(self.dtype) + if not state_is_tuple: + logging.warning( + "%s: Using a concatenated state is slower and will soon be " + "deprecated. Use state_is_tuple=True.", + self, + ) + if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): + logging.warning( + "%s: Note that this cell is not optimized for performance. " + "Please use tf.compat.v1.keras.layers.CuDNNLSTM for better " + "performance on GPU.", + self, + ) + + # Inputs must be 2-dimensional. 
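With the default `state_is_tuple=True`, the state of the cell below travels as the `LSTMStateTuple` defined above; a small usage sketch (assuming the compat-v1 endpoint):

```python
import tensorflow as tf

state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(
    c=tf.zeros([8, 64]),   # internal (cell) state
    h=tf.zeros([8, 64]),   # output state
)
print(state.dtype)         # float32; raises TypeError if c and h disagree
c, h = state               # unpacks like a plain (c, h) tuple
```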
+ self.input_spec = input_spec.InputSpec(ndim=2) + + self._num_units = num_units + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + if activation: + self._activation = activations.get(activation) + else: + self._activation = tf.tanh + + @property + def state_size(self): + return ( + LSTMStateTuple(self._num_units, self._num_units) + if self._state_is_tuple + else 2 * self._num_units + ) + + @property + def output_size(self): + return self._num_units + + @tf_utils.shape_type_conversion + def build(self, inputs_shape): + if inputs_shape[-1] is None: + raise ValueError( + "Expected inputs.shape[-1] to be known, " + f"received shape: {inputs_shape}" + ) + _check_supported_dtypes(self.dtype) + input_depth = inputs_shape[-1] + h_depth = self._num_units + self._kernel = self.add_weight( + _WEIGHTS_VARIABLE_NAME, + shape=[input_depth + h_depth, 4 * self._num_units], + ) + self._bias = self.add_weight( + _BIAS_VARIABLE_NAME, + shape=[4 * self._num_units], + initializer=tf.compat.v1.zeros_initializer(dtype=self.dtype), + ) + + self.built = True + + def call(self, inputs, state): + """Long short-term memory cell (LSTM). + + Args: + inputs: `2-D` tensor with shape `[batch_size, input_size]`. + state: An `LSTMStateTuple` of state tensors, each shaped `[batch_size, + num_units]`, if `state_is_tuple` has been set to `True`. Otherwise, + a `Tensor` shaped `[batch_size, 2 * num_units]`. + + Returns: + A pair containing the new hidden state, and the new state (either a + `LSTMStateTuple` or a concatenated state, depending on + `state_is_tuple`). + """ + _check_rnn_cell_input_dtypes([inputs, state]) + + sigmoid = tf.sigmoid + one = tf.constant(1, dtype=tf.int32) + # Parameters of gates are concatenated into one multiply for efficiency. + if self._state_is_tuple: + c, h = state + else: + c, h = tf.split(value=state, num_or_size_splits=2, axis=one) - Args: - num_units: int, The number of units in the LSTM cell. - forget_bias: float, The bias added to forget gates (see above). Must set - to `0.0` manually when restoring from CudnnLSTM-trained checkpoints. - state_is_tuple: If True, accepted and returned states are 2-tuples of the - `c_state` and `m_state`. If False, they are concatenated along the - column axis. The latter behavior will soon be deprecated. - activation: Activation function of the inner states. Default: `tanh`. It - could also be string that is within Keras activation function names. - reuse: (optional) Python boolean describing whether to reuse variables in - an existing scope. If not `True`, and the existing scope already has - the given variables, an error is raised. - name: String, the name of the layer. Layers with the same name will share - weights, but to avoid mistakes we require reuse=True in such cases. - dtype: Default dtype of the layer (default of `None` means use the type of - the first input). Required when `build` is called before `call`. - **kwargs: Dict, keyword named properties for common layer attributes, like - `trainable` etc when constructing the cell from configs of get_config(). - When restoring from CudnnLSTM-trained checkpoints, must use - `CudnnCompatibleLSTMCell` instead. - """ - warnings.warn( - "`tf.nn.rnn_cell.BasicLSTMCell` is deprecated and will be " - "removed in a future version. 
This class " - "is equivalent as `tf.keras.layers.LSTMCell`, " - "and will be replaced by that in Tensorflow 2.0.", - stacklevel=2) - super().__init__( - _reuse=reuse, name=name, dtype=dtype, **kwargs) - _check_supported_dtypes(self.dtype) - if not state_is_tuple: - logging.warning( - "%s: Using a concatenated state is slower and will soon be " - "deprecated. Use state_is_tuple=True.", self) - if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): - logging.warning( - "%s: Note that this cell is not optimized for performance. " - "Please use tf.contrib.cudnn_rnn.CudnnLSTM for better " - "performance on GPU.", self) - - # Inputs must be 2-dimensional. - self.input_spec = input_spec.InputSpec(ndim=2) - - self._num_units = num_units - self._forget_bias = forget_bias - self._state_is_tuple = state_is_tuple - if activation: - self._activation = activations.get(activation) - else: - self._activation = tf.tanh - - @property - def state_size(self): - return (LSTMStateTuple(self._num_units, self._num_units) - if self._state_is_tuple else 2 * self._num_units) - - @property - def output_size(self): - return self._num_units - - @tf_utils.shape_type_conversion - def build(self, inputs_shape): - if inputs_shape[-1] is None: - raise ValueError( - "Expected inputs.shape[-1] to be known, " - f"received shape: {inputs_shape}") - _check_supported_dtypes(self.dtype) - input_depth = inputs_shape[-1] - h_depth = self._num_units - self._kernel = self.add_weight( - _WEIGHTS_VARIABLE_NAME, - shape=[input_depth + h_depth, 4 * self._num_units]) - self._bias = self.add_weight( - _BIAS_VARIABLE_NAME, - shape=[4 * self._num_units], - initializer=tf.compat.v1.zeros_initializer(dtype=self.dtype)) - - self.built = True - - def call(self, inputs, state): - """Long short-term memory cell (LSTM). + gate_inputs = tf.matmul(tf.concat([inputs, h], 1), self._kernel) + gate_inputs = tf.nn.bias_add(gate_inputs, self._bias) - Args: - inputs: `2-D` tensor with shape `[batch_size, input_size]`. - state: An `LSTMStateTuple` of state tensors, each shaped `[batch_size, - num_units]`, if `state_is_tuple` has been set to `True`. Otherwise, a - `Tensor` shaped `[batch_size, 2 * num_units]`. + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + i, j, f, o = tf.split(value=gate_inputs, num_or_size_splits=4, axis=one) - Returns: - A pair containing the new hidden state, and the new state (either a - `LSTMStateTuple` or a concatenated state, depending on - `state_is_tuple`). - """ - _check_rnn_cell_input_dtypes([inputs, state]) + forget_bias_tensor = tf.constant(self._forget_bias, dtype=f.dtype) + # Note that using `add` and `multiply` instead of `+` and `*` gives a + # performance improvement. So using those at the cost of readability. + add = tf.add + multiply = tf.multiply + new_c = add( + multiply(c, sigmoid(add(f, forget_bias_tensor))), + multiply(sigmoid(i), self._activation(j)), + ) + new_h = multiply(self._activation(new_c), sigmoid(o)) - sigmoid = tf.sigmoid - one = tf.constant(1, dtype=tf.int32) - # Parameters of gates are concatenated into one multiply for efficiency. 
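The `new_c`/`new_h` arithmetic above is the textbook LSTM update with a forget-gate bias; a hypothetical NumPy sketch of one step (names illustrative):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def basic_lstm_step(x, c, h, kernel, bias, forget_bias=1.0, act=np.tanh):
    z = np.concatenate([x, h], axis=1) @ kernel + bias
    i, j, f, o = np.split(z, 4, axis=1)   # input, new input, forget, output
    new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * act(j)
    new_h = act(new_c) * sigmoid(o)
    return new_h, (new_c, new_h)          # (output, (c, h) state)
```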
- if self._state_is_tuple: - c, h = state - else: - c, h = tf.split(value=state, num_or_size_splits=2, axis=one) - - gate_inputs = tf.matmul( - tf.concat([inputs, h], 1), self._kernel) - gate_inputs = tf.nn.bias_add(gate_inputs, self._bias) - - # i = input_gate, j = new_input, f = forget_gate, o = output_gate - i, j, f, o = tf.split( - value=gate_inputs, num_or_size_splits=4, axis=one) - - forget_bias_tensor = tf.constant(self._forget_bias, dtype=f.dtype) - # Note that using `add` and `multiply` instead of `+` and `*` gives a - # performance improvement. So using those at the cost of readability. - add = tf.add - multiply = tf.multiply - new_c = add( - multiply(c, sigmoid(add(f, forget_bias_tensor))), - multiply(sigmoid(i), self._activation(j))) - new_h = multiply(self._activation(new_c), sigmoid(o)) - - if self._state_is_tuple: - new_state = LSTMStateTuple(new_c, new_h) - else: - new_state = tf.concat([new_c, new_h], 1) - return new_h, new_state + if self._state_is_tuple: + new_state = LSTMStateTuple(new_c, new_h) + else: + new_state = tf.concat([new_c, new_h], 1) + return new_h, new_state - def get_config(self): - config = { - "num_units": self._num_units, - "forget_bias": self._forget_bias, - "state_is_tuple": self._state_is_tuple, - "activation": activations.serialize(self._activation), - "reuse": self._reuse, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def get_config(self): + config = { + "num_units": self._num_units, + "forget_bias": self._forget_bias, + "state_is_tuple": self._state_is_tuple, + "activation": activations.serialize(self._activation), + "reuse": self._reuse, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) @keras_export(v1=["keras.__internal__.legacy.rnn_cell.LSTMCell"]) @tf_export(v1=["nn.rnn_cell.LSTMCell"]) class LSTMCell(LayerRNNCell): - """Long short-term memory unit (LSTM) recurrent network cell. - - The default non-peephole implementation is based on (Gers et al., 1999). - The peephole implementation is based on (Sak et al., 2014). - - The class uses optional peep-hole connections, optional cell clipping, and - an optional projection layer. - - Note that this cell is not optimized for performance. Please use - `tf.contrib.cudnn_rnn.CudnnLSTM` for better performance on GPU, or - `tf.contrib.rnn.LSTMBlockCell` and `tf.contrib.rnn.LSTMBlockFusedCell` for - better performance on CPU. - References: - Long short-term memory recurrent neural network architectures for large - scale acoustic modeling: - [Sak et al., 2014] - (https://www.isca-speech.org/archive/interspeech_2014/i14_0338.html) - ([pdf] - (https://www.isca-speech.org/archive/archive_papers/interspeech_2014/i14_0338.pdf)) - Learning to forget: - [Gers et al., 1999] - (http://digital-library.theiet.org/content/conferences/10.1049/cp_19991218) - ([pdf](https://arxiv.org/pdf/1409.2329.pdf)) - Long Short-Term Memory: - [Hochreiter et al., 1997] - (https://www.mitpressjournals.org/doi/abs/10.1162/neco.1997.9.8.1735) - ([pdf](http://ml.jku.at/publications/older/3504.pdf)) - """ - - def __init__(self, - num_units, - use_peepholes=False, - cell_clip=None, - initializer=None, - num_proj=None, - proj_clip=None, - num_unit_shards=None, - num_proj_shards=None, - forget_bias=1.0, - state_is_tuple=True, - activation=None, - reuse=None, - name=None, - dtype=None, - **kwargs): - """Initialize the parameters for an LSTM cell. - - Args: - num_units: int, The number of units in the LSTM cell. 
- use_peepholes: bool, set True to enable diagonal/peephole connections. - cell_clip: (optional) A float value, if provided the cell state is clipped - by this value prior to the cell output activation. - initializer: (optional) The initializer to use for the weight and - projection matrices. - num_proj: (optional) int, The output dimensionality for the projection - matrices. If None, no projection is performed. - proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is - provided, then the projected values are clipped elementwise to within - `[-proj_clip, proj_clip]`. - num_unit_shards: Deprecated, will be removed by Jan. 2017. Use a - variable_scope partitioner instead. - num_proj_shards: Deprecated, will be removed by Jan. 2017. Use a - variable_scope partitioner instead. - forget_bias: Biases of the forget gate are initialized by default to 1 in - order to reduce the scale of forgetting at the beginning of the - training. Must set it manually to `0.0` when restoring from CudnnLSTM - trained checkpoints. - state_is_tuple: If True, accepted and returned states are 2-tuples of the - `c_state` and `m_state`. If False, they are concatenated along the - column axis. This latter behavior will soon be deprecated. - activation: Activation function of the inner states. Default: `tanh`. It - could also be string that is within Keras activation function names. - reuse: (optional) Python boolean describing whether to reuse variables in - an existing scope. If not `True`, and the existing scope already has - the given variables, an error is raised. - name: String, the name of the layer. Layers with the same name will share - weights, but to avoid mistakes we require reuse=True in such cases. - dtype: Default dtype of the layer (default of `None` means use the type of - the first input). Required when `build` is called before `call`. - **kwargs: Dict, keyword named properties for common layer attributes, like - `trainable` etc when constructing the cell from configs of get_config(). - When restoring from CudnnLSTM-trained checkpoints, use - `CudnnCompatibleLSTMCell` instead. + """Long short-term memory unit (LSTM) recurrent network cell. + + The default non-peephole implementation is based on (Gers et al., 1999). + The peephole implementation is based on (Sak et al., 2014). + + The class uses optional peep-hole connections, optional cell clipping, and + an optional projection layer. + + Note that this cell is not optimized for performance. Please use + `tf.compat.v1.keras.layers.CuDNNLSTM` for better performance on GPU, or + `tf.raw_ops.LSTMBlockCell` for better performance on CPU. + References: + Long short-term memory recurrent neural network architectures for large + scale acoustic modeling: + [Sak et al., 2014] + (https://www.isca-speech.org/archive/interspeech_2014/i14_0338.html) + ([pdf] + (https://www.isca-speech.org/archive/archive_papers/interspeech_2014/i14_0338.pdf)) + Learning to forget: + [Gers et al., 1999] + (http://digital-library.theiet.org/content/conferences/10.1049/cp_19991218) + ([pdf](https://arxiv.org/pdf/1409.2329.pdf)) + Long Short-Term Memory: + [Hochreiter et al., 1997] + (https://www.mitpressjournals.org/doi/abs/10.1162/neco.1997.9.8.1735) + ([pdf](http://ml.jku.at/publications/older/3504.pdf)) """ - warnings.warn( - "`tf.nn.rnn_cell.LSTMCell` is deprecated and will be " - "removed in a future version. 
This class " - "is equivalent as `tf.keras.layers.LSTMCell`, " - "and will be replaced by that in Tensorflow 2.0.", - stacklevel=2) - super().__init__( - _reuse=reuse, name=name, dtype=dtype, **kwargs) - _check_supported_dtypes(self.dtype) - if not state_is_tuple: - logging.warning( - "%s: Using a concatenated state is slower and will soon be " - "deprecated. Use state_is_tuple=True.", self) - if num_unit_shards is not None or num_proj_shards is not None: - logging.warning( - "%s: The num_unit_shards and proj_unit_shards parameters are " - "deprecated and will be removed in Jan 2017. " - "Use a variable scope with a partitioner instead.", self) - if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): - logging.warning( - "%s: Note that this cell is not optimized for performance. " - "Please use tf.contrib.cudnn_rnn.CudnnLSTM for better " - "performance on GPU.", self) - - # Inputs must be 2-dimensional. - self.input_spec = input_spec.InputSpec(ndim=2) - - self._num_units = num_units - self._use_peepholes = use_peepholes - self._cell_clip = cell_clip - self._initializer = initializers.get(initializer) - self._num_proj = num_proj - self._proj_clip = proj_clip - self._num_unit_shards = num_unit_shards - self._num_proj_shards = num_proj_shards - self._forget_bias = forget_bias - self._state_is_tuple = state_is_tuple - if activation: - self._activation = activations.get(activation) - else: - self._activation = tf.tanh - - if num_proj: - self._state_size = ( - LSTMStateTuple(num_units, num_proj) if state_is_tuple else num_units + - num_proj) - self._output_size = num_proj - else: - self._state_size = ( - LSTMStateTuple(num_units, num_units) if state_is_tuple else 2 * - num_units) - self._output_size = num_units - - @property - def state_size(self): - return self._state_size - - @property - def output_size(self): - return self._output_size - - @tf_utils.shape_type_conversion - def build(self, inputs_shape): - if inputs_shape[-1] is None: - raise ValueError("Expected inputs.shape[-1] to be known, " - f"received shape: {inputs_shape}") - _check_supported_dtypes(self.dtype) - input_depth = inputs_shape[-1] - h_depth = self._num_units if self._num_proj is None else self._num_proj - maybe_partitioner = ( - tf.compat.v1.fixed_size_partitioner(self._num_unit_shards) - if self._num_unit_shards is not None else None) - self._kernel = self.add_weight( - _WEIGHTS_VARIABLE_NAME, - shape=[input_depth + h_depth, 4 * self._num_units], - initializer=self._initializer, - partitioner=maybe_partitioner) - if self.dtype is None: - initializer = tf.compat.v1.zeros_initializer - else: - initializer = tf.compat.v1.zeros_initializer(dtype=self.dtype) - self._bias = self.add_weight( - _BIAS_VARIABLE_NAME, - shape=[4 * self._num_units], - initializer=initializer) - if self._use_peepholes: - self._w_f_diag = self.add_weight( - "w_f_diag", shape=[self._num_units], initializer=self._initializer) - self._w_i_diag = self.add_weight( - "w_i_diag", shape=[self._num_units], initializer=self._initializer) - self._w_o_diag = self.add_weight( - "w_o_diag", shape=[self._num_units], initializer=self._initializer) - - if self._num_proj is not None: - maybe_proj_partitioner = ( - tf.compat.v1.fixed_size_partitioner(self._num_proj_shards) - if self._num_proj_shards is not None else None) - self._proj_kernel = self.add_weight( - "projection/%s" % _WEIGHTS_VARIABLE_NAME, - shape=[self._num_units, self._num_proj], - initializer=self._initializer, - partitioner=maybe_proj_partitioner) - - self.built = True - - def call(self, 
inputs, state): - """Run one step of LSTM. - Args: - inputs: input Tensor, must be 2-D, `[batch, input_size]`. - state: if `state_is_tuple` is False, this must be a state Tensor, `2-D, - [batch, state_size]`. If `state_is_tuple` is True, this must be a tuple - of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`. + def __init__( + self, + num_units, + use_peepholes=False, + cell_clip=None, + initializer=None, + num_proj=None, + proj_clip=None, + num_unit_shards=None, + num_proj_shards=None, + forget_bias=1.0, + state_is_tuple=True, + activation=None, + reuse=None, + name=None, + dtype=None, + **kwargs, + ): + """Initialize the parameters for an LSTM cell. + + Args: + num_units: int, The number of units in the LSTM cell. + use_peepholes: bool, set True to enable diagonal/peephole connections. + cell_clip: (optional) A float value, if provided the cell state is + clipped by this value prior to the cell output activation. + initializer: (optional) The initializer to use for the weight and + projection matrices. + num_proj: (optional) int, The output dimensionality for the projection + matrices. If None, no projection is performed. + proj_clip: (optional) A float value. If `num_proj > 0` and + `proj_clip` is provided, then the projected values are clipped + elementwise to within `[-proj_clip, proj_clip]`. + num_unit_shards: Deprecated, will be removed by Jan. 2017. Use a + variable_scope partitioner instead. + num_proj_shards: Deprecated, will be removed by Jan. 2017. Use a + variable_scope partitioner instead. + forget_bias: Biases of the forget gate are initialized by default to 1 + in order to reduce the scale of forgetting at the beginning of the + training. Must set it manually to `0.0` when restoring from + CudnnLSTM trained checkpoints. + state_is_tuple: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. If False, they are concatenated along + the column axis. This latter behavior will soon be deprecated. + activation: Activation function of the inner states. Default: `tanh`. + It could also be string that is within Keras activation function + names. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already + has the given variables, an error is raised. + name: String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require reuse=True in such + cases. + dtype: Default dtype of the layer (default of `None` means use the + type of the first input). Required when `build` is called before + `call`. + **kwargs: Dict, keyword named properties for common layer attributes, + like `trainable` etc when constructing the cell from configs of + get_config(). When restoring from CudnnLSTM-trained checkpoints, + use `CudnnCompatibleLSTMCell` instead. + """ + warnings.warn( + "`tf.nn.rnn_cell.LSTMCell` is deprecated and will be " + "removed in a future version. This class " + "is equivalent as `tf.keras.layers.LSTMCell`, " + "and will be replaced by that in Tensorflow 2.0.", + stacklevel=2, + ) + super().__init__(_reuse=reuse, name=name, dtype=dtype, **kwargs) + _check_supported_dtypes(self.dtype) + if not state_is_tuple: + logging.warning( + "%s: Using a concatenated state is slower and will soon be " + "deprecated. 
Use state_is_tuple=True.", + self, + ) + if num_unit_shards is not None or num_proj_shards is not None: + logging.warning( + "%s: The num_unit_shards and proj_unit_shards parameters are " + "deprecated and will be removed in Jan 2017. " + "Use a variable scope with a partitioner instead.", + self, + ) + if tf.executing_eagerly() and tf.config.list_logical_devices("GPU"): + logging.warning( + "%s: Note that this cell is not optimized for performance. " + "Please use tf.compat.v1.keras.layers.CuDNNLSTM for better " + "performance on GPU.", + self, + ) + + # Inputs must be 2-dimensional. + self.input_spec = input_spec.InputSpec(ndim=2) + + self._num_units = num_units + self._use_peepholes = use_peepholes + self._cell_clip = cell_clip + self._initializer = initializers.get(initializer) + self._num_proj = num_proj + self._proj_clip = proj_clip + self._num_unit_shards = num_unit_shards + self._num_proj_shards = num_proj_shards + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + if activation: + self._activation = activations.get(activation) + else: + self._activation = tf.tanh + + if num_proj: + self._state_size = ( + LSTMStateTuple(num_units, num_proj) + if state_is_tuple + else num_units + num_proj + ) + self._output_size = num_proj + else: + self._state_size = ( + LSTMStateTuple(num_units, num_units) + if state_is_tuple + else 2 * num_units + ) + self._output_size = num_units + + @property + def state_size(self): + return self._state_size + + @property + def output_size(self): + return self._output_size + + @tf_utils.shape_type_conversion + def build(self, inputs_shape): + if inputs_shape[-1] is None: + raise ValueError( + "Expected inputs.shape[-1] to be known, " + f"received shape: {inputs_shape}" + ) + _check_supported_dtypes(self.dtype) + input_depth = inputs_shape[-1] + h_depth = self._num_units if self._num_proj is None else self._num_proj + maybe_partitioner = ( + tf.compat.v1.fixed_size_partitioner(self._num_unit_shards) + if self._num_unit_shards is not None + else None + ) + self._kernel = self.add_weight( + _WEIGHTS_VARIABLE_NAME, + shape=[input_depth + h_depth, 4 * self._num_units], + initializer=self._initializer, + partitioner=maybe_partitioner, + ) + if self.dtype is None: + initializer = tf.compat.v1.zeros_initializer + else: + initializer = tf.compat.v1.zeros_initializer(dtype=self.dtype) + self._bias = self.add_weight( + _BIAS_VARIABLE_NAME, + shape=[4 * self._num_units], + initializer=initializer, + ) + if self._use_peepholes: + self._w_f_diag = self.add_weight( + "w_f_diag", + shape=[self._num_units], + initializer=self._initializer, + ) + self._w_i_diag = self.add_weight( + "w_i_diag", + shape=[self._num_units], + initializer=self._initializer, + ) + self._w_o_diag = self.add_weight( + "w_o_diag", + shape=[self._num_units], + initializer=self._initializer, + ) + + if self._num_proj is not None: + maybe_proj_partitioner = ( + tf.compat.v1.fixed_size_partitioner(self._num_proj_shards) + if self._num_proj_shards is not None + else None + ) + self._proj_kernel = self.add_weight( + f"projection/{_WEIGHTS_VARIABLE_NAME}", + shape=[self._num_units, self._num_proj], + initializer=self._initializer, + partitioner=maybe_proj_partitioner, + ) + + self.built = True + + def call(self, inputs, state): + """Run one step of LSTM. + + Args: + inputs: input Tensor, must be 2-D, `[batch, input_size]`. + state: if `state_is_tuple` is False, this must be a state Tensor, + `2-D, [batch, state_size]`. 
+    def call(self, inputs, state):
+        """Run one step of LSTM.
+
+        Args:
+          inputs: input Tensor, must be 2-D, `[batch, input_size]`.
+          state: if `state_is_tuple` is False, this must be a state Tensor,
+            `2-D, [batch, state_size]`. If `state_is_tuple` is True, this must
+            be a tuple of state Tensors, both `2-D`, with column sizes
+            `c_state` and `m_state`.
+
+        Returns:
+          A tuple containing:
+
+          - A `2-D, [batch, output_dim]`, Tensor representing the output of
+            the LSTM after reading `inputs` when previous state was `state`.
+            Here output_dim is:
+              num_proj if num_proj was set,
+              num_units otherwise.
+          - Tensor(s) representing the new state of LSTM after reading
+            `inputs` when the previous state was `state`. Same type and
+            shape(s) as `state`.
+
+        Raises:
+          ValueError: If input size cannot be inferred from inputs via
+            static shape inference.
+        """
+        _check_rnn_cell_input_dtypes([inputs, state])
+
+        num_proj = self._num_units if self._num_proj is None else self._num_proj
+        sigmoid = tf.sigmoid
-    Returns:
-      A tuple containing:
+        if self._state_is_tuple:
+            (c_prev, m_prev) = state
+        else:
+            c_prev = tf.slice(state, [0, 0], [-1, self._num_units])
+            m_prev = tf.slice(state, [0, self._num_units], [-1, num_proj])
-    - A `2-D, [batch, output_dim]`, Tensor representing the output of the
-      LSTM after reading `inputs` when previous state was `state`.
-      Here output_dim is:
-         num_proj if num_proj was set,
-         num_units otherwise.
-    - Tensor(s) representing the new state of LSTM after reading `inputs` when
-      the previous state was `state`. Same type and shape(s) as `state`.
+        input_size = inputs.get_shape().with_rank(2).dims[1].value
+        if input_size is None:
+            raise ValueError(
+                "Could not infer input size from inputs.get_shape()[-1]. "
+                f"Received input shape: {inputs.get_shape()}"
+            )
+
+        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+        lstm_matrix = tf.matmul(tf.concat([inputs, m_prev], 1), self._kernel)
+        lstm_matrix = tf.nn.bias_add(lstm_matrix, self._bias)
+
+        i, j, f, o = tf.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
+        # Diagonal connections
+        if self._use_peepholes:
+            c = sigmoid(
+                f + self._forget_bias + self._w_f_diag * c_prev
+            ) * c_prev + sigmoid(
+                i + self._w_i_diag * c_prev
+            ) * self._activation(
+                j
+            )
+        else:
+            c = sigmoid(f + self._forget_bias) * c_prev + sigmoid(
+                i
+            ) * self._activation(j)
-    Raises:
-      ValueError: If input size cannot be inferred from inputs via
-        static shape inference.
-    """
-    _check_rnn_cell_input_dtypes([inputs, state])
+        if self._cell_clip is not None:
-    num_proj = self._num_units if self._num_proj is None else self._num_proj
-    sigmoid = tf.sigmoid
+            c = tf.clip_by_value(c, -self._cell_clip, self._cell_clip)
-    if self._state_is_tuple:
-      (c_prev, m_prev) = state
-    else:
-      c_prev = tf.slice(state, [0, 0], [-1, self._num_units])
-      m_prev = tf.slice(state, [0, self._num_units], [-1, num_proj])
-
-    input_size = inputs.get_shape().with_rank(2).dims[1].value
-    if input_size is None:
-      raise ValueError(
-          "Could not infer input size from inputs.get_shape()[-1]."
- f"Received input shape: {inputs.get_shape()}") - - # i = input_gate, j = new_input, f = forget_gate, o = output_gate - lstm_matrix = tf.matmul( - tf.concat([inputs, m_prev], 1), self._kernel) - lstm_matrix = tf.nn.bias_add(lstm_matrix, self._bias) - - i, j, f, o = tf.split( - value=lstm_matrix, num_or_size_splits=4, axis=1) - # Diagonal connections - if self._use_peepholes: - c = ( - sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev + - sigmoid(i + self._w_i_diag * c_prev) * self._activation(j)) - else: - c = ( - sigmoid(f + self._forget_bias) * c_prev + - sigmoid(i) * self._activation(j)) - - if self._cell_clip is not None: - # pylint: disable=invalid-unary-operand-type - c = tf.clip_by_value(c, -self._cell_clip, self._cell_clip) - # pylint: enable=invalid-unary-operand-type - if self._use_peepholes: - m = sigmoid(o + self._w_o_diag * c) * self._activation(c) - else: - m = sigmoid(o) * self._activation(c) - - if self._num_proj is not None: - m = tf.matmul(m, self._proj_kernel) - - if self._proj_clip is not None: - # pylint: disable=invalid-unary-operand-type - m = tf.clip_by_value(m, -self._proj_clip, self._proj_clip) - # pylint: enable=invalid-unary-operand-type - - new_state = ( - LSTMStateTuple(c, m) - if self._state_is_tuple else tf.concat([c, m], 1)) - return m, new_state - - def get_config(self): - config = { - "num_units": self._num_units, - "use_peepholes": self._use_peepholes, - "cell_clip": self._cell_clip, - "initializer": initializers.serialize(self._initializer), - "num_proj": self._num_proj, - "proj_clip": self._proj_clip, - "num_unit_shards": self._num_unit_shards, - "num_proj_shards": self._num_proj_shards, - "forget_bias": self._forget_bias, - "state_is_tuple": self._state_is_tuple, - "activation": activations.serialize(self._activation), - "reuse": self._reuse, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + if self._use_peepholes: + m = sigmoid(o + self._w_o_diag * c) * self._activation(c) + else: + m = sigmoid(o) * self._activation(c) + + if self._num_proj is not None: + m = tf.matmul(m, self._proj_kernel) + + if self._proj_clip is not None: + + m = tf.clip_by_value(m, -self._proj_clip, self._proj_clip) + + new_state = ( + LSTMStateTuple(c, m) + if self._state_is_tuple + else tf.concat([c, m], 1) + ) + return m, new_state + + def get_config(self): + config = { + "num_units": self._num_units, + "use_peepholes": self._use_peepholes, + "cell_clip": self._cell_clip, + "initializer": initializers.serialize(self._initializer), + "num_proj": self._num_proj, + "proj_clip": self._proj_clip, + "num_unit_shards": self._num_unit_shards, + "num_proj_shards": self._num_proj_shards, + "forget_bias": self._forget_bias, + "state_is_tuple": self._state_is_tuple, + "activation": activations.serialize(self._activation), + "reuse": self._reuse, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) @keras_export(v1=["keras.__internal__.legacy.rnn_cell.MultiRNNCell"]) @tf_export(v1=["nn.rnn_cell.MultiRNNCell"]) class MultiRNNCell(RNNCell): - """RNN cell composed sequentially of multiple simple cells. - - Example: - - ```python - num_units = [128, 64] - cells = [BasicLSTMCell(num_units=n) for n in num_units] - stacked_rnn_cell = MultiRNNCell(cells) - ``` - """ + """RNN cell composed sequentially of multiple simple cells. - def __init__(self, cells, state_is_tuple=True): - """Create a RNN cell composed sequentially of a number of RNNCells. 
+ Example: - Args: - cells: list of RNNCells that will be composed in this order. - state_is_tuple: If True, accepted and returned states are n-tuples, where - `n = len(cells)`. If False, the states are all concatenated along the - column axis. This latter behavior will soon be deprecated. - - Raises: - ValueError: if cells is empty (not allowed), or at least one of the cells - returns a state tuple but the flag `state_is_tuple` is `False`. + ```python + num_units = [128, 64] + cells = [BasicLSTMCell(num_units=n) for n in num_units] + stacked_rnn_cell = MultiRNNCell(cells) + ``` """ - logging.warning("`tf.nn.rnn_cell.MultiRNNCell` is deprecated. This class " - "is equivalent as `tf.keras.layers.StackedRNNCells`, " - "and will be replaced by that in Tensorflow 2.0.") - super().__init__() - if not cells: - raise ValueError("Must specify at least one cell for MultiRNNCell.") - if not tf.nest.is_nested(cells): - raise TypeError(f"cells must be a list or tuple, but received: {cells}.") - - if len(set(id(cell) for cell in cells)) < len(cells): - logging.log_first_n( - logging.WARN, "At least two cells provided to MultiRNNCell " - "are the same object and will share weights.", 1) - - self._cells = cells - for cell_number, cell in enumerate(self._cells): - # Add Trackable dependencies on these cells so their variables get - # saved with this object when using object-based saving. - if isinstance(cell, tf.__internal__.tracking.Trackable): - # TODO(allenl): Track down non-Trackable callers. - self._track_trackable(cell, name="cell-%d" % (cell_number,)) - self._state_is_tuple = state_is_tuple - if not state_is_tuple: - if any(tf.nest.is_nested(c.state_size) for c in self._cells): - raise ValueError( - "Some cells return tuples of states, but the flag " - "state_is_tuple is not set. " - f"State sizes are: {[c.state_size for c in self._cells]}") - - @property - def state_size(self): - if self._state_is_tuple: - return tuple(cell.state_size for cell in self._cells) - else: - return sum(cell.state_size for cell in self._cells) - - @property - def output_size(self): - return self._cells[-1].output_size - - def zero_state(self, batch_size, dtype): - with backend.name_scope(type(self).__name__ + "ZeroState"): - if self._state_is_tuple: - return tuple(cell.zero_state(batch_size, dtype) for cell in self._cells) - else: - # We know here that state_size of each cell is not a tuple and - # presumably does not contain TensorArrays or anything else fancy - return super().zero_state(batch_size, dtype) - - @property - def trainable_weights(self): - if not self.trainable: - return [] - weights = [] - for cell in self._cells: - if isinstance(cell, base_layer.Layer): - weights += cell.trainable_weights - return weights - - @property - def non_trainable_weights(self): - weights = [] - for cell in self._cells: - if isinstance(cell, base_layer.Layer): - weights += cell.non_trainable_weights - if not self.trainable: - trainable_weights = [] - for cell in self._cells: - if isinstance(cell, base_layer.Layer): - trainable_weights += cell.trainable_weights - return trainable_weights + weights - return weights - - def call(self, inputs, state): - """Run this multi-layer cell on inputs, starting from state.""" - cur_state_pos = 0 - cur_inp = inputs - new_states = [] - for i, cell in enumerate(self._cells): - with tf.compat.v1.variable_scope("cell_%d" % i): + + def __init__(self, cells, state_is_tuple=True): + """Create a RNN cell composed sequentially of a number of RNNCells. 
+ + Args: + cells: list of RNNCells that will be composed in this order. + state_is_tuple: If True, accepted and returned states are n-tuples, + where `n = len(cells)`. If False, the states are all concatenated + along the column axis. This latter behavior will soon be + deprecated. + + Raises: + ValueError: if cells is empty (not allowed), or at least one of the + cells returns a state tuple but the flag `state_is_tuple` is + `False`. + """ + logging.warning( + "`tf.nn.rnn_cell.MultiRNNCell` is deprecated. This class " + "is equivalent as `tf.keras.layers.StackedRNNCells`, " + "and will be replaced by that in Tensorflow 2.0." + ) + super().__init__() + if not cells: + raise ValueError("Must specify at least one cell for MultiRNNCell.") + if not tf.nest.is_nested(cells): + raise TypeError( + f"cells must be a list or tuple, but received: {cells}." + ) + + if len(set(id(cell) for cell in cells)) < len(cells): + logging.log_first_n( + logging.WARN, + "At least two cells provided to MultiRNNCell " + "are the same object and will share weights.", + 1, + ) + + self._cells = cells + for cell_number, cell in enumerate(self._cells): + # Add Trackable dependencies on these cells so their variables get + # saved with this object when using object-based saving. + if isinstance(cell, tf.__internal__.tracking.Trackable): + # TODO(allenl): Track down non-Trackable callers. + self._track_trackable(cell, name="cell-%d" % (cell_number,)) + self._state_is_tuple = state_is_tuple + if not state_is_tuple: + if any(tf.nest.is_nested(c.state_size) for c in self._cells): + raise ValueError( + "Some cells return tuples of states, but the flag " + "state_is_tuple is not set. " + f"State sizes are: {[c.state_size for c in self._cells]}" + ) + + @property + def state_size(self): if self._state_is_tuple: - if not tf.nest.is_nested(state): - raise ValueError( - f"Expected state to be a tuple of length {len(self.state_size)}" - f", but received: {state}") - cur_state = state[i] + return tuple(cell.state_size for cell in self._cells) else: - cur_state = tf.slice(state, [0, cur_state_pos], [-1, cell.state_size]) - cur_state_pos += cell.state_size - cur_inp, new_state = cell(cur_inp, cur_state) - new_states.append(new_state) - - new_states = ( - tuple(new_states) if self._state_is_tuple else tf.concat( - new_states, 1)) - - return cur_inp, new_states + return sum(cell.state_size for cell in self._cells) + + @property + def output_size(self): + return self._cells[-1].output_size + + def zero_state(self, batch_size, dtype): + with backend.name_scope(type(self).__name__ + "ZeroState"): + if self._state_is_tuple: + return tuple( + cell.zero_state(batch_size, dtype) for cell in self._cells + ) + else: + # We know here that state_size of each cell is not a tuple and + # presumably does not contain TensorArrays or anything else + # fancy + return super().zero_state(batch_size, dtype) + + @property + def trainable_weights(self): + if not self.trainable: + return [] + weights = [] + for cell in self._cells: + if isinstance(cell, base_layer.Layer): + weights += cell.trainable_weights + return weights + + @property + def non_trainable_weights(self): + weights = [] + for cell in self._cells: + if isinstance(cell, base_layer.Layer): + weights += cell.non_trainable_weights + if not self.trainable: + trainable_weights = [] + for cell in self._cells: + if isinstance(cell, base_layer.Layer): + trainable_weights += cell.trainable_weights + return trainable_weights + weights + return weights + + def call(self, inputs, state): + """Run this 
multi-layer cell on inputs, starting from state."""
+        cur_state_pos = 0
+        cur_inp = inputs
+        new_states = []
+        for i, cell in enumerate(self._cells):
+            with tf.compat.v1.variable_scope("cell_%d" % i):
+                if self._state_is_tuple:
+                    if not tf.nest.is_nested(state):
+                        raise ValueError(
+                            "Expected state to be a tuple of length "
+                            f"{len(self.state_size)}"
+                            f", but received: {state}"
+                        )
+                    cur_state = state[i]
+                else:
+                    cur_state = tf.slice(
+                        state, [0, cur_state_pos], [-1, cell.state_size]
+                    )
+                    cur_state_pos += cell.state_size
+                cur_inp, new_state = cell(cur_inp, cur_state)
+                new_states.append(new_state)
+
+        new_states = (
+            tuple(new_states)
+            if self._state_is_tuple
+            else tf.concat(new_states, 1)
+        )
+
+        return cur_inp, new_states


 def _check_rnn_cell_input_dtypes(inputs):
-  """Check whether the input tensors are with supported dtypes.
+    """Check whether the input tensors have supported dtypes.
-  Default RNN cells only support floats and complex as its dtypes since the
-  activation function (tanh and sigmoid) only allow those types. This function
-  will throw a proper error message if the inputs is not in a supported type.
+    Default RNN cells only support float and complex dtypes, since the
+    activation functions (tanh and sigmoid) only allow those types. This
+    function raises a descriptive error if an input is not of a supported
+    type.
-  Args:
-    inputs: tensor or nested structure of tensors that are feed to RNN cell as
-      input or state.
+    Args:
+      inputs: tensor or nested structure of tensors that are fed to the RNN
+        cell as input or state.
-  Raises:
-    ValueError: if any of the input tensor are not having dtypes of float or
-      complex.
-  """
-  for t in tf.nest.flatten(inputs):
-    _check_supported_dtypes(t.dtype)
+    Raises:
+      ValueError: if any of the input tensors does not have a float or
+        complex dtype.
+    """
+    for t in tf.nest.flatten(inputs):
+        _check_supported_dtypes(t.dtype)


 def _check_supported_dtypes(dtype):
-  if dtype is None:
-    return
-  dtype = tf.as_dtype(dtype)
-  if not (dtype.is_floating or dtype.is_complex):
-    raise ValueError("RNN cell only supports floating point inputs, "
-                     f"but received dtype: {dtype}")
+    if dtype is None:
+        return
+    dtype = tf.as_dtype(dtype)
+    if not (dtype.is_floating or dtype.is_complex):
+        raise ValueError(
+            "RNN cell only supports floating point inputs, "
+            f"but received dtype: {dtype}"
+        )
diff --git a/keras/layers/rnn/lstm.py b/keras/layers/rnn/lstm.py
index 30d08fbb5e53..47ae51f7e6a5 100644
--- a/keras/layers/rnn/lstm.py
+++ b/keras/layers/rnn/lstm.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 """Long Short-Term Memory layer."""
-# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import
+
 import uuid
+import tensorflow.compat.v2 as tf
+
 from keras import activations
 from keras import backend
 from keras import constraints
@@ -29,1158 +31,1314 @@
 from keras.layers.rnn.base_rnn import RNN
 from keras.layers.rnn.dropout_rnn_cell_mixin import DropoutRNNCellMixin
 from keras.utils import tf_utils
-import tensorflow.compat.v2 as tf
+# isort: off
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import keras_export
-
 RECURRENT_DROPOUT_WARNING_MSG = (
-    'RNN `implementation=2` is not supported when `recurrent_dropout` is set. '
-    'Using `implementation=1`.')
+    "RNN `implementation=2` is not supported when `recurrent_dropout` is set. "
+    "Using `implementation=1`."
+) -@keras_export('keras.layers.LSTMCell', v1=[]) +@keras_export("keras.layers.LSTMCell", v1=[]) class LSTMCell(DropoutRNNCellMixin, base_layer.BaseRandomLayer): - """Cell class for the LSTM layer. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - This class processes one step within the whole time sequence input, whereas - `tf.keras.layer.LSTM` processes the whole sequence. - - For example: - - >>> inputs = tf.random.normal([32, 10, 8]) - >>> rnn = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(4)) - >>> output = rnn(inputs) - >>> print(output.shape) - (32, 4) - >>> rnn = tf.keras.layers.RNN( - ... tf.keras.layers.LSTMCell(4), - ... return_sequences=True, - ... return_state=True) - >>> whole_seq_output, final_memory_state, final_carry_state = rnn(inputs) - >>> print(whole_seq_output.shape) - (32, 10, 4) - >>> print(final_memory_state.shape) - (32, 4) - >>> print(final_carry_state.shape) - (32, 4) - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. Default: hyperbolic tangent - (`tanh`). If you pass `None`, no activation is applied (ie. "linear" - activation: `a(x) = x`). - recurrent_activation: Activation function to use for the recurrent step. - Default: sigmoid (`sigmoid`). If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, (default `True`), whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. Default: `glorot_uniform`. - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - Default: `orthogonal`. - bias_initializer: Initializer for the bias vector. Default: `zeros`. - unit_forget_bias: Boolean (default `True`). If True, add 1 to the bias of - the forget gate at initialization. Setting it to true will also force - `bias_initializer="zeros"`. This is recommended in [Jozefowicz et - al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix. Default: `None`. - bias_regularizer: Regularizer function applied to the bias vector. Default: - `None`. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. Default: `None`. - bias_constraint: Constraint function applied to the bias vector. Default: - `None`. - dropout: Float between 0 and 1. Fraction of the units to drop for the linear - transformation of the inputs. Default: 0. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. Default: 0. - - Call arguments: - inputs: A 2D tensor, with shape of `[batch, feature]`. - states: List of 2 tensors that corresponding to the cell's units. Both of - them have shape `[batch, units]`, the first tensor is the memory state - from previous time step, the second tensor is the carry state from - previous time step. For timestep 0, the initial state provided by user - will be feed to cell. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. 
Only relevant when `dropout` or - `recurrent_dropout` is used. - """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - **kwargs): - if units < 0: - raise ValueError(f'Received an invalid value for argument `units`, ' - f'expected a positive integer, got {units}.') - # By default use cached variable under v2 mode, see b/143699808. - if tf.compat.v1.executing_eagerly_outside_functions(): - self._enable_caching_device = kwargs.pop('enable_caching_device', True) - else: - self._enable_caching_device = kwargs.pop('enable_caching_device', False) - super().__init__(**kwargs) - self.units = units - self.activation = activations.get(activation) - self.recurrent_activation = activations.get(recurrent_activation) - self.use_bias = use_bias - - self.kernel_initializer = initializers.get(kernel_initializer) - self.recurrent_initializer = initializers.get(recurrent_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.unit_forget_bias = unit_forget_bias - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.recurrent_regularizer = regularizers.get(recurrent_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.recurrent_constraint = constraints.get(recurrent_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - self.dropout = min(1., max(0., dropout)) - self.recurrent_dropout = min(1., max(0., recurrent_dropout)) - implementation = kwargs.pop('implementation', 2) - if self.recurrent_dropout != 0 and implementation != 1: - logging.debug(RECURRENT_DROPOUT_WARNING_MSG) - self.implementation = 1 - else: - self.implementation = implementation - self.state_size = [self.units, self.units] - self.output_size = self.units - - @tf_utils.shape_type_conversion - def build(self, input_shape): - default_caching_device = rnn_utils.caching_device(self) - input_dim = input_shape[-1] - self.kernel = self.add_weight( - shape=(input_dim, self.units * 4), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - caching_device=default_caching_device) - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units * 4), - name='recurrent_kernel', - initializer=self.recurrent_initializer, - regularizer=self.recurrent_regularizer, - constraint=self.recurrent_constraint, - caching_device=default_caching_device) - - if self.use_bias: - if self.unit_forget_bias: - - def bias_initializer(_, *args, **kwargs): - return backend.concatenate([ - self.bias_initializer((self.units,), *args, **kwargs), - initializers.get('ones')((self.units,), *args, **kwargs), - self.bias_initializer((self.units * 2,), *args, **kwargs), - ]) - else: - bias_initializer = self.bias_initializer - self.bias = self.add_weight( - shape=(self.units * 4,), - name='bias', - initializer=bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - caching_device=default_caching_device) - else: - self.bias = None - self.built = True - - def _compute_carry_and_output(self, x, h_tm1, c_tm1): - """Computes carry and 
output using split kernels.""" - x_i, x_f, x_c, x_o = x - h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1 - i = self.recurrent_activation( - x_i + backend.dot(h_tm1_i, self.recurrent_kernel[:, :self.units])) - f = self.recurrent_activation(x_f + backend.dot( - h_tm1_f, self.recurrent_kernel[:, self.units:self.units * 2])) - c = f * c_tm1 + i * self.activation(x_c + backend.dot( - h_tm1_c, self.recurrent_kernel[:, self.units * 2:self.units * 3])) - o = self.recurrent_activation( - x_o + backend.dot(h_tm1_o, self.recurrent_kernel[:, self.units * 3:])) - return c, o - - def _compute_carry_and_output_fused(self, z, c_tm1): - """Computes carry and output using fused kernels.""" - z0, z1, z2, z3 = z - i = self.recurrent_activation(z0) - f = self.recurrent_activation(z1) - c = f * c_tm1 + i * self.activation(z2) - o = self.recurrent_activation(z3) - return c, o - - def call(self, inputs, states, training=None): - h_tm1 = states[0] # previous memory state - c_tm1 = states[1] # previous carry state - - dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4) - rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( - h_tm1, training, count=4) - - if self.implementation == 1: - if 0 < self.dropout < 1.: - inputs_i = inputs * dp_mask[0] - inputs_f = inputs * dp_mask[1] - inputs_c = inputs * dp_mask[2] - inputs_o = inputs * dp_mask[3] - else: - inputs_i = inputs - inputs_f = inputs - inputs_c = inputs - inputs_o = inputs - k_i, k_f, k_c, k_o = tf.split( - self.kernel, num_or_size_splits=4, axis=1) - x_i = backend.dot(inputs_i, k_i) - x_f = backend.dot(inputs_f, k_f) - x_c = backend.dot(inputs_c, k_c) - x_o = backend.dot(inputs_o, k_o) - if self.use_bias: - b_i, b_f, b_c, b_o = tf.split( - self.bias, num_or_size_splits=4, axis=0) - x_i = backend.bias_add(x_i, b_i) - x_f = backend.bias_add(x_f, b_f) - x_c = backend.bias_add(x_c, b_c) - x_o = backend.bias_add(x_o, b_o) - - if 0 < self.recurrent_dropout < 1.: - h_tm1_i = h_tm1 * rec_dp_mask[0] - h_tm1_f = h_tm1 * rec_dp_mask[1] - h_tm1_c = h_tm1 * rec_dp_mask[2] - h_tm1_o = h_tm1 * rec_dp_mask[3] - else: - h_tm1_i = h_tm1 - h_tm1_f = h_tm1 - h_tm1_c = h_tm1 - h_tm1_o = h_tm1 - x = (x_i, x_f, x_c, x_o) - h_tm1 = (h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o) - c, o = self._compute_carry_and_output(x, h_tm1, c_tm1) - else: - if 0. 
< self.dropout < 1.: - inputs = inputs * dp_mask[0] - z = backend.dot(inputs, self.kernel) - z += backend.dot(h_tm1, self.recurrent_kernel) - if self.use_bias: - z = backend.bias_add(z, self.bias) - - z = tf.split(z, num_or_size_splits=4, axis=1) - c, o = self._compute_carry_and_output_fused(z, c_tm1) - - h = o * self.activation(c) - return h, [h, c] - - def get_config(self): - config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'recurrent_activation': - activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'unit_forget_bias': - self.unit_forget_bias, - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout, - 'implementation': - self.implementation - } - config.update(rnn_utils.config_for_enable_caching_device(self)) - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Cell class for the LSTM layer. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. + + This class processes one step within the whole time sequence input, whereas + `tf.keras.layer.LSTM` processes the whole sequence. + + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> rnn = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(4)) + >>> output = rnn(inputs) + >>> print(output.shape) + (32, 4) + >>> rnn = tf.keras.layers.RNN( + ... tf.keras.layers.LSTMCell(4), + ... return_sequences=True, + ... return_state=True) + >>> whole_seq_output, final_memory_state, final_carry_state = rnn(inputs) + >>> print(whole_seq_output.shape) + (32, 10, 4) + >>> print(final_memory_state.shape) + (32, 4) + >>> print(final_carry_state.shape) + (32, 4) + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. Default: hyperbolic tangent + (`tanh`). If you pass `None`, no activation is applied (ie. "linear" + activation: `a(x) = x`). + recurrent_activation: Activation function to use for the recurrent step. + Default: sigmoid (`sigmoid`). If you pass `None`, no activation is + applied (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, (default `True`), whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, used for + the linear transformation of the inputs. Default: `glorot_uniform`. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + Default: `orthogonal`. + bias_initializer: Initializer for the bias vector. Default: `zeros`. + unit_forget_bias: Boolean (default `True`). If True, add 1 to the bias of + the forget gate at initialization. Setting it to true will also force + `bias_initializer="zeros"`. 
This is recommended in [Jozefowicz et
+        al.](https://github.com/mlresearch/v37/blob/gh-pages/jozefowicz15.pdf)
+      kernel_regularizer: Regularizer function applied to the `kernel` weights
+        matrix. Default: `None`.
+      recurrent_regularizer: Regularizer function applied to
+        the `recurrent_kernel` weights matrix. Default: `None`.
+      bias_regularizer: Regularizer function applied to the bias vector.
+        Default: `None`.
+      kernel_constraint: Constraint function applied to the `kernel` weights
+        matrix. Default: `None`.
+      recurrent_constraint: Constraint function applied to the
+        `recurrent_kernel` weights matrix. Default: `None`.
+      bias_constraint: Constraint function applied to the bias vector.
+        Default: `None`.
+      dropout: Float between 0 and 1. Fraction of the units to drop for the
+        linear transformation of the inputs. Default: 0.
+      recurrent_dropout: Float between 0 and 1. Fraction of the units to drop
+        for the linear transformation of the recurrent state. Default: 0.
+
+    Call arguments:
+      inputs: A 2D tensor, with shape of `[batch, feature]`.
+      states: List of 2 tensors corresponding to the cell's units. Both of
+        them have shape `[batch, units]`; the first tensor is the memory
+        state from the previous time step, and the second tensor is the carry
+        state from the previous time step. For timestep 0, the initial state
+        provided by the user will be fed to the cell.
+      training: Python boolean indicating whether the layer should behave in
+        training mode or in inference mode. Only relevant when `dropout` or
+        `recurrent_dropout` is used.
+    """
+
+    def __init__(
+        self,
+        units,
+        activation="tanh",
+        recurrent_activation="sigmoid",
+        use_bias=True,
+        kernel_initializer="glorot_uniform",
+        recurrent_initializer="orthogonal",
+        bias_initializer="zeros",
+        unit_forget_bias=True,
+        kernel_regularizer=None,
+        recurrent_regularizer=None,
+        bias_regularizer=None,
+        kernel_constraint=None,
+        recurrent_constraint=None,
+        bias_constraint=None,
+        dropout=0.0,
+        recurrent_dropout=0.0,
+        **kwargs,
+    ):
+        if units <= 0:
+            raise ValueError(
+                "Received an invalid value for argument `units`, "
+                f"expected a positive integer, got {units}."
+            )
+        # By default use cached variable under v2 mode, see b/143699808.
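+        # (Caching the variable reads lets every step of the recurrent loop
+        # reuse a single read of the kernel/bias instead of re-reading them
+        # each timestep; callers can opt out by passing
+        # `enable_caching_device=False`.)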
+ if tf.compat.v1.executing_eagerly_outside_functions(): + self._enable_caching_device = kwargs.pop( + "enable_caching_device", True + ) + else: + self._enable_caching_device = kwargs.pop( + "enable_caching_device", False + ) + super().__init__(**kwargs) + self.units = units + self.activation = activations.get(activation) + self.recurrent_activation = activations.get(recurrent_activation) + self.use_bias = use_bias + + self.kernel_initializer = initializers.get(kernel_initializer) + self.recurrent_initializer = initializers.get(recurrent_initializer) + self.bias_initializer = initializers.get(bias_initializer) + self.unit_forget_bias = unit_forget_bias + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.recurrent_regularizer = regularizers.get(recurrent_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.recurrent_constraint = constraints.get(recurrent_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.dropout = min(1.0, max(0.0, dropout)) + self.recurrent_dropout = min(1.0, max(0.0, recurrent_dropout)) + implementation = kwargs.pop("implementation", 2) + if self.recurrent_dropout != 0 and implementation != 1: + logging.debug(RECURRENT_DROPOUT_WARNING_MSG) + self.implementation = 1 + else: + self.implementation = implementation + self.state_size = [self.units, self.units] + self.output_size = self.units + + @tf_utils.shape_type_conversion + def build(self, input_shape): + super().build(input_shape) + default_caching_device = rnn_utils.caching_device(self) + input_dim = input_shape[-1] + self.kernel = self.add_weight( + shape=(input_dim, self.units * 4), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + caching_device=default_caching_device, + ) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units * 4), + name="recurrent_kernel", + initializer=self.recurrent_initializer, + regularizer=self.recurrent_regularizer, + constraint=self.recurrent_constraint, + caching_device=default_caching_device, + ) + + if self.use_bias: + if self.unit_forget_bias: + + def bias_initializer(_, *args, **kwargs): + return backend.concatenate( + [ + self.bias_initializer( + (self.units,), *args, **kwargs + ), + initializers.get("ones")( + (self.units,), *args, **kwargs + ), + self.bias_initializer( + (self.units * 2,), *args, **kwargs + ), + ] + ) + + else: + bias_initializer = self.bias_initializer + self.bias = self.add_weight( + shape=(self.units * 4,), + name="bias", + initializer=bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + caching_device=default_caching_device, + ) + else: + self.bias = None + self.built = True + + def _compute_carry_and_output(self, x, h_tm1, c_tm1): + """Computes carry and output using split kernels.""" + x_i, x_f, x_c, x_o = x + h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1 + i = self.recurrent_activation( + x_i + backend.dot(h_tm1_i, self.recurrent_kernel[:, : self.units]) + ) + f = self.recurrent_activation( + x_f + + backend.dot( + h_tm1_f, self.recurrent_kernel[:, self.units : self.units * 2] + ) + ) + c = f * c_tm1 + i * self.activation( + x_c + + backend.dot( + h_tm1_c, + self.recurrent_kernel[:, self.units * 2 : self.units * 3], + ) + ) + o = self.recurrent_activation( + x_o + + backend.dot(h_tm1_o, self.recurrent_kernel[:, self.units * 3 :]) + ) + return c, o + + def 
_compute_carry_and_output_fused(self, z, c_tm1): + """Computes carry and output using fused kernels.""" + z0, z1, z2, z3 = z + i = self.recurrent_activation(z0) + f = self.recurrent_activation(z1) + c = f * c_tm1 + i * self.activation(z2) + o = self.recurrent_activation(z3) + return c, o + + def call(self, inputs, states, training=None): + h_tm1 = states[0] # previous memory state + c_tm1 = states[1] # previous carry state + + dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4) + rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( + h_tm1, training, count=4 + ) + + if self.implementation == 1: + if 0 < self.dropout < 1.0: + inputs_i = inputs * dp_mask[0] + inputs_f = inputs * dp_mask[1] + inputs_c = inputs * dp_mask[2] + inputs_o = inputs * dp_mask[3] + else: + inputs_i = inputs + inputs_f = inputs + inputs_c = inputs + inputs_o = inputs + k_i, k_f, k_c, k_o = tf.split( + self.kernel, num_or_size_splits=4, axis=1 + ) + x_i = backend.dot(inputs_i, k_i) + x_f = backend.dot(inputs_f, k_f) + x_c = backend.dot(inputs_c, k_c) + x_o = backend.dot(inputs_o, k_o) + if self.use_bias: + b_i, b_f, b_c, b_o = tf.split( + self.bias, num_or_size_splits=4, axis=0 + ) + x_i = backend.bias_add(x_i, b_i) + x_f = backend.bias_add(x_f, b_f) + x_c = backend.bias_add(x_c, b_c) + x_o = backend.bias_add(x_o, b_o) + + if 0 < self.recurrent_dropout < 1.0: + h_tm1_i = h_tm1 * rec_dp_mask[0] + h_tm1_f = h_tm1 * rec_dp_mask[1] + h_tm1_c = h_tm1 * rec_dp_mask[2] + h_tm1_o = h_tm1 * rec_dp_mask[3] + else: + h_tm1_i = h_tm1 + h_tm1_f = h_tm1 + h_tm1_c = h_tm1 + h_tm1_o = h_tm1 + x = (x_i, x_f, x_c, x_o) + h_tm1 = (h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o) + c, o = self._compute_carry_and_output(x, h_tm1, c_tm1) + else: + if 0.0 < self.dropout < 1.0: + inputs = inputs * dp_mask[0] + z = backend.dot(inputs, self.kernel) + z += backend.dot(h_tm1, self.recurrent_kernel) + if self.use_bias: + z = backend.bias_add(z, self.bias) + + z = tf.split(z, num_or_size_splits=4, axis=1) + c, o = self._compute_carry_and_output_fused(z, c_tm1) + + h = o * self.activation(c) + return h, [h, c] + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "unit_forget_bias": self.unit_forget_bias, + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + "implementation": self.implementation, + } + config.update(rnn_utils.config_for_enable_caching_device(self)) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - return list(rnn_utils.generate_zero_filled_state_for_cell( - self, inputs, batch_size, dtype)) + def get_initial_state(self, inputs=None, 
batch_size=None, dtype=None): + return list( + rnn_utils.generate_zero_filled_state_for_cell( + self, inputs, batch_size, dtype + ) + ) -@keras_export('keras.layers.LSTM', v1=[]) +@keras_export("keras.layers.LSTM", v1=[]) class LSTM(DropoutRNNCellMixin, RNN, base_layer.BaseRandomLayer): - """Long Short-Term Memory layer - Hochreiter 1997. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - Based on available runtime hardware and constraints, this layer - will choose different implementations (cuDNN-based or pure-TensorFlow) - to maximize the performance. If a GPU is available and all - the arguments to the layer meet the requirement of the cuDNN kernel - (see below for details), the layer will use a fast cuDNN implementation. - - The requirements to use the cuDNN implementation are: - - 1. `activation` == `tanh` - 2. `recurrent_activation` == `sigmoid` - 3. `recurrent_dropout` == 0 - 4. `unroll` is `False` - 5. `use_bias` is `True` - 6. Inputs, if use masking, are strictly right-padded. - 7. Eager execution is enabled in the outermost context. - - For example: - - >>> inputs = tf.random.normal([32, 10, 8]) - >>> lstm = tf.keras.layers.LSTM(4) - >>> output = lstm(inputs) - >>> print(output.shape) - (32, 4) - >>> lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True) - >>> whole_seq_output, final_memory_state, final_carry_state = lstm(inputs) - >>> print(whole_seq_output.shape) - (32, 10, 4) - >>> print(final_memory_state.shape) - (32, 4) - >>> print(final_carry_state.shape) - (32, 4) - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation - is applied (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use for the recurrent step. - Default: sigmoid (`sigmoid`). If you pass `None`, no activation is - applied (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean (default `True`), whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, used for - the linear transformation of the inputs. Default: `glorot_uniform`. - recurrent_initializer: Initializer for the `recurrent_kernel` weights - matrix, used for the linear transformation of the recurrent state. - Default: `orthogonal`. - bias_initializer: Initializer for the bias vector. Default: `zeros`. - unit_forget_bias: Boolean (default `True`). If True, add 1 to the bias of - the forget gate at initialization. Setting it to true will also force - `bias_initializer="zeros"`. This is recommended in [Jozefowicz et - al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. Default: `None`. - bias_regularizer: Regularizer function applied to the bias vector. Default: - `None`. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation"). Default: `None`. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. Default: `None`. - bias_constraint: Constraint function applied to the bias vector. Default: - `None`. - dropout: Float between 0 and 1. 
Fraction of the units to drop for the linear - transformation of the inputs. Default: 0. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. Default: 0. - return_sequences: Boolean. Whether to return the last output in the output - sequence, or the full sequence. Default: `False`. - return_state: Boolean. Whether to return the last state in addition to the - output. Default: `False`. - go_backwards: Boolean (default `False`). If True, process the input sequence - backwards and return the reversed sequence. - stateful: Boolean (default `False`). If True, the last state for each sample - at index i in a batch will be used as initial state for the sample of - index i in the following batch. - time_major: The shape format of the `inputs` and `outputs` tensors. - If True, the inputs and outputs will be in shape - `[timesteps, batch, feature]`, whereas in the False case, it will be - `[batch, timesteps, feature]`. Using `time_major = True` is a bit more - efficient because it avoids transposes at the beginning and end of the - RNN calculation. However, most TensorFlow data is batch-major, so by - default this function accepts input and emits output in batch-major - form. - unroll: Boolean (default `False`). If True, the network will be unrolled, - else a symbolic loop will be used. Unrolling can speed-up a RNN, although - it tends to be more memory-intensive. Unrolling is only suitable for short - sequences. - - Call arguments: - inputs: A 3D tensor with shape `[batch, timesteps, feature]`. - mask: Binary tensor of shape `[batch, timesteps]` indicating whether - a given timestep should be masked (optional, defaults to `None`). - An individual `True` entry indicates that the corresponding timestep - should be utilized, while a `False` entry indicates that the corresponding - timestep should be ignored. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is only relevant if `dropout` or - `recurrent_dropout` is used (optional, defaults to `None`). - initial_state: List of initial state tensors to be passed to the first - call of the cell (optional, defaults to `None` which causes creation - of zero-filled initial state tensors). - """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - time_major=False, - unroll=False, - **kwargs): - # return_runtime is a flag for testing, which shows the real backend - # implementation chosen by grappler in graph mode. - self.return_runtime = kwargs.pop('return_runtime', False) - implementation = kwargs.pop('implementation', 2) - if implementation == 0: - logging.warning('`implementation=0` has been deprecated, ' - 'and now defaults to `implementation=1`.' 
- 'Please update your layer call.') - if 'enable_caching_device' in kwargs: - cell_kwargs = {'enable_caching_device': - kwargs.pop('enable_caching_device')} - else: - cell_kwargs = {} - cell = LSTMCell( + """Long Short-Term Memory layer - Hochreiter 1997. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. + + Based on available runtime hardware and constraints, this layer + will choose different implementations (cuDNN-based or pure-TensorFlow) + to maximize the performance. If a GPU is available and all + the arguments to the layer meet the requirement of the cuDNN kernel + (see below for details), the layer will use a fast cuDNN implementation. + + The requirements to use the cuDNN implementation are: + + 1. `activation` == `tanh` + 2. `recurrent_activation` == `sigmoid` + 3. `recurrent_dropout` == 0 + 4. `unroll` is `False` + 5. `use_bias` is `True` + 6. Inputs, if use masking, are strictly right-padded. + 7. Eager execution is enabled in the outermost context. + + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> lstm = tf.keras.layers.LSTM(4) + >>> output = lstm(inputs) + >>> print(output.shape) + (32, 4) + >>> lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True) + >>> whole_seq_output, final_memory_state, final_carry_state = lstm(inputs) + >>> print(whole_seq_output.shape) + (32, 10, 4) + >>> print(final_memory_state.shape) + (32, 4) + >>> print(final_carry_state.shape) + (32, 4) + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation + is applied (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use for the recurrent step. + Default: sigmoid (`sigmoid`). If you pass `None`, no activation is + applied (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean (default `True`), whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, used for + the linear transformation of the inputs. Default: `glorot_uniform`. + recurrent_initializer: Initializer for the `recurrent_kernel` weights + matrix, used for the linear transformation of the recurrent state. + Default: `orthogonal`. + bias_initializer: Initializer for the bias vector. Default: `zeros`. + unit_forget_bias: Boolean (default `True`). If True, add 1 to the bias of + the forget gate at initialization. Setting it to true will also force + `bias_initializer="zeros"`. This is recommended in [Jozefowicz et + al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_regularizer: Regularizer function applied to the bias vector. + Default: `None`. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). Default: `None`. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_constraint: Constraint function applied to the bias vector. Default: + `None`. + dropout: Float between 0 and 1. Fraction of the units to drop for the + linear transformation of the inputs. Default: 0. 
+ recurrent_dropout: Float between 0 and 1. Fraction of the units to drop + for the linear transformation of the recurrent state. Default: 0. + return_sequences: Boolean. Whether to return the last output in the output + sequence, or the full sequence. Default: `False`. + return_state: Boolean. Whether to return the last state in addition to the + output. Default: `False`. + go_backwards: Boolean (default `False`). If True, process the input + sequence backwards and return the reversed sequence. + stateful: Boolean (default `False`). If True, the last state for each + sample at index i in a batch will be used as initial state for the sample + of index i in the following batch. + time_major: The shape format of the `inputs` and `outputs` tensors. + If True, the inputs and outputs will be in shape + `[timesteps, batch, feature]`, whereas in the False case, it will be + `[batch, timesteps, feature]`. Using `time_major = True` is a bit more + efficient because it avoids transposes at the beginning and end of the + RNN calculation. However, most TensorFlow data is batch-major, so by + default this function accepts input and emits output in batch-major + form. + unroll: Boolean (default `False`). If True, the network will be unrolled, + else a symbolic loop will be used. Unrolling can speed-up a RNN, + although it tends to be more memory-intensive. Unrolling is only + suitable for short sequences. + + Call arguments: + inputs: A 3D tensor with shape `[batch, timesteps, feature]`. + mask: Binary tensor of shape `[batch, timesteps]` indicating whether + a given timestep should be masked (optional). + An individual `True` entry indicates that the corresponding timestep + should be utilized, while a `False` entry indicates that the + corresponding timestep should be ignored. Defaults to `None`. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is only relevant if `dropout` or + `recurrent_dropout` is used (optional). Defaults to `None`. + initial_state: List of initial state tensors to be passed to the first + call of the cell (optional, `None` causes creation + of zero-filled initial state tensors). Defaults to `None`. 
+ """ + + def __init__( + self, units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - unit_forget_bias=unit_forget_bias, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=implementation, - dtype=kwargs.get('dtype'), - trainable=kwargs.get('trainable', True), - **cell_kwargs) - super().__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - time_major=time_major, - unroll=unroll, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.input_spec = [InputSpec(ndim=3)] - self.state_spec = [ - InputSpec(shape=(None, dim)) for dim in (self.units, self.units) - ] - self._could_use_gpu_kernel = ( - self.activation in (activations.tanh, tf.tanh) and - self.recurrent_activation in (activations.sigmoid, tf.sigmoid) and - recurrent_dropout == 0 and not unroll and use_bias and - tf.compat.v1.executing_eagerly_outside_functions()) - if tf.config.list_logical_devices('GPU'): - # Only show the message when there is GPU available, user will not care - # about the cuDNN if there isn't any GPU. - if self._could_use_gpu_kernel: - logging.debug(gru_lstm_utils.CUDNN_AVAILABLE_MSG % self.name) - else: - logging.warning(gru_lstm_utils.CUDNN_NOT_AVAILABLE_MSG % self.name) - - if gru_lstm_utils.use_new_gru_lstm_impl(): - self._defun_wrapper = gru_lstm_utils.DefunWrapper( - time_major, go_backwards, 'lstm') + activation="tanh", + recurrent_activation="sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + time_major=False, + unroll=False, + **kwargs, + ): + # return_runtime is a flag for testing, which shows the real backend + # implementation chosen by grappler in graph mode. + self.return_runtime = kwargs.pop("return_runtime", False) + implementation = kwargs.pop("implementation", 2) + if implementation == 0: + logging.warning( + "`implementation=0` has been deprecated, " + "and now defaults to `implementation=1`." + "Please update your layer call." 
+ ) + if "enable_caching_device" in kwargs: + cell_kwargs = { + "enable_caching_device": kwargs.pop("enable_caching_device") + } + else: + cell_kwargs = {} + cell = LSTMCell( + units, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + unit_forget_bias=unit_forget_bias, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + implementation=implementation, + dtype=kwargs.get("dtype"), + trainable=kwargs.get("trainable", True), + name="lstm_cell", + **cell_kwargs, + ) + super().__init__( + cell, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + time_major=time_major, + unroll=unroll, + **kwargs, + ) + self.activity_regularizer = regularizers.get(activity_regularizer) + self.input_spec = [InputSpec(ndim=3)] + self.state_spec = [ + InputSpec(shape=(None, dim)) for dim in (self.units, self.units) + ] + self._could_use_gpu_kernel = ( + self.activation in (activations.tanh, tf.tanh) + and self.recurrent_activation in (activations.sigmoid, tf.sigmoid) + and recurrent_dropout == 0 + and not unroll + and use_bias + and tf.compat.v1.executing_eagerly_outside_functions() + ) + if tf.config.list_logical_devices("GPU"): + # Only show the message when there is GPU available, user will not + # care about the cuDNN if there isn't any GPU. + if self._could_use_gpu_kernel: + logging.debug(gru_lstm_utils.CUDNN_AVAILABLE_MSG % self.name) + else: + logging.warning( + gru_lstm_utils.CUDNN_NOT_AVAILABLE_MSG % self.name + ) + + if gru_lstm_utils.use_new_gru_lstm_impl(): + self._defun_wrapper = gru_lstm_utils.DefunWrapper( + time_major, go_backwards, "lstm" + ) + + def call(self, inputs, mask=None, training=None, initial_state=None): + # The input should be dense, padded with zeros. If a ragged input is fed + # into the layer, it is padded and the row lengths are used for masking. + inputs, row_lengths = backend.convert_inputs_if_ragged(inputs) + is_ragged_input = row_lengths is not None + self._validate_args_if_ragged(is_ragged_input, mask) + + # LSTM does not support constants. Ignore it during process. + inputs, initial_state, _ = self._process_inputs( + inputs, initial_state, None + ) + + if isinstance(mask, list): + mask = mask[0] + + input_shape = backend.int_shape(inputs) + timesteps = input_shape[0] if self.time_major else input_shape[1] + + if not self._could_use_gpu_kernel: + # Fall back to use the normal LSTM. + kwargs = {"training": training} + self._maybe_reset_cell_dropout_mask(self.cell) + + def step(inputs, states): + return self.cell(inputs, states, **kwargs) + + last_output, outputs, states = backend.rnn( + step, + inputs, + initial_state, + constants=None, + go_backwards=self.go_backwards, + mask=mask, + unroll=self.unroll, + input_length=row_lengths + if row_lengths is not None + else timesteps, + time_major=self.time_major, + zero_output_for_mask=self.zero_output_for_mask, + return_all_outputs=self.return_sequences, + ) + runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN) + else: + # Use the new defun approach for backend implementation swap. 
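+            # Whether the cuDNN or the generic kernel actually runs is
+            # decided later: eagerly via the device-placement check below,
+            # or by grappler when running in graph mode.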
+ # Note that different implementations need to have same function + # signature, eg, the tensor parameters need to have same shape and + # dtypes. Since the cuDNN has an extra set of bias, those bias will + # be passed to both normal and cuDNN implementations. + self.reset_dropout_mask() + dropout_mask = self.get_dropout_mask_for_cell( + inputs, training, count=4 + ) + if dropout_mask is not None: + inputs = inputs * dropout_mask[0] + if gru_lstm_utils.use_new_gru_lstm_impl(): + lstm_kwargs = { + "inputs": inputs, + "init_h": gru_lstm_utils.read_variable_value( + initial_state[0] + ), + "init_c": gru_lstm_utils.read_variable_value( + initial_state[1] + ), + "kernel": gru_lstm_utils.read_variable_value( + self.cell.kernel + ), + "recurrent_kernel": gru_lstm_utils.read_variable_value( + self.cell.recurrent_kernel + ), + "bias": gru_lstm_utils.read_variable_value(self.cell.bias), + "mask": mask, + "time_major": self.time_major, + "go_backwards": self.go_backwards, + "sequence_lengths": row_lengths, + "zero_output_for_mask": self.zero_output_for_mask, + } + ( + last_output, + outputs, + new_h, + new_c, + runtime, + ) = self._defun_wrapper.defun_layer(**lstm_kwargs) + else: + gpu_lstm_kwargs = { + "inputs": inputs, + "init_h": gru_lstm_utils.read_variable_value( + initial_state[0] + ), + "init_c": gru_lstm_utils.read_variable_value( + initial_state[1] + ), + "kernel": gru_lstm_utils.read_variable_value( + self.cell.kernel + ), + "recurrent_kernel": gru_lstm_utils.read_variable_value( + self.cell.recurrent_kernel + ), + "bias": gru_lstm_utils.read_variable_value(self.cell.bias), + "mask": mask, + "time_major": self.time_major, + "go_backwards": self.go_backwards, + "sequence_lengths": row_lengths, + "return_sequences": self.return_sequences, + } + normal_lstm_kwargs = gpu_lstm_kwargs.copy() + normal_lstm_kwargs.update( + { + "zero_output_for_mask": self.zero_output_for_mask, + } + ) + + if tf.executing_eagerly(): + device_type = gru_lstm_utils.get_context_device_type() + can_use_gpu = ( + # Either user specified GPU or unspecified but GPU is + # available. + ( + device_type == gru_lstm_utils.GPU_DEVICE_NAME + or ( + device_type is None + and tf.config.list_logical_devices("GPU") + ) + ) + and gru_lstm_utils.is_cudnn_supported_inputs( + mask, self.time_major, row_lengths + ) + ) + # Under eager context, check the device placement and prefer + # the GPU implementation when GPU is available. + if can_use_gpu: + last_output, outputs, new_h, new_c, runtime = gpu_lstm( + **gpu_lstm_kwargs + ) + else: + ( + last_output, + outputs, + new_h, + new_c, + runtime, + ) = standard_lstm(**normal_lstm_kwargs) + else: + ( + last_output, + outputs, + new_h, + new_c, + runtime, + ) = lstm_with_backend_selection(**normal_lstm_kwargs) + + states = [new_h, new_c] + + if self.stateful: + updates = [ + tf.compat.v1.assign( + self_state, tf.cast(state, self_state.dtype) + ) + for self_state, state in zip(self.states, states) + ] + self.add_update(updates) + + if self.return_sequences: + output = backend.maybe_convert_to_ragged( + is_ragged_input, + outputs, + row_lengths, + go_backwards=self.go_backwards, + ) + else: + output = last_output - def call(self, inputs, mask=None, training=None, initial_state=None): - # The input should be dense, padded with zeros. If a ragged input is fed - # into the layer, it is padded and the row lengths are used for masking. 
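The ragged-input handling described at the top of `call` is easiest to see from the caller's side; a small sketch (the shapes in the comments are the expected ones):

import tensorflow as tf

# Two sequences of different lengths, ragged in the time dimension.
x = tf.ragged.constant(
    [[[1.0, 2.0], [3.0, 4.0]],
     [[5.0, 6.0]]],
    ragged_rank=1,
)
layer = tf.keras.layers.LSTM(3, return_sequences=True)
y = layer(x)
# The input is densified internally and the row lengths act as the mask;
# with return_sequences=True the output is converted back to ragged.
print(y.shape)  # (2, None, 3)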
- inputs, row_lengths = backend.convert_inputs_if_ragged(inputs) - is_ragged_input = (row_lengths is not None) - self._validate_args_if_ragged(is_ragged_input, mask) + if self.return_state: + return [output] + list(states) + elif self.return_runtime: + return output, runtime + else: + return output + + @property + def units(self): + return self.cell.units + + @property + def activation(self): + return self.cell.activation + + @property + def recurrent_activation(self): + return self.cell.recurrent_activation + + @property + def use_bias(self): + return self.cell.use_bias + + @property + def kernel_initializer(self): + return self.cell.kernel_initializer + + @property + def recurrent_initializer(self): + return self.cell.recurrent_initializer + + @property + def bias_initializer(self): + return self.cell.bias_initializer + + @property + def unit_forget_bias(self): + return self.cell.unit_forget_bias + + @property + def kernel_regularizer(self): + return self.cell.kernel_regularizer + + @property + def recurrent_regularizer(self): + return self.cell.recurrent_regularizer + + @property + def bias_regularizer(self): + return self.cell.bias_regularizer + + @property + def kernel_constraint(self): + return self.cell.kernel_constraint + + @property + def recurrent_constraint(self): + return self.cell.recurrent_constraint + + @property + def bias_constraint(self): + return self.cell.bias_constraint + + @property + def dropout(self): + return self.cell.dropout + + @property + def recurrent_dropout(self): + return self.cell.recurrent_dropout + + @property + def implementation(self): + return self.cell.implementation + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "unit_forget_bias": self.unit_forget_bias, + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + "implementation": self.implementation, + } + config.update(rnn_utils.config_for_enable_caching_device(self.cell)) + base_config = super().get_config() + del base_config["cell"] + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + if "implementation" in config and config["implementation"] == 0: + config["implementation"] = 1 + return cls(**config) + + +def standard_lstm( + inputs, + init_h, + init_c, + kernel, + recurrent_kernel, + bias, + mask, + time_major, + go_backwards, + sequence_lengths, + zero_output_for_mask, + return_sequences, +): + """LSTM with standard kernel implementation. + + This implementation can be run on all types for hardware. 
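Stepping back to the `get_config`/`from_config` pair defined just above: because every hyperparameter property proxies to the wrapped cell, a config round trip reconstructs an equivalent layer, and the legacy `implementation=0` is upgraded on the way in. A sketch:

import tensorflow as tf

layer = tf.keras.layers.LSTM(4, dropout=0.1)
clone = tf.keras.layers.LSTM.from_config(layer.get_config())
assert clone.units == layer.units
assert clone.dropout == layer.dropout

# from_config rewrites the deprecated implementation=0 to 1.
cfg = layer.get_config()
cfg["implementation"] = 0
assert tf.keras.layers.LSTM.from_config(cfg).implementation == 1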
+
+    This implementation lifts out all the layer weights and makes them
+    function parameters. It has the same number of tensor input params as the
+    cuDNN counterpart. The RNN step logic has been simplified, e.g. dropout
+    and masking are removed since the cuDNN implementation does not support
+    them.
+
+    Note that the first half of the bias tensor should be ignored by this
+    impl. The cuDNN impl needs an extra set of input gate bias. In order to
+    make both functions take the same shape of parameters, that extra set of
+    bias is also fed here.
+
+    Args:
+        inputs: input tensor of LSTM layer.
+        init_h: initial state tensor for the cell output.
+        init_c: initial state tensor for the cell hidden state.
+        kernel: weights for cell kernel.
+        recurrent_kernel: weights for cell recurrent kernel.
+        bias: weights for cell kernel bias and recurrent bias. Only recurrent
+            bias is used in this case.
+        mask: Boolean tensor for masking out the steps within the sequence.
+            An individual `True` entry indicates that the corresponding
+            timestep should be utilized, while a `False` entry indicates that
+            the corresponding timestep should be ignored.
+        time_major: boolean, whether the inputs are in the format of
+            [time, batch, feature] or [batch, time, feature].
+        go_backwards: Boolean (default False). If True, process the input
+            sequence backwards and return the reversed sequence.
+        sequence_lengths: The lengths of all sequences coming from a variable
+            length input, such as ragged tensors. If the input has a fixed
+            timestep size, this should be None.
+        zero_output_for_mask: Boolean, whether to output zero for masked
+            timestep.
+        return_sequences: Boolean. If True, return the recurrent outputs for
+            all timesteps in the sequence. If False, only return the output
+            for the last timestep (which consumes less memory).
+
+    Returns:
+        last_output: output tensor for the last timestep, which has shape
+            [batch, units].
+        outputs:
+            - If `return_sequences=True`: output tensor for all timesteps,
+                which has shape [batch, time, units].
+            - Else, a tensor equal to `last_output` with shape
+                [batch, 1, units].
+        state_0: the cell output, which has same shape as init_h.
+        state_1: the cell hidden state, which has same shape as init_c.
+        runtime: constant string tensor which indicates real runtime
+            hardware. This value is for testing purposes and should not be
+            used by the user.
+    """
+    input_shape = backend.int_shape(inputs)
+    timesteps = input_shape[0] if time_major else input_shape[1]
-    # LSTM does not support constants. Ignore it during process.
-    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    def step(cell_inputs, cell_states):
+        """Step function that will be used by Keras RNN backend."""
+        h_tm1 = cell_states[0]  # previous memory state
+        c_tm1 = cell_states[1]  # previous carry state
-    if isinstance(mask, list):
-      mask = mask[0]
+
+        z = backend.dot(cell_inputs, kernel)
+        z += backend.dot(h_tm1, recurrent_kernel)
+        z = backend.bias_add(z, bias)
-    input_shape = backend.int_shape(inputs)
-    timesteps = input_shape[0] if self.time_major else input_shape[1]
-
-    if not self._could_use_gpu_kernel:
-      # Fall back to use the normal LSTM.
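The `step` closure being assembled here (its gate computation continues below) is the whole cell math for the generic kernel. A NumPy transcription of the complete step may make the gate layout, `[i | f | c | o]` along the split axis, easier to follow; this is an illustrative sketch, not the code the backend runs:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_tm1, c_tm1, kernel, recurrent_kernel, bias):
    # z stacks all four gate pre-activations, each of width `units`.
    z = x_t @ kernel + h_tm1 @ recurrent_kernel + bias
    z0, z1, z2, z3 = np.split(z, 4, axis=-1)
    i = sigmoid(z0)                   # input gate
    f = sigmoid(z1)                   # forget gate
    c = f * c_tm1 + i * np.tanh(z2)   # new carry state
    o = sigmoid(z3)                   # output gate
    h = o * np.tanh(c)                # new hidden/output state
    return h, [h, c]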
- kwargs = {'training': training} - self._maybe_reset_cell_dropout_mask(self.cell) - - def step(inputs, states): - return self.cell(inputs, states, **kwargs) - - last_output, outputs, states = backend.rnn( - step, - inputs, - initial_state, - constants=None, - go_backwards=self.go_backwards, - mask=mask, - unroll=self.unroll, - input_length=row_lengths if row_lengths is not None else timesteps, - time_major=self.time_major, - zero_output_for_mask=self.zero_output_for_mask, - return_all_outputs=self.return_sequences) - runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN) - else: - # Use the new defun approach for backend implementation swap. - # Note that different implementations need to have same function - # signature, eg, the tensor parameters need to have same shape and dtypes. - # Since the cuDNN has an extra set of bias, those bias will be passed to - # both normal and cuDNN implementations. - self.reset_dropout_mask() - dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=4) - if dropout_mask is not None: - inputs = inputs * dropout_mask[0] - if gru_lstm_utils.use_new_gru_lstm_impl(): - lstm_kwargs = { - 'inputs': - inputs, - 'init_h': - gru_lstm_utils.read_variable_value(initial_state[0]), - 'init_c': - gru_lstm_utils.read_variable_value(initial_state[1]), - 'kernel': - gru_lstm_utils.read_variable_value(self.cell.kernel), - 'recurrent_kernel': - gru_lstm_utils.read_variable_value(self.cell.recurrent_kernel), - 'bias': - gru_lstm_utils.read_variable_value(self.cell.bias), - 'mask': - mask, - 'time_major': - self.time_major, - 'go_backwards': - self.go_backwards, - 'sequence_lengths': - row_lengths, - 'zero_output_for_mask': - self.zero_output_for_mask, - } - (last_output, outputs, new_h, new_c, - runtime) = self._defun_wrapper.defun_layer(**lstm_kwargs) - else: - gpu_lstm_kwargs = { - 'inputs': - inputs, - 'init_h': - gru_lstm_utils.read_variable_value(initial_state[0]), - 'init_c': - gru_lstm_utils.read_variable_value(initial_state[1]), - 'kernel': - gru_lstm_utils.read_variable_value(self.cell.kernel), - 'recurrent_kernel': - gru_lstm_utils.read_variable_value(self.cell.recurrent_kernel), - 'bias': - gru_lstm_utils.read_variable_value(self.cell.bias), - 'mask': - mask, - 'time_major': - self.time_major, - 'go_backwards': - self.go_backwards, - 'sequence_lengths': - row_lengths, - 'return_sequences': - self.return_sequences - } - normal_lstm_kwargs = gpu_lstm_kwargs.copy() - normal_lstm_kwargs.update({ - 'zero_output_for_mask': self.zero_output_for_mask, - }) - - if tf.executing_eagerly(): - device_type = gru_lstm_utils.get_context_device_type() - can_use_gpu = ( - # Either user specified GPU or unspecified but GPU is available. - (device_type == gru_lstm_utils.GPU_DEVICE_NAME or - (device_type is None - and tf.config.list_logical_devices('GPU'))) and - (mask is None or - gru_lstm_utils.is_cudnn_supported_inputs(mask, self.time_major))) - # Under eager context, check the device placement and prefer the - # GPU implementation when GPU is available. 
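From the user's side, this eager-mode device preference (kept intact in the new code above) keys off the active device scope; roughly, as a sketch:

import tensorflow as tf

layer = tf.keras.layers.LSTM(4)
x = tf.random.normal([2, 5, 3])

# An explicit CPU scope pins the call to the generic standard_lstm kernel,
# even on a machine with a GPU.
with tf.device("/cpu:0"):
    y_cpu = layer(x)

# Outside the scope, gpu_lstm is preferred whenever a GPU is visible and
# the mask (if any) is cuDNN-compatible.
y = layer(x)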
- if can_use_gpu: - last_output, outputs, new_h, new_c, runtime = gpu_lstm( - **gpu_lstm_kwargs) - else: - last_output, outputs, new_h, new_c, runtime = standard_lstm( - **normal_lstm_kwargs) - else: - (last_output, outputs, new_h, new_c, - runtime) = lstm_with_backend_selection(**normal_lstm_kwargs) + z0, z1, z2, z3 = tf.split(z, 4, axis=1) - states = [new_h, new_c] + i = tf.sigmoid(z0) + f = tf.sigmoid(z1) + c = f * c_tm1 + i * tf.tanh(z2) + o = tf.sigmoid(z3) - if self.stateful: - updates = [ - tf.compat.v1.assign(self_state, tf.cast(state, self_state.dtype)) - for self_state, state in zip(self.states, states) - ] - self.add_update(updates) + h = o * tf.tanh(c) + return h, [h, c] - if self.return_sequences: - output = backend.maybe_convert_to_ragged( - is_ragged_input, outputs, row_lengths, go_backwards=self.go_backwards) + last_output, outputs, new_states = backend.rnn( + step, + inputs, + [init_h, init_c], + constants=None, + unroll=False, + time_major=time_major, + mask=mask, + go_backwards=go_backwards, + input_length=( + sequence_lengths if sequence_lengths is not None else timesteps + ), + zero_output_for_mask=zero_output_for_mask, + return_all_outputs=return_sequences, + ) + return ( + last_output, + outputs, + new_states[0], + new_states[1], + gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_CPU), + ) + + +def gpu_lstm( + inputs, + init_h, + init_c, + kernel, + recurrent_kernel, + bias, + mask, + time_major, + go_backwards, + sequence_lengths, + return_sequences, +): + """LSTM with either cuDNN or ROCm implementation which is only available for + GPU. + + Note that currently only right padded data is supported, or the result will + be polluted by the unmasked data which should be filtered. + + Args: + inputs: Input tensor of LSTM layer. + init_h: Initial state tensor for the cell output. + init_c: Initial state tensor for the cell hidden state. + kernel: Weights for cell kernel. + recurrent_kernel: Weights for cell recurrent kernel. + bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias + is used in this case. + mask: Boolean tensor for mask out the steps within sequence. An individual + `True` entry indicates that the corresponding timestep should be + utilized, while a `False` entry indicates that the corresponding + timestep should be ignored. + time_major: Boolean, whether the inputs are in the format of [time, batch, + feature] or [batch, time, feature]. + go_backwards: Boolean (default False). If True, process the input sequence + backwards and return the reversed sequence. + sequence_lengths: The lengths of all sequences coming from a variable + length input, such as ragged tensors. If the input has a fixed timestep + size, this should be None. + return_sequences: Boolean. If True, return the recurrent outputs for all + timesteps in the sequence. If False, only return the output for the + last timestep, matching the CPU function output format. + + Returns: + last_output: Output tensor for the last timestep, which has shape + [batch, units]. + outputs: + - If `return_sequences=True`: output tensor for all timesteps, + which has shape [batch, time, units]. + - Else, a tensor equal to `last_output` with shape [batch, 1, units] + state_0: The cell output, which has same shape as init_h. + state_1: The cell hidden state, which has same shape as init_c. + runtime: Constant string tensor which indicate real runtime hardware. This + value is for testing purpose and should not be used by user. 
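One detail of the function body that follows is worth seeing concretely: cuDNN keeps two bias sets (input-side and recurrent-side) that it sums, while the canonical cell keeps one fused bias, so `gpu_lstm` zero-fills the input-side set to make the two parameterizations match. A NumPy sketch of that equivalence:

import numpy as np

units = 3
keras_bias = np.random.rand(4 * units)  # fused [i | f | c | o] bias

# What full_bias = concat(zeros_like(bias), bias) builds below:
cudnn_bias = np.concatenate([np.zeros_like(keras_bias), keras_bias])
input_bias, recurrent_bias = np.split(cudnn_bias, 2)

# cuDNN adds the two sets, so the effective bias is unchanged.
np.testing.assert_allclose(input_bias + recurrent_bias, keras_bias)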
+ """ + if mask is not None: + sequence_lengths = gru_lstm_utils.calculate_sequence_by_mask( + mask, time_major + ) + + if not time_major and sequence_lengths is None: + inputs = tf.transpose(inputs, perm=(1, 0, 2)) + seq_axis, batch_axis = (0, 1) else: - output = last_output - - if self.return_state: - return [output] + list(states) - elif self.return_runtime: - return output, runtime + seq_axis, batch_axis = (0, 1) if time_major else (1, 0) + # For init_h and init_c, cuDNN expects one more dim of num_layers before or + # after batch dim for time major or batch major inputs respectively + init_h = tf.expand_dims(init_h, axis=seq_axis) + init_c = tf.expand_dims(init_c, axis=seq_axis) + + weights = tf.split(kernel, 4, axis=1) + weights += tf.split(recurrent_kernel, 4, axis=1) + # cuDNN has an extra set of bias for inputs, we disable them (setting to 0), + # so that mathematically it is same as the canonical LSTM implementation. + full_bias = tf.concat((tf.zeros_like(bias), bias), 0) + + if tf.sysconfig.get_build_info()["is_rocm_build"]: + # ROCm MIOpen's weight sequence for LSTM is different from both + # canonical and Cudnn format + # MIOpen: [i, f, o, c] Cudnn/Canonical: [i, f, c, o] + # i is input gate weights. + # f is forget gate weights. + # o is output gate weights. + # c is cell gate weights. + weights = [weights[x] for x in (0, 1, 3, 2, 4, 5, 7, 6)] + # full_bias is a tensor of shape (8*n,) + full_bias = tf.split(full_bias, 8, axis=0) + full_bias = [full_bias[x] for x in (0, 1, 3, 2, 4, 5, 7, 6)] + + params = gru_lstm_utils.canonical_to_params( + weights=weights, + biases=tf.split(full_bias, 8), + shape=tf.constant([-1]), + transpose_weights=True, + ) + + if sequence_lengths is not None: + if go_backwards: + # Three reversals are required. E.g., + # normal input = [1, 2, 3, 0, 0] # where 0 need to be masked + # reversed_input_to_cudnn = [3, 2, 1, 0, 0] + # output_from_cudnn = [6, 5, 4, 0, 0] + # expected_output = [0, 0, 6, 5 ,4] + inputs = tf.reverse_sequence( + inputs, + sequence_lengths, + seq_axis=seq_axis, + batch_axis=batch_axis, + ) + outputs, h, c, _, _ = tf.raw_ops.CudnnRNNV3( + input=inputs, + input_h=init_h, + input_c=init_c, + params=params, + is_training=True, + rnn_mode="lstm", + sequence_lengths=sequence_lengths, + time_major=time_major, + ) + if go_backwards: + outputs = tf.reverse_sequence( + outputs, + sequence_lengths, + seq_axis=seq_axis, + batch_axis=batch_axis, + ) + outputs = tf.reverse(outputs, axis=[seq_axis]) else: - return output - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def recurrent_activation(self): - return self.cell.recurrent_activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def unit_forget_bias(self): - return self.cell.unit_forget_bias - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - - 
@property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - @property - def implementation(self): - return self.cell.implementation - - def get_config(self): - config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'recurrent_activation': - activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'unit_forget_bias': - self.unit_forget_bias, - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout, - 'implementation': - self.implementation + # # Fill the array with shape [batch] with value of max timesteps. + # sequence_length = array_ops.fill([array_ops.shape(inputs)[1]], + # array_ops.shape(inputs)[0]) + if go_backwards: + # Reverse axis 0 since the input is already convert to time major. + inputs = tf.reverse(inputs, axis=[0]) + outputs, h, c, _ = tf.raw_ops.CudnnRNN( + input=inputs, + input_h=init_h, + input_c=init_c, + params=params, + is_training=True, + rnn_mode="lstm", + ) + + last_output = outputs[-1] + if not time_major and sequence_lengths is None and return_sequences: + outputs = tf.transpose(outputs, perm=[1, 0, 2]) + h = tf.squeeze(h, axis=seq_axis) + c = tf.squeeze(c, axis=seq_axis) + + # In the case of variable length input, the cudnn kernel will fill zeros for + # the output, whereas the default keras behavior is to bring over the + # previous output for t-1, so that in the return_sequence=False case, user + # can quickly get the final effect output instead just 0s at the last + # timestep. In order to mimic the default keras behavior, we copy the final + # h state as the last_output, since it is numerically same as the output. + if sequence_lengths is not None: + last_output = h + + # Match CPU return format + if not return_sequences: + outputs = tf.expand_dims(last_output, axis=0 if time_major else 1) + + return ( + last_output, + outputs, + h, + c, + gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_GPU), + ) + + +def lstm_with_backend_selection( + inputs, + init_h, + init_c, + kernel, + recurrent_kernel, + bias, + mask, + time_major, + go_backwards, + sequence_lengths, + zero_output_for_mask, + return_sequences, +): + """Call the LSTM with optimized backend kernel selection. + + Under the hood, this function will create two TF function, one with the most + generic kernel and can run on all device condition, and the second one with + cuDNN specific kernel, which can only run on GPU. + + The first function will be called with normal_lstm_params, while the second + function is not called, but only registered in the graph. 
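One observable hook into this selection machinery is the private `return_runtime` testing flag popped in `__init__` above; a sketch of how a test might see which kernel actually ran (not a public API, and only meaningful when executing eagerly):

import tensorflow as tf

layer = tf.keras.layers.LSTM(4, return_runtime=True)
x = tf.random.normal([2, 5, 3])
output, runtime = layer(x)
# `runtime` is a constant tensor encoding the kernel that executed
# (generic vs. cuDNN); per the docstrings here, it exists for tests only.
print(runtime)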
The Grappler will + do the proper graph rewrite and swap the optimized TF function based on the + device placement. + + Args: + inputs: Input tensor of LSTM layer. + init_h: Initial state tensor for the cell output. + init_c: Initial state tensor for the cell hidden state. + kernel: Weights for cell kernel. + recurrent_kernel: Weights for cell recurrent kernel. + bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias + is used in this case. + mask: Boolean tensor for mask out the steps within sequence. + An individual `True` entry indicates that the corresponding timestep + should be utilized, while a `False` entry indicates that the + corresponding timestep should be ignored. + time_major: Boolean, whether the inputs are in the format of + [time, batch, feature] or [batch, time, feature]. + go_backwards: Boolean (default False). If True, process the input sequence + backwards and return the reversed sequence. + sequence_lengths: The lengths of all sequences coming from a variable + length input, such as ragged tensors. If the input has a fixed timestep + size, this should be None. + zero_output_for_mask: Boolean, whether to output zero for masked timestep. + return_sequences: Boolean. If True, return the recurrent outputs for all + timesteps in the sequence. If False, only return the output for the + last timestep (which consumes less memory). + + Returns: + List of output tensors, same as standard_lstm. + """ + params = { + "inputs": inputs, + "init_h": init_h, + "init_c": init_c, + "kernel": kernel, + "recurrent_kernel": recurrent_kernel, + "bias": bias, + "mask": mask, + "time_major": time_major, + "go_backwards": go_backwards, + "sequence_lengths": sequence_lengths, + "zero_output_for_mask": zero_output_for_mask, + "return_sequences": return_sequences, } - config.update(rnn_utils.config_for_enable_caching_device(self.cell)) - base_config = super().get_config() - del base_config['cell'] - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - if 'implementation' in config and config['implementation'] == 0: - config['implementation'] = 1 - return cls(**config) - - -def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, - time_major, go_backwards, sequence_lengths, - zero_output_for_mask, return_sequences): - """LSTM with standard kernel implementation. - - This implementation can be run on all types for hardware. - - This implementation lifts out all the layer weights and make them function - parameters. It has same number of tensor input params as the cuDNN - counterpart. The RNN step logic has been simplified, eg dropout and mask is - removed since cuDNN implementation does not support that. - - Note that the first half of the bias tensor should be ignored by this impl. - The cuDNN impl need an extra set of input gate bias. In order to make the both - function take same shape of parameter, that extra set of bias is also feed - here. - - Args: - inputs: input tensor of LSTM layer. - init_h: initial state tensor for the cell output. - init_c: initial state tensor for the cell hidden state. - kernel: weights for cell kernel. - recurrent_kernel: weights for cell recurrent kernel. - bias: weights for cell kernel bias and recurrent bias. Only recurrent bias - is used in this case. - mask: Boolean tensor for mask out the steps within sequence. 
- An individual `True` entry indicates that the corresponding timestep - should be utilized, while a `False` entry indicates that the corresponding - timestep should be ignored. - time_major: boolean, whether the inputs are in the format of - [time, batch, feature] or [batch, time, feature]. - go_backwards: Boolean (default False). If True, process the input sequence - backwards and return the reversed sequence. - sequence_lengths: The lengths of all sequences coming from a variable length - input, such as ragged tensors. If the input has a fixed timestep size, - this should be None. - zero_output_for_mask: Boolean, whether to output zero for masked timestep. - return_sequences: Boolean. If True, return the recurrent outputs for all - timesteps in the sequence. If False, only return the output for the - last timestep (which consumes less memory). - - Returns: - last_output: output tensor for the last timestep, which has shape - [batch, units]. - outputs: - - If `return_sequences=True`: output tensor for all timesteps, - which has shape [batch, time, units]. - - Else, a tensor equal to `last_output` with shape [batch, 1, units] - state_0: the cell output, which has same shape as init_h. - state_1: the cell hidden state, which has same shape as init_c. - runtime: constant string tensor which indicate real runtime hardware. This - value is for testing purpose and should be used by user. - """ - input_shape = backend.int_shape(inputs) - timesteps = input_shape[0] if time_major else input_shape[1] - - def step(cell_inputs, cell_states): - """Step function that will be used by Keras RNN backend.""" - h_tm1 = cell_states[0] # previous memory state - c_tm1 = cell_states[1] # previous carry state - - z = backend.dot(cell_inputs, kernel) - z += backend.dot(h_tm1, recurrent_kernel) - z = backend.bias_add(z, bias) - - z0, z1, z2, z3 = tf.split(z, 4, axis=1) - - i = tf.sigmoid(z0) - f = tf.sigmoid(z1) - c = f * c_tm1 + i * tf.tanh(z2) - o = tf.sigmoid(z3) - - h = o * tf.tanh(c) - return h, [h, c] - - last_output, outputs, new_states = backend.rnn( - step, - inputs, [init_h, init_c], - constants=None, - unroll=False, - time_major=time_major, - mask=mask, - go_backwards=go_backwards, - input_length=(sequence_lengths - if sequence_lengths is not None else timesteps), - zero_output_for_mask=zero_output_for_mask, - return_all_outputs=return_sequences) - return (last_output, outputs, new_states[0], new_states[1], - gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_CPU)) - - -def gpu_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, - time_major, go_backwards, sequence_lengths, return_sequences): - """LSTM with either cuDNN or ROCm implementation which is only available for GPU. - - Note that currently only right padded data is supported, or the result will be - polluted by the unmasked data which should be filtered. - - Args: - inputs: Input tensor of LSTM layer. - init_h: Initial state tensor for the cell output. - init_c: Initial state tensor for the cell hidden state. - kernel: Weights for cell kernel. - recurrent_kernel: Weights for cell recurrent kernel. - bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias - is used in this case. - mask: Boolean tensor for mask out the steps within sequence. An individual - `True` entry indicates that the corresponding timestep should be utilized, - while a `False` entry indicates that the corresponding timestep should be - ignored. 
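These mask semantics interact with kernel choice: only a mask that is None or strictly right-padded keeps the cuDNN path, and anything else falls back to the generic kernel at runtime with the same numerics. A user-level sketch, where the all-zero trailing steps are what `Masking` turns into the mask:

import numpy as np
import tensorflow as tf

# Right-padded batch of shape (batch=2, time=4, features=1); the trailing
# zero steps become masked timesteps and keep the cuDNN kernel eligible.
x = np.array(
    [[1.0, 2.0, 0.0, 0.0],
     [1.0, 2.0, 3.0, 0.0]]
)[..., np.newaxis]
model = tf.keras.Sequential(
    [tf.keras.layers.Masking(), tf.keras.layers.LSTM(2)]
)
print(model(x).shape)  # (2, 2)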
- time_major: Boolean, whether the inputs are in the format of [time, batch, - feature] or [batch, time, feature]. - go_backwards: Boolean (default False). If True, process the input sequence - backwards and return the reversed sequence. - sequence_lengths: The lengths of all sequences coming from a variable length - input, such as ragged tensors. If the input has a fixed timestep size, - this should be None. - return_sequences: Boolean. If True, return the recurrent outputs for all - timesteps in the sequence. If False, only return the output for the - last timestep, matching the CPU function output format. - - Returns: - last_output: Output tensor for the last timestep, which has shape - [batch, units]. - outputs: - - If `return_sequences=True`: output tensor for all timesteps, - which has shape [batch, time, units]. - - Else, a tensor equal to `last_output` with shape [batch, 1, units] - state_0: The cell output, which has same shape as init_h. - state_1: The cell hidden state, which has same shape as init_c. - runtime: Constant string tensor which indicate real runtime hardware. This - value is for testing purpose and should not be used by user. - """ - if mask is not None: - sequence_lengths = gru_lstm_utils.calculate_sequence_by_mask( - mask, time_major) - - if not time_major and sequence_lengths is None: - inputs = tf.transpose(inputs, perm=(1, 0, 2)) - seq_axis, batch_axis = (0, 1) - else: - seq_axis, batch_axis = (0, 1) if time_major else (1, 0) - # For init_h and init_c, cuDNN expects one more dim of num_layers before or - # after batch dim for time major or batch major inputs respectively - init_h = tf.expand_dims(init_h, axis=seq_axis) - init_c = tf.expand_dims(init_c, axis=seq_axis) - - weights = tf.split(kernel, 4, axis=1) - weights += tf.split(recurrent_kernel, 4, axis=1) - # cuDNN has an extra set of bias for inputs, we disable them (setting to 0), - # so that mathematically it is same as the canonical LSTM implementation. - full_bias = tf.concat((tf.zeros_like(bias), bias), 0) - - if tf.sysconfig.get_build_info()['is_rocm_build']: - # ROCm MIOpen's weight sequence for LSTM is different from both canonical - # and Cudnn format - # MIOpen: [i, f, o, c] Cudnn/Canonical: [i, f, c, o] - # i is input gate weights. - # f is forget gate weights. - # o is output gate weights. - # c is cell gate weights. - weights = [weights[x] for x in (0, 1, 3, 2, 4, 5, 7, 6)] - # full_bias is a tensor of shape (8*n,) - full_bias = tf.split(full_bias, 8, axis=0) - full_bias = [full_bias[x] for x in (0, 1, 3, 2, 4, 5, 7, 6)] - - params = gru_lstm_utils.canonical_to_params( - weights=weights, - biases=tf.split(full_bias, 8), - shape=tf.constant([-1]), - transpose_weights=True) - - if sequence_lengths is not None: - if go_backwards: - # Three reversals are required. 
E.g., - # normal input = [1, 2, 3, 0, 0] # where 0 need to be masked - # reversed_input_to_cudnn = [3, 2, 1, 0, 0] - # output_from_cudnn = [6, 5, 4, 0, 0] - # expected_output = [0, 0, 6, 5 ,4] - inputs = tf.reverse_sequence( - inputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis) - outputs, h, c, _, _ = tf.raw_ops.CudnnRNNV3( - input=inputs, - input_h=init_h, - input_c=init_c, - params=params, - is_training=True, - rnn_mode='lstm', - sequence_lengths=sequence_lengths, - time_major=time_major) - if go_backwards: - outputs = tf.reverse_sequence( - outputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis) - outputs = tf.reverse(outputs, axis=[seq_axis]) - else: - # # Fill the array with shape [batch] with value of max timesteps. - # sequence_length = array_ops.fill([array_ops.shape(inputs)[1]], - # array_ops.shape(inputs)[0]) - if go_backwards: - # Reverse axis 0 since the input is already convert to time major. - inputs = tf.reverse(inputs, axis=[0]) - outputs, h, c, _ = tf.raw_ops.CudnnRNN( - input=inputs, input_h=init_h, input_c=init_c, params=params, - is_training=True, rnn_mode='lstm') - - last_output = outputs[-1] - if not time_major and sequence_lengths is None and return_sequences: - outputs = tf.transpose(outputs, perm=[1, 0, 2]) - h = tf.squeeze(h, axis=seq_axis) - c = tf.squeeze(c, axis=seq_axis) - - # In the case of variable length input, the cudnn kernel will fill zeros for - # the output, whereas the default keras behavior is to bring over the previous - # output for t-1, so that in the return_sequence=False case, user can quickly - # get the final effect output instead just 0s at the last timestep. - # In order to mimic the default keras behavior, we copy the final h state as - # the last_output, since it is numerically same as the output. - if sequence_lengths is not None: - last_output = h - - # Match CPU return format - if not return_sequences: - outputs = tf.expand_dims(last_output, axis=0 if time_major else 1) - - return last_output, outputs, h, c, gru_lstm_utils.runtime( - gru_lstm_utils.RUNTIME_GPU) - - -def lstm_with_backend_selection(inputs, init_h, init_c, kernel, - recurrent_kernel, bias, mask, time_major, - go_backwards, sequence_lengths, - zero_output_for_mask, return_sequences): - """Call the LSTM with optimized backend kernel selection. - - Under the hood, this function will create two TF function, one with the most - generic kernel and can run on all device condition, and the second one with - cuDNN specific kernel, which can only run on GPU. - - The first function will be called with normal_lstm_params, while the second - function is not called, but only registered in the graph. The Grappler will - do the proper graph rewrite and swap the optimized TF function based on the - device placement. - - Args: - inputs: Input tensor of LSTM layer. - init_h: Initial state tensor for the cell output. - init_c: Initial state tensor for the cell hidden state. - kernel: Weights for cell kernel. - recurrent_kernel: Weights for cell recurrent kernel. - bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias - is used in this case. - mask: Boolean tensor for mask out the steps within sequence. - An individual `True` entry indicates that the corresponding timestep - should be utilized, while a `False` entry indicates that the corresponding - timestep should be ignored. - time_major: Boolean, whether the inputs are in the format of - [time, batch, feature] or [batch, time, feature]. - go_backwards: Boolean (default False). 
If True, process the input sequence - backwards and return the reversed sequence. - sequence_lengths: The lengths of all sequences coming from a variable length - input, such as ragged tensors. If the input has a fixed timestep size, - this should be None. - zero_output_for_mask: Boolean, whether to output zero for masked timestep. - return_sequences: Boolean. If True, return the recurrent outputs for all - timesteps in the sequence. If False, only return the output for the - last timestep (which consumes less memory). - - Returns: - List of output tensors, same as standard_lstm. - """ - params = { - 'inputs': inputs, - 'init_h': init_h, - 'init_c': init_c, - 'kernel': kernel, - 'recurrent_kernel': recurrent_kernel, - 'bias': bias, - 'mask': mask, - 'time_major': time_major, - 'go_backwards': go_backwards, - 'sequence_lengths': sequence_lengths, - 'zero_output_for_mask': zero_output_for_mask, - 'return_sequences': return_sequences, - } - - def gpu_lstm_with_fallback(inputs, init_h, init_c, kernel, recurrent_kernel, - bias, mask, time_major, go_backwards, - sequence_lengths, zero_output_for_mask, - return_sequences): - """Use cuDNN kernel when mask is none or strictly right padded.""" - if mask is None: - return gpu_lstm( - inputs=inputs, - init_h=init_h, - init_c=init_c, - kernel=kernel, - recurrent_kernel=recurrent_kernel, - bias=bias, - mask=mask, - time_major=time_major, - go_backwards=go_backwards, - sequence_lengths=sequence_lengths, - return_sequences=return_sequences) - - def cudnn_lstm_fn(): - return gpu_lstm( - inputs=inputs, - init_h=init_h, - init_c=init_c, - kernel=kernel, - recurrent_kernel=recurrent_kernel, - bias=bias, - mask=mask, - time_major=time_major, - go_backwards=go_backwards, - sequence_lengths=sequence_lengths, - return_sequences=return_sequences) - - def stardard_lstm_fn(): - return standard_lstm( - inputs=inputs, - init_h=init_h, - init_c=init_c, - kernel=kernel, - recurrent_kernel=recurrent_kernel, - bias=bias, - mask=mask, - time_major=time_major, - go_backwards=go_backwards, - sequence_lengths=sequence_lengths, - zero_output_for_mask=zero_output_for_mask, - return_sequences=return_sequences) - - return tf.cond( - gru_lstm_utils.is_cudnn_supported_inputs(mask, time_major), - true_fn=cudnn_lstm_fn, - false_fn=stardard_lstm_fn) - - if gru_lstm_utils.use_new_gru_lstm_impl(): - # Chooses the implementation dynamically based on the running device. - (last_output, outputs, new_h, new_c, - runtime) = tf.__internal__.execute_fn_for_device( - { - gru_lstm_utils.CPU_DEVICE_NAME: - lambda: standard_lstm(**params), - gru_lstm_utils.GPU_DEVICE_NAME: - lambda: gpu_lstm_with_fallback(**params) - }, lambda: standard_lstm(**params)) - else: - # Each time a `tf.function` is called, we will give it a unique - # identifiable API name, so that Grappler won't get confused when it - # sees multiple LSTM layers added into same graph, and it will be able - # to pair up the different implementations across them. - api_name = 'lstm_' + str(uuid.uuid4()) - supportive_attribute = { - 'time_major': time_major, - 'go_backwards': go_backwards, - } - defun_standard_lstm = gru_lstm_utils.generate_defun_backend( - api_name, gru_lstm_utils.CPU_DEVICE_NAME, standard_lstm, - supportive_attribute) - defun_gpu_lstm = gru_lstm_utils.generate_defun_backend( - api_name, gru_lstm_utils.GPU_DEVICE_NAME, gpu_lstm_with_fallback, - supportive_attribute) - - # Call the normal LSTM impl and register the cuDNN impl function. The - # grappler will kick in during session execution to optimize the graph. 
- last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(**params) - gru_lstm_utils.function_register(defun_gpu_lstm, **params) - - return last_output, outputs, new_h, new_c, runtime + + def gpu_lstm_with_fallback( + inputs, + init_h, + init_c, + kernel, + recurrent_kernel, + bias, + mask, + time_major, + go_backwards, + sequence_lengths, + zero_output_for_mask, + return_sequences, + ): + """Use cuDNN kernel when mask is none or strictly right padded.""" + + def cudnn_lstm_fn(): + return gpu_lstm( + inputs=inputs, + init_h=init_h, + init_c=init_c, + kernel=kernel, + recurrent_kernel=recurrent_kernel, + bias=bias, + mask=mask, + time_major=time_major, + go_backwards=go_backwards, + sequence_lengths=sequence_lengths, + return_sequences=return_sequences, + ) + + def stardard_lstm_fn(): + return standard_lstm( + inputs=inputs, + init_h=init_h, + init_c=init_c, + kernel=kernel, + recurrent_kernel=recurrent_kernel, + bias=bias, + mask=mask, + time_major=time_major, + go_backwards=go_backwards, + sequence_lengths=sequence_lengths, + zero_output_for_mask=zero_output_for_mask, + return_sequences=return_sequences, + ) + + return tf.__internal__.smart_cond.smart_cond( + gru_lstm_utils.is_cudnn_supported_inputs( + mask, time_major, sequence_lengths + ), + true_fn=cudnn_lstm_fn, + false_fn=stardard_lstm_fn, + ) + + if gru_lstm_utils.use_new_gru_lstm_impl(): + # Chooses the implementation dynamically based on the running device. + ( + last_output, + outputs, + new_h, + new_c, + runtime, + ) = tf.__internal__.execute_fn_for_device( + { + gru_lstm_utils.CPU_DEVICE_NAME: lambda: standard_lstm(**params), + gru_lstm_utils.GPU_DEVICE_NAME: lambda: gpu_lstm_with_fallback( + **params + ), + }, + lambda: standard_lstm(**params), + ) + else: + # Each time a `tf.function` is called, we will give it a unique + # identifiable API name, so that Grappler won't get confused when it + # sees multiple LSTM layers added into same graph, and it will be able + # to pair up the different implementations across them. + api_name = "lstm_" + str(uuid.uuid4()) + supportive_attribute = { + "time_major": time_major, + "go_backwards": go_backwards, + } + defun_standard_lstm = gru_lstm_utils.generate_defun_backend( + api_name, + gru_lstm_utils.CPU_DEVICE_NAME, + standard_lstm, + supportive_attribute, + ) + defun_gpu_lstm = gru_lstm_utils.generate_defun_backend( + api_name, + gru_lstm_utils.GPU_DEVICE_NAME, + gpu_lstm_with_fallback, + supportive_attribute, + ) + + # Call the normal LSTM impl and register the cuDNN impl function. The + # grappler will kick in during session execution to optimize the graph. 
+ last_output, outputs, new_h, new_c, runtime = defun_standard_lstm( + **params + ) + gru_lstm_utils.function_register(defun_gpu_lstm, **params) + + return last_output, outputs, new_h, new_c, runtime diff --git a/keras/layers/rnn/lstm_test.py b/keras/layers/rnn/lstm_test.py index fd208eeb9f57..e3e77dddae6b 100644 --- a/keras/layers/rnn/lstm_test.py +++ b/keras/layers/rnn/lstm_test.py @@ -19,18 +19,21 @@ import os import shutil +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers.rnn import gru_lstm_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.core.protobuf import rewriter_config_pb2 -from tensorflow.python.framework import test_util as tf_test_util - +from tensorflow.python.framework import ( + test_util as tf_test_util, +) # Global config for grappler setting that is used for graph mode test. _rewrites = rewriter_config_pb2.RewriterConfig() @@ -43,1229 +46,1384 @@ @test_combinations.run_all_keras_modes(config=_config) class LSTMGraphRewriteTest(test_combinations.TestCase): - input_shape = 10 - output_shape = 8 - rnn_state_size = 8 - timestep = 4 - batch = 100 - epoch = 1 - - @parameterized.named_parameters( - ('non_tan_activation', 'relu', 'sigmoid', 0, False, True), - ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True), - ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True), - ('unroll', 'tanh', 'sigmoid', 0, True, True), - ('not_use_bias', 'tanh', 'sigmoid', 0, False, False), - ) - @test_utils.run_v2_only - def test_could_use_defun_backend(self, activation, recurrent_activation, - recurrent_dropout, unroll, use_bias): - layer = keras.layers.LSTM( - 1, - activation=activation, - recurrent_activation=recurrent_activation, - recurrent_dropout=recurrent_dropout, - unroll=unroll, - use_bias=use_bias) - self.assertFalse(layer._could_use_gpu_kernel) - - @test_utils.run_v2_only - def test_use_on_default_activation_with_gpu_kernel(self): - layer = keras.layers.LSTM(1, activation=tf.tanh) - self.assertTrue(layer._could_use_gpu_kernel) - - layer = keras.layers.LSTM(1, recurrent_activation=tf.sigmoid) - self.assertTrue(layer._could_use_gpu_kernel) - - def test_static_shape_inference_LSTM(self): - # Github issue: 15165 - timesteps = 3 - embedding_dim = 4 - units = 2 - - model = keras.models.Sequential() - inputs = keras.layers.Dense( - embedding_dim, input_shape=(timesteps, embedding_dim)) - model.add(inputs) - layer = keras.layers.LSTM(units, return_sequences=True) - model.add(layer) - outputs = model.layers[-1].output - self.assertEqual(outputs.shape.as_list(), [None, timesteps, units]) - - def test_dynamic_behavior_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim)) - model = keras.models.Sequential() - model.add(layer) - model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.001), 'mse') - x = np.random.random((num_samples, timesteps, embedding_dim)) - y = np.random.random((num_samples, units)) - model.train_on_batch(x, y) - - def test_stacking_LSTM(self): - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.LSTM(10, return_sequences=True, unroll=False)) - model.add(keras.layers.LSTM(5, 
return_sequences=True, unroll=False)) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - def test_from_config_LSTM(self): - layer_class = keras.layers.LSTM - for stateful in (False, True): - l1 = layer_class(units=1, stateful=stateful) - l2 = layer_class.from_config(l1.get_config()) - assert l1.get_config() == l2.get_config() - - def test_specify_initial_state_keras_tensor(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - # Test with Keras tensor - inputs = keras.Input((timesteps, embedding_dim)) - initial_state = [keras.Input((units,)) for _ in range(num_states)] - layer = keras.layers.LSTM(units) - if len(initial_state) == 1: - output = layer(inputs, initial_state=initial_state[0]) - else: - output = layer(inputs, initial_state=initial_state) - self.assertTrue( - any(initial_state[0] is t - for t in layer._inbound_nodes[0].input_tensors)) - - model = keras.models.Model([inputs] + initial_state, output) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [ - np.random.random((num_samples, units)) for _ in range(num_states) - ] - targets = np.random.random((num_samples, units)) - model.train_on_batch([inputs] + initial_state, targets) - - def test_specify_initial_state_non_keras_tensor(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - # Test with non-Keras tensor - inputs = keras.Input((timesteps, embedding_dim)) - initial_state = [ - keras.backend.random_normal_variable((num_samples, units), 0, 1) - for _ in range(num_states) - ] - layer = keras.layers.LSTM(units) - output = layer(inputs, initial_state=initial_state) - - model = keras.models.Model(inputs, output) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - targets = np.random.random((num_samples, units)) - model.train_on_batch(inputs, targets) - - def test_reset_states_with_values(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - layer = keras.layers.LSTM(units, stateful=True) - layer.build((num_samples, timesteps, embedding_dim)) - initial_weight_count = len(layer.weights) - layer.reset_states() - assert len(layer.states) == num_states - assert layer.states[0] is not None - self.assertAllClose( - keras.backend.eval(layer.states[0]), - np.zeros(keras.backend.int_shape(layer.states[0])), - atol=1e-4) - state_shapes = [keras.backend.int_shape(state) for state in layer.states] - values = [np.ones(shape) for shape in state_shapes] - if len(values) == 1: - values = values[0] - layer.reset_states(values) - self.assertAllClose( - keras.backend.eval(layer.states[0]), - np.ones(keras.backend.int_shape(layer.states[0])), - atol=1e-4) - - # Test with invalid data - with self.assertRaises(ValueError): - layer.reset_states([1] * (len(layer.states) + 1)) - - self.assertEqual(initial_weight_count, len(layer.weights)) - # Variables in "states" shouldn't show up in .weights - layer.states = tf.nest.map_structure(tf.Variable, values) - layer.reset_states() - self.assertEqual(initial_weight_count, len(layer.weights)) - - def test_specify_state_with_masking(self): - num_states = 2 - timesteps = 3 - 
embedding_dim = 4 - units = 3 - num_samples = 2 - - inputs = keras.Input((timesteps, embedding_dim)) - _ = keras.layers.Masking()(inputs) - initial_state = [keras.Input((units,)) for _ in range(num_states)] - output = keras.layers.LSTM(units)( - inputs, initial_state=initial_state) - - model = keras.models.Model([inputs] + initial_state, output) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [ - np.random.random((num_samples, units)) for _ in range(num_states) - ] - targets = np.random.random((num_samples, units)) - model.train_on_batch([inputs] + initial_state, targets) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_return_state(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim)) - masked = keras.layers.Masking()(inputs) - layer = keras.layers.LSTM(units, return_state=True, stateful=True) - outputs = layer(masked) - state = outputs[1:] - assert len(state) == num_states - model = keras.models.Model(inputs, state[0]) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - state = model.predict(inputs) - self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4) - - def test_state_reuse(self): - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim)) - layer = keras.layers.LSTM( - units, return_state=True, return_sequences=True) - outputs = layer(inputs) - output, state = outputs[0], outputs[1:] - output = keras.layers.LSTM(units)(output, initial_state=state) - model = keras.models.Model(inputs, output) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - model.predict(inputs) - - def test_initial_states_as_other_inputs(self): - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - num_states = 2 - layer_class = keras.layers.LSTM - - # Test with Keras tensor - main_inputs = keras.Input((timesteps, embedding_dim)) - initial_state = [keras.Input((units,)) for _ in range(num_states)] - inputs = [main_inputs] + initial_state - - layer = layer_class(units) - output = layer(inputs) - self.assertTrue( - any(initial_state[0] is t - for t in layer._inbound_nodes[0].input_tensors)) - - model = keras.models.Model(inputs, output) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - - main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [ - np.random.random((num_samples, units)) for _ in range(num_states) - ] - targets = np.random.random((num_samples, units)) - model.train_on_batch([main_inputs] + initial_state, targets) - - @parameterized.named_parameters(('v0', 0), ('v1', 1), ('v2', 2)) - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_implementation_mode_LSTM(self, implementation_mode): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={ - 'units': units, - 'implementation': implementation_mode - }, - input_shape=(num_samples, timesteps, embedding_dim)) - - layer_class = keras.layers.LSTM - 
k_constraint = keras.constraints.max_norm(0.01) - r_constraint = keras.constraints.max_norm(0.01) - b_constraint = keras.constraints.max_norm(0.01) - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_constraint=k_constraint, - recurrent_constraint=r_constraint, - bias_constraint=b_constraint) - layer.build((None, None, embedding_dim)) - self.assertEqual(layer.cell.kernel.constraint, k_constraint) - self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) - self.assertEqual(layer.cell.bias.constraint, b_constraint) - - layer_class = keras.layers.LSTM - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - model.add(layer_class(units=5, return_sequences=True, unroll=False)) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_masking_with_stacking_LSTM(self): - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - model.add(keras.layers.LSTM(10, return_sequences=True, unroll=False)) - model.add(keras.layers.LSTM(5, return_sequences=True, unroll=False)) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - @parameterized.named_parameters( - # test_name, use_bias, bias_initializer, activation - ('normal', True, 'zeros'), - ('no_bias', False, 'zeros'), - ('random_bias', True, 'random_uniform'), - ) - def test_lstm_model_save_load(self, use_bias, bias_initializer): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir) - h5_path = os.path.join(temp_dir, 'test.h5') - - batch = 10 - timestep = 3 - input_dim = 5 - units = 2 - - x = np.random.random((batch, timestep, input_dim)) - - def build_model(): - inputs = keras.layers.Input( - shape=[timestep, input_dim], dtype=tf.float32) - layer = keras.layers.LSTM( - units, - use_bias=use_bias, - bias_initializer=bias_initializer) - output = layer(inputs) - return keras.models.Model(inputs, output), layer - - model, layer = build_model() - y_ref = model.predict(x) - model.save_weights(h5_path) - - cloned_model, new_layer = build_model() - cloned_model.load_weights(h5_path) - y = cloned_model.predict(x) - - self.assertAllClose(y, y_ref) - self.assertAllClose(layer.get_weights(), new_layer.get_weights()) - - def test_lstm_output_on_multiple_kernel(self): - x_train = np.random.random((self.batch, self.timestep, self.input_shape)) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - with test_utils.device(should_use_gpu=False): - layer = keras.layers.LSTM(self.rnn_state_size) - output = layer(inputs) - cpu_model = keras.models.Model(inputs, output) - weights = cpu_model.get_weights() - y_1 = cpu_model.predict(x_train) - - with test_utils.device(should_use_gpu=True): - layer = keras.layers.LSTM(self.rnn_state_size) - output = layer(inputs) - gpu_model = 
keras.models.Model(inputs, output) - gpu_model.set_weights(weights) - y_2 = gpu_model.predict(x_train) - - self.assertAllClose(y_1, y_2) - - def test_return_sequences_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={ - 'units': units, - 'return_sequences': True - }, - input_shape=(num_samples, timesteps, embedding_dim)) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support float64 yet.') - @test_utils.run_v2_only - def test_float64_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={ - 'units': units, - 'return_sequences': True, - 'dtype': 'float64' - }, - input_shape=(num_samples, timesteps, embedding_dim), - input_dtype='float64') - - def test_regularizers_LSTM(self): - embedding_dim = 4 - layer_class = keras.layers.LSTM - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_regularizer=keras.regularizers.l1(0.01), - recurrent_regularizer=keras.regularizers.l1(0.01), - bias_regularizer='l2', - activity_regularizer='l1') - layer.build((None, None, 2)) - self.assertEqual(len(layer.losses), 3) - x = keras.backend.variable(np.ones((2, 3, 2))) - layer(x) - if tf.executing_eagerly(): - self.assertEqual(len(layer.losses), 4) - else: - self.assertEqual(len(layer.get_losses_for(x)), 1) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - def test_statefulness_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer_class = keras.layers.LSTM - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - 4, - embedding_dim, - mask_zero=True, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = layer_class( - units, return_sequences=False, stateful=True, weights=None) - model.add(layer) - model.compile( - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - out1 = model.predict(np.ones((num_samples, timesteps))) - self.assertEqual(out1.shape, (num_samples, units)) - - # train once so that the states change - model.train_on_batch( - np.ones((num_samples, timesteps)), np.ones((num_samples, units))) - out2 = model.predict(np.ones((num_samples, timesteps))) - - # if the state is not reset, output should be different - self.assertNotEqual(out1.max(), out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out2.max(), out3.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones((num_samples, timesteps))) - self.assertAllClose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out4.max(), out5.max()) - - # Check masking - layer.reset_states() - - left_padded_input = np.ones((num_samples, timesteps)) - left_padded_input[0, :1] = 0 - left_padded_input[1, :2] = 0 - out6 = model.predict(left_padded_input) - - layer.reset_states() - - right_padded_input = np.ones((num_samples, timesteps)) - right_padded_input[0, -1:] = 0 - right_padded_input[1, -2:] = 
0 - out7 = model.predict(right_padded_input) - - layer.reset_states() - - mix_padded_input = np.ones((num_samples, timesteps)) - mix_padded_input[0, 1] = 0 - mix_padded_input[1, 0] = 0 - mix_padded_input[1, 2] = 0 - out8 = model.predict(mix_padded_input) - - self.assertAllClose(out7, out6, atol=1e-5) - self.assertAllClose(out8, out7, atol=1e-5) - - def test_stateful_LSTM_training(self): - # See b/123587692 for more context. - vocab_size = 20 - embedding_dim = 10 - batch_size = 8 - timestep = 12 - units = 5 - x = np.random.randint(0, vocab_size, size=(batch_size, timestep)) - y = np.random.randint(0, vocab_size, size=(batch_size, timestep)) - - model = keras.Sequential([ - keras.layers.Embedding(vocab_size, embedding_dim, - batch_input_shape=[batch_size, timestep]), - keras.layers.LSTM(units, return_sequences=True, stateful=True), - keras.layers.Dense(vocab_size) - ]) - model.compile( - optimizer='adam', - loss='sparse_categorical_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, epochs=1, shuffle=False) - - def test_dropout_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={ - 'units': units, - 'dropout': 0.1, - 'recurrent_dropout': 0.1 - }, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_bidirectional(self): - batch = 128 - timestep = 20 - vocab_size = 1000 - model = keras.Sequential([ - keras.layers.Embedding(vocab_size, 64), - keras.layers.Bidirectional(keras.layers.LSTM( - 64, return_sequences=True)), - keras.layers.Bidirectional(keras.layers.LSTM(32)), - keras.layers.Dense(64, activation='relu'), - keras.layers.Dense(1, activation='sigmoid') - ]) - - model.compile(loss='binary_crossentropy', - optimizer='adam', - metrics=['accuracy']) - - x = np.random.randint(0, vocab_size, size=(batch, timestep)) - y = np.random.randint(0, 1, size=(batch)) - model.fit(x, y, epochs=1, shuffle=False) - model.evaluate(x, y) - model.predict(x) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_explicit_device_with_go_backward_and_mask(self): - batch_size = 8 - timestep = 7 - masksteps = 5 - units = 4 - - inputs = np.random.randn(batch_size, timestep, units).astype(np.float32) - mask = np.ones((batch_size, timestep)).astype(np.bool) - mask[:, masksteps:] = 0 - - lstm_layer = keras.layers.LSTM( - units, return_sequences=True, go_backwards=True) - with test_utils.device(should_use_gpu=True): - outputs_masked = lstm_layer(inputs, mask=tf.constant(mask)) - outputs_trimmed = lstm_layer(inputs[:, :masksteps]) - self.assertAllClose(outputs_masked[:, -masksteps:], outputs_trimmed) - - @tf_test_util.enable_output_all_intermediates - def test_v1_session_behavior(self): - with tf.compat.v1.get_default_graph().as_default(): - # See b/139132348 for more details. 
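An aside on the `Bidirectional` wrapper exercised in the test above: by default it concatenates the forward and backward passes, so the feature dimension doubles. A small shape sketch (assuming eager TF 2.x):

import numpy as np
from tensorflow import keras

x = np.random.random((2, 7, 8)).astype("float32")

seq = keras.layers.Bidirectional(
    keras.layers.LSTM(16, return_sequences=True)
)
last = keras.layers.Bidirectional(keras.layers.LSTM(16))

print(seq(x).shape)  # (2, 7, 32): per-step outputs, both directions
print(last(x).shape)  # (2, 32): final output only, still concatenated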
- x = np.random.uniform(size=(100, 4, 8)) - y = np.random.uniform(size=(100, 1)) - dataset = tf.data.Dataset.from_tensor_slices( - (x, y)).shuffle(100).batch(32) - - inp = keras.layers.Input(shape=(4, 8)) - layer = keras.layers.LSTM(1)(inp) - layer = keras.layers.Dense(1)(layer) - - model = keras.models.Model(inp, layer) - - model.compile(loss='mse', optimizer='sgd') - model.fit(dataset) - - def test_with_fully_masked_inputs(self): - num_samples = 8 - timestep = 5 - embedding_dim = 4 - vocab_size = 20 - units = 2 - - inputs = np.random.randint(0, vocab_size, size=(num_samples, timestep)) - # Set the first inputs to be fully zero. - inputs[0, :] = 0.0 - - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - vocab_size, - embedding_dim, - mask_zero=True, - input_length=timestep, - batch_input_shape=(num_samples, timestep))) - layer = keras.layers.LSTM(units) - model.add(layer) - model.compile( - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - # Make sure it doesn't crash with cudnn kernel. - model.predict(inputs) - - # TODO (b/169895267): test with xla_gpu is disabled. - def test_deepcopy(self): - if not tf.executing_eagerly(): - self.skipTest('v2-only test') - original_layer = keras.layers.LSTM(5) - copied_layer = copy.deepcopy(original_layer) - self.assertEqual(copied_layer.units, 5) - self.assertEqual(original_layer.get_config(), original_layer.get_config()) - - # Copy layer before layer call on inputs without weight initialization. - inputs = np.random.normal(size=[32, 10, 8]).astype(np.float32) - original_layer = keras.layers.LSTM(4) - copied_layer = copy.deepcopy(original_layer) - outputs = original_layer(inputs) - copied_outputs = copied_layer(inputs) - self.assertNotAllClose( - self.evaluate(outputs), self.evaluate(copied_outputs)) - - # Copy layer after layer call on inputs with weight initialization. - original_layer = keras.layers.LSTM(4) - outputs = original_layer(inputs) - copied_layer = copy.deepcopy(original_layer) - copied_outputs = copied_layer(inputs) - self.assertAllClose(self.evaluate(outputs), self.evaluate(copied_outputs)) - - def _test_runtime_with_model(self, model): - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=self.batch, - test_samples=0, - input_shape=(self.timestep, self.input_shape), - num_classes=self.output_shape) - y_train = np_utils.to_categorical(y_train, self.output_shape) - - model.compile( - optimizer='sgd', - loss=['categorical_crossentropy', None], - run_eagerly=test_utils.should_run_eagerly()) - - existing_loss = 0 - for _ in range(self.epoch): - history = model.fit(x_train, y_train) - loss_value = history.history['loss'][0] - - self.assertNotEqual(existing_loss, loss_value) - existing_loss = loss_value - - _, runtime_value = model.predict(x_train) - if tf.test.is_gpu_available(): - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) - else: - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - @test_utils.run_v2_only - def test_LSTM_runtime(self): - layer = keras.layers.LSTM(self.rnn_state_size, return_runtime=True) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - - outputs, runtime = layer(inputs) - # Expand the runtime so that it is a 1D tensor instead of scalar. - # TF model does not work with scalar model output, specially during - # aggregation. 
- runtime = keras.layers.Lambda( - lambda x: tf.expand_dims(x, axis=-1))(runtime) - model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) - self._test_runtime_with_model(model) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_LSTM_runtime_with_mask(self): - # Masking will affect which backend is selected based on whether the mask - # is strictly right padded. - layer = keras.layers.LSTM(self.rnn_state_size, return_runtime=True) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - masked_inputs = keras.layers.Masking()(inputs) - - outputs, runtime = layer(masked_inputs) - # Expand the runtime so that it is a 1D tensor instead of scalar. - # TF model does not work with scalar model output, specially during - # aggregation. - runtime = keras.layers.Lambda( - lambda x: tf.expand_dims(x, axis=-1))(runtime) - model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=self.batch, - test_samples=0, - input_shape=(self.timestep, self.input_shape), - num_classes=self.output_shape) - y_train = np_utils.to_categorical(y_train, self.output_shape) - - model.compile( - optimizer='sgd', - loss=['categorical_crossentropy', None], - run_eagerly=test_utils.should_run_eagerly()) - - model.fit(x_train, y_train) - - # Verify unpadded data. - _, runtime_value = model.predict(x_train) - if tf.test.is_gpu_available(): - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) - else: - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - # Update x/y to be right padded by setting the last timestep to 0 - x_train[:, -1, :] = 0 - y_train[:, -1] = 0 - _, runtime_value = model.predict(x_train) - if tf.test.is_gpu_available(): - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) - else: - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - # Further update x/y to be mix padded (masks in the middle), and verify - # only cpu kernel can be selected. - x_train[:, -3, :] = 0 - y_train[:, -3] = 0 - _, runtime_value = model.predict(x_train) - self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) - - @test_utils.run_v2_only - def test_LSTM_runtime_with_cond(self): - # This test is to demonstrate the graph rewrite of grappler plugin under - # the condition that the function returns different number of internal - # states. - layer = keras.layers.LSTM(self.rnn_state_size, return_runtime=True) - - inputs = keras.layers.Input( - shape=[self.timestep, self.input_shape], dtype=tf.float32) - - zeros = tf.zeros([self.batch, self.output_shape]) - dummy_runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN) - a = tf.constant(0) - b = tf.constant(1) - # Will always run the lstm layer. - outputs, runtime = tf.cond( - tf.less(a, b), - lambda: layer(inputs), - lambda: (zeros, dummy_runtime)) - - # Expand the runtime so that it is a 1D tensor instead of scalar. - # TF model does not work with scalar model output, specially during - # aggregation. 
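The `expand_dims` comment recurring in these runtime tests is worth distilling: Keras cannot aggregate a rank-0 per-batch output across `predict` batches, so the scalar runtime value is lifted to shape `(1,)` before being exposed as a model output. The same pattern in isolation (a sketch, assuming eager TF 2.x):

import tensorflow as tf
from tensorflow import keras

scalar = tf.constant(1.0)  # rank-0, like the runtime value
lifted = keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(scalar)
print(lifted.shape)  # (1,): now a valid model output for aggregation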
- runtime = keras.layers.Lambda( - lambda x: tf.expand_dims(x, axis=-1))(runtime) - model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) - self._test_runtime_with_model(model) + input_shape = 10 + output_shape = 8 + rnn_state_size = 8 + timestep = 4 + batch = 100 + epoch = 1 + + @parameterized.named_parameters( + ("non_tan_activation", "relu", "sigmoid", 0, False, True), + ("non_sigmoid_recur_activation", "tanh", "relu", 0, False, True), + ("use_recurrent_dropout", "tanh", "sigmoid", 0.1, False, True), + ("unroll", "tanh", "sigmoid", 0, True, True), + ("not_use_bias", "tanh", "sigmoid", 0, False, False), + ) + @test_utils.run_v2_only + def test_could_use_defun_backend( + self, + activation, + recurrent_activation, + recurrent_dropout, + unroll, + use_bias, + ): + layer = keras.layers.LSTM( + 1, + activation=activation, + recurrent_activation=recurrent_activation, + recurrent_dropout=recurrent_dropout, + unroll=unroll, + use_bias=use_bias, + ) + self.assertFalse(layer._could_use_gpu_kernel) + + @test_utils.run_v2_only + def test_use_on_default_activation_with_gpu_kernel(self): + layer = keras.layers.LSTM(1, activation=tf.tanh) + self.assertTrue(layer._could_use_gpu_kernel) + + layer = keras.layers.LSTM(1, recurrent_activation=tf.sigmoid) + self.assertTrue(layer._could_use_gpu_kernel) + + def test_static_shape_inference_LSTM(self): + # GitHub issue: 15165 + timesteps = 3 + embedding_dim = 4 + units = 2 + + model = keras.models.Sequential() + inputs = keras.layers.Dense( + embedding_dim, input_shape=(timesteps, embedding_dim) + ) + model.add(inputs) + layer = keras.layers.LSTM(units, return_sequences=True) + model.add(layer) + outputs = model.layers[-1].output + self.assertEqual(outputs.shape.as_list(), [None, timesteps, units]) + + def test_dynamic_behavior_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim)) + model = keras.models.Sequential() + model.add(layer) + model.compile(tf.compat.v1.train.GradientDescentOptimizer(0.001), "mse") + x = np.random.random((num_samples, timesteps, embedding_dim)) + y = np.random.random((num_samples, units)) + model.train_on_batch(x, y) + + def test_stacking_LSTM(self): + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.LSTM(10, return_sequences=True, unroll=False)) + model.add(keras.layers.LSTM(5, return_sequences=True, unroll=False)) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + def test_from_config_LSTM(self): + layer_class = keras.layers.LSTM + for stateful in (False, True): + l1 = layer_class(units=1, stateful=stateful) + l2 = layer_class.from_config(l1.get_config()) + assert l1.get_config() == l2.get_config() + + def test_specify_initial_state_keras_tensor(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + # Test with Keras tensor + inputs = keras.Input((timesteps, embedding_dim)) + initial_state = [keras.Input((units,)) for _ in range(num_states)] + layer = keras.layers.LSTM(units) + if len(initial_state) == 1: + output = layer(inputs, initial_state=initial_state[0]) + else: + output = layer(inputs, initial_state=initial_state) + self.assertTrue( + any( + initial_state[0] is t + for t in 
layer._inbound_nodes[0].input_tensors + ) + ) + + model = keras.models.Model([inputs] + initial_state, output) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [ + np.random.random((num_samples, units)) for _ in range(num_states) + ] + targets = np.random.random((num_samples, units)) + model.train_on_batch([inputs] + initial_state, targets) + + def test_specify_initial_state_non_keras_tensor(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + # Test with non-Keras tensor + inputs = keras.Input((timesteps, embedding_dim)) + initial_state = [ + keras.backend.random_normal_variable((num_samples, units), 0, 1) + for _ in range(num_states) + ] + layer = keras.layers.LSTM(units) + output = layer(inputs, initial_state=initial_state) + + model = keras.models.Model(inputs, output) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + targets = np.random.random((num_samples, units)) + model.train_on_batch(inputs, targets) + + def test_reset_states_with_values(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + layer = keras.layers.LSTM(units, stateful=True) + layer.build((num_samples, timesteps, embedding_dim)) + initial_weight_count = len(layer.weights) + layer.reset_states() + assert len(layer.states) == num_states + assert layer.states[0] is not None + self.assertAllClose( + keras.backend.eval(layer.states[0]), + np.zeros(keras.backend.int_shape(layer.states[0])), + atol=1e-4, + ) + state_shapes = [ + keras.backend.int_shape(state) for state in layer.states + ] + values = [np.ones(shape) for shape in state_shapes] + if len(values) == 1: + values = values[0] + layer.reset_states(values) + self.assertAllClose( + keras.backend.eval(layer.states[0]), + np.ones(keras.backend.int_shape(layer.states[0])), + atol=1e-4, + ) + + # Test with invalid data + with self.assertRaises(ValueError): + layer.reset_states([1] * (len(layer.states) + 1)) + + self.assertEqual(initial_weight_count, len(layer.weights)) + # Variables in "states" shouldn't show up in .weights + layer.states = tf.nest.map_structure(tf.Variable, values) + layer.reset_states() + self.assertEqual(initial_weight_count, len(layer.weights)) + + def test_specify_state_with_masking(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + inputs = keras.Input((timesteps, embedding_dim)) + _ = keras.layers.Masking()(inputs) + initial_state = [keras.Input((units,)) for _ in range(num_states)] + output = keras.layers.LSTM(units)(inputs, initial_state=initial_state) + + model = keras.models.Model([inputs] + initial_state, output) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [ + np.random.random((num_samples, units)) for _ in range(num_states) + ] + targets = np.random.random((num_samples, units)) + model.train_on_batch([inputs] + initial_state, targets) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+ ), + ) + def test_return_state(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + inputs = keras.Input( + batch_shape=(num_samples, timesteps, embedding_dim) + ) + masked = keras.layers.Masking()(inputs) + layer = keras.layers.LSTM(units, return_state=True, stateful=True) + outputs = layer(masked) + state = outputs[1:] + assert len(state) == num_states + model = keras.models.Model(inputs, state[0]) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + state = model.predict(inputs) + self.assertAllClose( + keras.backend.eval(layer.states[0]), state, atol=1e-4 + ) + + def test_state_reuse(self): + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + inputs = keras.Input( + batch_shape=(num_samples, timesteps, embedding_dim) + ) + layer = keras.layers.LSTM( + units, return_state=True, return_sequences=True + ) + outputs = layer(inputs) + output, state = outputs[0], outputs[1:] + output = keras.layers.LSTM(units)(output, initial_state=state) + model = keras.models.Model(inputs, output) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + model.predict(inputs) + + def test_initial_states_as_other_inputs(self): + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + num_states = 2 + layer_class = keras.layers.LSTM + + # Test with Keras tensor + main_inputs = keras.Input((timesteps, embedding_dim)) + initial_state = [keras.Input((units,)) for _ in range(num_states)] + inputs = [main_inputs] + initial_state + + layer = layer_class(units) + output = layer(inputs) + self.assertTrue( + any( + initial_state[0] is t + for t in layer._inbound_nodes[0].input_tensors + ) + ) + + model = keras.models.Model(inputs, output) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + + main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [ + np.random.random((num_samples, units)) for _ in range(num_states) + ] + targets = np.random.random((num_samples, units)) + model.train_on_batch([main_inputs] + initial_state, targets) + + @parameterized.named_parameters(("v0", 0), ("v1", 1), ("v2", 2)) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+ ), + ) + def test_implementation_mode_LSTM(self, implementation_mode): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={"units": units, "implementation": implementation_mode}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + layer_class = keras.layers.LSTM + k_constraint = keras.constraints.max_norm(0.01) + r_constraint = keras.constraints.max_norm(0.01) + b_constraint = keras.constraints.max_norm(0.01) + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_constraint=k_constraint, + recurrent_constraint=r_constraint, + bias_constraint=b_constraint, + ) + layer.build((None, None, embedding_dim)) + self.assertEqual(layer.cell.kernel.constraint, k_constraint) + self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) + self.assertEqual(layer.cell.bias.constraint, b_constraint) + + layer_class = keras.layers.LSTM + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + model.add(layer_class(units=5, return_sequences=True, unroll=False)) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." + ), + ) + def test_masking_with_stacking_LSTM(self): + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + model.add(keras.layers.LSTM(10, return_sequences=True, unroll=False)) + model.add(keras.layers.LSTM(5, return_sequences=True, unroll=False)) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + @parameterized.named_parameters( + # test_name, use_bias, bias_initializer, activation + ("normal", True, "zeros"), + ("no_bias", False, "zeros"), + ("random_bias", True, "random_uniform"), + ) + def test_lstm_model_save_load(self, use_bias, bias_initializer): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir) + h5_path = os.path.join(temp_dir, "test.h5") + + batch = 10 + timestep = 3 + input_dim = 5 + units = 2 + + x = np.random.random((batch, timestep, input_dim)) + + def build_model(): + inputs = keras.layers.Input( + shape=[timestep, input_dim], dtype=tf.float32 + ) + layer = keras.layers.LSTM( + units, use_bias=use_bias, bias_initializer=bias_initializer + ) + output = layer(inputs) + return keras.models.Model(inputs, output), layer + + model, layer = build_model() + y_ref = model.predict(x) + model.save_weights(h5_path) + + cloned_model, new_layer = build_model() + cloned_model.load_weights(h5_path) + y = cloned_model.predict(x) + + self.assertAllClose(y, y_ref) + self.assertAllClose(layer.get_weights(), new_layer.get_weights()) + + def test_lstm_output_on_multiple_kernel(self): + x_train = np.random.random( + (self.batch, self.timestep, self.input_shape) + ) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=tf.float32 + ) + with 
test_utils.device(should_use_gpu=False): + layer = keras.layers.LSTM(self.rnn_state_size) + output = layer(inputs) + cpu_model = keras.models.Model(inputs, output) + weights = cpu_model.get_weights() + y_1 = cpu_model.predict(x_train) + + with test_utils.device(should_use_gpu=True): + layer = keras.layers.LSTM(self.rnn_state_size) + output = layer(inputs) + gpu_model = keras.models.Model(inputs, output) + gpu_model.set_weights(weights) + y_2 = gpu_model.predict(x_train) + + self.assertAllClose(y_1, y_2) + + def test_return_sequences_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={"units": units, "return_sequences": True}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="Skipping as ROCm MIOpen does not support float64 yet.", + ) + @test_utils.run_v2_only + def test_float64_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={ + "units": units, + "return_sequences": True, + "dtype": "float64", + }, + input_shape=(num_samples, timesteps, embedding_dim), + input_dtype="float64", + ) + + def test_regularizers_LSTM(self): + embedding_dim = 4 + layer_class = keras.layers.LSTM + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_regularizer=keras.regularizers.l1(0.01), + recurrent_regularizer=keras.regularizers.l1(0.01), + bias_regularizer="l2", + activity_regularizer="l1", + ) + layer.build((None, None, 2)) + self.assertEqual(len(layer.losses), 3) + x = keras.backend.variable(np.ones((2, 3, 2))) + layer(x) + if tf.executing_eagerly(): + self.assertEqual(len(layer.losses), 4) + else: + self.assertEqual(len(layer.get_losses_for(x)), 1) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." 
+ ), + ) + def test_statefulness_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer_class = keras.layers.LSTM + model = keras.models.Sequential() + model.add( + keras.layers.Embedding( + 4, + embedding_dim, + mask_zero=True, + input_length=timesteps, + batch_input_shape=(num_samples, timesteps), + ) + ) + layer = layer_class( + units, return_sequences=False, stateful=True, weights=None + ) + model.add(layer) + model.compile( + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + out1 = model.predict(np.ones((num_samples, timesteps))) + self.assertEqual(out1.shape, (num_samples, units)) + + # train once so that the states change + model.train_on_batch( + np.ones((num_samples, timesteps)), np.ones((num_samples, units)) + ) + out2 = model.predict(np.ones((num_samples, timesteps))) + + # if the state is not reset, output should be different + self.assertNotEqual(out1.max(), out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out2.max(), out3.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones((num_samples, timesteps))) + self.assertAllClose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out4.max(), out5.max()) + + # Check masking + layer.reset_states() + + left_padded_input = np.ones((num_samples, timesteps)) + left_padded_input[0, :1] = 0 + left_padded_input[1, :2] = 0 + out6 = model.predict(left_padded_input) + + layer.reset_states() + + right_padded_input = np.ones((num_samples, timesteps)) + right_padded_input[0, -1:] = 0 + right_padded_input[1, -2:] = 0 + out7 = model.predict(right_padded_input) + + layer.reset_states() + + mix_padded_input = np.ones((num_samples, timesteps)) + mix_padded_input[0, 1] = 0 + mix_padded_input[1, 0] = 0 + mix_padded_input[1, 2] = 0 + out8 = model.predict(mix_padded_input) + + self.assertAllClose(out7, out6, atol=1e-5) + self.assertAllClose(out8, out7, atol=1e-5) + + def test_stateful_LSTM_training(self): + # See b/123587692 for more context. 
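The contract all the stateful tests in this hunk rely on: a `stateful=True` layer carries its final states into the next batch instead of re-zeroing them, which is also why a fixed `batch_input_shape` is required. A minimal sketch (assuming eager TF 2.x):

import numpy as np
from tensorflow import keras

model = keras.Sequential(
    [keras.layers.LSTM(3, stateful=True, batch_input_shape=(2, 4, 5))]
)
x = np.ones((2, 4, 5), dtype="float32")

out1 = model.predict(x)
out2 = model.predict(x)  # state was carried over, so this differs
model.reset_states()  # zero the carried state
out3 = model.predict(x)  # matches the first, fresh-state run
np.testing.assert_allclose(out1, out3, atol=1e-5)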
+ vocab_size = 20 + embedding_dim = 10 + batch_size = 8 + timestep = 12 + units = 5 + x = np.random.randint(0, vocab_size, size=(batch_size, timestep)) + y = np.random.randint(0, vocab_size, size=(batch_size, timestep)) + + model = keras.Sequential( + [ + keras.layers.Embedding( + vocab_size, + embedding_dim, + batch_input_shape=[batch_size, timestep], + ), + keras.layers.LSTM(units, return_sequences=True, stateful=True), + keras.layers.Dense(vocab_size), + ] + ) + model.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, epochs=1, shuffle=False) + + def test_dropout_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={"units": units, "dropout": 0.1, "recurrent_dropout": 0.1}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_bidirectional(self): + batch = 128 + timestep = 20 + vocab_size = 1000 + model = keras.Sequential( + [ + keras.layers.Embedding(vocab_size, 64), + keras.layers.Bidirectional( + keras.layers.LSTM(64, return_sequences=True) + ), + keras.layers.Bidirectional(keras.layers.LSTM(32)), + keras.layers.Dense(64, activation="relu"), + keras.layers.Dense(1, activation="sigmoid"), + ] + ) + + model.compile( + loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"] + ) + + x = np.random.randint(0, vocab_size, size=(batch, timestep)) + y = np.random.randint(0, 1, size=(batch)) + model.fit(x, y, epochs=1, shuffle=False) + model.evaluate(x, y) + model.predict(x) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." + ), + ) + @test_utils.run_v2_only + def test_explicit_device_with_go_backward_and_mask(self): + batch_size = 8 + timestep = 7 + masksteps = 5 + units = 4 + + inputs = np.random.randn(batch_size, timestep, units).astype(np.float32) + mask = np.ones((batch_size, timestep)).astype(bool) + mask[:, masksteps:] = 0 + + lstm_layer = keras.layers.LSTM( + units, return_sequences=True, go_backwards=True + ) + with test_utils.device(should_use_gpu=True): + outputs_masked = lstm_layer(inputs, mask=tf.constant(mask)) + outputs_trimmed = lstm_layer(inputs[:, :masksteps]) + self.assertAllClose(outputs_masked[:, -masksteps:], outputs_trimmed) + + @tf_test_util.enable_output_all_intermediates + def test_v1_session_behavior(self): + with tf.compat.v1.get_default_graph().as_default(): + # See b/139132348 for more details. + x = np.random.uniform(size=(100, 4, 8)) + y = np.random.uniform(size=(100, 1)) + dataset = ( + tf.data.Dataset.from_tensor_slices((x, y)) + .shuffle(100) + .batch(32) + ) + + inp = keras.layers.Input(shape=(4, 8)) + layer = keras.layers.LSTM(1)(inp) + layer = keras.layers.Dense(1)(layer) + + model = keras.models.Model(inp, layer) + + model.compile(loss="mse", optimizer="sgd") + model.fit(dataset) + + def test_with_fully_masked_inputs(self): + num_samples = 8 + timestep = 5 + embedding_dim = 4 + vocab_size = 20 + units = 2 + + inputs = np.random.randint(0, vocab_size, size=(num_samples, timestep)) + # Set the first inputs to be fully zero. 
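This test leans on `mask_zero=True`, which treats token id 0 as padding: the embedding layer derives a boolean mask for downstream RNNs, and an all-zero row yields an all-`False` mask that the LSTM must tolerate without crashing. A small sketch (assuming eager TF 2.x):

import numpy as np
from tensorflow import keras

emb = keras.layers.Embedding(20, 4, mask_zero=True)
ids = np.array([[0, 0, 0], [3, 7, 0]])
mask = emb.compute_mask(ids)
# mask -> [[False, False, False], [True, True, False]]; the first row
# is fully masked, the case this test guards against.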
+ inputs[0, :] = 0.0 + + model = keras.models.Sequential() + model.add( + keras.layers.Embedding( + vocab_size, + embedding_dim, + mask_zero=True, + input_length=timestep, + batch_input_shape=(num_samples, timestep), + ) + ) + layer = keras.layers.LSTM(units) + model.add(layer) + model.compile( + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + # Make sure it doesn't crash with cudnn kernel. + model.predict(inputs) + + # TODO (b/169895267): test with xla_gpu is disabled. + def test_deepcopy(self): + if not tf.executing_eagerly(): + self.skipTest("v2-only test") + original_layer = keras.layers.LSTM(5) + copied_layer = copy.deepcopy(original_layer) + self.assertEqual(copied_layer.units, 5) + self.assertEqual( + original_layer.get_config(), copied_layer.get_config() + ) + + # Copy layer before layer call on inputs without weight initialization. + inputs = np.random.normal(size=[32, 10, 8]).astype(np.float32) + original_layer = keras.layers.LSTM(4) + copied_layer = copy.deepcopy(original_layer) + outputs = original_layer(inputs) + copied_outputs = copied_layer(inputs) + self.assertNotAllClose( + self.evaluate(outputs), self.evaluate(copied_outputs) + ) + + # Copy layer after layer call on inputs with weight initialization. + original_layer = keras.layers.LSTM(4) + outputs = original_layer(inputs) + copied_layer = copy.deepcopy(original_layer) + copied_outputs = copied_layer(inputs) + self.assertAllClose( + self.evaluate(outputs), self.evaluate(copied_outputs) + ) + + def _test_runtime_with_model(self, model): + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=self.batch, + test_samples=0, + input_shape=(self.timestep, self.input_shape), + num_classes=self.output_shape, + ) + y_train = np_utils.to_categorical(y_train, self.output_shape) + + model.compile( + optimizer="sgd", + loss=["categorical_crossentropy", None], + run_eagerly=test_utils.should_run_eagerly(), + ) + + existing_loss = 0 + for _ in range(self.epoch): + history = model.fit(x_train, y_train) + loss_value = history.history["loss"][0] + + self.assertNotEqual(existing_loss, loss_value) + existing_loss = loss_value + + _, runtime_value = model.predict(x_train) + if not tf.sysconfig.get_build_info()["is_rocm_build"]: + if tf.test.is_gpu_available(): + self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) + else: + self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) + + @test_utils.run_v2_only + def test_LSTM_runtime(self): + layer = keras.layers.LSTM(self.rnn_state_size, return_runtime=True) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=tf.float32 + ) + + outputs, runtime = layer(inputs) + # Expand the runtime so that it is a 1D tensor instead of scalar. + # TF model does not work with scalar model output, especially during + # aggregation. + runtime = keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))( + runtime + ) + model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) + self._test_runtime_with_model(model) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." + ), + ) + @test_utils.run_v2_only + def test_LSTM_runtime_with_mask(self): + # Masking will affect which backend is selected based on whether the + # mask is strictly right padded. 
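The padding comment above is the crux of kernel selection in these runtime tests: only a strictly right-padded mask keeps the fused cuDNN path eligible, while any mid-sequence gap forces the generic kernel. A sketch of the two mask shapes involved (assuming TF 2.x; the runtime constants checked below come from the `gru_lstm_utils` module these tests import):

import tensorflow as tf

# Strictly right-padded: what tf.sequence_mask builds from lengths.
# This shape of mask keeps the fused (cuDNN) kernel selectable.
right_padded = tf.sequence_mask(tf.constant([5, 3]), maxlen=7)

# A gap in the middle is not right-padded, so only the generic kernel
# can run, regardless of available GPUs.
mixed = tf.constant(
    [
        [True, True, False, True, True, False, False],
        [True, True, True, True, True, False, False],
    ]
)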
+ layer = keras.layers.LSTM(self.rnn_state_size, return_runtime=True) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=tf.float32 + ) + masked_inputs = keras.layers.Masking()(inputs) + + outputs, runtime = layer(masked_inputs) + # Expand the runtime so that it is a 1D tensor instead of scalar. + # TF model does not work with scalar model output, especially during + # aggregation. + runtime = keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))( + runtime + ) + model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=self.batch, + test_samples=0, + input_shape=(self.timestep, self.input_shape), + num_classes=self.output_shape, + ) + y_train = np_utils.to_categorical(y_train, self.output_shape) + + model.compile( + optimizer="sgd", + loss=["categorical_crossentropy", None], + run_eagerly=test_utils.should_run_eagerly(), + ) + + model.fit(x_train, y_train) + + # Verify unpadded data. + _, runtime_value = model.predict(x_train) + if tf.test.is_gpu_available(): + self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) + else: + self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) + + # Update x/y to be right padded by setting the last timestep to 0 + x_train[:, -1, :] = 0 + y_train[:, -1] = 0 + _, runtime_value = model.predict(x_train) + if tf.test.is_gpu_available(): + self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_GPU) + else: + self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) + + # Further update x/y to be mix padded (masks in the middle), and verify + # that only the cpu kernel can be selected. + x_train[:, -3, :] = 0 + y_train[:, -3] = 0 + _, runtime_value = model.predict(x_train) + self.assertEqual(runtime_value[0], gru_lstm_utils.RUNTIME_CPU) + + @test_utils.run_v2_only + def test_LSTM_runtime_with_cond(self): + # This test demonstrates the graph rewrite by the grappler plugin under + # the condition that the function returns a different number of internal + # states. + layer = keras.layers.LSTM(self.rnn_state_size, return_runtime=True) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=tf.float32 + ) + + zeros = tf.zeros([self.batch, self.output_shape]) + dummy_runtime = gru_lstm_utils.runtime(gru_lstm_utils.RUNTIME_UNKNOWN) + a = tf.constant(0) + b = tf.constant(1) + # Will always run the lstm layer. + outputs, runtime = tf.cond( + tf.less(a, b), lambda: layer(inputs), lambda: (zeros, dummy_runtime) + ) + + # Expand the runtime so that it is a 1D tensor instead of scalar. + # TF model does not work with scalar model output, especially during + # aggregation. 
+ runtime = keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))( + runtime + ) + model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) + self._test_runtime_with_model(model) @test_combinations.run_all_keras_modes class LSTMLayerTest(test_combinations.TestCase): - - def test_return_sequences_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={'units': units, - 'return_sequences': True}, - input_shape=(num_samples, timesteps, embedding_dim)) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Double type is yet not supported in ROCm') - @test_utils.run_v2_only - def test_float64_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={'units': units, - 'return_sequences': True, - 'dtype': 'float64'}, - input_shape=(num_samples, timesteps, embedding_dim), - input_dtype='float64') - - def test_static_shape_inference_LSTM(self): - # Github issue: 15165 - timesteps = 3 - embedding_dim = 4 - units = 2 - - model = keras.models.Sequential() - inputs = keras.layers.Dense(embedding_dim, - input_shape=(timesteps, embedding_dim)) - model.add(inputs) - layer = keras.layers.LSTM(units, return_sequences=True) - model.add(layer) - outputs = model.layers[-1].output - self.assertEqual(outputs.shape.as_list(), [None, timesteps, units]) - - def test_dynamic_behavior_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim)) - model = keras.models.Sequential() - model.add(layer) - model.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - x = np.random.random((num_samples, timesteps, embedding_dim)) - y = np.random.random((num_samples, units)) - model.train_on_batch(x, y) - - def test_dropout_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={'units': units, - 'dropout': 0.1, - 'recurrent_dropout': 0.1}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_recurrent_dropout_with_implementation_restriction(self): - layer = keras.layers.LSTM(2, recurrent_dropout=0.1, implementation=2) - # The implementation is force to 1 due to the limit of recurrent_dropout. 
- self.assertEqual(layer.implementation, 1) - - @parameterized.parameters([0, 1, 2]) - def test_implementation_mode_LSTM(self, implementation_mode): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.LSTM, - kwargs={'units': units, - 'implementation': implementation_mode}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_constraints_LSTM(self): - embedding_dim = 4 - layer_class = keras.layers.LSTM - k_constraint = keras.constraints.max_norm(0.01) - r_constraint = keras.constraints.max_norm(0.01) - b_constraint = keras.constraints.max_norm(0.01) - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_constraint=k_constraint, - recurrent_constraint=r_constraint, - bias_constraint=b_constraint) - layer.build((None, None, embedding_dim)) - self.assertEqual(layer.cell.kernel.constraint, k_constraint) - self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) - self.assertEqual(layer.cell.bias.constraint, b_constraint) - - @parameterized.parameters([True, False]) - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input.') - def test_with_masking_layer_LSTM(self, unroll): - layer_class = keras.layers.LSTM - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - model.add(layer_class(units=5, return_sequences=True, unroll=unroll)) - model.compile( - loss='categorical_crossentropy', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - @parameterized.parameters([True, False]) - def test_masking_with_stacking_LSTM(self, unroll): - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - lstm_cells = [keras.layers.LSTMCell(10), keras.layers.LSTMCell(5)] - model.add(keras.layers.RNN( - lstm_cells, return_sequences=True, unroll=unroll)) - model.compile( - loss='categorical_crossentropy', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - def test_from_config_LSTM(self): - layer_class = keras.layers.LSTM - for stateful in (False, True): - l1 = layer_class(units=1, stateful=stateful) - l2 = layer_class.from_config(l1.get_config()) - assert l1.get_config() == l2.get_config() - - def test_deep_copy_LSTM(self): - cell = keras.layers.LSTMCell(5) - copied_cell = copy.deepcopy(cell) - self.assertEqual(copied_cell.units, 5) - self.assertEqual(cell.get_config(), copied_cell.get_config()) - - def test_specify_initial_state_keras_tensor(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - # Test with Keras tensor - inputs = keras.Input((timesteps, embedding_dim)) - initial_state = [keras.Input((units,)) for _ in range(num_states)] - layer = keras.layers.LSTM(units) - if len(initial_state) == 1: - output = layer(inputs, initial_state=initial_state[0]) - else: - output = layer(inputs, initial_state=initial_state) - self.assertTrue( - any(initial_state[0] is t - for t in layer._inbound_nodes[0].input_tensors)) - - model = keras.models.Model([inputs] 
+ initial_state, output) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.AdamOptimizer(), - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [np.random.random((num_samples, units)) - for _ in range(num_states)] - targets = np.random.random((num_samples, units)) - model.train_on_batch([inputs] + initial_state, targets) - - def test_specify_initial_state_non_keras_tensor(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - # Test with non-Keras tensor - inputs = keras.Input((timesteps, embedding_dim)) - initial_state = [keras.backend.random_normal_variable( - (num_samples, units), 0, 1) - for _ in range(num_states)] - layer = keras.layers.LSTM(units) - output = layer(inputs, initial_state=initial_state) - - model = keras.models.Model(inputs, output) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.AdamOptimizer(), - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - targets = np.random.random((num_samples, units)) - model.train_on_batch(inputs, targets) - - def test_reset_states_with_values(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - layer = keras.layers.LSTM(units, stateful=True) - layer.build((num_samples, timesteps, embedding_dim)) - layer.reset_states() - assert len(layer.states) == num_states - assert layer.states[0] is not None - self.assertAllClose( - keras.backend.eval(layer.states[0]), - np.zeros(keras.backend.int_shape(layer.states[0])), - atol=1e-4) - state_shapes = [keras.backend.int_shape(state) for state in layer.states] - values = [np.ones(shape) for shape in state_shapes] - if len(values) == 1: - values = values[0] - layer.reset_states(values) - self.assertAllClose( - keras.backend.eval(layer.states[0]), - np.ones(keras.backend.int_shape(layer.states[0])), - atol=1e-4) - - # Test with invalid data - with self.assertRaises(ValueError): - layer.reset_states([1] * (len(layer.states) + 1)) - - def test_specify_state_with_masking(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - inputs = keras.Input((timesteps, embedding_dim)) - _ = keras.layers.Masking()(inputs) - initial_state = [keras.Input((units,)) for _ in range(num_states)] - output = keras.layers.LSTM(units)(inputs, initial_state=initial_state) - - model = keras.models.Model([inputs] + initial_state, output) - model.compile( - loss='categorical_crossentropy', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [np.random.random((num_samples, units)) - for _ in range(num_states)] - targets = np.random.random((num_samples, units)) - model.train_on_batch([inputs] + initial_state, targets) - - def test_return_state(self): - num_states = 2 - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim)) - layer = keras.layers.LSTM(units, return_state=True, stateful=True) - outputs = layer(inputs) - state = outputs[1:] - assert len(state) == num_states - model = keras.models.Model(inputs, state[0]) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - state = model.predict(inputs) - self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4) - - def 
test_state_reuse(self): - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - - inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim)) - layer = keras.layers.LSTM(units, return_state=True, return_sequences=True) - outputs = layer(inputs) - output, state = outputs[0], outputs[1:] - output = keras.layers.LSTM(units)(output, initial_state=state) - model = keras.models.Model(inputs, output) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - outputs = model.predict(inputs) - - def test_initial_states_as_other_inputs(self): - timesteps = 3 - embedding_dim = 4 - units = 3 - num_samples = 2 - num_states = 2 - layer_class = keras.layers.LSTM - - # Test with Keras tensor - main_inputs = keras.Input((timesteps, embedding_dim)) - initial_state = [keras.Input((units,)) for _ in range(num_states)] - inputs = [main_inputs] + initial_state - - layer = layer_class(units) - output = layer(inputs) - self.assertTrue( - any(initial_state[0] is t - for t in layer._inbound_nodes[0].input_tensors)) - - model = keras.models.Model(inputs, output) - model.compile( - loss='categorical_crossentropy', - optimizer=tf.compat.v1.train.AdamOptimizer(), - run_eagerly=test_utils.should_run_eagerly()) - - main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [np.random.random((num_samples, units)) - for _ in range(num_states)] - targets = np.random.random((num_samples, units)) - model.train_on_batch([main_inputs] + initial_state, targets) - - def test_regularizers_LSTM(self): - embedding_dim = 4 - layer_class = keras.layers.LSTM - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_regularizer=keras.regularizers.l1(0.01), - recurrent_regularizer=keras.regularizers.l1(0.01), - bias_regularizer='l2', - activity_regularizer='l1') - layer.build((None, None, 2)) - self.assertEqual(len(layer.losses), 3) - x = keras.backend.variable(np.ones((2, 3, 2))) - layer(x) - if tf.executing_eagerly(): - self.assertEqual(len(layer.losses), 4) - else: - self.assertEqual(len(layer.get_losses_for(x)), 1) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input.') - def test_statefulness_LSTM(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer_class = keras.layers.LSTM - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - 4, - embedding_dim, - mask_zero=True, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = layer_class( - units, return_sequences=False, stateful=True, weights=None) - model.add(layer) - model.compile( - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - out1 = model.predict(np.ones((num_samples, timesteps))) - self.assertEqual(out1.shape, (num_samples, units)) - - # train once so that the states change - model.train_on_batch( - np.ones((num_samples, timesteps)), np.ones((num_samples, units))) - out2 = model.predict(np.ones((num_samples, timesteps))) - - # if the state is not reset, output should be different - self.assertNotEqual(out1.max(), out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out2.max(), out3.max()) - - # check that container-level reset_states() works - 
model.reset_states() - out4 = model.predict(np.ones((num_samples, timesteps))) - self.assertAllClose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out4.max(), out5.max()) - - # Check masking - layer.reset_states() - - left_padded_input = np.ones((num_samples, timesteps)) - left_padded_input[0, :1] = 0 - left_padded_input[1, :2] = 0 - out6 = model.predict(left_padded_input) - - layer.reset_states() - - right_padded_input = np.ones((num_samples, timesteps)) - right_padded_input[0, -1:] = 0 - right_padded_input[1, -2:] = 0 - out7 = model.predict(right_padded_input) - - self.assertAllClose(out7, out6, atol=1e-5) - - -if __name__ == '__main__': - tf.test.main() + def test_return_sequences_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={"units": units, "return_sequences": True}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="Double type is not yet supported in ROCm", + ) + @test_utils.run_v2_only + def test_float64_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={ + "units": units, + "return_sequences": True, + "dtype": "float64", + }, + input_shape=(num_samples, timesteps, embedding_dim), + input_dtype="float64", + ) + + def test_static_shape_inference_LSTM(self): + # GitHub issue: 15165 + timesteps = 3 + embedding_dim = 4 + units = 2 + + model = keras.models.Sequential() + inputs = keras.layers.Dense( + embedding_dim, input_shape=(timesteps, embedding_dim) + ) + model.add(inputs) + layer = keras.layers.LSTM(units, return_sequences=True) + model.add(layer) + outputs = model.layers[-1].output + self.assertEqual(outputs.shape.as_list(), [None, timesteps, units]) + + def test_dynamic_behavior_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim)) + model = keras.models.Sequential() + model.add(layer) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + x = np.random.random((num_samples, timesteps, embedding_dim)) + y = np.random.random((num_samples, units)) + model.train_on_batch(x, y) + + def test_dropout_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={"units": units, "dropout": 0.1, "recurrent_dropout": 0.1}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_recurrent_dropout_with_implementation_restriction(self): + layer = keras.layers.LSTM(2, recurrent_dropout=0.1, implementation=2) + # The implementation is forced to 1 due to the limit of + # recurrent_dropout. 
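Beyond the silent downgrade asserted below, non-zero `recurrent_dropout` is also one of the configurations (see `test_could_use_defun_backend` earlier in this hunk) that rules out the fused GPU kernel. A sketch relying on the same private `_could_use_gpu_kernel` flag these tests already inspect:

from tensorflow import keras

layer = keras.layers.LSTM(2, recurrent_dropout=0.1, implementation=2)
print(layer.implementation)  # 1: implementation=2 is silently downgraded
print(layer._could_use_gpu_kernel)  # False: generic kernel only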
+ self.assertEqual(layer.implementation, 1) + + @test_utils.run_v2_only + def test_dropout_variable_name(self): + layer = keras.layers.RNN( + keras.layers.LSTMCell(2, dropout=0.1, force_generator=True) + ) + layer(np.random.random((2, 3, 4))) + self.assertEqual( + layer.cell._random_generator._generator._state_var.name, + "rnn/lstm_cell/StateVar:0", + ) + + layer = keras.layers.LSTM(2, dropout=0.1, force_generator=True) + layer(np.random.random((2, 3, 4))) + self.assertEqual( + layer._random_generator._generator._state_var.name, + "lstm/StateVar:0", + ) + + @parameterized.parameters([0, 1, 2]) + def test_implementation_mode_LSTM(self, implementation_mode): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.LSTM, + kwargs={"units": units, "implementation": implementation_mode}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_constraints_LSTM(self): + embedding_dim = 4 + layer_class = keras.layers.LSTM + k_constraint = keras.constraints.max_norm(0.01) + r_constraint = keras.constraints.max_norm(0.01) + b_constraint = keras.constraints.max_norm(0.01) + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_constraint=k_constraint, + recurrent_constraint=r_constraint, + bias_constraint=b_constraint, + ) + layer.build((None, None, embedding_dim)) + self.assertEqual(layer.cell.kernel.constraint, k_constraint) + self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) + self.assertEqual(layer.cell.bias.constraint, b_constraint) + + @parameterized.parameters([True, False]) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="Skipping as ROCm MIOpen does not support padded input.", + ) + def test_with_masking_layer_LSTM(self, unroll): + layer_class = keras.layers.LSTM + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + model.add(layer_class(units=5, return_sequences=True, unroll=unroll)) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + @parameterized.parameters([True, False]) + def test_masking_with_stacking_LSTM(self, unroll): + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + lstm_cells = [keras.layers.LSTMCell(10), keras.layers.LSTMCell(5)] + model.add( + keras.layers.RNN(lstm_cells, return_sequences=True, unroll=unroll) + ) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + + def test_from_config_LSTM(self): + layer_class = keras.layers.LSTM + for stateful in (False, True): + l1 = layer_class(units=1, stateful=stateful) + l2 = layer_class.from_config(l1.get_config()) + assert l1.get_config() == l2.get_config() + + def test_deep_copy_LSTM(self): + cell = keras.layers.LSTMCell(5) + copied_cell = copy.deepcopy(cell) + self.assertEqual(copied_cell.units, 5) + self.assertEqual(cell.get_config(), copied_cell.get_config()) + + def test_specify_initial_state_keras_tensor(self): + num_states = 2 
+ timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + # Test with Keras tensor + inputs = keras.Input((timesteps, embedding_dim)) + initial_state = [keras.Input((units,)) for _ in range(num_states)] + layer = keras.layers.LSTM(units) + if len(initial_state) == 1: + output = layer(inputs, initial_state=initial_state[0]) + else: + output = layer(inputs, initial_state=initial_state) + self.assertTrue( + any( + initial_state[0] is t + for t in layer._inbound_nodes[0].input_tensors + ) + ) + + model = keras.models.Model([inputs] + initial_state, output) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.AdamOptimizer(), + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [ + np.random.random((num_samples, units)) for _ in range(num_states) + ] + targets = np.random.random((num_samples, units)) + model.train_on_batch([inputs] + initial_state, targets) + + def test_specify_initial_state_non_keras_tensor(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + # Test with non-Keras tensor + inputs = keras.Input((timesteps, embedding_dim)) + initial_state = [ + keras.backend.random_normal_variable((num_samples, units), 0, 1) + for _ in range(num_states) + ] + layer = keras.layers.LSTM(units) + output = layer(inputs, initial_state=initial_state) + + model = keras.models.Model(inputs, output) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.AdamOptimizer(), + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + targets = np.random.random((num_samples, units)) + model.train_on_batch(inputs, targets) + + def test_reset_states_with_values(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + layer = keras.layers.LSTM(units, stateful=True) + layer.build((num_samples, timesteps, embedding_dim)) + layer.reset_states() + assert len(layer.states) == num_states + assert layer.states[0] is not None + self.assertAllClose( + keras.backend.eval(layer.states[0]), + np.zeros(keras.backend.int_shape(layer.states[0])), + atol=1e-4, + ) + state_shapes = [ + keras.backend.int_shape(state) for state in layer.states + ] + values = [np.ones(shape) for shape in state_shapes] + if len(values) == 1: + values = values[0] + layer.reset_states(values) + self.assertAllClose( + keras.backend.eval(layer.states[0]), + np.ones(keras.backend.int_shape(layer.states[0])), + atol=1e-4, + ) + + # Test with invalid data + with self.assertRaises(ValueError): + layer.reset_states([1] * (len(layer.states) + 1)) + + def test_specify_state_with_masking(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + inputs = keras.Input((timesteps, embedding_dim)) + _ = keras.layers.Masking()(inputs) + initial_state = [keras.Input((units,)) for _ in range(num_states)] + output = keras.layers.LSTM(units)(inputs, initial_state=initial_state) + + model = keras.models.Model([inputs] + initial_state, output) + model.compile( + loss="categorical_crossentropy", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [ + np.random.random((num_samples, units)) for _ in range(num_states) + ] + targets = np.random.random((num_samples, units)) + model.train_on_batch([inputs] + initial_state, targets) + + def 
test_return_state(self): + num_states = 2 + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + inputs = keras.Input( + batch_shape=(num_samples, timesteps, embedding_dim) + ) + layer = keras.layers.LSTM(units, return_state=True, stateful=True) + outputs = layer(inputs) + state = outputs[1:] + assert len(state) == num_states + model = keras.models.Model(inputs, state[0]) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + state = model.predict(inputs) + self.assertAllClose( + keras.backend.eval(layer.states[0]), state, atol=1e-4 + ) + + def test_state_reuse(self): + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + + inputs = keras.Input( + batch_shape=(num_samples, timesteps, embedding_dim) + ) + layer = keras.layers.LSTM( + units, return_state=True, return_sequences=True + ) + outputs = layer(inputs) + output, state = outputs[0], outputs[1:] + output = keras.layers.LSTM(units)(output, initial_state=state) + model = keras.models.Model(inputs, output) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + outputs = model.predict(inputs) + + def test_initial_states_as_other_inputs(self): + timesteps = 3 + embedding_dim = 4 + units = 3 + num_samples = 2 + num_states = 2 + layer_class = keras.layers.LSTM + + # Test with Keras tensor + main_inputs = keras.Input((timesteps, embedding_dim)) + initial_state = [keras.Input((units,)) for _ in range(num_states)] + inputs = [main_inputs] + initial_state + + layer = layer_class(units) + output = layer(inputs) + self.assertTrue( + any( + initial_state[0] is t + for t in layer._inbound_nodes[0].input_tensors + ) + ) + + model = keras.models.Model(inputs, output) + model.compile( + loss="categorical_crossentropy", + optimizer=tf.compat.v1.train.AdamOptimizer(), + run_eagerly=test_utils.should_run_eagerly(), + ) + + main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [ + np.random.random((num_samples, units)) for _ in range(num_states) + ] + targets = np.random.random((num_samples, units)) + model.train_on_batch([main_inputs] + initial_state, targets) + + def test_regularizers_LSTM(self): + embedding_dim = 4 + layer_class = keras.layers.LSTM + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_regularizer=keras.regularizers.l1(0.01), + recurrent_regularizer=keras.regularizers.l1(0.01), + bias_regularizer="l2", + activity_regularizer="l1", + ) + layer.build((None, None, 2)) + self.assertEqual(len(layer.losses), 3) + x = keras.backend.variable(np.ones((2, 3, 2))) + layer(x) + if tf.executing_eagerly(): + self.assertEqual(len(layer.losses), 4) + else: + self.assertEqual(len(layer.get_losses_for(x)), 1) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message="Skipping as ROCm MIOpen does not support padded input.", + ) + def test_statefulness_LSTM(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer_class = keras.layers.LSTM + model = keras.models.Sequential() + model.add( + keras.layers.Embedding( + 4, + embedding_dim, + mask_zero=True, + input_length=timesteps, + batch_input_shape=(num_samples, timesteps), + ) + ) + layer = layer_class( + units, return_sequences=False, stateful=True, weights=None + ) + model.add(layer) + model.compile( + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + out1 = model.predict(np.ones((num_samples, timesteps))) + 
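test_state_reuse above feeds the states returned by one LSTM into the initial_state of a second. A minimal encoder-style sketch of the same wiring (assuming the public tf.keras API; sizes are illustrative):

# Sketch: reuse the final (h, c) states of one LSTM as the initial state
# of the next. With return_state=True the layer returns [sequences, h, c].
from tensorflow import keras

inputs = keras.Input(batch_shape=(2, 3, 4))
seq, h, c = keras.layers.LSTM(3, return_sequences=True, return_state=True)(inputs)
outputs = keras.layers.LSTM(3)(seq, initial_state=[h, c])
model = keras.Model(inputs, outputs)
model.summary()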
self.assertEqual(out1.shape, (num_samples, units)) + + # train once so that the states change + model.train_on_batch( + np.ones((num_samples, timesteps)), np.ones((num_samples, units)) + ) + out2 = model.predict(np.ones((num_samples, timesteps))) + + # if the state is not reset, output should be different + self.assertNotEqual(out1.max(), out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out2.max(), out3.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones((num_samples, timesteps))) + self.assertAllClose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out4.max(), out5.max()) + + # Check masking + layer.reset_states() + + left_padded_input = np.ones((num_samples, timesteps)) + left_padded_input[0, :1] = 0 + left_padded_input[1, :2] = 0 + out6 = model.predict(left_padded_input) + + layer.reset_states() + + right_padded_input = np.ones((num_samples, timesteps)) + right_padded_input[0, -1:] = 0 + right_padded_input[1, -2:] = 0 + out7 = model.predict(right_padded_input) + + self.assertAllClose(out7, out6, atol=1e-5) + + @test_utils.run_v2_only + def test_cloned_weight_names(self): + inp = keras.Input([None, 3]) + rnn = keras.layers.LSTM(units=3) + model = keras.Model(inp, rnn(inp)) + clone = keras.models.clone_model(model) + + model_names = [x.name for x in model.weights] + clone_names = [x.name for x in clone.weights] + self.assertEqual(model_names, clone_names) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/lstm_v1.py b/keras/layers/rnn/lstm_v1.py index d883879b12b9..78d4c700cbb6 100644 --- a/keras/layers/rnn/lstm_v1.py +++ b/keras/layers/rnn/lstm_v1.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Long Short-Term Memory V1 layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + from keras import activations from keras import constraints @@ -24,372 +24,381 @@ from keras.layers.rnn import rnn_utils from keras.layers.rnn.base_rnn import RNN +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -@keras_export(v1=['keras.layers.LSTMCell']) +@keras_export(v1=["keras.layers.LSTMCell"]) class LSTMCell(lstm.LSTMCell): - """Cell class for the LSTM layer. - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use - for the recurrent step. - Default: hard sigmoid (`hard_sigmoid`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, - used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - unit_forget_bias: Boolean. 
- If True, add 1 to the bias of the forget gate at initialization. - Setting it to true will also force `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al., 2015]( - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - kernel_constraint: Constraint function applied to - the `kernel` weights matrix. - recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. - dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. - - Call arguments: - inputs: A 2D tensor. - states: List of state tensors corresponding to the previous timestep. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. Only relevant when `dropout` or - `recurrent_dropout` is used. - """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - **kwargs): - super().__init__( + """Cell class for the LSTM layer. + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use + for the recurrent step. + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. + recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, + used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + unit_forget_bias: Boolean. + If True, add 1 to the bias of the forget gate at initialization. + Setting it to true will also force `bias_initializer="zeros"`. + This is recommended in [Jozefowicz et al., 2015]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + recurrent_regularizer: Regularizer function applied to + the `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + recurrent_constraint: Constraint function applied to + the `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. 
+ Fraction of the units to drop for + the linear transformation of the recurrent state. + + Call arguments: + inputs: A 2D tensor. + states: List of state tensors corresponding to the previous timestep. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. Only relevant when `dropout` or + `recurrent_dropout` is used. + """ + + def __init__( + self, units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - unit_forget_bias=unit_forget_bias, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=kwargs.pop('implementation', 1), - **kwargs) - - -@keras_export(v1=['keras.layers.LSTM']) + activation="tanh", + recurrent_activation="hard_sigmoid", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + unit_forget_bias=True, + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + **kwargs + ): + super().__init__( + units, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + unit_forget_bias=unit_forget_bias, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + implementation=kwargs.pop("implementation", 1), + **kwargs + ) + + +@keras_export(v1=["keras.layers.LSTM"]) class LSTM(RNN): - """Long Short-Term Memory layer - Hochreiter 1997. - - Note that this cell is not optimized for performance on GPU. Please use - `tf.compat.v1.keras.layers.CuDNNLSTM` for better performance on GPU. - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - recurrent_activation: Activation function to use - for the recurrent step. - Default: hard sigmoid (`hard_sigmoid`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs.. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, - used for the linear transformation of the recurrent state. - bias_initializer: Initializer for the bias vector. - unit_forget_bias: Boolean. - If True, add 1 to the bias of the forget gate at initialization. - Setting it to true will also force `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al., 2015]( - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). 
- kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation"). - kernel_constraint: Constraint function applied to - the `kernel` weights matrix. - recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix. - bias_constraint: Constraint function applied to the bias vector. - dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. - return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. - return_state: Boolean. Whether to return the last state - in addition to the output. - go_backwards: Boolean (default False). - If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default False). If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - unroll: Boolean (default False). - If True, the network will be unrolled, - else a symbolic loop will be used. - Unrolling can speed-up a RNN, - although it tends to be more memory-intensive. - Unrolling is only suitable for short sequences. - time_major: The shape format of the `inputs` and `outputs` tensors. - If True, the inputs and outputs will be in shape - `(timesteps, batch, ...)`, whereas in the False case, it will be - `(batch, timesteps, ...)`. Using `time_major = True` is a bit more - efficient because it avoids transposes at the beginning and end of the - RNN calculation. However, most TensorFlow data is batch-major, so by - default this function accepts input and emits output in batch-major - form. - - Call arguments: - inputs: A 3D tensor. - mask: Binary tensor of shape `(samples, timesteps)` indicating whether - a given timestep should be masked. An individual `True` entry indicates - that the corresponding timestep should be utilized, while a `False` - entry indicates that the corresponding timestep should be ignored. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is only relevant if `dropout` or - `recurrent_dropout` is used. - initial_state: List of initial state tensors to be passed to the first - call of the cell. - """ - - def __init__(self, - units, - activation='tanh', - recurrent_activation='hard_sigmoid', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - unit_forget_bias=True, - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - **kwargs): - implementation = kwargs.pop('implementation', 1) - if implementation == 0: - logging.warning('`implementation=0` has been deprecated, ' - 'and now defaults to `implementation=1`.' 
- 'Please update your layer call.') - if 'enable_caching_device' in kwargs: - cell_kwargs = {'enable_caching_device': - kwargs.pop('enable_caching_device')} - else: - cell_kwargs = {} - cell = LSTMCell( + """Long Short-Term Memory layer - Hochreiter 1997. + + Note that this cell is not optimized for performance on GPU. Please use + `tf.compat.v1.keras.layers.CuDNNLSTM` for better performance on GPU. + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + recurrent_activation: Activation function to use + for the recurrent step. + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs.. + recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, + used for the linear transformation of the recurrent state. + bias_initializer: Initializer for the bias vector. + unit_forget_bias: Boolean. + If True, add 1 to the bias of the forget gate at initialization. + Setting it to true will also force `bias_initializer="zeros"`. + This is recommended in [Jozefowicz et al., 2015]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + recurrent_regularizer: Regularizer function applied to + the `recurrent_kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation"). + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + recurrent_constraint: Constraint function applied to + the `recurrent_kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the recurrent state. + return_sequences: Boolean. Whether to return the last output + in the output sequence, or the full sequence. + return_state: Boolean. Whether to return the last state + in addition to the output. + go_backwards: Boolean (default False). + If True, process the input sequence backwards and return the + reversed sequence. + stateful: Boolean (default False). If True, the last state + for each sample at index i in a batch will be used as initial + state for the sample of index i in the following batch. + unroll: Boolean (default False). + If True, the network will be unrolled, + else a symbolic loop will be used. + Unrolling can speed-up a RNN, + although it tends to be more memory-intensive. + Unrolling is only suitable for short sequences. + time_major: The shape format of the `inputs` and `outputs` tensors. + If True, the inputs and outputs will be in shape + `(timesteps, batch, ...)`, whereas in the False case, it will be + `(batch, timesteps, ...)`. Using `time_major = True` is a bit more + efficient because it avoids transposes at the beginning and end of the + RNN calculation. 
However, most TensorFlow data is batch-major, so by + default this function accepts input and emits output in batch-major + form. + + Call arguments: + inputs: A 3D tensor. + mask: Binary tensor of shape `(samples, timesteps)` indicating whether + a given timestep should be masked. An individual `True` entry indicates + that the corresponding timestep should be utilized, while a `False` + entry indicates that the corresponding timestep should be ignored. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is only relevant if `dropout` or + `recurrent_dropout` is used. + initial_state: List of initial state tensors to be passed to the first + call of the cell. + """ + + def __init__( + self, units, - activation=activation, - recurrent_activation=recurrent_activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - unit_forget_bias=unit_forget_bias, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - implementation=implementation, - dtype=kwargs.get('dtype'), - trainable=kwargs.get('trainable', True), - **cell_kwargs) - super().__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - unroll=unroll, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.input_spec = [InputSpec(ndim=3)] - - def call(self, inputs, mask=None, training=None, initial_state=None): - return super().call( - inputs, mask=mask, training=training, initial_state=initial_state) - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def recurrent_activation(self): - return self.cell.recurrent_activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def unit_forget_bias(self): - return self.cell.unit_forget_bias - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - - @property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - @property - def implementation(self): - return self.cell.implementation - - def get_config(self): - config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'recurrent_activation': - activations.serialize(self.recurrent_activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - 
initializers.serialize(self.kernel_initializer),
-        'recurrent_initializer':
-            initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'unit_forget_bias':
-            self.unit_forget_bias,
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'recurrent_regularizer':
-            regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'recurrent_constraint':
-            constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout,
-        'implementation':
-            self.implementation
-    }
-    config.update(rnn_utils.config_for_enable_caching_device(self.cell))
-    base_config = super().get_config()
-    del base_config['cell']
-    return dict(list(base_config.items()) + list(config.items()))
-
-  @classmethod
-  def from_config(cls, config):
-    if 'implementation' in config and config['implementation'] == 0:
-      config['implementation'] = 1
-    return cls(**config)
+        activation="tanh",
+        recurrent_activation="hard_sigmoid",
+        use_bias=True,
+        kernel_initializer="glorot_uniform",
+        recurrent_initializer="orthogonal",
+        bias_initializer="zeros",
+        unit_forget_bias=True,
+        kernel_regularizer=None,
+        recurrent_regularizer=None,
+        bias_regularizer=None,
+        activity_regularizer=None,
+        kernel_constraint=None,
+        recurrent_constraint=None,
+        bias_constraint=None,
+        dropout=0.0,
+        recurrent_dropout=0.0,
+        return_sequences=False,
+        return_state=False,
+        go_backwards=False,
+        stateful=False,
+        unroll=False,
+        **kwargs
+    ):
+        implementation = kwargs.pop("implementation", 1)
+        if implementation == 0:
+            logging.warning(
+                "`implementation=0` has been deprecated, "
+                "and now defaults to `implementation=1`. "
+                "Please update your layer call."
+ ) + if "enable_caching_device" in kwargs: + cell_kwargs = { + "enable_caching_device": kwargs.pop("enable_caching_device") + } + else: + cell_kwargs = {} + cell = LSTMCell( + units, + activation=activation, + recurrent_activation=recurrent_activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + unit_forget_bias=unit_forget_bias, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + implementation=implementation, + dtype=kwargs.get("dtype"), + trainable=kwargs.get("trainable", True), + name="lstm_cell", + **cell_kwargs + ) + super().__init__( + cell, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + unroll=unroll, + **kwargs + ) + self.activity_regularizer = regularizers.get(activity_regularizer) + self.input_spec = [InputSpec(ndim=3)] + + def call(self, inputs, mask=None, training=None, initial_state=None): + return super().call( + inputs, mask=mask, training=training, initial_state=initial_state + ) + + @property + def units(self): + return self.cell.units + + @property + def activation(self): + return self.cell.activation + + @property + def recurrent_activation(self): + return self.cell.recurrent_activation + + @property + def use_bias(self): + return self.cell.use_bias + + @property + def kernel_initializer(self): + return self.cell.kernel_initializer + + @property + def recurrent_initializer(self): + return self.cell.recurrent_initializer + + @property + def bias_initializer(self): + return self.cell.bias_initializer + + @property + def unit_forget_bias(self): + return self.cell.unit_forget_bias + + @property + def kernel_regularizer(self): + return self.cell.kernel_regularizer + + @property + def recurrent_regularizer(self): + return self.cell.recurrent_regularizer + + @property + def bias_regularizer(self): + return self.cell.bias_regularizer + + @property + def kernel_constraint(self): + return self.cell.kernel_constraint + + @property + def recurrent_constraint(self): + return self.cell.recurrent_constraint + + @property + def bias_constraint(self): + return self.cell.bias_constraint + + @property + def dropout(self): + return self.cell.dropout + + @property + def recurrent_dropout(self): + return self.cell.recurrent_dropout + + @property + def implementation(self): + return self.cell.implementation + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "recurrent_activation": activations.serialize( + self.recurrent_activation + ), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "unit_forget_bias": self.unit_forget_bias, + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": 
constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + "implementation": self.implementation, + } + config.update(rnn_utils.config_for_enable_caching_device(self.cell)) + base_config = super().get_config() + del base_config["cell"] + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + if "implementation" in config and config["implementation"] == 0: + config["implementation"] = 1 + return cls(**config) diff --git a/keras/layers/rnn/lstm_v1_test.py b/keras/layers/rnn/lstm_v1_test.py index 0cf6ffa0dd92..f1d539985dd8 100644 --- a/keras/layers/rnn/lstm_v1_test.py +++ b/keras/layers/rnn/lstm_v1_test.py @@ -17,20 +17,21 @@ import time +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.layers.rnn import lstm from keras.layers.rnn import lstm_v1 from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.platform import tf_logging as logging - # Global config for grappler setting that is used for graph mode test. _rewrites = rewriter_config_pb2.RewriterConfig() _rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON @@ -41,281 +42,330 @@ @test_combinations.run_all_keras_modes(config=_config) class LSTMGraphRewriteTest(test_combinations.TestCase): + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." + ), + ) + @test_utils.run_v2_only + def test_lstm_feature_parity_v1_v2(self): + input_shape = 10 + rnn_state_size = 8 + timestep = 4 + batch = 20 + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=rnn_state_size, + random_seed=87654321, + ) + y_train = np_utils.to_categorical(y_train, rnn_state_size) + # For the last batch item of the test data, we filter out the last + # timestep to simulate the variable length sequence and masking test. 
+ x_train[-2:, -1, :] = 0.0 + y_train[-2:] = 0 + + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) + masked_input = keras.layers.Masking()(inputs) + lstm_layer = lstm_v1.LSTM( + rnn_state_size, recurrent_activation="sigmoid" + ) + output = lstm_layer(masked_input) + lstm_model = keras.models.Model(inputs, output) + weights = lstm_model.get_weights() + y_1 = lstm_model.predict(x_train) + lstm_model.compile("rmsprop", "mse") + lstm_model.fit(x_train, y_train) + y_2 = lstm_model.predict(x_train) + + with test_utils.device(should_use_gpu=True): + cudnn_layer = lstm.LSTM(rnn_state_size) + cudnn_model = keras.models.Model(inputs, cudnn_layer(masked_input)) + cudnn_model.set_weights(weights) + y_3 = cudnn_model.predict(x_train) + cudnn_model.compile("rmsprop", "mse") + cudnn_model.fit(x_train, y_train) + y_4 = cudnn_model.predict(x_train) + + self.assertAllClose(y_1, y_3, rtol=1e-5, atol=2e-5) + self.assertAllClose(y_2, y_4, rtol=1e-5, atol=2e-5) + + @parameterized.named_parameters( + # test_name, time_major, go_backwards + ("normal", False, False), + ("time_major", True, False), + ("go_backwards", False, True), + ("both", True, True), + ) + def test_time_major_and_go_backward_v1_v2(self, time_major, go_backwards): + input_shape = 10 + rnn_state_size = 8 + timestep = 4 + batch = 100 + + x_train = np.random.random((batch, timestep, input_shape)) + + def build_model(layer_cls): + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) + layer = layer_cls( + rnn_state_size, + recurrent_activation="sigmoid", + time_major=time_major, + return_sequences=True, + go_backwards=go_backwards, + ) + if time_major: + converted_input = keras.layers.Lambda( + lambda t: tf.transpose(t, [1, 0, 2]) + )(inputs) + outputs = layer(converted_input) + outputs = keras.layers.Lambda( + lambda t: tf.transpose(t, [1, 0, 2]) + )(outputs) + else: + outputs = layer(inputs) + return keras.models.Model(inputs, outputs) + + lstm_model = build_model(lstm_v1.LSTM) + y_ref = lstm_model.predict(x_train) + weights = lstm_model.get_weights() + + lstm_v2_model = build_model(lstm.LSTM) + lstm_v2_model.set_weights(weights) + y = lstm_v2_model.predict(x_train) + + self.assertAllClose(y, y_ref) + + input_shape = 10 + rnn_state_size = 8 + output_shape = 8 + timestep = 4 + batch = 100 + epoch = 10 + + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=batch, + test_samples=0, + input_shape=(timestep, input_shape), + num_classes=output_shape, + ) + y_train = np_utils.to_categorical(y_train, output_shape) + + layer = lstm.LSTM(rnn_state_size) + + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_lstm_feature_parity_v1_v2(self): - input_shape = 10 - rnn_state_size = 8 - timestep = 4 - batch = 20 - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=rnn_state_size, - random_seed=87654321) - y_train = np_utils.to_categorical(y_train, rnn_state_size) - # For the last batch item of the test data, we filter out the last - # timestep to simulate the variable length sequence and masking test. 
- x_train[-2:, -1, :] = 0.0 - y_train[-2:] = 0 - - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - masked_input = keras.layers.Masking()(inputs) - lstm_layer = lstm_v1.LSTM(rnn_state_size, recurrent_activation='sigmoid') - output = lstm_layer(masked_input) - lstm_model = keras.models.Model(inputs, output) - weights = lstm_model.get_weights() - y_1 = lstm_model.predict(x_train) - lstm_model.compile('rmsprop', 'mse') - lstm_model.fit(x_train, y_train) - y_2 = lstm_model.predict(x_train) - - with test_utils.device(should_use_gpu=True): - cudnn_layer = lstm.LSTM(rnn_state_size) - cudnn_model = keras.models.Model(inputs, cudnn_layer(masked_input)) - cudnn_model.set_weights(weights) - y_3 = cudnn_model.predict(x_train) - cudnn_model.compile('rmsprop', 'mse') - cudnn_model.fit(x_train, y_train) - y_4 = cudnn_model.predict(x_train) - - self.assertAllClose(y_1, y_3, rtol=1e-5, atol=2e-5) - self.assertAllClose(y_2, y_4, rtol=1e-5, atol=2e-5) - - @parameterized.named_parameters( - # test_name, time_major, go_backwards - ('normal', False, False), - ('time_major', True, False), - ('go_backwards', False, True), - ('both', True, True), - ) - def test_time_major_and_go_backward_v1_v2(self, time_major, go_backwards): - input_shape = 10 - rnn_state_size = 8 - timestep = 4 - batch = 100 - - x_train = np.random.random((batch, timestep, input_shape)) - - def build_model(layer_cls): - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - layer = layer_cls(rnn_state_size, - recurrent_activation='sigmoid', - time_major=time_major, - return_sequences=True, - go_backwards=go_backwards) - if time_major: - converted_input = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(inputs) - outputs = layer(converted_input) - outputs = keras.layers.Lambda( - lambda t: tf.transpose(t, [1, 0, 2]))(outputs) - else: outputs = layer(inputs) - return keras.models.Model(inputs, outputs) - - lstm_model = build_model(lstm_v1.LSTM) - y_ref = lstm_model.predict(x_train) - weights = lstm_model.get_weights() - - lstm_v2_model = build_model(lstm.LSTM) - lstm_v2_model.set_weights(weights) - y = lstm_v2_model.predict(x_train) - - self.assertAllClose(y, y_ref) - - input_shape = 10 - rnn_state_size = 8 - output_shape = 8 - timestep = 4 - batch = 100 - epoch = 10 - - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=batch, - test_samples=0, - input_shape=(timestep, input_shape), - num_classes=output_shape) - y_train = np_utils.to_categorical(y_train, output_shape) - - layer = lstm.LSTM(rnn_state_size) - - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - - outputs = layer(inputs) - model = keras.models.Model(inputs, outputs) - model.compile('rmsprop', loss='mse') - model.fit(x_train, y_train, epochs=epoch) - model.evaluate(x_train, y_train) - model.predict(x_train) - - @tf.test.disable_with_predicate( - pred=tf.test.is_built_with_rocm, - skip_message='Skipping as ROCm MIOpen does not support padded input yet.') - @test_utils.run_v2_only - def test_explicit_device_with_go_backward_and_mask_v1(self): - batch_size = 8 - timestep = 7 - masksteps = 5 - units = 4 - - inputs = np.random.randn(batch_size, timestep, units).astype(np.float32) - mask = np.ones((batch_size, timestep)).astype(np.bool) - mask[:, masksteps:] = 0 - - lstm_v1_layer = lstm_v1.LSTM( - units, return_sequences=True, go_backwards=True) - with test_utils.device(should_use_gpu=True): - outputs_masked_v1 = lstm_v1_layer(inputs, mask=tf.constant(mask)) - 
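The masking test here (old version above, reformatted version below) checks that masking padded steps is equivalent to trimming them away. A minimal forward-direction sketch of that invariant (assuming eager execution, a right-padded mask, and the public tf.keras API):

# Sketch: for right-padded inputs, outputs on the valid steps match the
# outputs obtained by simply trimming the padding off.
import numpy as np
import tensorflow as tf
from tensorflow import keras

batch, timestep, masksteps, units = 8, 7, 5, 4
x = np.random.randn(batch, timestep, units).astype(np.float32)
mask = np.ones((batch, timestep), dtype=bool)
mask[:, masksteps:] = False  # right padding

layer = keras.layers.LSTM(units, return_sequences=True)
masked = layer(x, mask=tf.constant(mask))
trimmed = layer(x[:, :masksteps])  # same weights, padding removed
np.testing.assert_allclose(masked[:, :masksteps], trimmed, atol=1e-5)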
outputs_trimmed_v1 = lstm_v1_layer(inputs[:, :masksteps]) - self.assertAllClose(outputs_masked_v1[:, -masksteps:], outputs_trimmed_v1) + model = keras.models.Model(inputs, outputs) + model.compile("rmsprop", loss="mse") + model.fit(x_train, y_train, epochs=epoch) + model.evaluate(x_train, y_train) + model.predict(x_train) + + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message=( + "Skipping as ROCm MIOpen does not support padded input yet." + ), + ) + @test_utils.run_v2_only + def test_explicit_device_with_go_backward_and_mask_v1(self): + batch_size = 8 + timestep = 7 + masksteps = 5 + units = 4 + + inputs = np.random.randn(batch_size, timestep, units).astype(np.float32) + mask = np.ones((batch_size, timestep)).astype(bool) + mask[:, masksteps:] = 0 + + lstm_v1_layer = lstm_v1.LSTM( + units, return_sequences=True, go_backwards=True + ) + with test_utils.device(should_use_gpu=True): + outputs_masked_v1 = lstm_v1_layer(inputs, mask=tf.constant(mask)) + outputs_trimmed_v1 = lstm_v1_layer(inputs[:, :masksteps]) + self.assertAllClose( + outputs_masked_v1[:, -masksteps:], outputs_trimmed_v1 + ) class LSTMPerformanceTest(tf.test.Benchmark): + def _measure_performance(self, test_config, model, x_train, y_train): + batch = test_config["batch"] + epoch = test_config["epoch"] + warmup_epoch = test_config["warmup_epoch"] + + # warm up the model + model.fit(x_train, y_train, batch_size=batch, epochs=warmup_epoch) + start_time = time.time() + model.fit( + x_train, y_train, batch_size=batch, epochs=epoch - warmup_epoch + ) + end_time = time.time() + return (end_time - start_time) / (epoch - warmup_epoch) + + def _time_performance_run_cudnn_lstm(self, test_config, x_train, y_train): + # Get the performance number for standard Cudnn LSTM + input_shape = test_config["input_shape"] + rnn_state_size = test_config["rnn_state_size"] + timestep = test_config["timestep"] + + cudnn_lstm_layer = keras.layers.CuDNNLSTM(rnn_state_size) + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) + + outputs = cudnn_lstm_layer(inputs) + model = keras.models.Model(inputs, outputs) + model.compile("sgd", "mse") + + sec_per_epoch = self._measure_performance( + test_config, model, x_train, y_train + ) + logging.info( + "Average performance for %s per epoch is: %s", + "CuDNN LSTM", + sec_per_epoch, + ) + return sec_per_epoch + + def _time_performance_run_unifed_lstm_gpu( + self, test_config, x_train, y_train + ): + # Get performance number for lstm_v2 with grappler swap the impl + input_shape = test_config["input_shape"] + rnn_state_size = test_config["rnn_state_size"] + timestep = test_config["timestep"] + + layer = keras.layers.LSTM(rnn_state_size) + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) + + outputs = layer(inputs) + model = keras.models.Model(inputs, outputs) + model.compile("sgd", "mse") + + sec_per_epoch = self._measure_performance( + test_config, model, x_train, y_train + ) + logging.info( + "Average performance for %s per epoch is: %s", + "LSTM V2", + sec_per_epoch, + ) + return sec_per_epoch + + def _time_performance_run_normal_lstm(self, test_config, x_train, y_train): + # Get performance number for standard LSTM on GPU. 
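_measure_performance above runs a warm-up pass before timing the real epochs. The pattern, stripped to its essentials (a sketch, not the benchmark itself; verbose=0 is an addition here to keep timings clean):

# Sketch: per-epoch wall time with the warm-up excluded from the measurement.
import time

def seconds_per_epoch(model, x, y, batch=64, epochs=10, warmup=1):
    model.fit(x, y, batch_size=batch, epochs=warmup, verbose=0)  # warm up
    start = time.time()
    model.fit(x, y, batch_size=batch, epochs=epochs - warmup, verbose=0)
    return (time.time() - start) / (epochs - warmup)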
+ input_shape = test_config["input_shape"] + rnn_state_size = test_config["rnn_state_size"] + timestep = test_config["timestep"] + + layer = lstm_v1.LSTM(rnn_state_size) + inputs = keras.layers.Input( + shape=[timestep, input_shape], dtype=tf.float32 + ) - def _measure_performance(self, test_config, model, x_train, y_train): - batch = test_config['batch'] - epoch = test_config['epoch'] - warmup_epoch = test_config['warmup_epoch'] - - # warm up the model - model.fit(x_train, y_train, batch_size=batch, epochs=warmup_epoch) - start_time = time.time() - model.fit(x_train, y_train, batch_size=batch, epochs=epoch - warmup_epoch) - end_time = time.time() - return (end_time - start_time) / (epoch - warmup_epoch) - - def _time_performance_run_cudnn_lstm(self, test_config, x_train, y_train): - # Get the performance number for standard Cudnn LSTM - input_shape = test_config['input_shape'] - rnn_state_size = test_config['rnn_state_size'] - timestep = test_config['timestep'] - - cudnn_lstm_layer = keras.layers.CuDNNLSTM(rnn_state_size) - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - - outputs = cudnn_lstm_layer(inputs) - model = keras.models.Model(inputs, outputs) - model.compile('sgd', 'mse') - - sec_per_epoch = self._measure_performance( - test_config, model, x_train, y_train) - logging.info('Average performance for %s per epoch is: %s', - 'CuDNN LSTM', sec_per_epoch) - return sec_per_epoch - - def _time_performance_run_unifed_lstm_gpu( - self, test_config, x_train, y_train): - # Get performance number for lstm_v2 with grappler swap the impl - input_shape = test_config['input_shape'] - rnn_state_size = test_config['rnn_state_size'] - timestep = test_config['timestep'] - - layer = keras.layers.LSTM(rnn_state_size) - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - - outputs = layer(inputs) - model = keras.models.Model(inputs, outputs) - model.compile('sgd', 'mse') - - sec_per_epoch = self._measure_performance( - test_config, model, x_train, y_train) - logging.info('Average performance for %s per epoch is: %s', - 'LSTM V2', sec_per_epoch) - return sec_per_epoch - - def _time_performance_run_normal_lstm( - self, test_config, x_train, y_train): - # Get performance number for standard LSTM on GPU. - input_shape = test_config['input_shape'] - rnn_state_size = test_config['rnn_state_size'] - timestep = test_config['timestep'] - - layer = lstm_v1.LSTM(rnn_state_size) - inputs = keras.layers.Input( - shape=[timestep, input_shape], dtype=tf.float32) - - outputs = layer(inputs) - model = keras.models.Model(inputs, outputs) - model.compile('sgd', 'mse') - - sec_per_epoch = self._measure_performance( - test_config, model, x_train, y_train) - logging.info('Average performance for %s per epoch is: %s', - 'Normal LSTM', sec_per_epoch) - return sec_per_epoch - - def _benchmark_performance_with_standard_cudnn_impl(self): - if not tf.test.is_gpu_available(): - self.skipTest('performance test will only run on GPU') - - mode = 'eager' if tf.executing_eagerly() else 'graph' - batch = 64 - num_batch = 10 - test_config = { - 'input_shape': 128, - 'rnn_state_size': 64, - 'output_shape': 64, - 'timestep': 50, - 'batch': batch, - 'epoch': 20, - # The performance for warmup epoch is ignored. 
- 'warmup_epoch': 1, - } - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=(batch * num_batch), - test_samples=0, - input_shape=(test_config['timestep'], test_config['input_shape']), - num_classes=test_config['output_shape']) - y_train = np_utils.to_categorical(y_train, test_config['output_shape']) - - cudnn_sec_per_epoch = self._time_performance_run_cudnn_lstm( - test_config, x_train, y_train) - lstm_v2_sec_per_epoch = self._time_performance_run_unifed_lstm_gpu( - test_config, x_train, y_train) - normal_lstm_sec_per_epoch = self._time_performance_run_normal_lstm( - test_config, x_train, y_train) - - cudnn_vs_v2 = cudnn_sec_per_epoch / lstm_v2_sec_per_epoch - v2_vs_normal = normal_lstm_sec_per_epoch / lstm_v2_sec_per_epoch - - self.report_benchmark(name='keras_cudnn_lstm_' + mode, - wall_time=cudnn_sec_per_epoch, - iters=test_config['epoch'], - extras=test_config) - self.report_benchmark(name='keras_lstm_v2_' + mode, - wall_time=lstm_v2_sec_per_epoch, - iters=test_config['epoch'], - extras=test_config) - self.report_benchmark(name='keras_canonical_lstm_' + mode, - wall_time=normal_lstm_sec_per_epoch, - iters=test_config['epoch'], - extras=test_config) - - logging.info('Expect the performance of LSTM V2 is within 80% of ' - 'cuDNN LSTM, got {0:.2f}%'.format(cudnn_vs_v2 * 100)) - logging.info('Expect the performance of LSTM V2 is more than 5 times' - ' of normal LSTM, got {0:.2f}'.format(v2_vs_normal)) - - def benchmark_performance_graph(self): - with tf.compat.v1.get_default_graph().as_default(): - with tf.compat.v1.Session(config=_config): - self._benchmark_performance_with_standard_cudnn_impl() - - def benchmark_performance_eager(self): - with tf.__internal__.eager_context.eager_mode(): - self._benchmark_performance_with_standard_cudnn_impl() - - -if __name__ == '__main__': - tf.test.main() + outputs = layer(inputs) + model = keras.models.Model(inputs, outputs) + model.compile("sgd", "mse") + + sec_per_epoch = self._measure_performance( + test_config, model, x_train, y_train + ) + logging.info( + "Average performance for %s per epoch is: %s", + "Normal LSTM", + sec_per_epoch, + ) + return sec_per_epoch + + def _benchmark_performance_with_standard_cudnn_impl(self): + if not tf.test.is_gpu_available(): + self.skipTest("performance test will only run on GPU") + + mode = "eager" if tf.executing_eagerly() else "graph" + batch = 64 + num_batch = 10 + test_config = { + "input_shape": 128, + "rnn_state_size": 64, + "output_shape": 64, + "timestep": 50, + "batch": batch, + "epoch": 20, + # The performance for warmup epoch is ignored. 
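The graph-mode benchmark below runs inside a v1 session built from the module-level _config so that grappler's implementation selector can swap the LSTM v2 kernel for the cuDNN one. A sketch of how such a config is assembled, mirroring the module-level setup earlier in this file (the ConfigProto/GraphOptions wrapping is an assumption of the standard protobuf layout):

# Sketch: session config with grappler's implementation selector enabled.
import tensorflow.compat.v2 as tf
from tensorflow.core.protobuf import rewriter_config_pb2

rewrites = rewriter_config_pb2.RewriterConfig()
rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
config = tf.compat.v1.ConfigProto(
    graph_options=tf.compat.v1.GraphOptions(rewrite_options=rewrites)
)
with tf.compat.v1.Graph().as_default():
    with tf.compat.v1.Session(config=config):
        pass  # build and fit models here; grappler may pick the cuDNN impl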
+ "warmup_epoch": 1, + } + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=(batch * num_batch), + test_samples=0, + input_shape=(test_config["timestep"], test_config["input_shape"]), + num_classes=test_config["output_shape"], + ) + y_train = np_utils.to_categorical(y_train, test_config["output_shape"]) + + cudnn_sec_per_epoch = self._time_performance_run_cudnn_lstm( + test_config, x_train, y_train + ) + lstm_v2_sec_per_epoch = self._time_performance_run_unifed_lstm_gpu( + test_config, x_train, y_train + ) + normal_lstm_sec_per_epoch = self._time_performance_run_normal_lstm( + test_config, x_train, y_train + ) + + cudnn_vs_v2 = cudnn_sec_per_epoch / lstm_v2_sec_per_epoch + v2_vs_normal = normal_lstm_sec_per_epoch / lstm_v2_sec_per_epoch + + self.report_benchmark( + name="keras_cudnn_lstm_" + mode, + wall_time=cudnn_sec_per_epoch, + iters=test_config["epoch"], + extras=test_config, + ) + self.report_benchmark( + name="keras_lstm_v2_" + mode, + wall_time=lstm_v2_sec_per_epoch, + iters=test_config["epoch"], + extras=test_config, + ) + self.report_benchmark( + name="keras_canonical_lstm_" + mode, + wall_time=normal_lstm_sec_per_epoch, + iters=test_config["epoch"], + extras=test_config, + ) + + logging.info( + "Expect the performance of LSTM V2 is within 80% of " + "cuDNN LSTM, got {0:.2f}%".format(cudnn_vs_v2 * 100) + ) + logging.info( + "Expect the performance of LSTM V2 is more than 5 times" + " of normal LSTM, got {0:.2f}".format(v2_vs_normal) + ) + + def benchmark_performance_graph(self): + with tf.compat.v1.get_default_graph().as_default(): + with tf.compat.v1.Session(config=_config): + self._benchmark_performance_with_standard_cudnn_impl() + + def benchmark_performance_eager(self): + with tf.__internal__.eager_context.eager_mode(): + self._benchmark_performance_with_standard_cudnn_impl() + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/rnn_utils.py b/keras/layers/rnn/rnn_utils.py index 28ba910100c6..c11bb3762fd5 100644 --- a/keras/layers/rnn/rnn_utils.py +++ b/keras/layers/rnn/rnn_utils.py @@ -13,171 +13,183 @@ # limitations under the License. # ============================================================================== """Utilities for RNN cells and layers.""" -# pylint: disable=protected-access -from keras.utils import control_flow_util + import tensorflow.compat.v2 as tf +from keras.utils import control_flow_util + +# isort: off from tensorflow.python.platform import tf_logging as logging def standardize_args(inputs, initial_state, constants, num_constants): - """Standardizes `__call__` to a single list of tensor inputs. - - When running a model loaded from a file, the input tensors - `initial_state` and `constants` can be passed to `RNN.__call__()` as part - of `inputs` instead of by the dedicated keyword arguments. This method - makes sure the arguments are separated and that `initial_state` and - `constants` are lists of tensors (or None). - - Args: - inputs: Tensor or list/tuple of tensors. which may include constants - and initial states. In that case `num_constant` must be specified. - initial_state: Tensor or list of tensors or None, initial states. - constants: Tensor or list of tensors or None, constant tensors. - num_constants: Expected number of constants (if constants are passed as - part of the `inputs` list. - - Returns: - inputs: Single tensor or tuple of tensors. - initial_state: List of tensors or None. - constants: List of tensors or None. 
- """ - if isinstance(inputs, list): - # There are several situations here: - # In the graph mode, __call__ will be only called once. The initial_state - # and constants could be in inputs (from file loading). - # In the eager mode, __call__ will be called twice, once during - # rnn_layer(inputs=input_t, constants=c_t, ...), and second time will be - # model.fit/train_on_batch/predict with real np data. In the second case, - # the inputs will contain initial_state and constants as eager tensor. - # - # For either case, the real input is the first item in the list, which - # could be a nested structure itself. Then followed by initial_states, which - # could be a list of items, or list of list if the initial_state is complex - # structure, and finally followed by constants which is a flat list. - assert initial_state is None and constants is None - if num_constants: - constants = inputs[-num_constants:] - inputs = inputs[:-num_constants] - if len(inputs) > 1: - initial_state = inputs[1:] - inputs = inputs[:1] - - if len(inputs) > 1: - inputs = tuple(inputs) - else: - inputs = inputs[0] - - def to_list_or_none(x): - if x is None or isinstance(x, list): - return x - if isinstance(x, tuple): - return list(x) - return [x] - - initial_state = to_list_or_none(initial_state) - constants = to_list_or_none(constants) - - return inputs, initial_state, constants + """Standardizes `__call__` to a single list of tensor inputs. + + When running a model loaded from a file, the input tensors + `initial_state` and `constants` can be passed to `RNN.__call__()` as part + of `inputs` instead of by the dedicated keyword arguments. This method + makes sure the arguments are separated and that `initial_state` and + `constants` are lists of tensors (or None). + + Args: + inputs: Tensor or list/tuple of tensors. which may include constants + and initial states. In that case `num_constant` must be specified. + initial_state: Tensor or list of tensors or None, initial states. + constants: Tensor or list of tensors or None, constant tensors. + num_constants: Expected number of constants (if constants are passed as + part of the `inputs` list. + + Returns: + inputs: Single tensor or tuple of tensors. + initial_state: List of tensors or None. + constants: List of tensors or None. + """ + if isinstance(inputs, list): + # There are several situations here: + # In the graph mode, __call__ will be only called once. The + # initial_state and constants could be in inputs (from file loading). + # In the eager mode, __call__ will be called twice, once during + # rnn_layer(inputs=input_t, constants=c_t, ...), and second time will be + # model.fit/train_on_batch/predict with real np data. In the second + # case, the inputs will contain initial_state and constants as eager + # tensor. + # + # For either case, the real input is the first item in the list, which + # could be a nested structure itself. Then followed by initial_states, + # which could be a list of items, or list of list if the initial_state + # is complex structure, and finally followed by constants which is a + # flat list. 
+ assert initial_state is None and constants is None + if num_constants: + constants = inputs[-num_constants:] + inputs = inputs[:-num_constants] + if len(inputs) > 1: + initial_state = inputs[1:] + inputs = inputs[:1] + + if len(inputs) > 1: + inputs = tuple(inputs) + else: + inputs = inputs[0] + + def to_list_or_none(x): + if x is None or isinstance(x, list): + return x + if isinstance(x, tuple): + return list(x) + return [x] + + initial_state = to_list_or_none(initial_state) + constants = to_list_or_none(constants) + + return inputs, initial_state, constants def is_multiple_state(state_size): - """Check whether the state_size contains multiple states.""" - return (hasattr(state_size, '__len__') and - not isinstance(state_size, tf.TensorShape)) + """Check whether the state_size contains multiple states.""" + return hasattr(state_size, "__len__") and not isinstance( + state_size, tf.TensorShape + ) def generate_zero_filled_state_for_cell(cell, inputs, batch_size, dtype): - if inputs is not None: - batch_size = tf.shape(inputs)[0] - dtype = inputs.dtype - return generate_zero_filled_state(batch_size, cell.state_size, dtype) + if inputs is not None: + batch_size = tf.shape(inputs)[0] + dtype = inputs.dtype + return generate_zero_filled_state(batch_size, cell.state_size, dtype) def generate_zero_filled_state(batch_size_tensor, state_size, dtype): - """Generate a zero filled tensor with shape [batch_size, state_size].""" - if batch_size_tensor is None or dtype is None: - raise ValueError( - 'batch_size and dtype cannot be None while constructing initial state. ' - f'Received: batch_size={batch_size_tensor}, dtype={dtype}') - - def create_zeros(unnested_state_size): - flat_dims = tf.TensorShape(unnested_state_size).as_list() - init_state_size = [batch_size_tensor] + flat_dims - return tf.zeros(init_state_size, dtype=dtype) - - if tf.nest.is_nested(state_size): - return tf.nest.map_structure(create_zeros, state_size) - else: - return create_zeros(state_size) + """Generate a zero filled tensor with shape [batch_size, state_size].""" + if batch_size_tensor is None or dtype is None: + raise ValueError( + "batch_size and dtype cannot be None while constructing initial " + f"state. Received: batch_size={batch_size_tensor}, dtype={dtype}" + ) + + def create_zeros(unnested_state_size): + flat_dims = tf.TensorShape(unnested_state_size).as_list() + init_state_size = [batch_size_tensor] + flat_dims + return tf.zeros(init_state_size, dtype=dtype) + + if tf.nest.is_nested(state_size): + return tf.nest.map_structure(create_zeros, state_size) + else: + return create_zeros(state_size) def caching_device(rnn_cell): - """Returns the caching device for the RNN variable. - - This is useful for distributed training, when variable is not located as same - device as the training worker. By enabling the device cache, this allows - worker to read the variable once and cache locally, rather than read it every - time step from remote when it is needed. - - Note that this is assuming the variable that cell needs for each time step is - having the same value in the forward path, and only gets updated in the - backprop. It is true for all the default cells (SimpleRNN, GRU, LSTM). If the - cell body relies on any variable that gets updated every time step, then - caching device will cause it to read the stall value. - - Args: - rnn_cell: the rnn cell instance. - """ - if tf.executing_eagerly(): - # caching_device is not supported in eager mode. 
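As a quick hypothetical aside (not part of the patch), the zero-state helper above can be exercised directly; a nested `state_size`, as in an LSTM-style cell, yields one `[batch, dim]` zero tensor per entry:

```python
import tensorflow.compat.v2 as tf

from keras.layers.rnn import rnn_utils

# Nested state_size (two state tensors, as in an LSTM cell): one zero tensor
# is created per entry. A scalar state_size would yield a single tensor.
states = rnn_utils.generate_zero_filled_state(
    batch_size_tensor=8, state_size=[16, 16], dtype=tf.float32
)
print([s.shape for s in states])  # [TensorShape([8, 16]), TensorShape([8, 16])]
```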
- return None - if not getattr(rnn_cell, '_enable_caching_device', False): - return None - # Don't set a caching device when running in a loop, since it is possible that - # train steps could be wrapped in a tf.while_loop. In that scenario caching - # prevents forward computations in loop iterations from re-reading the - # updated weights. - if control_flow_util.IsInWhileLoop(tf.compat.v1.get_default_graph()): - logging.warning( - 'Variable read device caching has been disabled because the ' - 'RNN is in tf.while_loop loop context, which will cause ' - 'reading stalled value in forward path. This could slow down ' - 'the training due to duplicated variable reads. Please ' - 'consider updating your code to remove tf.while_loop if possible.') - return None - if (rnn_cell._dtype_policy.compute_dtype != - rnn_cell._dtype_policy.variable_dtype): - logging.warning( - 'Variable read device caching has been disabled since it ' - 'doesn\'t work with the mixed precision API. This is ' - 'likely to cause a slowdown for RNN training due to ' - 'duplicated read of variable for each timestep, which ' - 'will be significant in a multi remote worker setting. ' - 'Please consider disabling mixed precision API if ' - 'the performance has been affected.') - return None - # Cache the value on the device that access the variable. - return lambda op: op.device
+ """Returns the caching device for the RNN variable. + + This is useful for distributed training, when the variable is not located + on the same device as the training worker. By enabling the device cache, this + allows the worker to read the variable once and cache it locally, rather than + reading it from the remote device at every time step. + + Note that this assumes the variable that the cell needs at each time step + has the same value in the forward path, and only gets updated in the + backprop. This holds for all the default cells (SimpleRNN, GRU, LSTM). If + the cell body relies on any variable that gets updated every time step, then + the caching device will cause it to read a stale value. + + Args: + rnn_cell: the RNN cell instance. + """ + if tf.executing_eagerly(): + # caching_device is not supported in eager mode. + return None + if not getattr(rnn_cell, "_enable_caching_device", False): + return None + # Don't set a caching device when running in a loop, since it is possible + # that train steps could be wrapped in a tf.while_loop. In that scenario + # caching prevents forward computations in loop iterations from re-reading + # the updated weights. + if control_flow_util.IsInWhileLoop(tf.compat.v1.get_default_graph()): + logging.warning( + "Variable read device caching has been disabled because the " + "RNN is in a tf.while_loop context, which would cause " + "a stale value to be read in the forward path. This could slow down " + "training due to duplicated variable reads. Please " + "consider updating your code to remove tf.while_loop if possible." + ) + return None + if ( + rnn_cell._dtype_policy.compute_dtype + != rnn_cell._dtype_policy.variable_dtype + ): + logging.warning( + "Variable read device caching has been disabled since it " + "doesn't work with the mixed precision API. This is " + "likely to cause a slowdown for RNN training due to " + "duplicated variable reads at each timestep, which " + "will be significant in a multi remote worker setting. " + "Please consider disabling the mixed precision API if " + "performance has been affected." + ) + return None + # Cache the value on the device that accesses the variable.
+ return lambda op: op.device
def config_for_enable_caching_device(rnn_cell): - """Return the dict config for RNN cell wrt to enable_caching_device field. - - Since enable_caching_device is a internal implementation detail for speed up - the RNN variable read when running on the multi remote worker setting, we - don't want this config to be serialized constantly in the JSON. We will only - serialize this field when a none default value is used to create the cell. - Args: - rnn_cell: the RNN cell for serialize. - - Returns: - A dict which contains the JSON config for enable_caching_device value or - empty dict if the enable_caching_device value is same as the default value. - """ - default_enable_caching_device = tf.compat.v1.executing_eagerly_outside_functions( - ) - if rnn_cell._enable_caching_device != default_enable_caching_device: - return {'enable_caching_device': rnn_cell._enable_caching_device} - return {}
+ """Return the dict config for the RNN cell wrt the enable_caching_device field. + + Since enable_caching_device is an internal implementation detail for speeding up + RNN variable reads when running in a multi remote worker setting, we + don't want this config to be serialized constantly in the JSON. We will only + serialize this field when a non-default value is used to create the cell. + + Args: + rnn_cell: the RNN cell to serialize. + + Returns: + A dict which contains the JSON config for the enable_caching_device value, or + an empty dict if the enable_caching_device value is the same as the default + value. + """ + default_enable_caching_device = ( + tf.compat.v1.executing_eagerly_outside_functions() + ) + if rnn_cell._enable_caching_device != default_enable_caching_device: + return {"enable_caching_device": rnn_cell._enable_caching_device} + return {}
diff --git a/keras/layers/rnn/simple_rnn.py b/keras/layers/rnn/simple_rnn.py index 74c1579422bb..97a2e94d761f 100644 --- a/keras/layers/rnn/simple_rnn.py +++ b/keras/layers/rnn/simple_rnn.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Fully connected RNN layer.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import activations from keras import backend @@ -26,467 +28,483 @@ from keras.layers.rnn.base_rnn import RNN from keras.layers.rnn.dropout_rnn_cell_mixin import DropoutRNNCellMixin from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export
-@keras_export('keras.layers.SimpleRNNCell') +@keras_export("keras.layers.SimpleRNNCell") class SimpleRNNCell(DropoutRNNCellMixin, base_layer.BaseRandomLayer): - """Cell class for SimpleRNN. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - This class processes one step within the whole time sequence input, whereas - `tf.keras.layer.SimpleRNN` processes the whole sequence. - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). - If you pass `None`, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, (default `True`), whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs.
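A brief hypothetical check (not part of the patch) of the serialization rule just defined: under the TF2 eager-outside-functions default the flag defaults to `True`, so only a non-default value shows up in the config:

```python
import keras
from keras.layers.rnn import rnn_utils

default_cell = keras.layers.SimpleRNNCell(4)
custom_cell = keras.layers.SimpleRNNCell(4, enable_caching_device=False)

# Only the cell built with a non-default value serializes the field.
print(rnn_utils.config_for_enable_caching_device(default_cell))  # {}
print(rnn_utils.config_for_enable_caching_device(custom_cell))
# {'enable_caching_device': False}
```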
Default: - `glorot_uniform`. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, used for the linear transformation of the recurrent state. - Default: `orthogonal`. - bias_initializer: Initializer for the bias vector. Default: `zeros`. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. Default: `None`. - bias_regularizer: Regularizer function applied to the bias vector. Default: - `None`. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. Default: `None`. - bias_constraint: Constraint function applied to the bias vector. Default: - `None`. - dropout: Float between 0 and 1. Fraction of the units to drop for the linear - transformation of the inputs. Default: 0. - recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for - the linear transformation of the recurrent state. Default: 0. - - Call arguments: - inputs: A 2D tensor, with shape of `[batch, feature]`. - states: A 2D tensor with shape of `[batch, units]`, which is the state from - the previous time step. For timestep 0, the initial state provided by user - will be feed to cell. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. Only relevant when `dropout` or - `recurrent_dropout` is used. - - Examples: - - ```python - inputs = np.random.random([32, 10, 8]).astype(np.float32) - rnn = tf.keras.layers.RNN(tf.keras.layers.SimpleRNNCell(4)) - - output = rnn(inputs) # The output has shape `[32, 4]`. - - rnn = tf.keras.layers.RNN( - tf.keras.layers.SimpleRNNCell(4), - return_sequences=True, - return_state=True) - - # whole_sequence_output has shape `[32, 10, 4]`. - # final_state has shape `[32, 4]`. - whole_sequence_output, final_state = rnn(inputs) - ``` - """ - - def __init__(self, - units, - activation='tanh', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - **kwargs): - if units < 0: - raise ValueError(f'Received an invalid value for argument `units`, ' - f'expected a positive integer, got {units}.') - # By default use cached variable under v2 mode, see b/143699808. 
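Reading ahead to the cell's `call` method further down, a NumPy-only sketch (hypothetical, dropout omitted) of the one-step recurrence it implements:

```python
import numpy as np

def simple_rnn_step(x, prev_output, kernel, recurrent_kernel, bias):
    # output = tanh(x @ kernel + bias + prev_output @ recurrent_kernel),
    # mirroring SimpleRNNCell.call without the dropout masks.
    h = x @ kernel + bias
    return np.tanh(h + prev_output @ recurrent_kernel)

batch, features, units = 2, 4, 3
x = np.random.rand(batch, features).astype("float32")
prev = np.zeros((batch, units), dtype="float32")
kernel = np.random.rand(features, units).astype("float32")
recurrent_kernel = np.random.rand(units, units).astype("float32")
bias = np.zeros(units, dtype="float32")

print(simple_rnn_step(x, prev, kernel, recurrent_kernel, bias).shape)  # (2, 3)
```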
- if tf.compat.v1.executing_eagerly_outside_functions(): - self._enable_caching_device = kwargs.pop('enable_caching_device', True) - else: - self._enable_caching_device = kwargs.pop('enable_caching_device', False) - super().__init__(**kwargs) - self.units = units - self.activation = activations.get(activation) - self.use_bias = use_bias - - self.kernel_initializer = initializers.get(kernel_initializer) - self.recurrent_initializer = initializers.get(recurrent_initializer) - self.bias_initializer = initializers.get(bias_initializer) - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.recurrent_regularizer = regularizers.get(recurrent_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.recurrent_constraint = constraints.get(recurrent_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - self.dropout = min(1., max(0., dropout)) - self.recurrent_dropout = min(1., max(0., recurrent_dropout)) - self.state_size = self.units - self.output_size = self.units - - @tf_utils.shape_type_conversion - def build(self, input_shape): - default_caching_device = rnn_utils.caching_device(self) - self.kernel = self.add_weight( - shape=(input_shape[-1], self.units), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - caching_device=default_caching_device) - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - name='recurrent_kernel', - initializer=self.recurrent_initializer, - regularizer=self.recurrent_regularizer, - constraint=self.recurrent_constraint, - caching_device=default_caching_device) - if self.use_bias: - self.bias = self.add_weight( - shape=(self.units,), - name='bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint, - caching_device=default_caching_device) - else: - self.bias = None - self.built = True - - def call(self, inputs, states, training=None): - prev_output = states[0] if tf.nest.is_nested(states) else states - dp_mask = self.get_dropout_mask_for_cell(inputs, training) - rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( - prev_output, training) - - if dp_mask is not None: - h = backend.dot(inputs * dp_mask, self.kernel) - else: - h = backend.dot(inputs, self.kernel) - if self.bias is not None: - h = backend.bias_add(h, self.bias) - - if rec_dp_mask is not None: - prev_output = prev_output * rec_dp_mask - output = h + backend.dot(prev_output, self.recurrent_kernel) - if self.activation is not None: - output = self.activation(output) - - new_state = [output] if tf.nest.is_nested(states) else output - return output, new_state - - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - return rnn_utils.generate_zero_filled_state_for_cell( - self, inputs, batch_size, dtype) - - def get_config(self): - config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 
'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout - } - config.update(rnn_utils.config_for_enable_caching_device(self)) - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.layers.SimpleRNN')
+ """Cell class for SimpleRNN. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. + + This class processes one step within the whole time sequence input, whereas + `tf.keras.layer.SimpleRNN` processes the whole sequence. + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, (default `True`), whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. Default: + `glorot_uniform`. + recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, used for the linear transformation of the recurrent + state. Default: `orthogonal`. + bias_initializer: Initializer for the bias vector. Default: `zeros`. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_regularizer: Regularizer function applied to the bias vector. + Default: `None`. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_constraint: Constraint function applied to the bias vector. Default: + `None`. + dropout: Float between 0 and 1. Fraction of the units to drop for the + linear transformation of the inputs. Default: 0. + recurrent_dropout: Float between 0 and 1. Fraction of the units to drop + for the linear transformation of the recurrent state. Default: 0. + + Call arguments: + inputs: A 2D tensor, with shape of `[batch, feature]`. + states: A 2D tensor with shape of `[batch, units]`, which is the state + from the previous time step. For timestep 0, the initial state provided + by the user will be fed to the cell. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. Only relevant when `dropout` or + `recurrent_dropout` is used. + + Examples: + + ```python + inputs = np.random.random([32, 10, 8]).astype(np.float32) + rnn = tf.keras.layers.RNN(tf.keras.layers.SimpleRNNCell(4)) + + output = rnn(inputs) # The output has shape `[32, 4]`. + + rnn = tf.keras.layers.RNN( + tf.keras.layers.SimpleRNNCell(4), + return_sequences=True, + return_state=True) + + # whole_sequence_output has shape `[32, 10, 4]`. + # final_state has shape `[32, 4]`.
+ whole_sequence_output, final_state = rnn(inputs) + ``` + """ + + def __init__( + self, + units, + activation="tanh", + use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + **kwargs, + ): + if units <= 0: + raise ValueError( + "Received an invalid value for argument `units`, " + f"expected a positive integer, got {units}." + ) + # By default use cached variable under v2 mode, see b/143699808. + if tf.compat.v1.executing_eagerly_outside_functions(): + self._enable_caching_device = kwargs.pop( + "enable_caching_device", True + ) + else: + self._enable_caching_device = kwargs.pop( + "enable_caching_device", False + ) + super().__init__(**kwargs) + self.units = units + self.activation = activations.get(activation) + self.use_bias = use_bias + + self.kernel_initializer = initializers.get(kernel_initializer) + self.recurrent_initializer = initializers.get(recurrent_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.recurrent_regularizer = regularizers.get(recurrent_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.recurrent_constraint = constraints.get(recurrent_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.dropout = min(1.0, max(0.0, dropout)) + self.recurrent_dropout = min(1.0, max(0.0, recurrent_dropout)) + self.state_size = self.units + self.output_size = self.units + + @tf_utils.shape_type_conversion + def build(self, input_shape): + super().build(input_shape) + default_caching_device = rnn_utils.caching_device(self) + self.kernel = self.add_weight( + shape=(input_shape[-1], self.units), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + caching_device=default_caching_device, + ) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + name="recurrent_kernel", + initializer=self.recurrent_initializer, + regularizer=self.recurrent_regularizer, + constraint=self.recurrent_constraint, + caching_device=default_caching_device, + ) + if self.use_bias: + self.bias = self.add_weight( + shape=(self.units,), + name="bias", + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + caching_device=default_caching_device, + ) + else: + self.bias = None + self.built = True + + def call(self, inputs, states, training=None): + prev_output = states[0] if tf.nest.is_nested(states) else states + dp_mask = self.get_dropout_mask_for_cell(inputs, training) + rec_dp_mask = self.get_recurrent_dropout_mask_for_cell( + prev_output, training + ) + + if dp_mask is not None: + h = backend.dot(inputs * dp_mask, self.kernel) + else: + h = backend.dot(inputs, self.kernel) + if self.bias is not None: + h = backend.bias_add(h, self.bias) + + if rec_dp_mask is not None: + prev_output = prev_output * rec_dp_mask + output = h + backend.dot(prev_output, self.recurrent_kernel) + if self.activation is not None: + output = self.activation(output) + + new_state = [output] if tf.nest.is_nested(states) else output + return output, new_state + + def get_initial_state(self, inputs=None, 
batch_size=None, dtype=None): + return rnn_utils.generate_zero_filled_state_for_cell( + self, inputs, batch_size, dtype + ) + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + } + config.update(rnn_utils.config_for_enable_caching_device(self)) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.layers.SimpleRNN") class SimpleRNN(RNN): - """Fully-connected RNN where the output is to be fed back to input. - - See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) - for details about the usage of RNN API. - - Args: - units: Positive integer, dimensionality of the output space. - activation: Activation function to use. - Default: hyperbolic tangent (`tanh`). - If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, (default `True`), whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. Default: - `glorot_uniform`. - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, used for the linear transformation of the recurrent state. - Default: `orthogonal`. - bias_initializer: Initializer for the bias vector. Default: `zeros`. - kernel_regularizer: Regularizer function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_regularizer: Regularizer function applied to the - `recurrent_kernel` weights matrix. Default: `None`. - bias_regularizer: Regularizer function applied to the bias vector. Default: - `None`. - activity_regularizer: Regularizer function applied to the output of the - layer (its "activation"). Default: `None`. - kernel_constraint: Constraint function applied to the `kernel` weights - matrix. Default: `None`. - recurrent_constraint: Constraint function applied to the `recurrent_kernel` - weights matrix. Default: `None`. - bias_constraint: Constraint function applied to the bias vector. Default: - `None`. - dropout: Float between 0 and 1. - Fraction of the units to drop for the linear transformation of the inputs. - Default: 0. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for the linear transformation of the - recurrent state. Default: 0. - return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. Default: `False`. - return_state: Boolean. Whether to return the last state - in addition to the output. Default: `False` - go_backwards: Boolean (default False). - If True, process the input sequence backwards and return the - reversed sequence. - stateful: Boolean (default False). 
If True, the last state - for each sample at index i in a batch will be used as initial - state for the sample of index i in the following batch. - unroll: Boolean (default False). - If True, the network will be unrolled, - else a symbolic loop will be used. - Unrolling can speed-up a RNN, - although it tends to be more memory-intensive. - Unrolling is only suitable for short sequences. - - Call arguments: - inputs: A 3D tensor, with shape `[batch, timesteps, feature]`. - mask: Binary tensor of shape `[batch, timesteps]` indicating whether - a given timestep should be masked. An individual `True` entry indicates - that the corresponding timestep should be utilized, while a `False` entry - indicates that the corresponding timestep should be ignored. - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the cell - when calling it. This is only relevant if `dropout` or - `recurrent_dropout` is used. - initial_state: List of initial state tensors to be passed to the first - call of the cell. - - Examples: - - ```python - inputs = np.random.random([32, 10, 8]).astype(np.float32) - simple_rnn = tf.keras.layers.SimpleRNN(4) - - output = simple_rnn(inputs) # The output has shape `[32, 4]`. - - simple_rnn = tf.keras.layers.SimpleRNN( - 4, return_sequences=True, return_state=True) - - # whole_sequence_output has shape `[32, 10, 4]`. - # final_state has shape `[32, 4]`. - whole_sequence_output, final_state = simple_rnn(inputs) - ``` - """ - - def __init__(self, - units, - activation='tanh', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - return_sequences=False, - return_state=False, - go_backwards=False, - stateful=False, - unroll=False, - **kwargs): - if 'implementation' in kwargs: - kwargs.pop('implementation') - logging.warning('The `implementation` argument ' - 'in `SimpleRNN` has been deprecated. ' - 'Please remove it from your layer call.') - if 'enable_caching_device' in kwargs: - cell_kwargs = {'enable_caching_device': - kwargs.pop('enable_caching_device')} - else: - cell_kwargs = {} - cell = SimpleRNNCell( + """Fully-connected RNN where the output is to be fed back to input. + + See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn) + for details about the usage of RNN API. + + Args: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + Default: hyperbolic tangent (`tanh`). + If you pass None, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, (default `True`), whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. Default: + `glorot_uniform`. + recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, used for the linear transformation of the recurrent + state. Default: `orthogonal`. + bias_initializer: Initializer for the bias vector. Default: `zeros`. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_regularizer: Regularizer function applied to the + `recurrent_kernel` weights matrix. Default: `None`. 
+ bias_regularizer: Regularizer function applied to the bias vector. + Default: `None`. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). Default: `None`. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. Default: `None`. + recurrent_constraint: Constraint function applied to the + `recurrent_kernel` weights matrix. Default: `None`. + bias_constraint: Constraint function applied to the bias vector. Default: + `None`. + dropout: Float between 0 and 1. + Fraction of the units to drop for the linear transformation of the + inputs. Default: 0. + recurrent_dropout: Float between 0 and 1. + Fraction of the units to drop for the linear transformation of the + recurrent state. Default: 0. + return_sequences: Boolean. Whether to return the last output + in the output sequence, or the full sequence. Default: `False`. + return_state: Boolean. Whether to return the last state + in addition to the output. Default: `False`. + go_backwards: Boolean (default False). + If True, process the input sequence backwards and return the + reversed sequence. + stateful: Boolean (default False). If True, the last state + for each sample at index i in a batch will be used as initial + state for the sample of index i in the following batch. + unroll: Boolean (default False). + If True, the network will be unrolled, + else a symbolic loop will be used. + Unrolling can speed up an RNN, + although it tends to be more memory-intensive. + Unrolling is only suitable for short sequences. + + Call arguments: + inputs: A 3D tensor, with shape `[batch, timesteps, feature]`. + mask: Binary tensor of shape `[batch, timesteps]` indicating whether + a given timestep should be masked. An individual `True` entry indicates + that the corresponding timestep should be utilized, while a `False` + entry indicates that the corresponding timestep should be ignored. + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the cell + when calling it. This is only relevant if `dropout` or + `recurrent_dropout` is used. + initial_state: List of initial state tensors to be passed to the first + call of the cell. + + Examples: + + ```python + inputs = np.random.random([32, 10, 8]).astype(np.float32) + simple_rnn = tf.keras.layers.SimpleRNN(4) + + output = simple_rnn(inputs) # The output has shape `[32, 4]`. + + simple_rnn = tf.keras.layers.SimpleRNN( + 4, return_sequences=True, return_state=True) + + # whole_sequence_output has shape `[32, 10, 4]`. + # final_state has shape `[32, 4]`.
+ whole_sequence_output, final_state = simple_rnn(inputs) + ``` + """ + + def __init__( + self, units, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout, - dtype=kwargs.get('dtype'), - trainable=kwargs.get('trainable', True), - **cell_kwargs) - super().__init__( - cell, - return_sequences=return_sequences, - return_state=return_state, - go_backwards=go_backwards, - stateful=stateful, - unroll=unroll, - **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - self.input_spec = [InputSpec(ndim=3)] - - def call(self, inputs, mask=None, training=None, initial_state=None): - return super().call( - inputs, mask=mask, training=training, initial_state=initial_state) - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation - - @property - def use_bias(self): - return self.cell.use_bias - - @property - def kernel_initializer(self): - return self.cell.kernel_initializer - - @property - def recurrent_initializer(self): - return self.cell.recurrent_initializer - - @property - def bias_initializer(self): - return self.cell.bias_initializer - - @property - def kernel_regularizer(self): - return self.cell.kernel_regularizer - - @property - def recurrent_regularizer(self): - return self.cell.recurrent_regularizer - - @property - def bias_regularizer(self): - return self.cell.bias_regularizer - - @property - def kernel_constraint(self): - return self.cell.kernel_constraint - - @property - def recurrent_constraint(self): - return self.cell.recurrent_constraint - - @property - def bias_constraint(self): - return self.cell.bias_constraint - - @property - def dropout(self): - return self.cell.dropout - - @property - def recurrent_dropout(self): - return self.cell.recurrent_dropout - - def get_config(self): - config = { - 'units': - self.units, - 'activation': - activations.serialize(self.activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': - initializers.serialize(self.recurrent_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': - regularizers.serialize(self.recurrent_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': - regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': - constraints.serialize(self.recurrent_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint), - 'dropout': - self.dropout, - 'recurrent_dropout': - self.recurrent_dropout - } - base_config = super().get_config() - config.update(rnn_utils.config_for_enable_caching_device(self.cell)) - del base_config['cell'] - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - if 'implementation' in config: - config.pop('implementation') - return cls(**config) + activation="tanh", + 
use_bias=True, + kernel_initializer="glorot_uniform", + recurrent_initializer="orthogonal", + bias_initializer="zeros", + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0.0, + recurrent_dropout=0.0, + return_sequences=False, + return_state=False, + go_backwards=False, + stateful=False, + unroll=False, + **kwargs, + ): + if "implementation" in kwargs: + kwargs.pop("implementation") + logging.warning( + "The `implementation` argument " + "in `SimpleRNN` has been deprecated. " + "Please remove it from your layer call." + ) + if "enable_caching_device" in kwargs: + cell_kwargs = { + "enable_caching_device": kwargs.pop("enable_caching_device") + } + else: + cell_kwargs = {} + cell = SimpleRNNCell( + units, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout, + dtype=kwargs.get("dtype"), + trainable=kwargs.get("trainable", True), + name="simple_rnn_cell", + **cell_kwargs, + ) + super().__init__( + cell, + return_sequences=return_sequences, + return_state=return_state, + go_backwards=go_backwards, + stateful=stateful, + unroll=unroll, + **kwargs, + ) + self.activity_regularizer = regularizers.get(activity_regularizer) + self.input_spec = [InputSpec(ndim=3)] + + def call(self, inputs, mask=None, training=None, initial_state=None): + return super().call( + inputs, mask=mask, training=training, initial_state=initial_state + ) + + @property + def units(self): + return self.cell.units + + @property + def activation(self): + return self.cell.activation + + @property + def use_bias(self): + return self.cell.use_bias + + @property + def kernel_initializer(self): + return self.cell.kernel_initializer + + @property + def recurrent_initializer(self): + return self.cell.recurrent_initializer + + @property + def bias_initializer(self): + return self.cell.bias_initializer + + @property + def kernel_regularizer(self): + return self.cell.kernel_regularizer + + @property + def recurrent_regularizer(self): + return self.cell.recurrent_regularizer + + @property + def bias_regularizer(self): + return self.cell.bias_regularizer + + @property + def kernel_constraint(self): + return self.cell.kernel_constraint + + @property + def recurrent_constraint(self): + return self.cell.recurrent_constraint + + @property + def bias_constraint(self): + return self.cell.bias_constraint + + @property + def dropout(self): + return self.cell.dropout + + @property + def recurrent_dropout(self): + return self.cell.recurrent_dropout + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "recurrent_initializer": initializers.serialize( + self.recurrent_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "recurrent_regularizer": regularizers.serialize( + self.recurrent_regularizer + ), + 
"bias_regularizer": regularizers.serialize(self.bias_regularizer), + "activity_regularizer": regularizers.serialize( + self.activity_regularizer + ), + "kernel_constraint": constraints.serialize(self.kernel_constraint), + "recurrent_constraint": constraints.serialize( + self.recurrent_constraint + ), + "bias_constraint": constraints.serialize(self.bias_constraint), + "dropout": self.dropout, + "recurrent_dropout": self.recurrent_dropout, + } + base_config = super().get_config() + config.update(rnn_utils.config_for_enable_caching_device(self.cell)) + del base_config["cell"] + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + if "implementation" in config: + config.pop("implementation") + return cls(**config) diff --git a/keras/layers/rnn/simple_rnn_test.py b/keras/layers/rnn/simple_rnn_test.py index 8901d363c540..9cd1a27668d7 100644 --- a/keras/layers/rnn/simple_rnn_test.py +++ b/keras/layers/rnn/simple_rnn_test.py @@ -14,12 +14,11 @@ # ============================================================================== """Tests for SimpleRNN layer.""" -import tensorflow.compat.v2 as tf - import copy -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras from keras.testing_infra import test_combinations @@ -28,206 +27,228 @@ @test_combinations.generate(test_combinations.keras_mode_combinations()) class SimpleRNNLayerTest(tf.test.TestCase, parameterized.TestCase): - - def test_return_sequences_SimpleRNN(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.SimpleRNN, - kwargs={'units': units, - 'return_sequences': True}, - input_shape=(num_samples, timesteps, embedding_dim)) - - @test_utils.run_v2_only - def test_float64_SimpleRNN(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.SimpleRNN, - kwargs={'units': units, - 'return_sequences': True, - 'dtype': 'float64'}, - input_shape=(num_samples, timesteps, embedding_dim), - input_dtype='float64') - - def test_dynamic_behavior_SimpleRNN(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer = keras.layers.SimpleRNN(units, input_shape=(None, embedding_dim)) - model = keras.models.Sequential() - model.add(layer) - model.compile('rmsprop', 'mse') - x = np.random.random((num_samples, timesteps, embedding_dim)) - y = np.random.random((num_samples, units)) - model.train_on_batch(x, y) - - def test_dropout_SimpleRNN(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - test_utils.layer_test( - keras.layers.SimpleRNN, - kwargs={'units': units, - 'dropout': 0.1, - 'recurrent_dropout': 0.1}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_implementation_mode_SimpleRNN(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - for mode in [0, 1, 2]: - test_utils.layer_test( - keras.layers.SimpleRNN, - kwargs={'units': units, - 'implementation': mode}, - input_shape=(num_samples, timesteps, embedding_dim)) - - def test_constraints_SimpleRNN(self): - embedding_dim = 4 - layer_class = keras.layers.SimpleRNN - k_constraint = keras.constraints.max_norm(0.01) - r_constraint = keras.constraints.max_norm(0.01) - b_constraint = keras.constraints.max_norm(0.01) - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_constraint=k_constraint, - 
recurrent_constraint=r_constraint, - bias_constraint=b_constraint) - layer.build((None, None, embedding_dim)) - self.assertEqual(layer.cell.kernel.constraint, k_constraint) - self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) - self.assertEqual(layer.cell.bias.constraint, b_constraint) - - def test_with_masking_layer_SimpleRNN(self): - layer_class = keras.layers.SimpleRNN - inputs = np.random.random((2, 3, 4)) - targets = np.abs(np.random.random((2, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - model = keras.models.Sequential() - model.add(keras.layers.Masking(input_shape=(3, 4))) - model.add(layer_class(units=5, return_sequences=True, unroll=False)) - model.compile(loss='categorical_crossentropy', optimizer='rmsprop') - model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) - - def test_from_config_SimpleRNN(self): - layer_class = keras.layers.SimpleRNN - for stateful in (False, True): - l1 = layer_class(units=1, stateful=stateful) - l2 = layer_class.from_config(l1.get_config()) - assert l1.get_config() == l2.get_config() - - def test_deep_copy_SimpleRNN(self): - cell = keras.layers.SimpleRNNCell(5) - copied_cell = copy.deepcopy(cell) - self.assertEqual(copied_cell.units, 5) - self.assertEqual(cell.get_config(), copied_cell.get_config()) - - def test_regularizers_SimpleRNN(self): - embedding_dim = 4 - layer_class = keras.layers.SimpleRNN - layer = layer_class( - 5, - return_sequences=False, - weights=None, - input_shape=(None, embedding_dim), - kernel_regularizer=keras.regularizers.l1(0.01), - recurrent_regularizer=keras.regularizers.l1(0.01), - bias_regularizer='l2', - activity_regularizer='l1') - layer.build((None, None, 2)) - self.assertLen(layer.losses, 3) - - x = keras.backend.variable(np.ones((2, 3, 2))) - layer(x) - if tf.executing_eagerly(): - self.assertLen(layer.losses, 4) - else: - self.assertLen(layer.get_losses_for(x), 1) - - def test_statefulness_SimpleRNN(self): - num_samples = 2 - timesteps = 3 - embedding_dim = 4 - units = 2 - layer_class = keras.layers.SimpleRNN - model = keras.models.Sequential() - model.add( - keras.layers.Embedding( - 4, - embedding_dim, - mask_zero=True, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = layer_class( - units, return_sequences=False, stateful=True, weights=None) - model.add(layer) - model.compile( - optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - out1 = model.predict(np.ones((num_samples, timesteps))) - self.assertEqual(out1.shape, (num_samples, units)) - - # train once so that the states change - model.train_on_batch( - np.ones((num_samples, timesteps)), np.ones((num_samples, units))) - out2 = model.predict(np.ones((num_samples, timesteps))) - - # if the state is not reset, output should be different - self.assertNotEqual(out1.max(), out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out2.max(), out3.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones((num_samples, timesteps))) - np.testing.assert_allclose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones((num_samples, timesteps))) - self.assertNotEqual(out4.max(), out5.max()) - - # Check masking - layer.reset_states() - - left_padded_input = 
np.ones((num_samples, timesteps)) - left_padded_input[0, :1] = 0 - left_padded_input[1, :2] = 0 - out6 = model.predict(left_padded_input) - - layer.reset_states() - - right_padded_input = np.ones((num_samples, timesteps)) - right_padded_input[0, -1:] = 0 - right_padded_input[1, -2:] = 0 - out7 = model.predict(right_padded_input) - - np.testing.assert_allclose(out7, out6, atol=1e-5) - - def test_get_initial_states(self): - batch_size = 4 - cell = keras.layers.SimpleRNNCell(20) - initial_state = cell.get_initial_state( - batch_size=batch_size, dtype=tf.float32) - _, state = cell(np.ones((batch_size, 20), dtype=np.float32), initial_state) - self.assertEqual(state.shape, initial_state.shape) - - -if __name__ == '__main__': - tf.test.main() + def test_return_sequences_SimpleRNN(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.SimpleRNN, + kwargs={"units": units, "return_sequences": True}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + @test_utils.run_v2_only + def test_float64_SimpleRNN(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.SimpleRNN, + kwargs={ + "units": units, + "return_sequences": True, + "dtype": "float64", + }, + input_shape=(num_samples, timesteps, embedding_dim), + input_dtype="float64", + ) + + def test_dynamic_behavior_SimpleRNN(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer = keras.layers.SimpleRNN(units, input_shape=(None, embedding_dim)) + model = keras.models.Sequential() + model.add(layer) + model.compile("rmsprop", "mse") + x = np.random.random((num_samples, timesteps, embedding_dim)) + y = np.random.random((num_samples, units)) + model.train_on_batch(x, y) + + def test_dropout_SimpleRNN(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + test_utils.layer_test( + keras.layers.SimpleRNN, + kwargs={"units": units, "dropout": 0.1, "recurrent_dropout": 0.1}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_implementation_mode_SimpleRNN(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + for mode in [0, 1, 2]: + test_utils.layer_test( + keras.layers.SimpleRNN, + kwargs={"units": units, "implementation": mode}, + input_shape=(num_samples, timesteps, embedding_dim), + ) + + def test_constraints_SimpleRNN(self): + embedding_dim = 4 + layer_class = keras.layers.SimpleRNN + k_constraint = keras.constraints.max_norm(0.01) + r_constraint = keras.constraints.max_norm(0.01) + b_constraint = keras.constraints.max_norm(0.01) + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_constraint=k_constraint, + recurrent_constraint=r_constraint, + bias_constraint=b_constraint, + ) + layer.build((None, None, embedding_dim)) + self.assertEqual(layer.cell.kernel.constraint, k_constraint) + self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint) + self.assertEqual(layer.cell.bias.constraint, b_constraint) + + def test_with_masking_layer_SimpleRNN(self): + layer_class = keras.layers.SimpleRNN + inputs = np.random.random((2, 3, 4)) + targets = np.abs(np.random.random((2, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + model = keras.models.Sequential() + model.add(keras.layers.Masking(input_shape=(3, 4))) + model.add(layer_class(units=5, return_sequences=True, unroll=False)) + model.compile(loss="categorical_crossentropy", optimizer="rmsprop") + model.fit(inputs, 
targets, epochs=1, batch_size=2, verbose=1) + + def test_from_config_SimpleRNN(self): + layer_class = keras.layers.SimpleRNN + for stateful in (False, True): + l1 = layer_class(units=1, stateful=stateful) + l2 = layer_class.from_config(l1.get_config()) + assert l1.get_config() == l2.get_config() + + def test_deep_copy_SimpleRNN(self): + cell = keras.layers.SimpleRNNCell(5) + copied_cell = copy.deepcopy(cell) + self.assertEqual(copied_cell.units, 5) + self.assertEqual(cell.get_config(), copied_cell.get_config()) + + def test_regularizers_SimpleRNN(self): + embedding_dim = 4 + layer_class = keras.layers.SimpleRNN + layer = layer_class( + 5, + return_sequences=False, + weights=None, + input_shape=(None, embedding_dim), + kernel_regularizer=keras.regularizers.l1(0.01), + recurrent_regularizer=keras.regularizers.l1(0.01), + bias_regularizer="l2", + activity_regularizer="l1", + ) + layer.build((None, None, 2)) + self.assertLen(layer.losses, 3) + + x = keras.backend.variable(np.ones((2, 3, 2))) + layer(x) + if tf.executing_eagerly(): + self.assertLen(layer.losses, 4) + else: + self.assertLen(layer.get_losses_for(x), 1) + + def test_statefulness_SimpleRNN(self): + num_samples = 2 + timesteps = 3 + embedding_dim = 4 + units = 2 + layer_class = keras.layers.SimpleRNN + model = keras.models.Sequential() + model.add( + keras.layers.Embedding( + 4, + embedding_dim, + mask_zero=True, + input_length=timesteps, + batch_input_shape=(num_samples, timesteps), + ) + ) + layer = layer_class( + units, return_sequences=False, stateful=True, weights=None + ) + model.add(layer) + model.compile( + optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01), + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + out1 = model.predict(np.ones((num_samples, timesteps))) + self.assertEqual(out1.shape, (num_samples, units)) + + # train once so that the states change + model.train_on_batch( + np.ones((num_samples, timesteps)), np.ones((num_samples, units)) + ) + out2 = model.predict(np.ones((num_samples, timesteps))) + + # if the state is not reset, output should be different + self.assertNotEqual(out1.max(), out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out2.max(), out3.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones((num_samples, timesteps))) + np.testing.assert_allclose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones((num_samples, timesteps))) + self.assertNotEqual(out4.max(), out5.max()) + + # Check masking + layer.reset_states() + + left_padded_input = np.ones((num_samples, timesteps)) + left_padded_input[0, :1] = 0 + left_padded_input[1, :2] = 0 + out6 = model.predict(left_padded_input) + + layer.reset_states() + + right_padded_input = np.ones((num_samples, timesteps)) + right_padded_input[0, -1:] = 0 + right_padded_input[1, -2:] = 0 + out7 = model.predict(right_padded_input) + + np.testing.assert_allclose(out7, out6, atol=1e-5) + + def test_get_initial_states(self): + batch_size = 4 + cell = keras.layers.SimpleRNNCell(20) + initial_state = cell.get_initial_state( + batch_size=batch_size, dtype=tf.float32 + ) + _, state = cell( + np.ones((batch_size, 20), dtype=np.float32), initial_state + ) + self.assertEqual(state.shape, initial_state.shape) + + @test_utils.run_v2_only + def 
test_cloned_weight_names(self): + inp = keras.Input([None, 3]) + rnn = keras.layers.SimpleRNN(units=3) + model = keras.Model(inp, rnn(inp)) + clone = keras.models.clone_model(model) + + model_names = [x.name for x in model.weights] + clone_names = [x.name for x in clone.weights] + self.assertEqual(model_names, clone_names) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/rnn/stacked_rnn_cells.py b/keras/layers/rnn/stacked_rnn_cells.py index 2a5ab8cdab05..46bb3091f3fb 100644 --- a/keras/layers/rnn/stacked_rnn_cells.py +++ b/keras/layers/rnn/stacked_rnn_cells.py @@ -13,168 +13,205 @@ # limitations under the License. # ============================================================================== """Wrapper allowing a stack of RNN cells to behave as a single cell.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + import functools +import tensorflow.compat.v2 as tf + from keras import backend from keras.engine import base_layer from keras.layers.rnn import rnn_utils +from keras.saving import serialization_lib from keras.utils import generic_utils from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.StackedRNNCells') +@keras_export("keras.layers.StackedRNNCells") class StackedRNNCells(base_layer.Layer): - """Wrapper allowing a stack of RNN cells to behave as a single cell. - - Used to implement efficient stacked RNNs. - - Args: - cells: List of RNN cell instances. - - Examples: - - ```python - batch_size = 3 - sentence_max_length = 5 - n_features = 2 - new_shape = (batch_size, sentence_max_length, n_features) - x = tf.constant(np.reshape(np.arange(30), new_shape), dtype = tf.float32) - - rnn_cells = [tf.keras.layers.LSTMCell(128) for _ in range(2)] - stacked_lstm = tf.keras.layers.StackedRNNCells(rnn_cells) - lstm_layer = tf.keras.layers.RNN(stacked_lstm) - - result = lstm_layer(x) - ``` - """ - - def __init__(self, cells, **kwargs): - for cell in cells: - if 'call' not in dir(cell): - raise ValueError('All cells must have a `call` method. ' - f'Received cell without a `call` method: {cell}') - if 'state_size' not in dir(cell): - raise ValueError('All cells must have a `state_size` attribute. ' - f'Received cell without a `state_size`: {cell}') - self.cells = cells - # reverse_state_order determines whether the state size will be in a reverse - # order of the cells' state. User might want to set this to True to keep the - # existing behavior. This is only useful when use RNN(return_state=True) - # since the state will be returned as the same order of state_size. - self.reverse_state_order = kwargs.pop('reverse_state_order', False) - if self.reverse_state_order: - logging.warning('reverse_state_order=True in StackedRNNCells will soon ' - 'be deprecated. 
Please update the code to work with the ' - 'natural order of states if you rely on the RNN states, ' - 'eg RNN(return_state=True).') - super().__init__(**kwargs) - - @property - def state_size(self): - return tuple(c.state_size for c in - (self.cells[::-1] if self.reverse_state_order else self.cells)) - - @property - def output_size(self): - if getattr(self.cells[-1], 'output_size', None) is not None: - return self.cells[-1].output_size - elif rnn_utils.is_multiple_state(self.cells[-1].state_size): - return self.cells[-1].state_size[0] - else: - return self.cells[-1].state_size - - def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - initial_states = [] - for cell in self.cells[::-1] if self.reverse_state_order else self.cells: - get_initial_state_fn = getattr(cell, 'get_initial_state', None) - if get_initial_state_fn: - initial_states.append(get_initial_state_fn( - inputs=inputs, batch_size=batch_size, dtype=dtype)) - else: - initial_states.append(rnn_utils.generate_zero_filled_state_for_cell( - cell, inputs, batch_size, dtype)) - - return tuple(initial_states) - - def call(self, inputs, states, constants=None, training=None, **kwargs): - # Recover per-cell states. - state_size = (self.state_size[::-1] - if self.reverse_state_order else self.state_size) - nested_states = tf.nest.pack_sequence_as(state_size, - tf.nest.flatten(states)) - - # Call the cells in order and store the returned states. - new_nested_states = [] - for cell, states in zip(self.cells, nested_states): - states = states if tf.nest.is_nested(states) else [states] - # TF cell does not wrap the state into list when there is only one state. - is_tf_rnn_cell = getattr(cell, '_is_tf_rnn_cell', None) is not None - states = states[0] if len(states) == 1 and is_tf_rnn_cell else states - if generic_utils.has_arg(cell.call, 'training'): - kwargs['training'] = training - else: - kwargs.pop('training', None) - # Use the __call__ function for callable objects, eg layers, so that it - # will have the proper name scopes for the ops, etc. 
-      cell_call_fn = cell.__call__ if callable(cell) else cell.call
-      if generic_utils.has_arg(cell.call, 'constants'):
-        inputs, states = cell_call_fn(inputs, states,
-                                      constants=constants, **kwargs)
-      else:
-        inputs, states = cell_call_fn(inputs, states, **kwargs)
-      new_nested_states.append(states)
-
-    return inputs, tf.nest.pack_sequence_as(state_size,
-                                            tf.nest.flatten(new_nested_states))
-
-  @tf_utils.shape_type_conversion
-  def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-
-    def get_batch_input_shape(batch_size, dim):
-      shape = tf.TensorShape(dim).as_list()
-      return tuple([batch_size] + shape)
-
-    for cell in self.cells:
-      if isinstance(cell, base_layer.Layer) and not cell.built:
-        with backend.name_scope(cell.name):
-          cell.build(input_shape)
-          cell.built = True
-      if getattr(cell, 'output_size', None) is not None:
-        output_dim = cell.output_size
-      elif rnn_utils.is_multiple_state(cell.state_size):
-        output_dim = cell.state_size[0]
-      else:
-        output_dim = cell.state_size
-      batch_size = tf.nest.flatten(input_shape)[0]
-      if tf.nest.is_nested(output_dim):
-        input_shape = tf.nest.map_structure(
-            functools.partial(get_batch_input_shape, batch_size), output_dim)
-        input_shape = tuple(input_shape)
-      else:
-        input_shape = tuple([batch_size] + tf.TensorShape(output_dim).as_list())
-    self.built = True
-
-  def get_config(self):
-    cells = []
-    for cell in self.cells:
-      cells.append(generic_utils.serialize_keras_object(cell))
-    config = {'cells': cells}
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  @classmethod
-  def from_config(cls, config, custom_objects=None):
-    from keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
-    cells = []
-    for cell_config in config.pop('cells'):
-      cells.append(
-          deserialize_layer(cell_config, custom_objects=custom_objects))
-    return cls(cells, **config)
+    """Wrapper allowing a stack of RNN cells to behave as a single cell.
+
+    Used to implement efficient stacked RNNs.
+
+    Args:
+        cells: List of RNN cell instances.
+
+    Examples:
+
+    ```python
+    batch_size = 3
+    sentence_max_length = 5
+    n_features = 2
+    new_shape = (batch_size, sentence_max_length, n_features)
+    x = tf.constant(np.reshape(np.arange(30), new_shape), dtype=tf.float32)
+
+    rnn_cells = [tf.keras.layers.LSTMCell(128) for _ in range(2)]
+    stacked_lstm = tf.keras.layers.StackedRNNCells(rnn_cells)
+    lstm_layer = tf.keras.layers.RNN(stacked_lstm)
+
+    result = lstm_layer(x)
+    ```
+    """
+
+    def __init__(self, cells, **kwargs):
+        for cell in cells:
+            if "call" not in dir(cell):
+                raise ValueError(
+                    "All cells must have a `call` method. "
+                    f"Received cell without a `call` method: {cell}"
+                )
+            if "state_size" not in dir(cell):
+                raise ValueError(
+                    "All cells must have a `state_size` attribute. "
+                    f"Received cell without a `state_size`: {cell}"
+                )
+        self.cells = cells
+        # reverse_state_order determines whether the state sizes are listed
+        # in the reverse order of the cells' states. Users may want to set
+        # this to True to keep the existing behavior. This is only useful
+        # when using RNN(return_state=True), since the states are returned
+        # in the same order as state_size.
+        self.reverse_state_order = kwargs.pop("reverse_state_order", False)
+        if self.reverse_state_order:
+            logging.warning(
+                "reverse_state_order=True in StackedRNNCells will soon "
+                "be deprecated. 
Please update the code to work with the " + "natural order of states if you rely on the RNN states, " + "eg RNN(return_state=True)." + ) + super().__init__(**kwargs) + + @property + def state_size(self): + return tuple( + c.state_size + for c in ( + self.cells[::-1] if self.reverse_state_order else self.cells + ) + ) + + @property + def output_size(self): + if getattr(self.cells[-1], "output_size", None) is not None: + return self.cells[-1].output_size + elif rnn_utils.is_multiple_state(self.cells[-1].state_size): + return self.cells[-1].state_size[0] + else: + return self.cells[-1].state_size + + def get_initial_state(self, inputs=None, batch_size=None, dtype=None): + initial_states = [] + for cell in ( + self.cells[::-1] if self.reverse_state_order else self.cells + ): + get_initial_state_fn = getattr(cell, "get_initial_state", None) + if get_initial_state_fn: + initial_states.append( + get_initial_state_fn( + inputs=inputs, batch_size=batch_size, dtype=dtype + ) + ) + else: + initial_states.append( + rnn_utils.generate_zero_filled_state_for_cell( + cell, inputs, batch_size, dtype + ) + ) + + return tuple(initial_states) + + def call(self, inputs, states, constants=None, training=None, **kwargs): + # Recover per-cell states. + state_size = ( + self.state_size[::-1] + if self.reverse_state_order + else self.state_size + ) + nested_states = tf.nest.pack_sequence_as( + state_size, tf.nest.flatten(states) + ) + + # Call the cells in order and store the returned states. + new_nested_states = [] + for cell, states in zip(self.cells, nested_states): + states = states if tf.nest.is_nested(states) else [states] + # TF cell does not wrap the state into list when there is only one + # state. + is_tf_rnn_cell = getattr(cell, "_is_tf_rnn_cell", None) is not None + states = ( + states[0] if len(states) == 1 and is_tf_rnn_cell else states + ) + if generic_utils.has_arg(cell.call, "training"): + kwargs["training"] = training + else: + kwargs.pop("training", None) + # Use the __call__ function for callable objects, eg layers, so that + # it will have the proper name scopes for the ops, etc. 
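+            # Fall back to `cell.call` when the cell object itself is not
+            # callable; `constants` are forwarded below only if the cell's
+            # `call` signature accepts a `constants` argument.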
+ cell_call_fn = cell.__call__ if callable(cell) else cell.call + if generic_utils.has_arg(cell.call, "constants"): + inputs, states = cell_call_fn( + inputs, states, constants=constants, **kwargs + ) + else: + inputs, states = cell_call_fn(inputs, states, **kwargs) + new_nested_states.append(states) + + return inputs, tf.nest.pack_sequence_as( + state_size, tf.nest.flatten(new_nested_states) + ) + + @tf_utils.shape_type_conversion + def build(self, input_shape): + if isinstance(input_shape, list): + input_shape = input_shape[0] + + def get_batch_input_shape(batch_size, dim): + shape = tf.TensorShape(dim).as_list() + return tuple([batch_size] + shape) + + for cell in self.cells: + if isinstance(cell, base_layer.Layer) and not cell.built: + with backend.name_scope(cell.name): + cell.build(input_shape) + cell.built = True + if getattr(cell, "output_size", None) is not None: + output_dim = cell.output_size + elif rnn_utils.is_multiple_state(cell.state_size): + output_dim = cell.state_size[0] + else: + output_dim = cell.state_size + batch_size = tf.nest.flatten(input_shape)[0] + if tf.nest.is_nested(output_dim): + input_shape = tf.nest.map_structure( + functools.partial(get_batch_input_shape, batch_size), + output_dim, + ) + input_shape = tuple(input_shape) + else: + input_shape = tuple( + [batch_size] + tf.TensorShape(output_dim).as_list() + ) + self.built = True + + def get_config(self): + cells = [] + for cell in self.cells: + cells.append(serialization_lib.serialize_keras_object(cell)) + config = {"cells": cells} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + from keras.layers import deserialize as deserialize_layer + + cells = [] + for cell_config in config.pop("cells"): + cells.append( + deserialize_layer(cell_config, custom_objects=custom_objects) + ) + return cls(cells, **config) diff --git a/keras/layers/rnn/time_distributed.py b/keras/layers/rnn/time_distributed.py index f0a995afd8e0..27f28236394e 100644 --- a/keras/layers/rnn/time_distributed.py +++ b/keras/layers/rnn/time_distributed.py @@ -13,7 +13,9 @@ # limitations under the License. # ============================================================================== """Wrapper layer to apply every temporal slice of an input.""" -# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + + +import tensorflow.compat.v2 as tf from keras import backend from keras.engine.base_layer import Layer @@ -22,306 +24,329 @@ from keras.utils import generic_utils from keras.utils import layer_utils from keras.utils import tf_utils -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.layers.TimeDistributed') +@keras_export("keras.layers.TimeDistributed") class TimeDistributed(Wrapper): - """This wrapper allows to apply a layer to every temporal slice of an input. - - Every input should be at least 3D, and the dimension of index one of the - first input will be considered to be the temporal dimension. - - Consider a batch of 32 video samples, where each sample is a 128x128 RGB image - with `channels_last` data format, across 10 timesteps. - The batch input shape is `(32, 10, 128, 128, 3)`. 
- - You can then use `TimeDistributed` to apply the same `Conv2D` layer to each - of the 10 timesteps, independently: - - >>> inputs = tf.keras.Input(shape=(10, 128, 128, 3)) - >>> conv_2d_layer = tf.keras.layers.Conv2D(64, (3, 3)) - >>> outputs = tf.keras.layers.TimeDistributed(conv_2d_layer)(inputs) - >>> outputs.shape - TensorShape([None, 10, 126, 126, 64]) - - Because `TimeDistributed` applies the same instance of `Conv2D` to each of the - timestamps, the same set of weights are used at each timestamp. - - Args: - layer: a `tf.keras.layers.Layer` instance. - - Call arguments: - inputs: Input tensor of shape (batch, time, ...) or nested tensors, - and each of which has shape (batch, time, ...). - training: Python boolean indicating whether the layer should behave in - training mode or in inference mode. This argument is passed to the - wrapped layer (only if the layer supports this argument). - mask: Binary tensor of shape `(samples, timesteps)` indicating whether - a given timestep should be masked. This argument is passed to the - wrapped layer (only if the layer supports this argument). - - Raises: - ValueError: If not initialized with a `tf.keras.layers.Layer` instance. - """ - - def __init__(self, layer, **kwargs): - if not isinstance(layer, Layer): - raise ValueError( - 'Please initialize `TimeDistributed` layer with a ' - f'`tf.keras.layers.Layer` instance. Received: {layer}') - super().__init__(layer, **kwargs) - self.supports_masking = True - - # It is safe to use the fast, reshape-based approach with all of our - # built-in Layers. - self._always_use_reshape = ( - layer_utils.is_builtin_layer(layer) and - not getattr(layer, 'stateful', False)) - - def _get_shape_tuple(self, init_tuple, tensor, start_idx, int_shape=None): - """Finds non-specific dimensions in the static shapes. - - The static shapes are replaced with the corresponding dynamic shapes of the - tensor. - Args: - init_tuple: a tuple, the first part of the output shape - tensor: the tensor from which to get the (static and dynamic) shapes - as the last part of the output shape - start_idx: int, which indicate the first dimension to take from - the static shape of the tensor - int_shape: an alternative static shape to take as the last part - of the output shape - Returns: - The new int_shape with the first part from init_tuple - and the last part from either `int_shape` (if provided) - or `tensor.shape`, where every `None` is replaced by - the corresponding dimension from `tf.shape(tensor)`. - """ - # replace all None in int_shape by backend.shape - if int_shape is None: - int_shape = backend.int_shape(tensor)[start_idx:] - if isinstance(int_shape, tf.TensorShape): - int_shape = int_shape.as_list() - if not any(not s for s in int_shape): - return init_tuple + tuple(int_shape) - shape = backend.shape(tensor) - int_shape = list(int_shape) - for i, s in enumerate(int_shape): - if not s: - int_shape[i] = shape[start_idx + i] - return init_tuple + tuple(int_shape) - - def _remove_timesteps(self, dims): - dims = dims.as_list() - return tf.TensorShape([dims[0]] + dims[2:]) - - def build(self, input_shape): - input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) - input_dims = tf.nest.flatten( - tf.nest.map_structure(lambda x: x.ndims, input_shape)) - if any(dim < 3 for dim in input_dims): - raise ValueError( - '`TimeDistributed` Layer should be passed an `input_shape ` ' - f'with at least 3 dimensions, received: {input_shape}') - # Don't enforce the batch or time dimension. 
- self.input_spec = tf.nest.map_structure( - lambda x: InputSpec(shape=[None, None] + x.as_list()[2:]), input_shape) - child_input_shape = tf.nest.map_structure(self._remove_timesteps, - input_shape) - child_input_shape = tf_utils.convert_shapes(child_input_shape) - super().build(tuple(child_input_shape)) - self.built = True - - def compute_output_shape(self, input_shape): - input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) - - child_input_shape = tf.nest.map_structure(self._remove_timesteps, - input_shape) - child_output_shape = self.layer.compute_output_shape(child_input_shape) - child_output_shape = tf_utils.convert_shapes( - child_output_shape, to_tuples=False) - timesteps = tf_utils.convert_shapes(input_shape) - timesteps = tf.nest.flatten(timesteps)[1] - - def insert_timesteps(dims): - dims = dims.as_list() - return tf.TensorShape([dims[0], timesteps] + dims[1:]) - - return tf.nest.map_structure(insert_timesteps, child_output_shape) - - def call(self, inputs, training=None, mask=None): - kwargs = {} - if generic_utils.has_arg(self.layer.call, 'training'): - kwargs['training'] = training - - input_shape = tf.nest.map_structure( - lambda x: tf.TensorShape(backend.int_shape(x)), inputs) - batch_size = tf_utils.convert_shapes(input_shape) - batch_size = tf.nest.flatten(batch_size)[0] - if batch_size and not self._always_use_reshape: - inputs, row_lengths = backend.convert_inputs_if_ragged(inputs) - is_ragged_input = row_lengths is not None - input_length = tf_utils.convert_shapes(input_shape) - input_length = tf.nest.flatten(input_length)[1] - - # batch size matters, use rnn-based implementation - def step(x, _): - output = self.layer(x, **kwargs) - return output, [] - - _, outputs, _ = backend.rnn( - step, - inputs, - initial_states=[], - input_length=row_lengths[0] if is_ragged_input else input_length, - mask=mask, - unroll=False) - # pylint: disable=g-long-lambda - y = tf.nest.map_structure( - lambda output: backend.maybe_convert_to_ragged( - is_ragged_input, output, row_lengths), outputs) - else: - # No batch size specified, therefore the layer will be able - # to process batches of any size. - # We can go with reshape-based implementation for performance. - is_ragged_input = tf.nest.map_structure( - lambda x: isinstance(x, tf.RaggedTensor), inputs) - is_ragged_input = tf.nest.flatten(is_ragged_input) - if all(is_ragged_input): - input_values = tf.nest.map_structure(lambda x: x.values, inputs) - input_row_lenghts = tf.nest.map_structure( - lambda x: x.nested_row_lengths()[0], inputs) - y = self.layer(input_values, **kwargs) - y = tf.nest.map_structure(tf.RaggedTensor.from_row_lengths, y, - input_row_lenghts) - elif any(is_ragged_input): - raise ValueError('All inputs has to be either ragged or not, ' - f'but not mixed. Received: {inputs}') - else: - input_length = tf_utils.convert_shapes(input_shape) - input_length = tf.nest.flatten(input_length)[1] - if not input_length: - input_length = tf.nest.map_structure(lambda x: tf.shape(x)[1], inputs) - input_length = generic_utils.to_list(tf.nest.flatten(input_length))[0] + """This wrapper allows to apply a layer to every temporal slice of an input. - inner_input_shape = tf.nest.map_structure( - lambda x: self._get_shape_tuple((-1,), x, 2), inputs) - # Shape: (num_samples * timesteps, ...). And track the - # transformation in self._input_map. - inputs = tf.__internal__.nest.map_structure_up_to( - inputs, tf.reshape, inputs, inner_input_shape) - # (num_samples * timesteps, ...) 
- if generic_utils.has_arg(self.layer.call, 'mask') and mask is not None: - inner_mask_shape = self._get_shape_tuple((-1,), mask, 2) - kwargs['mask'] = backend.reshape(mask, inner_mask_shape) - - y = self.layer(inputs, **kwargs) - - # Shape: (num_samples, timesteps, ...) - output_shape = self.compute_output_shape(input_shape) - # pylint: disable=g-long-lambda - output_shape = tf.nest.map_structure( - lambda tensor, int_shape: self._get_shape_tuple( - (-1, input_length), tensor, 1, int_shape[2:]), y, output_shape) - y = tf.__internal__.nest.map_structure_up_to(y, tf.reshape, y, - output_shape) - if not tf.executing_eagerly(): - # Set the static shape for the result since it might be lost during - # array_ops reshape, eg, some `None` dim in the result could be - # inferred. - tf.__internal__.nest.map_structure_up_to( - y, lambda tensor, shape: tensor.set_shape(shape), y, - self.compute_output_shape(input_shape)) - - return y - - def compute_mask(self, inputs, mask=None): - """Computes an output mask tensor for Embedding layer. - - This is based on the inputs, mask, and the inner layer. - If batch size is specified: - Simply return the input `mask`. (An rnn-based implementation with - more than one rnn inputs is required but not supported in tf.keras yet.) - Otherwise we call `compute_mask` of the inner layer at each time step. - If the output mask at each time step is not `None`: - (E.g., inner layer is Masking or RNN) - Concatenate all of them and return the concatenation. - If the output mask at each time step is `None` and the input mask is not - `None`:(E.g., inner layer is Dense) - Reduce the input_mask to 2 dimensions and return it. - Otherwise (both the output mask and the input mask are `None`): - (E.g., `mask` is not used at all) - Return `None`. + Every input should be at least 3D, and the dimension of index one of the + first input will be considered to be the temporal dimension. - Args: - inputs: Tensor with shape [batch size, timesteps, ...] indicating the - input to TimeDistributed. If static shape information is available for - "batch size", `mask` is returned unmodified. - mask: Either None (indicating no masking) or a Tensor indicating the - input mask for TimeDistributed. The shape can be static or dynamic. - - Returns: - Either None (no masking), or a [batch size, timesteps, ...] Tensor with - an output mask for the TimeDistributed layer with the shape beyond the - second dimension being the value of the input mask shape(if the computed - output mask is none), an output mask with the shape beyond the first - dimension being the value of the mask shape(if mask is not None) or - output mask with the shape beyond the first dimension being the - value of the computed output shape. + Consider a batch of 32 video samples, where each sample is a 128x128 RGB + image with `channels_last` data format, across 10 timesteps. + The batch input shape is `(32, 10, 128, 128, 3)`. + You can then use `TimeDistributed` to apply the same `Conv2D` layer to each + of the 10 timesteps, independently: + + >>> inputs = tf.keras.Input(shape=(10, 128, 128, 3)) + >>> conv_2d_layer = tf.keras.layers.Conv2D(64, (3, 3)) + >>> outputs = tf.keras.layers.TimeDistributed(conv_2d_layer)(inputs) + >>> outputs.shape + TensorShape([None, 10, 126, 126, 64]) + + Because `TimeDistributed` applies the same instance of `Conv2D` to each of + the timestamps, the same set of weights are used at each timestamp. + + Args: + layer: a `tf.keras.layers.Layer` instance. 
+ + Call arguments: + inputs: Input tensor of shape (batch, time, ...) or nested tensors, + and each of which has shape (batch, time, ...). + training: Python boolean indicating whether the layer should behave in + training mode or in inference mode. This argument is passed to the + wrapped layer (only if the layer supports this argument). + mask: Binary tensor of shape `(samples, timesteps)` indicating whether + a given timestep should be masked. This argument is passed to the + wrapped layer (only if the layer supports this argument). + + Raises: + ValueError: If not initialized with a `tf.keras.layers.Layer` instance. """ - # cases need to call the layer.compute_mask when input_mask is None: - # Masking layer and Embedding layer with mask_zero - input_shape = tf.nest.map_structure( - lambda x: tf.TensorShape(backend.int_shape(x)), inputs) - input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) - batch_size = tf_utils.convert_shapes(input_shape) - batch_size = tf.nest.flatten(batch_size)[0] - is_ragged_input = tf.nest.map_structure( - lambda x: isinstance(x, tf.RaggedTensor), inputs) - is_ragged_input = generic_utils.to_list(tf.nest.flatten(is_ragged_input)) - if batch_size and not self._always_use_reshape or any(is_ragged_input): - # batch size matters, we currently do not handle mask explicitly, or if - # the layer always uses reshape approach, or the input is a ragged tensor. - return mask - inner_mask = mask - if inner_mask is not None: - inner_mask_shape = self._get_shape_tuple((-1,), mask, 2) - inner_mask = backend.reshape(inner_mask, inner_mask_shape) - inner_input_shape = tf.nest.map_structure( - lambda tensor: self._get_shape_tuple((-1,), tensor, 2), inputs) - inner_inputs = tf.__internal__.nest.map_structure_up_to( - inputs, tf.reshape, inputs, inner_input_shape) - output_mask = self.layer.compute_mask(inner_inputs, inner_mask) - if output_mask is None: - if mask is None: - return None - # input_mask is not None, and output_mask is None: - # we should return a not-None mask - output_mask = mask - for _ in range(2, len(backend.int_shape(mask))): - output_mask = backend.any(output_mask, axis=-1) - else: - # output_mask is not None. We need to reshape it - input_length = tf_utils.convert_shapes(input_shape) - input_length = tf.nest.flatten(input_length)[1] - if not input_length: - input_length = tf.nest.map_structure(lambda x: backend.shape(x)[1], - inputs) - input_length = tf.nest.flatten(input_length)[0] - output_mask_int_shape = backend.int_shape(output_mask) - if output_mask_int_shape is None: - # if the output_mask does not have a static shape, - # its shape must be the same as mask's - if mask is not None: - output_mask_int_shape = backend.int_shape(mask) + + def __init__(self, layer, **kwargs): + if not isinstance(layer, Layer): + raise ValueError( + "Please initialize `TimeDistributed` layer with a " + f"`tf.keras.layers.Layer` instance. Received: {layer}" + ) + super().__init__(layer, **kwargs) + self.supports_masking = True + + # It is safe to use the fast, reshape-based approach with all of our + # built-in Layers. + self._always_use_reshape = layer_utils.is_builtin_layer( + layer + ) and not getattr(layer, "stateful", False) + + def _get_shape_tuple(self, init_tuple, tensor, start_idx): + """Finds non-specific dimensions in the static shapes. + + The static shapes are replaced with the corresponding dynamic shapes of + the tensor. 
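+
+        For example (illustrative): with `init_tuple=(-1,)` and
+        `start_idx=1`, a tensor of static shape `(None, 10, 2)` yields
+        `(-1, 10, 2)`; a `None` at or after `start_idx` would be replaced
+        by the corresponding entry of `tf.shape(tensor)`.
+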
+        Args:
+            init_tuple: a tuple, the first part of the output shape
+            tensor: the tensor from which to get the (static and dynamic)
+                shapes as the last part of the output shape
+            start_idx: int, which indicates the first dimension to take from
+                the static shape of the tensor
+        Returns:
+            The new shape with the first part from `init_tuple` and the last
+            part from `tensor.shape`, where every `None` is replaced by the
+            corresponding dimension from `tf.shape(tensor)`.
+        """
+        # Replace every `None` in the static shape with the corresponding
+        # dynamic dimension obtained from `backend.shape`.
+        int_shape = backend.int_shape(tensor)[start_idx:]
+        if not any(s is None for s in int_shape):
+            return init_tuple + int_shape
+        shape = backend.shape(tensor)
+        int_shape = list(int_shape)
+        for i, s in enumerate(int_shape):
+            if s is None:
+                int_shape[i] = shape[start_idx + i]
+        return init_tuple + tuple(int_shape)
+
+    def _remove_timesteps(self, dims):
+        dims = dims.as_list()
+        return tf.TensorShape([dims[0]] + dims[2:])
+
+    def build(self, input_shape):
+        input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
+        input_dims = tf.nest.flatten(
+            tf.nest.map_structure(lambda x: x.ndims, input_shape)
+        )
+        if any(dim < 3 for dim in input_dims):
+            raise ValueError(
+                "`TimeDistributed` Layer should be passed an `input_shape ` "
+                f"with at least 3 dimensions, received: {input_shape}"
+            )
+        # Don't enforce the batch or time dimension.
+        self.input_spec = tf.nest.map_structure(
+            lambda x: InputSpec(shape=[None, None] + x.as_list()[2:]),
+            input_shape,
+        )
+        child_input_shape = tf.nest.map_structure(
+            self._remove_timesteps, input_shape
+        )
+        child_input_shape = tf_utils.convert_shapes(child_input_shape)
+        super().build(tuple(child_input_shape))
+        self.built = True
+
+    def compute_output_shape(self, input_shape):
+        input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
+
+        child_input_shape = tf.nest.map_structure(
+            self._remove_timesteps, input_shape
+        )
+        child_output_shape = self.layer.compute_output_shape(child_input_shape)
+        child_output_shape = tf_utils.convert_shapes(
+            child_output_shape, to_tuples=False
+        )
+        timesteps = tf_utils.convert_shapes(input_shape)
+        timesteps = tf.nest.flatten(timesteps)[1]
+
+        def insert_timesteps(dims):
+            dims = dims.as_list()
+            return tf.TensorShape([dims[0], timesteps] + dims[1:])
+
+        return tf.nest.map_structure(insert_timesteps, child_output_shape)
+
+    def call(self, inputs, training=None, mask=None):
+        kwargs = {}
+        if generic_utils.has_arg(self.layer.call, "training"):
+            kwargs["training"] = training
+
+        input_shape = tf.nest.map_structure(
+            lambda x: tf.TensorShape(backend.int_shape(x)), inputs
+        )
+        batch_size = tf_utils.convert_shapes(input_shape)
+        batch_size = tf.nest.flatten(batch_size)[0]
+        if batch_size and not self._always_use_reshape:
+            inputs, row_lengths = backend.convert_inputs_if_ragged(inputs)
+            is_ragged_input = row_lengths is not None
+            input_length = tf_utils.convert_shapes(input_shape)
+            input_length = tf.nest.flatten(input_length)[1]
+
+            # batch size matters, use rnn-based implementation
+            def step(x, _):
+                output = self.layer(x, **kwargs)
+                return output, []
+
+            _, outputs, _ = backend.rnn(
+                step,
+                inputs,
+                initial_states=[],
+                input_length=row_lengths[0]
+                if is_ragged_input
+                else input_length,
+                mask=mask,
+                unroll=False,
+            )
+
+            y = tf.nest.map_structure(
+                lambda output: backend.maybe_convert_to_ragged(
+                    is_ragged_input, output, row_lengths
+                ),
+                outputs,
+            )
+        else:
+            # No batch size specified, therefore the layer will be able
+            # to process batches of any size. 
+            # We can go with the reshape-based implementation for
+            # performance.
+            is_ragged_input = tf.nest.map_structure(
+                lambda x: isinstance(x, tf.RaggedTensor), inputs
+            )
+            is_ragged_input = tf.nest.flatten(is_ragged_input)
+            if all(is_ragged_input):
+                input_values = tf.nest.map_structure(lambda x: x.values, inputs)
+                input_row_lengths = tf.nest.map_structure(
+                    lambda x: x.nested_row_lengths()[0], inputs
+                )
+                y = self.layer(input_values, **kwargs)
+                y = tf.nest.map_structure(
+                    tf.RaggedTensor.from_row_lengths, y, input_row_lengths
+                )
+            elif any(is_ragged_input):
+                raise ValueError(
+                    "All inputs have to be either ragged or not, "
+                    f"but not mixed. Received: {inputs}"
+                )
+            else:
+                input_length = tf_utils.convert_shapes(input_shape)
+                input_length = tf.nest.flatten(input_length)[1]
+                if not input_length:
+                    input_length = tf.nest.map_structure(
+                        lambda x: tf.shape(x)[1], inputs
+                    )
+                    input_length = generic_utils.to_list(
+                        tf.nest.flatten(input_length)
+                    )[0]
+
+                inner_input_shape = tf.nest.map_structure(
+                    lambda x: self._get_shape_tuple((-1,), x, 2), inputs
+                )
+                # Shape: (num_samples * timesteps, ...).
+                inputs = tf.__internal__.nest.map_structure_up_to(
+                    inputs, tf.reshape, inputs, inner_input_shape
+                )
+                # (num_samples * timesteps, ...)
+                if (
+                    generic_utils.has_arg(self.layer.call, "mask")
+                    and mask is not None
+                ):
+                    inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
+                    kwargs["mask"] = backend.reshape(mask, inner_mask_shape)
+
+                y = self.layer(inputs, **kwargs)
+
+                # Reconstruct the output shape by re-splitting the 0th
+                # dimension back into (num_samples, timesteps, ...).
+                # We use batch_size when available so that the 0th dimension
+                # is set in the static shape of the reshaped output.
+                reshape_batch_size = batch_size if batch_size else -1
+                output_shape = tf.nest.map_structure(
+                    lambda tensor: self._get_shape_tuple(
+                        (reshape_batch_size, input_length), tensor, 1
+                    ),
+                    y,
+                )
+                y = tf.__internal__.nest.map_structure_up_to(
+                    y, tf.reshape, y, output_shape
+                )
+
+        return y
+
+    def compute_mask(self, inputs, mask=None):
+        """Computes an output mask tensor for the TimeDistributed layer.
+
+        This is based on the inputs, mask, and the inner layer.
+        If batch size is specified:
+            Simply return the input `mask`. (An rnn-based implementation
+            with more than one RNN input is required but not supported in
+            tf.keras yet.)
+        Otherwise we call `compute_mask` of the inner layer at each time
+        step. If the output mask at each time step is not `None`
+        (e.g., the inner layer is Masking or an RNN), concatenate all of
+        them and return the concatenation. If the output mask at each time
+        step is `None` and the input mask is not `None` (e.g., the inner
+        layer is Dense), reduce the input mask to 2 dimensions and return
+        it. Otherwise (both the output mask and the input mask are `None`,
+        e.g., `mask` is not used at all), return `None`.
+
+        Args:
+            inputs: Tensor with shape [batch size, timesteps, ...]
+                indicating the input to TimeDistributed. If static shape
+                information is available for "batch size", `mask` is
+                returned unmodified.
+            mask: Either None (indicating no masking) or a Tensor indicating
+                the input mask for TimeDistributed. The shape can be static
+                or dynamic.
+
+        Returns:
+            Either None (no masking), or a [batch size, timesteps, ...] 
+            Tensor with an output mask for the TimeDistributed layer, with
+            the shape beyond the second dimension being the value of the
+            input mask shape (if the computed output mask is None), an
+            output mask with the shape beyond the first dimension being the
+            value of the mask shape (if mask is not None), or an output
+            mask with the shape beyond the first dimension being the value
+            of the computed output shape.
+        """
+        # Cases that need to call layer.compute_mask even when input_mask
+        # is None: a Masking layer, or an Embedding layer with mask_zero.
+        input_shape = tf.nest.map_structure(
+            lambda x: tf.TensorShape(backend.int_shape(x)), inputs
+        )
+        input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
+        batch_size = tf_utils.convert_shapes(input_shape)
+        batch_size = tf.nest.flatten(batch_size)[0]
+        is_ragged_input = tf.nest.map_structure(
+            lambda x: isinstance(x, tf.RaggedTensor), inputs
+        )
+        is_ragged_input = generic_utils.to_list(
+            tf.nest.flatten(is_ragged_input)
+        )
+        if batch_size and not self._always_use_reshape or any(is_ragged_input):
+            # The batch size matters and we do not currently handle the mask
+            # explicitly, or the layer always uses the reshape approach, or
+            # the input is a ragged tensor.
+            return mask
+        inner_mask = mask
+        if inner_mask is not None:
+            inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
+            inner_mask = backend.reshape(inner_mask, inner_mask_shape)
+        inner_input_shape = tf.nest.map_structure(
+            lambda tensor: self._get_shape_tuple((-1,), tensor, 2), inputs
+        )
+        inner_inputs = tf.__internal__.nest.map_structure_up_to(
+            inputs, tf.reshape, inputs, inner_input_shape
+        )
+        output_mask = self.layer.compute_mask(inner_inputs, inner_mask)
+        if output_mask is None:
+            if mask is None:
+                return None
+            # input_mask is not None, and output_mask is None:
+            # we should return a not-None mask
+            output_mask = mask
+            for _ in range(2, len(backend.int_shape(mask))):
+                output_mask = backend.any(output_mask, axis=-1)
+        else:
+            # output_mask is not None. 
We need to reshape it + input_length = tf_utils.convert_shapes(input_shape) + input_length = tf.nest.flatten(input_length)[1] + if not input_length: + input_length = tf.nest.map_structure( + lambda x: backend.shape(x)[1], inputs + ) + input_length = tf.nest.flatten(input_length)[0] + reshape_batch_size = batch_size if batch_size else -1 + output_mask_shape = self._get_shape_tuple( + (reshape_batch_size, input_length), output_mask, 1 + ) + output_mask = backend.reshape(output_mask, output_mask_shape) + return output_mask diff --git a/keras/layers/rnn/time_distributed_test.py b/keras/layers/rnn/time_distributed_test.py index 74cce5b3a388..432fa3ad26f3 100644 --- a/keras/layers/rnn/time_distributed_test.py +++ b/keras/layers/rnn/time_distributed_test.py @@ -15,470 +15,560 @@ """Tests for TimeDistributed wrapper.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.training.tracking import util as trackable_util +# isort: off +from tensorflow.python.checkpoint import ( + checkpoint as trackable_util, +) class TimeDistributedTest(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_timedistributed_dense(self): - model = keras.models.Sequential() - model.add( - keras.layers.TimeDistributed( - keras.layers.Dense(2), input_shape=(3, 4))) - model.compile(optimizer='rmsprop', loss='mse') - model.fit( - np.random.random((10, 3, 4)), - np.random.random((10, 3, 2)), - epochs=1, - batch_size=10) - - # test config - model.get_config() - - # check whether the model variables are present in the - # trackable list of objects - checkpointed_object_ids = { - id(o) for o in trackable_util.list_objects(model) - } - for v in model.variables: - self.assertIn(id(v), checkpointed_object_ids) - - def test_timedistributed_static_batch_size(self): - model = keras.models.Sequential() - model.add( - keras.layers.TimeDistributed( - keras.layers.Dense(2), input_shape=(3, 4), batch_size=10)) - model.compile(optimizer='rmsprop', loss='mse') - model.fit( - np.random.random((10, 3, 4)), - np.random.random((10, 3, 2)), - epochs=1, - batch_size=10) - - def test_timedistributed_invalid_init(self): - x = tf.constant(np.zeros((1, 1)).astype('float32')) - with self.assertRaisesRegex( - ValueError, 'Please initialize `TimeDistributed` layer with a ' - '`tf.keras.layers.Layer` instance.'): - keras.layers.TimeDistributed(x) - - def test_timedistributed_conv2d(self): - with self.cached_session(): - model = keras.models.Sequential() - model.add( - keras.layers.TimeDistributed( - keras.layers.Conv2D(5, (2, 2), padding='same'), - input_shape=(2, 4, 4, 3))) - model.add(keras.layers.Activation('relu')) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch( - np.random.random((1, 2, 4, 4, 3)), np.random.random((1, 2, 4, 4, 5))) - - model = keras.models.model_from_json(model.to_json()) - model.summary() - - def test_timedistributed_stacked(self): - with self.cached_session(): - model = keras.models.Sequential() - model.add( - keras.layers.TimeDistributed( - keras.layers.Dense(2), input_shape=(3, 4))) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - model.add(keras.layers.Activation('relu')) - model.compile(optimizer='rmsprop', loss='mse') - - model.fit( - np.random.random((10, 3, 4)), - np.random.random((10, 
3, 3)), - epochs=1, - batch_size=10) - - def test_regularizers(self): - with self.cached_session(): - model = keras.models.Sequential() - model.add( - keras.layers.TimeDistributed( - keras.layers.Dense(2, kernel_regularizer='l1', - activity_regularizer='l1'), - input_shape=(3, 4))) - model.add(keras.layers.Activation('relu')) - model.compile(optimizer='rmsprop', loss='mse') - self.assertEqual(len(model.losses), 2) - - def test_TimeDistributed_learning_phase(self): - with self.cached_session(): - # test layers that need learning_phase to be set - np.random.seed(1234) - x = keras.layers.Input(shape=(3, 2)) - y = keras.layers.TimeDistributed(keras.layers.Dropout(.999))( - x, training=True) - model = keras.models.Model(x, y) - y = model.predict(np.random.random((10, 3, 2))) - self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1) - - def test_TimeDistributed_batchnorm(self): - with self.cached_session(): - # test that wrapped BN updates still work. - model = keras.models.Sequential() - model.add(keras.layers.TimeDistributed( - keras.layers.BatchNormalization(center=True, scale=True), - name='bn', - input_shape=(10, 2))) - model.compile(optimizer='rmsprop', loss='mse') - # Assert that mean and variance are 0 and 1. - td = model.layers[0] - self.assertAllClose(td.get_weights()[2], np.array([0, 0])) - assert np.array_equal(td.get_weights()[3], np.array([1, 1])) - # Train - model.train_on_batch(np.random.normal(loc=2, scale=2, size=(1, 10, 2)), - np.broadcast_to(np.array([0, 1]), (1, 10, 2))) - # Assert that mean and variance changed. - assert not np.array_equal(td.get_weights()[2], np.array([0, 0])) - assert not np.array_equal(td.get_weights()[3], np.array([1, 1])) - - def test_TimeDistributed_trainable(self): - # test layers that need learning_phase to be set - x = keras.layers.Input(shape=(3, 2)) - layer = keras.layers.TimeDistributed(keras.layers.BatchNormalization()) - _ = layer(x) - self.assertEqual(len(layer.trainable_weights), 2) - layer.trainable = False - assert not layer.trainable_weights - layer.trainable = True - assert len(layer.trainable_weights) == 2 - - def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self): - with self.cached_session(): - # test with unspecified shape and Embeddings with mask_zero - model = keras.models.Sequential() - model.add(keras.layers.TimeDistributed( - keras.layers.Embedding(5, 6, mask_zero=True), - input_shape=(None, None))) # N by t_1 by t_2 by 6 - model.add(keras.layers.TimeDistributed( - keras.layers.SimpleRNN(7, return_sequences=True))) - model.add(keras.layers.TimeDistributed( - keras.layers.SimpleRNN(8, return_sequences=False))) - model.add(keras.layers.SimpleRNN(1, return_sequences=False)) - model.compile(optimizer='rmsprop', loss='mse') - model_input = np.random.randint(low=1, high=5, size=(10, 3, 4), - dtype='int32') - for i in range(4): - model_input[i, i:, i:] = 0 - model.fit(model_input, - np.random.random((10, 1)), epochs=1, batch_size=10) - mask_outputs = [model.layers[0].compute_mask(model.input)] - for layer in model.layers[1:]: - mask_outputs.append(layer.compute_mask(layer.input, mask_outputs[-1])) - func = keras.backend.function([model.input], mask_outputs[:-1]) - mask_outputs_val = func([model_input]) - ref_mask_val_0 = model_input > 0 # embedding layer - ref_mask_val_1 = ref_mask_val_0 # first RNN layer - ref_mask_val_2 = np.any(ref_mask_val_1, axis=-1) # second RNN layer - ref_mask_val = [ref_mask_val_0, ref_mask_val_1, ref_mask_val_2] - for i in range(3): - self.assertAllEqual(mask_outputs_val[i], 
ref_mask_val[i]) - self.assertIs(mask_outputs[-1], None) # final layer - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_TimeDistributed_with_masking_layer(self): - # test with Masking layer - model = keras.models.Sequential() - model.add( - keras.layers.TimeDistributed( - keras.layers.Masking(mask_value=0.,), input_shape=(None, 4))) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(5))) - model.compile(optimizer='rmsprop', loss='mse') - model_input = np.random.randint(low=1, high=5, size=(10, 3, 4)) - for i in range(4): - model_input[i, i:, :] = 0. - model.compile(optimizer='rmsprop', loss='mse') - model.fit(model_input, np.random.random((10, 3, 5)), epochs=1, batch_size=6) - mask_outputs = [model.layers[0].compute_mask(model.input)] - mask_outputs += [ - model.layers[1].compute_mask(model.layers[1].input, mask_outputs[-1]) - ] - func = keras.backend.function([model.input], mask_outputs) - mask_outputs_val = func([model_input]) - self.assertEqual((mask_outputs_val[0]).all(), model_input.all()) - self.assertEqual((mask_outputs_val[1]).all(), model_input.all()) - - def test_TimeDistributed_with_different_time_shapes(self): - time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5)) - ph_1 = keras.backend.placeholder(shape=(None, 10, 13)) - out_1 = time_dist(ph_1) - self.assertEqual(out_1.shape.as_list(), [None, 10, 5]) - - ph_2 = keras.backend.placeholder(shape=(None, 1, 13)) - out_2 = time_dist(ph_2) - self.assertEqual(out_2.shape.as_list(), [None, 1, 5]) - - ph_3 = keras.backend.placeholder(shape=(None, 1, 18)) - with self.assertRaisesRegex(ValueError, 'is incompatible with'): - time_dist(ph_3) - - def test_TimeDistributed_with_invalid_dimensions(self): - time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5)) - ph = keras.backend.placeholder(shape=(None, 10)) - with self.assertRaisesRegex( - ValueError, - '`TimeDistributed` Layer should be passed an `input_shape `'): - time_dist(ph) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_TimeDistributed_reshape(self): - - class NoReshapeLayer(keras.layers.Layer): - - def call(self, inputs): - return inputs - - # Built-in layers that aren't stateful use the reshape implementation. - td1 = keras.layers.TimeDistributed(keras.layers.Dense(5)) - self.assertTrue(td1._always_use_reshape) - - # Built-in layers that are stateful don't use the reshape implementation. - td2 = keras.layers.TimeDistributed( - keras.layers.RNN(keras.layers.SimpleRNNCell(10), stateful=True)) - self.assertFalse(td2._always_use_reshape) - - # Custom layers are not allowlisted for the fast reshape implementation. 
- td3 = keras.layers.TimeDistributed(NoReshapeLayer()) - self.assertFalse(td3._always_use_reshape) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_TimeDistributed_output_shape_return_types(self): - - class TestLayer(keras.layers.Layer): - - def call(self, inputs): - return tf.concat([inputs, inputs], axis=-1) - - def compute_output_shape(self, input_shape): - output_shape = tf.TensorShape(input_shape).as_list() - output_shape[-1] = output_shape[-1] * 2 - output_shape = tf.TensorShape(output_shape) - return output_shape - - class TestListLayer(TestLayer): - - def compute_output_shape(self, input_shape): - shape = super().compute_output_shape(input_shape) - return shape.as_list() - - class TestTupleLayer(TestLayer): - - def compute_output_shape(self, input_shape): - shape = super().compute_output_shape(input_shape) - return tuple(shape.as_list()) - - # Layers can specify output shape as list/tuple/TensorShape - test_layers = [TestLayer, TestListLayer, TestTupleLayer] - for layer in test_layers: - input_layer = keras.layers.TimeDistributed(layer()) - inputs = keras.backend.placeholder(shape=(None, 2, 4)) - output = input_layer(inputs) - self.assertEqual(output.shape.as_list(), [None, 2, 8]) - self.assertEqual( - input_layer.compute_output_shape([None, 2, 4]).as_list(), - [None, 2, 8]) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - # TODO(scottzhu): check why v1 session failed. - def test_TimeDistributed_with_mask_first_implementation(self): - np.random.seed(100) - rnn_layer = keras.layers.LSTM(4, return_sequences=True, stateful=True) - - data = np.array([[[[1.0], [1.0]], [[0.0], [1.0]]], - [[[1.0], [0.0]], [[1.0], [1.0]]], - [[[1.0], [0.0]], [[1.0], [1.0]]]]) - x = keras.layers.Input(shape=(2, 2, 1), batch_size=3) - x_masking = keras.layers.Masking()(x) - y = keras.layers.TimeDistributed(rnn_layer)(x_masking) - model_1 = keras.models.Model(x, y) - model_1.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - output_with_mask = model_1.predict(data, steps=1) - - y = keras.layers.TimeDistributed(rnn_layer)(x) - model_2 = keras.models.Model(x, y) - model_2.compile( - 'rmsprop', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - output = model_2.predict(data, steps=1) - - self.assertNotAllClose(output_with_mask, output, atol=1e-7) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - *test_utils.generate_combinations_with_testcase_name( - layer=[keras.layers.LSTM, - keras.layers.Dense])) - def test_TimeDistributed_with_ragged_input(self, layer): - if tf.executing_eagerly(): - self.skipTest('b/143103634') - np.random.seed(100) - layer = layer(4) - ragged_data = tf.ragged.constant( - [[[[1.0], [1.0]], [[2.0], [2.0]]], - [[[4.0], [4.0]], [[5.0], [5.0]], [[6.0], [6.0]]], - [[[7.0], [7.0]], [[8.0], [8.0]], [[9.0], [9.0]]]], - ragged_rank=1) - - x_ragged = keras.Input(shape=(None, 2, 1), dtype='float32', ragged=True) - y_ragged = keras.layers.TimeDistributed(layer)(x_ragged) - model_1 = keras.models.Model(x_ragged, y_ragged) - model_1._run_eagerly = test_utils.should_run_eagerly() - output_ragged = model_1.predict(ragged_data, steps=1) - - x_dense = keras.Input(shape=(None, 2, 1), dtype='float32') - masking = keras.layers.Masking()(x_dense) - y_dense = keras.layers.TimeDistributed(layer)(masking) - model_2 = keras.models.Model(x_dense, y_dense) - dense_data = ragged_data.to_tensor() - model_2._run_eagerly = test_utils.should_run_eagerly() - output_dense = 
model_2.predict(dense_data, steps=1) - - output_ragged = convert_ragged_tensor_value(output_ragged) - self.assertAllEqual(output_ragged.to_tensor(), output_dense) - - @test_combinations.run_all_keras_modes - def test_TimeDistributed_with_ragged_input_with_batch_size(self): - np.random.seed(100) - layer = keras.layers.Dense(16) - - ragged_data = tf.ragged.constant( - [[[[1.0], [1.0]], [[2.0], [2.0]]], - [[[4.0], [4.0]], [[5.0], [5.0]], [[6.0], [6.0]]], - [[[7.0], [7.0]], [[8.0], [8.0]], [[9.0], [9.0]]]], - ragged_rank=1) - - # Use the first implementation by specifying batch_size - x_ragged = keras.Input(shape=(None, 2, 1), batch_size=3, dtype='float32', - ragged=True) - y_ragged = keras.layers.TimeDistributed(layer)(x_ragged) - model_1 = keras.models.Model(x_ragged, y_ragged) - output_ragged = model_1.predict(ragged_data, steps=1) - - x_dense = keras.Input(shape=(None, 2, 1), batch_size=3, dtype='float32') - masking = keras.layers.Masking()(x_dense) - y_dense = keras.layers.TimeDistributed(layer)(masking) - model_2 = keras.models.Model(x_dense, y_dense) - dense_data = ragged_data.to_tensor() - output_dense = model_2.predict(dense_data, steps=1) - - output_ragged = convert_ragged_tensor_value(output_ragged) - self.assertAllEqual(output_ragged.to_tensor(), output_dense) - - def test_TimeDistributed_set_static_shape(self): - layer = keras.layers.TimeDistributed(keras.layers.Conv2D(16, (3, 3))) - inputs = keras.Input(batch_shape=(1, None, 32, 32, 1)) - outputs = layer(inputs) - # Make sure the batch dim is not lost after array_ops.reshape. - self.assertListEqual(outputs.shape.as_list(), [1, None, 30, 30, 16]) - - @test_combinations.run_all_keras_modes - def test_TimeDistributed_with_mimo(self): - dense_1 = keras.layers.Dense(8) - dense_2 = keras.layers.Dense(16) - - class TestLayer(keras.layers.Layer): - - def __init__(self): - super().__init__() - self.dense_1 = dense_1 - self.dense_2 = dense_2 - - def call(self, inputs): - return self.dense_1(inputs[0]), self.dense_2(inputs[1]) - - def compute_output_shape(self, input_shape): - output_shape_1 = self.dense_1.compute_output_shape(input_shape[0]) - output_shape_2 = self.dense_2.compute_output_shape(input_shape[1]) - return output_shape_1, output_shape_2 - - np.random.seed(100) - layer = TestLayer() - - data_1 = tf.constant([[[[1.0], [1.0]], [[2.0], [2.0]]], - [[[4.0], [4.0]], [[5.0], [5.0]]], - [[[7.0], [7.0]], [[8.0], [8.0]]]]) - - data_2 = tf.constant([[[[1.0], [1.0]], [[2.0], [2.0]]], - [[[4.0], [4.0]], [[5.0], [5.0]]], - [[[7.0], [7.0]], [[8.0], [8.0]]]]) - - x1 = keras.Input(shape=(None, 2, 1), dtype='float32') - x2 = keras.Input(shape=(None, 2, 1), dtype='float32') - y1, y2 = keras.layers.TimeDistributed(layer)([x1, x2]) - model_1 = keras.models.Model([x1, x2], [y1, y2]) - model_1.compile( - optimizer='rmsprop', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - output_1 = model_1.predict((data_1, data_2), steps=1) - - y1 = dense_1(x1) - y2 = dense_2(x2) - model_2 = keras.models.Model([x1, x2], [y1, y2]) - output_2 = model_2.predict((data_1, data_2), steps=1) - - self.assertAllClose(output_1, output_2) - - model_1.fit( - x=[np.random.random((10, 2, 2, 1)), - np.random.random((10, 2, 2, 1))], - y=[np.random.random((10, 2, 2, 8)), - np.random.random((10, 2, 2, 16))], - epochs=1, - batch_size=3) - - def test_TimeDistributed_Attention(self): - query_input = keras.layers.Input(shape=(None, 1, 10), dtype='float32') - value_input = keras.layers.Input(shape=(None, 4, 10), dtype='float32') - - # Query-value attention of shape 
[batch_size, Tq, filters]. - query_value_attention_seq = keras.layers.TimeDistributed( - keras.layers.Attention())([query_input, value_input]) - model = keras.models.Model([query_input, value_input], - query_value_attention_seq) - model.compile(optimizer='rmsprop', loss='mse') - model.fit( - [np.random.random((10, 8, 1, 10)), - np.random.random((10, 8, 4, 10))], - np.random.random((10, 8, 1, 10)), - epochs=1, - batch_size=10) - - # test config and serialization/deserialization - model.get_config() - model = keras.models.model_from_json(model.to_json()) - model.summary() + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_timedistributed_dense(self): + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(2), input_shape=(3, 4) + ) + ) + model.compile(optimizer="rmsprop", loss="mse") + model.fit( + np.random.random((10, 3, 4)), + np.random.random((10, 3, 2)), + epochs=1, + batch_size=10, + ) + + # test config + model.get_config() + + # check whether the model variables are present in the + # trackable list of objects + checkpointed_object_ids = { + id(o) for o in trackable_util.list_objects(model) + } + for v in model.variables: + self.assertIn(id(v), checkpointed_object_ids) + + def test_timedistributed_static_batch_size(self): + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(2), input_shape=(3, 4), batch_size=10 + ) + ) + model.compile(optimizer="rmsprop", loss="mse") + model.fit( + np.random.random((10, 3, 4)), + np.random.random((10, 3, 2)), + epochs=1, + batch_size=10, + ) + + def test_timedistributed_invalid_init(self): + x = tf.constant(np.zeros((1, 1)).astype("float32")) + with self.assertRaisesRegex( + ValueError, + "Please initialize `TimeDistributed` layer with a " + "`tf.keras.layers.Layer` instance.", + ): + keras.layers.TimeDistributed(x) + + def test_timedistributed_conv2d(self): + with self.cached_session(): + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Conv2D(5, (2, 2), padding="same"), + input_shape=(2, 4, 4, 3), + ) + ) + model.add(keras.layers.Activation("relu")) + model.compile(optimizer="rmsprop", loss="mse") + model.train_on_batch( + np.random.random((1, 2, 4, 4, 3)), + np.random.random((1, 2, 4, 4, 5)), + ) + + model = keras.models.model_from_json(model.to_json()) + model.summary() + + def test_timedistributed_stacked(self): + with self.cached_session(): + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(2), input_shape=(3, 4) + ) + ) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) + model.add(keras.layers.Activation("relu")) + model.compile(optimizer="rmsprop", loss="mse") + + model.fit( + np.random.random((10, 3, 4)), + np.random.random((10, 3, 3)), + epochs=1, + batch_size=10, + ) + + def test_regularizers(self): + with self.cached_session(): + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense( + 2, kernel_regularizer="l1", activity_regularizer="l1" + ), + input_shape=(3, 4), + ) + ) + model.add(keras.layers.Activation("relu")) + model.compile(optimizer="rmsprop", loss="mse") + self.assertEqual(len(model.losses), 2) + + def test_TimeDistributed_learning_phase(self): + with self.cached_session(): + keras.utils.set_random_seed(0) + x = keras.layers.Input(shape=(3, 2)) + y = keras.layers.TimeDistributed(keras.layers.Dropout(0.999))( + x, training=True + ) + 
model = keras.models.Model(x, y) + y = model.predict(np.random.random((10, 3, 2))) + self.assertAllClose(np.mean(y), 0.0, atol=1e-1, rtol=1e-1) + + def test_TimeDistributed_batchnorm(self): + with self.cached_session(): + # test that wrapped BN updates still work. + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.BatchNormalization(center=True, scale=True), + name="bn", + input_shape=(10, 2), + ) + ) + model.compile(optimizer="rmsprop", loss="mse") + # Assert that mean and variance are 0 and 1. + td = model.layers[0] + self.assertAllClose(td.get_weights()[2], np.array([0, 0])) + assert np.array_equal(td.get_weights()[3], np.array([1, 1])) + # Train + model.train_on_batch( + np.random.normal(loc=2, scale=2, size=(1, 10, 2)), + np.broadcast_to(np.array([0, 1]), (1, 10, 2)), + ) + # Assert that mean and variance changed. + assert not np.array_equal(td.get_weights()[2], np.array([0, 0])) + assert not np.array_equal(td.get_weights()[3], np.array([1, 1])) + + def test_TimeDistributed_trainable(self): + # test layers that need learning_phase to be set + x = keras.layers.Input(shape=(3, 2)) + layer = keras.layers.TimeDistributed(keras.layers.BatchNormalization()) + _ = layer(x) + self.assertEqual(len(layer.trainable_weights), 2) + layer.trainable = False + assert not layer.trainable_weights + layer.trainable = True + assert len(layer.trainable_weights) == 2 + + def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self): + with self.cached_session(): + # test with unspecified shape and Embeddings with mask_zero + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Embedding(5, 6, mask_zero=True), + input_shape=(None, None), + ) + ) # N by t_1 by t_2 by 6 + model.add( + keras.layers.TimeDistributed( + keras.layers.SimpleRNN(7, return_sequences=True) + ) + ) + model.add( + keras.layers.TimeDistributed( + keras.layers.SimpleRNN(8, return_sequences=False) + ) + ) + model.add(keras.layers.SimpleRNN(1, return_sequences=False)) + model.compile(optimizer="rmsprop", loss="mse") + model_input = np.random.randint( + low=1, high=5, size=(10, 3, 4), dtype="int32" + ) + for i in range(4): + model_input[i, i:, i:] = 0 + model.fit( + model_input, np.random.random((10, 1)), epochs=1, batch_size=10 + ) + mask_outputs = [model.layers[0].compute_mask(model.input)] + for layer in model.layers[1:]: + mask_outputs.append( + layer.compute_mask(layer.input, mask_outputs[-1]) + ) + func = keras.backend.function([model.input], mask_outputs[:-1]) + mask_outputs_val = func([model_input]) + ref_mask_val_0 = model_input > 0 # embedding layer + ref_mask_val_1 = ref_mask_val_0 # first RNN layer + ref_mask_val_2 = np.any(ref_mask_val_1, axis=-1) # second RNN layer + ref_mask_val = [ref_mask_val_0, ref_mask_val_1, ref_mask_val_2] + for i in range(3): + self.assertAllEqual(mask_outputs_val[i], ref_mask_val[i]) + self.assertIs(mask_outputs[-1], None) # final layer + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_TimeDistributed_with_masking_layer(self): + # test with Masking layer + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Masking( + mask_value=0.0, + ), + input_shape=(None, 4), + ) + ) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(5))) + model.compile(optimizer="rmsprop", loss="mse") + model_input = np.random.randint(low=1, high=5, size=(10, 3, 4)) + for i in range(4): + model_input[i, i:, :] = 0.0 + 
model.compile(optimizer="rmsprop", loss="mse") + model.fit( + model_input, np.random.random((10, 3, 5)), epochs=1, batch_size=6 + ) + mask_outputs = [model.layers[0].compute_mask(model.input)] + mask_outputs += [ + model.layers[1].compute_mask( + model.layers[1].input, mask_outputs[-1] + ) + ] + func = keras.backend.function([model.input], mask_outputs) + mask_outputs_val = func([model_input]) + self.assertEqual((mask_outputs_val[0]).all(), model_input.all()) + self.assertEqual((mask_outputs_val[1]).all(), model_input.all()) + + def test_TimeDistributed_with_different_time_shapes(self): + time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5)) + ph_1 = keras.backend.placeholder(shape=(None, 10, 13)) + out_1 = time_dist(ph_1) + self.assertEqual(out_1.shape.as_list(), [None, 10, 5]) + + ph_2 = keras.backend.placeholder(shape=(None, 1, 13)) + out_2 = time_dist(ph_2) + self.assertEqual(out_2.shape.as_list(), [None, 1, 5]) + + ph_3 = keras.backend.placeholder(shape=(None, 1, 18)) + with self.assertRaisesRegex(ValueError, "is incompatible with"): + time_dist(ph_3) + + def test_TimeDistributed_with_invalid_dimensions(self): + time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5)) + ph = keras.backend.placeholder(shape=(None, 10)) + with self.assertRaisesRegex( + ValueError, + "`TimeDistributed` Layer should be passed an `input_shape `", + ): + time_dist(ph) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_TimeDistributed_reshape(self): + class NoReshapeLayer(keras.layers.Layer): + def call(self, inputs): + return inputs + + # Built-in layers that aren't stateful use the reshape implementation. + td1 = keras.layers.TimeDistributed(keras.layers.Dense(5)) + self.assertTrue(td1._always_use_reshape) + + # Built-in layers that are stateful don't use the reshape + # implementation. + td2 = keras.layers.TimeDistributed( + keras.layers.RNN(keras.layers.SimpleRNNCell(10), stateful=True) + ) + self.assertFalse(td2._always_use_reshape) + + # Custom layers are not allowlisted for the fast reshape implementation. 
+ td3 = keras.layers.TimeDistributed(NoReshapeLayer()) + self.assertFalse(td3._always_use_reshape) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + ("fully_defined", [3, 2, 4], [3, 2, 8]), + ("dynamic_batch_size", [None, 2, 4], [None, 2, 8]), + ("two_dynamic_dims", [None, None, 4], [None, None, 8]), + ("rank_only", [None, None, None], [None, None, None]), + ) + def test_TimeDistributed_output_shape_return_types( + self, input_shape, expected_output_shape + ): + class TestLayer(keras.layers.Layer): + def call(self, inputs): + return tf.concat([inputs, inputs], axis=-1) + + def compute_output_shape(self, input_shape): + output_shape = tf.TensorShape(input_shape).as_list() + if output_shape[-1] is not None: + output_shape[-1] = output_shape[-1] * 2 + output_shape = tf.TensorShape(output_shape) + return output_shape + + class TestListLayer(TestLayer): + def compute_output_shape(self, input_shape): + shape = super().compute_output_shape(input_shape) + return shape.as_list() + + class TestTupleLayer(TestLayer): + def compute_output_shape(self, input_shape): + shape = super().compute_output_shape(input_shape) + return tuple(shape.as_list()) + + # Layers can specify output shape as list/tuple/TensorShape + test_layers = [TestLayer, TestListLayer, TestTupleLayer] + for layer in test_layers: + input_layer = keras.layers.TimeDistributed(layer()) + inputs = keras.backend.placeholder(shape=input_shape) + output = input_layer(inputs) + self.assertEqual(output.shape.as_list(), expected_output_shape) + self.assertEqual( + input_layer.compute_output_shape(input_shape).as_list(), + expected_output_shape, + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + # TODO(scottzhu): check why v1 session failed. + def test_TimeDistributed_with_mask_first_implementation(self): + np.random.seed(100) + rnn_layer = keras.layers.LSTM(4, return_sequences=True, stateful=True) + + data = np.array( + [ + [[[1.0], [1.0]], [[0.0], [1.0]]], + [[[1.0], [0.0]], [[1.0], [1.0]]], + [[[1.0], [0.0]], [[1.0], [1.0]]], + ] + ) + x = keras.layers.Input(shape=(2, 2, 1), batch_size=3) + x_masking = keras.layers.Masking()(x) + y = keras.layers.TimeDistributed(rnn_layer)(x_masking) + model_1 = keras.models.Model(x, y) + model_1.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + output_with_mask = model_1.predict(data, steps=1) + + y = keras.layers.TimeDistributed(rnn_layer)(x) + model_2 = keras.models.Model(x, y) + model_2.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + output = model_2.predict(data, steps=1) + + self.assertNotAllClose(output_with_mask, output, atol=1e-7) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + *test_utils.generate_combinations_with_testcase_name( + layer=[keras.layers.LSTM, keras.layers.Dense] + ) + ) + def test_TimeDistributed_with_ragged_input(self, layer): + if tf.executing_eagerly(): + self.skipTest("b/143103634") + np.random.seed(100) + layer = layer(4) + ragged_data = tf.ragged.constant( + [ + [[[1.0], [1.0]], [[2.0], [2.0]]], + [[[4.0], [4.0]], [[5.0], [5.0]], [[6.0], [6.0]]], + [[[7.0], [7.0]], [[8.0], [8.0]], [[9.0], [9.0]]], + ], + ragged_rank=1, + ) + + x_ragged = keras.Input(shape=(None, 2, 1), dtype="float32", ragged=True) + y_ragged = keras.layers.TimeDistributed(layer)(x_ragged) + model_1 = keras.models.Model(x_ragged, y_ragged) + model_1._run_eagerly = test_utils.should_run_eagerly() + output_ragged = model_1.predict(ragged_data, steps=1) + + x_dense = 
keras.Input(shape=(None, 2, 1), dtype="float32") + masking = keras.layers.Masking()(x_dense) + y_dense = keras.layers.TimeDistributed(layer)(masking) + model_2 = keras.models.Model(x_dense, y_dense) + dense_data = ragged_data.to_tensor() + model_2._run_eagerly = test_utils.should_run_eagerly() + output_dense = model_2.predict(dense_data, steps=1) + + output_ragged = convert_ragged_tensor_value(output_ragged) + self.assertAllEqual(output_ragged.to_tensor(), output_dense) + + @test_combinations.run_all_keras_modes + def test_TimeDistributed_with_ragged_input_with_batch_size(self): + np.random.seed(100) + layer = keras.layers.Dense(16) + + ragged_data = tf.ragged.constant( + [ + [[[1.0], [1.0]], [[2.0], [2.0]]], + [[[4.0], [4.0]], [[5.0], [5.0]], [[6.0], [6.0]]], + [[[7.0], [7.0]], [[8.0], [8.0]], [[9.0], [9.0]]], + ], + ragged_rank=1, + ) + + # Use the first implementation by specifying batch_size + x_ragged = keras.Input( + shape=(None, 2, 1), batch_size=3, dtype="float32", ragged=True + ) + y_ragged = keras.layers.TimeDistributed(layer)(x_ragged) + model_1 = keras.models.Model(x_ragged, y_ragged) + output_ragged = model_1.predict(ragged_data, steps=1) + + x_dense = keras.Input(shape=(None, 2, 1), batch_size=3, dtype="float32") + masking = keras.layers.Masking()(x_dense) + y_dense = keras.layers.TimeDistributed(layer)(masking) + model_2 = keras.models.Model(x_dense, y_dense) + dense_data = ragged_data.to_tensor() + output_dense = model_2.predict(dense_data, steps=1) + + output_ragged = convert_ragged_tensor_value(output_ragged) + self.assertAllEqual(output_ragged.to_tensor(), output_dense) + + def test_TimeDistributed_set_static_shape(self): + layer = keras.layers.TimeDistributed(keras.layers.Conv2D(16, (3, 3))) + inputs = keras.Input(batch_shape=(1, None, 32, 32, 1)) + outputs = layer(inputs) + # Make sure the batch dim is not lost after array_ops.reshape. 
+ self.assertListEqual(outputs.shape.as_list(), [1, None, 30, 30, 16]) + + @test_combinations.run_all_keras_modes + def test_TimeDistributed_with_mimo(self): + dense_1 = keras.layers.Dense(8) + dense_2 = keras.layers.Dense(16) + + class TestLayer(keras.layers.Layer): + def __init__(self): + super().__init__() + self.dense_1 = dense_1 + self.dense_2 = dense_2 + + def call(self, inputs): + return self.dense_1(inputs[0]), self.dense_2(inputs[1]) + + def compute_output_shape(self, input_shape): + output_shape_1 = self.dense_1.compute_output_shape( + input_shape[0] + ) + output_shape_2 = self.dense_2.compute_output_shape( + input_shape[1] + ) + return output_shape_1, output_shape_2 + + np.random.seed(100) + layer = TestLayer() + + data_1 = tf.constant( + [ + [[[1.0], [1.0]], [[2.0], [2.0]]], + [[[4.0], [4.0]], [[5.0], [5.0]]], + [[[7.0], [7.0]], [[8.0], [8.0]]], + ] + ) + + data_2 = tf.constant( + [ + [[[1.0], [1.0]], [[2.0], [2.0]]], + [[[4.0], [4.0]], [[5.0], [5.0]]], + [[[7.0], [7.0]], [[8.0], [8.0]]], + ] + ) + + x1 = keras.Input(shape=(None, 2, 1), dtype="float32") + x2 = keras.Input(shape=(None, 2, 1), dtype="float32") + y1, y2 = keras.layers.TimeDistributed(layer)([x1, x2]) + model_1 = keras.models.Model([x1, x2], [y1, y2]) + model_1.compile( + optimizer="rmsprop", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + output_1 = model_1.predict((data_1, data_2), steps=1) + + y1 = dense_1(x1) + y2 = dense_2(x2) + model_2 = keras.models.Model([x1, x2], [y1, y2]) + output_2 = model_2.predict((data_1, data_2), steps=1) + + self.assertAllClose(output_1, output_2) + + model_1.fit( + x=[ + np.random.random((10, 2, 2, 1)), + np.random.random((10, 2, 2, 1)), + ], + y=[ + np.random.random((10, 2, 2, 8)), + np.random.random((10, 2, 2, 16)), + ], + epochs=1, + batch_size=3, + ) + + def test_TimeDistributed_Attention(self): + query_input = keras.layers.Input(shape=(None, 1, 10), dtype="float32") + value_input = keras.layers.Input(shape=(None, 4, 10), dtype="float32") + + # Query-value attention of shape [batch_size, Tq, filters]. 
+ query_value_attention_seq = keras.layers.TimeDistributed( + keras.layers.Attention() + )([query_input, value_input]) + model = keras.models.Model( + [query_input, value_input], query_value_attention_seq + ) + model.compile(optimizer="rmsprop", loss="mse") + model.fit( + [ + np.random.random((10, 8, 1, 10)), + np.random.random((10, 8, 4, 10)), + ], + np.random.random((10, 8, 1, 10)), + epochs=1, + batch_size=10, + ) + + # test config and serialization/deserialization + model.get_config() + model = keras.models.model_from_json(model.to_json()) + model.summary() def convert_ragged_tensor_value(inputs): - if isinstance(inputs, tf.compat.v1.ragged.RaggedTensorValue): - flat_values = tf.convert_to_tensor( - value=inputs.flat_values, - name='flat_values') - return tf.RaggedTensor.from_nested_row_splits( - flat_values, inputs.nested_row_splits, validate=False) - return inputs - - -if __name__ == '__main__': - tf.test.main() + if isinstance(inputs, tf.compat.v1.ragged.RaggedTensorValue): + flat_values = tf.convert_to_tensor( + value=inputs.flat_values, name="flat_values" + ) + return tf.RaggedTensor.from_nested_row_splits( + flat_values, inputs.nested_row_splits, validate=False + ) + return inputs + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/serialization.py b/keras/layers/serialization.py index f0f3b6629bfe..e35761b5b273 100644 --- a/keras/layers/serialization.py +++ b/keras/layers/serialization.py @@ -14,10 +14,10 @@ # ============================================================================== """Layer serialization/deserialization functions.""" +import threading + import tensorflow.compat.v2 as tf -# pylint: disable=g-bad-import-order,g-direct-tensorflow-import,unused-import,wildcard-import -import threading from keras.engine import base_layer from keras.engine import input_layer from keras.engine import input_spec @@ -27,199 +27,273 @@ from keras.layers import core from keras.layers import locally_connected from keras.layers import merging -from keras.layers import noise from keras.layers import pooling from keras.layers import regularization from keras.layers import reshaping from keras.layers import rnn -from keras.layers.rnn import cell_wrappers -from keras.layers.rnn import gru -from keras.layers.rnn import lstm from keras.layers.normalization import batch_normalization from keras.layers.normalization import batch_normalization_v1 +from keras.layers.normalization import group_normalization from keras.layers.normalization import layer_normalization from keras.layers.normalization import unit_normalization from keras.layers.preprocessing import category_encoding from keras.layers.preprocessing import discretization -from keras.layers.preprocessing import hashing from keras.layers.preprocessing import hashed_crossing +from keras.layers.preprocessing import hashing from keras.layers.preprocessing import image_preprocessing from keras.layers.preprocessing import integer_lookup -from keras.layers.preprocessing import normalization as preprocessing_normalization +from keras.layers.preprocessing import ( + normalization as preprocessing_normalization, +) from keras.layers.preprocessing import string_lookup from keras.layers.preprocessing import text_vectorization -from keras.saving.saved_model import json_utils +from keras.layers.rnn import cell_wrappers +from keras.layers.rnn import gru +from keras.layers.rnn import lstm +from keras.metrics import base_metric +from keras.saving import serialization_lib +from keras.saving.legacy import serialization as 
legacy_serialization +from keras.saving.legacy.saved_model import json_utils from keras.utils import generic_utils from keras.utils import tf_inspect as inspect + +# isort: off from tensorflow.python.util.tf_export import keras_export -ALL_MODULES = (base_layer, input_layer, activation, attention, convolutional, - core, locally_connected, merging, batch_normalization_v1, - layer_normalization, unit_normalization, pooling, - image_preprocessing, regularization, reshaping, rnn, hashing, - hashed_crossing, category_encoding, discretization, - integer_lookup, preprocessing_normalization, string_lookup, - text_vectorization) -ALL_V2_MODULES = (batch_normalization, layer_normalization, cell_wrappers, gru, - lstm) +ALL_MODULES = ( + base_layer, + input_layer, + activation, + attention, + convolutional, + core, + locally_connected, + merging, + batch_normalization_v1, + group_normalization, + layer_normalization, + unit_normalization, + pooling, + image_preprocessing, + regularization, + reshaping, + rnn, + hashing, + hashed_crossing, + category_encoding, + discretization, + integer_lookup, + preprocessing_normalization, + string_lookup, + text_vectorization, +) +ALL_V2_MODULES = ( + batch_normalization, + layer_normalization, + cell_wrappers, + gru, + lstm, +) # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() def populate_deserializable_objects(): - """Populates dict ALL_OBJECTS with every built-in layer.""" - global LOCAL - if not hasattr(LOCAL, 'ALL_OBJECTS'): - LOCAL.ALL_OBJECTS = {} - LOCAL.GENERATED_WITH_V2 = None - - if LOCAL.ALL_OBJECTS and LOCAL.GENERATED_WITH_V2 == tf.__internal__.tf2.enabled( - ): - # Objects dict is already generated for the proper TF version: - # do nothing. - return + """Populates dict ALL_OBJECTS with every built-in layer.""" + global LOCAL + if not hasattr(LOCAL, "ALL_OBJECTS"): + LOCAL.ALL_OBJECTS = {} + LOCAL.GENERATED_WITH_V2 = None - LOCAL.ALL_OBJECTS = {} - LOCAL.GENERATED_WITH_V2 = tf.__internal__.tf2.enabled() + if ( + LOCAL.ALL_OBJECTS + and LOCAL.GENERATED_WITH_V2 == tf.__internal__.tf2.enabled() + ): + # Objects dict is already generated for the proper TF version: + # do nothing. + return - base_cls = base_layer.Layer - generic_utils.populate_dict_with_module_objects( - LOCAL.ALL_OBJECTS, - ALL_MODULES, - obj_filter=lambda x: inspect.isclass(x) and issubclass(x, base_cls)) + LOCAL.ALL_OBJECTS = {} + LOCAL.GENERATED_WITH_V2 = tf.__internal__.tf2.enabled() - # Overwrite certain V1 objects with V2 versions - if tf.__internal__.tf2.enabled(): + base_cls = base_layer.Layer generic_utils.populate_dict_with_module_objects( LOCAL.ALL_OBJECTS, - ALL_V2_MODULES, - obj_filter=lambda x: inspect.isclass(x) and issubclass(x, base_cls)) - - # These deserialization aliases are added for backward compatibility, - # as in TF 1.13, "BatchNormalizationV1" and "BatchNormalizationV2" - # were used as class name for v1 and v2 version of BatchNormalization, - # respectively. Here we explicitly convert them to their canonical names. - LOCAL.ALL_OBJECTS[ - 'BatchNormalizationV1'] = batch_normalization_v1.BatchNormalization - LOCAL.ALL_OBJECTS[ - 'BatchNormalizationV2'] = batch_normalization.BatchNormalization - - # Prevent circular dependencies. 
- from keras import models # pylint: disable=g-import-not-at-top - from keras.premade_models.linear import LinearModel # pylint: disable=g-import-not-at-top - from keras.premade_models.wide_deep import WideDeepModel # pylint: disable=g-import-not-at-top - from keras.feature_column.sequence_feature_column import SequenceFeatures # pylint: disable=g-import-not-at-top - - LOCAL.ALL_OBJECTS['Input'] = input_layer.Input - LOCAL.ALL_OBJECTS['InputSpec'] = input_spec.InputSpec - LOCAL.ALL_OBJECTS['Functional'] = models.Functional - LOCAL.ALL_OBJECTS['Model'] = models.Model - LOCAL.ALL_OBJECTS['SequenceFeatures'] = SequenceFeatures - LOCAL.ALL_OBJECTS['Sequential'] = models.Sequential - LOCAL.ALL_OBJECTS['LinearModel'] = LinearModel - LOCAL.ALL_OBJECTS['WideDeepModel'] = WideDeepModel - - if tf.__internal__.tf2.enabled(): - from keras.feature_column.dense_features_v2 import DenseFeatures # pylint: disable=g-import-not-at-top - LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures - else: - from keras.feature_column.dense_features import DenseFeatures # pylint: disable=g-import-not-at-top - LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures - - # Merging layers, function versions. - LOCAL.ALL_OBJECTS['add'] = merging.add - LOCAL.ALL_OBJECTS['subtract'] = merging.subtract - LOCAL.ALL_OBJECTS['multiply'] = merging.multiply - LOCAL.ALL_OBJECTS['average'] = merging.average - LOCAL.ALL_OBJECTS['maximum'] = merging.maximum - LOCAL.ALL_OBJECTS['minimum'] = merging.minimum - LOCAL.ALL_OBJECTS['concatenate'] = merging.concatenate - LOCAL.ALL_OBJECTS['dot'] = merging.dot - - -@keras_export('keras.layers.serialize') -def serialize(layer): - """Serializes a `Layer` object into a JSON-compatible representation. - - Args: - layer: The `Layer` object to serialize. - - Returns: - A JSON-serializable dict representing the object's config. - - Example: - - ```python - from pprint import pprint - model = tf.keras.models.Sequential() - model.add(tf.keras.Input(shape=(16,))) - model.add(tf.keras.layers.Dense(32, activation='relu')) - - pprint(tf.keras.layers.serialize(model)) - # prints the configuration of the model, as a dict. - """ - return generic_utils.serialize_keras_object(layer) - - -@keras_export('keras.layers.deserialize') -def deserialize(config, custom_objects=None): - """Instantiates a layer from a config dictionary. - - Args: - config: dict of the form {'class_name': str, 'config': dict} - custom_objects: dict mapping class names (or function names) of custom - (non-Keras) objects to class/functions - - Returns: - Layer instance (may be Model, Sequential, Network, Layer...) 
- - Example: - - ```python - # Configuration of Dense(32, activation='relu') - config = { - 'class_name': 'Dense', - 'config': { - 'activation': 'relu', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': {'class_name': 'GlorotUniform', - 'config': {'seed': None}}, - 'kernel_regularizer': None, - 'name': 'dense', - 'trainable': True, - 'units': 32, - 'use_bias': True + ALL_MODULES, + obj_filter=lambda x: inspect.isclass(x) and issubclass(x, base_cls), + ) + + # Overwrite certain V1 objects with V2 versions + if tf.__internal__.tf2.enabled(): + generic_utils.populate_dict_with_module_objects( + LOCAL.ALL_OBJECTS, + ALL_V2_MODULES, + obj_filter=lambda x: inspect.isclass(x) and issubclass(x, base_cls), + ) + + # These deserialization aliases are added for backward compatibility, + # as in TF 1.13, "BatchNormalizationV1" and "BatchNormalizationV2" + # were used as class name for v1 and v2 version of BatchNormalization, + # respectively. Here we explicitly convert them to their canonical names. + LOCAL.ALL_OBJECTS[ + "BatchNormalizationV1" + ] = batch_normalization_v1.BatchNormalization + LOCAL.ALL_OBJECTS[ + "BatchNormalizationV2" + ] = batch_normalization.BatchNormalization + + # Prevent circular dependencies. + from keras import models + from keras.feature_column.sequence_feature_column import ( + SequenceFeatures, + ) + from keras.premade_models.linear import ( + LinearModel, + ) + from keras.premade_models.wide_deep import ( + WideDeepModel, + ) + + LOCAL.ALL_OBJECTS["Input"] = input_layer.Input + LOCAL.ALL_OBJECTS["InputSpec"] = input_spec.InputSpec + LOCAL.ALL_OBJECTS["Functional"] = models.Functional + LOCAL.ALL_OBJECTS["Model"] = models.Model + LOCAL.ALL_OBJECTS["SequenceFeatures"] = SequenceFeatures + LOCAL.ALL_OBJECTS["Sequential"] = models.Sequential + LOCAL.ALL_OBJECTS["LinearModel"] = LinearModel + LOCAL.ALL_OBJECTS["WideDeepModel"] = WideDeepModel + + if tf.__internal__.tf2.enabled(): + from keras.feature_column.dense_features_v2 import ( + DenseFeatures, + ) + + LOCAL.ALL_OBJECTS["DenseFeatures"] = DenseFeatures + else: + from keras.feature_column.dense_features import ( + DenseFeatures, + ) + + LOCAL.ALL_OBJECTS["DenseFeatures"] = DenseFeatures + + # Merging layers, function versions. + LOCAL.ALL_OBJECTS["add"] = merging.add + LOCAL.ALL_OBJECTS["subtract"] = merging.subtract + LOCAL.ALL_OBJECTS["multiply"] = merging.multiply + LOCAL.ALL_OBJECTS["average"] = merging.average + LOCAL.ALL_OBJECTS["maximum"] = merging.maximum + LOCAL.ALL_OBJECTS["minimum"] = merging.minimum + LOCAL.ALL_OBJECTS["concatenate"] = merging.concatenate + LOCAL.ALL_OBJECTS["dot"] = merging.dot + + +@keras_export("keras.layers.serialize") +def serialize(layer, use_legacy_format=False): + """Serializes a `Layer` object into a JSON-compatible representation. + + Args: + layer: The `Layer` object to serialize. + + Returns: + A JSON-serializable dict representing the object's config. + + Example: + + ```python + from pprint import pprint + model = tf.keras.models.Sequential() + model.add(tf.keras.Input(shape=(16,))) + model.add(tf.keras.layers.Dense(32, activation='relu')) + + pprint(tf.keras.layers.serialize(model)) + # prints the configuration of the model, as a dict. + ``` + """ + if isinstance(layer, base_metric.Metric): + raise ValueError( + f"Cannot serialize {layer} since it is a metric. 
" + "Please use the `keras.metrics.serialize()` and " + "`keras.metrics.deserialize()` APIs to serialize " + "and deserialize metrics." + ) + if use_legacy_format: + return legacy_serialization.serialize_keras_object(layer) + + return serialization_lib.serialize_keras_object(layer) + + +@keras_export("keras.layers.deserialize") +def deserialize(config, custom_objects=None, use_legacy_format=False): + """Instantiates a layer from a config dictionary. + + Args: + config: dict of the form {'class_name': str, 'config': dict} + custom_objects: dict mapping class names (or function names) of custom + (non-Keras) objects to class/functions + + Returns: + Layer instance (may be Model, Sequential, Network, Layer...) + + Example: + + ```python + # Configuration of Dense(32, activation='relu') + config = { + 'class_name': 'Dense', + 'config': { + 'activation': 'relu', + 'activity_regularizer': None, + 'bias_constraint': None, + 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, + 'bias_regularizer': None, + 'dtype': 'float32', + 'kernel_constraint': None, + 'kernel_initializer': {'class_name': 'GlorotUniform', + 'config': {'seed': None}}, + 'kernel_regularizer': None, + 'name': 'dense', + 'trainable': True, + 'units': 32, + 'use_bias': True + } } - } - dense_layer = tf.keras.layers.deserialize(config) - ``` - """ - populate_deserializable_objects() - return generic_utils.deserialize_keras_object( - config, - module_objects=LOCAL.ALL_OBJECTS, - custom_objects=custom_objects, - printable_module_name='layer') + dense_layer = tf.keras.layers.deserialize(config) + ``` + """ + populate_deserializable_objects() + if not config: + raise ValueError( + f"Cannot deserialize empty config. Received: config={config}" + ) + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + config, + module_objects=LOCAL.ALL_OBJECTS, + custom_objects=custom_objects, + printable_module_name="layer", + ) + + return serialization_lib.deserialize_keras_object( + config, + module_objects=LOCAL.ALL_OBJECTS, + custom_objects=custom_objects, + printable_module_name="layer", + ) def get_builtin_layer(class_name): - """Returns class if `class_name` is registered, else returns None.""" - if not hasattr(LOCAL, 'ALL_OBJECTS'): - populate_deserializable_objects() - return LOCAL.ALL_OBJECTS.get(class_name) + """Returns class if `class_name` is registered, else returns None.""" + if not hasattr(LOCAL, "ALL_OBJECTS"): + populate_deserializable_objects() + return LOCAL.ALL_OBJECTS.get(class_name) def deserialize_from_json(json_string, custom_objects=None): - """Instantiates a layer from a JSON string.""" - populate_deserializable_objects() - config = json_utils.decode_and_deserialize( - json_string, - module_objects=LOCAL.ALL_OBJECTS, - custom_objects=custom_objects) - return deserialize(config, custom_objects) + """Instantiates a layer from a JSON string.""" + populate_deserializable_objects() + config = json_utils.decode_and_deserialize( + json_string, + module_objects=LOCAL.ALL_OBJECTS, + custom_objects=custom_objects, + ) + return deserialize(config, custom_objects) diff --git a/keras/layers/serialization_test.py b/keras/layers/serialization_test.py index e71ebd5ead20..688466be0b74 100644 --- a/keras/layers/serialization_test.py +++ b/keras/layers/serialization_test.py @@ -15,154 +15,188 @@ """Tests for layer serialization utils.""" import tensorflow.compat.v2 as tf - from absl.testing import parameterized import keras -from keras.testing_infra import test_combinations +from keras.layers.normalization import 
batch_normalization as batchnorm_v2 +from keras.layers.normalization import batch_normalization_v1 as batchnorm_v1 from keras.layers.rnn import gru from keras.layers.rnn import gru_v1 from keras.layers.rnn import lstm from keras.layers.rnn import lstm_v1 -from keras.layers.normalization import batch_normalization as batchnorm_v2 -from keras.layers.normalization import batch_normalization_v1 as batchnorm_v1 +from keras.metrics import Mean +from keras.testing_infra import test_combinations class SerializableInt(int): + def __new__(cls, value): + return int.__new__(cls, value) - def __new__(cls, value): - return int.__new__(cls, value) + def get_config(self): + return {"value": int(self)} - def get_config(self): - return {'value': int(self)} + @classmethod + def from_config(cls, config): + return cls(**config) - @classmethod - def from_config(cls, config): - return cls(**config) - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LayerSerializationTest(parameterized.TestCase, tf.test.TestCase): - - def test_serialize_deserialize(self): - layer = keras.layers.Dense( - 3, activation='relu', kernel_initializer='ones', bias_regularizer='l2') - config = keras.layers.serialize(layer) - new_layer = keras.layers.deserialize(config) - self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) - if tf.__internal__.tf2.enabled(): - self.assertEqual(new_layer.kernel_initializer.__class__, - keras.initializers.OnesV2) - else: - self.assertEqual(new_layer.kernel_initializer.__class__, - keras.initializers.Ones) - self.assertEqual(new_layer.units, 3) - - def test_implicit_serialize_deserialize_fails_without_object(self): - layer = keras.layers.Dense( - SerializableInt(3), - activation='relu', - kernel_initializer='ones', - bias_regularizer='l2') - config = keras.layers.serialize(layer) - # Because we're passing an unknown class here, deserialization should fail - # unless we add SerializableInt to the custom object dict. - with self.assertRaisesRegex(ValueError, - 'Unknown config_item: SerializableInt.*'): - _ = keras.layers.deserialize(config) - - def test_implicit_serialize_deserialize_succeeds_with_object(self): - layer = keras.layers.Dense( - SerializableInt(3), - activation='relu', - kernel_initializer='ones', - bias_regularizer='l2') - config = keras.layers.serialize(layer) - # Because we're passing an unknown class here, deserialization should fail - # unless we add SerializableInt to the custom object dict. 
- new_layer = keras.layers.deserialize( - config, custom_objects={'SerializableInt': SerializableInt}) - self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) - if tf.__internal__.tf2.enabled(): - self.assertEqual(new_layer.kernel_initializer.__class__, - keras.initializers.OnesV2) - else: - self.assertEqual(new_layer.kernel_initializer.__class__, - keras.initializers.Ones) - self.assertEqual(new_layer.units.__class__, SerializableInt) - self.assertEqual(new_layer.units, 3) - - @parameterized.parameters( - [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization]) - def test_serialize_deserialize_batchnorm(self, batchnorm_layer): - layer = batchnorm_layer( - momentum=0.9, beta_initializer='zeros', gamma_regularizer='l2') - config = keras.layers.serialize(layer) - self.assertEqual(config['class_name'], 'BatchNormalization') - new_layer = keras.layers.deserialize(config) - self.assertEqual(new_layer.momentum, 0.9) - if tf.__internal__.tf2.enabled(): - self.assertIsInstance(new_layer, batchnorm_v2.BatchNormalization) - self.assertEqual(new_layer.beta_initializer.__class__, - keras.initializers.ZerosV2) - else: - self.assertIsInstance(new_layer, batchnorm_v1.BatchNormalization) - self.assertEqual(new_layer.beta_initializer.__class__, - keras.initializers.Zeros) - self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L2) - - @parameterized.parameters( - [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization]) - def test_deserialize_batchnorm_backwards_compatibility(self, batchnorm_layer): - layer = batchnorm_layer( - momentum=0.9, beta_initializer='zeros', gamma_regularizer='l2') - config = keras.layers.serialize(layer) - new_layer = keras.layers.deserialize(config) - self.assertEqual(new_layer.momentum, 0.9) - if tf.__internal__.tf2.enabled(): - self.assertIsInstance(new_layer, batchnorm_v2.BatchNormalization) - self.assertEqual(new_layer.beta_initializer.__class__, - keras.initializers.ZerosV2) - else: - self.assertIsInstance(new_layer, batchnorm_v1.BatchNormalization) - self.assertEqual(new_layer.beta_initializer.__class__, - keras.initializers.Zeros) - self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L2) - - @parameterized.parameters([lstm_v1.LSTM, lstm.LSTM]) - def test_serialize_deserialize_lstm(self, layer): - lstm_layer = layer(5, return_sequences=True) - config = keras.layers.serialize(lstm_layer) - self.assertEqual(config['class_name'], 'LSTM') - new_layer = keras.layers.deserialize(config) - self.assertEqual(new_layer.units, 5) - self.assertEqual(new_layer.return_sequences, True) - if tf.__internal__.tf2.enabled(): - self.assertIsInstance(new_layer, lstm.LSTM) - else: - self.assertIsInstance(new_layer, lstm_v1.LSTM) - self.assertNotIsInstance(new_layer, lstm.LSTM) - - @parameterized.parameters([gru_v1.GRU, gru.GRU]) - def test_serialize_deserialize_gru(self, layer): - gru_layer = layer(5, return_sequences=True) - config = keras.layers.serialize(gru_layer) - self.assertEqual(config['class_name'], 'GRU') - new_layer = keras.layers.deserialize(config) - self.assertEqual(new_layer.units, 5) - self.assertEqual(new_layer.return_sequences, True) - if tf.__internal__.tf2.enabled(): - self.assertIsInstance(new_layer, gru.GRU) - else: - self.assertIsInstance(new_layer, gru_v1.GRU) - self.assertNotIsInstance(new_layer, gru.GRU) - - -if __name__ == '__main__': - tf.test.main() + def test_serialize_deserialize(self): + layer = 
keras.layers.Dense( + 3, + activation="relu", + kernel_initializer="ones", + bias_regularizer="l2", + ) + config = keras.layers.serialize(layer) + new_layer = keras.layers.deserialize(config) + self.assertEqual(new_layer.activation, keras.activations.relu) + self.assertEqual( + new_layer.bias_regularizer.__class__, keras.regularizers.L2 + ) + if tf.__internal__.tf2.enabled(): + self.assertEqual( + new_layer.kernel_initializer.__class__, + keras.initializers.OnesV2, + ) + else: + self.assertEqual( + new_layer.kernel_initializer.__class__, keras.initializers.Ones + ) + self.assertEqual(new_layer.units, 3) + + def test_implicit_serialize_deserialize_fails_without_object(self): + # After discussion (rchao, nkovela) decided to exclude from new saving + if tf.__internal__.tf2.enabled(): + self.skipTest("Test excluded from new saving format.") + layer = keras.layers.Dense( + SerializableInt(3), + activation="relu", + kernel_initializer="ones", + bias_regularizer="l2", + ) + config = keras.layers.serialize(layer) + # Because we're passing an unknown class here, deserialization should + # fail unless we add SerializableInt to the custom object dict. + with self.assertRaisesRegex( + ValueError, "Unknown config_item: 'SerializableInt.*" + ): + _ = keras.layers.deserialize(config) + + def test_implicit_serialize_deserialize_succeeds_with_object(self): + layer = keras.layers.Dense( + SerializableInt(3), + activation="relu", + kernel_initializer="ones", + bias_regularizer="l2", + ) + config = keras.layers.serialize(layer) + # Because we're passing an unknown class here, deserialization should + # fail unless we add SerializableInt to the custom object dict. + new_layer = keras.layers.deserialize( + config, custom_objects={"SerializableInt": SerializableInt} + ) + self.assertEqual(new_layer.activation, keras.activations.relu) + self.assertEqual( + new_layer.bias_regularizer.__class__, keras.regularizers.L2 + ) + if tf.__internal__.tf2.enabled(): + self.assertEqual( + new_layer.kernel_initializer.__class__, + keras.initializers.OnesV2, + ) + else: + self.assertEqual( + new_layer.kernel_initializer.__class__, keras.initializers.Ones + ) + self.assertEqual(new_layer.units.__class__, SerializableInt) + self.assertEqual(new_layer.units, 3) + + @parameterized.parameters( + [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization] + ) + def test_serialize_deserialize_batchnorm(self, batchnorm_layer): + layer = batchnorm_layer( + momentum=0.9, beta_initializer="zeros", gamma_regularizer="l2" + ) + config = keras.layers.serialize(layer) + self.assertEqual(config["class_name"], "BatchNormalization") + new_layer = keras.layers.deserialize(config) + self.assertEqual(new_layer.momentum, 0.9) + if tf.__internal__.tf2.enabled(): + self.assertIsInstance(new_layer, batchnorm_v2.BatchNormalization) + self.assertEqual( + new_layer.beta_initializer.__class__, keras.initializers.ZerosV2 + ) + else: + self.assertIsInstance(new_layer, batchnorm_v1.BatchNormalization) + self.assertEqual( + new_layer.beta_initializer.__class__, keras.initializers.Zeros + ) + self.assertEqual( + new_layer.gamma_regularizer.__class__, keras.regularizers.L2 + ) + + @parameterized.parameters( + [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization] + ) + def test_deserialize_batchnorm_backwards_compatibility( + self, batchnorm_layer + ): + layer = batchnorm_layer( + momentum=0.9, beta_initializer="zeros", gamma_regularizer="l2" + ) + config = keras.layers.serialize(layer) + new_layer = keras.layers.deserialize(config) + 
self.assertEqual(new_layer.momentum, 0.9) + if tf.__internal__.tf2.enabled(): + self.assertIsInstance(new_layer, batchnorm_v2.BatchNormalization) + self.assertEqual( + new_layer.beta_initializer.__class__, keras.initializers.ZerosV2 + ) + else: + self.assertIsInstance(new_layer, batchnorm_v1.BatchNormalization) + self.assertEqual( + new_layer.beta_initializer.__class__, keras.initializers.Zeros + ) + self.assertEqual( + new_layer.gamma_regularizer.__class__, keras.regularizers.L2 + ) + + @parameterized.parameters([lstm_v1.LSTM, lstm.LSTM]) + def test_serialize_deserialize_lstm(self, layer): + lstm_layer = layer(5, return_sequences=True) + config = keras.layers.serialize(lstm_layer) + self.assertEqual(config["class_name"], "LSTM") + new_layer = keras.layers.deserialize(config) + self.assertEqual(new_layer.units, 5) + self.assertEqual(new_layer.return_sequences, True) + if tf.__internal__.tf2.enabled(): + self.assertIsInstance(new_layer, lstm.LSTM) + else: + self.assertIsInstance(new_layer, lstm_v1.LSTM) + self.assertNotIsInstance(new_layer, lstm.LSTM) + + @parameterized.parameters([gru_v1.GRU, gru.GRU]) + def test_serialize_deserialize_gru(self, layer): + gru_layer = layer(5, return_sequences=True) + config = keras.layers.serialize(gru_layer) + self.assertEqual(config["class_name"], "GRU") + new_layer = keras.layers.deserialize(config) + self.assertEqual(new_layer.units, 5) + self.assertEqual(new_layer.return_sequences, True) + if tf.__internal__.tf2.enabled(): + self.assertIsInstance(new_layer, gru.GRU) + else: + self.assertIsInstance(new_layer, gru_v1.GRU) + self.assertNotIsInstance(new_layer, gru.GRU) + + def test_serialize_metric_throws_error(self): + metric = Mean() + with self.assertRaisesRegex(ValueError, "since it is a metric."): + _ = keras.layers.serialize(metric) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/subclassed_layers_test.py b/keras/layers/subclassed_layers_test.py index 3adfa04d1e8a..de4ebeacaa1c 100644 --- a/keras/layers/subclassed_layers_test.py +++ b/keras/layers/subclassed_layers_test.py @@ -25,52 +25,53 @@ @test_combinations.run_all_keras_modes @test_combinations.run_with_all_model_types class SubclassedLayersTest(test_combinations.TestCase): - - def test_simple_build_with_constant(self): - - class BuildConstantLayer(keras.layers.Layer): - - def build(self, input_shape): - self.b = tf.convert_to_tensor(2.0) - - def call(self, inputs): - return self.b * inputs - - layer = BuildConstantLayer() - model = test_utils.get_model_from_layers( - [layer, keras.layers.Dense(1)], input_shape=(1,)) - - x = tf.convert_to_tensor([[3.0]]) - self.assertEqual( - tf_utils.is_symbolic_tensor(model(x)), not tf.executing_eagerly()) - self.assertEqual( - tf_utils.is_symbolic_tensor(layer(x)), not tf.executing_eagerly()) - self.assertAllClose(keras.backend.get_value(layer(x)), [[6.0]]) - - def test_build_with_derived_constant(self): - - class BuildDerivedConstantLayer(keras.layers.Layer): - - def build(self, input_shape): - a = tf.convert_to_tensor(1.0) - b = 2.0 * a - self.variable = tf.Variable(b) - self.constant = tf.convert_to_tensor(self.variable) - - def call(self, inputs): - return self.variable * self.constant * inputs - - layer = BuildDerivedConstantLayer() - model = test_utils.get_model_from_layers( - [layer, keras.layers.Dense(1)], input_shape=(1,)) - - x = tf.convert_to_tensor([[3.0]]) - self.assertEqual( - tf_utils.is_symbolic_tensor(model(x)), not tf.executing_eagerly()) - self.assertEqual( - tf_utils.is_symbolic_tensor(layer(x)), not 
tf.executing_eagerly()) - self.assertAllClose(keras.backend.get_value(layer(x)), [[12.0]]) - - -if __name__ == '__main__': - tf.test.main() + def test_simple_build_with_constant(self): + class BuildConstantLayer(keras.layers.Layer): + def build(self, input_shape): + self.b = tf.convert_to_tensor(2.0) + + def call(self, inputs): + return self.b * inputs + + layer = BuildConstantLayer() + model = test_utils.get_model_from_layers( + [layer, keras.layers.Dense(1)], input_shape=(1,) + ) + + x = tf.convert_to_tensor([[3.0]]) + self.assertEqual( + tf_utils.is_symbolic_tensor(model(x)), not tf.executing_eagerly() + ) + self.assertEqual( + tf_utils.is_symbolic_tensor(layer(x)), not tf.executing_eagerly() + ) + self.assertAllClose(keras.backend.get_value(layer(x)), [[6.0]]) + + def test_build_with_derived_constant(self): + class BuildDerivedConstantLayer(keras.layers.Layer): + def build(self, input_shape): + a = tf.convert_to_tensor(1.0) + b = 2.0 * a + self.variable = tf.Variable(b) + self.constant = tf.convert_to_tensor(self.variable) + + def call(self, inputs): + return self.variable * self.constant * inputs + + layer = BuildDerivedConstantLayer() + model = test_utils.get_model_from_layers( + [layer, keras.layers.Dense(1)], input_shape=(1,) + ) + + x = tf.convert_to_tensor([[3.0]]) + self.assertEqual( + tf_utils.is_symbolic_tensor(model(x)), not tf.executing_eagerly() + ) + self.assertEqual( + tf_utils.is_symbolic_tensor(layer(x)), not tf.executing_eagerly() + ) + self.assertAllClose(keras.backend.get_value(layer(x)), [[12.0]]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/layers/tensorflow_op_layer_test.py b/keras/layers/tensorflow_op_layer_test.py index a42da122c6d3..6c0173c14bad 100644 --- a/keras/layers/tensorflow_op_layer_test.py +++ b/keras/layers/tensorflow_op_layer_test.py @@ -14,736 +14,761 @@ # ============================================================================== """Test for allowing TF ops to work with Keras Functional API.""" -import tensorflow.compat.v2 as tf - import time -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras +from keras.engine import keras_tensor +from keras.optimizers.legacy import adam +from keras.saving.legacy import model_config from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.engine import keras_tensor -from keras.optimizers.optimizer_v2 import adam -from keras.saving import model_config def _single_op_at_end(): - inputs = keras.Input(shape=(10,)) - x = keras.layers.Dense(10)(inputs) - outputs = tf.nn.relu(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = keras.layers.Dense(10)(inputs) + outputs = tf.nn.relu(x) + return keras.Model(inputs, outputs) def _single_identity_op_at_end(): - inputs = keras.Input(shape=(10,)) - x = keras.layers.Dense(10)(inputs) - outputs = tf.identity(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = keras.layers.Dense(10)(inputs) + outputs = tf.identity(x) + return keras.Model(inputs, outputs) def _multiple_ops_at_end(): - inputs = keras.Input(shape=(10,)) - x = keras.layers.Dense(10)(inputs) - x = tf.nn.relu(x) - outputs = tf.nn.relu(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = keras.layers.Dense(10)(inputs) + x = tf.nn.relu(x) + outputs = tf.nn.relu(x) + return keras.Model(inputs, outputs) def _single_op_in_middle(): - inputs = keras.Input(shape=(10,)) - 
x = keras.layers.Dense(10)(inputs) - x = tf.nn.relu(x) - outputs = keras.layers.Dense(10)(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = keras.layers.Dense(10)(inputs) + x = tf.nn.relu(x) + outputs = keras.layers.Dense(10)(x) + return keras.Model(inputs, outputs) def _multiple_ops_in_middle(): - inputs = keras.Input(shape=(10,)) - x = keras.layers.Dense(10)(inputs) - x = tf.nn.relu(x) - x = tf.nn.relu(x) - outputs = keras.layers.Dense(10)(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = keras.layers.Dense(10)(inputs) + x = tf.nn.relu(x) + x = tf.nn.relu(x) + outputs = keras.layers.Dense(10)(x) + return keras.Model(inputs, outputs) def _shape_op_inference(): - inputs = keras.Input(shape=(10,)) - x = tf.shape(inputs) - x = tf.ones(x) - assert x.shape.as_list() == [None, 10] - outputs = keras.layers.Dense(10)(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = tf.shape(inputs) + x = tf.ones(x) + assert x.shape.as_list() == [None, 10] + outputs = keras.layers.Dense(10)(x) + return keras.Model(inputs, outputs) def _shape_op_known_batch_size(): - inputs = keras.Input(batch_size=2, shape=(10,)) - x = tf.shape(inputs) - x = tf.ones(x) - assert x.shape.as_list() == [2, 10] - outputs = keras.layers.Dense(10)(x) - if tf.executing_eagerly(): - return keras.Model(inputs, outputs) - else: - # In V1 the op layer fails for some reason, - # but we don't have access to the test case to call - # self.skip_test in this util method - return keras.Model(inputs, inputs) + inputs = keras.Input(batch_size=2, shape=(10,)) + x = tf.shape(inputs) + x = tf.ones(x) + assert x.shape.as_list() == [2, 10] + outputs = keras.layers.Dense(10)(x) + if tf.executing_eagerly(): + return keras.Model(inputs, outputs) + else: + # In V1 the op layer fails for some reason, + # but we don't have access to the test case to call + # self.skip_test in this util method + return keras.Model(inputs, inputs) def _shape_op_slice_and_range(): - inputs = keras.Input(shape=(10,)) - batch_size = tf.shape(inputs)[0] - x = tf.range(batch_size * 2) - assert x.shape.as_list() == [None] - x = tf.reshape(x, (batch_size, 2)) - x = tf.cast(x, dtype='float32') - outputs = keras.layers.Dense(10)(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + batch_size = tf.shape(inputs)[0] + x = tf.range(batch_size * 2) + assert x.shape.as_list() == [None] + x = tf.reshape(x, (batch_size, 2)) + x = tf.cast(x, dtype="float32") + outputs = keras.layers.Dense(10)(x) + return keras.Model(inputs, outputs) def _shape_op_slice_and_range_known_dim(): - inputs = keras.Input(batch_size=2, shape=(10,)) - batch_size = tf.shape(inputs)[0] - x = tf.range(batch_size * 3) - assert x.shape.as_list() == [6] - x = tf.reshape(x, (batch_size, 3)) - x = tf.cast(x, dtype='float32') - outputs = keras.layers.Dense(10)(x) - if tf.executing_eagerly(): - return keras.Model(inputs, outputs) - else: - # In V1 the op layer fails for some reason, - # but we don't have access to the test case to call - # self.skip_test in this util method - return keras.Model(inputs, inputs) + inputs = keras.Input(batch_size=2, shape=(10,)) + batch_size = tf.shape(inputs)[0] + x = tf.range(batch_size * 3) + assert x.shape.as_list() == [6] + x = tf.reshape(x, (batch_size, 3)) + x = tf.cast(x, dtype="float32") + outputs = keras.layers.Dense(10)(x) + if tf.executing_eagerly(): + return keras.Model(inputs, outputs) + else: + # In V1 the op layer fails for some reason, + # but we don't have 
access to the test case to call + # self.skip_test in this util method + return keras.Model(inputs, inputs) def _int32_manipulation_too_big_for_shape(): - # This test verifies that the Keras Functional API - # won't crash when manipulating int32 tensors that are too large - # to represent shapes. - inputs = keras.Input(batch_size=2, shape=(10,)) - batch_size = tf.shape(inputs)[0] - num_features = 3 * 1024 * 16 - x = tf.range(batch_size * num_features, dtype='int32') - assert x.shape.as_list() == [inputs.shape[0] * num_features] - x = tf.reshape(x, (batch_size, num_features)) - x = tf.cast(x, dtype='float32') - outputs = keras.layers.Dense(10)(x) - if tf.executing_eagerly(): - return keras.Model(inputs, outputs) - else: - # In V1 the op layer fails for some reason, - # but we don't have access to the test case to call - # self.skip_test in this util method - return keras.Model(inputs, inputs) + # This test verifies that the Keras Functional API + # won't crash when manipulating int32 tensors that are too large + # to represent shapes. + inputs = keras.Input(batch_size=2, shape=(10,)) + batch_size = tf.shape(inputs)[0] + num_features = 3 * 1024 * 16 + x = tf.range(batch_size * num_features, dtype="int32") + assert x.shape.as_list() == [inputs.shape[0] * num_features] + x = tf.reshape(x, (batch_size, num_features)) + x = tf.cast(x, dtype="float32") + outputs = keras.layers.Dense(10)(x) + if tf.executing_eagerly(): + return keras.Model(inputs, outputs) + else: + # In V1 the op layer fails for some reason, + # but we don't have access to the test case to call + # self.skip_test in this util method + return keras.Model(inputs, inputs) def _int32_manipulation_at_max_shape_dims_limit(): - # This test verifies that the Keras Functional API - # won't crash when manipulating int32 tensors that are at the limit - # of the max tensor size Keras can try inferring values for. - inputs = keras.Input(batch_size=2, shape=(10,)) - batch_size = tf.shape(inputs)[0] - num_features = int(keras_tensor._MAX_TENSOR_RANK / int(inputs.shape[0])) - x = tf.range(batch_size * num_features, dtype='int32') - assert x.shape.as_list() == [keras_tensor._MAX_TENSOR_RANK] - - # Verify that a value was actually inferred for a tensor that *might* - # represent the shape, bying checking that a value in - # the range appears in the printed inferred value - if tf.compat.v1.executing_eagerly_outside_functions(): - assert str(keras_tensor._MAX_TENSOR_RANK - 1) in str(x) - - x = tf.reshape(x, (batch_size, num_features)) - x = tf.cast(x, dtype='float32') - outputs = keras.layers.Dense(10)(x) - if tf.executing_eagerly(): - return keras.Model(inputs, outputs) - else: - # In V1 the op layer fails for some reason, - # but we don't have access to the test case to call - # self.skip_test in this util method - return keras.Model(inputs, inputs) + # This test verifies that the Keras Functional API + # won't crash when manipulating int32 tensors that are at the limit + # of the max tensor size Keras can try inferring values for. 
+ inputs = keras.Input(batch_size=2, shape=(10,)) + batch_size = tf.shape(inputs)[0] + num_features = int(keras_tensor._MAX_TENSOR_RANK / int(inputs.shape[0])) + x = tf.range(batch_size * num_features, dtype="int32") + assert x.shape.as_list() == [keras_tensor._MAX_TENSOR_RANK] + + # Verify that a value was actually inferred for a tensor that *might* + # represent the shape, by checking that a value in + # the range appears in the printed inferred value + if tf.compat.v1.executing_eagerly_outside_functions(): + assert str(keras_tensor._MAX_TENSOR_RANK - 1) in str(x) + + x = tf.reshape(x, (batch_size, num_features)) + x = tf.cast(x, dtype="float32") + outputs = keras.layers.Dense(10)(x) + if tf.executing_eagerly(): + return keras.Model(inputs, outputs) + else: + # In V1 the op layer fails for some reason, + # but we don't have access to the test case to call + # self.skip_test in this util method + return keras.Model(inputs, inputs) def _single_standalone_branch(): - inputs = keras.Input(shape=(10,)) - x = keras.layers.Dense(10)(inputs) - outputs = x * 2 - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = keras.layers.Dense(10)(inputs) + outputs = x * 2 + return keras.Model(inputs, outputs) def _single_op_with_attrs(): - inputs = keras.Input(shape=(10,)) - x = tf.reduce_mean(inputs, axis=1, keepdims=True) - outputs = keras.layers.Dense(10)(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = tf.reduce_mean(inputs, axis=1, keepdims=True) + outputs = keras.layers.Dense(10)(x) + return keras.Model(inputs, outputs) def _multiple_uses(): - inputs = keras.Input(shape=(10,)) - x = tf.reduce_mean(inputs, axis=1, keepdims=True) - x1 = keras.layers.Dense(10)(x) - x2 = keras.layers.Dense(10)(x) - outputs = x1 + x2 - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = tf.reduce_mean(inputs, axis=1, keepdims=True) + x1 = keras.layers.Dense(10)(x) + x2 = keras.layers.Dense(10)(x) + outputs = x1 + x2 + return keras.Model(inputs, outputs) def _op_with_tensor_list(): - inputs = keras.Input(shape=(10,)) - x = tf.concat([inputs, inputs], axis=1) - outputs = keras.layers.Dense(10)(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = tf.concat([inputs, inputs], axis=1) + outputs = keras.layers.Dense(10)(x) + return keras.Model(inputs, outputs) def _add_n(): - inputs = keras.Input(shape=(10,)) - outputs = tf.add_n([inputs, inputs, inputs]) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + outputs = tf.add_n([inputs, inputs, inputs]) + return keras.Model(inputs, outputs) def _reuse_op(): - inputs = keras.Input(shape=(10,)) - # This op needs to be checked multiple times.
+ x = tf.nn.relu(inputs) + y = keras.layers.Dense(10)(x) + x2 = x * 2 + y2 = keras.layers.Dense(10)(x2) + outputs = y + y2 + return keras.Model(inputs, outputs) def _float64_op(): - inputs = keras.Input(shape=(10,)) - x = keras.layers.Dense(10, dtype='float64')(inputs) - x = tf.nn.relu(x) - assert x.dtype == 'float64', 'x has dtype: %s' % x.dtype - outputs = keras.layers.Dense(10)(x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = keras.layers.Dense(10, dtype="float64")(inputs) + x = tf.nn.relu(x) + assert x.dtype == "float64", f"x has dtype: {x.dtype}" + outputs = keras.layers.Dense(10)(x) + return keras.Model(inputs, outputs) class MyAdd(keras.layers.Layer): - - def call(self, x, y): - return x + y + def call(self, x, y): + return x + y def _layer_with_tensor_arg(): - inputs = keras.Input(shape=(10,)) - x = inputs * 2 - outputs = MyAdd()(inputs, x) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + x = inputs * 2 + outputs = MyAdd()(inputs, x) + return keras.Model(inputs, outputs) class LayerWithLayer(keras.layers.Layer): + def build(self, input_shape): + self.bias = self.add_weight(name="bias", dtype="float32") + self.layer = keras.layers.Dense(10) - def build(self, input_shape): - self.bias = self.add_weight(name='bias', dtype='float32') - self.layer = keras.layers.Dense(10) - - def call(self, inputs): - inputs = inputs * self.bias - # Would throw an error if Keras History was created here. - return self.layer(inputs) + def call(self, inputs): + inputs = inputs * self.bias + # Would throw an error if Keras History was created here. + return self.layer(inputs) def _inner_layer(): - inputs = keras.Input(shape=(10,)) - outputs = LayerWithLayer()(inputs) - return keras.Model(inputs, outputs) + inputs = keras.Input(shape=(10,)) + outputs = LayerWithLayer()(inputs) + return keras.Model(inputs, outputs) def _reuse_ancillary_layer(): - inputs = (keras.Input(shape=(5,)), keras.Input(shape=(5,))) - base_model = keras.Sequential([ - keras.layers.Dense(3, input_shape=(5,)), - ]) - outputs = base_model(inputs[0]) - model = keras.Model(inputs, outputs) - # The second input is only involved in ancillary layers. - outputs_delta = outputs - base_model(0.5 * inputs[1]) - l2_loss = tf.reduce_mean( - tf.reduce_sum(tf.square(outputs_delta), -1)) - model.add_loss(l2_loss) - model.add_metric(l2_loss, aggregation='mean', name='l2_loss') - l1_loss = 0.01 * tf.reduce_mean( - tf.reduce_sum(tf.abs(outputs_delta), -1)) - model.add_loss(l1_loss) - model.add_metric(l1_loss, aggregation='mean', name='l1_loss') - return model + inputs = (keras.Input(shape=(5,)), keras.Input(shape=(5,))) + base_model = keras.Sequential( + [ + keras.layers.Dense(3, input_shape=(5,)), + ] + ) + outputs = base_model(inputs[0]) + model = keras.Model(inputs, outputs) + # The second input is only involved in ancillary layers. 
+ outputs_delta = outputs - base_model(0.5 * inputs[1]) + l2_loss = tf.reduce_mean(tf.reduce_sum(tf.square(outputs_delta), -1)) + model.add_loss(l2_loss) + model.add_metric(l2_loss, aggregation="mean", name="l2_loss") + l1_loss = 0.01 * tf.reduce_mean(tf.reduce_sum(tf.abs(outputs_delta), -1)) + model.add_loss(l1_loss) + model.add_metric(l1_loss, aggregation="mean", name="l1_loss") + return model @test_combinations.run_all_keras_modes() class AutoLambdaTest(test_combinations.TestCase): - - @parameterized.named_parameters( - ('single_op_at_end', _single_op_at_end), - ('single_identity_op_at_end', _single_identity_op_at_end), - ('multiple_ops_at_end', _multiple_ops_at_end), - ('single_op_in_middle', _single_op_in_middle), - ('multiple_ops_in_middle', _multiple_ops_in_middle), - ('shape_op_inference', _shape_op_inference), - ('shape_op_known_batch_size', _shape_op_known_batch_size), - ('shape_op_slice_and_range', _shape_op_slice_and_range), - ('shape_op_slice_and_range_known_dim', - _shape_op_slice_and_range_known_dim), - ('int32_manipulation_too_big_for_shape', - _int32_manipulation_too_big_for_shape), - ('int32_manipulation_at_max_shape_dims_limit', - _int32_manipulation_at_max_shape_dims_limit), - ('single_standalone_branch', _single_standalone_branch), - ('single_op_with_attrs', _single_op_with_attrs), - ('multiple_uses', _multiple_uses), - ('op_with_tensor_list', _op_with_tensor_list), - ('add_n', _add_n), - ('_reuse_op', _reuse_op), - ('_float64_op', _float64_op), - ('_inner_layer', _inner_layer), - ('_reuse_ancillary_layer', _reuse_ancillary_layer), - ('_layer_with_tensor_arg', _layer_with_tensor_arg), - ) - def test_autolambda(self, model_fn): - model = model_fn() - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - np_inputs = tf.nest.map_structure( - lambda x: np.ones((2,) + tuple(x.shape[1:]), 'float32'), model.inputs) - np_outputs = tf.nest.map_structure( - lambda x: np.ones((2,) + tuple(x.shape[1:]), 'float32'), model.outputs) - model.fit(np_inputs, np_outputs, batch_size=2) - model(np_inputs) # Test calling the model directly on inputs. - - new_model = keras.Model.from_config( - model.get_config(), - custom_objects={ - 'LayerWithLayer': LayerWithLayer, - 'MyAdd': MyAdd - }) - new_model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - new_model.fit(np_inputs, np_outputs, batch_size=2) - new_model(np_inputs) # Test calling the new model directly on inputs. - # Assert that metrics are preserved and in the right order. - self.assertAllEqual(model.metrics_names, new_model.metrics_names) - # Assert that layer names don't change. 
- self.assertAllEqual([layer.name for layer in model.layers], - [layer.name for layer in new_model.layers]) - - def test_stack_preserves_correct_shape(self): - ## Test stack([x]) - inp = keras.Input(shape=(), dtype='float32') - - out = tf.stack([inp]) - model = keras.Model( - inputs=inp, - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - x = tf.ones(shape=(4, 4)) - expected = tf.stack([x]) - self.assertAllEqual(expected.shape, (1, 4, 4)) - - self.assertAllEqual(model(x).shape, (1, 4, 4)) - self.assertAllEqual(model(x), expected) - - config = model.get_config() - model = keras.Model.from_config(config) - - self.assertAllEqual(model(x).shape, (1, 4, 4)) - self.assertAllEqual(model(x), expected) - - ## Test stack(x) - inp = keras.Input(shape=(), dtype='float32') - - out = tf.stack(inp) - model = keras.Model( - inputs=inp, - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - x = tf.ones(shape=(4, 4)) - expected = tf.stack(x) - self.assertAllEqual(expected.shape, (4, 4)) - - self.assertAllEqual(model(x).shape, (4, 4)) - self.assertAllEqual(model(x), expected) - - config = model.get_config() - model = keras.Model.from_config(config) - - self.assertAllEqual(model(x).shape, (4, 4)) - self.assertAllEqual(model(x), expected) - - def test_getitem_slice_with_step_only(self): - if not tf.executing_eagerly(): - self.skipTest('Complex slicing like this fails in v1') - inp = keras.Input(shape=(8,)) - slice_step = keras.Input(shape=(), dtype='int32') - - out = inp[..., ::slice_step[0]] - model = keras.Model( - inputs=[inp, slice_step], - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - batch_size = 7 - step = 3 - x = tf.stack([ - tf.range(8) for _ in range(batch_size)]) - args = [x, tf.constant(step, shape=(batch_size,))] - expected = tf.stack([ - tf.range(8)[::step] for _ in range(batch_size)]) - - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertIn('tf.__operators__.getitem', ( - x.name for x in model.layers)) - self.assertNotIn('tf.strided_slice', ( - x.name for x in model.layers)) - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - # Make sure it can be successfully saved and loaded - config = model.get_config() - model = keras.Model.from_config(config) - - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - def test_getitem_slice_real_tensor(self): - if not tf.executing_eagerly(): - self.skipTest('Complex slicing like this fails in v1') - x = tf.range(10.0) - slice_stop = keras.Input(shape=(), dtype='int32') - - out = x[:slice_stop[0]] - model = keras.Model( - inputs=slice_stop, - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - batch_size = 7 - stop = 6 - args = tf.constant(stop, shape=(batch_size,)) - expected = x[:stop] - - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertIn('tf.__operators__.getitem', ( - x.name for x in model.layers)) - # TODO(b/161925288): Fix the dispatch triggering then uncomment: - # self.assertNotIn('tf.strided_slice', ( - # x.name for x in model.layers)) - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - config = model.get_config() - model = keras.Model.from_config(config) - - 
self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - def test_getitem_index_real_tensor(self): - if not tf.executing_eagerly(): - self.skipTest('Complex slicing like this fails in v1') - x = tf.range(10.0) - slice_stop = keras.Input(shape=(), dtype='int32') - - out = x[slice_stop[0]] - model = keras.Model( - inputs=slice_stop, - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - batch_size = 7 - index = 6 - args = tf.constant(index, shape=(batch_size,)) - expected = x[index] - - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertIn('tf.__operators__.getitem', ( - x.name for x in model.layers)) - # TODO(b/161925288): Fix the bug then uncomment: - # self.assertNotIn('tf.strided_slice', ( - # x.name for x in model.layers)) - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - # Make sure it can be successfully saved and loaded - config = model.get_config() - model = keras.Model.from_config(config) - - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - def test_getitem_slice_with_stop_only(self): - if not tf.executing_eagerly(): - self.skipTest('Complex slicing like this fails in v1') - inp = keras.Input(shape=(8,)) - slice_stop = keras.Input(shape=(), dtype='int32') - - out = inp[:slice_stop[0]] - model = keras.Model( - inputs=[inp, slice_stop], - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - batch_size = 7 - stop = 6 - x = tf.stack([ - tf.range(8) for _ in range(batch_size)]) - args = [x, tf.constant(stop, shape=(batch_size,))] - expected = x[:stop] - - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertIn('tf.__operators__.getitem', ( - x.name for x in model.layers)) - self.assertNotIn('tf.strided_slice', ( - x.name for x in model.layers)) - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - # Make sure it can be successfully saved and loaded - config = model.get_config() - model = keras.Model.from_config(config) - - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - def test_getitem_slice_with_stop_and_ellipsis_only(self): - if not tf.executing_eagerly(): - self.skipTest('Complex slicing like this fails in v1') - inp = keras.Input(shape=(8,)) - slice_stop = keras.Input(shape=(), dtype='int32') - - out = inp[..., :slice_stop[0]] - model = keras.Model( - inputs=[inp, slice_stop], - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - batch_size = 7 - stop = 6 - x = tf.stack([ - tf.range(8) for _ in range(batch_size)]) - args = [x, tf.constant(stop, shape=(batch_size,))] - expected = tf.stack([ - tf.range(8)[:stop] for _ in range(batch_size)]) - - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertIn('tf.__operators__.getitem', ( - x.name for x in model.layers)) - self.assertNotIn('tf.strided_slice', ( - x.name for x in model.layers)) - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - # Make sure it can be successfully saved and loaded - config = model.get_config() - model = keras.Model.from_config(config) - - self.assertAllEqual(model(args), 
expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - def test_getitem_complex_slicing(self): - if not tf.executing_eagerly(): - self.skipTest('Complex slicing like this fails in v1') - inp = keras.Input(shape=(4, 3, 8)) - first_dim = keras.Input(shape=(), dtype='int32') - slice_start = keras.Input(shape=(), dtype='int32') - slice_stop = keras.Input(shape=(), dtype='int32') - slice_stride = keras.Input(shape=(), dtype='int32') - - out = inp[..., first_dim[0], slice_start[0]:slice_stop[0]:slice_stride[0]] - model = keras.Model( - inputs=[inp, first_dim, slice_start, slice_stop, slice_stride], - outputs=out) - model.compile( - adam.Adam(0.001), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - batch_size = 7 - start = 1 - stop = 6 - step = 2 - x = tf.stack([tf.stack([tf.stack([ - tf.range(8) - for _ in range(3)]) for _ in range(4)]) for _ in range(batch_size)]) - args = [x, + @parameterized.named_parameters( + ("single_op_at_end", _single_op_at_end), + ("single_identity_op_at_end", _single_identity_op_at_end), + ("multiple_ops_at_end", _multiple_ops_at_end), + ("single_op_in_middle", _single_op_in_middle), + ("multiple_ops_in_middle", _multiple_ops_in_middle), + ("shape_op_inference", _shape_op_inference), + ("shape_op_known_batch_size", _shape_op_known_batch_size), + ("shape_op_slice_and_range", _shape_op_slice_and_range), + ( + "shape_op_slice_and_range_known_dim", + _shape_op_slice_and_range_known_dim, + ), + ( + "int32_manipulation_too_big_for_shape", + _int32_manipulation_too_big_for_shape, + ), + ( + "int32_manipulation_at_max_shape_dims_limit", + _int32_manipulation_at_max_shape_dims_limit, + ), + ("single_standalone_branch", _single_standalone_branch), + ("single_op_with_attrs", _single_op_with_attrs), + ("multiple_uses", _multiple_uses), + ("op_with_tensor_list", _op_with_tensor_list), + ("add_n", _add_n), + ("_reuse_op", _reuse_op), + ("_float64_op", _float64_op), + ("_inner_layer", _inner_layer), + ("_reuse_ancillary_layer", _reuse_ancillary_layer), + ("_layer_with_tensor_arg", _layer_with_tensor_arg), + ) + def test_autolambda(self, model_fn): + model = model_fn() + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + np_inputs = tf.nest.map_structure( + lambda x: np.ones((2,) + tuple(x.shape[1:]), "float32"), + model.inputs, + ) + np_outputs = tf.nest.map_structure( + lambda x: np.ones((2,) + tuple(x.shape[1:]), "float32"), + model.outputs, + ) + model.fit(np_inputs, np_outputs, batch_size=2) + model(np_inputs) # Test calling the model directly on inputs. + + new_model = keras.Model.from_config( + model.get_config(), + custom_objects={"LayerWithLayer": LayerWithLayer, "MyAdd": MyAdd}, + ) + new_model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + new_model.fit(np_inputs, np_outputs, batch_size=2) + new_model(np_inputs) # Test calling the new model directly on inputs. + # Assert that metrics are preserved and in the right order. + self.assertAllEqual(model.metrics_names, new_model.metrics_names) + # Assert that layer names don't change. 
+ self.assertAllEqual( + [layer.name for layer in model.layers], + [layer.name for layer in new_model.layers], + ) + + def test_stack_preserves_correct_shape(self): + ## Test stack([x]) + inp = keras.Input(shape=(), dtype="float32") + + out = tf.stack([inp]) + model = keras.Model(inputs=inp, outputs=out) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + x = tf.ones(shape=(4, 4)) + expected = tf.stack([x]) + self.assertAllEqual(expected.shape, (1, 4, 4)) + + self.assertAllEqual(model(x).shape, (1, 4, 4)) + self.assertAllEqual(model(x), expected) + + config = model.get_config() + model = keras.Model.from_config(config) + + self.assertAllEqual(model(x).shape, (1, 4, 4)) + self.assertAllEqual(model(x), expected) + + ## Test stack(x) + inp = keras.Input(shape=(), dtype="float32") + + out = tf.stack(inp) + model = keras.Model(inputs=inp, outputs=out) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + + x = tf.ones(shape=(4, 4)) + expected = tf.stack(x) + self.assertAllEqual(expected.shape, (4, 4)) + + self.assertAllEqual(model(x).shape, (4, 4)) + self.assertAllEqual(model(x), expected) + + config = model.get_config() + model = keras.Model.from_config(config) + + self.assertAllEqual(model(x).shape, (4, 4)) + self.assertAllEqual(model(x), expected) + + def test_getitem_slice_with_step_only(self): + if not tf.executing_eagerly(): + self.skipTest("Complex slicing like this fails in v1") + inp = keras.Input(shape=(8,)) + slice_step = keras.Input(shape=(), dtype="int32") + + out = inp[..., :: slice_step[0]] + model = keras.Model(inputs=[inp, slice_step], outputs=out) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + batch_size = 7 + step = 3 + x = tf.stack([tf.range(8) for _ in range(batch_size)]) + args = [x, tf.constant(step, shape=(batch_size,))] + expected = tf.stack([tf.range(8)[::step] for _ in range(batch_size)]) + + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertIn( + "tf.__operators__.getitem", (x.name for x in model.layers) + ) + self.assertNotIn("tf.strided_slice", (x.name for x in model.layers)) + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + # Make sure it can be successfully saved and loaded + config = model.get_config() + model = keras.Model.from_config(config) + + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + def test_getitem_slice_real_tensor(self): + if not tf.executing_eagerly(): + self.skipTest("Complex slicing like this fails in v1") + x = tf.range(10.0) + slice_stop = keras.Input(shape=(), dtype="int32") + + out = x[: slice_stop[0]] + model = keras.Model(inputs=slice_stop, outputs=out) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + batch_size = 7 + stop = 6 + args = tf.constant(stop, shape=(batch_size,)) + expected = x[:stop] + + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertIn( + "tf.__operators__.getitem", (x.name for x in model.layers) + ) + # TODO(b/161925288): Fix the dispatch triggering then uncomment: + # self.assertNotIn('tf.strided_slice', ( + # x.name for x in model.layers)) + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + config = model.get_config() + model = keras.Model.from_config(config) + + 
self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + def test_getitem_index_real_tensor(self): + if not tf.executing_eagerly(): + self.skipTest("Complex slicing like this fails in v1") + x = tf.range(10.0) + slice_stop = keras.Input(shape=(), dtype="int32") + + out = x[slice_stop[0]] + model = keras.Model(inputs=slice_stop, outputs=out) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + batch_size = 7 + index = 6 + args = tf.constant(index, shape=(batch_size,)) + expected = x[index] + + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertIn( + "tf.__operators__.getitem", (x.name for x in model.layers) + ) + # TODO(b/161925288): Fix the bug then uncomment: + # self.assertNotIn('tf.strided_slice', ( + # x.name for x in model.layers)) + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + # Make sure it can be successfully saved and loaded + config = model.get_config() + model = keras.Model.from_config(config) + + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + def test_getitem_slice_with_stop_only(self): + if not tf.executing_eagerly(): + self.skipTest("Complex slicing like this fails in v1") + inp = keras.Input(shape=(8,)) + slice_stop = keras.Input(shape=(), dtype="int32") + + out = inp[: slice_stop[0]] + model = keras.Model(inputs=[inp, slice_stop], outputs=out) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + batch_size = 7 + stop = 6 + x = tf.stack([tf.range(8) for _ in range(batch_size)]) + args = [x, tf.constant(stop, shape=(batch_size,))] + expected = x[:stop] + + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertIn( + "tf.__operators__.getitem", (x.name for x in model.layers) + ) + self.assertNotIn("tf.strided_slice", (x.name for x in model.layers)) + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + # Make sure it can be successfully saved and loaded + config = model.get_config() + model = keras.Model.from_config(config) + + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + def test_getitem_slice_with_stop_and_ellipsis_only(self): + if not tf.executing_eagerly(): + self.skipTest("Complex slicing like this fails in v1") + inp = keras.Input(shape=(8,)) + slice_stop = keras.Input(shape=(), dtype="int32") + + out = inp[..., : slice_stop[0]] + model = keras.Model(inputs=[inp, slice_stop], outputs=out) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + batch_size = 7 + stop = 6 + x = tf.stack([tf.range(8) for _ in range(batch_size)]) + args = [x, tf.constant(stop, shape=(batch_size,))] + expected = tf.stack([tf.range(8)[:stop] for _ in range(batch_size)]) + + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertIn( + "tf.__operators__.getitem", (x.name for x in model.layers) + ) + self.assertNotIn("tf.strided_slice", (x.name for x in model.layers)) + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + # Make sure it can be successfully saved and loaded + config = model.get_config() + model = keras.Model.from_config(config) + + 
self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + def test_getitem_complex_slicing(self): + if not tf.executing_eagerly(): + self.skipTest("Complex slicing like this fails in v1") + inp = keras.Input(shape=(4, 3, 8)) + first_dim = keras.Input(shape=(), dtype="int32") + slice_start = keras.Input(shape=(), dtype="int32") + slice_stop = keras.Input(shape=(), dtype="int32") + slice_stride = keras.Input(shape=(), dtype="int32") + + out = inp[ + ..., first_dim[0], slice_start[0] : slice_stop[0] : slice_stride[0] + ] + model = keras.Model( + inputs=[inp, first_dim, slice_start, slice_stop, slice_stride], + outputs=out, + ) + model.compile( + adam.Adam(0.001), "mse", run_eagerly=test_utils.should_run_eagerly() + ) + batch_size = 7 + start = 1 + stop = 6 + step = 2 + x = tf.stack( + [ + tf.stack( + [ + tf.stack([tf.range(8) for _ in range(3)]) + for _ in range(4) + ] + ) + for _ in range(batch_size) + ] + ) + args = [ + x, tf.constant(0, shape=(batch_size,)), tf.constant(start, shape=(batch_size,)), tf.constant(stop, shape=(batch_size,)), - tf.constant(step, shape=(batch_size,))] - # Slice the innermost dim. only grab one index from the second-to-innermost - # dim, removing that dim from the shape. - expected = tf.stack([tf.stack([ - tf.range(8)[start:stop:step] - for _ in range(4)]) for _ in range(batch_size)]) - - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertIn('tf.__operators__.getitem', ( - x.name for x in model.layers)) - self.assertNotIn('tf.strided_slice', ( - x.name for x in model.layers)) - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - # Make sure it can be successfully saved and loaded - config = model.get_config() - model = keras.Model.from_config(config) - - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - def test_left_hand_numpy_multiplication(self): - x = np.asarray([3.0]) - inputs = keras.Input(shape=(4,)) - outputs = x * inputs - model = keras.Model(inputs, outputs) - ones = tf.ones((5, 4), dtype='float32') - self.assertAllEqual(model(ones), 3.0 * ones) - - def test_numerical_correctness_simple(self): - x = tf.convert_to_tensor([[-1., 0., -2., 1.]]) - inputs = keras.Input(shape=(4,)) - outputs = tf.nn.relu(inputs) - model = keras.Model(inputs, outputs) - y = self.evaluate(model(x)) - self.assertAllClose(y, [[0., 0., 0., 1.]]) - - def test_numerical_correctness_with_attrs(self): - x = tf.convert_to_tensor([[1.5, 1.5], [2.5, 3.5]]) - inputs = keras.Input(shape=(2,)) - outputs = tf.reduce_mean(inputs, axis=1) - model = keras.Model(inputs, outputs) - y = self.evaluate(model(x)) - self.assertAllClose(y, [1.5, 3.]) - - def test_numerical_correctness_serialization(self): - x = tf.convert_to_tensor([[-1., 0., -2., 1.]]) - inputs = keras.Input(shape=(4,)) - outputs = tf.nn.relu(inputs) - model1 = keras.Model(inputs, outputs) - y1 = self.evaluate(model1(x)) - model2 = keras.Model.from_config(model1.get_config()) - y2 = self.evaluate(model2(x)) - self.assertAllClose(y1, y2) - - def test_gradient_tape_in_function(self): - z = keras.Input((1,)) - x = tf.matmul(z, tf.constant(2.0, shape=(1, 1))) - x = tf.reduce_mean(x, axis=0, keepdims=True) - h = tf.nn.relu(x) - m = keras.Model(z, h) - - @tf.function() - def f(x): - with tf.GradientTape() as t: - t.watch(x) - z = m(x ** 2) - grads = t.gradient(z, x) - return grads - - 
self.assertAllEqual(f(tf.constant(10.0, shape=(1, 1))), - tf.constant(40.0, shape=(1, 1))) - - f = tf.function(f) - - self.assertAllEqual(f(tf.constant(10.0, shape=(1, 1))), - tf.constant(40.0, shape=(1, 1))) - - def test_no_tracking(self): - if not tf.executing_eagerly(): - x = tf.constant(1.0, shape=(10, 10)) - keras.layers.Dense(1)(x) - self.assertTrue(x._keras_history_checked) - - def test_timing_scales_linearly(self): - - def _construct_graph_of_size(size): - start = time.time() - x = keras.backend.placeholder(shape=(10, 4)) - - for _ in range(size): - x = keras.layers.Dense(4)(x) - x = tf.nn.relu(x) - - end = time.time() - return end - start - - size_50 = _construct_graph_of_size(50) - size_500 = _construct_graph_of_size(500) - - # Check construction time grows approx. linearly with size. - e = 3 # Fudge factor to prevent flakiness. - self.assertLess(size_500, (10 * e) * size_50) - - def test_built(self): - inputs = keras.Input(shape=(10,)) - outputs = tf.nn.relu(inputs) - model = keras.Model(inputs, outputs) - model.compile('sgd', 'mse') - for layer in model.layers: - self.assertTrue(layer.built) - # Test something that requires Layers to be built. - model.summary() - - def test_json_serialization(self): - inputs = keras.Input(shape=(4,), dtype='uint8') - outputs = tf.cast(inputs, 'float32') / 4. - model = model_config.model_from_json(keras.Model(inputs, outputs).to_json()) - self.assertAllEqual( - self.evaluate(model(np.array([0, 64, 128, 192], np.uint8))), - [0., 16., 32., 48.]) - model.summary() + tf.constant(step, shape=(batch_size,)), + ] + # Slice the innermost dim. only grab one index from the + # second-to-innermost dim, removing that dim from the shape. + expected = tf.stack( + [ + tf.stack([tf.range(8)[start:stop:step] for _ in range(4)]) + for _ in range(batch_size) + ] + ) + + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertIn( + "tf.__operators__.getitem", (x.name for x in model.layers) + ) + self.assertNotIn("tf.strided_slice", (x.name for x in model.layers)) + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + # Make sure it can be successfully saved and loaded + config = model.get_config() + model = keras.Model.from_config(config) + + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + def test_left_hand_numpy_multiplication(self): + x = np.asarray([3.0]) + inputs = keras.Input(shape=(4,)) + outputs = x * inputs + model = keras.Model(inputs, outputs) + ones = tf.ones((5, 4), dtype="float32") + self.assertAllEqual(model(ones), 3.0 * ones) + + def test_numerical_correctness_simple(self): + x = tf.convert_to_tensor([[-1.0, 0.0, -2.0, 1.0]]) + inputs = keras.Input(shape=(4,)) + outputs = tf.nn.relu(inputs) + model = keras.Model(inputs, outputs) + y = self.evaluate(model(x)) + self.assertAllClose(y, [[0.0, 0.0, 0.0, 1.0]]) + + def test_numerical_correctness_with_attrs(self): + x = tf.convert_to_tensor([[1.5, 1.5], [2.5, 3.5]]) + inputs = keras.Input(shape=(2,)) + outputs = tf.reduce_mean(inputs, axis=1) + model = keras.Model(inputs, outputs) + y = self.evaluate(model(x)) + self.assertAllClose(y, [1.5, 3.0]) + + def test_numerical_correctness_serialization(self): + x = tf.convert_to_tensor([[-1.0, 0.0, -2.0, 1.0]]) + inputs = keras.Input(shape=(4,)) + outputs = tf.nn.relu(inputs) + model1 = keras.Model(inputs, outputs) + y1 = self.evaluate(model1(x)) + model2 = 
keras.Model.from_config(model1.get_config()) + y2 = self.evaluate(model2(x)) + self.assertAllClose(y1, y2) + + def test_gradient_tape_in_function(self): + z = keras.Input((1,)) + x = tf.matmul(z, tf.constant(2.0, shape=(1, 1))) + x = tf.reduce_mean(x, axis=0, keepdims=True) + h = tf.nn.relu(x) + m = keras.Model(z, h) + + @tf.function() + def f(x): + with tf.GradientTape() as t: + t.watch(x) + z = m(x**2) + grads = t.gradient(z, x) + return grads + + self.assertAllEqual( + f(tf.constant(10.0, shape=(1, 1))), tf.constant(40.0, shape=(1, 1)) + ) + + f = tf.function(f) + + self.assertAllEqual( + f(tf.constant(10.0, shape=(1, 1))), tf.constant(40.0, shape=(1, 1)) + ) + + def test_no_tracking(self): + if not tf.executing_eagerly(): + x = tf.constant(1.0, shape=(10, 10)) + keras.layers.Dense(1)(x) + self.assertTrue(x._keras_history_checked) + + def test_timing_scales_linearly(self): + def _construct_graph_of_size(size): + start = time.time() + x = keras.backend.placeholder(shape=(10, 4)) + + for _ in range(size): + x = keras.layers.Dense(4)(x) + x = tf.nn.relu(x) + + end = time.time() + return end - start + + size_50 = _construct_graph_of_size(50) + size_500 = _construct_graph_of_size(500) + + # Check construction time grows approx. linearly with size. + e = 3 # Fudge factor to prevent flakiness. + self.assertLess(size_500, (10 * e) * size_50) + + def test_built(self): + inputs = keras.Input(shape=(10,)) + outputs = tf.nn.relu(inputs) + model = keras.Model(inputs, outputs) + model.compile("sgd", "mse") + for layer in model.layers: + self.assertTrue(layer.built) + # Test something that requires Layers to be built. + model.summary() + + def test_json_serialization(self): + inputs = keras.Input(shape=(4,), dtype="uint8") + outputs = tf.cast(inputs, "float32") / 4.0 + model = model_config.model_from_json( + keras.Model(inputs, outputs).to_json() + ) + self.assertAllEqual( + self.evaluate(model(np.array([0, 64, 128, 192], np.uint8))), + [0.0, 16.0, 32.0, 48.0], + ) + model.summary() @test_combinations.run_all_keras_modes(always_skip_v1=True) class InputInEagerTest(test_combinations.TestCase): - """Tests ops on keras inputs in Eager runtime. + """Tests ops on keras inputs in Eager runtime. - Input returns graph/symbolic tensors in the Eager runtime (this - happens, for example, with tensors returned from Keras layers). These - should be routed to the graph-style branch of these ops (b/134715641) - """ + Input returns graph/symbolic tensors in the Eager runtime (this + happens, for example, with tensors returned from Keras layers). 
These + should be routed to the graph-style branch of these ops (b/134715641) + """ - def test_identity(self): - x = keras.Input(shape=(1,)) - ident = tf.identity(x) + def test_identity(self): + x = keras.Input(shape=(1,)) + ident = tf.identity(x) - # This is now a graph tensor, and should be able to continue in graphland - self.assertIn('Identity', ident.name) + # This is now a graph tensor, and should be able to continue in + # graphland + self.assertIn("Identity", ident.name) - def test_size(self): - x = keras.Input(shape=(3,)) - self.assertAllEqual(x.get_shape().as_list(), [None, 3]) - sz = tf.size(x) + def test_size(self): + x = keras.Input(shape=(3,)) + self.assertAllEqual(x.get_shape().as_list(), [None, 3]) + sz = tf.size(x) - # This is now a graph tensor, and should be able to continue in graphland - self.assertIn('Size', sz.name) + # This is now a graph tensor, and should be able to continue in + # graphland + self.assertIn("Size", sz.name) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/legacy_tf_layers/BUILD b/keras/legacy_tf_layers/BUILD index 9beaf00b237d..67a8950d6f5d 100644 --- a/keras/legacy_tf_layers/BUILD +++ b/keras/legacy_tf_layers/BUILD @@ -1,11 +1,14 @@ # Description: # Contains the legacy TF layers (internal TensorFlow version). +# Placeholder: load unaliased py_library + # buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "tf_py_test") load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//learning/brain/contrib:__subpackages__", diff --git a/keras/legacy_tf_layers/__init__.py b/keras/legacy_tf_layers/__init__.py index 11649ccd701b..0bb028307a4f 100644 --- a/keras/legacy_tf_layers/__init__.py +++ b/keras/legacy_tf_layers/__init__.py @@ -1,3 +1,3 @@ """Init file.""" -from keras.legacy_tf_layers import migration_utils # pylint: disable=unused-import +from keras.legacy_tf_layers import migration_utils diff --git a/keras/legacy_tf_layers/base.py b/keras/legacy_tf_layers/base.py index 40c0dbe244c2..fa2beea2f2d1 100644 --- a/keras/legacy_tf_layers/base.py +++ b/keras/legacy_tf_layers/base.py @@ -12,604 +12,658 @@ # See the License for the specific language governing permissions and # limitations under the License. 
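For readers skimming the test hunks above: what `AutoLambdaTest` and `InputInEagerTest` exercise is that raw TF ops applied to a symbolic `keras.Input` get auto-wrapped into layers. A minimal sketch of that behavior (illustrative only, not part of this diff; assumes TF 2.x):

```python
import tensorflow as tf
from tensorflow import keras

# A raw TF op applied to a symbolic Input is wrapped in a TFOpLambda layer,
# so the op becomes a node in the functional graph instead of running
# eagerly.
inputs = keras.Input(shape=(4,))
outputs = tf.nn.relu(inputs)  # recorded as a 'tf.nn.relu' layer
model = keras.Model(inputs, outputs)

print([layer.name for layer in model.layers])  # e.g. ['input_1', 'tf.nn.relu']
print(model(tf.constant([[-1.0, 0.0, -2.0, 1.0]])).numpy())  # [[0. 0. 0. 1.]]
```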
# ============================================================================= -# pylint: disable=g-classes-have-attributes + """Contains the base Layer class, from which all layers inherit.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import copy import warnings + +import tensorflow.compat.v2 as tf + from keras import backend -from keras.engine import base_layer_v1 as base_layer from keras.engine import base_layer_utils +from keras.engine import base_layer_v1 as base_layer from keras.legacy_tf_layers import variable_scope_shim from keras.mixed_precision import policy from keras.utils import tf_contextlib + +# isort: off from tensorflow.python.ops import variable_scope as vs from tensorflow.python.util.tf_export import keras_export -from tensorflow.python.util.tf_export import tf_export - _KERAS_STYLE_SCOPE = False @keras_export( - v1=['keras.__internal__.legacy.layers.experimental.keras_style_scope']) -@tf_export(v1=['layers.experimental.keras_style_scope']) + v1=["keras.__internal__.legacy.layers.experimental.keras_style_scope"] +) @tf_contextlib.contextmanager def keras_style_scope(): - """Use Keras-style variable management. + """Use Keras-style variable management. - All tf.layers and tf RNN cells created in this scope use Keras-style - variable management. Creating such layers with a scope= argument is - disallowed, and reuse=True is disallowed. + All tf.layers and tf RNN cells created in this scope use Keras-style + variable management. Creating such layers with a scope= argument is + disallowed, and reuse=True is disallowed. - The purpose of this scope is to allow users of existing layers to - slowly transition to a Keras layers API without breaking existing - functionality. + The purpose of this scope is to allow users of existing layers to + slowly transition to a Keras layers API without breaking existing + functionality. - One example of this is when using TensorFlow's RNN classes with Keras - Models or Networks. Because Keras models do not properly set variable - scopes, users of RNNs may either accidentally share scopes between two - different models, or get errors about variables that already exist. + One example of this is when using TensorFlow's RNN classes with Keras + Models or Networks. Because Keras models do not properly set variable + scopes, users of RNNs may either accidentally share scopes between two + different models, or get errors about variables that already exist. - Example: + Example: - ```python - class RNNModel(tf.keras.Model): + ```python + class RNNModel(tf.keras.Model): - def __init__(self, name): - super(RNNModel, self).__init__(name=name) - self.rnn = tf.compat.v1.nn.rnn_cell.MultiRNNCell( - [tf.compat.v1.nn.rnn_cell.LSTMCell(64) for _ in range(2)]) + def __init__(self, name): + super(RNNModel, self).__init__(name=name) + self.rnn = tf.compat.v1.nn.rnn_cell.MultiRNNCell( + [tf.compat.v1.nn.rnn_cell.LSTMCell(64) for _ in range(2)]) - def call(self, input, state): - return self.rnn(input, state) + def call(self, input, state): + return self.rnn(input, state) - model_1 = RNNModel("model_1") - model_2 = RNNModel("model_2") - - # OK - output_1, next_state_1 = model_1(input, state) - # Raises an error about trying to create an already existing variable. 
- output_2, next_state_2 = model_2(input, state) - ``` - - The solution is to wrap the model construction and execution in a keras-style - scope: - - ```python - with keras_style_scope(): model_1 = RNNModel("model_1") model_2 = RNNModel("model_2") - # model_1 and model_2 are guaranteed to create their own variables. + # OK output_1, next_state_1 = model_1(input, state) + # Raises an error about trying to create an already existing variable. output_2, next_state_2 = model_2(input, state) + ``` - The solution is to wrap the model construction and execution in a + keras-style scope: + + ```python + with keras_style_scope(): + model_1 = RNNModel("model_1") + model_2 = RNNModel("model_2") + + # model_1 and model_2 are guaranteed to create their own variables. + output_1, next_state_1 = model_1(input, state) + output_2, next_state_2 = model_2(input, state) - Yields: - A keras layer style scope. - """ - global _KERAS_STYLE_SCOPE - stack = _KERAS_STYLE_SCOPE - _KERAS_STYLE_SCOPE = True - try: - yield - finally: - _KERAS_STYLE_SCOPE = stack + assert len(model_1.weights) > 0 + assert len(model_2.weights) > 0 + assert(model_1.weights != model_2.weights) + ``` + + Yields: + A keras layer style scope. + """ + global _KERAS_STYLE_SCOPE + stack = _KERAS_STYLE_SCOPE + _KERAS_STYLE_SCOPE = True + try: + yield + finally: + _KERAS_STYLE_SCOPE = stack @keras_export( - v1=['keras.__internal__.legacy.layers.experimental.set_keras_style']) -@tf_export(v1=['layers.experimental.set_keras_style']) + v1=["keras.__internal__.legacy.layers.experimental.set_keras_style"] +) def set_keras_style(): - """Use Keras-style variable management. + """Use Keras-style variable management. - All tf.layers and tf RNN cells created after keras style ha been enabled - use Keras-style variable management. Creating such layers with a - scope= argument is disallowed, and reuse=True is disallowed. + All tf.layers and tf RNN cells created after keras style has been enabled + use Keras-style variable management. Creating such layers with a + scope= argument is disallowed, and reuse=True is disallowed. - The purpose of this function is to allow users of existing layers to - slowly transition to Keras layers API without breaking existing - functionality. + The purpose of this function is to allow users of existing layers to + slowly transition to Keras layers API without breaking existing + functionality. - For more details, see the documentation for `keras_style_scope`. + For more details, see the documentation for `keras_style_scope`. - Note, once keras style has been set, it is set globally for the entire - program and cannot be unset. + Note, once keras style has been set, it is set globally for the entire + program and cannot be unset. - Example: + Example: - ```python - set_keras_style() + ```python + set_keras_style() - model_1 = RNNModel(name="model_1") - model_2 = RNNModel(name="model_2") + model_1 = RNNModel(name="model_1") + model_2 = RNNModel(name="model_2") - # model_1 and model_2 are guaranteed to create their own variables. + # model_1 and model_2 are guaranteed to create their own variables.
+ output_1, next_state_1 = model_1(input, state) + output_2, next_state_2 = model_2(input, state) - assert len(model_1.weights) > 0 - assert len(model_2.weights) > 0 - assert(model_1.weights != model_2.weights) - ``` - """ - global _KERAS_STYLE_SCOPE - _KERAS_STYLE_SCOPE = True + assert len(model_1.weights) > 0 + assert len(model_2.weights) > 0 + assert(model_1.weights != model_2.weights) + ``` + """ + global _KERAS_STYLE_SCOPE + _KERAS_STYLE_SCOPE = True def _is_in_keras_style_scope(): - global _KERAS_STYLE_SCOPE - return _KERAS_STYLE_SCOPE + global _KERAS_STYLE_SCOPE + return _KERAS_STYLE_SCOPE -@keras_export(v1=['keras.__internal__.legacy.layers.Layer']) -@tf_export(v1=['layers.Layer']) +@keras_export(v1=["keras.__internal__.legacy.layers.Layer"]) class Layer(base_layer.Layer): - """Base layer class. - - It is considered legacy, and we recommend the use of `tf.keras.layers.Layer` - instead. - - Args: - trainable: Boolean, whether the layer's variables should be trainable. - name: String name of the layer. - dtype: Default dtype of the layer's weights (default of `None` means use the - type of the first input). - - Read-only properties: - name: The name of the layer (string). - dtype: Default dtype of the layer's weights (default of `None` means use the - type of the first input). - trainable_variables: List of trainable variables. - non_trainable_variables: List of non-trainable variables. - variables: List of all variables of this layer, trainable and - non-trainable. - updates: List of update ops of this layer. - losses: List of losses added by this layer. - trainable_weights: List of variables to be included in backprop. - non_trainable_weights: List of variables that should not be - included in backprop. - weights: The concatenation of the lists trainable_weights and - non_trainable_weights (in this order). - - Mutable properties: - trainable: Whether the layer should be trained (boolean). - input_spec: Optional (list of) `InputSpec` object(s) specifying the - constraints on inputs that can be accepted by the layer. - """ - - def __init__(self, trainable=True, name=None, dtype=None, - **kwargs): - # For backwards compatibility, legacy layers do not use `ResourceVariable` - # by default. - self._use_resource_variables = False - scope = kwargs.pop('_scope', None) - self._reuse = kwargs.pop('_reuse', None) - - # Avoid an incorrect lint error - self._trainable_weights = [] - self.built = False - - if dtype is None: - # Indicates to infer dtype from inputs. When the V2 dtype behavior is - # enabled, Keras layers default their dtype to floatx instead, so we pass - # an "_infer" policy to keep the old V1 behavior. 
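The `_infer` policy referenced in the `__init__` hunk here is what preserves the V1 dtype semantics for legacy layers. Roughly, as an illustrative sketch only (not part of the diff; assumes TF 2.x where `tf.compat.v1.layers` still works eagerly):

```python
import tensorflow.compat.v2 as tf
import tensorflow.compat.v1 as tf1

# Under the "_infer" policy, a legacy tf.layers layer takes its dtype from
# the first input, rather than defaulting to the float32 floatx the way
# V2 Keras layers do.
dense = tf1.layers.Dense(4)
y = dense(tf.ones((2, 3), dtype=tf.float64))
print(y.dtype)  # float64, not float32
```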
- dtype = policy.Policy('_infer') - - if 'autocast' not in kwargs: - kwargs['autocast'] = False - - # Mark that legacy layers should not be instrumented as Keras usage - self._disable_keras_instrumentation = True - - super().__init__(trainable=trainable, name=name, dtype=dtype, - **kwargs) - - if _is_in_keras_style_scope(): - if scope is not None: - raise ValueError( - 'scope argument not allowed when keras style layers are enabled, ' - 'but saw: {}'.format(scope)) - if self._reuse is not None: - raise ValueError( - 'reuse argument not allowed when keras style layers are enabled, ' - 'but saw: {}'.format(self._reuse)) - self._keras_style = True - else: - self._keras_style = False - - self._call_has_scope_arg = 'scope' in self._call_spec.arg_names - if scope: - with tf.compat.v1.variable_scope(scope) as captured_scope: - self._scope = captured_scope - else: - self._scope = None - self._current_scope = None - - def apply(self, *args, **kwargs): - return self(*args, **kwargs) - - # We no longer track graph in tf.layers layers. This property is only kept to - # maintain API backward compatibility. - @property - def graph(self): - warnings.warn( - '`Layer.graph` is deprecated and ' - 'will be removed in a future version. ' - 'Please stop using this property because tf.layers layers no ' - 'longer track their graph.', - stacklevel=2) - if tf.executing_eagerly(): - raise RuntimeError('Layer.graph not supported when executing eagerly.') - return None - - def _init_set_name(self, name): - # Determine layer name (non-unique). - if isinstance(name, tf.compat.v1.VariableScope): - base_name = name.name - self._name, _ = self._make_unique_name() - else: - base_name = name - self._name = name - if not name: - self._name, base_name = self._make_unique_name() - self._base_name = base_name - - def _make_unique_name(self, name_uid_map=None, avoid_names=None, - namespace='', zero_based=False): - base_name = base_layer.to_snake_case(self.__class__.__name__) - name = backend.unique_object_name( - base_name, - name_uid_map=name_uid_map, - avoid_names=avoid_names, - namespace=namespace, - zero_based=zero_based) - return (name, base_name) - - @property - def scope_name(self): - if not self._scope: - raise ValueError('No name available for layer scope because the layer "' + - self._name + '" has not been used yet. The scope name ' + - ' is determined the first time the layer instance is ' + - 'called. You must therefore call the layer before ' + - 'querying `scope_name`.') - return self._scope.name - - def add_loss(self, losses, inputs=None): - previous_losses_length = len(self._losses) - previous_callable_losses_length = len(self._callable_losses) - super().add_loss(losses, inputs=inputs) - if not tf.executing_eagerly(): - # TODO(fchollet): deprecate collection below. - new_losses = self._losses[previous_losses_length:] - new_callable_losses = self._callable_losses[ - previous_callable_losses_length:] - for regularizer in new_callable_losses: - loss_tensor = regularizer() - if loss_tensor is not None: - new_losses.append(loss_tensor) - _add_elements_to_collection( - new_losses, - tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - - def _name_scope(self): # pylint: disable=method-hidden - """Determines op naming for the Layer.""" - if self._keras_style: - return super()._name_scope() - return self._current_scope.original_name_scope - - def _set_scope(self, scope=None): - if self._scope is None: - # If constructed with _scope=None, lazy setting of scope. 
- if self._reuse: - with tf.compat.v1.variable_scope( - scope if scope is not None else self._base_name) as captured_scope: - self._scope = captured_scope - else: - with tf.compat.v1.variable_scope( - scope, default_name=self._base_name) as captured_scope: - self._scope = captured_scope - - def add_weight(self, - name, - shape, - dtype=None, - initializer=None, - regularizer=None, - trainable=None, - constraint=None, - use_resource=None, - synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.compat.v1.VariableAggregation.NONE, - partitioner=None, - **kwargs): - """Adds a new variable to the layer, or gets an existing one; returns it. + """Base layer class. - Args: - name: variable name. - shape: variable shape. - dtype: The type of the variable. Defaults to `self.dtype` or `float32`. - initializer: initializer instance (callable). - regularizer: regularizer instance (callable). - trainable: whether the variable should be part of the layer's - "trainable_variables" (e.g. variables, biases) - or "non_trainable_variables" (e.g. BatchNorm mean, stddev). - Note, if the current variable scope is marked as non-trainable - then this parameter is ignored and any added variables are also - marked as non-trainable. `trainable` defaults to `True` unless - `synchronization` is set to `ON_READ`. - constraint: constraint instance (callable). - use_resource: Whether to use `ResourceVariable`. - synchronization: Indicates when a distributed a variable will be - aggregated. Accepted values are constants defined in the class - `tf.VariableSynchronization`. By default the synchronization is set to - `AUTO` and the current `DistributionStrategy` chooses - when to synchronize. If `synchronization` is set to `ON_READ`, - `trainable` must not be set to `True`. - aggregation: Indicates how a distributed variable will be aggregated. - Accepted values are constants defined in the class - `tf.VariableAggregation`. - partitioner: (optional) partitioner instance (callable). If - provided, when the requested variable is created it will be split - into multiple partitions according to `partitioner`. In this case, - an instance of `PartitionedVariable` is returned. Available - partitioners include `tf.compat.v1.fixed_size_partitioner` and - `tf.compat.v1.variable_axis_size_partitioner`. For more details, see - the documentation of `tf.compat.v1.get_variable` and the "Variable - Partitioners and Sharding" section of the API guide. - **kwargs: Additional keyword arguments. - - Returns: - The created variable. Usually either a `Variable` or `ResourceVariable` - instance. If `partitioner` is not `None`, a `PartitionedVariable` - instance is returned. - - Raises: - RuntimeError: If called with partitioned variable regularization and - eager execution is enabled. - ValueError: When trainable has been set to True with synchronization - set as `ON_READ`. 
- """ - for kwarg in kwargs: - if kwarg != 'experimental_autocast': - raise TypeError('Unknown keyword argument:', kwarg) - if self._keras_style: - return super().add_weight( - name=name, - shape=shape, - dtype=dtype, - initializer=initializer, - regularizer=regularizer, - trainable=trainable and self.trainable, - constraint=constraint, - use_resource=use_resource, - synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.compat.v1.VariableAggregation.NONE, - partitioner=partitioner, - **kwargs) - - if synchronization == tf.VariableSynchronization.ON_READ: - if trainable: - raise ValueError( - 'Synchronization value can be set to ' - 'VariableSynchronization.ON_READ only for non-trainable variables. ' - 'You have specified trainable=True and ' - 'synchronization=VariableSynchronization.ON_READ.') - else: - # Set trainable to be false when variable is to be synced on read. - trainable = False - elif trainable is None: - trainable = True - - def _should_add_regularizer(variable, existing_variable_set): - if base_layer_utils.is_split_variable(variable): - for var in variable: - if var in existing_variable_set: - return False - return True - else: - return variable not in existing_variable_set - - init_graph = None - if not tf.executing_eagerly(): - default_graph = tf.compat.v1.get_default_graph() - if default_graph.building_function: - with tf.init_scope(): - # Retrieve the variables from the graph into which variables - # will be lifted; if initialization ops will be lifted into - # the eager context, then there is nothing to retrieve, since variable - # collections are not supported when eager execution is enabled. - if not tf.executing_eagerly(): - init_graph = tf.compat.v1.get_default_graph() - existing_variables = set(tf.compat.v1.global_variables()) - else: - # Initialization ops will not be lifted out of the default graph. - init_graph = default_graph - existing_variables = set(tf.compat.v1.global_variables()) - - if dtype is None: - dtype = self.dtype or tf.float32 - - self._set_scope(None) - reuse = self.built or self._reuse - prev_len_trainable = len(self._trainable_weights) - with tf.compat.v1.variable_scope( - self._scope, reuse=reuse, auxiliary_name_scope=False) as scope: - self._current_scope = scope - with backend.name_scope(self._name_scope()): # pylint: disable=not-callable - use_resource = (use_resource or - self._use_resource_variables or - scope.use_resource) - if initializer is None: - initializer = scope.initializer - variable = super().add_weight( - name, - shape, - dtype=tf.as_dtype(dtype), - initializer=initializer, - trainable=trainable and self.trainable, - constraint=constraint, - partitioner=partitioner, - use_resource=use_resource, - synchronization=synchronization, - aggregation=aggregation, - getter=tf.compat.v1.get_variable, - **kwargs) - - if regularizer: - if (tf.compat.v1.executing_eagerly_outside_functions() - or _should_add_regularizer(variable, existing_variables)): - self._handle_weight_regularization(name, variable, regularizer) - var_store = vs._get_default_variable_store() # pylint: disable=protected-access - # When the shim to get variable scope working in TF2 is used, - # We need to explicitly make the shim track the regularization - # losses as the collections will not be accessible. - if hasattr(var_store, 'add_regularizer'): - var_store.add_regularizer(variable, regularizer) - - if init_graph is not None: - # Handle edge case where a custom getter has overridden `trainable`. 
- # There is one known occurrence of this, in unit test - # testBasicRNNCellNotTrainable in - # contrib.rnn.python.kernel_tests.core_rnn_cell_test - with init_graph.as_default(): - trainable_variables = tf.compat.v1.trainable_variables() - if (trainable and self.trainable and - variable not in trainable_variables): - # A custom getter / variable scope overrode the trainable flag. - extra_trainable_vars = self._trainable_weights[prev_len_trainable:] - self._trainable_weights = self._trainable_weights[ - :prev_len_trainable] - self._non_trainable_weights += extra_trainable_vars - return variable - - def __call__(self, inputs, *args, **kwargs): - """Wraps `call`, applying pre- and post-processing steps. + It is considered legacy, and we recommend the use of `tf.keras.layers.Layer` + instead. Args: - inputs: input tensor(s). - *args: additional positional arguments to be passed to `self.call`. - **kwargs: additional keyword arguments to be passed to `self.call`. - **Note**: kwarg `scope` is reserved for use by the layer. - - Returns: - Output tensor(s). - - Note: - - If the layer's `call` method takes a `scope` keyword argument, - this argument will be automatically set to the current variable scope. - - If the layer's `call` method takes a `mask` argument (as some Keras - layers do), its default value will be set to the mask generated - for `inputs` by the previous layer (if `input` did come from - a layer that generated a corresponding mask, i.e. if it came from - a Keras layer with masking support. - - Raises: - ValueError: if the layer's `call` method returns None (an invalid value). + trainable: Boolean, whether the layer's variables should be trainable. + name: String name of the layer. + dtype: Default dtype of the layer's weights (default of `None` means use + the type of the first input). + + Read-only properties: + name: The name of the layer (string). + dtype: Default dtype of the layer's weights (default of `None` means use + the type of the first input). + trainable_variables: List of trainable variables. + non_trainable_variables: List of non-trainable variables. + variables: List of all variables of this layer, trainable and + non-trainable. + updates: List of update ops of this layer. + losses: List of losses added by this layer. + trainable_weights: List of variables to be included in backprop. + non_trainable_weights: List of variables that should not be + included in backprop. + weights: The concatenation of the lists trainable_weights and + non_trainable_weights (in this order). + + Mutable properties: + trainable: Whether the layer should be trained (boolean). + input_spec: Optional (list of) `InputSpec` object(s) specifying the + constraints on inputs that can be accepted by the layer. """ - scope = kwargs.pop('scope', None) - - if self._keras_style: - if scope is not None: - raise ValueError( - 'scope argument not allowed when keras style layers are enabled, ' - 'but saw: {}'.format(scope)) - return super().__call__(inputs, *args, **kwargs) - - self._set_scope(scope) - - if self.built: - try: - # Some classes which inherit from Layer do not use its constructor, so - # rather than initializing to None we check for an AttributeError. - scope_context_manager = self._always_reuse_variable_scope # pylint: disable=access-member-before-definition - except AttributeError: - scope_context_manager = None - - if scope_context_manager is None: - # From this point we will always set reuse=True, so create a "final" - # variable scope with this setting. 
We avoid re-creating variable scopes - # after this point as an optimization. - scope_context_manager = tf.compat.v1.variable_scope( - self._scope, reuse=True, auxiliary_name_scope=False) - - # Do not cache variable scopes if Eager mode is enabled. If Eager mode - # is enabled then we don't want to reuse scopes because the cached scope - # might be from a FuncGraph or Eager scope we are no longer in. - if not tf.compat.v1.executing_eagerly_outside_functions(): - self._always_reuse_variable_scope = scope_context_manager - else: - scope_context_manager = tf.compat.v1.variable_scope( - self._scope, reuse=self._reuse, auxiliary_name_scope=False) - - with scope_context_manager as scope: - self._current_scope = scope - - try: - call_has_scope_arg = self._call_has_scope_arg - except AttributeError: - self._call_spec.arg_names = variable_scope_shim.fn_args(self.call) - self._call_has_scope_arg = 'scope' in self._call_spec.arg_names - call_has_scope_arg = self._call_has_scope_arg - if call_has_scope_arg: - kwargs['scope'] = scope - - # Actually call layer - outputs = super().__call__(inputs, *args, **kwargs) - - if not tf.executing_eagerly(): - # Update global default collections. - _add_elements_to_collection(self.updates, tf.compat.v1.GraphKeys.UPDATE_OPS) - return outputs - - def __deepcopy__(self, memo): - no_copy = set(['_graph', '_thread_local', '_metrics_lock']) - shallow_copy = set(['_scope', '_always_reuse_variable_scope']) - cls = self.__class__ - result = cls.__new__(cls) - memo[id(self)] = result - for k, v in self.__dict__.items(): - if k in no_copy: - setattr(result, k, v) - elif k in shallow_copy: - setattr(result, k, copy.copy(v)) - elif base_layer.is_tensor_or_tensor_list(v): - setattr(result, k, v) - else: - setattr(result, k, copy.deepcopy(v, memo)) - return result - - def __setattr__(self, value, name): - # By-pass the automatic dependency tracking performed by the parent Layer. - super(tf.__internal__.tracking.Trackable, self).__setattr__(value, name) # pylint: disable=bad-super-call - - @property - def _is_legacy_layer(self): - """Used by keras to check compatibility. This should not be overridden.""" - return True + + def __init__(self, trainable=True, name=None, dtype=None, **kwargs): + # For backwards compatibility, legacy layers do not use + # `ResourceVariable` by default. + self._use_resource_variables = False + scope = kwargs.pop("_scope", None) + self._reuse = kwargs.pop("_reuse", None) + + # Avoid an incorrect lint error + self._trainable_weights = [] + self.built = False + + if dtype is None: + # Indicates to infer dtype from inputs. When the V2 dtype behavior + # is enabled, Keras layers default their dtype to floatx instead, so + # we pass an "_infer" policy to keep the old V1 behavior. 
+ dtype = policy.Policy("_infer") + + if "autocast" not in kwargs: + kwargs["autocast"] = False + + # Mark that legacy layers should not be instrumented as Keras usage + self._disable_keras_instrumentation = True + + super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs) + + if _is_in_keras_style_scope(): + if scope is not None: + raise ValueError( + "scope argument not allowed when keras style layers are " + "enabled, but saw: {}".format(scope) + ) + if self._reuse is not None: + raise ValueError( + "reuse argument not allowed when keras style layers are " + "enabled, but saw: {}".format(self._reuse) + ) + self._keras_style = True + else: + self._keras_style = False + + self._call_has_scope_arg = "scope" in self._call_spec.arg_names + if scope: + with tf.compat.v1.variable_scope(scope) as captured_scope: + self._scope = captured_scope + else: + self._scope = None + self._current_scope = None + + def apply(self, *args, **kwargs): + return self(*args, **kwargs) + + # We no longer track graph in tf.layers layers. This property is only kept + # to maintain API backward compatibility. + @property + def graph(self): + warnings.warn( + "`Layer.graph` is deprecated and " + "will be removed in a future version. " + "Please stop using this property because tf.layers layers no " + "longer track their graph.", + stacklevel=2, + ) + if tf.executing_eagerly(): + raise RuntimeError( + "Layer.graph not supported when executing eagerly." + ) + return None + + def _init_set_name(self, name): + # Determine layer name (non-unique). + if isinstance(name, tf.compat.v1.VariableScope): + base_name = name.name + self._name, _ = self._make_unique_name() + else: + base_name = name + self._name = name + if not name: + self._name, base_name = self._make_unique_name() + self._base_name = base_name + + def _make_unique_name( + self, + name_uid_map=None, + avoid_names=None, + namespace="", + zero_based=False, + ): + base_name = base_layer.to_snake_case(self.__class__.__name__) + name = backend.unique_object_name( + base_name, + name_uid_map=name_uid_map, + avoid_names=avoid_names, + namespace=namespace, + zero_based=zero_based, + ) + return (name, base_name) + + @property + def scope_name(self): + if not self._scope: + raise ValueError( + 'No name available for layer scope because the layer "' + + self._name + + '" has not been used yet. The scope name ' + + " is determined the first time the layer instance is " + + "called. You must therefore call the layer before " + + "querying `scope_name`." + ) + return self._scope.name + + def add_loss(self, losses, inputs=None): + previous_losses_length = len(self._losses) + previous_callable_losses_length = len(self._callable_losses) + super().add_loss(losses, inputs=inputs) + if not tf.executing_eagerly(): + # TODO(fchollet): deprecate collection below. + new_losses = self._losses[previous_losses_length:] + new_callable_losses = self._callable_losses[ + previous_callable_losses_length: + ] + for regularizer in new_callable_losses: + loss_tensor = regularizer() + if loss_tensor is not None: + new_losses.append(loss_tensor) + _add_elements_to_collection( + new_losses, tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + + def _name_scope(self): + """Determines op naming for the Layer.""" + if self._keras_style: + return super()._name_scope() + return self._current_scope.original_name_scope + + def _set_scope(self, scope=None): + if self._scope is None: + # If constructed with _scope=None, lazy setting of scope. 
+ if self._reuse: + with tf.compat.v1.variable_scope( + scope if scope is not None else self._base_name + ) as captured_scope: + self._scope = captured_scope + else: + with tf.compat.v1.variable_scope( + scope, default_name=self._base_name + ) as captured_scope: + self._scope = captured_scope + + def add_weight( + self, + name, + shape, + dtype=None, + initializer=None, + regularizer=None, + trainable=None, + constraint=None, + use_resource=None, + synchronization=tf.VariableSynchronization.AUTO, + aggregation=tf.compat.v1.VariableAggregation.NONE, + partitioner=None, + **kwargs + ): + """Adds a new variable to the layer, or gets an existing one; returns it. + + Args: + name: variable name. + shape: variable shape. + dtype: The type of the variable. Defaults to `self.dtype` or + `float32`. + initializer: initializer instance (callable). + regularizer: regularizer instance (callable). + trainable: whether the variable should be part of the layer's + "trainable_variables" (e.g. variables, biases) + or "non_trainable_variables" (e.g. BatchNorm mean, stddev). + Note, if the current variable scope is marked as non-trainable + then this parameter is ignored and any added variables are also + marked as non-trainable. `trainable` becomes `True` unless + `synchronization` is set to `ON_READ`. Defaults to `True`. + constraint: constraint instance (callable). + use_resource: Whether to use `ResourceVariable`. + synchronization: Indicates when a distributed variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. By default the synchronization is set + to `AUTO` and the current `DistributionStrategy` chooses when to + synchronize. If `synchronization` is set to `ON_READ`, `trainable` + must not be set to `True`. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + partitioner: (optional) partitioner instance (callable). If + provided, when the requested variable is created it will be split + into multiple partitions according to `partitioner`. In this case, + an instance of `PartitionedVariable` is returned. Available + partitioners include `tf.compat.v1.fixed_size_partitioner` and + `tf.compat.v1.variable_axis_size_partitioner`. For more details, + see the documentation of `tf.compat.v1.get_variable` and the + "Variable Partitioners and Sharding" section of the API guide. + **kwargs: Additional keyword arguments. + + Returns: + The created variable. Usually either a `Variable` or + `ResourceVariable` instance. If `partitioner` is not `None`, a + `PartitionedVariable` instance is returned. + + Raises: + RuntimeError: If called with partitioned variable regularization and + eager execution is enabled. + ValueError: When trainable has been set to True with synchronization + set as `ON_READ`.
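The `ValueError` documented in the Raises section is easy to trigger directly; this sketch mirrors `testInvalidTrainableSynchronizationCombination` further down in this diff:

```python
import tensorflow.compat.v2 as tf

layer = tf.compat.v1.layers.Layer(name="my_layer")
try:
    layer.add_weight(
        "v", [2, 2],
        initializer=tf.compat.v1.zeros_initializer(),
        synchronization=tf.VariableSynchronization.ON_READ,
        trainable=True)              # ON_READ implies non-trainable
except ValueError as e:
    print(e)                         # "Synchronization value can be set to ..."
```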
+ """ + for kwarg in kwargs: + if kwarg != "experimental_autocast": + raise TypeError("Unknown keyword argument:", kwarg) + if self._keras_style: + return super().add_weight( + name=name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + trainable=trainable and self.trainable, + constraint=constraint, + use_resource=use_resource, + synchronization=tf.VariableSynchronization.AUTO, + aggregation=tf.compat.v1.VariableAggregation.NONE, + partitioner=partitioner, + **kwargs + ) + + if synchronization == tf.VariableSynchronization.ON_READ: + if trainable: + raise ValueError( + "Synchronization value can be set to " + "VariableSynchronization.ON_READ only for non-trainable " + "variables. You have specified trainable=True and " + "synchronization=VariableSynchronization.ON_READ." + ) + else: + # Set trainable to be false when variable is to be synced on + # read. + trainable = False + elif trainable is None: + trainable = True + + def _should_add_regularizer(variable, existing_variable_set): + if base_layer_utils.is_split_variable(variable): + for var in variable: + if var in existing_variable_set: + return False + return True + else: + return variable not in existing_variable_set + + init_graph = None + if not tf.executing_eagerly(): + default_graph = tf.compat.v1.get_default_graph() + if default_graph.building_function: + with tf.init_scope(): + # Retrieve the variables from the graph into which variables + # will be lifted; if initialization ops will be lifted into + # the eager context, then there is nothing to retrieve, + # since variable collections are not supported when eager + # execution is enabled. + if not tf.executing_eagerly(): + init_graph = tf.compat.v1.get_default_graph() + existing_variables = set( + tf.compat.v1.global_variables() + ) + else: + # Initialization ops will not be lifted out of the default + # graph. + init_graph = default_graph + existing_variables = set(tf.compat.v1.global_variables()) + + if dtype is None: + dtype = self.dtype or tf.float32 + + self._set_scope(None) + reuse = self.built or self._reuse + prev_len_trainable = len(self._trainable_weights) + with tf.compat.v1.variable_scope( + self._scope, reuse=reuse, auxiliary_name_scope=False + ) as scope: + self._current_scope = scope + with backend.name_scope(self._name_scope()): + use_resource = ( + use_resource + or self._use_resource_variables + or scope.use_resource + ) + if initializer is None: + initializer = scope.initializer + variable = super().add_weight( + name, + shape, + dtype=tf.as_dtype(dtype), + initializer=initializer, + trainable=trainable and self.trainable, + constraint=constraint, + partitioner=partitioner, + use_resource=use_resource, + synchronization=synchronization, + aggregation=aggregation, + getter=tf.compat.v1.get_variable, + **kwargs + ) + + if regularizer: + if ( + tf.compat.v1.executing_eagerly_outside_functions() + or _should_add_regularizer(variable, existing_variables) + ): + self._handle_weight_regularization( + name, variable, regularizer + ) + var_store = vs._get_default_variable_store() + # When the shim to get variable scope working in TF2 is + # used, We need to explicitly make the shim track the + # regularization losses as the collections will not be + # accessible. + if hasattr(var_store, "add_regularizer"): + var_store.add_regularizer(variable, regularizer) + + if init_graph is not None: + # Handle edge case where a custom getter has overridden + # `trainable`. 
There is one known occurrence of this, in + # unit test testBasicRNNCellNotTrainable in + # contrib.rnn.python.kernel_tests.core_rnn_cell_test + with init_graph.as_default(): + trainable_variables = tf.compat.v1.trainable_variables() + if ( + trainable + and self.trainable + and variable not in trainable_variables + ): + # A custom getter / variable scope overrode the + # trainable flag. + extra_trainable_vars = self._trainable_weights[ + prev_len_trainable: + ] + self._trainable_weights = self._trainable_weights[ + :prev_len_trainable + ] + self._non_trainable_weights += extra_trainable_vars + return variable + + def __call__(self, inputs, *args, **kwargs): + """Wraps `call`, applying pre- and post-processing steps. + + Args: + inputs: input tensor(s). + *args: additional positional arguments to be passed to `self.call`. + **kwargs: additional keyword arguments to be passed to `self.call`. + **Note**: kwarg `scope` is reserved for use by the layer. + + Returns: + Output tensor(s). + + Note: + - If the layer's `call` method takes a `scope` keyword argument, this + argument will be automatically set to the current variable scope. + - If the layer's `call` method takes a `mask` argument (as some Keras + layers do), its default value will be set to the mask generated + for `inputs` by the previous layer (if `inputs` did come from + a layer that generated a corresponding mask, i.e. if it came from + a Keras layer with masking support). + + Raises: + ValueError: if the layer's `call` method returns None (an invalid + value). + """ + scope = kwargs.pop("scope", None) + + if self._keras_style: + if scope is not None: + raise ValueError( + "scope argument not allowed when keras style layers are " + "enabled, but saw: {}".format(scope) + ) + return super().__call__(inputs, *args, **kwargs) + + self._set_scope(scope) + + if self.built: + try: + # Some classes which inherit from Layer do not use its + # constructor, so rather than initializing to None we check for + # an AttributeError. + scope_context_manager = self._always_reuse_variable_scope + except AttributeError: + scope_context_manager = None + + if scope_context_manager is None: + # From this point we will always set reuse=True, so create a + # "final" variable scope with this setting. We avoid re-creating + # variable scopes after this point as an optimization. + scope_context_manager = tf.compat.v1.variable_scope( + self._scope, reuse=True, auxiliary_name_scope=False + ) + + # Do not cache variable scopes if Eager mode is enabled. If + # Eager mode is enabled then we don't want to reuse scopes + # because the cached scope might be from a FuncGraph or Eager + # scope we are no longer in. + if not tf.compat.v1.executing_eagerly_outside_functions(): + self._always_reuse_variable_scope = scope_context_manager + else: + scope_context_manager = tf.compat.v1.variable_scope( + self._scope, reuse=self._reuse, auxiliary_name_scope=False + ) + + with scope_context_manager as scope: + self._current_scope = scope + + try: + call_has_scope_arg = self._call_has_scope_arg + except AttributeError: + self._call_spec.arg_names = variable_scope_shim.fn_args( + self.call + ) + self._call_has_scope_arg = "scope" in self._call_spec.arg_names + call_has_scope_arg = self._call_has_scope_arg + if call_has_scope_arg: + kwargs["scope"] = scope + + # Actually call layer + outputs = super().__call__(inputs, *args, **kwargs) + + if not tf.executing_eagerly(): + # Update global default collections.
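The collection update below is what keeps graph-mode idioms such as fetching `UPDATE_OPS` working for legacy layers; a sketch of the classic batch-norm pattern (graph mode only, since collections are unsupported in eager):

```python
import tensorflow.compat.v2 as tf

with tf.Graph().as_default():
    x = tf.compat.v1.placeholder(tf.float32, (None, 4))
    bn = tf.compat.v1.layers.BatchNormalization()
    bn(x, training=True)
    # The moving-average updates were mirrored into the legacy collection
    # and must be run alongside the train op.
    updates = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    assert updates
```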
+ _add_elements_to_collection( + self.updates, tf.compat.v1.GraphKeys.UPDATE_OPS + ) + return outputs + + def __deepcopy__(self, memo): + no_copy = set(["_graph", "_thread_local", "_metrics_lock"]) + shallow_copy = set(["_scope", "_always_reuse_variable_scope"]) + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + if k in no_copy: + setattr(result, k, v) + elif k in shallow_copy: + setattr(result, k, copy.copy(v)) + elif base_layer.is_tensor_or_tensor_list(v): + setattr(result, k, v) + else: + setattr(result, k, copy.deepcopy(v, memo)) + return result + + def __setattr__(self, value, name): + # By-pass the automatic dependency tracking performed by the parent + # Layer. + super(tf.__internal__.tracking.Trackable, self).__setattr__(value, name) + + @property + def _is_legacy_layer(self): + """Used by keras to check compatibility. This should not be + overridden.""" + return True def _add_elements_to_collection(elements, collection_list): - if tf.executing_eagerly(): - raise RuntimeError('Using collections from Layers not supported in Eager ' - 'mode. Tried to add %s to %s' % (elements, - collection_list)) - elements = tf.nest.flatten(elements) - collection_list = tf.nest.flatten(collection_list) - for name in collection_list: - collection = tf.compat.v1.get_collection_ref(name) - collection_set = {id(e) for e in collection} - for element in elements: - if id(element) not in collection_set: - collection.append(element) + if tf.executing_eagerly(): + raise RuntimeError( + "Using collections from Layers not supported in Eager " + "mode. Tried to add %s to %s" % (elements, collection_list) + ) + elements = tf.nest.flatten(elements) + collection_list = tf.nest.flatten(collection_list) + for name in collection_list: + collection = tf.compat.v1.get_collection_ref(name) + collection_set = {id(e) for e in collection} + for element in elements: + if id(element) not in collection_set: + collection.append(element) diff --git a/keras/legacy_tf_layers/base_test.py b/keras/legacy_tf_layers/base_test.py index a03e98c74631..e71403e8c680 100644 --- a/keras/legacy_tf_layers/base_test.py +++ b/keras/legacy_tf_layers/base_test.py @@ -18,694 +18,719 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import copy -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras import backend -from keras.testing_infra import test_combinations from keras.engine import base_layer as keras_base_layer from keras.engine import input_spec from keras.legacy_tf_layers import base as base_tf_layers from keras.legacy_tf_layers import core as core_tf_layers +from keras.testing_infra import test_combinations class BaseLayerTest(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testLayerProperties(self): - layer = base_tf_layers.Layer(name='my_layer') - self.assertEqual(layer.variables, []) - self.assertEqual(layer.trainable_variables, []) - self.assertEqual(layer.non_trainable_variables, []) - if not tf.executing_eagerly(): - # updates, losses only supported in GRAPH mode - self.assertEqual(layer.updates, []) - self.assertEqual(layer.losses, []) - self.assertEqual(layer.built, False) - layer = base_tf_layers.Layer(name='my_layer', trainable=False) - self.assertEqual(layer.trainable, False) - - # Assert that the layer was not instrumented as a 
Keras layer - self.assertFalse(layer._instrumented_keras_api) - - # Assert this was instrumented as a legacy layer - self.assertTrue( - keras_base_layer.keras_api_gauge.get_cell('legacy_layer').value()) - keras_base_layer.keras_api_gauge.get_cell('legacy_layer').set(False) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInt64Layer(self): - layer = base_tf_layers.Layer(name='my_layer', dtype='int64') - layer.add_weight('my_var', [2, 2]) - self.assertEqual(layer.name, 'my_layer') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testKerasStyleAddWeight(self): - keras_layer = keras_base_layer.Layer(name='keras_layer') - with backend.name_scope('foo'): - keras_variable = keras_layer.add_weight( - 'my_var', [2, 2], initializer=tf.compat.v1.zeros_initializer()) - self.assertEqual(keras_variable.name, 'foo/my_var:0') - - with backend.name_scope('baz'): - old_style_layer = base_tf_layers.Layer(name='my_layer') - # Test basic variable creation. - variable = old_style_layer.add_weight( - 'my_var', [2, 2], initializer=tf.compat.v1.zeros_initializer()) - self.assertEqual(variable.name, 'my_layer/my_var:0') - - with base_tf_layers.keras_style_scope(): - layer = base_tf_layers.Layer(name='my_layer') - # Assert that the layer was not instrumented as a Keras layer - self.assertFalse(layer._instrumented_keras_api) - # Test basic variable creation. - with backend.name_scope('bar'): - variable = layer.add_weight( - 'my_var', [2, 2], initializer=tf.compat.v1.zeros_initializer()) - self.assertEqual(variable.name, 'bar/my_var:0') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testAddWeight(self): - layer = base_tf_layers.Layer(name='my_layer') - - # Test basic variable creation. - variable = layer.add_weight( - 'my_var', [2, 2], initializer=tf.compat.v1.zeros_initializer()) - self.assertEqual(variable.name, 'my_layer/my_var:0') - self.assertEqual(layer.variables, [variable]) - self.assertEqual(layer.trainable_variables, [variable]) - self.assertEqual(layer.non_trainable_variables, []) - if not tf.executing_eagerly(): - self.assertEqual( - layer.variables, - tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)) - - # Test non-trainable variable creation. - # layer.add_variable should work even outside `build` and `call`. - variable_2 = layer.add_weight( - 'non_trainable_var', [2, 2], - initializer=tf.compat.v1.zeros_initializer(), - trainable=False) - self.assertEqual(layer.variables, [variable, variable_2]) - self.assertEqual(layer.trainable_variables, [variable]) - self.assertEqual(layer.non_trainable_variables, [variable_2]) - - if not tf.executing_eagerly(): - self.assertEqual( - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)), 1) - - regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - _ = layer.add_weight( - 'reg_var', [2, 2], - initializer=tf.compat.v1.zeros_initializer(), - regularizer=regularizer) - self.assertEqual(len(layer.losses), 1) - - added_variable = [False] - - # Test that sync `ON_READ` variables are defaulted to be non-trainable. 
- variable_3 = layer.add_weight( - 'sync_on_read_var', [2, 2], - initializer=tf.compat.v1.zeros_initializer(), - synchronization=tf.VariableSynchronization.ON_READ, - aggregation=tf.compat.v1.VariableAggregation.SUM) - self.assertEqual(layer.non_trainable_variables, [variable_2, variable_3]) - - @tf.function - def function_adds_weight(): - if not added_variable[0]: - layer.add_weight( - 'reg_var_from_function', [2, 2], + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testLayerProperties(self): + layer = base_tf_layers.Layer(name="my_layer") + self.assertEqual(layer.variables, []) + self.assertEqual(layer.trainable_variables, []) + self.assertEqual(layer.non_trainable_variables, []) + if not tf.executing_eagerly(): + # updates, losses only supported in GRAPH mode + self.assertEqual(layer.updates, []) + self.assertEqual(layer.losses, []) + self.assertEqual(layer.built, False) + layer = base_tf_layers.Layer(name="my_layer", trainable=False) + self.assertEqual(layer.trainable, False) + + # Assert that the layer was not instrumented as a Keras layer + self.assertFalse(layer._instrumented_keras_api) + + # Assert this was instrumented as a legacy layer + self.assertTrue( + keras_base_layer.keras_api_gauge.get_cell("legacy_layer").value() + ) + keras_base_layer.keras_api_gauge.get_cell("legacy_layer").set(False) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInt64Layer(self): + layer = base_tf_layers.Layer(name="my_layer", dtype="int64") + layer.add_weight("my_var", [2, 2]) + self.assertEqual(layer.name, "my_layer") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testKerasStyleAddWeight(self): + keras_layer = keras_base_layer.Layer(name="keras_layer") + with backend.name_scope("foo"): + keras_variable = keras_layer.add_weight( + "my_var", [2, 2], initializer=tf.compat.v1.zeros_initializer() + ) + self.assertEqual(keras_variable.name, "foo/my_var:0") + + with backend.name_scope("baz"): + old_style_layer = base_tf_layers.Layer(name="my_layer") + # Test basic variable creation. + variable = old_style_layer.add_weight( + "my_var", [2, 2], initializer=tf.compat.v1.zeros_initializer() + ) + self.assertEqual(variable.name, "my_layer/my_var:0") + + with base_tf_layers.keras_style_scope(): + layer = base_tf_layers.Layer(name="my_layer") + # Assert that the layer was not instrumented as a Keras layer + self.assertFalse(layer._instrumented_keras_api) + # Test basic variable creation. + with backend.name_scope("bar"): + variable = layer.add_weight( + "my_var", [2, 2], initializer=tf.compat.v1.zeros_initializer() + ) + self.assertEqual(variable.name, "bar/my_var:0") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testAddWeight(self): + layer = base_tf_layers.Layer(name="my_layer") + + # Test basic variable creation. + variable = layer.add_weight( + "my_var", [2, 2], initializer=tf.compat.v1.zeros_initializer() + ) + self.assertEqual(variable.name, "my_layer/my_var:0") + self.assertEqual(layer.variables, [variable]) + self.assertEqual(layer.trainable_variables, [variable]) + self.assertEqual(layer.non_trainable_variables, []) + if not tf.executing_eagerly(): + self.assertEqual( + layer.variables, + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ), + ) + + # Test non-trainable variable creation. + # layer.add_variable should work even outside `build` and `call`. 
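Before the test resumes below, the same `add_weight` contract distilled outside the test harness (variables are routed to the trainable or non-trainable bucket at creation time):

```python
import tensorflow.compat.v2 as tf

layer = tf.compat.v1.layers.Layer(name="my_layer")
w = layer.add_weight(
    "w", [2, 2], initializer=tf.compat.v1.zeros_initializer())
u = layer.add_weight(
    "u", [2, 2], initializer=tf.compat.v1.zeros_initializer(),
    trainable=False)
assert layer.trainable_variables == [w]
assert layer.non_trainable_variables == [u]
```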
+ variable_2 = layer.add_weight( + "non_trainable_var", + [2, 2], initializer=tf.compat.v1.zeros_initializer(), - regularizer=regularizer) - added_variable[0] = True - - function_adds_weight() - self.assertEqual(len(layer.losses), 2) - - def testInvalidTrainableSynchronizationCombination(self): - layer = base_tf_layers.Layer(name='my_layer') - - with self.assertRaisesRegex( - ValueError, 'Synchronization value can be set to ' - 'VariableSynchronization.ON_READ only for non-trainable variables. ' - 'You have specified trainable=True and ' - 'synchronization=VariableSynchronization.ON_READ.'): - _ = layer.add_weight( - 'v', [2, 2], - initializer=tf.compat.v1.zeros_initializer(), - synchronization=tf.VariableSynchronization.ON_READ, - trainable=True) - - def testReusePartitionedVariablesAndRegularizers(self): - with tf.Graph().as_default(): - regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - partitioner = tf.compat.v1.fixed_size_partitioner(3) - for reuse in [False, True]: - with tf.compat.v1.variable_scope( - tf.compat.v1.get_variable_scope(), - partitioner=partitioner, - reuse=reuse): - layer = base_tf_layers.Layer(name='my_layer') - _ = layer.add_weight( - 'reg_part_var', [4, 4], - initializer=tf.compat.v1.zeros_initializer(), - regularizer=regularizer) - self.assertEqual( - len(tf.compat.v1.get_collection( - tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)), 3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testCall(self): - - class MyLayer(base_tf_layers.Layer): - - def call(self, inputs): - return tf.square(inputs) - - layer = MyLayer(name='my_layer') - inputs = tf.random.uniform((5,), seed=1) - outputs = layer(inputs) - self.assertEqual(layer.built, True) - if not tf.executing_eagerly(): - # op is only supported in GRAPH mode - self.assertEqual(outputs.op.name, 'my_layer/Square') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testDeepCopy(self): - - class MyLayer(base_tf_layers.Layer): - - def call(self, inputs): - return tf.square(inputs) - - layer = MyLayer(name='my_layer') - layer._private_tensor = tf.random.uniform(()) - inputs = tf.random.uniform((5,), seed=1) - outputs = layer(inputs) - self.assertEqual(layer.built, True) - if not tf.executing_eagerly(): - # op only supported in GRAPH mode. - self.assertEqual(outputs.op.name, 'my_layer/Square') - - layer_copy = copy.deepcopy(layer) - self.assertEqual(layer_copy.name, layer.name) - self.assertEqual(layer_copy._scope.name, layer._scope.name) - self.assertEqual(layer_copy._private_tensor, layer._private_tensor) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testScopeNaming(self): - - class PrivateLayer(base_tf_layers.Layer): - - def call(self, inputs): - return inputs - - inputs = tf.random.uniform((5,)) - default_layer = PrivateLayer() - _ = default_layer(inputs) - self.assertEqual(default_layer._scope.name, 'private_layer') - default_layer1 = PrivateLayer() - default_layer1(inputs) - self.assertEqual(default_layer1._scope.name, 'private_layer_1') - my_layer = PrivateLayer(name='my_layer') - my_layer(inputs) - self.assertEqual(my_layer._scope.name, 'my_layer') - my_layer1 = PrivateLayer(name='my_layer') - my_layer1(inputs) - self.assertEqual(my_layer1._scope.name, 'my_layer_1') - my_layer2 = PrivateLayer(name='my_layer') - my_layer2(inputs) - self.assertEqual(my_layer2._scope.name, 'my_layer_2') - # Name scope shouldn't affect names. 
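The scope names asserted in this test come from `tf.compat.v1.variable_scope` uniquifying its `default_name`, a mechanism independent of any surrounding name scope; roughly:

```python
import tensorflow.compat.v2 as tf

with tf.Graph().as_default():
    with tf.compat.v1.name_scope("some_name_scope"):
        with tf.compat.v1.variable_scope(None, default_name="my_layer") as vs:
            print(vs.name)   # 'my_layer': the name scope does not leak in
        with tf.compat.v1.variable_scope(None, default_name="my_layer") as vs:
            print(vs.name)   # 'my_layer_1': default_name is uniquified
```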
- with backend.name_scope('some_name_scope'): - default_layer2 = PrivateLayer() - default_layer2(inputs) - self.assertEqual(default_layer2._scope.name, 'private_layer_2') - my_layer3 = PrivateLayer(name='my_layer') - my_layer3(inputs) - self.assertEqual(my_layer3._scope.name, 'my_layer_3') - other_layer = PrivateLayer(name='other_layer') - other_layer(inputs) - self.assertEqual(other_layer._scope.name, 'other_layer') - # Variable scope gets added to scope names. - with tf.compat.v1.variable_scope('var_scope'): - default_layer_scoped = PrivateLayer() - default_layer_scoped(inputs) - self.assertEqual(default_layer_scoped._scope.name, - 'var_scope/private_layer') - my_layer_scoped = PrivateLayer(name='my_layer') - my_layer_scoped(inputs) - self.assertEqual(my_layer_scoped._scope.name, 'var_scope/my_layer') - my_layer_scoped1 = PrivateLayer(name='my_layer') - my_layer_scoped1(inputs) - self.assertEqual(my_layer_scoped1._scope.name, 'var_scope/my_layer_1') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInputSpecNdimCheck(self): - - class CustomerLayer(base_tf_layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = input_spec.InputSpec(ndim=2) - - def call(self, inputs): - return inputs - - layer = CustomerLayer() - with self.assertRaisesRegex(ValueError, r'expected ndim=2'): - layer(tf.constant([1])) - - # Note that we re-create the layer since in Eager mode, input spec checks - # only happen on first call. - # Works - layer = CustomerLayer() - layer(tf.constant([[1], [2]])) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInputSpecMinNdimCheck(self): - - class CustomLayer(base_tf_layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = input_spec.InputSpec(min_ndim=2) - - def call(self, inputs): - return inputs - - layer = CustomLayer() - with self.assertRaisesRegex(ValueError, r'expected min_ndim=2'): - layer(tf.constant([1])) - - # Works - layer = CustomLayer() - layer(tf.constant([[1], [2]])) - - layer = CustomLayer() - layer(tf.constant([[[1], [2]]])) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInputSpecMaxNdimCheck(self): - - class CustomerLayer(base_tf_layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = input_spec.InputSpec(max_ndim=2) - - def call(self, inputs): - return inputs - - layer = CustomerLayer() - with self.assertRaisesRegex(ValueError, r'expected max_ndim=2'): - layer(tf.constant([[[1], [2]]])) - - # Works - layer = CustomerLayer() - layer(tf.constant([1])) - - layer = CustomerLayer() - layer(tf.constant([[1], [2]])) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInputSpecDtypeCheck(self): - - class CustomerLayer(base_tf_layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = input_spec.InputSpec(dtype='float32') - - def call(self, inputs): - return inputs - - layer = CustomerLayer() - with self.assertRaisesRegex(ValueError, r'expected dtype=float32'): - layer(tf.constant(1, dtype=tf.int32)) - - # Works - layer = CustomerLayer() - layer(tf.constant(1.0, dtype=tf.float32)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInputSpecAxesCheck(self): - - class CustomerLayer(base_tf_layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = input_spec.InputSpec(axes={-1: 2}) - - def call(self, inputs): - return inputs - 
- layer = CustomerLayer() - with self.assertRaisesRegex(ValueError, r'expected axis'): - layer(tf.constant([1, 2, 3])) - - # Works - layer = CustomerLayer() - layer(tf.constant([1, 2])) - layer = CustomerLayer() - layer(tf.constant([[1, 2], [3, 4], [5, 6]])) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInputSpecShapeCheck(self): - - class CustomerLayer(base_tf_layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = input_spec.InputSpec(shape=(None, 3)) - - def call(self, inputs): - return inputs - - layer = CustomerLayer() - with self.assertRaisesRegex(ValueError, r'expected shape'): - layer(tf.constant([[1, 2]])) - - # Works - layer = CustomerLayer() - layer(tf.constant([[1, 2, 3], [4, 5, 6]])) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoInputSpec(self): - - class CustomerLayer(base_tf_layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = None - - def call(self, inputs): - return inputs + trainable=False, + ) + self.assertEqual(layer.variables, [variable, variable_2]) + self.assertEqual(layer.trainable_variables, [variable]) + self.assertEqual(layer.non_trainable_variables, [variable_2]) + + if not tf.executing_eagerly(): + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ) + ), + 1, + ) + + regularizer = lambda x: tf.reduce_sum(x) * 1e-3 + _ = layer.add_weight( + "reg_var", + [2, 2], + initializer=tf.compat.v1.zeros_initializer(), + regularizer=regularizer, + ) + self.assertEqual(len(layer.losses), 1) - layer = CustomerLayer() - - layer(tf.constant(1)) - - # Works - if not tf.executing_eagerly(): - layer(tf.compat.v1.placeholder('int32')) - layer(tf.compat.v1.placeholder('int32', shape=(2, 3))) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_count_params(self): - dense = core_tf_layers.Dense(16) - dense.build((None, 4)) - self.assertEqual(dense.count_params(), 16 * 4 + 16) - - dense = core_tf_layers.Dense(16) - with self.assertRaises(ValueError): - dense.count_params() - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testDictInputOutput(self): - - class DictLayer(base_tf_layers.Layer): - - def call(self, inputs): - return {'l' + key: inputs[key] for key in inputs} - - layer = DictLayer() - if tf.executing_eagerly(): - i1 = tf.constant(3) - i2 = tf.constant(4.0) - result = layer({'abel': i1, 'ogits': i2}) - self.assertTrue(isinstance(result, dict)) - self.assertEqual(set(['label', 'logits']), set(result.keys())) - self.assertEqual(3, result['label'].numpy()) - self.assertEqual(4.0, result['logits'].numpy()) - else: - i1 = tf.compat.v1.placeholder('int32') - i2 = tf.compat.v1.placeholder('float32') - result = layer({'abel': i1, 'ogits': i2}) - self.assertTrue(isinstance(result, dict)) - self.assertEqual(set(['label', 'logits']), set(result.keys())) - - def testActivityRegularizer(self): - with tf.Graph().as_default(): - regularizer = tf.reduce_sum - layer = base_tf_layers.Layer(activity_regularizer=regularizer) - x = tf.compat.v1.placeholder('int32') - layer(x) - self.assertEqual(len(layer.get_losses_for(x)), 1) - - def testNameScopeIsConsistentWithVariableScope(self): - # Github issue 13429. 
- - class MyLayer(base_tf_layers.Layer): - - def build(self, input_shape): - self.my_var = self.add_weight('my_var', (), tf.float32) - self.built = True - - def call(self, inputs): - return tf.multiply(inputs, self.my_var, name='my_op') - - def _gen_layer(x, name=None): - layer = MyLayer(name=name) - out = layer(x) - return layer, out - - # unnamed layer - with tf.Graph().as_default(): - x = tf.compat.v1.placeholder(tf.float32, (), 'x') - layer, op = _gen_layer(x) - layer1, op1 = _gen_layer(op) - layer2, op2 = _gen_layer(op1) - - self.assertEqual(layer.my_var.name, 'my_layer/my_var:0') - self.assertEqual(op.name, 'my_layer/my_op:0') - self.assertEqual(layer1.my_var.name, 'my_layer_1/my_var:0') - self.assertEqual(op1.name, 'my_layer_1/my_op:0') - self.assertEqual(layer2.my_var.name, 'my_layer_2/my_var:0') - self.assertEqual(op2.name, 'my_layer_2/my_op:0') - # name starts from zero - with tf.Graph().as_default(): - x = tf.compat.v1.placeholder(tf.float32, (), 'x') - layer, op = _gen_layer(x, name='name') - layer1, op1 = _gen_layer(op, name='name_1') - layer2, op2 = _gen_layer(op1, name='name_2') - - self.assertEqual(layer.my_var.name, 'name/my_var:0') - self.assertEqual(op.name, 'name/my_op:0') - self.assertEqual(layer1.my_var.name, 'name_1/my_var:0') - self.assertEqual(op1.name, 'name_1/my_op:0') - self.assertEqual(layer2.my_var.name, 'name_2/my_var:0') - self.assertEqual(op2.name, 'name_2/my_op:0') - # name starts from one - with tf.Graph().as_default(): - x = tf.compat.v1.placeholder(tf.float32, (), 'x') - layer, op = _gen_layer(x, name='name_1') - layer1, op1 = _gen_layer(op, name='name_2') - layer2, op2 = _gen_layer(op1, name='name_3') - - self.assertEqual(layer.my_var.name, 'name_1/my_var:0') - self.assertEqual(op.name, 'name_1/my_op:0') - self.assertEqual(layer1.my_var.name, 'name_2/my_var:0') - self.assertEqual(op1.name, 'name_2/my_op:0') - self.assertEqual(layer2.my_var.name, 'name_3/my_var:0') - self.assertEqual(op2.name, 'name_3/my_op:0') - - def testVariablesAreLiftedFromFunctionBuildingGraphs(self): - class MyLayer(base_tf_layers.Layer): - - def build(self, input_shape): - self.my_var = self.add_weight('my_var', (), tf.float32) - self.built = True - - def call(self, inputs): - return inputs + added_variable = [False] - outer_graph = tf.compat.v1.get_default_graph() - function_building_graph = tf.Graph() - function_building_graph._building_function = True - with outer_graph.as_default(): - with function_building_graph.as_default(): - layer = MyLayer() - # Create a variable by invoking build through __call__ and assert that - # it is both tracked and lifted into the outer graph. 
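The lifting asserted in this test relies on `tf.init_scope` escaping function-building graphs, the same mechanism `add_weight` uses in base.py above; a rough graph-mode sketch (`_building_function` is poked directly, exactly as the test does, purely for illustration):

```python
import tensorflow.compat.v2 as tf

with tf.Graph().as_default() as outer_graph:
    fn_graph = tf.Graph()
    fn_graph._building_function = True   # pretend this is a FuncGraph
    with fn_graph.as_default():
        with tf.init_scope():
            # init_scope exits function-building graphs, so variable
            # initialization lands in the outer graph.
            assert tf.compat.v1.get_default_graph() is outer_graph
```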
- inputs = tf.compat.v1.placeholder(tf.float32, (), 'inputs') - layer(inputs) - self.assertEqual(len(layer.variables), 1) - self.assertEqual(len(layer.trainable_variables), 1) - self.assertEqual(layer.variables[0].graph, outer_graph) - - def testGetUpdateFor(self): - - class MyLayer(base_tf_layers.Layer): - - def build(self, input_shape): - self.a = self.add_weight('a', - (), - tf.float32, - trainable=False) - self.b = self.add_weight('b', - (), - tf.float32, - trainable=False) - self.add_update(tf.compat.v1.assign_add(self.a, 1., name='b_update')) - self.built = True - - def call(self, inputs): - self.add_update( - tf.compat.v1.assign_add(self.a, inputs, name='a_update')) - return inputs + 1 - - with tf.Graph().as_default(): - layer = MyLayer() - inputs = tf.compat.v1.placeholder(tf.float32, (), 'inputs') - intermediate_inputs = inputs + 1 - outputs = layer(intermediate_inputs) - - self.assertEqual(len(layer.updates), 2) - self.assertEqual(len(layer.get_updates_for(None)), 1) - self.assertEqual(len(layer.get_updates_for([inputs])), 1) - self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1) - self.assertEqual(len(layer.get_updates_for([outputs])), 0) - - # Call same layer on new input, creating one more conditional update - inputs = tf.compat.v1.placeholder(tf.float32, (), 'inputs') - intermediate_inputs = inputs + 1 - outputs = layer(intermediate_inputs) - - self.assertEqual(len(layer.updates), 3) - self.assertEqual(len(layer.get_updates_for(None)), 1) - # Check that we are successfully filtering out irrelevant updates - self.assertEqual(len(layer.get_updates_for([inputs])), 1) - self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1) - self.assertEqual(len(layer.get_updates_for([outputs])), 0) - - def testGetLossesFor(self): - - class MyLayer(base_tf_layers.Layer): - - def build(self, input_shape): - self.a = self.add_weight('a', - (), - tf.float32, - trainable=False) - self.b = self.add_weight('b', - (), - tf.float32, - trainable=False) - self.add_loss(self.a) - self.built = True - - def call(self, inputs): - self.add_loss(inputs, inputs=True) - return inputs + 1 - - with tf.Graph().as_default(): - layer = MyLayer() - inputs = tf.compat.v1.placeholder(tf.float32, (), 'inputs') - intermediate_inputs = inputs + 1 - outputs = layer(intermediate_inputs) - - self.assertEqual(len(layer.losses), 2) - self.assertEqual(len(layer.get_losses_for(None)), 1) - self.assertEqual(len(layer.get_losses_for([inputs])), 1) - self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1) - self.assertEqual(len(layer.get_losses_for([outputs])), 0) - - # Call same layer on new input, creating one more conditional loss - inputs = tf.compat.v1.placeholder(tf.float32, (), 'inputs') - intermediate_inputs = inputs + 1 - outputs = layer(intermediate_inputs) - - self.assertEqual(len(layer.losses), 3) - self.assertEqual(len(layer.get_losses_for(None)), 1) - # Check that we are successfully filtering out irrelevant losses - self.assertEqual(len(layer.get_losses_for([inputs])), 1) - self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1) - self.assertEqual(len(layer.get_losses_for([outputs])), 0) + # Test that sync `ON_READ` variables are defaulted to be non-trainable. 
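As the comment says, `ON_READ` variables default to non-trainable (see the `trainable = False` branch in `add_weight` above); standalone, the behavior checked by the next few assertions looks like:

```python
import tensorflow.compat.v2 as tf

layer = tf.compat.v1.layers.Layer(name="metrics_holder")
v = layer.add_weight(
    "total", [2],
    initializer=tf.compat.v1.zeros_initializer(),
    synchronization=tf.VariableSynchronization.ON_READ,
    aggregation=tf.compat.v1.VariableAggregation.SUM)
assert v in layer.non_trainable_variables   # trainable defaulted to False
```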
+ variable_3 = layer.add_weight( + "sync_on_read_var", + [2, 2], + initializer=tf.compat.v1.zeros_initializer(), + synchronization=tf.VariableSynchronization.ON_READ, + aggregation=tf.compat.v1.VariableAggregation.SUM, + ) + self.assertEqual( + layer.non_trainable_variables, [variable_2, variable_3] + ) + + @tf.function + def function_adds_weight(): + if not added_variable[0]: + layer.add_weight( + "reg_var_from_function", + [2, 2], + initializer=tf.compat.v1.zeros_initializer(), + regularizer=regularizer, + ) + added_variable[0] = True + + function_adds_weight() + self.assertEqual(len(layer.losses), 2) + + def testInvalidTrainableSynchronizationCombination(self): + layer = base_tf_layers.Layer(name="my_layer") + + with self.assertRaisesRegex( + ValueError, + "Synchronization value can be set to " + "VariableSynchronization.ON_READ only for non-trainable variables. " + "You have specified trainable=True and " + "synchronization=VariableSynchronization.ON_READ.", + ): + _ = layer.add_weight( + "v", + [2, 2], + initializer=tf.compat.v1.zeros_initializer(), + synchronization=tf.VariableSynchronization.ON_READ, + trainable=True, + ) + + def testReusePartitionedVariablesAndRegularizers(self): + with tf.Graph().as_default(): + regularizer = lambda x: tf.reduce_sum(x) * 1e-3 + partitioner = tf.compat.v1.fixed_size_partitioner(3) + for reuse in [False, True]: + with tf.compat.v1.variable_scope( + tf.compat.v1.get_variable_scope(), + partitioner=partitioner, + reuse=reuse, + ): + layer = base_tf_layers.Layer(name="my_layer") + _ = layer.add_weight( + "reg_part_var", + [4, 4], + initializer=tf.compat.v1.zeros_initializer(), + regularizer=regularizer, + ) + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + ), + 3, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testCall(self): + class MyLayer(base_tf_layers.Layer): + def call(self, inputs): + return tf.square(inputs) + + layer = MyLayer(name="my_layer") + inputs = tf.random.uniform((5,), seed=1) + outputs = layer(inputs) + self.assertEqual(layer.built, True) + if not tf.executing_eagerly(): + # op is only supported in GRAPH mode + self.assertEqual(outputs.op.name, "my_layer/Square") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDeepCopy(self): + class MyLayer(base_tf_layers.Layer): + def call(self, inputs): + return tf.square(inputs) + + layer = MyLayer(name="my_layer") + layer._private_tensor = tf.random.uniform(()) + inputs = tf.random.uniform((5,), seed=1) + outputs = layer(inputs) + self.assertEqual(layer.built, True) + if not tf.executing_eagerly(): + # op only supported in GRAPH mode. 
+ self.assertEqual(outputs.op.name, "my_layer/Square") + + layer_copy = copy.deepcopy(layer) + self.assertEqual(layer_copy.name, layer.name) + self.assertEqual(layer_copy._scope.name, layer._scope.name) + self.assertEqual(layer_copy._private_tensor, layer._private_tensor) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testScopeNaming(self): + class PrivateLayer(base_tf_layers.Layer): + def call(self, inputs): + return inputs + + inputs = tf.random.uniform((5,)) + default_layer = PrivateLayer() + _ = default_layer(inputs) + self.assertEqual(default_layer._scope.name, "private_layer") + default_layer1 = PrivateLayer() + default_layer1(inputs) + self.assertEqual(default_layer1._scope.name, "private_layer_1") + my_layer = PrivateLayer(name="my_layer") + my_layer(inputs) + self.assertEqual(my_layer._scope.name, "my_layer") + my_layer1 = PrivateLayer(name="my_layer") + my_layer1(inputs) + self.assertEqual(my_layer1._scope.name, "my_layer_1") + my_layer2 = PrivateLayer(name="my_layer") + my_layer2(inputs) + self.assertEqual(my_layer2._scope.name, "my_layer_2") + # Name scope shouldn't affect names. + with backend.name_scope("some_name_scope"): + default_layer2 = PrivateLayer() + default_layer2(inputs) + self.assertEqual(default_layer2._scope.name, "private_layer_2") + my_layer3 = PrivateLayer(name="my_layer") + my_layer3(inputs) + self.assertEqual(my_layer3._scope.name, "my_layer_3") + other_layer = PrivateLayer(name="other_layer") + other_layer(inputs) + self.assertEqual(other_layer._scope.name, "other_layer") + # Variable scope gets added to scope names. + with tf.compat.v1.variable_scope("var_scope"): + default_layer_scoped = PrivateLayer() + default_layer_scoped(inputs) + self.assertEqual( + default_layer_scoped._scope.name, "var_scope/private_layer" + ) + my_layer_scoped = PrivateLayer(name="my_layer") + my_layer_scoped(inputs) + self.assertEqual(my_layer_scoped._scope.name, "var_scope/my_layer") + my_layer_scoped1 = PrivateLayer(name="my_layer") + my_layer_scoped1(inputs) + self.assertEqual( + my_layer_scoped1._scope.name, "var_scope/my_layer_1" + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInputSpecNdimCheck(self): + class CustomerLayer(base_tf_layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = input_spec.InputSpec(ndim=2) + + def call(self, inputs): + return inputs + + layer = CustomerLayer() + with self.assertRaisesRegex(ValueError, r"expected ndim=2"): + layer(tf.constant([1])) + + # Note that we re-create the layer since in Eager mode, input spec + # checks only happen on first call. 
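Because eager-mode input-spec validation happens only on the first call, each case below builds a fresh layer; condensed, the check being exercised is (with an illustrative `ndim=2` layer like `CustomerLayer`):

```python
import tensorflow.compat.v2 as tf
from keras.engine import input_spec

class TwoDOnly(tf.compat.v1.layers.Layer):
    def __init__(self):
        super().__init__()
        self.input_spec = input_spec.InputSpec(ndim=2)

    def call(self, inputs):
        return inputs

TwoDOnly()(tf.constant([[1], [2]]))          # ok: rank 2
try:
    TwoDOnly()(tf.constant([1]))             # rank 1
except ValueError as e:
    print(e)                                 # mentions "expected ndim=2"
```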
+ # Works + layer = CustomerLayer() + layer(tf.constant([[1], [2]])) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInputSpecMinNdimCheck(self): + class CustomLayer(base_tf_layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = input_spec.InputSpec(min_ndim=2) + + def call(self, inputs): + return inputs + + layer = CustomLayer() + with self.assertRaisesRegex(ValueError, r"expected min_ndim=2"): + layer(tf.constant([1])) + + # Works + layer = CustomLayer() + layer(tf.constant([[1], [2]])) + + layer = CustomLayer() + layer(tf.constant([[[1], [2]]])) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInputSpecMaxNdimCheck(self): + class CustomerLayer(base_tf_layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = input_spec.InputSpec(max_ndim=2) + + def call(self, inputs): + return inputs + + layer = CustomerLayer() + with self.assertRaisesRegex(ValueError, r"expected max_ndim=2"): + layer(tf.constant([[[1], [2]]])) + + # Works + layer = CustomerLayer() + layer(tf.constant([1])) + + layer = CustomerLayer() + layer(tf.constant([[1], [2]])) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInputSpecDtypeCheck(self): + class CustomerLayer(base_tf_layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = input_spec.InputSpec(dtype="float32") + + def call(self, inputs): + return inputs + + layer = CustomerLayer() + with self.assertRaisesRegex(ValueError, r"expected dtype=float32"): + layer(tf.constant(1, dtype=tf.int32)) + + # Works + layer = CustomerLayer() + layer(tf.constant(1.0, dtype=tf.float32)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInputSpecAxesCheck(self): + class CustomerLayer(base_tf_layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = input_spec.InputSpec(axes={-1: 2}) + + def call(self, inputs): + return inputs + + layer = CustomerLayer() + with self.assertRaisesRegex(ValueError, r"expected axis"): + layer(tf.constant([1, 2, 3])) + + # Works + layer = CustomerLayer() + layer(tf.constant([1, 2])) + layer = CustomerLayer() + layer(tf.constant([[1, 2], [3, 4], [5, 6]])) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInputSpecShapeCheck(self): + class CustomerLayer(base_tf_layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = input_spec.InputSpec(shape=(None, 3)) + + def call(self, inputs): + return inputs + + layer = CustomerLayer() + with self.assertRaisesRegex(ValueError, r"expected shape"): + layer(tf.constant([[1, 2]])) + + # Works + layer = CustomerLayer() + layer(tf.constant([[1, 2, 3], [4, 5, 6]])) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoInputSpec(self): + class CustomerLayer(base_tf_layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = None + + def call(self, inputs): + return inputs + + layer = CustomerLayer() + + layer(tf.constant(1)) + + # Works + if not tf.executing_eagerly(): + layer(tf.compat.v1.placeholder("int32")) + layer(tf.compat.v1.placeholder("int32", shape=(2, 3))) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_count_params(self): + dense = core_tf_layers.Dense(16) + dense.build((None, 4)) + self.assertEqual(dense.count_params(), 16 * 4 + 16) + + dense = 
core_tf_layers.Dense(16) + with self.assertRaises(ValueError): + dense.count_params() + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDictInputOutput(self): + class DictLayer(base_tf_layers.Layer): + def call(self, inputs): + return {"l" + key: inputs[key] for key in inputs} + + layer = DictLayer() + if tf.executing_eagerly(): + i1 = tf.constant(3) + i2 = tf.constant(4.0) + result = layer({"abel": i1, "ogits": i2}) + self.assertTrue(isinstance(result, dict)) + self.assertEqual(set(["label", "logits"]), set(result.keys())) + self.assertEqual(3, result["label"].numpy()) + self.assertEqual(4.0, result["logits"].numpy()) + else: + i1 = tf.compat.v1.placeholder("int32") + i2 = tf.compat.v1.placeholder("float32") + result = layer({"abel": i1, "ogits": i2}) + self.assertTrue(isinstance(result, dict)) + self.assertEqual(set(["label", "logits"]), set(result.keys())) + + def testActivityRegularizer(self): + with tf.Graph().as_default(): + regularizer = tf.reduce_sum + layer = base_tf_layers.Layer(activity_regularizer=regularizer) + x = tf.compat.v1.placeholder("int32") + layer(x) + self.assertEqual(len(layer.get_losses_for(x)), 1) + + def testNameScopeIsConsistentWithVariableScope(self): + # GitHub issue 13429. + + class MyLayer(base_tf_layers.Layer): + def build(self, input_shape): + self.my_var = self.add_weight("my_var", (), tf.float32) + self.built = True + + def call(self, inputs): + return tf.multiply(inputs, self.my_var, name="my_op") + + def _gen_layer(x, name=None): + layer = MyLayer(name=name) + out = layer(x) + return layer, out + + # unnamed layer + with tf.Graph().as_default(): + x = tf.compat.v1.placeholder(tf.float32, (), "x") + layer, op = _gen_layer(x) + layer1, op1 = _gen_layer(op) + layer2, op2 = _gen_layer(op1) + + self.assertEqual(layer.my_var.name, "my_layer/my_var:0") + self.assertEqual(op.name, "my_layer/my_op:0") + self.assertEqual(layer1.my_var.name, "my_layer_1/my_var:0") + self.assertEqual(op1.name, "my_layer_1/my_op:0") + self.assertEqual(layer2.my_var.name, "my_layer_2/my_var:0") + self.assertEqual(op2.name, "my_layer_2/my_op:0") + # name starts from zero + with tf.Graph().as_default(): + x = tf.compat.v1.placeholder(tf.float32, (), "x") + layer, op = _gen_layer(x, name="name") + layer1, op1 = _gen_layer(op, name="name_1") + layer2, op2 = _gen_layer(op1, name="name_2") + + self.assertEqual(layer.my_var.name, "name/my_var:0") + self.assertEqual(op.name, "name/my_op:0") + self.assertEqual(layer1.my_var.name, "name_1/my_var:0") + self.assertEqual(op1.name, "name_1/my_op:0") + self.assertEqual(layer2.my_var.name, "name_2/my_var:0") + self.assertEqual(op2.name, "name_2/my_op:0") + # name starts from one + with tf.Graph().as_default(): + x = tf.compat.v1.placeholder(tf.float32, (), "x") + layer, op = _gen_layer(x, name="name_1") + layer1, op1 = _gen_layer(op, name="name_2") + layer2, op2 = _gen_layer(op1, name="name_3") + + self.assertEqual(layer.my_var.name, "name_1/my_var:0") + self.assertEqual(op.name, "name_1/my_op:0") + self.assertEqual(layer1.my_var.name, "name_2/my_var:0") + self.assertEqual(op1.name, "name_2/my_op:0") + self.assertEqual(layer2.my_var.name, "name_3/my_var:0") + self.assertEqual(op2.name, "name_3/my_op:0") + + def testVariablesAreLiftedFromFunctionBuildingGraphs(self): + class MyLayer(base_tf_layers.Layer): + def build(self, input_shape): + self.my_var = self.add_weight("my_var", (), tf.float32) + self.built = True + + def call(self, inputs): + return inputs + + outer_graph = 
tf.compat.v1.get_default_graph() + function_building_graph = tf.Graph() + function_building_graph._building_function = True + with outer_graph.as_default(): + with function_building_graph.as_default(): + layer = MyLayer() + # Create a variable by invoking build through __call__ and + # assert that it is both tracked and lifted into the outer + # graph. + inputs = tf.compat.v1.placeholder(tf.float32, (), "inputs") + layer(inputs) + self.assertEqual(len(layer.variables), 1) + self.assertEqual(len(layer.trainable_variables), 1) + self.assertEqual(layer.variables[0].graph, outer_graph) + + def testGetUpdateFor(self): + class MyLayer(base_tf_layers.Layer): + def build(self, input_shape): + self.a = self.add_weight("a", (), tf.float32, trainable=False) + self.b = self.add_weight("b", (), tf.float32, trainable=False) + self.add_update( + tf.compat.v1.assign_add(self.a, 1.0, name="b_update") + ) + self.built = True + + def call(self, inputs): + self.add_update( + tf.compat.v1.assign_add(self.a, inputs, name="a_update") + ) + return inputs + 1 + + with tf.Graph().as_default(): + layer = MyLayer() + inputs = tf.compat.v1.placeholder(tf.float32, (), "inputs") + intermediate_inputs = inputs + 1 + outputs = layer(intermediate_inputs) + + self.assertEqual(len(layer.updates), 2) + self.assertEqual(len(layer.get_updates_for(None)), 1) + self.assertEqual(len(layer.get_updates_for([inputs])), 1) + self.assertEqual( + len(layer.get_updates_for([intermediate_inputs])), 1 + ) + self.assertEqual(len(layer.get_updates_for([outputs])), 0) + + # Call same layer on new input, creating one more conditional update + inputs = tf.compat.v1.placeholder(tf.float32, (), "inputs") + intermediate_inputs = inputs + 1 + outputs = layer(intermediate_inputs) + + self.assertEqual(len(layer.updates), 3) + self.assertEqual(len(layer.get_updates_for(None)), 1) + # Check that we are successfully filtering out irrelevant updates + self.assertEqual(len(layer.get_updates_for([inputs])), 1) + self.assertEqual( + len(layer.get_updates_for([intermediate_inputs])), 1 + ) + self.assertEqual(len(layer.get_updates_for([outputs])), 0) + + def testGetLossesFor(self): + class MyLayer(base_tf_layers.Layer): + def build(self, input_shape): + self.a = self.add_weight("a", (), tf.float32, trainable=False) + self.b = self.add_weight("b", (), tf.float32, trainable=False) + self.add_loss(self.a) + self.built = True + + def call(self, inputs): + self.add_loss(inputs, inputs=True) + return inputs + 1 + + with tf.Graph().as_default(): + layer = MyLayer() + inputs = tf.compat.v1.placeholder(tf.float32, (), "inputs") + intermediate_inputs = inputs + 1 + outputs = layer(intermediate_inputs) + + self.assertEqual(len(layer.losses), 2) + self.assertEqual(len(layer.get_losses_for(None)), 1) + self.assertEqual(len(layer.get_losses_for([inputs])), 1) + self.assertEqual( + len(layer.get_losses_for([intermediate_inputs])), 1 + ) + self.assertEqual(len(layer.get_losses_for([outputs])), 0) + + # Call same layer on new input, creating one more conditional loss + inputs = tf.compat.v1.placeholder(tf.float32, (), "inputs") + intermediate_inputs = inputs + 1 + outputs = layer(intermediate_inputs) + + self.assertEqual(len(layer.losses), 3) + self.assertEqual(len(layer.get_losses_for(None)), 1) + # Check that we are successfully filtering out irrelevant losses + self.assertEqual(len(layer.get_losses_for([inputs])), 1) + self.assertEqual( + len(layer.get_losses_for([intermediate_inputs])), 1 + ) + self.assertEqual(len(layer.get_losses_for([outputs])), 0) class 
IdentityLayer(base_tf_layers.Layer): - """A layer returns the identity of it's input.""" + """A layer that returns the identity of its input.""" - def call(self, inputs): - return inputs + def call(self, inputs): + return inputs -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class DTypeTest(tf.test.TestCase, parameterized.TestCase): - - def _const(self, dtype): - return tf.constant(1, dtype=dtype) - - def test_dtype_inferred_from_input(self): - # Test with Tensor input - layer = IdentityLayer() - self.assertIsNone(layer.dtype) - layer(self._const('float64')) - self.assertEqual(layer.dtype, 'float64') - - # Test with Numpy input - layer = IdentityLayer() - self.assertIsNone(layer.dtype) - layer(np.array(1., dtype='float64')) - self.assertEqual(layer.dtype, 'float64') - - # Test with integer input - layer = IdentityLayer() - self.assertIsNone(layer.dtype) - layer(self._const('int32')) - self.assertEqual(layer.dtype, 'int32') - - # Test layer dtype doesn't change when passed a new dtype - layer = IdentityLayer() - self.assertIsNone(layer.dtype) - layer(self._const('float64')) - self.assertEqual(layer.dtype, 'float64') - layer(self._const('float16')) - self.assertEqual(layer.dtype, 'float64') - - # Test layer dtype inferred from first input - layer = IdentityLayer() - layer([self._const('float32'), self._const('float64')]) - self.assertEqual(layer.dtype, 'float32') - - def test_passing_dtype_to_constructor(self): - layer = IdentityLayer(dtype='float64') - layer(self._const('float32')) - self.assertEqual(layer.dtype, 'float64') - - layer = IdentityLayer(dtype='int32') - layer(self._const('float32')) - self.assertEqual(layer.dtype, 'int32') - - layer = IdentityLayer(dtype=tf.float64) - layer(self._const('float32')) - self.assertEqual(layer.dtype, 'float64') - - def test_inputs_not_casted(self): - layer = IdentityLayer(dtype='float32') - self.assertEqual(layer(self._const('float64')).dtype, 'float64') - - -if __name__ == '__main__': - tf.test.main() + def _const(self, dtype): + return tf.constant(1, dtype=dtype) + + def test_dtype_inferred_from_input(self): + # Test with Tensor input + layer = IdentityLayer() + self.assertIsNone(layer.dtype) + layer(self._const("float64")) + self.assertEqual(layer.dtype, "float64") + + # Test with Numpy input + layer = IdentityLayer() + self.assertIsNone(layer.dtype) + layer(np.array(1.0, dtype="float64")) + self.assertEqual(layer.dtype, "float64") + + # Test with integer input + layer = IdentityLayer() + self.assertIsNone(layer.dtype) + layer(self._const("int32")) + self.assertEqual(layer.dtype, "int32") + + # Test layer dtype doesn't change when passed a new dtype + layer = IdentityLayer() + self.assertIsNone(layer.dtype) + layer(self._const("float64")) + self.assertEqual(layer.dtype, "float64") + layer(self._const("float16")) + self.assertEqual(layer.dtype, "float64") + + # Test layer dtype inferred from first input + layer = IdentityLayer() + layer([self._const("float32"), self._const("float64")]) + self.assertEqual(layer.dtype, "float32") + + def test_passing_dtype_to_constructor(self): + layer = IdentityLayer(dtype="float64") + layer(self._const("float32")) + self.assertEqual(layer.dtype, "float64") + + layer = IdentityLayer(dtype="int32") + layer(self._const("float32")) + self.assertEqual(layer.dtype, "int32") + + layer = IdentityLayer(dtype=tf.float64) + layer(self._const("float32")) + self.assertEqual(layer.dtype, "float64") + + def
test_inputs_not_casted(self): + layer = IdentityLayer(dtype="float32") + self.assertEqual(layer(self._const("float64")).dtype, "float64") + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/legacy_tf_layers/convolutional.py b/keras/legacy_tf_layers/convolutional.py index 5eeb440ad7cf..735553e45a48 100644 --- a/keras/legacy_tf_layers/convolutional.py +++ b/keras/legacy_tf_layers/convolutional.py @@ -12,125 +12,273 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= -# pylint: disable=g-classes-have-attributes + """Contains the convolutional layer classes and their functional aliases.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import warnings +import tensorflow.compat.v2 as tf + from keras import layers as keras_layers from keras.legacy_tf_layers import base + +# isort: off from tensorflow.python.util.tf_export import keras_export -from tensorflow.python.util.tf_export import tf_export -@keras_export(v1=['keras.__internal__.legacy.layers.Conv1D']) -@tf_export(v1=['layers.Conv1D']) +@keras_export(v1=["keras.__internal__.legacy.layers.Conv1D"]) class Conv1D(keras_layers.Conv1D, base.Layer): - """1D convolution layer (e.g. temporal convolution). - - This layer creates a convolution kernel that is convolved - (actually cross-correlated) with the layer input to produce a tensor of - outputs. If `use_bias` is True (and a `bias_initializer` is provided), - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: An integer or tuple/list of a single integer, specifying the - length of the 1D convolution window. - strides: An integer or tuple/list of a single integer, - specifying the stride length of the convolution. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - dilation_rate: An integer or tuple/list of a single integer, specifying - the dilation rate to use for dilated convolution. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any `strides` value != 1. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. 
- kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv1D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - conv = tf.compat.v1.layers.Conv1D(filters=3, kernel_size=3) - ``` - - After: - - ```python - conv = tf.keras.layers.Conv1D(filters=3, kernels_size=3) - ``` - @end_compatibility - """ - - def __init__(self, filters, - kernel_size, - strides=1, - padding='valid', - data_format='channels_last', - dilation_rate=1, - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( + """1D convolution layer (e.g. temporal convolution). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: An integer or tuple/list of a single integer, specifying the + length of the 1D convolution window. + strides: An integer or tuple/list of a single integer, + specifying the stride length of the convolution. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + dilation_rate: An integer or tuple/list of a single integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any `strides` value != 1. 
+ activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + conv = tf.compat.v1.layers.Conv1D(filters=3, kernel_size=3) + ``` + + After: + + ```python + conv = tf.keras.layers.Conv1D(filters=3, kernel_size=3) + ``` + @end_compatibility + """ + + def __init__( + self, + filters, + kernel_size, + strides=1, + padding="valid", + data_format="channels_last", + dilation_rate=1, + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.conv1d"]) +def conv1d( + inputs, + filters, + kernel_size, + strides=1, + padding="valid", + data_format="channels_last", + dilation_rate=1, + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, +): + """Functional interface for 1D convolution (e.g. temporal convolution).
+ + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Args: + inputs: Tensor input. + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: An integer or tuple/list of a single integer, specifying the + length of the 1D convolution window. + strides: An integer or tuple/list of a single integer, + specifying the stride length of the convolution. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + dilation_rate: An integer or tuple/list of a single integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any `strides` value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. 
+ + Before: + + ```python + y = tf.compat.v1.layers.conv1d(x, filters=3, kernel_size=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 1)) + y = tf.keras.layers.Conv1D(filters=3, kernel_size=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.conv1d` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.Conv1D` instead.", + stacklevel=2, + ) + layer = Conv1D( filters=filters, kernel_size=kernel_size, strides=strides, @@ -147,264 +295,278 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, trainable=trainable, - name=name, **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.conv1d']) -@tf_export(v1=['layers.conv1d']) -def conv1d(inputs, - filters, - kernel_size, - strides=1, - padding='valid', - data_format='channels_last', - dilation_rate=1, - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - reuse=None): - """Functional interface for 1D convolution layer (e.g. temporal convolution). - - This layer creates a convolution kernel that is convolved - (actually cross-correlated) with the layer input to produce a tensor of - outputs. If `use_bias` is True (and a `bias_initializer` is provided), - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - Args: - inputs: Tensor input. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: An integer or tuple/list of a single integer, specifying the - length of the 1D convolution window. - strides: An integer or tuple/list of a single integer, - specifying the stride length of the convolution. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - dilation_rate: An integer or tuple/list of a single integer, specifying - the dilation rate to use for dilated convolution. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any `strides` value != 1. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output.
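A note on the @compatibility guidance above: the `track_tf1_style_variables` decorator is what lets this functional `conv1d` run inside a TF2 layer without re-creating variables on every call. A minimal sketch of that pattern (the wrapper class name here is illustrative, not part of this codebase):

```python
import tensorflow as tf


class CompatConv1D(tf.keras.layers.Layer):  # hypothetical wrapper for illustration
    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def call(self, inputs):
        # The legacy functional layer's variables are created once and
        # tracked by this Keras layer, so eager and tf.function both work.
        return tf.compat.v1.layers.conv1d(inputs, filters=3, kernel_size=3)


out = CompatConv1D()(tf.ones([2, 28, 1]))  # (batch, length, channels)
print(out.shape)  # (2, 26, 3) with the default "valid" padding
```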
- kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv1D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.conv1d(x, filters=3, kernel_size=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.Conv1D(filters=3, kernels_size=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.conv1d` is deprecated and ' - 'will be removed in a future version. ' - 'Please Use `tf.keras.layers.Conv1D` instead.', - stacklevel=2) - layer = Conv1D( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.Conv2D']) -@tf_export(v1=['layers.Conv2D']) + name=name, + _reuse=reuse, + _scope=name, + ) + return layer(inputs) + + +@keras_export(v1=["keras.__internal__.legacy.layers.Conv2D"]) class Conv2D(keras_layers.Conv2D, base.Layer): - """2D convolution layer (e.g. spatial convolution over images). - - This layer creates a convolution kernel that is convolved - (actually cross-correlated) with the layer input to produce a tensor of - outputs. If `use_bias` is True (and a `bias_initializer` is provided), - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: An integer or tuple/list of 2 integers, specifying the - height and width of the 2D convolution window. 
- Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the convolution along the height and width. - Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - - dilation_rate: An integer or tuple/list of 2 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. 
- - Before: - - ```python - conv = tf.compat.v1.layers.Conv2D(filters=3, kernel_size=3) - ``` - - After: - - ```python - conv = tf.keras.layers.Conv2D(filters=3, kernels_size=3) - ``` - @end_compatibility - """ - - def __init__(self, filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format='channels_last', - dilation_rate=(1, 1), - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( + """2D convolution layer (e.g. spatial convolution over images). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: An integer or tuple/list of 2 integers, specifying the + height and width of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the convolution along the height and width. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + + dilation_rate: An integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. 
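Since the Conv2D mapping in this hunk mirrors the Conv1D one, a quick output-shape check is often the fastest way to validate a migrated layer. A minimal sketch with illustrative shapes:

```python
import tensorflow as tf

# TF2 counterpart of tf.compat.v1.layers.Conv2D, as named in this docstring.
x = tf.keras.Input((28, 28, 1))
y = tf.keras.layers.Conv2D(filters=3, kernel_size=3)(x)
model = tf.keras.Model(x, y)
print(model.output_shape)  # (None, 26, 26, 3): "valid" padding shrinks 28 -> 26
```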
+ bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + conv = tf.compat.v1.layers.Conv2D(filters=3, kernel_size=3) + ``` + + After: + + ```python + conv = tf.keras.layers.Conv2D(filters=3, kernel_size=3) + ``` + @end_compatibility + """ + + def __init__( + self, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format="channels_last", + dilation_rate=(1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.conv2d"]) +def conv2d( + inputs, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format="channels_last", + dilation_rate=(1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, +): + """Functional interface for the 2D convolution layer. + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Args: + inputs: Tensor input. + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: An integer or tuple/list of 2 integers, specifying the + height and width of the 2D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the convolution along the height and width. + Can be a single integer to specify the same value for + all spatial dimensions.
+ Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + + dilation_rate: An integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.conv2d(x, filters=3, kernel_size=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.Conv2D(filters=3, kernel_size=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.conv2d` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.Conv2D` instead.", + stacklevel=2, + ) + layer = Conv2D( filters=filters, kernel_size=kernel_size, strides=strides, @@ -421,272 +583,280 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, trainable=trainable, - name=name, **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.conv2d']) -@tf_export(v1=['layers.conv2d']) -def conv2d(inputs, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format='channels_last', - dilation_rate=(1, 1), - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - reuse=None): - """Functional interface for the 2D convolution layer. - - This layer creates a convolution kernel that is convolved - (actually cross-correlated) with the layer input to produce a tensor of - outputs. If `use_bias` is True (and a `bias_initializer` is provided), - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - Args: - inputs: Tensor input. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: An integer or tuple/list of 2 integers, specifying the - height and width of the 2D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the convolution along the height and width. - Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - - dilation_rate: An integer or tuple/list of 2 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights).
The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.conv2d(x, filters=3, kernel_size=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.Conv2D(filters=3, kernels_size=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.conv2d` is deprecated and ' - 'will be removed in a future version. ' - 'Please Use `tf.keras.layers.Conv2D` instead.', - stacklevel=2) - layer = Conv2D( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.Conv3D']) -@tf_export(v1=['layers.Conv3D']) + name=name, + _reuse=reuse, + _scope=name, + ) + return layer(inputs) + + +@keras_export(v1=["keras.__internal__.legacy.layers.Conv3D"]) class Conv3D(keras_layers.Conv3D, base.Layer): - """3D convolution layer (e.g. spatial convolution over volumes). - - This layer creates a convolution kernel that is convolved - (actually cross-correlated) with the layer input to produce a tensor of - outputs. If `use_bias` is True (and a `bias_initializer` is provided), - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: An integer or tuple/list of 3 integers, specifying the - depth, height and width of the 3D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 3 integers, - specifying the strides of the convolution along the depth, - height and width. 
- Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - dilation_rate: An integer or tuple/list of 3 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv3D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - conv = tf.compat.v1.layers.Conv3D(filters=3, kernel_size=3) - ``` - - After: - - ```python - conv = tf.keras.layers.Conv3D(filters=3, kernels_size=3) - ``` - @end_compatibility - """ - - def __init__(self, filters, - kernel_size, - strides=(1, 1, 1), - padding='valid', - data_format='channels_last', - dilation_rate=(1, 1, 1), - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( + """3D convolution layer (e.g. 
spatial convolution over volumes). + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: An integer or tuple/list of 3 integers, specifying the + depth, height and width of the 3D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the convolution along the depth, + height and width. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + dilation_rate: An integer or tuple/list of 3 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. 
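One point worth making explicit for Conv3D migrations: under `channels_last` the layer expects 5-D inputs of shape `(batch, depth, height, width, channels)`, one spatial axis more than Conv2D. A minimal sketch with illustrative shapes:

```python
import tensorflow as tf

x = tf.keras.Input((16, 28, 28, 1))  # (depth, height, width, channels)
y = tf.keras.layers.Conv3D(filters=3, kernel_size=3)(x)
model = tf.keras.Model(x, y)
print(model.output_shape)  # (None, 14, 26, 26, 3) with "valid" padding
```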
+ + The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv3D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + conv = tf.compat.v1.layers.Conv3D(filters=3, kernel_size=3) + ``` + + After: + + ```python + conv = tf.keras.layers.Conv3D(filters=3, kernel_size=3) + ``` + @end_compatibility + """ + + def __init__( + self, + filters, + kernel_size, + strides=(1, 1, 1), + padding="valid", + data_format="channels_last", + dilation_rate=(1, 1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.conv3d"]) +def conv3d( + inputs, + filters, + kernel_size, + strides=(1, 1, 1), + padding="valid", + data_format="channels_last", + dilation_rate=(1, 1, 1), + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, +): + """Functional interface for the 3D convolution layer. + + This layer creates a convolution kernel that is convolved + (actually cross-correlated) with the layer input to produce a tensor of + outputs. If `use_bias` is True (and a `bias_initializer` is provided), + a bias vector is created and added to the outputs. Finally, if + `activation` is not `None`, it is applied to the outputs as well. + + Args: + inputs: Tensor input. + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: An integer or tuple/list of 3 integers, specifying the + depth, height and width of the 3D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the convolution along the depth, + height and width. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`.
+ dilation_rate: An integer or tuple/list of 3 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv3D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.conv3d(x, filters=3, kernel_size=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 28, 1)) + y = tf.keras.layers.Conv3D(filters=3, kernel_size=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.conv3d` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.Conv3D` instead.", + stacklevel=2, + ) + layer = Conv3D( filters=filters, kernel_size=kernel_size, strides=strides, @@ -703,279 +873,453 @@ def __init__(self, filters, kernel_constraint=kernel_constraint, bias_constraint=bias_constraint, trainable=trainable, - name=name, **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.conv3d']) -@tf_export(v1=['layers.conv3d']) -def conv3d(inputs, - filters, - kernel_size, - strides=(1, 1, 1), - padding='valid', - data_format='channels_last', - dilation_rate=(1, 1, 1), - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - reuse=None): - """Functional interface for the 3D convolution layer. - - This layer creates a convolution kernel that is convolved - (actually cross-correlated) with the layer input to produce a tensor of - outputs. If `use_bias` is True (and a `bias_initializer` is provided), - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - Args: - inputs: Tensor input. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: An integer or tuple/list of 3 integers, specifying the - depth, height and width of the 3D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 3 integers, - specifying the strides of the convolution along the depth, - height and width. - Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - dilation_rate: An integer or tuple/list of 3 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights).
The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Conv3D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.conv3d(x, filters=3, kernel_size=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.Conv3D(filters=3, kernels_size=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.conv3d` is deprecated and ' - 'will be removed in a future version. ' - 'Please Use `tf.keras.layers.Conv3D` instead.', - stacklevel=2) - layer = Conv3D( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.SeparableConv1D']) -@tf_export(v1=['layers.SeparableConv1D']) + name=name, + _reuse=reuse, + _scope=name, + ) + return layer(inputs) + + +@keras_export(v1=["keras.__internal__.legacy.layers.SeparableConv1D"]) class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer): - """Depthwise separable 1D convolution. - - This layer performs a depthwise convolution that acts separately on - channels, followed by a pointwise convolution that mixes channels. - If `use_bias` is True and a bias initializer is provided, - it adds a bias vector to the output. - It then optionally applies an activation function to produce the final output. - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A single integer specifying the spatial - dimensions of the filters. - strides: A single integer specifying the strides - of the convolution. - Specifying any `stride` value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. 
`"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - dilation_rate: A single integer, specifying - the dilation rate to use for dilated convolution. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - depthwise_initializer: An initializer for the depthwise convolution kernel. - pointwise_initializer: An initializer for the pointwise convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - depthwise_regularizer: Optional regularizer for the depthwise - convolution kernel. - pointwise_regularizer: Optional regularizer for the pointwise - convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - depthwise_constraint: Optional projection function to be applied to the - depthwise kernel after being updated by an `Optimizer` (e.g. used for - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - pointwise_constraint: Optional projection function to be applied to the - pointwise kernel after being updated by an `Optimizer`. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.SeparableConv1D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. 
- - Before: - - ```python - conv = tf.compat.v1.layers.SeparableConv1D(filters=3, kernel_size=3) - ``` - - After: - - ```python - conv = tf.keras.layers.SeparableConv1D(filters=3, kernels_size=3) - ``` - @end_compatibility - """ - - def __init__(self, filters, - kernel_size, - strides=1, - padding='valid', - data_format='channels_last', - dilation_rate=1, - depth_multiplier=1, - activation=None, - use_bias=True, - depthwise_initializer=None, - pointwise_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - depthwise_regularizer=None, - pointwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - pointwise_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( + """Depthwise separable 1D convolution. + + This layer performs a depthwise convolution that acts separately on + channels, followed by a pointwise convolution that mixes channels. + If `use_bias` is True and a bias initializer is provided, + it adds a bias vector to the output. + It then optionally applies an activation function to produce the final + output. + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A single integer specifying the spatial + dimensions of the filters. + strides: A single integer specifying the strides + of the convolution. + Specifying any `stride` value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + dilation_rate: A single integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `num_filters_in * depth_multiplier`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + depthwise_initializer: An initializer for the depthwise convolution + kernel. + pointwise_initializer: An initializer for the pointwise convolution + kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + depthwise_regularizer: Optional regularizer for the depthwise + convolution kernel. + pointwise_regularizer: Optional regularizer for the pointwise + convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + depthwise_constraint: Optional projection function to be applied to the + depthwise kernel after being updated by an `Optimizer` (e.g. used for + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). 
Constraints are + not safe to use when doing asynchronous distributed training. + pointwise_constraint: Optional projection function to be applied to the + pointwise kernel after being updated by an `Optimizer`. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.SeparableConv1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + conv = tf.compat.v1.layers.SeparableConv1D(filters=3, kernel_size=3) + ``` + + After: + + ```python + conv = tf.keras.layers.SeparableConv1D(filters=3, kernel_size=3) + ``` + @end_compatibility + """ + + def __init__( + self, + filters, + kernel_size, + strides=1, + padding="valid", + data_format="channels_last", + dilation_rate=1, + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer=None, + pointwise_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + pointwise_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + depth_multiplier=depth_multiplier, + activation=activation, + use_bias=use_bias, + depthwise_initializer=depthwise_initializer, + pointwise_initializer=pointwise_initializer, + bias_initializer=bias_initializer, + depthwise_regularizer=depthwise_regularizer, + pointwise_regularizer=pointwise_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + depthwise_constraint=depthwise_constraint, + pointwise_constraint=pointwise_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.SeparableConv2D"]) +class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer): + """Depthwise separable 2D convolution. + + This layer performs a depthwise convolution that acts separately on + channels, followed by a pointwise convolution that mixes channels. + If `use_bias` is True and a bias initializer is provided, + it adds a bias vector to the output. It then optionally applies an + activation function to produce the final output. + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A tuple or list of 2 integers specifying the spatial + dimensions of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: A tuple or list of 2 positive integers specifying the strides + of the convolution.
Can be a single integer to specify the same value + for all spatial dimensions. + Specifying any `stride` value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + + dilation_rate: An integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `num_filters_in * depth_multiplier`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + depthwise_initializer: An initializer for the depthwise convolution + kernel. + pointwise_initializer: An initializer for the pointwise convolution + kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + depthwise_regularizer: Optional regularizer for the depthwise + convolution kernel. + pointwise_regularizer: Optional regularizer for the pointwise + convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + depthwise_constraint: Optional projection function to be applied to the + depthwise kernel after being updated by an `Optimizer` (e.g. used for + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + pointwise_constraint: Optional projection function to be applied to the + pointwise kernel after being updated by an `Optimizer`. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.SeparableConv2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. 
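A minimal runnable sketch (an editorial illustration, not part of the patch itself) of the `num_filters_in * depth_multiplier` bookkeeping described in the `SeparableConv2D` docstring above, using the public `tf.keras.layers.SeparableConv2D` API; the concrete shapes are invented example values:

```python
import tensorflow as tf

layer = tf.keras.layers.SeparableConv2D(
    filters=8, kernel_size=3, depth_multiplier=2
)
y = layer(tf.zeros((1, 28, 28, 4)))  # 4 input channels

# Depthwise stage: 4 input channels * depth_multiplier 2 = 8 intermediate
# channels; the 1x1 pointwise stage then mixes them into `filters` = 8.
print(layer.depthwise_kernel.shape)  # (3, 3, 4, 2)
print(layer.pointwise_kernel.shape)  # (1, 1, 8, 8)
print(y.shape)                       # (1, 26, 26, 8), "valid" padding
```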
+ + Before: + + ```python + conv = tf.compat.v1.layers.SeparableConv2D(filters=3, kernel_size=3) + ``` + + After: + + ```python + conv = tf.keras.layers.SeparableConv2D(filters=3, kernel_size=3) + ``` + @end_compatibility + """ + + def __init__( + self, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format="channels_last", + dilation_rate=(1, 1), + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer=None, + pointwise_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + pointwise_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + depth_multiplier=depth_multiplier, + activation=activation, + use_bias=use_bias, + depthwise_initializer=depthwise_initializer, + pointwise_initializer=pointwise_initializer, + bias_initializer=bias_initializer, + depthwise_regularizer=depthwise_regularizer, + pointwise_regularizer=pointwise_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + depthwise_constraint=depthwise_constraint, + pointwise_constraint=pointwise_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.separable_conv1d"]) +def separable_conv1d( + inputs, + filters, + kernel_size, + strides=1, + padding="valid", + data_format="channels_last", + dilation_rate=1, + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer=None, + pointwise_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + pointwise_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, +): + """Functional interface for the depthwise separable 1D convolution layer. + + This layer performs a depthwise convolution that acts separately on + channels, followed by a pointwise convolution that mixes channels. + If `use_bias` is True and a bias initializer is provided, + it adds a bias vector to the output. It then optionally applies an + activation function to produce the final output. + + Args: + inputs: Input tensor. + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A single integer specifying the spatial + dimensions of the filters. + strides: A single integer specifying the strides + of the convolution. + Specifying any `stride` value != 1 is incompatible with specifying + any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`.
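A minimal sketch (an editorial illustration, not part of the patch) of the `data_format` shapes just described for the 1D case; the dimensions are invented for illustration:

```python
import tensorflow as tf

# channels_last (the default): inputs are (batch, length, channels).
x = tf.zeros((2, 100, 3))
y = tf.keras.layers.SeparableConv1D(filters=6, kernel_size=5)(x)
print(y.shape)  # (2, 96, 6) -- "valid" padding: 100 - 5 + 1 = 96

# channels_first would instead expect (batch, channels, length),
# i.e. an input of shape (2, 3, 100) with data_format="channels_first".
```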
+ dilation_rate: A single integer, specifying + the dilation rate to use for dilated convolution. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `num_filters_in * depth_multiplier`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + depthwise_initializer: An initializer for the depthwise convolution + kernel. + pointwise_initializer: An initializer for the pointwise convolution + kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + depthwise_regularizer: Optional regularizer for the depthwise + convolution kernel. + pointwise_regularizer: Optional regularizer for the pointwise + convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + depthwise_constraint: Optional projection function to be applied to the + depthwise kernel after being updated by an `Optimizer` (e.g. used for + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + pointwise_constraint: Optional projection function to be applied to the + pointwise kernel after being updated by an `Optimizer`. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.SeparableConv1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.separable_conv1d(x, filters=3, kernel_size=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 1)) + y = tf.keras.layers.SeparableConv1D(filters=3, kernel_size=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.separable_conv1d` is deprecated and " + "will be removed in a future version.
" + "Please Use `tf.keras.layers.SeparableConv1D` instead.", + stacklevel=2, + ) + layer = SeparableConv1D( filters=filters, kernel_size=kernel_size, strides=strides, @@ -997,132 +1341,156 @@ def __init__(self, filters, bias_constraint=bias_constraint, trainable=trainable, name=name, - **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.SeparableConv2D']) -@tf_export(v1=['layers.SeparableConv2D']) -class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer): - """Depthwise separable 2D convolution. - - This layer performs a depthwise convolution that acts separately on - channels, followed by a pointwise convolution that mixes channels. - If `use_bias` is True and a bias initializer is provided, - it adds a bias vector to the output. - It then optionally applies an activation function to produce the final output. - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A tuple or list of 2 integers specifying the spatial - dimensions of the filters. Can be a single integer to specify the same - value for all spatial dimensions. - strides: A tuple or list of 2 positive integers specifying the strides - of the convolution. Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any `stride` value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - - dilation_rate: An integer or tuple/list of 2 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - depthwise_initializer: An initializer for the depthwise convolution kernel. - pointwise_initializer: An initializer for the pointwise convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - depthwise_regularizer: Optional regularizer for the depthwise - convolution kernel. - pointwise_regularizer: Optional regularizer for the pointwise - convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - depthwise_constraint: Optional projection function to be applied to the - depthwise kernel after being updated by an `Optimizer` (e.g. used for - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). 
Constraints are - not safe to use when doing asynchronous distributed training. - pointwise_constraint: Optional projection function to be applied to the - pointwise kernel after being updated by an `Optimizer`. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.SeparableConv2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - conv = tf.compat.v1.layers.SeparableConv2D(filters=3, kernel_size=3) - ``` - - After: - - ```python - conv = tf.keras.layers.SeparableConv2D(filters=3, kernels_size=3) - ``` - @end_compatibility - """ - - def __init__(self, filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format='channels_last', - dilation_rate=(1, 1), - depth_multiplier=1, - activation=None, - use_bias=True, - depthwise_initializer=None, - pointwise_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - depthwise_regularizer=None, - pointwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - pointwise_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( + _reuse=reuse, + _scope=name, + ) + return layer(inputs) + + +@keras_export(v1=["keras.__internal__.legacy.layers.separable_conv2d"]) +def separable_conv2d( + inputs, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format="channels_last", + dilation_rate=(1, 1), + depth_multiplier=1, + activation=None, + use_bias=True, + depthwise_initializer=None, + pointwise_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + depthwise_regularizer=None, + pointwise_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + depthwise_constraint=None, + pointwise_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, +): + """Functional interface for the depthwise separable 2D convolution layer. + + This layer performs a depthwise convolution that acts separately on + channels, followed by a pointwise convolution that mixes channels. + If `use_bias` is True and a bias initializer is provided, + it adds a bias vector to the output. It then optionally applies an + activation function to produce the final output. + + Args: + inputs: Input tensor. + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A tuple or list of 2 integers specifying the spatial + dimensions of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: A tuple or list of 2 positive integers specifying the strides + of the convolution. Can be a single integer to specify the same value + for all spatial dimensions. 
Specifying any `stride` value != 1 is + incompatible with specifying any `dilation_rate` value != 1. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + + dilation_rate: An integer or tuple/list of 2 integers, specifying + the dilation rate to use for dilated convolution. + Can be a single integer to specify the same value for + all spatial dimensions. + Currently, specifying any `dilation_rate` value != 1 is + incompatible with specifying any stride value != 1. + depth_multiplier: The number of depthwise convolution output channels for + each input channel. The total number of depthwise convolution output + channels will be equal to `num_filters_in * depth_multiplier`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + depthwise_initializer: An initializer for the depthwise convolution + kernel. + pointwise_initializer: An initializer for the pointwise convolution + kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + depthwise_regularizer: Optional regularizer for the depthwise + convolution kernel. + pointwise_regularizer: Optional regularizer for the pointwise + convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + depthwise_constraint: Optional projection function to be applied to the + depthwise kernel after being updated by an `Optimizer` (e.g. used for + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + pointwise_constraint: Optional projection function to be applied to the + pointwise kernel after being updated by an `Optimizer`. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.SeparableConv2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. 
+ + Before: + + ```python + y = tf.compat.v1.layers.separable_conv2d(x, filters=3, kernel_size=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.SeparableConv2D(filters=3, kernel_size=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.separable_conv2d` is deprecated and " + "will be removed in a future version. " + "Please Use `tf.keras.layers.SeparableConv2D` instead.", + stacklevel=2, + ) + layer = SeparableConv2D( filters=filters, kernel_size=kernel_size, strides=strides, @@ -1144,439 +1512,256 @@ def __init__(self, filters, bias_constraint=bias_constraint, trainable=trainable, name=name, - **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.separable_conv1d']) -@tf_export(v1=['layers.separable_conv1d']) -def separable_conv1d(inputs, - filters, - kernel_size, - strides=1, - padding='valid', - data_format='channels_last', - dilation_rate=1, - depth_multiplier=1, - activation=None, - use_bias=True, - depthwise_initializer=None, - pointwise_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - depthwise_regularizer=None, - pointwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - pointwise_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - reuse=None): - """Functional interface for the depthwise separable 1D convolution layer. - - This layer performs a depthwise convolution that acts separately on - channels, followed by a pointwise convolution that mixes channels. - If `use_bias` is True and a bias initializer is provided, - it adds a bias vector to the output. - It then optionally applies an activation function to produce the final output. - - Args: - inputs: Input tensor. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A single integer specifying the spatial - dimensions of the filters. - strides: A single integer specifying the strides - of the convolution. - Specifying any `stride` value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - dilation_rate: A single integer, specifying - the dilation rate to use for dilated convolution. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. The total number of depthwise convolution output - channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - depthwise_initializer: An initializer for the depthwise convolution kernel.
- pointwise_initializer: An initializer for the pointwise convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - depthwise_regularizer: Optional regularizer for the depthwise - convolution kernel. - pointwise_regularizer: Optional regularizer for the pointwise - convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - depthwise_constraint: Optional projection function to be applied to the - depthwise kernel after being updated by an `Optimizer` (e.g. used for - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - pointwise_constraint: Optional projection function to be applied to the - pointwise kernel after being updated by an `Optimizer`. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.SeparableConv1D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.separable_conv1d(x, filters=3, kernel_size=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.SeparableConv1D(filters=3, kernels_size=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.separable_conv1d` is deprecated and ' - 'will be removed in a future version. 
' - 'Please Use `tf.keras.layers.SeparableConv1D` instead.', - stacklevel=2) - layer = SeparableConv1D( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - depth_multiplier=depth_multiplier, - activation=activation, - use_bias=use_bias, - depthwise_initializer=depthwise_initializer, - pointwise_initializer=pointwise_initializer, - bias_initializer=bias_initializer, - depthwise_regularizer=depthwise_regularizer, - pointwise_regularizer=pointwise_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - depthwise_constraint=depthwise_constraint, - pointwise_constraint=pointwise_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.separable_conv2d']) -@tf_export(v1=['layers.separable_conv2d']) -def separable_conv2d(inputs, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format='channels_last', - dilation_rate=(1, 1), - depth_multiplier=1, - activation=None, - use_bias=True, - depthwise_initializer=None, - pointwise_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - depthwise_regularizer=None, - pointwise_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - depthwise_constraint=None, - pointwise_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - reuse=None): - """Functional interface for the depthwise separable 2D convolution layer. - - This layer performs a depthwise convolution that acts separately on - channels, followed by a pointwise convolution that mixes channels. - If `use_bias` is True and a bias initializer is provided, - it adds a bias vector to the output. - It then optionally applies an activation function to produce the final output. - - Args: - inputs: Input tensor. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A tuple or list of 2 integers specifying the spatial - dimensions of the filters. Can be a single integer to specify the same - value for all spatial dimensions. - strides: A tuple or list of 2 positive integers specifying the strides - of the convolution. Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any `stride` value != 1 is incompatible with specifying - any `dilation_rate` value != 1. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - - dilation_rate: An integer or tuple/list of 2 integers, specifying - the dilation rate to use for dilated convolution. - Can be a single integer to specify the same value for - all spatial dimensions. - Currently, specifying any `dilation_rate` value != 1 is - incompatible with specifying any stride value != 1. - depth_multiplier: The number of depthwise convolution output channels for - each input channel. 
The total number of depthwise convolution output - channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - depthwise_initializer: An initializer for the depthwise convolution kernel. - pointwise_initializer: An initializer for the pointwise convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - depthwise_regularizer: Optional regularizer for the depthwise - convolution kernel. - pointwise_regularizer: Optional regularizer for the pointwise - convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - depthwise_constraint: Optional projection function to be applied to the - depthwise kernel after being updated by an `Optimizer` (e.g. used for - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - pointwise_constraint: Optional projection function to be applied to the - pointwise kernel after being updated by an `Optimizer`. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.SeparableConv2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.separable_conv2d(x, filters=3, kernel_size=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.SeparableConv2D(filters=3, kernels_size=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.separable_conv2d` is deprecated and ' - 'will be removed in a future version. 
' - 'Please Use `tf.keras.layers.SeparableConv2D` instead.', - stacklevel=2) - layer = SeparableConv2D( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - dilation_rate=dilation_rate, - depth_multiplier=depth_multiplier, - activation=activation, - use_bias=use_bias, - depthwise_initializer=depthwise_initializer, - pointwise_initializer=pointwise_initializer, - bias_initializer=bias_initializer, - depthwise_regularizer=depthwise_regularizer, - pointwise_regularizer=pointwise_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - depthwise_constraint=depthwise_constraint, - pointwise_constraint=pointwise_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.Conv2DTranspose']) -@tf_export(v1=['layers.Conv2DTranspose']) + _reuse=reuse, + _scope=name, + ) + return layer(inputs) + + +@keras_export(v1=["keras.__internal__.legacy.layers.Conv2DTranspose"]) class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer): - """Transposed 2D convolution layer (sometimes called 2D Deconvolution). - - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A tuple or list of 2 positive integers specifying the spatial - dimensions of the filters. Can be a single integer to specify the same - value for all spatial dimensions. - strides: A tuple or list of 2 positive integers specifying the strides - of the convolution. Can be a single integer to specify the same value for - all spatial dimensions. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). 
Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.Conv2DTranspose`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - conv = tf.compat.v1.layers.Conv2DTranspose(filters=3, kernel_size=3) - ``` - - After: - - ```python - conv = tf.keras.layers.Conv2DTranspose(filters=3, kernels_size=3) - ``` - @end_compatibility - """ - - def __init__(self, filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format='channels_last', - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( + """Transposed 2D convolution layer (sometimes called 2D Deconvolution). + + The need for transposed convolutions generally arises + from the desire to use a transformation going in the opposite direction + of a normal convolution, i.e., from something that has the shape of the + output of some convolution to something that has the shape of its input + while maintaining a connectivity pattern that is compatible with + said convolution. + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A tuple or list of 2 positive integers specifying the spatial + dimensions of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: A tuple or list of 2 positive integers specifying the strides + of the convolution. Can be a single integer to specify the same value + for all spatial dimensions. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + activation: Activation function. Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If None, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. 
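A small sketch (an editorial illustration, not part of the patch) of the shape-inverting behaviour the `Conv2DTranspose` docstring above describes; the shapes are example values:

```python
import tensorflow as tf

# A stride-2 "same" convolution halves the spatial dimensions...
x = tf.zeros((1, 28, 28, 3))
down = tf.keras.layers.Conv2D(16, 3, strides=2, padding="same")(x)
print(down.shape)  # (1, 14, 14, 16)

# ...and the matching Conv2DTranspose maps that shape back again.
up = tf.keras.layers.Conv2DTranspose(3, 3, strides=2, padding="same")(down)
print(up.shape)  # (1, 28, 28, 3)
```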
+ activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.Conv2DTranspose`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + conv = tf.compat.v1.layers.Conv2DTranspose(filters=3, kernel_size=3) + ``` + + After: + + ```python + conv = tf.keras.layers.Conv2DTranspose(filters=3, kernel_size=3) + ``` + @end_compatibility + """ + + def __init__( + self, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format="channels_last", + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.conv2d_transpose"]) +def conv2d_transpose( + inputs, + filters, + kernel_size, + strides=(1, 1), + padding="valid", + data_format="channels_last", + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, +): + """Functional interface for transposed 2D convolution layer. + + The need for transposed convolutions generally arises + from the desire to use a transformation going in the opposite direction + of a normal convolution, i.e., from something that has the shape of the + output of some convolution to something that has the shape of its input + while maintaining a connectivity pattern that is compatible with + said convolution. + + Args: + inputs: Input tensor. + filters: Integer, the dimensionality of the output space (i.e.
the number + of filters in the convolution). + kernel_size: A tuple or list of 2 positive integers specifying the spatial + dimensions of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: A tuple or list of 2 positive integers specifying the strides + of the convolution. Can be a single integer to specify the same value + for all spatial dimensions. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + activation: Activation function. Set it to `None` to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If `None`, the + default initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.Conv2DTranspose`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.conv2d_transpose(x, filters=3, kernel_size=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.Conv2DTranspose(filters=3, kernel_size=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.conv2d_transpose` is deprecated and " + "will be removed in a future version.
" + "Please Use `tf.keras.layers.Conv2DTranspose` instead.", + stacklevel=2, + ) + layer = Conv2DTranspose( filters=filters, kernel_size=kernel_size, strides=strides, @@ -1593,247 +1778,246 @@ def __init__(self, filters, bias_constraint=bias_constraint, trainable=trainable, name=name, - **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.conv2d_transpose']) -@tf_export(v1=['layers.conv2d_transpose']) -def conv2d_transpose(inputs, - filters, - kernel_size, - strides=(1, 1), - padding='valid', - data_format='channels_last', - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - reuse=None): - """Functional interface for transposed 2D convolution layer. - - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. - - Args: - inputs: Input tensor. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A tuple or list of 2 positive integers specifying the spatial - dimensions of the filters. Can be a single integer to specify the same - value for all spatial dimensions. - strides: A tuple or list of 2 positive integers specifying the strides - of the convolution. Can be a single integer to specify the same value for - all spatial dimensions. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - activation: Activation function. Set it to `None` to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If `None`, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. 
- reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.Conv2DTranspose`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.conv2d_transpose(x, filters=3, kernel_size=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.Conv2DTranspose(filters=3, kernels_size=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.conv2d_transpose` is deprecated and ' - 'will be removed in a future version. ' - 'Please Use `tf.keras.layers.Conv2DTranspose` instead.', - stacklevel=2) - layer = Conv2DTranspose( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.Conv3DTranspose']) -@tf_export(v1=['layers.Conv3DTranspose']) + _reuse=reuse, + _scope=name, + ) + return layer(inputs) + + +@keras_export(v1=["keras.__internal__.legacy.layers.Conv3DTranspose"]) class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer): - """Transposed 3D convolution layer (sometimes called 3D Deconvolution). - - Args: - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: An integer or tuple/list of 3 integers, specifying the - depth, height and width of the 3D convolution window. - Can be a single integer to specify the same value for all spatial - dimensions. - strides: An integer or tuple/list of 3 integers, specifying the strides - of the convolution along the depth, height and width. - Can be a single integer to specify the same value for all spatial - dimensions. - padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - activation: Activation function. Set it to `None` to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. 
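The `@compatibility(TF2)` blocks in this file repeatedly point at `tf.compat.v1.keras.utils.track_tf1_style_variables`. A minimal sketch of that pattern, following the TF2 migration guide (the `Upsampler` module and its shapes are illustrative assumptions):

```python
import tensorflow as tf

class Upsampler(tf.Module):
    # Decorating a method of a tf.Module (or Keras layer) lets variables
    # created by TF1-style layers inside it be tracked on the object,
    # which is what makes these legacy layers usable under eager
    # execution and `tf.function`.
    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def __call__(self, x):
        return tf.compat.v1.layers.conv2d_transpose(
            x, filters=3, kernel_size=3
        )

up = Upsampler()
out = up(tf.ones([1, 8, 8, 4]))  # valid padding, stride 1: (1, 10, 10, 3)
print(len(up.variables))  # kernel + bias -> 2
```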
- kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If `None`, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.Conv3DTranspose`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - conv = tf.compat.v1.layers.Conv3DTranspose(filters=3, kernel_size=3) - ``` - - After: - - ```python - conv = tf.keras.layers.Conv3DTranspose(filters=3, kernels_size=3) - ``` - @end_compatibility - """ - - def __init__(self, - filters, - kernel_size, - strides=(1, 1, 1), - padding='valid', - data_format='channels_last', - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__( + """Transposed 3D convolution layer (sometimes called 3D Deconvolution). + + Args: + filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: An integer or tuple/list of 3 integers, specifying the + depth, height and width of the 3D convolution window. + Can be a single integer to specify the same value for all spatial + dimensions. + strides: An integer or tuple/list of 3 integers, specifying the strides + of the convolution along the depth, height and width. + Can be a single integer to specify the same value for all spatial + dimensions. + padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + activation: Activation function. 
Set it to `None` to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If `None`, the + default initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy API that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables`. + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.Conv3DTranspose`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + conv = tf.compat.v1.layers.Conv3DTranspose(filters=3, kernel_size=3) + ``` + + After: + + ```python + conv = tf.keras.layers.Conv3DTranspose(filters=3, kernel_size=3) + ``` + @end_compatibility + """ + + def __init__( + self, + filters, + kernel_size, + strides=(1, 1, 1), + padding="valid", + data_format="channels_last", + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.conv3d_transpose"]) +def conv3d_transpose( + inputs, + filters, + kernel_size, + strides=(1, 1, 1), + padding="valid", + data_format="channels_last", + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + reuse=None, +): + """Functional interface for transposed 3D convolution layer. + + Args: + inputs: Input tensor.
+ filters: Integer, the dimensionality of the output space (i.e. the number + of filters in the convolution). + kernel_size: A tuple or list of 3 positive integers specifying the spatial + dimensions of the filters. Can be a single integer to specify the same + value for all spatial dimensions. + strides: A tuple or list of 3 positive integers specifying the strides + of the convolution. Can be a single integer to specify the same value + for all spatial dimensions. + padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + activation: Activation function. Set it to `None` to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: An initializer for the convolution kernel. + bias_initializer: An initializer for the bias vector. If `None`, the default + initializer will be used. + kernel_regularizer: Optional regularizer for the convolution kernel. + bias_regularizer: Optional regularizer for the bias vector. + activity_regularizer: Optional regularizer function for the output. + kernel_constraint: Optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: Optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: A string, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy API that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables`. + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.Conv3DTranspose`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.conv3d_transpose(x, filters=3, kernel_size=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 28, 1)) + y = tf.keras.layers.Conv3DTranspose(filters=3, kernel_size=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.conv3d_transpose` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.Conv3DTranspose` instead.", + stacklevel=2, + ) + layer = Conv3DTranspose( filters=filters, kernel_size=kernel_size, strides=strides, @@ -1850,141 +2034,10 @@ def __init__(self, bias_constraint=bias_constraint, trainable=trainable, name=name, - **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.conv3d_transpose']) -@tf_export(v1=['layers.conv3d_transpose']) -def conv3d_transpose(inputs, - filters, - kernel_size, - strides=(1, 1, 1), - padding='valid', - data_format='channels_last', - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - reuse=None): - """Functional interface for transposed 3D convolution layer. - - Args: - inputs: Input tensor. - filters: Integer, the dimensionality of the output space (i.e. the number - of filters in the convolution). - kernel_size: A tuple or list of 3 positive integers specifying the spatial - dimensions of the filters. Can be a single integer to specify the same - value for all spatial dimensions. - strides: A tuple or list of 3 positive integers specifying the strides - of the convolution. Can be a single integer to specify the same value for - all spatial dimensions. - padding: one of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - activation: Activation function. Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: An initializer for the convolution kernel. - bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. - kernel_regularizer: Optional regularizer for the convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. - kernel_constraint: Optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: A string, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled.
- - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.Conv3DTranspose`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.conv3d_transpose(x, filters=3, kernel_size=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.Conv3DTranspose(filters=3, kernels_size=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.conv3d_transpose` is deprecated and ' - 'will be removed in a future version. ' - 'Please Use `tf.keras.layers.Conv3DTranspose` instead.', - stacklevel=2) - layer = Conv3DTranspose( - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - data_format=data_format, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs) + _reuse=reuse, + _scope=name, + ) + return layer(inputs) # Aliases diff --git a/keras/legacy_tf_layers/convolutional_test.py b/keras/legacy_tf_layers/convolutional_test.py index 19d4a671048e..296aef07d981 100644 --- a/keras/legacy_tf_layers/convolutional_test.py +++ b/keras/legacy_tf_layers/convolutional_test.py @@ -18,1154 +18,1373 @@ from __future__ import division from __future__ import print_function +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np from keras.legacy_tf_layers import convolutional as conv_layers class ConvTest(tf.test.TestCase): + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegex(ValueError, "data_format"): + conv_layers.conv2d(images, 32, 3, data_format="invalid") - def testInvalidDataFormat(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'data_format'): - conv_layers.conv2d(images, 32, 3, data_format='invalid') - - def testInvalidStrides(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.conv2d(images, 32, 3, strides=(1, 2, 3)) - - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.conv2d(images, 32, 3, strides=None) - - def testInvalidKernelSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.conv2d(images, 32, (1, 2, 3)) - - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.conv2d(images, 32, None) - - def testCreateConv2D(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) 
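The shape assertions in the reformatted tests below all reduce to standard convolution arithmetic: with `padding="valid"` a spatial dimension shrinks to `dim - kernel_size + 1`, while `padding="same"` with stride `s` yields `ceil(dim / s)`. A small sketch mirroring the test shapes (the shapes are assumptions for illustration):

```python
import tensorflow as tf

images = tf.random.uniform((5, 7, 9, 4))
valid = tf.keras.layers.Conv2D(32, 3)(images)
same = tf.keras.layers.Conv2D(32, 3, strides=2, padding="same")(images)
print(valid.shape)  # (5, 5, 7, 32): 7 - 3 + 1 = 5, 9 - 3 + 1 = 7
print(same.shape)   # (5, 4, 5, 32): ceil(7 / 2) = 4, ceil(9 / 2) = 5
```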
- layer = conv_layers.Conv2D(32, [3, 3], activation=tf.nn.relu) - output = layer(images) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'conv2d/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testConv2DFloat16(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4), dtype='float16') - output = conv_layers.conv2d(images, 32, [3, 3], activation=tf.nn.relu) - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - - def testCreateConv2DIntegerKernelSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.Conv2D(32, 3) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testCreateConv2DChannelsFirst(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, 4, height, width)) - layer = conv_layers.Conv2D(32, [3, 3], data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, 32, height - 2, width - 2]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testUnknownInputChannels(self): - with tf.Graph().as_default(): - images = tf.compat.v1.placeholder(tf.float32, (5, 7, 9, None)) - layer = conv_layers.Conv2D(32, [3, 3], activation=tf.nn.relu) - with self.assertRaisesRegex( - ValueError, 'The channel dimension of the inputs ' - 'should be defined. The input_shape received is'): - _ = layer(images) - - images = tf.compat.v1.placeholder(tf.float32, (5, None, 7, 9)) - layer = conv_layers.Conv2D(32, [3, 3], data_format='channels_first') - with self.assertRaisesRegex( - ValueError, 'The channel dimension of the inputs ' - 'should be defined. 
The input_shape received is'): - _ = layer(images) - - def testConv2DPaddingSame(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 32), seed=1) - layer = conv_layers.Conv2D(64, images.get_shape()[1:3], padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64]) - - def testCreateConvWithStrides(self): - height, width = 6, 8 - # Test strides tuple - images = tf.random.uniform((5, height, width, 3), seed=1) - layer = conv_layers.Conv2D(32, [3, 3], strides=(2, 2), padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width / 2, 32]) - - # Test strides integer - layer = conv_layers.Conv2D(32, [3, 3], strides=2, padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width / 2, 32]) - - # Test unequal strides - layer = conv_layers.Conv2D(32, [3, 3], strides=(2, 1), padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width, 32]) - - def testCreateConv1D(self): - width = 7 - data = tf.random.uniform((5, width, 4)) - layer = conv_layers.Conv1D(32, 3, activation=tf.nn.relu) - output = layer(data) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'conv1d/Relu') - self.assertListEqual(output.get_shape().as_list(), [5, width - 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testConv1DFloat16(self): - width = 7 - data = tf.random.uniform((5, width, 4), dtype='float16') - output = conv_layers.conv1d(data, 32, 3, activation=tf.nn.relu) - self.assertListEqual(output.get_shape().as_list(), [5, width - 2, 32]) - - def testCreateConv1DChannelsFirst(self): - with tf.Graph().as_default(): - width = 7 - data = tf.random.uniform((5, 4, width)) - layer = conv_layers.Conv1D(32, 3, data_format='channels_first') - output = layer(data) - self.assertListEqual(output.get_shape().as_list(), [5, 32, width - 2]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testUnknownInputChannelsConv1D(self): - with tf.Graph().as_default(): - data = tf.compat.v1.placeholder(tf.float32, (5, 4, None)) - layer = conv_layers.Conv1D(32, 3, activation=tf.nn.relu) - with self.assertRaisesRegex( - ValueError, 'The channel dimension of the inputs ' - 'should be defined. The input_shape received is'): - _ = layer(data) - - data = tf.compat.v1.placeholder(tf.float32, (5, None, 4)) - layer = conv_layers.Conv1D(32, 3, data_format='channels_first') - with self.assertRaisesRegex( - ValueError, 'The channel dimension of the inputs ' - 'should be defined. 
The input_shape received is'): - _ = layer(data) - - def testCreateConv3D(self): - depth, height, width = 6, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 4)) - layer = conv_layers.Conv3D(32, [3, 3, 3], activation=tf.nn.relu) - output = layer(volumes) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'conv3d/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, depth - 2, height - 2, width - 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testUnknownInputChannelsConv3D(self): - with tf.Graph().as_default(): - volumes = tf.compat.v1.placeholder(tf.float32, (5, 6, 7, 9, None)) - layer = conv_layers.Conv3D(32, [3, 3, 3], activation=tf.nn.relu) - with self.assertRaisesRegex( - ValueError, 'The channel dimension of the inputs ' - 'should be defined. The input_shape received is'): - _ = layer(volumes) - - def testConv2DKernelRegularizer(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.Conv2D(32, [3, 3], kernel_regularizer=reg) - layer(images) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testConv2DBiasRegularizer(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.Conv2D(32, [3, 3], bias_regularizer=reg) - layer(images) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testConv2DNoBias(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.Conv2D( - 32, [3, 3], activation=tf.nn.relu, use_bias=False) - output = layer(images) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'conv2d/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) - self.assertEqual(layer.bias, None) - - def testDilatedConv2D(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.Conv2D(32, [3, 3], dilation_rate=3) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 1, 3, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - # Test tuple dilation rate - layer = conv_layers.Conv2D(32, [3, 3], dilation_rate=(1, 3)) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, height - 2, 3, 32]) - - def testFunctionalConv2DReuse(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.conv2d(images, 32, [3, 3], name='conv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - conv_layers.conv2d(images, 32, [3, 3], name='conv1', reuse=True) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - - def 
testFunctionalConv2DReuseFromScope(self): - with tf.Graph().as_default(): - with tf.compat.v1.variable_scope('scope'): + def testInvalidStrides(self): height, width = 7, 9 images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.conv2d(images, 32, [3, 3], name='conv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - with tf.compat.v1.variable_scope('scope', reuse=True): - conv_layers.conv2d(images, 32, [3, 3], name='conv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - - def testFunctionalConv2DInitializerFromScope(self): - with tf.Graph().as_default(), self.cached_session(): - with tf.compat.v1.variable_scope( - 'scope', initializer=tf.compat.v1.ones_initializer()): + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.conv2d(images, 32, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.conv2d(images, 32, 3, strides=None) + + def testInvalidKernelSize(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.conv2d(images, 32, (1, 2, 3)) + + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.conv2d(images, 32, None) + + def testCreateConv2D(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.Conv2D(32, [3, 3], activation=tf.nn.relu) + output = layer(images) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "conv2d/Relu") + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testConv2DFloat16(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4), dtype="float16") + output = conv_layers.conv2d(images, 32, [3, 3], activation=tf.nn.relu) + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + + def testCreateConv2DIntegerKernelSize(self): height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.Conv2D(32, 3) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv2DChannelsFirst(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, 4, height, width)) + layer = conv_layers.Conv2D(32, [3, 3], data_format="channels_first") + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, 32, height - 2, width - 2] + ) + self.assertListEqual( + layer.kernel.get_shape().as_list(), [3, 3, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testUnknownInputChannels(self): + with tf.Graph().as_default(): + images = tf.compat.v1.placeholder(tf.float32, (5, 7, 9, None)) + layer = conv_layers.Conv2D(32, [3, 3], activation=tf.nn.relu) + with self.assertRaisesRegex( + ValueError, + "The channel dimension of the inputs " + "should be defined. 
The input_shape received is", + ): + _ = layer(images) + + images = tf.compat.v1.placeholder(tf.float32, (5, None, 7, 9)) + layer = conv_layers.Conv2D(32, [3, 3], data_format="channels_first") + with self.assertRaisesRegex( + ValueError, + "The channel dimension of the inputs " + "should be defined. The input_shape received is", + ): + _ = layer(images) + + def testConv2DPaddingSame(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 32), seed=1) + layer = conv_layers.Conv2D(64, images.get_shape()[1:3], padding="same") + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height, width, 64] + ) + + def testCreateConvWithStrides(self): + height, width = 6, 8 + # Test strides tuple images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.conv2d(images, 32, [3, 3], name='conv1') - weights = tf.compat.v1.trainable_variables() - # Check the names of weights in order. - self.assertTrue('kernel' in weights[0].name) - self.assertTrue('bias' in weights[1].name) - self.evaluate(tf.compat.v1.global_variables_initializer()) - weights = self.evaluate(weights) - # Check that the kernel weights got initialized to ones (from scope) - self.assertAllClose(weights[0], np.ones((3, 3, 3, 32))) - # Check that the bias still got initialized to zeros. - self.assertAllClose(weights[1], np.zeros((32))) - - def testFunctionalConv2DNoReuse(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.conv2d(images, 32, [3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - conv_layers.conv2d(images, 32, [3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 4) - - def testConstraints(self): - # Conv1D - k_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - conv1d = conv_layers.Conv1D(2, 3, - kernel_constraint=k_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3, 5), seed=1) - conv1d(inputs) - self.assertEqual(conv1d.kernel_constraint, k_constraint) - self.assertEqual(conv1d.bias_constraint, b_constraint) - - # Conv2D - k_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - conv2d = conv_layers.Conv2D(2, 3, - kernel_constraint=k_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3, 3, 5), seed=1) - conv2d(inputs) - self.assertEqual(conv2d.kernel_constraint, k_constraint) - self.assertEqual(conv2d.bias_constraint, b_constraint) - - # Conv3D - k_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - conv3d = conv_layers.Conv3D(2, 3, - kernel_constraint=k_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3, 3, 3, 5), seed=1) - conv3d(inputs) - self.assertEqual(conv3d.kernel_constraint, k_constraint) - self.assertEqual(conv3d.bias_constraint, b_constraint) - - def testConv3DChannelsFirst(self): - # Test case for GitHub issue 15655 - with tf.Graph().as_default(): - images = tf.compat.v1.placeholder( - dtype=tf.float32, shape=[None, 1, 32, 32, 32]) - conv_layers.conv3d(images, 32, 9, data_format='channels_first') + layer = conv_layers.Conv2D(32, [3, 3], strides=(2, 2), padding="same") + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width / 2, 32] + ) + + # Test strides integer + layer = conv_layers.Conv2D(32, [3, 3], strides=2, padding="same") + output = layer(images) + self.assertListEqual( + 
output.get_shape().as_list(), [5, height / 2, width / 2, 32] + ) + + # Test unequal strides + layer = conv_layers.Conv2D(32, [3, 3], strides=(2, 1), padding="same") + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width, 32] + ) + + def testCreateConv1D(self): + width = 7 + data = tf.random.uniform((5, width, 4)) + layer = conv_layers.Conv1D(32, 3, activation=tf.nn.relu) + output = layer(data) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "conv1d/Relu") + self.assertListEqual(output.get_shape().as_list(), [5, width - 2, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testConv1DFloat16(self): + width = 7 + data = tf.random.uniform((5, width, 4), dtype="float16") + output = conv_layers.conv1d(data, 32, 3, activation=tf.nn.relu) + self.assertListEqual(output.get_shape().as_list(), [5, width - 2, 32]) + + def testCreateConv1DChannelsFirst(self): + with tf.Graph().as_default(): + width = 7 + data = tf.random.uniform((5, 4, width)) + layer = conv_layers.Conv1D(32, 3, data_format="channels_first") + output = layer(data) + self.assertListEqual( + output.get_shape().as_list(), [5, 32, width - 2] + ) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testUnknownInputChannelsConv1D(self): + with tf.Graph().as_default(): + data = tf.compat.v1.placeholder(tf.float32, (5, 4, None)) + layer = conv_layers.Conv1D(32, 3, activation=tf.nn.relu) + with self.assertRaisesRegex( + ValueError, + "The channel dimension of the inputs " + "should be defined. The input_shape received is", + ): + _ = layer(data) + + data = tf.compat.v1.placeholder(tf.float32, (5, None, 4)) + layer = conv_layers.Conv1D(32, 3, data_format="channels_first") + with self.assertRaisesRegex( + ValueError, + "The channel dimension of the inputs " + "should be defined. The input_shape received is", + ): + _ = layer(data) + + def testCreateConv3D(self): + depth, height, width = 6, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 4)) + layer = conv_layers.Conv3D(32, [3, 3, 3], activation=tf.nn.relu) + output = layer(volumes) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "conv3d/Relu") + self.assertListEqual( + output.get_shape().as_list(), + [5, depth - 2, height - 2, width - 2, 32], + ) + self.assertListEqual( + layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testUnknownInputChannelsConv3D(self): + with tf.Graph().as_default(): + volumes = tf.compat.v1.placeholder(tf.float32, (5, 6, 7, 9, None)) + layer = conv_layers.Conv3D(32, [3, 3, 3], activation=tf.nn.relu) + with self.assertRaisesRegex( + ValueError, + "The channel dimension of the inputs " + "should be defined. 
The input_shape received is", + ): + _ = layer(volumes) + + def testConv2DKernelRegularizer(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2D(32, [3, 3], kernel_regularizer=reg) + layer(images) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testConv2DBiasRegularizer(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2D(32, [3, 3], bias_regularizer=reg) + layer(images) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testConv2DNoBias(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.Conv2D( + 32, [3, 3], activation=tf.nn.relu, use_bias=False + ) + output = layer(images) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "conv2d/Relu") + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertEqual(layer.bias, None) + + def testDilatedConv2D(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.Conv2D(32, [3, 3], dilation_rate=3) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 1, 3, 32]) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + # Test tuple dilation rate + layer = conv_layers.Conv2D(32, [3, 3], dilation_rate=(1, 3)) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, 3, 32] + ) + + def testFunctionalConv2DReuse(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d(images, 32, [3, 3], name="conv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + conv_layers.conv2d(images, 32, [3, 3], name="conv1", reuse=True) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + + def testFunctionalConv2DReuseFromScope(self): + with tf.Graph().as_default(): + with tf.compat.v1.variable_scope("scope"): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d(images, 32, [3, 3], name="conv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + with tf.compat.v1.variable_scope("scope", reuse=True): + conv_layers.conv2d(images, 32, [3, 3], name="conv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + + def testFunctionalConv2DInitializerFromScope(self): + with tf.Graph().as_default(), self.cached_session(): + with tf.compat.v1.variable_scope( + "scope", initializer=tf.compat.v1.ones_initializer() + ): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d(images, 32, [3, 3], name="conv1") + weights = tf.compat.v1.trainable_variables() + # Check 
the names of weights in order. + self.assertTrue("kernel" in weights[0].name) + self.assertTrue("bias" in weights[1].name) + self.evaluate(tf.compat.v1.global_variables_initializer()) + weights = self.evaluate(weights) + # Check that the kernel weights got initialized to ones (from + # scope) + self.assertAllClose(weights[0], np.ones((3, 3, 3, 32))) + # Check that the bias still got initialized to zeros. + self.assertAllClose(weights[1], np.zeros((32))) + + def testFunctionalConv2DNoReuse(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d(images, 32, [3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + conv_layers.conv2d(images, 32, [3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 4) + + def testConstraints(self): + # Conv1D + k_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + conv1d = conv_layers.Conv1D( + 2, 3, kernel_constraint=k_constraint, bias_constraint=b_constraint + ) + inputs = tf.random.uniform((5, 3, 5), seed=1) + conv1d(inputs) + self.assertEqual(conv1d.kernel_constraint, k_constraint) + self.assertEqual(conv1d.bias_constraint, b_constraint) + + # Conv2D + k_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + conv2d = conv_layers.Conv2D( + 2, 3, kernel_constraint=k_constraint, bias_constraint=b_constraint + ) + inputs = tf.random.uniform((5, 3, 3, 5), seed=1) + conv2d(inputs) + self.assertEqual(conv2d.kernel_constraint, k_constraint) + self.assertEqual(conv2d.bias_constraint, b_constraint) + + # Conv3D + k_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + conv3d = conv_layers.Conv3D( + 2, 3, kernel_constraint=k_constraint, bias_constraint=b_constraint + ) + inputs = tf.random.uniform((5, 3, 3, 3, 5), seed=1) + conv3d(inputs) + self.assertEqual(conv3d.kernel_constraint, k_constraint) + self.assertEqual(conv3d.bias_constraint, b_constraint) + + def testConv3DChannelsFirst(self): + # Test case for GitHub issue 15655 + with tf.Graph().as_default(): + images = tf.compat.v1.placeholder( + dtype=tf.float32, shape=[None, 1, 32, 32, 32] + ) + conv_layers.conv3d(images, 32, 9, data_format="channels_first") class SeparableConv1DTest(tf.test.TestCase): + def testInvalidDataFormat(self): + length = 9 + data = tf.random.uniform((5, length, 3), seed=1) + with self.assertRaisesRegex(ValueError, "data_format"): + conv_layers.separable_conv1d(data, 32, 3, data_format="invalid") + + def testInvalidStrides(self): + length = 9 + data = tf.random.uniform((5, length, 3), seed=1) + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.separable_conv1d(data, 32, 3, strides=(1, 2)) + + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.separable_conv1d(data, 32, 3, strides=None) - def testInvalidDataFormat(self): - length = 9 - data = tf.random.uniform((5, length, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'data_format'): - conv_layers.separable_conv1d(data, 32, 3, data_format='invalid') - - def testInvalidStrides(self): - length = 9 - data = tf.random.uniform((5, length, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.separable_conv1d(data, 32, 3, strides=(1, 2)) - - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.separable_conv1d(data, 32, 3, strides=None) - - def testInvalidKernelSize(self): - length = 9 - data = tf.random.uniform((5, length, 3), seed=1) - 
with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.separable_conv1d(data, 32, (1, 2)) - - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.separable_conv1d(data, 32, None) - - def testCreateSeparableConv1D(self): - length = 9 - data = tf.random.uniform((5, length, 4)) - layer = conv_layers.SeparableConv1D(32, 3, activation=tf.nn.relu) - output = layer(data) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'separable_conv1d/Relu') - self.assertEqual(output.get_shape().as_list(), [5, length - 2, 32]) - self.assertEqual(layer.depthwise_kernel.get_shape().as_list(), [3, 4, 1]) - self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 4, 32]) - self.assertEqual(layer.bias.get_shape().as_list(), [32]) - - def testCreateSeparableConv1DDepthMultiplier(self): - length = 9 - data = tf.random.uniform((5, length, 4)) - layer = conv_layers.SeparableConv1D(32, 3, depth_multiplier=2) - output = layer(data) - self.assertEqual(output.get_shape().as_list(), [5, length - 2, 32]) - self.assertEqual(layer.depthwise_kernel.get_shape().as_list(), [3, 4, 2]) - self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 8, 32]) - self.assertEqual(layer.bias.get_shape().as_list(), [32]) - - def testCreateSeparableConv1DChannelsFirst(self): - with tf.Graph().as_default(): - length = 9 - data = tf.random.uniform((5, 4, length)) - layer = conv_layers.SeparableConv1D(32, 3, data_format='channels_first') - output = layer(data) - self.assertEqual(output.get_shape().as_list(), [5, 32, length - 2]) - self.assertEqual(layer.depthwise_kernel.get_shape().as_list(), [3, 4, 1]) - self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 4, 32]) - self.assertEqual(layer.bias.get_shape().as_list(), [32]) - - def testSeparableConv1DPaddingSame(self): - length = 9 - data = tf.random.uniform((5, length, 32), seed=1) - layer = conv_layers.SeparableConv1D( - 64, length, padding='same') - output = layer(data) - self.assertEqual(output.get_shape().as_list(), [5, length, 64]) - - def testCreateSeparableConv1DWithStrides(self): - length = 10 - data = tf.random.uniform((5, length, 3), seed=1) - layer = conv_layers.SeparableConv1D(32, 3, strides=2, padding='same') - output = layer(data) - self.assertEqual(output.get_shape().as_list(), [5, length // 2, 32]) - - def testCreateSeparableConv1DWithStridesChannelsFirst(self): - with tf.Graph().as_default(): - data_format = 'channels_first' - length = 10 - data = tf.random.uniform((5, 3, length), seed=1) - layer = conv_layers.SeparableConv1D( - 32, 3, strides=2, padding='same', data_format=data_format) - output = layer(data) - self.assertEqual(output.get_shape().as_list(), [5, 32, length // 2]) - - def testFunctionalConv1DReuse(self): - with tf.Graph().as_default(): - length = 10 - data = tf.random.uniform((5, length, 3), seed=1) - conv_layers.separable_conv1d(data, 32, 3, name='sepconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - conv_layers.separable_conv1d(data, 32, 3, name='sepconv1', reuse=True) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - - def testFunctionalConv1DReuseFromScope(self): - with tf.Graph().as_default(): - with tf.compat.v1.variable_scope('scope'): + def testInvalidKernelSize(self): + length = 9 + data = tf.random.uniform((5, length, 3), seed=1) + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.separable_conv1d(data, 32, (1, 2)) + + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.separable_conv1d(data, 32, 
None) + + def testCreateSeparableConv1D(self): + length = 9 + data = tf.random.uniform((5, length, 4)) + layer = conv_layers.SeparableConv1D(32, 3, activation=tf.nn.relu) + output = layer(data) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "separable_conv1d/Relu") + self.assertEqual(output.get_shape().as_list(), [5, length - 2, 32]) + self.assertEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 4, 1] + ) + self.assertEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 4, 32] + ) + self.assertEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv1DDepthMultiplier(self): + length = 9 + data = tf.random.uniform((5, length, 4)) + layer = conv_layers.SeparableConv1D(32, 3, depth_multiplier=2) + output = layer(data) + self.assertEqual(output.get_shape().as_list(), [5, length - 2, 32]) + self.assertEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 4, 2] + ) + self.assertEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 8, 32] + ) + self.assertEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv1DChannelsFirst(self): + with tf.Graph().as_default(): + length = 9 + data = tf.random.uniform((5, 4, length)) + layer = conv_layers.SeparableConv1D( + 32, 3, data_format="channels_first" + ) + output = layer(data) + self.assertEqual(output.get_shape().as_list(), [5, 32, length - 2]) + self.assertEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 4, 1] + ) + self.assertEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 4, 32] + ) + self.assertEqual(layer.bias.get_shape().as_list(), [32]) + + def testSeparableConv1DPaddingSame(self): + length = 9 + data = tf.random.uniform((5, length, 32), seed=1) + layer = conv_layers.SeparableConv1D(64, length, padding="same") + output = layer(data) + self.assertEqual(output.get_shape().as_list(), [5, length, 64]) + + def testCreateSeparableConv1DWithStrides(self): length = 10 data = tf.random.uniform((5, length, 3), seed=1) - conv_layers.separable_conv1d(data, 32, 3, name='sepconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - with tf.compat.v1.variable_scope('scope', reuse=True): - conv_layers.separable_conv1d(data, 32, 3, name='sepconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - - def testFunctionalConv1DNoReuse(self): - with tf.Graph().as_default(): - length = 10 - data = tf.random.uniform((5, length, 3), seed=1) - conv_layers.separable_conv1d(data, 32, 3) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - conv_layers.separable_conv1d(data, 32, 3) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 6) - - def testSeparableConv1DDepthwiseRegularizer(self): - with tf.Graph().as_default(): - length = 9 - data = tf.random.uniform((5, length, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.SeparableConv1D(32, 3, depthwise_regularizer=reg) - layer(data) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testSeparableConv1DPointwiseRegularizer(self): - with tf.Graph().as_default(): - length = 9 - data = tf.random.uniform((5, length, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.SeparableConv1D(32, 3, pointwise_regularizer=reg) - layer(data) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - 
self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testSeparableConv1DBiasRegularizer(self): - with tf.Graph().as_default(): - length = 9 - data = tf.random.uniform((5, length, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.SeparableConv1D(32, 3, bias_regularizer=reg) - layer(data) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testSeparableConv1DNoBias(self): - with tf.Graph().as_default(): - length = 9 - data = tf.random.uniform((5, length, 4)) - layer = conv_layers.SeparableConv1D( - 32, 3, activation=tf.nn.relu, use_bias=False) - output = layer(data) - self.assertEqual(output.op.name, 'separable_conv1d/Relu') - self.assertEqual(layer.bias, None) - - def testConstraints(self): - d_constraint = lambda x: x / tf.reduce_sum(x) - p_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - layer = conv_layers.SeparableConv1D(2, 3, - depthwise_constraint=d_constraint, - pointwise_constraint=p_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3, 5), seed=1) - layer(inputs) - self.assertEqual(layer.depthwise_constraint, d_constraint) - self.assertEqual(layer.pointwise_constraint, p_constraint) - self.assertEqual(layer.bias_constraint, b_constraint) + layer = conv_layers.SeparableConv1D(32, 3, strides=2, padding="same") + output = layer(data) + self.assertEqual(output.get_shape().as_list(), [5, length // 2, 32]) + + def testCreateSeparableConv1DWithStridesChannelsFirst(self): + with tf.Graph().as_default(): + data_format = "channels_first" + length = 10 + data = tf.random.uniform((5, 3, length), seed=1) + layer = conv_layers.SeparableConv1D( + 32, 3, strides=2, padding="same", data_format=data_format + ) + output = layer(data) + self.assertEqual(output.get_shape().as_list(), [5, 32, length // 2]) + + def testFunctionalConv1DReuse(self): + with tf.Graph().as_default(): + length = 10 + data = tf.random.uniform((5, length, 3), seed=1) + conv_layers.separable_conv1d(data, 32, 3, name="sepconv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + conv_layers.separable_conv1d( + data, 32, 3, name="sepconv1", reuse=True + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + + def testFunctionalConv1DReuseFromScope(self): + with tf.Graph().as_default(): + with tf.compat.v1.variable_scope("scope"): + length = 10 + data = tf.random.uniform((5, length, 3), seed=1) + conv_layers.separable_conv1d(data, 32, 3, name="sepconv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + with tf.compat.v1.variable_scope("scope", reuse=True): + conv_layers.separable_conv1d(data, 32, 3, name="sepconv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + + def testFunctionalConv1DNoReuse(self): + with tf.Graph().as_default(): + length = 10 + data = tf.random.uniform((5, length, 3), seed=1) + conv_layers.separable_conv1d(data, 32, 3) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + conv_layers.separable_conv1d(data, 32, 3) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 6) + + def testSeparableConv1DDepthwiseRegularizer(self): + with tf.Graph().as_default(): + length = 9 + data = tf.random.uniform((5, length, 
4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv1D( + 32, 3, depthwise_regularizer=reg + ) + layer(data) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testSeparableConv1DPointwiseRegularizer(self): + with tf.Graph().as_default(): + length = 9 + data = tf.random.uniform((5, length, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv1D( + 32, 3, pointwise_regularizer=reg + ) + layer(data) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testSeparableConv1DBiasRegularizer(self): + with tf.Graph().as_default(): + length = 9 + data = tf.random.uniform((5, length, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv1D(32, 3, bias_regularizer=reg) + layer(data) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testSeparableConv1DNoBias(self): + with tf.Graph().as_default(): + length = 9 + data = tf.random.uniform((5, length, 4)) + layer = conv_layers.SeparableConv1D( + 32, 3, activation=tf.nn.relu, use_bias=False + ) + output = layer(data) + self.assertEqual(output.op.name, "separable_conv1d/Relu") + self.assertEqual(layer.bias, None) + + def testConstraints(self): + d_constraint = lambda x: x / tf.reduce_sum(x) + p_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + layer = conv_layers.SeparableConv1D( + 2, + 3, + depthwise_constraint=d_constraint, + pointwise_constraint=p_constraint, + bias_constraint=b_constraint, + ) + inputs = tf.random.uniform((5, 3, 5), seed=1) + layer(inputs) + self.assertEqual(layer.depthwise_constraint, d_constraint) + self.assertEqual(layer.pointwise_constraint, p_constraint) + self.assertEqual(layer.bias_constraint, b_constraint) class SeparableConv2DTest(tf.test.TestCase): + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegex(ValueError, "data_format"): + conv_layers.separable_conv2d(images, 32, 3, data_format="invalid") - def testInvalidDataFormat(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'data_format'): - conv_layers.separable_conv2d(images, 32, 3, data_format='invalid') - - def testInvalidStrides(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.separable_conv2d(images, 32, 3, strides=(1, 2, 3)) - - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.separable_conv2d(images, 32, 3, strides=None) - - def testInvalidKernelSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.separable_conv2d(images, 32, (1, 2, 3)) - - with 
self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.separable_conv2d(images, 32, None) - - def testCreateSeparableConv2D(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.SeparableConv2D(32, [3, 3], activation=tf.nn.relu) - output = layer(images) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'separable_conv2d/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - self.assertListEqual(layer.depthwise_kernel.get_shape().as_list(), - [3, 3, 4, 1]) - self.assertListEqual(layer.pointwise_kernel.get_shape().as_list(), - [1, 1, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testCreateSeparableConv2DDepthMultiplier(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.SeparableConv2D(32, [3, 3], depth_multiplier=2) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - self.assertListEqual(layer.depthwise_kernel.get_shape().as_list(), - [3, 3, 4, 2]) - self.assertListEqual(layer.pointwise_kernel.get_shape().as_list(), - [1, 1, 8, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testCreateSeparableConv2DIntegerKernelSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.SeparableConv2D(32, 3) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - self.assertListEqual(layer.depthwise_kernel.get_shape().as_list(), - [3, 3, 4, 1]) - self.assertListEqual(layer.pointwise_kernel.get_shape().as_list(), - [1, 1, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testCreateSeparableConv2DChannelsFirst(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, 4, height, width)) - layer = conv_layers.SeparableConv2D( - 32, [3, 3], data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, 32, height - 2, width - 2]) - self.assertListEqual(layer.depthwise_kernel.get_shape().as_list(), - [3, 3, 4, 1]) - self.assertListEqual(layer.pointwise_kernel.get_shape().as_list(), - [1, 1, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testSeparableConv2DPaddingSame(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 32), seed=1) - layer = conv_layers.SeparableConv2D( - 64, images.get_shape()[1:3], padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64]) - - def testCreateSeparableConvWithStrides(self): - with tf.Graph().as_default(): - height, width = 6, 8 - # Test strides tuple - images = tf.random.uniform((5, height, width, 3), seed=1) - layer = conv_layers.SeparableConv2D( - 32, [3, 3], strides=(2, 2), padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width / 2, 32]) - - # Test strides integer - layer = conv_layers.SeparableConv2D(32, [3, 3], strides=2, padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width / 2, 32]) - - # Test unequal strides - layer = conv_layers.SeparableConv2D( - 32, [3, 3], strides=(2, 1), padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width, 32]) - - def 
testCreateSeparableConvWithStridesChannelsFirst(self): - with tf.Graph().as_default(): - data_format = 'channels_first' - height, width = 6, 8 - # Test strides tuple - images = tf.random.uniform((5, 3, height, width), seed=1) - layer = conv_layers.SeparableConv2D( - 32, [3, 3], strides=(2, 2), padding='same', data_format=data_format) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, 32, height / 2, width / 2]) - - # Test strides integer - layer = conv_layers.SeparableConv2D(32, [3, 3], strides=2, padding='same', - data_format=data_format) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, 32, height / 2, width / 2]) - - # Test unequal strides - layer = conv_layers.SeparableConv2D( - 32, [3, 3], strides=(2, 1), padding='same', data_format=data_format) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, 32, height / 2, width]) - - def testFunctionalConv2DReuse(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - conv_layers.separable_conv2d( - images, 32, [3, 3], name='sepconv1', reuse=True) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - - def testFunctionalConv2DReuseFromScope(self): - with tf.Graph().as_default(): - with tf.compat.v1.variable_scope('scope'): + def testInvalidStrides(self): height, width = 7, 9 images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - with tf.compat.v1.variable_scope('scope', reuse=True): - conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - - def testFunctionalConv2DInitializerFromScope(self): - with tf.Graph().as_default(), self.cached_session(): - with tf.compat.v1.variable_scope( - 'scope', initializer=tf.compat.v1.ones_initializer()): + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.separable_conv2d(images, 32, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.separable_conv2d(images, 32, 3, strides=None) + + def testInvalidKernelSize(self): height, width = 7, 9 images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1') - weights = tf.compat.v1.trainable_variables() - # Check the names of weights in order. - self.assertTrue('depthwise_kernel' in weights[0].name) - self.assertTrue('pointwise_kernel' in weights[1].name) - self.assertTrue('bias' in weights[2].name) - self.evaluate(tf.compat.v1.global_variables_initializer()) - weights = self.evaluate(weights) - # Check that the kernel weights got initialized to ones (from scope) - self.assertAllClose(weights[0], np.ones((3, 3, 3, 1))) - self.assertAllClose(weights[1], np.ones((1, 1, 3, 32))) - # Check that the bias still got initialized to zeros. 
- self.assertAllClose(weights[2], np.zeros((32))) - - def testFunctionalConv2DNoReuse(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.separable_conv2d(images, 32, [3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) - conv_layers.separable_conv2d(images, 32, [3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 6) - - def testSeparableConv2DDepthwiseRegularizer(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.SeparableConv2D(32, [3, 3], depthwise_regularizer=reg) - layer(images) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testSeparableConv2DPointwiseRegularizer(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.SeparableConv2D(32, [3, 3], pointwise_regularizer=reg) - layer(images) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testSeparableConv2DBiasRegularizer(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.SeparableConv2D(32, [3, 3], bias_regularizer=reg) - layer(images) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testSeparableConv2DNoBias(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.SeparableConv2D( - 32, [3, 3], activation=tf.nn.relu, use_bias=False) - output = layer(images) - self.assertEqual(output.op.name, 'separable_conv2d/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height - 2, width - 2, 32]) - self.assertListEqual(layer.depthwise_kernel.get_shape().as_list(), - [3, 3, 4, 1]) - self.assertListEqual(layer.pointwise_kernel.get_shape().as_list(), - [1, 1, 4, 32]) - self.assertEqual(layer.bias, None) - - def testConstraints(self): - d_constraint = lambda x: x / tf.reduce_sum(x) - p_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - layer = conv_layers.SeparableConv2D(2, 3, - depthwise_constraint=d_constraint, - pointwise_constraint=p_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3, 3, 5), seed=1) - layer(inputs) - self.assertEqual(layer.depthwise_constraint, d_constraint) - self.assertEqual(layer.pointwise_constraint, p_constraint) - self.assertEqual(layer.bias_constraint, b_constraint) + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.separable_conv2d(images, 32, (1, 2, 3)) + + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.separable_conv2d(images, 32, None) + + def 
testCreateSeparableConv2D(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D(32, [3, 3], activation=tf.nn.relu) + output = layer(images) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "separable_conv2d/Relu") + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1] + ) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv2DDepthMultiplier(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D(32, [3, 3], depth_multiplier=2) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 2] + ) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 8, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv2DIntegerKernelSize(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D(32, 3) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1] + ) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateSeparableConv2DChannelsFirst(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, 4, height, width)) + layer = conv_layers.SeparableConv2D( + 32, [3, 3], data_format="channels_first" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, 32, height - 2, width - 2] + ) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1] + ) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testSeparableConv2DPaddingSame(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 32), seed=1) + layer = conv_layers.SeparableConv2D( + 64, images.get_shape()[1:3], padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height, width, 64] + ) + + def testCreateSeparableConvWithStrides(self): + with tf.Graph().as_default(): + height, width = 6, 8 + # Test strides tuple + images = tf.random.uniform((5, height, width, 3), seed=1) + layer = conv_layers.SeparableConv2D( + 32, [3, 3], strides=(2, 2), padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width / 2, 32] + ) + + # Test strides integer + layer = conv_layers.SeparableConv2D( + 32, [3, 3], strides=2, padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width / 2, 32] + ) + + # Test unequal strides + layer = conv_layers.SeparableConv2D( + 32, [3, 3], strides=(2, 1), padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width, 32] + ) + + def 
testCreateSeparableConvWithStridesChannelsFirst(self): + with tf.Graph().as_default(): + data_format = "channels_first" + height, width = 6, 8 + # Test strides tuple + images = tf.random.uniform((5, 3, height, width), seed=1) + layer = conv_layers.SeparableConv2D( + 32, + [3, 3], + strides=(2, 2), + padding="same", + data_format=data_format, + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, 32, height / 2, width / 2] + ) + + # Test strides integer + layer = conv_layers.SeparableConv2D( + 32, [3, 3], strides=2, padding="same", data_format=data_format + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, 32, height / 2, width / 2] + ) + + # Test unequal strides + layer = conv_layers.SeparableConv2D( + 32, + [3, 3], + strides=(2, 1), + padding="same", + data_format=data_format, + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, 32, height / 2, width] + ) + + def testFunctionalConv2DReuse(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.separable_conv2d(images, 32, [3, 3], name="sepconv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + conv_layers.separable_conv2d( + images, 32, [3, 3], name="sepconv1", reuse=True + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + + def testFunctionalConv2DReuseFromScope(self): + with tf.Graph().as_default(): + with tf.compat.v1.variable_scope("scope"): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.separable_conv2d( + images, 32, [3, 3], name="sepconv1" + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + with tf.compat.v1.variable_scope("scope", reuse=True): + conv_layers.separable_conv2d( + images, 32, [3, 3], name="sepconv1" + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + + def testFunctionalConv2DInitializerFromScope(self): + with tf.Graph().as_default(), self.cached_session(): + with tf.compat.v1.variable_scope( + "scope", initializer=tf.compat.v1.ones_initializer() + ): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.separable_conv2d( + images, 32, [3, 3], name="sepconv1" + ) + weights = tf.compat.v1.trainable_variables() + # Check the names of weights in order. + self.assertTrue("depthwise_kernel" in weights[0].name) + self.assertTrue("pointwise_kernel" in weights[1].name) + self.assertTrue("bias" in weights[2].name) + self.evaluate(tf.compat.v1.global_variables_initializer()) + weights = self.evaluate(weights) + # Check that the kernel weights got initialized to ones (from + # scope) + self.assertAllClose(weights[0], np.ones((3, 3, 3, 1))) + self.assertAllClose(weights[1], np.ones((1, 1, 3, 32))) + # Check that the bias still got initialized to zeros. 
+ self.assertAllClose(weights[2], np.zeros((32))) + + def testFunctionalConv2DNoReuse(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.separable_conv2d(images, 32, [3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 3) + conv_layers.separable_conv2d(images, 32, [3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 6) + + def testSeparableConv2DDepthwiseRegularizer(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv2D( + 32, [3, 3], depthwise_regularizer=reg + ) + layer(images) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testSeparableConv2DPointwiseRegularizer(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv2D( + 32, [3, 3], pointwise_regularizer=reg + ) + layer(images) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testSeparableConv2DBiasRegularizer(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.SeparableConv2D( + 32, [3, 3], bias_regularizer=reg + ) + layer(images) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testSeparableConv2DNoBias(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.SeparableConv2D( + 32, [3, 3], activation=tf.nn.relu, use_bias=False + ) + output = layer(images) + self.assertEqual(output.op.name, "separable_conv2d/Relu") + self.assertListEqual( + output.get_shape().as_list(), [5, height - 2, width - 2, 32] + ) + self.assertListEqual( + layer.depthwise_kernel.get_shape().as_list(), [3, 3, 4, 1] + ) + self.assertListEqual( + layer.pointwise_kernel.get_shape().as_list(), [1, 1, 4, 32] + ) + self.assertEqual(layer.bias, None) + + def testConstraints(self): + d_constraint = lambda x: x / tf.reduce_sum(x) + p_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + layer = conv_layers.SeparableConv2D( + 2, + 3, + depthwise_constraint=d_constraint, + pointwise_constraint=p_constraint, + bias_constraint=b_constraint, + ) + inputs = tf.random.uniform((5, 3, 3, 5), seed=1) + layer(inputs) + self.assertEqual(layer.depthwise_constraint, d_constraint) + self.assertEqual(layer.pointwise_constraint, p_constraint) + self.assertEqual(layer.bias_constraint, b_constraint) class Conv2DTransposeTest(tf.test.TestCase): + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + with 
self.assertRaisesRegex(ValueError, "data_format"): + conv_layers.conv2d_transpose(images, 32, 3, data_format="invalid") - def testInvalidDataFormat(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'data_format'): - conv_layers.conv2d_transpose(images, 32, 3, data_format='invalid') - - def testInvalidStrides(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.conv2d_transpose(images, 32, 3, strides=(1, 2, 3)) - - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.conv2d_transpose(images, 32, 3, strides=None) - - def testInvalidKernelSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.conv2d_transpose(images, 32, (1, 2, 3)) - - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.conv2d_transpose(images, 32, None) - - def testCreateConv2DTranspose(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.Conv2DTranspose(32, [3, 3], activation=tf.nn.relu) - output = layer(images) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'conv2d_transpose/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height + 2, width + 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testConv2DTransposeFloat16(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4), dtype='float16') - output = conv_layers.conv2d_transpose(images, 32, [3, 3], - activation=tf.nn.relu) - self.assertListEqual(output.get_shape().as_list(), - [5, height + 2, width + 2, 32]) - - def testCreateConv2DTransposeIntegerKernelSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.Conv2DTranspose(32, 3) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height + 2, width + 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testCreateConv2DTransposeChannelsFirst(self): - height, width = 7, 9 - images = tf.random.uniform((5, 4, height, width)) - layer = conv_layers.Conv2DTranspose( - 32, [3, 3], data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, 32, height + 2, width + 2]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) - self.assertListEqual(layer.bias.get_shape().as_list(), [32]) - - def testConv2DTransposePaddingSame(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 32), seed=1) - layer = conv_layers.Conv2DTranspose( - 64, images.get_shape()[1:3], padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64]) - - def testCreateConv2DTransposeWithStrides(self): - height, width = 6, 8 - # Test strides tuple - images = tf.random.uniform((5, height, width, 3), seed=1) - layer = conv_layers.Conv2DTranspose( - 32, [3, 3], strides=(2, 2), padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height * 2, width * 2, 32]) - - # Test strides integer - layer = conv_layers.Conv2DTranspose(32, [3, 3], strides=2, 
padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height * 2, width * 2, 32]) - - # Test unequal strides - layer = conv_layers.Conv2DTranspose( - 32, [3, 3], strides=(2, 1), padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height * 2, width, 32]) - - def testConv2DTransposeKernelRegularizer(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.Conv2DTranspose(32, [3, 3], kernel_regularizer=reg) - layer(images) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testConv2DTransposeBiasRegularizer(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.Conv2DTranspose(32, [3, 3], bias_regularizer=reg) - layer(images) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testConv2DTransposeNoBias(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = conv_layers.Conv2DTranspose( - 32, [3, 3], activation=tf.nn.relu, use_bias=False) - output = layer(images) - self.assertEqual(output.op.name, 'conv2d_transpose/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height + 2, width + 2, 32]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) - self.assertEqual(layer.bias, None) - - def testFunctionalConv2DTransposeReuse(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - conv_layers.conv2d_transpose( - images, 32, [3, 3], name='deconv1', reuse=True) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - - def testFunctionalConv2DTransposeReuseFromScope(self): - with tf.Graph().as_default(): - with tf.compat.v1.variable_scope('scope'): + def testInvalidStrides(self): height, width = 7, 9 images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - with tf.compat.v1.variable_scope('scope', reuse=True): - conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - - def testFunctionalConv2DTransposeInitializerFromScope(self): - with tf.Graph().as_default(), self.cached_session(): - with tf.compat.v1.variable_scope( - 'scope', initializer=tf.compat.v1.ones_initializer()): + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.conv2d_transpose(images, 32, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.conv2d_transpose(images, 32, 3, strides=None) + + def testInvalidKernelSize(self): height, width = 7, 9 images = tf.random.uniform((5, height, width, 3), seed=1) - 
conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1') - weights = tf.compat.v1.trainable_variables() - # Check the names of weights in order. - self.assertTrue('kernel' in weights[0].name) - self.assertTrue('bias' in weights[1].name) - self.evaluate(tf.compat.v1.global_variables_initializer()) - weights = self.evaluate(weights) - # Check that the kernel weights got initialized to ones (from scope) - self.assertAllClose(weights[0], np.ones((3, 3, 32, 3))) - # Check that the bias still got initialized to zeros. - self.assertAllClose(weights[1], np.zeros((32))) - - def testFunctionalConv2DTransposeNoReuse(self): - with tf.Graph().as_default(): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - conv_layers.conv2d_transpose(images, 32, [3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - conv_layers.conv2d_transpose(images, 32, [3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 4) - - def testConstraints(self): - k_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - layer = conv_layers.Conv2DTranspose(2, 3, - kernel_constraint=k_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3, 3, 5), seed=1) - layer(inputs) - self.assertEqual(layer.kernel_constraint, k_constraint) - self.assertEqual(layer.bias_constraint, b_constraint) + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.conv2d_transpose(images, 32, (1, 2, 3)) + + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.conv2d_transpose(images, 32, None) + + def testCreateConv2DTranspose(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.Conv2DTranspose(32, [3, 3], activation=tf.nn.relu) + output = layer(images) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "conv2d_transpose/Relu") + self.assertListEqual( + output.get_shape().as_list(), [5, height + 2, width + 2, 32] + ) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testConv2DTransposeFloat16(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4), dtype="float16") + output = conv_layers.conv2d_transpose( + images, 32, [3, 3], activation=tf.nn.relu + ) + self.assertListEqual( + output.get_shape().as_list(), [5, height + 2, width + 2, 32] + ) + + def testCreateConv2DTransposeIntegerKernelSize(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.Conv2DTranspose(32, 3) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height + 2, width + 2, 32] + ) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testCreateConv2DTransposeChannelsFirst(self): + height, width = 7, 9 + images = tf.random.uniform((5, 4, height, width)) + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], data_format="channels_first" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, 32, height + 2, width + 2] + ) + self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4]) + self.assertListEqual(layer.bias.get_shape().as_list(), [32]) + + def testConv2DTransposePaddingSame(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 32), seed=1) + layer = conv_layers.Conv2DTranspose( + 64, 
images.get_shape()[1:3], padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height, width, 64] + ) + + def testCreateConv2DTransposeWithStrides(self): + height, width = 6, 8 + # Test strides tuple + images = tf.random.uniform((5, height, width, 3), seed=1) + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], strides=(2, 2), padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height * 2, width * 2, 32] + ) + + # Test strides integer + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], strides=2, padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height * 2, width * 2, 32] + ) + + # Test unequal strides + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], strides=(2, 1), padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height * 2, width, 32] + ) + + def testConv2DTransposeKernelRegularizer(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], kernel_regularizer=reg + ) + layer(images) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testConv2DTransposeBiasRegularizer(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], bias_regularizer=reg + ) + layer(images) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testConv2DTransposeNoBias(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = conv_layers.Conv2DTranspose( + 32, [3, 3], activation=tf.nn.relu, use_bias=False + ) + output = layer(images) + self.assertEqual(output.op.name, "conv2d_transpose/Relu") + self.assertListEqual( + output.get_shape().as_list(), [5, height + 2, width + 2, 32] + ) + self.assertListEqual( + layer.kernel.get_shape().as_list(), [3, 3, 32, 4] + ) + self.assertEqual(layer.bias, None) + + def testFunctionalConv2DTransposeReuse(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d_transpose(images, 32, [3, 3], name="deconv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + conv_layers.conv2d_transpose( + images, 32, [3, 3], name="deconv1", reuse=True + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + + def testFunctionalConv2DTransposeReuseFromScope(self): + with tf.Graph().as_default(): + with tf.compat.v1.variable_scope("scope"): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d_transpose(images, 32, [3, 3], name="deconv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + with tf.compat.v1.variable_scope("scope", reuse=True): + conv_layers.conv2d_transpose(images, 32, [3, 3], name="deconv1") 
+ self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + + def testFunctionalConv2DTransposeInitializerFromScope(self): + with tf.Graph().as_default(), self.cached_session(): + with tf.compat.v1.variable_scope( + "scope", initializer=tf.compat.v1.ones_initializer() + ): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d_transpose(images, 32, [3, 3], name="deconv1") + weights = tf.compat.v1.trainable_variables() + # Check the names of weights in order. + self.assertTrue("kernel" in weights[0].name) + self.assertTrue("bias" in weights[1].name) + self.evaluate(tf.compat.v1.global_variables_initializer()) + weights = self.evaluate(weights) + # Check that the kernel weights got initialized to ones (from + # scope) + self.assertAllClose(weights[0], np.ones((3, 3, 32, 3))) + # Check that the bias still got initialized to zeros. + self.assertAllClose(weights[1], np.zeros((32))) + + def testFunctionalConv2DTransposeNoReuse(self): + with tf.Graph().as_default(): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + conv_layers.conv2d_transpose(images, 32, [3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + conv_layers.conv2d_transpose(images, 32, [3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 4) + + def testConstraints(self): + k_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + layer = conv_layers.Conv2DTranspose( + 2, 3, kernel_constraint=k_constraint, bias_constraint=b_constraint + ) + inputs = tf.random.uniform((5, 3, 3, 5), seed=1) + layer(inputs) + self.assertEqual(layer.kernel_constraint, k_constraint) + self.assertEqual(layer.bias_constraint, b_constraint) class Conv3DTransposeTest(tf.test.TestCase): + def testInvalidDataFormat(self): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) + with self.assertRaisesRegex(ValueError, "data_format"): + conv_layers.conv3d_transpose(volumes, 4, 3, data_format="invalid") + + def testInvalidStrides(self): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.conv3d_transpose(volumes, 4, 3, strides=(1, 2)) + + with self.assertRaisesRegex(ValueError, "strides"): + conv_layers.conv3d_transpose(volumes, 4, 3, strides=None) + + def testInvalidKernelSize(self): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.conv3d_transpose(volumes, 4, (1, 2)) + + with self.assertRaisesRegex(ValueError, "kernel_size"): + conv_layers.conv3d_transpose(volumes, 4, None) - def testInvalidDataFormat(self): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) - with self.assertRaisesRegex(ValueError, 'data_format'): - conv_layers.conv3d_transpose(volumes, 4, 3, data_format='invalid') - - def testInvalidStrides(self): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.conv3d_transpose(volumes, 4, 3, strides=(1, 2)) - - with self.assertRaisesRegex(ValueError, 'strides'): - conv_layers.conv3d_transpose(volumes, 4, 3, strides=None) - - def testInvalidKernelSize(self): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 
32), seed=1) - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.conv3d_transpose(volumes, 4, (1, 2)) - - with self.assertRaisesRegex(ValueError, 'kernel_size'): - conv_layers.conv3d_transpose(volumes, 4, None) - - def testCreateConv3DTranspose(self): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32)) - layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], activation=tf.nn.relu) - output = layer(volumes) - if not tf.executing_eagerly(): - self.assertEqual(output.op.name, 'conv3d_transpose/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, depth + 2, height + 2, width + 2, 4]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [4]) - - def testCreateConv3DTransposeIntegerKernelSize(self): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32)) - layer = conv_layers.Conv3DTranspose(4, 3) - output = layer(volumes) - self.assertListEqual(output.get_shape().as_list(), - [5, depth + 2, height + 2, width + 2, 4]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [4]) - - def testCreateConv3DTransposeChannelsFirst(self): - with tf.Graph().as_default(): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, 32, depth, height, width)) - layer = conv_layers.Conv3DTranspose( - 4, [3, 3, 3], data_format='channels_first') - output = layer(volumes) - self.assertListEqual(output.get_shape().as_list(), - [5, 4, depth + 2, height + 2, width + 2]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32]) - self.assertListEqual(layer.bias.get_shape().as_list(), [4]) - - def testConv3DTransposePaddingSame(self): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 64), seed=1) - layer = conv_layers.Conv3DTranspose( - 32, volumes.get_shape()[1:4], padding='same') - output = layer(volumes) - self.assertListEqual(output.get_shape().as_list(), - [5, depth, height, width, 32]) - - def testCreateConv3DTransposeWithStrides(self): - depth, height, width = 4, 6, 8 - # Test strides tuple. - volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) - layer = conv_layers.Conv3DTranspose( - 4, [3, 3, 3], strides=(2, 2, 2), padding='same') - output = layer(volumes) - self.assertListEqual(output.get_shape().as_list(), - [5, depth * 2, height * 2, width * 2, 4]) - - # Test strides integer. - layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], strides=2, padding='same') - output = layer(volumes) - self.assertListEqual(output.get_shape().as_list(), - [5, depth * 2, height * 2, width * 2, 4]) - - # Test unequal strides. 
- layer = conv_layers.Conv3DTranspose( - 4, [3, 3, 3], strides=(2, 1, 1), padding='same') - output = layer(volumes) - self.assertListEqual(output.get_shape().as_list(), - [5, depth * 2, height, width, 4]) - - def testConv3DTransposeKernelRegularizer(self): - with tf.Graph().as_default(): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], kernel_regularizer=reg) - layer(volumes) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testConv3DTransposeBiasRegularizer(self): - with tf.Graph().as_default(): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32)) - reg = lambda x: 0.1 * tf.reduce_sum(x) - layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], bias_regularizer=reg) - layer(volumes) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in layer.variables]) - self.assertListEqual( - self.evaluate(layer.losses), self.evaluate(loss_keys)) - - def testConv3DTransposeNoBias(self): - with tf.Graph().as_default(): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32)) - layer = conv_layers.Conv3DTranspose( - 4, [3, 3, 3], activation=tf.nn.relu, use_bias=False) - output = layer(volumes) - self.assertEqual(output.op.name, 'conv3d_transpose/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, depth + 2, height + 2, width + 2, 4]) - self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32]) - self.assertEqual(layer.bias, None) - - def testFunctionalConv3DTransposeReuse(self): - with tf.Graph().as_default(): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) - conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - conv_layers.conv3d_transpose( - volumes, 4, [3, 3, 3], name='deconv1', reuse=True) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - - def testFunctionalConv3DTransposeReuseFromScope(self): - with tf.Graph().as_default(): - with tf.compat.v1.variable_scope('scope'): + def testCreateConv3DTranspose(self): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32)) + layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], activation=tf.nn.relu) + output = layer(volumes) + if not tf.executing_eagerly(): + self.assertEqual(output.op.name, "conv3d_transpose/Relu") + self.assertListEqual( + output.get_shape().as_list(), + [5, depth + 2, height + 2, width + 2, 4], + ) + self.assertListEqual( + layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [4]) + + def testCreateConv3DTransposeIntegerKernelSize(self): depth, height, width = 5, 7, 9 - volumes = tf.random.uniform( - (5, depth, height, width, 32), seed=1) - conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1') - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - with tf.compat.v1.variable_scope('scope', reuse=True): - conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1') - 
self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - - def testFunctionalConv3DTransposeInitializerFromScope(self): - with tf.Graph().as_default(), self.cached_session(): - with tf.compat.v1.variable_scope( - 'scope', initializer=tf.compat.v1.ones_initializer()): + volumes = tf.random.uniform((5, depth, height, width, 32)) + layer = conv_layers.Conv3DTranspose(4, 3) + output = layer(volumes) + self.assertListEqual( + output.get_shape().as_list(), + [5, depth + 2, height + 2, width + 2, 4], + ) + self.assertListEqual( + layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [4]) + + def testCreateConv3DTransposeChannelsFirst(self): + with tf.Graph().as_default(): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, 32, depth, height, width)) + layer = conv_layers.Conv3DTranspose( + 4, [3, 3, 3], data_format="channels_first" + ) + output = layer(volumes) + self.assertListEqual( + output.get_shape().as_list(), + [5, 4, depth + 2, height + 2, width + 2], + ) + self.assertListEqual( + layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32] + ) + self.assertListEqual(layer.bias.get_shape().as_list(), [4]) + + def testConv3DTransposePaddingSame(self): depth, height, width = 5, 7, 9 - volumes = tf.random.uniform( - (5, depth, height, width, 32), seed=1) - conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1') - weights = tf.compat.v1.trainable_variables() - # Check the names of weights in order. - self.assertTrue('kernel' in weights[0].name) - self.assertTrue('bias' in weights[1].name) - self.evaluate(tf.compat.v1.global_variables_initializer()) - weights = self.evaluate(weights) - # Check that the kernel weights got initialized to ones (from scope) - self.assertAllClose(weights[0], np.ones((3, 3, 3, 4, 32))) - # Check that the bias still got initialized to zeros. - self.assertAllClose(weights[1], np.zeros((4))) - - def testFunctionalConv3DTransposeNoReuse(self): - with tf.Graph().as_default(): - depth, height, width = 5, 7, 9 - volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) - conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) - conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3]) - self.assertEqual(len(tf.compat.v1.trainable_variables()), 4) - - def testConstraints(self): - k_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - layer = conv_layers.Conv3DTranspose(2, 3, - kernel_constraint=k_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3, 3, 3, 5), seed=1) - layer(inputs) - self.assertEqual(layer.kernel_constraint, k_constraint) - self.assertEqual(layer.bias_constraint, b_constraint) - - -if __name__ == '__main__': - tf.test.main() + volumes = tf.random.uniform((5, depth, height, width, 64), seed=1) + layer = conv_layers.Conv3DTranspose( + 32, volumes.get_shape()[1:4], padding="same" + ) + output = layer(volumes) + self.assertListEqual( + output.get_shape().as_list(), [5, depth, height, width, 32] + ) + + def testCreateConv3DTransposeWithStrides(self): + depth, height, width = 4, 6, 8 + # Test strides tuple. + volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) + layer = conv_layers.Conv3DTranspose( + 4, [3, 3, 3], strides=(2, 2, 2), padding="same" + ) + output = layer(volumes) + self.assertListEqual( + output.get_shape().as_list(), + [5, depth * 2, height * 2, width * 2, 4], + ) + + # Test strides integer. 
+ layer = conv_layers.Conv3DTranspose( + 4, [3, 3, 3], strides=2, padding="same" + ) + output = layer(volumes) + self.assertListEqual( + output.get_shape().as_list(), + [5, depth * 2, height * 2, width * 2, 4], + ) + + # Test unequal strides. + layer = conv_layers.Conv3DTranspose( + 4, [3, 3, 3], strides=(2, 1, 1), padding="same" + ) + output = layer(volumes) + self.assertListEqual( + output.get_shape().as_list(), [5, depth * 2, height, width, 4] + ) + + def testConv3DTransposeKernelRegularizer(self): + with tf.Graph().as_default(): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv3DTranspose( + 4, [3, 3, 3], kernel_regularizer=reg + ) + layer(volumes) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testConv3DTransposeBiasRegularizer(self): + with tf.Graph().as_default(): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32)) + reg = lambda x: 0.1 * tf.reduce_sum(x) + layer = conv_layers.Conv3DTranspose( + 4, [3, 3, 3], bias_regularizer=reg + ) + layer(volumes) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in layer.variables]) + self.assertListEqual( + self.evaluate(layer.losses), self.evaluate(loss_keys) + ) + + def testConv3DTransposeNoBias(self): + with tf.Graph().as_default(): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32)) + layer = conv_layers.Conv3DTranspose( + 4, [3, 3, 3], activation=tf.nn.relu, use_bias=False + ) + output = layer(volumes) + self.assertEqual(output.op.name, "conv3d_transpose/Relu") + self.assertListEqual( + output.get_shape().as_list(), + [5, depth + 2, height + 2, width + 2, 4], + ) + self.assertListEqual( + layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32] + ) + self.assertEqual(layer.bias, None) + + def testFunctionalConv3DTransposeReuse(self): + with tf.Graph().as_default(): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) + conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name="deconv1") + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + conv_layers.conv3d_transpose( + volumes, 4, [3, 3, 3], name="deconv1", reuse=True + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + + def testFunctionalConv3DTransposeReuseFromScope(self): + with tf.Graph().as_default(): + with tf.compat.v1.variable_scope("scope"): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform( + (5, depth, height, width, 32), seed=1 + ) + conv_layers.conv3d_transpose( + volumes, 4, [3, 3, 3], name="deconv1" + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + with tf.compat.v1.variable_scope("scope", reuse=True): + conv_layers.conv3d_transpose( + volumes, 4, [3, 3, 3], name="deconv1" + ) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + + def testFunctionalConv3DTransposeInitializerFromScope(self): + with tf.Graph().as_default(), self.cached_session(): + with tf.compat.v1.variable_scope( + "scope", initializer=tf.compat.v1.ones_initializer() + ): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform( + (5, depth, 
height, width, 32), seed=1 + ) + conv_layers.conv3d_transpose( + volumes, 4, [3, 3, 3], name="deconv1" + ) + weights = tf.compat.v1.trainable_variables() + # Check the names of weights in order. + self.assertTrue("kernel" in weights[0].name) + self.assertTrue("bias" in weights[1].name) + self.evaluate(tf.compat.v1.global_variables_initializer()) + weights = self.evaluate(weights) + # Check that the kernel weights got initialized to ones (from + # scope) + self.assertAllClose(weights[0], np.ones((3, 3, 3, 4, 32))) + # Check that the bias still got initialized to zeros. + self.assertAllClose(weights[1], np.zeros((4))) + + def testFunctionalConv3DTransposeNoReuse(self): + with tf.Graph().as_default(): + depth, height, width = 5, 7, 9 + volumes = tf.random.uniform((5, depth, height, width, 32), seed=1) + conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 2) + conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3]) + self.assertEqual(len(tf.compat.v1.trainable_variables()), 4) + + def testConstraints(self): + k_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + layer = conv_layers.Conv3DTranspose( + 2, 3, kernel_constraint=k_constraint, bias_constraint=b_constraint + ) + inputs = tf.random.uniform((5, 3, 3, 3, 5), seed=1) + layer(inputs) + self.assertEqual(layer.kernel_constraint, k_constraint) + self.assertEqual(layer.bias_constraint, b_constraint) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/legacy_tf_layers/core.py b/keras/legacy_tf_layers/core.py index f4af5cfdfb65..b4111dc91343 100644 --- a/keras/legacy_tf_layers/core.py +++ b/keras/legacy_tf_layers/core.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= -# pylint: disable=g-classes-have-attributes + """Contains the core layers: Dense, Dropout. Also contains their functional aliases. @@ -21,133 +21,139 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import warnings +import tensorflow.compat.v2 as tf + from keras import layers as keras_layers from keras.legacy_tf_layers import base + +# isort: off from tensorflow.python.util.tf_export import keras_export -from tensorflow.python.util.tf_export import tf_export -@keras_export(v1=['keras.__internal__.legacy.layers.Dense']) -@tf_export(v1=['layers.Dense']) +@keras_export(v1=["keras.__internal__.legacy.layers.Dense"]) class Dense(keras_layers.Dense, base.Layer): - """Densely-connected layer class. - - This layer implements the operation: - `outputs = activation(inputs * kernel + bias)` - Where `activation` is the activation function passed as the `activation` - argument (if not `None`), `kernel` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - Args: - units: Integer or Long, dimensionality of the output space. - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.compat.v1.get_variable`. - bias_initializer: Initializer function for the bias. - kernel_regularizer: Regularizer function for the weight matrix. 
- bias_regularizer: Regularizer function for the bias. - activity_regularizer: Regularizer function for the output. - kernel_constraint: An optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: String, the name of the layer. Layers with the same name will - share weights, but to avoid mistakes we require reuse=True in such cases. - _reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Properties: - units: Python integer, dimensionality of the output space. - activation: Activation function (callable). - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: Initializer instance (or name) for the kernel matrix. - bias_initializer: Initializer instance (or name) for the bias. - kernel_regularizer: Regularizer instance for the kernel matrix (callable) - bias_regularizer: Regularizer instance for the bias (callable). - activity_regularizer: Regularizer instance for the output (callable) - kernel_constraint: Constraint function for the kernel matrix. - bias_constraint: Constraint function for the bias. - kernel: Weight matrix (TensorFlow variable or tensor). - bias: Bias vector, if applicable (TensorFlow variable or tensor). - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Dense`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - dense = tf.compat.v1.layers.Dense(units=3) - ``` - - After: - - ```python - dense = tf.keras.layers.Dense(units=3) - ``` - - @end_compatibility - """ - - def __init__(self, units, - activation=None, - use_bias=True, - kernel_initializer=None, - bias_initializer=tf.compat.v1.zeros_initializer(), - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - trainable=True, - name=None, - **kwargs): - super().__init__(units=units, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.dense']) -@tf_export(v1=['layers.dense']) + """Densely-connected layer class. 
+ + This layer implements the operation: + `outputs = activation(inputs * kernel + bias)` + where `activation` is the activation function passed as the `activation` + argument (if not `None`), `kernel` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). + + Args: + units: Integer or Long, dimensionality of the output space. + activation: Activation function (callable). Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.compat.v1.get_variable`. + bias_initializer: Initializer function for the bias. + kernel_regularizer: Regularizer function for the weight matrix. + bias_regularizer: Regularizer function for the bias. + activity_regularizer: Regularizer function for the output. + kernel_constraint: An optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require reuse=True in such + cases. + _reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Properties: + units: Python integer, dimensionality of the output space. + activation: Activation function (callable). + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: Initializer instance (or name) for the kernel matrix. + bias_initializer: Initializer instance (or name) for the bias. + kernel_regularizer: Regularizer instance for the kernel matrix (callable). + bias_regularizer: Regularizer instance for the bias (callable). + activity_regularizer: Regularizer instance for the output (callable). + kernel_constraint: Constraint function for the kernel matrix. + bias_constraint: Constraint function for the bias. + kernel: Weight matrix (TensorFlow variable or tensor). + bias: Bias vector, if applicable (TensorFlow variable or tensor). + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Dense`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name.
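The decorator named in the compatibility note is what bridges these legacy layers into TF2. A minimal sketch of the pattern from the model-mapping guide, assuming the decorator is applied to a Keras layer's `call`; the class name `CompatDense` is illustrative:

```python
import tensorflow as tf

class CompatDense(tf.keras.layers.Layer):
    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def call(self, inputs):
        # The legacy layer creates its kernel/bias via get_variable; the
        # decorator captures them as this Keras layer's weights, so
        # repeated calls reuse the same variables.
        return tf.compat.v1.layers.dense(inputs, units=3, name="d")

layer = CompatDense()
layer(tf.ones([2, 4]))
print([v.shape.as_list() for v in layer.weights])  # e.g. [[4, 3], [3]]
```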
+ + Before: + + ```python + dense = tf.compat.v1.layers.Dense(units=3) + ``` + + After: + + ```python + dense = tf.keras.layers.Dense(units=3) + ``` + + @end_compatibility + """ + + def __init__( + self, + units, + activation=None, + use_bias=True, + kernel_initializer=None, + bias_initializer=tf.compat.v1.zeros_initializer(), + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + trainable=True, + name=None, + **kwargs + ): + super().__init__( + units=units, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.dense"]) def dense( - inputs, units, + inputs, + units, activation=None, use_bias=True, kernel_initializer=None, @@ -159,386 +165,389 @@ def dense( bias_constraint=None, trainable=True, name=None, - reuse=None): - """Functional interface for the densely-connected layer. - - This layer implements the operation: - `outputs = activation(inputs * kernel + bias)` - where `activation` is the activation function passed as the `activation` - argument (if not `None`), `kernel` is a weights matrix created by the layer, - and `bias` is a bias vector created by the layer - (only if `use_bias` is `True`). - - Args: - inputs: Tensor input. - units: Integer or Long, dimensionality of the output space. - activation: Activation function (callable). Set it to None to maintain a - linear activation. - use_bias: Boolean, whether the layer uses a bias. - kernel_initializer: Initializer function for the weight matrix. - If `None` (default), weights are initialized using the default - initializer used by `tf.compat.v1.get_variable`. - bias_initializer: Initializer function for the bias. - kernel_regularizer: Regularizer function for the weight matrix. - bias_regularizer: Regularizer function for the bias. - activity_regularizer: Regularizer function for the output. - kernel_constraint: An optional projection function to be applied to the - kernel after being updated by an `Optimizer` (e.g. used to implement - norm constraints or value constraints for layer weights). The function - must take as input the unprojected variable and must return the - projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. - bias_constraint: An optional projection function to be applied to the - bias after being updated by an `Optimizer`. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: String, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer - by the same name. - - Returns: - Output tensor the same shape as `inputs` except the last dimension is of - size `units`. - - Raises: - ValueError: if eager execution is enabled. 
- - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Dense`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.dense(x, units=3) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28,)) - y = tf.keras.layers.Dense(units=3)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - - """ - warnings.warn( - '`tf.layers.dense` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.Dense` instead.', - stacklevel=2) - layer = Dense(units, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - trainable=trainable, - name=name, - _scope=name, - _reuse=reuse) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.Dropout']) -@tf_export(v1=['layers.Dropout']) + reuse=None, +): + """Functional interface for the densely-connected layer. + + This layer implements the operation: + `outputs = activation(inputs * kernel + bias)` + where `activation` is the activation function passed as the `activation` + argument (if not `None`), `kernel` is a weights matrix created by the layer, + and `bias` is a bias vector created by the layer + (only if `use_bias` is `True`). + + Args: + inputs: Tensor input. + units: Integer or Long, dimensionality of the output space. + activation: Activation function (callable). Set it to None to maintain a + linear activation. + use_bias: Boolean, whether the layer uses a bias. + kernel_initializer: Initializer function for the weight matrix. + If `None` (default), weights are initialized using the default + initializer used by `tf.compat.v1.get_variable`. + bias_initializer: Initializer function for the bias. + kernel_regularizer: Regularizer function for the weight matrix. + bias_regularizer: Regularizer function for the bias. + activity_regularizer: Regularizer function for the output. + kernel_constraint: An optional projection function to be applied to the + kernel after being updated by an `Optimizer` (e.g. used to implement + norm constraints or value constraints for layer weights). The function + must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are + not safe to use when doing asynchronous distributed training. + bias_constraint: An optional projection function to be applied to the + bias after being updated by an `Optimizer`. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: String, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. 
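`reuse` only has meaning in graph mode, where a second call under the same name binds to the variables the first call created instead of raising. A graph-mode sketch of the sharing behaviour that the reuse tests in `core_test.py` below pin down:

```python
import tensorflow.compat.v1 as tf1

with tf1.Graph().as_default():
    x = tf1.random.uniform((5, 3))
    y1 = tf1.layers.dense(x, 2, name="shared")
    # Without reuse=True this second call would raise; with it, the
    # existing "shared/kernel" and "shared/bias" variables are reused.
    y2 = tf1.layers.dense(x, 2, name="shared", reuse=True)
    print(len(tf1.trainable_variables()))  # 2, not 4
```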
+ + Returns: + Output tensor the same shape as `inputs` except the last dimension is of + size `units`. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Dense`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.dense(x, units=3) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28,)) + y = tf.keras.layers.Dense(units=3)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + + """ + warnings.warn( + "`tf.layers.dense` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.Dense` instead.", + stacklevel=2, + ) + layer = Dense( + units, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + trainable=trainable, + name=name, + _scope=name, + _reuse=reuse, + ) + return layer(inputs) + + +@keras_export(v1=["keras.__internal__.legacy.layers.Dropout"]) class Dropout(keras_layers.Dropout, base.Layer): - """Applies Dropout to the input. - - Dropout consists in randomly setting a fraction `rate` of input units to 0 - at each update during training time, which helps prevent overfitting. - The units that are kept are scaled by `1 / (1 - rate)`, so that their - sum is unchanged at training time and inference time. - - Args: - rate: The dropout rate, between 0 and 1. E.g. `rate=0.1` would drop out - 10% of input units. - noise_shape: 1D tensor of type `int32` representing the shape of the - binary dropout mask that will be multiplied with the input. - For instance, if your inputs have shape - `(batch_size, timesteps, features)`, and you want the dropout mask - to be the same for all timesteps, you can use - `noise_shape=[batch_size, 1, features]`. - seed: A Python integer. Used to create random seeds. See - `tf.compat.v1.set_random_seed`. - for behavior. - name: The name of the layer (string). - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Dropout`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. 
- - Before: - - ```python - dropout = tf.compat.v1.layers.Dropout() - ``` - - After: - - ```python - dropout = tf.keras.layers.Dropout() - ``` - @end_compatibility - """ - - def __init__(self, rate=0.5, - noise_shape=None, - seed=None, - name=None, - **kwargs): - super().__init__(rate=rate, - noise_shape=noise_shape, - seed=seed, - name=name, - **kwargs) - - def call(self, inputs, training=False): - return super().call(inputs, training=training) - - -@keras_export(v1=['keras.__internal__.legacy.layers.dropout']) -@tf_export(v1=['layers.dropout']) -def dropout(inputs, - rate=0.5, - noise_shape=None, - seed=None, - training=False, - name=None): - """Applies Dropout to the input. - - Dropout consists in randomly setting a fraction `rate` of input units to 0 - at each update during training time, which helps prevent overfitting. - The units that are kept are scaled by `1 / (1 - rate)`, so that their - sum is unchanged at training time and inference time. - - Args: - inputs: Tensor input. - rate: The dropout rate, between 0 and 1. E.g. "rate=0.1" would drop out - 10% of input units. - noise_shape: 1D tensor of type `int32` representing the shape of the - binary dropout mask that will be multiplied with the input. - For instance, if your inputs have shape - `(batch_size, timesteps, features)`, and you want the dropout mask - to be the same for all timesteps, you can use - `noise_shape=[batch_size, 1, features]`. - seed: A Python integer. Used to create random seeds. See - `tf.compat.v1.set_random_seed` - for behavior. - training: Either a Python boolean, or a TensorFlow boolean scalar tensor - (e.g. a placeholder). Whether to return the output in training mode - (apply dropout) or in inference mode (return the input untouched). - name: The name of the layer (string). - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is `tf.keras.layers.Dropout`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.dropout(x) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.Dropout()(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.dropout` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.Dropout` instead.', - stacklevel=2) - layer = Dropout(rate, noise_shape=noise_shape, seed=seed, name=name) - return layer(inputs, training=training) - - -@keras_export(v1=['keras.__internal__.legacy.layers.Flatten']) -@tf_export(v1=['layers.Flatten']) + """Applies Dropout to the input. + + Dropout consists in randomly setting a fraction `rate` of input units to 0 + at each update during training time, which helps prevent overfitting. + The units that are kept are scaled by `1 / (1 - rate)`, so that their + sum is unchanged at training time and inference time. 
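The `1 / (1 - rate)` scaling is easy to see directly: in training mode the surviving entries of an all-ones input come back not as 1 but as `1 / (1 - rate)`. A small sketch with the TF2 layer:

```python
import numpy as np
import tensorflow as tf

x = tf.ones((4, 8))
y = tf.keras.layers.Dropout(rate=0.25)(x, training=True)
# Kept units are scaled by 1 / (1 - 0.25) = 4/3, so the expected value
# of every output entry still matches the input.
print(np.unique(np.round(y.numpy(), 4)))  # approximately [0. 1.3333]
```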
+ + Args: + rate: The dropout rate, between 0 and 1. E.g. `rate=0.1` would drop out + 10% of input units. + noise_shape: 1D tensor of type `int32` representing the shape of the + binary dropout mask that will be multiplied with the input. + For instance, if your inputs have shape + `(batch_size, timesteps, features)`, and you want the dropout mask + to be the same for all timesteps, you can use + `noise_shape=[batch_size, 1, features]`. + seed: A Python integer. Used to create random seeds. See + `tf.compat.v1.set_random_seed` + for behavior. + name: The name of the layer (string). + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is `tf.keras.layers.Dropout`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + dropout = tf.compat.v1.layers.Dropout() + ``` + + After: + + ```python + dropout = tf.keras.layers.Dropout() + ``` + @end_compatibility + """ + + def __init__( + self, rate=0.5, noise_shape=None, seed=None, name=None, **kwargs + ): + # Force the rng type to be legacy stateful since the new stateful code + # path is not supported by the legacy layer. + super().__init__( + rate=rate, + noise_shape=noise_shape, + seed=seed, + name=name, + rng_type="legacy_stateful", + **kwargs + ) + + def call(self, inputs, training=False): + return super().call(inputs, training=training) + + +@keras_export(v1=["keras.__internal__.legacy.layers.dropout"]) +def dropout( + inputs, rate=0.5, noise_shape=None, seed=None, training=False, name=None +): + """Applies Dropout to the input. + + Dropout consists in randomly setting a fraction `rate` of input units to 0 + at each update during training time, which helps prevent overfitting. + The units that are kept are scaled by `1 / (1 - rate)`, so that their + sum is unchanged at training time and inference time. + + Args: + inputs: Tensor input. + rate: The dropout rate, between 0 and 1. E.g. `rate=0.1` would drop out + 10% of input units. + noise_shape: 1D tensor of type `int32` representing the shape of the + binary dropout mask that will be multiplied with the input. + For instance, if your inputs have shape + `(batch_size, timesteps, features)`, and you want the dropout mask + to be the same for all timesteps, you can use + `noise_shape=[batch_size, 1, features]`. + seed: A Python integer. Used to create random seeds. See + `tf.compat.v1.set_random_seed` + for behavior. + training: Either a Python boolean, or a TensorFlow boolean scalar tensor + (e.g. a placeholder). Whether to return the output in training mode + (apply dropout) or in inference mode (return the input untouched). + name: The name of the layer (string). + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras.
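The `noise_shape` broadcast described in the Args is what `testCustomNoiseShape` and `testDynamicNoiseShape` further down assert. A sketch with the TF2 layer, using a size-1 timestep axis so one mask is shared across all timesteps:

```python
import tensorflow as tf

inputs = tf.ones((5, 3, 2))  # (batch, timesteps, features)
layer = tf.keras.layers.Dropout(0.5, noise_shape=[5, 1, 2], seed=1)
out = layer(inputs, training=True)
# The mask is drawn once per (batch, feature) pair and broadcast over
# the timestep axis, so all timesteps share the same drop pattern.
print(bool(tf.reduce_all(out[:, 0, :] == out[:, 1, :])))  # True
```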
+ + The corresponding TensorFlow v2 layer is `tf.keras.layers.Dropout`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.dropout(x) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.Dropout()(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.dropout` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.Dropout` instead.", + stacklevel=2, + ) + layer = Dropout(rate, noise_shape=noise_shape, seed=seed, name=name) + return layer(inputs, training=training) + + +@keras_export(v1=["keras.__internal__.legacy.layers.Flatten"]) class Flatten(keras_layers.Flatten, base.Layer): - """Flattens an input tensor while preserving the batch axis (axis 0). + """Flattens an input tensor while preserving the batch axis (axis 0). + + Args: + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, ..., channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, ...)`. - Args: - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, ..., channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, ...)`. + Examples: - Examples: + ``` + x = tf.compat.v1.placeholder(shape=(None, 4, 4), dtype='float32') + y = Flatten()(x) + # now `y` has shape `(None, 16)` - ``` - x = tf.compat.v1.placeholder(shape=(None, 4, 4), dtype='float32') - y = Flatten()(x) - # now `y` has shape `(None, 16)` + x = tf.compat.v1.placeholder(shape=(None, 3, None), dtype='float32') + y = Flatten()(x) + # now `y` has shape `(None, None)` + ``` - x = tf.compat.v1.placeholder(shape=(None, 3, None), dtype='float32') - y = Flatten()(x) - # now `y` has shape `(None, None)` - ``` + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. + The corresponding TensorFlow v2 layer is `tf.keras.layers.Flatten`. - The corresponding TensorFlow v2 layer is `tf.keras.layers.Flatten`. + #### Structural Mapping to Native TF2 - #### Structural Mapping to Native TF2 + None of the supported arguments have changed name. - None of the supported arguments have changed name. 
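Flatten keeps axis 0 and multiplies the remaining dimensions together, so any unknown non-batch dimension makes the flattened size unknown as well. The docstring's placeholder examples translate to TF2 as:

```python
import tensorflow as tf

flatten = tf.keras.layers.Flatten()
print(flatten(tf.zeros((2, 4, 4))).shape)  # (2, 16)

# With an unknown non-batch dimension the static result is (None, None),
# matching the second placeholder example above.
print(flatten.compute_output_shape((None, 3, None)))  # (None, None)
```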
+ Before: - Before: + ```python + flatten = tf.compat.v1.layers.Flatten() + ``` - ```python - flatten = tf.compat.v1.layers.Flatten() - ``` + After: - After: + ```python + flatten = tf.keras.layers.Flatten() + ``` + @end_compatibility + """ - ```python - flatten = tf.keras.layers.Flatten() - ``` - @end_compatibility - """ - pass + pass -@keras_export(v1=['keras.__internal__.legacy.layers.flatten']) -@tf_export(v1=['layers.flatten']) -def flatten(inputs, name=None, data_format='channels_last'): - """Flattens an input tensor while preserving the batch axis (axis 0). +@keras_export(v1=["keras.__internal__.legacy.layers.flatten"]) +def flatten(inputs, name=None, data_format="channels_last"): + """Flattens an input tensor while preserving the batch axis (axis 0). - Args: - inputs: Tensor input. - name: The name of the layer (string). - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. + Args: + inputs: Tensor input. + name: The name of the layer (string). + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. - Returns: - Reshaped tensor. + Returns: + Reshaped tensor. - Examples: + Examples: - ``` - x = tf.compat.v1.placeholder(shape=(None, 4, 4), dtype='float32') - y = flatten(x) - # now `y` has shape `(None, 16)` + ``` + x = tf.compat.v1.placeholder(shape=(None, 4, 4), dtype='float32') + y = flatten(x) + # now `y` has shape `(None, 16)` - x = tf.compat.v1.placeholder(shape=(None, 3, None), dtype='float32') - y = flatten(x) - # now `y` has shape `(None, None)` - ``` + x = tf.compat.v1.placeholder(shape=(None, 3, None), dtype='float32') + y = flatten(x) + # now `y` has shape `(None, None)` + ``` - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. - The corresponding TensorFlow v2 layer is `tf.keras.layers.Flatten`. + The corresponding TensorFlow v2 layer is `tf.keras.layers.Flatten`. - #### Structural Mapping to Native TF2 + #### Structural Mapping to Native TF2 - None of the supported arguments have changed name. + None of the supported arguments have changed name. 
- Before: + Before: - ```python - y = tf.compat.v1.layers.flatten(x) - ``` + ```python + y = tf.compat.v1.layers.flatten(x) + ``` - After: + After: - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.Flatten()(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.flatten` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.Flatten` instead.', - stacklevel=2) - layer = Flatten(name=name, data_format=data_format) - return layer(inputs) + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.Flatten()(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.flatten` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.Flatten` instead.", + stacklevel=2, + ) + layer = Flatten(name=name, data_format=data_format) + return layer(inputs) # Aliases diff --git a/keras/legacy_tf_layers/core_test.py b/keras/legacy_tf_layers/core_test.py index e945a89d1939..558aa823d4b4 100644 --- a/keras/legacy_tf_layers/core_test.py +++ b/keras/legacy_tf_layers/core_test.py @@ -18,548 +18,636 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import collections import platform -from absl.testing import parameterized import numpy as np -from tensorflow.python.framework import test_util as tf_test_utils -from keras.testing_infra import test_combinations +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras.legacy_tf_layers import core as core_layers +from keras.testing_infra import test_combinations + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) from tensorflow.python.ops import variable_scope class DenseTest(tf.test.TestCase, parameterized.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDenseProperties(self): + dense = core_layers.Dense(2, activation=tf.nn.relu, name="my_dense") + self.assertEqual(dense.units, 2) + self.assertEqual(dense.activation, tf.nn.relu) + self.assertEqual(dense.kernel_regularizer, None) + self.assertEqual(dense.bias_regularizer, None) + self.assertEqual(dense.activity_regularizer, None) + self.assertEqual(dense.use_bias, True) + + # Test auto-naming + dense = core_layers.Dense(2, activation=tf.nn.relu) + dense(tf.random.uniform((5, 2))) + self.assertEqual(dense.name, "dense_1") + dense = core_layers.Dense(2, activation=tf.nn.relu) + dense(tf.random.uniform((5, 2))) + self.assertEqual(dense.name, "dense_2") + + @tf_test_utils.run_deprecated_v1 + def testVariableInput(self): + with self.cached_session(): + v = tf.compat.v1.get_variable( + "X", initializer=tf.compat.v1.zeros_initializer(), shape=(1, 1) + ) + x = core_layers.Dense(1)(v) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllEqual(x, [[0.0]]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testCall(self): + dense = core_layers.Dense(2, activation=tf.nn.relu, name="my_dense") + inputs = tf.random.uniform((5, 4), seed=1) + outputs = dense(inputs) + self.assertListEqual([5, 2], outputs.get_shape().as_list()) + 
self.assertListEqual(dense.variables, [dense.kernel, dense.bias]) + self.assertListEqual( + dense.trainable_variables, [dense.kernel, dense.bias] + ) + self.assertListEqual(dense.non_trainable_variables, []) + if not tf.executing_eagerly(): + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ) + ), + 2, + ) + self.assertEqual(dense.kernel.name, "my_dense/kernel:0") + self.assertEqual(dense.bias.name, "my_dense/bias:0") + + @tf_test_utils.assert_no_new_pyobjects_executing_eagerly + def testNoEagerLeak(self): + # Tests that repeatedly constructing and building a Layer does not leak + # Python objects. + inputs = tf.random.uniform((5, 4), seed=1) + core_layers.Dense(5)(inputs) + core_layers.Dense(2, activation=tf.nn.relu, name="my_dense")(inputs) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testCallTensorDot(self): + dense = core_layers.Dense(2, activation=tf.nn.relu, name="my_dense") + inputs = tf.random.uniform((5, 4, 3), seed=1) + outputs = dense(inputs) + self.assertListEqual([5, 4, 2], outputs.get_shape().as_list()) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoBias(self): + dense = core_layers.Dense(2, use_bias=False, name="my_dense") + inputs = tf.random.uniform((5, 2), seed=1) + _ = dense(inputs) + self.assertListEqual(dense.variables, [dense.kernel]) + self.assertListEqual(dense.trainable_variables, [dense.kernel]) + self.assertListEqual(dense.non_trainable_variables, []) + if not tf.executing_eagerly(): + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ) + ), + 1, + ) + self.assertEqual(dense.kernel.name, "my_dense/kernel:0") + self.assertEqual(dense.bias, None) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNonTrainable(self): + dense = core_layers.Dense(2, trainable=False, name="my_dense") + inputs = tf.random.uniform((5, 2), seed=1) + _ = dense(inputs) + self.assertListEqual(dense.variables, [dense.kernel, dense.bias]) + self.assertListEqual( + dense.non_trainable_variables, [dense.kernel, dense.bias] + ) + self.assertListEqual(dense.trainable_variables, []) + if not tf.executing_eagerly(): + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ) + ), + 0, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testOutputShape(self): + dense = core_layers.Dense(7, activation=tf.nn.relu, name="my_dense") + inputs = tf.random.uniform((5, 3), seed=1) + outputs = dense(inputs) + self.assertEqual(outputs.get_shape().as_list(), [5, 7]) + + inputs = tf.random.uniform((5, 2, 3), seed=1) + outputs = dense(inputs) + self.assertEqual(outputs.get_shape().as_list(), [5, 2, 7]) + + inputs = tf.random.uniform((1, 2, 4, 3), seed=1) + outputs = dense(inputs) + self.assertEqual(outputs.get_shape().as_list(), [1, 2, 4, 7]) + + @tf_test_utils.run_deprecated_v1 + def testCallOnPlaceHolder(self): + inputs = tf.compat.v1.placeholder(dtype=tf.float32) + dense = core_layers.Dense(4, name="my_dense") + with self.assertRaises(ValueError): + dense(inputs) + + inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, None]) + dense = core_layers.Dense(4, name="my_dense") + with self.assertRaises(ValueError): + dense(inputs) + + inputs = tf.compat.v1.placeholder( + dtype=tf.float32, shape=[None, None, None] + ) + dense = core_layers.Dense(4, 
name="my_dense") + with self.assertRaises(ValueError): + dense(inputs) + + inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 3]) + dense = core_layers.Dense(4, name="my_dense") + dense(inputs) + + inputs = tf.compat.v1.placeholder( + dtype=tf.float32, shape=[None, None, 3] + ) + dense = core_layers.Dense(4, name="my_dense") + dense(inputs) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testActivation(self): + dense = core_layers.Dense(2, activation=tf.nn.relu, name="dense1") + inputs = tf.random.uniform((5, 3), seed=1) + outputs = dense(inputs) + if not tf.executing_eagerly(): + self.assertEqual(outputs.op.name, "dense1/Relu") - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testDenseProperties(self): - dense = core_layers.Dense(2, activation=tf.nn.relu, name='my_dense') - self.assertEqual(dense.units, 2) - self.assertEqual(dense.activation, tf.nn.relu) - self.assertEqual(dense.kernel_regularizer, None) - self.assertEqual(dense.bias_regularizer, None) - self.assertEqual(dense.activity_regularizer, None) - self.assertEqual(dense.use_bias, True) - - # Test auto-naming - dense = core_layers.Dense(2, activation=tf.nn.relu) - dense(tf.random.uniform((5, 2))) - self.assertEqual(dense.name, 'dense_1') - dense = core_layers.Dense(2, activation=tf.nn.relu) - dense(tf.random.uniform((5, 2))) - self.assertEqual(dense.name, 'dense_2') - - @tf_test_utils.run_deprecated_v1 - def testVariableInput(self): - with self.cached_session(): - v = tf.compat.v1.get_variable( - 'X', initializer=tf.compat.v1.zeros_initializer(), shape=(1, 1)) - x = core_layers.Dense(1)(v) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllEqual(x, [[0.0]]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testCall(self): - dense = core_layers.Dense(2, activation=tf.nn.relu, name='my_dense') - inputs = tf.random.uniform((5, 4), seed=1) - outputs = dense(inputs) - self.assertListEqual([5, 2], outputs.get_shape().as_list()) - self.assertListEqual(dense.variables, [dense.kernel, dense.bias]) - self.assertListEqual(dense.trainable_variables, - [dense.kernel, dense.bias]) - self.assertListEqual(dense.non_trainable_variables, []) - if not tf.executing_eagerly(): - self.assertEqual( - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)), 2) - self.assertEqual(dense.kernel.name, 'my_dense/kernel:0') - self.assertEqual(dense.bias.name, 'my_dense/bias:0') - - @tf_test_utils.assert_no_new_pyobjects_executing_eagerly - def testNoEagerLeak(self): - # Tests that repeatedly constructing and building a Layer does not leak - # Python objects. 
- inputs = tf.random.uniform((5, 4), seed=1) - core_layers.Dense(5)(inputs) - core_layers.Dense(2, activation=tf.nn.relu, name='my_dense')(inputs) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testCallTensorDot(self): - dense = core_layers.Dense(2, activation=tf.nn.relu, name='my_dense') - inputs = tf.random.uniform((5, 4, 3), seed=1) - outputs = dense(inputs) - self.assertListEqual([5, 4, 2], outputs.get_shape().as_list()) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoBias(self): - dense = core_layers.Dense(2, use_bias=False, name='my_dense') - inputs = tf.random.uniform((5, 2), seed=1) - _ = dense(inputs) - self.assertListEqual(dense.variables, [dense.kernel]) - self.assertListEqual(dense.trainable_variables, [dense.kernel]) - self.assertListEqual(dense.non_trainable_variables, []) - if not tf.executing_eagerly(): - self.assertEqual( - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)), 1) - self.assertEqual(dense.kernel.name, 'my_dense/kernel:0') - self.assertEqual(dense.bias, None) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNonTrainable(self): - dense = core_layers.Dense(2, trainable=False, name='my_dense') - inputs = tf.random.uniform((5, 2), seed=1) - _ = dense(inputs) - self.assertListEqual(dense.variables, [dense.kernel, dense.bias]) - self.assertListEqual(dense.non_trainable_variables, - [dense.kernel, dense.bias]) - self.assertListEqual(dense.trainable_variables, []) - if not tf.executing_eagerly(): - self.assertEqual( - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)), 0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testOutputShape(self): - dense = core_layers.Dense(7, activation=tf.nn.relu, name='my_dense') - inputs = tf.random.uniform((5, 3), seed=1) - outputs = dense(inputs) - self.assertEqual(outputs.get_shape().as_list(), [5, 7]) - - inputs = tf.random.uniform((5, 2, 3), seed=1) - outputs = dense(inputs) - self.assertEqual(outputs.get_shape().as_list(), [5, 2, 7]) - - inputs = tf.random.uniform((1, 2, 4, 3), seed=1) - outputs = dense(inputs) - self.assertEqual(outputs.get_shape().as_list(), [1, 2, 4, 7]) - - @tf_test_utils.run_deprecated_v1 - def testCallOnPlaceHolder(self): - inputs = tf.compat.v1.placeholder(dtype=tf.float32) - dense = core_layers.Dense(4, name='my_dense') - with self.assertRaises(ValueError): - dense(inputs) - - inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, None]) - dense = core_layers.Dense(4, name='my_dense') - with self.assertRaises(ValueError): - dense(inputs) - - inputs = tf.compat.v1.placeholder( - dtype=tf.float32, shape=[None, None, None]) - dense = core_layers.Dense(4, name='my_dense') - with self.assertRaises(ValueError): - dense(inputs) - - inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 3]) - dense = core_layers.Dense(4, name='my_dense') - dense(inputs) - - inputs = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, None, 3]) - dense = core_layers.Dense(4, name='my_dense') - dense(inputs) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testActivation(self): - dense = core_layers.Dense(2, activation=tf.nn.relu, name='dense1') - inputs = tf.random.uniform((5, 3), seed=1) - outputs = dense(inputs) - if not tf.executing_eagerly(): - self.assertEqual(outputs.op.name, 'dense1/Relu') - - dense = 
core_layers.Dense(2, name='dense2') - inputs = tf.random.uniform((5, 3), seed=1) - outputs = dense(inputs) - if not tf.executing_eagerly(): - self.assertEqual(outputs.op.name, 'dense2/BiasAdd') - - @tf_test_utils.run_deprecated_v1 - def testActivityRegularizer(self): - regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - dense = core_layers.Dense( - 2, name='my_dense', activity_regularizer=regularizer) - inputs = tf.random.uniform((5, 3), seed=1) - _ = dense(inputs) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.assertListEqual(dense.losses, loss_keys) - - @tf_test_utils.run_deprecated_v1 - def testKernelRegularizer(self): - regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - dense = core_layers.Dense( - 2, name='my_dense', kernel_regularizer=regularizer) - inputs = tf.random.uniform((5, 3), seed=1) - _ = dense(inputs) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in dense.variables]) - self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys)) - - @tf_test_utils.run_deprecated_v1 - def testKernelRegularizerWithReuse(self): - regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - inputs = tf.random.uniform((5, 3), seed=1) - _ = core_layers.dense( - inputs, 2, name='my_dense', kernel_regularizer=regularizer) - self.assertEqual( - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)), 1) - _ = core_layers.dense( - inputs, 2, name='my_dense', kernel_regularizer=regularizer, reuse=True) - self.assertEqual( - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)), 1) - - @tf_test_utils.run_deprecated_v1 - def testBiasRegularizer(self): - regularizer = lambda x: tf.reduce_sum(x) * 1e-3 - dense = core_layers.Dense(2, name='my_dense', bias_regularizer=regularizer) - inputs = tf.random.uniform((5, 3), seed=1) - _ = dense(inputs) - loss_keys = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(loss_keys), 1) - self.evaluate([v.initializer for v in dense.variables]) - self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys)) - - @tf_test_utils.run_deprecated_v1 - def testFunctionalDense(self): - with self.cached_session(): - inputs = tf.random.uniform((5, 3), seed=1) - outputs = core_layers.dense( - inputs, 2, activation=tf.nn.relu, name='my_dense') - self.assertEqual( - len(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)), 2) - self.assertEqual(outputs.op.name, 'my_dense/Relu') - - @tf_test_utils.run_deprecated_v1 - def testFunctionalDenseTwice(self): - inputs = tf.random.uniform((5, 3), seed=1) - core_layers.dense(inputs, 2) - vars1 = _get_variable_dict_from_varstore().values() - core_layers.dense(inputs, 2) - vars2 = _get_variable_dict_from_varstore().values() - self.assertEqual(len(vars1), 2) - self.assertEqual(len(vars2), 4) - - # TODO(alive): get this to work in eager mode. - def testFunctionalDenseTwiceReuse(self): - with self.cached_session(): - inputs = tf.random.uniform((5, 3), seed=1) - core_layers.dense(inputs, 2, name='my_dense') - vars1 = tf.compat.v1.trainable_variables() - core_layers.dense(inputs, 2, name='my_dense', reuse=True) - vars2 = tf.compat.v1.trainable_variables() - self.assertEqual(vars1, vars2) - - # TODO(alive): get this to work in eager mode. 
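The shape checks in `testCallTensorDot` and `testOutputShape` above all follow from one rule: for inputs of rank greater than 2, `Dense` contracts only the last axis (a tensordot), so `output_shape = input_shape[:-1] + [units]`. A sketch of that rule with the TF2 layer:

```python
import tensorflow as tf

dense = tf.keras.layers.Dense(7)
for shape in [(5, 3), (5, 2, 3), (1, 2, 4, 3)]:
    out = dense(tf.zeros(shape))
    # Only the trailing axis (size 3) is replaced by units=7; the same
    # (3, 7) kernel is reused for every input rank.
    assert out.shape.as_list() == list(shape)[:-1] + [7]
```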
- def testFunctionalDenseTwiceReuseFromScope(self): - with self.cached_session(): - with tf.compat.v1.variable_scope('scope'): + dense = core_layers.Dense(2, name="dense2") + inputs = tf.random.uniform((5, 3), seed=1) + outputs = dense(inputs) + if not tf.executing_eagerly(): + self.assertEqual(outputs.op.name, "dense2/BiasAdd") + + @tf_test_utils.run_deprecated_v1 + def testActivityRegularizer(self): + regularizer = lambda x: tf.reduce_sum(x) * 1e-3 + dense = core_layers.Dense( + 2, name="my_dense", activity_regularizer=regularizer + ) + inputs = tf.random.uniform((5, 3), seed=1) + _ = dense(inputs) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(dense.losses, loss_keys) + + @tf_test_utils.run_deprecated_v1 + def testKernelRegularizer(self): + regularizer = lambda x: tf.reduce_sum(x) * 1e-3 + dense = core_layers.Dense( + 2, name="my_dense", kernel_regularizer=regularizer + ) inputs = tf.random.uniform((5, 3), seed=1) - core_layers.dense(inputs, 2, name='my_dense') - vars1 = tf.compat.v1.trainable_variables() - with tf.compat.v1.variable_scope('scope', reuse=True): - core_layers.dense(inputs, 2, name='my_dense') - vars2 = tf.compat.v1.trainable_variables() - self.assertEqual(vars1, vars2) - - @tf_test_utils.run_deprecated_v1 - def testFunctionalDenseInitializerFromScope(self): - with tf.compat.v1.variable_scope( - 'scope', - initializer=tf.compat.v1.ones_initializer()), self.cached_session(): - inputs = tf.random.uniform((5, 3), seed=1) - core_layers.dense(inputs, 2) - self.evaluate(tf.compat.v1.global_variables_initializer()) - weights = _get_variable_dict_from_varstore() - self.assertEqual(len(weights), 2) - # Check that the matrix weights got initialized to ones (from scope). - self.assertAllClose(weights['scope/dense/kernel'].read_value(), - np.ones((3, 2))) - # Check that the bias still got initialized to zeros. 
- self.assertAllClose(weights['scope/dense/bias'].read_value(), np.zeros( - (2))) - - def testFunctionalDenseWithCustomGetter(self): - called = [0] - - def custom_getter(getter, *args, **kwargs): - called[0] += 1 - return getter(*args, **kwargs) - - with tf.compat.v1.variable_scope('test', custom_getter=custom_getter): - inputs = tf.random.uniform((5, 3), seed=1) - core_layers.dense(inputs, 2) - self.assertEqual(called[0], 2) - - @tf_test_utils.run_deprecated_v1 - def testFunctionalDenseInScope(self): - with self.cached_session(): - with tf.compat.v1.variable_scope('test'): + _ = dense(inputs) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in dense.variables]) + self.assertAllEqual( + self.evaluate(dense.losses), self.evaluate(loss_keys) + ) + + @tf_test_utils.run_deprecated_v1 + def testKernelRegularizerWithReuse(self): + regularizer = lambda x: tf.reduce_sum(x) * 1e-3 inputs = tf.random.uniform((5, 3), seed=1) - core_layers.dense(inputs, 2, name='my_dense') - var_dict = _get_variable_dict_from_varstore() - var_key = 'test/my_dense/kernel' - self.assertEqual(var_dict[var_key].name, '%s:0' % var_key) - with tf.compat.v1.variable_scope('test1') as scope: + _ = core_layers.dense( + inputs, 2, name="my_dense", kernel_regularizer=regularizer + ) + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + ), + 1, + ) + _ = core_layers.dense( + inputs, + 2, + name="my_dense", + kernel_regularizer=regularizer, + reuse=True, + ) + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + ), + 1, + ) + + @tf_test_utils.run_deprecated_v1 + def testBiasRegularizer(self): + regularizer = lambda x: tf.reduce_sum(x) * 1e-3 + dense = core_layers.Dense( + 2, name="my_dense", bias_regularizer=regularizer + ) inputs = tf.random.uniform((5, 3), seed=1) - core_layers.dense(inputs, 2, name=scope) - var_dict = _get_variable_dict_from_varstore() - var_key = 'test1/kernel' - self.assertEqual(var_dict[var_key].name, '%s:0' % var_key) - with tf.compat.v1.variable_scope('test2'): + _ = dense(inputs) + loss_keys = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES + ) + self.assertEqual(len(loss_keys), 1) + self.evaluate([v.initializer for v in dense.variables]) + self.assertAllEqual( + self.evaluate(dense.losses), self.evaluate(loss_keys) + ) + + @tf_test_utils.run_deprecated_v1 + def testFunctionalDense(self): + with self.cached_session(): + inputs = tf.random.uniform((5, 3), seed=1) + outputs = core_layers.dense( + inputs, 2, activation=tf.nn.relu, name="my_dense" + ) + self.assertEqual( + len( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ) + ), + 2, + ) + self.assertEqual(outputs.op.name, "my_dense/Relu") + + @tf_test_utils.run_deprecated_v1 + def testFunctionalDenseTwice(self): inputs = tf.random.uniform((5, 3), seed=1) core_layers.dense(inputs, 2) - var_dict = _get_variable_dict_from_varstore() - var_key = 'test2/dense/kernel' - self.assertEqual(var_dict[var_key].name, '%s:0' % var_key) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testComputeOutputShape(self): - dense = core_layers.Dense(2, activation=tf.nn.relu, name='dense1') - ts = tf.TensorShape - # pylint: disable=protected-access - with self.assertRaises(ValueError): - dense.compute_output_shape(ts(None)) - with 
self.assertRaises(ValueError): - dense.compute_output_shape(ts([])) - with self.assertRaises(ValueError): - dense.compute_output_shape(ts([1])) - self.assertEqual( - [None, 2], - dense.compute_output_shape((None, 3)).as_list()) - self.assertEqual( - [None, 2], - dense.compute_output_shape(ts([None, 3])).as_list()) - self.assertEqual( - [None, 4, 2], - dense.compute_output_shape(ts([None, 4, 3])).as_list()) - # pylint: enable=protected-access - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testConstraints(self): - k_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - dense = core_layers.Dense(2, - kernel_constraint=k_constraint, - bias_constraint=b_constraint) - inputs = tf.random.uniform((5, 3), seed=1) - dense(inputs) - self.assertEqual(dense.kernel_constraint, k_constraint) - self.assertEqual(dense.bias_constraint, b_constraint) + vars1 = _get_variable_dict_from_varstore().values() + core_layers.dense(inputs, 2) + vars2 = _get_variable_dict_from_varstore().values() + self.assertEqual(len(vars1), 2) + self.assertEqual(len(vars2), 4) + + # TODO(alive): get this to work in eager mode. + def testFunctionalDenseTwiceReuse(self): + with self.cached_session(): + inputs = tf.random.uniform((5, 3), seed=1) + core_layers.dense(inputs, 2, name="my_dense") + vars1 = tf.compat.v1.trainable_variables() + core_layers.dense(inputs, 2, name="my_dense", reuse=True) + vars2 = tf.compat.v1.trainable_variables() + self.assertEqual(vars1, vars2) + + # TODO(alive): get this to work in eager mode. + def testFunctionalDenseTwiceReuseFromScope(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("scope"): + inputs = tf.random.uniform((5, 3), seed=1) + core_layers.dense(inputs, 2, name="my_dense") + vars1 = tf.compat.v1.trainable_variables() + with tf.compat.v1.variable_scope("scope", reuse=True): + core_layers.dense(inputs, 2, name="my_dense") + vars2 = tf.compat.v1.trainable_variables() + self.assertEqual(vars1, vars2) + + @tf_test_utils.run_deprecated_v1 + def testFunctionalDenseInitializerFromScope(self): + with tf.compat.v1.variable_scope( + "scope", initializer=tf.compat.v1.ones_initializer() + ), self.cached_session(): + inputs = tf.random.uniform((5, 3), seed=1) + core_layers.dense(inputs, 2) + self.evaluate(tf.compat.v1.global_variables_initializer()) + weights = _get_variable_dict_from_varstore() + self.assertEqual(len(weights), 2) + # Check that the matrix weights got initialized to ones (from + # scope). + self.assertAllClose( + weights["scope/dense/kernel"].read_value(), np.ones((3, 2)) + ) + # Check that the bias still got initialized to zeros. 
+ self.assertAllClose( + weights["scope/dense/bias"].read_value(), np.zeros((2)) + ) + + def testFunctionalDenseWithCustomGetter(self): + called = [0] + + def custom_getter(getter, *args, **kwargs): + called[0] += 1 + return getter(*args, **kwargs) + + with tf.compat.v1.variable_scope("test", custom_getter=custom_getter): + inputs = tf.random.uniform((5, 3), seed=1) + core_layers.dense(inputs, 2) + self.assertEqual(called[0], 2) + + @tf_test_utils.run_deprecated_v1 + def testFunctionalDenseInScope(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("test"): + inputs = tf.random.uniform((5, 3), seed=1) + core_layers.dense(inputs, 2, name="my_dense") + var_dict = _get_variable_dict_from_varstore() + var_key = "test/my_dense/kernel" + self.assertEqual(var_dict[var_key].name, f"{var_key}:0") + with tf.compat.v1.variable_scope("test1") as scope: + inputs = tf.random.uniform((5, 3), seed=1) + core_layers.dense(inputs, 2, name=scope) + var_dict = _get_variable_dict_from_varstore() + var_key = "test1/kernel" + self.assertEqual(var_dict[var_key].name, f"{var_key}:0") + with tf.compat.v1.variable_scope("test2"): + inputs = tf.random.uniform((5, 3), seed=1) + core_layers.dense(inputs, 2) + var_dict = _get_variable_dict_from_varstore() + var_key = "test2/dense/kernel" + self.assertEqual(var_dict[var_key].name, f"{var_key}:0") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testComputeOutputShape(self): + dense = core_layers.Dense(2, activation=tf.nn.relu, name="dense1") + ts = tf.TensorShape + + with self.assertRaises(ValueError): + dense.compute_output_shape(ts(None)) + with self.assertRaises(ValueError): + dense.compute_output_shape(ts([])) + with self.assertRaises(ValueError): + dense.compute_output_shape(ts([1])) + self.assertEqual( + [None, 2], dense.compute_output_shape((None, 3)).as_list() + ) + self.assertEqual( + [None, 2], dense.compute_output_shape(ts([None, 3])).as_list() + ) + self.assertEqual( + [None, 4, 2], dense.compute_output_shape(ts([None, 4, 3])).as_list() + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testConstraints(self): + k_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + dense = core_layers.Dense( + 2, kernel_constraint=k_constraint, bias_constraint=b_constraint + ) + inputs = tf.random.uniform((5, 3), seed=1) + dense(inputs) + self.assertEqual(dense.kernel_constraint, k_constraint) + self.assertEqual(dense.bias_constraint, b_constraint) def _get_variable_dict_from_varstore(): - var_dict = variable_scope._get_default_variable_store()._vars # pylint: disable=protected-access - sorted_var_dict = collections.OrderedDict( - sorted(var_dict.items(), key=lambda t: t[0])) - return sorted_var_dict + var_dict = variable_scope._get_default_variable_store()._vars + sorted_var_dict = collections.OrderedDict( + sorted(var_dict.items(), key=lambda t: t[0]) + ) + return sorted_var_dict class DropoutTest(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testDropoutProperties(self): - dp = core_layers.Dropout(0.5, name='dropout') - self.assertEqual(dp.rate, 0.5) - self.assertEqual(dp.noise_shape, None) - dp(tf.ones(())) - self.assertEqual(dp.name, 'dropout') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testBooleanLearningPhase(self): - dp = core_layers.Dropout(0.5) - inputs = tf.ones((5, 3)) - 
dropped = dp(inputs, training=True) - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_output = self.evaluate(dropped) - self.assertAlmostEqual(0., np_output.min()) - dropped = dp(inputs, training=False) - np_output = self.evaluate(dropped) - self.assertAllClose(np.ones((5, 3)), np_output) - - @tf_test_utils.run_deprecated_v1 - def testDynamicLearningPhase(self): - with self.cached_session() as sess: - dp = core_layers.Dropout(0.5, seed=1) - inputs = tf.ones((5, 5)) - training = tf.compat.v1.placeholder(dtype='bool') - dropped = dp(inputs, training=training) - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_output = sess.run(dropped, feed_dict={training: True}) - self.assertAlmostEqual(0., np_output.min()) - np_output = sess.run(dropped, feed_dict={training: False}) - self.assertAllClose(np.ones((5, 5)), np_output) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testDynamicNoiseShape(self): - inputs = tf.ones((5, 3, 2)) - noise_shape = [None, 1, None] - dp = core_layers.Dropout(0.5, noise_shape=noise_shape, seed=1) - dropped = dp(inputs, training=True) - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_output = self.evaluate(dropped) - self.assertAlmostEqual(0., np_output.min()) - self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :]) - - def testCustomNoiseShape(self): - inputs = tf.ones((5, 3, 2)) - noise_shape = [5, 1, 2] - dp = core_layers.Dropout(0.5, noise_shape=noise_shape, seed=1) - dropped = dp(inputs, training=True) - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_output = self.evaluate(dropped) - self.assertAlmostEqual(0., np_output.min()) - self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :]) - - @tf_test_utils.run_deprecated_v1 - def testFunctionalDropout(self): - with self.cached_session(): - inputs = tf.ones((5, 5)) - dropped = core_layers.dropout(inputs, 0.5, training=True, seed=1) - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_output = self.evaluate(dropped) - self.assertAlmostEqual(0., np_output.min()) - dropped = core_layers.dropout(inputs, 0.5, training=False, seed=1) - np_output = self.evaluate(dropped) - self.assertAllClose(np.ones((5, 5)), np_output) - - @tf_test_utils.run_deprecated_v1 - def testDynamicRate(self): - with self.cached_session() as sess: - rate = tf.compat.v1.placeholder(dtype='float32', name='rate') - dp = core_layers.Dropout(rate, name='dropout') - inputs = tf.ones((5, 5)) - dropped = dp(inputs, training=True) - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_output = sess.run(dropped, feed_dict={rate: 0.5}) - self.assertAlmostEqual(0., np_output.min()) - np_output = sess.run(dropped, feed_dict={rate: 0.0}) - self.assertAllClose(np.ones((5, 5)), np_output) + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDropoutProperties(self): + dp = core_layers.Dropout(0.5, name="dropout") + self.assertEqual(dp.rate, 0.5) + self.assertEqual(dp.noise_shape, None) + dp(tf.ones(())) + self.assertEqual(dp.name, "dropout") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBooleanLearningPhase(self): + dp = core_layers.Dropout(0.5) + inputs = tf.ones((5, 3)) + dropped = dp(inputs, training=True) + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_output = self.evaluate(dropped) + self.assertAlmostEqual(0.0, np_output.min()) + dropped = 
dp(inputs, training=False) + np_output = self.evaluate(dropped) + self.assertAllClose(np.ones((5, 3)), np_output) + + @tf_test_utils.run_deprecated_v1 + def testDynamicLearningPhase(self): + with self.cached_session() as sess: + dp = core_layers.Dropout(0.5, seed=1) + inputs = tf.ones((5, 5)) + training = tf.compat.v1.placeholder(dtype="bool") + dropped = dp(inputs, training=training) + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_output = sess.run(dropped, feed_dict={training: True}) + self.assertAlmostEqual(0.0, np_output.min()) + np_output = sess.run(dropped, feed_dict={training: False}) + self.assertAllClose(np.ones((5, 5)), np_output) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDynamicNoiseShape(self): + inputs = tf.ones((5, 3, 2)) + noise_shape = [None, 1, None] + dp = core_layers.Dropout(0.5, noise_shape=noise_shape, seed=1) + dropped = dp(inputs, training=True) + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_output = self.evaluate(dropped) + self.assertAlmostEqual(0.0, np_output.min()) + self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :]) + + def testCustomNoiseShape(self): + inputs = tf.ones((5, 3, 2)) + noise_shape = [5, 1, 2] + dp = core_layers.Dropout(0.5, noise_shape=noise_shape, seed=1) + dropped = dp(inputs, training=True) + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_output = self.evaluate(dropped) + self.assertAlmostEqual(0.0, np_output.min()) + self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :]) + + @tf_test_utils.run_deprecated_v1 + def testFunctionalDropout(self): + with self.cached_session(): + inputs = tf.ones((5, 5)) + dropped = core_layers.dropout(inputs, 0.5, training=True, seed=1) + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_output = self.evaluate(dropped) + self.assertAlmostEqual(0.0, np_output.min()) + dropped = core_layers.dropout(inputs, 0.5, training=False, seed=1) + np_output = self.evaluate(dropped) + self.assertAllClose(np.ones((5, 5)), np_output) + + @tf_test_utils.run_deprecated_v1 + def testDynamicRate(self): + with self.cached_session() as sess: + rate = tf.compat.v1.placeholder(dtype="float32", name="rate") + dp = core_layers.Dropout(rate, name="dropout") + inputs = tf.ones((5, 5)) + dropped = dp(inputs, training=True) + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_output = sess.run(dropped, feed_dict={rate: 0.5}) + self.assertAlmostEqual(0.0, np_output.min()) + np_output = sess.run(dropped, feed_dict={rate: 0.0}) + self.assertAllClose(np.ones((5, 5)), np_output) class FlattenTest(tf.test.TestCase): - - @tf_test_utils.run_deprecated_v1 - def testCreateFlatten(self): - with self.cached_session() as sess: - x = tf.compat.v1.placeholder(shape=(None, 2, 3), dtype='float32') - y = core_layers.Flatten()(x) - np_output = sess.run(y, feed_dict={x: np.zeros((3, 2, 3))}) - self.assertEqual(list(np_output.shape), [3, 6]) - self.assertEqual(y.get_shape().as_list(), [None, 6]) - - x = tf.compat.v1.placeholder(shape=(1, 2, 3, 2), dtype='float32') - y = core_layers.Flatten()(x) - np_output = sess.run(y, feed_dict={x: np.zeros((1, 2, 3, 2))}) - self.assertEqual(list(np_output.shape), [1, 12]) - self.assertEqual(y.get_shape().as_list(), [1, 12]) - - def testComputeShape(self): - shape = core_layers.Flatten().compute_output_shape((1, 2, 3, 2)) - self.assertEqual(shape.as_list(), [1, 12]) - - shape = core_layers.Flatten().compute_output_shape((None, 3, 2)) - self.assertEqual(shape.as_list(), [None, 6]) - 
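The shape contract exercised by `testComputeShape` above is easy to check in isolation. A minimal sketch, assuming the public `tf.keras.layers.Flatten` implements the same contract as the legacy `core_layers.Flatten` under test: the batch dimension passes through unchanged, all remaining dimensions are multiplied out, and any unknown factor collapses the result to `None`.

```python
import tensorflow as tf

# Minimal sketch (assumes tf.keras.layers.Flatten matches the legacy layer):
# dim 0 is preserved, the rest are multiplied out, unknown dims become None.
layer = tf.keras.layers.Flatten()
print(layer.compute_output_shape((1, 2, 3, 2)).as_list())     # [1, 12]
print(layer.compute_output_shape((None, 3, 2)).as_list())     # [None, 6]
print(layer.compute_output_shape((None, 3, None)).as_list())  # [None, None]
```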
- shape = core_layers.Flatten().compute_output_shape((None, 3, None)) - self.assertEqual(shape.as_list(), [None, None]) - - @tf_test_utils.run_deprecated_v1 - def testDataFormat5d(self): - np_input_channels_last = np.arange( - 120, dtype='float32').reshape([1, 5, 4, 3, 2]) - - with self.test_session() as sess: - x = tf.compat.v1.placeholder(shape=(1, 5, 4, 3, 2), dtype='float32') - y = core_layers.Flatten(data_format='channels_last')(x) - np_output_cl = sess.run(y, feed_dict={x: np_input_channels_last}) - - x = tf.compat.v1.placeholder(shape=(1, 2, 5, 4, 3), dtype='float32') - y = core_layers.Flatten(data_format='channels_first')(x) - np_input_channels_first = np.transpose(np_input_channels_last, - [0, 4, 1, 2, 3]) - np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first}) - - self.assertAllEqual(np_output_cl, np_output_cf) - - @tf_test_utils.run_deprecated_v1 - def testDataFormat4d(self): - np_input_channels_last = np.arange( - 24, dtype='float32').reshape([1, 4, 3, 2]) - - with self.test_session() as sess: - x = tf.compat.v1.placeholder(shape=(1, 4, 3, 2), dtype='float32') - y = core_layers.Flatten(data_format='channels_last')(x) - np_output_cl = sess.run(y, feed_dict={x: np_input_channels_last}) - - x = tf.compat.v1.placeholder(shape=(1, 2, 4, 3), dtype='float32') - y = core_layers.Flatten(data_format='channels_first')(x) - np_input_channels_first = np.transpose(np_input_channels_last, - [0, 3, 1, 2]) - np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first}) - - self.assertAllEqual(np_output_cl, np_output_cf) - - @tf_test_utils.run_deprecated_v1 - def testFunctionalFlatten(self): - x = tf.compat.v1.placeholder(shape=(None, 2, 3), dtype='float32') - y = core_layers.flatten(x, name='flatten') - self.assertEqual(y.get_shape().as_list(), [None, 6]) - - @tf_test_utils.run_deprecated_v1 - def testFlatten0D(self): - x = tf.compat.v1.placeholder(shape=(None,), dtype='float32') - y = core_layers.Flatten()(x) - with self.cached_session() as sess: - np_output = sess.run(y, feed_dict={x: np.zeros((5,))}) - self.assertEqual(list(np_output.shape), [5, 1]) - self.assertEqual(y.shape.as_list(), [None, 1]) - - @tf_test_utils.run_deprecated_v1 - def testFlattenUnknownAxes(self): - with self.cached_session() as sess: - x = tf.compat.v1.placeholder(shape=(5, None, None), dtype='float32') - y = core_layers.Flatten()(x) - np_output = sess.run(y, feed_dict={x: np.zeros((5, 2, 3))}) - self.assertEqual(list(np_output.shape), [5, 6]) - self.assertEqual(y.get_shape().as_list(), [5, None]) - - x = tf.compat.v1.placeholder(shape=(5, None, 2), dtype='float32') - y = core_layers.Flatten()(x) - np_output = sess.run(y, feed_dict={x: np.zeros((5, 3, 2))}) - self.assertEqual(list(np_output.shape), [5, 6]) - self.assertEqual(y.get_shape().as_list(), [5, None]) - - @tf_test_utils.run_deprecated_v1 - def testFlattenLargeDim(self): - if any(platform.win32_ver()): - self.skipTest('values are truncated on windows causing test failures') - - x = tf.compat.v1.placeholder(shape=(None, 21316, 21316, 80), dtype='float32') - y = core_layers.Flatten()(x) - self.assertEqual(y.shape.as_list(), [None, 21316 * 21316 * 80]) - - @tf_test_utils.run_deprecated_v1 - def testFlattenLargeBatchDim(self): - batch_size = np.iinfo(np.int32).max + 10 - x = tf.compat.v1.placeholder( - shape=(batch_size, None, None, 1), dtype='float32') - y = core_layers.Flatten()(x) - self.assertEqual(y.shape.as_list(), [batch_size, None]) - - -if __name__ == '__main__': - tf.test.main() + @tf_test_utils.run_deprecated_v1 + def 
testCreateFlatten(self): + with self.cached_session() as sess: + x = tf.compat.v1.placeholder(shape=(None, 2, 3), dtype="float32") + y = core_layers.Flatten()(x) + np_output = sess.run(y, feed_dict={x: np.zeros((3, 2, 3))}) + self.assertEqual(list(np_output.shape), [3, 6]) + self.assertEqual(y.get_shape().as_list(), [None, 6]) + + x = tf.compat.v1.placeholder(shape=(1, 2, 3, 2), dtype="float32") + y = core_layers.Flatten()(x) + np_output = sess.run(y, feed_dict={x: np.zeros((1, 2, 3, 2))}) + self.assertEqual(list(np_output.shape), [1, 12]) + self.assertEqual(y.get_shape().as_list(), [1, 12]) + + def testComputeShape(self): + shape = core_layers.Flatten().compute_output_shape((1, 2, 3, 2)) + self.assertEqual(shape.as_list(), [1, 12]) + + shape = core_layers.Flatten().compute_output_shape((None, 3, 2)) + self.assertEqual(shape.as_list(), [None, 6]) + + shape = core_layers.Flatten().compute_output_shape((None, 3, None)) + self.assertEqual(shape.as_list(), [None, None]) + + @tf_test_utils.run_deprecated_v1 + def testDataFormat5d(self): + np_input_channels_last = np.arange(120, dtype="float32").reshape( + [1, 5, 4, 3, 2] + ) + + with self.test_session() as sess: + x = tf.compat.v1.placeholder(shape=(1, 5, 4, 3, 2), dtype="float32") + y = core_layers.Flatten(data_format="channels_last")(x) + np_output_cl = sess.run(y, feed_dict={x: np_input_channels_last}) + + x = tf.compat.v1.placeholder(shape=(1, 2, 5, 4, 3), dtype="float32") + y = core_layers.Flatten(data_format="channels_first")(x) + np_input_channels_first = np.transpose( + np_input_channels_last, [0, 4, 1, 2, 3] + ) + np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first}) + + self.assertAllEqual(np_output_cl, np_output_cf) + + @tf_test_utils.run_deprecated_v1 + def testDataFormat4d(self): + np_input_channels_last = np.arange(24, dtype="float32").reshape( + [1, 4, 3, 2] + ) + + with self.test_session() as sess: + x = tf.compat.v1.placeholder(shape=(1, 4, 3, 2), dtype="float32") + y = core_layers.Flatten(data_format="channels_last")(x) + np_output_cl = sess.run(y, feed_dict={x: np_input_channels_last}) + + x = tf.compat.v1.placeholder(shape=(1, 2, 4, 3), dtype="float32") + y = core_layers.Flatten(data_format="channels_first")(x) + np_input_channels_first = np.transpose( + np_input_channels_last, [0, 3, 1, 2] + ) + np_output_cf = sess.run(y, feed_dict={x: np_input_channels_first}) + + self.assertAllEqual(np_output_cl, np_output_cf) + + @tf_test_utils.run_deprecated_v1 + def testFunctionalFlatten(self): + x = tf.compat.v1.placeholder(shape=(None, 2, 3), dtype="float32") + y = core_layers.flatten(x, name="flatten") + self.assertEqual(y.get_shape().as_list(), [None, 6]) + + @tf_test_utils.run_deprecated_v1 + def testFlatten0D(self): + x = tf.compat.v1.placeholder(shape=(None,), dtype="float32") + y = core_layers.Flatten()(x) + with self.cached_session() as sess: + np_output = sess.run(y, feed_dict={x: np.zeros((5,))}) + self.assertEqual(list(np_output.shape), [5, 1]) + self.assertEqual(y.shape.as_list(), [None, 1]) + + @tf_test_utils.run_deprecated_v1 + def testFlattenUnknownAxes(self): + with self.cached_session() as sess: + x = tf.compat.v1.placeholder(shape=(5, None, None), dtype="float32") + y = core_layers.Flatten()(x) + np_output = sess.run(y, feed_dict={x: np.zeros((5, 2, 3))}) + self.assertEqual(list(np_output.shape), [5, 6]) + self.assertEqual(y.get_shape().as_list(), [5, None]) + + x = tf.compat.v1.placeholder(shape=(5, None, 2), dtype="float32") + y = core_layers.Flatten()(x) + np_output = sess.run(y, feed_dict={x: 
np.zeros((5, 3, 2))}) + self.assertEqual(list(np_output.shape), [5, 6]) + self.assertEqual(y.get_shape().as_list(), [5, None]) + + @tf_test_utils.run_deprecated_v1 + def testFlattenLargeDim(self): + if any(platform.win32_ver()): + self.skipTest( + "values are truncated on windows causing test failures" + ) + + x = tf.compat.v1.placeholder( + shape=(None, 21316, 21316, 80), dtype="float32" + ) + y = core_layers.Flatten()(x) + self.assertEqual(y.shape.as_list(), [None, 21316 * 21316 * 80]) + + @tf_test_utils.run_deprecated_v1 + def testFlattenLargeBatchDim(self): + batch_size = np.iinfo(np.int32).max + 10 + x = tf.compat.v1.placeholder( + shape=(batch_size, None, None, 1), dtype="float32" + ) + y = core_layers.Flatten()(x) + self.assertEqual(y.shape.as_list(), [batch_size, None]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/legacy_tf_layers/migration_utils.py b/keras/legacy_tf_layers/migration_utils.py index 8d9c43d5837d..e1467beb66c0 100644 --- a/keras/legacy_tf_layers/migration_utils.py +++ b/keras/legacy_tf_layers/migration_utils.py @@ -9,95 +9,108 @@ import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.util.tf_export import keras_export @keras_export(v1=["keras.utils.DeterministicRandomTestTool"]) class DeterministicRandomTestTool(object): - """DeterministicRandomTestTool is a testing tool. - - This tool is used to validate random number generation semantics match between - TF1.x graphs/sessions and eager execution. - - This is useful when you are migrating from TF 1.x to TF2 and need to make sure - your computation is still happening correctly along the way. See the - validating correctness migration guide for more info : - https://www.tensorflow.org/guide/migrate/validate_correctness - - The following DeterministicRandomTestTool object provides a context manager - scope() that can make stateful random operations use the same seed across both - TF1 graphs/sessions and eager execution,The tool provides two testing modes: - - constant which uses the same seed for every single operation no matter how - many times it has been called and, - - num_random_ops which uses the number of previously-observed stateful random - operations as the operation seed. - The num_random_ops mode serves as a more sensitive validation check than the - constant mode. It ensures that the random numbers initialization does not get - accidentaly reused.(for example if several weights take on the same - initializations), you can use the num_random_ops mode to avoid this. In the - num_random_ops mode, the generated random numbers will depend on the ordering - of random ops in the program. - - This applies both to the stateful random operations used for creating and - initializing variables, and to the stateful random operations used in - computation (such as for dropout layers). - """ - - def __init__(self, seed: int = 42, mode="constant"): - """Set mode to 'constant' or 'num_random_ops'. Defaults to 'constant'.""" - if mode not in {"constant", "num_random_ops"}: - raise ValueError("Mode arg must be 'constant' or 'num_random_ops'. 
" + - "Got: {}".format(mode)) - self.seed_implementation = sys.modules[tf.compat.v1.get_seed.__module__] - self._mode = mode - self._seed = seed - self.operation_seed = 0 - self._observed_seeds = set() - - @property - def operation_seed(self): - return self._operation_seed - - @operation_seed.setter - def operation_seed(self, value): - self._operation_seed = value - - def scope(self): - """set random seed.""" - - tf.random.set_seed(self._seed) - def _get_seed(_): - """Wraps TF get_seed to make deterministic random generation easier. - - This makes a variable's initialization (and calls that involve random - number generation) depend only on how many random number generations - were used in the scope so far, rather than on how many unrelated - operations the graph contains. - - Returns: - Random seed tuple. - """ - op_seed = self._operation_seed - if self._mode == "constant": - tf.random.set_seed(op_seed) - else: - if op_seed in self._observed_seeds: - raise ValueError( - "This `DeterministicRandomTestTool` object is trying to re-use the " - + "already-used operation seed {}. ".format(op_seed) + - "It cannot guarantee random numbers will match between eager " + - "and sessions when an operation seed is reused. " + - "You most likely set " + - "`operation_seed` explicitly but used a value that caused the " + - "naturally-incrementing operation seed sequences to overlap " + - "with an already-used seed.") - - self._observed_seeds.add(op_seed) - self._operation_seed += 1 - - return (self._seed, op_seed) - # mock.patch internal symbols to modify the behavior of TF APIs relying on - # them - - return tf.compat.v1.test.mock.patch.object( - self.seed_implementation, "get_seed", wraps=_get_seed) + """DeterministicRandomTestTool is a testing tool. + + This tool is used to validate random number generation semantics match + between TF1.x graphs/sessions and eager execution. + + This is useful when you are migrating from TF 1.x to TF2 and need to make + sure your computation is still happening correctly along the way. See the + validating correctness migration guide for more info: + https://www.tensorflow.org/guide/migrate/validate_correctness + + The following DeterministicRandomTestTool object provides a context manager + scope() that can make stateful random operations use the same seed across + both TF1 graphs/sessions and eager execution,The tool provides two testing + modes: + - constant which uses the same seed for every single operation no matter how + many times it has been called and, + - num_random_ops which uses the number of previously-observed stateful + random operations as the operation seed. + The num_random_ops mode serves as a more sensitive validation check than the + constant mode. It ensures that the random numbers initialization does not + get accidentaly reused.(for example if several weights take on the same + initializations), you can use the num_random_ops mode to avoid this. In the + num_random_ops mode, the generated random numbers will depend on the + ordering of random ops in the program. + + This applies both to the stateful random operations used for creating and + initializing variables, and to the stateful random operations used in + computation (such as for dropout layers). + + Args: + mode: Set mode to 'constant' or 'num_random_ops'. Defaults to + 'constant'. + seed: The random seed to use. 
+ """ + + def __init__(self, seed: int = 42, mode="constant"): + if mode not in {"constant", "num_random_ops"}: + raise ValueError( + "Mode arg must be 'constant' or 'num_random_ops'. " + + f"Got: {mode}" + ) + self.seed_implementation = sys.modules[tf.compat.v1.get_seed.__module__] + self._mode = mode + self._seed = seed + self.operation_seed = 0 + self._observed_seeds = set() + + @property + def operation_seed(self): + return self._operation_seed + + @operation_seed.setter + def operation_seed(self, value): + self._operation_seed = value + + def scope(self): + """set random seed.""" + + tf.random.set_seed(self._seed) + + def _get_seed(_): + """Wraps TF get_seed to make deterministic random generation easier. + + This makes a variable's initialization (and calls that involve + random number generation) depend only on how many random number + generations were used in the scope so far, rather than on how many + unrelated operations the graph contains. + + Returns: + Random seed tuple. + """ + op_seed = self._operation_seed + if self._mode == "constant": + tf.random.set_seed(op_seed) + else: + if op_seed in self._observed_seeds: + raise ValueError( + "This `DeterministicRandomTestTool` " + "object is trying to re-use the " + + f"already-used operation seed {op_seed}. " + + "It cannot guarantee random numbers will match " + + "between eager and sessions when an operation seed " + + "is reused. You most likely set " + + "`operation_seed` explicitly but used a value that " + + "caused the naturally-incrementing operation seed " + + "sequences to overlap with an already-used seed." + ) + + self._observed_seeds.add(op_seed) + self._operation_seed += 1 + + return (self._seed, op_seed) + + # mock.patch internal symbols to modify the behavior of TF APIs relying + # on them + + return tf.compat.v1.test.mock.patch.object( + self.seed_implementation, "get_seed", wraps=_get_seed + ) diff --git a/keras/legacy_tf_layers/migration_utils_test.py b/keras/legacy_tf_layers/migration_utils_test.py index 18c6e0242a01..3d024ceb2bdf 100644 --- a/keras/legacy_tf_layers/migration_utils_test.py +++ b/keras/legacy_tf_layers/migration_utils_test.py @@ -1,215 +1,206 @@ """Tests for migration_utils.""" -from keras.initializers import GlorotUniform as V2GlorotUniform -from keras.legacy_tf_layers import migration_utils import tensorflow as tf +from keras.legacy_tf_layers import migration_utils -class DeterministicRandomTestToolTest(tf.test.TestCase): - def test_constant_mode_no_seed(self): - """Test random tensor generation consistancy in constant mode. - - Verify that the random tensor generated without using the seed is - consistant between graph and eager mode - """ - - # Generate three random tensors to show how the stateful random number - # generation and glorot_uniform_initializer match between sessions and - # eager execution. 
- random_tool = migration_utils.DeterministicRandomTestTool() - with random_tool.scope(): - graph = tf.Graph() - with graph.as_default(), tf.compat.v1.Session(graph=graph) as sess: - a = tf.compat.v1.random.uniform(shape=(3, 1)) - # adding additional computation/ops to the graph and ensuring consistant - # random number generation - a = a * 3 - b = tf.compat.v1.random.uniform(shape=(3, 3)) - b = b * 3 - c = tf.compat.v1.random.uniform(shape=(3, 3)) - c = c * 3 - d = tf.compat.v1.glorot_uniform_initializer()( - shape=(6, 6), dtype=tf.float32) - graph_a, graph_b, graph_c, graph_d = sess.run([a, b, c, d]) - - a = tf.compat.v2.random.uniform(shape=(3, 1)) - a = a * 3 - b = tf.compat.v2.random.uniform(shape=(3, 3)) - b = b * 3 - c = tf.compat.v2.random.uniform(shape=(3, 3)) - c = c * 3 - d = V2GlorotUniform()(shape=(6, 6), dtype=tf.float32) - # validate that the generated random tensors match - self.assertAllClose(graph_a, a) - self.assertAllClose(graph_b, b) - self.assertAllClose(graph_c, c) - self.assertAllClose(graph_d, d) - # In constant mode, because b and c were generated with the same seed within - # the same scope and have the same shape, they will have exactly the same - # values. - # validate that b and c are the same, also graph_b and graph_c - self.assertAllClose(b, c) - self.assertAllClose(graph_b, graph_c) - - def test_constant_mode_seed_argument(self): - """Test random tensor generation consistancy in constant mode. - - Verify that the random tensor generated by setting the global seeed - in the args is consistant between graph and eager mode. - """ - random_tool = migration_utils.DeterministicRandomTestTool() - with random_tool.scope(): - graph = tf.Graph() - with graph.as_default(), tf.compat.v1.Session(graph=graph) as sess: - # adding additional computation/ops to the graph and ensuring consistant - # random number generation - a = tf.compat.v1.random.uniform(shape=(3, 1), seed=1234) - a = a * 3 - b = tf.compat.v1.random.uniform(shape=(3, 3), seed=1234) - b = b * 3 - c = tf.compat.v1.glorot_uniform_initializer(seed=1234)( - shape=(6, 6), dtype=tf.float32) - graph_a, graph_b, graph_c = sess.run([a, b, c]) - a = tf.compat.v2.random.uniform(shape=(3, 1), seed=1234) - a = a * 3 - b = tf.compat.v2.random.uniform(shape=(3, 3), seed=1234) - b = b * 3 - c = V2GlorotUniform(seed=1234)(shape=(6, 6), dtype=tf.float32) - - # validate that the generated random tensors match - self.assertAllClose(graph_a, a) - self.assertAllClose(graph_b, b) - self.assertAllClose(graph_c, c) - - def test_num_rand_ops(self): - """Test random tensor generation consistancy in num_random_ops mode. - - Verify that the random tensor generated without using the seed is - consistant between graph and eager mode. 
- Random tensor generated should be different based on random ops ordering - """ - random_tool = migration_utils.DeterministicRandomTestTool( - mode="num_random_ops") - with random_tool.scope(): - graph = tf.Graph() - with graph.as_default(), tf.compat.v1.Session(graph=graph) as sess: - # adding additional computation/ops to the graph and ensuring consistant - # random number generation - a = tf.compat.v1.random.uniform(shape=(3, 1)) - a = a * 3 - b = tf.compat.v1.random.uniform(shape=(3, 3)) - b = b * 3 - c = tf.compat.v1.random.uniform(shape=(3, 3)) - c = c * 3 - d = tf.compat.v1.glorot_uniform_initializer()( - shape=(6, 6), dtype=tf.float32) - graph_a, graph_b, graph_c, graph_d = sess.run([a, b, c, d]) - - random_tool = migration_utils.DeterministicRandomTestTool( - mode="num_random_ops") - with random_tool.scope(): - a = tf.compat.v2.random.uniform(shape=(3, 1)) - a = a * 3 - b = tf.compat.v2.random.uniform(shape=(3, 3)) - b = b * 3 - c = tf.compat.v2.random.uniform(shape=(3, 3)) - c = c * 3 - d = V2GlorotUniform()(shape=(6, 6), dtype=tf.float32) - # validate that the generated random tensors match - self.assertAllClose(graph_a, a) - self.assertAllClose(graph_b, b) - self.assertAllClose(graph_c, c) - self.assertAllClose(graph_d, d) - # validate that the tensors differ based on ops ordering - self.assertNotAllClose(b, c) - self.assertNotAllClose(graph_b, graph_c) - - def test_num_rand_ops_program_order(self): - """Test random tensor generation consistancy in num_random_ops mode. - - validate that in this mode random number generation is sensitive to program - order, so the generated random tesnors should not match. - """ - random_tool = migration_utils.DeterministicRandomTestTool( - mode="num_random_ops") - with random_tool.scope(): - a = tf.random.uniform(shape=(3, 1)) - # adding additional computation/ops to the graph and ensuring consistant - # random number generation - a = a * 3 - b = tf.random.uniform(shape=(3, 3)) - b = b * 3 - - random_tool = migration_utils.DeterministicRandomTestTool( - mode="num_random_ops") - with random_tool.scope(): - b_prime = tf.random.uniform(shape=(3, 3)) - # adding additional computation/ops to the graph and ensuring consistant - # random number generation - b_prime = b_prime * 3 - a_prime = tf.random.uniform(shape=(3, 1)) - a_prime = a_prime * 3 - # validate that the tensors are different - self.assertNotAllClose(a, a_prime) - self.assertNotAllClose(b, b_prime) - - def test_num_rand_ops_operation_seed(self): - """Test random tensor generation consistancy in num_random_ops mode. - - validate if random number generation match across two different program - orders. - """ - random_tool = migration_utils.DeterministicRandomTestTool( - mode="num_random_ops") - with random_tool.scope(): - # operation seed = 0 - a = tf.random.uniform(shape=(3, 1)) - a = a * 3 - # operation seed = 1 - b = tf.random.uniform(shape=(3, 3)) - b = b * 3 - - random_tool = migration_utils.DeterministicRandomTestTool( - mode="num_random_ops") - with random_tool.scope(): - random_tool.operation_seed = 1 - b_prime = tf.random.uniform(shape=(3, 3)) - b_prime = b_prime * 3 - random_tool.operation_seed = 0 - a_prime = tf.random.uniform(shape=(3, 1)) - a_prime = a_prime * 3 - - self.assertAllClose(a, a_prime) - self.assertAllClose(b, b_prime) - - def test_num_rand_ops_disallow_repeated_ops_seed(self): - """Test random tensor generation consistancy in num_random_ops mode. - - validate if DeterministicRandomTestTool disallows reusing already-used - operation seeds. 
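The test being rewritten here checks the guard in `_get_seed`: once an operation seed has been consumed in `num_random_ops` mode, forcing it back via the `operation_seed` setter raises a `ValueError`. A minimal sketch of that failure mode, again using only the API from this diff:

```python
import tensorflow.compat.v2 as tf

from keras.legacy_tf_layers import migration_utils

tool = migration_utils.DeterministicRandomTestTool(mode="num_random_ops")
with tool.scope():
    a = tf.random.uniform(shape=(3, 1))  # consumes operation seed 0
    tool.operation_seed = 0  # force the next op to reuse seed 0
    try:
        tf.random.uniform(shape=(3, 1))
    except ValueError as err:
        print(err)  # reports the already-used operation seed 0
```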
- """ - random_tool = migration_utils.DeterministicRandomTestTool( - mode="num_random_ops") - with random_tool.scope(): - random_tool.operation_seed = 1 - b_prime = tf.random.uniform(shape=(3, 3)) - b_prime = b_prime * 3 - random_tool.operation_seed = 0 - a_prime = tf.random.uniform(shape=(3, 1)) - a_prime = a_prime * 3 - error_string = "An exception should have been raised before this" - error_raised = "An exception should have been raised before this" - try: - c = tf.random.uniform(shape=(3, 1)) - raise RuntimeError(error_string) - - except ValueError as err: - err_raised = err - - self.assertNotEqual(err_raised, error_string) +class DeterministicRandomTestToolTest(tf.test.TestCase): + def test_constant_mode_no_seed(self): + """Test random tensor generation consistancy in constant mode. + + Verify that the random tensor generated without using the seed is + consistant between graph and eager mode + """ + + # Generate three random tensors to show how the stateful random number + # generation match between sessions and eager execution. + random_tool = migration_utils.DeterministicRandomTestTool() + with random_tool.scope(): + graph = tf.Graph() + with graph.as_default(), tf.compat.v1.Session(graph=graph) as sess: + a = tf.compat.v1.random.uniform(shape=(3, 1)) + # adding additional computation/ops to the graph and ensuring + # consistant random number generation + a = a * 3 + b = tf.compat.v1.random.uniform(shape=(3, 3)) + b = b * 3 + c = tf.compat.v1.random.uniform(shape=(3, 3)) + c = c * 3 + graph_a, graph_b, graph_c = sess.run([a, b, c]) + + a = tf.compat.v2.random.uniform(shape=(3, 1)) + a = a * 3 + b = tf.compat.v2.random.uniform(shape=(3, 3)) + b = b * 3 + c = tf.compat.v2.random.uniform(shape=(3, 3)) + c = c * 3 + # validate that the generated random tensors match + self.assertAllClose(graph_a, a) + self.assertAllClose(graph_b, b) + self.assertAllClose(graph_c, c) + # In constant mode, because b and c were generated with the same seed + # within the same scope and have the same shape, they will have exactly + # the same values. + # validate that b and c are the same, also graph_b and graph_c + self.assertAllClose(b, c) + self.assertAllClose(graph_b, graph_c) + + def test_constant_mode_seed_argument(self): + """Test random tensor generation consistancy in constant mode. + + Verify that the random tensor generated by setting the global seeed + in the args is consistant between graph and eager mode. + """ + random_tool = migration_utils.DeterministicRandomTestTool() + with random_tool.scope(): + graph = tf.Graph() + with graph.as_default(), tf.compat.v1.Session(graph=graph) as sess: + # adding additional computation/ops to the graph and ensuring + # consistant random number generation + a = tf.compat.v1.random.uniform(shape=(3, 1), seed=1234) + a = a * 3 + b = tf.compat.v1.random.uniform(shape=(3, 3), seed=1234) + b = b * 3 + graph_a, graph_b = sess.run([a, b]) + a = tf.compat.v2.random.uniform(shape=(3, 1), seed=1234) + a = a * 3 + b = tf.compat.v2.random.uniform(shape=(3, 3), seed=1234) + b = b * 3 + + # validate that the generated random tensors match + self.assertAllClose(graph_a, a) + self.assertAllClose(graph_b, b) + + def test_num_rand_ops(self): + """Test random tensor generation consistancy in num_random_ops mode. + + Verify that the random tensor generated without using the seed is + consistant between graph and eager mode. 
+ Random tensor generated should be different based on random ops ordering + """ + random_tool = migration_utils.DeterministicRandomTestTool( + mode="num_random_ops" + ) + with random_tool.scope(): + graph = tf.Graph() + with graph.as_default(), tf.compat.v1.Session(graph=graph) as sess: + # adding additional computation/ops to the graph and ensuring + # consistant random number generation + a = tf.compat.v1.random.uniform(shape=(3, 1)) + a = a * 3 + b = tf.compat.v1.random.uniform(shape=(3, 3)) + b = b * 3 + c = tf.compat.v1.random.uniform(shape=(3, 3)) + c = c * 3 + graph_a, graph_b, graph_c = sess.run([a, b, c]) + + random_tool = migration_utils.DeterministicRandomTestTool( + mode="num_random_ops" + ) + with random_tool.scope(): + a = tf.compat.v2.random.uniform(shape=(3, 1)) + a = a * 3 + b = tf.compat.v2.random.uniform(shape=(3, 3)) + b = b * 3 + c = tf.compat.v2.random.uniform(shape=(3, 3)) + c = c * 3 + # validate that the generated random tensors match + self.assertAllClose(graph_a, a) + self.assertAllClose(graph_b, b) + self.assertAllClose(graph_c, c) + # validate that the tensors differ based on ops ordering + self.assertNotAllClose(b, c) + self.assertNotAllClose(graph_b, graph_c) + + def test_num_rand_ops_program_order(self): + """Test random tensor generation consistancy in num_random_ops mode. + + validate that in this mode random number generation is sensitive to + program order, so the generated random tesnors should not match. + """ + random_tool = migration_utils.DeterministicRandomTestTool( + mode="num_random_ops" + ) + with random_tool.scope(): + a = tf.random.uniform(shape=(3, 1)) + # adding additional computation/ops to the graph and ensuring + # consistant random number generation + a = a * 3 + b = tf.random.uniform(shape=(3, 3)) + b = b * 3 + + random_tool = migration_utils.DeterministicRandomTestTool( + mode="num_random_ops" + ) + with random_tool.scope(): + b_prime = tf.random.uniform(shape=(3, 3)) + # adding additional computation/ops to the graph and ensuring + # consistant random number generation + b_prime = b_prime * 3 + a_prime = tf.random.uniform(shape=(3, 1)) + a_prime = a_prime * 3 + # validate that the tensors are different + self.assertNotAllClose(a, a_prime) + self.assertNotAllClose(b, b_prime) + + def test_num_rand_ops_operation_seed(self): + """Test random tensor generation consistancy in num_random_ops mode. + + validate if random number generation match across two different program + orders. + """ + random_tool = migration_utils.DeterministicRandomTestTool( + mode="num_random_ops" + ) + with random_tool.scope(): + # operation seed = 0 + a = tf.random.uniform(shape=(3, 1)) + a = a * 3 + # operation seed = 1 + b = tf.random.uniform(shape=(3, 3)) + b = b * 3 + + random_tool = migration_utils.DeterministicRandomTestTool( + mode="num_random_ops" + ) + with random_tool.scope(): + random_tool.operation_seed = 1 + b_prime = tf.random.uniform(shape=(3, 3)) + b_prime = b_prime * 3 + random_tool.operation_seed = 0 + a_prime = tf.random.uniform(shape=(3, 1)) + a_prime = a_prime * 3 + + self.assertAllClose(a, a_prime) + self.assertAllClose(b, b_prime) + + def test_num_rand_ops_disallow_repeated_ops_seed(self): + """Test random tensor generation consistancy in num_random_ops mode. + + validate if DeterministicRandomTestTool disallows reusing already-used + operation seeds. 
+ """ + random_tool = migration_utils.DeterministicRandomTestTool( + mode="num_random_ops" + ) + with random_tool.scope(): + random_tool.operation_seed = 1 + b_prime = tf.random.uniform(shape=(3, 3)) + b_prime = b_prime * 3 + random_tool.operation_seed = 0 + a_prime = tf.random.uniform(shape=(3, 1)) + a_prime = a_prime * 3 + error_string = "An exception should have been raised before this" + try: + tf.random.uniform(shape=(3, 1)) + raise RuntimeError(error_string) + + except ValueError as err: + err_raised = err + + self.assertNotEqual(err_raised, error_string) if __name__ == "__main__": - tf.test.main() - + tf.test.main() diff --git a/keras/legacy_tf_layers/normalization.py b/keras/legacy_tf_layers/normalization.py index 23d0652d34fa..c11f6457b2c1 100644 --- a/keras/legacy_tf_layers/normalization.py +++ b/keras/legacy_tf_layers/normalization.py @@ -12,198 +12,441 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= -# pylint: disable=g-classes-have-attributes + """Contains the normalization layer classes and their functional aliases.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import warnings +import tensorflow.compat.v2 as tf + from keras.layers.normalization import batch_normalization_v1 from keras.legacy_tf_layers import base + +# isort: off from tensorflow.python.util.tf_export import keras_export from tensorflow.python.util.tf_export import tf_export -@keras_export(v1=['keras.__internal__.legacy.layers.BatchNormalization']) -@tf_export(v1=['layers.BatchNormalization']) +@keras_export(v1=["keras.__internal__.legacy.layers.BatchNormalization"]) +@tf_export(v1=["layers.BatchNormalization"]) class BatchNormalization(batch_normalization_v1.BatchNormalization, base.Layer): - """Batch Normalization layer from (Ioffe et al., 2015). - - Keras APIs handle BatchNormalization updates to the moving_mean and - moving_variance as part of their `fit()` and `evaluate()` loops. However, if a - custom training loop is used with an instance of `Model`, these updates need - to be explicitly included. Here's a simple example of how it can be done: - - ```python - # model is an instance of Model that contains BatchNormalization layer. - update_ops = model.get_updates_for(None) + model.get_updates_for(features) - train_op = optimizer.minimize(loss) - train_op = tf.group([train_op, update_ops]) - ``` - - Args: - axis: An `int` or list of `int`, the axis or axes that should be normalized, - typically the features axis/axes. For instance, after a `Conv2D` layer - with `data_format="channels_first"`, set `axis=1`. If a list of axes is - provided, each axis in `axis` will be normalized + """Batch Normalization layer from (Ioffe et al., 2015). + + Keras APIs handle BatchNormalization updates to the moving_mean and + moving_variance as part of their `fit()` and `evaluate()` loops. However, if + a custom training loop is used with an instance of `Model`, these updates + need to be explicitly included. Here's a simple example of how it can be + done: + + ```python + # model is an instance of Model that contains BatchNormalization layer. 
+ update_ops = model.get_updates_for(None) + model.get_updates_for(features) + train_op = optimizer.minimize(loss) + train_op = tf.group([train_op, update_ops]) + ``` + + Args: + axis: An `int` or list of `int`, the axis or axes that should be + normalized, typically the features axis/axes. For instance, after a + `Conv2D` layer with `data_format="channels_first"`, set `axis=1`. If a + list of axes is provided, each axis in `axis` will be normalized simultaneously. Default is `-1` which uses the last axis. Note: when - using multi-axis batch norm, the `beta`, `gamma`, `moving_mean`, and - `moving_variance` variables are the same rank as the input Tensor, - with dimension size 1 in all reduced (non-axis) dimensions). - momentum: Momentum for the moving average. - epsilon: Small float added to variance to avoid dividing by zero. - center: If True, add offset of `beta` to normalized tensor. If False, `beta` - is ignored. - scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the - next layer is linear (also e.g. `nn.relu`), this can be disabled since the - scaling can be done by the next layer. - beta_initializer: Initializer for the beta weight. - gamma_initializer: Initializer for the gamma weight. - moving_mean_initializer: Initializer for the moving mean. - moving_variance_initializer: Initializer for the moving variance. - beta_regularizer: Optional regularizer for the beta weight. - gamma_regularizer: Optional regularizer for the gamma weight. - beta_constraint: An optional projection function to be applied to the `beta` - weight after being updated by an `Optimizer` (e.g. used to implement norm - constraints or value constraints for layer weights). The function must - take as input the unprojected variable and must return the projected - variable (which must have the same shape). Constraints are not safe to use - when doing asynchronous distributed training. - gamma_constraint: An optional projection function to be applied to the - `gamma` weight after being updated by an `Optimizer`. - renorm: Whether to use Batch Renormalization (Ioffe, 2017). This adds extra - variables during training. The inference is the same for either value of - this parameter. - renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to - scalar `Tensors` used to clip the renorm correction. The correction `(r, - d)` is used as `corrected_value = normalized_value * r + d`, with `r` - clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, - dmax are set to inf, 0, inf, respectively. - renorm_momentum: Momentum used to update the moving means and standard - deviations with renorm. Unlike `momentum`, this affects training and - should be neither too small (which would add noise) nor too large (which - would give stale estimates). Note that `momentum` is still applied to get - the means and variances for inference. - fused: if `None` or `True`, use a faster, fused implementation if possible. - If `False`, use the system recommended implementation. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). - virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`, - which means batch normalization is performed across the whole batch. When - `virtual_batch_size` is not `None`, instead perform "Ghost Batch - Normalization", which creates virtual sub-batches which are each - normalized separately (with shared gamma, beta, and moving statistics). 
- Must divide the actual batch size during execution. - adjustment: A function taking the `Tensor` containing the (dynamic) shape of - the input tensor and returning a pair (scale, bias) to apply to the - normalized values (before gamma and beta), only during training. For - example, if axis==-1, - `adjustment = lambda shape: ( - tf.random.uniform(shape[-1:], 0.93, 1.07), - tf.random.uniform(shape[-1:], -0.1, 0.1))` will scale the normalized - value by up to 7% up or down, then shift the result by up to 0.1 - (with independent scaling and bias for each feature but shared - across all examples), and finally apply gamma and/or beta. If - `None`, no adjustment is applied. Cannot be specified if - virtual_batch_size is specified. - name: A string, the name of the layer. - References: - Batch Normalization - Accelerating Deep Network Training by Reducing + using multi-axis batch norm, the `beta`, `gamma`, `moving_mean`, and + `moving_variance` variables are the same rank as the input Tensor, with + dimension size 1 in all reduced (non-axis) dimensions). + momentum: Momentum for the moving average. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, add offset of `beta` to normalized tensor. If False, + `beta` is ignored. + scale: If True, multiply by `gamma`. If False, `gamma` is not used. When + the next layer is linear (also e.g. `nn.relu`), this can be disabled + since the scaling can be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + moving_mean_initializer: Initializer for the moving mean. + moving_variance_initializer: Initializer for the moving variance. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + beta_constraint: An optional projection function to be applied to the + `beta` weight after being updated by an `Optimizer` (e.g. used to + implement norm constraints or value constraints for layer weights). The + function must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are not + safe to use when doing asynchronous distributed training. + gamma_constraint: An optional projection function to be applied to the + `gamma` weight after being updated by an `Optimizer`. + renorm: Whether to use Batch Renormalization (Ioffe, 2017). This adds + extra variables during training. The inference is the same for either + value of this parameter. + renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to + scalar `Tensors` used to clip the renorm correction. The correction `(r, + d)` is used as `corrected_value = normalized_value * r + d`, with `r` + clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, + dmax are set to inf, 0, inf, respectively. + renorm_momentum: Momentum used to update the moving means and standard + deviations with renorm. Unlike `momentum`, this affects training and + should be neither too small (which would add noise) nor too large (which + would give stale estimates). Note that `momentum` is still applied to + get the means and variances for inference. + fused: if `None` or `True`, use a faster, fused implementation if + possible. If `False`, use the system recommended implementation. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + virtual_batch_size: An `int`. 
By default, `virtual_batch_size` is `None`, + which means batch normalization is performed across the whole batch. + When `virtual_batch_size` is not `None`, instead perform "Ghost Batch + Normalization", which creates virtual sub-batches which are each + normalized separately (with shared gamma, beta, and moving statistics). + Must divide the actual batch size during execution. + adjustment: A function taking the `Tensor` containing the (dynamic) shape + of the input tensor and returning a pair (scale, bias) to apply to the + normalized values (before gamma and beta), only during training. For + example, if axis==-1, + `adjustment = lambda shape: ( + tf.random.uniform(shape[-1:], 0.93, 1.07), + tf.random.uniform(shape[-1:], -0.1, 0.1))` will scale the normalized + value by up to 7% up or down, then shift the result by up to 0.1 + (with independent scaling and bias for each feature but shared + across all examples), and finally apply gamma and/or beta. If + `None`, no adjustment is applied. Cannot be specified if + virtual_batch_size is specified. + name: A string, the name of the layer. + References: + Batch Normalization - Accelerating Deep Network Training by Reducing + Internal Covariate Shift: + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) + Batch Renormalization - Towards Reducing Minibatch Dependence in + Batch-Normalized Models: + [Ioffe, + 2017](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models) + ([pdf](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models.pdf)) + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.BatchNormalization`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. 
+
+    Before:
+
+    ```python
+    bn = tf.compat.v1.layers.BatchNormalization()
+    ```
+
+    After:
+
+    ```python
+    bn = tf.keras.layers.BatchNormalization()
+    ```
+
+    #### How to Map Arguments
+
+    TF1 Arg Name              | TF2 Arg Name              | Note
+    :------------------------ | :------------------------ | :---------------
+    `name`                    | `name`                    | Layer base class
+    `trainable`               | `trainable`               | Layer base class
+    `axis`                    | `axis`                    | -
+    `momentum`                | `momentum`                | -
+    `epsilon`                 | `epsilon`                 | -
+    `center`                  | `center`                  | -
+    `scale`                   | `scale`                   | -
+    `beta_initializer`        | `beta_initializer`        | -
+    `gamma_initializer`       | `gamma_initializer`       | -
+    `moving_mean_initializer` | `moving_mean_initializer` | -
+    `beta_regularizer`        | `beta_regularizer`        | -
+    `gamma_regularizer`       | `gamma_regularizer`       | -
+    `beta_constraint`         | `beta_constraint`         | -
+    `gamma_constraint`        | `gamma_constraint`        | -
+    `renorm`                  | Not supported             | -
+    `renorm_clipping`         | Not supported             | -
+    `renorm_momentum`         | Not supported             | -
+    `fused`                   | Not supported             | -
+    `virtual_batch_size`      | Not supported             | -
+    `adjustment`              | Not supported             | -
+
+    @end_compatibility
+    """
+
+    def __init__(
+        self,
+        axis=-1,
+        momentum=0.99,
+        epsilon=1e-3,
+        center=True,
+        scale=True,
+        beta_initializer=tf.compat.v1.zeros_initializer(),
+        gamma_initializer=tf.compat.v1.ones_initializer(),
+        moving_mean_initializer=tf.compat.v1.zeros_initializer(),
+        moving_variance_initializer=tf.compat.v1.ones_initializer(),
+        beta_regularizer=None,
+        gamma_regularizer=None,
+        beta_constraint=None,
+        gamma_constraint=None,
+        renorm=False,
+        renorm_clipping=None,
+        renorm_momentum=0.99,
+        fused=None,
+        trainable=True,
+        virtual_batch_size=None,
+        adjustment=None,
+        name=None,
+        **kwargs
+    ):
+        super().__init__(
+            axis=axis,
+            momentum=momentum,
+            epsilon=epsilon,
+            center=center,
+            scale=scale,
+            beta_initializer=beta_initializer,
+            gamma_initializer=gamma_initializer,
+            moving_mean_initializer=moving_mean_initializer,
+            moving_variance_initializer=moving_variance_initializer,
+            beta_regularizer=beta_regularizer,
+            gamma_regularizer=gamma_regularizer,
+            beta_constraint=beta_constraint,
+            gamma_constraint=gamma_constraint,
+            renorm=renorm,
+            renorm_clipping=renorm_clipping,
+            renorm_momentum=renorm_momentum,
+            fused=fused,
+            trainable=trainable,
+            virtual_batch_size=virtual_batch_size,
+            adjustment=adjustment,
+            name=name,
+            **kwargs
+        )
+
+    def call(self, inputs, training=False, mask=None):
+        return super().call(inputs, training=training, mask=mask)
+
+
+@keras_export(v1=["keras.__internal__.legacy.layers.batch_normalization"])
+@tf_export(v1=["layers.batch_normalization"])
+def batch_normalization(
+    inputs,
+    axis=-1,
+    momentum=0.99,
+    epsilon=1e-3,
+    center=True,
+    scale=True,
+    beta_initializer=tf.compat.v1.zeros_initializer(),
+    gamma_initializer=tf.compat.v1.ones_initializer(),
+    moving_mean_initializer=tf.compat.v1.zeros_initializer(),
+    moving_variance_initializer=tf.compat.v1.ones_initializer(),
+    beta_regularizer=None,
+    gamma_regularizer=None,
+    beta_constraint=None,
+    gamma_constraint=None,
+    training=False,
+    trainable=True,
+    name=None,
+    reuse=None,
+    renorm=False,
+    renorm_clipping=None,
+    renorm_momentum=0.99,
+    fused=None,
+    virtual_batch_size=None,
+    adjustment=None,
+):
+    """Functional interface for the batch normalization layer from
+    (Ioffe et al., 2015).
+
+    Note: when training, the moving_mean and moving_variance need to be
+    updated. By default the update ops are placed in
+    `tf.GraphKeys.UPDATE_OPS`, so they need to be executed alongside the
+    `train_op`.
Also, be sure to add any + batch_normalization ops before getting the update_ops collection. Otherwise, + update_ops will be empty, and training/inference will not work properly. For + example: + + ```python + x_norm = tf.compat.v1.layers.batch_normalization(x, training=training) + + # ... + + update_ops = tf.compat.v1.get_collection(tf.GraphKeys.UPDATE_OPS) + train_op = optimizer.minimize(loss) + train_op = tf.group([train_op, update_ops]) + ``` + + Args: + inputs: Tensor input. + axis: An `int`, the axis that should be normalized (typically the features + axis). For instance, after a `Convolution2D` layer with + `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. + momentum: Momentum for the moving average. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, add offset of `beta` to normalized tensor. If False, + `beta` is ignored. + scale: If True, multiply by `gamma`. If False, `gamma` is not used. When + the next layer is linear (also e.g. `nn.relu`), this can be disabled + since the scaling can be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + moving_mean_initializer: Initializer for the moving mean. + moving_variance_initializer: Initializer for the moving variance. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + beta_constraint: An optional projection function to be applied to the + `beta` weight after being updated by an `Optimizer` (e.g. used to + implement norm constraints or value constraints for layer weights). The + function must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are not + safe to use when doing asynchronous distributed training. + gamma_constraint: An optional projection function to be applied to the + `gamma` weight after being updated by an `Optimizer`. + training: Either a Python boolean, or a TensorFlow boolean scalar tensor + (e.g. a placeholder). Whether to return the output in training mode + (normalized with statistics of the current batch) or in inference mode + (normalized with moving statistics). **NOTE**: make sure to set this + parameter correctly, or else your training/inference will not work + properly. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). + name: String, the name of the layer. + reuse: Boolean, whether to reuse the weights of a previous layer by the + same name. + renorm: Whether to use Batch Renormalization (Ioffe, 2017). This adds + extra variables during training. The inference is the same for either + value of this parameter. + renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to + scalar `Tensors` used to clip the renorm correction. The correction `(r, + d)` is used as `corrected_value = normalized_value * r + d`, with `r` + clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, + dmax are set to inf, 0, inf, respectively. + renorm_momentum: Momentum used to update the moving means and standard + deviations with renorm. Unlike `momentum`, this affects training and + should be neither too small (which would add noise) nor too large (which + would give stale estimates). Note that `momentum` is still applied to + get the means and variances for inference. 
+ fused: if `None` or `True`, use a faster, fused implementation if + possible. If `False`, use the system recommended implementation. + virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`, + which means batch normalization is performed across the whole batch. + When `virtual_batch_size` is not `None`, instead perform "Ghost Batch + Normalization", which creates virtual sub-batches which are each + normalized separately (with shared gamma, beta, and moving statistics). + Must divide the actual batch size during execution. + adjustment: A function taking the `Tensor` containing the (dynamic) shape + of the input tensor and returning a pair (scale, bias) to apply to the + normalized values (before gamma and beta), only during training. For + example, if axis==-1, + `adjustment = lambda shape: ( + tf.random.uniform(shape[-1:], 0.93, 1.07), + tf.random.uniform(shape[-1:], -0.1, 0.1))` will scale the normalized + value by up to 7% up or down, then shift the result by up to 0.1 + (with independent scaling and bias for each feature but shared + across all examples), and finally apply gamma and/or beta. If + `None`, no adjustment is applied. Cannot be specified if + virtual_batch_size is specified. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + References: + Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift: - [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) - ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) - Batch Renormalization - Towards Reducing Minibatch Dependence in + [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) + ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) + Batch Renormalization - Towards Reducing Minibatch Dependence in Batch-Normalized Models: - [Ioffe, + [Ioffe, 2017](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models) - ([pdf](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models.pdf)) - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.BatchNormalization`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. 
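
For the `virtual_batch_size` argument described above, "Ghost Batch
Normalization" amounts to reshaping the batch into virtual sub-batches and
normalizing each one separately. A NumPy sketch of the training-mode math,
mirroring the reference reshape used by `testGhostBN2Dims` later in this
patch; the helper name and shapes are illustrative.

```python
import numpy as np

def ghost_batch_norm(x, virtual_batch_size, epsilon=1e-3):
    # Reshape (batch, features) to (virtual_batch_size, n_ghosts, features)
    # and normalize over axis 0, as the test's expected values do.
    batch = x.shape[0]
    assert batch % virtual_batch_size == 0
    ghost = x.reshape(
        (virtual_batch_size, batch // virtual_batch_size) + x.shape[1:]
    )
    mean = ghost.mean(axis=0, keepdims=True)
    var = ghost.var(axis=0, keepdims=True)
    return ((ghost - mean) / np.sqrt(var + epsilon)).reshape(x.shape)

y = ghost_batch_norm(np.random.rand(6, 2), virtual_batch_size=3)
```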
-
-  Before:
-
-  ```python
-  bn = tf.compat.v1.layers.BatchNormalization()
-  ```
-
-  After:
-
-  ```python
-  bn = tf.keras.layers.BatchNormalization()
-  ```
-
-  #### How to Map Arguments
-
-  TF1 Arg Name              | TF2 Arg Name              | Note
-  :------------------------ | :------------------------ | :---------------
-  `name`                    | `name`                    | Layer base class
-  `trainable`               | `trainable`               | Layer base class
-  `axis`                    | `axis`                    | -
-  `momentum`                | `momentum`                | -
-  `epsilon`                 | `epsilon`                 | -
-  `center`                  | `center`                  | -
-  `scale`                   | `scale`                   | -
-  `beta_initializer`        | `beta_initializer`        | -
-  `gamma_initializer`       | `gamma_initializer`       | -
-  `moving_mean_initializer` | `moving_mean_initializer` | -
-  `beta_regularizer`        | `beta_regularizer'        | -
-  `gamma_regularizer`       | `gamma_regularizer'       | -
-  `beta_constraint`         | `beta_constraint'         | -
-  `gamma_constraint`        | `gamma_constraint'        | -
-  `renorm`                  | Not supported             | -
-  `renorm_clipping`         | Not supported             | -
-  `renorm_momentum`         | Not supported             | -
-  `fused`                   | Not supported             | -
-  `virtual_batch_size`      | Not supported             | -
-  `adjustment`              | Not supported             | -
-
-  @end_compatibility
-  """
-
-  def __init__(self,
-               axis=-1,
-               momentum=0.99,
-               epsilon=1e-3,
-               center=True,
-               scale=True,
-               beta_initializer=tf.compat.v1.zeros_initializer(),
-               gamma_initializer=tf.compat.v1.ones_initializer(),
-               moving_mean_initializer=tf.compat.v1.zeros_initializer(),
-               moving_variance_initializer=tf.compat.v1.ones_initializer(),
-               beta_regularizer=None,
-               gamma_regularizer=None,
-               beta_constraint=None,
-               gamma_constraint=None,
-               renorm=False,
-               renorm_clipping=None,
-               renorm_momentum=0.99,
-               fused=None,
-               trainable=True,
-               virtual_batch_size=None,
-               adjustment=None,
-               name=None,
-               **kwargs):
-    super().__init__(
+    ([pdf](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models.pdf))
+
+    @compatibility(TF2)
+    This API is a legacy API that is only compatible with eager execution
+    and `tf.function` if you combine it with
+    `tf.compat.v1.keras.utils.track_tf1_style_variables`.
+
+    Please refer to [tf.layers model mapping section of the migration guide]
+    (https://www.tensorflow.org/guide/migrate/model_mapping)
+    to learn how to use your TensorFlow v1 model in TF2 with Keras.
+
+    The corresponding TensorFlow v2 layer is
+    `tf.keras.layers.BatchNormalization`.
+
+    The batch updating pattern with
+    `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used
+    in native TF2. Consult the `tf.keras.layers.BatchNormalization`
+    documentation for further information.
+
+    #### Structural Mapping to Native TF2
+
+    None of the supported arguments have changed name.
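
The `track_tf1_style_variables` route mentioned in the compatibility note is
what makes this legacy layer usable under eager execution and `tf.function`.
A minimal sketch, assuming a `tf.Module` wrapper; `BNModule` and the input
shape are invented for illustration.

```python
import tensorflow as tf

class BNModule(tf.Module):
    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def __call__(self, x, training=False):
        # The decorator tracks the variables the legacy layer creates, so
        # repeated calls reuse them instead of failing in eager mode.
        return tf.compat.v1.layers.batch_normalization(
            x, momentum=0.9, training=training
        )

module = BNModule()
out = module(tf.ones([2, 4]), training=True)
```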
+
+    Before:
+
+    ```python
+    x_norm = tf.compat.v1.layers.batch_normalization(x)
+    ```
+
+    After:
+
+    To migrate code using TF1 functional layers use the [Keras Functional API]
+    (https://www.tensorflow.org/guide/keras/functional):
+
+    ```python
+    x = tf.keras.Input(shape=(28, 28, 1))
+    y = tf.keras.layers.BatchNormalization()(x)
+    model = tf.keras.Model(x, y)
+    ```
+
+    #### How to Map Arguments
+
+    TF1 Arg Name              | TF2 Arg Name              | Note
+    :------------------------ | :------------------------ | :---------------
+    `name`                    | `name`                    | Layer base class
+    `trainable`               | `trainable`               | Layer base class
+    `axis`                    | `axis`                    | -
+    `momentum`                | `momentum`                | -
+    `epsilon`                 | `epsilon`                 | -
+    `center`                  | `center`                  | -
+    `scale`                   | `scale`                   | -
+    `beta_initializer`        | `beta_initializer`        | -
+    `gamma_initializer`       | `gamma_initializer`       | -
+    `moving_mean_initializer` | `moving_mean_initializer` | -
+    `beta_regularizer`        | `beta_regularizer`        | -
+    `gamma_regularizer`       | `gamma_regularizer`       | -
+    `beta_constraint`         | `beta_constraint`         | -
+    `gamma_constraint`        | `gamma_constraint`        | -
+    `renorm`                  | Not supported             | -
+    `renorm_clipping`         | Not supported             | -
+    `renorm_momentum`         | Not supported             | -
+    `fused`                   | Not supported             | -
+    `virtual_batch_size`      | Not supported             | -
+    `adjustment`              | Not supported             | -
+
+    @end_compatibility
+    """
+    warnings.warn(
+        "`tf.layers.batch_normalization` is deprecated and "
+        "will be removed in a future version. "
+        "Please use `tf.keras.layers.BatchNormalization` instead. "
+        "In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` "
+        "should not be used (consult the `tf.keras.layers.BatchNormalization` "
+        "documentation).",
+        stacklevel=2,
+    )
+    layer = BatchNormalization(
         axis=axis,
         momentum=momentum,
         epsilon=epsilon,
@@ -225,242 +468,10 @@ def __init__(self,
         virtual_batch_size=virtual_batch_size,
         adjustment=adjustment,
         name=name,
-        **kwargs)
-
-  def call(self, inputs, training=False):
-    return super().call(inputs, training=training)
-
-
-@keras_export(v1=['keras.__internal__.legacy.layers.batch_normalization'])
-@tf_export(v1=['layers.batch_normalization'])
-def batch_normalization(inputs,
-                        axis=-1,
-                        momentum=0.99,
-                        epsilon=1e-3,
-                        center=True,
-                        scale=True,
-                        beta_initializer=tf.compat.v1.zeros_initializer(),
-                        gamma_initializer=tf.compat.v1.ones_initializer(),
-                        moving_mean_initializer=tf.compat.v1.zeros_initializer(),
-                        moving_variance_initializer=tf.compat.v1.ones_initializer(),
-                        beta_regularizer=None,
-                        gamma_regularizer=None,
-                        beta_constraint=None,
-                        gamma_constraint=None,
-                        training=False,
-                        trainable=True,
-                        name=None,
-                        reuse=None,
-                        renorm=False,
-                        renorm_clipping=None,
-                        renorm_momentum=0.99,
-                        fused=None,
-                        virtual_batch_size=None,
-                        adjustment=None):
-  """Functional interface for the batch normalization layer from_config(Ioffe et al., 2015).
-
-  Note: when training, the moving_mean and moving_variance need to be updated.
-  By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
-  need to be executed alongside the `train_op`. Also, be sure to add any
-  batch_normalization ops before getting the update_ops collection. Otherwise,
-  update_ops will be empty, and training/inference will not work properly. For
-  example:
-
-  ```python
-    x_norm = tf.compat.v1.layers.batch_normalization(x, training=training)
-
-    # ...
-
-    update_ops = tf.compat.v1.get_collection(tf.GraphKeys.UPDATE_OPS)
-    train_op = optimizer.minimize(loss)
-    train_op = tf.group([train_op, update_ops])
-  ```
-
-  Args:
-    inputs: Tensor input.
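
In native TF2 the Keras layer updates its moving statistics as a side effect
of being called with `training=True`, so none of the `UPDATE_OPS` plumbing
carries over. A short end-to-end sketch of the functional-API replacement;
the random data is purely illustrative.

```python
import numpy as np
import tensorflow as tf

inputs = tf.keras.Input(shape=(28, 28, 1))
outputs = tf.keras.layers.BatchNormalization()(inputs)
model = tf.keras.Model(inputs, outputs)

# fit() calls the layer with training=True; moving_mean / moving_variance
# are updated automatically, with no collections or control dependencies.
model.compile(optimizer="sgd", loss="mse")
data = np.random.rand(8, 28, 28, 1).astype("float32")
model.fit(data, data, epochs=1, verbose=0)
```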
- axis: An `int`, the axis that should be normalized (typically the features - axis). For instance, after a `Convolution2D` layer with - `data_format="channels_first"`, set `axis=1` in `BatchNormalization`. - momentum: Momentum for the moving average. - epsilon: Small float added to variance to avoid dividing by zero. - center: If True, add offset of `beta` to normalized tensor. If False, `beta` - is ignored. - scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the - next layer is linear (also e.g. `nn.relu`), this can be disabled since the - scaling can be done by the next layer. - beta_initializer: Initializer for the beta weight. - gamma_initializer: Initializer for the gamma weight. - moving_mean_initializer: Initializer for the moving mean. - moving_variance_initializer: Initializer for the moving variance. - beta_regularizer: Optional regularizer for the beta weight. - gamma_regularizer: Optional regularizer for the gamma weight. - beta_constraint: An optional projection function to be applied to the `beta` - weight after being updated by an `Optimizer` (e.g. used to implement norm - constraints or value constraints for layer weights). The function must - take as input the unprojected variable and must return the projected - variable (which must have the same shape). Constraints are not safe to use - when doing asynchronous distributed training. - gamma_constraint: An optional projection function to be applied to the - `gamma` weight after being updated by an `Optimizer`. - training: Either a Python boolean, or a TensorFlow boolean scalar tensor - (e.g. a placeholder). Whether to return the output in training mode - (normalized with statistics of the current batch) or in inference mode - (normalized with moving statistics). **NOTE**: make sure to set this - parameter correctly, or else your training/inference will not work - properly. - trainable: Boolean, if `True` also add variables to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). - name: String, the name of the layer. - reuse: Boolean, whether to reuse the weights of a previous layer by the same - name. - renorm: Whether to use Batch Renormalization (Ioffe, 2017). This adds extra - variables during training. The inference is the same for either value of - this parameter. - renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to - scalar `Tensors` used to clip the renorm correction. The correction `(r, - d)` is used as `corrected_value = normalized_value * r + d`, with `r` - clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, - dmax are set to inf, 0, inf, respectively. - renorm_momentum: Momentum used to update the moving means and standard - deviations with renorm. Unlike `momentum`, this affects training and - should be neither too small (which would add noise) nor too large (which - would give stale estimates). Note that `momentum` is still applied to get - the means and variances for inference. - fused: if `None` or `True`, use a faster, fused implementation if possible. - If `False`, use the system recommended implementation. - virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`, - which means batch normalization is performed across the whole batch. When - `virtual_batch_size` is not `None`, instead perform "Ghost Batch - Normalization", which creates virtual sub-batches which are each - normalized separately (with shared gamma, beta, and moving statistics). - Must divide the actual batch size during execution. 
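
Of the arguments listed here, `axis` is the one that most often needs
attention when migrating: for channels-first data the features axis is 1 in
both the TF1 and TF2 layers. A sketch with the TF2 layer; the shapes are
illustrative.

```python
import tensorflow as tf

# NCHW-style input: the channel axis is 1, so normalize along axis=1.
x = tf.keras.Input(shape=(3, 32, 32))
h = tf.keras.layers.Conv2D(8, 3, data_format="channels_first")(x)
y = tf.keras.layers.BatchNormalization(axis=1)(h)
model = tf.keras.Model(x, y)
```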
- adjustment: A function taking the `Tensor` containing the (dynamic) shape of - the input tensor and returning a pair (scale, bias) to apply to the - normalized values (before gamma and beta), only during training. For - example, if axis==-1, - `adjustment = lambda shape: ( - tf.random.uniform(shape[-1:], 0.93, 1.07), - tf.random.uniform(shape[-1:], -0.1, 0.1))` will scale the normalized - value by up to 7% up or down, then shift the result by up to 0.1 - (with independent scaling and bias for each feature but shared - across all examples), and finally apply gamma and/or beta. If - `None`, no adjustment is applied. Cannot be specified if - virtual_batch_size is specified. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - References: - Batch Normalization - Accelerating Deep Network Training by Reducing - Internal Covariate Shift: - [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html) - ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf)) - Batch Renormalization - Towards Reducing Minibatch Dependence in - Batch-Normalized Models: - [Ioffe, - 2017](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models) - ([pdf](http://papers.nips.cc/paper/6790-batch-renormalization-towards-reducing-minibatch-dependence-in-batch-normalized-models.pdf)) - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.BatchNormalization`. - - The batch updating pattern with - `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used in - native TF2. Consult the `tf.keras.layers.BatchNormalization` documentation - for further information. - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. 
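
The `adjustment` hook applies its `(scale, bias)` pair after normalization
but before `gamma` and `beta`, which is exactly the reference math asserted
in `testAdjustment` further down. A NumPy sketch; the helper name and inputs
are illustrative.

```python
import numpy as np

def bn_train_adjusted(x, adj_scale, adj_bias, gamma=2.0, beta=3.0, eps=1e-3):
    # Training-mode batch norm: normalize with batch statistics, apply the
    # adjustment, then gamma and beta -- matching the test's expected value.
    mean, var = x.mean(0), x.var(0)
    normed = (x - mean) / np.sqrt(var + eps)
    return (normed * adj_scale + adj_bias) * gamma + beta

x = np.random.rand(4, 3)
y = bn_train_adjusted(x, adj_scale=np.full(3, 1.1), adj_bias=np.full(3, 0.05))
```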
- - Before: - - ```python - x_norm = tf.compat.v1.layers.batch_normalization(x) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input(shape=(28, 28, 1),) - y = tf.keras.layers.BatchNormalization()(x) - model = tf.keras.Model(x, y) - ``` - #### How to Map Arguments - - TF1 Arg Name | TF2 Arg Name | Note - :------------------------ | :------------------------ | :--------------- - `name` | `name` | Layer base class - `trainable` | `trainable` | Layer base class - `axis` | `axis` | - - `momentum` | `momentum` | - - `epsilon` | `epsilon` | - - `center` | `center` | - - `scale` | `scale` | - - `beta_initializer` | `beta_initializer` | - - `gamma_initializer` | `gamma_initializer` | - - `moving_mean_initializer` | `moving_mean_initializer` | - - `beta_regularizer` | `beta_regularizer' | - - `gamma_regularizer` | `gamma_regularizer' | - - `beta_constraint` | `beta_constraint' | - - `gamma_constraint` | `gamma_constraint' | - - `renorm` | Not supported | - - `renorm_clipping` | Not supported | - - `renorm_momentum` | Not supported | - - `fused` | Not supported | - - `virtual_batch_size` | Not supported | - - `adjustment` | Not supported | - - - @end_compatibility - """ - warnings.warn( - '`tf.layers.batch_normalization` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.BatchNormalization` instead. ' - 'In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` ' - 'should not be used (consult the `tf.keras.layers.BatchNormalization` ' - 'documentation).', - stacklevel=2) - layer = BatchNormalization( - axis=axis, - momentum=momentum, - epsilon=epsilon, - center=center, - scale=scale, - beta_initializer=beta_initializer, - gamma_initializer=gamma_initializer, - moving_mean_initializer=moving_mean_initializer, - moving_variance_initializer=moving_variance_initializer, - beta_regularizer=beta_regularizer, - gamma_regularizer=gamma_regularizer, - beta_constraint=beta_constraint, - gamma_constraint=gamma_constraint, - renorm=renorm, - renorm_clipping=renorm_clipping, - renorm_momentum=renorm_momentum, - fused=fused, - trainable=trainable, - virtual_batch_size=virtual_batch_size, - adjustment=adjustment, - name=name, - _reuse=reuse, - _scope=name) - return layer(inputs, training=training) + _reuse=reuse, + _scope=name, + ) + return layer(inputs, training=training) # Aliases diff --git a/keras/legacy_tf_layers/normalization_test.py b/keras/legacy_tf_layers/normalization_test.py index b0a55cc6a5b2..097b20b8555b 100644 --- a/keras/legacy_tf_layers/normalization_test.py +++ b/keras/legacy_tf_layers/normalization_test.py @@ -18,1422 +18,1660 @@ from __future__ import division from __future__ import print_function -import tensorflow.compat.v2 as tf - import os import numpy as np +import tensorflow.compat.v2 as tf -from tensorflow.core.protobuf import saver_pb2 -from tensorflow.python.framework import test_util as tf_test_utils from keras.legacy_tf_layers import convolutional as conv_layers from keras.legacy_tf_layers import normalization as normalization_layers +# isort: off +from tensorflow.core.protobuf import saver_pb2 +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) -@tf_test_utils.run_v1_only('b/120545219') -class BNTest(tf.test.TestCase): - def _simple_model(self, image, fused, freeze_mode): - output_channels, kernel_size = 2, 3 - conv = conv_layers.conv2d( - image, - output_channels, - kernel_size, 
- use_bias=False, - kernel_initializer=tf.compat.v1.ones_initializer()) - bn_layer = normalization_layers.BatchNormalization(fused=fused) - bn_layer._bessels_correction_test_only = False - training = not freeze_mode - bn = bn_layer(conv, training=training) - loss = tf.reduce_sum(tf.abs(bn)) - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.01) - if not freeze_mode: - update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(update_ops): - train_op = optimizer.minimize(loss) - else: - train_op = optimizer.minimize(loss) - saver = tf.compat.v1.train.Saver(write_version=saver_pb2.SaverDef.V2) - return loss, train_op, saver - - def _train(self, - checkpoint_path, - shape, - use_gpu, - is_fused, - restore=False, - freeze_mode=False, - dtype=tf.float32): - tf.compat.v1.reset_default_graph() - graph = tf.compat.v1.get_default_graph() - with self.session(graph=graph, use_gpu=use_gpu) as sess: - image = tf.compat.v1.placeholder(dtype=dtype, shape=shape) - loss, train_op, saver = self._simple_model(image, is_fused, freeze_mode) - if restore: - saver.restore(sess, checkpoint_path) - else: - self.evaluate(tf.compat.v1.global_variables_initializer()) - np.random.seed(0) - for _ in range(2): - image_val = np.random.rand(*shape).astype(dtype.as_numpy_dtype) - sess.run([loss, train_op], feed_dict={image: image_val}) - if restore: - all_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES) - all_vars_values = [var.eval() for var in all_vars] - return all_vars_values - else: - saver.save(sess, checkpoint_path) - - def _infer(self, checkpoint_path, image_val, shape, use_gpu, is_fused): - dtype = image_val.dtype - tf.compat.v1.reset_default_graph() - graph = tf.compat.v1.get_default_graph() - with self.session(graph=graph, use_gpu=use_gpu) as sess: - image = tf.compat.v1.placeholder(dtype=dtype, shape=shape) - loss, _, saver = self._simple_model(image, is_fused, True) - saver.restore(sess, checkpoint_path) - loss_val = sess.run(loss, feed_dict={image: image_val}) - return loss_val - - def _trainEvalSequence(self, dtype, train1_use_gpu, train2_use_gpu, - infer_use_gpu): - batch, height, width, input_channels = 2, 4, 5, 3 - shape = [batch, height, width, input_channels] - - # Not all characters in a dtype string representation are allowed in - # filenames in all operating systems. This map will sanitize these. 
- dtype_to_valid_fn = { - tf.float16: 'float16', - tf.float32: 'float32', - } - checkpoint = os.path.join( - self.get_temp_dir(), 'cp_%s_%s_%s_%s' % ( - dtype_to_valid_fn[dtype], train1_use_gpu, train2_use_gpu, - infer_use_gpu)) - - self._train( - checkpoint, +@tf_test_utils.run_v1_only("b/120545219") +class BNTest(tf.test.TestCase): + def _simple_model(self, image, fused, freeze_mode): + output_channels, kernel_size = 2, 3 + conv = conv_layers.conv2d( + image, + output_channels, + kernel_size, + use_bias=False, + kernel_initializer=tf.compat.v1.ones_initializer(), + ) + bn_layer = normalization_layers.BatchNormalization(fused=fused) + bn_layer._bessels_correction_test_only = False + training = not freeze_mode + bn = bn_layer(conv, training=training) + loss = tf.reduce_sum(tf.abs(bn)) + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.01) + if not freeze_mode: + update_ops = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.UPDATE_OPS + ) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(loss) + else: + train_op = optimizer.minimize(loss) + saver = tf.compat.v1.train.Saver(write_version=saver_pb2.SaverDef.V2) + return loss, train_op, saver + + def _train( + self, + checkpoint_path, shape, - use_gpu=train1_use_gpu, - is_fused=True, + use_gpu, + is_fused, restore=False, freeze_mode=False, - dtype=dtype) - - train_vars = self._train( - checkpoint, - shape, - use_gpu=train2_use_gpu, - is_fused=True, - restore=True, - freeze_mode=False, - dtype=dtype) - - np.random.seed(0) - image_val = np.random.rand(batch, height, width, input_channels).astype( - dtype.as_numpy_dtype) - loss_val = self._infer( - checkpoint, image_val, shape, use_gpu=infer_use_gpu, is_fused=True) - - return train_vars, loss_val - - def testHalfPrecision(self): - ref_vars, ref_loss = self._trainEvalSequence( dtype=tf.float32, - train1_use_gpu=True, - train2_use_gpu=True, - infer_use_gpu=True) - - self.assertEqual(len(ref_vars), 5) - - for train1_use_gpu in [True, False]: - for train2_use_gpu in [True, False]: - for infer_use_gpu in [True, False]: - test_vars, test_loss = self._trainEvalSequence( - tf.float16, train1_use_gpu, train2_use_gpu, infer_use_gpu) - self.assertEqual(len(test_vars), 5) - for test_var, ref_var in zip(test_vars, ref_vars): - self.assertAllClose(test_var, ref_var, rtol=1.e-3, atol=1.e-3) - self.assertAllClose(test_loss, ref_loss, rtol=1.e-3, atol=1.e-3) - - def _testCheckpoint(self, is_fused_checkpoint_a, is_fused_checkpoint_b, - use_gpu_checkpoint_a, use_gpu_checkpoint_b, - use_gpu_test_a, use_gpu_test_b, freeze_mode): - batch, height, width, input_channels = 2, 4, 5, 3 - shape = [batch, height, width, input_channels] - base_path = '%s_%s_%s_%s_%s_%s' % (is_fused_checkpoint_a, - is_fused_checkpoint_b, - use_gpu_checkpoint_a, - use_gpu_checkpoint_b, use_gpu_test_a, - use_gpu_test_b) - - checkpoint_path_a = os.path.join(self.get_temp_dir(), - 'checkpoint_a_%s' % base_path) - self._train( - checkpoint_path_a, - shape, - use_gpu_checkpoint_a, + ): + tf.compat.v1.reset_default_graph() + graph = tf.compat.v1.get_default_graph() + with self.session(graph=graph, use_gpu=use_gpu) as sess: + image = tf.compat.v1.placeholder(dtype=dtype, shape=shape) + loss, train_op, saver = self._simple_model( + image, is_fused, freeze_mode + ) + if restore: + saver.restore(sess, checkpoint_path) + else: + self.evaluate(tf.compat.v1.global_variables_initializer()) + np.random.seed(0) + for _ in range(2): + image_val = np.random.rand(*shape).astype(dtype.as_numpy_dtype) + sess.run([loss, 
train_op], feed_dict={image: image_val}) + if restore: + all_vars = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.GLOBAL_VARIABLES + ) + all_vars_values = [var.eval() for var in all_vars] + return all_vars_values + else: + saver.save(sess, checkpoint_path) + + def _infer(self, checkpoint_path, image_val, shape, use_gpu, is_fused): + dtype = image_val.dtype + tf.compat.v1.reset_default_graph() + graph = tf.compat.v1.get_default_graph() + with self.session(graph=graph, use_gpu=use_gpu) as sess: + image = tf.compat.v1.placeholder(dtype=dtype, shape=shape) + loss, _, saver = self._simple_model(image, is_fused, True) + saver.restore(sess, checkpoint_path) + loss_val = sess.run(loss, feed_dict={image: image_val}) + return loss_val + + def _trainEvalSequence( + self, dtype, train1_use_gpu, train2_use_gpu, infer_use_gpu + ): + batch, height, width, input_channels = 2, 4, 5, 3 + shape = [batch, height, width, input_channels] + + # Not all characters in a dtype string representation are allowed in + # filenames in all operating systems. This map will sanitize these. + dtype_to_valid_fn = { + tf.float16: "float16", + tf.float32: "float32", + } + checkpoint = os.path.join( + self.get_temp_dir(), + "cp_%s_%s_%s_%s" + % ( + dtype_to_valid_fn[dtype], + train1_use_gpu, + train2_use_gpu, + infer_use_gpu, + ), + ) + + self._train( + checkpoint, + shape, + use_gpu=train1_use_gpu, + is_fused=True, + restore=False, + freeze_mode=False, + dtype=dtype, + ) + + train_vars = self._train( + checkpoint, + shape, + use_gpu=train2_use_gpu, + is_fused=True, + restore=True, + freeze_mode=False, + dtype=dtype, + ) + + np.random.seed(0) + image_val = np.random.rand(batch, height, width, input_channels).astype( + dtype.as_numpy_dtype + ) + loss_val = self._infer( + checkpoint, image_val, shape, use_gpu=infer_use_gpu, is_fused=True + ) + + return train_vars, loss_val + + def testHalfPrecision(self): + ref_vars, ref_loss = self._trainEvalSequence( + dtype=tf.float32, + train1_use_gpu=True, + train2_use_gpu=True, + infer_use_gpu=True, + ) + + self.assertEqual(len(ref_vars), 5) + + for train1_use_gpu in [True, False]: + for train2_use_gpu in [True, False]: + for infer_use_gpu in [True, False]: + test_vars, test_loss = self._trainEvalSequence( + tf.float16, + train1_use_gpu, + train2_use_gpu, + infer_use_gpu, + ) + self.assertEqual(len(test_vars), 5) + for test_var, ref_var in zip(test_vars, ref_vars): + self.assertAllClose( + test_var, ref_var, rtol=1.0e-3, atol=1.0e-3 + ) + self.assertAllClose( + test_loss, ref_loss, rtol=1.0e-3, atol=1.0e-3 + ) + + def _testCheckpoint( + self, is_fused_checkpoint_a, - restore=False, - freeze_mode=freeze_mode) - checkpoint_path_b = os.path.join(self.get_temp_dir(), - 'checkpoint_b_%s' % base_path) - self._train( - checkpoint_path_b, - shape, - use_gpu_checkpoint_b, is_fused_checkpoint_b, - restore=False, - freeze_mode=freeze_mode) - - vars_fused = self._train( - checkpoint_path_a, - shape, + use_gpu_checkpoint_a, + use_gpu_checkpoint_b, use_gpu_test_a, - True, - restore=True, - freeze_mode=freeze_mode) - vars_nonfused = self._train( - checkpoint_path_b, - shape, use_gpu_test_b, - False, - restore=True, - freeze_mode=freeze_mode) - self.assertEqual(len(vars_fused), 5) - self.assertEqual(len(vars_nonfused), 5) - for var_fused, var_nonfused in zip(vars_fused, vars_nonfused): - self.assertAllClose(var_fused, var_nonfused, atol=1e-5) - - image_val = np.random.rand(batch, height, width, - input_channels).astype(np.float32) - loss_fused_val = self._infer(checkpoint_path_a, image_val, shape, 
- use_gpu_test_a, True) - loss_nonfused_val = self._infer(checkpoint_path_b, image_val, shape, - use_gpu_test_b, False) - self.assertAllClose(loss_fused_val, loss_nonfused_val, atol=1e-6, rtol=3e-4) - - def _testCheckpointCrossDevice(self, ckpt_a_fused, ckpt_a_use_gpu, - ckpt_b_fused, ckpt_b_use_gpu): - for use_gpu_test_a in [True, False]: - for use_gpu_test_b in [True, False]: - for freeze_mode in [True, False]: - self._testCheckpoint(ckpt_a_fused, ckpt_a_use_gpu, ckpt_b_fused, - ckpt_b_use_gpu, use_gpu_test_a, use_gpu_test_b, - freeze_mode) - - def testCheckpointFusedCPUAndFusedGPU(self): - self._testCheckpointCrossDevice(True, False, True, True) - - def testCheckpointFusedCPUAndFusedCPU(self): - self._testCheckpointCrossDevice(True, False, True, False) - - def testCheckpointFusedGPUAndFusedGPU(self): - self._testCheckpointCrossDevice(True, True, True, True) - - def testCheckpointNonFusedCPUAndNonFusedGPU(self): - self._testCheckpointCrossDevice(False, False, False, True) - - def testCheckpointNonFusedCPUAndNonFusedCPU(self): - self._testCheckpointCrossDevice(False, False, False, False) - - def testCheckpointNonFusedGPUAndNonFusedGPU(self): - self._testCheckpointCrossDevice(False, True, False, True) - - def testCheckpointNonFusedGPUAndFusedGPU(self): - self._testCheckpointCrossDevice(False, True, True, True) - - def testCheckpointNonFusedGPUAndFusedCPU(self): - self._testCheckpointCrossDevice(False, True, True, False) - - def testCheckpointNonFusedCPUAndFusedCPU(self): - self._testCheckpointCrossDevice(False, False, True, False) - - def testCreateBN(self): - # Call layer. - bn = normalization_layers.BatchNormalization(axis=1) - inputs = tf.random.uniform((5, 4, 3), seed=1) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - # Verify shape. - self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) - - # Verify layer attributes. - self.assertEqual(len(bn.updates), 2) - self.assertEqual(len(bn.variables), 4) - self.assertEqual(len(bn.trainable_variables), 2) - self.assertEqual(len(bn.non_trainable_variables), 2) - - # Test that updates were created and added to UPDATE_OPS. - self.assertEqual(len(bn.updates), 2) - self.assertListEqual( - tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS), bn.updates) - - # Test that weights were created and added to TRAINABLE_VARIABLES. - self.assertListEqual( - tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES), - bn.trainable_variables) - - def testCreateFusedBNFloat16(self): - # Call layer. - bn = normalization_layers.BatchNormalization(axis=1, fused=True) - inputs = tf.random.uniform( - (5, 4, 3, 3), seed=1, dtype=tf.float16) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - # Verify shape. - self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3, 3]) - - # Verify layer attributes. - self.assertEqual(len(bn.updates), 2) - self.assertEqual(len(bn.variables), 4) - self.assertEqual(len(bn.trainable_variables), 2) - self.assertEqual(len(bn.non_trainable_variables), 2) - for var in bn.variables: - self.assertTrue(var.dtype._is_ref_dtype) - - # Test that updates were created and added to UPDATE_OPS. - self.assertEqual(len(bn.updates), 2) - self.assertListEqual( - tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS), bn.updates) - - # Test that weights were created and added to TRAINABLE_VARIABLES. 
- self.assertListEqual( - tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES), - bn.trainable_variables) - - def test3DInputAxis1(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=1, epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 4, 1)) - np_beta = np.reshape(np_beta, (1, 4, 1)) - - for _ in range(100): - np_output, _, _ = sess.run([outputs] + bn.updates, - feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 2)) - std = np.std(np_inputs, axis=(0, 2)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def test3DInputAxis2(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=2, epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 3)) - np_beta = np.reshape(np_beta, (1, 1, 3)) - for _ in range(100): - np_output, _, _ = sess.run([outputs] + bn.updates, - feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 1)) - std = np.std(np_inputs, axis=(0, 1)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. 
- normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def test4DInputAxis1(self): - if tf.test.is_gpu_available(cuda_only=True): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=1, epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 4, 1, 1)) - np_beta = np.reshape(np_beta, (1, 4, 1, 1)) - for _ in range(100): - np_output, _, _ = sess.run( - [outputs] + bn.updates, feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 2, 3)) - std = np.std(np_inputs, axis=(0, 2, 3)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def test4DInputAxis2(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=2, epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 3, 1)) - np_beta = np.reshape(np_beta, (1, 1, 3, 1)) - for _ in range(100): - np_output, _, _ = sess.run([outputs] + bn.updates, - feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 1, 3)) - std = np.std(np_inputs, axis=(0, 1, 3)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. 
- normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def test4DInputAxis3(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=3, epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) - np_beta = np.reshape(np_beta, (1, 1, 1, 6)) - for _ in range(100): - np_output, _, _ = sess.run([outputs] + bn.updates, - feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 1, 2)) - std = np.std(np_inputs, axis=(0, 1, 2)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def test4DInputAxis3Fused(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=3, epsilon=epsilon, momentum=0.9, fused=True) - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) - np_beta = np.reshape(np_beta, (1, 1, 1, 6)) - for _ in range(100): - np_output, _, _ = sess.run( - [outputs] + bn.updates, feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 1, 2)) - std = np.std(np_inputs, axis=(0, 1, 2)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. 
- normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def test4DInputAxis1Fused(self): - if tf.test.is_gpu_available(cuda_only=True): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=1, epsilon=epsilon, momentum=0.9, fused=True) - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 4, 1, 1)) - np_beta = np.reshape(np_beta, (1, 4, 1, 1)) - for _ in range(100): - np_output, _, _ = sess.run( - [outputs] + bn.updates, feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 2, 3)) - std = np.std(np_inputs, axis=(0, 2, 3)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def testNegativeAxis(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=-1, epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) - np_beta = np.reshape(np_beta, (1, 1, 1, 6)) - for _ in range(100): - np_output, _, _ = sess.run([outputs] + bn.updates, - feed_dict={training: True}) - - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 1, 2)) - std = np.std(np_inputs, axis=(0, 1, 2)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. 
- np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def testBooleanLearningPhase(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=-1, epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32) - outputs_training = bn(inputs, training=True) - outputs_infer = bn(inputs, training=False) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) - np_beta = np.reshape(np_beta, (1, 1, 1, 6)) - for _ in range(100): - np_output, _, _ = sess.run([outputs_training] + bn.updates) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=2) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. - moving_mean, moving_var = self.evaluate( - [bn.moving_mean, bn.moving_variance]) - np_inputs = self.evaluate(inputs) - mean = np.mean(np_inputs, axis=(0, 1, 2)) - std = np.std(np_inputs, axis=(0, 1, 2)) - variance = np.square(std) - self.assertAllClose(mean, moving_mean, atol=1e-2) - self.assertAllClose(variance, moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = self.evaluate(outputs_infer) - - # Verify that the axis is normalized during inference. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def testFunctionalNoReuse(self): - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)), dtype=tf.float32) - epsilon = 1e-3 - training = tf.compat.v1.placeholder(dtype='bool') - outputs = normalization_layers.batch_norm( - inputs, - axis=-1, - momentum=0.9, - epsilon=epsilon, - training=training, - name='bn') - - updates = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) - all_vars = dict([(v.name, v) for v in tf.compat.v1.global_variables()]) - moving_mean = all_vars['bn/moving_mean:0'] - moving_variance = all_vars['bn/moving_variance:0'] - beta = all_vars['bn/beta:0'] - gamma = all_vars['bn/gamma:0'] - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - np_gamma, np_beta = self.evaluate([gamma, beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) - np_beta = np.reshape(np_beta, (1, 1, 1, 6)) - for _ in range(100): - np_output, _, _ = sess.run([outputs] + updates, - feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. 
- np_moving_mean, np_moving_var = self.evaluate( - [moving_mean, moving_variance]) - np_inputs = self.evaluate(inputs) - np_mean = np.mean(np_inputs, axis=(0, 1, 2)) - np_std = np.std(np_inputs, axis=(0, 1, 2)) - np_variance = np.square(np_std) - self.assertAllClose(np_mean, np_moving_mean, atol=1e-2) - self.assertAllClose(np_variance, np_moving_var, atol=1e-2) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def testFunctionalReuse(self): - inputs1 = tf.Variable( - np.random.random((5, 4, 3, 6)), dtype=tf.float32) - inputs2 = tf.Variable( - np.random.random((5, 4, 3, 6)), dtype=tf.float32) - epsilon = 1e-3 - training = tf.compat.v1.placeholder(dtype='bool') - _ = normalization_layers.batch_norm( - inputs1, - axis=-1, - momentum=0.9, - epsilon=epsilon, - training=training, - name='bn') - outputs2 = normalization_layers.batch_norm( - inputs2, - axis=-1, - momentum=0.9, - epsilon=epsilon, - training=training, - name='bn', - reuse=True) - - # Last 2 update ops - updates = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)[-2:] - all_vars = dict([(v.name, v) for v in tf.compat.v1.global_variables()]) - moving_mean = all_vars['bn/moving_mean:0'] - moving_variance = all_vars['bn/moving_variance:0'] - beta = all_vars['bn/beta:0'] - gamma = all_vars['bn/gamma:0'] - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(100): - np_output, _, _ = sess.run([outputs2] + updates, - feed_dict={training: True}) - - # Verify that the statistics are updated during training. - np_moving_mean, np_moving_var = self.evaluate( - [moving_mean, moving_variance]) - np_inputs = self.evaluate(inputs2) - np_mean = np.mean(np_inputs, axis=(0, 1, 2)) - np_std = np.std(np_inputs, axis=(0, 1, 2)) - np_variance = np.square(np_std) - self.assertAllClose(np_mean, np_moving_mean, atol=1e-2) - self.assertAllClose(np_variance, np_moving_var, atol=1e-2) - - # Verify that the axis is normalized during training. - np_gamma, np_beta = self.evaluate([gamma, beta]) - np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) - np_beta = np.reshape(np_beta, (1, 1, 1, 6)) - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=2) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Test inference with placeholder learning phase. - np_output = sess.run(outputs2, feed_dict={training: False}) - - # Verify that the axis is normalized during inference. 
- normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=2) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - def testFunctionalReuseFromScope(self): - inputs = tf.Variable( - np.random.random((5, 4, 3, 6)), dtype=tf.float32) - epsilon = 1e-3 - training = tf.compat.v1.placeholder(dtype='bool') - with tf.compat.v1.variable_scope('scope'): - _ = normalization_layers.batch_norm( - inputs, axis=-1, momentum=0.9, epsilon=epsilon, training=training) - self.assertEqual(len(tf.compat.v1.global_variables()), 5) - with tf.compat.v1.variable_scope('scope', reuse=True): - _ = normalization_layers.batch_norm( - inputs, axis=-1, momentum=0.9, epsilon=epsilon, training=training) - self.assertEqual(len(tf.compat.v1.global_variables()), 5) - - def testNoCenter(self): - bn = normalization_layers.BatchNormalization(axis=1, center=False) - inputs = tf.random.uniform((5, 4, 3), seed=1) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - # Verify shape. - self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) - - # Verify layer attributes. - self.assertEqual(len(bn.updates), 2) - self.assertEqual(len(bn.variables), 3) - self.assertEqual(len(bn.trainable_variables), 1) - self.assertEqual(len(bn.non_trainable_variables), 2) - - def testNoScale(self): - bn = normalization_layers.BatchNormalization(axis=1, scale=False) - inputs = tf.random.uniform((5, 4, 3), seed=1) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - # Verify shape. - self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) - - # Verify layer attributes. - self.assertEqual(len(bn.updates), 2) - self.assertEqual(len(bn.variables), 3) - self.assertEqual(len(bn.trainable_variables), 1) - self.assertEqual(len(bn.non_trainable_variables), 2) - - def testRegularizers(self): - reg = lambda x: 0.1 * tf.reduce_sum(x) - bn = normalization_layers.BatchNormalization(axis=1, beta_regularizer=reg) - inputs = tf.random.uniform((5, 4, 3), seed=1) - training = tf.compat.v1.placeholder(dtype='bool') - _ = bn(inputs, training=training) - self.assertEqual(len(bn.losses), 1) - - bn = normalization_layers.BatchNormalization(axis=1, gamma_regularizer=reg) - inputs = tf.random.uniform((5, 4, 3), seed=1) - training = tf.compat.v1.placeholder(dtype='bool') - _ = bn(inputs, training=training) - self.assertEqual(len(bn.losses), 1) - - def testConstraints(self): - g_constraint = lambda x: x / tf.reduce_sum(x) - b_constraint = lambda x: x / tf.reduce_max(x) - bn = normalization_layers.BatchNormalization(axis=1, - gamma_constraint=g_constraint, - beta_constraint=b_constraint) - inputs = tf.random.uniform((5, 4, 3), seed=1) - bn(inputs) - self.assertEqual(bn.gamma_constraint, g_constraint) - self.assertEqual(bn.beta_constraint, b_constraint) - - def testRenorm(self): - shape = (4, 3) - xt = tf.compat.v1.placeholder(tf.float32, shape) - momentum = 0.99 - renorm_momentum = 0.8 - rmax = 1.1 - rmin = 0.9 - dmax = 0.1 - gamma = 2. - beta = 3. - epsilon = 0.001 - bn = normalization_layers.BatchNormalization( - axis=1, - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - beta_initializer=tf.compat.v1.constant_initializer(beta), - epsilon=epsilon, - momentum=momentum, - renorm=True, - renorm_clipping={'rmax': rmax, 'rmin': rmin, 'dmax': dmax}, - renorm_momentum=renorm_momentum) - training = tf.compat.v1.placeholder(tf.bool) - yt = bn(xt, training=training) - - moving_mean = 0. 
- moving_stddev = 1. - renorm_mean = 0. - renorm_stddev = 1. - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(5): - x = np.random.random(shape) - - mean = x.mean(0) - variance = x.var(0) - stddev = np.sqrt(variance + epsilon) - r = (stddev / renorm_stddev).clip(rmin, rmax) - d = ((mean - renorm_mean) / renorm_stddev).clip(-dmax, dmax) - y_train = ((x - mean) / stddev * r + d) * gamma + beta - renorm_mean += (mean - renorm_mean) * (1. - renorm_momentum) - renorm_stddev += (stddev - renorm_stddev) * (1. - renorm_momentum) - moving_mean += (mean - moving_mean) * (1. - momentum) - moving_stddev += (stddev - moving_stddev) * (1. - momentum) - - y_test = ((x - moving_mean) / - (moving_stddev * moving_stddev)**0.5 * gamma) + beta - - yt_val_train, _, _ = sess.run([yt] + bn.updates, - feed_dict={xt: x, training: True}) - yt_val_test, _, _ = sess.run([yt] + bn.updates, - feed_dict={xt: x, training: False}) - - self.assertAllClose(y_train, yt_val_train, atol=1e-5) - self.assertAllClose(y_test, yt_val_test, atol=1e-5) - - def testRenormNoClippingSameMomentumGivesSameTestTrain(self): - shape = (4, 3) - xt = tf.compat.v1.placeholder(tf.float32, shape) - momentum = 0.9 - renorm_momentum = 0.9 - gamma = 2. - beta = 3. - epsilon = 0.001 - bn = normalization_layers.BatchNormalization( - axis=1, - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - beta_initializer=tf.compat.v1.constant_initializer(beta), - epsilon=epsilon, - momentum=momentum, - renorm=True, - renorm_clipping=None, - renorm_momentum=momentum) - training = tf.compat.v1.placeholder(tf.bool) - yt = bn(xt, training=training) - moving_mean = 0. - moving_stddev = 1. - renorm_mean = 0. - renorm_stddev = 1. - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for step in range(6): - x = np.random.random(shape) - - mean = x.mean(0) - variance = x.var(0) - stddev = np.sqrt(variance + epsilon) - r = (stddev / renorm_stddev) - d = ((mean - renorm_mean) / renorm_stddev) - y_test = ((x - moving_mean) / - (moving_stddev * moving_stddev)**0.5 * gamma) + beta - y_train = ((x - mean) / stddev * r + d) * gamma + beta - renorm_mean += (mean - renorm_mean) * (1. - renorm_momentum) - renorm_stddev += (stddev - renorm_stddev) * (1. - renorm_momentum) - moving_mean += (mean - moving_mean) * (1. - momentum) - moving_stddev += (stddev - moving_stddev) * (1. - momentum) - - # Compute test values first, before the train mode updates the moving - # averages. - yt_val_test, _, _ = sess.run([yt] + bn.updates, - feed_dict={xt: x, training: False}) - yt_val_train, _, _ = sess.run([yt] + bn.updates, - feed_dict={xt: x, training: True}) - - # Due to initialization inconsistencies, values may not be identical - # on the first iteration (but shouldn't be different by much more than - # epsilon). After the first iteration they should be identical. - atol = epsilon * 1.5 if step == 0 else 1e-5 - self.assertAllClose(y_train, yt_val_train, atol=atol) - self.assertAllClose(y_test, yt_val_test, atol=atol) - self.assertAllClose(yt_val_train, yt_val_test, atol=atol) - - def testAdjustment(self): - shape = (4, 3) - xt = tf.compat.v1.placeholder(tf.float32, shape) - momentum = 0.99 - gamma = 2. - beta = 3. 
- epsilon = 0.001 - adjust_scale = tf.random.uniform(shape[-1:], 0.5, 1.5) - adjust_bias = tf.random.uniform(shape[-1:], -.2, .2) - bn = normalization_layers.BatchNormalization( - axis=1, - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - beta_initializer=tf.compat.v1.constant_initializer(beta), - epsilon=epsilon, - momentum=momentum, - adjustment=lambda _: (adjust_scale, adjust_bias)) - training = tf.compat.v1.placeholder(tf.bool) - yt = bn(xt, training=training) - - moving_mean = 0. - moving_variance = 1. - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(5): - x = np.random.random(shape) - yt_val_train, adj_scale_val, adj_bias_val = sess.run( - [yt, adjust_scale, adjust_bias] + bn.updates, - feed_dict={xt: x, training: True})[:3] - yt_val_test = sess.run([yt] + bn.updates, - feed_dict={xt: x, training: False})[0] - - mean = x.mean(0) - variance = x.var(0) - y_train = (((x - mean) / (variance + epsilon) ** 0.5) * adj_scale_val + - adj_bias_val) * gamma + beta - moving_mean += (mean - moving_mean) * (1. - momentum) - moving_variance += (variance - moving_variance) * (1. - momentum) - - y_test = ((x - moving_mean) / (moving_variance + epsilon) ** 0.5 * - gamma) + beta - - self.assertAllClose(y_train, yt_val_train, atol=1e-5) - self.assertAllClose(y_test, yt_val_test, atol=1e-5) - - def testRenormWithAdjustment(self): - shape = (4, 3) - xt = tf.compat.v1.placeholder(tf.float32, shape) - momentum = 0.99 - renorm_momentum = 0.8 - rmax = 1.1 - rmin = 0.9 - dmax = 0.1 - gamma = 2. - beta = 3. - epsilon = 0.001 - adjust_scale = tf.random.uniform(shape[-1:], 0.5, 1.5) - adjust_bias = tf.random.uniform(shape[-1:], -.2, .2) - bn = normalization_layers.BatchNormalization( - axis=1, - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - beta_initializer=tf.compat.v1.constant_initializer(beta), - epsilon=epsilon, - momentum=momentum, - renorm=True, - renorm_clipping={'rmax': rmax, 'rmin': rmin, 'dmax': dmax}, - renorm_momentum=renorm_momentum, - adjustment=lambda _: (adjust_scale, adjust_bias)) - training = tf.compat.v1.placeholder(tf.bool) - yt = bn(xt, training=training) - - moving_mean = 0. - moving_stddev = 1. - renorm_mean = 0. - renorm_stddev = 1. - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(5): - x = np.random.random(shape) - yt_val_train, adj_scale_val, adj_bias_val = sess.run( - [yt, adjust_scale, adjust_bias] + bn.updates, - feed_dict={xt: x, training: True})[:3] - yt_val_test = sess.run([yt] + bn.updates, - feed_dict={xt: x, training: False})[0] - - mean = x.mean(0) - variance = x.var(0) - stddev = np.sqrt(variance + epsilon) - r = (stddev / renorm_stddev).clip(rmin, rmax) - d = ((mean - renorm_mean) / renorm_stddev).clip(-dmax, dmax) - y_train = (((x - mean) / stddev * r + d) * adj_scale_val + - adj_bias_val) * gamma + beta - renorm_mean += (mean - renorm_mean) * (1. - renorm_momentum) - renorm_stddev += (stddev - renorm_stddev) * (1. - renorm_momentum) - moving_mean += (mean - moving_mean) * (1. - momentum) - moving_stddev += (stddev - moving_stddev) * (1. 
- momentum) - - y_test = ((x - moving_mean) / - (moving_stddev * moving_stddev)**0.5 * gamma) + beta - - self.assertAllClose(y_train, yt_val_train, atol=1e-5) - self.assertAllClose(y_test, yt_val_test, atol=1e-5) - - def testGhostBNNegativeVirtualBatch(self): - shape = [6, 5, 4, 3] - inp = tf.random.uniform(shape, seed=1) - - with self.assertRaises(ValueError): - normalization_layers.batch_normalization( - inp, virtual_batch_size=-1) - - def testGhostBNVirtualBatchFull(self): - shape = [6, 5, 4, 3] - inp = tf.random.uniform(shape, seed=1) - out1 = normalization_layers.batch_normalization(inp) - out2 = normalization_layers.batch_normalization( - inp, virtual_batch_size=6) - - self.assertListEqual( - out1.shape.as_list(), out2.shape.as_list()) - - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - - x = np.random.random(shape) - y1, y2 = sess.run([out1, out2], feed_dict={inp: x}) - - self.assertAllClose(y1, y2, atol=1e-5) - - def testGhostBNInputOutputShapesMatch(self): - shape = [6, 4, 3] - inp = tf.random.uniform(shape, seed=1) - out = normalization_layers.batch_normalization( - inp, virtual_batch_size=3) - self.assertListEqual(out.shape.as_list(), shape) - - def testGhostBNUnknownBatchSize(self): - np_shape = [10, 5, 4] - tf_shape = [None, 5, 4] - inp = tf.compat.v1.placeholder(tf.float32, tf_shape) - out = normalization_layers.batch_normalization( - inp, virtual_batch_size=2) - - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - - x = np.random.random(np_shape) - y = sess.run(out, feed_dict={inp: x}) - - self.assertListEqual(list(y.shape), np_shape) - - def testGhostBN2Dims(self): - shape = [6, 2] - virtual_batch_size = 3 - beta = 2. - gamma = 3. - momentum = 0.8 - epsilon = 1e-3 - moving_means = np.zeros([2, 2], dtype=np.float32) - moving_vars = np.ones([2, 2], dtype=np.float32) - - inp = tf.compat.v1.placeholder(tf.float32, shape) - is_training = tf.compat.v1.placeholder(tf.bool) - bn = normalization_layers.BatchNormalization( - momentum=momentum, - epsilon=epsilon, - beta_initializer=tf.compat.v1.constant_initializer(beta), - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - virtual_batch_size=virtual_batch_size) - out = bn(inp, training=is_training) - ghost_shape = ([virtual_batch_size, - shape[0] // virtual_batch_size, - shape[1]]) - - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(5): - x = np.random.random(shape) - - sub_batched = np.reshape(x, ghost_shape) - means = np.mean(sub_batched, axis=0, keepdims=True) - variances = np.var(sub_batched, axis=0, keepdims=True) - - avg_means = np.mean(means, axis=1, keepdims=True) - avg_variances = np.mean(variances, axis=1, keepdims=True) - - moving_means = moving_means * momentum + avg_means * (1. - momentum) - moving_vars = moving_vars * momentum + avg_variances * (1. 
- momentum) - - y_train = ((sub_batched - means) / - (variances + epsilon) ** 0.5 * gamma) + beta - y_test = ((sub_batched - moving_means) / - (moving_vars + epsilon) ** 0.5 * gamma) + beta - - y_train = np.reshape(y_train, shape) - y_test = np.reshape(y_test, shape) - - y_val_train, _, _ = sess.run([out] + bn.updates, - feed_dict={inp: x, is_training: True}) - y_val_test = sess.run(out, feed_dict={inp: x, is_training: False}) - - self.assertAllClose(y_train, y_val_train, atol=1e-5) - self.assertAllClose(y_test, y_val_test, atol=1e-5) - - def testGhostBN4DimsAxis3(self): - shape = [6, 10, 10, 3] - virtual_batch_size = 2 - beta = 2. - gamma = 3. - momentum = 0.8 - epsilon = 1e-3 - moving_means = np.zeros([1, 1, 1, 1, 3], dtype=np.float32) - moving_vars = np.ones([1, 1, 1, 1, 3], dtype=np.float32) - - inp = tf.compat.v1.placeholder(tf.float32, shape) - is_training = tf.compat.v1.placeholder(tf.bool) - bn = normalization_layers.BatchNormalization( - axis=3, - momentum=momentum, - epsilon=epsilon, - beta_initializer=tf.compat.v1.constant_initializer(beta), - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - virtual_batch_size=virtual_batch_size) - out = bn(inp, training=is_training) - ghost_shape = ([virtual_batch_size, shape[0] // virtual_batch_size] + - shape[1:]) - - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(5): - x = np.random.random(shape) - - sub_batched = np.reshape(x, ghost_shape) - means = np.mean(sub_batched, axis=(0, 2, 3), keepdims=True) - variances = np.var(sub_batched, axis=(0, 2, 3), keepdims=True) - - avg_means = np.mean(means, axis=1, keepdims=True) - avg_variances = np.mean(variances, axis=1, keepdims=True) - - moving_means = moving_means * momentum + avg_means * (1. - momentum) - moving_vars = moving_vars * momentum + avg_variances * (1. - momentum) - - y_train = ((sub_batched - means) / - (variances + epsilon) ** 0.5 * gamma) + beta - y_test = ((sub_batched - moving_means) / - (moving_vars + epsilon) ** 0.5 * gamma) + beta - - y_train = np.reshape(y_train, shape) - y_test = np.reshape(y_test, shape) - - y_val_train, _, _ = sess.run([out] + bn.updates, - feed_dict={inp: x, is_training: True}) - y_val_test = sess.run(out, feed_dict={inp: x, is_training: False}) - - self.assertAllClose(y_train, y_val_train, atol=1e-2) - self.assertAllClose(y_test, y_val_test, atol=1e-2) - - def testGhostBN4DimsAxis1(self): - shape = [6, 3, 10, 10] - virtual_batch_size = 2 - beta = 2. - gamma = 3. 
- momentum = 0.8 - epsilon = 1e-3 - moving_means = np.zeros([1, 1, 3, 1, 1], dtype=np.float32) - moving_vars = np.ones([1, 1, 3, 1, 1], dtype=np.float32) - - inp = tf.compat.v1.placeholder(tf.float32, shape) - is_training = tf.compat.v1.placeholder(tf.bool) - bn = normalization_layers.BatchNormalization( - axis=1, - momentum=momentum, - epsilon=epsilon, - beta_initializer=tf.compat.v1.constant_initializer(beta), - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - virtual_batch_size=virtual_batch_size, - fused=False) # NCHW is unsupported by CPU fused batch norm - out = bn(inp, training=is_training) - ghost_shape = ([virtual_batch_size, shape[0] // virtual_batch_size] + - shape[1:]) - - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(5): - x = np.random.random(shape) - - sub_batched = np.reshape(x, ghost_shape) - means = np.mean(sub_batched, axis=(0, 3, 4), keepdims=True) - variances = np.var(sub_batched, axis=(0, 3, 4), keepdims=True) - - avg_means = np.mean(means, axis=1, keepdims=True) - avg_variances = np.mean(variances, axis=1, keepdims=True) - - moving_means = moving_means * momentum + avg_means * (1. - momentum) - moving_vars = moving_vars * momentum + avg_variances * (1. - momentum) - - y_train = ((sub_batched - means) / - (variances + epsilon) ** 0.5 * gamma) + beta - y_test = ((sub_batched - moving_means) / - (moving_vars + epsilon) ** 0.5 * gamma) + beta - - y_train = np.reshape(y_train, shape) - y_test = np.reshape(y_test, shape) - - y_val_train, _, _ = sess.run([out] + bn.updates, - feed_dict={inp: x, is_training: True}) - y_val_test = sess.run(out, feed_dict={inp: x, is_training: False}) - - self.assertAllClose(y_train, y_val_train, atol=1e-2) - self.assertAllClose(y_test, y_val_test, atol=1e-2) - - def testMultiAxisInvalid(self): - shape = [6, 5, 4, 3] - inp = tf.random.uniform(shape, seed=1) - - with self.assertRaises(ValueError): - normalization_layers.batch_normalization( - inp, axis=[1, 4]) # out of bounds - - with self.assertRaises(ValueError): - normalization_layers.batch_normalization( - inp, axis=[-5, 1]) # out of bounds - - with self.assertRaises(ValueError): - normalization_layers.batch_normalization( - inp, axis=[1, 2, 1]) # duplicate - - def test3DInputMultiAxis12(self): - epsilon = 1e-3 - bn = normalization_layers.BatchNormalization( - axis=[1, 2], epsilon=epsilon, momentum=0.9) - inputs = tf.Variable( - np.random.random((5, 4, 3)) + 100, dtype=tf.float32) - training = tf.compat.v1.placeholder(dtype='bool') - outputs = bn(inputs, training=training) - - with self.cached_session() as sess: - # Test training with placeholder learning phase. - self.evaluate(tf.compat.v1.global_variables_initializer()) - - np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) - - for _ in range(100): - np_output, _, _ = sess.run([outputs] + bn.updates, - feed_dict={training: True}) - # Verify that the axis is normalized during training. - normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta - self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1) - self.assertAlmostEqual(np.std(normed_np_output), 1., places=1) - - # Verify that the statistics are updated during training. 
-      moving_mean, moving_var = self.evaluate(
-          [bn.moving_mean, bn.moving_variance])
-      np_inputs = self.evaluate(inputs)
-      mean = np.mean(np_inputs, axis=0, keepdims=True)
-      std = np.std(np_inputs, axis=0, keepdims=True)
-      variance = np.square(std)
-      self.assertAllClose(mean, moving_mean, atol=1e-2)
-      self.assertAllClose(variance, moving_var, atol=1e-2)
-
-      # Test inference with placeholder learning phase.
-      np_output = sess.run(outputs, feed_dict={training: False})
-
-      # Verify that the axis is normalized during inference.
-      normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
-      self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
-      self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
-
-  def test5DInputMultiAxis123(self):
-    epsilon = 1e-3
-    bn = normalization_layers.BatchNormalization(
-        axis=[1, 2, 3], epsilon=epsilon, momentum=0.9)
-    inputs = tf.Variable(
-        np.random.random((5, 3, 4, 4, 3)) + 100, dtype=tf.float32)
-    training = tf.compat.v1.placeholder(dtype='bool')
-    outputs = bn(inputs, training=training)
-
-    with self.cached_session() as sess:
-      # Test training with placeholder learning phase.
-      self.evaluate(tf.compat.v1.global_variables_initializer())
-
-      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
-
-      for _ in range(100):
-        np_output, _, _ = sess.run([outputs] + bn.updates,
-                                   feed_dict={training: True})
-      # Verify that the axis is normalized during training.
-      normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
-      self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
-      self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
-
-      # Verify that the statistics are updated during training.
-      moving_mean, moving_var = self.evaluate(
-          [bn.moving_mean, bn.moving_variance])
-      np_inputs = self.evaluate(inputs)
-      mean = np.mean(np_inputs, axis=(0, 4), keepdims=True)
-      std = np.std(np_inputs, axis=(0, 4), keepdims=True)
-      variance = np.square(std)
-      self.assertAllClose(mean, moving_mean, atol=1e-2)
-      self.assertAllClose(variance, moving_var, atol=1e-2)
-
-      # Test inference with placeholder learning phase.
-      np_output = sess.run(outputs, feed_dict={training: False})
-
-      # Verify that the axis is normalized during inference.
-      normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
-      self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
-      self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
-
-  def testGhostBN5DimsMultiAxis14(self):
-    shape = [6, 3, 10, 10, 4]
-    virtual_batch_size = 3
-    beta = 2.
-    gamma = 3.
- momentum = 0.8 - epsilon = 1e-3 - moving_means = np.zeros([1, 1, 3, 1, 1, 4], dtype=np.float32) - moving_vars = np.ones([1, 1, 3, 1, 1, 4], dtype=np.float32) - - inp = tf.compat.v1.placeholder(tf.float32, shape) - is_training = tf.compat.v1.placeholder(tf.bool) - bn = normalization_layers.BatchNormalization( - axis=[1, 4], - momentum=momentum, - epsilon=epsilon, - beta_initializer=tf.compat.v1.constant_initializer(beta), - gamma_initializer=tf.compat.v1.constant_initializer(gamma), - virtual_batch_size=virtual_batch_size, - fused=False) - out = bn(inp, training=is_training) - ghost_shape = ([virtual_batch_size, shape[0] // virtual_batch_size] + - shape[1:]) - - with self.session() as sess: - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(5): - x = np.random.random(shape) - - sub_batched = np.reshape(x, ghost_shape) - means = np.mean(sub_batched, axis=(0, 3, 4), keepdims=True) - variances = np.var(sub_batched, axis=(0, 3, 4), keepdims=True) - - avg_means = np.mean(means, axis=1, keepdims=True) - avg_variances = np.mean(variances, axis=1, keepdims=True) - - moving_means = moving_means * momentum + avg_means * (1. - momentum) - moving_vars = moving_vars * momentum + avg_variances * (1. - momentum) - - y_train = ((sub_batched - means) / - (variances + epsilon) ** 0.5 * gamma) + beta - y_test = ((sub_batched - moving_means) / - (moving_vars + epsilon) ** 0.5 * gamma) + beta - - y_train = np.reshape(y_train, shape) - y_test = np.reshape(y_test, shape) - - y_val_train, _, _ = sess.run([out] + bn.updates, - feed_dict={inp: x, is_training: True}) - y_val_test = sess.run(out, feed_dict={inp: x, is_training: False}) - - self.assertAllClose(y_train, y_val_train, atol=1e-2) - self.assertAllClose(y_test, y_val_test, atol=1e-2) - - -if __name__ == '__main__': - tf.test.main() + freeze_mode, + ): + batch, height, width, input_channels = 2, 4, 5, 3 + shape = [batch, height, width, input_channels] + base_path = "%s_%s_%s_%s_%s_%s" % ( + is_fused_checkpoint_a, + is_fused_checkpoint_b, + use_gpu_checkpoint_a, + use_gpu_checkpoint_b, + use_gpu_test_a, + use_gpu_test_b, + ) + + checkpoint_path_a = os.path.join( + self.get_temp_dir(), f"checkpoint_a_{base_path}" + ) + self._train( + checkpoint_path_a, + shape, + use_gpu_checkpoint_a, + is_fused_checkpoint_a, + restore=False, + freeze_mode=freeze_mode, + ) + checkpoint_path_b = os.path.join( + self.get_temp_dir(), f"checkpoint_b_{base_path}" + ) + self._train( + checkpoint_path_b, + shape, + use_gpu_checkpoint_b, + is_fused_checkpoint_b, + restore=False, + freeze_mode=freeze_mode, + ) + + vars_fused = self._train( + checkpoint_path_a, + shape, + use_gpu_test_a, + True, + restore=True, + freeze_mode=freeze_mode, + ) + vars_nonfused = self._train( + checkpoint_path_b, + shape, + use_gpu_test_b, + False, + restore=True, + freeze_mode=freeze_mode, + ) + self.assertEqual(len(vars_fused), 5) + self.assertEqual(len(vars_nonfused), 5) + for var_fused, var_nonfused in zip(vars_fused, vars_nonfused): + self.assertAllClose(var_fused, var_nonfused, atol=1e-5) + + image_val = np.random.rand(batch, height, width, input_channels).astype( + np.float32 + ) + loss_fused_val = self._infer( + checkpoint_path_a, image_val, shape, use_gpu_test_a, True + ) + loss_nonfused_val = self._infer( + checkpoint_path_b, image_val, shape, use_gpu_test_b, False + ) + self.assertAllClose( + loss_fused_val, loss_nonfused_val, atol=1e-6, rtol=3e-4 + ) + + def _testCheckpointCrossDevice( + self, ckpt_a_fused, ckpt_a_use_gpu, ckpt_b_fused, ckpt_b_use_gpu + ): + for 
use_gpu_test_a in [True, False]: + for use_gpu_test_b in [True, False]: + for freeze_mode in [True, False]: + self._testCheckpoint( + ckpt_a_fused, + ckpt_a_use_gpu, + ckpt_b_fused, + ckpt_b_use_gpu, + use_gpu_test_a, + use_gpu_test_b, + freeze_mode, + ) + + def testCheckpointFusedCPUAndFusedGPU(self): + self._testCheckpointCrossDevice(True, False, True, True) + + def testCheckpointFusedCPUAndFusedCPU(self): + self._testCheckpointCrossDevice(True, False, True, False) + + def testCheckpointFusedGPUAndFusedGPU(self): + self._testCheckpointCrossDevice(True, True, True, True) + + def testCheckpointNonFusedCPUAndNonFusedGPU(self): + self._testCheckpointCrossDevice(False, False, False, True) + + def testCheckpointNonFusedCPUAndNonFusedCPU(self): + self._testCheckpointCrossDevice(False, False, False, False) + + def testCheckpointNonFusedGPUAndNonFusedGPU(self): + self._testCheckpointCrossDevice(False, True, False, True) + + def testCheckpointNonFusedGPUAndFusedGPU(self): + self._testCheckpointCrossDevice(False, True, True, True) + + def testCheckpointNonFusedGPUAndFusedCPU(self): + self._testCheckpointCrossDevice(False, True, True, False) + + def testCheckpointNonFusedCPUAndFusedCPU(self): + self._testCheckpointCrossDevice(False, False, True, False) + + def testCreateBN(self): + # Call layer. + bn = normalization_layers.BatchNormalization(axis=1) + inputs = tf.random.uniform((5, 4, 3), seed=1) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + # Verify shape. + self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) + + # Verify layer attributes. + self.assertEqual(len(bn.updates), 2) + self.assertEqual(len(bn.variables), 4) + self.assertEqual(len(bn.trainable_variables), 2) + self.assertEqual(len(bn.non_trainable_variables), 2) + + # Test that updates were created and added to UPDATE_OPS. + self.assertEqual(len(bn.updates), 2) + self.assertListEqual( + tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS), + bn.updates, + ) + + # Test that weights were created and added to TRAINABLE_VARIABLES. + self.assertListEqual( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ), + bn.trainable_variables, + ) + + def testCreateFusedBNFloat16(self): + # Call layer. + bn = normalization_layers.BatchNormalization(axis=1, fused=True) + inputs = tf.random.uniform((5, 4, 3, 3), seed=1, dtype=tf.float16) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + # Verify shape. + self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3, 3]) + + # Verify layer attributes. + self.assertEqual(len(bn.updates), 2) + self.assertEqual(len(bn.variables), 4) + self.assertEqual(len(bn.trainable_variables), 2) + self.assertEqual(len(bn.non_trainable_variables), 2) + for var in bn.variables: + self.assertTrue(var.dtype._is_ref_dtype) + + # Test that updates were created and added to UPDATE_OPS. + self.assertEqual(len(bn.updates), 2) + self.assertListEqual( + tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS), + bn.updates, + ) + + # Test that weights were created and added to TRAINABLE_VARIABLES. 
+ self.assertListEqual( + tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES + ), + bn.trainable_variables, + ) + + def test3DInputAxis1(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=1, epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 4, 1)) + np_beta = np.reshape(np_beta, (1, 4, 1)) + + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 2)) + std = np.std(np_inputs, axis=(0, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def test3DInputAxis2(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=2, epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 3)) + np_beta = np.reshape(np_beta, (1, 1, 3)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 1)) + std = np.std(np_inputs, axis=(0, 1)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. 
+ normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def test4DInputAxis1(self): + if tf.test.is_gpu_available(cuda_only=True): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=1, epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 4, 1, 1)) + np_beta = np.reshape(np_beta, (1, 4, 1, 1)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ( + (np_output - epsilon) * np_gamma + ) + np_beta + self.assertAlmostEqual( + np.mean(normed_np_output), 0.0, places=1 + ) + self.assertAlmostEqual( + np.std(normed_np_output), 1.0, places=1 + ) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 2, 3)) + std = np.std(np_inputs, axis=(0, 2, 3)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def test4DInputAxis2(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=2, epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 3, 1)) + np_beta = np.reshape(np_beta, (1, 1, 3, 1)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 3)) + std = np.std(np_inputs, axis=(0, 1, 3)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. 
+ np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def test4DInputAxis3(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=3, epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 2)) + std = np.std(np_inputs, axis=(0, 1, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def test4DInputAxis3Fused(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=3, epsilon=epsilon, momentum=0.9, fused=True + ) + inputs = tf.Variable( + np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 2)) + std = np.std(np_inputs, axis=(0, 1, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. 
+ np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def test4DInputAxis1Fused(self): + if tf.test.is_gpu_available(cuda_only=True): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=1, epsilon=epsilon, momentum=0.9, fused=True + ) + inputs = tf.Variable( + np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 4, 1, 1)) + np_beta = np.reshape(np_beta, (1, 4, 1, 1)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ( + (np_output - epsilon) * np_gamma + ) + np_beta + self.assertAlmostEqual( + np.mean(normed_np_output), 0.0, places=1 + ) + self.assertAlmostEqual( + np.std(normed_np_output), 1.0, places=1 + ) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 2, 3)) + std = np.std(np_inputs, axis=(0, 2, 3)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def testNegativeAxis(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=-1, epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. 
+ moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 2)) + std = np.std(np_inputs, axis=(0, 1, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def testBooleanLearningPhase(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=-1, epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3, 6)) + 100, dtype=tf.float32 + ) + outputs_training = bn(inputs, training=True) + outputs_infer = bn(inputs, training=False) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + for _ in range(100): + np_output, _, _ = sess.run([outputs_training] + bn.updates) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=2) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 1, 2)) + std = np.std(np_inputs, axis=(0, 1, 2)) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = self.evaluate(outputs_infer) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def testFunctionalNoReuse(self): + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + epsilon = 1e-3 + training = tf.compat.v1.placeholder(dtype="bool") + outputs = normalization_layers.batch_norm( + inputs, + axis=-1, + momentum=0.9, + epsilon=epsilon, + training=training, + name="bn", + ) + + updates = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) + all_vars = {v.name: v for v in tf.compat.v1.global_variables()} + moving_mean = all_vars["bn/moving_mean:0"] + moving_variance = all_vars["bn/moving_variance:0"] + beta = all_vars["bn/beta:0"] + gamma = all_vars["bn/gamma:0"] + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + np_gamma, np_beta = self.evaluate([gamma, beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. 
+ normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + np_moving_mean, np_moving_var = self.evaluate( + [moving_mean, moving_variance] + ) + np_inputs = self.evaluate(inputs) + np_mean = np.mean(np_inputs, axis=(0, 1, 2)) + np_std = np.std(np_inputs, axis=(0, 1, 2)) + np_variance = np.square(np_std) + self.assertAllClose(np_mean, np_moving_mean, atol=1e-2) + self.assertAllClose(np_variance, np_moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def testFunctionalReuse(self): + inputs1 = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + inputs2 = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + epsilon = 1e-3 + training = tf.compat.v1.placeholder(dtype="bool") + _ = normalization_layers.batch_norm( + inputs1, + axis=-1, + momentum=0.9, + epsilon=epsilon, + training=training, + name="bn", + ) + outputs2 = normalization_layers.batch_norm( + inputs2, + axis=-1, + momentum=0.9, + epsilon=epsilon, + training=training, + name="bn", + reuse=True, + ) + + # Last 2 update ops + updates = tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.UPDATE_OPS + )[-2:] + all_vars = {v.name: v for v in tf.compat.v1.global_variables()} + moving_mean = all_vars["bn/moving_mean:0"] + moving_variance = all_vars["bn/moving_variance:0"] + beta = all_vars["bn/beta:0"] + gamma = all_vars["bn/gamma:0"] + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(100): + np_output, _, _ = sess.run( + [outputs2] + updates, feed_dict={training: True} + ) + + # Verify that the statistics are updated during training. + np_moving_mean, np_moving_var = self.evaluate( + [moving_mean, moving_variance] + ) + np_inputs = self.evaluate(inputs2) + np_mean = np.mean(np_inputs, axis=(0, 1, 2)) + np_std = np.std(np_inputs, axis=(0, 1, 2)) + np_variance = np.square(np_std) + self.assertAllClose(np_mean, np_moving_mean, atol=1e-2) + self.assertAllClose(np_variance, np_moving_var, atol=1e-2) + + # Verify that the axis is normalized during training. + np_gamma, np_beta = self.evaluate([gamma, beta]) + np_gamma = np.reshape(np_gamma, (1, 1, 1, 6)) + np_beta = np.reshape(np_beta, (1, 1, 1, 6)) + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=2) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs2, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. 
+ normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=2) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def testFunctionalReuseFromScope(self): + inputs = tf.Variable(np.random.random((5, 4, 3, 6)), dtype=tf.float32) + epsilon = 1e-3 + training = tf.compat.v1.placeholder(dtype="bool") + with tf.compat.v1.variable_scope("scope"): + _ = normalization_layers.batch_norm( + inputs, + axis=-1, + momentum=0.9, + epsilon=epsilon, + training=training, + ) + self.assertEqual(len(tf.compat.v1.global_variables()), 5) + with tf.compat.v1.variable_scope("scope", reuse=True): + _ = normalization_layers.batch_norm( + inputs, + axis=-1, + momentum=0.9, + epsilon=epsilon, + training=training, + ) + self.assertEqual(len(tf.compat.v1.global_variables()), 5) + + def testNoCenter(self): + bn = normalization_layers.BatchNormalization(axis=1, center=False) + inputs = tf.random.uniform((5, 4, 3), seed=1) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + # Verify shape. + self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) + + # Verify layer attributes. + self.assertEqual(len(bn.updates), 2) + self.assertEqual(len(bn.variables), 3) + self.assertEqual(len(bn.trainable_variables), 1) + self.assertEqual(len(bn.non_trainable_variables), 2) + + def testNoScale(self): + bn = normalization_layers.BatchNormalization(axis=1, scale=False) + inputs = tf.random.uniform((5, 4, 3), seed=1) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + # Verify shape. + self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3]) + + # Verify layer attributes. + self.assertEqual(len(bn.updates), 2) + self.assertEqual(len(bn.variables), 3) + self.assertEqual(len(bn.trainable_variables), 1) + self.assertEqual(len(bn.non_trainable_variables), 2) + + def testRegularizers(self): + reg = lambda x: 0.1 * tf.reduce_sum(x) + bn = normalization_layers.BatchNormalization( + axis=1, beta_regularizer=reg + ) + inputs = tf.random.uniform((5, 4, 3), seed=1) + training = tf.compat.v1.placeholder(dtype="bool") + _ = bn(inputs, training=training) + self.assertEqual(len(bn.losses), 1) + + bn = normalization_layers.BatchNormalization( + axis=1, gamma_regularizer=reg + ) + inputs = tf.random.uniform((5, 4, 3), seed=1) + training = tf.compat.v1.placeholder(dtype="bool") + _ = bn(inputs, training=training) + self.assertEqual(len(bn.losses), 1) + + def testConstraints(self): + g_constraint = lambda x: x / tf.reduce_sum(x) + b_constraint = lambda x: x / tf.reduce_max(x) + bn = normalization_layers.BatchNormalization( + axis=1, gamma_constraint=g_constraint, beta_constraint=b_constraint + ) + inputs = tf.random.uniform((5, 4, 3), seed=1) + bn(inputs) + self.assertEqual(bn.gamma_constraint, g_constraint) + self.assertEqual(bn.beta_constraint, b_constraint) + + def testRenorm(self): + shape = (4, 3) + xt = tf.compat.v1.placeholder(tf.float32, shape) + momentum = 0.99 + renorm_momentum = 0.8 + rmax = 1.1 + rmin = 0.9 + dmax = 0.1 + gamma = 2.0 + beta = 3.0 + epsilon = 0.001 + bn = normalization_layers.BatchNormalization( + axis=1, + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + beta_initializer=tf.compat.v1.constant_initializer(beta), + epsilon=epsilon, + momentum=momentum, + renorm=True, + renorm_clipping={"rmax": rmax, "rmin": rmin, "dmax": dmax}, + renorm_momentum=renorm_momentum, + ) + training = tf.compat.v1.placeholder(tf.bool) + yt = bn(xt, 
training=training) + + moving_mean = 0.0 + moving_stddev = 1.0 + renorm_mean = 0.0 + renorm_stddev = 1.0 + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(5): + x = np.random.random(shape) + + mean = x.mean(0) + variance = x.var(0) + stddev = np.sqrt(variance + epsilon) + r = (stddev / renorm_stddev).clip(rmin, rmax) + d = ((mean - renorm_mean) / renorm_stddev).clip(-dmax, dmax) + y_train = ((x - mean) / stddev * r + d) * gamma + beta + renorm_mean += (mean - renorm_mean) * (1.0 - renorm_momentum) + renorm_stddev += (stddev - renorm_stddev) * ( + 1.0 - renorm_momentum + ) + moving_mean += (mean - moving_mean) * (1.0 - momentum) + moving_stddev += (stddev - moving_stddev) * (1.0 - momentum) + + y_test = ( + (x - moving_mean) + / (moving_stddev * moving_stddev) ** 0.5 + * gamma + ) + beta + + yt_val_train, _, _ = sess.run( + [yt] + bn.updates, feed_dict={xt: x, training: True} + ) + yt_val_test, _, _ = sess.run( + [yt] + bn.updates, feed_dict={xt: x, training: False} + ) + + self.assertAllClose(y_train, yt_val_train, atol=1e-5) + self.assertAllClose(y_test, yt_val_test, atol=1e-5) + + def testRenormNoClippingSameMomentumGivesSameTestTrain(self): + shape = (4, 3) + xt = tf.compat.v1.placeholder(tf.float32, shape) + momentum = 0.9 + renorm_momentum = 0.9 + gamma = 2.0 + beta = 3.0 + epsilon = 0.001 + bn = normalization_layers.BatchNormalization( + axis=1, + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + beta_initializer=tf.compat.v1.constant_initializer(beta), + epsilon=epsilon, + momentum=momentum, + renorm=True, + renorm_clipping=None, + renorm_momentum=momentum, + ) + training = tf.compat.v1.placeholder(tf.bool) + yt = bn(xt, training=training) + moving_mean = 0.0 + moving_stddev = 1.0 + renorm_mean = 0.0 + renorm_stddev = 1.0 + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for step in range(6): + x = np.random.random(shape) + + mean = x.mean(0) + variance = x.var(0) + stddev = np.sqrt(variance + epsilon) + r = stddev / renorm_stddev + d = (mean - renorm_mean) / renorm_stddev + y_test = ( + (x - moving_mean) + / (moving_stddev * moving_stddev) ** 0.5 + * gamma + ) + beta + y_train = ((x - mean) / stddev * r + d) * gamma + beta + renorm_mean += (mean - renorm_mean) * (1.0 - renorm_momentum) + renorm_stddev += (stddev - renorm_stddev) * ( + 1.0 - renorm_momentum + ) + moving_mean += (mean - moving_mean) * (1.0 - momentum) + moving_stddev += (stddev - moving_stddev) * (1.0 - momentum) + + # Compute test values first, before the train mode updates the + # moving averages. + yt_val_test, _, _ = sess.run( + [yt] + bn.updates, feed_dict={xt: x, training: False} + ) + yt_val_train, _, _ = sess.run( + [yt] + bn.updates, feed_dict={xt: x, training: True} + ) + + # Due to initialization inconsistencies, values may not be + # identical on the first iteration (but shouldn't be different + # by much more than epsilon). After the first iteration they + # should be identical. 
+ atol = epsilon * 1.5 if step == 0 else 1e-5 + self.assertAllClose(y_train, yt_val_train, atol=atol) + self.assertAllClose(y_test, yt_val_test, atol=atol) + self.assertAllClose(yt_val_train, yt_val_test, atol=atol) + + def testAdjustment(self): + shape = (4, 3) + xt = tf.compat.v1.placeholder(tf.float32, shape) + momentum = 0.99 + gamma = 2.0 + beta = 3.0 + epsilon = 0.001 + adjust_scale = tf.random.uniform(shape[-1:], 0.5, 1.5) + adjust_bias = tf.random.uniform(shape[-1:], -0.2, 0.2) + bn = normalization_layers.BatchNormalization( + axis=1, + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + beta_initializer=tf.compat.v1.constant_initializer(beta), + epsilon=epsilon, + momentum=momentum, + adjustment=lambda _: (adjust_scale, adjust_bias), + ) + training = tf.compat.v1.placeholder(tf.bool) + yt = bn(xt, training=training) + + moving_mean = 0.0 + moving_variance = 1.0 + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(5): + x = np.random.random(shape) + yt_val_train, adj_scale_val, adj_bias_val = sess.run( + [yt, adjust_scale, adjust_bias] + bn.updates, + feed_dict={xt: x, training: True}, + )[:3] + yt_val_test = sess.run( + [yt] + bn.updates, feed_dict={xt: x, training: False} + )[0] + + mean = x.mean(0) + variance = x.var(0) + y_train = ( + ((x - mean) / (variance + epsilon) ** 0.5) * adj_scale_val + + adj_bias_val + ) * gamma + beta + moving_mean += (mean - moving_mean) * (1.0 - momentum) + moving_variance += (variance - moving_variance) * ( + 1.0 - momentum + ) + + y_test = ( + (x - moving_mean) + / (moving_variance + epsilon) ** 0.5 + * gamma + ) + beta + + self.assertAllClose(y_train, yt_val_train, atol=1e-5) + self.assertAllClose(y_test, yt_val_test, atol=1e-5) + + def testRenormWithAdjustment(self): + shape = (4, 3) + xt = tf.compat.v1.placeholder(tf.float32, shape) + momentum = 0.99 + renorm_momentum = 0.8 + rmax = 1.1 + rmin = 0.9 + dmax = 0.1 + gamma = 2.0 + beta = 3.0 + epsilon = 0.001 + adjust_scale = tf.random.uniform(shape[-1:], 0.5, 1.5) + adjust_bias = tf.random.uniform(shape[-1:], -0.2, 0.2) + bn = normalization_layers.BatchNormalization( + axis=1, + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + beta_initializer=tf.compat.v1.constant_initializer(beta), + epsilon=epsilon, + momentum=momentum, + renorm=True, + renorm_clipping={"rmax": rmax, "rmin": rmin, "dmax": dmax}, + renorm_momentum=renorm_momentum, + adjustment=lambda _: (adjust_scale, adjust_bias), + ) + training = tf.compat.v1.placeholder(tf.bool) + yt = bn(xt, training=training) + + moving_mean = 0.0 + moving_stddev = 1.0 + renorm_mean = 0.0 + renorm_stddev = 1.0 + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(5): + x = np.random.random(shape) + yt_val_train, adj_scale_val, adj_bias_val = sess.run( + [yt, adjust_scale, adjust_bias] + bn.updates, + feed_dict={xt: x, training: True}, + )[:3] + yt_val_test = sess.run( + [yt] + bn.updates, feed_dict={xt: x, training: False} + )[0] + + mean = x.mean(0) + variance = x.var(0) + stddev = np.sqrt(variance + epsilon) + r = (stddev / renorm_stddev).clip(rmin, rmax) + d = ((mean - renorm_mean) / renorm_stddev).clip(-dmax, dmax) + y_train = ( + ((x - mean) / stddev * r + d) * adj_scale_val + adj_bias_val + ) * gamma + beta + renorm_mean += (mean - renorm_mean) * (1.0 - renorm_momentum) + renorm_stddev += (stddev - renorm_stddev) * ( + 1.0 - renorm_momentum + ) + moving_mean += (mean - moving_mean) * (1.0 - momentum) + moving_stddev += 
(stddev - moving_stddev) * (1.0 - momentum) + + y_test = ( + (x - moving_mean) + / (moving_stddev * moving_stddev) ** 0.5 + * gamma + ) + beta + + self.assertAllClose(y_train, yt_val_train, atol=1e-5) + self.assertAllClose(y_test, yt_val_test, atol=1e-5) + + def testGhostBNNegativeVirtualBatch(self): + shape = [6, 5, 4, 3] + inp = tf.random.uniform(shape, seed=1) + + with self.assertRaises(ValueError): + normalization_layers.batch_normalization(inp, virtual_batch_size=-1) + + def testGhostBNVirtualBatchFull(self): + shape = [6, 5, 4, 3] + inp = tf.random.uniform(shape, seed=1) + out1 = normalization_layers.batch_normalization(inp) + out2 = normalization_layers.batch_normalization( + inp, virtual_batch_size=6 + ) + + self.assertListEqual(out1.shape.as_list(), out2.shape.as_list()) + + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + + x = np.random.random(shape) + y1, y2 = sess.run([out1, out2], feed_dict={inp: x}) + + self.assertAllClose(y1, y2, atol=1e-5) + + def testGhostBNInputOutputShapesMatch(self): + shape = [6, 4, 3] + inp = tf.random.uniform(shape, seed=1) + out = normalization_layers.batch_normalization( + inp, virtual_batch_size=3 + ) + self.assertListEqual(out.shape.as_list(), shape) + + def testGhostBNUnknownBatchSize(self): + np_shape = [10, 5, 4] + tf_shape = [None, 5, 4] + inp = tf.compat.v1.placeholder(tf.float32, tf_shape) + out = normalization_layers.batch_normalization( + inp, virtual_batch_size=2 + ) + + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + + x = np.random.random(np_shape) + y = sess.run(out, feed_dict={inp: x}) + + self.assertListEqual(list(y.shape), np_shape) + + def testGhostBN2Dims(self): + shape = [6, 2] + virtual_batch_size = 3 + beta = 2.0 + gamma = 3.0 + momentum = 0.8 + epsilon = 1e-3 + moving_means = np.zeros([2, 2], dtype=np.float32) + moving_vars = np.ones([2, 2], dtype=np.float32) + + inp = tf.compat.v1.placeholder(tf.float32, shape) + is_training = tf.compat.v1.placeholder(tf.bool) + bn = normalization_layers.BatchNormalization( + momentum=momentum, + epsilon=epsilon, + beta_initializer=tf.compat.v1.constant_initializer(beta), + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + virtual_batch_size=virtual_batch_size, + ) + out = bn(inp, training=is_training) + ghost_shape = [ + virtual_batch_size, + shape[0] // virtual_batch_size, + shape[1], + ] + + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(5): + x = np.random.random(shape) + + sub_batched = np.reshape(x, ghost_shape) + means = np.mean(sub_batched, axis=0, keepdims=True) + variances = np.var(sub_batched, axis=0, keepdims=True) + + avg_means = np.mean(means, axis=1, keepdims=True) + avg_variances = np.mean(variances, axis=1, keepdims=True) + + moving_means = moving_means * momentum + avg_means * ( + 1.0 - momentum + ) + moving_vars = moving_vars * momentum + avg_variances * ( + 1.0 - momentum + ) + + y_train = ( + (sub_batched - means) / (variances + epsilon) ** 0.5 * gamma + ) + beta + y_test = ( + (sub_batched - moving_means) + / (moving_vars + epsilon) ** 0.5 + * gamma + ) + beta + + y_train = np.reshape(y_train, shape) + y_test = np.reshape(y_test, shape) + + y_val_train, _, _ = sess.run( + [out] + bn.updates, feed_dict={inp: x, is_training: True} + ) + y_val_test = sess.run( + out, feed_dict={inp: x, is_training: False} + ) + + self.assertAllClose(y_train, y_val_train, atol=1e-5) + self.assertAllClose(y_test, y_val_test, 
atol=1e-5) + + def testGhostBN4DimsAxis3(self): + shape = [6, 10, 10, 3] + virtual_batch_size = 2 + beta = 2.0 + gamma = 3.0 + momentum = 0.8 + epsilon = 1e-3 + moving_means = np.zeros([1, 1, 1, 1, 3], dtype=np.float32) + moving_vars = np.ones([1, 1, 1, 1, 3], dtype=np.float32) + + inp = tf.compat.v1.placeholder(tf.float32, shape) + is_training = tf.compat.v1.placeholder(tf.bool) + bn = normalization_layers.BatchNormalization( + axis=3, + momentum=momentum, + epsilon=epsilon, + beta_initializer=tf.compat.v1.constant_initializer(beta), + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + virtual_batch_size=virtual_batch_size, + ) + out = bn(inp, training=is_training) + ghost_shape = [ + virtual_batch_size, + shape[0] // virtual_batch_size, + ] + shape[1:] + + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(5): + x = np.random.random(shape) + + sub_batched = np.reshape(x, ghost_shape) + means = np.mean(sub_batched, axis=(0, 2, 3), keepdims=True) + variances = np.var(sub_batched, axis=(0, 2, 3), keepdims=True) + + avg_means = np.mean(means, axis=1, keepdims=True) + avg_variances = np.mean(variances, axis=1, keepdims=True) + + moving_means = moving_means * momentum + avg_means * ( + 1.0 - momentum + ) + moving_vars = moving_vars * momentum + avg_variances * ( + 1.0 - momentum + ) + + y_train = ( + (sub_batched - means) / (variances + epsilon) ** 0.5 * gamma + ) + beta + y_test = ( + (sub_batched - moving_means) + / (moving_vars + epsilon) ** 0.5 + * gamma + ) + beta + + y_train = np.reshape(y_train, shape) + y_test = np.reshape(y_test, shape) + + y_val_train, _, _ = sess.run( + [out] + bn.updates, feed_dict={inp: x, is_training: True} + ) + y_val_test = sess.run( + out, feed_dict={inp: x, is_training: False} + ) + + self.assertAllClose(y_train, y_val_train, atol=1e-2) + self.assertAllClose(y_test, y_val_test, atol=1e-2) + + def testGhostBN4DimsAxis1(self): + shape = [6, 3, 10, 10] + virtual_batch_size = 2 + beta = 2.0 + gamma = 3.0 + momentum = 0.8 + epsilon = 1e-3 + moving_means = np.zeros([1, 1, 3, 1, 1], dtype=np.float32) + moving_vars = np.ones([1, 1, 3, 1, 1], dtype=np.float32) + + inp = tf.compat.v1.placeholder(tf.float32, shape) + is_training = tf.compat.v1.placeholder(tf.bool) + bn = normalization_layers.BatchNormalization( + axis=1, + momentum=momentum, + epsilon=epsilon, + beta_initializer=tf.compat.v1.constant_initializer(beta), + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + virtual_batch_size=virtual_batch_size, + fused=False, + ) # NCHW is unsupported by CPU fused batch norm + out = bn(inp, training=is_training) + ghost_shape = [ + virtual_batch_size, + shape[0] // virtual_batch_size, + ] + shape[1:] + + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(5): + x = np.random.random(shape) + + sub_batched = np.reshape(x, ghost_shape) + means = np.mean(sub_batched, axis=(0, 3, 4), keepdims=True) + variances = np.var(sub_batched, axis=(0, 3, 4), keepdims=True) + + avg_means = np.mean(means, axis=1, keepdims=True) + avg_variances = np.mean(variances, axis=1, keepdims=True) + + moving_means = moving_means * momentum + avg_means * ( + 1.0 - momentum + ) + moving_vars = moving_vars * momentum + avg_variances * ( + 1.0 - momentum + ) + + y_train = ( + (sub_batched - means) / (variances + epsilon) ** 0.5 * gamma + ) + beta + y_test = ( + (sub_batched - moving_means) + / (moving_vars + epsilon) ** 0.5 + * gamma + ) + beta + + y_train = 
np.reshape(y_train, shape) + y_test = np.reshape(y_test, shape) + + y_val_train, _, _ = sess.run( + [out] + bn.updates, feed_dict={inp: x, is_training: True} + ) + y_val_test = sess.run( + out, feed_dict={inp: x, is_training: False} + ) + + self.assertAllClose(y_train, y_val_train, atol=1e-2) + self.assertAllClose(y_test, y_val_test, atol=1e-2) + + def testMultiAxisInvalid(self): + shape = [6, 5, 4, 3] + inp = tf.random.uniform(shape, seed=1) + + with self.assertRaises(ValueError): + normalization_layers.batch_normalization( + inp, axis=[1, 4] + ) # out of bounds + + with self.assertRaises(ValueError): + normalization_layers.batch_normalization( + inp, axis=[-5, 1] + ) # out of bounds + + with self.assertRaises(ValueError): + normalization_layers.batch_normalization( + inp, axis=[1, 2, 1] + ) # duplicate + + def test3DInputMultiAxis12(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=[1, 2], epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 4, 3)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. + moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=0, keepdims=True) + std = np.std(np_inputs, axis=0, keepdims=True) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def test5DInputMultiAxis123(self): + epsilon = 1e-3 + bn = normalization_layers.BatchNormalization( + axis=[1, 2, 3], epsilon=epsilon, momentum=0.9 + ) + inputs = tf.Variable( + np.random.random((5, 3, 4, 4, 3)) + 100, dtype=tf.float32 + ) + training = tf.compat.v1.placeholder(dtype="bool") + outputs = bn(inputs, training=training) + + with self.cached_session() as sess: + # Test training with placeholder learning phase. + self.evaluate(tf.compat.v1.global_variables_initializer()) + + np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta]) + + for _ in range(100): + np_output, _, _ = sess.run( + [outputs] + bn.updates, feed_dict={training: True} + ) + # Verify that the axis is normalized during training. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + # Verify that the statistics are updated during training. 
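As a reference for what these assertions verify: during training the layer folds each batch's statistics into its moving estimates with an exponential moving average controlled by `momentum`. A minimal standalone NumPy sketch of that update rule (illustrative only, not the layer's actual implementation):

```python
import numpy as np

momentum = 0.9
moving_mean = np.zeros((1, 4, 3))
moving_var = np.ones((1, 4, 3))

for _ in range(100):
    batch = np.random.random((5, 4, 3)) + 100
    batch_mean = np.mean(batch, axis=0, keepdims=True)
    batch_var = np.var(batch, axis=0, keepdims=True)
    # Exponential moving average: the old estimate decays by `momentum`,
    # the current batch contributes the remaining (1 - momentum).
    moving_mean = moving_mean * momentum + batch_mean * (1.0 - momentum)
    moving_var = moving_var * momentum + batch_var * (1.0 - momentum)
```

After 100 steps the initial estimate contributes only `momentum ** 100` (about 3e-5), which is why the tests can loop 100 times and then compare the moving statistics against the batch statistics with `atol=1e-2`.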
+ moving_mean, moving_var = self.evaluate( + [bn.moving_mean, bn.moving_variance] + ) + np_inputs = self.evaluate(inputs) + mean = np.mean(np_inputs, axis=(0, 4), keepdims=True) + std = np.std(np_inputs, axis=(0, 4), keepdims=True) + variance = np.square(std) + self.assertAllClose(mean, moving_mean, atol=1e-2) + self.assertAllClose(variance, moving_var, atol=1e-2) + + # Test inference with placeholder learning phase. + np_output = sess.run(outputs, feed_dict={training: False}) + + # Verify that the axis is normalized during inference. + normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta + self.assertAlmostEqual(np.mean(normed_np_output), 0.0, places=1) + self.assertAlmostEqual(np.std(normed_np_output), 1.0, places=1) + + def testGhostBN5DimsMultiAxis14(self): + shape = [6, 3, 10, 10, 4] + virtual_batch_size = 3 + beta = 2.0 + gamma = 3.0 + momentum = 0.8 + epsilon = 1e-3 + moving_means = np.zeros([1, 1, 3, 1, 1, 4], dtype=np.float32) + moving_vars = np.ones([1, 1, 3, 1, 1, 4], dtype=np.float32) + + inp = tf.compat.v1.placeholder(tf.float32, shape) + is_training = tf.compat.v1.placeholder(tf.bool) + bn = normalization_layers.BatchNormalization( + axis=[1, 4], + momentum=momentum, + epsilon=epsilon, + beta_initializer=tf.compat.v1.constant_initializer(beta), + gamma_initializer=tf.compat.v1.constant_initializer(gamma), + virtual_batch_size=virtual_batch_size, + fused=False, + ) + out = bn(inp, training=is_training) + ghost_shape = [ + virtual_batch_size, + shape[0] // virtual_batch_size, + ] + shape[1:] + + with self.session() as sess: + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(5): + x = np.random.random(shape) + + sub_batched = np.reshape(x, ghost_shape) + means = np.mean(sub_batched, axis=(0, 3, 4), keepdims=True) + variances = np.var(sub_batched, axis=(0, 3, 4), keepdims=True) + + avg_means = np.mean(means, axis=1, keepdims=True) + avg_variances = np.mean(variances, axis=1, keepdims=True) + + moving_means = moving_means * momentum + avg_means * ( + 1.0 - momentum + ) + moving_vars = moving_vars * momentum + avg_variances * ( + 1.0 - momentum + ) + + y_train = ( + (sub_batched - means) / (variances + epsilon) ** 0.5 * gamma + ) + beta + y_test = ( + (sub_batched - moving_means) + / (moving_vars + epsilon) ** 0.5 + * gamma + ) + beta + + y_train = np.reshape(y_train, shape) + y_test = np.reshape(y_test, shape) + + y_val_train, _, _ = sess.run( + [out] + bn.updates, feed_dict={inp: x, is_training: True} + ) + y_val_test = sess.run( + out, feed_dict={inp: x, is_training: False} + ) + + self.assertAllClose(y_train, y_val_train, atol=1e-2) + self.assertAllClose(y_test, y_val_test, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/legacy_tf_layers/pooling.py b/keras/legacy_tf_layers/pooling.py index 144bf12bbcda..71695d771612 100644 --- a/keras/legacy_tf_layers/pooling.py +++ b/keras/legacy_tf_layers/pooling.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
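Stepping back from the diff for a moment: the ghost ("virtual") batch normalization cases above all construct their expected outputs the same way. A hedged NumPy sketch of that reference computation for the rank-2 case, mirroring `testGhostBN2Dims` rather than the layer's real code:

```python
import numpy as np

def ghost_batch_norm(x, virtual_batch_size, epsilon=1e-3):
    # Split the real batch into sub-batches of `virtual_batch_size`
    # and normalize each sub-batch with its own statistics, exactly as
    # the testGhostBN* cases build their reference outputs.
    full, features = x.shape
    ghost = x.reshape(virtual_batch_size, full // virtual_batch_size, features)
    mean = ghost.mean(axis=0, keepdims=True)
    var = ghost.var(axis=0, keepdims=True)
    normed = (ghost - mean) / np.sqrt(var + epsilon)
    return normed.reshape(full, features)

x = np.random.random((6, 2))
y = ghost_batch_norm(x, virtual_batch_size=3)
print(y.shape)  # (6, 2)
```

For the higher-rank tests the same reshape happens first, and the reduction additionally spans the spatial axes; only the axes passed to `mean`/`var` change.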
# ============================================================================= -# pylint: disable=g-classes-have-attributes + """Contains the pooling layer classes and their functional aliases.""" from __future__ import absolute_import from __future__ import division @@ -22,878 +22,979 @@ from keras import layers as keras_layers from keras.legacy_tf_layers import base + +# isort: off from tensorflow.python.util.tf_export import keras_export -from tensorflow.python.util.tf_export import tf_export -@keras_export(v1=['keras.__internal__.legacy.layers.AveragePooling1D']) -@tf_export(v1=['layers.AveragePooling1D']) +@keras_export(v1=["keras.__internal__.legacy.layers.AveragePooling1D"]) class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer): - """Average Pooling layer for 1D inputs. - - Args: - pool_size: An integer or tuple/list of a single integer, - representing the size of the pooling window. - strides: An integer or tuple/list of a single integer, specifying the - strides of the pooling operation. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.AveragePooling1D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - pooling = tf.compat.v1.layers.AveragePooling1D(pool_size=2, strides=2) - ``` - - After: - - ```python - pooling = tf.keras.layers.AveragePooling1D(pool_size=2, strides=2) - ``` - @end_compatibility - """ - - def __init__(self, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - if strides is None: - raise ValueError('Argument `strides` must not be None.') - super().__init__( + """Average Pooling layer for 1D inputs. + + Args: + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. 
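The output length implied by these arguments follows standard pooling arithmetic: `floor((length - pool_size) / strides) + 1` for `'valid'` padding and `ceil(length / strides)` for `'same'`. A quick shape check with the TF2 counterpart layer, assuming TensorFlow 2.x is installed:

```python
import tensorflow as tf

x = tf.random.uniform((2, 7, 3))  # (batch, length, channels)

y_valid = tf.keras.layers.AveragePooling1D(pool_size=2, strides=2)(x)
print(y_valid.shape)  # (2, 3, 3): floor((7 - 2) / 2) + 1 = 3

y_same = tf.keras.layers.AveragePooling1D(
    pool_size=2, strides=2, padding="same"
)(x)
print(y_same.shape)  # (2, 4, 3): ceil(7 / 2) = 4
```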
+ + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.AveragePooling1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + pooling = tf.compat.v1.layers.AveragePooling1D(pool_size=2, strides=2) + ``` + + After: + + ```python + pooling = tf.keras.layers.AveragePooling1D(pool_size=2, strides=2) + ``` + @end_compatibility + """ + + def __init__( + self, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + if strides is None: + raise ValueError("Argument `strides` must not be None.") + super().__init__( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.average_pooling1d"]) +def average_pooling1d( + inputs, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, +): + """Average Pooling layer for 1D inputs. + + Args: + inputs: The tensor over which to pool. Must have rank 3. + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + + Returns: + The output tensor, of rank 3. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.AveragePooling1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.average_pooling1d(x, pool_size=2, strides=2) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.AveragePooling1D(pool_size=2, strides=2)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.average_pooling1d` is deprecated and " + "will be removed in a future version. 
" + "Please use `tf.keras.layers.AveragePooling1D` instead.", + stacklevel=2, + ) + layer = AveragePooling1D( pool_size=pool_size, strides=strides, padding=padding, data_format=data_format, name=name, - **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.average_pooling1d']) -@tf_export(v1=['layers.average_pooling1d']) -def average_pooling1d(inputs, pool_size, strides, - padding='valid', data_format='channels_last', - name=None): - """Average Pooling layer for 1D inputs. - - Args: - inputs: The tensor over which to pool. Must have rank 3. - pool_size: An integer or tuple/list of a single integer, - representing the size of the pooling window. - strides: An integer or tuple/list of a single integer, specifying the - strides of the pooling operation. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - name: A string, the name of the layer. - - Returns: - The output tensor, of rank 3. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.AveragePooling1D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.average_pooling1d(x, pool_size=2, strides=2) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.AveragePooling1D(pool_size=2, strides=2)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.average_pooling1d` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.AveragePooling1D` instead.', - stacklevel=2) - layer = AveragePooling1D(pool_size=pool_size, - strides=strides, - padding=padding, - data_format=data_format, - name=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.MaxPooling1D']) -@tf_export(v1=['layers.MaxPooling1D']) -class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer): - """Max Pooling layer for 1D inputs. - - Args: - pool_size: An integer or tuple/list of a single integer, - representing the size of the pooling window. - strides: An integer or tuple/list of a single integer, specifying the - strides of the pooling operation. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - name: A string, the name of the layer. 
- - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` + ) + return layer(inputs) - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - The corresponding TensorFlow v2 layer is - `tf.keras.layers.MaxPooling1D`. - - - #### Structural Mapping to Native TF2 +@keras_export(v1=["keras.__internal__.legacy.layers.MaxPooling1D"]) +class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer): + """Max Pooling layer for 1D inputs. + + Args: + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.MaxPooling1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + pooling = tf.compat.v1.layers.MaxPooling1D(pool_size=2, strides=2) + ``` + + After: + + ```python + pooling = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2) + ``` + @end_compatibility + """ + + def __init__( + self, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + if strides is None: + raise ValueError("Argument `strides` must not be None.") + super().__init__( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.max_pooling1d"]) +def max_pooling1d( + inputs, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, +): + """Max Pooling layer for 1D inputs. + + Args: + inputs: The tensor over which to pool. Must have rank 3. + pool_size: An integer or tuple/list of a single integer, + representing the size of the pooling window. + strides: An integer or tuple/list of a single integer, specifying the + strides of the pooling operation. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string, one of `channels_last` (default) or + `channels_first`. The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, length, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, length)`. + name: A string, the name of the layer. + + Returns: + The output tensor, of rank 3. + + Raises: + ValueError: if eager execution is enabled. 
+ + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.MaxPooling1D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.max_pooling1d(x, pool_size=2, strides=2) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.max_pooling1d` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.MaxPooling1D` instead.", + stacklevel=2, + ) + layer = MaxPooling1D( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + ) + return layer(inputs) - None of the supported arguments have changed name. - Before: +@keras_export(v1=["keras.__internal__.legacy.layers.AveragePooling2D"]) +class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer): + """Average pooling layer for 2D inputs (e.g. images). + + Args: + pool_size: An integer or tuple/list of 2 integers: (pool_height, + pool_width) specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.AveragePooling2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. 
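For intuition about what the 2-D pooling window actually computes, here is a tiny NumPy sketch of `'valid'` pooling over a single-channel image. This is illustrative only; the real layers dispatch to fused TF kernels:

```python
import numpy as np

def pool_2d(x, pool=2, stride=2, reduce=np.max):
    # 'valid' pooling: only full windows, no padding.
    h = (x.shape[0] - pool) // stride + 1
    w = (x.shape[1] - pool) // stride + 1
    out = np.empty((h, w), dtype=x.dtype)
    for i in range(h):
        for j in range(w):
            out[i, j] = reduce(
                x[i * stride : i * stride + pool, j * stride : j * stride + pool]
            )
    return out

x = np.arange(16.0).reshape(4, 4)
print(pool_2d(x, reduce=np.max))   # [[ 5.  7.] [13. 15.]]
print(pool_2d(x, reduce=np.mean))  # [[ 2.5  4.5] [10.5 12.5]]
```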
+ + Before: + + ```python + pooling = tf.compat.v1.layers.AveragePooling2D(pool_size=2, strides=2) + ``` + + After: + + ```python + pooling = tf.keras.layers.AveragePooling2D(pool_size=2, strides=2) + ``` + @end_compatibility + """ + + def __init__( + self, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + if strides is None: + raise ValueError("Argument `strides` must not be None.") + super().__init__( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.average_pooling2d"]) +def average_pooling2d( + inputs, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, +): + """Average pooling layer for 2D inputs (e.g. images). + + Args: + inputs: The tensor over which to pool. Must have rank 4. + pool_size: An integer or tuple/list of 2 integers: (pool_height, + pool_width) specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + name: A string, the name of the layer. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.AveragePooling2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.average_pooling2d(x, pool_size=2, strides=2) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.AveragePooling2D(pool_size=2, strides=2)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.average_pooling2d` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.AveragePooling2D` instead.", + stacklevel=2, + ) + layer = AveragePooling2D( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + ) + return layer(inputs) - ```python - pooling = tf.compat.v1.layers.MaxPooling1D(pool_size=2, strides=2) - ``` - After: +@keras_export(v1=["keras.__internal__.legacy.layers.MaxPooling2D"]) +class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer): + """Max pooling layer for 2D inputs (e.g. images). 
+ + Args: + pool_size: An integer or tuple/list of 2 integers: (pool_height, + pool_width) specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.MaxPooling2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + pooling = tf.compat.v1.layers.MaxPooling2D(pool_size=2, strides=2) + ``` + + After: + + ```python + pooling = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2) + ``` + @end_compatibility + """ + + def __init__( + self, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + if strides is None: + raise ValueError("Argument `strides` must not be None.") + super().__init__( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.max_pooling2d"]) +def max_pooling2d( + inputs, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, +): + """Max pooling layer for 2D inputs (e.g. images). + + Args: + inputs: The tensor over which to pool. Must have rank 4. + pool_size: An integer or tuple/list of 2 integers: (pool_height, + pool_width) specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 2 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, height, width, channels)` while `channels_first` corresponds to + inputs with shape `(batch, channels, height, width)`. + name: A string, the name of the layer. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. 
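The class-based layer should produce the same values as the raw pooling op it wraps; a quick equivalence check with the TF2 layer, assuming TensorFlow 2.x:

```python
import tensorflow as tf

x = tf.random.uniform((1, 4, 4, 3))

layer_out = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)(x)
op_out = tf.nn.max_pool2d(x, ksize=2, strides=2, padding="VALID")

# Max pooling is an exact selection, so both paths should agree bitwise.
print(bool(tf.reduce_all(layer_out == op_out)))  # True
```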
+ + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.MaxPooling2D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.max_pooling2d(x, pool_size=2, strides=2) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.max_pooling2d` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.MaxPooling2D` instead.", + stacklevel=2, + ) + layer = MaxPooling2D( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + ) + return layer(inputs) - ```python - pooling = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2) - ``` - @end_compatibility - """ - def __init__(self, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - if strides is None: - raise ValueError('Argument `strides` must not be None.') - super().__init__( +@keras_export(v1=["keras.__internal__.legacy.layers.AveragePooling3D"]) +class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer): + """Average pooling layer for 3D inputs (e.g. volumes). + + Args: + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.AveragePooling3D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. 
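The 3-D variants behave exactly like their 2-D counterparts with one extra spatial axis. A short shape check with the TF2 layer, again assuming TensorFlow 2.x:

```python
import tensorflow as tf

# Volumes are (batch, depth, height, width, channels) in channels_last.
x = tf.random.uniform((2, 8, 8, 8, 1))
y = tf.keras.layers.AveragePooling3D(pool_size=2, strides=2)(x)
print(y.shape)  # (2, 4, 4, 4, 1): every spatial axis is halved
```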
+ + Before: + + ```python + pooling = tf.compat.v1.layers.AveragePooling3D(pool_size=2, strides=2) + ``` + + After: + + ```python + pooling = tf.keras.layers.AveragePooling3D(pool_size=2, strides=2) + ``` + @end_compatibility + """ + + def __init__( + self, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + if strides is None: + raise ValueError("Argument `strides` must not be None.") + super().__init__( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.average_pooling3d"]) +def average_pooling3d( + inputs, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, +): + """Average pooling layer for 3D inputs (e.g. volumes). + + Args: + inputs: The tensor over which to pool. Must have rank 5. + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + + Returns: + Output tensor. + + Raises: + ValueError: if eager execution is enabled. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.AveragePooling3D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + y = tf.compat.v1.layers.average_pooling3d(x, pool_size=2, strides=2) + ``` + + After: + + To migrate code using TF1 functional layers use the [Keras Functional API] + (https://www.tensorflow.org/guide/keras/functional): + + ```python + x = tf.keras.Input((28, 28, 1)) + y = tf.keras.layers.AveragePooling3D(pool_size=2, strides=2)(x) + model = tf.keras.Model(x, y) + ``` + @end_compatibility + """ + warnings.warn( + "`tf.layers.average_pooling3d` is deprecated and " + "will be removed in a future version. " + "Please use `tf.keras.layers.AveragePooling3D` instead.", + stacklevel=2, + ) + layer = AveragePooling3D( pool_size=pool_size, strides=strides, padding=padding, data_format=data_format, name=name, - **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.max_pooling1d']) -@tf_export(v1=['layers.max_pooling1d']) -def max_pooling1d(inputs, pool_size, strides, - padding='valid', data_format='channels_last', - name=None): - """Max Pooling layer for 1D inputs. - - Args: - inputs: The tensor over which to pool. Must have rank 3. 
- pool_size: An integer or tuple/list of a single integer, - representing the size of the pooling window. - strides: An integer or tuple/list of a single integer, specifying the - strides of the pooling operation. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string, one of `channels_last` (default) or `channels_first`. - The ordering of the dimensions in the inputs. - `channels_last` corresponds to inputs with shape - `(batch, length, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, length)`. - name: A string, the name of the layer. - - Returns: - The output tensor, of rank 3. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.MaxPooling1D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.max_pooling1d(x, pool_size=2, strides=2) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.max_pooling1d` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.MaxPooling1D` instead.', - stacklevel=2) - layer = MaxPooling1D(pool_size=pool_size, - strides=strides, - padding=padding, - data_format=data_format, - name=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.AveragePooling2D']) -@tf_export(v1=['layers.AveragePooling2D']) -class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer): - """Average pooling layer for 2D inputs (e.g. images). - - Args: - pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. 
- - The corresponding TensorFlow v2 layer is - `tf.keras.layers.AveragePooling2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - pooling = tf.compat.v1.layers.AveragePooling2D(pool_size=2, strides=2) - ``` - - After: - - ```python - pooling = tf.keras.layers.AveragePooling2D(pool_size=2, strides=2) - ``` - @end_compatibility - """ - - def __init__(self, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - if strides is None: - raise ValueError('Argument `strides` must not be None.') - super().__init__( - pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, name=name, **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.average_pooling2d']) -@tf_export(v1=['layers.average_pooling2d']) -def average_pooling2d(inputs, - pool_size, strides, - padding='valid', data_format='channels_last', - name=None): - """Average pooling layer for 2D inputs (e.g. images). - - Args: - inputs: The tensor over which to pool. Must have rank 4. - pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - name: A string, the name of the layer. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.AveragePooling2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.average_pooling2d(x, pool_size=2, strides=2) - ``` + ) + return layer(inputs) - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.AveragePooling2D(pool_size=2, strides=2)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.average_pooling2d` is deprecated and ' - 'will be removed in a future version. 
' - 'Please use `tf.keras.layers.AveragePooling2D` instead.', - stacklevel=2) - layer = AveragePooling2D(pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, - name=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.MaxPooling2D']) -@tf_export(v1=['layers.MaxPooling2D']) -class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer): - """Max pooling layer for 2D inputs (e.g. images). - - Args: - pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.MaxPooling2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - pooling = tf.compat.v1.layers.MaxPooling2D(pool_size=2, strides=2) - ``` - - After: - - ```python - pooling = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2) - ``` - @end_compatibility - """ - - def __init__(self, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - if strides is None: - raise ValueError('Argument `strides` must not be None.') - super().__init__( - pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, name=name, **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.max_pooling2d']) -@tf_export(v1=['layers.max_pooling2d']) -def max_pooling2d(inputs, - pool_size, strides, - padding='valid', data_format='channels_last', - name=None): - """Max pooling layer for 2D inputs (e.g. images). - - Args: - inputs: The tensor over which to pool. Must have rank 4. - pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 2 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape - `(batch, height, width, channels)` while `channels_first` corresponds to - inputs with shape `(batch, channels, height, width)`. 
- name: A string, the name of the layer. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.MaxPooling2D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.max_pooling2d(x, pool_size=2, strides=2) - ``` - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.max_pooling2d` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.MaxPooling2D` instead.', - stacklevel=2) - layer = MaxPooling2D(pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, - name=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.AveragePooling3D']) -@tf_export(v1=['layers.AveragePooling3D']) -class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer): - """Average pooling layer for 3D inputs (e.g. volumes). - - Args: - pool_size: An integer or tuple/list of 3 integers: - (pool_depth, pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 3 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.AveragePooling3D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. 
- - Before: - - ```python - pooling = tf.compat.v1.layers.AveragePooling3D(pool_size=2, strides=2) - ``` - - After: - - ```python - pooling = tf.keras.layers.AveragePooling3D(pool_size=2, strides=2) - ``` - @end_compatibility - """ - - def __init__(self, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - if strides is None: - raise ValueError('Argument `strides` must not be None.') - super().__init__( - pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, name=name, **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.average_pooling3d']) -@tf_export(v1=['layers.average_pooling3d']) -def average_pooling3d(inputs, - pool_size, strides, - padding='valid', data_format='channels_last', - name=None): - """Average pooling layer for 3D inputs (e.g. volumes). - - Args: - inputs: The tensor over which to pool. Must have rank 5. - pool_size: An integer or tuple/list of 3 integers: - (pool_depth, pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 3 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - name: A string, the name of the layer. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.AveragePooling3D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.average_pooling3d(x, pool_size=2, strides=2) - ``` - - After: - - To migrate code using TF1 functional layers use the [Keras Functional API] - (https://www.tensorflow.org/guide/keras/functional): - - ```python - x = tf.keras.Input((28, 28, 1)) - y = tf.keras.layers.AveragePooling3D(pool_size=2, strides=2)(x) - model = tf.keras.Model(x, y) - ``` - @end_compatibility - """ - warnings.warn( - '`tf.layers.average_pooling3d` is deprecated and ' - 'will be removed in a future version. ' - 'Please use `tf.keras.layers.AveragePooling3D` instead.', - stacklevel=2) - layer = AveragePooling3D(pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, - name=name) - return layer(inputs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.MaxPooling3D']) -@tf_export(v1=['layers.MaxPooling3D']) +@keras_export(v1=["keras.__internal__.legacy.layers.MaxPooling3D"]) class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer): - """Max pooling layer for 3D inputs (e.g. volumes). 
- - Args: - pool_size: An integer or tuple/list of 3 integers: - (pool_depth, pool_height, pool_width) - specifying the size of the pooling window. - Can be a single integer to specify the same value for - all spatial dimensions. - strides: An integer or tuple/list of 3 integers, - specifying the strides of the pooling operation. - Can be a single integer to specify the same value for - all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape - `(batch, depth, height, width, channels)` while `channels_first` - corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - name: A string, the name of the layer. - - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.MaxPooling3D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - pooling = tf.compat.v1.layers.MaxPooling3D(pool_size=2, strides=2) - ``` - - After: - - ```python - pooling = tf.keras.layers.MaxPooling3D(pool_size=2, strides=2) - ``` - @end_compatibility - """ - - def __init__(self, pool_size, strides, - padding='valid', data_format='channels_last', - name=None, **kwargs): - if strides is None: - raise ValueError('Argument `strides` must not be None.') - super().__init__( - pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, name=name, **kwargs) - - -@keras_export(v1=['keras.__internal__.legacy.layers.max_pooling3d']) -@tf_export(v1=['layers.max_pooling3d']) -def max_pooling3d(inputs, - pool_size, strides, - padding='valid', data_format='channels_last', - name=None): - """Max pooling layer for 3D inputs (e.g. - - volumes). - - Args: - inputs: The tensor over which to pool. Must have rank 5. - pool_size: An integer or tuple/list of 3 integers: (pool_depth, pool_height, - pool_width) specifying the size of the pooling window. Can be a single - integer to specify the same value for all spatial dimensions. - strides: An integer or tuple/list of 3 integers, specifying the strides of - the pooling operation. Can be a single integer to specify the same value - for all spatial dimensions. - padding: A string. The padding method, either 'valid' or 'same'. - Case-insensitive. - data_format: A string. The ordering of the dimensions in the inputs. - `channels_last` (default) and `channels_first` are supported. - `channels_last` corresponds to inputs with shape `(batch, depth, height, - width, channels)` while `channels_first` corresponds to inputs with shape - `(batch, channels, depth, height, width)`. - name: A string, the name of the layer. - - Returns: - Output tensor. - - Raises: - ValueError: if eager execution is enabled. 
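The `@compatibility(TF2)` blocks repeated throughout this file all prescribe the same recipe for keeping these legacy layers usable under TF2: call them inside a method decorated with `tf.compat.v1.keras.utils.track_tf1_style_variables`. A hedged sketch of that usage — `LegacyPoolWrapper` is a hypothetical name, and pooling layers carry no variables, so the decorator matters most for layers that do:

```python
import tensorflow as tf

class LegacyPoolWrapper(tf.keras.layers.Layer):
    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def call(self, inputs):
        # TF1-style layer calls made here stay compatible with eager
        # execution and tf.function, per the migration guide linked above.
        return tf.compat.v1.layers.max_pooling3d(
            inputs, pool_size=2, strides=2
        )

y = LegacyPoolWrapper()(tf.random.uniform((1, 4, 4, 4, 3)))
print(y.shape)  # (1, 2, 2, 2, 3)
```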
- - - @compatibility(TF2) - This API is a legacy api that is only compatible with eager execution and - `tf.function` if you combine it with - `tf.compat.v1.keras.utils.track_tf1_style_variables` - - Please refer to [tf.layers model mapping section of the migration guide] - (https://www.tensorflow.org/guide/migrate/model_mapping) - to learn how to use your TensorFlow v1 model in TF2 with Keras. - - The corresponding TensorFlow v2 layer is - `tf.keras.layers.MaxPooling3D`. - - - #### Structural Mapping to Native TF2 - - None of the supported arguments have changed name. - - Before: - - ```python - y = tf.compat.v1.layers.max_pooling3d(x, pool_size=2, strides=2) - ``` + """Max pooling layer for 3D inputs (e.g. volumes). + + Args: + pool_size: An integer or tuple/list of 3 integers: + (pool_depth, pool_height, pool_width) + specifying the size of the pooling window. + Can be a single integer to specify the same value for + all spatial dimensions. + strides: An integer or tuple/list of 3 integers, + specifying the strides of the pooling operation. + Can be a single integer to specify the same value for + all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. + Case-insensitive. + data_format: A string. The ordering of the dimensions in the inputs. + `channels_last` (default) and `channels_first` are supported. + `channels_last` corresponds to inputs with shape + `(batch, depth, height, width, channels)` while `channels_first` + corresponds to inputs with shape + `(batch, channels, depth, height, width)`. + name: A string, the name of the layer. + + + @compatibility(TF2) + This API is a legacy api that is only compatible with eager execution and + `tf.function` if you combine it with + `tf.compat.v1.keras.utils.track_tf1_style_variables` + + Please refer to [tf.layers model mapping section of the migration guide] + (https://www.tensorflow.org/guide/migrate/model_mapping) + to learn how to use your TensorFlow v1 model in TF2 with Keras. + + The corresponding TensorFlow v2 layer is + `tf.keras.layers.MaxPooling3D`. + + + #### Structural Mapping to Native TF2 + + None of the supported arguments have changed name. + + Before: + + ```python + pooling = tf.compat.v1.layers.MaxPooling3D(pool_size=2, strides=2) + ``` + + After: + + ```python + pooling = tf.keras.layers.MaxPooling3D(pool_size=2, strides=2) + ``` + @end_compatibility + """ + + def __init__( + self, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, + **kwargs + ): + if strides is None: + raise ValueError("Argument `strides` must not be None.") + super().__init__( + pool_size=pool_size, + strides=strides, + padding=padding, + data_format=data_format, + name=name, + **kwargs + ) + + +@keras_export(v1=["keras.__internal__.legacy.layers.max_pooling3d"]) +def max_pooling3d( + inputs, + pool_size, + strides, + padding="valid", + data_format="channels_last", + name=None, +): + """Max pooling layer for 3D inputs (e.g. + + volumes). + + Args: + inputs: The tensor over which to pool. Must have rank 5. + pool_size: An integer or tuple/list of 3 integers: (pool_depth, + pool_height, pool_width) specifying the size of the pooling window. Can + be a single integer to specify the same value for all spatial + dimensions. + strides: An integer or tuple/list of 3 integers, specifying the strides of + the pooling operation. Can be a single integer to specify the same value + for all spatial dimensions. + padding: A string. The padding method, either 'valid' or 'same'. 
+        Case-insensitive.
+      data_format: A string. The ordering of the dimensions in the inputs.
+        `channels_last` (default) and `channels_first` are supported.
+        `channels_last` corresponds to inputs with shape `(batch, depth, height,
+        width, channels)` while `channels_first` corresponds to inputs with
+        shape `(batch, channels, depth, height, width)`.
+      name: A string, the name of the layer.
+
+    Returns:
+      Output tensor.
+
+    Raises:
+      ValueError: if eager execution is enabled.
+
+
+    @compatibility(TF2)
+    This API is a legacy API that is only compatible with eager execution and
+    `tf.function` if you combine it with
+    `tf.compat.v1.keras.utils.track_tf1_style_variables`
+
+    Please refer to [tf.layers model mapping section of the migration guide]
+    (https://www.tensorflow.org/guide/migrate/model_mapping)
+    to learn how to use your TensorFlow v1 model in TF2 with Keras.
+
+    The corresponding TensorFlow v2 layer is
+    `tf.keras.layers.MaxPooling3D`.
+
+
+    #### Structural Mapping to Native TF2
+
+    None of the supported arguments have changed name.
+
+    Before:
+
+    ```python
+    y = tf.compat.v1.layers.max_pooling3d(x, pool_size=2, strides=2)
+    ```
+
+    After:
+
+    To migrate code using TF1 functional layers use the [Keras Functional API]
+    (https://www.tensorflow.org/guide/keras/functional):
+
+    ```python
+    x = tf.keras.Input((28, 28, 28, 1))
+    y = tf.keras.layers.MaxPooling3D(pool_size=2, strides=2)(x)
+    model = tf.keras.Model(x, y)
+    ```
+    @end_compatibility
+    """
+    warnings.warn(
+        "`tf.layers.max_pooling3d` is deprecated and "
+        "will be removed in a future version. "
+        "Please use `tf.keras.layers.MaxPooling3D` instead.",
+        stacklevel=2,
+    )
+    layer = MaxPooling3D(
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name,
+    )
+    return layer(inputs)
-  After:
-
-  To migrate code using TF1 functional layers use the [Keras Functional API]
-  (https://www.tensorflow.org/guide/keras/functional):
-
-  ```python
-  x = tf.keras.Input((28, 28, 1))
-  y = tf.keras.layers.MaxPooling3D(pool_size=2, strides=2)(x)
-  model = tf.keras.Model(x, y)
-  ```
-  @end_compatibility
-  """
-  warnings.warn(
-      '`tf.layers.max_pooling3d` is deprecated and '
-      'will be removed in a future version. 
' - 'Please use `tf.keras.layers.MaxPooling3D` instead.', - stacklevel=2) - layer = MaxPooling3D(pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format, - name=name) - return layer(inputs) # Aliases diff --git a/keras/legacy_tf_layers/pooling_test.py b/keras/legacy_tf_layers/pooling_test.py index 6ded7d886b97..a60049897936 100644 --- a/keras/legacy_tf_layers/pooling_test.py +++ b/keras/legacy_tf_layers/pooling_test.py @@ -20,187 +20,213 @@ import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils from keras.legacy_tf_layers import pooling as pooling_layers +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) -class PoolingTest(tf.test.TestCase): - def testInvalidDataFormat(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'data_format'): - pooling_layers.max_pooling2d(images, 3, strides=2, data_format='invalid') - - def testInvalidStrides(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'strides'): - pooling_layers.max_pooling2d(images, 3, strides=(1, 2, 3)) - - with self.assertRaisesRegex(ValueError, 'strides'): - pooling_layers.max_pooling2d(images, 3, strides=None) - - def testInvalidPoolSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 3), seed=1) - with self.assertRaisesRegex(ValueError, 'pool_size'): - pooling_layers.max_pooling2d(images, (1, 2, 3), strides=2) - - with self.assertRaisesRegex(ValueError, 'pool_size'): - pooling_layers.max_pooling2d(images, None, strides=2) - - def testCreateMaxPooling2D(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = pooling_layers.MaxPooling2D([2, 2], strides=2) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4]) - - def testCreateAveragePooling2D(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = pooling_layers.AveragePooling2D([2, 2], strides=2) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4]) - - @tf_test_utils.run_deprecated_v1 - def testCreateMaxPooling2DChannelsFirst(self): - height, width = 7, 9 - images = tf.random.uniform((5, 2, height, width)) - layer = pooling_layers.MaxPooling2D([2, 2], - strides=1, - data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 2, 6, 8]) - - @tf_test_utils.run_deprecated_v1 - def testCreateAveragePooling2DChannelsFirst(self): - height, width = 5, 6 - images = tf.random.uniform((3, 4, height, width)) - layer = pooling_layers.AveragePooling2D((2, 2), - strides=(1, 1), - padding='valid', - data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [3, 4, 4, 5]) - - @tf_test_utils.run_deprecated_v1 - def testCreateAveragePooling2DChannelsFirstWithNoneBatch(self): - height, width = 5, 6 - images = tf.compat.v1.placeholder(dtype='float32', - shape=(None, 4, height, width)) - layer = pooling_layers.AveragePooling2D((2, 2), - strides=(1, 1), - padding='valid', - data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [None, 4, 4, 5]) - - def testCreateMaxPooling1D(self): - width = 7 - channels = 3 - images = tf.random.uniform((5, width, channels)) - layer = pooling_layers.MaxPooling1D(2, strides=2) - 
output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, width // 2, channels]) - - def testCreateAveragePooling1D(self): - width = 7 - channels = 3 - images = tf.random.uniform((5, width, channels)) - layer = pooling_layers.AveragePooling1D(2, strides=2) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, width // 2, channels]) - - def testCreateMaxPooling1DChannelsFirst(self): - width = 7 - channels = 3 - images = tf.random.uniform((5, channels, width)) - layer = pooling_layers.MaxPooling1D( - 2, strides=2, data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, channels, width // 2]) - - def testCreateAveragePooling1DChannelsFirst(self): - width = 7 - channels = 3 - images = tf.random.uniform((5, channels, width)) - layer = pooling_layers.AveragePooling1D( - 2, strides=2, data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, channels, width // 2]) - - def testCreateMaxPooling3D(self): - depth, height, width = 6, 7, 9 - images = tf.random.uniform((5, depth, height, width, 4)) - layer = pooling_layers.MaxPooling3D([2, 2, 2], strides=2) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 4, 4]) - - def testCreateAveragePooling3D(self): - depth, height, width = 6, 7, 9 - images = tf.random.uniform((5, depth, height, width, 4)) - layer = pooling_layers.AveragePooling3D([2, 2, 2], strides=2) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 4, 4]) - - def testMaxPooling3DChannelsFirst(self): - depth, height, width = 6, 7, 9 - images = tf.random.uniform((5, 2, depth, height, width)) - layer = pooling_layers.MaxPooling3D( - [2, 2, 2], strides=2, data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 3, 4]) - - def testAveragePooling3DChannelsFirst(self): - depth, height, width = 6, 7, 9 - images = tf.random.uniform((5, 2, depth, height, width)) - layer = pooling_layers.AveragePooling3D( - [2, 2, 2], strides=2, data_format='channels_first') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 3, 4]) - - def testCreateMaxPooling2DIntegerPoolSize(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4)) - layer = pooling_layers.MaxPooling2D(2, strides=2) - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4]) - - def testMaxPooling2DPaddingSame(self): - height, width = 7, 9 - images = tf.random.uniform((5, height, width, 4), seed=1) - layer = pooling_layers.MaxPooling2D( - images.get_shape()[1:3], strides=2, padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), [5, 4, 5, 4]) - - def testCreatePooling2DWithStrides(self): - height, width = 6, 8 - # Test strides tuple - images = tf.random.uniform((5, height, width, 3), seed=1) - layer = pooling_layers.MaxPooling2D([2, 2], strides=(2, 2), padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width / 2, 3]) - - # Test strides integer - layer = pooling_layers.MaxPooling2D([2, 2], strides=2, padding='same') - output = layer(images) - self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width / 2, 3]) - - # Test unequal strides - layer = pooling_layers.MaxPooling2D([2, 2], strides=(2, 1), padding='same') - output = layer(images) - 
self.assertListEqual(output.get_shape().as_list(), - [5, height / 2, width, 3]) - - -if __name__ == '__main__': - tf.test.main() +class PoolingTest(tf.test.TestCase): + def testInvalidDataFormat(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegex(ValueError, "data_format"): + pooling_layers.max_pooling2d( + images, 3, strides=2, data_format="invalid" + ) + + def testInvalidStrides(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegex(ValueError, "strides"): + pooling_layers.max_pooling2d(images, 3, strides=(1, 2, 3)) + + with self.assertRaisesRegex(ValueError, "strides"): + pooling_layers.max_pooling2d(images, 3, strides=None) + + def testInvalidPoolSize(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 3), seed=1) + with self.assertRaisesRegex(ValueError, "pool_size"): + pooling_layers.max_pooling2d(images, (1, 2, 3), strides=2) + + with self.assertRaisesRegex(ValueError, "pool_size"): + pooling_layers.max_pooling2d(images, None, strides=2) + + def testCreateMaxPooling2D(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = pooling_layers.MaxPooling2D([2, 2], strides=2) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4]) + + def testCreateAveragePooling2D(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = pooling_layers.AveragePooling2D([2, 2], strides=2) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4]) + + @tf_test_utils.run_deprecated_v1 + def testCreateMaxPooling2DChannelsFirst(self): + height, width = 7, 9 + images = tf.random.uniform((5, 2, height, width)) + layer = pooling_layers.MaxPooling2D( + [2, 2], strides=1, data_format="channels_first" + ) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 2, 6, 8]) + + @tf_test_utils.run_deprecated_v1 + def testCreateAveragePooling2DChannelsFirst(self): + height, width = 5, 6 + images = tf.random.uniform((3, 4, height, width)) + layer = pooling_layers.AveragePooling2D( + (2, 2), + strides=(1, 1), + padding="valid", + data_format="channels_first", + ) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [3, 4, 4, 5]) + + @tf_test_utils.run_deprecated_v1 + def testCreateAveragePooling2DChannelsFirstWithNoneBatch(self): + height, width = 5, 6 + images = tf.compat.v1.placeholder( + dtype="float32", shape=(None, 4, height, width) + ) + layer = pooling_layers.AveragePooling2D( + (2, 2), + strides=(1, 1), + padding="valid", + data_format="channels_first", + ) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [None, 4, 4, 5]) + + def testCreateMaxPooling1D(self): + width = 7 + channels = 3 + images = tf.random.uniform((5, width, channels)) + layer = pooling_layers.MaxPooling1D(2, strides=2) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, width // 2, channels] + ) + + def testCreateAveragePooling1D(self): + width = 7 + channels = 3 + images = tf.random.uniform((5, width, channels)) + layer = pooling_layers.AveragePooling1D(2, strides=2) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, width // 2, channels] + ) + + def testCreateMaxPooling1DChannelsFirst(self): + width = 7 + channels = 3 + images = tf.random.uniform((5, channels, width)) + layer = pooling_layers.MaxPooling1D( + 
2, strides=2, data_format="channels_first" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, channels, width // 2] + ) + + def testCreateAveragePooling1DChannelsFirst(self): + width = 7 + channels = 3 + images = tf.random.uniform((5, channels, width)) + layer = pooling_layers.AveragePooling1D( + 2, strides=2, data_format="channels_first" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, channels, width // 2] + ) + + def testCreateMaxPooling3D(self): + depth, height, width = 6, 7, 9 + images = tf.random.uniform((5, depth, height, width, 4)) + layer = pooling_layers.MaxPooling3D([2, 2, 2], strides=2) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 4, 4]) + + def testCreateAveragePooling3D(self): + depth, height, width = 6, 7, 9 + images = tf.random.uniform((5, depth, height, width, 4)) + layer = pooling_layers.AveragePooling3D([2, 2, 2], strides=2) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 4, 4]) + + def testMaxPooling3DChannelsFirst(self): + depth, height, width = 6, 7, 9 + images = tf.random.uniform((5, 2, depth, height, width)) + layer = pooling_layers.MaxPooling3D( + [2, 2, 2], strides=2, data_format="channels_first" + ) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 3, 4]) + + def testAveragePooling3DChannelsFirst(self): + depth, height, width = 6, 7, 9 + images = tf.random.uniform((5, 2, depth, height, width)) + layer = pooling_layers.AveragePooling3D( + [2, 2, 2], strides=2, data_format="channels_first" + ) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 3, 4]) + + def testCreateMaxPooling2DIntegerPoolSize(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4)) + layer = pooling_layers.MaxPooling2D(2, strides=2) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4]) + + def testMaxPooling2DPaddingSame(self): + height, width = 7, 9 + images = tf.random.uniform((5, height, width, 4), seed=1) + layer = pooling_layers.MaxPooling2D( + images.get_shape()[1:3], strides=2, padding="same" + ) + output = layer(images) + self.assertListEqual(output.get_shape().as_list(), [5, 4, 5, 4]) + + def testCreatePooling2DWithStrides(self): + height, width = 6, 8 + # Test strides tuple + images = tf.random.uniform((5, height, width, 3), seed=1) + layer = pooling_layers.MaxPooling2D( + [2, 2], strides=(2, 2), padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width / 2, 3] + ) + + # Test strides integer + layer = pooling_layers.MaxPooling2D([2, 2], strides=2, padding="same") + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width / 2, 3] + ) + + # Test unequal strides + layer = pooling_layers.MaxPooling2D( + [2, 2], strides=(2, 1), padding="same" + ) + output = layer(images) + self.assertListEqual( + output.get_shape().as_list(), [5, height / 2, width, 3] + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/legacy_tf_layers/variable_scope_shim.py b/keras/legacy_tf_layers/variable_scope_shim.py index b7ee69ac0396..5eaf3f2fc49e 100644 --- a/keras/legacy_tf_layers/variable_scope_shim.py +++ b/keras/legacy_tf_layers/variable_scope_shim.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================= -# pylint: disable=g-classes-have-attributes + """Contains a shim to allow using TF1 get_variable code in TF2.""" from __future__ import absolute_import from __future__ import division @@ -21,291 +21,158 @@ import contextlib import functools +import tensorflow.compat.v2 as tf + from keras.engine import base_layer from keras.utils import layer_utils from keras.utils import tf_inspect -import tensorflow.compat.v2 as tf +# isort: off from tensorflow.python.ops import variable_scope as vs from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export def as_shape(shape): - """Converts the given object to a TensorShape.""" - if isinstance(shape, tf.TensorShape): - return shape - else: - return tf.TensorShape(shape) + """Converts the given object to a TensorShape.""" + if isinstance(shape, tf.TensorShape): + return shape + else: + return tf.TensorShape(shape) def _is_callable_object(obj): - return hasattr(obj, "__call__") and tf_inspect.ismethod(obj.__call__) + return hasattr(obj, "__call__") and tf_inspect.ismethod(obj.__call__) def _has_kwargs(fn): - """Returns whether the passed callable has **kwargs in its signature. + """Returns whether the passed callable has **kwargs in its signature. - Args: - fn: Function, or function-like object (e.g., result of `functools.partial`). + Args: + fn: Function, or function-like object (e.g., result of + `functools.partial`). - Returns: - `bool`: if `fn` has **kwargs in its signature. + Returns: + `bool`: if `fn` has **kwargs in its signature. - Raises: - `TypeError`: If fn is not a Function, or function-like object. - """ - if isinstance(fn, functools.partial): - fn = fn.func - elif _is_callable_object(fn): - fn = fn.__call__ - elif not callable(fn): - raise TypeError( - "fn should be a function-like object, but is of type {}.".format( - type(fn))) - return tf_inspect.getfullargspec(fn).varkw is not None + Raises: + `TypeError`: If fn is not a Function, or function-like object. + """ + if isinstance(fn, functools.partial): + fn = fn.func + elif _is_callable_object(fn): + fn = fn.__call__ + elif not callable(fn): + raise TypeError( + f"fn should be a function-like object, but is of type {type(fn)}." + ) + return tf_inspect.getfullargspec(fn).varkw is not None def fn_args(fn): - """Get argument names for function-like object. - - Args: - fn: Function, or function-like object (e.g., result of `functools.partial`). - - Returns: - `tuple` of string argument names. - - Raises: - ValueError: if partial function has positionally bound arguments - """ - if isinstance(fn, functools.partial): - args = fn_args(fn.func) - args = [a for a in args[len(fn.args):] if a not in (fn.keywords or [])] - else: - if hasattr(fn, "__call__") and tf_inspect.ismethod(fn.__call__): - fn = fn.__call__ - args = tf_inspect.getfullargspec(fn).args - if _is_bound_method(fn) and args: - # If it's a bound method, it may or may not have a self/cls first - # argument; for example, self could be captured in *args. - # If it does have a positional argument, it is self/cls. - args.pop(0) - return tuple(args) + """Get argument names for function-like object. + + Args: + fn: Function, or function-like object (e.g., result of + `functools.partial`). + + Returns: + `tuple` of string argument names. 
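The argument-introspection helpers above drive the `custom_getter` dispatch later in this file (forwarding `constraint` only when the getter can accept it). A rough stdlib-only sketch of what `_has_kwargs` and `fn_args` compute, using `inspect` rather than the internal `tf_inspect` wrapper, so details may differ:

```python
import functools
import inspect

def f(a, b, constraint=None, **kwargs):
    return a

p = functools.partial(f, 1)

# **kwargs detection, as in `_has_kwargs`:
assert inspect.getfullargspec(p.func).varkw == "kwargs"

# Argument names left unbound by the partial, as in `fn_args`:
spec = inspect.getfullargspec(p.func)
remaining = [
    a for a in spec.args[len(p.args):] if a not in (p.keywords or {})
]
assert remaining == ["b", "constraint"]  # so `constraint` can be forwarded
```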
+ + Raises: + ValueError: if partial function has positionally bound arguments + """ + if isinstance(fn, functools.partial): + args = fn_args(fn.func) + args = [a for a in args[len(fn.args) :] if a not in (fn.keywords or [])] + else: + if hasattr(fn, "__call__") and tf_inspect.ismethod(fn.__call__): + fn = fn.__call__ + args = tf_inspect.getfullargspec(fn).args + if _is_bound_method(fn) and args: + # If it's a bound method, it may or may not have a self/cls first + # argument; for example, self could be captured in *args. + # If it does have a positional argument, it is self/cls. + args.pop(0) + return tuple(args) def _is_bound_method(fn): - _, fn = tf.__internal__.decorator.unwrap(fn) - return tf_inspect.ismethod(fn) and (fn.__self__ is not None) + _, fn = tf.__internal__.decorator.unwrap(fn) + return tf_inspect.ismethod(fn) and (fn.__self__ is not None) def validate_synchronization_aggregation_trainable( - synchronization, aggregation, trainable, name): - """Given user-provided variable properties, sets defaults and validates.""" - if aggregation is None: - aggregation = tf.compat.v1.VariableAggregation.NONE - else: - if not isinstance(aggregation, - (tf.compat.v1.VariableAggregation, - tf.VariableAggregation)): - try: - aggregation = tf.VariableAggregation(aggregation) - except ValueError: - raise ValueError( - "Invalid variable aggregation mode: {} for variable: {}".format( - aggregation, name)) - if synchronization is None: - synchronization = tf.VariableSynchronization.AUTO - else: - try: - synchronization = tf.VariableSynchronization(synchronization) - except ValueError: - raise ValueError( - "Invalid variable synchronization mode: {} for variable: {}".format( - synchronization, name)) - if trainable is None: - trainable = synchronization != tf.VariableSynchronization.ON_READ - return synchronization, aggregation, trainable + synchronization, aggregation, trainable, name +): + """Given user-provided variable properties, sets defaults and validates.""" + if aggregation is None: + aggregation = tf.compat.v1.VariableAggregation.NONE + else: + if not isinstance( + aggregation, + (tf.compat.v1.VariableAggregation, tf.VariableAggregation), + ): + try: + aggregation = tf.VariableAggregation(aggregation) + except ValueError: + raise ValueError( + "Invalid variable aggregation mode: {} " + "for variable: {}".format(aggregation, name) + ) + if synchronization is None: + synchronization = tf.VariableSynchronization.AUTO + else: + try: + synchronization = tf.VariableSynchronization(synchronization) + except ValueError: + raise ValueError( + "Invalid variable synchronization mode: {} " + "for variable: {}".format(synchronization, name) + ) + if trainable is None: + trainable = synchronization != tf.VariableSynchronization.ON_READ + return synchronization, aggregation, trainable class _EagerVariableStore(tf.Module): - """TF2-compatible VariableStore that avoids collections & tracks regularizers. - - New variable names and new variables can be created; all stored - variables are initialized with the initializer passed to __init__. - - All variables get created in `tf.init_scope.` to avoid a bad - interaction between `tf.function` `FuncGraph` internals, Keras - Functional Models, and TPUStrategy variable initialization. - - Also, it always acts as if reuse is set to either "TRUE" or - tf.compat.v1.AUTO_REUSE - - Attributes: - vars: a dictionary with string names (same as passed in GetVar) as keys and - the corresponding TensorFlow Variables as values. 
- regularizers: a dictionary with string names as keys and the corresponding - callables that return losses as values. - layers: a dictionary with string names as keys and the corresponding - nested keras layers as values. - """ - - def __init__(self): - """Create a variable store.""" - self._vars = {} # A dictionary of the stored TensorFlow variables. - self._regularizers = {} # A dict mapping var names to their regularizers. - self._layers = {} # A dictionary of stored keras layers. - self._store_eager_variables = True - - @contextlib.contextmanager - def scope(self): - with vs.with_variable_store(self): - yield - - def get_variable( - self, - name, - shape=None, - dtype=tf.float32, - initializer=None, - regularizer=None, - reuse=None, - trainable=None, - collections=None, - caching_device=None, - partitioner=None, - validate_shape=True, - use_resource=None, - custom_getter=None, - constraint=None, - synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.compat.v1.VariableAggregation.NONE): - """Gets an existing variable with these parameters or create a new one. - - If a variable with the given name is already stored, we return the stored - variable. Otherwise, we create a new one. - - Set `reuse` to `True` when you only want to reuse existing Variables. - Set `reuse` to None (the default) or tf.compat.v1.AUTO_REUSE when you want - variables to be created if they don't exist or returned if they do. - In this shim, `reuse` of `False` will be treated as auto-reuse. - - If initializer is `None` (the default), the default initializer passed in - the constructor is used. If that one is `None` too, we use a new - `glorot_uniform_initializer`. If initializer is a Tensor, we use - it as a value and derive the shape from the initializer. - - If a partitioner is provided, a `PartitionedVariable` is returned. - Accessing this object as a `Tensor` returns the shards concatenated along - the partition axis. - - Some useful partitioners are available. See, e.g., - `variable_axis_size_partitioner` and `min_max_variable_partitioner`. + """TF2-safe VariableStore that avoids collections & tracks regularizers. - Args: - name: The name of the new or existing variable. - shape: Shape of the new or existing variable. - dtype: Type of the new or existing variable (defaults to `DT_FLOAT`). - initializer: Initializer for the variable. - regularizer: A (Tensor -> Tensor or None) function; the result of applying - it on a newly created variable will be added to the collection - GraphKeys.REGULARIZATION_LOSSES and can be used for regularization. - reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation of - variables. When eager execution is enabled this argument is always - forced to be False. - trainable: If `True` also add the variable to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). `trainable` - defaults to `True`, unless `synchronization` is set to `ON_READ`, in - which case it defaults to `False`. - collections: List of graph collections keys to add the `Variable` to. - Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`). - caching_device: Optional device string or function describing where the - Variable should be cached for reading. Defaults to the Variable's - device. If not `None`, caches on another device. Typical use is to - cache on the device where the Ops using the `Variable` reside, to - deduplicate copying through `Switch` and other conditional statements. 
- partitioner: Optional callable that accepts a fully defined `TensorShape` - and dtype of the `Variable` to be created, and returns a list of - partitions for each axis (currently only one axis can be partitioned). - validate_shape: If False, allows the variable to be initialized with a - value of unknown shape. If True, the default, the shape of initial_value - must be known. - use_resource: If False, creates a regular Variable. If True, creates - instead an experimental ResourceVariable which has well-defined - semantics. Defaults to False (will later change to True). When eager - execution is enabled this argument is always forced to be true. - custom_getter: Callable that takes as a first argument the true getter, - and allows overwriting the internal get_variable method. The signature - of `custom_getter` should match that of this method, - but the most future-proof version will allow for changes: `def - custom_getter(getter, *args, **kwargs)`. Direct access to - all `get_variable` parameters is also allowed: `def - custom_getter(getter, name, *args, **kwargs)`. A simple identity - custom getter that simply creates variables with modified names is: - ```python - def custom_getter(getter, name, *args, **kwargs): return getter(name + - '_suffix', *args, **kwargs) ``` - constraint: An optional projection function to be applied to the variable - after being updated by an `Optimizer` (e.g. used to implement norm - constraints or value constraints for layer weights). The function must - take as input the unprojected Tensor representing the value of the - variable and return the Tensor for the projected value (which must have - the same shape). Constraints are not safe to use when doing asynchronous - distributed training. - synchronization: Indicates when a distributed a variable will be - aggregated. Accepted values are constants defined in the class - `tf.VariableSynchronization`. By default the synchronization is set to - `AUTO` and the current `DistributionStrategy` chooses when to - synchronize. - aggregation: Indicates how a distributed variable will be aggregated. - Accepted values are constants defined in the class - `tf.VariableAggregation`. + New variable names and new variables can be created; all stored + variables are initialized with the initializer passed to __init__. - Returns: - The created or existing `Variable` (or `PartitionedVariable`, if a - partitioner was used). + All variables get created in `tf.init_scope.` to avoid a bad + interaction between `tf.function` `FuncGraph` internals, Keras + Functional Models, and TPUStrategy variable initialization. - Raises: - ValueError: when creating a new variable and shape is not declared, - when reusing a variable and specifying a conflicting shape, - or when violating reuse during variable creation. - RuntimeError: when eager execution is enabled and not called from an - EagerVariableStore. + Also, it always acts as if reuse is set to either "TRUE" or + tf.compat.v1.AUTO_REUSE + + Attributes: + vars: a dictionary with string names (same as passed in GetVar) as keys + and the corresponding TensorFlow Variables as values. + regularizers: a dictionary with string names as keys and the corresponding + callables that return losses as values. + layers: a dictionary with string names as keys and the corresponding + nested keras layers as values. 
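A hedged usage sketch of the store described above (a private class, so this may change without notice): inside `store.scope()`, `tf.compat.v1.get_variable` routes through this store, and a second lookup under the same name returns the stored eager variable rather than creating a new one:

```python
import tensorflow.compat.v2 as tf
from keras.legacy_tf_layers import variable_scope_shim

store = variable_scope_shim._EagerVariableStore()
with store.scope():
    v1 = tf.compat.v1.get_variable(
        "w", shape=[2], initializer=tf.compat.v1.zeros_initializer()
    )
with store.scope():
    v2 = tf.compat.v1.get_variable("w", shape=[2])  # found, not recreated
assert v1 is v2
```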
""" - if custom_getter is not None and not callable(custom_getter): - raise ValueError("Passed a custom_getter which is not callable: %s" % - custom_getter) - - with tf.init_scope(): - if tf.executing_eagerly(): - # Variable creation and initialization takes place in `init_scope`s; - # as such, if an `init_scope` lifts us into the eager context, then we - # need to use `ResourceVariable`s. - use_resource = True - - # Note that it's fine to reuse eager variables whose initialization was - # lifted from a function-building graph into the eager context (that's why - # the following clause is not wrapped in an `init_scope`); lifted variables - # are tracked by the graph's `VariableStore`. - if not reuse: - reuse = tf.compat.v1.AUTO_REUSE - - # If a *_ref type is passed in an error would be triggered further down the - # stack. We prevent this using base_dtype to get a non-ref version of the - # type, before doing anything else. When _ref types are removed in favor of - # resources, this line can be removed. - try: - dtype = dtype.base_dtype - except AttributeError: - # .base_dtype not existing means that we will try and use the raw dtype - # which was passed in - this might be a NumPy type which is valid. - pass - - # This is the main logic of get_variable. However, custom_getter - # may override this logic. So we save it as a callable and pass - # it to custom_getter. - # Note: the parameters of _true_getter, and their documentation, match - # *exactly* item-for-item with the docstring of this method. - def _true_getter( # pylint: disable=missing-docstring + + def __init__(self): + """Create a variable store.""" + self._vars = {} # A dictionary of the stored TensorFlow variables. + self._regularizers = ( + {} + ) # A dict mapping var names to their regularizers. + self._layers = {} # A dictionary of stored keras layers. + self._store_eager_variables = True + + @contextlib.contextmanager + def scope(self): + with vs.with_variable_store(self): + yield + + def get_variable( + self, name, shape=None, dtype=tf.float32, @@ -313,699 +180,907 @@ def _true_getter( # pylint: disable=missing-docstring regularizer=None, reuse=None, trainable=None, - collections=None, # pylint: disable=unused-argument + collections=None, caching_device=None, partitioner=None, validate_shape=True, - use_resource=None, # pylint: disable=unused-argument + use_resource=None, + custom_getter=None, constraint=None, synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.compat.v1.VariableAggregation.NONE): - # Partitioned variable currently unsupported w/ the shim - if partitioner is not None: - raise ValueError( - "`partitioner` arg for `get_variable` is unsupported in TF2." - "File a bug if you need help. You passed %s" % partitioner) - - # Single variable case - if "%s/part_0" % name in self._vars: - raise ValueError( - "No partitioner was provided, but a partitioned version of the " - "variable was found: %s/part_0. Perhaps a variable of the same " - "name was already created with partitioning?" 
% name) - - return self._get_single_variable( - name=name, - shape=shape, - dtype=dtype, - initializer=initializer, - regularizer=regularizer, - reuse=reuse, - trainable=trainable, - caching_device=caching_device, - validate_shape=validate_shape, - constraint=constraint, - synchronization=synchronization, - aggregation=aggregation) - - synchronization, aggregation, trainable = ( - validate_synchronization_aggregation_trainable( - synchronization, aggregation, trainable, name)) - - if custom_getter is not None: - # Handle backwards compatibility with getter arguments that were added - # to the API after users started writing custom getters. - custom_getter_kwargs = { - "getter": _true_getter, - "name": name, - "shape": shape, - "dtype": dtype, - "initializer": initializer, - "regularizer": regularizer, - "reuse": reuse, - "trainable": trainable, - "collections": collections, - "caching_device": caching_device, - "partitioner": partitioner, - "validate_shape": validate_shape, - "use_resource": use_resource, - "synchronization": synchronization, - "aggregation": aggregation, - } - # `fn_args` and `has_kwargs` can handle functions, `functools.partial`, - # `lambda`. - if ("constraint" in fn_args(custom_getter) or - _has_kwargs(custom_getter)): - custom_getter_kwargs["constraint"] = constraint - return custom_getter(**custom_getter_kwargs) - else: - return _true_getter( - name, - shape=shape, - dtype=dtype, - initializer=initializer, - regularizer=regularizer, - reuse=reuse, - trainable=trainable, - collections=collections, - caching_device=caching_device, - partitioner=partitioner, - validate_shape=validate_shape, - use_resource=use_resource, - constraint=constraint, - synchronization=synchronization, - aggregation=aggregation) - - def _get_single_variable( - self, - name, - shape=None, - dtype=tf.float32, - initializer=None, - regularizer=None, - partition_info=None, - reuse=None, - trainable=None, - caching_device=None, - validate_shape=True, - constraint=None, - synchronization=tf.VariableSynchronization.AUTO, - aggregation=tf.compat.v1.VariableAggregation.NONE): - """Get or create a single Variable (e.g. - - a shard or entire variable). - - See the documentation of get_variable above (ignore partitioning components) - for details. + aggregation=tf.compat.v1.VariableAggregation.NONE, + ): + """Gets an existing variable with these parameters or create a new one. + + If a variable with the given name is already stored, we return the + stored variable. Otherwise, we create a new one. + + Set `reuse` to `True` when you only want to reuse existing Variables. + Set `reuse` to None (the default) or tf.compat.v1.AUTO_REUSE when you + want variables to be created if they don't exist or returned if they do. + In this shim, `reuse` of `False` will be treated as auto-reuse. + + If initializer is `None` (the default), the default initializer passed + in the constructor is used. If that one is `None` too, we use a new + `glorot_uniform_initializer`. If initializer is a Tensor, we use it as a + value and derive the shape from the initializer. + + If a partitioner is provided, a `PartitionedVariable` is returned. + Accessing this object as a `Tensor` returns the shards concatenated + along the partition axis. + + Some useful partitioners are available. See, e.g., + `variable_axis_size_partitioner` and `min_max_variable_partitioner`. + + Args: + name: The name of the new or existing variable. + shape: Shape of the new or existing variable. + dtype: Type of the new or existing variable. 
Defaults to `DT_FLOAT`.
+          initializer: Initializer for the variable.
+          regularizer: A (Tensor -> Tensor or None) function; the result of
+            applying it on a newly created variable will be added to the
+            collection GraphKeys.REGULARIZATION_LOSSES and can be used for
+            regularization.
+          reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation
+            of variables. When eager execution is enabled this argument is
+            always forced to be False.
+          trainable: If `True` also add the variable to the graph collection
+            `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). `trainable`
+            defaults to `True`, unless `synchronization` is set to `ON_READ`,
+            in which case it defaults to `False`.
+          collections: List of graph collections keys to add the `Variable` to.
+            Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`).
+          caching_device: Optional device string or function describing where
+            the Variable should be cached for reading. Defaults to `None`,
+            which means to use the Variable's device. If not `None`, caches on
+            another device. Typical use is to cache on the device where the
+            Ops using the `Variable` reside, to deduplicate copying through
+            `Switch` and other conditional statements.
+          partitioner: Optional callable that accepts a fully defined
+            `TensorShape` and dtype of the `Variable` to be created, and returns
+            a list of partitions for each axis (currently only one axis can be
+            partitioned).
+          validate_shape: If False, allows the variable to be initialized with a
+            value of unknown shape. If True, the default, the shape of
+            initial_value must be known.
+          use_resource: If False, creates a regular Variable. If True, creates
+            instead an experimental ResourceVariable which has well-defined
+            semantics. Defaults to `False` (this will later change to `True`).
+            When eager execution is enabled this argument is always forced to
+            be `True`.
+          custom_getter: Callable that takes as a first argument the true
+            getter, and allows overwriting the internal get_variable method. The
+            signature of `custom_getter` should match that of this method, but
+            the most future-proof version will allow for changes:
+            `def custom_getter(getter, *args, **kwargs)`.
+            Direct access to all `get_variable` parameters is also allowed:
+            `def custom_getter(getter, name, *args, **kwargs)`.
+            A simple identity custom getter that simply creates variables with
+            modified names is:
+            ```python
+            def custom_getter(getter, name, *args, **kwargs):
+              return getter(name + '_suffix', *args, **kwargs)
+            ```
+          constraint: An optional projection function to be applied to the
+            variable after being updated by an `Optimizer` (e.g. used to
+            implement norm constraints or value constraints for layer weights).
+            The function must take as input the unprojected Tensor representing
+            the value of the variable and return the Tensor for the projected
+            value (which must have the same shape). Constraints are not safe to
+            use when doing asynchronous distributed training.
+          synchronization: Indicates when a distributed variable will be
+            aggregated. Accepted values are constants defined in the class
+            `tf.VariableSynchronization`. By default the synchronization is set
+            to `AUTO` and the current `DistributionStrategy` chooses when to
+            synchronize.
+          aggregation: Indicates how a distributed variable will be aggregated.
+            Accepted values are constants defined in the class
+            `tf.VariableAggregation`.
+
+        Returns:
+          The created or existing `Variable` (or `PartitionedVariable`, if a
+          partitioner was used).
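A small, hedged illustration of the `custom_getter` contract documented above (`suffix_getter` is illustrative, not part of this change; run in graph mode so `get_variable` works outside the shim):

```python
import tensorflow.compat.v2 as tf

def suffix_getter(getter, name, *args, **kwargs):
    # Delegate to the true getter under a rewritten variable name.
    return getter(name + "_suffix", *args, **kwargs)

with tf.Graph().as_default():  # get_variable needs a graph or this shim
    v = tf.compat.v1.get_variable(
        "kernel", shape=[3], custom_getter=suffix_getter
    )
    assert v.name == "kernel_suffix:0"
```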
+ + Raises: + ValueError: when creating a new variable and shape is not declared, + when reusing a variable and specifying a conflicting shape, + or when violating reuse during variable creation. + RuntimeError: when eager execution is enabled and not called from an + EagerVariableStore. + """ + if custom_getter is not None and not callable(custom_getter): + raise ValueError( + f"Passed a custom_getter which is not callable: {custom_getter}" + ) + + with tf.init_scope(): + if tf.executing_eagerly(): + # Variable creation and initialization takes place in + # `init_scope`s; as such, if an `init_scope` lifts us into the + # eager context, then we need to use `ResourceVariable`s. + use_resource = True + + # Note that it's fine to reuse eager variables whose initialization was + # lifted from a function-building graph into the eager context (that's + # why the following clause is not wrapped in an `init_scope`); lifted + # variables are tracked by the graph's `VariableStore`. + if not reuse: + reuse = tf.compat.v1.AUTO_REUSE + + # If a *_ref type is passed in an error would be triggered further down + # the stack. We prevent this using base_dtype to get a non-ref version + # of the type, before doing anything else. When _ref types are removed + # in favor of resources, this line can be removed. + try: + dtype = dtype.base_dtype + except AttributeError: + # .base_dtype not existing means that we will try and use the raw + # dtype which was passed in - this might be a NumPy type which is + # valid. + pass + + # This is the main logic of get_variable. However, custom_getter + # may override this logic. So we save it as a callable and pass + # it to custom_getter. + # Note: the parameters of _true_getter, and their documentation, match + # *exactly* item-for-item with the docstring of this method. + def _true_getter( + name, + shape=None, + dtype=tf.float32, + initializer=None, + regularizer=None, + reuse=None, + trainable=None, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + constraint=None, + synchronization=tf.VariableSynchronization.AUTO, + aggregation=tf.compat.v1.VariableAggregation.NONE, + ): + # Partitioned variable currently unsupported w/ the shim + if partitioner is not None: + raise ValueError( + "`partitioner` arg for `get_variable` is unsupported in " + "TF2. File a bug if you need help. " + "You passed %s" % partitioner + ) + + # Single variable case + if f"{name}/part_0" in self._vars: + raise ValueError( + "No partitioner was provided, but a partitioned version of " + "the variable was found: %s/part_0. Perhaps a variable of " + "the same name was already created with " + "partitioning?" % name + ) + + return self._get_single_variable( + name=name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + caching_device=caching_device, + validate_shape=validate_shape, + constraint=constraint, + synchronization=synchronization, + aggregation=aggregation, + ) + + ( + synchronization, + aggregation, + trainable, + ) = validate_synchronization_aggregation_trainable( + synchronization, aggregation, trainable, name + ) + + if custom_getter is not None: + # Handle backwards compatibility with getter arguments that were + # added to the API after users started writing custom getters. 
+ custom_getter_kwargs = { + "getter": _true_getter, + "name": name, + "shape": shape, + "dtype": dtype, + "initializer": initializer, + "regularizer": regularizer, + "reuse": reuse, + "trainable": trainable, + "collections": collections, + "caching_device": caching_device, + "partitioner": partitioner, + "validate_shape": validate_shape, + "use_resource": use_resource, + "synchronization": synchronization, + "aggregation": aggregation, + } + # `fn_args` and `has_kwargs` can handle functions, + # `functools.partial`, `lambda`. + if "constraint" in fn_args(custom_getter) or _has_kwargs( + custom_getter + ): + custom_getter_kwargs["constraint"] = constraint + return custom_getter(**custom_getter_kwargs) + else: + return _true_getter( + name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + synchronization=synchronization, + aggregation=aggregation, + ) + + def _get_single_variable( + self, + name, + shape=None, + dtype=tf.float32, + initializer=None, + regularizer=None, + partition_info=None, + reuse=None, + trainable=None, + caching_device=None, + validate_shape=True, + constraint=None, + synchronization=tf.VariableSynchronization.AUTO, + aggregation=tf.compat.v1.VariableAggregation.NONE, + ): + """Get or create a single Variable (e.g. a shard or entire variable). + + See the documentation of get_variable above (ignore partitioning + components) for details. + + Args: + name: see get_variable. + shape: see get_variable. + dtype: see get_variable. + initializer: see get_variable. + regularizer: see get_variable. + partition_info: _PartitionInfo object. + reuse: see get_variable. + trainable: see get_variable. + caching_device: see get_variable. + validate_shape: see get_variable. + constraint: see get_variable. + synchronization: see get_variable. + aggregation: see get_variable. + + Returns: + A Variable. See documentation of get_variable above. + + Raises: + ValueError: See documentation of get_variable above. + """ + # Set to true if initializer is a constant. + initializing_from_value = False + if initializer is not None and not callable(initializer): + initializing_from_value = True + if shape is not None and initializing_from_value: + raise ValueError( + "If initializer is a constant, do not specify shape." + ) + + dtype = tf.as_dtype(dtype) + shape = as_shape(shape) + + if name in self._vars: + # Here we handle the case when returning an existing variable. + found_var = self._vars[name] + if not shape.is_compatible_with(found_var.get_shape()): + raise ValueError( + "Trying to share variable %s, but specified shape %s" + " and found shape %s." + % (name, shape, found_var.get_shape()) + ) + if not dtype.is_compatible_with(found_var.dtype): + dtype_str = dtype.name + found_type_str = found_var.dtype.name + raise ValueError( + "Trying to share variable %s, but specified dtype %s" + " and found dtype %s." % (name, dtype_str, found_type_str) + ) + return found_var + + # The code below handles only the case of creating a new variable. + if reuse is True: + raise ValueError( + "Variable %s does not exist, or was not created with " + "tf.get_variable(). Did you mean to set " + "reuse=tf.AUTO_REUSE in VarScope?" % name + ) + + # Create the tensor to initialize the variable with default value. 
+        if initializer is None:
+            (
+                initializer,
+                initializing_from_value,
+            ) = self._get_default_initializer(
+                name=name, shape=shape, dtype=dtype
+            )
+        # Enter an init scope when creating the initializer.
+        with tf.init_scope():
+            if initializing_from_value:
+                init_val = initializer
+                variable_dtype = None
+            else:
+                # Instantiate initializer if provided initializer is a type
+                # object.
+                if tf_inspect.isclass(initializer):
+                    initializer = initializer()
+                if shape.is_fully_defined():
+                    if (
+                        "partition_info"
+                        in tf_inspect.getargspec(initializer).args
+                    ):
+                        init_val = functools.partial(
+                            initializer,
+                            shape.as_list(),
+                            dtype=dtype,
+                            partition_info=partition_info,
+                        )
+                    else:
+                        init_val = functools.partial(
+                            initializer, shape.as_list(), dtype=dtype
+                        )
+                    variable_dtype = dtype.base_dtype
+                else:
+                    init_val = initializer
+                    variable_dtype = None
+
+        # Create the variable (always eagerly, as a workaround for a strange
+        # tpu / funcgraph / keras functional model interaction)
+        with tf.init_scope():
+            v = tf.Variable(
+                initial_value=init_val,
+                name=name,
+                trainable=trainable,
+                caching_device=caching_device,
+                dtype=variable_dtype,
+                validate_shape=validate_shape,
+                constraint=constraint,
+                synchronization=synchronization,
+                aggregation=aggregation,
+            )
+
+        self._vars[name] = v
+        logging.vlog(
+            1,
+            "Created variable %s with shape %s and init %s",
+            v.name,
+            format(shape),
+            initializer,
+        )
+
+        # Run the regularizer if requested and save the resulting loss.
+        if regularizer:
+            self.add_regularizer(v, regularizer)
+
+        return v
+
+    def get_or_create_layer(self, name, create_layer_method):
+        if name not in self._layers:
+            layer = create_layer_method()
+            self._layers[name] = layer
+            if isinstance(layer, base_layer.Layer):
+                self._regularizers[name] = lambda: tf.math.reduce_sum(
+                    layer.losses
+                )
+        return self._layers[name]
+
+    def add_regularizer(self, var, regularizer):
+        self._regularizers[var.name] = functools.partial(regularizer, var)
+
+    # Initialize variable when no initializer provided
+    def _get_default_initializer(self, name, shape=None, dtype=tf.float32):
+        """Provide a default initializer and a corresponding value.
+
+        Args:
+          name: see get_variable.
+          shape: see get_variable.
+          dtype: see get_variable.
+
+        Returns:
+          initializer and initializing_from_value. See get_variable above.
+
+        Raises:
+          ValueError: When giving unsupported dtype.
+        """
+        del shape
+        # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
+        if dtype.is_floating:
+            initializer = tf.compat.v1.glorot_uniform_initializer()
+            initializing_from_value = False
+        # If dtype is DT_INT/DT_UINT, provide a default value `zero`
+        # If dtype is DT_BOOL, provide a default value `FALSE`
+        elif (
+            dtype.is_integer
+            or dtype.is_unsigned
+            or dtype.is_bool
+            or dtype == tf.string
+        ):
+            initializer = tf.compat.v1.zeros_initializer()
+            initializing_from_value = False
+        # NOTE: Do we need to support handling DT_STRING and DT_COMPLEX
+        # here?
+        else:
+            raise ValueError(
+                "An initializer for variable %s of %s is required"
+                % (name, dtype.base_dtype)
+            )
-    Args:
-      name: see get_variable.
-      shape: see get_variable.
-      dtype: see get_variable.
-      initializer: see get_variable.
-      regularizer: see get_variable.
-      partition_info: _PartitionInfo object.
-      reuse: see get_variable.
-      trainable: see get_variable.
-      caching_device: see get_variable.
-      validate_shape: see get_variable.
-      constraint: see get_variable.
-      synchronization: see get_variable.
-      aggregation: see get_variable.
+ return initializer, initializing_from_value - Returns: - A Variable. See documentation of get_variable above. - Raises: - ValueError: See documentation of get_variable above. - """ - # Set to true if initializer is a constant. - initializing_from_value = False - if initializer is not None and not callable(initializer): - initializing_from_value = True - if shape is not None and initializing_from_value: - raise ValueError("If initializer is a constant, do not specify shape.") - - dtype = tf.as_dtype(dtype) - shape = as_shape(shape) - - if name in self._vars: - # Here we handle the case when returning an existing variable. - found_var = self._vars[name] - if not shape.is_compatible_with(found_var.get_shape()): - raise ValueError("Trying to share variable %s, but specified shape %s" - " and found shape %s." % - (name, shape, found_var.get_shape())) - if not dtype.is_compatible_with(found_var.dtype): - dtype_str = dtype.name - found_type_str = found_var.dtype.name - raise ValueError("Trying to share variable %s, but specified dtype %s" - " and found dtype %s." % - (name, dtype_str, found_type_str)) - return found_var - - # The code below handles only the case of creating a new variable. - if reuse is True: # pylint: disable=g-bool-id-comparison - raise ValueError("Variable %s does not exist, or was not created with " - "tf.get_variable(). Did you mean to set " - "reuse=tf.AUTO_REUSE in VarScope?" % name) - - # Create the tensor to initialize the variable with default value. - if initializer is None: - initializer, initializing_from_value = self._get_default_initializer( - name=name, shape=shape, dtype=dtype) - # Enter an init scope when creating the initializer. - with tf.init_scope(): - if initializing_from_value: - init_val = initializer - variable_dtype = None - else: - # Instantiate initializer if provided initializer is a type object. - if tf_inspect.isclass(initializer): - initializer = initializer() - if shape.is_fully_defined(): - if "partition_info" in tf_inspect.getargspec(initializer).args: - init_val = functools.partial(initializer, - shape.as_list(), - dtype=dtype, - partition_info=partition_info) - else: - init_val = functools.partial(initializer, - shape.as_list(), dtype=dtype) - variable_dtype = dtype.base_dtype - else: - init_val = initializer - variable_dtype = None - - # Create the variable (Always eagerly as a workaround for a strange - # tpu / funcgraph / keras functional model interaction ) - with tf.init_scope(): - v = tf.Variable( - initial_value=init_val, - name=name, - trainable=trainable, - caching_device=caching_device, - dtype=variable_dtype, - validate_shape=validate_shape, - constraint=constraint, - synchronization=synchronization, - aggregation=aggregation) - - self._vars[name] = v - logging.vlog(1, "Created variable %s with shape %s and init %s", v.name, - format(shape), initializer) - - # Run the regularizer if requested and save the resulting loss. 
- if regularizer: - self.add_regularizer(v, regularizer) - - return v - - def get_or_create_layer(self, name, create_layer_method): - if name not in self._layers: - layer = create_layer_method() - self._layers[name] = layer - if isinstance(layer, base_layer.Layer): - self._regularizers[name] = lambda: tf.math.reduce_sum(layer.losses) - return self._layers[name] - - def add_regularizer(self, var, regularizer): - self._regularizers[var.name] = functools.partial(regularizer, var) - - # Initialize variable when no initializer provided - def _get_default_initializer(self, name, shape=None, dtype=tf.float32): - """Provide a default initializer and a corresponding value. +@keras_export(v1=["keras.utils.track_tf1_style_variables"]) +def track_tf1_style_variables(method): + """Wrap layer & module methods in this decorator to capture tf1-style + weights. + + Decorating a `tf.keras.Layer`'s or `tf.Module`'s methods with this + decorator will cause the layer/module to track weights created/used + via `tf.compat.v1.get_variable` (and by extension `tf.compat.v1.layers`) + inside the decorated method. + + In addition to tracking the weights themselves under the standard + `layer.variable`/`module.variable`/etc. properties, if the method belongs + to a `tf.keras.Layer` then any regularization losses specified via the + `get_variable` or `tf.compat.v1.layers` regularizer arguments will get + tracked by the layer under the standard `layer.losses` property. + + This tracking enables using large classes of TF1-style model-forward-pass + code inside of Keras layers or `tf.Modules` in TF2 with TF2 behaviors + enabled. + + Example of capturing tf.compat.v1.layer-based modeling code as a Keras + layer: + + ```python + class WrappedDoubleDenseLayer(tf.keras.layers.Layer): + + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + @tf.compat.v1.keras.utils.track_tf1_style_variables + def call(self, inputs): + with tf.compat.v1.variable_scope("double_dense_layer"): + out = tf.compat.v1.layers.dense( + inputs, self.units, name="dense_one", + kernel_initializer=tf.compat.v1.random_normal_initializer, + kernel_regularizer="l2") + out = tf.compat.v1.layers.dense( + out, self.units, name="dense_two", + kernel_initializer=tf.compat.v1.random_normal_initializer(), + kernel_regularizer="l2") + return out + + # Create a layer that can be used as a standard keras layer + layer = WrappedDoubleDenseLayer(10) + + # call the layer on inputs + layer(...) 
+ + # Variables created/used within the scope will be tracked by the layer + layer.weights + layer.trainable_variables + + # Regularization losses will be captured in layer.losses after a call, + # just like any other Keras layer + reg_losses = layer.losses + ``` + + Example of capturing tf.compat.v1.get_variable-based modeling code as + a Keras layer: + + ```python + class WrappedDoubleDenseLayer(tf.keras.layers.Layer): + + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + @tf.compat.v1.keras.utils.track_tf1_style_variables + def call(self, inputs): + out = inputs + with tf.compat.v1.variable_scope("double_dense_layer"): + with tf.compat.v1.variable_scope("dense_one"): + # The weights are created with a `regularizer`, + # so the layer should track their regularization losses + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=init_ops.ones_initializer(), + name="kernel") + bias = tf.compat.v1.get_variable( + shape=[self.units,], + initializer=init_ops.zeros_initializer(), + name="bias") + out = tf.compat.v1.math.matmul(out, kernel) + out = tf.compat.v1.nn.bias_add(out, bias) + with tf.compat.v1.variable_scope("dense_two"): + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=init_ops.ones_initializer(), + name="kernel") + bias = tf.compat.v1.get_variable( + shape=[self.units,], + initializer=init_ops.zeros_initializer(), + name="bias") + out = tf.compat.v1.math.matmul(out, kernel) + out = tf.compat.v1.nn.bias_add(out, bias) + return out + + # Create a layer that can be used as a standard keras layer + layer = WrappedDoubleDenseLayer(10) + + # call the layer on inputs + layer(...) + + # Variables created/used within the scope will be tracked by the layer + layer.weights + layer.trainable_variables + + # Regularization losses will be captured in layer.losses after a call, + # just like any other Keras layer + reg_losses = layer.losses + ``` + + Regularization losses: + Any regularizers specified in the `get_variable` calls or + `compat.v1.layer` creations will get captured if they occur in your + decorated method and the method belongs to a + `tf.keras.Layer`/`tf.keras.Module`. Regularization losses + are accessible in `layer.losses` after a call just like in a standard + Keras layer, and will be captured by any model that includes this layer. + Regularization losses attached to Keras layers/models set as attributes + of your layer will also get captured in the standard Keras regularization + loss tracking. + + (While Modules have no `losses` property, no-arg callables to compute + the regularization losses may be tracked as dict values in a private + `module._tf1_style_var_store._regularizers` property, but only for + `tf.compat.v1.layers` and `get_variable` weights and not for any other + nested Keras layers/tf.Modules) + + Variable scope / variable reuse: + variable-scope based reuse in your decorated method will be respected, + and work like variable-scope based reuse in TF1. + + Variable Names/Pre-trained checkpoint loading: + Variable naming from get_variable and `compat.v1.layer` layers will match + the TF1 names, so you should be able to re-use your old name-based + checkpoints. Variable naming for Keras layers/models or for variables + created by `tf.Variable` may change when going to eager execution. 
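A condensed, runnable variant of the two examples above (assumes TF 2.x, where keras exposes the decorator as `tf.compat.v1.keras.utils.track_tf1_style_variables`; `TinyDense` is illustrative):

```python
import tensorflow.compat.v2 as tf

class TinyDense(tf.keras.layers.Layer):
    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def call(self, inputs):
        kernel = tf.compat.v1.get_variable(
            "kernel",
            shape=[inputs.shape[-1], 4],
            initializer=tf.compat.v1.ones_initializer(),
            regularizer=tf.keras.regularizers.L2(0.01),
        )
        return tf.matmul(inputs, kernel)

layer = TinyDense()
out = layer(tf.ones([2, 3]))
assert len(layer.weights) == 1  # `kernel` is tracked by the layer
assert len(layer.losses) == 1  # the L2 regularization loss is captured
```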
+ + Training Arg if you decorate `layer.call`: + Keras will pass a `training` arg to this layer if `call` contains + a `training` arg or a `**kwargs` varargs in its call signature, + similarly to how keras passes `training` to other layers in TF2 that have + similar signatures in their `call` implementations. + See more details in the docs + on `tf.keras.layers.Layer` to understand what will be passed and when. + Note: tf.compat.v1.layers are usually not called with `training=None`, + so the training arg to your decorated method might not feed through to them + unless you pass it to their calls explicitly. + + Caveats: + * TF2 will not prune unused variable updates (or unused outputs). You may + need to adjust your forward pass code to avoid computations or variable + updates that you don't intend to use. + * Avoid nesting variable creation in tf.function inside of + methods decorated with `track_tf1_style_variables`. + While the method may safely be used from inside a `tf.function`, using + a function inside of a decorated method may break the variable scoping. + * This decorator only adds implicit tracking for legacy tf1-style + get_variable / compat.v1.layers usage. + If you would like to use nested Keras layers/models + inside the decorated method, you need to + assign them as attributes of your layer so that Keras/Module's standard + object-oriented weights (and loss tracking for layers) will kick in. + See the intro to modules, layers, and models + [guide](https://www.tensorflow.org/guide/intro_to_modules) for more + info. As a backup, the `compat.v1.keras.utils.get_or_create_layer` + method will ease tracking nested keras model weights and losses for + existing TF1 code, but new code should use explicit tracking. Args: - name: see get_variable. - shape: see get_variable. - dtype: see get_variable. + method: The method to decorate. This should belong to a custom tf.Module, + tf.keras.layers.Layer, or tf.keras.Model. Returns: - initializer and initializing_from_value. See get_variable above. - - Raises: - ValueError: When giving unsupported dtype. + The decorated method. """ - del shape - # If dtype is DT_FLOAT, provide a uniform unit scaling initializer - if dtype.is_floating: - initializer = tf.compat.v1.glorot_uniform_initializer() - initializing_from_value = False - # If dtype is DT_INT/DT_UINT, provide a default value `zero` - # If dtype is DT_BOOL, provide a default value `FALSE` - elif (dtype.is_integer or dtype.is_unsigned or dtype.is_bool or - dtype == tf.string): - initializer = tf.compat.v1.zeros_initializer() - initializing_from_value = False - # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here? - else: - raise ValueError("An initializer for variable %s of %s is required" % - (name, dtype.base_dtype)) - return initializer, initializing_from_value + def _method_wrapper(self, *args, **kwargs): + var_store = getattr(self, "_tf1_style_var_store", None) + if not var_store: + if not isinstance(self, tf.Module): + # Raise an error if you incorrectly decorate a method + # that is not a method of a Module, Layer, or Model: + raise ValueError( + "`@tf.compat.v1.keras.utils.track_tf1_style_variables`" + " must be applied to a method of a subclassed `tf.Module`, " + "`tf.keras.layers.Layer`, or `tf.keras.Model` and which " + "takes `self` as the first argument. 
But, the first " + "argument passed to the decorated method was {}, which " + "does not extend Module, Layer, or Model.".format(self) + ) + var_store = _EagerVariableStore() + self._tf1_style_var_store = var_store + + existing_regularized_variables = set(var_store._regularizers.keys()) + with var_store.scope(): + out = method(self, *args, **kwargs) + + # If this is a layer method, add the regularization losses + # to the layer for any newly-created regularized variables + if isinstance(self, base_layer.Layer): + for ( + var_name, + regularizer, + ) in var_store._regularizers.items(): + if var_name not in existing_regularized_variables: + self.add_loss(regularizer) + + return out + + return tf.__internal__.decorator.make_decorator( + target=method, decorator_func=_method_wrapper + ) -@keras_export(v1=["keras.utils.track_tf1_style_variables"]) -def track_tf1_style_variables(method): - """Wrap layer & module methods in this decorator to capture tf1-style weights. - - Decorating a `tf.keras.Layer`'s or `tf.Module`'s methods with this - decorator will cause the layer/module to track weights created/used - via `tf.compat.v1.get_variable` (and by extension `tf.compat.v1.layers`) - inside the decorated method. - - In addition to tracking the weights themselves under the standard - `layer.variable`/`module.variable`/etc. properties, if the method belongs - to a `tf.keras.Layer` then any regularization losses specified via the - `get_variable` or `tf.compat.v1.layers` regularizer arguments will get - tracked by the layer under the standard `layer.losses` property. - - This tracking enables using large classes of TF1-style model-forward-pass - code inside of Keras layers or `tf.Modules` in TF2 with TF2 behaviors enabled. - - Example of capturing tf.compat.v1.layer-based modeling code as a Keras layer: - - ```python - class WrappedDoubleDenseLayer(tf.keras.layers.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - @tf.compat.v1.keras.utils.track_tf1_style_variables - def call(self, inputs): - with tf.compat.v1.variable_scope("double_dense_layer"): - out = tf.compat.v1.layers.dense( - inputs, self.units, name="dense_one", - kernel_initializer=tf.compat.v1.random_normal_initializer, - kernel_regularizer="l2") - out = tf.compat.v1.layers.dense( - out, self.units, name="dense_two", - kernel_initializer=tf.compat.v1.random_normal_initializer(), - kernel_regularizer="l2") - return out - - # Create a layer that can be used as a standard keras layer - layer = WrappedDoubleDenseLayer(10) - - # call the layer on inputs - layer(...) 
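Editor's note: the docstring above already carries two full examples; as a condensed sanity check of what the new `_method_wrapper` does across repeated calls, here is a minimal, hedged sketch. The class name `TF1DenseBlock` and the L2 weight are illustrative (not from this patch), and it assumes TF 2.x with the v1 compat API available:

```python
import tensorflow as tf


class TF1DenseBlock(tf.keras.layers.Layer):
    """Hypothetical layer whose forward pass uses tf1-style get_variable."""

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def call(self, inputs):
        with tf.compat.v1.variable_scope("block"):
            kernel = tf.compat.v1.get_variable(
                "kernel",
                shape=[inputs.shape[-1], self.units],
                regularizer=tf.keras.regularizers.L2(1e-4),
            )
        return tf.matmul(inputs, kernel)


layer = TF1DenseBlock(4)
layer(tf.ones([2, 3]))  # first call creates "block/kernel" in the store
layer(tf.ones([2, 3]))  # later calls reuse the same store entry
assert len(layer.weights) == 1  # tracked once, not duplicated per call
assert len(layer.losses) == 1  # the L2 regularization loss is tracked
```

The regularizer diffing against `existing_regularized_variables` is what keeps the loss from being registered again on the second call.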
- - # Variables created/used within the scope will be tracked by the layer - layer.weights - layer.trainable_variables - - # Regularization losses will be captured in layer.losses after a call, - # just like any other Keras layer - reg_losses = layer.losses - ``` - - Example of capturing tf.compat.v1.get_variable-based modeling code as - a Keras layer: - - ```python - class WrappedDoubleDenseLayer(tf.keras.layers.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - @tf.compat.v1.keras.utils.track_tf1_style_variables - def call(self, inputs): - out = inputs - with tf.compat.v1.variable_scope("double_dense_layer"): - with tf.compat.v1.variable_scope("dense_one"): - # The weights are created with a `regularizer`, - # so the layer should track their regularization losses - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=init_ops.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=init_ops.zeros_initializer(), - name="bias") - out = tf.compat.v1.math.matmul(out, kernel) - out = tf.compat.v1.nn.bias_add(out, bias) - with tf.compat.v1.variable_scope("dense_two"): - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=init_ops.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=init_ops.zeros_initializer(), - name="bias") - out = tf.compat.v1.math.matmul(out, kernel) - out = tf.compat.v1.nn.bias_add(out, bias) - return out - - # Create a layer that can be used as a standard keras layer - layer = WrappedDoubleDenseLayer(10) - - # call the layer on inputs - layer(...) - - # Variables created/used within the scope will be tracked by the layer - layer.weights - layer.trainable_variables - - # Regularization losses will be captured in layer.losses after a call, - # just like any other Keras layer - reg_losses = layer.losses - ``` - - Regularization losses: - Any regularizers specified in the `get_variable` calls or `compat.v1.layer` - creations will get captured if they occur in your decorated method - and the method belongs to a `tf.keras.Layer`/`tf.keras.Module`. - Regularization losses - are accessible in `layer.losses` after a call just like in a standard - Keras layer, and will be captured by any model that includes this layer. - Regularization losses attached to Keras layers/models set as attributes - of your layer will also get captured in the standard Keras regularization - loss tracking. - - (While Modules have no `losses` property, no-arg callables to compute - the regularization losses may be tracked as dict values in a private - `module._tf1_style_var_store._regularizers` property, but only for - `tf.compat.v1.layers` and `get_variable` weights and not for any other - nested Keras layers/tf.Modules) - - Variable scope / variable reuse: - variable-scope based reuse in your decorated method will be respected, - and work like variable-scope based reuse in TF1. - - Variable Names/Pre-trained checkpoint loading: - Variable naming from get_variable and `compat.v1.layer` layers will match - the TF1 names, so you should be able to re-use your old name-based - checkpoints. Variable naming for Keras layers/models or for variables - created by `tf.Variable` may change when going to eager execution. 
- - Training Arg if you decorate `layer.call`: - Keras will pass a `training` arg to this layer if `call` contains - a `training` arg or a `**kwargs` varargs in its call signature, - similarly to how keras passes `training` to other layers in TF2 that have - similar signatures in their `call` implementations. - See more details in the docs - on `tf.keras.layers.Layer` to understand what will be passed and when. - Note: tf.compat.v1.layers are usually not called with `training=None`, - so the training arg to `forward_pass` might not feed through to them - unless you pass it to their calls explicitly. - - Caveats: - * TF2 will not prune unused variable updates (or unused outputs). You may - need to adjust your forward pass code to avoid computations or variable - updates that you don't intend to use. - * Avoid Nesting variable creation in tf.function inside of - methods decorated with `track_tf1_style_variables` - While the method may safely be used from inside a `tf.function`, using - a function inside of a decorated method may break the variable scoping. - * This decorator only adds implicit tracking for legacy tf1-style - get_variable / compat.v1.layers usage. - If you would like to use nested Keras layers/models - inside the decorated method, you need to - assign them as attributes of your layer so that Keras/Module's standard - object-oriented weights (and loss tracking for layers) will kick in. - See the intro to modules, layers, and models - [guide](https://www.tensorflow.org/guide/intro_to_modules) for more info. - As a backup, the `compat.v1.keras.utils.get_or_create_layer` method will - ease tracking nested keras model weights and losses for existing TF1 code, - but new code should use explicit tracking. - - Args: - method: The method to decorate. This should belong to a custom tf.Module, - tf.keras.layers.Layer, or tf.keras.Model. - - Returns: - The decorated method. - """ - - def _method_wrapper(self, *args, **kwargs): - var_store = getattr(self, "_tf1_style_var_store", None) - if not var_store: - if not isinstance(self, tf.Module): - # Raise an error if you incorrectly decorate a method - # that is not a method of a Module, Layer, or Model: - raise ValueError( - "`@tf.compat.v1.keras.utils.track_tf1_layers_and_variables` must " - "be applied to a method of a subclassed `tf.Module`, " - "`tf.keras.layers.Layer`, or `tf.keras.Model` and which takes " - "`self` as the first argument. But, the first argument passed " - "to the decorated method was {}, which does not " - "extend Module, Layer, or Model.".format(self)) - var_store = _EagerVariableStore() - self._tf1_style_var_store = var_store # pylint: disable=protected-access - - existing_regularized_variables = set(var_store._regularizers.keys()) # pylint: disable=protected-access - with var_store.scope(): - out = method(self, *args, **kwargs) - - # If this is a layer method, add the regularization losses - # to the layer for any newly-created regularized variables - if isinstance(self, base_layer.Layer): - for var_name, regularizer in var_store._regularizers.items(): # pylint: disable=protected-access - if var_name not in existing_regularized_variables: - self.add_loss(regularizer) - - return out - - return tf.__internal__.decorator.make_decorator( - target=method, decorator_func=_method_wrapper) +class VariableScopeLayer(base_layer.Layer): + """Wrapper Layer to capture `compat.v1.get_variable` and `compat.v1.layers`. 
+ + This shim layer allows using large sets of TF1 model-forward-pass code as a + Keras layer that works in TF2 with TF2 behaviors enabled. It will capture + both weights and regularization losses of your forward-pass code. To use it, + override this class and put your TF1 model's forward pass inside your + implementation for `forward_pass`. (Unlike standard custom Keras layers, + do not override `call`.) + + Below are some examples, and then more details on the functionality of this + shim layer to wrap TF1 model forward passes. + + Example of capturing tf.compat.v1.layer-based modeling code as a Keras + layer: + + ```python + class WrappedDoubleDenseLayer(variable_scope_shim.VariableScopeLayer): + + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def forward_pass(self, inputs): + with variable_scope.variable_scope("double_dense_layer"): + out = tf.compat.v1.layers.dense( + inputs, self.units, name="dense_one", + kernel_initializer=tf.compat.v1.random_normal_initializer, + kernel_regularizer="l2") + out = tf.compat.v1.layers.dense( + out, self.units, name="dense_two", + kernel_initializer=tf.compat.v1.random_normal_initializer(), + kernel_regularizer="l2") + return out + + # Create a layer that can be used as a standard keras layer + layer = WrappedDoubleDenseLayer(10) + + # call the layer on inputs + layer(...) + + # Variables created/used within the scope will be tracked by the layer + layer.weights + layer.trainable_variables + + # Regularization losses will be captured in layer.losses after a call, + # just like any other Keras layer + reg_losses = layer.losses + ``` + + Example of capturing tf.compat.v1.get_variable-based modeling code as + a Keras layer: + + ```python + class WrappedDoubleDenseLayer(variable_scope_shim.VariableScopeLayer): + + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def forward_pass(self, inputs): + out = inputs + with tf.compat.v1.variable_scope("double_dense_layer"): + with tf.compat.v1.variable_scope("dense_one"): + # The weights are created with a `regularizer`, + # so the layer should track their regularization losses + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=init_ops.ones_initializer(), + name="kernel") + bias = tf.compat.v1.get_variable( + shape=[self.units,], + initializer=init_ops.zeros_initializer(), + name="bias") + out = tf.compat.v1.math.matmul(out, kernel) + out = tf.compat.v1.nn.bias_add(out, bias) + with tf.compat.v1.variable_scope("dense_two"): + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=init_ops.ones_initializer(), + name="kernel") + bias = tf.compat.v1.get_variable( + shape=[self.units,], + initializer=init_ops.zeros_initializer(), + name="bias") + out = tf.compat.v1.math.matmul(out, kernel) + out = tf.compat.v1.nn.bias_add(out, bias) + return out + + # Create a layer that can be used as a standard keras layer + layer = WrappedDoubleDenseLayer(10) + + # call the layer on inputs + layer(...) 
+ + # Variables created/used within the scope will be tracked by the layer + layer.weights + layer.trainable_variables + + # Regularization losses will be captured in layer.losses after a call, + # just like any other Keras layer + reg_losses = layer.losses + ``` + + Regularization losses: + Any regularizers specified in the `get_variable` calls or + `compat.v1.layer` creations will get captured by this wrapper layer. + Regularization losses are accessible in `layer.losses` after a call just + like in a standard Keras layer, and will be captured by any model that + includes this layer. Regularization losses attached to Keras + layers/models set as attributes of your layer will also get captured in + the standard Keras regularization loss tracking. + + Variable scope / variable reuse: + variable-scope based reuse in the `forward_pass` will be respected, + and work like variable-scope based reuse in TF1. + + Variable Names/Pre-trained checkpoint loading: + Variable naming from get_variable and `compat.v1.layer` layers will match + the TF1 names, so you should be able to re-use your old name-based + checkpoints. Variable naming for Keras layers/models or for variables + created by `tf.Variable` may change when going to eager execution. + + Training Arg in `forward_pass`: + Keras will pass a `training` arg to this layer if `forward_pass` contains + a `training` arg or a `**kwargs` varargs in its call signature, + similarly to how keras passes `training` to other layers in TF2 that have + similar signatures in their `call` implementations. + See more details in the docs + on `tf.keras.layers.Layer` to understand what will be passed and when. + Note: tf.compat.v1.layers are usually not called with `training=None`, + so the training arg to `forward_pass` might not feed through to them + unless you pass it to their calls explicitly. + + Call signature of the forward pass: + The semantics of the forward pass signature match the standard + Keras layer `call` signature, including how Keras decides when + to pass in a `training` arg, and the semantics applied to + the first positional arg in the call signature. + + Caveats: + * TF2 will not prune unused variable updates (or unused outputs). You may + need to adjust your forward pass code to avoid computations or variable + updates that you don't intend to use. (E.g. by adding a flag to the + `forward_pass` call signature and branching on it). + * Avoid nesting variable creation in tf.function inside of `forward_pass`. + While the layer may safely be used from inside a `tf.function`, using + a function inside of `forward_pass` will break the variable scoping. + * If you would like to nest Keras layers/models or other + `VariableScopeLayer`s directly in `forward_pass`, you need to + assign them as attributes of your layer so that Keras's standard + object-oriented weights and loss tracking will kick in. + See the intro to modules, layers, and models + [guide](https://www.tensorflow.org/guide/intro_to_modules) for more info. + """ + @property + @layer_utils.cached_per_instance + def _call_full_argspec(self): + # Argspec inspection is expensive and the call spec is used often, so it + # makes sense to cache the result. + return tf_inspect.getfullargspec(self.forward_pass) -class VariableScopeLayer(base_layer.Layer): - """Wrapper Layer to capture `compat.v1.get_variable` and `compat.v1.layers`. - - This shim layer allows using large sets of TF1 model-forward-pass code as a - Keras layer that works in TF2 with TF2 behaviors enabled.
It will capture - both weights and regularization losses of your forward-pass code. To use it, - override this class and put your TF1 model's forward pass inside your - implementation for `forward_pass`. (Unlike standard custom Keras layers, - do not override `call`.) - - Below are some examples, and then more details on the functionality of this - shim layer to wrap TF1 model forward passes. - - Example of capturing tf.compat.v1.layer-based modeling code as a Keras layer: - - ```python - class WrappedDoubleDenseLayer(variable_scope_shim.VariableScopeLayer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - def forward_pass(self, inputs): - with variable_scope.variable_scope("double_dense_layer"): - out = tf.compat.v1.layers.dense( - inputs, self.units, name="dense_one", - kernel_initializer=tf.compat.v1.random_normal_initializer, - kernel_regularizer="l2") - out = tf.compat.v1.layers.dense( - out, self.units, name="dense_two", - kernel_initializer=tf.compat.v1.random_normal_initializer(), - kernel_regularizer="l2") - return out - - # Create a layer that can be used as a standard keras layer - layer = WrappedDoubleDenseLayer(10) - - # call the layer on inputs - layer(...) - - # Variables created/used within the scope will be tracked by the layer - layer.weights - layer.trainable_variables - - # Regularization losses will be captured in layer.losses after a call, - # just like any other Keras layer - reg_losses = layer.losses - ``` - - Example of capturing tf.compat.v1.get_variable-based modeling code as - a Keras layer: - - ```python - class WrappedDoubleDenseLayer(variable_scope_shim.VariableScopeLayer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - def forward_pass(self, inputs): - out = inputs - with tf.compat.v1.variable_scope("double_dense_layer"): - with tf.compat.v1.variable_scope("dense_one"): - # The weights are created with a `regularizer`, - # so the layer should track their regularization losses - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=init_ops.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=init_ops.zeros_initializer(), - name="bias") - out = tf.compat.v1.math.matmul(out, kernel) - out = tf.compat.v1.nn.bias_add(out, bias) - with tf.compat.v1.variable_scope("dense_two"): - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=init_ops.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=init_ops.zeros_initializer(), - name="bias") - out = tf.compat.v1.math.matmul(out, kernel) - out = tf.compat.v1.nn.bias_add(out, bias) - return out - - # Create a layer that can be used as a standard keras layer - layer = WrappedDoubleDenseLayer(10) - - # call the layer on inputs - layer(...) - - # Variables created/used within the scope will be tracked by the layer - layer.weights - layer.trainable_variables - - # Regularization losses will be captured in layer.losses after a call, - # just like any other Keras layer - reg_losses = layer.losses - ``` - - Regularization losses: - Any regularizers specified in the `get_variable` calls or `compat.v1.layer` - creations will get captured by this wrapper layer. 
Regularization losses - are accessible in `layer.losses` after a call just like in a standard - Keras layer, and will be captured by any model that includes this layer. - Regularization losses attached to Keras layers/models set as attributes - of your layer will also get captured in the standard Keras regularization - loss tracking. - - Variable scope / variable reuse: - variable-scope based reuse in the `forward_pass` will be respected, - and work like variable-scope based reuse in TF1. - - Variable Names/Pre-trained checkpoint loading: - Variable naming from get_variable and `compat.v1.layer` layers will match - the TF1 names, so you should be able to re-use your old name-based - checkpoints. Variable naming for Keras layers/models or for variables - created by `tf.Variable` may change when going to eager execution. - - Training Arg in `forward_pass`: - Keras will pass a `training` arg to this layer if `forward_pass` contains - a `training` arg or a `**kwargs` varargs in its call signature, - similarly to how keras passes `training` to other layers in TF2 that have - similar signatures in their `call` implementations. - See more details in the docs - on `tf.keras.layers.Layer` to understand what will be passed and when. - Note: tf.compat.v1.layers are usually not called with `training=None`, - so the training arg to `forward_pass` might not feed through to them - unless you pass it to their calls explicitly. - - Call signature of the forward pass: - The semantics of the forward pass signature match the standard - Keras layer `call` signature, including how Keras decides when - to pass in a `training` arg., and the semantics applied to - the first positional arg in the call signature. - - Caveats: - * TF2 will not prune unused variable updates (or unused outputs). You may - need to adjust your forward pass code to avoid computations or variable - updates that you don't intend to use. (E.g. by adding a flag to the - `forward_pass` call signature and branching on it). - * Avoid Nesting variable creation in tf.function inside of `forward_pass` - While the layer may safely be used from inside a `tf.function`, using - a function inside of `forward_pass` will break the variable scoping. - * If you would like to nest Keras layers/models or other - `VariableScopeLayer`s directly in `forward_pass`, you need to - assign them as attributes of your layer so that Keras's standard - object-oriented weights and loss tracking will kick in. - See the intro to modules, layers, and models - [guide](https://www.tensorflow.org/guide/intro_to_modules) for more info - """ - - @property - @layer_utils.cached_per_instance - def _call_full_argspec(self): - # Argspec inspection is expensive and the call spec is used often, so it - # makes sense to cache the result. - return tf_inspect.getfullargspec(self.forward_pass) - - def forward_pass(self, *args, **kwargs): - """Implement this method. It should include your model forward pass.""" - raise NotImplementedError - - @track_tf1_style_variables - def call(self, *args, **kwargs): - return self.forward_pass(*args, **kwargs) + def forward_pass(self, *args, **kwargs): + """Implement this method. It should include your model forward pass.""" + raise NotImplementedError + + @track_tf1_style_variables + def call(self, *args, **kwargs): + return self.forward_pass(*args, **kwargs) @keras_export(v1=["keras.utils.get_or_create_layer"]) def get_or_create_layer(name, create_layer_method): - """Use this method to track nested keras models in a shim-decorated method. 
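Editor's note: since the `VariableScopeLayer` contract above (override `forward_pass`, never `call`) is easy to miss in the reflowed diff, here is a minimal, hedged sketch. The class name `TinyScaleShim` is illustrative, and it assumes the module path used by the test file in this patch:

```python
import tensorflow as tf

from keras.legacy_tf_layers import variable_scope_shim


class TinyScaleShim(variable_scope_shim.VariableScopeLayer):
    """Hypothetical shim layer: override `forward_pass`, not `call`."""

    def forward_pass(self, inputs):
        # The base class's `call` is already decorated with
        # track_tf1_style_variables, so this get_variable is captured.
        with tf.compat.v1.variable_scope("tiny"):
            scale = tf.compat.v1.get_variable(
                "scale",
                shape=[],
                initializer=tf.compat.v1.ones_initializer(),
            )
        return inputs * scale


layer = TinyScaleShim()
layer(tf.ones([2, 2]))  # first call creates "tiny/scale"
print([v.name for v in layer.weights])  # expected: ['tiny/scale:0']
```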
- - This method can be used within a `tf.keras.Layer`'s methods decorated by - the`track_tf1_style_variables` shim, to additionally track inner keras Model - objects created within the same method. The inner model's variables and losses - will be accessible via the outer model's `variables` and `losses` attributes. - - This enables tracking of inner keras models using TF2 behaviors, with minimal - changes to existing TF1-style code. - - Example: - - ```python - class NestedLayer(tf.keras.layers.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - def build_model(self): - inp = tf.keras.Input(shape=(5, 5)) - dense_layer = tf.keras.layers.Dense( - 10, name="dense", kernel_regularizer="l2", - kernel_initializer=tf.compat.v1.ones_initializer()) - model = tf.keras.Model(inputs=inp, outputs=dense_layer(inp)) - return model - - @tf.compat.v1.keras.utils.track_tf1_style_variables - def call(self, inputs): - model = tf.compat.v1.keras.utils.get_or_create_layer( - "dense_model", self.build_model) - return model(inputs) - ``` - The inner model creation should be confined to its own zero-arg function, - which should be passed into this method. In TF1, this method will immediately - create and return the desired model, without any tracking. - - Args: - name: A name to give the nested layer to track. - create_layer_method: a Callable that takes no args and returns the nested - layer. - - Returns: - The created layer. - """ - store = vs._get_default_variable_store() # pylint: disable=protected-access - if not isinstance(store, _EagerVariableStore): - if not tf.compat.v1.executing_eagerly_outside_functions(): - # tf1 case; just create and return layer - return create_layer_method() - else: - raise ValueError( - "Tried to call get_or_create_layer in eager mode from a method not" - "decorated with @tf.compat.v1.keras.utils.track_tf1_style_variables.") - vs_name = tf.compat.v1.get_variable_scope().name - name = f"{vs_name}/{name}" - return store.get_or_create_layer(name, create_layer_method) + """Use this method to track nested keras models in a shim-decorated method. + + This method can be used within a `tf.keras.Layer`'s methods decorated by + the `track_tf1_style_variables` shim, to additionally track inner keras Model + objects created within the same method. The inner model's variables and + losses will be accessible via the outer model's `variables` and `losses` + attributes. + + This enables tracking of inner keras models using TF2 behaviors, with + minimal changes to existing TF1-style code. + + Example: + + ```python + class NestedLayer(tf.keras.layers.Layer): + + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def build_model(self): + inp = tf.keras.Input(shape=(5, 5)) + dense_layer = tf.keras.layers.Dense( + 10, name="dense", kernel_regularizer="l2", + kernel_initializer=tf.compat.v1.ones_initializer()) + model = tf.keras.Model(inputs=inp, outputs=dense_layer(inp)) + return model + + @tf.compat.v1.keras.utils.track_tf1_style_variables + def call(self, inputs): + model = tf.compat.v1.keras.utils.get_or_create_layer( + "dense_model", self.build_model) + return model(inputs) + ``` + The inner model creation should be confined to its own zero-arg function, + which should be passed into this method. In TF1, this method will + immediately create and return the desired model, without any tracking. + + Args: + name: A name to give the nested layer to track.
+ create_layer_method: a Callable that takes no args and returns the nested + layer. + + Returns: + The created layer. + """ + store = vs._get_default_variable_store() + if not isinstance(store, _EagerVariableStore): + if not tf.compat.v1.executing_eagerly_outside_functions(): + # tf1 case; just create and return layer + return create_layer_method() + else: + raise ValueError( + "Tried to call get_or_create_layer in eager mode from a method " + "not decorated with " + "@tf.compat.v1.keras.utils.track_tf1_style_variables." + ) + vs_name = tf.compat.v1.get_variable_scope().name + name = f"{vs_name}/{name}" + return store.get_or_create_layer(name, create_layer_method) diff --git a/keras/legacy_tf_layers/variable_scope_shim_test.py b/keras/legacy_tf_layers/variable_scope_shim_test.py index 9de0dd48d47b..f593bdfa71d6 100644 --- a/keras/legacy_tf_layers/variable_scope_shim_test.py +++ b/keras/legacy_tf_layers/variable_scope_shim_test.py @@ -21,7 +21,10 @@ import gc import threading +import numpy +import tensorflow as tf from absl.testing import parameterized + from keras import models from keras import regularizers from keras.engine import base_layer @@ -32,1589 +35,1823 @@ from keras.legacy_tf_layers import variable_scope_shim from keras.testing_infra import test_combinations -import numpy -import tensorflow as tf - -from tensorflow.python.framework import test_util as tf_test_utils -from tensorflow.python.ops import variable_scope +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) +from tensorflow.python.ops import variable_scope def run_inside_wrap_function_in_eager_mode(graph_function): - """Decorator to execute the same graph code in eager and graph modes. + """Decorator to execute the same graph code in eager and graph modes. - In graph mode, we just execute the graph_function passed as argument. In eager - mode, we wrap the function using wrap_function and then execute the wrapped - result. + In graph mode, we just execute the graph_function passed as argument. In + eager mode, we wrap the function using wrap_function and then execute the + wrapped result. - Args: - graph_function: python function containing graph code to be wrapped + Args: + graph_function: python function containing graph code to be wrapped - Returns: - decorated function - """ - def wrap_and_execute(self): - store = variable_scope_shim._EagerVariableStore() - with variable_scope.with_variable_store(store): - # use the original function - graph_function(self) - return wrap_and_execute + Returns: + decorated function + """ + def wrap_and_execute(self): + store = variable_scope_shim._EagerVariableStore() + with variable_scope.with_variable_store(store): + # use the original function + graph_function(self) -class VariableScopeTest(tf.test.TestCase): + return wrap_and_execute - def tearDown(self): - gc.collect() - # This will only contain uncollectable garbage, i.e. reference cycles - # involving objects with __del__ defined. - self.assertEqual(0, len(gc.garbage)) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testGetVar(self): - vs = variable_scope._get_default_variable_store() - v = vs.get_variable("v", [1]) - v1 = vs.get_variable("v", [1]) - self.assertIs(v, v1) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testNameExists(self): - vs = variable_scope._get_default_variable_store() - # No check by default, so we can both create and get existing names.
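Editor's note: the `run_inside_wrap_function_in_eager_mode` harness above is the pattern every test in this file relies on. A minimal, hedged standalone sketch of the same pattern, condensed from `testGetVar`; it deliberately uses the private TF internals the tests themselves use (`_EagerVariableStore`, `variable_scope._get_default_variable_store`), which are not public API:

```python
from keras.legacy_tf_layers import variable_scope_shim
from tensorflow.python.ops import variable_scope

# Route tf1-style variable lookups through an eager-compatible store so
# the graph-style test body can run unchanged in eager mode.
store = variable_scope_shim._EagerVariableStore()
with variable_scope.with_variable_store(store):
    vs = variable_scope._get_default_variable_store()
    v = vs.get_variable("v", [1])
    v1 = vs.get_variable("v", [1])
    assert v is v1  # get-or-create semantics, as testGetVar asserts
```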
- v = vs.get_variable("v", [1]) - v1 = vs.get_variable("v", [1]) - self.assertIs(v, v1) - - self.assertIsNot(v, vs.get_variable("u", [1], reuse=False)) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testNamelessStore(self): - vs = variable_scope._get_default_variable_store() - vs.get_variable("v1", [2]) - vs.get_variable("v2", [2]) - expected_names = ["%s:0" % name for name in ["v1", "v2"]] - self.assertEqual( - set(expected_names), set(v.name for v in vs._vars.values())) - - # TODO(mihaimaruseac): Not converted to use wrap_function because of - # TypeError: Expected tf.group() expected Tensor arguments not 'None' with - # type '' - @tf_test_utils.run_in_graph_and_eager_modes - def testVarScopeInitializer(self): - init = tf.compat.v1.constant_initializer(0.3) - with tf.compat.v1.variable_scope("tower0") as tower: - with tf.compat.v1.variable_scope("foo", initializer=init): - v = tf.compat.v1.get_variable("v", []) - self.evaluate(tf.compat.v1.variables_initializer([v])) - self.assertAllClose(self.evaluate(v.value()), 0.3) - with tf.compat.v1.variable_scope(tower, initializer=init): - w = tf.compat.v1.get_variable("w", []) - self.evaluate(tf.compat.v1.variables_initializer([w])) - self.assertAllClose(self.evaluate(w.value()), 0.3) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeConstraint(self): - constraint = lambda x: 0. * x - with tf.compat.v1.variable_scope("tower1") as tower: - with tf.compat.v1.variable_scope("foo", constraint=constraint): - v = tf.compat.v1.get_variable("v", []) - self.assertIsNotNone(v.constraint) - with tf.compat.v1.variable_scope(tower, constraint=constraint): - w = tf.compat.v1.get_variable("w", []) - self.assertIsNotNone(w.constraint) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeDType(self): - with tf.compat.v1.variable_scope("tower2") as tower: - with tf.compat.v1.variable_scope("foo", dtype=tf.float16): - v = tf.compat.v1.get_variable("v", []) - self.assertEqual(v.dtype.base_dtype, tf.float16) - with tf.compat.v1.variable_scope(tower, dtype=tf.float16): - w = tf.compat.v1.get_variable("w", []) - self.assertEqual(w.dtype.base_dtype, tf.float16) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testInitFromNonTensorValue(self): - v = tf.compat.v1.get_variable("v4", initializer=4, dtype=tf.int32) - self.evaluate(tf.compat.v1.variables_initializer([v])) - self.assertAllClose(self.evaluate(v.value()), 4) - - w = tf.compat.v1.get_variable( - "w4", initializer=numpy.array([1, 2, 3]), dtype=tf.int64) - self.evaluate(tf.compat.v1.variables_initializer([w])) - self.assertAllClose(self.evaluate(w.value()), [1, 2, 3]) - - # A quirk to be revisited? 
- error = ValueError if tf.executing_eagerly() else TypeError - with self.assertRaises(error): - tf.compat.v1.get_variable("x4", initializer={}) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testInitFromNonInitializer(self): - # Test various dtypes with zeros initializer as following: - types = [ - tf.int8, tf.uint8, tf.int16, tf.uint16, tf.int32, - tf.int64, tf.bool - ] - - # Use different variable_name to distinguish various dtypes - for (i, dtype) in enumerate(types): - x = tf.compat.v1.get_variable( - name="xx%d" % i, shape=(3, 4), dtype=dtype) - y = tf.compat.v1.get_variable( - name="yy%d" % i, - shape=(3, 4), - dtype=dtype, - initializer=tf.compat.v1.zeros_initializer(dtype=dtype)) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllEqual(self.evaluate(x.value()), self.evaluate(y.value())) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeRegularizer(self): - init = tf.compat.v1.constant_initializer(0.3) - - def regularizer1(v): - return tf.reduce_mean(v) + 0.1 - - def regularizer2(v): - return tf.reduce_mean(v) + 0.2 - - with tf.compat.v1.variable_scope( - "tower3", regularizer=regularizer1) as tower: - with tf.compat.v1.variable_scope("foo", initializer=init): - v = tf.compat.v1.get_variable("v", []) - self.evaluate(tf.compat.v1.variables_initializer([v])) - with tf.compat.v1.variable_scope(tower, initializer=init) as vs: - tf.compat.v1.get_variable("u", []) - vs.set_regularizer(regularizer2) - tf.compat.v1.get_variable("w", []) - # Next 3 variable not regularized to test disabling regularization. - tf.compat.v1.get_variable( - "x", [], regularizer=tf.compat.v1.no_regularizer) - with tf.compat.v1.variable_scope( - "baz", regularizer=tf.compat.v1.no_regularizer): - tf.compat.v1.get_variable("y", []) - vs.set_regularizer(tf.compat.v1.no_regularizer) - tf.compat.v1.get_variable("z", []) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testInitializeFromValue(self): - init = tf.constant(0.1) - w = tf.compat.v1.get_variable("v", initializer=init) - self.evaluate(tf.compat.v1.variables_initializer([w])) - self.assertAllClose(self.evaluate(w.value()), 0.1) - - with self.assertRaisesRegex(ValueError, "shape"): - # We disallow explicit shape specification when initializer is constant. - tf.compat.v1.get_variable("u", [1], initializer=init) - - with tf.compat.v1.variable_scope("foo", initializer=init): - # Constant initializer can be passed through scopes if needed. - v = tf.compat.v1.get_variable("v") - self.evaluate(tf.compat.v1.variables_initializer([v])) - self.assertAllClose(self.evaluate(v.value()), 0.1) - - # Check that non-float32 initializer creates a non-float32 variable. - init = tf.constant(1, dtype=tf.int32) - t = tf.compat.v1.get_variable("t", initializer=init) - self.assertEqual(t.dtype.base_dtype, tf.int32) - - # Raise error if `initializer` dtype and `dtype` are not identical. 
- with self.assertRaisesRegex(ValueError, "don't match"): - tf.compat.v1.get_variable("s", initializer=init, dtype=tf.float64) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeGetOrCreateReuse(self): - with self.cached_session(): - - def test_value(value): - x = tf.constant(value) - with tf.compat.v1.variable_scope( - "testVarScopeGetOrCreateReuse_bar", - reuse=tf.compat.v1.AUTO_REUSE): - _ = tf.compat.v1.assign(tf.compat.v1.get_variable("var", []), x) - with tf.compat.v1.variable_scope( - "testVarScopeGetOrCreateReuse_bar", - reuse=tf.compat.v1.AUTO_REUSE): - _ = tf.compat.v1.get_variable("var", []) - self.assertEqual(value, self.evaluate(x)) - - test_value(42.) # Variable is created. - test_value(13.) # Variable is reused hereafter. - test_value(17.) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeGetOrCreateReuseIgnoreFalse(self): - with self.cached_session(): - - def test_value(value): - x = tf.constant(value) - with tf.compat.v1.variable_scope( - "testVarScopeGetOrCreateReuse_bar", - reuse=False): - _ = tf.compat.v1.assign(tf.compat.v1.get_variable("var", []), x) - # We need to ignore reuse=False in the shim, because the - # code is expected to get rerun each time the user calls the shim. - with tf.compat.v1.variable_scope( - "testVarScopeGetOrCreateReuse_bar", - reuse=False): - _ = tf.compat.v1.get_variable("var", []) - self.assertEqual(value, self.evaluate(x)) - - test_value(42.) # Variable is created. - test_value(13.) # Variable is reused hereafter. - test_value(17.) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScope(self): - with self.cached_session(): - with tf.name_scope("testVarOpScope1"): - with tf.compat.v1.variable_scope("tower", "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "tower/w:0") - - with tf.name_scope("testVarOpScope2"): - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "default/w:0") - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "default_1/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScopeUniqueNamesInterleavedSubstringScopes(self): - with self.cached_session(): - with tf.compat.v1.variable_scope(None, "defaultScope1"): - with tf.compat.v1.variable_scope(None, "layer"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, - "defaultScope1/layer/w:0") - with tf.compat.v1.variable_scope(None, "defaultScope1"): - with tf.compat.v1.variable_scope(None, "layer"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, - "defaultScope1_1/layer/w:0") - with tf.compat.v1.variable_scope(None, "defaultScope"): - with tf.compat.v1.variable_scope(None, "layer"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, - "defaultScope/layer/w:0") - with tf.compat.v1.variable_scope(None, "defaultScope1"): - with tf.compat.v1.variable_scope(None, "layer"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, - "defaultScope1_2/layer/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScopeUniqueNamesWithJump(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("default") as default: - with tf.compat.v1.variable_scope(None, "layer"): - self.assertEqual( - 
tf.compat.v1.get_variable("w", []).name, "default/layer/w:0") - with tf.compat.v1.variable_scope(None, "layer"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, - "default/layer_1/w:0") - with tf.compat.v1.variable_scope(default): - pass - # No matter the jump in the middle, unique numbering continues. - with tf.compat.v1.variable_scope(None, "layer"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, - "default/layer_2/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScopeReuse(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("outer") as outer: - with tf.compat.v1.variable_scope("tower", "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/tower/w:0") - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - with tf.compat.v1.variable_scope(outer, reuse=True) as outer: - with tf.compat.v1.variable_scope("tower", "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/tower/w:0") - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeGetVar(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("root"): - with tf.compat.v1.variable_scope("towerA") as tower_a: - va = tf.compat.v1.get_variable("v", [1]) - self.assertEqual(va.name, "root/towerA/v:0") - - with tf.compat.v1.variable_scope(tower_a, reuse=True): - va2 = tf.compat.v1.get_variable("v", [1]) - self.assertIs(va2, va) - - with tf.compat.v1.variable_scope("towerB"): - vb = tf.compat.v1.get_variable("v", [1]) - self.assertEqual(vb.name, "root/towerB/v:0") - - with tf.compat.v1.variable_scope("towerA", reuse=True): - va2 = tf.compat.v1.get_variable("v", [1]) - self.assertIs(va2, va) - with tf.compat.v1.variable_scope("foo"): - with tf.compat.v1.variable_scope("bar"): - v = tf.compat.v1.get_variable("v", [1]) - self.assertEqual(v.name, "root/foo/bar/v:0") - with tf.compat.v1.variable_scope(tower_a, reuse=True): - va3 = tf.compat.v1.get_variable("v", [1]) - self.assertIs(va, va3) - - with self.assertRaises(ValueError) as exc: - with tf.compat.v1.variable_scope(tower_a, reuse=True): - tf.compat.v1.get_variable("v", [2]) # Different shape. - self.assertEqual("shape" in str(exc.exception), True) - - with self.assertRaises(ValueError) as exc: - with tf.compat.v1.variable_scope(tower_a, reuse=True): - tf.compat.v1.get_variable("v", [1], dtype=tf.int32) - self.assertEqual("dtype" in str(exc.exception), True) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeOuterScope(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("outer") as outer: - pass - with tf.compat.v1.variable_scope(outer): +class VariableScopeTest(tf.test.TestCase): + def tearDown(self): + gc.collect() + # This will only contain uncollectable garbage, i.e. reference cycles + # involving objects with __del__ defined. 
+ self.assertEqual(0, len(gc.garbage)) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testGetVar(self): + vs = variable_scope._get_default_variable_store() + v = vs.get_variable("v", [1]) + v1 = vs.get_variable("v", [1]) + self.assertIs(v, v1) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testNameExists(self): + vs = variable_scope._get_default_variable_store() + # No check by default, so we can both create and get existing names. + v = vs.get_variable("v", [1]) + v1 = vs.get_variable("v", [1]) + self.assertIs(v, v1) + + self.assertIsNot(v, vs.get_variable("u", [1], reuse=False)) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testNamelessStore(self): + vs = variable_scope._get_default_variable_store() + vs.get_variable("v1", [2]) + vs.get_variable("v2", [2]) + expected_names = [f"{name}:0" for name in ["v1", "v2"]] self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - with tf.compat.v1.variable_scope("default"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") + set(expected_names), set(v.name for v in vs._vars.values()) + ) + + # TODO(mihaimaruseac): Not converted to use wrap_function because of + # TypeError: Expected tf.group() expected Tensor arguments not 'None' with + # type '' + @tf_test_utils.run_in_graph_and_eager_modes + def testVarScopeInitializer(self): + init = tf.compat.v1.constant_initializer(0.3) + with tf.compat.v1.variable_scope("tower0") as tower: + with tf.compat.v1.variable_scope("foo", initializer=init): + v = tf.compat.v1.get_variable("v", []) + self.evaluate(tf.compat.v1.variables_initializer([v])) + self.assertAllClose(self.evaluate(v.value()), 0.3) + with tf.compat.v1.variable_scope(tower, initializer=init): + w = tf.compat.v1.get_variable("w", []) + self.evaluate(tf.compat.v1.variables_initializer([w])) + self.assertAllClose(self.evaluate(w.value()), 0.3) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarScopeConstraint(self): + constraint = lambda x: 0.0 * x + with tf.compat.v1.variable_scope("tower1") as tower: + with tf.compat.v1.variable_scope("foo", constraint=constraint): + v = tf.compat.v1.get_variable("v", []) + self.assertIsNotNone(v.constraint) + with tf.compat.v1.variable_scope(tower, constraint=constraint): + w = tf.compat.v1.get_variable("w", []) + self.assertIsNotNone(w.constraint) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarScopeDType(self): + with tf.compat.v1.variable_scope("tower2") as tower: + with tf.compat.v1.variable_scope("foo", dtype=tf.float16): + v = tf.compat.v1.get_variable("v", []) + self.assertEqual(v.dtype.base_dtype, tf.float16) + with tf.compat.v1.variable_scope(tower, dtype=tf.float16): + w = tf.compat.v1.get_variable("w", []) + self.assertEqual(w.dtype.base_dtype, tf.float16) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testInitFromNonTensorValue(self): + v = tf.compat.v1.get_variable("v4", initializer=4, dtype=tf.int32) + self.evaluate(tf.compat.v1.variables_initializer([v])) + self.assertAllClose(self.evaluate(v.value()), 4) - with tf.compat.v1.variable_scope(outer, reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - with tf.compat.v1.variable_scope("default", reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", 
[]).name, "outer/default/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarScopeNestedOuterScope(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("outer") as outer: - with tf.compat.v1.variable_scope(outer): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - with tf.compat.v1.variable_scope("default"): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - with tf.compat.v1.variable_scope(outer, reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - with tf.compat.v1.variable_scope("default", reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScopeReuseParam(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("outer") as outer: - with tf.compat.v1.variable_scope("tower", "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/tower/w:0") - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - with tf.compat.v1.variable_scope(outer) as outer: - with tf.compat.v1.variable_scope("tower", "default", reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/tower/w:0") - outer.reuse_variables() - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScopeReuseError(self): - with self.cached_session(): - with self.assertRaises(ValueError): - with tf.compat.v1.variable_scope(None, "default", reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/tower/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScopeOuterScope(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("outer") as outer: - pass - with tf.compat.v1.variable_scope(outer, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") + w = tf.compat.v1.get_variable( + "w4", initializer=numpy.array([1, 2, 3]), dtype=tf.int64 + ) + self.evaluate(tf.compat.v1.variables_initializer([w])) + self.assertAllClose(self.evaluate(w.value()), [1, 2, 3]) + + # A quirk to be revisited? 
+ error = ValueError if tf.executing_eagerly() else TypeError + with self.assertRaises(error): + tf.compat.v1.get_variable("x4", initializer={}) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testInitFromNonInitializer(self): + # Test various dtypes with zeros initializer as following: + types = [ + tf.int8, + tf.uint8, + tf.int16, + tf.uint16, + tf.int32, + tf.int64, + tf.bool, + ] + + # Use different variable_name to distinguish various dtypes + for i, dtype in enumerate(types): + x = tf.compat.v1.get_variable( + name="xx%d" % i, shape=(3, 4), dtype=dtype + ) + y = tf.compat.v1.get_variable( + name="yy%d" % i, + shape=(3, 4), + dtype=dtype, + initializer=tf.compat.v1.zeros_initializer(dtype=dtype), + ) - with tf.compat.v1.variable_scope(outer, "default", reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - outer.reuse_variables() - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVarOpScopeNestedOuterScope(self): - with self.cached_session(): - with tf.compat.v1.variable_scope("outer") as outer: - with tf.compat.v1.variable_scope(outer, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - with tf.compat.v1.variable_scope(outer, "default", reuse=True): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/w:0") - with tf.compat.v1.variable_scope(None, "default", []): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testBasicWhenAuxiliaryNameScopeIsFalse(self): - with self.cached_session(): - with tf.compat.v1.variable_scope( - "scope", auxiliary_name_scope=False) as scope: - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "scope/w:0") - with tf.compat.v1.variable_scope(scope, auxiliary_name_scope=False): - self.assertEqual( - tf.compat.v1.get_variable("w1", []).name, "scope/w1:0") + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllEqual( + self.evaluate(x.value()), self.evaluate(y.value()) + ) - with tf.compat.v1.variable_scope("outer"): - with tf.compat.v1.variable_scope( - "inner", auxiliary_name_scope=False) as inner: - self.assertEqual(inner.original_name_scope, "outer/") - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/inner/w:0") - with tf.compat.v1.variable_scope( - inner, auxiliary_name_scope=False) as inner1: - self.assertEqual(inner1.original_name_scope, "outer/") - self.assertEqual( - tf.compat.v1.get_variable("w1", []).name, "outer/inner/w1:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testCreatedByDefaultNameWhenAuxiliaryNameScopeIsFalse(self): - with self.cached_session(): - with tf.compat.v1.variable_scope( - None, default_name="default", auxiliary_name_scope=False): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "default/w:0") + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarScopeRegularizer(self): + init = tf.compat.v1.constant_initializer(0.3) - with tf.compat.v1.variable_scope("outer"): - with 
tf.compat.v1.variable_scope( - None, default_name="default", - auxiliary_name_scope=False) as inner: - self.assertEqual(inner.original_name_scope, "outer/") - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/default/w:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testReenterRootScopeWhenAuxiliaryNameScopeIsFalse(self): - with self.cached_session(): - root_scope = tf.compat.v1.get_variable_scope() - with tf.compat.v1.variable_scope( - root_scope, auxiliary_name_scope=False): - self.assertEqual(tf.compat.v1.get_variable("w", []).name, "w:0") - - with tf.compat.v1.variable_scope("outer"): - with tf.compat.v1.variable_scope( - root_scope, auxiliary_name_scope=False) as inner: - self.assertEqual(inner.original_name_scope, "") - self.assertEqual(tf.compat.v1.get_variable("w1", []).name, "w1:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testAuxiliaryNameScopeIsInvalid(self): - with self.cached_session(): - with self.assertRaisesRegex(TypeError, "auxiliary_name_scope"): - with tf.compat.v1.variable_scope( - None, default_name="scope", auxiliary_name_scope="invalid"): - pass + def regularizer1(v): + return tf.reduce_mean(v) + 0.1 - with self.assertRaisesRegex(TypeError, "auxiliary_name_scope"): - with tf.compat.v1.variable_scope( - "scope", auxiliary_name_scope="invalid"): - pass + def regularizer2(v): + return tf.reduce_mean(v) + 0.2 - with tf.compat.v1.variable_scope("scope") as scope: - pass - with self.assertRaisesRegex(TypeError, "auxiliary_name_scope"): with tf.compat.v1.variable_scope( - scope, auxiliary_name_scope="invalid"): - pass - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testReuseScopeWithoutNameScopeCollision(self): - # Github issue: #13429 - with self.cached_session(): - with tf.compat.v1.variable_scope("outer"): - with tf.compat.v1.variable_scope("inner") as inner: - pass - - with tf.compat.v1.variable_scope( - inner, auxiliary_name_scope=False) as scope: - with tf.name_scope(scope.original_name_scope): - self.assertEqual( - tf.compat.v1.get_variable("w", []).name, "outer/inner/w:0") - - with tf.compat.v1.variable_scope("another"): - with tf.compat.v1.variable_scope( - inner, auxiliary_name_scope=False) as scope1: - with tf.name_scope(scope1.original_name_scope): - self.assertEqual( - tf.compat.v1.get_variable("w1", []).name, - "outer/inner/w1:0") - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testGetVarWithDevice(self): - g = tf.Graph() - varname_type = [] - - def device_func(op): - if op.type in ["Variable", "VariableV2", "VarHandleOp"]: - varname_type.append((op.name, op.get_attr("dtype"))) - return "/device:GPU:0" - - with g.as_default(): - with tf.compat.v1.device(device_func): - _ = tf.compat.v1.get_variable("x", (100, 200)) - _ = tf.compat.v1.get_variable( - "y", dtype=tf.int64, initializer=numpy.arange(73)) - self.assertEqual(varname_type[0], ("x", tf.float32)) - self.assertEqual(varname_type[1], ("y", tf.int64)) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testGetVariableWithRefDtype(self): - v = tf.compat.v1.get_variable("v", shape=[3, 4], dtype=tf.float32) - # Ensure it is possible to do get_variable with a _ref dtype passed in. 
-    _ = tf.compat.v1.get_variable("w", shape=[5, 6], dtype=v.dtype)
-
-  @tf_test_utils.run_in_graph_and_eager_modes
-  @run_inside_wrap_function_in_eager_mode
-  def testGetVariableWithInitializerWhichTakesNoArgs(self):
-    v = tf.compat.v1.get_variable("foo", initializer=lambda: [2])
-    self.assertEqual(v.name, "foo:0")
-
-  @tf_test_utils.run_in_graph_and_eager_modes
-  @run_inside_wrap_function_in_eager_mode
-  def testGetVariableWithInitializerWhichTakesOptionalArgs(self):
-    v = tf.compat.v1.get_variable("foo", initializer=lambda x=True: [2])
-    self.assertEqual(v.name, "foo:0")
-
-  @tf_test_utils.run_in_graph_and_eager_modes
-  @run_inside_wrap_function_in_eager_mode
-  def testTwoGraphs(self):
-
-    def f():
-      g1 = tf.Graph()
-      g2 = tf.Graph()
-      with g1.as_default():
-        with g2.as_default():
-          with tf.compat.v1.variable_scope("_"):
-            pass
-
-    self.assertRaisesRegex(ValueError,
-                           "'_' is not a valid (?:root )?scope name", f)
+            "tower3", regularizer=regularizer1
+        ) as tower:
+            with tf.compat.v1.variable_scope("foo", initializer=init):
+                v = tf.compat.v1.get_variable("v", [])
+                self.evaluate(tf.compat.v1.variables_initializer([v]))
+            with tf.compat.v1.variable_scope(tower, initializer=init) as vs:
+                tf.compat.v1.get_variable("u", [])
+                vs.set_regularizer(regularizer2)
+                tf.compat.v1.get_variable("w", [])
+                # Next 3 variables are not regularized, to test disabling
+                # regularization.
+                tf.compat.v1.get_variable(
+                    "x", [], regularizer=tf.compat.v1.no_regularizer
+                )
+                with tf.compat.v1.variable_scope(
+                    "baz", regularizer=tf.compat.v1.no_regularizer
+                ):
+                    tf.compat.v1.get_variable("y", [])
+                vs.set_regularizer(tf.compat.v1.no_regularizer)
+                tf.compat.v1.get_variable("z", [])
+
+    @tf_test_utils.run_in_graph_and_eager_modes
+    @run_inside_wrap_function_in_eager_mode
+    def testInitializeFromValue(self):
+        init = tf.constant(0.1)
+        w = tf.compat.v1.get_variable("v", initializer=init)
+        self.evaluate(tf.compat.v1.variables_initializer([w]))
+        self.assertAllClose(self.evaluate(w.value()), 0.1)
+
+        with self.assertRaisesRegex(ValueError, "shape"):
+            # We disallow explicit shape specification when initializer is
+            # constant.
+            tf.compat.v1.get_variable("u", [1], initializer=init)
+
+        with tf.compat.v1.variable_scope("foo", initializer=init):
+            # Constant initializer can be passed through scopes if needed.
+            v = tf.compat.v1.get_variable("v")
+            self.evaluate(tf.compat.v1.variables_initializer([v]))
+            self.assertAllClose(self.evaluate(v.value()), 0.1)
+
+        # Check that non-float32 initializer creates a non-float32 variable.
+        init = tf.constant(1, dtype=tf.int32)
+        t = tf.compat.v1.get_variable("t", initializer=init)
+        self.assertEqual(t.dtype.base_dtype, tf.int32)
+
+        # Raise error if `initializer` dtype and `dtype` are not identical.
+        with self.assertRaisesRegex(ValueError, "don't match"):
+            tf.compat.v1.get_variable("s", initializer=init, dtype=tf.float64)
+
+    @tf_test_utils.run_in_graph_and_eager_modes
+    @run_inside_wrap_function_in_eager_mode
+    def testVarScopeGetOrCreateReuse(self):
+        with self.cached_session():
+
+            def test_value(value):
+                x = tf.constant(value)
+                with tf.compat.v1.variable_scope(
+                    "testVarScopeGetOrCreateReuse_bar",
+                    reuse=tf.compat.v1.AUTO_REUSE,
+                ):
+                    _ = tf.compat.v1.assign(
+                        tf.compat.v1.get_variable("var", []), x
+                    )
+                with tf.compat.v1.variable_scope(
+                    "testVarScopeGetOrCreateReuse_bar",
+                    reuse=tf.compat.v1.AUTO_REUSE,
+                ):
+                    _ = tf.compat.v1.get_variable("var", [])
+                self.assertEqual(value, self.evaluate(x))
+
+            test_value(42.0)  # Variable is created.
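As an aside, the create-or-reuse contract that `test_value` exercises here can be sketched outside the test harness. A minimal graph-mode illustration; the scope and variable names below are ours, not the test's:

    import tensorflow as tf

    g = tf.Graph()
    with g.as_default():
        for _ in range(2):
            # First pass creates "bar/acc"; the second silently reuses it.
            with tf.compat.v1.variable_scope(
                "bar", reuse=tf.compat.v1.AUTO_REUSE
            ):
                tf.compat.v1.get_variable("acc", shape=[])
        # Two get_variable calls, but only one variable was ever created.
        assert len(g.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)) == 1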
+ test_value(13.0) # Variable is reused hereafter. + test_value(17.0) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarScopeGetOrCreateReuseIgnoreFalse(self): + with self.cached_session(): + + def test_value(value): + x = tf.constant(value) + with tf.compat.v1.variable_scope( + "testVarScopeGetOrCreateReuse_bar", reuse=False + ): + _ = tf.compat.v1.assign( + tf.compat.v1.get_variable("var", []), x + ) + # We need to ignore reuse=False in the shim, because the code is + # expected to get rerun each time the user calls the shim. + with tf.compat.v1.variable_scope( + "testVarScopeGetOrCreateReuse_bar", reuse=False + ): + _ = tf.compat.v1.get_variable("var", []) + self.assertEqual(value, self.evaluate(x)) + + test_value(42.0) # Variable is created. + test_value(13.0) # Variable is reused hereafter. + test_value(17.0) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScope(self): + with self.cached_session(): + with tf.name_scope("testVarOpScope1"): + with tf.compat.v1.variable_scope("tower", "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "tower/w:0" + ) + + with tf.name_scope("testVarOpScope2"): + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "default/w:0" + ) + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "default_1/w:0" + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScopeUniqueNamesInterleavedSubstringScopes(self): + with self.cached_session(): + with tf.compat.v1.variable_scope(None, "defaultScope1"): + with tf.compat.v1.variable_scope(None, "layer"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "defaultScope1/layer/w:0", + ) + with tf.compat.v1.variable_scope(None, "defaultScope1"): + with tf.compat.v1.variable_scope(None, "layer"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "defaultScope1_1/layer/w:0", + ) + with tf.compat.v1.variable_scope(None, "defaultScope"): + with tf.compat.v1.variable_scope(None, "layer"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "defaultScope/layer/w:0", + ) + with tf.compat.v1.variable_scope(None, "defaultScope1"): + with tf.compat.v1.variable_scope(None, "layer"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "defaultScope1_2/layer/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScopeUniqueNamesWithJump(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("default") as default: + with tf.compat.v1.variable_scope(None, "layer"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "default/layer/w:0", + ) + with tf.compat.v1.variable_scope(None, "layer"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "default/layer_1/w:0", + ) + with tf.compat.v1.variable_scope(default): + pass + # No matter the jump in the middle, unique numbering continues. 
+ with tf.compat.v1.variable_scope(None, "layer"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "default/layer_2/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScopeReuse(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("outer") as outer: + with tf.compat.v1.variable_scope("tower", "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/tower/w:0", + ) + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + with tf.compat.v1.variable_scope(outer, reuse=True) as outer: + with tf.compat.v1.variable_scope("tower", "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/tower/w:0", + ) + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarScopeGetVar(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("root"): + with tf.compat.v1.variable_scope("towerA") as tower_a: + va = tf.compat.v1.get_variable("v", [1]) + self.assertEqual(va.name, "root/towerA/v:0") + + with tf.compat.v1.variable_scope(tower_a, reuse=True): + va2 = tf.compat.v1.get_variable("v", [1]) + self.assertIs(va2, va) + + with tf.compat.v1.variable_scope("towerB"): + vb = tf.compat.v1.get_variable("v", [1]) + self.assertEqual(vb.name, "root/towerB/v:0") + + with tf.compat.v1.variable_scope("towerA", reuse=True): + va2 = tf.compat.v1.get_variable("v", [1]) + self.assertIs(va2, va) + + with tf.compat.v1.variable_scope("foo"): + with tf.compat.v1.variable_scope("bar"): + v = tf.compat.v1.get_variable("v", [1]) + self.assertEqual(v.name, "root/foo/bar/v:0") + with tf.compat.v1.variable_scope(tower_a, reuse=True): + va3 = tf.compat.v1.get_variable("v", [1]) + self.assertIs(va, va3) + + with self.assertRaises(ValueError) as exc: + with tf.compat.v1.variable_scope(tower_a, reuse=True): + tf.compat.v1.get_variable("v", [2]) # Different shape. 
+ self.assertEqual("shape" in str(exc.exception), True) + + with self.assertRaises(ValueError) as exc: + with tf.compat.v1.variable_scope(tower_a, reuse=True): + tf.compat.v1.get_variable("v", [1], dtype=tf.int32) + self.assertEqual("dtype" in str(exc.exception), True) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarScopeOuterScope(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("outer") as outer: + pass + with tf.compat.v1.variable_scope(outer): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + with tf.compat.v1.variable_scope("default"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + with tf.compat.v1.variable_scope(outer, reuse=True): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + with tf.compat.v1.variable_scope("default", reuse=True): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarScopeNestedOuterScope(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("outer") as outer: + with tf.compat.v1.variable_scope(outer): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + with tf.compat.v1.variable_scope("default"): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + with tf.compat.v1.variable_scope(outer, reuse=True): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + with tf.compat.v1.variable_scope("default", reuse=True): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScopeReuseParam(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("outer") as outer: + with tf.compat.v1.variable_scope("tower", "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/tower/w:0", + ) + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + with tf.compat.v1.variable_scope(outer) as outer: + with tf.compat.v1.variable_scope( + "tower", "default", reuse=True + ): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/tower/w:0", + ) + outer.reuse_variables() + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScopeReuseError(self): + with self.cached_session(): + with self.assertRaises(ValueError): + with tf.compat.v1.variable_scope(None, "default", reuse=True): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/tower/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScopeOuterScope(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("outer") as outer: + pass + with tf.compat.v1.variable_scope(outer, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + with 
tf.compat.v1.variable_scope(outer, "default", reuse=True): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + outer.reuse_variables() + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVarOpScopeNestedOuterScope(self): + with self.cached_session(): + with tf.compat.v1.variable_scope("outer") as outer: + with tf.compat.v1.variable_scope(outer, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + with tf.compat.v1.variable_scope(outer, "default", reuse=True): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "outer/w:0" + ) + with tf.compat.v1.variable_scope(None, "default", []): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testBasicWhenAuxiliaryNameScopeIsFalse(self): + with self.cached_session(): + with tf.compat.v1.variable_scope( + "scope", auxiliary_name_scope=False + ) as scope: + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "scope/w:0" + ) + with tf.compat.v1.variable_scope(scope, auxiliary_name_scope=False): + self.assertEqual( + tf.compat.v1.get_variable("w1", []).name, "scope/w1:0" + ) + + with tf.compat.v1.variable_scope("outer"): + with tf.compat.v1.variable_scope( + "inner", auxiliary_name_scope=False + ) as inner: + self.assertEqual(inner.original_name_scope, "outer/") + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/inner/w:0", + ) + with tf.compat.v1.variable_scope( + inner, auxiliary_name_scope=False + ) as inner1: + self.assertEqual(inner1.original_name_scope, "outer/") + self.assertEqual( + tf.compat.v1.get_variable("w1", []).name, + "outer/inner/w1:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testCreatedByDefaultNameWhenAuxiliaryNameScopeIsFalse(self): + with self.cached_session(): + with tf.compat.v1.variable_scope( + None, default_name="default", auxiliary_name_scope=False + ): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, "default/w:0" + ) + + with tf.compat.v1.variable_scope("outer"): + with tf.compat.v1.variable_scope( + None, default_name="default", auxiliary_name_scope=False + ) as inner: + self.assertEqual(inner.original_name_scope, "outer/") + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/default/w:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testReenterRootScopeWhenAuxiliaryNameScopeIsFalse(self): + with self.cached_session(): + root_scope = tf.compat.v1.get_variable_scope() + with tf.compat.v1.variable_scope( + root_scope, auxiliary_name_scope=False + ): + self.assertEqual(tf.compat.v1.get_variable("w", []).name, "w:0") + + with tf.compat.v1.variable_scope("outer"): + with tf.compat.v1.variable_scope( + root_scope, auxiliary_name_scope=False + ) as inner: + self.assertEqual(inner.original_name_scope, "") + self.assertEqual( + tf.compat.v1.get_variable("w1", []).name, "w1:0" + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def 
testAuxiliaryNameScopeIsInvalid(self): + with self.cached_session(): + with self.assertRaisesRegex(TypeError, "auxiliary_name_scope"): + with tf.compat.v1.variable_scope( + None, default_name="scope", auxiliary_name_scope="invalid" + ): + pass + + with self.assertRaisesRegex(TypeError, "auxiliary_name_scope"): + with tf.compat.v1.variable_scope( + "scope", auxiliary_name_scope="invalid" + ): + pass + + with tf.compat.v1.variable_scope("scope") as scope: + pass + with self.assertRaisesRegex(TypeError, "auxiliary_name_scope"): + with tf.compat.v1.variable_scope( + scope, auxiliary_name_scope="invalid" + ): + pass + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testReuseScopeWithoutNameScopeCollision(self): + # GitHub issue: #13429 + with self.cached_session(): + with tf.compat.v1.variable_scope("outer"): + with tf.compat.v1.variable_scope("inner") as inner: + pass + + with tf.compat.v1.variable_scope( + inner, auxiliary_name_scope=False + ) as scope: + with tf.name_scope(scope.original_name_scope): + self.assertEqual( + tf.compat.v1.get_variable("w", []).name, + "outer/inner/w:0", + ) + + with tf.compat.v1.variable_scope("another"): + with tf.compat.v1.variable_scope( + inner, auxiliary_name_scope=False + ) as scope1: + with tf.name_scope(scope1.original_name_scope): + self.assertEqual( + tf.compat.v1.get_variable("w1", []).name, + "outer/inner/w1:0", + ) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testGetVarWithDevice(self): + g = tf.Graph() + varname_type = [] + + def device_func(op): + if op.type in ["Variable", "VariableV2", "VarHandleOp"]: + varname_type.append((op.name, op.get_attr("dtype"))) + return "/device:GPU:0" + + with g.as_default(): + with tf.compat.v1.device(device_func): + _ = tf.compat.v1.get_variable("x", (100, 200)) + _ = tf.compat.v1.get_variable( + "y", dtype=tf.int64, initializer=numpy.arange(73) + ) + self.assertEqual(varname_type[0], ("x", tf.float32)) + self.assertEqual(varname_type[1], ("y", tf.int64)) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testGetVariableWithRefDtype(self): + v = tf.compat.v1.get_variable("v", shape=[3, 4], dtype=tf.float32) + # Ensure it is possible to do get_variable with a _ref dtype passed in. 
+ _ = tf.compat.v1.get_variable("w", shape=[5, 6], dtype=v.dtype) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testGetVariableWithInitializerWhichTakesNoArgs(self): + v = tf.compat.v1.get_variable("foo", initializer=lambda: [2]) + self.assertEqual(v.name, "foo:0") + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testGetVariableWithInitializerWhichTakesOptionalArgs(self): + v = tf.compat.v1.get_variable("foo", initializer=lambda x=True: [2]) + self.assertEqual(v.name, "foo:0") + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testTwoGraphs(self): + def f(): + g1 = tf.Graph() + g2 = tf.Graph() + with g1.as_default(): + with g2.as_default(): + with tf.compat.v1.variable_scope("_"): + pass + + self.assertRaisesRegex( + ValueError, "'_' is not a valid (?:root )?scope name", f + ) class VariableScopeWithCustomGetterTest(tf.test.TestCase): + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testNonCallableGetterFails(self): + with self.assertRaisesRegex( + ValueError, r"custom_getter .* not callable:" + ): + with tf.compat.v1.variable_scope("scope0", custom_getter=3): + tf.compat.v1.get_variable("name0") + with self.assertRaisesRegex( + ValueError, r"custom_getter .* not callable:" + ): + tf.compat.v1.get_variable("name0", custom_getter=3) + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testNoSideEffectsWithIdentityCustomGetter(self): + called = [0] + + def custom_getter(getter, *args, **kwargs): + called[0] += 1 + return getter(*args, **kwargs) - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testNonCallableGetterFails(self): - with self.assertRaisesRegex(ValueError, r"custom_getter .* not callable:"): - with tf.compat.v1.variable_scope("scope0", custom_getter=3): - tf.compat.v1.get_variable("name0") - with self.assertRaisesRegex(ValueError, r"custom_getter .* not callable:"): - tf.compat.v1.get_variable("name0", custom_getter=3) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testNoSideEffectsWithIdentityCustomGetter(self): - called = [0] - - def custom_getter(getter, *args, **kwargs): - called[0] += 1 - return getter(*args, **kwargs) - - with tf.compat.v1.variable_scope( - "scope", custom_getter=custom_getter) as scope: - v = tf.compat.v1.get_variable("v", [1]) - with tf.compat.v1.variable_scope(scope, reuse=True): - v2 = tf.compat.v1.get_variable("v", [1]) - with tf.compat.v1.variable_scope("new_scope") as new_scope: - v3 = tf.compat.v1.get_variable("v3", [1]) - with tf.compat.v1.variable_scope( - new_scope, reuse=True, custom_getter=custom_getter): - v4 = tf.compat.v1.get_variable("v3", [1]) - - self.assertIs(v, v2) - self.assertIs(v3, v4) - self.assertEqual(3, called[0]) # skipped one in the first new_scope - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testSynchronizationAndAggregationWithCustomGetter(self): - called = [0] - synchronization = tf.VariableSynchronization.AUTO - aggregation = tf.compat.v1.VariableAggregation.NONE - - def custom_getter(getter, *args, **kwargs): - called[0] += 1 - - # Verify synchronization and aggregation kwargs are as expected. 
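For orientation: a `custom_getter` is any callable that takes the default getter plus the `get_variable` keyword arguments and delegates to that getter, as the identity getter above does. A minimal sketch of the pattern; the logging list and names are illustrative only:

    import tensorflow as tf

    g = tf.Graph()
    with g.as_default():
        seen = []

        def logging_getter(getter, *args, **kwargs):
            seen.append(kwargs.get("name"))  # receives the full scoped name
            return getter(*args, **kwargs)   # delegate to the default getter

        with tf.compat.v1.variable_scope("scope", custom_getter=logging_getter):
            tf.compat.v1.get_variable("v", [1])
        assert seen == ["scope/v"]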
- self.assertEqual(kwargs["synchronization"], synchronization) - self.assertEqual(kwargs["aggregation"], aggregation) - return getter(*args, **kwargs) - - with tf.compat.v1.variable_scope("scope", custom_getter=custom_getter): - tf.compat.v1.get_variable("v", [1]) - self.assertEqual(1, called[0]) - - with tf.compat.v1.variable_scope("scope", custom_getter=custom_getter): - synchronization = tf.VariableSynchronization.ON_READ - aggregation = tf.compat.v1.VariableAggregation.MEAN - tf.compat.v1.get_variable( - "v1", [1], synchronization=synchronization, aggregation=aggregation) - - self.assertEqual(2, called[0]) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVariableCreator(self): - variable_names = [] - - def creator_a(next_creator, **kwargs): - variable_names.append(kwargs.get("name", "")) - return next_creator(**kwargs) - - def creator_b(next_creator, **kwargs): - kwargs["name"] = "forced_name" - return next_creator(**kwargs) - - with tf.variable_creator_scope(creator_a): - with tf.variable_creator_scope(creator_b): - tf.compat.v1.Variable(1.0, name="one_name") - - self.assertEqual(variable_names[0], "forced_name") - - called = [False] - - def creater_c(next_creator, **kwargs): - called[0] = True - self.assertEqual(kwargs["synchronization"], - tf.VariableSynchronization.ON_WRITE) - self.assertEqual(kwargs["aggregation"], - tf.compat.v1.VariableAggregation.MEAN) - return next_creator(**kwargs) - - with tf.variable_creator_scope(creater_c): - tf.compat.v1.get_variable( - "v", [], - synchronization=tf.VariableSynchronization.ON_WRITE, - aggregation=tf.compat.v1.VariableAggregation.MEAN) - self.assertTrue(called[0]) - - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testVariableCreatorNestingError(self): - - def creator(next_creator, **kwargs): - return next_creator(**kwargs) - - # Save the state so we can clean up at the end. - graph = tf.compat.v1.get_default_graph() - old_creator_stack = graph._variable_creator_stack - - try: - scope = tf.variable_creator_scope(creator) - scope.__enter__() - with tf.variable_creator_scope(creator): - with self.assertRaises(RuntimeError): - scope.__exit__(None, None, None) - finally: - graph._variable_creator_stack = old_creator_stack - - -class VariableScopeMultithreadedTest(tf.test.TestCase): + with tf.compat.v1.variable_scope( + "scope", custom_getter=custom_getter + ) as scope: + v = tf.compat.v1.get_variable("v", [1]) + with tf.compat.v1.variable_scope(scope, reuse=True): + v2 = tf.compat.v1.get_variable("v", [1]) + with tf.compat.v1.variable_scope("new_scope") as new_scope: + v3 = tf.compat.v1.get_variable("v3", [1]) + with tf.compat.v1.variable_scope( + new_scope, reuse=True, custom_getter=custom_getter + ): + v4 = tf.compat.v1.get_variable("v3", [1]) + + self.assertIs(v, v2) + self.assertIs(v3, v4) + self.assertEqual(3, called[0]) # skipped one in the first new_scope + + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testSynchronizationAndAggregationWithCustomGetter(self): + called = [0] + synchronization = tf.VariableSynchronization.AUTO + aggregation = tf.compat.v1.VariableAggregation.NONE + + def custom_getter(getter, *args, **kwargs): + called[0] += 1 + + # Verify synchronization and aggregation kwargs are as expected. 
+ self.assertEqual(kwargs["synchronization"], synchronization) + self.assertEqual(kwargs["aggregation"], aggregation) + return getter(*args, **kwargs) + + with tf.compat.v1.variable_scope("scope", custom_getter=custom_getter): + tf.compat.v1.get_variable("v", [1]) + self.assertEqual(1, called[0]) + + with tf.compat.v1.variable_scope("scope", custom_getter=custom_getter): + synchronization = tf.VariableSynchronization.ON_READ + aggregation = tf.compat.v1.VariableAggregation.MEAN + tf.compat.v1.get_variable( + "v1", + [1], + synchronization=synchronization, + aggregation=aggregation, + ) - @tf_test_utils.run_in_graph_and_eager_modes - @run_inside_wrap_function_in_eager_mode - def testReenterMainScope(self): + self.assertEqual(2, called[0]) - def thread_fn(graph, main_thread_scope): - with graph.as_default(): - # Variable created with main scope will have prefix "main". - with tf.compat.v1.variable_scope(main_thread_scope): - with tf.compat.v1.variable_scope("foo"): - v = tf.compat.v1.get_variable("v", []) - self.assertEqual("main/foo/v:0", v.name) + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVariableCreator(self): + variable_names = [] - # Variable created outside main scope will not have prefix "main". - with tf.compat.v1.variable_scope("bar"): - v = tf.compat.v1.get_variable("v", []) - self.assertEqual("bar/v:0", v.name) + def creator_a(next_creator, **kwargs): + variable_names.append(kwargs.get("name", "")) + return next_creator(**kwargs) - graph = tf.compat.v1.get_default_graph() - with tf.compat.v1.variable_scope("main") as main_thread_scope: - thread = threading.Thread( - target=thread_fn, args=(graph, main_thread_scope)) - thread.start() - thread.join() + def creator_b(next_creator, **kwargs): + kwargs["name"] = "forced_name" + return next_creator(**kwargs) + with tf.variable_creator_scope(creator_a): + with tf.variable_creator_scope(creator_b): + tf.compat.v1.Variable(1.0, name="one_name") -class CompatV1TemplateScaleByY(base_layer.Layer): + self.assertEqual(variable_names[0], "forced_name") - def __init__(self, **kwargs): - super().__init__(**kwargs) - def my_op(x, scalar_name): - var1 = tf.compat.v1.get_variable( - scalar_name, - shape=[], - regularizer=regularizers.L2(), - initializer=tf.compat.v1.constant_initializer(1.5)) - return x * var1 - self.scale_by_y = tf.compat.v1.make_template( - "scale_by_y", my_op, scalar_name="y") + called = [False] - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - with tf.compat.v1.variable_scope("foo"): - return self.scale_by_y(inputs) + def creater_c(next_creator, **kwargs): + called[0] = True + self.assertEqual( + kwargs["synchronization"], tf.VariableSynchronization.ON_WRITE + ) + self.assertEqual( + kwargs["aggregation"], tf.compat.v1.VariableAggregation.MEAN + ) + return next_creator(**kwargs) + + with tf.variable_creator_scope(creater_c): + tf.compat.v1.get_variable( + "v", + [], + synchronization=tf.VariableSynchronization.ON_WRITE, + aggregation=tf.compat.v1.VariableAggregation.MEAN, + ) + self.assertTrue(called[0]) + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testVariableCreatorNestingError(self): + def creator(next_creator, **kwargs): + return next_creator(**kwargs) -class VariableScopeModule(tf.Module): - """Module that uses the shim.""" + # Save the state so we can clean up at the end. 
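Variable creators, exercised just above, hook in one level below custom getters: each creator receives the next creator in the stack plus the raw `tf.Variable` constructor kwargs. A minimal eager-mode sketch of the renaming pattern from `creator_b`; the names are illustrative:

    import tensorflow as tf

    def renaming_creator(next_creator, **kwargs):
        kwargs["name"] = "forced_name"  # override whatever name was requested
        return next_creator(**kwargs)

    with tf.variable_creator_scope(renaming_creator):
        v = tf.Variable(1.0, name="one_name")
    assert v.name.startswith("forced_name")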
+ graph = tf.compat.v1.get_default_graph() + old_creator_stack = graph._variable_creator_stack - @variable_scope_shim.track_tf1_style_variables - def __call__(self, *args, **kwargs): - with self.name_scope: - return self.forward_pass(*args, **kwargs) + try: + scope = tf.variable_creator_scope(creator) + scope.__enter__() + with tf.variable_creator_scope(creator): + with self.assertRaises(RuntimeError): + scope.__exit__(None, None, None) + finally: + graph._variable_creator_stack = old_creator_stack - def get_compat_v1_regularization_losses(self): - """Dict w/ regularization losses from `get_variable`&`compat.v1.layers`.""" - return {name: regularizer() for name, regularizer - in self._tf1_style_var_store._regularizers.items()} # pylint: disable=protected-access +class VariableScopeMultithreadedTest(tf.test.TestCase): + @tf_test_utils.run_in_graph_and_eager_modes + @run_inside_wrap_function_in_eager_mode + def testReenterMainScope(self): + def thread_fn(graph, main_thread_scope): + with graph.as_default(): + # Variable created with main scope will have prefix "main". + with tf.compat.v1.variable_scope(main_thread_scope): + with tf.compat.v1.variable_scope("foo"): + v = tf.compat.v1.get_variable("v", []) + self.assertEqual("main/foo/v:0", v.name) + + # Variable created outside main scope will not have prefix + # "main". + with tf.compat.v1.variable_scope("bar"): + v = tf.compat.v1.get_variable("v", []) + self.assertEqual("bar/v:0", v.name) + + graph = tf.compat.v1.get_default_graph() + with tf.compat.v1.variable_scope("main") as main_thread_scope: + thread = threading.Thread( + target=thread_fn, args=(graph, main_thread_scope) + ) + thread.start() + thread.join() -@test_combinations.generate(test_combinations.combine(mode=["eager"])) -class TF1VariableScopeLayerTest(tf.test.TestCase, parameterized.TestCase): - def test_get_variable(self): - # Test the shim when using `get_variable` (and regularizers) directly - - class WrappedDenseLayer(base_layer.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs, training=None): - out = inputs - with tf.compat.v1.variable_scope("dense_one"): - # The weights are created with a `regularizer`, - # so the layer should track their regularization losses - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=tf.compat.v1.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=tf.compat.v1.zeros_initializer(), - name="bias") - out = tf.matmul(out, kernel) - out = tf.nn.bias_add(out, bias) - with tf.compat.v1.variable_scope("nested_scope"): - with tf.compat.v1.variable_scope("dense_two"): - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=tf.compat.v1.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=tf.compat.v1.zeros_initializer(), - name="bias") - out = tf.matmul(out, kernel) - out = tf.nn.bias_add(out, bias) - return out - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct output, regularization losses, + variables were made - self.assertEqual(weights.keys(), {"dense_one/bias:0", - "dense_one/kernel:0", - "nested_scope/dense_two/bias:0", - "nested_scope/dense_two/kernel:0"}) - 
self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) - self.assertAllEqual(tf.add_n(layer.losses), 1.5) - - # Verify reuse by updating the variables then re-running - weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) - weights["nested_scope/dense_two/kernel:0"].assign( - tf.ones(shape=(10, 10)) * 2) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) - self.assertAllEqual(tf.add_n(layer.losses), 6) - - def test_compat_v1_layer(self): - # Test the shim when using `compat.v1` layers - - class WrappedDenseLayer(base_layer.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs, training=None): - out = core_layers.dense( - inputs, self.units, name="dense_one", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - with tf.compat.v1.variable_scope("nested_scope"): - out = core_layers.dense( - out, self.units, name="dense_two", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - return out - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct output, losses, + variables were made - self.assertEqual(weights.keys(), {"dense_one/bias:0", - "dense_one/kernel:0", - "nested_scope/dense_two/bias:0", - "nested_scope/dense_two/kernel:0"}) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) - self.assertAllEqual(tf.add_n(layer.losses), 1.5) - - # Verify reuse by updating the variables then re-running - weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) - weights["nested_scope/dense_two/kernel:0"].assign( - tf.ones(shape=(10, 10)) * 2) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) - self.assertAllEqual(tf.add_n(layer.losses), 6) - - def test_shim_exporting(self): - - class WrappedDenseLayer(base_layer.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs, training=None): - out = core_layers.dense( - inputs, - self.units, - name="dense_one", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - with tf.compat.v1.variable_scope("nested_scope"): - out = core_layers.dense( - out, - self.units, - name="dense_two", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - return out - - layer = WrappedDenseLayer(10) - layer(tf.ones(shape=(5, 5))) - - tmp_dir = self.get_temp_dir() - - # Try exporting the layer directly - tf.saved_model.save(layer, tmp_dir) - - # Try exporting the layer nested in a functional model - # This is where saving reflection gets tricky due to - # trying to replace the passed training arg in training=True - # and training=False modes - inp = input_layer_module.Input(shape=(5, 5)) - outs = layer(inp) - model = models.Model(inp, outs) - tf.saved_model.save(model, tmp_dir) - - def test_variable_store_scope_get_variable(self): - # Test the module shim when using `get_variable` (and regularizers) directly - - class WrappedDenseLayer(tf.Module): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - self._variable_store = variable_scope_shim._EagerVariableStore() - - def get_compat_v1_regularization_losses(self): - """Dict w/ regularization losses from 
`get_variable`.""" - return {name: regularizer() for name, regularizer - in self._variable_store._regularizers.items()} # pylint: disable=protected-access - - def __call__(self, inputs, training=None): - with self._variable_store.scope(): - out = inputs - with tf.compat.v1.variable_scope("dense_one"): - # The weights are created with a `regularizer`, - # so the layer should track their regularization losses - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=tf.compat.v1.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=tf.compat.v1.zeros_initializer(), - name="bias") - out = tf.matmul(out, kernel) - out = tf.nn.bias_add(out, bias) - with tf.compat.v1.variable_scope("nested_scope"): - with tf.compat.v1.variable_scope("dense_two"): - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=tf.compat.v1.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=tf.compat.v1.zeros_initializer(), - name="bias") - out = tf.matmul(out, kernel) - out = tf.nn.bias_add(out, bias) - return out - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct output, regularization losses, + variables were made - self.assertEqual(weights.keys(), {"dense_one/bias:0", - "dense_one/kernel:0", - "nested_scope/dense_two/bias:0", - "nested_scope/dense_two/kernel:0"}) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) - self.assertAllEqual( - tf.add_n(layer.get_compat_v1_regularization_losses().values()), 1.5) - - # Verify reuse by updating the variables then re-running - weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) - weights["nested_scope/dense_two/kernel:0"].assign( - tf.ones(shape=(10, 10)) * 2) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) - self.assertAllEqual( - tf.add_n(layer.get_compat_v1_regularization_losses().values()), 6) - - def test_module_get_variable(self): - # Test the module shim when using `get_variable` (and regularizers) directly - - class WrappedDenseLayer(VariableScopeModule): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - def forward_pass(self, inputs, training=None): - out = inputs - with tf.compat.v1.variable_scope("dense_one"): - # The weights are created with a `regularizer`, - # so the layer should track their regularization losses - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=tf.compat.v1.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=tf.compat.v1.zeros_initializer(), - name="bias") - out = tf.matmul(out, kernel) - out = tf.nn.bias_add(out, bias) - with tf.compat.v1.variable_scope("nested_scope"): - with tf.compat.v1.variable_scope("dense_two"): - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(), - initializer=tf.compat.v1.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=tf.compat.v1.zeros_initializer(), - name="bias") - out = tf.matmul(out, kernel) - out = tf.nn.bias_add(out, bias) - return out - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - weights = 
{x.name: x for x in layer.variables} - - # Verify the correct output, regularization losses, + variables were made - self.assertEqual(weights.keys(), {"dense_one/bias:0", - "dense_one/kernel:0", - "nested_scope/dense_two/bias:0", - "nested_scope/dense_two/kernel:0"}) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) - self.assertAllEqual( - tf.add_n(layer.get_compat_v1_regularization_losses().values()), 1.5) - - # Verify reuse by updating the variables then re-running - weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) - weights["nested_scope/dense_two/kernel:0"].assign( - tf.ones(shape=(10, 10)) * 2) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) - self.assertAllEqual( - tf.add_n(layer.get_compat_v1_regularization_losses().values()), 6) - - def test_module_compat_v1_layer(self): - # Test the module shim when using `compat.v1` layers - - class WrappedDenseLayer(VariableScopeModule): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - def forward_pass(self, inputs, training=None): - out = core_layers.dense( - inputs, self.units, name="dense_one", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - with tf.compat.v1.variable_scope("nested_scope"): - out = core_layers.dense( - out, self.units, name="dense_two", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - return out - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct output, losses, + variables were made - self.assertEqual(weights.keys(), {"dense_one/bias:0", - "dense_one/kernel:0", - "nested_scope/dense_two/bias:0", - "nested_scope/dense_two/kernel:0"}) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) - self.assertAllEqual(tf.add_n( - layer.get_compat_v1_regularization_losses().values()), 1.5) - - # Verify reuse by updating the variables then re-running - weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) - weights["nested_scope/dense_two/kernel:0"].assign( - tf.ones(shape=(10, 10)) * 2) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) - self.assertAllEqual(tf.add_n( - layer.get_compat_v1_regularization_losses().values()), 6) - - def test_shim_nesting(self): - # Test that nesting the shim in itself works - - class NestedLayer(base_layer.Layer): - - def __init__(self, units, name, *args, **kwargs): - super().__init__(*args, name=name, **kwargs) - self.units = units - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - out = inputs - with tf.compat.v1.variable_scope(self.name): - # The weights are created with a `regularizer`, - # so the layer should track their regularization losses - kernel = tf.compat.v1.get_variable( - shape=[out.shape[-1], self.units], - regularizer=regularizers.L2(1.0), - initializer=tf.compat.v1.ones_initializer(), - name="kernel") - bias = tf.compat.v1.get_variable( - shape=[self.units,], - initializer=tf.compat.v1.initializers.zeros, - name="bias") - out = tf.linalg.matmul(out, kernel) - out = tf.compat.v1.nn.bias_add(out, bias) - return out - - class WrappedDenseLayer(base_layer.Layer): - - def __init__(self, units, **kwargs): - super().__init__(**kwargs) - self.units = units - self.dense_layer_a = None - self.dense_layer_b = None - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - # Only create the nested 
tf.variable/module/layer/model if it has not - # already been created! - if not self.dense_layer_a: - self.dense_layer_a = NestedLayer(self.units * 2, "dense_one") - out = self.dense_layer_a(inputs) - if not self.dense_layer_b: - self.dense_layer_b = NestedLayer(self.units, "dense_two") - out = self.dense_layer_b(out) - return out - - layer = WrappedDenseLayer(5) - out = layer(tf.ones(shape=(1, 3))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct output, losses, + variables were made - # (Specifically: no double-counting of any weights or reg. losses - # between nested components!) - self.assertEqual({var.name for var in layer.trainable_weights}, - {"dense_one/bias:0", - "dense_one/kernel:0", - "dense_two/bias:0", - "dense_two/kernel:0"}) - self.assertEqual({var.name for var in layer.dense_layer_a.weights}, - {"dense_one/bias:0", - "dense_one/kernel:0"}) - self.assertEqual({var.name for var in layer.dense_layer_b.weights}, - {"dense_two/bias:0", - "dense_two/kernel:0"}) - self.assertAllEqual(out, tf.ones(shape=(1, 5)) * 30) - self.assertAllEqual(tf.add_n(layer.dense_layer_a.losses), 30) - self.assertAllEqual(tf.add_n(layer.dense_layer_b.losses), 50) - self.assertAllEqual(tf.add_n(layer.losses), 80) - - # Verify reuse by updating the variables then re-running - weights["dense_one/kernel:0"].assign(tf.ones(shape=(3, 10)) * 2) - weights["dense_two/kernel:0"].assign( - tf.ones(shape=(10, 5)) * 2) - out = layer(tf.ones(shape=(1, 3))) - self.assertAllEqual(out, tf.ones(shape=(1, 5)) * 120) - self.assertAllEqual(tf.add_n(layer.losses), 320) - - def test_compat_v1_make_template_in_shim_eager(self): - # Test the shim when using `compat.v1.make_template` - # Verify it works correctly in eager - layer = CompatV1TemplateScaleByY() - for _ in range(3): - # Use multiple calls to verify that no new weights get created - self.assertAllEqual(layer(tf.ones(shape=(2, 3))), - tf.constant(1.5, shape=(2, 3))) - self.assertAllEqual({var.name: var.numpy() for var in layer.weights}, - {"foo/scale_by_y/y:0": 1.5}) - self.assertAllEqual(tf.add_n(layer.losses), - regularizers.L2()(layer.weights[0])) - - def test_compat_v1_make_template_in_shim_tf_function(self): - # Test the shim when using `compat.v1.make_template` - # Verify it works correctly in a tf.function - # when made outside the function - layer = CompatV1TemplateScaleByY() - - @tf.function - def foo(x): - return layer(x), tf.add_n(layer.losses) - - for _ in range(3): - # Use multiple calls to verify that no new weights get created - out, loss = foo(tf.ones(shape=(2, 3))) - self.assertAllEqual(out, tf.constant(1.5, shape=(2, 3))) - self.assertAllEqual(loss, regularizers.L2()(layer.weights[0])) - self.assertAllEqual({var.name: var.numpy() for var in layer.weights}, - {"foo/scale_by_y/y:0": 1.5}) - - def test_compat_v1_make_template_in_trace_in_shim(self): - # Test the shim when using `compat.v1.make_template` - # Verify it works correctly when the make_template/layer/shim - # is created on the first tf.function trace! 
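The `make_template` tests in this stretch all lean on a single property: a template binds its variables on the first call and every later call reuses them. A standalone graph-mode sketch, assuming nothing beyond `tf.compat.v1` (the op loosely mirrors `CompatV1TemplateScaleByY`):

    import tensorflow as tf

    def scale(x):
        y = tf.compat.v1.get_variable(
            "y", shape=[], initializer=tf.compat.v1.constant_initializer(1.5)
        )
        return x * y

    g = tf.Graph()
    with g.as_default():
        scale_by_y = tf.compat.v1.make_template("scale_by_y", scale)
        out1 = scale_by_y(tf.ones([2]))
        out2 = scale_by_y(tf.ones([2]))  # second call creates no new variable
        assert len(g.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)) == 1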
- layers = {} - @tf.function - def bar(x): - if "layer" not in layers: - layers["layer"] = CompatV1TemplateScaleByY() - layer = layers["layer"] - return layer(x), tf.add_n(layer.losses) - - for _ in range(3): - # Use multiple calls to verify that no new weights get created - out, loss = bar(tf.ones(shape=(2, 3))) - self.assertAllEqual(out, tf.constant(1.5, shape=(2, 3))) - self.assertAllEqual(loss, regularizers.L2()(layers["layer"].weights[0])) - self.assertAllEqual( - {var.name: var.numpy() for var in layers["layer"].weights}, - {"foo/scale_by_y/y:0": 1.5}) - - def test_only_track_get_variable(self): - # Test the shim does not try tracking or reusing variables - # that were not created by get_variable. These variables/modules/layers - # need to be tracked separately - - class WrappedDenseLayer(base_layer.Layer): - - def __init__(self, units, **kwargs): +class CompatV1TemplateScaleByY(base_layer.Layer): + def __init__(self, **kwargs): super().__init__(**kwargs) - self.units = units - self._dense_model = None - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - dense_layer = core.Dense( - self.units, name="dense", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - return dense_layer(inputs) - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 5) - self.assertEmpty(layer.weights) - - def test_embedded_keras_model(self): - # Test the shim when embedding a Keras model inside of it - # And assigning the model to an attribute + def my_op(x, scalar_name): + var1 = tf.compat.v1.get_variable( + scalar_name, + shape=[], + regularizer=regularizers.L2(), + initializer=tf.compat.v1.constant_initializer(1.5), + ) + return x * var1 - class WrappedDenseLayer(base_layer.Layer): + self.scale_by_y = tf.compat.v1.make_template( + "scale_by_y", my_op, scalar_name="y" + ) - def __init__(self, units, **kwargs): - super().__init__(**kwargs) - self.units = units - self._dense_model = None - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - if not self._dense_model: - inp = input_layer_module.Input(shape=inputs.shape) - dense_layer = core.Dense( - self.units, name="dense", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - self._dense_model = training_module.Model( - inputs=inp, outputs=dense_layer(inp)) - return self._dense_model(inputs) - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct output, losses, + variables were made - self.assertEqual(weights.keys(), {"dense/bias:0", - "dense/kernel:0"}) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 5) - self.assertAllEqual(tf.add_n(layer.losses), 0.5) - - # Verify reuse by updating the variables then re-running - weights["dense/kernel:0"].assign( - tf.ones(shape=(5, 10)) * 2) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 10) - self.assertAllEqual(tf.add_n(layer.losses), 2) - - def test_embedded_keras_model_in_module(self): - # Test the module shim when embedding a Keras model inside of it - # And assigning the model to an attribute - - class WrappedDenseLayer(VariableScopeModule): - - def __init__(self, units, **kwargs): - super().__init__(**kwargs) - self.units = units - self._dense_model = None - - def forward_pass(self, inputs): - if not self._dense_model: - inp = input_layer_module.Input(shape=inputs.shape) - dense_layer = core.Dense( - 
self.units, name="dense", - kernel_initializer=tf.compat.v1.ones_initializer(), - kernel_regularizer="l2") - self._dense_model = training_module.Model( - inputs=inp, outputs=dense_layer(inp)) - return self._dense_model(inputs) - - layer = WrappedDenseLayer(10) - out = layer(tf.ones(shape=(5, 5))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct output, losses, + variables were made - self.assertEqual(weights.keys(), {"dense/bias:0", - "dense/kernel:0"}) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 5) - - # The module shim will only track regularization losses made by - # compat.v1.layers and compat.v1.get_variable. Other regularization - # losses must be tracked by separate user-created mechanisms. - self.assertEmpty(layer.get_compat_v1_regularization_losses()) - - # Verify reuse by updating the variables then re-running - weights["dense/kernel:0"].assign( - tf.ones(shape=(5, 10)) * 2) - out = layer(tf.ones(shape=(5, 5))) - self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 10) - - # The module shim will only track regularization losses made by - # compat.v1.layers and compat.v1.get_variable. Other regularization - # losses must be tracked by separate user-created mechanisms. - self.assertEmpty(layer.get_compat_v1_regularization_losses()) - - def test_training_arg(self): - # Test the shim when passing in a Keras `training` arg - - class TrainingCheckLayer(base_layer.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs, training=None): - if training: - out = core_layers.dense(inputs, self.units, name="dense_training") - else: - out = core_layers.dense(inputs, self.units, name="dense_no_training") - return out - - layer = TrainingCheckLayer(10) - layer(tf.ones(shape=(5, 5)), training=True) - weights = {x.name: x for x in layer.variables} - - # Verify the correct variables were made - self.assertEqual(weights.keys(), - {"dense_training/bias:0", "dense_training/kernel:0"}) - - layer = TrainingCheckLayer(10) - layer(tf.ones(shape=(5, 5))) - weights = {x.name: x for x in layer.variables} - - # Verify the correct variables were made - self.assertEqual(weights.keys(), - {"dense_no_training/bias:0", "dense_no_training/kernel:0"}) - - def test_incorrect_decoration(self): - # Raise an error if you incorrectly decorate a method - # that is not a method of a Module, layer, or model: @variable_scope_shim.track_tf1_style_variables - def foo(x): - return x * 2 - - with self.assertRaisesRegex(ValueError, "does not extend"): - foo(tf.ones(shape=(4, 4))) - - -class GetOrCreateLayerTest(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def test_get_or_create_layer_with_regularizer_eager(self): - - class NestedLayer(base_layer.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - def build_model(self): - inp = input_layer_module.Input(shape=(5, 5)) - dense_layer = core.Dense( - 10, name="dense", kernel_regularizer="l2", - kernel_initializer=tf.compat.v1.ones_initializer()) - model = training_module.Model(inputs=inp, outputs=dense_layer(inp)) - return model - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - # enter a variable scope to check module key naming - with tf.compat.v1.variable_scope("test_scope"): - model = variable_scope_shim.get_or_create_layer( - "dense_model", 
self.build_model) - return model(inputs) - - layer = NestedLayer(10) - x = tf.ones(shape=(5, 5)) - - out1 = layer(tf.expand_dims(x, 0)) - - model1 = layer.submodules[0]._layers["test_scope/dense_model"] - - out2 = layer(tf.expand_dims(x, 0)) - # Verify model produces same output on successive calls with same input - self.assertAllEqual(out1, out2) - - # Verify the model used on subsequent calls is the same - model2 = layer.submodules[0]._layers["test_scope/dense_model"] - self.assertIs(model1, model2) - - # Verify that stored layer computes outputs and losses correctly - weights = {x.name: x for x in layer.variables} - self.assertEqual(weights.keys(), {"dense/bias:0", "dense/kernel:0"}) - self.assertAllEqual(out2, tf.ones(shape=(1, 5, 10)) * 5) - self.assertAllEqual(layer.losses, [0.5]) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def test_get_or_create_layer_no_regularizer_eager(self): - - class NestedLayer(base_layer.Layer): - - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - - def build_model(self): - inp = input_layer_module.Input(shape=(5, 5)) - dense_layer = core.Dense( - 10, name="dense", - kernel_initializer=tf.compat.v1.ones_initializer()) - model = training_module.Model(inputs=inp, outputs=dense_layer(inp)) - return model - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - # enter a variable scope to check module key naming - with tf.compat.v1.variable_scope("test_scope"): - model = variable_scope_shim.get_or_create_layer( - "dense_model", self.build_model) - return model(inputs) - - layer = NestedLayer(10) - x = tf.ones(shape=(5, 5)) - - out1 = layer(tf.expand_dims(x, 0)) - - model1 = layer.submodules[0]._layers["test_scope/dense_model"] - - out2 = layer(tf.expand_dims(x, 0)) - # Verify model produces same output on successive calls with same input - self.assertAllEqual(out1, out2) + def call(self, inputs): + with tf.compat.v1.variable_scope("foo"): + return self.scale_by_y(inputs) - # Verify the model used on subsequent calls is the same - model2 = layer.submodules[0]._layers["test_scope/dense_model"] - self.assertIs(model1, model2) - # Verify that stored layer computes outputs and losses correctly - weights = {x.name: x for x in layer.variables} - self.assertEqual(weights.keys(), {"dense/bias:0", "dense/kernel:0"}) - self.assertAllEqual(out2, tf.ones(shape=(1, 5, 10)) * 5) - self.assertAllEqual(layer.losses, [0.0]) +class VariableScopeModule(tf.Module): + """Module that uses the shim.""" - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def test_get_or_create_layer_tf_function(self): + @variable_scope_shim.track_tf1_style_variables + def __call__(self, *args, **kwargs): + with self.name_scope: + return self.forward_pass(*args, **kwargs) - class NestedLayer(base_layer.Layer): + def get_compat_v1_regularization_losses(self): + """Dict w/ regularization losses from + `get_variable`&`compat.v1.layers`.""" + return { + name: regularizer() + for name, regularizer in self._tf1_style_var_store._regularizers.items() # noqa: E501 + } - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units - def build_model(self): +@test_combinations.generate(test_combinations.combine(mode=["eager"])) +class TF1VariableScopeLayerTest(tf.test.TestCase, parameterized.TestCase): + def test_get_variable(self): + # Test the shim when using `get_variable` (and regularizers) directly + + class 
WrappedDenseLayer(base_layer.Layer): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs, training=None): + out = inputs + with tf.compat.v1.variable_scope("dense_one"): + # The weights are created with a `regularizer`, + # so the layer should track their regularization losses + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=tf.compat.v1.ones_initializer(), + name="kernel", + ) + bias = tf.compat.v1.get_variable( + shape=[ + self.units, + ], + initializer=tf.compat.v1.zeros_initializer(), + name="bias", + ) + out = tf.matmul(out, kernel) + out = tf.nn.bias_add(out, bias) + with tf.compat.v1.variable_scope("nested_scope"): + with tf.compat.v1.variable_scope("dense_two"): + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=tf.compat.v1.ones_initializer(), + name="kernel", + ) + bias = tf.compat.v1.get_variable( + shape=[ + self.units, + ], + initializer=tf.compat.v1.zeros_initializer(), + name="bias", + ) + out = tf.matmul(out, kernel) + out = tf.nn.bias_add(out, bias) + return out + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} + + # Verify the correct output, regularization losses, + variables were + # made + self.assertEqual( + weights.keys(), + { + "dense_one/bias:0", + "dense_one/kernel:0", + "nested_scope/dense_two/bias:0", + "nested_scope/dense_two/kernel:0", + }, + ) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) + self.assertAllEqual(tf.add_n(layer.losses), 1.5) + + # Verify reuse by updating the variables then re-running + weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) + weights["nested_scope/dense_two/kernel:0"].assign( + tf.ones(shape=(10, 10)) * 2 + ) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) + self.assertAllEqual(tf.add_n(layer.losses), 6) + + def test_compat_v1_layer(self): + # Test the shim when using `compat.v1` layers + + class WrappedDenseLayer(base_layer.Layer): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs, training=None): + out = core_layers.dense( + inputs, + self.units, + name="dense_one", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + with tf.compat.v1.variable_scope("nested_scope"): + out = core_layers.dense( + out, + self.units, + name="dense_two", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + return out + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} + + # Verify the correct output, losses, + variables were made + self.assertEqual( + weights.keys(), + { + "dense_one/bias:0", + "dense_one/kernel:0", + "nested_scope/dense_two/bias:0", + "nested_scope/dense_two/kernel:0", + }, + ) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) + self.assertAllEqual(tf.add_n(layer.losses), 1.5) + + # Verify reuse by updating the variables then re-running + weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) + weights["nested_scope/dense_two/kernel:0"].assign( + tf.ones(shape=(10, 10)) * 2 + ) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, 
tf.ones(shape=(5, 10)) * 200) + self.assertAllEqual(tf.add_n(layer.losses), 6) + + def test_shim_exporting(self): + class WrappedDenseLayer(base_layer.Layer): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs, training=None): + out = core_layers.dense( + inputs, + self.units, + name="dense_one", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + with tf.compat.v1.variable_scope("nested_scope"): + out = core_layers.dense( + out, + self.units, + name="dense_two", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + return out + + layer = WrappedDenseLayer(10) + layer(tf.ones(shape=(5, 5))) + + tmp_dir = self.get_temp_dir() + + # Try exporting the layer directly + tf.saved_model.save(layer, tmp_dir) + + # Try exporting the layer nested in a functional model + # This is where saving reflection gets tricky due to + # trying to replace the passed training arg in training=True + # and training=False modes inp = input_layer_module.Input(shape=(5, 5)) - dense_layer = core.Dense( - 10, name="dense", kernel_regularizer="l2", + outs = layer(inp) + model = models.Model(inp, outs) + tf.saved_model.save(model, tmp_dir) + + def test_variable_store_scope_get_variable(self): + # Test the module shim when using `get_variable` (and regularizers) + # directly + + class WrappedDenseLayer(tf.Module): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + self._variable_store = variable_scope_shim._EagerVariableStore() + + def get_compat_v1_regularization_losses(self): + """Dict w/ regularization losses from `get_variable`.""" + return { + name: regularizer() + for name, regularizer in self._variable_store._regularizers.items() # noqa: E501 + } + + def __call__(self, inputs, training=None): + with self._variable_store.scope(): + out = inputs + with tf.compat.v1.variable_scope("dense_one"): + # The weights are created with a `regularizer`, + # so the layer should track their regularization losses + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=tf.compat.v1.ones_initializer(), + name="kernel", + ) + bias = tf.compat.v1.get_variable( + shape=[ + self.units, + ], + initializer=tf.compat.v1.zeros_initializer(), + name="bias", + ) + out = tf.matmul(out, kernel) + out = tf.nn.bias_add(out, bias) + with tf.compat.v1.variable_scope("nested_scope"): + with tf.compat.v1.variable_scope("dense_two"): + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=tf.compat.v1.ones_initializer(), + name="kernel", + ) + bias = tf.compat.v1.get_variable( + shape=[ + self.units, + ], + initializer=tf.compat.v1.zeros_initializer(), + name="bias", + ) + out = tf.matmul(out, kernel) + out = tf.nn.bias_add(out, bias) + return out + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} + + # Verify the correct output, regularization losses, + variables were + # made + self.assertEqual( + weights.keys(), + { + "dense_one/bias:0", + "dense_one/kernel:0", + "nested_scope/dense_two/bias:0", + "nested_scope/dense_two/kernel:0", + }, + ) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) + self.assertAllEqual( + tf.add_n(layer.get_compat_v1_regularization_losses().values()), 1.5 + ) + + # Verify 
reuse by updating the variables then re-running + weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) + weights["nested_scope/dense_two/kernel:0"].assign( + tf.ones(shape=(10, 10)) * 2 + ) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) + self.assertAllEqual( + tf.add_n(layer.get_compat_v1_regularization_losses().values()), 6 + ) + + def test_module_get_variable(self): + # Test the module shim when using `get_variable` (and regularizers) + # directly + + class WrappedDenseLayer(VariableScopeModule): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def forward_pass(self, inputs, training=None): + out = inputs + with tf.compat.v1.variable_scope("dense_one"): + # The weights are created with a `regularizer`, + # so the layer should track their regularization losses + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=tf.compat.v1.ones_initializer(), + name="kernel", + ) + bias = tf.compat.v1.get_variable( + shape=[ + self.units, + ], + initializer=tf.compat.v1.zeros_initializer(), + name="bias", + ) + out = tf.matmul(out, kernel) + out = tf.nn.bias_add(out, bias) + with tf.compat.v1.variable_scope("nested_scope"): + with tf.compat.v1.variable_scope("dense_two"): + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(), + initializer=tf.compat.v1.ones_initializer(), + name="kernel", + ) + bias = tf.compat.v1.get_variable( + shape=[ + self.units, + ], + initializer=tf.compat.v1.zeros_initializer(), + name="bias", + ) + out = tf.matmul(out, kernel) + out = tf.nn.bias_add(out, bias) + return out + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} + + # Verify the correct output, regularization losses, + variables were + # made + self.assertEqual( + weights.keys(), + { + "dense_one/bias:0", + "dense_one/kernel:0", + "nested_scope/dense_two/bias:0", + "nested_scope/dense_two/kernel:0", + }, + ) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) + self.assertAllEqual( + tf.add_n(layer.get_compat_v1_regularization_losses().values()), 1.5 + ) + + # Verify reuse by updating the variables then re-running + weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) + weights["nested_scope/dense_two/kernel:0"].assign( + tf.ones(shape=(10, 10)) * 2 + ) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) + self.assertAllEqual( + tf.add_n(layer.get_compat_v1_regularization_losses().values()), 6 + ) + + def test_module_compat_v1_layer(self): + # Test the module shim when using `compat.v1` layers + + class WrappedDenseLayer(VariableScopeModule): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def forward_pass(self, inputs, training=None): + out = core_layers.dense( + inputs, + self.units, + name="dense_one", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + with tf.compat.v1.variable_scope("nested_scope"): + out = core_layers.dense( + out, + self.units, + name="dense_two", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + return out + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} + + # Verify the correct output, losses, + variables were made + 
self.assertEqual( + weights.keys(), + { + "dense_one/bias:0", + "dense_one/kernel:0", + "nested_scope/dense_two/bias:0", + "nested_scope/dense_two/kernel:0", + }, + ) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 50) + self.assertAllEqual( + tf.add_n(layer.get_compat_v1_regularization_losses().values()), 1.5 + ) + + # Verify reuse by updating the variables then re-running + weights["dense_one/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) + weights["nested_scope/dense_two/kernel:0"].assign( + tf.ones(shape=(10, 10)) * 2 + ) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 200) + self.assertAllEqual( + tf.add_n(layer.get_compat_v1_regularization_losses().values()), 6 + ) + + def test_shim_nesting(self): + # Test that nesting the shim in itself works + + class NestedLayer(base_layer.Layer): + def __init__(self, units, name, *args, **kwargs): + super().__init__(*args, name=name, **kwargs) + self.units = units + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs): + out = inputs + with tf.compat.v1.variable_scope(self.name): + # The weights are created with a `regularizer`, + # so the layer should track their regularization losses + kernel = tf.compat.v1.get_variable( + shape=[out.shape[-1], self.units], + regularizer=regularizers.L2(1.0), + initializer=tf.compat.v1.ones_initializer(), + name="kernel", + ) + bias = tf.compat.v1.get_variable( + shape=[ + self.units, + ], + initializer=tf.compat.v1.initializers.zeros, + name="bias", + ) + out = tf.linalg.matmul(out, kernel) + out = tf.compat.v1.nn.bias_add(out, bias) + return out + + class WrappedDenseLayer(base_layer.Layer): + def __init__(self, units, **kwargs): + super().__init__(**kwargs) + self.units = units + self.dense_layer_a = None + self.dense_layer_b = None + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs): + # Only create the nested tf.variable/module/layer/model if it + # has not already been created! + if not self.dense_layer_a: + self.dense_layer_a = NestedLayer( + self.units * 2, "dense_one" + ) + out = self.dense_layer_a(inputs) + if not self.dense_layer_b: + self.dense_layer_b = NestedLayer(self.units, "dense_two") + out = self.dense_layer_b(out) + return out + + layer = WrappedDenseLayer(5) + out = layer(tf.ones(shape=(1, 3))) + weights = {x.name: x for x in layer.variables} + + # Verify the correct output, losses, + variables were made + # (Specifically: no double-counting of any weights or reg. losses + # between nested components!) 
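The nesting test above depends on the shim's core contract: variables created through `tf.compat.v1.get_variable` inside a decorated `call()` are captured by the layer on the first call and looked up again, rather than recreated, on every later call. A minimal standalone sketch of that contract, using the public `tf.compat.v1.keras.utils.track_tf1_style_variables` export of the decorator under test; the `ScaleByKernel` layer and the `scale` scope name are illustrative assumptions, not part of this diff:

```python
import tensorflow as tf

class ScaleByKernel(tf.keras.layers.Layer):
    """Hypothetical layer showing the capture-and-reuse contract."""

    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def call(self, inputs):
        with tf.compat.v1.variable_scope("scale"):
            # Created on the first call, found and reused on later calls.
            kernel = tf.compat.v1.get_variable(
                "kernel",
                shape=[],
                initializer=tf.compat.v1.ones_initializer(),
            )
        return inputs * kernel

layer = ScaleByKernel()
layer(tf.ones([2, 3]))  # first call creates scale/kernel:0
layer(tf.ones([2, 3]))  # second call reuses it; no new variables appear
print([v.name for v in layer.weights])  # ['scale/kernel:0']
```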
+ self.assertEqual( + {var.name for var in layer.trainable_weights}, + { + "dense_one/bias:0", + "dense_one/kernel:0", + "dense_two/bias:0", + "dense_two/kernel:0", + }, + ) + self.assertEqual( + {var.name for var in layer.dense_layer_a.weights}, + {"dense_one/bias:0", "dense_one/kernel:0"}, + ) + self.assertEqual( + {var.name for var in layer.dense_layer_b.weights}, + {"dense_two/bias:0", "dense_two/kernel:0"}, + ) + self.assertAllEqual(out, tf.ones(shape=(1, 5)) * 30) + self.assertAllEqual(tf.add_n(layer.dense_layer_a.losses), 30) + self.assertAllEqual(tf.add_n(layer.dense_layer_b.losses), 50) + self.assertAllEqual(tf.add_n(layer.losses), 80) + + # Verify reuse by updating the variables then re-running + weights["dense_one/kernel:0"].assign(tf.ones(shape=(3, 10)) * 2) + weights["dense_two/kernel:0"].assign(tf.ones(shape=(10, 5)) * 2) + out = layer(tf.ones(shape=(1, 3))) + self.assertAllEqual(out, tf.ones(shape=(1, 5)) * 120) + self.assertAllEqual(tf.add_n(layer.losses), 320) + + def test_compat_v1_make_template_in_shim_eager(self): + # Test the shim when using `compat.v1.make_template` + # Verify it works correctly in eager + layer = CompatV1TemplateScaleByY() + for _ in range(3): + # Use multiple calls to verify that no new weights get created + self.assertAllEqual( + layer(tf.ones(shape=(2, 3))), tf.constant(1.5, shape=(2, 3)) ) - model = training_module.Model(inputs=inp, outputs=dense_layer(inp)) - return model - - @variable_scope_shim.track_tf1_style_variables - def call(self, inputs): - model = variable_scope_shim.get_or_create_layer( - "dense_model", self.build_model) - return model(inputs) - - layer = NestedLayer(10) - - @tf.function - def foo(x): - return layer(x), tf.add_n(layer.losses) - - # Verify inner model is reused - out1, loss1 = foo(tf.ones(shape=(5, 5))) - out2, loss2 = foo(tf.ones(shape=(5, 5))) - self.assertAllEqual(out1, out2) - self.assertAllEqual(loss1, loss2) - - @tf_test_utils.run_deprecated_v1 - def test_get_or_create_layer_graph(self): - - class NestedLayer(object): + self.assertAllEqual( + {var.name: var.numpy() for var in layer.weights}, + {"foo/scale_by_y/y:0": 1.5}, + ) + self.assertAllEqual( + tf.add_n(layer.losses), regularizers.L2()(layer.weights[0]) + ) + + def test_compat_v1_make_template_in_shim_tf_function(self): + # Test the shim when using `compat.v1.make_template` + # Verify it works correctly in a tf.function + # when made outside the function + layer = CompatV1TemplateScaleByY() + + @tf.function + def foo(x): + return layer(x), tf.add_n(layer.losses) + + for _ in range(3): + # Use multiple calls to verify that no new weights get created + out, loss = foo(tf.ones(shape=(2, 3))) + self.assertAllEqual(out, tf.constant(1.5, shape=(2, 3))) + self.assertAllEqual(loss, regularizers.L2()(layer.weights[0])) + self.assertAllEqual( + {var.name: var.numpy() for var in layer.weights}, + {"foo/scale_by_y/y:0": 1.5}, + ) + + def test_compat_v1_make_template_in_trace_in_shim(self): + # Test the shim when using `compat.v1.make_template` + # Verify it works correctly when the make_template/layer/shim + # is created on the first tf.function trace! 
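The next test (the `bar` function below) covers the trickiest template case: the layer, and with it the `tf.compat.v1.make_template`, is created during the first `tf.function` trace. For readers unfamiliar with templates, here is a hedged stand-in for the `CompatV1TemplateScaleByY` helper these tests rely on (the real helper is defined earlier in this test file and also attaches a regularizer, which this sketch omits):

```python
import tensorflow as tf

class TemplateScaleLayer(tf.keras.layers.Layer):
    """Illustrative stand-in, not the test helper itself."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        def scale_by_y(x):
            # get_variable inside a template shares `y` across calls.
            y = tf.compat.v1.get_variable(
                "y",
                shape=[],
                initializer=tf.compat.v1.constant_initializer(1.5),
            )
            return x * y

        self.scale_by_y = tf.compat.v1.make_template("scale_by_y", scale_by_y)

    @tf.compat.v1.keras.utils.track_tf1_style_variables
    def call(self, inputs):
        return self.scale_by_y(inputs)

layer = TemplateScaleLayer()
print(layer(tf.ones([2, 3])))  # every element scaled to 1.5
```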
+ layers = {} + + @tf.function + def bar(x): + if "layer" not in layers: + layers["layer"] = CompatV1TemplateScaleByY() + layer = layers["layer"] + return layer(x), tf.add_n(layer.losses) + + for _ in range(3): + # Use multiple calls to verify that no new weights get created + out, loss = bar(tf.ones(shape=(2, 3))) + self.assertAllEqual(out, tf.constant(1.5, shape=(2, 3))) + self.assertAllEqual( + loss, regularizers.L2()(layers["layer"].weights[0]) + ) + self.assertAllEqual( + {var.name: var.numpy() for var in layers["layer"].weights}, + {"foo/scale_by_y/y:0": 1.5}, + ) + + def test_only_track_get_variable(self): + # Test the shim does not try tracking or reusing variables + # that were not created by get_variable. These variables/modules/layers + # need to be tracked separately + + class WrappedDenseLayer(base_layer.Layer): + def __init__(self, units, **kwargs): + super().__init__(**kwargs) + self.units = units + self._dense_model = None + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs): + dense_layer = core.Dense( + self.units, + name="dense", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + return dense_layer(inputs) + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 5) + + self.assertEmpty(layer.weights) + + def test_embedded_keras_model(self): + # Test the shim when embedding a Keras model inside of it + # And assigning the model to an attribute + + class WrappedDenseLayer(base_layer.Layer): + def __init__(self, units, **kwargs): + super().__init__(**kwargs) + self.units = units + self._dense_model = None + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs): + if not self._dense_model: + inp = input_layer_module.Input(shape=inputs.shape) + dense_layer = core.Dense( + self.units, + name="dense", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + self._dense_model = training_module.Model( + inputs=inp, outputs=dense_layer(inp) + ) + return self._dense_model(inputs) + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} + + # Verify the correct output, losses, + variables were made + self.assertEqual(weights.keys(), {"dense/bias:0", "dense/kernel:0"}) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 5) + self.assertAllEqual(tf.add_n(layer.losses), 0.5) + + # Verify reuse by updating the variables then re-running + weights["dense/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 10) + self.assertAllEqual(tf.add_n(layer.losses), 2) + + def test_embedded_keras_model_in_module(self): + # Test the module shim when embedding a Keras model inside of it + # And assigning the model to an attribute + + class WrappedDenseLayer(VariableScopeModule): + def __init__(self, units, **kwargs): + super().__init__(**kwargs) + self.units = units + self._dense_model = None + + def forward_pass(self, inputs): + if not self._dense_model: + inp = input_layer_module.Input(shape=inputs.shape) + dense_layer = core.Dense( + self.units, + name="dense", + kernel_initializer=tf.compat.v1.ones_initializer(), + kernel_regularizer="l2", + ) + self._dense_model = training_module.Model( + inputs=inp, outputs=dense_layer(inp) + ) + return self._dense_model(inputs) + + layer = WrappedDenseLayer(10) + out = layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} + 
+ # Verify the correct output, losses, + variables were made + self.assertEqual(weights.keys(), {"dense/bias:0", "dense/kernel:0"}) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 5) + + # The module shim will only track regularization losses made by + # compat.v1.layers and compat.v1.get_variable. Other regularization + # losses must be tracked by separate user-created mechanisms. + self.assertEmpty(layer.get_compat_v1_regularization_losses()) + + # Verify reuse by updating the variables then re-running + weights["dense/kernel:0"].assign(tf.ones(shape=(5, 10)) * 2) + out = layer(tf.ones(shape=(5, 5))) + self.assertAllEqual(out, tf.ones(shape=(5, 10)) * 10) + + # The module shim will only track regularization losses made by + # compat.v1.layers and compat.v1.get_variable. Other regularization + # losses must be tracked by separate user-created mechanisms. + self.assertEmpty(layer.get_compat_v1_regularization_losses()) + + def test_training_arg(self): + # Test the shim when passing in a Keras `training` arg + + class TrainingCheckLayer(base_layer.Layer): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs, training=None): + if training: + out = core_layers.dense( + inputs, self.units, name="dense_training" + ) + else: + out = core_layers.dense( + inputs, self.units, name="dense_no_training" + ) + return out + + layer = TrainingCheckLayer(10) + layer(tf.ones(shape=(5, 5)), training=True) + weights = {x.name: x for x in layer.variables} + + # Verify the correct variables were made + self.assertEqual( + weights.keys(), {"dense_training/bias:0", "dense_training/kernel:0"} + ) - def __init__(self, units, *args, **kwargs): - super().__init__(*args, **kwargs) - self.units = units + layer = TrainingCheckLayer(10) + layer(tf.ones(shape=(5, 5))) + weights = {x.name: x for x in layer.variables} - def build_model(self): - inp = input_layer_module.Input(shape=(5, 5)) - dense_layer = core.Dense( - 10, name="dense", kernel_regularizer="l2", - kernel_initializer=tf.compat.v1.ones_initializer()) - model = training_module.Model(inputs=inp, outputs=dense_layer(inp)) - return model - - def __call__(self, inputs): - model = variable_scope_shim.get_or_create_layer( - "dense_model", self.build_model) - return model(inputs) + # Verify the correct variables were made + self.assertEqual( + weights.keys(), + {"dense_no_training/bias:0", "dense_no_training/kernel:0"}, + ) - with self.cached_session(): - layer = NestedLayer(10) - x = tf.ones(shape=(5, 5)) + def test_incorrect_decoration(self): + # Raise an error if you incorrectly decorate a method + # that is not a method of a Module, layer, or model: + @variable_scope_shim.track_tf1_style_variables + def foo(x): + return x * 2 - out1 = layer(tf.expand_dims(x, 0)) - self.evaluate(tf.compat.v1.global_variables_initializer()) + with self.assertRaisesRegex(ValueError, "does not extend"): + foo(tf.ones(shape=(4, 4))) - # verify output - self.assertEqual(out1.shape, tf.TensorShape([1, 5, 10])) - self.assertAllEqual(out1, tf.ones(shape=(1, 5, 10)) * 5) - # verify variables are tracked - weights = {var.name for var in tf.compat.v1.trainable_variables()} - self.assertEqual(weights, {"dense/bias:0", "dense/kernel:0"}) +class GetOrCreateLayerTest(tf.test.TestCase, parameterized.TestCase): + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_get_or_create_layer_with_regularizer_eager(self): + class 
NestedLayer(base_layer.Layer): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def build_model(self): + inp = input_layer_module.Input(shape=(5, 5)) + dense_layer = core.Dense( + 10, + name="dense", + kernel_regularizer="l2", + kernel_initializer=tf.compat.v1.ones_initializer(), + ) + model = training_module.Model( + inputs=inp, outputs=dense_layer(inp) + ) + return model + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs): + # enter a variable scope to check module key naming + with tf.compat.v1.variable_scope("test_scope"): + model = variable_scope_shim.get_or_create_layer( + "dense_model", self.build_model + ) + return model(inputs) + + layer = NestedLayer(10) + x = tf.ones(shape=(5, 5)) + + out1 = layer(tf.expand_dims(x, 0)) + + model1 = layer.submodules[0]._layers["test_scope/dense_model"] + + out2 = layer(tf.expand_dims(x, 0)) + # Verify model produces same output on successive calls with same input + self.assertAllEqual(out1, out2) + + # Verify the model used on subsequent calls is the same + model2 = layer.submodules[0]._layers["test_scope/dense_model"] + self.assertIs(model1, model2) + + # Verify that stored layer computes outputs and losses correctly + weights = {x.name: x for x in layer.variables} + self.assertEqual(weights.keys(), {"dense/bias:0", "dense/kernel:0"}) + self.assertAllEqual(out2, tf.ones(shape=(1, 5, 10)) * 5) + self.assertAllEqual(layer.losses, [0.5]) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_get_or_create_layer_no_regularizer_eager(self): + class NestedLayer(base_layer.Layer): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def build_model(self): + inp = input_layer_module.Input(shape=(5, 5)) + dense_layer = core.Dense( + 10, + name="dense", + kernel_initializer=tf.compat.v1.ones_initializer(), + ) + model = training_module.Model( + inputs=inp, outputs=dense_layer(inp) + ) + return model + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs): + # enter a variable scope to check module key naming + with tf.compat.v1.variable_scope("test_scope"): + model = variable_scope_shim.get_or_create_layer( + "dense_model", self.build_model + ) + return model(inputs) + + layer = NestedLayer(10) + x = tf.ones(shape=(5, 5)) + + out1 = layer(tf.expand_dims(x, 0)) + + model1 = layer.submodules[0]._layers["test_scope/dense_model"] + + out2 = layer(tf.expand_dims(x, 0)) + # Verify model produces same output on successive calls with same input + self.assertAllEqual(out1, out2) + + # Verify the model used on subsequent calls is the same + model2 = layer.submodules[0]._layers["test_scope/dense_model"] + self.assertIs(model1, model2) + + # Verify that stored layer computes outputs and losses correctly + weights = {x.name: x for x in layer.variables} + self.assertEqual(weights.keys(), {"dense/bias:0", "dense/kernel:0"}) + self.assertAllEqual(out2, tf.ones(shape=(1, 5, 10)) * 5) + self.assertAllEqual(layer.losses, [0.0]) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_get_or_create_layer_tf_function(self): + class NestedLayer(base_layer.Layer): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def build_model(self): + inp = input_layer_module.Input(shape=(5, 5)) + dense_layer = core.Dense( + 10, + name="dense", + kernel_regularizer="l2", + ) + model = training_module.Model( + inputs=inp, 
outputs=dense_layer(inp) + ) + return model + + @variable_scope_shim.track_tf1_style_variables + def call(self, inputs): + model = variable_scope_shim.get_or_create_layer( + "dense_model", self.build_model + ) + return model(inputs) + + layer = NestedLayer(10) + + @tf.function + def foo(x): + return layer(x), tf.add_n(layer.losses) + + # Verify inner model is reused + out1, loss1 = foo(tf.ones(shape=(5, 5))) + out2, loss2 = foo(tf.ones(shape=(5, 5))) + self.assertAllEqual(out1, out2) + self.assertAllEqual(loss1, loss2) + + @tf_test_utils.run_deprecated_v1 + def test_get_or_create_layer_graph(self): + class NestedLayer(object): + def __init__(self, units, *args, **kwargs): + super().__init__(*args, **kwargs) + self.units = units + + def build_model(self): + inp = input_layer_module.Input(shape=(5, 5)) + dense_layer = core.Dense( + 10, + name="dense", + kernel_regularizer="l2", + kernel_initializer=tf.compat.v1.ones_initializer(), + ) + model = training_module.Model( + inputs=inp, outputs=dense_layer(inp) + ) + return model + + def __call__(self, inputs): + model = variable_scope_shim.get_or_create_layer( + "dense_model", self.build_model + ) + return model(inputs) + + with self.cached_session(): + layer = NestedLayer(10) + x = tf.ones(shape=(5, 5)) + + out1 = layer(tf.expand_dims(x, 0)) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # verify output + self.assertEqual(out1.shape, tf.TensorShape([1, 5, 10])) + self.assertAllEqual(out1, tf.ones(shape=(1, 5, 10)) * 5) + + # verify variables are tracked + weights = {var.name for var in tf.compat.v1.trainable_variables()} + self.assertEqual(weights, {"dense/bias:0", "dense/kernel:0"}) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/losses.py b/keras/losses.py index fbffc3984493..dc325e67963c 100644 --- a/keras/losses.py +++ b/keras/losses.py @@ -12,20 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-classes-have-attributes + """Built-in loss functions.""" import abc import functools +import warnings + +import tensorflow.compat.v2 as tf + from keras import backend -from keras.saving.experimental import saving_lib -from keras.utils import generic_utils +from keras.saving import saving_lib +from keras.saving.legacy import serialization as legacy_serialization +from keras.saving.serialization_lib import deserialize_keras_object +from keras.saving.serialization_lib import serialize_keras_object from keras.utils import losses_utils from keras.utils import tf_utils -from keras.utils.generic_utils import deserialize_keras_object -from keras.utils.generic_utils import serialize_keras_object -import tensorflow.compat.v2 as tf + +# isort: off from tensorflow.python.ops.ragged import ragged_map_ops from tensorflow.python.ops.ragged import ragged_util from tensorflow.python.util import dispatch @@ -33,2269 +38,2806 @@ from tensorflow.tools.docs import doc_controls -@keras_export('keras.losses.Loss') +@keras_export("keras.losses.Loss") class Loss: - """Loss base class. + """Loss base class. - To be implemented by subclasses: - * `call()`: Contains the logic for loss calculation using `y_true`, `y_pred`. + To be implemented by subclasses: + * `call()`: Contains the logic for loss calculation using `y_true`, + `y_pred`. 
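Before the docstring's own `MeanSquaredError` example below, a runnable sanity sketch of the subclass contract just stated, using only public `tf.keras` APIs (the `MeanSquaredErrorDemo` name is an illustrative assumption):

```python
import tensorflow as tf

class MeanSquaredErrorDemo(tf.keras.losses.Loss):
    def call(self, y_true, y_pred):
        # Convert explicitly: unlike the built-in wrappers, a bare Loss
        # subclass receives y_true/y_pred exactly as passed by the caller.
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)
        return tf.reduce_mean(tf.math.square(y_pred - y_true), axis=-1)

mse = MeanSquaredErrorDemo()
# Per-sample losses are [0.5, 0.5]; the default AUTO /
# SUM_OVER_BATCH_SIZE reduction averages them to 0.5.
print(mse([[0.0, 1.0], [0.0, 0.0]], [[1.0, 1.0], [1.0, 0.0]]).numpy())
```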
- Example subclass implementation: + Example subclass implementation: - ```python - class MeanSquaredError(Loss): + ```python + class MeanSquaredError(Loss): - def call(self, y_true, y_pred): - return tf.reduce_mean(tf.math.square(y_pred - y_true), axis=-1) - ``` - - When used with `tf.distribute.Strategy`, outside of built-in training loops - such as `tf.keras` `compile` and `fit`, please use 'SUM' or 'NONE' reduction - types, and reduce losses explicitly in your training loop. Using 'AUTO' or - 'SUM_OVER_BATCH_SIZE' will raise an error. + def call(self, y_true, y_pred): + return tf.reduce_mean(tf.math.square(y_pred - y_true), axis=-1) + ``` - Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for more - details on this. + When using a Loss under a `tf.distribute.Strategy`, except when passing it + to `Model.compile()` for use by `Model.fit()`, please use reduction + types 'SUM' or 'NONE', and reduce losses explicitly. Using 'AUTO' or + 'SUM_OVER_BATCH_SIZE' will raise an error when calling the Loss object + from a custom training loop or from user-defined code in `Layer.call()`. + Please see this custom training + [tutorial](https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details on this. + """ - You can implement 'SUM_OVER_BATCH_SIZE' using global batch size like: + def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name=None): + """Initializes `Loss` class. + + Args: + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. + """ + losses_utils.ReductionV2.validate(reduction) + self.reduction = reduction + self.name = name + # SUM_OVER_BATCH is only allowed in losses managed by `fit` or + # CannedEstimators. + self._allow_sum_over_batch_size = False + self._set_name_scope() + + def _set_name_scope(self): + """Creates a valid `name_scope` name.""" + if self.name is None: + self._name_scope = self.__class__.__name__.strip("_") + elif self.name == "": + self._name_scope = "lambda" + else: + # E.g. '_my_loss' => 'my_loss' + self._name_scope = self.name.strip("_") + + def __call__(self, y_true, y_pred, sample_weight=None): + """Invokes the `Loss` instance. + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, + except sparse loss functions such as sparse categorical + crossentropy where shape = `[batch_size, d0, .. dN-1]` + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` + sample_weight: Optional `sample_weight` acts as a coefficient for + the loss. If a scalar is provided, then the loss is simply + scaled by the given value. If `sample_weight` is a tensor of + size `[batch_size]`, then the total loss for each sample of the + batch is rescaled by the corresponding element in the + `sample_weight` vector. If the shape of `sample_weight` is + `[batch_size, d0, .. dN-1]` (or can be broadcasted to this + shape), then each loss element of `y_pred` is scaled by the + corresponding value of `sample_weight`.
(Note on `dN-1`: all loss + functions reduce by 1 dimension, usually axis=-1.) + + Returns: + Weighted loss float `Tensor`. If `reduction` is `NONE`, this has + shape `[batch_size, d0, .. dN-1]`; otherwise, it is scalar. + (Note `dN-1` because all loss functions reduce by 1 dimension, + usually axis=-1.) + + Raises: + ValueError: If the shape of `sample_weight` is invalid. + """ + # If we are wrapping a lambda function, strip '<>' from the name as it + # is not accepted in a scope name. + graph_ctx = tf_utils.graph_context_for_symbolic_tensors( + y_true, y_pred, sample_weight + ) + with backend.name_scope(self._name_scope), graph_ctx: + if tf.executing_eagerly(): + call_fn = self.call + else: + call_fn = tf.__internal__.autograph.tf_convert( + self.call, tf.__internal__.autograph.control_status_ctx() + ) + + losses = call_fn(y_true, y_pred) + + in_mask = losses_utils.get_mask(y_pred) + out_mask = losses_utils.get_mask(losses) + + if in_mask is not None and out_mask is not None: + mask = in_mask & out_mask + elif in_mask is not None: + mask = in_mask + elif out_mask is not None: + mask = out_mask + else: + mask = None + + reduction = self._get_reduction() + sample_weight = losses_utils.apply_valid_mask( + losses, sample_weight, mask, reduction + ) + return losses_utils.compute_weighted_loss( + losses, sample_weight, reduction=reduction + ) + + @classmethod + def from_config(cls, config): + """Instantiates a `Loss` from its config (output of `get_config()`). + + Args: + config: Output of `get_config()`. + + Returns: + A `Loss` instance. + """ + return cls(**config) + + def get_config(self): + """Returns the config dictionary for a `Loss` instance.""" + return {"reduction": self.reduction, "name": self.name} + + @abc.abstractmethod + @doc_controls.for_subclass_implementers + def call(self, y_true, y_pred): + """Invokes the `Loss` instance. + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, + except sparse loss functions such as sparse categorical + crossentropy where shape = `[batch_size, d0, .. dN-1]` + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` + + Returns: + Loss values with the shape `[batch_size, d0, .. dN-1]`. + """ + raise NotImplementedError("Must be implemented in subclasses.") + + def _get_reduction(self): + """Handles `AUTO` reduction cases and returns the reduction value.""" + if ( + not self._allow_sum_over_batch_size + and tf.distribute.has_strategy() + and ( + self.reduction == losses_utils.ReductionV2.AUTO + or self.reduction + == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE + ) + ): + raise ValueError( + "Please use `tf.keras.losses.Reduction.SUM` or " + "`tf.keras.losses.Reduction.NONE` for loss reduction when " + "losses are used with `tf.distribute.Strategy`, " + "except for specifying losses in `Model.compile()` " + "for use by the built-in training loop `Model.fit()`.\n" + "Please see https://www.tensorflow.org/tutorials" + "/distribute/custom_training for more details." + ) + + if self.reduction == losses_utils.ReductionV2.AUTO: + return losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE + return self.reduction + + +@keras_export("keras.__internal__.losses.LossFunctionWrapper", v1=[]) +class LossFunctionWrapper(Loss): + """Wraps a loss function in the `Loss` class.""" + + def __init__( + self, fn, reduction=losses_utils.ReductionV2.AUTO, name=None, **kwargs + ): + """Initializes `LossFunctionWrapper` class. + + Args: + fn: The loss function to wrap, with signature `fn(y_true, y_pred, + **kwargs)`.
+ reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. + **kwargs: The keyword arguments that are passed on to `fn`. + """ + super().__init__(reduction=reduction, name=name) + self.fn = fn + self._fn_kwargs = kwargs - ```python - with strategy.scope(): - loss_obj = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.NONE) - .... - loss = (tf.reduce_sum(loss_obj(labels, predictions)) * - (1. / global_batch_size)) - ``` - """ + def call(self, y_true, y_pred): + """Invokes the `LossFunctionWrapper` instance. + + Args: + y_true: Ground truth values. + y_pred: The predicted values. + + Returns: + Loss values per sample. + """ + if tf.is_tensor(y_pred) and tf.is_tensor(y_true): + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true + ) + + ag_fn = tf.__internal__.autograph.tf_convert( + self.fn, tf.__internal__.autograph.control_status_ctx() + ) + return ag_fn(y_true, y_pred, **self._fn_kwargs) + + def get_config(self): + config = {} + for k, v in self._fn_kwargs.items(): + config[k] = ( + backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v + ) + + if saving_lib.saving_v3_enabled(): + from keras.utils import get_registered_name + + config["fn"] = get_registered_name(self.fn) + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + """Instantiates a `Loss` from its config (output of `get_config()`). + + Args: + config: Output of `get_config()`. + + Returns: + A `keras.losses.Loss` instance. + """ + if saving_lib.saving_v3_enabled(): + fn_name = config.pop("fn", None) + if fn_name and cls is LossFunctionWrapper: + config["fn"] = get(fn_name) + return cls(**config) + + +@keras_export("keras.losses.MeanSquaredError") +class MeanSquaredError(LossFunctionWrapper): + """Computes the mean of squares of errors between labels and predictions. - def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name=None): - """Initializes `Loss` class. + `loss = mean(square(y_true - y_pred))` - Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. - """ - losses_utils.ReductionV2.validate(reduction) - self.reduction = reduction - self.name = name - # SUM_OVER_BATCH is only allowed in losses managed by `fit` or - # CannedEstimators. 
- self._allow_sum_over_batch_size = False - self._set_name_scope() - - def _set_name_scope(self): - """Creates a valid `name_scope` name.""" - if self.name is None: - self._name_scope = self.__class__.__name__ - elif self.name == '': - self._name_scope = 'lambda' - else: - # E.g. '_my_loss' => 'my_loss' - self._name_scope = self.name.strip('_') + Standalone usage: - def __call__(self, y_true, y_pred, sample_weight=None): - """Invokes the `Loss` instance. + >>> y_true = [[0., 1.], [0., 0.]] + >>> y_pred = [[1., 1.], [1., 0.]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> mse = tf.keras.losses.MeanSquaredError() + >>> mse(y_true, y_pred).numpy() + 0.5 - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, except - sparse loss functions such as sparse categorical crossentropy where - shape = `[batch_size, d0, .. dN-1]` - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` - sample_weight: Optional `sample_weight` acts as a coefficient for the - loss. If a scalar is provided, then the loss is simply scaled by the - given value. If `sample_weight` is a tensor of size `[batch_size]`, then - the total loss for each sample of the batch is rescaled by the - corresponding element in the `sample_weight` vector. If the shape of - `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be broadcasted to - this shape), then each loss element of `y_pred` is scaled - by the corresponding value of `sample_weight`. (Note on`dN-1`: all loss - functions reduce by 1 dimension, usually axis=-1.) + >>> # Calling with 'sample_weight'. + >>> mse(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy() + 0.25 - Returns: - Weighted loss float `Tensor`. If `reduction` is `NONE`, this has - shape `[batch_size, d0, .. dN-1]`; otherwise, it is scalar. (Note `dN-1` - because all loss functions reduce by 1 dimension, usually axis=-1.) + >>> # Using 'sum' reduction type. + >>> mse = tf.keras.losses.MeanSquaredError( + ... reduction=tf.keras.losses.Reduction.SUM) + >>> mse(y_true, y_pred).numpy() + 1.0 - Raises: - ValueError: If the shape of `sample_weight` is invalid. - """ - # If we are wrapping a lambda function strip '<>' from the name as it is not - # accepted in scope name. - graph_ctx = tf_utils.graph_context_for_symbolic_tensors( - y_true, y_pred, sample_weight) - with backend.name_scope(self._name_scope), graph_ctx: - if tf.executing_eagerly(): - call_fn = self.call - else: - call_fn = tf.__internal__.autograph.tf_convert(self.call, tf.__internal__.autograph.control_status_ctx()) - losses = call_fn(y_true, y_pred) - return losses_utils.compute_weighted_loss( - losses, sample_weight, reduction=self._get_reduction()) - - @classmethod - def from_config(cls, config): - """Instantiates a `Loss` from its config (output of `get_config()`). + >>> # Using 'none' reduction type. + >>> mse = tf.keras.losses.MeanSquaredError( + ... reduction=tf.keras.losses.Reduction.NONE) + >>> mse(y_true, y_pred).numpy() + array([0.5, 0.5], dtype=float32) - Args: - config: Output of `get_config()`. + Usage with the `compile()` API: - Returns: - A `Loss` instance. + ```python + model.compile(optimizer='sgd', loss=tf.keras.losses.MeanSquaredError()) + ``` """ - return cls(**config) - def get_config(self): - """Returns the config dictionary for a `Loss` instance.""" - return {'reduction': self.reduction, 'name': self.name} + def __init__( + self, reduction=losses_utils.ReductionV2.AUTO, name="mean_squared_error" + ): + """Initializes `MeanSquaredError` instance. 
+ + Args: + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_squared_error'. + """ + super().__init__(mean_squared_error, name=name, reduction=reduction) + + +@keras_export("keras.losses.MeanAbsoluteError") +class MeanAbsoluteError(LossFunctionWrapper): + """Computes the mean of absolute difference between labels and predictions. - @abc.abstractmethod - @doc_controls.for_subclass_implementers - def call(self, y_true, y_pred): - """Invokes the `Loss` instance. + `loss = mean(abs(y_true - y_pred))` - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, except - sparse loss functions such as sparse categorical crossentropy where - shape = `[batch_size, d0, .. dN-1]` - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` + Standalone usage: - Returns: - Loss values with the shape `[batch_size, d0, .. dN-1]`. - """ - raise NotImplementedError('Must be implemented in subclasses.') - - def _get_reduction(self): - """Handles `AUTO` reduction cases and returns the reduction value.""" - if (not self._allow_sum_over_batch_size and - tf.distribute.has_strategy() and - (self.reduction == losses_utils.ReductionV2.AUTO or - self.reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)): - raise ValueError( - 'Please use `tf.keras.losses.Reduction.SUM` or ' - '`tf.keras.losses.Reduction.NONE` for loss reduction when losses are ' - 'used with `tf.distribute.Strategy` outside of the built-in training ' - 'loops. You can implement ' - '`tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` using global batch ' - 'size like:\n```\nwith strategy.scope():\n' - ' loss_obj = tf.keras.losses.CategoricalCrossentropy(' - 'reduction=tf.keras.losses.Reduction.NONE)\n....\n' - ' loss = tf.reduce_sum(loss_obj(labels, predictions)) * ' - '(1. / global_batch_size)\n```\nPlease see ' - 'https://www.tensorflow.org/tutorials/distribute/custom_training' - ' for more details.') - - if self.reduction == losses_utils.ReductionV2.AUTO: - return losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE - return self.reduction + >>> y_true = [[0., 1.], [0., 0.]] + >>> y_pred = [[1., 1.], [1., 0.]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> mae = tf.keras.losses.MeanAbsoluteError() + >>> mae(y_true, y_pred).numpy() + 0.5 + >>> # Calling with 'sample_weight'. + >>> mae(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy() + 0.25 -class LossFunctionWrapper(Loss): - """Wraps a loss function in the `Loss` class.""" + >>> # Using 'sum' reduction type. + >>> mae = tf.keras.losses.MeanAbsoluteError( + ... reduction=tf.keras.losses.Reduction.SUM) + >>> mae(y_true, y_pred).numpy() + 1.0 - def __init__(self, - fn, - reduction=losses_utils.ReductionV2.AUTO, - name=None, - **kwargs): - """Initializes `LossFunctionWrapper` class. + >>> # Using 'none' reduction type. + >>> mae = tf.keras.losses.MeanAbsoluteError( + ... 
reduction=tf.keras.losses.Reduction.NONE) + >>> mae(y_true, y_pred).numpy() + array([0.5, 0.5], dtype=float32) - Args: - fn: The loss function to wrap, with signature `fn(y_true, y_pred, - **kwargs)`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. - **kwargs: The keyword arguments that are passed on to `fn`. + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', loss=tf.keras.losses.MeanAbsoluteError()) + ``` """ - super().__init__(reduction=reduction, name=name) - self.fn = fn - self._fn_kwargs = kwargs - def call(self, y_true, y_pred): - """Invokes the `LossFunctionWrapper` instance. + def __init__( + self, + reduction=losses_utils.ReductionV2.AUTO, + name="mean_absolute_error", + ): + """Initializes `MeanAbsoluteError` instance. + + Args: + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_absolute_error'. + """ + super().__init__(mean_absolute_error, name=name, reduction=reduction) + + +@keras_export("keras.losses.MeanAbsolutePercentageError") +class MeanAbsolutePercentageError(LossFunctionWrapper): + """Computes the mean absolute percentage error between `y_true` & `y_pred`. - Args: - y_true: Ground truth values. - y_pred: The predicted values. + Formula: - Returns: - Loss values per sample. - """ - if tf.is_tensor(y_pred) and tf.is_tensor(y_true): - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true) + `loss = 100 * abs((y_true - y_pred) / y_true)` - ag_fn = tf.__internal__.autograph.tf_convert(self.fn, tf.__internal__.autograph.control_status_ctx()) - return ag_fn(y_true, y_pred, **self._fn_kwargs) + Note that to avoid dividing by zero, a small epsilon value + is added to the denominator. - def get_config(self): - config = {} - for k, v in self._fn_kwargs.items(): - config[k] = backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v + Standalone usage: - if saving_lib._ENABLED: # pylint: disable=protected-access - config['fn'] = generic_utils.get_registered_name(self.fn) + >>> y_true = [[2., 1.], [2., 3.]] + >>> y_pred = [[1., 1.], [1., 0.]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> mape = tf.keras.losses.MeanAbsolutePercentageError() + >>> mape(y_true, y_pred).numpy() + 50. - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + >>> # Calling with 'sample_weight'. + >>> mape(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy() + 20. 
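A hedged numpy recomputation of the MAPE doctest values, to make the arithmetic explicit (the epsilon the docstring mentions is ignored here, since no true value is zero; the per-sample values also match the `Reduction.NONE` output quoted further down):

```python
import numpy as np

y_true = np.array([[2.0, 1.0], [2.0, 3.0]])
y_pred = np.array([[1.0, 1.0], [1.0, 0.0]])

# 100 * |y_true - y_pred| / y_true, averaged over the last axis:
#   sample 0: 100 * mean(0.5, 0.0) = 25.
#   sample 1: 100 * mean(0.5, 1.0) = 75.
per_sample = 100.0 * np.mean(np.abs(y_true - y_pred) / y_true, axis=-1)
print(per_sample)         # [25. 75.]
print(per_sample.mean())  # 50.0, the default reduction shown above
# With sample_weight=[0.7, 0.3]: (25*0.7 + 75*0.3) / batch_size = 20.
print((per_sample * np.array([0.7, 0.3])).sum() / 2)  # 20.0
```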
- @classmethod - def from_config(cls, config): - """Instantiates a `Loss` from its config (output of `get_config()`). + >>> # Using 'sum' reduction type. + >>> mape = tf.keras.losses.MeanAbsolutePercentageError( + ... reduction=tf.keras.losses.Reduction.SUM) + >>> mape(y_true, y_pred).numpy() + 100. - Args: - config: Output of `get_config()`. + >>> # Using 'none' reduction type. + >>> mape = tf.keras.losses.MeanAbsolutePercentageError( + ... reduction=tf.keras.losses.Reduction.NONE) + >>> mape(y_true, y_pred).numpy() + array([25., 75.], dtype=float32) - Returns: - A `keras.losses.Loss` instance. + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss=tf.keras.losses.MeanAbsolutePercentageError()) + ``` """ - if saving_lib._ENABLED: # pylint: disable=protected-access - fn_name = config.pop('fn', None) - if fn_name and cls is LossFunctionWrapper: - config['fn'] = get(fn_name) - return cls(**config) + def __init__( + self, + reduction=losses_utils.ReductionV2.AUTO, + name="mean_absolute_percentage_error", + ): + """Initializes `MeanAbsolutePercentageError` instance. + + Args: + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_absolute_percentage_error'. + """ + super().__init__( + mean_absolute_percentage_error, name=name, reduction=reduction + ) + + +@keras_export("keras.losses.MeanSquaredLogarithmicError") +class MeanSquaredLogarithmicError(LossFunctionWrapper): + """Computes the mean squared logarithmic error between `y_true` & `y_pred`. -@keras_export('keras.losses.MeanSquaredError') -class MeanSquaredError(LossFunctionWrapper): - """Computes the mean of squares of errors between labels and predictions. + `loss = square(log(y_true + 1.) - log(y_pred + 1.))` - `loss = square(y_true - y_pred)` + Standalone usage: - Standalone usage: + >>> y_true = [[0., 1.], [0., 0.]] + >>> y_pred = [[1., 1.], [1., 0.]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> msle = tf.keras.losses.MeanSquaredLogarithmicError() + >>> msle(y_true, y_pred).numpy() + 0.240 - >>> y_true = [[0., 1.], [0., 0.]] - >>> y_pred = [[1., 1.], [1., 0.]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> mse = tf.keras.losses.MeanSquaredError() - >>> mse(y_true, y_pred).numpy() - 0.5 + >>> # Calling with 'sample_weight'. + >>> msle(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy() + 0.120 - >>> # Calling with 'sample_weight'. - >>> mse(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy() - 0.25 + >>> # Using 'sum' reduction type. + >>> msle = tf.keras.losses.MeanSquaredLogarithmicError( + ... reduction=tf.keras.losses.Reduction.SUM) + >>> msle(y_true, y_pred).numpy() + 0.480 - >>> # Using 'sum' reduction type. - >>> mse = tf.keras.losses.MeanSquaredError( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> mse(y_true, y_pred).numpy() - 1.0 + >>> # Using 'none' reduction type. + >>> msle = tf.keras.losses.MeanSquaredLogarithmicError( + ... 
reduction=tf.keras.losses.Reduction.NONE) + >>> msle(y_true, y_pred).numpy() + array([0.240, 0.240], dtype=float32) - >>> # Using 'none' reduction type. - >>> mse = tf.keras.losses.MeanSquaredError( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> mse(y_true, y_pred).numpy() - array([0.5, 0.5], dtype=float32) + Usage with the `compile()` API: - Usage with the `compile()` API: + ```python + model.compile(optimizer='sgd', + loss=tf.keras.losses.MeanSquaredLogarithmicError()) + ``` + """ - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.MeanSquaredError()) - ``` - """ + def __init__( + self, + reduction=losses_utils.ReductionV2.AUTO, + name="mean_squared_logarithmic_error", + ): + """Initializes `MeanSquaredLogarithmicError` instance. + + Args: + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_squared_logarithmic_error'. + """ + super().__init__( + mean_squared_logarithmic_error, name=name, reduction=reduction + ) + + +@keras_export("keras.losses.BinaryCrossentropy") +class BinaryCrossentropy(LossFunctionWrapper): + """Computes the cross-entropy loss between true labels and predicted labels. - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='mean_squared_error'): - """Initializes `MeanSquaredError` instance. + Use this cross-entropy loss for binary (0 or 1) classification applications. + The loss function requires the following inputs: - Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. Defaults to 'mean_squared_error'. + - `y_true` (true label): This is either 0 or 1. + - `y_pred` (predicted value): This is the model's prediction, i.e, a single + floating-point value which either represents a + [logit](https://en.wikipedia.org/wiki/Logit), (i.e, value in [-inf, inf] + when `from_logits=True`) or a probability (i.e, value in [0., 1.] when + `from_logits=False`). + + **Recommended Usage:** (set `from_logits=True`) + + With `tf.keras` API: + + ```python + model.compile( + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + .... 
+ ) + ``` + + As a standalone function: + + >>> # Example 1: (batch_size = 1, number of samples = 4) + >>> y_true = [0, 1, 0, 0] + >>> y_pred = [-18.6, 0.51, 2.94, -12.8] + >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True) + >>> bce(y_true, y_pred).numpy() + 0.865 + + >>> # Example 2: (batch_size = 2, number of samples = 4) + >>> y_true = [[0, 1], [0, 0]] + >>> y_pred = [[-18.6, 0.51], [2.94, -12.8]] + >>> # Using default 'auto'/'sum_over_batch_size' reduction type. + >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True) + >>> bce(y_true, y_pred).numpy() + 0.865 + >>> # Using 'sample_weight' attribute + >>> bce(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() + 0.243 + >>> # Using 'sum' reduction` type. + >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True, + ... reduction=tf.keras.losses.Reduction.SUM) + >>> bce(y_true, y_pred).numpy() + 1.730 + >>> # Using 'none' reduction type. + >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True, + ... reduction=tf.keras.losses.Reduction.NONE) + >>> bce(y_true, y_pred).numpy() + array([0.235, 1.496], dtype=float32) + + **Default Usage:** (set `from_logits=False`) + + >>> # Make the following updates to the above "Recommended Usage" section + >>> # 1. Set `from_logits=False` + >>> tf.keras.losses.BinaryCrossentropy() # OR ...('from_logits=False') + >>> # 2. Update `y_pred` to use probabilities instead of logits + >>> y_pred = [0.6, 0.3, 0.2, 0.8] # OR [[0.6, 0.3], [0.2, 0.8]] """ - super().__init__(mean_squared_error, name=name, reduction=reduction) + def __init__( + self, + from_logits=False, + label_smoothing=0.0, + axis=-1, + reduction=losses_utils.ReductionV2.AUTO, + name="binary_crossentropy", + ): + """Initializes `BinaryCrossentropy` instance. + + Args: + from_logits: Whether to interpret `y_pred` as a tensor of + [logit](https://en.wikipedia.org/wiki/Logit) values. By default, + we assume that `y_pred` contains probabilities (i.e., values in + [0, 1]). + label_smoothing: Float in [0, 1]. When 0, no smoothing occurs. + When > 0, we compute the loss between the predicted labels and a + smoothed version of the true labels, where the smoothing + squeezes the labels towards 0.5. Larger values of + `label_smoothing` correspond to heavier smoothing. + axis: The axis along which to compute crossentropy (the features + axis). Defaults to -1. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Name for the op. Defaults to 'binary_crossentropy'. + """ + super().__init__( + binary_crossentropy, + name=name, + reduction=reduction, + from_logits=from_logits, + label_smoothing=label_smoothing, + axis=axis, + ) + self.from_logits = from_logits + + +@keras_export("keras.losses.BinaryFocalCrossentropy") +class BinaryFocalCrossentropy(LossFunctionWrapper): + """Computes focal cross-entropy loss between true labels and predictions. -@keras_export('keras.losses.MeanAbsoluteError') -class MeanAbsoluteError(LossFunctionWrapper): - """Computes the mean of absolute difference between labels and predictions. 
+
+
+@keras_export("keras.losses.BinaryFocalCrossentropy")
+class BinaryFocalCrossentropy(LossFunctionWrapper):
+    """Computes focal cross-entropy loss between true labels and predictions.
-@keras_export('keras.losses.MeanAbsoluteError')
-class MeanAbsoluteError(LossFunctionWrapper):
-  """Computes the mean of absolute difference between labels and predictions.
+
+    Binary cross-entropy loss is often used for binary (0 or 1) classification
+    tasks. The loss function requires the following inputs:
-
-  `loss = abs(y_true - y_pred)`
+
+    - `y_true` (true label): This is either 0 or 1.
+    - `y_pred` (predicted value): This is the model's prediction, i.e., a single
+      floating-point value which either represents a
+      [logit](https://en.wikipedia.org/wiki/Logit) (i.e., value in [-inf, inf]
+      when `from_logits=True`) or a probability (i.e., value in [0., 1.] when
+      `from_logits=False`).
-
-  Standalone usage:
+
+    According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it
+    helps to apply a "focal factor" to down-weight easy examples and focus more
+    on hard examples. By default, the focal tensor is computed as follows:
-
-  >>> y_true = [[0., 1.], [0., 0.]]
-  >>> y_pred = [[1., 1.], [1., 0.]]
-  >>> # Using 'auto'/'sum_over_batch_size' reduction type.
-  >>> mae = tf.keras.losses.MeanAbsoluteError()
-  >>> mae(y_true, y_pred).numpy()
-  0.5
+
+    `focal_factor = (1 - output) ** gamma` for class 1
+    `focal_factor = output ** gamma` for class 0
+    where `gamma` is a focusing parameter. When `gamma=0`, this function is
+    equivalent to the binary crossentropy loss.
-
-  >>> # Calling with 'sample_weight'.
-  >>> mae(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy()
-  0.25
+
+    With the `compile()` API:
-
-  >>> # Using 'sum' reduction type.
-  >>> mae = tf.keras.losses.MeanAbsoluteError(
-  ...     reduction=tf.keras.losses.Reduction.SUM)
-  >>> mae(y_true, y_pred).numpy()
-  1.0
+
+    ```python
+    model.compile(
+        loss=tf.keras.losses.BinaryFocalCrossentropy(gamma=2.0, from_logits=True),
+        ....
+    )
+    ```
+
+    As a standalone function:
+
+    >>> # Example 1: (batch_size = 1, number of samples = 4)
+    >>> y_true = [0, 1, 0, 0]
+    >>> y_pred = [-18.6, 0.51, 2.94, -12.8]
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=2,
+    ...     from_logits=True)
+    >>> loss(y_true, y_pred).numpy()
+    0.691
+
+    >>> # Apply class weight
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(
+    ...     apply_class_balancing=True, gamma=2, from_logits=True)
+    >>> loss(y_true, y_pred).numpy()
+    0.51
+
+    >>> # Example 2: (batch_size = 2, number of samples = 4)
+    >>> y_true = [[0, 1], [0, 0]]
+    >>> y_pred = [[-18.6, 0.51], [2.94, -12.8]]
+    >>> # Using default 'auto'/'sum_over_batch_size' reduction type.
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=3,
+    ...     from_logits=True)
+    >>> loss(y_true, y_pred).numpy()
+    0.647
+
+    >>> # Apply class weight
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(
+    ...     apply_class_balancing=True, gamma=3, from_logits=True)
+    >>> loss(y_true, y_pred).numpy()
+    0.482
+
+    >>> # Using 'sample_weight' attribute with focal effect
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=3,
+    ...     from_logits=True)
+    >>> loss(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy()
+    0.133
+
+    >>> # Apply class weight
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(
+    ...     apply_class_balancing=True, gamma=3, from_logits=True)
+    >>> loss(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy()
+    0.097
+
+    >>> # Using 'sum' reduction type.
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=4,
+    ...     from_logits=True,
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> loss(y_true, y_pred).numpy()
+    1.222
+
+    >>> # Apply class weight
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(
+    ...     apply_class_balancing=True, gamma=4, from_logits=True,
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> loss(y_true, y_pred).numpy()
+    0.914
+
+    >>> # Using 'none' reduction type.
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(
+    ...     gamma=5, from_logits=True,
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> loss(y_true, y_pred).numpy()
+    array([0.0017, 1.1561], dtype=float32)
+
+    >>> # Apply class weight
+    >>> loss = tf.keras.losses.BinaryFocalCrossentropy(
+    ...     apply_class_balancing=True, gamma=5, from_logits=True,
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> loss(y_true, y_pred).numpy()
+    array([0.0004, 0.8670], dtype=float32)
-
-  >>> # Using 'none' reduction type.
-  >>> mae = tf.keras.losses.MeanAbsoluteError(
-  ...     reduction=tf.keras.losses.Reduction.NONE)
-  >>> mae(y_true, y_pred).numpy()
-  array([0.5, 0.5], dtype=float32)
-
-  Usage with the `compile()` API:
+
+    Args:
+        apply_class_balancing: A bool, whether to apply weight balancing on the
+            binary classes 0 and 1.
+        alpha: A weight balancing factor for class 1, default is `0.25` as
+            mentioned in the reference [Lin et al., 2018](
+            https://arxiv.org/pdf/1708.02002.pdf). The weight for class 0 is
+            `1.0 - alpha`.
+        gamma: A focusing parameter used to compute the focal factor, default is
+            `2.0` as mentioned in the reference
+            [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf).
+        from_logits: Whether to interpret `y_pred` as a tensor of
+            [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we
+            assume that `y_pred` contains probabilities (i.e., values in
+            `[0, 1]`).
+        label_smoothing: Float in `[0, 1]`. When `0`, no smoothing occurs.
+            When > `0`, we compute the loss between the predicted labels and a
+            smoothed version of the true labels, where the smoothing squeezes
+            the labels towards `0.5`. Larger values of `label_smoothing`
+            correspond to heavier smoothing.
+        axis: The axis along which to compute crossentropy (the features axis).
+            Defaults to `-1`.
+        reduction: Type of `tf.keras.losses.Reduction` to apply to
+            loss. Default value is `AUTO`. `AUTO` indicates that the reduction
+            option will be determined by the usage context. For almost all cases
+            this defaults to `SUM_OVER_BATCH_SIZE`. When used under a
+            `tf.distribute.Strategy`, except via `Model.compile()` and
+            `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
+            will raise an error. Please see this custom training [tutorial](
+            https://www.tensorflow.org/tutorials/distribute/custom_training)
+            for more details.
+        name: Name for the op. Defaults to 'binary_focal_crossentropy'.
+    """
-
-  ```python
-  model.compile(optimizer='sgd', loss=tf.keras.losses.MeanAbsoluteError())
-  ```
-  """
+
+    def __init__(
+        self,
+        apply_class_balancing=False,
+        alpha=0.25,
+        gamma=2.0,
+        from_logits=False,
+        label_smoothing=0.0,
+        axis=-1,
+        reduction=losses_utils.ReductionV2.AUTO,
+        name="binary_focal_crossentropy",
+    ):
+        """Initializes `BinaryFocalCrossentropy` instance."""
+        super().__init__(
+            binary_focal_crossentropy,
+            apply_class_balancing=apply_class_balancing,
+            alpha=alpha,
+            gamma=gamma,
+            name=name,
+            reduction=reduction,
+            from_logits=from_logits,
+            label_smoothing=label_smoothing,
+            axis=axis,
+        )
+        self.from_logits = from_logits
+        self.apply_class_balancing = apply_class_balancing
+        self.alpha = alpha
+        self.gamma = gamma
+
+    def get_config(self):
+        config = {
+            "apply_class_balancing": self.apply_class_balancing,
+            "alpha": self.alpha,
+            "gamma": self.gamma,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
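The focal factor documented above multiplies the per-element cross-entropy before reduction, which is easy to check numerically. A sketch of the documented formula, with `label_smoothing` and `apply_class_balancing` assumed left at their defaults:

```python
import tensorflow as tf

y_true = tf.constant([[0., 1.], [0., 0.]])
logits = tf.constant([[-18.6, 0.51], [2.94, -12.8]])
gamma = 3.0

p = tf.sigmoid(logits)  # from_logits=True, so map logits to probabilities
bce = -(y_true * tf.math.log(p) + (1. - y_true) * tf.math.log(1. - p))
# focal_factor = (1 - output)**gamma for class 1, output**gamma for class 0.
focal = tf.where(y_true == 1.0, (1.0 - p) ** gamma, p ** gamma)
manual = tf.reduce_mean(focal * bce, axis=-1)

loss_fn = tf.keras.losses.BinaryFocalCrossentropy(
    gamma=gamma, from_logits=True,
    reduction=tf.keras.losses.Reduction.NONE)
# The two should track each other up to epsilon clipping.
print(manual.numpy())
print(loss_fn(y_true, logits).numpy())
```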
+ """ - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.MeanAbsoluteError()) - ``` - """ + def __init__( + self, + apply_class_balancing=False, + alpha=0.25, + gamma=2.0, + from_logits=False, + label_smoothing=0.0, + axis=-1, + reduction=losses_utils.ReductionV2.AUTO, + name="binary_focal_crossentropy", + ): + """Initializes `BinaryFocalCrossentropy` instance.""" + super().__init__( + binary_focal_crossentropy, + apply_class_balancing=apply_class_balancing, + alpha=alpha, + gamma=gamma, + name=name, + reduction=reduction, + from_logits=from_logits, + label_smoothing=label_smoothing, + axis=axis, + ) + self.from_logits = from_logits + self.apply_class_balancing = apply_class_balancing + self.alpha = alpha + self.gamma = gamma + + def get_config(self): + config = { + "apply_class_balancing": self.apply_class_balancing, + "alpha": self.alpha, + "gamma": self.gamma, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.losses.CategoricalCrossentropy") +class CategoricalCrossentropy(LossFunctionWrapper): + """Computes the crossentropy loss between the labels and predictions. + + Use this crossentropy loss function when there are two or more label + classes. We expect labels to be provided in a `one_hot` representation. If + you want to provide labels as integers, please use + `SparseCategoricalCrossentropy` loss. There should be `# classes` floating + point values per feature. + + In the snippet below, there is `# classes` floating pointing values per + example. The shape of both `y_pred` and `y_true` are + `[batch_size, num_classes]`. + + Standalone usage: + + >>> y_true = [[0, 1, 0], [0, 0, 1]] + >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> cce = tf.keras.losses.CategoricalCrossentropy() + >>> cce(y_true, y_pred).numpy() + 1.177 + + >>> # Calling with 'sample_weight'. + >>> cce(y_true, y_pred, sample_weight=tf.constant([0.3, 0.7])).numpy() + 0.814 + + >>> # Using 'sum' reduction type. + >>> cce = tf.keras.losses.CategoricalCrossentropy( + ... reduction=tf.keras.losses.Reduction.SUM) + >>> cce(y_true, y_pred).numpy() + 2.354 + + >>> # Using 'none' reduction type. + >>> cce = tf.keras.losses.CategoricalCrossentropy( + ... reduction=tf.keras.losses.Reduction.NONE) + >>> cce(y_true, y_pred).numpy() + array([0.0513, 2.303], dtype=float32) + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss=tf.keras.losses.CategoricalCrossentropy()) + ``` + """ - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='mean_absolute_error'): - """Initializes `MeanAbsoluteError` instance. + def __init__( + self, + from_logits=False, + label_smoothing=0.0, + axis=-1, + reduction=losses_utils.ReductionV2.AUTO, + name="categorical_crossentropy", + ): + """Initializes `CategoricalCrossentropy` instance. + + Args: + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability + distribution. + label_smoothing: Float in [0, 1]. When > 0, label values are + smoothed, meaning the confidence on label values are relaxed. + For example, if `0.1`, use `0.1 / num_classes` for non-target + labels and `0.9 + 0.1 / num_classes` for target labels. + axis: The axis along which to compute crossentropy (the features + axis). Defaults to -1. + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + Default value is `AUTO`. 
`AUTO` indicates that the reduction
+                option will be determined by the usage context. For almost all
+                cases this defaults to `SUM_OVER_BATCH_SIZE`. When used under a
+                `tf.distribute.Strategy`, except via `Model.compile()` and
+                `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
+                will raise an error. Please see this custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance.
+                Defaults to 'categorical_crossentropy'.
+        """
+        super().__init__(
+            categorical_crossentropy,
+            name=name,
+            reduction=reduction,
+            from_logits=from_logits,
+            label_smoothing=label_smoothing,
+            axis=axis,
+        )
+
+
+@keras_export("keras.losses.CategoricalFocalCrossentropy")
+class CategoricalFocalCrossentropy(LossFunctionWrapper):
+    """Computes the alpha balanced focal crossentropy loss.
+
+    Use this crossentropy loss function when there are two or more label
+    classes and if you want to handle class imbalance without using
+    `class_weights`. We expect labels to be provided in a `one_hot`
+    representation.
+
+    According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it
+    helps to apply a focal factor to down-weight easy examples and focus more on
+    hard examples. The general formula for the focal loss (FL)
+    is as follows:
+
+    `FL(p_t) = −(1 − p_t)^gamma * log(p_t)`
+
+    where `p_t` is defined as follows:
+    `p_t = output if y_true == 1, else 1 - output`
+
+    `(1 − p_t)^gamma` is the `modulating_factor`, where `gamma` is a focusing
+    parameter. When `gamma` = 0, there is no focal effect on the cross entropy.
+    `gamma` reduces the importance given to simple examples in a smooth manner.
+
+    The authors use the alpha-balanced variant of focal loss (FL) in the paper:
+    `FL(p_t) = −alpha * (1 − p_t)^gamma * log(p_t)`
+
+    where `alpha` is the weight factor for the classes. If `alpha` = 1, the
+    loss won't be able to handle class imbalance properly as all
+    classes will have the same weight. This can be a constant or a list of
+    constants. If `alpha` is a list, it must have the same length as the number
+    of classes.
+
+    The formula above can be generalized to:
+    `FL(p_t) = alpha * (1 − p_t)^gamma * CrossEntropy(y_true, y_pred)`
+
+    where the minus sign comes from `CrossEntropy(y_true, y_pred)` (CE).
+
+    Extending this to the multi-class case is straightforward:
+    `FL(p_t) = alpha * (1 − p_t)^gamma * CategoricalCE(y_true, y_pred)`
+
+    In the snippet below, there are `# classes` floating point values per
+    example. The shape of both `y_pred` and `y_true` is
+    `[batch_size, num_classes]`.
+
+    Standalone usage:
+
+    >>> y_true = [[0., 1., 0.], [0., 0., 1.]]
+    >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> cce = tf.keras.losses.CategoricalFocalCrossentropy()
+    >>> cce(y_true, y_pred).numpy()
+    0.23315276
+
+    >>> # Calling with 'sample_weight'.
+    >>> cce(y_true, y_pred, sample_weight=tf.constant([0.3, 0.7])).numpy()
+    0.1632
+
+    >>> # Using 'sum' reduction type.
+    >>> cce = tf.keras.losses.CategoricalFocalCrossentropy(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> cce(y_true, y_pred).numpy()
+    0.46631
+
+    >>> # Using 'none' reduction type.
+    >>> cce = tf.keras.losses.CategoricalFocalCrossentropy(
+    ...
reduction=tf.keras.losses.Reduction.NONE)
+    >>> cce(y_true, y_pred).numpy()
+    array([3.2058331e-05, 4.6627346e-01], dtype=float32)
+
+    Usage with the `compile()` API:
+    ```python
+    model.compile(optimizer='adam',
+                  loss=tf.keras.losses.CategoricalFocalCrossentropy())
+    ```
+
     Args:
-      reduction: Type of `tf.keras.losses.Reduction` to apply to
-        loss. Default value is `AUTO`. `AUTO` indicates that the reduction
-        option will be determined by the usage context. For almost all cases
-        this defaults to `SUM_OVER_BATCH_SIZE`. When used with
-        `tf.distribute.Strategy`, outside of built-in training loops such as
-        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
-        will raise an error. Please see this custom training [tutorial](
-        https://www.tensorflow.org/tutorials/distribute/custom_training) for
-        more details.
-      name: Optional name for the instance. Defaults to 'mean_absolute_error'.
+        alpha: A weight balancing factor for all classes, default is `0.25` as
+            mentioned in the reference. It can be a list of floats or a scalar.
+            In the multi-class case, alpha may be set by inverse class
+            frequency by using `compute_class_weight` from `sklearn.utils`.
+        gamma: A focusing parameter, default is `2.0` as mentioned in the
+            reference. It helps to gradually reduce the importance given to
+            simple (easy) examples in a smooth manner.
+        from_logits: Whether `y_pred` is expected to be a logits tensor. By
+            default, we consider that `y_pred` encodes a probability
+            distribution.
+        label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
+            meaning the confidence on label values is relaxed. For example, if
+            `0.1`, use `0.1 / num_classes` for non-target labels and
+            `0.9 + 0.1 / num_classes` for target labels.
+        axis: The axis along which to compute crossentropy (the features
+            axis). Defaults to -1.
+        reduction: Type of `tf.keras.losses.Reduction` to apply to
+            loss. Default value is `AUTO`. `AUTO` indicates that the reduction
+            option will be determined by the usage context. For almost all cases
+            this defaults to `SUM_OVER_BATCH_SIZE`. When used under a
+            `tf.distribute.Strategy`, except via `Model.compile()` and
+            `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
+            will raise an error. Please see this custom training [tutorial](
+            https://www.tensorflow.org/tutorials/distribute/custom_training)
+            for more details.
+        name: Optional name for the instance.
+            Defaults to 'categorical_focal_crossentropy'.
+    """
-    super().__init__(mean_absolute_error, name=name, reduction=reduction)
+
+    def __init__(
+        self,
+        alpha=0.25,
+        gamma=2.0,
+        from_logits=False,
+        label_smoothing=0.0,
+        axis=-1,
+        reduction=losses_utils.ReductionV2.AUTO,
+        name="categorical_focal_crossentropy",
+    ):
+        """Initializes `CategoricalFocalCrossentropy` instance."""
+        super().__init__(
+            categorical_focal_crossentropy,
+            alpha=alpha,
+            gamma=gamma,
+            name=name,
+            reduction=reduction,
+            from_logits=from_logits,
+            label_smoothing=label_smoothing,
+            axis=axis,
+        )
+        self.from_logits = from_logits
+        self.alpha = alpha
+        self.gamma = gamma
+
+    def get_config(self):
+        config = {
+            "alpha": self.alpha,
+            "gamma": self.gamma,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
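The generalized form `alpha * (1 − p_t)^gamma * CE` can be checked against the class defaults. A sketch, assuming probability inputs whose rows already sum to 1 (so the internal renormalization is a no-op):

```python
import tensorflow as tf

y_true = tf.constant([[0., 1., 0.], [0., 0., 1.]])
y_pred = tf.constant([[0.05, 0.95, 0.], [0.1, 0.8, 0.1]])
alpha, gamma = 0.25, 2.0

# p_t = output where y_true == 1; CE picks out -log(p_t) per example.
eps = tf.keras.backend.epsilon()
p = tf.clip_by_value(y_pred, eps, 1.0 - eps)
ce = -y_true * tf.math.log(p)
focal = alpha * (1.0 - p) ** gamma
manual = tf.reduce_sum(focal * ce, axis=-1)

cce = tf.keras.losses.CategoricalFocalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE)
# Expected to approximate [3.2058331e-05, 4.6627346e-01] from the doctest.
print(manual.numpy())
print(cce(y_true, y_pred).numpy())
```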
+
+
+@keras_export("keras.losses.SparseCategoricalCrossentropy")
+class SparseCategoricalCrossentropy(LossFunctionWrapper):
+    """Computes the crossentropy loss between the labels and predictions.
+
+    Use this crossentropy loss function when there are two or more label
+    classes. We expect labels to be provided as integers. If you want to
+    provide labels using `one-hot` representation, please use
+    `CategoricalCrossentropy` loss. There should be `# classes` floating point
+    values per feature for `y_pred` and a single floating point value per
+    feature for `y_true`.
+
+    In the snippet below, there is a single floating point value per example for
+    `y_true` and `# classes` floating point values per example for `y_pred`.
+    The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is
+    `[batch_size, num_classes]`.
+
+    Standalone usage:
+
+    >>> y_true = [1, 2]
+    >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> scce = tf.keras.losses.SparseCategoricalCrossentropy()
+    >>> scce(y_true, y_pred).numpy()
+    1.177
+
+    >>> # Calling with 'sample_weight'.
+    >>> scce(y_true, y_pred, sample_weight=tf.constant([0.3, 0.7])).numpy()
+    0.814
+
+    >>> # Using 'sum' reduction type.
+    >>> scce = tf.keras.losses.SparseCategoricalCrossentropy(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> scce(y_true, y_pred).numpy()
+    2.354
+
+    >>> # Using 'none' reduction type.
+    >>> scce = tf.keras.losses.SparseCategoricalCrossentropy(
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> scce(y_true, y_pred).numpy()
+    array([0.0513, 2.303], dtype=float32)
+
+    Usage with the `compile()` API:
+
+    ```python
+    model.compile(optimizer='sgd',
+                  loss=tf.keras.losses.SparseCategoricalCrossentropy())
+    ```
+    """
-@keras_export('keras.losses.MeanAbsolutePercentageError')
-class MeanAbsolutePercentageError(LossFunctionWrapper):
-  """Computes the mean absolute percentage error between `y_true` and `y_pred`.
+
+    def __init__(
+        self,
+        from_logits=False,
+        ignore_class=None,
+        reduction=losses_utils.ReductionV2.AUTO,
+        name="sparse_categorical_crossentropy",
+    ):
+        """Initializes `SparseCategoricalCrossentropy` instance.
+
+        Args:
+            from_logits: Whether `y_pred` is expected to be a logits tensor. By
+                default, we assume that `y_pred` encodes a probability
+                distribution.
+            ignore_class: Optional integer. The ID of a class to be ignored
+                during loss computation. This is useful, for example, in
+                segmentation problems featuring a "void" class (commonly -1 or
+                255) in segmentation maps.
+                By default (`ignore_class=None`), all classes are considered.
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`. `AUTO` indicates that the
+                reduction option will be determined by the usage context. For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance.
+                Defaults to 'sparse_categorical_crossentropy'.
+        """
+        super().__init__(
+            sparse_categorical_crossentropy,
+            name=name,
+            reduction=reduction,
+            from_logits=from_logits,
+            ignore_class=ignore_class,
+        )
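As the docstring notes, the only difference from `CategoricalCrossentropy` is the label encoding; a short sketch making the equivalence concrete:

```python
import tensorflow as tf

y_true_int = [1, 2]                              # integer labels
y_true_onehot = tf.one_hot(y_true_int, depth=3)  # one-hot equivalent
y_pred = [[0.05, 0.95, 0.0], [0.1, 0.8, 0.1]]

scce = tf.keras.losses.SparseCategoricalCrossentropy()
cce = tf.keras.losses.CategoricalCrossentropy()

# Same data, two encodings: both should report ~1.177.
print(scce(y_true_int, y_pred).numpy())
print(cce(y_true_onehot, y_pred).numpy())
```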
+
+
+@keras_export("keras.losses.CosineSimilarity")
+class CosineSimilarity(LossFunctionWrapper):
+    """Computes the cosine similarity between labels and predictions.
+
+    Note that the loss is a number between -1 and 1: values closer to -1
+    indicate greater similarity, 0 indicates orthogonality, and values closer
+    to 1 indicate greater dissimilarity. This makes it usable as a loss
+    function in a setting where you try to maximize the proximity between
+    predictions and targets. If either `y_true` or `y_pred` is a zero vector,
+    cosine similarity will be 0 regardless of the proximity between
+    predictions and targets.
+
+    `loss = -sum(l2_norm(y_true) * l2_norm(y_pred))`
+
+    Standalone usage:
+
+    >>> y_true = [[0., 1.], [1., 1.]]
+    >>> y_pred = [[1., 0.], [1., 1.]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1)
+    >>> # l2_norm(y_true) = [[0., 1.], [1./1.414, 1./1.414]]
+    >>> # l2_norm(y_pred) = [[1., 0.], [1./1.414, 1./1.414]]
+    >>> # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]]
+    >>> # loss = -mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1))
+    >>> #      = -((0. + 0.) + (0.5 + 0.5)) / 2
+    >>> cosine_loss(y_true, y_pred).numpy()
+    -0.5
+
+    >>> # Calling with 'sample_weight'.
+    >>> cosine_loss(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy()
+    -0.0999
+
+    >>> # Using 'sum' reduction type.
+    >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1,
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> cosine_loss(y_true, y_pred).numpy()
+    -0.999
+
+    >>> # Using 'none' reduction type.
+    >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1,
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> cosine_loss(y_true, y_pred).numpy()
+    array([-0., -0.999], dtype=float32)
+
+    Usage with the `compile()` API:
+
+    ```python
+    model.compile(optimizer='sgd',
+                  loss=tf.keras.losses.CosineSimilarity(axis=1))
+    ```
-
-  Formula:
+
+    Args:
+        axis: The axis along which the cosine similarity is computed
+            (the features axis). Defaults to -1.
+        reduction: Type of `tf.keras.losses.Reduction` to apply to loss.
+            Default value is `AUTO`. `AUTO` indicates that the reduction option
+            will be determined by the usage context. For almost all cases this
+            defaults to `SUM_OVER_BATCH_SIZE`. When used under a
+            `tf.distribute.Strategy`, except via `Model.compile()` and
+            `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an
+            error. Please see this custom training [tutorial](
+            https://www.tensorflow.org/tutorials/distribute/custom_training)
+            for more details.
+        name: Optional name for the instance. Defaults to 'cosine_similarity'.
+    """
+
+    def __init__(
+        self,
+        axis=-1,
+        reduction=losses_utils.ReductionV2.AUTO,
+        name="cosine_similarity",
+    ):
+        super().__init__(
+            cosine_similarity, reduction=reduction, name=name, axis=axis
+        )
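The `-0.5` result above follows directly from two `l2_normalize` calls; a sketch reproducing it:

```python
import tensorflow as tf

y_true = tf.constant([[0., 1.], [1., 1.]])
y_pred = tf.constant([[1., 0.], [1., 1.]])

# loss = -mean(sum(l2_norm(y_true) * l2_norm(y_pred), axis=1))
true_n = tf.math.l2_normalize(y_true, axis=1)
pred_n = tf.math.l2_normalize(y_pred, axis=1)
manual = -tf.reduce_mean(tf.reduce_sum(true_n * pred_n, axis=1))

cosine_loss = tf.keras.losses.CosineSimilarity(axis=1)
print(manual.numpy())                       # -0.5
print(cosine_loss(y_true, y_pred).numpy())  # -0.5
```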
-
-  `loss = 100 * abs((y_true - y_pred) / y_true)`
-
-  Note that to avoid dividing by zero, a small epsilon value
-  is added to the denominator.
+
+
+@keras_export("keras.losses.Hinge")
+class Hinge(LossFunctionWrapper):
+    """Computes the hinge loss between `y_true` & `y_pred`.
-
-  Standalone usage:
+
+    `loss = maximum(1 - y_true * y_pred, 0)`
-
-  >>> y_true = [[2., 1.], [2., 3.]]
-  >>> y_pred = [[1., 1.], [1., 0.]]
-  >>> # Using 'auto'/'sum_over_batch_size' reduction type.
-  >>> mape = tf.keras.losses.MeanAbsolutePercentageError()
-  >>> mape(y_true, y_pred).numpy()
-  50.
+
+    `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+    provided we will convert them to -1 or 1.
-
-  >>> # Calling with 'sample_weight'.
-  >>> mape(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy()
-  20.
+
+    Standalone usage:
-
-  >>> # Using 'sum' reduction type.
-  >>> mape = tf.keras.losses.MeanAbsolutePercentageError(
-  ...     reduction=tf.keras.losses.Reduction.SUM)
-  >>> mape(y_true, y_pred).numpy()
-  100.
+
+    >>> y_true = [[0., 1.], [0., 0.]]
+    >>> y_pred = [[0.6, 0.4], [0.4, 0.6]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> h = tf.keras.losses.Hinge()
+    >>> h(y_true, y_pred).numpy()
+    1.3
-
-  >>> # Using 'none' reduction type.
-  >>> mape = tf.keras.losses.MeanAbsolutePercentageError(
-  ...     reduction=tf.keras.losses.Reduction.NONE)
-  >>> mape(y_true, y_pred).numpy()
-  array([25., 75.], dtype=float32)
+
+    >>> # Calling with 'sample_weight'.
+    >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy()
+    0.55
-
-  Usage with the `compile()` API:
+
+    >>> # Using 'sum' reduction type.
+    >>> h = tf.keras.losses.Hinge(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> h(y_true, y_pred).numpy()
+    2.6
-
-  ```python
-  model.compile(optimizer='sgd',
-                loss=tf.keras.losses.MeanAbsolutePercentageError())
-  ```
-  """
+
+    >>> # Using 'none' reduction type.
+    >>> h = tf.keras.losses.Hinge(
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> h(y_true, y_pred).numpy()
+    array([1.1, 1.5], dtype=float32)
-
-  def __init__(self,
-               reduction=losses_utils.ReductionV2.AUTO,
-               name='mean_absolute_percentage_error'):
-    """Initializes `MeanAbsolutePercentageError` instance.
+
+    Usage with the `compile()` API:
-
-    Args:
-      reduction: Type of `tf.keras.losses.Reduction` to apply to
-        loss. Default value is `AUTO`. `AUTO` indicates that the reduction
-        option will be determined by the usage context. For almost all cases
-        this defaults to `SUM_OVER_BATCH_SIZE`. When used with
-        `tf.distribute.Strategy`, outside of built-in training loops such as
-        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
-        will raise an error. Please see this custom training [tutorial](
-        https://www.tensorflow.org/tutorials/distribute/custom_training) for
-        more details.
-      name: Optional name for the instance. Defaults to
-        'mean_absolute_percentage_error'.
+
+    ```python
+    model.compile(optimizer='sgd', loss=tf.keras.losses.Hinge())
+    ```
+    """
-    super().__init__(
-        mean_absolute_percentage_error, name=name, reduction=reduction)
+
+    def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="hinge"):
+        """Initializes `Hinge` instance.
+
+        Args:
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`. `AUTO` indicates that the
+                reduction option will be determined by the usage context. For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance. Defaults to 'hinge'.
+        """
+        super().__init__(hinge, name=name, reduction=reduction)
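The 0/1-to--1/1 conversion mentioned in the docstring can be spelled out by hand; a sketch reproducing the per-sample values from the 'none' reduction example:

```python
import tensorflow as tf

y_true = tf.constant([[0., 1.], [0., 0.]])
y_pred = tf.constant([[0.6, 0.4], [0.4, 0.6]])

# 0/1 labels are first mapped to -1/1, as the docstring states.
t = 2.0 * y_true - 1.0
manual = tf.reduce_mean(tf.maximum(1.0 - t * y_pred, 0.0), axis=-1)

h = tf.keras.losses.Hinge(reduction=tf.keras.losses.Reduction.NONE)
print(manual.numpy())             # [1.1, 1.5]
print(h(y_true, y_pred).numpy())  # [1.1, 1.5]
```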
+
+
+@keras_export("keras.losses.SquaredHinge")
+class SquaredHinge(LossFunctionWrapper):
+    """Computes the squared hinge loss between `y_true` & `y_pred`.
-@keras_export('keras.losses.MeanSquaredLogarithmicError')
-class MeanSquaredLogarithmicError(LossFunctionWrapper):
-  """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
+
+    `loss = square(maximum(1 - y_true * y_pred, 0))`
-
-  `loss = square(log(y_true + 1.) - log(y_pred + 1.))`
+
+    `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+    provided we will convert them to -1 or 1.
-
-  Standalone usage:
+
+    Standalone usage:
-
-  >>> y_true = [[0., 1.], [0., 0.]]
-  >>> y_pred = [[1., 1.], [1., 0.]]
-  >>> # Using 'auto'/'sum_over_batch_size' reduction type.
-  >>> msle = tf.keras.losses.MeanSquaredLogarithmicError()
-  >>> msle(y_true, y_pred).numpy()
-  0.240
+
+    >>> y_true = [[0., 1.], [0., 0.]]
+    >>> y_pred = [[0.6, 0.4], [0.4, 0.6]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> h = tf.keras.losses.SquaredHinge()
+    >>> h(y_true, y_pred).numpy()
+    1.86
-
-  >>> # Calling with 'sample_weight'.
-  >>> msle(y_true, y_pred, sample_weight=[0.7, 0.3]).numpy()
-  0.120
+
+    >>> # Calling with 'sample_weight'.
+    >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy()
+    0.73
-
-  >>> # Using 'sum' reduction type.
-  >>> msle = tf.keras.losses.MeanSquaredLogarithmicError(
-  ...     reduction=tf.keras.losses.Reduction.SUM)
-  >>> msle(y_true, y_pred).numpy()
-  0.480
+
+    >>> # Using 'sum' reduction type.
+    >>> h = tf.keras.losses.SquaredHinge(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> h(y_true, y_pred).numpy()
+    3.72
-
-  >>> # Using 'none' reduction type.
-  >>> msle = tf.keras.losses.MeanSquaredLogarithmicError(
-  ...     reduction=tf.keras.losses.Reduction.NONE)
-  >>> msle(y_true, y_pred).numpy()
-  array([0.240, 0.240], dtype=float32)
+
+    >>> # Using 'none' reduction type.
+    >>> h = tf.keras.losses.SquaredHinge(
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> h(y_true, y_pred).numpy()
+    array([1.46, 2.26], dtype=float32)
-
-  Usage with the `compile()` API:
+
+    Usage with the `compile()` API:
-
-  ```python
-  model.compile(optimizer='sgd',
-                loss=tf.keras.losses.MeanSquaredLogarithmicError())
-  ```
-  """
+
+    ```python
+    model.compile(optimizer='sgd', loss=tf.keras.losses.SquaredHinge())
+    ```
+    """
-
-  def __init__(self,
-               reduction=losses_utils.ReductionV2.AUTO,
-               name='mean_squared_logarithmic_error'):
-    """Initializes `MeanSquaredLogarithmicError` instance.
+
+    def __init__(
+        self, reduction=losses_utils.ReductionV2.AUTO, name="squared_hinge"
+    ):
+        """Initializes `SquaredHinge` instance.
+
+        Args:
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`. `AUTO` indicates that the
+                reduction option will be determined by the usage context. For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance. Defaults to 'squared_hinge'.
+        """
+        super().__init__(squared_hinge, name=name, reduction=reduction)
+
+
+@keras_export("keras.losses.CategoricalHinge")
+class CategoricalHinge(LossFunctionWrapper):
+    """Computes the categorical hinge loss between `y_true` & `y_pred`.
-
-    Args:
-      reduction: Type of `tf.keras.losses.Reduction` to apply to
-        loss. Default value is `AUTO`. `AUTO` indicates that the reduction
-        option will be determined by the usage context. For almost all cases
-        this defaults to `SUM_OVER_BATCH_SIZE`. When used with
-        `tf.distribute.Strategy`, outside of built-in training loops such as
-        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
-        will raise an error. Please see this custom training [tutorial](
-        https://www.tensorflow.org/tutorials/distribute/custom_training) for
-        more details.
-      name: Optional name for the instance. Defaults to
-        'mean_squared_logarithmic_error'.
- """ - super().__init__( - mean_squared_logarithmic_error, name=name, reduction=reduction) + `loss = maximum(neg - pos + 1, 0)` + where `neg=maximum((1-y_true)*y_pred) and pos=sum(y_true*y_pred)` + Standalone usage: -@keras_export('keras.losses.BinaryCrossentropy') -class BinaryCrossentropy(LossFunctionWrapper): - """Computes the cross-entropy loss between true labels and predicted labels. - - Use this cross-entropy loss for binary (0 or 1) classification applications. - The loss function requires the following inputs: - - - `y_true` (true label): This is either 0 or 1. - - `y_pred` (predicted value): This is the model's prediction, i.e, a single - floating-point value which either represents a - [logit](https://en.wikipedia.org/wiki/Logit), (i.e, value in [-inf, inf] - when `from_logits=True`) or a probability (i.e, value in [0., 1.] when - `from_logits=False`). - - **Recommended Usage:** (set `from_logits=True`) - - With `tf.keras` API: - - ```python - model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - .... - ) - ``` - - As a standalone function: - - >>> # Example 1: (batch_size = 1, number of samples = 4) - >>> y_true = [0, 1, 0, 0] - >>> y_pred = [-18.6, 0.51, 2.94, -12.8] - >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True) - >>> bce(y_true, y_pred).numpy() - 0.865 - - >>> # Example 2: (batch_size = 2, number of samples = 4) - >>> y_true = [[0, 1], [0, 0]] - >>> y_pred = [[-18.6, 0.51], [2.94, -12.8]] - >>> # Using default 'auto'/'sum_over_batch_size' reduction type. - >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True) - >>> bce(y_true, y_pred).numpy() - 0.865 - >>> # Using 'sample_weight' attribute - >>> bce(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() - 0.243 - >>> # Using 'sum' reduction` type. - >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True, - ... reduction=tf.keras.losses.Reduction.SUM) - >>> bce(y_true, y_pred).numpy() - 1.730 - >>> # Using 'none' reduction type. - >>> bce = tf.keras.losses.BinaryCrossentropy(from_logits=True, - ... reduction=tf.keras.losses.Reduction.NONE) - >>> bce(y_true, y_pred).numpy() - array([0.235, 1.496], dtype=float32) - - **Default Usage:** (set `from_logits=False`) - - >>> # Make the following updates to the above "Recommended Usage" section - >>> # 1. Set `from_logits=False` - >>> tf.keras.losses.BinaryCrossentropy() # OR ...('from_logits=False') - >>> # 2. Update `y_pred` to use probabilities instead of logits - >>> y_pred = [0.6, 0.3, 0.2, 0.8] # OR [[0.6, 0.3], [0.2, 0.8]] - """ - - def __init__(self, - from_logits=False, - label_smoothing=0., - axis=-1, - reduction=losses_utils.ReductionV2.AUTO, - name='binary_crossentropy'): - """Initializes `BinaryCrossentropy` instance. + >>> y_true = [[0, 1], [0, 0]] + >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> h = tf.keras.losses.CategoricalHinge() + >>> h(y_true, y_pred).numpy() + 1.4 - Args: - from_logits: Whether to interpret `y_pred` as a tensor of - [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we - assume that `y_pred` contains probabilities (i.e., values in [0, 1]). - label_smoothing: Float in [0, 1]. When 0, no smoothing occurs. When > 0, - we compute the loss between the predicted labels and a smoothed version - of the true labels, where the smoothing squeezes the labels towards 0.5. - Larger values of `label_smoothing` correspond to heavier smoothing. - axis: The axis along which to compute crossentropy (the features axis). - Defaults to -1. 
-    Args:
-      from_logits: Whether to interpret `y_pred` as a tensor of
-        [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we
-        assume that `y_pred` contains probabilities (i.e., values in [0, 1]).
-      label_smoothing: Float in [0, 1]. When 0, no smoothing occurs. When > 0,
-        we compute the loss between the predicted labels and a smoothed version
-        of the true labels, where the smoothing squeezes the labels towards 0.5.
-        Larger values of `label_smoothing` correspond to heavier smoothing.
-      axis: The axis along which to compute crossentropy (the features axis).
-        Defaults to -1.
-      reduction: Type of `tf.keras.losses.Reduction` to apply to
-        loss. Default value is `AUTO`. `AUTO` indicates that the reduction
-        option will be determined by the usage context. For almost all cases
-        this defaults to `SUM_OVER_BATCH_SIZE`. When used with
-        `tf.distribute.Strategy`, outside of built-in training loops such as
-        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
-        will raise an error. Please see this custom training [tutorial](
-        https://www.tensorflow.org/tutorials/distribute/custom_training) for
-        more details.
-      name: Name for the op. Defaults to 'binary_crossentropy'.
+
+    >>> # Calling with 'sample_weight'.
+    >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy()
+    0.6
+
+    >>> # Using 'sum' reduction type.
+    >>> h = tf.keras.losses.CategoricalHinge(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> h(y_true, y_pred).numpy()
+    2.8
+
+    >>> # Using 'none' reduction type.
+    >>> h = tf.keras.losses.CategoricalHinge(
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> h(y_true, y_pred).numpy()
+    array([1.2, 1.6], dtype=float32)
+
+    Usage with the `compile()` API:
+
+    ```python
+    model.compile(optimizer='sgd', loss=tf.keras.losses.CategoricalHinge())
+    ```
+    """
-    super().__init__(
-        binary_crossentropy,
-        name=name,
-        reduction=reduction,
-        from_logits=from_logits,
-        label_smoothing=label_smoothing,
-        axis=axis)
-    self.from_logits = from_logits
+
+    def __init__(
+        self, reduction=losses_utils.ReductionV2.AUTO, name="categorical_hinge"
+    ):
+        """Initializes `CategoricalHinge` instance.
+
+        Args:
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`. `AUTO` indicates that the
+                reduction option will be determined by the usage context. For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance.
+                Defaults to 'categorical_hinge'.
+        """
+        super().__init__(categorical_hinge, name=name, reduction=reduction)
+
+
+@keras_export("keras.losses.Poisson")
+class Poisson(LossFunctionWrapper):
+    """Computes the Poisson loss between `y_true` & `y_pred`.
-@keras_export('keras.losses.BinaryFocalCrossentropy')
-class BinaryFocalCrossentropy(LossFunctionWrapper):
-  """Computes the focal cross-entropy loss between true labels and predictions.
-
-  Binary cross-entropy loss is often used for binary (0 or 1) classification
-  tasks. The loss function requires the following inputs:
-
-  - `y_true` (true label): This is either 0 or 1.
-  - `y_pred` (predicted value): This is the model's prediction, i.e, a single
-    floating-point value which either represents a
-    [logit](https://en.wikipedia.org/wiki/Logit), (i.e, value in [-inf, inf]
-    when `from_logits=True`) or a probability (i.e, value in `[0., 1.]` when
-    `from_logits=False`).
-
-  According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it
-  helps to apply a "focal factor" to down-weight easy examples and focus more on
-  hard examples. By default, the focal tensor is computed as follows:
-
-  `focal_factor = (1 - output) ** gamma` for class 1
-  `focal_factor = output ** gamma` for class 0
-  where `gamma` is a focusing parameter. When `gamma=0`, this function is
-  equivalent to the binary crossentropy loss.
- - With the `compile()` API: - - ```python - model.compile( - loss=tf.keras.losses.BinaryFocalCrossentropy(gamma=2.0, from_logits=True), - .... - ) - ``` - - As a standalone function: - - >>> # Example 1: (batch_size = 1, number of samples = 4) - >>> y_true = [0, 1, 0, 0] - >>> y_pred = [-18.6, 0.51, 2.94, -12.8] - >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=2, from_logits=True) - >>> loss(y_true, y_pred).numpy() - 0.691 - - >>> # Example 2: (batch_size = 2, number of samples = 4) - >>> y_true = [[0, 1], [0, 0]] - >>> y_pred = [[-18.6, 0.51], [2.94, -12.8]] - >>> # Using default 'auto'/'sum_over_batch_size' reduction type. - >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=3, from_logits=True) - >>> loss(y_true, y_pred).numpy() - 0.647 - - >>> # Using 'sample_weight' attribute - >>> loss(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() - 0.133 - - >>> # Using 'sum' reduction` type. - >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=4, from_logits=True, - ... reduction=tf.keras.losses.Reduction.SUM) - >>> loss(y_true, y_pred).numpy() - 1.222 - - >>> # Using 'none' reduction type. - >>> loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=5, from_logits=True, - ... reduction=tf.keras.losses.Reduction.NONE) - >>> loss(y_true, y_pred).numpy() - array([0.0017 1.1561], dtype=float32) - - Args: - gamma: A focusing parameter used to compute the focal factor, default is - `2.0` as mentioned in the reference - [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf). - from_logits: Whether to interpret `y_pred` as a tensor of - [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we - assume that `y_pred` are probabilities (i.e., values in `[0, 1]`). - label_smoothing: Float in `[0, 1]`. When `0`, no smoothing occurs. When > - `0`, we compute the loss between the predicted labels and a smoothed - version of the true labels, where the smoothing squeezes the labels - towards `0.5`. Larger values of `label_smoothing` correspond to heavier - smoothing. - axis: The axis along which to compute crossentropy (the features axis). - Defaults to `-1`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras`, `compile()` and `fit()`, using `SUM_OVER_BATCH_SIZE` or - `AUTO` will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Name for the op. Defaults to 'binary_focal_crossentropy'. 
- """ - - def __init__( - self, - gamma=2.0, - from_logits=False, - label_smoothing=0., - axis=-1, - reduction=losses_utils.ReductionV2.AUTO, - name='binary_focal_crossentropy', - ): - """Initializes `BinaryFocalCrossentropy` instance.""" - super().__init__( - binary_focal_crossentropy, - gamma=gamma, - name=name, - reduction=reduction, - from_logits=from_logits, - label_smoothing=label_smoothing, - axis=axis) - self.from_logits = from_logits - self.gamma = gamma + `loss = y_pred - y_true * log(y_pred)` - def get_config(self): - config = { - 'gamma': self.gamma, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + Standalone usage: + >>> y_true = [[0., 1.], [0., 0.]] + >>> y_pred = [[1., 1.], [0., 0.]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> p = tf.keras.losses.Poisson() + >>> p(y_true, y_pred).numpy() + 0.5 -@keras_export('keras.losses.CategoricalCrossentropy') -class CategoricalCrossentropy(LossFunctionWrapper): - """Computes the crossentropy loss between the labels and predictions. - - Use this crossentropy loss function when there are two or more label classes. - We expect labels to be provided in a `one_hot` representation. If you want to - provide labels as integers, please use `SparseCategoricalCrossentropy` loss. - There should be `# classes` floating point values per feature. - - In the snippet below, there is `# classes` floating pointing values per - example. The shape of both `y_pred` and `y_true` are - `[batch_size, num_classes]`. - - Standalone usage: - - >>> y_true = [[0, 1, 0], [0, 0, 1]] - >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> cce = tf.keras.losses.CategoricalCrossentropy() - >>> cce(y_true, y_pred).numpy() - 1.177 - - >>> # Calling with 'sample_weight'. - >>> cce(y_true, y_pred, sample_weight=tf.constant([0.3, 0.7])).numpy() - 0.814 - - >>> # Using 'sum' reduction type. - >>> cce = tf.keras.losses.CategoricalCrossentropy( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> cce(y_true, y_pred).numpy() - 2.354 - - >>> # Using 'none' reduction type. - >>> cce = tf.keras.losses.CategoricalCrossentropy( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> cce(y_true, y_pred).numpy() - array([0.0513, 2.303], dtype=float32) - - Usage with the `compile()` API: - - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.CategoricalCrossentropy()) - ``` - """ - - def __init__(self, - from_logits=False, - label_smoothing=0., - axis=-1, - reduction=losses_utils.ReductionV2.AUTO, - name='categorical_crossentropy'): - """Initializes `CategoricalCrossentropy` instance. + >>> # Calling with 'sample_weight'. + >>> p(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() + 0.4 - Args: - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. When > 0, label values are smoothed, - meaning the confidence on label values are relaxed. For example, if - `0.1`, use `0.1 / num_classes` for non-target labels and - `0.9 + 0.1 / num_classes` for target labels. - axis: The axis along which to compute crossentropy (the features axis). - Defaults to -1. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. 
When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. - Defaults to 'categorical_crossentropy'. - """ - super().__init__( - categorical_crossentropy, - name=name, - reduction=reduction, - from_logits=from_logits, - label_smoothing=label_smoothing, - axis=axis) + >>> # Using 'sum' reduction type. + >>> p = tf.keras.losses.Poisson( + ... reduction=tf.keras.losses.Reduction.SUM) + >>> p(y_true, y_pred).numpy() + 0.999 + >>> # Using 'none' reduction type. + >>> p = tf.keras.losses.Poisson( + ... reduction=tf.keras.losses.Reduction.NONE) + >>> p(y_true, y_pred).numpy() + array([0.999, 0.], dtype=float32) -@keras_export('keras.losses.SparseCategoricalCrossentropy') -class SparseCategoricalCrossentropy(LossFunctionWrapper): - """Computes the crossentropy loss between the labels and predictions. - - Use this crossentropy loss function when there are two or more label classes. - We expect labels to be provided as integers. If you want to provide labels - using `one-hot` representation, please use `CategoricalCrossentropy` loss. - There should be `# classes` floating point values per feature for `y_pred` - and a single floating point value per feature for `y_true`. - - In the snippet below, there is a single floating point value per example for - `y_true` and `# classes` floating pointing values per example for `y_pred`. - The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is - `[batch_size, num_classes]`. - - Standalone usage: - - >>> y_true = [1, 2] - >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> scce = tf.keras.losses.SparseCategoricalCrossentropy() - >>> scce(y_true, y_pred).numpy() - 1.177 - - >>> # Calling with 'sample_weight'. - >>> scce(y_true, y_pred, sample_weight=tf.constant([0.3, 0.7])).numpy() - 0.814 - - >>> # Using 'sum' reduction type. - >>> scce = tf.keras.losses.SparseCategoricalCrossentropy( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> scce(y_true, y_pred).numpy() - 2.354 - - >>> # Using 'none' reduction type. - >>> scce = tf.keras.losses.SparseCategoricalCrossentropy( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> scce(y_true, y_pred).numpy() - array([0.0513, 2.303], dtype=float32) - - Usage with the `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss=tf.keras.losses.SparseCategoricalCrossentropy()) - ``` - """ - - def __init__(self, - from_logits=False, - reduction=losses_utils.ReductionV2.AUTO, - name='sparse_categorical_crossentropy'): - """Initializes `SparseCategoricalCrossentropy` instance. + Usage with the `compile()` API: - Args: - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. 
Please see this custom training [tutorial](
-        https://www.tensorflow.org/tutorials/distribute/custom_training) for
-        more details.
-      name: Optional name for the instance. Defaults to
-        'sparse_categorical_crossentropy'.
+    ```python
+    model.compile(optimizer='sgd', loss=tf.keras.losses.Poisson())
+    ```
+    """
-    super().__init__(
-        sparse_categorical_crossentropy,
-        name=name,
-        reduction=reduction,
-        from_logits=from_logits)
+
+    def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="poisson"):
+        """Initializes `Poisson` instance.
+
+        Args:
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`. `AUTO` indicates that the
+                reduction option will be determined by the usage context. For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance. Defaults to 'poisson'.
+        """
+        super().__init__(poisson, name=name, reduction=reduction)
+
+
+@keras_export("keras.losses.LogCosh")
+class LogCosh(LossFunctionWrapper):
+    """Computes the logarithm of the hyperbolic cosine of the prediction error.
-@keras_export('keras.losses.Hinge')
-class Hinge(LossFunctionWrapper):
-  """Computes the hinge loss between `y_true` and `y_pred`.
+
+    `logcosh = log((exp(x) + exp(-x))/2)`,
+    where x is the error `y_pred - y_true`.
-
-  `loss = maximum(1 - y_true * y_pred, 0)`
+
+    Standalone usage:
-
-  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
-  provided we will convert them to -1 or 1.
+
+    >>> y_true = [[0., 1.], [0., 0.]]
+    >>> y_pred = [[1., 1.], [0., 0.]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> l = tf.keras.losses.LogCosh()
+    >>> l(y_true, y_pred).numpy()
+    0.108
-
-  Standalone usage:
+
+    >>> # Calling with 'sample_weight'.
+    >>> l(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy()
+    0.087
-
-  >>> y_true = [[0., 1.], [0., 0.]]
-  >>> y_pred = [[0.6, 0.4], [0.4, 0.6]]
-  >>> # Using 'auto'/'sum_over_batch_size' reduction type.
-  >>> h = tf.keras.losses.Hinge()
-  >>> h(y_true, y_pred).numpy()
-  1.3
+
+    >>> # Using 'sum' reduction type.
+    >>> l = tf.keras.losses.LogCosh(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> l(y_true, y_pred).numpy()
+    0.217
-
-  >>> # Calling with 'sample_weight'.
-  >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy()
-  0.55
+
+    >>> # Using 'none' reduction type.
+    >>> l = tf.keras.losses.LogCosh(
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> l(y_true, y_pred).numpy()
+    array([0.217, 0.], dtype=float32)
-
-  >>> # Using 'sum' reduction type.
-  >>> h = tf.keras.losses.Hinge(
-  ...     reduction=tf.keras.losses.Reduction.SUM)
-  >>> h(y_true, y_pred).numpy()
-  2.6
+
+    Usage with the `compile()` API:
-
-  >>> # Using 'none' reduction type.
-  >>> h = tf.keras.losses.Hinge(
-  ...     reduction=tf.keras.losses.Reduction.NONE)
-  >>> h(y_true, y_pred).numpy()
-  array([1.1, 1.5], dtype=float32)
+
+    ```python
+    model.compile(optimizer='sgd', loss=tf.keras.losses.LogCosh())
+    ```
+    """
-
-  Usage with the `compile()` API:
+
+    def __init__(
+        self, reduction=losses_utils.ReductionV2.AUTO, name="log_cosh"
+    ):
+        """Initializes `LogCosh` instance.
+
+        Args:
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`.
`AUTO` indicates that the
+                reduction option will be determined by the usage context. For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance. Defaults to 'log_cosh'.
+        """
+        super().__init__(log_cosh, name=name, reduction=reduction)
+
+
+@keras_export("keras.losses.KLDivergence")
+class KLDivergence(LossFunctionWrapper):
+    """Computes Kullback-Leibler divergence loss between `y_true` & `y_pred`.
-  def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name='hinge'):
-    """Initializes `Hinge` instance.
+
+    `loss = y_true * log(y_true / y_pred)`
-
-    Args:
-      reduction: Type of `tf.keras.losses.Reduction` to apply to
-        loss. Default value is `AUTO`. `AUTO` indicates that the reduction
-        option will be determined by the usage context. For almost all cases
-        this defaults to `SUM_OVER_BATCH_SIZE`. When used with
-        `tf.distribute.Strategy`, outside of built-in training loops such as
-        `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
-        will raise an error. Please see this custom training [tutorial](
-        https://www.tensorflow.org/tutorials/distribute/custom_training) for
-        more details.
-      name: Optional name for the instance. Defaults to 'hinge'.
-    """
-    super().__init__(hinge, name=name, reduction=reduction)
+
+    See: https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+    Standalone usage:
+
+    >>> y_true = [[0, 1], [0, 0]]
+    >>> y_pred = [[0.6, 0.4], [0.4, 0.6]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> kl = tf.keras.losses.KLDivergence()
+    >>> kl(y_true, y_pred).numpy()
+    0.458
-@keras_export('keras.losses.SquaredHinge')
-class SquaredHinge(LossFunctionWrapper):
-  """Computes the squared hinge loss between `y_true` and `y_pred`.
+
+    >>> # Calling with 'sample_weight'.
+    >>> kl(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy()
+    0.366
-
-  `loss = square(maximum(1 - y_true * y_pred, 0))`
+
+    >>> # Using 'sum' reduction type.
+    >>> kl = tf.keras.losses.KLDivergence(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> kl(y_true, y_pred).numpy()
+    0.916
-
-  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
-  provided we will convert them to -1 or 1.
+
+    >>> # Using 'none' reduction type.
+    >>> kl = tf.keras.losses.KLDivergence(
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> kl(y_true, y_pred).numpy()
+    array([0.916, -3.08e-06], dtype=float32)
-
-  Standalone usage:
+
+    Usage with the `compile()` API:
-
-  >>> y_true = [[0., 1.], [0., 0.]]
-  >>> y_pred = [[0.6, 0.4], [0.4, 0.6]]
-  >>> # Using 'auto'/'sum_over_batch_size' reduction type.
-  >>> h = tf.keras.losses.SquaredHinge()
-  >>> h(y_true, y_pred).numpy()
-  1.86
+
+    ```python
+    model.compile(optimizer='sgd', loss=tf.keras.losses.KLDivergence())
+    ```
+    """
-
-  >>> # Calling with 'sample_weight'.
-  >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy()
-  0.73
+
+    def __init__(
+        self, reduction=losses_utils.ReductionV2.AUTO, name="kl_divergence"
+    ):
+        """Initializes `KLDivergence` instance.
+
+        Args:
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`. `AUTO` indicates that the
+                reduction option will be determined by the usage context.
For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance.
+                Defaults to 'kl_divergence'.
+        """
+        super().__init__(kl_divergence, name=name, reduction=reduction)
+
+
+@keras_export("keras.losses.Huber")
+class Huber(LossFunctionWrapper):
+    """Computes the Huber loss between `y_true` & `y_pred`.
+
+    For each value x in `error = y_true - y_pred`:
+
+    ```
+    loss = 0.5 * x^2                  if |x| <= d
+    loss = 0.5 * d^2 + d * (|x| - d)  if |x| > d
+    ```
+    where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
+
+    Standalone usage:
+
+    >>> y_true = [[0, 1], [0, 0]]
+    >>> y_pred = [[0.6, 0.4], [0.4, 0.6]]
+    >>> # Using 'auto'/'sum_over_batch_size' reduction type.
+    >>> h = tf.keras.losses.Huber()
+    >>> h(y_true, y_pred).numpy()
+    0.155
+
+    >>> # Calling with 'sample_weight'.
+    >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy()
+    0.09
+
+    >>> # Using 'sum' reduction type.
+    >>> h = tf.keras.losses.Huber(
+    ...     reduction=tf.keras.losses.Reduction.SUM)
+    >>> h(y_true, y_pred).numpy()
+    0.31
+
+    >>> # Using 'none' reduction type.
+    >>> h = tf.keras.losses.Huber(
+    ...     reduction=tf.keras.losses.Reduction.NONE)
+    >>> h(y_true, y_pred).numpy()
+    array([0.18, 0.13], dtype=float32)
+
+    Usage with the `compile()` API:
+
+    ```python
+    model.compile(optimizer='sgd', loss=tf.keras.losses.Huber())
+    ```
+    """
-
-  >>> # Using 'sum' reduction type.
-  >>> h = tf.keras.losses.SquaredHinge(
-  ...     reduction=tf.keras.losses.Reduction.SUM)
-  >>> h(y_true, y_pred).numpy()
-  3.72
+
+    def __init__(
+        self,
+        delta=1.0,
+        reduction=losses_utils.ReductionV2.AUTO,
+        name="huber_loss",
+    ):
+        """Initializes `Huber` instance.
+
+        Args:
+            delta: A float, the point where the Huber loss function changes from
+                a quadratic to linear.
+            reduction: Type of `tf.keras.losses.Reduction` to apply to
+                loss. Default value is `AUTO`. `AUTO` indicates that the
+                reduction option will be determined by the usage context. For
+                almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When
+                used under a `tf.distribute.Strategy`, except via
+                `Model.compile()` and `Model.fit()`, using `AUTO` or
+                `SUM_OVER_BATCH_SIZE` will raise an error. Please see this
+                custom training [tutorial](
+                https://www.tensorflow.org/tutorials/distribute/custom_training)
+                for more details.
+            name: Optional name for the instance. Defaults to 'huber_loss'.
+        """
+        super().__init__(huber, name=name, reduction=reduction, delta=delta)
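The piecewise definition above can be checked element by element. A sketch with the example inputs, where every |error| <= delta so only the quadratic branch fires:

```python
import tensorflow as tf

y_true = tf.constant([[0., 1.], [0., 0.]])
y_pred = tf.constant([[0.6, 0.4], [0.4, 0.6]])
delta = 1.0

x = y_true - y_pred
quadratic = 0.5 * tf.square(x)
linear = 0.5 * delta**2 + delta * (tf.abs(x) - delta)
per_elem = tf.where(tf.abs(x) <= delta, quadratic, linear)
manual = tf.reduce_mean(per_elem, axis=-1)  # [0.18, 0.13]

h = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.NONE)
print(manual.numpy())
print(h(y_true, y_pred).numpy())
```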
- def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='squared_hinge'): - """Initializes `SquaredHinge` instance. + `loss = mean(square(y_true - y_pred), axis=-1)` + + Standalone usage: + + >>> y_true = np.random.randint(0, 2, size=(2, 3)) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.mean_squared_error(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> assert np.array_equal( + ... loss.numpy(), np.mean(np.square(y_true - y_pred), axis=-1)) Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. Defaults to 'squared_hinge'. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + + Returns: + Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. """ - super().__init__(squared_hinge, name=name, reduction=reduction) + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1) -@keras_export('keras.losses.CategoricalHinge') -class CategoricalHinge(LossFunctionWrapper): - """Computes the categorical hinge loss between `y_true` and `y_pred`. +def _ragged_tensor_apply_loss(loss_fn, y_true, y_pred, y_pred_extra_dim=False): + """Apply a loss function on a per batch basis. - `loss = maximum(neg - pos + 1, 0)` - where `neg=maximum((1-y_true)*y_pred) and pos=sum(y_true*y_pred)` + Args: + loss_fn: The loss function + y_true: truth values (RaggedTensor) + y_pred: predicted values (RaggedTensor) + y_pred_extra_dim: whether y_pred has an additional dimension compared to + y_true - Standalone usage: + Returns: + Loss-function result. A dense tensor if the output has a single + dimension (per-batch loss value); a ragged tensor otherwise. + """ - >>> y_true = [[0, 1], [0, 0]] - >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> h = tf.keras.losses.CategoricalHinge() - >>> h(y_true, y_pred).numpy() - 1.4 + def rt_is_equiv_dense(rt): + """Returns true if this RaggedTensor has the same row_lengths across + + all ragged dimensions and thus can be converted to a dense tensor + without loss of information. + + Args: + rt: RaggedTensor. + """ + return tf.reduce_all( + [ + tf.equal( + tf.math.reduce_variance( + tf.cast(row_lens, backend.floatx()) + ), + tf.constant([0.0]), + ) + for row_lens in rt.nested_row_lengths() + ] + ) + + def _convert_to_dense(inputs): + return tuple( + rt.to_tensor() if isinstance(rt, tf.RaggedTensor) else rt + for rt in inputs + ) + + def _call_loss(inputs, ragged_output): + """Adapt the result to ragged or dense tensor according to the expected - >>> # Calling with 'sample_weight'. - >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy() - 0.6 + output type. This is done so that all the return values of the map + operation have the same type. 
+ """ + r = loss_fn(*inputs) + if ragged_output and not isinstance(r, tf.RaggedTensor): + r = tf.RaggedTensor.from_tensor(r) + elif not ragged_output and isinstance(r, tf.RaggedTensor): + r = r.to_tensor() + return r + + def _wrapper(inputs, ragged_output): + _, y_pred = inputs + if isinstance(y_pred, tf.RaggedTensor): + return tf.cond( + rt_is_equiv_dense(y_pred), + lambda: _call_loss(_convert_to_dense(inputs), ragged_output), + lambda: _call_loss(inputs, ragged_output), + ) + + return loss_fn(*inputs) + + if not isinstance(y_true, tf.RaggedTensor): + return loss_fn(y_true, y_pred.to_tensor()) + + lshape = y_pred.shape.as_list()[1:-1] + if len(lshape) > 0: + spec = tf.RaggedTensorSpec(shape=lshape, dtype=y_pred.dtype) + else: + spec = tf.TensorSpec(shape=[], dtype=y_pred.dtype) - >>> # Using 'sum' reduction type. - >>> h = tf.keras.losses.CategoricalHinge( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> h(y_true, y_pred).numpy() - 2.8 + nested_splits_list = [rt.nested_row_splits for rt in (y_true, y_pred)] + if y_pred_extra_dim: + # The last dimension of a categorical prediction may be ragged or not. + rdims = [len(slist) for slist in nested_splits_list] + if rdims[0] == rdims[1] - 1: + nested_splits_list[1] = nested_splits_list[1][:-1] - >>> # Using 'none' reduction type. - >>> h = tf.keras.losses.CategoricalHinge( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> h(y_true, y_pred).numpy() - array([1.2, 1.6], dtype=float32) + map_fn = functools.partial(_wrapper, ragged_output=len(lshape) > 1) - Usage with the `compile()` API: + assertion_list = ragged_util.assert_splits_match(nested_splits_list) + with tf.control_dependencies(assertion_list): + return ragged_map_ops.map_fn(map_fn, elems=(y_true, y_pred), dtype=spec) - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.CategoricalHinge()) - ``` - """ - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='categorical_hinge'): - """Initializes `CategoricalHinge` instance. +@dispatch.dispatch_for_types(mean_squared_error, tf.RaggedTensor) +def _ragged_tensor_mse(y_true, y_pred): + """Implements support for handling RaggedTensors. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. Defaults to 'categorical_hinge'. + y_true: RaggedTensor truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: RaggedTensor predicted values. + shape = `[batch_size, d0, .. dN]`. + + Returns: + Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. + When the number of dimensions of the batch feature vector [d0, .. dN] is + greater than one the return value is a RaggedTensor. Otherwise, a Dense + tensor with dimensions [batch_size] is returned. """ - super().__init__(categorical_hinge, name=name, reduction=reduction) + return _ragged_tensor_apply_loss(mean_squared_error, y_true, y_pred) -@keras_export('keras.losses.Poisson') -class Poisson(LossFunctionWrapper): - """Computes the Poisson loss between `y_true` and `y_pred`. 
+@keras_export( + "keras.metrics.mean_absolute_error", + "keras.metrics.mae", + "keras.metrics.MAE", + "keras.losses.mean_absolute_error", + "keras.losses.mae", + "keras.losses.MAE", +) +@tf.__internal__.dispatch.add_dispatch_support +def mean_absolute_error(y_true, y_pred): + """Computes the mean absolute error between labels and predictions. - `loss = y_pred - y_true * log(y_pred)` + `loss = mean(abs(y_true - y_pred), axis=-1)` - Standalone usage: + Standalone usage: - >>> y_true = [[0., 1.], [0., 0.]] - >>> y_pred = [[1., 1.], [0., 0.]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> p = tf.keras.losses.Poisson() - >>> p(y_true, y_pred).numpy() - 0.5 + >>> y_true = np.random.randint(0, 2, size=(2, 3)) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.mean_absolute_error(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> assert np.array_equal( + ... loss.numpy(), np.mean(np.abs(y_true - y_pred), axis=-1)) - >>> # Calling with 'sample_weight'. - >>> p(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() - 0.4 + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - >>> # Using 'sum' reduction type. - >>> p = tf.keras.losses.Poisson( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> p(y_true, y_pred).numpy() - 0.999 + Returns: + Mean absolute error values. shape = `[batch_size, d0, .. dN-1]`. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + return backend.mean(tf.abs(y_pred - y_true), axis=-1) - >>> # Using 'none' reduction type. - >>> p = tf.keras.losses.Poisson( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> p(y_true, y_pred).numpy() - array([0.999, 0.], dtype=float32) - Usage with the `compile()` API: +@dispatch.dispatch_for_types(mean_absolute_error, tf.RaggedTensor) +def _ragged_tensor_mae(y_true, y_pred): + """RaggedTensor adapter for mean_absolute_error.""" + return _ragged_tensor_apply_loss(mean_absolute_error, y_true, y_pred) - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.Poisson()) - ``` - """ - def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name='poisson'): - """Initializes `Poisson` instance. +@keras_export( + "keras.metrics.mean_absolute_percentage_error", + "keras.metrics.mape", + "keras.metrics.MAPE", + "keras.losses.mean_absolute_percentage_error", + "keras.losses.mape", + "keras.losses.MAPE", +) +@tf.__internal__.dispatch.add_dispatch_support +def mean_absolute_percentage_error(y_true, y_pred): + """Computes the mean absolute percentage error between `y_true` & `y_pred`. - Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. Defaults to 'poisson'. 
- """ - super().__init__(poisson, name=name, reduction=reduction) + `loss = 100 * mean(abs((y_true - y_pred) / y_true), axis=-1)` + Standalone usage: -@keras_export('keras.losses.LogCosh') -class LogCosh(LossFunctionWrapper): - """Computes the logarithm of the hyperbolic cosine of the prediction error. + >>> y_true = np.random.random(size=(2, 3)) + >>> y_true = np.maximum(y_true, 1e-7) # Prevent division by zero + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.mean_absolute_percentage_error(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> assert np.array_equal( + ... loss.numpy(), + ... 100. * np.mean(np.abs((y_true - y_pred) / y_true), axis=-1)) - `logcosh = log((exp(x) + exp(-x))/2)`, - where x is the error `y_pred - y_true`. + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - Standalone usage: + Returns: + Mean absolute percentage error values. shape = `[batch_size, d0, .. + dN-1]`. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + diff = tf.abs( + (y_true - y_pred) / backend.maximum(tf.abs(y_true), backend.epsilon()) + ) + return 100.0 * backend.mean(diff, axis=-1) - >>> y_true = [[0., 1.], [0., 0.]] - >>> y_pred = [[1., 1.], [0., 0.]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> l = tf.keras.losses.LogCosh() - >>> l(y_true, y_pred).numpy() - 0.108 - >>> # Calling with 'sample_weight'. - >>> l(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() - 0.087 +@dispatch.dispatch_for_types(mean_absolute_percentage_error, tf.RaggedTensor) +def _ragged_tensor_mape(y_true, y_pred): + """Support RaggedTensors.""" + return _ragged_tensor_apply_loss( + mean_absolute_percentage_error, y_true, y_pred + ) - >>> # Using 'sum' reduction type. - >>> l = tf.keras.losses.LogCosh( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> l(y_true, y_pred).numpy() - 0.217 - >>> # Using 'none' reduction type. - >>> l = tf.keras.losses.LogCosh( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> l(y_true, y_pred).numpy() - array([0.217, 0.], dtype=float32) +@keras_export( + "keras.metrics.mean_squared_logarithmic_error", + "keras.metrics.msle", + "keras.metrics.MSLE", + "keras.losses.mean_squared_logarithmic_error", + "keras.losses.msle", + "keras.losses.MSLE", +) +@tf.__internal__.dispatch.add_dispatch_support +def mean_squared_logarithmic_error(y_true, y_pred): + """Computes the mean squared logarithmic error between `y_true` & `y_pred`. - Usage with the `compile()` API: + `loss = mean(square(log(y_true + 1) - log(y_pred + 1)), axis=-1)` - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.LogCosh()) - ``` - """ + Standalone usage: - def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name='log_cosh'): - """Initializes `LogCosh` instance. + >>> y_true = np.random.randint(0, 2, size=(2, 3)) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.mean_squared_logarithmic_error(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> y_true = np.maximum(y_true, 1e-7) + >>> y_pred = np.maximum(y_pred, 1e-7) + >>> assert np.allclose( + ... loss.numpy(), + ... np.mean( + ... np.square(np.log(y_true + 1.) - np.log(y_pred + 1.)), axis=-1)) Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. 
When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. Defaults to 'log_cosh'. - """ - super().__init__(log_cosh, name=name, reduction=reduction) + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + Returns: + Mean squared logarithmic error values. shape = `[batch_size, d0, .. + dN-1]`. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + first_log = tf.math.log(backend.maximum(y_pred, backend.epsilon()) + 1.0) + second_log = tf.math.log(backend.maximum(y_true, backend.epsilon()) + 1.0) + return backend.mean( + tf.math.squared_difference(first_log, second_log), axis=-1 + ) -@keras_export('keras.losses.KLDivergence') -class KLDivergence(LossFunctionWrapper): - """Computes Kullback-Leibler divergence loss between `y_true` and `y_pred`. - `loss = y_true * log(y_true / y_pred)` +@dispatch.dispatch_for_types(mean_squared_logarithmic_error, tf.RaggedTensor) +def _ragged_tensor_msle(y_true, y_pred): + """Implements support for handling RaggedTensors.""" + return _ragged_tensor_apply_loss( + mean_squared_logarithmic_error, y_true, y_pred + ) - See: https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence - Standalone usage: +def _maybe_convert_labels(y_true): + """Converts binary labels into -1/1.""" + are_zeros = tf.equal(y_true, 0) + are_ones = tf.equal(y_true, 1) + is_binary = tf.reduce_all(tf.logical_or(are_zeros, are_ones)) - >>> y_true = [[0, 1], [0, 0]] - >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> kl = tf.keras.losses.KLDivergence() - >>> kl(y_true, y_pred).numpy() - 0.458 + def _convert_binary_labels(): + # Convert the binary labels to -1 or 1. + return 2.0 * y_true - 1.0 - >>> # Calling with 'sample_weight'. - >>> kl(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() - 0.366 + updated_y_true = tf.__internal__.smart_cond.smart_cond( + is_binary, _convert_binary_labels, lambda: y_true + ) + return updated_y_true - >>> # Using 'sum' reduction type. - >>> kl = tf.keras.losses.KLDivergence( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> kl(y_true, y_pred).numpy() - 0.916 - >>> # Using 'none' reduction type. - >>> kl = tf.keras.losses.KLDivergence( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> kl(y_true, y_pred).numpy() - array([0.916, -3.08e-06], dtype=float32) +@keras_export("keras.metrics.squared_hinge", "keras.losses.squared_hinge") +@tf.__internal__.dispatch.add_dispatch_support +def squared_hinge(y_true, y_pred): + """Computes the squared hinge loss between `y_true` & `y_pred`. - Usage with the `compile()` API: + `loss = mean(square(maximum(1 - y_true * y_pred, 0)), axis=-1)` - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.KLDivergence()) - ``` - """ + Standalone usage: - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='kl_divergence'): - """Initializes `KLDivergence` instance. + >>> y_true = np.random.choice([-1, 1], size=(2, 3)) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.squared_hinge(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> assert np.array_equal( + ... loss.numpy(), + ... np.mean(np.square(np.maximum(1. 
- y_true * y_pred, 0.)), axis=-1)) Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. Defaults to 'kl_divergence'. + y_true: The ground truth values. `y_true` values are expected to be -1 + or 1. If binary (0 or 1) labels are provided we will convert them to + -1 or 1. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + + Returns: + Squared hinge loss values. shape = `[batch_size, d0, .. dN-1]`. """ - super().__init__(kl_divergence, name=name, reduction=reduction) + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + y_true = _maybe_convert_labels(y_true) + return backend.mean( + tf.square(tf.maximum(1.0 - y_true * y_pred, 0.0)), axis=-1 + ) -@keras_export('keras.losses.Huber') -class Huber(LossFunctionWrapper): - """Computes the Huber loss between `y_true` and `y_pred`. - - For each value x in `error = y_true - y_pred`: - - ``` - loss = 0.5 * x^2 if |x| <= d - loss = 0.5 * d^2 + d * (|x| - d) if |x| > d - ``` - where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss - - Standalone usage: - - >>> y_true = [[0, 1], [0, 0]] - >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> h = tf.keras.losses.Huber() - >>> h(y_true, y_pred).numpy() - 0.155 - - >>> # Calling with 'sample_weight'. - >>> h(y_true, y_pred, sample_weight=[1, 0]).numpy() - 0.09 - - >>> # Using 'sum' reduction type. - >>> h = tf.keras.losses.Huber( - ... reduction=tf.keras.losses.Reduction.SUM) - >>> h(y_true, y_pred).numpy() - 0.31 - - >>> # Using 'none' reduction type. - >>> h = tf.keras.losses.Huber( - ... reduction=tf.keras.losses.Reduction.NONE) - >>> h(y_true, y_pred).numpy() - array([0.18, 0.13], dtype=float32) - - Usage with the `compile()` API: - - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.Huber()) - ``` - """ - - def __init__(self, - delta=1.0, - reduction=losses_utils.ReductionV2.AUTO, - name='huber_loss'): - """Initializes `Huber` instance. +@keras_export("keras.metrics.hinge", "keras.losses.hinge") +@tf.__internal__.dispatch.add_dispatch_support +def hinge(y_true, y_pred): + """Computes the hinge loss between `y_true` & `y_pred`. + + `loss = mean(maximum(1 - y_true * y_pred, 0), axis=-1)` + + Standalone usage: + + >>> y_true = np.random.choice([-1, 1], size=(2, 3)) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.hinge(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> assert np.array_equal( + ... loss.numpy(), + ... np.mean(np.maximum(1. - y_true * y_pred, 0.), axis=-1)) Args: - delta: A float, the point where the Huber loss function changes from a - quadratic to linear. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. 
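`_maybe_convert_labels` above is why the hinge family accepts either label convention: if every entry of `y_true` is 0 or 1, labels are remapped to -1/1 via `2 * y_true - 1` before the margin is computed. A quick illustration of the equivalence (assumes TensorFlow 2.x):

```python
import tensorflow as tf

y_pred = tf.constant([[0.6, -0.4], [0.3, 0.8]])
binary = tf.constant([[0.0, 1.0], [1.0, 0.0]])  # {0, 1} labels
signed = 2.0 * binary - 1.0                     # the same labels as {-1, +1}

# Both conventions give identical results, because the {0, 1} input is
# detected and remapped internally before the margin is computed.
print(tf.keras.losses.squared_hinge(binary, y_pred).numpy())
print(tf.keras.losses.squared_hinge(signed, y_pred).numpy())
```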
When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the instance. Defaults to 'huber_loss'. + y_true: The ground truth values. `y_true` values are expected to be -1 + or 1. If binary (0 or 1) labels are provided we will convert them to + -1 or 1. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + + Returns: + Hinge loss values. shape = `[batch_size, d0, .. dN-1]`. """ - super().__init__(huber, name=name, reduction=reduction, delta=delta) + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + y_true = _maybe_convert_labels(y_true) + return backend.mean(tf.maximum(1.0 - y_true * y_pred, 0.0), axis=-1) -@keras_export('keras.metrics.mean_squared_error', 'keras.metrics.mse', - 'keras.metrics.MSE', 'keras.losses.mean_squared_error', - 'keras.losses.mse', 'keras.losses.MSE') +@keras_export("keras.losses.categorical_hinge") @tf.__internal__.dispatch.add_dispatch_support -def mean_squared_error(y_true, y_pred): - """Computes the mean squared error between labels and predictions. +def categorical_hinge(y_true, y_pred): + """Computes the categorical hinge loss between `y_true` & `y_pred`. - After computing the squared distance between the inputs, the mean value over - the last dimension is returned. + `loss = maximum(neg - pos + 1, 0)` + where `neg=maximum((1-y_true)*y_pred) and pos=sum(y_true*y_pred)` - `loss = mean(square(y_true - y_pred), axis=-1)` + Standalone usage: - Standalone usage: + >>> y_true = np.random.randint(0, 3, size=(2,)) + >>> y_true = tf.keras.utils.to_categorical(y_true, num_classes=3) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.categorical_hinge(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> pos = np.sum(y_true * y_pred, axis=-1) + >>> neg = np.amax((1. - y_true) * y_pred, axis=-1) + >>> assert np.array_equal(loss.numpy(), np.maximum(0., neg - pos + 1.)) - >>> y_true = np.random.randint(0, 2, size=(2, 3)) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.mean_squared_error(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> assert np.array_equal( - ... loss.numpy(), np.mean(np.square(y_true - y_pred), axis=-1)) + Args: + y_true: The ground truth values. `y_true` values are expected to be + either `{-1, +1}` or `{0, 1}` (i.e. a one-hot-encoded tensor). + y_pred: The predicted values. - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + Returns: + Categorical hinge loss values. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + pos = tf.reduce_sum(y_true * y_pred, axis=-1) + neg = tf.reduce_max((1.0 - y_true) * y_pred, axis=-1) + zero = tf.cast(0.0, y_pred.dtype) + return tf.maximum(neg - pos + 1.0, zero) - Returns: - Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1) +@keras_export("keras.losses.huber", v1=[]) +@tf.__internal__.dispatch.add_dispatch_support +def huber(y_true, y_pred, delta=1.0): + """Computes Huber loss value. 
-def _ragged_tensor_apply_loss(loss_fn, y_true, y_pred, y_pred_extra_dim=False): - """Apply a loss function on a per batch basis. + For each value x in `error = y_true - y_pred`: - Args: - loss_fn: The loss function - y_true: truth values (RaggedTensor) - y_pred: predicted values (RaggedTensor) - y_pred_extra_dim: whether y_pred has an additional dimension compared to - y_true + ``` + loss = 0.5 * x^2 if |x| <= d + loss = d * |x| - 0.5 * d^2 if |x| > d + ``` + where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss - Returns: - Loss-function result. A dense tensor if the output has a single dimension - (per-batch loss value); a ragged tensor otherwise. - """ + Args: + y_true: tensor of true targets. + y_pred: tensor of predicted targets. + delta: A float, the point where the Huber loss function changes from a + quadratic to linear. - def rt_is_equiv_dense(rt): - """Returns true if this RaggedTensor has the same row_lenghts across + Returns: + Tensor with one scalar loss entry per sample. + """ + y_pred = tf.cast(y_pred, dtype=backend.floatx()) + y_true = tf.cast(y_true, dtype=backend.floatx()) + delta = tf.cast(delta, dtype=backend.floatx()) + error = tf.subtract(y_pred, y_true) + abs_error = tf.abs(error) + half = tf.convert_to_tensor(0.5, dtype=abs_error.dtype) + return backend.mean( + tf.where( + abs_error <= delta, + half * tf.square(error), + delta * abs_error - half * tf.square(delta), + ), + axis=-1, + ) - all ragged dimensions and thus can be converted to a dense tensor - without loss of information. + +@keras_export( + "keras.losses.log_cosh", + "keras.losses.logcosh", + "keras.metrics.log_cosh", + "keras.metrics.logcosh", +) +@tf.__internal__.dispatch.add_dispatch_support +def log_cosh(y_true, y_pred): + """Logarithm of the hyperbolic cosine of the prediction error. + + `log(cosh(x))` is approximately equal to `(x ** 2) / 2` for small `x` and + to `abs(x) - log(2)` for large `x`. This means that 'logcosh' works mostly + like the mean squared error, but will not be so strongly affected by the + occasional wildly incorrect prediction. + + Standalone usage: + + >>> y_true = np.random.random(size=(2, 3)) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.logcosh(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> x = y_pred - y_true + >>> assert np.allclose( + ... loss.numpy(), + ... np.mean(x + np.log(np.exp(-2. * x) + 1.) - tf.math.log(2.), + ... axis=-1), + ... atol=1e-5) Args: - rt: RaggedTensor. - """ - return tf.reduce_all([ - tf.equal( - tf.math.reduce_variance(tf.cast(row_lens, backend.floatx())), - tf.constant([0.])) for row_lens in rt.nested_row_lengths() - ]) + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - def _convert_to_dense(inputs): - return tuple( - rt.to_tensor() if isinstance(rt, tf.RaggedTensor) else rt - for rt in inputs) + Returns: + Logcosh error values. shape = `[batch_size, d0, .. dN-1]`. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) - def _call_loss(inputs, ragged_output): - """ Adapt the result to ragged or dense tensor according to the expected + def _logcosh(x): + return ( + x + tf.math.softplus(-2.0 * x) - tf.cast(tf.math.log(2.0), x.dtype) + ) - output type. This is done so that all the return values of the map - operation have the same type. 
- """ - r = loss_fn(*inputs) - if ragged_output and not isinstance(r, tf.RaggedTensor): - r = tf.RaggedTensor.from_tensor(r) - elif not ragged_output and isinstance(r, tf.RaggedTensor): - r = r.to_tensor() - return r - - def _wrapper(inputs, ragged_output): - _, y_pred = inputs - if isinstance(y_pred, tf.RaggedTensor): - return tf.cond( - rt_is_equiv_dense(y_pred), - lambda: _call_loss(_convert_to_dense(inputs), ragged_output), - lambda: _call_loss(inputs, ragged_output)) - - return loss_fn(*inputs) - - if not isinstance(y_true, tf.RaggedTensor): - return loss_fn(y_true, y_pred.to_tensor()) - - lshape = y_pred.shape.as_list()[1:-1] - if len(lshape) > 0: - spec = tf.RaggedTensorSpec(shape=lshape, dtype=y_pred.dtype) - else: - spec = tf.TensorSpec(shape=[], dtype=y_pred.dtype) - - nested_splits_list = [rt.nested_row_splits for rt in (y_true, y_pred)] - if y_pred_extra_dim: - # The last dimension of a categorical prediction may be ragged or not. - rdims = [len(slist) for slist in nested_splits_list] - if rdims[0] == rdims[1] - 1: - nested_splits_list[1] = nested_splits_list[1][:-1] - - map_fn = functools.partial(_wrapper, ragged_output=len(lshape) > 1) - - assertion_list = ragged_util.assert_splits_match(nested_splits_list) - with tf.control_dependencies(assertion_list): - return ragged_map_ops.map_fn(map_fn, elems=(y_true, y_pred), dtype=spec) + return backend.mean(_logcosh(y_pred - y_true), axis=-1) -@dispatch.dispatch_for_types(mean_squared_error, tf.RaggedTensor) -def _ragged_tensor_mse(y_true, y_pred): - """Implements support for handling RaggedTensors. +@keras_export( + "keras.metrics.categorical_crossentropy", + "keras.losses.categorical_crossentropy", +) +@tf.__internal__.dispatch.add_dispatch_support +def categorical_crossentropy( + y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1 +): + """Computes the categorical crossentropy loss. - Args: - y_true: RaggedTensor truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: RaggedTensor predicted values. shape = `[batch_size, d0, .. dN]`. + Standalone usage: - Returns: - Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. - When the number of dimensions of the batch feature vector [d0, .. dN] is - greater than one the return value is a RaggedTensor. Otherwise a Dense - tensor with dimensions [batch_size] is returned. - """ - return _ragged_tensor_apply_loss(mean_squared_error, y_true, y_pred) + >>> y_true = [[0, 1, 0], [0, 0, 1]] + >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] + >>> loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> loss.numpy() + array([0.0513, 2.303], dtype=float32) + Args: + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Defaults to -1. The dimension along which the entropy is + computed. -@keras_export('keras.metrics.mean_absolute_error', 'keras.metrics.mae', - 'keras.metrics.MAE', 'keras.losses.mean_absolute_error', - 'keras.losses.mae', 'keras.losses.MAE') -@tf.__internal__.dispatch.add_dispatch_support -def mean_absolute_error(y_true, y_pred): - """Computes the mean absolute error between labels and predictions. + Returns: + Categorical crossentropy loss value. 
+ """ + if isinstance(axis, bool): + raise ValueError( + "`axis` must be of type `int`. " + f"Received: axis={axis} of type {type(axis)}" + ) + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype) + + if y_pred.shape[-1] == 1: + warnings.warn( + "In loss categorical_crossentropy, expected " + "y_pred.shape to be (batch_size, num_classes) " + f"with num_classes > 1. Received: y_pred.shape={y_pred.shape}. " + "Consider using 'binary_crossentropy' if you only have 2 classes.", + SyntaxWarning, + stacklevel=2, + ) + + def _smooth_labels(): + num_classes = tf.cast(tf.shape(y_true)[axis], y_pred.dtype) + return y_true * (1.0 - label_smoothing) + ( + label_smoothing / num_classes + ) + + y_true = tf.__internal__.smart_cond.smart_cond( + label_smoothing, _smooth_labels, lambda: y_true + ) - `loss = mean(abs(y_true - y_pred), axis=-1)` + return backend.categorical_crossentropy( + y_true, y_pred, from_logits=from_logits, axis=axis + ) - Standalone usage: - >>> y_true = np.random.randint(0, 2, size=(2, 3)) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.mean_absolute_error(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> assert np.array_equal( - ... loss.numpy(), np.mean(np.abs(y_true - y_pred), axis=-1)) +@dispatch.dispatch_for_types(categorical_crossentropy, tf.RaggedTensor) +def _ragged_tensor_categorical_crossentropy( + y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1 +): + """Implements support for handling RaggedTensors. - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + Args: + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: The axis along which to compute crossentropy (the features axis). + Defaults to -1. - Returns: - Mean absolute error values. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - return backend.mean(tf.abs(y_pred - y_true), axis=-1) + Returns: + Categorical crossentropy loss value. + Expected shape: (batch, sequence_len, n_classes) with sequence_len + being variable per batch. + Return shape: (batch, sequence_len). -@dispatch.dispatch_for_types(mean_absolute_error, tf.RaggedTensor) -def _ragged_tensor_mae(y_true, y_pred): - """RaggedTensor adapter for mean_absolute_error.""" - return _ragged_tensor_apply_loss(mean_absolute_error, y_true, y_pred) + When used by CategoricalCrossentropy() with the default reduction + (SUM_OVER_BATCH_SIZE), the reduction averages the loss over the + number of elements independent of the batch. E.g. if the RaggedTensor + has 2 batches with [2, 1] values respectively the resulting loss is + the sum of the individual loss values divided by 3. 
+ """ + fn = functools.partial( + categorical_crossentropy, + from_logits=from_logits, + label_smoothing=label_smoothing, + axis=axis, + ) + return _ragged_tensor_apply_loss(fn, y_true, y_pred) -@keras_export('keras.metrics.mean_absolute_percentage_error', - 'keras.metrics.mape', 'keras.metrics.MAPE', - 'keras.losses.mean_absolute_percentage_error', - 'keras.losses.mape', 'keras.losses.MAPE') +@keras_export( + "keras.metrics.categorical_focal_crossentropy", + "keras.losses.categorical_focal_crossentropy", +) @tf.__internal__.dispatch.add_dispatch_support -def mean_absolute_percentage_error(y_true, y_pred): - """Computes the mean absolute percentage error between `y_true` and `y_pred`. - - `loss = 100 * mean(abs((y_true - y_pred) / y_true), axis=-1)` - - Standalone usage: - - >>> y_true = np.random.random(size=(2, 3)) - >>> y_true = np.maximum(y_true, 1e-7) # Prevent division by zero - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.mean_absolute_percentage_error(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> assert np.array_equal( - ... loss.numpy(), - ... 100. * np.mean(np.abs((y_true - y_pred) / y_true), axis=-1)) - - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - - Returns: - Mean absolute percentage error values. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - diff = tf.abs( - (y_true - y_pred) / backend.maximum(tf.abs(y_true), - backend.epsilon())) - return 100. * backend.mean(diff, axis=-1) - - -@dispatch.dispatch_for_types(mean_absolute_percentage_error, - tf.RaggedTensor) -def _ragged_tensor_mape(y_true, y_pred): - """Support RaggedTensors.""" - return _ragged_tensor_apply_loss(mean_absolute_percentage_error, y_true, - y_pred) +def categorical_focal_crossentropy( + y_true, + y_pred, + alpha=0.25, + gamma=2.0, + from_logits=False, + label_smoothing=0.0, + axis=-1, +): + """Computes the categorical focal crossentropy loss. + Standalone usage: + >>> y_true = [[0, 1, 0], [0, 0, 1]] + >>> y_pred = [[0.05, 0.9, 0.05], [0.1, 0.85, 0.05]] + >>> loss = tf.keras.losses.categorical_focal_crossentropy(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> loss.numpy() + array([2.63401289e-04, 6.75912094e-01], dtype=float32) -@keras_export('keras.metrics.mean_squared_logarithmic_error', - 'keras.metrics.msle', 'keras.metrics.MSLE', - 'keras.losses.mean_squared_logarithmic_error', - 'keras.losses.msle', 'keras.losses.MSLE') -@tf.__internal__.dispatch.add_dispatch_support -def mean_squared_logarithmic_error(y_true, y_pred): - """Computes the mean squared logarithmic error between `y_true` and `y_pred`. - - `loss = mean(square(log(y_true + 1) - log(y_pred + 1)), axis=-1)` - - Standalone usage: - - >>> y_true = np.random.randint(0, 2, size=(2, 3)) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.mean_squared_logarithmic_error(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> y_true = np.maximum(y_true, 1e-7) - >>> y_pred = np.maximum(y_pred, 1e-7) - >>> assert np.allclose( - ... loss.numpy(), - ... np.mean( - ... np.square(np.log(y_true + 1.) - np.log(y_pred + 1.)), axis=-1)) - - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - - Returns: - Mean squared logarithmic error values. shape = `[batch_size, d0, .. dN-1]`. 
- """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - first_log = tf.math.log(backend.maximum(y_pred, backend.epsilon()) + 1.) - second_log = tf.math.log(backend.maximum(y_true, backend.epsilon()) + 1.) - return backend.mean( - tf.math.squared_difference(first_log, second_log), axis=-1) - - -@dispatch.dispatch_for_types(mean_squared_logarithmic_error, - tf.RaggedTensor) -def _ragged_tensor_msle(y_true, y_pred): - """Implements support for handling RaggedTensors.""" - return _ragged_tensor_apply_loss(mean_squared_logarithmic_error, y_true, - y_pred) + Args: + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + alpha: A weight balancing factor for all classes, default is `0.25` as + mentioned in the reference. It can be a list of floats or a scalar. + In the multi-class case, alpha may be set by inverse class + frequency by using `compute_class_weight` from `sklearn.utils`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. It helps to gradually reduce the importance given to + simple examples in a smooth manner. When `gamma` = 0, there is + no focal effect on the categorical crossentropy. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability + distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Defaults to -1. The dimension along which the entropy is + computed. + Returns: + Categorical focal crossentropy loss value. + """ + if isinstance(axis, bool): + raise ValueError( + "`axis` must be of type `int`. " + f"Received: axis={axis} of type {type(axis)}" + ) + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype) + + if y_pred.shape[-1] == 1: + warnings.warn( + "In loss categorical_focal_crossentropy, expected " + "y_pred.shape to be (batch_size, num_classes) " + f"with num_classes > 1. Received: y_pred.shape={y_pred.shape}. " + "Consider using 'binary_crossentropy' if you only have 2 classes.", + SyntaxWarning, + stacklevel=2, + ) + + def _smooth_labels(): + num_classes = tf.cast(tf.shape(y_true)[-1], y_pred.dtype) + return y_true * (1.0 - label_smoothing) + ( + label_smoothing / num_classes + ) + + y_true = tf.__internal__.smart_cond.smart_cond( + label_smoothing, _smooth_labels, lambda: y_true + ) -def _maybe_convert_labels(y_true): - """Converts binary labels into -1/1.""" - are_zeros = tf.equal(y_true, 0) - are_ones = tf.equal(y_true, 1) - is_binary = tf.reduce_all(tf.logical_or(are_zeros, are_ones)) + return backend.categorical_focal_crossentropy( + target=y_true, + output=y_pred, + alpha=alpha, + gamma=gamma, + from_logits=from_logits, + axis=axis, + ) - def _convert_binary_labels(): - # Convert the binary labels to -1 or 1. - return 2. * y_true - 1. - updated_y_true = tf.__internal__.smart_cond.smart_cond(is_binary, _convert_binary_labels, - lambda: y_true) - return updated_y_true +@dispatch.dispatch_for_types(categorical_focal_crossentropy, tf.RaggedTensor) +def _ragged_tensor_categorical_focal_crossentropy( + y_true, + y_pred, + alpha=0.25, + gamma=2.0, + from_logits=False, + label_smoothing=0.0, + axis=-1, +): + """Implements support for handling RaggedTensors. + Expected shape: (batch, sequence_len, n_classes) with sequence_len + being variable per batch. 
+ Return shape: (batch, sequence_len). + When used by CategoricalFocalCrossentropy() with the default reduction + (SUM_OVER_BATCH_SIZE), the reduction averages the loss over the + number of elements independent of the batch. E.g. if the RaggedTensor + has 2 batches with [2, 1] values respectively the resulting loss is + the sum of the individual loss values divided by 3. -@keras_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge') -@tf.__internal__.dispatch.add_dispatch_support -def squared_hinge(y_true, y_pred): - """Computes the squared hinge loss between `y_true` and `y_pred`. + Args: + alpha: A weight balancing factor for all classes, default is `0.25` as + mentioned in the reference. It can be a list of floats or a scalar. + In the multi-class case, alpha may be set by inverse class + frequency by using `compute_class_weight` from `sklearn.utils`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. It helps to gradually reduce the importance given to + simple examples in a smooth manner. When `gamma` = 0, there is + no focal effect on the categorical crossentropy. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Defaults to -1. The dimension along which the entropy is + computed. - `loss = mean(square(maximum(1 - y_true * y_pred, 0)), axis=-1)` + Returns: + Categorical focal crossentropy loss value. + """ + fn = functools.partial( + categorical_focal_crossentropy, + alpha=alpha, + gamma=gamma, + from_logits=from_logits, + label_smoothing=label_smoothing, + axis=axis, + ) + return _ragged_tensor_apply_loss(fn, y_true, y_pred) - Standalone usage: - >>> y_true = np.random.choice([-1, 1], size=(2, 3)) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.squared_hinge(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> assert np.array_equal( - ... loss.numpy(), - ... np.mean(np.square(np.maximum(1. - y_true * y_pred, 0.)), axis=-1)) +@keras_export( + "keras.metrics.sparse_categorical_crossentropy", + "keras.losses.sparse_categorical_crossentropy", +) +@tf.__internal__.dispatch.add_dispatch_support +def sparse_categorical_crossentropy( + y_true, y_pred, from_logits=False, axis=-1, ignore_class=None +): + """Computes the sparse categorical crossentropy loss. + + Standalone usage: + + >>> y_true = [1, 2] + >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] + >>> loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> loss.numpy() + array([0.0513, 2.303], dtype=float32) + + >>> y_true = [[[ 0, 2], + ... [-1, -1]], + ... [[ 0, 2], + ... [-1, -1]]] + >>> y_pred = [[[[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], + ... [[0.2, 0.5, 0.3], [0.0, 1.0, 0.0]]], + ... [[[1.0, 0.0, 0.0], [0.0, 0.5, 0.5]], + ... [[0.2, 0.5, 0.3], [0.0, 1.0, 0.0]]]] + >>> loss = tf.keras.losses.sparse_categorical_crossentropy( + ... y_true, y_pred, ignore_class=-1) + >>> loss.numpy() + array([[[2.3841855e-07, 2.3841855e-07], + [0.0000000e+00, 0.0000000e+00]], + [[2.3841855e-07, 6.9314730e-01], + [0.0000000e+00, 0.0000000e+00]]], dtype=float32) - Args: - y_true: The ground truth values. `y_true` values are expected to be -1 or 1. - If binary (0 or 1) labels are provided we will convert them to -1 or 1. - shape = `[batch_size, d0, .. 
dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + Args: + y_true: Ground truth values. + y_pred: The predicted values. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + axis: Defaults to -1. The dimension along which the entropy is + computed. + ignore_class: Optional integer. The ID of a class to be ignored during + loss computation. This is useful, for example, in segmentation + problems featuring a "void" class (commonly -1 or 255) in + segmentation maps. By default (`ignore_class=None`), all classes are + considered. - Returns: - Squared hinge loss values. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - y_true = _maybe_convert_labels(y_true) - return backend.mean( - tf.square(tf.maximum(1. - y_true * y_pred, 0.)), axis=-1) + Returns: + Sparse categorical crossentropy loss value. + """ + return backend.sparse_categorical_crossentropy( + y_true, + y_pred, + from_logits=from_logits, + ignore_class=ignore_class, + axis=axis, + ) -@keras_export('keras.metrics.hinge', 'keras.losses.hinge') -@tf.__internal__.dispatch.add_dispatch_support -def hinge(y_true, y_pred): - """Computes the hinge loss between `y_true` and `y_pred`. +@dispatch.dispatch_for_types(sparse_categorical_crossentropy, tf.RaggedTensor) +def _ragged_tensor_sparse_categorical_crossentropy( + y_true, y_pred, from_logits=False, axis=-1, ignore_class=None +): + """Implements support for handling RaggedTensors. - `loss = mean(maximum(1 - y_true * y_pred, 0), axis=-1)` + Expected y_pred shape: (batch, sequence_len, n_classes) with sequence_len + being variable per batch. + Return shape: (batch, sequence_len). - Standalone usage: + When used by SparseCategoricalCrossentropy() with the default reduction + (SUM_OVER_BATCH_SIZE), the reduction averages the loss over the + number of elements independent of the batch. E.g. if the RaggedTensor + has 2 batches with [2, 1] values respectively, the resulting loss is + the sum of the individual loss values divided by 3. + """ + fn = functools.partial( + sparse_categorical_crossentropy, + from_logits=from_logits, + ignore_class=ignore_class, + axis=axis, + ) + return _ragged_tensor_apply_loss(fn, y_true, y_pred, y_pred_extra_dim=True) - >>> y_true = np.random.choice([-1, 1], size=(2, 3)) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.hinge(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> assert np.array_equal( - ... loss.numpy(), - ... np.mean(np.maximum(1. - y_true * y_pred, 0.), axis=-1)) - Args: - y_true: The ground truth values. `y_true` values are expected to be -1 or 1. - If binary (0 or 1) labels are provided they will be converted to -1 or 1. - shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. +@keras_export( + "keras.metrics.binary_crossentropy", "keras.losses.binary_crossentropy" +) +@tf.__internal__.dispatch.add_dispatch_support +def binary_crossentropy( + y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1 +): + """Computes the binary crossentropy loss. - Returns: - Hinge loss values. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - y_true = _maybe_convert_labels(y_true) - return backend.mean(tf.maximum(1. 
- y_true * y_pred, 0.), axis=-1) + Standalone usage: + >>> y_true = [[0, 1], [0, 0]] + >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] + >>> loss = tf.keras.losses.binary_crossentropy(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> loss.numpy() + array([0.916 , 0.714], dtype=float32) -@keras_export('keras.losses.categorical_hinge') -@tf.__internal__.dispatch.add_dispatch_support -def categorical_hinge(y_true, y_pred): - """Computes the categorical hinge loss between `y_true` and `y_pred`. - - `loss = maximum(neg - pos + 1, 0)` - where `neg=maximum((1-y_true)*y_pred) and pos=sum(y_true*y_pred)` - - Standalone usage: - - >>> y_true = np.random.randint(0, 3, size=(2,)) - >>> y_true = tf.keras.utils.to_categorical(y_true, num_classes=3) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.categorical_hinge(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> pos = np.sum(y_true * y_pred, axis=-1) - >>> neg = np.amax((1. - y_true) * y_pred, axis=-1) - >>> assert np.array_equal(loss.numpy(), np.maximum(0., neg - pos + 1.)) - - Args: - y_true: The ground truth values. `y_true` values are expected to be - either `{-1, +1}` or `{0, 1}` (i.e. a one-hot-encoded tensor). - y_pred: The predicted values. - - Returns: - Categorical hinge loss values. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - pos = tf.reduce_sum(y_true * y_pred, axis=-1) - neg = tf.reduce_max((1. - y_true) * y_pred, axis=-1) - zero = tf.cast(0., y_pred.dtype) - return tf.maximum(neg - pos + 1., zero) - - -@keras_export('keras.losses.huber', v1=[]) -@tf.__internal__.dispatch.add_dispatch_support -def huber(y_true, y_pred, delta=1.0): - """Computes Huber loss value. - - For each value x in `error = y_true - y_pred`: - - ``` - loss = 0.5 * x^2 if |x| <= d - loss = d * |x| - 0.5 * d^2 if |x| > d - ``` - where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss - - Args: - y_true: tensor of true targets. - y_pred: tensor of predicted targets. - delta: A float, the point where the Huber loss function changes from a - quadratic to linear. - - Returns: - Tensor with one scalar loss entry per sample. - """ - y_pred = tf.cast(y_pred, dtype=backend.floatx()) - y_true = tf.cast(y_true, dtype=backend.floatx()) - delta = tf.cast(delta, dtype=backend.floatx()) - error = tf.subtract(y_pred, y_true) - abs_error = tf.abs(error) - half = tf.convert_to_tensor(0.5, dtype=abs_error.dtype) - return backend.mean( - tf.where(abs_error <= delta, half * tf.square(error), - delta * abs_error - half * tf.square(delta)), - axis=-1) - - -@keras_export('keras.losses.log_cosh', 'keras.losses.logcosh', - 'keras.metrics.log_cosh', 'keras.metrics.logcosh') -@tf.__internal__.dispatch.add_dispatch_support -def log_cosh(y_true, y_pred): - """Logarithm of the hyperbolic cosine of the prediction error. + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels by + squeezing them towards 0.5. That is, using + `1. - 0.5 * label_smoothing` for the target class and + `0.5 * label_smoothing` for the non-target class. + axis: The axis along which the mean is computed. Defaults to -1. - `log(cosh(x))` is approximately equal to `(x ** 2) / 2` for small `x` and - to `abs(x) - log(2)` for large `x`.
This means that 'logcosh' works mostly - like the mean squared error, but will not be so strongly affected by the - occasional wildly incorrect prediction. + Returns: + Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype) - Standalone usage: + def _smooth_labels(): + return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing - >>> y_true = np.random.random(size=(2, 3)) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.logcosh(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> x = y_pred - y_true - >>> assert np.allclose( - ... loss.numpy(), - ... np.mean(x + np.log(np.exp(-2. * x) + 1.) - tf.math.log(2.), axis=-1), - ... atol=1e-5) + y_true = tf.__internal__.smart_cond.smart_cond( + label_smoothing, _smooth_labels, lambda: y_true + ) - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + return backend.mean( + backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits), + axis=axis, + ) - Returns: - Logcosh error values. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - def _logcosh(x): - return x + tf.math.softplus(-2. * x) - tf.cast( - tf.math.log(2.), x.dtype) +@dispatch.dispatch_for_types(binary_crossentropy, tf.RaggedTensor) +def _ragged_tensor_binary_crossentropy( + y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1 +): + """Implements support for handling RaggedTensors. - return backend.mean(_logcosh(y_pred - y_true), axis=-1) + Args: + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Axis along which to compute crossentropy. + Returns: + Binary crossentropy loss value. -@keras_export('keras.metrics.categorical_crossentropy', - 'keras.losses.categorical_crossentropy') -@tf.__internal__.dispatch.add_dispatch_support -def categorical_crossentropy(y_true, - y_pred, - from_logits=False, - label_smoothing=0., - axis=-1): - """Computes the categorical crossentropy loss. - - Standalone usage: - - >>> y_true = [[0, 1, 0], [0, 0, 1]] - >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] - >>> loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> loss.numpy() - array([0.0513, 2.303], dtype=float32) - - Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - from_logits: Whether `y_pred` is expected to be a logits tensor. By default, - we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels. - axis: Defaults to -1. The dimension along which the entropy is - computed. - - Returns: - Categorical crossentropy loss value. - """ - if isinstance(axis, bool): - raise ValueError( - f'`axis` must be of type `int`. 
Received: axis={axis} of type {type(axis)}' + Expected shape: (batch, sequence_len) with sequence_len being variable + per batch. + Return shape: (batch,); returns the per batch mean of the loss values. + + When used by BinaryCrossentropy() with the default reduction + (SUM_OVER_BATCH_SIZE), the reduction averages the per batch losses over + the number of batches. + """ + fn = functools.partial( + binary_crossentropy, + from_logits=from_logits, + label_smoothing=label_smoothing, + axis=axis, ) - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype) - - def _smooth_labels(): - num_classes = tf.cast(tf.shape(y_true)[-1], y_pred.dtype) - return y_true * (1.0 - label_smoothing) + (label_smoothing / num_classes) - - y_true = tf.__internal__.smart_cond.smart_cond(label_smoothing, _smooth_labels, - lambda: y_true) - - return backend.categorical_crossentropy( - y_true, y_pred, from_logits=from_logits, axis=axis) - - -@dispatch.dispatch_for_types(categorical_crossentropy, - tf.RaggedTensor) -def _ragged_tensor_categorical_crossentropy(y_true, - y_pred, - from_logits=False, - label_smoothing=0., - axis=-1): - """Implements support for handling RaggedTensors. - - Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - from_logits: Whether `y_pred` is expected to be a logits tensor. By default, - we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels. - axis: The axis along which to compute crossentropy (the features axis). - Defaults to -1. - - Returns: - Categorical crossentropy loss value. - - Expected shape: (batch, sequence_len, n_classes) with sequence_len - being variable per batch. - Return shape: (batch, sequence_len). - - When used by CategoricalCrossentropy() with the default reduction - (SUM_OVER_BATCH_SIZE), the reduction averages the loss over the - number of elements independent of the batch. E.g. if the RaggedTensor - has 2 batches with [2, 1] values respectively the resulting loss is - the sum of the individual loss values divided by 3. - """ - fn = functools.partial( - categorical_crossentropy, - from_logits=from_logits, - label_smoothing=label_smoothing, - axis=axis) - return _ragged_tensor_apply_loss(fn, y_true, y_pred) - - -@keras_export('keras.metrics.sparse_categorical_crossentropy', - 'keras.losses.sparse_categorical_crossentropy') -@tf.__internal__.dispatch.add_dispatch_support -def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): - """Computes the sparse categorical crossentropy loss. - - Standalone usage: - - >>> y_true = [1, 2] - >>> y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] - >>> loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> loss.numpy() - array([0.0513, 2.303], dtype=float32) - - Args: - y_true: Ground truth values. - y_pred: The predicted values. - from_logits: Whether `y_pred` is expected to be a logits tensor. By default, - we assume that `y_pred` encodes a probability distribution. - axis: Defaults to -1. The dimension along which the entropy is - computed. - - Returns: - Sparse categorical crossentropy loss value. 
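The `ignore_class` argument gained by the replacement `sparse_categorical_crossentropy` above is aimed at segmentation-style targets, where a sentinel label (often -1 or 255) marks positions that should not contribute to the loss. A small usage sketch, assuming a TF/Keras version where the parameter is available:

```python
import tensorflow as tf

# Label -1 marks "void" positions that must not contribute to the loss.
y_true = tf.constant([0, 2, -1])
y_pred = tf.constant([[0.9, 0.05, 0.05],
                      [0.1, 0.1, 0.8],
                      [0.3, 0.4, 0.3]])

loss = tf.keras.losses.sparse_categorical_crossentropy(
    y_true, y_pred, ignore_class=-1
)
print(loss.numpy())  # ~[0.105, 0.223, 0.0]; the ignored entry is zeroed
```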
- """ - y_pred = tf.convert_to_tensor(y_pred) - - return backend.sparse_categorical_crossentropy( - y_true, y_pred, from_logits=from_logits, axis=axis) - - -@dispatch.dispatch_for_types(sparse_categorical_crossentropy, - tf.RaggedTensor) -def _ragged_tensor_sparse_categorical_crossentropy(y_true, - y_pred, - from_logits=False, - axis=-1): - """ Implements support for handling RaggedTensors. - - Expected y_pred shape: (batch, sequence_len, n_classes) with sequence_len - being variable per batch. - Return shape: (batch, sequence_len). - - When used by SparseCategoricalCrossentropy() with the default reduction - (SUM_OVER_BATCH_SIZE), the reduction averages the loss over the - number of elements independent of the batch. E.g. if the RaggedTensor - has 2 batches with [2, 1] values respectively, the resulting loss is - the sum of the individual loss values divided by 3. - """ - fn = functools.partial( - sparse_categorical_crossentropy, from_logits=from_logits, axis=axis) - return _ragged_tensor_apply_loss(fn, y_true, y_pred, y_pred_extra_dim=True) - - -@keras_export('keras.metrics.binary_crossentropy', - 'keras.losses.binary_crossentropy') -@tf.__internal__.dispatch.add_dispatch_support -def binary_crossentropy(y_true, - y_pred, - from_logits=False, - label_smoothing=0., - axis=-1): - """Computes the binary crossentropy loss. - - Standalone usage: - - >>> y_true = [[0, 1], [0, 0]] - >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] - >>> loss = tf.keras.losses.binary_crossentropy(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> loss.numpy() - array([0.916 , 0.714], dtype=float32) - - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - from_logits: Whether `y_pred` is expected to be a logits tensor. By default, - we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels by - squeezing them towards 0.5 That is, using `1. - 0.5 * label_smoothing` - for the target class and `0.5 * label_smoothing` for the non-target class. - axis: The axis along which the mean is computed. Defaults to -1. - - Returns: - Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype) - - def _smooth_labels(): - return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing - - y_true = tf.__internal__.smart_cond.smart_cond(label_smoothing, _smooth_labels, - lambda: y_true) - - return backend.mean( - backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits), - axis=axis) + return _ragged_tensor_apply_loss(fn, y_true, y_pred) -@dispatch.dispatch_for_types(binary_crossentropy, tf.RaggedTensor) -def _ragged_tensor_binary_crossentropy(y_true, - y_pred, - from_logits=False, - label_smoothing=0., - axis=-1): - """Implements support for handling RaggedTensors. - - Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - from_logits: Whether `y_pred` is expected to be a logits tensor. By default, - we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels. - axis: Axis along which to compute crossentropy. - - Returns: - Binary crossentropy loss value. 
- - Expected shape: (batch, sequence_len) with sequence_len being variable - per batch. - Return shape: (batch,); returns the per batch mean of the loss values. - - When used by BinaryCrossentropy() with the default reduction - (SUM_OVER_BATCH_SIZE), the reduction averages the per batch losses over - the number of batches. - """ - fn = functools.partial( - binary_crossentropy, - from_logits=from_logits, - label_smoothing=label_smoothing, - axis=axis) - return _ragged_tensor_apply_loss(fn, y_true, y_pred) - - -@keras_export('keras.metrics.binary_focal_crossentropy', - 'keras.losses.binary_focal_crossentropy') +@keras_export( + "keras.metrics.binary_focal_crossentropy", + "keras.losses.binary_focal_crossentropy", +) @tf.__internal__.dispatch.add_dispatch_support def binary_focal_crossentropy( y_true, y_pred, + apply_class_balancing=False, + alpha=0.25, gamma=2.0, from_logits=False, - label_smoothing=0., + label_smoothing=0.0, axis=-1, ): - """Computes the binary focal crossentropy loss. - - According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it - helps to apply a focal factor to down-weight easy examples and focus more on - hard examples. By default, the focal tensor is computed as follows: - - `focal_factor = (1 - output)**gamma` for class 1 - `focal_factor = output**gamma` for class 0 - where `gamma` is a focusing parameter. When `gamma` = 0, this function is - equivalent to the binary crossentropy loss. - - Standalone usage: - - >>> y_true = [[0, 1], [0, 0]] - >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] - >>> loss = tf.keras.losses.binary_focal_crossentropy(y_true, y_pred, gamma=2) - >>> assert loss.shape == (2,) - >>> loss.numpy() - array([0.330, 0.206], dtype=float32) - - Args: - y_true: Ground truth values, of shape `(batch_size, d0, .. dN)`. - y_pred: The predicted values, of shape `(batch_size, d0, .. dN)`. - gamma: A focusing parameter, default is `2.0` as mentioned in the reference. - from_logits: Whether `y_pred` is expected to be a logits tensor. By default, - we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in `[0, 1]`. If higher than 0 then smooth the labels - by squeezing them towards `0.5`, i.e., using `1. - 0.5 * label_smoothing` - for the target class and `0.5 * label_smoothing` for the non-target class. - axis: The axis along which the mean is computed. Defaults to `-1`. - - Returns: - Binary focal crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype) - - def _smooth_labels(): - return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing - - y_true = tf.__internal__.smart_cond.smart_cond(label_smoothing, - _smooth_labels, lambda: y_true) - - return backend.mean( - backend.binary_focal_crossentropy( - target=y_true, - output=y_pred, - gamma=gamma, - from_logits=from_logits, - ), - axis=axis, - ) + """Computes the binary focal crossentropy loss. + + According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it + helps to apply a focal factor to down-weight easy examples and focus more on + hard examples. By default, the focal tensor is computed as follows: + + `focal_factor = (1 - output)**gamma` for class 1 + `focal_factor = output**gamma` for class 0 + where `gamma` is a focusing parameter. When `gamma` = 0, there is no focal + effect on the binary crossentropy loss. 
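A numeric sketch of the focal factor just described, checked against the example values used in the docstring below (plain NumPy, not part of this change):

```python
import numpy as np

y_true = np.array([[0.0, 1.0], [0.0, 0.0]])
y_pred = np.array([[0.6, 0.4], [0.4, 0.6]])
gamma = 2.0

# Elementwise binary crossentropy.
bce = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
# focal_factor = (1 - output)**gamma for class 1, output**gamma for class 0.
focal = y_true * (1 - y_pred) ** gamma + (1 - y_true) * y_pred**gamma

print(np.mean(focal * bce, axis=-1))  # ~[0.330, 0.206]
```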
+ + If `apply_class_balancing == True`, this function also takes into account a + weight balancing factor for the binary classes 0 and 1 as follows: + + `weight = alpha` for class 1 (`target == 1`) + `weight = 1 - alpha` for class 0 + where `alpha` is a float in the range of `[0, 1]`. + + Standalone usage: + + >>> y_true = [[0, 1], [0, 0]] + >>> y_pred = [[0.6, 0.4], [0.4, 0.6]] + >>> loss = tf.keras.losses.binary_focal_crossentropy(y_true, y_pred, + ... gamma=2) + >>> assert loss.shape == (2,) + >>> loss.numpy() + array([0.330, 0.206], dtype=float32) + + Args: + y_true: Ground truth values, of shape `(batch_size, d0, .. dN)`. + y_pred: The predicted values, of shape `(batch_size, d0, .. dN)`. + apply_class_balancing: A bool, whether to apply weight balancing on the + binary classes 0 and 1. + alpha: A weight balancing factor for class 1, default is `0.25` as + mentioned in the reference. The weight for class 0 is `1.0 - alpha`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in `[0, 1]`. If higher than 0 then smooth the + labels by squeezing them towards `0.5`, i.e., using `1. - 0.5 * + label_smoothing` for the target class and `0.5 * label_smoothing` + for the non-target class. + axis: The axis along which the mean is computed. Defaults to `-1`. + + Returns: + Binary focal crossentropy loss value. + shape = `[batch_size, d0, .. dN-1]`. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype) + + def _smooth_labels(): + return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing + + y_true = tf.__internal__.smart_cond.smart_cond( + label_smoothing, _smooth_labels, lambda: y_true + ) + + return backend.mean( + backend.binary_focal_crossentropy( + target=y_true, + output=y_pred, + apply_class_balancing=apply_class_balancing, + alpha=alpha, + gamma=gamma, + from_logits=from_logits, + ), + axis=axis, + ) @dispatch.dispatch_for_types(binary_focal_crossentropy, tf.RaggedTensor) def _ragged_tensor_binary_focal_crossentropy( y_true, y_pred, + apply_class_balancing=False, + alpha=0.25, gamma=2.0, from_logits=False, - label_smoothing=0., + label_smoothing=0.0, axis=-1, ): - """Implements support for handling RaggedTensors. - - Expected shape: `(batch, sequence_len)` with sequence_len being variable per - batch. - Return shape: `(batch,)`; returns the per batch mean of the loss values. - - When used by BinaryFocalCrossentropy() with the default reduction - (SUM_OVER_BATCH_SIZE), the reduction averages the per batch losses over - the number of batches. - - Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - gamma: A focusing parameter, default is `2.0` as mentioned in the reference - [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf). - from_logits: Whether `y_pred` is expected to be a logits tensor. By default, - we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in `[0, 1]`. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels. - axis: Axis along which to compute crossentropy. - - Returns: - Binary focal crossentropy loss value. 
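Extending the sketch above with the class-balancing weight described here; the multiplication order (weight, then focal factor, then crossentropy) is an assumption about the backend computation and is not stated in this hunk:

```python
import numpy as np

y_true = np.array([[0.0, 1.0], [0.0, 0.0]])
y_pred = np.array([[0.6, 0.4], [0.4, 0.6]])
alpha, gamma = 0.25, 2.0

bce = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
focal = y_true * (1 - y_pred) ** gamma + (1 - y_true) * y_pred**gamma
# weight = alpha for class 1, (1 - alpha) for class 0 (assumed ordering).
weight = y_true * alpha + (1 - y_true) * (1 - alpha)

print(np.mean(weight * focal * bce, axis=-1))
```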
- """ - fn = functools.partial( - binary_focal_crossentropy, - gamma=gamma, - from_logits=from_logits, - label_smoothing=label_smoothing, - axis=axis, - ) - return _ragged_tensor_apply_loss(fn, y_true, y_pred) - - -@keras_export('keras.metrics.kl_divergence', - 'keras.metrics.kullback_leibler_divergence', 'keras.metrics.kld', - 'keras.metrics.KLD', 'keras.losses.kl_divergence', - 'keras.losses.kullback_leibler_divergence', 'keras.losses.kld', - 'keras.losses.KLD') + """Implements support for handling RaggedTensors. + + Expected shape: `(batch, sequence_len)` with sequence_len being variable per + batch. + Return shape: `(batch,)`; returns the per batch mean of the loss values. + + When used by BinaryFocalCrossentropy() with the default reduction + (SUM_OVER_BATCH_SIZE), the reduction averages the per batch losses over + the number of batches. + + Args: + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + apply_class_balancing: A bool, whether to apply weight balancing on the + binary classes 0 and 1. + alpha: A weight balancing factor for class 1, default is `0.25` as + mentioned in the reference [Lin et al., 2018]( + https://arxiv.org/pdf/1708.02002.pdf). The weight for class 0 is + `1.0 - alpha`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in `[0, 1]`. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Axis along which to compute crossentropy. + + Returns: + Binary focal crossentropy loss value. + """ + fn = functools.partial( + binary_focal_crossentropy, + apply_class_balancing=apply_class_balancing, + alpha=alpha, + gamma=gamma, + from_logits=from_logits, + label_smoothing=label_smoothing, + axis=axis, + ) + return _ragged_tensor_apply_loss(fn, y_true, y_pred) + + +@keras_export( + "keras.metrics.kl_divergence", + "keras.metrics.kullback_leibler_divergence", + "keras.metrics.kld", + "keras.metrics.KLD", + "keras.losses.kl_divergence", + "keras.losses.kullback_leibler_divergence", + "keras.losses.kld", + "keras.losses.KLD", +) @tf.__internal__.dispatch.add_dispatch_support def kl_divergence(y_true, y_pred): - """Computes Kullback-Leibler divergence loss between `y_true` and `y_pred`. + """Computes Kullback-Leibler divergence loss between `y_true` & `y_pred`. - `loss = y_true * log(y_true / y_pred)` + `loss = y_true * log(y_true / y_pred)` - See: https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence + See: https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence - Standalone usage: + Standalone usage: - >>> y_true = np.random.randint(0, 2, size=(2, 3)).astype(np.float64) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.kullback_leibler_divergence(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> y_true = tf.keras.backend.clip(y_true, 1e-7, 1) - >>> y_pred = tf.keras.backend.clip(y_pred, 1e-7, 1) - >>> assert np.array_equal( - ... 
loss.numpy(), np.sum(y_true * np.log(y_true / y_pred), axis=-1)) + >>> y_true = np.random.randint(0, 2, size=(2, 3)).astype(np.float64) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.kullback_leibler_divergence(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> y_true = tf.keras.backend.clip(y_true, 1e-7, 1) + >>> y_pred = tf.keras.backend.clip(y_pred, 1e-7, 1) + >>> assert np.array_equal( + ... loss.numpy(), np.sum(y_true * np.log(y_true / y_pred), axis=-1)) - Args: - y_true: Tensor of true targets. - y_pred: Tensor of predicted targets. + Args: + y_true: Tensor of true targets. + y_pred: Tensor of predicted targets. - Returns: - A `Tensor` with loss. + Returns: + A `Tensor` with loss. - Raises: - TypeError: If `y_true` cannot be cast to the `y_pred.dtype`. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - y_true = backend.clip(y_true, backend.epsilon(), 1) - y_pred = backend.clip(y_pred, backend.epsilon(), 1) - return tf.reduce_sum(y_true * tf.math.log(y_true / y_pred), axis=-1) + Raises: + TypeError: If `y_true` cannot be cast to the `y_pred.dtype`. + """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + y_true = backend.clip(y_true, backend.epsilon(), 1) + y_pred = backend.clip(y_pred, backend.epsilon(), 1) + return tf.reduce_sum(y_true * tf.math.log(y_true / y_pred), axis=-1) -@keras_export('keras.metrics.poisson', 'keras.losses.poisson') +@keras_export("keras.metrics.poisson", "keras.losses.poisson") @tf.__internal__.dispatch.add_dispatch_support def poisson(y_true, y_pred): - """Computes the Poisson loss between y_true and y_pred. + """Computes the Poisson loss between y_true and y_pred. - The Poisson loss is the mean of the elements of the `Tensor` - `y_pred - y_true * log(y_pred)`. + The Poisson loss is the mean of the elements of the `Tensor` + `y_pred - y_true * log(y_pred)`. - Standalone usage: + Standalone usage: - >>> y_true = np.random.randint(0, 2, size=(2, 3)) - >>> y_pred = np.random.random(size=(2, 3)) - >>> loss = tf.keras.losses.poisson(y_true, y_pred) - >>> assert loss.shape == (2,) - >>> y_pred = y_pred + 1e-7 - >>> assert np.allclose( - ... loss.numpy(), np.mean(y_pred - y_true * np.log(y_pred), axis=-1), - ... atol=1e-5) + >>> y_true = np.random.randint(0, 2, size=(2, 3)) + >>> y_pred = np.random.random(size=(2, 3)) + >>> loss = tf.keras.losses.poisson(y_true, y_pred) + >>> assert loss.shape == (2,) + >>> y_pred = y_pred + 1e-7 + >>> assert np.allclose( + ... loss.numpy(), np.mean(y_pred - y_true * np.log(y_pred), axis=-1), + ... atol=1e-5) - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - Returns: - Poisson loss value. shape = `[batch_size, d0, .. dN-1]`. + Returns: + Poisson loss value. shape = `[batch_size, d0, .. dN-1]`. - Raises: - InvalidArgumentError: If `y_true` and `y_pred` have incompatible shapes. - """ - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.cast(y_true, y_pred.dtype) - return backend.mean( - y_pred - y_true * tf.math.log(y_pred + backend.epsilon()), axis=-1) + Raises: + InvalidArgumentError: If `y_true` and `y_pred` have incompatible shapes. 
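A hand check of the Poisson formula quoted above; the `1e-7` stabilizer mirrors the docstring's own standalone usage (a sketch, not part of this change):

```python
import numpy as np
import tensorflow as tf

y_true = np.array([[1.0, 0.0, 1.0]])
y_pred = np.array([[0.7, 0.2, 0.9]])

# Mean of y_pred - y_true * log(y_pred), with the epsilon the backend adds.
manual = np.mean(y_pred - y_true * np.log(y_pred + 1e-7), axis=-1)
keras_loss = tf.keras.losses.poisson(y_true, y_pred)

np.testing.assert_allclose(keras_loss.numpy(), manual, atol=1e-5)
```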
+ """ + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.cast(y_true, y_pred.dtype) + return backend.mean( + y_pred - y_true * tf.math.log(y_pred + backend.epsilon()), axis=-1 + ) @keras_export( - 'keras.losses.cosine_similarity', + "keras.losses.cosine_similarity", v1=[ - 'keras.metrics.cosine_proximity', - 'keras.metrics.cosine', - 'keras.losses.cosine_proximity', - 'keras.losses.cosine', - 'keras.losses.cosine_similarity', - ]) + "keras.metrics.cosine_proximity", + "keras.metrics.cosine", + "keras.losses.cosine_proximity", + "keras.losses.cosine", + "keras.losses.cosine_similarity", + ], +) @tf.__internal__.dispatch.add_dispatch_support def cosine_similarity(y_true, y_pred, axis=-1): - """Computes the cosine similarity between labels and predictions. + """Computes the cosine similarity between labels and predictions. - Note that it is a number between -1 and 1. When it is a negative number - between -1 and 0, 0 indicates orthogonality and values closer to -1 - indicate greater similarity. The values closer to 1 indicate greater - dissimilarity. This makes it usable as a loss function in a setting - where you try to maximize the proximity between predictions and - targets. If either `y_true` or `y_pred` is a zero vector, cosine - similarity will be 0 regardless of the proximity between predictions - and targets. + Note that it is a number between -1 and 1. When it is a negative number + between -1 and 0, 0 indicates orthogonality and values closer to -1 + indicate greater similarity. The values closer to 1 indicate greater + dissimilarity. This makes it usable as a loss function in a setting + where you try to maximize the proximity between predictions and + targets. If either `y_true` or `y_pred` is a zero vector, cosine + similarity will be 0 regardless of the proximity between predictions + and targets. - `loss = -sum(l2_norm(y_true) * l2_norm(y_pred))` + `loss = -sum(l2_norm(y_true) * l2_norm(y_pred))` - Standalone usage: + Standalone usage: - >>> y_true = [[0., 1.], [1., 1.], [1., 1.]] - >>> y_pred = [[1., 0.], [1., 1.], [-1., -1.]] - >>> loss = tf.keras.losses.cosine_similarity(y_true, y_pred, axis=1) - >>> loss.numpy() - array([-0., -0.999, 0.999], dtype=float32) - - Args: - y_true: Tensor of true targets. - y_pred: Tensor of predicted targets. - axis: Axis along which to determine similarity. - - Returns: - Cosine similarity tensor. - """ - y_true = tf.linalg.l2_normalize(y_true, axis=axis) - y_pred = tf.linalg.l2_normalize(y_pred, axis=axis) - return -tf.reduce_sum(y_true * y_pred, axis=axis) + >>> y_true = [[0., 1.], [1., 1.], [1., 1.]] + >>> y_pred = [[1., 0.], [1., 1.], [-1., -1.]] + >>> loss = tf.keras.losses.cosine_similarity(y_true, y_pred, axis=1) + >>> loss.numpy() + array([-0., -0.999, 0.999], dtype=float32) + Args: + y_true: Tensor of true targets. + y_pred: Tensor of predicted targets. + axis: Axis along which to determine similarity. -@keras_export('keras.losses.CosineSimilarity') -class CosineSimilarity(LossFunctionWrapper): - """Computes the cosine similarity between labels and predictions. - - Note that it is a number between -1 and 1. When it is a negative number - between -1 and 0, 0 indicates orthogonality and values closer to -1 - indicate greater similarity. The values closer to 1 indicate greater - dissimilarity. This makes it usable as a loss function in a setting - where you try to maximize the proximity between predictions and targets. 
- If either `y_true` or `y_pred` is a zero vector, cosine similarity will be 0 - regardless of the proximity between predictions and targets. - - `loss = -sum(l2_norm(y_true) * l2_norm(y_pred))` - - Standalone usage: - - >>> y_true = [[0., 1.], [1., 1.]] - >>> y_pred = [[1., 0.], [1., 1.]] - >>> # Using 'auto'/'sum_over_batch_size' reduction type. - >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1) - >>> # l2_norm(y_true) = [[0., 1.], [1./1.414, 1./1.414]] - >>> # l2_norm(y_pred) = [[1., 0.], [1./1.414, 1./1.414]] - >>> # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] - >>> # loss = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) - >>> # = -((0. + 0.) + (0.5 + 0.5)) / 2 - >>> cosine_loss(y_true, y_pred).numpy() - -0.5 - - >>> # Calling with 'sample_weight'. - >>> cosine_loss(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() - -0.0999 - - >>> # Using 'sum' reduction type. - >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1, - ... reduction=tf.keras.losses.Reduction.SUM) - >>> cosine_loss(y_true, y_pred).numpy() - -0.999 - - >>> # Using 'none' reduction type. - >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1, - ... reduction=tf.keras.losses.Reduction.NONE) - >>> cosine_loss(y_true, y_pred).numpy() - array([-0., -0.999], dtype=float32) - - Usage with the `compile()` API: - - ```python - model.compile(optimizer='sgd', loss=tf.keras.losses.CosineSimilarity(axis=1)) - ``` - - Args: - axis: The axis along which the cosine similarity is computed - (the features axis). Defaults to -1. - reduction: Type of `tf.keras.losses.Reduction` to apply to loss. - Default value is `AUTO`. `AUTO` indicates that the reduction option will - be determined by the usage context. For almost all cases this defaults to - `SUM_OVER_BATCH_SIZE`. When used with `tf.distribute.Strategy`, outside of - built-in training loops such as `tf.keras` `compile` and `fit`, using - `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an error. Please see this - custom training [tutorial] - (https://www.tensorflow.org/tutorials/distribute/custom_training) for more - details. - name: Optional name for the instance. - """ - - def __init__(self, - axis=-1, - reduction=losses_utils.ReductionV2.AUTO, - name='cosine_similarity'): - super().__init__( - cosine_similarity, reduction=reduction, name=name, axis=axis) + Returns: + Cosine similarity tensor. + """ + y_true = tf.linalg.l2_normalize(y_true, axis=axis) + y_pred = tf.linalg.l2_normalize(y_pred, axis=axis) + return -tf.reduce_sum(y_true * y_pred, axis=axis) # Aliases. @@ -2311,95 +2853,126 @@ def __init__(self, def is_categorical_crossentropy(loss): - result = ((isinstance(loss, CategoricalCrossentropy) or - (isinstance(loss, LossFunctionWrapper) and - loss.fn == categorical_crossentropy) or - (hasattr(loss, '__name__') and - loss.__name__ == 'categorical_crossentropy') or - (loss == 'categorical_crossentropy'))) - return result + result = ( + isinstance(loss, CategoricalCrossentropy) + or ( + isinstance(loss, LossFunctionWrapper) + and loss.fn == categorical_crossentropy + ) + or ( + hasattr(loss, "__name__") + and loss.__name__ == "categorical_crossentropy" + ) + or (loss == "categorical_crossentropy") + ) + return result -@keras_export('keras.losses.serialize') -def serialize(loss): - """Serializes loss function or `Loss` instance. +@keras_export("keras.losses.serialize") +def serialize(loss, use_legacy_format=False): + """Serializes loss function or `Loss` instance. - Args: - loss: A Keras `Loss` instance or a loss function. 
+ Args: + loss: A Keras `Loss` instance or a loss function. + use_legacy_format: Boolean, whether to use the legacy serialization + format. Defaults to `False`. - Returns: - Loss configuration dictionary. - """ - return serialize_keras_object(loss) + Returns: + Loss configuration dictionary. + """ + if loss is None: + return None + if not isinstance(loss, Loss): + warnings.warn( + "The `keras.losses.serialize()` API should only be used for " + "objects of type `keras.losses.Loss`. Found an instance of type " + f"{type(loss)}, which may lead to improper serialization." + ) + if use_legacy_format: + return legacy_serialization.serialize_keras_object(loss) + return serialize_keras_object(loss) + + +@keras_export("keras.losses.deserialize") +def deserialize(name, custom_objects=None, use_legacy_format=False): + """Deserializes a serialized loss class/function instance. + Args: + name: Loss configuration. + custom_objects: Optional dictionary mapping names (strings) to custom + objects (classes and functions) to be considered during + deserialization. + use_legacy_format: Boolean, whether to use the legacy serialization + format. Defaults to `False`. -@keras_export('keras.losses.deserialize') -def deserialize(name, custom_objects=None): - """Deserializes a serialized loss class/function instance. + Returns: + A Keras `Loss` instance or a loss function. + """ + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + name, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="loss function", + ) + return deserialize_keras_object( + name, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="loss function", + ) - Args: - name: Loss configuration. - custom_objects: Optional dictionary mapping names (strings) to custom - objects (classes and functions) to be considered during deserialization. - Returns: - A Keras `Loss` instance or a loss function. - """ - return deserialize_keras_object( - name, - module_objects=globals(), - custom_objects=custom_objects, - printable_module_name='loss function') +@keras_export("keras.losses.get") +def get(identifier): + """Retrieves a Keras loss as a `function`/`Loss` class instance. + The `identifier` may be the string name of a loss function or `Loss` class. -@keras_export('keras.losses.get') -def get(identifier): - """Retrieves a Keras loss as a `function`/`Loss` class instance. - - The `identifier` may be the string name of a loss function or `Loss` class. - - >>> loss = tf.keras.losses.get("categorical_crossentropy") - >>> type(loss) - - >>> loss = tf.keras.losses.get("CategoricalCrossentropy") - >>> type(loss) - - - You can also specify `config` of the loss to this function by passing dict - containing `class_name` and `config` as an identifier. Also note that the - `class_name` must map to a `Loss` class - - >>> identifier = {"class_name": "CategoricalCrossentropy", - ... "config": {"from_logits": True}} - >>> loss = tf.keras.losses.get(identifier) - >>> type(loss) - - - Args: - identifier: A loss identifier. One of None or string name of a loss - function/class or loss configuration dictionary or a loss function or a - loss class instance. - - Returns: - A Keras loss as a `function`/ `Loss` class instance. - - Raises: - ValueError: If `identifier` cannot be interpreted. 
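The three identifier forms accepted by `get()` as documented above, in one usage sketch (not part of this change):

```python
import tensorflow as tf

loss_fn = tf.keras.losses.get("categorical_crossentropy")  # plain function
loss_obj = tf.keras.losses.get("CategoricalCrossentropy")  # Loss instance
loss_cfg = tf.keras.losses.get(
    {"class_name": "CategoricalCrossentropy", "config": {"from_logits": True}}
)

print(callable(loss_fn), type(loss_obj).__name__)  # True CategoricalCrossentropy
print(loss_cfg.get_config()["from_logits"])  # True
```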
- """ - if identifier is None: - return None - if isinstance(identifier, str): - identifier = str(identifier) - return deserialize(identifier) - if isinstance(identifier, dict): - return deserialize(identifier) - if callable(identifier): - return identifier - raise ValueError( - f'Could not interpret loss function identifier: {identifier}') + >>> loss = tf.keras.losses.get("categorical_crossentropy") + >>> type(loss) + + >>> loss = tf.keras.losses.get("CategoricalCrossentropy") + >>> type(loss) + + + You can also specify `config` of the loss to this function by passing dict + containing `class_name` and `config` as an identifier. Also note that the + `class_name` must map to a `Loss` class + + >>> identifier = {"class_name": "CategoricalCrossentropy", + ... "config": {"from_logits": True}} + >>> loss = tf.keras.losses.get(identifier) + >>> type(loss) + + + Args: + identifier: A loss identifier. One of None or string name of a loss + function/class or loss configuration dictionary or a loss function + or a loss class instance. + + Returns: + A Keras loss as a `function`/ `Loss` class instance. + + Raises: + ValueError: If `identifier` cannot be interpreted. + """ + if identifier is None: + return None + if isinstance(identifier, str): + identifier = str(identifier) + use_legacy_format = "module" not in identifier + return deserialize(identifier, use_legacy_format=use_legacy_format) + if isinstance(identifier, dict): + return deserialize(identifier) + if callable(identifier): + return identifier + raise ValueError( + f"Could not interpret loss function identifier: {identifier}" + ) LABEL_DTYPES_FOR_LOSSES = { - tf.compat.v1.losses.sparse_softmax_cross_entropy: 'int32', - sparse_categorical_crossentropy: 'int32' + tf.compat.v1.losses.sparse_softmax_cross_entropy: "int32", + sparse_categorical_crossentropy: "int32", } diff --git a/keras/losses_test.py b/keras/losses_test.py index 382c9b132a3c..ba4203483c96 100644 --- a/keras/losses_test.py +++ b/keras/losses_test.py @@ -14,2068 +14,2988 @@ # ============================================================================== """Tests for Keras loss functions.""" -import tensorflow.compat.v2 as tf +import warnings -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized -from tensorflow.python.autograph.impl import api as autograph from keras import activations from keras import backend -from keras.testing_infra import test_combinations from keras import losses +from keras.testing_infra import test_combinations from keras.utils import losses_utils +# isort: off +from tensorflow.python.autograph.impl import ( + api as autograph, +) + ALL_LOSSES = [ - losses.mean_squared_error, losses.mean_absolute_error, + losses.mean_squared_error, + losses.mean_absolute_error, losses.mean_absolute_percentage_error, - losses.mean_squared_logarithmic_error, losses.squared_hinge, losses.hinge, - losses.categorical_crossentropy, losses.binary_crossentropy, - losses.kl_divergence, losses.poisson, losses.cosine_similarity, - losses.log_cosh, losses.categorical_hinge + losses.mean_squared_logarithmic_error, + losses.squared_hinge, + losses.hinge, + losses.categorical_crossentropy, + losses.binary_crossentropy, + losses.kl_divergence, + losses.poisson, + losses.cosine_similarity, + losses.log_cosh, + losses.categorical_hinge, ] class KerasLossesTest(tf.test.TestCase, parameterized.TestCase): - - def test_objective_shapes_3d(self): - with self.cached_session(): - y_a = 
backend.variable(np.random.random((5, 6, 7))) - y_b = backend.variable(np.random.random((5, 6, 7))) - for obj in ALL_LOSSES: - objective_output = obj(y_a, y_b) - self.assertListEqual(objective_output.shape.as_list(), [5, 6]) - - def test_objective_shapes_2d(self): - with self.cached_session(): - y_a = backend.variable(np.random.random((6, 7))) - y_b = backend.variable(np.random.random((6, 7))) - for obj in ALL_LOSSES: - objective_output = obj(y_a, y_b) - self.assertListEqual(objective_output.shape.as_list(), [ - 6, - ]) - - def test_cce_one_hot(self): - with self.cached_session(): - y_a = backend.variable(np.random.randint(0, 7, (5, 6))) - y_b = backend.variable(np.random.random((5, 6, 7))) - objective_output = losses.sparse_categorical_crossentropy(y_a, y_b) - assert backend.eval(objective_output).shape == (5, 6) - - y_a = backend.variable(np.random.randint(0, 7, (6,))) - y_b = backend.variable(np.random.random((6, 7))) - objective_output = losses.sparse_categorical_crossentropy(y_a, y_b) - assert backend.eval(objective_output).shape == (6,) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_categorical_crossentropy_loss(self): - target = backend.variable(np.random.randint(0, 1, (5, 1))) - logits = backend.variable(np.random.random((5, 1))) - softmax_output = backend.softmax(logits) - output_from_logit = losses.categorical_crossentropy( - target, logits, from_logits=True) - output_from_softmax = losses.categorical_crossentropy( - target, softmax_output) - np.testing.assert_allclose( - backend.eval(output_from_logit), - backend.eval(output_from_softmax), - atol=1e-5) - - axis = 0 - output_from_logit_axis = losses.categorical_crossentropy( - target, logits, from_logits=True, axis=axis) - output_from_softmax_axis = losses.categorical_crossentropy( - target, softmax_output, axis=axis) - - np.testing.assert_allclose( - backend.eval(output_from_logit_axis), - backend.eval(output_from_softmax_axis), - atol=1e-5) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_categorical_crossentropy_loss_with_unknown_rank_tensor(self): - t = backend.placeholder() - p = backend.placeholder() - o = losses.categorical_crossentropy(t, p) - - t_val = tf.convert_to_tensor([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) - p_val = tf.convert_to_tensor([[.9, .05, .05], [.05, .89, .06], - [.05, .01, .94]]) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.105, .116, .062], 1e-3) - - # from logits - p_val = tf.convert_to_tensor([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - o = losses.categorical_crossentropy(t, p, from_logits=True) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.002, 0, .17], 1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_sparse_categorical_crossentropy_loss(self): - target = backend.variable(np.random.randint(0, 1, (5, 1))) - logits = backend.variable(np.random.random((5, 1))) - softmax_output = backend.softmax(logits) - output_from_logit = losses.sparse_categorical_crossentropy( - target, logits, from_logits=True) - output_from_softmax = losses.sparse_categorical_crossentropy( - target, softmax_output) - np.testing.assert_allclose( - backend.eval(output_from_logit), - backend.eval(output_from_softmax), - atol=1e-5) - - @test_combinations.generate(test_combinations.combine(mode=['graph'])) - def 
test_sparse_categorical_crossentropy_loss_with_unknown_rank_tensor(self): - # This test only runs in graph because the TF op layer is not supported yet - # for sparse ops. - t = backend.placeholder() - p = backend.placeholder() - o = losses.sparse_categorical_crossentropy(t, p) - - t_val = tf.convert_to_tensor([0, 1, 2]) - p_val = tf.convert_to_tensor([[.9, .05, .05], [.05, .89, .06], - [.05, .01, .94]]) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.105, .116, .062], 1e-3) - - # from logits - p_val = tf.convert_to_tensor([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - o = losses.sparse_categorical_crossentropy(t, p, from_logits=True) - f = backend.function([t, p], o) - - result = f([t_val, p_val]) - self.assertArrayNear(result, [.002, 0, .17], 1e-3) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_sparse_categorical_crossentropy_with_float16(self): - # See https://github.com/keras-team/keras/issues/15012 for more details. - # we don't cast y_true to have same dtype as y_pred, since y_pred could be - # float16 which has a small upbound, and the casting could cause an - # underflow. The y_true will be used as int64 anyway. - - # create 2 observations with 2049 labels, since 2048 is the largest number - # for float16 - y_true = [0, 2049] - # should result in a loss close to 0 since predicting y_true perfectly - y_pred = np.zeros((2, 2050)) - y_pred[0][0] = 1 - y_pred[1][2049] = 1 - y_pred_16 = tf.convert_to_tensor(y_pred, dtype=tf.float16) - - # If we did a cast for y_true to float16 in SparseCategoricalCrossentropy, - # then the loss will not be zero. - scce = losses.SparseCategoricalCrossentropy() - self.assertAllClose(scce(y_true, y_pred_16).numpy(), 0.0, atol=1e-3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_crossentropy_loss(self): - target = backend.variable(np.random.randint(0, 1, (5, 1))) - logits = backend.variable(np.random.random((5, 1))) - sigmoid_output = backend.sigmoid(logits) - output_from_logit = losses.binary_crossentropy( - target, logits, from_logits=True) - output_from_sigmoid = losses.binary_crossentropy(target, sigmoid_output) - np.testing.assert_allclose( - backend.eval(output_from_logit), - backend.eval(output_from_sigmoid), - atol=1e-5) - - axis = 0 - output_from_logit_axis = losses.binary_crossentropy( - target, logits, from_logits=True, axis=axis) - output_from_sigmoid_axis = losses.binary_crossentropy( - target, sigmoid_output, axis=axis) - - np.testing.assert_allclose( - backend.eval(output_from_logit_axis), - backend.eval(output_from_sigmoid_axis), - atol=1e-5) - - def test_get_bce(self): - bce_fn = losses.get('bce') - self.assertEqual(bce_fn, losses.binary_crossentropy) - - def test_serialization(self): - fn = losses.get('mse') - config = losses.serialize(fn) - new_fn = losses.deserialize(config) - self.assertEqual(fn, new_fn) - - def test_categorical_hinge(self): - y_pred = backend.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) - y_true = backend.variable(np.array([[0, 1, 0], [1, 0, 0]])) - expected_loss = ((0.3 - 0.2 + 1) + (0.7 - 0.1 + 1)) / 2.0 - loss = backend.eval(losses.categorical_hinge(y_true, y_pred)) - self.assertAllClose(expected_loss, np.mean(loss)) - - def test_loss_wrapper(self): - loss_fn = losses.get('mse') - mse_obj = losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__) - - self.assertEqual(mse_obj.name, 'mean_squared_error') - self.assertEqual(mse_obj.reduction, 
losses_utils.ReductionV2.AUTO) - - y_true = tf.constant([[1., 9.], [2., 5.]]) - y_pred = tf.constant([[4., 8.], [12., 3.]]) - sample_weight = tf.constant([1.2, 0.5]) - loss = mse_obj(y_true, y_pred, sample_weight=sample_weight) - - # mse = [((4 - 1)^2 + (8 - 9)^2) / 2, ((12 - 2)^2 + (3 - 5)^2) / 2] - # mse = [5, 52] - # weighted_mse = [5 * 1.2, 52 * 0.5] = [6, 26] - # reduced_weighted_mse = (6 + 26) / 2 = - self.assertAllClose(self.evaluate(loss), 16, 1e-2) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_loss_wrapper_autograph(self): - # Test that functions with control flow wrapped in a LossFunctionWrapper - # get autographed when in a tf.function - def loss_fn(y_true, y_pred): - mse_loss_fn = losses.get('mse') - if tf.reduce_mean(y_true) > 0: - return mse_loss_fn(y_true, y_pred) - else: - return mse_loss_fn(y_true, y_pred) - - mse_obj = losses.LossFunctionWrapper(loss_fn) - - y_true = tf.constant([[1., 9.], [2., 5.]]) - y_pred = tf.constant([[4., 8.], [12., 3.]]) - sample_weight = tf.constant([1.2, 0.5]) - - @tf.function - def tf_functioned_loss_fn(y_true, y_pred, sample_weight=None): - return mse_obj(y_true, y_pred, sample_weight=sample_weight) - - loss = tf_functioned_loss_fn(y_true, y_pred, sample_weight=sample_weight) - - # mse = [((4 - 1)^2 + (8 - 9)^2) / 2, ((12 - 2)^2 + (3 - 5)^2) / 2] - # mse = [5, 52] - # weighted_mse = [5 * 1.2, 52 * 0.5] = [6, 26] - # reduced_weighted_mse = (6 + 26) / 2 = - self.assertAllClose(self.evaluate(loss), 16, 1e-2) - - def test_loss_wrapper_dtype(self): - # Make sure the loss wrapper doesn't cause any numerical precision loss - # during calculation. See https://github.com/keras-team/keras/issues/15791 - x = tf.convert_to_tensor([[2.1]], dtype=tf.float64) - y_true = tf.square(x) - y_pred = tf.convert_to_tensor([[3.68]], dtype=tf.float64) - - # TF loss - loss = losses.MeanSquaredError() - tf_loss = loss(y_pred, y_true) - - # manually computed loss in 64-bit - man_loss64 = tf.squeeze(tf.square(y_pred - y_true)) - - self.assertEqual(tf_loss.dtype, tf.float64) - # Make a smaller atol to ensure the float64 precision is hold. - self.assertAllClose(self.evaluate(tf_loss), self.evaluate(man_loss64), - atol=1e-8) - - def test_invalid_reduction(self): - with self.assertRaisesRegex(ValueError, 'Invalid Reduction Key: Foo.'): - losses.MeanSquaredError(reduction='Foo') - - mse_obj = losses.MeanSquaredError() - y = tf.constant([1]) - mse_obj.reduction = 'Bar' - with self.assertRaisesRegex(ValueError, 'Invalid Reduction Key: Bar.'): - mse_obj(y, y) - - def test_deserialization_error(self): - with self.assertRaisesRegex(ValueError, 'Could not interpret loss'): - losses.get(0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_binary_crossentropy_uses_cached_logits(self): - logits = tf.constant([[-30., 30.]]) - y_pred = activations.sigmoid(logits) - self.assertTrue(hasattr(y_pred, '_keras_logits')) - y_true = tf.constant([[0., 1.]]) - loss = losses.binary_crossentropy(y_true, y_pred)[0] - # Check that logits are used. If y_pred is used directly, loss will - # collapse to 0 from underflow. - self.assertNotEqual(self.evaluate(loss), 0.) 
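Context for the cached-logits tests above and below: `keras.activations.sigmoid` attaches the original logits to its output (the `_keras_logits` attribute the tests assert on), so the loss can be recomputed from logits even when given probabilities. A sketch of why that matters at saturating logits (illustrative, not part of this change):

```python
import tensorflow as tf

logits = tf.constant([[-30.0, 30.0]])
y_true = tf.constant([[0.0, 1.0]])

# tf.sigmoid does NOT cache logits, so this path really uses probabilities,
# which saturate to 0. and 1. in float32.
probs = tf.sigmoid(logits)

from_probs = tf.keras.losses.binary_crossentropy(y_true, probs)
from_logits = tf.keras.losses.binary_crossentropy(y_true, logits, from_logits=True)

# The probability path can only reflect the clipping epsilon (~1e-7);
# the logits path recovers the true, much smaller loss (~1e-13).
print(from_probs.numpy(), from_logits.numpy())
```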
- - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_categorical_crossentropy_uses_cached_logits(self): - logits = tf.constant([[-5., 0., 5.]]) - y_pred = activations.softmax(logits) - self.assertTrue(hasattr(y_pred, '_keras_logits')) - y_true = tf.constant([[0., 0., 1.]]) - loss = losses.categorical_crossentropy(y_true, logits, from_logits=True)[0] - # Check that logits are used. If y_pred is used directly, loss will - # collapse to 0 from underflow. - self.assertNotEqual(self.evaluate(loss), 0.) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_sparse_categorical_crossentropy_uses_cached_logits(self): - logits = tf.constant([[-5., 0., 5.]]) - y_pred = activations.softmax(logits) - self.assertTrue(hasattr(y_pred, '_keras_logits')) - y_true = tf.constant([2]) - loss = losses.sparse_categorical_crossentropy( - y_true, logits, from_logits=True)[0] - # Check that logits are used. If y_pred is used directly, loss will - # collapse to 0 from underflow. - self.assertNotEqual(self.evaluate(loss), 0.) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_loss_not_autographed_in_eager(self): - - class MyLoss(losses.Loss): - - def call(self, y_true, y_pred): - return y_true - y_pred - - loss = MyLoss() - y_true = tf.constant([[0., 0., 0.]]) - y_pred = tf.constant([[1., 1., 1.]]) - - def tf_convert(fn, _): - assert False, 'Function should not be autographed.' - return fn - - with tf.compat.v1.test.mock.patch.object(autograph, 'tf_convert', - tf_convert): - loss(y_true, y_pred) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_objective_shapes_3d(self): + with self.cached_session(): + y_a = backend.variable(np.random.random((5, 6, 7))) + y_b = backend.variable(np.random.random((5, 6, 7))) + for obj in ALL_LOSSES: + objective_output = obj(y_a, y_b) + self.assertListEqual(objective_output.shape.as_list(), [5, 6]) + + def test_objective_shapes_2d(self): + with self.cached_session(): + y_a = backend.variable(np.random.random((6, 7))) + y_b = backend.variable(np.random.random((6, 7))) + for obj in ALL_LOSSES: + objective_output = obj(y_a, y_b) + self.assertListEqual( + objective_output.shape.as_list(), + [ + 6, + ], + ) + + def test_cce_one_hot(self): + with self.cached_session(): + y_a = backend.variable(np.random.randint(0, 7, (5, 6))) + y_b = backend.variable(np.random.random((5, 6, 7))) + objective_output = losses.sparse_categorical_crossentropy(y_a, y_b) + assert backend.eval(objective_output).shape == (5, 6) + + y_a = backend.variable(np.random.randint(0, 7, (6,))) + y_b = backend.variable(np.random.random((6, 7))) + objective_output = losses.sparse_categorical_crossentropy(y_a, y_b) + assert backend.eval(objective_output).shape == (6,) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_crossentropy_loss(self): + target = backend.variable(np.random.randint(0, 1, (5, 1))) + logits = backend.variable(np.random.random((5, 1))) + softmax_output = backend.softmax(logits) + output_from_logit = losses.categorical_crossentropy( + target, logits, from_logits=True + ) + output_from_softmax = losses.categorical_crossentropy( + target, softmax_output + ) + np.testing.assert_allclose( + backend.eval(output_from_logit), + backend.eval(output_from_softmax), + atol=1e-5, + ) + + axis = 0 + output_from_logit_axis = losses.categorical_crossentropy( + target, logits, from_logits=True, 
axis=axis + ) + output_from_softmax_axis = losses.categorical_crossentropy( + target, softmax_output, axis=axis + ) + + np.testing.assert_allclose( + backend.eval(output_from_logit_axis), + backend.eval(output_from_softmax_axis), + atol=1e-5, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_crossentropy_loss_with_unknown_rank_tensor(self): + t = backend.placeholder() + p = backend.placeholder() + o = losses.categorical_crossentropy(t, p) + + t_val = tf.convert_to_tensor( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + ) + p_val = tf.convert_to_tensor( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.06], [0.05, 0.01, 0.94]] + ) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.105, 0.116, 0.062], 1e-3) + + # from logits + p_val = tf.convert_to_tensor( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + o = losses.categorical_crossentropy(t, p, from_logits=True) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.002, 0, 0.17], 1e-3) + + def test_categorical_crossentropy_loss_different_axis(self): + target = backend.variable(np.random.randint(0, 1, (5, 2, 3))) + logits = backend.variable(np.random.random((5, 2, 3))) + softmax_output = backend.softmax(logits) + axis = 1 + output_from_logit_axis = losses.categorical_crossentropy( + target, logits, from_logits=True, axis=axis + ) + output_from_softmax_axis = losses.categorical_crossentropy( + target, softmax_output, axis=axis + ) + np.testing.assert_allclose( + backend.eval(output_from_logit_axis), + backend.eval(output_from_softmax_axis), + atol=1e-5, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_categorical_crossentropy_loss(self): + target = backend.variable(np.random.randint(0, 1, (5, 1))) + logits = backend.variable(np.random.random((5, 1))) + softmax_output = backend.softmax(logits) + output_from_logit = losses.sparse_categorical_crossentropy( + target, logits, from_logits=True + ) + output_from_softmax = losses.sparse_categorical_crossentropy( + target, softmax_output + ) + np.testing.assert_allclose( + backend.eval(output_from_logit), + backend.eval(output_from_softmax), + atol=1e-5, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_categorical_crossentropy_loss_with_ignore_class(self): + ignore_class = 255 + target = backend.variable(np.random.randint(0, 1, (5, 1))) + logits = backend.variable(np.random.random((5, 1))) + softmax_output = backend.softmax(logits) + + _valid = tf.constant([[0], [1], [0], [1], [1]], target.dtype) + target.assign(target * _valid + (1 - _valid) * ignore_class) + + output_from_logit = losses.sparse_categorical_crossentropy( + target, logits, ignore_class=ignore_class, from_logits=True + ) + output_from_softmax = losses.sparse_categorical_crossentropy( + target, softmax_output, ignore_class=ignore_class + ) + + # expected_mask = [False, True, False, True, True] + # for o in (output_from_logit, output_from_softmax): + # mask = backend.eval(losses_utils.get_mask(o)) + # np.testing.assert_array_equal(mask, expected_mask) + + np.testing.assert_allclose( + backend.eval(output_from_logit), + backend.eval(output_from_softmax), + atol=1e-5, + ) + + @test_combinations.generate(test_combinations.combine(mode=["graph"])) + def test_sparse_categorical_crossentropy_loss_with_unknown_rank_tensor( + self, + ): + # This test only runs 
in graph because the TF op layer is not supported + # yet for sparse ops. + t = backend.placeholder() + p = backend.placeholder() + o = losses.sparse_categorical_crossentropy(t, p) + + t_val = tf.convert_to_tensor([0, 1, 2]) + p_val = tf.convert_to_tensor( + [[0.9, 0.05, 0.05], [0.05, 0.89, 0.06], [0.05, 0.01, 0.94]] + ) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.105, 0.116, 0.062], 1e-3) + + # from logits + p_val = tf.convert_to_tensor( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + o = losses.sparse_categorical_crossentropy(t, p, from_logits=True) + f = backend.function([t, p], o) + + result = f([t_val, p_val]) + self.assertArrayNear(result, [0.002, 0, 0.17], 1e-3) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_sparse_categorical_crossentropy_with_float16(self): + # See https://github.com/keras-team/keras/issues/15012 for more details. + # We don't cast y_true to have the same dtype as y_pred, since y_pred + # could be float16, which has a small upper bound, and the casting + # could cause an underflow. The y_true will be used as int64 anyway. + + # create 2 observations with label values up to 2049, the first integer + # that float16 cannot represent exactly + y_true = [0, 2049] + # should result in a loss close to 0 since predicting y_true perfectly + y_pred = np.zeros((2, 2050)) + y_pred[0][0] = 1 + y_pred[1][2049] = 1 + y_pred_16 = tf.convert_to_tensor(y_pred, dtype=tf.float16) + + # If we did a cast for y_true to float16 in + # SparseCategoricalCrossentropy, then the loss will not be zero. + scce = losses.SparseCategoricalCrossentropy() + self.assertAllClose(scce(y_true, y_pred_16).numpy(), 0.0, atol=1e-3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_crossentropy_loss(self): + target = backend.variable(np.random.randint(0, 1, (5, 1))) + logits = backend.variable(np.random.random((5, 1))) + sigmoid_output = backend.sigmoid(logits) + output_from_logit = losses.binary_crossentropy( + target, logits, from_logits=True + ) + output_from_sigmoid = losses.binary_crossentropy(target, sigmoid_output) + np.testing.assert_allclose( + backend.eval(output_from_logit), + backend.eval(output_from_sigmoid), + atol=1e-5, + ) + + axis = 0 + output_from_logit_axis = losses.binary_crossentropy( + target, logits, from_logits=True, axis=axis + ) + output_from_sigmoid_axis = losses.binary_crossentropy( + target, sigmoid_output, axis=axis + ) + + np.testing.assert_allclose( + backend.eval(output_from_logit_axis), + backend.eval(output_from_sigmoid_axis), + atol=1e-5, + ) + + def test_get_bce(self): + bce_fn = losses.get("bce") + self.assertEqual(bce_fn, losses.binary_crossentropy) + + def test_serialization(self): + fn = losses.get("mse") + config = losses.serialize(fn) + new_fn = losses.deserialize(config) + self.assertEqual(fn, new_fn) + + def test_categorical_hinge(self): + y_pred = backend.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) + y_true = backend.variable(np.array([[0, 1, 0], [1, 0, 0]])) + expected_loss = ((0.3 - 0.2 + 1) + (0.7 - 0.1 + 1)) / 2.0 + loss = backend.eval(losses.categorical_hinge(y_true, y_pred)) + self.assertAllClose(expected_loss, np.mean(loss)) + + def test_loss_wrapper(self): + loss_fn = losses.get("mse") + mse_obj = losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__) + + self.assertEqual(mse_obj.name, "mean_squared_error") + self.assertEqual(mse_obj.reduction, losses_utils.ReductionV2.AUTO) + + y_true = tf.constant([[1.0, 
9.0], [2.0, 5.0]]) + y_pred = tf.constant([[4.0, 8.0], [12.0, 3.0]]) + sample_weight = tf.constant([1.2, 0.5]) + loss = mse_obj(y_true, y_pred, sample_weight=sample_weight) + + # mse = [((4 - 1)^2 + (8 - 9)^2) / 2, ((12 - 2)^2 + (3 - 5)^2) / 2] + # mse = [5, 52] + # weighted_mse = [5 * 1.2, 52 * 0.5] = [6, 26] + # reduced_weighted_mse = (6 + 26) / 2 = 16 + self.assertAllClose(self.evaluate(loss), 16, 1e-2) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_loss_wrapper_autograph(self): + # Test that functions with control flow wrapped in a LossFunctionWrapper + # get autographed when in a tf.function + def loss_fn(y_true, y_pred): + mse_loss_fn = losses.get("mse") + if tf.reduce_mean(y_true) > 0: + return mse_loss_fn(y_true, y_pred) + else: + return mse_loss_fn(y_true, y_pred) + + mse_obj = losses.LossFunctionWrapper(loss_fn) + + y_true = tf.constant([[1.0, 9.0], [2.0, 5.0]]) + y_pred = tf.constant([[4.0, 8.0], [12.0, 3.0]]) + sample_weight = tf.constant([1.2, 0.5]) + + @tf.function + def tf_functioned_loss_fn(y_true, y_pred, sample_weight=None): + return mse_obj(y_true, y_pred, sample_weight=sample_weight) + + loss = tf_functioned_loss_fn( + y_true, y_pred, sample_weight=sample_weight + ) + + # mse = [((4 - 1)^2 + (8 - 9)^2) / 2, ((12 - 2)^2 + (3 - 5)^2) / 2] + # mse = [5, 52] + # weighted_mse = [5 * 1.2, 52 * 0.5] = [6, 26] + # reduced_weighted_mse = (6 + 26) / 2 = 16 + self.assertAllClose(self.evaluate(loss), 16, 1e-2) + + def test_loss_wrapper_dtype(self): + # Make sure the loss wrapper doesn't cause any numerical precision loss + # during calculation. See + # https://github.com/keras-team/keras/issues/15791 + x = tf.convert_to_tensor([[2.1]], dtype=tf.float64) + y_true = tf.square(x) + y_pred = tf.convert_to_tensor([[3.68]], dtype=tf.float64) + + # TF loss + loss = losses.MeanSquaredError() + tf_loss = loss(y_pred, y_true) + + # manually computed loss in 64-bit + man_loss64 = tf.squeeze(tf.square(y_pred - y_true)) + + self.assertEqual(tf_loss.dtype, tf.float64) + # Make a smaller atol to ensure the float64 precision holds. + self.assertAllClose( + self.evaluate(tf_loss), self.evaluate(man_loss64), atol=1e-8 + ) + + def test_invalid_reduction(self): + with self.assertRaisesRegex(ValueError, "Invalid Reduction Key: Foo."): + losses.MeanSquaredError(reduction="Foo") + + mse_obj = losses.MeanSquaredError() + y = tf.constant([1]) + mse_obj.reduction = "Bar" + with self.assertRaisesRegex(ValueError, "Invalid Reduction Key: Bar."): + mse_obj(y, y) + + def test_deserialization_error(self): + with self.assertRaisesRegex(ValueError, "Could not interpret loss"): + losses.get(0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_binary_crossentropy_uses_cached_logits(self): + logits = tf.constant([[-30.0, 30.0]]) + y_pred = activations.sigmoid(logits) + self.assertTrue(hasattr(y_pred, "_keras_logits")) + y_true = tf.constant([[0.0, 1.0]]) + loss = losses.binary_crossentropy(y_true, y_pred)[0] + # Check that logits are used. If y_pred is used directly, loss will + # collapse to 0 from underflow. 
+ self.assertNotEqual(self.evaluate(loss), 0.0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_categorical_crossentropy_uses_cached_logits(self): + logits = tf.constant([[-5.0, 0.0, 5.0]]) + y_pred = activations.softmax(logits) + self.assertTrue(hasattr(y_pred, "_keras_logits")) + y_true = tf.constant([[0.0, 0.0, 1.0]]) + loss = losses.categorical_crossentropy( + y_true, logits, from_logits=True + )[0] + # Check that logits are used. If y_pred is used directly, loss will + # collapse to 0 from underflow. + self.assertNotEqual(self.evaluate(loss), 0.0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sparse_categorical_crossentropy_uses_cached_logits(self): + logits = tf.constant([[-5.0, 0.0, 5.0]]) + y_pred = activations.softmax(logits) + self.assertTrue(hasattr(y_pred, "_keras_logits")) + y_true = tf.constant([2]) + loss = losses.sparse_categorical_crossentropy( + y_true, logits, from_logits=True + )[0] + # Check that logits are used. If y_pred is used directly, loss will + # collapse to 0 from underflow. + self.assertNotEqual(self.evaluate(loss), 0.0) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_loss_not_autographed_in_eager(self): + class MyLoss(losses.Loss): + def call(self, y_true, y_pred): + return y_true - y_pred + + loss = MyLoss() + y_true = tf.constant([[0.0, 0.0, 0.0]]) + y_pred = tf.constant([[1.0, 1.0, 1.0]]) + + def tf_convert(fn, _): + assert False, "Function should not be autographed." + return fn + + with tf.compat.v1.test.mock.patch.object( + autograph, "tf_convert", tf_convert + ): + loss(y_true, y_pred) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class MeanSquaredErrorTest(tf.test.TestCase): - - def test_config(self): - mse_obj = losses.MeanSquaredError( - reduction=losses_utils.ReductionV2.SUM, name='mse_1') - self.assertEqual(mse_obj.name, 'mse_1') - self.assertEqual(mse_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_all_correct_unweighted(self): - mse_obj = losses.MeanSquaredError() - y_true = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3)) - loss = mse_obj(y_true, y_true) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - mse_obj = losses.MeanSquaredError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mse_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), 49.5, 3) - - def test_scalar_weighted(self): - mse_obj = losses.MeanSquaredError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mse_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 113.85, 3) - - def test_sample_weighted(self): - mse_obj = losses.MeanSquaredError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = mse_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 767.8 / 6, 3) - - def test_ragged_tensors(self): - mse_obj = losses.MeanSquaredError() - - y_true = tf.ragged.constant([[1., 1., 9.], [2., 5.]]) - y_pred = tf.ragged.constant([[4., 1., 8.], [12., 3.]]) - sample_weight = tf.constant([1.2, 0.5]) - loss = mse_obj(y_true, y_pred, 
sample_weight=sample_weight) - - # mse = [((4 - 1)^2 + (8 - 9)^2) / 3, ((12 - 2)^2 + (3 - 5)^2) / 2] - # mse = [3.(3), 52] - # weighted_mse = [3.(3) * 1.2, 52 * 0.5] = [4, 26] - # reduced_weighted_mse = (4 + 26) / 2 = - self.assertAllClose(self.evaluate(loss), 15, 1e-2) - - def test_timestep_weighted(self): - mse_obj = losses.MeanSquaredError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32) - sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) - loss = mse_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 587 / 6, 3) - - def test_zero_weighted(self): - mse_obj = losses.MeanSquaredError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mse_obj(y_true, y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_invalid_sample_weight(self): - mse_obj = losses.MeanSquaredError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1)) - sample_weight = tf.constant([3, 6, 5, 0], shape=(2, 2)) - with self.assertRaisesRegex((ValueError, tf.errors.InvalidArgumentError), - (r'Incompatible shapes: \[2,3\] vs. \[2,2\]|' - 'Dimensions must be equal')): - mse_obj(y_true, y_pred, sample_weight=sample_weight) - - def test_no_reduction(self): - mse_obj = losses.MeanSquaredError(reduction=losses_utils.ReductionV2.NONE) - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mse_obj(y_true, y_pred, sample_weight=2.3) - loss = self.evaluate(loss) - self.assertArrayNear(loss, [84.3333, 143.3666], 1e-3) - - def test_sum_reduction(self): - mse_obj = losses.MeanSquaredError(reduction=losses_utils.ReductionV2.SUM) - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mse_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 227.69998, 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + mse_obj = losses.MeanSquaredError( + reduction=losses_utils.ReductionV2.SUM, name="mse_1" + ) + self.assertEqual(mse_obj.name, "mse_1") + self.assertEqual(mse_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_all_correct_unweighted(self): + mse_obj = losses.MeanSquaredError() + y_true = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3)) + loss = mse_obj(y_true, y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + mse_obj = losses.MeanSquaredError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mse_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 49.5, 3) + + def test_scalar_weighted(self): + mse_obj = losses.MeanSquaredError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mse_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 113.85, 3) + + def test_sample_weighted(self): + mse_obj = losses.MeanSquaredError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), 
dtype=tf.float32 + ) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = mse_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 767.8 / 6, 3) + + def test_ragged_tensors(self): + mse_obj = losses.MeanSquaredError() + + y_true = tf.ragged.constant([[1.0, 1.0, 9.0], [2.0, 5.0]]) + y_pred = tf.ragged.constant([[4.0, 1.0, 8.0], [12.0, 3.0]]) + sample_weight = tf.constant([1.2, 0.5]) + loss = mse_obj(y_true, y_pred, sample_weight=sample_weight) + + # mse = [((4 - 1)^2 + (8 - 9)^2) / 3, ((12 - 2)^2 + (3 - 5)^2) / 2] + # mse = [3.(3), 52] + # weighted_mse = [3.(3) * 1.2, 52 * 0.5] = [4, 26] + # reduced_weighted_mse = (4 + 26) / 2 = 15 + self.assertAllClose(self.evaluate(loss), 15, 1e-2) + + def test_timestep_weighted(self): + mse_obj = losses.MeanSquaredError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32 + ) + sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) + loss = mse_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 587 / 6, 3) + + def test_zero_weighted(self): + mse_obj = losses.MeanSquaredError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mse_obj(y_true, y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_invalid_sample_weight(self): + mse_obj = losses.MeanSquaredError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) + y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1)) + sample_weight = tf.constant([3, 6, 5, 0], shape=(2, 2)) + with self.assertRaisesRegex( + (ValueError, tf.errors.InvalidArgumentError), + ( + r"Incompatible shapes: \[2,3\] vs.
\[2,2\]|" + "Dimensions must be equal" + ), + ): + mse_obj(y_true, y_pred, sample_weight=sample_weight) + + def test_no_reduction(self): + mse_obj = losses.MeanSquaredError( + reduction=losses_utils.ReductionV2.NONE + ) + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mse_obj(y_true, y_pred, sample_weight=2.3) + loss = self.evaluate(loss) + self.assertArrayNear(loss, [84.3333, 143.3666], 1e-3) + + def test_sum_reduction(self): + mse_obj = losses.MeanSquaredError( + reduction=losses_utils.ReductionV2.SUM + ) + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mse_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 227.69998, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class MeanAbsoluteErrorTest(tf.test.TestCase): - - def test_config(self): - mae_obj = losses.MeanAbsoluteError( - reduction=losses_utils.ReductionV2.SUM, name='mae_1') - self.assertEqual(mae_obj.name, 'mae_1') - self.assertEqual(mae_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_all_correct_unweighted(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3)) - loss = mae_obj(y_true, y_true) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mae_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), 5.5, 3) - - def test_scalar_weighted(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mae_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 12.65, 3) - - def test_sample_weighted(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = mae_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 81.4 / 6, 3) - - def test_timestep_weighted(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32) - sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) - loss = mae_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 83 / 6, 3) - - def test_zero_weighted(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mae_obj(y_true, y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_invalid_sample_weight(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1)) - sample_weight = tf.constant([3, 6, 5, 0], shape=(2, 2)) - with self.assertRaisesRegex((ValueError, tf.errors.InvalidArgumentError), - (r'Incompatible shapes: \[2,3\] vs. 
\[2,2\]|' - 'Dimensions must be equal')): - mae_obj(y_true, y_pred, sample_weight=sample_weight) - - def test_no_reduction(self): - mae_obj = losses.MeanAbsoluteError(reduction=losses_utils.ReductionV2.NONE) - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mae_obj(y_true, y_pred, sample_weight=2.3) - loss = self.evaluate(loss) - self.assertArrayNear(loss, [10.7333, 14.5666], 1e-3) - - def test_sum_reduction(self): - mae_obj = losses.MeanAbsoluteError(reduction=losses_utils.ReductionV2.SUM) - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mae_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 25.29999, 3) - - def test_ragged_tensor(self): - mae_obj = losses.MeanAbsoluteError() - y_true = tf.ragged.constant([[1, 9, 2], [-5, -2]], dtype=tf.float32) - y_pred = tf.ragged.constant([[4, 8, 12], [8, 1]], dtype=tf.float32) - # loss = [14/3, 16/2] - sample_weight = tf.constant([1.2, 1.0], shape=(2, 1)) - loss = mae_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 6.8, 5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + mae_obj = losses.MeanAbsoluteError( + reduction=losses_utils.ReductionV2.SUM, name="mae_1" + ) + self.assertEqual(mae_obj.name, "mae_1") + self.assertEqual(mae_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_all_correct_unweighted(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3)) + loss = mae_obj(y_true, y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mae_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 5.5, 3) + + def test_scalar_weighted(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mae_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 12.65, 3) + + def test_sample_weighted(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = mae_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 81.4 / 6, 3) + + def test_timestep_weighted(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32 + ) + sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) + loss = mae_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 83 / 6, 3) + + def test_zero_weighted(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mae_obj(y_true, y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def 
test_invalid_sample_weight(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) + y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1)) + sample_weight = tf.constant([3, 6, 5, 0], shape=(2, 2)) + with self.assertRaisesRegex( + (ValueError, tf.errors.InvalidArgumentError), + ( + r"Incompatible shapes: \[2,3\] vs. \[2,2\]|" + "Dimensions must be equal" + ), + ): + mae_obj(y_true, y_pred, sample_weight=sample_weight) + + def test_no_reduction(self): + mae_obj = losses.MeanAbsoluteError( + reduction=losses_utils.ReductionV2.NONE + ) + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mae_obj(y_true, y_pred, sample_weight=2.3) + loss = self.evaluate(loss) + self.assertArrayNear(loss, [10.7333, 14.5666], 1e-3) + + def test_sum_reduction(self): + mae_obj = losses.MeanAbsoluteError( + reduction=losses_utils.ReductionV2.SUM + ) + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mae_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 25.29999, 3) + + def test_ragged_tensor(self): + mae_obj = losses.MeanAbsoluteError() + y_true = tf.ragged.constant([[1, 9, 2], [-5, -2]], dtype=tf.float32) + y_pred = tf.ragged.constant([[4, 8, 12], [8, 1]], dtype=tf.float32) + # loss = [14/3, 16/2] + sample_weight = tf.constant([1.2, 1.0], shape=(2, 1)) + loss = mae_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 6.8, 5) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class MeanAbsolutePercentageErrorTest(tf.test.TestCase): - - def test_config(self): - mape_obj = losses.MeanAbsolutePercentageError( - reduction=losses_utils.ReductionV2.SUM, name='mape_1') - self.assertEqual(mape_obj.name, 'mape_1') - self.assertEqual(mape_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_all_correct_unweighted(self): - mape_obj = losses.MeanAbsolutePercentageError() - y_true = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mape_obj(y_true, y_true) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - mape_obj = losses.MeanAbsolutePercentageError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mape_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), 211.8518, 3) - - def test_scalar_weighted(self): - mape_obj = losses.MeanAbsolutePercentageError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mape_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 487.259, 3) - - def test_sample_weighted(self): - mape_obj = losses.MeanAbsolutePercentageError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = mape_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 422.8888, 3) - - def test_ragged_tensors(self): - mape_obj = losses.MeanAbsolutePercentageError() - y_true = tf.ragged.constant([[1, 9, 2], [-5, -2]]) - y_pred = tf.ragged.constant([[4, 8, 12], [8, 1]], dtype=tf.float32) - 
sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = mape_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 510.7222, 3) - - def test_timestep_weighted(self): - mape_obj = losses.MeanAbsolutePercentageError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32) - sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) - loss = mape_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 694.4445, 3) - - def test_zero_weighted(self): - mape_obj = losses.MeanAbsolutePercentageError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mape_obj(y_true, y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_no_reduction(self): - mape_obj = losses.MeanAbsolutePercentageError( - reduction=losses_utils.ReductionV2.NONE) - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = mape_obj(y_true, y_pred, sample_weight=2.3) - loss = self.evaluate(loss) - self.assertArrayNear(loss, [621.8518, 352.6666], 1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + mape_obj = losses.MeanAbsolutePercentageError( + reduction=losses_utils.ReductionV2.SUM, name="mape_1" + ) + self.assertEqual(mape_obj.name, "mape_1") + self.assertEqual(mape_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_all_correct_unweighted(self): + mape_obj = losses.MeanAbsolutePercentageError() + y_true = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mape_obj(y_true, y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + mape_obj = losses.MeanAbsolutePercentageError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mape_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 211.8518, 3) + + def test_scalar_weighted(self): + mape_obj = losses.MeanAbsolutePercentageError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mape_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 487.259, 3) + + def test_sample_weighted(self): + mape_obj = losses.MeanAbsolutePercentageError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = mape_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 422.8888, 3) + + def test_ragged_tensors(self): + mape_obj = losses.MeanAbsolutePercentageError() + y_true = tf.ragged.constant([[1, 9, 2], [-5, -2]]) + y_pred = tf.ragged.constant([[4, 8, 12], [8, 1]], dtype=tf.float32) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = mape_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 510.7222, 3) + + def test_timestep_weighted(self): + mape_obj = losses.MeanAbsolutePercentageError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], 
shape=(2, 3, 1), dtype=tf.float32 + ) + sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) + loss = mape_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 694.4445, 3) + + def test_zero_weighted(self): + mape_obj = losses.MeanAbsolutePercentageError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mape_obj(y_true, y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_no_reduction(self): + mape_obj = losses.MeanAbsolutePercentageError( + reduction=losses_utils.ReductionV2.NONE + ) + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = mape_obj(y_true, y_pred, sample_weight=2.3) + loss = self.evaluate(loss) + self.assertArrayNear(loss, [621.8518, 352.6666], 1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class MeanSquaredLogarithmicErrorTest(tf.test.TestCase): - - def test_config(self): - msle_obj = losses.MeanSquaredLogarithmicError( - reduction=losses_utils.ReductionV2.SUM, name='mape_1') - self.assertEqual(msle_obj.name, 'mape_1') - self.assertEqual(msle_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - msle_obj = losses.MeanSquaredLogarithmicError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = msle_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), 1.4370, 3) - - def test_scalar_weighted(self): - msle_obj = losses.MeanSquaredLogarithmicError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = msle_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 3.3051, 3) - - def test_sample_weighted(self): - msle_obj = losses.MeanSquaredLogarithmicError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = msle_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 3.7856, 3) - - def test_timestep_weighted(self): - msle_obj = losses.MeanSquaredLogarithmicError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32) - sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) - loss = msle_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 2.6473, 3) - - def test_zero_weighted(self): - msle_obj = losses.MeanSquaredLogarithmicError() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = msle_obj(y_true, y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_ragged_tensors(self): - msle_obj = losses.MeanSquaredLogarithmicError() - y_true = tf.ragged.constant([[1, 9, 2], [-5, -2]]) - # log(max(y_true, 0) + 1): [[0.69314, 2.3025, 1.0986], [0., 0.]] - y_pred = tf.ragged.constant([[4, 8, 12], [8, 1]], dtype=tf.float32) - # log(max(y_pred, 0) + 1): [[1.6094, 2.1972, 2.5649], [2.1972, 0.6932]] - # per batch loss: [1.0002, 2.6541] - sample_weight = tf.constant([1.2, 3.4], 
shape=(2, 1)) - loss = msle_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 5.1121, 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + msle_obj = losses.MeanSquaredLogarithmicError( + reduction=losses_utils.ReductionV2.SUM, name="mape_1" + ) + self.assertEqual(msle_obj.name, "mape_1") + self.assertEqual(msle_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + msle_obj = losses.MeanSquaredLogarithmicError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = msle_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 1.4370, 3) + + def test_scalar_weighted(self): + msle_obj = losses.MeanSquaredLogarithmicError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = msle_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 3.3051, 3) + + def test_sample_weighted(self): + msle_obj = losses.MeanSquaredLogarithmicError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = msle_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 3.7856, 3) + + def test_timestep_weighted(self): + msle_obj = losses.MeanSquaredLogarithmicError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32 + ) + sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) + loss = msle_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 2.6473, 3) + + def test_zero_weighted(self): + msle_obj = losses.MeanSquaredLogarithmicError() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = msle_obj(y_true, y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_ragged_tensors(self): + msle_obj = losses.MeanSquaredLogarithmicError() + y_true = tf.ragged.constant([[1, 9, 2], [-5, -2]]) + # log(max(y_true, 0) + 1): [[0.69314, 2.3025, 1.0986], [0., 0.]] + y_pred = tf.ragged.constant([[4, 8, 12], [8, 1]], dtype=tf.float32) + # log(max(y_pred, 0) + 1): [[1.6094, 2.1972, 2.5649], [2.1972, 0.6932]] + # per batch loss: [1.0002, 2.6541] + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = msle_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 5.1121, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CosineSimilarityTest(tf.test.TestCase): - - def l2_norm(self, x, axis): - epsilon = 1e-12 - square_sum = np.sum(np.square(x), axis=axis, keepdims=True) - x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon)) - return np.multiply(x, x_inv_norm) - - def setup(self, axis=1): - self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32) - self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32) - - y_true = self.l2_norm(self.np_y_true, axis) - y_pred = self.l2_norm(self.np_y_pred, axis) - self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,)) - - self.y_true = 
tf.constant(self.np_y_true) - self.y_pred = tf.constant(self.np_y_pred) - - def test_config(self): - cosine_obj = losses.CosineSimilarity( - axis=2, reduction=losses_utils.ReductionV2.SUM, name='cosine_loss') - self.assertEqual(cosine_obj.name, 'cosine_loss') - self.assertEqual(cosine_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - self.setup() - cosine_obj = losses.CosineSimilarity() - loss = cosine_obj(self.y_true, self.y_pred) - expected_loss = -np.mean(self.expected_loss) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_scalar_weighted(self): - self.setup() - cosine_obj = losses.CosineSimilarity() - sample_weight = 2.3 - loss = cosine_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - expected_loss = -np.mean(self.expected_loss * sample_weight) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_sample_weighted(self): - self.setup() - cosine_obj = losses.CosineSimilarity() - sample_weight = np.asarray([1.2, 3.4]) - loss = cosine_obj( - self.y_true, self.y_pred, sample_weight=tf.constant(sample_weight)) - expected_loss = -np.mean(self.expected_loss * sample_weight) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_timestep_weighted(self): - self.setup() - cosine_obj = losses.CosineSimilarity() - np_y_true = self.np_y_true.reshape((2, 3, 1)) - np_y_pred = self.np_y_pred.reshape((2, 3, 1)) - sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape((2, 3)) - - y_true = self.l2_norm(np_y_true, 2) - y_pred = self.l2_norm(np_y_pred, 2) - expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(2,)) - - y_true = tf.constant(np_y_true) - y_pred = tf.constant(np_y_pred) - loss = cosine_obj(y_true, y_pred, sample_weight=tf.constant(sample_weight)) - - expected_loss = -np.mean(expected_loss * sample_weight) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_zero_weighted(self): - self.setup() - cosine_obj = losses.CosineSimilarity() - loss = cosine_obj(self.y_true, self.y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0., 3) - - def test_axis(self): - self.setup(axis=1) - cosine_obj = losses.CosineSimilarity(axis=1) - loss = cosine_obj(self.y_true, self.y_pred) - expected_loss = -np.mean(self.expected_loss) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def l2_norm(self, x, axis): + epsilon = 1e-12 + square_sum = np.sum(np.square(x), axis=axis, keepdims=True) + x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon)) + return np.multiply(x, x_inv_norm) + + def setup(self, axis=1): + self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32) + self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32) + + y_true = self.l2_norm(self.np_y_true, axis) + y_pred = self.l2_norm(self.np_y_pred, axis) + self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,)) + + self.y_true = tf.constant(self.np_y_true) + self.y_pred = tf.constant(self.np_y_pred) + + def test_config(self): + cosine_obj = losses.CosineSimilarity( + axis=2, reduction=losses_utils.ReductionV2.SUM, name="cosine_loss" + ) + self.assertEqual(cosine_obj.name, "cosine_loss") + self.assertEqual(cosine_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + self.setup() + cosine_obj = losses.CosineSimilarity() + loss = cosine_obj(self.y_true, self.y_pred) + expected_loss = -np.mean(self.expected_loss) + 
self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_scalar_weighted(self): + self.setup() + cosine_obj = losses.CosineSimilarity() + sample_weight = 2.3 + loss = cosine_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + expected_loss = -np.mean(self.expected_loss * sample_weight) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_sample_weighted(self): + self.setup() + cosine_obj = losses.CosineSimilarity() + sample_weight = np.asarray([1.2, 3.4]) + loss = cosine_obj( + self.y_true, self.y_pred, sample_weight=tf.constant(sample_weight) + ) + expected_loss = -np.mean(self.expected_loss * sample_weight) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_timestep_weighted(self): + self.setup() + cosine_obj = losses.CosineSimilarity() + np_y_true = self.np_y_true.reshape((2, 3, 1)) + np_y_pred = self.np_y_pred.reshape((2, 3, 1)) + sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape((2, 3)) + + y_true = self.l2_norm(np_y_true, 2) + y_pred = self.l2_norm(np_y_pred, 2) + expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(2,)) + + y_true = tf.constant(np_y_true) + y_pred = tf.constant(np_y_pred) + loss = cosine_obj( + y_true, y_pred, sample_weight=tf.constant(sample_weight) + ) + + expected_loss = -np.mean(expected_loss * sample_weight) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_zero_weighted(self): + self.setup() + cosine_obj = losses.CosineSimilarity() + loss = cosine_obj(self.y_true, self.y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_axis(self): + self.setup(axis=1) + cosine_obj = losses.CosineSimilarity(axis=1) + loss = cosine_obj(self.y_true, self.y_pred) + expected_loss = -np.mean(self.expected_loss) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class BinaryCrossentropyTest(tf.test.TestCase): - - def test_config(self): - bce_obj = losses.BinaryCrossentropy( - reduction=losses_utils.ReductionV2.SUM, name='bce_1') - self.assertEqual(bce_obj.name, 'bce_1') - self.assertEqual(bce_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_all_correct_unweighted(self): - y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=tf.float32) - bce_obj = losses.BinaryCrossentropy() - loss = bce_obj(y_true, y_true) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - # Test with logits. - logits = tf.constant([[100.0, -100.0, -100.0], [-100.0, 100.0, -100.0], - [-100.0, -100.0, 100.0]]) - bce_obj = losses.BinaryCrossentropy(from_logits=True) - loss = bce_obj(y_true, logits) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) - bce_obj = losses.BinaryCrossentropy() - loss = bce_obj(y_true, y_pred) - - # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] - - # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) - # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), - # -log(Y_MAX + EPSILON), -log(1)] - # = [0, 15.33, 0, 0] - # Reduced loss = 15.33 / 4 - - self.assertAlmostEqual(self.evaluate(loss), 3.833, 3) - - # Test with logits. 
- y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) - logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) - bce_obj = losses.BinaryCrossentropy(from_logits=True) - loss = bce_obj(y_true, logits) - - # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # = [((100 - 100 * 1 + log(1 + exp(-100))) + - # (0 + 100 * 0 + log(1 + exp(-100))) + - # (100 - 100 * 1 + log(1 + exp(-100))), - # ((100 - 100 * 0 + log(1 + exp(-100))) + - # (100 - 100 * 1 + log(1 + exp(-100))) + - # (0 + 100 * 1 + log(1 + exp(-100))))] - # = [(0 + 0 + 0) / 3, 200 / 3] - # Reduced loss = (0 + 66.666) / 2 - - self.assertAlmostEqual(self.evaluate(loss), 33.333, 3) - - def test_scalar_weighted(self): - bce_obj = losses.BinaryCrossentropy() - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) - loss = bce_obj(y_true, y_pred, sample_weight=2.3) - - # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] - - # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) - # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), - # -log(Y_MAX + EPSILON), -log(1)] - # = [0, 15.33, 0, 0] - # Weighted loss = [0, 15.33 * 2.3, 0, 0] - # Reduced loss = 15.33 * 2.3 / 4 - - self.assertAlmostEqual(self.evaluate(loss), 8.817, 3) - - # Test with logits. - y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) - logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) - bce_obj = losses.BinaryCrossentropy(from_logits=True) - loss = bce_obj(y_true, logits, sample_weight=2.3) - - # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # Loss = [(0 + 0 + 0) / 3, 200 / 3] - # Weighted loss = [0 * 2.3, 66.666 * 2.3] - # Reduced loss = (0 + 66.666 * 2.3) / 2 - - self.assertAlmostEqual(self.evaluate(loss), 76.667, 3) - - def test_sample_weighted(self): - bce_obj = losses.BinaryCrossentropy() - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = bce_obj(y_true, y_pred, sample_weight=sample_weight) - - # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] - - # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) - # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), - # -log(Y_MAX + EPSILON), -log(1)] - # = [0, 15.33, 0, 0] - # Reduced loss = 15.33 * 1.2 / 4 - - self.assertAlmostEqual(self.evaluate(loss), 4.6, 3) - - # Test with logits. 
- y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) - logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) - weights = tf.constant([4, 3]) - bce_obj = losses.BinaryCrossentropy(from_logits=True) - loss = bce_obj(y_true, logits, sample_weight=weights) - - # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # Loss = [(0 + 0 + 0)/3, 200 / 3] - # Weighted loss = [0 * 4, 66.666 * 3] - # Reduced loss = (0 + 66.666 * 3) / 2 - - self.assertAlmostEqual(self.evaluate(loss), 100, 3) - - def test_no_reduction(self): - y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) - logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) - bce_obj = losses.BinaryCrossentropy( - from_logits=True, reduction=losses_utils.ReductionV2.NONE) - loss = bce_obj(y_true, logits) - - # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # Loss = [(0 + 0 + 0)/3, (200)/3] - - self.assertAllClose((0., 66.6666), self.evaluate(loss), 3) - - def test_label_smoothing(self): - logits = tf.constant([[100.0, -100.0, -100.0]]) - y_true = tf.constant([[1, 0, 1]]) - label_smoothing = 0.1 - # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # Label smoothing: z' = z * (1 - L) + 0.5L - # 1 = 1 - 0.5L - # 0 = 0.5L - # Applying the above two fns to the given input: - # (100 - 100 * (1 - 0.5 L) + 0 + - # 0 + 100 * (0.5 L) + 0 + - # 0 + 100 * (1 - 0.5 L) + 0) * (1/3) - # = (100 + 50L) * 1/3 - bce_obj = losses.BinaryCrossentropy( - from_logits=True, label_smoothing=label_smoothing) - loss = bce_obj(y_true, logits) - expected_value = (100.0 + 50.0 * label_smoothing) / 3.0 - self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) - - def test_label_smoothing_ndarray(self): - logits = np.asarray([[100.0, -100.0, -100.0]]) - y_true = np.asarray([[1, 0, 1]]) - label_smoothing = 0.1 - # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # Label smoothing: z' = z * (1 - L) + 0.5L - # 1 = 1 - 0.5L - # 0 = 0.5L - # Applying the above two fns to the given input: - # (100 - 100 * (1 - 0.5 L) + 0 + - # 0 + 100 * (0.5 L) + 0 + - # 0 + 100 * (1 - 0.5 L) + 0) * (1/3) - # = (100 + 50L) * 1/3 - bce_obj = losses.BinaryCrossentropy( - from_logits=True, label_smoothing=label_smoothing) - loss = bce_obj(y_true, logits) - expected_value = (100.0 + 50.0 * label_smoothing) / 3.0 - self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) - - def test_ragged_tensors(self): - bce_obj = losses.BinaryCrossentropy() - y_true = tf.ragged.constant([[1, 0, 1], [0]]) - y_pred = tf.ragged.constant([[1, 1, 1], [0]], dtype=tf.float32) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = bce_obj(y_true, y_pred, sample_weight=sample_weight) - - # per batch loss = [ sum([0, 15.33, 0]) / 3, 0. ] - # = [ 5.11, 0] - # Reduced loss = 5.11 * 1.2 / 2 - - self.assertAlmostEqual(self.evaluate(loss), 3.0666, 3) - - # Test with logits. 
- y_true = tf.ragged.constant([[1, 0, 1], [0, 1]]) - logits = tf.ragged.constant([[100.0, -100.0, 100.0], [100.0, 100.0]]) - weights = tf.constant([4, 3]) - bce_obj = losses.BinaryCrossentropy(from_logits=True) - loss = bce_obj(y_true, logits, sample_weight=weights) - - # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # Loss = [(0 + 0 + 0)/3, 100 / 2] - # Weighted loss = [0 * 4, 50 * 3] - # Reduced loss = (0 + 50 * 3) / 2 - - self.assertAlmostEqual(self.evaluate(loss), 75., 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + bce_obj = losses.BinaryCrossentropy( + reduction=losses_utils.ReductionV2.SUM, name="bce_1" + ) + self.assertEqual(bce_obj.name, "bce_1") + self.assertEqual(bce_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_all_correct_unweighted(self): + y_true = tf.constant( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=tf.float32 + ) + bce_obj = losses.BinaryCrossentropy() + loss = bce_obj(y_true, y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + # Test with logits. + logits = tf.constant( + [ + [100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0], + ] + ) + bce_obj = losses.BinaryCrossentropy(from_logits=True) + loss = bce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) + bce_obj = losses.BinaryCrossentropy() + loss = bce_obj(y_true, y_pred) + + # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 + # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] + + # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) + # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), + # -log(Y_MAX + EPSILON), -log(1)] + # = [0, 15.33, 0, 0] + # Reduced loss = 15.33 / 4 + + self.assertAlmostEqual(self.evaluate(loss), 3.833, 3) + + # Test with logits. + y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) + logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) + bce_obj = losses.BinaryCrossentropy(from_logits=True) + loss = bce_obj(y_true, logits) + + # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # = [((100 - 100 * 1 + log(1 + exp(-100))) + + # (0 + 100 * 0 + log(1 + exp(-100))) + + # (100 - 100 * 1 + log(1 + exp(-100))), + # ((100 - 100 * 0 + log(1 + exp(-100))) + + # (100 - 100 * 1 + log(1 + exp(-100))) + + # (0 + 100 * 1 + log(1 + exp(-100))))] + # = [(0 + 0 + 0) / 3, 200 / 3] + # Reduced loss = (0 + 66.666) / 2 + + self.assertAlmostEqual(self.evaluate(loss), 33.333, 3) + + def test_scalar_weighted(self): + bce_obj = losses.BinaryCrossentropy() + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) + loss = bce_obj(y_true, y_pred, sample_weight=2.3) + + # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 + # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] + + # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) + # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), + # -log(Y_MAX + EPSILON), -log(1)] + # = [0, 15.33, 0, 0] + # Weighted loss = [0, 15.33 * 2.3, 0, 0] + # Reduced loss = 15.33 * 2.3 / 4 + + self.assertAlmostEqual(self.evaluate(loss), 8.817, 3) + + # Test with logits. 
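+ # (The closed form in the comments below is the numerically stable sigmoid + # cross-entropy, max(x, 0) - x * z + log(1 + exp(-abs(x))); with + # from_logits=True Keras delegates to + # tf.nn.sigmoid_cross_entropy_with_logits, which computes this expression + # directly.)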
+ y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) + logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) + bce_obj = losses.BinaryCrossentropy(from_logits=True) + loss = bce_obj(y_true, logits, sample_weight=2.3) + + # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # Loss = [(0 + 0 + 0) / 3, 200 / 3] + # Weighted loss = [0 * 2.3, 66.666 * 2.3] + # Reduced loss = (0 + 66.666 * 2.3) / 2 + + self.assertAlmostEqual(self.evaluate(loss), 76.667, 3) + + def test_sample_weighted(self): + bce_obj = losses.BinaryCrossentropy() + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = bce_obj(y_true, y_pred, sample_weight=sample_weight) + + # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 + # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] + + # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) + # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), + # -log(Y_MAX + EPSILON), -log(1)] + # = [0, 15.33, 0, 0] + # Reduced loss = 15.33 * 1.2 / 4 + + self.assertAlmostEqual(self.evaluate(loss), 4.6, 3) + + # Test with logits. + y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) + logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) + weights = tf.constant([4, 3]) + bce_obj = losses.BinaryCrossentropy(from_logits=True) + loss = bce_obj(y_true, logits, sample_weight=weights) + + # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # Loss = [(0 + 0 + 0)/3, 200 / 3] + # Weighted loss = [0 * 4, 66.666 * 3] + # Reduced loss = (0 + 66.666 * 3) / 2 + + self.assertAlmostEqual(self.evaluate(loss), 100, 3) + + def test_no_reduction(self): + y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) + logits = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) + bce_obj = losses.BinaryCrossentropy( + from_logits=True, reduction=losses_utils.ReductionV2.NONE + ) + loss = bce_obj(y_true, logits) + + # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # Loss = [(0 + 0 + 0)/3, (200)/3] + + self.assertAllClose((0.0, 66.6666), self.evaluate(loss), 3) + + def test_label_smoothing(self): + logits = tf.constant([[100.0, -100.0, -100.0]]) + y_true = tf.constant([[1, 0, 1]]) + label_smoothing = 0.1 + # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # Label smoothing: z' = z * (1 - L) + 0.5L + # 1 = 1 - 0.5L + # 0 = 0.5L + # Applying the above two fns to the given input: + # (100 - 100 * (1 - 0.5 L) + 0 + + # 0 + 100 * (0.5 L) + 0 + + # 0 + 100 * (1 - 0.5 L) + 0) * (1/3) + # = (100 + 50L) * 1/3 + bce_obj = losses.BinaryCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + loss = bce_obj(y_true, logits) + expected_value = (100.0 + 50.0 * label_smoothing) / 3.0 + self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) + + def test_label_smoothing_ndarray(self): + logits = np.asarray([[100.0, -100.0, -100.0]]) + y_true = np.asarray([[1, 0, 1]]) + label_smoothing = 0.1 + # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # Label smoothing: z' = z * (1 - L) + 0.5L + # 1 = 1 - 0.5L + # 0 = 0.5L + # Applying the above two fns to the given input: + # (100 - 100 * (1 - 0.5 L) + 0 + + # 0 + 100 * (0.5 L) + 0 + + # 0 + 100 * (1 - 0.5 L) + 0) * (1/3) + # = (100 + 50L) * 1/3 + bce_obj = 
losses.BinaryCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + loss = bce_obj(y_true, logits) + expected_value = (100.0 + 50.0 * label_smoothing) / 3.0 + self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) + + def test_ragged_tensors(self): + bce_obj = losses.BinaryCrossentropy() + y_true = tf.ragged.constant([[1, 0, 1], [0]]) + y_pred = tf.ragged.constant([[1, 1, 1], [0]], dtype=tf.float32) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = bce_obj(y_true, y_pred, sample_weight=sample_weight) + + # per batch loss = [ sum([0, 15.33, 0]) / 3, 0. ] + # = [ 5.11, 0] + # Reduced loss = 5.11 * 1.2 / 2 + + self.assertAlmostEqual(self.evaluate(loss), 3.0666, 3) + + # Test with logits. + y_true = tf.ragged.constant([[1, 0, 1], [0, 1]]) + logits = tf.ragged.constant([[100.0, -100.0, 100.0], [100.0, 100.0]]) + weights = tf.constant([4, 3]) + bce_obj = losses.BinaryCrossentropy(from_logits=True) + loss = bce_obj(y_true, logits, sample_weight=weights) + + # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # Loss = [(0 + 0 + 0)/3, 100 / 2] + # Weighted loss = [0 * 4, 50 * 3] + # Reduced loss = (0 + 50 * 3) / 2 + + self.assertAlmostEqual(self.evaluate(loss), 75.0, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class BinaryFocalCrossentropyTest(tf.test.TestCase): - - def test_config(self): - obj = losses.BinaryFocalCrossentropy(gamma=1.5, name='bfce_0') - self.assertEqual(obj.name, 'bfce_0') - self.assertAlmostEqual(obj.gamma, 1.5) - - obj_2 = losses.BinaryFocalCrossentropy.from_config(obj.get_config()) - self.assertEqual(obj_2.name, 'bfce_0') - self.assertAlmostEqual(obj_2.gamma, 1.5) - - def test_all_correct_unweighted(self): - y_true = tf.constant([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1], - ], dtype=tf.float32) - obj = losses.BinaryFocalCrossentropy(gamma=1.5) - loss = obj(y_true, y_true) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - # Test with logits. - logits = tf.constant([ - [100.0, -100.0, -100.0], - [-100.0, 100.0, -100.0], - [-100.0, -100.0, 100.0], - ]) - obj = losses.BinaryFocalCrossentropy(gamma=2.0, from_logits=True) - loss = obj(y_true, logits) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape([2, 2]) - obj = losses.BinaryFocalCrossentropy(gamma=2.0) - loss = obj(y_true, y_pred) - - # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, 0.8]] - # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] - - # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] - # focalLoss = focal bceLoss = [[0.001, 1.03], [0.032, 0.009]] - # Reduced loss = (0.001 + 1.03 + 0.032 + 0.009) / 4 = 0.268 - - self.assertAlmostEqual(self.evaluate(loss), 0.268, 3) - - # Test with logits. 
- y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) - logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) - obj = losses.BinaryFocalCrossentropy(gamma=3.0, from_logits=True) - loss = obj(y_true, logits) - - # sigmoidal = sigmoid(logits) - # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] - # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) - # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] - # focal = (1 - p_t) ** gamma - # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] - - # bceLoss = -log(p_t) - # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] - - # focalLoss = focal bceLoss - # = [[0.0012, 2.2743, 2.514], [0.0000002, 0.0033, 0.00000001]] - # Reduced loss = 0.799 - - self.assertAlmostEqual(self.evaluate(loss), 0.799, 3) - - def test_scalar_weighted(self): - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape([2, 2]) - obj = losses.BinaryFocalCrossentropy(gamma=2.0) - loss = obj(y_true, y_pred, sample_weight=1.23) - - # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, 0.8]] - # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] - - # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] * sample_weight - # focalLoss = focal bceLoss - # = [[0.001, 1.03], [0.032, 0.009]] * sample_weight - # Reduced loss = (0.001 + 1.03 + 0.032 + 0.009) * 1.23 / 4 = 0.3296 - - self.assertAlmostEqual(self.evaluate(loss), 0.3296, 3) - - # Test with logits. - y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) - logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) - obj = losses.BinaryFocalCrossentropy(gamma=3.0, from_logits=True) - loss = obj(y_true, logits, sample_weight=3.21) - - # sigmoidal = sigmoid(logits) - # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] - # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) - # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] - # focal = (1 - p_t) ** gamma - # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] - - # bceLoss = -log(p_t) * sample_weight - # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] * sample_weight - - # focalLoss = focal * bceLoss = - # [[0.0012, 2.2743, 2.514], [0.0000002, 0.0033, 0.00000001]] * sample_weight - # Reduced loss = 0.799 * 3.21 = 2.565 - - self.assertAlmostEqual(self.evaluate(loss), 2.565, 3) - - def test_sample_weighted(self): - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape([2, 2]) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - obj = losses.BinaryFocalCrossentropy(gamma=2.0) - loss = obj(y_true, y_pred, sample_weight=sample_weight) - - # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, 0.8]] - # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] - - # bceLoss = -log(p_t) * sample_weight - # = [[0.105, 1.609] ,[0.357, 0.223]] * sample_weight - # focalLoss = focal * bceLoss - # = [[0.001, 1.03], [0.032, 0.009]] * sample_weight - # = [[0.0012, 1.236], [0.1088, 0.0306]] - # Reduced loss = (0.0012 + 1.236 + 0.1088 + 0.0306) / 4 = 0.34415 - - self.assertAlmostEqual(self.evaluate(loss), 0.34415, 3) - - # Test with logits. 
- y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) - logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) - obj = losses.BinaryFocalCrossentropy(gamma=3.0, from_logits=True) - loss = obj(y_true, logits, sample_weight=sample_weight) - - # sigmoidal = sigmoid(logits) - # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] - # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) - # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] - # focal = (1 - p_t) ** gamma - # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] - - # bceLoss = -log(p_t) * sample_weight - # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] * sample_weight - - # focalLoss = focal * bceLoss = - # [[0.0012, 2.2743, 2.514], [0.0000002, 0.0033, 0.00000001]] * sample_weight - # focalLoss = [[0.00144, 2.72916, 3.0168], [6.8e-7, 0.01122, 3.4e-8]] - # Reduced loss = 0.799 - - self.assertAlmostEqual(self.evaluate(loss), 0.95977, 3) - - def test_no_reduction(self): - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape([2, 2]) - obj = losses.BinaryFocalCrossentropy( - gamma=2.0, - reduction=losses_utils.ReductionV2.NONE, - ) - loss = obj(y_true, y_pred) - - # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, 0.8]] - # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] - - # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] - # focalLoss = focal bceLoss = [[0.001, 1.03], [0.032, 0.009]] - # Reduced loss = [(0.001 + 1.03) / 2, (0.032 + 0.009) / 2] - - self.assertAllClose(self.evaluate(loss), (0.5155, 0.0205), 3) - - def test_ragged_tensors(self): - y_true = tf.ragged.constant([[1, 0, 1], [0]]) - y_pred = tf.ragged.constant([[0.9, 0.8, 0.7], [0.2]]) - obj = losses.BinaryFocalCrossentropy(gamma=2.0) - loss = obj(y_true, y_pred) - - # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2, 0.7], [0.8]] - # focal = (1 - p_t) ** gamma = [[0.01, 0.64, 0.09], [0.04]] - - # bceLoss = -log(p_t) = [[0.105, 1.609, 0.357], [0.223]] - # focalLoss = focal bceLoss = [[0.001, 1.03, 0.032], [0.009]] - # Reduced loss = ((0.001 + 1.03 + 0.032) / 3 + 0.009) / 2 = 0.18166 - - self.assertAlmostEqual(self.evaluate(loss), 0.18166, 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + obj = losses.BinaryFocalCrossentropy(gamma=1.5, name="bfce_0") + self.assertEqual(obj.name, "bfce_0") + self.assertAlmostEqual(obj.gamma, 1.5) + + obj_2 = losses.BinaryFocalCrossentropy.from_config(obj.get_config()) + self.assertEqual(obj_2.name, "bfce_0") + self.assertAlmostEqual(obj_2.gamma, 1.5) + + def test_all_correct_unweighted(self): + y_true = tf.constant( + [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + dtype=tf.float32, + ) + obj = losses.BinaryFocalCrossentropy(gamma=1.5) + loss = obj(y_true, y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + # Test with logits. 
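+ # (With saturated logits of +/-100 that agree with y_true, sigmoid(logits) + # matches the labels to within float precision, so p_t ~= 1 everywhere; the + # focal factor (1 - p_t) ** gamma and the underlying BCE term then both + # vanish, which is why the expected loss is 0 for any gamma.)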
+ logits = tf.constant( + [ + [100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0], + ] + ) + obj = losses.BinaryFocalCrossentropy(gamma=2.0, from_logits=True) + loss = obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + obj = losses.BinaryFocalCrossentropy(gamma=2.0) + loss = obj(y_true, y_pred) + + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], + # [0.7, 0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] + # focalLoss = focal bceLoss = [[0.001, 1.03], [0.032, 0.009]] + # Reduced loss = (0.001 + 1.03 + 0.032 + 0.009) / 4 = 0.268 + + self.assertAlmostEqual(self.evaluate(loss), 0.268, 3) + + # Test with logits. + y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) + logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) + obj = losses.BinaryFocalCrossentropy(gamma=3.0, from_logits=True) + loss = obj(y_true, logits) + + # sigmoidal = sigmoid(logits) + # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] + # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) + # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] + # focal = (1 - p_t) ** gamma + # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] + + # bceLoss = -log(p_t) + # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] + + # focalLoss = focal bceLoss + # = [[0.0012, 2.2743, 2.514], [0.0000002, 0.0033, 0.00000001]] + # Reduced loss = 0.799 + + self.assertAlmostEqual(self.evaluate(loss), 0.799, 3) + + def test_scalar_weighted(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + obj = losses.BinaryFocalCrossentropy(gamma=2.0) + loss = obj(y_true, y_pred, sample_weight=1.23) + + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], + # [0.7, 0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] * sample_weight + # focalLoss = focal bceLoss + # = [[0.001, 1.03], [0.032, 0.009]] * sample_weight + # Reduced loss = (0.001 + 1.03 + 0.032 + 0.009) * 1.23 / 4 = 0.3296 + + self.assertAlmostEqual(self.evaluate(loss), 0.3296, 3) + + # Test with logits. 
+ y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) + logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) + obj = losses.BinaryFocalCrossentropy(gamma=3.0, from_logits=True) + loss = obj(y_true, logits, sample_weight=3.21) + + # sigmoidal = sigmoid(logits) + # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] + # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) + # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] + # focal = (1 - p_t) ** gamma + # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] + + # bceLoss = -log(p_t) * sample_weight + # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] * + # sample_weight + + # focalLoss = focal * bceLoss = + # [[0.0012, 2.2743, 2.514], [0.0000002, 0.0033, 0.00000001]] * + # sample_weight + # Reduced loss = 0.799 * 3.21 = 2.565 + + self.assertAlmostEqual(self.evaluate(loss), 2.565, 3) + + def test_sample_weighted(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + obj = losses.BinaryFocalCrossentropy(gamma=2.0) + loss = obj(y_true, y_pred, sample_weight=sample_weight) + + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, + # 0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) * sample_weight + # = [[0.105, 1.609] ,[0.357, 0.223]] * sample_weight + # focalLoss = focal * bceLoss + # = [[0.001, 1.03], [0.032, 0.009]] * sample_weight + # = [[0.0012, 1.236], [0.1088, 0.0306]] + # Reduced loss = (0.0012 + 1.236 + 0.1088 + 0.0306) / 4 = 0.34415 + + self.assertAlmostEqual(self.evaluate(loss), 0.34415, 3) + + # Test with logits. + y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) + logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) + obj = losses.BinaryFocalCrossentropy(gamma=3.0, from_logits=True) + loss = obj(y_true, logits, sample_weight=sample_weight) + + # sigmoidal = sigmoid(logits) + # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] + # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) + # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] + # focal = (1 - p_t) ** gamma + # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] + + # bceLoss = -log(p_t) * sample_weight + # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] * + # sample_weight + + # focalLoss = focal * bceLoss = + # [[0.0012, 2.2743, 2.514], [0.0000002, 0.0033, 0.00000001]] * + # sample_weight + # focalLoss = [[0.00144, 2.72916, 3.0168], [6.8e-7, 0.01122, 3.4e-8]] + # Reduced loss = 5.7586 / 6 = 0.95977 + + self.assertAlmostEqual(self.evaluate(loss), 0.95977, 3) + + def test_no_reduction(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + obj = losses.BinaryFocalCrossentropy( + gamma=2.0, + reduction=losses_utils.ReductionV2.NONE, + ) + loss = obj(y_true, y_pred) + + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, + # 0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] + # focalLoss = focal bceLoss = [[0.001, 1.03], [0.032, 0.009]] + # Reduced loss = [(0.001 + 1.03) / 2, (0.032 + 0.009) / 2] + + self.assertAllClose(self.evaluate(loss), (0.5155, 0.0205), 3) + + def test_ragged_tensors(self): + y_true = tf.ragged.constant([[1, 0, 1], [0]]) + y_pred = tf.ragged.constant([[0.9, 0.8, 0.7], [0.2]]) + obj = 
losses.BinaryFocalCrossentropy(gamma=2.0) + loss = obj(y_true, y_pred) + + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2, 0.7], + # [0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64, 0.09], [0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609, 0.357], [0.223]] + # focalLoss = focal bceLoss = [[0.001, 1.03, 0.032], [0.009]] + # Reduced loss = ((0.001 + 1.03 + 0.032) / 3 + 0.009) / 2 = 0.18166 + + self.assertAlmostEqual(self.evaluate(loss), 0.18166, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class BinaryWeightedFocalCrossentropyTest(tf.test.TestCase): + def test_config(self): + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.1, + gamma=1.5, + name="bfce_0", + ) + self.assertTrue(obj.apply_class_balancing) + self.assertEqual(obj.name, "bfce_0") + self.assertAlmostEqual(obj.alpha, 0.1) + self.assertAlmostEqual(obj.gamma, 1.5) + + obj_2 = losses.BinaryFocalCrossentropy.from_config(obj.get_config()) + self.assertTrue(obj_2.apply_class_balancing) + self.assertEqual(obj_2.name, "bfce_0") + self.assertAlmostEqual(obj_2.alpha, 0.1) + self.assertAlmostEqual(obj_2.gamma, 1.5) + + def test_all_correct_unweighted(self): + y_true = tf.constant( + [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + dtype=tf.float32, + ) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, gamma=1.5 + ) + loss = obj(y_true, y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + # Test with logits. + logits = tf.constant( + [ + [100.0, -100.0, -100.0], + [-100.0, 100.0, -100.0], + [-100.0, -100.0, 100.0], + ] + ) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.3, + gamma=2.0, + from_logits=True, + ) + loss = obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.4, + gamma=2.0, + ) + loss = obj(y_true, y_pred) + + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, + # 0.8]] + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.4, 0.6], [0.4, 0.6]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] + # weightedfocalLoss = alpha_weight focal bceLoss + # = [[0.0004, 0.618], [0.0128, 0.0054]] + # Reduced loss = (0.0004 + 0.618 + 0.0128 + 0.0054) / 4 = 0.15915 + + self.assertAlmostEqual(self.evaluate(loss), 0.15915, 3) + + # Test with logits. 
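The apply_class_balancing variant only adds an alpha factor to the same product, so the 0.15915 expectation above can be checked the same way (standalone sketch, values copied from test_unweighted):

import numpy as np

y_true = np.array([[1.0, 0.0], [1.0, 0.0]])
y_pred = np.array([[0.9, 0.8], [0.7, 0.2]])
alpha, gamma = 0.4, 2.0

alpha_weight = alpha * y_true + (1 - alpha) * (1 - y_true)  # [[0.4, 0.6], [0.4, 0.6]]
p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
loss = alpha_weight * (1 - p_t) ** gamma * -np.log(p_t)
print(loss.mean())  # ~0.15915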
+ y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) + logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.3, + gamma=3.0, + from_logits=True, + ) + loss = obj(y_true, logits) + + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.3, 0.3, 0.7], [0.7, 0.3, 0.7]] + # sigmoidal = sigmoid(logits) + # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] + # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) + # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] + # focal = (1 - p_t) ** gamma + # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] + + # bceLoss = -log(p_t) + # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] + + # weightedfocalLoss = alpha_weight focal bceLoss + # = [[0.00036, 0.68229, 1.7598], [0.00000014, 0.00099, 0.000000007]] + # Reduced loss = 0.40724 + + self.assertAlmostEqual(self.evaluate(loss), 0.40724, 3) + + def test_scalar_weighted(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.6, + gamma=2.0, + ) + loss = obj(y_true, y_pred, sample_weight=1.23) + + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.6, 0.4], [0.6, 0.4]] + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, + # 0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] * sample_weight + # weightedfocalLoss = alpha_weight focal bceLoss + # = [[0.0006, 0.412], [0.0192, 0.0036]] * sample_weight + # Reduced loss = (0.0006 + 0.412 + 0.0192 + 0.0036) * 1.23 / 4 = 0.13388 + + self.assertAlmostEqual(self.evaluate(loss), 0.13388, 3) + + # Test with logits. 
+ y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) + logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.2, + gamma=3.0, + from_logits=True, + ) + loss = obj(y_true, logits, sample_weight=3.21) + + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.2, 0.2, 0.8], [0.8, 0.2, 0.8]] + # sigmoidal = sigmoid(logits) + # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] + # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) + # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] + # focal = (1 - p_t) ** gamma + # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] + + # bceLoss = -log(p_t) * sample_weight + # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] * + # sample_weight + + # weightedfocalLoss = alpha_weight * focal * bceLoss = + # [[0.00024, 0.45486, 2.0112], [0.00000016, 0.00066, 0.000000008]] * + # 3.21 + # Reduced loss = 0.41116 * 3.21 = 1.32 + + self.assertAlmostEqual(self.evaluate(loss), 1.32, 3) + + def test_sample_weighted(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.1, + gamma=2.0, + ) + loss = obj(y_true, y_pred, sample_weight=sample_weight) + + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.1, 0.9], [0.1, 0.9]] + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, + # 0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) * sample_weight + # = [[0.105, 1.609] ,[0.357, 0.223]] * sample_weight + # focalLoss = alpha_weight * focal * bceLoss + # = [[0.0001, 0.927], [0.0032, 0.0081]] * sample_weight + # = [[0.00012, 1.1124], [0.01088, 0.02754]] + # Reduced loss = (0.00012 + 1.1124 + 0.01088 + 0.02754) / 4 = 0.2877 + + self.assertAlmostEqual(self.evaluate(loss), 0.2877, 3) + + # Test with logits. 
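A sample_weight of shape (2, 1) is broadcast across the class axis before the mean, which is why the reduction above still divides by 4. A standalone sketch of the 0.2877 case:

import numpy as np

y_true = np.array([[1.0, 0.0], [1.0, 0.0]])
y_pred = np.array([[0.9, 0.8], [0.7, 0.2]])
alpha, gamma = 0.1, 2.0
sample_weight = np.array([[1.2], [3.4]])  # one weight per row, broadcast per class

alpha_weight = alpha * y_true + (1 - alpha) * (1 - y_true)
p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
per_element = alpha_weight * (1 - p_t) ** gamma * -np.log(p_t)
print((per_element * sample_weight).mean())  # ~0.2877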
+ y_true = tf.constant([[1, 1, 0], [0, 1, 0]], dtype=tf.float32) + logits = tf.constant([[1.5, -2.7, 2.9], [-3.8, 1.2, -4.5]]) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.2, + gamma=3.0, + from_logits=True, + ) + loss = obj(y_true, logits, sample_weight=sample_weight) + + # sigmoidal = sigmoid(logits) + # = [[0.8176, 0.063, 0.9478], [0.0219, 0.7685, 0.011]] + # p_t = y_true sigmoidal + (1 - y_true) (1 - sigmoidal) + # = [[0.8176, 0.063, 0.0522], [0.9781, 0.7685, 0.989]] + # focal = (1 - p_t) ** gamma + # = [[0.006, 0.823, 0.851], [0.00001, 0.0124, 0.000001]] + + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.2, 0.2, 0.8], [0.8, 0.2, 0.8]] + + # bceLoss = -log(p_t) * sample_weight + # = [[0.2014, 2.7646 , 2.9527], [0.0221, 0.2633, 0.01106]] * + # sample_weight + + # focalLoss = alpha_weight * focal * bceLoss = + # [[0.00024, 0.45486, 2.0112], [1.6e-7, 6.6e-4, 8e-9]] * sample_weight + # focalLoss = [[0.000288, 0.5458, 2.41344], [5.44e-7, 2.244e-3, + # 2.72e-8]] + # Reduced loss = 0.49366 + + self.assertAlmostEqual(self.evaluate(loss), 0.49366, 3) + + def test_no_reduction(self): + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([0.9, 0.8, 0.7, 0.2], dtype=np.float32).reshape( + [2, 2] + ) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.6, + gamma=2.0, + reduction=losses_utils.ReductionV2.NONE, + ) + loss = obj(y_true, y_pred) + + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.6, 0.4], [0.6, 0.4]] + + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2], [0.7, + # 0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64], [0.09, 0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609] ,[0.357, 0.223]] + # focalLoss = alpha_weight focal bceLoss + # = [[0.0006, 0.412], [0.0192, 0.0036]] + # Reduced loss = [(0.0006 + 0.412) / 2, (0.0192 + 0.0036) / 2] + + self.assertAllClose(self.evaluate(loss), (0.2063, 0.0114), 3) + + def test_ragged_tensors(self): + y_true = tf.ragged.constant([[1, 0, 1], [0]]) + y_pred = tf.ragged.constant([[0.9, 0.8, 0.7], [0.2]]) + obj = losses.BinaryFocalCrossentropy( + apply_class_balancing=True, + alpha=0.1, + gamma=2.0, + ) + loss = obj(y_true, y_pred) + + # alpha_weight = alpha y_true + (1 - alpha) (1 - y_true) + # = [[0.1, 0.9, 0.1], [0.9]] + # p_t = y_true y_pred + (1 - y_true) (1 - y_pred) = [[0.9, 0.2, 0.7], + # [0.8]] + # focal = (1 - p_t) ** gamma = [[0.01, 0.64, 0.09], [0.04]] + + # bceLoss = -log(p_t) = [[0.105, 1.609, 0.357], [0.223]] + # focalLoss = alpha_weight focal bceLoss + # = [[0.0001, 0.927, 0.0032], [0.0081]] + # Reduced loss = ((0.0001 + 0.927 + 0.0032) / 3 + 0.0081) / 2 = 0.1591 + + self.assertAlmostEqual(self.evaluate(loss), 0.1591, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CategoricalCrossentropyTest(tf.test.TestCase): - - def test_config(self): - cce_obj = losses.CategoricalCrossentropy( - reduction=losses_utils.ReductionV2.SUM, name='bce_1') - self.assertEqual(cce_obj.name, 'bce_1') - self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_all_correct_unweighted(self): - y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=tf.int64) - y_pred = tf.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], - dtype=tf.float32) - cce_obj = losses.CategoricalCrossentropy() - loss = cce_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - # Test with logits. 
- logits = tf.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]]) - cce_obj = losses.CategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - cce_obj = losses.CategoricalCrossentropy() - y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - y_pred = tf.constant([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], - dtype=tf.float32) - loss = cce_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), .3239, 3) - - # Test with logits. - logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.CategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits) - self.assertAlmostEqual(self.evaluate(loss), .0573, 3) - - def test_scalar_weighted(self): - cce_obj = losses.CategoricalCrossentropy() - y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - y_pred = tf.constant([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], - dtype=tf.float32) - loss = cce_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), .7449, 3) - - # Test with logits. - logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.CategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), .1317, 3) - - def test_sample_weighted(self): - cce_obj = losses.CategoricalCrossentropy() - y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - y_pred = tf.constant([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], - dtype=tf.float32) - sample_weight = tf.constant([[1.2], [3.4], [5.6]], shape=(3, 1)) - loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3) - - # Test with logits. 
- logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.CategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3) - - def test_no_reduction(self): - y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) - logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.CategoricalCrossentropy( - from_logits=True, reduction=losses_utils.ReductionV2.NONE) - loss = cce_obj(y_true, logits) - self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3) - - def test_label_smoothing(self): - logits = tf.constant([[100.0, -100.0, -100.0]]) - y_true = tf.constant([[1, 0, 0]]) - label_smoothing = 0.1 - # Softmax Cross Entropy Loss: -\sum_i p_i \log q_i - # where for a softmax activation - # \log q_i = x_i - \log \sum_j \exp x_j - # = x_i - x_max - \log \sum_j \exp (x_j - x_max) - # For our activations, [100, -100, -100] - # \log ( exp(0) + exp(-200) + exp(-200) ) = 0 - # so our log softmaxes become: [0, -200, -200] - # Label smoothing: z' = z * (1 - L) + L/n - # 1 = 1 - L + L/n - # 0 = L/n - # Applying the above two fns to the given input: - # -0 * (1 - L + L/n) + 200 * L/n + 200 * L/n = 400 L/n - cce_obj = losses.CategoricalCrossentropy( - from_logits=True, label_smoothing=label_smoothing) - loss = cce_obj(y_true, logits) - expected_value = 400.0 * label_smoothing / 3.0 - self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) - - def test_label_smoothing_ndarray(self): - logits = np.asarray([[100.0, -100.0, -100.0]]) - y_true = np.asarray([[1, 0, 0]]) - label_smoothing = 0.1 - # Softmax Cross Entropy Loss: -\sum_i p_i \log q_i - # where for a softmax activation - # \log q_i = x_i - \log \sum_j \exp x_j - # = x_i - x_max - \log \sum_j \exp (x_j - x_max) - # For our activations, [100, -100, -100] - # \log ( exp(0) + exp(-200) + exp(-200) ) = 0 - # so our log softmaxes become: [0, -200, -200] - # Label smoothing: z' = z * (1 - L) + L/n - # 1 = 1 - L + L/n - # 0 = L/n - # Applying the above two fns to the given input: - # -0 * (1 - L + L/n) + 200 * L/n + 200 * L/n = 400 L/n - cce_obj = losses.CategoricalCrossentropy( - from_logits=True, label_smoothing=label_smoothing) - loss = cce_obj(y_true, logits) - expected_value = 400.0 * label_smoothing / 3.0 - self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) - - def test_shape_mismatch(self): - y_true = tf.constant([[0], [1], [2]]) - y_pred = tf.constant([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]]) - - cce_obj = losses.CategoricalCrossentropy() - with self.assertRaisesRegex(ValueError, 'Shapes .+ are incompatible'): - cce_obj(y_true, y_pred) - - def test_ragged_tensors(self): - cce_obj = losses.CategoricalCrossentropy() - y_true = tf.ragged.constant([[[1, 0, 0], [0, 1, 0]], [[0, 0, 1]]]) - y_pred = tf.ragged.constant( - [[[.9, .05, .05], [.5, .89, .6]], [[.05, .01, .94]]], dtype=tf.float32) - # batch losses [[0.1054, 0.8047], [0.0619]] - sample_weight = tf.constant([[1.2], [3.4]], shape=(2, 1)) - loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) - # sum([0.1054, 0.8047, 0.0619]) / 3 - self.assertAlmostEqual(self.evaluate(loss), 0.4341, 3) - - # Test with logits. 
- logits = tf.ragged.constant([[[8., 1., 1.], [0., 9., 1.]], [[2., 3., 5.]]]) - cce_obj = losses.CategoricalCrossentropy(from_logits=True) - # batch losses [[0.0018, 0.0004], [0.1698]] - loss = cce_obj(y_true, logits, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0.1934, 3) - - def test_ragged_tensors_ragged_sample_weights(self): - cce_obj = losses.CategoricalCrossentropy() - y_true = tf.ragged.constant([[[1, 0, 0], [0, 1, 0]], [[0, 0, 1]]]) - y_pred = tf.ragged.constant( - [[[.9, .05, .05], [.05, .89, .06]], [[.05, .01, .94]]], - dtype=tf.float32) - # batch losses [[0.1054, 0.1165], [0.0619]] - # Use independent weights for each batch element - sample_weight = tf.ragged.constant([[1.2, 3.4], [5.6]], dtype=tf.float32) - loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) - # sum([0.1054*1.2, 0.1165*3.4, 0.0619*5.6])/3 - self.assertAlmostEqual(self.evaluate(loss), 0.2897, 3) - - # Test with logits. - logits = tf.ragged.constant([[[8., 1., 1.], [0., 9., 1.]], [[2., 3., 5.]]]) - cce_obj = losses.CategoricalCrossentropy(from_logits=True) - # batch losses [[0.0018, 0.0004], [0.1698]] - # sum([0.0018*1.2, 0.0004*3.4, 0.1698*5.6]) / 3 - loss = cce_obj(y_true, logits, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0.3181, 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + cce_obj = losses.CategoricalCrossentropy( + reduction=losses_utils.ReductionV2.SUM, name="bce_1" + ) + self.assertEqual(cce_obj.name, "bce_1") + self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_all_correct_unweighted(self): + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=tf.int64) + y_pred = tf.constant( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], + dtype=tf.float32, + ) + cce_obj = losses.CategoricalCrossentropy() + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + # Test with logits. + logits = tf.constant( + [[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]] + ) + cce_obj = losses.CategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + cce_obj = losses.CategoricalCrossentropy() + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.3239, 3) + + # Test with logits. + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0573, 3) + + def test_scalar_weighted(self): + cce_obj = losses.CategoricalCrossentropy() + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + loss = cce_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 0.7449, 3) + + # Test with logits. 
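For probability (non-logit) inputs, the backend first rescales each row of y_pred to sum to 1, which is why a row like [.5, .89, .6] is acceptable input. The 0.3239 expectation from test_unweighted above can be reproduced with a standalone sketch:

import numpy as np

y_true = np.array([0, 1, 2])  # class indices of the one-hot rows
y_pred = np.array([[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]])

probs = y_pred / y_pred.sum(axis=-1, keepdims=True)  # row 2 sums to 1.99
ce = -np.log(probs[np.arange(3), y_true])            # [0.1054, 0.8047, 0.0619]
print(ce.mean())  # ~0.3239; a scalar sample_weight of 2.3 just scales this to ~0.7449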
+ logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 0.1317, 3) + + def test_sample_weighted(self): + cce_obj = losses.CategoricalCrossentropy() + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + sample_weight = tf.constant([[1.2], [3.4], [5.6]], shape=(3, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3) + + # Test with logits. + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3) + + def test_no_reduction(self): + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalCrossentropy( + from_logits=True, reduction=losses_utils.ReductionV2.NONE + ) + loss = cce_obj(y_true, logits) + self.assertAllClose( + (0.001822, 0.000459, 0.169846), self.evaluate(loss), 3 + ) + + def test_label_smoothing(self): + logits = tf.constant([[100.0, -100.0, -100.0]]) + y_true = tf.constant([[1, 0, 0]]) + label_smoothing = 0.1 + # Softmax Cross Entropy Loss: -\sum_i p_i \log q_i + # where for a softmax activation + # \log q_i = x_i - \log \sum_j \exp x_j + # = x_i - x_max - \log \sum_j \exp (x_j - x_max) + # For our activations, [100, -100, -100] + # \log ( exp(0) + exp(-200) + exp(-200) ) = 0 + # so our log softmaxes become: [0, -200, -200] + # Label smoothing: z' = z * (1 - L) + L/n + # 1 = 1 - L + L/n + # 0 = L/n + # Applying the above two fns to the given input: + # -0 * (1 - L + L/n) + 200 * L/n + 200 * L/n = 400 L/n + cce_obj = losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + loss = cce_obj(y_true, logits) + expected_value = 400.0 * label_smoothing / 3.0 + self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) + + def test_label_smoothing_ndarray(self): + logits = np.asarray([[100.0, -100.0, -100.0]]) + y_true = np.asarray([[1, 0, 0]]) + label_smoothing = 0.1 + # Softmax Cross Entropy Loss: -\sum_i p_i \log q_i + # where for a softmax activation + # \log q_i = x_i - \log \sum_j \exp x_j + # = x_i - x_max - \log \sum_j \exp (x_j - x_max) + # For our activations, [100, -100, -100] + # \log ( exp(0) + exp(-200) + exp(-200) ) = 0 + # so our log softmaxes become: [0, -200, -200] + # Label smoothing: z' = z * (1 - L) + L/n + # 1 = 1 - L + L/n + # 0 = L/n + # Applying the above two fns to the given input: + # -0 * (1 - L + L/n) + 200 * L/n + 200 * L/n = 400 L/n + cce_obj = losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + loss = cce_obj(y_true, logits) + expected_value = 400.0 * label_smoothing / 3.0 + self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) + + def test_shape_mismatch(self): + y_true = tf.constant([[0], [1], [2]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]] + ) + + cce_obj = losses.CategoricalCrossentropy() + with self.assertRaisesRegex(ValueError, "Shapes .+ are incompatible"): + cce_obj(y_true, y_pred) + + def test_ragged_tensors(self): + cce_obj = 
losses.CategoricalCrossentropy() + y_true = tf.ragged.constant([[[1, 0, 0], [0, 1, 0]], [[0, 0, 1]]]) + y_pred = tf.ragged.constant( + [[[0.9, 0.05, 0.05], [0.5, 0.89, 0.6]], [[0.05, 0.01, 0.94]]], + dtype=tf.float32, + ) + # batch losses [[0.1054, 0.8047], [0.0619]] + sample_weight = tf.constant([[1.2], [3.4]], shape=(2, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + # sum([0.1054*1.2, 0.8047*1.2, 0.0619*3.4]) / 3 + self.assertAlmostEqual(self.evaluate(loss), 0.4341, 3) + + # Test with logits. + logits = tf.ragged.constant( + [[[8.0, 1.0, 1.0], [0.0, 9.0, 1.0]], [[2.0, 3.0, 5.0]]] + ) + cce_obj = losses.CategoricalCrossentropy(from_logits=True) + # batch losses [[0.0018, 0.0004], [0.1698]] + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.1934, 3) + + def test_ragged_tensors_ragged_sample_weights(self): + cce_obj = losses.CategoricalCrossentropy() + y_true = tf.ragged.constant([[[1, 0, 0], [0, 1, 0]], [[0, 0, 1]]]) + y_pred = tf.ragged.constant( + [[[0.9, 0.05, 0.05], [0.05, 0.89, 0.06]], [[0.05, 0.01, 0.94]]], + dtype=tf.float32, + ) + # batch losses [[0.1054, 0.1165], [0.0619]] + # Use independent weights for each batch element + sample_weight = tf.ragged.constant( + [[1.2, 3.4], [5.6]], dtype=tf.float32 + ) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + # sum([0.1054*1.2, 0.1165*3.4, 0.0619*5.6])/3 + self.assertAlmostEqual(self.evaluate(loss), 0.2897, 3) + + # Test with logits. + logits = tf.ragged.constant( + [[[8.0, 1.0, 1.0], [0.0, 9.0, 1.0]], [[2.0, 3.0, 5.0]]] + ) + cce_obj = losses.CategoricalCrossentropy(from_logits=True) + # batch losses [[0.0018, 0.0004], [0.1698]] + # sum([0.0018*1.2, 0.0004*3.4, 0.1698*5.6]) / 3 + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.3181, 3) + + def test_binary_labels(self): + # raise a warning if the shapes of y_true and y_pred are all (None, 1). + # categorical_crossentropy shouldn't be used with binary labels. + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + cce_obj = losses.CategoricalCrossentropy() + cce_obj(tf.constant([[1.0], [0.0]]), tf.constant([[1.0], [1.0]])) + self.assertIs(w[-1].category, SyntaxWarning) + self.assertIn( + "In loss categorical_crossentropy, expected ", + str(w[-1].message), + ) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class CategoricalFocalCrossentropyTest(tf.test.TestCase): + def test_config(self): + cce_obj = losses.CategoricalFocalCrossentropy( + name="focal_cce", + reduction=losses_utils.ReductionV2.SUM, + alpha=0.25, + gamma=2.0, + ) + self.assertEqual(cce_obj.name, "focal_cce") + self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM) + self.assertEqual(cce_obj.alpha, 0.25) + self.assertEqual(cce_obj.gamma, 2.0) + + # Test alpha as a list + cce_obj = losses.CategoricalFocalCrossentropy(alpha=[0.25, 0.5, 0.75]) + self.assertEqual(cce_obj.alpha, [0.25, 0.5, 0.75]) + + def test_all_correct_unweighted(self): + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=tf.int64) + y_pred = tf.constant( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], + dtype=tf.float32, + ) + cce_obj = losses.CategoricalFocalCrossentropy(alpha=0.25, gamma=2.0) + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + # Test with logits. 
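The label-smoothing derivation in the CategoricalCrossentropyTest comments above collapses to 400 * L / n for logits [100, -100, -100]; a quick standalone check of the expected value:

import numpy as np

label_smoothing, n = 0.1, 3
y_smooth = np.array([1.0, 0.0, 0.0]) * (1 - label_smoothing) + label_smoothing / n
log_softmax = np.array([0.0, -200.0, -200.0])  # log-softmax of [100, -100, -100]
print(-np.sum(y_smooth * log_softmax))         # 13.333... == 400 * 0.1 / 3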
+ logits = tf.constant( + [[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]] + ) + cce_obj = losses.CategoricalFocalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + cce_obj = losses.CategoricalFocalCrossentropy() + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.02059, 3) + + # Test with logits. + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalFocalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.000345, 3) + + def test_scalar_weighted(self): + cce_obj = losses.CategoricalFocalCrossentropy() + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + loss = cce_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 0.047368, 3) + + # Test with logits. + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalFocalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 0.000794, 4) + + def test_sample_weighted(self): + cce_obj = losses.CategoricalFocalCrossentropy() + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + sample_weight = tf.constant([[1.2], [3.4], [5.6]], shape=(3, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.06987, 3) + + # Test with logits. 
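The categorical focal expectations above come from scaling each true-class crossentropy term by alpha * (1 - p)^gamma. Assuming the same row rescaling as plain categorical crossentropy, the 0.02059 value from test_unweighted can be reproduced with a standalone sketch (alpha=0.25 and gamma=2.0 are the defaults):

import numpy as np

y_true = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
y_pred = np.array([[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]])
alpha, gamma = 0.25, 2.0

probs = y_pred / y_pred.sum(axis=-1, keepdims=True)
per_sample = np.sum(alpha * (1 - probs) ** gamma * y_true * -np.log(probs), axis=-1)
print(per_sample.mean())  # ~0.02059; sample weights scale these three terms as usual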
+ logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalFocalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.001933, 3) + + def test_no_reduction(self): + y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.CategoricalFocalCrossentropy( + from_logits=True, reduction=losses_utils.ReductionV2.NONE + ) + loss = cce_obj(y_true, logits) + self.assertAllClose( + (1.5096224e-09, 2.4136547e-11, 1.0360638e-03), + self.evaluate(loss), + 3, + ) + + def test_label_smoothing(self): + logits = tf.constant([[4.9, -0.5, 2.05]]) + y_true = tf.constant([[1, 0, 0]]) + label_smoothing = 0.1 + + cce_obj = losses.CategoricalFocalCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + loss = cce_obj(y_true, logits) + + expected_value = 0.06685 + self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) + + def test_label_smoothing_ndarray(self): + logits = np.asarray([[4.9, -0.5, 2.05]]) + y_true = np.asarray([[1, 0, 0]]) + label_smoothing = 0.1 + + cce_obj = losses.CategoricalFocalCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + loss = cce_obj(y_true, logits) + + expected_value = 0.06685 + self.assertAlmostEqual(self.evaluate(loss), expected_value, 3) + + def test_shape_mismatch(self): + y_true = tf.constant([[0], [1], [2]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]] + ) + + cce_obj = losses.CategoricalFocalCrossentropy() + with self.assertRaisesRegex(ValueError, "Shapes .+ are incompatible"): + cce_obj(y_true, y_pred) + + def test_ragged_tensors(self): + cce_obj = losses.CategoricalFocalCrossentropy() + y_true = tf.ragged.constant([[[1, 0, 0], [0, 1, 0]], [[0, 0, 1]]]) + y_pred = tf.ragged.constant( + [[[0.9, 0.05, 0.05], [0.5, 0.89, 0.6]], [[0.05, 0.01, 0.94]]], + dtype=tf.float32, + ) + # batch losses (plain crossentropy, before focal scaling) + # [[0.1054, 0.8047], [0.0619]] + sample_weight = tf.constant([[1.2], [3.4]], shape=(2, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + + self.assertAlmostEqual(self.evaluate(loss), 0.024754, 3) + + # Test with logits. + logits = tf.ragged.constant( + [[[8.0, 1.0, 1.0], [0.0, 9.0, 1.0]], [[2.0, 3.0, 5.0]]] + ) + cce_obj = losses.CategoricalFocalCrossentropy(from_logits=True) + + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.00117, 3) + + def test_ragged_tensors_ragged_sample_weights(self): + cce_obj = losses.CategoricalFocalCrossentropy() + y_true = tf.ragged.constant([[[1, 0, 0], [0, 1, 0]], [[0, 0, 1]]]) + y_pred = tf.ragged.constant( + [[[0.9, 0.05, 0.05], [0.05, 0.89, 0.06]], [[0.05, 0.01, 0.94]]], + dtype=tf.float32, + ) + sample_weight = tf.ragged.constant( + [[1.2, 3.4], [5.6]], dtype=tf.float32 + ) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.0006088, 4) + + # Test with logits. + logits = tf.ragged.constant( + [[[8.0, 1.0, 1.0], [0.0, 9.0, 1.0]], [[2.0, 3.0, 5.0]]] + ) + cce_obj = losses.CategoricalFocalCrossentropy(from_logits=True) + + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.001933, 3) + + def test_binary_labels(self): + # raise a warning if the shapes of y_true and y_pred are all (None, 1). + # categorical_focal_crossentropy shouldn't be used with binary labels. 
+ with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + cce_obj = losses.CategoricalFocalCrossentropy() + cce_obj(tf.constant([[1.0], [0.0]]), tf.constant([[1.0], [1.0]])) + self.assertIs(w[-1].category, SyntaxWarning) + self.assertIn( + "In loss categorical_focal_crossentropy, expected ", + str(w[-1].message), + ) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class SparseCategoricalCrossentropyTest(tf.test.TestCase): - - def test_config(self): - cce_obj = losses.SparseCategoricalCrossentropy( - reduction=losses_utils.ReductionV2.SUM, name='scc') - self.assertEqual(cce_obj.name, 'scc') - self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_all_correct_unweighted(self): - y_true = tf.constant([[0], [1], [2]], dtype=tf.int64) - y_pred = tf.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], - dtype=tf.float32) - cce_obj = losses.SparseCategoricalCrossentropy() - loss = cce_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - # Test with logits. - logits = tf.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]]) - cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - cce_obj = losses.SparseCategoricalCrossentropy() - y_true = tf.constant([0, 1, 2]) - y_pred = tf.constant([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], - dtype=tf.float32) - loss = cce_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), .3239, 3) - - # Test with logits. - logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits) - self.assertAlmostEqual(self.evaluate(loss), .0573, 3) - - def test_scalar_weighted(self): - cce_obj = losses.SparseCategoricalCrossentropy() - y_true = tf.constant([[0], [1], [2]]) - y_pred = tf.constant([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], - dtype=tf.float32) - loss = cce_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), .7449, 3) - - # Test with logits. - logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), .1317, 3) - - def test_sample_weighted(self): - cce_obj = losses.SparseCategoricalCrossentropy() - y_true = tf.constant([[0], [1], [2]]) - y_pred = tf.constant([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], - dtype=tf.float32) - sample_weight = tf.constant([[1.2], [3.4], [5.6]], shape=(3, 1)) - loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3) - - # Test with logits. 
- logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) - loss = cce_obj(y_true, logits, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3) - - def test_no_reduction(self): - y_true = tf.constant([[0], [1], [2]]) - logits = tf.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]]) - cce_obj = losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=losses_utils.ReductionV2.NONE) - loss = cce_obj(y_true, logits) - self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3) - - def test_non_tensor(self): - # Test case for GitHub issue 33394. - cce_obj = losses.SparseCategoricalCrossentropy() - y_true = [[0], [1], [2]] - y_pred = [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]] - loss = cce_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), .7449, 3) - - def test_ragged_tensors(self): - cce_obj = losses.SparseCategoricalCrossentropy() - y_true = tf.ragged.constant([[0, 1], [2]]) - y_pred = tf.ragged.constant( - [[[.9, .05, .05], [.5, .89, .6]], [[.05, .01, .94]]], dtype=tf.float32) - # batch losses [[0.1054, 0.8047], [0.0619]] - sample_weight = tf.constant([[1.2], [3.4]], shape=(2, 1)) - loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) - # sum([0.1054, 0.8047, 0.0619]) / 3 - self.assertAlmostEqual(self.evaluate(loss), 0.4341, 3) - - # Test with logits. - logits = tf.ragged.constant([[[8., 1., 1.], [0., 9., 1.]], [[2., 3., 5.]]]) - cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) - # batch losses [[0.0018, 0.0004], [0.1698]] - loss = cce_obj(y_true, logits, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0.1934, 3) - - def test_ragged_tensors_rank_1(self): - cce_obj = losses.SparseCategoricalCrossentropy() - y_true = tf.ragged.constant([[0, 1], [2]]) - y_pred = tf.ragged.constant( - [[[.9, .05, .05], [.5, .89, .6]], [[.05, .01, .94]]], - ragged_rank=1, - dtype=tf.float32) - # batch losses [[0.1054, 0.8047], [0.0619]] - sample_weight = tf.constant([[1.2], [3.4]], shape=(2, 1)) - loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) - # sum([0.1054, 0.8047, 0.0619]) / 3 - self.assertAlmostEqual(self.evaluate(loss), 0.4341, 3) - - # Test with logits. 
- logits = tf.ragged.constant([[[8., 1., 1.], [0., 9., 1.]], [[2., 3., 5.]]], - ragged_rank=1) - cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) - # batch losses [[0.0018, 0.0004], [0.1698]] - loss = cce_obj(y_true, logits, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0.1934, 3) - - def test_ragged_tensors_3d(self): - # shape [2, 1, None] - y_true = tf.ragged.constant([[[1, 1]], [[0]]]) - # shape [2, 1, None, 2] - y_pred = tf.ragged.constant([[[[0.1, 0.9], [0.1, 0.9]]], [[[0.9, 0.1]]]]) - cce_obj = losses.SparseCategoricalCrossentropy() - loss = cce_obj(y_true, y_pred) - self.assertAlmostEqual(self.evaluate(loss), 0.1054, 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + cce_obj = losses.SparseCategoricalCrossentropy( + reduction=losses_utils.ReductionV2.SUM, name="scc" + ) + self.assertEqual(cce_obj.name, "scc") + self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_all_correct_unweighted(self): + y_true = tf.constant([[0], [1], [2]], dtype=tf.int64) + y_pred = tf.constant( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], + dtype=tf.float32, + ) + cce_obj = losses.SparseCategoricalCrossentropy() + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + # Test with logits. + logits = tf.constant( + [[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]] + ) + cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + cce_obj = losses.SparseCategoricalCrossentropy() + y_true = tf.constant([0, 1, 2]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.3239, 3) + + # Test with logits. + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0573, 3) + + def test_unweighted_ignore_class(self): + cce_obj = losses.SparseCategoricalCrossentropy(ignore_class=-1) + y_true = tf.constant([0, 1, 2, -1]) + y_pred = tf.constant( + [ + [0.9, 0.05, 0.05], + [0.5, 0.89, 0.6], + [0.05, 0.01, 0.94], + [0.85, 0.14, 0.01], + ], + dtype=tf.float32, + ) + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.3239, 3) + + # Test with logits. 
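ignore_class drops the flagged positions from both the numerator and the denominator of the reduction, so the expected value above matches the three-sample test exactly. A standalone sketch:

import numpy as np

ignore_class = -1
y_true = np.array([0, 1, 2, -1])
y_pred = np.array([[0.9, 0.05, 0.05], [0.5, 0.89, 0.6],
                   [0.05, 0.01, 0.94], [0.85, 0.14, 0.01]])

valid = np.nonzero(y_true != ignore_class)[0]        # rows 0, 1, 2
probs = y_pred / y_pred.sum(axis=-1, keepdims=True)
ce = -np.log(probs[valid, y_true[valid]])
print(ce.sum() / len(valid))  # ~0.3239, identical to the unmasked 3-sample case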
+ logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0], [7.8, 2.0, 1.0]] + ) + cce_obj = losses.SparseCategoricalCrossentropy( + ignore_class=-1, from_logits=True + ) + loss = cce_obj(y_true, logits) + self.assertAlmostEqual(self.evaluate(loss), 0.0573, 3) + + def test_unweighted_ignore_class_for_segmentation(self): + cce_obj = losses.SparseCategoricalCrossentropy(ignore_class=-1) + y_true = tf.constant( + [[[0, 2], [-1, -1]], [[0, 2], [-1, -1]], [[0, 0], [0, 0]]] + ) + y_pred = tf.constant( + [ + [ + [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], + [[0.2, 0.5, 0.3], [0.0, 1.0, 0.0]], + ], + [ + [[1.0, 0.0, 0.0], [0.0, 0.5, 0.5]], + [[0.2, 0.5, 0.3], [0.0, 1.0, 0.0]], + ], + [ + [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]], + [[0.1, 0.9, 0.0], [0.2, 0.8, 0.0]], + ], + ], + dtype=tf.float32, + ) + + # Expected loss values: + # [[0.0, 0.0], [0.0, 0.0]], + # [[0.0, 0.693148], [0.0, 0.0]], + # [[0.0, 0.0], [2.302585, 1.609438]], + + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.575646375, 3) + + # # Test with logits. + # logits = tf.constant( + # [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + # ) + # cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) + # loss = cce_obj(y_true, logits) + # self.assertAlmostEqual(self.evaluate(loss), 0.0573, 3) + + def test_scalar_weighted(self): + cce_obj = losses.SparseCategoricalCrossentropy() + y_true = tf.constant([[0], [1], [2]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + loss = cce_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 0.7449, 3) + + # Test with logits. + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 0.1317, 3) + + def test_sample_weighted(self): + cce_obj = losses.SparseCategoricalCrossentropy() + y_true = tf.constant([[0], [1], [2]]) + y_pred = tf.constant( + [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]], + dtype=tf.float32, + ) + sample_weight = tf.constant([[1.2], [3.4], [5.6]], shape=(3, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3) + + # Test with logits. + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3) + + def test_sample_weighted_ignore_class(self): + cce_obj = losses.SparseCategoricalCrossentropy(ignore_class=-1) + y_true = tf.constant([[0], [1], [2], [-1]]) + y_pred = tf.constant( + [ + [0.9, 0.05, 0.05], + [0.5, 0.89, 0.6], + [0.05, 0.01, 0.94], + [0.85, 0.14, 0.01], + ], + dtype=tf.float32, + ) + sample_weight = tf.constant([[1.2], [3.4], [5.6], [10.4]], shape=(4, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3) + + # Test with logits. 
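With per-sample weights, the ignored row's weight never enters either side of the average, so the 10.4 weight above is irrelevant. A sketch of the 1.0696 expectation:

import numpy as np

y_true = np.array([0, 1, 2, -1])
y_pred = np.array([[0.9, 0.05, 0.05], [0.5, 0.89, 0.6],
                   [0.05, 0.01, 0.94], [0.85, 0.14, 0.01]])
sample_weight = np.array([1.2, 3.4, 5.6, 10.4])

valid = np.nonzero(y_true != -1)[0]
probs = y_pred / y_pred.sum(axis=-1, keepdims=True)
weighted_ce = -np.log(probs[valid, y_true[valid]]) * sample_weight[valid]
print(weighted_ce.sum() / len(valid))  # ~1.0696, same as the test without the -1 row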
+ logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0], [7.8, 2.0, 1.0]] + ) + cce_obj = losses.SparseCategoricalCrossentropy( + ignore_class=-1, from_logits=True + ) + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3) + + def test_no_reduction(self): + y_true = tf.constant([[0], [1], [2]]) + logits = tf.constant( + [[8.0, 1.0, 1.0], [0.0, 9.0, 1.0], [2.0, 3.0, 5.0]] + ) + cce_obj = losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=losses_utils.ReductionV2.NONE + ) + loss = cce_obj(y_true, logits) + self.assertAllClose( + (0.001822, 0.000459, 0.169846), self.evaluate(loss), 3 + ) + + def test_non_tensor(self): + # Test case for GitHub issue 33394. + cce_obj = losses.SparseCategoricalCrossentropy() + y_true = [[0], [1], [2]] + y_pred = [[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [0.05, 0.01, 0.94]] + loss = cce_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 0.7449, 3) + + def test_ragged_tensors(self): + cce_obj = losses.SparseCategoricalCrossentropy() + y_true = tf.ragged.constant([[0, 1], [2]]) + y_pred = tf.ragged.constant( + [[[0.9, 0.05, 0.05], [0.5, 0.89, 0.6]], [[0.05, 0.01, 0.94]]], + dtype=tf.float32, + ) + # batch losses [[0.1054, 0.8047], [0.0619]] + sample_weight = tf.constant([[1.2], [3.4]], shape=(2, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + # sum([0.1054*1.2, 0.8047*1.2, 0.0619*3.4]) / 3 + self.assertAlmostEqual(self.evaluate(loss), 0.4341, 3) + + # Test with logits. + logits = tf.ragged.constant( + [[[8.0, 1.0, 1.0], [0.0, 9.0, 1.0]], [[2.0, 3.0, 5.0]]] + ) + cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) + # batch losses [[0.0018, 0.0004], [0.1698]] + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.1934, 3) + + def test_ragged_tensors_rank_1(self): + cce_obj = losses.SparseCategoricalCrossentropy() + y_true = tf.ragged.constant([[0, 1], [2]]) + y_pred = tf.ragged.constant( + [[[0.9, 0.05, 0.05], [0.5, 0.89, 0.6]], [[0.05, 0.01, 0.94]]], + ragged_rank=1, + dtype=tf.float32, + ) + # batch losses [[0.1054, 0.8047], [0.0619]] + sample_weight = tf.constant([[1.2], [3.4]], shape=(2, 1)) + loss = cce_obj(y_true, y_pred, sample_weight=sample_weight) + # sum([0.1054*1.2, 0.8047*1.2, 0.0619*3.4]) / 3 + self.assertAlmostEqual(self.evaluate(loss), 0.4341, 3) + + # Test with logits. 
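For ragged inputs the (2, 1) sample_weight is expanded to one weight per ragged element, and the mean is taken over the total element count (3 here), which the sum([...]) comments above spell out. In miniature:

import numpy as np

batch_losses = np.array([0.1054, 0.8047, 0.0619])  # per-element CE from the comments
weights = np.array([1.2, 1.2, 3.4])                # row weight repeated per element
print((batch_losses * weights).sum() / 3)          # ~0.4341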
+ logits = tf.ragged.constant( + [[[8.0, 1.0, 1.0], [0.0, 9.0, 1.0]], [[2.0, 3.0, 5.0]]], + ragged_rank=1, + ) + cce_obj = losses.SparseCategoricalCrossentropy(from_logits=True) + # batch losses [[0.0018, 0.0004], [0.1698]] + loss = cce_obj(y_true, logits, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.1934, 3) + + def test_ragged_tensors_3d(self): + # shape [2, 1, None] + y_true = tf.ragged.constant([[[1, 1]], [[0]]]) + # shape [2, 1, None, 2] + y_pred = tf.ragged.constant( + [[[[0.1, 0.9], [0.1, 0.9]]], [[[0.9, 0.1]]]] + ) + cce_obj = losses.SparseCategoricalCrossentropy() + loss = cce_obj(y_true, y_pred) + self.assertAlmostEqual(self.evaluate(loss), 0.1054, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class HingeTest(tf.test.TestCase): - - def test_config(self): - hinge_obj = losses.Hinge( - reduction=losses_utils.ReductionV2.SUM, name='hinge_loss') - self.assertEqual(hinge_obj.name, 'hinge_loss') - self.assertEqual(hinge_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - hinge_obj = losses.Hinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] - # = [0.6, 0.4125] - # reduced loss = (0.6 + 0.4125) / 2 - - loss = hinge_obj(y_true, y_pred) - self.assertAllClose(0.506, self.evaluate(loss), atol=1e-3) - - def test_scalar_weighted(self): - hinge_obj = losses.Hinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] - # = [0.6, 0.4125] - # weighted_loss = [0.6 * 2.3, 0.4125 * 2.3] - # reduced loss = (0.6 + 0.4125) * 2.3 / 2 - - loss = hinge_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 1.164, 3) - - # Verify we get the same output when the same input is given - loss_2 = hinge_obj(y_true, y_pred, sample_weight=2.3) - self.assertAllClose(self.evaluate(loss), self.evaluate(loss_2), 1e-3) - - def test_sample_weighted(self): - hinge_obj = losses.Hinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] - # = [0.6, 0.4125] - # weighted loss = [0.6 * 1.2, 0.4125 * 3.4] - # reduced loss = (0.6 * 1.2 + 0.4125 * 3.4) / 2 - - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(loss), 1.061, 1e-3) - - def test_timestep_weighted(self): - hinge_obj = losses.Hinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]], 
shape=(2, 4, 1)) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]], - shape=(2, 4, 1)) - sample_weight = tf.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4)) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]] - # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]], - # [[0.25], [1], [0.5], [0.6]]] - # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]], - # [[0.75], [0], [0.5], [0.4]]] - # loss = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]] - # weighted loss = [[2.1, 4.8, 4.5, 0], [3, 0, 0.5, 1.2]] - # reduced loss = (2.1 + 4.8 + 4.5 + 0 + 3 + 0 + 0.5 + 1.2) / 8 - - loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(loss), 2.012, 1e-3) - - def test_zero_weighted(self): - hinge_obj = losses.Hinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - loss = hinge_obj(y_true, y_pred, sample_weight=0) - self.assertAllClose(self.evaluate(loss), 0., 1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + hinge_obj = losses.Hinge( + reduction=losses_utils.ReductionV2.SUM, name="hinge_loss" + ) + self.assertEqual(hinge_obj.name, "hinge_loss") + self.assertEqual(hinge_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + hinge_obj = losses.Hinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] + # = [0.6, 0.4125] + # reduced loss = (0.6 + 0.4125) / 2 + + loss = hinge_obj(y_true, y_pred) + self.assertAllClose(0.506, self.evaluate(loss), atol=1e-3) + + def test_scalar_weighted(self): + hinge_obj = losses.Hinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] + # = [0.6, 0.4125] + # weighted_loss = [0.6 * 2.3, 0.4125 * 2.3] + # reduced loss = (0.6 + 0.4125) * 2.3 / 2 + + loss = hinge_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 1.164, 3) + + # Verify we get the same output when the same input is given + loss_2 = hinge_obj(y_true, y_pred, sample_weight=2.3) + self.assertAllClose(self.evaluate(loss), self.evaluate(loss_2), 1e-3) + + def test_sample_weighted(self): + hinge_obj = losses.Hinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] + # = [0.6, 0.4125] + # weighted loss = [0.6 * 1.2, 0.4125 * 3.4] + # reduced 
loss = (0.6 * 1.2 + 0.4125 * 3.4) / 2 + + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(self.evaluate(loss), 1.061, 1e-3) + + def test_timestep_weighted(self): + hinge_obj = losses.Hinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]], shape=(2, 4, 1)) + y_pred = tf.constant( + [[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]], shape=(2, 4, 1) + ) + sample_weight = tf.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4)) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]] + # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]], + # [[0.25], [1], [0.5], [0.6]]] + # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]], + # [[0.75], [0], [0.5], [0.4]]] + # loss = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]] + # weighted loss = [[2.1, 4.8, 4.5, 0], [3, 0, 0.5, 1.2]] + # reduced loss = (2.1 + 4.8 + 4.5 + 0 + 3 + 0 + 0.5 + 1.2) / 8 + + loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(self.evaluate(loss), 2.012, 1e-3) + + def test_zero_weighted(self): + hinge_obj = losses.Hinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + loss = hinge_obj(y_true, y_pred, sample_weight=0) + self.assertAllClose(self.evaluate(loss), 0.0, 1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class SquaredHingeTest(tf.test.TestCase): - - def test_config(self): - sq_hinge_obj = losses.SquaredHinge( - reduction=losses_utils.ReductionV2.SUM, name='sq_hinge_loss') - self.assertEqual(sq_hinge_obj.name, 'sq_hinge_loss') - self.assertEqual(sq_hinge_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - sq_hinge_obj = losses.SquaredHinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]] - # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], - # [0.5625, 0, 0.25, 0.16]] - # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] - # = [0.485, 0.2431] - # reduced loss = (0.485 + 0.2431) / 2 - - loss = sq_hinge_obj(y_true, y_pred) - self.assertAllClose(self.evaluate(loss), 0.364, 1e-3) - - def test_scalar_weighted(self): - sq_hinge_obj = losses.SquaredHinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]] - # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], - # [0.5625, 0, 0.25, 0.16]] - # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] - # = [0.485, 0.2431] - # weighted loss = [0.485 * 2.3, 0.2431 * 2.3] - # reduced loss = (0.485 + 0.2431) * 2.3 / 2 - - loss = sq_hinge_obj(y_true, y_pred, sample_weight=2.3) - 
self.assertAllClose(self.evaluate(loss), 0.837, 1e-3) - - # Verify we get the same output when the same input is given - loss_2 = sq_hinge_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) - - def test_sample_weighted(self): - sq_hinge_obj = losses.SquaredHinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]] - # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], - # [0.5625, 0, 0.25, 0.16]] - # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] - # = [0.485, 0.2431] - # weighted loss = [0.485 * 1.2, 0.2431 * 3.4] - # reduced loss = (0.485 * 1.2 + 0.2431 * 3.4) / 2 - - sample_weight = tf.constant([1.2, 3.4]) - loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(loss), 0.704, 1e-3) - - def test_timestep_weighted(self): - sq_hinge_obj = losses.SquaredHinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]], shape=(2, 4, 1)) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]], - shape=(2, 4, 1)) - sample_weight = tf.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4)) - - # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]] - # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]], - # [[0.25], [1], [0.5], [0.6]]] - # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]], - # [[0.75], [0], [0.5], [0.4]]] - # loss = [[0.49, 0.64, 0.81, 0], [0.5625, 0, 0.25, 0.16]] - # weighted loss = [[1.47, 3.84, 4.05, 0], [2.25, 0, 0.25, 0.48]] - # reduced loss = (1.47 + 3.84 + 4.05 + 0 + 2.25 + 0 + 0.25 + 0.48) / 8 - - loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(loss), 1.542, 1e-3) - - def test_zero_weighted(self): - sq_hinge_obj = losses.SquaredHinge() - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]]) - loss = sq_hinge_obj(y_true, y_pred, sample_weight=0) - self.assertAllClose(self.evaluate(loss), 0., 1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + sq_hinge_obj = losses.SquaredHinge( + reduction=losses_utils.ReductionV2.SUM, name="sq_hinge_loss" + ) + self.assertEqual(sq_hinge_obj.name, "sq_hinge_loss") + self.assertEqual(sq_hinge_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + sq_hinge_obj = losses.SquaredHinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, + # 0.4]] + # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], + # [0.5625, 0, 0.25, 0.16]] + # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] + # = [0.485, 0.2431] + # reduced loss = 
(0.485 + 0.2431) / 2 + + loss = sq_hinge_obj(y_true, y_pred) + self.assertAllClose(self.evaluate(loss), 0.364, 1e-3) + + def test_scalar_weighted(self): + sq_hinge_obj = losses.SquaredHinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, + # 0.4]] + # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], + # [0.5625, 0, 0.25, 0.16]] + # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] + # = [0.485, 0.2431] + # weighted loss = [0.485 * 2.3, 0.2431 * 2.3] + # reduced loss = (0.485 + 0.2431) * 2.3 / 2 + + loss = sq_hinge_obj(y_true, y_pred, sample_weight=2.3) + self.assertAllClose(self.evaluate(loss), 0.837, 1e-3) + + # Verify we get the same output when the same input is given + loss_2 = sq_hinge_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) + + def test_sample_weighted(self): + sq_hinge_obj = losses.SquaredHinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, + # 0.4]] + # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], + # [0.5625, 0, 0.25, 0.16]] + # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] + # = [0.485, 0.2431] + # weighted loss = [0.485 * 1.2, 0.2431 * 3.4] + # reduced loss = (0.485 * 1.2 + 0.2431 * 3.4) / 2 + + sample_weight = tf.constant([1.2, 3.4]) + loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(self.evaluate(loss), 0.704, 1e-3) + + def test_timestep_weighted(self): + sq_hinge_obj = losses.SquaredHinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]], shape=(2, 4, 1)) + y_pred = tf.constant( + [[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]], shape=(2, 4, 1) + ) + sample_weight = tf.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4)) + + # loss = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]] + # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]], + # [[0.25], [1], [0.5], [0.6]]] + # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]], + # [[0.75], [0], [0.5], [0.4]]] + # loss = [[0.49, 0.64, 0.81, 0], [0.5625, 0, 0.25, 0.16]] + # weighted loss = [[1.47, 3.84, 4.05, 0], [2.25, 0, 0.25, 0.48]] + # reduced loss = (1.47 + 3.84 + 4.05 + 0 + 2.25 + 0 + 0.25 + 0.48) / 8 + + loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(self.evaluate(loss), 1.542, 1e-3) + + def test_zero_weighted(self): + sq_hinge_obj = losses.SquaredHinge() + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + loss = sq_hinge_obj(y_true, y_pred, sample_weight=0) + self.assertAllClose(self.evaluate(loss), 0.0, 1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", 
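For SquaredHinge the only change relative to Hinge is squaring the clipped margin before the per-sample mean. A minimal NumPy check of the values asserted above (illustrative only, not part of the patch):

```python
import numpy as np

y_true = np.array([[0, 1, 0, 1], [0, 0, 1, 1]], dtype=float)
y_pred = np.array([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]])

margin = np.maximum(0.0, 1.0 - (2.0 * y_true - 1.0) * y_pred)
per_sample = np.square(margin).mean(axis=-1)       # -> [0.485, 0.2431]

print(per_sample.mean())                           # ~0.364
print((per_sample * 2.3).mean())                   # ~0.837
print((per_sample * np.array([1.2, 3.4])).mean())  # ~0.704
```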
"eager"])) class CategoricalHingeTest(tf.test.TestCase): - - def test_config(self): - cat_hinge_obj = losses.CategoricalHinge( - reduction=losses_utils.ReductionV2.SUM, name='cat_hinge_loss') - self.assertEqual(cat_hinge_obj.name, 'cat_hinge_loss') - self.assertEqual(cat_hinge_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - cat_hinge_obj = losses.CategoricalHinge() - y_true = tf.constant([1, 9, 2, -5], shape=(2, 2)) - y_pred = tf.constant([4, 8, 12, 8], shape=(2, 2), dtype=tf.float32) - loss = cat_hinge_obj(y_true, y_pred) - - # pos = reduce_sum(y_true * y_pred) = [1*4+8*9, 12*2+8*-5] = [76, -16] - # neg = reduce_max((1. - y_true) * y_pred) = [[0, -64], [-12, 48]] = [0, 48] - # cat_hinge = max(0., neg - pos + 1.) = [0, 65] - # reduced_loss = (0 + 65)/2 = 32.5 - self.assertAlmostEqual(self.evaluate(loss), 32.5, 3) - - def test_scalar_weighted(self): - cat_hinge_obj = losses.CategoricalHinge() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = cat_hinge_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), 83.95, 3) - - # Verify we get the same output when the same input is given - loss_2 = cat_hinge_obj(y_true, y_pred, sample_weight=2.3) - self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) - - def test_sample_weighted(self): - cat_hinge_obj = losses.CategoricalHinge() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 124.1, 3) - - def test_timestep_weighted(self): - cat_hinge_obj = losses.CategoricalHinge() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32) - sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) - loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 4.0, 3) - - def test_zero_weighted(self): - cat_hinge_obj = losses.CategoricalHinge() - y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) - y_pred = tf.constant([4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32) - loss = cat_hinge_obj(y_true, y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0., 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def test_config(self): + cat_hinge_obj = losses.CategoricalHinge( + reduction=losses_utils.ReductionV2.SUM, name="cat_hinge_loss" + ) + self.assertEqual(cat_hinge_obj.name, "cat_hinge_loss") + self.assertEqual(cat_hinge_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + cat_hinge_obj = losses.CategoricalHinge() + y_true = tf.constant([1, 9, 2, -5], shape=(2, 2)) + y_pred = tf.constant([4, 8, 12, 8], shape=(2, 2), dtype=tf.float32) + loss = cat_hinge_obj(y_true, y_pred) + + # pos = reduce_sum(y_true * y_pred) = [1*4+8*9, 12*2+8*-5] = [76, -16] + # neg = reduce_max((1. - y_true) * y_pred) = [[0, -64], [-12, 48]] = [0, + # 48] + # cat_hinge = max(0., neg - pos + 1.) 
= [0, 65] + # reduced_loss = (0 + 65)/2 = 32.5 + self.assertAlmostEqual(self.evaluate(loss), 32.5, 3) + + def test_scalar_weighted(self): + cat_hinge_obj = losses.CategoricalHinge() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = cat_hinge_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), 83.95, 3) + + # Verify we get the same output when the same input is given + loss_2 = cat_hinge_obj(y_true, y_pred, sample_weight=2.3) + self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) + + def test_sample_weighted(self): + cat_hinge_obj = losses.CategoricalHinge() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 124.1, 3) + + def test_timestep_weighted(self): + cat_hinge_obj = losses.CategoricalHinge() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3, 1), dtype=tf.float32 + ) + sample_weight = tf.constant([3, 6, 5, 0, 4, 2], shape=(2, 3)) + loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 4.0, 3) + + def test_zero_weighted(self): + cat_hinge_obj = losses.CategoricalHinge() + y_true = tf.constant([1, 9, 2, -5, -2, 6], shape=(2, 3)) + y_pred = tf.constant( + [4, 8, 12, 8, 1, 3], shape=(2, 3), dtype=tf.float32 + ) + loss = cat_hinge_obj(y_true, y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LogCoshTest(tf.test.TestCase): - - def setup(self): - y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3)) - y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) - - self.batch_size = 6 - error = y_pred - y_true - self.expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2) - - self.y_pred = tf.constant(y_pred, dtype=tf.float32) - self.y_true = tf.constant(y_true) - - def test_config(self): - logcosh_obj = losses.LogCosh( - reduction=losses_utils.ReductionV2.SUM, name='logcosh_loss') - self.assertEqual(logcosh_obj.name, 'logcosh_loss') - self.assertEqual(logcosh_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - self.setup() - logcosh_obj = losses.LogCosh() - - loss = logcosh_obj(self.y_true, self.y_pred) - expected_loss = np.sum(self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_scalar_weighted(self): - self.setup() - logcosh_obj = losses.LogCosh() - sample_weight = 2.3 - - loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - expected_loss = sample_weight * np.sum( - self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - # Verify we get the same output when the same input is given - loss_2 = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) - - def test_sample_weighted(self): - self.setup() - logcosh_obj = losses.LogCosh() - - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - - expected_loss = np.multiply( 
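The pos/neg comments in the CategoricalHinge test above are easy to misread because both reductions run over the class axis, not the batch axis. A small sketch with the same fixtures (plain NumPy, illustrative, outside the patch):

```python
import numpy as np

y_true = np.array([[1, 9], [2, -5]], dtype=float)
y_pred = np.array([[4, 8], [12, 8]], dtype=float)

pos = np.sum(y_true * y_pred, axis=-1)          # [76., -16.]
neg = np.max((1.0 - y_true) * y_pred, axis=-1)  # [0., 48.]
loss = np.maximum(0.0, neg - pos + 1.0)         # [0., 65.]
print(loss.mean())                              # 32.5, as asserted
```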
- self.expected_losses, - np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))) - expected_loss = np.sum(expected_loss) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_timestep_weighted(self): - self.setup() - logcosh_obj = losses.LogCosh() - y_true = np.asarray([1, 9, 2, -5, -2, 6]).reshape(2, 3, 1) - y_pred = np.asarray([4, 8, 12, 8, 1, 3]).reshape(2, 3, 1) - error = y_pred - y_true - expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2) - sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1)) - - y_pred = tf.constant(y_pred, dtype=tf.float32) - y_true = tf.constant(y_true) - loss = logcosh_obj( - y_true, y_pred, sample_weight=tf.constant(sample_weight, shape=(2, 3))) - expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_zero_weighted(self): - self.setup() - logcosh_obj = losses.LogCosh() - sample_weight = 0 - loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0., 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def setup(self): + y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3)) + y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) + + self.batch_size = 6 + error = y_pred - y_true + self.expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2) + + self.y_pred = tf.constant(y_pred, dtype=tf.float32) + self.y_true = tf.constant(y_true) + + def test_config(self): + logcosh_obj = losses.LogCosh( + reduction=losses_utils.ReductionV2.SUM, name="logcosh_loss" + ) + self.assertEqual(logcosh_obj.name, "logcosh_loss") + self.assertEqual(logcosh_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + self.setup() + logcosh_obj = losses.LogCosh() + + loss = logcosh_obj(self.y_true, self.y_pred) + expected_loss = np.sum(self.expected_losses) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_scalar_weighted(self): + self.setup() + logcosh_obj = losses.LogCosh() + sample_weight = 2.3 + + loss = logcosh_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + expected_loss = ( + sample_weight * np.sum(self.expected_losses) / self.batch_size + ) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + # Verify we get the same output when the same input is given + loss_2 = logcosh_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) + + def test_sample_weighted(self): + self.setup() + logcosh_obj = losses.LogCosh() + + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = logcosh_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + + expected_loss = np.multiply( + self.expected_losses, + np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)), + ) + expected_loss = np.sum(expected_loss) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_timestep_weighted(self): + self.setup() + logcosh_obj = losses.LogCosh() + y_true = np.asarray([1, 9, 2, -5, -2, 6]).reshape(2, 3, 1) + y_pred = np.asarray([4, 8, 12, 8, 1, 3]).reshape(2, 3, 1) + error = y_pred - y_true + expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2) + sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1)) + + y_pred = tf.constant(y_pred, dtype=tf.float32) + y_true = tf.constant(y_true) + loss = 
logcosh_obj( + y_true, + y_pred, + sample_weight=tf.constant(sample_weight, shape=(2, 3)), + ) + expected_loss = ( + np.sum(expected_losses * sample_weight) / self.batch_size + ) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_zero_weighted(self): + self.setup() + logcosh_obj = losses.LogCosh() + sample_weight = 0 + loss = logcosh_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class PoissonTest(tf.test.TestCase): - - def setup(self): - self.np_y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3)) - self.np_y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) - - self.batch_size = 6 - self.expected_losses = self.np_y_pred - np.multiply(self.np_y_true, - np.log(self.np_y_pred)) - - self.y_pred = tf.constant(self.np_y_pred, dtype=tf.float32) - self.y_true = tf.constant(self.np_y_true) - - def test_config(self): - poisson_obj = losses.Poisson( - reduction=losses_utils.ReductionV2.SUM, name='poisson') - self.assertEqual(poisson_obj.name, 'poisson') - self.assertEqual(poisson_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - self.setup() - poisson_obj = losses.Poisson() - - loss = poisson_obj(self.y_true, self.y_pred) - expected_loss = np.sum(self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_scalar_weighted(self): - self.setup() - poisson_obj = losses.Poisson() - sample_weight = 2.3 - loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - - expected_loss = sample_weight * np.sum( - self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - # Verify we get the same output when the same input is given - loss_2 = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) - - def test_sample_weighted(self): - self.setup() - poisson_obj = losses.Poisson() - - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - - expected_loss = np.multiply( - self.expected_losses, - np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))) - expected_loss = np.sum(expected_loss) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_timestep_weighted(self): - self.setup() - poisson_obj = losses.Poisson() - y_true = self.np_y_true.reshape(2, 3, 1) - y_pred = self.np_y_pred.reshape(2, 3, 1) - sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3, 1) - expected_losses = y_pred - np.multiply(y_true, np.log(y_pred)) - - y_pred = tf.constant(y_pred, dtype=tf.float32) - y_true = tf.constant(y_true) - - loss = poisson_obj( - y_true, y_pred, sample_weight=tf.constant(sample_weight, shape=(2, 3))) - expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_zero_weighted(self): - self.setup() - poisson_obj = losses.Poisson() - loss = poisson_obj(self.y_true, self.y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0., 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def setup(self): + self.np_y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3)) + 
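The expected_losses expression in LogCoshTest.setup is log(cosh(error)) written out in exponential form. A quick sanity sketch (NumPy, fixture values as above, illustrative only) confirming the two forms agree and showing the unweighted reduction:

```python
import numpy as np

y_pred = np.array([1, 9, 2, -5, -2, 6], dtype=float).reshape(2, 3)
y_true = np.array([4, 8, 12, 8, 1, 3], dtype=float).reshape(2, 3)

error = y_pred - y_true
losses_a = np.log((np.exp(error) + np.exp(-error)) / 2)  # as in setup()
losses_b = np.log(np.cosh(error))                        # equivalent form
assert np.allclose(losses_a, losses_b)

print(losses_a.sum() / error.size)  # unweighted reduction, batch_size = 6
```

Both closed forms overflow for large |error|; production implementations typically avoid this with the numerically stable rewrite error + softplus(-2 * error) - log(2).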
self.np_y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) + + self.batch_size = 6 + self.expected_losses = self.np_y_pred - np.multiply( + self.np_y_true, np.log(self.np_y_pred) + ) + + self.y_pred = tf.constant(self.np_y_pred, dtype=tf.float32) + self.y_true = tf.constant(self.np_y_true) + + def test_config(self): + poisson_obj = losses.Poisson( + reduction=losses_utils.ReductionV2.SUM, name="poisson" + ) + self.assertEqual(poisson_obj.name, "poisson") + self.assertEqual(poisson_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + self.setup() + poisson_obj = losses.Poisson() + + loss = poisson_obj(self.y_true, self.y_pred) + expected_loss = np.sum(self.expected_losses) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_scalar_weighted(self): + self.setup() + poisson_obj = losses.Poisson() + sample_weight = 2.3 + loss = poisson_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + + expected_loss = ( + sample_weight * np.sum(self.expected_losses) / self.batch_size + ) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + # Verify we get the same output when the same input is given + loss_2 = poisson_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) + + def test_sample_weighted(self): + self.setup() + poisson_obj = losses.Poisson() + + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = poisson_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + + expected_loss = np.multiply( + self.expected_losses, + np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)), + ) + expected_loss = np.sum(expected_loss) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_timestep_weighted(self): + self.setup() + poisson_obj = losses.Poisson() + y_true = self.np_y_true.reshape(2, 3, 1) + y_pred = self.np_y_pred.reshape(2, 3, 1) + sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3, 1) + expected_losses = y_pred - np.multiply(y_true, np.log(y_pred)) + + y_pred = tf.constant(y_pred, dtype=tf.float32) + y_true = tf.constant(y_true) + + loss = poisson_obj( + y_true, + y_pred, + sample_weight=tf.constant(sample_weight, shape=(2, 3)), + ) + expected_loss = ( + np.sum(expected_losses * sample_weight) / self.batch_size + ) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_zero_weighted(self): + self.setup() + poisson_obj = losses.Poisson() + loss = poisson_obj(self.y_true, self.y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class KLDivergenceTest(tf.test.TestCase): - - def setup(self): - self.np_y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3)) - self.np_y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3)) - - self.batch_size = 2 - self.expected_losses = np.multiply(self.np_y_true, - np.log(self.np_y_true / self.np_y_pred)) - - self.y_pred = tf.constant(self.np_y_pred, dtype=tf.float32) - self.y_true = tf.constant(self.np_y_true) - - def test_config(self): - k_obj = losses.KLDivergence( - reduction=losses_utils.ReductionV2.SUM, name='kld') - self.assertEqual(k_obj.name, 'kld') - self.assertEqual(k_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_unweighted(self): - self.setup() - k_obj = losses.KLDivergence() - - 
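PoissonTest reduces the elementwise y_pred - y_true * log(y_pred) terms by the total element count (batch_size = 6 here), so a scalar sample weight scales the result linearly. Sketch (NumPy, fixture values from setup(), illustrative only):

```python
import numpy as np

y_pred = np.array([1, 9, 2, 5, 2, 6], dtype=float).reshape(2, 3)
y_true = np.array([4, 8, 12, 8, 1, 3], dtype=float).reshape(2, 3)

losses = y_pred - y_true * np.log(y_pred)
print(losses.sum() / losses.size)        # unweighted reduction
print(2.3 * losses.sum() / losses.size)  # scalar sample_weight = 2.3
```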
loss = k_obj(self.y_true, self.y_pred) - expected_loss = np.sum(self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_scalar_weighted(self): - self.setup() - k_obj = losses.KLDivergence() - sample_weight = 2.3 - - loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - expected_loss = sample_weight * np.sum( - self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - # Verify we get the same output when the same input is given - loss_2 = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) - - def test_sample_weighted(self): - self.setup() - k_obj = losses.KLDivergence() - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - - expected_loss = np.multiply( - self.expected_losses, - np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape(2, 3)) - expected_loss = np.sum(expected_loss) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_timestep_weighted(self): - self.setup() - k_obj = losses.KLDivergence() - y_true = self.np_y_true.reshape(2, 3, 1) - y_pred = self.np_y_pred.reshape(2, 3, 1) - sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3) - expected_losses = np.sum( - np.multiply(y_true, np.log(y_true / y_pred)), axis=-1) - - y_pred = tf.constant(y_pred, dtype=tf.float32) - y_true = tf.constant(y_true) - loss = k_obj(y_true, y_pred, sample_weight=tf.constant(sample_weight)) - - num_timesteps = 3 - expected_loss = np.sum(expected_losses * sample_weight) / ( - self.batch_size * num_timesteps) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_zero_weighted(self): - self.setup() - k_obj = losses.KLDivergence() - loss = k_obj(self.y_true, self.y_pred, sample_weight=0) - self.assertAlmostEqual(self.evaluate(loss), 0., 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + def setup(self): + self.np_y_pred = np.asarray([0.4, 0.9, 0.12, 0.36, 0.3, 0.4]).reshape( + (2, 3) + ) + self.np_y_true = np.asarray([0.5, 0.8, 0.12, 0.7, 0.43, 0.8]).reshape( + (2, 3) + ) + + self.batch_size = 2 + self.expected_losses = np.multiply( + self.np_y_true, np.log(self.np_y_true / self.np_y_pred) + ) + + self.y_pred = tf.constant(self.np_y_pred, dtype=tf.float32) + self.y_true = tf.constant(self.np_y_true) + + def test_config(self): + k_obj = losses.KLDivergence( + reduction=losses_utils.ReductionV2.SUM, name="kld" + ) + self.assertEqual(k_obj.name, "kld") + self.assertEqual(k_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_unweighted(self): + self.setup() + k_obj = losses.KLDivergence() + + loss = k_obj(self.y_true, self.y_pred) + expected_loss = np.sum(self.expected_losses) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_scalar_weighted(self): + self.setup() + k_obj = losses.KLDivergence() + sample_weight = 2.3 + + loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + expected_loss = ( + sample_weight * np.sum(self.expected_losses) / self.batch_size + ) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + # Verify we get the same output when the same input is given + loss_2 = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) + + def test_sample_weighted(self): + 
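Note that KLDivergenceTest uses batch_size = 2, not 6: the per-element y_true * log(y_true / y_pred) terms are summed over the class axis first and then averaged over the two samples. An equivalent NumPy sketch (illustrative, not part of the patch):

```python
import numpy as np

y_pred = np.array([0.4, 0.9, 0.12, 0.36, 0.3, 0.4]).reshape(2, 3)
y_true = np.array([0.5, 0.8, 0.12, 0.7, 0.43, 0.8]).reshape(2, 3)

kl_terms = y_true * np.log(y_true / y_pred)
per_sample = kl_terms.sum(axis=-1)  # sum over the class axis
print(per_sample.mean())            # == kl_terms.sum() / 2, as in the test
```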
self.setup() + k_obj = losses.KLDivergence() + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + + expected_loss = np.multiply( + self.expected_losses, + np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape(2, 3), + ) + expected_loss = np.sum(expected_loss) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_timestep_weighted(self): + self.setup() + k_obj = losses.KLDivergence() + y_true = self.np_y_true.reshape(2, 3, 1) + y_pred = self.np_y_pred.reshape(2, 3, 1) + sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3) + expected_losses = np.sum( + np.multiply(y_true, np.log(y_true / y_pred)), axis=-1 + ) + + y_pred = tf.constant(y_pred, dtype=tf.float32) + y_true = tf.constant(y_true) + loss = k_obj(y_true, y_pred, sample_weight=tf.constant(sample_weight)) + + num_timesteps = 3 + expected_loss = np.sum(expected_losses * sample_weight) / ( + self.batch_size * num_timesteps + ) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_zero_weighted(self): + self.setup() + k_obj = losses.KLDivergence() + loss = k_obj(self.y_true, self.y_pred, sample_weight=0) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class HuberLossTest(tf.test.TestCase): - - def huber_loss(self, y_true, y_pred, delta=1.0): - error = y_pred - y_true - abs_error = np.abs(error) - - quadratic = np.minimum(abs_error, delta) - linear = np.subtract(abs_error, quadratic) - return np.add( - np.multiply(0.5, np.multiply(quadratic, quadratic)), - np.multiply(delta, linear)) - - def setup(self, delta=1.0): - self.np_y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3)) - self.np_y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3)) - - self.batch_size = 6 - self.expected_losses = self.huber_loss(self.np_y_true, self.np_y_pred, - delta) - - self.y_pred = tf.constant(self.np_y_pred) - self.y_true = tf.constant(self.np_y_true) - - def test_config(self): - h_obj = losses.Huber(reduction=losses_utils.ReductionV2.SUM, name='huber') - self.assertEqual(h_obj.name, 'huber') - self.assertEqual(h_obj.reduction, losses_utils.ReductionV2.SUM) - - def test_all_correct(self): - self.setup() - h_obj = losses.Huber() - loss = h_obj(self.y_true, self.y_true) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - - def test_unweighted(self): - self.setup() - h_obj = losses.Huber() - loss = h_obj(self.y_true, self.y_pred) - actual_loss = np.sum(self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) - - def test_scalar_weighted(self): - self.setup() - h_obj = losses.Huber() - sample_weight = 2.3 - loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) - - # Verify we get the same output when the same input is given - loss_2 = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) - - def test_sample_weighted(self): - self.setup() - h_obj = losses.Huber() - sample_weight = tf.constant((1.2, 3.4), shape=(2, 1)) - - loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - actual_loss = np.multiply( - self.expected_losses, - np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))) - actual_loss = np.sum(actual_loss) / 
self.batch_size - self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) - - def test_timestep_weighted(self): - self.setup() - h_obj = losses.Huber() - y_pred = self.np_y_pred.reshape((2, 3, 1)) - y_true = self.np_y_true.reshape((2, 3, 1)) - expected_losses = self.huber_loss(y_true, y_pred) - - y_pred = tf.constant(y_pred) - y_true = tf.constant(y_true) - sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1)) - loss = h_obj( - y_true, y_pred, sample_weight=tf.constant(sample_weight, shape=(2, 3))) - actual_loss = np.multiply(expected_losses, sample_weight) - actual_loss = np.sum(actual_loss) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) - - def test_zero_weighted(self): - self.setup() - h_obj = losses.Huber() - sample_weight = 0 - loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - self.assertAlmostEqual(self.evaluate(loss), 0., 3) - - def test_non_default_delta(self): - self.setup(delta=0.8) - h_obj = losses.Huber(delta=0.8) - sample_weight = 2.3 - loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size - self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) - - def test_loss_with_non_default_dtype(self): - # Test case for GitHub issue: - # https://github.com/tensorflow/tensorflow/issues/39004 - self.setup() - h_obj = losses.Huber() - try: - backend.set_floatx('float64') - loss = h_obj(self.y_true, self.y_true) - self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - finally: - backend.set_floatx('float32') + def huber_loss(self, y_true, y_pred, delta=1.0): + error = y_pred - y_true + abs_error = np.abs(error) + + quadratic = np.minimum(abs_error, delta) + linear = np.subtract(abs_error, quadratic) + return np.add( + np.multiply(0.5, np.multiply(quadratic, quadratic)), + np.multiply(delta, linear), + ) + + def setup(self, delta=1.0): + self.np_y_pred = np.asarray([0.9, 0.2, 0.2, 0.8, 0.4, 0.6]).reshape( + (2, 3) + ) + self.np_y_true = np.asarray([1.0, 0.0, 1.0, 1.0, 0.0, 0.0]).reshape( + (2, 3) + ) + + self.batch_size = 6 + self.expected_losses = self.huber_loss( + self.np_y_true, self.np_y_pred, delta + ) + + self.y_pred = tf.constant(self.np_y_pred) + self.y_true = tf.constant(self.np_y_true) + + def test_config(self): + h_obj = losses.Huber( + reduction=losses_utils.ReductionV2.SUM, name="huber" + ) + self.assertEqual(h_obj.name, "huber") + self.assertEqual(h_obj.reduction, losses_utils.ReductionV2.SUM) + + def test_all_correct(self): + self.setup() + h_obj = losses.Huber() + loss = h_obj(self.y_true, self.y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_unweighted(self): + self.setup() + h_obj = losses.Huber() + loss = h_obj(self.y_true, self.y_pred) + actual_loss = np.sum(self.expected_losses) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) + + def test_scalar_weighted(self): + self.setup() + h_obj = losses.Huber() + sample_weight = 2.3 + loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + actual_loss = ( + sample_weight * np.sum(self.expected_losses) / self.batch_size + ) + self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) + + # Verify we get the same output when the same input is given + loss_2 = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3) + + def test_sample_weighted(self): + self.setup() + h_obj = losses.Huber() + sample_weight = tf.constant((1.2, 
3.4), shape=(2, 1)) + + loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + actual_loss = np.multiply( + self.expected_losses, + np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)), + ) + actual_loss = np.sum(actual_loss) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) + + def test_timestep_weighted(self): + self.setup() + h_obj = losses.Huber() + y_pred = self.np_y_pred.reshape((2, 3, 1)) + y_true = self.np_y_true.reshape((2, 3, 1)) + expected_losses = self.huber_loss(y_true, y_pred) + + y_pred = tf.constant(y_pred) + y_true = tf.constant(y_true) + sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1)) + loss = h_obj( + y_true, + y_pred, + sample_weight=tf.constant(sample_weight, shape=(2, 3)), + ) + actual_loss = np.multiply(expected_losses, sample_weight) + actual_loss = np.sum(actual_loss) / self.batch_size + self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) + + def test_zero_weighted(self): + self.setup() + h_obj = losses.Huber() + sample_weight = 0 + loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + + def test_non_default_delta(self): + self.setup(delta=0.8) + h_obj = losses.Huber(delta=0.8) + sample_weight = 2.3 + loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + actual_loss = ( + sample_weight * np.sum(self.expected_losses) / self.batch_size + ) + self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3) + + def test_loss_with_non_default_dtype(self): + # Test case for GitHub issue: + # https://github.com/tensorflow/tensorflow/issues/39004 + self.setup() + h_obj = losses.Huber() + try: + backend.set_floatx("float64") + loss = h_obj(self.y_true, self.y_true) + self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) + finally: + backend.set_floatx("float32") class BinaryTruePositivesViaControlFlow(losses.Loss): + def __init__(self, reduction=losses_utils.ReductionV2.AUTO): + super().__init__(reduction=reduction) - def __init__(self, reduction=losses_utils.ReductionV2.AUTO): - super().__init__(reduction=reduction) - - def call(self, y_true, y_pred): - y_true = tf.cast(y_true, tf.bool) - y_pred = tf.cast(y_pred, tf.bool) + def call(self, y_true, y_pred): + y_true = tf.cast(y_true, tf.bool) + y_pred = tf.cast(y_pred, tf.bool) - result = tf.constant(0.0) - for i in range(len(y_true)): - for j in range(len(y_true[i])): - if y_true[i][j] and y_pred[i][j]: - result = result + 1 - return result + result = tf.constant(0.0) + for i in range(len(y_true)): + for j in range(len(y_true[i])): + if y_true[i][j] and y_pred[i][j]: + result = result + 1 + return result -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CustomLossTest(tf.test.TestCase): - - def test_autograph(self): - y_true = tf.constant([[0, 0.9, 0, 1, 0], [0, 0, 1, 1, 1], [1, 1, 1, 1, 0], - [0, 0, 0, 0, 1.5]]) - y_pred = tf.constant([[0, 0, 1, 5, 0], [1, 1, 1, 1, 1], [0, 1, 0, 1, 0], - [1, 10, 1, 1, 1]]) - - @tf.function - def loss_fn(y_true, y_pred): - loss_obj = BinaryTruePositivesViaControlFlow() - return loss_obj(y_true, y_pred) - - loss = loss_fn(y_true, y_pred) - self.assertAllEqual( - self.evaluate(loss), - 7.0, - ) - - -if __name__ == '__main__': - tf.test.main() + def test_autograph(self): + y_true = tf.constant( + [ + [0, 0.9, 0, 1, 0], + [0, 0, 1, 1, 1], + [1, 1, 1, 1, 0], + [0, 0, 0, 0, 1.5], + ] + ) + y_pred = tf.constant( + [ + [0, 0, 1, 5, 
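The huber_loss helper in HuberLossTest is written in a quadratic-plus-linear form; it is algebraically identical to the usual piecewise definition of the Huber loss. The sketch below checks that equivalence (plain NumPy, illustrative, not part of the patch):

```python
import numpy as np

def huber_quadratic_linear(error, delta=1.0):
    # Form used by the test helper above.
    abs_error = np.abs(error)
    quadratic = np.minimum(abs_error, delta)
    linear = abs_error - quadratic
    return 0.5 * quadratic**2 + delta * linear

def huber_piecewise(error, delta=1.0):
    # Textbook piecewise form.
    abs_error = np.abs(error)
    return np.where(
        abs_error <= delta,
        0.5 * error**2,                      # quadratic near zero
        delta * abs_error - 0.5 * delta**2,  # linear in the tails
    )

error = np.linspace(-3.0, 3.0, 13)
assert np.allclose(huber_quadratic_linear(error), huber_piecewise(error))
assert np.allclose(
    huber_quadratic_linear(error, 0.8), huber_piecewise(error, 0.8)
)
```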
0], + [1, 1, 1, 1, 1], + [0, 1, 0, 1, 0], + [1, 10, 1, 1, 1], + ] + ) + + @tf.function + def loss_fn(y_true, y_pred): + loss_obj = BinaryTruePositivesViaControlFlow() + return loss_obj(y_true, y_pred) + + loss = loss_fn(y_true, y_pred) + self.assertAllEqual( + self.evaluate(loss), + 7.0, + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/BUILD b/keras/metrics/BUILD index e8d9911016da..6d259d9c8b23 100644 --- a/keras/metrics/BUILD +++ b/keras/metrics/BUILD @@ -16,9 +16,12 @@ # Description: # Contains the Keras metrics submodule. -load("@org_keras//keras:keras.bzl", "tf_py_test") +# Placeholder: load unaliased py_library +load("@org_keras//keras:keras.bzl", "cuda_py_test") +load("@org_keras//keras:keras.bzl", "tf_py_test") # buildifier: disable=same-origin-load package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/python/feature_column:__subpackages__", @@ -32,8 +35,15 @@ py_library( name = "metrics", srcs = [ "__init__.py", + "accuracy_metrics.py", "base_metric.py", - "metrics.py", + "confusion_metrics.py", + "f_score_metrics.py", + "hinge_metrics.py", + "iou_metrics.py", + "probabilistic_metrics.py", + "py_metric.py", + "regression_metrics.py", ], srcs_version = "PY3", deps = [ @@ -67,9 +77,9 @@ tf_py_test( ) tf_py_test( - name = "metrics_test", + name = "accuracy_metrics_test", size = "medium", - srcs = ["metrics_test.py"], + srcs = ["accuracy_metrics_test.py"], python_version = "PY3", shard_count = 4, deps = [ @@ -84,38 +94,122 @@ tf_py_test( ) tf_py_test( - name = "base_metric_test", + name = "confusion_metrics_test", size = "medium", - srcs = ["base_metric_test.py"], + srcs = ["confusion_metrics_test.py"], python_version = "PY3", shard_count = 4, deps = [ ":metrics", + "//:expect_absl_installed", "//:expect_numpy_installed", + "//:expect_scipy_installed", "//:expect_tensorflow_installed", "//keras", "//keras/layers", + "//keras/models", "//keras/testing_infra:test_combinations", "//keras/testing_infra:test_utils", + "//keras/utils:metrics_utils", ], ) tf_py_test( - name = "confusion_matrix_test", + name = "f_score_metrics_test", size = "medium", - srcs = ["confusion_matrix_test.py"], + srcs = ["f_score_metrics_test.py"], python_version = "PY3", shard_count = 4, deps = [ ":metrics", "//:expect_absl_installed", "//:expect_numpy_installed", - "//:expect_scipy_installed", "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) + +tf_py_test( + name = "hinge_metrics_test", + size = "medium", + srcs = ["hinge_metrics_test.py"], + python_version = "PY3", + shard_count = 4, + deps = [ + ":metrics", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", "//keras/layers", - "//keras/models", "//keras/testing_infra:test_combinations", - "//keras/utils:metrics_utils", + "//keras/testing_infra:test_utils", + ], +) + +tf_py_test( + name = "iou_metrics_test", + size = "medium", + srcs = ["iou_metrics_test.py"], + python_version = "PY3", + shard_count = 4, + deps = [ + ":metrics", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/layers", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) + +tf_py_test( + name = "probabilistic_metrics_test", + size = "medium", + srcs = ["probabilistic_metrics_test.py"], + python_version = "PY3", + shard_count = 4, + deps = [ + ":metrics", + "//:expect_numpy_installed", + 
"//:expect_tensorflow_installed", + "//keras", + "//keras/layers", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) + +tf_py_test( + name = "regression_metrics_test", + size = "medium", + srcs = ["regression_metrics_test.py"], + python_version = "PY3", + shard_count = 4, + deps = [ + ":metrics", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/testing_infra:test_combinations", + ], +) + +tf_py_test( + name = "base_metric_test", + size = "medium", + srcs = ["base_metric_test.py"], + python_version = "PY3", + shard_count = 4, + deps = [ + ":metrics", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/layers", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", ], ) @@ -133,3 +227,21 @@ tf_py_test( "//keras/testing_infra:test_combinations", ], ) + +cuda_py_test( + name = "py_metric_test", + size = "medium", + srcs = ["py_metric_test.py"], + shard_count = 2, + tags = [ + "no_windows", + ], + deps = [ + ":metrics", + "//:expect_tensorflow_installed", + "//keras", + "//keras/layers", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) diff --git a/keras/metrics/__init__.py b/keras/metrics/__init__.py index f9581f89038d..9cc4c770ad51 100644 --- a/keras/metrics/__init__.py +++ b/keras/metrics/__init__.py @@ -13,92 +13,111 @@ # limitations under the License. # ============================================================================== """All Keras metrics.""" -# pylint: disable=g-bad-import-order -from keras.utils.generic_utils import deserialize_keras_object -from keras.utils.generic_utils import serialize_keras_object +# isort: off +import warnings from tensorflow.python.util.tf_export import keras_export -# Base classes -from keras.metrics.base_metric import Metric -from keras.metrics.base_metric import Reduce -from keras.metrics.base_metric import Sum +# Base classes and utilities from keras.metrics.base_metric import Mean from keras.metrics.base_metric import MeanMetricWrapper from keras.metrics.base_metric import MeanTensor +from keras.metrics.base_metric import Metric +from keras.metrics.base_metric import Reduce +from keras.metrics.base_metric import Sum from keras.metrics.base_metric import SumOverBatchSize from keras.metrics.base_metric import SumOverBatchSizeMetricWrapper - -# Individual metric classes -from keras.metrics.metrics import MeanRelativeError -from keras.metrics.metrics import Accuracy -from keras.metrics.metrics import BinaryAccuracy -from keras.metrics.metrics import CategoricalAccuracy -from keras.metrics.metrics import SparseCategoricalAccuracy -from keras.metrics.metrics import TopKCategoricalAccuracy -from keras.metrics.metrics import SparseTopKCategoricalAccuracy -from keras.metrics.metrics import FalsePositives -from keras.metrics.metrics import FalseNegatives -from keras.metrics.metrics import TrueNegatives -from keras.metrics.metrics import TruePositives -from keras.metrics.metrics import Precision -from keras.metrics.metrics import Recall -from keras.metrics.metrics import SensitivityAtSpecificity -from keras.metrics.metrics import SpecificityAtSensitivity -from keras.metrics.metrics import PrecisionAtRecall -from keras.metrics.metrics import RecallAtPrecision -from keras.metrics.metrics import AUC -from keras.metrics.metrics import CosineSimilarity -from keras.metrics.metrics import MeanAbsoluteError -from keras.metrics.metrics import MeanAbsolutePercentageError -from 
keras.metrics.metrics import MeanSquaredError -from keras.metrics.metrics import MeanSquaredLogarithmicError -from keras.metrics.metrics import Hinge -from keras.metrics.metrics import SquaredHinge -from keras.metrics.metrics import CategoricalHinge -from keras.metrics.metrics import RootMeanSquaredError -from keras.metrics.metrics import LogCoshError -from keras.metrics.metrics import Poisson -from keras.metrics.metrics import KLDivergence -from keras.metrics.metrics import IoU -from keras.metrics.metrics import BinaryIoU -from keras.metrics.metrics import MeanIoU -from keras.metrics.metrics import OneHotIoU -from keras.metrics.metrics import OneHotMeanIoU -from keras.metrics.metrics import BinaryCrossentropy -from keras.metrics.metrics import CategoricalCrossentropy -from keras.metrics.metrics import SparseCategoricalCrossentropy - -from keras.metrics.metrics import _IoUBase -from keras.metrics.metrics import _ConfusionMatrixConditionCount -from keras.metrics.metrics import SensitivitySpecificityBase - -# Metric functions -from keras.metrics.metrics import accuracy -from keras.metrics.metrics import binary_accuracy -from keras.metrics.metrics import categorical_accuracy -from keras.metrics.metrics import sparse_categorical_accuracy -from keras.metrics.metrics import top_k_categorical_accuracy -from keras.metrics.metrics import sparse_top_k_categorical_accuracy -from keras.metrics.metrics import cosine_similarity -from keras.metrics.metrics import binary_crossentropy -from keras.metrics.metrics import categorical_crossentropy -from keras.metrics.metrics import categorical_hinge -from keras.metrics.metrics import hinge -from keras.metrics.metrics import squared_hinge -from keras.metrics.metrics import kullback_leibler_divergence -from keras.metrics.metrics import logcosh -from keras.metrics.metrics import mean_absolute_error -from keras.metrics.metrics import mean_absolute_percentage_error -from keras.metrics.metrics import mean_squared_error -from keras.metrics.metrics import mean_squared_logarithmic_error -from keras.metrics.metrics import poisson -from keras.metrics.metrics import sparse_categorical_crossentropy - -# Utilities from keras.metrics.base_metric import clone_metric from keras.metrics.base_metric import clone_metrics +from keras.saving.legacy import serialization as legacy_serialization +from keras.saving.serialization_lib import deserialize_keras_object +from keras.saving.serialization_lib import serialize_keras_object + +from keras.metrics.py_metric import PyMetric + +# Individual metric classes + +# Accuracy metrics +from keras.metrics.accuracy_metrics import Accuracy +from keras.metrics.accuracy_metrics import BinaryAccuracy +from keras.metrics.accuracy_metrics import CategoricalAccuracy +from keras.metrics.accuracy_metrics import SparseCategoricalAccuracy +from keras.metrics.accuracy_metrics import SparseTopKCategoricalAccuracy +from keras.metrics.accuracy_metrics import TopKCategoricalAccuracy + +from keras.metrics.accuracy_metrics import accuracy +from keras.metrics.accuracy_metrics import binary_accuracy +from keras.metrics.accuracy_metrics import categorical_accuracy +from keras.metrics.accuracy_metrics import sparse_categorical_accuracy +from keras.metrics.accuracy_metrics import sparse_top_k_categorical_accuracy +from keras.metrics.accuracy_metrics import top_k_categorical_accuracy + +# Probabilistic metrics +from keras.metrics.probabilistic_metrics import BinaryCrossentropy +from keras.metrics.probabilistic_metrics import CategoricalCrossentropy +from 
keras.metrics.probabilistic_metrics import KLDivergence +from keras.metrics.probabilistic_metrics import Poisson +from keras.metrics.probabilistic_metrics import SparseCategoricalCrossentropy + +from keras.metrics.probabilistic_metrics import binary_crossentropy +from keras.metrics.probabilistic_metrics import categorical_crossentropy +from keras.metrics.probabilistic_metrics import poisson +from keras.metrics.probabilistic_metrics import kullback_leibler_divergence +from keras.metrics.probabilistic_metrics import sparse_categorical_crossentropy + +# Regression metrics +from keras.metrics.regression_metrics import CosineSimilarity +from keras.metrics.regression_metrics import LogCoshError +from keras.metrics.regression_metrics import MeanAbsoluteError +from keras.metrics.regression_metrics import MeanAbsolutePercentageError +from keras.metrics.regression_metrics import MeanRelativeError +from keras.metrics.regression_metrics import MeanSquaredError +from keras.metrics.regression_metrics import MeanSquaredLogarithmicError +from keras.metrics.regression_metrics import RootMeanSquaredError +from keras.metrics.regression_metrics import R2Score + +from keras.metrics.regression_metrics import cosine_similarity +from keras.metrics.regression_metrics import logcosh +from keras.metrics.regression_metrics import mean_absolute_error +from keras.metrics.regression_metrics import mean_absolute_percentage_error +from keras.metrics.regression_metrics import mean_squared_error +from keras.metrics.regression_metrics import mean_squared_logarithmic_error + +# Confusion metrics +from keras.metrics.confusion_metrics import AUC +from keras.metrics.confusion_metrics import FalseNegatives +from keras.metrics.confusion_metrics import FalsePositives +from keras.metrics.confusion_metrics import Precision +from keras.metrics.confusion_metrics import PrecisionAtRecall +from keras.metrics.confusion_metrics import Recall +from keras.metrics.confusion_metrics import RecallAtPrecision +from keras.metrics.confusion_metrics import SensitivityAtSpecificity +from keras.metrics.confusion_metrics import SensitivitySpecificityBase +from keras.metrics.confusion_metrics import SpecificityAtSensitivity +from keras.metrics.confusion_metrics import TrueNegatives +from keras.metrics.confusion_metrics import TruePositives + +# F-Scores +from keras.metrics.f_score_metrics import FBetaScore +from keras.metrics.f_score_metrics import F1Score + +# IoU metrics +from keras.metrics.iou_metrics import BinaryIoU +from keras.metrics.iou_metrics import IoU +from keras.metrics.iou_metrics import MeanIoU +from keras.metrics.iou_metrics import OneHotIoU +from keras.metrics.iou_metrics import OneHotMeanIoU + +# Hinge metrics +from keras.metrics.hinge_metrics import CategoricalHinge +from keras.metrics.hinge_metrics import Hinge +from keras.metrics.hinge_metrics import SquaredHinge + +from keras.metrics.hinge_metrics import categorical_hinge +from keras.metrics.hinge_metrics import squared_hinge +from keras.metrics.hinge_metrics import hinge + # Aliases acc = ACC = accuracy bce = BCE = binary_crossentropy @@ -110,78 +129,96 @@ cosine_proximity = cosine_similarity -@keras_export('keras.metrics.serialize') -def serialize(metric): - """Serializes metric function or `Metric` instance. - - Args: - metric: A Keras `Metric` instance or a metric function. - - Returns: - Metric configuration dictionary. 
- """ - return serialize_keras_object(metric) - - -@keras_export('keras.metrics.deserialize') -def deserialize(config, custom_objects=None): - """Deserializes a serialized metric class/function instance. - - Args: - config: Metric configuration. - custom_objects: Optional dictionary mapping names (strings) to custom - objects (classes and functions) to be considered during deserialization. - - Returns: - A Keras `Metric` instance or a metric function. - """ - return deserialize_keras_object( - config, - module_objects=globals(), - custom_objects=custom_objects, - printable_module_name='metric function') - - -@keras_export('keras.metrics.get') +@keras_export("keras.metrics.serialize") +def serialize(metric, use_legacy_format=False): + """Serializes metric function or `Metric` instance. + + Args: + metric: A Keras `Metric` instance or a metric function. + + Returns: + Metric configuration dictionary. + """ + if metric is None: + return None + if not isinstance(metric, Metric): + warnings.warn( + "The `keras.metrics.serialize()` API should only be used for " + "objects of type `keras.metrics.Metric`. Found an instance of " + f"type {type(metric)}, which may lead to improper serialization." + ) + if use_legacy_format: + return legacy_serialization.serialize_keras_object(metric) + return serialize_keras_object(metric) + + +@keras_export("keras.metrics.deserialize") +def deserialize(config, custom_objects=None, use_legacy_format=False): + """Deserializes a serialized metric class/function instance. + + Args: + config: Metric configuration. + custom_objects: Optional dictionary mapping names (strings) to custom + objects (classes and functions) to be considered during deserialization. + + Returns: + A Keras `Metric` instance or a metric function. + """ + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="metric function", + ) + return deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="metric function", + ) + + +@keras_export("keras.metrics.get") def get(identifier): - """Retrieves a Keras metric as a `function`/`Metric` class instance. - - The `identifier` may be the string name of a metric function or class. - - >>> metric = tf.keras.metrics.get("categorical_crossentropy") - >>> type(metric) - - >>> metric = tf.keras.metrics.get("CategoricalCrossentropy") - >>> type(metric) - - - You can also specify `config` of the metric to this function by passing dict - containing `class_name` and `config` as an identifier. Also note that the - `class_name` must map to a `Metric` class - - >>> identifier = {"class_name": "CategoricalCrossentropy", - ... "config": {"from_logits": True}} - >>> metric = tf.keras.metrics.get(identifier) - >>> type(metric) - - - Args: - identifier: A metric identifier. One of None or string name of a metric - function/class or metric configuration dictionary or a metric function or - a metric class instance - - Returns: - A Keras metric as a `function`/ `Metric` class instance. - - Raises: - ValueError: If `identifier` cannot be interpreted. - """ - if isinstance(identifier, dict): - return deserialize(identifier) - elif isinstance(identifier, str): - return deserialize(str(identifier)) - elif callable(identifier): - return identifier - else: - raise ValueError( - f'Could not interpret metric identifier: {identifier}') + """Retrieves a Keras metric as a `function`/`Metric` class instance. 
+ + The `identifier` may be the string name of a metric function or class. + + >>> metric = tf.keras.metrics.get("categorical_crossentropy") + >>> type(metric) + + >>> metric = tf.keras.metrics.get("CategoricalCrossentropy") + >>> type(metric) + + + You can also specify `config` of the metric to this function by passing dict + containing `class_name` and `config` as an identifier. Also note that the + `class_name` must map to a `Metric` class + + >>> identifier = {"class_name": "CategoricalCrossentropy", + ... "config": {"from_logits": True}} + >>> metric = tf.keras.metrics.get(identifier) + >>> type(metric) + + + Args: + identifier: A metric identifier. One of None or string name of a metric + function/class or metric configuration dictionary or a metric function + or a metric class instance + + Returns: + A Keras metric as a `function`/ `Metric` class instance. + + Raises: + ValueError: If `identifier` cannot be interpreted. + """ + if isinstance(identifier, dict): + use_legacy_format = "module" not in identifier + return deserialize(identifier, use_legacy_format=use_legacy_format) + elif isinstance(identifier, str): + return deserialize(str(identifier)) + elif callable(identifier): + return identifier + else: + raise ValueError(f"Could not interpret metric identifier: {identifier}") diff --git a/keras/metrics/accuracy_metrics.py b/keras/metrics/accuracy_metrics.py new file mode 100644 index 000000000000..98e130a8efc7 --- /dev/null +++ b/keras/metrics/accuracy_metrics.py @@ -0,0 +1,527 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Accuracy metrics.""" + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.dtensor import utils as dtensor_utils +from keras.metrics import base_metric +from keras.utils import metrics_utils + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@keras_export("keras.metrics.Accuracy") +class Accuracy(base_metric.MeanMetricWrapper): + """Calculates how often predictions equal labels. + + This metric creates two local variables, `total` and `count` that are used + to compute the frequency with which `y_pred` matches `y_true`. This + frequency is ultimately returned as `binary accuracy`: an idempotent + operation that simply divides `total` by `count`. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.Accuracy() + >>> m.update_state([[1], [2], [3], [4]], [[0], [2], [3], [4]]) + >>> m.result().numpy() + 0.75 + + >>> m.reset_state() + >>> m.update_state([[1], [2], [3], [4]], [[0], [2], [3], [4]], + ... 
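One detail of the rewritten get() above worth calling out: dict identifiers are routed to the legacy deserializer whenever they lack a "module" key, so configs produced by older serialize() calls keep resolving. A usage sketch (assumes a TF/Keras build containing this patch):

```python
import tensorflow as tf

# String identifiers resolve to a function or a Metric class by name.
fn = tf.keras.metrics.get("categorical_crossentropy")
obj = tf.keras.metrics.get("CategoricalCrossentropy")

# A dict config without a "module" key takes the legacy path shown above.
legacy = tf.keras.metrics.get(
    {"class_name": "CategoricalCrossentropy", "config": {"from_logits": True}}
)
print(type(fn), type(obj), type(legacy))
```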
sample_weight=[1, 1, 0, 0])
+    >>> m.result().numpy()
+    0.5
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(optimizer='sgd',
+                  loss='mse',
+                  metrics=[tf.keras.metrics.Accuracy()])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(self, name="accuracy", dtype=None):
+        super().__init__(accuracy, name, dtype=dtype)
+
+
+@keras_export("keras.metrics.BinaryAccuracy")
+class BinaryAccuracy(base_metric.MeanMetricWrapper):
+    """Calculates how often predictions match binary labels.
+
+    This metric creates two local variables, `total` and `count` that are used
+    to compute the frequency with which `y_pred` matches `y_true`. This
+    frequency is ultimately returned as `binary accuracy`: an idempotent
+    operation that simply divides `total` by `count`.
+
+    If `sample_weight` is `None`, weights default to 1.
+    Use `sample_weight` of 0 to mask values.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      threshold: (Optional) Float representing the threshold for deciding
+        whether prediction values are 1 or 0.
+
+    Standalone usage:
+
+    >>> m = tf.keras.metrics.BinaryAccuracy()
+    >>> m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]])
+    >>> m.result().numpy()
+    0.75
+
+    >>> m.reset_state()
+    >>> m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]],
+    ...                sample_weight=[1, 0, 0, 1])
+    >>> m.result().numpy()
+    0.5
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(optimizer='sgd',
+                  loss='mse',
+                  metrics=[tf.keras.metrics.BinaryAccuracy()])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(self, name="binary_accuracy", dtype=None, threshold=0.5):
+        super().__init__(
+            metrics_utils.binary_matches, name, dtype=dtype, threshold=threshold
+        )
+
+
+@keras_export("keras.metrics.CategoricalAccuracy")
+class CategoricalAccuracy(base_metric.MeanMetricWrapper):
+    """Calculates how often predictions match one-hot labels.
+
+    You can provide logits of classes as `y_pred`, since argmax of
+    logits and probabilities is the same.
+
+    This metric creates two local variables, `total` and `count` that are used
+    to compute the frequency with which `y_pred` matches `y_true`. This
+    frequency is ultimately returned as `categorical accuracy`: an idempotent
+    operation that simply divides `total` by `count`.
+
+    `y_pred` and `y_true` should be passed in as vectors of probabilities,
+    rather than as labels. If necessary, use `tf.one_hot` to expand `y_true`
+    as a vector.
+
+    If `sample_weight` is `None`, weights default to 1.
+    Use `sample_weight` of 0 to mask values.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+
+    Standalone usage:
+
+    >>> m = tf.keras.metrics.CategoricalAccuracy()
+    >>> m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8],
+    ...                 [0.05, 0.95, 0]])
+    >>> m.result().numpy()
+    0.5
+
+    >>> m.reset_state()
+    >>> m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8],
+    ...                 [0.05, 0.95, 0]],
+    ...
sample_weight=[0.7, 0.3])
+    >>> m.result().numpy()
+    0.3
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(
+        optimizer='sgd',
+        loss='mse',
+        metrics=[tf.keras.metrics.CategoricalAccuracy()])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(self, name="categorical_accuracy", dtype=None):
+        super().__init__(
+            lambda y_true, y_pred: metrics_utils.sparse_categorical_matches(
+                tf.math.argmax(y_true, axis=-1), y_pred
+            ),
+            name,
+            dtype=dtype,
+        )
+
+
+@keras_export("keras.metrics.SparseCategoricalAccuracy")
+class SparseCategoricalAccuracy(base_metric.MeanMetricWrapper):
+    """Calculates how often predictions match integer labels.
+
+    ```python
+    acc = np.dot(sample_weight, np.equal(y_true, np.argmax(y_pred, axis=1)))
+    ```
+
+    You can provide logits of classes as `y_pred`, since argmax of
+    logits and probabilities is the same.
+
+    This metric creates two local variables, `total` and `count` that are used
+    to compute the frequency with which `y_pred` matches `y_true`. This
+    frequency is ultimately returned as `sparse categorical accuracy`: an
+    idempotent operation that simply divides `total` by `count`.
+
+    If `sample_weight` is `None`, weights default to 1.
+    Use `sample_weight` of 0 to mask values.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+
+    Standalone usage:
+
+    >>> m = tf.keras.metrics.SparseCategoricalAccuracy()
+    >>> m.update_state([[2], [1]], [[0.1, 0.6, 0.3], [0.05, 0.95, 0]])
+    >>> m.result().numpy()
+    0.5
+
+    >>> m.reset_state()
+    >>> m.update_state([[2], [1]], [[0.1, 0.6, 0.3], [0.05, 0.95, 0]],
+    ...                sample_weight=[0.7, 0.3])
+    >>> m.result().numpy()
+    0.3
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(
+        optimizer='sgd',
+        loss='mse',
+        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(self, name="sparse_categorical_accuracy", dtype=None):
+        super().__init__(
+            metrics_utils.sparse_categorical_matches, name, dtype=dtype
+        )
+
+
+_SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING = """Accumulates metric statistics.
+
+For sparse categorical metrics, the shapes of `y_true` and `y_pred` are
+different.
+
+Args:
+  y_true: Ground truth label values. shape = `[batch_size, d0, .. dN-1]` or
+    shape = `[batch_size, d0, .. dN-1, 1]`.
+  y_pred: The predicted probability values. shape = `[batch_size, d0, .. dN]`.
+  sample_weight: Optional `sample_weight` acts as a
+    coefficient for the metric. If a scalar is provided, then the metric is
+    simply scaled by the given value. If `sample_weight` is a tensor of size
+    `[batch_size]`, then the metric for each sample of the batch is rescaled
+    by the corresponding element in the `sample_weight` vector. If the shape
+    of `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be broadcasted
+    to this shape), then each metric element of `y_pred` is scaled by the
+    corresponding value of `sample_weight`. (Note on `dN-1`: all metric
+    functions reduce by 1 dimension, usually the last axis (-1)).
+
+Returns:
+  Update op.
+"""
+
+SparseCategoricalAccuracy.update_state.__doc__ = (
+    _SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING
+)
+
+
+@keras_export("keras.metrics.TopKCategoricalAccuracy")
+class TopKCategoricalAccuracy(base_metric.MeanMetricWrapper):
+    """Computes how often targets are in the top `K` predictions.
+
+    Args:
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to `5`.
+      name: (Optional) string name of the metric instance.
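The two class docstrings above describe the same computation on two label encodings: integer labels for the sparse variant, one-hot labels for `CategoricalAccuracy`. A small sketch of that equivalence, assuming three classes and illustrative values:

```python
# Sketch: sparse vs. one-hot accuracy agree when labels are related by
# tf.one_hot. Values are illustrative.
import tensorflow as tf

y_sparse = tf.constant([2, 1])
y_onehot = tf.one_hot(y_sparse, depth=3)
y_pred = tf.constant([[0.1, 0.6, 0.3], [0.05, 0.95, 0.0]])

m1 = tf.keras.metrics.SparseCategoricalAccuracy()
m1.update_state(y_sparse, y_pred)

m2 = tf.keras.metrics.CategoricalAccuracy()
m2.update_state(y_onehot, y_pred)

assert m1.result().numpy() == m2.result().numpy()  # 0.5 for both
```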
+      dtype: (Optional) data type of the metric result.
+
+    Standalone usage:
+
+    >>> m = tf.keras.metrics.TopKCategoricalAccuracy(k=1)
+    >>> m.update_state([[0, 0, 1], [0, 1, 0]],
+    ...                [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+    >>> m.result().numpy()
+    0.5
+
+    >>> m.reset_state()
+    >>> m.update_state([[0, 0, 1], [0, 1, 0]],
+    ...                [[0.1, 0.9, 0.8], [0.05, 0.95, 0]],
+    ...                sample_weight=[0.7, 0.3])
+    >>> m.result().numpy()
+    0.3
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(optimizer='sgd',
+                  loss='mse',
+                  metrics=[tf.keras.metrics.TopKCategoricalAccuracy()])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(self, k=5, name="top_k_categorical_accuracy", dtype=None):
+        super().__init__(
+            lambda yt, yp, k: metrics_utils.sparse_top_k_categorical_matches(
+                tf.math.argmax(yt, axis=-1), yp, k
+            ),
+            name,
+            dtype=dtype,
+            k=k,
+        )
+
+
+@keras_export("keras.metrics.SparseTopKCategoricalAccuracy")
+class SparseTopKCategoricalAccuracy(base_metric.MeanMetricWrapper):
+    """Computes how often integer targets are in the top `K` predictions.
+
+    Args:
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to `5`.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+
+    Standalone usage:
+
+    >>> m = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1)
+    >>> m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+    >>> m.result().numpy()
+    0.5
+
+    >>> m.reset_state()
+    >>> m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]],
+    ...                sample_weight=[0.7, 0.3])
+    >>> m.result().numpy()
+    0.3
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(
+        optimizer='sgd',
+        loss='mse',
+        metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy()])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(
+        self, k=5, name="sparse_top_k_categorical_accuracy", dtype=None
+    ):
+        super().__init__(
+            metrics_utils.sparse_top_k_categorical_matches,
+            name,
+            dtype=dtype,
+            k=k,
+        )
+
+
+SparseTopKCategoricalAccuracy.update_state.__doc__ = (
+    _SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING
+)
+
+
+def accuracy(y_true, y_pred):
+    [
+        y_pred,
+        y_true,
+    ], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values(
+        [y_pred, y_true]
+    )
+    y_true.shape.assert_is_compatible_with(y_pred.shape)
+    if y_true.dtype != y_pred.dtype:
+        y_pred = tf.cast(y_pred, y_true.dtype)
+    return tf.cast(tf.equal(y_true, y_pred), backend.floatx())
+
+
+@keras_export("keras.metrics.binary_accuracy")
+@tf.__internal__.dispatch.add_dispatch_support
+def binary_accuracy(y_true, y_pred, threshold=0.5):
+    """Calculates how often predictions match binary labels.
+
+    Standalone usage:
+    >>> y_true = [[1], [1], [0], [0]]
+    >>> y_pred = [[1], [1], [0], [0]]
+    >>> m = tf.keras.metrics.binary_accuracy(y_true, y_pred)
+    >>> assert m.shape == (4,)
+    >>> m.numpy()
+    array([1., 1., 1., 1.], dtype=float32)
+
+    Args:
+      y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`.
+      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.
+      threshold: (Optional) Float representing the threshold for deciding
+        whether prediction values are 1 or 0.
+
+    Returns:
+      Binary accuracy values. shape = `[batch_size, d0, .. dN-1]`
+    """
+    # Note: calls metrics_utils.binary_matches with mean reduction. This
+    # maintains public facing binary_accuracy behavior and separates it from
+    # the vital behavior of the binary_matches method needed in backend
+    # dependencies.
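To make the `threshold` semantics concrete, here is a hand-computed mirror of `binary_accuracy` in NumPy. The strict `>` comparison is the assumption being illustrated; values are illustrative:

```python
# Sketch: predictions above the threshold count as class 1, then each
# sample's matches are averaged over the last axis.
import numpy as np
import tensorflow as tf

y_true = np.array([[1.0], [1.0], [0.0], [0.0]])
y_pred = np.array([[0.9], [0.6], [0.4], [0.8]])

manual = np.mean((y_pred > 0.7).astype(np.float32) == y_true, axis=-1)
keras_val = tf.keras.metrics.binary_accuracy(y_true, y_pred, threshold=0.7)

np.testing.assert_allclose(keras_val.numpy(), manual)  # [1., 0., 1., 0.]
```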
+
+    return tf.reduce_mean(
+        metrics_utils.binary_matches(y_true, y_pred, threshold), axis=-1
+    )
+
+
+@keras_export("keras.metrics.categorical_accuracy")
+@tf.__internal__.dispatch.add_dispatch_support
+def categorical_accuracy(y_true, y_pred):
+    """Calculates how often predictions match one-hot labels.
+
+    Standalone usage:
+    >>> y_true = [[0, 0, 1], [0, 1, 0]]
+    >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]
+    >>> m = tf.keras.metrics.categorical_accuracy(y_true, y_pred)
+    >>> assert m.shape == (2,)
+    >>> m.numpy()
+    array([0., 1.], dtype=float32)
+
+    You can provide logits of classes as `y_pred`, since argmax of
+    logits and probabilities is the same.
+
+    Args:
+      y_true: One-hot ground truth values.
+      y_pred: The prediction values.
+
+    Returns:
+      Categorical accuracy values.
+    """
+    # Note: wraps metrics_utils.sparse_categorical_matches. This separates
+    # public facing categorical_accuracy behavior from the vital behavior of
+    # the sparse_categorical_matches method needed in backend dependencies.
+
+    return metrics_utils.sparse_categorical_matches(
+        tf.math.argmax(y_true, axis=-1), y_pred
+    )
+
+
+@keras_export("keras.metrics.sparse_categorical_accuracy")
+@tf.__internal__.dispatch.add_dispatch_support
+def sparse_categorical_accuracy(y_true, y_pred):
+    """Calculates how often predictions match integer labels.
+
+    Standalone usage:
+    >>> y_true = [2, 1]
+    >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]
+    >>> m = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
+    >>> assert m.shape == (2,)
+    >>> m.numpy()
+    array([0., 1.], dtype=float32)
+
+    You can provide logits of classes as `y_pred`, since argmax of
+    logits and probabilities is the same.
+
+    Args:
+      y_true: Integer ground truth values.
+      y_pred: The prediction values.
+
+    Returns:
+      Sparse categorical accuracy values.
+    """
+    # Note: wraps the metrics_utils.sparse_categorical_matches method and
+    # checks for squeezing to align with expected public facing behavior.
+    # This separates public facing sparse_categorical_accuracy behavior from
+    # the vital behavior of the sparse_categorical_matches method needed in
+    # backend dependencies.
+
+    matches = metrics_utils.sparse_categorical_matches(y_true, y_pred)
+
+    # if shape is (num_samples, 1) squeeze
+    if matches.shape.ndims > 1 and matches.shape[-1] == 1:
+        matches = tf.squeeze(matches, [-1])
+
+    return matches
+
+
+@keras_export("keras.metrics.top_k_categorical_accuracy")
+@tf.__internal__.dispatch.add_dispatch_support
+def top_k_categorical_accuracy(y_true, y_pred, k=5):
+    """Computes how often targets are in the top `K` predictions.
+
+    Standalone usage:
+    >>> y_true = [[0, 0, 1], [0, 1, 0]]
+    >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]
+    >>> m = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)
+    >>> assert m.shape == (2,)
+    >>> m.numpy()
+    array([1., 1.], dtype=float32)
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The prediction values.
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to `5`.
+
+    Returns:
+      Top K categorical accuracy value.
+    """
+    # Note: wraps metrics_utils.sparse_top_k_categorical_matches. This
+    # separates public facing top_k_categorical_accuracy behavior from the
+    # vital behavior of the sparse_top_k_categorical_matches method needed in
+    # backend dependencies.
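A hand-rolled cross-check of the top-k reduction these functions perform. This sketch uses `np.argsort` for clarity; the library's in-top-k op also counts ties at the k-th value, which these illustrative inputs avoid:

```python
# Sketch: a sample counts as correct when its true class index is among
# the k largest predicted scores.
import numpy as np
import tensorflow as tf

y_true = np.array([[0, 0, 1], [0, 1, 0]], dtype=np.float32)
y_pred = np.array([[0.1, 0.9, 0.8], [0.05, 0.95, 0.0]], dtype=np.float32)

k = 2
topk = np.argsort(-y_pred, axis=-1)[:, :k]      # indices of the k largest
labels = np.argmax(y_true, axis=-1)
manual = np.array(
    [label in row for label, row in zip(labels, topk)], dtype=np.float32
)

keras_val = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=k)
np.testing.assert_allclose(keras_val.numpy(), manual)  # [1., 1.]
```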
+
+    return metrics_utils.sparse_top_k_categorical_matches(
+        tf.math.argmax(y_true, axis=-1), y_pred, k
+    )
+
+
+@keras_export("keras.metrics.sparse_top_k_categorical_accuracy")
+@tf.__internal__.dispatch.add_dispatch_support
+def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
+    """Computes how often integer targets are in the top `K` predictions.
+
+    Standalone usage:
+    >>> y_true = [2, 1]
+    >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]
+    >>> m = tf.keras.metrics.sparse_top_k_categorical_accuracy(
+    ...     y_true, y_pred, k=3)
+    >>> assert m.shape == (2,)
+    >>> m.numpy()
+    array([1., 1.], dtype=float32)
+
+    Args:
+      y_true: tensor of true targets.
+      y_pred: tensor of predicted targets.
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to `5`.
+
+    Returns:
+      Sparse top K categorical accuracy value.
+    """
+    # Note: wraps metrics_utils.sparse_top_k_categorical_matches. This
+    # separates public facing sparse_top_k_categorical_accuracy behavior from
+    # the vital behavior of the sparse_top_k_categorical_matches method
+    # needed in backend dependencies.
+
+    return metrics_utils.sparse_top_k_categorical_matches(y_true, y_pred, k)
diff --git a/keras/metrics/accuracy_metrics_test.py b/keras/metrics/accuracy_metrics_test.py
new file mode 100644
index 000000000000..a89ded8016cd
--- /dev/null
+++ b/keras/metrics/accuracy_metrics_test.py
@@ -0,0 +1,407 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for accuracy metrics."""
+
+import tensorflow.compat.v2 as tf
+
+from keras import Model
+from keras import layers
+from keras import metrics
+from keras.testing_infra import test_combinations
+
+
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class AccuracyTest(tf.test.TestCase):
+    def test_accuracy(self):
+        acc_obj = metrics.Accuracy(name="my_acc")
+
+        # check config
+        self.assertEqual(acc_obj.name, "my_acc")
+        self.assertTrue(acc_obj.stateful)
+        self.assertEqual(len(acc_obj.variables), 2)
+        self.assertEqual(acc_obj.dtype, tf.float32)
+        self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables))
+
+        # verify that correct value is returned
+        update_op = acc_obj.update_state(
+            [[1], [2], [3], [4]], [[1], [2], [3], [4]]
+        )
+        self.evaluate(update_op)
+        result = self.evaluate(acc_obj.result())
+        self.assertEqual(result, 1)  # 4/4
+
+        # Check save and restore config
+        a2 = metrics.Accuracy.from_config(acc_obj.get_config())
+        self.assertEqual(a2.name, "my_acc")
+        self.assertTrue(a2.stateful)
+        self.assertEqual(len(a2.variables), 2)
+        self.assertEqual(a2.dtype, tf.float32)
+
+        # check with sample_weight
+        result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]])
+        result = self.evaluate(result_t)
+        self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
+
+    def test_accuracy_ragged(self):
+        acc_obj = metrics.Accuracy(name="my_acc")
+        self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables))
+
+        # verify that correct value is returned
+        rt1 = tf.ragged.constant([[1], [2], [3], [4]])
+        rt2 = tf.ragged.constant([[1], [2], [3], [4]])
+        update_op = acc_obj.update_state(rt1, rt2)
+        self.evaluate(update_op)
+        result = self.evaluate(acc_obj.result())
+        self.assertEqual(result, 1)  # 4/4
+
+        # check with sample_weight
+        rt1 = tf.ragged.constant([[2], [1]])
+        rt2 = tf.ragged.constant([[2], [0]])
+        sw_ragged = tf.ragged.constant([[0.5], [0.2]])
+        result_t = acc_obj(rt1, rt2, sample_weight=sw_ragged)
+        result = self.evaluate(result_t)
+        self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
+
+    def test_binary_accuracy(self):
+        acc_obj = metrics.BinaryAccuracy(name="my_acc")
+
+        # check config
+        self.assertEqual(acc_obj.name, "my_acc")
+        self.assertTrue(acc_obj.stateful)
+        self.assertEqual(len(acc_obj.variables), 2)
+        self.assertEqual(acc_obj.dtype, tf.float32)
+        self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables))
+
+        # verify that correct value is returned
+        update_op = acc_obj.update_state([[1], [0]], [[1], [0]])
+        self.evaluate(update_op)
+        result = self.evaluate(acc_obj.result())
+        self.assertEqual(result, 1)  # 2/2
+
+        # check y_pred squeeze
+        update_op = acc_obj.update_state([[1], [1]], [[[1]], [[0]]])
+        self.evaluate(update_op)
+        result = self.evaluate(acc_obj.result())
+        self.assertAlmostEqual(result, 0.75, 2)  # 3/4
+
+        # check y_true squeeze
+        result_t = acc_obj([[[1]], [[1]]], [[1], [0]])
+        result = self.evaluate(result_t)
+        self.assertAlmostEqual(result, 0.67, 2)  # 4/6
+
+        # check with sample_weight
+        result_t = acc_obj([[1], [1]], [[1], [0]], [[0.5], [0.2]])
+        result = self.evaluate(result_t)
+        self.assertAlmostEqual(result, 0.67, 2)  # 4.5/6.7
+
+    def test_binary_accuracy_ragged(self):
+        acc_obj = metrics.BinaryAccuracy(name="my_acc")
+        self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables))
+
+        # verify that correct value is returned
+        rt1 = tf.ragged.constant([[1], [0]])
+        rt2 = tf.ragged.constant([[1], [0]])
+
update_op = acc_obj.update_state(rt1, rt2) + self.evaluate(update_op) + result = self.evaluate(acc_obj.result()) + self.assertEqual(result, 1) # 2/2 + + # check y_true squeeze only supported for dense tensors and is + # not supported by ragged tensor (different ranks). --> error + rt1 = tf.ragged.constant([[[1], [1]]]) + rt2 = tf.ragged.constant([[1], [0]]) + with self.assertRaises(ValueError): + result_t = acc_obj(rt1, rt2) + result = self.evaluate(result_t) + + def test_binary_accuracy_threshold(self): + acc_obj = metrics.BinaryAccuracy(threshold=0.7) + self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) + result_t = acc_obj([[1], [1], [0], [0]], [[0.9], [0.6], [0.4], [0.8]]) + result = self.evaluate(result_t) + self.assertAlmostEqual(result, 0.5, 2) + + def test_binary_accuracy_threshold_ragged(self): + acc_obj = metrics.BinaryAccuracy(threshold=0.7) + self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) + rt1 = tf.ragged.constant([[1], [1], [0], [0]]) + rt2 = tf.ragged.constant([[0.9], [0.6], [0.4], [0.8]]) + result_t = acc_obj(rt1, rt2) + result = self.evaluate(result_t) + self.assertAlmostEqual(result, 0.5, 2) + + def test_categorical_accuracy(self): + acc_obj = metrics.CategoricalAccuracy(name="my_acc") + + # check config + self.assertEqual(acc_obj.name, "my_acc") + self.assertTrue(acc_obj.stateful) + self.assertEqual(len(acc_obj.variables), 2) + self.assertEqual(acc_obj.dtype, tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) + + # verify that correct value is returned + update_op = acc_obj.update_state( + [[0, 0, 1], [0, 1, 0]], [[0.1, 0.1, 0.8], [0.05, 0.95, 0]] + ) + self.evaluate(update_op) + result = self.evaluate(acc_obj.result()) + self.assertEqual(result, 1) # 2/2 + + # check with sample_weight + result_t = acc_obj( + [[0, 0, 1], [0, 1, 0]], + [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], + [[0.5], [0.2]], + ) + result = self.evaluate(result_t) + self.assertAlmostEqual(result, 0.93, 2) # 2.5/2.7 + + def test_categorical_accuracy_ragged(self): + acc_obj = metrics.CategoricalAccuracy(name="my_acc") + self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) + + # verify that correct value is returned + rt1 = tf.ragged.constant([[0, 0, 1], [0, 1, 0]]) + rt2 = tf.ragged.constant([[0.1, 0.1, 0.8], [0.05, 0.95, 0]]) + update_op = acc_obj.update_state(rt1, rt2) + self.evaluate(update_op) + result = self.evaluate(acc_obj.result()) + self.assertEqual(result, 1) # 2/2 + + # check with sample_weight + rt1 = tf.ragged.constant([[0, 0, 1], [0, 1, 0]]) + rt2 = tf.ragged.constant([[0.1, 0.1, 0.8], [0.05, 0, 0.95]]) + sample_weight = tf.ragged.constant([[0.5], [0.2]]) + with self.assertRaises(tf.errors.InvalidArgumentError): + result_t = acc_obj(rt1, rt2, sample_weight) + result = self.evaluate(result_t) + + def test_sparse_categorical_accuracy(self): + acc_obj = metrics.SparseCategoricalAccuracy(name="my_acc") + + # check config + self.assertEqual(acc_obj.name, "my_acc") + self.assertTrue(acc_obj.stateful) + self.assertEqual(len(acc_obj.variables), 2) + self.assertEqual(acc_obj.dtype, tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) + + # verify that correct value is returned + update_op = acc_obj.update_state( + [[2], [1]], [[0.1, 0.1, 0.8], [0.05, 0.95, 0]] + ) + self.evaluate(update_op) + result = self.evaluate(acc_obj.result()) + self.assertEqual(result, 1) # 2/2 + + # check with sample_weight + result_t = acc_obj( + [[2], [1]], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], [[0.5], [0.2]] + ) + 
result = self.evaluate(result_t)
+        self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
+
+    def test_sparse_categorical_accuracy_ragged(self):
+        acc_obj = metrics.SparseCategoricalAccuracy(name="my_acc")
+
+        # verify that correct value is returned
+        rt1 = tf.ragged.constant([[2], [1]])
+        rt2 = tf.ragged.constant([[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+
+        with self.assertRaises(tf.errors.InvalidArgumentError):
+            # sparse_categorical_accuracy is not supported for composite/ragged
+            # tensors.
+            update_op = acc_obj.update_state(rt1, rt2)
+            self.evaluate(update_op)
+
+    def test_sparse_categorical_accuracy_mismatched_dims(self):
+        acc_obj = metrics.SparseCategoricalAccuracy(name="my_acc")
+
+        # check config
+        self.assertEqual(acc_obj.name, "my_acc")
+        self.assertTrue(acc_obj.stateful)
+        self.assertEqual(len(acc_obj.variables), 2)
+        self.assertEqual(acc_obj.dtype, tf.float32)
+        self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables))
+
+        # verify that correct value is returned
+        update_op = acc_obj.update_state(
+            [2, 1], [[0.1, 0.1, 0.8], [0.05, 0.95, 0]]
+        )
+        self.evaluate(update_op)
+        result = self.evaluate(acc_obj.result())
+        self.assertEqual(result, 1)  # 2/2
+
+        # check with sample_weight
+        result_t = acc_obj(
+            [2, 1], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], [[0.5], [0.2]]
+        )
+        result = self.evaluate(result_t)
+        self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
+
+    def test_sparse_categorical_accuracy_mismatched_dims_dynamic(self):
+        with tf.compat.v1.get_default_graph().as_default(), self.cached_session() as sess:  # noqa: E501
+            acc_obj = metrics.SparseCategoricalAccuracy(name="my_acc")
+            self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables))
+
+            t = tf.compat.v1.placeholder(tf.float32)
+            p = tf.compat.v1.placeholder(tf.float32)
+            w = tf.compat.v1.placeholder(tf.float32)
+
+            result_t = acc_obj(t, p, w)
+            result = sess.run(
+                result_t,
+                feed_dict=(
+                    {
+                        t: [2, 1],
+                        p: [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+                        w: [[0.5], [0.2]],
+                    }
+                ),
+            )
+            self.assertAlmostEqual(result, 0.71, 2)  # 0.5/0.7
+
+    def test_get_acc(self):
+        acc_fn = metrics.get("acc")
+        self.assertEqual(acc_fn, metrics.accuracy)
+
+
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class TopKCategoricalAccuracyTest(tf.test.TestCase):
+    def test_config(self):
+        a_obj = metrics.TopKCategoricalAccuracy(name="topkca", dtype=tf.int32)
+        self.assertEqual(a_obj.name, "topkca")
+        self.assertEqual(a_obj._dtype, tf.int32)
+
+        a_obj2 = metrics.TopKCategoricalAccuracy.from_config(a_obj.get_config())
+        self.assertEqual(a_obj2.name, "topkca")
+        self.assertEqual(a_obj2._dtype, tf.int32)
+
+    def test_correctness(self):
+        a_obj = metrics.TopKCategoricalAccuracy()
+        self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables))
+        y_true = tf.constant([[0, 0, 1], [0, 1, 0]])
+        y_pred = tf.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+
+        result = a_obj(y_true, y_pred)
+        self.assertEqual(1, self.evaluate(result))  # both the samples match
+
+        # With `k` < 5.
+        a_obj = metrics.TopKCategoricalAccuracy(k=1)
+        self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables))
+        result = a_obj(y_true, y_pred)
+        self.assertEqual(0.5, self.evaluate(result))  # only sample #2 matches
+
+        # With `k` > 5.
+ y_true = tf.constant([[0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0]]) + y_pred = tf.constant( + [[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4], [0.05, 0.95, 0, 0, 0, 0, 0]] + ) + a_obj = metrics.TopKCategoricalAccuracy(k=6) + self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) + result = a_obj(y_true, y_pred) + self.assertEqual(0.5, self.evaluate(result)) # only 1 sample matches. + + def test_weighted(self): + a_obj = metrics.TopKCategoricalAccuracy(k=2) + self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) + y_true = tf.constant([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) + y_pred = tf.constant([[0, 0.9, 0.1], [0, 0.9, 0.1], [0, 0.9, 0.1]]) + sample_weight = tf.constant((1.0, 0.0, 1.0)) + result = a_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(1.0, self.evaluate(result), atol=1e-5) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class SparseTopKCategoricalAccuracyTest(tf.test.TestCase): + def test_config(self): + a_obj = metrics.SparseTopKCategoricalAccuracy( + name="stopkca", dtype=tf.int32 + ) + self.assertEqual(a_obj.name, "stopkca") + self.assertEqual(a_obj._dtype, tf.int32) + + a_obj2 = metrics.SparseTopKCategoricalAccuracy.from_config( + a_obj.get_config() + ) + self.assertEqual(a_obj2.name, "stopkca") + self.assertEqual(a_obj2._dtype, tf.int32) + + def test_correctness(self): + a_obj = metrics.SparseTopKCategoricalAccuracy() + self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) + y_true = tf.constant([2, 1]) + y_pred = tf.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) + + result = a_obj(y_true, y_pred) + self.assertEqual(1, self.evaluate(result)) # both the samples match + + # With `k` < 5. + a_obj = metrics.SparseTopKCategoricalAccuracy(k=1) + self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) + result = a_obj(y_true, y_pred) + self.assertEqual(0.5, self.evaluate(result)) # only sample #2 matches + + # With `k` > 5. + y_pred = tf.constant( + [[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4], [0.05, 0.95, 0, 0, 0, 0, 0]] + ) + a_obj = metrics.SparseTopKCategoricalAccuracy(k=6) + self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) + result = a_obj(y_true, y_pred) + self.assertEqual(0.5, self.evaluate(result)) # only 1 sample matches. + + def test_weighted(self): + a_obj = metrics.SparseTopKCategoricalAccuracy(k=2) + self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) + y_true = tf.constant([1, 0, 2]) + y_pred = tf.constant([[0, 0.9, 0.1], [0, 0.9, 0.1], [0, 0.9, 0.1]]) + sample_weight = tf.constant((1.0, 0.0, 1.0)) + result = a_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(1.0, self.evaluate(result), atol=1e-5) + + def test_sparse_top_k_categorical_accuracy_mismatched_dims_dynamic(self): + + if not tf.compat.v1.executing_eagerly(): + # Test will fail in v1 graph mode since the metric is not a normal + # layer. It will aggregate the output by batch dim, which failed on + # v1 code. 
+ self.skipTest("v2 eager mode only") + + class AccLayer(layers.Layer): + def build(self, _): + self.acc = metrics.SparseTopKCategoricalAccuracy(k=1) + + def call(self, y_true, y_pred): + return self.acc(y_true, y_pred) + + label = layers.Input(shape=[1]) + predict = layers.Input(shape=[3]) + metric_result = AccLayer()(label, predict) + model = Model([label, predict], metric_result) + + result = model.predict( + [ + tf.constant([[2], [1]]), + tf.constant([[0.1, 0.1, 0.8], [0.05, 0, 0.95]]), + ], + steps=1, + ) + self.assertAllClose(result, 0.5) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/base_metric.py b/keras/metrics/base_metric.py index 2dbf91a387cd..7a56b4d13815 100644 --- a/keras/metrics/base_metric.py +++ b/keras/metrics/base_metric.py @@ -12,862 +12,982 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-classes-have-attributes -# pylint: disable=g-doc-return-or-yield """Base Metric classes.""" import abc -import copy import types import warnings +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras.dtensor import dtensor_api as dtensor from keras.dtensor import utils as dtensor_utils from keras.engine import base_layer from keras.engine import base_layer_utils from keras.engine import keras_tensor -from keras.saving.saved_model import metric_serialization +from keras.saving.legacy.saved_model import metric_serialization from keras.utils import generic_utils from keras.utils import losses_utils from keras.utils import metrics_utils -from keras.utils.tf_utils import is_tensor_or_variable -import numpy as np -import tensorflow.compat.v2 as tf +from keras.utils import tf_utils +# isort: off from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls -@keras_export('keras.metrics.Metric') +@keras_export("keras.metrics.Metric") class Metric(base_layer.Layer, metaclass=abc.ABCMeta): - """Encapsulates metric logic and state. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - **kwargs: Additional layer keywords arguments. - - Standalone usage: - - ```python - m = SomeMetric(...) - for input in ...: - m.update_state(input) - print('Final result: ', m.result().numpy()) - ``` - - Usage with `compile()` API: - - ```python - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(64, activation='relu')) - model.add(tf.keras.layers.Dense(64, activation='relu')) - model.add(tf.keras.layers.Dense(10, activation='softmax')) - - model.compile(optimizer=tf.keras.optimizers.RMSprop(0.01), - loss=tf.keras.losses.CategoricalCrossentropy(), - metrics=[tf.keras.metrics.CategoricalAccuracy()]) - - data = np.random.random((1000, 32)) - labels = np.random.random((1000, 10)) - - dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - dataset = dataset.batch(32) - - model.fit(dataset, epochs=10) - ``` - - To be implemented by subclasses: - * `__init__()`: All state variables should be created in this method by - calling `self.add_weight()` like: `self.var = self.add_weight(...)` - * `update_state()`: Has all updates to the state variables like: - self.var.assign_add(...). - * `result()`: Computes and returns a scalar value or a dict of scalar values - for the metric from the state variables. 
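The contract this docstring describes (accumulate with `update_state()`, read with `result()`, clear with `reset_state()`) can be exercised end to end with any built-in metric. A minimal sketch with illustrative values:

```python
# Sketch: a Metric is stateful across calls until explicitly reset.
import tensorflow as tf

mean = tf.keras.metrics.Mean()
for epoch_values in ([1.0, 3.0], [5.0, 7.0]):
    mean.reset_state()             # clear state between epochs
    for v in epoch_values:
        mean.update_state(v)       # accumulate within the epoch
    print(float(mean.result()))    # 2.0 after epoch 1, 6.0 after epoch 2
```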
- - Example subclass implementation: - - ```python - class BinaryTruePositives(tf.keras.metrics.Metric): - - def __init__(self, name='binary_true_positives', **kwargs): - super(BinaryTruePositives, self).__init__(name=name, **kwargs) - self.true_positives = self.add_weight(name='tp', initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - y_true = tf.cast(y_true, tf.bool) - y_pred = tf.cast(y_pred, tf.bool) - - values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True)) - values = tf.cast(values, self.dtype) - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, self.dtype) - sample_weight = tf.broadcast_to(sample_weight, values.shape) - values = tf.multiply(values, sample_weight) - self.true_positives.assign_add(tf.reduce_sum(values)) - - def result(self): - return self.true_positives - ``` - """ - - def __init__(self, name=None, dtype=None, **kwargs): - super().__init__(name=name, dtype=dtype, **kwargs) - self.stateful = True # All metric layers are stateful. - self.built = True - if not base_layer_utils.v2_dtype_behavior_enabled(): - # We only do this when the V2 behavior is not enabled, as when it is - # enabled, the dtype already defaults to floatx. - self._dtype = (backend.floatx() if dtype is None - else tf.as_dtype(dtype).name) - - def __new__(cls, *args, **kwargs): - obj = super(Metric, cls).__new__(cls) - - # If `update_state` is not in eager/tf.function and it is not from a - # built-in metric, wrap it in `tf.function`. This is so that users writing - # custom metrics in v1 need not worry about control dependencies and - # return ops. - if (base_layer_utils.is_in_eager_or_tf_function() or - is_built_in(cls)): - obj_update_state = obj.update_state - - def update_state_fn(*args, **kwargs): - control_status = tf.__internal__.autograph.control_status_ctx() - ag_update_state = tf.__internal__.autograph.tf_convert( - obj_update_state, control_status) - return ag_update_state(*args, **kwargs) - else: - if isinstance(obj.update_state, tf.__internal__.function.Function): - update_state_fn = obj.update_state - else: - update_state_fn = tf.function(obj.update_state) - - obj.update_state = types.MethodType( - metrics_utils.update_state_wrapper(update_state_fn), obj) - - obj_result = obj.result - - def result_fn(*args, **kwargs): - control_status = tf.__internal__.autograph.control_status_ctx() - ag_result = tf.__internal__.autograph.tf_convert( - obj_result, control_status) - return ag_result(*args, **kwargs) - - obj.result = types.MethodType(metrics_utils.result_wrapper(result_fn), obj) - - return obj - - def __call__(self, *args, **kwargs): - """Accumulates statistics and then computes metric result value. + """Encapsulates metric logic and state. Args: - *args: - **kwargs: A mini-batch of inputs to the Metric, - passed on to `update_state()`. - - Returns: - The metric value tensor. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + **kwargs: Additional layer keywords arguments. + + Standalone usage: + + ```python + m = SomeMetric(...) 
+ for input in ...: + m.update_state(input) + print('Final result: ', m.result().numpy()) + ``` + + Usage with `compile()` API: + + ```python + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(64, activation='relu')) + model.add(tf.keras.layers.Dense(64, activation='relu')) + model.add(tf.keras.layers.Dense(10, activation='softmax')) + + model.compile(optimizer=tf.keras.optimizers.RMSprop(0.01), + loss=tf.keras.losses.CategoricalCrossentropy(), + metrics=[tf.keras.metrics.CategoricalAccuracy()]) + + data = np.random.random((1000, 32)) + labels = np.random.random((1000, 10)) + + dataset = tf.data.Dataset.from_tensor_slices((data, labels)) + dataset = dataset.batch(32) + + model.fit(dataset, epochs=10) + ``` + + To be implemented by subclasses: + * `__init__()`: All state variables should be created in this method by + calling `self.add_weight()` like: `self.var = self.add_weight(...)` + * `update_state()`: Has all updates to the state variables like: + self.var.assign_add(...). + * `result()`: Computes and returns a scalar value or a dict of scalar values + for the metric from the state variables. + + Example subclass implementation: + + ```python + class BinaryTruePositives(tf.keras.metrics.Metric): + + def __init__(self, name='binary_true_positives', **kwargs): + super(BinaryTruePositives, self).__init__(name=name, **kwargs) + self.true_positives = self.add_weight(name='tp', initializer='zeros') + + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.cast(y_true, tf.bool) + y_pred = tf.cast(y_pred, tf.bool) + + values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True)) + values = tf.cast(values, self.dtype) + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, self.dtype) + sample_weight = tf.broadcast_to(sample_weight, values.shape) + values = tf.multiply(values, sample_weight) + self.true_positives.assign_add(tf.reduce_sum(values)) + + def result(self): + return self.true_positives + ``` """ - def replica_local_fn(*args, **kwargs): - """Updates the state of the metric in a replica-local context.""" - if any( - isinstance(arg, keras_tensor.KerasTensor) - for arg in tf.nest.flatten((args, kwargs))): - update_op = None - else: - update_op = self.update_state(*args, **kwargs) # pylint: disable=not-callable - update_ops = [] - if update_op is not None: - update_ops.append(update_op) - with tf.control_dependencies(update_ops): - result_t = self.result() # pylint: disable=not-callable - - # We are adding the metric object as metadata on the result tensor. - # This is required when we want to use a metric with `add_metric` API on - # a Model/Layer in graph mode. This metric instance will later be used - # to reset variable state after each epoch of training. 
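Calling a metric instance is shorthand for `update_state()` followed by `result()`, which is what `__call__` in this class implements. A small sketch using a built-in metric with illustrative values:

```python
# Sketch: the one-step and two-step forms are equivalent.
import tensorflow as tf

m = tf.keras.metrics.Sum()
value = m([1.0, 2.0])          # updates state, then returns the result
print(float(value))            # 3.0

m2 = tf.keras.metrics.Sum()
m2.update_state([1.0, 2.0])    # explicit two-step form
print(float(m2.result()))      # 3.0
```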
- # Example: - # model = Model() - # mean = Mean() - # model.add_metric(mean(values), name='mean') - result_t._metric_obj = self # pylint: disable=protected-access - return result_t - - from keras.distribute import distributed_training_utils # pylint:disable=g-import-not-at-top - return distributed_training_utils.call_replica_local_fn( - replica_local_fn, *args, **kwargs) - - def __str__(self): - args = ','.join(f'{k}={v}' for k, v in self.get_config().items()) - return f'{self.__class__.__name__}({args})' - - def __deepcopy__(self, memo): - result = type(self)(name=self.name, dtype=self.dtype) - memo[id(self)] = result - - for k, v in self.__dict__.items(): - if k in ['update_state', 'result']: - # `update_state` keeps a closure of `update_state_fn`, and deep - # copying it would result in copying that old reference. Avoid that. - # Likewise for `result`. - continue - if k in ['_obj_reference_counts_dict']: - # `Layer.__setattr__` attempts to flatten the - # `ObjectIdentityDictionary`, which can't be done since it stores - # heterogeneous instances. - tf.Module.__setattr__(result, k, copy.deepcopy(v, memo)) - elif k in ['_thread_local', '_metrics_lock']: - # Can't pickle _thread.lock objects. - setattr(result, k, v) - else: - setattr(result, k, copy.deepcopy(v, memo)) - - return result - - @property - def dtype(self): - return self._dtype - - def get_config(self): - """Returns the serializable config of the metric.""" - return {'name': self.name, 'dtype': self.dtype} - - def reset_state(self): - """Resets all of the metric state variables. - - This function is called between epochs/steps, - when a metric is evaluated during training. - """ - if not generic_utils.is_default(self.reset_states): - warnings.warn( - 'Metric %s implements a `reset_states()` method; rename it ' - 'to `reset_state()` (without the final "s"). The name ' - '`reset_states()` has been deprecated to improve API ' - 'consistency.' % (self.__class__.__name__,), - stacklevel=2) - return self.reset_states() - else: - backend.batch_set_value([(v, 0) for v in self.variables]) - - @abc.abstractmethod - def update_state(self, *args, **kwargs): - """Accumulates statistics for the metric. - - Note: This function is executed as a graph function in graph mode. - This means: - a) Operations on the same resource are executed in textual order. - This should make it easier to do things like add the updated - value of a variable to another, for example. - b) You don't need to worry about collecting the update ops to execute. - All update ops added to the graph by this function will be executed. - As a result, code should generally work the same way with graph or - eager execution. + def __init__(self, name=None, dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype, **kwargs) + self.stateful = True # All metric layers are stateful. + self.built = True + if not base_layer_utils.v2_dtype_behavior_enabled(): + # We only do this when the V2 behavior is not enabled, as when it is + # enabled, the dtype already defaults to floatx. + self._dtype = ( + backend.floatx() if dtype is None else tf.as_dtype(dtype).name + ) + + def __new__(cls, *args, **kwargs): + obj = super(Metric, cls).__new__(cls) + + # If `update_state` is not in eager/tf.function and it is not from a + # built-in metric, wrap it in `tf.function`. This is so that users + # writing custom metrics in v1 need not worry about control dependencies + # and return ops. 
+ if base_layer_utils.is_in_eager_or_tf_function() or is_built_in(cls): + obj_update_state = obj.update_state + + def update_state_fn(*args, **kwargs): + control_status = tf.__internal__.autograph.control_status_ctx() + ag_update_state = tf.__internal__.autograph.tf_convert( + obj_update_state, control_status + ) + return ag_update_state(*args, **kwargs) - Args: - *args: - **kwargs: A mini-batch of inputs to the Metric. - """ - raise NotImplementedError('Must be implemented in subclasses.') - - def merge_state(self, metrics): - """Merges the state from one or more metrics. + else: + if isinstance(obj.update_state, tf.__internal__.function.Function): + update_state_fn = obj.update_state + else: + update_state_fn = tf.function(obj.update_state) + + obj.update_state = types.MethodType( + metrics_utils.update_state_wrapper(update_state_fn), obj + ) + + obj_result = obj.result + + def result_fn(*args, **kwargs): + control_status = tf.__internal__.autograph.control_status_ctx() + ag_result = tf.__internal__.autograph.tf_convert( + obj_result, control_status + ) + return ag_result(*args, **kwargs) + + obj.result = types.MethodType( + metrics_utils.result_wrapper(result_fn), obj + ) + + return obj + + def __call__(self, *args, **kwargs): + """Accumulates statistics and then computes metric result value. + + Args: + *args: + **kwargs: A mini-batch of inputs to the Metric, + passed on to `update_state()`. + + Returns: + The metric value tensor. + """ + + def replica_local_fn(*args, **kwargs): + """Updates the state of the metric in a replica-local context.""" + if any( + isinstance(arg, keras_tensor.KerasTensor) + for arg in tf.nest.flatten((args, kwargs)) + ): + update_op = None + else: + update_op = self.update_state(*args, **kwargs) + update_ops = [] + if update_op is not None: + update_ops.append(update_op) + with tf.control_dependencies(update_ops): + result_t = self.result() + + # If the metric object return a dictionary as a result, wrap it + # with our custom dict object so we can attach the metric object + # to it. + if isinstance(result_t, dict): + result_t = _MetricDict(**result_t) + + # We are adding the metric object as metadata on the result + # tensor. This is required when we want to use a metric with + # `add_metric` API on a Model/Layer in graph mode. This metric + # instance will later be used to reset variable state after each + # epoch of training. + # Example: + # model = Model() + # mean = Mean() + # model.add_metric(mean(values), name='mean') + result_t._metric_obj = self + return result_t + + from keras.distribute import ( + distributed_training_utils, + ) + + return distributed_training_utils.call_replica_local_fn( + replica_local_fn, *args, **kwargs + ) + + def __str__(self): + args = ",".join(f"{k}={v}" for k, v in self.get_config().items()) + return f"{self.__class__.__name__}({args})" + + def __deepcopy__(self, memo=None): + try: + new_self = self.from_config(self.get_config()) + except NotImplementedError as e: + raise NotImplementedError( + "Calling `__deepcopy__()` on a Keras metric " + "requires the metric to be serializable, " + "i.e. it should implement `get_config()`.\n\n" + f"Error encountered during serialization: [{e}]" + ) + # Note that metrics don't implement `build()` so their variables + # are readily available after instantiation. 
+ if self.weights: + new_self.set_weights(self.get_weights()) + memo[self] = new_self + return new_self + + @property + def dtype(self): + return self._dtype + + def get_config(self): + """Returns the serializable config of the metric.""" + return {"name": self.name, "dtype": self.dtype} + + def reset_state(self): + """Resets all of the metric state variables. + + This function is called between epochs/steps, + when a metric is evaluated during training. + """ + if not generic_utils.is_default(self.reset_states): + warnings.warn( + "Metric %s implements a `reset_states()` method; rename it " + 'to `reset_state()` (without the final "s"). The name ' + "`reset_states()` has been deprecated to improve API " + "consistency." % (self.__class__.__name__,), + stacklevel=2, + ) + return self.reset_states() + else: + backend.batch_set_value([(v, 0) for v in self.variables]) + + @abc.abstractmethod + def update_state(self, *args, **kwargs): + """Accumulates statistics for the metric. + + Note: This function is executed as a graph function in graph mode. + This means: + a) Operations on the same resource are executed in textual order. + This should make it easier to do things like add the updated + value of a variable to another, for example. + b) You don't need to worry about collecting the update ops to execute. + All update ops added to the graph by this function will be + executed. + As a result, code should generally work the same way with graph or + eager execution. + + Args: + *args: + **kwargs: A mini-batch of inputs to the Metric. + """ + raise NotImplementedError("Must be implemented in subclasses.") + + def merge_state(self, metrics): + """Merges the state from one or more metrics. + + This method can be used by distributed systems to merge the state + computed by different metric instances. Typically the state will be + stored in the form of the metric's weights. For example, a + tf.keras.metrics.Mean metric contains a list of two weight values: a + total and a count. If there were two instances of a + tf.keras.metrics.Accuracy that each independently aggregated partial + state for an overall accuracy calculation, these two metric's states + could be combined as follows: + + >>> m1 = tf.keras.metrics.Accuracy() + >>> _ = m1.update_state([[1], [2]], [[0], [2]]) + + >>> m2 = tf.keras.metrics.Accuracy() + >>> _ = m2.update_state([[3], [4]], [[3], [4]]) + + >>> m2.merge_state([m1]) + >>> m2.result().numpy() + 0.75 + + Args: + metrics: an iterable of metrics. The metrics must have compatible + state. + + Raises: + ValueError: If the provided iterable does not contain metrics matching + the metric's required specifications. + """ + assign_add_ops = [] + for metric in metrics: + if len(self.weights) != len(metric.weights): + raise ValueError( + f"Metric {metric} is not compatible with {self}" + ) + for weight, weight_to_add in zip(self.weights, metric.weights): + assign_add_ops.append(weight.assign_add(weight_to_add)) + return assign_add_ops + + @abc.abstractmethod + def result(self): + """Computes and returns the scalar metric value tensor or a dict of + scalars. + + Result computation is an idempotent operation that simply calculates the + metric value using the state variables. + + Returns: + A scalar tensor, or a dictionary of scalar tensors. 
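The `merge_state()` semantics shown above (element-wise `assign_add` over matching weight lists) can be checked with `Mean`, whose two weights are a running total and a count. A minimal sketch with illustrative values:

```python
# Sketch: merging partial state from two Mean metrics, as a distributed
# system might do with independently aggregated shards.
import tensorflow as tf

m1 = tf.keras.metrics.Mean()
m1.update_state([1.0, 3.0])    # total=4, count=2

m2 = tf.keras.metrics.Mean()
m2.update_state([5.0, 7.0])    # total=12, count=2

m2.merge_state([m1])           # total=16, count=4
print(float(m2.result()))      # 4.0
```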
+ """ + raise NotImplementedError("Must be implemented in subclasses.") + + ### For use by subclasses ### + @doc_controls.for_subclass_implementers + def add_weight( + self, + name, + shape=(), + aggregation=tf.VariableAggregation.SUM, + synchronization=tf.VariableSynchronization.ON_READ, + initializer=None, + dtype=None, + ): + """Adds state variable. Only for use by subclasses.""" + if tf.distribute.has_strategy(): + strategy = tf.distribute.get_strategy() + else: + strategy = None + + additional_kwargs = {} + + # TODO(b/120571621): Make `ON_READ` work with Keras metrics on TPU. + if backend.is_tpu_strategy(strategy): + synchronization = tf.VariableSynchronization.ON_WRITE + if getattr(self, "_mesh", None) is not None: + # When self._mesh is set, it means this metric is used for DTensor. + additional_kwargs = { + "layout": dtensor.Layout.replicated( + self._mesh, tf.TensorShape(shape).rank + ) + } + + if tf_utils.in_local_vars_context(): + # Metrics created within a remotely-executed tf.function during + # parameter server evaluation should use tf2 Variables, so that they + # can be local variables that are freely usable and mutable within + # the function, using the + # `experimental_enable_variable_lifting=False` argument. This + # supports a visitation guarantee for model evaluation. + def local_v2_var_creator( + initializer=None, dtype=None, shape=None, **kwargs + ): + init_val, var_dtype = base_layer_utils.infer_init_val_and_dtype( + initializer, dtype, shape + ) + v1_only_args = ["use_resource", "collections"] + for v1_arg in v1_only_args: + kwargs.pop(v1_arg, None) + kwargs["experimental_enable_variable_lifting"] = False + return tf.Variable( + initial_value=init_val, + dtype=var_dtype, + shape=shape, + **kwargs, + ) + + additional_kwargs["getter"] = local_v2_var_creator + + with tf_utils.maybe_init_scope(layer=self): + return super().add_weight( + name=name, + shape=shape, + dtype=self._dtype if dtype is None else dtype, + trainable=False, + initializer=initializer, + collections=[], + synchronization=synchronization, + aggregation=aggregation, + **additional_kwargs, + ) + + ### End: For use by subclasses ### + + @property + def trainable_weights(self): + # Overridden from Layer class to track submetric weights. + if self.trainable: + trainable_weights = self._trainable_weights + for m in self._metrics: + trainable_weights += m.trainable_weights + return self._dedup_weights(trainable_weights) + else: + return [] + + @property + def non_trainable_weights(self): + # Overridden from Layer class to track submetric weights. + if self.trainable: + non_trainable_weights = self._non_trainable_weights + for m in self._metrics: + non_trainable_weights += m.non_trainable_weights + else: + non_trainable_weights = ( + self._non_trainable_weights + self._trainable_weights + ) + for m in self._metrics: + non_trainable_weights += m.weights + return self._dedup_weights(non_trainable_weights) - This method can be used by distributed systems to merge the state computed - by different metric instances. Typically the state will be stored in the - form of the metric's weights. For example, a tf.keras.metrics.Mean metric - contains a list of two weight values: a total and a count. 
If there were two - instances of a tf.keras.metrics.Accuracy that each independently aggregated - partial state for an overall accuracy calculation, these two metric's states - could be combined as follows: + @property + def _trackable_saved_model_saver(self): + return metric_serialization.MetricSavedModelSaver(self) - >>> m1 = tf.keras.metrics.Accuracy() - >>> _ = m1.update_state([[1], [2]], [[0], [2]]) + @generic_utils.default + @doc_controls.do_not_generate_docs + def reset_states(self): + # Backwards compatibility alias of `reset_state`. New classes should + # only implement `reset_state`. + return self.reset_state() - >>> m2 = tf.keras.metrics.Accuracy() - >>> _ = m2.update_state([[3], [4]], [[3], [4]]) - >>> m2.merge_state([m1]) - >>> m2.result().numpy() - 0.75 +class Reduce(Metric): + """Encapsulates metrics that perform a reduce operation on the values. Args: - metrics: an iterable of metrics. The metrics must have compatible state. - - Raises: - ValueError: If the provided iterable does not contain metrics matching the - metric's required specifications. - """ - assign_add_ops = [] - for metric in metrics: - if len(self.weights) != len(metric.weights): - raise ValueError(f'Metric {metric} is not compatible with {self}') - for weight, weight_to_add in zip(self.weights, metric.weights): - assign_add_ops.append(weight.assign_add(weight_to_add)) - return assign_add_ops - - @abc.abstractmethod - def result(self): - """Computes and returns the scalar metric value tensor or a dict of scalars. - - Result computation is an idempotent operation that simply calculates the - metric value using the state variables. - - Returns: - A scalar tensor, or a dictionary of scalar tensors. + reduction: a `tf.keras.metrics.Reduction` enum value. + name: string name of the metric instance. + dtype: (Optional) data type of the metric result. """ - raise NotImplementedError('Must be implemented in subclasses.') - - ### For use by subclasses ### - @doc_controls.for_subclass_implementers - def add_weight( - self, - name, - shape=(), - aggregation=tf.VariableAggregation.SUM, - synchronization=tf.VariableSynchronization.ON_READ, - initializer=None, - dtype=None): - """Adds state variable. Only for use by subclasses.""" - if tf.distribute.has_strategy(): - strategy = tf.distribute.get_strategy() - else: - strategy = None - - # TODO(b/120571621): Make `ON_READ` work with Keras metrics on TPU. - if backend.is_tpu_strategy(strategy): - synchronization = tf.VariableSynchronization.ON_WRITE - if getattr(self, '_mesh', None) is not None: - # When self._mesh is set, it means this metric is used for DTensor. - additional_kwargs = { - 'layout': dtensor.Layout.replicated(self._mesh, - tf.TensorShape(shape).rank)} - else: - additional_kwargs = {} - - with tf.init_scope(): - return super().add_weight( - name=name, - shape=shape, - dtype=self._dtype if dtype is None else dtype, - trainable=False, - initializer=initializer, - collections=[], - synchronization=synchronization, - aggregation=aggregation, - **additional_kwargs) - - ### End: For use by subclasses ### - - @property - def trainable_weights(self): - # Overridden from Layer class to track submetric weights. - if self.trainable: - trainable_weights = self._trainable_weights - for m in self._metrics: - trainable_weights += m.trainable_weights - return self._dedup_weights(trainable_weights) - else: - return [] - - @property - def non_trainable_weights(self): - # Overridden from Layer class to track submetric weights. 
- if self.trainable: - non_trainable_weights = self._non_trainable_weights - for m in self._metrics: - non_trainable_weights += m.non_trainable_weights - else: - non_trainable_weights = ( - self._non_trainable_weights + self._trainable_weights) - for m in self._metrics: - non_trainable_weights += m.weights - return self._dedup_weights(non_trainable_weights) - - @property - def _trackable_saved_model_saver(self): - return metric_serialization.MetricSavedModelSaver(self) - - @generic_utils.default - @doc_controls.do_not_generate_docs - def reset_states(self): - # Backwards compatibility alias of `reset_state`. New classes should - # only implement `reset_state`. - return self.reset_state() - -class Reduce(Metric): - """Encapsulates metrics that perform a reduce operation on the values. - - Args: - reduction: a `tf.keras.metrics.Reduction` enum value. - name: string name of the metric instance. - dtype: (Optional) data type of the metric result. - """ - - def __init__(self, reduction, name, dtype=None): - super().__init__(name=name, dtype=dtype) - self.reduction = reduction - self.total = self.add_weight( - 'total', initializer='zeros') - if reduction in [metrics_utils.Reduction.SUM_OVER_BATCH_SIZE, - metrics_utils.Reduction.WEIGHTED_MEAN]: - self.count = self.add_weight( - 'count', initializer='zeros') - - def update_state(self, values, sample_weight=None): - """Accumulates statistics for computing the metric. + def __init__(self, reduction, name, dtype=None): + super().__init__(name=name, dtype=dtype) + self.reduction = reduction + self.total = self.add_weight("total", initializer="zeros") + if reduction in [ + metrics_utils.Reduction.SUM_OVER_BATCH_SIZE, + metrics_utils.Reduction.WEIGHTED_MEAN, + ]: + self.count = self.add_weight("count", initializer="zeros") + + def update_state(self, values, sample_weight=None): + """Accumulates statistics for computing the metric. + + Args: + values: Per-example value. + sample_weight: Optional weighting of each example. Defaults to `1`. + + Returns: + Update op. + """ + [ + values + ], sample_weight = metrics_utils.ragged_assert_compatible_and_get_flat_values( # noqa: E501 + [values], sample_weight + ) + try: + values = tf.cast(values, self._dtype) + except (ValueError, TypeError): + msg = ( + "The output of a metric function can only be a single Tensor. " + f"Received: {values}. " + ) + if isinstance(values, dict): + msg += ( + "To return a dict of values, implement a custom Metric " + "subclass." + ) + raise RuntimeError(msg) + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, self._dtype) + # Update dimensions of weights to match with values if possible. + ( + values, + _, + sample_weight, + ) = losses_utils.squeeze_or_expand_dimensions( + values, sample_weight=sample_weight + ) + try: + # Broadcast weights if possible. + sample_weight = tf.__internal__.ops.broadcast_weights( + sample_weight, values + ) + except ValueError: + # Reduce values to same ndim as weight array + ndim = backend.ndim(values) + weight_ndim = backend.ndim(sample_weight) + if self.reduction == metrics_utils.Reduction.SUM: + values = tf.reduce_sum( + values, axis=list(range(weight_ndim, ndim)) + ) + else: + values = tf.reduce_mean( + values, axis=list(range(weight_ndim, ndim)) + ) + values = tf.multiply(values, sample_weight) + + value_sum = tf.reduce_sum(values) + with tf.control_dependencies([value_sum]): + update_total_op = self.total.assign_add(value_sum) + + # Exit early if the reduction doesn't have a denominator. 
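For the `WEIGHTED_MEAN` reduction handled above, the denominator accumulated into `count` is the sum of the sample weights, not the number of values. A small sketch with illustrative numbers:

```python
# Sketch: weighted mean = sum(w_i * v_i) / sum(w_i).
import tensorflow as tf

m = tf.keras.metrics.Mean()
m.update_state([2.0, 4.0], sample_weight=[0.25, 0.75])
# total = 2*0.25 + 4*0.75 = 3.5;  count = 0.25 + 0.75 = 1.0
print(float(m.result()))  # 3.5
```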
+ if self.reduction == metrics_utils.Reduction.SUM: + return update_total_op + + # Update `count` for reductions that require a denominator. + if self.reduction == metrics_utils.Reduction.SUM_OVER_BATCH_SIZE: + num_values = tf.cast(tf.size(values), self._dtype) + elif self.reduction == metrics_utils.Reduction.WEIGHTED_MEAN: + if sample_weight is None: + num_values = tf.cast(tf.size(values), self._dtype) + else: + num_values = tf.reduce_sum(sample_weight) + else: + raise NotImplementedError( + f'Reduction "{self.reduction}" not implemented. Expected ' + '"sum", "weighted_mean", or "sum_over_batch_size".' + ) - Args: - values: Per-example value. - sample_weight: Optional weighting of each example. Defaults to 1. + with tf.control_dependencies([update_total_op]): + return self.count.assign_add(num_values) - Returns: - Update op. - """ - [values], sample_weight = \ - metrics_utils.ragged_assert_compatible_and_get_flat_values( - [values], sample_weight) - try: - values = tf.cast(values, self._dtype) - except (ValueError, TypeError): - msg = ('The output of a metric function can only be a single Tensor. ' - f'Received: {values}. ') - if isinstance(values, dict): - msg += ('To return a dict of values, implement a custom Metric ' - 'subclass.') - raise RuntimeError(msg) - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, self._dtype) - # Update dimensions of weights to match with values if possible. - values, _, sample_weight = losses_utils.squeeze_or_expand_dimensions( - values, sample_weight=sample_weight) - try: - # Broadcast weights if possible. - sample_weight = tf.__internal__.ops.broadcast_weights( - sample_weight, values) - except ValueError: - # Reduce values to same ndim as weight array - ndim = backend.ndim(values) - weight_ndim = backend.ndim(sample_weight) + def result(self): if self.reduction == metrics_utils.Reduction.SUM: - values = tf.reduce_sum( - values, axis=list(range(weight_ndim, ndim))) + return tf.identity(self.total) + elif self.reduction in [ + metrics_utils.Reduction.WEIGHTED_MEAN, + metrics_utils.Reduction.SUM_OVER_BATCH_SIZE, + ]: + return tf.math.divide_no_nan(self.total, self.count) else: - values = tf.reduce_mean( - values, axis=list(range(weight_ndim, ndim))) - values = tf.multiply(values, sample_weight) - - value_sum = tf.reduce_sum(values) - with tf.control_dependencies([value_sum]): - update_total_op = self.total.assign_add(value_sum) - - # Exit early if the reduction doesn't have a denominator. - if self.reduction == metrics_utils.Reduction.SUM: - return update_total_op - - # Update `count` for reductions that require a denominator. - if self.reduction == metrics_utils.Reduction.SUM_OVER_BATCH_SIZE: - num_values = tf.cast(tf.size(values), self._dtype) - elif self.reduction == metrics_utils.Reduction.WEIGHTED_MEAN: - if sample_weight is None: - num_values = tf.cast(tf.size(values), self._dtype) - else: - num_values = tf.reduce_sum(sample_weight) - else: - raise NotImplementedError( - f'Reduction "{self.reduction}" not implemented. 
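# --- Editor's aside (not part of this patch): the reduction logic above,
# exercised through the public subclasses defined below. With Reduction.SUM
# only `total` is updated; the mean-style reductions divide `total` by
# `count` using `divide_no_nan`, so a metric with no updates reports 0.0
# rather than failing on 0/0.
import tensorflow as tf

s = tf.keras.metrics.Sum()
s.update_state([1, 3, 5, 7], sample_weight=[1, 1, 0, 0])
print(s.result().numpy())  # 4.0: the weights zero out the last two values

m = tf.keras.metrics.Mean()
print(m.result().numpy())  # 0.0: divide_no_nan(total=0, count=0) == 0
# --- end of editor's aside ---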
Expected ' - '"sum", "weighted_mean", or "sum_over_batch_size".') - - with tf.control_dependencies([update_total_op]): - return self.count.assign_add(num_values) - - def result(self): - if self.reduction == metrics_utils.Reduction.SUM: - return tf.identity(self.total) - elif self.reduction in [ - metrics_utils.Reduction.WEIGHTED_MEAN, - metrics_utils.Reduction.SUM_OVER_BATCH_SIZE - ]: - return tf.math.divide_no_nan(self.total, self.count) - else: - raise NotImplementedError( - f'Reduction "{self.reduction}" not implemented. Expected ' - '"sum", "weighted_mean", or "sum_over_batch_size".') - - -@keras_export('keras.metrics.Sum') + raise NotImplementedError( + f'Reduction "{self.reduction}" not implemented. Expected ' + '"sum", "weighted_mean", or "sum_over_batch_size".' + ) + + +@keras_export("keras.metrics.Sum") class Sum(Reduce): - """Computes the (weighted) sum of the given values. + """Computes the (weighted) sum of the given values. - For example, if values is [1, 3, 5, 7] then the sum is 16. - If the weights were specified as [1, 1, 0, 0] then the sum would be 4. + For example, if values is [1, 3, 5, 7] then the sum is 16. + If the weights were specified as [1, 1, 0, 0] then the sum would be 4. - This metric creates one variable, `total`, that is used to compute the sum of - `values`. This is ultimately returned as `sum`. + This metric creates one variable, `total`, that is used to compute the sum + of `values`. This is ultimately returned as `sum`. - If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of 0 - to mask values. + If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of + 0 to mask values. - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. - Standalone usage: + Standalone usage: - >>> m = tf.keras.metrics.Sum() - >>> m.update_state([1, 3, 5, 7]) - >>> m.result().numpy() - 16.0 + >>> m = tf.keras.metrics.Sum() + >>> m.update_state([1, 3, 5, 7]) + >>> m.result().numpy() + 16.0 - Usage with `compile()` API: + Usage with `compile()` API: - ```python - model.add_metric(tf.keras.metrics.Sum(name='sum_1')(outputs)) - model.compile(optimizer='sgd', loss='mse') - ``` - """ + ```python + model.add_metric(tf.keras.metrics.Sum(name='sum_1')(outputs)) + model.compile(optimizer='sgd', loss='mse') + ``` + """ - @dtensor_utils.inject_mesh - def __init__(self, name='sum', dtype=None): - super().__init__(reduction=metrics_utils.Reduction.SUM, - name=name, dtype=dtype) + @dtensor_utils.inject_mesh + def __init__(self, name="sum", dtype=None): + super().__init__( + reduction=metrics_utils.Reduction.SUM, name=name, dtype=dtype + ) -@keras_export('keras.metrics.Mean') +@keras_export("keras.metrics.Mean") class Mean(Reduce): - """Computes the (weighted) mean of the given values. + """Computes the (weighted) mean of the given values. - For example, if values is [1, 3, 5, 7] then the mean is 4. - If the weights were specified as [1, 1, 0, 0] then the mean would be 2. + For example, if values is [1, 3, 5, 7] then the mean is 4. + If the weights were specified as [1, 1, 0, 0] then the mean would be 2. - This metric creates two variables, `total` and `count` that are used to - compute the average of `values`. This average is ultimately returned as `mean` - which is an idempotent operation that simply divides `total` by `count`. 
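# --- Editor's aside (not part of this patch): the state each reduction
# creates, per `Reduce.__init__` above. SUM keeps only `total`; the
# mean-style reductions also allocate a `count` denominator.
import tensorflow as tf

print(len(tf.keras.metrics.Sum().variables))   # 1: just `total`
print(len(tf.keras.metrics.Mean().variables))  # 2: `total` and `count`
# --- end of editor's aside ---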
+ This metric creates two variables, `total` and `count` that are used to + compute the average of `values`. This average is ultimately returned as + `mean` which is an idempotent operation that simply divides `total` by + `count`. - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. - Standalone usage: + Standalone usage: - >>> m = tf.keras.metrics.Mean() - >>> m.update_state([1, 3, 5, 7]) - >>> m.result().numpy() - 4.0 - >>> m.reset_state() - >>> m.update_state([1, 3, 5, 7], sample_weight=[1, 1, 0, 0]) - >>> m.result().numpy() - 2.0 + >>> m = tf.keras.metrics.Mean() + >>> m.update_state([1, 3, 5, 7]) + >>> m.result().numpy() + 4.0 + >>> m.reset_state() + >>> m.update_state([1, 3, 5, 7], sample_weight=[1, 1, 0, 0]) + >>> m.result().numpy() + 2.0 - Usage with `compile()` API: + Usage with `compile()` API: - ```python - model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs)) - model.compile(optimizer='sgd', loss='mse') - ``` - """ + ```python + model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs)) + model.compile(optimizer='sgd', loss='mse') + ``` + """ - @dtensor_utils.inject_mesh - def __init__(self, name='mean', dtype=None): - super().__init__( - reduction=metrics_utils.Reduction.WEIGHTED_MEAN, name=name, dtype=dtype) + @dtensor_utils.inject_mesh + def __init__(self, name="mean", dtype=None): + super().__init__( + reduction=metrics_utils.Reduction.WEIGHTED_MEAN, + name=name, + dtype=dtype, + ) -@keras_export('keras.metrics.MeanMetricWrapper') +@keras_export("keras.metrics.MeanMetricWrapper") class MeanMetricWrapper(Mean): - """Wraps a stateless metric function with the Mean metric. + """Wraps a stateless metric function with the Mean metric. - You could use this class to quickly build a mean metric from a function. The - function needs to have the signature `fn(y_true, y_pred)` and return a - per-sample loss array. `MeanMetricWrapper.result()` will return - the average metric value across all samples seen so far. + You could use this class to quickly build a mean metric from a function. The + function needs to have the signature `fn(y_true, y_pred)` and return a + per-sample loss array. `MeanMetricWrapper.result()` will return + the average metric value across all samples seen so far. - For example: + For example: - ```python - def accuracy(y_true, y_pred): - return tf.cast(tf.math.equal(y_true, y_pred), tf.float32) + ```python + def accuracy(y_true, y_pred): + return tf.cast(tf.math.equal(y_true, y_pred), tf.float32) - accuracy_metric = tf.keras.metrics.MeanMetricWrapper(fn=accuracy) + accuracy_metric = tf.keras.metrics.MeanMetricWrapper(fn=accuracy) - keras_model.compile(..., metrics=accuracy_metric) - ``` + keras_model.compile(..., metrics=accuracy_metric) + ``` - Args: - fn: The metric function to wrap, with signature `fn(y_true, y_pred, - **kwargs)`. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - **kwargs: Keyword arguments to pass on to `fn`. - """ + Args: + fn: The metric function to wrap, with signature `fn(y_true, y_pred, + **kwargs)`. + name: (Optional) string name of the metric instance. 
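# --- Editor's aside (not part of this patch): WEIGHTED_MEAN divides by the
# sum of the weights, not by the number of values.
import tensorflow as tf

m = tf.keras.metrics.Mean()
m.update_state([2.0, 4.0], sample_weight=[0.5, 0.5])
print(m.result().numpy())  # 3.0: total = 1.0 + 2.0, count = 0.5 + 0.5 = 1.0
# --- end of editor's aside ---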
+ dtype: (Optional) data type of the metric result. + **kwargs: Keyword arguments to pass on to `fn`. + """ - @dtensor_utils.inject_mesh - def __init__(self, fn, name=None, dtype=None, **kwargs): - super().__init__(name=name, dtype=dtype) - self._fn = fn - self._fn_kwargs = kwargs + @dtensor_utils.inject_mesh + def __init__(self, fn, name=None, dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype) + self._fn = fn + self._fn_kwargs = kwargs - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates metric statistics. + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + `y_true` and `y_pred` should have the same shape. + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + sample_weight: Optional `sample_weight` acts as a + coefficient for the metric. If a scalar is provided, then the metric + is simply scaled by the given value. If `sample_weight` is a tensor + of size `[batch_size]`, then the metric for each sample of the batch + is rescaled by the corresponding element in the `sample_weight` + vector. If the shape of `sample_weight` is `[batch_size, d0, .. + dN-1]` (or can be broadcasted to this shape), then each metric + element of `y_pred` is scaled by the corresponding value of + `sample_weight`. (Note on `dN-1`: all metric functions reduce by 1 + dimension, usually the last axis (-1)). + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + [ + y_true, + y_pred, + ], sample_weight = metrics_utils.ragged_assert_compatible_and_get_flat_values( # noqa: E501 + [y_true, y_pred], sample_weight + ) + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true + ) + + ag_fn = tf.__internal__.autograph.tf_convert( + self._fn, tf.__internal__.autograph.control_status_ctx() + ) + matches = ag_fn(y_true, y_pred, **self._fn_kwargs) + mask = losses_utils.get_mask(matches) + sample_weight = losses_utils.apply_valid_mask( + matches, sample_weight, mask, self.reduction + ) + return super().update_state(matches, sample_weight=sample_weight) + + def get_config(self): + config = { + k: backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v + for k, v in self._fn_kwargs.items() + } + + if type(self) is MeanMetricWrapper: + # Only include function argument when the object is a + # MeanMetricWrapper and not a subclass. + config["fn"] = self._fn + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config): + from keras.metrics import get + + # Note that while MeanMetricWrapper itself isn't public, objects of this + # class may be created and added to the model by calling model.compile. + fn = config.pop("fn", None) + if cls is MeanMetricWrapper: + return cls(get(fn), **config) + return super(MeanMetricWrapper, cls).from_config(config) + + +@keras_export("keras.metrics.MeanTensor") +class MeanTensor(Metric): + """Computes the element-wise (weighted) mean of the given tensors. - `y_true` and `y_pred` should have the same shape. + `MeanTensor` returns a tensor with the same shape of the input tensors. The + mean value is updated by keeping local variables `total` and `count`. The + `total` tracks the sum of the weighted values, and `count` stores the sum of + the weighted counts. Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. 
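# --- Editor's aside (not part of this patch): a usage sketch for
# `MeanMetricWrapper`. `my_abs_error` is a hypothetical stateless per-sample
# function with the required fn(y_true, y_pred) signature; the Mean machinery
# averages its output across all updates.
import tensorflow as tf

def my_abs_error(y_true, y_pred):
    # Returns one value per sample, as MeanMetricWrapper expects.
    return tf.abs(y_true - y_pred)

m = tf.keras.metrics.MeanMetricWrapper(fn=my_abs_error)
m.update_state([[0.0], [1.0]], [[1.0], [1.0]])
print(m.result().numpy())  # 0.5: per-sample errors [1.0, 0.0], averaged
# --- end of editor's aside ---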
- y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - sample_weight: Optional `sample_weight` acts as a - coefficient for the metric. If a scalar is provided, then the metric is - simply scaled by the given value. If `sample_weight` is a tensor of size - `[batch_size]`, then the metric for each sample of the batch is rescaled - by the corresponding element in the `sample_weight` vector. If the shape - of `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be broadcasted - to this shape), then each metric element of `y_pred` is scaled by the - corresponding value of `sample_weight`. (Note on `dN-1`: all metric - functions reduce by 1 dimension, usually the last axis (-1)). - - Returns: - Update op. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + shape: (Optional) A list of integers, a tuple of integers, or a 1-D Tensor + of type int32. If not specified, the shape is inferred from the values + at the first call of update_state. + + Standalone usage: + + >>> m = tf.keras.metrics.MeanTensor() + >>> m.update_state([0, 1, 2, 3]) + >>> m.update_state([4, 5, 6, 7]) + >>> m.result().numpy() + array([2., 3., 4., 5.], dtype=float32) + + >>> m.update_state([12, 10, 8, 6], sample_weight= [0, 0.2, 0.5, 1]) + >>> m.result().numpy() + array([2. , 3.6363635, 4.8 , 5.3333335], dtype=float32) + + >>> m = tf.keras.metrics.MeanTensor(dtype=tf.float64, shape=(1, 4)) + >>> m.result().numpy() + array([[0., 0., 0., 0.]]) + >>> m.update_state([[0, 1, 2, 3]]) + >>> m.update_state([[4, 5, 6, 7]]) + >>> m.result().numpy() + array([[2., 3., 4., 5.]]) """ - y_true = tf.cast(y_true, self._dtype) - y_pred = tf.cast(y_pred, self._dtype) - [y_true, y_pred], sample_weight = ( - metrics_utils.ragged_assert_compatible_and_get_flat_values( - [y_true, y_pred], sample_weight)) - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( - y_pred, y_true) - - ag_fn = tf.__internal__.autograph.tf_convert(self._fn, tf.__internal__.autograph.control_status_ctx()) - matches = ag_fn(y_true, y_pred, **self._fn_kwargs) - return super().update_state( - matches, sample_weight=sample_weight) - - def get_config(self): - config = {} - - if type(self) is MeanMetricWrapper: # pylint: disable=unidiomatic-typecheck - # Only include function argument when the object is a MeanMetricWrapper - # and not a subclass. - config['fn'] = self._fn - - for k, v in self._fn_kwargs.items(): - config[k] = backend.eval(v) if is_tensor_or_variable(v) else v - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config): - from keras.metrics import get # pylint: disable=g-import-not-at-top - # Note that while MeanMetricWrapper itself isn't public, objects of this - # class may be created and added to the model by calling model.compile. - fn = config.pop('fn', None) - if cls is MeanMetricWrapper: - return cls(get(fn), **config) - return super(MeanMetricWrapper, cls).from_config(config) - - -@keras_export('keras.metrics.MeanTensor') -class MeanTensor(Metric): - """Computes the element-wise (weighted) mean of the given tensors. - - `MeanTensor` returns a tensor with the same shape of the input tensors. The - mean value is updated by keeping local variables `total` and `count`. The - `total` tracks the sum of the weighted values, and `count` stores the sum of - the weighted counts. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. 
- shape: (Optional) A list of integers, a tuple of integers, or a 1-D Tensor - of type int32. If not specified, the shape is inferred from the values at - the first call of update_state. - - Standalone usage: - - >>> m = tf.keras.metrics.MeanTensor() - >>> m.update_state([0, 1, 2, 3]) - >>> m.update_state([4, 5, 6, 7]) - >>> m.result().numpy() - array([2., 3., 4., 5.], dtype=float32) - - >>> m.update_state([12, 10, 8, 6], sample_weight= [0, 0.2, 0.5, 1]) - >>> m.result().numpy() - array([2. , 3.6363635, 4.8 , 5.3333335], dtype=float32) - - >>> m = tf.keras.metrics.MeanTensor(dtype=tf.float64, shape=(1, 4)) - >>> m.result().numpy() - array([[0., 0., 0., 0.]]) - >>> m.update_state([[0, 1, 2, 3]]) - >>> m.update_state([[4, 5, 6, 7]]) - >>> m.result().numpy() - array([[2., 3., 4., 5.]]) - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='mean_tensor', dtype=None, shape=None): - super().__init__(name=name, dtype=dtype) - self._shape = None - self._total = None - self._count = None - self._built = False - if shape is not None: - self._build(shape) - - def _build(self, shape): - self._shape = tf.TensorShape(shape) - self._build_input_shape = self._shape - # Create new state variables - self._total = self.add_weight( - name='total', shape=shape, initializer='zeros') - self._count = self.add_weight( - name='count', shape=shape, initializer='zeros') - with tf.init_scope(): - if not tf.executing_eagerly(): - backend._initialize_variables(backend._get_session()) # pylint: disable=protected-access - self._built = True - - @property - def total(self): - return self._total if self._built else None - - @property - def count(self): - return self._count if self._built else None - - def update_state(self, values, sample_weight=None): - """Accumulates statistics for computing the element-wise mean. - Args: - values: Per-example value. - sample_weight: Optional weighting of each example. Defaults to 1. + @dtensor_utils.inject_mesh + def __init__(self, name="mean_tensor", dtype=None, shape=None): + super().__init__(name=name, dtype=dtype) + self._shape = None + self._total = None + self._count = None + self._built = False + if shape is not None: + self._build(shape) + + def _build(self, shape): + self._shape = tf.TensorShape(shape) + self._build_input_shape = self._shape + # Create new state variables + self._total = self.add_weight( + name="total", shape=shape, initializer="zeros" + ) + self._count = self.add_weight( + name="count", shape=shape, initializer="zeros" + ) + with tf.init_scope(): + if not tf.executing_eagerly(): + backend._initialize_variables(backend._get_session()) + self._built = True + + @property + def total(self): + return self._total if self._built else None + + @property + def count(self): + return self._count if self._built else None + + def update_state(self, values, sample_weight=None): + """Accumulates statistics for computing the element-wise mean. + + Args: + values: Per-example value. + sample_weight: Optional weighting of each example. Defaults to `1`. + + Returns: + Update op. + """ + values = tf.cast(values, self._dtype) + if not self._built: + self._build(values.shape) + elif values.shape != self._shape: + raise ValueError( + "MeanTensor input values must always have the same " + "shape. Expected shape (set during the first call): " + f"{self._shape}. " + f"Got: {values.shape}." 
+ ) + + num_values = tf.ones_like(values) + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, self._dtype) + + # Update dimensions of weights to match with values if possible. + ( + values, + _, + sample_weight, + ) = losses_utils.squeeze_or_expand_dimensions( + values, sample_weight=sample_weight + ) + try: + # Broadcast weights if possible. + sample_weight = tf.__internal__.ops.broadcast_weights( + sample_weight, values + ) + except ValueError: + # Reduce values to same ndim as weight array + ndim = backend.ndim(values) + weight_ndim = backend.ndim(sample_weight) + values = tf.reduce_mean( + values, axis=list(range(weight_ndim, ndim)) + ) + + num_values = tf.multiply(num_values, sample_weight) + values = tf.multiply(values, sample_weight) + + update_total_op = self._total.assign_add(values) + with tf.control_dependencies([update_total_op]): + return self._count.assign_add(num_values) - Returns: - Update op. - """ - values = tf.cast(values, self._dtype) - if not self._built: - self._build(values.shape) - elif values.shape != self._shape: - raise ValueError( - 'MeanTensor input values must always have the same ' - f'shape. Expected shape (set during the first call): {self._shape}. ' - f'Got: {values.shape}.') - - num_values = tf.ones_like(values) - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, self._dtype) - - # Update dimensions of weights to match with values if possible. - values, _, sample_weight = losses_utils.squeeze_or_expand_dimensions( - values, sample_weight=sample_weight) - try: - # Broadcast weights if possible. - sample_weight = tf.__internal__.ops.broadcast_weights( - sample_weight, values) - except ValueError: - # Reduce values to same ndim as weight array - ndim = backend.ndim(values) - weight_ndim = backend.ndim(sample_weight) - values = tf.reduce_mean( - values, axis=list(range(weight_ndim, ndim))) - - num_values = tf.multiply(num_values, sample_weight) - values = tf.multiply(values, sample_weight) - - update_total_op = self._total.assign_add(values) - with tf.control_dependencies([update_total_op]): - return self._count.assign_add(num_values) - - def result(self): - if not self._built: - raise ValueError( - 'MeanTensor does not have any value yet. Please call the MeanTensor ' - 'instance or use `.update_state(value)` before retrieving the result.' - ) - return tf.math.divide_no_nan(self.total, self.count) - - def reset_state(self): - if self._built: - backend.batch_set_value([ - (v, np.zeros(v.shape.as_list())) for v in self.variables - ]) + def result(self): + if not self._built: + raise ValueError( + "MeanTensor does not have any value yet. Please call the " + "MeanTensor instance or use `.update_state(value)` " + "before retrieving the result." + ) + return tf.math.divide_no_nan(self.total, self.count) + + def reset_state(self): + if self._built: + backend.batch_set_value( + [(v, np.zeros(v.shape.as_list())) for v in self.variables] + ) class SumOverBatchSize(Reduce): - """Computes the weighted sum over batch size of the given values. + """Computes the weighted sum over batch size of the given values. - For example, if values is [1, 3, 5, 7] then the metric value is 4. - If the weights were specified as [1, 1, 0, 0] then the value would be 1. + For example, if values is [1, 3, 5, 7] then the metric value is 4. + If the weights were specified as [1, 1, 0, 0] then the value would be 1. - This metric creates two variables, `total` and `count` that are used to - compute the average of `values`. 
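# --- Editor's aside (not part of this patch): the two guard rails in
# `MeanTensor` above. `result()` before any update raises, and the element
# shape is frozen by the first `update_state()` call.
import tensorflow as tf

m = tf.keras.metrics.MeanTensor()
try:
    m.result()
except ValueError as e:
    print(e)  # MeanTensor does not have any value yet. ...

m.update_state([0.0, 1.0])           # state is built with shape (2,)
try:
    m.update_state([0.0, 1.0, 2.0])  # shape (3,) no longer matches
except ValueError as e:
    print(e)  # MeanTensor input values must always have the same shape. ...
# --- end of editor's aside ---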
This average is ultimately returned as sum - over batch size which is an idempotent operation that simply divides `total` - by `count`. + This metric creates two variables, `total` and `count` that are used to + compute the average of `values`. This average is ultimately returned as sum + over batch size which is an idempotent operation that simply divides `total` + by `count`. - If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of 0 - to mask values. - """ + If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of + 0 to mask values. + """ - def __init__(self, name='sum_over_batch_size', dtype=None): - super().__init__( - reduction=metrics_utils.Reduction.SUM_OVER_BATCH_SIZE, - name=name, - dtype=dtype) + def __init__(self, name="sum_over_batch_size", dtype=None): + super().__init__( + reduction=metrics_utils.Reduction.SUM_OVER_BATCH_SIZE, + name=name, + dtype=dtype, + ) class SumOverBatchSizeMetricWrapper(SumOverBatchSize): - """Wraps a function with the `SumOverBatchSizeMetricWrapper` metric.""" - - def __init__(self, fn, name=None, dtype=None, **kwargs): - """Creates a `SumOverBatchSizeMetricWrapper` instance. - - Args: - fn: The metric function to wrap, with signature `fn(y_true, y_pred, - **kwargs)`. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - **kwargs: The keyword arguments that are passed on to `fn`. - """ - super().__init__(name=name, dtype=dtype) - self._fn = fn - self._fn_kwargs = kwargs - - def update_state(self, y_true, y_pred, sample_weight=None): - y_true = tf.cast(y_true, self._dtype) - y_pred = tf.cast(y_pred, self._dtype) - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( - y_pred, y_true) - - ag_fn = tf.__internal__.autograph.tf_convert(self._fn, tf.__internal__.autograph.control_status_ctx()) - matches = ag_fn(y_true, y_pred, **self._fn_kwargs) - return super().update_state( - matches, sample_weight=sample_weight) + """Wraps a function with the `SumOverBatchSizeMetricWrapper` metric.""" + + def __init__(self, fn, name=None, dtype=None, **kwargs): + """Creates a `SumOverBatchSizeMetricWrapper` instance. + + Args: + fn: The metric function to wrap, with signature `fn(y_true, y_pred, + **kwargs)`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + **kwargs: The keyword arguments that are passed on to `fn`. 
+ """ + super().__init__(name=name, dtype=dtype) + self._fn = fn + self._fn_kwargs = kwargs - def get_config(self): - config = {} - for k, v in self._fn_kwargs.items(): - config[k] = backend.eval(v) if is_tensor_or_variable(v) else v - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true + ) + + ag_fn = tf.__internal__.autograph.tf_convert( + self._fn, tf.__internal__.autograph.control_status_ctx() + ) + matches = ag_fn(y_true, y_pred, **self._fn_kwargs) + mask = losses_utils.get_mask(matches) + sample_weight = losses_utils.apply_valid_mask( + matches, sample_weight, mask, self.reduction + ) + return super().update_state(matches, sample_weight=sample_weight) + + def get_config(self): + config = { + k: backend.eval(v) if tf_utils.is_tensor_or_variable(v) else v + for k, v in self._fn_kwargs.items() + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) def clone_metric(metric): - """Returns a clone of the metric if stateful, otherwise returns it as is.""" - if isinstance(metric, Metric): - with tf.init_scope(): - return metric.__class__.from_config(metric.get_config()) - return metric + """Returns a clone of the metric if stateful, otherwise returns it as is.""" + if isinstance(metric, Metric): + # Metrics created within a remotely-executed tf.function during + # parameter server evaluation should not be lifted out of the graph by + # `init_scope`. This way the metric variables can be local: freely + # usable and mutable within the function. This supports a visitation + # guarantee for model evaluation. 
+ if tf_utils.in_local_vars_context(): + return metric.__class__.from_config(metric.get_config()) + else: + with tf.init_scope(): + return metric.__class__.from_config(metric.get_config()) + return metric def clone_metrics(metrics): - """Clones the given metric list/dict.""" - return tf.nest.map_structure(clone_metric, metrics) + """Clones the given metric list/dict.""" + return tf.nest.map_structure(clone_metric, metrics) def is_built_in(cls): - return cls.__module__.startswith('.'.join(Metric.__module__.split('.')[:-1])) + return cls.__module__.startswith( + ".".join(Metric.__module__.split(".")[:-1]) + ) + + +class _MetricDict(dict): + """Wrapper for returned dictionary of metrics.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._metric_obj = None diff --git a/keras/metrics/base_metric_test.py b/keras/metrics/base_metric_test.py index 11ba02d0f3ca..d7287179f89f 100644 --- a/keras/metrics/base_metric_test.py +++ b/keras/metrics/base_metric_test.py @@ -17,727 +17,802 @@ import copy import os +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + +from keras import Model from keras import layers from keras import metrics -from keras import Model from keras.engine import base_layer from keras.engine import training as training_module from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class KerasSumTest(tf.test.TestCase, parameterized.TestCase): - - def test_sum(self): - with self.test_session(): - m = metrics.Sum(name='my_sum') - - # check config - self.assertEqual(m.name, 'my_sum') - self.assertTrue(m.stateful) - self.assertEqual(m.dtype, tf.float32) - self.assertLen(m.variables, 1) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # check initial state - self.assertEqual(self.evaluate(m.total), 0) - - # check __call__() - self.assertEqual(self.evaluate(m(100)), 100) - self.assertEqual(self.evaluate(m.total), 100) - - # check update_state() and result() + state accumulation + tensor input - update_op = m.update_state(tf.convert_to_tensor([1, 5])) - self.evaluate(update_op) - self.assertAlmostEqual(self.evaluate(m.result()), 106) - self.assertEqual(self.evaluate(m.total), 106) # 100 + 1 + 5 - - # check reset_state() - m.reset_state() - self.assertEqual(self.evaluate(m.total), 0) - - def test_sum_with_sample_weight(self): - m = metrics.Sum(dtype=tf.float64) - self.assertEqual(m.dtype, tf.float64) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # check scalar weight - result_t = m(100, sample_weight=0.5) - self.assertEqual(self.evaluate(result_t), 50) - self.assertEqual(self.evaluate(m.total), 50) - - # check weights not scalar and weights rank matches values rank - result_t = m([1, 5], sample_weight=[1, 0.2]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 52., 4) # 50 + 1 + 5 * 0.2 - self.assertAlmostEqual(self.evaluate(m.total), 52., 4) - - # check weights broadcast - result_t = m([1, 2], sample_weight=0.5) - self.assertAlmostEqual(self.evaluate(result_t), 53.5, 1) # 52 + 0.5 + 1 - self.assertAlmostEqual(self.evaluate(m.total), 53.5, 1) - - # check weights squeeze - result_t = m([1, 5], sample_weight=[[1], [0.2]]) - self.assertAlmostEqual(self.evaluate(result_t), 55.5, 1) # 53.5 + 1 + 1 - 
self.assertAlmostEqual(self.evaluate(m.total), 55.5, 1) - - # check weights expand - result_t = m([[1], [5]], sample_weight=[1, 0.2]) - self.assertAlmostEqual(self.evaluate(result_t), 57.5, 2) # 55.5 + 1 + 1 - self.assertAlmostEqual(self.evaluate(m.total), 57.5, 1) - - # check values reduced to the dimensions of weight - result_t = m([[[1., 2.], [3., 2.], [0.5, 4.]]], sample_weight=[0.5]) - result = np.round(self.evaluate(result_t), decimals=2) - # result = (prev: 57.5) + 0.5 + 1 + 1.5 + 1 + 0.25 + 2 - self.assertAlmostEqual(result, 63.75, 2) - self.assertAlmostEqual(self.evaluate(m.total), 63.75, 2) - - def test_sum_graph_with_placeholder(self): - with tf.compat.v1.get_default_graph().as_default(), self.cached_session() as sess: - m = metrics.Sum() - v = tf.compat.v1.placeholder(tf.float32) - w = tf.compat.v1.placeholder(tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # check __call__() - result_t = m(v, sample_weight=w) - result = sess.run(result_t, feed_dict=({v: 100, w: 0.5})) - self.assertEqual(result, 50) - self.assertEqual(self.evaluate(m.total), 50) - - # check update_state() and result() - result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]})) - self.assertAlmostEqual(result, 52., 2) # 50 + 1 + 5 * 0.2 - self.assertAlmostEqual(self.evaluate(m.total), 52., 2) - - def test_save_restore(self): - with self.test_session(): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') - m = metrics.Sum() - checkpoint = tf.train.Checkpoint(sum=m) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # update state - self.evaluate(m(100.)) - self.evaluate(m(200.)) - - # save checkpoint and then add an update - save_path = checkpoint.save(checkpoint_prefix) - self.evaluate(m(1000.)) - - # restore to the same checkpoint sum object (= 300) - checkpoint.restore(save_path).assert_consumed().run_restore_ops() - self.evaluate(m(300.)) - self.assertEqual(600., self.evaluate(m.result())) - - # restore to a different checkpoint sum object - restore_sum = metrics.Sum() - restore_checkpoint = tf.train.Checkpoint(sum=restore_sum) - status = restore_checkpoint.restore(save_path) - restore_update = restore_sum(300.) 
- status.assert_consumed().run_restore_ops() - self.evaluate(restore_update) - self.assertEqual(600., self.evaluate(restore_sum.result())) + def test_sum(self): + with self.test_session(): + m = metrics.Sum(name="my_sum") + + # check config + self.assertEqual(m.name, "my_sum") + self.assertTrue(m.stateful) + self.assertEqual(m.dtype, tf.float32) + self.assertLen(m.variables, 1) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # check initial state + self.assertEqual(self.evaluate(m.total), 0) + + # check __call__() + self.assertEqual(self.evaluate(m(100)), 100) + self.assertEqual(self.evaluate(m.total), 100) + + # check update_state() and result() + state accumulation + tensor + # input + update_op = m.update_state(tf.convert_to_tensor([1, 5])) + self.evaluate(update_op) + self.assertAlmostEqual(self.evaluate(m.result()), 106) + self.assertEqual(self.evaluate(m.total), 106) # 100 + 1 + 5 + + # check reset_state() + m.reset_state() + self.assertEqual(self.evaluate(m.total), 0) + + def test_sum_with_sample_weight(self): + m = metrics.Sum(dtype=tf.float64) + self.assertEqual(m.dtype, tf.float64) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # check scalar weight + result_t = m(100, sample_weight=0.5) + self.assertEqual(self.evaluate(result_t), 50) + self.assertEqual(self.evaluate(m.total), 50) + + # check weights not scalar and weights rank matches values rank + result_t = m([1, 5], sample_weight=[1, 0.2]) + result = self.evaluate(result_t) + self.assertAlmostEqual(result, 52.0, 4) # 50 + 1 + 5 * 0.2 + self.assertAlmostEqual(self.evaluate(m.total), 52.0, 4) + + # check weights broadcast + result_t = m([1, 2], sample_weight=0.5) + self.assertAlmostEqual(self.evaluate(result_t), 53.5, 1) # 52 + 0.5 + 1 + self.assertAlmostEqual(self.evaluate(m.total), 53.5, 1) + + # check weights squeeze + result_t = m([1, 5], sample_weight=[[1], [0.2]]) + self.assertAlmostEqual(self.evaluate(result_t), 55.5, 1) # 53.5 + 1 + 1 + self.assertAlmostEqual(self.evaluate(m.total), 55.5, 1) + + # check weights expand + result_t = m([[1], [5]], sample_weight=[1, 0.2]) + self.assertAlmostEqual(self.evaluate(result_t), 57.5, 2) # 55.5 + 1 + 1 + self.assertAlmostEqual(self.evaluate(m.total), 57.5, 1) + + # check values reduced to the dimensions of weight + result_t = m( + [[[1.0, 2.0], [3.0, 2.0], [0.5, 4.0]]], sample_weight=[0.5] + ) + result = np.round(self.evaluate(result_t), decimals=2) + # result = (prev: 57.5) + 0.5 + 1 + 1.5 + 1 + 0.25 + 2 + self.assertAlmostEqual(result, 63.75, 2) + self.assertAlmostEqual(self.evaluate(m.total), 63.75, 2) + + def test_sum_graph_with_placeholder(self): + with tf.compat.v1.get_default_graph().as_default(), self.cached_session() as sess: # noqa: E501 + m = metrics.Sum() + v = tf.compat.v1.placeholder(tf.float32) + w = tf.compat.v1.placeholder(tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # check __call__() + result_t = m(v, sample_weight=w) + result = sess.run(result_t, feed_dict=({v: 100, w: 0.5})) + self.assertEqual(result, 50) + self.assertEqual(self.evaluate(m.total), 50) + + # check update_state() and result() + result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]})) + self.assertAlmostEqual(result, 52.0, 2) # 50 + 1 + 5 * 0.2 + self.assertAlmostEqual(self.evaluate(m.total), 52.0, 2) + + def test_save_restore(self): + with self.test_session(): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + m = metrics.Sum() + checkpoint = 
tf.train.Checkpoint(sum=m) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # update state + self.evaluate(m(100.0)) + self.evaluate(m(200.0)) + + # save checkpoint and then add an update + save_path = checkpoint.save(checkpoint_prefix) + self.evaluate(m(1000.0)) + + # restore to the same checkpoint sum object (= 300) + checkpoint.restore(save_path).assert_consumed().run_restore_ops() + self.evaluate(m(300.0)) + self.assertEqual(600.0, self.evaluate(m.result())) + + # restore to a different checkpoint sum object + restore_sum = metrics.Sum() + restore_checkpoint = tf.train.Checkpoint(sum=restore_sum) + status = restore_checkpoint.restore(save_path) + restore_update = restore_sum(300.0) + status.assert_consumed().run_restore_ops() + self.evaluate(restore_update) + self.assertEqual(600.0, self.evaluate(restore_sum.result())) + + def test_init_scope_during_add_weight(self): + seen_variables = 0 + + def capture_variable_creation(next_creator_fn, **kwargs) -> tf.Variable: + nonlocal seen_variables + seen_variables += 1 + return tf.constant(seen_variables) + + @tf.function + def create_variables(): + # When this method is called in a graph context, any usage of + # `tf.init_scope` will bypass this variable creator scope, resulting + # in different behavior. + with tf.variable_creator_scope(capture_variable_creation): + return metrics.Sum().variables + + metric_variables = self.evaluate(create_variables()) + # The Sum metric contains a single `total` variable, which the creation + # scope has changed to a `1` tensor. + self.assertAllEqual([1], metric_variables) class MeanTest(test_combinations.TestCase): - # TODO(b/120949004): Re-enable garbage collection check - # @tf_test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) - @test_combinations.run_all_keras_modes - def test_mean(self): - m = metrics.Mean(name='my_mean') - - # check config - self.assertEqual(m.name, 'my_mean') - self.assertTrue(m.stateful) - self.assertEqual(m.dtype, tf.float32) - self.assertEqual(len(m.variables), 2) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # check initial state - self.assertEqual(self.evaluate(m.total), 0) - self.assertEqual(self.evaluate(m.count), 0) - - # check __call__() - self.assertEqual(self.evaluate(m(100)), 100) - self.assertEqual(self.evaluate(m.total), 100) - self.assertEqual(self.evaluate(m.count), 1) - - # check update_state() and result() + state accumulation + tensor input - update_op = m.update_state([ - tf.convert_to_tensor(1), - tf.convert_to_tensor(5) - ]) - self.evaluate(update_op) - self.assertAlmostEqual(self.evaluate(m.result()), 106 / 3, 2) - self.assertEqual(self.evaluate(m.total), 106) # 100 + 1 + 5 - self.assertEqual(self.evaluate(m.count), 3) - - # check reset_state() - m.reset_state() - self.assertEqual(self.evaluate(m.total), 0) - self.assertEqual(self.evaluate(m.count), 0) - - # Check save and restore config - m2 = metrics.Mean.from_config(m.get_config()) - self.assertEqual(m2.name, 'my_mean') - self.assertTrue(m2.stateful) - self.assertEqual(m2.dtype, tf.float32) - self.assertEqual(len(m2.variables), 2) - - @test_utils.run_v2_only - def test_function_wrapped_reset_state(self): - m = metrics.Mean(name='my_mean') - - # check reset_state in function. 
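# --- Editor's aside (not part of this patch): a standalone sketch of the
# mechanism test_init_scope_during_add_weight relies on. A
# tf.variable_creator_scope observes every variable created under it, so it
# can spy on (or replace) the state a metric builds in add_weight.
import tensorflow as tf

created = []

def spy_creator(next_creator, **kwargs):
    created.append(kwargs.get("name"))  # record the variable's name
    return next_creator(**kwargs)       # then create it normally

with tf.variable_creator_scope(spy_creator):
    v = tf.Variable(0.0, name="total")

print(created)  # ["total"]
# --- end of editor's aside ---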
- @tf.function - def reset_in_fn(): - m.reset_state() - return m.update_state(100) - - for _ in range(5): - self.evaluate(reset_in_fn()) - self.assertEqual(self.evaluate(m.count), 1) - - @test_combinations.run_all_keras_modes - def test_mean_with_sample_weight(self): - m = metrics.Mean(dtype=tf.float64) - self.assertEqual(m.dtype, tf.float64) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # check scalar weight - result_t = m(100, sample_weight=0.5) - self.assertEqual(self.evaluate(result_t), 50 / 0.5) - self.assertEqual(self.evaluate(m.total), 50) - self.assertEqual(self.evaluate(m.count), 0.5) - - # check weights not scalar and weights rank matches values rank - result_t = m([1, 5], sample_weight=[1, 0.2]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 52 / 1.7, 2) - self.assertAlmostEqual(self.evaluate(m.total), 52, 2) # 50 + 1 + 5 * 0.2 - self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2) # 0.5 + 1.2 - - # check weights broadcast - result_t = m([1, 2], sample_weight=0.5) - self.assertAlmostEqual(self.evaluate(result_t), 53.5 / 2.7, 2) - self.assertAlmostEqual(self.evaluate(m.total), 53.5, 2) # 52 + 0.5 + 1 - self.assertAlmostEqual(self.evaluate(m.count), 2.7, 2) # 1.7 + 0.5 + 0.5 - - # check weights squeeze - result_t = m([1, 5], sample_weight=[[1], [0.2]]) - self.assertAlmostEqual(self.evaluate(result_t), 55.5 / 3.9, 2) - self.assertAlmostEqual(self.evaluate(m.total), 55.5, 2) # 53.5 + 1 + 1 - self.assertAlmostEqual(self.evaluate(m.count), 3.9, 2) # 2.7 + 1.2 - - # check weights expand - result_t = m([[1], [5]], sample_weight=[1, 0.2]) - self.assertAlmostEqual(self.evaluate(result_t), 57.5 / 5.1, 2) - self.assertAlmostEqual(self.evaluate(m.total), 57.5, 2) # 55.5 + 1 + 1 - self.assertAlmostEqual(self.evaluate(m.count), 5.1, 2) # 3.9 + 1.2 - - # check values reduced to the dimensions of weight - result_t = m([[[1., 2.], [3., 2.], [0.5, 4.]]], sample_weight=[0.5]) - result = np.round(self.evaluate(result_t), decimals=2) # 58.5 / 5.6 - self.assertEqual(result, 10.45) - self.assertEqual(np.round(self.evaluate(m.total), decimals=2), 58.54) - self.assertEqual(np.round(self.evaluate(m.count), decimals=2), 5.6) - - @test_combinations.run_all_keras_modes - def test_mean_graph_with_placeholder(self): - with tf.compat.v1.get_default_graph().as_default(), self.cached_session() as sess: - m = metrics.Mean() - v = tf.compat.v1.placeholder(tf.float32) - w = tf.compat.v1.placeholder(tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # check __call__() - result_t = m(v, sample_weight=w) - result = sess.run(result_t, feed_dict=({v: 100, w: 0.5})) - self.assertEqual(self.evaluate(m.total), 50) - self.assertEqual(self.evaluate(m.count), 0.5) - self.assertEqual(result, 50 / 0.5) - - # check update_state() and result() - result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]})) - self.assertAlmostEqual(self.evaluate(m.total), 52, 2) # 50 + 1 + 5 * 0.2 - self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2) # 0.5 + 1.2 - self.assertAlmostEqual(result, 52 / 1.7, 2) - - @test_combinations.run_all_keras_modes - def test_save_restore(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') - m = metrics.Mean() - checkpoint = tf.train.Checkpoint(mean=m) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - - # update state - self.evaluate(m(100.)) - self.evaluate(m(200.)) - - # save checkpoint and then add an update - save_path = 
checkpoint.save(checkpoint_prefix) - self.evaluate(m(1000.)) - - # restore to the same checkpoint mean object - checkpoint.restore(save_path).assert_consumed().run_restore_ops() - self.evaluate(m(300.)) - self.assertEqual(200., self.evaluate(m.result())) - - # restore to a different checkpoint mean object - restore_mean = metrics.Mean() - restore_checkpoint = tf.train.Checkpoint(mean=restore_mean) - status = restore_checkpoint.restore(save_path) - restore_update = restore_mean(300.) - status.assert_consumed().run_restore_ops() - self.evaluate(restore_update) - self.assertEqual(200., self.evaluate(restore_mean.result())) - self.assertEqual(3, self.evaluate(restore_mean.count)) - - @test_combinations.run_all_keras_modes - def test_multiple_instances(self): - m = metrics.Mean() - m2 = metrics.Mean() - - self.assertEqual(m.name, 'mean') - self.assertEqual(m2.name, 'mean') - - self.assertEqual([v.name for v in m.variables], - test_utils.get_expected_metric_variable_names( - ['total', 'count'])) - self.assertEqual([v.name for v in m2.variables], - test_utils.get_expected_metric_variable_names( - ['total', 'count'], name_suffix='_1')) - - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - self.evaluate(tf.compat.v1.variables_initializer(m2.variables)) - - # check initial state - self.assertEqual(self.evaluate(m.total), 0) - self.assertEqual(self.evaluate(m.count), 0) - self.assertEqual(self.evaluate(m2.total), 0) - self.assertEqual(self.evaluate(m2.count), 0) - - # check __call__() - self.assertEqual(self.evaluate(m(100)), 100) - self.assertEqual(self.evaluate(m.total), 100) - self.assertEqual(self.evaluate(m.count), 1) - self.assertEqual(self.evaluate(m2.total), 0) - self.assertEqual(self.evaluate(m2.count), 0) - - self.assertEqual(self.evaluate(m2([63, 10])), 36.5) - self.assertEqual(self.evaluate(m2.total), 73) - self.assertEqual(self.evaluate(m2.count), 2) - self.assertEqual(self.evaluate(m.result()), 100) - self.assertEqual(self.evaluate(m.total), 100) - self.assertEqual(self.evaluate(m.count), 1) - - @test_utils.run_v2_only - def test_deepcopy_of_metrics(self): - m = metrics.Mean(name='my_mean') - - m.reset_state() - m.update_state(100) - m_copied = copy.deepcopy(m) - m_copied.update_state(200) - - self.assertEqual(self.evaluate(m.result()), 100) - self.assertEqual(self.evaluate(m_copied.result()), 150) - - m.reset_state() - - self.assertEqual(self.evaluate(m.result()), 0) - self.assertEqual(self.evaluate(m_copied.result()), 150) + # TODO(b/120949004): Re-enable garbage collection check + # @tf_test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + @test_combinations.run_all_keras_modes + def test_mean(self): + m = metrics.Mean(name="my_mean") + + # check config + self.assertEqual(m.name, "my_mean") + self.assertTrue(m.stateful) + self.assertEqual(m.dtype, tf.float32) + self.assertEqual(len(m.variables), 2) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # check initial state + self.assertEqual(self.evaluate(m.total), 0) + self.assertEqual(self.evaluate(m.count), 0) + + # check __call__() + self.assertEqual(self.evaluate(m(100)), 100) + self.assertEqual(self.evaluate(m.total), 100) + self.assertEqual(self.evaluate(m.count), 1) + + # check update_state() and result() + state accumulation + tensor input + update_op = m.update_state( + [tf.convert_to_tensor(1), tf.convert_to_tensor(5)] + ) + self.evaluate(update_op) + self.assertAlmostEqual(self.evaluate(m.result()), 106 / 3, 2) + self.assertEqual(self.evaluate(m.total), 106) # 100 + 1 + 5 + 
self.assertEqual(self.evaluate(m.count), 3) + + # check reset_state() + m.reset_state() + self.assertEqual(self.evaluate(m.total), 0) + self.assertEqual(self.evaluate(m.count), 0) + + # Check save and restore config + m2 = metrics.Mean.from_config(m.get_config()) + self.assertEqual(m2.name, "my_mean") + self.assertTrue(m2.stateful) + self.assertEqual(m2.dtype, tf.float32) + self.assertEqual(len(m2.variables), 2) + + @test_utils.run_v2_only + def test_function_wrapped_reset_state(self): + m = metrics.Mean(name="my_mean") + + # check reset_state in function. + @tf.function + def reset_in_fn(): + m.reset_state() + m.update_state(100) + + for _ in range(5): + self.evaluate(reset_in_fn()) + self.assertEqual(self.evaluate(m.count), 1) + + @test_combinations.run_all_keras_modes + def test_mean_with_sample_weight(self): + m = metrics.Mean(dtype=tf.float64) + self.assertEqual(m.dtype, tf.float64) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # check scalar weight + result_t = m(100, sample_weight=0.5) + self.assertEqual(self.evaluate(result_t), 50 / 0.5) + self.assertEqual(self.evaluate(m.total), 50) + self.assertEqual(self.evaluate(m.count), 0.5) + + # check weights not scalar and weights rank matches values rank + result_t = m([1, 5], sample_weight=[1, 0.2]) + result = self.evaluate(result_t) + self.assertAlmostEqual(result, 52 / 1.7, 2) + self.assertAlmostEqual( + self.evaluate(m.total), 52, 2 + ) # 50 + 1 + 5 * 0.2 + self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2) # 0.5 + 1.2 + + # check weights broadcast + result_t = m([1, 2], sample_weight=0.5) + self.assertAlmostEqual(self.evaluate(result_t), 53.5 / 2.7, 2) + self.assertAlmostEqual(self.evaluate(m.total), 53.5, 2) # 52 + 0.5 + 1 + self.assertAlmostEqual( + self.evaluate(m.count), 2.7, 2 + ) # 1.7 + 0.5 + 0.5 + + # check weights squeeze + result_t = m([1, 5], sample_weight=[[1], [0.2]]) + self.assertAlmostEqual(self.evaluate(result_t), 55.5 / 3.9, 2) + self.assertAlmostEqual(self.evaluate(m.total), 55.5, 2) # 53.5 + 1 + 1 + self.assertAlmostEqual(self.evaluate(m.count), 3.9, 2) # 2.7 + 1.2 + + # check weights expand + result_t = m([[1], [5]], sample_weight=[1, 0.2]) + self.assertAlmostEqual(self.evaluate(result_t), 57.5 / 5.1, 2) + self.assertAlmostEqual(self.evaluate(m.total), 57.5, 2) # 55.5 + 1 + 1 + self.assertAlmostEqual(self.evaluate(m.count), 5.1, 2) # 3.9 + 1.2 + + # check values reduced to the dimensions of weight + result_t = m( + [[[1.0, 2.0], [3.0, 2.0], [0.5, 4.0]]], sample_weight=[0.5] + ) + result = np.round(self.evaluate(result_t), decimals=2) # 58.5 / 5.6 + self.assertEqual(result, 10.45) + self.assertEqual(np.round(self.evaluate(m.total), decimals=2), 58.54) + self.assertEqual(np.round(self.evaluate(m.count), decimals=2), 5.6) + + @test_combinations.run_all_keras_modes + def test_mean_graph_with_placeholder(self): + with tf.compat.v1.get_default_graph().as_default(), self.cached_session() as sess: # noqa: E501 + m = metrics.Mean() + v = tf.compat.v1.placeholder(tf.float32) + w = tf.compat.v1.placeholder(tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # check __call__() + result_t = m(v, sample_weight=w) + result = sess.run(result_t, feed_dict=({v: 100, w: 0.5})) + self.assertEqual(self.evaluate(m.total), 50) + self.assertEqual(self.evaluate(m.count), 0.5) + self.assertEqual(result, 50 / 0.5) + + # check update_state() and result() + result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]})) + self.assertAlmostEqual( + self.evaluate(m.total), 52, 2 + ) # 50 
+ 1 + 5 * 0.2 + self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2) # 0.5 + 1.2 + self.assertAlmostEqual(result, 52 / 1.7, 2) + + @test_combinations.run_all_keras_modes + def test_save_restore(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + m = metrics.Mean() + checkpoint = tf.train.Checkpoint(mean=m) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + + # update state + self.evaluate(m(100.0)) + self.evaluate(m(200.0)) + + # save checkpoint and then add an update + save_path = checkpoint.save(checkpoint_prefix) + self.evaluate(m(1000.0)) + + # restore to the same checkpoint mean object + checkpoint.restore(save_path).assert_consumed().run_restore_ops() + self.evaluate(m(300.0)) + self.assertEqual(200.0, self.evaluate(m.result())) + + # restore to a different checkpoint mean object + restore_mean = metrics.Mean() + restore_checkpoint = tf.train.Checkpoint(mean=restore_mean) + status = restore_checkpoint.restore(save_path) + restore_update = restore_mean(300.0) + status.assert_consumed().run_restore_ops() + self.evaluate(restore_update) + self.assertEqual(200.0, self.evaluate(restore_mean.result())) + self.assertEqual(3, self.evaluate(restore_mean.count)) + + @test_combinations.run_all_keras_modes + def test_multiple_instances(self): + m = metrics.Mean() + m2 = metrics.Mean() + + self.assertEqual(m.name, "mean") + self.assertEqual(m2.name, "mean") + + self.assertEqual( + [v.name for v in m.variables], + test_utils.get_expected_metric_variable_names(["total", "count"]), + ) + self.assertEqual( + [v.name for v in m2.variables], + test_utils.get_expected_metric_variable_names( + ["total", "count"], name_suffix="_1" + ), + ) + + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + self.evaluate(tf.compat.v1.variables_initializer(m2.variables)) + + # check initial state + self.assertEqual(self.evaluate(m.total), 0) + self.assertEqual(self.evaluate(m.count), 0) + self.assertEqual(self.evaluate(m2.total), 0) + self.assertEqual(self.evaluate(m2.count), 0) + + # check __call__() + self.assertEqual(self.evaluate(m(100)), 100) + self.assertEqual(self.evaluate(m.total), 100) + self.assertEqual(self.evaluate(m.count), 1) + self.assertEqual(self.evaluate(m2.total), 0) + self.assertEqual(self.evaluate(m2.count), 0) + + self.assertEqual(self.evaluate(m2([63, 10])), 36.5) + self.assertEqual(self.evaluate(m2.total), 73) + self.assertEqual(self.evaluate(m2.count), 2) + self.assertEqual(self.evaluate(m.result()), 100) + self.assertEqual(self.evaluate(m.total), 100) + self.assertEqual(self.evaluate(m.count), 1) + + @test_utils.run_v2_only + def test_deepcopy_of_metrics(self): + m = metrics.Mean(name="my_mean") + + m.reset_state() + m.update_state(100) + m_copied = copy.deepcopy(m) + m_copied.update_state(200) + + self.assertEqual(self.evaluate(m.result()), 100) + self.assertEqual(self.evaluate(m_copied.result()), 150) + + m.reset_state() + + self.assertEqual(self.evaluate(m.result()), 0) + self.assertEqual(self.evaluate(m_copied.result()), 150) class MeanTensorTest(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_config(self): - with self.test_session(): - m = metrics.MeanTensor(name='mean_by_element') - - # check config - self.assertEqual(m.name, 'mean_by_element') - self.assertTrue(m.stateful) - self.assertEqual(m.dtype, tf.float32) - self.assertEmpty(m.variables) - - with self.assertRaisesRegex(ValueError, 'does not 
have any value yet'): - m.result() - - self.evaluate(m([[3], [5], [3]])) - self.assertAllEqual(m._shape, [3, 1]) - - m2 = metrics.MeanTensor.from_config(m.get_config()) - self.assertEqual(m2.name, 'mean_by_element') - self.assertTrue(m2.stateful) - self.assertEqual(m2.dtype, tf.float32) - self.assertEmpty(m2.variables) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_unweighted(self): - with self.test_session(): - m = metrics.MeanTensor(dtype=tf.float64) - - # check __call__() - self.assertAllClose(self.evaluate(m([100, 40])), [100, 40]) - self.assertAllClose(self.evaluate(m.total), [100, 40]) - self.assertAllClose(self.evaluate(m.count), [1, 1]) - - # check update_state() and result() + state accumulation + tensor input - update_op = m.update_state([ - tf.convert_to_tensor(1), - tf.convert_to_tensor(5) - ]) - self.evaluate(update_op) - self.assertAllClose(self.evaluate(m.result()), [50.5, 22.5]) - self.assertAllClose(self.evaluate(m.total), [101, 45]) - self.assertAllClose(self.evaluate(m.count), [2, 2]) - - # check reset_state() - m.reset_state() - self.assertAllClose(self.evaluate(m.total), [0, 0]) - self.assertAllClose(self.evaluate(m.count), [0, 0]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_weighted(self): - with self.test_session(): - m = metrics.MeanTensor(dtype=tf.float64) - self.assertEqual(m.dtype, tf.float64) - - # check scalar weight - result_t = m([100, 30], sample_weight=0.5) - self.assertAllClose(self.evaluate(result_t), [100, 30]) - self.assertAllClose(self.evaluate(m.total), [50, 15]) - self.assertAllClose(self.evaluate(m.count), [0.5, 0.5]) - - # check weights not scalar and weights rank matches values rank - result_t = m([1, 5], sample_weight=[1, 0.2]) - result = self.evaluate(result_t) - self.assertAllClose(result, [51 / 1.5, 16 / 0.7], 2) - self.assertAllClose(self.evaluate(m.total), [51, 16]) - self.assertAllClose(self.evaluate(m.count), [1.5, 0.7]) - - # check weights broadcast - result_t = m([1, 2], sample_weight=0.5) - self.assertAllClose(self.evaluate(result_t), [51.5 / 2, 17 / 1.2]) - self.assertAllClose(self.evaluate(m.total), [51.5, 17]) - self.assertAllClose(self.evaluate(m.count), [2, 1.2]) - - # check weights squeeze - result_t = m([1, 5], sample_weight=[[1], [0.2]]) - self.assertAllClose(self.evaluate(result_t), [52.5 / 3, 18 / 1.4]) - self.assertAllClose(self.evaluate(m.total), [52.5, 18]) - self.assertAllClose(self.evaluate(m.count), [3, 1.4]) - - # check weights expand - m = metrics.MeanTensor(dtype=tf.float64) - self.evaluate(tf.compat.v1.variables_initializer(m.variables)) - result_t = m([[1], [5]], sample_weight=[1, 0.2]) - self.assertAllClose(self.evaluate(result_t), [[1], [5]]) - self.assertAllClose(self.evaluate(m.total), [[1], [1]]) - self.assertAllClose(self.evaluate(m.count), [[1], [0.2]]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_invalid_value_shape(self): - m = metrics.MeanTensor(dtype=tf.float64) - m([1]) - with self.assertRaisesRegex( - ValueError, 'MeanTensor input values must always have the same shape'): - m([1, 5]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_build_in_tf_function(self): - """Ensure that variables are created correctly in a tf function.""" - m = metrics.MeanTensor(dtype=tf.float64) - - @tf.function - def call_metric(x): - return m(x) - - with self.test_session(): - 
self.assertAllClose(self.evaluate(call_metric([100, 40])), [100, 40]) - self.assertAllClose(self.evaluate(m.total), [100, 40]) - self.assertAllClose(self.evaluate(m.count), [1, 1]) - self.assertAllClose(self.evaluate(call_metric([20, 2])), [60, 21]) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_in_keras_model(self): - class ModelWithMetric(Model): - - def __init__(self): - super().__init__() - self.dense1 = layers.Dense( - 3, activation='relu', kernel_initializer='ones') - self.dense2 = layers.Dense( - 1, activation='sigmoid', kernel_initializer='ones') - self.mean_tensor = metrics.MeanTensor() - - def call(self, x): - x = self.dense1(x) - x = self.dense2(x) - self.mean_tensor(self.dense1.kernel) - return x - - model = ModelWithMetric() - model.compile( - loss='mae', - optimizer='rmsprop', - run_eagerly=True) - - x = np.ones((100, 4)) - y = np.zeros((100, 1)) - model.evaluate(x, y, batch_size=50) - self.assertAllClose(self.evaluate(model.mean_tensor.result()), - np.ones((4, 3))) - self.assertAllClose(self.evaluate(model.mean_tensor.total), - np.full((4, 3), 2)) - self.assertAllClose(self.evaluate(model.mean_tensor.count), - np.full((4, 3), 2)) - - model.evaluate(x, y, batch_size=25) - self.assertAllClose(self.evaluate(model.mean_tensor.result()), - np.ones((4, 3))) - self.assertAllClose(self.evaluate(model.mean_tensor.total), - np.full((4, 3), 4)) - self.assertAllClose(self.evaluate(model.mean_tensor.count), - np.full((4, 3), 4)) + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_config(self): + with self.test_session(): + m = metrics.MeanTensor(name="mean_by_element") + + # check config + self.assertEqual(m.name, "mean_by_element") + self.assertTrue(m.stateful) + self.assertEqual(m.dtype, tf.float32) + self.assertEmpty(m.variables) + + with self.assertRaisesRegex( + ValueError, "does not have any value yet" + ): + m.result() + + self.evaluate(m([[3], [5], [3]])) + self.assertAllEqual(m._shape, [3, 1]) + + m2 = metrics.MeanTensor.from_config(m.get_config()) + self.assertEqual(m2.name, "mean_by_element") + self.assertTrue(m2.stateful) + self.assertEqual(m2.dtype, tf.float32) + self.assertEmpty(m2.variables) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_unweighted(self): + with self.test_session(): + m = metrics.MeanTensor(dtype=tf.float64) + + # check __call__() + self.assertAllClose(self.evaluate(m([100, 40])), [100, 40]) + self.assertAllClose(self.evaluate(m.total), [100, 40]) + self.assertAllClose(self.evaluate(m.count), [1, 1]) + + # check update_state() and result() + state accumulation + tensor + # input + update_op = m.update_state( + [tf.convert_to_tensor(1), tf.convert_to_tensor(5)] + ) + self.evaluate(update_op) + self.assertAllClose(self.evaluate(m.result()), [50.5, 22.5]) + self.assertAllClose(self.evaluate(m.total), [101, 45]) + self.assertAllClose(self.evaluate(m.count), [2, 2]) + + # check reset_state() + m.reset_state() + self.assertAllClose(self.evaluate(m.total), [0, 0]) + self.assertAllClose(self.evaluate(m.count), [0, 0]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_weighted(self): + with self.test_session(): + m = metrics.MeanTensor(dtype=tf.float64) + self.assertEqual(m.dtype, tf.float64) + + # check scalar weight + result_t = m([100, 30], sample_weight=0.5) + self.assertAllClose(self.evaluate(result_t), [100, 30]) + self.assertAllClose(self.evaluate(m.total), [50, 15]) + 
self.assertAllClose(self.evaluate(m.count), [0.5, 0.5]) + + # check weights not scalar and weights rank matches values rank + result_t = m([1, 5], sample_weight=[1, 0.2]) + result = self.evaluate(result_t) + self.assertAllClose(result, [51 / 1.5, 16 / 0.7], 2) + self.assertAllClose(self.evaluate(m.total), [51, 16]) + self.assertAllClose(self.evaluate(m.count), [1.5, 0.7]) + + # check weights broadcast + result_t = m([1, 2], sample_weight=0.5) + self.assertAllClose(self.evaluate(result_t), [51.5 / 2, 17 / 1.2]) + self.assertAllClose(self.evaluate(m.total), [51.5, 17]) + self.assertAllClose(self.evaluate(m.count), [2, 1.2]) + + # check weights squeeze + result_t = m([1, 5], sample_weight=[[1], [0.2]]) + self.assertAllClose(self.evaluate(result_t), [52.5 / 3, 18 / 1.4]) + self.assertAllClose(self.evaluate(m.total), [52.5, 18]) + self.assertAllClose(self.evaluate(m.count), [3, 1.4]) + + # check weights expand + m = metrics.MeanTensor(dtype=tf.float64) + self.evaluate(tf.compat.v1.variables_initializer(m.variables)) + result_t = m([[1], [5]], sample_weight=[1, 0.2]) + self.assertAllClose(self.evaluate(result_t), [[1], [5]]) + self.assertAllClose(self.evaluate(m.total), [[1], [1]]) + self.assertAllClose(self.evaluate(m.count), [[1], [0.2]]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_invalid_value_shape(self): + m = metrics.MeanTensor(dtype=tf.float64) + m([1]) + with self.assertRaisesRegex( + ValueError, + "MeanTensor input values must always have the same shape", + ): + m([1, 5]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_build_in_tf_function(self): + """Ensure that variables are created correctly in a tf function.""" + m = metrics.MeanTensor(dtype=tf.float64) + + @tf.function + def call_metric(x): + return m(x) + + with self.test_session(): + self.assertAllClose( + self.evaluate(call_metric([100, 40])), [100, 40] + ) + self.assertAllClose(self.evaluate(m.total), [100, 40]) + self.assertAllClose(self.evaluate(m.count), [1, 1]) + self.assertAllClose(self.evaluate(call_metric([20, 2])), [60, 21]) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_in_keras_model(self): + class ModelWithMetric(Model): + def __init__(self): + super().__init__() + self.dense1 = layers.Dense( + 3, activation="relu", kernel_initializer="ones" + ) + self.dense2 = layers.Dense( + 1, activation="sigmoid", kernel_initializer="ones" + ) + self.mean_tensor = metrics.MeanTensor() + + def call(self, x): + x = self.dense1(x) + x = self.dense2(x) + self.mean_tensor(self.dense1.kernel) + return x + + model = ModelWithMetric() + model.compile(loss="mae", optimizer="rmsprop", run_eagerly=True) + + x = np.ones((100, 4)) + y = np.zeros((100, 1)) + model.evaluate(x, y, batch_size=50) + self.assertAllClose( + self.evaluate(model.mean_tensor.result()), np.ones((4, 3)) + ) + self.assertAllClose( + self.evaluate(model.mean_tensor.total), np.full((4, 3), 2) + ) + self.assertAllClose( + self.evaluate(model.mean_tensor.count), np.full((4, 3), 2) + ) + + model.evaluate(x, y, batch_size=25) + self.assertAllClose( + self.evaluate(model.mean_tensor.result()), np.ones((4, 3)) + ) + self.assertAllClose( + self.evaluate(model.mean_tensor.total), np.full((4, 3), 4) + ) + self.assertAllClose( + self.evaluate(model.mean_tensor.count), np.full((4, 3), 4) + ) class BinaryTruePositives(metrics.Metric): + def __init__(self, name="binary_true_positives", **kwargs): + super().__init__(name=name, **kwargs) + 
self.true_positives = self.add_weight(name="tp", initializer="zeros") - def __init__(self, name='binary_true_positives', **kwargs): - super().__init__(name=name, **kwargs) - self.true_positives = self.add_weight(name='tp', initializer='zeros') + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.cast(y_true, tf.bool) + y_pred = tf.cast(y_pred, tf.bool) - def update_state(self, y_true, y_pred, sample_weight=None): - y_true = tf.cast(y_true, tf.bool) - y_pred = tf.cast(y_pred, tf.bool) + values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True)) + values = tf.cast(values, self.dtype) + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, dtype=self.dtype) + sample_weight = tf.__internal__.ops.broadcast_weights( + sample_weight, values + ) + values = tf.multiply(values, sample_weight) + self.true_positives.assign_add(tf.reduce_sum(values)) - values = tf.logical_and( - tf.equal(y_true, True), tf.equal(y_pred, True)) - values = tf.cast(values, self.dtype) - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, dtype=self.dtype) - sample_weight = tf.__internal__.ops.broadcast_weights( - sample_weight, values) - values = tf.multiply(values, sample_weight) - self.true_positives.assign_add(tf.reduce_sum(values)) - - def result(self): - return self.true_positives + def result(self): + return self.true_positives class BinaryTruePositivesViaControlFlow(metrics.Metric): + def __init__(self, name="binary_true_positives", **kwargs): + super().__init__(name=name, **kwargs) + self.true_positives = self.add_weight(name="tp", initializer="zeros") - def __init__(self, name='binary_true_positives', **kwargs): - super().__init__(name=name, **kwargs) - self.true_positives = self.add_weight(name='tp', initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - y_true = tf.cast(y_true, tf.bool) - y_pred = tf.cast(y_pred, tf.bool) + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.cast(y_true, tf.bool) + y_pred = tf.cast(y_pred, tf.bool) - for i in range(len(y_true)): - for j in range(len(y_true[i])): - if y_true[i][j] and y_pred[i][j]: - if sample_weight is None: - self.true_positives.assign_add(1) - else: - self.true_positives.assign_add(sample_weight[i][0]) + for i in range(len(y_true)): + for j in range(len(y_true[i])): + if y_true[i][j] and y_pred[i][j]: + if sample_weight is None: + self.true_positives.assign_add(1) + else: + self.true_positives.assign_add(sample_weight[i][0]) - def result(self): - if tf.constant(True): - return self.true_positives - return 0.0 + def result(self): + if tf.constant(True): + return self.true_positives + return 0.0 -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CustomMetricsTest(tf.test.TestCase): - - def test_config(self): - btp_obj = BinaryTruePositives(name='btp', dtype=tf.int32) - self.assertEqual(btp_obj.name, 'btp') - self.assertEqual(btp_obj.dtype, tf.int32) - - # Check save and restore config - btp_obj2 = BinaryTruePositives.from_config(btp_obj.get_config()) - self.assertEqual(btp_obj2.name, 'btp') - self.assertEqual(btp_obj2.dtype, tf.int32) - - def test_unweighted(self): - btp_obj = BinaryTruePositives() - self.evaluate(tf.compat.v1.variables_initializer(btp_obj.variables)) - y_true = tf.constant([[0, 0.9, 0, 1, 0], [0, 0, 1, 1, 1], - [1, 1, 1, 1, 0], [0, 0, 0, 0, 1.5]]) - y_pred = tf.constant([[0, 0, 1, 5, 0], [1, 1, 1, 1, 1], - 
[0, 1, 0, 1, 0], [1, 10, 1, 1, 1]]) - - update_op = btp_obj.update_state(y_true, y_pred) # pylint: disable=assignment-from-no-return - self.evaluate(update_op) - result = btp_obj.result() - self.assertEqual(7, self.evaluate(result)) - - def test_weighted(self): - btp_obj = BinaryTruePositives() - self.evaluate(tf.compat.v1.variables_initializer(btp_obj.variables)) - y_true = tf.constant([[0, 0.9, 0, 1, 0], [0, 0, 1, 1, 1], - [1, 1, 1, 1, 0], [0, 0, 0, 0, 1.5]]) - y_pred = tf.constant([[0, 0, 1, 5, 0], [1, 1, 1, 1, 1], - [0, 1, 0, 1, 0], [1, 10, 1, 1, 1]]) - sample_weight = tf.constant([[1.], [1.5], [2.], [2.5]]) - result = btp_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertEqual(12, self.evaluate(result)) - - def test_autograph(self): - metric = BinaryTruePositivesViaControlFlow() - self.evaluate(tf.compat.v1.variables_initializer(metric.variables)) - y_true = tf.constant([[0, 0.9, 0, 1, 0], [0, 0, 1, 1, 1], - [1, 1, 1, 1, 0], [0, 0, 0, 0, 1.5]]) - y_pred = tf.constant([[0, 0, 1, 5, 0], [1, 1, 1, 1, 1], - [0, 1, 0, 1, 0], [1, 10, 1, 1, 1]]) - sample_weight = tf.constant([[1.], [1.5], [2.], [2.5]]) - - @tf.function - def compute_metric(y_true, y_pred, sample_weight): - metric(y_true, y_pred, sample_weight) - return metric.result() - - result = compute_metric(y_true, y_pred, sample_weight) - self.assertEqual(12, self.evaluate(result)) - - def test_metric_wrappers_autograph(self): - def metric_fn(y_true, y_pred): - x = tf.constant(0.0) - for i in range(len(y_true)): - for j in range(len(y_true[i])): - if tf.equal(y_true[i][j], y_pred[i][j]) and y_true[i][j] > 0: - x += 1.0 - return x - - mean_metric = metrics.MeanMetricWrapper(metric_fn) - sum_metric = metrics.SumOverBatchSizeMetricWrapper(metric_fn) - self.evaluate(tf.compat.v1.variables_initializer(mean_metric.variables)) - self.evaluate(tf.compat.v1.variables_initializer(sum_metric.variables)) - - y_true = tf.constant([[0, 0, 0, 1, 0], - [0, 0, 1, 1, 1], - [1, 1, 1, 1, 0], - [1, 1, 1, 0, 1]]) - y_pred = tf.constant([[0, 0, 1, 1, 0], - [1, 1, 1, 1, 1], - [0, 1, 0, 1, 0], - [1, 1, 1, 1, 1]]) - - @tf.function - def tf_functioned_metric_fn(metric, y_true, y_pred): - return metric(y_true, y_pred) - - metric_result = tf_functioned_metric_fn(mean_metric, y_true, y_pred) - self.assertAllClose(self.evaluate(metric_result), 10, 1e-2) - metric_result = tf_functioned_metric_fn(sum_metric, y_true, y_pred) - self.assertAllClose(self.evaluate(metric_result), 10, 1e-2) - - def test_metric_not_tracked_as_sublayer_in_layer(self): - - class MyLayer(base_layer.Layer): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.mean_obj = metrics.Mean(name='my_mean_obj') - - def call(self, x): - self.add_metric( - tf.reduce_sum(x), aggregation='mean', name='my_mean_tensor') - self.add_metric(self.mean_obj(x)) - return x - - layer = MyLayer() - x = np.ones((1, 1)) - layer(x) - self.assertLen(list(layer._flatten_layers(include_self=False)), 0) - self.assertLen(layer.metrics, 2) - - def test_metric_not_tracked_as_sublayer_in_model(self): - - class MyModel(training_module.Model): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.mean_obj = metrics.Mean(name='my_mean_obj') - - def call(self, x): - self.add_metric( - tf.reduce_sum(x), aggregation='mean', name='my_mean_tensor') - self.add_metric(self.mean_obj(x)) - return x - - model = MyModel() - x = np.ones((1, 1)) - model(x) - self.assertLen(list(model._flatten_layers(include_self=False)), 0) - self.assertLen(model.layers, 0) - self.assertLen(model.metrics, 2) - - def 
test_invalid_custom_metric_class_error_msg(self): - x = layers.Input(shape=(2,)) - y = layers.Dense(3)(x) - model = training_module.Model(x, y) - - class BadMetric(metrics.Metric): - - def update_state(self, y_true, y_pred, sample_weight=None): - return - - def result(self): - return - - with self.assertRaisesRegex(RuntimeError, - 'can only be a single'): - model.compile('sgd', - 'mse', - metrics=[BadMetric()]) - model.fit(np.ones((10, 2)), np.ones((10, 3))) - - def test_invalid_custom_metric_fn_error_msg(self): - x = layers.Input(shape=(2,)) - y = layers.Dense(3)(x) - model = training_module.Model(x, y) - - def bad_metric(y_true, y_pred, sample_weight=None): # pylint: disable=unused-argument - return None - - def dict_metric(y_true, y_pred, sample_weight=None): # pylint: disable=unused-argument - return {'value': 0.} - - with self.assertRaisesRegex(RuntimeError, - 'The output of a metric function can only be'): - model.compile('sgd', - 'mse', - metrics=[bad_metric]) - model.fit(np.ones((10, 2)), np.ones((10, 3))) - with self.assertRaisesRegex(RuntimeError, - 'To return a dict of values, implement'): - model.compile('sgd', - 'mse', - metrics=[dict_metric]) - model.fit(np.ones((10, 2)), np.ones((10, 3))) - - -if __name__ == '__main__': - tf.test.main() + def test_config(self): + btp_obj = BinaryTruePositives(name="btp", dtype=tf.int32) + self.assertEqual(btp_obj.name, "btp") + self.assertEqual(btp_obj.dtype, tf.int32) + + # Check save and restore config + btp_obj2 = BinaryTruePositives.from_config(btp_obj.get_config()) + self.assertEqual(btp_obj2.name, "btp") + self.assertEqual(btp_obj2.dtype, tf.int32) + + def test_unweighted(self): + btp_obj = BinaryTruePositives() + self.evaluate(tf.compat.v1.variables_initializer(btp_obj.variables)) + y_true = tf.constant( + [ + [0, 0.9, 0, 1, 0], + [0, 0, 1, 1, 1], + [1, 1, 1, 1, 0], + [0, 0, 0, 0, 1.5], + ] + ) + y_pred = tf.constant( + [ + [0, 0, 1, 5, 0], + [1, 1, 1, 1, 1], + [0, 1, 0, 1, 0], + [1, 10, 1, 1, 1], + ] + ) + + update_op = btp_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = btp_obj.result() + self.assertEqual(7, self.evaluate(result)) + + def test_weighted(self): + btp_obj = BinaryTruePositives() + self.evaluate(tf.compat.v1.variables_initializer(btp_obj.variables)) + y_true = tf.constant( + [ + [0, 0.9, 0, 1, 0], + [0, 0, 1, 1, 1], + [1, 1, 1, 1, 0], + [0, 0, 0, 0, 1.5], + ] + ) + y_pred = tf.constant( + [ + [0, 0, 1, 5, 0], + [1, 1, 1, 1, 1], + [0, 1, 0, 1, 0], + [1, 10, 1, 1, 1], + ] + ) + sample_weight = tf.constant([[1.0], [1.5], [2.0], [2.5]]) + result = btp_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertEqual(12, self.evaluate(result)) + + def test_autograph(self): + metric = BinaryTruePositivesViaControlFlow() + self.evaluate(tf.compat.v1.variables_initializer(metric.variables)) + y_true = tf.constant( + [ + [0, 0.9, 0, 1, 0], + [0, 0, 1, 1, 1], + [1, 1, 1, 1, 0], + [0, 0, 0, 0, 1.5], + ] + ) + y_pred = tf.constant( + [ + [0, 0, 1, 5, 0], + [1, 1, 1, 1, 1], + [0, 1, 0, 1, 0], + [1, 10, 1, 1, 1], + ] + ) + sample_weight = tf.constant([[1.0], [1.5], [2.0], [2.5]]) + + @tf.function + def compute_metric(y_true, y_pred, sample_weight): + metric(y_true, y_pred, sample_weight) + return metric.result() + + result = compute_metric(y_true, y_pred, sample_weight) + self.assertEqual(12, self.evaluate(result)) + + def test_metric_wrappers_autograph(self): + def metric_fn(y_true, y_pred): + x = tf.constant(0.0) + for i in range(len(y_true)): + for j in range(len(y_true[i])): + if ( + 
tf.equal(y_true[i][j], y_pred[i][j]) + and y_true[i][j] > 0 + ): + x += 1.0 + return x + + mean_metric = metrics.MeanMetricWrapper(metric_fn) + sum_metric = metrics.SumOverBatchSizeMetricWrapper(metric_fn) + self.evaluate(tf.compat.v1.variables_initializer(mean_metric.variables)) + self.evaluate(tf.compat.v1.variables_initializer(sum_metric.variables)) + + y_true = tf.constant( + [[0, 0, 0, 1, 0], [0, 0, 1, 1, 1], [1, 1, 1, 1, 0], [1, 1, 1, 0, 1]] + ) + y_pred = tf.constant( + [[0, 0, 1, 1, 0], [1, 1, 1, 1, 1], [0, 1, 0, 1, 0], [1, 1, 1, 1, 1]] + ) + + @tf.function + def tf_functioned_metric_fn(metric, y_true, y_pred): + return metric(y_true, y_pred) + + metric_result = tf_functioned_metric_fn(mean_metric, y_true, y_pred) + self.assertAllClose(self.evaluate(metric_result), 10, 1e-2) + metric_result = tf_functioned_metric_fn(sum_metric, y_true, y_pred) + self.assertAllClose(self.evaluate(metric_result), 10, 1e-2) + + def test_metric_not_tracked_as_sublayer_in_layer(self): + class MyLayer(base_layer.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.mean_obj = metrics.Mean(name="my_mean_obj") + + def call(self, x): + self.add_metric( + tf.reduce_sum(x), aggregation="mean", name="my_mean_tensor" + ) + self.add_metric(self.mean_obj(x)) + return x + + layer = MyLayer() + x = np.ones((1, 1)) + layer(x) + self.assertLen(list(layer._flatten_layers(include_self=False)), 0) + self.assertLen(layer.metrics, 2) + + def test_metric_not_tracked_as_sublayer_in_model(self): + class MyModel(training_module.Model): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.mean_obj = metrics.Mean(name="my_mean_obj") + + def call(self, x): + self.add_metric( + tf.reduce_sum(x), aggregation="mean", name="my_mean_tensor" + ) + self.add_metric(self.mean_obj(x)) + return x + + model = MyModel() + x = np.ones((1, 1)) + model(x) + self.assertLen(list(model._flatten_layers(include_self=False)), 0) + self.assertLen(model.layers, 0) + self.assertLen(model.metrics, 2) + + def test_invalid_custom_metric_class_error_msg(self): + x = layers.Input(shape=(2,)) + y = layers.Dense(3)(x) + model = training_module.Model(x, y) + + class BadMetric(metrics.Metric): + def update_state(self, y_true, y_pred, sample_weight=None): + return + + def result(self): + return + + with self.assertRaisesRegex(RuntimeError, "can only be a single"): + model.compile("sgd", "mse", metrics=[BadMetric()]) + model.fit(np.ones((10, 2)), np.ones((10, 3))) + + def test_invalid_custom_metric_fn_error_msg(self): + x = layers.Input(shape=(2,)) + y = layers.Dense(3)(x) + model = training_module.Model(x, y) + + def bad_metric(y_true, y_pred, sample_weight=None): + return None + + def dict_metric(y_true, y_pred, sample_weight=None): + return {"value": 0.0} + + with self.assertRaisesRegex( + RuntimeError, "The output of a metric function can only be" + ): + model.compile("sgd", "mse", metrics=[bad_metric]) + model.fit(np.ones((10, 2)), np.ones((10, 3))) + with self.assertRaisesRegex( + RuntimeError, "To return a dict of values, implement" + ): + model.compile("sgd", "mse", metrics=[dict_metric]) + model.fit(np.ones((10, 2)), np.ones((10, 3))) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/confusion_matrix_test.py b/keras/metrics/confusion_matrix_test.py deleted file mode 100644 index cf8889218a3a..000000000000 --- a/keras/metrics/confusion_matrix_test.py +++ /dev/null @@ -1,1897 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
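
Note on the metrics_test.py hunk above: both custom test classes follow the standard Keras custom-metric contract, subclass metrics.Metric, create state with add_weight(), mutate it in update_state(), and read it in result(). A minimal self-contained sketch of that contract; the SumAbsError metric is illustrative only and not part of this patch:

import tensorflow as tf
from keras import metrics

class SumAbsError(metrics.Metric):
    # Illustrative only: accumulates total absolute error across updates.
    def __init__(self, name="sum_abs_error", **kwargs):
        super().__init__(name=name, **kwargs)
        self.total = self.add_weight(name="total", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        values = tf.abs(tf.cast(y_true, self.dtype) - tf.cast(y_pred, self.dtype))
        if sample_weight is not None:
            values = values * tf.cast(sample_weight, self.dtype)
        self.total.assign_add(tf.reduce_sum(values))

    def result(self):
        return self.total

m = SumAbsError()
m.update_state([1.0, 2.0], [1.5, 1.0])  # |1 - 1.5| + |2 - 1| = 1.5
assert abs(float(m.result()) - 1.5) < 1e-6
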
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Keras metrics functions.""" - -import tensorflow.compat.v2 as tf - -import json - -from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations -from keras import layers -from keras import metrics -from keras import models -from keras.utils import metrics_utils -from tensorflow.python.platform import tf_logging - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class FalsePositivesTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9]) - self.assertEqual(fp_obj.name, 'my_fp') - self.assertLen(fp_obj.variables, 1) - self.assertEqual(fp_obj.thresholds, [0.4, 0.9]) - - # Check save and restore config - fp_obj2 = metrics.FalsePositives.from_config(fp_obj.get_config()) - self.assertEqual(fp_obj2.name, 'my_fp') - self.assertLen(fp_obj2.variables, 1) - self.assertEqual(fp_obj2.thresholds, [0.4, 0.9]) - - def test_unweighted(self): - fp_obj = metrics.FalsePositives() - self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) - - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = fp_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = fp_obj.result() - self.assertAllClose(7., result) - - def test_weighted(self): - fp_obj = metrics.FalsePositives() - self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = fp_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(14., self.evaluate(result)) - - def test_unweighted_with_thresholds(self): - fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - - update_op = fp_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = fp_obj.result() - self.assertAllClose([7., 4., 2.], result) - - def test_weighted_with_thresholds(self): - fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0), 
- (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0)) - - result = fp_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose([125., 42., 12.], self.evaluate(result)) - - def test_threshold_limit(self): - with self.assertRaisesRegex( - ValueError, - r'Threshold values must be in \[0, 1\]. Received: \[-1, 2\]'): - metrics.FalsePositives(thresholds=[-1, 0.5, 2]) - - with self.assertRaisesRegex( - ValueError, - r'Threshold values must be in \[0, 1\]. Received: \[None\]'): - metrics.FalsePositives(thresholds=[None]) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class FalseNegativesTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9]) - self.assertEqual(fn_obj.name, 'my_fn') - self.assertLen(fn_obj.variables, 1) - self.assertEqual(fn_obj.thresholds, [0.4, 0.9]) - - # Check save and restore config - fn_obj2 = metrics.FalseNegatives.from_config(fn_obj.get_config()) - self.assertEqual(fn_obj2.name, 'my_fn') - self.assertLen(fn_obj2.variables, 1) - self.assertEqual(fn_obj2.thresholds, [0.4, 0.9]) - - def test_unweighted(self): - fn_obj = metrics.FalseNegatives() - self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) - - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = fn_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = fn_obj.result() - self.assertAllClose(3., result) - - def test_weighted(self): - fn_obj = metrics.FalseNegatives() - self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = fn_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(5., self.evaluate(result)) - - def test_unweighted_with_thresholds(self): - fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - - update_op = fn_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = fn_obj.result() - self.assertAllClose([1., 4., 6.], result) - - def test_weighted_with_thresholds(self): - fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,)) - - result = fn_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose([4., 16., 23.], self.evaluate(result)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class TrueNegativesTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9]) - self.assertEqual(tn_obj.name, 'my_tn') - self.assertLen(tn_obj.variables, 1) - self.assertEqual(tn_obj.thresholds, [0.4, 
0.9]) - - # Check save and restore config - tn_obj2 = metrics.TrueNegatives.from_config(tn_obj.get_config()) - self.assertEqual(tn_obj2.name, 'my_tn') - self.assertLen(tn_obj2.variables, 1) - self.assertEqual(tn_obj2.thresholds, [0.4, 0.9]) - - def test_unweighted(self): - tn_obj = metrics.TrueNegatives() - self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) - - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = tn_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = tn_obj.result() - self.assertAllClose(3., result) - - def test_weighted(self): - tn_obj = metrics.TrueNegatives() - self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = tn_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(4., self.evaluate(result)) - - def test_unweighted_with_thresholds(self): - tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - - update_op = tn_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = tn_obj.result() - self.assertAllClose([2., 5., 7.], result) - - def test_weighted_with_thresholds(self): - tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - sample_weight = ((0.0, 2.0, 3.0, 5.0),) - - result = tn_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose([5., 15., 23.], self.evaluate(result)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class TruePositivesTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9]) - self.assertEqual(tp_obj.name, 'my_tp') - self.assertLen(tp_obj.variables, 1) - self.assertEqual(tp_obj.thresholds, [0.4, 0.9]) - - # Check save and restore config - tp_obj2 = metrics.TruePositives.from_config(tp_obj.get_config()) - self.assertEqual(tp_obj2.name, 'my_tp') - self.assertLen(tp_obj2.variables, 1) - self.assertEqual(tp_obj2.thresholds, [0.4, 0.9]) - - def test_unweighted(self): - tp_obj = metrics.TruePositives() - self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) - - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = tp_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = tp_obj.result() - self.assertAllClose(7., result) - - def test_weighted(self): - tp_obj = metrics.TruePositives() - self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 
0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = tp_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(12., self.evaluate(result)) - - def test_unweighted_with_thresholds(self): - tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - - update_op = tp_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = tp_obj.result() - self.assertAllClose([6., 3., 1.], result) - - def test_weighted_with_thresholds(self): - tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85]) - self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) - - y_pred = tf.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), - (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3))) - y_true = tf.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), - (1, 1, 1, 1))) - - result = tp_obj(y_true, y_pred, sample_weight=37.) - self.assertAllClose([222., 111., 37.], self.evaluate(result)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class PrecisionTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - p_obj = metrics.Precision( - name='my_precision', thresholds=[0.4, 0.9], top_k=15, class_id=12) - self.assertEqual(p_obj.name, 'my_precision') - self.assertLen(p_obj.variables, 2) - self.assertEqual([v.name for v in p_obj.variables], - ['true_positives:0', 'false_positives:0']) - self.assertEqual(p_obj.thresholds, [0.4, 0.9]) - self.assertEqual(p_obj.top_k, 15) - self.assertEqual(p_obj.class_id, 12) - - # Check save and restore config - p_obj2 = metrics.Precision.from_config(p_obj.get_config()) - self.assertEqual(p_obj2.name, 'my_precision') - self.assertLen(p_obj2.variables, 2) - self.assertEqual(p_obj2.thresholds, [0.4, 0.9]) - self.assertEqual(p_obj2.top_k, 15) - self.assertEqual(p_obj2.class_id, 12) - - def test_value_is_idempotent(self): - p_obj = metrics.Precision(thresholds=[0.3, 0.72]) - y_pred = tf.random.uniform(shape=(10, 3)) - y_true = tf.random.uniform(shape=(10, 3)) - update_op = p_obj.update_state(y_true, y_pred) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - - # Run several updates. - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. 
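
The idempotency pattern used in this and the following test classes can be reproduced in a few lines of eager code; a sketch, outside the graph/eager test harness used here:

import tensorflow as tf
from keras import metrics

p = metrics.Precision(thresholds=[0.3, 0.72])
p.update_state([1, 0, 1], [0.9, 0.6, 0.4])
first = p.result().numpy()
# result() only reads the accumulator variables; repeated calls return
# the same values until update_state() mutates the state again.
for _ in range(5):
    assert (p.result().numpy() == first).all()
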
- initial_precision = self.evaluate(p_obj.result()) - for _ in range(10): - self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()), - 1e-3) - - def test_unweighted(self): - p_obj = metrics.Precision() - y_pred = tf.constant([1, 0, 1, 0], shape=(1, 4)) - y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(0.5, self.evaluate(result)) - - def test_unweighted_all_incorrect(self): - p_obj = metrics.Precision(thresholds=[0.5]) - inputs = np.random.randint(0, 2, size=(100, 1)) - y_pred = tf.constant(inputs) - y_true = tf.constant(1 - inputs) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(0, self.evaluate(result)) - - def test_weighted(self): - p_obj = metrics.Precision() - y_pred = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]]) - y_true = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - result = p_obj( - y_true, - y_pred, - sample_weight=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]])) - weighted_tp = 3.0 + 4.0 - weighted_positives = (1.0 + 3.0) + (4.0 + 2.0) - expected_precision = weighted_tp / weighted_positives - self.assertAlmostEqual(expected_precision, self.evaluate(result)) - - def test_div_by_zero(self): - p_obj = metrics.Precision() - y_pred = tf.constant([0, 0, 0, 0]) - y_true = tf.constant([0, 0, 0, 0]) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - result = p_obj(y_true, y_pred) - self.assertEqual(0, self.evaluate(result)) - - def test_unweighted_with_threshold(self): - p_obj = metrics.Precision(thresholds=[0.5, 0.7]) - y_pred = tf.constant([1, 0, 0.6, 0], shape=(1, 4)) - y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - result = p_obj(y_true, y_pred) - self.assertArrayNear([0.5, 0.], self.evaluate(result), 0) - - def test_weighted_with_threshold(self): - p_obj = metrics.Precision(thresholds=[0.5, 1.]) - y_true = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) - y_pred = tf.constant([[1, 0], [0.6, 0]], - shape=(2, 2), - dtype=tf.float32) - weights = tf.constant([[4, 0], [3, 1]], - shape=(2, 2), - dtype=tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - result = p_obj(y_true, y_pred, sample_weight=weights) - weighted_tp = 0 + 3. - weighted_positives = (0 + 3.) + (4. + 0.) - expected_precision = weighted_tp / weighted_positives - self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3) - - def test_multiple_updates(self): - p_obj = metrics.Precision(thresholds=[0.5, 1.]) - y_true = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) - y_pred = tf.constant([[1, 0], [0.6, 0]], - shape=(2, 2), - dtype=tf.float32) - weights = tf.constant([[4, 0], [3, 1]], - shape=(2, 2), - dtype=tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights) - for _ in range(2): - self.evaluate(update_op) - - weighted_tp = (0 + 3.) + (0 + 3.) - weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. 
+ 0.)) - expected_precision = weighted_tp / weighted_positives - self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()), - 1e-3) - - def test_unweighted_top_k(self): - p_obj = metrics.Precision(top_k=3) - y_pred = tf.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(1. / 3, self.evaluate(result)) - - def test_weighted_top_k(self): - p_obj = metrics.Precision(top_k=3) - y_pred1 = tf.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5)) - y_true1 = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - self.evaluate( - p_obj( - y_true1, - y_pred1, - sample_weight=tf.constant([[1, 4, 2, 3, 5]]))) - - y_pred2 = tf.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5)) - y_true2 = tf.constant([1, 0, 1, 1, 1], shape=(1, 5)) - result = p_obj(y_true2, y_pred2, sample_weight=tf.constant(3)) - - tp = (2 + 5) + (3 + 3) - predicted_positives = (1 + 2 + 5) + (3 + 3 + 3) - expected_precision = tp / predicted_positives - self.assertAlmostEqual(expected_precision, self.evaluate(result)) - - def test_unweighted_class_id(self): - p_obj = metrics.Precision(class_id=2) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - - y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) - self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) - - y_pred = tf.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) - self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) - - y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 0, 0, 0], shape=(1, 5)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(0.5, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) - self.assertAlmostEqual(1, self.evaluate(p_obj.false_positives)) - - def test_unweighted_top_k_and_class_id(self): - p_obj = metrics.Precision(class_id=2, top_k=2) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - - y_pred = tf.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) - self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) - - y_pred = tf.constant([1, 1, 0.9, 1, 1], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = p_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) - self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) - - def test_unweighted_top_k_and_threshold(self): - p_obj = metrics.Precision(thresholds=.7, top_k=2) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - - y_pred = tf.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) - result = p_obj(y_true, y_pred) - 
self.assertAlmostEqual(1, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) - self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class RecallTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - r_obj = metrics.Recall( - name='my_recall', thresholds=[0.4, 0.9], top_k=15, class_id=12) - self.assertEqual(r_obj.name, 'my_recall') - self.assertLen(r_obj.variables, 2) - self.assertEqual([v.name for v in r_obj.variables], - ['true_positives:0', 'false_negatives:0']) - self.assertEqual(r_obj.thresholds, [0.4, 0.9]) - self.assertEqual(r_obj.top_k, 15) - self.assertEqual(r_obj.class_id, 12) - - # Check save and restore config - r_obj2 = metrics.Recall.from_config(r_obj.get_config()) - self.assertEqual(r_obj2.name, 'my_recall') - self.assertLen(r_obj2.variables, 2) - self.assertEqual(r_obj2.thresholds, [0.4, 0.9]) - self.assertEqual(r_obj2.top_k, 15) - self.assertEqual(r_obj2.class_id, 12) - - def test_value_is_idempotent(self): - r_obj = metrics.Recall(thresholds=[0.3, 0.72]) - y_pred = tf.random.uniform(shape=(10, 3)) - y_true = tf.random.uniform(shape=(10, 3)) - update_op = r_obj.update_state(y_true, y_pred) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - - # Run several updates. - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. - initial_recall = self.evaluate(r_obj.result()) - for _ in range(10): - self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3) - - def test_unweighted(self): - r_obj = metrics.Recall() - y_pred = tf.constant([1, 0, 1, 0], shape=(1, 4)) - y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(0.5, self.evaluate(result)) - - def test_unweighted_all_incorrect(self): - r_obj = metrics.Recall(thresholds=[0.5]) - inputs = np.random.randint(0, 2, size=(100, 1)) - y_pred = tf.constant(inputs) - y_true = tf.constant(1 - inputs) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(0, self.evaluate(result)) - - def test_weighted(self): - r_obj = metrics.Recall() - y_pred = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]]) - y_true = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - result = r_obj( - y_true, - y_pred, - sample_weight=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]])) - weighted_tp = 3.0 + 1.0 - weighted_t = (2.0 + 3.0) + (4.0 + 1.0) - expected_recall = weighted_tp / weighted_t - self.assertAlmostEqual(expected_recall, self.evaluate(result)) - - def test_div_by_zero(self): - r_obj = metrics.Recall() - y_pred = tf.constant([0, 0, 0, 0]) - y_true = tf.constant([0, 0, 0, 0]) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - result = r_obj(y_true, y_pred) - self.assertEqual(0, self.evaluate(result)) - - def test_unweighted_with_threshold(self): - r_obj = metrics.Recall(thresholds=[0.5, 0.7]) - y_pred = tf.constant([1, 0, 0.6, 0], shape=(1, 4)) - y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - result = r_obj(y_true, y_pred) - self.assertArrayNear([0.5, 0.], self.evaluate(result), 0) - - def test_weighted_with_threshold(self): - r_obj = metrics.Recall(thresholds=[0.5, 1.]) - y_true = tf.constant([[0, 1], [1, 0]], 
shape=(2, 2)) - y_pred = tf.constant([[1, 0], [0.6, 0]], - shape=(2, 2), - dtype=tf.float32) - weights = tf.constant([[1, 4], [3, 2]], - shape=(2, 2), - dtype=tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - result = r_obj(y_true, y_pred, sample_weight=weights) - weighted_tp = 0 + 3. - weighted_positives = (0 + 3.) + (4. + 0.) - expected_recall = weighted_tp / weighted_positives - self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3) - - def test_multiple_updates(self): - r_obj = metrics.Recall(thresholds=[0.5, 1.]) - y_true = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) - y_pred = tf.constant([[1, 0], [0.6, 0]], - shape=(2, 2), - dtype=tf.float32) - weights = tf.constant([[1, 4], [3, 2]], - shape=(2, 2), - dtype=tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights) - for _ in range(2): - self.evaluate(update_op) - - weighted_tp = (0 + 3.) + (0 + 3.) - weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.)) - expected_recall = weighted_tp / weighted_positives - self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()), - 1e-3) - - def test_unweighted_top_k(self): - r_obj = metrics.Recall(top_k=3) - y_pred = tf.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(0.5, self.evaluate(result)) - - def test_weighted_top_k(self): - r_obj = metrics.Recall(top_k=3) - y_pred1 = tf.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5)) - y_true1 = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - self.evaluate( - r_obj( - y_true1, - y_pred1, - sample_weight=tf.constant([[1, 4, 2, 3, 5]]))) - - y_pred2 = tf.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5)) - y_true2 = tf.constant([1, 0, 1, 1, 1], shape=(1, 5)) - result = r_obj(y_true2, y_pred2, sample_weight=tf.constant(3)) - - tp = (2 + 5) + (3 + 3) - positives = (4 + 2 + 5) + (3 + 3 + 3 + 3) - expected_recall = tp / positives - self.assertAlmostEqual(expected_recall, self.evaluate(result)) - - def test_unweighted_class_id(self): - r_obj = metrics.Recall(class_id=2) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - - y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) - self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives)) - - y_pred = tf.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(0.5, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) - self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives)) - - y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 0, 0, 0], shape=(1, 5)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(0.5, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) - self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives)) - - def test_unweighted_top_k_and_class_id(self): - r_obj = metrics.Recall(class_id=2, top_k=2) - 
self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - - y_pred = tf.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) - self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives)) - - y_pred = tf.constant([1, 1, 0.9, 1, 1], shape=(1, 5)) - y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(0.5, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) - self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives)) - - def test_unweighted_top_k_and_threshold(self): - r_obj = metrics.Recall(thresholds=.7, top_k=2) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - - y_pred = tf.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5)) - y_true = tf.constant([1, 1, 1, 0, 1], shape=(1, 5)) - result = r_obj(y_true, y_pred) - self.assertAlmostEqual(0.25, self.evaluate(result)) - self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) - self.assertAlmostEqual(3, self.evaluate(r_obj.false_negatives)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class SensitivityAtSpecificityTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - s_obj = metrics.SensitivityAtSpecificity( - 0.4, - num_thresholds=100, - class_id=12, - name='sensitivity_at_specificity_1') - self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1') - self.assertLen(s_obj.variables, 4) - self.assertEqual(s_obj.specificity, 0.4) - self.assertEqual(s_obj.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - # Check save and restore config - s_obj2 = metrics.SensitivityAtSpecificity.from_config(s_obj.get_config()) - self.assertEqual(s_obj2.name, 'sensitivity_at_specificity_1') - self.assertLen(s_obj2.variables, 4) - self.assertEqual(s_obj2.specificity, 0.4) - self.assertEqual(s_obj2.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - def test_value_is_idempotent(self): - s_obj = metrics.SensitivityAtSpecificity(0.7) - y_pred = tf.random.uniform((10, 3), - maxval=1, - dtype=tf.float32, - seed=1) - y_true = tf.random.uniform((10, 3), - maxval=2, - dtype=tf.int64, - seed=1) - update_op = s_obj.update_state(y_true, y_pred) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - - # Run several updates. - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. 
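
SensitivityAtSpecificity, like the other metrics in this family, scans num_thresholds candidate thresholds and reports the best sensitivity among those whose specificity meets the target. A short eager sketch using the same values as test_unweighted_high_specificity below:

from keras import metrics

m = metrics.SensitivityAtSpecificity(0.8)
m.update_state([0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
               [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9])
print(float(m.result()))  # 0.8, matching that test's assertion
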
- initial_sensitivity = self.evaluate(s_obj.result()) - for _ in range(10): - self.assertAlmostEqual(initial_sensitivity, self.evaluate(s_obj.result()), - 1e-3) - - def test_unweighted_all_correct(self): - with self.test_session(): - s_obj = metrics.SensitivityAtSpecificity(0.7) - inputs = np.random.randint(0, 2, size=(100, 1)) - y_pred = tf.constant(inputs, dtype=tf.float32) - y_true = tf.constant(inputs) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - - def test_unweighted_high_specificity(self): - s_obj = metrics.SensitivityAtSpecificity(0.8) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(0.8, self.evaluate(result)) - - def test_unweighted_low_specificity(self): - s_obj = metrics.SensitivityAtSpecificity(0.4) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(0.6, self.evaluate(result)) - - def test_unweighted_class_id(self): - s_obj = metrics.SpecificityAtSensitivity(0.4, class_id=2) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] - label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2] - - y_pred = tf.transpose([pred_values] * 3) - y_true = tf.one_hot(label_values, depth=3) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(0.6, self.evaluate(result)) - - @parameterized.parameters([tf.bool, tf.int32, tf.float32]) - def test_weighted(self, label_dtype): - s_obj = metrics.SensitivityAtSpecificity(0.4) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.cast(label_values, dtype=label_dtype) - weights = tf.constant(weight_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred, sample_weight=weights) - self.assertAlmostEqual(0.675, self.evaluate(result)) - - def test_invalid_specificity(self): - with self.assertRaisesRegex( - ValueError, r'`specificity` must be in the range \[0, 1\].'): - metrics.SensitivityAtSpecificity(-1) - - def test_invalid_num_thresholds(self): - with self.assertRaisesRegex( - ValueError, 'Argument `num_thresholds` must be an integer > 0'): - metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class SpecificityAtSensitivityTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - s_obj = metrics.SpecificityAtSensitivity( - 0.4, - num_thresholds=100, - class_id=12, - name='specificity_at_sensitivity_1') - self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1') - self.assertLen(s_obj.variables, 4) - self.assertEqual(s_obj.sensitivity, 0.4) - self.assertEqual(s_obj.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - # Check save and restore config 
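
The get_config()/from_config() round trip checked throughout these tests is plain dict serialization; a standalone sketch (the name "spec_at_sens" is illustrative):

from keras import metrics

m = metrics.SpecificityAtSensitivity(0.4, num_thresholds=100, name="spec_at_sens")
config = m.get_config()  # a plain dict of constructor arguments
m2 = metrics.SpecificityAtSensitivity.from_config(config)
assert m2.sensitivity == 0.4 and m2.num_thresholds == 100
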
- s_obj2 = metrics.SpecificityAtSensitivity.from_config(s_obj.get_config()) - self.assertEqual(s_obj2.name, 'specificity_at_sensitivity_1') - self.assertLen(s_obj2.variables, 4) - self.assertEqual(s_obj2.sensitivity, 0.4) - self.assertEqual(s_obj2.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - def test_value_is_idempotent(self): - s_obj = metrics.SpecificityAtSensitivity(0.7) - y_pred = tf.random.uniform((10, 3), - maxval=1, - dtype=tf.float32, - seed=1) - y_true = tf.random.uniform((10, 3), - maxval=2, - dtype=tf.int64, - seed=1) - update_op = s_obj.update_state(y_true, y_pred) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - - # Run several updates. - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. - initial_specificity = self.evaluate(s_obj.result()) - for _ in range(10): - self.assertAlmostEqual(initial_specificity, self.evaluate(s_obj.result()), - 1e-3) - - def test_unweighted_all_correct(self): - s_obj = metrics.SpecificityAtSensitivity(0.7) - inputs = np.random.randint(0, 2, size=(100, 1)) - y_pred = tf.constant(inputs, dtype=tf.float32) - y_true = tf.constant(inputs) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - - def test_unweighted_high_sensitivity(self): - s_obj = metrics.SpecificityAtSensitivity(1.0) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(0.2, self.evaluate(result)) - - def test_unweighted_low_sensitivity(self): - s_obj = metrics.SpecificityAtSensitivity(0.4) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(0.6, self.evaluate(result)) - - def test_unweighted_class_id(self): - s_obj = metrics.SpecificityAtSensitivity(0.4, class_id=2) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] - label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2] - - y_pred = tf.transpose([pred_values] * 3) - y_true = tf.one_hot(label_values, depth=3) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(0.6, self.evaluate(result)) - - @parameterized.parameters([tf.bool, tf.int32, tf.float32]) - def test_weighted(self, label_dtype): - s_obj = metrics.SpecificityAtSensitivity(0.4) - pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.cast(label_values, dtype=label_dtype) - weights = tf.constant(weight_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred, sample_weight=weights) - self.assertAlmostEqual(0.4, self.evaluate(result)) - - def test_invalid_sensitivity(self): - with self.assertRaisesRegex( - ValueError, r'`sensitivity` must be in the range \[0, 1\].'): - metrics.SpecificityAtSensitivity(-1) - - def 
test_invalid_num_thresholds(self): - with self.assertRaisesRegex( - ValueError, 'Argument `num_thresholds` must be an integer > 0'): - metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class PrecisionAtRecallTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - s_obj = metrics.PrecisionAtRecall( - 0.4, num_thresholds=100, class_id=12, name='precision_at_recall_1') - self.assertEqual(s_obj.name, 'precision_at_recall_1') - self.assertLen(s_obj.variables, 4) - self.assertEqual(s_obj.recall, 0.4) - self.assertEqual(s_obj.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - # Check save and restore config - s_obj2 = metrics.PrecisionAtRecall.from_config(s_obj.get_config()) - self.assertEqual(s_obj2.name, 'precision_at_recall_1') - self.assertLen(s_obj2.variables, 4) - self.assertEqual(s_obj2.recall, 0.4) - self.assertEqual(s_obj2.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - def test_value_is_idempotent(self): - s_obj = metrics.PrecisionAtRecall(0.7) - y_pred = tf.random.uniform((10, 3), - maxval=1, - dtype=tf.float32, - seed=1) - y_true = tf.random.uniform((10, 3), - maxval=2, - dtype=tf.int64, - seed=1) - update_op = s_obj.update_state(y_true, y_pred) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - - # Run several updates. - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. - initial_precision = self.evaluate(s_obj.result()) - for _ in range(10): - self.assertAlmostEqual(initial_precision, self.evaluate(s_obj.result()), - 1e-3) - - def test_unweighted_all_correct(self): - s_obj = metrics.PrecisionAtRecall(0.7) - inputs = np.random.randint(0, 2, size=(100, 1)) - y_pred = tf.constant(inputs, dtype=tf.float32) - y_true = tf.constant(inputs) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - - def test_unweighted_high_recall(self): - s_obj = metrics.PrecisionAtRecall(0.8) - pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - # For 0.5 < decision threshold < 0.6. - self.assertAlmostEqual(2.0/3, self.evaluate(result)) - - def test_unweighted_low_recall(self): - s_obj = metrics.PrecisionAtRecall(0.6) - pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - # For 0.2 < decision threshold < 0.5. - self.assertAlmostEqual(0.75, self.evaluate(result)) - - def test_unweighted_class_id(self): - s_obj = metrics.PrecisionAtRecall(0.6, class_id=2) - pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9] - label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2] - - y_pred = tf.transpose([pred_values] * 3) - y_true = tf.one_hot(label_values, depth=3) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - # For 0.2 < decision threshold < 0.5. 
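# Hedged cross-check of the 0.75 expected below, in plain numpy (np is
# already imported in this test file; the 200-point sweep only
# approximates the metric's internal thresholds, and all variable names
# here are ours). It mirrors the max-under-constraint search sketched by
# `_find_max_under_constraint` in the new confusion_metrics.py: report
# the best precision among thresholds whose recall still meets 0.6.
check_preds = np.array([0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9])
check_labels = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])  # class 2, projected to binary
best_precision = 0.0
for t in np.linspace(0.0, 1.0, 200):
    tp = np.sum((check_preds >= t) & (check_labels == 1))
    fp = np.sum((check_preds >= t) & (check_labels == 0))
    if tp + fp > 0 and tp / 5.0 >= 0.6:  # recall constraint
        best_precision = max(best_precision, tp / float(tp + fp))
assert best_precision == 0.75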
- self.assertAlmostEqual(0.75, self.evaluate(result)) - - @parameterized.parameters([tf.bool, tf.int32, tf.float32]) - def test_weighted(self, label_dtype): - s_obj = metrics.PrecisionAtRecall(7.0/8) - pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9] - label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - weight_values = [2, 1, 2, 1, 2, 1, 2, 2, 1, 2] - - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.cast(label_values, dtype=label_dtype) - weights = tf.constant(weight_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred, sample_weight=weights) - # For 0.0 < decision threshold < 0.2. - self.assertAlmostEqual(0.7, self.evaluate(result)) - - def test_invalid_sensitivity(self): - with self.assertRaisesRegex(ValueError, - r'`recall` must be in the range \[0, 1\].'): - metrics.PrecisionAtRecall(-1) - - def test_invalid_num_thresholds(self): - with self.assertRaisesRegex( - ValueError, 'Argument `num_thresholds` must be an integer > 0'): - metrics.PrecisionAtRecall(0.4, num_thresholds=-1) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class RecallAtPrecisionTest(tf.test.TestCase, parameterized.TestCase): - - def test_config(self): - s_obj = metrics.RecallAtPrecision( - 0.4, num_thresholds=100, class_id=12, name='recall_at_precision_1') - self.assertEqual(s_obj.name, 'recall_at_precision_1') - self.assertLen(s_obj.variables, 4) - self.assertEqual(s_obj.precision, 0.4) - self.assertEqual(s_obj.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - # Check save and restore config - s_obj2 = metrics.RecallAtPrecision.from_config(s_obj.get_config()) - self.assertEqual(s_obj2.name, 'recall_at_precision_1') - self.assertLen(s_obj2.variables, 4) - self.assertEqual(s_obj2.precision, 0.4) - self.assertEqual(s_obj2.num_thresholds, 100) - self.assertEqual(s_obj.class_id, 12) - - def test_value_is_idempotent(self): - s_obj = metrics.RecallAtPrecision(0.7) - y_pred = tf.random.uniform((10, 3), - maxval=1, - dtype=tf.float32, - seed=1) - y_true = tf.random.uniform((10, 3), - maxval=2, - dtype=tf.int64, - seed=1) - update_op = s_obj.update_state(y_true, y_pred) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - - # Run several updates. - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. - initial_recall = self.evaluate(s_obj.result()) - for _ in range(10): - self.assertAlmostEqual(initial_recall, self.evaluate(s_obj.result()), - 1e-3) - - def test_unweighted_all_correct(self): - s_obj = metrics.RecallAtPrecision(0.7) - inputs = np.random.randint(0, 2, size=(100, 1)) - y_pred = tf.constant(inputs, dtype=tf.float32) - y_true = tf.constant(inputs) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - self.assertAlmostEqual(1, self.evaluate(result)) - - def test_unweighted_high_precision(self): - s_obj = metrics.RecallAtPrecision(0.75) - pred_values = [ - 0.05, 0.1, 0.2, 0.3, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.9, 0.95 - ] - label_values = [0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1] - # precisions: [1/2, 6/11, 1/2, 5/9, 5/8, 5/7, 2/3, 3/5, 3/5, 2/3, 1/2, 1]. - # recalls: [1, 1, 5/6, 5/6, 5/6, 5/6, 2/3, 1/2, 1/2, 1/3, 1/6, 1/6]. - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - # The precision 0.75 can be reached at thresholds 0.4<=t<0.45. 
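# Hedged sketch reproducing the precision/recall lists in the comment
# above with plain numpy: each prediction value is used as a `>=`
# threshold (variable names here are ours, not the test's).
pv = np.array([0.05, 0.1, 0.2, 0.3, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.9, 0.95])
lv = np.array([0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1])
for t in pv:
    tp = np.sum((pv >= t) & (lv == 1))
    fp = np.sum((pv >= t) & (lv == 0))
    precision, recall = tp / float(tp + fp), tp / 6.0
    # e.g. t=0.35 yields precision 5/7 and recall 5/6, matching the lists.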
- self.assertAlmostEqual(0.5, self.evaluate(result)) - - def test_unweighted_low_precision(self): - s_obj = metrics.RecallAtPrecision(2.0 / 3) - pred_values = [ - 0.05, 0.1, 0.2, 0.3, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.9, 0.95 - ] - label_values = [0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1] - # precisions: [1/2, 6/11, 1/2, 5/9, 5/8, 5/7, 2/3, 3/5, 3/5, 2/3, 1/2, 1]. - # recalls: [1, 1, 5/6, 5/6, 5/6, 5/6, 2/3, 1/2, 1/2, 1/3, 1/6, 1/6]. - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - # The precision 5/7 can be reached at thresholds 0.3<=t<0.35. - self.assertAlmostEqual(5. / 6, self.evaluate(result)) - - def test_unweighted_class_id(self): - s_obj = metrics.RecallAtPrecision(2.0 / 3, class_id=2) - pred_values = [ - 0.05, 0.1, 0.2, 0.3, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.9, 0.95 - ] - label_values = [0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2] - # precisions: [1/2, 6/11, 1/2, 5/9, 5/8, 5/7, 2/3, 3/5, 3/5, 2/3, 1/2, 1]. - # recalls: [1, 1, 5/6, 5/6, 5/6, 5/6, 2/3, 1/2, 1/2, 1/3, 1/6, 1/6]. - y_pred = tf.transpose([pred_values] * 3) - y_true = tf.one_hot(label_values, depth=3) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - # The precision 5/7 can be reached at thresholds 0.3<=t<0.35. - self.assertAlmostEqual(5. / 6, self.evaluate(result)) - - @parameterized.parameters([tf.bool, tf.int32, tf.float32]) - def test_weighted(self, label_dtype): - s_obj = metrics.RecallAtPrecision(0.75) - pred_values = [0.1, 0.2, 0.3, 0.5, 0.6, 0.9, 0.9] - label_values = [0, 1, 0, 0, 0, 1, 1] - weight_values = [1, 2, 1, 2, 1, 2, 1] - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.cast(label_values, dtype=label_dtype) - weights = tf.constant(weight_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred, sample_weight=weights) - self.assertAlmostEqual(0.6, self.evaluate(result)) - - def test_unachievable_precision(self): - s_obj = metrics.RecallAtPrecision(2.0 / 3) - pred_values = [0.1, 0.2, 0.3, 0.9] - label_values = [1, 1, 0, 0] - y_pred = tf.constant(pred_values, dtype=tf.float32) - y_true = tf.constant(label_values) - self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) - result = s_obj(y_true, y_pred) - # The highest possible precision is 1/2 which is below the required - # value, expect 0 recall.
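# Hedged numpy check of the comment above (variable names are ours): the
# top-scored example is a negative, so no threshold can push precision to
# the 2/3 target, and the metric's `_find_max_under_constraint` fallback
# then yields a recall of 0.0.
pv = np.array([0.1, 0.2, 0.3, 0.9])
lv = np.array([1, 1, 0, 0])
best = 0.0
for t in np.linspace(0.0, 1.0, 200):
    tp = np.sum((pv >= t) & (lv == 1))
    fp = np.sum((pv >= t) & (lv == 0))
    if tp + fp > 0:
        best = max(best, tp / float(tp + fp))
assert best == 0.5  # highest achievable precision, below the 2/3 target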
- self.assertAlmostEqual(0, self.evaluate(result)) - - def test_invalid_sensitivity(self): - with self.assertRaisesRegex(ValueError, - r'`precision` must be in the range \[0, 1\].'): - metrics.RecallAtPrecision(-1) - - def test_invalid_num_thresholds(self): - with self.assertRaisesRegex( - ValueError, 'Argument `num_thresholds` must be an integer > 0'): - metrics.RecallAtPrecision(0.4, num_thresholds=-1) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class AUCTest(tf.test.TestCase, parameterized.TestCase): - - def setup(self): - self.num_thresholds = 3 - self.y_pred = tf.constant([0, 0.5, 0.3, 0.9], dtype=tf.float32) - epsilon = 1e-12 - self.y_pred_logits = -tf.math.log(1.0 / (self.y_pred + epsilon) - 1.0) - self.y_true = tf.constant([0, 0, 1, 1]) - self.sample_weight = [1, 2, 3, 4] - - # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7] - # y_pred when threshold = 0 - 1e-7 : [1, 1, 1, 1] - # y_pred when threshold = 0.5 : [0, 0, 0, 1] - # y_pred when threshold = 1 + 1e-7 : [0, 0, 0, 0] - - # without sample_weight: - # tp = np.sum([[0, 0, 1, 1], [0, 0, 0, 1], [0, 0, 0, 0]], axis=1) - # fp = np.sum([[1, 1, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1) - # fn = np.sum([[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 1]], axis=1) - # tn = np.sum([[0, 0, 0, 0], [1, 1, 0, 0], [1, 1, 0, 0]], axis=1) - - # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] - - # with sample_weight: - # tp = np.sum([[0, 0, 3, 4], [0, 0, 0, 4], [0, 0, 0, 0]], axis=1) - # fp = np.sum([[1, 2, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1) - # fn = np.sum([[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 3, 4]], axis=1) - # tn = np.sum([[0, 0, 0, 0], [1, 2, 0, 0], [1, 2, 0, 0]], axis=1) - - # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] - - def test_config(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=100, - curve='PR', - summation_method='majoring', - name='auc_1') - auc_obj.update_state(self.y_true, self.y_pred) - self.assertEqual(auc_obj.name, 'auc_1') - self.assertLen(auc_obj.variables, 4) - self.assertEqual(auc_obj.num_thresholds, 100) - self.assertEqual(auc_obj.curve, metrics_utils.AUCCurve.PR) - self.assertEqual(auc_obj.summation_method, - metrics_utils.AUCSummationMethod.MAJORING) - old_config = auc_obj.get_config() - self.assertNotIn('thresholds', old_config) - self.assertDictEqual(old_config, json.loads(json.dumps(old_config))) - - # Check save and restore config. 
- auc_obj2 = metrics.AUC.from_config(auc_obj.get_config()) - auc_obj2.update_state(self.y_true, self.y_pred) - self.assertEqual(auc_obj2.name, 'auc_1') - self.assertLen(auc_obj2.variables, 4) - self.assertEqual(auc_obj2.num_thresholds, 100) - self.assertEqual(auc_obj2.curve, metrics_utils.AUCCurve.PR) - self.assertEqual(auc_obj2.summation_method, - metrics_utils.AUCSummationMethod.MAJORING) - new_config = auc_obj2.get_config() - self.assertNotIn('thresholds', new_config) - self.assertDictEqual(old_config, new_config) - self.assertAllClose(auc_obj.thresholds, auc_obj2.thresholds) - - def test_config_manual_thresholds(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=None, - curve='PR', - summation_method='majoring', - name='auc_1', - thresholds=[0.3, 0.5]) - auc_obj.update_state(self.y_true, self.y_pred) - self.assertEqual(auc_obj.name, 'auc_1') - self.assertLen(auc_obj.variables, 4) - self.assertEqual(auc_obj.num_thresholds, 4) - self.assertAllClose(auc_obj.thresholds, [0.0, 0.3, 0.5, 1.0]) - self.assertEqual(auc_obj.curve, metrics_utils.AUCCurve.PR) - self.assertEqual(auc_obj.summation_method, - metrics_utils.AUCSummationMethod.MAJORING) - old_config = auc_obj.get_config() - self.assertDictEqual(old_config, json.loads(json.dumps(old_config))) - - # Check save and restore config. - auc_obj2 = metrics.AUC.from_config(auc_obj.get_config()) - auc_obj2.update_state(self.y_true, self.y_pred) - self.assertEqual(auc_obj2.name, 'auc_1') - self.assertLen(auc_obj2.variables, 4) - self.assertEqual(auc_obj2.num_thresholds, 4) - self.assertEqual(auc_obj2.curve, metrics_utils.AUCCurve.PR) - self.assertEqual(auc_obj2.summation_method, - metrics_utils.AUCSummationMethod.MAJORING) - new_config = auc_obj2.get_config() - self.assertDictEqual(old_config, new_config) - self.assertAllClose(auc_obj.thresholds, auc_obj2.thresholds) - - def test_value_is_idempotent(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=3) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - - # Run several updates. - update_op = auc_obj.update_state(self.y_true, self.y_pred) - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. 
- initial_auc = self.evaluate(auc_obj.result()) - for _ in range(10): - self.assertAllClose(initial_auc, self.evaluate(auc_obj.result()), 1e-3) - - def test_unweighted_all_correct(self): - self.setup() - auc_obj = metrics.AUC() - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_true) - self.assertEqual(self.evaluate(result), 1) - - def test_unweighted(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred) - - # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] - # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0] - # fp_rate = [2/2, 0, 0] = [1, 0, 0] - # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25] - # widths = [(1 - 0), (0 - 0)] = [1, 0] - expected_result = (0.75 * 1 + 0.25 * 0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_unweighted_from_logits(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred_logits) - - # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] - # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0] - # fp_rate = [2/2, 0, 0] = [1, 0, 0] - # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25] - # widths = [(1 - 0), (0 - 0)] = [1, 0] - expected_result = (0.75 * 1 + 0.25 * 0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_manual_thresholds(self): - self.setup() - # Verify that when specified, thresholds are used instead of num_thresholds. - auc_obj = metrics.AUC(num_thresholds=2, thresholds=[0.5]) - self.assertEqual(auc_obj.num_thresholds, 3) - self.assertAllClose(auc_obj.thresholds, [0.0, 0.5, 1.0]) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred) - - # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] - # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0] - # fp_rate = [2/2, 0, 0] = [1, 0, 0] - # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25] - # widths = [(1 - 0), (0 - 0)] = [1, 0] - expected_result = (0.75 * 1 + 0.25 * 0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_weighted_roc_interpolation(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight) - - # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] - # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0] - # fp_rate = [3/3, 0, 0] = [1, 0, 0] - # heights = [(1 + 0.571)/2, (0.571 + 0)/2] = [0.7855, 0.2855] - # widths = [(1 - 0), (0 - 0)] = [1, 0] - expected_result = (0.7855 * 1 + 0.2855 * 0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_weighted_roc_majoring(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, summation_method='majoring') - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight) - - # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] - # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0] - # fp_rate = [3/3, 0, 0] = [1, 0, 0] - # heights = [max(1, 0.571), max(0.571, 0)] = [1, 0.571] - # widths = [(1 - 0), (0 - 0)] = [1, 
0] - expected_result = (1 * 1 + 0.571 * 0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_weighted_roc_minoring(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, summation_method='minoring') - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight) - - # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] - # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0] - # fp_rate = [3/3, 0, 0] = [1, 0, 0] - # heights = [min(1, 0.571), min(0.571, 0)] = [0.571, 0] - # widths = [(1 - 0), (0 - 0)] = [1, 0] - expected_result = (0.571 * 1 + 0 * 0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_weighted_pr_majoring(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, - curve='PR', - summation_method='majoring') - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight) - - # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] - # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0] - # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0] - # heights = [max(0.7, 1), max(1, 0)] = [1, 1] - # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571] - expected_result = (1 * 0.429 + 1 * 0.571) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_weighted_pr_minoring(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, - curve='PR', - summation_method='minoring') - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight) - - # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] - # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0] - # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0] - # heights = [min(0.7, 1), min(1, 0)] = [0.7, 0] - # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571] - expected_result = (0.7 * 0.429 + 0 * 0.571) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_weighted_pr_interpolation(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, curve='PR') - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight) - - # auc = (slope / Total Pos) * [dTP - intercept * log(Pb/Pa)] - - # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] - # P = tp + fp = [10, 4, 0] - # dTP = [7-4, 4-0] = [3, 4] - # dP = [10-4, 4-0] = [6, 4] - # slope = dTP/dP = [0.5, 1] - # intercept = (TPa+(slope*Pa) = [(4 - 0.5*4), (0 - 1*0)] = [2, 0] - # (Pb/Pa) = (Pb/Pa) if Pb > 0 AND Pa > 0 else 1 = [10/4, 4/0] = [2.5, 1] - # auc * TotalPos = [(0.5 * (3 + 2 * log(2.5))), (1 * (4 + 0))] - # = [2.416, 4] - # auc = [2.416, 4]/(tp[1:]+fn[1:]) - expected_result = (2.416/7 + 4/7) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_invalid_num_thresholds(self): - with self.assertRaisesRegex( - ValueError, 'Argument `num_thresholds` must be an integer > 1'): - metrics.AUC(num_thresholds=-1) - - with self.assertRaisesRegex( - ValueError, 'Argument `num_thresholds` must be an integer > 1.'): - metrics.AUC(num_thresholds=1) - - def test_invalid_curve(self): - with self.assertRaisesRegex(ValueError, - 'Invalid AUC curve value: "Invalid".'): - metrics.AUC(curve='Invalid') - - def 
test_invalid_summation_method(self): - with self.assertRaisesRegex( - ValueError, 'Invalid AUC summation method value: "Invalid".'): - metrics.AUC(summation_method='Invalid') - - def test_extra_dims(self): - try: - from scipy import special # pylint: disable=g-import-not-at-top - self.setup() - logits = special.expit(-np.array([[[-10., 10., -10.], [10., -10., 10.]], - [[-12., 12., -12.], [12., -12., 12.]]], - dtype=np.float32)) - labels = np.array([[[1, 0, 0], [1, 0, 0]], [[0, 1, 1], [0, 1, 1]]], - dtype=np.int64) - auc_obj = metrics.AUC() - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(labels, logits) - self.assertEqual(self.evaluate(result), 0.5) - except ImportError as e: - tf_logging.warning('Cannot test special functions: %s' % str(e)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MultiAUCTest(tf.test.TestCase, parameterized.TestCase): - - def setup(self): - self.num_thresholds = 5 - self.y_pred = tf.constant( - np.array([[0, 0.5, 0.3, 0.9], [0.1, 0.2, 0.3, 0.4]]).T, - dtype=tf.float32) - - epsilon = 1e-12 - self.y_pred_logits = -tf.math.log(1.0 / (self.y_pred + epsilon) - 1.0) - - self.y_true_good = tf.constant( - np.array([[0, 0, 1, 1], [0, 0, 1, 1]]).T) - self.y_true_bad = tf.constant( - np.array([[0, 0, 1, 1], [1, 1, 0, 0]]).T) - self.sample_weight = [1, 2, 3, 4] - - # threshold values are [0 - 1e-7, 0.25, 0.5, 0.75, 1 + 1e-7] - # y_pred when threshold = 0 - 1e-7 : [[1, 1, 1, 1], [1, 1, 1, 1]] - # y_pred when threshold = 0.25 : [[0, 1, 1, 1], [0, 0, 1, 1]] - # y_pred when threshold = 0.5 : [[0, 0, 0, 1], [0, 0, 0, 0]] - # y_pred when threshold = 0.75 : [[0, 0, 0, 1], [0, 0, 0, 0]] - # y_pred when threshold = 1 + 1e-7 : [[0, 0, 0, 0], [0, 0, 0, 0]] - - # for y_true_good, over thresholds: - # tp = [[2, 2, 1, 1, 0], [2, 2, 0, 0, 0]] - # fp = [[2, 1, 0, 0 , 0], [2, 0, 0 ,0, 0]] - # fn = [[0, 0, 1, 1, 2], [0, 0, 2, 2, 2]] - # tn = [[0, 1, 2, 2, 2], [0, 2, 2, 2, 2]] - - # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] - # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] - - # for y_true_bad: - # tp = [[2, 2, 1, 1, 0], [2, 0, 0, 0, 0]] - # fp = [[2, 1, 0, 0 , 0], [2, 2, 0 ,0, 0]] - # fn = [[0, 0, 1, 1, 2], [0, 2, 2, 2, 2]] - # tn = [[0, 1, 2, 2, 2], [0, 0, 2, 2, 2]] - - # tpr = [[1, 1, 0.5, 0.5, 0], [1, 0, 0, 0, 0]] - # fpr = [[1, 0.5, 0, 0, 0], [1, 1, 0, 0, 0]] - - # for y_true_good with sample_weights: - - # tp = [[7, 7, 4, 4, 0], [7, 7, 0, 0, 0]] - # fp = [[3, 2, 0, 0, 0], [3, 0, 0, 0, 0]] - # fn = [[0, 0, 3, 3, 7], [0, 0, 7, 7, 7]] - # tn = [[0, 1, 3, 3, 3], [0, 3, 3, 3, 3]] - - # tpr = [[1, 1, 0.57, 0.57, 0], [1, 1, 0, 0, 0]] - # fpr = [[1, 0.67, 0, 0, 0], [1, 0, 0, 0, 0]] - - def test_value_is_idempotent(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC(num_thresholds=5, multi_label=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - - # Run several updates. - update_op = auc_obj.update_state(self.y_true_good, self.y_pred) - for _ in range(10): - self.evaluate(update_op) - - # Then verify idempotency. 
- initial_auc = self.evaluate(auc_obj.result()) - for _ in range(10): - self.assertAllClose(initial_auc, self.evaluate(auc_obj.result()), 1e-3) - - def test_unweighted_all_correct(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC(multi_label=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_true_good) - self.assertEqual(self.evaluate(result), 1) - - def test_unweighted_all_correct_flat(self): - self.setup() - auc_obj = metrics.AUC(multi_label=False) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_true_good) - self.assertEqual(self.evaluate(result), 1) - - def test_unweighted(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, - multi_label=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred) - - # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] - # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] - expected_result = (0.875 + 1.0) / 2.0 - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_unweighted_from_logits(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, - multi_label=True, - from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred_logits) - - # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] - # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] - expected_result = (0.875 + 1.0) / 2.0 - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_sample_weight_flat(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, multi_label=False) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred, sample_weight=[1, 2, 3, 4]) - - # tpr = [1, 1, 0.2857, 0.2857, 0] - # fpr = [1, 0.3333, 0, 0, 0] - expected_result = 1.0 - (0.3333 * (1.0 - 0.2857) / 2.0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_full_sample_weight_flat(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, multi_label=False) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - sw = np.arange(4 * 2) - sw = sw.reshape(4, 2) - result = auc_obj(self.y_true_good, self.y_pred, sample_weight=sw) - - # tpr = [1, 1, 0.2727, 0.2727, 0] - # fpr = [1, 0.3333, 0, 0, 0] - expected_result = 1.0 - (0.3333 * (1.0 - 0.2727) / 2.0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_label_weights(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, - multi_label=True, - label_weights=[0.75, 0.25]) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred) - - # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] - # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] - expected_result = (0.875 * 0.75 + 1.0 * 0.25) / (0.75 + 0.25) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_label_weights_flat(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, - multi_label=False, - label_weights=[0.75, 0.25]) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred) - - 
# tpr = [1, 1, 0.375, 0.375, 0] - # fpr = [1, 0.375, 0, 0, 0] - expected_result = 1.0 - ((1.0 - 0.375) * 0.375 / 2.0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-2) - - def test_unweighted_flat(self): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, multi_label=False) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred) - - # tp = [4, 4, 1, 1, 0] - # fp = [4, 1, 0, 0, 0] - # fn = [0, 0, 3, 3, 4] - # tn = [0, 3, 4, 4, 4] - - # tpr = [1, 1, 0.25, 0.25, 0] - # fpr = [1, 0.25, 0, 0, 0] - expected_result = 1.0 - (3.0 / 32.0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_unweighted_flat_from_logits(self): - self.setup() - auc_obj = metrics.AUC( - num_thresholds=self.num_thresholds, multi_label=False, from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred_logits) - - # tp = [4, 4, 1, 1, 0] - # fp = [4, 1, 0, 0, 0] - # fn = [0, 0, 3, 3, 4] - # tn = [0, 3, 4, 4, 4] - - # tpr = [1, 1, 0.25, 0.25, 0] - # fpr = [1, 0.25, 0, 0, 0] - expected_result = 1.0 - (3.0 / 32.0) - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_manual_thresholds(self): - with self.test_session(): - self.setup() - # Verify that when specified, thresholds are used instead of - # num_thresholds. - auc_obj = metrics.AUC(num_thresholds=2, thresholds=[0.5], - multi_label=True) - self.assertEqual(auc_obj.num_thresholds, 3) - self.assertAllClose(auc_obj.thresholds, [0.0, 0.5, 1.0]) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(self.y_true_good, self.y_pred) - - # tp = [[2, 1, 0], [2, 0, 0]] - # fp = [2, 0, 0], [2, 0, 0]] - # fn = [[0, 1, 2], [0, 2, 2]] - # tn = [[0, 2, 2], [0, 2, 2]] - - # tpr = [[1, 0.5, 0], [1, 0, 0]] - # fpr = [[1, 0, 0], [1, 0, 0]] - - # auc by slice = [0.75, 0.5] - expected_result = (0.75 + 0.5) / 2.0 - - self.assertAllClose(self.evaluate(result), expected_result, 1e-3) - - def test_weighted_roc_interpolation(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, - multi_label=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj( - self.y_true_good, self.y_pred, sample_weight=self.sample_weight) - - # tpr = [[1, 1, 0.57, 0.57, 0], [1, 1, 0, 0, 0]] - # fpr = [[1, 0.67, 0, 0, 0], [1, 0, 0, 0, 0]] - expected_result = 1.0 - 0.5 * 0.43 * 0.67 - self.assertAllClose(self.evaluate(result), expected_result, 1e-1) - - def test_pr_interpolation_unweighted(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, curve='PR', - multi_label=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - good_result = auc_obj(self.y_true_good, self.y_pred) - with self.subTest(name='good'): - # PR AUCs are 0.917 and 1.0 respectively - self.assertAllClose(self.evaluate(good_result), (0.91667 + 1.0) / 2.0, - 1e-1) - bad_result = auc_obj(self.y_true_bad, self.y_pred) - with self.subTest(name='bad'): - # PR AUCs are 0.917 and 0.5 respectively - self.assertAllClose(self.evaluate(bad_result), (0.91667 + 0.5) / 2.0, - 1e-1) - - def test_pr_interpolation(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, curve='PR', - multi_label=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - 
good_result = auc_obj(self.y_true_good, self.y_pred, - sample_weight=self.sample_weight) - # PR AUCs are 0.939 and 1.0 respectively - self.assertAllClose(self.evaluate(good_result), (0.939 + 1.0) / 2.0, - 1e-1) - - def test_keras_model_compiles(self): - inputs = layers.Input(shape=(10,)) - output = layers.Dense(3, activation='sigmoid')(inputs) - model = models.Model(inputs=inputs, outputs=output) - model.compile( - loss='binary_crossentropy', - metrics=[metrics.AUC(multi_label=True)] - ) - - def test_reset_state(self): - with self.test_session(): - self.setup() - auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, - multi_label=True) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - auc_obj(self.y_true_good, self.y_pred) - auc_obj.reset_state() - self.assertAllEqual(auc_obj.true_positives, np.zeros((5, 2))) - - -@test_combinations.generate(test_combinations.combine(mode=['eager'])) -class ThresholdsTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters([ - metrics.TruePositives(), - metrics.TrueNegatives(), - metrics.FalsePositives(), - metrics.FalseNegatives(), - metrics.Precision(), - metrics.Recall(), - metrics.SensitivityAtSpecificity(0.5), - metrics.SpecificityAtSensitivity(0.5), - metrics.PrecisionAtRecall(0.5), - metrics.RecallAtPrecision(0.5), - metrics.AUC()]) - def test_with_default_thresholds(self, metric_obj): - # By default, the thresholds will be evenly distributed if there are more - # than 1. In case there is only 1 thresholds, then we expect - # _thresholds_distributed_evenly to be false. - expected = len(metric_obj.thresholds) > 1 - self.assertEqual(metric_obj._thresholds_distributed_evenly, expected) - - @parameterized.parameters([ - metrics.TruePositives, - metrics.TrueNegatives, - metrics.FalsePositives, - metrics.FalseNegatives, - metrics.Precision, - metrics.Recall]) - def test_with_manual_thresholds(self, metric_cls): - even_thresholds = [0.0, 0.25, 0.5, 0.75, 1.0] - metric_obj = metric_cls(thresholds=even_thresholds) - self.assertTrue(metric_obj._thresholds_distributed_evenly) - - uneven_thresholds = [0.0, 0.45, 1.0] - metric_obj = metric_cls(thresholds=uneven_thresholds) - self.assertFalse(metric_obj._thresholds_distributed_evenly) - - def test_manual_thresholds_auc(self): - # The AUC metric handles manual thresholds input differently (it will add - # 0.0 and 1.0 for user). 
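# Hedged illustration of that padding, consistent with
# test_config_manual_thresholds earlier in this file: three
# user-supplied thresholds become five once the 0.0 and 1.0 endpoints
# are added (`auc_demo` is our name, not part of the test).
auc_demo = metrics.AUC(thresholds=[0.25, 0.5, 0.75])
assert auc_demo.num_thresholds == 5
# auc_demo.thresholds is approximately [0.0, 0.25, 0.5, 0.75, 1.0]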
- even_thresholds = [0.25, 0.5, 0.75] - auc = metrics.AUC(thresholds=even_thresholds) - self.assertTrue(auc._thresholds_distributed_evenly) - - # Test for save model - cloned = metrics.AUC.from_config(auc.get_config()) - self.assertTrue(cloned._thresholds_distributed_evenly) - - uneven_thresholds = [0.45,] - auc = metrics.AUC(thresholds=uneven_thresholds) - self.assertFalse(auc._thresholds_distributed_evenly) - - cloned = metrics.AUC.from_config(auc.get_config()) - self.assertFalse(cloned._thresholds_distributed_evenly) - - @parameterized.parameters([ - metrics.TruePositives, - metrics.TrueNegatives, - metrics.FalsePositives, - metrics.FalseNegatives, - metrics.Precision, - metrics.Recall, - metrics.AUC]) - def test_even_thresholds_correctness(self, metric_cls): - with tf.compat.forward_compatibility_horizon(2021, 6, 9): - # make sure the old approach and new approach produce same result - # for evenly distributed thresholds - y_true = np.random.randint(2, size=(10,)) - y_pred = np.random.rand(10) - - even_thresholds = [0.0, 0.25, 0.5, 0.75, 1.0] - if metric_cls == metrics.AUC: - even_thresholds = even_thresholds[1:-1] - metric_obj = metric_cls(thresholds=even_thresholds) - metric_obj.update_state(y_true, y_pred) - result1 = metric_obj.result() - - metric_obj2 = metric_cls(thresholds=even_thresholds) - # Force to use the old approach - metric_obj2._thresholds_distributed_evenly = False - metric_obj2.update_state(y_true, y_pred) - result2 = metric_obj2.result() - - self.assertAllClose(result1, result2) - # Check all the variables are the same, eg tp, tn, fp, fn - for v1, v2 in zip(metric_obj.variables, metric_obj2.variables): - self.assertAllClose(v1, v2) - - @parameterized.parameters([ - metrics.SensitivityAtSpecificity, - metrics.SpecificityAtSensitivity, - metrics.PrecisionAtRecall, - metrics.RecallAtPrecision]) - def test_even_thresholds_correctness_2(self, metric_cls): - with tf.compat.forward_compatibility_horizon(2021, 6, 9): - y_true = np.random.randint(2, size=(10,)) - y_pred = np.random.rand(10) - - metric_obj = metric_cls(0.5) - metric_obj.update_state(y_true, y_pred) - result1 = metric_obj.result() - - metric_obj2 = metric_cls(0.5) - # Force to use the old approach - metric_obj2._thresholds_distributed_evenly = False - metric_obj2.update_state(y_true, y_pred) - result2 = metric_obj2.result() - - self.assertAllClose(result1, result2) - # Check all the variables are the same, eg tp, tn, fp, fn - for v1, v2 in zip(metric_obj.variables, metric_obj2.variables): - self.assertAllClose(v1, v2) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/metrics/confusion_metrics.py b/keras/metrics/confusion_metrics.py new file mode 100644 index 000000000000..75584ff795e1 --- /dev/null +++ b/keras/metrics/confusion_metrics.py @@ -0,0 +1,1707 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Confusion metrics, i.e. 
metrics based on True/False positives/negatives.""" + +import abc + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras import activations +from keras import backend +from keras.dtensor import utils as dtensor_utils +from keras.metrics import base_metric +from keras.utils import metrics_utils +from keras.utils.generic_utils import to_list +from keras.utils.tf_utils import is_tensor_or_variable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +class _ConfusionMatrixConditionCount(base_metric.Metric): + """Calculates the number of the given confusion matrix condition. + + Args: + confusion_matrix_cond: One of `metrics_utils.ConfusionMatrix` conditions. + thresholds: (Optional) A float value or a python list/tuple of float + threshold values in [0, 1]. A threshold is compared with prediction + values to determine the truth value of predictions + (i.e., above the threshold is `true`, below is `false`). One metric + value is generated for each threshold value. Defaults to `0.5`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + """ + + def __init__( + self, confusion_matrix_cond, thresholds=None, name=None, dtype=None + ): + super().__init__(name=name, dtype=dtype) + self._confusion_matrix_cond = confusion_matrix_cond + self.init_thresholds = thresholds + self.thresholds = metrics_utils.parse_init_thresholds( + thresholds, default_threshold=0.5 + ) + self._thresholds_distributed_evenly = ( + metrics_utils.is_evenly_distributed_thresholds(self.thresholds) + ) + self.accumulator = self.add_weight( + "accumulator", shape=(len(self.thresholds),), initializer="zeros" + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates the metric statistics. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. + """ + return metrics_utils.update_confusion_matrix_variables( + {self._confusion_matrix_cond: self.accumulator}, + y_true, + y_pred, + thresholds=self.thresholds, + thresholds_distributed_evenly=self._thresholds_distributed_evenly, + sample_weight=sample_weight, + ) + + def result(self): + if len(self.thresholds) == 1: + result = self.accumulator[0] + else: + result = self.accumulator + return tf.convert_to_tensor(result) + + def reset_state(self): + backend.batch_set_value( + [(v, np.zeros(v.shape.as_list())) for v in self.variables] + ) + + def get_config(self): + config = {"thresholds": self.init_thresholds} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.FalsePositives") +class FalsePositives(_ConfusionMatrixConditionCount): + """Calculates the number of false positives. + + If `sample_weight` is given, calculates the sum of the weights of + false positives. This metric creates one local variable, `accumulator` + that is used to keep track of the number of false positives. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + thresholds: (Optional) A float value, or a Python + list/tuple of float threshold values in [0, 1]. A threshold is compared + with prediction values to determine the truth value of predictions + (i.e., above the threshold is `true`, below is `false`). 
If used with a + loss function that sets `from_logits=True` (i.e. no sigmoid applied to + predictions), `thresholds` should be set to 0. One metric value is + generated for each threshold value. Defaults to `0.5`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.FalsePositives() + >>> m.update_state([0, 1, 0, 0], [0, 0, 1, 1]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_state() + >>> m.update_state([0, 1, 0, 0], [0, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.FalsePositives()]) + ``` + + Usage with a loss with `from_logits=True`: + + ```python + model.compile(optimizer='adam', + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.FalsePositives(thresholds=0)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, thresholds=None, name=None, dtype=None): + super().__init__( + confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_POSITIVES, + thresholds=thresholds, + name=name, + dtype=dtype, + ) + + +@keras_export("keras.metrics.FalseNegatives") +class FalseNegatives(_ConfusionMatrixConditionCount): + """Calculates the number of false negatives. + + If `sample_weight` is given, calculates the sum of the weights of + false negatives. This metric creates one local variable, `accumulator` + that is used to keep track of the number of false negatives. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + thresholds: (Optional) A float value, or a Python + list/tuple of float threshold values in [0, 1]. A threshold is compared + with prediction values to determine the truth value of predictions + (i.e., above the threshold is `true`, below is `false`). If used with a + loss function that sets `from_logits=True` (i.e. no sigmoid applied to + predictions), `thresholds` should be set to 0. One metric value is + generated for each threshold value. Defaults to `0.5`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.FalseNegatives() + >>> m.update_state([0, 1, 1, 1], [0, 1, 0, 0]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_state() + >>> m.update_state([0, 1, 1, 1], [0, 1, 0, 0], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.FalseNegatives()]) + ``` + + Usage with a loss with `from_logits=True`: + + ```python + model.compile(optimizer='adam', + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.FalseNegatives(thresholds=0)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, thresholds=None, name=None, dtype=None): + super().__init__( + confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_NEGATIVES, + thresholds=thresholds, + name=name, + dtype=dtype, + ) + + +@keras_export("keras.metrics.TrueNegatives") +class TrueNegatives(_ConfusionMatrixConditionCount): + """Calculates the number of true negatives. + + If `sample_weight` is given, calculates the sum of the weights of + true negatives. This metric creates one local variable, `accumulator` + that is used to keep track of the number of true negatives. 
+ + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + thresholds: (Optional) A float value, or a Python + list/tuple of float threshold values in [0, 1]. A threshold is compared + with prediction values to determine the truth value of predictions + (i.e., above the threshold is `true`, below is `false`). If used with a + loss function that sets `from_logits=True` (i.e. no sigmoid applied to + predictions), `thresholds` should be set to 0. One metric value is + generated for each threshold value. Defaults to `0.5`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.TrueNegatives() + >>> m.update_state([0, 1, 0, 0], [1, 1, 0, 0]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_state() + >>> m.update_state([0, 1, 0, 0], [1, 1, 0, 0], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.TrueNegatives()]) + ``` + + Usage with a loss with `from_logits=True`: + + ```python + model.compile(optimizer='adam', + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.TrueNegatives(thresholds=0)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, thresholds=None, name=None, dtype=None): + super().__init__( + confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_NEGATIVES, + thresholds=thresholds, + name=name, + dtype=dtype, + ) + + +@keras_export("keras.metrics.TruePositives") +class TruePositives(_ConfusionMatrixConditionCount): + """Calculates the number of true positives. + + If `sample_weight` is given, calculates the sum of the weights of + true positives. This metric creates one local variable, `true_positives` + that is used to keep track of the number of true positives. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + thresholds: (Optional) A float value, or a Python + list/tuple of float threshold values in [0, 1]. A threshold is compared + with prediction values to determine the truth value of predictions + (i.e., above the threshold is `true`, below is `false`). If used with a + loss function that sets `from_logits=True` (i.e. no sigmoid applied to + predictions), `thresholds` should be set to 0. One metric value is + generated for each threshold value. Defaults to `0.5`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. 
+ + Standalone usage: + + >>> m = tf.keras.metrics.TruePositives() + >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_state() + >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.TruePositives()]) + ``` + + Usage with a loss with `from_logits=True`: + + ```python + model.compile(optimizer='adam', + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.TruePositives(thresholds=0)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, thresholds=None, name=None, dtype=None): + super().__init__( + confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_POSITIVES, + thresholds=thresholds, + name=name, + dtype=dtype, + ) + + +@keras_export("keras.metrics.Precision") +class Precision(base_metric.Metric): + """Computes the precision of the predictions with respect to the labels. + + The metric creates two local variables, `true_positives` and + `false_positives` that are used to compute the precision. This value is + ultimately returned as `precision`, an idempotent operation that simply + divides `true_positives` by the sum of `true_positives` and + `false_positives`. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + If `top_k` is set, we'll calculate precision as how often on average a class + among the top-k classes with the highest predicted values of a batch entry + is correct and can be found in the label for that entry. + + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold and/or in + the top-k highest predictions, and computing the fraction of them for which + `class_id` is indeed a correct label. + + Args: + thresholds: (Optional) A float value, or a Python list/tuple of float + threshold values in [0, 1]. A threshold is compared with prediction + values to determine the truth value of predictions (i.e., above the + threshold is `true`, below is `false`). If used with a loss function + that sets `from_logits=True` (i.e. no sigmoid applied to predictions), + `thresholds` should be set to 0. One metric value is generated for each + threshold value. If neither thresholds nor top_k are set, the default is + to calculate precision with `thresholds=0.5`. + top_k: (Optional) Unset by default. An int value specifying the top-k + predictions to consider when calculating precision. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. 
+ + Standalone usage: + + >>> m = tf.keras.metrics.Precision() + >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) + >>> m.result().numpy() + 0.6666667 + + >>> m.reset_state() + >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 + + >>> # With top_k=2, it will calculate precision over y_true[:2] + >>> # and y_pred[:2] + >>> m = tf.keras.metrics.Precision(top_k=2) + >>> m.update_state([0, 0, 1, 1], [1, 1, 1, 1]) + >>> m.result().numpy() + 0.0 + + >>> # With top_k=4, it will calculate precision over y_true[:4] + >>> # and y_pred[:4] + >>> m = tf.keras.metrics.Precision(top_k=4) + >>> m.update_state([0, 0, 1, 1], [1, 1, 1, 1]) + >>> m.result().numpy() + 0.5 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.Precision()]) + ``` + + Usage with a loss with `from_logits=True`: + + ```python + model.compile(optimizer='adam', + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.Precision(thresholds=0)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, thresholds=None, top_k=None, class_id=None, name=None, dtype=None + ): + super().__init__(name=name, dtype=dtype) + self.init_thresholds = thresholds + self.top_k = top_k + self.class_id = class_id + + default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF + self.thresholds = metrics_utils.parse_init_thresholds( + thresholds, default_threshold=default_threshold + ) + self._thresholds_distributed_evenly = ( + metrics_utils.is_evenly_distributed_thresholds(self.thresholds) + ) + self.true_positives = self.add_weight( + "true_positives", shape=(len(self.thresholds),), initializer="zeros" + ) + self.false_positives = self.add_weight( + "false_positives", + shape=(len(self.thresholds),), + initializer="zeros", + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates true positive and false positive statistics. + + Args: + y_true: The ground truth values, with the same dimensions as `y_pred`. + Will be cast to `bool`. + y_pred: The predicted values. Each element must be in the range + `[0, 1]`. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. 
+ """ + return metrics_utils.update_confusion_matrix_variables( + { + metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, # noqa: E501 + metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives, # noqa: E501 + }, + y_true, + y_pred, + thresholds=self.thresholds, + thresholds_distributed_evenly=self._thresholds_distributed_evenly, + top_k=self.top_k, + class_id=self.class_id, + sample_weight=sample_weight, + ) + + def result(self): + result = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_positives), + ) + return result[0] if len(self.thresholds) == 1 else result + + def reset_state(self): + num_thresholds = len(to_list(self.thresholds)) + backend.batch_set_value( + [ + (v, np.zeros((num_thresholds,))) + for v in (self.true_positives, self.false_positives) + ] + ) + + def get_config(self): + config = { + "thresholds": self.init_thresholds, + "top_k": self.top_k, + "class_id": self.class_id, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.Recall") +class Recall(base_metric.Metric): + """Computes the recall of the predictions with respect to the labels. + + This metric creates two local variables, `true_positives` and + `false_negatives`, that are used to compute the recall. This value is + ultimately returned as `recall`, an idempotent operation that simply divides + `true_positives` by the sum of `true_positives` and `false_negatives`. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + If `top_k` is set, recall will be computed as how often on average a class + among the labels of a batch entry is in the top-k predictions. + + If `class_id` is specified, we calculate recall by considering only the + entries in the batch for which `class_id` is in the label, and computing the + fraction of them for which `class_id` is above the threshold and/or in the + top-k predictions. + + Args: + thresholds: (Optional) A float value, or a Python list/tuple of float + threshold values in [0, 1]. A threshold is compared with prediction + values to determine the truth value of predictions (i.e., above the + threshold is `true`, below is `false`). If used with a loss function + that sets `from_logits=True` (i.e. no sigmoid applied to predictions), + `thresholds` should be set to 0. One metric value is generated for each + threshold value. If neither thresholds nor top_k are set, the default is + to calculate recall with `thresholds=0.5`. + top_k: (Optional) Unset by default. An int value specifying the top-k + predictions to consider when calculating recall. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. 
+ + Standalone usage: + + >>> m = tf.keras.metrics.Recall() + >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) + >>> m.result().numpy() + 0.6666667 + + >>> m.reset_state() + >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.Recall()]) + ``` + + Usage with a loss with `from_logits=True`: + + ```python + model.compile(optimizer='adam', + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.Recall(thresholds=0)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, thresholds=None, top_k=None, class_id=None, name=None, dtype=None + ): + super().__init__(name=name, dtype=dtype) + self.init_thresholds = thresholds + self.top_k = top_k + self.class_id = class_id + + default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF + self.thresholds = metrics_utils.parse_init_thresholds( + thresholds, default_threshold=default_threshold + ) + self._thresholds_distributed_evenly = ( + metrics_utils.is_evenly_distributed_thresholds(self.thresholds) + ) + self.true_positives = self.add_weight( + "true_positives", shape=(len(self.thresholds),), initializer="zeros" + ) + self.false_negatives = self.add_weight( + "false_negatives", + shape=(len(self.thresholds),), + initializer="zeros", + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates true positive and false negative statistics. + + Args: + y_true: The ground truth values, with the same dimensions as `y_pred`. + Will be cast to `bool`. + y_pred: The predicted values. Each element must be in the range + `[0, 1]`. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. + """ + return metrics_utils.update_confusion_matrix_variables( + { + metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, # noqa: E501 + metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives, # noqa: E501 + }, + y_true, + y_pred, + thresholds=self.thresholds, + thresholds_distributed_evenly=self._thresholds_distributed_evenly, + top_k=self.top_k, + class_id=self.class_id, + sample_weight=sample_weight, + ) + + def result(self): + result = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_negatives), + ) + return result[0] if len(self.thresholds) == 1 else result + + def reset_state(self): + num_thresholds = len(to_list(self.thresholds)) + backend.batch_set_value( + [ + (v, np.zeros((num_thresholds,))) + for v in (self.true_positives, self.false_negatives) + ] + ) + + def get_config(self): + config = { + "thresholds": self.init_thresholds, + "top_k": self.top_k, + "class_id": self.class_id, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class SensitivitySpecificityBase(base_metric.Metric, metaclass=abc.ABCMeta): + """Abstract base class for computing sensitivity and specificity. + + For additional information about specificity and sensitivity, see + [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). 
+ """ + + def __init__( + self, value, num_thresholds=200, class_id=None, name=None, dtype=None + ): + super().__init__(name=name, dtype=dtype) + if num_thresholds <= 0: + raise ValueError( + "Argument `num_thresholds` must be an integer > 0. " + f"Received: num_thresholds={num_thresholds}" + ) + self.value = value + self.class_id = class_id + self.true_positives = self.add_weight( + "true_positives", shape=(num_thresholds,), initializer="zeros" + ) + self.true_negatives = self.add_weight( + "true_negatives", shape=(num_thresholds,), initializer="zeros" + ) + self.false_positives = self.add_weight( + "false_positives", shape=(num_thresholds,), initializer="zeros" + ) + self.false_negatives = self.add_weight( + "false_negatives", shape=(num_thresholds,), initializer="zeros" + ) + + # Compute `num_thresholds` thresholds in [0, 1] + if num_thresholds == 1: + self.thresholds = [0.5] + self._thresholds_distributed_evenly = False + else: + thresholds = [ + (i + 1) * 1.0 / (num_thresholds - 1) + for i in range(num_thresholds - 2) + ] + self.thresholds = [0.0] + thresholds + [1.0] + self._thresholds_distributed_evenly = True + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates confusion matrix statistics. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. + """ + return metrics_utils.update_confusion_matrix_variables( + { + metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, # noqa: E501 + metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: self.true_negatives, # noqa: E501 + metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives, # noqa: E501 + metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives, # noqa: E501 + }, + y_true, + y_pred, + thresholds=self.thresholds, + thresholds_distributed_evenly=self._thresholds_distributed_evenly, + class_id=self.class_id, + sample_weight=sample_weight, + ) + + def reset_state(self): + num_thresholds = len(self.thresholds) + confusion_matrix_variables = ( + self.true_positives, + self.true_negatives, + self.false_positives, + self.false_negatives, + ) + backend.batch_set_value( + [ + (v, np.zeros((num_thresholds,))) + for v in confusion_matrix_variables + ] + ) + + def get_config(self): + config = {"class_id": self.class_id} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _find_max_under_constraint(self, constrained, dependent, predicate): + """Returns the maximum of dependent_statistic that satisfies the + constraint. + + Args: + constrained: Over these values the constraint + is specified. A rank-1 tensor. + dependent: From these values the maximum that satiesfies the + constraint is selected. Values in this tensor and in + `constrained` are linked by having the same threshold at each + position, hence this tensor must have the same shape. + predicate: A binary boolean functor to be applied to arguments + `constrained` and `self.value`, e.g. `tf.greater`. + + Returns: + maximal dependent value, if no value satiesfies the constraint 0.0. 
+ """ + feasible = tf.where(predicate(constrained, self.value)) + feasible_exists = tf.greater(tf.size(feasible), 0) + max_dependent = tf.reduce_max(tf.gather(dependent, feasible)) + + return tf.where(feasible_exists, max_dependent, 0.0) + + +@keras_export("keras.metrics.SensitivityAtSpecificity") +class SensitivityAtSpecificity(SensitivitySpecificityBase): + """Computes best sensitivity where specificity is >= specified value. + + the sensitivity at a given specificity. + + `Sensitivity` measures the proportion of actual positives that are correctly + identified as such (tp / (tp + fn)). + `Specificity` measures the proportion of actual negatives that are correctly + identified as such (tn / (tn + fp)). + + This metric creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the sensitivity at the given specificity. The threshold for the + given specificity value is computed and used to evaluate the corresponding + sensitivity. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold + predictions, and computing the fraction of them for which `class_id` is + indeed a correct label. + + For additional information about specificity and sensitivity, see + [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). + + Args: + specificity: A scalar value in range `[0, 1]`. + num_thresholds: (Optional) The number of thresholds to + use for matching the given specificity. Defaults to `200`. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.SensitivityAtSpecificity(0.5) + >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_state() + >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], + ... sample_weight=[1, 1, 2, 2, 1]) + >>> m.result().numpy() + 0.333333 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.SensitivityAtSpecificity()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + specificity, + num_thresholds=200, + class_id=None, + name=None, + dtype=None, + ): + if specificity < 0 or specificity > 1: + raise ValueError( + "Argument `specificity` must be in the range [0, 1]. 
" + f"Received: specificity={specificity}" + ) + self.specificity = specificity + self.num_thresholds = num_thresholds + super().__init__( + specificity, + num_thresholds=num_thresholds, + class_id=class_id, + name=name, + dtype=dtype, + ) + + def result(self): + specificities = tf.math.divide_no_nan( + self.true_negatives, + tf.math.add(self.true_negatives, self.false_positives), + ) + sensitivities = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_negatives), + ) + return self._find_max_under_constraint( + specificities, sensitivities, tf.greater_equal + ) + + def get_config(self): + config = { + "num_thresholds": self.num_thresholds, + "specificity": self.specificity, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.SpecificityAtSensitivity") +class SpecificityAtSensitivity(SensitivitySpecificityBase): + """Computes best specificity where sensitivity is >= specified value. + + `Sensitivity` measures the proportion of actual positives that are correctly + identified as such (tp / (tp + fn)). + `Specificity` measures the proportion of actual negatives that are correctly + identified as such (tn / (tn + fp)). + + This metric creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the specificity at the given sensitivity. The threshold for the + given sensitivity value is computed and used to evaluate the corresponding + specificity. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold + predictions, and computing the fraction of them for which `class_id` is + indeed a correct label. + + For additional information about specificity and sensitivity, see + [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). + + Args: + sensitivity: A scalar value in range `[0, 1]`. + num_thresholds: (Optional) The number of thresholds to + use for matching the given sensitivity. Defaults to `200`. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.SpecificityAtSensitivity(0.5) + >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) + >>> m.result().numpy() + 0.66666667 + + >>> m.reset_state() + >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], + ... sample_weight=[1, 1, 2, 2, 2]) + >>> m.result().numpy() + 0.5 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.SpecificityAtSensitivity()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + sensitivity, + num_thresholds=200, + class_id=None, + name=None, + dtype=None, + ): + if sensitivity < 0 or sensitivity > 1: + raise ValueError( + "Argument `sensitivity` must be in the range [0, 1]. 
" + f"Received: sensitivity={sensitivity}" + ) + self.sensitivity = sensitivity + self.num_thresholds = num_thresholds + super().__init__( + sensitivity, + num_thresholds=num_thresholds, + class_id=class_id, + name=name, + dtype=dtype, + ) + + def result(self): + sensitivities = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_negatives), + ) + specificities = tf.math.divide_no_nan( + self.true_negatives, + tf.math.add(self.true_negatives, self.false_positives), + ) + return self._find_max_under_constraint( + sensitivities, specificities, tf.greater_equal + ) + + def get_config(self): + config = { + "num_thresholds": self.num_thresholds, + "sensitivity": self.sensitivity, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.PrecisionAtRecall") +class PrecisionAtRecall(SensitivitySpecificityBase): + """Computes best precision where recall is >= specified value. + + This metric creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the precision at the given recall. The threshold for the given + recall value is computed and used to evaluate the corresponding precision. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold + predictions, and computing the fraction of them for which `class_id` is + indeed a correct label. + + Args: + recall: A scalar value in range `[0, 1]`. + num_thresholds: (Optional) The number of thresholds to + use for matching the given recall. Defaults to `200`. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.PrecisionAtRecall(0.5) + >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_state() + >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], + ... sample_weight=[2, 2, 2, 1, 1]) + >>> m.result().numpy() + 0.33333333 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.PrecisionAtRecall(recall=0.8)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, recall, num_thresholds=200, class_id=None, name=None, dtype=None + ): + if recall < 0 or recall > 1: + raise ValueError( + "Argument `recall` must be in the range [0, 1]. 
" + f"Received: recall={recall}" + ) + self.recall = recall + self.num_thresholds = num_thresholds + super().__init__( + value=recall, + num_thresholds=num_thresholds, + class_id=class_id, + name=name, + dtype=dtype, + ) + + def result(self): + recalls = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_negatives), + ) + precisions = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_positives), + ) + return self._find_max_under_constraint( + recalls, precisions, tf.greater_equal + ) + + def get_config(self): + config = {"num_thresholds": self.num_thresholds, "recall": self.recall} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.RecallAtPrecision") +class RecallAtPrecision(SensitivitySpecificityBase): + """Computes best recall where precision is >= specified value. + + For a given score-label-distribution the required precision might not + be achievable, in this case 0.0 is returned as recall. + + This metric creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the recall at the given precision. The threshold for the given + precision value is computed and used to evaluate the corresponding recall. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold + predictions, and computing the fraction of them for which `class_id` is + indeed a correct label. + + Args: + precision: A scalar value in range `[0, 1]`. + num_thresholds: (Optional) The number of thresholds to + use for matching the given precision. Defaults to `200`. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.RecallAtPrecision(0.8) + >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_state() + >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], + ... sample_weight=[1, 0, 0, 1]) + >>> m.result().numpy() + 1.0 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.RecallAtPrecision(precision=0.8)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + precision, + num_thresholds=200, + class_id=None, + name=None, + dtype=None, + ): + if precision < 0 or precision > 1: + raise ValueError( + "Argument `precision` must be in the range [0, 1]. 
" + f"Received: precision={precision}" + ) + self.precision = precision + self.num_thresholds = num_thresholds + super().__init__( + value=precision, + num_thresholds=num_thresholds, + class_id=class_id, + name=name, + dtype=dtype, + ) + + def result(self): + precisions = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_positives), + ) + recalls = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_negatives), + ) + return self._find_max_under_constraint( + precisions, recalls, tf.greater_equal + ) + + def get_config(self): + config = { + "num_thresholds": self.num_thresholds, + "precision": self.precision, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.AUC") +class AUC(base_metric.Metric): + """Approximates the AUC (Area under the curve) of the ROC or PR curves. + + The AUC (Area under the curve) of the ROC (Receiver operating + characteristic; default) or PR (Precision Recall) curves are quality + measures of binary classifiers. Unlike the accuracy, and like cross-entropy + losses, ROC-AUC and PR-AUC evaluate all the operational points of a model. + + This class approximates AUCs using a Riemann sum. During the metric + accumulation phrase, predictions are accumulated within predefined buckets + by value. The AUC is then computed by interpolating per-bucket averages. + These buckets define the evaluated operational points. + + This metric creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the AUC. To discretize the AUC curve, a linearly spaced set of + thresholds is used to compute pairs of recall and precision values. The area + under the ROC-curve is therefore computed using the height of the recall + values by the false positive rate, while the area under the PR-curve is the + computed using the height of the precision values by the recall. + + This value is ultimately returned as `auc`, an idempotent operation that + computes the area under a discretized curve of precision versus recall + values (computed using the aforementioned variables). The `num_thresholds` + variable controls the degree of discretization with larger numbers of + thresholds more closely approximating the true AUC. The quality of the + approximation may vary dramatically depending on `num_thresholds`. The + `thresholds` parameter can be used to manually specify thresholds which + split the predictions more evenly. + + For a best approximation of the real AUC, `predictions` should be + distributed approximately uniformly in the range [0, 1] (if + `from_logits=False`). The quality of the AUC approximation may be poor if + this is not the case. Setting `summation_method` to 'minoring' or 'majoring' + can help quantify the error in the approximation by providing lower or upper + bound estimate of the AUC. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + num_thresholds: (Optional) The number of thresholds to + use when discretizing the roc curve. Values must be > 1. + Defaults to `200`. + curve: (Optional) Specifies the name of the curve to be computed, 'ROC' + [default] or 'PR' for the Precision-Recall-curve. + summation_method: (Optional) Specifies the [Riemann summation method]( + https://en.wikipedia.org/wiki/Riemann_sum) used. 
+ 'interpolation' (default) applies mid-point summation scheme for + `ROC`. For PR-AUC, interpolates (true/false) positives but not the + ratio that is precision (see Davis & Goadrich 2006 for details); + 'minoring' applies left summation for increasing intervals and right + summation for decreasing intervals; 'majoring' does the opposite. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + thresholds: (Optional) A list of floating point values to use as the + thresholds for discretizing the curve. If set, the `num_thresholds` + parameter is ignored. Values should be in [0, 1]. Endpoint thresholds + equal to {-epsilon, 1+epsilon} for a small positive epsilon value will + be automatically included with these to correctly handle predictions + equal to exactly 0 or 1. + multi_label: boolean indicating whether multilabel data should be + treated as such, wherein AUC is computed separately for each label and + then averaged across labels, or (when False) if the data should be + flattened into a single label before AUC computation. In the latter + case, when multilabel data is passed to AUC, each label-prediction pair + is treated as an individual data point. Should be set to False for + multi-class data. + num_labels: (Optional) The number of labels, used when `multi_label` is + True. If `num_labels` is not specified, then state variables get created + on the first call to `update_state`. + label_weights: (Optional) list, array, or tensor of non-negative weights + used to compute AUCs for multilabel data. When `multi_label` is True, + the weights are applied to the individual label AUCs when they are + averaged to produce the multi-label AUC. When it's False, they are used + to weight the individual label predictions in computing the confusion + matrix on the flattened data. Note that this is unlike class_weights in + that class_weights weights the example depending on the value of its + label, whereas label_weights depends only on the index of that label + before flattening; therefore `label_weights` should not be used for + multi-class data. + from_logits: boolean indicating whether the predictions (`y_pred` in + `update_state`) are probabilities or sigmoid logits. As a rule of thumb, + when using a keras loss, the `from_logits` constructor argument of the + loss should match the AUC `from_logits` constructor argument. + + Standalone usage: + + >>> m = tf.keras.metrics.AUC(num_thresholds=3) + >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + >>> # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7] + >>> # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] + >>> # tp_rate = recall = [1, 0.5, 0], fp_rate = [1, 0, 0] + >>> # auc = ((((1+0.5)/2)*(1-0)) + (((0.5+0)/2)*(0-0))) = 0.75 + >>> m.result().numpy() + 0.75 + + >>> m.reset_state() + >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], + ... sample_weight=[1, 0, 0, 1]) + >>> m.result().numpy() + 1.0 + + Usage with `compile()` API: + + ```python + # Reports the AUC of a model outputting a probability. + model.compile(optimizer='sgd', + loss=tf.keras.losses.BinaryCrossentropy(), + metrics=[tf.keras.metrics.AUC()]) + + # Reports the AUC of a model outputting a logit. 
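+    # `from_logits=True` tells the metric to pass `y_pred` through a
+    # sigmoid before bucketing, so it sees the same probabilities as the
+    # loss (see `update_state` below).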
+ model.compile(optimizer='sgd', + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[tf.keras.metrics.AUC(from_logits=True)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + num_thresholds=200, + curve="ROC", + summation_method="interpolation", + name=None, + dtype=None, + thresholds=None, + multi_label=False, + num_labels=None, + label_weights=None, + from_logits=False, + ): + # Validate configurations. + if isinstance(curve, metrics_utils.AUCCurve) and curve not in list( + metrics_utils.AUCCurve + ): + raise ValueError( + f'Invalid `curve` argument value "{curve}". ' + f"Expected one of: {list(metrics_utils.AUCCurve)}" + ) + if isinstance( + summation_method, metrics_utils.AUCSummationMethod + ) and summation_method not in list(metrics_utils.AUCSummationMethod): + raise ValueError( + "Invalid `summation_method` " + f'argument value "{summation_method}". ' + f"Expected one of: {list(metrics_utils.AUCSummationMethod)}" + ) + + # Update properties. + self._init_from_thresholds = thresholds is not None + if thresholds is not None: + # If specified, use the supplied thresholds. + self.num_thresholds = len(thresholds) + 2 + thresholds = sorted(thresholds) + self._thresholds_distributed_evenly = ( + metrics_utils.is_evenly_distributed_thresholds( + np.array([0.0] + thresholds + [1.0]) + ) + ) + else: + if num_thresholds <= 1: + raise ValueError( + "Argument `num_thresholds` must be an integer > 1. " + f"Received: num_thresholds={num_thresholds}" + ) + + # Otherwise, linearly interpolate (num_thresholds - 2) thresholds in + # (0, 1). + self.num_thresholds = num_thresholds + thresholds = [ + (i + 1) * 1.0 / (num_thresholds - 1) + for i in range(num_thresholds - 2) + ] + self._thresholds_distributed_evenly = True + + # Add an endpoint "threshold" below zero and above one for either + # threshold method to account for floating point imprecisions. + self._thresholds = np.array( + [0.0 - backend.epsilon()] + thresholds + [1.0 + backend.epsilon()] + ) + + if isinstance(curve, metrics_utils.AUCCurve): + self.curve = curve + else: + self.curve = metrics_utils.AUCCurve.from_str(curve) + if isinstance(summation_method, metrics_utils.AUCSummationMethod): + self.summation_method = summation_method + else: + self.summation_method = metrics_utils.AUCSummationMethod.from_str( + summation_method + ) + super().__init__(name=name, dtype=dtype) + + # Handle multilabel arguments. + self.multi_label = multi_label + self.num_labels = num_labels + if label_weights is not None: + label_weights = tf.constant(label_weights, dtype=self.dtype) + tf.debugging.assert_non_negative( + label_weights, + message="All values of `label_weights` must be non-negative.", + ) + self.label_weights = label_weights + + else: + self.label_weights = None + + self._from_logits = from_logits + + self._built = False + if self.multi_label: + if num_labels: + shape = tf.TensorShape([None, num_labels]) + self._build(shape) + else: + if num_labels: + raise ValueError( + "`num_labels` is needed only when `multi_label` is True." + ) + self._build(None) + + @property + def thresholds(self): + """The thresholds used for evaluating AUC.""" + return list(self._thresholds) + + def _build(self, shape): + """Initialize TP, FP, TN, and FN tensors, given the shape of the + data.""" + if self.multi_label: + if shape.ndims != 2: + raise ValueError( + "`y_pred` must have rank 2 when `multi_label=True`. " + f"Found rank {shape.ndims}. 
" + f"Full shape received for `y_pred`: {shape}" + ) + self._num_labels = shape[1] + variable_shape = tf.TensorShape( + [self.num_thresholds, self._num_labels] + ) + else: + variable_shape = tf.TensorShape([self.num_thresholds]) + + self._build_input_shape = shape + # Create metric variables + self.true_positives = self.add_weight( + "true_positives", shape=variable_shape, initializer="zeros" + ) + self.true_negatives = self.add_weight( + "true_negatives", shape=variable_shape, initializer="zeros" + ) + self.false_positives = self.add_weight( + "false_positives", shape=variable_shape, initializer="zeros" + ) + self.false_negatives = self.add_weight( + "false_negatives", shape=variable_shape, initializer="zeros" + ) + + if self.multi_label: + with tf.init_scope(): + # This should only be necessary for handling v1 behavior. In v2, + # AUC should be initialized outside of any tf.functions, and + # therefore in eager mode. + if not tf.executing_eagerly(): + backend._initialize_variables(backend._get_session()) + + self._built = True + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates confusion matrix statistics. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. + """ + if not self._built: + self._build(tf.TensorShape(y_pred.shape)) + + if self.multi_label or (self.label_weights is not None): + # y_true should have shape (number of examples, number of labels). + shapes = [(y_true, ("N", "L"))] + if self.multi_label: + # TP, TN, FP, and FN should all have shape + # (number of thresholds, number of labels). + shapes.extend( + [ + (self.true_positives, ("T", "L")), + (self.true_negatives, ("T", "L")), + (self.false_positives, ("T", "L")), + (self.false_negatives, ("T", "L")), + ] + ) + if self.label_weights is not None: + # label_weights should be of length equal to the number of + # labels. + shapes.append((self.label_weights, ("L",))) + tf.debugging.assert_shapes( + shapes, message="Number of labels is not consistent." + ) + + # Only forward label_weights to update_confusion_matrix_variables when + # multi_label is False. Otherwise the averaging of individual label AUCs + # is handled in AUC.result + label_weights = None if self.multi_label else self.label_weights + + if self._from_logits: + y_pred = activations.sigmoid(y_pred) + + return metrics_utils.update_confusion_matrix_variables( + { + metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, # noqa: E501 + metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: self.true_negatives, # noqa: E501 + metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives, # noqa: E501 + metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives, # noqa: E501 + }, + y_true, + y_pred, + self._thresholds, + thresholds_distributed_evenly=self._thresholds_distributed_evenly, + sample_weight=sample_weight, + multi_label=self.multi_label, + label_weights=label_weights, + ) + + def interpolate_pr_auc(self): + """Interpolation formula inspired by section 4 of Davis & Goadrich 2006. 
+ + https://www.biostat.wisc.edu/~page/rocpr.pdf + + Note here we derive & use a closed formula not present in the paper + as follows: + + Precision = TP / (TP + FP) = TP / P + + Modeling all of TP (true positive), FP (false positive) and their sum + P = TP + FP (predicted positive) as varying linearly within each + interval [A, B] between successive thresholds, we get + + Precision slope = dTP / dP + = (TP_B - TP_A) / (P_B - P_A) + = (TP - TP_A) / (P - P_A) + Precision = (TP_A + slope * (P - P_A)) / P + + The area within the interval is (slope / total_pos_weight) times + + int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P} + int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P} + + where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in + + int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A) + + Bringing back the factor (slope / total_pos_weight) we'd put aside, we + get + + slope * [dTP + intercept * log(P_B / P_A)] / total_pos_weight + + where dTP == TP_B - TP_A. + + Note that when P_A == 0 the above calculation simplifies into + + int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A) + + which is really equivalent to imputing constant precision throughout the + first bucket having >0 true positives. + + Returns: + pr_auc: an approximation of the area under the P-R curve. + """ + dtp = ( + self.true_positives[: self.num_thresholds - 1] + - self.true_positives[1:] + ) + p = tf.math.add(self.true_positives, self.false_positives) + dp = p[: self.num_thresholds - 1] - p[1:] + prec_slope = tf.math.divide_no_nan( + dtp, tf.maximum(dp, 0), name="prec_slope" + ) + intercept = self.true_positives[1:] - tf.multiply(prec_slope, p[1:]) + + safe_p_ratio = tf.where( + tf.logical_and(p[: self.num_thresholds - 1] > 0, p[1:] > 0), + tf.math.divide_no_nan( + p[: self.num_thresholds - 1], + tf.maximum(p[1:], 0), + name="recall_relative_ratio", + ), + tf.ones_like(p[1:]), + ) + + pr_auc_increment = tf.math.divide_no_nan( + prec_slope * (dtp + intercept * tf.math.log(safe_p_ratio)), + tf.maximum(self.true_positives[1:] + self.false_negatives[1:], 0), + name="pr_auc_increment", + ) + + if self.multi_label: + by_label_auc = tf.reduce_sum( + pr_auc_increment, name=self.name + "_by_label", axis=0 + ) + if self.label_weights is None: + # Evenly weighted average of the label AUCs. + return tf.reduce_mean(by_label_auc, name=self.name) + else: + # Weighted average of the label AUCs. + return tf.math.divide_no_nan( + tf.reduce_sum( + tf.multiply(by_label_auc, self.label_weights) + ), + tf.reduce_sum(self.label_weights), + name=self.name, + ) + else: + return tf.reduce_sum(pr_auc_increment, name="interpolate_pr_auc") + + def result(self): + if ( + self.curve == metrics_utils.AUCCurve.PR + and self.summation_method + == metrics_utils.AUCSummationMethod.INTERPOLATION + ): + # This use case is different and is handled separately. + return self.interpolate_pr_auc() + + # Set `x` and `y` values for the curves based on `curve` config. + recall = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_negatives), + ) + if self.curve == metrics_utils.AUCCurve.ROC: + fp_rate = tf.math.divide_no_nan( + self.false_positives, + tf.math.add(self.false_positives, self.true_negatives), + ) + x = fp_rate + y = recall + else: # curve == 'PR'. 
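+            # For the PR curve the axes flip relative to ROC: precision is
+            # plotted against recall, so the Riemann rectangles computed
+            # below integrate precision (y) over recall (x).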
+ precision = tf.math.divide_no_nan( + self.true_positives, + tf.math.add(self.true_positives, self.false_positives), + ) + x = recall + y = precision + + # Find the rectangle heights based on `summation_method`. + if ( + self.summation_method + == metrics_utils.AUCSummationMethod.INTERPOLATION + ): + # Note: the case ('PR', 'interpolation') has been handled above. + heights = (y[: self.num_thresholds - 1] + y[1:]) / 2.0 + elif self.summation_method == metrics_utils.AUCSummationMethod.MINORING: + heights = tf.minimum(y[: self.num_thresholds - 1], y[1:]) + # self.summation_method = metrics_utils.AUCSummationMethod.MAJORING: + else: + heights = tf.maximum(y[: self.num_thresholds - 1], y[1:]) + + # Sum up the areas of all the rectangles. + if self.multi_label: + riemann_terms = tf.multiply( + x[: self.num_thresholds - 1] - x[1:], heights + ) + by_label_auc = tf.reduce_sum( + riemann_terms, name=self.name + "_by_label", axis=0 + ) + + if self.label_weights is None: + # Unweighted average of the label AUCs. + return tf.reduce_mean(by_label_auc, name=self.name) + else: + # Weighted average of the label AUCs. + return tf.math.divide_no_nan( + tf.reduce_sum( + tf.multiply(by_label_auc, self.label_weights) + ), + tf.reduce_sum(self.label_weights), + name=self.name, + ) + else: + return tf.reduce_sum( + tf.multiply(x[: self.num_thresholds - 1] - x[1:], heights), + name=self.name, + ) + + def reset_state(self): + if self._built: + confusion_matrix_variables = ( + self.true_positives, + self.true_negatives, + self.false_positives, + self.false_negatives, + ) + if self.multi_label: + backend.batch_set_value( + [ + (v, np.zeros((self.num_thresholds, self._num_labels))) + for v in confusion_matrix_variables + ] + ) + else: + backend.batch_set_value( + [ + (v, np.zeros((self.num_thresholds,))) + for v in confusion_matrix_variables + ] + ) + + def get_config(self): + if is_tensor_or_variable(self.label_weights): + label_weights = backend.eval(self.label_weights) + else: + label_weights = self.label_weights + config = { + "num_thresholds": self.num_thresholds, + "curve": self.curve.value, + "summation_method": self.summation_method.value, + "multi_label": self.multi_label, + "num_labels": self.num_labels, + "label_weights": label_weights, + "from_logits": self._from_logits, + } + # optimization to avoid serializing a large number of generated + # thresholds + if self._init_from_thresholds: + # We remove the endpoint thresholds as an inverse of how the + # thresholds were initialized. This ensures that a metric + # initialized from this config has the same thresholds. + config["thresholds"] = self.thresholds[1:-1] + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/metrics/confusion_metrics_test.py b/keras/metrics/confusion_metrics_test.py new file mode 100644 index 000000000000..a647e4efc67a --- /dev/null +++ b/keras/metrics/confusion_metrics_test.py @@ -0,0 +1,2739 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for confusion metrics.""" + +import json + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized +from tensorflow.python.platform import tf_logging + +from keras import backend +from keras import layers +from keras import metrics +from keras import models +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import metrics_utils + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class FalsePositivesTest(tf.test.TestCase, parameterized.TestCase): + def test_config(self): + fp_obj = metrics.FalsePositives(name="my_fp", thresholds=[0.4, 0.9]) + self.assertEqual(fp_obj.name, "my_fp") + self.assertLen(fp_obj.variables, 1) + self.assertEqual(fp_obj.thresholds, [0.4, 0.9]) + + # Check save and restore config + fp_obj2 = metrics.FalsePositives.from_config(fp_obj.get_config()) + self.assertEqual(fp_obj2.name, "my_fp") + self.assertLen(fp_obj2.variables, 1) + self.assertEqual(fp_obj2.thresholds, [0.4, 0.9]) + + def test_unweighted(self): + fp_obj = metrics.FalsePositives() + self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) + + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = fp_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = fp_obj.result() + self.assertAllClose(7.0, result) + + def test_weighted(self): + fp_obj = metrics.FalsePositives() + self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = fp_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(14.0, self.evaluate(result)) + + def test_unweighted_with_thresholds(self): + fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + + update_op = fp_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = fp_obj.result() + self.assertAllClose([7.0, 4.0, 2.0], result) + + def test_weighted_with_thresholds(self): + fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + sample_weight = ( + (1.0, 2.0, 3.0, 5.0), + (7.0, 11.0, 13.0, 17.0), + (19.0, 23.0, 29.0, 31.0), + (5.0, 15.0, 10.0, 0), + ) + + result = fp_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose([125.0, 42.0, 12.0], self.evaluate(result)) + + def test_threshold_limit(self): + with self.assertRaisesRegex( + ValueError, + r"Threshold values must be in \[0, 1\]. 
Received: \[-1, 2\]", + ): + metrics.FalsePositives(thresholds=[-1, 0.5, 2]) + + with self.assertRaisesRegex( + ValueError, + r"Threshold values must be in \[0, 1\]. Received: \[None\]", + ): + metrics.FalsePositives(thresholds=[None]) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class FalseNegativesTest(tf.test.TestCase, parameterized.TestCase): + def test_config(self): + fn_obj = metrics.FalseNegatives(name="my_fn", thresholds=[0.4, 0.9]) + self.assertEqual(fn_obj.name, "my_fn") + self.assertLen(fn_obj.variables, 1) + self.assertEqual(fn_obj.thresholds, [0.4, 0.9]) + + # Check save and restore config + fn_obj2 = metrics.FalseNegatives.from_config(fn_obj.get_config()) + self.assertEqual(fn_obj2.name, "my_fn") + self.assertLen(fn_obj2.variables, 1) + self.assertEqual(fn_obj2.thresholds, [0.4, 0.9]) + + def test_unweighted(self): + fn_obj = metrics.FalseNegatives() + self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) + + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = fn_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = fn_obj.result() + self.assertAllClose(3.0, result) + + def test_weighted(self): + fn_obj = metrics.FalseNegatives() + self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = fn_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(5.0, self.evaluate(result)) + + def test_unweighted_with_thresholds(self): + fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + + update_op = fn_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = fn_obj.result() + self.assertAllClose([1.0, 4.0, 6.0], result) + + def test_weighted_with_thresholds(self): + fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,)) + + result = fn_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose([4.0, 16.0, 23.0], self.evaluate(result)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class TrueNegativesTest(tf.test.TestCase, parameterized.TestCase): + def test_config(self): + tn_obj = metrics.TrueNegatives(name="my_tn", thresholds=[0.4, 0.9]) + self.assertEqual(tn_obj.name, "my_tn") + self.assertLen(tn_obj.variables, 1) + self.assertEqual(tn_obj.thresholds, [0.4, 0.9]) + + # Check save and restore config + tn_obj2 = metrics.TrueNegatives.from_config(tn_obj.get_config()) + self.assertEqual(tn_obj2.name, "my_tn") + self.assertLen(tn_obj2.variables, 1) + self.assertEqual(tn_obj2.thresholds, 
[0.4, 0.9]) + + def test_unweighted(self): + tn_obj = metrics.TrueNegatives() + self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) + + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = tn_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = tn_obj.result() + self.assertAllClose(3.0, result) + + def test_weighted(self): + tn_obj = metrics.TrueNegatives() + self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = tn_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(4.0, self.evaluate(result)) + + def test_unweighted_with_thresholds(self): + tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + + update_op = tn_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = tn_obj.result() + self.assertAllClose([2.0, 5.0, 7.0], result) + + def test_weighted_with_thresholds(self): + tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + sample_weight = ((0.0, 2.0, 3.0, 5.0),) + + result = tn_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose([5.0, 15.0, 23.0], self.evaluate(result)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class TruePositivesTest(tf.test.TestCase, parameterized.TestCase): + def test_config(self): + tp_obj = metrics.TruePositives(name="my_tp", thresholds=[0.4, 0.9]) + self.assertEqual(tp_obj.name, "my_tp") + self.assertLen(tp_obj.variables, 1) + self.assertEqual(tp_obj.thresholds, [0.4, 0.9]) + + # Check save and restore config + tp_obj2 = metrics.TruePositives.from_config(tp_obj.get_config()) + self.assertEqual(tp_obj2.name, "my_tp") + self.assertLen(tp_obj2.variables, 1) + self.assertEqual(tp_obj2.thresholds, [0.4, 0.9]) + + def test_unweighted(self): + tp_obj = metrics.TruePositives() + self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) + + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = tp_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = tp_obj.result() + self.assertAllClose(7.0, result) + + def test_weighted(self): + tp_obj = metrics.TruePositives() + self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = 
tf.constant((1.0, 1.5, 2.0, 2.5)) + result = tp_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(12.0, self.evaluate(result)) + + def test_unweighted_with_thresholds(self): + tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + + update_op = tp_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = tp_obj.result() + self.assertAllClose([6.0, 3.0, 1.0], result) + + def test_weighted_with_thresholds(self): + tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85]) + self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) + + y_pred = tf.constant( + ( + (0.9, 0.2, 0.8, 0.1), + (0.2, 0.9, 0.7, 0.6), + (0.1, 0.2, 0.4, 0.3), + (0, 1, 0.7, 0.3), + ) + ) + y_true = tf.constant( + ((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0), (1, 1, 1, 1)) + ) + + result = tp_obj(y_true, y_pred, sample_weight=37.0) + self.assertAllClose([222.0, 111.0, 37.0], self.evaluate(result)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class PrecisionTest(tf.test.TestCase, parameterized.TestCase): + def test_config(self): + p_obj = metrics.Precision( + name="my_precision", thresholds=[0.4, 0.9], top_k=15, class_id=12 + ) + self.assertEqual(p_obj.name, "my_precision") + self.assertLen(p_obj.variables, 2) + self.assertEqual( + [v.name for v in p_obj.variables], + ["true_positives:0", "false_positives:0"], + ) + self.assertEqual(p_obj.thresholds, [0.4, 0.9]) + self.assertEqual(p_obj.top_k, 15) + self.assertEqual(p_obj.class_id, 12) + + # Check save and restore config + p_obj2 = metrics.Precision.from_config(p_obj.get_config()) + self.assertEqual(p_obj2.name, "my_precision") + self.assertLen(p_obj2.variables, 2) + self.assertEqual(p_obj2.thresholds, [0.4, 0.9]) + self.assertEqual(p_obj2.top_k, 15) + self.assertEqual(p_obj2.class_id, 12) + + def test_value_is_idempotent(self): + p_obj = metrics.Precision(thresholds=[0.3, 0.72]) + y_pred = tf.random.uniform(shape=(10, 3)) + y_true = tf.random.uniform(shape=(10, 3)) + update_op = p_obj.update_state(y_true, y_pred) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + + # Run several updates. + for _ in range(10): + self.evaluate(update_op) + + # Then verify idempotency. 
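+        # `result()` must not mutate metric state: with no further
+        # `update_state` calls, repeated evaluations are expected to
+        # return the same value.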
+ initial_precision = self.evaluate(p_obj.result()) + for _ in range(10): + self.assertArrayNear( + initial_precision, self.evaluate(p_obj.result()), 1e-3 + ) + + def test_unweighted(self): + p_obj = metrics.Precision() + y_pred = tf.constant([1, 0, 1, 0], shape=(1, 4)) + y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(0.5, self.evaluate(result)) + + def test_unweighted_all_incorrect(self): + p_obj = metrics.Precision(thresholds=[0.5]) + inputs = np.random.randint(0, 2, size=(100, 1)) + y_pred = tf.constant(inputs) + y_true = tf.constant(1 - inputs) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(0, self.evaluate(result)) + + def test_weighted(self): + p_obj = metrics.Precision() + y_pred = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]]) + y_true = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + result = p_obj( + y_true, + y_pred, + sample_weight=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]]), + ) + weighted_tp = 3.0 + 4.0 + weighted_positives = (1.0 + 3.0) + (4.0 + 2.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual(expected_precision, self.evaluate(result)) + + def test_div_by_zero(self): + p_obj = metrics.Precision() + y_pred = tf.constant([0, 0, 0, 0]) + y_true = tf.constant([0, 0, 0, 0]) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + result = p_obj(y_true, y_pred) + self.assertEqual(0, self.evaluate(result)) + + def test_unweighted_with_threshold(self): + p_obj = metrics.Precision(thresholds=[0.5, 0.7]) + y_pred = tf.constant([1, 0, 0.6, 0], shape=(1, 4)) + y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + result = p_obj(y_true, y_pred) + self.assertArrayNear([0.5, 0.0], self.evaluate(result), 0) + + def test_weighted_with_threshold(self): + p_obj = metrics.Precision(thresholds=[0.5, 1.0]) + y_true = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + y_pred = tf.constant([[1, 0], [0.6, 0]], shape=(2, 2), dtype=tf.float32) + weights = tf.constant([[4, 0], [3, 1]], shape=(2, 2), dtype=tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + result = p_obj(y_true, y_pred, sample_weight=weights) + weighted_tp = 0 + 3.0 + weighted_positives = (0 + 3.0) + (4.0 + 0.0) + expected_precision = weighted_tp / weighted_positives + self.assertArrayNear( + [expected_precision, 0], self.evaluate(result), 1e-3 + ) + + def test_multiple_updates(self): + p_obj = metrics.Precision(thresholds=[0.5, 1.0]) + y_true = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + y_pred = tf.constant([[1, 0], [0.6, 0]], shape=(2, 2), dtype=tf.float32) + weights = tf.constant([[4, 0], [3, 1]], shape=(2, 2), dtype=tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights) + for _ in range(2): + self.evaluate(update_op) + + weighted_tp = (0 + 3.0) + (0 + 3.0) + weighted_positives = ((0 + 3.0) + (4.0 + 0.0)) + ( + (0 + 3.0) + (4.0 + 0.0) + ) + expected_precision = weighted_tp / weighted_positives + self.assertArrayNear( + [expected_precision, 0], self.evaluate(p_obj.result()), 1e-3 + ) + + def test_unweighted_top_k(self): + p_obj = metrics.Precision(top_k=3) + y_pred = tf.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5)) + y_true = 
tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(1.0 / 3, self.evaluate(result)) + + def test_weighted_top_k(self): + p_obj = metrics.Precision(top_k=3) + y_pred1 = tf.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5)) + y_true1 = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + self.evaluate( + p_obj( + y_true1, y_pred1, sample_weight=tf.constant([[1, 4, 2, 3, 5]]) + ) + ) + + y_pred2 = tf.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5)) + y_true2 = tf.constant([1, 0, 1, 1, 1], shape=(1, 5)) + result = p_obj(y_true2, y_pred2, sample_weight=tf.constant(3)) + + tp = (2 + 5) + (3 + 3) + predicted_positives = (1 + 2 + 5) + (3 + 3 + 3) + expected_precision = tp / predicted_positives + self.assertAlmostEqual(expected_precision, self.evaluate(result)) + + def test_unweighted_class_id(self): + p_obj = metrics.Precision(class_id=2) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + + y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(1, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) + self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) + + y_pred = tf.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(1, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) + self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) + + y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 0, 0, 0], shape=(1, 5)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(0.5, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) + self.assertAlmostEqual(1, self.evaluate(p_obj.false_positives)) + + def test_unweighted_top_k_and_class_id(self): + p_obj = metrics.Precision(class_id=2, top_k=2) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + + y_pred = tf.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(1, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) + self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) + + y_pred = tf.constant([1, 1, 0.9, 1, 1], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(1, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) + self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) + + def test_unweighted_top_k_and_threshold(self): + p_obj = metrics.Precision(thresholds=0.7, top_k=2) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + + y_pred = tf.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) + result = p_obj(y_true, y_pred) + self.assertAlmostEqual(1, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives)) + self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class RecallTest(tf.test.TestCase, 
parameterized.TestCase): + def test_config(self): + r_obj = metrics.Recall( + name="my_recall", thresholds=[0.4, 0.9], top_k=15, class_id=12 + ) + self.assertEqual(r_obj.name, "my_recall") + self.assertLen(r_obj.variables, 2) + self.assertEqual( + [v.name for v in r_obj.variables], + ["true_positives:0", "false_negatives:0"], + ) + self.assertEqual(r_obj.thresholds, [0.4, 0.9]) + self.assertEqual(r_obj.top_k, 15) + self.assertEqual(r_obj.class_id, 12) + + # Check save and restore config + r_obj2 = metrics.Recall.from_config(r_obj.get_config()) + self.assertEqual(r_obj2.name, "my_recall") + self.assertLen(r_obj2.variables, 2) + self.assertEqual(r_obj2.thresholds, [0.4, 0.9]) + self.assertEqual(r_obj2.top_k, 15) + self.assertEqual(r_obj2.class_id, 12) + + def test_value_is_idempotent(self): + r_obj = metrics.Recall(thresholds=[0.3, 0.72]) + y_pred = tf.random.uniform(shape=(10, 3)) + y_true = tf.random.uniform(shape=(10, 3)) + update_op = r_obj.update_state(y_true, y_pred) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + + # Run several updates. + for _ in range(10): + self.evaluate(update_op) + + # Then verify idempotency. + initial_recall = self.evaluate(r_obj.result()) + for _ in range(10): + self.assertArrayNear( + initial_recall, self.evaluate(r_obj.result()), 1e-3 + ) + + def test_unweighted(self): + r_obj = metrics.Recall() + y_pred = tf.constant([1, 0, 1, 0], shape=(1, 4)) + y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + result = r_obj(y_true, y_pred) + self.assertAlmostEqual(0.5, self.evaluate(result)) + + def test_unweighted_all_incorrect(self): + r_obj = metrics.Recall(thresholds=[0.5]) + inputs = np.random.randint(0, 2, size=(100, 1)) + y_pred = tf.constant(inputs) + y_true = tf.constant(1 - inputs) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + result = r_obj(y_true, y_pred) + self.assertAlmostEqual(0, self.evaluate(result)) + + def test_weighted(self): + r_obj = metrics.Recall() + y_pred = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]]) + y_true = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + result = r_obj( + y_true, + y_pred, + sample_weight=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]]), + ) + weighted_tp = 3.0 + 1.0 + weighted_t = (2.0 + 3.0) + (4.0 + 1.0) + expected_recall = weighted_tp / weighted_t + self.assertAlmostEqual(expected_recall, self.evaluate(result)) + + def test_div_by_zero(self): + r_obj = metrics.Recall() + y_pred = tf.constant([0, 0, 0, 0]) + y_true = tf.constant([0, 0, 0, 0]) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + result = r_obj(y_true, y_pred) + self.assertEqual(0, self.evaluate(result)) + + def test_unweighted_with_threshold(self): + r_obj = metrics.Recall(thresholds=[0.5, 0.7]) + y_pred = tf.constant([1, 0, 0.6, 0], shape=(1, 4)) + y_true = tf.constant([0, 1, 1, 0], shape=(1, 4)) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + result = r_obj(y_true, y_pred) + self.assertArrayNear([0.5, 0.0], self.evaluate(result), 0) + + def test_weighted_with_threshold(self): + r_obj = metrics.Recall(thresholds=[0.5, 1.0]) + y_true = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + y_pred = tf.constant([[1, 0], [0.6, 0]], shape=(2, 2), dtype=tf.float32) + weights = tf.constant([[1, 4], [3, 2]], shape=(2, 2), dtype=tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + result = r_obj(y_true, y_pred, 
sample_weight=weights) + weighted_tp = 0 + 3.0 + weighted_positives = (0 + 3.0) + (4.0 + 0.0) + expected_recall = weighted_tp / weighted_positives + self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3) + + def test_multiple_updates(self): + r_obj = metrics.Recall(thresholds=[0.5, 1.0]) + y_true = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + y_pred = tf.constant([[1, 0], [0.6, 0]], shape=(2, 2), dtype=tf.float32) + weights = tf.constant([[1, 4], [3, 2]], shape=(2, 2), dtype=tf.float32) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights) + for _ in range(2): + self.evaluate(update_op) + + weighted_tp = (0 + 3.0) + (0 + 3.0) + weighted_positives = ((0 + 3.0) + (4.0 + 0.0)) + ( + (0 + 3.0) + (4.0 + 0.0) + ) + expected_recall = weighted_tp / weighted_positives + self.assertArrayNear( + [expected_recall, 0], self.evaluate(r_obj.result()), 1e-3 + ) + + def test_unweighted_top_k(self): + r_obj = metrics.Recall(top_k=3) + y_pred = tf.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + result = r_obj(y_true, y_pred) + self.assertAlmostEqual(0.5, self.evaluate(result)) + + def test_weighted_top_k(self): + r_obj = metrics.Recall(top_k=3) + y_pred1 = tf.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5)) + y_true1 = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + self.evaluate( + r_obj( + y_true1, y_pred1, sample_weight=tf.constant([[1, 4, 2, 3, 5]]) + ) + ) + + y_pred2 = tf.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5)) + y_true2 = tf.constant([1, 0, 1, 1, 1], shape=(1, 5)) + result = r_obj(y_true2, y_pred2, sample_weight=tf.constant(3)) + + tp = (2 + 5) + (3 + 3) + positives = (4 + 2 + 5) + (3 + 3 + 3 + 3) + expected_recall = tp / positives + self.assertAlmostEqual(expected_recall, self.evaluate(result)) + + def test_unweighted_class_id(self): + r_obj = metrics.Recall(class_id=2) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + + y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + result = r_obj(y_true, y_pred) + self.assertAlmostEqual(1, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) + self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives)) + + y_pred = tf.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + result = r_obj(y_true, y_pred) + self.assertAlmostEqual(0.5, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) + self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives)) + + y_pred = tf.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 0, 0, 0], shape=(1, 5)) + result = r_obj(y_true, y_pred) + self.assertAlmostEqual(0.5, self.evaluate(result)) + self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives)) + self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives)) + + def test_unweighted_top_k_and_class_id(self): + r_obj = metrics.Recall(class_id=2, top_k=2) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + + y_pred = tf.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5)) + y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5)) + result = r_obj(y_true, y_pred) + self.assertAlmostEqual(1, self.evaluate(result)) + 
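# With top_k=2, only the two highest scores (indices 1 and 2) count
+ # as predicted positives; class_id=2 is among them and its label is
+ # positive, so tp = 1 and fn = 0 here.
+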
self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+ self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives))
+
+ y_pred = tf.constant([1, 1, 0.9, 1, 1], shape=(1, 5))
+ y_true = tf.constant([0, 1, 1, 0, 0], shape=(1, 5))
+ result = r_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.5, self.evaluate(result))
+ self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+ self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives))
+
+ def test_unweighted_top_k_and_threshold(self):
+ r_obj = metrics.Recall(thresholds=0.7, top_k=2)
+ self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables))
+
+ y_pred = tf.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5))
+ y_true = tf.constant([1, 1, 1, 0, 1], shape=(1, 5))
+ result = r_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.25, self.evaluate(result))
+ self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+ self.assertAlmostEqual(3, self.evaluate(r_obj.false_negatives))
+
+
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class SensitivityAtSpecificityTest(tf.test.TestCase, parameterized.TestCase):
+ def test_config(self):
+ s_obj = metrics.SensitivityAtSpecificity(
+ 0.4,
+ num_thresholds=100,
+ class_id=12,
+ name="sensitivity_at_specificity_1",
+ )
+ self.assertEqual(s_obj.name, "sensitivity_at_specificity_1")
+ self.assertLen(s_obj.variables, 4)
+ self.assertEqual(s_obj.specificity, 0.4)
+ self.assertEqual(s_obj.num_thresholds, 100)
+ self.assertEqual(s_obj.class_id, 12)
+
+ # Check save and restore config
+ s_obj2 = metrics.SensitivityAtSpecificity.from_config(
+ s_obj.get_config()
+ )
+ self.assertEqual(s_obj2.name, "sensitivity_at_specificity_1")
+ self.assertLen(s_obj2.variables, 4)
+ self.assertEqual(s_obj2.specificity, 0.4)
+ self.assertEqual(s_obj2.num_thresholds, 100)
+ self.assertEqual(s_obj2.class_id, 12)
+
+ def test_value_is_idempotent(self):
+ s_obj = metrics.SensitivityAtSpecificity(0.7)
+ y_pred = tf.random.uniform((10, 3), maxval=1, dtype=tf.float32, seed=1)
+ y_true = tf.random.uniform((10, 3), maxval=2, dtype=tf.int64, seed=1)
+ update_op = s_obj.update_state(y_true, y_pred)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+
+ # Run several updates.
+ for _ in range(10):
+ self.evaluate(update_op)
+
+ # Then verify idempotency.
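+ # result() should be a pure read of the accumulated confusion-matrix
+ # variables: evaluating it repeatedly without further update_state
+ # calls must keep returning the same value.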
+ initial_sensitivity = self.evaluate(s_obj.result())
+ for _ in range(10):
+ self.assertAlmostEqual(
+ initial_sensitivity, self.evaluate(s_obj.result()), 1e-3
+ )
+
+ def test_unweighted_all_correct(self):
+ with self.test_session():
+ s_obj = metrics.SensitivityAtSpecificity(0.7)
+ inputs = np.random.randint(0, 2, size=(100, 1))
+ y_pred = tf.constant(inputs, dtype=tf.float32)
+ y_true = tf.constant(inputs)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(1, self.evaluate(result))
+
+ def test_unweighted_high_specificity(self):
+ s_obj = metrics.SensitivityAtSpecificity(0.8)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.8, self.evaluate(result))
+
+ def test_unweighted_low_specificity(self):
+ s_obj = metrics.SensitivityAtSpecificity(0.4)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.6, self.evaluate(result))
+
+ def test_unweighted_class_id(self):
+ s_obj = metrics.SensitivityAtSpecificity(0.4, class_id=2)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+ label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2]
+
+ y_pred = tf.transpose([pred_values] * 3)
+ y_true = tf.one_hot(label_values, depth=3)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.6, self.evaluate(result))
+
+ @parameterized.parameters([tf.bool, tf.int32, tf.float32])
+ def test_weighted(self, label_dtype):
+ s_obj = metrics.SensitivityAtSpecificity(0.4)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+ weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.cast(label_values, dtype=label_dtype)
+ weights = tf.constant(weight_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred, sample_weight=weights)
+ self.assertAlmostEqual(0.675, self.evaluate(result))
+
+ def test_invalid_specificity(self):
+ with self.assertRaisesRegex(
+ ValueError, r"`specificity` must be in the range \[0, 1\]."
+ ):
+ metrics.SensitivityAtSpecificity(-1)
+
+ def test_invalid_num_thresholds(self):
+ with self.assertRaisesRegex(
+ ValueError, "Argument `num_thresholds` must be an integer > 0"
+ ):
+ metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
+
+
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class SpecificityAtSensitivityTest(tf.test.TestCase, parameterized.TestCase):
+ def test_config(self):
+ s_obj = metrics.SpecificityAtSensitivity(
+ 0.4,
+ num_thresholds=100,
+ class_id=12,
+ name="specificity_at_sensitivity_1",
+ )
+ self.assertEqual(s_obj.name, "specificity_at_sensitivity_1")
+ self.assertLen(s_obj.variables, 4)
+ self.assertEqual(s_obj.sensitivity, 0.4)
+ self.assertEqual(s_obj.num_thresholds, 100)
+ self.assertEqual(s_obj.class_id, 12)
+
+ # Check save and restore config
+ s_obj2 = metrics.SpecificityAtSensitivity.from_config(
+ s_obj.get_config()
+ )
+ self.assertEqual(s_obj2.name, "specificity_at_sensitivity_1")
+ self.assertLen(s_obj2.variables, 4)
+ self.assertEqual(s_obj2.sensitivity, 0.4)
+ self.assertEqual(s_obj2.num_thresholds, 100)
+ self.assertEqual(s_obj2.class_id, 12)
+
+ def test_value_is_idempotent(self):
+ s_obj = metrics.SpecificityAtSensitivity(0.7)
+ y_pred = tf.random.uniform((10, 3), maxval=1, dtype=tf.float32, seed=1)
+ y_true = tf.random.uniform((10, 3), maxval=2, dtype=tf.int64, seed=1)
+ update_op = s_obj.update_state(y_true, y_pred)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+
+ # Run several updates.
+ for _ in range(10):
+ self.evaluate(update_op)
+
+ # Then verify idempotency.
+ initial_specificity = self.evaluate(s_obj.result())
+ for _ in range(10):
+ self.assertAlmostEqual(
+ initial_specificity, self.evaluate(s_obj.result()), 1e-3
+ )
+
+ def test_unweighted_all_correct(self):
+ s_obj = metrics.SpecificityAtSensitivity(0.7)
+ inputs = np.random.randint(0, 2, size=(100, 1))
+ y_pred = tf.constant(inputs, dtype=tf.float32)
+ y_true = tf.constant(inputs)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(1, self.evaluate(result))
+
+ def test_unweighted_high_sensitivity(self):
+ s_obj = metrics.SpecificityAtSensitivity(1.0)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.2, self.evaluate(result))
+
+ def test_unweighted_low_sensitivity(self):
+ s_obj = metrics.SpecificityAtSensitivity(0.4)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.6, self.evaluate(result))
+
+ def test_unweighted_class_id(self):
+ s_obj = metrics.SpecificityAtSensitivity(0.4, class_id=2)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+ label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2]
+
+ y_pred = tf.transpose([pred_values] * 3)
+ y_true = tf.one_hot(label_values, depth=3)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(0.6, self.evaluate(result))
+
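+ # Rough arithmetic for the weighted test below (assuming the usual
+ # `pred > threshold` convention): just below t = 0.26, weighted
+ # tp = 9 + 10 = 19 of 40 positive weight (sensitivity 0.475 >= 0.4),
+ # while fp = 4 + 5 = 9 and tn = 6 of 15 negative weight, so the best
+ # achievable specificity is 6 / 15 = 0.4.
+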
@parameterized.parameters([tf.bool, tf.int32, tf.float32])
+ def test_weighted(self, label_dtype):
+ s_obj = metrics.SpecificityAtSensitivity(0.4)
+ pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+ weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.cast(label_values, dtype=label_dtype)
+ weights = tf.constant(weight_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred, sample_weight=weights)
+ self.assertAlmostEqual(0.4, self.evaluate(result))
+
+ def test_invalid_sensitivity(self):
+ with self.assertRaisesRegex(
+ ValueError, r"`sensitivity` must be in the range \[0, 1\]."
+ ):
+ metrics.SpecificityAtSensitivity(-1)
+
+ def test_invalid_num_thresholds(self):
+ with self.assertRaisesRegex(
+ ValueError, "Argument `num_thresholds` must be an integer > 0"
+ ):
+ metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
+
+
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class PrecisionAtRecallTest(tf.test.TestCase, parameterized.TestCase):
+ def test_config(self):
+ s_obj = metrics.PrecisionAtRecall(
+ 0.4, num_thresholds=100, class_id=12, name="precision_at_recall_1"
+ )
+ self.assertEqual(s_obj.name, "precision_at_recall_1")
+ self.assertLen(s_obj.variables, 4)
+ self.assertEqual(s_obj.recall, 0.4)
+ self.assertEqual(s_obj.num_thresholds, 100)
+ self.assertEqual(s_obj.class_id, 12)
+
+ # Check save and restore config
+ s_obj2 = metrics.PrecisionAtRecall.from_config(s_obj.get_config())
+ self.assertEqual(s_obj2.name, "precision_at_recall_1")
+ self.assertLen(s_obj2.variables, 4)
+ self.assertEqual(s_obj2.recall, 0.4)
+ self.assertEqual(s_obj2.num_thresholds, 100)
+ self.assertEqual(s_obj2.class_id, 12)
+
+ def test_value_is_idempotent(self):
+ s_obj = metrics.PrecisionAtRecall(0.7)
+ y_pred = tf.random.uniform((10, 3), maxval=1, dtype=tf.float32, seed=1)
+ y_true = tf.random.uniform((10, 3), maxval=2, dtype=tf.int64, seed=1)
+ update_op = s_obj.update_state(y_true, y_pred)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+
+ # Run several updates.
+ for _ in range(10):
+ self.evaluate(update_op)
+
+ # Then verify idempotency.
+ initial_precision = self.evaluate(s_obj.result())
+ for _ in range(10):
+ self.assertAlmostEqual(
+ initial_precision, self.evaluate(s_obj.result()), 1e-3
+ )
+
+ def test_unweighted_all_correct(self):
+ s_obj = metrics.PrecisionAtRecall(0.7)
+ inputs = np.random.randint(0, 2, size=(100, 1))
+ y_pred = tf.constant(inputs, dtype=tf.float32)
+ y_true = tf.constant(inputs)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(1, self.evaluate(result))
+
+ def test_unweighted_high_recall(self):
+ s_obj = metrics.PrecisionAtRecall(0.8)
+ pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ # For 0.2 < decision threshold < 0.5.
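+ # At such a threshold, tp = 4 (the positives scored 0.5, 0.6, 0.8,
+ # 0.9) and fp = 2 (the negatives scored 0.5, 0.6), giving recall
+ # 4/5 = 0.8 and precision 4/6 = 2/3.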
+ self.assertAlmostEqual(2.0 / 3, self.evaluate(result))
+
+ def test_unweighted_low_recall(self):
+ s_obj = metrics.PrecisionAtRecall(0.6)
+ pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ # For 0.5 < decision threshold < 0.6.
+ self.assertAlmostEqual(0.75, self.evaluate(result))
+
+ def test_unweighted_class_id(self):
+ s_obj = metrics.PrecisionAtRecall(0.6, class_id=2)
+ pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9]
+ label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2]
+
+ y_pred = tf.transpose([pred_values] * 3)
+ y_true = tf.one_hot(label_values, depth=3)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ # For 0.5 < decision threshold < 0.6.
+ self.assertAlmostEqual(0.75, self.evaluate(result))
+
+ @parameterized.parameters([tf.bool, tf.int32, tf.float32])
+ def test_weighted(self, label_dtype):
+ s_obj = metrics.PrecisionAtRecall(7.0 / 8)
+ pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9]
+ label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+ weight_values = [2, 1, 2, 1, 2, 1, 2, 2, 1, 2]
+
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.cast(label_values, dtype=label_dtype)
+ weights = tf.constant(weight_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred, sample_weight=weights)
+ # For 0.2 < decision threshold < 0.5.
+ self.assertAlmostEqual(0.7, self.evaluate(result))
+
+ def test_invalid_recall(self):
+ with self.assertRaisesRegex(
+ ValueError, r"`recall` must be in the range \[0, 1\]."
+ ):
+ metrics.PrecisionAtRecall(-1)
+
+ def test_invalid_num_thresholds(self):
+ with self.assertRaisesRegex(
+ ValueError, "Argument `num_thresholds` must be an integer > 0"
+ ):
+ metrics.PrecisionAtRecall(0.4, num_thresholds=-1)
+
+
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class RecallAtPrecisionTest(tf.test.TestCase, parameterized.TestCase):
+ def test_config(self):
+ s_obj = metrics.RecallAtPrecision(
+ 0.4, num_thresholds=100, class_id=12, name="recall_at_precision_1"
+ )
+ self.assertEqual(s_obj.name, "recall_at_precision_1")
+ self.assertLen(s_obj.variables, 4)
+ self.assertEqual(s_obj.precision, 0.4)
+ self.assertEqual(s_obj.num_thresholds, 100)
+ self.assertEqual(s_obj.class_id, 12)
+
+ # Check save and restore config
+ s_obj2 = metrics.RecallAtPrecision.from_config(s_obj.get_config())
+ self.assertEqual(s_obj2.name, "recall_at_precision_1")
+ self.assertLen(s_obj2.variables, 4)
+ self.assertEqual(s_obj2.precision, 0.4)
+ self.assertEqual(s_obj2.num_thresholds, 100)
+ self.assertEqual(s_obj2.class_id, 12)
+
+ def test_value_is_idempotent(self):
+ s_obj = metrics.RecallAtPrecision(0.7)
+ y_pred = tf.random.uniform((10, 3), maxval=1, dtype=tf.float32, seed=1)
+ y_true = tf.random.uniform((10, 3), maxval=2, dtype=tf.int64, seed=1)
+ update_op = s_obj.update_state(y_true, y_pred)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+
+ # Run several updates.
+ for _ in range(10):
+ self.evaluate(update_op)
+
+ # Then verify idempotency.
+ initial_recall = self.evaluate(s_obj.result())
+ for _ in range(10):
+ self.assertAlmostEqual(
+ initial_recall, self.evaluate(s_obj.result()), 1e-3
+ )
+
+ def test_unweighted_all_correct(self):
+ s_obj = metrics.RecallAtPrecision(0.7)
+ inputs = np.random.randint(0, 2, size=(100, 1))
+ y_pred = tf.constant(inputs, dtype=tf.float32)
+ y_true = tf.constant(inputs)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ self.assertAlmostEqual(1, self.evaluate(result))
+
+ def test_unweighted_high_precision(self):
+ s_obj = metrics.RecallAtPrecision(0.75)
+ pred_values = [
+ 0.05,
+ 0.1,
+ 0.2,
+ 0.3,
+ 0.3,
+ 0.35,
+ 0.4,
+ 0.45,
+ 0.5,
+ 0.6,
+ 0.9,
+ 0.95,
+ ]
+ label_values = [0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1]
+ # precisions: [1/2, 6/11, 1/2, 5/9, 5/8, 5/7, 2/3, 3/5, 3/4, 2/3, 1/2,
+ # 1].
+ # recalls: [1, 1, 5/6, 5/6, 5/6, 5/6, 2/3, 1/2, 1/2, 1/3, 1/6,
+ # 1/6].
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ # The precision 0.75 can be reached at thresholds 0.45<=t<0.5.
+ self.assertAlmostEqual(0.5, self.evaluate(result))
+
+ def test_unweighted_low_precision(self):
+ s_obj = metrics.RecallAtPrecision(2.0 / 3)
+ pred_values = [
+ 0.05,
+ 0.1,
+ 0.2,
+ 0.3,
+ 0.3,
+ 0.35,
+ 0.4,
+ 0.45,
+ 0.5,
+ 0.6,
+ 0.9,
+ 0.95,
+ ]
+ label_values = [0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1]
+ # precisions: [1/2, 6/11, 1/2, 5/9, 5/8, 5/7, 2/3, 3/5, 3/4, 2/3, 1/2,
+ # 1].
+ # recalls: [1, 1, 5/6, 5/6, 5/6, 5/6, 2/3, 1/2, 1/2, 1/3, 1/6,
+ # 1/6].
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ # The precision 5/7 can be reached at thresholds 0.3<=t<0.35.
+ self.assertAlmostEqual(5.0 / 6, self.evaluate(result))
+
+ def test_unweighted_class_id(self):
+ s_obj = metrics.RecallAtPrecision(2.0 / 3, class_id=2)
+ pred_values = [
+ 0.05,
+ 0.1,
+ 0.2,
+ 0.3,
+ 0.3,
+ 0.35,
+ 0.4,
+ 0.45,
+ 0.5,
+ 0.6,
+ 0.9,
+ 0.95,
+ ]
+ label_values = [0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2]
+ # precisions: [1/2, 6/11, 1/2, 5/9, 5/8, 5/7, 2/3, 3/5, 3/4, 2/3, 1/2,
+ # 1].
+ # recalls: [1, 1, 5/6, 5/6, 5/6, 5/6, 2/3, 1/2, 1/2, 1/3, 1/6,
+ # 1/6].
+ y_pred = tf.transpose([pred_values] * 3)
+ y_true = tf.one_hot(label_values, depth=3)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ # The precision 5/7 can be reached at thresholds 0.3<=t<0.35.
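+ # At that point only the positive example scored 0.1 is missed:
+ # tp = 5 of 6 positives, so the reported recall is 5/6.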
+ self.assertAlmostEqual(5.0 / 6, self.evaluate(result))
+
+ @parameterized.parameters([tf.bool, tf.int32, tf.float32])
+ def test_weighted(self, label_dtype):
+ s_obj = metrics.RecallAtPrecision(0.75)
+ pred_values = [0.1, 0.2, 0.3, 0.5, 0.6, 0.9, 0.9]
+ label_values = [0, 1, 0, 0, 0, 1, 1]
+ weight_values = [1, 2, 1, 2, 1, 2, 1]
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.cast(label_values, dtype=label_dtype)
+ weights = tf.constant(weight_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred, sample_weight=weights)
+ self.assertAlmostEqual(0.6, self.evaluate(result))
+
+ def test_unachievable_precision(self):
+ s_obj = metrics.RecallAtPrecision(2.0 / 3)
+ pred_values = [0.1, 0.2, 0.3, 0.9]
+ label_values = [1, 1, 0, 0]
+ y_pred = tf.constant(pred_values, dtype=tf.float32)
+ y_true = tf.constant(label_values)
+ self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables))
+ result = s_obj(y_true, y_pred)
+ # The highest possible precision is 1/2 which is below the required
+ # value, expect 0 recall.
+ self.assertAlmostEqual(0, self.evaluate(result))
+
+ def test_invalid_precision(self):
+ with self.assertRaisesRegex(
+ ValueError, r"`precision` must be in the range \[0, 1\]."
+ ):
+ metrics.RecallAtPrecision(-1)
+
+ def test_invalid_num_thresholds(self):
+ with self.assertRaisesRegex(
+ ValueError, "Argument `num_thresholds` must be an integer > 0"
+ ):
+ metrics.RecallAtPrecision(0.4, num_thresholds=-1)
+
+
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class AUCTest(tf.test.TestCase, parameterized.TestCase):
+ def setup(self):
+ self.num_thresholds = 3
+ self.y_pred = tf.constant([0, 0.5, 0.3, 0.9], dtype=tf.float32)
+ self.y_pred_multi_label = tf.constant(
+ [[0.0, 0.4], [0.5, 0.7], [0.3, 0.2], [0.9, 0.3]], dtype=tf.float32
+ )
+ epsilon = 1e-12
+ self.y_pred_logits = -tf.math.log(1.0 / (self.y_pred + epsilon) - 1.0)
+ self.y_true = tf.constant([0, 0, 1, 1])
+ self.y_true_multi_label = tf.constant([[0, 0], [1, 1], [1, 1], [1, 0]])
+ self.sample_weight = [1, 2, 3, 4]
+
+ # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7]
+ # y_pred when threshold = 0 - 1e-7 : [1, 1, 1, 1]
+ # y_pred when threshold = 0.5 : [0, 0, 0, 1]
+ # y_pred when threshold = 1 + 1e-7 : [0, 0, 0, 0]
+
+ # without sample_weight:
+ # tp = np.sum([[0, 0, 1, 1], [0, 0, 0, 1], [0, 0, 0, 0]], axis=1)
+ # fp = np.sum([[1, 1, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1)
+ # fn = np.sum([[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 1]], axis=1)
+ # tn = np.sum([[0, 0, 0, 0], [1, 1, 0, 0], [1, 1, 0, 0]], axis=1)
+
+ # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
+
+ # with sample_weight:
+ # tp = np.sum([[0, 0, 3, 4], [0, 0, 0, 4], [0, 0, 0, 0]], axis=1)
+ # fp = np.sum([[1, 2, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1)
+ # fn = np.sum([[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 3, 4]], axis=1)
+ # tn = np.sum([[0, 0, 0, 0], [1, 2, 0, 0], [1, 2, 0, 0]], axis=1)
+
+ # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+
+ def test_config(self):
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=100,
+ curve="PR",
+ summation_method="majoring",
+ name="auc_1",
+ dtype=tf.float64,
+ multi_label=True,
+ num_labels=2,
+ from_logits=True,
+ )
+ auc_obj.update_state(self.y_true_multi_label, self.y_pred_multi_label)
+ self.assertEqual(auc_obj.name, "auc_1")
+ self.assertEqual(auc_obj._dtype, tf.float64)
+ self.assertLen(auc_obj.variables, 4)
+ self.assertEqual(auc_obj.num_thresholds, 100)
+
self.assertEqual(auc_obj.curve, metrics_utils.AUCCurve.PR) + self.assertEqual( + auc_obj.summation_method, metrics_utils.AUCSummationMethod.MAJORING + ) + self.assertTrue(auc_obj.multi_label) + self.assertEqual(auc_obj.num_labels, 2) + self.assertTrue(auc_obj._from_logits) + old_config = auc_obj.get_config() + self.assertNotIn("thresholds", old_config) + self.assertDictEqual(old_config, json.loads(json.dumps(old_config))) + + # Check save and restore config. + auc_obj2 = metrics.AUC.from_config(auc_obj.get_config()) + auc_obj2.update_state(self.y_true_multi_label, self.y_pred_multi_label) + self.assertEqual(auc_obj2.name, "auc_1") + self.assertLen(auc_obj2.variables, 4) + self.assertEqual(auc_obj2.num_thresholds, 100) + self.assertEqual(auc_obj2.curve, metrics_utils.AUCCurve.PR) + self.assertEqual( + auc_obj2.summation_method, metrics_utils.AUCSummationMethod.MAJORING + ) + self.assertTrue(auc_obj2.multi_label) + self.assertEqual(auc_obj2.num_labels, 2) + self.assertTrue(auc_obj2._from_logits) + new_config = auc_obj2.get_config() + self.assertNotIn("thresholds", new_config) + self.assertDictEqual(old_config, new_config) + self.assertAllClose(auc_obj.thresholds, auc_obj2.thresholds) + + def test_config_manual_thresholds(self): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=None, + curve="PR", + summation_method="majoring", + name="auc_1", + thresholds=[0.3, 0.5], + ) + auc_obj.update_state(self.y_true, self.y_pred) + self.assertEqual(auc_obj.name, "auc_1") + self.assertLen(auc_obj.variables, 4) + self.assertEqual(auc_obj.num_thresholds, 4) + self.assertAllClose(auc_obj.thresholds, [0.0, 0.3, 0.5, 1.0]) + self.assertEqual(auc_obj.curve, metrics_utils.AUCCurve.PR) + self.assertEqual( + auc_obj.summation_method, metrics_utils.AUCSummationMethod.MAJORING + ) + old_config = auc_obj.get_config() + self.assertDictEqual(old_config, json.loads(json.dumps(old_config))) + + # Check save and restore config. + auc_obj2 = metrics.AUC.from_config(auc_obj.get_config()) + auc_obj2.update_state(self.y_true, self.y_pred) + self.assertEqual(auc_obj2.name, "auc_1") + self.assertLen(auc_obj2.variables, 4) + self.assertEqual(auc_obj2.num_thresholds, 4) + self.assertEqual(auc_obj2.curve, metrics_utils.AUCCurve.PR) + self.assertEqual( + auc_obj2.summation_method, metrics_utils.AUCSummationMethod.MAJORING + ) + new_config = auc_obj2.get_config() + self.assertDictEqual(old_config, new_config) + self.assertAllClose(auc_obj.thresholds, auc_obj2.thresholds) + + def test_value_is_idempotent(self): + self.setup() + auc_obj = metrics.AUC(num_thresholds=3) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + + # Run several updates. + update_op = auc_obj.update_state(self.y_true, self.y_pred) + for _ in range(10): + self.evaluate(update_op) + + # Then verify idempotency. 
+ initial_auc = self.evaluate(auc_obj.result()) + for _ in range(10): + self.assertAllClose( + initial_auc, self.evaluate(auc_obj.result()), 1e-3 + ) + + def test_unweighted_all_correct(self): + self.setup() + auc_obj = metrics.AUC() + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true, self.y_true) + self.assertEqual(self.evaluate(result), 1) + + def test_unweighted(self): + self.setup() + auc_obj = metrics.AUC(num_thresholds=self.num_thresholds) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true, self.y_pred) + + # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] + # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0] + # fp_rate = [2/2, 0, 0] = [1, 0, 0] + # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25] + # widths = [(1 - 0), (0 - 0)] = [1, 0] + expected_result = 0.75 * 1 + 0.25 * 0 + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_unweighted_from_logits(self): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, from_logits=True + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true, self.y_pred_logits) + + # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] + # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0] + # fp_rate = [2/2, 0, 0] = [1, 0, 0] + # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25] + # widths = [(1 - 0), (0 - 0)] = [1, 0] + expected_result = 0.75 * 1 + 0.25 * 0 + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_manual_thresholds(self): + self.setup() + # Verify that when specified, thresholds are used instead of + # num_thresholds. + auc_obj = metrics.AUC(num_thresholds=2, thresholds=[0.5]) + self.assertEqual(auc_obj.num_thresholds, 3) + self.assertAllClose(auc_obj.thresholds, [0.0, 0.5, 1.0]) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true, self.y_pred) + + # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] + # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0] + # fp_rate = [2/2, 0, 0] = [1, 0, 0] + # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25] + # widths = [(1 - 0), (0 - 0)] = [1, 0] + expected_result = 0.75 * 1 + 0.25 * 0 + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_weighted_roc_interpolation(self): + self.setup() + auc_obj = metrics.AUC(num_thresholds=self.num_thresholds) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj( + self.y_true, self.y_pred, sample_weight=self.sample_weight + ) + + # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] + # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0] + # fp_rate = [3/3, 0, 0] = [1, 0, 0] + # heights = [(1 + 0.571)/2, (0.571 + 0)/2] = [0.7855, 0.2855] + # widths = [(1 - 0), (0 - 0)] = [1, 0] + expected_result = 0.7855 * 1 + 0.2855 * 0 + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_weighted_roc_majoring(self): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, summation_method="majoring" + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj( + self.y_true, self.y_pred, sample_weight=self.sample_weight + ) + + # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3] + # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0] + # fp_rate = [3/3, 0, 0] = [1, 0, 0] + # heights = [max(1, 0.571), max(0.571, 0)] = [1, 0.571] + # widths = 
[(1 - 0), (0 - 0)] = [1, 0]
+ expected_result = 1 * 1 + 0.571 * 0
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_weighted_roc_minoring(self):
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds, summation_method="minoring"
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(
+ self.y_true, self.y_pred, sample_weight=self.sample_weight
+ )
+
+ # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+ # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+ # fp_rate = [3/3, 0, 0] = [1, 0, 0]
+ # heights = [min(1, 0.571), min(0.571, 0)] = [0.571, 0]
+ # widths = [(1 - 0), (0 - 0)] = [1, 0]
+ expected_result = 0.571 * 1 + 0 * 0
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_weighted_pr_majoring(self):
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds,
+ curve="PR",
+ summation_method="majoring",
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(
+ self.y_true, self.y_pred, sample_weight=self.sample_weight
+ )
+
+ # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+ # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0]
+ # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+ # heights = [max(0.7, 1), max(1, 0)] = [1, 1]
+ # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571]
+ expected_result = 1 * 0.429 + 1 * 0.571
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_weighted_pr_minoring(self):
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds,
+ curve="PR",
+ summation_method="minoring",
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(
+ self.y_true, self.y_pred, sample_weight=self.sample_weight
+ )
+
+ # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+ # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0]
+ # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+ # heights = [min(0.7, 1), min(1, 0)] = [0.7, 0]
+ # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571]
+ expected_result = 0.7 * 0.429 + 0 * 0.571
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_weighted_pr_interpolation(self):
+ self.setup()
+ auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, curve="PR")
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(
+ self.y_true, self.y_pred, sample_weight=self.sample_weight
+ )
+
+ # auc = (slope / Total Pos) * [dTP - intercept * log(Pb/Pa)]
+
+ # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+ # P = tp + fp = [10, 4, 0]
+ # dTP = [7-4, 4-0] = [3, 4]
+ # dP = [10-4, 4-0] = [6, 4]
+ # slope = dTP/dP = [0.5, 1]
+ # intercept = TPb - (slope * Pb) = [(4 - 0.5*4), (0 - 1*0)] = [2, 0]
+ # (Pb/Pa) = (Pb/Pa) if Pb > 0 AND Pa > 0 else 1 = [10/4, 4/0] = [2.5, 1]
+ # auc * TotalPos = [(0.5 * (3 + 2 * log(2.5))), (1 * (4 + 0))]
+ # = [2.416, 4]
+ # auc = [2.416, 4]/(tp[1:]+fn[1:])
+ expected_result = 2.416 / 7 + 4 / 7
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_invalid_num_thresholds(self):
+ with self.assertRaisesRegex(
+ ValueError, "Argument `num_thresholds` must be an integer > 1"
+ ):
+ metrics.AUC(num_thresholds=-1)
+
+ with self.assertRaisesRegex(
+ ValueError, "Argument `num_thresholds` must be an integer > 1."
+ ):
+ metrics.AUC(num_thresholds=1)
+
+ def test_invalid_curve(self):
+ with self.assertRaisesRegex(
+ ValueError, 'Invalid AUC curve value: "Invalid".'
+ ): + metrics.AUC(curve="Invalid") + + def test_invalid_summation_method(self): + with self.assertRaisesRegex( + ValueError, 'Invalid AUC summation method value: "Invalid".' + ): + metrics.AUC(summation_method="Invalid") + + def test_extra_dims(self): + try: + from scipy import special + + self.setup() + logits = special.expit( + -np.array( + [ + [[-10.0, 10.0, -10.0], [10.0, -10.0, 10.0]], + [[-12.0, 12.0, -12.0], [12.0, -12.0, 12.0]], + ], + dtype=np.float32, + ) + ) + labels = np.array( + [[[1, 0, 0], [1, 0, 0]], [[0, 1, 1], [0, 1, 1]]], dtype=np.int64 + ) + auc_obj = metrics.AUC() + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(labels, logits) + self.assertEqual(self.evaluate(result), 0.5) + except ImportError as e: + tf_logging.warning(f"Cannot test special functions: {str(e)}") + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MultiAUCTest(tf.test.TestCase, parameterized.TestCase): + def setup(self): + self.num_thresholds = 5 + self.y_pred = tf.constant( + np.array([[0, 0.5, 0.3, 0.9], [0.1, 0.2, 0.3, 0.4]]).T, + dtype=tf.float32, + ) + + epsilon = 1e-12 + self.y_pred_logits = -tf.math.log(1.0 / (self.y_pred + epsilon) - 1.0) + + self.y_true_good = tf.constant(np.array([[0, 0, 1, 1], [0, 0, 1, 1]]).T) + self.y_true_bad = tf.constant(np.array([[0, 0, 1, 1], [1, 1, 0, 0]]).T) + self.sample_weight = [1, 2, 3, 4] + + # threshold values are [0 - 1e-7, 0.25, 0.5, 0.75, 1 + 1e-7] + # y_pred when threshold = 0 - 1e-7 : [[1, 1, 1, 1], [1, 1, 1, 1]] + # y_pred when threshold = 0.25 : [[0, 1, 1, 1], [0, 0, 1, 1]] + # y_pred when threshold = 0.5 : [[0, 0, 0, 1], [0, 0, 0, 0]] + # y_pred when threshold = 0.75 : [[0, 0, 0, 1], [0, 0, 0, 0]] + # y_pred when threshold = 1 + 1e-7 : [[0, 0, 0, 0], [0, 0, 0, 0]] + + # for y_true_good, over thresholds: + # tp = [[2, 2, 1, 1, 0], [2, 2, 0, 0, 0]] + # fp = [[2, 1, 0, 0 , 0], [2, 0, 0 ,0, 0]] + # fn = [[0, 0, 1, 1, 2], [0, 0, 2, 2, 2]] + # tn = [[0, 1, 2, 2, 2], [0, 2, 2, 2, 2]] + + # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] + # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] + + # for y_true_bad: + # tp = [[2, 2, 1, 1, 0], [2, 0, 0, 0, 0]] + # fp = [[2, 1, 0, 0 , 0], [2, 2, 0 ,0, 0]] + # fn = [[0, 0, 1, 1, 2], [0, 2, 2, 2, 2]] + # tn = [[0, 1, 2, 2, 2], [0, 0, 2, 2, 2]] + + # tpr = [[1, 1, 0.5, 0.5, 0], [1, 0, 0, 0, 0]] + # fpr = [[1, 0.5, 0, 0, 0], [1, 1, 0, 0, 0]] + + # for y_true_good with sample_weights: + + # tp = [[7, 7, 4, 4, 0], [7, 7, 0, 0, 0]] + # fp = [[3, 2, 0, 0, 0], [3, 0, 0, 0, 0]] + # fn = [[0, 0, 3, 3, 7], [0, 0, 7, 7, 7]] + # tn = [[0, 1, 3, 3, 3], [0, 3, 3, 3, 3]] + + # tpr = [[1, 1, 0.57, 0.57, 0], [1, 1, 0, 0, 0]] + # fpr = [[1, 0.67, 0, 0, 0], [1, 0, 0, 0, 0]] + + def test_value_is_idempotent(self): + with self.test_session(): + self.setup() + auc_obj = metrics.AUC(num_thresholds=5, multi_label=True) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + + # Run several updates. + update_op = auc_obj.update_state(self.y_true_good, self.y_pred) + for _ in range(10): + self.evaluate(update_op) + + # Then verify idempotency. 
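+ # (With multi_label=True the metric keeps a confusion matrix per
+ # label and averages the per-label AUCs, so the repeated reads
+ # below must also agree.)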
+ initial_auc = self.evaluate(auc_obj.result()) + for _ in range(10): + self.assertAllClose( + initial_auc, self.evaluate(auc_obj.result()), 1e-3 + ) + + def test_unweighted_all_correct(self): + with self.test_session(): + self.setup() + auc_obj = metrics.AUC(multi_label=True) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true_good, self.y_true_good) + self.assertEqual(self.evaluate(result), 1) + + def test_unweighted_all_correct_flat(self): + self.setup() + auc_obj = metrics.AUC(multi_label=False) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true_good, self.y_true_good) + self.assertEqual(self.evaluate(result), 1) + + def test_unweighted(self): + with self.test_session(): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, multi_label=True + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true_good, self.y_pred) + + # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] + # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] + expected_result = (0.875 + 1.0) / 2.0 + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_unweighted_from_logits(self): + with self.test_session(): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, + multi_label=True, + from_logits=True, + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true_good, self.y_pred_logits) + + # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] + # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] + expected_result = (0.875 + 1.0) / 2.0 + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_sample_weight_flat(self): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, multi_label=False + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj( + self.y_true_good, self.y_pred, sample_weight=[1, 2, 3, 4] + ) + + # tpr = [1, 1, 0.2857, 0.2857, 0] + # fpr = [1, 0.3333, 0, 0, 0] + expected_result = 1.0 - (0.3333 * (1.0 - 0.2857) / 2.0) + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_full_sample_weight_flat(self): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, multi_label=False + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + sw = np.arange(4 * 2) + sw = sw.reshape(4, 2) + result = auc_obj(self.y_true_good, self.y_pred, sample_weight=sw) + + # tpr = [1, 1, 0.2727, 0.2727, 0] + # fpr = [1, 0.3333, 0, 0, 0] + expected_result = 1.0 - (0.3333 * (1.0 - 0.2727) / 2.0) + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_label_weights(self): + with self.test_session(): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, + multi_label=True, + label_weights=[0.75, 0.25], + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(self.y_true_good, self.y_pred) + + # tpr = [[1, 1, 0.5, 0.5, 0], [1, 1, 0, 0, 0]] + # fpr = [[1, 0.5, 0, 0, 0], [1, 0, 0, 0, 0]] + expected_result = (0.875 * 0.75 + 1.0 * 0.25) / (0.75 + 0.25) + self.assertAllClose(self.evaluate(result), expected_result, 1e-3) + + def test_label_weights_flat(self): + self.setup() + auc_obj = metrics.AUC( + num_thresholds=self.num_thresholds, + multi_label=False, + label_weights=[0.75, 0.25], + ) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = 
auc_obj(self.y_true_good, self.y_pred)
+
+ # tpr = [1, 1, 0.375, 0.375, 0]
+ # fpr = [1, 0.375, 0, 0, 0]
+ expected_result = 1.0 - ((1.0 - 0.375) * 0.375 / 2.0)
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-2)
+
+ def test_unweighted_flat(self):
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds, multi_label=False
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(self.y_true_good, self.y_pred)
+
+ # tp = [4, 4, 1, 1, 0]
+ # fp = [4, 1, 0, 0, 0]
+ # fn = [0, 0, 3, 3, 4]
+ # tn = [0, 3, 4, 4, 4]
+
+ # tpr = [1, 1, 0.25, 0.25, 0]
+ # fpr = [1, 0.25, 0, 0, 0]
+ expected_result = 1.0 - (3.0 / 32.0)
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_unweighted_flat_from_logits(self):
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds,
+ multi_label=False,
+ from_logits=True,
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(self.y_true_good, self.y_pred_logits)
+
+ # tp = [4, 4, 1, 1, 0]
+ # fp = [4, 1, 0, 0, 0]
+ # fn = [0, 0, 3, 3, 4]
+ # tn = [0, 3, 4, 4, 4]
+
+ # tpr = [1, 1, 0.25, 0.25, 0]
+ # fpr = [1, 0.25, 0, 0, 0]
+ expected_result = 1.0 - (3.0 / 32.0)
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_manual_thresholds(self):
+ with self.test_session():
+ self.setup()
+ # Verify that when specified, thresholds are used instead of
+ # num_thresholds.
+ auc_obj = metrics.AUC(
+ num_thresholds=2, thresholds=[0.5], multi_label=True
+ )
+ self.assertEqual(auc_obj.num_thresholds, 3)
+ self.assertAllClose(auc_obj.thresholds, [0.0, 0.5, 1.0])
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(self.y_true_good, self.y_pred)
+
+ # tp = [[2, 1, 0], [2, 0, 0]]
+ # fp = [[2, 0, 0], [2, 0, 0]]
+ # fn = [[0, 1, 2], [0, 2, 2]]
+ # tn = [[0, 2, 2], [0, 2, 2]]
+
+ # tpr = [[1, 0.5, 0], [1, 0, 0]]
+ # fpr = [[1, 0, 0], [1, 0, 0]]
+
+ # auc by slice = [0.75, 0.5]
+ expected_result = (0.75 + 0.5) / 2.0
+
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+ def test_weighted_roc_interpolation(self):
+ with self.test_session():
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds, multi_label=True
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ result = auc_obj(
+ self.y_true_good, self.y_pred, sample_weight=self.sample_weight
+ )
+
+ # tpr = [[1, 1, 0.57, 0.57, 0], [1, 1, 0, 0, 0]]
+ # fpr = [[1, 0.67, 0, 0, 0], [1, 0, 0, 0, 0]]
+ expected_result = 1.0 - 0.5 * 0.43 * 0.67
+ self.assertAllClose(self.evaluate(result), expected_result, 1e-1)
+
+ def test_pr_interpolation_unweighted(self):
+ with self.test_session():
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds, curve="PR", multi_label=True
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ good_result = auc_obj(self.y_true_good, self.y_pred)
+ with self.subTest(name="good"):
+ # PR AUCs are 0.917 and 1.0 respectively
+ self.assertAllClose(
+ self.evaluate(good_result), (0.91667 + 1.0) / 2.0, 1e-1
+ )
+ bad_result = auc_obj(self.y_true_bad, self.y_pred)
+ with self.subTest(name="bad"):
+ # PR AUCs are 0.917 and 0.5 respectively
+ self.assertAllClose(
+ self.evaluate(bad_result), (0.91667 + 0.5) / 2.0, 1e-1
+ )
+
+ def test_pr_interpolation(self):
+ with self.test_session():
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds, curve="PR", multi_label=True
+
)
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ good_result = auc_obj(
+ self.y_true_good, self.y_pred, sample_weight=self.sample_weight
+ )
+ # PR AUCs are 0.939 and 1.0 respectively
+ self.assertAllClose(
+ self.evaluate(good_result), (0.939 + 1.0) / 2.0, 1e-1
+ )
+
+ def test_keras_model_compiles(self):
+ inputs = layers.Input(shape=(10,))
+ output = layers.Dense(3, activation="sigmoid")(inputs)
+ model = models.Model(inputs=inputs, outputs=output)
+ model.compile(
+ loss="binary_crossentropy", metrics=[metrics.AUC(multi_label=True)]
+ )
+
+ def test_reset_state(self):
+ with self.test_session():
+ self.setup()
+ auc_obj = metrics.AUC(
+ num_thresholds=self.num_thresholds, multi_label=True
+ )
+ self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables))
+ auc_obj(self.y_true_good, self.y_pred)
+ auc_obj.reset_state()
+ self.assertAllEqual(auc_obj.true_positives, np.zeros((5, 2)))
+
+
+@test_combinations.generate(test_combinations.combine(mode=["eager"]))
+class ThresholdsTest(tf.test.TestCase, parameterized.TestCase):
+ @parameterized.parameters(
+ [
+ metrics.TruePositives(),
+ metrics.TrueNegatives(),
+ metrics.FalsePositives(),
+ metrics.FalseNegatives(),
+ metrics.Precision(),
+ metrics.Recall(),
+ metrics.SensitivityAtSpecificity(0.5),
+ metrics.SpecificityAtSensitivity(0.5),
+ metrics.PrecisionAtRecall(0.5),
+ metrics.RecallAtPrecision(0.5),
+ metrics.AUC(),
+ ]
+ )
+ def test_with_default_thresholds(self, metric_obj):
+ # By default, the thresholds will be evenly distributed if there are
+ # more than one. If there is only one threshold, we expect
+ # _thresholds_distributed_evenly to be false.
+ expected = len(metric_obj.thresholds) > 1
+ self.assertEqual(metric_obj._thresholds_distributed_evenly, expected)
+
+ @parameterized.parameters(
+ [
+ metrics.TruePositives,
+ metrics.TrueNegatives,
+ metrics.FalsePositives,
+ metrics.FalseNegatives,
+ metrics.Precision,
+ metrics.Recall,
+ ]
+ )
+ def test_with_manual_thresholds(self, metric_cls):
+ even_thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
+ metric_obj = metric_cls(thresholds=even_thresholds)
+ self.assertTrue(metric_obj._thresholds_distributed_evenly)
+
+ uneven_thresholds = [0.0, 0.45, 1.0]
+ metric_obj = metric_cls(thresholds=uneven_thresholds)
+ self.assertFalse(metric_obj._thresholds_distributed_evenly)
+
+ def test_manual_thresholds_auc(self):
+ # The AUC metric handles manual thresholds input differently (it will
+ # add 0.0 and 1.0 for the user).
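+ # With the endpoints added, [0.25, 0.5, 0.75] becomes
+ # [0.0, 0.25, 0.5, 0.75, 1.0], which is evenly spaced, while [0.45]
+ # becomes [0.0, 0.45, 1.0], which is not.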
+ even_thresholds = [0.25, 0.5, 0.75]
+ auc = metrics.AUC(thresholds=even_thresholds)
+ self.assertTrue(auc._thresholds_distributed_evenly)
+
+ # Test for save model
+ cloned = metrics.AUC.from_config(auc.get_config())
+ self.assertTrue(cloned._thresholds_distributed_evenly)
+
+ uneven_thresholds = [
+ 0.45,
+ ]
+ auc = metrics.AUC(thresholds=uneven_thresholds)
+ self.assertFalse(auc._thresholds_distributed_evenly)
+
+ cloned = metrics.AUC.from_config(auc.get_config())
+ self.assertFalse(cloned._thresholds_distributed_evenly)
+
+ @parameterized.parameters(
+ [
+ metrics.TruePositives,
+ metrics.TrueNegatives,
+ metrics.FalsePositives,
+ metrics.FalseNegatives,
+ metrics.Precision,
+ metrics.Recall,
+ metrics.AUC,
+ ]
+ )
+ def test_even_thresholds_correctness(self, metric_cls):
+ with tf.compat.forward_compatibility_horizon(2021, 6, 9):
+ # Make sure the old approach and the new approach produce the
+ # same result for evenly distributed thresholds.
+ y_true = np.random.randint(2, size=(10,))
+ y_pred = np.random.rand(10)
+
+ even_thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
+ if metric_cls == metrics.AUC:
+ even_thresholds = even_thresholds[1:-1]
+ metric_obj = metric_cls(thresholds=even_thresholds)
+ metric_obj.update_state(y_true, y_pred)
+ result1 = metric_obj.result()
+
+ metric_obj2 = metric_cls(thresholds=even_thresholds)
+ # Force use of the old approach.
+ metric_obj2._thresholds_distributed_evenly = False
+ metric_obj2.update_state(y_true, y_pred)
+ result2 = metric_obj2.result()
+
+ self.assertAllClose(result1, result2)
+ # Check that all the variables are the same, e.g. tp, tn, fp, fn.
+ for v1, v2 in zip(metric_obj.variables, metric_obj2.variables):
+ self.assertAllClose(v1, v2)
+
+ @parameterized.parameters(
+ [
+ metrics.SensitivityAtSpecificity,
+ metrics.SpecificityAtSensitivity,
+ metrics.PrecisionAtRecall,
+ metrics.RecallAtPrecision,
+ ]
+ )
+ def test_even_thresholds_correctness_2(self, metric_cls):
+ with tf.compat.forward_compatibility_horizon(2021, 6, 9):
+ y_true = np.random.randint(2, size=(10,))
+ y_pred = np.random.rand(10)
+
+ metric_obj = metric_cls(0.5)
+ metric_obj.update_state(y_true, y_pred)
+ result1 = metric_obj.result()
+
+ metric_obj2 = metric_cls(0.5)
+ # Force use of the old approach.
+ metric_obj2._thresholds_distributed_evenly = False
+ metric_obj2.update_state(y_true, y_pred)
+ result2 = metric_obj2.result()
+
+ self.assertAllClose(result1, result2)
+ # Check that all the variables are the same, e.g. tp, tn, fp, fn.
+ for v1, v2 in zip(metric_obj.variables, metric_obj2.variables):
+ self.assertAllClose(v1, v2)
+
+
+class BinaryTruePositives(metrics.Metric):
+ def __init__(self, name="binary_true_positives", **kwargs):
+ super().__init__(name=name, **kwargs)
+ self.true_positives = self.add_weight(name="tp", initializer="zeros")
+
+ def update_state(self, y_true, y_pred, sample_weight=None):
+ y_true = tf.cast(y_true, tf.bool)
+ y_pred = tf.cast(y_pred, tf.bool)
+
+ values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
+ values = tf.cast(values, self.dtype)
+ if sample_weight is not None:
+ sample_weight = tf.cast(sample_weight, dtype=self.dtype)
+ sample_weight = tf.__internal__.ops.broadcast_weights(
+ sample_weight, values
+ )
+ values = tf.multiply(values, sample_weight)
+ self.true_positives.assign_add(tf.reduce_sum(values))
+
+ def result(self):
+ return self.true_positives
+
+
+class BinaryTruePositivesViaControlFlow(metrics.Metric):
+ def __init__(self, name="binary_true_positives", **kwargs):
+ super().__init__(name=name, **kwargs)
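+ # Counts the same quantity as BinaryTruePositives above, but via
+ # Python loops and conditionals in update_state, to exercise
+ # control-flow-heavy metric updates.
+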
self.true_positives = self.add_weight(name="tp", initializer="zeros")
+
+ def update_state(self, y_true, y_pred, sample_weight=None):
+ y_true = tf.cast(y_true, tf.bool)
+ y_pred = tf.cast(y_pred, tf.bool)
+
+ for i in range(len(y_true)):
+ for j in range(len(y_true[i])):
+ if y_true[i][j] and y_pred[i][j]:
+ if sample_weight is None:
+ self.true_positives.assign_add(1)
+ else:
+ self.true_positives.assign_add(sample_weight[i][0])
+
+ def result(self):
+ if tf.constant(True):
+ return self.true_positives
+ return 0.0
+
+
+def _get_model(compile_metrics):
+ model_layers = [
+ layers.Dense(3, activation="relu", kernel_initializer="ones"),
+ layers.Dense(1, activation="sigmoid", kernel_initializer="ones"),
+ ]
+
+ model = test_utils.get_model_from_layers(model_layers, input_shape=(4,))
+ model.compile(
+ loss="mae",
+ metrics=compile_metrics,
+ optimizer="rmsprop",
+ run_eagerly=test_utils.should_run_eagerly(),
+ )
+ return model
+
+
+@test_combinations.run_with_all_model_types
+@test_combinations.run_all_keras_modes
+class ResetStatesTest(test_combinations.TestCase):
+ def test_reset_state_false_positives(self):
+ fp_obj = metrics.FalsePositives()
+ model = _get_model([fp_obj])
+ x = np.ones((100, 4))
+ y = np.zeros((100, 1))
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(fp_obj.accumulator), 100.0)
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(fp_obj.accumulator), 100.0)
+
+ def test_reset_state_false_negatives(self):
+ fn_obj = metrics.FalseNegatives()
+ model = _get_model([fn_obj])
+ x = np.zeros((100, 4))
+ y = np.ones((100, 1))
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(fn_obj.accumulator), 100.0)
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(fn_obj.accumulator), 100.0)
+
+ def test_reset_state_true_negatives(self):
+ tn_obj = metrics.TrueNegatives()
+ model = _get_model([tn_obj])
+ x = np.zeros((100, 4))
+ y = np.zeros((100, 1))
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(tn_obj.accumulator), 100.0)
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(tn_obj.accumulator), 100.0)
+
+ def test_reset_state_true_positives(self):
+ tp_obj = metrics.TruePositives()
+ model = _get_model([tp_obj])
+ x = np.ones((100, 4))
+ y = np.ones((100, 1))
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(tp_obj.accumulator), 100.0)
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(tp_obj.accumulator), 100.0)
+
+ def test_reset_state_precision(self):
+ p_obj = metrics.Precision()
+ model = _get_model([p_obj])
+ x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
+ y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(p_obj.true_positives), 50.0)
+ self.assertEqual(self.evaluate(p_obj.false_positives), 50.0)
+ model.evaluate(x, y)
+ self.assertEqual(self.evaluate(p_obj.true_positives), 50.0)
+ self.assertEqual(self.evaluate(p_obj.false_positives), 50.0)
+
+ def test_precision_update_state_with_logits(self):
+ p_obj = metrics.Precision()
+ # Updating state with logits (values not in the range (0, 1)) should
+ # not raise an error.
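+ # (The default 0.5 threshold is compared directly against the raw
+ # values, so out-of-range inputs are thresholded like any others.)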
+ p_obj.update_state([-0.5, 0.5], [-2.0, 2.0]) + + def test_reset_state_recall(self): + r_obj = metrics.Recall() + model = _get_model([r_obj]) + x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4)))) + y = np.concatenate((np.ones((50, 1)), np.ones((50, 1)))) + model.evaluate(x, y) + self.assertEqual(self.evaluate(r_obj.true_positives), 50.0) + self.assertEqual(self.evaluate(r_obj.false_negatives), 50.0) + model.evaluate(x, y) + self.assertEqual(self.evaluate(r_obj.true_positives), 50.0) + self.assertEqual(self.evaluate(r_obj.false_negatives), 50.0) + + def test_reset_state_sensitivity_at_specificity(self): + s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1) + model = _get_model([s_obj]) + x = np.concatenate( + ( + np.ones((25, 4)), + np.zeros((25, 4)), + np.zeros((25, 4)), + np.ones((25, 4)), + ) + ) + y = np.concatenate( + ( + np.ones((25, 1)), + np.zeros((25, 1)), + np.ones((25, 1)), + np.zeros((25, 1)), + ) + ) + + for _ in range(2): + model.evaluate(x, y) + self.assertEqual(self.evaluate(s_obj.true_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_negatives), 25.0) + self.assertEqual(self.evaluate(s_obj.true_negatives), 25.0) + + def test_reset_state_specificity_at_sensitivity(self): + s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1) + model = _get_model([s_obj]) + x = np.concatenate( + ( + np.ones((25, 4)), + np.zeros((25, 4)), + np.zeros((25, 4)), + np.ones((25, 4)), + ) + ) + y = np.concatenate( + ( + np.ones((25, 1)), + np.zeros((25, 1)), + np.ones((25, 1)), + np.zeros((25, 1)), + ) + ) + + for _ in range(2): + model.evaluate(x, y) + self.assertEqual(self.evaluate(s_obj.true_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_negatives), 25.0) + self.assertEqual(self.evaluate(s_obj.true_negatives), 25.0) + + def test_reset_state_precision_at_recall(self): + s_obj = metrics.PrecisionAtRecall(recall=0.5, num_thresholds=1) + model = _get_model([s_obj]) + x = np.concatenate( + ( + np.ones((25, 4)), + np.zeros((25, 4)), + np.zeros((25, 4)), + np.ones((25, 4)), + ) + ) + y = np.concatenate( + ( + np.ones((25, 1)), + np.zeros((25, 1)), + np.ones((25, 1)), + np.zeros((25, 1)), + ) + ) + + for _ in range(2): + model.evaluate(x, y) + self.assertEqual(self.evaluate(s_obj.true_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_negatives), 25.0) + self.assertEqual(self.evaluate(s_obj.true_negatives), 25.0) + + def test_reset_state_recall_at_precision(self): + s_obj = metrics.RecallAtPrecision(precision=0.5, num_thresholds=1) + model = _get_model([s_obj]) + x = np.concatenate( + ( + np.ones((25, 4)), + np.zeros((25, 4)), + np.zeros((25, 4)), + np.ones((25, 4)), + ) + ) + y = np.concatenate( + ( + np.ones((25, 1)), + np.zeros((25, 1)), + np.ones((25, 1)), + np.zeros((25, 1)), + ) + ) + + for _ in range(2): + model.evaluate(x, y) + self.assertEqual(self.evaluate(s_obj.true_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_positives), 25.0) + self.assertEqual(self.evaluate(s_obj.false_negatives), 25.0) + self.assertEqual(self.evaluate(s_obj.true_negatives), 25.0) + + def test_reset_state_auc(self): + auc_obj = metrics.AUC(num_thresholds=3) + model = _get_model([auc_obj]) + x = np.concatenate( + ( + np.ones((25, 4)), + np.zeros((25, 4)), + np.zeros((25, 4)), + np.ones((25, 4)), + ) + ) + y = np.concatenate( + ( + np.ones((25, 1)), + 
np.zeros((25, 1)), + np.ones((25, 1)), + np.zeros((25, 1)), + ) + ) + + for _ in range(2): + model.evaluate(x, y) + self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.0) + + def test_reset_state_auc_from_logits(self): + auc_obj = metrics.AUC(num_thresholds=3, from_logits=True) + + model_layers = [ + layers.Dense(1, kernel_initializer="ones", use_bias=False) + ] + model = test_utils.get_model_from_layers(model_layers, input_shape=(4,)) + model.compile( + loss="mae", + metrics=[auc_obj], + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.concatenate( + ( + np.ones((25, 4)), + -np.ones((25, 4)), + -np.ones((25, 4)), + np.ones((25, 4)), + ) + ) + y = np.concatenate( + ( + np.ones((25, 1)), + np.zeros((25, 1)), + np.ones((25, 1)), + np.zeros((25, 1)), + ) + ) + + for _ in range(2): + model.evaluate(x, y) + self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.0) + + def test_reset_state_auc_manual_thresholds(self): + auc_obj = metrics.AUC(thresholds=[0.5]) + model = _get_model([auc_obj]) + x = np.concatenate( + ( + np.ones((25, 4)), + np.zeros((25, 4)), + np.zeros((25, 4)), + np.ones((25, 4)), + ) + ) + y = np.concatenate( + ( + np.ones((25, 1)), + np.zeros((25, 1)), + np.ones((25, 1)), + np.zeros((25, 1)), + ) + ) + + for _ in range(2): + model.evaluate(x, y) + self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.0) + self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.0) + + def test_reset_state_mean_iou(self): + m_obj = metrics.MeanIoU(num_classes=2) + model = _get_model([m_obj]) + x = np.asarray( + [[0, 0, 0, 0], [1, 1, 1, 1], [1, 0, 1, 0], [0, 1, 0, 1]], + dtype=np.float32, + ) + y = np.asarray([[0], [1], [1], [1]], dtype=np.float32) + model.evaluate(x, y) + self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1) + self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1) + model.evaluate(x, y) + self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1) + self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1) + + def test_reset_state_recall_float64(self): + # Test case for GitHub issue 36790. + try: + backend.set_floatx("float64") + r_obj = metrics.Recall() + model = _get_model([r_obj]) + x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4)))) + y = np.concatenate((np.ones((50, 1)), np.ones((50, 1)))) + model.evaluate(x, y) + self.assertEqual(self.evaluate(r_obj.true_positives), 50.0) + self.assertEqual(self.evaluate(r_obj.false_negatives), 50.0) + model.evaluate(x, y) + self.assertEqual(self.evaluate(r_obj.true_positives), 50.0) + self.assertEqual(self.evaluate(r_obj.false_negatives), 50.0) + finally: + backend.set_floatx("float32") + + def test_function_wrapped_reset_state(self): + m = metrics.Mean(name="my_mean") + + # check reset_state in function. 
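A standalone sketch of the behavior the `tf.function`-wrapped check below exercises (editorial; eager mode and illustrative values assumed):

```python
import tensorflow as tf

# Calling reset_state() inside a tf.function must still zero the metric,
# so reset-then-update leaves exactly one accumulated sample, not N.
m = tf.keras.metrics.Mean()

@tf.function
def reset_then_update(value):
    m.reset_state()
    m.update_state(value)

for _ in range(5):
    reset_then_update(100.0)
assert int(m.count) == 1 and float(m.result()) == 100.0
```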
+ @tf.function + def reset_in_fn(): + m.reset_state() + m.update_state(100) + + for _ in range(5): + reset_in_fn() + if not tf.executing_eagerly(): + self.evaluate( + tf.compat.v1.get_default_graph().get_operations()[-1] + ) + self.assertEqual(self.evaluate(m.count), 1) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MergeStateTest(test_combinations.TestCase): + def test_merge_state_incompatible_metrics(self): + with self.assertRaisesRegex( + ValueError, "Metric .* is not compatible with .*" + ): + obj1 = metrics.FalsePositives() + self.evaluate(tf.compat.v1.variables_initializer(obj1.variables)) + obj2 = metrics.Accuracy() + self.evaluate(tf.compat.v1.variables_initializer(obj2.variables)) + self.evaluate(obj1.merge_state([obj2])) + + def test_merge_state_accuracy(self): + a_objs = [] + for y_true, y_pred in zip( + [[[1], [2]], [[3], [4]]], [[[0], [2]], [[3], [4]]] + ): + a_obj = metrics.Accuracy() + a_objs.append(a_obj) + self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) + self.evaluate(a_obj.update_state(y_true, y_pred)) + self.evaluate(a_objs[0].merge_state(a_objs[1:])) + self.assertEqual(self.evaluate(a_objs[0].total), 3.0) + self.assertEqual(self.evaluate(a_objs[0].count), 4.0) + self.assertEqual(self.evaluate(a_objs[0].result()), 0.75) + + def test_merge_state_false_positives(self): + fp_objs = [] + for _ in range(4): + fp_obj = metrics.FalsePositives() + fp_objs.append(fp_obj) + self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) + y_true = np.zeros((25, 1)) + y_pred = np.ones((25, 1)) + self.evaluate(fp_obj.update_state(y_true, y_pred)) + self.evaluate(fp_objs[0].merge_state(fp_objs[1:])) + self.assertEqual(self.evaluate(fp_objs[0].accumulator), 100.0) + + def test_merge_state_false_negatives(self): + fn_objs = [] + for _ in range(4): + fn_obj = metrics.FalseNegatives() + fn_objs.append(fn_obj) + self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) + y_true = np.ones((25, 1)) + y_pred = np.zeros((25, 1)) + self.evaluate(fn_obj.update_state(y_true, y_pred)) + self.evaluate(fn_objs[0].merge_state(fn_objs[1:])) + self.assertEqual(self.evaluate(fn_objs[0].accumulator), 100.0) + + def test_merge_state_true_negatives(self): + tn_objs = [] + for _ in range(4): + tn_obj = metrics.TrueNegatives() + tn_objs.append(tn_obj) + self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) + y_true = np.zeros((25, 1)) + y_pred = np.zeros((25, 1)) + self.evaluate(tn_obj.update_state(y_true, y_pred)) + self.evaluate(tn_objs[0].merge_state(tn_objs[1:])) + self.assertEqual(self.evaluate(tn_objs[0].accumulator), 100.0) + + def test_merge_state_true_positives(self): + tp_objs = [] + for _ in range(4): + tp_obj = metrics.TruePositives() + tp_objs.append(tp_obj) + self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) + y_true = np.ones((25, 1)) + y_pred = np.ones((25, 1)) + self.evaluate(tp_obj.update_state(y_true, y_pred)) + self.evaluate(tp_objs[0].merge_state(tp_objs[1:])) + self.assertEqual(self.evaluate(tp_objs[0].accumulator), 100.0) + + def test_merge_state_precision(self): + p_objs = [] + for _ in range(5): + p_obj = metrics.Precision() + p_objs.append(p_obj) + self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) + y_true = np.concatenate((np.ones((10, 1)), np.zeros((10, 1)))) + y_pred = np.concatenate((np.ones((10, 1)), np.ones((10, 1)))) + self.evaluate(p_obj.update_state(y_true, y_pred)) + self.evaluate(p_objs[0].merge_state(p_objs[1:])) + 
self.assertEqual(self.evaluate(p_objs[0].true_positives), 50.0) + self.assertEqual(self.evaluate(p_objs[0].false_positives), 50.0) + + def test_merge_state_recall(self): + r_objs = [] + for _ in range(5): + r_obj = metrics.Recall() + r_objs.append(r_obj) + self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) + y_true = np.concatenate((np.ones((10, 1)), np.ones((10, 1)))) + y_pred = np.concatenate((np.ones((10, 1)), np.zeros((10, 1)))) + self.evaluate(r_obj.update_state(y_true, y_pred)) + self.evaluate(r_objs[0].merge_state(r_objs[1:])) + self.assertEqual(self.evaluate(r_objs[0].true_positives), 50.0) + self.assertEqual(self.evaluate(r_objs[0].false_negatives), 50.0) + + def test_merge_state_sensitivity_at_specificity(self): + sas_objs = [] + for _ in range(5): + sas_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1) + sas_objs.append(sas_obj) + self.evaluate(tf.compat.v1.variables_initializer(sas_obj.variables)) + y_true = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + np.zeros((5, 1)), + ) + ) + y_pred = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + ) + ) + self.evaluate(sas_obj.update_state(y_true, y_pred)) + self.evaluate(sas_objs[0].merge_state(sas_objs[1:])) + self.assertEqual(self.evaluate(sas_objs[0].true_positives), 25.0) + self.assertEqual(self.evaluate(sas_objs[0].false_positives), 25.0) + self.assertEqual(self.evaluate(sas_objs[0].false_negatives), 25.0) + self.assertEqual(self.evaluate(sas_objs[0].true_negatives), 25.0) + + def test_merge_state_specificity_at_sensitivity(self): + sas_objs = [] + for _ in range(5): + sas_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1) + sas_objs.append(sas_obj) + self.evaluate(tf.compat.v1.variables_initializer(sas_obj.variables)) + y_true = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + np.zeros((5, 1)), + ) + ) + y_pred = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + ) + ) + self.evaluate(sas_obj.update_state(y_true, y_pred)) + self.evaluate(sas_objs[0].merge_state(sas_objs[1:])) + self.assertEqual(self.evaluate(sas_objs[0].true_positives), 25.0) + self.assertEqual(self.evaluate(sas_objs[0].false_positives), 25.0) + self.assertEqual(self.evaluate(sas_objs[0].false_negatives), 25.0) + self.assertEqual(self.evaluate(sas_objs[0].true_negatives), 25.0) + + def test_merge_state_precision_at_recall(self): + par_objs = [] + for _ in range(5): + par_obj = metrics.PrecisionAtRecall(recall=0.5, num_thresholds=1) + par_objs.append(par_obj) + self.evaluate(tf.compat.v1.variables_initializer(par_obj.variables)) + y_true = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + np.zeros((5, 1)), + ) + ) + y_pred = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + ) + ) + self.evaluate(par_obj.update_state(y_true, y_pred)) + self.evaluate(par_objs[0].merge_state(par_objs[1:])) + self.assertEqual(self.evaluate(par_objs[0].true_positives), 25.0) + self.assertEqual(self.evaluate(par_objs[0].false_positives), 25.0) + self.assertEqual(self.evaluate(par_objs[0].false_negatives), 25.0) + self.assertEqual(self.evaluate(par_objs[0].true_negatives), 25.0) + + def test_merge_state_recall_at_precision(self): + rap_objs = [] + for _ in range(5): + rap_obj = metrics.RecallAtPrecision(precision=0.5, num_thresholds=1) + rap_objs.append(rap_obj) +
self.evaluate(tf.compat.v1.variables_initializer(rap_obj.variables)) + y_true = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + np.zeros((5, 1)), + ) + ) + y_pred = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + ) + ) + self.evaluate(rap_obj.update_state(y_true, y_pred)) + self.evaluate(rap_objs[0].merge_state(rap_objs[1:])) + self.assertEqual(self.evaluate(rap_objs[0].true_positives), 25.0) + self.assertEqual(self.evaluate(rap_objs[0].false_positives), 25.0) + self.assertEqual(self.evaluate(rap_objs[0].false_negatives), 25.0) + self.assertEqual(self.evaluate(rap_objs[0].true_negatives), 25.0) + + def test_merge_state_auc(self): + auc_objs = [] + for _ in range(5): + auc_obj = metrics.AUC(num_thresholds=3) + auc_objs.append(auc_obj) + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + y_true = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + np.zeros((5, 1)), + ) + ) + y_pred = np.concatenate( + ( + np.ones((5, 1)), + np.zeros((5, 1)), + np.zeros((5, 1)), + np.ones((5, 1)), + ) + ) + self.evaluate(auc_obj.update_state(y_true, y_pred)) + self.evaluate(auc_objs[0].merge_state(auc_objs[1:])) + self.assertEqual(self.evaluate(auc_objs[0].true_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_objs[0].false_positives[1]), 25.0) + self.assertEqual(self.evaluate(auc_objs[0].false_negatives[1]), 25.0) + self.assertEqual(self.evaluate(auc_objs[0].true_negatives[1]), 25.0) + + def test_merge_state_mean_iou(self): + m_objs = [] + for y_true, y_pred in zip( + [[0], [1], [1], [1]], [[0.5], [1.0], [1.0], [1.0]] + ): + m_obj = metrics.MeanIoU(num_classes=2) + m_objs.append(m_obj) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + self.evaluate(m_obj.update_state(y_true, y_pred)) + self.evaluate(m_objs[0].merge_state(m_objs[1:])) + self.assertArrayNear(self.evaluate(m_objs[0].total_cm)[0], [1, 0], 1e-1) + self.assertArrayNear(self.evaluate(m_objs[0].total_cm)[1], [0, 3], 1e-1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/f_score_metrics.py b/keras/metrics/f_score_metrics.py new file mode 100644 index 000000000000..3e59a0de0063 --- /dev/null +++ b/keras/metrics/f_score_metrics.py @@ -0,0 +1,323 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""F-Score metrics.""" + +import tensorflow.compat.v2 as tf +from tensorflow.python.util.tf_export import keras_export + +from keras.dtensor import utils as dtensor_utils +from keras.metrics import base_metric + + +# Adapted from TF-Addons implementation. +@keras_export("keras.metrics.FBetaScore") +class FBetaScore(base_metric.Metric): + """Computes F-Beta score. + + This is the weighted harmonic mean of precision and recall. + Its output range is `[0, 1]`. It works for both multi-class + and multi-label classification. 
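As a companion to the definition given just below, here is a hedged NumPy sketch of how per-class counts turn into the `None` / `"micro"` / `"macro"` / `"weighted"` variants; the function name and counts are illustrative, and this mirrors the formulas rather than the class's actual implementation:

```python
import numpy as np

def fbeta_from_counts(tp, fp, fn, beta=1.0, average=None):
    # Per-class true positive, false positive, and false negative counts.
    tp, fp, fn = (np.asarray(v, dtype=float) for v in (tp, fp, fn))
    if average == "micro":  # pool the counts over classes first
        tp, fp, fn = tp.sum(), fp.sum(), fn.sum()
    precision = tp / np.maximum(tp + fp, 1e-12)
    recall = tp / np.maximum(tp + fn, 1e-12)
    b2 = beta**2
    score = (1 + b2) * precision * recall / np.maximum(b2 * precision + recall, 1e-12)
    if average == "macro":  # unweighted mean over classes
        return float(score.mean())
    if average == "weighted":  # weighted by support (tp + fn per class)
        support = tp + fn
        return float((score * support).sum() / support.sum())
    return score  # None: per-class vector; "micro": pooled scalar

# Three classes with per-class F1 of 0.8, 0.5, and 2/3.
print(fbeta_from_counts([2, 1, 3], [1, 0, 2], [0, 2, 1], average="macro"))  # ~0.6556
```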
+ + It is defined as: + + ```python + b2 = beta ** 2 + f_beta_score = (1 + b2) * (precision * recall) / (precision * b2 + recall) + ``` + + Args: + average: Type of averaging to be performed across per-class results + in the multi-class case. + Acceptable values are `None`, `"micro"`, `"macro"` and + `"weighted"`. Default value is `None`. + If `None`, no averaging is performed and `result()` will return + the score for each class. + If `"micro"`, compute metrics globally by counting the total + true positives, false negatives and false positives. + If `"macro"`, compute metrics for each label, + and return their unweighted mean. + This does not take label imbalance into account. + If `"weighted"`, compute metrics for each label, + and return their average weighted by support + (the number of true instances for each label). + This alters `"macro"` to account for label imbalance. + It can result in an F-score that is not between precision and recall. + beta: Determines the weight given to recall + in the harmonic mean between precision and recall (see pseudocode + equation above). Default value is 1. + threshold: Elements of `y_pred` greater than `threshold` are + converted to 1, and the rest to 0. If `threshold` is + `None`, the argmax of `y_pred` is converted to 1, and the rest to 0. + name: Optional. String name of the metric instance. + dtype: Optional. Data type of the metric result. + + Returns: + F-Beta Score: float. + + Example: + + >>> metric = tf.keras.metrics.FBetaScore(beta=2.0, threshold=0.5) + >>> y_true = np.array([[1, 1, 1], + ... [1, 0, 0], + ... [1, 1, 0]], np.int32) + >>> y_pred = np.array([[0.2, 0.6, 0.7], + ... [0.2, 0.6, 0.6], + ... [0.6, 0.8, 0.0]], np.float32) + >>> metric.update_state(y_true, y_pred) + >>> result = metric.result() + >>> result.numpy() + array([0.3846154 , 0.90909094, 0.8333334 ], dtype=float32) + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + average=None, + beta=1.0, + threshold=None, + name="fbeta_score", + dtype=None, + ): + super().__init__(name=name, dtype=dtype) + + if average not in (None, "micro", "macro", "weighted"): + raise ValueError( + "Invalid `average` argument value. Expected one of: " + "{None, 'micro', 'macro', 'weighted'}. " + f"Received: average={average}" + ) + + if not isinstance(beta, float): + raise ValueError( + "Invalid `beta` argument value. " + "It should be a Python float. " + f"Received: beta={beta} of type '{type(beta)}'" + ) + if beta <= 0.0: + raise ValueError( + "Invalid `beta` argument value. " + "It should be > 0. " + f"Received: beta={beta}" + ) + + if threshold is not None: + if not isinstance(threshold, float): + raise ValueError( + "Invalid `threshold` argument value. " + "It should be a Python float. " + f"Received: threshold={threshold} " + f"of type '{type(threshold)}'" + ) + if threshold > 1.0 or threshold <= 0.0: + raise ValueError( + "Invalid `threshold` argument value. " + "It should satisfy 0 < threshold <= 1. " + f"Received: threshold={threshold}" + ) + + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.built = False + + if self.average != "micro": + self.axis = 0 + + def build(self, y_true_shape, y_pred_shape): + if len(y_pred_shape) != 2 or len(y_true_shape) != 2: + raise ValueError( + "FBetaScore expects 2D inputs with shape " + "(batch_size, output_dim). Received input " + f"shapes: y_pred.shape={y_pred_shape} and " + f"y_true.shape={y_true_shape}."
+ ) + if y_pred_shape[-1] is None or y_true_shape[-1] is None: + raise ValueError( + "FBetaScore expects 2D inputs with shape " + "(batch_size, output_dim), with output_dim fully " + "defined (not None). Received input " + f"shapes: y_pred.shape={y_pred_shape} and " + f"y_true.shape={y_true_shape}." + ) + num_classes = y_pred_shape[-1] + if self.average != "micro": + init_shape = [num_classes] + else: + init_shape = [] + + def _add_zeros_weight(name): + return self.add_weight( + name, + shape=init_shape, + initializer="zeros", + dtype=self.dtype, + ) + + self.true_positives = _add_zeros_weight("true_positives") + self.false_positives = _add_zeros_weight("false_positives") + self.false_negatives = _add_zeros_weight("false_negatives") + self.intermediate_weights = _add_zeros_weight("intermediate_weights") + self.built = True + + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.convert_to_tensor(y_true, dtype=self.dtype) + y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype) + if not self.built: + self.build(y_true.shape, y_pred.shape) + + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-9) + else: + y_pred = y_pred > self.threshold + y_pred = tf.cast(y_pred, dtype=self.dtype) + + def _weighted_sum(val, sample_weight): + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add( + _weighted_sum(y_pred * y_true, sample_weight) + ) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.intermediate_weights.assign_add( + _weighted_sum(y_true, sample_weight) + ) + + def result(self): + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.intermediate_weights, + tf.reduce_sum(self.intermediate_weights), + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self): + """Returns the serializable config of the metric.""" + + config = { + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_state(self): + for v in self.variables: + v.assign(tf.zeros(v.shape, dtype=v.dtype)) + + +@keras_export("keras.metrics.F1Score") +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + This is the harmonic mean of precision and recall. + Its output range is `[0, 1]`. It works for both multi-class + and multi-label classification. + + It is defined as: + + ```python + f1_score = 2 * (precision * recall) / (precision + recall) + ``` + + Args: + average: Type of averaging to be performed on data. + Acceptable values are `None`, `"micro"`, `"macro"` + and `"weighted"`. 
Default value is `None`. + If `None`, no averaging is performed and `result()` will return + the score for each class. + If `"micro"`, compute metrics globally by counting the total + true positives, false negatives and false positives. + If `"macro"`, compute metrics for each label, + and return their unweighted mean. + This does not take label imbalance into account. + If `"weighted"`, compute metrics for each label, + and return their average weighted by support + (the number of true instances for each label). + This alters `"macro"` to account for label imbalance. + It can result in an F-score that is not between precision and recall. + threshold: Elements of `y_pred` greater than `threshold` are + converted to 1, and the rest to 0. If `threshold` is + `None`, the argmax of `y_pred` is converted to 1, and the rest to 0. + name: Optional. String name of the metric instance. + dtype: Optional. Data type of the metric result. + + Returns: + F-1 Score: float. + + Example: + + >>> metric = tf.keras.metrics.F1Score(threshold=0.5) + >>> y_true = np.array([[1, 1, 1], + ... [1, 0, 0], + ... [1, 1, 0]], np.int32) + >>> y_pred = np.array([[0.2, 0.6, 0.7], + ... [0.2, 0.6, 0.6], + ... [0.6, 0.8, 0.0]], np.float32) + >>> metric.update_state(y_true, y_pred) + >>> result = metric.result() + >>> result.numpy() + array([0.5 , 0.8 , 0.6666667], dtype=float32) + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + average=None, + threshold=None, + name="f1_score", + dtype=None, + ): + super().__init__( + average=average, + beta=1.0, + threshold=threshold, + name=name, + dtype=dtype, + ) + + def get_config(self): + base_config = super().get_config() + del base_config["beta"] + return base_config diff --git a/keras/metrics/f_score_metrics_test.py b/keras/metrics/f_score_metrics_test.py new file mode 100644 index 000000000000..8854467ad8e5 --- /dev/null +++ b/keras/metrics/f_score_metrics_test.py @@ -0,0 +1,277 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================== +"""Tests for F-score metrics.""" + +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.metrics import f_score_metrics +from keras.testing_infra import test_utils + + +@test_utils.run_v2_only +class FBetaScoreTest(parameterized.TestCase, tf.test.TestCase): + def _run_test( + self, + y_true, + y_pred, + sample_weights, + average, + beta, + threshold, + reference_result, + ): + y_true = tf.constant(y_true, dtype="float32") + y_pred = tf.constant(y_pred, dtype="float32") + fbeta = f_score_metrics.FBetaScore(average, beta, threshold) + fbeta.update_state(y_true, y_pred, sample_weights) + result = fbeta.result().numpy() + self.assertAllClose(result, reference_result, atol=1e-6) + + def test_config(self): + fbeta_obj = f_score_metrics.FBetaScore( + beta=0.5, threshold=0.3, average=None + ) + self.assertEqual(fbeta_obj.beta, 0.5) + self.assertEqual(fbeta_obj.average, None) + self.assertEqual(fbeta_obj.threshold, 0.3) + self.assertEqual(fbeta_obj.dtype, tf.float32) + + # Check save and restore config + fbeta_obj2 = f_score_metrics.FBetaScore.from_config( + fbeta_obj.get_config() + ) + self.assertEqual(fbeta_obj2.beta, 0.5) + self.assertEqual(fbeta_obj2.average, None) + self.assertEqual(fbeta_obj2.threshold, 0.3) + self.assertEqual(fbeta_obj2.dtype, tf.float32) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + average=["micro", "macro", "weighted"], beta=[0.5, 1.0, 2.0] + ) + ) + def test_fbeta_perfect_score(self, average, beta): + y_true = [[1, 1, 1], [1, 0, 0], [1, 1, 0]] + y_pred = [[0.7, 0.7, 0.7], [1, 0, 0], [0.9, 0.8, 0]] + self._run_test( + y_true, + y_pred, + None, + average=average, + beta=beta, + threshold=0.66, + reference_result=1.0, + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + average=["micro", "macro", "weighted"], beta=[0.5, 1.0, 2.0] + ) + ) + def test_fbeta_worst_score(self, average, beta): + y_true = [[0, 0, 0], [0, 1, 0], [0, 0, 1]] + y_pred = [[0.7, 0.7, 0.7], [1, 0, 0], [0.9, 0.8, 0]] + self._run_test( + y_true, + y_pred, + None, + average=average, + beta=beta, + threshold=0.66, + reference_result=0.0, + ) + + @parameterized.parameters( + # average, beta, result + (None, 0.5, [0.71428573, 0.5, 0.833334]), + (None, 1.0, [0.8, 0.5, 0.6666667]), + (None, 2.0, [0.9090904, 0.5, 0.555556]), + ("micro", 0.5, 0.6666667), + ("micro", 1.0, 0.6666667), + ("micro", 2.0, 0.6666667), + ("macro", 0.5, 0.6825397), + ("macro", 1.0, 0.6555555), + ("macro", 2.0, 0.6548822), + ("weighted", 0.5, 0.6825397), + ("weighted", 1.0, 0.6555555), + ("weighted", 2.0, 0.6548822), + ) + def test_fbeta_random_score(self, average, beta, result): + y_pred = [[0.7, 0.7, 0.7], [1, 0, 0], [0.9, 0.8, 0]] + y_true = [[0, 0, 1], [1, 1, 0], [1, 1, 1]] + self._run_test( + y_true, + y_pred, + None, + average=average, + beta=beta, + threshold=0.66, + reference_result=result, + ) + + @parameterized.parameters( + # average, beta, result + (None, 0.5, [0.9090904, 0.555556, 1.0]), + (None, 1.0, [0.8, 0.6666667, 1.0]), + (None, 2.0, [0.71428573, 0.833334, 1.0]), + ("micro", 0.5, 0.833334), + ("micro", 1.0, 0.833334), + ("micro", 2.0, 0.833334), + ("macro", 0.5, 0.821549), + ("macro", 1.0, 0.822222), + ("macro", 2.0, 0.849206), + ("weighted", 0.5, 0.880471), + ("weighted", 1.0, 0.844445), + ("weighted", 2.0, 0.829365), + ) + def test_fbeta_random_score_none(self, average, beta, result): + y_true = [ 
+ [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 0], + [1, 0, 0], + [0, 0, 1], + ] + y_pred = [ + [0.9, 0.1, 0], + [0.2, 0.6, 0.2], + [0, 0, 1], + [0.4, 0.3, 0.3], + [0, 0.9, 0.1], + [0, 0, 1], + ] + self._run_test( + y_true, + y_pred, + None, + average=average, + beta=beta, + threshold=None, + reference_result=result, + ) + + @parameterized.parameters( + # average, beta, sample_weights, result + (None, 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0.909091, 0.555556, 1.0]), + (None, 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], [1.0, 0.0, 1.0]), + (None, 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], [0.9375, 0.714286, 1.0]), + (None, 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0.8, 0.666667, 1.0]), + (None, 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], [1.0, 0.0, 1.0]), + (None, 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], [0.857143, 0.8, 1.0]), + (None, 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0.714286, 0.833333, 1.0]), + (None, 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], [1.0, 0.0, 1.0]), + (None, 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], [0.789474, 0.909091, 1.0]), + ("micro", 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.833333), + ("micro", 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("micro", 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.9), + ("micro", 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.833333), + ("micro", 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("micro", 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.9), + ("micro", 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.833333), + ("micro", 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("micro", 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.9), + ("macro", 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.821549), + ("macro", 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 0.666667), + ("macro", 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.883929), + ("macro", 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.822222), + ("macro", 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 0.666667), + ("macro", 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.885714), + ("macro", 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.849206), + ("macro", 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 0.666667), + ("macro", 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.899522), + ("weighted", 0.5, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.880471), + ("weighted", 0.5, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("weighted", 0.5, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.917857), + ("weighted", 1.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.844444), + ("weighted", 1.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("weighted", 1.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.902857), + ("weighted", 2.0, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 0.829365), + ("weighted", 2.0, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0], 1.0), + ("weighted", 2.0, [0.5, 1.0, 1.0, 1.0, 0.5, 1.0], 0.897608), + ) + def test_fbeta_weighted_random_score_none( + self, average, beta, sample_weights, result + ): + y_true = [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 0], + [1, 0, 0], + [0, 0, 1], + ] + y_pred = [ + [0.9, 0.1, 0], + [0.2, 0.6, 0.2], + [0, 0, 1], + [0.4, 0.3, 0.3], + [0, 0.9, 0.1], + [0, 0, 1], + ] + self._run_test( + y_true, + y_pred, + sample_weights, + average=average, + beta=beta, + threshold=None, + reference_result=result, + ) + + +@test_utils.run_v2_only +class F1ScoreTest(tf.test.TestCase): + def test_config(self): + f1_obj = f_score_metrics.F1Score() + config = f1_obj.get_config() + self.assertNotIn("beta", config) + + # Check save and restore config + f1_obj = f_score_metrics.F1Score.from_config(config) + self.assertEqual(f1_obj.average, None) + self.assertEqual(f1_obj.dtype, tf.float32) + + def test_correctness(self): + f1 = f_score_metrics.F1Score() + fbeta = f_score_metrics.FBetaScore(beta=1.0) 
+ + y_true = [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 0], + [1, 0, 0], + [0, 0, 1], + ] + y_pred = [ + [0.9, 0.1, 0], + [0.2, 0.6, 0.2], + [0, 0, 1], + [0.4, 0.3, 0.3], + [0, 0.9, 0.1], + [0, 0, 1], + ] + + fbeta.update_state(y_true, y_pred) + f1.update_state(y_true, y_pred) + self.assertAllClose( + fbeta.result().numpy(), f1.result().numpy(), atol=1e-6 + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/hinge_metrics.py b/keras/metrics/hinge_metrics.py new file mode 100644 index 000000000000..ff49472c8f0d --- /dev/null +++ b/keras/metrics/hinge_metrics.py @@ -0,0 +1,136 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Hinge metrics.""" + +from keras.dtensor import utils as dtensor_utils +from keras.losses import categorical_hinge +from keras.losses import hinge +from keras.losses import squared_hinge +from keras.metrics import base_metric + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@keras_export("keras.metrics.Hinge") +class Hinge(base_metric.MeanMetricWrapper): + """Computes the hinge metric between `y_true` and `y_pred`. + + `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are + provided we will convert them to -1 or 1. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.Hinge() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 1.3 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 1.1 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', loss='mse', metrics=[tf.keras.metrics.Hinge()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="hinge", dtype=None): + super().__init__(hinge, name, dtype=dtype) + + +@keras_export("keras.metrics.SquaredHinge") +class SquaredHinge(base_metric.MeanMetricWrapper): + """Computes the squared hinge metric between `y_true` and `y_pred`. + + `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are + provided we will convert them to -1 or 1. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.SquaredHinge() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 1.86 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... 
sample_weight=[1, 0]) + >>> m.result().numpy() + 1.46 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.SquaredHinge()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="squared_hinge", dtype=None): + super().__init__(squared_hinge, name, dtype=dtype) + + +@keras_export("keras.metrics.CategoricalHinge") +class CategoricalHinge(base_metric.MeanMetricWrapper): + """Computes the categorical hinge metric between `y_true` and `y_pred`. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.CategoricalHinge() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 1.4000001 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 1.2 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.CategoricalHinge()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="categorical_hinge", dtype=None): + super().__init__(categorical_hinge, name, dtype=dtype) diff --git a/keras/metrics/hinge_metrics_test.py b/keras/metrics/hinge_metrics_test.py new file mode 100644 index 000000000000..d5b093142102 --- /dev/null +++ b/keras/metrics/hinge_metrics_test.py @@ -0,0 +1,193 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras metrics.""" + +import tensorflow.compat.v2 as tf + +from keras import metrics +from keras.testing_infra import test_combinations + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class HingeTest(tf.test.TestCase): + def test_config(self): + hinge_obj = metrics.Hinge(name="hinge", dtype=tf.int32) + self.assertEqual(hinge_obj.name, "hinge") + self.assertEqual(hinge_obj._dtype, tf.int32) + + # Check save and restore config + hinge_obj2 = metrics.Hinge.from_config(hinge_obj.get_config()) + self.assertEqual(hinge_obj2.name, "hinge") + self.assertEqual(hinge_obj2._dtype, tf.int32) + + def test_unweighted(self): + hinge_obj = metrics.Hinge() + self.evaluate(tf.compat.v1.variables_initializer(hinge_obj.variables)) + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] + # = [0.6, 0.4125] + # reduced metric = (0.6 + 0.4125) / 2 + + update_op = hinge_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = hinge_obj.result() + self.assertAllClose(0.506, result, atol=1e-3) + + def test_weighted(self): + hinge_obj = metrics.Hinge() + self.evaluate(tf.compat.v1.variables_initializer(hinge_obj.variables)) + y_true = tf.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + sample_weight = tf.constant([1.5, 2.0]) + + # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] + # = [0.6, 0.4125] + # weighted metric = [0.6 * 1.5, 0.4125 * 2] + # reduced metric = (0.6 * 1.5 + 0.4125 * 2) / (1.5 + 2) + + result = hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(0.493, self.evaluate(result), atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class SquaredHingeTest(tf.test.TestCase): + def test_config(self): + sq_hinge_obj = metrics.SquaredHinge(name="sq_hinge", dtype=tf.int32) + self.assertEqual(sq_hinge_obj.name, "sq_hinge") + self.assertEqual(sq_hinge_obj._dtype, tf.int32) + + # Check save and restore config + sq_hinge_obj2 = metrics.SquaredHinge.from_config( + sq_hinge_obj.get_config() + ) + self.assertEqual(sq_hinge_obj2.name, "sq_hinge") + self.assertEqual(sq_hinge_obj2._dtype, tf.int32) + + def test_unweighted(self): + sq_hinge_obj = metrics.SquaredHinge() + self.evaluate( + tf.compat.v1.variables_initializer(sq_hinge_obj.variables) + ) + y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + + # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, + # 0.4]] + # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 
0.64, 0.81, 0], + # [0.5625, 0, 0.25, 0.16]] + # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / + # 4] + # = [0.485, 0.2431] + # reduced metric = (0.485 + 0.2431) / 2 + + update_op = sq_hinge_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = sq_hinge_obj.result() + self.assertAllClose(0.364, result, atol=1e-3) + + def test_weighted(self): + sq_hinge_obj = metrics.SquaredHinge() + self.evaluate( + tf.compat.v1.variables_initializer(sq_hinge_obj.variables) + ) + y_true = tf.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]]) + y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]]) + sample_weight = tf.constant([1.5, 2.0]) + + # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 + + # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] + # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] + # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, + # 0.4]] + # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], + # [0.5625, 0, 0.25, 0.16]] + # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / + # 4] + # = [0.485, 0.2431] + # weighted metric = [0.485 * 1.5, 0.2431 * 2] + # reduced metric = (0.485 * 1.5 + 0.2431 * 2) / (1.5 + 2) + + result = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(0.347, self.evaluate(result), atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class CategoricalHingeTest(tf.test.TestCase): + def test_config(self): + cat_hinge_obj = metrics.CategoricalHinge( + name="cat_hinge", dtype=tf.int32 + ) + self.assertEqual(cat_hinge_obj.name, "cat_hinge") + self.assertEqual(cat_hinge_obj._dtype, tf.int32) + + # Check save and restore config + cat_hinge_obj2 = metrics.CategoricalHinge.from_config( + cat_hinge_obj.get_config() + ) + self.assertEqual(cat_hinge_obj2.name, "cat_hinge") + self.assertEqual(cat_hinge_obj2._dtype, tf.int32) + + def test_unweighted(self): + cat_hinge_obj = metrics.CategoricalHinge() + self.evaluate( + tf.compat.v1.variables_initializer(cat_hinge_obj.variables) + ) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = cat_hinge_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = cat_hinge_obj.result() + self.assertAllClose(0.5, result, atol=1e-5) + + def test_weighted(self): + cat_hinge_obj = metrics.CategoricalHinge() + self.evaluate( + tf.compat.v1.variables_initializer(cat_hinge_obj.variables) + ) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(0.5, self.evaluate(result), atol=1e-5) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/iou_metrics.py b/keras/metrics/iou_metrics.py new file mode 100644 index 000000000000..377ef8858f96 --- /dev/null +++ b/keras/metrics/iou_metrics.py @@ -0,0 +1,759 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""IoU metrics.""" + +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.dtensor import utils as dtensor_utils +from keras.metrics import base_metric + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +class _IoUBase(base_metric.Metric): + """Computes the confusion matrix for Intersection-Over-Union metrics. + + Intersection-Over-Union is a common evaluation metric for semantic image + segmentation. + + For an individual class, the IoU metric is defined as follows: + + ``` + iou = true_positives / (true_positives + false_positives + false_negatives) + ``` + + From IoUs of individual classes, the MeanIoU can be computed as the mean of + the individual IoUs. + + To compute IoUs, the predictions are accumulated in a confusion matrix, + weighted by `sample_weight` and the metric is then calculated from it. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + num_classes: The possible number of labels the prediction task can have. + This value must be provided, since a confusion matrix of size + `(num_classes, num_classes)` will be allocated. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + ignore_class: Optional integer. The ID of a class to be ignored during + metric computation. This is useful, for example, in segmentation + problems featuring a "void" class (commonly -1 or 255) in segmentation + maps. By default (`ignore_class=None`), all classes are considered. + sparse_y_true: Whether labels are encoded using integers or + dense floating point vectors. If `False`, the `tf.argmax` function + will be used to determine each sample's most likely associated label. + sparse_y_pred: Whether predictions are encoded using integers or + dense floating point vectors. If `False`, the `tf.argmax` function + will be used to determine each sample's most likely associated label. + axis: (Optional) -1 is the dimension containing the logits. + Defaults to `-1`. + """ + + def __init__( + self, + num_classes: int, + name: Optional[str] = None, + dtype: Optional[Union[str, tf.dtypes.DType]] = None, + ignore_class: Optional[int] = None, + sparse_y_true: bool = True, + sparse_y_pred: bool = True, + axis: int = -1, + ): + super().__init__(name=name, dtype=dtype) + self.num_classes = num_classes + self.ignore_class = ignore_class + self.sparse_y_true = sparse_y_true + self.sparse_y_pred = sparse_y_pred + self.axis = axis + + # Variable to accumulate the predictions in the confusion matrix. + self.total_cm = self.add_weight( + "total_confusion_matrix", + shape=(num_classes, num_classes), + initializer="zeros", + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates the confusion matrix statistics. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. 
+ sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. + """ + + if not self.sparse_y_true: + y_true = tf.argmax(y_true, axis=self.axis) + if not self.sparse_y_pred: + y_pred = tf.argmax(y_pred, axis=self.axis) + + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + + # Flatten the input if its rank > 1. + if y_pred.shape.ndims > 1: + y_pred = tf.reshape(y_pred, [-1]) + + if y_true.shape.ndims > 1: + y_true = tf.reshape(y_true, [-1]) + + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, self._dtype) + if sample_weight.shape.ndims > 1: + sample_weight = tf.reshape(sample_weight, [-1]) + + if self.ignore_class is not None: + ignore_class = tf.cast(self.ignore_class, y_true.dtype) + valid_mask = tf.not_equal(y_true, ignore_class) + y_true = y_true[valid_mask] + y_pred = y_pred[valid_mask] + if sample_weight is not None: + sample_weight = sample_weight[valid_mask] + + # Accumulate the prediction to current confusion matrix. + current_cm = tf.math.confusion_matrix( + y_true, + y_pred, + self.num_classes, + weights=sample_weight, + dtype=self._dtype, + ) + return self.total_cm.assign_add(current_cm) + + def reset_state(self): + backend.set_value( + self.total_cm, np.zeros((self.num_classes, self.num_classes)) + ) + + +@keras_export("keras.metrics.IoU") +class IoU(_IoUBase): + """Computes the Intersection-Over-Union metric for specific target classes. + + General definition and computation: + + Intersection-Over-Union is a common evaluation metric for semantic image + segmentation. + + For an individual class, the IoU metric is defined as follows: + + ``` + iou = true_positives / (true_positives + false_positives + false_negatives) + ``` + + To compute IoUs, the predictions are accumulated in a confusion matrix, + weighted by `sample_weight` and the metric is then calculated from it. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Note, this class first computes IoUs for all individual classes, then + returns the mean of IoUs for the classes that are specified by + `target_class_ids`. If `target_class_ids` has only one id value, the IoU of + that specific class is returned. + + Args: + num_classes: The possible number of labels the prediction task can have. + A confusion matrix of dimension = [num_classes, num_classes] will be + allocated to accumulate predictions from which the metric is calculated. + target_class_ids: A tuple or list of target class ids for which the metric + is returned. To compute IoU for a specific class, a list (or tuple) of a + single id value should be provided. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + ignore_class: Optional integer. The ID of a class to be ignored during + metric computation. This is useful, for example, in segmentation + problems featuring a "void" class (commonly -1 or 255) in segmentation + maps. By default (`ignore_class=None`), all classes are considered. + sparse_y_true: Whether labels are encoded using integers or + dense floating point vectors. If `False`, the `tf.argmax` function + will be used to determine each sample's most likely associated label. + sparse_y_pred: Whether predictions are encoded using integers or + dense floating point vectors. 
If `False`, the `tf.argmax` function + will be used to determine each sample's most likely associated label. + axis: (Optional) -1 is the dimension containing the logits. + Defaults to `-1`. + + Standalone usage: + + >>> # cm = [[1, 1], + >>> # [1, 1]] + >>> # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] + >>> # iou = true_positives / (sum_row + sum_col - true_positives)) + >>> # iou = [0.33, 0.33] + >>> m = tf.keras.metrics.IoU(num_classes=2, target_class_ids=[0]) + >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1]) + >>> m.result().numpy() + 0.33333334 + + >>> m.reset_state() + >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1], + ... sample_weight=[0.3, 0.3, 0.3, 0.1]) + >>> # cm = [[0.3, 0.3], + >>> # [0.3, 0.1]] + >>> # sum_row = [0.6, 0.4], sum_col = [0.6, 0.4], + >>> # true_positives = [0.3, 0.1] + >>> # iou = [0.33, 0.14] + >>> m.result().numpy() + 0.33333334 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.IoU(num_classes=2, target_class_ids=[0])]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + num_classes: int, + target_class_ids: Union[List[int], Tuple[int, ...]], + name: Optional[str] = None, + dtype: Optional[Union[str, tf.dtypes.DType]] = None, + ignore_class: Optional[int] = None, + sparse_y_true: bool = True, + sparse_y_pred: bool = True, + axis: int = -1, + ): + super().__init__( + name=name, + num_classes=num_classes, + ignore_class=ignore_class, + sparse_y_true=sparse_y_true, + sparse_y_pred=sparse_y_pred, + axis=axis, + dtype=dtype, + ) + if max(target_class_ids) >= num_classes: + raise ValueError( + f"Target class id {max(target_class_ids)} " + "is out of range, which is " + f"[{0}, {num_classes})." + ) + self.target_class_ids = list(target_class_ids) + + def result(self): + """Compute the intersection-over-union via the confusion matrix.""" + sum_over_row = tf.cast( + tf.reduce_sum(self.total_cm, axis=0), dtype=self._dtype + ) + sum_over_col = tf.cast( + tf.reduce_sum(self.total_cm, axis=1), dtype=self._dtype + ) + true_positives = tf.cast( + tf.linalg.tensor_diag_part(self.total_cm), dtype=self._dtype + ) + + # sum_over_row + sum_over_col = + # 2 * true_positives + false_positives + false_negatives. + denominator = sum_over_row + sum_over_col - true_positives + + # Only keep the target classes + true_positives = tf.gather(true_positives, self.target_class_ids) + denominator = tf.gather(denominator, self.target_class_ids) + + # If the denominator is 0, we need to ignore the class. + num_valid_entries = tf.reduce_sum( + tf.cast(tf.not_equal(denominator, 0), dtype=self._dtype) + ) + + iou = tf.math.divide_no_nan(true_positives, denominator) + + return tf.math.divide_no_nan( + tf.reduce_sum(iou, name="mean_iou"), num_valid_entries + ) + + def get_config(self): + config = { + "num_classes": self.num_classes, + "target_class_ids": self.target_class_ids, + "ignore_class": self.ignore_class, + "sparse_y_true": self.sparse_y_true, + "sparse_y_pred": self.sparse_y_pred, + "axis": self.axis, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.BinaryIoU") +class BinaryIoU(IoU): + """Computes the Intersection-Over-Union metric for class 0 and/or 1. + + General definition and computation: + + Intersection-Over-Union is a common evaluation metric for semantic image + segmentation. 
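Before the formula is restated below, a short editorial sketch of the confusion-matrix bookkeeping these IoU classes share; it reuses the 2x2 matrix from the `IoU` docstring above and is an illustration, not the class implementation:

```python
import numpy as np

def iou_from_cm(cm):
    # Rows are true classes, columns are predictions (as in
    # tf.math.confusion_matrix), so the diagonal holds TP, column sums
    # add FP, and row sums add FN.
    cm = np.asarray(cm, dtype=float)
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    denom = tp + fp + fn
    return np.where(denom > 0, tp / np.maximum(denom, 1e-12), 0.0)

print(iou_from_cm([[1, 1], [1, 1]]))  # -> [0.33333333 0.33333333]
```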
+ + For an individual class, the IoU metric is defined as follows: + + ``` + iou = true_positives / (true_positives + false_positives + false_negatives) + ``` + + To compute IoUs, the predictions are accumulated in a confusion matrix, + weighted by `sample_weight` and the metric is then calculated from it. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + This class can be used to compute IoUs for a binary classification task + where the predictions are provided as logits. First a `threshold` is applied + to the predicted values such that those that are below the `threshold` are + converted to class 0 and those that are above the `threshold` are converted + to class 1. + + IoUs for classes 0 and 1 are then computed, the mean of IoUs for the classes + that are specified by `target_class_ids` is returned. + + Note: with `threshold=0`, this metric has the same behavior as `IoU`. + + Args: + target_class_ids: A tuple or list of target class ids for which the metric + is returned. Options are `[0]`, `[1]`, or `[0, 1]`. With `[0]` (or + `[1]`), the IoU metric for class 0 (or class 1, respectively) is + returned. With `[0, 1]`, the mean of IoUs for the two classes is + returned. + threshold: A threshold that applies to the prediction logits to convert + them to either predicted class 0 if the logit is below `threshold` or + predicted class 1 if the logit is above `threshold`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.3) + >>> m.update_state([0, 1, 0, 1], [0.1, 0.2, 0.4, 0.7]) + >>> m.result().numpy() + 0.33333334 + + >>> m.reset_state() + >>> m.update_state([0, 1, 0, 1], [0.1, 0.2, 0.4, 0.7], + ... sample_weight=[0.2, 0.3, 0.4, 0.1]) + >>> # cm = [[0.2, 0.4], + >>> # [0.3, 0.1]] + >>> # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], + >>> # true_positives = [0.2, 0.1] + >>> # iou = [0.222, 0.125] + >>> m.result().numpy() + 0.17361112 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.BinaryIoU(target_class_ids=[0], threshold=0.5)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + target_class_ids: Union[List[int], Tuple[int, ...]] = (0, 1), + threshold=0.5, + name=None, + dtype=None, + ): + + super().__init__( + num_classes=2, + target_class_ids=target_class_ids, + name=name, + dtype=dtype, + ) + self.threshold = threshold + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates the confusion matrix statistics. + + Before the confusion matrix is updated, the predicted values are + thresholded to be: + 0 for values that are smaller than the `threshold` + 1 for values that are larger or equal to the `threshold` + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. 
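To make the thresholding step described above concrete, a brief illustration using the `threshold=0.3` value from the class docstring example (editorial, not part of the diff):

```python
import tensorflow as tf

# Predictions below the threshold map to class 0, the rest to class 1.
y_pred = tf.constant([0.1, 0.2, 0.4, 0.7])
print(tf.cast(y_pred >= 0.3, tf.float32).numpy())  # [0. 0. 1. 1.]
```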
+ """ + y_pred = tf.cast(y_pred, self._dtype) + y_pred = tf.cast(y_pred >= self.threshold, self._dtype) + return super().update_state(y_true, y_pred, sample_weight) + + def get_config(self): + return { + "target_class_ids": self.target_class_ids, + "threshold": self.threshold, + "name": self.name, + "dtype": self._dtype, + } + + +@keras_export("keras.metrics.MeanIoU") +class MeanIoU(IoU): + """Computes the mean Intersection-Over-Union metric. + + General definition and computation: + + Intersection-Over-Union is a common evaluation metric for semantic image + segmentation. + + For an individual class, the IoU metric is defined as follows: + + ``` + iou = true_positives / (true_positives + false_positives + false_negatives) + ``` + + To compute IoUs, the predictions are accumulated in a confusion matrix, + weighted by `sample_weight` and the metric is then calculated from it. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Note that this class first computes IoUs for all individual classes, then + returns the mean of these values. + + Args: + num_classes: The possible number of labels the prediction task can have. + This value must be provided, since a confusion matrix of dimension = + [num_classes, num_classes] will be allocated. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + ignore_class: Optional integer. The ID of a class to be ignored during + metric computation. This is useful, for example, in segmentation + problems featuring a "void" class (commonly -1 or 255) in segmentation + maps. By default (`ignore_class=None`), all classes are considered. + sparse_y_true: Whether labels are encoded using integers or + dense floating point vectors. If `False`, the `tf.argmax` function + will be used to determine each sample's most likely associated label. + sparse_y_pred: Whether predictions are encoded using integers or + dense floating point vectors. If `False`, the `tf.argmax` function + will be used to determine each sample's most likely associated label. + axis: (Optional) The dimension containing the logits. Defaults to `-1`. + + Standalone usage: + + >>> # cm = [[1, 1], + >>> # [1, 1]] + >>> # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] + >>> # iou = true_positives / (sum_row + sum_col - true_positives)) + >>> # result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 = 0.33 + >>> m = tf.keras.metrics.MeanIoU(num_classes=2) + >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1]) + >>> m.result().numpy() + 0.33333334 + + >>> m.reset_state() + >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1], + ... 
+    >>> m.result().numpy()
+    0.23809525
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(
+        optimizer='sgd',
+        loss='mse',
+        metrics=[tf.keras.metrics.MeanIoU(num_classes=2)])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(
+        self,
+        num_classes: int,
+        name: Optional[str] = None,
+        dtype: Optional[Union[str, tf.dtypes.DType]] = None,
+        ignore_class: Optional[int] = None,
+        sparse_y_true: bool = True,
+        sparse_y_pred: bool = True,
+        axis: int = -1,
+    ):
+        target_class_ids = list(range(num_classes))
+        super().__init__(
+            name=name,
+            num_classes=num_classes,
+            target_class_ids=target_class_ids,
+            axis=axis,
+            dtype=dtype,
+            ignore_class=ignore_class,
+            sparse_y_true=sparse_y_true,
+            sparse_y_pred=sparse_y_pred,
+        )
+
+    def get_config(self):
+        return {
+            "num_classes": self.num_classes,
+            "name": self.name,
+            "dtype": self._dtype,
+            "ignore_class": self.ignore_class,
+            "sparse_y_true": self.sparse_y_true,
+            "sparse_y_pred": self.sparse_y_pred,
+            "axis": self.axis,
+        }
+
+
+@keras_export("keras.metrics.OneHotIoU")
+class OneHotIoU(IoU):
+    """Computes the Intersection-Over-Union metric for one-hot encoded labels.
+
+    General definition and computation:
+
+    Intersection-Over-Union is a common evaluation metric for semantic image
+    segmentation.
+
+    For an individual class, the IoU metric is defined as follows:
+
+    ```
+    iou = true_positives / (true_positives + false_positives + false_negatives)
+    ```
+
+    To compute IoUs, the predictions are accumulated in a confusion matrix,
+    weighted by `sample_weight`, and the metric is then calculated from it.
+
+    If `sample_weight` is `None`, weights default to 1.
+    Use `sample_weight` of 0 to mask values.
+
+    This class can be used to compute IoU for multi-class classification tasks
+    where the labels are one-hot encoded (the last axis should have one
+    dimension per class). Note that the predictions should also have the same
+    shape. To compute the IoU, first the labels and predictions are converted
+    back into integer format by taking the argmax over the class axis. Then
+    the same computation steps as for the base `IoU` class apply.
+
+    Note: if there is only one channel in the labels and predictions, this
+    class is the same as class `IoU`. In this case, use `IoU` instead.
+
+    Also, make sure that `num_classes` is equal to the number of classes in
+    the data, to avoid a "labels out of bound" error when the confusion matrix
+    is computed.
+
+    Args:
+      num_classes: The possible number of labels the prediction task can have.
+        A confusion matrix of shape `(num_classes, num_classes)` will be
+        allocated to accumulate predictions from which the metric is
+        calculated.
+      target_class_ids: A tuple or list of target class ids for which the
+        metric is returned. To compute IoU for a specific class, a list (or
+        tuple) of a single id value should be provided.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      ignore_class: Optional integer. The ID of a class to be ignored during
+        metric computation. This is useful, for example, in segmentation
+        problems featuring a "void" class (commonly -1 or 255) in segmentation
+        maps. By default (`ignore_class=None`), all classes are considered.
+      sparse_y_pred: Whether predictions are encoded using natural numbers or
+        probability distribution vectors. If `False`, the `tf.argmax` function
+        will be used to determine each sample's most likely associated label.
+      axis: (Optional) The dimension containing the logits. Defaults to `-1`.
+
+    Standalone usage:
+
+    >>> y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]])
+    >>> y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1],
+    ...                       [0.1, 0.4, 0.5]])
+    >>> sample_weight = [0.1, 0.2, 0.3, 0.4]
+    >>> m = tf.keras.metrics.OneHotIoU(num_classes=3, target_class_ids=[0, 2])
+    >>> m.update_state(
+    ...     y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
+    >>> # cm = [[0, 0, 0.2+0.4],
+    >>> #       [0.3, 0, 0],
+    >>> #       [0, 0, 0.1]]
+    >>> # sum_row = [0.3, 0, 0.7], sum_col = [0.6, 0.3, 0.1]
+    >>> # true_positives = [0, 0, 0.1]
+    >>> # single_iou = true_positives / (sum_row + sum_col - true_positives)
+    >>> # mean_iou = (0 / (0.3 + 0.6 - 0) + 0.1 / (0.7 + 0.1 - 0.1)) / 2
+    >>> m.result().numpy()
+    0.071
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(
+        optimizer='sgd',
+        loss='mse',
+        metrics=[tf.keras.metrics.OneHotIoU(
+            num_classes=3, target_class_ids=[1])])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(
+        self,
+        num_classes: int,
+        target_class_ids: Union[List[int], Tuple[int, ...]],
+        name=None,
+        dtype=None,
+        ignore_class: Optional[int] = None,
+        sparse_y_pred: bool = False,
+        axis: int = -1,
+    ):
+        super().__init__(
+            num_classes=num_classes,
+            target_class_ids=target_class_ids,
+            name=name,
+            dtype=dtype,
+            ignore_class=ignore_class,
+            sparse_y_true=False,
+            sparse_y_pred=sparse_y_pred,
+            axis=axis,
+        )
+
+    def get_config(self):
+        return {
+            "num_classes": self.num_classes,
+            "target_class_ids": self.target_class_ids,
+            "name": self.name,
+            "dtype": self._dtype,
+            "ignore_class": self.ignore_class,
+            "sparse_y_pred": self.sparse_y_pred,
+            "axis": self.axis,
+        }
+
+
+@keras_export("keras.metrics.OneHotMeanIoU")
+class OneHotMeanIoU(MeanIoU):
+    """Computes mean Intersection-Over-Union metric for one-hot encoded labels.
+
+    General definition and computation:
+
+    Intersection-Over-Union is a common evaluation metric for semantic image
+    segmentation.
+
+    For an individual class, the IoU metric is defined as follows:
+
+    ```
+    iou = true_positives / (true_positives + false_positives + false_negatives)
+    ```
+
+    To compute IoUs, the predictions are accumulated in a confusion matrix,
+    weighted by `sample_weight`, and the metric is then calculated from it.
+
+    If `sample_weight` is `None`, weights default to 1.
+    Use `sample_weight` of 0 to mask values.
+
+    This class can be used to compute the mean IoU for multi-class
+    classification tasks where the labels are one-hot encoded (the last axis
+    should have one dimension per class). Note that the predictions should
+    also have the same shape. To compute the mean IoU, first the labels and
+    predictions are converted back into integer format by taking the argmax
+    over the class axis. Then the same computation steps as for the base
+    `MeanIoU` class apply.
+
+    Note: if there is only one channel in the labels and predictions, this
+    class is the same as class `MeanIoU`. In this case, use `MeanIoU` instead.
+
+    Also, make sure that `num_classes` is equal to the number of classes in
+    the data, to avoid a "labels out of bound" error when the confusion matrix
+    is computed.
+
+    Args:
+      num_classes: The possible number of labels the prediction task can have.
+        A confusion matrix of shape `(num_classes, num_classes)` will be
+        allocated to accumulate predictions from which the metric is
+        calculated.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      ignore_class: Optional integer. The ID of a class to be ignored during
+        metric computation. This is useful, for example, in segmentation
+        problems featuring a "void" class (commonly -1 or 255) in segmentation
+        maps. By default (`ignore_class=None`), all classes are considered.
+      sparse_y_pred: Whether predictions are encoded using natural numbers or
+        probability distribution vectors. If `False`, the `tf.argmax` function
+        will be used to determine each sample's most likely associated label.
+      axis: (Optional) The dimension containing the logits. Defaults to `-1`.
+
+    Standalone usage:
+
+    >>> y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]])
+    >>> y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1],
+    ...                       [0.1, 0.4, 0.5]])
+    >>> sample_weight = [0.1, 0.2, 0.3, 0.4]
+    >>> m = tf.keras.metrics.OneHotMeanIoU(num_classes=3)
+    >>> m.update_state(
+    ...     y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
+    >>> # cm = [[0, 0, 0.2+0.4],
+    >>> #       [0.3, 0, 0],
+    >>> #       [0, 0, 0.1]]
+    >>> # sum_row = [0.3, 0, 0.7], sum_col = [0.6, 0.3, 0.1]
+    >>> # true_positives = [0, 0, 0.1]
+    >>> # single_iou = true_positives / (sum_row + sum_col - true_positives)
+    >>> # mean_iou = (0 + 0 + 0.1 / (0.7 + 0.1 - 0.1)) / 3
+    >>> m.result().numpy()
+    0.048
+
+    Usage with `compile()` API:
+
+    ```python
+    model.compile(
+        optimizer='sgd',
+        loss='mse',
+        metrics=[tf.keras.metrics.OneHotMeanIoU(num_classes=3)])
+    ```
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(
+        self,
+        num_classes: int,
+        name: Optional[str] = None,
+        dtype: Optional[Union[str, tf.dtypes.DType]] = None,
+        ignore_class: Optional[int] = None,
+        sparse_y_pred: bool = False,
+        axis: int = -1,
+    ):
+        super().__init__(
+            num_classes=num_classes,
+            axis=axis,
+            name=name,
+            dtype=dtype,
+            ignore_class=ignore_class,
+            sparse_y_true=False,
+            sparse_y_pred=sparse_y_pred,
+        )
+
+    def get_config(self):
+        return {
+            "num_classes": self.num_classes,
+            "name": self.name,
+            "dtype": self._dtype,
+            "ignore_class": self.ignore_class,
+            "sparse_y_pred": self.sparse_y_pred,
+            "axis": self.axis,
+        }
diff --git a/keras/metrics/iou_metrics_test.py b/keras/metrics/iou_metrics_test.py
new file mode 100644
index 000000000000..a642abeeeffe
--- /dev/null
+++ b/keras/metrics/iou_metrics_test.py
@@ -0,0 +1,475 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for Keras metrics.""" + +import tensorflow.compat.v2 as tf + +from keras import metrics +from keras.testing_infra import test_combinations + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class IoUTest(tf.test.TestCase): + def test_config(self): + obj = metrics.IoU( + num_classes=2, target_class_ids=[1, 0], name="iou_class_1_0" + ) + self.assertEqual(obj.name, "iou_class_1_0") + self.assertEqual(obj.num_classes, 2) + self.assertEqual(obj.target_class_ids, [1, 0]) + + obj2 = metrics.IoU.from_config(obj.get_config()) + self.assertEqual(obj2.name, "iou_class_1_0") + self.assertEqual(obj2.num_classes, 2) + self.assertEqual(obj2.target_class_ids, [1, 0]) + + def test_unweighted(self): + y_pred = [0, 1, 0, 1] + y_true = [0, 0, 1, 1] + + obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + + result = obj(y_true, y_pred) + + # cm = [[1, 1], + # [1, 1]] + # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_weighted(self): + y_pred = tf.constant([0, 1, 0, 1], dtype=tf.float32) + y_true = tf.constant([0, 0, 1, 1]) + sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1]) + + obj = metrics.IoU(num_classes=2, target_class_ids=[1, 0]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + + result = obj(y_true, y_pred, sample_weight=sample_weight) + + # cm = [[0.2, 0.3], + # [0.4, 0.1]] + # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, + # 0.1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 0.1 / (0.4 + 0.5 - 0.1) + 0.2 / (0.6 + 0.5 - 0.2) + ) / 2 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_multi_dim_input(self): + y_pred = tf.constant([[0, 1], [0, 1]], dtype=tf.float32) + y_true = tf.constant([[0, 0], [1, 1]]) + sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]]) + + obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + + result = obj(y_true, y_pred, sample_weight=sample_weight) + + # cm = [[0.2, 0.3], + # [0.4, 0.1]] + # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, + # 0.1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1) + ) / 2 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_zero_valid_entries(self): + obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + self.assertAllClose(self.evaluate(obj.result()), 0, atol=1e-3) + + def test_zero_and_non_zero_entries(self): + y_pred = tf.constant([1], dtype=tf.float32) + y_true = tf.constant([1]) + + obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + result = obj(y_true, y_pred) + + # cm = [[0, 0], + # [0, 1]] + # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = (1 / (1 + 1 - 1)) / 1 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + 
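[Editor's note] The expected values in these tests are worked out by hand from the confusion-matrix arithmetic spelled out in the comments. As a quick cross-check, here is a minimal NumPy sketch (editorial, not part of the patch; `iou_from_confusion_matrix` is a name of our choosing) that reproduces that arithmetic:

```python
import numpy as np


def iou_from_confusion_matrix(cm, target_class_ids):
    """Mean IoU over `target_class_ids` for a confusion matrix whose rows
    are true classes and whose columns are predicted classes, following
    iou = true_positives / (sum_row + sum_col - true_positives)."""
    cm = np.asarray(cm, dtype=np.float64)
    sum_row = cm.sum(axis=0)  # total weight predicted as each class
    sum_col = cm.sum(axis=1)  # total weight truly belonging to each class
    true_positives = np.diag(cm)
    denominator = sum_row + sum_col - true_positives
    iou = np.divide(
        true_positives,
        denominator,
        out=np.zeros_like(true_positives),
        where=denominator != 0,
    )
    return iou[list(target_class_ids)].mean()


# Mirrors `test_weighted` above, where cm = [[0.2, 0.3], [0.4, 0.1]]:
print(iou_from_confusion_matrix([[0.2, 0.3], [0.4, 0.1]], [0, 1]))
# ~0.17361, i.e. (0.2 / 0.9 + 0.1 / 0.8) / 2
```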
+@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"]))
+class BinaryIoUTest(tf.test.TestCase):
+    def test_config(self):
+        obj = metrics.BinaryIoU(
+            target_class_ids=[1, 0], threshold=0.1, name="iou_class_1_0"
+        )
+        self.assertEqual(obj.name, "iou_class_1_0")
+        self.assertAlmostEqual(obj.threshold, 0.1)
+        self.assertEqual(obj.target_class_ids, [1, 0])
+
+        obj2 = metrics.BinaryIoU.from_config(obj.get_config())
+        self.assertEqual(obj2.name, "iou_class_1_0")
+        self.assertAlmostEqual(obj2.threshold, 0.1)
+        self.assertEqual(obj2.target_class_ids, [1, 0])
+
+    def test_different_thresholds_weighted(self):
+        y_true = [0, 1, 0, 1]
+        y_pred = [0.1, 0.2, 0.4, 0.7]
+
+        sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1])
+        # with threshold = 0.3, y_pred will be converted to [0, 0, 1, 1]
+        # cm = [[0.2, 0.4],
+        #       [0.3, 0.1]]
+        # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2,
+        # 0.1]
+        # iou = true_positives / (sum_row + sum_col - true_positives)
+        expected_result = (
+            0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)
+        ) / 2
+        obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.3)
+        self.evaluate(tf.compat.v1.variables_initializer(obj.variables))
+        result = obj(y_true, y_pred, sample_weight=sample_weight)
+        self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+        sample_weight = tf.constant([0.1, 0.2, 0.4, 0.3])
+        # with threshold = 0.5, y_pred will be converted to [0, 0, 0, 1]
+        # cm = [[0.1+0.4, 0],
+        #       [0.2, 0.3]]
+        # sum_row = [0.5, 0.5], sum_col = [0.7, 0.3], true_positives = [0.5,
+        # 0.3]
+        # iou = true_positives / (sum_row + sum_col - true_positives)
+        expected_result = (
+            0.5 / (0.5 + 0.7 - 0.5) + 0.3 / (0.5 + 0.3 - 0.3)
+        ) / 2
+        obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.5)
+        self.evaluate(tf.compat.v1.variables_initializer(obj.variables))
+        result = obj(y_true, y_pred, sample_weight=sample_weight)
+        self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+    def test_different_thresholds_unweighted(self):
+        y_true = [0, 1, 0, 1]
+        y_pred = [0.1, 0.2, 0.4, 0.7]
+
+        # with threshold = 0.3, y_pred will be converted to [0, 0, 1, 1]
+        # cm = [[1, 1],
+        #       [1, 1]]
+        # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
+        # iou = true_positives / (sum_row + sum_col - true_positives)
+        expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2
+        obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.3)
+        self.evaluate(tf.compat.v1.variables_initializer(obj.variables))
+        result = obj(y_true, y_pred)
+        self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+        # with threshold = 0.5, y_pred will be converted to [0, 0, 0, 1]
+        # cm = [[2, 0],
+        #       [1, 1]]
+        # sum_row = [2, 2], sum_col = [3, 1], true_positives = [2, 1]
+        # iou = true_positives / (sum_row + sum_col - true_positives)
+        expected_result = (2 / (2 + 3 - 2) + 1 / (2 + 1 - 1)) / 2
+        obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.5)
+        self.evaluate(tf.compat.v1.variables_initializer(obj.variables))
+        result = obj(y_true, y_pred)
+        self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+    def test_multi_dim_input(self):
+        y_true = tf.constant([[0, 1], [0, 1]], dtype=tf.float32)
+        y_pred = tf.constant([[0.1, 0.7], [0.9, 0.3]])
+        threshold = 0.4  # y_pred will become [[0, 1], [1, 0]]
+        sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]])
+        # cm = [[0.2, 0.4],
+        #       [0.1, 0.3]]
+        # sum_row = [0.6, 0.4], sum_col = [0.3, 0.7], true_positives = [0.2,
+        # 0.3]
+        # iou = true_positives /
(sum_row + sum_col - true_positives)) + expected_result = ( + 0.2 / (0.6 + 0.3 - 0.2) + 0.3 / (0.4 + 0.7 - 0.3) + ) / 2 + obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=threshold) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + result = obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_zero_valid_entries(self): + obj = metrics.BinaryIoU(target_class_ids=[0, 1]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + self.assertAllClose(self.evaluate(obj.result()), 0, atol=1e-3) + + def test_zero_and_non_zero_entries(self): + y_pred = tf.constant([0.6], dtype=tf.float32) + threshold = 0.5 + y_true = tf.constant([1]) + + obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=threshold) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + result = obj(y_true, y_pred) + + # cm = [[0, 0], + # [0, 1]] + # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = 1 / (1 + 1 - 1) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MeanIoUTest(tf.test.TestCase): + def test_config(self): + m_obj = metrics.MeanIoU(num_classes=2, name="mean_iou") + self.assertEqual(m_obj.name, "mean_iou") + self.assertEqual(m_obj.num_classes, 2) + + m_obj2 = metrics.MeanIoU.from_config(m_obj.get_config()) + self.assertEqual(m_obj2.name, "mean_iou") + self.assertEqual(m_obj2.num_classes, 2) + + def test_unweighted(self): + y_pred = [0, 1, 0, 1] + y_true = [0, 0, 1, 1] + + m_obj = metrics.MeanIoU(num_classes=2) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + + result = m_obj(y_true, y_pred) + + # cm = [[1, 1], + # [1, 1]] + # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_unweighted_ignore_class_255(self): + y_pred = [0, 1, 1, 1] + y_true = [0, 1, 2, 255] + + m_obj = metrics.MeanIoU(num_classes=3, ignore_class=255) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + + result = m_obj(y_true, y_pred) + + # cm = [[1, 0, 0], + # [0, 1, 0], + # [0, 1, 0]] + # sum_row = [1, 1, 1], sum_col = [1, 2, 0], true_positives = [1, 1, 0] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 1 / (1 + 1 - 1) + 1 / (2 + 1 - 1) + 0 / (0 + 1 - 0) + ) / 3 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_unweighted_ignore_class_1(self): + y_pred = [0, 1, 1, 1] + y_true = [0, 1, 2, -1] + + m_obj = metrics.MeanIoU(num_classes=3, ignore_class=-1) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + + result = m_obj(y_true, y_pred) + + # cm = [[1, 0, 0], + # [0, 1, 0], + # [0, 1, 0]] + # sum_row = [1, 1, 1], sum_col = [1, 2, 0], true_positives = [1, 1, 0] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 1 / (1 + 1 - 1) + 1 / (2 + 1 - 1) + 0 / (0 + 1 - 0) + ) / 3 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_weighted(self): + y_pred = tf.constant([0, 1, 0, 1], dtype=tf.float32) + y_true = tf.constant([0, 0, 1, 1]) + sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1]) + + m_obj 
= metrics.MeanIoU(num_classes=2) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + + result = m_obj(y_true, y_pred, sample_weight=sample_weight) + + # cm = [[0.2, 0.3], + # [0.4, 0.1]] + # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, + # 0.1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1) + ) / 2 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_weighted_ignore_class_1(self): + y_pred = tf.constant([0, 1, 0, 1], dtype=tf.float32) + y_true = tf.constant([0, 0, 1, -1]) + sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1]) + + m_obj = metrics.MeanIoU(num_classes=2, ignore_class=-1) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + + result = m_obj(y_true, y_pred, sample_weight=sample_weight) + + # cm = [[0.2, 0.3], + # [0.4, 0.0]] + # sum_row = [0.6, 0.3], sum_col = [0.5, 0.4], true_positives = [0.2, + # 0.0] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 0.2 / (0.6 + 0.5 - 0.2) + 0.0 / (0.3 + 0.4 - 0.0) + ) / 2 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_multi_dim_input(self): + y_pred = tf.constant([[0, 1], [0, 1]], dtype=tf.float32) + y_true = tf.constant([[0, 0], [1, 1]]) + sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]]) + + m_obj = metrics.MeanIoU(num_classes=2) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + + result = m_obj(y_true, y_pred, sample_weight=sample_weight) + + # cm = [[0.2, 0.3], + # [0.4, 0.1]] + # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, + # 0.1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1) + ) / 2 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_zero_valid_entries(self): + m_obj = metrics.MeanIoU(num_classes=2) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + self.assertAllClose(self.evaluate(m_obj.result()), 0, atol=1e-3) + + def test_zero_and_non_zero_entries(self): + y_pred = tf.constant([1], dtype=tf.float32) + y_true = tf.constant([1]) + + m_obj = metrics.MeanIoU(num_classes=2) + self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) + result = m_obj(y_true, y_pred) + + # cm = [[0, 0], + # [0, 1]] + # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = (0 + 1 / (1 + 1 - 1)) / 1 + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class OneHotIoUTest(tf.test.TestCase): + def test_unweighted(self): + y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) + # y_true will be converted to [2, 0, 1, 0] + y_pred = tf.constant( + [[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], [0.1, 0.4, 0.5]] + ) + # y_pred will be converted to [2, 2, 0, 2] + # cm = [[0, 0, 2], + # [1, 0, 0], + # [0, 0, 1] + # sum_row = [1, 0, 3], sum_col = [2, 1, 1], true_positives = [0, 0, 1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = (0 / (1 + 2 - 0) + 1 / (3 + 1 - 1)) / 2 + obj = metrics.OneHotIoU(num_classes=3, target_class_ids=[0, 2]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + result = obj(y_true, y_pred) + 
self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_weighted(self): + y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) + # y_true will be converted to [2, 0, 1, 0] + y_pred = tf.constant( + [[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], [0.1, 0.4, 0.5]] + ) + # y_pred will be converted to [2, 2, 0, 2] + sample_weight = [0.1, 0.2, 0.3, 0.4] + # cm = [[0, 0, 0.2+0.4], + # [0.3, 0, 0], + # [0, 0, 0.1]] + # sum_row = [0.3, 0, 0.7], sum_col = [0.6, 0.3, 0.1] + # true_positives = [0, 0, 0.1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = (0 / (0.3 + 0.6 - 0) + 0.1 / (0.7 + 0.1 - 0.1)) / 2 + obj = metrics.OneHotIoU(num_classes=3, target_class_ids=[0, 2]) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + result = obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class OneHotMeanIoUTest(tf.test.TestCase): + def test_unweighted(self): + y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) + # y_true will be converted to [2, 0, 1, 0] + y_pred = tf.constant( + [[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], [0.1, 0.4, 0.5]] + ) + # y_pred will be converted to [2, 2, 0, 2] + # cm = [[0, 0, 2], + # [1, 0, 0], + # [0, 0, 1] + # sum_row = [1, 0, 3], sum_col = [2, 1, 1], true_positives = [0, 0, 1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = (0 + 0 + 1 / (3 + 1 - 1)) / 3 + obj = metrics.OneHotMeanIoU(num_classes=3) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + result = obj(y_true, y_pred) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + def test_weighted(self): + y_true = tf.constant( + [ + [0, 0, 1], + [1, 0, 0], + [0, 1, 0], + [1, 0, 0], + [1, 0, 0], + ] + ) + # y_true will be converted to [2, 0, 1, 0, 0] + y_pred = tf.constant( + [ + [0.2, 0.3, 0.5], + [0.1, 0.2, 0.7], + [0.5, 0.3, 0.1], + [0.1, 0.4, 0.5], + [0.6, 0.2, 0.2], + ] + ) + # y_pred will be converted to [2, 2, 0, 2, 0] + sample_weight = [0.1, 0.2, 0.3, 0.3, 0.1] + # cm = [[0.1, 0, 0.2+0.3], + # [0.3, 0, 0], + # [0, 0, 0.1]] + # sum_row = [0.4, 0, 0.6], sum_col = [0.6, 0.3, 0.1] + # true_positives = [0.1, 0, 0.1] + # iou = true_positives / (sum_row + sum_col - true_positives)) + expected_result = ( + 0.1 / (0.4 + 0.6 - 0.1) + 0 + 0.1 / (0.6 + 0.1 - 0.1) + ) / 3 + obj = metrics.OneHotMeanIoU(num_classes=3) + self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) + result = obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/metrics.py b/keras/metrics/metrics.py deleted file mode 100644 index 18a114d28250..000000000000 --- a/keras/metrics/metrics.py +++ /dev/null @@ -1,3471 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=g-classes-have-attributes -# pylint: disable=g-doc-return-or-yield -"""Built-in metrics.""" - -import abc -from typing import List, Tuple, Union - -from keras import activations -from keras import backend -from keras.dtensor import utils as dtensor_utils -from keras.losses import binary_crossentropy -from keras.losses import categorical_crossentropy -from keras.losses import categorical_hinge -from keras.losses import hinge -from keras.losses import kullback_leibler_divergence -from keras.losses import logcosh -from keras.losses import mean_absolute_error -from keras.losses import mean_absolute_percentage_error -from keras.losses import mean_squared_error -from keras.losses import mean_squared_logarithmic_error -from keras.losses import poisson -from keras.losses import sparse_categorical_crossentropy -from keras.losses import squared_hinge -from keras.metrics import base_metric -from keras.utils import losses_utils -from keras.utils import metrics_utils -from keras.utils.generic_utils import to_list -from keras.utils.tf_utils import is_tensor_or_variable -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -@keras_export('keras.metrics.MeanRelativeError') -class MeanRelativeError(base_metric.Mean): - """Computes the mean relative error by normalizing with the given values. - - This metric creates two local variables, `total` and `count` that are used to - compute the mean relative error. This is weighted by `sample_weight`, and - it is ultimately returned as `mean_relative_error`: - an idempotent operation that simply divides `total` by `count`. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - normalizer: The normalizer values with same shape as predictions. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.MeanRelativeError(normalizer=[1, 3, 2, 3]) - >>> m.update_state([1, 3, 2, 3], [2, 4, 6, 8]) - - >>> # metric = mean(|y_pred - y_true| / normalizer) - >>> # = mean([1, 1, 4, 5] / [1, 3, 2, 3]) = mean([1, 1/3, 2, 5/3]) - >>> # = 5/4 = 1.25 - >>> m.result().numpy() - 1.25 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.MeanRelativeError(normalizer=[1, 3])]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, normalizer, name=None, dtype=None): - super().__init__(name=name, dtype=dtype) - normalizer = tf.cast(normalizer, self._dtype) - self.normalizer = normalizer - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates metric statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. 
- """ - y_true = tf.cast(y_true, self._dtype) - y_pred = tf.cast(y_pred, self._dtype) - [y_pred, y_true], sample_weight = \ - metrics_utils.ragged_assert_compatible_and_get_flat_values( - [y_pred, y_true], sample_weight) - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( - y_pred, y_true) - - y_pred, self.normalizer = losses_utils.remove_squeezable_dimensions( - y_pred, self.normalizer) - y_pred.shape.assert_is_compatible_with(y_true.shape) - relative_errors = tf.math.divide_no_nan( - tf.abs(y_true - y_pred), self.normalizer) - - return super().update_state( - relative_errors, sample_weight=sample_weight) - - def get_config(self): - n = self.normalizer - config = {'normalizer': backend.eval(n) if is_tensor_or_variable(n) else n} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.Accuracy') -class Accuracy(base_metric.MeanMetricWrapper): - """Calculates how often predictions equal labels. - - This metric creates two local variables, `total` and `count` that are used to - compute the frequency with which `y_pred` matches `y_true`. This frequency is - ultimately returned as `binary accuracy`: an idempotent operation that simply - divides `total` by `count`. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.Accuracy() - >>> m.update_state([[1], [2], [3], [4]], [[0], [2], [3], [4]]) - >>> m.result().numpy() - 0.75 - - >>> m.reset_state() - >>> m.update_state([[1], [2], [3], [4]], [[0], [2], [3], [4]], - ... sample_weight=[1, 1, 0, 0]) - >>> m.result().numpy() - 0.5 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.Accuracy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='accuracy', dtype=None): - super().__init__(accuracy, name, dtype=dtype) - - -@keras_export('keras.metrics.BinaryAccuracy') -class BinaryAccuracy(base_metric.MeanMetricWrapper): - """Calculates how often predictions match binary labels. - - This metric creates two local variables, `total` and `count` that are used to - compute the frequency with which `y_pred` matches `y_true`. This frequency is - ultimately returned as `binary accuracy`: an idempotent operation that simply - divides `total` by `count`. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - threshold: (Optional) Float representing the threshold for deciding - whether prediction values are 1 or 0. - - Standalone usage: - - >>> m = tf.keras.metrics.BinaryAccuracy() - >>> m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]]) - >>> m.result().numpy() - 0.75 - - >>> m.reset_state() - >>> m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]], - ... 
sample_weight=[1, 0, 0, 1]) - >>> m.result().numpy() - 0.5 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.BinaryAccuracy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='binary_accuracy', dtype=None, threshold=0.5): - super().__init__( - metrics_utils.binary_matches, name, dtype=dtype, threshold=threshold) - - -@keras_export('keras.metrics.CategoricalAccuracy') -class CategoricalAccuracy(base_metric.MeanMetricWrapper): - """Calculates how often predictions match one-hot labels. - - You can provide logits of classes as `y_pred`, since argmax of - logits and probabilities are same. - - This metric creates two local variables, `total` and `count` that are used to - compute the frequency with which `y_pred` matches `y_true`. This frequency is - ultimately returned as `categorical accuracy`: an idempotent operation that - simply divides `total` by `count`. - - `y_pred` and `y_true` should be passed in as vectors of probabilities, rather - than as labels. If necessary, use `tf.one_hot` to expand `y_true` as a vector. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.CategoricalAccuracy() - >>> m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], - ... [0.05, 0.95, 0]]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], - ... [0.05, 0.95, 0]], - ... sample_weight=[0.7, 0.3]) - >>> m.result().numpy() - 0.3 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.CategoricalAccuracy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='categorical_accuracy', dtype=None): - super().__init__( - lambda y_true, y_pred: metrics_utils.sparse_categorical_matches( # pylint: disable=g-long-lambda - tf.math.argmax(y_true, axis=-1), y_pred), - name, - dtype=dtype) - - -@keras_export('keras.metrics.SparseCategoricalAccuracy') -class SparseCategoricalAccuracy(base_metric.MeanMetricWrapper): - """Calculates how often predictions match integer labels. - - ```python - acc = np.dot(sample_weight, np.equal(y_true, np.argmax(y_pred, axis=1)) - ``` - - You can provide logits of classes as `y_pred`, since argmax of - logits and probabilities are same. - - This metric creates two local variables, `total` and `count` that are used to - compute the frequency with which `y_pred` matches `y_true`. This frequency is - ultimately returned as `sparse categorical accuracy`: an idempotent operation - that simply divides `total` by `count`. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.SparseCategoricalAccuracy() - >>> m.update_state([[2], [1]], [[0.1, 0.6, 0.3], [0.05, 0.95, 0]]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([[2], [1]], [[0.1, 0.6, 0.3], [0.05, 0.95, 0]], - ... 
sample_weight=[0.7, 0.3]) - >>> m.result().numpy() - 0.3 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='sparse_categorical_accuracy', dtype=None): - super().__init__( - metrics_utils.sparse_categorical_matches, name, dtype=dtype) - - -_SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING = """Accumulates metric statistics. - -For sparse categorical metrics, the shapes of `y_true` and `y_pred` are -different. - -Args: - y_true: Ground truth label values. shape = `[batch_size, d0, .. dN-1]` or - shape = `[batch_size, d0, .. dN-1, 1]`. - y_pred: The predicted probability values. shape = `[batch_size, d0, .. dN]`. - sample_weight: Optional `sample_weight` acts as a - coefficient for the metric. If a scalar is provided, then the metric is - simply scaled by the given value. If `sample_weight` is a tensor of size - `[batch_size]`, then the metric for each sample of the batch is rescaled - by the corresponding element in the `sample_weight` vector. If the shape - of `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be broadcasted - to this shape), then each metric element of `y_pred` is scaled by the - corresponding value of `sample_weight`. (Note on `dN-1`: all metric - functions reduce by 1 dimension, usually the last axis (-1)). - -Returns: - Update op. -""" - -SparseCategoricalAccuracy.update_state.__doc__ = _SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING - - -@keras_export('keras.metrics.TopKCategoricalAccuracy') -class TopKCategoricalAccuracy(base_metric.MeanMetricWrapper): - """Computes how often targets are in the top `K` predictions. - - Args: - k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.TopKCategoricalAccuracy(k=1) - >>> m.update_state([[0, 0, 1], [0, 1, 0]], - ... [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([[0, 0, 1], [0, 1, 0]], - ... [[0.1, 0.9, 0.8], [0.05, 0.95, 0]], - ... sample_weight=[0.7, 0.3]) - >>> m.result().numpy() - 0.3 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.TopKCategoricalAccuracy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, k=5, name='top_k_categorical_accuracy', dtype=None): - super().__init__( - lambda yt, yp, k: metrics_utils.sparse_top_k_categorical_matches( # pylint: disable=g-long-lambda - tf.math.argmax(yt, axis=-1), yp, k), - name, - dtype=dtype, - k=k) - - -@keras_export('keras.metrics.SparseTopKCategoricalAccuracy') -class SparseTopKCategoricalAccuracy(base_metric.MeanMetricWrapper): - """Computes how often integer targets are in the top `K` predictions. - - Args: - k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1) - >>> m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]], - ... 
sample_weight=[0.7, 0.3]) - >>> m.result().numpy() - 0.3 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, k=5, name='sparse_top_k_categorical_accuracy', dtype=None): - super().__init__( - metrics_utils.sparse_top_k_categorical_matches, name, dtype=dtype, k=k) - - -SparseTopKCategoricalAccuracy.update_state.__doc__ = _SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING - - -class _ConfusionMatrixConditionCount(base_metric.Metric): - """Calculates the number of the given confusion matrix condition. - - Args: - confusion_matrix_cond: One of `metrics_utils.ConfusionMatrix` conditions. - thresholds: (Optional) Defaults to 0.5. A float value or a python list/tuple - of float threshold values in [0, 1]. A threshold is compared with - prediction values to determine the truth value of predictions (i.e., above - the threshold is `true`, below is `false`). One metric value is generated - for each threshold value. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - """ - - def __init__(self, - confusion_matrix_cond, - thresholds=None, - name=None, - dtype=None): - super().__init__(name=name, dtype=dtype) - self._confusion_matrix_cond = confusion_matrix_cond - self.init_thresholds = thresholds - self.thresholds = metrics_utils.parse_init_thresholds( - thresholds, default_threshold=0.5) - self._thresholds_distributed_evenly = ( - metrics_utils.is_evenly_distributed_thresholds(self.thresholds)) - self.accumulator = self.add_weight( - 'accumulator', - shape=(len(self.thresholds),), - initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates the metric statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - return metrics_utils.update_confusion_matrix_variables( - {self._confusion_matrix_cond: self.accumulator}, - y_true, - y_pred, - thresholds=self.thresholds, - thresholds_distributed_evenly=self._thresholds_distributed_evenly, - sample_weight=sample_weight) - - def result(self): - if len(self.thresholds) == 1: - result = self.accumulator[0] - else: - result = self.accumulator - return tf.convert_to_tensor(result) - - def reset_state(self): - backend.batch_set_value([ - (v, np.zeros(v.shape.as_list())) for v in self.variables - ]) - - def get_config(self): - config = {'thresholds': self.init_thresholds} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.FalsePositives') -class FalsePositives(_ConfusionMatrixConditionCount): - """Calculates the number of false positives. - - If `sample_weight` is given, calculates the sum of the weights of - false positives. This metric creates one local variable, `accumulator` - that is used to keep track of the number of false positives. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python - list/tuple of float threshold values in [0, 1]. 
A threshold is compared - with prediction values to determine the truth value of predictions - (i.e., above the threshold is `true`, below is `false`). If used with a - loss function that sets `from_logits=True` (i.e. no sigmoid applied to - predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.FalsePositives() - >>> m.update_state([0, 1, 0, 0], [0, 0, 1, 1]) - >>> m.result().numpy() - 2.0 - - >>> m.reset_state() - >>> m.update_state([0, 1, 0, 0], [0, 0, 1, 1], sample_weight=[0, 0, 1, 0]) - >>> m.result().numpy() - 1.0 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.FalsePositives()]) - ``` - - Usage with a loss with `from_logits=True`: - - ```python - model.compile(optimizer='adam', - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.FalsePositives(thresholds=0)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, thresholds=None, name=None, dtype=None): - super().__init__( - confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_POSITIVES, - thresholds=thresholds, - name=name, - dtype=dtype) - - -@keras_export('keras.metrics.FalseNegatives') -class FalseNegatives(_ConfusionMatrixConditionCount): - """Calculates the number of false negatives. - - If `sample_weight` is given, calculates the sum of the weights of - false negatives. This metric creates one local variable, `accumulator` - that is used to keep track of the number of false negatives. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python - list/tuple of float threshold values in [0, 1]. A threshold is compared - with prediction values to determine the truth value of predictions - (i.e., above the threshold is `true`, below is `false`). If used with a - loss function that sets `from_logits=True` (i.e. no sigmoid applied to - predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.FalseNegatives() - >>> m.update_state([0, 1, 1, 1], [0, 1, 0, 0]) - >>> m.result().numpy() - 2.0 - - >>> m.reset_state() - >>> m.update_state([0, 1, 1, 1], [0, 1, 0, 0], sample_weight=[0, 0, 1, 0]) - >>> m.result().numpy() - 1.0 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.FalseNegatives()]) - ``` - - Usage with a loss with `from_logits=True`: - - ```python - model.compile(optimizer='adam', - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.FalseNegatives(thresholds=0)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, thresholds=None, name=None, dtype=None): - super().__init__( - confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_NEGATIVES, - thresholds=thresholds, - name=name, - dtype=dtype) - - -@keras_export('keras.metrics.TrueNegatives') -class TrueNegatives(_ConfusionMatrixConditionCount): - """Calculates the number of true negatives. - - If `sample_weight` is given, calculates the sum of the weights of - true negatives. 
This metric creates one local variable, `accumulator` - that is used to keep track of the number of true negatives. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python - list/tuple of float threshold values in [0, 1]. A threshold is compared - with prediction values to determine the truth value of predictions - (i.e., above the threshold is `true`, below is `false`). If used with a - loss function that sets `from_logits=True` (i.e. no sigmoid applied to - predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.TrueNegatives() - >>> m.update_state([0, 1, 0, 0], [1, 1, 0, 0]) - >>> m.result().numpy() - 2.0 - - >>> m.reset_state() - >>> m.update_state([0, 1, 0, 0], [1, 1, 0, 0], sample_weight=[0, 0, 1, 0]) - >>> m.result().numpy() - 1.0 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.TrueNegatives()]) - ``` - - Usage with a loss with `from_logits=True`: - - ```python - model.compile(optimizer='adam', - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.TrueNegatives(thresholds=0)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, thresholds=None, name=None, dtype=None): - super().__init__( - confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_NEGATIVES, - thresholds=thresholds, - name=name, - dtype=dtype) - - -@keras_export('keras.metrics.TruePositives') -class TruePositives(_ConfusionMatrixConditionCount): - """Calculates the number of true positives. - - If `sample_weight` is given, calculates the sum of the weights of - true positives. This metric creates one local variable, `true_positives` - that is used to keep track of the number of true positives. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - thresholds: (Optional) Defaults to 0.5. A float value, or a Python - list/tuple of float threshold values in [0, 1]. A threshold is compared - with prediction values to determine the truth value of predictions - (i.e., above the threshold is `true`, below is `false`). If used with a - loss function that sets `from_logits=True` (i.e. no sigmoid applied to - predictions), `thresholds` should be set to 0. One metric value is - generated for each threshold value. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. 
- - Standalone usage: - - >>> m = tf.keras.metrics.TruePositives() - >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) - >>> m.result().numpy() - 2.0 - - >>> m.reset_state() - >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) - >>> m.result().numpy() - 1.0 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.TruePositives()]) - ``` - - Usage with a loss with `from_logits=True`: - - ```python - model.compile(optimizer='adam', - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.TruePositives(thresholds=0)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, thresholds=None, name=None, dtype=None): - super().__init__( - confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_POSITIVES, - thresholds=thresholds, - name=name, - dtype=dtype) - - -@keras_export('keras.metrics.Precision') -class Precision(base_metric.Metric): - """Computes the precision of the predictions with respect to the labels. - - The metric creates two local variables, `true_positives` and `false_positives` - that are used to compute the precision. This value is ultimately returned as - `precision`, an idempotent operation that simply divides `true_positives` - by the sum of `true_positives` and `false_positives`. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - If `top_k` is set, we'll calculate precision as how often on average a class - among the top-k classes with the highest predicted values of a batch entry is - correct and can be found in the label for that entry. - - If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is above the threshold and/or in the - top-k highest predictions, and computing the fraction of them for which - `class_id` is indeed a correct label. - - Args: - thresholds: (Optional) A float value, or a Python list/tuple of float - threshold values in [0, 1]. A threshold is compared with prediction - values to determine the truth value of predictions (i.e., above the - threshold is `true`, below is `false`). If used with a loss function that - sets `from_logits=True` (i.e. no sigmoid applied to predictions), - `thresholds` should be set to 0. One metric value is generated for each - threshold value. If neither thresholds nor top_k are set, the default is - to calculate precision with `thresholds=0.5`. - top_k: (Optional) Unset by default. An int value specifying the top-k - predictions to consider when calculating precision. - class_id: (Optional) Integer class ID for which we want binary metrics. - This must be in the half-open interval `[0, num_classes)`, where - `num_classes` is the last dimension of predictions. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. 
- - Standalone usage: - - >>> m = tf.keras.metrics.Precision() - >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) - >>> m.result().numpy() - 0.6666667 - - >>> m.reset_state() - >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) - >>> m.result().numpy() - 1.0 - - >>> # With top_k=2, it will calculate precision over y_true[:2] and y_pred[:2] - >>> m = tf.keras.metrics.Precision(top_k=2) - >>> m.update_state([0, 0, 1, 1], [1, 1, 1, 1]) - >>> m.result().numpy() - 0.0 - - >>> # With top_k=4, it will calculate precision over y_true[:4] and y_pred[:4] - >>> m = tf.keras.metrics.Precision(top_k=4) - >>> m.update_state([0, 0, 1, 1], [1, 1, 1, 1]) - >>> m.result().numpy() - 0.5 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.Precision()]) - ``` - - Usage with a loss with `from_logits=True`: - - ```python - model.compile(optimizer='adam', - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.Precision(thresholds=0)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - thresholds=None, - top_k=None, - class_id=None, - name=None, - dtype=None): - super().__init__(name=name, dtype=dtype) - self.init_thresholds = thresholds - self.top_k = top_k - self.class_id = class_id - - default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF - self.thresholds = metrics_utils.parse_init_thresholds( - thresholds, default_threshold=default_threshold) - self._thresholds_distributed_evenly = ( - metrics_utils.is_evenly_distributed_thresholds(self.thresholds)) - self.true_positives = self.add_weight( - 'true_positives', - shape=(len(self.thresholds),), - initializer='zeros') - self.false_positives = self.add_weight( - 'false_positives', - shape=(len(self.thresholds),), - initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates true positive and false positive statistics. - - Args: - y_true: The ground truth values, with the same dimensions as `y_pred`. - Will be cast to `bool`. - y_pred: The predicted values. Each element must be in the range `[0, 1]`. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - return metrics_utils.update_confusion_matrix_variables( - { - metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, - metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives - }, - y_true, - y_pred, - thresholds=self.thresholds, - thresholds_distributed_evenly=self._thresholds_distributed_evenly, - top_k=self.top_k, - class_id=self.class_id, - sample_weight=sample_weight) - - def result(self): - result = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_positives)) - return result[0] if len(self.thresholds) == 1 else result - - def reset_state(self): - num_thresholds = len(to_list(self.thresholds)) - backend.batch_set_value([(v, np.zeros((num_thresholds,))) - for v in (self.true_positives, - self.false_positives)]) - - def get_config(self): - config = { - 'thresholds': self.init_thresholds, - 'top_k': self.top_k, - 'class_id': self.class_id - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.Recall') -class Recall(base_metric.Metric): - """Computes the recall of the predictions with respect to the labels. 
- - This metric creates two local variables, `true_positives` and - `false_negatives`, that are used to compute the recall. This value is - ultimately returned as `recall`, an idempotent operation that simply divides - `true_positives` by the sum of `true_positives` and `false_negatives`. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - If `top_k` is set, recall will be computed as how often on average a class - among the labels of a batch entry is in the top-k predictions. - - If `class_id` is specified, we calculate recall by considering only the - entries in the batch for which `class_id` is in the label, and computing the - fraction of them for which `class_id` is above the threshold and/or in the - top-k predictions. - - Args: - thresholds: (Optional) A float value, or a Python list/tuple of float - threshold values in [0, 1]. A threshold is compared with prediction - values to determine the truth value of predictions (i.e., above the - threshold is `true`, below is `false`). If used with a loss function that - sets `from_logits=True` (i.e. no sigmoid applied to predictions), - `thresholds` should be set to 0. One metric value is generated for each - threshold value. If neither thresholds nor top_k are set, the default is - to calculate recall with `thresholds=0.5`. - top_k: (Optional) Unset by default. An int value specifying the top-k - predictions to consider when calculating recall. - class_id: (Optional) Integer class ID for which we want binary metrics. - This must be in the half-open interval `[0, num_classes)`, where - `num_classes` is the last dimension of predictions. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.Recall() - >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) - >>> m.result().numpy() - 0.6666667 - - >>> m.reset_state() - >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) - >>> m.result().numpy() - 1.0 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.Recall()]) - ``` - - Usage with a loss with `from_logits=True`: - - ```python - model.compile(optimizer='adam', - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.Recall(thresholds=0)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - thresholds=None, - top_k=None, - class_id=None, - name=None, - dtype=None): - super().__init__(name=name, dtype=dtype) - self.init_thresholds = thresholds - self.top_k = top_k - self.class_id = class_id - - default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF - self.thresholds = metrics_utils.parse_init_thresholds( - thresholds, default_threshold=default_threshold) - self._thresholds_distributed_evenly = ( - metrics_utils.is_evenly_distributed_thresholds(self.thresholds)) - self.true_positives = self.add_weight( - 'true_positives', - shape=(len(self.thresholds),), - initializer='zeros') - self.false_negatives = self.add_weight( - 'false_negatives', - shape=(len(self.thresholds),), - initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates true positive and false negative statistics. - - Args: - y_true: The ground truth values, with the same dimensions as `y_pred`. - Will be cast to `bool`. - y_pred: The predicted values. Each element must be in the range `[0, 1]`. - sample_weight: Optional weighting of each example. 
Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - return metrics_utils.update_confusion_matrix_variables( - { - metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, - metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives - }, - y_true, - y_pred, - thresholds=self.thresholds, - thresholds_distributed_evenly=self._thresholds_distributed_evenly, - top_k=self.top_k, - class_id=self.class_id, - sample_weight=sample_weight) - - def result(self): - result = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_negatives)) - return result[0] if len(self.thresholds) == 1 else result - - def reset_state(self): - num_thresholds = len(to_list(self.thresholds)) - backend.batch_set_value([(v, np.zeros((num_thresholds,))) - for v in (self.true_positives, - self.false_negatives)]) - - def get_config(self): - config = { - 'thresholds': self.init_thresholds, - 'top_k': self.top_k, - 'class_id': self.class_id - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class SensitivitySpecificityBase(base_metric.Metric, metaclass=abc.ABCMeta): - """Abstract base class for computing sensitivity and specificity. - - For additional information about specificity and sensitivity, see - [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). - """ - - def __init__(self, - value, - num_thresholds=200, - class_id=None, - name=None, - dtype=None): - super().__init__(name=name, dtype=dtype) - if num_thresholds <= 0: - raise ValueError( - 'Argument `num_thresholds` must be an integer > 0. ' - f'Received: num_thresholds={num_thresholds}') - self.value = value - self.class_id = class_id - self.true_positives = self.add_weight( - 'true_positives', - shape=(num_thresholds,), - initializer='zeros') - self.true_negatives = self.add_weight( - 'true_negatives', - shape=(num_thresholds,), - initializer='zeros') - self.false_positives = self.add_weight( - 'false_positives', - shape=(num_thresholds,), - initializer='zeros') - self.false_negatives = self.add_weight( - 'false_negatives', - shape=(num_thresholds,), - initializer='zeros') - - # Compute `num_thresholds` thresholds in [0, 1] - if num_thresholds == 1: - self.thresholds = [0.5] - self._thresholds_distributed_evenly = False - else: - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds - 2)] - self.thresholds = [0.0] + thresholds + [1.0] - self._thresholds_distributed_evenly = True - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates confusion matrix statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. 
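(Aside: the threshold construction in `SensitivitySpecificityBase.__init__` above is easy to verify by hand. A minimal sketch of the same list comprehension for an assumed `num_thresholds=5`; no TensorFlow needed.)

```python
num_thresholds = 5

# (num_thresholds - 2) interior points, plus the 0.0 and 1.0 endpoints,
# mirroring the constructor above.
interior = [(i + 1) * 1.0 / (num_thresholds - 1)
            for i in range(num_thresholds - 2)]
thresholds = [0.0] + interior + [1.0]
print(thresholds)  # [0.0, 0.25, 0.5, 0.75, 1.0]
```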
- """ - return metrics_utils.update_confusion_matrix_variables( - { - metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, - metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: self.true_negatives, - metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives, - metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives, - }, - y_true, - y_pred, - thresholds=self.thresholds, - thresholds_distributed_evenly=self._thresholds_distributed_evenly, - class_id=self.class_id, - sample_weight=sample_weight) - - def reset_state(self): - num_thresholds = len(self.thresholds) - confusion_matrix_variables = (self.true_positives, self.true_negatives, - self.false_positives, self.false_negatives) - backend.batch_set_value([ - (v, np.zeros((num_thresholds,))) for v in confusion_matrix_variables - ]) - - def get_config(self): - config = {'class_id': self.class_id} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def _find_max_under_constraint(self, constrained, dependent, predicate): - """Returns the maximum of dependent_statistic that satisfies the constraint. - - Args: - constrained: Over these values the constraint - is specified. A rank-1 tensor. - dependent: From these values the maximum that satiesfies the - constraint is selected. Values in this tensor and in - `constrained` are linked by having the same threshold at each - position, hence this tensor must have the same shape. - predicate: A binary boolean functor to be applied to arguments - `constrained` and `self.value`, e.g. `tf.greater`. - - Returns maximal dependent value, if no value satiesfies the constraint 0.0. - """ - feasible = tf.where(predicate(constrained, self.value)) - feasible_exists = tf.greater(tf.size(feasible), 0) - max_dependent = tf.reduce_max(tf.gather(dependent, feasible)) - - return tf.where(feasible_exists, max_dependent, 0.0) - - -@keras_export('keras.metrics.SensitivityAtSpecificity') -class SensitivityAtSpecificity(SensitivitySpecificityBase): - """Computes best sensitivity where specificity is >= specified value. - - the sensitivity at a given specificity. - - `Sensitivity` measures the proportion of actual positives that are correctly - identified as such (tp / (tp + fn)). - `Specificity` measures the proportion of actual negatives that are correctly - identified as such (tn / (tn + fp)). - - This metric creates four local variables, `true_positives`, `true_negatives`, - `false_positives` and `false_negatives` that are used to compute the - sensitivity at the given specificity. The threshold for the given specificity - value is computed and used to evaluate the corresponding sensitivity. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is above the threshold predictions, - and computing the fraction of them for which `class_id` is indeed a correct - label. - - For additional information about specificity and sensitivity, see - [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). - - Args: - specificity: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given specificity. - class_id: (Optional) Integer class ID for which we want binary metrics. - This must be in the half-open interval `[0, num_classes)`, where - `num_classes` is the last dimension of predictions. 
- name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.SensitivityAtSpecificity(0.5) - >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], - ... sample_weight=[1, 1, 2, 2, 1]) - >>> m.result().numpy() - 0.333333 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.SensitivityAtSpecificity(specificity=0.5)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - specificity, - num_thresholds=200, - class_id=None, - name=None, - dtype=None): - if specificity < 0 or specificity > 1: - raise ValueError( - 'Argument `specificity` must be in the range [0, 1]. ' - f'Received: specificity={specificity}') - self.specificity = specificity - self.num_thresholds = num_thresholds - super().__init__( - specificity, - num_thresholds=num_thresholds, - class_id=class_id, - name=name, - dtype=dtype) - - def result(self): - specificities = tf.math.divide_no_nan( - self.true_negatives, - tf.math.add(self.true_negatives, self.false_positives)) - sensitivities = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_negatives)) - return self._find_max_under_constraint( - specificities, sensitivities, tf.greater_equal) - - def get_config(self): - config = { - 'num_thresholds': self.num_thresholds, - 'specificity': self.specificity - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.SpecificityAtSensitivity') -class SpecificityAtSensitivity(SensitivitySpecificityBase): - """Computes the best specificity where sensitivity is >= specified value. - - `Sensitivity` measures the proportion of actual positives that are correctly - identified as such (tp / (tp + fn)). - `Specificity` measures the proportion of actual negatives that are correctly - identified as such (tn / (tn + fp)). - - This metric creates four local variables, `true_positives`, `true_negatives`, - `false_positives` and `false_negatives` that are used to compute the - specificity at the given sensitivity. The threshold for the given sensitivity - value is computed and used to evaluate the corresponding specificity. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - If `class_id` is specified, the metric is calculated by considering only the - entries in the batch for which `class_id` is above the threshold in the - predictions, and computing the fraction of them for which `class_id` is - indeed a correct label. - - For additional information about specificity and sensitivity, see - [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). - - Args: - sensitivity: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given sensitivity. - class_id: (Optional) Integer class ID for which we want binary metrics. - This must be in the half-open interval `[0, num_classes)`, where - `num_classes` is the last dimension of predictions. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result.
- - Standalone usage: - - >>> m = tf.keras.metrics.SpecificityAtSensitivity(0.5) - >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) - >>> m.result().numpy() - 0.66666667 - - >>> m.reset_state() - >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], - ... sample_weight=[1, 1, 2, 2, 2]) - >>> m.result().numpy() - 0.5 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.SpecificityAtSensitivity(sensitivity=0.5)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - sensitivity, - num_thresholds=200, - class_id=None, - name=None, - dtype=None): - if sensitivity < 0 or sensitivity > 1: - raise ValueError( - 'Argument `sensitivity` must be in the range [0, 1]. ' - f'Received: sensitivity={sensitivity}') - self.sensitivity = sensitivity - self.num_thresholds = num_thresholds - super().__init__( - sensitivity, - num_thresholds=num_thresholds, - class_id=class_id, - name=name, - dtype=dtype) - - def result(self): - sensitivities = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_negatives)) - specificities = tf.math.divide_no_nan( - self.true_negatives, - tf.math.add(self.true_negatives, self.false_positives)) - return self._find_max_under_constraint( - sensitivities, specificities, tf.greater_equal) - - def get_config(self): - config = { - 'num_thresholds': self.num_thresholds, - 'sensitivity': self.sensitivity - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.PrecisionAtRecall') -class PrecisionAtRecall(SensitivitySpecificityBase): - """Computes the best precision where recall is >= specified value. - - This metric creates four local variables, `true_positives`, `true_negatives`, - `false_positives` and `false_negatives` that are used to compute the - precision at the given recall. The threshold for the given recall - value is computed and used to evaluate the corresponding precision. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is above the threshold in the - predictions, and computing the fraction of them for which `class_id` is - indeed a correct label. - - Args: - recall: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given recall. - class_id: (Optional) Integer class ID for which we want binary metrics. - This must be in the half-open interval `[0, num_classes)`, where - `num_classes` is the last dimension of predictions. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.PrecisionAtRecall(0.5) - >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], - ...
sample_weight=[2, 2, 2, 1, 1]) - >>> m.result().numpy() - 0.33333333 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.PrecisionAtRecall(recall=0.8)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - recall, - num_thresholds=200, - class_id=None, - name=None, - dtype=None): - if recall < 0 or recall > 1: - raise ValueError( - 'Argument `recall` must be in the range [0, 1]. ' - f'Received: recall={recall}') - self.recall = recall - self.num_thresholds = num_thresholds - super().__init__( - value=recall, - num_thresholds=num_thresholds, - class_id=class_id, - name=name, - dtype=dtype) - - def result(self): - recalls = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_negatives)) - precisions = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_positives)) - return self._find_max_under_constraint( - recalls, precisions, tf.greater_equal) - - def get_config(self): - config = {'num_thresholds': self.num_thresholds, 'recall': self.recall} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.RecallAtPrecision') -class RecallAtPrecision(SensitivitySpecificityBase): - """Computes the best recall where precision is >= specified value. - - For a given score-label distribution, the required precision might not - be achievable; in that case, 0.0 is returned as the recall. - - This metric creates four local variables, `true_positives`, `true_negatives`, - `false_positives` and `false_negatives` that are used to compute the - recall at the given precision. The threshold for the given precision - value is computed and used to evaluate the corresponding recall. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is above the threshold in the - predictions, and computing the fraction of them for which `class_id` is - indeed a correct label. - - Args: - precision: A scalar value in range `[0, 1]`. - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use for matching the given precision. - class_id: (Optional) Integer class ID for which we want binary metrics. - This must be in the half-open interval `[0, num_classes)`, where - `num_classes` is the last dimension of predictions. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.RecallAtPrecision(0.8) - >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], - ... sample_weight=[1, 0, 0, 1]) - >>> m.result().numpy() - 1.0 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.RecallAtPrecision(precision=0.8)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - precision, - num_thresholds=200, - class_id=None, - name=None, - dtype=None): - if precision < 0 or precision > 1: - raise ValueError( - 'Argument `precision` must be in the range [0, 1]. 
' - f'Received: precision={precision}') - self.precision = precision - self.num_thresholds = num_thresholds - super().__init__( - value=precision, - num_thresholds=num_thresholds, - class_id=class_id, - name=name, - dtype=dtype) - - def result(self): - precisions = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_positives)) - recalls = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_negatives)) - return self._find_max_under_constraint( - precisions, recalls, tf.greater_equal) - - def get_config(self): - config = {'num_thresholds': self.num_thresholds, - 'precision': self.precision} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.AUC') -class AUC(base_metric.Metric): - """Approximates the AUC (Area under the curve) of the ROC or PR curves. - - The AUC (Area under the curve) of the ROC (Receiver operating - characteristic; default) or PR (Precision Recall) curve is a quality measure - of binary classifiers. Unlike accuracy, and like cross-entropy - losses, ROC-AUC and PR-AUC evaluate all the operational points of a model. - - This class approximates AUCs using a Riemann sum. During the metric - accumulation phase, predictions are accumulated within predefined buckets - by value. The AUC is then computed by interpolating per-bucket averages. These - buckets define the evaluated operational points. - - This metric creates four local variables, `true_positives`, `true_negatives`, - `false_positives` and `false_negatives` that are used to compute the AUC. - To discretize the AUC curve, a linearly spaced set of thresholds is used to - compute pairs of recall and precision values. The area under the ROC-curve is - therefore computed using the height of the recall values by the false positive - rate, while the area under the PR-curve is computed using the height of - the precision values by the recall. - - This value is ultimately returned as `auc`, an idempotent operation that - computes the area under a discretized curve of precision versus recall values - (computed using the aforementioned variables). The `num_thresholds` variable - controls the degree of discretization with larger numbers of thresholds more - closely approximating the true AUC. The quality of the approximation may vary - dramatically depending on `num_thresholds`. The `thresholds` parameter can be - used to manually specify thresholds which split the predictions more evenly. - - For the best approximation of the real AUC, `predictions` should be - distributed approximately uniformly in the range [0, 1] (if - `from_logits=False`). The quality of the AUC approximation may be poor if - this is not the case. Setting `summation_method` to 'minoring' or 'majoring' - can help quantify the error in the approximation by providing lower or upper - bound estimates of the AUC. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - num_thresholds: (Optional) Defaults to 200. The number of thresholds to - use when discretizing the ROC curve. Values must be > 1. - curve: (Optional) Specifies the name of the curve to be computed, 'ROC' - [default] or 'PR' for the Precision-Recall-curve. - summation_method: (Optional) Specifies the [Riemann summation method]( - https://en.wikipedia.org/wiki/Riemann_sum) used. - 'interpolation' (default) applies the mid-point summation scheme for `ROC`.
- For PR-AUC, interpolates (true/false) positives but not the ratio that - is precision (see Davis & Goadrich 2006 for details); - 'minoring' applies left summation - for increasing intervals and right summation for decreasing intervals; - 'majoring' does the opposite. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - thresholds: (Optional) A list of floating point values to use as the - thresholds for discretizing the curve. If set, the `num_thresholds` - parameter is ignored. Values should be in [0, 1]. Endpoint thresholds - equal to {-epsilon, 1+epsilon} for a small positive epsilon value will - be automatically included with these to correctly handle predictions - equal to exactly 0 or 1. - multi_label: boolean indicating whether multilabel data should be - treated as such, wherein AUC is computed separately for each label and - then averaged across labels, or (when False) if the data should be - flattened into a single label before AUC computation. In the latter - case, when multilabel data is passed to AUC, each label-prediction pair - is treated as an individual data point. Should be set to False for - multi-class data. - num_labels: (Optional) The number of labels, used when `multi_label` is - True. If `num_labels` is not specified, then state variables get created - on the first call to `update_state`. - label_weights: (Optional) list, array, or tensor of non-negative weights - used to compute AUCs for multilabel data. When `multi_label` is True, - the weights are applied to the individual label AUCs when they are - averaged to produce the multi-label AUC. When it's False, they are used - to weight the individual label predictions in computing the confusion - matrix on the flattened data. Note that this is unlike class_weights in - that class_weights weights the example depending on the value of its - label, whereas label_weights depends only on the index of that label - before flattening; therefore `label_weights` should not be used for - multi-class data. - from_logits: boolean indicating whether the predictions (`y_pred` in - `update_state`) are probabilities or sigmoid logits. As a rule of thumb, - when using a keras loss, the `from_logits` constructor argument of the - loss should match the AUC `from_logits` constructor argument. - - Standalone usage: - - >>> m = tf.keras.metrics.AUC(num_thresholds=3) - >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - >>> # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7] - >>> # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] - >>> # tp_rate = recall = [1, 0.5, 0], fp_rate = [1, 0, 0] - >>> # auc = ((((1+0.5)/2)*(1-0)) + (((0.5+0)/2)*(0-0))) = 0.75 - >>> m.result().numpy() - 0.75 - - >>> m.reset_state() - >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], - ... sample_weight=[1, 0, 0, 1]) - >>> m.result().numpy() - 1.0 - - Usage with `compile()` API: - - ```python - # Reports the AUC of a model outputting a probability. - model.compile(optimizer='sgd', - loss=tf.keras.losses.BinaryCrossentropy(), - metrics=[tf.keras.metrics.AUC()]) - - # Reports the AUC of a model outputting a logit. 
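# --- Illustrative aside (not part of the original example): the standalone
# doctest above can be re-derived with NumPy. The tp/fp/fn/tn arrays are
# copied from the doctest comments; 'interpolation' averages successive
# recall heights over the false-positive-rate steps.
import numpy as np
tp = np.array([2., 1., 0.]); fp = np.array([2., 0., 0.])
fn = np.array([0., 1., 2.]); tn = np.array([0., 2., 2.])
recall = tp / (tp + fn)                      # [1.0, 0.5, 0.0]
fp_rate = fp / (fp + tn)                     # [1.0, 0.0, 0.0]
heights = (recall[:-1] + recall[1:]) / 2.0   # mid-point summation
print(np.sum((fp_rate[:-1] - fp_rate[1:]) * heights))  # 0.75, as in the doctest
# --- End aside.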
- model.compile(optimizer='sgd', - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.AUC(from_logits=True)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - num_thresholds=200, - curve='ROC', - summation_method='interpolation', - name=None, - dtype=None, - thresholds=None, - multi_label=False, - num_labels=None, - label_weights=None, - from_logits=False): - # Validate configurations. - if isinstance(curve, metrics_utils.AUCCurve) and curve not in list( - metrics_utils.AUCCurve): - raise ValueError( - f'Invalid `curve` argument value "{curve}". ' - f'Expected one of: {list(metrics_utils.AUCCurve)}') - if isinstance( - summation_method, - metrics_utils.AUCSummationMethod) and summation_method not in list( - metrics_utils.AUCSummationMethod): - raise ValueError( - f'Invalid `summation_method` argument value "{summation_method}". ' - f'Expected one of: {list(metrics_utils.AUCSummationMethod)}') - - # Update properties. - self._init_from_thresholds = thresholds is not None - if thresholds is not None: - # If specified, use the supplied thresholds. - self.num_thresholds = len(thresholds) + 2 - thresholds = sorted(thresholds) - self._thresholds_distributed_evenly = ( - metrics_utils.is_evenly_distributed_thresholds( - np.array([0.0] + thresholds + [1.0]))) - else: - if num_thresholds <= 1: - raise ValueError('Argument `num_thresholds` must be an integer > 1. ' - f'Received: num_thresholds={num_thresholds}') - - # Otherwise, linearly interpolate (num_thresholds - 2) thresholds in - # (0, 1). - self.num_thresholds = num_thresholds - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds - 2)] - self._thresholds_distributed_evenly = True - - # Add an endpoint "threshold" below zero and above one for either - # threshold method to account for floating point imprecisions. - self._thresholds = np.array([0.0 - backend.epsilon()] + thresholds + - [1.0 + backend.epsilon()]) - - if isinstance(curve, metrics_utils.AUCCurve): - self.curve = curve - else: - self.curve = metrics_utils.AUCCurve.from_str(curve) - if isinstance(summation_method, metrics_utils.AUCSummationMethod): - self.summation_method = summation_method - else: - self.summation_method = metrics_utils.AUCSummationMethod.from_str( - summation_method) - super().__init__(name=name, dtype=dtype) - - # Handle multilabel arguments. - self.multi_label = multi_label - if label_weights is not None: - label_weights = tf.constant(label_weights, dtype=self.dtype) - tf.debugging.assert_non_negative( - label_weights, - message='All values of `label_weights` must be non-negative.') - self.label_weights = label_weights - - else: - self.label_weights = None - - self._from_logits = from_logits - - self._built = False - if self.multi_label: - if num_labels: - shape = tf.TensorShape([None, num_labels]) - self._build(shape) - else: - if num_labels: - raise ValueError( - '`num_labels` is needed only when `multi_label` is True.') - self._build(None) - - @property - def thresholds(self): - """The thresholds used for evaluating AUC.""" - return list(self._thresholds) - - def _build(self, shape): - """Initialize TP, FP, TN, and FN tensors, given the shape of the data.""" - if self.multi_label: - if shape.ndims != 2: - raise ValueError( - '`y_true` must have rank 2 when `multi_label=True`. ' - f'Found rank {shape.ndims}. 
' - f'Full shape received for `y_true`: {shape}') - self._num_labels = shape[1] - variable_shape = tf.TensorShape([self.num_thresholds, self._num_labels]) - else: - variable_shape = tf.TensorShape([self.num_thresholds]) - - self._build_input_shape = shape - # Create metric variables - self.true_positives = self.add_weight( - 'true_positives', - shape=variable_shape, - initializer='zeros') - self.true_negatives = self.add_weight( - 'true_negatives', - shape=variable_shape, - initializer='zeros') - self.false_positives = self.add_weight( - 'false_positives', - shape=variable_shape, - initializer='zeros') - self.false_negatives = self.add_weight( - 'false_negatives', - shape=variable_shape, - initializer='zeros') - - if self.multi_label: - with tf.init_scope(): - # This should only be necessary for handling v1 behavior. In v2, AUC - # should be initialized outside of any tf.functions, and therefore in - # eager mode. - if not tf.executing_eagerly(): - backend._initialize_variables(backend._get_session()) # pylint: disable=protected-access - - self._built = True - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates confusion matrix statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - if not self._built: - self._build(tf.TensorShape(y_pred.shape)) - - if self.multi_label or (self.label_weights is not None): - # y_true should have shape (number of examples, number of labels). - shapes = [ - (y_true, ('N', 'L')) - ] - if self.multi_label: - # TP, TN, FP, and FN should all have shape - # (number of thresholds, number of labels). - shapes.extend([(self.true_positives, ('T', 'L')), - (self.true_negatives, ('T', 'L')), - (self.false_positives, ('T', 'L')), - (self.false_negatives, ('T', 'L'))]) - if self.label_weights is not None: - # label_weights should be of length equal to the number of labels. - shapes.append((self.label_weights, ('L',))) - tf.debugging.assert_shapes( - shapes, message='Number of labels is not consistent.') - - # Only forward label_weights to update_confusion_matrix_variables when - # multi_label is False. Otherwise the averaging of individual label AUCs is - # handled in AUC.result - label_weights = None if self.multi_label else self.label_weights - - if self._from_logits: - y_pred = activations.sigmoid(y_pred) - - return metrics_utils.update_confusion_matrix_variables( - { - metrics_utils.ConfusionMatrix.TRUE_POSITIVES: - self.true_positives, - metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: - self.true_negatives, - metrics_utils.ConfusionMatrix.FALSE_POSITIVES: - self.false_positives, - metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: - self.false_negatives, - }, - y_true, - y_pred, - self._thresholds, - thresholds_distributed_evenly=self._thresholds_distributed_evenly, - sample_weight=sample_weight, - multi_label=self.multi_label, - label_weights=label_weights) - - def interpolate_pr_auc(self): - """Interpolation formula inspired by section 4 of Davis & Goadrich 2006. 
- - https://www.biostat.wisc.edu/~page/rocpr.pdf - - Note here we derive & use a closed formula not present in the paper - as follows: - - Precision = TP / (TP + FP) = TP / P - - Modeling all of TP (true positive), FP (false positive) and their sum - P = TP + FP (predicted positive) as varying linearly within each interval - [A, B] between successive thresholds, we get - - Precision slope = dTP / dP - = (TP_B - TP_A) / (P_B - P_A) - = (TP - TP_A) / (P - P_A) - Precision = (TP_A + slope * (P - P_A)) / P - - The area within the interval is (slope / total_pos_weight) times - - int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P} - int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P} - - where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in - - int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A) - - Bringing back the factor (slope / total_pos_weight) we'd put aside, we get - - slope * [dTP + intercept * log(P_B / P_A)] / total_pos_weight - - where dTP == TP_B - TP_A. - - Note that when P_A == 0 the above calculation simplifies into - - int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A) - - which is really equivalent to imputing constant precision throughout the - first bucket having >0 true positives. - - Returns: - pr_auc: an approximation of the area under the P-R curve. - """ - dtp = self.true_positives[:self.num_thresholds - - 1] - self.true_positives[1:] - p = tf.math.add(self.true_positives, self.false_positives) - dp = p[:self.num_thresholds - 1] - p[1:] - prec_slope = tf.math.divide_no_nan( - dtp, tf.maximum(dp, 0), name='prec_slope') - intercept = self.true_positives[1:] - tf.multiply(prec_slope, p[1:]) - - safe_p_ratio = tf.where( - tf.logical_and(p[:self.num_thresholds - 1] > 0, p[1:] > 0), - tf.math.divide_no_nan( - p[:self.num_thresholds - 1], - tf.maximum(p[1:], 0), - name='recall_relative_ratio'), - tf.ones_like(p[1:])) - - pr_auc_increment = tf.math.divide_no_nan( - prec_slope * (dtp + intercept * tf.math.log(safe_p_ratio)), - tf.maximum(self.true_positives[1:] + self.false_negatives[1:], 0), - name='pr_auc_increment') - - if self.multi_label: - by_label_auc = tf.reduce_sum( - pr_auc_increment, name=self.name + '_by_label', axis=0) - if self.label_weights is None: - # Evenly weighted average of the label AUCs. - return tf.reduce_mean(by_label_auc, name=self.name) - else: - # Weighted average of the label AUCs. - return tf.math.divide_no_nan( - tf.reduce_sum( - tf.multiply(by_label_auc, self.label_weights)), - tf.reduce_sum(self.label_weights), - name=self.name) - else: - return tf.reduce_sum(pr_auc_increment, name='interpolate_pr_auc') - - def result(self): - if (self.curve == metrics_utils.AUCCurve.PR and - self.summation_method == metrics_utils.AUCSummationMethod.INTERPOLATION - ): - # This use case is different and is handled separately. - return self.interpolate_pr_auc() - - # Set `x` and `y` values for the curves based on `curve` config. - recall = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_negatives)) - if self.curve == metrics_utils.AUCCurve.ROC: - fp_rate = tf.math.divide_no_nan( - self.false_positives, - tf.math.add(self.false_positives, self.true_negatives)) - x = fp_rate - y = recall - else: # curve == 'PR'. - precision = tf.math.divide_no_nan( - self.true_positives, - tf.math.add(self.true_positives, self.false_positives)) - x = recall - y = precision - - # Find the rectangle heights based on `summation_method`. 
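(Aside: a hedged numeric check of the closed-form bucket area derived in `interpolate_pr_auc` above. All counts below are made-up assumptions for a single interval between two successive thresholds.)

```python
import numpy as np

# Accumulated counts at two successive thresholds: A (lower threshold, more
# predicted positives) and B (higher threshold).
tp_a, fp_a = 6.0, 6.0
tp_b, fp_b = 4.0, 2.0
total_pos = 8.0                        # TP + FN, constant across thresholds

p_a, p_b = tp_a + fp_a, tp_b + fp_b    # P = TP + FP (predicted positives)
slope = (tp_a - tp_b) / (p_a - p_b)    # dTP / dP
intercept = tp_b - slope * p_b
area = slope * ((tp_a - tp_b) + intercept * np.log(p_a / p_b)) / total_pos
print(area)  # ~0.141

# Crude sanity check: mean precision (0.5 at A, ~0.667 at B) times the recall
# change (0.25) gives ~0.146, close to the interpolated value above.
```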
- if self.summation_method == metrics_utils.AUCSummationMethod.INTERPOLATION: - # Note: the case ('PR', 'interpolation') has been handled above. - heights = (y[:self.num_thresholds - 1] + y[1:]) / 2. - elif self.summation_method == metrics_utils.AUCSummationMethod.MINORING: - heights = tf.minimum(y[:self.num_thresholds - 1], y[1:]) - else: # self.summation_method = metrics_utils.AUCSummationMethod.MAJORING: - heights = tf.maximum(y[:self.num_thresholds - 1], y[1:]) - - # Sum up the areas of all the rectangles. - if self.multi_label: - riemann_terms = tf.multiply(x[:self.num_thresholds - 1] - x[1:], heights) - by_label_auc = tf.reduce_sum( - riemann_terms, name=self.name + '_by_label', axis=0) - - if self.label_weights is None: - # Unweighted average of the label AUCs. - return tf.reduce_mean(by_label_auc, name=self.name) - else: - # Weighted average of the label AUCs. - return tf.math.divide_no_nan( - tf.reduce_sum( - tf.multiply(by_label_auc, self.label_weights)), - tf.reduce_sum(self.label_weights), - name=self.name) - else: - return tf.reduce_sum( - tf.multiply(x[:self.num_thresholds - 1] - x[1:], heights), - name=self.name) - - def reset_state(self): - if self._built: - confusion_matrix_variables = (self.true_positives, self.true_negatives, - self.false_positives, self.false_negatives) - if self.multi_label: - backend.batch_set_value( - [(v, np.zeros((self.num_thresholds, self._num_labels))) - for v in confusion_matrix_variables]) - else: - backend.batch_set_value([(v, np.zeros((self.num_thresholds,))) - for v in confusion_matrix_variables]) - - def get_config(self): - if is_tensor_or_variable(self.label_weights): - label_weights = backend.eval(self.label_weights) - else: - label_weights = self.label_weights - config = { - 'num_thresholds': self.num_thresholds, - 'curve': self.curve.value, - 'summation_method': self.summation_method.value, - 'multi_label': self.multi_label, - 'label_weights': label_weights - } - # optimization to avoid serializing a large number of generated thresholds - if self._init_from_thresholds: - # We remove the endpoint thresholds as an inverse of how the thresholds - # were initialized. This ensures that a metric initialized from this - # config has the same thresholds. - config['thresholds'] = self.thresholds[1:-1] - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.CosineSimilarity') -class CosineSimilarity(base_metric.MeanMetricWrapper): - """Computes the cosine similarity between the labels and predictions. - - `cosine similarity = (a . b) / ||a|| ||b||` - - See: [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity). - - This metric keeps the average cosine similarity between `predictions` and - `labels` over a stream of data. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - axis: (Optional) Defaults to -1. The dimension along which the cosine - similarity is computed. - - Standalone usage: - - >>> # l2_norm(y_true) = [[0., 1.], [1./1.414, 1./1.414]] - >>> # l2_norm(y_pred) = [[1., 0.], [1./1.414, 1./1.414]] - >>> # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] - >>> # result = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) - >>> # = ((0. + 0.) 
+ (0.5 + 0.5)) / 2 - >>> m = tf.keras.metrics.CosineSimilarity(axis=1) - >>> m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]]) - >>> m.result().numpy() - 0.49999997 - - >>> m.reset_state() - >>> m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]], - ... sample_weight=[0.3, 0.7]) - >>> m.result().numpy() - 0.6999999 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.CosineSimilarity(axis=1)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='cosine_similarity', dtype=None, axis=-1): - super().__init__( - cosine_similarity, name, dtype=dtype, axis=axis) - - -@keras_export('keras.metrics.MeanAbsoluteError') -class MeanAbsoluteError(base_metric.MeanMetricWrapper): - """Computes the mean absolute error between the labels and predictions. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.MeanAbsoluteError() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) - >>> m.result().numpy() - 0.25 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 0.5 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.MeanAbsoluteError()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='mean_absolute_error', dtype=None): - super().__init__( - mean_absolute_error, name, dtype=dtype) - - -@keras_export('keras.metrics.MeanAbsolutePercentageError') -class MeanAbsolutePercentageError(base_metric.MeanMetricWrapper): - """Computes the mean absolute percentage error between `y_true` and `y_pred`. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.MeanAbsolutePercentageError() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) - >>> m.result().numpy() - 250000000.0 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 500000000.0 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.MeanAbsolutePercentageError()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='mean_absolute_percentage_error', dtype=None): - super().__init__( - mean_absolute_percentage_error, name, dtype=dtype) - - -@keras_export('keras.metrics.MeanSquaredError') -class MeanSquaredError(base_metric.MeanMetricWrapper): - """Computes the mean squared error between `y_true` and `y_pred`. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.MeanSquaredError() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) - >>> m.result().numpy() - 0.25 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], - ... 
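(Aside on the `MeanAbsolutePercentageError` doctest above: the huge result comes from zeros in `y_true`, since the denominator is clamped to `keras.backend.epsilon()` rather than dropped. A hedged sketch of that arithmetic:)

```python
import tensorflow as tf

y_true = tf.constant([[0., 1.], [0., 0.]])
y_pred = tf.constant([[1., 1.], [0., 0.]])
eps = tf.keras.backend.epsilon()  # 1e-7 by default
ape = 100.0 * tf.abs(y_pred - y_true) / tf.maximum(tf.abs(y_true), eps)
print(tf.reduce_mean(ape).numpy())  # ~2.5e8: the |1 - 0| / eps term dominates
```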
sample_weight=[1, 0]) - >>> m.result().numpy() - 0.5 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.MeanSquaredError()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='mean_squared_error', dtype=None): - super().__init__( - mean_squared_error, name, dtype=dtype) - - -@keras_export('keras.metrics.MeanSquaredLogarithmicError') -class MeanSquaredLogarithmicError(base_metric.MeanMetricWrapper): - """Computes the mean squared logarithmic error between `y_true` and `y_pred`. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.MeanSquaredLogarithmicError() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) - >>> m.result().numpy() - 0.12011322 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 0.24022643 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='mean_squared_logarithmic_error', dtype=None): - super().__init__( - mean_squared_logarithmic_error, name, dtype=dtype) - - -@keras_export('keras.metrics.Hinge') -class Hinge(base_metric.MeanMetricWrapper): - """Computes the hinge metric between `y_true` and `y_pred`. - - `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are - provided we will convert them to -1 or 1. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.Hinge() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) - >>> m.result().numpy() - 1.3 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 1.1 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', loss='mse', metrics=[tf.keras.metrics.Hinge()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='hinge', dtype=None): - super().__init__(hinge, name, dtype=dtype) - - -@keras_export('keras.metrics.SquaredHinge') -class SquaredHinge(base_metric.MeanMetricWrapper): - """Computes the squared hinge metric between `y_true` and `y_pred`. - - `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are - provided we will convert them to -1 or 1. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.SquaredHinge() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) - >>> m.result().numpy() - 1.86 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 1.46 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.SquaredHinge()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='squared_hinge', dtype=None): - super().__init__(squared_hinge, name, dtype=dtype) - - -@keras_export('keras.metrics.CategoricalHinge') -class CategoricalHinge(base_metric.MeanMetricWrapper): - """Computes the categorical hinge metric between `y_true` and `y_pred`. 
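(Aside: the 0/1 to -1/1 label conversion described for the hinge metrics above can be checked directly. A minimal sketch reproducing the `Hinge` doctest value; the per-sample values in the comments are hand-computed.)

```python
import tensorflow as tf

y_true = tf.constant([[0., 1.], [0., 0.]])      # binary labels
y_pred = tf.constant([[0.6, 0.4], [0.4, 0.6]])
y_true_signed = 2.0 * y_true - 1.0              # [[-1, 1], [-1, -1]]
per_sample = tf.reduce_mean(
    tf.maximum(1.0 - y_true_signed * y_pred, 0.0), axis=-1)
print(per_sample.numpy())                  # expected [1.1, 1.5]
print(tf.reduce_mean(per_sample).numpy())  # expected 1.3, as in the doctest
```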
- - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.CategoricalHinge() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) - >>> m.result().numpy() - 1.4000001 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 1.2 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.CategoricalHinge()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='categorical_hinge', dtype=None): - super().__init__(categorical_hinge, name, dtype=dtype) - - -@keras_export('keras.metrics.RootMeanSquaredError') -class RootMeanSquaredError(base_metric.Mean): - """Computes root mean squared error metric between `y_true` and `y_pred`. - - Standalone usage: - - >>> m = tf.keras.metrics.RootMeanSquaredError() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) - >>> m.result().numpy() - 0.5 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 0.70710677 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.RootMeanSquaredError()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='root_mean_squared_error', dtype=None): - super().__init__(name, dtype=dtype) - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates root mean squared error statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - y_true = tf.cast(y_true, self._dtype) - y_pred = tf.cast(y_pred, self._dtype) - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( - y_pred, y_true) - error_sq = tf.math.squared_difference(y_pred, y_true) - return super().update_state( - error_sq, sample_weight=sample_weight) - - def result(self): - return tf.sqrt(tf.math.divide_no_nan(self.total, self.count)) - - -@keras_export('keras.metrics.LogCoshError') -class LogCoshError(base_metric.MeanMetricWrapper): - """Computes the logarithm of the hyperbolic cosine of the prediction error. - - `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true) - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.LogCoshError() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) - >>> m.result().numpy() - 0.10844523 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 0.21689045 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.LogCoshError()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='logcosh', dtype=None): - super().__init__(logcosh, name, dtype=dtype) - - -@keras_export('keras.metrics.Poisson') -class Poisson(base_metric.MeanMetricWrapper): - """Computes the Poisson metric between `y_true` and `y_pred`. - - `metric = y_pred - y_true * log(y_pred)` - - Args: - name: (Optional) string name of the metric instance. 
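(Aside on `RootMeanSquaredError` above: since `result()` takes the square root of the streamed mean, it should match the square root of `MeanSquaredError` on the same stream. A minimal sketch:)

```python
import tensorflow as tf

y_true = [[0., 1.], [0., 0.]]
y_pred = [[1., 1.], [0., 0.]]

mse = tf.keras.metrics.MeanSquaredError()
mse.update_state(y_true, y_pred)
rmse = tf.keras.metrics.RootMeanSquaredError()
rmse.update_state(y_true, y_pred)
# sqrt of the streamed mean squared error equals the streamed RMSE:
print(mse.result().numpy() ** 0.5, rmse.result().numpy())  # both ~0.5
```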
- dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.Poisson() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) - >>> m.result().numpy() - 0.49999997 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 0.99999994 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.Poisson()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='poisson', dtype=None): - super().__init__(poisson, name, dtype=dtype) - - -@keras_export('keras.metrics.KLDivergence') -class KLDivergence(base_metric.MeanMetricWrapper): - """Computes Kullback-Leibler divergence metric between `y_true` and `y_pred`. - - `metric = y_true * log(y_true / y_pred)` - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.KLDivergence() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) - >>> m.result().numpy() - 0.45814306 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 0.9162892 - - Usage with `compile()` API: - - ```python - model.compile(optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.KLDivergence()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, name='kullback_leibler_divergence', dtype=None): - super().__init__( - kullback_leibler_divergence, name, dtype=dtype) - - -class _IoUBase(base_metric.Metric): - """Computes the confusion matrix for Intersection-Over-Union metrics. - - Intersection-Over-Union is a common evaluation metric for semantic image - segmentation. - - For an individual class, the IoU metric is defined as follows: - - ``` - iou = true_positives / (true_positives + false_positives + false_negatives) - ``` - - From IoUs of individual classes, the MeanIoU can be computed as the mean of - the individual IoUs. - - To compute IoUs, the predictions are accumulated in a confusion matrix, - weighted by `sample_weight` and the metric is then calculated from it. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Args: - num_classes: The possible number of labels the prediction task can have. - This value must be provided, since a confusion matrix of size - `(num_classes, num_classes)` will be allocated. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - """ - - def __init__(self, num_classes, name=None, dtype=None): - super().__init__(name=name, dtype=dtype) - self.num_classes = num_classes - - # Variable to accumulate the predictions in the confusion matrix. - self.total_cm = self.add_weight( - 'total_confusion_matrix', - shape=(num_classes, num_classes), - initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates the confusion matrix statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - - y_true = tf.cast(y_true, self._dtype) - y_pred = tf.cast(y_pred, self._dtype) - - # Flatten the input if its rank > 1. 
- if y_pred.shape.ndims > 1: - y_pred = tf.reshape(y_pred, [-1]) - - if y_true.shape.ndims > 1: - y_true = tf.reshape(y_true, [-1]) - - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, self._dtype) - if sample_weight.shape.ndims > 1: - sample_weight = tf.reshape(sample_weight, [-1]) - - # Accumulate the prediction to current confusion matrix. - current_cm = tf.math.confusion_matrix( - y_true, - y_pred, - self.num_classes, - weights=sample_weight, - dtype=self._dtype) - return self.total_cm.assign_add(current_cm) - - def reset_state(self): - backend.set_value( - self.total_cm, np.zeros((self.num_classes, self.num_classes))) - - -@keras_export('keras.metrics.IoU') -class IoU(_IoUBase): - """Computes the Intersection-Over-Union metric for specific target classes. - - General definition and computation: - - Intersection-Over-Union is a common evaluation metric for semantic image - segmentation. - - For an individual class, the IoU metric is defined as follows: - - ``` - iou = true_positives / (true_positives + false_positives + false_negatives) - ``` - - To compute IoUs, the predictions are accumulated in a confusion matrix, - weighted by `sample_weight` and the metric is then calculated from it. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Note, this class first computes IoUs for all individual classes, then returns - the mean of IoUs for the classes that are specified by `target_class_ids`. If - `target_class_ids` has only one id value, the IoU of that specific class is - returned. - - Args: - num_classes: The possible number of labels the prediction task can have. - A confusion matrix of dimension = [num_classes, num_classes] will be - allocated to accumulate predictions from which the metric is calculated. - target_class_ids: A tuple or list of target class ids for which the metric - is returned. To compute IoU for a specific class, a list (or tuple) of a - single id value should be provided. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> # cm = [[1, 1], - >>> # [1, 1]] - >>> # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - >>> # iou = true_positives / (sum_row + sum_col - true_positives)) - >>> # iou = [0.33, 0.33] - >>> m = tf.keras.metrics.IoU(num_classes=2, target_class_ids=[0]) - >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1]) - >>> m.result().numpy() - 0.33333334 - - >>> m.reset_state() - >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1], - ... 
sample_weight=[0.3, 0.3, 0.3, 0.1]) - >>> # cm = [[0.3, 0.3], - >>> # [0.3, 0.1]] - >>> # sum_row = [0.6, 0.4], sum_col = [0.6, 0.4], true_positives = [0.3, 0.1] - >>> # iou = [0.33, 0.14] - >>> m.result().numpy() - 0.33333334 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.IoU(num_classes=2, target_class_ids=[0])]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__( - self, - num_classes: int, - target_class_ids: Union[List[int], Tuple[int, ...]], - name=None, - dtype=None, - ): - super().__init__( - name=name, - num_classes=num_classes, - dtype=dtype, - ) - if max(target_class_ids) >= num_classes: - raise ValueError( - f'Target class id {max(target_class_ids)} is out of range, which is ' - f'[{0}, {num_classes}).') - self.target_class_ids = list(target_class_ids) - - def result(self): - """Compute the intersection-over-union via the confusion matrix.""" - sum_over_row = tf.cast( - tf.reduce_sum(self.total_cm, axis=0), dtype=self._dtype) - sum_over_col = tf.cast( - tf.reduce_sum(self.total_cm, axis=1), dtype=self._dtype) - true_positives = tf.cast( - tf.linalg.tensor_diag_part(self.total_cm), dtype=self._dtype) - - # sum_over_row + sum_over_col = - # 2 * true_positives + false_positives + false_negatives. - denominator = sum_over_row + sum_over_col - true_positives - - # Only keep the target classes - true_positives = tf.gather(true_positives, self.target_class_ids) - denominator = tf.gather(denominator, self.target_class_ids) - - # If the denominator is 0, we need to ignore the class. - num_valid_entries = tf.reduce_sum( - tf.cast(tf.not_equal(denominator, 0), dtype=self._dtype)) - - iou = tf.math.divide_no_nan(true_positives, denominator) - - return tf.math.divide_no_nan( - tf.reduce_sum(iou, name='mean_iou'), num_valid_entries) - - def get_config(self): - config = { - 'num_classes': self.num_classes, - 'target_class_ids': self.target_class_ids, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - -@keras_export('keras.metrics.BinaryIoU') -class BinaryIoU(IoU): - """Computes the Intersection-Over-Union metric for class 0 and/or 1. - - General definition and computation: - - Intersection-Over-Union is a common evaluation metric for semantic image - segmentation. - - For an individual class, the IoU metric is defined as follows: - - ``` - iou = true_positives / (true_positives + false_positives + false_negatives) - ``` - - To compute IoUs, the predictions are accumulated in a confusion matrix, - weighted by `sample_weight` and the metric is then calculated from it. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - This class can be used to compute IoUs for a binary classification task where - the predictions are provided as logits. First a `threshold` is applied to the - predicted values such that those that are below the `threshold` are converted - to class 0 and those that are above the `threshold` are converted to class 1. - - IoUs for classes 0 and 1 are then computed, the mean of IoUs for the classes - that are specified by `target_class_ids` is returned. - - Note: with `threshold=0`, this metric has the same behavior as `IoU`. - - Args: - target_class_ids: A tuple or list of target class ids for which the metric - is returned. Options are `[0]`, `[1]`, or `[0, 1]`. With `[0]` (or `[1]`), - the IoU metric for class 0 (or class 1, respectively) is returned. 
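(Aside: pulling together `_IoUBase.update_state` and `IoU.result` above, a NumPy sketch that reproduces the doctest numbers: build the confusion matrix, then read the IoU off its diagonal. The expected values in the comments are hand-computed.)

```python
import numpy as np
import tensorflow as tf

y_true = tf.constant([0, 0, 1, 1])
y_pred = tf.constant([0, 1, 0, 1])
cm = tf.math.confusion_matrix(y_true, y_pred, num_classes=2)
cm = cm.numpy().astype(float)
# cm == [[1, 1],
#        [1, 1]]

sum_over_row = cm.sum(axis=0)   # as named in IoU.result above
sum_over_col = cm.sum(axis=1)
tp = np.diag(cm)
iou = tp / (sum_over_row + sum_over_col - tp)   # [1/3, 1/3]
print(iou[[0]].mean())  # target_class_ids=[0] -> 0.3333..., as in the doctest
```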
With - `[0, 1]`, the mean of IoUs for the two classes is returned. - threshold: A threshold that applies to the prediction logits to convert them - to either predicted class 0 if the logit is below `threshold` or predicted - class 1 if the logit is at or above `threshold`. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> m = tf.keras.metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.3) - >>> m.update_state([0, 1, 0, 1], [0.1, 0.2, 0.4, 0.7]) - >>> m.result().numpy() - 0.33333334 - - >>> m.reset_state() - >>> m.update_state([0, 1, 0, 1], [0.1, 0.2, 0.4, 0.7], - ... sample_weight=[0.2, 0.3, 0.4, 0.1]) - >>> # cm = [[0.2, 0.4], - >>> # [0.3, 0.1]] - >>> # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - >>> # iou = [0.222, 0.125] - >>> m.result().numpy() - 0.17361112 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.BinaryIoU(target_class_ids=[0], threshold=0.5)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__( - self, - target_class_ids: Union[List[int], Tuple[int, ...]] = (0, 1), - threshold=0.5, - name=None, - dtype=None, - ): - - super().__init__( - num_classes=2, - target_class_ids=target_class_ids, - name=name, - dtype=dtype, - ) - self.threshold = threshold - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates the confusion matrix statistics. - - Before the confusion matrix is updated, the predicted values are thresholded - to be: - 0 for values that are smaller than the `threshold` - 1 for values that are larger than or equal to the `threshold` - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - y_pred = tf.cast(y_pred, self._dtype) - y_pred = tf.cast(y_pred >= self.threshold, self._dtype) - return super().update_state(y_true, y_pred, sample_weight) - - def get_config(self): - return { - 'target_class_ids': self.target_class_ids, - 'threshold': self.threshold, - 'name': self.name, - 'dtype': self._dtype, - }
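The thresholding step above is easy to check by hand. Below is a minimal standalone sketch (plain TensorFlow, outside the metric class; variable names are illustrative only) that reproduces the confusion matrix behind the `0.33333334` doctest value:

```python
import tensorflow as tf

# Threshold the predictions the way BinaryIoU.update_state does, then
# accumulate an unweighted confusion matrix over the integer classes.
y_true = tf.constant([0, 1, 0, 1])
y_pred = tf.constant([0.1, 0.2, 0.4, 0.7])
y_pred_binary = tf.cast(y_pred >= 0.3, tf.int32)  # [0, 0, 1, 1]
cm = tf.math.confusion_matrix(y_true, y_pred_binary, num_classes=2)
print(cm.numpy())  # [[1 1] [1 1]] -> IoU = 1/3 for each class
```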
- - -@keras_export('keras.metrics.MeanIoU') -class MeanIoU(IoU): - """Computes the mean Intersection-Over-Union metric. - - General definition and computation: - - Intersection-Over-Union is a common evaluation metric for semantic image - segmentation. - - For an individual class, the IoU metric is defined as follows: - - ``` - iou = true_positives / (true_positives + false_positives + false_negatives) - ``` - - To compute IoUs, the predictions are accumulated in a confusion matrix, - weighted by `sample_weight` and the metric is then calculated from it. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Note that this class first computes IoUs for all individual classes, then - returns the mean of these values. - - Args: - num_classes: The possible number of labels the prediction task can have. - This value must be provided, since a confusion matrix of dimension = - [num_classes, num_classes] will be allocated. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> # cm = [[1, 1], - >>> # [1, 1]] - >>> # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - >>> # iou = true_positives / (sum_row + sum_col - true_positives) - >>> # result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 = 0.33 - >>> m = tf.keras.metrics.MeanIoU(num_classes=2) - >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1]) - >>> m.result().numpy() - 0.33333334 - - >>> m.reset_state() - >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1], - ... sample_weight=[0.3, 0.3, 0.3, 0.1]) - >>> m.result().numpy() - 0.23809525 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.MeanIoU(num_classes=2)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, num_classes, name=None, dtype=None): - target_class_ids = list(range(num_classes)) - super().__init__( - name=name, - num_classes=num_classes, - target_class_ids=target_class_ids, - dtype=dtype, - ) - - def get_config(self): - return { - 'num_classes': self.num_classes, - 'name': self.name, - 'dtype': self._dtype, - }
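The `result()` computation shared by these classes reduces to a few array operations. A small NumPy sketch of the same arithmetic (assuming a pre-accumulated confusion matrix; the zero-denominator handling mirrors what `divide_no_nan` achieves):

```python
import numpy as np

# Confusion matrix from the MeanIoU doctest above.
cm = np.array([[1.0, 1.0],
               [1.0, 1.0]])
sum_over_row = cm.sum(axis=0)      # per-class predicted totals
sum_over_col = cm.sum(axis=1)      # per-class ground-truth totals
true_positives = np.diag(cm)
# Row and column sums both count the diagonal, so subtract it once.
denominator = sum_over_row + sum_over_col - true_positives
valid = denominator != 0
iou = np.where(valid, true_positives / np.where(valid, denominator, 1.0), 0.0)
print(iou)                      # [0.33333333 0.33333333]
print(iou.sum() / valid.sum())  # 0.3333..., the doctest's mean IoU
```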
- - -@keras_export('keras.metrics.OneHotIoU') -class OneHotIoU(IoU): - """Computes the Intersection-Over-Union metric for one-hot encoded labels. - - General definition and computation: - - Intersection-Over-Union is a common evaluation metric for semantic image - segmentation. - - For an individual class, the IoU metric is defined as follows: - - ``` - iou = true_positives / (true_positives + false_positives + false_negatives) - ``` - - To compute IoUs, the predictions are accumulated in a confusion matrix, - weighted by `sample_weight` and the metric is then calculated from it. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - This class can be used to compute IoU for multi-class classification tasks - where the labels are one-hot encoded (the last axis should have one dimension - per class). Note that the predictions should also have the same shape. To - compute the IoU, first the labels and predictions are converted back into - integer format by taking the argmax over the class axis. Then the same - computation steps as for the base `IoU` class apply. - - Note that if there is only one channel in the labels and predictions, this class - is the same as class `IoU`. In this case, use `IoU` instead. - - Also, make sure that `num_classes` is equal to the number of classes in the - data, to avoid a "labels out of bound" error when the confusion matrix is - computed. - - Args: - num_classes: The possible number of labels the prediction task can have. - A confusion matrix of shape `(num_classes, num_classes)` will be - allocated to accumulate predictions from which the metric is calculated. - target_class_ids: A tuple or list of target class ids for which the metric - is returned. To compute IoU for a specific class, a list (or tuple) of a - single id value should be provided. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) - >>> y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], - ... [0.1, 0.4, 0.5]]) - >>> sample_weight = [0.1, 0.2, 0.3, 0.4] - >>> m = tf.keras.metrics.OneHotIoU(num_classes=3, target_class_ids=[0, 2]) - >>> m.update_state(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight) - >>> # cm = [[0, 0, 0.2+0.4], - >>> # [0.3, 0, 0], - >>> # [0, 0, 0.1]] - >>> # sum_row = [0.3, 0, 0.7], sum_col = [0.6, 0.3, 0.1] - >>> # true_positives = [0, 0, 0.1] - >>> # single_iou = true_positives / (sum_row + sum_col - true_positives) - >>> # mean_iou = (0 / (0.3 + 0.6 - 0) + 0.1 / (0.7 + 0.1 - 0.1)) / 2 - >>> m.result().numpy() - 0.071 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.OneHotIoU(num_classes=3, target_class_ids=[1])]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__( - self, - num_classes: int, - target_class_ids: Union[List[int], Tuple[int, ...]], - name=None, - dtype=None, - ): - super().__init__( - num_classes=num_classes, - target_class_ids=target_class_ids, - name=name, - dtype=dtype, - ) - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates the confusion matrix statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - # Take the argmax over the one-hot channels to recover integer class ids. - y_true = tf.argmax(y_true, axis=-1, output_type=tf.int32) - y_pred = tf.argmax(y_pred, axis=-1, output_type=tf.int32) - - return super().update_state(y_true, y_pred, sample_weight)
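The one-hot variants only add the argmax conversion shown in `update_state` above. A sketch that reproduces the weighted confusion matrix worked out in the doctest comments (the inputs are the doctest's own):

```python
import tensorflow as tf

# Convert one-hot labels and per-class scores to integer class ids, as
# OneHotIoU.update_state does, then accumulate the weighted matrix.
y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]])
y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7],
                      [0.5, 0.3, 0.1], [0.1, 0.4, 0.5]])
true_ids = tf.argmax(y_true, axis=-1)  # [2, 0, 1, 0]
pred_ids = tf.argmax(y_pred, axis=-1)  # [2, 2, 0, 2]
cm = tf.math.confusion_matrix(
    true_ids, pred_ids, num_classes=3,
    weights=tf.constant([0.1, 0.2, 0.3, 0.4]), dtype=tf.float32)
print(cm.numpy())  # [[0. 0. 0.6] [0.3 0. 0.] [0. 0. 0.1]]
```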
- - -@keras_export('keras.metrics.OneHotMeanIoU') -class OneHotMeanIoU(MeanIoU): - """Computes mean Intersection-Over-Union metric for one-hot encoded labels. - - General definition and computation: - - Intersection-Over-Union is a common evaluation metric for semantic image - segmentation. - - For an individual class, the IoU metric is defined as follows: - - ``` - iou = true_positives / (true_positives + false_positives + false_negatives) - ``` - - To compute IoUs, the predictions are accumulated in a confusion matrix, - weighted by `sample_weight` and the metric is then calculated from it. - - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - This class can be used to compute the mean IoU for multi-class classification - tasks where the labels are one-hot encoded (the last axis should have one - dimension per class). Note that the predictions should also have the same - shape. To compute the mean IoU, first the labels and predictions are converted - back into integer format by taking the argmax over the class axis. Then the - same computation steps as for the base `MeanIoU` class apply. - - Note that if there is only one channel in the labels and predictions, this class - is the same as class `MeanIoU`. In this case, use `MeanIoU` instead. - - Also, make sure that `num_classes` is equal to the number of classes in the - data, to avoid a "labels out of bound" error when the confusion matrix is - computed. - - Args: - num_classes: The possible number of labels the prediction task can have. - A confusion matrix of shape `(num_classes, num_classes)` will be - allocated to accumulate predictions from which the metric is calculated. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - Standalone usage: - - >>> y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) - >>> y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], - ... [0.1, 0.4, 0.5]]) - >>> sample_weight = [0.1, 0.2, 0.3, 0.4] - >>> m = tf.keras.metrics.OneHotMeanIoU(num_classes=3) - >>> m.update_state(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight) - >>> # cm = [[0, 0, 0.2+0.4], - >>> # [0.3, 0, 0], - >>> # [0, 0, 0.1]] - >>> # sum_row = [0.3, 0, 0.7], sum_col = [0.6, 0.3, 0.1] - >>> # true_positives = [0, 0, 0.1] - >>> # single_iou = true_positives / (sum_row + sum_col - true_positives) - >>> # mean_iou = (0 + 0 + 0.1 / (0.7 + 0.1 - 0.1)) / 3 - >>> m.result().numpy() - 0.048 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.OneHotMeanIoU(num_classes=3)]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__( - self, - num_classes: int, - name=None, - dtype=None, - ): - super().__init__( - num_classes=num_classes, - name=name, - dtype=dtype, - ) - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates the confusion matrix statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - Update op. - """ - # Take the argmax over the one-hot channels to recover integer class ids. - y_true = tf.argmax(y_true, axis=-1, output_type=tf.int32) - y_pred = tf.argmax(y_pred, axis=-1, output_type=tf.int32) - - return super().update_state(y_true, y_pred, sample_weight) - - -@keras_export('keras.metrics.BinaryCrossentropy') -class BinaryCrossentropy(base_metric.MeanMetricWrapper): - """Computes the crossentropy metric between the labels and predictions. - - This is the crossentropy metric class to be used when there are only two - label classes (0 and 1). - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - from_logits: (Optional) Whether output is expected to be a logits tensor. - By default, we consider that output encodes a probability distribution. - label_smoothing: (Optional) Float in [0, 1]. When > 0, label values are - smoothed, meaning the confidence on label values is relaxed. - e.g. `label_smoothing=0.2` means that we will use a value of `0.1` for - label `0` and `0.9` for label `1`. - - Standalone usage: - - >>> m = tf.keras.metrics.BinaryCrossentropy() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) - >>> m.result().numpy() - 0.81492424 - - >>> m.reset_state() - >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], - ... sample_weight=[1, 0]) - >>> m.result().numpy() - 0.9162905 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.BinaryCrossentropy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - name='binary_crossentropy', - dtype=None, - from_logits=False, - label_smoothing=0): - super().__init__( - binary_crossentropy, - name, - dtype=dtype, - from_logits=from_logits, - label_smoothing=label_smoothing)
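The `0.81492424` doctest value above can be re-derived by hand. A NumPy sketch of the same unweighted mean (it skips the epsilon clipping the real metric applies to `y_pred`):

```python
import numpy as np

y_true = np.array([[0.0, 1.0], [0.0, 0.0]])
y_pred = np.array([[0.6, 0.4], [0.4, 0.6]])
# Elementwise binary crossentropy, then the mean over all entries.
bce = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
print(bce.mean(axis=-1))  # per-sample values: [0.916..., 0.713...]
print(bce.mean())         # ~0.8149, matching m.result() above
```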
- - -@keras_export('keras.metrics.CategoricalCrossentropy') -class CategoricalCrossentropy(base_metric.MeanMetricWrapper): - """Computes the crossentropy metric between the labels and predictions. - - This is the crossentropy metric class to be used when there are multiple - label classes (2 or more). Here we assume that labels are given as a `one_hot` - representation. e.g., when label values are [2, 0, 1], - `y_true` = [[0, 0, 1], [1, 0, 0], [0, 1, 0]]. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - from_logits: (Optional) Whether output is expected to be a logits tensor. - By default, we consider that output encodes a probability distribution. - label_smoothing: (Optional) Float in [0, 1]. When > 0, label values are - smoothed, meaning the confidence on label values is relaxed. e.g. - `label_smoothing=0.2` means that we will use a value of `0.1` for label - `0` and `0.9` for label `1`. - - Standalone usage: - - >>> # EPSILON = 1e-7, y = y_true, y` = y_pred - >>> # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - >>> # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - >>> # xent = -sum(y * log(y'), axis = -1) - >>> # = -((log 0.95), (log 0.1)) - >>> # = [0.051, 2.302] - >>> # Reduced xent = (0.051 + 2.302) / 2 - >>> m = tf.keras.metrics.CategoricalCrossentropy() - >>> m.update_state([[0, 1, 0], [0, 0, 1]], - ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) - >>> m.result().numpy() - 1.1769392 - - >>> m.reset_state() - >>> m.update_state([[0, 1, 0], [0, 0, 1]], - ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], - ... sample_weight=tf.constant([0.3, 0.7])) - >>> m.result().numpy() - 1.6271976 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.CategoricalCrossentropy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - name='categorical_crossentropy', - dtype=None, - from_logits=False, - label_smoothing=0): - super().__init__( - categorical_crossentropy, - name, - dtype=dtype, - from_logits=from_logits, - label_smoothing=label_smoothing)
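The step-by-step comments in the doctest above translate directly into code. A sketch using the same EPSILON clipping the comments describe:

```python
import numpy as np

EPSILON = 1e-7
y_true = np.array([[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
y_pred = np.clip(
    np.array([[0.05, 0.95, 0.0], [0.1, 0.8, 0.1]]), EPSILON, 1.0 - EPSILON)
# xent = -sum(y_true * log(y_pred), axis=-1), then the unweighted mean.
xent = -np.sum(y_true * np.log(y_pred), axis=-1)
print(xent)         # [0.0513, 2.3026]
print(xent.mean())  # ~1.1769, matching m.result() above
```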
- - -@keras_export('keras.metrics.SparseCategoricalCrossentropy') -class SparseCategoricalCrossentropy(base_metric.MeanMetricWrapper): - """Computes the crossentropy metric between the labels and predictions. - - Use this crossentropy metric when there are two or more label classes. - We expect labels to be provided as integers. If you want to provide labels - using `one-hot` representation, please use `CategoricalCrossentropy` metric. - There should be `# classes` floating point values per feature for `y_pred` - and a single floating point value per feature for `y_true`. - - In the snippet below, there is a single floating point value per example for - `y_true` and `# classes` floating point values per example for `y_pred`. - The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is - `[batch_size, num_classes]`. - - Args: - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - from_logits: (Optional) Whether output is expected to be a logits tensor. - By default, we consider that output encodes a probability distribution. - axis: (Optional) Defaults to -1. The dimension along which the metric is - computed. - - Standalone usage: - - >>> # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] - >>> # logits = log(y_pred) - >>> # softmax = exp(logits) / sum(exp(logits), axis=-1) - >>> # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - >>> # xent = -sum(y * log(softmax), 1) - >>> # log(softmax) = [[-2.9957, -0.0513, -16.1181], - >>> # [-2.3026, -0.2231, -2.3026]] - >>> # y_true * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] - >>> # xent = [0.0513, 2.3026] - >>> # Reduced xent = (0.0513 + 2.3026) / 2 - >>> m = tf.keras.metrics.SparseCategoricalCrossentropy() - >>> m.update_state([1, 2], - ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) - >>> m.result().numpy() - 1.1769392 - - >>> m.reset_state() - >>> m.update_state([1, 2], - ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], - ... sample_weight=tf.constant([0.3, 0.7])) - >>> m.result().numpy() - 1.6271976 - - Usage with `compile()` API: - - ```python - model.compile( - optimizer='sgd', - loss='mse', - metrics=[tf.keras.metrics.SparseCategoricalCrossentropy()]) - ``` - """ - - @dtensor_utils.inject_mesh - def __init__(self, - name='sparse_categorical_crossentropy', - dtype=None, - from_logits=False, - axis=-1): - super().__init__( - sparse_categorical_crossentropy, - name, - dtype=dtype, - from_logits=from_logits, - axis=axis) - - -SparseCategoricalCrossentropy.update_state.__doc__ = _SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING - - -def accuracy(y_true, y_pred): - [y_pred, y_true], _ = \ - metrics_utils.ragged_assert_compatible_and_get_flat_values( - [y_pred, y_true]) - y_true.shape.assert_is_compatible_with(y_pred.shape) - if y_true.dtype != y_pred.dtype: - y_pred = tf.cast(y_pred, y_true.dtype) - return tf.cast(tf.equal(y_true, y_pred), backend.floatx()) - - -@keras_export('keras.metrics.binary_accuracy') -@tf.__internal__.dispatch.add_dispatch_support -def binary_accuracy(y_true, y_pred, threshold=0.5): - """Calculates how often predictions match binary labels. - - Standalone usage: - >>> y_true = [[1], [1], [0], [0]] - >>> y_pred = [[1], [1], [0], [0]] - >>> m = tf.keras.metrics.binary_accuracy(y_true, y_pred) - >>> assert m.shape == (4,) - >>> m.numpy() - array([1., 1., 1., 1.], dtype=float32) - - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - threshold: (Optional) Float representing the threshold for deciding whether - prediction values are 1 or 0. - - Returns: - Binary accuracy values. shape = `[batch_size, d0, .. dN-1]` - """ - # Note: calls metrics_utils.binary_matches with mean reduction. This maintains - # public facing binary_accuracy behavior and separates it from the vital - # behavior of the binary_matches method needed in backend dependencies. - - return tf.reduce_mean( - metrics_utils.binary_matches(y_true, y_pred, threshold), axis=-1) - - -@keras_export('keras.metrics.categorical_accuracy') -@tf.__internal__.dispatch.add_dispatch_support -def categorical_accuracy(y_true, y_pred): - """Calculates how often predictions match one-hot labels. - - Standalone usage: - >>> y_true = [[0, 0, 1], [0, 1, 0]] - >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] - >>> m = tf.keras.metrics.categorical_accuracy(y_true, y_pred) - >>> assert m.shape == (2,) - >>> m.numpy() - array([0., 1.], dtype=float32) - - You can provide logits of classes as `y_pred`, since argmax of - logits and probabilities are the same. - - Args: - y_true: One-hot ground truth values. - y_pred: The prediction values. - - Returns: - Categorical accuracy values. - """ - # Note: wraps metrics_utils.categorical_matches. This separates public facing - # categorical_accuracy behavior from the vital behavior of the - # categorical_matches method needed in backend dependencies. - - return metrics_utils.sparse_categorical_matches( - tf.math.argmax(y_true, axis=-1), y_pred)
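As the note above says, `categorical_accuracy` reduces to comparing argmaxes. A sketch of that equivalence on the doctest inputs (the real function routes through `metrics_utils` instead):

```python
import tensorflow as tf

y_true = tf.constant([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0]])
y_pred = tf.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0.0]])
# A prediction matches when both argmaxes select the same class index.
matches = tf.cast(
    tf.equal(tf.argmax(y_true, axis=-1), tf.argmax(y_pred, axis=-1)),
    tf.float32)
print(matches.numpy())  # [0. 1.], matching the doctest above
```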
- """ - # Note: wraps metrics_utils.categorical_matches. This seperates public facing - # categorical_accuracy behavior from the vital behavior of the - # categorical_matches method needed in backend dependencies. - - return metrics_utils.sparse_categorical_matches( - tf.math.argmax(y_true, axis=-1), y_pred) - - -@keras_export('keras.metrics.sparse_categorical_accuracy') -@tf.__internal__.dispatch.add_dispatch_support -def sparse_categorical_accuracy(y_true, y_pred): - """Calculates how often predictions match integer labels. - - Standalone usage: - >>> y_true = [2, 1] - >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] - >>> m = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred) - >>> assert m.shape == (2,) - >>> m.numpy() - array([0., 1.], dtype=float32) - - You can provide logits of classes as `y_pred`, since argmax of - logits and probabilities are same. - - Args: - y_true: Integer ground truth values. - y_pred: The prediction values. - - Returns: - Sparse categorical accuracy values. - """ - # Note: wraps metrics_utils.sparse_categorical_matches method and checks for - # squeezing to align with expected public facing behavior. This seperates - # public facing sparse_categorical_accuracy behavior from the vital behavior - # of the sparse_categorical_matches method needed in backend dependencies. - - matches = metrics_utils.sparse_categorical_matches(y_true, y_pred) - - # if shape is (num_samples, 1) squeeze - if matches.shape.ndims > 1 and matches.shape[-1] == 1: - matches = tf.squeeze(matches, [-1]) - - return matches - - -@keras_export('keras.metrics.top_k_categorical_accuracy') -@tf.__internal__.dispatch.add_dispatch_support -def top_k_categorical_accuracy(y_true, y_pred, k=5): - """Computes how often targets are in the top `K` predictions. - - Standalone usage: - >>> y_true = [[0, 0, 1], [0, 1, 0]] - >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] - >>> m = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3) - >>> assert m.shape == (2,) - >>> m.numpy() - array([1., 1.], dtype=float32) - - Args: - y_true: The ground truth values. - y_pred: The prediction values. - k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. - - Returns: - Top K categorical accuracy value. - """ - # Note: wraps metrics_utils.top_k_categorical_matches. This seperates - # public facing top_k_categorical_accuracy behavior from the vital behavior - # of the top_k_categorical_matches method needed in backend dependencies. - - return metrics_utils.sparse_top_k_categorical_matches( - tf.math.argmax(y_true, axis=-1), y_pred, k) - - -@keras_export('keras.metrics.sparse_top_k_categorical_accuracy') -@tf.__internal__.dispatch.add_dispatch_support -def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): - """Computes how often integer targets are in the top `K` predictions. - - Standalone usage: - >>> y_true = [2, 1] - >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] - >>> m = tf.keras.metrics.sparse_top_k_categorical_accuracy( - ... y_true, y_pred, k=3) - >>> assert m.shape == (2,) - >>> m.numpy() - array([1., 1.], dtype=float32) - - Args: - y_true: tensor of true targets. - y_pred: tensor of predicted targets. - k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. - - Returns: - Sparse top K categorical accuracy value. - """ - # Note: wraps metrics_utils.sparse_top_k_categorical_matches. 
- - -@keras_export('keras.metrics.sparse_top_k_categorical_accuracy') -@tf.__internal__.dispatch.add_dispatch_support -def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): - """Computes how often integer targets are in the top `K` predictions. - - Standalone usage: - >>> y_true = [2, 1] - >>> y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] - >>> m = tf.keras.metrics.sparse_top_k_categorical_accuracy( - ... y_true, y_pred, k=3) - >>> assert m.shape == (2,) - >>> m.numpy() - array([1., 1.], dtype=float32) - - Args: - y_true: tensor of true targets. - y_pred: tensor of predicted targets. - k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. - - Returns: - Sparse top K categorical accuracy value. - """ - # Note: wraps metrics_utils.sparse_top_k_categorical_matches. This separates - # public facing sparse_top_k_categorical_accuracy behavior from the vital - # behavior of the sparse_top_k_categorical_matches method needed in backend - # dependencies. - - return metrics_utils.sparse_top_k_categorical_matches(y_true, y_pred, k) - - -def cosine_similarity(y_true, y_pred, axis=-1): - """Computes the cosine similarity between labels and predictions. - - Args: - y_true: The ground truth values. - y_pred: The prediction values. - axis: (Optional) Defaults to -1. The dimension along which the cosine - similarity is computed. - - Returns: - Cosine similarity value. - """ - y_true = tf.linalg.l2_normalize(y_true, axis=axis) - y_pred = tf.linalg.l2_normalize(y_pred, axis=axis) - return tf.reduce_sum(y_true * y_pred, axis=axis) diff --git a/keras/metrics/metrics_correctness_test.py b/keras/metrics/metrics_correctness_test.py index a3566d39df8c..6532a151252f 100644 --- a/keras/metrics/metrics_correctness_test.py +++ b/keras/metrics/metrics_correctness_test.py @@ -14,700 +14,810 @@ # ============================================================================== """Tests metrics correctness using Keras model.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations from keras import layers from keras import losses from keras import metrics +from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import losses_utils def get_multi_io_model(): - inp_1 = layers.Input(shape=(1,), name='input_1') - inp_2 = layers.Input(shape=(1,), name='input_2') - x = layers.Dense(3, kernel_initializer='ones', trainable=False) - out_1 = layers.Dense( - 1, kernel_initializer='ones', name='output_1', trainable=False) - out_2 = layers.Dense( - 1, kernel_initializer='ones', name='output_2', trainable=False) + inp_1 = layers.Input(shape=(1,), name="input_1") + inp_2 = layers.Input(shape=(1,), name="input_2") + x = layers.Dense(3, kernel_initializer="ones", trainable=False) + out_1 = layers.Dense( + 1, kernel_initializer="ones", name="output_1", trainable=False + ) + out_2 = layers.Dense( + 1, kernel_initializer="ones", name="output_2", trainable=False + ) - branch_a = [inp_1, x, out_1] - branch_b = [inp_2, x, out_2] - return test_utils.get_multi_io_model(branch_a, branch_b) + branch_a = [inp_1, x, out_1] + branch_b = [inp_2, x, out_2] + return test_utils.get_multi_io_model(branch_a, branch_b) def custom_generator_multi_io(sample_weights=None): - batch_size = 2 - num_samples = 5 - inputs = np.asarray([[1.], [2.], [3.], [4.], [5.]]) - targets_1 = np.asarray([[2.], [4.], [6.], [8.], [10.]]) - targets_2 = np.asarray([[1.], [2.], [3.], [4.], [5.]]) - start = 0 - while True: - if start > num_samples: - start = 0 - end = start + batch_size - x = [inputs[start:end], inputs[start:end]] - y = [targets_1[start:end], targets_2[start:end]] - if sample_weights: - sw = tf.nest.map_structure(lambda w: w[start:end], sample_weights) - else: - sw = None - start = end - yield x, y, sw - - -@test_combinations.run_with_all_model_types(exclude_models=['sequential']) + batch_size = 2 + num_samples = 5 + inputs = np.asarray([[1.0], [2.0], [3.0], [4.0], [5.0]]) + targets_1 = np.asarray([[2.0], [4.0], [6.0], [8.0], [10.0]]) + targets_2 = np.asarray([[1.0], [2.0], [3.0], [4.0], [5.0]]) + start = 0 + while True: + if start > num_samples: + start = 0 + end = start + batch_size + x = [inputs[start:end], inputs[start:end]] + y = [targets_1[start:end],
targets_2[start:end]] + if sample_weights: + sw = tf.nest.map_structure(lambda w: w[start:end], sample_weights) + else: + sw = None + start = end + yield x, y, sw + + +@test_combinations.run_with_all_model_types(exclude_models=["sequential"]) @test_combinations.run_all_keras_modes(always_skip_v1=True) class TestMetricsCorrectnessMultiIO(test_combinations.TestCase): - - def _get_compiled_multi_io_model(self): - model = get_multi_io_model() - model.compile( - optimizer='rmsprop', - loss='mse', - metrics=[metrics.MeanSquaredError(name='mean_squared_error')], - weighted_metrics=[ - metrics.MeanSquaredError(name='mean_squared_error_2') - ], - run_eagerly=test_utils.should_run_eagerly()) - return model - - def setUp(self): - super(TestMetricsCorrectnessMultiIO, self).setUp() - self.x = np.asarray([[1.], [2.], [3.], [4.], [5.]]) - self.y1 = np.asarray([[2.], [4.], [6.], [8.], [10.]]) - self.y2 = np.asarray([[1.], [2.], [3.], [4.], [5.]]) - self.sample_weight_1 = np.asarray([2., 3., 4., 5., 6.]) - self.sample_weight_2 = np.asarray([3.5, 2.5, 1.5, 0.5, 3.]) - - # y_true_1 = [[2.], [4.], [6.], [8.], [10.]] - # y_pred_1 = [[3.], [6.], [9.], [12.], [15.]] - # y_true_2 = [[1.], [2.], [3.], [4.], [5.]] - # y_pred_2 = [[3.], [6.], [9.], [12.], [15.]] - - # Weighted metric `output_1`: - # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + - # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) + - # ((15 - 10)^2 * 6) - # = 280 - # Count = (2 + 3) + (4 + 5) + 6 = 20 - # Result = 14 - - # Weighted metric `output_2`: - # Total = ((3 - 1)^2 * 3.5 + (6 - 2)^2 * 2.5) + - # ((9 - 3)^2 * 1.5 + (12 - 4)^2 * 0.5) + - # (15 - 5)^2 * 3.0 - # = 440 - # Count = (3.5 + 2.5) + (1.5 + 0.5) + 3.0 = 11.0 - # Result = 40 - - # Loss `output_1` with weights: - # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + - # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) + - # ((15 - 10)^2 * 6) - # = 280 - # Count = 2 + 2 + 1 - # Result = 56 - - # Loss `output_1` without weights/Metric `output_1`: - # Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) + (15 - 10)^2 - # = 55 - # Count = 2 + 2 + 1 - # Result = 11 - - # Loss `output_2` with weights: - # Total = ((3 - 1)^2 * 3.5 + (6 - 2)^2 * 2.5) + - # ((9 - 3)^2 * 1.5 + (12 - 4)^2 * 0.5) + - # (15 - 5)^2 * 3.0 - # = 440 - # Count = 2 + 2 + 1 - # Result = 88 - - # Loss `output_2` without weights/Metric `output_2`: - # Total = ((3 - 1)^2 + (6 - 2)^2) + ((9 - 3)^2 + (12 - 4)^2) + (15 - 5)^2 - # = 220 - # Count = 2 + 2 + 1 - # Result = 44 - - # Total loss with weights = 56 + 88 = 144 - # Total loss without weights = 11 + 44 = 55 - - self.wmse = 'mean_squared_error_2' - self.expected_fit_result_with_weights = { - 'output_1_mean_squared_error': [11, 11], - 'output_2_mean_squared_error': [44, 44], - 'output_1_' + self.wmse: [14, 14], - 'output_2_' + self.wmse: [40, 40], - 'loss': [144, 144], - 'output_1_loss': [56, 56], - 'output_2_loss': [88, 88], - } - - self.expected_fit_result_with_weights_output_2 = { - 'output_1_mean_squared_error': [11, 11], - 'output_2_mean_squared_error': [44, 44], - 'output_1_' + self.wmse: [11, 11], - 'output_2_' + self.wmse: [40, 40], - 'loss': [99, 99], - 'output_1_loss': [11, 11], - 'output_2_loss': [88, 88], - } - - self.expected_fit_result = { - 'output_1_mean_squared_error': [11, 11], - 'output_2_mean_squared_error': [44, 44], - 'output_1_' + self.wmse: [11, 11], - 'output_2_' + self.wmse: [44, 44], - 'loss': [55, 55], - 'output_1_loss': [11, 11], - 'output_2_loss': [44, 44], - } - - # In the order: 'loss', 'output_1_loss', 'output_2_loss', - # 'output_1_mean_squared_error', 
'output_1_mean_squared_error_2', - # 'output_2_mean_squared_error', 'output_2_mean_squared_error_2' - self.expected_batch_result_with_weights = [144, 56, 88, 11, 14, 44, 40] - self.expected_batch_result_with_weights_output_2 = [ - 99, 11, 88, 11, 11, 44, 40 - ] - self.expected_batch_result = [55, 11, 44, 11, 11, 44, 44] - - def test_fit(self): - model = self._get_compiled_multi_io_model() - history = model.fit([self.x, self.x], [self.y1, self.y2], - batch_size=2, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_fit_with_sample_weight(self): - model = self._get_compiled_multi_io_model() - history = model.fit([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }, - batch_size=2, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - # Set weights for one output (use batch size). - history = model.fit([self.x, self.x], [self.y1, self.y2], - sample_weight={'output_2': self.sample_weight_2}, - batch_size=2, - epochs=2, - shuffle=False) - - for key, value in self.expected_fit_result_with_weights_output_2.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_eval(self): - model = self._get_compiled_multi_io_model() - eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], - batch_size=2) - self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) - - def test_eval_with_sample_weight(self): - model = self._get_compiled_multi_io_model() - eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], - batch_size=2, - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(eval_result, self.expected_batch_result_with_weights, - 1e-3) - - # Set weights for one output. - model = self._get_compiled_multi_io_model() - eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], - batch_size=2, - sample_weight={ - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(eval_result, - self.expected_batch_result_with_weights_output_2, 1e-3) - - # Verify that metric value is same with arbitrary weights and batch size. - x = np.random.random((50, 1)) - y = np.random.random((50, 1)) - w = np.random.random((50,)) - mse1 = model.evaluate([x, x], [y, y], sample_weight=[w, w], batch_size=5)[3] - mse2 = model.evaluate([x, x], [y, y], sample_weight=[w, w], - batch_size=10)[3] - self.assertAllClose(mse1, mse2, 1e-3) - - def test_train_on_batch(self): - model = self._get_compiled_multi_io_model() - result = model.train_on_batch([self.x, self.x], [self.y1, self.y2]) - self.assertAllClose(result, self.expected_batch_result, 1e-3) - - def test_train_on_batch_with_sample_weight(self): - model = self._get_compiled_multi_io_model() - result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - # Set weights for one output. 
- result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, - self.expected_batch_result_with_weights_output_2, 1e-3) - - def test_test_on_batch(self): - model = self._get_compiled_multi_io_model() - result = model.test_on_batch([self.x, self.x], [self.y1, self.y2]) - self.assertAllClose(result, self.expected_batch_result, 1e-3) - - def test_test_on_batch_with_sample_weight(self): - model = self._get_compiled_multi_io_model() - result = model.test_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - # Set weights for one output. - result = model.test_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, - self.expected_batch_result_with_weights_output_2, 1e-3) - - def test_fit_generator(self): - model = self._get_compiled_multi_io_model() - history = model.fit_generator( - custom_generator_multi_io(), steps_per_epoch=3, epochs=2) - for key, value in self.expected_fit_result.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_fit_generator_with_sample_weight(self): - model = self._get_compiled_multi_io_model() - history = model.fit_generator( - custom_generator_multi_io( - sample_weights=[self.sample_weight_1, self.sample_weight_2]), - steps_per_epoch=3, - epochs=2) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - # Set weights for one output. - history = model.fit_generator( - custom_generator_multi_io( - sample_weights={'output_2': self.sample_weight_2}), - steps_per_epoch=3, - epochs=2) - for key, value in self.expected_fit_result_with_weights_output_2.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_eval_generator(self): - model = self._get_compiled_multi_io_model() - eval_result = model.evaluate_generator(custom_generator_multi_io(), steps=3) - self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) - - def test_eval_generator_with_sample_weight(self): - model = self._get_compiled_multi_io_model() - eval_result = model.evaluate_generator( - custom_generator_multi_io( - sample_weights=[self.sample_weight_1, self.sample_weight_2]), - steps=3) - self.assertAllClose(eval_result, self.expected_batch_result_with_weights, - 1e-3) - - # Set weights for one output. 
- eval_result = model.evaluate_generator( - custom_generator_multi_io( - sample_weights={'output_2': self.sample_weight_2}), - steps=3) - self.assertAllClose(eval_result, - self.expected_batch_result_with_weights_output_2, 1e-3) + def _get_compiled_multi_io_model(self): + model = get_multi_io_model() + model.compile( + optimizer="rmsprop", + loss="mse", + metrics=[metrics.MeanSquaredError(name="mean_squared_error")], + weighted_metrics=[ + metrics.MeanSquaredError(name="mean_squared_error_2") + ], + run_eagerly=test_utils.should_run_eagerly(), + ) + return model + + def setUp(self): + super(TestMetricsCorrectnessMultiIO, self).setUp() + self.x = np.asarray([[1.0], [2.0], [3.0], [4.0], [5.0]]) + self.y1 = np.asarray([[2.0], [4.0], [6.0], [8.0], [10.0]]) + self.y2 = np.asarray([[1.0], [2.0], [3.0], [4.0], [5.0]]) + self.sample_weight_1 = np.asarray([2.0, 3.0, 4.0, 5.0, 6.0]) + self.sample_weight_2 = np.asarray([3.5, 2.5, 1.5, 0.5, 3.0]) + + # y_true_1 = [[2.], [4.], [6.], [8.], [10.]] + # y_pred_1 = [[3.], [6.], [9.], [12.], [15.]] + # y_true_2 = [[1.], [2.], [3.], [4.], [5.]] + # y_pred_2 = [[3.], [6.], [9.], [12.], [15.]] + + # Weighted metric `output_1`: + # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + + # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) + + # ((15 - 10)^2 * 6) + # = 280 + # Count = (2 + 3) + (4 + 5) + 6 = 20 + # Result = 14 + + # Weighted metric `output_2`: + # Total = ((3 - 1)^2 * 3.5 + (6 - 2)^2 * 2.5) + + # ((9 - 3)^2 * 1.5 + (12 - 4)^2 * 0.5) + + # (15 - 5)^2 * 3.0 + # = 440 + # Count = (3.5 + 2.5) + (1.5 + 0.5) + 3.0 = 11.0 + # Result = 40 + + # Loss `output_1` with weights: + # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + + # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) + + # ((15 - 10)^2 * 6) + # = 280 + # Count = 2 + 2 + 1 + # Result = 56 + + # Loss `output_1` without weights/Metric `output_1`: + # Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + \ + # (12 - 8)^2) + (15 - 10)^2 + # = 55 + # Count = 2 + 2 + 1 + # Result = 11 + + # Loss `output_2` with weights: + # Total = ((3 - 1)^2 * 3.5 + (6 - 2)^2 * 2.5) + + # ((9 - 3)^2 * 1.5 + (12 - 4)^2 * 0.5) + + # (15 - 5)^2 * 3.0 + # = 440 + # Count = 2 + 2 + 1 + # Result = 88 + + # Loss `output_2` without weights/Metric `output_2`: + # Total = ((3 - 1)^2 + (6 - 2)^2) + ((9 - 3)^2 + \ + # (12 - 4)^2) + (15 - 5)^2 + # = 220 + # Count = 2 + 2 + 1 + # Result = 44 + + # Total loss with weights = 56 + 88 = 144 + # Total loss without weights = 11 + 44 = 55 + + self.wmse = "mean_squared_error_2" + self.expected_fit_result_with_weights = { + "output_1_mean_squared_error": [11, 11], + "output_2_mean_squared_error": [44, 44], + "output_1_" + self.wmse: [14, 14], + "output_2_" + self.wmse: [40, 40], + "loss": [144, 144], + "output_1_loss": [56, 56], + "output_2_loss": [88, 88], + } + + self.expected_fit_result_with_weights_output_2 = { + "output_1_mean_squared_error": [11, 11], + "output_2_mean_squared_error": [44, 44], + "output_1_" + self.wmse: [11, 11], + "output_2_" + self.wmse: [40, 40], + "loss": [99, 99], + "output_1_loss": [11, 11], + "output_2_loss": [88, 88], + } + + self.expected_fit_result = { + "output_1_mean_squared_error": [11, 11], + "output_2_mean_squared_error": [44, 44], + "output_1_" + self.wmse: [11, 11], + "output_2_" + self.wmse: [44, 44], + "loss": [55, 55], + "output_1_loss": [11, 11], + "output_2_loss": [44, 44], + } + + # In the order: 'loss', 'output_1_loss', 'output_2_loss', + # 'output_1_mean_squared_error', 'output_1_mean_squared_error_2', + # 'output_2_mean_squared_error', 'output_2_mean_squared_error_2' + 
self.expected_batch_result_with_weights = [144, 56, 88, 11, 14, 44, 40] + self.expected_batch_result_with_weights_output_2 = [ + 99, + 11, + 88, + 11, + 11, + 44, + 40, + ] + self.expected_batch_result = [55, 11, 44, 11, 11, 44, 44] + + def test_fit(self): + model = self._get_compiled_multi_io_model() + history = model.fit( + [self.x, self.x], + [self.y1, self.y2], + batch_size=2, + epochs=2, + shuffle=False, + ) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_fit_with_sample_weight(self): + model = self._get_compiled_multi_io_model() + history = model.fit( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + batch_size=2, + epochs=2, + shuffle=False, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + # Set weights for one output (use batch size). + history = model.fit( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={"output_2": self.sample_weight_2}, + batch_size=2, + epochs=2, + shuffle=False, + ) + + for ( + key, + value, + ) in self.expected_fit_result_with_weights_output_2.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_eval(self): + model = self._get_compiled_multi_io_model() + eval_result = model.evaluate( + [self.x, self.x], [self.y1, self.y2], batch_size=2 + ) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + def test_eval_with_sample_weight(self): + model = self._get_compiled_multi_io_model() + eval_result = model.evaluate( + [self.x, self.x], + [self.y1, self.y2], + batch_size=2, + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + eval_result, self.expected_batch_result_with_weights, 1e-3 + ) + + # Set weights for one output. + model = self._get_compiled_multi_io_model() + eval_result = model.evaluate( + [self.x, self.x], + [self.y1, self.y2], + batch_size=2, + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + eval_result, self.expected_batch_result_with_weights_output_2, 1e-3 + ) + + # Verify that metric value is same with arbitrary weights and batch + # size. + x = np.random.random((50, 1)) + y = np.random.random((50, 1)) + w = np.random.random((50,)) + mse1 = model.evaluate( + [x, x], [y, y], sample_weight=[w, w], batch_size=5 + )[3] + mse2 = model.evaluate( + [x, x], [y, y], sample_weight=[w, w], batch_size=10 + )[3] + self.assertAllClose(mse1, mse2, 1e-3) + + def test_train_on_batch(self): + model = self._get_compiled_multi_io_model() + result = model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + def test_train_on_batch_with_sample_weight(self): + model = self._get_compiled_multi_io_model() + result = model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights, 1e-3 + ) + + # Set weights for one output. 
+ result = model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights_output_2, 1e-3 + ) + + def test_test_on_batch(self): + model = self._get_compiled_multi_io_model() + result = model.test_on_batch([self.x, self.x], [self.y1, self.y2]) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + def test_test_on_batch_with_sample_weight(self): + model = self._get_compiled_multi_io_model() + result = model.test_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights, 1e-3 + ) + + # Set weights for one output. + result = model.test_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights_output_2, 1e-3 + ) + + def test_fit_generator(self): + model = self._get_compiled_multi_io_model() + history = model.fit_generator( + custom_generator_multi_io(), steps_per_epoch=3, epochs=2 + ) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_fit_generator_with_sample_weight(self): + model = self._get_compiled_multi_io_model() + history = model.fit_generator( + custom_generator_multi_io( + sample_weights=[self.sample_weight_1, self.sample_weight_2] + ), + steps_per_epoch=3, + epochs=2, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + # Set weights for one output. + history = model.fit_generator( + custom_generator_multi_io( + sample_weights={"output_2": self.sample_weight_2} + ), + steps_per_epoch=3, + epochs=2, + ) + for ( + key, + value, + ) in self.expected_fit_result_with_weights_output_2.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_eval_generator(self): + model = self._get_compiled_multi_io_model() + eval_result = model.evaluate_generator( + custom_generator_multi_io(), steps=3 + ) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + def test_eval_generator_with_sample_weight(self): + model = self._get_compiled_multi_io_model() + eval_result = model.evaluate_generator( + custom_generator_multi_io( + sample_weights=[self.sample_weight_1, self.sample_weight_2] + ), + steps=3, + ) + self.assertAllClose( + eval_result, self.expected_batch_result_with_weights, 1e-3 + ) + + # Set weights for one output. 
+ eval_result = model.evaluate_generator( + custom_generator_multi_io( + sample_weights={"output_2": self.sample_weight_2} + ), + steps=3, + ) + self.assertAllClose( + eval_result, self.expected_batch_result_with_weights_output_2, 1e-3 + ) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes(always_skip_v1=True) class TestMetricsCorrectnessSingleIO(test_combinations.TestCase): - - def _get_model(self): - x = layers.Dense(3, kernel_initializer='ones', trainable=False) - out = layers.Dense( - 1, kernel_initializer='ones', name='output', trainable=False) - model = test_utils.get_model_from_layers([x, out], input_shape=(1,)) - model.compile( - optimizer='rmsprop', - loss='mse', - metrics=[metrics.MeanSquaredError(name='mean_squared_error')], - weighted_metrics=[ - metrics.MeanSquaredError(name='mean_squared_error_2') - ], - run_eagerly=test_utils.should_run_eagerly()) - return model - - def _custom_generator(self, sample_weight=None): - batch_size = 2 - num_samples = 4 - x = np.asarray([[1.], [2.], [3.], [4.]]) - y = np.asarray([[2.], [4.], [6.], [8.]]) - w = sample_weight - i = 0 - - while True: - batch_index = i * batch_size % num_samples - i += 1 - start = batch_index - end = start + batch_size - yield x[start:end], y[start:end], None if w is None else w[start:end] - - def setUp(self): - super(TestMetricsCorrectnessSingleIO, self).setUp() - self.x = np.asarray([[1.], [2.], [3.], [4.]]) - self.y = np.asarray([[2.], [4.], [6.], [8.]]) - self.sample_weight = np.asarray([2., 3., 4., 5.]) - self.class_weight = {i: 1 for i in range(10)} - self.class_weight.update({2: 2, 4: 3, 6: 4, 8: 5}) - - # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]] - - # Metric: - # Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30, - # Count = 2 + 2 - # Result = 7.5 - - # Weighted metric: - # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + - # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) - # = 130 - # Count = (2 + 3) + (4 + 5) - # Result = 9.2857141 - - # Total loss with weights: - # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + - # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) - # = 130, - # Count = 2 + 2 - # Result = 32.5 - - # Total loss without weights: - # Total = ((3 - 2)^2 + (6 - 4)^2) + - # ((9 - 6)^2 + (12 - 8)^2) - # = 30, - # Count = 2 + 2 - # Result = 7.5 - - wmse = 'mean_squared_error_2' - - self.expected_fit_result_with_weights = { - 'mean_squared_error': [7.5, 7.5], - wmse: [9.286, 9.286], - 'loss': [32.5, 32.5] - } - - self.expected_fit_result = { - 'mean_squared_error': [7.5, 7.5], - wmse: [7.5, 7.5], - 'loss': [7.5, 7.5] - } - - # In the order: 'loss', 'mean_squared_error', 'mean_squared_error_2' - self.expected_batch_result_with_weights = [32.5, 7.5, 9.286] - self.expected_batch_result = [7.5, 7.5, 7.5] - - def test_fit(self): - model = self._get_model() - - history = model.fit( - self.x, - self.y, - batch_size=2, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_fit_with_sample_weight(self): - model = self._get_model() - history = model.fit( - self.x, - self.y, - sample_weight=self.sample_weight, - batch_size=2, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_fit_with_class_weight(self): - model = self._get_model() - history = model.fit( - self.x, - self.y, - class_weight=self.class_weight, - batch_size=2, - epochs=2, - shuffle=False) - for key, value 
in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_eval(self): - model = self._get_model() - eval_result = model.evaluate(self.x, self.y, batch_size=2) - self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) - - def test_eval_with_sample_weight(self): - model = self._get_model() - eval_result = model.evaluate( - self.x, self.y, batch_size=2, sample_weight=self.sample_weight) - self.assertAllClose(eval_result, self.expected_batch_result_with_weights, - 1e-3) - - # Verify that metric value is same with arbitrary weights and batch size. - x = np.random.random((50, 1)) - y = np.random.random((50, 1)) - w = np.random.random((50,)) - mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[1] - mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[1] - self.assertAllClose(mse1, mse2, 1e-3) - - def test_train_on_batch(self): - model = self._get_model() - result = model.train_on_batch(self.x, self.y) - self.assertAllClose(result, self.expected_batch_result, 1e-3) - - def test_train_on_batch_with_sample_weight(self): - model = self._get_model() - result = model.train_on_batch( - self.x, self.y, sample_weight=self.sample_weight) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - def test_train_on_batch_with_class_weight(self): - model = self._get_model() - result = model.train_on_batch( - self.x, self.y, class_weight=self.class_weight) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - def test_test_on_batch(self): - model = self._get_model() - result = model.test_on_batch(self.x, self.y) - self.assertAllClose(result, self.expected_batch_result, 1e-3) - - def test_test_on_batch_with_sample_weight(self): - model = self._get_model() - result = model.test_on_batch( - self.x, self.y, sample_weight=self.sample_weight) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - def test_fit_generator(self): - model = self._get_model() - history = model.fit_generator( - self._custom_generator(), steps_per_epoch=2, epochs=2) - for key, value in self.expected_fit_result.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_fit_generator_with_sample_weight(self): - model = self._get_model() - history = model.fit_generator( - self._custom_generator(sample_weight=self.sample_weight), - steps_per_epoch=2, - epochs=2) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_fit_generator_with_class_weight(self): - model = self._get_model() - history = model.fit_generator( - self._custom_generator(), - steps_per_epoch=2, - epochs=2, - class_weight=self.class_weight) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_eval_generator(self): - model = self._get_model() - eval_result = model.evaluate_generator(self._custom_generator(), steps=2) - self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) - - def test_eval_generator_with_sample_weight(self): - model = self._get_model() - eval_result = model.evaluate_generator( - self._custom_generator(sample_weight=self.sample_weight), steps=2) - self.assertAllClose(eval_result, self.expected_batch_result_with_weights, - 1e-3) - - -@test_combinations.run_with_all_model_types(exclude_models=['sequential']) + def _get_model(self): + x = layers.Dense(3, kernel_initializer="ones", trainable=False) + out = 
layers.Dense( + 1, kernel_initializer="ones", name="output", trainable=False + ) + model = test_utils.get_model_from_layers([x, out], input_shape=(1,)) + model.compile( + optimizer="rmsprop", + loss="mse", + metrics=[metrics.MeanSquaredError(name="mean_squared_error")], + weighted_metrics=[ + metrics.MeanSquaredError(name="mean_squared_error_2") + ], + run_eagerly=test_utils.should_run_eagerly(), + ) + return model + + def _custom_generator(self, sample_weight=None): + batch_size = 2 + num_samples = 4 + x = np.asarray([[1.0], [2.0], [3.0], [4.0]]) + y = np.asarray([[2.0], [4.0], [6.0], [8.0]]) + w = sample_weight + i = 0 + + while True: + batch_index = i * batch_size % num_samples + i += 1 + start = batch_index + end = start + batch_size + yield x[start:end], y[start:end], None if w is None else w[ + start:end + ] + + def setUp(self): + super(TestMetricsCorrectnessSingleIO, self).setUp() + self.x = np.asarray([[1.0], [2.0], [3.0], [4.0]]) + self.y = np.asarray([[2.0], [4.0], [6.0], [8.0]]) + self.sample_weight = np.asarray([2.0, 3.0, 4.0, 5.0]) + self.class_weight = {i: 1 for i in range(10)} + self.class_weight.update({2: 2, 4: 3, 6: 4, 8: 5}) + + # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]] + + # Metric: + # Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30, + # Count = 2 + 2 + # Result = 7.5 + + # Weighted metric: + # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + + # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) + # = 130 + # Count = (2 + 3) + (4 + 5) + # Result = 9.2857141 + + # Total loss with weights: + # Total = ((3 - 2)^2 * 2 + (6 - 4)^2 * 3) + + # ((9 - 6)^2 * 4 + (12 - 8)^2 * 5) + # = 130, + # Count = 2 + 2 + # Result = 32.5 + + # Total loss without weights: + # Total = ((3 - 2)^2 + (6 - 4)^2) + + # ((9 - 6)^2 + (12 - 8)^2) + # = 30, + # Count = 2 + 2 + # Result = 7.5 + + wmse = "mean_squared_error_2" + + self.expected_fit_result_with_weights = { + "mean_squared_error": [7.5, 7.5], + wmse: [9.286, 9.286], + "loss": [32.5, 32.5], + } + + self.expected_fit_result = { + "mean_squared_error": [7.5, 7.5], + wmse: [7.5, 7.5], + "loss": [7.5, 7.5], + } + + # In the order: 'loss', 'mean_squared_error', 'mean_squared_error_2' + self.expected_batch_result_with_weights = [32.5, 7.5, 9.286] + self.expected_batch_result = [7.5, 7.5, 7.5] + + def test_fit(self): + model = self._get_model() + + history = model.fit( + self.x, self.y, batch_size=2, epochs=2, shuffle=False + ) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_fit_with_sample_weight(self): + model = self._get_model() + history = model.fit( + self.x, + self.y, + sample_weight=self.sample_weight, + batch_size=2, + epochs=2, + shuffle=False, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_fit_with_class_weight(self): + model = self._get_model() + history = model.fit( + self.x, + self.y, + class_weight=self.class_weight, + batch_size=2, + epochs=2, + shuffle=False, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_eval(self): + model = self._get_model() + eval_result = model.evaluate(self.x, self.y, batch_size=2) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + def test_eval_with_sample_weight(self): + model = self._get_model() + eval_result = model.evaluate( + self.x, self.y, batch_size=2, sample_weight=self.sample_weight + ) + 
self.assertAllClose( + eval_result, self.expected_batch_result_with_weights, 1e-3 + ) + + # Verify that metric value is same with arbitrary weights and batch + # size. + x = np.random.random((50, 1)) + y = np.random.random((50, 1)) + w = np.random.random((50,)) + mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[1] + mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[1] + self.assertAllClose(mse1, mse2, 1e-3) + + def test_train_on_batch(self): + model = self._get_model() + result = model.train_on_batch(self.x, self.y) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + def test_train_on_batch_with_sample_weight(self): + model = self._get_model() + result = model.train_on_batch( + self.x, self.y, sample_weight=self.sample_weight + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights, 1e-3 + ) + + def test_train_on_batch_with_class_weight(self): + model = self._get_model() + result = model.train_on_batch( + self.x, self.y, class_weight=self.class_weight + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights, 1e-3 + ) + + def test_test_on_batch(self): + model = self._get_model() + result = model.test_on_batch(self.x, self.y) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + def test_test_on_batch_with_sample_weight(self): + model = self._get_model() + result = model.test_on_batch( + self.x, self.y, sample_weight=self.sample_weight + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights, 1e-3 + ) + + def test_fit_generator(self): + model = self._get_model() + history = model.fit_generator( + self._custom_generator(), steps_per_epoch=2, epochs=2 + ) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_fit_generator_with_sample_weight(self): + model = self._get_model() + history = model.fit_generator( + self._custom_generator(sample_weight=self.sample_weight), + steps_per_epoch=2, + epochs=2, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_fit_generator_with_class_weight(self): + model = self._get_model() + history = model.fit_generator( + self._custom_generator(), + steps_per_epoch=2, + epochs=2, + class_weight=self.class_weight, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + def test_eval_generator(self): + model = self._get_model() + eval_result = model.evaluate_generator( + self._custom_generator(), steps=2 + ) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + def test_eval_generator_with_sample_weight(self): + model = self._get_model() + eval_result = model.evaluate_generator( + self._custom_generator(sample_weight=self.sample_weight), steps=2 + ) + self.assertAllClose( + eval_result, self.expected_batch_result_with_weights, 1e-3 + ) + + +@test_combinations.run_with_all_model_types(exclude_models=["sequential"]) @test_combinations.run_all_keras_modes(always_skip_v1=True) -@parameterized.parameters([ - losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE, - losses_utils.ReductionV2.AUTO, - losses_utils.ReductionV2.SUM -]) +@parameterized.parameters( + [ + losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE, + losses_utils.ReductionV2.AUTO, + losses_utils.ReductionV2.SUM, + ] +) class TestOutputLossMetrics(test_combinations.TestCase): - - def _get_compiled_multi_io_model(self, loss): - model = get_multi_io_model() - 
model.compile( - optimizer='rmsprop', - loss=loss, - run_eagerly=test_utils.should_run_eagerly()) - return model - - def setUp(self): - super(TestOutputLossMetrics, self).setUp() - self.x = np.asarray([[1.], [2.], [3.], [4.], [5.]]) - self.y1 = np.asarray([[2.], [4.], [6.], [8.], [10.]]) - self.y2 = np.asarray([[1.], [2.], [3.], [4.], [5.]]) - self.sample_weight_1 = np.asarray([2., 3., 4., 5., 6.]) - self.sample_weight_2 = np.asarray([3.5, 2.5, 1.5, 0.5, 3.]) - - # y_true_1 = [[2.], [4.], [6.], [8.], [10.]] - # y_pred_1 = [[3.], [6.], [9.], [12.], [15.]] - # y_true_2 = [[1.], [2.], [3.], [4.], [5.]] - # y_pred_2 = [[3.], [6.], [9.], [12.], [15.]] - - # Loss `output_1`: - # Per-sample weighted losses - # Batch 1 = [(3 - 2)^2 * 2, (6 - 4)^2 * 3)] = [2, 12] - # Batch 2 = [((9 - 6)^2 * 4, (12 - 8)^2 * 5)] = [36, 80] - # Batch 3 = [(15 - 10)^2 * 6] = [150] - - # Result (reduction=SUM) = ((2 + 12)*2 + (36 + 80)*2 + 150) / 5 = 82 - # Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 280 / 5 = 56 - - # Loss `output_2`: - # Per-sample weighted losses - # Batch 1 = [(3 - 1)^2 * 3.5, (6 - 2)^2 * 2.5)] = [14, 40] - # Batch 2 = [(9 - 3)^2 * 1.5, (12 - 4)^2 * 0.5)] = [54, 32] - # Batch 3 = [(15 - 5)^2 * 3] = [300] - - # Result (reduction=SUM) = ((14 + 40)*2 + (54 + 32)*2 + 300) / 5 = 116 - # Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 440 / 5 = 88 - - # When reduction is 'NONE' loss value that is passed to the optimizer will - # be vector loss but what is reported is a scalar, which is an average of - # all the values in all the batch vectors. - - # Total loss = Output_loss_1 + Output_loss_2 - - sum_over_batch_size_fit_result = { - 'loss': [144, 144], - 'output_1_loss': [56, 56], - 'output_2_loss': [88, 88], - } - - self.expected_fit_result = { - losses_utils.ReductionV2.NONE: - sum_over_batch_size_fit_result, - losses_utils.ReductionV2.SUM: { - 'loss': [198, 198], - 'output_1_loss': [82, 82], - 'output_2_loss': [116, 116], - }, - losses_utils.ReductionV2.AUTO: - sum_over_batch_size_fit_result, - losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: - sum_over_batch_size_fit_result, - } - - # In the order: 'loss', 'output_1_loss', 'output_2_loss', - self.expected_batch_result = { - losses_utils.ReductionV2.NONE: [144, 56, 88], - losses_utils.ReductionV2.SUM: [198, 82, 116], - losses_utils.ReductionV2.AUTO: [144, 56, 88], - losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: [144, 56, 88], - } - - # 2 + 12 + 36 + 80 + 150 = 280 - # 14 + 40 + 54 + 32 + 300 = 440 - self.expected_single_batch_result = [720, 280, 440] - - def test_fit(self, reduction): - model = self._get_compiled_multi_io_model( - loss=losses.MeanSquaredError(reduction=reduction)) - history = model.fit([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }, - batch_size=2, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result[reduction].items(): - self.assertAllClose(history.history[key], value) - - def test_eval(self, reduction): - model = self._get_compiled_multi_io_model( - loss=losses.MeanSquaredError(reduction=reduction)) - eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], - batch_size=2, - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(eval_result, self.expected_batch_result[reduction]) - - def test_train_on_batch(self, reduction): - model = self._get_compiled_multi_io_model( - loss=losses.MeanSquaredError(reduction=reduction)) - result = 
model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - - expected_values = self.expected_batch_result[reduction] - if reduction == losses_utils.ReductionV2.SUM: - expected_values = self.expected_single_batch_result - self.assertAllClose(result, expected_values) - - def test_test_on_batch(self, reduction): - model = self._get_compiled_multi_io_model( - loss=losses.MeanSquaredError(reduction=reduction)) - result = model.test_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - expected_values = self.expected_batch_result[reduction] - if reduction == losses_utils.ReductionV2.SUM: - expected_values = self.expected_single_batch_result - self.assertAllClose(result, expected_values) - - def test_fit_generator(self, reduction): - model = self._get_compiled_multi_io_model( - loss=losses.MeanSquaredError(reduction=reduction)) - history = model.fit_generator( - custom_generator_multi_io( - sample_weights=[self.sample_weight_1, self.sample_weight_2]), - steps_per_epoch=3, - epochs=2) - for key, value in self.expected_fit_result[reduction].items(): - self.assertAllClose(history.history[key], value) - - def test_eval_generator(self, reduction): - model = self._get_compiled_multi_io_model( - loss=losses.MeanSquaredError(reduction=reduction)) - eval_result = model.evaluate_generator( - custom_generator_multi_io( - sample_weights=[self.sample_weight_1, self.sample_weight_2]), - steps=3) - self.assertAllClose(eval_result, self.expected_batch_result[reduction]) - - -if __name__ == '__main__': - tf.test.main() + def _get_compiled_multi_io_model(self, loss): + model = get_multi_io_model() + model.compile( + optimizer="rmsprop", + loss=loss, + run_eagerly=test_utils.should_run_eagerly(), + ) + return model + + def setUp(self): + super(TestOutputLossMetrics, self).setUp() + self.x = np.asarray([[1.0], [2.0], [3.0], [4.0], [5.0]]) + self.y1 = np.asarray([[2.0], [4.0], [6.0], [8.0], [10.0]]) + self.y2 = np.asarray([[1.0], [2.0], [3.0], [4.0], [5.0]]) + self.sample_weight_1 = np.asarray([2.0, 3.0, 4.0, 5.0, 6.0]) + self.sample_weight_2 = np.asarray([3.5, 2.5, 1.5, 0.5, 3.0]) + + # y_true_1 = [[2.], [4.], [6.], [8.], [10.]] + # y_pred_1 = [[3.], [6.], [9.], [12.], [15.]] + # y_true_2 = [[1.], [2.], [3.], [4.], [5.]] + # y_pred_2 = [[3.], [6.], [9.], [12.], [15.]] + + # Loss `output_1`: + # Per-sample weighted losses + # Batch 1 = [(3 - 2)^2 * 2, (6 - 4)^2 * 3)] = [2, 12] + # Batch 2 = [((9 - 6)^2 * 4, (12 - 8)^2 * 5)] = [36, 80] + # Batch 3 = [(15 - 10)^2 * 6] = [150] + + # Result (reduction=SUM) = ((2 + 12)*2 + (36 + 80)*2 + 150) / 5 = 82 + # Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 280 / 5 = 56 + + # Loss `output_2`: + # Per-sample weighted losses + # Batch 1 = [(3 - 1)^2 * 3.5, (6 - 2)^2 * 2.5)] = [14, 40] + # Batch 2 = [(9 - 3)^2 * 1.5, (12 - 4)^2 * 0.5)] = [54, 32] + # Batch 3 = [(15 - 5)^2 * 3] = [300] + + # Result (reduction=SUM) = ((14 + 40)*2 + (54 + 32)*2 + 300) / 5 = 116 + # Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 440 / 5 = 88 + + # When reduction is 'NONE' loss value that is passed to the optimizer + # will be vector loss but what is reported is a scalar, which is an + # average of all the values in all the batch vectors. 
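To make the SUM figures above concrete: with batch_size=2 over five samples, each batch reports the sum of its per-sample weighted losses, and the value fit() logs for the epoch is the batch-size-weighted average of those batch sums. A plain-Python check for `output_1`, using the per-sample losses from the comment block:

# Per-sample weighted squared errors for `output_1`, split into batches of 2:
batches = [[2.0, 12.0], [36.0, 80.0], [150.0]]
num_samples = 5

# SUM: batch value = sum over the batch; epoch value = size-weighted average.
print(sum(sum(b) * len(b) for b in batches) / num_samples)  # 82.0

# SUM_OVER_BATCH_SIZE / AUTO / NONE: plain mean over all samples.
print(sum(sum(b) for b in batches) / num_samples)           # 56.0

The same arithmetic over the `output_2` losses [14, 40], [54, 32], [300] yields 116 and 88.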
+ + # Total loss = Output_loss_1 + Output_loss_2 + + sum_over_batch_size_fit_result = { + "loss": [144, 144], + "output_1_loss": [56, 56], + "output_2_loss": [88, 88], + } + + self.expected_fit_result = { + losses_utils.ReductionV2.NONE: sum_over_batch_size_fit_result, + losses_utils.ReductionV2.SUM: { + "loss": [198, 198], + "output_1_loss": [82, 82], + "output_2_loss": [116, 116], + }, + losses_utils.ReductionV2.AUTO: sum_over_batch_size_fit_result, + losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: sum_over_batch_size_fit_result, # noqa: E501 + } + + # In the order: 'loss', 'output_1_loss', 'output_2_loss', + self.expected_batch_result = { + losses_utils.ReductionV2.NONE: [144, 56, 88], + losses_utils.ReductionV2.SUM: [198, 82, 116], + losses_utils.ReductionV2.AUTO: [144, 56, 88], + losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: [144, 56, 88], + } + + # 2 + 12 + 36 + 80 + 150 = 280 + # 14 + 40 + 54 + 32 + 300 = 440 + self.expected_single_batch_result = [720, 280, 440] + + def test_fit(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction) + ) + history = model.fit( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + batch_size=2, + epochs=2, + shuffle=False, + ) + for key, value in self.expected_fit_result[reduction].items(): + self.assertAllClose(history.history[key], value) + + def test_eval(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction) + ) + eval_result = model.evaluate( + [self.x, self.x], + [self.y1, self.y2], + batch_size=2, + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose(eval_result, self.expected_batch_result[reduction]) + + def test_train_on_batch(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction) + ) + result = model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + + expected_values = self.expected_batch_result[reduction] + if reduction == losses_utils.ReductionV2.SUM: + expected_values = self.expected_single_batch_result + self.assertAllClose(result, expected_values) + + def test_test_on_batch(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction) + ) + result = model.test_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + expected_values = self.expected_batch_result[reduction] + if reduction == losses_utils.ReductionV2.SUM: + expected_values = self.expected_single_batch_result + self.assertAllClose(result, expected_values) + + def test_fit_generator(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction) + ) + history = model.fit_generator( + custom_generator_multi_io( + sample_weights=[self.sample_weight_1, self.sample_weight_2] + ), + steps_per_epoch=3, + epochs=2, + ) + for key, value in self.expected_fit_result[reduction].items(): + self.assertAllClose(history.history[key], value) + + def test_eval_generator(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction) + ) + eval_result = model.evaluate_generator( + custom_generator_multi_io( + 
sample_weights=[self.sample_weight_1, self.sample_weight_2] + ), + steps=3, + ) + self.assertAllClose(eval_result, self.expected_batch_result[reduction]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/metrics_functional_test.py b/keras/metrics/metrics_functional_test.py index 76a3875051ff..c52a2f4cea25 100644 --- a/keras/metrics/metrics_functional_test.py +++ b/keras/metrics/metrics_functional_test.py @@ -14,137 +14,177 @@ # ============================================================================== """Tests for Keras metrics functions.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np from keras import backend -from keras.testing_infra import test_combinations from keras import metrics +from keras.testing_infra import test_combinations class KerasFunctionalMetricsTest(tf.test.TestCase, parameterized.TestCase): - - def test_metrics(self): - with self.cached_session(): - y_a = backend.variable(np.random.random((6, 7))) - y_b = backend.variable(np.random.random((6, 7))) - for metric in [metrics.binary_accuracy, metrics.categorical_accuracy]: - output = metric(y_a, y_b) - self.assertEqual(backend.eval(output).shape, (6,)) - - def test_sparse_categorical_accuracy_int(self): - with self.cached_session(): - metric = metrics.sparse_categorical_accuracy - y_true = backend.variable(np.random.randint(0, 7, (6,))) - y_pred = backend.variable(np.random.random((6, 7))) - self.assertEqual(backend.eval(metric(y_true, y_pred)).shape, (6,)) - - # Test correctness if the shape of y_true is (num_samples,) - y_true = backend.variable([1., 0., 0., 0.]) - y_pred = backend.variable( - [[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]]) - self.assertAllEqual( - backend.eval(metric(y_true, y_pred)), [0., 1., 1., 1.]) - - # Test correctness if the shape of y_true is (num_samples, 1) - y_true = backend.variable([[1.], [0.], [0.], [0.]]) - y_pred = backend.variable( - [[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]]) - self.assertAllEqual( - backend.eval(metric(y_true, y_pred)), [0., 1., 1., 1.]) - - # Test correctness if the shape of y_true is (batch_size, seq_length) and - # y_pred is (batch_size, seq_length, num_classes) - y_pred = backend.variable( - np.array([[[0.2, 0.3, 0.1], [0.1, 0.2, 0.7]], - [[0.3, 0.2, 0.1], [0.7, 0.2, 0.1]]])) - y_true = backend.variable(np.array([[1, 0], [1, 0]])) - self.assertAllEqual( - backend.eval(metric(y_true, y_pred)), [[1., 0.], [0., 1.]]) - - def test_sparse_categorical_accuracy_float(self): - with self.cached_session(): - metric = metrics.sparse_categorical_accuracy - y_true = backend.variable(np.random.random((6,))) - y_pred = backend.variable(np.random.random((6, 7))) - self.assertEqual(backend.eval(metric(y_true, y_pred)).shape, (6,)) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_sparse_categorical_accuracy_eager(self): - """Tests that ints passed in via Eager return results. See b/113504761.""" - metric = metrics.sparse_categorical_accuracy - y_true = np.arange(6).reshape([6, 1]) - y_pred = np.arange(36).reshape([6, 6]) - self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.]) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_sparse_categorical_accuracy_float_eager(self): - """Tests that floats passed in via Eager return results. 
See b/113504761.""" - metric = metrics.sparse_categorical_accuracy - y_true = np.arange(6, dtype=np.float32).reshape([6, 1]) - y_pred = np.arange(36).reshape([6, 6]) - self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.]) - - def test_sparse_top_k_categorical_accuracy(self): - with self.cached_session(): - # Test correctness if the shape of y_true is (num_samples, 1) - y_pred = backend.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) - y_true = backend.variable(np.array([[1], [0]])) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3)) - self.assertEqual(np.mean(result), 1) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2)) - self.assertEqual(np.mean(result), 0.5) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1)) - self.assertEqual(np.mean(result), 0.) - - # Test correctness if the shape of y_true is (num_samples,) - y_pred = backend.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) - y_true = backend.variable(np.array([1, 0])) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3)) - self.assertEqual(np.mean(result), 1) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2)) - self.assertEqual(np.mean(result), 0.5) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1)) - self.assertEqual(np.mean(result), 0.) - - # Test correctness if the shape of y_true is (batch_size, seq_length) and - # y_pred is (batch_size, seq_length, num_classes) - y_pred = backend.variable( - np.array([[[0.3, 0.2, 0.1], [0.1, 0.2, 0.7], [0.1, 0.2, 0.7]], - [[0.3, 0.2, 0.1], [0.1, 0.2, 0.7], [0.3, 0.2, 0.1]]])) - y_true = backend.variable(np.array([[1, 0, 0], [1, 0, 1]])) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3)) - self.assertEqual(np.mean(result), 1) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2)) - self.assertEqual(np.mean(result), 0.5) - result = backend.eval( - metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1)) - self.assertEqual(np.mean(result), 0.) - - def test_top_k_categorical_accuracy(self): - with self.cached_session(): - y_pred = backend.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) - y_true = backend.variable(np.array([[0, 1, 0], [1, 0, 0]])) - result = backend.eval( - metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)) - self.assertEqual(np.mean(result), 1) - result = backend.eval( - metrics.top_k_categorical_accuracy(y_true, y_pred, k=2)) - self.assertEqual(np.mean(result), 0.5) - result = backend.eval( - metrics.top_k_categorical_accuracy(y_true, y_pred, k=1)) - self.assertEqual(np.mean(result), 0.) 
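The 1 / 0.5 / 0 sequence asserted in the top-k tests above follows from rank membership alone: a prediction counts as correct when the true class appears among the k highest scores. A NumPy sketch reproducing the sequence for the same two rows (a re-derivation, not the Keras implementation):

import numpy as np

y_pred = np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])
true_class = np.array([1, 0])  # sparse labels for the same rows

for k in (3, 2, 1):
    topk = np.argsort(-y_pred, axis=1)[:, :k]  # indices of the k largest scores
    hits = [t in row for t, row in zip(true_class, topk)]
    print(k, np.mean(hits))  # k=3 -> 1.0, k=2 -> 0.5, k=1 -> 0.0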
- - -if __name__ == '__main__': - tf.test.main() + def test_metrics(self): + with self.cached_session(): + y_a = backend.variable(np.random.random((6, 7))) + y_b = backend.variable(np.random.random((6, 7))) + for metric in [ + metrics.binary_accuracy, + metrics.categorical_accuracy, + ]: + output = metric(y_a, y_b) + self.assertEqual(backend.eval(output).shape, (6,)) + + def test_sparse_categorical_accuracy_int(self): + with self.cached_session(): + metric = metrics.sparse_categorical_accuracy + y_true = backend.variable(np.random.randint(0, 7, (6,))) + y_pred = backend.variable(np.random.random((6, 7))) + self.assertEqual(backend.eval(metric(y_true, y_pred)).shape, (6,)) + + # Test correctness if the shape of y_true is (num_samples,) + y_true = backend.variable([1.0, 0.0, 0.0, 0.0]) + y_pred = backend.variable( + [[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]] + ) + self.assertAllEqual( + backend.eval(metric(y_true, y_pred)), [0.0, 1.0, 1.0, 1.0] + ) + + # Test correctness if the shape of y_true is (num_samples, 1) + y_true = backend.variable([[1.0], [0.0], [0.0], [0.0]]) + y_pred = backend.variable( + [[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]] + ) + self.assertAllEqual( + backend.eval(metric(y_true, y_pred)), [0.0, 1.0, 1.0, 1.0] + ) + + # Test correctness if the shape of y_true is (batch_size, + # seq_length) and y_pred is (batch_size, seq_length, num_classes) + y_pred = backend.variable( + np.array( + [ + [[0.2, 0.3, 0.1], [0.1, 0.2, 0.7]], + [[0.3, 0.2, 0.1], [0.7, 0.2, 0.1]], + ] + ) + ) + y_true = backend.variable(np.array([[1, 0], [1, 0]])) + self.assertAllEqual( + backend.eval(metric(y_true, y_pred)), [[1.0, 0.0], [0.0, 1.0]] + ) + + def test_sparse_categorical_accuracy_float(self): + with self.cached_session(): + metric = metrics.sparse_categorical_accuracy + y_true = backend.variable(np.random.random((6,))) + y_pred = backend.variable(np.random.random((6, 7))) + self.assertEqual(backend.eval(metric(y_true, y_pred)).shape, (6,)) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_sparse_categorical_accuracy_eager(self): + """Tests that ints passed in via Eager return results. See + b/113504761.""" + metric = metrics.sparse_categorical_accuracy + y_true = np.arange(6).reshape([6, 1]) + y_pred = np.arange(36).reshape([6, 6]) + self.assertAllEqual( + metric(y_true, y_pred), [0.0, 0.0, 0.0, 0.0, 0.0, 1.0] + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_sparse_categorical_accuracy_float_eager(self): + """Tests that floats passed in via Eager return results. 
See + b/113504761.""" + metric = metrics.sparse_categorical_accuracy + y_true = np.arange(6, dtype=np.float32).reshape([6, 1]) + y_pred = np.arange(36).reshape([6, 6]) + self.assertAllEqual( + metric(y_true, y_pred), [0.0, 0.0, 0.0, 0.0, 0.0, 1.0] + ) + + def test_sparse_top_k_categorical_accuracy(self): + with self.cached_session(): + # Test correctness if the shape of y_true is (num_samples, 1) + y_pred = backend.variable( + np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]) + ) + y_true = backend.variable(np.array([[1], [0]])) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3) + ) + self.assertEqual(np.mean(result), 1) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2) + ) + self.assertEqual(np.mean(result), 0.5) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1) + ) + self.assertEqual(np.mean(result), 0.0) + + # Test correctness if the shape of y_true is (num_samples,) + y_pred = backend.variable( + np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]) + ) + y_true = backend.variable(np.array([1, 0])) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3) + ) + self.assertEqual(np.mean(result), 1) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2) + ) + self.assertEqual(np.mean(result), 0.5) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1) + ) + self.assertEqual(np.mean(result), 0.0) + + # Test correctness if the shape of y_true is (batch_size, + # seq_length) and y_pred is (batch_size, seq_length, num_classes) + y_pred = backend.variable( + np.array( + [ + [[0.3, 0.2, 0.1], [0.1, 0.2, 0.7], [0.1, 0.2, 0.7]], + [[0.3, 0.2, 0.1], [0.1, 0.2, 0.7], [0.3, 0.2, 0.1]], + ] + ) + ) + y_true = backend.variable(np.array([[1, 0, 0], [1, 0, 1]])) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3) + ) + self.assertEqual(np.mean(result), 1) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2) + ) + self.assertEqual(np.mean(result), 0.5) + result = backend.eval( + metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1) + ) + self.assertEqual(np.mean(result), 0.0) + + def test_top_k_categorical_accuracy(self): + with self.cached_session(): + y_pred = backend.variable( + np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]) + ) + y_true = backend.variable(np.array([[0, 1, 0], [1, 0, 0]])) + result = backend.eval( + metrics.top_k_categorical_accuracy(y_true, y_pred, k=3) + ) + self.assertEqual(np.mean(result), 1) + result = backend.eval( + metrics.top_k_categorical_accuracy(y_true, y_pred, k=2) + ) + self.assertEqual(np.mean(result), 0.5) + result = backend.eval( + metrics.top_k_categorical_accuracy(y_true, y_pred, k=1) + ) + self.assertEqual(np.mean(result), 0.0) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/metrics_test.py b/keras/metrics/metrics_test.py deleted file mode 100644 index 2597b2e41615..000000000000 --- a/keras/metrics/metrics_test.py +++ /dev/null @@ -1,2248 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Keras metrics.""" - -import json -import math - -from keras import backend -from keras import layers -from keras import metrics -from keras import Model -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class KerasAccuracyTest(tf.test.TestCase): - - def test_accuracy(self): - acc_obj = metrics.Accuracy(name='my_acc') - - # check config - self.assertEqual(acc_obj.name, 'my_acc') - self.assertTrue(acc_obj.stateful) - self.assertEqual(len(acc_obj.variables), 2) - self.assertEqual(acc_obj.dtype, tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - update_op = acc_obj.update_state([[1], [2], [3], [4]], [[1], [2], [3], [4]]) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # Check save and restore config - a2 = metrics.Accuracy.from_config(acc_obj.get_config()) - self.assertEqual(a2.name, 'my_acc') - self.assertTrue(a2.stateful) - self.assertEqual(len(a2.variables), 2) - self.assertEqual(a2.dtype, tf.float32) - - # check with sample_weight - result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.96, 2) # 4.5/4.7 - - def test_accuracy_ragged(self): - acc_obj = metrics.Accuracy(name='my_acc') - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - rt1 = tf.ragged.constant([[1], [2], [3], [4]]) - rt2 = tf.ragged.constant([[1], [2], [3], [4]]) - update_op = acc_obj.update_state(rt1, rt2) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # check with sample_weight - rt1 = tf.ragged.constant([[2], [1]]) - rt2 = tf.ragged.constant([[2], [0]]) - sw_ragged = tf.ragged.constant([[0.5], [0.2]]) - result_t = acc_obj(rt1, rt2, sample_weight=sw_ragged) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.96, 2) # 4.5/4.7 - - def test_binary_accuracy(self): - acc_obj = metrics.BinaryAccuracy(name='my_acc') - - # check config - self.assertEqual(acc_obj.name, 'my_acc') - self.assertTrue(acc_obj.stateful) - self.assertEqual(len(acc_obj.variables), 2) - self.assertEqual(acc_obj.dtype, tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - update_op = acc_obj.update_state([[1], [0]], [[1], [0]]) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # check y_pred squeeze - update_op = acc_obj.update_state([[1], [1]], [[[1]], [[0]]]) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertAlmostEqual(result, 0.75, 2) # 3/4 - - # check y_true squeeze - result_t = acc_obj([[[1]], [[1]]], [[1], [0]]) - result = 
self.evaluate(result_t) - self.assertAlmostEqual(result, 0.67, 2) # 4/6 - - # check with sample_weight - result_t = acc_obj([[1], [1]], [[1], [0]], [[0.5], [0.2]]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.67, 2) # 4.5/6.7 - - def test_binary_accuracy_ragged(self): - acc_obj = metrics.BinaryAccuracy(name='my_acc') - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - rt1 = tf.ragged.constant([[1], [0]]) - rt2 = tf.ragged.constant([[1], [0]]) - update_op = acc_obj.update_state(rt1, rt2) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # check y_true squeeze only supported for dense tensors and is - # not supported by ragged tensor (different ranks). --> error - rt1 = tf.ragged.constant([[[1], [1]]]) - rt2 = tf.ragged.constant([[1], [0]]) - with self.assertRaises(ValueError): - result_t = acc_obj(rt1, rt2) - result = self.evaluate(result_t) - - def test_binary_accuracy_threshold(self): - acc_obj = metrics.BinaryAccuracy(threshold=0.7) - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - result_t = acc_obj([[1], [1], [0], [0]], [[0.9], [0.6], [0.4], [0.8]]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.5, 2) - - def test_binary_accuracy_threshold_ragged(self): - acc_obj = metrics.BinaryAccuracy(threshold=0.7) - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - rt1 = tf.ragged.constant([[1], [1], [0], [0]]) - rt2 = tf.ragged.constant([[0.9], [0.6], [0.4], [0.8]]) - result_t = acc_obj(rt1, rt2) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.5, 2) - - def test_categorical_accuracy(self): - acc_obj = metrics.CategoricalAccuracy(name='my_acc') - - # check config - self.assertEqual(acc_obj.name, 'my_acc') - self.assertTrue(acc_obj.stateful) - self.assertEqual(len(acc_obj.variables), 2) - self.assertEqual(acc_obj.dtype, tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - update_op = acc_obj.update_state([[0, 0, 1], [0, 1, 0]], - [[0.1, 0.1, 0.8], [0.05, 0.95, 0]]) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # check with sample_weight - result_t = acc_obj([[0, 0, 1], [0, 1, 0]], - [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], [[0.5], [0.2]]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.93, 2) # 2.5/2.7 - - def test_categorical_accuracy_ragged(self): - acc_obj = metrics.CategoricalAccuracy(name='my_acc') - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - rt1 = tf.ragged.constant([[0, 0, 1], [0, 1, 0]]) - rt2 = tf.ragged.constant([[0.1, 0.1, 0.8], [0.05, 0.95, 0]]) - update_op = acc_obj.update_state(rt1, rt2) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # check with sample_weight - rt1 = tf.ragged.constant([[0, 0, 1], [0, 1, 0]]) - rt2 = tf.ragged.constant([[0.1, 0.1, 0.8], [0.05, 0, 0.95]]) - sample_weight = tf.ragged.constant([[0.5], [0.2]]) - with self.assertRaises(tf.errors.InvalidArgumentError): - result_t = acc_obj(rt1, rt2, sample_weight) - result = self.evaluate(result_t) - - def test_sparse_categorical_accuracy(self): - acc_obj = metrics.SparseCategoricalAccuracy(name='my_acc') - - # check config - self.assertEqual(acc_obj.name, 'my_acc') - self.assertTrue(acc_obj.stateful) 
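The `stateful` flag asserted here explains several otherwise puzzling expectations in these accuracy tests (4.5/6.7, 2.5/2.7, ...): each metric object accumulates a weighted total and a count across calls, so a weighted result folds in the earlier perfect batch run on the same object. A minimal model of that bookkeeping (RunningAccuracy is an illustrative stand-in, not a Keras class):

class RunningAccuracy:
    """Total/count accumulator mirroring how the accuracy metrics update."""

    def __init__(self):
        self.total = 0.0  # weighted count of correct predictions
        self.count = 0.0  # sum of sample weights seen so far

    def update(self, correct, weights):
        for c, w in zip(correct, weights):
            self.total += c * w
            self.count += w
        return self.total / self.count


acc = RunningAccuracy()
print(acc.update([1, 1], [1.0, 1.0]))  # 1.0: first, unweighted batch (2/2)
print(acc.update([1, 0], [0.5, 0.2]))  # 2.5/2.7 ~ 0.926, as asserted above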
- self.assertEqual(len(acc_obj.variables), 2) - self.assertEqual(acc_obj.dtype, tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - update_op = acc_obj.update_state([[2], [1]], - [[0.1, 0.1, 0.8], [0.05, 0.95, 0]]) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # check with sample_weight - result_t = acc_obj([[2], [1]], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], - [[0.5], [0.2]]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.93, 2) # 2.5/2.7 - - def test_sparse_categorical_accuracy_ragged(self): - acc_obj = metrics.SparseCategoricalAccuracy(name='my_acc') - - # verify that correct value is returned - rt1 = tf.ragged.constant([[2], [1]]) - rt2 = tf.ragged.constant([[0.1, 0.1, 0.8], [0.05, 0.95, 0]]) - - with self.assertRaises(tf.errors.InvalidArgumentError): - # sparse_categorical_accuracy is not supported for composite/ragged - # tensors. - update_op = acc_obj.update_state(rt1, rt2) - self.evaluate(update_op) - - def test_sparse_categorical_accuracy_mismatched_dims(self): - acc_obj = metrics.SparseCategoricalAccuracy(name='my_acc') - - # check config - self.assertEqual(acc_obj.name, 'my_acc') - self.assertTrue(acc_obj.stateful) - self.assertEqual(len(acc_obj.variables), 2) - self.assertEqual(acc_obj.dtype, tf.float32) - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - # verify that correct value is returned - update_op = acc_obj.update_state([2, 1], [[0.1, 0.1, 0.8], [0.05, 0.95, 0]]) - self.evaluate(update_op) - result = self.evaluate(acc_obj.result()) - self.assertEqual(result, 1) # 2/2 - - # check with sample_weight - result_t = acc_obj([2, 1], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], - [[0.5], [0.2]]) - result = self.evaluate(result_t) - self.assertAlmostEqual(result, 0.93, 2) # 2.5/2.7 - - def test_sparse_categorical_accuracy_mismatched_dims_dynamic(self): - with tf.compat.v1.get_default_graph().as_default(), self.cached_session() as sess: - acc_obj = metrics.SparseCategoricalAccuracy(name='my_acc') - self.evaluate(tf.compat.v1.variables_initializer(acc_obj.variables)) - - t = tf.compat.v1.placeholder(tf.float32) - p = tf.compat.v1.placeholder(tf.float32) - w = tf.compat.v1.placeholder(tf.float32) - - result_t = acc_obj(t, p, w) - result = sess.run( - result_t, - feed_dict=({ - t: [2, 1], - p: [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], - w: [[0.5], [0.2]] - })) - self.assertAlmostEqual(result, 0.71, 2) # 2.5/2.7 - - def test_get_acc(self): - acc_fn = metrics.get('acc') - self.assertEqual(acc_fn, metrics.accuracy) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class CosineSimilarityTest(tf.test.TestCase): - - def l2_norm(self, x, axis): - epsilon = 1e-12 - square_sum = np.sum(np.square(x), axis=axis, keepdims=True) - x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon)) - return np.multiply(x, x_inv_norm) - - def setup(self, axis=1): - self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32) - self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32) - - y_true = self.l2_norm(self.np_y_true, axis) - y_pred = self.l2_norm(self.np_y_pred, axis) - self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,)) - - self.y_true = tf.constant(self.np_y_true) - self.y_pred = tf.constant(self.np_y_pred) - - def test_config(self): - cosine_obj = metrics.CosineSimilarity( - axis=2, name='my_cos', dtype=tf.int32) - self.assertEqual(cosine_obj.name, 
'my_cos') - self.assertEqual(cosine_obj._dtype, tf.int32) - - # Check save and restore config - cosine_obj2 = metrics.CosineSimilarity.from_config(cosine_obj.get_config()) - self.assertEqual(cosine_obj2.name, 'my_cos') - self.assertEqual(cosine_obj2._dtype, tf.int32) - - def test_unweighted(self): - self.setup() - cosine_obj = metrics.CosineSimilarity() - self.evaluate(tf.compat.v1.variables_initializer(cosine_obj.variables)) - loss = cosine_obj(self.y_true, self.y_pred) - expected_loss = np.mean(self.expected_loss) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_weighted(self): - self.setup() - cosine_obj = metrics.CosineSimilarity() - self.evaluate(tf.compat.v1.variables_initializer(cosine_obj.variables)) - sample_weight = np.asarray([1.2, 3.4]) - loss = cosine_obj( - self.y_true, - self.y_pred, - sample_weight=tf.constant(sample_weight)) - expected_loss = np.sum( - self.expected_loss * sample_weight) / np.sum(sample_weight) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - def test_axis(self): - self.setup(axis=1) - cosine_obj = metrics.CosineSimilarity(axis=1) - self.evaluate(tf.compat.v1.variables_initializer(cosine_obj.variables)) - loss = cosine_obj(self.y_true, self.y_pred) - expected_loss = np.mean(self.expected_loss) - self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MeanAbsoluteErrorTest(tf.test.TestCase): - - def test_config(self): - mae_obj = metrics.MeanAbsoluteError(name='my_mae', dtype=tf.int32) - self.assertEqual(mae_obj.name, 'my_mae') - self.assertEqual(mae_obj._dtype, tf.int32) - - # Check save and restore config - mae_obj2 = metrics.MeanAbsoluteError.from_config(mae_obj.get_config()) - self.assertEqual(mae_obj2.name, 'my_mae') - self.assertEqual(mae_obj2._dtype, tf.int32) - - def test_unweighted(self): - mae_obj = metrics.MeanAbsoluteError() - self.evaluate(tf.compat.v1.variables_initializer(mae_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = mae_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = mae_obj.result() - self.assertAllClose(0.5, result, atol=1e-5) - - def test_weighted(self): - mae_obj = metrics.MeanAbsoluteError() - self.evaluate(tf.compat.v1.variables_initializer(mae_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = mae_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MeanAbsolutePercentageErrorTest(tf.test.TestCase): - - def test_config(self): - mape_obj = metrics.MeanAbsolutePercentageError( - name='my_mape', dtype=tf.int32) - self.assertEqual(mape_obj.name, 'my_mape') - self.assertEqual(mape_obj._dtype, tf.int32) - - # Check save and restore config - mape_obj2 = metrics.MeanAbsolutePercentageError.from_config( - mape_obj.get_config()) - self.assertEqual(mape_obj2.name, 'my_mape') - self.assertEqual(mape_obj2._dtype, tf.int32) - - def test_unweighted(self): - mape_obj = metrics.MeanAbsolutePercentageError() - 
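The 35e7 this test asserts just below looks like a typo but is real: MAPE guards against division by zero by clipping |y_true| to backend.epsilon() (assumed here to be the 1e-7 default), so each of the seven positions where y_true is 0 and y_pred is 1 contributes 1e7 to the per-element relative error. A NumPy reconstruction under that assumption:

import numpy as np

eps = 1e-7  # backend.epsilon() default
y_true = np.array([(0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)], dtype=float)
y_pred = np.array([(0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)], dtype=float)

diff = np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps)
print(100.0 * diff.mean())  # ~3.5e8, i.e. the 35e7 asserted below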
self.evaluate(tf.compat.v1.variables_initializer(mape_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = mape_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = mape_obj.result() - self.assertAllClose(35e7, result, atol=1e-5) - - def test_weighted(self): - mape_obj = metrics.MeanAbsolutePercentageError() - self.evaluate(tf.compat.v1.variables_initializer(mape_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = mape_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(40e7, self.evaluate(result), atol=1e-5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MeanSquaredErrorTest(tf.test.TestCase): - - def test_config(self): - mse_obj = metrics.MeanSquaredError(name='my_mse', dtype=tf.int32) - self.assertEqual(mse_obj.name, 'my_mse') - self.assertEqual(mse_obj._dtype, tf.int32) - - # Check save and restore config - mse_obj2 = metrics.MeanSquaredError.from_config(mse_obj.get_config()) - self.assertEqual(mse_obj2.name, 'my_mse') - self.assertEqual(mse_obj2._dtype, tf.int32) - - def test_unweighted(self): - mse_obj = metrics.MeanSquaredError() - self.evaluate(tf.compat.v1.variables_initializer(mse_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = mse_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = mse_obj.result() - self.assertAllClose(0.5, result, atol=1e-5) - - def test_weighted(self): - mse_obj = metrics.MeanSquaredError() - self.evaluate(tf.compat.v1.variables_initializer(mse_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = mse_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MeanSquaredLogarithmicErrorTest(tf.test.TestCase): - - def test_config(self): - msle_obj = metrics.MeanSquaredLogarithmicError( - name='my_msle', dtype=tf.int32) - self.assertEqual(msle_obj.name, 'my_msle') - self.assertEqual(msle_obj._dtype, tf.int32) - - # Check save and restore config - msle_obj2 = metrics.MeanSquaredLogarithmicError.from_config( - msle_obj.get_config()) - self.assertEqual(msle_obj2.name, 'my_msle') - self.assertEqual(msle_obj2._dtype, tf.int32) - - def test_unweighted(self): - msle_obj = metrics.MeanSquaredLogarithmicError() - self.evaluate(tf.compat.v1.variables_initializer(msle_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = msle_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = msle_obj.result() - self.assertAllClose(0.24022, result, atol=1e-5) - - def test_weighted(self): - msle_obj = 
metrics.MeanSquaredLogarithmicError() - self.evaluate(tf.compat.v1.variables_initializer(msle_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = msle_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(0.26082, self.evaluate(result), atol=1e-5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class HingeTest(tf.test.TestCase): - - def test_config(self): - hinge_obj = metrics.Hinge(name='hinge', dtype=tf.int32) - self.assertEqual(hinge_obj.name, 'hinge') - self.assertEqual(hinge_obj._dtype, tf.int32) - - # Check save and restore config - hinge_obj2 = metrics.Hinge.from_config(hinge_obj.get_config()) - self.assertEqual(hinge_obj2.name, 'hinge') - self.assertEqual(hinge_obj2._dtype, tf.int32) - - def test_unweighted(self): - hinge_obj = metrics.Hinge() - self.evaluate(tf.compat.v1.variables_initializer(hinge_obj.variables)) - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], - [-0.25, -1., 0.5, 0.6]]) - - # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] - # = [0.6, 0.4125] - # reduced metric = (0.6 + 0.4125) / 2 - - update_op = hinge_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = hinge_obj.result() - self.assertAllClose(0.506, result, atol=1e-3) - - def test_weighted(self): - hinge_obj = metrics.Hinge() - self.evaluate(tf.compat.v1.variables_initializer(hinge_obj.variables)) - y_true = tf.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], - [-0.25, -1., 0.5, 0.6]]) - sample_weight = tf.constant([1.5, 2.]) - - # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4] - # = [0.6, 0.4125] - # weighted metric = [0.6 * 1.5, 0.4125 * 2] - # reduced metric = (0.6 * 1.5 + 0.4125 * 2) / (1.5 + 2) - - result = hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(0.493, self.evaluate(result), atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class SquaredHingeTest(tf.test.TestCase): - - def test_config(self): - sq_hinge_obj = metrics.SquaredHinge(name='sq_hinge', dtype=tf.int32) - self.assertEqual(sq_hinge_obj.name, 'sq_hinge') - self.assertEqual(sq_hinge_obj._dtype, tf.int32) - - # Check save and restore config - sq_hinge_obj2 = metrics.SquaredHinge.from_config(sq_hinge_obj.get_config()) - self.assertEqual(sq_hinge_obj2.name, 'sq_hinge') - self.assertEqual(sq_hinge_obj2._dtype, tf.int32) - - def test_unweighted(self): - sq_hinge_obj = metrics.SquaredHinge() - self.evaluate(tf.compat.v1.variables_initializer(sq_hinge_obj.variables)) - y_true = tf.constant([[0, 1, 0, 1], [0, 0, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], - [-0.25, -1., 0.5, 0.6]]) - - # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]] - # y_true * y_pred = 
[[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]] - # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], - # [0.5625, 0, 0.25, 0.16]] - # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] - # = [0.485, 0.2431] - # reduced metric = (0.485 + 0.2431) / 2 - - update_op = sq_hinge_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = sq_hinge_obj.result() - self.assertAllClose(0.364, result, atol=1e-3) - - def test_weighted(self): - sq_hinge_obj = metrics.SquaredHinge() - self.evaluate(tf.compat.v1.variables_initializer(sq_hinge_obj.variables)) - y_true = tf.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]]) - y_pred = tf.constant([[-0.3, 0.2, -0.1, 1.6], - [-0.25, -1., 0.5, 0.6]]) - sample_weight = tf.constant([1.5, 2.]) - - # metric = max(0, 1-y_true * y_pred), where y_true is -1/1 - - # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]] - # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]] - # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]] - # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0], - # [0.5625, 0, 0.25, 0.16]] - # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4] - # = [0.485, 0.2431] - # weighted metric = [0.485 * 1.5, 0.2431 * 2] - # reduced metric = (0.485 * 1.5 + 0.2431 * 2) / (1.5 + 2) - - result = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(0.347, self.evaluate(result), atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class CategoricalHingeTest(tf.test.TestCase): - - def test_config(self): - cat_hinge_obj = metrics.CategoricalHinge( - name='cat_hinge', dtype=tf.int32) - self.assertEqual(cat_hinge_obj.name, 'cat_hinge') - self.assertEqual(cat_hinge_obj._dtype, tf.int32) - - # Check save and restore config - cat_hinge_obj2 = metrics.CategoricalHinge.from_config( - cat_hinge_obj.get_config()) - self.assertEqual(cat_hinge_obj2.name, 'cat_hinge') - self.assertEqual(cat_hinge_obj2._dtype, tf.int32) - - def test_unweighted(self): - cat_hinge_obj = metrics.CategoricalHinge() - self.evaluate(tf.compat.v1.variables_initializer(cat_hinge_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - - update_op = cat_hinge_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = cat_hinge_obj.result() - self.assertAllClose(0.5, result, atol=1e-5) - - def test_weighted(self): - cat_hinge_obj = metrics.CategoricalHinge() - self.evaluate(tf.compat.v1.variables_initializer(cat_hinge_obj.variables)) - y_true = tf.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), - (1, 1, 1, 1, 0), (0, 0, 0, 0, 1))) - y_pred = tf.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), - (0, 1, 0, 1, 0), (1, 1, 1, 1, 1))) - sample_weight = tf.constant((1., 1.5, 2., 2.5)) - result = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(0.5, self.evaluate(result), atol=1e-5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class RootMeanSquaredErrorTest(tf.test.TestCase): - - def test_config(self): - rmse_obj = metrics.RootMeanSquaredError(name='rmse', dtype=tf.int32) - self.assertEqual(rmse_obj.name, 'rmse') - self.assertEqual(rmse_obj._dtype, tf.int32) - - 
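Stepping back to the hinge comments above: all four asserted values reduce to two NumPy expressions over the same max(0, 1 - y_true * y_pred) matrix. A quick re-derivation (plain NumPy, outside the Keras code path):

import numpy as np

y_true = np.array([[-1, 1, -1, 1], [-1, -1, 1, 1]], dtype=float)
y_pred = np.array([[-0.3, 0.2, -0.1, 1.6], [-0.25, -1.0, 0.5, 0.6]])
w = np.array([1.5, 2.0])

margin = np.maximum(0.0, 1.0 - y_true * y_pred)
hinge = margin.mean(axis=1)             # [0.6, 0.4125]
sq_hinge = (margin ** 2).mean(axis=1)   # [0.485, 0.243125]

print(hinge.mean(), (hinge * w).sum() / w.sum())        # ~0.506, ~0.493
print(sq_hinge.mean(), (sq_hinge * w).sum() / w.sum())  # ~0.364, ~0.347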
rmse_obj2 = metrics.RootMeanSquaredError.from_config(rmse_obj.get_config()) - self.assertEqual(rmse_obj2.name, 'rmse') - self.assertEqual(rmse_obj2._dtype, tf.int32) - - def test_unweighted(self): - rmse_obj = metrics.RootMeanSquaredError() - self.evaluate(tf.compat.v1.variables_initializer(rmse_obj.variables)) - y_true = tf.constant((2, 4, 6)) - y_pred = tf.constant((1, 3, 2)) - - update_op = rmse_obj.update_state(y_true, y_pred) - self.evaluate(update_op) - result = rmse_obj.result() - # error = [-1, -1, -4], square(error) = [1, 1, 16], mean = 18/3 = 6 - self.assertAllClose(math.sqrt(6), result, atol=1e-3) - - def test_weighted(self): - rmse_obj = metrics.RootMeanSquaredError() - self.evaluate(tf.compat.v1.variables_initializer(rmse_obj.variables)) - y_true = tf.constant((2, 4, 6, 8)) - y_pred = tf.constant((1, 3, 2, 3)) - sample_weight = tf.constant((0, 1, 0, 1)) - result = rmse_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(math.sqrt(13), self.evaluate(result), atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class TopKCategoricalAccuracyTest(tf.test.TestCase): - - def test_config(self): - a_obj = metrics.TopKCategoricalAccuracy(name='topkca', dtype=tf.int32) - self.assertEqual(a_obj.name, 'topkca') - self.assertEqual(a_obj._dtype, tf.int32) - - a_obj2 = metrics.TopKCategoricalAccuracy.from_config(a_obj.get_config()) - self.assertEqual(a_obj2.name, 'topkca') - self.assertEqual(a_obj2._dtype, tf.int32) - - def test_correctness(self): - a_obj = metrics.TopKCategoricalAccuracy() - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - y_true = tf.constant([[0, 0, 1], [0, 1, 0]]) - y_pred = tf.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) - - result = a_obj(y_true, y_pred) - self.assertEqual(1, self.evaluate(result)) # both the samples match - - # With `k` < 5. - a_obj = metrics.TopKCategoricalAccuracy(k=1) - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - result = a_obj(y_true, y_pred) - self.assertEqual(0.5, self.evaluate(result)) # only sample #2 matches - - # With `k` > 5. - y_true = tf.constant([[0, 0, 1, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0, 0]]) - y_pred = tf.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4], - [0.05, 0.95, 0, 0, 0, 0, 0]]) - a_obj = metrics.TopKCategoricalAccuracy(k=6) - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - result = a_obj(y_true, y_pred) - self.assertEqual(0.5, self.evaluate(result)) # only 1 sample matches. 
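For the RootMeanSquaredError expectations a little further up, the weighted variant averages the squared errors by total weight before taking the root, so the 0/1 weights simply select samples two and four:

import numpy as np

sq3 = (np.array([1.0, 3.0, 2.0]) - np.array([2.0, 4.0, 6.0])) ** 2  # [1, 1, 16]
print(np.sqrt(sq3.mean()))                 # sqrt(6) ~ 2.449 (unweighted case)

sq4 = (np.array([1.0, 3.0, 2.0, 3.0]) - np.array([2.0, 4.0, 6.0, 8.0])) ** 2
w = np.array([0.0, 1.0, 0.0, 1.0])
print(np.sqrt((sq4 * w).sum() / w.sum()))  # sqrt((1 + 25) / 2) = sqrt(13)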
- - def test_weighted(self): - a_obj = metrics.TopKCategoricalAccuracy(k=2) - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - y_true = tf.constant([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) - y_pred = tf.constant([[0, 0.9, 0.1], [0, 0.9, 0.1], [0, 0.9, 0.1]]) - sample_weight = tf.constant((1.0, 0.0, 1.0)) - result = a_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(1.0, self.evaluate(result), atol=1e-5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class SparseTopKCategoricalAccuracyTest(tf.test.TestCase): - - def test_config(self): - a_obj = metrics.SparseTopKCategoricalAccuracy( - name='stopkca', dtype=tf.int32) - self.assertEqual(a_obj.name, 'stopkca') - self.assertEqual(a_obj._dtype, tf.int32) - - a_obj2 = metrics.SparseTopKCategoricalAccuracy.from_config( - a_obj.get_config()) - self.assertEqual(a_obj2.name, 'stopkca') - self.assertEqual(a_obj2._dtype, tf.int32) - - def test_correctness(self): - a_obj = metrics.SparseTopKCategoricalAccuracy() - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - y_true = tf.constant([2, 1]) - y_pred = tf.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) - - result = a_obj(y_true, y_pred) - self.assertEqual(1, self.evaluate(result)) # both the samples match - - # With `k` < 5. - a_obj = metrics.SparseTopKCategoricalAccuracy(k=1) - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - result = a_obj(y_true, y_pred) - self.assertEqual(0.5, self.evaluate(result)) # only sample #2 matches - - # With `k` > 5. - y_pred = tf.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4], - [0.05, 0.95, 0, 0, 0, 0, 0]]) - a_obj = metrics.SparseTopKCategoricalAccuracy(k=6) - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - result = a_obj(y_true, y_pred) - self.assertEqual(0.5, self.evaluate(result)) # only 1 sample matches. - - def test_weighted(self): - a_obj = metrics.SparseTopKCategoricalAccuracy(k=2) - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - y_true = tf.constant([1, 0, 2]) - y_pred = tf.constant([[0, 0.9, 0.1], [0, 0.9, 0.1], [0, 0.9, 0.1]]) - sample_weight = tf.constant((1.0, 0.0, 1.0)) - result = a_obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(1.0, self.evaluate(result), atol=1e-5) - - def test_sparse_top_k_categorical_accuracy_mismatched_dims_dynamic(self): - - if not tf.compat.v1.executing_eagerly(): - # Test will fail in v1 graph mode since the metric is not a normal layer. - # It will aggregate the output by batch dim, which failed on v1 code. 
- self.skipTest('v2 eager mode only') - - class AccLayer(layers.Layer): - - def build(self, _): - self.acc = metrics.SparseTopKCategoricalAccuracy(k=1) - - def call(self, y_true, y_pred): - return self.acc(y_true, y_pred) - - label = layers.Input(shape=[1]) - predict = layers.Input(shape=[3]) - metric_result = AccLayer()(label, predict) - model = Model([label, predict], metric_result) - - result = model.predict([tf.constant([[2], [1]]), - tf.constant([[0.1, 0.1, 0.8], [0.05, 0, 0.95]])], - steps=1) - self.assertAllClose(result, 0.5) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class LogCoshErrorTest(tf.test.TestCase): - - def setup(self): - y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3)) - y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) - - self.batch_size = 6 - error = y_pred - y_true - self.expected_results = np.log((np.exp(error) + np.exp(-error)) / 2) - - self.y_pred = tf.constant(y_pred, dtype=tf.float32) - self.y_true = tf.constant(y_true) - - def test_config(self): - logcosh_obj = metrics.LogCoshError(name='logcosh', dtype=tf.int32) - self.assertEqual(logcosh_obj.name, 'logcosh') - self.assertEqual(logcosh_obj._dtype, tf.int32) - - def test_unweighted(self): - self.setup() - logcosh_obj = metrics.LogCoshError() - self.evaluate(tf.compat.v1.variables_initializer(logcosh_obj.variables)) - - update_op = logcosh_obj.update_state(self.y_true, self.y_pred) - self.evaluate(update_op) - result = logcosh_obj.result() - expected_result = np.sum(self.expected_results) / self.batch_size - self.assertAllClose(result, expected_result, atol=1e-3) - - def test_weighted(self): - self.setup() - logcosh_obj = metrics.LogCoshError() - self.evaluate(tf.compat.v1.variables_initializer(logcosh_obj.variables)) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - result = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - - sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)) - expected_result = np.multiply(self.expected_results, sample_weight) - expected_result = np.sum(expected_result) / np.sum(sample_weight) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class PoissonTest(tf.test.TestCase): - - def setup(self): - y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3)) - y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) - - self.batch_size = 6 - self.expected_results = y_pred - np.multiply(y_true, np.log(y_pred)) - - self.y_pred = tf.constant(y_pred, dtype=tf.float32) - self.y_true = tf.constant(y_true) - - def test_config(self): - poisson_obj = metrics.Poisson(name='poisson', dtype=tf.int32) - self.assertEqual(poisson_obj.name, 'poisson') - self.assertEqual(poisson_obj._dtype, tf.int32) - - poisson_obj2 = metrics.Poisson.from_config(poisson_obj.get_config()) - self.assertEqual(poisson_obj2.name, 'poisson') - self.assertEqual(poisson_obj2._dtype, tf.int32) - - def test_unweighted(self): - self.setup() - poisson_obj = metrics.Poisson() - self.evaluate(tf.compat.v1.variables_initializer(poisson_obj.variables)) - - update_op = poisson_obj.update_state(self.y_true, self.y_pred) - self.evaluate(update_op) - result = poisson_obj.result() - expected_result = np.sum(self.expected_results) / self.batch_size - self.assertAllClose(result, expected_result, atol=1e-3) - - def test_weighted(self): - self.setup() - poisson_obj = metrics.Poisson() - 
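Both reference formulas in the LogCoshError and Poisson classes here are one-liners, and the tests spell them out in their numerically naive forms. A NumPy check of the two unweighted expectations (note the two tests use different y_pred arrays):

import numpy as np

y_true = np.array([4.0, 8.0, 12.0, 8.0, 1.0, 3.0])

err = np.array([1.0, 9.0, 2.0, -5.0, -2.0, 6.0]) - y_true  # logcosh y_pred
print(np.log((np.exp(err) + np.exp(-err)) / 2.0).mean())   # logcosh expected_result

y_pred = np.array([1.0, 9.0, 2.0, 5.0, 2.0, 6.0])          # poisson y_pred (positive)
print((y_pred - y_true * np.log(y_pred)).mean())           # poisson expected_result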
self.evaluate(tf.compat.v1.variables_initializer(poisson_obj.variables)) - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - - result = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)) - expected_result = np.multiply(self.expected_results, sample_weight) - expected_result = np.sum(expected_result) / np.sum(sample_weight) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class KLDivergenceTest(tf.test.TestCase): - - def setup(self): - y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3)) - y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3)) - - self.batch_size = 2 - self.expected_results = np.multiply(y_true, np.log(y_true / y_pred)) - - self.y_pred = tf.constant(y_pred, dtype=tf.float32) - self.y_true = tf.constant(y_true) - - def test_config(self): - k_obj = metrics.KLDivergence(name='kld', dtype=tf.int32) - self.assertEqual(k_obj.name, 'kld') - self.assertEqual(k_obj._dtype, tf.int32) - - k_obj2 = metrics.KLDivergence.from_config(k_obj.get_config()) - self.assertEqual(k_obj2.name, 'kld') - self.assertEqual(k_obj2._dtype, tf.int32) - - def test_unweighted(self): - self.setup() - k_obj = metrics.KLDivergence() - self.evaluate(tf.compat.v1.variables_initializer(k_obj.variables)) - - update_op = k_obj.update_state(self.y_true, self.y_pred) - self.evaluate(update_op) - result = k_obj.result() - expected_result = np.sum(self.expected_results) / self.batch_size - self.assertAllClose(result, expected_result, atol=1e-3) - - def test_weighted(self): - self.setup() - k_obj = metrics.KLDivergence() - self.evaluate(tf.compat.v1.variables_initializer(k_obj.variables)) - - sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) - result = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) - - sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)) - expected_result = np.multiply(self.expected_results, sample_weight) - expected_result = np.sum(expected_result) / (1.2 + 3.4) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MeanRelativeErrorTest(tf.test.TestCase): - - def test_config(self): - normalizer = tf.constant([1, 3], dtype=tf.float32) - mre_obj = metrics.MeanRelativeError(normalizer=normalizer, name='mre') - self.assertEqual(mre_obj.name, 'mre') - self.assertArrayNear(self.evaluate(mre_obj.normalizer), [1, 3], 1e-1) - - mre_obj2 = metrics.MeanRelativeError.from_config(mre_obj.get_config()) - self.assertEqual(mre_obj2.name, 'mre') - self.assertArrayNear(self.evaluate(mre_obj2.normalizer), [1, 3], 1e-1) - - def test_unweighted(self): - np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32) - np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32) - expected_error = np.mean( - np.divide(np.absolute(np_y_pred - np_y_true), np_y_true)) - - y_pred = tf.constant(np_y_pred, shape=(1, 4), dtype=tf.float32) - y_true = tf.constant(np_y_true, shape=(1, 4)) - - mre_obj = metrics.MeanRelativeError(normalizer=y_true) - self.evaluate(tf.compat.v1.variables_initializer(mre_obj.variables)) - - result = mre_obj(y_true, y_pred) - self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3) - - def test_weighted(self): - np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32) - np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32) - 
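In the weighted MeanRelativeError case being set up here, the sample weights defined just below sum to 1, so the expected value is simply the dot product of the per-sample relative errors with the weights:

import numpy as np

y_pred = np.array([2.0, 4.0, 6.0, 8.0])
y_true = np.array([1.0, 3.0, 2.0, 3.0])  # doubles as the normalizer
w = np.array([0.2, 0.3, 0.5, 0.0])

rel = np.abs(y_pred - y_true) / y_true   # [1, 1/3, 2, 5/3]
print((rel * w).sum())                   # 1.3; dividing by w.sum() == 1 changes nothing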
sample_weight = np.asarray([0.2, 0.3, 0.5, 0], dtype=np.float32) - rel_errors = np.divide(np.absolute(np_y_pred - np_y_true), np_y_true) - expected_error = np.sum(rel_errors * sample_weight) - - y_pred = tf.constant(np_y_pred, dtype=tf.float32) - y_true = tf.constant(np_y_true) - - mre_obj = metrics.MeanRelativeError(normalizer=y_true) - self.evaluate(tf.compat.v1.variables_initializer(mre_obj.variables)) - - result = mre_obj( - y_true, y_pred, sample_weight=tf.constant(sample_weight)) - self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3) - - def test_zero_normalizer(self): - y_pred = tf.constant([2, 4], dtype=tf.float32) - y_true = tf.constant([1, 3]) - - mre_obj = metrics.MeanRelativeError(normalizer=tf.zeros_like(y_true)) - self.evaluate(tf.compat.v1.variables_initializer(mre_obj.variables)) - - result = mre_obj(y_true, y_pred) - self.assertEqual(self.evaluate(result), 0) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class IoUTest(tf.test.TestCase): - - def test_config(self): - obj = metrics.IoU( - num_classes=2, target_class_ids=[1, 0], name='iou_class_1_0') - self.assertEqual(obj.name, 'iou_class_1_0') - self.assertEqual(obj.num_classes, 2) - self.assertEqual(obj.target_class_ids, [1, 0]) - - obj2 = metrics.IoU.from_config(obj.get_config()) - self.assertEqual(obj2.name, 'iou_class_1_0') - self.assertEqual(obj2.num_classes, 2) - self.assertEqual(obj2.target_class_ids, [1, 0]) - - def test_unweighted(self): - y_pred = [0, 1, 0, 1] - y_true = [0, 0, 1, 1] - - obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - - result = obj(y_true, y_pred) - - # cm = [[1, 1], - # [1, 1]] - # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_weighted(self): - y_pred = tf.constant([0, 1, 0, 1], dtype=tf.float32) - y_true = tf.constant([0, 0, 1, 1]) - sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1]) - - obj = metrics.IoU(num_classes=2, target_class_ids=[1, 0]) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - - result = obj(y_true, y_pred, sample_weight=sample_weight) - - # cm = [[0.2, 0.3], - # [0.4, 0.1]] - # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.1 / (0.4 + 0.5 - 0.1) + 0.2 / (0.6 + 0.5 - 0.2)) / 2 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_multi_dim_input(self): - y_pred = tf.constant([[0, 1], [0, 1]], dtype=tf.float32) - y_true = tf.constant([[0, 0], [1, 1]]) - sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]]) - - obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - - result = obj(y_true, y_pred, sample_weight=sample_weight) - - # cm = [[0.2, 0.3], - # [0.4, 0.1]] - # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_zero_valid_entries(self): - obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) - 
self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - self.assertAllClose( - self.evaluate(obj.result()), 0, atol=1e-3) - - def test_zero_and_non_zero_entries(self): - y_pred = tf.constant([1], dtype=tf.float32) - y_true = tf.constant([1]) - - obj = metrics.IoU(num_classes=2, target_class_ids=[0, 1]) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred) - - # cm = [[0, 0], - # [0, 1]] - # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (1 / (1 + 1 - 1)) / 1 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class BinaryIoUTest(tf.test.TestCase): - - def test_config(self): - obj = metrics.BinaryIoU( - target_class_ids=[1, 0], threshold=0.1, name='iou_class_1_0') - self.assertEqual(obj.name, 'iou_class_1_0') - self.assertAlmostEqual(obj.threshold, 0.1) - self.assertEqual(obj.target_class_ids, [1, 0]) - - obj2 = metrics.BinaryIoU.from_config(obj.get_config()) - self.assertEqual(obj.name, 'iou_class_1_0') - self.assertAlmostEqual(obj2.threshold, 0.1) - self.assertEqual(obj.target_class_ids, [1, 0]) - - def test_different_thresholds_weighted(self): - y_true = [0, 1, 0, 1] - y_pred = [0.1, 0.2, 0.4, 0.7] - - sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1]) - # with threshold = 0.3, y_pred will be converted to [0, 0, 1, 1] - # cm = [[0.2, 0.4], - # [0.3, 0.1]] - # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2 - obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.3) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - sample_weight = tf.constant([0.1, 0.2, 0.4, 0.3]) - # with threshold = 0.5, y_pred will be converted to [0, 0, 0, 1] - # cm = [[0.1+0.4, 0], - # [0.2, 0.3]] - # sum_row = [0.5, 0.5], sum_col = [0.7, 0.3], true_positives = [0.5, 0.3] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.5 / (0.5 + 0.7 - 0.5) + 0.3 / (0.5 + 0.3 - 0.3)) / 2 - obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.5) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_different_thresholds_unweighted(self): - y_true = [0, 1, 0, 1] - y_pred = [0.1, 0.2, 0.4, 0.7] - - # with threshold = 0.3, y_pred will be converted to [0, 0, 1, 1] - # cm = [[1, 1], - # [1, 1]] - # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 - obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.3) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - # with threshold = 0.5, y_pred will be converted to [0, 0, 0, 1] - # cm = [[2, 0], - # [1, 1]] - # sum_row = [2, 2], sum_col = [3, 1], true_positives = [2, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (2 / (2 + 
3 - 2) + 1 / (2 + 1 - 1)) / 2 - obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=0.5) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_multi_dim_input(self): - y_true = tf.constant([[0, 1], [0, 1]], dtype=tf.float32) - y_pred = tf.constant([[0.1, 0.7], [0.9, 0.3]]) - threshold = 0.4 # y_pred will become [[0, 1], [1, 0]] - sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]]) - # cm = [[0.2, 0.4], - # [0.1, 0.3]] - # sum_row = [0.6, 0.4], sum_col = [0.3, 0.7], true_positives = [0.2, 0.3] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.2 / (0.6 + 0.3 - 0.2) + 0.3 / (0.4 + 0.7 - 0.3)) / 2 - obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=threshold) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_zero_valid_entries(self): - obj = metrics.BinaryIoU(target_class_ids=[0, 1]) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - self.assertAllClose( - self.evaluate(obj.result()), 0, atol=1e-3) - - def test_zero_and_non_zero_entries(self): - y_pred = tf.constant([0.6], dtype=tf.float32) - threshold = 0.5 - y_true = tf.constant([1]) - - obj = metrics.BinaryIoU(target_class_ids=[0, 1], threshold=threshold) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred) - - # cm = [[0, 0], - # [0, 1]] - # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = 1 / (1 + 1 - 1) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MeanIoUTest(tf.test.TestCase): - - def test_config(self): - m_obj = metrics.MeanIoU(num_classes=2, name='mean_iou') - self.assertEqual(m_obj.name, 'mean_iou') - self.assertEqual(m_obj.num_classes, 2) - - m_obj2 = metrics.MeanIoU.from_config(m_obj.get_config()) - self.assertEqual(m_obj2.name, 'mean_iou') - self.assertEqual(m_obj2.num_classes, 2) - - def test_unweighted(self): - y_pred = [0, 1, 0, 1] - y_true = [0, 0, 1, 1] - - m_obj = metrics.MeanIoU(num_classes=2) - self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) - - result = m_obj(y_true, y_pred) - - # cm = [[1, 1], - # [1, 1]] - # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_weighted(self): - y_pred = tf.constant([0, 1, 0, 1], dtype=tf.float32) - y_true = tf.constant([0, 0, 1, 1]) - sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1]) - - m_obj = metrics.MeanIoU(num_classes=2) - self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) - - result = m_obj(y_true, y_pred, sample_weight=sample_weight) - - # cm = [[0.2, 0.3], - # [0.4, 0.1]] - # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_multi_dim_input(self): - y_pred = 
tf.constant([[0, 1], [0, 1]], dtype=tf.float32) - y_true = tf.constant([[0, 0], [1, 1]]) - sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]]) - - m_obj = metrics.MeanIoU(num_classes=2) - self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) - - result = m_obj(y_true, y_pred, sample_weight=sample_weight) - - # cm = [[0.2, 0.3], - # [0.4, 0.1]] - # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_zero_valid_entries(self): - m_obj = metrics.MeanIoU(num_classes=2) - self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) - self.assertAllClose(self.evaluate(m_obj.result()), 0, atol=1e-3) - - def test_zero_and_non_zero_entries(self): - y_pred = tf.constant([1], dtype=tf.float32) - y_true = tf.constant([1]) - - m_obj = metrics.MeanIoU(num_classes=2) - self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) - result = m_obj(y_true, y_pred) - - # cm = [[0, 0], - # [0, 1]] - # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0 + 1 / (1 + 1 - 1)) / 1 - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class OneHotIoUTest(tf.test.TestCase): - - def test_unweighted(self): - y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) - # y_true will be converted to [2, 0, 1, 0] - y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], - [0.1, 0.4, 0.5]]) - # y_pred will be converted to [2, 2, 0, 2] - # cm = [[0, 0, 2], - # [1, 0, 0], - # [0, 0, 1] - # sum_row = [1, 0, 3], sum_col = [2, 1, 1], true_positives = [0, 0, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0 / (1 + 2 - 0) + 1 / (3 + 1 - 1)) / 2 - obj = metrics.OneHotIoU(num_classes=3, target_class_ids=[0, 2]) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_weighted(self): - y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) - # y_true will be converted to [2, 0, 1, 0] - y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], - [0.1, 0.4, 0.5]]) - # y_pred will be converted to [2, 2, 0, 2] - sample_weight = [0.1, 0.2, 0.3, 0.4] - # cm = [[0, 0, 0.2+0.4], - # [0.3, 0, 0], - # [0, 0, 0.1]] - # sum_row = [0.3, 0, 0.7], sum_col = [0.6, 0.3, 0.1] - # true_positives = [0, 0, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0 / (0.3 + 0.6 - 0) + 0.1 / (0.7 + 0.1 - 0.1)) / 2 - obj = metrics.OneHotIoU(num_classes=3, target_class_ids=[0, 2]) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class OneHotMeanIoUTest(tf.test.TestCase): - - def test_unweighted(self): - y_true = tf.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]]) - # y_true will be converted to [2, 0, 1, 0] - y_pred = tf.constant([[0.2, 0.3, 0.5], [0.1, 0.2, 0.7], [0.5, 0.3, 0.1], - [0.1, 0.4, 0.5]]) - # 
y_pred will be converted to [2, 2, 0, 2] - # cm = [[0, 0, 2], - # [1, 0, 0], - # [0, 0, 1] - # sum_row = [1, 0, 3], sum_col = [2, 1, 1], true_positives = [0, 0, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0 + 0 + 1 / (3 + 1 - 1)) / 3 - obj = metrics.OneHotMeanIoU(num_classes=3) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - def test_weighted(self): - y_true = tf.constant([ - [0, 0, 1], - [1, 0, 0], - [0, 1, 0], - [1, 0, 0], - [1, 0, 0], - ]) - # y_true will be converted to [2, 0, 1, 0, 0] - y_pred = tf.constant([ - [0.2, 0.3, 0.5], - [0.1, 0.2, 0.7], - [0.5, 0.3, 0.1], - [0.1, 0.4, 0.5], - [0.6, 0.2, 0.2], - ]) - # y_pred will be converted to [2, 2, 0, 2, 0] - sample_weight = [0.1, 0.2, 0.3, 0.3, 0.1] - # cm = [[0.1, 0, 0.2+0.3], - # [0.3, 0, 0], - # [0, 0, 0.1]] - # sum_row = [0.4, 0, 0.6], sum_col = [0.6, 0.3, 0.1] - # true_positives = [0.1, 0, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = (0.1 / (0.4 + 0.6 - 0.1) + 0 + 0.1 / - (0.6 + 0.1 - 0.1)) / 3 - obj = metrics.OneHotMeanIoU(num_classes=3) - self.evaluate(tf.compat.v1.variables_initializer(obj.variables)) - result = obj(y_true, y_pred, sample_weight=sample_weight) - self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class BinaryCrossentropyTest(tf.test.TestCase): - - def test_config(self): - bce_obj = metrics.BinaryCrossentropy( - name='bce', dtype=tf.int32, label_smoothing=0.2) - self.assertEqual(bce_obj.name, 'bce') - self.assertEqual(bce_obj._dtype, tf.int32) - - old_config = bce_obj.get_config() - self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3) - - # Check save and restore config - bce_obj2 = metrics.BinaryCrossentropy.from_config(old_config) - self.assertEqual(bce_obj2.name, 'bce') - self.assertEqual(bce_obj2._dtype, tf.int32) - new_config = bce_obj2.get_config() - self.assertDictEqual(old_config, new_config) - - def test_unweighted(self): - bce_obj = metrics.BinaryCrossentropy() - self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) - result = bce_obj(y_true, y_pred) - - # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 - # y` = clip_ops.clip_by_value(output, EPSILON, 1. 
- EPSILON) - # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] - - # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) - # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), - # -log(Y_MAX + EPSILON), -log(1)] - # = [(0 + 15.33) / 2, (0 + 0) / 2] - # Reduced metric = 7.665 / 2 - - self.assertAllClose(self.evaluate(result), 3.833, atol=1e-3) - - def test_unweighted_with_logits(self): - bce_obj = metrics.BinaryCrossentropy(from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) - - y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) - y_pred = tf.constant([[100.0, -100.0, 100.0], - [100.0, 100.0, -100.0]]) - result = bce_obj(y_true, y_pred) - - # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # = [((100 - 100 * 1 + log(1 + exp(-100))) + - # (0 + 100 * 0 + log(1 + exp(-100))) + - # (100 - 100 * 1 + log(1 + exp(-100))), - # ((100 - 100 * 0 + log(1 + exp(-100))) + - # (100 - 100 * 1 + log(1 + exp(-100))) + - # (0 + 100 * 1 + log(1 + exp(-100))))] - # = [(0 + 0 + 0) / 3, 200 / 3] - # Reduced metric = (0 + 66.666) / 2 - - self.assertAllClose(self.evaluate(result), 33.333, atol=1e-3) - - def test_weighted(self): - bce_obj = metrics.BinaryCrossentropy() - self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) - y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) - y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) - sample_weight = tf.constant([1.5, 2.]) - result = bce_obj(y_true, y_pred, sample_weight=sample_weight) - - # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] - - # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) - # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), - # -log(Y_MAX + EPSILON), -log(1)] - # = [(0 + 15.33) / 2, (0 + 0) / 2] - # Weighted metric = [7.665 * 1.5, 0] - # Reduced metric = 7.665 * 1.5 / (1.5 + 2) - - self.assertAllClose(self.evaluate(result), 3.285, atol=1e-3) - - def test_weighted_from_logits(self): - bce_obj = metrics.BinaryCrossentropy(from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) - y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) - y_pred = tf.constant([[100.0, -100.0, 100.0], - [100.0, 100.0, -100.0]]) - sample_weight = tf.constant([2., 2.5]) - result = bce_obj(y_true, y_pred, sample_weight=sample_weight) - - # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # = [(0 + 0 + 0) / 3, 200 / 3] - # Weighted metric = [0, 66.666 * 2.5] - # Reduced metric = 66.666 * 2.5 / (2 + 2.5) - - self.assertAllClose(self.evaluate(result), 37.037, atol=1e-3) - - def test_label_smoothing(self): - logits = tf.constant(((100., -100., -100.))) - y_true = tf.constant(((1, 0, 1))) - label_smoothing = 0.1 - # Metric: max(x, 0) - x * z + log(1 + exp(-abs(x))) - # (where x = logits and z = y_true) - # Label smoothing: z' = z * (1 - L) + 0.5L - # After label smoothing, label 1 becomes 1 - 0.5L - # label 0 becomes 0.5L - # Applying the above two fns to the given input: - # (100 - 100 * (1 - 0.5 L) + 0 + - # 0 + 100 * (0.5 L) + 0 + - # 0 + 100 * (1 - 0.5 L) + 0) * (1/3) - # = (100 + 50L) * 1/3 - bce_obj = metrics.BinaryCrossentropy( - from_logits=True, label_smoothing=label_smoothing) - self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) - result = bce_obj(y_true, logits) - expected_value = (100.0 + 50.0 * label_smoothing) / 3.0 - 
self.assertAllClose(expected_value, self.evaluate(result), atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class CategoricalCrossentropyTest(tf.test.TestCase): - - def test_config(self): - cce_obj = metrics.CategoricalCrossentropy( - name='cce', dtype=tf.int32, label_smoothing=0.2) - self.assertEqual(cce_obj.name, 'cce') - self.assertEqual(cce_obj._dtype, tf.int32) - - old_config = cce_obj.get_config() - self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3) - - # Check save and restore config - cce_obj2 = metrics.CategoricalCrossentropy.from_config(old_config) - self.assertEqual(cce_obj2.name, 'cce') - self.assertEqual(cce_obj2._dtype, tf.int32) - new_config = cce_obj2.get_config() - self.assertDictEqual(old_config, new_config) - - def test_unweighted(self): - cce_obj = metrics.CategoricalCrossentropy() - self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) - - y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) - y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) - result = cce_obj(y_true, y_pred) - - # EPSILON = 1e-7, y = y_true, y` = y_pred - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - - # Metric = -sum(y * log(y'), axis = -1) - # = -((log 0.95), (log 0.1)) - # = [0.051, 2.302] - # Reduced metric = (0.051 + 2.302) / 2 - - self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3) - - def test_unweighted_from_logits(self): - cce_obj = metrics.CategoricalCrossentropy(from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) - - y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) - logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) - result = cce_obj(y_true, logits) - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # xent = -sum(labels * log(softmax), 1) - - # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]] - # sum(exp(logits), axis=-1) = [8106.802, 2986.394] - # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]] - # log(softmax) = [[-8.00045, -0.00045, -9.00045], - # [-7.00182, -0.00182, -7.00182]] - # labels * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]] - # xent = [0.00045, 7.00182] - # Reduced xent = (0.00045 + 7.00182) / 2 - - self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3) - - def test_weighted(self): - cce_obj = metrics.CategoricalCrossentropy() - self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) - - y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) - y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) - sample_weight = tf.constant([1.5, 2.]) - result = cce_obj(y_true, y_pred, sample_weight=sample_weight) - - # EPSILON = 1e-7, y = y_true, y` = y_pred - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - - # Metric = -sum(y * log(y'), axis = -1) - # = -((log 0.95), (log 0.1)) - # = [0.051, 2.302] - # Weighted metric = [0.051 * 1.5, 2.302 * 2.] - # Reduced metric = (0.051 * 1.5 + 2.302 * 2.) 
/ 3.5 - - self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3) - - def test_weighted_from_logits(self): - cce_obj = metrics.CategoricalCrossentropy(from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) - - y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) - logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) - sample_weight = tf.constant([1.5, 2.]) - result = cce_obj(y_true, logits, sample_weight=sample_weight) - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # xent = -sum(labels * log(softmax), 1) - # xent = [0.00045, 7.00182] - # weighted xent = [0.000675, 14.00364] - # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2) - - self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3) - - def test_label_smoothing(self): - y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) - logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) - label_smoothing = 0.1 - - # Label smoothing: z' = z * (1 - L) + L/n, - # where L = label smoothing value and n = num classes - # Label value 1 becomes: 1 - L + L/n - # Label value 0 becomes: L/n - # y_true with label_smoothing = [[0.0333, 0.9333, 0.0333], - # [0.0333, 0.0333, 0.9333]] - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # xent = -sum(labels * log(softmax), 1) - # log(softmax) = [[-8.00045, -0.00045, -9.00045], - # [-7.00182, -0.00182, -7.00182]] - # labels * log(softmax) = [[-0.26641, -0.00042, -0.29971], - # [-0.23316, -0.00006, -6.53479]] - # xent = [0.56654, 6.76801] - # Reduced xent = (0.56654 + 6.76801) / 2 - - cce_obj = metrics.CategoricalCrossentropy( - from_logits=True, label_smoothing=label_smoothing) - self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) - loss = cce_obj(y_true, logits) - self.assertAllClose(self.evaluate(loss), 3.667, atol=1e-3) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class SparseCategoricalCrossentropyTest(tf.test.TestCase): - - def test_config(self): - scce_obj = metrics.SparseCategoricalCrossentropy( - name='scce', dtype=tf.int32) - self.assertEqual(scce_obj.name, 'scce') - self.assertEqual(scce_obj.dtype, tf.int32) - old_config = scce_obj.get_config() - self.assertDictEqual(old_config, json.loads(json.dumps(old_config))) - - # Check save and restore config - scce_obj2 = metrics.SparseCategoricalCrossentropy.from_config(old_config) - self.assertEqual(scce_obj2.name, 'scce') - self.assertEqual(scce_obj2.dtype, tf.int32) - new_config = scce_obj2.get_config() - self.assertDictEqual(old_config, new_config) - - def test_unweighted(self): - scce_obj = metrics.SparseCategoricalCrossentropy() - self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) - - y_true = np.asarray([1, 2]) - y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) - result = scce_obj(y_true, y_pred) - - # EPSILON = 1e-7, y = y_true, y` = y_pred - # y` = clip_ops.clip_by_value(output, EPSILON, 1. 
- EPSILON) - # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - # logits = log(y`) = [[-2.9957, -0.0513, -16.1181], - # [-2.3026, -0.2231, -2.3026]] - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]] - # xent = -sum(y * log(softmax), 1) - - # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - # sum(exp(logits), axis=-1) = [1, 1] - # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - # log(softmax) = [[-2.9957, -0.0513, -16.1181], - # [-2.3026, -0.2231, -2.3026]] - # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] - # xent = [0.0513, 2.3026] - # Reduced xent = (0.0513 + 2.3026) / 2 - - self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3) - - def test_unweighted_from_logits(self): - scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) - - y_true = np.asarray([1, 2]) - logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) - result = scce_obj(y_true, logits) - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] - # xent = -sum(y_true * log(softmax), 1) - - # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]] - # sum(exp(logits), axis=-1) = [8106.802, 2986.394] - # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]] - # log(softmax) = [[-8.00045, -0.00045, -9.00045], - # [-7.00182, -0.00182, -7.00182]] - # y_true * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]] - # xent = [0.00045, 7.00182] - # Reduced xent = (0.00045 + 7.00182) / 2 - - self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3) - - def test_weighted(self): - scce_obj = metrics.SparseCategoricalCrossentropy() - self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) - - y_true = np.asarray([1, 2]) - y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) - sample_weight = tf.constant([1.5, 2.]) - result = scce_obj(y_true, y_pred, sample_weight=sample_weight) - - # EPSILON = 1e-7, y = y_true, y` = y_pred - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - # logits = log(y`) = [[-2.9957, -0.0513, -16.1181], - # [-2.3026, -0.2231, -2.3026]] - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]] - # xent = -sum(y * log(softmax), 1) - - # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - # sum(exp(logits), axis=-1) = [1, 1] - # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - # log(softmax) = [[-2.9957, -0.0513, -16.1181], - # [-2.3026, -0.2231, -2.3026]] - # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] - # xent = [0.0513, 2.3026] - # Weighted xent = [0.051 * 1.5, 2.302 * 2.] - # Reduced xent = (0.051 * 1.5 + 2.302 * 2.) 
/ 3.5 - - self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3) - - def test_weighted_from_logits(self): - scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True) - self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) - - y_true = np.asarray([1, 2]) - logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) - sample_weight = tf.constant([1.5, 2.]) - result = scce_obj(y_true, logits, sample_weight=sample_weight) - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] - # xent = -sum(y_true * log(softmax), 1) - # xent = [0.00045, 7.00182] - # weighted xent = [0.000675, 14.00364] - # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2) - - self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3) - - def test_axis(self): - scce_obj = metrics.SparseCategoricalCrossentropy(axis=0) - self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) - - y_true = np.asarray([1, 2]) - y_pred = np.asarray([[0.05, 0.1], [0.95, 0.8], [0, 0.1]]) - result = scce_obj(y_true, y_pred) - - # EPSILON = 1e-7, y = y_true, y` = y_pred - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]] - # logits = log(y`) = [[-2.9957, -2.3026], - # [-0.0513, -0.2231], - # [-16.1181, -2.3026]] - - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # y = one_hot(y) = [[0, 0], [1, 0], [0, 1]] - # xent = -sum(y * log(softmax), 1) - - # exp(logits) = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]] - # sum(exp(logits)) = [1, 1] - # softmax = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]] - # log(softmax) = [[-2.9957, -2.3026], - # [-0.0513, -0.2231], - # [-16.1181, -2.3026]] - # y * log(softmax) = [[0, 0], [-0.0513, 0], [0, -2.3026]] - # xent = [0.0513, 2.3026] - # Reduced xent = (0.0513 + 2.3026) / 2 - - self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3) - - -class BinaryTruePositives(metrics.Metric): - - def __init__(self, name='binary_true_positives', **kwargs): - super().__init__(name=name, **kwargs) - self.true_positives = self.add_weight(name='tp', initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - y_true = tf.cast(y_true, tf.bool) - y_pred = tf.cast(y_pred, tf.bool) - - values = tf.logical_and( - tf.equal(y_true, True), tf.equal(y_pred, True)) - values = tf.cast(values, self.dtype) - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, dtype=self.dtype) - sample_weight = tf.__internal__.ops.broadcast_weights( - sample_weight, values) - values = tf.multiply(values, sample_weight) - self.true_positives.assign_add(tf.reduce_sum(values)) - - def result(self): - return self.true_positives - - -class BinaryTruePositivesViaControlFlow(metrics.Metric): - - def __init__(self, name='binary_true_positives', **kwargs): - super().__init__(name=name, **kwargs) - self.true_positives = self.add_weight(name='tp', initializer='zeros') - - def update_state(self, y_true, y_pred, sample_weight=None): - y_true = tf.cast(y_true, tf.bool) - y_pred = tf.cast(y_pred, tf.bool) - - for i in range(len(y_true)): - for j in range(len(y_true[i])): - if y_true[i][j] and y_pred[i][j]: - if sample_weight is None: - self.true_positives.assign_add(1) - else: - self.true_positives.assign_add(sample_weight[i][0]) - - def result(self): - if tf.constant(True): - return self.true_positives - return 0.0 - - -def _get_model(compile_metrics): - model_layers = [ - layers.Dense(3, activation='relu', kernel_initializer='ones'), - layers.Dense(1, 
activation='sigmoid', kernel_initializer='ones')] - - model = test_utils.get_model_from_layers(model_layers, input_shape=(4,)) - model.compile( - loss='mae', - metrics=compile_metrics, - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - return model - - -@test_combinations.run_with_all_model_types -@test_combinations.run_all_keras_modes -class ResetStatesTest(test_combinations.TestCase): - - def test_reset_state_false_positives(self): - fp_obj = metrics.FalsePositives() - model = _get_model([fp_obj]) - x = np.ones((100, 4)) - y = np.zeros((100, 1)) - model.evaluate(x, y) - self.assertEqual(self.evaluate(fp_obj.accumulator), 100.) - model.evaluate(x, y) - self.assertEqual(self.evaluate(fp_obj.accumulator), 100.) - - def test_reset_state_false_negatives(self): - fn_obj = metrics.FalseNegatives() - model = _get_model([fn_obj]) - x = np.zeros((100, 4)) - y = np.ones((100, 1)) - model.evaluate(x, y) - self.assertEqual(self.evaluate(fn_obj.accumulator), 100.) - model.evaluate(x, y) - self.assertEqual(self.evaluate(fn_obj.accumulator), 100.) - - def test_reset_state_true_negatives(self): - tn_obj = metrics.TrueNegatives() - model = _get_model([tn_obj]) - x = np.zeros((100, 4)) - y = np.zeros((100, 1)) - model.evaluate(x, y) - self.assertEqual(self.evaluate(tn_obj.accumulator), 100.) - model.evaluate(x, y) - self.assertEqual(self.evaluate(tn_obj.accumulator), 100.) - - def test_reset_state_true_positives(self): - tp_obj = metrics.TruePositives() - model = _get_model([tp_obj]) - x = np.ones((100, 4)) - y = np.ones((100, 1)) - model.evaluate(x, y) - self.assertEqual(self.evaluate(tp_obj.accumulator), 100.) - model.evaluate(x, y) - self.assertEqual(self.evaluate(tp_obj.accumulator), 100.) - - def test_reset_state_precision(self): - p_obj = metrics.Precision() - model = _get_model([p_obj]) - x = np.concatenate((np.ones((50, 4)), np.ones((50, 4)))) - y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1)))) - model.evaluate(x, y) - self.assertEqual(self.evaluate(p_obj.true_positives), 50.) - self.assertEqual(self.evaluate(p_obj.false_positives), 50.) - model.evaluate(x, y) - self.assertEqual(self.evaluate(p_obj.true_positives), 50.) - self.assertEqual(self.evaluate(p_obj.false_positives), 50.) - - def test_precision_update_state_with_logits(self): - p_obj = metrics.Precision() - # Update state with logits (not in range (0, 1)) should not an raise error. - p_obj.update_state([-0.5, 0.5], [-2., 2.]) - - def test_reset_state_recall(self): - r_obj = metrics.Recall() - model = _get_model([r_obj]) - x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4)))) - y = np.concatenate((np.ones((50, 1)), np.ones((50, 1)))) - model.evaluate(x, y) - self.assertEqual(self.evaluate(r_obj.true_positives), 50.) - self.assertEqual(self.evaluate(r_obj.false_negatives), 50.) - model.evaluate(x, y) - self.assertEqual(self.evaluate(r_obj.true_positives), 50.) - self.assertEqual(self.evaluate(r_obj.false_negatives), 50.) - - def test_reset_state_sensitivity_at_specificity(self): - s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1) - model = _get_model([s_obj]) - x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), - np.ones((25, 4)))) - y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)), - np.zeros((25, 1)))) - - for _ in range(2): - model.evaluate(x, y) - self.assertEqual(self.evaluate(s_obj.true_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) 
- self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - - def test_reset_state_specificity_at_sensitivity(self): - s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1) - model = _get_model([s_obj]) - x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), - np.ones((25, 4)))) - y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)), - np.zeros((25, 1)))) - - for _ in range(2): - model.evaluate(x, y) - self.assertEqual(self.evaluate(s_obj.true_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) - self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - - def test_reset_state_precision_at_recall(self): - s_obj = metrics.PrecisionAtRecall(recall=0.5, num_thresholds=1) - model = _get_model([s_obj]) - x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), - np.ones((25, 4)))) - y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)), - np.zeros((25, 1)))) - - for _ in range(2): - model.evaluate(x, y) - self.assertEqual(self.evaluate(s_obj.true_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) - self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - - def test_reset_state_recall_at_precision(self): - s_obj = metrics.RecallAtPrecision(precision=0.5, num_thresholds=1) - model = _get_model([s_obj]) - x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), - np.ones((25, 4)))) - y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)), - np.zeros((25, 1)))) - - for _ in range(2): - model.evaluate(x, y) - self.assertEqual(self.evaluate(s_obj.true_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_positives), 25.) - self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) - self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - - def test_reset_state_auc(self): - auc_obj = metrics.AUC(num_thresholds=3) - model = _get_model([auc_obj]) - x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), - np.ones((25, 4)))) - y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)), - np.zeros((25, 1)))) - - for _ in range(2): - model.evaluate(x, y) - self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.) - - def test_reset_state_auc_from_logits(self): - auc_obj = metrics.AUC(num_thresholds=3, from_logits=True) - - model_layers = [layers.Dense(1, kernel_initializer='ones', use_bias=False)] - model = test_utils.get_model_from_layers(model_layers, input_shape=(4,)) - model.compile( - loss='mae', - metrics=[auc_obj], - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - - x = np.concatenate((np.ones((25, 4)), -np.ones((25, 4)), -np.ones( - (25, 4)), np.ones((25, 4)))) - y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones( - (25, 1)), np.zeros((25, 1)))) - - for _ in range(2): - model.evaluate(x, y) - self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.) 
- - def test_reset_state_auc_manual_thresholds(self): - auc_obj = metrics.AUC(thresholds=[0.5]) - model = _get_model([auc_obj]) - x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), - np.ones((25, 4)))) - y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)), - np.zeros((25, 1)))) - - for _ in range(2): - model.evaluate(x, y) - self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.) - self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.) - - def test_reset_state_mean_iou(self): - m_obj = metrics.MeanIoU(num_classes=2) - model = _get_model([m_obj]) - x = np.asarray([[0, 0, 0, 0], [1, 1, 1, 1], [1, 0, 1, 0], [0, 1, 0, 1]], - dtype=np.float32) - y = np.asarray([[0], [1], [1], [1]], dtype=np.float32) - model.evaluate(x, y) - self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1) - self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1) - model.evaluate(x, y) - self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1) - self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1) - - def test_reset_state_recall_float64(self): - # Test case for GitHub issue 36790. - try: - backend.set_floatx('float64') - r_obj = metrics.Recall() - model = _get_model([r_obj]) - x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4)))) - y = np.concatenate((np.ones((50, 1)), np.ones((50, 1)))) - model.evaluate(x, y) - self.assertEqual(self.evaluate(r_obj.true_positives), 50.) - self.assertEqual(self.evaluate(r_obj.false_negatives), 50.) - model.evaluate(x, y) - self.assertEqual(self.evaluate(r_obj.true_positives), 50.) - self.assertEqual(self.evaluate(r_obj.false_negatives), 50.) - finally: - backend.set_floatx('float32') - - def test_function_wrapped_reset_state(self): - m = metrics.Mean(name='my_mean') - - # check reset_state in function. - @tf.function - def reset_in_fn(): - m.reset_state() - return m.update_state(100) - - for _ in range(5): - self.evaluate(reset_in_fn()) - self.assertEqual(self.evaluate(m.count), 1) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MergeStateTest(test_combinations.TestCase): - - def test_merge_state_incompatible_metrics(self): - with self.assertRaisesRegex(ValueError, - 'Metric .* is not compatible with .*'): - obj1 = metrics.FalsePositives() - self.evaluate(tf.compat.v1.variables_initializer(obj1.variables)) - obj2 = metrics.Accuracy() - self.evaluate(tf.compat.v1.variables_initializer(obj2.variables)) - self.evaluate(obj1.merge_state([obj2])) - - def test_merge_state_accuracy(self): - a_objs = [] - for y_true, y_pred in zip([[[1], [2]], [[3], [4]]], - [[[0], [2]], [[3], [4]]]): - a_obj = metrics.Accuracy() - a_objs.append(a_obj) - self.evaluate(tf.compat.v1.variables_initializer(a_obj.variables)) - self.evaluate(a_obj.update_state(y_true, y_pred)) - self.evaluate(a_objs[0].merge_state(a_objs[1:])) - self.assertEqual(self.evaluate(a_objs[0].total), 3.) - self.assertEqual(self.evaluate(a_objs[0].count), 4.) 
- self.assertEqual(self.evaluate(a_objs[0].result()), 0.75) - - def test_merge_state_false_positives(self): - fp_objs = [] - for _ in range(4): - fp_obj = metrics.FalsePositives() - fp_objs.append(fp_obj) - self.evaluate(tf.compat.v1.variables_initializer(fp_obj.variables)) - y_true = np.zeros((25, 1)) - y_pred = np.ones((25, 1)) - self.evaluate(fp_obj.update_state(y_true, y_pred)) - self.evaluate(fp_objs[0].merge_state(fp_objs[1:])) - self.assertEqual(self.evaluate(fp_objs[0].accumulator), 100.) - - def test_merge_state_false_negatives(self): - fn_objs = [] - for _ in range(4): - fn_obj = metrics.FalseNegatives() - fn_objs.append(fn_obj) - self.evaluate(tf.compat.v1.variables_initializer(fn_obj.variables)) - y_true = np.ones((25, 1)) - y_pred = np.zeros((25, 1)) - self.evaluate(fn_obj.update_state(y_true, y_pred)) - self.evaluate(fn_objs[0].merge_state(fn_objs[1:])) - self.assertEqual(self.evaluate(fn_objs[0].accumulator), 100.) - - def test_merge_state_true_negatives(self): - tn_objs = [] - for _ in range(4): - tn_obj = metrics.TrueNegatives() - tn_objs.append(tn_obj) - self.evaluate(tf.compat.v1.variables_initializer(tn_obj.variables)) - y_true = np.zeros((25, 1)) - y_pred = np.zeros((25, 1)) - self.evaluate(tn_obj.update_state(y_true, y_pred)) - self.evaluate(tn_objs[0].merge_state(tn_objs[1:])) - self.assertEqual(self.evaluate(tn_objs[0].accumulator), 100.) - - def test_merge_state_true_positives(self): - tp_objs = [] - for _ in range(4): - tp_obj = metrics.TruePositives() - tp_objs.append(tp_obj) - self.evaluate(tf.compat.v1.variables_initializer(tp_obj.variables)) - y_true = np.ones((25, 1)) - y_pred = np.ones((25, 1)) - self.evaluate(tp_obj.update_state(y_true, y_pred)) - self.evaluate(tp_objs[0].merge_state(tp_objs[1:])) - self.assertEqual(self.evaluate(tp_objs[0].accumulator), 100.) - - def test_merge_state_precision(self): - p_objs = [] - for _ in range(5): - p_obj = metrics.Precision() - p_objs.append(p_obj) - self.evaluate(tf.compat.v1.variables_initializer(p_obj.variables)) - y_true = np.concatenate((np.ones((10, 1)), np.zeros((10, 1)))) - y_pred = np.concatenate((np.ones((10, 1)), np.ones((10, 1)))) - self.evaluate(p_obj.update_state(y_true, y_pred)) - self.evaluate(p_objs[0].merge_state(p_objs[1:])) - self.assertEqual(self.evaluate(p_objs[0].true_positives), 50.) - self.assertEqual(self.evaluate(p_objs[0].false_positives), 50.) - - def test_merge_state_recall(self): - r_objs = [] - for _ in range(5): - r_obj = metrics.Recall() - r_objs.append(r_obj) - self.evaluate(tf.compat.v1.variables_initializer(r_obj.variables)) - y_true = np.concatenate((np.ones((10, 1)), np.ones((10, 1)))) - y_pred = np.concatenate((np.ones((10, 1)), np.zeros((10, 1)))) - self.evaluate(r_obj.update_state(y_true, y_pred)) - self.evaluate(r_objs[0].merge_state(r_objs[1:])) - self.assertEqual(self.evaluate(r_objs[0].true_positives), 50.) - self.assertEqual(self.evaluate(r_objs[0].false_negatives), 50.) 
- - def test_merge_state_sensitivity_at_specificity(self): - sas_objs = [] - for _ in range(5): - sas_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1) - sas_objs.append(sas_obj) - self.evaluate(tf.compat.v1.variables_initializer(sas_obj.variables)) - y_true = np.concatenate((np.ones((5, 1)), np.zeros((5, 1)), np.ones( - (5, 1)), np.zeros((5, 1)))) - y_pred = np.concatenate((np.ones((5, 1)), np.zeros( - (5, 1)), np.zeros((5, 1)), np.ones((5, 1)))) - self.evaluate(sas_obj.update_state(y_true, y_pred)) - self.evaluate(sas_objs[0].merge_state(sas_objs[1:])) - self.assertEqual(self.evaluate(sas_objs[0].true_positives), 25.) - self.assertEqual(self.evaluate(sas_objs[0].false_positives), 25.) - self.assertEqual(self.evaluate(sas_objs[0].false_negatives), 25.) - self.assertEqual(self.evaluate(sas_objs[0].true_negatives), 25.) - - def test_merge_state_specificity_at_sensitivity(self): - sas_objs = [] - for _ in range(5): - sas_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1) - sas_objs.append(sas_obj) - self.evaluate(tf.compat.v1.variables_initializer(sas_obj.variables)) - y_true = np.concatenate((np.ones((5, 1)), np.zeros((5, 1)), np.ones( - (5, 1)), np.zeros((5, 1)))) - y_pred = np.concatenate((np.ones((5, 1)), np.zeros( - (5, 1)), np.zeros((5, 1)), np.ones((5, 1)))) - self.evaluate(sas_obj.update_state(y_true, y_pred)) - self.evaluate(sas_objs[0].merge_state(sas_objs[1:])) - self.assertEqual(self.evaluate(sas_objs[0].true_positives), 25.) - self.assertEqual(self.evaluate(sas_objs[0].false_positives), 25.) - self.assertEqual(self.evaluate(sas_objs[0].false_negatives), 25.) - self.assertEqual(self.evaluate(sas_objs[0].true_negatives), 25.) - - def test_merge_state_precision_at_recall(self): - par_objs = [] - for _ in range(5): - par_obj = metrics.PrecisionAtRecall(recall=0.5, num_thresholds=1) - par_objs.append(par_obj) - self.evaluate(tf.compat.v1.variables_initializer(par_obj.variables)) - y_true = np.concatenate((np.ones((5, 1)), np.zeros((5, 1)), np.ones( - (5, 1)), np.zeros((5, 1)))) - y_pred = np.concatenate((np.ones((5, 1)), np.zeros( - (5, 1)), np.zeros((5, 1)), np.ones((5, 1)))) - self.evaluate(par_obj.update_state(y_true, y_pred)) - self.evaluate(par_objs[0].merge_state(par_objs[1:])) - self.assertEqual(self.evaluate(par_objs[0].true_positives), 25.) - self.assertEqual(self.evaluate(par_objs[0].false_positives), 25.) - self.assertEqual(self.evaluate(par_objs[0].false_negatives), 25.) - self.assertEqual(self.evaluate(par_objs[0].true_negatives), 25.) - - def test_merge_state_recall_at_precision(self): - rap_objs = [] - for _ in range(5): - rap_obj = metrics.PrecisionAtRecall(recall=0.5, num_thresholds=1) - rap_objs.append(rap_obj) - self.evaluate(tf.compat.v1.variables_initializer(rap_obj.variables)) - y_true = np.concatenate((np.ones((5, 1)), np.zeros((5, 1)), np.ones( - (5, 1)), np.zeros((5, 1)))) - y_pred = np.concatenate((np.ones((5, 1)), np.zeros( - (5, 1)), np.zeros((5, 1)), np.ones((5, 1)))) - self.evaluate(rap_obj.update_state(y_true, y_pred)) - self.evaluate(rap_objs[0].merge_state(rap_objs[1:])) - self.assertEqual(self.evaluate(rap_objs[0].true_positives), 25.) - self.assertEqual(self.evaluate(rap_objs[0].false_positives), 25.) - self.assertEqual(self.evaluate(rap_objs[0].false_negatives), 25.) - self.assertEqual(self.evaluate(rap_objs[0].true_negatives), 25.) 
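A note on the `[1]` indexing in the AUC test that follows: with `num_thresholds=3`, AUC keeps one confusion-matrix entry per threshold (approximately `[0, 0.5, 1]`), so index 1 corresponds to the usual 0.5 decision boundary. A quick eager-mode illustration with made-up inputs:

```python
import tensorflow as tf

# AUC stores per-threshold confusion-matrix variables; index 1 is the
# middle threshold (~0.5) when num_thresholds=3.
auc = tf.keras.metrics.AUC(num_thresholds=3)
auc.update_state([1, 0, 1, 0], [1.0, 0.0, 0.0, 1.0])
print(auc.true_positives.numpy())     # [2., 1., 0.], one entry per threshold
print(auc.true_positives.numpy()[1])  # 1.0: the TP count at the 0.5 cut-off
```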
- - def test_merge_state_auc(self): - auc_objs = [] - for _ in range(5): - auc_obj = metrics.AUC(num_thresholds=3) - auc_objs.append(auc_obj) - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - y_true = np.concatenate((np.ones((5, 1)), np.zeros((5, 1)), np.ones( - (5, 1)), np.zeros((5, 1)))) - y_pred = np.concatenate((np.ones((5, 1)), np.zeros( - (5, 1)), np.zeros((5, 1)), np.ones((5, 1)))) - self.evaluate(auc_obj.update_state(y_true, y_pred)) - self.evaluate(auc_objs[0].merge_state(auc_objs[1:])) - self.assertEqual(self.evaluate(auc_objs[0].true_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_objs[0].false_positives[1]), 25.) - self.assertEqual(self.evaluate(auc_objs[0].false_negatives[1]), 25.) - self.assertEqual(self.evaluate(auc_objs[0].true_negatives[1]), 25.) - - def test_merge_state_mean_iou(self): - m_objs = [] - for y_true, y_pred in zip([[0], [1], [1], [1]], - [[0.5], [1.0], [1.0], [1.0]]): - m_obj = metrics.MeanIoU(num_classes=2) - m_objs.append(m_obj) - self.evaluate(tf.compat.v1.variables_initializer(m_obj.variables)) - self.evaluate(m_obj.update_state(y_true, y_pred)) - self.evaluate(m_objs[0].merge_state(m_objs[1:])) - self.assertArrayNear(self.evaluate(m_objs[0].total_cm)[0], [1, 0], 1e-1) - self.assertArrayNear(self.evaluate(m_objs[0].total_cm)[1], [0, 3], 1e-1) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/metrics/probabilistic_metrics.py b/keras/metrics/probabilistic_metrics.py new file mode 100644 index 000000000000..c2c8d4871d0b --- /dev/null +++ b/keras/metrics/probabilistic_metrics.py @@ -0,0 +1,346 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Probabilistic metrics (based on Entropy).""" + +from typing import Optional +from typing import Union + +import tensorflow.compat.v2 as tf + +from keras.dtensor import utils as dtensor_utils +from keras.losses import binary_crossentropy +from keras.losses import categorical_crossentropy +from keras.losses import kullback_leibler_divergence +from keras.losses import poisson +from keras.losses import sparse_categorical_crossentropy +from keras.metrics import base_metric + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@keras_export("keras.metrics.Poisson") +class Poisson(base_metric.MeanMetricWrapper): + """Computes the Poisson score between `y_true` and `y_pred`. + + 🐟 🐟 🐟 + + It is defined as: `poisson_score = y_pred - y_true * log(y_pred)`. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.Poisson() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.49999997 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... 
sample_weight=[1, 0]) + >>> m.result().numpy() + 0.99999994 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='categorical_crossentropy', + metrics=[tf.keras.metrics.Poisson()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="poisson", dtype=None): + super().__init__(poisson, name, dtype=dtype) + + +@keras_export("keras.metrics.KLDivergence") +class KLDivergence(base_metric.MeanMetricWrapper): + """Computes the Kullback-Leibler divergence metric between `y_true` and + `y_pred`. + + `metric = y_true * log(y_true / y_pred)` + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.KLDivergence() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 0.45814306 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.9162892 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='categorical_crossentropy', + metrics=[tf.keras.metrics.KLDivergence()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="kullback_leibler_divergence", dtype=None): + super().__init__(kullback_leibler_divergence, name, dtype=dtype) + + +@keras_export("keras.metrics.BinaryCrossentropy") +class BinaryCrossentropy(base_metric.MeanMetricWrapper): + """Computes the crossentropy metric between the labels and predictions. + + This is the crossentropy metric class to be used when there are only two + label classes (0 and 1). + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + from_logits: (Optional) Whether output is expected to be a logits tensor. + By default, we consider that output encodes a probability distribution. + label_smoothing: (Optional) Float in [0, 1]. When > 0, label values are + smoothed, meaning the confidence on label values is relaxed. + e.g. `label_smoothing=0.2` means that we will use a value of `0.1` for + label `0` and `0.9` for label `1`. + + Standalone usage: + + >>> m = tf.keras.metrics.BinaryCrossentropy() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 0.81492424 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.9162905 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='binary_crossentropy', + metrics=[tf.keras.metrics.BinaryCrossentropy()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + name="binary_crossentropy", + dtype=None, + from_logits=False, + label_smoothing=0, + ): + super().__init__( + binary_crossentropy, + name, + dtype=dtype, + from_logits=from_logits, + label_smoothing=label_smoothing, + ) + + +@keras_export("keras.metrics.CategoricalCrossentropy") +class CategoricalCrossentropy(base_metric.MeanMetricWrapper): + """Computes the crossentropy metric between the labels and predictions. + + This is the crossentropy metric class to be used when there are multiple + label classes (2 or more). Here we assume that labels are given as a + `one_hot` representation. e.g., when label values are [2, 0, 1], + `y_true` = [[0, 0, 1], [1, 0, 0], [0, 1, 0]]. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result.
+ from_logits: (Optional) Whether output is expected to be a logits tensor. + By default, we consider that output encodes a probability distribution. + label_smoothing: (Optional) Float in [0, 1]. When > 0, label values are + smoothed, meaning the confidence on label values is relaxed. e.g. + `label_smoothing=0.2` means that we will use a value of `0.1` for label + `0` and `0.9` for label `1`. + axis: (Optional) The dimension along which entropy is + computed. Defaults to `-1`. + + Standalone usage: + + >>> # EPSILON = 1e-7, y = y_true, y` = y_pred + >>> # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + >>> # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + >>> # xent = -sum(y * log(y'), axis = -1) + >>> # = -((log 0.95), (log 0.1)) + >>> # = [0.051, 2.302] + >>> # Reduced xent = (0.051 + 2.302) / 2 + >>> m = tf.keras.metrics.CategoricalCrossentropy() + >>> m.update_state([[0, 1, 0], [0, 0, 1]], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> m.result().numpy() + 1.1769392 + + >>> m.reset_state() + >>> m.update_state([[0, 1, 0], [0, 0, 1]], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], + ... sample_weight=tf.constant([0.3, 0.7])) + >>> m.result().numpy() + 1.6271976 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='categorical_crossentropy', + metrics=[tf.keras.metrics.CategoricalCrossentropy()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + name="categorical_crossentropy", + dtype=None, + from_logits=False, + label_smoothing=0, + axis=-1, + ): + super().__init__( + categorical_crossentropy, + name, + dtype=dtype, + from_logits=from_logits, + label_smoothing=label_smoothing, + axis=axis, + ) + + +@keras_export("keras.metrics.SparseCategoricalCrossentropy") +class SparseCategoricalCrossentropy(base_metric.MeanMetricWrapper): + """Computes the crossentropy metric between the labels and predictions. + + Use this crossentropy metric when there are two or more label classes. + We expect labels to be provided as integers. If you want to provide labels + using `one-hot` representation, please use `CategoricalCrossentropy` metric. + There should be `# classes` floating point values per feature for `y_pred` + and a single floating point value per feature for `y_true`. + + In the snippet below, there is a single floating point value per example for + `y_true` and `# classes` floating point values per example for `y_pred`. + The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is + `[batch_size, num_classes]`. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + from_logits: (Optional) Whether output is expected to be a logits tensor. + By default, we consider that output encodes a probability distribution. + ignore_class: Optional integer. The ID of a class to be ignored during + metric computation. This is useful, for example, in segmentation + problems featuring a "void" class (commonly -1 or 255) in segmentation + maps. By default (`ignore_class=None`), all classes are considered. + axis: (Optional) The dimension along which entropy is + computed. Defaults to `-1`.
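A concrete note on the `ignore_class` argument documented above, before the standalone-usage examples resume: labels equal to `ignore_class` are masked out of the running average, matching the "void"-class use case the docstring mentions. A hedged sketch of the intended semantics (illustrative only, not part of the patch; exact masking behavior is defined by the implementation above):

```python
import tensorflow as tf

# Hedged sketch: entries labeled with ignore_class are masked out.
m = tf.keras.metrics.SparseCategoricalCrossentropy(ignore_class=-1)
y_true = [1, 2, -1]                    # third entry is a void label
y_pred = [[0.05, 0.95, 0.00],
          [0.10, 0.80, 0.10],
          [0.33, 0.33, 0.34]]          # masked, contributes nothing
m.update_state(y_true, y_pred)
# Expected to average over the two kept entries only:
# (-log(0.95) - log(0.1)) / 2 ~ 1.177
print(m.result().numpy())
```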
+ + Standalone usage: + + >>> # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] + >>> # logits = log(y_pred) + >>> # softmax = exp(logits) / sum(exp(logits), axis=-1) + >>> # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + >>> # xent = -sum(y * log(softmax), 1) + >>> # log(softmax) = [[-2.9957, -0.0513, -16.1181], + >>> # [-2.3026, -0.2231, -2.3026]] + >>> # y_true * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] + >>> # xent = [0.0513, 2.3026] + >>> # Reduced xent = (0.0513 + 2.3026) / 2 + >>> m = tf.keras.metrics.SparseCategoricalCrossentropy() + >>> m.update_state([1, 2], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> m.result().numpy() + 1.1769392 + + >>> m.reset_state() + >>> m.update_state([1, 2], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], + ... sample_weight=tf.constant([0.3, 0.7])) + >>> m.result().numpy() + 1.6271976 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='sparse_categorical_crossentropy', + metrics=[tf.keras.metrics.SparseCategoricalCrossentropy()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__( + self, + name: str = "sparse_categorical_crossentropy", + dtype: Optional[Union[str, tf.dtypes.DType]] = None, + from_logits: bool = False, + ignore_class: Optional[int] = None, + axis: int = -1, + ): + super().__init__( + sparse_categorical_crossentropy, + name, + dtype=dtype, + from_logits=from_logits, + ignore_class=ignore_class, + axis=axis, + ) + + +_SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING = """Accumulates metric statistics. + +For sparse categorical metrics, the shapes of `y_true` and `y_pred` are +different. + +Args: + y_true: Ground truth label values. shape = `[batch_size, d0, .. dN-1]` or + shape = `[batch_size, d0, .. dN-1, 1]`. + y_pred: The predicted probability values. shape = `[batch_size, d0, .. dN]`. + sample_weight: Optional `sample_weight` acts as a + coefficient for the metric. If a scalar is provided, then the metric is + simply scaled by the given value. If `sample_weight` is a tensor of size + `[batch_size]`, then the metric for each sample of the batch is rescaled + by the corresponding element in the `sample_weight` vector. If the shape + of `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be broadcasted + to this shape), then each metric element of `y_pred` is scaled by the + corresponding value of `sample_weight`. (Note on `dN-1`: all metric + functions reduce by 1 dimension, usually the last axis (-1)). + +Returns: + Update op. +""" + +SparseCategoricalCrossentropy.update_state.__doc__ = ( + _SPARSE_CATEGORICAL_UPDATE_STATE_DOCSTRING +) diff --git a/keras/metrics/probabilistic_metrics_test.py b/keras/metrics/probabilistic_metrics_test.py new file mode 100644 index 000000000000..0a2e8577d565 --- /dev/null +++ b/keras/metrics/probabilistic_metrics_test.py @@ -0,0 +1,567 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras metrics.""" + +import json + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras import metrics +from keras.testing_infra import test_combinations + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class PoissonTest(tf.test.TestCase): + def setup(self): + y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3)) + y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) + + self.batch_size = 6 + self.expected_results = y_pred - np.multiply(y_true, np.log(y_pred)) + + self.y_pred = tf.constant(y_pred, dtype=tf.float32) + self.y_true = tf.constant(y_true) + + def test_config(self): + poisson_obj = metrics.Poisson(name="poisson", dtype=tf.int32) + self.assertEqual(poisson_obj.name, "poisson") + self.assertEqual(poisson_obj._dtype, tf.int32) + + poisson_obj2 = metrics.Poisson.from_config(poisson_obj.get_config()) + self.assertEqual(poisson_obj2.name, "poisson") + self.assertEqual(poisson_obj2._dtype, tf.int32) + + def test_unweighted(self): + self.setup() + poisson_obj = metrics.Poisson() + self.evaluate(tf.compat.v1.variables_initializer(poisson_obj.variables)) + + update_op = poisson_obj.update_state(self.y_true, self.y_pred) + self.evaluate(update_op) + result = poisson_obj.result() + expected_result = np.sum(self.expected_results) / self.batch_size + self.assertAllClose(result, expected_result, atol=1e-3) + + def test_weighted(self): + self.setup() + poisson_obj = metrics.Poisson() + self.evaluate(tf.compat.v1.variables_initializer(poisson_obj.variables)) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + + result = poisson_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape( + (2, 3) + ) + expected_result = np.multiply(self.expected_results, sample_weight) + expected_result = np.sum(expected_result) / np.sum(sample_weight) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class KLDivergenceTest(tf.test.TestCase): + def setup(self): + y_pred = np.asarray([0.4, 0.9, 0.12, 0.36, 0.3, 0.4]).reshape((2, 3)) + y_true = np.asarray([0.5, 0.8, 0.12, 0.7, 0.43, 0.8]).reshape((2, 3)) + + self.batch_size = 2 + self.expected_results = np.multiply(y_true, np.log(y_true / y_pred)) + + self.y_pred = tf.constant(y_pred, dtype=tf.float32) + self.y_true = tf.constant(y_true) + + def test_config(self): + k_obj = metrics.KLDivergence(name="kld", dtype=tf.int32) + self.assertEqual(k_obj.name, "kld") + self.assertEqual(k_obj._dtype, tf.int32) + + k_obj2 = metrics.KLDivergence.from_config(k_obj.get_config()) + self.assertEqual(k_obj2.name, "kld") + self.assertEqual(k_obj2._dtype, tf.int32) + + def test_unweighted(self): + self.setup() + k_obj = metrics.KLDivergence() + self.evaluate(tf.compat.v1.variables_initializer(k_obj.variables)) + + update_op = k_obj.update_state(self.y_true, self.y_pred) + self.evaluate(update_op) + result = k_obj.result() + expected_result = np.sum(self.expected_results) / self.batch_size + self.assertAllClose(result, expected_result, atol=1e-3) + + def test_weighted(self): + self.setup() + k_obj = metrics.KLDivergence() + self.evaluate(tf.compat.v1.variables_initializer(k_obj.variables)) + + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + result = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight) + + sample_weight = 
np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape( + (2, 3) + ) + expected_result = np.multiply(self.expected_results, sample_weight) + expected_result = np.sum(expected_result) / (1.2 + 3.4) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class BinaryCrossentropyTest(tf.test.TestCase): + def test_config(self): + bce_obj = metrics.BinaryCrossentropy( + name="bce", dtype=tf.int32, label_smoothing=0.2 + ) + self.assertEqual(bce_obj.name, "bce") + self.assertEqual(bce_obj._dtype, tf.int32) + + old_config = bce_obj.get_config() + self.assertAllClose(old_config["label_smoothing"], 0.2, 1e-3) + + # Check save and restore config + bce_obj2 = metrics.BinaryCrossentropy.from_config(old_config) + self.assertEqual(bce_obj2.name, "bce") + self.assertEqual(bce_obj2._dtype, tf.int32) + new_config = bce_obj2.get_config() + self.assertDictEqual(old_config, new_config) + + def test_unweighted(self): + bce_obj = metrics.BinaryCrossentropy() + self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) + result = bce_obj(y_true, y_pred) + + # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 + # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] + + # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) + # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), + # -log(Y_MAX + EPSILON), -log(1)] + # = [(0 + 15.33) / 2, (0 + 0) / 2] + # Reduced metric = 7.665 / 2 + + self.assertAllClose(self.evaluate(result), 3.833, atol=1e-3) + + def test_unweighted_with_logits(self): + bce_obj = metrics.BinaryCrossentropy(from_logits=True) + self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) + + y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) + y_pred = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) + result = bce_obj(y_true, y_pred) + + # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # = [((100 - 100 * 1 + log(1 + exp(-100))) + + # (0 + 100 * 0 + log(1 + exp(-100))) + + # (100 - 100 * 1 + log(1 + exp(-100))), + # ((100 - 100 * 0 + log(1 + exp(-100))) + + # (100 - 100 * 1 + log(1 + exp(-100))) + + # (0 + 100 * 1 + log(1 + exp(-100))))] + # = [(0 + 0 + 0) / 3, 200 / 3] + # Reduced metric = (0 + 66.666) / 2 + + self.assertAllClose(self.evaluate(result), 33.333, atol=1e-3) + + def test_weighted(self): + bce_obj = metrics.BinaryCrossentropy() + self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) + y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2]) + y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2]) + sample_weight = tf.constant([1.5, 2.0]) + result = bce_obj(y_true, y_pred, sample_weight=sample_weight) + + # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 + # y` = clip_ops.clip_by_value(output, EPSILON, 1. 
- EPSILON) + # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] + + # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) + # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), + # -log(Y_MAX + EPSILON), -log(1)] + # = [(0 + 15.33) / 2, (0 + 0) / 2] + # Weighted metric = [7.665 * 1.5, 0] + # Reduced metric = 7.665 * 1.5 / (1.5 + 2) + + self.assertAllClose(self.evaluate(result), 3.285, atol=1e-3) + + def test_weighted_from_logits(self): + bce_obj = metrics.BinaryCrossentropy(from_logits=True) + self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) + y_true = tf.constant([[1, 0, 1], [0, 1, 1]]) + y_pred = tf.constant([[100.0, -100.0, 100.0], [100.0, 100.0, -100.0]]) + sample_weight = tf.constant([2.0, 2.5]) + result = bce_obj(y_true, y_pred, sample_weight=sample_weight) + + # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # = [(0 + 0 + 0) / 3, 200 / 3] + # Weighted metric = [0, 66.666 * 2.5] + # Reduced metric = 66.666 * 2.5 / (2 + 2.5) + + self.assertAllClose(self.evaluate(result), 37.037, atol=1e-3) + + def test_label_smoothing(self): + logits = tf.constant(((100.0, -100.0, -100.0))) + y_true = tf.constant(((1, 0, 1))) + label_smoothing = 0.1 + # Metric: max(x, 0) - x * z + log(1 + exp(-abs(x))) + # (where x = logits and z = y_true) + # Label smoothing: z' = z * (1 - L) + 0.5L + # After label smoothing, label 1 becomes 1 - 0.5L + # label 0 becomes 0.5L + # Applying the above two fns to the given input: + # (100 - 100 * (1 - 0.5 L) + 0 + + # 0 + 100 * (0.5 L) + 0 + + # 0 + 100 * (1 - 0.5 L) + 0) * (1/3) + # = (100 + 50L) * 1/3 + bce_obj = metrics.BinaryCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + self.evaluate(tf.compat.v1.variables_initializer(bce_obj.variables)) + result = bce_obj(y_true, logits) + expected_value = (100.0 + 50.0 * label_smoothing) / 3.0 + self.assertAllClose(expected_value, self.evaluate(result), atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class CategoricalCrossentropyTest(tf.test.TestCase): + def test_config(self): + cce_obj = metrics.CategoricalCrossentropy( + name="cce", dtype=tf.int32, label_smoothing=0.2 + ) + self.assertEqual(cce_obj.name, "cce") + self.assertEqual(cce_obj._dtype, tf.int32) + + old_config = cce_obj.get_config() + self.assertAllClose(old_config["label_smoothing"], 0.2, 1e-3) + + # Check save and restore config + cce_obj2 = metrics.CategoricalCrossentropy.from_config(old_config) + self.assertEqual(cce_obj2.name, "cce") + self.assertEqual(cce_obj2._dtype, tf.int32) + new_config = cce_obj2.get_config() + self.assertDictEqual(old_config, new_config) + + def test_unweighted(self): + cce_obj = metrics.CategoricalCrossentropy() + self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) + + y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) + y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + result = cce_obj(y_true, y_pred) + + # EPSILON = 1e-7, y = y_true, y` = y_pred + # y` = clip_ops.clip_by_value(output, EPSILON, 1. 
- EPSILON) + # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + + # Metric = -sum(y * log(y'), axis = -1) + # = -((log 0.95), (log 0.1)) + # = [0.051, 2.302] + # Reduced metric = (0.051 + 2.302) / 2 + + self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3) + + def test_unweighted_from_logits(self): + cce_obj = metrics.CategoricalCrossentropy(from_logits=True) + self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) + + y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) + logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) + result = cce_obj(y_true, logits) + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # xent = -sum(labels * log(softmax), 1) + + # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]] + # sum(exp(logits), axis=-1) = [8106.802, 2986.394] + # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]] + # log(softmax) = [[-8.00045, -0.00045, -9.00045], + # [-7.00182, -0.00182, -7.00182]] + # labels * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]] + # xent = [0.00045, 7.00182] + # Reduced xent = (0.00045 + 7.00182) / 2 + + self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3) + + def test_weighted(self): + cce_obj = metrics.CategoricalCrossentropy() + self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) + + y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) + y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + sample_weight = tf.constant([1.5, 2.0]) + result = cce_obj(y_true, y_pred, sample_weight=sample_weight) + + # EPSILON = 1e-7, y = y_true, y` = y_pred + # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + + # Metric = -sum(y * log(y'), axis = -1) + # = -((log 0.95), (log 0.1)) + # = [0.051, 2.302] + # Weighted metric = [0.051 * 1.5, 2.302 * 2.] + # Reduced metric = (0.051 * 1.5 + 2.302 * 2.) 
/ 3.5 + + self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3) + + def test_weighted_from_logits(self): + cce_obj = metrics.CategoricalCrossentropy(from_logits=True) + self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) + + y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) + logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) + sample_weight = tf.constant([1.5, 2.0]) + result = cce_obj(y_true, logits, sample_weight=sample_weight) + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # xent = -sum(labels * log(softmax), 1) + # xent = [0.00045, 7.00182] + # weighted xent = [0.000675, 14.00364] + # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2) + + self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3) + + def test_label_smoothing(self): + y_true = np.asarray([[0, 1, 0], [0, 0, 1]]) + logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) + label_smoothing = 0.1 + + # Label smoothing: z' = z * (1 - L) + L/n, + # where L = label smoothing value and n = num classes + # Label value 1 becomes: 1 - L + L/n + # Label value 0 becomes: L/n + # y_true with label_smoothing = [[0.0333, 0.9333, 0.0333], + # [0.0333, 0.0333, 0.9333]] + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # xent = -sum(labels * log(softmax), 1) + # log(softmax) = [[-8.00045, -0.00045, -9.00045], + # [-7.00182, -0.00182, -7.00182]] + # labels * log(softmax) = [[-0.26641, -0.00042, -0.29971], + # [-0.23316, -0.00006, -6.53479]] + # xent = [0.56654, 6.76801] + # Reduced xent = (0.56654 + 6.76801) / 2 + + cce_obj = metrics.CategoricalCrossentropy( + from_logits=True, label_smoothing=label_smoothing + ) + self.evaluate(tf.compat.v1.variables_initializer(cce_obj.variables)) + loss = cce_obj(y_true, logits) + self.assertAllClose(self.evaluate(loss), 3.667, atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class SparseCategoricalCrossentropyTest(tf.test.TestCase): + def test_config(self): + scce_obj = metrics.SparseCategoricalCrossentropy( + name="scce", dtype=tf.int32 + ) + self.assertEqual(scce_obj.name, "scce") + self.assertEqual(scce_obj.dtype, tf.int32) + old_config = scce_obj.get_config() + self.assertDictEqual(old_config, json.loads(json.dumps(old_config))) + + # Check save and restore config + scce_obj2 = metrics.SparseCategoricalCrossentropy.from_config( + old_config + ) + self.assertEqual(scce_obj2.name, "scce") + self.assertEqual(scce_obj2.dtype, tf.int32) + new_config = scce_obj2.get_config() + self.assertDictEqual(old_config, new_config) + + def test_unweighted(self): + scce_obj = metrics.SparseCategoricalCrossentropy() + self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) + + y_true = np.asarray([1, 2]) + y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + result = scce_obj(y_true, y_pred) + + # EPSILON = 1e-7, y = y_true, y` = y_pred + # y` = clip_ops.clip_by_value(output, EPSILON, 1. 
- EPSILON) + # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + # logits = log(y`) = [[-2.9957, -0.0513, -16.1181], + # [-2.3026, -0.2231, -2.3026]] + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]] + # xent = -sum(y * log(softmax), 1) + + # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + # sum(exp(logits), axis=-1) = [1, 1] + # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + # log(softmax) = [[-2.9957, -0.0513, -16.1181], + # [-2.3026, -0.2231, -2.3026]] + # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] + # xent = [0.0513, 2.3026] + # Reduced xent = (0.0513 + 2.3026) / 2 + + self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3) + + def test_unweighted_ignore_class(self): + scce_obj = metrics.SparseCategoricalCrossentropy(ignore_class=-1) + self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) + + y_true = np.asarray([-1, 2]) + y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + result = scce_obj(y_true, y_pred) + + self.assertAllClose(self.evaluate(result), 2.3026, atol=1e-3) + + def test_unweighted_from_logits(self): + scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True) + self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) + + y_true = np.asarray([1, 2]) + logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) + result = scce_obj(y_true, logits) + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] + # xent = -sum(y_true * log(softmax), 1) + + # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]] + # sum(exp(logits), axis=-1) = [8106.802, 2986.394] + # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]] + # log(softmax) = [[-8.00045, -0.00045, -9.00045], + # [-7.00182, -0.00182, -7.00182]] + # y_true * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]] + # xent = [0.00045, 7.00182] + # Reduced xent = (0.00045 + 7.00182) / 2 + + self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3) + + def test_weighted(self): + scce_obj = metrics.SparseCategoricalCrossentropy() + self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) + + y_true = np.asarray([1, 2]) + y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + sample_weight = tf.constant([1.5, 2.0]) + result = scce_obj(y_true, y_pred, sample_weight=sample_weight) + + # EPSILON = 1e-7, y = y_true, y` = y_pred + # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + # logits = log(y`) = [[-2.9957, -0.0513, -16.1181], + # [-2.3026, -0.2231, -2.3026]] + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]] + # xent = -sum(y * log(softmax), 1) + + # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + # sum(exp(logits), axis=-1) = [1, 1] + # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + # log(softmax) = [[-2.9957, -0.0513, -16.1181], + # [-2.3026, -0.2231, -2.3026]] + # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] + # xent = [0.0513, 2.3026] + # Weighted xent = [0.051 * 1.5, 2.302 * 2.] + # Reduced xent = (0.051 * 1.5 + 2.302 * 2.) 
/ 3.5 + + self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3) + + def test_weighted_ignore_class(self): + scce_obj = metrics.SparseCategoricalCrossentropy(ignore_class=-1) + self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) + + y_true = np.asarray([1, 2, -1]) + y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1], [0.1, 0.8, 0.1]]) + sample_weight = tf.constant([1.5, 2.0, 1.5]) + result = scce_obj(y_true, y_pred, sample_weight=sample_weight) + + self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3) + + def test_weighted_from_logits(self): + scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True) + self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) + + y_true = np.asarray([1, 2]) + logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32) + sample_weight = tf.constant([1.5, 2.0]) + result = scce_obj(y_true, logits, sample_weight=sample_weight) + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] + # xent = -sum(y_true * log(softmax), 1) + # xent = [0.00045, 7.00182] + # weighted xent = [0.000675, 14.00364] + # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2) + + self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3) + + def test_axis(self): + scce_obj = metrics.SparseCategoricalCrossentropy(axis=0) + self.evaluate(tf.compat.v1.variables_initializer(scce_obj.variables)) + + y_true = np.asarray([1, 2]) + y_pred = np.asarray([[0.05, 0.1], [0.95, 0.8], [0, 0.1]]) + result = scce_obj(y_true, y_pred) + + # EPSILON = 1e-7, y = y_true, y` = y_pred + # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + # y` = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]] + # logits = log(y`) = [[-2.9957, -2.3026], + # [-0.0513, -0.2231], + # [-16.1181, -2.3026]] + + # softmax = exp(logits) / sum(exp(logits), axis=-1) + # y = one_hot(y) = [[0, 0], [1, 0], [0, 1]] + # xent = -sum(y * log(softmax), 1) + + # exp(logits) = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]] + # sum(exp(logits)) = [1, 1] + # softmax = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]] + # log(softmax) = [[-2.9957, -2.3026], + # [-0.0513, -0.2231], + # [-16.1181, -2.3026]] + # y * log(softmax) = [[0, 0], [-0.0513, 0], [0, -2.3026]] + # xent = [0.0513, 2.3026] + # Reduced xent = (0.0513 + 2.3026) / 2 + + self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3) + + +class BinaryTruePositives(metrics.Metric): + def __init__(self, name="binary_true_positives", **kwargs): + super().__init__(name=name, **kwargs) + self.true_positives = self.add_weight(name="tp", initializer="zeros") + + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.cast(y_true, tf.bool) + y_pred = tf.cast(y_pred, tf.bool) + + values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True)) + values = tf.cast(values, self.dtype) + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, dtype=self.dtype) + sample_weight = tf.__internal__.ops.broadcast_weights( + sample_weight, values + ) + values = tf.multiply(values, sample_weight) + self.true_positives.assign_add(tf.reduce_sum(values)) + + def result(self): + return self.true_positives + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/py_metric.py b/keras/metrics/py_metric.py new file mode 100644 index 000000000000..e0718203119f --- /dev/null +++ b/keras/metrics/py_metric.py @@ -0,0 +1,191 @@ +# Copyright 2023 The Keras Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for Python-based metrics."""
+
+import types
+
+import tensorflow.compat.v2 as tf
+from tensorflow.python.util.tf_export import keras_export
+
+from keras.metrics import base_metric
+
+
+@keras_export("keras.metrics.experimental.PyMetric", v1=[])
+class PyMetric(base_metric.Metric):
+    """Metric which runs in Python, compiled outside of the TensorFlow graph.
+
+    Args:
+      name: (Optional) string name of the PyMetric instance.
+      dtype: (Optional) data type of the PyMetric result.
+      **kwargs: Additional layer keyword arguments.
+
+    Usage of `PyMetric` is generally identical to `keras.metrics.Metric`.
+    It can be used in isolation, or in tandem with the `compile()` API. For
+    more information about the usage of `PyMetric`, see
+    `keras.metrics.Metric`.
+
+    Unlike regular metrics, `PyMetric` instances are outside-compiled
+    with respect to the TensorFlow graph during training or evaluation.
+    They have access to the same inputs as a standard in-graph metric, but
+    they run in a Python interpreter on the host CPU. Any data stored in a
+    `PyMetric` is located on the main memory of the host CPU, and any
+    TensorFlow ops used in a PyMetric are run eagerly on the host CPU.
+
+    As a result, `PyMetric` instances are generally not as performant
+    as in-graph metrics, and should only be used in cases where computing
+    the metric inside of the TensorFlow graph is either impossible
+    or prohibitively expensive.
+
+    **Note:** Due to the use of `tf.py_function`, PyMetrics
+    are incompatible with XLA and therefore TPUs.
+
+    Methods to be implemented by subclasses:
+
+    * `update_state()`: Handles updates to internal state variables
+    * `result()`: Computes and returns a scalar value for the metric from the
+      state variables.
+    * `reset_state()`: Resets all of the state variables of the metric.
+
+    This subclass implementation is similar to that of `keras.metrics.Metric`,
+    with three notable differences:
+
+    * Inputs to `update_state()` in a `PyMetric` are eager tensors, and both
+      `update_state()` and `result()` run outside of the TensorFlow graph,
+      executing any TensorFlow ops eagerly.
+    * `reset_state()` is also called at initialization time to initialize the
+      Python state of the metric.
+    * `result()` can only return a single scalar. It does not support returning
+      a dictionary of results like `keras.metrics.Metric`.
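+
+    As with any other metric, a `PyMetric` instance can be attached to a
+    model through `compile()`; a minimal sketch, assuming an existing
+    compiled-model workflow and using the `JaccardScore` subclass shown
+    below:
+
+    ```python
+    model.compile(optimizer='sgd', loss='mse',
+                  metrics=[JaccardScore()])
+    ```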
+ + Example subclass implementation using sklearn's Jaccard Score: + + ```python + from sklearn.metrics import jaccard_score + import tensorflow as tf + + class JaccardScore(tf.keras.metrics.experimental.PyMetric): + + def __init__(self, name='jaccard_score', **kwargs): + super().__init__(name=name, **kwargs) + + def update_state(self, y_true, y_pred, sample_weight=None): + self.jaccard_sum += jaccard_score(y_pred, y_true, average="macro") + self.count += 1 + + def reset_state(self): + self.jaccard_sum = 0. + self.count = 0. + + def result(self): + return self.jaccard_sum / self.count + ``` + """ + + def __init__(self, name=None, dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype, **kwargs) + self.reset_state() + + def __new__(cls, *args, **kwargs): + obj = super(base_metric.Metric, cls).__new__(cls) + + # Wrap the update_state function in a py_function and scope it to /cpu:0 + obj_update_state = obj.update_state + + def update_state_on_cpu(y_true, y_pred, sample_weight=None): + with tf.device("/cpu:0"): + return obj_update_state(y_true, y_pred, sample_weight) + + obj.update_state_on_cpu = update_state_on_cpu + + def update_state_fn(self, y_true, y_pred, sample_weight=None): + eager_inputs = [y_true, y_pred] + if sample_weight is not None: + eager_inputs.append(sample_weight) + return tf.py_function( + func=self.update_state_on_cpu, inp=eager_inputs, Tout=[] + ) + + obj.update_state = types.MethodType(update_state_fn, obj) + + # Wrap the result function in a py_function and scope it to /cpu:0 + obj_result = obj.result + + def result_on_host_cpu(): + with tf.device("/cpu:0"): + return obj_result() + + obj.result_on_host_cpu = result_on_host_cpu + + def result_fn(self): + return tf.py_function( + self.result_on_host_cpu, inp=[], Tout=obj.dtype + ) + + obj.result = types.MethodType(result_fn, obj) + + return obj + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates statistics for the metric. + + **Note:** This function is executed outside of the TensorFlow graph + on the CPU host. + + This means: + + a) Inputs are eager tensors. + b) Any TensorFlow ops run in this method are run eagerly. + c) Any Tensors created are allocated to the CPU's main memory. + + Args: + y_true: Target output + y_pred: Predicted output + sample_weight: (Optional) weights for the individual samples in + `y_true` and `y_pred` + """ + raise NotImplementedError("Subclasses should implement `update_state`") + + def merge_state(self, metrics): + """Merges the state from one or more metrics. + + `PyMetric` instances that intend to support merging state must override + this method, as the default implementation + in `keras.metrics.Metric` does not apply to `PyMetric`. + """ + raise NotImplementedError("Subclasses should implement `merge_state`") + + def reset_state(self): + """Resets all of the metric state variables. + + This function is called between epochs when a metric is evaluated during + training. It's also called when the metric is initialized. + """ + raise NotImplementedError("Subclasses should implement `reset_state`") + + def result(self): + """Computes and returns the scalar metric value. + + **Note:** This function is executed outside of the TensorFlow graph + on the CPU host. This means any TensorFlow ops run in this method + are run eagerly. + + Result computation is an idempotent operation that simply calculates the + metric value using the state variables. + + Returns: + A Python scalar. 
+ """ + raise NotImplementedError("Subclasses should implement `result`") diff --git a/keras/metrics/py_metric_test.py b/keras/metrics/py_metric_test.py new file mode 100644 index 000000000000..d8f00d3a5109 --- /dev/null +++ b/keras/metrics/py_metric_test.py @@ -0,0 +1,145 @@ +# Copyright 2023 The Keras Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras PyMetric classes.""" + + +import tensorflow.compat.v2 as tf + +from keras import metrics +from keras.testing_infra import test_combinations + + +class KTrimmedMean(metrics.PyMetric): + """An example PyMetric which computes the trimmed mean of `y_pred`.""" + + def __init__(self, k=0.1, name="k_trimmed_mean", **kwargs): + super().__init__(name=name, **kwargs) + self.k = k + + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = y_true.numpy() + + if sample_weight is not None: + y_true *= sample_weight.numpy() + + # Insert y_pred into our values list (keeping the list sorted) + index = 0 + for i, element in enumerate(self.values): + if y_true > element: + index = i + break + self.values = self.values[:index] + [y_true] + self.values[index:] + + def reset_state(self): + self.values = [] + + def result(self): + k = int(self.k * len(self.values)) + return tf.reduce_mean(self.values[k:-k]) + + def get_config(self): + config = super().get_config() + config.update({"k": self.k}) + return config + + +class Mean(metrics.PyMetric): + """An example PyMetric which computes the mean of `y_pred`.""" + + def __init__(self, name="mean", **kwargs): + super().__init__(name=name, **kwargs) + + def update_state(self, y_true, y_pred, sample_weight=None): + self.values.append(y_true) + + def reset_state(self): + self.values = [] + + def result(self): + return tf.reduce_mean(tf.concat(self.values, axis=0)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class PyMetricsTest(tf.test.TestCase): + def test_config(self): + ktm_object = KTrimmedMean(name="ktm", k=0.2, dtype=tf.float16) + self.assertEqual(ktm_object.name, "ktm") + self.assertEqual(ktm_object.k, 0.2) + self.assertEqual(ktm_object.dtype, tf.float16) + + # Check save and restore config + ktm_object2 = KTrimmedMean.from_config(ktm_object.get_config()) + self.assertEqual(ktm_object2.name, "ktm") + self.assertEqual(ktm_object.k, 0.2) + self.assertEqual(ktm_object2.dtype, tf.float16) + + def test_unweighted(self): + ktm_object = KTrimmedMean(k=0.2) + + for y_true in [-100, -10, 1, 2, 3, 4, 5, 6, 14, 9001]: + self.evaluate( + ktm_object.update_state( + tf.constant(y_true, dtype=tf.float32), + y_pred=tf.constant(0, dtype=tf.float32), + ) + ) + + result = ktm_object.result() + self.assertEqual(3.5, self.evaluate(result)) + + def test_weighted(self): + ktm_object = KTrimmedMean(k=0.2) + + for y_true in [-100, -10, 1, 2, 3, 4, 5, 6, 14, 9001]: + self.evaluate( + ktm_object.update_state( + tf.constant(y_true, dtype=tf.float32), + 
y_pred=tf.constant(0, dtype=tf.float32), + sample_weight=tf.constant(2, dtype=tf.float32), + ) + ) + + result = ktm_object.result() + self.assertEqual(7, self.evaluate(result)) + + def test_state_stored_on_cpu_host(self): + with tf.device("/device:GPU:0"): + mean_obj = Mean() + + y_true_0 = tf.constant([0, 1, 2], dtype=tf.float32) + y_true_1 = tf.constant([3, 4], dtype=tf.float32) + self.evaluate( + mean_obj.update_state( + y_true=y_true_0, y_pred=tf.constant(0, dtype=tf.float32) + ) + ) + self.evaluate( + mean_obj.update_state( + y_true=y_true_1, y_pred=tf.constant(0, dtype=tf.float32) + ) + ) + + self.assertEqual(2, self.evaluate(mean_obj.result())) + + if not tf.executing_eagerly(): + self.assertEndsWith(y_true_0.device, "/device:GPU:0") + self.assertEndsWith(y_true_1.device, "/device:GPU:0") + + self.assertEndsWith(mean_obj.values[0].device, "/device:CPU:0") + self.assertEndsWith(mean_obj.values[1].device, "/device:CPU:0") + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/metrics/regression_metrics.py b/keras/metrics/regression_metrics.py new file mode 100644 index 000000000000..ccc4702f6039 --- /dev/null +++ b/keras/metrics/regression_metrics.py @@ -0,0 +1,626 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Regression metrics, e.g. MAE/MSE/etc.""" + +import warnings + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.dtensor import utils as dtensor_utils +from keras.losses import logcosh +from keras.losses import mean_absolute_error +from keras.losses import mean_absolute_percentage_error +from keras.losses import mean_squared_error +from keras.losses import mean_squared_logarithmic_error +from keras.metrics import base_metric +from keras.utils import losses_utils +from keras.utils import metrics_utils +from keras.utils.tf_utils import is_tensor_or_variable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@keras_export("keras.metrics.MeanRelativeError") +class MeanRelativeError(base_metric.Mean): + """Computes the mean relative error by normalizing with the given values. + + This metric creates two local variables, `total` and `count` that are used + to compute the mean relative error. This is weighted by `sample_weight`, and + it is ultimately returned as `mean_relative_error`: an idempotent operation + that simply divides `total` by `count`. + + If `sample_weight` is `None`, weights default to 1. + Use `sample_weight` of 0 to mask values. + + Args: + normalizer: The normalizer values with same shape as predictions. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. 
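+
+    Conceptually the result is `mean(abs(y_true - y_pred) / normalizer)`;
+    a NumPy equivalent for the unweighted case (a sketch, not the actual
+    implementation):
+
+    ```python
+    import numpy as np
+
+    def mean_relative_error(y_true, y_pred, normalizer):
+        # Element-wise relative errors, then an unweighted mean.
+        errors = np.abs(np.asarray(y_true) - np.asarray(y_pred))
+        return np.mean(errors / np.asarray(normalizer))
+    ```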
+ + Standalone usage: + + >>> m = tf.keras.metrics.MeanRelativeError(normalizer=[1, 3, 2, 3]) + >>> m.update_state([1, 3, 2, 3], [2, 4, 6, 8]) + + >>> # metric = mean(|y_pred - y_true| / normalizer) + >>> # = mean([1, 1, 4, 5] / [1, 3, 2, 3]) = mean([1, 1/3, 2, 5/3]) + >>> # = 5/4 = 1.25 + >>> m.result().numpy() + 1.25 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.MeanRelativeError(normalizer=[1, 3])]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, normalizer, name=None, dtype=None): + super().__init__(name=name, dtype=dtype) + normalizer = tf.cast(normalizer, self._dtype) + self.normalizer = normalizer + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + [ + y_pred, + y_true, + ], sample_weight = metrics_utils.ragged_assert_compatible_and_get_flat_values( # noqa: E501 + [y_pred, y_true], sample_weight + ) + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true + ) + + y_pred, self.normalizer = losses_utils.remove_squeezable_dimensions( + y_pred, self.normalizer + ) + y_pred.shape.assert_is_compatible_with(y_true.shape) + relative_errors = tf.math.divide_no_nan( + tf.abs(y_true - y_pred), self.normalizer + ) + + return super().update_state( + relative_errors, sample_weight=sample_weight + ) + + def get_config(self): + n = self.normalizer + config = { + "normalizer": backend.eval(n) if is_tensor_or_variable(n) else n + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@keras_export("keras.metrics.CosineSimilarity") +class CosineSimilarity(base_metric.MeanMetricWrapper): + """Computes the cosine similarity between the labels and predictions. + + `cosine similarity = (a . b) / ||a|| ||b||` + + See: [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity). + + This metric keeps the average cosine similarity between `predictions` and + `labels` over a stream of data. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + axis: (Optional) The dimension along which the cosine + similarity is computed. Defaults to `-1`. + + Standalone usage: + + >>> # l2_norm(y_true) = [[0., 1.], [1./1.414, 1./1.414]] + >>> # l2_norm(y_pred) = [[1., 0.], [1./1.414, 1./1.414]] + >>> # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] + >>> # result = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) + >>> # = ((0. + 0.) + (0.5 + 0.5)) / 2 + >>> m = tf.keras.metrics.CosineSimilarity(axis=1) + >>> m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]]) + >>> m.result().numpy() + 0.49999997 + + >>> m.reset_state() + >>> m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]], + ... 
sample_weight=[0.3, 0.7]) + >>> m.result().numpy() + 0.6999999 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.CosineSimilarity(axis=1)]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="cosine_similarity", dtype=None, axis=-1): + super().__init__(cosine_similarity, name, dtype=dtype, axis=axis) + + +@keras_export("keras.metrics.MeanAbsoluteError") +class MeanAbsoluteError(base_metric.MeanMetricWrapper): + """Computes the mean absolute error between the labels and predictions. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.MeanAbsoluteError() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.25 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.5 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.MeanAbsoluteError()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="mean_absolute_error", dtype=None): + super().__init__(mean_absolute_error, name, dtype=dtype) + + +@keras_export("keras.metrics.MeanAbsolutePercentageError") +class MeanAbsolutePercentageError(base_metric.MeanMetricWrapper): + """Computes the mean absolute percentage error between `y_true` and + `y_pred`. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.MeanAbsolutePercentageError() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 250000000.0 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 500000000.0 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.MeanAbsolutePercentageError()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="mean_absolute_percentage_error", dtype=None): + super().__init__(mean_absolute_percentage_error, name, dtype=dtype) + + +@keras_export("keras.metrics.MeanSquaredError") +class MeanSquaredError(base_metric.MeanMetricWrapper): + """Computes the mean squared error between `y_true` and `y_pred`. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.MeanSquaredError() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.25 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.5 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.MeanSquaredError()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="mean_squared_error", dtype=None): + super().__init__(mean_squared_error, name, dtype=dtype) + + +@keras_export("keras.metrics.MeanSquaredLogarithmicError") +class MeanSquaredLogarithmicError(base_metric.MeanMetricWrapper): + """Computes the mean squared logarithmic error between `y_true` and + `y_pred`. + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. 
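+
+    The underlying computation is
+    `mean(square(log(1 + y_true) - log(1 + y_pred)))`; a NumPy sketch of
+    the unweighted case (ignoring the epsilon clipping that the real
+    implementation applies):
+
+    ```python
+    import numpy as np
+
+    def msle(y_true, y_pred):
+        # log1p(x) == log(1 + x); mean over all elements when unweighted.
+        return np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2)
+    ```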
+ + Standalone usage: + + >>> m = tf.keras.metrics.MeanSquaredLogarithmicError() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.12011322 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.24022643 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="mean_squared_logarithmic_error", dtype=None): + super().__init__(mean_squared_logarithmic_error, name, dtype=dtype) + + +@keras_export("keras.metrics.RootMeanSquaredError") +class RootMeanSquaredError(base_metric.Mean): + """Computes root mean squared error metric between `y_true` and `y_pred`. + + Standalone usage: + + >>> m = tf.keras.metrics.RootMeanSquaredError() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.70710677 + + Usage with `compile()` API: + + ```python + model.compile( + optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.RootMeanSquaredError()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="root_mean_squared_error", dtype=None): + super().__init__(name, dtype=dtype) + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates root mean squared error statistics. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Can + be a `Tensor` whose rank is either 0, or the same rank as `y_true`, + and must be broadcastable to `y_true`. Defaults to `1`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true + ) + error_sq = tf.math.squared_difference(y_pred, y_true) + return super().update_state(error_sq, sample_weight=sample_weight) + + def result(self): + return tf.sqrt(tf.math.divide_no_nan(self.total, self.count)) + + +@keras_export("keras.metrics.LogCoshError") +class LogCoshError(base_metric.MeanMetricWrapper): + """Computes the logarithm of the hyperbolic cosine of the prediction error. + + `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - + y_true) + + Args: + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + + Standalone usage: + + >>> m = tf.keras.metrics.LogCoshError() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.10844523 + + >>> m.reset_state() + >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.21689045 + + Usage with `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss='mse', + metrics=[tf.keras.metrics.LogCoshError()]) + ``` + """ + + @dtensor_utils.inject_mesh + def __init__(self, name="logcosh", dtype=None): + super().__init__(logcosh, name, dtype=dtype) + + +# Adapted from TF-Addons implementation (RSquare class). +@keras_export("keras.metrics.R2Score") +class R2Score(base_metric.Metric): + """Computes R2 score. + + This is also called the + [coefficient of + determination](https://en.wikipedia.org/wiki/Coefficient_of_determination). 
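+
+    In its standard (unweighted, unadjusted) form the score is
+    `R^2 = 1 - sum((y_true - y_pred)**2) / sum((y_true - mean(y_true))**2)`.
+    A NumPy sketch for a single output column:
+
+    ```python
+    import numpy as np
+
+    def r2(y_true, y_pred):
+        ss_res = np.sum((y_true - y_pred) ** 2)  # residual sum of squares
+        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # total variation
+        return 1.0 - ss_res / ss_tot
+    ```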
+
+    It indicates how close the fitted regression line
+    is to ground-truth data.
+
+    - The highest score possible is 1.0. It indicates that the predictors
+      perfectly account for variation in the target.
+    - A score of 0.0 indicates that the predictors do not
+      account for variation in the target.
+    - It can also be negative if the model is worse than random.
+
+    This metric can also compute the "Adjusted R2" score.
+
+    Args:
+      class_aggregation: Specifies how to aggregate scores corresponding to
+        different output classes (or target dimensions),
+        i.e. different dimensions on the last axis of the predictions.
+        Equivalent to the `multioutput` argument in Scikit-Learn.
+        Should be one of
+        `None` (no aggregation), `"uniform_average"`,
+        `"variance_weighted_average"`.
+      num_regressors: Number of independent regressors used
+        (for the "Adjusted R2" score). `0` is the standard R2 score.
+        Defaults to `0`.
+      name: Optional. string name of the metric instance.
+      dtype: Optional. data type of the metric result.
+
+    Example:
+
+    >>> y_true = np.array([[1], [4], [3]], dtype=np.float32)
+    >>> y_pred = np.array([[2], [4], [4]], dtype=np.float32)
+    >>> metric = tf.keras.metrics.R2Score()
+    >>> metric.update_state(y_true, y_pred)
+    >>> result = metric.result()
+    >>> result.numpy()
+    0.57142854
+    """
+
+    @dtensor_utils.inject_mesh
+    def __init__(
+        self,
+        class_aggregation="uniform_average",
+        num_regressors=0,
+        name="r2_score",
+        dtype=None,
+    ):
+        super().__init__(name=name, dtype=dtype)
+
+        valid_class_aggregation_values = (
+            None,
+            "uniform_average",
+            "variance_weighted_average",
+        )
+        if class_aggregation not in valid_class_aggregation_values:
+            raise ValueError(
+                "Invalid value for argument `class_aggregation`. Expected "
+                f"one of {valid_class_aggregation_values}. "
+                f"Received: class_aggregation={class_aggregation}"
+            )
+        if num_regressors < 0:
+            raise ValueError(
+                "Invalid value for argument `num_regressors`. "
+                "Expected a value >= 0. "
+                f"Received: num_regressors={num_regressors}"
+            )
+        self.class_aggregation = class_aggregation
+        self.num_regressors = num_regressors
+        self.num_samples = self.add_weight(name="num_samples", dtype="int32")
+        self.built = False
+
+    def build(self, y_true_shape, y_pred_shape):
+        if len(y_pred_shape) != 2 or len(y_true_shape) != 2:
+            raise ValueError(
+                "R2Score expects 2D inputs with shape "
+                "(batch_size, output_dim). Received input "
+                f"shapes: y_pred.shape={y_pred_shape} and "
+                f"y_true.shape={y_true_shape}."
+            )
+        if y_pred_shape[-1] is None or y_true_shape[-1] is None:
+            raise ValueError(
+                "R2Score expects 2D inputs with shape "
+                "(batch_size, output_dim), with output_dim fully "
+                "defined (not None). Received input "
+                f"shapes: y_pred.shape={y_pred_shape} and "
+                f"y_true.shape={y_true_shape}."
+            )
+        num_classes = y_pred_shape[-1]
+        self.squared_sum = self.add_weight(
+            name="squared_sum",
+            shape=[num_classes],
+            initializer="zeros",
+        )
+        self.sum = self.add_weight(
+            name="sum",
+            shape=[num_classes],
+            initializer="zeros",
+        )
+        self.total_mse = self.add_weight(
+            name="residual",
+            shape=[num_classes],
+            initializer="zeros",
+        )
+        self.count = self.add_weight(
+            name="count",
+            shape=[num_classes],
+            initializer="zeros",
+        )
+        self.built = True
+
+    def update_state(self, y_true, y_pred, sample_weight=None):
+        y_true = tf.convert_to_tensor(y_true, dtype=self.dtype)
+        y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
+        if not self.built:
+            self.build(y_true.shape, y_pred.shape)
+
+        if sample_weight is None:
+            sample_weight = 1
+
+        sample_weight = tf.convert_to_tensor(sample_weight, dtype=self.dtype)
+        if sample_weight.shape.rank == 1:
+            # Make sure there's a features dimension
+            sample_weight = tf.expand_dims(sample_weight, axis=1)
+        sample_weight = tf.__internal__.ops.broadcast_weights(
+            weights=sample_weight, values=y_true
+        )
+
+        weighted_y_true = y_true * sample_weight
+        self.sum.assign_add(tf.reduce_sum(weighted_y_true, axis=0))
+        self.squared_sum.assign_add(
+            tf.reduce_sum(y_true * weighted_y_true, axis=0)
+        )
+        self.total_mse.assign_add(
+            tf.reduce_sum((y_true - y_pred) ** 2 * sample_weight, axis=0)
+        )
+        self.count.assign_add(tf.reduce_sum(sample_weight, axis=0))
+        self.num_samples.assign_add(tf.size(y_true))
+
+    def result(self):
+        mean = self.sum / self.count
+        total = self.squared_sum - self.sum * mean
+        raw_scores = 1 - (self.total_mse / total)
+        raw_scores = tf.where(tf.math.is_inf(raw_scores), 0.0, raw_scores)
+
+        if self.class_aggregation == "uniform_average":
+            r2_score = tf.reduce_mean(raw_scores)
+        elif self.class_aggregation == "variance_weighted_average":
+            weighted_sum = tf.reduce_sum(total * raw_scores)
+            sum_of_weights = tf.reduce_sum(total)
+            r2_score = weighted_sum / sum_of_weights
+        else:
+            r2_score = raw_scores
+
+        if self.num_regressors != 0:
+            if self.num_regressors > self.num_samples - 1:
+                warnings.warn(
+                    "More independent predictors than datapoints "
+                    "in adjusted R2 score. Falling back to standard R2 score.",
+                    stacklevel=2,
+                )
+            elif self.num_regressors == self.num_samples - 1:
+                warnings.warn(
+                    "Division by zero in Adjusted R2 score. "
+                    "Falling back to standard R2 score.",
+                    stacklevel=2,
+                )
+            else:
+                n = tf.cast(self.num_samples, dtype=tf.float32)
+                p = tf.cast(self.num_regressors, dtype=tf.float32)
+                num = tf.multiply(
+                    tf.subtract(1.0, r2_score), tf.subtract(n, 1.0)
+                )
+                den = tf.subtract(tf.subtract(n, p), 1.0)
+                r2_score = tf.subtract(1.0, tf.divide(num, den))
+        return r2_score
+
+    def reset_state(self):
+        for v in self.variables:
+            v.assign(tf.zeros(v.shape, dtype=v.dtype))
+
+    def get_config(self):
+        config = {
+            "class_aggregation": self.class_aggregation,
+            "num_regressors": self.num_regressors,
+        }
+        base_config = super().get_config()
+        return {**base_config, **config}
+
+
+def cosine_similarity(y_true, y_pred, axis=-1):
+    """Computes the cosine similarity between labels and predictions.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The prediction values.
+      axis: (Optional) The dimension along which the cosine
+        similarity is computed. Defaults to `-1`.
+
+    Returns:
+      Cosine similarity value.
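+
+    For instance (illustrative values): orthogonal rows score 0 and
+    identical rows score 1:
+
+    ```python
+    cosine_similarity([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]])
+    # -> approximately [0., 1.]
+    ```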
+ """ + y_true = tf.linalg.l2_normalize(y_true, axis=axis) + y_pred = tf.linalg.l2_normalize(y_pred, axis=axis) + return tf.reduce_sum(y_true * y_pred, axis=axis) diff --git a/keras/metrics/regression_metrics_test.py b/keras/metrics/regression_metrics_test.py new file mode 100644 index 000000000000..57b1a8191d35 --- /dev/null +++ b/keras/metrics/regression_metrics_test.py @@ -0,0 +1,506 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras metrics.""" + +import math + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras import Input +from keras import metrics +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class CosineSimilarityTest(tf.test.TestCase): + def l2_norm(self, x, axis): + epsilon = 1e-12 + square_sum = np.sum(np.square(x), axis=axis, keepdims=True) + x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon)) + return np.multiply(x, x_inv_norm) + + def setup(self, axis=1): + self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32) + self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32) + + y_true = self.l2_norm(self.np_y_true, axis) + y_pred = self.l2_norm(self.np_y_pred, axis) + self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,)) + + self.y_true = tf.constant(self.np_y_true) + self.y_pred = tf.constant(self.np_y_pred) + + def test_config(self): + cosine_obj = metrics.CosineSimilarity( + axis=2, name="my_cos", dtype=tf.int32 + ) + self.assertEqual(cosine_obj.name, "my_cos") + self.assertEqual(cosine_obj._dtype, tf.int32) + + # Check save and restore config + cosine_obj2 = metrics.CosineSimilarity.from_config( + cosine_obj.get_config() + ) + self.assertEqual(cosine_obj2.name, "my_cos") + self.assertEqual(cosine_obj2._dtype, tf.int32) + + def test_unweighted(self): + self.setup() + cosine_obj = metrics.CosineSimilarity() + self.evaluate(tf.compat.v1.variables_initializer(cosine_obj.variables)) + loss = cosine_obj(self.y_true, self.y_pred) + expected_loss = np.mean(self.expected_loss) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_weighted(self): + self.setup() + cosine_obj = metrics.CosineSimilarity() + self.evaluate(tf.compat.v1.variables_initializer(cosine_obj.variables)) + sample_weight = np.asarray([1.2, 3.4]) + loss = cosine_obj( + self.y_true, self.y_pred, sample_weight=tf.constant(sample_weight) + ) + expected_loss = np.sum(self.expected_loss * sample_weight) / np.sum( + sample_weight + ) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + def test_axis(self): + self.setup(axis=1) + cosine_obj = metrics.CosineSimilarity(axis=1) + self.evaluate(tf.compat.v1.variables_initializer(cosine_obj.variables)) + loss = cosine_obj(self.y_true, 
self.y_pred) + expected_loss = np.mean(self.expected_loss) + self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MeanAbsoluteErrorTest(tf.test.TestCase): + def test_config(self): + mae_obj = metrics.MeanAbsoluteError(name="my_mae", dtype=tf.int32) + self.assertEqual(mae_obj.name, "my_mae") + self.assertEqual(mae_obj._dtype, tf.int32) + + # Check save and restore config + mae_obj2 = metrics.MeanAbsoluteError.from_config(mae_obj.get_config()) + self.assertEqual(mae_obj2.name, "my_mae") + self.assertEqual(mae_obj2._dtype, tf.int32) + + def test_unweighted(self): + mae_obj = metrics.MeanAbsoluteError() + self.evaluate(tf.compat.v1.variables_initializer(mae_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = mae_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = mae_obj.result() + self.assertAllClose(0.5, result, atol=1e-5) + + def test_weighted(self): + mae_obj = metrics.MeanAbsoluteError() + self.evaluate(tf.compat.v1.variables_initializer(mae_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = mae_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MeanAbsolutePercentageErrorTest(tf.test.TestCase): + def test_config(self): + mape_obj = metrics.MeanAbsolutePercentageError( + name="my_mape", dtype=tf.int32 + ) + self.assertEqual(mape_obj.name, "my_mape") + self.assertEqual(mape_obj._dtype, tf.int32) + + # Check save and restore config + mape_obj2 = metrics.MeanAbsolutePercentageError.from_config( + mape_obj.get_config() + ) + self.assertEqual(mape_obj2.name, "my_mape") + self.assertEqual(mape_obj2._dtype, tf.int32) + + def test_unweighted(self): + mape_obj = metrics.MeanAbsolutePercentageError() + self.evaluate(tf.compat.v1.variables_initializer(mape_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = mape_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = mape_obj.result() + self.assertAllClose(35e7, result, atol=1e-5) + + def test_weighted(self): + mape_obj = metrics.MeanAbsolutePercentageError() + self.evaluate(tf.compat.v1.variables_initializer(mape_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = mape_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(40e7, self.evaluate(result), atol=1e-5) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MeanSquaredErrorTest(tf.test.TestCase): + def test_config(self): + mse_obj = metrics.MeanSquaredError(name="my_mse", dtype=tf.int32) + self.assertEqual(mse_obj.name, "my_mse") + 
self.assertEqual(mse_obj._dtype, tf.int32) + + # Check save and restore config + mse_obj2 = metrics.MeanSquaredError.from_config(mse_obj.get_config()) + self.assertEqual(mse_obj2.name, "my_mse") + self.assertEqual(mse_obj2._dtype, tf.int32) + + def test_unweighted(self): + mse_obj = metrics.MeanSquaredError() + self.evaluate(tf.compat.v1.variables_initializer(mse_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = mse_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = mse_obj.result() + self.assertAllClose(0.5, result, atol=1e-5) + + def test_weighted(self): + mse_obj = metrics.MeanSquaredError() + self.evaluate(tf.compat.v1.variables_initializer(mse_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = mse_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MeanSquaredLogarithmicErrorTest(tf.test.TestCase): + def test_config(self): + msle_obj = metrics.MeanSquaredLogarithmicError( + name="my_msle", dtype=tf.int32 + ) + self.assertEqual(msle_obj.name, "my_msle") + self.assertEqual(msle_obj._dtype, tf.int32) + + # Check save and restore config + msle_obj2 = metrics.MeanSquaredLogarithmicError.from_config( + msle_obj.get_config() + ) + self.assertEqual(msle_obj2.name, "my_msle") + self.assertEqual(msle_obj2._dtype, tf.int32) + + def test_unweighted(self): + msle_obj = metrics.MeanSquaredLogarithmicError() + self.evaluate(tf.compat.v1.variables_initializer(msle_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + + update_op = msle_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = msle_obj.result() + self.assertAllClose(0.24022, result, atol=1e-5) + + def test_weighted(self): + msle_obj = metrics.MeanSquaredLogarithmicError() + self.evaluate(tf.compat.v1.variables_initializer(msle_obj.variables)) + y_true = tf.constant( + ((0, 1, 0, 1, 0), (0, 0, 1, 1, 1), (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)) + ) + y_pred = tf.constant( + ((0, 0, 1, 1, 0), (1, 1, 1, 1, 1), (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)) + ) + sample_weight = tf.constant((1.0, 1.5, 2.0, 2.5)) + result = msle_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(0.26082, self.evaluate(result), atol=1e-5) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class RootMeanSquaredErrorTest(tf.test.TestCase): + def test_config(self): + rmse_obj = metrics.RootMeanSquaredError(name="rmse", dtype=tf.int32) + self.assertEqual(rmse_obj.name, "rmse") + self.assertEqual(rmse_obj._dtype, tf.int32) + + rmse_obj2 = metrics.RootMeanSquaredError.from_config( + rmse_obj.get_config() + ) + self.assertEqual(rmse_obj2.name, "rmse") + self.assertEqual(rmse_obj2._dtype, tf.int32) + + def test_unweighted(self): + rmse_obj = metrics.RootMeanSquaredError() + self.evaluate(tf.compat.v1.variables_initializer(rmse_obj.variables)) + y_true = tf.constant((2, 4, 6)) + y_pred = 
tf.constant((1, 3, 2)) + + update_op = rmse_obj.update_state(y_true, y_pred) + self.evaluate(update_op) + result = rmse_obj.result() + # error = [-1, -1, -4], square(error) = [1, 1, 16], mean = 18/3 = 6 + self.assertAllClose(math.sqrt(6), result, atol=1e-3) + + def test_weighted(self): + rmse_obj = metrics.RootMeanSquaredError() + self.evaluate(tf.compat.v1.variables_initializer(rmse_obj.variables)) + y_true = tf.constant((2, 4, 6, 8)) + y_pred = tf.constant((1, 3, 2, 3)) + sample_weight = tf.constant((0, 1, 0, 1)) + result = rmse_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(math.sqrt(13), self.evaluate(result), atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class LogCoshErrorTest(tf.test.TestCase): + def setup(self): + y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3)) + y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3)) + + self.batch_size = 6 + error = y_pred - y_true + self.expected_results = np.log((np.exp(error) + np.exp(-error)) / 2) + + self.y_pred = tf.constant(y_pred, dtype=tf.float32) + self.y_true = tf.constant(y_true) + + def test_config(self): + logcosh_obj = metrics.LogCoshError(name="logcosh", dtype=tf.int32) + self.assertEqual(logcosh_obj.name, "logcosh") + self.assertEqual(logcosh_obj._dtype, tf.int32) + + def test_unweighted(self): + self.setup() + logcosh_obj = metrics.LogCoshError() + self.evaluate(tf.compat.v1.variables_initializer(logcosh_obj.variables)) + + update_op = logcosh_obj.update_state(self.y_true, self.y_pred) + self.evaluate(update_op) + result = logcosh_obj.result() + expected_result = np.sum(self.expected_results) / self.batch_size + self.assertAllClose(result, expected_result, atol=1e-3) + + def test_weighted(self): + self.setup() + logcosh_obj = metrics.LogCoshError() + self.evaluate(tf.compat.v1.variables_initializer(logcosh_obj.variables)) + sample_weight = tf.constant([1.2, 3.4], shape=(2, 1)) + result = logcosh_obj( + self.y_true, self.y_pred, sample_weight=sample_weight + ) + + sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape( + (2, 3) + ) + expected_result = np.multiply(self.expected_results, sample_weight) + expected_result = np.sum(expected_result) / np.sum(sample_weight) + self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MeanRelativeErrorTest(tf.test.TestCase): + def test_config(self): + normalizer = tf.constant([1, 3], dtype=tf.float32) + mre_obj = metrics.MeanRelativeError(normalizer=normalizer, name="mre") + self.assertEqual(mre_obj.name, "mre") + self.assertArrayNear(self.evaluate(mre_obj.normalizer), [1, 3], 1e-1) + + mre_obj2 = metrics.MeanRelativeError.from_config(mre_obj.get_config()) + self.assertEqual(mre_obj2.name, "mre") + self.assertArrayNear(self.evaluate(mre_obj2.normalizer), [1, 3], 1e-1) + + def test_unweighted(self): + np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32) + np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32) + expected_error = np.mean( + np.divide(np.absolute(np_y_pred - np_y_true), np_y_true) + ) + + y_pred = tf.constant(np_y_pred, shape=(1, 4), dtype=tf.float32) + y_true = tf.constant(np_y_true, shape=(1, 4)) + + mre_obj = metrics.MeanRelativeError(normalizer=y_true) + self.evaluate(tf.compat.v1.variables_initializer(mre_obj.variables)) + + result = mre_obj(y_true, y_pred) + self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3) + + def test_weighted(self): + np_y_pred = 
np.asarray([2, 4, 6, 8], dtype=np.float32) + np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32) + sample_weight = np.asarray([0.2, 0.3, 0.5, 0], dtype=np.float32) + rel_errors = np.divide(np.absolute(np_y_pred - np_y_true), np_y_true) + expected_error = np.sum(rel_errors * sample_weight) + + y_pred = tf.constant(np_y_pred, dtype=tf.float32) + y_true = tf.constant(np_y_true) + + mre_obj = metrics.MeanRelativeError(normalizer=y_true) + self.evaluate(tf.compat.v1.variables_initializer(mre_obj.variables)) + + result = mre_obj( + y_true, y_pred, sample_weight=tf.constant(sample_weight) + ) + self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3) + + def test_zero_normalizer(self): + y_pred = tf.constant([2, 4], dtype=tf.float32) + y_true = tf.constant([1, 3]) + + mre_obj = metrics.MeanRelativeError(normalizer=tf.zeros_like(y_true)) + self.evaluate(tf.compat.v1.variables_initializer(mre_obj.variables)) + + result = mre_obj(y_true, y_pred) + self.assertEqual(self.evaluate(result), 0) + + +@test_utils.run_v2_only +class R2ScoreTest(parameterized.TestCase, tf.test.TestCase): + def _run_test( + self, + y_true, + y_pred, + sample_weights, + class_aggregation, + num_regressors, + reference_result, + ): + y_true = tf.constant(y_true, dtype="float32") + y_pred = tf.constant(y_pred, dtype="float32") + r2 = metrics.R2Score(class_aggregation, num_regressors) + r2.update_state(y_true, y_pred, sample_weights) + result = r2.result().numpy() + self.assertAllClose(result, reference_result, atol=1e-6) + + def test_config(self): + r2_obj = metrics.R2Score( + class_aggregation=None, + num_regressors=2, + ) + self.assertEqual(r2_obj.class_aggregation, None) + self.assertEqual(r2_obj.num_regressors, 2) + self.assertEqual(r2_obj.dtype, tf.float32) + + # Check save and restore config + r2_obj2 = metrics.R2Score.from_config(r2_obj.get_config()) + self.assertEqual(r2_obj2.class_aggregation, None) + self.assertEqual(r2_obj2.num_regressors, 2) + self.assertEqual(r2_obj2.dtype, tf.float32) + + @parameterized.parameters( + # class_aggregation, num_regressors, result + (None, 0, [0.37, -1.295, 0.565]), + ("uniform_average", 0, -0.12), + ("variance_weighted_average", 0, -0.12), + ) + def test_r2_sklearn_comparison( + self, class_aggregation, num_regressors, result + ): + y_true = [[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]] + y_pred = [[0.4, 0.5, 0.6], [0.1, 0.2, 0.3], [0.5, 0.8, 0.2]] + self._run_test( + y_true, + y_pred, + None, + class_aggregation=class_aggregation, + num_regressors=num_regressors, + reference_result=result, + ) + + @parameterized.parameters( + # class_aggregation, num_regressors, result + (None, 0, [0.17305559, -8.836666, -0.521]), + (None, 1, [0.054920673, -10.241904, -0.7382858]), + (None, 2, [-0.10259259, -12.115555, -1.0280001]), + ("uniform_average", 0, -3.0615367889404297), + ("uniform_average", 1, -3.641756534576416), + ("uniform_average", 2, -4.415382385253906), + ("variance_weighted_average", 0, -1.3710224628448486), + ("variance_weighted_average", 1, -1.7097399234771729), + ("variance_weighted_average", 2, -2.161363363265991), + ) + def test_r2_tfa_comparison(self, class_aggregation, num_regressors, result): + y_true = [[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]] + y_pred = [[0.4, 0.9, 1.6], [0.1, 1.2, 0.6], [1.5, 0.8, 0.6]] + sample_weights = [0.8, 0.1, 0.4] + self._run_test( + y_true, + y_pred, + sample_weights, + class_aggregation=class_aggregation, + num_regressors=num_regressors, + reference_result=result, + ) + + def test_errors(self): + # Bad 
class_aggregation value + with self.assertRaisesRegex( + ValueError, "Invalid value for argument `class_aggregation`" + ): + metrics.R2Score(class_aggregation="wrong") + + # Bad num_regressors value + with self.assertRaisesRegex( + ValueError, "Invalid value for argument `num_regressors`" + ): + metrics.R2Score(num_regressors=-1) + + # Bad input shape + with self.assertRaisesRegex(ValueError, "expects 2D inputs with shape"): + r2 = metrics.R2Score() + r2.update_state(tf.constant([0.0, 1.0]), tf.constant([0.0, 1.0])) + + with self.assertRaisesRegex( + ValueError, "with output_dim fully defined" + ): + r2 = metrics.R2Score() + r2.update_state(Input(shape=(None,)), tf.constant([[0.0], [1.0]])) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/mixed_precision/BUILD b/keras/mixed_precision/BUILD index b1e5162a1990..d29b508403e5 100644 --- a/keras/mixed_precision/BUILD +++ b/keras/mixed_precision/BUILD @@ -16,10 +16,12 @@ # Description: # Contains the Keras Mixed Precision API (TensorFlow version). +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") load("@org_keras//keras:keras.bzl", "tf_py_test") # buildifier: disable=same-origin-load package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ # TODO(scottzhu): Remove these two deps and convert the test to integration test. "//third_party/tensorflow/python/distribute:__pkg__", # For collective_all_reduce_strategy_test @@ -64,7 +66,7 @@ tf_py_test( ":policy", "//:expect_tensorflow_installed", "//keras", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", ], ) @@ -82,7 +84,6 @@ cuda_py_test( name = "device_compatibility_check_test", srcs = ["device_compatibility_check_test.py"], srcs_version = "PY3", - tfrt_enabled = True, deps = [ ":device_compatibility_check", "//:expect_tensorflow_installed", @@ -111,7 +112,8 @@ tf_py_test( ":autocast_variable", "//:expect_absl_installed", "//:expect_tensorflow_installed", - "//keras/optimizers/optimizer_v2", + "//keras/layers", + "//keras/optimizers/legacy:optimizers", ], ) @@ -122,7 +124,7 @@ py_library( deps = [ "//:expect_absl_installed", "//:expect_tensorflow_installed", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/utils:generic_utils", ], ) @@ -147,14 +149,13 @@ cuda_py_test( size = "small", srcs = ["mixed_precision_graph_rewrite_test.py"], python_version = "PY3", - tfrt_enabled = True, deps = [ ":loss_scale_optimizer", ":policy", "//:expect_absl_installed", "//:expect_tensorflow_installed", "//keras", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", "//keras/testing_infra:test_utils", ], diff --git a/keras/mixed_precision/__init__.py b/keras/mixed_precision/__init__.py index 62e8e80e3656..58c7cd9475f5 100644 --- a/keras/mixed_precision/__init__.py +++ b/keras/mixed_precision/__init__.py @@ -20,6 +20,6 @@ """ from keras.mixed_precision.loss_scale_optimizer import LossScaleOptimizer -from keras.mixed_precision.policy import global_policy from keras.mixed_precision.policy import Policy +from keras.mixed_precision.policy import global_policy from keras.mixed_precision.policy import set_global_policy diff --git a/keras/mixed_precision/autocast_variable.py b/keras/mixed_precision/autocast_variable.py index ec541edda0fe..eea3192b80fb 100644 --- a/keras/mixed_precision/autocast_variable.py +++ 
b/keras/mixed_precision/autocast_variable.py @@ -14,535 +14,611 @@ # ============================================================================== """Contains AutoCastVariable, a variable which automatically casts itself.""" +import threading +from typing import Optional + import tensorflow.compat.v2 as tf -import threading from keras.distribute import distributed_training_utils - # _autocast_dtype.dtype is the dtype AutoCastVariables should be cast to, or # None if AutoCastVariables should not be cast. _autocast_dtype = threading.local() def numpy_text(tensor, is_repr=False): - """Human readable representation of a tensor's numpy value.""" - if tensor.dtype.is_numpy_compatible: - # pylint: disable=protected-access - text = repr(tensor._numpy()) if is_repr else str(tensor._numpy()) - # pylint: enable=protected-access - else: - text = '<unprintable>' - if '\n' in text: - text = '\n' + text - return text - - -class AutoCastVariable(tf.Variable, tf.__internal__.types.Tensor): - """Variable that will cast itself to a different dtype in applicable contexts. - - This class wraps a floating-point `tf.Variable`. It emulates the variable - interface and delegates to the wrapped variable, but it additionally will cast - the wrapped variable under an `enable_auto_cast_variables(dtype)` context - manager. - - For example: + """Human readable representation of a tensor's numpy value.""" + if tensor.dtype.is_numpy_compatible: - >>> v = tf.Variable(1.0, dtype=tf.float32) - >>> v = AutoCastVariable(v) - >>> tf.identity(v).dtype - tf.float32 - >>> with enable_auto_cast_variables(tf.float16): - ... tf.identity(v).dtype - tf.float16 + text = repr(tensor._numpy()) if is_repr else str(tensor._numpy()) - The purpose of this class is to allow Keras layers to create variables in - float32, and automatically cast them to float16 or bfloat16 when the layer is - called. - """ - - def __init__(self, variable): - """Creates an AutoCastVariable instance. - - Args: - variable: A floating-point resource variable to wrap. - - Raises: - ValueError: If `variable` is not a floating-point resource variable - """ - if not isinstance(variable, tf.Variable): - raise ValueError('variable must be of type tf.ResourceVariable, but got: ' - '%s' % variable) - if not variable.dtype.is_floating: - raise ValueError('variable must be a floating point variable but has ' - 'type: %s' % variable.dtype.name) - self._variable = variable - # 'delegate' means AutoCastVariable.op return self._variable.op, which will - # raise an AttributeError in Eager (as intended). If set to any other value, - # AutoCastVariable.op returns that value instead, which is used to set the - # op attribute in AutoCastVariable.assign().
- self._op = 'delegate' - - def _should_cast(self): - """Returns True if this variable should be casted when accessed.""" - autocast_dtype = getattr(_autocast_dtype, 'dtype', None) - return autocast_dtype is not None and self.dtype != autocast_dtype - - @property - def dtype(self): - """The dtype of the underlying variable, before any casts are done.""" - return self._variable.dtype - - @property - def true_dtype(self): - """Deprecated alias of `dtype`.""" - return self._variable.dtype - - @property - def _cast_dtype(self): - dtype = getattr(_autocast_dtype, 'dtype', None) - return dtype or self._variable.dtype - - def value(self): - val = self._variable.value() - if not self._should_cast(): - return val - return tf.cast(val, self._cast_dtype) - - def read_value(self): - val = self._variable.read_value() - return tf.cast(val, self._cast_dtype) - - def sparse_read(self, indices, name=None): - """Reads the value of this variable sparsely, using `gather`.""" - val = self._variable.sparse_read(indices, name=name) - return tf.cast(val, self._cast_dtype) - - def gather_nd(self, indices, name=None): - """Gather slices of the variable into a Tensor.""" - val = self._variable.gather_nd(indices, name=name) - return tf.cast(val, self._cast_dtype) - - def __getattr__(self, name): - return getattr(self._variable, name) - - def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): - """Converts this variable to a tensor.""" - if as_ref: - # This ValueError should not occur in practice since it is impossible to - # pass as_ref=True using public APIs. - raise ValueError('Cannot convert AutoCastVariable to a tensor if ' - 'as_ref=True is passed to convert_to_tensor') - if not self._should_cast(): - return tf.convert_to_tensor(self._variable, dtype=dtype, - name=name) - if dtype is not None and not dtype.is_compatible_with(self._cast_dtype): - raise ValueError( - 'Incompatible type conversion requested to type {!r} for ' - 'AutoCastVariable which is casted to type {!r}'.format( - dtype.name, self._cast_dtype.name)) - val = tf.convert_to_tensor( - self._variable, dtype=self._variable.dtype, name=name) - return tf.cast(val, self._cast_dtype) - - def _should_act_as_resource_variable(self): - """Pass resource_variable_ops.is_resource_variable check.""" - pass - - def __repr__(self): - if tf.executing_eagerly() and not self._in_graph_mode: - repr_str = ("<AutoCastVariable '{v.name}' shape={v.shape} " - "dtype={v.dtype.name} dtype_to_cast_to={v._cast_dtype.name}, " - "numpy={np_repr}>") - return repr_str.format( - v=self, np_repr=numpy_text(self.read_value(), is_repr=True)) - else: - repr_str = ("<AutoCastVariable '{v.name}' shape={v.shape} " - "dtype={v.dtype.name} dtype_to_cast_to={v._cast_dtype.name}>") - return repr_str.format(v=self) - - # Method delegations: We delegate the following methods to self._variable. - # Each of these methods simply calls the same method on self._variable. The - # base Variable raises NotImplementedError for most of these, so we must - # override them. - # - # We do not define the following methods from Variable for the following - # reasons: - # * 'count_up_to': This method only applies to int variables, which cannot - # be wrapped with an AutoCastVariable. - # * 'ref': Instead we inherit the definition from Variable. - # If we defined and delegated to Variable, the ref of an AutoCastVariable - # would be the same as the ref of the underlying variable, which would be - # strange as they are different Python objects.
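A quick illustration of the delegation contract described above (a sketch, not code from this diff; it assumes TensorFlow and this module are importable): structural queries such as `shape` and `trainable` are answered by the wrapped variable, while reads are cast only inside an autocast scope.

    import tensorflow.compat.v2 as tf

    from keras.mixed_precision import autocast_variable

    v = autocast_variable.create_autocast_variable(tf.Variable([1.0, 2.0]))
    assert v.trainable and v.shape == (2,)  # delegated to the inner variable
    with autocast_variable.enable_auto_cast_variables(tf.float16):
        assert v.read_value().dtype == tf.float16  # cast on read
    assert v.read_value().dtype == tf.float32  # no cast outside the scope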
- - def set_shape(self, shape): - return self._variable.set_shape(self, shape) - - @property - def trainable(self): - return self._variable.trainable - - @property - def synchronization(self): - return self._variable.synchronization - - @property - def aggregation(self): - return self._variable.aggregation - - def eval(self, session=None): - return self._variable.eval(session) - - def initialized_value(self): - return self._variable.initialized_value() - - @property - def initial_value(self): - return self._variable.initial_value - - @property - def constraint(self): - return self._variable.constraint - - def _apply_assign_update(self, - update_fn, - value, - use_locking=None, - name=None, - read_value=True): - # TODO(b/146181571): This logic can be simplified once - # DistributedVariable.assign returns a DistributedVariable. Currently for - # MirroredStrategy, it returns a Mirrored value. - if tf.compat.v1.executing_eagerly_outside_functions(): - assign_op = update_fn(value, use_locking, name, False) - if read_value: - # We create a new AutoCastVariable with the same underlying tf.Variable. - # The new AutoCastVariable is identical except the 'op' attribute is - # defined. This matches the behavior of tf.Variable.assign. - var = create_autocast_variable(self._variable) - var._op = assign_op # pylint:disable=protected-access - return var - return assign_op - - # Fallback to wrapping the returned variable in graph mode if possible - assign_var = update_fn(value, use_locking, name, read_value) - if read_value and tf.__internal__.ops.is_resource_variable(assign_var): - return create_autocast_variable(assign_var) - return assign_var - - def _apply_update(self, update_fn, *args, **kwargs): - update_var = update_fn(*args, **kwargs) - if tf.compat.v1.executing_eagerly_outside_functions(): - return self - - # Fallback to wrapping the returned variable in graph mode if possible - if tf.__internal__.ops.is_resource_variable(update_var): - return create_autocast_variable(update_var) - return update_var - - def assign(self, value, use_locking=None, name=None, read_value=True): - return self._apply_assign_update(self._variable.assign, value, use_locking, - name, read_value) - - def assign_add(self, delta, use_locking=None, name=None, read_value=True): - return self._apply_assign_update(self._variable.assign_add, delta, - use_locking, name, read_value) - - def assign_sub(self, delta, use_locking=None, name=None, read_value=True): - return self._apply_assign_update(self._variable.assign_sub, delta, - use_locking, name, read_value) - - def scatter_sub(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.scatter_sub, sparse_delta, - use_locking, name) - - def scatter_add(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.scatter_add, sparse_delta, - use_locking, name) - - def scatter_max(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.scatter_max, sparse_delta, - use_locking, name) - - def scatter_min(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.scatter_min, sparse_delta, - use_locking, name) - - def scatter_mul(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.scatter_mul, sparse_delta, - use_locking, name) - - def scatter_div(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.scatter_div, sparse_delta, - use_locking, name) - - def 
scatter_update(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.scatter_update, sparse_delta, - use_locking, name) - - def batch_scatter_update(self, sparse_delta, use_locking=False, name=None): - return self._apply_update(self._variable.batch_scatter_update, sparse_delta, - use_locking, name) - - def scatter_nd_sub(self, indices, updates, name=None): - return self._apply_update(self._variable.scatter_nd_sub, indices, updates, - name) - - def scatter_nd_add(self, indices, updates, name=None): - return self._apply_update(self._variable.scatter_nd_add, indices, updates, - name) - - def scatter_nd_update(self, indices, updates, name=None): - return self._apply_update(self._variable.scatter_nd_update, indices, - updates, name) - - def load(self, value, session=None): - return self._variable.load(value, session) - - @property - def name(self): - return self._variable.name - - @property - def _shared_name(self): - return self._variable._shared_name # pylint:disable=protected-access - - @property - def initializer(self): - return self._variable.initializer - - @property - def device(self): - return self._variable.device - - @property - def op(self): - if self._op == 'delegate': - return self._variable.op - return self._op - - def _as_graph_element(self): - graph_element = self._variable._as_graph_element() # pylint:disable=protected-access - if graph_element is None: - return self._op - return graph_element - - @property - def graph(self): - return self._variable.graph - - @property - def shape(self): - return self._variable.shape - - def get_shape(self): - return self._variable.get_shape() - - def _gather_saveables_for_checkpoint(self): - # By delegating this method to the wrapped variable, checkpoints with - # AutoCastVariables are identical to checkpoints with normal variables. - # Therefore models checkpointed with AutoCastVariables can be restored on - # models with normal variables, and vice versa. - return self._variable._gather_saveables_for_checkpoint() # pylint:disable=protected-access - - def _map_resources(self, save_options): - # By delegating this method to the wrapped variable, SavedModel with - # AutoCastVariables are identical to SavedModel with normal variables. - obj_map, resource_map = self._variable._map_resources(save_options) # pylint:disable=protected-access - obj_map[self] = obj_map[self._variable] - return obj_map, resource_map + text = "<unprintable>" + if "\n" in text: + text = "\n" + text + return text - # TODO(reedwm): Maybe encode the fact the variable is an AutoCastVariable in - # to_proto(). - def to_proto(self, export_scope=None): - return self._variable.to_proto(export_scope) - def from_proto(self, variable_def, import_scope=None): - return self._variable.from_proto(variable_def, import_scope) +class AutoCastVariableSpec(tf.types.experimental.TraceType): + """TraceType for AutoCastVariableSpec for tracing with tf.function. - # Delegate the private attributes _handle_name and _initializer_op to - # self._variable. SavedModel sets these attributes when loading a model. For - # example, it sets _handle_name here: - # https://github.com/tensorflow/tensorflow/blob/db26bd574fa95b5bdd53c08463dd19407cc0297e/tensorflow/python/keras/saving/saved_model/load.py#L211 - # We need to expose these attributes on AutoCastVariable as well for - # SavedModel to work properly. - # TODO(reedwm/kathywu): Find a better way to support SavedModel. Exposing - # private attributes is hacky and difficult to maintain.
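To make the checkpoint note above concrete, here is a small sketch (again not part of this diff; the checkpoint path is illustrative): because saveables are delegated to the wrapped variable, a checkpoint written through an AutoCastVariable restores into a plain `tf.Variable`, and vice versa.

    import tensorflow.compat.v2 as tf

    from keras.mixed_precision import autocast_variable

    v = autocast_variable.create_autocast_variable(tf.Variable(3.0))
    path = tf.train.Checkpoint(v=v).save("/tmp/acv_ckpt")  # illustrative path

    plain = tf.Variable(0.0)
    tf.train.Checkpoint(v=plain).restore(path)
    assert plain.numpy() == 3.0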
- @property - def _handle_name(self): - return self._variable._handle_name # pylint: disable=protected-access - - @_handle_name.setter - def _handle_name(self, handle_name): - self._variable._handle_name = handle_name # pylint: disable=protected-access - - @property - def _initializer_op(self): - return self._variable._initializer_op # pylint: disable=protected-access - - @_initializer_op.setter - def _initializer_op(self, initializer_op): - self._variable._initializer_op = initializer_op # pylint: disable=protected-access - - # Operator overloads: - # Note we only overload operators that support floating-point types, as - # non-float variables cannot be wrapped with an AutoCastVariable. - # Also note: We call read_value() instead of value(), because value() causes - # gradients not to work properly when TPUStrategy is used: b/143380936 - - def __add__(self, o): - return self.read_value() + o - - def __radd__(self, o): - return o + self.read_value() - - def __sub__(self, o): - return self.read_value() - o - - def __rsub__(self, o): - return o - self.read_value() - - def __mul__(self, o): - return self.read_value() * o - - def __rmul__(self, o): - return o * self.read_value() - - def __truediv__(self, o): - return self.read_value() / o + This class implements the Type for AutoCastVariable used in tracing. + """ - def __rtruediv__(self, o): - return o / self.read_value() + def __init__(self, value): + self._value = value - def __floordiv__(self, o): - return self.read_value() // o + def is_subtype_of(self, other) -> bool: + """If the other spec is the same as `self`, return True.""" + return self == other - def __rfloordiv__(self, o): - return o // self.read_value() + def most_specific_common_supertype(self, others): + """`self` is the common supertype if all input types match it.""" + return self if all(self == other for other in others) else None - def __mod__(self, o): - return self.read_value() % o + def placeholder_value(self, placeholder_context=None): + """Use the AutoCastVariable value itself as a placeholder.""" + return self._value - def __rmod__(self, o): - return o % self.read_value() + def _cast(self, value, _): + return value - def __lt__(self, o): - return self.read_value() < o + def _to_tensors(self, value): + return [] - def __le__(self, o): - return self.read_value() <= o + def __hash__(self) -> int: + return hash(id(self._value)) - def __gt__(self, o): - return self.read_value() > o + def __eq__(self, other) -> bool: + return self is other - def __ge__(self, o): - return self.read_value() >= o - def __getitem__(self, o): - return self.read_value()[o] +class AutoCastVariable(tf.Variable, tf.__internal__.types.Tensor): + """Variable that casts itself to a different dtype in applicable contexts. + + This class wraps a floating-point `tf.Variable`. It emulates the variable + interface and delegates to the wrapped variable, but it additionally will + cast the wrapped variable under an `enable_auto_cast_variables(dtype)` + context manager. + + For example: + + >>> v = tf.Variable(1.0, dtype=tf.float32) + >>> v = AutoCastVariable(v) + >>> tf.identity(v).dtype + tf.float32 + >>> with enable_auto_cast_variables(tf.float16): + ... tf.identity(v).dtype + tf.float16 + + The purpose of this class is to allow Keras layers to create variables in + float32, and automatically cast them to float16 or bfloat16 when the layer + is called. 
+ """ - def __pow__(self, o, modulo=None): - return pow(self.read_value(), o, modulo) + def __init__(self, variable): + """Creates an AutoCastVariable instance. + + Args: + variable: A floating-point resource variable to wrap. + + Raises: + ValueError: If `variable` is not a floating-point resource variable + """ + if not isinstance(variable, tf.Variable): + raise ValueError( + "variable must be of type tf.ResourceVariable, but got: %s" + % variable + ) + if not variable.dtype.is_floating: + raise ValueError( + "variable must be a floating point variable but has type: %s" + % variable.dtype.name + ) + self._variable = variable + # 'delegate' means AutoCastVariable.op return self._variable.op, which + # will raise an AttributeError in Eager (as intended). If set to any + # other value, AutoCastVariable.op returns that value instead, which is + # used to set the op attribute in AutoCastVariable.assign(). + self._op = "delegate" + + def _should_cast(self): + """Returns True if this variable should be casted when accessed.""" + autocast_dtype = getattr(_autocast_dtype, "dtype", None) + return autocast_dtype is not None and self.dtype != autocast_dtype + + @property + def dtype(self): + """The dtype of the underlying variable, before any casts are done.""" + return self._variable.dtype + + @property + def true_dtype(self): + """Deprecated alias of `dtype`.""" + return self._variable.dtype + + @property + def _cast_dtype(self): + dtype = getattr(_autocast_dtype, "dtype", None) + return dtype or self._variable.dtype + + def value(self): + val = self._variable.value() + if not self._should_cast(): + return val + return tf.cast(val, self._cast_dtype) + + def read_value(self): + val = self._variable.read_value() + return tf.cast(val, self._cast_dtype) + + def sparse_read(self, indices, name=None): + """Reads the value of this variable sparsely, using `gather`.""" + val = self._variable.sparse_read(indices, name=name) + return tf.cast(val, self._cast_dtype) + + def gather_nd(self, indices, name=None): + """Gather slices of the variable into a Tensor.""" + val = self._variable.gather_nd(indices, name=name) + return tf.cast(val, self._cast_dtype) + + def __getattr__(self, name): + return getattr(self._variable, name) + + def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): + """Converts this variable to a tensor.""" + if as_ref: + # This ValueError should not occur in practice since it is + # impossible to pass as_ref=True using public APIs. 
+ raise ValueError( + "Cannot convert AutoCastVariable to a tensor if " + "as_ref=True is passed to convert_to_tensor" + ) + if not self._should_cast(): + return tf.convert_to_tensor(self._variable, dtype=dtype, name=name) + if dtype is not None and not dtype.is_compatible_with(self._cast_dtype): + raise ValueError( + "Incompatible type conversion requested to type {!r} for " + "AutoCastVariable which is casted to type {!r}".format( + dtype.name, self._cast_dtype.name + ) + ) + val = tf.convert_to_tensor( + self._variable, dtype=self._variable.dtype, name=name + ) + return tf.cast(val, self._cast_dtype) + + def __tf_tensor__( + self, + dtype: Optional[tf.dtypes.DType] = None, + name: Optional[str] = None, + ) -> tf.Tensor: + return self._dense_var_to_tensor(dtype=dtype, name=name) + + def _should_act_as_resource_variable(self): + """Pass resource_variable_ops.is_resource_variable check.""" + pass - def __rpow__(self, o): - return pow(o, self.read_value()) + def __repr__(self): + if tf.executing_eagerly() and not self._in_graph_mode: + repr_str = ( + "<AutoCastVariable '{v.name}' shape={v.shape} " + "dtype={v.dtype.name} dtype_to_cast_to={v._cast_dtype.name}, " + "numpy={np_repr}>" + ) + return repr_str.format( + v=self, np_repr=numpy_text(self.read_value(), is_repr=True) + ) + else: + repr_str = ( + "<AutoCastVariable '{v.name}' shape={v.shape} " + "dtype={v.dtype.name} dtype_to_cast_to={v._cast_dtype.name}>" + ) + return repr_str.format(v=self) + + # Method delegations: We delegate the following methods to self._variable. + # Each of these methods simply calls the same method on self._variable. The + # base Variable raises NotImplementedError for most of these, so we must + # override them. + # + # We do not define the following methods from Variable for the following + # reasons: + # * 'count_up_to': This method only applies to int variables, which cannot + # be wrapped with an AutoCastVariable. + # * 'ref': Instead we inherit the definition from Variable. + # If we defined and delegated to Variable, the ref of an + # AutoCastVariable would be the same as the ref of the underlying + # variable, which would be strange as they are different Python objects. + + def set_shape(self, shape): + return self._variable.set_shape(self, shape) + + @property + def trainable(self): + return self._variable.trainable + + @property + def synchronization(self): + return self._variable.synchronization + + @property + def aggregation(self): + return self._variable.aggregation + + def eval(self, session=None): + return self._variable.eval(session) + + def initialized_value(self): + return self._variable.initialized_value() + + @property + def initial_value(self): + return self._variable.initial_value + + @property + def constraint(self): + return self._variable.constraint + + def _apply_assign_update( + self, update_fn, value, use_locking=None, name=None, read_value=True + ): + # TODO(b/146181571): This logic can be simplified once + # DistributedVariable.assign returns a DistributedVariable. Currently + # for MirroredStrategy, it returns a Mirrored value. + if tf.compat.v1.executing_eagerly_outside_functions(): + assign_op = update_fn(value, use_locking, name, False) + if read_value: + # We create a new AutoCastVariable with the same underlying + # tf.Variable. The new AutoCastVariable is identical except the + # 'op' attribute is defined. This matches the behavior of + # tf.Variable.assign.
+ var = create_autocast_variable(self._variable) + var._op = assign_op + return var + return assign_op + + # Fallback to wrapping the returned variable in graph mode if possible + assign_var = update_fn(value, use_locking, name, read_value) + if read_value and tf.__internal__.ops.is_resource_variable(assign_var): + return create_autocast_variable(assign_var) + return assign_var + + def _apply_update(self, update_fn, *args, **kwargs): + update_var = update_fn(*args, **kwargs) + if tf.compat.v1.executing_eagerly_outside_functions(): + return self + + # Fallback to wrapping the returned variable in graph mode if possible + if tf.__internal__.ops.is_resource_variable(update_var): + return create_autocast_variable(update_var) + return update_var + + def assign(self, value, use_locking=None, name=None, read_value=True): + return self._apply_assign_update( + self._variable.assign, value, use_locking, name, read_value + ) + + def assign_add(self, delta, use_locking=None, name=None, read_value=True): + return self._apply_assign_update( + self._variable.assign_add, delta, use_locking, name, read_value + ) + + def assign_sub(self, delta, use_locking=None, name=None, read_value=True): + return self._apply_assign_update( + self._variable.assign_sub, delta, use_locking, name, read_value + ) + + def scatter_sub(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.scatter_sub, sparse_delta, use_locking, name + ) + + def scatter_add(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.scatter_add, sparse_delta, use_locking, name + ) + + def scatter_max(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.scatter_max, sparse_delta, use_locking, name + ) + + def scatter_min(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.scatter_min, sparse_delta, use_locking, name + ) + + def scatter_mul(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.scatter_mul, sparse_delta, use_locking, name + ) + + def scatter_div(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.scatter_div, sparse_delta, use_locking, name + ) + + def scatter_update(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.scatter_update, sparse_delta, use_locking, name + ) + + def batch_scatter_update(self, sparse_delta, use_locking=False, name=None): + return self._apply_update( + self._variable.batch_scatter_update, sparse_delta, use_locking, name + ) + + def scatter_nd_sub(self, indices, updates, name=None): + return self._apply_update( + self._variable.scatter_nd_sub, indices, updates, name + ) + + def scatter_nd_add(self, indices, updates, name=None): + return self._apply_update( + self._variable.scatter_nd_add, indices, updates, name + ) + + def scatter_nd_update(self, indices, updates, name=None): + return self._apply_update( + self._variable.scatter_nd_update, indices, updates, name + ) + + def load(self, value, session=None): + return self._variable.load(value, session) + + @property + def name(self): + return self._variable.name + + @property + def _shared_name(self): + return self._variable._shared_name + + @property + def initializer(self): + return self._variable.initializer + + @property + def device(self): + return self._variable.device + + @property + def op(self): + if self._op == "delegate": + return self._variable.op + 
return self._op + + def _as_graph_element(self): + graph_element = self._variable._as_graph_element() + if graph_element is None: + return self._op + return graph_element + + @property + def graph(self): + return self._variable.graph + + @property + def shape(self): + return self._variable.shape + + def get_shape(self): + return self._variable.get_shape() + + def __tf_tracing_type__(self, context): + return AutoCastVariableSpec(self) + + def _gather_saveables_for_checkpoint(self): + # By delegating this method to the wrapped variable, checkpoints with + # AutoCastVariables are identical to checkpoints with normal variables. + # Therefore models checkpointed with AutoCastVariables can be restored + # on models with normal variables, and vice versa. + return self._variable._gather_saveables_for_checkpoint() + + def _export_to_saved_model_graph( + self, object_map, tensor_map, options, **kwargs + ): + # By delegating this method to the wrapped variable, SavedModel with + # AutoCastVariables are identical to SavedModel with normal variables. + resource_list = self._variable._export_to_saved_model_graph( + object_map, tensor_map, options, **kwargs + ) + object_map[self] = object_map[self._variable] + return resource_list + + # TODO(reedwm): Maybe encode the fact the variable is an AutoCastVariable in + # to_proto(). + def to_proto(self, export_scope=None): + return self._variable.to_proto(export_scope) + + def from_proto(self, variable_def, import_scope=None): + return self._variable.from_proto(variable_def, import_scope) + + # Delegate the private attributes _handle_name and _initializer_op to + # self._variable. SavedModel sets these attributes when loading a model. For + # example, it sets _handle_name here: + # https://github.com/tensorflow/tensorflow/blob/db26bd574fa95b5bdd53c08463dd19407cc0297e/tensorflow/python/keras/saving/saved_model/load.py#L211 + # We need to expose these attributes on AutoCastVariable as well for + # SavedModel to work properly. + # TODO(reedwm/kathywu): Find a better way to support SavedModel. Exposing + # private attributes is hacky and difficult to maintain. + @property + def _handle_name(self): + return self._variable._handle_name + + @_handle_name.setter + def _handle_name(self, handle_name): + self._variable._handle_name = handle_name + + @property + def _initializer_op(self): + return self._variable._initializer_op + + @_initializer_op.setter + def _initializer_op(self, initializer_op): + self._variable._initializer_op = initializer_op + + # Operator overloads: + # Note we only overload operators that support floating-point types, as + # non-float variables cannot be wrapped with an AutoCastVariable. 
+ # Also note: We call read_value() instead of value(), because value() causes + # gradients not to work properly when TPUStrategy is used: b/143380936 + + def __add__(self, o): + return self.read_value() + o + + def __radd__(self, o): + return o + self.read_value() + + def __sub__(self, o): + return self.read_value() - o + + def __rsub__(self, o): + return o - self.read_value() + + def __mul__(self, o): + return self.read_value() * o + + def __rmul__(self, o): + return o * self.read_value() + + def __truediv__(self, o): + return self.read_value() / o + + def __rtruediv__(self, o): + return o / self.read_value() + + def __floordiv__(self, o): + return self.read_value() // o + + def __rfloordiv__(self, o): + return o // self.read_value() + + def __mod__(self, o): + return self.read_value() % o + + def __rmod__(self, o): + return o % self.read_value() + + def __lt__(self, o): + return self.read_value() < o + + def __le__(self, o): + return self.read_value() <= o + + def __gt__(self, o): + return self.read_value() > o + + def __ge__(self, o): + return self.read_value() >= o + + def __getitem__(self, o): + return self.read_value()[o] - def __neg__(self): - return -self.read_value() # pylint: disable=invalid-unary-operand-type + def __pow__(self, o, modulo=None): + return pow(self.read_value(), o, modulo) + + def __rpow__(self, o): + return pow(o, self.read_value()) - def __abs__(self): - return abs(self.read_value()) + def __neg__(self): + return -self.read_value() - def __div__(self, o): - try: - return self.read_value().__div__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented + def __abs__(self): + return abs(self.read_value()) - def __rdiv__(self, o): - try: - return self.read_value().__rdiv__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented + def __div__(self, o): + try: + return self.read_value().__div__(o) + except AttributeError: + # See + # https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented - def __matmul__(self, o): - try: - return self.read_value().__matmul__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented + def __rdiv__(self, o): + try: + return self.read_value().__rdiv__(o) + except AttributeError: + # See + # https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented - def __rmatmul__(self, o): - try: - return self.read_value().__rmatmul__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented + def __matmul__(self, o): + try: + return self.read_value().__matmul__(o) + except AttributeError: + # See + # https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented - # pylint: enable=multiple-statements + def __rmatmul__(self, o): + try: + return self.read_value().__rmatmul__(o) + except AttributeError: + # See + # https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented -tf.register_tensor_conversion_function(AutoCastVariable, - AutoCastVariable._dense_var_to_tensor) # pylint:disable=protected-access +tf.register_tensor_conversion_function( + AutoCastVariable, AutoCastVariable._dense_var_to_tensor +) def create_autocast_variable(variable): - """Creates an AutoCastVariable that wraps another variable. + """Creates an AutoCastVariable that wraps another variable. 
- This typically just returns `AutoCastVariable(variable)`. But, if the variable - is a DistributedVariable or one of its subclasses, we instead dynamically - create a class that subclasses from both AutoCastVariable and - variable.__class__. This is so the returned variable will still pass - `isinstance(variable, variable.__class__)`, which is required for - DistributedVariables and its subclasses to work properly. + This typically just returns `AutoCastVariable(variable)`. But, if the + variable is a DistributedVariable or one of its subclasses, we instead + dynamically create a class that subclasses from both AutoCastVariable and + variable.__class__. This is so the returned variable will still pass + `isinstance(variable, variable.__class__)`, which is required for + DistributedVariables and its subclasses to work properly. - Args: - variable: A floating-point resource variable to wrap. + Args: + variable: A floating-point resource variable to wrap. - Returns: - An AutoCastVariable that wraps the variable. - """ - if not distributed_training_utils.is_distributed_variable(variable): - return AutoCastVariable(variable) + Returns: + An AutoCastVariable that wraps the variable. + """ + if not distributed_training_utils.is_distributed_variable(variable): + return AutoCastVariable(variable) - class AutoCastDistributedVariable(AutoCastVariable, variable.__class__): - """An AutoCastVariable that also subclasses from variable.__class__. + class AutoCastDistributedVariable(AutoCastVariable, variable.__class__): + """An AutoCastVariable that also subclasses from variable.__class__. - variable.__class__ is either a DistributedVariable or an - AggregatingVariable. - """ + variable.__class__ is either a DistributedVariable or an + AggregatingVariable. + """ - def __repr__(self): + def __repr__(self): - # pylint: disable=missing-format-attribute - return ('<AutoCastDistributedVariable dtype={v.dtype.name} ' - 'dtype_to_cast_to={v._cast_dtype.name} ' - 'inner_variable={v._variable}>' - ).format(v=self) - # pylint: enable=missing-format-attribute + return ( + "<AutoCastDistributedVariable dtype={v.dtype.name} " + "dtype_to_cast_to={v._cast_dtype.name} " + "inner_variable={v._variable}>" + ).format(v=self) - return AutoCastDistributedVariable(variable) + return AutoCastDistributedVariable(variable) -class enable_auto_cast_variables: # pylint:disable=invalid-name - """Context manager which enables the autocasting of `AutoCastVariable`s. +class enable_auto_cast_variables: + """Context manager which enables the autocasting of `AutoCastVariable`s. - Under this context manager, `AutoCastVariable`s will be cast to `dtype` if - `dtype` is floating-point. Otherwise, `AutoCastVariable`s will not be cast.
+ """ - __slots__ = ['_dtype', '_prev_dtype'] + __slots__ = ["_dtype", "_prev_dtype"] - def __init__(self, dtype): - if dtype and not dtype.is_floating: - dtype = None - self._dtype = dtype + def __init__(self, dtype): + if dtype and not dtype.is_floating: + dtype = None + self._dtype = dtype - def __enter__(self): - self._prev_dtype = getattr(_autocast_dtype, 'dtype', None) - _autocast_dtype.dtype = self._dtype + def __enter__(self): + self._prev_dtype = getattr(_autocast_dtype, "dtype", None) + _autocast_dtype.dtype = self._dtype - def __exit__(self, type_arg, value_arg, traceback_arg): - _autocast_dtype.dtype = self._prev_dtype + def __exit__(self, type_arg, value_arg, traceback_arg): + _autocast_dtype.dtype = self._prev_dtype diff --git a/keras/mixed_precision/autocast_variable_test.py b/keras/mixed_precision/autocast_variable_test.py index efd1314f7c92..1a6637b6fcc5 100644 --- a/keras/mixed_precision/autocast_variable_test.py +++ b/keras/mixed_precision/autocast_variable_test.py @@ -14,561 +14,647 @@ # ============================================================================== """Tests for AutoCastVariable.""" -import tensorflow.compat.v2 as tf - import os import threading -from absl.testing import parameterized import numpy as np -from keras.mixed_precision import autocast_variable -from keras.optimizers.optimizer_v2 import adadelta -from keras.optimizers.optimizer_v2 import adagrad -from keras.optimizers.optimizer_v2 import adam -from keras.optimizers.optimizer_v2 import adamax -from keras.optimizers.optimizer_v2 import ftrl -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_v2 -from keras.optimizers.optimizer_v2 import nadam -from keras.optimizers.optimizer_v2 import rmsprop +import tensorflow.compat.v2 as tf +from absl.testing import parameterized -maybe_distribute = tf.__internal__.test.combinations.combine(distribution=[ - tf.__internal__.distribute.combinations.default_strategy, - tf.__internal__.distribute.combinations.mirrored_strategy_with_cpu_1_and_2 -]) +from keras.layers import Dense +from keras.mixed_precision import autocast_variable +from keras.optimizers.legacy import adadelta +from keras.optimizers.legacy import adagrad +from keras.optimizers.legacy import adam +from keras.optimizers.legacy import adamax +from keras.optimizers.legacy import ftrl +from keras.optimizers.legacy import gradient_descent as gradient_descent_v2 +from keras.optimizers.legacy import nadam +from keras.optimizers.legacy import rmsprop + +maybe_distribute = tf.__internal__.test.combinations.combine( + distribution=[ + tf.__internal__.distribute.combinations.default_strategy, + tf.__internal__.distribute.combinations.mirrored_strategy_with_two_cpus, # noqa: E501 + ] +) def get_var(val, dtype, name=None): - return tf.Variable(val, dtype=dtype, name=name) - - -def set_cpu_logical_devices_to_at_least(num): - """Create cpu logical devices of at least a given number.""" - physical_devices = tf.config.list_physical_devices('CPU') - if not physical_devices: - raise RuntimeError('No CPU found') - if len(physical_devices) >= num: - return - # By default each physical device corresponds to one logical device. We create - # multiple logical devices for the last physical device so that we have `num` - # logical devices. 
- num = num - len(physical_devices) + 1 - logical_devices = [] - for _ in range(num): - logical_devices.append(tf.config.LogicalDeviceConfiguration()) - # Create logical devices from the last device since sometimes the first GPU - # is the primary graphic card and may have less memory available. - tf.config.set_logical_device_configuration(physical_devices[-1], logical_devices) - - -@tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(mode=['graph', 'eager'])) -class AutoCastVariableTest(tf.test.TestCase, parameterized.TestCase): + return tf.Variable(val, dtype=dtype, name=name) - def setUp(self): - set_cpu_logical_devices_to_at_least(3) - super().setUp() - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_read(self, distribution): - with distribution.scope(): - x = get_var(1., tf.float32) - x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - - # outside of auto cast scope. - self.assertEqual(x.dtype, tf.float32) - self.assertEqual(x.value().dtype, tf.float32) - self.assertEqual(x.read_value().dtype, tf.float32) - self.assertEqual(tf.identity(x).dtype, tf.float32) - - # within auto cast scope of different dtype - with autocast_variable.enable_auto_cast_variables(tf.float16): - self.assertEqual(x.dtype, tf.float32) - self.assertEqual(x.value().dtype, tf.float16) - self.assertEqual(x.read_value().dtype, tf.float16) - self.assertEqual(tf.identity(x).dtype, tf.float16) - - # within auto cast scope of same dtype - with autocast_variable.enable_auto_cast_variables(tf.float32): - self.assertEqual(x.dtype, tf.float32) - self.assertEqual(x.value().dtype, tf.float32) - self.assertEqual(x.read_value().dtype, tf.float32) - self.assertEqual(tf.identity(x).dtype, tf.float32) - - def test_sparse_reads(self): - x = get_var([1., 2], tf.float32) - # DistributedVariables do not support sparse_read or gather_nd, so we pass - # distribute=False - x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - - self.assertEqual(x.sparse_read([0]).dtype, tf.float32) - self.assertEqual(x.gather_nd([0]).dtype, tf.float32) - - with autocast_variable.enable_auto_cast_variables(tf.float16): - self.assertEqual(x.sparse_read([0]).dtype, tf.float16) - self.assertEqual(x.gather_nd([0]).dtype, tf.float16) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_read_nested_scopes(self, distribution): - with distribution.scope(): - x = get_var(1., tf.float32) - x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - - with autocast_variable.enable_auto_cast_variables(tf.float16): - self.assertEqual(x.read_value().dtype, tf.float16) - - with autocast_variable.enable_auto_cast_variables(tf.float32): - self.assertEqual(x.read_value().dtype, tf.float32) - - self.assertEqual(x.read_value().dtype, tf.float16) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_dtype_is_not_string(self, distribution): - with distribution.scope(): - x = get_var(1., tf.float32) - x = autocast_variable.create_autocast_variable(x) - self.assertEqual(x.dtype, tf.float32) - self.assertIsInstance(x.dtype, tf.DType) - self.assertEqual(x.true_dtype, tf.float32) - self.assertIsInstance(x.true_dtype, tf.DType) - - dtype = tf.float16 - with autocast_variable.enable_auto_cast_variables(dtype): - self.assertEqual(x.dtype, tf.float32) - self.assertIsInstance(x.dtype, tf.DType) - self.assertEqual(x.true_dtype, tf.float32) - self.assertIsInstance(x.true_dtype, 
tf.DType) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_method_delegations(self, distribution): - # Test AutoCastVariable correctly delegates Variable methods to the - # underlying variable. - with self.test_session(), distribution.scope(): - for read_dtype in (tf.float32, tf.float16): - if tf.distribute.has_strategy() and not tf.executing_eagerly(): - # MirroredVariable.assign will (incorrectly) return a Mirrored value - # instead of a MirroredVariable in graph mode. - # So we cannot properly wrap it in an AutoCastVariable. - evaluate = self.evaluate - else: - def evaluate(var): - self.assertIsInstance(var, autocast_variable.AutoCastVariable) - self.assertEqual(tf.identity(var).dtype, read_dtype) # pylint: disable=cell-var-from-loop - return self.evaluate(var) +@tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine(mode=["graph", "eager"]) +) +class AutoCastVariableTest(tf.test.TestCase, parameterized.TestCase): + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_read(self, distribution): + with distribution.scope(): + x = get_var(1.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + self.evaluate(x.initializer) - x = get_var(7., tf.float32) + # outside of auto cast scope. + self.assertEqual(x.dtype, tf.float32) + self.assertEqual(x.value().dtype, tf.float32) + self.assertEqual(x.read_value().dtype, tf.float32) + self.assertEqual(tf.identity(x).dtype, tf.float32) + + # within auto cast scope of different dtype + with autocast_variable.enable_auto_cast_variables(tf.float16): + self.assertEqual(x.dtype, tf.float32) + self.assertEqual(x.value().dtype, tf.float16) + self.assertEqual(x.read_value().dtype, tf.float16) + self.assertEqual(tf.identity(x).dtype, tf.float16) + + # within auto cast scope of same dtype + with autocast_variable.enable_auto_cast_variables(tf.float32): + self.assertEqual(x.dtype, tf.float32) + self.assertEqual(x.value().dtype, tf.float32) + self.assertEqual(x.read_value().dtype, tf.float32) + self.assertEqual(tf.identity(x).dtype, tf.float32) + + def test_sparse_reads(self): + x = get_var([1.0, 2], tf.float32) + # DistributedVariables do not support sparse_read or gather_nd, so we + # pass distribute=False x = autocast_variable.create_autocast_variable(x) - with autocast_variable.enable_auto_cast_variables(read_dtype): - self.evaluate(x.initializer) - self.assertEqual(self.evaluate(x.value()), 7) - self.assertEqual(self.evaluate(x.read_value()), 7) - self.assertTrue(x.trainable) - self.assertEqual(x.synchronization, x._variable.synchronization) - self.assertEqual(x.aggregation, x._variable.aggregation) - self.assertEqual(self.evaluate(x.initialized_value()), 7) - if not tf.executing_eagerly(): - if not tf.distribute.has_strategy(): - # These functions are not supported for DistributedVariables - x.load(9) - self.assertEqual(x.eval(), 9) - self.assertEqual(self.evaluate(x.initial_value), 7) - self.assertEqual(x.op, x._variable.op) - self.assertEqual(x.graph, x._variable.graph) - if not tf.distribute.has_strategy(): - # These attributes are not supported for DistributedVariables - self.assertIsNone(x.constraint) - self.assertEqual(x.initializer, x._variable.initializer) - self.assertEqual(evaluate(x.assign(8)), 8) - self.assertEqual(evaluate(x.assign_add(2)), 10) - self.assertEqual(evaluate(x.assign_sub(3)), 7) - self.assertEqual(x.name, x._variable.name) - self.assertEqual(x.device, x._variable.device) - self.assertEqual(x.shape, ()) - 
self.assertEqual(x.get_shape(), ()) - - if not tf.distribute.has_strategy(): - # Test scatter_* methods. These are not supported for - # DistributedVariables - x = get_var([7, 8], tf.float32) - x = autocast_variable.create_autocast_variable(x) - with autocast_variable.enable_auto_cast_variables(read_dtype): + self.evaluate(x.initializer) + + self.assertEqual(x.sparse_read([0]).dtype, tf.float32) + self.assertEqual(x.gather_nd([0]).dtype, tf.float32) + + with autocast_variable.enable_auto_cast_variables(tf.float16): + self.assertEqual(x.sparse_read([0]).dtype, tf.float16) + self.assertEqual(x.gather_nd([0]).dtype, tf.float16) + + def test_tf_function_with_variable_and_autocast_variable(self): + ones = tf.ones((2, 2)) + layer1 = Dense(2, dtype="float32") + layer2 = Dense(2, dtype="mixed_float16") + layer1(ones) + layer2(ones) + + @tf.function + def f(x): + return x + 1 + + self.assertEqual(f(layer1.kernel).dtype, tf.dtypes.float32) + self.assertEqual(f(layer2.kernel).dtype, tf.dtypes.float32) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_read_nested_scopes(self, distribution): + with distribution.scope(): + x = get_var(1.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + self.evaluate(x.initializer) + + with autocast_variable.enable_auto_cast_variables(tf.float16): + self.assertEqual(x.read_value().dtype, tf.float16) + + with autocast_variable.enable_auto_cast_variables(tf.float32): + self.assertEqual(x.read_value().dtype, tf.float32) + + self.assertEqual(x.read_value().dtype, tf.float16) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_dtype_is_not_string(self, distribution): + with distribution.scope(): + x = get_var(1.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + self.assertEqual(x.dtype, tf.float32) + self.assertIsInstance(x.dtype, tf.DType) + self.assertEqual(x.true_dtype, tf.float32) + self.assertIsInstance(x.true_dtype, tf.DType) + + dtype = tf.float16 + with autocast_variable.enable_auto_cast_variables(dtype): + self.assertEqual(x.dtype, tf.float32) + self.assertIsInstance(x.dtype, tf.DType) + self.assertEqual(x.true_dtype, tf.float32) + self.assertIsInstance(x.true_dtype, tf.DType) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_method_delegations(self, distribution): + # Test AutoCastVariable correctly delegates Variable methods to the + # underlying variable. + with self.test_session(), distribution.scope(): + for read_dtype in (tf.float32, tf.float16): + if tf.distribute.has_strategy() and not tf.executing_eagerly(): + # MirroredVariable.assign will (incorrectly) return a + # Mirrored value instead of a MirroredVariable in graph + # mode. So we cannot properly wrap it in an + # AutoCastVariable. 
+ evaluate = self.evaluate + else: + + def evaluate(var): + self.assertIsInstance( + var, autocast_variable.AutoCastVariable + ) + self.assertEqual(tf.identity(var).dtype, read_dtype) + return self.evaluate(var) + + x = get_var(7.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + with autocast_variable.enable_auto_cast_variables(read_dtype): + self.evaluate(x.initializer) + self.assertEqual(self.evaluate(x.value()), 7) + self.assertEqual(self.evaluate(x.read_value()), 7) + self.assertTrue(x.trainable) + self.assertEqual( + x.synchronization, x._variable.synchronization + ) + self.assertEqual(x.aggregation, x._variable.aggregation) + self.assertEqual(self.evaluate(x.read_value()), 7) + if not tf.executing_eagerly(): + if not tf.distribute.has_strategy(): + # These functions are not supported for + # DistributedVariables + x.load(9) + self.assertEqual(x.eval(), 9) + self.assertEqual(self.evaluate(x.initial_value), 7) + self.assertEqual(x.op, x._variable.op) + self.assertEqual(x.graph, x._variable.graph) + if not tf.distribute.has_strategy(): + # These attributes are not supported for + # DistributedVariables + self.assertIsNone(x.constraint) + self.assertEqual(x.initializer, x._variable.initializer) + self.assertEqual(evaluate(x.assign(8)), 8) + self.assertEqual(evaluate(x.assign_add(2)), 10) + self.assertEqual(evaluate(x.assign_sub(3)), 7) + self.assertEqual(x.name, x._variable.name) + self.assertEqual(x.device, x._variable.device) + self.assertEqual(x.shape, ()) + self.assertEqual(x.get_shape(), ()) + + if not tf.distribute.has_strategy(): + # Test scatter_* methods. These are not supported for + # DistributedVariables + x = get_var([7, 8], tf.float32) + x = autocast_variable.create_autocast_variable(x) + with autocast_variable.enable_auto_cast_variables( + read_dtype + ): + self.evaluate(x.initializer) + self.assertAllEqual(self.evaluate(x.value()), [7, 8]) + + def slices(val, index): + return tf.IndexedSlices( + values=tf.constant(val, dtype=tf.float32), + indices=tf.constant(index, dtype=tf.int32), + dense_shape=tf.constant([2], dtype=tf.int32), + ) + + self.assertAllEqual( + evaluate(x.scatter_sub(slices(1.0, 0))), [6, 8] + ) + self.assertAllEqual( + evaluate(x.scatter_add(slices(1.0, 0))), [7, 8] + ) + self.assertAllEqual( + evaluate(x.scatter_max(slices(9.0, 1))), [7, 9] + ) + self.assertAllEqual( + evaluate(x.scatter_min(slices(8.0, 1))), [7, 8] + ) + self.assertAllEqual( + evaluate(x.scatter_mul(slices(2.0, 1))), [7, 16] + ) + self.assertAllEqual( + evaluate(x.scatter_div(slices(2.0, 1))), [7, 8] + ) + self.assertAllEqual( + evaluate(x.scatter_update(slices(4.0, 1))), [7, 4] + ) + self.assertAllEqual( + evaluate(x.scatter_nd_sub([[0], [1]], [1.0, 2.0])), + [6, 2], + ) + self.assertAllEqual( + evaluate(x.scatter_nd_add([[0], [1]], [1.0, 2.0])), + [7, 4], + ) + self.assertAllEqual( + evaluate( + x.scatter_nd_update([[0], [1]], [1.0, 2.0]) + ), + [1, 2], + ) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_operator_overloads(self, distribution): + with distribution.scope(): + for read_dtype in (tf.float32, tf.float16): + x = get_var(7.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + with autocast_variable.enable_auto_cast_variables(read_dtype): + self.evaluate(x.initializer) + self.assertAlmostEqual(8, self.evaluate(x + 1)) + self.assertAlmostEqual(10, self.evaluate(3 + x)) + self.assertAlmostEqual(14, self.evaluate(x + x)) + self.assertAlmostEqual(5, self.evaluate(x - 2)) + self.assertAlmostEqual(6, 
self.evaluate(13 - x)) + self.assertAlmostEqual(0, self.evaluate(x - x)) + self.assertAlmostEqual(14, self.evaluate(x * 2)) + self.assertAlmostEqual(21, self.evaluate(3 * x)) + self.assertAlmostEqual(49, self.evaluate(x * x)) + self.assertAlmostEqual(3.5, self.evaluate(x / 2)) + self.assertAlmostEqual(1.5, self.evaluate(10.5 / x)) + self.assertAlmostEqual(3, self.evaluate(x // 2)) + self.assertAlmostEqual(2, self.evaluate(15 // x)) + if read_dtype == tf.float32: + # The "mod" operator does not support float16 + self.assertAlmostEqual(1, self.evaluate(x % 2)) + self.assertAlmostEqual(2, self.evaluate(16 % x)) + self.assertTrue(self.evaluate(x < 12)) + self.assertTrue(self.evaluate(x <= 12)) + self.assertFalse(self.evaluate(x > 12)) + self.assertFalse(self.evaluate(x >= 12)) + self.assertFalse(self.evaluate(12 < x)) + self.assertFalse(self.evaluate(12 <= x)) + self.assertTrue(self.evaluate(12 > x)) + self.assertTrue(self.evaluate(12 >= x)) + self.assertAlmostEqual( + 343, self.evaluate(pow(x, 3)), places=4 + ) + self.assertAlmostEqual( + 128, self.evaluate(pow(2, x)), places=4 + ) + self.assertAlmostEqual(-7, self.evaluate(-x)) + self.assertAlmostEqual(7, self.evaluate(abs(x))) + + x = get_var([7, 8, 9], tf.float32) + x = autocast_variable.create_autocast_variable(x) + self.evaluate(x.initializer) + self.assertEqual(self.evaluate(x[1]), 8) + if tf.__internal__.tf2.enabled() and tf.executing_eagerly(): + self.assertAllEqual( + x == [7.0, 8.0, 10.0], [True, True, False] + ) + self.assertAllEqual( + x != [7.0, 8.0, 10.0], [False, False, True] + ) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_assign(self, distribution): + with distribution.scope(): + x = get_var(0.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + self.evaluate(x.initializer) + + # outside of auto cast scope. 
+ v1 = tf.constant(3.0, dtype=tf.float32) + v2 = tf.constant(3.0, dtype=tf.float16) + + def run_and_check(): + # Assign float32 values + self.assertAllClose(3.0, self.evaluate(x.assign(v1))) + self.assertAllClose(3.0 * 2, self.evaluate(x.assign_add(v1))) + self.assertAllClose(3.0, self.evaluate(x.assign_sub(v1))) + + # Attempt to assign float16 values + with self.assertRaisesRegex( + ValueError, + "conversion requested dtype float32 for Tensor with dtype " + "float16", + ): + self.evaluate(x.assign(v2)) + with self.assertRaisesRegex( + ValueError, + "conversion requested dtype float32 for Tensor with dtype " + "float16", + ): + self.evaluate(x.assign_add(v2)) + with self.assertRaisesRegex( + ValueError, + "conversion requested dtype float32 for Tensor with dtype " + "float16", + ): + self.evaluate(x.assign_sub(v2)) + + # Assign Python floats + self.assertAllClose(0.0, self.evaluate(x.assign(0.0))) + self.assertAllClose(3.0, self.evaluate(x.assign(3.0))) + self.assertAllClose(3.0 * 2, self.evaluate(x.assign_add(3.0))) + self.assertAllClose(3.0, self.evaluate(x.assign_sub(3.0))) + + # Assign multiple times + # This currently doesn't work in graph mode if a strategy is + # used + if not tf.distribute.has_strategy() or tf.executing_eagerly(): + assign = x.assign(1.0) + self.assertAllClose(1.0, self.evaluate(assign)) + self.assertAllClose(0.0, self.evaluate(assign.assign(0.0))) + assign_add = x.assign_add(3.0) + self.assertAllClose(3.0, self.evaluate(assign_add)) + self.assertAllClose( + 3.0 * 3, + self.evaluate(x.assign_add(3.0).assign_add(3.0)), + ) + self.assertAllClose(3.0 * 3, x) + assign_sub = x.assign_sub(3.0) + self.assertAllClose(3.0 * 2, self.evaluate(assign_sub)) + self.assertAllClose( + 0.0, self.evaluate(x.assign_sub(3.0).assign_sub(3.0)) + ) + + # Assign with read_value=False + self.assertIsNone( + self.evaluate(x.assign(1.0, read_value=False)) + ) + self.assertAllClose(1.0, self.evaluate(x)) + self.assertIsNone( + self.evaluate(x.assign_add(2.0, read_value=False)) + ) + self.assertAllClose(3.0, self.evaluate(x)) + self.assertIsNone( + self.evaluate(x.assign_sub(3.0, read_value=False)) + ) + self.assertAllClose(0.0, self.evaluate(x)) + + # Use the tf.assign functions instead of the var.assign methods. + self.assertAllClose( + 0.0, self.evaluate(tf.compat.v1.assign(x, 0.0)) + ) + self.assertAllClose( + 3.0, self.evaluate(tf.compat.v1.assign(x, 3.0)) + ) + self.assertAllClose( + 3.0 * 2, self.evaluate(tf.compat.v1.assign_add(x, 3.0)) + ) + self.assertAllClose( + 3.0, self.evaluate(tf.compat.v1.assign_sub(x, 3.0)) + ) + + run_and_check() + # reset x + self.evaluate(x.assign(0.0)) + # within auto cast scope. 
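# The invariant run_and_check() exercises just below: assignment always goes
# through the variable's true storage dtype (float32 here), even when reads
# are cast to float16 inside an autocast scope. The same dtype strictness can
# be seen on a plain tf.Variable (sketch; assumes TensorFlow 2.x with eager
# execution):
import tensorflow as tf

v = tf.Variable(0.0, dtype=tf.float32)
v.assign(tf.constant(3.0, dtype=tf.float32))  # OK: dtype matches storage
try:
    v.assign(tf.constant(3.0, dtype=tf.float16))  # mismatched dtype
except ValueError as e:
    print("float16 assign rejected:", e)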
+ with autocast_variable.enable_auto_cast_variables(tf.float16): + # assign still expect float32 value even if in float16 scope + run_and_check() + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_assign_tf_function(self, distribution): + if not tf.executing_eagerly(): + self.skipTest("Test is not compatible with graph mode") + + with distribution.scope(): + x = get_var(0.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + + @tf.function + def run_assign(): + return ( + x.assign(1.0) + .assign_add(3.0) + .assign_add(3.0) + .assign_sub(2.0) + ) + + with autocast_variable.enable_auto_cast_variables(tf.float16): + self.assertAllClose(5.0, self.evaluate(run_assign())) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_op_attribute(self, distribution): + with distribution.scope(): + x = get_var(0.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + + # Variable.op raises an AttributeError in Eager mode and is an op in + # graph mode. Variable.assign(...).op is None in Eager mode and an + # op in Graph mode or a tf.function. We test this is also true of + # AutoCastVariable. + if tf.executing_eagerly(): + with self.assertRaises(AttributeError): + x.op + self.assertIsNone(x.assign(1.0).op) + self.assertIsNone(x.assign_add(1.0).op) + self.assertIsNone(x.assign_sub(1.0).op) + else: + self.assertIsNotNone(x.op) + self.assertIsNotNone(x.assign(1.0).op) + self.assertIsNotNone(x.assign_add(1.0).op) + self.assertIsNotNone(x.assign_sub(1.0).op) + + @tf.function + def func(): + self.assertIsNotNone(x.assign(1.0).op) + self.assertIsNotNone(x.assign_add(1.0).op) + self.assertIsNotNone(x.assign_sub(1.0).op) + + func() + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_tf_function_control_dependencies(self, distribution): + if not tf.executing_eagerly(): + self.skipTest("Test is not compatible with graph mode") + + with distribution.scope(): + x = get_var(0.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + + @tf.function + def func(): + update = x.assign_add(1.0) + with tf.control_dependencies([update]): + x.assign_add(1.0) + + func() + self.assertAllClose(2.0, self.evaluate(x)) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_assign_stays_in_true_dtype(self, distribution): + with distribution.scope(): + x = get_var(1.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) self.evaluate(x.initializer) - self.assertAllEqual(self.evaluate(x.value()), [7, 8]) - - def slices(val, index): - return tf.IndexedSlices( - values=tf.constant(val, dtype=tf.float32), - indices=tf.constant(index, dtype=tf.int32), - dense_shape=tf.constant([2], dtype=tf.int32)) - - self.assertAllEqual(evaluate(x.scatter_sub(slices(1., 0))), [6, 8]) - self.assertAllEqual(evaluate(x.scatter_add(slices(1., 0))), [7, 8]) - self.assertAllEqual(evaluate(x.scatter_max(slices(9., 1))), [7, 9]) - self.assertAllEqual(evaluate(x.scatter_min(slices(8., 1))), [7, 8]) - self.assertAllEqual(evaluate(x.scatter_mul(slices(2., 1))), [7, 16]) - self.assertAllEqual(evaluate(x.scatter_div(slices(2., 1))), [7, 8]) - self.assertAllEqual( - evaluate(x.scatter_update(slices(4., 1))), [7, 4]) - self.assertAllEqual( - evaluate(x.scatter_nd_sub([[0], [1]], [1., 2.])), [6, 2]) - self.assertAllEqual( - evaluate(x.scatter_nd_add([[0], [1]], [1., 2.])), [7, 4]) - self.assertAllEqual( - evaluate(x.scatter_nd_update([[0], [1]], [1., 2.])), [1, 2]) - - 
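# Why the next test matters numerically: float16 has eps = 2**-10, so
# 1.0 + eps/2 rounds back to exactly 1.0 in half precision, while float32
# keeps the update. Quick NumPy check of that claim:
import numpy as np

small_val = np.finfo(np.float16).eps / 2  # 2**-11 ~= 0.000488
print(np.float16(1.0) + np.float16(small_val))  # 1.0 -- update lost in fp16
print(np.float32(1.0) + np.float32(small_val))  # 1.0004883 -- kept in fp32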
@tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_operator_overloads(self, distribution): - with distribution.scope(): - for read_dtype in (tf.float32, tf.float16): - x = get_var(7., tf.float32) + # small_val is a value such that 1.0 + small_val == 1.0 in fp16, but + # not in fp32 + small_val = np.finfo("float16").eps / 2 + small_tensor = tf.constant(small_val, dtype=tf.float32) + with autocast_variable.enable_auto_cast_variables(tf.float16): + # Variable should be increased, despite it appearing to be the + # same float16 value. + self.evaluate(x.assign(1.0 + small_tensor)) + self.assertEqual(1.0, self.evaluate(x.value())) + self.assertEqual(1.0 + small_val, self.evaluate(x)) + + self.evaluate(x.assign(1.0)) + with autocast_variable.enable_auto_cast_variables(tf.float16): + self.evaluate(x.assign_add(small_tensor)) + self.assertEqual(1.0, self.evaluate(x.value())) + self.assertEqual(1.0 + small_val, self.evaluate(x)) + + def test_thread_local_autocast_dtype(self): + x = get_var(1.0, tf.float32) x = autocast_variable.create_autocast_variable(x) - with autocast_variable.enable_auto_cast_variables(read_dtype): - self.evaluate(x.initializer) - self.assertAlmostEqual(8, self.evaluate(x + 1)) - self.assertAlmostEqual(10, self.evaluate(3 + x)) - self.assertAlmostEqual(14, self.evaluate(x + x)) - self.assertAlmostEqual(5, self.evaluate(x - 2)) - self.assertAlmostEqual(6, self.evaluate(13 - x)) - self.assertAlmostEqual(0, self.evaluate(x - x)) - self.assertAlmostEqual(14, self.evaluate(x * 2)) - self.assertAlmostEqual(21, self.evaluate(3 * x)) - self.assertAlmostEqual(49, self.evaluate(x * x)) - self.assertAlmostEqual(3.5, self.evaluate(x / 2)) - self.assertAlmostEqual(1.5, self.evaluate(10.5 / x)) - self.assertAlmostEqual(3, self.evaluate(x // 2)) - self.assertAlmostEqual(2, self.evaluate(15 // x)) - if read_dtype == tf.float32: - # The "mod" operator does not support float16 - self.assertAlmostEqual(1, self.evaluate(x % 2)) - self.assertAlmostEqual(2, self.evaluate(16 % x)) - self.assertTrue(self.evaluate(x < 12)) - self.assertTrue(self.evaluate(x <= 12)) - self.assertFalse(self.evaluate(x > 12)) - self.assertFalse(self.evaluate(x >= 12)) - self.assertFalse(self.evaluate(12 < x)) - self.assertFalse(self.evaluate(12 <= x)) - self.assertTrue(self.evaluate(12 > x)) - self.assertTrue(self.evaluate(12 >= x)) - self.assertAlmostEqual(343, self.evaluate(pow(x, 3)), places=4) - self.assertAlmostEqual(128, self.evaluate(pow(2, x)), places=4) - self.assertAlmostEqual(-7, self.evaluate(-x)) - self.assertAlmostEqual(7, self.evaluate(abs(x))) - - x = get_var([7, 8, 9], tf.float32) - x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - self.assertEqual(self.evaluate(x[1]), 8) - if tf.__internal__.tf2.enabled() and tf.executing_eagerly(): - self.assertAllEqual(x == [7., 8., 10.], [True, True, False]) - self.assertAllEqual(x != [7., 8., 10.], [False, False, True]) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_assign(self, distribution): - with distribution.scope(): - x = get_var(0., tf.float32) - x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - - # outside of auto cast scope. - v1 = tf.constant(3., dtype=tf.float32) - v2 = tf.constant(3., dtype=tf.float16) - - def run_and_check(): - # Assign float32 values - self.assertAllClose(3., self.evaluate(x.assign(v1))) - self.assertAllClose(3. 
* 2, self.evaluate(x.assign_add(v1))) - self.assertAllClose(3., self.evaluate(x.assign_sub(v1))) - - # Attempt to assign float16 values - with self.assertRaisesRegex( - ValueError, - 'conversion requested dtype float32 for Tensor with dtype float16'): - self.evaluate(x.assign(v2)) - with self.assertRaisesRegex( - ValueError, - 'conversion requested dtype float32 for Tensor with dtype float16'): - self.evaluate(x.assign_add(v2)) - with self.assertRaisesRegex( - ValueError, - 'conversion requested dtype float32 for Tensor with dtype float16'): - self.evaluate(x.assign_sub(v2)) - - # Assign Python floats - self.assertAllClose(0., self.evaluate(x.assign(0.))) - self.assertAllClose(3., self.evaluate(x.assign(3.))) - self.assertAllClose(3. * 2, self.evaluate(x.assign_add(3.))) - self.assertAllClose(3., self.evaluate(x.assign_sub(3.))) - - # Assign multiple times - # This currently doesn't work in graph mode if a strategy is used - if not tf.distribute.has_strategy() or tf.executing_eagerly(): - assign = x.assign(1.) - self.assertAllClose(1., self.evaluate(assign)) - self.assertAllClose(0., self.evaluate(assign.assign(0.))) - assign_add = x.assign_add(3.) - self.assertAllClose(3., self.evaluate(assign_add)) - self.assertAllClose(3. * 3, - self.evaluate(x.assign_add(3.).assign_add(3.))) - self.assertAllClose(3. * 3, x) - assign_sub = x.assign_sub(3.) - self.assertAllClose(3. * 2, self.evaluate(assign_sub)) - self.assertAllClose(0., - self.evaluate(x.assign_sub(3.).assign_sub(3.))) - - # Assign with read_value=False - self.assertIsNone(self.evaluate(x.assign(1., read_value=False))) - self.assertAllClose(1., self.evaluate(x)) - self.assertIsNone(self.evaluate(x.assign_add(2., read_value=False))) - self.assertAllClose(3., self.evaluate(x)) - self.assertIsNone(self.evaluate(x.assign_sub(3., read_value=False))) - self.assertAllClose(0., self.evaluate(x)) - - # Use the tf.assign functions instead of the var.assign methods. - self.assertAllClose(0., self.evaluate(tf.compat.v1.assign(x, 0.))) - self.assertAllClose(3., self.evaluate(tf.compat.v1.assign(x, 3.))) - self.assertAllClose(3. * 2, - self.evaluate(tf.compat.v1.assign_add(x, 3.))) - self.assertAllClose(3., self.evaluate(tf.compat.v1.assign_sub(x, 3.))) - - run_and_check() - # reset x - self.evaluate(x.assign(0.)) - # within auto cast scope. - with autocast_variable.enable_auto_cast_variables(tf.float16): - # assign still expect float32 value even if in float16 scope - run_and_check() - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_assign_tf_function(self, distribution): - if not tf.executing_eagerly(): - self.skipTest('Test is not compatible with graph mode') - - with distribution.scope(): - x = get_var(0., tf.float32) - x = autocast_variable.create_autocast_variable(x) - - @tf.function - def run_assign(): - return x.assign(1.).assign_add(3.).assign_add(3.).assign_sub(2.) - - with autocast_variable.enable_auto_cast_variables(tf.float16): - self.assertAllClose(5., self.evaluate(run_assign())) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_op_attribute(self, distribution): - with distribution.scope(): - x = get_var(0., tf.float32) - x = autocast_variable.create_autocast_variable(x) - - # Variable.op raises an AttributeError in Eager mode and is an op in graph - # mode. Variable.assign(...).op is None in Eager mode and an op in Graph - # mode or a tf.function. We test this is also true of AutoCastVariable. 
- if tf.executing_eagerly(): - with self.assertRaises(AttributeError): - x.op # pylint: disable=pointless-statement - self.assertIsNone(x.assign(1.0).op) - self.assertIsNone(x.assign_add(1.0).op) - self.assertIsNone(x.assign_sub(1.0).op) - else: - self.assertIsNotNone(x.op) - self.assertIsNotNone(x.assign(1.0).op) - self.assertIsNotNone(x.assign_add(1.0).op) - self.assertIsNotNone(x.assign_sub(1.0).op) - - @tf.function - def func(): - self.assertIsNotNone(x.assign(1.0).op) - self.assertIsNotNone(x.assign_add(1.0).op) - self.assertIsNotNone(x.assign_sub(1.0).op) - - func() - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_tf_function_control_dependencies(self, distribution): - if not tf.executing_eagerly(): - self.skipTest('Test is not compatible with graph mode') - - with distribution.scope(): - x = get_var(0., tf.float32) - x = autocast_variable.create_autocast_variable(x) - - @tf.function - def func(): - update = x.assign_add(1.) - with tf.control_dependencies([update]): - x.assign_add(1.) - - func() - self.assertAllClose(2., self.evaluate(x)) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_assign_stays_in_true_dtype(self, distribution): - with distribution.scope(): - x = get_var(1., tf.float32) - x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - # small_val is a value such that 1.0 + small_val == 1.0 in fp16, but not - # in fp32 - small_val = np.finfo('float16').eps / 2 - small_tensor = tf.constant(small_val, dtype=tf.float32) - with autocast_variable.enable_auto_cast_variables(tf.float16): - # Variable should be increased, despite it appearing to be the same - # float16 value. - self.evaluate(x.assign(1. + small_tensor)) - self.assertEqual(1., self.evaluate(x.value())) - self.assertEqual(1. + small_val, self.evaluate(x)) - - self.evaluate(x.assign(1.)) - with autocast_variable.enable_auto_cast_variables(tf.float16): - self.evaluate(x.assign_add(small_tensor)) - self.assertEqual(1., self.evaluate(x.value())) - self.assertEqual(1. + small_val, self.evaluate(x)) - - def test_thread_local_autocast_dtype(self): - x = get_var(1., tf.float32) - x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - - with autocast_variable.enable_auto_cast_variables(tf.float16): - self.assertEqual(tf.identity(x).dtype, tf.float16) - - # New threads should not see the modified value of the autocast dtype. - var_dtype = None - def f(): - nonlocal var_dtype - var_dtype = x._cast_dtype - thread = threading.Thread(target=f) - thread.start() - thread.join() - self.assertEqual(var_dtype, tf.float32) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_checkpoint(self, distribution): - with self.test_session(): - with distribution.scope(): - x = get_var(1., tf.float32) + self.evaluate(x.initializer) + + with autocast_variable.enable_auto_cast_variables(tf.float16): + self.assertEqual(tf.identity(x).dtype, tf.float16) + + # New threads should not see the modified value of the autocast + # dtype. 
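# threading.local gives every thread its own slot, which is the mechanism
# behind the assertion below: a freshly started thread sees no autocast dtype
# even while the main thread is inside a float16 scope. Standalone
# demonstration of that behavior:
import threading

tls = threading.local()
tls.dtype = "float16"  # set on the main thread only

seen = []


def read_dtype():
    seen.append(getattr(tls, "dtype", None))


worker = threading.Thread(target=read_dtype)
worker.start()
worker.join()
print(seen)  # [None] -- the new thread never saw the main thread's value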
+ var_dtype = None + + def f(): + nonlocal var_dtype + var_dtype = x._cast_dtype + + thread = threading.Thread(target=f) + thread.start() + thread.join() + self.assertEqual(var_dtype, tf.float32) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_checkpoint(self, distribution): + with self.test_session(): + with distribution.scope(): + x = get_var(1.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + self.evaluate(x.initializer) + self.evaluate(x.assign(123.0)) + + checkpoint = tf.train.Checkpoint(x=x) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + save_path = checkpoint.save(prefix) + self.evaluate(x.assign(234.0)) + checkpoint.restore(save_path).assert_consumed().run_restore_ops() + self.assertEqual(self.evaluate(x), 123.0) + + @tf.__internal__.distribute.combinations.generate(maybe_distribute) + def test_invalid_wrapped_variable(self, distribution): + with distribution.scope(): + # Wrap a non-variable + with self.assertRaisesRegex(ValueError, "variable must be of type"): + x = tf.constant([1.0], dtype=tf.float32) + autocast_variable.create_autocast_variable(x) + + # Wrap a non-floating point variable + with self.assertRaisesRegex( + ValueError, "variable must be a floating point" + ): + x = get_var(1, tf.int32) + autocast_variable.create_autocast_variable(x) + + def test_repr(self): + # We do not test with DistributionStrategy because we do not want to + # rely on the exact __repr__ output of a DistributedVariable. + x = get_var(1.0, tf.float32, name="x") x = autocast_variable.create_autocast_variable(x) - self.evaluate(x.initializer) - self.evaluate(x.assign(123.)) - - checkpoint = tf.train.Checkpoint(x=x) - prefix = os.path.join(self.get_temp_dir(), 'ckpt') - save_path = checkpoint.save(prefix) - self.evaluate(x.assign(234.)) - checkpoint.restore(save_path).assert_consumed().run_restore_ops() - self.assertEqual(self.evaluate(x), 123.) - - @tf.__internal__.distribute.combinations.generate(maybe_distribute) - def test_invalid_wrapped_variable(self, distribution): - with distribution.scope(): - # Wrap a non-variable - with self.assertRaisesRegex(ValueError, 'variable must be of type'): - x = tf.constant([1.], dtype=tf.float32) - autocast_variable.create_autocast_variable(x) - - # Wrap a non-floating point variable - with self.assertRaisesRegex(ValueError, - 'variable must be a floating point'): - x = get_var(1, tf.int32) - autocast_variable.create_autocast_variable(x) - - def test_repr(self): - # We do not test with DistributionStrategy because we do not want to rely on - # the exact __repr__ output of a DistributedVariable. 
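# The checkpoint test above hinges on tf.train.Checkpoint treating the
# wrapper like its underlying variable: save at 123.0, overwrite, restore,
# and the saved value comes back. The same round trip with a plain variable
# (sketch, eager mode):
import os
import tempfile

import tensorflow as tf

v = tf.Variable(123.0)
ckpt = tf.train.Checkpoint(x=v)
save_path = ckpt.save(os.path.join(tempfile.mkdtemp(), "ckpt"))
v.assign(234.0)
ckpt.restore(save_path).assert_consumed()
print(v.numpy())  # 123.0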
-    x = get_var(1., tf.float32, name='x')
-    x = autocast_variable.create_autocast_variable(x)
-    if tf.executing_eagerly():
-      self.assertStartsWith(
-          repr(x),
-          "<AutoCastVariable 'x:0' shape=() dtype=float32 "
-          "dtype_to_cast_to=float32, numpy=")
-      with autocast_variable.enable_auto_cast_variables(tf.float16):
-        self.assertStartsWith(
-            repr(x),
-            "<AutoCastVariable 'x:0' shape=() dtype=float32 "
-            "dtype_to_cast_to=float16, numpy=")
-    else:
-      self.assertEqual(
-          repr(x),
-          "<AutoCastVariable 'x:0' shape=() dtype=float32 "
-          "dtype_to_cast_to=float32>"
-      )
-      with autocast_variable.enable_auto_cast_variables(tf.float16):
-        self.assertEqual(
-            repr(x),
-            "<AutoCastVariable 'x:0' shape=() dtype=float32 "
-            "dtype_to_cast_to=float16>"
-        )
+        if tf.executing_eagerly():
+            self.assertStartsWith(
+                repr(x),
+                "<AutoCastVariable 'x:0' shape=() dtype=float32 "
+                "dtype_to_cast_to=float32, numpy=",
+            )
+            with autocast_variable.enable_auto_cast_variables(tf.float16):
+                self.assertStartsWith(
+                    repr(x),
+                    "<AutoCastVariable 'x:0' shape=() dtype=float32 "
+                    "dtype_to_cast_to=float16, numpy=",
+                )
+        else:
+            self.assertEqual(
+                repr(x),
+                "<AutoCastVariable 'x:0' shape=() dtype=float32 "
+                "dtype_to_cast_to=float32>",
+            )
+            with autocast_variable.enable_auto_cast_variables(tf.float16):
+                self.assertEqual(
+                    repr(x),
+                    "<AutoCastVariable 'x:0' shape=() dtype=float32 "
+                    "dtype_to_cast_to=float16>",
+                )
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            distribution=[
+                tf.__internal__.distribute.combinations.mirrored_strategy_with_two_cpus,  # noqa: E501
+            ]
+        )
+    )
+    def test_repr_distributed(self, distribution):
+        with distribution.scope():
+            x = get_var(1.0, tf.float32)
+            x = autocast_variable.create_autocast_variable(x)
+            use_policy = getattr(
+                distribution.extended, "_use_var_policy", False
+            )
+            if use_policy:
+                self.assertRegex(
+                    repr(x).replace("\n", " "),
+                    "<AutoCastDistributedVariable dtype=float32 "
+                    "dtype_to_cast_to=float32 "
+                    "inner_variable=DistributedVariable.*>",
+                )
+            else:
+                self.assertRegex(
+                    repr(x).replace("\n", " "),
+                    "<AutoCastDistributedVariable dtype=float32 "
+                    "dtype_to_cast_to=float32 "
+                    "inner_variable=MirroredVariable.*>",
+                )
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            optimizer_class=[
+                adadelta.Adadelta,
+                adagrad.Adagrad,
+                adam.Adam,
+                adamax.Adamax,
+                ftrl.Ftrl,
+                gradient_descent_v2.SGD,
+                nadam.Nadam,
+                rmsprop.RMSprop,
+                tf.compat.v1.train.GradientDescentOptimizer,
+            ],
+            use_tf_function=[False, True],
+        )
-
-  def test_repr_distributed(self):
-    strategy = tf.distribute.MirroredStrategy(['/cpu:1', '/cpu:2'])
-    with strategy.scope():
-      x = get_var(1., tf.float32)
-      x = autocast_variable.create_autocast_variable(x)
-      use_policy = getattr(strategy.extended, '_use_var_policy', False)
-      if use_policy:
-        self.assertRegex(
-            repr(x).replace('\n', ' '),
-            '<AutoCastDistributedVariable dtype=float32 '
-            'dtype_to_cast_to=float32 '
-            'inner_variable=DistributedVariable.*>')
-      else:
-        self.assertRegex(
-            repr(x).replace('\n', ' '),
-            '<AutoCastDistributedVariable dtype=float32 '
-            'dtype_to_cast_to=float32 '
-            'inner_variable=MirroredVariable.*>')
-
-  @tf.__internal__.distribute.combinations.generate(tf.__internal__.test.combinations.combine(
-      optimizer_class=[
-          adadelta.Adadelta,
-          adagrad.Adagrad,
-          adam.Adam,
-          adamax.Adamax,
-          ftrl.Ftrl,
-          gradient_descent_v2.SGD,
-          nadam.Nadam,
-          rmsprop.RMSprop,
-          tf.compat.v1.train.GradientDescentOptimizer
-      ],
-      use_tf_function=[False, True]))
-  def test_optimizer(self, optimizer_class, use_tf_function):
-    if use_tf_function and not tf.executing_eagerly():
-      self.skipTest('Test does not support graph mode with tf.function')
-    x = get_var(1., tf.float32)
-    x = autocast_variable.create_autocast_variable(x)
-    y = get_var(1., tf.float32)
-    opt = optimizer_class(learning_rate=1.)
-
-    def f():
-      # Minimize both the AutoCastVariable and the normal tf.Variable. Both
-      # variables should be updated to the same value.
-      op = opt.minimize(lambda: x + y, var_list=[x, y])
-      return None if tf.compat.v1.executing_eagerly_outside_functions() else op
-
-    if use_tf_function:
-      f = tf.function(f)
-
-    if tf.executing_eagerly():
-      f()
-    else:
-      op = f()
-      self.evaluate(tf.compat.v1.global_variables_initializer())
-      self.evaluate(op)
-    # Assert the AutoCastVariable has changed from its initial value
-    self.assertNotEqual(self.evaluate(x), 1.)
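# Arithmetic behind the SGD branch of test_optimizer: the loss is x + y, so
# d(loss)/dx = 1.0, and a single SGD step with learning_rate=1.0 moves x from
# 1.0 to exactly 0.0. Verifiable with a plain variable (sketch, eager mode):
import tensorflow as tf

x = tf.Variable(1.0)
opt = tf.keras.optimizers.SGD(learning_rate=1.0)
opt.minimize(lambda: x + 0.0, var_list=[x])
print(x.numpy())  # 0.0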
- # Assert AutoCastVariable is updated correctly by comparing it to the normal - # variable - self.assertAlmostEqual(self.evaluate(x), self.evaluate(y)) - if optimizer_class in (gradient_descent_v2.SGD, - tf.compat.v1.train.GradientDescentOptimizer): - # With SGD, the variables decreases by exactly 1 - self.assertEqual(self.evaluate(x), 0) - - -if __name__ == '__main__': - tf.test.main() + ) + def test_optimizer(self, optimizer_class, use_tf_function): + if use_tf_function and not tf.executing_eagerly(): + self.skipTest("Test does not support graph mode with tf.function") + x = get_var(1.0, tf.float32) + x = autocast_variable.create_autocast_variable(x) + y = get_var(1.0, tf.float32) + opt = optimizer_class(learning_rate=1.0) + + def f(): + # Minimize both the AutoCastVariable and the normal tf.Variable. + # Both variables should be updated to the same value. + op = opt.minimize(lambda: x + y, var_list=[x, y]) + return ( + None + if tf.compat.v1.executing_eagerly_outside_functions() + else op + ) + + if use_tf_function: + f = tf.function(f) + + if tf.executing_eagerly(): + f() + else: + op = f() + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(op) + # Assert the AutoCastVariable has changed from its initial value + self.assertNotEqual(self.evaluate(x), 1.0) + # Assert AutoCastVariable is updated correctly by comparing it to the + # normal variable + self.assertAlmostEqual(self.evaluate(x), self.evaluate(y)) + if optimizer_class in ( + gradient_descent_v2.SGD, + tf.compat.v1.train.GradientDescentOptimizer, + ): + # With SGD, the variables decreases by exactly 1 + self.assertEqual(self.evaluate(x), 0) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/mixed_precision/device_compatibility_check.py b/keras/mixed_precision/device_compatibility_check.py index 6f58e00bd386..477b61b562d8 100644 --- a/keras/mixed_precision/device_compatibility_check.py +++ b/keras/mixed_precision/device_compatibility_check.py @@ -14,134 +14,153 @@ # ============================================================================== """Contains function to log if devices are compatible with mixed precision.""" +import itertools + import tensorflow.compat.v2 as tf -import itertools +# isort: off from tensorflow.python.platform import tf_logging - -_COMPAT_CHECK_PREFIX = 'Mixed precision compatibility check (mixed_float16): ' -_COMPAT_CHECK_OK_PREFIX = _COMPAT_CHECK_PREFIX + 'OK' -_COMPAT_CHECK_WARNING_PREFIX = _COMPAT_CHECK_PREFIX + 'WARNING' +_COMPAT_CHECK_PREFIX = "Mixed precision compatibility check (mixed_float16): " +_COMPAT_CHECK_OK_PREFIX = _COMPAT_CHECK_PREFIX + "OK" +_COMPAT_CHECK_WARNING_PREFIX = _COMPAT_CHECK_PREFIX + "WARNING" _COMPAT_CHECK_WARNING_SUFFIX = ( - 'If you will use compatible GPU(s) not attached to this host, e.g. by ' - 'running a multi-worker model, you can ignore this warning. This message ' - 'will only be logged once') + "If you will use compatible GPU(s) not attached to this host, e.g. by " + "running a multi-worker model, you can ignore this warning. This message " + "will only be logged once" +) def _dedup_strings(device_strs): - """Groups together consecutive identical strings. - - For example, given: - ['GPU 1', 'GPU 2', 'GPU 2', 'GPU 3', 'GPU 3', 'GPU 3'] - This function returns: - ['GPU 1', 'GPU 2 (x2)', 'GPU 3 (x3)'] - - Args: - device_strs: A list of strings, each representing a device. - - Returns: - A copy of the input, but identical consecutive strings are merged into a - single string. 
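# _dedup_strings is a thin wrapper around itertools.groupby, which groups
# only *consecutive* equal items -- exactly the "(xN)" collapsing the
# docstring describes. Quick demonstration of the same logic:
import itertools

devices = ["GPU 1", "GPU 2", "GPU 2", "GPU 3", "GPU 3", "GPU 3"]
deduped = []
for name, group in itertools.groupby(devices):
    count = len(list(group))
    deduped.append(name if count == 1 else "%s (x%d)" % (name, count))
print(deduped)  # ['GPU 1', 'GPU 2 (x2)', 'GPU 3 (x3)']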
- """ - new_device_strs = [] - for device_str, vals in itertools.groupby(device_strs): - num = len(list(vals)) - if num == 1: - new_device_strs.append(device_str) - else: - new_device_strs.append('%s (x%d)' % (device_str, num)) - return new_device_strs + """Groups together consecutive identical strings. + + For example, given: + ['GPU 1', 'GPU 2', 'GPU 2', 'GPU 3', 'GPU 3', 'GPU 3'] + This function returns: + ['GPU 1', 'GPU 2 (x2)', 'GPU 3 (x3)'] + + Args: + device_strs: A list of strings, each representing a device. + + Returns: + A copy of the input, but identical consecutive strings are merged into a + single string. + """ + new_device_strs = [] + for device_str, vals in itertools.groupby(device_strs): + num = len(list(vals)) + if num == 1: + new_device_strs.append(device_str) + else: + new_device_strs.append("%s (x%d)" % (device_str, num)) + return new_device_strs def _log_device_compatibility_check(policy_name, gpu_details_list): - """Logs a compatibility check if the devices support the policy. - - Currently only logs for the policy mixed_float16. - - Args: - policy_name: The name of the dtype policy. - gpu_details_list: A list of dicts, one dict per GPU. Each dict - is the device details for a GPU, as returned by - `tf.config.experimental.get_device_details()`. - """ - if policy_name != 'mixed_float16': - # TODO(b/145686977): Log if the policy is 'mixed_bfloat16'. This requires - # checking if a TPU is available. - return - supported_device_strs = [] - unsupported_device_strs = [] - for details in gpu_details_list: - name = details.get('device_name', 'Unknown GPU') - cc = details.get('compute_capability') - if cc: - device_str = '%s, compute capability %s.%s' % (name, cc[0], cc[1]) - if cc >= (7, 0): - supported_device_strs.append(device_str) - else: - unsupported_device_strs.append(device_str) - else: - unsupported_device_strs.append( - name + ', no compute capability (probably not an Nvidia GPU)') - - if unsupported_device_strs: - warning_str = _COMPAT_CHECK_WARNING_PREFIX + '\n' - if supported_device_strs: - warning_str += ('Some of your GPUs may run slowly with dtype policy ' - 'mixed_float16 because they do not all have compute ' - 'capability of at least 7.0. Your GPUs:\n') - elif len(unsupported_device_strs) == 1: - warning_str += ('Your GPU may run slowly with dtype policy mixed_float16 ' - 'because it does not have compute capability of at least ' - '7.0. Your GPU:\n') + """Logs a compatibility check if the devices support the policy. + + Currently only logs for the policy mixed_float16. + + Args: + policy_name: The name of the dtype policy. + gpu_details_list: A list of dicts, one dict per GPU. Each dict + is the device details for a GPU, as returned by + `tf.config.experimental.get_device_details()`. + """ + if policy_name != "mixed_float16": + # TODO(b/145686977): Log if the policy is 'mixed_bfloat16'. This + # requires checking if a TPU is available. 
+ return + supported_device_strs = [] + unsupported_device_strs = [] + for details in gpu_details_list: + name = details.get("device_name", "Unknown GPU") + cc = details.get("compute_capability") + if cc: + device_str = f"{name}, compute capability {cc[0]}.{cc[1]}" + if cc >= (7, 0): + supported_device_strs.append(device_str) + else: + unsupported_device_strs.append(device_str) + else: + unsupported_device_strs.append( + name + ", no compute capability (probably not an Nvidia GPU)" + ) + + if unsupported_device_strs: + warning_str = _COMPAT_CHECK_WARNING_PREFIX + "\n" + if supported_device_strs: + warning_str += ( + "Some of your GPUs may run slowly with dtype policy " + "mixed_float16 because they do not all have compute " + "capability of at least 7.0. Your GPUs:\n" + ) + elif len(unsupported_device_strs) == 1: + warning_str += ( + "Your GPU may run slowly with dtype policy mixed_float16 " + "because it does not have compute capability of at least " + "7.0. Your GPU:\n" + ) + else: + warning_str += ( + "Your GPUs may run slowly with dtype policy " + "mixed_float16 because they do not have compute " + "capability of at least 7.0. Your GPUs:\n" + ) + for device_str in _dedup_strings( + supported_device_strs + unsupported_device_strs + ): + warning_str += " " + device_str + "\n" + warning_str += ( + "See https://developer.nvidia.com/cuda-gpus for a list of " + "GPUs and their compute capabilities.\n" + ) + warning_str += _COMPAT_CHECK_WARNING_SUFFIX + tf_logging.warning(warning_str) + elif not supported_device_strs: + tf_logging.warning( + "%s\n" + "The dtype policy mixed_float16 may run slowly because " + "this machine does not have a GPU. Only Nvidia GPUs with " + "compute capability of at least 7.0 run quickly with " + "mixed_float16.\n%s" + % (_COMPAT_CHECK_WARNING_PREFIX, _COMPAT_CHECK_WARNING_SUFFIX) + ) + elif len(supported_device_strs) == 1: + tf_logging.info( + "%s\n" + "Your GPU will likely run quickly with dtype policy " + "mixed_float16 as it has compute capability of at least " + "7.0. Your GPU: %s" + % (_COMPAT_CHECK_OK_PREFIX, supported_device_strs[0]) + ) else: - warning_str += ('Your GPUs may run slowly with dtype policy ' - 'mixed_float16 because they do not have compute ' - 'capability of at least 7.0. Your GPUs:\n') - for device_str in _dedup_strings(supported_device_strs + - unsupported_device_strs): - warning_str += ' ' + device_str + '\n' - warning_str += ('See https://developer.nvidia.com/cuda-gpus for a list of ' - 'GPUs and their compute capabilities.\n') - warning_str += _COMPAT_CHECK_WARNING_SUFFIX - tf_logging.warning(warning_str) - elif not supported_device_strs: - tf_logging.warning( - '%s\n' - 'The dtype policy mixed_float16 may run slowly because ' - 'this machine does not have a GPU. Only Nvidia GPUs with ' - 'compute capability of at least 7.0 run quickly with ' - 'mixed_float16.\n%s' % (_COMPAT_CHECK_WARNING_PREFIX, - _COMPAT_CHECK_WARNING_SUFFIX)) - elif len(supported_device_strs) == 1: - tf_logging.info('%s\n' - 'Your GPU will likely run quickly with dtype policy ' - 'mixed_float16 as it has compute capability of at least ' - '7.0. 
Your GPU: %s' % (_COMPAT_CHECK_OK_PREFIX, - supported_device_strs[0])) - else: - tf_logging.info('%s\n' - 'Your GPUs will likely run quickly with dtype policy ' - 'mixed_float16 as they all have compute capability of at ' - 'least 7.0' % _COMPAT_CHECK_OK_PREFIX) + tf_logging.info( + "%s\n" + "Your GPUs will likely run quickly with dtype policy " + "mixed_float16 as they all have compute capability of at " + "least 7.0" % _COMPAT_CHECK_OK_PREFIX + ) _logged_compatibility_check = False def log_device_compatibility_check(policy_name): - """Logs a compatibility check if the devices support the policy. - - Currently only logs for the policy mixed_float16. A log is shown only the - first time this function is called. - - Args: - policy_name: The name of the dtype policy. - """ - global _logged_compatibility_check - if _logged_compatibility_check: - return - _logged_compatibility_check = True - gpus = tf.config.list_physical_devices('GPU') - gpu_details_list = [tf.config.experimental.get_device_details(g) for g in gpus] - _log_device_compatibility_check(policy_name, gpu_details_list) + """Logs a compatibility check if the devices support the policy. + + Currently only logs for the policy mixed_float16. A log is shown only the + first time this function is called. + + Args: + policy_name: The name of the dtype policy. + """ + global _logged_compatibility_check + if _logged_compatibility_check: + return + _logged_compatibility_check = True + gpus = tf.config.list_physical_devices("GPU") + gpu_details_list = [ + tf.config.experimental.get_device_details(g) for g in gpus + ] + _log_device_compatibility_check(policy_name, gpu_details_list) diff --git a/keras/mixed_precision/device_compatibility_check_test.py b/keras/mixed_precision/device_compatibility_check_test.py index 5d58dbec1014..9b355e09b296 100644 --- a/keras/mixed_precision/device_compatibility_check_test.py +++ b/keras/mixed_precision/device_compatibility_check_test.py @@ -14,128 +14,151 @@ # ============================================================================== """Tests the device compatibility check.""" -import tensorflow.compat.v2 as tf - import re -from keras.testing_infra import test_combinations +import tensorflow.compat.v2 as tf + from keras.mixed_precision import device_compatibility_check +from keras.testing_infra import test_combinations + +# isort: off from tensorflow.python.platform import tf_logging def device_details(device_name, compute_capability=None): - details = {} - if device_name: - details['device_name'] = device_name - if compute_capability: - details['compute_capability'] = compute_capability - return details + details = {} + if device_name: + details["device_name"] = device_name + if compute_capability: + details["compute_capability"] = compute_capability + return details -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class DeviceCompatibilityCheckTest(tf.test.TestCase): - - def _test_compat_check(self, device_attr_list, should_warn, expected_regex, - policy_name='mixed_float16'): - with tf.compat.v1.test.mock.patch.object(tf_logging, 'warning') as mock_warn, \ - tf.compat.v1.test.mock.patch.object(tf_logging, 'info') as mock_info: - device_compatibility_check._log_device_compatibility_check( - policy_name, device_attr_list) - if should_warn: - self.assertRegex(mock_warn.call_args[0][0], expected_regex) - mock_info.assert_not_called() - else: - self.assertRegex(mock_info.call_args[0][0], 
expected_regex) - mock_warn.assert_not_called() - - def test_supported(self): - details_list = [device_details('GPU 1', (7, 1))] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): OK\n' - r'Your GPU will likely run quickly with dtype policy mixed_float16 as ' - r'it has compute capability of at least 7.0. Your GPU: GPU 1, compute ' - r'capability 7.1', flags=re.MULTILINE) - self._test_compat_check(details_list, False, regex) - - details_list = [ - device_details('GPU 1', (7, 0)), - device_details('GPU 2', (7, 1)), - device_details('GPU 3', (8, 0)), - ] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): OK\n' - r'Your GPUs will likely run quickly with dtype policy mixed_float16 as ' - r'they all have compute capability of at least 7.0', flags=re.MULTILINE) - self._test_compat_check(details_list, False, regex) - - def test_unsupported(self): - details_list = [ - device_details('GPU 1', (6, 0)) - ] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): WARNING\n' - r'Your GPU may run slowly with dtype policy mixed_float16.*\n' - r' GPU 1, compute capability 6.0\n' - r'See.*', flags=re.MULTILINE) - self._test_compat_check(details_list, True, regex) - - details_list = [ - device_details(None) - ] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): WARNING\n' - r'Your GPU may run slowly with dtype policy mixed_float16.*\n' - r' Unknown GPU, no compute capability \(probably not an Nvidia GPU\)\n' - r'See.*', flags=re.MULTILINE) - self._test_compat_check(details_list, True, regex) - - details_list = [ - device_details('GPU 1', (6, 0)), - device_details('GPU 2', (3, 10)), - ] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): WARNING\n' - r'Your GPUs may run slowly with dtype policy mixed_float16.*\n' - r' GPU 1, compute capability 6.0\n' - r' GPU 2, compute capability 3.10\n' - r'See.*', flags=re.MULTILINE) - self._test_compat_check(details_list, True, regex) - - details_list = [ - device_details('GPU 1', (6, 0)), - device_details('GPU 1', (6, 0)), - device_details('GPU 1', (6, 0)), - device_details('GPU 2', (3, 10)), - ] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): WARNING\n' - r'Your GPUs may run slowly with dtype policy mixed_float16.*\n' - r' GPU 1, compute capability 6.0 \(x3\)\n' - r' GPU 2, compute capability 3.10\n' - r'See.*', flags=re.MULTILINE) - self._test_compat_check(details_list, True, regex) - - details_list = [] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): WARNING\n' - r'The dtype policy mixed_float16 may run slowly because this machine ' - r'does not have a GPU', flags=re.MULTILINE) - self._test_compat_check(details_list, True, regex) - - def test_mix_of_supported_and_unsupported(self): - details_list = [ - device_details('GPU 1', (7, 0)), - device_details('GPU 1', (7, 0)), - device_details('GPU 2', (6, 0)) - ] - regex = re.compile( - r'.*compatibility check \(mixed_float16\): WARNING\n' - r'Some of your GPUs may run slowly with dtype policy mixed_float16.*\n' - r' GPU 1, compute capability 7.0 \(x2\)\n' - r' GPU 2, compute capability 6.0\n' - r'See.*', flags=re.MULTILINE) - self._test_compat_check(details_list, True, regex) - - -if __name__ == '__main__': - tf.test.main() + def _test_compat_check( + self, + device_attr_list, + should_warn, + expected_regex, + policy_name="mixed_float16", + ): + with tf.compat.v1.test.mock.patch.object( + tf_logging, "warning" + ) as mock_warn, tf.compat.v1.test.mock.patch.object( + tf_logging, "info" + ) as mock_info: + 
device_compatibility_check._log_device_compatibility_check( + policy_name, device_attr_list + ) + if should_warn: + self.assertRegex(mock_warn.call_args[0][0], expected_regex) + mock_info.assert_not_called() + else: + self.assertRegex(mock_info.call_args[0][0], expected_regex) + mock_warn.assert_not_called() + + def test_supported(self): + details_list = [device_details("GPU 1", (7, 1))] + regex = re.compile( + r".*compatibility check \(mixed_float16\): OK\n" + r"Your GPU will likely run quickly with dtype policy mixed_float16 " + r"as it has compute capability of at least 7.0. Your GPU: GPU 1, " + r"compute capability 7.1", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, False, regex) + + details_list = [ + device_details("GPU 1", (7, 0)), + device_details("GPU 2", (7, 1)), + device_details("GPU 3", (8, 0)), + ] + regex = re.compile( + r".*compatibility check \(mixed_float16\): OK\n" + r"Your GPUs will likely run quickly with dtype policy " + r"mixed_float16 as they all have compute capability of " + r"at least 7.0", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, False, regex) + + def test_unsupported(self): + details_list = [device_details("GPU 1", (6, 0))] + regex = re.compile( + r".*compatibility check \(mixed_float16\): WARNING\n" + r"Your GPU may run slowly with dtype policy mixed_float16.*\n" + r" GPU 1, compute capability 6.0\n" + r"See.*", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, True, regex) + + details_list = [device_details(None)] + regex = re.compile( + r".*compatibility check \(mixed_float16\): WARNING\n" + r"Your GPU may run slowly with dtype policy mixed_float16.*\n" + r" Unknown GPU, no compute capability " + r"\(probably not an Nvidia GPU\)\nSee.*", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, True, regex) + + details_list = [ + device_details("GPU 1", (6, 0)), + device_details("GPU 2", (3, 10)), + ] + regex = re.compile( + r".*compatibility check \(mixed_float16\): WARNING\n" + r"Your GPUs may run slowly with dtype policy mixed_float16.*\n" + r" GPU 1, compute capability 6.0\n" + r" GPU 2, compute capability 3.10\n" + r"See.*", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, True, regex) + + details_list = [ + device_details("GPU 1", (6, 0)), + device_details("GPU 1", (6, 0)), + device_details("GPU 1", (6, 0)), + device_details("GPU 2", (3, 10)), + ] + regex = re.compile( + r".*compatibility check \(mixed_float16\): WARNING\n" + r"Your GPUs may run slowly with dtype policy mixed_float16.*\n" + r" GPU 1, compute capability 6.0 \(x3\)\n" + r" GPU 2, compute capability 3.10\n" + r"See.*", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, True, regex) + + details_list = [] + regex = re.compile( + r".*compatibility check \(mixed_float16\): WARNING\n" + r"The dtype policy mixed_float16 may run slowly because this " + r"machine does not have a GPU", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, True, regex) + + def test_mix_of_supported_and_unsupported(self): + details_list = [ + device_details("GPU 1", (7, 0)), + device_details("GPU 1", (7, 0)), + device_details("GPU 2", (6, 0)), + ] + regex = re.compile( + r".*compatibility check \(mixed_float16\): WARNING\n" + r"Some of your GPUs may run slowly with dtype policy " + r"mixed_float16.*\n GPU 1, compute capability 7.0 \(x2\)\n" + r" GPU 2, compute capability 6.0\n" + r"See.*", + flags=re.MULTILINE, + ) + self._test_compat_check(details_list, True, regex) + + +if __name__ == "__main__": + 
tf.test.main() diff --git a/keras/mixed_precision/layer_correctness_test.py b/keras/mixed_precision/layer_correctness_test.py index 02a012ba5241..274b4e186e7c 100644 --- a/keras/mixed_precision/layer_correctness_test.py +++ b/keras/mixed_precision/layer_correctness_test.py @@ -14,14 +14,12 @@ # ============================================================================== """Tests various Layer subclasses have correct outputs with mixed precision.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations + from keras import layers from keras import models -from keras.testing_infra import test_utils from keras.layers import activation from keras.layers import attention from keras.layers import convolutional @@ -31,240 +29,325 @@ from keras.layers import pooling from keras.layers import regularization from keras.layers import reshaping +from keras.layers.normalization import batch_normalization +from keras.layers.normalization import layer_normalization +from keras.layers.preprocessing import image_preprocessing +from keras.layers.preprocessing import normalization from keras.layers.rnn import bidirectional from keras.layers.rnn import conv_lstm2d -from keras.layers.rnn import simple_rnn from keras.layers.rnn import gru from keras.layers.rnn import gru_v1 from keras.layers.rnn import lstm from keras.layers.rnn import lstm_v1 +from keras.layers.rnn import simple_rnn from keras.layers.rnn import time_distributed -from keras.layers.normalization import batch_normalization -from keras.layers.normalization import layer_normalization -from keras.layers.preprocessing import image_preprocessing -from keras.layers.preprocessing import normalization from keras.mixed_precision import policy +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils def create_mirrored_strategy(): - # The test creates two virtual CPUs, and we use both of them to test with - # multiple devices. - return tf.distribute.MirroredStrategy(['cpu:0', 'cpu:1']) + # The test creates two virtual CPUs, and we use both of them to test with + # multiple devices. 
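# Background for the setUp below: a single physical CPU is split into two
# logical devices so MirroredStrategy can mirror variables across "devices"
# on any machine. Standalone version (must run before TensorFlow initializes
# its devices, e.g. at the top of a script):
import tensorflow as tf

cpus = tf.config.list_physical_devices("CPU")
tf.config.set_logical_device_configuration(
    cpus[0],
    [
        tf.config.LogicalDeviceConfiguration(),
        tf.config.LogicalDeviceConfiguration(),
    ],
)
strategy = tf.distribute.MirroredStrategy(["cpu:0", "cpu:1"])
print(strategy.num_replicas_in_sync)  # 2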
+ # pylint: disable=protected-access + tf.distribute.MirroredStrategy._collective_key_base += 1 + return tf.distribute.MirroredStrategy(["cpu:0", "cpu:1"]) def _create_normalization_layer_with_adapt(): - layer = normalization.Normalization() - layer.adapt(np.random.normal(size=(10, 4))) - return layer + layer = normalization.Normalization() + layer.adapt(np.random.normal(size=(10, 4))) + return layer def _create_normalization_layer_without_adapt(): - return normalization.Normalization( - mean=np.random.normal(size=(4,)), - variance=np.random.uniform(0.5, 2., size=(4,)) - ) + return normalization.Normalization( + mean=np.random.normal(size=(4,)), + variance=np.random.uniform(0.5, 2.0, size=(4,)), + ) @test_utils.run_v2_only class LayerCorrectnessTest(test_combinations.TestCase): + def setUp(self): + super().setUp() + # Set two virtual CPUs to test MirroredStrategy with multiple devices + cpus = tf.config.list_physical_devices("CPU") + tf.config.set_logical_device_configuration( + cpus[0], + [ + tf.config.LogicalDeviceConfiguration(), + tf.config.LogicalDeviceConfiguration(), + ], + ) + self.strategy = create_mirrored_strategy() - def setUp(self): - super().setUp() - # Set two virtual CPUs to test MirroredStrategy with multiple devices - cpus = tf.config.list_physical_devices('CPU') - tf.config.set_logical_device_configuration(cpus[0], [ - tf.config.LogicalDeviceConfiguration(), - tf.config.LogicalDeviceConfiguration(), - ]) - - def _create_model_from_layer(self, layer, input_shapes): - inputs = [layers.Input(batch_input_shape=s) for s in input_shapes] - if len(inputs) == 1: - inputs = inputs[0] - y = layer(inputs) - model = models.Model(inputs, y) - model.compile('sgd', 'mse') - return model + def _create_model_from_layer(self, layer, input_shapes): + inputs = [layers.Input(batch_input_shape=s) for s in input_shapes] + if len(inputs) == 1: + inputs = inputs[0] + y = layer(inputs) + model = models.Model(inputs, y) + model.compile("sgd", "mse") + return model - @parameterized.named_parameters( - ('LeakyReLU', activation.LeakyReLU, (2, 2)), - ('PReLU', activation.PReLU, (2, 2)), - ('ELU', activation.ELU, (2, 2)), - ('ThresholdedReLU', activation.ThresholdedReLU, (2, 2)), - ('Softmax', activation.Softmax, (2, 2)), - ('ReLU', activation.ReLU, (2, 2)), - ('Conv1D', lambda: convolutional.Conv1D(2, 2), (2, 2, 1)), - ('Conv2D', lambda: convolutional.Conv2D(2, 2), (2, 2, 2, 1)), - ('Conv3D', lambda: convolutional.Conv3D(2, 2), (2, 2, 2, 2, 1)), - ('Conv2DTranspose', lambda: convolutional.Conv2DTranspose(2, 2), - (2, 2, 2, 2)), - ('SeparableConv2D', lambda: convolutional.SeparableConv2D(2, 2), - (2, 2, 2, 1)), - ('DepthwiseConv2D', lambda: convolutional.DepthwiseConv2D(2, 2), - (2, 2, 2, 1)), - ('UpSampling2D', reshaping.UpSampling2D, (2, 2, 2, 1)), - ('ZeroPadding2D', reshaping.ZeroPadding2D, (2, 2, 2, 1)), - ('Cropping2D', reshaping.Cropping2D, (2, 3, 3, 1)), - ('ConvLSTM2D', - lambda: conv_lstm2d.ConvLSTM2D(4, kernel_size=(2, 2)), (4, 4, 4, 4, 4)), - ('Dense', lambda: core.Dense(2), (2, 2)), - ('Dropout', lambda: regularization.Dropout(0.5), (2, 2)), - ('SpatialDropout2D', - lambda: regularization.SpatialDropout2D(0.5), (2, 2, 2, 2)), - ('Activation', lambda: core.Activation('sigmoid'), (2, 2)), - ('Reshape', lambda: reshaping.Reshape((1, 4, 1)), (2, 2, 2)), - ('Permute', lambda: reshaping.Permute((2, 1)), (2, 2, 2)), - ('Attention', attention.Attention, [(2, 2, 3), (2, 3, 3), (2, 3, 3)]), - ('AdditiveAttention', attention.AdditiveAttention, [(2, 2, 3), - (2, 3, 3), - (2, 3, 3)]), - ('Embedding', 
lambda: core.Embedding(4, 4), - (2, 4), 2e-3, 2e-3, np.random.randint(4, size=(2, 4))), - ('LocallyConnected1D', lambda: locally_connected.LocallyConnected1D(2, 2), - (2, 2, 1)), - ('LocallyConnected2D', lambda: locally_connected.LocallyConnected2D(2, 2), - (2, 2, 2, 1)), - ('Add', merging.Add, [(2, 2), (2, 2)]), - ('Subtract', merging.Subtract, [(2, 2), (2, 2)]), - ('Multiply', merging.Multiply, [(2, 2), (2, 2)]), - ('Average', merging.Average, [(2, 2), (2, 2)]), - ('Maximum', merging.Maximum, [(2, 2), (2, 2)]), - ('Minimum', merging.Minimum, [(2, 2), (2, 2)]), - ('Concatenate', merging.Concatenate, [(2, 2), (2, 2)]), - ('Dot', lambda: merging.Dot(1), [(2, 2), (2, 2)]), - ('GaussianNoise', lambda: regularization.GaussianNoise(0.5), (2, 2)), - ('GaussianDropout', lambda: regularization.GaussianDropout(0.5), (2, 2)), - ('AlphaDropout', lambda: regularization.AlphaDropout(0.5), (2, 2)), - ('BatchNormalization', batch_normalization.BatchNormalization, - (2, 2), 1e-2, 1e-2), - ('LayerNormalization', layer_normalization.LayerNormalization, (2, 2)), - ('LayerNormalizationUnfused', - lambda: layer_normalization.LayerNormalization(axis=1), (2, 2, 2)), - ('MaxPooling2D', pooling.MaxPooling2D, (2, 2, 2, 1)), - ('AveragePooling2D', pooling.AveragePooling2D, (2, 2, 2, 1)), - ('GlobalMaxPooling2D', pooling.GlobalMaxPooling2D, (2, 2, 2, 1)), - ('GlobalAveragePooling2D', pooling.GlobalAveragePooling2D, (2, 2, 2, 1)), - ('SimpleRNN', lambda: simple_rnn.SimpleRNN(units=4), - (4, 4, 4), 1e-2, 1e-2), - ('SimpleRNN_stateful', - lambda: simple_rnn.SimpleRNN(units=4, stateful=True), - (4, 4, 4), 1e-2, 1e-2), - ('GRU', lambda: gru_v1.GRU(units=4), (4, 4, 4)), - ('LSTM', lambda: lstm_v1.LSTM(units=4), (4, 4, 4)), - ('GRUV2', lambda: gru.GRU(units=4), (4, 4, 4)), - ('GRUV2_stateful', lambda: gru.GRU(units=4, stateful=True), - (4, 4, 4)), - ('LSTMV2', lambda: lstm.LSTM(units=4), (4, 4, 4)), - ('LSTMV2_stateful', lambda: lstm.LSTM(units=4, stateful=True), - (4, 4, 4)), - ('TimeDistributed', - lambda: time_distributed.TimeDistributed(core.Dense(2)), (2, 2, 2)), - ('Bidirectional', - lambda: bidirectional.Bidirectional(simple_rnn.SimpleRNN(units=4)), - (2, 2, 2)), - ('AttentionLayerCausal', lambda: attention.Attention(causal=True), [ - (2, 2, 3), (2, 3, 3), (2, 3, 3) - ]), - ('AdditiveAttentionLayerCausal', - lambda: attention.AdditiveAttention(causal=True), [(2, 3, 4), - (2, 3, 4), - (2, 3, 4)]), - ('NormalizationAdapt', _create_normalization_layer_with_adapt, (4, 4)), - ('NormalizationNoAdapt', _create_normalization_layer_without_adapt, - (4, 4)), - ('Resizing', lambda: image_preprocessing.Resizing(3, 3), (2, 5, 5, 1)), - ('Rescaling', lambda: image_preprocessing.Rescaling(2., 1.), (6, 6)), - ('CenterCrop', lambda: image_preprocessing.CenterCrop(3, 3), - (2, 5, 5, 1)) - ) - def test_layer(self, f32_layer_fn, input_shape, rtol=2e-3, atol=2e-3, - input_data=None): - """Tests a layer by comparing the float32 and mixed precision weights. 
+ @parameterized.named_parameters( + ("LeakyReLU", activation.LeakyReLU, (2, 2)), + ("PReLU", activation.PReLU, (2, 2)), + ("ELU", activation.ELU, (2, 2)), + ("ThresholdedReLU", activation.ThresholdedReLU, (2, 2)), + ("Softmax", activation.Softmax, (2, 2)), + ("ReLU", activation.ReLU, (2, 2)), + ("Conv1D", lambda: convolutional.Conv1D(2, 2), (2, 2, 1)), + ("Conv2D", lambda: convolutional.Conv2D(2, 2), (2, 2, 2, 1)), + ("Conv3D", lambda: convolutional.Conv3D(2, 2), (2, 2, 2, 2, 1)), + ( + "Conv2DTranspose", + lambda: convolutional.Conv2DTranspose(2, 2), + (2, 2, 2, 2), + ), + ( + "SeparableConv2D", + lambda: convolutional.SeparableConv2D(2, 2), + (2, 2, 2, 1), + ), + ( + "DepthwiseConv2D", + lambda: convolutional.DepthwiseConv2D(2, 2), + (2, 2, 2, 1), + ), + ("UpSampling2D", reshaping.UpSampling2D, (2, 2, 2, 1)), + ("ZeroPadding2D", reshaping.ZeroPadding2D, (2, 2, 2, 1)), + ("Cropping2D", reshaping.Cropping2D, (2, 3, 3, 1)), + ( + "ConvLSTM2D", + lambda: conv_lstm2d.ConvLSTM2D(4, kernel_size=(2, 2)), + (4, 4, 4, 4, 4), + ), + ("Dense", lambda: core.Dense(2), (2, 2)), + ("Dropout", lambda: regularization.Dropout(0.5), (2, 2)), + ( + "SpatialDropout2D", + lambda: regularization.SpatialDropout2D(0.5), + (2, 2, 2, 2), + ), + ("Activation", lambda: core.Activation("sigmoid"), (2, 2)), + ("Reshape", lambda: reshaping.Reshape((1, 4, 1)), (2, 2, 2)), + ("Permute", lambda: reshaping.Permute((2, 1)), (2, 2, 2)), + ("Attention", attention.Attention, [(2, 2, 3), (2, 3, 3), (2, 3, 3)]), + ( + "AdditiveAttention", + attention.AdditiveAttention, + [(2, 2, 3), (2, 3, 3), (2, 3, 3)], + ), + ( + "Embedding", + lambda: core.Embedding(4, 4), + (2, 4), + 2e-3, + 2e-3, + np.random.randint(4, size=(2, 4)), + ), + ( + "LocallyConnected1D", + lambda: locally_connected.LocallyConnected1D(2, 2), + (2, 2, 1), + ), + ( + "LocallyConnected2D", + lambda: locally_connected.LocallyConnected2D(2, 2), + (2, 2, 2, 1), + ), + ("Add", merging.Add, [(2, 2), (2, 2)]), + ("Subtract", merging.Subtract, [(2, 2), (2, 2)]), + ("Multiply", merging.Multiply, [(2, 2), (2, 2)]), + ("Average", merging.Average, [(2, 2), (2, 2)]), + ("Maximum", merging.Maximum, [(2, 2), (2, 2)]), + ("Minimum", merging.Minimum, [(2, 2), (2, 2)]), + ("Concatenate", merging.Concatenate, [(2, 2), (2, 2)]), + ("Dot", lambda: merging.Dot(1), [(2, 2), (2, 2)]), + ("GaussianNoise", lambda: regularization.GaussianNoise(0.5), (2, 2)), + ( + "GaussianDropout", + lambda: regularization.GaussianDropout(0.5), + (2, 2), + ), + ("AlphaDropout", lambda: regularization.AlphaDropout(0.5), (2, 2)), + ( + "BatchNormalization", + batch_normalization.BatchNormalization, + (2, 2), + 1e-2, + 1e-2, + ), + ("LayerNormalization", layer_normalization.LayerNormalization, (2, 2)), + ( + "LayerNormalizationUnfused", + lambda: layer_normalization.LayerNormalization(axis=1), + (2, 2, 2), + ), + ("MaxPooling2D", pooling.MaxPooling2D, (2, 2, 2, 1)), + ("AveragePooling2D", pooling.AveragePooling2D, (2, 2, 2, 1)), + ("GlobalMaxPooling2D", pooling.GlobalMaxPooling2D, (2, 2, 2, 1)), + ( + "GlobalAveragePooling2D", + pooling.GlobalAveragePooling2D, + (2, 2, 2, 1), + ), + ( + "SimpleRNN", + lambda: simple_rnn.SimpleRNN(units=4), + (4, 4, 4), + 1e-2, + 1e-2, + ), + ( + "SimpleRNN_stateful", + lambda: simple_rnn.SimpleRNN(units=4, stateful=True), + (4, 4, 4), + 1e-2, + 1e-2, + ), + ("GRU", lambda: gru_v1.GRU(units=4), (4, 4, 4)), + ("LSTM", lambda: lstm_v1.LSTM(units=4), (4, 4, 4)), + ("GRUV2", lambda: gru.GRU(units=4), (4, 4, 4)), + ("GRUV2_stateful", lambda: gru.GRU(units=4, stateful=True), (4, 4, 
4)), + ("LSTMV2", lambda: lstm.LSTM(units=4), (4, 4, 4)), + ( + "LSTMV2_stateful", + lambda: lstm.LSTM(units=4, stateful=True), + (4, 4, 4), + ), + ( + "TimeDistributed", + lambda: time_distributed.TimeDistributed(core.Dense(2)), + (2, 2, 2), + ), + ( + "Bidirectional", + lambda: bidirectional.Bidirectional(simple_rnn.SimpleRNN(units=4)), + (2, 2, 2), + ), + ("NormalizationAdapt", _create_normalization_layer_with_adapt, (4, 4)), + ( + "NormalizationNoAdapt", + _create_normalization_layer_without_adapt, + (4, 4), + ), + ("Resizing", lambda: image_preprocessing.Resizing(3, 3), (2, 5, 5, 1)), + ("Rescaling", lambda: image_preprocessing.Rescaling(2.0, 1.0), (6, 6)), + ( + "CenterCrop", + lambda: image_preprocessing.CenterCrop(3, 3), + (2, 5, 5, 1), + ), + ) + def test_layer( + self, f32_layer_fn, input_shape, rtol=2e-3, atol=2e-3, input_data=None + ): + """Tests a layer by comparing the float32 and mixed precision weights. - A float32 layer, a mixed precision layer, and a distributed mixed precision - layer are run. The three layers are identical other than their dtypes and - distribution strategies. The outputs after predict() and weights after fit() - are asserted to be close. + A float32 layer, a mixed precision layer, and a distributed mixed + precision layer are run. The three layers are identical other than their + dtypes and distribution strategies. The outputs after predict() and + weights after fit() are asserted to be close. - Args: - f32_layer_fn: A function returning a float32 layer. The other two layers - will automatically be created from this - input_shape: The shape of the input to the layer, including the batch - dimension. Or a list of shapes if the layer takes multiple inputs. - rtol: The relative tolerance to be asserted. - atol: The absolute tolerance to be asserted. - input_data: A Numpy array with the data of the input. If None, input data - will be randomly generated - """ + Args: + f32_layer_fn: A function returning a float32 layer. The other two + layers will automatically be created from this. + input_shape: The shape of the input to the layer, including the batch + dimension. Or a list of shapes if the layer takes multiple inputs. + rtol: The relative tolerance to be asserted. + atol: The absolute tolerance to be asserted. + input_data: A Numpy array with the data of the input. If None, input + data will be randomly generated. 
+ """ - if f32_layer_fn == reshaping.ZeroPadding2D and tf.test.is_built_with_rocm(): - return - if isinstance(input_shape[0], int): - input_shapes = [input_shape] - else: - input_shapes = input_shape - strategy = create_mirrored_strategy() - f32_layer = f32_layer_fn() + if ( + f32_layer_fn == reshaping.ZeroPadding2D + and tf.test.is_built_with_rocm() + ): + return + if isinstance(input_shape[0], int): + input_shapes = [input_shape] + else: + input_shapes = input_shape + f32_layer = f32_layer_fn() - # Create the layers - assert f32_layer.dtype == f32_layer._compute_dtype == 'float32' - config = f32_layer.get_config() - config['dtype'] = policy.Policy('mixed_float16') - mp_layer = f32_layer.__class__.from_config(config) - distributed_mp_layer = f32_layer.__class__.from_config(config) + # Create the layers + assert f32_layer.dtype == f32_layer._compute_dtype == "float32" + config = f32_layer.get_config() + config["dtype"] = policy.Policy("mixed_float16") + mp_layer = f32_layer.__class__.from_config(config) + distributed_mp_layer = f32_layer.__class__.from_config(config) - # Compute per_replica_input_shapes for the distributed model - global_batch_size = input_shapes[0][0] - assert global_batch_size % strategy.num_replicas_in_sync == 0, ( - 'The number of replicas, %d, does not divide the global batch size of ' - '%d' % (strategy.num_replicas_in_sync, global_batch_size)) - per_replica_batch_size = ( - global_batch_size // strategy.num_replicas_in_sync) - per_replica_input_shapes = [(per_replica_batch_size,) + s[1:] - for s in input_shapes] + # Compute per_replica_input_shapes for the distributed model + global_batch_size = input_shapes[0][0] + assert global_batch_size % self.strategy.num_replicas_in_sync == 0, ( + "The number of replicas, %d, does not divide the global batch " + "size of %d" + % (self.strategy.num_replicas_in_sync, global_batch_size) + ) + per_replica_batch_size = ( + global_batch_size // self.strategy.num_replicas_in_sync + ) + per_replica_input_shapes = [ + (per_replica_batch_size,) + s[1:] for s in input_shapes + ] - # Create the models - f32_model = self._create_model_from_layer(f32_layer, input_shapes) - mp_model = self._create_model_from_layer(mp_layer, input_shapes) - with strategy.scope(): - distributed_mp_model = self._create_model_from_layer( - distributed_mp_layer, per_replica_input_shapes) + # Create the models + f32_model = self._create_model_from_layer(f32_layer, input_shapes) + mp_model = self._create_model_from_layer(mp_layer, input_shapes) + with self.strategy.scope(): + distributed_mp_model = self._create_model_from_layer( + distributed_mp_layer, per_replica_input_shapes + ) - # Set all model weights to the same values - f32_weights = f32_model.get_weights() - mp_model.set_weights(f32_weights) - distributed_mp_model.set_weights(f32_weights) + # Set all model weights to the same values + f32_weights = f32_model.get_weights() + mp_model.set_weights(f32_weights) + distributed_mp_model.set_weights(f32_weights) - # Generate input data - if input_data is None: - # Cast inputs to float16 to avoid measuring error from having f16 layers - # cast to float16. - input_data = [np.random.normal(size=s).astype('float16') - for s in input_shapes] - if len(input_data) == 1: - input_data = input_data[0] + # Generate input data + if input_data is None: + # Cast inputs to float16 to avoid measuring error from having f16 + # layers cast to float16. 
+ input_data = [ + np.random.normal(size=s).astype("float16") for s in input_shapes + ] + if len(input_data) == 1: + input_data = input_data[0] - # Assert all models have close outputs. - f32_output = f32_model.predict(input_data) - mp_output = mp_model.predict(input_data) - self.assertAllClose( - mp_output, f32_output, rtol=rtol, atol=atol) - self.assertAllClose( - distributed_mp_model.predict(input_data), f32_output, rtol=rtol, - atol=atol) + # Assert all models have close outputs. + f32_output = f32_model.predict(input_data) + mp_output = mp_model.predict(input_data) + self.assertAllClose(mp_output, f32_output, rtol=rtol, atol=atol) + self.assertAllClose( + distributed_mp_model.predict(input_data), + f32_output, + rtol=rtol, + atol=atol, + ) - # Run fit() on models - output = np.random.normal(size=f32_model.outputs[0].shape).astype('float16') - for model in f32_model, mp_model, distributed_mp_model: - model.fit(input_data, output, batch_size=global_batch_size) + # Run fit() on models + output = np.random.normal(size=f32_model.outputs[0].shape).astype( + "float16" + ) + for model in f32_model, mp_model, distributed_mp_model: + model.fit(input_data, output, batch_size=global_batch_size) - # Assert all models have close weights - f32_weights = f32_model.get_weights() - self.assertAllClose( - mp_model.get_weights(), f32_weights, rtol=rtol, atol=atol) - self.assertAllClose( - distributed_mp_model.get_weights(), f32_weights, rtol=rtol, atol=atol) + # Assert all models have close weights + f32_weights = f32_model.get_weights() + self.assertAllClose( + mp_model.get_weights(), f32_weights, rtol=rtol, atol=atol + ) + self.assertAllClose( + distributed_mp_model.get_weights(), + f32_weights, + rtol=rtol, + atol=atol, + ) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/mixed_precision/layer_test.py b/keras/mixed_precision/layer_test.py index 404649a99417..b45133d0a5ca 100644 --- a/keras/mixed_precision/layer_test.py +++ b/keras/mixed_precision/layer_test.py @@ -14,13 +14,12 @@ # ============================================================================== """Tests keras.layers.Layer works properly with mixed precision.""" -import tensorflow.compat.v2 as tf - import os -from absl.testing import parameterized import numpy as np -from keras.testing_infra import test_combinations +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + from keras import layers from keras import models from keras.engine import base_layer @@ -28,15 +27,16 @@ from keras.engine import input_spec from keras.mixed_precision import policy from keras.mixed_precision import test_util as mp_test_util -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent +from keras.testing_infra import test_combinations class MultiplyLayerWithFunction(mp_test_util.MultiplyLayer): - """Same as MultiplyLayer, but _multiply is decorated with a tf.function.""" + """Same as MultiplyLayer, but _multiply is decorated with a tf.function.""" - @tf.function - def _multiply(self, x, y): - return super()._multiply(x, y) + @tf.function + def _multiply(self, x, y): + return super()._multiply(x, y) # If called outside any strategy.scope() calls, this will return the default @@ -45,381 +45,464 @@ def _multiply(self, x, y): def create_mirrored_strategy(): - """Create a MirroredStrategy, using a GPU if it is available.""" - if tf.config.list_logical_devices('GPU'): - return tf.distribute.MirroredStrategy(['cpu:0', 'gpu:0']) 
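The heart of `test_layer()` above is cloning one float32 layer into a mixed-precision twin through its config. Distilled to a standalone sketch, with `Dense` standing in for the parameterized layer and no distribution strategy involved:

    import numpy as np
    from keras import layers
    from keras.mixed_precision import policy

    f32_layer = layers.Dense(2)
    config = f32_layer.get_config()
    config["dtype"] = policy.Policy("mixed_float16")
    mp_layer = layers.Dense.from_config(config)

    x = np.random.normal(size=(2, 2)).astype("float16")
    y = mp_layer(x)
    print(y.dtype)                # float16: the policy's compute dtype
    print(mp_layer.kernel.dtype)  # float32: variables stay full precision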
- else: - return tf.distribute.MirroredStrategy(['cpu:0']) + """Create a MirroredStrategy, using a GPU if it is available.""" + if tf.config.list_logical_devices("GPU"): + return tf.distribute.MirroredStrategy(["cpu:0", "gpu:0"]) + else: + return tf.distribute.MirroredStrategy(["cpu:0"]) def create_central_storage_strategy(): - """Create a CentralStorageStrategy, using a GPU if it is available.""" - compute_devices = ['cpu:0', 'gpu:0'] if ( - tf.config.list_logical_devices('GPU')) else ['cpu:0'] - return tf.distribute.experimental.CentralStorageStrategy( - compute_devices, parameter_device='cpu:0') + """Create a CentralStorageStrategy, using a GPU if it is available.""" + compute_devices = ( + ["cpu:0", "gpu:0"] + if (tf.config.list_logical_devices("GPU")) + else ["cpu:0"] + ) + return tf.distribute.experimental.CentralStorageStrategy( + compute_devices, parameter_device="cpu:0" + ) -TESTCASES = ({ - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn -}, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy -}) +TESTCASES = ( + {"testcase_name": "base", "strategy_fn": default_strategy_fn}, + {"testcase_name": "distribute", "strategy_fn": create_mirrored_strategy}, +) -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LayerTest(test_combinations.TestCase): - """Test mixed precision with Keras layers.""" - - @parameterized.named_parameters(*TESTCASES) - def test_mixed_policies_(self, strategy_fn): - strategy = strategy_fn() - for dtype in 'float16', 'bfloat16': - x = tf.constant([1.]) - policy_name = 'mixed_' + dtype - with strategy.scope(), policy.policy_scope(policy_name): - layer = mp_test_util.MultiplyLayer(assert_type=dtype) - self.assertEqual(layer.dtype, tf.float32) - self.assertEqual(layer.dtype_policy.name, policy_name) - y = layer(x) - self.assertEqual(layer.v.dtype, tf.float32) - self.assertEqual(y.dtype, dtype) - self.assertEqual(layer.dtype_policy.name, policy_name) - self.assertIsInstance(layer.dtype_policy, policy.Policy) - self.assertEqual(layer.compute_dtype, dtype) - self.assertEqual(layer.dtype, tf.float32) - self.assertEqual(layer.variable_dtype, tf.float32) - self.assertEqual(layer.dtype_policy.name, policy_name) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(y), 1.) - - def test_layer_with_int_variable(self): - class LayerWithIntVar(base_layer.Layer): - - def build(self, _): - self.v = self.add_weight('v', dtype='int32', trainable=False) - - def call(self, inputs): - # Only float variables should be autocasted. This will fail if self.v is - # autocasted to float32 - return tf.cast(inputs, 'int32') + self.v - - x = tf.constant([1.]) - layer = LayerWithIntVar(dtype='mixed_float16') - self.assertEqual(layer(x).dtype, 'int32') - - @parameterized.named_parameters(*TESTCASES) - def test_layer_with_non_autocast_variable(self, strategy_fn): - x = tf.constant([1.]) - with strategy_fn().scope(): - with policy.policy_scope('mixed_float16'): - layer = mp_test_util.MultiplyLayerWithoutAutoCast( - assert_type=tf.float16) - y = layer(x) - self.assertEqual(layer.v.dtype, tf.float32) - self.assertEqual(y.dtype, tf.float16) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(y), 1.) 
- - @parameterized.named_parameters(*TESTCASES) - def test_layer_calling_tf_function(self, strategy_fn): - x = tf.constant([1.]) - with strategy_fn().scope(): - with policy.policy_scope('mixed_float16'): - layer = MultiplyLayerWithFunction(assert_type=tf.float16) - y = layer(x) - self.assertEqual(layer.v.dtype, tf.float32) - self.assertEqual(y.dtype, tf.float16) + """Test mixed precision with Keras layers.""" + + @parameterized.named_parameters(*TESTCASES) + def test_mixed_policies_(self, strategy_fn): + strategy = strategy_fn() + for dtype in "float16", "bfloat16": + x = tf.constant([1.0]) + policy_name = "mixed_" + dtype + with strategy.scope(), policy.policy_scope(policy_name): + layer = mp_test_util.MultiplyLayer(assert_type=dtype) + self.assertEqual(layer.dtype, tf.float32) + self.assertEqual(layer.dtype_policy.name, policy_name) + y = layer(x) + self.assertEqual(layer.v.dtype, tf.float32) + self.assertEqual(y.dtype, dtype) + self.assertEqual(layer.dtype_policy.name, policy_name) + self.assertIsInstance(layer.dtype_policy, policy.Policy) + self.assertEqual(layer.compute_dtype, dtype) + self.assertEqual(layer.dtype, tf.float32) + self.assertEqual(layer.variable_dtype, tf.float32) + self.assertEqual(layer.dtype_policy.name, policy_name) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(y), 1.0) + + def test_layer_with_int_variable(self): + class LayerWithIntVar(base_layer.Layer): + def build(self, _): + self.v = self.add_weight("v", dtype="int32", trainable=False) + + def call(self, inputs): + # Only float variables should be autocasted. This will fail if + # self.v is autocasted to float32 + return tf.cast(inputs, "int32") + self.v + + x = tf.constant([1.0]) + layer = LayerWithIntVar(dtype="mixed_float16") + self.assertEqual(layer(x).dtype, "int32") + + @parameterized.named_parameters(*TESTCASES) + def test_layer_with_non_autocast_variable(self, strategy_fn): + x = tf.constant([1.0]) + with strategy_fn().scope(): + with policy.policy_scope("mixed_float16"): + layer = mp_test_util.MultiplyLayerWithoutAutoCast( + assert_type=tf.float16 + ) + y = layer(x) + self.assertEqual(layer.v.dtype, tf.float32) + self.assertEqual(y.dtype, tf.float16) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(y), 1.0) + + @parameterized.named_parameters(*TESTCASES) + def test_layer_calling_tf_function(self, strategy_fn): + x = tf.constant([1.0]) + with strategy_fn().scope(): + with policy.policy_scope("mixed_float16"): + layer = MultiplyLayerWithFunction(assert_type=tf.float16) + y = layer(x) + self.assertEqual(layer.v.dtype, tf.float32) + self.assertEqual(y.dtype, tf.float16) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(y), 1.0) + + @parameterized.named_parameters(*TESTCASES) + def test_layer_regularizer_runs_in_var_dtype(self, strategy_fn): + x = tf.constant([1.0]) + with strategy_fn().scope(): + with policy.policy_scope("mixed_float16"): + # Test on MultiplyLayer + layer = mp_test_util.MultiplyLayer( + assert_type=tf.float16, + regularizer=mp_test_util.IdentityRegularizer(), + ) + layer(x) + (regularizer_loss,) = layer.losses + self.assertEqual(regularizer_loss.dtype, tf.float32) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(regularizer_loss), 1.0) + + # Test on MultiplyLayerWithoutAutoCast + layer = mp_test_util.MultiplyLayerWithoutAutoCast( + assert_type=tf.float16, + regularizer=mp_test_util.IdentityRegularizer(), + ) + 
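The assertions in the reindented `test_mixed_policies_` above reduce to the following behavior, sketched here with `MultiplyLayer`'s defaults (no `assert_type`, no strategy scope):

    import tensorflow as tf
    from keras.mixed_precision import policy
    from keras.mixed_precision import test_util as mp_test_util

    with policy.policy_scope("mixed_float16"):
        layer = mp_test_util.MultiplyLayer()
    y = layer(tf.constant([1.0]))
    print(layer.compute_dtype)   # float16
    print(layer.variable_dtype)  # float32
    print(y.dtype)               # <dtype: 'float16'>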
layer(x) + (regularizer_loss,) = layer.losses + self.assertEqual(regularizer_loss.dtype, tf.float32) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(regularizer_loss), 1.0) + + @parameterized.named_parameters(*TESTCASES) + def test_passing_policy_to_layer(self, strategy_fn): + x = tf.constant([1.0], dtype=tf.float16) + with strategy_fn().scope(): + # Passing a Policy to 'dtype' sets the policy for that layer. + layer = mp_test_util.MultiplyLayer( + assert_type=tf.float16, dtype=policy.Policy("mixed_float16") + ) + # layer.dtype refers to the variable dtype + self.assertEqual(layer.dtype, tf.float32) + layer(x) + self.assertEqual(layer.v.dtype, tf.float32) + with policy.policy_scope("mixed_float16"): + # Passing a Policy to dtype overrides the global Policy + layer = mp_test_util.MultiplyLayer( + assert_type=tf.float64, dtype=policy.Policy("float64") + ) + self.assertEqual(layer.dtype_policy.name, "float64") + self.assertIsInstance(layer.dtype_policy, policy.Policy) + self.assertEqual(layer.compute_dtype, tf.float64) + self.assertEqual(layer.dtype, tf.float64) + self.assertEqual(layer.variable_dtype, tf.float64) + self.assertEqual(layer(x).dtype, tf.float64) + self.assertEqual(layer.v.dtype, tf.float64) + + @parameterized.named_parameters(*TESTCASES) + def test_gradient(self, strategy_fn): + x = tf.constant([1.0]) + with strategy_fn().scope() as strategy: + with policy.policy_scope("mixed_float16"): + layer = mp_test_util.MultiplyLayer(assert_type=tf.float16) + # Learning rate is small enough that if applied to a float16 + # variable, the variable will not change. So this tests the + # learning rate is not applied to a float16 value, but instead + # the float32 variable. + opt = gradient_descent.SGD(2**-14) + + def run_fn(): + with tf.GradientTape() as tape: + y = layer(x) + # Divide by num_replicas_in_sync, as the effective total + # loss is the sum of each of the replica's losses. + y /= strategy.num_replicas_in_sync + + grad = tape.gradient(y, layer.v) + return opt.apply_gradients([(grad, layer.v)]) + + op = strategy.experimental_run(run_fn) + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(op) + # The gradient with respective to the variable is 1. Since the + # variable is initialized with 1 and the learning rate is + # 2**-14, the new variable value should be: init_val - gradient + # * learning_rate, which is 1 - 1 * 2**-14 + self.assertEqual(self.evaluate(layer.v), 1 - 2**-14) + + def _test_checkpointing_layer_weights( + self, strategy_fn, mixed_prec_when_saving, mixed_prec_when_loading + ): + # In this test, we potentially save with mixed precision enabled and + # load with mixed precision disabled, or vice versa. This is possible + # because variables are float32 regardless of whether mixed precision is + # enabled. + save_policy = "mixed_float16" if mixed_prec_when_saving else "float32" + load_policy = "mixed_float16" if mixed_prec_when_loading else "float32" + save_input_dtype = "float16" if mixed_prec_when_saving else "float32" + load_input_dtype = "float16" if mixed_prec_when_loading else "float32" + + # Create a layer and save a checkpoint. 
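Why `test_gradient` above uses a learning rate of 2**-14: the gradient with respect to the variable is 1, so the update is exactly 2**-14, which survives when applied to a float32 variable but rounds away entirely in float16 arithmetic near 1.0. A quick numpy check (illustrative only):

    import numpy as np

    lr = 2.0**-14
    print(np.float32(1.0) - np.float32(lr))  # 0.9999390 (update survives)
    print(np.float16(1.0) - np.float16(lr))  # 1.0 (update is rounded away)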
+ x = tf.constant([1.0]) + with strategy_fn().scope(): + with policy.policy_scope(save_policy): + layer = mp_test_util.MultiplyLayer(assert_type=save_input_dtype) + layer(x) # Build layer + layer.set_weights([np.array(100.0)]) + self.assertEqual(self.evaluate(layer(x)), 100.0) + checkpoint = tf.train.Checkpoint(layer=layer) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + save_path = checkpoint.save(prefix) + + # Create a new layer and restore the checkpoint. + x = tf.constant([1.0]) + with strategy_fn().scope(): + with policy.policy_scope(load_policy): + layer = mp_test_util.MultiplyLayer(assert_type=load_input_dtype) + layer(x) # Build layer + layer.set_weights([np.array(200.0)]) + self.assertEqual(self.evaluate(layer(x)), 200.0) + checkpoint = tf.train.Checkpoint(layer=layer) + checkpoint.restore(save_path).assert_consumed().run_restore_ops() + self.assertEqual(layer.get_weights(), [100.0]) + self.assertEqual(self.evaluate(layer(x)), 100.0) + + @parameterized.named_parameters(*TESTCASES) + def test_checkpointing_layer_weights(self, strategy_fn): + with self.test_session(): + self._test_checkpointing_layer_weights( + strategy_fn, + mixed_prec_when_saving=True, + mixed_prec_when_loading=True, + ) + self._test_checkpointing_layer_weights( + strategy_fn, + mixed_prec_when_saving=True, + mixed_prec_when_loading=False, + ) + self._test_checkpointing_layer_weights( + strategy_fn, + mixed_prec_when_saving=False, + mixed_prec_when_loading=True, + ) + + @parameterized.named_parameters(*TESTCASES) + def test_config(self, strategy_fn): + x = tf.constant([1.0], dtype=tf.float16) + with strategy_fn().scope(): + for layer, dtype in ( + (mp_test_util.MultiplyLayer(), "float32"), + (mp_test_util.MultiplyLayer(dtype="float64"), "float64"), + ( + mp_test_util.MultiplyLayer(dtype=policy.Policy("float64")), + "float64", + ), + ): + config = layer.get_config() + self.assertEqual(config["dtype"], dtype) + self.assertIsInstance(config["dtype"], str) + layer = mp_test_util.MultiplyLayer.from_config(config) + self.assertEqual(layer.dtype, dtype) + self.assertEqual(layer(x).dtype, dtype) + self.assertEqual(layer.v.dtype, dtype) + + layer = mp_test_util.MultiplyLayer(dtype="mixed_float16") + config = layer.get_config() + if tf.__internal__.tf2.enabled(): + self.assertEqual( + config["dtype"], + { + "module": "keras.mixed_precision", + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + "registered_name": None, + }, + ) + else: + self.assertEqual( + config["dtype"], + { + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + }, + ) + layer = mp_test_util.MultiplyLayer.from_config(config) + self.assertEqual(layer.dtype, "float32") + self.assertEqual(layer(x).dtype, "float16") + self.assertEqual(layer.v.dtype, "float32") + config = layer.get_config() + if tf.__internal__.tf2.enabled(): + self.assertEqual( + config["dtype"], + { + "module": "keras.mixed_precision", + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + "registered_name": None, + }, + ) + else: + self.assertEqual( + config["dtype"], + { + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + }, + ) + + layer = mp_test_util.MultiplyLayer(dtype=policy.Policy("_infer")) + config = layer.get_config() + self.assertIsNone(config["dtype"]) + layer = mp_test_util.MultiplyLayer.from_config(config) + # If a layer is serialized with the "_infer" policy, when + # deserialized into TF 2 it will have the global policy instead of + # "_infer". 
This is because "_infer" is serialized into None, and + # passing dtype=None in TensorFlow 2 indicates to use the global + # policy. + self.assertEqual(layer.dtype, "float32") + self.assertEqual(layer(x).dtype, "float32") + self.assertEqual(layer.v.dtype, "float32") + + @parameterized.named_parameters(*TESTCASES) + def test_from_config_policy_v1(self, strategy_fn): + # Test that layers serialized in previous Keras versions with the + # now-deleted PolicyV1 can be deserialized. In such cases, the PolicyV1 + # will be converted to a Policy, since PolicyV1 no longer exists. Unlike + # Policy, PolicyV1 had a "loss_scale" field, which is silently dropped + # when deserialized. + x = tf.constant([1.0], dtype=tf.float16) + with strategy_fn().scope(): + layer = mp_test_util.MultiplyLayer(dtype="mixed_float16") + config = layer.get_config() + # Change the serialized dtype policy to a PolicyV1 + if tf.__internal__.tf2.enabled(): + config["dtype"] = { + "module": "keras.mixed_precision", + "class_name": "PolicyV1", + "config": {"name": "mixed_float16", "loss_scale": None}, + "registered_name": None, + } + else: + config["dtype"] = { + "class_name": "PolicyV1", + "config": {"name": "mixed_float16", "loss_scale": None}, + } + layer = mp_test_util.MultiplyLayer.from_config(config) + self.assertEqual(layer.dtype, "float32") + self.assertEqual(layer(x).dtype, "float16") + self.assertEqual(layer.v.dtype, "float32") + config = layer.get_config() + # The loss_scale is silently dropped + if tf.__internal__.tf2.enabled(): + self.assertEqual( + config["dtype"], + { + "module": "keras.mixed_precision", + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + "registered_name": None, + }, + ) + else: + self.assertEqual( + config["dtype"], + { + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + }, + ) + + layer = mp_test_util.MultiplyLayer(dtype="float64") + config = layer.get_config() + config["dtype"] = { + "class_name": "PolicyV1", + "config": { + "name": "float64", + "loss_scale": { + "class_name": "FixedLossScale", + "config": {"loss_scale_value": 2.0}, + }, + }, + } + layer = mp_test_util.MultiplyLayer.from_config(config) + self.assertEqual(layer.dtype, "float64") + self.assertEqual(layer(x).dtype, "float64") + self.assertEqual(layer.v.dtype, "float64") + config = layer.get_config() + self.assertEqual(config["dtype"], "float64") + + layer = mp_test_util.MultiplyLayer(dtype=policy.Policy("_infer")) + config = layer.get_config() + config["dtype"] = { + "class_name": "PolicyV1", + "config": { + "name": "_infer", + "loss_scale": { + "class_name": "FixedLossScale", + "config": {"loss_scale_value": 2.0}, + }, + }, + } + layer = mp_test_util.MultiplyLayer.from_config(config) + self.assertEqual(layer.dtype, None) + self.assertEqual(layer(x).dtype, "float16") + self.assertEqual(layer.v.dtype, "float16") + self.assertEqual(type(layer.dtype_policy), policy.Policy) + config = layer.get_config() + self.assertEqual(config["dtype"], "float16") + + def test_delete_variable(self): + layer = base_layer.Layer(dtype="mixed_float16") + layer.x = layer.add_weight("x") + self.assertEqual(layer.trainable_weights, [layer.x]) + del layer.x + self.assertEqual(layer.trainable_weights, []) + + def test_build_and_call_layer_in_function(self): + layer = mp_test_util.MultiplyLayer(dtype=policy.Policy("mixed_float16")) + + @tf.function + def f(): + return layer(1.0) + + y = f() self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(y), 1.) 
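`test_config` and `test_from_config_policy_v1` above pin down how a dtype policy round-trips through a layer config. Outside the tests, the round trip looks roughly like this; the exact dict shape depends on whether TF2 serialization is enabled, as the tests' two branches show:

    from keras.mixed_precision import test_util as mp_test_util

    layer = mp_test_util.MultiplyLayer(dtype="mixed_float16")
    config = layer.get_config()
    print(config["dtype"]["class_name"])  # Policy
    print(config["dtype"]["config"])      # {'name': 'mixed_float16'}

    restored = mp_test_util.MultiplyLayer.from_config(config)
    print(restored.dtype)          # float32: dtype reports the variable dtype
    print(restored.compute_dtype)  # float16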
- - @parameterized.named_parameters(*TESTCASES) - def test_layer_regularizer_runs_in_var_dtype(self, strategy_fn): - x = tf.constant([1.]) - with strategy_fn().scope(): - with policy.policy_scope('mixed_float16'): - # Test on MultiplyLayer - layer = mp_test_util.MultiplyLayer( - assert_type=tf.float16, - regularizer=mp_test_util.IdentityRegularizer()) - layer(x) - (regularizer_loss,) = layer.losses - self.assertEqual(regularizer_loss.dtype, tf.float32) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(regularizer_loss), 1.) - - # Test on MultiplyLayerWithoutAutoCast - layer = mp_test_util.MultiplyLayerWithoutAutoCast( - assert_type=tf.float16, - regularizer=mp_test_util.IdentityRegularizer()) + self.assertEqual(y.dtype, "float16") + self.assertEqual(layer.v.dtype, "float32") + self.assertEqual(self.evaluate(y), 1.0) + + def test_unsupported_strategy(self): + strategy = create_central_storage_strategy() + with strategy.scope(), self.assertRaisesRegex( + ValueError, + "Mixed precision is not supported with the " + "tf.distribute.Strategy: CentralStorageStrategy.", + ): + mp_test_util.MultiplyLayer(dtype="mixed_float16") + # Non-mixed policies are fine + mp_test_util.MultiplyLayer(dtype=policy.Policy("float64")) + + def test_input_spec_dtype(self): + # Test the InputSpec's dtype is compared against the inputs before the + # layer casts them, not after. + layer = mp_test_util.MultiplyLayer(dtype="float64") + layer.input_spec = input_spec.InputSpec(dtype="float16") + + # Test passing Eager tensors + x = tf.ones((2, 2), dtype="float16") layer(x) - (regularizer_loss,) = layer.losses - self.assertEqual(regularizer_loss.dtype, tf.float32) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(regularizer_loss), 1.) - - @parameterized.named_parameters(*TESTCASES) - def test_passing_policy_to_layer(self, strategy_fn): - x = tf.constant([1.], dtype=tf.float16) - with strategy_fn().scope(): - # Passing a Policy to 'dtype' sets the policy for that layer. - layer = mp_test_util.MultiplyLayer( - assert_type=tf.float16, dtype=policy.Policy('mixed_float16')) - # layer.dtype refers to the variable dtype - self.assertEqual(layer.dtype, tf.float32) - layer(x) - self.assertEqual(layer.v.dtype, tf.float32) - with policy.policy_scope('mixed_float16'): - # Passing a Policy to dtype overrides the global Policy - layer = mp_test_util.MultiplyLayer( - assert_type=tf.float64, dtype=policy.Policy('float64')) - self.assertEqual(layer.dtype_policy.name, 'float64') - self.assertIsInstance(layer.dtype_policy, policy.Policy) - self.assertEqual(layer.compute_dtype, tf.float64) - self.assertEqual(layer.dtype, tf.float64) - self.assertEqual(layer.variable_dtype, tf.float64) - self.assertEqual(layer(x).dtype, tf.float64) - self.assertEqual(layer.v.dtype, tf.float64) - - @parameterized.named_parameters(*TESTCASES) - def test_gradient(self, strategy_fn): - x = tf.constant([1.]) - with strategy_fn().scope() as strategy: - with policy.policy_scope('mixed_float16'): - layer = mp_test_util.MultiplyLayer(assert_type=tf.float16) - # Learning rate is small enough that if applied to a float16 variable, - # the variable will not change. So this tests the learning rate is not - # applied to a float16 value, but instead the float32 variable. 
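`test_input_spec_dtype` above verifies that an `InputSpec` dtype is checked against the raw inputs, before the layer's autocast. In isolation (a sketch; the error text is matched loosely, just as the test's regex does):

    import tensorflow as tf
    from keras import layers
    from keras.engine import input_spec

    layer = layers.Dense(2, dtype="float64")
    layer.input_spec = input_spec.InputSpec(dtype="float16")

    layer(tf.ones((2, 2), dtype="float16"))  # passes: checked pre-cast
    try:
        layer(tf.ones((2, 2), dtype="float64"))
    except ValueError as e:
        print(e)  # ...expected dtype=float16, found dtype=float64...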
- opt = gradient_descent.SGD(2**-14) - - def run_fn(): - with tf.GradientTape() as tape: + x = tf.ones((2, 2), dtype="float64") + with self.assertRaisesRegex( + ValueError, "expected dtype=float16, found dtype=.*float64" + ): + layer(x) + + # Test passing symbolic tensors + x = layers.Input((2,), dtype="float16") + y = layer(x) + model = models.Model(x, y) + model(tf.ones((2, 2))) + + x = layers.Input((2,), dtype="float64") + with self.assertRaisesRegex( + ValueError, "expected dtype=float16, found dtype=.*float64" + ): + # In TF2, the error is only raised when the model is run y = layer(x) - # Divide by num_replicas_in_sync, as the effective total loss is the - # sum of each of the replica's losses. - y /= strategy.num_replicas_in_sync - - grad = tape.gradient(y, layer.v) - return opt.apply_gradients([(grad, layer.v)]) - - op = strategy.experimental_run(run_fn) - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(op) - # The gradient with respective to the variable is 1. Since the - # variable is initialized with 1 and the learning rate is 2**-14, the - # new variable value should be: init_val - gradient * learning_rate, - # which is 1 - 1 * 2**-14 - self.assertEqual(self.evaluate(layer.v), 1 - 2**-14) - - def _test_checkpointing_layer_weights(self, strategy_fn, - mixed_prec_when_saving, - mixed_prec_when_loading): - # In this test, we potentially save with mixed precision enabled and load - # with mixed precision disabled, or vice versa. This is possible because - # variables are float32 regardless of whether mixed precision is enabled. - save_policy = 'mixed_float16' if mixed_prec_when_saving else 'float32' - load_policy = 'mixed_float16' if mixed_prec_when_loading else 'float32' - save_input_dtype = 'float16' if mixed_prec_when_saving else 'float32' - load_input_dtype = 'float16' if mixed_prec_when_loading else 'float32' - - # Create a layer and save a checkpoint. - x = tf.constant([1.]) - with strategy_fn().scope(): - with policy.policy_scope(save_policy): - layer = mp_test_util.MultiplyLayer(assert_type=save_input_dtype) - layer(x) # Build layer - layer.set_weights([np.array(100.)]) - self.assertEqual(self.evaluate(layer(x)), 100.) - checkpoint = tf.train.Checkpoint(layer=layer) - prefix = os.path.join(self.get_temp_dir(), 'ckpt') - save_path = checkpoint.save(prefix) - - # Create a new layer and restore the checkpoint. - x = tf.constant([1.]) - with strategy_fn().scope(): - with policy.policy_scope(load_policy): - layer = mp_test_util.MultiplyLayer(assert_type=load_input_dtype) - layer(x) # Build layer - layer.set_weights([np.array(200.)]) - self.assertEqual(self.evaluate(layer(x)), 200.) - checkpoint = tf.train.Checkpoint(layer=layer) - checkpoint.restore(save_path).assert_consumed().run_restore_ops() - self.assertEqual(layer.get_weights(), [100.]) - self.assertEqual(self.evaluate(layer(x)), 100.) 
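The checkpointing test above works because variables are float32 regardless of the active policy, so a checkpoint written under `mixed_float16` restores cleanly under `float32` and vice versa. A minimal standalone illustration (the temp path and layer choice are mine, not the test's):

    import tensorflow as tf
    from keras import layers

    saved = layers.Dense(1, dtype="mixed_float16")
    saved.build((None, 1))
    path = tf.train.Checkpoint(layer=saved).save("/tmp/mp_ckpt")  # illustrative path

    restored = layers.Dense(1, dtype="float32")
    restored.build((None, 1))
    tf.train.Checkpoint(layer=restored).restore(path)
    print(restored.kernel.dtype)  # float32 either way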
- - @parameterized.named_parameters(*TESTCASES) - def test_checkpointing_layer_weights(self, strategy_fn): - with self.test_session(): - self._test_checkpointing_layer_weights( - strategy_fn, mixed_prec_when_saving=True, - mixed_prec_when_loading=True) - self._test_checkpointing_layer_weights( - strategy_fn, mixed_prec_when_saving=True, - mixed_prec_when_loading=False) - self._test_checkpointing_layer_weights( - strategy_fn, mixed_prec_when_saving=False, - mixed_prec_when_loading=True) - - @parameterized.named_parameters(*TESTCASES) - def test_config(self, strategy_fn): - x = tf.constant([1.], dtype=tf.float16) - with strategy_fn().scope(): - for layer, dtype in ( - (mp_test_util.MultiplyLayer(), 'float32'), - (mp_test_util.MultiplyLayer(dtype='float64'), 'float64'), - (mp_test_util.MultiplyLayer(dtype=policy.Policy('float64')), - 'float64')): - config = layer.get_config() - self.assertEqual(config['dtype'], dtype) - self.assertIsInstance(config['dtype'], str) - layer = mp_test_util.MultiplyLayer.from_config(config) - self.assertEqual(layer.dtype, dtype) - self.assertEqual(layer(x).dtype, dtype) - self.assertEqual(layer.v.dtype, dtype) - - layer = mp_test_util.MultiplyLayer(dtype='mixed_float16') - config = layer.get_config() - self.assertEqual(config['dtype'], - {'class_name': 'Policy', - 'config': {'name': 'mixed_float16'}}) - layer = mp_test_util.MultiplyLayer.from_config(config) - self.assertEqual(layer.dtype, 'float32') - self.assertEqual(layer(x).dtype, 'float16') - self.assertEqual(layer.v.dtype, 'float32') - config = layer.get_config() - self.assertEqual(config['dtype'], - {'class_name': 'Policy', - 'config': {'name': 'mixed_float16'}}) - - layer = mp_test_util.MultiplyLayer(dtype=policy.Policy('_infer')) - config = layer.get_config() - self.assertIsNone(config['dtype']) - layer = mp_test_util.MultiplyLayer.from_config(config) - # If a layer is serialized with the "_infer" policy, when deserialized - # into TF 2 it will have the global policy instead of "_infer". This is - # because "_infer" is serialized into None, and passing dtype=None in - # TensorFlow 2 indicates to use the global policy. - self.assertEqual(layer.dtype, 'float32') - self.assertEqual(layer(x).dtype, 'float32') - self.assertEqual(layer.v.dtype, 'float32') - - @parameterized.named_parameters(*TESTCASES) - def test_from_config_policy_v1(self, strategy_fn): - # Test that layers serialized in previous Keras versions with the - # now-deleted PolicyV1 can be deserialized. In such cases, the PolicyV1 will - # be converted to a Policy, since PolicyV1 no longer exists. Unlike Policy, - # PolicyV1 had a "loss_scale" field, which is silently dropped when - # deserialized. 
- x = tf.constant([1.], dtype=tf.float16) - with strategy_fn().scope(): - - layer = mp_test_util.MultiplyLayer(dtype='mixed_float16') - config = layer.get_config() - # Change the serialized dtype policy to a PolicyV1 - config['dtype'] = {'class_name': 'PolicyV1', - 'config': {'name': 'mixed_float16', - 'loss_scale': None}} - layer = mp_test_util.MultiplyLayer.from_config(config) - self.assertEqual(layer.dtype, 'float32') - self.assertEqual(layer(x).dtype, 'float16') - self.assertEqual(layer.v.dtype, 'float32') - config = layer.get_config() - # The loss_scale is silently dropped - self.assertEqual(config['dtype'], - {'class_name': 'Policy', - 'config': {'name': 'mixed_float16'}}) - - layer = mp_test_util.MultiplyLayer(dtype='float64') - config = layer.get_config() - config['dtype'] = {'class_name': 'PolicyV1', - 'config': {'name': 'float64', - 'loss_scale': { - 'class_name': 'FixedLossScale', - 'config': {'loss_scale_value': 2.0}}}} - layer = mp_test_util.MultiplyLayer.from_config(config) - self.assertEqual(layer.dtype, 'float64') - self.assertEqual(layer(x).dtype, 'float64') - self.assertEqual(layer.v.dtype, 'float64') - config = layer.get_config() - self.assertEqual(config['dtype'], 'float64') - - layer = mp_test_util.MultiplyLayer(dtype=policy.Policy('_infer')) - config = layer.get_config() - config['dtype'] = {'class_name': 'PolicyV1', - 'config': {'name': '_infer', - 'loss_scale': { - 'class_name': 'FixedLossScale', - 'config': {'loss_scale_value': 2.0}}}} - layer = mp_test_util.MultiplyLayer.from_config(config) - self.assertEqual(layer.dtype, None) - self.assertEqual(layer(x).dtype, 'float16') - self.assertEqual(layer.v.dtype, 'float16') - self.assertEqual(type(layer.dtype_policy), policy.Policy) - config = layer.get_config() - self.assertEqual(config['dtype'], 'float16') - - def test_delete_variable(self): - layer = base_layer.Layer(dtype='mixed_float16') - layer.x = layer.add_weight('x') - self.assertEqual(layer.trainable_weights, [layer.x]) - del layer.x - self.assertEqual(layer.trainable_weights, []) - - def test_build_and_call_layer_in_function(self): - layer = mp_test_util.MultiplyLayer(dtype=policy.Policy('mixed_float16')) - @tf.function - def f(): - return layer(1.) - y = f() - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(y.dtype, 'float16') - self.assertEqual(layer.v.dtype, 'float32') - self.assertEqual(self.evaluate(y), 1.) - - def test_unsupported_strategy(self): - strategy = create_central_storage_strategy() - with strategy.scope(), self.assertRaisesRegex( - ValueError, 'Mixed precision is not supported with the ' - 'tf.distribute.Strategy: CentralStorageStrategy. Either ' - 'stop using mixed precision by removing the use of the ' - '"mixed_float16" policy or use a different Strategy, e.g. ' - 'a MirroredStrategy.'): - mp_test_util.MultiplyLayer(dtype='mixed_float16') - # Non-mixed policies are fine - mp_test_util.MultiplyLayer(dtype=policy.Policy('float64')) - - def test_input_spec_dtype(self): - # Test the InputSpec's dtype is compared against the inputs before the layer - # casts them, not after. 
- layer = mp_test_util.MultiplyLayer(dtype='float64') - layer.input_spec = input_spec.InputSpec(dtype='float16') - - # Test passing Eager tensors - x = tf.ones((2, 2), dtype='float16') - layer(x) - x = tf.ones((2, 2), dtype='float64') - with self.assertRaisesRegex( - ValueError, 'expected dtype=float16, found dtype=.*float64'): - layer(x) - - # Test passing symbolic tensors - x = layers.Input((2,), dtype='float16') - y = layer(x) - model = models.Model(x, y) - model(tf.ones((2, 2))) - - x = layers.Input((2,), dtype='float64') - with self.assertRaisesRegex( - ValueError, 'expected dtype=float16, found dtype=.*float64'): - # In TF2, the error is only raised when the model is run - y = layer(x) - model = models.Model(x, y) - model(tf.ones((2, 2))) - - -if __name__ == '__main__': - base_layer_utils.enable_v2_dtype_behavior() - tf.test.main() + model = models.Model(x, y) + model(tf.ones((2, 2))) + + +if __name__ == "__main__": + base_layer_utils.enable_v2_dtype_behavior() + tf.test.main() diff --git a/keras/mixed_precision/loss_scale_optimizer.py b/keras/mixed_precision/loss_scale_optimizer.py index dc35117eec13..4ea1b5d8d9c2 100644 --- a/keras/mixed_precision/loss_scale_optimizer.py +++ b/keras/mixed_precision/loss_scale_optimizer.py @@ -14,77 +14,88 @@ # ============================================================================== """Contains the loss scaling optimizer class.""" +import tensorflow.compat.v2 as tf + from keras import backend from keras import optimizers -from keras.optimizers.optimizer_experimental import optimizer as optimizer_experimental -from keras.optimizers.optimizer_v2 import optimizer_v2 -from keras.optimizers.optimizer_v2 import utils as optimizer_utils -from keras.utils import generic_utils - -import tensorflow.compat.v2 as tf +from keras.dtensor import utils as dtensor_utils +from keras.optimizers import optimizer +from keras.optimizers import utils as optimizer_utils +from keras.optimizers.legacy import optimizer_v2 +from keras.saving import serialization_lib -from tensorflow.python.keras.optimizer_v2 import optimizer_v2 as legacy_optimizer +# isort: off from tensorflow.python.platform import tf_logging from tensorflow.python.util.tf_export import keras_export class _UnwrapPreventer: - """Wrapper that DistributionStrategy will not unwrap. + """Wrapper that DistributionStrategy will not unwrap. - Typically, DistributionStrategy will unwrap values when going from a cross- - replica context to a replica context via `call_for_each_replica`. This class - is a wrapper that DistributionStrategy will not unwrap, so it can be used to - prevent it from unwrapping a value. + Typically, DistributionStrategy will unwrap values when going from a cross- + replica context to a replica context via `call_for_each_replica`. This class + is a wrapper that DistributionStrategy will not unwrap, so it can be used to + prevent it from unwrapping a value. 
- TODO(reedwm): Find/implement a better way of preventing values from being - unwrapped by DistributionStrategy - """ + TODO(reedwm): Find/implement a better way of preventing values from being + unwrapped by DistributionStrategy + """ - __slots__ = ['value'] + __slots__ = ["value"] - def __init__(self, value): - self.value = value + def __init__(self, value): + self.value = value def _is_all_finite(grads): - """Returns a scalar boolean tensor indicating if all gradients are finite.""" - is_finite_per_grad = [ - tf.reduce_all(tf.math.is_finite(g)) for g in grads if g is not None - ] - return tf.reduce_all(is_finite_per_grad) + """Returns a scalar boolean tensor indicating if all gradients are + finite.""" + + def raw_values(g): + return g.values if isinstance(g, tf.IndexedSlices) else g + + is_finite_per_grad = [ + tf.reduce_all(tf.math.is_finite(raw_values(g))) + for g in grads + if g is not None + ] + return tf.reduce_all(is_finite_per_grad) def _op_in_graph_mode(tensor): - """Returns the tensor's op in graph mode, or the tensor in eager mode. + """Returns the tensor's op in graph mode, or the tensor in eager mode. - This is useful because sometimes an op is needed in graph mode instead of a - tensor. In eager mode, there are no ops. + This is useful because sometimes an op is needed in graph mode instead of a + tensor. In eager mode, there are no ops. - Args: - tensor: A tensor. + Args: + tensor: A tensor. - Returns: - The tensor's op in graph mode. The tensor in eager mode. - """ - if tf.executing_eagerly(): - return tensor - return tensor.op + Returns: + The tensor's op in graph mode. The tensor in eager mode. + """ + if tf.executing_eagerly(): + return tensor + return tensor.op def _assign_if_finite(var, value): - """Assigns a value to a variable if the value is finite.""" - return tf.cond( - tf.math.is_finite(value), lambda: _op_in_graph_mode(var.assign(value)), - tf.no_op) + """Assigns a value to a variable if the value is finite.""" + return tf.cond( + tf.math.is_finite(value), + lambda: _op_in_graph_mode(var.assign(value)), + tf.no_op, + ) -def _maybe_warn_about_scaling(loss_has_been_scaled, - gradients_have_been_unscaled): - """Warn if the loss or gradients hasn't been scaled or unscaled.""" - if loss_has_been_scaled and gradients_have_been_unscaled: - return +def _maybe_warn_about_scaling( + loss_has_been_scaled, gradients_have_been_unscaled +): + """Warn if the loss or gradients hasn't been scaled or unscaled.""" + if loss_has_been_scaled and gradients_have_been_unscaled: + return - example_code = """ + example_code = """ with tf.GradientTape() as tape: loss = loss_fn() scaled_loss = opt.get_scaled_loss(loss) @@ -92,1310 +103,1514 @@ def _maybe_warn_about_scaling(loss_has_been_scaled, grads = opt.get_unscaled_gradients(scaled_grads) opt.apply_gradients([(grads, var)])""" - if not loss_has_been_scaled and not gradients_have_been_unscaled: - tf_logging.warning( - 'You forgot to call LossScaleOptimizer.get_scaled_loss() and ' - 'LossScaleOptimizer.get_unscaled_gradients() before calling ' - 'LossScaleOptimizer.apply_gradients(). This will likely result in ' - 'worse model quality, so please call them in the correct places! 
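The `example_code` snippet embedded in `_maybe_warn_about_scaling` above is the canonical custom-training-loop pattern. Fleshed out into a runnable sketch (the optimizer and variable are illustrative, and this assumes a TF build where the legacy optimizer namespace exists, as this patch presumes):

    import tensorflow as tf

    var = tf.Variable(1.0)
    opt = tf.keras.mixed_precision.LossScaleOptimizer(
        tf.keras.optimizers.legacy.SGD(0.1)
    )

    with tf.GradientTape() as tape:
        loss = var * var
        scaled_loss = opt.get_scaled_loss(loss)  # loss * current loss scale
    scaled_grads = tape.gradient(scaled_loss, [var])
    grads = opt.get_unscaled_gradients(scaled_grads)  # divide the scale out
    opt.apply_gradients(zip(grads, [var]))  # also updates the dynamic scale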
For ' - f'example:{example_code}\nFor more information, see ' - 'https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/LossScaleOptimizer' - ) - elif not loss_has_been_scaled: - tf_logging.warning( - 'You forgot to call LossScaleOptimizer.get_scaled_loss() before ' - 'calling LossScaleOptimizer.apply_gradients() (you did call ' - 'get_unscaled_gradients() however). This will likely result in worse ' - 'model quality, so please call get_scaled_loss() in the correct place! ' - f'For example:{example_code}\nFor more information, see ' - 'https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/LossScaleOptimizer' - ) - elif not gradients_have_been_unscaled: - tf_logging.warning( - 'You forgot to call LossScaleOptimizer.get_unscaled_gradients() ' - 'before calling LossScaleOptimizer.apply_gradients() (you did call ' - 'get_scaled_loss() however). This will likely result in worse ' - 'model quality, so please call get_unscaled_gradients() in the correct ' - f'place! For example:{example_code}\nFor more information, see ' - 'https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/LossScaleOptimizer' - ) + if not loss_has_been_scaled and not gradients_have_been_unscaled: + tf_logging.warning( + "You forgot to call LossScaleOptimizer.get_scaled_loss() and " + "LossScaleOptimizer.get_unscaled_gradients() before calling " + "LossScaleOptimizer.apply_gradients(). This will likely result in " + "worse model quality, so please call them in the correct places! " + f"For example:{example_code}\nFor more information, see " + "https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/LossScaleOptimizer" # noqa: E501 + ) + elif not loss_has_been_scaled: + tf_logging.warning( + "You forgot to call LossScaleOptimizer.get_scaled_loss() before " + "calling LossScaleOptimizer.apply_gradients() (you did call " + "get_unscaled_gradients() however). This will likely result in " + "worse model quality, so please call get_scaled_loss() in the " + f"correct place! For example:{example_code}\nFor more information, " + "see " + "https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/LossScaleOptimizer" # noqa: E501 + ) + elif not gradients_have_been_unscaled: + tf_logging.warning( + "You forgot to call LossScaleOptimizer.get_unscaled_gradients() " + "before calling LossScaleOptimizer.apply_gradients() (you did call " + "get_scaled_loss() however). This will likely result in worse " + "model quality, so please call get_unscaled_gradients() in the " + f"correct place! For example:{example_code}\nFor more information, " + "see " + "https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/LossScaleOptimizer" # noqa: E501 + ) class _DynamicLossScaleState(tf.__internal__.tracking.Trackable): - """The state of a dynamic loss scale.""" - - def __init__(self, - initial_loss_scale, - growth_steps, - multiplier): - """Creates the dynamic loss scale.""" - super().__init__() - self._initial_loss_scale = float(initial_loss_scale) - self._growth_steps = int(growth_steps) - self._multiplier = float(multiplier) - - self._weights = {} - self._current_loss_scale = self._add_weight( - name='current_loss_scale', - dtype=tf.float32, - initial_value=self._initial_loss_scale) - # The number of consecutive steps with finite gradients since the last - # nonfinite gradient or change in loss scale. The name is 'good_steps' for - # backwards compatibility with older checkpoints. 
- self._counter = self._add_weight( - name='good_steps', dtype=tf.int64, initial_value=0) - - def _add_weight(self, name, initial_value, dtype=None): - """Adds a weight to this loss scale. - - Args: - name: Variable name. - initial_value: The variable's initial value. - dtype: The type of the variable. - - Returns: - A variable. - - Raises: - RuntimeError: If a weight with `name` has already been added. - """ - variable = tf.Variable( - initial_value=initial_value, - name=name, - dtype=dtype, - trainable=False, - synchronization=tf.VariableSynchronization.AUTO, - # Set aggregation to NONE, as loss scaling variables should never be - # aggregated. - aggregation=tf.VariableAggregation.NONE) - if tf.executing_eagerly(): - graph_key = None - else: - graph = tf.compat.v1.get_default_graph() - graph_key = graph._graph_key # pylint: disable=protected-access - - key = (name, graph_key) - self._weights[key] = variable - self._handle_deferred_dependencies(name=name, trackable=variable) - backend.track_variable(variable) - return variable - - def _trackable_children(self, save_type='checkpoint', **kwargs): - """From Trackable. Gather graph-specific weights to save.""" - if tf.executing_eagerly(): - graph_key = None - else: - graph = tf.compat.v1.get_default_graph() - graph_key = graph._graph_key # pylint: disable=protected-access - weights = {} - for (name, g), v in sorted(self._weights.items(), key=lambda i: i[0][0]): - if g == graph_key: - weights[name] = v - weights.update( - super()._trackable_children(save_type, **kwargs)) - return weights - - def _lookup_dependency(self, name): - """From Trackable. Find a weight in the current graph.""" - unconditional = super()._lookup_dependency(name) - if unconditional is not None: - return unconditional - if tf.executing_eagerly(): - graph_key = None - else: - graph = tf.compat.v1.get_default_graph() - graph_key = graph._graph_key # pylint: disable=protected-access - return self._weights.get((name, graph_key), None) - - @property - def initial_loss_scale(self): - return self._initial_loss_scale - - @property - def growth_steps(self): - return self._growth_steps - - @property - def multiplier(self): - return self._multiplier - - @property - def current_loss_scale(self): - """Returns the current loss scale as a float32 `tf.Variable`.""" - return self._current_loss_scale - - @property - def counter(self): - """Returns the counter as a float32 `tf.Variable`.""" - return self._counter - - def __call__(self): - """Returns the current loss scale as a scalar `float32` tensor.""" - return tf.convert_to_tensor(self._current_loss_scale) - - def update(self, grads): - """Updates the value of the loss scale. - - Args: - grads: A nested structure of unscaled gradients, each which is an - all-reduced gradient of the loss with respect to a weight. - - Returns: - update_op: In eager mode, None. In graph mode, an op to update the loss - scale. - should_apply_gradients: Either a bool or a scalar boolean tensor. If - False, the caller should skip applying `grads` to the variables this - step. - """ - grads = tf.nest.flatten(grads) - if tf.distribute.has_strategy( - ) and tf.distribute.in_cross_replica_context(): - distribution = tf.distribute.get_strategy() - is_finite_per_replica = distribution.extended.call_for_each_replica( - _is_all_finite, args=(grads,)) - # Each replica computed the same `is_finite` value, since `grads` is - # all-reduced across replicas. Arbitrarily take `is_finite` from the first - # replica. 
- is_finite = ( - distribution.experimental_local_results(is_finite_per_replica)[0]) - else: - is_finite = _is_all_finite(grads) - - def update_if_finite_grads(): - """Update assuming the gradients are finite.""" - - def incr_loss_scale(): - new_loss_scale = self.current_loss_scale * self.multiplier - return tf.group( - _assign_if_finite(self.current_loss_scale, new_loss_scale), - self.counter.assign(0)) - - return tf.cond( - self.counter + 1 >= self.growth_steps, - incr_loss_scale, - lambda: _op_in_graph_mode(self.counter.assign_add(1))) - - def update_if_not_finite_grads(): - """Update assuming the gradients are nonfinite.""" - - new_loss_scale = tf.maximum( - self.current_loss_scale / self.multiplier, 1) - return tf.group( - self.counter.assign(0), - self.current_loss_scale.assign(new_loss_scale)) - - update_op = tf.cond(is_finite, - update_if_finite_grads, - update_if_not_finite_grads) - should_apply_gradients = is_finite - return update_op, should_apply_gradients + """The state of a dynamic loss scale.""" + + def __init__(self, initial_loss_scale, growth_steps, multiplier): + """Creates the dynamic loss scale.""" + super().__init__() + self._initial_loss_scale = float(initial_loss_scale) + self._growth_steps = int(growth_steps) + self._multiplier = float(multiplier) + + self._weights = {} + self._current_loss_scale = self._add_weight( + name="current_loss_scale", + dtype=tf.float32, + initial_value=self._initial_loss_scale, + ) + # The number of consecutive steps with finite gradients since the last + # nonfinite gradient or change in loss scale. The name is 'good_steps' + # for backwards compatibility with older checkpoints. + self._counter = self._add_weight( + name="good_steps", dtype=tf.int64, initial_value=0 + ) + + def _add_weight(self, name, initial_value, dtype=None): + """Adds a weight to this loss scale. + + Args: + name: Variable name. + initial_value: The variable's initial value. + dtype: The type of the variable. + + Returns: + A variable. + + Raises: + RuntimeError: If a weight with `name` has already been added. + """ + variable = tf.Variable( + initial_value=initial_value, + name=name, + dtype=dtype, + trainable=False, + synchronization=tf.VariableSynchronization.AUTO, + # Set aggregation to NONE, as loss scaling variables should never be + # aggregated. + aggregation=tf.VariableAggregation.NONE, + ) + if tf.executing_eagerly(): + graph_key = None + else: + graph = tf.compat.v1.get_default_graph() + graph_key = graph._graph_key + + key = (name, graph_key) + self._weights[key] = variable + self._handle_deferred_dependencies(name=name, trackable=variable) + backend.track_variable(variable) + return variable + + def _trackable_children(self, save_type="checkpoint", **kwargs): + """From Trackable. Gather graph-specific weights to save.""" + if tf.executing_eagerly(): + graph_key = None + else: + graph = tf.compat.v1.get_default_graph() + graph_key = graph._graph_key + weights = {} + for (name, g), v in sorted( + self._weights.items(), key=lambda i: i[0][0] + ): + if g == graph_key: + weights[name] = v + weights.update(super()._trackable_children(save_type, **kwargs)) + return weights + + def _lookup_dependency(self, name, cached_dependencies=None): + """From Trackable. 
Find a weight in the current graph.""" + if cached_dependencies is not None: + unconditional = cached_dependencies.get(name) + else: + unconditional = super()._lookup_dependency(name) + if unconditional is not None: + return unconditional + if tf.executing_eagerly(): + graph_key = None + else: + graph = tf.compat.v1.get_default_graph() + graph_key = graph._graph_key + return self._weights.get((name, graph_key), None) + + @property + def initial_loss_scale(self): + return self._initial_loss_scale + + @property + def growth_steps(self): + return self._growth_steps + + @property + def multiplier(self): + return self._multiplier + + @property + def current_loss_scale(self): + """Returns the current loss scale as a float32 `tf.Variable`.""" + return self._current_loss_scale + + @property + def counter(self): + """Returns the counter as an int64 `tf.Variable`.""" + return self._counter + + def __call__(self): + """Returns the current loss scale as a scalar `float32` tensor.""" + return tf.convert_to_tensor(self._current_loss_scale) + + def update(self, grads): + """Updates the value of the loss scale. + + Args: + grads: A nested structure of unscaled gradients, each of which is an + all-reduced gradient of the loss with respect to a weight. + + Returns: + update_op: In eager mode, None. In graph mode, an op to update the + loss scale. + should_apply_gradients: Either a bool or a scalar boolean tensor. If + False, the caller should skip applying `grads` to the variables this + step. + """ + grads = tf.nest.flatten(grads) + if ( + tf.distribute.has_strategy() + and tf.distribute.in_cross_replica_context() + ): + distribution = tf.distribute.get_strategy() + is_finite_per_replica = distribution.extended.call_for_each_replica( + _is_all_finite, args=(grads,) + ) + # Each replica computed the same `is_finite` value, since `grads` is + # all-reduced across replicas. Arbitrarily take `is_finite` from the + # first replica. + is_finite = distribution.experimental_local_results( + is_finite_per_replica + )[0] + else: + is_finite = _is_all_finite(grads) + + def update_if_finite_grads(): + """Update assuming the gradients are finite.""" + + def incr_loss_scale(): + new_loss_scale = self.current_loss_scale * self.multiplier + return tf.group( + _assign_if_finite(self.current_loss_scale, new_loss_scale), + self.counter.assign(0), + ) + + return tf.cond( + self.counter + 1 >= self.growth_steps, + incr_loss_scale, + lambda: _op_in_graph_mode(self.counter.assign_add(1)), + ) + + def update_if_not_finite_grads(): + """Update assuming the gradients are nonfinite.""" + + new_loss_scale = tf.maximum( + self.current_loss_scale / self.multiplier, 1 + ) + return tf.group( + self.counter.assign(0), + self.current_loss_scale.assign(new_loss_scale), + ) + + update_op = tf.cond( + is_finite, update_if_finite_grads, update_if_not_finite_grads + ) + should_apply_gradients = is_finite + return update_op, should_apply_gradients # See LossScaleOptimizer docstring for why this is so big -_DEFAULT_INITIAL_SCALE = 2 ** 15 +_DEFAULT_INITIAL_SCALE = 2**15 _DEFAULT_GROWTH_STEPS = 2000 # TODO(b/215389169): Delete this class after `OptimizerV2` is deprecated. class LossScaleOptimizerMetaclass(type): - """Metaclass that delegates LossScaleOptimizer instance creation. - - This metaclass causes a LossScaleOptimizer or LossScaleOptimizerV3 to be - created when a BaseLossScaleOptimizer is constructed.
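The `update()` method above reduces to a small state machine: grow the scale after `growth_steps` consecutive finite steps, shrink it (never below 1) on any nonfinite step, and reset the counter on every scale change. A dependency-free sketch of that rule (the real code additionally guards the grown scale with `_assign_if_finite` so it never overflows to Inf):

```
def simulate_update(scale, counter, growth_steps, multiplier, grads_finite):
    # Returns (new_scale, new_counter, should_apply_gradients).
    if grads_finite:
        if counter + 1 >= growth_steps:
            return scale * multiplier, 0, True   # grow and reset counter
        return scale, counter + 1, True          # keep counting
    return max(scale / multiplier, 1), 0, False  # shrink, reset, skip step

scale, counter = 2 ** 15, 0
scale, counter, apply_step = simulate_update(scale, counter, 2000, 2, False)
assert (scale, counter, apply_step) == (2 ** 14, 0, False)
```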
As a result, when a - user creates a loss scale optimizer with - `tf.keras.mixed_precision.LossScaleOptimizer(opt)`, either a - LossScaleOptimizer or LossScaleOptimizerV3 will be created, depending on the - type of `opt`. - """ - - def __call__(cls, inner_optimizer, *args, **kwargs): - if cls is not BaseLossScaleOptimizer: - return super(LossScaleOptimizerMetaclass, - cls).__call__(inner_optimizer, *args, **kwargs) - if isinstance(inner_optimizer, optimizer_v2.OptimizerV2): - return LossScaleOptimizer(inner_optimizer, *args, **kwargs) - elif isinstance(inner_optimizer, optimizer_experimental.Optimizer): - return LossScaleOptimizerV3(inner_optimizer, *args, **kwargs) - - # Raise TypeError because inner_optimizer is not an optimizer - msg = (f'"inner_optimizer" must be an instance of ' - f'`tf.keras.optimizers.Optimizer` or ' - f'`tf.keras.optimizers.experimental.Optimizer`, but got: ' - f'{inner_optimizer}.') - if isinstance(inner_optimizer, legacy_optimizer.OptimizerV2): - msg += (' Please make sure "inner_optimizer" is not an instance of ' - '`tensorflow.python.keras.optimizers`, which is ' - 'the legacy keras code and will be removed in future release. ' - 'Please use the tf.keras public API instead.') - raise TypeError(msg) - - -# TODO(b/215389169): Delete this class after `OptimizerV2` is deprecated. -# pylint: disable=g-classes-have-attributes -@keras_export('keras.mixed_precision.LossScaleOptimizer') -class BaseLossScaleOptimizer(metaclass=LossScaleOptimizerMetaclass): - """An optimizer that applies loss scaling to prevent numeric underflow. - - Loss scaling is a technique to prevent numeric underflow in intermediate - gradients when float16 is used. To prevent underflow, the loss is multiplied - (or "scaled") by a certain factor called the "loss scale", which causes - intermediate gradients to be scaled by the loss scale as well. The final - gradients are divided (or "unscaled") by the loss scale to bring them back to - their original value. - - `LossScaleOptimizer` wraps another optimizer and applies loss scaling to it. - By default, the loss scale is dynamically updated over time so you do not have - to choose the loss scale. The `minimize` method automatically scales the loss, - unscales the gradients, and updates the loss scale so all you have to do is - wrap your optimizer with a `LossScaleOptimizer` if you use `minimize`. For - example: - - >>> opt = tf.keras.optimizers.SGD(0.25) - >>> opt = tf.keras.mixed_precision.LossScaleOptimizer(opt) - >>> var = tf.Variable(1.) - >>> loss_fn = lambda: var ** 2 - >>> # 'minimize' applies loss scaling and updates the loss sale. - >>> opt.minimize(loss_fn, var_list=var) - >>> var.numpy() - 0.5 - - If a `tf.GradientTape` is used to compute gradients instead of `minimize`, you - must scale the loss and gradients manually. This can be done with the - `LossScaleOptimizer.get_scaled_loss` and - `LossScaleOptimizer.get_unscaled_gradients` methods. For example: - - >>> with tf.GradientTape() as tape: - ... loss = loss_fn() - ... scaled_loss = opt.get_scaled_loss(loss) - >>> scaled_grad = tape.gradient(scaled_loss, var) - >>> (grad,) = opt.get_unscaled_gradients([scaled_grad]) - >>> opt.apply_gradients([(grad, var)]) # Loss scale is updated here - >>> var.numpy() - 0.25 - - Warning: If you forget to call `get_scaled_loss` or `get_unscaled_gradients` - (or both) when using a `tf.GradientTape`, the model will likely converge to a - worse quality. Please make sure you call each function exactly once. 
- - When mixed precision with float16 is used, there is typically no risk of - underflow affecting model quality if loss scaling is properly used. See - [the mixed precision guide]( - https://www.tensorflow.org/guide/keras/mixed_precision) for more information - on how to use mixed precision. - - Args: - inner_optimizer: The `tf.keras.optimizers.Optimizer` or - `tf.keras.optimizers.experimental.Optimizer` instance to wrap. - dynamic: Bool indicating whether dynamic loss scaling is used. Defaults to - True. If True, the loss scale will be dynamically updated over time using - an algorithm that keeps the loss scale at approximately its optimal value. - If False, a single fixed loss scale is used and `initial_scale` must be - specified, which is used as the loss scale. Recommended to keep as True, - as choosing a fixed loss scale can be tricky. Currently, there is a small - performance overhead to dynamic loss scaling compared to fixed loss - scaling. - initial_scale: The initial loss scale. If `dynamic` is True, this defaults - to `2 ** 15`. If `dynamic` is False, this must be specified and acts as - the sole loss scale, as the loss scale does not change over time. When - dynamic loss scaling is used, is better for this to be a very high number, - because a loss scale that is too high gets lowered far more quickly than a - loss scale that is too low gets raised. - dynamic_growth_steps: With dynamic loss scaling, every - `dynamic_growth_steps` steps with finite gradients, the loss scale is - doubled. Defaults to 2000. If a nonfinite gradient is encountered, the - count is reset back to zero, gradients are skipped that step, and the loss - scale is halved. The count can be queried with - `LossScaleOptimizer.dynamic_counter`. This argument can only be specified - if `dynamic` is True. - - `LossScaleOptimizer` will occasionally skip applying gradients to the - variables, in which case the trainable variables will not change that step. - This is done because the dynamic loss scale will sometimes be raised too - high, causing overflow in the gradients. Typically, the first 2 to 15 steps of - the model are skipped as the initial loss scale is very high, but afterwards - steps will only be skipped on average 0.05% of the time (the fraction of steps - skipped is `1 / dynamic_growth_steps`). - - `LossScaleOptimizer` delegates all public `Optimizer` methods to the inner - optimizer. Additionally, in methods `minimize` and `get_gradients`, it scales - the loss and unscales the gradients. In methods `minimize` and - `apply_gradients`, it additionally updates the loss scale and skips applying - gradients if any gradient has a nonfinite value. - - ### Hyperparameters - - If wrapping a `tf.keras.optimizers.Optimizer`, hyperparameters can be accessed - and set on the LossScaleOptimizer, which will be delegated to the wrapped - optimizer. - - >>> opt = tf.keras.optimizers.Adam(beta_1=0.8, epsilon=1e-5) - >>> opt = tf.keras.mixed_precision.LossScaleOptimizer(opt) - >>> opt.beta_1 # Equivalent to `opt.inner_optimizer.beta_1` - 0.8 - >>> opt.beta_1 = 0.7 # Equivalent to `opt.inner_optimizer.beta_1 = 0.7` - >>> opt.beta_1 - 0.7 - >>> opt.inner_optimizer.beta_1 - 0.7 - - However, accessing or setting non-hyperparameters is not delegated to the - LossScaleOptimizer. In an Adam optimizer, `beta_1` is a hyperparameter but - `epsilon` is not, as the Adam optimizer only calls `Optimizer._set_hyper` on - `beta_1`. - - >>> opt.inner_optimizer.epsilon - 1e-5 - >>> opt.epsilon - Traceback (most recent call last): - ... 
- AttributeError: 'LossScaleOptimizer' object has no attribute 'epsilon' - >>> opt.epsilon = 1e-4 # This does NOT set epsilon on `opt.inner_optimizer` - >>> opt.inner_optimizer.epsilon - >>> 1e-5 - - In the above example, despite epsilon being set on the LossScaleOptimizer, the - old epsilon value will still be used when training as epsilon was not set on - the inner optimizer. - """ - - @property - def dynamic(self): - """Bool indicating whether dynamic loss scaling is used.""" - raise NotImplementedError - - @property - def loss_scale(self): - """The current loss scale as a float32 scalar tensor.""" - raise NotImplementedError - - @property - def dynamic_counter(self): - """The number of steps since the loss scale was last increased or decreased. - - This is None if `LossScaleOptimizer.dynamic` is False. - - The counter is incremented every step. Once it reaches - `LossScaleOptimizer.dynamic_growth_steps`, the loss scale will be doubled - and the counter will be reset back to zero. If nonfinite gradients are - encountered, the loss scale will be halved and the counter will be reset - back to zero. + """Metaclass that delegates LossScaleOptimizer instance creation. + + This metaclass causes a LossScaleOptimizer or LossScaleOptimizerV3 to be + created when a BaseLossScaleOptimizer is constructed. As a result, when a + user creates a loss scale optimizer with + `tf.keras.mixed_precision.LossScaleOptimizer(opt)`, either a + LossScaleOptimizer or LossScaleOptimizerV3 will be created, depending on the + type of `opt`. """ - raise NotImplementedError - @property - def initial_scale(self): - """The initial loss scale. - - If `LossScaleOptimizer.dynamic` is False, this is the same number as - `LossScaleOptimizer.loss_scale`, as the loss scale never changes. - """ - raise NotImplementedError + def __call__(cls, inner_optimizer, *args, **kwargs): + if cls is not BaseLossScaleOptimizer: + return super(LossScaleOptimizerMetaclass, cls).__call__( + inner_optimizer, *args, **kwargs + ) + if isinstance(inner_optimizer, optimizer_v2.OptimizerV2): + return LossScaleOptimizer(inner_optimizer, *args, **kwargs) + elif isinstance(inner_optimizer, optimizer.Optimizer): + return LossScaleOptimizerV3(inner_optimizer, *args, **kwargs) + + # Raise TypeError because inner_optimizer is not an optimizer + msg = ( + '"inner_optimizer" must be an instance of ' + "`tf.keras.optimizers.Optimizer` or " + "`tf.keras.optimizers.experimental.Optimizer`, but got: " + f"{inner_optimizer}." + ) + raise TypeError(msg) - @property - def dynamic_growth_steps(self): - """The number of steps it takes to increase the loss scale. - This is None if `LossScaleOptimizer.dynamic` is False. - - Every `dynamic_growth_steps` consecutive steps with finite gradients, the - loss scale is increased. - """ - raise NotImplementedError - - @property - def inner_optimizer(self): - """The optimizer that this LossScaleOptimizer is wrapping.""" - raise NotImplementedError - - def get_scaled_loss(self, loss): - """Scales the loss by the loss scale. +# TODO(b/215389169): Delete this class after `OptimizerV2` is deprecated. - This method is only needed if you compute gradients manually, e.g. with - `tf.GradientTape`. In that case, call this method to scale the loss before - passing the loss to `tf.GradientTape`. If you use - `LossScaleOptimizer.minimize` or `LossScaleOptimizer.get_gradients`, loss - scaling is automatically applied and this method is unneeded. - If this method is called, `get_unscaled_gradients` should also be called. 
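The dispatch performed by `LossScaleOptimizerMetaclass.__call__` above, reduced to a standalone sketch with stand-in types (`Base`, `WrapsInt`, `WrapsStr` are hypothetical, only to show the mechanism):

```
class _Dispatch(type):
    def __call__(cls, inner, *args, **kwargs):
        if cls is not Base:
            # Concrete subclasses instantiate normally.
            return super().__call__(inner, *args, **kwargs)
        if isinstance(inner, int):
            return WrapsInt(inner, *args, **kwargs)
        if isinstance(inner, str):
            return WrapsStr(inner, *args, **kwargs)
        raise TypeError(f"unsupported inner object: {inner!r}")

class Base(metaclass=_Dispatch):
    def __init__(self, inner):
        self.inner = inner

class WrapsInt(Base):
    pass

class WrapsStr(Base):
    pass

assert type(Base(3)) is WrapsInt
assert type(Base("x")) is WrapsStr
```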
- See the `tf.keras.mixed_precision.LossScaleOptimizer` doc for - an example. +@keras_export("keras.mixed_precision.LossScaleOptimizer") +class BaseLossScaleOptimizer(metaclass=LossScaleOptimizerMetaclass): + """An optimizer that applies loss scaling to prevent numeric underflow. + + Loss scaling is a technique to prevent numeric underflow in intermediate + gradients when float16 is used. To prevent underflow, the loss is multiplied + (or "scaled") by a certain factor called the "loss scale", which causes + intermediate gradients to be scaled by the loss scale as well. The final + gradients are divided (or "unscaled") by the loss scale to bring them back + to their original value. + + `LossScaleOptimizer` wraps another optimizer and applies loss scaling to it. + By default, the loss scale is dynamically updated over time so you do not + have to choose the loss scale. The `minimize` method automatically scales + the loss, unscales the gradients, and updates the loss scale so all you have + to do is wrap your optimizer with a `LossScaleOptimizer` if you use + `minimize`. For example: + + >>> opt = tf.keras.optimizers.experimental.SGD(0.25) + >>> opt = tf.keras.mixed_precision.LossScaleOptimizer(opt) + >>> var = tf.Variable(1.) + >>> loss_fn = lambda: var ** 2 + >>> # 'minimize' applies loss scaling and updates the loss scale. + >>> opt.minimize(loss_fn, var_list=[var]) + >>> var.numpy() + 0.5 + + If a `tf.GradientTape` is used to compute gradients instead of `minimize`, + you must scale the loss and gradients manually. This can be done with the + `LossScaleOptimizer.get_scaled_loss` and + `LossScaleOptimizer.get_unscaled_gradients` methods. For example: + + >>> with tf.GradientTape() as tape: + ... loss = loss_fn() + ... scaled_loss = opt.get_scaled_loss(loss) + >>> scaled_grad = tape.gradient(scaled_loss, var) + >>> (grad,) = opt.get_unscaled_gradients([scaled_grad]) + >>> opt.apply_gradients([(grad, var)]) # Loss scale is updated here + >>> var.numpy() + 0.25 + + Warning: If you forget to call `get_scaled_loss` or `get_unscaled_gradients` + (or both) when using a `tf.GradientTape`, the model will likely converge to + a worse quality. Please make sure you call each function exactly once. + + When mixed precision with float16 is used, there is typically no risk of + underflow affecting model quality if loss scaling is properly used. See + [the mixed precision guide]( + https://www.tensorflow.org/guide/keras/mixed_precision) for more information + on how to use mixed precision. Args: - loss: The loss, which will be multiplied by the loss scale. Can either be - a tensor or a callable returning a tensor. - - Returns: - `loss` multiplied by `LossScaleOptimizer.loss_scale`. + inner_optimizer: The `tf.keras.optimizers.Optimizer` or + `tf.keras.optimizers.experimental.Optimizer` instance to wrap. + dynamic: Bool indicating whether dynamic loss scaling is used. If `True`, + the loss scale will be dynamically updated over time using an algorithm + that keeps the loss scale at approximately its optimal value. If False, + a single fixed loss scale is used and `initial_scale` must be + specified, which is used as the loss scale. + Recommended to keep as True, as choosing a fixed loss scale can be + tricky. Currently, there is a small performance overhead to dynamic loss + scaling compared to fixed loss scaling. Defaults to `True`. + initial_scale: The initial loss scale. If `dynamic` is True, this defaults + to `2 ** 15`.
If `dynamic` is False, this must be specified and acts as + the sole loss scale, as the loss scale does not change over time. When + dynamic loss scaling is used, it is better for this to be a very high + number, because a loss scale that is too high gets lowered far more + quickly than a loss scale that is too low gets raised. + dynamic_growth_steps: With dynamic loss scaling, every + `dynamic_growth_steps` steps with finite gradients, the loss scale is + doubled. If a nonfinite gradient is encountered, the + count is reset back to zero, gradients are skipped that step, and the + loss scale is halved. The count can be queried with + `LossScaleOptimizer.dynamic_counter`. This argument can only be + specified if `dynamic` is True. Defaults to `2000`. + + `LossScaleOptimizer` will occasionally skip applying gradients to the + variables, in which case the trainable variables will not change that step. + This is done because the dynamic loss scale will sometimes be raised too + high, causing overflow in the gradients. Typically, the first 2 to 15 steps + of the model are skipped as the initial loss scale is very high, but + afterwards steps will only be skipped on average 0.05% of the time (the + fraction of steps skipped is `1 / dynamic_growth_steps`). + + `LossScaleOptimizer` delegates all public `Optimizer` methods to the inner + optimizer. Additionally, in methods `minimize` and `get_gradients`, it + scales the loss and unscales the gradients. In methods `minimize` and + `apply_gradients`, it additionally updates the loss scale and skips applying + gradients if any gradient has a nonfinite value. + + ### Hyperparameters + + If wrapping a `tf.keras.optimizers.Optimizer`, hyperparameters can be + accessed and set on the LossScaleOptimizer, which will be delegated to the + wrapped optimizer. + + >>> opt = tf.keras.optimizers.legacy.Adam(beta_1=0.8, epsilon=1e-5) + >>> opt = tf.keras.mixed_precision.LossScaleOptimizer(opt) + >>> opt.beta_1 # Equivalent to `opt.inner_optimizer.beta_1` + 0.8 + >>> opt.beta_1 = 0.7 # Equivalent to `opt.inner_optimizer.beta_1 = 0.7` + >>> opt.beta_1 + 0.7 + >>> opt.inner_optimizer.beta_1 + 0.7 + + However, accessing or setting non-hyperparameters is not delegated to the + LossScaleOptimizer. In an Adam optimizer, `beta_1` is a hyperparameter but + `epsilon` is not, as the Adam optimizer only calls `Optimizer._set_hyper` on + `beta_1`. + + >>> opt.inner_optimizer.epsilon + 1e-5 + >>> opt.epsilon + Traceback (most recent call last): + ... + AttributeError: 'LossScaleOptimizer' object has no attribute 'epsilon' + >>> opt.epsilon = 1e-4 # This does NOT set epsilon on `opt.inner_optimizer` + >>> opt.inner_optimizer.epsilon + 1e-5 + + In the above example, despite epsilon being set on the LossScaleOptimizer, + the old epsilon value will still be used when training as epsilon was not + set on the inner optimizer. """ - # Calls to this function would be delegated to `get_scaled_loss` - # of either `LossScaleOptimizer` or `LossScaleOptimizerV3`, depending on - # the type of `inner_optimizer`. - raise NotImplementedError - - def get_unscaled_gradients(self, grads): - """Unscales the gradients by the loss scale. - - This method is only needed if you compute gradients manually, e.g. with - `tf.GradientTape`. In that case, call this method to unscale the gradients - after computing them with `tf.GradientTape`. If you use - `LossScaleOptimizer.minimize` or `LossScaleOptimizer.get_gradients`, loss - scaling is automatically applied and this method is unneeded.
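A short usage sketch of the two modes the `dynamic` / `initial_scale` / `dynamic_growth_steps` arguments describe (assumes a TF build where `tf.keras.optimizers.experimental.SGD` exists, as in the doctest above):

```
import tensorflow as tf

# Dynamic scaling (the default): initial_scale and dynamic_growth_steps
# fall back to 2 ** 15 and 2000 respectively.
dyn = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.experimental.SGD())
assert dyn.dynamic and int(dyn.initial_scale) == 2 ** 15

# Fixed scaling: dynamic=False requires an explicit initial_scale; the
# scale then never changes and dynamic_counter stays None.
fixed = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.experimental.SGD(), dynamic=False, initial_scale=1024)
assert not fixed.dynamic and fixed.dynamic_counter is None
```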
- - If this method is called, `get_scaled_loss` should also be called. See - the `tf.keras.mixed_precision.LossScaleOptimizer` doc for an - example. - Args: - grads: A list of tensors, each which will be divided by the loss scale. - Can have None values, which are ignored. - - Returns: - A new list the same size as `grads`, where every non-None value in `grads` - is divided by `LossScaleOptimizer.loss_scale`. + @property + def dynamic(self): + """Bool indicating whether dynamic loss scaling is used.""" + raise NotImplementedError + + @property + def loss_scale(self): + """The current loss scale as a float32 scalar tensor.""" + raise NotImplementedError + + @property + def dynamic_counter(self): + """The number of steps since the loss scale was last increased or + decreased. + + This is None if `LossScaleOptimizer.dynamic` is False. + + The counter is incremented every step. Once it reaches + `LossScaleOptimizer.dynamic_growth_steps`, the loss scale will be + doubled and the counter will be reset back to zero. If nonfinite + gradients are encountered, the loss scale will be halved and the counter + will be reset back to zero. + """ + raise NotImplementedError + + @property + def initial_scale(self): + """The initial loss scale. + + If `LossScaleOptimizer.dynamic` is False, this is the same number as + `LossScaleOptimizer.loss_scale`, as the loss scale never changes. + """ + raise NotImplementedError + + @property + def dynamic_growth_steps(self): + """The number of steps it takes to increase the loss scale. + + This is None if `LossScaleOptimizer.dynamic` is False. + + Every `dynamic_growth_steps` consecutive steps with finite gradients, + the loss scale is increased. + """ + raise NotImplementedError + + @property + def inner_optimizer(self): + """The optimizer that this LossScaleOptimizer is wrapping.""" + raise NotImplementedError + + def get_scaled_loss(self, loss): + """Scales the loss by the loss scale. + + This method is only needed if you compute gradients manually, e.g. with + `tf.GradientTape`. In that case, call this method to scale the loss + before passing the loss to `tf.GradientTape`. If you use + `LossScaleOptimizer.minimize` or `LossScaleOptimizer.get_gradients`, + loss scaling is automatically applied and this method is unneeded. + + If this method is called, `get_unscaled_gradients` should also be + called. See the `tf.keras.mixed_precision.LossScaleOptimizer` doc for + an example. + + Args: + loss: The loss, which will be multiplied by the loss scale. Can either + be a tensor or a callable returning a tensor. + + Returns: + `loss` multiplied by `LossScaleOptimizer.loss_scale`. + """ + # Calls to this function would be delegated to `get_scaled_loss` + # of either `LossScaleOptimizer` or `LossScaleOptimizerV3`, depending on + # the type of `inner_optimizer`. + raise NotImplementedError + + def get_unscaled_gradients(self, grads): + """Unscales the gradients by the loss scale. + + This method is only needed if you compute gradients manually, e.g. with + `tf.GradientTape`. In that case, call this method to unscale the + gradients after computing them with `tf.GradientTape`. If you use + `LossScaleOptimizer.minimize` or `LossScaleOptimizer.get_gradients`, + loss scaling is automatically applied and this method is unneeded. + + If this method is called, `get_scaled_loss` should also be called. See + the `tf.keras.mixed_precision.LossScaleOptimizer` doc for an + example. + + Args: + grads: A list of tensors, each of which will be divided by the loss
scale. Can have None values, which are ignored. + + Returns: + A new list the same size as `grads`, where every non-None value in + `grads` is divided by `LossScaleOptimizer.loss_scale`. + """ + # Calls to this function would be delegated to `get_unscaled_gradients` + # of either `LossScaleOptimizer` or `LossScaleOptimizerV3`, depending on + # the type of `inner_optimizer`. + raise NotImplementedError + + +class LossScaleOptimizer( + tf.__internal__.tracking.DelegatingTrackableMixin, + optimizer_v2.OptimizerV2, + BaseLossScaleOptimizer, +): + """An optimizer that applies loss scaling to prevent numeric underflow.""" + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + inner_optimizer, + dynamic=True, + initial_scale=None, + dynamic_growth_steps=None, + ): + if not isinstance(inner_optimizer, optimizer_v2.OptimizerV2): + if isinstance(inner_optimizer, optimizer.Optimizer): + # Give better error message if the new experimental optimizer is + # passed. + raise TypeError( + "You passed an instance of the new experimental " + "optimizer, `optimizer.Optimizer`, " + "to LossScaleOptimizer, but " + "only the classic optimizers subclassing from " + "`tf.keras.optimizers.Optimizer` can be passed. Please " + "use `loss_scale_optimizer.LossScaleOptimizerV3` " + "instead of " + "`tf.keras.mixed_precision.LossScaleOptimizer`, " + "as the former supports wrapping " + "instances of the new experimental optimizer. " + f"Got optimizer: {inner_optimizer}" + ) + msg = ( + '"inner_optimizer" must be an instance of ' + "`tf.keras.optimizers.Optimizer`, but got: %s. " + % inner_optimizer + ) + raise TypeError(msg) + if not isinstance(dynamic, bool): + # Catch errors if a user incorrectly passes a string or float to the + # second argument, as this was commonly done for the + # now-removed LossScaleOptimizerV1. + raise TypeError( + '"dynamic" argument to LossScaleOptimizer.__init__ must ' + "be a bool, but got: %r" % (dynamic,) + ) + if isinstance(inner_optimizer, LossScaleOptimizer): + raise TypeError( + "LossScaleOptimizer cannot wrap another " + "LossScaleOptimizer, but got: %s" % (inner_optimizer,) + ) + _raise_if_strategy_unsupported() + if getattr( + inner_optimizer, "_is_wrapped_by_loss_scale_optimizer", False + ): + # TODO(reedwm): Maybe support this. The difficulty is that LSO has + # the same checkpoint format as the inner optimizer, so multiple + # LSOs wrapping the same optimizer causes the checkpointing logic to + # become confused. + raise ValueError( + '"inner_optimizer" is already wrapped by a ' + "LossScaleOptimizer. An optimizer can only be wrapped " + "by a single LossScaleOptimizer" + ) + self._optimizer = inner_optimizer + self._optimizer._is_wrapped_by_loss_scale_optimizer = True + + # We don't call super().__init__, since we do not want to call + # OptimizerV2's constructor.
+ tf.__internal__.tracking.DelegatingTrackableMixin.__init__( + self, self._optimizer + ) + + if dynamic: + if initial_scale is None: + initial_scale = _DEFAULT_INITIAL_SCALE + if dynamic_growth_steps is None: + dynamic_growth_steps = _DEFAULT_GROWTH_STEPS + self._loss_scale = _DynamicLossScaleState( + initial_scale, dynamic_growth_steps, multiplier=2 + ) + self._track_trackable(self._loss_scale, "loss_scale") + else: + if initial_scale is None: + raise ValueError( + '"initial_scale" must be specified if "dynamic" is False' + ) + self._loss_scale = float(initial_scale) + if dynamic_growth_steps is not None: + raise ValueError( + '"dynamic_growth_steps" must be None if "dynamic" ' + "is False, but got: %s" % (dynamic_growth_steps,) + ) + + # Used to track whether get_scaled_loss() and get_unscaled_gradients() + # have been called + self._loss_has_been_scaled = False + self._gradients_have_been_unscaled = False + + # To support restoring TensorFlow 2.2 checkpoints. + self._track_trackable( + FakeOptimizerForRestoration(self._optimizer), "base_optimizer" + ) + + @property + def dynamic(self): + return isinstance(self._loss_scale, _DynamicLossScaleState) + + @property + def loss_scale(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return tf.convert_to_tensor(self._loss_scale.current_loss_scale) + else: + return tf.convert_to_tensor(self._loss_scale) + + @property + def dynamic_counter(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return self._loss_scale.counter + else: + return None + + @property + def initial_scale(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return self._loss_scale.initial_loss_scale + else: + return self._loss_scale + + @property + def dynamic_growth_steps(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return self._loss_scale.growth_steps + else: + return None + + @property + def inner_optimizer(self): + return self._optimizer + + def get_scaled_loss(self, loss): + self._loss_has_been_scaled = True + if callable(loss): + + def new_loss(): + loss_val = loss() + return loss_val * tf.cast(self.loss_scale, loss_val.dtype) + + return new_loss + else: + return loss * tf.cast(self.loss_scale, loss.dtype) + + def get_unscaled_gradients(self, grads): + self._gradients_have_been_unscaled = True + loss_scale_reciprocal = 1.0 / self.loss_scale + return [ + _multiply_gradient(g, loss_scale_reciprocal) + if g is not None + else None + for g in grads + ] + + def _compute_gradients(self, loss, var_list, grad_loss=None, tape=None): + tape = tf.GradientTape() if tape is None else tape + with tape: + loss = self.get_scaled_loss(loss) + grads_and_vars = self._optimizer._compute_gradients( + loss, var_list, grad_loss, tape=tape + ) + grads = [g for g, _ in grads_and_vars] + weights = [v for _, v in grads_and_vars] + unscaled_grads = self.get_unscaled_gradients(grads) + return list(zip(unscaled_grads, weights)) + + def get_gradients(self, loss, params): + loss = self.get_scaled_loss(loss) + grads = self._optimizer.get_gradients(loss, params) + return self.get_unscaled_gradients(grads) + + def _create_all_weights(self, var_list): + self._optimizer._create_all_weights(var_list) + + def apply_gradients( + self, grads_and_vars, name=None, experimental_aggregate_gradients=True + ): + if tf.distribute.in_cross_replica_context(): + raise ValueError( + "apply_gradients() must be called in a replica context." 
+ ) + # We check for the strategy here despite already checking in the + # constructor as frequently the optimizer is created outside the + # strategy's scope. + _raise_if_strategy_unsupported() + _maybe_warn_about_scaling( + self._loss_has_been_scaled, self._gradients_have_been_unscaled + ) + + grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars) + if experimental_aggregate_gradients: + # We must aggregate the gradients here instead of in + # self.optimizer.apply_gradients, so that any NaN or Inf gradients + # are propagated to each replica. If any replica has a NaN or Inf + # gradient, they must all have a NaN or Inf gradient so that they + # all skip the step. + grads_and_vars = self._optimizer._transform_unaggregated_gradients( + grads_and_vars + ) + grads_and_vars = self._optimizer._aggregate_gradients( + grads_and_vars + ) + + grads_and_vars = tuple(grads_and_vars) + grads = [g for g, _ in grads_and_vars] + # We do not want DistributionStrategy to unwrap any MirroredVariables in + # grads_and_vars, because even in a replica context, the wrapped + # optimizer expects mirrored variables. So we wrap the variables with an + # _UnwrapPreventer, preventing DistributionStrategy from unwrapping the + # MirroredVariables. + wrapped_vars = _UnwrapPreventer([v for _, v in grads_and_vars]) + + def do_not_apply_fn(): + # Normally self._optimizer.iterations is incremented in + # self._optimizer.apply_gradients(). Since that is not called in + # this branch, we increment it here instead. + return self._optimizer.iterations.assign_add(1, read_value=False) + + def _if_should_apply_grads(grads): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return self._loss_scale.update(grads) + else: + return (tf.no_op(), True) + + if tf.__internal__.distribute.strategy_supports_no_merge_call(): + loss_scale_update_op, should_apply_grads = _if_should_apply_grads( + grads + ) + + def apply_fn(): + return self._apply_gradients(grads, wrapped_vars, name) + + maybe_apply_op = tf.__internal__.smart_cond.smart_cond( + should_apply_grads, apply_fn, do_not_apply_fn + ) + return tf.group(maybe_apply_op, loss_scale_update_op) + + else: + + def _apply_gradients_cross_replica( + distribution, grads, wrapped_vars, name + ): + ( + loss_scale_update_op, + should_apply_grads, + ) = _if_should_apply_grads(grads) + + def apply_fn(): + return distribution.extended.call_for_each_replica( + self._apply_gradients, args=(grads, wrapped_vars, name) + ) + + # Note: We must call this cond() in a cross-replica context. + # DistributionStrategy does not support having a cond in a + # replica context with a branch that calls `merge_call`, and + # self._optimizer.apply_gradients calls `merge_call`. + maybe_apply_op = tf.__internal__.smart_cond.smart_cond( + should_apply_grads, apply_fn, do_not_apply_fn + ) + return tf.group(maybe_apply_op, loss_scale_update_op) + + return tf.distribute.get_replica_context().merge_call( + _apply_gradients_cross_replica, args=(grads, wrapped_vars, name) + ) + + def _apply_gradients(self, grads, wrapped_vars, name): + # Pass experimental_aggregate_gradients=False since LossScaleOptimizer + # already aggregated the gradients. + # TODO(reedwm): This will raise a fairly cryptic error message if + # self._optimizer.apply_gradients does not take + # experimental_aggregate_gradients. 
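An eager-mode sketch of the skip path that `do_not_apply_fn` and the `smart_cond` above implement: a nonfinite gradient halves the scale and leaves the variable untouched, while `iterations` still advances (fresh optimizer and default strategy assumed):

```
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.legacy.SGD(1.0))
var = tf.Variable(1.0)

opt.apply_gradients([(tf.constant(float("nan")), var)])
assert var.numpy() == 1.0                  # update skipped
assert float(opt.loss_scale) == 2.0 ** 14  # 2 ** 15 halved
assert int(opt.iterations) == 1            # counter still incremented
```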
+ return self._optimizer.apply_gradients( + list(zip(grads, wrapped_vars.value)), + name=name, + experimental_aggregate_gradients=False, + ) + + def get_config(self): + serialized_optimizer = optimizers.serialize(self._optimizer) + return { + "inner_optimizer": serialized_optimizer, + "dynamic": self.dynamic, + "initial_scale": self.initial_scale, + "dynamic_growth_steps": self.dynamic_growth_steps, + } + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() # Make a copy, since we mutate config + if "loss_scale" in config: + # If loss_scale is in config, we assume we are deserializing a + # LossScaleOptimizer from TF 2.3 or below. We convert the config so + # it can be deserialized in the current LossScaleOptimizer. + loss_scale = serialization_lib.deserialize_keras_object( + config.pop("loss_scale"), + module_objects={ + "FixedLossScale": tf.compat.v1.mixed_precision.FixedLossScale, # noqa: E501 + "DynamicLossScale": tf.compat.v1.mixed_precision.DynamicLossScale, # noqa: E501 + }, + printable_module_name="loss scale", + ) + + if isinstance( + loss_scale, tf.compat.v1.mixed_precision.FixedLossScale + ): + config["dynamic"] = False + config["initial_scale"] = loss_scale._loss_scale_value + elif isinstance( + loss_scale, tf.compat.v1.mixed_precision.DynamicLossScale + ): + config["dynamic"] = True + config["initial_scale"] = loss_scale.initial_loss_scale + config["dynamic_growth_steps"] = loss_scale.increment_period + if loss_scale.multiplier != 2: + raise ValueError( + "Cannot deserialize LossScaleOptimizer with a " + "DynamicLossScale whose multiplier is not 2. Got " + "DynamicLossScale: %s" % (loss_scale,) + ) + else: + raise ValueError( + "Serialized LossScaleOptimizers with a LossScale that is " + "neither a FixedLossScale nor a DynamicLossScale can no " + "longer be deserialized" + ) + config["inner_optimizer"] = config.pop("optimizer") + if isinstance(config["inner_optimizer"], optimizer_v2.OptimizerV2): + inner_optimizer = config["inner_optimizer"] + else: + inner_optimizer = optimizers.deserialize( + config["inner_optimizer"], + custom_objects=custom_objects, + use_legacy_optimizer=True, + ) + del config["inner_optimizer"] + return cls(inner_optimizer, **config) + + # Delegations: We delegate most OptimizerV2 methods to the wrapped optimizer + # below. 
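`get_config()` / `from_config()` above give a plain round-trip; a sketch (fixed-scale mode keeps the asserted values deterministic):

```
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.legacy.SGD(), dynamic=False, initial_scale=256)
config = opt.get_config()
# The config carries the serialized inner optimizer plus the three
# loss-scale fields ("dynamic", "initial_scale", "dynamic_growth_steps").
clone = type(opt).from_config(config)
assert clone.dynamic is False
assert float(clone.initial_scale) == 256.0
```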
+ + @property + def iterations(self): + return self._optimizer.iterations + + @iterations.setter + def iterations(self, variable): + self._optimizer.iterations = variable + + def get_slot_names(self): + return self._optimizer.get_slot_names() + + def variables(self): + return self._optimizer.variables() + + @property + def weights(self): + return self._optimizer.weights + + def get_weights(self): + return self._optimizer.get_weights() + + def set_weights(self, weights): + return self._optimizer.set_weights(weights) + + @property + def clipnorm(self): + return self._optimizer.clipnorm + + @clipnorm.setter + def clipnorm(self, val): + self._optimizer.clipnorm = val + + @property + def global_clipnorm(self): + return self._optimizer.global_clipnorm + + @global_clipnorm.setter + def global_clipnorm(self, val): + self._optimizer.global_clipnorm = val + + @property + def clipvalue(self): + return self._optimizer.clipvalue + + @clipvalue.setter + def clipvalue(self, val): + self._optimizer.clipvalue = val + + def _aggregate_gradients(self, grads_and_vars): + return self._optimizer._aggregate_gradients(grads_and_vars) + + def _restore_slot_variable(self, slot_name, variable, slot_variable): + return self._optimizer._restore_slot_variable( + slot_name, + variable, + slot_variable, + ) + + def _create_or_restore_slot_variable( + self, slot_variable_position, slot_name, variable + ): + return self._optimizer._create_or_restore_slot_variable( + slot_variable_position, slot_name, variable + ) + + def get_slot(self, var, slot_name): + return self._optimizer.get_slot(var, slot_name) + + def add_slot(self, var, slot_name, initializer="zeros"): + return self._optimizer.add_slot(var, slot_name, initializer) + + def __getattribute__(self, name): + try: + return object.__getattribute__(self, name) + except AttributeError as e: + if name == "_optimizer" or name == "_hyper": + # Avoid infinite recursion + raise e + + # Delegate hyperparameter accesses to inner optimizer. + if name == "lr": + name = "learning_rate" + if name in self._optimizer._hyper: + return self._optimizer._get_hyper(name) + raise e + + def __dir__(self): + result = set(super().__dir__()) + if "_optimizer" in result: + result |= self._optimizer._hyper.keys() + if "learning_rate" in self._optimizer._hyper.keys(): + result.add("lr") + return list(result) + + def __setattr__(self, name, value): + if name == "lr": + name = "learning_rate" + # Delegate setting hyperparameter to inner optimizer if the attribute + # does not exist on the LossScaleOptimizer + try: + # We cannot check for the 'iterations' attribute as it cannot be set + # after it is accessed. + if name != "iterations": + object.__getattribute__(self, name) + has_attribute = True + except AttributeError: + has_attribute = False + if ( + name != "_optimizer" + and name in self._optimizer._hyper + and not has_attribute + ): + self._optimizer._set_hyper(name, value) + else: + super().__setattr__(name, value) + + # Explicitly delegate learning_rate. Normally hyperparameters are delegated + # in __getattribute__, but if a hyperparameter is not in + # self._optimizer._hyper (e.g. because self._optimizer itself wraps another + # optimizer), then it won't be delegated. Since learning_rate is a very + # commonly accessed hyperparameter, we delegate it here. 
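The `__getattribute__` / `__setattr__` overrides above implement transparent hyperparameter forwarding. The same idea in miniature, using `__getattr__` (which fires only on lookup misses) to keep the sketch short; `Inner` and `Wrapper` are stand-ins, not Keras classes:

```
class Inner:
    def __init__(self):
        self._hyper = {"learning_rate": 0.1, "beta_1": 0.9}

class Wrapper:
    def __init__(self, inner):
        self._inner = inner

    def __getattr__(self, name):
        # Called only when normal attribute lookup fails, mirroring the
        # fallthrough above; "lr" aliases "learning_rate".
        if name == "lr":
            name = "learning_rate"
        try:
            return self._inner._hyper[name]
        except KeyError:
            raise AttributeError(name) from None

w = Wrapper(Inner())
assert w.beta_1 == 0.9 and w.lr == 0.1
```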
+ @property + def learning_rate(self): + return self._optimizer.learning_rate + + @learning_rate.setter + def learning_rate(self, value): + self._optimizer.learning_rate = value + + @property + def lr(self): + return self._optimizer.learning_rate + + @lr.setter + def lr(self, value): + self._optimizer.lr = value + + # We do not override some OptimizerV2 methods. For each, we describe why we + # do not delegate them to self._optimizer: + # * get_updates: get_updates() calls get_gradients(). Since we override + # get_gradients(), we cannot delegate get_updates() to self._optimizer, + # otherwise the overridden get_gradients() method would not be called. + # Luckily, get_updates() does not access any OptimizerV2 fields, so + # inheriting the OptimizerV2 version works fine. + # * minimize: We don't delegate for a similar reason as get_updates(): it calls + # both self._compute_gradients() and self.apply_gradients(), and both need + # to have the LossScaleOptimizer version called. + + # TODO(reedwm): Maybe throw an error if mixed precision is used without this + # optimizer being used. + + +class LossScaleOptimizerV3( + tf.__internal__.tracking.DelegatingTrackableMixin, + optimizer.Optimizer, + BaseLossScaleOptimizer, +): + """An optimizer that applies loss scaling to prevent numeric underflow. + + This is a copy of the `mixed_precision.LossScaleOptimizer` class + defined above, except it subclasses and wraps the new experimental Optimizer + class instead of the `tf.keras.optimizers.Optimizer` class. Some of the + methods this class defines and calls are different compared to + LossScaleOptimizer due to the differences between the two Optimizer base + classes. Additionally, this class does not support the legacy graph mode, + but LossScaleOptimizer does. + + Since the new experimental Optimizer does not have a hyperparameter concept, + LossScaleOptimizerV3 does not delegate arbitrary hyperparameter accesses to + the inner optimizer, unlike LossScaleOptimizer. LossScaleOptimizerV3 does + delegate the "learning_rate" attribute, however. """ - # Calls to this function would be delegated to `get_unscaled_gradients` - # of either `LossScaleOptimizer` or `LossScaleOptimizerV3`, depending on - # the type of `inner_optimizer`. - raise NotImplementedError + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__( + self, + inner_optimizer, + dynamic=True, + initial_scale=None, + dynamic_growth_steps=None, + ): + if not isinstance(inner_optimizer, optimizer.Optimizer): + if isinstance(inner_optimizer, optimizer_v2.OptimizerV2): + # Give better error message if the OptimizerV2 class is passed + # instead of the new experimental optimizer. + raise TypeError( + "You passed a `tf.keras.optimizers.Optimizer` instance to " + "LossScaleOptimizerV3, but only the new experimental " + "optimizer defined in " + "keras/optimizer_experimental/optimizer.py can be " + "passed. Please use " + "`tf.keras.mixed_precision.LossScaleOptimizer` " + "instead of LossScaleOptimizerV3, as the former supports " + "`tf.keras.optimizers.Optimizer`s. Got optimizer: " + f"{inner_optimizer}" + ) + raise TypeError( + '"inner_optimizer" must be an instance of ' + f"Optimizer, but got: {inner_optimizer}." + ) + if not isinstance(dynamic, bool): + # Catch errors if a user incorrectly passes a string or float to the + # second argument, as this was commonly done for the + # now-removed LossScaleOptimizerV1.
+ raise TypeError( + '"dynamic" argument to LossScaleOptimizer.__init__ must ' + f"be a bool, but got: {repr(dynamic)}" + ) + if isinstance(inner_optimizer, LossScaleOptimizerV3): + raise TypeError( + "LossScaleOptimizer cannot wrap another " + f"LossScaleOptimizer, but got: {inner_optimizer}" + ) + _raise_if_strategy_unsupported() + if getattr( + inner_optimizer, "_is_wrapped_by_loss_scale_optimizer", False + ): + # TODO(reedwm): Maybe support this. The difficulty is that LSO has + # the same checkpoint format as the inner optimizer, so multiple + # LSOs wrapping the same optimizer causes the checkpointing logic to + # become confused. + raise ValueError( + '"inner_optimizer" is already wrapped by a ' + "LossScaleOptimizer. An optimizer can only be wrapped " + "by a single LossScaleOptimizer" + ) + self._optimizer = inner_optimizer + self._optimizer._is_wrapped_by_loss_scale_optimizer = True + + # We don't call super().__init__, since we do not want to call + # Optimizer's constructor. + tf.__internal__.tracking.DelegatingTrackableMixin.__init__( + self, self._optimizer + ) + + if dynamic: + if initial_scale is None: + initial_scale = _DEFAULT_INITIAL_SCALE + if dynamic_growth_steps is None: + dynamic_growth_steps = _DEFAULT_GROWTH_STEPS + self._loss_scale = _DynamicLossScaleState( + initial_scale, dynamic_growth_steps, multiplier=2 + ) + self._track_trackable(self._loss_scale, "loss_scale") + else: + if initial_scale is None: + raise ValueError( + '"initial_scale" must be specified if "dynamic" is False' + ) + self._loss_scale = float(initial_scale) + if dynamic_growth_steps is not None: + raise ValueError( + '"dynamic_growth_steps" must be None if "dynamic" ' + f"is False, but got: {dynamic_growth_steps}" + ) + + # Used to track whether get_scaled_loss() and get_unscaled_gradients() + # have been called + self._loss_has_been_scaled = False + self._gradients_have_been_unscaled = False + + @property + def dynamic(self): + return isinstance(self._loss_scale, _DynamicLossScaleState) + + @property + def loss_scale(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return tf.convert_to_tensor(self._loss_scale.current_loss_scale) + else: + return tf.convert_to_tensor(self._loss_scale) + + @property + def dynamic_counter(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return self._loss_scale.counter + else: + return None + + @property + def initial_scale(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return self._loss_scale.initial_loss_scale + else: + return self._loss_scale + + @property + def dynamic_growth_steps(self): + if isinstance(self._loss_scale, _DynamicLossScaleState): + return self._loss_scale.growth_steps + else: + return None + + @property + def inner_optimizer(self): + return self._optimizer + + def get_scaled_loss(self, loss): + self._loss_has_been_scaled = True + if callable(loss): + + def new_loss(): + loss_val = loss() + return loss_val * tf.cast(self.loss_scale, loss_val.dtype) + + return new_loss + else: + return loss * tf.cast(self.loss_scale, loss.dtype) + + def get_unscaled_gradients(self, grads): + self._gradients_have_been_unscaled = True + loss_scale_reciprocal = 1.0 / self.loss_scale + return [ + _multiply_gradient(g, loss_scale_reciprocal) + if g is not None + else None + for g in grads + ] + + def compute_gradients(self, loss, var_list, tape=None): + tape = tf.GradientTape() if tape is None else tape + with tape: + loss = self.get_scaled_loss(loss) + grads_and_vars = self._optimizer.compute_gradients( + 
loss, var_list, tape=tape + ) + grads = [g for g, _ in grads_and_vars] + weights = [v for _, v in grads_and_vars] + unscaled_grads = self.get_unscaled_gradients(grads) + return list(zip(unscaled_grads, weights)) + + def apply_gradients( + self, grads_and_vars, skip_gradients_aggregation=False, **kwargs + ): + grads_and_vars = list(grads_and_vars) + grads, trainable_variables = zip(*grads_and_vars) + with tf.init_scope(): + # Lift variable creation to init scope to avoid environment + # issues. + self.build(trainable_variables) + if tf.distribute.in_cross_replica_context(): + raise ValueError( + "apply_gradients() must be called in a replica context." + ) + # We check for the strategy here despite already checking in the + # constructor as frequently the optimizer is created outside the + # strategy's scope. + _raise_if_strategy_unsupported() + _maybe_warn_about_scaling( + self._loss_has_been_scaled, self._gradients_have_been_unscaled + ) + + grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars) + # `experimental_aggregate_gradients` is an arg in `apply_gradients` of + # v2 optimizer -- the reverse of `skip_gradients_aggregation`. + # We read it from kwargs for backward compatibility. + experimental_aggregate_gradients = kwargs.pop( + "experimental_aggregate_gradients", True + ) + run_with_dtensor = ( + # `_run_with_dtensor` is for dtensor based strategy scope, and + # `_mesh` is when user explicitly specify the mesh setting for + # optimizer. + self._optimizer._run_with_dtensor + or self._optimizer._mesh + ) + + if ( + not skip_gradients_aggregation + and experimental_aggregate_gradients + and not run_with_dtensor + ): + # We must aggregate the gradients here instead of in + # self.optimizer.apply_gradients, so that any NaN or Inf gradients + # are propagated to each replica. If any replica has a NaN or Inf + # gradient, they must all have a NaN or Inf gradient so that they + # all skip the step. + grads_and_vars = self._optimizer.aggregate_gradients(grads_and_vars) + + grads_and_vars = tuple(grads_and_vars) + grads = [g for g, _ in grads_and_vars] + # We do not want DistributionStrategy to unwrap any MirroredVariables in + # grads_and_vars, because even in a replica context, the wrapped + # optimizer expects mirrored variables. So we wrap the variables with an + # _UnwrapPreventer, preventing DistributionStrategy from unwrapping the + # MirroredVariables. + wrapped_vars = _UnwrapPreventer([v for _, v in grads_and_vars]) + + def do_not_apply_fn(): + # Normally self._optimizer.iterations is incremented in + # self._optimizer.apply_gradients(). Since that is not called in + # this branch, we increment it here instead. 
+ self._optimizer.iterations.assign_add(1, read_value=False) + + def _if_should_apply_grads(grads): + if isinstance(self._loss_scale, _DynamicLossScaleState): + _, should_apply_grad = self._loss_scale.update(grads) + return should_apply_grad + else: + return True + + if tf.__internal__.distribute.strategy_supports_no_merge_call(): + should_apply_grads = _if_should_apply_grads(grads) + + def apply_fn(): + return self._apply_gradients(grads, wrapped_vars) + + tf.__internal__.smart_cond.smart_cond( + should_apply_grads, apply_fn, do_not_apply_fn + ) + else: + + def _apply_gradients_cross_replica( + distribution, grads, wrapped_vars + ): + should_apply_grads = _if_should_apply_grads(grads) + + def apply_fn(): + distribution.extended.call_for_each_replica( + self._apply_gradients, args=(grads, wrapped_vars) + ) + + # Note: We must call this cond() in a cross-replica context. + # DistributionStrategy does not support having a cond in a + # replica context with a branch that calls `merge_call`, and + # self._optimizer.apply_gradients calls `merge_call`. + tf.__internal__.smart_cond.smart_cond( + should_apply_grads, apply_fn, do_not_apply_fn + ) + + tf.distribute.get_replica_context().merge_call( + _apply_gradients_cross_replica, args=(grads, wrapped_vars) + ) + + def _apply_gradients(self, grads, wrapped_vars): + # Pass skip_gradients_aggregation=True since LossScaleOptimizer + # already aggregated the gradients. + self._optimizer.apply_gradients( + list(zip(grads, wrapped_vars.value)), + skip_gradients_aggregation=True, + ) + + def get_config(self): + serialized_optimizer = optimizers.serialize(self._optimizer) + return { + "inner_optimizer": serialized_optimizer, + "dynamic": self.dynamic, + "initial_scale": self.initial_scale, + "dynamic_growth_steps": self.dynamic_growth_steps, + } + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() # Make a copy, since we mutate config + if isinstance(config["inner_optimizer"], optimizer.Optimizer): + inner_optimizer = config["inner_optimizer"] + else: + inner_optimizer = optimizers.deserialize( + config["inner_optimizer"], + custom_objects=custom_objects, + use_legacy_optimizer=False, + ) + del config["inner_optimizer"] + return cls(inner_optimizer, **config) + + @property + def iterations(self): + return self._optimizer.iterations + + @iterations.setter + def iterations(self, variable): + self._optimizer.iterations = variable + + @property + def variables(self): + return self._optimizer.variables + + def build(self, var_list): + return self._optimizer.build(var_list) + + @property + def learning_rate(self): + return self._optimizer.learning_rate + + @learning_rate.setter + def learning_rate(self, learning_rate): + self._optimizer.learning_rate = learning_rate + + @property + def use_ema(self): + return self._optimizer.use_ema + + @use_ema.setter + def use_ema(self, use_ema): + self._optimizer.use_ema = use_ema + + @property + def ema_momentum(self): + return self._optimizer.ema_momentum + + @ema_momentum.setter + def ema_momentum(self, ema_momentum): + self._optimizer.ema_momentum = ema_momentum + + def finalize_variable_values(self, var_list): + self._optimizer.finalize_variable_values(var_list) -# pylint: disable=g-classes-have-attributes -class LossScaleOptimizer(tf.__internal__.tracking.DelegatingTrackableMixin, - optimizer_v2.OptimizerV2, BaseLossScaleOptimizer): - """An optimizer that applies loss scaling to prevent numeric underflow.""" - _HAS_AGGREGATE_GRAD = True +class 
FakeOptimizerForRestoration(tf.__internal__.tracking.Trackable): + """A fake optimizer used to support restoring TensorFlow 2.2 checkpoints. + + The checkpoint format for LossScaleOptimizers changed after TF 2.2. This + class exists to support restoring TF 2.2 checkpoints in newer versions of + TensorFlow. + + In TF 2.2, LossScaleOptimizer would track the wrapped optimizer by calling + the following in LossScaleOptimizer.__init__ + + ``` + self._track_trackable(self._optimizer, 'base_optimizer') + ``` + + This means a dependency from the LossScaleOptimizer to the wrapped optimizer + would be stored in the checkpoint. However now, the checkpoint format with a + LossScaleOptimizer is the same as the format without a LossScaleOptimizer, + except the loss scale is also stored. This means there is no dependency from + the LossScaleOptimizer to the wrapped optimizer. Instead, the + LossScaleOptimizer acts as if it is the wrapped optimizer, from a + checkpoint's perspective, by overriding all Trackable methods and delegating + them to the wrapped optimizer. + + To allow restoring TF 2.2 checkpoints, LossScaleOptimizer adds a dependency + on this class instead of the inner optimizer. When restored, this class will + instead restore the slot variables of the inner optimizer. Since this class + has no variables, it does not affect the checkpoint when saved. + """ - def __init__(self, inner_optimizer, dynamic=True, initial_scale=None, - dynamic_growth_steps=None): - if not isinstance(inner_optimizer, optimizer_v2.OptimizerV2): - if isinstance(inner_optimizer, optimizer_experimental.Optimizer): - # Give better error message if the new experimental optimizer is passed. - raise TypeError( - f'You passed an instance of the new experimental optimizer, ' - f'`optimizer_experimental.Optimizer`, to LossScaleOptimizer, but ' - f'only the classic optimizers subclassing from ' - f'`tf.keras.optimizers.Optimizer` can be passed. Please use ' - f'`loss_scale_optimizer.LossScaleOptimizerV3` instead of ' - f'`tf.keras.mixed_precision.LossScaleOptimizer`, as the former ' - f'supports wrapping instances of the new experimental optimizer. ' - f'Got optimizer: {inner_optimizer}') - msg = ('"inner_optimizer" must be an instance of ' - '`tf.keras.optimizers.Optimizer`, but got: %s. ' % inner_optimizer) - if isinstance(inner_optimizer, legacy_optimizer.OptimizerV2): - msg += ('Please make sure "inner_optimizer" is not an instance of ' - '`tensorflow.python.keras.optimizers`, which is ' - 'the legacy keras code and will be removed in future release. ' - 'Please use the tf.keras public API instead.') - raise TypeError(msg) - if not isinstance(dynamic, bool): - # Catch errors if a user incorrectly passes a string or float to the - # second argument argument, as this was commonly done for the now-removed - # LossScaleOptimizerV1. - raise TypeError('"dynamic" argument to LossScaleOptimizer.__init__ must ' - 'be a bool, but got: %r' % (dynamic,)) - if isinstance(inner_optimizer, LossScaleOptimizer): - raise TypeError('LossScaleOptimizer cannot wrap another ' - 'LossScaleOptimizer, but got: %s' % (inner_optimizer,)) - _raise_if_strategy_unsupported() - if getattr(inner_optimizer, '_is_wrapped_by_loss_scale_optimizer', False): - # TODO(reedwm): Maybe support this. The difficulty is that LSO has the - # same checkpoint format as the inner optimizer, so multiple LSOs wrapping - # the same optimizer causes the checkpointing logic to become confused.
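A sketch of the compatibility property the docstring above describes: a checkpoint written through the wrapper has the inner optimizer's layout plus the loss scale, so it round-trips through `tf.train.Checkpoint` (the path and names here are illustrative):

```
import tensorflow as tf

inner = tf.keras.optimizers.legacy.SGD(momentum=0.5)
opt = tf.keras.mixed_precision.LossScaleOptimizer(inner)
var = tf.Variable(1.0)
opt.minimize(lambda: var ** 2, var_list=[var])  # creates slot variables

ckpt = tf.train.Checkpoint(optimizer=opt, var=var)
save_path = ckpt.save("/tmp/lso_ckpt_demo")
ckpt.restore(save_path)  # restores slots and the loss scale state
```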
- raise ValueError('"inner_optimizer" is already wrapped by a ' - 'LossScaleOptimizer. An optimizer can only be wrapped ' - 'by a single LossScaleOptimizer') - self._optimizer = inner_optimizer - self._optimizer._is_wrapped_by_loss_scale_optimizer = True - - # We don't call super().__init__, since we do not want to call OptimizerV2's - # constructor. - tf.__internal__.tracking.DelegatingTrackableMixin.__init__(self, - self._optimizer) - - if dynamic: - if initial_scale is None: - initial_scale = _DEFAULT_INITIAL_SCALE - if dynamic_growth_steps is None: - dynamic_growth_steps = _DEFAULT_GROWTH_STEPS - self._loss_scale = _DynamicLossScaleState( - initial_scale, dynamic_growth_steps, multiplier=2) - self._track_trackable(self._loss_scale, 'loss_scale') - else: - if initial_scale is None: - raise ValueError('"initial_scale" must be specified if "dynamic" is ' - 'False') - self._loss_scale = float(initial_scale) - if dynamic_growth_steps is not None: - raise ValueError('"dynamic_growth_steps" must be None if "dynamic" ' - 'is False, but got: %s' % (dynamic_growth_steps,)) - - # Used to track whether get_scaled_loss() and get_unscaled_gradients() have - # been called - self._loss_has_been_scaled = False - self._gradients_have_been_unscaled = False - - # To support restoring TensorFlow 2.2 checkpoints. - self._track_trackable(FakeOptimizerForRestoration(self._optimizer), - 'base_optimizer') - - @property - def dynamic(self): - return isinstance(self._loss_scale, _DynamicLossScaleState) - - @property - def loss_scale(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return tf.convert_to_tensor( - self._loss_scale.current_loss_scale) - else: - return tf.convert_to_tensor(self._loss_scale) + def __init__(self, optimizer): + self._optimizer = optimizer - @property - def dynamic_counter(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return self._loss_scale.counter - else: - return None + def get_slot_names(self): + return self._optimizer.get_slot_names() - @property - def initial_scale(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return self._loss_scale.initial_loss_scale - else: - return self._loss_scale + def _create_or_restore_slot_variable( + self, slot_variable_position, slot_name, variable + ): + return self._optimizer._create_or_restore_slot_variable( + slot_variable_position, slot_name, variable + ) - @property - def dynamic_growth_steps(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return self._loss_scale.growth_steps - else: - return None - - @property - def inner_optimizer(self): - return self._optimizer - - def get_scaled_loss(self, loss): - self._loss_has_been_scaled = True - if callable(loss): - def new_loss(): - loss_val = loss() - return loss_val * tf.cast(self.loss_scale, loss_val.dtype) - return new_loss - else: - return loss * tf.cast(self.loss_scale, loss.dtype) - def get_unscaled_gradients(self, grads): - self._gradients_have_been_unscaled = True - loss_scale_reciprocal = 1. / self.loss_scale - return [ - _multiply_gradient(g, loss_scale_reciprocal) if g is not None else None - for g in grads - ] +def _create_loss_scale_optimizer_from_v1_loss_scale(optimizer, loss_scale): + """Creates an LSO from a tf.compat.v1.mixed_precision.LossScale. 
- def _compute_gradients(self, loss, var_list, grad_loss=None, tape=None): - tape = tf.GradientTape() if tape is None else tape - with tape: - loss = self.get_scaled_loss(loss) - grads_and_vars = self._optimizer._compute_gradients( # pylint: disable=protected-access - loss, - var_list, - grad_loss, - tape=tape) - grads = [g for g, _ in grads_and_vars] - weights = [v for _, v in grads_and_vars] - unscaled_grads = self.get_unscaled_gradients(grads) - return list(zip(unscaled_grads, weights)) - - def get_gradients(self, loss, params): - loss = self.get_scaled_loss(loss) - grads = self._optimizer.get_gradients(loss, params) - return self.get_unscaled_gradients(grads) - - def _create_all_weights(self, var_list): - self._optimizer._create_all_weights(var_list) # pylint: disable=protected-access - - def apply_gradients(self, - grads_and_vars, - name=None, - experimental_aggregate_gradients=True): - if tf.distribute.in_cross_replica_context(): - raise ValueError('apply_gradients() must be called in a replica context.') - # We check for the strategy here despite already checking in the constructor - # as frequently the optimizer is created outside the strategy's scope. - _raise_if_strategy_unsupported() - _maybe_warn_about_scaling(self._loss_has_been_scaled, - self._gradients_have_been_unscaled) - - grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars) - if experimental_aggregate_gradients: - # We must aggregate the gradients here instead of in - # self.optimizer.apply_gradients, so that any NaN or Inf gradients are - # propagated to each replica. If any replica has a NaN or Inf gradient, - # they must all have a NaN or Inf gradient so that they all skip the step. - # pylint: disable=protected-access - grads_and_vars = self._optimizer._transform_unaggregated_gradients( - grads_and_vars) - grads_and_vars = self._optimizer._aggregate_gradients(grads_and_vars) - # pylint: enable=protected-access - - grads_and_vars = tuple(grads_and_vars) - grads = [g for g, _ in grads_and_vars] - # We do not want DistributionStrategy to unwrap any MirroredVariables in - # grads_and_vars, because even in a replica context, the wrapped - # optimizer expects mirrored variables. So we wrap the variables with an - # _UnwrapPreventer, preventing DistributionStrategy from unwrapping the - # MirroredVariables. - wrapped_vars = _UnwrapPreventer([v for _, v in grads_and_vars]) - - def do_not_apply_fn(): - # Normally self._optimizer.iterations is incremented in - # self._optimizer.apply_gradients(). Since that is not called in this - # branch, we increment it here instead. - return self._optimizer.iterations.assign_add(1, read_value=False) - - def _if_should_apply_grads(grads): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return self._loss_scale.update(grads) - else: - return (tf.no_op(), True) - - if tf.__internal__.distribute.strategy_supports_no_merge_call(): - loss_scale_update_op, should_apply_grads = _if_should_apply_grads(grads) - def apply_fn(): - return self._apply_gradients(grads, wrapped_vars, name) - - maybe_apply_op = tf.__internal__.smart_cond.smart_cond(should_apply_grads, apply_fn, - do_not_apply_fn) - return tf.group(maybe_apply_op, loss_scale_update_op) + This is only used to pass to + `tf.__internal__.mixed_precision.register_loss_scale_wrapper` below, which + is called so that + `tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite` can + wrap a Keras optimizer with a LossScaleOptimizer. - else: + Args: + optimizer: An OptimizerV2 instance. 
+ loss_scale: A `tf.compat.v1.mixed_precision.LossScale` instance - def _apply_gradients_cross_replica(distribution, grads, wrapped_vars, - name): - loss_scale_update_op, should_apply_grads = _if_should_apply_grads(grads) - - def apply_fn(): - return distribution.extended.call_for_each_replica( - self._apply_gradients, - args=(grads, wrapped_vars, name)) - - # Note: We must call this cond() in a cross-replica context. - # DistributionStrategy does not support having a cond in a replica - # context with a branch that calls `merge_call`, and - # self._optimizer.apply_gradients calls `merge_call`. - maybe_apply_op = tf.__internal__.smart_cond.smart_cond(should_apply_grads, apply_fn, - do_not_apply_fn) - return tf.group(maybe_apply_op, loss_scale_update_op) - return tf.distribute.get_replica_context().merge_call( - _apply_gradients_cross_replica, - args=(grads, wrapped_vars, name)) - - def _apply_gradients(self, grads, wrapped_vars, name): - # Pass experimental_aggregate_gradients=False since LossScaleOptimizer - # already aggregated the gradients. - # TODO(reedwm): This will raise a fairly cryptic error message if - # self._optimizer.apply_gradients does not take - # experimental_aggregate_gradients. - return self._optimizer.apply_gradients( - list(zip(grads, wrapped_vars.value)), - name=name, - experimental_aggregate_gradients=False) - - def get_config(self): - serialized_optimizer = optimizers.serialize(self._optimizer) - return { - 'inner_optimizer': serialized_optimizer, - 'dynamic': self.dynamic, - 'initial_scale': self.initial_scale, - 'dynamic_growth_steps': self.dynamic_growth_steps, - } - - @classmethod - def from_config(cls, config, custom_objects=None): - config = config.copy() # Make a copy, since we mutate config - if 'loss_scale' in config: - # If loss_scale is in config, we assume we are deserializing a - # LossScaleOptimizer from TF 2.3 or below. We convert the config so it - # can be deserialized in the current LossScaleOptimizer. - loss_scale = generic_utils.deserialize_keras_object( - config.pop('loss_scale'), - module_objects={ - 'FixedLossScale': tf.compat.v1.mixed_precision.FixedLossScale, - 'DynamicLossScale': tf.compat.v1.mixed_precision.DynamicLossScale, - }, - printable_module_name='loss scale') - - if isinstance(loss_scale, tf.compat.v1.mixed_precision.FixedLossScale): - config['dynamic'] = False - config['initial_scale'] = loss_scale._loss_scale_value # pylint: disable=protected-access - elif isinstance(loss_scale, - tf.compat.v1.mixed_precision.DynamicLossScale): - config['dynamic'] = True - config['initial_scale'] = loss_scale.initial_loss_scale - config['dynamic_growth_steps'] = loss_scale.increment_period + Returns: + A LossScaleOptimizer that wraps `optimizer` and uses the same loss scaling + algorithm as `loss_scale`. + """ + if isinstance(loss_scale, (int, float)): + return LossScaleOptimizer( + optimizer, dynamic=False, initial_scale=loss_scale + ) + elif isinstance(loss_scale, tf.compat.v1.mixed_precision.FixedLossScale): + ls_val = loss_scale._loss_scale_value + return LossScaleOptimizer( + optimizer, dynamic=False, initial_scale=ls_val + ) + elif loss_scale == "dynamic": + return LossScaleOptimizer(optimizer) + elif isinstance(loss_scale, tf.compat.v1.mixed_precision.DynamicLossScale): if loss_scale.multiplier != 2: - raise ValueError('Cannot deserialize LossScaleOptimizer with a ' - 'DynamicLossScale whose multiplier is not 2. 
Got ' - 'DynamicLossScale: %s' % (loss_scale,)) - else: - raise ValueError( - 'Serialized LossScaleOptimizers with a LossScale that is neither a ' - 'FixedLossScale nor a DynamicLossScale can no longer be ' - 'deserialized') - config['inner_optimizer'] = config.pop('optimizer') - inner_optimizer = optimizers.deserialize( - config['inner_optimizer'], custom_objects=custom_objects) - del config['inner_optimizer'] - return cls(inner_optimizer, **config) - - # Delegations: We delegate most OptimizerV2 methods to the wrapped optimizer - # below. - - @property - def iterations(self): - return self._optimizer.iterations - - @iterations.setter - def iterations(self, variable): - self._optimizer.iterations = variable - - def get_slot_names(self): - return self._optimizer.get_slot_names() - - def variables(self): - return self._optimizer.variables() - - @property - def weights(self): - return self._optimizer.weights - - def get_weights(self): - return self._optimizer.get_weights() - - def set_weights(self, weights): - return self._optimizer.set_weights(weights) - - @property - def clipnorm(self): - return self._optimizer.clipnorm - - @clipnorm.setter - def clipnorm(self, val): - self._optimizer.clipnorm = val - - @property - def global_clipnorm(self): - return self._optimizer.global_clipnorm - - @global_clipnorm.setter - def global_clipnorm(self, val): - self._optimizer.global_clipnorm = val - - @property - def clipvalue(self): - return self._optimizer.clipvalue - - @clipvalue.setter - def clipvalue(self, val): - self._optimizer.clipvalue = val - - def _aggregate_gradients(self, grads_and_vars): - return self._optimizer._aggregate_gradients(grads_and_vars) # pylint: disable=protected-access - - def _restore_slot_variable(self, slot_name, variable, slot_variable): - return self._optimizer._restore_slot_variable(slot_name, variable, # pylint: disable=protected-access - slot_variable) - - def _create_or_restore_slot_variable(self, slot_variable_position, slot_name, - variable): - return self._optimizer._create_or_restore_slot_variable( # pylint: disable=protected-access - slot_variable_position, slot_name, variable) - - def get_slot(self, var, slot_name): - return self._optimizer.get_slot(var, slot_name) - - def add_slot(self, var, slot_name, initializer='zeros'): - return self._optimizer.add_slot(var, slot_name, initializer) - - def __getattribute__(self, name): - try: - return object.__getattribute__(self, name) - except AttributeError as e: - if name == '_optimizer' or name == '_hyper': - # Avoid infinite recursion - raise e - - # Delegate hyperparameter accesses to inner optimizer. - if name == 'lr': - name = 'learning_rate' - if name in self._optimizer._hyper: - return self._optimizer._get_hyper(name) - raise e - - def __dir__(self): - result = set(super().__dir__()) - if '_optimizer' in result: - result |= self._optimizer._hyper.keys() - if 'learning_rate' in self._optimizer._hyper.keys(): - result.add('lr') - return list(result) - - def __setattr__(self, name, value): - if name == 'lr': - name = 'learning_rate' - # Delegate setting hyperparameter to inner optimizer if the attribute does - # not exist on the LossScaleOptimizer - try: - # We cannot check for the 'iterations' attribute as it cannot be set after - # it is accessed. 
- if name != 'iterations': - object.__getattribute__(self, name) - has_attribute = True - except AttributeError: - has_attribute = False - if (name != '_optimizer' and name in self._optimizer._hyper - and not has_attribute): - self._optimizer._set_hyper(name, value) - else: - super().__setattr__(name, value) - - # Explicitly delegate learning_rate. Normally hyperparameters are delegated in - # __getattribute__, but if a hyperparameter is not in self._optimizer._hyper - # (e.g. because self._optimizer itself wraps another optimizer), then it won't - # be delegated. Since learning_rate is a very commonly accessed - # hyperparameter, we delegate it here. - @property - def learning_rate(self): - return self._optimizer.learning_rate - - @learning_rate.setter - def learning_rate(self, value): - self._optimizer.learning_rate = value - - @property - def lr(self): - return self._optimizer.learning_rate - - @lr.setter - def lr(self, value): - self._optimizer.lr = value - - # We do not override some OptimizerV2 methods. For each, we describe why we do - # not delegate them to self._optimizer: - # * get_updates: get_updates() calls get_gradients(). Since we override - # get_gradients(), we cannot delegate get_updates() to self._optimizer, - # otherwise the overridden get_gradients() method would not be called. - # Luckily, get_updates() does not access any OptimizerV2 fields, so - # inheriting the OptimizerV2 version works fine. - # * minimize: We don't delegate for a similar as get_updates(): it calls - # both self._compute_gradients() and self.apply_gradients(), and both need - # to have the LossScaleOptimizer version called. - - # TODO(reedwm): Maybe throw an error if mixed precision is used without this - # optimizer being used. - - -class LossScaleOptimizerV3(tf.__internal__.tracking.DelegatingTrackableMixin, - optimizer_experimental.Optimizer, - BaseLossScaleOptimizer): - """An optimizer that applies loss scaling to prevent numeric underflow. - - This is a copy of the `mixed_precision.LossScaleOptimizer` class - defined above, except it subclasses and wraps the new experimental Optimizer - class instead of the `tf.keras.optimizers.Optimizer` class. Some of the - methods this class defines and calls are different compared to - LossScaleOptimizer due to the differences between the two Optimizer base - classes. Additionally, this class does not support the legacy graph mode, but - LossScaleOptimizer does. - - Since the new experimental Optimizer does not have a hyperparameter concept, - LossScaleOptimizerV3 does not delegate arbitrary hyperparameter accesses to - the inner optimizer, unlike LossScaleOptimizer. LossScaleOptimizerV3 does - delegate the "learning_rate" attribute, however. - """ - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self, inner_optimizer, dynamic=True, initial_scale=None, - dynamic_growth_steps=None): - if not isinstance(inner_optimizer, optimizer_experimental.Optimizer): - if isinstance(inner_optimizer, optimizer_v2.OptimizerV2): - # Give better error message if the OptimizerV2 class is passed instead - # of the new experimental optimizer. + raise ValueError( + 'When passing a DynamicLossScale to "loss_scale", ' + "DynamicLossScale.multiplier must be 2. 
Got: " + f"{loss_scale}" + ) + return LossScaleOptimizer( + optimizer, + initial_scale=loss_scale.initial_loss_scale, + dynamic_growth_steps=loss_scale.increment_period, + ) + elif isinstance(loss_scale, tf.compat.v1.mixed_precision.LossScale): raise TypeError( - f'You passed a `tf.keras.optimizer.Optimizer` instance to ' - f'LossScaleOptimizerV3, but only the new experimental optimizer ' - f'defined in keras/optimizer_expeirmental/optimizer.py can be ' - f'passed. Please use `tf.keras.mixed_precision.LossScaleOptimizer` ' - f'instead of LossScaleOptimizerV3, as the former supports ' - f'`tf.keras.optimizer.Optimizer`s. Got optimizer: ' - f'{inner_optimizer}') - raise TypeError(f'"inner_optimizer" must be an instance of ' - f'Optimizer, but got: {inner_optimizer}.') - if not isinstance(dynamic, bool): - # Catch errors if a user incorrectly passes a string or float to the - # second argument argument, as this was commonly done for the now-removed - # LossScaleOptimizerV1. - raise TypeError(f'"dynamic" argument to LossScaleOptimizer.__init__ must ' - f'be a bool, but got: {repr(dynamic)}') - if isinstance(inner_optimizer, LossScaleOptimizerV3): - raise TypeError(f'LossScaleOptimizer cannot wrap another ' - f'LossScaleOptimizer, but got: {inner_optimizer}') - _raise_if_strategy_unsupported() - if getattr(inner_optimizer, '_is_wrapped_by_loss_scale_optimizer', False): - # TODO(reedwm): Maybe support this. The difficulty is that LSO has the - # same checkpoint format as the inner optimizer, so multiple LSOs wrapping - # the same optimizer causes the checkpointing logic to become confused. - raise ValueError('"inner_optimizer" is already wrapped by a ' - 'LossScaleOptimizer. An optimizer can only be wrapped ' - 'by a single LossScaleOptimizer') - self._optimizer = inner_optimizer - self._optimizer._is_wrapped_by_loss_scale_optimizer = True - - # We don't call super().__init__, since we do not want to call Optimizer's - # constructor. 
- tf.__internal__.tracking.DelegatingTrackableMixin.__init__(self, - self._optimizer) - - if dynamic: - if initial_scale is None: - initial_scale = _DEFAULT_INITIAL_SCALE - if dynamic_growth_steps is None: - dynamic_growth_steps = _DEFAULT_GROWTH_STEPS - self._loss_scale = _DynamicLossScaleState( - initial_scale, dynamic_growth_steps, multiplier=2) - self._track_trackable(self._loss_scale, 'loss_scale') - else: - if initial_scale is None: - raise ValueError('"initial_scale" must be specified if "dynamic" is ' - 'False') - self._loss_scale = float(initial_scale) - if dynamic_growth_steps is not None: - raise ValueError(f'"dynamic_growth_steps" must be None if "dynamic" ' - f'is False, but got: {dynamic_growth_steps}') - - # Used to track whether get_scaled_loss() and get_unscaled_gradients() have - # been called - self._loss_has_been_scaled = False - self._gradients_have_been_unscaled = False - - @property - def dynamic(self): - return isinstance(self._loss_scale, _DynamicLossScaleState) - - @property - def loss_scale(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return tf.convert_to_tensor( - self._loss_scale.current_loss_scale) - else: - return tf.convert_to_tensor(self._loss_scale) - - @property - def dynamic_counter(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return self._loss_scale.counter - else: - return None - - @property - def initial_scale(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return self._loss_scale.initial_loss_scale - else: - return self._loss_scale - - @property - def dynamic_growth_steps(self): - if isinstance(self._loss_scale, _DynamicLossScaleState): - return self._loss_scale.growth_steps - else: - return None - - @property - def inner_optimizer(self): - return self._optimizer - - def get_scaled_loss(self, loss): - self._loss_has_been_scaled = True - if callable(loss): - def new_loss(): - loss_val = loss() - return loss_val * tf.cast(self.loss_scale, loss_val.dtype) - return new_loss - else: - return loss * tf.cast(self.loss_scale, loss.dtype) - - def get_unscaled_gradients(self, grads): - self._gradients_have_been_unscaled = True - loss_scale_reciprocal = 1. / self.loss_scale - return [ - _multiply_gradient(g, loss_scale_reciprocal) if g is not None else None - for g in grads - ] - - def compute_gradients(self, loss, var_list, tape=None): - tape = tf.GradientTape() if tape is None else tape - with tape: - loss = self.get_scaled_loss(loss) - grads_and_vars = self._optimizer.compute_gradients( # pylint: disable=protected-access - loss, - var_list, - tape=tape) - grads = [g for g, _ in grads_and_vars] - weights = [v for _, v in grads_and_vars] - unscaled_grads = self.get_unscaled_gradients(grads) - return list(zip(unscaled_grads, weights)) - - def apply_gradients(self, - grads_and_vars, - skip_gradients_aggregation=False): - if tf.distribute.in_cross_replica_context(): - raise ValueError('apply_gradients() must be called in a replica context.') - # We check for the strategy here despite already checking in the constructor - # as frequently the optimizer is created outside the strategy's scope. - _raise_if_strategy_unsupported() - _maybe_warn_about_scaling(self._loss_has_been_scaled, - self._gradients_have_been_unscaled) - - grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars) - if not skip_gradients_aggregation: - # We must aggregate the gradients here instead of in - # self.optimizer.apply_gradients, so that any NaN or Inf gradients are - # propagated to each replica. 
If any replica has a NaN or Inf gradient, - # they must all have a NaN or Inf gradient so that they all skip the step. - # pylint: disable=protected-access - grads_and_vars = self._optimizer.aggregate_gradients(grads_and_vars) - # pylint: enable=protected-access - - grads_and_vars = tuple(grads_and_vars) - grads = [g for g, _ in grads_and_vars] - # We do not want DistributionStrategy to unwrap any MirroredVariables in - # grads_and_vars, because even in a replica context, the wrapped - # optimizer expects mirrored variables. So we wrap the variables with an - # _UnwrapPreventer, preventing DistributionStrategy from unwrapping the - # MirroredVariables. - wrapped_vars = _UnwrapPreventer([v for _, v in grads_and_vars]) - - def do_not_apply_fn(): - # Normally self._optimizer.iterations is incremented in - # self._optimizer.apply_gradients(). Since that is not called in this - # branch, we increment it here instead. - self._optimizer.iterations.assign_add(1, read_value=False) - - def _if_should_apply_grads(grads): - if isinstance(self._loss_scale, _DynamicLossScaleState): - _, should_apply_grad = self._loss_scale.update(grads) - return should_apply_grad - else: - return True - - if tf.__internal__.distribute.strategy_supports_no_merge_call(): - should_apply_grads = _if_should_apply_grads(grads) - def apply_fn(): - return self._apply_gradients(grads, wrapped_vars) - tf.__internal__.smart_cond.smart_cond(should_apply_grads, apply_fn, - do_not_apply_fn) + "Passing a LossScale that is not a FixedLossScale or a " + f"DynamicLossScale is not supported. Got: {loss_scale}" + ) else: - - def _apply_gradients_cross_replica(distribution, grads, wrapped_vars): - should_apply_grads = _if_should_apply_grads(grads) - - def apply_fn(): - distribution.extended.call_for_each_replica( - self._apply_gradients, - args=(grads, wrapped_vars)) - - # Note: We must call this cond() in a cross-replica context. - # DistributionStrategy does not support having a cond in a replica - # context with a branch that calls `merge_call`, and - # self._optimizer.apply_gradients calls `merge_call`. - tf.__internal__.smart_cond.smart_cond(should_apply_grads, apply_fn, - do_not_apply_fn) - tf.distribute.get_replica_context().merge_call( - _apply_gradients_cross_replica, - args=(grads, wrapped_vars)) - - def _apply_gradients(self, grads, wrapped_vars): - # Pass skip_gradients_aggregation=True since LossScaleOptimizer - # already aggregated the gradients. 
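The scale-then-unscale contract that `apply_gradients` assumes is easiest to see in a custom training step; a minimal eager sketch using only the public API:

```python
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.SGD(1.0))
var = tf.Variable(5.0)

with tf.GradientTape() as tape:
    loss = var * 2.0
    scaled_loss = opt.get_scaled_loss(loss)       # loss * loss_scale
scaled_grads = tape.gradient(scaled_loss, [var])
grads = opt.get_unscaled_gradients(scaled_grads)  # gradient / loss_scale
opt.apply_gradients(zip(grads, [var]))            # also updates the dynamic scale
print(var.numpy())  # 3.0: the gradient of 2*var is 2, lr is 1, so 5 - 2
```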
- self._optimizer.apply_gradients( - list(zip(grads, wrapped_vars.value)), - skip_gradients_aggregation=True) - - def get_config(self): - serialized_optimizer = optimizers.serialize(self._optimizer) - return { - 'inner_optimizer': serialized_optimizer, - 'dynamic': self.dynamic, - 'initial_scale': self.initial_scale, - 'dynamic_growth_steps': self.dynamic_growth_steps, - } - - @classmethod - def from_config(cls, config, custom_objects=None): - config = config.copy() # Make a copy, since we mutate config - inner_optimizer = optimizers.deserialize( - config['inner_optimizer'], custom_objects=custom_objects) - del config['inner_optimizer'] - return cls(inner_optimizer, **config) - - @property - def iterations(self): - return self._optimizer.iterations - - @iterations.setter - def iterations(self, variable): - self._optimizer.iterations = variable - - @property - def learning_rate(self): - return self._optimizer.learning_rate - - @learning_rate.setter - def learning_rate(self, learning_rate): - self._optimizer.learning_rate = learning_rate - - -class FakeOptimizerForRestoration(tf.__internal__.tracking.Trackable): - """A fake optimizer used to support restoring TensorFlow 2.2 checkpoints. - - The checkpoint format for LossScaleOptimizers changed after TF 2.2. This class - exists to support restoring TF 2.2 checkpoints in newer version of TensorFlow. - - In TF 2.2, LossScaleOptimizer would track the wrapped optimizer by calling the - following in LossScaleOptimizer.__init__ - - ``` - self._track_trackable(self._optimizer, 'base_optimizer') - ``` - - This means a dependency from the LossScaleOptimizer to the wrapped optimizer - would be stored in the checkpoint. However now, the checkpoint format with a - LossScaleOptimizer is the same as the format without a LossScaleOptimizer, - except the loss scale is also stored. This means there is no dependency from - the LossScaleOptimizer to the wrapped optimizer. Instead, the - LossScaleOptimizer acts as if it is the wrapped optimizer, from a checkpoint's - perspective, by overriding all Trackable methods and delegating them to the - wrapped optimizer. - - To allow restoring TF 2.2. checkpoints, LossScaleOptimizer adds a dependency - on this class instead of the inner optimizer. When restored, this class will - instead restore the slot variables of the inner optimizer. Since this class - has no variables, it does not affect the checkpoint when saved. - """ - - def __init__(self, optimizer): - self._optimizer = optimizer - - def get_slot_names(self): - return self._optimizer.get_slot_names() - - def _create_or_restore_slot_variable(self, slot_variable_position, slot_name, - variable): - return self._optimizer._create_or_restore_slot_variable( # pylint: disable=protected-access - slot_variable_position, slot_name, variable) - - -def _create_loss_scale_optimizer_from_v1_loss_scale(optimizer, loss_scale): - """Creates an LSO from a tf.compat.v1.mixed_precision.LossScale. - - This is only used to pass to - `tf.__internal__.mixed_precision.register_loss_scale_wrapper` below, which is - called so that - `tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite` can - wrap a Keras optimizer with a LossScaleOptimizer. - - Args: - optimizer: An OptimizerV2 instance. - loss_scale: A `tf.compat.v1.mixed_precision.LossScale` instance - - Returns: - A LossScaleOptimizer that wraps `optimizer` and uses the same loss scaling - algorithm as `loss_scale`. 
- """ - if isinstance(loss_scale, (int, float)): - return LossScaleOptimizer(optimizer, dynamic=False, - initial_scale=loss_scale) - elif isinstance(loss_scale, tf.compat.v1.mixed_precision.FixedLossScale): - ls_val = loss_scale._loss_scale_value # pylint: disable=protected-access - return LossScaleOptimizer(optimizer, dynamic=False, - initial_scale=ls_val) - elif loss_scale == 'dynamic': - return LossScaleOptimizer(optimizer) - elif isinstance(loss_scale, tf.compat.v1.mixed_precision.DynamicLossScale): - if loss_scale.multiplier != 2: - raise ValueError(f'When passing a DynamicLossScale to "loss_scale", ' - f'DynamicLossScale.multiplier must be 2. Got: ' - f'{loss_scale}') - return LossScaleOptimizer( - optimizer, initial_scale=loss_scale.initial_loss_scale, - dynamic_growth_steps=loss_scale.increment_period) - elif isinstance(loss_scale, tf.compat.v1.mixed_precision.LossScale): - raise TypeError(f'Passing a LossScale that is not a FixedLossScale or a ' - f'DynamicLossScale is not supported. Got: {loss_scale}') - else: - raise ValueError(f'Invalid value passed to loss_scale. loss_scale ' - f'must be the string "dynamic" (recommended), an int, ' - f'a float, a FixedLossScale, or a DynamicLossScale. Got ' - f'value: {loss_scale}') + raise ValueError( + "Invalid value passed to loss_scale. loss_scale " + 'must be the string "dynamic" (recommended), an int, ' + "a float, a FixedLossScale, or a DynamicLossScale. Got " + f"value: {loss_scale}" + ) tf.__internal__.mixed_precision.register_loss_scale_wrapper( - optimizer_v2.OptimizerV2, _create_loss_scale_optimizer_from_v1_loss_scale, - LossScaleOptimizer) + optimizer_v2.OptimizerV2, + _create_loss_scale_optimizer_from_v1_loss_scale, + LossScaleOptimizer, +) def _multiply_gradient(gradient, scale): - """Multiply a (possibly sparse) gradient by the given scale factor.""" - scale = tf.cast(scale, gradient.dtype) - if isinstance(gradient, tf.IndexedSlices): - return tf.IndexedSlices( - gradient.values * scale, - gradient.indices, - dense_shape=gradient.dense_shape) - else: - return gradient * scale + """Multiply a (possibly sparse) gradient by the given scale factor.""" + scale = tf.cast(scale, gradient.dtype) + if isinstance(gradient, tf.IndexedSlices): + return tf.IndexedSlices( + gradient.values * scale, + gradient.indices, + dense_shape=gradient.dense_shape, + ) + else: + return gradient * scale def strategy_supports_loss_scaling(): - """Returns True if the current Strategy supports loss scaling.""" - if not tf.distribute.has_strategy(): - return True - strategy = tf.distribute.get_strategy() - # Strategies are supported if either there is only one replica or if variables - # are replicated per device. Otherwise, the current model.fit() implementation - # and most custom training loops incorrectly unscale the gradients. Currently, - # gradients are unscaled once per compute replica, but they should be unscaled - # once per variable replica. When there is one variable replica for each - # compute replica, this works fine, but otherwise issues will occur. - # TODO(reedwm): Support all strategies. 
- return isinstance(strategy, ( - tf.distribute.MultiWorkerMirroredStrategy, - tf.compat.v1.distribute.experimental.MultiWorkerMirroredStrategy, - tf.distribute.OneDeviceStrategy, - tf.compat.v1.distribute.OneDeviceStrategy, - tf.distribute.MirroredStrategy, - tf.compat.v1.distribute.MirroredStrategy, - )) + """Returns True if the current Strategy supports loss scaling.""" + if not tf.distribute.has_strategy(): + return True + strategy = tf.distribute.get_strategy() + # Strategies are supported if either there is only one replica or if + # variables are replicated per device. Otherwise, the current model.fit() + # implementation and most custom training loops incorrectly unscale the + # gradients. Currently, gradients are unscaled once per compute replica, but + # they should be unscaled once per variable replica. When there is one + # variable replica for each compute replica, this works fine, but otherwise + # issues will occur. + # TODO(reedwm): Support all strategies. + return ( + isinstance( + strategy, + ( + tf.distribute.MultiWorkerMirroredStrategy, + tf.compat.v1.distribute.experimental.MultiWorkerMirroredStrategy, # noqa: E501 + tf.distribute.OneDeviceStrategy, + tf.compat.v1.distribute.OneDeviceStrategy, + tf.distribute.MirroredStrategy, + tf.compat.v1.distribute.MirroredStrategy, + ), + ) + or dtensor_utils.running_with_dtensor_strategy() + ) def _raise_if_strategy_unsupported(): - """Raise an exception if the current strategy doesn't support loss scaling.""" - if not strategy_supports_loss_scaling(): - strategy = tf.distribute.get_strategy() - if isinstance(strategy, - (tf.distribute.experimental.TPUStrategy, - tf.compat.v1.distribute.experimental.TPUStrategy, - tf.distribute.TPUStrategy)): - raise ValueError( - 'Loss scaling is not supported with TPUStrategy. Loss scaling is ' - 'unnecessary with TPUs, since they support bfloat16 instead of ' - 'float16 and bfloat16 does not require loss scaling. You should ' - 'remove the use of the LossScaleOptimizer when TPUs are used.') - else: - raise ValueError(f'Loss scaling is not supported with the ' - f'tf.distribute.Strategy: ' - f'{strategy.__class__.__name__}. Try using a different ' - f'Strategy, e.g. a MirroredStrategy') + """Raise an exception if the current strategy doesn't support loss + scaling.""" + if not strategy_supports_loss_scaling(): + strategy = tf.distribute.get_strategy() + if isinstance( + strategy, + ( + tf.distribute.experimental.TPUStrategy, + tf.compat.v1.distribute.experimental.TPUStrategy, + tf.distribute.TPUStrategy, + ), + ): + raise ValueError( + "Loss scaling is not supported with TPUStrategy. Loss scaling " + "is unnecessary with TPUs, since they support bfloat16 instead " + "of float16 and bfloat16 does not require loss scaling. You " + "should remove the use of the LossScaleOptimizer when TPUs are " + "used." + ) + else: + raise ValueError( + "Loss scaling is not supported with the " + "tf.distribute.Strategy: " + f"{strategy.__class__.__name__}. Try using a different " + "Strategy, e.g. 
a MirroredStrategy" + ) diff --git a/keras/mixed_precision/loss_scale_optimizer_test.py b/keras/mixed_precision/loss_scale_optimizer_test.py index fd495d51ee3d..e7c2885bca79 100644 --- a/keras/mixed_precision/loss_scale_optimizer_test.py +++ b/keras/mixed_precision/loss_scale_optimizer_test.py @@ -17,23 +17,27 @@ import os from unittest import mock +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized from keras import optimizers from keras.mixed_precision import loss_scale_optimizer from keras.mixed_precision import test_util as mp_test_util -from keras.optimizers.optimizer_experimental import optimizer as optimizer_experimental -from keras.optimizers.optimizer_experimental import sgd as sgd_experimental -from keras.optimizers.optimizer_v2 import adam -from keras.optimizers.optimizer_v2 import gradient_descent -from keras.optimizers.optimizer_v2 import optimizer_v2 +from keras.optimizers import adam as adam_experimental +from keras.optimizers import optimizer as optimizer_experimental +from keras.optimizers import sgd as sgd_experimental +from keras.optimizers.legacy import adam +from keras.optimizers.legacy import gradient_descent +from keras.optimizers.legacy import optimizer_v2 +from keras.optimizers.schedules import learning_rate_schedule from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.python.framework import test_util as tf_test_utils -from tensorflow.python.keras.optimizer_v2 import gradient_descent as legacy_sgd +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) from tensorflow.python.platform import tf_logging # If called outside any strategy.scope() calls, this will return the default @@ -42,1142 +46,1340 @@ def create_mirrored_strategy(): - if tf.config.list_logical_devices('GPU'): - return tf.distribute.MirroredStrategy(['cpu:0', 'gpu:0']) - else: - return tf.distribute.MirroredStrategy(['cpu:0']) + if tf.config.list_logical_devices("GPU"): + return tf.distribute.MirroredStrategy(["cpu:0", "gpu:0"]) + else: + return tf.distribute.MirroredStrategy(["cpu:0"]) STRATEGY_FNS = [default_strategy_fn, create_mirrored_strategy] def create_sgd(base_optimizer_cls, *args, **kwargs): - """Creates an SGD optimizer. - - Will return either the new experimental SGD optimizer subclassing from - `optimizer_experimental.Optimizer` or the old SGD optimizer subclassing from - `optimizer_v2.OptimizerV2`, depending on `base_optimizer_cls`. - - Args: - base_optimizer_cls: What the superclass of the returned SGD optimizer will - be. Either `optimizer_experimental.Optimizer` or - `optimizer_v2.OptimizerV2`. - *args: Arguments to pass to the SGD constructor - **kwargs: Keyword arguments to pass to the SGD constructor. - - Returns: - An SGD optimizer. - """ - if base_optimizer_cls == optimizer_v2.OptimizerV2: - return gradient_descent.SGD(*args, **kwargs) - else: - assert base_optimizer_cls == optimizer_experimental.Optimizer, ( - f'Got invalid base_optimizer_cls: {base_optimizer_cls}') - return sgd_experimental.SGD(*args, **kwargs) + """Creates an SGD optimizer. + + Will return either the new experimental SGD optimizer subclassing from + `optimizer_experimental.Optimizer` or the old SGD optimizer subclassing from + `optimizer_v2.OptimizerV2`, depending on `base_optimizer_cls`. + + Args: + base_optimizer_cls: What the superclass of the returned SGD optimizer will + be. 
Either `optimizer_experimental.Optimizer` or + `optimizer_v2.OptimizerV2`. + *args: Arguments to pass to the SGD constructor + **kwargs: Keyword arguments to pass to the SGD constructor. + + Returns: + An SGD optimizer. + """ + if base_optimizer_cls == optimizer_v2.OptimizerV2: + return gradient_descent.SGD(*args, **kwargs) + else: + assert ( + base_optimizer_cls == optimizer_experimental.Optimizer + ), f"Got invalid base_optimizer_cls: {base_optimizer_cls}" + return sgd_experimental.SGD(*args, **kwargs) # TODO(b/215568552): Remove this as the delegation is handled by metaclass. -def create_lso(inner_optimizer, - dynamic=True, - initial_scale=None, - dynamic_growth_steps=None): - """Creates a LossScaleOptimizer. - - Creates either the new LossScaleOptimizerV3 subclassing from - `optimizer_experimental.Optimizer` or the old LossScaleOptimizer subclassing - from `optimizer_v2.OptimizerV2`, depending on the type of `inner_optimizer`. - - Args: - inner_optimizer: The optimizer to wrap. Either an - `optimizer_experimental.Optimizer` or an `optimizer_v2.OptimizerV2`. - dynamic: Whether dynamic loss scaling is used. - initial_scale: The initial loss scale. - dynamic_growth_steps: How frequently to increase the dynamic loss scale. - - Returns: - Returns a LossScaleOptimizerV3 or a LossScaleOptimizer, depending on the - type of `inner_optimizer`. - """ - return loss_scale_optimizer.BaseLossScaleOptimizer( - inner_optimizer, - dynamic=dynamic, - initial_scale=initial_scale, - dynamic_growth_steps=dynamic_growth_steps) +def create_lso( + inner_optimizer, dynamic=True, initial_scale=None, dynamic_growth_steps=None +): + """Creates a LossScaleOptimizer. + + Creates either the new LossScaleOptimizerV3 subclassing from + `optimizer_experimental.Optimizer` or the old LossScaleOptimizer subclassing + from `optimizer_v2.OptimizerV2`, depending on the type of `inner_optimizer`. + + Args: + inner_optimizer: The optimizer to wrap. Either an + `optimizer_experimental.Optimizer` or an `optimizer_v2.OptimizerV2`. + dynamic: Whether dynamic loss scaling is used. + initial_scale: The initial loss scale. + dynamic_growth_steps: How frequently to increase the dynamic loss scale. + + Returns: + Returns a LossScaleOptimizerV3 or a LossScaleOptimizer, depending on the + type of `inner_optimizer`. + """ + return loss_scale_optimizer.BaseLossScaleOptimizer( + inner_optimizer, + dynamic=dynamic, + initial_scale=initial_scale, + dynamic_growth_steps=dynamic_growth_steps, + ) def opt_and_strategy_and_mode_combinations(): - """Returns combinations for running with multiple optimizers and strategies. - - Returns: - Combinations that run with both OptimizerV2 and the experimental optimizer; - and with the default strategy and mirrored strategy; and in both graph and - eager mode. - """ - # For the experimental optimizer, don't use graph mode directly since it's - # unsupported. Instead, run both without and with a tf.function, in order to - # test both graph and eager mode. - experimental_opt_combinations = test_combinations.combine( - opt_cls=optimizer_experimental.Optimizer, - strategy_fn=STRATEGY_FNS, - mode='eager', - use_tf_function=[False, True]) - orig_opt_combinations = test_combinations.combine( - opt_cls=optimizer_v2.OptimizerV2, - strategy_fn=STRATEGY_FNS, - mode=['graph', 'eager'], - use_tf_function=False) - return experimental_opt_combinations + orig_opt_combinations + """Returns combinations for running with multiple optimizers and strategies. 
+ + Returns: + Combinations that run with both OptimizerV2 and the experimental + optimizer; and with the default strategy and mirrored strategy; and in + both graph and eager mode. + """ + # For the experimental optimizer, don't use graph mode directly since it's + # unsupported. Instead, run both without and with a tf.function, in order to + # test both graph and eager mode. + experimental_opt_combinations = test_combinations.combine( + opt_cls=optimizer_experimental.Optimizer, + strategy_fn=STRATEGY_FNS, + mode="eager", + use_tf_function=[False, True], + ) + orig_opt_combinations = test_combinations.combine( + opt_cls=optimizer_v2.OptimizerV2, + strategy_fn=STRATEGY_FNS, + mode=["graph", "eager"], + use_tf_function=False, + ) + return experimental_opt_combinations + orig_opt_combinations def opt_combinations_only(): - """Returns two combinations for running with the two base optimizers.""" - experimental_opt_combinations = test_combinations.combine( - mode='eager', opt_cls=optimizer_experimental.Optimizer) - orig_opt_combination = test_combinations.combine( - opt_cls=optimizer_v2.OptimizerV2) - return experimental_opt_combinations + orig_opt_combination + """Returns two combinations for running with the two base optimizers.""" + experimental_opt_combinations = test_combinations.combine( + mode="eager", opt_cls=optimizer_experimental.Optimizer + ) + orig_opt_combination = test_combinations.combine( + opt_cls=optimizer_v2.OptimizerV2 + ) + return experimental_opt_combinations + orig_opt_combination @tf_test_utils.with_control_flow_v2 class LossScaleOptimizerTest(tf.test.TestCase, parameterized.TestCase): + def _run_if_in_graph_mode(self, val): + # Running only in graph mode is useful, because optimizers sometimes + # return a value that, in Graph mode, is runnable with self.evaluate. + # But in Eager mode, the optimizer already does the computations and the + # return value cannot be run. + if not tf.executing_eagerly(): + self.evaluate(val) + + def _eval_if_tensor(self, val): + # Calls self.evaluate on val if val is a Tensor or Variable. This is + # useful, since hyperparameters are tf.Variables on OptimizerV2 and are + # Python floats on the experimental optimizer. 
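Outside a TestCase, the normalization that `_eval_if_tensor` performs (its body continues just below) can be written directly; `eval_if_tensor` here is a hypothetical eager-mode stand-in for the helper, not part of the test suite:

```python
import tensorflow as tf

def eval_if_tensor(val):
    # OptimizerV2 stores hyperparameters as tf.Variables, while the
    # experimental optimizer mostly stores plain Python floats; normalize
    # both to a Python float for comparisons.
    if isinstance(val, (tf.Tensor, tf.Variable)):
        return float(val.numpy())
    return val

print(eval_if_tensor(tf.Variable(0.5)))  # 0.5
print(eval_if_tensor(0.5))               # 0.5
```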
+ return ( + self.evaluate(val) + if isinstance(val, (tf.Tensor, tf.Variable)) + else val + ) + + def _run_fn_with_grad_check(self, strategy, var, opt, expected_grad): + grad_check_fn = mp_test_util.create_identity_with_grad_check_fn( + expected_grad + ) + loss = lambda: grad_check_fn(var) / strategy.num_replicas_in_sync + return lambda: opt.minimize(loss, var_list=[var]) + + def testIsInstance(self): + optimizer = create_lso(sgd_experimental.SGD()) + self.assertIsInstance( + optimizer, loss_scale_optimizer.BaseLossScaleOptimizer + ) + + optimizer = create_lso(gradient_descent.SGD()) + self.assertIsInstance( + optimizer, loss_scale_optimizer.BaseLossScaleOptimizer + ) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testFixedLossScaleAppliedToLossWithMinimize( + self, opt_cls, strategy_fn, use_tf_function + ): + with strategy_fn().scope() as strategy: + var = tf.Variable([5.0]) + opt = create_sgd(opt_cls, 2.0) + loss_scale = 10.0 + opt = create_lso(opt, dynamic=False, initial_scale=loss_scale) + self.assertEqual(self.evaluate(opt.loss_scale), loss_scale) + self.assertIsInstance(opt.loss_scale, tf.Tensor) + # We need num_replicas_in_sync to divide loss_scale, otherwise + # loss_scale / strategy.num_replicas_in_sync will not be exact, + # which could lead to assertion failures due to rounding issues. + self.assertEqual(loss_scale % strategy.num_replicas_in_sync, 0) + run_fn = self._run_fn_with_grad_check( + strategy, var, opt, loss_scale / strategy.num_replicas_in_sync + ) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # The loss is the identity of the variable. Therefore the gradient + # is 1, and so the variable will be init_val - grad * lr == 5 - 1 * + # 2 == 3 + self.assertAllClose([3.0], self.evaluate(var)) + + def testFixedLossScaleAppliedToLossWithGetGradients(self): + with tf.Graph().as_default(): + var = tf.Variable([2.0]) + opt = gradient_descent.SGD(1.0) + loss_scale = 10.0 + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, dynamic=False, initial_scale=loss_scale + ) + grad_check_fn = mp_test_util.create_identity_with_grad_check_fn( + loss_scale + ) + loss = grad_check_fn(var) + run_op = opt.get_gradients(loss, [var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # This will cause an assertion to run, as + # mp_test_util.create_identity_with_grad_check_fn added an assertion + # op. 
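Several of the tests that follow check that unscaling preserves gradient sparsity; standalone, the same check looks like this, relying on the `tf.IndexedSlices` branch of `_multiply_gradient` shown earlier in the diff:

```python
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(), dynamic=False, initial_scale=2.0
)
sparse = tf.IndexedSlices(
    values=tf.constant([[4.0, 2.0], [8.0, 5.0]]),
    indices=tf.constant([1, 3]),
    dense_shape=tf.constant([5, 2]),
)
(unscaled,) = opt.get_unscaled_gradients([sparse])
assert isinstance(unscaled, tf.IndexedSlices)  # sparsity is preserved
print(unscaled.values.numpy())                 # [[2. 1.], [4. 2.5]]
```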
+ self.evaluate(run_op) + + @test_combinations.generate(opt_combinations_only()) + def testDynamicAttrsWithFixedLossScale(self, opt_cls): + opt = create_sgd(opt_cls) + opt = create_lso(opt, dynamic=False, initial_scale=2.0) + self.assertFalse(opt.dynamic) + self.assertIsNone(opt.dynamic_counter) + self.assertIsNone(opt.dynamic_growth_steps) + + @test_combinations.generate(opt_combinations_only()) + def testGetScaledLoss(self, opt_cls): + opt = create_sgd(opt_cls) + opt = create_lso(opt, dynamic=False, initial_scale=2.0) + loss = tf.convert_to_tensor(5.0) + self.assertEqual(10.0, self.evaluate(opt.get_scaled_loss(loss))) + self.assertEqual( + 10.0, self.evaluate(opt.get_scaled_loss(lambda: loss)()) + ) + loss = tf.convert_to_tensor(5.0, dtype="float16") + self.assertEqual(10.0, self.evaluate(opt.get_scaled_loss(loss))) + self.assertEqual( + 10.0, self.evaluate(opt.get_scaled_loss(lambda: loss)()) + ) + + @test_combinations.generate(opt_combinations_only()) + def testGetUnscaledGradients(self, opt_cls): + opt = create_sgd(opt_cls) + opt = create_lso(opt, dynamic=False, initial_scale=2) + scaled_grads = [ + tf.convert_to_tensor(3.0), + None, + tf.convert_to_tensor(-4.0, dtype="float16"), + ] + grads = opt.get_unscaled_gradients(scaled_grads) + grads = [self.evaluate(g) if g is not None else g for g in grads] + self.assertEqual([1.5, None, -2.0], grads) + + @test_combinations.generate(opt_combinations_only()) + def testGetUnscaledSparseGradients(self, opt_cls): + opt = create_sgd(opt_cls) + opt = create_lso(opt, dynamic=False, initial_scale=2) + sparse_scaled_grad = tf.IndexedSlices( + tf.convert_to_tensor([[4.0, 2.0], [8.0, 5.0]]), + tf.convert_to_tensor([1, 3], dtype="int32"), + dense_shape=tf.convert_to_tensor([5, 2], dtype="int32"), + ) + sparse_grad = opt.get_unscaled_gradients([sparse_scaled_grad])[0] + self.assertIsInstance(sparse_grad, tf.IndexedSlices) + self.assertAllEqual( + [[2.0, 1.0], [4.0, 2.5]], self.evaluate(sparse_grad.values) + ) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testDynamicLossScale(self, opt_cls, strategy_fn, use_tf_function): + strategy = strategy_fn() + learning_rate = 2.0 + expected_gradient = tf.Variable( + learning_rate / strategy.num_replicas_in_sync + ) + with strategy.scope(): + var = tf.Variable([5.0]) + opt = create_sgd(opt_cls, learning_rate) + opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) + self.assertEqual(opt.initial_scale, 2.0) + self.assertIsInstance(opt.initial_scale, float) + self.assertEqual(opt.dynamic_growth_steps, 1) + self.assertIsInstance(opt.dynamic_growth_steps, int) + + self.assertEqual( + opt.initial_scale % strategy.num_replicas_in_sync, 0 + ) + run_fn = self._run_fn_with_grad_check( + strategy, var, opt, expected_gradient + ) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # The loss is the identity of the variable. Therefore the gradient + # is 1, and so the variable will be init_val - grad * lr == 5 - 1 * + # 2 == 3 + self.assertAllClose([3.0], self.evaluate(var)) + + # Loss scale will be double, so the expected gradient is also + # doubled. + self.evaluate( + expected_gradient.assign( + 2 * learning_rate / strategy.num_replicas_in_sync + ) + ) + run_op = strategy.experimental_run(run_fn) + self._run_if_in_graph_mode(run_op) + # As before, the 2 is subtracted from the variable, making its new + # value 1. 
+ self.assertAllClose([1.0], self.evaluate(var)) + + @test_combinations.generate(opt_combinations_only()) + def testDynamicLossScaleDefaultValues(self, opt_cls): + opt = create_sgd(opt_cls) + opt = create_lso(opt) + self.assertEqual(opt.initial_scale, 2**15) + self.assertEqual(opt.dynamic_growth_steps, 2000) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(opt.loss_scale), 2**15) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testClipping(self, opt_cls, strategy_fn, use_tf_function): + strategy = strategy_fn() + learning_rate = 2.0 + for clip_type in ("clipnorm", "global_clipnorm", "clipvalue"): + with strategy.scope(), self.subTest(clip_type=clip_type): + var = tf.Variable([5.0]) + opt = create_sgd(opt_cls, learning_rate, **{clip_type: 2.0}) + opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) + if isinstance(opt, loss_scale_optimizer.LossScaleOptimizer): + # Only OptimizerV2 exposes the clipping attributes + self.assertEqual(getattr(opt, clip_type), 2.0) + self.assertEqual( + opt.initial_scale % strategy.num_replicas_in_sync, 0 + ) + + loss = lambda: var * 4 / strategy.num_replicas_in_sync + run_fn = lambda: opt.minimize(loss, var_list=[var]) + if use_tf_function: + run_fn = tf.function(run_fn) + + # Test running with clipped gradients + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # The gradient is 4 but is clipped to 2, so the variable will be + # init_val - clipped_grad * lr == 5 - 2 * 2 == 1 + self.assertAllClose([1.0], self.evaluate(var)) + self.assertEqual(self.evaluate(opt.loss_scale), 4) + + if isinstance(opt, loss_scale_optimizer.LossScaleOptimizerV3): + # Only OptimizerV2 exposes the clipping attributes, so we + # cannot set them on the new optimizer + return + # Test changing the clip amount and running again + setattr(opt, clip_type, 3.0) + run_op = strategy.experimental_run(run_fn) + self._run_if_in_graph_mode(run_op) + # The gradient is 4 but is clipped to 3, so the variable will be + # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5 + self.assertAllClose([-5.0], self.evaluate(var)) + self.assertEqual(self.evaluate(opt.loss_scale), 8) + + # Test Inf gradients are still skipped instead of being clipped + loss = lambda: var * float("Inf") + run_fn = lambda: opt.minimize(loss, var_list=[var]) + run_op = strategy.experimental_run(run_fn) + self._run_if_in_graph_mode(run_op) + self.assertAllClose( + [-5.0], self.evaluate(var) + ) # Var does not change + self.assertEqual(self.evaluate(opt.loss_scale), 4) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testDynamicUpdate(self, opt_cls, strategy_fn, use_tf_function): + with strategy_fn().scope() as strategy: + var = tf.Variable([1.0, 2.0]) + opt = create_sgd(opt_cls, 1.0) + opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) + + # Test optimizer with finite gradients + loss = lambda: var * 2.0 / strategy.num_replicas_in_sync + run_fn = lambda: opt.minimize(loss, var_list=[var]) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # Gradient is 2, so variable will have 2 subtracted from it + self.assertAllClose([-1.0, 0.0], self.evaluate(var)) + # Loss scale has doubled from 2 to 4 + self.assertEqual(4.0, self.evaluate(opt.loss_scale)) + + # Test optimizer with NaN 
gradients + loss = lambda: var * float("NaN") + run_fn = lambda: opt.minimize(loss, var_list=[var]) + run_op = strategy.experimental_run(run_fn) + self._run_if_in_graph_mode(run_op) + # Variable should not change from before, due to NaN gradients. + self.assertAllClose(self.evaluate(var), [-1.0, 0.0]) + # Loss scale should halve due to NaN gradients. + self.assertEqual(2.0, self.evaluate(opt.loss_scale)) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testDynamicLossScaleWithFloat16Loss( + self, opt_cls, strategy_fn, use_tf_function + ): + strategy = strategy_fn() + learning_rate = 2.0 + with strategy.scope(): + var = tf.Variable([5.0]) + opt = create_sgd(opt_cls, learning_rate) + opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) + + def loss(): + return tf.cast(var / strategy.num_replicas_in_sync, "float16") + + run_fn = lambda: opt.minimize(loss, var_list=[var]) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # The loss is the identity of the variable. Therefore the gradient + # is 1, and so the variable will be init_val - grad * lr == 5 - 1 * + # 2 == 3 + self.assertAllClose([3.0], self.evaluate(var)) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testNanOnOneReplicaOnly(self, opt_cls, strategy_fn, use_tf_function): + if strategy_fn == default_strategy_fn: + self.skipTest("The test is only useful for non-default strategies") + if not tf.test.is_gpu_available(): + self.skipTest("Test requires GPU") + if ( + not tf.executing_eagerly() + and not tf.compat.v1.control_flow_v2_enabled() + ): + self.skipTest( + "b/181283011: GradientTape does not work properly with " + "V1 control flow, and opt.minimize uses GradientTape" + ) + with strategy_fn().scope() as strategy: + var = tf.Variable([1.0, 2.0]) + opt = create_sgd(opt_cls, 1.0) + opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=2) + + def loss(): + rep_id = ( + tf.distribute.get_replica_context().replica_id_in_sync_group + ) + # The last element of last replica's gradient is NaN. + return tf.cond( + tf.equal(rep_id, 0), + lambda: var * 2.0, + lambda: var * tf.constant([1.0, float("NaN")]), + ) + + run_fn = lambda: opt.minimize(loss, var_list=[var]) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # Variable should not change from before, due to NaN gradients. + self.assertAllClose(self.evaluate(var), [1.0, 2.0]) + # Loss scale should halve due to NaN gradients. 
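The halving asserted here, together with the doubling after `dynamic_growth_steps` good steps, can be reproduced in a few eager lines; a sketch against the public wrapper:

```python
import tensorflow as tf

var = tf.Variable(1.0)
opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(1.0), initial_scale=4.0, dynamic_growth_steps=1
)

opt.minimize(lambda: var * 1.0, [var])           # finite gradients
print(opt.loss_scale.numpy())                    # 8.0: the scale doubled

opt.minimize(lambda: var * float("nan"), [var])  # non-finite gradients
print(opt.loss_scale.numpy())                    # 4.0: step skipped, scale halved
```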
+ self.assertEqual(1.0, self.evaluate(opt.loss_scale)) + + def testCustomAggregater(self): + def gradient_aggregator(grads_and_vars): + # Simulate an all-reduce where a replica has a NaN gradient by + # setting the last gradient to NaN + grads_and_vars = list(grads_and_vars) + last_grad, last_var = grads_and_vars[-1] + grads_and_vars[-1] = (last_grad * float("NaN"), last_var) + return grads_and_vars + + var = tf.Variable([1.0, 2.0]) + opt = gradient_descent.SGD(1.0, gradient_aggregator=gradient_aggregator) + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, initial_scale=2, dynamic_growth_steps=2 + ) + + loss = lambda: var * 2 + run_op = opt.minimize(loss, var_list=[var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # Variable should not change from before, due to NaN gradients. + self.assertAllClose(self.evaluate(var), [1.0, 2.0]) + # Loss scale should halve due to NaN gradients. + self.assertEqual(1.0, self.evaluate(opt.loss_scale)) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testDynamicLossScaleWithSlots( + self, opt_cls, strategy_fn, use_tf_function + ): + strategy_obj = strategy_fn() + if ( + isinstance(strategy_obj, tf.distribute.MirroredStrategy) + and tf.compat.v1.control_flow_v2_enabled() + and not tf.executing_eagerly() + ): + self.skipTest("b/138667997") + with strategy_obj.scope() as strategy: + var = tf.Variable([1.0, 2.0]) + # An SGD optimizer with momentum has slot variables. + opt = create_sgd(opt_cls, 1.0, momentum=1.0) + initial_scale = 2.0 + opt = create_lso( + opt, initial_scale=initial_scale, dynamic_growth_steps=1 + ) + loss = lambda: var / strategy.num_replicas_in_sync + run_fn = lambda: opt.minimize(loss, var_list=[var]) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + # The momentum accumulator starts at 0 and the gradient is 1. The + # accumulator is incremented by the gradient, so it is now 1. Then + # the variable is subtracted by the accumulator, so the variable is + # subtracted by 1. + self.assertAllClose([0.0, 1.0], self.evaluate(var)) + self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2) + + run_op = strategy.experimental_run(run_fn) + self._run_if_in_graph_mode(run_op) + # The momentum accumulator was 1 before this step and the gradient + # is 1. The accumulator is incremented by the gradient, so it is + # now 2. Then the variable is subtracted by the accumulator, so the + # variable is subtracted by 2. + self.assertAllClose([-2.0, -1.0], self.evaluate(var)) + self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4) + + if isinstance(opt, loss_scale_optimizer.LossScaleOptimizer): + self.assertEqual(opt.get_slot_names(), ["momentum"]) + + def testIterations(self): + opt = gradient_descent.SGD(2.0) + lso = loss_scale_optimizer.LossScaleOptimizer( + opt, dynamic=False, initial_scale=10.0 + ) + lso.iterations = 7 + self.assertEqual(lso.iterations, 7) + self.assertEqual(opt.iterations, 7) + + @test_combinations.generate(opt_and_strategy_and_mode_combinations()) + def testIterationsIncremented(self, opt_cls, strategy_fn, use_tf_function): + with strategy_fn().scope() as strategy: + # Test iterations is incremented in opt.minimize. 
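The invariant this test pins down also holds in isolation: `iterations` advances even when non-finite gradients cause the weight update to be skipped. A minimal eager sketch:

```python
import tensorflow as tf

var = tf.Variable(5.0)
opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.SGD(1.0))

opt.minimize(lambda: var * 2.0, [var])
print(int(opt.iterations), var.numpy())  # 1 3.0 (grad is 2, lr is 1: 5 - 2)

opt.minimize(lambda: var * float("nan"), [var])
print(int(opt.iterations), var.numpy())  # 2 3.0 (step skipped, counter moved)
```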
+ opt = create_sgd(opt_cls, 1.0) + opt = create_lso(opt) + var = tf.Variable([5.0]) + loss = lambda: var * 2.0 / strategy.num_replicas_in_sync + run_fn = lambda: opt.minimize(loss, [var]) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + self.assertEqual( + self.evaluate(var), 3.0 + ) # Grad is 2, so var is 5 - 2 + self.assertEqual(self.evaluate(opt.iterations), 1) + + # Test iterations is incremented in opt.minimize even if gradients + # aren't applied to variables due to NaN gradients. + loss = lambda: var * float("NaN") + run_fn = lambda: opt.minimize(loss, [var]) + if use_tf_function: + run_fn = tf.function(run_fn) + run_op = strategy.experimental_run(run_fn) + self._run_if_in_graph_mode(run_op) + self.assertEqual(self.evaluate(var), 3.0) + self.assertEqual(self.evaluate(opt.iterations), 2) + + def testWeightMethods(self): + with self.test_session(): + var = tf.Variable([1.0]) + opt = gradient_descent.SGD(1.0) + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, initial_scale=2.0, dynamic_growth_steps=1 + ) + run_op = opt.minimize(lambda: var * 2, [var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self._run_if_in_graph_mode(run_op) + + self.assertLen(opt.weights, 1) # The 'iterations' weight + self.assertEqual(self.evaluate(opt.weights[0]), 1) + self.assertEqual(opt.get_weights()[0], 1) + self.assertEqual(self.evaluate(opt.variables()[0]), 1) + opt.set_weights([np.array(2.0)]) + self.assertEqual(self.evaluate(opt.variables()[0]), 2) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def testHyperParametersExposedLSOV3(self): + opt = adam_experimental.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9) + lso = loss_scale_optimizer.BaseLossScaleOptimizer(opt) + lso.learning_rate = tf.Variable(0.005) + self.assertAllClose(self.evaluate(lso.learning_rate), 0.005) + self.assertIs(lso.learning_rate, opt.learning_rate) + + lso.use_ema = True + self.assertEqual(lso.use_ema, True) + self.assertEqual(opt.use_ema, True) + + lso.ema_momentum = 0.88 + self.assertEqual(lso.ema_momentum, 0.88) + self.assertEqual(opt.ema_momentum, 0.88) + + def testHyperParametersExposed(self): + with self.cached_session(): + opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9) + lso = loss_scale_optimizer.LossScaleOptimizer(opt) + # Force hyperparameters to be created + opt.lr + self.evaluate(tf.compat.v1.global_variables_initializer()) + + self.assertEqual(self.evaluate(lso.beta_1), 0.5) + self.assertIsInstance(lso.beta_1, tf.Variable) + self.assertEqual(self.evaluate(lso.lr), 1.0) + self.assertIs(lso.lr, opt.lr) + self.assertIs(lso.lr, lso.learning_rate) + + lso.beta_1 = 0.25 + self.assertEqual(self.evaluate(lso.beta_1), 0.25) + self.assertEqual(self.evaluate(opt.beta_1), 0.25) + self.assertIs(lso.beta_1, opt.beta_1) + opt.beta_1 = 0.75 + self.assertEqual(self.evaluate(lso.beta_1), 0.75) + self.assertEqual(self.evaluate(opt.beta_1), 0.75) + self.assertIs(lso.beta_1, opt.beta_1) + lso.lr = 2.0 + self.assertEqual(self.evaluate(lso.lr), 2.0) + self.assertEqual(self.evaluate(lso.learning_rate), 2.0) + self.assertEqual(self.evaluate(opt.lr), 2.0) + self.assertEqual(self.evaluate(opt.learning_rate), 2.0) + self.assertIs(lso.lr, opt.lr) + + # Test setting attribute that is both attribute on + # LossScaleOptimizer and hyperparameter on wrapped optimizer. 
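# The hyperparameter delegation checked by the tests above (lso.lr,
# lso.beta_1, etc. forwarding to the wrapped optimizer), sketched with a
# hypothetical wrapper. The real LossScaleOptimizer does this with
# hyperparameter-aware attribute logic; this is only the idea:
class ToyWrapper:
    def __init__(self, inner):
        object.__setattr__(self, "inner", inner)

    def __getattr__(self, name):
        # Called only when normal lookup fails: fall through to the
        # wrapped optimizer.
        return getattr(self.inner, name)

    def __setattr__(self, name, value):
        if hasattr(self.inner, name):
            setattr(self.inner, name, value)  # shared hyperparameter
        else:
            object.__setattr__(self, name, value)


class ToyOpt:
    def __init__(self):
        self.learning_rate = 1.0


inner = ToyOpt()
wrapper = ToyWrapper(inner)
wrapper.learning_rate = 2.0
assert inner.learning_rate == 2.0 and wrapper.learning_rate == 2.0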
+ class MyOpt(gradient_descent.SGD): + def __init__(self): + super().__init__() + self._set_hyper("loss_scale", 123.0) + + opt = MyOpt() + lso = loss_scale_optimizer.LossScaleOptimizer(opt) + with self.assertRaises(AttributeError): + lso.loss_scale = 2.0 + + @test_combinations.generate(opt_combinations_only()) + def testArbitraryAttributesNotExposed(self, opt_cls): + opt = create_sgd(opt_cls) + lso = create_lso(opt) + self.assertFalse(opt.nesterov) + with self.assertRaisesRegex( + AttributeError, + "'LossScaleOptimizer(V3)?' object has no attribute 'nesterov'", + ): + lso.nesterov + + lso.nesterov = True + self.assertTrue(lso.nesterov) + self.assertFalse(opt.nesterov) + + def testDir(self): + lso = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD()) + dir_result = dir(lso) + self.assertIn("learning_rate", dir_result) # Hyperparameter + self.assertIn("lr", dir_result) # Hyperparameter + self.assertIn("minimize", dir_result) # Attribute + self.assertIn("loss_scale", dir_result) # Attribute + self.assertNotIn("nesterov", dir_result) # Attribute on inner optimizer + self.assertIn("nesterov", dir(lso.inner_optimizer)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testApplyGradientsGetsUnwrappedTensors(self): + # Tests that gradients passed to apply_gradients are not wrapped in a + # DistributionStrategy wrapper, such as PerReplica, but instead are raw + # Tensors. Optimizer subclasses that override apply_gradients() expect + # raw Tensors, even though the base Optimizer can handle PerReplica + # gradients. + + outer_self = self + + class MyOptimizer(gradient_descent.SGD): + def apply_gradients( + self, + grads_and_vars, + name=None, + experimental_aggregate_gradients=True, + ): + for grad, _ in grads_and_vars: + outer_self.assertIsInstance(grad, tf.Tensor) + return super().apply_gradients( + grads_and_vars, name, experimental_aggregate_gradients + ) + + with create_mirrored_strategy().scope() as strategy: + var = tf.Variable([5.0]) + opt = MyOptimizer(learning_rate=1.0) + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, dynamic=False, initial_scale=1 + ) + loss = lambda: var * 2.0 + run_fn = lambda: opt.minimize(loss, [var]) + strategy.experimental_run(run_fn) + + @test_combinations.generate( + test_combinations.combine(mode="eager", use_tf_function=[False, True]) + ) + def testApplyGradientsGetsUnwrappedTensorsWithNewOptimizer( + self, use_tf_function + ): + outer_self = self + + class MyOptimizer(sgd_experimental.SGD): + def apply_gradients( + self, + grads_and_vars, + skip_gradients_aggregation=False, + experimental_aggregate_gradients=True, + ): + for grad, _ in grads_and_vars: + outer_self.assertIsInstance(grad, tf.Tensor) + return super().apply_gradients( + grads_and_vars, + skip_gradients_aggregation=skip_gradients_aggregation, + ) + + with create_mirrored_strategy().scope() as strategy: + var = tf.Variable([5.0]) + opt = MyOptimizer(learning_rate=1.0) + opt = loss_scale_optimizer.LossScaleOptimizerV3( + opt, dynamic=False, initial_scale=1 + ) + loss = lambda: var * 2.0 + run_fn = lambda: opt.minimize(loss, [var]) + if use_tf_function: + run_fn = tf.function(run_fn) + strategy.experimental_run(run_fn) + + @test_combinations.generate(opt_combinations_only()) + def testLossScaleDelegationWithWrapper(self, opt_cls): + # Test learning_rate is exposed when LossScaleOptimizer wraps another + # wrapper. 
+ + class MyOptimizer(opt_cls): + def __init__(self): + super().__init__("MyOptimizer") + self.inner_optimizer = create_sgd(opt_cls, learning_rate=1.0) + + @property + def learning_rate(self): + return self.inner_optimizer.learning_rate + + @learning_rate.setter + def learning_rate(self, value): + self.inner_optimizer.learning_rate = value + + def get_config(self): + return {} + + with self.cached_session(): + opt = MyOptimizer() + opt = create_lso(opt) + + # Force hyperparameters to be created + opt.learning_rate + self.evaluate(tf.compat.v1.global_variables_initializer()) + + self.assertEqual(self.evaluate(opt.learning_rate), 1.0) + self.assertEqual( + self.evaluate( + opt.inner_optimizer.inner_optimizer.learning_rate + ), + 1.0, + ) + opt.learning_rate = 2.0 + self.assertEqual(self.evaluate(opt.learning_rate), 2.0) + self.assertEqual( + self.evaluate( + opt.inner_optimizer.inner_optimizer.learning_rate + ), + 2.0, + ) + + @test_combinations.generate( + test_combinations.combine( + opt_cls=optimizer_v2.OptimizerV2, + strategy_fn=STRATEGY_FNS, + mode=["graph", "eager"], + use_tf_function=False, + save_with_ls=[False, True], + restore_with_ls=[False, True], + ) + + test_combinations.combine( + opt_cls=optimizer_experimental.Optimizer, + strategy_fn=STRATEGY_FNS, + mode="eager", + use_tf_function=[False, True], + save_with_ls=[False, True], + restore_with_ls=[False, True], + ) + ) + def testCheckpoint( + self, + opt_cls, + strategy_fn, + use_tf_function, + save_with_ls, + restore_with_ls, + ): + + if not save_with_ls and not restore_with_ls: + self.skipTest( + "Skipping because save_with_ls=False and " + "restore_with_ls=False, which means loss scaling is not " + "used" + ) + + sgd_cls = type(create_sgd(opt_cls)) + + class MySGD(sgd_cls): + """A custom optimizer that tracks an extra variable.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.my_var = tf.Variable(0.0) + self._track_trackable(self.my_var, "my_var") + + strategy = strategy_fn() + replicas = strategy.num_replicas_in_sync + if ( + isinstance(strategy, tf.distribute.MirroredStrategy) + and not tf.executing_eagerly() + ): + # TODO(b/121381184): Enable running the test in this case. + return + + with self.test_session(), strategy.scope(): + # Build and run a simple model. + var = tf.Variable([2.0]) + opt = inner_opt = MySGD(1.0, momentum=1.0) + if save_with_ls: + opt = create_lso( + opt, initial_scale=1.0, dynamic_growth_steps=2.0 + ) + run_fn = lambda: opt.minimize( + lambda: var / replicas + 1.0, var_list=[var] + ) + if use_tf_function: + run_fn = tf.function(run_fn) + opt_op = strategy.experimental_run(run_fn) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(strategy.experimental_local_results(opt_op)) + + # Assert values. + self.assertEqual(self.evaluate(var), 1.0) + if save_with_ls: + self.assertEqual(self.evaluate(opt.loss_scale), 1.0) + self.assertEqual(self.evaluate(opt.dynamic_counter), 1) + if opt_cls == optimizer_v2.OptimizerV2: + slot_var = opt.get_slot(var, "momentum") + self.assertEqual(self.evaluate(slot_var).item(), -1) + self.assertEqual(self.evaluate(opt.iterations), 1) + + # Set optimizer variable to check arbitrary optimizer attributes can + # be saved/restored + self.evaluate(inner_opt.my_var.assign(1.0)) + + # Save a checkpoint. 
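# A minimal TF2 sketch of the tf.train.Checkpoint round trip this test
# builds on (assumes eager mode; the path is illustrative):
import os
import tempfile

import tensorflow as tf

v = tf.Variable([2.0])
ckpt = tf.train.Checkpoint(var=v)
prefix = os.path.join(tempfile.mkdtemp(), "ckpt")
save_path = ckpt.save(prefix)  # e.g. ".../ckpt-1"

v.assign([0.0])          # clobber the value...
ckpt.restore(save_path)  # ...then bring it back from the checkpoint
assert v.numpy()[0] == 2.0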
+ checkpoint = tf.train.Checkpoint(optimizer=opt, var=var) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + save_path = checkpoint.save(prefix) + + # Create new model + var = tf.Variable([2.0]) + opt = inner_opt = MySGD(1.0, momentum=1.0) + if restore_with_ls: + opt = create_lso( + opt, initial_scale=1.0, dynamic_growth_steps=2.0 + ) + + # Restore new model. + checkpoint = tf.train.Checkpoint(optimizer=opt, var=var) + status = checkpoint.restore(save_path) + if save_with_ls: + status.assert_existing_objects_matched() + else: + status.assert_nontrivial_match() + + # Assert restored values. We can only assert in eager mode since the + # variables are uninitialized in graph mode + if tf.executing_eagerly(): + self.assertEqual(self.evaluate(var), 1.0) + if save_with_ls and restore_with_ls: + self.assertEqual(self.evaluate(opt.loss_scale), 1.0) + self.assertEqual(self.evaluate(opt.dynamic_counter), 1) + elif restore_with_ls: + self.assertEqual(self.evaluate(opt.loss_scale), 1.0) + self.assertEqual(self.evaluate(opt.dynamic_counter), 0) + self.assertEqual(self.evaluate(opt.iterations), 1) + + # Run the model again. + run_fn = lambda: opt.minimize( + lambda: var / replicas + 1.0, var_list=[var] + ) + if use_tf_function: + run_fn = tf.function(run_fn) + opt_op = strategy.experimental_run(run_fn) + + # Assert new values. + self.evaluate(tf.compat.v1.global_variables_initializer()) + status.run_restore_ops() + self.evaluate(strategy.experimental_local_results(opt_op)) + self.assertEqual(self.evaluate(var), -1) + if opt_cls == optimizer_v2.OptimizerV2: + slot_var = opt.get_slot(var, "momentum") + self.assertEqual(self.evaluate(slot_var).item(), -2) + self.assertEqual(self.evaluate(opt.iterations), 2) + self.assertEqual(self.evaluate(inner_opt.my_var), 1) + + # Restore model again to test restoring after slots are created + status = checkpoint.restore(save_path) + if save_with_ls and restore_with_ls: + status.assert_consumed() + elif save_with_ls: + status.assert_existing_objects_matched() + elif restore_with_ls: + status.assert_nontrivial_match() + status.run_restore_ops() + self.assertEqual(self.evaluate(var), 1) + if opt_cls == optimizer_v2.OptimizerV2: + self.assertEqual(self.evaluate(slot_var).item(), -1) + + @test_combinations.generate( + test_combinations.combine(config_version=["v2", "tf2_3"]) + + test_combinations.combine(config_version="v3", mode="eager") + ) + def testGetConfigFixed(self, config_version): + # Get a config from LossScaleOptimizer, LossScaleOptimizerV3, or the + # LossScaleOptimizer from TF 2.3. 
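# The get_config()/from_config() round-trip contract these config tests
# rely on, reduced to a toy class (illustrative; Keras objects follow the
# same pattern):
class ToyLSO:
    def __init__(self, initial_scale, dynamic_growth_steps=None):
        self.initial_scale = initial_scale
        self.dynamic_growth_steps = dynamic_growth_steps

    def get_config(self):
        return {
            "initial_scale": self.initial_scale,
            "dynamic_growth_steps": self.dynamic_growth_steps,
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)


opt_toy = ToyLSO(initial_scale=2, dynamic_growth_steps=3)
restored = ToyLSO.from_config(opt_toy.get_config())
assert restored.get_config() == opt_toy.get_config()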
Then restore the config into a + # LossScaleOptimizer or LossScaleOptimizerV3 + if config_version == "v2": + opt = gradient_descent.SGD(2.0, momentum=0.5) + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, dynamic=False, initial_scale=2 + ) + config = opt.get_config() + opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) + elif config_version == "v3": + opt = sgd_experimental.SGD(2.0, momentum=0.5) + opt = loss_scale_optimizer.LossScaleOptimizerV3( + opt, dynamic=False, initial_scale=2 + ) + config = opt.get_config() + opt = loss_scale_optimizer.LossScaleOptimizerV3.from_config(config) + else: + self.assertEqual(config_version, "tf2_3") + config = { + "optimizer": { + "class_name": "SGD", + "config": { + "learning_rate": 2.0, + "momentum": 0.5, + "decay": 0.0, + "nesterov": False, + "name": "SGD", + }, + }, + "loss_scale": { + "class_name": "FixedLossScale", + "config": {"loss_scale_value": 2.0}, + }, + } + opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) + + # Force hyperparameters to be created + opt.learning_rate + self.evaluate(tf.compat.v1.global_variables_initializer()) - def _run_if_in_graph_mode(self, val): - # Running only in graph mode is useful, because optimizers sometimes return - # a value that, in Graph mode, is runnable with self.evaluate. But in Eager - # mode, the optimizer already does the computations and the return value - # cannot be run. - if not tf.executing_eagerly(): - self.evaluate(val) - - def _eval_if_tensor(self, val): - # Calls self.evaluate on val if val is a Tensor or Variable. This is useful, - # since hyperparameters are tf.Variables on OptimizerV2 and are Python - # floats on the experimental optimizer. - return (self.evaluate(val) if isinstance(val, (tf.Tensor, tf.Variable)) - else val) - - def _run_fn_with_grad_check(self, strategy, var, opt, expected_grad): - grad_check_fn = mp_test_util.create_identity_with_grad_check_fn( - expected_grad) - loss = lambda: grad_check_fn(var) / strategy.num_replicas_in_sync - return lambda: opt.minimize(loss, var_list=[var]) - - def testIsInstance(self): - optimizer = create_lso(sgd_experimental.SGD()) - self.assertIsInstance(optimizer, - loss_scale_optimizer.BaseLossScaleOptimizer) - - optimizer = create_lso(gradient_descent.SGD()) - self.assertIsInstance(optimizer, - loss_scale_optimizer.BaseLossScaleOptimizer) - - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testFixedLossScaleAppliedToLossWithMinimize(self, opt_cls, strategy_fn, - use_tf_function): - with strategy_fn().scope() as strategy: - var = tf.Variable([5.0]) - opt = create_sgd(opt_cls, 2.0) - loss_scale = 10. - opt = create_lso(opt, dynamic=False, initial_scale=loss_scale) - self.assertEqual(self.evaluate(opt.loss_scale), loss_scale) - self.assertIsInstance(opt.loss_scale, tf.Tensor) - # We need num_replicas_in_sync to divide loss_scale, otherwise loss_scale - # / strategy.num_replicas_in_sync will not be exact, which could lead to - # assertion failures due to rounding issues. - self.assertEqual(loss_scale % strategy.num_replicas_in_sync, 0) - run_fn = self._run_fn_with_grad_check( - strategy, var, opt, loss_scale / strategy.num_replicas_in_sync) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - # The loss is the identity of the variable. 
Therefore the gradient is 1, - # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3 - self.assertAllClose([3.], self.evaluate(var)) - - def testFixedLossScaleAppliedToLossWithGetGradients(self): - with tf.Graph().as_default(): - var = tf.Variable([2.0]) - opt = gradient_descent.SGD(1.0) - loss_scale = 10. - opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False, - initial_scale=loss_scale) - grad_check_fn = mp_test_util.create_identity_with_grad_check_fn( - loss_scale) - loss = grad_check_fn(var) - run_op = opt.get_gradients(loss, [var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # This will cause an assertion to run, as - # mp_test_util.create_identity_with_grad_check_fn added an assertion op. - self.evaluate(run_op) - - @test_combinations.generate(opt_combinations_only()) - def testDynamicAttrsWithFixedLossScale(self, opt_cls): - opt = create_sgd(opt_cls) - opt = create_lso(opt, dynamic=False, initial_scale=2.) - self.assertFalse(opt.dynamic) - self.assertIsNone(opt.dynamic_counter) - self.assertIsNone(opt.dynamic_growth_steps) - - @test_combinations.generate(opt_combinations_only()) - def testGetScaledLoss(self, opt_cls): - opt = create_sgd(opt_cls) - opt = create_lso(opt, dynamic=False, initial_scale=2.) - loss = tf.convert_to_tensor(5.) - self.assertEqual(10., self.evaluate(opt.get_scaled_loss(loss))) - self.assertEqual(10., self.evaluate(opt.get_scaled_loss(lambda: loss)())) - loss = tf.convert_to_tensor(5., dtype='float16') - self.assertEqual(10., self.evaluate(opt.get_scaled_loss(loss))) - self.assertEqual(10., self.evaluate(opt.get_scaled_loss(lambda: loss)())) - - @test_combinations.generate(opt_combinations_only()) - def testGetUnscaledGradients(self, opt_cls): - opt = create_sgd(opt_cls) - opt = create_lso(opt, dynamic=False, initial_scale=2) - scaled_grads = [ - tf.convert_to_tensor(3.), None, - tf.convert_to_tensor(-4., dtype='float16') - ] - grads = opt.get_unscaled_gradients(scaled_grads) - grads = [self.evaluate(g) if g is not None else g for g in grads] - self.assertEqual([1.5, None, -2.], grads) - - @test_combinations.generate(opt_combinations_only()) - def testGetUnscaledSparseGradients(self, opt_cls): - opt = create_sgd(opt_cls) - opt = create_lso(opt, dynamic=False, initial_scale=2) - sparse_scaled_grad = tf.IndexedSlices( - tf.convert_to_tensor([[4., 2.], [8., 5.]]), - tf.convert_to_tensor([1, 3], dtype='int32'), - dense_shape=tf.convert_to_tensor([5, 2], dtype='int32')) - sparse_grad = opt.get_unscaled_gradients([sparse_scaled_grad])[0] - self.assertIsInstance(sparse_grad, tf.IndexedSlices) - self.assertAllEqual([[2., 1.], [4., 2.5]], - self.evaluate(sparse_grad.values)) - - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testDynamicLossScale(self, opt_cls, strategy_fn, use_tf_function): - strategy = strategy_fn() - learning_rate = 2. - expected_gradient = tf.Variable(learning_rate / - strategy.num_replicas_in_sync) - with strategy.scope(): - var = tf.Variable([5.0]) - opt = create_sgd(opt_cls, learning_rate) - opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) - self.assertEqual(opt.initial_scale, 2.) 
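# The scale/unscale round trip exercised by testGetScaledLoss and
# testGetUnscaledGradients above is what the documented custom-training-loop
# pattern relies on. A sketch, assuming `opt` is a built LossScaleOptimizer,
# `var_list` its variables, and `compute_loss` a user-supplied function:
import tensorflow as tf


def train_step(opt, var_list, compute_loss):
    with tf.GradientTape() as tape:
        loss = compute_loss()
        # Multiply the loss by the current scale so small float16
        # gradients do not underflow to zero.
        scaled_loss = opt.get_scaled_loss(loss)
    scaled_grads = tape.gradient(scaled_loss, var_list)
    # Divide the gradients by the same scale before applying them.
    grads = opt.get_unscaled_gradients(scaled_grads)
    opt.apply_gradients(zip(grads, var_list))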
- self.assertIsInstance(opt.initial_scale, float) - self.assertEqual(opt.dynamic_growth_steps, 1) - self.assertIsInstance(opt.dynamic_growth_steps, int) - - self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync, 0) - run_fn = self._run_fn_with_grad_check(strategy, var, opt, - expected_gradient) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - # The loss is the identity of the variable. Therefore the gradient is 1, - # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3 - self.assertAllClose([3.], self.evaluate(var)) - - # Loss scale will be double, so the expected gradient is also doubled. - self.evaluate(expected_gradient.assign( - 2 * learning_rate / strategy.num_replicas_in_sync)) - run_op = strategy.experimental_run(run_fn) - self._run_if_in_graph_mode(run_op) - # As before, the 2 is subtracted from the variable, making it's new value - # 1. - self.assertAllClose([1.], self.evaluate(var)) - - @test_combinations.generate(opt_combinations_only()) - def testDynamicLossScaleDefaultValues(self, opt_cls): - opt = create_sgd(opt_cls) - opt = create_lso(opt) - self.assertEqual(opt.initial_scale, 2 ** 15) - self.assertEqual(opt.dynamic_growth_steps, 2000) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(opt.loss_scale), 2 ** 15) - - # pylint: disable=cell-var-from-loop - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testClipping(self, opt_cls, strategy_fn, use_tf_function): - strategy = strategy_fn() - learning_rate = 2. - for clip_type in ('clipnorm', 'global_clipnorm', 'clipvalue'): - with strategy.scope(), self.subTest(clip_type=clip_type): + # Test attributes on the optimizer + self.assertEqual(self.evaluate(opt.learning_rate), 2.0) + self.assertEqual(self.evaluate(opt.inner_optimizer.learning_rate), 2.0) + self.assertEqual( + self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5 + ) + self.assertEqual(self.evaluate(opt.loss_scale), 2.0) + self.assertEqual(opt.initial_scale, 2.0) + self.assertIsNone(opt.dynamic_growth_steps) + self.assertIsNone(opt.dynamic_counter) + self.assertFalse(opt.dynamic) + + # Ensure the optimizer can be used var = tf.Variable([5.0]) - opt = create_sgd(opt_cls, learning_rate, **{clip_type: 2.0}) - opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) - if isinstance(opt, loss_scale_optimizer.LossScaleOptimizer): - # Only OptimizerV2 exposes the clipping attributes - self.assertEqual(getattr(opt, clip_type), 2.0) - self.assertEqual(opt.initial_scale % strategy.num_replicas_in_sync, 0) - - loss = lambda: var * 4 / strategy.num_replicas_in_sync - run_fn = lambda: opt.minimize(loss, var_list=[var]) - if use_tf_function: - run_fn = tf.function(run_fn) - - # Test running with clipped gradients - run_op = strategy.experimental_run(run_fn) + run_op = self._run_fn_with_grad_check( + tf.distribute.get_strategy(), var, opt, 2 + )() self.evaluate(tf.compat.v1.global_variables_initializer()) self._run_if_in_graph_mode(run_op) - # The gradient is 4 but is clipped to 2, so the variable will be - # init_val - clipped_grad * lr == 5 - 2 * 2 == 1 - self.assertAllClose([1.], self.evaluate(var)) - self.assertEqual(self.evaluate(opt.loss_scale), 4) - - if isinstance(opt, loss_scale_optimizer.LossScaleOptimizerV3): - # Only OptimizerV2 exposes the clipping attributes, so we cannot set - # them on the new optimizer - return - # 
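# The arithmetic behind the clipping test below: clipping applies to the
# *unscaled* gradient, and non-finite gradients still skip the step rather
# than being clipped. A pure-Python illustration of the clipvalue case:
import math


def clipped_sgd_step(var, grad, lr=2.0, clipvalue=2.0):
    if not math.isfinite(grad):
        return var  # NaN/Inf: skip the update, never clip it
    grad = max(-clipvalue, min(clipvalue, grad))
    return var - lr * grad


assert clipped_sgd_step(5.0, 4.0) == 1.0           # 4 clipped to 2: 5 - 2*2
assert clipped_sgd_step(1.0, float("inf")) == 1.0  # step skipped entirely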
Test changing the clip amount and running again - setattr(opt, clip_type, 3.0) - run_op = strategy.experimental_run(run_fn) + self.assertEqual(self.evaluate(var), [3.0]) + + @test_combinations.generate( + test_combinations.combine(config_version=["v2", "tf2_3"]) + + test_combinations.combine(config_version="v3", mode="eager") + ) + def testGetConfigDynamic(self, config_version): + # Get a config from LossScaleOptimizer, LossScaleOptimizerV3, or the + # LossScaleOptimizer from TF 2.3. Then restore the config into a + # LossScaleOptimizer or LossScaleOptimizerV3 + if config_version == "v2": + opt = gradient_descent.SGD(2.0, momentum=0.5) + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, initial_scale=2, dynamic_growth_steps=3 + ) + config = opt.get_config() + opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) + elif config_version == "v3": + opt = sgd_experimental.SGD(2.0, momentum=0.5) + opt = loss_scale_optimizer.LossScaleOptimizerV3( + opt, initial_scale=2, dynamic_growth_steps=3 + ) + config = opt.get_config() + opt = loss_scale_optimizer.LossScaleOptimizerV3.from_config(config) + else: + self.assertEqual(config_version, "tf2_3") + config = { + "optimizer": { + "class_name": "SGD", + "config": { + "learning_rate": 2.0, + "momentum": 0.5, + "decay": 0.0, + "nesterov": False, + "name": "SGD", + }, + }, + "loss_scale": { + "class_name": "DynamicLossScale", + "config": { + "initial_loss_scale": 2.0, + "increment_period": 3, + "multiplier": 2.0, + }, + }, + } + opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) + + # Force hyperparameters to be created + opt.learning_rate + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Test attributes on the optimizer + self.assertEqual(self.evaluate(opt.learning_rate), 2.0) + self.assertEqual(self.evaluate(opt.inner_optimizer.learning_rate), 2.0) + self.assertEqual( + self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5 + ) + self.assertEqual(self.evaluate(opt.loss_scale), 2.0) + self.assertEqual(opt.initial_scale, 2.0) + self.assertEqual(opt.dynamic_growth_steps, 3.0) + self.assertTrue(opt.dynamic) + + # Ensure the optimizer can be used + var = tf.Variable([5.0]) + run_op = self._run_fn_with_grad_check( + tf.distribute.get_strategy(), var, opt, 2 + )() + self.evaluate(tf.compat.v1.global_variables_initializer()) self._run_if_in_graph_mode(run_op) - # The gradient is 4 but is clipped to 3, so the variable will be - # prev_var - clipped_grad * lr == 1 - 3 * 2 == -5 - self.assertAllClose([-5.], self.evaluate(var)) - self.assertEqual(self.evaluate(opt.loss_scale), 8) - - # Test Inf gradients are still skipped instead of being clipped - loss = lambda: var * float('Inf') - run_fn = lambda: opt.minimize(loss, var_list=[var]) - run_op = strategy.experimental_run(run_fn) + self.assertEqual(self.evaluate(var), [3.0]) + self.assertEqual(self.evaluate(opt.dynamic_counter), 1) + + def test_from_config_with_invalid_multiplier(self): + config = { + "optimizer": { + "class_name": "SGD", + "config": { + "learning_rate": 2.0, + "momentum": 0.5, + "decay": 0.0, + "nesterov": False, + "name": "SGD", + }, + }, + "loss_scale": { + "class_name": "DynamicLossScale", + "config": { + "initial_loss_scale": 2.0, + "increment_period": 3, + "multiplier": 4.0, + }, + }, + } + + expected_error = ( + "Cannot deserialize LossScaleOptimizer with a " + "DynamicLossScale whose multiplier is not 2. 
Got " + "DynamicLossScale: DynamicLossScale\\(" + ) + with self.assertRaisesRegex(ValueError, expected_error): + loss_scale_optimizer.LossScaleOptimizer.from_config(config) + + @test_combinations.generate( + test_combinations.combine(lso_type=["v1", "v2"]) + + test_combinations.combine(lso_type="v3", mode="eager") + ) + def testSerializationWithBuiltInOptimizer(self, lso_type): + if lso_type in ("v1", "v2"): + opt = gradient_descent.SGD(2.0, momentum=0.5) + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, initial_scale=2.0, dynamic_growth_steps=3.0 + ) + config = optimizers.serialize(opt) + if lso_type == "v1": + # LossScaleOptimizerV1 was an older experimental version of LSO + # that is now deleted. The config had the same format as LSO but + # the class name was different. This tests that LSO V1 configs + # can still be deserialized, which are deserialized as a + # (non-V1) LSO + config["class_name"] = "LossScaleOptimizerV1" + else: + opt = sgd_experimental.SGD(2.0, momentum=0.5) + opt = loss_scale_optimizer.LossScaleOptimizerV3( + opt, initial_scale=2.0, dynamic_growth_steps=3 + ) + config = optimizers.serialize(opt) + opt = optimizers.deserialize(config) + # Force hyperparameters to be created + opt.learning_rate + self.evaluate(tf.compat.v1.global_variables_initializer()) + + self.assertEqual(self.evaluate(opt.learning_rate), 2.0) + self.assertEqual( + self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5 + ) + self.assertEqual(self.evaluate(opt.loss_scale), 2.0) + self.assertEqual(opt.dynamic_growth_steps, 3.0) + self.assertTrue(opt.dynamic) + if lso_type in ("v1", "v2"): + self.assertEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer) + else: + self.assertEqual( + type(opt), loss_scale_optimizer.LossScaleOptimizerV3 + ) + + # Ensure the optimizer can be used + var = tf.Variable([5.0]) + run_op = self._run_fn_with_grad_check( + tf.distribute.get_strategy(), var, opt, 2 + )() + self.evaluate(tf.compat.v1.global_variables_initializer()) self._run_if_in_graph_mode(run_op) - self.assertAllClose([-5.], self.evaluate(var)) # Var does not change - self.assertEqual(self.evaluate(opt.loss_scale), 4) - # pylint: enable=cell-var-from-loop - - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testDynamicUpdate(self, opt_cls, strategy_fn, use_tf_function): - with strategy_fn().scope() as strategy: - var = tf.Variable([1.0, 2.0]) - opt = create_sgd(opt_cls, 1.0) - opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) - - # Test optimizer with finite gradients - loss = lambda: var * 2.0 / strategy.num_replicas_in_sync - run_fn = lambda: opt.minimize(loss, var_list=[var]) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - # Gradient is 2, so variable will have 2 subtracted from it - self.assertAllClose([-1.0, 0.0], self.evaluate(var)) - # Loss scale has doubled from 2 to 4 - self.assertEqual(4., self.evaluate(opt.loss_scale)) - - # Test optimizer with NaN gradients - loss = lambda: var * float('NaN') - run_fn = lambda: opt.minimize(loss, var_list=[var]) - run_op = strategy.experimental_run(run_fn) - self._run_if_in_graph_mode(run_op) - # Variable should not change from before, due to NaN gradients. - self.assertAllClose(self.evaluate(var), [-1.0, 0.0]) - # Loss scale should half due to NaN gradients. 
- self.assertEqual(2., self.evaluate(opt.loss_scale)) - - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testDynamicLossScaleWithFloat16Loss(self, opt_cls, strategy_fn, - use_tf_function): - strategy = strategy_fn() - learning_rate = 2. - with strategy.scope(): - var = tf.Variable([5.0]) - opt = create_sgd(opt_cls, learning_rate) - opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=1) - - def loss(): - return tf.cast(var / strategy.num_replicas_in_sync, 'float16') - run_fn = lambda: opt.minimize(loss, var_list=[var]) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - # The loss is the identity of the variable. Therefore the gradient is 1, - # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3 - self.assertAllClose([3.], self.evaluate(var)) - - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testNanOnOneReplicaOnly(self, opt_cls, strategy_fn, use_tf_function): - if strategy_fn == default_strategy_fn: - self.skipTest('The test is only useful for non-default strategies') - if not tf.test.is_gpu_available(): - self.skipTest('Test requires GPU') - if (not tf.executing_eagerly() and - not tf.compat.v1.control_flow_v2_enabled()): - self.skipTest('b/181283011: GradientTape does not work properly with ' - 'V1 control flow, and opt.minimize uses GradientTape') - with strategy_fn().scope() as strategy: - var = tf.Variable([1.0, 2.0]) - opt = create_sgd(opt_cls, 1.0) - opt = create_lso(opt, initial_scale=2, dynamic_growth_steps=2) - - def loss(): - rep_id = (tf.distribute.get_replica_context().replica_id_in_sync_group) - # The last element of last replica's gradient is NaN. - return tf.cond( - tf.equal(rep_id, 0), lambda: var * 2., - lambda: var * tf.constant([1., float('NaN')])) - run_fn = lambda: opt.minimize(loss, var_list=[var]) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - # Variable should not change from before, due to NaN gradients. - self.assertAllClose(self.evaluate(var), [1.0, 2.0]) - # Loss scale should half due to NaN gradients. - self.assertEqual(1., self.evaluate(opt.loss_scale)) - - def testCustomAggregater(self): - def gradient_aggregator(grads_and_vars): - # Simulate an all-reduce where a replica has a NaN gradient by setting - # the last gradient to NaN - grads_and_vars = list(grads_and_vars) - last_grad, last_var = grads_and_vars[-1] - grads_and_vars[-1] = (last_grad * float('NaN'), last_var) - return grads_and_vars - - var = tf.Variable([1.0, 2.0]) - opt = gradient_descent.SGD(1.0, gradient_aggregator=gradient_aggregator) - opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2, - dynamic_growth_steps=2) - - loss = lambda: var * 2 - run_op = opt.minimize(loss, var_list=[var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - # Variable should not change from before, due to NaN gradients. - self.assertAllClose(self.evaluate(var), [1.0, 2.0]) - # Loss scale should half due to NaN gradients. 
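# What testNanOnOneReplicaOnly checks, in miniature: the finiteness check
# runs on the combined gradients, so a NaN on any single replica skips the
# update everywhere. A pure-Python simulation of the all-reduce:
import math

replica_grads = [[2.0, 4.0], [1.0, float("nan")]]  # replica 1 produced a NaN
reduced = [sum(gs) for gs in zip(*replica_grads)]  # simulate the all-reduce
all_finite = all(math.isfinite(g) for g in reduced)
assert not all_finite  # so the step is skipped and the scale is halved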
- self.assertEqual(1., self.evaluate(opt.loss_scale)) - - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testDynamicLossScaleWithSlots(self, opt_cls, strategy_fn, - use_tf_function): - strategy_obj = strategy_fn() - if (isinstance(strategy_obj, tf.distribute.MirroredStrategy) and - tf.compat.v1.control_flow_v2_enabled() and - not tf.executing_eagerly()): - self.skipTest('b/138667997') - with strategy_obj.scope() as strategy: - var = tf.Variable([1.0, 2.0]) - # An SGD optimizer with momentum has slot variables. - opt = create_sgd(opt_cls, 1.0, momentum=1.) - initial_scale = 2. - opt = create_lso(opt, initial_scale=initial_scale, dynamic_growth_steps=1) - loss = lambda: var / strategy.num_replicas_in_sync - run_fn = lambda: opt.minimize(loss, var_list=[var]) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - # The momentum accumulator starts at 0 and the gradient is 1. The - # accumulator is incremented by the gradient, so it is now 1. Then the - # variable is subtracted by the accumulator, so the variable is subtracted - # by 1. - self.assertAllClose([0.0, 1.0], self.evaluate(var)) - self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 2) - - run_op = strategy.experimental_run(run_fn) - self._run_if_in_graph_mode(run_op) - # The momentum accumulator was 1 before this step and the gradient is 1. - # The accumulator is incremented by the gradient, so it is now 2. Then the - # variable is subtracted by the accumulator, so the variable is subtracted - # by 2. - self.assertAllClose([-2., -1.], self.evaluate(var)) - self.assertEqual(self.evaluate(opt.loss_scale), initial_scale * 4) - - if isinstance(opt, loss_scale_optimizer.LossScaleOptimizer): - self.assertEqual(opt.get_slot_names(), ['momentum']) - - def testIterations(self): - opt = gradient_descent.SGD(2.0) - lso = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False, - initial_scale=10.) - lso.iterations = 7 - self.assertEqual(lso.iterations, 7) - self.assertEqual(opt.iterations, 7) - - @test_combinations.generate(opt_and_strategy_and_mode_combinations()) - def testIterationsIncremented(self, opt_cls, strategy_fn, use_tf_function): - with strategy_fn().scope() as strategy: - # Test iterations is incremented in opt.minimize. - opt = create_sgd(opt_cls, 1.0) - opt = create_lso(opt) - var = tf.Variable([5.0]) - loss = lambda: var * 2.0 / strategy.num_replicas_in_sync - run_fn = lambda: opt.minimize(loss, [var]) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - self.assertEqual(self.evaluate(var), 3.0) # Grad is 2, so var is 5 - 2 - self.assertEqual(self.evaluate(opt.iterations), 1) - - # Test iterations is incremented in opt.minimize even if gradients aren't - # applied to variables due to NaN gradients. 
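# testIterationsIncremented pins down one subtlety: `iterations` advances
# even when non-finite gradients cause the variable update to be skipped.
# A toy version of that behavior:
import math


class ToyCountingOpt:
    def __init__(self):
        self.iterations = 0

    def minimize_step(self, var, grad, lr=1.0):
        self.iterations += 1       # counted unconditionally
        if math.isfinite(grad):
            var = var - lr * grad  # applied only when the gradient is finite
        return var


opt_toy = ToyCountingOpt()
var = opt_toy.minimize_step(5.0, 2.0)
assert (var, opt_toy.iterations) == (3.0, 1)
var = opt_toy.minimize_step(var, float("nan"))
assert (var, opt_toy.iterations) == (3.0, 2)  # var unchanged, count advanced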
- loss = lambda: var * float('NaN') - run_fn = lambda: opt.minimize(loss, [var]) - if use_tf_function: - run_fn = tf.function(run_fn) - run_op = strategy.experimental_run(run_fn) - self._run_if_in_graph_mode(run_op) - self.assertEqual(self.evaluate(var), 3.0) - self.assertEqual(self.evaluate(opt.iterations), 2) - - def testWeightMethods(self): - with self.test_session(): - var = tf.Variable([1.0]) - opt = gradient_descent.SGD(1.0) - opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=2., - dynamic_growth_steps=1) - run_op = opt.minimize(lambda: var * 2, [var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - - self.assertLen(opt.weights, 1) # The 'iterations' weight - self.assertEqual(self.evaluate(opt.weights[0]), 1) - self.assertEqual(opt.get_weights()[0], 1) - self.assertEqual(self.evaluate(opt.variables()[0]), 1) - opt.set_weights([np.array(2.)]) - self.assertEqual(self.evaluate(opt.variables()[0]), 2) - - def testHyperParametersExposed(self): - with self.cached_session(): - opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9) - lso = loss_scale_optimizer.LossScaleOptimizer(opt) - # Force hyperparameters to be created - opt.lr # pylint: disable=pointless-statement - self.evaluate(tf.compat.v1.global_variables_initializer()) - - self.assertEqual(self.evaluate(lso.beta_1), 0.5) - self.assertIsInstance(lso.beta_1, tf.Variable) - self.assertEqual(self.evaluate(lso.lr), 1.0) - self.assertIs(lso.lr, opt.lr) - self.assertIs(lso.lr, lso.learning_rate) - - lso.beta_1 = 0.25 - self.assertEqual(self.evaluate(lso.beta_1), 0.25) - self.assertEqual(self.evaluate(opt.beta_1), 0.25) - self.assertIs(lso.beta_1, opt.beta_1) - opt.beta_1 = 0.75 - self.assertEqual(self.evaluate(lso.beta_1), 0.75) - self.assertEqual(self.evaluate(opt.beta_1), 0.75) - self.assertIs(lso.beta_1, opt.beta_1) - lso.lr = 2.0 - self.assertEqual(self.evaluate(lso.lr), 2.0) - self.assertEqual(self.evaluate(lso.learning_rate), 2.0) - self.assertEqual(self.evaluate(opt.lr), 2.0) - self.assertEqual(self.evaluate(opt.learning_rate), 2.0) - self.assertIs(lso.lr, opt.lr) - - # Test setting attribute that is both attribute on LossScaleOptimizer and - # hyperparameter on wrapped optimizer. - class MyOpt(gradient_descent.SGD): - - def __init__(self): - super().__init__() - self._set_hyper('loss_scale', 123.) - - opt = MyOpt() - lso = loss_scale_optimizer.LossScaleOptimizer(opt) - with self.assertRaises(AttributeError): - lso.loss_scale = 2. - - @test_combinations.generate(opt_combinations_only()) - def testArbitraryAttributesNotExposed(self, opt_cls): - opt = create_sgd(opt_cls) - lso = create_lso(opt) - self.assertFalse(opt.nesterov) - with self.assertRaisesRegex( - AttributeError, - "'LossScaleOptimizer(V3)?' 
object has no attribute 'nesterov'"): - lso.nesterov # pylint: disable=pointless-statement - - lso.nesterov = True - self.assertTrue(lso.nesterov) - self.assertFalse(opt.nesterov) - - def testDir(self): - lso = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD()) - dir_result = dir(lso) - self.assertIn('learning_rate', dir_result) # Hyperparameter - self.assertIn('lr', dir_result) # Hyperparameter - self.assertIn('minimize', dir_result) # Attribute - self.assertIn('loss_scale', dir_result) # Attribute - self.assertNotIn('nesterov', dir_result) # Attribute on inner optimizer - self.assertIn('nesterov', dir(lso.inner_optimizer)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testApplyGradientsGetsUnwrappedTensors(self): - # Tests that gradients passed to apply_gradients are not wrapped in a - # DistributionStrategy wrapper, such as PerReplica, but instead are raw - # Tensors. Optimizer subclasses that override apply_gradients() expect raw - # Tensors, even though the base Optimizer can handle PerReplica gradients. - - outer_self = self - - class MyOptimizer(gradient_descent.SGD): - - def apply_gradients(self, - grads_and_vars, - name=None, - experimental_aggregate_gradients=True): - for grad, _ in grads_and_vars: - outer_self.assertIsInstance(grad, tf.Tensor) - return super().apply_gradients(grads_and_vars, name, - experimental_aggregate_gradients) - - with create_mirrored_strategy().scope() as strategy: - var = tf.Variable([5.0]) - opt = MyOptimizer(learning_rate=1.0) - opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False, - initial_scale=1) - loss = lambda: var * 2.0 - run_fn = lambda: opt.minimize(loss, [var]) - strategy.experimental_run(run_fn) - - @test_combinations.generate( - test_combinations.combine(mode='eager', use_tf_function=[False, True])) - def testApplyGradientsGetsUnwrappedTensorsWithNewOptimizer( - self, use_tf_function): - outer_self = self - - class MyOptimizer(sgd_experimental.SGD): - - def apply_gradients(self, - grads_and_vars, - skip_gradients_aggregation=False): - for grad, _ in grads_and_vars: - outer_self.assertIsInstance(grad, tf.Tensor) - return super().apply_gradients(grads_and_vars, - skip_gradients_aggregation) - - with create_mirrored_strategy().scope() as strategy: - var = tf.Variable([5.0]) - opt = MyOptimizer(learning_rate=1.0) - opt = loss_scale_optimizer.LossScaleOptimizerV3( - opt, dynamic=False, initial_scale=1) - loss = lambda: var * 2.0 - run_fn = lambda: opt.minimize(loss, [var]) - if use_tf_function: - run_fn = tf.function(run_fn) - strategy.experimental_run(run_fn) - - @test_combinations.generate(opt_combinations_only()) - def testLossScaleDelegationWithWrapper(self, opt_cls): - # Test learning_rate is exposed when LossScaleOptimizer wraps another - # wrapper. 
- - class MyOptimizer(opt_cls): - - def __init__(self): - super().__init__('MyOptimizer') - self.inner_optimizer = create_sgd(opt_cls, learning_rate=1.0) - - @property - def learning_rate(self): - return self.inner_optimizer.learning_rate - - @learning_rate.setter - def learning_rate(self, value): - self.inner_optimizer.learning_rate = value - - def get_config(self): - return {} - - with self.cached_session(): - opt = MyOptimizer() - opt = create_lso(opt) - - # Force hyperparameters to be created - opt.learning_rate # pylint: disable=pointless-statement - self.evaluate(tf.compat.v1.global_variables_initializer()) - - self.assertEqual(self.evaluate(opt.learning_rate), 1.0) - self.assertEqual( - self.evaluate(opt.inner_optimizer.inner_optimizer.learning_rate), 1.0) - opt.learning_rate = 2.0 - self.assertEqual(self.evaluate(opt.learning_rate), 2.0) - self.assertEqual(self.evaluate( - opt.inner_optimizer.inner_optimizer.learning_rate), 2.0) - - @test_combinations.generate( - test_combinations.combine( - opt_cls=optimizer_v2.OptimizerV2, - strategy_fn=STRATEGY_FNS, - mode=['graph', 'eager'], - use_tf_function=False, - save_with_ls=[False, True], - restore_with_ls=[False, True]) + test_combinations.combine( - opt_cls=optimizer_experimental.Optimizer, - strategy_fn=STRATEGY_FNS, - mode='eager', - use_tf_function=[False, True], - save_with_ls=[False, True], - restore_with_ls=[False, True])) - def testCheckpoint(self, opt_cls, strategy_fn, use_tf_function, save_with_ls, - restore_with_ls): - - if not save_with_ls and not restore_with_ls: - self.skipTest('Skipping because save_with_ls=False and ' - 'restore_with_ls=False, which means loss scaling is not ' - 'used') - - sgd_cls = type(create_sgd(opt_cls)) - - class MySGD(sgd_cls): - """A custom optimizer that tracks an extra variable.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.my_var = tf.Variable(0.) - self._track_trackable(self.my_var, 'my_var') - - strategy = strategy_fn() - replicas = strategy.num_replicas_in_sync - if (isinstance(strategy, tf.distribute.MirroredStrategy) and - not tf.executing_eagerly()): - # TODO(b/121381184): Enable running the test in this case. - return - - with self.test_session(), strategy.scope(): - # Build and run a simple model. - var = tf.Variable([2.0]) - opt = inner_opt = MySGD(1., momentum=1.) - if save_with_ls: - opt = create_lso(opt, initial_scale=1., dynamic_growth_steps=2.) - run_fn = lambda: opt.minimize(lambda: var / replicas + 1., var_list=[var]) - if use_tf_function: - run_fn = tf.function(run_fn) - opt_op = strategy.experimental_run(run_fn) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(strategy.experimental_local_results(opt_op)) - - # Assert values. - self.assertEqual(self.evaluate(var), 1.) - if save_with_ls: - self.assertEqual(self.evaluate(opt.loss_scale), 1.) + self.assertEqual(self.evaluate(var), [3.0]) self.assertEqual(self.evaluate(opt.dynamic_counter), 1) - if opt_cls == optimizer_v2.OptimizerV2: - slot_var = opt.get_slot(var, 'momentum') - self.assertEqual(self.evaluate(slot_var).item(), -1) - self.assertEqual(self.evaluate(opt.iterations), 1) - - # Set optimizer variable to check arbitrary optimizer attributes can be - # saved/restored - self.evaluate(inner_opt.my_var.assign(1.)) - - # Save a checkpoint. 
- checkpoint = tf.train.Checkpoint(optimizer=opt, var=var) - prefix = os.path.join(self.get_temp_dir(), 'ckpt') - save_path = checkpoint.save(prefix) - - # Create new model - var = tf.Variable([2.0]) - opt = inner_opt = MySGD(1., momentum=1.) - if restore_with_ls: - opt = create_lso(opt, initial_scale=1., dynamic_growth_steps=2.) - - # Restore new model. - checkpoint = tf.train.Checkpoint(optimizer=opt, var=var) - status = checkpoint.restore(save_path) - if save_with_ls: - status.assert_existing_objects_matched() - else: - status.assert_nontrivial_match() - - # Assert restored values. We can only assert in eager mode since the - # variables are uninitialized in graph mode - if tf.executing_eagerly(): - self.assertEqual(self.evaluate(var), 1.) - if save_with_ls and restore_with_ls: - self.assertEqual(self.evaluate(opt.loss_scale), 1.) - self.assertEqual(self.evaluate(opt.dynamic_counter), 1) - elif restore_with_ls: - self.assertEqual(self.evaluate(opt.loss_scale), 1.) - self.assertEqual(self.evaluate(opt.dynamic_counter), 0) - self.assertEqual(self.evaluate(opt.iterations), 1) - - # Run the model again. - run_fn = lambda: opt.minimize(lambda: var / replicas + 1., var_list=[var]) - if use_tf_function: - run_fn = tf.function(run_fn) - opt_op = strategy.experimental_run(run_fn) - - # Assert new values. - self.evaluate(tf.compat.v1.global_variables_initializer()) - status.run_restore_ops() - self.evaluate(strategy.experimental_local_results(opt_op)) - self.assertEqual(self.evaluate(var), -1) - if opt_cls == optimizer_v2.OptimizerV2: - slot_var = opt.get_slot(var, 'momentum') - self.assertEqual(self.evaluate(slot_var).item(), -2) - self.assertEqual(self.evaluate(opt.iterations), 2) - self.assertEqual(self.evaluate(inner_opt.my_var), 1) - - # Restore model again to test restoring after slots are created - status = checkpoint.restore(save_path) - if save_with_ls and restore_with_ls: - status.assert_consumed() - elif save_with_ls: - status.assert_existing_objects_matched() - elif restore_with_ls: - status.assert_nontrivial_match() - status.run_restore_ops() - self.assertEqual(self.evaluate(var), 1) - if opt_cls == optimizer_v2.OptimizerV2: - self.assertEqual(self.evaluate(slot_var).item(), -1) - - @test_combinations.generate( - test_combinations.combine(config_version=['v2', 'tf2_3']) + - test_combinations.combine(config_version='v3', mode='eager')) - def testGetConfigFixed(self, config_version): - # Get a config from LossScaleOptimizer, LossScaleOptimizerV3, or the - # LossScaleOptimizer from TF 2.3. 
Then restore the config into a - # LossScaleOptimizer or LossScaleOptimizerV3 - if config_version == 'v2': - opt = gradient_descent.SGD(2., momentum=0.5) - opt = loss_scale_optimizer.LossScaleOptimizer( - opt, dynamic=False, initial_scale=2) - config = opt.get_config() - opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) - elif config_version == 'v3': - opt = sgd_experimental.SGD(2., momentum=0.5) - opt = loss_scale_optimizer.LossScaleOptimizerV3( - opt, dynamic=False, initial_scale=2) - config = opt.get_config() - opt = loss_scale_optimizer.LossScaleOptimizerV3.from_config(config) - else: - self.assertEqual(config_version, 'tf2_3') - config = { - 'optimizer': { - 'class_name': 'SGD', - 'config': { - 'learning_rate': 2.0, - 'momentum': 0.5, - 'decay': 0.0, - 'nesterov': False, - 'name': 'SGD', - } - }, - 'loss_scale': { - 'class_name': 'FixedLossScale', - 'config': {'loss_scale_value': 2.0} - }, - } - opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) - - # Force hyperparameters to be created - opt.learning_rate # pylint: disable=pointless-statement - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Test attributes on the optimizer - self.assertEqual(self.evaluate(opt.learning_rate), 2.) - self.assertEqual(self.evaluate(opt.inner_optimizer.learning_rate), 2.) - self.assertEqual(self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5) - self.assertEqual(self.evaluate(opt.loss_scale), 2.) - self.assertEqual(opt.initial_scale, 2.) - self.assertIsNone(opt.dynamic_growth_steps) - self.assertIsNone(opt.dynamic_counter) - self.assertFalse(opt.dynamic) - - # Ensure the optimizer can be used - var = tf.Variable([5.0]) - run_op = self._run_fn_with_grad_check( - tf.distribute.get_strategy(), var, opt, 2)() - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - self.assertEqual(self.evaluate(var), [3.]) - - @test_combinations.generate( - test_combinations.combine(config_version=['v2', 'tf2_3']) + - test_combinations.combine(config_version='v3', mode='eager')) - def testGetConfigDynamic(self, config_version): - # Get a config from LossScaleOptimizer, LossScaleOptimizerV3, or the - # LossScaleOptimizer from TF 2.3. 
Then restore the config into a - # LossScaleOptimizer or LossScaleOptimizerV3 - if config_version == 'v2': - opt = gradient_descent.SGD(2., momentum=0.5) - opt = loss_scale_optimizer.LossScaleOptimizer( - opt, initial_scale=2, dynamic_growth_steps=3) - config = opt.get_config() - opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) - elif config_version == 'v3': - opt = sgd_experimental.SGD(2., momentum=0.5) - opt = loss_scale_optimizer.LossScaleOptimizerV3( - opt, initial_scale=2, dynamic_growth_steps=3) - config = opt.get_config() - opt = loss_scale_optimizer.LossScaleOptimizerV3.from_config(config) - else: - self.assertEqual(config_version, 'tf2_3') - config = { - 'optimizer': { - 'class_name': 'SGD', - 'config': { - 'learning_rate': 2.0, - 'momentum': 0.5, - 'decay': 0.0, - 'nesterov': False, - 'name': 'SGD', - } - }, - 'loss_scale': { - 'class_name': 'DynamicLossScale', - 'config': { - 'initial_loss_scale': 2.0, - 'increment_period': 3, - 'multiplier': 2.0, - } - }, - } - opt = loss_scale_optimizer.LossScaleOptimizer.from_config(config) - - # Force hyperparameters to be created - opt.learning_rate # pylint: disable=pointless-statement - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Test attributes on the optimizer - self.assertEqual(self.evaluate(opt.learning_rate), 2.) - self.assertEqual(self.evaluate(opt.inner_optimizer.learning_rate), 2.) - self.assertEqual(self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5) - self.assertEqual(self.evaluate(opt.loss_scale), 2.) - self.assertEqual(opt.initial_scale, 2.) - self.assertEqual(opt.dynamic_growth_steps, 3.) - self.assertTrue(opt.dynamic) - - # Ensure the optimizer can be used - var = tf.Variable([5.0]) - run_op = self._run_fn_with_grad_check( - tf.distribute.get_strategy(), var, opt, 2)() - self.evaluate(tf.compat.v1.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - self.assertEqual(self.evaluate(var), [3.]) - self.assertEqual(self.evaluate(opt.dynamic_counter), 1) - - def test_from_config_with_invalid_multiplier(self): - config = { - 'optimizer': { - 'class_name': 'SGD', - 'config': { - 'learning_rate': 2.0, - 'momentum': 0.5, - 'decay': 0.0, - 'nesterov': False, - 'name': 'SGD', - } - }, - 'loss_scale': { - 'class_name': 'DynamicLossScale', - 'config': { - 'initial_loss_scale': 2.0, - 'increment_period': 3, - 'multiplier': 4.0, - } - }, - } - - expected_error = ('Cannot deserialize LossScaleOptimizer with a ' - 'DynamicLossScale whose multiplier is not 2. Got ' - 'DynamicLossScale: DynamicLossScale\\(') - with self.assertRaisesRegex(ValueError, expected_error): - loss_scale_optimizer.LossScaleOptimizer.from_config(config) - - @test_combinations.generate( - test_combinations.combine(lso_type=['v1', 'v2']) + - test_combinations.combine(lso_type='v3', mode='eager')) - def testSerializationWithBuiltInOptimizer(self, lso_type): - if lso_type in ('v1', 'v2'): - opt = gradient_descent.SGD(2., momentum=0.5) - opt = loss_scale_optimizer.LossScaleOptimizer( - opt, initial_scale=2., dynamic_growth_steps=3.) - config = optimizers.serialize(opt) - if lso_type == 'v1': - # LossScaleOptimizerV1 was an older experimental version of LSO that is - # now deleted. The config had the same format as LSO but the class - # name was different. 
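# The tf2_3-format configs in these tests carry a nested 'loss_scale' dict;
# a sketch of the mapping from_config has to perform for the dynamic case.
# Field names are taken from the config literals above; the function itself
# is illustrative, not the Keras implementation:
def convert_tf2_3_loss_scale(loss_scale_config):
    assert loss_scale_config["class_name"] == "DynamicLossScale"
    cfg = loss_scale_config["config"]
    # A multiplier other than 2 cannot be expressed and is rejected, which
    # is what test_from_config_with_invalid_multiplier verifies.
    if cfg["multiplier"] != 2.0:
        raise ValueError("multiplier must be 2")
    return {
        "dynamic": True,
        "initial_scale": cfg["initial_loss_scale"],
        "dynamic_growth_steps": cfg["increment_period"],
    }


kwargs = convert_tf2_3_loss_scale({
    "class_name": "DynamicLossScale",
    "config": {
        "initial_loss_scale": 2.0,
        "increment_period": 3,
        "multiplier": 2.0,
    },
})
assert kwargs["initial_scale"] == 2.0 and kwargs["dynamic_growth_steps"] == 3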
This tests that LSO V1 configs can still be
-        # deserialized, which are deserialized as a (non-V1) LSO
-        config['class_name'] = 'LossScaleOptimizerV1'
-    else:
-      opt = sgd_experimental.SGD(2., momentum=0.5)
-      opt = loss_scale_optimizer.LossScaleOptimizerV3(
-          opt, initial_scale=2., dynamic_growth_steps=3)
-      config = optimizers.serialize(opt)
-    opt = optimizers.deserialize(config)
-    # Force hyperparameters to be created
-    opt.learning_rate  # pylint: disable=pointless-statement
-    self.evaluate(tf.compat.v1.global_variables_initializer())
-
-    self.assertEqual(self.evaluate(opt.learning_rate), 2.)
-    self.assertEqual(self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5)
-    self.assertEqual(self.evaluate(opt.loss_scale), 2.)
-    self.assertEqual(opt.dynamic_growth_steps, 3.)
-    self.assertTrue(opt.dynamic)
-    if lso_type in ('v1', 'v2'):
-      self.assertEqual(type(opt), loss_scale_optimizer.LossScaleOptimizer)
-    else:
-      self.assertEqual(type(opt), loss_scale_optimizer.LossScaleOptimizerV3)
-
-    # Ensure the optimizer can be used
-    var = tf.Variable([5.0])
-    run_op = self._run_fn_with_grad_check(
-        tf.distribute.get_strategy(), var, opt, 2)()
-    self.evaluate(tf.compat.v1.global_variables_initializer())
-    self._run_if_in_graph_mode(run_op)
-    self.assertEqual(self.evaluate(var), [3.])
-    self.assertEqual(self.evaluate(opt.dynamic_counter), 1)
-
-  @test_combinations.generate(opt_combinations_only())
-  def testSerializationWithCustomOptimizer(self, opt_cls):
-    sgd_cls = type(create_sgd(opt_cls))
-
-    class MySGD(sgd_cls):
-
-      def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.my_attribute = 123
-
-    opt = MySGD(2., momentum=0.5)
-    opt = create_lso(opt, initial_scale=2., dynamic_growth_steps=3.)
-    config = optimizers.serialize(opt)
-    custom_objects = {'MySGD': MySGD}
-    opt = optimizers.deserialize(config, custom_objects=custom_objects)
-    # Force hyperparameters to be created
-    opt.learning_rate  # pylint: disable=pointless-statement
-    self.evaluate(tf.compat.v1.global_variables_initializer())
-
-    self.assertEqual(self.evaluate(opt.learning_rate), 2.)
-    self.assertEqual(self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5)
-    self.assertEqual(self.evaluate(opt.loss_scale), 2.)
-    self.assertEqual(opt.dynamic_growth_steps, 3.)
-    self.assertEqual(opt.inner_optimizer.my_attribute, 123)
-
-  @test_combinations.generate(opt_combinations_only())
-  def testUnsupportedStrategy(self, opt_cls):
-    strategy = tf.distribute.experimental.CentralStorageStrategy()
-    expected_error = (
-        'Loss scaling is not supported with the tf.distribute.Strategy: '
-        'CentralStorageStrategy. Try using a different Strategy, e.g. a '
-        'MirroredStrategy')
-    with strategy.scope(), self.assertRaisesRegex(ValueError, expected_error):
-      create_lso(create_sgd(opt_cls))
-    opt = create_lso(create_sgd(opt_cls))
-    with strategy.scope():
-      var = tf.Variable(1.0)
-      loss = lambda: var * 2.0
-      run_fn = lambda: opt.minimize(loss, [var])
-      with self.assertRaisesRegex(ValueError, expected_error):
-        strategy.experimental_run(run_fn)
-
-  @test_combinations.generate(opt_combinations_only())
-  def testInvalidArgsWithFixedLossScale(self, opt_cls):
-    opt = create_sgd(opt_cls)
-    with self.assertRaisesRegex(
-        ValueError, '"initial_scale" must be specified if "dynamic" is False'):
-      create_lso(opt, dynamic=False)
-    opt = create_sgd(opt_cls)
-    with self.assertRaisesRegex(
-        ValueError, '"dynamic_growth_steps" must be None if "dynamic" is '
-        'False, but got: 2'):
-      create_lso(opt, dynamic=False, initial_scale=1, dynamic_growth_steps=2)
-
-  @test_combinations.generate(opt_combinations_only())
-  def testDynamicMustBeBool(self, opt_cls):
-    opt = create_sgd(opt_cls)
-    with self.assertRaisesRegex(
-        TypeError, '"dynamic" argument to LossScaleOptimizer.__init__ must be '
-        "a bool, but got: 'dynamic'"):
-      create_lso(opt, 'dynamic')
-
-  @test_combinations.generate(opt_combinations_only())
-  def testScalingWarning(self, opt_cls):
-    var = tf.Variable(1.0)
-    lso = create_lso(create_sgd(opt_cls))
-    with mock.patch.object(tf_logging, 'warning') as mock_warn:
-      lso.apply_gradients([(tf.constant(1.0), var)])
-      self.assertIn(
-          'You forgot to call LossScaleOptimizer.get_scaled_loss() and '
-          'LossScaleOptimizer.get_unscaled_gradients() before',
-          mock_warn.call_args_list[0][0][0])
-    lso = create_lso(create_sgd(opt_cls))
-    with mock.patch.object(tf_logging, 'warning') as mock_warn:
-      lso.get_scaled_loss(tf.constant(1.0))
-      lso.apply_gradients([(tf.constant(1.0), var)])
-      self.assertIn(
-          'You forgot to call LossScaleOptimizer.get_unscaled_gradients() '
-          'before',
-          mock_warn.call_args_list[0][0][0])
-    lso = create_lso(create_sgd(opt_cls))
-    with mock.patch.object(tf_logging, 'warning') as mock_warn:
-      lso.get_unscaled_gradients([tf.constant(1.0)])
-      lso.apply_gradients([(tf.constant(1.0), var)])
-      self.assertIn(
-          'You forgot to call LossScaleOptimizer.get_scaled_loss() before',
-          mock_warn.call_args_list[0][0][0])
-    lso = create_lso(create_sgd(opt_cls))
-    with mock.patch.object(tf_logging, 'warning') as mock_warn:
-      lso.get_scaled_loss(tf.constant(1.0))
-      lso.get_unscaled_gradients([tf.constant(1.0)])
-      lso.apply_gradients([(tf.constant(1.0), var)])
-      mock_warn.assert_not_called()
-
-  @test_combinations.generate(opt_combinations_only())
-  def testErrorWhenNesting(self, opt_cls):
-    opt = create_sgd(opt_cls)
-    opt = create_lso(opt)
-    with self.assertRaisesRegex(
-        TypeError, 'LossScaleOptimizer cannot wrap another LossScaleOptimizer'):
-      create_lso(opt)
-
-  @test_combinations.generate(opt_combinations_only())
-  def testErrorWrappingSameOptimizerMultipleTimes(self, opt_cls):
-    inner_opt = create_sgd(opt_cls)
-    create_lso(inner_opt)
-    with self.assertRaisesRegex(
-        ValueError,
-        '"inner_optimizer" is already wrapped by a LossScaleOptimizer.'):
-      create_lso(inner_opt)
-
-  def testErrorWhenWrappingNonOptimizer(self):
-    with self.assertRaisesRegex(
-        TypeError,
-        '"inner_optimizer" must be an instance of '
-        '`tf.keras.optimizers.Optimizer` or '
-        '`tf.keras.optimizers.experimental.Optimizer`, but got: 1'):
-      loss_scale_optimizer.BaseLossScaleOptimizer(1)
-
-  def testErrorWhenWrappingLegacyKerasOptimizers(self):
-    sgd = legacy_sgd.SGD()
-    with self.assertRaisesRegex(
-        TypeError, 'not an instance of `tensorflow.python.keras.optimizers`'):
-      loss_scale_optimizer.BaseLossScaleOptimizer(sgd)
-
-  def testErrorWhenV3LsoWrapsV2Optimizer(self):
-    sgd = gradient_descent.SGD()
-    with self.assertRaisesRegex(
-        TypeError, 'only the new experimental optimizer '
-        'defined in keras/optimizer_expeirmental/optimizer.py can be '
-        'passed'):
-      loss_scale_optimizer.LossScaleOptimizerV3(sgd)
-
-  def testErrorWhenV2LsoWrapsV3Optimizer(self):
-    sgd = sgd_experimental.SGD()
-    with self.assertRaisesRegex(
-        TypeError, 'only the classic optimizers subclassing from '
-        '`tf.keras.optimizers.Optimizer` can be passed'):
-      loss_scale_optimizer.LossScaleOptimizer(sgd)
-
-
-if __name__ == '__main__':
-  tf.test.main()
+
+    @test_combinations.generate(opt_combinations_only())
+    def testSerializationWithCustomOptimizer(self, opt_cls):
+        sgd_cls = type(create_sgd(opt_cls))
+
+        class MySGD(sgd_cls):
+            def __init__(self, *args, **kwargs):
+                super().__init__(*args, **kwargs)
+                self.my_attribute = 123
+
+        opt = MySGD(2.0, momentum=0.5)
+        opt = create_lso(opt, initial_scale=2.0, dynamic_growth_steps=3.0)
+        config = optimizers.serialize(opt)
+        custom_objects = {"MySGD": MySGD}
+        opt = optimizers.deserialize(config, custom_objects=custom_objects)
+        # Force hyperparameters to be created
+        opt.learning_rate
+        self.evaluate(tf.compat.v1.global_variables_initializer())
+
+        self.assertEqual(self.evaluate(opt.learning_rate), 2.0)
+        self.assertEqual(
+            self._eval_if_tensor(opt.inner_optimizer.momentum), 0.5
+        )
+        self.assertEqual(self.evaluate(opt.loss_scale), 2.0)
+        self.assertEqual(opt.dynamic_growth_steps, 3.0)
+        self.assertEqual(opt.inner_optimizer.my_attribute, 123)
+
+    @test_utils.run_v2_only
+    def testConvertToLegacyOptimizer(self):
+        opt = sgd_experimental.SGD(1.0)
+        opt = loss_scale_optimizer.BaseLossScaleOptimizer(opt)
+        converted_opt = optimizers.convert_to_legacy_optimizer(opt)
+        self.assertEqual(
+            type(converted_opt), loss_scale_optimizer.LossScaleOptimizer
+        )
+
+        reference_opt = gradient_descent.SGD(1.0)
+        reference_opt = loss_scale_optimizer.BaseLossScaleOptimizer(
+            reference_opt
+        )
+        self.assertEqual(converted_opt.get_config(), reference_opt.get_config())
+
+        # Test with a custom learning rate schedule
+        class CustomLRSchedule(learning_rate_schedule.LearningRateSchedule):
+            def __init__(self, initial_learning_rate):
+                self.initial_learning_rate = initial_learning_rate
+
+            def __call__(self, step):
+                step = tf.cast(step, tf.float32)
+                return self.initial_learning_rate / (step + 1)
+
+            def get_config(self):
+                return {"initial_learning_rate": self.initial_learning_rate}
+
+        opt = sgd_experimental.SGD(CustomLRSchedule(1.0))
+        opt = loss_scale_optimizer.BaseLossScaleOptimizer(opt)
+        converted_opt = optimizers.convert_to_legacy_optimizer(opt)
+        self.assertEqual(
+            type(converted_opt), loss_scale_optimizer.LossScaleOptimizer
+        )
+
+        reference_opt = gradient_descent.SGD(CustomLRSchedule(1.0))
+        reference_opt = loss_scale_optimizer.BaseLossScaleOptimizer(
+            reference_opt
+        )
+        self.assertEqual(converted_opt.get_config(), reference_opt.get_config())
+
+    @test_combinations.generate(opt_combinations_only())
+    def testUnsupportedStrategy(self, opt_cls):
+        strategy = tf.distribute.experimental.CentralStorageStrategy()
+        expected_error = (
+            "Loss scaling is not supported with the tf.distribute.Strategy: "
+            "CentralStorageStrategy. Try using a different Strategy, e.g. a "
+            "MirroredStrategy"
+        )
+        with strategy.scope(), self.assertRaisesRegex(
+            ValueError, expected_error
+        ):
+            create_lso(create_sgd(opt_cls))
+        opt = create_lso(create_sgd(opt_cls))
+        with strategy.scope():
+            var = tf.Variable(1.0)
+            loss = lambda: var * 2.0
+            run_fn = lambda: opt.minimize(loss, [var])
+            with self.assertRaisesRegex(ValueError, expected_error):
+                strategy.experimental_run(run_fn)
+
+    @test_combinations.generate(opt_combinations_only())
+    def testInvalidArgsWithFixedLossScale(self, opt_cls):
+        opt = create_sgd(opt_cls)
+        with self.assertRaisesRegex(
+            ValueError,
+            '"initial_scale" must be specified if "dynamic" is False',
+        ):
+            create_lso(opt, dynamic=False)
+        opt = create_sgd(opt_cls)
+        with self.assertRaisesRegex(
+            ValueError,
+            '"dynamic_growth_steps" must be None if "dynamic" is '
+            "False, but got: 2",
+        ):
+            create_lso(
+                opt, dynamic=False, initial_scale=1, dynamic_growth_steps=2
+            )
+
+    @test_combinations.generate(opt_combinations_only())
+    def testDynamicMustBeBool(self, opt_cls):
+        opt = create_sgd(opt_cls)
+        with self.assertRaisesRegex(
+            TypeError,
+            '"dynamic" argument to LossScaleOptimizer.__init__ must be '
+            "a bool, but got: 'dynamic'",
+        ):
+            create_lso(opt, "dynamic")
+
+    @test_combinations.generate(opt_combinations_only())
+    def testScalingWarning(self, opt_cls):
+        var = tf.Variable(1.0)
+        lso = create_lso(create_sgd(opt_cls))
+        with mock.patch.object(tf_logging, "warning") as mock_warn:
+            lso.apply_gradients([(tf.constant(1.0), var)])
+            self.assertIn(
+                "You forgot to call LossScaleOptimizer.get_scaled_loss() and "
+                "LossScaleOptimizer.get_unscaled_gradients() before",
+                mock_warn.call_args_list[0][0][0],
+            )
+        lso = create_lso(create_sgd(opt_cls))
+        with mock.patch.object(tf_logging, "warning") as mock_warn:
+            lso.get_scaled_loss(tf.constant(1.0))
+            lso.apply_gradients([(tf.constant(1.0), var)])
+            self.assertIn(
+                "You forgot to call "
+                "LossScaleOptimizer.get_unscaled_gradients() before",
+                mock_warn.call_args_list[0][0][0],
+            )
+        lso = create_lso(create_sgd(opt_cls))
+        with mock.patch.object(tf_logging, "warning") as mock_warn:
+            lso.get_unscaled_gradients([tf.constant(1.0)])
+            lso.apply_gradients([(tf.constant(1.0), var)])
+            self.assertIn(
+                "You forgot to call LossScaleOptimizer.get_scaled_loss() "
+                "before",
+                mock_warn.call_args_list[0][0][0],
+            )
+
+    @test_combinations.generate(opt_combinations_only())
+    def testScalingNoWarning(self, opt_cls):
+        var = tf.Variable(1.0)
+        lso = create_lso(create_sgd(opt_cls))
+        with mock.patch.object(tf_logging, "warning") as mock_warn:
+            lso.get_scaled_loss(tf.constant(1.0))
+            lso.get_unscaled_gradients([tf.constant(1.0)])
+            lso.apply_gradients([(tf.constant(1.0), var)])
+            mock_warn.assert_not_called()
+
+    @test_combinations.generate(opt_combinations_only())
+    def testErrorWhenNesting(self, opt_cls):
+        opt = create_sgd(opt_cls)
+        opt = create_lso(opt)
+        with self.assertRaisesRegex(
+            TypeError,
+            "LossScaleOptimizer cannot wrap another LossScaleOptimizer",
+        ):
+            create_lso(opt)
+
+    @test_combinations.generate(opt_combinations_only())
+    def testErrorWrappingSameOptimizerMultipleTimes(self, opt_cls):
+        inner_opt = create_sgd(opt_cls)
+        create_lso(inner_opt)
+        with self.assertRaisesRegex(
+            ValueError,
+            '"inner_optimizer" is already wrapped by a LossScaleOptimizer.',
+        ):
+            create_lso(inner_opt)
+
+    def testErrorWhenWrappingNonOptimizer(self):
+        with self.assertRaisesRegex(
+            TypeError,
+            '"inner_optimizer" must be an instance of '
+            "`tf.keras.optimizers.Optimizer` or "
"`tf.keras.optimizers.experimental.Optimizer`, but got: 1", + ): + loss_scale_optimizer.BaseLossScaleOptimizer(1) + + def testErrorWhenV3LsoWrapsV2Optimizer(self): + sgd = gradient_descent.SGD() + with self.assertRaisesRegex( + TypeError, + "only the new experimental optimizer " + "defined in keras/optimizer_expeirmental/optimizer.py can be " + "passed", + ): + loss_scale_optimizer.LossScaleOptimizerV3(sgd) + + def testErrorWhenV2LsoWrapsV3Optimizer(self): + sgd = sgd_experimental.SGD() + with self.assertRaisesRegex( + TypeError, + "only the classic optimizers subclassing from " + "`tf.keras.optimizers.Optimizer` can be passed", + ): + loss_scale_optimizer.LossScaleOptimizer(sgd) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/mixed_precision/mixed_precision_graph_rewrite_test.py b/keras/mixed_precision/mixed_precision_graph_rewrite_test.py index 8e36245621cf..6f8523393475 100644 --- a/keras/mixed_precision/mixed_precision_graph_rewrite_test.py +++ b/keras/mixed_precision/mixed_precision_graph_rewrite_test.py @@ -14,138 +14,167 @@ # ============================================================================== """Tests Keras integration with enable_mixed_precision_graph_rewrite().""" +import os + import tensorflow.compat.v2 as tf -import os +from keras.mixed_precision import ( + loss_scale_optimizer as loss_scale_optimizer_v2, +) +from keras.mixed_precision import policy +from keras.optimizers.legacy import gradient_descent as gradient_descent_v2 from keras.testing_infra import test_combinations from keras.testing_infra import test_utils -from keras.mixed_precision import loss_scale_optimizer as loss_scale_optimizer_v2 -from keras.mixed_precision import policy -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_v2 class MixedPrecisionTest(test_combinations.TestCase): - IGNORE_PERF_VAR = 'TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE' - - def setUp(self): - super().setUp() - # Enable the tests to be run on pre-Volta GPUs by telling the grappler pass - # to ignore performance and always transform the graph. - self._original_ignore_perf_value = os.getenv(self.IGNORE_PERF_VAR) - os.environ[self.IGNORE_PERF_VAR] = '1' - - def tearDown(self): - # Set the IGNORE_PERF_VAR variable back to it's original value. - if self._original_ignore_perf_value is not None: - os.environ[self.IGNORE_PERF_VAR] = self._original_ignore_perf_value - else: - del os.environ[self.IGNORE_PERF_VAR] - - tf.compat.v1.mixed_precision.disable_mixed_precision_graph_rewrite() - super().tearDown() - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_wrap_optimizer_fixed_loss_scale(self): - opt = gradient_descent_v2.SGD(1.0) - opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - opt, 123) - self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(opt.loss_scale), 123.) - self.assertFalse(opt.dynamic) - self.assertTrue(opt.initial_scale, 123.) - - opt = gradient_descent_v2.SGD(1.0) - opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - opt, tf.compat.v1.mixed_precision.FixedLossScale(123)) - self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(opt.loss_scale), 123.) - self.assertFalse(opt.dynamic) - self.assertTrue(opt.initial_scale, 123.) 
- - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_wrap_optimizer_dynamic_loss_scale(self): - opt = gradient_descent_v2.SGD(1.0) - opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - opt, 'dynamic') - self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(opt.loss_scale), 2. ** 15) - self.assertTrue(opt.dynamic) - self.assertTrue(opt.initial_scale, 2. ** 15) - self.assertTrue(opt.dynamic_growth_steps, 2000) - - opt = gradient_descent_v2.SGD(1.0) - opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - opt, tf.compat.v1.mixed_precision.DynamicLossScale( - initial_loss_scale=4, increment_period=1000)) - self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(opt.loss_scale), 4.) - self.assertTrue(opt.dynamic) - self.assertTrue(opt.initial_scale, 4.) - self.assertTrue(opt.dynamic_growth_steps, 1000) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_wrap_optimizer_dynamic_loss_scale_errors(self): - - opt = gradient_descent_v2.SGD(1.0) - with self.assertRaisesRegex( - ValueError, 'When passing a DynamicLossScale to "loss_scale", ' - 'DynamicLossScale.multiplier must be 2. Got: ' - 'DynamicLossScale'): - tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - opt, tf.compat.v1.mixed_precision.DynamicLossScale(multiplier=4.)) - - class MyLossScale(tf.compat.v1.mixed_precision.LossScale): - - def __call__(self): - return 1. - - def update(self, grads): - return None, True - - def get_config(self): - return {} - - with self.assertRaisesRegex( - TypeError, 'Passing a LossScale that is not a FixedLossScale or a ' - 'DynamicLossScale is not supported. Got:'): - tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - opt, MyLossScale()) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_optimizer_errors(self): - opt = gradient_descent_v2.SGD(1.0) - opt = loss_scale_optimizer_v2.LossScaleOptimizer(opt) - with self.assertRaisesRegex( - ValueError, '"opt" must not already be an instance of a ' - 'LossScaleOptimizer.'): - tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(opt) - self.assertFalse(tf.config.optimizer.get_experimental_options() - .get('auto_mixed_precision', False)) - - @test_utils.enable_v2_dtype_behavior - def test_error_if_policy_is_set(self): - with policy.policy_scope('mixed_float16'): - with self.assertRaisesRegex(ValueError, - 'the global Keras dtype Policy has been set'): + IGNORE_PERF_VAR = "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE" + + def setUp(self): + super().setUp() + # Enable the tests to be run on pre-Volta GPUs by telling the grappler + # pass to ignore performance and always transform the graph. + self._original_ignore_perf_value = os.getenv(self.IGNORE_PERF_VAR) + os.environ[self.IGNORE_PERF_VAR] = "1" + + def tearDown(self): + # Set the IGNORE_PERF_VAR variable back to its original value. 
+ if self._original_ignore_perf_value is not None: + os.environ[self.IGNORE_PERF_VAR] = self._original_ignore_perf_value + else: + del os.environ[self.IGNORE_PERF_VAR] + + tf.compat.v1.mixed_precision.disable_mixed_precision_graph_rewrite() + super().tearDown() + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_wrap_optimizer_fixed_loss_scale(self): + opt = gradient_descent_v2.SGD(1.0) + opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + opt, 123 + ) + self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(opt.loss_scale), 123.0) + self.assertFalse(opt.dynamic) + self.assertTrue(opt.initial_scale, 123.0) + + opt = gradient_descent_v2.SGD(1.0) + opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + opt, tf.compat.v1.mixed_precision.FixedLossScale(123) + ) + self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(opt.loss_scale), 123.0) + self.assertFalse(opt.dynamic) + self.assertTrue(opt.initial_scale, 123.0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_wrap_optimizer_dynamic_loss_scale(self): + opt = gradient_descent_v2.SGD(1.0) + opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + opt, "dynamic" + ) + self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(opt.loss_scale), 2.0**15) + self.assertTrue(opt.dynamic) + self.assertTrue(opt.initial_scale, 2.0**15) + self.assertTrue(opt.dynamic_growth_steps, 2000) + + opt = gradient_descent_v2.SGD(1.0) + opt = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + opt, + tf.compat.v1.mixed_precision.DynamicLossScale( + initial_loss_scale=4, increment_period=1000 + ), + ) + self.assertIsInstance(opt, loss_scale_optimizer_v2.LossScaleOptimizer) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(opt.loss_scale), 4.0) + self.assertTrue(opt.dynamic) + self.assertTrue(opt.initial_scale, 4.0) + self.assertTrue(opt.dynamic_growth_steps, 1000) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_wrap_optimizer_dynamic_loss_scale_errors(self): + + opt = gradient_descent_v2.SGD(1.0) + with self.assertRaisesRegex( + ValueError, + 'When passing a DynamicLossScale to "loss_scale", ' + "DynamicLossScale.multiplier must be 2. Got: " + "DynamicLossScale", + ): + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + opt, + tf.compat.v1.mixed_precision.DynamicLossScale(multiplier=4.0), + ) + + class MyLossScale(tf.compat.v1.mixed_precision.LossScale): + def __call__(self): + return 1.0 + + def update(self, grads): + return None, True + + def get_config(self): + return {} + + with self.assertRaisesRegex( + TypeError, + "Passing a LossScale that is not a FixedLossScale or a " + "DynamicLossScale is not supported. 
Got:", + ): + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + opt, MyLossScale() + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_optimizer_errors(self): + opt = gradient_descent_v2.SGD(1.0) + opt = loss_scale_optimizer_v2.LossScaleOptimizer(opt) + with self.assertRaisesRegex( + ValueError, + '"opt" must not already be an instance of a LossScaleOptimizer.', + ): + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + opt + ) + self.assertFalse( + tf.config.optimizer.get_experimental_options().get( + "auto_mixed_precision", False + ) + ) + + @test_utils.enable_v2_dtype_behavior + def test_error_if_policy_is_set(self): + with policy.policy_scope("mixed_float16"): + with self.assertRaisesRegex( + ValueError, "the global Keras dtype Policy has been set" + ): + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( # noqa: E501 + gradient_descent_v2.SGD(1.0) + ) + # Test no error is thrown when the policy is currently the default. tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - gradient_descent_v2.SGD(1.0)) - # Test no error is thrown when the policy is currently the default. - tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - gradient_descent_v2.SGD(1.0)) - # Test no error is thrown when the policy is a non-mixed policy. - with policy.policy_scope('float64'): - tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - gradient_descent_v2.SGD(1.0)) - - -if __name__ == '__main__': - tf.test.main() + gradient_descent_v2.SGD(1.0) + ) + # Test no error is thrown when the policy is a non-mixed policy. + with policy.policy_scope("float64"): + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + gradient_descent_v2.SGD(1.0) + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/mixed_precision/model_test.py b/keras/mixed_precision/model_test.py index 86c8187ec0ca..0663d589f336 100644 --- a/keras/mixed_precision/model_test.py +++ b/keras/mixed_precision/model_test.py @@ -14,19 +14,16 @@ # ============================================================================== """Tests keras.Model works properly with mixed precision.""" -import tensorflow.compat.v2 as tf - import os +import numpy as np +import tensorflow.compat.v2 as tf from absl import flags from absl.testing import parameterized -import numpy as np + from keras import backend -from keras.testing_infra import test_combinations from keras import layers from keras import models -from keras.optimizers import optimizer_v1 -from keras.testing_infra import test_utils from keras.applications import densenet from keras.applications import efficientnet from keras.applications import inception_resnet_v2 @@ -43,10 +40,14 @@ from keras.mixed_precision import loss_scale_optimizer from keras.mixed_precision import policy from keras.mixed_precision import test_util as mp_test_util -from keras.optimizers.optimizer_v2 import gradient_descent -from keras.saving import save -from keras.utils import generic_utils - +from keras.optimizers import optimizer_v1 +from keras.optimizers import sgd +from keras.optimizers.legacy import gradient_descent +from keras.saving import object_registration +from keras.saving.legacy import save +from keras.saving.serialization_lib import SafeModeScope +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils # If called outside any strategy.scope() calls, this will return the default # strategy. 
@@ -54,779 +55,989 @@ def create_mirrored_strategy(): - """Create a MirroredStrategy, using a GPU if it is available.""" - if tf.config.list_logical_devices('GPU'): - return tf.distribute.MirroredStrategy(['cpu:0', 'gpu:0']) - else: - return tf.distribute.MirroredStrategy(['cpu:0']) + """Create a MirroredStrategy, using a GPU if it is available.""" + if tf.config.list_logical_devices("GPU"): + return tf.distribute.MirroredStrategy(["cpu:0", "gpu:0"]) + else: + return tf.distribute.MirroredStrategy(["cpu:0"]) -TESTCASES = ({ - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn -}, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy -}) +TESTCASES = ( + {"testcase_name": "base", "strategy_fn": default_strategy_fn}, + {"testcase_name": "distribute", "strategy_fn": create_mirrored_strategy}, +) class KerasModelTest(test_combinations.TestCase): - """Test mixed precision with Keras models.""" - - def _skip_if_strategy_unsupported(self, strategy_fn): - if (strategy_fn != default_strategy_fn and - test_utils.get_model_type() == 'subclass'): - self.skipTest('Non-default strategies are unsupported with subclassed ' - 'models') - - def _skip_if_save_format_unsupported(self, save_format): - model_type = test_utils.get_model_type() - if save_format == 'h5' and model_type == 'subclass': - self.skipTest('Saving subclassed models with the HDF5 format is ' - 'unsupported') - if (save_format == 'tf' and model_type == 'subclass' and - not tf.executing_eagerly()): - self.skipTest('b/148820505: This combination of features is currently ' - 'broken.') - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - { - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn - }, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy, - }, { - 'testcase_name': 'operator', - 'strategy_fn': create_mirrored_strategy, - 'use_operator': True - }, { - 'testcase_name': 'regularizer', - 'strategy_fn': create_mirrored_strategy, - 'use_regularizer': True - }, { - 'testcase_name': 'get_config', - 'strategy_fn': create_mirrored_strategy, - 'get_config': True, - 'use_regularizer': True, - }, { - 'testcase_name': 'saved_model', - 'strategy_fn': default_strategy_fn, - 'save_format': 'tf', - 'use_regularizer': True, - }, { - 'testcase_name': 'saved_model_input_spec', - 'strategy_fn': default_strategy_fn, - 'save_format': 'tf', - 'use_regularizer': True, - 'use_input_spec': True, - }, { - 'testcase_name': 'h5', - 'strategy_fn': default_strategy_fn, - 'save_format': 'h5', - 'use_regularizer': True, - }, { - 'testcase_name': 'saved_model_distribute', - 'strategy_fn': create_mirrored_strategy, - 'save_format': 'tf', - 'use_regularizer': True, - }, { - 'testcase_name': 'saved_model_input_spec_distribute', - 'strategy_fn': create_mirrored_strategy, - 'save_format': 'tf', - 'use_regularizer': True, - 'use_input_spec': True, - }, { - 'testcase_name': 'h5_distribute', - 'strategy_fn': create_mirrored_strategy, - 'save_format': 'h5', - 'use_regularizer': True, - }) - def test_model(self, - strategy_fn, - use_operator=False, - use_regularizer=False, - policy_name='mixed_float16', - get_config=False, - save_format=None, - use_input_spec=False): - self._skip_if_strategy_unsupported(strategy_fn) - self._skip_if_save_format_unsupported(save_format) - if use_regularizer: - weight_regularizer = mp_test_util.IdentityRegularizer() - activity_regularizer = mp_test_util.ReduceSumRegularizer() - else: - weight_regularizer = 
activity_regularizer = None - with strategy_fn().scope(): - with policy.policy_scope(policy_name): - layer = mp_test_util.MultiplyLayer( - assert_type=tf.float16, - use_operator=use_operator, - regularizer=weight_regularizer, - activity_regularizer=activity_regularizer, - input_shape=(1,)) - if use_input_spec: - layer.input_spec = input_spec.InputSpec(shape=(None, 1)) - model = test_utils.get_model_from_layers([layer], input_shape=(1,), - input_dtype=tf.float16) - if get_config: - config = model.get_config() - model = model.__class__.from_config( - config, - custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer}) - (layer,) = (layer for layer in model.layers - if isinstance(layer, mp_test_util.MultiplyLayer)) - - def loss_fn(y_true, y_pred): - del y_true - return tf.reduce_mean(y_pred) - - # Learning rate is small enough that if applied to a float16 variable, - # the variable will not change. So this tests the learning rate not - # applied to a float16 value, but instead the float32 variable. - opt = gradient_descent.SGD(2**-14) - # Use a fixed loss scale, as this test will fail if gradients are - # skipped for a step due to dynamic loss scaling. - opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False, - initial_scale=8) - model.compile( - opt, - loss=loss_fn, - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((2, 1)) - y = np.ones((2, 1)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) - model.fit(dataset) - # Variable starts at 1, and should have gradient of 2 ** -14 subtracted - # from it. - expected = 1 - 2**-14 - if use_regularizer: - # Weight and activity regularizer each add another 2 ** -14 to the - # gradient. - expected -= 2 * 2**-14 - self.assertEqual(backend.eval(layer.v), expected) - - if save_format: - with generic_utils.CustomObjectScope( - {'MultiplyLayer': mp_test_util.MultiplyLayer, 'loss_fn': loss_fn}): - self._test_saving(model, dataset, save_format, use_regularizer) - - def _test_saving(self, model, dataset, save_format, use_regularizer): - # Save and load model, asserting variable does not change - save_path = os.path.join(self.get_temp_dir(), 'model') - model.save(save_path, save_format=save_format) - model = save.load_model(save_path) - (layer,) = (layer for layer in model.layers - if 'MultiplyLayer' in layer.__class__.__name__) - expected = 1 - 2**-14 - if use_regularizer: - expected -= 2 * 2**-14 - self.assertEqual(backend.eval(layer.v), expected) - - # Continue training, and assert variable is correct value - model.fit(dataset) - new_expected = expected - 2 ** -14 - if use_regularizer: - new_expected -= 2 * 2 ** -14 - self.assertEqual(backend.eval(layer.v), new_expected) - - # Load saved model again, and assert variable is previous value - model = save.load_model(save_path) - (layer,) = (layer for layer in model.layers - if 'MultiplyLayer' in layer.__class__.__name__) - self.assertEqual(backend.eval(layer.v), expected) - - # Ensure various dtype-related aspects of the layer are correct - self.assertEqual(layer.dtype, 'float32') - self.assertEqual(layer.dtype_policy.name, 'mixed_float16') - self.assertEqual(layer.v.dtype, 'float32') - self.assertEqual(layer(np.ones((2, 1))).dtype, 'float16') - - self.assertEqual(type(model.dtype_policy), policy.Policy) - self.assertEqual(layer.get_config()['dtype'], - {'class_name': 'Policy', 'config': { - 'name': 'mixed_float16'}}) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - { - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn - }, { - 
'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy, - }) - def test_fixed_loss_scaling(self, - strategy_fn): - # Note: We do not test mixed precision in this method, only loss scaling. - loss_scale = 8. - batch_size = 4 - with strategy_fn().scope(): - x = layers.Input(shape=(1,), batch_size=batch_size) - layer = mp_test_util.MultiplyLayer() - y = layer(x) - - # The gradient of 'y' at this point is 1. With loss scaling, the gradient - # is 'loss_scale'. We divide by the batch size since the loss is averaged - # across batch elements. - expected_gradient = loss_scale / batch_size - identity_with_grad_check_fn = ( - mp_test_util.create_identity_with_grad_check_fn([expected_gradient])) - y = core.Lambda(identity_with_grad_check_fn)(y) - model = models.Model(inputs=x, outputs=y) - - def loss_fn(y_true, y_pred): - del y_true - return tf.reduce_mean(y_pred) - - opt = gradient_descent.SGD(1.) - opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False, - initial_scale=loss_scale) - model.compile( - opt, - loss=loss_fn, - run_eagerly=test_utils.should_run_eagerly()) - - self.assertEqual(backend.eval(layer.v), 1) - x = np.ones((batch_size, 1)) - y = np.ones((batch_size, 1)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size) - model.fit(dataset) - # Variable starts at 1, and should have gradient of 1 subtracted from it. - expected = 0 - self.assertEqual(backend.eval(layer.v), expected) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - { - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn - }, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy, - }, { - 'testcase_name': 'loss_scaling', - 'strategy_fn': create_mirrored_strategy, - 'use_loss_scaling': True - }) - def test_advanced_model(self, strategy_fn, use_loss_scaling=False): - # The advanced model tests mixed-precision-related features that would occur - # in a resnet50 model. It tests a model that has: - # * Multiple layers, some which use auto-cast variables and some which do - # not - # * Regularization on some variables and not others. - # * A fixed loss scale (if use_loss_scaling is True) - - strategy = strategy_fn() - if use_loss_scaling: - loss_scale = 8. - learning_rate = 2**-14 - - with strategy.scope(): - with policy.policy_scope(policy.Policy('mixed_float16')): - x = layers.Input(shape=(1,), batch_size=2) - layer1 = mp_test_util.MultiplyLayer( - assert_type=tf.float16, - regularizer=mp_test_util.IdentityRegularizer(), - use_operator=True) - layer2 = mp_test_util.MultiplyLayerWithoutAutoCast( - assert_type=tf.float16, use_operator=True) - layer3 = mp_test_util.MultiplyLayer(assert_type=tf.float16, - use_operator=False) - layer4 = mp_test_util.MultiplyLayerWithoutAutoCast( - assert_type=tf.float16, - regularizer=mp_test_util.IdentityRegularizer(), - use_operator=False) - y = layer1(x) - y = layer2(y) - y = layer3(y) - y = layer4(y) - if use_loss_scaling: - # The gradient of 'y' at this point is 1. With loss scaling, the - # gradient is 'loss_scale'. We divide by the batch size of 2 since the - # loss is averaged across batch elements. 
- expected_gradient = loss_scale / 2 - identity_with_grad_check_fn = ( - mp_test_util.create_identity_with_grad_check_fn( - expected_dtype=tf.float16, - expected_gradient=[expected_gradient])) - y = core.Lambda(identity_with_grad_check_fn)(y) - model = models.Model(inputs=x, outputs=y) - - def loss_fn(y_true, y_pred): - del y_true - return tf.reduce_mean(y_pred) - - opt = gradient_descent.SGD(learning_rate) + """Test mixed precision with Keras models.""" + + def _skip_if_strategy_unsupported(self, strategy_fn): + if ( + strategy_fn != default_strategy_fn + and test_utils.get_model_type() == "subclass" + ): + self.skipTest( + "Non-default strategies are unsupported with subclassed models" + ) + + def _skip_if_save_format_unsupported(self, save_format): + model_type = test_utils.get_model_type() + if save_format == "h5" and model_type == "subclass": + self.skipTest( + "Saving subclassed models with the HDF5 format is unsupported" + ) + if ( + save_format == "tf" + and model_type == "subclass" + and not tf.executing_eagerly() + ): + self.skipTest( + "b/148820505: This combination of features is currently broken." + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + {"testcase_name": "base", "strategy_fn": default_strategy_fn}, + { + "testcase_name": "distribute", + "strategy_fn": create_mirrored_strategy, + }, + { + "testcase_name": "operator", + "strategy_fn": create_mirrored_strategy, + "use_operator": True, + }, + { + "testcase_name": "regularizer", + "strategy_fn": create_mirrored_strategy, + "use_regularizer": True, + }, + { + "testcase_name": "get_config", + "strategy_fn": create_mirrored_strategy, + "get_config": True, + "use_regularizer": True, + }, + { + "testcase_name": "saved_model", + "strategy_fn": default_strategy_fn, + "save_format": "tf", + "use_regularizer": True, + }, + { + "testcase_name": "saved_model_input_spec", + "strategy_fn": default_strategy_fn, + "save_format": "tf", + "use_regularizer": True, + "use_input_spec": True, + }, + { + "testcase_name": "h5", + "strategy_fn": default_strategy_fn, + "save_format": "h5", + "use_regularizer": True, + }, + { + "testcase_name": "saved_model_distribute", + "strategy_fn": create_mirrored_strategy, + "save_format": "tf", + "use_regularizer": True, + }, + { + "testcase_name": "saved_model_legacy_distribute", + "strategy_fn": create_mirrored_strategy, + "save_format": "tf", + "use_regularizer": True, + "use_legacy_optimizer": True, + }, + { + "testcase_name": "saved_model_input_spec_distribute", + "strategy_fn": create_mirrored_strategy, + "save_format": "tf", + "use_regularizer": True, + "use_input_spec": True, + }, + { + "testcase_name": "h5_distribute", + "strategy_fn": create_mirrored_strategy, + "save_format": "h5", + "use_regularizer": True, + }, + { + "testcase_name": "h5_legacy_distribute", + "strategy_fn": create_mirrored_strategy, + "save_format": "h5", + "use_regularizer": True, + "use_legacy_optimizer": True, + }, + ) + def test_model( + self, + strategy_fn, + use_operator=False, + use_regularizer=False, + policy_name="mixed_float16", + get_config=False, + save_format=None, + use_input_spec=False, + use_legacy_optimizer=False, + ): + self._skip_if_strategy_unsupported(strategy_fn) + self._skip_if_save_format_unsupported(save_format) + if not tf.__internal__.tf2.enabled(): + # The non-legacy optimizer is only supported in TF2 + use_legacy_optimizer = True + if use_regularizer: + weight_regularizer = mp_test_util.IdentityRegularizer() + 
activity_regularizer = mp_test_util.ReduceSumRegularizer() + else: + weight_regularizer = activity_regularizer = None + with strategy_fn().scope(): + with policy.policy_scope(policy_name): + layer = mp_test_util.MultiplyLayer( + assert_type=tf.float16, + use_operator=use_operator, + regularizer=weight_regularizer, + activity_regularizer=activity_regularizer, + input_shape=(1,), + ) + if use_input_spec: + layer.input_spec = input_spec.InputSpec(shape=(None, 1)) + model = test_utils.get_model_from_layers( + [layer], input_shape=(1,), input_dtype=tf.float16 + ) + if get_config: + config = model.get_config() + model = model.__class__.from_config( + config, + custom_objects={ + "MultiplyLayer": mp_test_util.MultiplyLayer + }, + ) + (layer,) = ( + layer + for layer in model.layers + if isinstance(layer, mp_test_util.MultiplyLayer) + ) + + def loss_fn(y_true, y_pred): + del y_true + return tf.reduce_mean(y_pred) + + # Learning rate is small enough that if applied to a float16 + # variable, the variable will not change. So this tests the + # learning rate not applied to a float16 value, but instead the + # float32 variable. + learning_rate = 2**-14 + if use_legacy_optimizer: + opt = gradient_descent.SGD(learning_rate) + else: + opt = sgd.SGD(learning_rate) + # Use a fixed loss scale, as this test will fail if gradients + # are skipped for a step due to dynamic loss scaling. + opt = loss_scale_optimizer.BaseLossScaleOptimizer( + opt, dynamic=False, initial_scale=8 + ) + model.compile( + opt, + loss=loss_fn, + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((2, 1)) + y = np.ones((2, 1)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) + model.fit(dataset) + # Variable starts at 1, and should have gradient of 2 ** -14 subtracted + # from it. + expected = 1 - 2**-14 + if use_regularizer: + # Weight and activity regularizer each add another 2 ** -14 to the + # gradient. 
+ expected -= 2 * 2**-14 + self.assertEqual(backend.eval(layer.v), expected) + + if save_format: + with object_registration.CustomObjectScope( + { + "MultiplyLayer": mp_test_util.MultiplyLayer, + "loss_fn": loss_fn, + } + ): + self._test_saving(model, dataset, save_format, use_regularizer) + + def _test_saving(self, model, dataset, save_format, use_regularizer): + # Save and load model, asserting variable does not change + save_path = os.path.join(self.get_temp_dir(), "model") + model.save(save_path, save_format=save_format) + model = save.load_model(save_path) + (layer,) = ( + layer + for layer in model.layers + if "MultiplyLayer" in layer.__class__.__name__ + ) + expected = 1 - 2**-14 + if use_regularizer: + expected -= 2 * 2**-14 + self.assertEqual(backend.eval(layer.v), expected) + + # Continue training, and assert variable is correct value + model.fit(dataset) + new_expected = expected - 2**-14 + if use_regularizer: + new_expected -= 2 * 2**-14 + self.assertEqual(backend.eval(layer.v), new_expected) + + # Load saved model again, and assert variable is previous value + model = save.load_model(save_path) + (layer,) = ( + layer + for layer in model.layers + if "MultiplyLayer" in layer.__class__.__name__ + ) + self.assertEqual(backend.eval(layer.v), expected) + + # Ensure various dtype-related aspects of the layer are correct + self.assertEqual(layer.dtype, "float32") + self.assertEqual(layer.dtype_policy.name, "mixed_float16") + self.assertEqual(layer.v.dtype, "float32") + self.assertEqual(layer(np.ones((2, 1))).dtype, "float16") + + self.assertEqual(type(model.dtype_policy), policy.Policy) + if tf.__internal__.tf2.enabled(): + self.assertEqual( + layer.get_config()["dtype"], + { + "module": "keras.mixed_precision", + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + "registered_name": None, + }, + ) + else: + self.assertEqual( + layer.get_config()["dtype"], + { + "class_name": "Policy", + "config": {"name": "mixed_float16"}, + }, + ) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + {"testcase_name": "base", "strategy_fn": default_strategy_fn}, + { + "testcase_name": "distribute", + "strategy_fn": create_mirrored_strategy, + }, + ) + def test_fixed_loss_scaling(self, strategy_fn): + # The non-legacy optimizer is only supported in TF2 + use_legacy_optimizer = not tf.__internal__.tf2.enabled() + # Note: We do not test mixed precision in this method, only loss + # scaling. + loss_scale = 8.0 + batch_size = 4 + with strategy_fn().scope(): + x = layers.Input(shape=(1,), batch_size=batch_size) + layer = mp_test_util.MultiplyLayer() + y = layer(x) + + # The gradient of 'y' at this point is 1. With loss scaling, the + # gradient is 'loss_scale'. We divide by the batch size since the + # loss is averaged across batch elements. 
+ expected_gradient = loss_scale / batch_size + identity_with_grad_check_fn = ( + mp_test_util.create_identity_with_grad_check_fn( + [expected_gradient] + ) + ) + y = core.Lambda(identity_with_grad_check_fn)(y) + model = models.Model(inputs=x, outputs=y) + + def loss_fn(y_true, y_pred): + del y_true + return tf.reduce_mean(y_pred) + + if use_legacy_optimizer: + opt = gradient_descent.SGD(1.0) + else: + opt = sgd.SGD(1.0) + opt = loss_scale_optimizer.BaseLossScaleOptimizer( + opt, dynamic=False, initial_scale=loss_scale + ) + model.compile( + opt, loss=loss_fn, run_eagerly=test_utils.should_run_eagerly() + ) + + self.assertEqual(backend.eval(layer.v), 1) + x = np.ones((batch_size, 1)) + y = np.ones((batch_size, 1)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size) + model.fit(dataset) + # Variable starts at 1, and should have gradient of 1 subtracted from + # it. + expected = 0 + self.assertEqual(backend.eval(layer.v), expected) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + {"testcase_name": "base", "strategy_fn": default_strategy_fn}, + { + "testcase_name": "distribute", + "strategy_fn": create_mirrored_strategy, + }, + { + "testcase_name": "loss_scaling", + "strategy_fn": create_mirrored_strategy, + "use_loss_scaling": True, + }, + ) + def test_advanced_model(self, strategy_fn, use_loss_scaling=False): + # The advanced model tests mixed-precision-related features that would + # occur in a resnet50 model. It tests a model that has: + # * Multiple layers, some which use auto-cast variables and some which + # do not + # * Regularization on some variables and not others. + # * A fixed loss scale (if use_loss_scaling is True) + + strategy = strategy_fn() if use_loss_scaling: - opt = loss_scale_optimizer.LossScaleOptimizer( - opt, dynamic=False, initial_scale=loss_scale) - model.compile( - opt, - loss=loss_fn, - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((2, 1)) - y = np.ones((2, 1)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) - model.fit(dataset) - for layer in (layer1, layer2, layer3, layer4): - if layer.losses: - # Layer has weight regularizer - self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate) - else: - # Layer does not have weight regularizer - self.assertEqual(backend.eval(layer.v), 1 - learning_rate) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - @parameterized.named_parameters( - { - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn - }, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy, - }, { - 'testcase_name': 'get_config', - 'strategy_fn': create_mirrored_strategy, - 'get_config': True, - }) - def test_dynamic_loss_scaling(self, - strategy_fn, - get_config=False): - strategy = strategy_fn() - initial_loss_scale = 2. - batch_size = 4 - expected_gradient = backend.variable([initial_loss_scale / batch_size], - dtype=tf.float16) - # If this variable is set to True, the model below will have NaN gradients - have_nan_gradients = backend.variable(False, dtype=tf.bool) - with strategy.scope(): - opt = gradient_descent.SGD(1.) 
- opt = loss_scale_optimizer.LossScaleOptimizer( - opt, initial_scale=initial_loss_scale, dynamic_growth_steps=2) - with policy.policy_scope('mixed_float16'): - x = layers.Input( - shape=(1,), batch_size=batch_size, dtype=tf.float16) - layer = mp_test_util.MultiplyLayer(assert_type=tf.float16) - y = layer(x) - identity_with_nan_grads = ( - mp_test_util.create_identity_with_nan_gradients_fn( - have_nan_gradients)) - y = core.Lambda(identity_with_nan_grads)(y) - identity_with_grad_check_fn = ( - mp_test_util.create_identity_with_grad_check_fn( - expected_dtype=tf.float16, - expected_gradient=expected_gradient)) - y = core.Lambda(identity_with_grad_check_fn)(y) - model = models.Model(inputs=x, outputs=y) - if get_config: - config = model.get_config() - model = model.__class__.from_config( - config, - custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer}) - (layer,) = (layer for layer in model.layers - if isinstance(layer, mp_test_util.MultiplyLayer)) - - def loss_fn(y_true, y_pred): - del y_true - return tf.reduce_mean(y_pred) - - model.compile( - opt, - loss=loss_fn, - run_eagerly=test_utils.should_run_eagerly()) - - self.assertEqual(backend.eval(layer.v), 1) - x = np.ones((batch_size, 1)) - y = np.ones((batch_size, 1)) - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size) - model.fit(dataset) - # The variables starts with 1 and has a gradient of 1, so will go down by 1 - # each step. - self.assertEqual(backend.eval(layer.v), 0) - - model.fit(dataset) - self.assertEqual(backend.eval(layer.v), -1) - - # There have been two steps without NaNs, so the loss scale will double - backend.set_value(expected_gradient, - backend.get_value(expected_gradient * 2)) - model.fit(dataset) - self.assertEqual(backend.eval(layer.v), -2) - - # Next test with NaN gradients. - backend.set_value(have_nan_gradients, True) - model.fit(dataset) - # Variable should not be updated - self.assertEqual(backend.eval(layer.v), -2) - - # Test with finite gradients again - backend.set_value(have_nan_gradients, False) - # The loss scale will be halved due to the NaNs, so the gradient will also - # be halved - backend.set_value(expected_gradient, - backend.get_value(expected_gradient / 2)) - model.fit(dataset) - self.assertEqual(backend.eval(layer.v), -3) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_compile_wraps_with_loss_scale_optimizer(self): - x = layers.Input(shape=(1,)) - y = mp_test_util.MultiplyLayer()(x) - - with policy.policy_scope('mixed_float16'): - # Test optimizer is automatically wrapped with LSO - model = models.Model(x, y) - model.compile(gradient_descent.SGD(1.), 'mse') - self.assertIsInstance(model.optimizer, - loss_scale_optimizer.LossScaleOptimizer) - self.assertEqual(backend.get_value(model.optimizer.learning_rate), 1.) 
- - # Test optimizer specified as string is automatically wrapped in LSO - model = models.Model(x, y) - model.compile('sgd', 'mse') - self.assertIsInstance(model.optimizer, - loss_scale_optimizer.LossScaleOptimizer) - - # Test if an LSO is passed, optimizer is not automatically wrapped with - # another LSO - model = models.Model(x, y) - optimizer = loss_scale_optimizer.LossScaleOptimizer( - gradient_descent.SGD(1.), dynamic_growth_steps=2) - model.compile(optimizer, 'mse') - self.assertIsInstance(model.optimizer, - loss_scale_optimizer.LossScaleOptimizer) - self.assertEqual(model.optimizer.dynamic_growth_steps, 2) - - with policy.policy_scope('mixed_bfloat16'): - # Test mixed_bfloat16 models are not automatically wrapped with LSO - model = models.Model(x, y) - model.compile(gradient_descent.SGD(1.), 'mse') - self.assertNotIsInstance(model.optimizer, - loss_scale_optimizer.LossScaleOptimizer) - self.assertIsInstance(model.optimizer, gradient_descent.SGD) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_pass_invalid_optimizer_with_loss_scaling(self): - with policy.policy_scope(policy.Policy('mixed_float16')): - x = layers.Input(shape=(1,)) - y = mp_test_util.MultiplyLayer()(x) - model = models.Model(x, y) - if tf.executing_eagerly(): - error_msg = 'Use a `tf.keras` Optimizer instead' - else: - error_msg = 'optimizer" must be an instance of ' - with self.assertRaisesRegex(ValueError, error_msg): - model.compile(optimizer_v1.SGD(1.), 'mse') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_functional_model_loss_dtype(self): - with policy.policy_scope('float16'): - x = layers.Input(shape=(1,)) - y = mp_test_util.MultiplyLayer()(x) - model = models.Model(x, y) - model.add_loss(tf.cast(y, 'float32')) - # The loss should not be casted to the policy's dtype. - self.assertEqual(model.losses[0].dtype, 'float32') - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - { - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn, - }, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy, - }, { - 'testcase_name': 'base_h5', - 'strategy_fn': default_strategy_fn, - 'h5': True, - }, { - 'testcase_name': 'distribute_h5', - 'strategy_fn': create_mirrored_strategy, - 'h5': True, - }) - def test_save_weights_with_autocast_vars(self, strategy_fn, h5=False): - with strategy_fn().scope(): - with policy.policy_scope('mixed_float16'): - x = layers.Input(shape=(1,), batch_size=2) - layer = mp_test_util.MultiplyLayer(assert_type=tf.float16) - y = layer(x) - model = models.Model(inputs=x, outputs=y) - - model.set_weights([np.array(100.)]) - x = np.ones((2, 1)) - self.assertAllClose(backend.get_value(model(x)), x * 100.) - suffix = '.h5' if h5 else '' - weights_file = os.path.join(self.get_temp_dir(), 'weights' + suffix) - model.save_weights(weights_file) - - model.set_weights([np.array(200.)]) - self.assertAllClose(backend.get_value(model(x)), x * 200.) - model.load_weights(weights_file) - self.assertAllClose(backend.get_value(model(x)), x * 100.) 
- self.assertEqual(model.get_weights(), [np.array(100.)]) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - { - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn, - }, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy, - }, { - 'testcase_name': 'different_var_name', - 'strategy_fn': default_strategy_fn, - 'var_name': 'w' - }, { - 'testcase_name': 'different_var_name_distribute', - 'strategy_fn': create_mirrored_strategy, - 'var_name': 'w' - }) - def test_save_slot_variables_with_autocast_vars(self, - strategy_fn, - var_name='v'): - p = policy.Policy('mixed_float16') - with strategy_fn().scope(), policy.policy_scope(p): - x = layers.Input(shape=(2,), batch_size=2) - # Having a var_name other than 'v' tests that a fixed bug (b/134713714) - # does not reoccur. The bug was that a crash would occur when saving a - # checkpoint where an AutoCastVariable with a slot variable would have a - # different name than the layer attribute's name (layer.v in this case). - layer = mp_test_util.MultiplyLayer(assert_type=tf.float16, - var_name=var_name) - y = layer(x) - model = models.Model(inputs=x, outputs=y) - opt = gradient_descent.SGD(1., 1.) - opt = loss_scale_optimizer.LossScaleOptimizer(opt, dynamic=False, - initial_scale=1) - model.compile( - optimizer=opt, - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2) - weights_file = os.path.join(self.get_temp_dir(), 'weights') - model.save_weights(weights_file) - saved_slot = backend.get_value(opt.get_slot(layer.v, 'momentum')) - - model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2) - new_slot = backend.get_value(opt.get_slot(layer.v, 'momentum')) - self.assertNotEqual(new_slot, saved_slot) - - model.load_weights(weights_file) - restored_slot = backend.get_value(opt.get_slot(layer.v, 'momentum')) - self.assertEqual(restored_slot, saved_slot) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters(*TESTCASES) - def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn): - strategy = strategy_fn() - if (isinstance(strategy, tf.distribute.MirroredStrategy) and - not tf.executing_eagerly()): - # TODO(b/121381184): Enable running the test in this case. - return - - # Create and run model. - with strategy.scope(): - x = layers.Input(shape=(2,), batch_size=2, dtype=tf.float32) - y = mp_test_util.MultiplyLayer(assert_type=tf.float32)(x) - model = models.Model(inputs=x, outputs=y) - - opt = gradient_descent.SGD(1.) - opt = loss_scale_optimizer.LossScaleOptimizer( - opt, initial_scale=1., dynamic_growth_steps=2.) - model.compile( - optimizer=opt, - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - # Run for 3 steps (6 examples with a batch size of 2) - model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2) - self.assertEqual(backend.get_value(opt.loss_scale), 2) - self.assertEqual(backend.get_value(opt.dynamic_counter), 1) - - # Save model weights. - save_prefix = os.path.join(self.get_temp_dir(), 'ckpt') - model.save_weights(save_prefix) - - # Run model again for 1 step (2 examples with a batch size of 2) - model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2) - self.assertEqual(backend.get_value(opt.loss_scale), 4) - self.assertEqual(backend.get_value(opt.dynamic_counter), 0) - - # Load model weights and ensure loss scale weights are restored. 
- model.load_weights(save_prefix) - self.assertEqual(backend.get_value(opt.loss_scale), 2) - self.assertEqual(backend.get_value(opt.dynamic_counter), 1) - - @test_combinations.run_all_keras_modes - def test_restore_old_loss_scale_checkpoint(self): - # Ensure a checkpoint from TF 2.2 can be loaded. The checkpoint format - # of LossScaleOptimizer changed, but old checkpoints can still be loaded - opt = gradient_descent.SGD(0.1, momentum=0.1) - opt = loss_scale_optimizer.LossScaleOptimizer(opt) - model = sequential.Sequential([core.Dense(2,)]) - - # The checkpoint and expected values were obtained from the program in - # testdata/BUILD. - ckpt_dir = os.path.join( - flags.FLAGS['test_srcdir'].value, - 'org_keras/keras', - 'mixed_precision/testdata/lso_ckpt_tf2.2') - # ckpt_dir = test.test_src_dir_path( - # 'python/keras/mixed_precision/testdata/lso_ckpt_tf2.2') - model.load_weights(os.path.join(ckpt_dir, 'ckpt')) - model.compile(opt, 'mse', run_eagerly=test_utils.should_run_eagerly()) - model(np.zeros((2, 2))) # Create model weights - opt._create_all_weights(model.weights) - expected_kernel = np.array([[9.229685, 10.901115], [10.370763, 9.757362]]) - expected_slot = np.array([[10.049943, 9.917691], [10.049943, 9.917691]]) - self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel) - self.assertAllClose( - self.evaluate(opt.get_slot(model.weights[0], 'momentum')), - expected_slot) - self.assertEqual(self.evaluate(opt.loss_scale), 32768) - self.assertEqual(self.evaluate(opt.dynamic_counter), 1) - - # Check restoring works even after the model is compiled and the weights - # have been created. - model.fit(np.random.normal(size=(2, 2)), np.random.normal(size=(2, 2))) - self.assertNotAllClose(self.evaluate(model.weights[0]), expected_kernel) - self.assertNotAllClose( - self.evaluate(opt.get_slot(model.weights[0], 'momentum')), - expected_slot) - model.load_weights(os.path.join(ckpt_dir, 'ckpt')) - self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel) - self.assertAllClose( - self.evaluate(opt.get_slot(model.weights[0], 'momentum')), - expected_slot) - self.assertEqual(self.evaluate(opt.loss_scale), 32768) - self.assertEqual(self.evaluate(opt.dynamic_counter), 1) - - def test_restore_old_saved_model(self): - saved_model_dir = os.path.join( - flags.FLAGS['test_srcdir'].value, - 'org_keras/keras', - 'mixed_precision/testdata/lso_savedmodel_tf2.2') - # saved_model_dir = test.test_src_dir_path( - # 'python/keras/mixed_precision/testdata/' - # 'lso_savedmodel_tf2.2') - model = save.load_model(saved_model_dir) - expected_kernel = np.array([[9.229685, 10.901115], [10.370763, 9.757362]]) - self.assertAllClose(backend.eval(model.weights[0]), expected_kernel) - self.assertEqual(type(model.optimizer), - loss_scale_optimizer.LossScaleOptimizer) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters( - { - 'testcase_name': 'base', - 'strategy_fn': default_strategy_fn, - }, { - 'testcase_name': 'distribute', - 'strategy_fn': create_mirrored_strategy, - }, { - 'testcase_name': 'base_h5', - 'strategy_fn': default_strategy_fn, - 'h5': True, - }, { - 'testcase_name': 'distribute_h5', - 'strategy_fn': create_mirrored_strategy, - 'h5': True, - }) - def test_save_model_with_dynamic_loss_scaling( - self, strategy_fn, h5=False): - # TODO(reedwm): Support and test saving model with a mixed_[b]float16 policy - # as well. 
- strategy = strategy_fn() - if (isinstance(strategy, tf.distribute.MirroredStrategy) and - not tf.executing_eagerly()): - # TODO(b/121381184): Enable running the test in this case. - return - - # Create and run model. - with strategy.scope(): - x = layers.Input(shape=(2,), batch_size=2, dtype=tf.float32) - y = mp_test_util.MultiplyLayer()(x) - model = models.Model(inputs=x, outputs=y) - - opt = gradient_descent.SGD(1.) - opt = loss_scale_optimizer.LossScaleOptimizer(opt, initial_scale=1., - dynamic_growth_steps=2.) - model.compile( - optimizer=opt, - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - # Run for 3 steps (6 examples with a batch size of 2) - model.fit(np.ones((6, 2)), np.zeros((6, 2)), batch_size=2) - self.assertEqual(backend.get_value(opt.loss_scale), 2) - self.assertEqual(backend.get_value(opt.dynamic_counter), 1) - (weight,) = model.trainable_weights - orig_weight = backend.get_value(weight) - - # Save model weights. - save_path = os.path.join(self.get_temp_dir(), 'model') - model.save(save_path, save_format='h5' if h5 else 'tf') - - # Run model again for 1 step (2 examples with a batch size of 2) - model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2) - new_weight = backend.get_value(weight) - self.assertNotEqual(new_weight, orig_weight) - self.assertEqual(backend.get_value(opt.loss_scale), 4) - self.assertEqual(backend.get_value(opt.dynamic_counter), 0) - - # Load model weights and ensure loss scale weights are restored. - model = save.load_model( - save_path, custom_objects={'MultiplyLayer': mp_test_util.MultiplyLayer}) - (weight,) = model.trainable_weights - loaded_weight = backend.get_value(weight) - self.assertEqual(loaded_weight, orig_weight) - # Currently the loss scale isn't always saved when the model is saved with - # Model.save(). So we assert the loss scale either has the value when it was - # saved, or the value it was initialized with. - # TODO(reedwm): Always save/restore the loss scale with Model.save(). - self.assertIn(backend.get_value(model.optimizer.loss_scale), (1, 2)) - self.assertIn(backend.get_value(model.optimizer.dynamic_counter), (0, 1)) - - # Test optimizer attributes and type - self.assertEqual(model.optimizer.initial_scale, 1.) - self.assertEqual(model.optimizer.dynamic_growth_steps, 2.) - self.assertEqual(type(model.optimizer), - loss_scale_optimizer.LossScaleOptimizer) + loss_scale = 8.0 + learning_rate = 2**-14 + # The non-legacy optimizer is only supported in TF2 + use_legacy_optimizer = not tf.__internal__.tf2.enabled() + + with strategy.scope(): + with policy.policy_scope(policy.Policy("mixed_float16")): + x = layers.Input(shape=(1,), batch_size=2) + layer1 = mp_test_util.MultiplyLayer( + assert_type=tf.float16, + regularizer=mp_test_util.IdentityRegularizer(), + use_operator=True, + ) + layer2 = mp_test_util.MultiplyLayerWithoutAutoCast( + assert_type=tf.float16, use_operator=True + ) + layer3 = mp_test_util.MultiplyLayer( + assert_type=tf.float16, use_operator=False + ) + layer4 = mp_test_util.MultiplyLayerWithoutAutoCast( + assert_type=tf.float16, + regularizer=mp_test_util.IdentityRegularizer(), + use_operator=False, + ) + y = layer1(x) + y = layer2(y) + y = layer3(y) + y = layer4(y) + if use_loss_scaling: + # The gradient of 'y' at this point is 1. With loss scaling, + # the gradient is 'loss_scale'. We divide by the batch size + # of 2 since the loss is averaged across batch elements. 
+ expected_gradient = loss_scale / 2 + identity_with_grad_check_fn = ( + mp_test_util.create_identity_with_grad_check_fn( + expected_dtype=tf.float16, + expected_gradient=[expected_gradient], + ) + ) + y = core.Lambda(identity_with_grad_check_fn)(y) + model = models.Model(inputs=x, outputs=y) + + def loss_fn(y_true, y_pred): + del y_true + return tf.reduce_mean(y_pred) + + if use_legacy_optimizer: + opt = gradient_descent.SGD(learning_rate) + else: + opt = sgd.SGD(learning_rate) + if use_loss_scaling: + opt = loss_scale_optimizer.BaseLossScaleOptimizer( + opt, dynamic=False, initial_scale=loss_scale + ) + model.compile( + opt, + loss=loss_fn, + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((2, 1)) + y = np.ones((2, 1)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(2) + model.fit(dataset) + for layer in (layer1, layer2, layer3, layer4): + if layer.losses: + # Layer has weight regularizer + self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate) + else: + # Layer does not have weight regularizer + self.assertEqual(backend.eval(layer.v), 1 - learning_rate) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + @parameterized.named_parameters( + {"testcase_name": "base", "strategy_fn": default_strategy_fn}, + { + "testcase_name": "distribute", + "strategy_fn": create_mirrored_strategy, + }, + { + "testcase_name": "get_config", + "strategy_fn": create_mirrored_strategy, + "get_config": True, + }, + ) + def test_dynamic_loss_scaling(self, strategy_fn, get_config=False): + strategy = strategy_fn() + initial_loss_scale = 2.0 + batch_size = 4 + expected_gradient = backend.variable( + [initial_loss_scale / batch_size], dtype=tf.float16 + ) + # If this variable is set to True, the model below will have NaN + # gradients. + have_nan_gradients = backend.variable(False, dtype=tf.bool) + with strategy.scope(): + opt = sgd.SGD(1.0) + opt = loss_scale_optimizer.BaseLossScaleOptimizer( + opt, initial_scale=initial_loss_scale, dynamic_growth_steps=2 + ) + with policy.policy_scope("mixed_float16"): + x = layers.Input( + shape=(1,), batch_size=batch_size, dtype=tf.float16 + ) + layer = mp_test_util.MultiplyLayer(assert_type=tf.float16) + y = layer(x) + identity_with_nan_grads = ( + mp_test_util.create_identity_with_nan_gradients_fn( + have_nan_gradients + ) + ) + y = core.Lambda(identity_with_nan_grads)(y) + identity_with_grad_check_fn = ( + mp_test_util.create_identity_with_grad_check_fn( + expected_dtype=tf.float16, + expected_gradient=expected_gradient, + ) + ) + y = core.Lambda(identity_with_grad_check_fn)(y) + model = models.Model(inputs=x, outputs=y) + if get_config: + config = model.get_config() + with SafeModeScope(safe_mode=False): + model = model.__class__.from_config( + config, + custom_objects={ + "MultiplyLayer": mp_test_util.MultiplyLayer + }, + ) + (layer,) = ( + layer + for layer in model.layers + if isinstance(layer, mp_test_util.MultiplyLayer) + ) + + def loss_fn(y_true, y_pred): + del y_true + return tf.reduce_mean(y_pred) + + model.compile( + opt, + loss=loss_fn, + run_eagerly=test_utils.should_run_eagerly(), + ) + + self.assertEqual(backend.eval(layer.v), 1) + x = np.ones((batch_size, 1)) + y = np.ones((batch_size, 1)) + dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size) + model.fit(dataset) + # The variable starts with 1 and has a gradient of 1, so it will go down + # by 1 each step.
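For reference, the loss-scale bookkeeping that the following assertions depend on can be sketched in a few lines of Python (a simplification; the actual logic lives inside LossScaleOptimizer):

def update_loss_scale(scale, counter, grads_finite, growth_steps=2):
    # Simplified dynamic update rule: halve on bad gradients, double
    # after `growth_steps` consecutive finite steps.
    if not grads_finite:
        return scale / 2, 0  # also skip applying this step's gradients
    counter += 1
    if counter >= growth_steps:
        return scale * 2, 0
    return scale, counter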
+ self.assertEqual(backend.eval(layer.v), 0) + + model.fit(dataset) + self.assertEqual(backend.eval(layer.v), -1) + + # There have been two steps without NaNs, so the loss scale will double + backend.set_value( + expected_gradient, backend.get_value(expected_gradient * 2) + ) + model.fit(dataset) + self.assertEqual(backend.eval(layer.v), -2) + + # Next test with NaN gradients. + backend.set_value(have_nan_gradients, True) + model.fit(dataset) + # Variable should not be updated + self.assertEqual(backend.eval(layer.v), -2) + + # Test with finite gradients again + backend.set_value(have_nan_gradients, False) + # The loss scale will be halved due to the NaNs, so the gradient will + # also be halved + backend.set_value( + expected_gradient, backend.get_value(expected_gradient / 2) + ) + model.fit(dataset) + self.assertEqual(backend.eval(layer.v), -3) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_compile_wraps_with_loss_scale_optimizer(self): + x = layers.Input(shape=(1,)) + y = mp_test_util.MultiplyLayer()(x) + + # The non-legacy optimizer is only supported in TF2 + use_legacy_optimizer = ( + not tf.__internal__.tf2.enabled() or not tf.executing_eagerly() + ) + + with policy.policy_scope("mixed_float16"): + # Test optimizer is automatically wrapped with LSO + model = models.Model(x, y) + if use_legacy_optimizer: + optimizer = gradient_descent.SGD(1.0) + else: + optimizer = sgd.SGD(1.0) + model.compile(optimizer, "mse") + self.assertIsInstance( + model.optimizer, loss_scale_optimizer.BaseLossScaleOptimizer + ) + self.assertEqual( + backend.get_value(model.optimizer.learning_rate), 1.0 + ) + + # Test optimizer specified as string is automatically wrapped in LSO + model = models.Model(x, y) + model.compile("sgd", "mse") + self.assertIsInstance( + model.optimizer, loss_scale_optimizer.BaseLossScaleOptimizer + ) + + # Test if an LSO is passed, optimizer is not automatically wrapped + # with another LSO + model = models.Model(x, y) + if use_legacy_optimizer: + optimizer = gradient_descent.SGD(1.0) + else: + optimizer = sgd.SGD(1.0) + optimizer = loss_scale_optimizer.BaseLossScaleOptimizer( + optimizer, dynamic_growth_steps=2 + ) + model.compile(optimizer, "mse") + self.assertIsInstance( + model.optimizer, loss_scale_optimizer.BaseLossScaleOptimizer + ) + self.assertEqual(model.optimizer.dynamic_growth_steps, 2) + + with policy.policy_scope("mixed_bfloat16"): + # Test mixed_bfloat16 models are not automatically wrapped with LSO + model = models.Model(x, y) + if use_legacy_optimizer: + optimizer = gradient_descent.SGD(1.0) + else: + optimizer = sgd.SGD(1.0) + model.compile(optimizer, "mse") + self.assertNotIsInstance( + model.optimizer, loss_scale_optimizer.BaseLossScaleOptimizer + ) + self.assertIsInstance( + model.optimizer, + gradient_descent.SGD if use_legacy_optimizer else sgd.SGD, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_pass_invalid_optimizer_with_loss_scaling(self): + with policy.policy_scope(policy.Policy("mixed_float16")): + x = layers.Input(shape=(1,)) + y = mp_test_util.MultiplyLayer()(x) + model = models.Model(x, y) + if tf.executing_eagerly(): + error_msg = "Use a `tf.keras` Optimizer instead" + else: + error_msg = 'optimizer" must be an instance of ' + with self.assertRaisesRegex(ValueError, error_msg): + model.compile(optimizer_v1.SGD(1.0), "mse") + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def 
test_functional_model_loss_dtype(self): + with policy.policy_scope("float16"): + x = layers.Input(shape=(1,)) + y = mp_test_util.MultiplyLayer()(x) + model = models.Model(x, y) + model.add_loss(tf.cast(y, "float32")) + # The loss should not be casted to the policy's dtype. + self.assertEqual(model.losses[0].dtype, "float32") + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + { + "testcase_name": "base", + "strategy_fn": default_strategy_fn, + }, + { + "testcase_name": "distribute", + "strategy_fn": create_mirrored_strategy, + }, + { + "testcase_name": "base_h5", + "strategy_fn": default_strategy_fn, + "h5": True, + }, + { + "testcase_name": "distribute_h5", + "strategy_fn": create_mirrored_strategy, + "h5": True, + }, + ) + def test_save_weights_with_autocast_vars(self, strategy_fn, h5=False): + with strategy_fn().scope(): + with policy.policy_scope("mixed_float16"): + x = layers.Input(shape=(1,), batch_size=2) + layer = mp_test_util.MultiplyLayer(assert_type=tf.float16) + y = layer(x) + model = models.Model(inputs=x, outputs=y) + + model.set_weights([np.array(100.0)]) + x = np.ones((2, 1)) + self.assertAllClose(backend.get_value(model(x)), x * 100.0) + suffix = ".h5" if h5 else "" + weights_file = os.path.join(self.get_temp_dir(), "weights" + suffix) + model.save_weights(weights_file) + + model.set_weights([np.array(200.0)]) + self.assertAllClose(backend.get_value(model(x)), x * 200.0) + model.load_weights(weights_file) + self.assertAllClose(backend.get_value(model(x)), x * 100.0) + self.assertEqual(model.get_weights(), [np.array(100.0)]) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + { + "testcase_name": "base", + "strategy_fn": default_strategy_fn, + }, + { + "testcase_name": "distribute", + "strategy_fn": create_mirrored_strategy, + }, + { + "testcase_name": "distribute_legacy", + "strategy_fn": create_mirrored_strategy, + "use_legacy_optimizer": True, + }, + { + "testcase_name": "different_var_name", + "strategy_fn": default_strategy_fn, + "var_name": "w", + }, + { + "testcase_name": "different_var_name_distribute", + "strategy_fn": create_mirrored_strategy, + "var_name": "w", + }, + ) + def test_save_slot_variables_with_autocast_vars( + self, strategy_fn, var_name="v", use_legacy_optimizer=False + ): + if not tf.__internal__.tf2.enabled(): + # The non-legacy optimizer is only supported in TF2 + use_legacy_optimizer = True + p = policy.Policy("mixed_float16") + with strategy_fn().scope(), policy.policy_scope(p): + x = layers.Input(shape=(2,), batch_size=2) + # Having a var_name other than 'v' tests that a fixed bug + # (b/134713714) does not reoccur. The bug was that a crash would + # occur when saving a checkpoint where an AutoCastVariable with a + # slot variable would have a different name than the layer + # attribute's name (layer.v in this case). 
+ layer = mp_test_util.MultiplyLayer( + assert_type=tf.float16, var_name=var_name + ) + y = layer(x) + model = models.Model(inputs=x, outputs=y) + if use_legacy_optimizer: + opt = gradient_descent.SGD(1.0, 1.0) + else: + opt = sgd.SGD(1.0, 1.0) + opt = loss_scale_optimizer.BaseLossScaleOptimizer( + opt, dynamic=False, initial_scale=1 + ) + model.compile( + optimizer=opt, + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + + def get_momentum_slot(): + if use_legacy_optimizer: + return opt.get_slot(layer.v, "momentum") + else: + return opt.inner_optimizer.momentums[0] + + model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2) + weights_file = os.path.join(self.get_temp_dir(), "weights") + model.save_weights(weights_file) + saved_slot = backend.get_value(get_momentum_slot()) + + model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2) + new_slot = backend.get_value(get_momentum_slot()) + self.assertNotEqual(new_slot, saved_slot) + + model.load_weights(weights_file) + restored_slot = backend.get_value(get_momentum_slot()) + self.assertEqual(restored_slot, saved_slot) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters(*TESTCASES) + def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn): + strategy = strategy_fn() + if ( + isinstance(strategy, tf.distribute.MirroredStrategy) + and not tf.executing_eagerly() + ): + # TODO(b/121381184): Enable running the test in this case. + return + + # The non-legacy optimizer is only supported in TF2 + use_legacy_optimizer = not tf.__internal__.tf2.enabled() + + # Create and run model. + with strategy.scope(): + x = layers.Input(shape=(2,), batch_size=2, dtype=tf.float32) + y = mp_test_util.MultiplyLayer(assert_type=tf.float32)(x) + model = models.Model(inputs=x, outputs=y) + + if use_legacy_optimizer: + opt = gradient_descent.SGD(1.0) + else: + opt = sgd.SGD(1.0) + opt = loss_scale_optimizer.BaseLossScaleOptimizer( + opt, initial_scale=1.0, dynamic_growth_steps=2.0 + ) + model.compile( + optimizer=opt, + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + # Run for 3 steps (6 examples with a batch size of 2) + model.fit(np.zeros((6, 2)), np.zeros((6, 2)), batch_size=2) + self.assertEqual(backend.get_value(opt.loss_scale), 2) + self.assertEqual(backend.get_value(opt.dynamic_counter), 1) + + # Save model weights. + save_prefix = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_prefix) + + # Run model again for 1 step (2 examples with a batch size of 2) + model.fit(np.zeros((2, 2)), np.zeros((2, 2)), batch_size=2) + self.assertEqual(backend.get_value(opt.loss_scale), 4) + self.assertEqual(backend.get_value(opt.dynamic_counter), 0) + + # Load model weights and ensure loss scale weights are restored. + model.load_weights(save_prefix) + self.assertEqual(backend.get_value(opt.loss_scale), 2) + self.assertEqual(backend.get_value(opt.dynamic_counter), 1) + + @test_combinations.run_all_keras_modes + def test_restore_old_loss_scale_checkpoint(self): + # Ensure a checkpoint from TF 2.2 can be loaded. The checkpoint format + # of LossScaleOptimizer changed, but old checkpoints can still be loaded + # into the legacy optimizers. + opt = gradient_descent.SGD(0.1, momentum=0.1) + opt = loss_scale_optimizer.LossScaleOptimizer(opt) + model = sequential.Sequential( + [ + core.Dense( + 2, + ) + ] + ) + + # The checkpoint and expected values were obtained from the program in + # testdata/BUILD. 
+ ckpt_dir = os.path.join( + flags.FLAGS["test_srcdir"].value, + "org_keras/keras", + "mixed_precision/testdata/lso_ckpt_tf2.2", + ) + # ckpt_dir = test.test_src_dir_path( + # 'python/keras/mixed_precision/testdata/lso_ckpt_tf2.2') + model.load_weights(os.path.join(ckpt_dir, "ckpt")) + model.compile(opt, "mse", run_eagerly=test_utils.should_run_eagerly()) + model(np.zeros((2, 2))) # Create model weights + opt._create_all_weights(model.weights) + expected_kernel = np.array( + [[9.229685, 10.901115], [10.370763, 9.757362]] + ) + expected_slot = np.array([[10.049943, 9.917691], [10.049943, 9.917691]]) + self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel) + self.assertAllClose( + self.evaluate(opt.get_slot(model.weights[0], "momentum")), + expected_slot, + ) + self.assertEqual(self.evaluate(opt.loss_scale), 32768) + self.assertEqual(self.evaluate(opt.dynamic_counter), 1) + + # Check restoring works even after the model is compiled and the weights + # have been created. + model.fit(np.random.normal(size=(2, 2)), np.random.normal(size=(2, 2))) + self.assertNotAllClose(self.evaluate(model.weights[0]), expected_kernel) + self.assertNotAllClose( + self.evaluate(opt.get_slot(model.weights[0], "momentum")), + expected_slot, + ) + model.load_weights(os.path.join(ckpt_dir, "ckpt")) + self.assertAllClose(self.evaluate(model.weights[0]), expected_kernel) + self.assertAllClose( + self.evaluate(opt.get_slot(model.weights[0], "momentum")), + expected_slot, + ) + self.assertEqual(self.evaluate(opt.loss_scale), 32768) + self.assertEqual(self.evaluate(opt.dynamic_counter), 1) + + def test_restore_old_saved_model(self): + saved_model_dir = os.path.join( + flags.FLAGS["test_srcdir"].value, + "org_keras/keras", + "mixed_precision/testdata/lso_savedmodel_tf2.2", + ) + # saved_model_dir = test.test_src_dir_path( + # 'python/keras/mixed_precision/testdata/' + # 'lso_savedmodel_tf2.2') + model = save.load_model(saved_model_dir) + expected_kernel = np.array( + [[9.229685, 10.901115], [10.370763, 9.757362]] + ) + self.assertAllClose(backend.eval(model.weights[0]), expected_kernel) + self.assertEqual( + type(model.optimizer), loss_scale_optimizer.LossScaleOptimizer + ) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + { + "testcase_name": "base", + "strategy_fn": default_strategy_fn, + }, + { + "testcase_name": "distribute", + "strategy_fn": create_mirrored_strategy, + }, + { + "testcase_name": "base_h5", + "strategy_fn": default_strategy_fn, + "h5": True, + }, + { + "testcase_name": "distribute_h5", + "strategy_fn": create_mirrored_strategy, + "h5": True, + }, + ) + def test_save_model_with_dynamic_loss_scaling(self, strategy_fn, h5=False): + # TODO(reedwm): Support and test saving model with a mixed_[b]float16 + # policy as well. + strategy = strategy_fn() + if ( + isinstance(strategy, tf.distribute.MirroredStrategy) + and not tf.executing_eagerly() + ): + # TODO(b/121381184): Enable running the test in this case. + return + + # Create and run model. + with strategy.scope(): + x = layers.Input(shape=(2,), batch_size=2, dtype=tf.float32) + y = mp_test_util.MultiplyLayer()(x) + model = models.Model(inputs=x, outputs=y) + + # Only test the legacy optimizer. The new optimizer does not + # support saving optimizer weights. 
+ opt = gradient_descent.SGD(1.0) + opt = loss_scale_optimizer.LossScaleOptimizer( + opt, initial_scale=1.0, dynamic_growth_steps=2.0 + ) + model.compile( + optimizer=opt, + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + # Run for 3 steps (6 examples with a batch size of 2) + model.fit(np.ones((6, 2)), np.zeros((6, 2)), batch_size=2) + self.assertEqual(backend.get_value(opt.loss_scale), 2) + self.assertEqual(backend.get_value(opt.dynamic_counter), 1) + (weight,) = model.trainable_weights + orig_weight = backend.get_value(weight) + + # Save model weights. + save_path = os.path.join(self.get_temp_dir(), "model") + model.save(save_path, save_format="h5" if h5 else "tf") + + # Run model again for 1 step (2 examples with a batch size of 2) + model.fit(np.ones((2, 2)), np.zeros((2, 2)), batch_size=2) + new_weight = backend.get_value(weight) + self.assertNotEqual(new_weight, orig_weight) + self.assertEqual(backend.get_value(opt.loss_scale), 4) + self.assertEqual(backend.get_value(opt.dynamic_counter), 0) + + # Load model weights and ensure loss scale weights are restored. + model = save.load_model( + save_path, + custom_objects={"MultiplyLayer": mp_test_util.MultiplyLayer}, + ) + (weight,) = model.trainable_weights + loaded_weight = backend.get_value(weight) + self.assertEqual(loaded_weight, orig_weight) + # Currently the loss scale isn't always saved when the model is saved + # with Model.save(). So we assert the loss scale either has the value + # when it was saved, or the value it was initialized with. + # TODO(reedwm): Always save/restore the loss scale with Model.save(). + self.assertIn(backend.get_value(model.optimizer.loss_scale), (1, 2)) + self.assertIn( + backend.get_value(model.optimizer.dynamic_counter), (0, 1) + ) + + # Test optimizer attributes and type + self.assertEqual(model.optimizer.initial_scale, 1.0) + self.assertEqual(model.optimizer.dynamic_growth_steps, 2.0) + self.assertEqual( + type(model.optimizer), loss_scale_optimizer.LossScaleOptimizer + ) class ApplicationModelTest(test_combinations.TestCase): - """Tests that application models can be built with mixed precision. - - This does not test that such models can be trained in mixed precision, as - doing so takes too much time for a unit test. - """ - - @parameterized.named_parameters( - ('densenet', densenet.DenseNet121), - ('efficientnet', efficientnet.EfficientNetB0), - ('inception_resnet_v2', inception_resnet_v2.InceptionResNetV2), - ('inception_v3', inception_v3.InceptionV3), - ('mobilenet', mobilenet.MobileNet), - ('nasnet', nasnet.NASNetMobile), - ('vgg16', vgg16.VGG16), - ('xception', xception.Xception), - ('resnet50', resnet.ResNet50), - ) - def test_application_model(self, app): - # Run on CPU since model weights may exhaust GPU memory - with policy.policy_scope('mixed_float16'), tf.device('/CPU:0'): - app(weights=None) - - -if __name__ == '__main__': - base_layer_utils.enable_v2_dtype_behavior() - tf.test.main() + """Tests that application models can be built with mixed precision. + + This does not test that such models can be trained in mixed precision, as + doing so takes too much time for a unit test. 
+ """ + + @parameterized.named_parameters( + ("densenet", densenet.DenseNet121), + ("efficientnet", efficientnet.EfficientNetB0), + ("inception_resnet_v2", inception_resnet_v2.InceptionResNetV2), + ("inception_v3", inception_v3.InceptionV3), + ("mobilenet", mobilenet.MobileNet), + ("nasnet", nasnet.NASNetMobile), + ("vgg16", vgg16.VGG16), + ("xception", xception.Xception), + ("resnet50", resnet.ResNet50), + ) + def test_application_model(self, app): + # Run on CPU since model weights may exhaust GPU memory + with policy.policy_scope("mixed_float16"), tf.device("/CPU:0"): + app(weights=None) + + +if __name__ == "__main__": + base_layer_utils.enable_v2_dtype_behavior() + tf.test.main() diff --git a/keras/mixed_precision/policy.py b/keras/mixed_precision/policy.py index 967ffe96c529..faaf9377eea9 100644 --- a/keras/mixed_precision/policy.py +++ b/keras/mixed_precision/policy.py @@ -14,479 +14,542 @@ # ============================================================================== """Contains the Policy class for mixed precision training.""" +import contextlib + import tensorflow.compat.v2 as tf -import contextlib from keras import backend from keras.engine import base_layer_utils from keras.mixed_precision import device_compatibility_check -from keras.utils import generic_utils +from keras.mixed_precision import loss_scale_optimizer +from keras.saving import serialization_lib + +# isort: off from tensorflow.python.util.tf_export import keras_export -# pylint: disable=g-classes-have-attributes -@keras_export('keras.mixed_precision.Policy', v1=[]) +@keras_export("keras.mixed_precision.Policy", v1=[]) class Policy: - """A dtype policy for a Keras layer. - - A dtype policy determines a layer's computation and variable dtypes. Each - layer has a policy. Policies can be passed to the `dtype` argument of layer - constructors, or a global policy can be set with - `tf.keras.mixed_precision.set_global_policy`. - - Args: - name: The policy name, which determines the compute and variable dtypes. Can - be any dtype name, such as `'float32'` or `'float64'`, which causes both - the compute and variable dtypes will be that dtype. Can also be the string - `'mixed_float16'` or `'mixed_bfloat16'`, which causes the compute dtype to - be float16 or bfloat16 and the variable dtype to be float32. - - Typically you only need to interact with dtype policies when using mixed - precision, which is the use of float16 or bfloat16 for computations and - float32 for variables. This is why the term `mixed_precision` appears in the - API name. Mixed precision can be enabled by passing `'mixed_float16'` or - `'mixed_bfloat16'` to `tf.keras.mixed_precision.set_global_policy`. See [the - mixed precision guide](https://www.tensorflow.org/guide/keras/mixed_precision) - for more information on how to use mixed precision. - - >>> tf.keras.mixed_precision.set_global_policy('mixed_float16') - >>> layer1 = tf.keras.layers.Dense(10) - >>> layer1.dtype_policy # `layer1` will automatically use mixed precision - - >>> # Can optionally override layer to use float32 instead of mixed precision. - >>> layer2 = tf.keras.layers.Dense(10, dtype='float32') - >>> layer2.dtype_policy - - >>> # Set policy back to initial float32 for future examples. - >>> tf.keras.mixed_precision.set_global_policy('float32') - - In the example above, passing `dtype='float32'` to the layer is equivalent to - passing `dtype=tf.keras.mixed_precision.Policy('float32')`. 
In general, - passing a dtype policy name to a layer is equivalent to passing the - corresponding policy, so it is never necessary to explicitly construct a - `Policy` object. - - Note: `Model.compile` will automatically wrap an optimizer with a - `tf.keras.mixed_precision.LossScaleOptimizer` if you use the `'mixed_float16'` - policy. If you use a custom training loop instead of calling `Model.compile`, - you should explicitly use a `tf.keras.mixed_precision.LossScaleOptimizer` to - avoid numeric underflow with float16. - - ### How a layer uses its policy's compute dtype - - A layer casts its inputs to its compute dtype. This causes the layer's - computations and output to also be in the compute dtype. For example: - - >>> x = tf.ones((4, 4, 4, 4), dtype='float64') - >>> # `layer`'s policy defaults to float32. - >>> layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2) - >>> layer.compute_dtype # Equivalent to layer.dtype_policy.compute_dtype - 'float32' - >>> # `layer` casts its inputs to its compute dtype and does computations in - >>> # that dtype. - >>> y = layer(x) - >>> y.dtype - tf.float32 - - Note that the base `tf.keras.layers.Layer` class inserts the casts. If - subclassing your own layer, you do not have to insert any casts. - - Currently, only tensors in the first argument to the layer's `call` method are - casted (although this will likely be changed in a future minor release). For - example: - - >>> class MyLayer(tf.keras.layers.Layer): - ... # Bug! `b` will not be casted. - ... def call(self, a, b): - ... return a + 1., b + 1. - >>> a = tf.constant(1., dtype="float32") - >>> b = tf.constant(1., dtype="float32") - >>> layer = MyLayer(dtype="float64") - >>> x, y = layer(a, b) - >>> x.dtype - tf.float64 - >>> y.dtype - tf.float32 - - If writing your own layer with multiple inputs, you should either explicitly - cast other tensors to `self.compute_dtype` in `call` or accept all tensors in - the first argument as a list. - - The casting only occurs in TensorFlow 2. If - `tf.compat.v1.disable_v2_behavior()` has been called, you can enable the - casting behavior with `tf.compat.v1.keras.layers.enable_v2_dtype_behavior()`. - - ### How a layer uses its policy's variable dtype - - The default dtype of variables created by `tf.keras.layers.Layer.add_weight` - is the layer's policy's variable dtype. - - If a layer's compute and variable dtypes differ, `add_weight` will wrap - floating-point variables with a special wrapper called an `AutoCastVariable`. - `AutoCastVariable` is identical to the original variable except it casts - itself to the layer's compute dtype when used within `Layer.call`. This means - if you are writing a layer, you do not have to explicitly cast the variables - to the layer's compute dtype. For example: - - >>> class SimpleDense(tf.keras.layers.Layer): - ... - ... def build(self, input_shape): - ... # With mixed precision, self.kernel is a float32 AutoCastVariable - ... self.kernel = self.add_weight('kernel', (input_shape[-1], 10)) - ... - ... def call(self, inputs): - ... # With mixed precision, self.kernel will be casted to float16 - ... return tf.linalg.matmul(inputs, self.kernel) - ... 
- >>> layer = SimpleDense(dtype='mixed_float16') - >>> y = layer(tf.ones((10, 10))) - >>> y.dtype - tf.float16 - >>> layer.kernel.dtype - tf.float32 - - A layer author can prevent a variable from being wrapped with an - `AutoCastVariable` by passing `experimental_autocast=False` to `add_weight`, - which is useful if the float32 value of the variable must be accessed within - the layer. - - ### How to write a layer that supports mixed precision and float64. - - For the most part, layers will automatically support mixed precision and - float64 without any additional work, due to the fact the base layer - automatically casts inputs, creates variables of the correct type, and in the - case of mixed precision, wraps variables with `AutoCastVariables`. - - The primary case where you need extra work to support mixed precision or - float64 is when you create a new tensor, such as with `tf.ones` or - `tf.random.normal`, In such cases, you must create the tensor of the correct - dtype. For example, if you call `tf.random.normal`, you must pass the compute - dtype, which is the dtype the inputs have been casted to: - - >>> class AddRandom(tf.keras.layers.Layer): - ... - ... def call(self, inputs): - ... # We must pass `dtype=inputs.dtype`, otherwise a TypeError may - ... # occur when adding `inputs` to `rand`. - ... rand = tf.random.normal(shape=inputs.shape, dtype=inputs.dtype) - ... return inputs + rand - >>> layer = AddRandom(dtype='mixed_float16') - >>> y = layer(x) - >>> y.dtype - tf.float16 - - If you did not pass `dtype=inputs.dtype` to `tf.random.normal`, a - `TypeError` would have occurred. This is because the `tf.random.normal`'s - dtype defaults to `"float32"`, but the input dtype is float16. You cannot add - a float32 tensor with a float16 tensor. - """ - - def __init__(self, name): - if isinstance(name, tf.DType): - raise TypeError("'name' must be a string, not a DType. " - "Instead, pass DType.name. Got: %s" % (name.name,)) - elif not isinstance(name, str): - raise TypeError("'name' must be a string, but got: %s" % (name,)) - self._name = name - self._compute_dtype, self._variable_dtype = self._parse_name(name) - if name in ('mixed_float16', 'mixed_bloat16'): - device_compatibility_check.log_device_compatibility_check(name) - - def _parse_name(self, name): - """Parses a Policy name into a compute and variable dtype. + """A dtype policy for a Keras layer. - Args: - name: The name of the policy: + A dtype policy determines a layer's computation and variable dtypes. Each + layer has a policy. Policies can be passed to the `dtype` argument of layer + constructors, or a global policy can be set with + `tf.keras.mixed_precision.set_global_policy`. - Returns: - The (compute_dtype, variable_dtype) pair. + Args: + name: The policy name, which determines the compute and variable dtypes. + Can be any dtype name, such as `'float32'` or `'float64'`, which causes + both the compute and variable dtypes to be that dtype. Can also be the + string `'mixed_float16'` or `'mixed_bfloat16'`, which causes the compute + dtype to be float16 or bfloat16 and the variable dtype to be float32. + + Typically you only need to interact with dtype policies when using mixed + precision, which is the use of float16 or bfloat16 for computations and + float32 for variables. This is why the term `mixed_precision` appears in the + API name. Mixed precision can be enabled by passing `'mixed_float16'` or + `'mixed_bfloat16'` to `tf.keras.mixed_precision.set_global_policy`.
See [the + mixed precision + guide](https://www.tensorflow.org/guide/keras/mixed_precision) for more + information on how to use mixed precision. + + >>> tf.keras.mixed_precision.set_global_policy('mixed_float16') + >>> layer1 = tf.keras.layers.Dense(10) + >>> layer1.dtype_policy # `layer1` will automatically use mixed precision + <Policy "mixed_float16"> + >>> # Can optionally override layer to use float32 + >>> # instead of mixed precision. + >>> layer2 = tf.keras.layers.Dense(10, dtype='float32') + >>> layer2.dtype_policy + <Policy "float32"> + >>> # Set policy back to initial float32 for future examples. + >>> tf.keras.mixed_precision.set_global_policy('float32') + + In the example above, passing `dtype='float32'` to the layer is equivalent + to passing `dtype=tf.keras.mixed_precision.Policy('float32')`. In general, + passing a dtype policy name to a layer is equivalent to passing the + corresponding policy, so it is never necessary to explicitly construct a + `Policy` object. + + Note: `Model.compile` will automatically wrap an optimizer with a + `tf.keras.mixed_precision.LossScaleOptimizer` if you use the + `'mixed_float16'` policy. If you use a custom training loop instead of + calling `Model.compile`, you should explicitly use a + `tf.keras.mixed_precision.LossScaleOptimizer` to avoid numeric underflow + with float16. + + ### How a layer uses its policy's compute dtype + + A layer casts its inputs to its compute dtype. This causes the layer's + computations and output to also be in the compute dtype. For example: + + >>> x = tf.ones((4, 4, 4, 4), dtype='float64') + >>> # `layer`'s policy defaults to float32. + >>> layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2) + >>> layer.compute_dtype # Equivalent to layer.dtype_policy.compute_dtype + 'float32' + >>> # `layer` casts its inputs to its compute dtype and does computations in + >>> # that dtype. + >>> y = layer(x) + >>> y.dtype + tf.float32 + + Note that the base `tf.keras.layers.Layer` class inserts the casts. If + subclassing your own layer, you do not have to insert any casts. + + Currently, only tensors in the first argument to the layer's `call` method + are casted (although this will likely be changed in a future minor release). + For example: + + >>> class MyLayer(tf.keras.layers.Layer): + ... # Bug! `b` will not be casted. + ... def call(self, a, b): + ... return a + 1., b + 1. + >>> a = tf.constant(1., dtype="float32") + >>> b = tf.constant(1., dtype="float32") + >>> layer = MyLayer(dtype="float64") + >>> x, y = layer(a, b) + >>> x.dtype + tf.float64 + >>> y.dtype + tf.float32 + + If writing your own layer with multiple inputs, you should either explicitly + cast other tensors to `self.compute_dtype` in `call` or accept all tensors + in the first argument as a list. + + The casting only occurs in TensorFlow 2. If + `tf.compat.v1.disable_v2_behavior()` has been called, you can enable the + casting behavior with + `tf.compat.v1.keras.layers.enable_v2_dtype_behavior()`. + + ### How a layer uses its policy's variable dtype + + The default dtype of variables created by `tf.keras.layers.Layer.add_weight` + is the layer's policy's variable dtype. + + If a layer's compute and variable dtypes differ, `add_weight` will wrap + floating-point variables with a special wrapper called an + `AutoCastVariable`. `AutoCastVariable` is identical to the original + variable except it casts itself to the layer's compute dtype when used + within `Layer.call`. This means if you are writing a layer, you do not have + to explicitly cast the variables to the layer's compute dtype.
For example: + + >>> class SimpleDense(tf.keras.layers.Layer): + ... + ... def build(self, input_shape): + ... # With mixed precision, self.kernel is a float32 AutoCastVariable + ... self.kernel = self.add_weight('kernel', (input_shape[-1], 10)) + ... + ... def call(self, inputs): + ... # With mixed precision, self.kernel will be casted to float16 + ... return tf.linalg.matmul(inputs, self.kernel) + ... + >>> layer = SimpleDense(dtype='mixed_float16') + >>> y = layer(tf.ones((10, 10))) + >>> y.dtype + tf.float16 + >>> layer.kernel.dtype + tf.float32 + + A layer author can prevent a variable from being wrapped with an + `AutoCastVariable` by passing `experimental_autocast=False` to `add_weight`, + which is useful if the float32 value of the variable must be accessed within + the layer. + + ### How to write a layer that supports mixed precision and float64. + + For the most part, layers will automatically support mixed precision and + float64 without any additional work, due to the fact the base layer + automatically casts inputs, creates variables of the correct type, and in + the case of mixed precision, wraps variables with `AutoCastVariables`. + + The primary case where you need extra work to support mixed precision or + float64 is when you create a new tensor, such as with `tf.ones` or + `tf.random.normal`. In such cases, you must create the tensor of the correct + dtype. For example, if you call `tf.random.normal`, you must pass the + compute dtype, which is the dtype the inputs have been casted to: + + >>> class AddRandom(tf.keras.layers.Layer): + ... + ... def call(self, inputs): + ... # We must pass `dtype=inputs.dtype`, otherwise a TypeError may + ... # occur when adding `inputs` to `rand`. + ... rand = tf.random.normal(shape=inputs.shape, dtype=inputs.dtype) + ... return inputs + rand + >>> layer = AddRandom(dtype='mixed_float16') + >>> y = layer(x) + >>> y.dtype + tf.float16 + + If you did not pass `dtype=inputs.dtype` to `tf.random.normal`, a + `TypeError` would have occurred. This is because the `tf.random.normal`'s + dtype defaults to `"float32"`, but the input dtype is float16. You cannot + add a float32 tensor with a float16 tensor. """
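To make the docstring's advice concrete, a common pattern from the mixed precision guide is to run most layers under the mixed policy but keep numerically sensitive outputs in float32; the model below is an illustrative sketch, not code from this patch:

import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')
inputs = tf.keras.Input(shape=(8,))
x = tf.keras.layers.Dense(16, activation='relu')(inputs)  # float16 compute
# Softmax can overflow/underflow in float16, so force float32 here.
outputs = tf.keras.layers.Activation('softmax', dtype='float32')(x)
model = tf.keras.Model(inputs, outputs)
tf.keras.mixed_precision.set_global_policy('float32')  # restore the default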
- return None, None - try: - dtype = tf.as_dtype(name).name - except TypeError: - error = ("Cannot convert value %s to a mixed precision Policy. " - "Valid policies include 'mixed_float16', 'mixed_bfloat16', " - "and the name of any dtype such as 'float32'." % (name,)) - raise ValueError(error) - return dtype, dtype + def __init__(self, name): + if isinstance(name, tf.DType): + raise TypeError( + "'name' must be a string, not a DType. " + f"Instead, pass DType.name. Received: name={name.name}" + ) + elif not isinstance(name, str): + raise TypeError(f"'name' must be a string, but got: {name}") + self._name = name + self._compute_dtype, self._variable_dtype = self._parse_name(name) + if name in ("mixed_float16", "mixed_bfloat16"): + device_compatibility_check.log_device_compatibility_check(name) + + def _parse_name(self, name): + """Parses a Policy name into a compute and variable dtype. + + Args: + name: The name of the policy: + + Returns: + The (compute_dtype, variable_dtype) pair. + """ + if name.endswith("_float32_vars"): + error_msg = ( + "Policies ending in '_float32_vars' have been removed " + "from TensorFlow." + ) + if name in ("infer_float32_vars", "infer_with_float32_vars"): + error_msg += ( + " Please use the 'mixed_float16' or 'mixed_bfloat16' " + "policy instead." + ) + elif name == "float16_with_float32_vars": + error_msg += " Please use the 'mixed_float16' policy instead." + elif name == "bfloat16_with_float32_vars": + error_msg += " Please use the 'mixed_bfloat16' policy instead." + error_msg += f" Got policy name: '{name}'" + raise ValueError(error_msg) + + if name == "mixed_float16": + return "float16", "float32" + elif name == "mixed_bfloat16": + return "bfloat16", "float32" + elif name == "_infer": + # The "_infer" policy exists only for compatibility with TF 1, where + # "_infer" is the default. The behavior matches the behavior of TF + # 1's behavior before policies were introduced. With "_infer", the + # computation and variable dtype are inferred from the first input + # the first time the layer is called. Once the layer is called for + # the first time, the layer's policy will change to the dtype of the + # first input, and it will no longer have the "_infer" policy. + # + # The infer policy should be considered an implementation detail and + # may be removed in the future. + return None, None + + try: + dtype = tf.as_dtype(name).name + except TypeError: + raise ValueError( + f"Cannot convert value {name} to a mixed precision Policy. " + "Valid policies include 'mixed_float16', 'mixed_bfloat16', " + "and the name of any dtype such as 'float32'." + ) + return dtype, dtype + + @property + def variable_dtype(self): + """The variable dtype of this policy. + + This is the dtype layers will create their variables in, unless a layer + explicitly chooses a different dtype. If this is different than + `Policy.compute_dtype`, Layers will cast variables to the compute dtype + to avoid type errors. + + Variable regularizers are run in the variable dtype, not the compute + dtype. + + Returns: + The variable dtype of this policy, as a string. + """ + return self._variable_dtype + + @property + def compute_dtype(self): + """The compute dtype of this policy. + + This is the dtype layers will do their computations in. Typically layers + output tensors with the compute dtype as well. 
+ + Note that even if the compute dtype is float16 or bfloat16, hardware + devices may not do individual adds, multiplies, and other fundamental + operations in float16 or bfloat16, but instead may do some of them in + float32 for numeric stability. The compute dtype is the dtype of the + inputs and outputs of the TensorFlow ops that the layer executes. + Internally, many TensorFlow ops will do certain internal calculations in + float32 or some other device-internal intermediate format with higher + precision than float16/bfloat16, to increase numeric stability. + + For example, a `tf.keras.layers.Dense` layer, when run on a GPU with a + float16 compute dtype, will pass float16 inputs to `tf.linalg.matmul`. + But, `tf.linalg.matmul` will use float32 intermediate math. The + performance benefit of float16 is still apparent, due to increased + memory bandwidth and the fact modern GPUs have specialized hardware for + computing matmuls on float16 inputs while still keeping intermediate + computations in float32. + + Returns: + The compute dtype of this policy, as a string. + """ + return self._compute_dtype + + @property + def name(self): + """Returns the name of this policy.""" + return self._name + + def __repr__(self): + return f'<Policy "{self._name}">' + + def get_config(self): + return {"name": self.name} + + @classmethod + def from_config(cls, config, custom_objects=None): + del custom_objects + if "loss_scale" in config: + config = config.copy() + # Policy.get_config in TensorFlow 2.3 and below had a loss_scale. We + # silently drop it. + del config["loss_scale"] + return cls(**config) + + +# The current global policy in effect. If None, it means the current value of +# floatx should be used as the policy if the V2 dtype behavior is enabled, +# or "_infer" otherwise. +# TODO(reedwm): Make this thread local? +_global_policy = None - @property - def variable_dtype(self): - """The variable dtype of this policy. - This is the dtype layers will create their variables in, unless a layer - explicitly chooses a different dtype. If this is different than - `Policy.compute_dtype`, Layers will cast variables to the compute dtype to - avoid type errors. +@keras_export("keras.mixed_precision.global_policy", v1=[]) +def global_policy(): + """Returns the global dtype policy. - Variable regularizers are run in the variable dtype, not the compute dtype. + The global policy is the default `tf.keras.mixed_precision.Policy` used for + layers, if no policy is passed to the layer constructor. If no policy has + been set with `keras.mixed_precision.set_global_policy`, this will return a + policy constructed from `tf.keras.backend.floatx()` (floatx defaults to + float32).
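A quick sketch of inspecting the two dtypes carried by a mixed policy, matching the properties defined above (the output comments assume a fresh session with the default float32 global policy):

import tensorflow as tf

p = tf.keras.mixed_precision.Policy('mixed_float16')
print(p.compute_dtype)    # float16: dtype of the layer's computations
print(p.variable_dtype)   # float32: dtype of the layer's weights
print(tf.keras.mixed_precision.global_policy())  # <Policy "float32">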
- - For example, a `tf.keras.layers.Dense` layer, when run on a GPU with a - float16 compute dtype, will pass float16 inputs to `tf.linalg.matmul`. But, - `tf.linalg.matmul` will do use float32 intermediate math. The performance - benefit of float16 is still apparent, due to increased memory bandwidth and - the fact modern GPUs have specialized hardware for computing matmuls on - float16 inputs while still keeping intermediate computations in float32. + >>> tf.keras.mixed_precision.global_policy() + <Policy "float32"> + >>> tf.keras.layers.Dense(10).dtype_policy # Defaults to the global policy + <Policy "float32"> + + If TensorFlow 2 behavior has been disabled with + `tf.compat.v1.disable_v2_behavior()`, this will instead return a special + "_infer" policy which infers the dtype from the dtype of the first input the + first time the layer is called. This behavior matches the behavior that + existed in TensorFlow 1. + + See `tf.keras.mixed_precision.Policy` for more information on policies. Returns: - The compute dtype of this policy, as a string. + The global Policy. """ - return self._compute_dtype - - @property - def name(self): - """Returns the name of this policy.""" - return self._name + if _global_policy is None: + if base_layer_utils.v2_dtype_behavior_enabled(): + return Policy(backend.floatx()) + else: + return Policy("_infer") + return _global_policy - def __repr__(self): - return '<Policy "%s">' % self._name - def get_config(self): - return {'name': self.name} +def _check_if_mixed_precision_graph_rewrite_is_enabled(policy): + if tf.__internal__.train.is_mixed_precision_graph_rewrite_enabled(): + raise ValueError( + 'The global dtype policy cannot be set to "{policy.name}", because ' + "the mixed precision graph rewrite has already been enabled.\n" + "At most, one of the following can be called:\n\n" + " 1. tf.compat.v1.train.enable_mixed_precision_graph_rewrite() " + "(You called this first)\n" + " 2. tf.keras.mixed_precision.set_global_policy() with a mixed " + "precision policy (You called this second)\n\n" + "You called both functions, which is an error, because both " + "functions enable you to use mixed precision. If in doubt which " + "function to use, use the second, as it supports Eager execution " + "and is more customizable.".format(policy=policy) + ) + + +@keras_export("keras.mixed_precision.set_global_policy", v1=[]) +def set_global_policy(policy): + """Sets the global dtype policy. - @classmethod - def from_config(cls, config, custom_objects=None): - del custom_objects - if 'loss_scale' in config: - config = config.copy() - # Policy.get_config in TensorFlow 2.3 and below had a loss_scale. We - # silently drop it. - del config['loss_scale'] - return cls(**config) + The global policy is the default `tf.keras.mixed_precision.Policy` used for + layers, if no policy is passed to the layer constructor. + >>> tf.keras.mixed_precision.set_global_policy('mixed_float16') + >>> tf.keras.mixed_precision.global_policy() + <Policy "mixed_float16"> + >>> tf.keras.layers.Dense(10).dtype_policy + <Policy "mixed_float16"> + >>> # Global policy is not used if a policy + >>> # is directly passed to constructor + >>> tf.keras.layers.Dense(10, dtype='float64').dtype_policy + <Policy "float64"> + >>> tf.keras.mixed_precision.set_global_policy('float32') -# The current global policy in effect. If None, it means the current value of -# floatx should be used as the policy if the V2 dtype behavior is enabled, -# or "_infer" otherwise. -# TODO(reedwm): Make this thread local?
-_global_policy = None + If no global policy is set, layers will instead default to a Policy + constructed from `tf.keras.backend.floatx()`. + To use mixed precision, the global policy should be set to `'mixed_float16'` + or `'mixed_bfloat16'`, so that every layer uses a 16-bit compute dtype and + float32 variable dtype by default. -@keras_export('keras.mixed_precision.global_policy', v1=[]) -def global_policy(): - """Returns the global dtype policy. - - The global policy is the default `tf.keras.mixed_precision.Policy` used for - layers, if no policy is passed to the layer constructor. If no policy has been - set with `keras.mixed_precision.set_global_policy`, this will return a policy - constructed from `tf.keras.backend.floatx()` (floatx defaults to float32). - - >>> tf.keras.mixed_precision.global_policy() - - >>> tf.keras.layers.Dense(10).dtype_policy # Defaults to the global policy - - - If TensorFlow 2 behavior has been disabled with - `tf.compat.v1.disable_v2_behavior()`, this will instead return a special - "_infer" policy which infers the dtype from the dtype of the first input the - first time the layer is called. This behavior matches the behavior that - existed in TensorFlow 1. - - See `tf.keras.mixed_precision.Policy` for more information on policies. - - Returns: - The global Policy. - """ - if _global_policy is None: - if base_layer_utils.v2_dtype_behavior_enabled(): - return Policy(backend.floatx()) - else: - return Policy('_infer') - return _global_policy + Only floating point policies can be set as the global policy, such as + `'float32'` and `'mixed_float16'`. Non-floating point policies such as + `'int32'` and `'complex64'` cannot be set as the global policy because most + layers do not support such policies. + See `tf.keras.mixed_precision.Policy` for more information. -def _check_if_mixed_precision_graph_rewrite_is_enabled(policy): - if tf.__internal__.train.is_mixed_precision_graph_rewrite_enabled(): - raise ValueError( - 'The global dtype policy cannot be set to "{policy.name}", because the ' - 'mixed precision graph rewrite has already been enabled.\n' - 'At most, one of the following can be called:\n\n' - ' 1. tf.compat.v1.train.enable_mixed_precision_graph_rewrite() ' - '(You called this first)\n' - ' 2. tf.keras.mixed_precision.set_global_policy() with a mixed ' - 'precision policy (You called this second)\n\n' - 'You called both functions, which is an error, because both functions ' - 'enable you to use mixed precision. If in doubt which function to use, ' - 'use the second, as it supports Eager execution and is more ' - 'customizable.'.format(policy=policy)) - - -@keras_export('keras.mixed_precision.set_global_policy', v1=[]) -def set_global_policy(policy): - """Sets the global dtype policy. - - The global policy is the default `tf.keras.mixed_precision.Policy` used for - layers, if no policy is passed to the layer constructor. - - >>> tf.keras.mixed_precision.set_global_policy('mixed_float16') - >>> tf.keras.mixed_precision.global_policy() - - >>> tf.keras.layers.Dense(10).dtype_policy - - >>> # Global policy is not used if a policy is directly passed to constructor - >>> tf.keras.layers.Dense(10, dtype='float64').dtype_policy - - >>> tf.keras.mixed_precision.set_global_policy('float32') - - If no global policy is set, layers will instead default to a Policy - constructed from `tf.keras.backend.floatx()`. 
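The floating-point-only restriction described in the docstring above can be demonstrated directly; a small sketch:

import tensorflow as tf

try:
    tf.keras.mixed_precision.set_global_policy('int32')
except ValueError as e:
    # "set_global_policy can only be used to set the global policy to
    # floating-point policies, such as "float32" and "mixed_float16" ..."
    print(e)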
- - To use mixed precision, the global policy should be set to `'mixed_float16'` - or `'mixed_bfloat16'`, so that every layer uses a 16-bit compute dtype and - float32 variable dtype by default. - - Only floating point policies can be set as the global policy, such as - `'float32'` and `'mixed_float16'`. Non-floating point policies such as - `'int32'` and `'complex64'` cannot be set as the global policy because most - layers do not support such policies. - - See `tf.keras.mixed_precision.Policy` for more information. - - Args: - policy: A Policy, or a string that will be converted to a Policy. Can also - be None, in which case the global policy will be constructed from - `tf.keras.backend.floatx()` - """ - global _global_policy - if not base_layer_utils.v2_dtype_behavior_enabled(): - raise ValueError('The global policy can only be set in TensorFlow 2 or if ' - 'V2 dtype behavior has been set. To enable V2 dtype ' - 'behavior, call ' - '"tf.compat.v1.keras.layers.enable_v2_dtype_behavior()"') - if policy is not None and not isinstance(policy, Policy): - policy = Policy(policy) - is_mixed_policy = (policy is not None and - policy.compute_dtype != policy.variable_dtype) - if is_mixed_policy: - _check_if_mixed_precision_graph_rewrite_is_enabled(policy) - if (policy is not None and policy.compute_dtype is not None and - not tf.as_dtype(policy.compute_dtype).is_floating): - raise ValueError('set_global_policy can only be used to set the global ' - 'policy to floating-point policies, such as "float32" and ' - '"mixed_float16", but got policy: %s' - % (policy.name,)) - _global_policy = policy - tf.__internal__.train.set_using_mixed_precision_policy(is_mixed_policy) + Args: + policy: A Policy, or a string that will be converted to a Policy. Can also + be None, in which case the global policy will be constructed from + `tf.keras.backend.floatx()` + """ + global _global_policy + if not base_layer_utils.v2_dtype_behavior_enabled(): + raise ValueError( + "The global policy can only be set in TensorFlow 2 or if " + "V2 dtype behavior has been set. To enable V2 dtype " + "behavior, call " + '"tf.compat.v1.keras.layers.enable_v2_dtype_behavior()"' + ) + if policy is not None and not isinstance(policy, Policy): + policy = Policy(policy) + is_mixed_policy = ( + policy is not None and policy.compute_dtype != policy.variable_dtype + ) + if is_mixed_policy: + _check_if_mixed_precision_graph_rewrite_is_enabled(policy) + if ( + policy is not None + and policy.compute_dtype is not None + and not tf.as_dtype(policy.compute_dtype).is_floating + ): + raise ValueError( + "set_global_policy can only be used to set the global " + 'policy to floating-point policies, such as "float32" and ' + f'"mixed_float16", but got policy: {policy.name}' + ) + _global_policy = policy + tf.__internal__.train.set_using_mixed_precision_policy(is_mixed_policy) # TODO(reedwm): Make this thread local @contextlib.contextmanager def policy_scope(policy): - """A context manager that sets the global Policy under it. + """A context manager that sets the global Policy under it. - Args: - policy: A Policy, or a string that will be converted to a Policy.. + Args: + policy: A Policy, or a string that will be converted to a Policy. - Yields: - Nothing. + + Yields: + Nothing.
+ """ + old_policy = _global_policy + try: + set_global_policy(policy) + yield + finally: + set_global_policy(old_policy) + + +def get_policy(identifier): + if isinstance(identifier, Policy): + dtype_policy = identifier + elif isinstance(identifier, dict): + dtype_policy = deserialize(identifier) + elif isinstance(identifier, str) and identifier in ( + "mixed_float16", + "mixed_bfloat16", + ): + # The isinstance check is required since np.dtype raises an error if + # compared to a non-dtype string. + dtype_policy = Policy(identifier) + elif identifier: + dtype_policy = Policy(tf.as_dtype(identifier).name) + else: + dtype_policy = global_policy() + if ( + dtype_policy.name == "mixed_float16" + and not loss_scale_optimizer.strategy_supports_loss_scaling() + ): + # Although only loss scaling doesn't support certain strategies, to + # avoid confusion, we disallow the 'mixed_float16' policy with + # unsupported strategies. This is because 'mixed_float16' requires + # loss scaling for numeric stability. + strategy = tf.distribute.get_strategy() + raise ValueError( + "Mixed precision is not supported with the " + f"tf.distribute.Strategy: {strategy.__class__.__name__}. " + "Either stop using mixed precision by removing the use of " + f"the {dtype_policy.name} policy or " + "use a different Strategy, e.g. a MirroredStrategy." + ) + return dtype_policy def _is_convertible_to_dtype(dtype): - try: - tf.as_dtype(dtype) - return True - except TypeError: - return False + try: + tf.as_dtype(dtype) + return True + except TypeError: + return False def _policy_equivalent_to_dtype(policy): - """Returns True if the Policy is equivalent to a single dtype. + """Returns True if the Policy is equivalent to a single dtype. - A policy is equivalent to a single dtype if the policy's compute and variable - dtypes are the same and the policy's type is Policy and not a subclass of - Policy. + A policy is equivalent to a single dtype if the policy's compute and + variable dtypes are the same and the policy's type is Policy and not a + subclass of Policy. - The "_infer" policy is considered equivalent to a single dtype. + The "_infer" policy is considered equivalent to a single dtype. - Args: - policy: A Policy. + Args: + policy: A Policy. - Returns: - True, if the policy is equivalent to a single dtype. - """ - # We use type() instead of isinstance because a subclass of Policy is never - # equivalent to a dtype. - return (type(policy) == Policy and # pylint: disable=unidiomatic-typecheck - (policy.name == '_infer' or _is_convertible_to_dtype(policy.name))) + Returns: + True, if the policy is equivalent to a single dtype. + """ + # We use type() instead of isinstance because a subclass of Policy is never + # equivalent to a dtype. + return type(policy) == Policy and ( + policy.name == "_infer" or _is_convertible_to_dtype(policy.name) + ) def serialize(policy): - if _policy_equivalent_to_dtype(policy): - # We return either None or the policy name for compatibility with older - # versions of Keras. If the policy name is returned, it is a dtype string - # such as 'float32'. - return None if policy.name == '_infer' else policy.name - return generic_utils.serialize_keras_object(policy) + if _policy_equivalent_to_dtype(policy): + # We return either None or the policy name for compatibility with older + # versions of Keras. If the policy name is returned, it is a dtype + # string such as 'float32'. 
+ return None if policy.name == "_infer" else policy.name + return serialization_lib.serialize_keras_object(policy) def deserialize(config, custom_objects=None): - if isinstance(config, str) and _is_convertible_to_dtype(config): - return Policy(config) - if config is None: - return Policy('_infer') - # PolicyV1 was an old version of Policy that was removed. Deserializing it - # turns it into a (non-V1) Policy. - module_objects = {'Policy': Policy, 'PolicyV1': Policy} - return generic_utils.deserialize_keras_object( - config, - module_objects=module_objects, - custom_objects=custom_objects, - printable_module_name='dtype policy') + if isinstance(config, str) and _is_convertible_to_dtype(config): + return Policy(config) + if config is None: + return Policy("_infer") + # PolicyV1 was an old version of Policy that was removed. Deserializing it + # turns it into a (non-V1) Policy. + module_objects = {"Policy": Policy, "PolicyV1": Policy} + return serialization_lib.deserialize_keras_object( + config, + module_objects=module_objects, + custom_objects=custom_objects, + printable_module_name="dtype policy", + ) diff --git a/keras/mixed_precision/policy_test.py b/keras/mixed_precision/policy_test.py index 7632966a4309..5131ce085b7e 100644 --- a/keras/mixed_precision/policy_test.py +++ b/keras/mixed_precision/policy_test.py @@ -15,237 +15,300 @@ """Tests Policies.""" import tensorflow.compat.v2 as tf - from absl.testing import parameterized -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils + from keras.engine import base_layer_utils from keras.mixed_precision import device_compatibility_check from keras.mixed_precision import policy as mp_policy -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + +# isort: off from tensorflow.python.platform import tf_logging -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class PolicyTest(tf.test.TestCase, parameterized.TestCase): - """Tests Policies.""" - - @test_utils.enable_v2_dtype_behavior - def test_dtype_attributes(self): - for dtype in 'int32', 'bool', 'float16', 'float32': - policy = mp_policy.Policy(dtype) - self.assertEqual(policy.name, dtype) - self.assertEqual(policy.compute_dtype, dtype) - self.assertEqual(policy.variable_dtype, dtype) - - for dtype in 'float16', 'bfloat16': - policy = mp_policy.Policy('mixed_' + dtype) - self.assertEqual(policy.name, 'mixed_' + dtype) - self.assertEqual(policy.compute_dtype, dtype) - self.assertEqual(policy.variable_dtype, 'float32') - - policy = mp_policy.Policy('_infer') - self.assertEqual(policy.compute_dtype, None) - self.assertEqual(policy.variable_dtype, None) - - @test_utils.enable_v2_dtype_behavior - def test_repr(self): - # Test Policy repr - for policy in ('float32', 'int8', 'mixed_float16', 'mixed_bfloat16', - '_infer'): - self.assertEqual(repr(mp_policy.Policy(policy)), - '<Policy "%s">' % policy) - - @test_utils.enable_v2_dtype_behavior - def test_policy_errors(self): - # Test passing invalid strings - - with self.assertRaisesRegex( - ValueError, 'Cannot convert value abc to a mixed precision Policy.'): - mp_policy.Policy('abc') - - # Test passing a DType - with self.assertRaisesRegex( - TypeError, "'name' must be a string, not a DType. " 'Instead, pass DType.name. 
Got: float16'): - mp_policy.Policy(tf.float16) - - # Test passing a non-DType invalid type - with self.assertRaisesRegex(TypeError, - "'name' must be a string, but got: 5"): - mp_policy.Policy(5) - - # Test passing a now-removed policy ending in float32_vars - with self.assertRaisesRegex( - ValueError, 'Policies ending in \'_float32_vars\' have been removed ' - 'from TensorFlow. Please use the \'mixed_float16\' or ' - '\'mixed_bfloat16\' policy instead. Got policy name: ' - '\'infer_float32_vars\''): - mp_policy.Policy('infer_float32_vars') - with self.assertRaisesRegex( - ValueError, 'Policies ending in \'_float32_vars\' have been removed ' - 'from TensorFlow. Please use the \'mixed_float16\' policy ' - 'instead. Got policy name: \'float16_with_float32_vars\''): - mp_policy.Policy('float16_with_float32_vars') - with self.assertRaisesRegex( - ValueError, 'Policies ending in \'_float32_vars\' have been removed ' - 'from TensorFlow. Please use the \'mixed_bfloat16\' policy ' - 'instead. Got policy name: \'bfloat16_with_float32_vars\''): - mp_policy.Policy('bfloat16_with_float32_vars') - with self.assertRaisesRegex( - ValueError, 'Policies ending in \'_float32_vars\' have been removed ' - 'from TensorFlow. Got policy name: ' - '\'int8_with_float32_vars\''): - mp_policy.Policy('int8_with_float32_vars') - - @test_utils.enable_v2_dtype_behavior - def test_global_policy(self): - if base_layer_utils.v2_dtype_behavior_enabled(): - default_policy = 'float32' - else: - default_policy = '_infer' - self.assertEqual(mp_policy.global_policy().name, default_policy) - try: - mp_policy.set_global_policy('mixed_float16') - self.assertEqual(mp_policy.global_policy().name, 'mixed_float16') - with tf.Graph().as_default(): # Policies are not associated with a graph - self.assertEqual(mp_policy.global_policy().name, 'mixed_float16') - mp_policy.set_global_policy('_infer') - self.assertEqual(mp_policy.global_policy().name, '_infer') - policy = mp_policy.Policy('mixed_bfloat16') - mp_policy.set_global_policy(policy) - self.assertIs(mp_policy.global_policy(), policy) - finally: - mp_policy.set_global_policy(None) - - @test_utils.enable_v2_dtype_behavior - def test_global_policy_dtype_error(self): - with self.assertRaisesRegex( - ValueError, - 'set_global_policy can only be used to set the global policy to ' - 'floating-point policies, such as "float32" and "mixed_float16", but ' - 'got policy: int32'): - mp_policy.set_global_policy('int32') - with self.assertRaisesRegex( - ValueError, - 'set_global_policy can only be used to set the global policy to ' - 'floating-point policies, such as "float32" and "mixed_float16", but ' - 'got policy: complex64'): - mp_policy.set_global_policy(mp_policy.Policy('complex64')) - - @test_utils.enable_v2_dtype_behavior - def test_device_compatibility_warning(self): - if not tf.executing_eagerly(): - self.skipTest('Run in eager mode only.') - - device_compatibility_check._logged_compatibility_check = False - with tf.compat.v1.test.mock.patch.object(tf_logging, 'warning') as mock_warn: - mp_policy.Policy('mixed_float16') - if tf.config.list_physical_devices('GPU'): - mock_warn.assert_not_called() - else: - self.assertRegex( - mock_warn.call_args[0][0], - r'Mixed precision compatibility check \(mixed_float16\): WARNING.*') - - if tf.config.list_physical_devices('GPU'): - # Assert message is only logged once - with tf.compat.v1.test.mock.patch.object(tf_logging, 'warning') as mock_warn: - mp_policy.Policy('mixed_float16') - mock_warn.assert_not_called() - - 
@test_utils.enable_v2_dtype_behavior - def test_policy_scope(self): - if base_layer_utils.v2_dtype_behavior_enabled(): - default_policy = 'float32' - else: - default_policy = '_infer' - with mp_policy.policy_scope('mixed_float16'): - self.assertEqual(mp_policy.global_policy().name, 'mixed_float16') - with mp_policy.policy_scope('_infer'): - self.assertEqual(mp_policy.global_policy().name, '_infer') - self.assertEqual(mp_policy.global_policy().name, 'mixed_float16') - self.assertEqual(mp_policy.global_policy().name, default_policy) - - @test_utils.enable_v2_dtype_behavior - def test_config(self): - for policy in ( - mp_policy.Policy('float16'), - mp_policy.Policy('float32'), - mp_policy.Policy('int16'), - mp_policy.Policy('mixed_float16'), - mp_policy.Policy('mixed_bfloat16'), - mp_policy.Policy('_infer'), - ): - config = policy.get_config() - new_policy = mp_policy.Policy.from_config(config) - # Comparing strings is the easiest way to ensure the policies are the - # same, as policy does not override the == operator. - self.assertEqual(str(policy), str(new_policy)) - - @test_utils.enable_v2_dtype_behavior - def test_serialization(self): - # Test policies that are equivalent to a single dtype - for policy_name in 'float16', 'float32', 'int8', 'string', 'bool': - policy = mp_policy.Policy(policy_name) - config = mp_policy.serialize(policy) - self.assertEqual(config, policy_name) - new_policy = mp_policy.deserialize(config) - self.assertEqual(str(policy), str(new_policy)) - - # Test "_infer" policy - policy = mp_policy.Policy('_infer') - config = mp_policy.serialize(policy) - self.assertIsNone(config) - new_policy = mp_policy.deserialize(config) - self.assertEqual(str(policy), str(new_policy)) - - class MyPolicy(mp_policy.Policy): - pass - - # Test policies that are not equivalent to a single dtype - for policy in ( - mp_policy.Policy('mixed_float16'), - mp_policy.Policy('mixed_bfloat16'), - MyPolicy('float32') - ): - config = mp_policy.serialize(policy) - self.assertEqual(config, {'class_name': policy.__class__.__name__, - 'config': {'name': policy.name}}) - new_policy = mp_policy.deserialize(config, - custom_objects={'MyPolicy': MyPolicy}) - self.assertEqual(str(policy), str(new_policy)) - - @test_utils.enable_v2_dtype_behavior - def test_error_if_graph_rewrite_enabled(self): - try: - tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - gradient_descent.SGD(1.)) - with self.assertRaisesRegex( - ValueError, 'cannot be set to "mixed_float16", .* the mixed ' - 'precision graph rewrite has already been enabled'): - mp_policy.set_global_policy('mixed_float16') - with mp_policy.policy_scope('float64'): - pass # Non-mixed policies are allowed - finally: - tf.compat.v1.mixed_precision.disable_mixed_precision_graph_rewrite() - - @test_utils.disable_v2_dtype_behavior - def test_v1_dtype_behavior(self): - # Setting global policies are not allowed with V1 dtype behavior - with self.assertRaisesRegex( - ValueError, 'global policy can only be set in TensorFlow 2'): - with mp_policy.policy_scope(mp_policy.Policy('_infer')): - pass - with self.assertRaisesRegex( - ValueError, 'global policy can only be set in TensorFlow 2'): - with mp_policy.policy_scope(mp_policy.Policy('float32')): - pass - with self.assertRaisesRegex( - ValueError, 'global policy can only be set in TensorFlow 2'): - with mp_policy.policy_scope(mp_policy.Policy('mixed_float16')): - pass - - -if __name__ == '__main__': - tf.test.main() + """Tests Policies.""" + + @test_utils.enable_v2_dtype_behavior + def 
test_dtype_attributes(self): + for dtype in "int32", "bool", "float16", "float32": + policy = mp_policy.Policy(dtype) + self.assertEqual(policy.name, dtype) + self.assertEqual(policy.compute_dtype, dtype) + self.assertEqual(policy.variable_dtype, dtype) + + for dtype in "float16", "bfloat16": + policy = mp_policy.Policy("mixed_" + dtype) + self.assertEqual(policy.name, "mixed_" + dtype) + self.assertEqual(policy.compute_dtype, dtype) + self.assertEqual(policy.variable_dtype, "float32") + + policy = mp_policy.Policy("_infer") + self.assertEqual(policy.compute_dtype, None) + self.assertEqual(policy.variable_dtype, None) + + @test_utils.enable_v2_dtype_behavior + def test_repr(self): + # Test Policy repr + for policy in ( + "float32", + "int8", + "mixed_float16", + "mixed_bfloat16", + "_infer", + ): + self.assertEqual( + repr(mp_policy.Policy(policy)), f'<Policy "{policy}">' + ) + + @test_utils.enable_v2_dtype_behavior + def test_policy_errors(self): + # Test passing invalid strings + + with self.assertRaisesRegex( + ValueError, "Cannot convert value abc to a mixed precision Policy." + ): + mp_policy.Policy("abc") + + # Test passing a DType + with self.assertRaisesRegex( + TypeError, "'name' must be a string, not a DType. " + ): + mp_policy.Policy(tf.float16) + + # Test passing a non-DType invalid type + with self.assertRaisesRegex( + TypeError, "'name' must be a string, but got: 5" + ): + mp_policy.Policy(5) + + # Test passing a now-removed policy ending in float32_vars + with self.assertRaisesRegex( + ValueError, + "Policies ending in '_float32_vars' have been removed " + "from TensorFlow. Please use the 'mixed_float16' or " + "'mixed_bfloat16' policy instead. Got policy name: " + "'infer_float32_vars'", + ): + mp_policy.Policy("infer_float32_vars") + with self.assertRaisesRegex( + ValueError, + "Policies ending in '_float32_vars' have been removed " + "from TensorFlow. Please use the 'mixed_float16' policy " + "instead. Got policy name: 'float16_with_float32_vars'", + ): + mp_policy.Policy("float16_with_float32_vars") + with self.assertRaisesRegex( + ValueError, + "Policies ending in '_float32_vars' have been removed " + "from TensorFlow. Please use the 'mixed_bfloat16' policy " + "instead. Got policy name: 'bfloat16_with_float32_vars'", + ): + mp_policy.Policy("bfloat16_with_float32_vars") + with self.assertRaisesRegex( + ValueError, + "Policies ending in '_float32_vars' have been removed " + "from TensorFlow. 
Got policy name: " + "'int8_with_float32_vars'", + ): + mp_policy.Policy("int8_with_float32_vars") + + @test_utils.enable_v2_dtype_behavior + def test_global_policy(self): + if base_layer_utils.v2_dtype_behavior_enabled(): + default_policy = "float32" + else: + default_policy = "_infer" + self.assertEqual(mp_policy.global_policy().name, default_policy) + try: + mp_policy.set_global_policy("mixed_float16") + self.assertEqual(mp_policy.global_policy().name, "mixed_float16") + # Policies are not associated with a graph + with tf.Graph().as_default(): + self.assertEqual( + mp_policy.global_policy().name, "mixed_float16" + ) + mp_policy.set_global_policy("_infer") + self.assertEqual(mp_policy.global_policy().name, "_infer") + policy = mp_policy.Policy("mixed_bfloat16") + mp_policy.set_global_policy(policy) + self.assertIs(mp_policy.global_policy(), policy) + finally: + mp_policy.set_global_policy(None) + + @test_utils.enable_v2_dtype_behavior + def test_global_policy_dtype_error(self): + with self.assertRaisesRegex( + ValueError, + "set_global_policy can only be used to set the global policy to " + 'floating-point policies, such as "float32" and "mixed_float16", ' + "but got policy: int32", + ): + mp_policy.set_global_policy("int32") + with self.assertRaisesRegex( + ValueError, + "set_global_policy can only be used to set the global policy to " + 'floating-point policies, such as "float32" and "mixed_float16", ' + "but got policy: complex64", + ): + mp_policy.set_global_policy(mp_policy.Policy("complex64")) + + @test_utils.enable_v2_dtype_behavior + def test_device_compatibility_warning(self): + if not tf.executing_eagerly(): + self.skipTest("Run in eager mode only.") + + device_compatibility_check._logged_compatibility_check = False + with tf.compat.v1.test.mock.patch.object( + tf_logging, "warning" + ) as mock_warn: + mp_policy.Policy("mixed_float16") + if tf.config.list_physical_devices("GPU"): + mock_warn.assert_not_called() + else: + self.assertRegex( + mock_warn.call_args[0][0], + r"Mixed precision compatibility check \(mixed_float16\): " + r"WARNING.*", + ) + + if tf.config.list_physical_devices("GPU"): + # Assert message is only logged once + with tf.compat.v1.test.mock.patch.object( + tf_logging, "warning" + ) as mock_warn: + mp_policy.Policy("mixed_float16") + mock_warn.assert_not_called() + + @test_utils.enable_v2_dtype_behavior + def test_policy_scope(self): + if base_layer_utils.v2_dtype_behavior_enabled(): + default_policy = "float32" + else: + default_policy = "_infer" + with mp_policy.policy_scope("mixed_float16"): + self.assertEqual(mp_policy.global_policy().name, "mixed_float16") + with mp_policy.policy_scope("_infer"): + self.assertEqual(mp_policy.global_policy().name, "_infer") + self.assertEqual(mp_policy.global_policy().name, "mixed_float16") + self.assertEqual(mp_policy.global_policy().name, default_policy) + + @test_utils.enable_v2_dtype_behavior + def test_config(self): + for policy in ( + mp_policy.Policy("float16"), + mp_policy.Policy("float32"), + mp_policy.Policy("int16"), + mp_policy.Policy("mixed_float16"), + mp_policy.Policy("mixed_bfloat16"), + mp_policy.Policy("_infer"), + ): + config = policy.get_config() + new_policy = mp_policy.Policy.from_config(config) + # Comparing strings is the easiest way to ensure the policies are + # the same, as policy does not override the == operator. 
+ self.assertEqual(str(policy), str(new_policy)) + + @test_utils.enable_v2_dtype_behavior + def test_serialization(self): + # Test policies that are equivalent to a single dtype + for policy_name in "float16", "float32", "int8", "string", "bool": + policy = mp_policy.Policy(policy_name) + config = mp_policy.serialize(policy) + self.assertEqual(config, policy_name) + new_policy = mp_policy.deserialize(config) + self.assertEqual(str(policy), str(new_policy)) + + # Test "_infer" policy + policy = mp_policy.Policy("_infer") + config = mp_policy.serialize(policy) + self.assertIsNone(config) + new_policy = mp_policy.deserialize(config) + self.assertEqual(str(policy), str(new_policy)) + + class MyPolicy(mp_policy.Policy): + pass + + # Test policies that are not equivalent to a single dtype + for policy in ( + mp_policy.Policy("mixed_float16"), + mp_policy.Policy("mixed_bfloat16"), + MyPolicy("float32"), + ): + config = mp_policy.serialize(policy) + if tf.__internal__.tf2.enabled(): + if policy.name == "float32": + self.assertEqual( + config, + { + "module": None, + "class_name": policy.__class__.__name__, + "config": {"name": policy.name}, + "registered_name": "MyPolicy", + }, + ) + else: + self.assertEqual( + config, + { + "module": "keras.mixed_precision", + "class_name": policy.__class__.__name__, + "config": {"name": policy.name}, + "registered_name": None, + }, + ) + else: + self.assertEqual( + config, + { + "class_name": policy.__class__.__name__, + "config": {"name": policy.name}, + }, + ) + new_policy = mp_policy.deserialize( + config, custom_objects={"MyPolicy": MyPolicy} + ) + self.assertEqual(str(policy), str(new_policy)) + + @test_utils.enable_v2_dtype_behavior + def test_error_if_graph_rewrite_enabled(self): + try: + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + gradient_descent.SGD(1.0) + ) + with self.assertRaisesRegex( + ValueError, + 'cannot be set to "mixed_float16", .* the mixed ' + "precision graph rewrite has already been enabled", + ): + mp_policy.set_global_policy("mixed_float16") + with mp_policy.policy_scope("float64"): + pass # Non-mixed policies are allowed + finally: + tf.compat.v1.mixed_precision.disable_mixed_precision_graph_rewrite() + + @test_utils.disable_v2_dtype_behavior + def test_v1_dtype_behavior(self): + # Setting global policies are not allowed with V1 dtype behavior + with self.assertRaisesRegex( + ValueError, "global policy can only be set in TensorFlow 2" + ): + with mp_policy.policy_scope(mp_policy.Policy("_infer")): + pass + with self.assertRaisesRegex( + ValueError, "global policy can only be set in TensorFlow 2" + ): + with mp_policy.policy_scope(mp_policy.Policy("float32")): + pass + with self.assertRaisesRegex( + ValueError, "global policy can only be set in TensorFlow 2" + ): + with mp_policy.policy_scope(mp_policy.Policy("mixed_float16")): + pass + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/mixed_precision/test_util.py b/keras/mixed_precision/test_util.py index f01987732518..43c422189e35 100644 --- a/keras/mixed_precision/test_util.py +++ b/keras/mixed_precision/test_util.py @@ -15,202 +15,228 @@ """Contains testing utilities related to mixed precision.""" import tensorflow.compat.v2 as tf + from keras import regularizers from keras.engine import base_layer def create_identity_with_grad_check_fn(expected_gradient, expected_dtype=None): - """Returns a function that asserts it's gradient has a certain value. - - This serves as a hook to assert intermediate gradients have a certain value. 
- This returns an identity function. The identity's gradient function is also - the identity function, except it asserts that the gradient equals - `expected_gradient` and has dtype `expected_dtype`. - - Args: - expected_gradient: The gradient function asserts that the gradient is this - value. - expected_dtype: The gradient function asserts the gradient has this dtype. - - Returns: - An identity function whose gradient function asserts the gradient has a - certain value. - """ - @tf.custom_gradient - def _identity_with_grad_check(x): - """Function that asserts it's gradient has a certain value.""" - x = tf.identity(x) - def grad(dx): - """Gradient function that asserts the gradient has a certain value.""" - if expected_dtype: - assert dx.dtype == expected_dtype, ( - 'dx.dtype should be %s but is: %s' % (expected_dtype, dx.dtype)) - expected_tensor = tf.convert_to_tensor( - expected_gradient, dtype=dx.dtype, name='expected_gradient') - # Control dependency is to ensure input is available. It's possible the - # dataset will throw a StopIteration to indicate there is no more data, in - # which case we don't want to run the assertion. - with tf.control_dependencies([x]): - assert_op = tf.compat.v1.assert_equal(dx, expected_tensor) - with tf.control_dependencies([assert_op]): - dx = tf.identity(dx) - return dx - return x, grad - # Keras sometimes has trouble serializing Lambda layers with a decorated - # function. So we define and return a non-decorated function. - def identity_with_grad_check(x): - return _identity_with_grad_check(x) - return identity_with_grad_check + """Returns a function that asserts its gradient has a certain value. + + This serves as a hook to assert intermediate gradients have a certain value. + This returns an identity function. The identity's gradient function is also + the identity function, except it asserts that the gradient equals + `expected_gradient` and has dtype `expected_dtype`. + + Args: + expected_gradient: The gradient function asserts that the gradient is this + value. + expected_dtype: The gradient function asserts the gradient has this dtype. + + Returns: + An identity function whose gradient function asserts the gradient has a + certain value. + """ + + @tf.custom_gradient + def _identity_with_grad_check(x): + """Function that asserts its gradient has a certain value.""" + x = tf.identity(x) + + def grad(dx): + """Gradient function that asserts the gradient has a certain + value.""" + if expected_dtype: + assert ( + dx.dtype == expected_dtype + ), f"dx.dtype should be {expected_dtype} but is: {dx.dtype}" + expected_tensor = tf.convert_to_tensor( + expected_gradient, dtype=dx.dtype, name="expected_gradient" + ) + # Control dependency is to ensure input is available. It's possible + # the dataset will throw a StopIteration to indicate there is no + # more data, in which case we don't want to run the assertion. + with tf.control_dependencies([x]): + assert_op = tf.compat.v1.assert_equal(dx, expected_tensor) + with tf.control_dependencies([assert_op]): + dx = tf.identity(dx) + return dx + + return x, grad + + # Keras sometimes has trouble serializing Lambda layers with a decorated + # function. So we define and return a non-decorated function. + def identity_with_grad_check(x): + return _identity_with_grad_check(x) + + return identity_with_grad_check def create_identity_with_nan_gradients_fn(have_nan_gradients): - """Returns a function that optionally has NaN gradients. - - This serves as a hook to introduce NaN gradients to a model.
This returns an - identity function. The identity's gradient function will check if the boolean - tensor `have_nan_gradients` is True. If so, the gradient will be NaN. - Otherwise, the gradient will also be the identity. - - Args: - have_nan_gradients: A scalar boolean tensor. If True, gradients will be NaN. - Otherwise, the gradient function is the identity function. - - Returns: - An identity function whose gradient function will return NaNs, if - `have_nan_gradients` is True. - """ - @tf.custom_gradient - def _identity_with_nan_gradients(x): - """Function whose gradient is NaN iff `have_nan_gradients` is True.""" - x = tf.identity(x) - def grad(dx): - return tf.cond( - have_nan_gradients, - lambda: dx * float('NaN'), - lambda: dx - ) - return x, grad - # Keras sometimes has trouble serializing Lambda layers with a decorated - # function. So we define and return a non-decorated function. - def identity_with_nan_gradients(x): - return _identity_with_nan_gradients(x) - return identity_with_nan_gradients + """Returns a function that optionally has NaN gradients. + This serves as a hook to introduce NaN gradients to a model. This returns an + identity function. The identity's gradient function will check if the + boolean tensor `have_nan_gradients` is True. If so, the gradient will be + NaN. Otherwise, the gradient will also be the identity. -class AssertTypeLayer(base_layer.Layer): - """A layer which asserts it's inputs are a certain type.""" + Args: + have_nan_gradients: A scalar boolean tensor. If True, gradients will be + NaN. Otherwise, the gradient function is the identity function. - def __init__(self, assert_type=None, **kwargs): - self._assert_type = (tf.as_dtype(assert_type).name if assert_type - else None) - super().__init__(**kwargs) + Returns: + An identity function whose gradient function will return NaNs, if + `have_nan_gradients` is True. + """ - def assert_input_types(self, inputs): - """Asserts `inputs` are of the correct type. Should be called in call().""" - if self._assert_type: - inputs_flattened = tf.nest.flatten(inputs) - for inp in inputs_flattened: - assert inp.dtype.base_dtype == self._assert_type, ( - 'Input tensor has type %s which does not match assert type %s' % - (inp.dtype.name, self._assert_type)) + @tf.custom_gradient + def _identity_with_nan_gradients(x): + """Function whose gradient is NaN iff `have_nan_gradients` is True.""" + x = tf.identity(x) + def grad(dx): + return tf.cond( + have_nan_gradients, lambda: dx * float("NaN"), lambda: dx + ) -class MultiplyLayer(AssertTypeLayer): - """A layer which multiplies its input by a scalar variable.""" + return x, grad - def __init__(self, - regularizer=None, - activity_regularizer=None, - use_operator=False, - var_name='v', - **kwargs): - """Initializes the MultiplyLayer. + # Keras sometimes has trouble serializing Lambda layers with a decorated + # function. So we define and return a non-decorated function. + def identity_with_nan_gradients(x): + return _identity_with_nan_gradients(x) - Args: - regularizer: The weight regularizer on the scalar variable. - activity_regularizer: The activity regularizer. - use_operator: If True, add using the * operator. If False, add using - tf.multiply. - var_name: The name of the variable. It can be useful to pass a name other - than 'v', to test having the attribute name (self.v) being different - from the variable name. - **kwargs: Passed to AssertTypeLayer constructor. 
- """ - self._regularizer = regularizer - if isinstance(regularizer, dict): - self._regularizer = regularizers.deserialize(regularizer, - custom_objects=globals()) - self._activity_regularizer = activity_regularizer - if isinstance(activity_regularizer, dict): - self._activity_regularizer = regularizers.deserialize( - activity_regularizer, custom_objects=globals()) - - self._use_operator = use_operator - self._var_name = var_name - super().__init__( - activity_regularizer=self._activity_regularizer, **kwargs) - - def build(self, _): - self.v = self.add_weight( - self._var_name, (), initializer='ones', regularizer=self._regularizer) - self.built = True - - def call(self, inputs): - self.assert_input_types(inputs) - return self._multiply(inputs, self.v) - - def _multiply(self, x, y): - if self._use_operator: - return x * y - else: - return tf.multiply(x, y) - - def get_config(self): - config = super().get_config() - config['regularizer'] = regularizers.serialize(self._regularizer) - config['activity_regularizer'] = regularizers.serialize( - self._activity_regularizer) - config['use_operator'] = self._use_operator - config['var_name'] = self._var_name - config['assert_type'] = self._assert_type - return config + return identity_with_nan_gradients + + +class AssertTypeLayer(base_layer.Layer): + """A layer which asserts it's inputs are a certain type.""" + + def __init__(self, assert_type=None, **kwargs): + self._assert_type = ( + tf.as_dtype(assert_type).name if assert_type else None + ) + super().__init__(**kwargs) + + def assert_input_types(self, inputs): + """Asserts `inputs` are of the correct type. Should be called in + call().""" + if self._assert_type: + inputs_flattened = tf.nest.flatten(inputs) + for inp in inputs_flattened: + assert inp.dtype.base_dtype == self._assert_type, ( + "Input tensor has type %s which does " + "not match assert type %s" + % (inp.dtype.name, self._assert_type) + ) + + +class MultiplyLayer(AssertTypeLayer): + """A layer which multiplies its input by a scalar variable.""" + + def __init__( + self, + regularizer=None, + activity_regularizer=None, + use_operator=False, + var_name="v", + **kwargs, + ): + """Initializes the MultiplyLayer. + + Args: + regularizer: The weight regularizer on the scalar variable. + activity_regularizer: The activity regularizer. + use_operator: If True, add using the * operator. If False, add using + tf.multiply. + var_name: The name of the variable. It can be useful to pass a name + other than 'v', to test having the attribute name (self.v) being + different from the variable name. + **kwargs: Passed to AssertTypeLayer constructor. 
+ """ + self._regularizer = regularizer + if isinstance(regularizer, dict): + self._regularizer = regularizers.deserialize( + regularizer, custom_objects=globals() + ) + self._activity_regularizer = activity_regularizer + if isinstance(activity_regularizer, dict): + self._activity_regularizer = regularizers.deserialize( + activity_regularizer, custom_objects=globals() + ) + + self._use_operator = use_operator + self._var_name = var_name + super().__init__( + activity_regularizer=self._activity_regularizer, **kwargs + ) + + def build(self, _): + self.v = self.add_weight( + self._var_name, + (), + initializer="ones", + regularizer=self._regularizer, + ) + self.built = True + + def call(self, inputs): + self.assert_input_types(inputs) + return self._multiply(inputs, self.v) + + def _multiply(self, x, y): + if self._use_operator: + return x * y + else: + return tf.multiply(x, y) + + def get_config(self): + config = super().get_config() + config["regularizer"] = regularizers.serialize(self._regularizer) + config["activity_regularizer"] = regularizers.serialize( + self._activity_regularizer + ) + config["use_operator"] = self._use_operator + config["var_name"] = self._var_name + config["assert_type"] = self._assert_type + return config class MultiplyLayerWithoutAutoCast(MultiplyLayer): - """Same as MultiplyLayer, but does not use AutoCastVariables.""" - - def build(self, _): - dtype = self.dtype - if dtype in ('float16', 'bfloat16'): - dtype = 'float32' - self.v = self.add_weight( - 'v', (), - initializer='ones', - dtype=dtype, - experimental_autocast=False, - regularizer=self._regularizer) - self.built = True - - def call(self, inputs): - self.assert_input_types(inputs) - assert self.v.dtype in (tf.float32, tf.float64) - return self._multiply(inputs, tf.cast(self.v, inputs.dtype)) + """Same as MultiplyLayer, but does not use AutoCastVariables.""" + + def build(self, _): + dtype = self.dtype + if dtype in ("float16", "bfloat16"): + dtype = "float32" + self.v = self.add_weight( + "v", + (), + initializer="ones", + dtype=dtype, + experimental_autocast=False, + regularizer=self._regularizer, + ) + self.built = True + + def call(self, inputs): + self.assert_input_types(inputs) + assert self.v.dtype in (tf.float32, tf.float64) + return self._multiply(inputs, tf.cast(self.v, inputs.dtype)) class IdentityRegularizer(regularizers.Regularizer): + def __call__(self, x): + assert x.dtype == tf.float32 + return tf.identity(x) - def __call__(self, x): - assert x.dtype == tf.float32 - return tf.identity(x) - - def get_config(self): - return {} + def get_config(self): + return {} class ReduceSumRegularizer(regularizers.Regularizer): + def __call__(self, x): + return tf.reduce_sum(x) - def __call__(self, x): - return tf.reduce_sum(x) - - def get_config(self): - return {} + def get_config(self): + return {} diff --git a/keras/mixed_precision/testdata/BUILD b/keras/mixed_precision/testdata/BUILD index cfb7f63eb457..cd79ce6cd465 100644 --- a/keras/mixed_precision/testdata/BUILD +++ b/keras/mixed_precision/testdata/BUILD @@ -2,10 +2,8 @@ # Contains checkpoints and SavedModels for testing purposes. 
package( - default_visibility = [ - "//keras:friends", - "//third_party/tensorflow/tools/pip_package:__pkg__", - ], + # copybara:uncomment default_applicable_licenses = ["//keras:license"], + default_visibility = ["//keras:friends"], licenses = ["notice"], ) diff --git a/keras/models/BUILD b/keras/models/BUILD index 66d533286c89..76161b078399 100644 --- a/keras/models/BUILD +++ b/keras/models/BUILD @@ -1,9 +1,11 @@ # Keras models +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "distribute_py_test") load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], @@ -78,7 +80,6 @@ distribute_py_test( shard_count = 8, tags = [ "multi_gpu", - "no_oss", # TODO(b/226938240): Reenable "nomultivm", "requires-net:ipv4", ], diff --git a/keras/models/__init__.py b/keras/models/__init__.py index 77e0f86f4e2d..6737076ba4c8 100644 --- a/keras/models/__init__.py +++ b/keras/models/__init__.py @@ -13,20 +13,11 @@ # limitations under the License. # ============================================================================== """Keras models API.""" -# pylint: disable=g-bad-import-order + from keras.engine.functional import Functional from keras.engine.sequential import Sequential from keras.engine.training import Model -from keras.models.cloning import clone_and_build_model -from keras.models.cloning import clone_model -from keras.models.cloning import share_weights -from keras.models.sharpness_aware_minimization import SharpnessAwareMinimization -from keras.saving.model_config import model_from_config -from keras.saving.model_config import model_from_json -from keras.saving.model_config import model_from_yaml -from keras.saving.save import load_model -from keras.saving.save import save_model # Private symbols that are used in tests. # TODO(b/221261361): Clean up private symbols usage and remove these imports. @@ -34,3 +25,12 @@ from keras.models.cloning import _clone_layer from keras.models.cloning import _clone_layers_and_model_config from keras.models.cloning import _clone_sequential_model +from keras.models.cloning import clone_and_build_model +from keras.models.cloning import clone_model +from keras.models.cloning import share_weights +from keras.models.sharpness_aware_minimization import SharpnessAwareMinimization +from keras.saving.legacy.model_config import model_from_config +from keras.saving.legacy.model_config import model_from_json +from keras.saving.legacy.model_config import model_from_yaml +from keras.saving.saving_api import load_model +from keras.saving.saving_api import save_model diff --git a/keras/models/cloning.py b/keras/models/cloning.py index abf69a61262c..85c5ffd1319b 100644 --- a/keras/models/cloning.py +++ b/keras/models/cloning.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -# pylint: disable=protected-access + """Code for model cloning, plus model-related API entries.""" import tensorflow.compat.v2 as tf + from keras import backend from keras import metrics as metrics_module -from keras.optimizers import optimizer_v1 from keras.engine import functional from keras.engine import sequential from keras.engine import training @@ -27,718 +27,875 @@ from keras.engine.base_layer import Layer from keras.engine.input_layer import Input from keras.engine.input_layer import InputLayer +from keras.optimizers import optimizer_v1 +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model.utils import keras_option_scope +from keras.saving.object_registration import CustomObjectScope from keras.utils import generic_utils from keras.utils import version_utils -from keras.utils.generic_utils import CustomObjectScope + +# isort: off from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export - # API entries importable from `keras.models`: -Model = training.Model # pylint: disable=invalid-name -Sequential = sequential.Sequential # pylint: disable=invalid-name +Model = training.Model +Sequential = sequential.Sequential # Callable used to clone a layer with weights preserved. def share_weights(layer): - return layer + return layer def _clone_layer(layer): - return layer.__class__.from_config(layer.get_config()) + return layer.__class__.from_config(layer.get_config()) def _insert_ancillary_layers(model, ancillary_layers, metrics_names, new_nodes): - """Inserts ancillary layers into the model with the proper order.""" - # Sort `AddMetric` layers so they agree with metrics_names. - metric_layers = [ - layer for layer in ancillary_layers if isinstance(layer, AddMetric) - ] - metric_layers.sort(key=lambda layer: metrics_names.index(layer.metric_name)) - ancillary_layers = [ - layer for layer in ancillary_layers if not isinstance(layer, AddMetric) - ] + metric_layers - model._insert_layers(ancillary_layers, relevant_nodes=list(new_nodes)) + """Inserts ancillary layers into the model with the proper order.""" + # Sort `AddMetric` layers so they agree with metrics_names. + metric_layers = [ + layer for layer in ancillary_layers if isinstance(layer, AddMetric) + ] + metric_layers.sort(key=lambda layer: metrics_names.index(layer.metric_name)) + ancillary_layers = [ + layer for layer in ancillary_layers if not isinstance(layer, AddMetric) + ] + metric_layers + model._insert_layers(ancillary_layers, relevant_nodes=list(new_nodes)) def _make_new_nodes(nodes_by_depth, layer_fn, layer_map, tensor_map): - """Uses the layers in `layer_map` to make new nodes based on `nodes_by_depth`. - - Args: - nodes_by_depth: Provides structure information to create new nodes. - layer_fn: Function to clone layers. - layer_map: Map from layers in `model` to new layers. - tensor_map: Map from tensors in `model` to newly compute tensors. - - Returns: - A set of new nodes. `layer_map` and `tensor_map` are updated. - """ - # Iterated over every node in the reference model, in depth order. - new_nodes = set() - depth_keys = list(nodes_by_depth.keys()) - depth_keys.sort(reverse=True) - for depth in depth_keys: - nodes = nodes_by_depth[depth] - for node in nodes: - # Recover the corresponding layer. - layer = node.outbound_layer - - # Get or create layer. 
- if layer not in layer_map: - new_layer = layer_fn(layer) - layer_map[layer] = new_layer - layer = new_layer - else: - # Reuse previously cloned layer. - layer = layer_map[layer] - # Don't call InputLayer multiple times. - if isinstance(layer, InputLayer): - continue - - # If all previous input tensors are available in tensor_map, - # then call node.inbound_layer on them. - if all( - tensor in tensor_map for tensor in tf.nest.flatten(node.input_tensors)): - # Call layer. - args = tf.nest.map_structure(lambda t: tensor_map.get(t, t), - node.call_args) - kwargs = tf.nest.map_structure(lambda t: tensor_map.get(t, t), - node.call_kwargs) - output_tensors = layer(*args, **kwargs) - - # Thread-safe way to keep track of what node was created. - first_output_tensor = tf.nest.flatten(output_tensors)[0] - new_nodes.add( - layer._inbound_nodes[first_output_tensor._keras_history.node_index]) - - for x, y in zip( - tf.nest.flatten(node.output_tensors), tf.nest.flatten(output_tensors)): - tensor_map[x] = y - return new_nodes + """Make new nodes with the layers in `layer_map` based on `nodes_by_depth`. + + Args: + nodes_by_depth: Provides structure information to create new nodes. + layer_fn: Function to clone layers. + layer_map: Map from layers in `model` to new layers. + tensor_map: Map from tensors in `model` to newly computed tensors. + + Returns: + A set of new nodes. `layer_map` and `tensor_map` are updated. + """ + # Iterate over every node in the reference model, in depth order. + new_nodes = set() + depth_keys = list(nodes_by_depth.keys()) + depth_keys.sort(reverse=True) + for depth in depth_keys: + nodes = nodes_by_depth[depth] + for node in nodes: + # Recover the corresponding layer. + layer = node.outbound_layer + + # Get or create layer. + if layer not in layer_map: + new_layer = layer_fn(layer) + layer_map[layer] = new_layer + layer = new_layer + else: + # Reuse previously cloned layer. + layer = layer_map[layer] + # Don't call InputLayer multiple times. + if isinstance(layer, InputLayer): + continue + + # If all previous input tensors are available in tensor_map, + # then call node.inbound_layer on them. + if all( + tensor in tensor_map + for tensor in tf.nest.flatten(node.input_tensors) + ): + # Call layer. + args = tf.nest.map_structure( + lambda t: tensor_map.get(t, t), node.call_args + ) + kwargs = tf.nest.map_structure( + lambda t: tensor_map.get(t, t), node.call_kwargs + ) + output_tensors = layer(*args, **kwargs) + + # Thread-safe way to keep track of what node was created. + first_output_tensor = tf.nest.flatten(output_tensors)[0] + new_nodes.add( + layer._inbound_nodes[ + first_output_tensor._keras_history.node_index + ] + ) + + for x, y in zip( + tf.nest.flatten(node.output_tensors), + tf.nest.flatten(output_tensors), + ): + tensor_map[x] = y + return new_nodes def _clone_functional_model(model, input_tensors=None, layer_fn=_clone_layer): - """Clone a functional `Model` instance. - - Model cloning is similar to calling a model on new inputs, - except that it creates new layers (and thus new weights) instead - of sharing the weights of the existing layers. - - Input layers are always cloned. - - Args: - model: Instance of `Model`. - input_tensors: optional list of input tensors - to build the model upon. If not provided, - placeholders will be created. - layer_fn: callable to be applied on non-input layers in the model. By - default it clones the layer. Another example is to preserve the layer - to share the weights.
This is required when we create a per-replica - copy of the model with distribution strategy; we want the weights to - be shared but still feed inputs separately so we create new input - layers. - - Returns: - An instance of `Model` reproducing the behavior - of the original model, on top of new inputs tensors, - using newly instantiated weights. - - Raises: - ValueError: in case of invalid `model` argument value or `layer_fn` - argument value. - """ - if not isinstance(model, Model): - raise ValueError('Expected `model` argument ' - f'to be a `Model` instance. Received: model={model}') - if isinstance(model, Sequential): - raise ValueError('Expected `model` argument ' - 'to be a functional `Model` instance, ' - f'got a `Sequential` instance instead: {model}') - if not model._is_graph_network: - raise ValueError('Expected `model` argument ' - 'to be a functional `Model` instance, ' - f'but got a subclassed model instead: {model}') - - new_input_layers = {} # Cache for created layers. - if input_tensors is not None: - # Make sure that all input tensors come from a Keras layer. - input_tensors = tf.nest.flatten(input_tensors) - for i, input_tensor in enumerate(input_tensors): - original_input_layer = model._input_layers[i] - - # Cache input layer. Create a new layer if the tensor is originally not - # from a Keras layer. - if not backend.is_keras_tensor(input_tensor): - name = original_input_layer.name - input_tensor = Input(tensor=input_tensor, - name='input_wrapper_for_' + name) - newly_created_input_layer = input_tensor._keras_history.layer - new_input_layers[original_input_layer] = newly_created_input_layer - else: - new_input_layers[ - original_input_layer] = input_tensor._keras_history.layer - - if not callable(layer_fn): - raise ValueError('Expected `layer_fn` argument to be a callable. ' - f'Received: layer_fn={layer_fn}') - - model_configs, created_layers = _clone_layers_and_model_config( - model, new_input_layers, layer_fn) - # Reconstruct model from the config, using the cloned layers. - input_tensors, output_tensors, created_layers = ( - functional.reconstruct_from_config(model_configs, - created_layers=created_layers)) - metrics_names = model.metrics_names - model = Model(input_tensors, output_tensors, name=model.name) - # Layers not directly tied to outputs of the Model, such as loss layers - # created in `add_loss` and `add_metric`. - ancillary_layers = [ - layer for layer in created_layers.values() if layer not in model.layers - ] - # TODO(b/162887610): This may need to adjust the inbound node index if the - # created layers had already been used to define other models. - if ancillary_layers: - new_nodes = tf.nest.flatten([ - layer.inbound_nodes[1:] - if functional._should_skip_first_node(layer) - else layer.inbound_nodes for layer in created_layers.values() - ]) - _insert_ancillary_layers(model, ancillary_layers, metrics_names, new_nodes) - return model + """Clone a functional `Model` instance. + + Model cloning is similar to calling a model on new inputs, + except that it creates new layers (and thus new weights) instead + of sharing the weights of the existing layers. + + Input layers are always cloned. + + Args: + model: Instance of `Model`. + input_tensors: optional list of input tensors + to build the model upon. If not provided, + placeholders will be created. + layer_fn: callable to be applied on non-input layers in the model. By + default it clones the layer. Another example is to preserve the + layer to share the weights. 
This is required when we create a + per-replica copy of the model with distribution strategy; we want + the weights to be shared but still feed inputs separately so we + create new input layers. + + Returns: + An instance of `Model` reproducing the behavior + of the original model, on top of new inputs tensors, + using newly instantiated weights. + + Raises: + ValueError: in case of invalid `model` argument value or `layer_fn` + argument value. + """ + if layer_fn is None: + layer_fn = _clone_layer + + if not isinstance(model, Model): + raise ValueError( + "Expected `model` argument " + f"to be a `Model` instance. Received: model={model}" + ) + if isinstance(model, Sequential): + raise ValueError( + "Expected `model` argument " + "to be a functional `Model` instance, " + f"got a `Sequential` instance instead: {model}" + ) + if not model._is_graph_network: + raise ValueError( + "Expected `model` argument " + "to be a functional `Model` instance, " + f"but got a subclassed model instead: {model}" + ) + + new_input_layers = {} # Cache for created layers. + if input_tensors is not None: + # Make sure that all input tensors come from a Keras layer. + input_tensors = tf.nest.flatten(input_tensors) + for i, input_tensor in enumerate(input_tensors): + original_input_layer = model._input_layers[i] + + # Cache input layer. Create a new layer if the tensor is originally + # not from a Keras layer. + if not backend.is_keras_tensor(input_tensor): + name = original_input_layer.name + input_tensor = Input( + tensor=input_tensor, name="input_wrapper_for_" + name + ) + newly_created_input_layer = input_tensor._keras_history.layer + new_input_layers[ + original_input_layer + ] = newly_created_input_layer + else: + new_input_layers[ + original_input_layer + ] = input_tensor._keras_history.layer + + if not callable(layer_fn): + raise ValueError( + "Expected `layer_fn` argument to be a callable. " + f"Received: layer_fn={layer_fn}" + ) + + # For affected g3 users who need to default to old serialization in cloning + if getattr(model, "use_legacy_config", False): + with keras_option_scope( + save_traces=False, in_tf_saved_model_scope=True + ): + model_configs, created_layers = _clone_layers_and_model_config( + model, new_input_layers, layer_fn + ) + else: + model_configs, created_layers = _clone_layers_and_model_config( + model, new_input_layers, layer_fn + ) + # Reconstruct model from the config, using the cloned layers. + ( + input_tensors, + output_tensors, + created_layers, + ) = functional.reconstruct_from_config( + model_configs, created_layers=created_layers + ) + metrics_names = model.metrics_names + if functional.has_functional_like_constructor(model.__class__): + new_model = model.__class__( + input_tensors, output_tensors, name=model.name + ) + else: + # This may be incorrect: the new model will end up having a different + # class than the original. However various existing models rely + # on this behavior, so we keep it. + new_model = Model(input_tensors, output_tensors, name=model.name) + + # Layers not directly tied to outputs of the Model, such as loss layers + # created in `add_loss` and `add_metric`. + ancillary_layers = [ + layer + for layer in created_layers.values() + if layer not in new_model.layers + ] + # TODO(b/162887610): This may need to adjust the inbound node index if the + # created layers had already been used to define other models. 
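For context, "ancillary layers" here are those created by `add_loss` or `add_metric` calls on symbolic tensors; they live at negative depth in the node graph (see `_remove_ancillary_layers` below) and must be carried over to the clone explicitly, which is what the block that follows does. A minimal sketch of a model that produces one:

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(4,))
outputs = tf.keras.layers.Dense(1)(inputs)
model = tf.keras.Model(inputs, outputs)

# Adds a loss layer to the graph that is not on the path to `outputs`.
model.add_loss(tf.reduce_mean(outputs))
```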
+ if ancillary_layers: + new_nodes = tf.nest.flatten( + [ + layer.inbound_nodes[1:] + if functional._should_skip_first_node(layer) + else layer.inbound_nodes + for layer in created_layers.values() + ] + ) + _insert_ancillary_layers( + new_model, ancillary_layers, metrics_names, new_nodes + ) + return new_model def _clone_layers_and_model_config(model, input_layers, layer_fn): - """Clones all layers, and returns the model config without serializing layers. - - This function ensures that only the node graph is retrieved when getting the - model config. The `layer_fn` used to clone layers might not rely on - `layer.get_config()`, so some custom layers do not define `get_config`. - Trying to retrieve the config results in errors. - - Args: - model: A Functional model. - input_layers: Dictionary mapping input layers in `model` to new input layers - layer_fn: Function used to clone all non-input layers. - - Returns: - Model config object, and a dictionary of newly created layers. - """ - created_layers = {} - def _copy_layer(layer): - # Whenever the network config attempts to get the layer serialization, - # return a dummy dictionary. - if layer in input_layers: - created_layers[layer.name] = input_layers[layer] - elif layer in model._input_layers: - created_layers[layer.name] = InputLayer(**layer.get_config()) - else: - created_layers[layer.name] = layer_fn(layer) - return {} + """Clones all layers; returns the model config without serializing layers. + + This function ensures that only the node graph is retrieved when getting the + model config. The `layer_fn` used to clone layers might not rely on + `layer.get_config()`, so some custom layers do not define `get_config`. + Trying to retrieve the config results in errors. + + Args: + model: A Functional model. + input_layers: Dictionary mapping input layers in `model` to new input + layers. + layer_fn: Function used to clone all non-input layers. + + Returns: + Model config object, and a dictionary of newly created layers. + """ + created_layers = {} + + def _copy_layer(layer): + # Whenever the network config attempts to get the layer serialization, + # return a dummy dictionary. + if layer in input_layers: + created_layers[layer.name] = input_layers[layer] + elif layer in model._input_layers: + created_layers[layer.name] = InputLayer(**layer.get_config()) + else: + created_layers[layer.name] = layer_fn(layer) + return {} - config = functional.get_network_config( - model, serialize_layer_fn=_copy_layer) - return config, created_layers + config = functional.get_network_config( + model, serialize_layer_fn=_copy_layer + ) + return config, created_layers def _remove_ancillary_layers(model, layer_map, layers): - """Removes and returns any ancillary layers from `layers` based on `model`. + """Removes and returns any ancillary layers from `layers` based on `model`. - Ancillary layers are part of the model topology but not used to compute the - model outputs, e.g., layers from `add_loss` and `add_metric`. + Ancillary layers are part of the model topology but not used to compute the + model outputs, e.g., layers from `add_loss` and `add_metric`. - Args: - model: A Keras Model. - layer_map: A map to from layers in the `model` to those in `layers`. - layers: A list of all layers. + Args: + model: A Keras Model. + layer_map: A map from layers in the `model` to those in `layers`. + layers: A list of all layers. - Returns: - Two lists of layers: (1) `layers` with the ancillary layers removed, and (2) - the ancillary layers.
- """ - ancillary_layers = [] # Additional layers for computing losses and metrics. - if not model._is_graph_network: - return layers, ancillary_layers + Returns: + Two lists of layers: (1) `layers` with the ancillary layers removed, and + (2) the ancillary layers. + """ + ancillary_layers = [] # Additional layers for computing losses and metrics. + if not model._is_graph_network: + return layers, ancillary_layers - # Ancillary layers are those with depth < 0. - depths = [depth for depth in model._nodes_by_depth.keys() if depth < 0] - depths.sort(reverse=True) # Order topologically from inputs to outputs. - for depth in depths: - for node in model._nodes_by_depth[depth]: - ancillary_layers.append(layer_map[node.outbound_layer]) + # Ancillary layers are those with depth < 0. + depths = [depth for depth in model._nodes_by_depth.keys() if depth < 0] + depths.sort(reverse=True) # Order topologically from inputs to outputs. + for depth in depths: + for node in model._nodes_by_depth[depth]: + ancillary_layers.append(layer_map[node.outbound_layer]) - return [l for l in layers if l not in ancillary_layers], ancillary_layers + return [l for l in layers if l not in ancillary_layers], ancillary_layers def _clone_sequential_model(model, input_tensors=None, layer_fn=_clone_layer): - """Clone a `Sequential` model instance. - - Model cloning is similar to calling a model on new inputs, - except that it creates new layers (and thus new weights) instead - of sharing the weights of the existing layers. - - Args: - model: Instance of `Sequential`. - input_tensors: optional list of input tensors - to build the model upon. If not provided, - placeholders will be created. - layer_fn: callable to be applied on non-input layers in the model. By - default it clones the layer. Another example is to preserve the layer - to share the weights. This is required when we create a per-replica - copy of the model with distribution strategy; we want the weights to - be shared but still feed inputs separately so we create new input - layers. - - Returns: - An instance of `Sequential` reproducing the behavior - of the original model, on top of new inputs tensors, - using newly instantiated weights. - - Raises: - ValueError: in case of invalid `model` argument value or `layer_fn` - argument value. - """ - if not isinstance(model, Sequential): - raise ValueError('Expected `model` argument ' - 'to be a `Sequential` model instance. ' - f'Received: model={model}') - - if not callable(layer_fn): - raise ValueError( - 'Expected `layer_fn` argument to be a callable. ' - f'Received: layer_fn={layer_fn}') - - layers = [] # Layers needed to compute the model's outputs. - layer_map = {} - # Ensure that all layers are cloned. The model's layers - # property will exclude the initial InputLayer (if it exists) in the model, - # resulting in a different Sequential model structure. - for layer in model._flatten_layers(include_self=False, recursive=False): - if isinstance(layer, InputLayer) and input_tensors is not None: - # If input tensors are provided, the original model's InputLayer is - # overwritten with a different InputLayer. 
- continue - cloned_layer = ( - _clone_layer(layer) - if isinstance(layer, InputLayer) else layer_fn(layer)) - layers.append(cloned_layer) - layer_map[layer] = cloned_layer - layers, ancillary_layers = _remove_ancillary_layers(model, layer_map, layers) - - if input_tensors is None: - cloned_model = Sequential(layers=layers, name=model.name) - elif len(generic_utils.to_list(input_tensors)) != 1: - raise ValueError( - 'To clone a `Sequential` model, we expect at most one tensor as part ' - f'of `input_tensors`. Received: input_tensors={input_tensors}') - else: - # Overwrite the original model's input layer. - if isinstance(input_tensors, tuple): - input_tensors = list(input_tensors) - x = generic_utils.to_list(input_tensors)[0] - if backend.is_keras_tensor(x): - origin_layer = x._keras_history.layer - if isinstance(origin_layer, InputLayer): - cloned_model = Sequential( - layers=[origin_layer] + layers, name=model.name) - else: - raise ValueError('Cannot clone a `Sequential` model on top ' - 'of a tensor that comes from a Keras layer ' - 'other than an `InputLayer`. ' - 'Use the Functional API instead. ' - f'Received: input_tensors={input_tensors}') + """Clone a `Sequential` model instance. + + Model cloning is similar to calling a model on new inputs, + except that it creates new layers (and thus new weights) instead + of sharing the weights of the existing layers. + + Args: + model: Instance of `Sequential`. + input_tensors: optional list of input tensors + to build the model upon. If not provided, + placeholders will be created. + layer_fn: callable to be applied on non-input layers in the model. By + default it clones the layer. Another example is to preserve the + layer to share the weights. This is required when we create a + per-replica copy of the model with distribution strategy; we want + the weights to be shared but still feed inputs separately so we + create new input layers. + + Returns: + An instance of `Sequential` reproducing the behavior + of the original model, on top of new inputs tensors, + using newly instantiated weights. + + Raises: + ValueError: in case of invalid `model` argument value or `layer_fn` + argument value. + """ + if layer_fn is None: + layer_fn = _clone_layer + + if not isinstance(model, Sequential): + raise ValueError( + "Expected `model` argument " + "to be a `Sequential` model instance. " + f"Received: model={model}" + ) + + if not callable(layer_fn): + raise ValueError( + "Expected `layer_fn` argument to be a callable. " + f"Received: layer_fn={layer_fn}" + ) + + layers = [] # Layers needed to compute the model's outputs. + layer_map = {} + # Ensure that all layers are cloned. The model's layers + # property will exclude the initial InputLayer (if it exists) in the model, + # resulting in a different Sequential model structure. + for layer in model._flatten_layers(include_self=False, recursive=False): + if isinstance(layer, InputLayer) and input_tensors is not None: + # If input tensors are provided, the original model's InputLayer is + # overwritten with a different InputLayer. 
+ continue + cloned_layer = ( + _clone_layer(layer) + if isinstance(layer, InputLayer) + else layer_fn(layer) + ) + layers.append(cloned_layer) + layer_map[layer] = cloned_layer + layers, ancillary_layers = _remove_ancillary_layers( + model, layer_map, layers + ) + + if input_tensors is None: + cloned_model = Sequential(layers=layers, name=model.name) + elif len(generic_utils.to_list(input_tensors)) != 1: + raise ValueError( + "To clone a `Sequential` model, we expect at most one tensor as " + f"part of `input_tensors`. Received: input_tensors={input_tensors}" + ) else: - input_tensor = Input(tensor=x, name='input_wrapper_for_' + str(x.name)) - input_layer = input_tensor._keras_history.layer - cloned_model = Sequential(layers=[input_layer] + layers, name=model.name) - - if not ancillary_layers: + # Overwrite the original model's input layer. + if isinstance(input_tensors, tuple): + input_tensors = list(input_tensors) + x = generic_utils.to_list(input_tensors)[0] + if backend.is_keras_tensor(x): + origin_layer = x._keras_history.layer + if isinstance(origin_layer, InputLayer): + cloned_model = Sequential( + layers=[origin_layer] + layers, name=model.name + ) + else: + raise ValueError( + "Cannot clone a `Sequential` model on top " + "of a tensor that comes from a Keras layer " + "other than an `InputLayer`. " + "Use the Functional API instead. " + f"Received: input_tensors={input_tensors}" + ) + else: + input_tensor = Input( + tensor=x, name="input_wrapper_for_" + str(x.name) + ) + input_layer = input_tensor._keras_history.layer + cloned_model = Sequential( + layers=[input_layer] + layers, name=model.name + ) + + if not ancillary_layers: + return cloned_model + + tensor_map = {} # Maps tensors from `model` to those in `cloned_model`. + for depth, cloned_nodes in cloned_model._nodes_by_depth.items(): + nodes = model._nodes_by_depth[depth] + # This should be safe in a Sequential model. In an arbitrary network, + # you need to sort using the outbound layer of the node as a key. + for cloned_node, node in zip(cloned_nodes, nodes): + if isinstance(cloned_node.output_tensors, list): + for j, output_tensor in enumerate(cloned_node.output_tensors): + tensor_map[node.output_tensors[j]] = output_tensor + else: + tensor_map[node.output_tensors] = cloned_node.output_tensors + # Ancillary nodes have negative depth. + new_nodes = _make_new_nodes( + { + depth: nodes + for depth, nodes in model._nodes_by_depth.items() + if depth < 0 + }, + layer_fn, + layer_map, + tensor_map, + ) + _insert_ancillary_layers( + cloned_model, ancillary_layers, model.metrics_names, new_nodes + ) return cloned_model - tensor_map = {} # Maps tensors from `model` to those in `cloned_model`. - for depth, cloned_nodes in cloned_model._nodes_by_depth.items(): - nodes = model._nodes_by_depth[depth] - # This should be safe in a Sequential model. In an arbitrary network, you - # need to sort using the outbound layer of the node as a key. - for cloned_node, node in zip(cloned_nodes, nodes): - if isinstance(cloned_node.output_tensors, list): - for j, output_tensor in enumerate(cloned_node.output_tensors): - tensor_map[node.output_tensors[j]] = output_tensor - else: - tensor_map[node.output_tensors] = cloned_node.output_tensors - # Ancillary nodes have negative depth. 
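As a usage note for the function being rewritten here, the following is a minimal sketch of how `_clone_sequential_model` is reached through the public `clone_model` entry point; the shape assert is illustrative, not taken from the diff.

```python
import keras

# Clone a Sequential model with fresh weights, then clone it again on top
# of an explicit input tensor, which replaces the original InputLayer as
# described in the comments above.
model = keras.Sequential([
    keras.Input(shape=(4,)),
    keras.layers.Dense(8, activation="relu"),
    keras.layers.Dense(1),
])

clone = keras.models.clone_model(model)  # new placeholders, new weights

new_input = keras.Input(shape=(4,), name="new_input")
clone_on_input = keras.models.clone_model(model, input_tensors=new_input)
assert clone_on_input.inputs[0].shape == new_input.shape
```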
- new_nodes = _make_new_nodes( - { - depth: nodes - for depth, nodes in model._nodes_by_depth.items() - if depth < 0 - }, layer_fn, layer_map, tensor_map) - _insert_ancillary_layers(cloned_model, ancillary_layers, model.metrics_names, - new_nodes) - return cloned_model - - -@keras_export('keras.models.clone_model') -def clone_model(model, input_tensors=None, clone_function=None): - """Clone a Functional or Sequential `Model` instance. - - Model cloning is similar to calling a model on new inputs, - except that it creates new layers (and thus new weights) instead - of sharing the weights of the existing layers. - - Note that - `clone_model` will not preserve the uniqueness of shared objects within the - model (e.g. a single variable attached to two distinct layers will be - restored as two separate variables). - - Args: - model: Instance of `Model` - (could be a Functional model or a Sequential model). - input_tensors: optional list of input tensors or InputLayer objects - to build the model upon. If not provided, - new `Input` objects will be created. - clone_function: Callable to be used to clone each layer in the target - model (except `InputLayer` instances). It takes as argument the layer - instance to be cloned, and returns the corresponding layer instance to - be used in the model copy. If unspecified, this callable defaults to - the following serialization/deserialization function: - `lambda layer: layer.__class__.from_config(layer.get_config())`. - By passing a custom callable, you can customize your copy of the - model, e.g. by wrapping certain layers of interest (you might want to - replace all `LSTM` instances with equivalent - `Bidirectional(LSTM(...))` instances, for example). - - Returns: - An instance of `Model` reproducing the behavior - of the original model, on top of new inputs tensors, - using newly instantiated weights. The cloned model may behave - differently from the original model if a custom `clone_function` - modifies the layer. - - Example: - - ```python - # Create a test Sequential model. - model = keras.Sequential([ - keras.Input(shape=(728,)), - keras.layers.Dense(32, activation='relu'), - keras.layers.Dense(1, activation='sigmoid'), - ]) - # Create a copy of the test model (with freshly initialized weights). - new_model = clone_model(model) - ``` - - Note that subclassed models cannot be cloned, since their internal - layer structure is not known. To achieve equivalent functionality - as `clone_model` in the case of a subclassed model, simply make sure - that the model class implements `get_config()` - (and optionally `from_config()`), and call: - - ```python - new_model = model.__class__.from_config(model.get_config()) - ``` - """ - with generic_utils.DisableSharedObjectScope(): - if clone_function is None: - clone_function = _clone_layer - if isinstance(model, Sequential): - return _clone_sequential_model( - model, input_tensors=input_tensors, layer_fn=clone_function) - else: - return _clone_functional_model( - model, input_tensors=input_tensors, layer_fn=clone_function) +@keras_export("keras.models.clone_model") +def clone_model(model, input_tensors=None, clone_function=None): + """Clone a Functional or Sequential `Model` instance. + + Model cloning is similar to calling a model on new inputs, + except that it creates new layers (and thus new weights) instead + of sharing the weights of the existing layers. + + Note that + `clone_model` will not preserve the uniqueness of shared objects within the + model (e.g. 
a single variable attached to two distinct layers will be + restored as two separate variables). + + Args: + model: Instance of `Model` + (could be a Functional model or a Sequential model). + input_tensors: optional list of input tensors or InputLayer objects + to build the model upon. If not provided, + new `Input` objects will be created. + clone_function: Callable to be used to clone each layer in the target + model (except `InputLayer` instances). It takes as argument the + layer instance to be cloned, and returns the corresponding layer + instance to be used in the model copy. If unspecified, this callable + becomes the following serialization/deserialization function: + `lambda layer: layer.__class__.from_config(layer.get_config())`. + By passing a custom callable, you can customize your copy of the + model, e.g. by wrapping certain layers of interest (you might want + to replace all `LSTM` instances with equivalent + `Bidirectional(LSTM(...))` instances, for example). + Defaults to `None`. + + Returns: + An instance of `Model` reproducing the behavior + of the original model, on top of new inputs tensors, + using newly instantiated weights. The cloned model may behave + differently from the original model if a custom `clone_function` + modifies the layer. + + Example: + + ```python + # Create a test Sequential model. + model = keras.Sequential([ + keras.Input(shape=(728,)), + keras.layers.Dense(32, activation='relu'), + keras.layers.Dense(1, activation='sigmoid'), + ]) + # Create a copy of the test model (with freshly initialized weights). + new_model = clone_model(model) + ``` + + Note that subclassed models cannot be cloned, since their internal + layer structure is not known. To achieve equivalent functionality + as `clone_model` in the case of a subclassed model, simply make sure + that the model class implements `get_config()` + (and optionally `from_config()`), and call: + + ```python + new_model = model.__class__.from_config(model.get_config()) + ``` + """ + with serialization.DisableSharedObjectScope(): + if isinstance(model, Sequential): + return _clone_sequential_model( + model, input_tensors=input_tensors, layer_fn=clone_function + ) + if isinstance(model, functional.Functional): + # If the get_config() method is the same as a regular Functional + # model, we're safe to use _clone_functional_model (which relies + # on a Functional constructor). In the case where the get_config + # is custom, this may not necessarily work, but if clone_function + # or input_tensors are passed, we attempt it anyway + # in order to preserve backwards compatibility. + if generic_utils.is_default(model.get_config) or ( + clone_function or input_tensors + ): + return _clone_functional_model( + model, input_tensors=input_tensors, layer_fn=clone_function + ) + + # Case of a custom model class + if clone_function or input_tensors: + raise ValueError( + "Arguments clone_function and input_tensors " + "are only supported for Sequential models " + "or Functional models. Received model of " + f"type '{model.__class__.__name__}', with " + f"clone_function={clone_function} and " + f"input_tensors={input_tensors}" + ) + # Note that a custom object scope may be required in this case. + return model.__class__.from_config(model.get_config()) # "Clone" a subclassed model by resetting all of the attributes. def _in_place_subclassed_model_reset(model): - """Substitute for model cloning that works for subclassed models. - - Subclassed models cannot be cloned because their topology is not serializable. 
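The `clone_function` hook documented above supports layer substitution. Below is a hedged sketch of the LSTM-to-`Bidirectional(LSTM(...))` swap the docstring mentions; the helper name `swap_lstms` is ours, not from the diff.

```python
import keras

def swap_lstms(layer):
    # Replace every LSTM with a Bidirectional wrapper around an
    # equivalently configured LSTM; default-clone everything else.
    if isinstance(layer, keras.layers.LSTM):
        return keras.layers.Bidirectional(
            keras.layers.LSTM.from_config(layer.get_config())
        )
    return layer.__class__.from_config(layer.get_config())

inputs = keras.Input(shape=(3, 8))
model = keras.Model(inputs, keras.layers.LSTM(16)(inputs))
new_model = keras.models.clone_model(model, clone_function=swap_lstms)
```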
- To "instantiate" an identical model in a new TF graph, we reuse the original - model object, but we clear its state. - - After calling this function on a model instance, you can use the model - instance as if it were a model clone (in particular you can use it in a new - graph). - - This method clears the state of the input model. It is thus destructive. - However the original state can be restored fully by calling - `_in_place_subclassed_model_state_restoration`. - - Args: - model: Instance of a Keras model created via subclassing. - - Raises: - ValueError: In case the model uses a subclassed model as inner layer. - """ - assert not model._is_graph_network # Only makes sense for subclassed networks - # Select correct base class for new Model. - version_utils.swap_class(model.__class__, training.Model, training_v1.Model, - tf.compat.v1.executing_eagerly_outside_functions()) - # Retrieve all layers tracked by the model as well as their attribute names - attributes_cache = {} - for name in dir(model): - # Skip attrs that track other trackables. - if name == 'submodules' or name == '_self_tracked_trackables': - continue - - try: - value = getattr(model, name) - except (AttributeError, ValueError, TypeError): - continue - if isinstance(value, Layer): - attributes_cache[name] = value - assert value in model.layers - if hasattr(value, 'layers') and value.layers: - raise ValueError('We do not support the use of nested layers ' - 'in `model_to_estimator` at this time. Found nested ' - f'layer: {value}') - elif isinstance( - value, (list, tuple)) and name not in ('layers', '_layers', 'metrics', - '_compile_metric_functions', - '_output_loss_metrics'): - # Handle case: list/tuple of layers (also tracked by the Network API). - if value and all(isinstance(val, Layer) for val in value): - raise ValueError('We do not support the use of list-of-layers ' - 'attributes in subclassed models used with ' - '`model_to_estimator` at this time. Found list ' - f'model: {name}') - - # Replace layers on the model with fresh layers - layers_to_names = {value: key for key, value in attributes_cache.items()} - original_layers = list( - model._flatten_layers(include_self=False, recursive=False)) - setattr_tracking = model._setattr_tracking - model._setattr_tracking = False - model._self_tracked_trackables = [] - for layer in original_layers: # We preserve layer order. - config = layer.get_config() - # This will not work for nested subclassed models used as layers. - # This would be theoretically possible to support, but would add complexity. - # Only do it if users complain. - if isinstance(layer, training.Model) and not layer._is_graph_network: - raise ValueError('We do not support the use of nested subclassed models ' - 'in `model_to_estimator` at this time. 
Found nested ' - f'model: {layer}') - fresh_layer = layer.__class__.from_config(config) - name = layers_to_names[layer] - setattr(model, name, fresh_layer) - model._self_tracked_trackables.append(fresh_layer) - - # Cache original model build attributes (in addition to layers) - if (not hasattr(model, '_original_attributes_cache') or - model._original_attributes_cache is None): - if model.built: - attributes_to_cache = [ - 'inputs', - 'outputs', - 'total_loss', - 'optimizer', - 'train_function', - 'test_function', - 'predict_function', - '_training_endpoints', - '_collected_trainable_weights', - '_feed_inputs', - '_feed_input_names', - '_feed_input_shapes', - ] - for name in attributes_to_cache: - attributes_cache[name] = getattr(model, name) - model._original_attributes_cache = attributes_cache - _reset_build_compile_trackers(model) - model._setattr_tracking = setattr_tracking + """Substitute for model cloning that works for subclassed models. + + Subclassed models cannot be cloned because their topology is not + serializable. To "instantiate" an identical model in a new TF graph, we + reuse the original model object, but we clear its state. + + After calling this function on a model instance, you can use the model + instance as if it were a model clone (in particular you can use it in a new + graph). + + This method clears the state of the input model. It is thus destructive. + However the original state can be restored fully by calling + `_in_place_subclassed_model_state_restoration`. + + Args: + model: Instance of a Keras model created via subclassing. + + Raises: + ValueError: In case the model uses a subclassed model as inner layer. + """ + assert ( + not model._is_graph_network + ) # Only makes sense for subclassed networks + # Select correct base class for new Model. + version_utils.swap_class( + model.__class__, + training.Model, + training_v1.Model, + tf.compat.v1.executing_eagerly_outside_functions(), + ) + # Retrieve all layers tracked by the model as well as their attribute names + attributes_cache = {} + for name in dir(model): + # Skip attrs that track other trackables. + if name == "submodules" or name == "_self_tracked_trackables": + continue + + try: + value = getattr(model, name) + except (AttributeError, ValueError, TypeError): + continue + if isinstance(value, Layer): + attributes_cache[name] = value + assert value in model.layers + if hasattr(value, "layers") and value.layers: + raise ValueError( + "We do not support the use of nested layers " + "in `model_to_estimator` at this time. Found nested " + f"layer: {value}" + ) + elif isinstance(value, (list, tuple)) and name not in ( + "layers", + "_layers", + "metrics", + "_compile_metric_functions", + "_output_loss_metrics", + ): + # Handle case: list/tuple of layers (also tracked by the Network + # API). + if value and all(isinstance(val, Layer) for val in value): + raise ValueError( + "We do not support the use of list-of-layers " + "attributes in subclassed models used with " + "`model_to_estimator` at this time. Found list " + f"model: {name}" + ) + + # Replace layers on the model with fresh layers + layers_to_names = {value: key for key, value in attributes_cache.items()} + original_layers = list( + model._flatten_layers(include_self=False, recursive=False) + ) + setattr_tracking = model._setattr_tracking + model._setattr_tracking = False + model._self_tracked_trackables = [] + for layer in original_layers: # We preserve layer order. 
+ config = layer.get_config() + # This will not work for nested subclassed models used as layers. + # This would be theoretically possible to support, but would add + # complexity. Only do it if users complain. + if isinstance(layer, training.Model) and not layer._is_graph_network: + raise ValueError( + "We do not support the use of nested subclassed models " + "in `model_to_estimator` at this time. Found nested " + f"model: {layer}" + ) + fresh_layer = layer.__class__.from_config(config) + name = layers_to_names[layer] + setattr(model, name, fresh_layer) + model._self_tracked_trackables.append(fresh_layer) + + # Cache original model build attributes (in addition to layers) + if ( + not hasattr(model, "_original_attributes_cache") + or model._original_attributes_cache is None + ): + if model.built: + attributes_to_cache = [ + "inputs", + "outputs", + "total_loss", + "optimizer", + "train_function", + "test_function", + "predict_function", + "_training_endpoints", + "_collected_trainable_weights", + "_feed_inputs", + "_feed_input_names", + "_feed_input_shapes", + ] + for name in attributes_to_cache: + attributes_cache[name] = getattr(model, name) + model._original_attributes_cache = attributes_cache + _reset_build_compile_trackers(model) + model._setattr_tracking = setattr_tracking def _reset_build_compile_trackers(model): - """Reset state trackers for model. - - Note that we do not actually zero out attributes such as optimizer, - but instead rely on the expectation that all of the attrs will be - over-written on calling build/compile/etc. This is somewhat fragile, - insofar as we check elsewhere for the presence of these attributes as - evidence of having been built/compiled/etc. Pending a better way to do this, - we reset key attributes here to allow building and compiling. - - Args: - model: the model that is being reset - """ - # Reset build state - model.built = False - model.inputs = None - model.outputs = None - # Reset compile state - model._is_compiled = False # pylint:disable=protected-access - if not tf.compat.v1.executing_eagerly_outside_functions(): - model._v1_compile_was_called = False - model.optimizer = None + """Reset state trackers for model. + + Note that we do not actually zero out attributes such as optimizer, + but instead rely on the expectation that all of the attrs will be + over-written on calling build/compile/etc. This is somewhat fragile, + insofar as we check elsewhere for the presence of these attributes as + evidence of having been built/compiled/etc. Pending a better way to do this, + we reset key attributes here to allow building and compiling. + + Args: + model: the model that is being reset + """ + # Reset build state + model.built = False + model.inputs = None + model.outputs = None + # Reset compile state + model._is_compiled = False + if not tf.compat.v1.executing_eagerly_outside_functions(): + model._v1_compile_was_called = False + model.optimizer = None @keras_export( - 'keras.__internal__.models.in_place_subclassed_model_state_restoration', - v1=[]) + "keras.__internal__.models.in_place_subclassed_model_state_restoration", + v1=[], +) def in_place_subclassed_model_state_restoration(model): - """Restores the original state of a model after it was "reset". - - This undoes this action of `_in_place_subclassed_model_reset`, which is called - in `clone_and_build_model` if `in_place_reset` is set to True. - - Args: - model: Instance of a Keras model created via subclassing, on which - `_in_place_subclassed_model_reset` was previously called. 
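Because the reset path above is destructive, the docstrings in this diff repeatedly point to a safer alternative for subclassed models: implement `get_config` so the class can be rebuilt via `from_config`. A brief sketch, where the `SmallModel` class is hypothetical:

```python
import tensorflow.compat.v2 as tf
import keras

class SmallModel(keras.Model):
    def __init__(self, units=4, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.dense = keras.layers.Dense(units)

    def call(self, x):
        return self.dense(x)

    def get_config(self):
        return {"units": self.units}

model = SmallModel(units=8)
model(tf.zeros((1, 4)))  # build the original
fresh = SmallModel.from_config(model.get_config())  # same config, new weights
```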
- """ - assert not model._is_graph_network - # Restore layers and build attributes - if (hasattr(model, '_original_attributes_cache') and - model._original_attributes_cache is not None): - # Models have sticky attribute assignment, so we want to be careful to add - # back the previous attributes and track Layers by their original names - # without adding dependencies on "utility" attributes which Models exempt - # when they're constructed. - setattr_tracking = model._setattr_tracking - model._setattr_tracking = False - model._self_tracked_trackables = [] - for name, value in model._original_attributes_cache.items(): - setattr(model, name, value) - if isinstance(value, Layer): - model._self_tracked_trackables.append(value) - model._original_attributes_cache = None - model._setattr_tracking = setattr_tracking - else: - # Restore to the state of a never-called model. - _reset_build_compile_trackers(model) + """Restores the original state of a model after it was "reset". + + This undoes this action of `_in_place_subclassed_model_reset`, which is + called in `clone_and_build_model` if `in_place_reset` is set to True. + + Args: + model: Instance of a Keras model created via subclassing, on which + `_in_place_subclassed_model_reset` was previously called. + """ + assert not model._is_graph_network + # Restore layers and build attributes + if ( + hasattr(model, "_original_attributes_cache") + and model._original_attributes_cache is not None + ): + # Models have sticky attribute assignment, so we want to be careful to + # add back the previous attributes and track Layers by their original + # names without adding dependencies on "utility" attributes which Models + # exempt when they're constructed. + setattr_tracking = model._setattr_tracking + model._setattr_tracking = False + model._self_tracked_trackables = [] + for name, value in model._original_attributes_cache.items(): + setattr(model, name, value) + if isinstance(value, Layer): + model._self_tracked_trackables.append(value) + model._original_attributes_cache = None + model._setattr_tracking = setattr_tracking + else: + # Restore to the state of a never-called model. + _reset_build_compile_trackers(model) -@keras_export('keras.__internal__.models.clone_and_build_model', v1=[]) +@keras_export("keras.__internal__.models.clone_and_build_model", v1=[]) def clone_and_build_model( - model, input_tensors=None, target_tensors=None, custom_objects=None, - compile_clone=True, in_place_reset=False, optimizer_iterations=None, - optimizer_config=None): - """Clone a `Model` and build/compile it with the same settings used before. - - This function can be run in the same graph or in a separate graph from the - model. When using a separate graph, `in_place_reset` must be `False`. - - Note that, currently, the clone produced from this function may not work with - TPU DistributionStrategy. Try at your own risk. - - Args: - model: `tf.keras.Model` object. Can be Functional, Sequential, or - sub-classed. - input_tensors: Optional list or dictionary of input tensors to build the - model upon. If not provided, placeholders will be created. - target_tensors: Optional list of target tensors for compiling the model. If - not provided, placeholders will be created. - custom_objects: Optional dictionary mapping string names to custom classes - or functions. - compile_clone: Boolean, whether to compile model clone (default `True`). - in_place_reset: Boolean, whether to reset the model in place. Only used if - the model is a subclassed model. 
In the case of a subclassed model, - this argument must be set to `True` (default `False`). To restore the - original model, use the function - `in_place_subclassed_model_state_restoration(model)`. - optimizer_iterations: An iterations variable that will be incremented by the - optimizer if the clone is compiled. This argument is used when a Keras - model is cloned into an Estimator model function, because Estimators - create their own global step variable. - optimizer_config: Optimizer config dictionary or list of dictionary - returned from `get_config()`. This argument should be defined if - `clone_and_build_model` is called in a different graph or session from - the original model, and the optimizer is an instance of `OptimizerV2`. - - Returns: - Clone of the model. - - Raises: - ValueError: Cloning fails in the following cases - - cloning a subclassed model with `in_place_reset` set to False. - - compiling the clone when the original model has not been compiled. - """ - # Grab optimizer now, as we reset-in-place for subclassed models, but - # want to maintain access to the original optimizer. - orig_optimizer = model.optimizer - if compile_clone and not orig_optimizer: - raise ValueError( - 'Error when cloning model: `compile_clone` was set to True, but the ' - f'original model has not been compiled. Received: model={model}') - - if compile_clone: - compile_args = model._get_compile_args() # pylint: disable=protected-access - # Allows this method to be robust to switching graph and eager classes. - model._get_compile_args = lambda: compile_args - - with CustomObjectScope(custom_objects or {}): - if model._is_graph_network: - clone = clone_model(model, input_tensors=input_tensors) - elif isinstance(model, Sequential): - clone = clone_model(model, input_tensors=input_tensors) - if (not clone._is_graph_network and model._build_input_shape is not None): - if tf.compat.v1.executing_eagerly_outside_functions(): - clone.build(model._build_input_shape) + model, + input_tensors=None, + target_tensors=None, + custom_objects=None, + compile_clone=True, + in_place_reset=False, + optimizer_iterations=None, + optimizer_config=None, +): + """Clone a `Model` and build/compile it with the same settings used before. + + This function can be run in the same graph or in a separate graph from the + model. When using a separate graph, `in_place_reset` must be `False`. + + Note that, currently, the clone produced from this function may not work + with TPU DistributionStrategy. Try at your own risk. + + Args: + model: `tf.keras.Model` object. Can be Functional, Sequential, or + sub-classed. + input_tensors: Optional list or dictionary of input tensors to build the + model upon. If not provided, placeholders will be created. + target_tensors: Optional list of target tensors for compiling the model. + If not provided, placeholders will be created. + custom_objects: Optional dictionary mapping string names to custom classes + or functions. + compile_clone: Boolean, whether to compile model clone (default `True`). + in_place_reset: Boolean, whether to reset the model in place. Only used if + the model is a subclassed model. In the case of a subclassed model, + this argument must be set to `True` (default `False`). To restore the + original model, use the function + `in_place_subclassed_model_state_restoration(model)`. + optimizer_iterations: An iterations variable that will be incremented by + the optimizer if the clone is compiled. 
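A hedged usage sketch for the signature documented above, mirroring the import style of the tests later in this diff; the toy model and data are ours.

```python
import numpy as np
import keras
from keras import models

# Compile the original first; `compile_clone=True` requires a compiled
# model, per the ValueError raised in the body below.
inputs = keras.Input(shape=(4,))
model = keras.Model(inputs, keras.layers.Dense(1)(inputs))
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])

clone = models.clone_and_build_model(model, compile_clone=True)
clone.train_on_batch(np.random.random((2, 4)), np.random.random((2, 1)))
```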
This argument is used when a + Keras model is cloned into an Estimator model function, because + Estimators create their own global step variable. + optimizer_config: Optimizer config dictionary or list of dictionary + returned from `get_config()`. This argument should be defined if + `clone_and_build_model` is called in a different graph or session from + the original model, and the optimizer is an instance of `OptimizerV2`. + + Returns: + Clone of the model. + + Raises: + ValueError: Cloning fails in the following cases + - cloning a subclassed model with `in_place_reset` set to False. + - compiling the clone when the original model has not been compiled. + """ + # Grab optimizer now, as we reset-in-place for subclassed models, but + # want to maintain access to the original optimizer. + orig_optimizer = model.optimizer + if compile_clone and not orig_optimizer: + raise ValueError( + "Error when cloning model: `compile_clone` was set to True, but " + f"the original model has not been compiled. Received: model={model}" + ) + + if compile_clone: + compile_args = model._get_compile_args() + # Allows this method to be robust to switching graph and eager classes. + model._get_compile_args = lambda: compile_args + + with CustomObjectScope(custom_objects or {}): + if model._is_graph_network: + clone = clone_model(model, input_tensors=input_tensors) + elif isinstance(model, Sequential): + clone = clone_model(model, input_tensors=input_tensors) + if ( + not clone._is_graph_network + and model._build_input_shape is not None + ): + if tf.compat.v1.executing_eagerly_outside_functions(): + clone.build(model._build_input_shape) + else: + clone._set_inputs( + backend.placeholder( + model._build_input_shape, + dtype=model.inputs[0].dtype, + ) + ) else: - clone._set_inputs( - backend.placeholder( - model._build_input_shape, dtype=model.inputs[0].dtype)) - else: - try: - # Prefer cloning the model if serial/deserial logic is implemented for - # subclassed model. - clone = model.__class__.from_config(model.get_config()) - except NotImplementedError: - logging.warning('This model is a subclassed model. Please implement ' - '`get_config` and `from_config` to better support ' - 'cloning the model.') - if not in_place_reset: - raise ValueError( - f'This model ({model}) is a subclassed model. ' - 'Such a model cannot be cloned, but there is a workaround where ' - 'the model is reset in-place. To use this, please set the ' - 'argument `in_place_reset` to `True`. This will reset the ' - 'attributes in the original model. To restore the attributes, ' - 'call `in_place_subclassed_model_state_restoration(model)`.') - clone = model - _in_place_subclassed_model_reset(clone) - if input_tensors is not None: - if isinstance(input_tensors, (list, tuple)) and len(input_tensors) == 1: - input_tensors = input_tensors[0] - clone._set_inputs(input_tensors) - - if compile_clone: - if isinstance(orig_optimizer, optimizer_v1.TFOptimizer): - optimizer = optimizer_v1.TFOptimizer( - orig_optimizer.optimizer, optimizer_iterations) - backend.track_tf_optimizer(optimizer) - else: - if not isinstance(orig_optimizer, (tuple, list)): - orig_optimizer = [orig_optimizer] - if optimizer_config is None: - optimizer = [ - opt.__class__.from_config(opt.get_config()) - for opt in orig_optimizer - ] - elif isinstance(optimizer_config, dict): - optimizer = [orig_optimizer[0].__class__.from_config(optimizer_config)] - else: - # optimizer config is list of dict, same order as orig_optimizer. 
- optimizer = [ - opt.__class__.from_config(opt_config) - for (opt, opt_config) in zip(orig_optimizer, optimizer_config) - ] - if optimizer_iterations is not None: - for opt in optimizer: - opt.iterations = optimizer_iterations - - if len(optimizer) == 1: - optimizer = optimizer[0] - - compile_args['optimizer'] = optimizer - if target_tensors is not None: - compile_args['target_tensors'] = target_tensors - # Ensure Metric objects in new model are separate from existing model. - compile_args['metrics'] = metrics_module.clone_metrics( - compile_args['metrics']) - compile_args['weighted_metrics'] = metrics_module.clone_metrics( - compile_args['weighted_metrics']) - clone.compile(**compile_args) - - return clone + try: + # Prefer cloning the model if serial/deserial logic is + # implemented for subclassed model. + clone = model.__class__.from_config(model.get_config()) + except NotImplementedError: + logging.warning( + "This model is a subclassed model. Please implement " + "`get_config` and `from_config` to better support " + "cloning the model." + ) + if not in_place_reset: + raise ValueError( + f"This model ({model}) is a subclassed model. " + "Such a model cannot be cloned, but there is a " + "workaround where the model is reset in-place. " + "To use this, please set the " + "argument `in_place_reset` to `True`. This will reset " + "the attributes in the original model. " + "To restore the attributes, call " + "`in_place_subclassed_model_state_restoration(model)`." + ) + clone = model + _in_place_subclassed_model_reset(clone) + if input_tensors is not None: + if ( + isinstance(input_tensors, (list, tuple)) + and len(input_tensors) == 1 + ): + input_tensors = input_tensors[0] + clone._set_inputs(input_tensors) + + if compile_clone: + if isinstance(orig_optimizer, optimizer_v1.TFOptimizer): + optimizer = optimizer_v1.TFOptimizer( + orig_optimizer.optimizer, optimizer_iterations + ) + backend.track_tf_optimizer(optimizer) + else: + if not isinstance(orig_optimizer, (tuple, list)): + orig_optimizer = [orig_optimizer] + if optimizer_config is None: + optimizer = [ + opt.__class__.from_config(opt.get_config()) + for opt in orig_optimizer + ] + elif isinstance(optimizer_config, dict): + optimizer = [ + orig_optimizer[0].__class__.from_config(optimizer_config) + ] + else: + # optimizer config is list of dict, same order as + # orig_optimizer. + optimizer = [ + opt.__class__.from_config(opt_config) + for (opt, opt_config) in zip( + orig_optimizer, optimizer_config + ) + ] + if optimizer_iterations is not None: + for opt in optimizer: + opt.iterations = optimizer_iterations + + if len(optimizer) == 1: + optimizer = optimizer[0] + + compile_args["optimizer"] = optimizer + if target_tensors is not None: + compile_args["target_tensors"] = target_tensors + # Ensure Metric objects in new model are separate from existing model. 
+ compile_args["metrics"] = metrics_module.clone_metrics( + compile_args["metrics"] + ) + compile_args["weighted_metrics"] = metrics_module.clone_metrics( + compile_args["weighted_metrics"] + ) + clone.compile(**compile_args) + + return clone diff --git a/keras/models/cloning_test.py b/keras/models/cloning_test.py index f95423d57be0..ed79dcaa521d 100644 --- a/keras/models/cloning_test.py +++ b/keras/models/cloning_test.py @@ -14,559 +14,652 @@ # ============================================================================== """Tests for `models.py` (model cloning, mainly).""" -import tensorflow.compat.v2 as tf - import functools import os -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras from keras import backend -from keras.testing_infra import test_combinations from keras import metrics from keras import models from keras.optimizers import optimizer_v1 +from keras.testing_infra import test_combinations from keras.testing_infra import test_utils class TestModel(keras.Model): - """A model subclass.""" + """A model subclass.""" - def __init__(self, n_outputs=4, trainable=True): - """A test class with one dense layer and number of outputs as a variable.""" - super().__init__() - self.layer1 = keras.layers.Dense(n_outputs) - self.n_outputs = tf.Variable(n_outputs, trainable=trainable) + def __init__(self, n_outputs=4, trainable=True): + """A test class with one dense layer and number of outputs as a + variable.""" + super().__init__() + self.layer1 = keras.layers.Dense(n_outputs) + self.n_outputs = tf.Variable(n_outputs, trainable=trainable) - def call(self, x): - return self.layer1(x) + def call(self, x): + return self.layer1(x) def _get_layers(input_shape=(4,), add_input_layer=False): - if add_input_layer: - model_layers = [keras.layers.InputLayer(input_shape=input_shape), - keras.layers.Dense(4)] - elif input_shape: - model_layers = [keras.layers.Dense(4, input_shape=input_shape)] - else: - model_layers = [keras.layers.Dense(4)] + if add_input_layer: + model_layers = [ + keras.layers.InputLayer(input_shape=input_shape), + keras.layers.Dense(4), + ] + elif input_shape: + model_layers = [keras.layers.Dense(4, input_shape=input_shape)] + else: + model_layers = [keras.layers.Dense(4)] - model_layers += [ - keras.layers.BatchNormalization(), - keras.layers.Dropout(0.5), - keras.layers.Dense(4)] + model_layers += [ + keras.layers.BatchNormalization(), + keras.layers.Dropout(0.5), + keras.layers.Dense(4), + ] - return model_layers + return model_layers def _get_model(input_shape=(4,)): - model_layers = _get_layers(input_shape=None, add_input_layer=False) - return test_utils.get_model_from_layers( - model_layers, input_shape=input_shape) + model_layers = _get_layers(input_shape=None, add_input_layer=False) + return test_utils.get_model_from_layers( + model_layers, input_shape=input_shape + ) class TestModelCloning(test_combinations.TestCase): + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + { + "testcase_name": "has_input_layer", + "input_shape": (4,), + "add_input_layer": True, + "share_weights": False, + }, + { + "testcase_name": "no_input_layer", + "input_shape": None, + "add_input_layer": False, + "share_weights": False, + }, + { + "testcase_name": "has_input_layer_share_weights", + "input_shape": (4,), + "add_input_layer": True, + "share_weights": True, + }, + { + "testcase_name": "no_input_layer_share_weights", + "input_shape": None, + "add_input_layer": False, + 
"share_weights": True, + }, + ] + ) + def test_clone_sequential_model( + self, input_shape, add_input_layer, share_weights + ): - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - {'testcase_name': 'has_input_layer', - 'input_shape': (4,), - 'add_input_layer': True, - 'share_weights': False}, - {'testcase_name': 'no_input_layer', - 'input_shape': None, - 'add_input_layer': False, - 'share_weights': False}, - {'testcase_name': 'has_input_layer_share_weights', - 'input_shape': (4,), - 'add_input_layer': True, - 'share_weights': True}, - {'testcase_name': 'no_input_layer_share_weights', - 'input_shape': None, - 'add_input_layer': False, - 'share_weights': True}, - ]) - def test_clone_sequential_model( - self, input_shape, add_input_layer, share_weights): - - if share_weights: - clone_fn = functools.partial( - keras.models._clone_sequential_model, layer_fn=models.share_weights) - else: - clone_fn = keras.models.clone_model - - val_a = np.random.random((10, 4)) - model = models.Sequential(_get_layers(input_shape, add_input_layer)) - # Sanity check - self.assertEqual( - isinstance( - list(model._flatten_layers(include_self=False, recursive=False))[0], - keras.layers.InputLayer), add_input_layer) - self.assertEqual(model._is_graph_network, add_input_layer) - - # With placeholder creation -- clone model should have an InputLayer - # if the original model has one. - new_model = clone_fn(model) - self.assertEqual( - isinstance( + if share_weights: + clone_fn = functools.partial( + keras.models._clone_sequential_model, + layer_fn=models.share_weights, + ) + else: + clone_fn = keras.models.clone_model + + val_a = np.random.random((10, 4)) + model = models.Sequential(_get_layers(input_shape, add_input_layer)) + # Sanity check + self.assertEqual( + isinstance( + list( + model._flatten_layers(include_self=False, recursive=False) + )[0], + keras.layers.InputLayer, + ), + add_input_layer, + ) + self.assertEqual(model._is_graph_network, add_input_layer) + + # With placeholder creation -- clone model should have an InputLayer + # if the original model has one. + new_model = clone_fn(model) + self.assertEqual( + isinstance( + list( + new_model._flatten_layers( + include_self=False, recursive=False + ) + )[0], + keras.layers.InputLayer, + ), + add_input_layer, + ) + self.assertEqual(new_model._is_graph_network, model._is_graph_network) + if ( + input_shape + and not tf.compat.v1.executing_eagerly_outside_functions() + ): + # update ops from batch norm needs to be included + self.assertGreaterEqual(len(new_model.updates), 2) + + # On top of new tensor -- clone model should always have an InputLayer. + input_a = keras.Input(shape=(4,), name="a") + new_model = clone_fn(model, input_tensors=input_a) + self.assertIsInstance( list( - new_model._flatten_layers(include_self=False, - recursive=False))[0], - keras.layers.InputLayer), add_input_layer) - self.assertEqual(new_model._is_graph_network, model._is_graph_network) - if input_shape and not tf.compat.v1.executing_eagerly_outside_functions(): - # update ops from batch norm needs to be included - self.assertGreaterEqual(len(new_model.updates), 2) - - # On top of new tensor -- clone model should always have an InputLayer. 
- input_a = keras.Input(shape=(4,), name='a') - new_model = clone_fn(model, input_tensors=input_a) - self.assertIsInstance( - list(new_model._flatten_layers(include_self=False, recursive=False))[0], - keras.layers.InputLayer) - # The new models inputs should have the properties of the new input tensor - if tf.__internal__.tf2.enabled(): - # In TF1, the new model will be a:0 - self.assertEqual(new_model.input_names[0], input_a.name) - self.assertEqual(new_model.inputs[0].shape, input_a.shape) - self.assertTrue(new_model._is_graph_network) - - # On top of new, non-Keras tensor -- clone model should always have an - # InputLayer. - if not tf.executing_eagerly(): - # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error - # saying they should not be used with EagerTensors - input_a = keras.backend.variable(val_a) - new_model = clone_fn(model, input_tensors=input_a) - self.assertIsInstance( - list(new_model._flatten_layers(include_self=False, - recursive=False))[0], - keras.layers.InputLayer) - self.assertTrue(new_model._is_graph_network) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - {'testcase_name': 'clone_weights', 'share_weights': False}, - {'testcase_name': 'share_weights', 'share_weights': True}, - ]) - def test_clone_functional_model(self, share_weights): - if share_weights: - clone_fn = functools.partial( - keras.models._clone_functional_model, layer_fn=models.share_weights) - else: - clone_fn = keras.models.clone_model - - val_a = np.random.random((10, 4)) - val_b = np.random.random((10, 4)) - val_out = np.random.random((10, 4)) - - input_a = keras.Input(shape=(4,)) - input_b = keras.Input(shape=(4,)) - dense_1 = keras.layers.Dense(4,) - dense_2 = keras.layers.Dense(4,) - - x_a = dense_1(input_a) - x_a = keras.layers.Dropout(0.5)(x_a) - x_a = keras.layers.BatchNormalization()(x_a) - x_b = dense_1(input_b) - x_a = dense_2(x_a) - outputs = keras.layers.add([x_a, x_b]) - model = keras.models.Model([input_a, input_b], outputs) - - # With placeholder creation - new_model = clone_fn(model) - if not tf.compat.v1.executing_eagerly_outside_functions(): - self.assertGreaterEqual(len(new_model.updates), 2) - new_model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - new_model.train_on_batch([val_a, val_b], val_out) - - # On top of new tensors - input_a = keras.Input(shape=(4,), name='a') - input_b = keras.Input(shape=(4,), name='b') - new_input_tensors = [input_a, input_b] - new_model = keras.models.clone_model(model, input_tensors=new_input_tensors) - if not tf.compat.v1.executing_eagerly_outside_functions(): - self.assertLen(new_model.updates, 2) - new_model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - new_model.train_on_batch([val_a, val_b], val_out) - - # New model should use provided input tensors - self.assertListEqual(new_model.inputs, new_input_tensors) - - # On top of new, non-Keras tensors - if not tf.executing_eagerly(): - # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error - # saying they should not be used with EagerTensors - input_a = keras.backend.variable(val_a) - input_b = keras.backend.variable(val_b) - new_model = clone_fn(model, input_tensors=[input_a, input_b]) - self.assertGreaterEqual(len(new_model.updates), 2) - new_model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - new_model.train_on_batch(None, val_out) - - 
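A compact sketch of the property the functional-model test asserts ("New model should use provided input tensors"): caller-provided `Input` tensors become the clone's inputs. The two-input toy model is ours.

```python
import keras

a = keras.Input(shape=(4,), name="a")
b = keras.Input(shape=(4,), name="b")
dense = keras.layers.Dense(4)
model = keras.Model([a, b], keras.layers.add([dense(a), dense(b)]))

new_a = keras.Input(shape=(4,), name="new_a")
new_b = keras.Input(shape=(4,), name="new_b")
clone = keras.models.clone_model(model, input_tensors=[new_a, new_b])
assert clone.inputs == [new_a, new_b]
```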
@test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - {'testcase_name': 'clone_weights', 'share_weights': False}, - {'testcase_name': 'share_weights', 'share_weights': True}, - ]) - def test_clone_functional_with_masking(self, share_weights): - if share_weights: - clone_fn = functools.partial( - keras.models._clone_functional_model, layer_fn=models.share_weights) - else: - clone_fn = keras.models.clone_model - - x = np.array([[[1.], [1.]], [[0.], [0.]]]) - inputs = keras.Input((2, 1)) - outputs = keras.layers.Masking(mask_value=0)(inputs) - outputs = keras.layers.TimeDistributed( - keras.layers.Dense(1, kernel_initializer='one'))(outputs) - model = keras.Model(inputs, outputs) - - model = clone_fn(model) - model.compile( - loss='mse', - optimizer=test_utils.get_v2_optimizer('adam'), - run_eagerly=test_utils.should_run_eagerly()) - y = np.array([[[1], [1]], [[1], [1]]]) - loss = model.train_on_batch(x, y) - self.assertEqual(float(loss), 0.) - - def test_clone_rnn(self): - # Test cloning a model with multiple cells in an RNN. This exercises a - # few "fancier" features such as the `Bidrectional` wrapper and - # `StackedRNNCells` under the hood. - inputs = keras.Input(shape=(3, 3)) - cells = [ - keras.layers.LSTMCell( - units=32, - enable_caching_device=True, - implementation=2, - activation='relu')] - rnn = keras.layers.RNN(cells, return_sequences=True) - outputs = keras.layers.Bidirectional(rnn)(inputs) - outputs = keras.layers.Dense( - 12, activation='softmax', name='scores')(outputs) - model = keras.Model(inputs=inputs, outputs=outputs) - model.compile( - loss=keras.losses.CategoricalCrossentropy(), - optimizer=keras.optimizers.optimizer_v2.rmsprop.RMSprop(lr=0.01), - metrics=['accuracy']) - keras.models.clone_model(model) - - def test_model_cloning_invalid_use_cases(self): - seq_model = keras.models.Sequential() - seq_model.add(keras.layers.Dense(4, input_shape=(4,))) - - x = keras.Input((4,)) - y = keras.layers.Dense(4)(x) - fn_model = keras.models.Model(x, y) - - with self.assertRaises(ValueError): - keras.models._clone_functional_model(seq_model) - with self.assertRaises(ValueError): - keras.models._clone_functional_model(None) - with self.assertRaises(ValueError): - keras.models._clone_sequential_model(fn_model) - - with self.assertRaises(ValueError): - keras.models._clone_sequential_model(seq_model, input_tensors=[x, x]) - with self.assertRaises(ValueError): - keras.models._clone_sequential_model(seq_model, input_tensors=y) - - def test_functional_cloning_does_not_create_unnecessary_placeholders(self): - with tf.Graph().as_default(): - x = keras.Input((4,)) - y = keras.layers.Dense(4)(x) - model = keras.models.Model(x, y) - graph = tf.Graph() - with graph.as_default(): - x = tf.ones((10, 4)) - _ = keras.models.clone_model(model, input_tensors=[x]) - has_placeholder = _has_placeholder(graph) - self.assertFalse(has_placeholder) - - def test_sequential_cloning_does_not_create_unnecessary_placeholders(self): - with tf.Graph().as_default(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(4, input_shape=(4,))) - graph = tf.Graph() - with graph.as_default(): - x = tf.ones((10, 4)) - _ = keras.models.clone_model(model, input_tensors=[x]) - has_placeholder = _has_placeholder(graph) - self.assertFalse(has_placeholder) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - {'testcase_name': 'clone_weights', 'share_weights': False}, - {'testcase_name': 'share_weights', 'share_weights': True}, - ]) - def 
test_functional_cloning_with_tensor_kwarg(self, share_weights): - """Test that cloning works with models that use Tensor kwargs.""" - - if share_weights: - clone_fn = functools.partial( - keras.models.clone_model, clone_function=models.share_weights) - else: - clone_fn = keras.models.clone_model - - class LayerWithTensorKwarg(keras.layers.Layer): - - def call(self, inputs, tensor=None): - if tensor is not None: - return inputs * tf.cast(tensor, tf.float32) + new_model._flatten_layers(include_self=False, recursive=False) + )[0], + keras.layers.InputLayer, + ) + # The new models inputs should have the properties of the new input + # tensor + if tf.__internal__.tf2.enabled(): + # In TF1, the new model will be a:0 + self.assertEqual(new_model.input_names[0], input_a.name) + self.assertEqual(new_model.inputs[0].shape, input_a.shape) + self.assertTrue(new_model._is_graph_network) + + # On top of new, non-Keras tensor -- clone model should always have an + # InputLayer. + if not tf.executing_eagerly(): + # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an + # error saying they should not be used with EagerTensors + input_a = keras.backend.variable(val_a) + new_model = clone_fn(model, input_tensors=input_a) + self.assertIsInstance( + list( + new_model._flatten_layers( + include_self=False, recursive=False + ) + )[0], + keras.layers.InputLayer, + ) + self.assertTrue(new_model._is_graph_network) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + {"testcase_name": "clone_weights", "share_weights": False}, + {"testcase_name": "share_weights", "share_weights": True}, + ] + ) + def test_clone_functional_model(self, share_weights): + if share_weights: + clone_fn = functools.partial( + keras.models._clone_functional_model, + layer_fn=models.share_weights, + ) else: - return inputs + clone_fn = keras.models.clone_model + + val_a = np.random.random((10, 4)) + val_b = np.random.random((10, 4)) + val_out = np.random.random((10, 4)) + + input_a = keras.Input(shape=(4,)) + input_b = keras.Input(shape=(4,)) + dense_1 = keras.layers.Dense( + 4, + ) + dense_2 = keras.layers.Dense( + 4, + ) + + x_a = dense_1(input_a) + x_a = keras.layers.Dropout(0.5)(x_a) + x_a = keras.layers.BatchNormalization()(x_a) + x_b = dense_1(input_b) + x_a = dense_2(x_a) + outputs = keras.layers.add([x_a, x_b]) + model = keras.models.Model([input_a, input_b], outputs) + + # With placeholder creation + new_model = clone_fn(model) + if not tf.compat.v1.executing_eagerly_outside_functions(): + self.assertGreaterEqual(len(new_model.updates), 2) + new_model.compile( + test_utils.get_v2_optimizer("rmsprop"), + "mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + new_model.train_on_batch([val_a, val_b], val_out) + + # On top of new tensors + input_a = keras.Input(shape=(4,), name="a") + input_b = keras.Input(shape=(4,), name="b") + new_input_tensors = [input_a, input_b] + new_model = keras.models.clone_model( + model, input_tensors=new_input_tensors + ) + if not tf.compat.v1.executing_eagerly_outside_functions(): + self.assertLen(new_model.updates, 2) + new_model.compile( + test_utils.get_v2_optimizer("rmsprop"), + "mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + new_model.train_on_batch([val_a, val_b], val_out) + + # New model should use provided input tensors + self.assertListEqual(new_model.inputs, new_input_tensors) + + # On top of new, non-Keras tensors + if not tf.executing_eagerly(): + # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an + # error saying they 
should not be used with EagerTensors + input_a = keras.backend.variable(val_a) + input_b = keras.backend.variable(val_b) + new_model = clone_fn(model, input_tensors=[input_a, input_b]) + self.assertGreaterEqual(len(new_model.updates), 2) + new_model.compile( + test_utils.get_v2_optimizer("rmsprop"), + "mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + new_model.train_on_batch(None, val_out) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + {"testcase_name": "clone_weights", "share_weights": False}, + {"testcase_name": "share_weights", "share_weights": True}, + ] + ) + def test_clone_functional_with_masking(self, share_weights): + if share_weights: + clone_fn = functools.partial( + keras.models._clone_functional_model, + layer_fn=models.share_weights, + ) + else: + clone_fn = keras.models.clone_model - inputs = keras.layers.Input(shape=(3)) - t = tf.sequence_mask(tf.shape(inputs)[1]) - model = keras.models.Model(inputs, LayerWithTensorKwarg()(inputs, t)) - model.add_loss(tf.reduce_sum(model.outputs)) + x = np.array([[[1.0], [1.0]], [[0.0], [0.0]]]) + inputs = keras.Input((2, 1)) + outputs = keras.layers.Masking(mask_value=0)(inputs) + outputs = keras.layers.TimeDistributed( + keras.layers.Dense(1, kernel_initializer="one") + )(outputs) + model = keras.Model(inputs, outputs) - input_arr = np.random.random((1, 3)).astype(np.float32) - clone = clone_fn(model) + model = clone_fn(model) + model.compile( + loss="mse", + optimizer=test_utils.get_v2_optimizer("adam"), + run_eagerly=test_utils.should_run_eagerly(), + ) + y = np.array([[[1], [1]], [[1], [1]]]) + loss = model.train_on_batch(x, y) + self.assertEqual(float(loss), 0.0) + + def test_clone_rnn(self): + # Test cloning a model with multiple cells in an RNN. This exercises a + # few "fancier" features such as the `Bidrectional` wrapper and + # `StackedRNNCells` under the hood. 
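A short sketch of the wrapper structure this RNN test exercises, using two cells instead of one to make the `StackedRNNCells` point explicit; cell sizes are our choice.

```python
import keras

cells = [keras.layers.LSTMCell(8), keras.layers.LSTMCell(8)]
rnn = keras.layers.RNN(cells, return_sequences=True)  # stacks the cells
inputs = keras.Input(shape=(5, 3))
outputs = keras.layers.Bidirectional(rnn)(inputs)  # forward + backward copy
model = keras.Model(inputs, outputs)

# `clone_model` has to reconstruct both wrappers faithfully.
clone = keras.models.clone_model(model)
```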
+ inputs = keras.Input(shape=(3, 3)) + cells = [ + keras.layers.LSTMCell( + units=32, + enable_caching_device=True, + implementation=2, + activation="relu", + ) + ] + rnn = keras.layers.RNN(cells, return_sequences=True) + outputs = keras.layers.Bidirectional(rnn)(inputs) + outputs = keras.layers.Dense(12, activation="softmax", name="scores")( + outputs + ) + model = keras.Model(inputs=inputs, outputs=outputs) + model.compile( + loss=keras.losses.CategoricalCrossentropy(), + optimizer=keras.optimizers.legacy.rmsprop.RMSprop(lr=0.01), + metrics=["accuracy"], + ) + keras.models.clone_model(model) + + def test_model_cloning_invalid_use_cases(self): + seq_model = keras.models.Sequential() + seq_model.add(keras.layers.Dense(4, input_shape=(4,))) + + x = keras.Input((4,)) + y = keras.layers.Dense(4)(x) + fn_model = keras.models.Model(x, y) + + with self.assertRaises(ValueError): + keras.models._clone_functional_model(seq_model) + with self.assertRaises(ValueError): + keras.models._clone_functional_model(None) + with self.assertRaises(ValueError): + keras.models._clone_sequential_model(fn_model) + + with self.assertRaises(ValueError): + keras.models._clone_sequential_model( + seq_model, input_tensors=[x, x] + ) + with self.assertRaises(ValueError): + keras.models._clone_sequential_model(seq_model, input_tensors=y) + + def test_functional_cloning_does_not_create_unnecessary_placeholders(self): + with tf.Graph().as_default(): + x = keras.Input((4,)) + y = keras.layers.Dense(4)(x) + model = keras.models.Model(x, y) + graph = tf.Graph() + with graph.as_default(): + x = tf.ones((10, 4)) + _ = keras.models.clone_model(model, input_tensors=[x]) + has_placeholder = _has_placeholder(graph) + self.assertFalse(has_placeholder) + + def test_sequential_cloning_does_not_create_unnecessary_placeholders(self): + with tf.Graph().as_default(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(4, input_shape=(4,))) + graph = tf.Graph() + with graph.as_default(): + x = tf.ones((10, 4)) + _ = keras.models.clone_model(model, input_tensors=[x]) + has_placeholder = _has_placeholder(graph) + self.assertFalse(has_placeholder) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + {"testcase_name": "clone_weights", "share_weights": False}, + {"testcase_name": "share_weights", "share_weights": True}, + ] + ) + def test_functional_cloning_with_tensor_kwarg(self, share_weights): + """Test that cloning works with models that use Tensor kwargs.""" - if tf.executing_eagerly(): - clone(input_arr) - loss = clone.losses[0] - else: - with self.session() as sess: - clone(input_arr) if share_weights: - self.skipTest('Weight sharing with inputs in call **kwargs does ' - 'not work correctly in v1') + clone_fn = functools.partial( + keras.models.clone_model, clone_function=models.share_weights + ) + else: + clone_fn = keras.models.clone_model + + class LayerWithTensorKwarg(keras.layers.Layer): + def call(self, inputs, tensor=None): + if tensor is not None: + return inputs * tf.cast(tensor, tf.float32) + else: + return inputs + + inputs = keras.layers.Input(shape=(3)) + t = tf.sequence_mask(tf.shape(inputs)[1]) + model = keras.models.Model(inputs, LayerWithTensorKwarg()(inputs, t)) + model.add_loss(tf.reduce_sum(model.outputs)) + + input_arr = np.random.random((1, 3)).astype(np.float32) + clone = clone_fn(model) + + if tf.executing_eagerly(): + clone(input_arr) + loss = clone.losses[0] else: - feed_dict = {clone.input: input_arr} - loss = sess.run(clone.losses[0], feed_dict=feed_dict) - 
self.assertAllClose(np.sum(input_arr), loss) + with self.session() as sess: + clone(input_arr) + if share_weights: + self.skipTest( + "Weight sharing with inputs in call **kwargs does " + "not work correctly in v1" + ) + else: + feed_dict = {clone.input: input_arr} + loss = sess.run(clone.losses[0], feed_dict=feed_dict) + self.assertAllClose(np.sum(input_arr), loss) def _has_placeholder(graph): - ops_types = [op.type for op in graph.get_operations()] - return any('Placeholder' in s for s in ops_types) + ops_types = [op.type for op in graph.get_operations()] + return any("Placeholder" in s for s in ops_types) class CheckpointingTests(test_combinations.TestCase): + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_optimizer_dependency(self): + model = _get_model() + opt = tf.compat.v1.train.AdamOptimizer(0.01) + model.compile( + optimizer=opt, + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_optimizer_dependency(self): - model = _get_model() - opt = tf.compat.v1.train.AdamOptimizer(.01) - model.compile( - optimizer=opt, - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - - model.fit( - x=np.array([[1., 2., 3., 4.]]), - y=np.array([[1., 1., 1., 1.]]), - epochs=2) - save_prefix = os.path.join(self.get_temp_dir(), 'ckpt') - beta1_power, _ = opt._get_beta_accumulators() - self.evaluate(beta1_power.assign(12.)) - model.save_weights(save_prefix) - self.evaluate(beta1_power.assign(13.)) - model.load_weights(save_prefix) - self.assertEqual(12., self.evaluate(beta1_power)) + model.fit( + x=np.array([[1.0, 2.0, 3.0, 4.0]]), + y=np.array([[1.0, 1.0, 1.0, 1.0]]), + epochs=2, + ) + save_prefix = os.path.join(self.get_temp_dir(), "ckpt") + beta1_power, _ = opt._get_beta_accumulators() + self.evaluate(beta1_power.assign(12.0)) + model.save_weights(save_prefix) + self.evaluate(beta1_power.assign(13.0)) + model.load_weights(save_prefix) + self.assertEqual(12.0, self.evaluate(beta1_power)) @test_combinations.run_all_keras_modes class TestModelBackend(test_combinations.TestCase): + def test_model_backend_float64_use_cases(self): + # Test case for GitHub issue 19318 + floatx = keras.backend.floatx() + keras.backend.set_floatx("float64") + + x = keras.Input((5,)) + y = keras.layers.Dense(1)(x) + model = keras.models.Model(x, y) + model.compile( + test_utils.get_v2_optimizer("rmsprop"), + "mse", + run_eagerly=test_utils.should_run_eagerly(), + ) - def test_model_backend_float64_use_cases(self): - # Test case for GitHub issue 19318 - floatx = keras.backend.floatx() - keras.backend.set_floatx('float64') - - x = keras.Input((5,)) - y = keras.layers.Dense(1)(x) - model = keras.models.Model(x, y) - model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - keras.backend.set_floatx(floatx) + keras.backend.set_floatx(floatx) class TestCloneAndBuildModel(test_combinations.TestCase): + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_clone_and_build_non_compiled_model(self): + inp = np.random.random((10, 4)) + out = np.random.random((10, 4)) + + model = _get_model() + + with self.assertRaisesRegex(ValueError, "has not been compiled"): + models.clone_and_build_model(model, compile_clone=True) + + is_subclassed = test_utils.get_model_type() == "subclass" + # With placeholder creation + new_model = models.clone_and_build_model( + model, 
compile_clone=False, in_place_reset=is_subclassed + ) + with self.assertRaisesRegex(RuntimeError, "must compile"): + new_model.evaluate(inp, out) + with self.assertRaisesRegex(RuntimeError, "must compile"): + new_model.train_on_batch(inp, out) + new_model.compile( + test_utils.get_v2_optimizer("rmsprop"), + "mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + new_model.train_on_batch(inp, out) + + # Create new tensors for inputs. + input_a = keras.Input(shape=(4,)) + new_model = models.clone_and_build_model( + model, + input_tensors=input_a, + compile_clone=False, + in_place_reset=is_subclassed, + ) + with self.assertRaisesRegex(RuntimeError, "must compile"): + new_model.evaluate(inp, out) + with self.assertRaisesRegex(RuntimeError, "must compile"): + new_model.train_on_batch(inp, out) + new_model.compile( + test_utils.get_v2_optimizer("rmsprop"), + "mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + new_model.train_on_batch(inp, out) + + def _assert_same_compile_params(self, model): + """Assert that two models have the same compile parameters.""" + + self.assertEqual("mse", model.loss) + self.assertIsInstance( + model.optimizer, + ( + optimizer_v1.RMSprop, + keras.optimizers.legacy.rmsprop.RMSprop, + ), + ) + + def _clone_and_build_test_helper(self, model, model_type): + inp = np.random.random((10, 4)) + out = np.random.random((10, 4)) + + is_subclassed = model_type == "subclass" + + # With placeholder creation + new_model = models.clone_and_build_model( + model, compile_clone=True, in_place_reset=is_subclassed + ) + + self._assert_same_compile_params(new_model) + new_model.train_on_batch(inp, out) + new_model.evaluate(inp, out) + + # Create new tensors for inputs. + input_a = keras.Input(shape=(4,), name="a") + new_model = models.clone_and_build_model( + model, + input_tensors=input_a, + compile_clone=True, + in_place_reset=is_subclassed, + ) + self._assert_same_compile_params(new_model) + new_model.train_on_batch(inp, out) + new_model.evaluate(inp, out) + + new_model = models.clone_and_build_model( + model, + input_tensors=input_a, + target_tensors=None, + compile_clone=True, + in_place_reset=is_subclassed, + ) + self._assert_same_compile_params(new_model) + new_model.train_on_batch(inp, out) + new_model.evaluate(inp, out) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_clone_and_build_compiled(self): + model = _get_model() + model.compile( + test_utils.get_v2_optimizer("rmsprop"), + "mse", + metrics=["acc", metrics.categorical_accuracy], + run_eagerly=test_utils.should_run_eagerly(), + ) + + self._clone_and_build_test_helper(model, test_utils.get_model_type()) - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_clone_and_build_non_compiled_model(self): - inp = np.random.random((10, 4)) - out = np.random.random((10, 4)) - - model = _get_model() - - with self.assertRaisesRegex(ValueError, 'has not been compiled'): - models.clone_and_build_model(model, compile_clone=True) - - is_subclassed = (test_utils.get_model_type() == 'subclass') - # With placeholder creation - new_model = models.clone_and_build_model( - model, compile_clone=False, in_place_reset=is_subclassed) - with self.assertRaisesRegex(RuntimeError, 'must compile'): - new_model.evaluate(inp, out) - with self.assertRaisesRegex(RuntimeError, 'must compile'): - new_model.train_on_batch(inp, out) - new_model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - 
new_model.train_on_batch(inp, out) - - # Create new tensors for inputs. - input_a = keras.Input(shape=(4,)) - new_model = models.clone_and_build_model( - model, - input_tensors=input_a, - compile_clone=False, - in_place_reset=is_subclassed) - with self.assertRaisesRegex(RuntimeError, 'must compile'): - new_model.evaluate(inp, out) - with self.assertRaisesRegex(RuntimeError, 'must compile'): - new_model.train_on_batch(inp, out) - new_model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - new_model.train_on_batch(inp, out) - - def _assert_same_compile_params(self, model): - """Assert that two models have the same compile parameters.""" - - self.assertEqual('mse', model.loss) - self.assertIsInstance( - model.optimizer, - (optimizer_v1.RMSprop, keras.optimizers.optimizer_v2.rmsprop.RMSprop)) - - def _clone_and_build_test_helper(self, model, model_type): - inp = np.random.random((10, 4)) - out = np.random.random((10, 4)) - - is_subclassed = (model_type == 'subclass') - - # With placeholder creation - new_model = models.clone_and_build_model( - model, compile_clone=True, in_place_reset=is_subclassed) - - self._assert_same_compile_params(new_model) - new_model.train_on_batch(inp, out) - new_model.evaluate(inp, out) - - # Create new tensors for inputs. - input_a = keras.Input(shape=(4,), name='a') - new_model = models.clone_and_build_model( - model, input_tensors=input_a, compile_clone=True, - in_place_reset=is_subclassed) - self._assert_same_compile_params(new_model) - new_model.train_on_batch(inp, out) - new_model.evaluate(inp, out) - - new_model = models.clone_and_build_model( - model, - input_tensors=input_a, - target_tensors=None, - compile_clone=True, - in_place_reset=is_subclassed) - self._assert_same_compile_params(new_model) - new_model.train_on_batch(inp, out) - new_model.evaluate(inp, out) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_clone_and_build_compiled(self): - model = _get_model() - model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - metrics=['acc', metrics.categorical_accuracy], - run_eagerly=test_utils.should_run_eagerly()) - - self._clone_and_build_test_helper(model, test_utils.get_model_type()) - - @test_combinations.run_all_keras_modes - def test_clone_and_build_sequential_without_inputs_defined(self): - model = models.Sequential(_get_layers(input_shape=None)) - model.compile( - test_utils.get_v2_optimizer('rmsprop'), - 'mse', - metrics=['acc', metrics.categorical_accuracy], - run_eagerly=test_utils.should_run_eagerly()) - self._clone_and_build_test_helper(model, 'sequential') - - inp = np.random.random((10, 4)) - out = np.random.random((10, 4)) - model.train_on_batch(inp, out) - self._clone_and_build_test_helper(model, 'sequential') - - def assert_optimizer_iterations_increases(self, optimizer): - model = _get_model() - model.compile( - optimizer, - 'mse', - metrics=['acc', metrics.categorical_accuracy], - run_eagerly=test_utils.should_run_eagerly()) - - global_step = keras.backend.variable(123, dtype=tf.int64) - clone_model = models.clone_and_build_model( - model, compile_clone=True, optimizer_iterations=global_step, - in_place_reset=(test_utils.get_model_type() == 'subclass')) - - inp = np.random.random((10, 4)) - out = np.random.random((10, 4)) - clone_model.train_on_batch(inp, out) - - self.assertEqual(backend.eval(global_step), 124) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def 
test_replace_tf_optimizer_iterations_variable(self): - if tf.executing_eagerly(): - self.skipTest('v1 optimizers not supported with eager.') - self.assert_optimizer_iterations_increases(tf.compat.v1.train.AdamOptimizer(0.01)) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_replace_keras_optimizer_iterations_variable(self): - self.assert_optimizer_iterations_increases('adam') - - def test_clone_optimizer_in_different_graph(self): - with tf.Graph().as_default(): - with self.session(): - model = test_utils.get_small_sequential_mlp(3, 4) - optimizer = keras.optimizers.optimizer_v2.adam.Adam() + @test_combinations.run_all_keras_modes + def test_clone_and_build_sequential_without_inputs_defined(self): + model = models.Sequential(_get_layers(input_shape=None)) model.compile( - optimizer, 'mse', metrics=['acc', metrics.categorical_accuracy], - ) - model.fit( - x=np.array([[1., 2., 3., 4.]]), - y=np.array([[1., 1., 1., 1.]]), - epochs=1) - optimizer_config = optimizer.get_config() - with tf.Graph().as_default(): - with self.session(): - with self.assertRaisesRegex(ValueError, 'Cannot use the given session'): - models.clone_and_build_model(model, compile_clone=True) - # The optimizer_config object allows the model to be cloned in a - # different graph. - models.clone_and_build_model(model, compile_clone=True, - optimizer_config=optimizer_config) - - -if __name__ == '__main__': - tf.test.main() + test_utils.get_v2_optimizer("rmsprop"), + "mse", + metrics=["acc", metrics.categorical_accuracy], + run_eagerly=test_utils.should_run_eagerly(), + ) + self._clone_and_build_test_helper(model, "sequential") + + inp = np.random.random((10, 4)) + out = np.random.random((10, 4)) + model.train_on_batch(inp, out) + self._clone_and_build_test_helper(model, "sequential") + + def assert_optimizer_iterations_increases(self, optimizer): + model = _get_model() + model.compile( + optimizer, + "mse", + metrics=["acc", metrics.categorical_accuracy], + run_eagerly=test_utils.should_run_eagerly(), + ) + + global_step = keras.backend.variable(123, dtype=tf.int64) + clone_model = models.clone_and_build_model( + model, + compile_clone=True, + optimizer_iterations=global_step, + in_place_reset=(test_utils.get_model_type() == "subclass"), + ) + + inp = np.random.random((10, 4)) + out = np.random.random((10, 4)) + clone_model.train_on_batch(inp, out) + + self.assertEqual(backend.eval(global_step), 124) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_replace_tf_optimizer_iterations_variable(self): + if tf.executing_eagerly(): + self.skipTest("v1 optimizers not supported with eager.") + self.assert_optimizer_iterations_increases( + tf.compat.v1.train.AdamOptimizer(0.01) + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_replace_keras_optimizer_iterations_variable(self): + self.assert_optimizer_iterations_increases("adam") + + def test_clone_optimizer_in_different_graph(self): + with tf.Graph().as_default(): + with self.session(): + model = test_utils.get_small_sequential_mlp(3, 4) + optimizer = keras.optimizers.legacy.adam.Adam() + model.compile( + optimizer, + "mse", + metrics=["acc", metrics.categorical_accuracy], + ) + model.fit( + x=np.array([[1.0, 2.0, 3.0, 4.0]]), + y=np.array([[1.0, 1.0, 1.0, 1.0]]), + epochs=1, + ) + optimizer_config = optimizer.get_config() + with tf.Graph().as_default(): + with self.session(): + with self.assertRaisesRegex( + ValueError, "Cannot use 
the given session" + ): + models.clone_and_build_model(model, compile_clone=True) + # The optimizer_config object allows the model to be cloned in a + # different graph. + models.clone_and_build_model( + model, compile_clone=True, optimizer_config=optimizer_config + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/models/sharpness_aware_minimization.py b/keras/models/sharpness_aware_minimization.py index 4e4e5233c384..543b767966ef 100644 --- a/keras/models/sharpness_aware_minimization.py +++ b/keras/models/sharpness_aware_minimization.py @@ -16,156 +16,176 @@ import copy +import tensorflow.compat.v2 as tf + from keras.engine import data_adapter from keras.layers import deserialize as deserialize_layer from keras.models import Model -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf +from keras.saving.object_registration import register_keras_serializable +from keras.saving.serialization_lib import serialize_keras_object +# isort: off from tensorflow.python.util.tf_export import keras_export -# pylint: disable=g-classes-have-attributes - -@generic_utils.register_keras_serializable() +@register_keras_serializable() @keras_export("keras.models.experimental.SharpnessAwareMinimization", v1=[]) class SharpnessAwareMinimization(Model): - """Sharpness aware minimization (SAM) training flow. - - Sharpness-aware minimization (SAM) is a technique that improves the model - generalization and provides robustness to label noise. Mini-batch splitting is - proven to improve the SAM's performance, so users can control how mini batches - are split via setting the `num_batch_splits` argument. - - Args: - model: `tf.keras.Model` instance. The inner model that does the - forward-backward pass. - rho: float, defaults to 0.05. The gradients scaling factor. - num_batch_splits: int, defaults to None. The number of mini batches to - split into from each data batch. If None, batches are not split into - sub-batches. - name: string, defaults to None. The name of the SAM model. - - Reference: - [Pierre Foret et al., 2020](https://arxiv.org/abs/2010.01412) - """ - - def __init__(self, model, rho=0.05, num_batch_splits=None, name=None): - super().__init__(name=name) - self.model = model - self.rho = rho - self.num_batch_splits = num_batch_splits - - def train_step(self, data): - """The logic of one SAM training step. + """Sharpness aware minimization (SAM) training flow. - Args: - data: A nested structure of `Tensor`s. It should be of structure - (x, y, sample_weight) or (x, y). - - Returns: - A dict mapping metric names to running average values. 
- """ - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - - if self.num_batch_splits is not None: - x_split = tf.split(x, self.num_batch_splits) - y_split = tf.split(y, self.num_batch_splits) - else: - x_split = [x] - y_split = [y] - - gradients_all_batches = [] - pred_all_batches = [] - for (x_batch, y_batch) in zip(x_split, y_split): - epsilon_w_cache = [] - with tf.GradientTape() as tape: - pred = self.model(x_batch) - loss = self.compiled_loss(y_batch, pred) - pred_all_batches.append(pred) - trainable_variables = self.model.trainable_variables - gradients = tape.gradient(loss, trainable_variables) - - gradients_order2_norm = self._gradients_order2_norm(gradients) - scale = self.rho / (gradients_order2_norm + 1e-12) - - for (gradient, variable) in zip(gradients, trainable_variables): - epsilon_w = gradient * scale - self._distributed_apply_epsilon_w(variable, epsilon_w, - tf.distribute.get_strategy()) - epsilon_w_cache.append(epsilon_w) - - with tf.GradientTape() as tape: - pred = self(x_batch) - loss = self.compiled_loss(y_batch, pred) - gradients = tape.gradient(loss, trainable_variables) - if len(gradients_all_batches) == 0: - for gradient in gradients: - gradients_all_batches.append([gradient]) - else: - for (gradient, gradient_all_batches) in zip(gradients, - gradients_all_batches): - gradient_all_batches.append(gradient) - for (variable, epsilon_w) in zip(trainable_variables, epsilon_w_cache): - # Restore the variable to its original value before `apply_gradients()`. - self._distributed_apply_epsilon_w(variable, -epsilon_w, - tf.distribute.get_strategy()) - - gradients = [] - for gradient_all_batches in gradients_all_batches: - gradients.append(tf.reduce_sum(gradient_all_batches, axis=0)) - self.optimizer.apply_gradients(zip(gradients, trainable_variables)) - - pred = tf.concat(pred_all_batches, axis=0) - self.compiled_metrics.update_state(y, pred, sample_weight) - return {m.name: m.result() for m in self.metrics} - - def call(self, inputs): - """Forward pass of SAM. - - SAM delegates the forward pass call to the wrapped model. + Sharpness-aware minimization (SAM) is a technique that improves the model + generalization and provides robustness to label noise. Mini-batch splitting + is proven to improve the SAM's performance, so users can control how mini + batches are split via setting the `num_batch_splits` argument. Args: - inputs: Tensor. The model inputs. - - Returns: - A Tensor, the outputs of the wrapped model for given `inputs`. + model: `tf.keras.Model` instance. The inner model that does the + forward-backward pass. + rho: float. The gradients scaling factor. Defaults to `0.05`. + num_batch_splits: int. The number of mini batches to + split into from each data batch. If None, batches are not split into + sub-batches. Defaults to `None`. + name: string. The name of the SAM model. Defaults to `None`. + + Reference: + [Pierre Foret et al., 2020](https://arxiv.org/abs/2010.01412) """ - return self.model(inputs) - - def get_config(self): - config = super().get_config() - config.update({ - "model": generic_utils.serialize_keras_object(self.model), - "rho": self.rho, - }) - return config - - @classmethod - def from_config(cls, config, custom_objects=None): - # Avoid mutating the input dict. 
- config = copy.deepcopy(config) - model = deserialize_layer( - config.pop("model"), custom_objects=custom_objects) - config["model"] = model - return super().from_config(config, custom_objects) - - def _distributed_apply_epsilon_w(self, var, epsilon_w, strategy): - # Helper function to apply epsilon_w on model variables. - if isinstance(tf.distribute.get_strategy(), - (tf.distribute.experimental.ParameterServerStrategy, - tf.distribute.experimental.CentralStorageStrategy)): - # Under PSS and CSS, the AggregatingVariable has to be kept in sync. - def distribute_apply(strategy, var, epsilon_w): - strategy.extended.update( - var, lambda x, y: x.assign_add(y), args=(epsilon_w,), group=False) - - tf.__internal__.distribute.interim.maybe_merge_call( - distribute_apply, tf.distribute.get_strategy(), var, epsilon_w) - else: - var.assign_add(epsilon_w) - - def _gradients_order2_norm(self, gradients): - norm = tf.norm( - tf.stack([tf.norm(grad) for grad in gradients if grad is not None])) - return norm + + def __init__(self, model, rho=0.05, num_batch_splits=None, name=None): + super().__init__(name=name) + self.model = model + self.rho = rho + self.num_batch_splits = num_batch_splits + + def train_step(self, data): + """The logic of one SAM training step. + + Args: + data: A nested structure of `Tensor`s. It should be of structure + (x, y, sample_weight) or (x, y). + + Returns: + A dict mapping metric names to running average values. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + + if self.num_batch_splits is not None: + x_split = tf.split(x, self.num_batch_splits) + y_split = tf.split(y, self.num_batch_splits) + else: + x_split = [x] + y_split = [y] + + gradients_all_batches = [] + pred_all_batches = [] + for x_batch, y_batch in zip(x_split, y_split): + epsilon_w_cache = [] + with tf.GradientTape() as tape: + pred = self.model(x_batch) + loss = self.compiled_loss(y_batch, pred) + pred_all_batches.append(pred) + trainable_variables = self.model.trainable_variables + gradients = tape.gradient(loss, trainable_variables) + + gradients_order2_norm = self._gradients_order2_norm(gradients) + scale = self.rho / (gradients_order2_norm + 1e-12) + + for gradient, variable in zip(gradients, trainable_variables): + epsilon_w = gradient * scale + self._distributed_apply_epsilon_w( + variable, epsilon_w, tf.distribute.get_strategy() + ) + epsilon_w_cache.append(epsilon_w) + + with tf.GradientTape() as tape: + pred = self(x_batch) + loss = self.compiled_loss(y_batch, pred) + gradients = tape.gradient(loss, trainable_variables) + if len(gradients_all_batches) == 0: + for gradient in gradients: + gradients_all_batches.append([gradient]) + else: + for gradient, gradient_all_batches in zip( + gradients, gradients_all_batches + ): + gradient_all_batches.append(gradient) + for variable, epsilon_w in zip( + trainable_variables, epsilon_w_cache + ): + # Restore the variable to its original value before + # `apply_gradients()`. + self._distributed_apply_epsilon_w( + variable, -epsilon_w, tf.distribute.get_strategy() + ) + + gradients = [] + for gradient_all_batches in gradients_all_batches: + gradients.append(tf.reduce_sum(gradient_all_batches, axis=0)) + self.optimizer.apply_gradients(zip(gradients, trainable_variables)) + + pred = tf.concat(pred_all_batches, axis=0) + self.compiled_metrics.update_state(y, pred, sample_weight) + return {m.name: m.result() for m in self.metrics} + + def call(self, inputs): + """Forward pass of SAM. + + SAM delegates the forward pass call to the wrapped model. 
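The key quantity in the `train_step` above is the per-variable perturbation epsilon_w = rho * g / (||g||_2 + 1e-12), where the norm is the joint 2-norm over all gradients. A standalone restatement of just that computation (the function name is illustrative):

import tensorflow as tf

def sam_perturbations(gradients, rho=0.05):
    # Joint 2-norm over all non-None gradients, as in _gradients_order2_norm.
    norm = tf.norm(tf.stack([tf.norm(g) for g in gradients if g is not None]))
    scale = rho / (norm + 1e-12)
    return [None if g is None else g * scale for g in gradients]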
+ + Args: + inputs: Tensor. The model inputs. + + Returns: + A Tensor, the outputs of the wrapped model for given `inputs`. + """ + return self.model(inputs) + + def get_config(self): + config = super().get_config() + config.update( + { + "model": serialize_keras_object(self.model), + "rho": self.rho, + } + ) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + # Avoid mutating the input dict. + config = copy.deepcopy(config) + model = deserialize_layer( + config.pop("model"), custom_objects=custom_objects + ) + config["model"] = model + return super().from_config(config, custom_objects) + + def _distributed_apply_epsilon_w(self, var, epsilon_w, strategy): + # Helper function to apply epsilon_w on model variables. + if isinstance( + tf.distribute.get_strategy(), + ( + tf.distribute.experimental.ParameterServerStrategy, + tf.distribute.experimental.CentralStorageStrategy, + ), + ): + # Under PSS and CSS, the AggregatingVariable has to be kept in sync. + def distribute_apply(strategy, var, epsilon_w): + strategy.extended.update( + var, + lambda x, y: x.assign_add(y), + args=(epsilon_w,), + group=False, + ) + + tf.__internal__.distribute.interim.maybe_merge_call( + distribute_apply, tf.distribute.get_strategy(), var, epsilon_w + ) + else: + var.assign_add(epsilon_w) + + def _gradients_order2_norm(self, gradients): + norm = tf.norm( + tf.stack([tf.norm(grad) for grad in gradients if grad is not None]) + ) + return norm diff --git a/keras/models/sharpness_aware_minimization_test.py b/keras/models/sharpness_aware_minimization_test.py index 7a0fd3760889..7571f179b5b0 100644 --- a/keras/models/sharpness_aware_minimization_test.py +++ b/keras/models/sharpness_aware_minimization_test.py @@ -2,12 +2,13 @@ import os +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.models import sharpness_aware_minimization -from keras.optimizers.optimizer_experimental import adam +from keras.optimizers import adam from keras.testing_infra import test_utils -import tensorflow.compat.v2 as tf ds_combinations = tf.__internal__.distribute.combinations @@ -24,107 +25,138 @@ @test_utils.run_v2_only class SharpnessAwareMinimizationTest(tf.test.TestCase, parameterized.TestCase): - - def test_sam_model_call(self): - model = keras.Sequential([ - keras.Input([2, 2]), - keras.layers.Dense(4), - ]) - sam_model = sharpness_aware_minimization.SharpnessAwareMinimization(model) - data = tf.random.uniform([2, 2]) - self.assertAllClose(model(data), sam_model(data)) - - @ds_combinations.generate( - tf.__internal__.test.combinations.combine(strategy=STRATEGIES)) - def test_sam_model_fit(self, strategy): - with strategy.scope(): - model = keras.Sequential([ - keras.Input([2, 2]), - keras.layers.Dense(4), - keras.layers.Dense(1), - ]) - sam_model = sharpness_aware_minimization.SharpnessAwareMinimization(model) - data = tf.random.uniform([2, 2]) - label = data[:, 0] > 0.5 - - sam_model.compile( - optimizer=adam.Adam(), - loss=keras.losses.BinaryCrossentropy(from_logits=True), - ) - - sam_model.fit(data, label, steps_per_epoch=1) - - @ds_combinations.generate( - tf.__internal__.test.combinations.combine(strategy=STRATEGIES)) - def test_sam_model_fit_with_sub_batch(self, strategy): - with strategy.scope(): - model = keras.Sequential([ - keras.Input([2, 2]), - keras.layers.Dense(4), - keras.layers.Dense(1), - ]) - sam_model = sharpness_aware_minimization.SharpnessAwareMinimization( - model, num_batch_splits=4) - data = tf.random.uniform([48, 2]) - label = 
data[:, 0] > 0.5 - - sam_model.compile( - optimizer=adam.Adam(), - loss=keras.losses.BinaryCrossentropy(from_logits=True), - ) - - sam_model.fit(data, label, steps_per_epoch=1) - - def test_save_sam(self): - model = keras.Sequential([ - keras.Input([2, 2]), - keras.layers.Dense(4), - keras.layers.Dense(1), - ]) - sam_model = sharpness_aware_minimization.SharpnessAwareMinimization(model) - data = tf.random.uniform([1, 2, 2]) - label = data[:, 0] > 0.5 - - sam_model.compile( - optimizer=adam.Adam(), - loss=keras.losses.BinaryCrossentropy(from_logits=True), + def test_sam_model_call(self): + model = keras.Sequential( + [ + keras.Input([2, 2]), + keras.layers.Dense(4), + ] + ) + sam_model = sharpness_aware_minimization.SharpnessAwareMinimization( + model + ) + data = tf.random.uniform([2, 2]) + self.assertAllClose(model(data), sam_model(data)) + + @ds_combinations.generate( + tf.__internal__.test.combinations.combine(strategy=STRATEGIES) ) - - sam_model.fit(data, label) - - path = os.path.join(self.get_temp_dir(), "model") - sam_model.save(path) - loaded_sam_model = keras.models.load_model(path) - loaded_sam_model.load_weights(path) - - self.assertAllClose(sam_model(data), loaded_sam_model(data)) - - def test_checkpoint_sam(self): - model = keras.Sequential([ - keras.Input([2, 2]), - keras.layers.Dense(4), - keras.layers.Dense(1), - ]) - sam_model_1 = sharpness_aware_minimization.SharpnessAwareMinimization(model) - sam_model_2 = sharpness_aware_minimization.SharpnessAwareMinimization(model) - data = tf.random.uniform([1, 2, 2]) - label = data[:, 0] > 0.5 - - sam_model_1.compile( - optimizer=adam.Adam(), - loss=keras.losses.BinaryCrossentropy(from_logits=True), + def test_sam_model_fit(self, strategy): + with strategy.scope(): + model = keras.Sequential( + [ + keras.Input([2, 2]), + keras.layers.Dense(4), + keras.layers.Dense(1), + ] + ) + sam_model = sharpness_aware_minimization.SharpnessAwareMinimization( + model + ) + data = tf.random.uniform([2, 2]) + label = data[:, 0] > 0.5 + + sam_model.compile( + optimizer=adam.Adam(), + loss=keras.losses.BinaryCrossentropy(from_logits=True), + ) + + sam_model.fit(data, label, steps_per_epoch=1) + + @ds_combinations.generate( + tf.__internal__.test.combinations.combine(strategy=STRATEGIES) ) - - sam_model_1.fit(data, label) - - checkpoint = tf.train.Checkpoint(sam_model_1) - checkpoint2 = tf.train.Checkpoint(sam_model_2) - temp_dir = self.get_temp_dir() - save_path = checkpoint.save(temp_dir) - checkpoint2.restore(save_path) - - self.assertAllClose(sam_model_1(data), sam_model_2(data)) + def test_sam_model_fit_with_sub_batch(self, strategy): + with strategy.scope(): + model = keras.Sequential( + [ + keras.Input([2, 2]), + keras.layers.Dense(4), + keras.layers.Dense(1), + ] + ) + sam_model = sharpness_aware_minimization.SharpnessAwareMinimization( + model, num_batch_splits=4 + ) + data = tf.random.uniform([48, 2]) + label = data[:, 0] > 0.5 + + sam_model.compile( + optimizer=adam.Adam(), + loss=keras.losses.BinaryCrossentropy(from_logits=True), + ) + + sam_model.fit(data, label, steps_per_epoch=1) + + def test_save_sam(self): + model = keras.Sequential( + [ + keras.Input([2, 2]), + keras.layers.Dense(4), + keras.layers.Dense(1), + ] + ) + sam_model = sharpness_aware_minimization.SharpnessAwareMinimization( + model + ) + data = tf.random.uniform([1, 2, 2]) + label = data[:, 0] > 0.5 + + sam_model.compile( + optimizer=adam.Adam(), + loss=keras.losses.BinaryCrossentropy(from_logits=True), + ) + + sam_model.fit(data, label) + + with 
self.subTest("savedmodel"): + path = os.path.join(self.get_temp_dir(), "model") + sam_model.save(path) + loaded_sam_model = keras.models.load_model(path) + loaded_sam_model.load_weights(path) + + self.assertAllClose(sam_model(data), loaded_sam_model(data)) + + with self.subTest("keras_v3"): + path = os.path.join(self.get_temp_dir(), "model.keras") + sam_model.save(path) + loaded_sam_model = keras.models.load_model(path) + loaded_sam_model.load_weights(path) + + self.assertAllClose(sam_model(data), loaded_sam_model(data)) + + def test_checkpoint_sam(self): + model = keras.Sequential( + [ + keras.Input([2, 2]), + keras.layers.Dense(4), + keras.layers.Dense(1), + ] + ) + sam_model_1 = sharpness_aware_minimization.SharpnessAwareMinimization( + model + ) + sam_model_2 = sharpness_aware_minimization.SharpnessAwareMinimization( + model + ) + data = tf.random.uniform([1, 2, 2]) + label = data[:, 0] > 0.5 + + sam_model_1.compile( + optimizer=adam.Adam(), + loss=keras.losses.BinaryCrossentropy(from_logits=True), + ) + + sam_model_1.fit(data, label) + + checkpoint = tf.train.Checkpoint(sam_model_1) + checkpoint2 = tf.train.Checkpoint(sam_model_2) + temp_dir = self.get_temp_dir() + save_path = checkpoint.save(temp_dir) + checkpoint2.restore(save_path) + + self.assertAllClose(sam_model_1(data), sam_model_2(data)) if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/optimizers/BUILD b/keras/optimizers/BUILD index e9fea1d46c55..f496373fefd2 100644 --- a/keras/optimizers/BUILD +++ b/keras/optimizers/BUILD @@ -1,17 +1,22 @@ # Description: # Contains the Keras Optimizer API. +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") # buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "tf_py_test") +load("@org_keras//keras:keras.bzl", "distribute_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/python:__pkg__", "//third_party/tensorflow/python/distribute:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/saved_model:__pkg__", # For unit tests. + "//third_party/tensorflow/python/tpu/tests:__pkg__", # For unit tests. 
+ "//third_party/tensorflow/python/trackable:__pkg__", ], licenses = ["notice"], ) @@ -20,19 +25,41 @@ py_library( name = "optimizers", srcs = [ "__init__.py", + "adadelta.py", + "adafactor.py", + "adagrad.py", + "adam.py", + "adamax.py", + "adamw.py", + "ftrl.py", + "lion.py", + "nadam.py", + "optimizer.py", "optimizer_v1.py", + "rmsprop.py", + "sgd.py", ], srcs_version = "PY3", deps = [ + ":utils", + "//:expect_tensorflow_installed", "//keras:backend", - "//keras/optimizers/legacy:optimizer", - "//keras/optimizers/optimizer_experimental:optimizer", - "//keras/optimizers/optimizer_v2", + "//keras/dtensor:utils", + "//keras/optimizers/legacy:optimizers", "//keras/optimizers/schedules:learning_rate_schedule", "//keras/utils:engine_utils", ], ) +py_library( + name = "utils", + srcs = ["utils.py"], + srcs_version = "PY3", + deps = [ + "//:expect_tensorflow_installed", + ], +) + py_library( name = "legacy_learning_rate_decay", srcs = ["legacy_learning_rate_decay.py"], @@ -44,9 +71,9 @@ py_library( ) tf_py_test( - name = "optimizers_test", + name = "optimizer_v1_test", size = "medium", - srcs = ["optimizers_test.py"], + srcs = ["optimizer_v1_test.py"], python_version = "PY3", shard_count = 8, tags = ["notsan"], @@ -70,3 +97,55 @@ cuda_py_test( "//keras/testing_infra:test_combinations", ], ) + +# TODO(b/228209527): Combine this test with optimizer_test after +# fixing the NCCL issue. +distribute_py_test( + name = "optimizer_pss_test", + size = "medium", + srcs = ["optimizer_pss_test.py"], + shard_count = 32, + tags = [ + "multi_gpu", + "no_oss", + "no_windows", + ], + deps = [ + ":optimizers", + "//:expect_absl_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/testing_infra:test_combinations", + ], +) + +distribute_py_test( + name = "optimizer_test", + size = "medium", + srcs = ["optimizer_test.py"], + shard_count = 16, + tags = [ + "multi_gpu", + "no_windows", + "nomultivm", # TODO(b/203558991): Re-enable. + ], + deps = [ + ":optimizers", + "//:expect_absl_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/testing_infra:test_combinations", + ], +) + +cuda_py_test( + name = "lion_test", + size = "medium", + srcs = ["lion_test.py"], + shard_count = 4, + deps = [ + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + ], +) diff --git a/keras/optimizers/__init__.py b/keras/optimizers/__init__.py index eb4642e65090..39a02669950b 100644 --- a/keras/optimizers/__init__.py +++ b/keras/optimizers/__init__.py @@ -12,161 +12,318 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=g-bad-import-order """Built-in optimizer classes. For more examples see the base class `tf.keras.optimizers.Optimizer`. """ -import tensorflow.compat.v2 as tf +# Imports needed for deserialization. -# Symbols to be accessed under keras.optimizers. To be replaced with -# optimizers v2022 when they graduate out of experimental. 
-from keras.optimizers.optimizer_v2.gradient_descent import SGD -from keras.optimizers.optimizer_v2.rmsprop import RMSprop -from keras.optimizers.optimizer_v2.adam import Adam -from keras.optimizers.optimizer_v2.adadelta import Adadelta -from keras.optimizers.optimizer_v2.adagrad import Adagrad -from keras.optimizers.optimizer_v2.adamax import Adamax -from keras.optimizers.optimizer_v2.nadam import Nadam -from keras.optimizers.optimizer_v2.ftrl import Ftrl +import platform +import warnings + +import tensorflow.compat.v2 as tf +from absl import logging -# Imports needed for deserialization. from keras import backend -from keras.optimizers.optimizer_experimental import optimizer as optimizer_experimental -from keras.optimizers.optimizer_experimental import adadelta as adadelta_experimental -from keras.optimizers.optimizer_experimental import adagrad as adagrad_experimental -from keras.optimizers.optimizer_experimental import adam as adam_experimental -from keras.optimizers.optimizer_experimental import adamax as adamax_experimental -from keras.optimizers.optimizer_experimental import adamw as adamw_experimental -from keras.optimizers.optimizer_experimental import ftrl as ftrl_experimental -from keras.optimizers.optimizer_experimental import nadam as nadam_experimental -from keras.optimizers.optimizer_experimental import rmsprop as rmsprop_experimental -from keras.optimizers.optimizer_experimental import sgd as sgd_experimental -from keras.optimizers.legacy import optimizer as optimizer_legacy +from keras.optimizers import adadelta +from keras.optimizers import adafactor +from keras.optimizers import adagrad +from keras.optimizers import adam +from keras.optimizers import adamax +from keras.optimizers import adamw +from keras.optimizers import ftrl +from keras.optimizers import lion +from keras.optimizers import nadam +from keras.optimizers import optimizer as base_optimizer +from keras.optimizers import rmsprop +from keras.optimizers import sgd from keras.optimizers.legacy import adadelta as adadelta_legacy from keras.optimizers.legacy import adagrad as adagrad_legacy from keras.optimizers.legacy import adam as adam_legacy from keras.optimizers.legacy import adamax as adamax_legacy from keras.optimizers.legacy import ftrl as ftrl_legacy +from keras.optimizers.legacy import gradient_descent as gradient_descent_legacy from keras.optimizers.legacy import nadam as nadam_legacy +from keras.optimizers.legacy import optimizer_v2 as base_optimizer_legacy from keras.optimizers.legacy import rmsprop as rmsprop_legacy -from keras.optimizers.legacy import sgd as sgd_legacy +from keras.optimizers.legacy.adadelta import Adadelta +from keras.optimizers.legacy.adagrad import Adagrad +from keras.optimizers.legacy.adam import Adam +from keras.optimizers.legacy.adamax import Adamax +from keras.optimizers.legacy.ftrl import Ftrl + +# Symbols to be accessed under keras.optimizers. To be replaced with +# optimizers v2022 when they graduate out of experimental. 
+from keras.optimizers.legacy.gradient_descent import SGD +from keras.optimizers.legacy.nadam import Nadam +from keras.optimizers.legacy.rmsprop import RMSprop from keras.optimizers.optimizer_v1 import Optimizer from keras.optimizers.optimizer_v1 import TFOptimizer -from keras.optimizers.optimizer_v2 import adadelta as adadelta_v2 -from keras.optimizers.optimizer_v2 import adagrad as adagrad_v2 -from keras.optimizers.optimizer_v2 import adam as adam_v2 -from keras.optimizers.optimizer_v2 import adamax as adamax_v2 -from keras.optimizers.optimizer_v2 import ftrl -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_v2 -from keras.optimizers.optimizer_v2 import nadam as nadam_v2 -from keras.optimizers.optimizer_v2 import optimizer_v2 as base_optimizer_v2 -from keras.optimizers.optimizer_v2 import rmsprop as rmsprop_v2 -from keras.utils.generic_utils import deserialize_keras_object -from keras.utils.generic_utils import serialize_keras_object +from keras.optimizers.schedules import learning_rate_schedule +from keras.saving.legacy import serialization as legacy_serialization +from keras.saving.serialization_lib import deserialize_keras_object +from keras.saving.serialization_lib import serialize_keras_object + +# isort: off from tensorflow.python.util.tf_export import keras_export +# pylint: disable=line-too-long + + +@keras_export("keras.optimizers.serialize") +def serialize(optimizer, use_legacy_format=False): + """Serialize the optimizer configuration to a JSON-compatible python dict. + + The configuration can be used for persistence and to reconstruct the + `Optimizer` instance again. + + >>> tf.keras.optimizers.serialize(tf.keras.optimizers.legacy.SGD()) + {'module': 'keras.optimizers.legacy', 'class_name': 'SGD', 'config': {'name': 'SGD', 'learning_rate': 0.01, 'decay': 0.0, 'momentum': 0.0, 'nesterov': False}, 'registered_name': None} + + Args: + optimizer: An `Optimizer` instance to serialize. + use_legacy_format: Boolean, whether to serialize with the legacy + serialization format. Defaults to `False`. + + Returns: + Python dict which contains the configuration of the input optimizer. + """ # noqa: E501 + if optimizer is None: + return None + if not isinstance( + optimizer, + ( + base_optimizer.Optimizer, + Optimizer, + base_optimizer_legacy.OptimizerV2, + ), + ): + warnings.warn( + "The `keras.optimizers.serialize()` API should only be used for " + "objects of type `keras.optimizers.Optimizer`. Found an instance " + f"of type {type(optimizer)}, which may lead to improper " + "serialization." + ) + if use_legacy_format: + return legacy_serialization.serialize_keras_object(optimizer) + return serialize_keras_object(optimizer) + + +def is_arm_mac(): + return platform.system() == "Darwin" and platform.processor() == "arm" + -@keras_export('keras.optimizers.serialize') -def serialize(optimizer): - """Serialize the optimizer configuration to JSON compatible python dict. - - The configuration can be used for persistence and reconstruct the `Optimizer` - instance again. - - >>> tf.keras.optimizers.serialize(tf.keras.optimizers.SGD()) - {'class_name': 'SGD', 'config': {'name': 'SGD', 'learning_rate': 0.01, - 'decay': 0.0, 'momentum': 0.0, - 'nesterov': False}} - - Args: - optimizer: An `Optimizer` instance to serialize. - - Returns: - Python dict which contains the configuration of the input optimizer. - """ - return serialize_keras_object(optimizer) - - -@keras_export('keras.optimizers.deserialize') -def deserialize(config, custom_objects=None): - """Inverse of the `serialize` function. - - Args: - config: Optimizer configuration dictionary.
- custom_objects: Optional dictionary mapping names (strings) to custom - objects (classes and functions) to be considered during deserialization. - - Returns: - A Keras Optimizer instance. - """ - # loss_scale_optimizer has a direct dependency of optimizer, import here - # rather than top to avoid the cyclic dependency. - from keras.mixed_precision import loss_scale_optimizer # pylint: disable=g-import-not-at-top - all_classes = { - 'adadelta': adadelta_v2.Adadelta, - 'adagrad': adagrad_v2.Adagrad, - 'adam': adam_v2.Adam, - 'adamax': adamax_v2.Adamax, - 'experimentaladadelta': adadelta_experimental.Adadelta, - 'experimentaladagrad': adagrad_experimental.Adagrad, - 'experimentaladam': adam_experimental.Adam, - 'experimentalsgd': sgd_experimental.SGD, - 'nadam': nadam_v2.Nadam, - 'rmsprop': rmsprop_v2.RMSprop, - 'sgd': gradient_descent_v2.SGD, - 'ftrl': ftrl.Ftrl, - 'lossscaleoptimizer': loss_scale_optimizer.LossScaleOptimizer, - 'lossscaleoptimizerv3': loss_scale_optimizer.LossScaleOptimizerV3, - # LossScaleOptimizerV1 was an old version of LSO that was removed. - # Deserializing it turns it into a LossScaleOptimizer - 'lossscaleoptimizerv1': loss_scale_optimizer.LossScaleOptimizer, - } - - # Make deserialization case-insensitive for built-in optimizers. - if config['class_name'].lower() in all_classes: - config['class_name'] = config['class_name'].lower() - return deserialize_keras_object( - config, - module_objects=all_classes, - custom_objects=custom_objects, - printable_module_name='optimizer') - - -@keras_export('keras.optimizers.get') -def get(identifier): - """Retrieves a Keras Optimizer instance. - - Args: - identifier: Optimizer identifier, one of - - String: name of an optimizer +@keras_export("keras.optimizers.deserialize") +def deserialize(config, custom_objects=None, use_legacy_format=False, **kwargs): + """Inverse of the `serialize` function. + + Args: + config: Optimizer configuration dictionary. + custom_objects: Optional dictionary mapping names (strings) to custom + objects (classes and functions) to be considered during + deserialization. + use_legacy_format: Boolean, whether the config uses the legacy + serialization format. Defaults to `False`. + + Returns: + A Keras Optimizer instance. + """ + # loss_scale_optimizer has a direct dependency on optimizer, import here + # rather than at the top to avoid the cyclic dependency. + from keras.mixed_precision import ( + loss_scale_optimizer, + ) + + use_legacy_optimizer = kwargs.pop("use_legacy_optimizer", False) + if kwargs: + raise TypeError(f"Invalid keyword arguments: {kwargs}") + if len(config["config"]) > 0: + # If the optimizer config is not empty, then we use the value of + # `is_legacy_optimizer` to override `use_legacy_optimizer`. If + # `is_legacy_optimizer` does not exist in config, it means we are + # using the legacy optimizer. + use_legacy_optimizer = config["config"].get("is_legacy_optimizer", True) + if ( + tf.__internal__.tf2.enabled() + and tf.executing_eagerly() + and not is_arm_mac() + and not use_legacy_optimizer + ): + # We observed a slowdown of optimizer on M1 Mac, so we fall back to the + # legacy optimizer for M1 users now, see b/263339144 for more context.
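A hedged round-trip sketch of the `serialize`/`deserialize` pair above; which of the two class tables below is consulted depends on this TF2/eager/ARM branch:

import tensorflow as tf

opt = tf.keras.optimizers.Adam(learning_rate=0.01)
config = tf.keras.optimizers.serialize(opt)
# On TF2 with eager execution (and not on an ARM Mac), this restores the
# new optimizer class; otherwise the legacy implementation is substituted.
restored = tf.keras.optimizers.deserialize(config)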
+ all_classes = { + "adadelta": adadelta.Adadelta, + "adagrad": adagrad.Adagrad, + "adam": adam.Adam, + "adamax": adamax.Adamax, + "experimentaladadelta": adadelta.Adadelta, + "experimentaladagrad": adagrad.Adagrad, + "experimentaladam": adam.Adam, + "experimentalsgd": sgd.SGD, + "nadam": nadam.Nadam, + "rmsprop": rmsprop.RMSprop, + "sgd": sgd.SGD, + "ftrl": ftrl.Ftrl, + "lossscaleoptimizer": loss_scale_optimizer.LossScaleOptimizerV3, + "lossscaleoptimizerv3": loss_scale_optimizer.LossScaleOptimizerV3, + # LossScaleOptimizerV1 was an old version of LSO that was removed. + # Deserializing it turns it into a LossScaleOptimizer + "lossscaleoptimizerv1": loss_scale_optimizer.LossScaleOptimizer, + } + else: + all_classes = { + "adadelta": adadelta_legacy.Adadelta, + "adagrad": adagrad_legacy.Adagrad, + "adam": adam_legacy.Adam, + "adamax": adamax_legacy.Adamax, + "experimentaladadelta": adadelta.Adadelta, + "experimentaladagrad": adagrad.Adagrad, + "experimentaladam": adam.Adam, + "experimentalsgd": sgd.SGD, + "nadam": nadam_legacy.Nadam, + "rmsprop": rmsprop_legacy.RMSprop, + "sgd": gradient_descent_legacy.SGD, + "ftrl": ftrl_legacy.Ftrl, + "lossscaleoptimizer": loss_scale_optimizer.LossScaleOptimizer, + "lossscaleoptimizerv3": loss_scale_optimizer.LossScaleOptimizerV3, + # LossScaleOptimizerV1 was an old version of LSO that was removed. + # Deserializing it turns it into a LossScaleOptimizer + "lossscaleoptimizerv1": loss_scale_optimizer.LossScaleOptimizer, + } + + # Make deserialization case-insensitive for built-in optimizers. + if config["class_name"].lower() in all_classes: + config["class_name"] = config["class_name"].lower() + + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + config, + module_objects=all_classes, + custom_objects=custom_objects, + printable_module_name="optimizer", + ) + + return deserialize_keras_object( + config, + module_objects=all_classes, + custom_objects=custom_objects, + printable_module_name="optimizer", + ) + + +@keras_export( + "keras.__internal__.optimizers.convert_to_legacy_optimizer", v1=[] +) +def convert_to_legacy_optimizer(optimizer): + """Convert experimental optimizer to legacy optimizer. + + This function takes in a `keras.optimizers.Optimizer` + instance and converts it to the corresponding + `keras.optimizers.legacy.Optimizer` instance. + For example, `keras.optimizers.Adam(...)` to + `keras.optimizers.legacy.Adam(...)`. + + Args: + optimizer: An instance of `keras.optimizers.Optimizer`. + """ + # loss_scale_optimizer has a direct dependency of optimizer, import here + # rather than top to avoid the cyclic dependency. + from keras.mixed_precision import ( + loss_scale_optimizer, + ) + + if not isinstance(optimizer, base_optimizer.Optimizer): + raise ValueError( + "`convert_to_legacy_optimizer` should only be called " + "on instances of `tf.keras.optimizers.Optimizer`, but " + f"received {optimizer} of type {type(optimizer)}." + ) + optimizer_name = optimizer.__class__.__name__.lower() + config = optimizer.get_config() + # Remove fields that only exist in experimental optimizer. 
+ keys_to_remove = [ + "weight_decay", + "use_ema", + "ema_momentum", + "ema_overwrite_frequency", + "jit_compile", + "is_legacy_optimizer", + ] + for key in keys_to_remove: + config.pop(key, None) + + if isinstance(optimizer, loss_scale_optimizer.LossScaleOptimizerV3): + # For LossScaleOptimizers, recursively convert the inner optimizer + config["inner_optimizer"] = convert_to_legacy_optimizer( + optimizer.inner_optimizer + ) + if optimizer_name == "lossscaleoptimizerv3": + optimizer_name = "lossscaleoptimizer" + + # Learning rate can be a custom LearningRateSchedule, which is stored as + # a dict in config, and cannot be deserialized. + if hasattr(optimizer, "_learning_rate") and isinstance( + optimizer._learning_rate, learning_rate_schedule.LearningRateSchedule + ): + config["learning_rate"] = optimizer._learning_rate + legacy_optimizer_config = { + "class_name": optimizer_name, + "config": config, + } + return deserialize(legacy_optimizer_config, use_legacy_optimizer=True) + + +@keras_export("keras.optimizers.get") +def get(identifier, **kwargs): + """Retrieves a Keras Optimizer instance. + + Args: + identifier: Optimizer identifier, one of - String: name of an optimizer - Dictionary: configuration dictionary. - Keras Optimizer instance (it - will be returned unchanged). - TensorFlow Optimizer instance (it - will be wrapped as a Keras Optimizer). - - Returns: - A Keras Optimizer instance. - - Raises: - ValueError: If `identifier` cannot be interpreted. - """ - if isinstance( - identifier, - (Optimizer, base_optimizer_v2.OptimizerV2, - optimizer_experimental.Optimizer)): - return identifier - # Wrap legacy TF optimizer instances - elif isinstance(identifier, tf.compat.v1.train.Optimizer): - opt = TFOptimizer(identifier) - backend.track_tf_optimizer(opt) - return opt - elif isinstance(identifier, dict): - return deserialize(identifier) - elif isinstance(identifier, str): - config = {'class_name': str(identifier), 'config': {}} - return deserialize(config) - else: - raise ValueError( - 'Could not interpret optimizer identifier: {}'.format(identifier)) + will be returned unchanged). - TensorFlow Optimizer instance (it will + be wrapped as a Keras Optimizer). + + Returns: + A Keras Optimizer instance. + + Raises: + ValueError: If `identifier` cannot be interpreted. + """ + use_legacy_optimizer = kwargs.pop("use_legacy_optimizer", False) + if kwargs: + raise TypeError(f"Invalid keyword arguments: {kwargs}") + if isinstance( + identifier, + ( + Optimizer, + base_optimizer_legacy.OptimizerV2, + ), + ): + return identifier + elif isinstance(identifier, base_optimizer.Optimizer): + if tf.__internal__.tf2.enabled(): + return identifier + else: + # If TF2 is disabled, we convert to the legacy + # optimizer. 
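For reference, a sketch of the conversion helper defined above in use, assuming the `tf.keras.__internal__` export path from its decorator; `use_ema` is among the experimental-only fields stripped via `keys_to_remove`:

import tensorflow as tf

new_opt = tf.keras.optimizers.Adam(learning_rate=0.001, use_ema=True)
legacy_opt = tf.keras.__internal__.optimizers.convert_to_legacy_optimizer(
    new_opt
)
# `use_ema` has no legacy counterpart, so it was dropped from the config
# before the legacy class was rebuilt.
print(type(legacy_opt))  # <class 'keras.optimizers.legacy.adam.Adam'>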
+ return convert_to_legacy_optimizer(identifier) + + # Wrap legacy TF optimizer instances + elif isinstance(identifier, tf.compat.v1.train.Optimizer): + opt = TFOptimizer(identifier) + backend.track_tf_optimizer(opt) + return opt + elif isinstance(identifier, dict): + use_legacy_format = "module" not in identifier + return deserialize( + identifier, + use_legacy_optimizer=use_legacy_optimizer, + use_legacy_format=use_legacy_format, + ) + elif isinstance(identifier, str): + config = {"class_name": str(identifier), "config": {}} + return get( + config, + use_legacy_optimizer=use_legacy_optimizer, + ) + else: + raise ValueError( + f"Could not interpret optimizer identifier: {identifier}" + ) diff --git a/keras/optimizers/adadelta.py b/keras/optimizers/adadelta.py new file mode 100644 index 000000000000..a82eb5cdface --- /dev/null +++ b/keras/optimizers/adadelta.py @@ -0,0 +1,171 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Adadelta optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.experimental.Adadelta", + "keras.optimizers.Adadelta", + "keras.dtensor.experimental.optimizers.Adadelta", + v1=[], +) +class Adadelta(optimizer.Optimizer): + r"""Optimizer that implements the Adadelta algorithm. + + Adadelta optimization is a stochastic gradient descent method that is based + on adaptive learning rate per dimension to address two drawbacks: + + - The continual decay of learning rates throughout training. + - The need for a manually selected global learning rate. + + Adadelta is a more robust extension of Adagrad that adapts learning rates + based on a moving window of gradient updates, instead of accumulating all + past gradients. This way, Adadelta continues learning even when many updates + have been done. Compared to Adagrad, in the original version of Adadelta you + don't have to set an initial learning rate. In this version, the initial + learning rate can be set, as in most other Keras optimizers. + + Args: + learning_rate: Initial value for the learning rate: either a floating + point value, or a + `tf.keras.optimizers.schedules.LearningRateSchedule` instance. + Defaults to 0.001. Note that `Adadelta` tends to benefit from + higher initial learning rate values compared to other optimizers. To + match the exact form in the original paper, use 1.0. + rho: A `Tensor` or a floating point value. The decay rate. Defaults to + 0.95. + epsilon: Small floating point value used to maintain numerical + stability. Defaults to 1e-7. 
+ {{base_optimizer_keyword_args}} + + Reference: + - [Zeiler, 2012](http://arxiv.org/abs/1212.5701) + """ + + def __init__( + self, + learning_rate=0.001, + rho=0.95, + epsilon=1e-7, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="Adadelta", + **kwargs + ): + super().__init__( + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + name=name, + **kwargs + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.rho = rho + self.epsilon = epsilon + + def build(self, var_list): + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._built = True + self._accumulated_grads = [] + self._accumulated_delta_vars = [] + for var in var_list: + self._accumulated_grads.append( + self.add_variable_from_reference(var, "accumulated_grad") + ) + self._accumulated_delta_vars.append( + self.add_variable_from_reference(var, "accumulated_delta_var") + ) + + def update_step(self, grad, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + + var_key = self._var_key(variable) + rho = self.rho + accumulated_grad = self._accumulated_grads[self._index_dict[var_key]] + accumulated_delta_var = self._accumulated_delta_vars[ + self._index_dict[var_key] + ] + + def rms(x): + return tf.sqrt(x + self.epsilon) + + if isinstance(grad, tf.IndexedSlices): + # Sparse gradients. + accumulated_grad.assign_add((rho - 1) * accumulated_grad) + accumulated_grad.scatter_add( + tf.IndexedSlices( + (1 - rho) * tf.square(grad.values), grad.indices + ) + ) + delta_var = ( + -rms(accumulated_delta_var) * grad / rms(accumulated_grad) + ) + accumulated_delta_var.assign( + rho * accumulated_delta_var + (1 - rho) * delta_var * delta_var + ) + else: + # Dense gradients. + accumulated_grad.assign( + rho * accumulated_grad + (1 - rho) * grad * grad + ) + delta_var = ( + -rms(accumulated_delta_var) * grad / rms(accumulated_grad) + ) + accumulated_delta_var.assign( + rho * accumulated_delta_var + (1 - rho) * delta_var * delta_var + ) + variable.assign_add(lr * delta_var) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "rho": self.rho, + "epsilon": self.epsilon, + } + ) + return config + + +Adadelta.__doc__ = Adadelta.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/adafactor.py b/keras/optimizers/adafactor.py new file mode 100644 index 000000000000..fb93bdac3710 --- /dev/null +++ b/keras/optimizers/adafactor.py @@ -0,0 +1,231 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Adafactor optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.optimizers.schedules import learning_rate_schedule +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.Adafactor", + "keras.optimizers.experimental.Adafactor", + v1=[], +) +class Adafactor(optimizer.Optimizer): + """Optimizer that implements the Adafactor algorithm. + + Adafactor is commonly used in NLP tasks, and has the advantage + of requiring less memory because it only stores partial information about + previous gradients. + + The default argument setup is based on the original paper (see reference). + When gradients are of dimension > 2, the Adafactor optimizer factors its + accumulator variables over the last 2 dimensions, reducing over each of + them separately. + + Args: + learning_rate: Initial value for the learning rate: + either a floating point value, + or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. + Defaults to 0.001. + beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`. + epsilon_1: float, defaults to 1e-30. A small offset to keep the + denominator away from 0. + epsilon_2: float, defaults to 1e-3. A small offset to avoid the learning + rate becoming too small over time. + clip_threshold: float, defaults to 1.0. Clipping threshold. This is part + of the Adafactor algorithm, independent from `clipnorm`, + `clipvalue` and `global_clipnorm`. + relative_step: bool, defaults to True. If `learning_rate` is a + constant and `relative_step=True`, the learning rate will be adjusted + based on the current iteration. This is the default learning rate decay + in Adafactor. + {{base_optimizer_keyword_args}} + + Reference: + - [Shazeer, Noam et al., 2018](https://arxiv.org/abs/1804.04235). + + """ + + def __init__( + self, + learning_rate=0.001, + beta_2_decay=-0.8, + epsilon_1=1e-30, + epsilon_2=1e-3, + clip_threshold=1.0, + relative_step=True, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="Adafactor", + **kwargs, + ): + super().__init__( + name=name, + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + **kwargs, + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.beta_2_decay = beta_2_decay + self.epsilon_1 = epsilon_1 + self.epsilon_2 = epsilon_2 + self.clip_threshold = clip_threshold + self.relative_step = relative_step + + def build(self, var_list): + """Initialize optimizer variables. + + Adafactor optimizer has 3 types of variables per model variable: + factored row accumulators (`r`), factored column accumulators (`c`), + and second-moment estimates (`v`). + + Args: + var_list: list of model variables to build Adafactor variables on. + """ + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._built = True + self._r = [] + self._c = [] + self._v = [] + for var in var_list: + if len(var.shape) < 2: + # Don't factor if variable is of dimension < 2, but we still + # need to create dummy variables as placeholders (the factored + # shapes are sketched in the aside below).
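An aside on the factored shapes used in `build`: for a variable of shape (d1, ..., dn), `r` reduces away the last axis and `c` the second-to-last, so the full second-moment estimate can be reconstructed from their outer product. A toy check of those shapes (values are illustrative):

import tensorflow as tf

var = tf.zeros([6, 4, 3])
grad_sq = tf.square(tf.random.normal(var.shape))

r_shape = var.shape[:-1]                   # (6, 4): last axis reduced away
c_shape = var.shape[:-2] + var.shape[-1:]  # (6, 3): second-to-last reduced away

r = tf.reduce_mean(grad_sq, axis=-1)
c = tf.reduce_mean(grad_sq, axis=-2)
assert r.shape == r_shape and c.shape == c_shape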
+                self._r.append(tf.Variable(0, name=f"r/{var._shared_name}"))
+                self._c.append(tf.Variable(0, name=f"c/{var._shared_name}"))
+            else:
+                # Always factor the last 2 dimensions.
+                r_shape = var.shape[:-1]
+                c_shape = var.shape[:-2] + var.shape[-1]
+                self._r.append(
+                    self.add_variable(
+                        shape=r_shape,
+                        dtype=var.dtype,
+                        name=f"r/{var._shared_name}",
+                    )
+                )
+                self._c.append(
+                    self.add_variable(
+                        shape=c_shape,
+                        dtype=var.dtype,
+                        name=f"c/{var._shared_name}",
+                    )
+                )
+            self._v.append(
+                self.add_variable_from_reference(
+                    model_variable=var, variable_name="v"
+                )
+            )
+
+    def _rms(self, x):
+        return tf.sqrt(tf.reduce_mean(tf.square(x)))
+
+    def update_step(self, gradient, variable):
+        """Update step given gradient and the associated model variable."""
+
+        lr = tf.cast(self.learning_rate, variable.dtype)
+        epsilon_2 = tf.cast(self.epsilon_2, variable.dtype)
+        one = tf.cast(1.0, variable.dtype)
+        local_step = tf.cast(self.iterations + 1, variable.dtype)
+        if (
+            not isinstance(
+                self._learning_rate, learning_rate_schedule.LearningRateSchedule
+            )
+            and self.relative_step
+        ):
+            # If `relative_step=True` and learning rate is a constant, we
+            # apply the relative step algorithm.
+            lr = tf.minimum(lr, tf.math.rsqrt(local_step))
+
+        var_key = self._var_key(variable)
+        r = self._r[self._index_dict[var_key]]
+        c = self._c[self._index_dict[var_key]]
+        v = self._v[self._index_dict[var_key]]
+
+        rho_t = tf.minimum(lr, tf.math.rsqrt(local_step))
+        alpha_t = tf.maximum(epsilon_2, self._rms(variable)) * rho_t
+        regulated_grad_square = tf.square(gradient) + self.epsilon_1
+        beta_2_t = 1 - tf.pow(local_step, self.beta_2_decay)
+
+        if len(variable.shape) >= 2:
+            # `r` deletes the last dimension of gradient, so it is of shape
+            # `gradient.shape[:-1]`.
+            r.assign(
+                beta_2_t * r
+                + (1 - beta_2_t)
+                * tf.reduce_mean(regulated_grad_square, axis=-1)
+            )
+            # `c` deletes the second last dimension of gradient, so it is of
+            # shape `gradient.shape[:-2] + gradient.shape[-1]`.
+            c.assign(
+                beta_2_t * c
+                + (1 - beta_2_t)
+                * tf.reduce_mean(regulated_grad_square, axis=-2)
+            )
+            v.assign(
+                tf.expand_dims(
+                    r / tf.reduce_mean(r, axis=-1, keepdims=True), axis=-1
+                )
+                * tf.expand_dims(c, -2)
+            )
+        else:
+            v.assign(beta_2_t * v + (1 - beta_2_t) * regulated_grad_square)
+
+        # `convert_to_tensor` unifies the handling of sparse and dense grads.
+        u_t = tf.convert_to_tensor(gradient) * tf.math.rsqrt(v)
+        u_t_hat = u_t / tf.maximum(one, (self._rms(u_t) / self.clip_threshold))
+        variable.assign_add(-alpha_t * u_t_hat)
+
+    def get_config(self):
+        config = super().get_config()
+
+        config.update(
+            {
+                "learning_rate": self._serialize_hyperparameter(
+                    self._learning_rate
+                ),
+                "beta_2_decay": self.beta_2_decay,
+                "epsilon_1": self.epsilon_1,
+                "epsilon_2": self.epsilon_2,
+                "clip_threshold": self.clip_threshold,
+                "relative_step": self.relative_step,
+            }
+        )
+        return config
+
+
+Adafactor.__doc__ = Adafactor.__doc__.replace(
+    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
+)
diff --git a/keras/optimizers/adagrad.py b/keras/optimizers/adagrad.py
new file mode 100644
index 000000000000..0840d492e21d
--- /dev/null
+++ b/keras/optimizers/adagrad.py
@@ -0,0 +1,150 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Adagrad optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras import initializers +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.experimental.Adagrad", + "keras.optimizers.Adagrad", + "keras.dtensor.experimental.optimizers.Adagrad", + v1=[], +) +class Adagrad(optimizer.Optimizer): + r"""Optimizer that implements the Adagrad algorithm. + + Adagrad is an optimizer with parameter-specific learning rates, + which are adapted relative to how frequently a parameter gets + updated during training. The more updates a parameter receives, + the smaller the updates. + + Args: + learning_rate: Initial value for the learning rate: + either a floating point value, + or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. + Defaults to 0.001. Note that `Adagrad` tends to benefit from higher + initial learning rate values compared to other optimizers. To match + the exact form in the original paper, use 1.0. + initial_accumulator_value: Floating point value. + Starting value for the accumulators (per-parameter momentum values). + Must be non-negative. + epsilon: Small floating point value used to maintain numerical + stability. + {{base_optimizer_keyword_args}} + + Reference: + - [Duchi et al., 2011]( + http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf). + """ + + def __init__( + self, + learning_rate=0.001, + initial_accumulator_value=0.1, + epsilon=1e-7, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="Adagrad", + **kwargs + ): + super().__init__( + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + name=name, + **kwargs + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.initial_accumulator_value = initial_accumulator_value + self.epsilon = epsilon + + def build(self, var_list): + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._built = True + self._accumulators = [] + initializer = initializers.Constant(self.initial_accumulator_value) + for var in var_list: + self._accumulators.append( + self.add_variable_from_reference( + var, + "accumulator", + initial_value=initializer(shape=var.shape, dtype=var.dtype), + ) + ) + + def update_step(self, grad, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + + var_key = self._var_key(variable) + accumulator = self._accumulators[self._index_dict[var_key]] + + if isinstance(grad, tf.IndexedSlices): + # Sparse gradients. 
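+            # Only rows named in `grad.indices` are touched: the squared
+            # gradient values are scattered into the accumulator, and the
+            # matching accumulator slices are gathered back to scale the
+            # update, so untouched rows keep both their parameter values and
+            # their accumulated statistics.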
+ accumulator.scatter_add( + tf.IndexedSlices(grad.values * grad.values, grad.indices) + ) + sparse_accumulator = tf.gather(accumulator, indices=grad.indices) + sparse_denominator = tf.sqrt(sparse_accumulator + self.epsilon) + variable.scatter_add( + tf.IndexedSlices( + -lr * grad.values / sparse_denominator, grad.indices + ) + ) + else: + # Dense gradients. + accumulator.assign_add(grad * grad) + variable.assign_sub(lr * grad / tf.sqrt(accumulator + self.epsilon)) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "initial_accumulator_value": self.initial_accumulator_value, + "epsilon": self.epsilon, + } + ) + return config + + +Adagrad.__doc__ = Adagrad.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/adam.py b/keras/optimizers/adam.py new file mode 100644 index 000000000000..e17b10fa82bd --- /dev/null +++ b/keras/optimizers/adam.py @@ -0,0 +1,225 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Adam optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.Adam", + "keras.optimizers.experimental.Adam", + "keras.dtensor.experimental.optimizers.Adam", + v1=[], +) +class Adam(optimizer.Optimizer): + r"""Optimizer that implements the Adam algorithm. + + Adam optimization is a stochastic gradient descent method that is based on + adaptive estimation of first-order and second-order moments. + + According to + [Kingma et al., 2014](http://arxiv.org/abs/1412.6980), + the method is "*computationally + efficient, has little memory requirement, invariant to diagonal rescaling of + gradients, and is well suited for problems that are large in terms of + data/parameters*". + + Args: + learning_rate: A `tf.Tensor`, floating point value, a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to `0.001`. + beta_1: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + exponential decay rate for the 1st moment estimates. + Defaults to `0.9`. + beta_2: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + exponential decay rate for the 2nd moment estimates. + Defaults to `0.999`. + epsilon: A small constant for numerical stability. 
This epsilon is + "epsilon hat" in the Kingma and Ba paper (in the formula just before + Section 2.1), not the epsilon in Algorithm 1 of the paper. + Defaults to `1e-7`. + amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm + from the paper "On the Convergence of Adam and beyond". + Defaults to `False`. + {{base_optimizer_keyword_args}} + + Reference: + - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) + - [Reddi et al., 2018]( + https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`. + + Notes: + + The default value of 1e-7 for epsilon might not be a good default in + general. For example, when training an Inception network on ImageNet a + current good choice is 1.0 or 0.1. Note that since Adam uses the + formulation just before Section 2.1 of the Kingma and Ba paper rather than + the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon + hat" in the paper. + + The sparse implementation of this algorithm (used when the gradient is an + IndexedSlices object, typically because of `tf.gather` or an embedding + lookup in the forward pass) does apply momentum to variable slices even if + they were not used in the forward pass (meaning they have a gradient equal + to zero). Momentum decay (beta1) is also applied to the entire momentum + accumulator. This means that the sparse behavior is equivalent to the dense + behavior (in contrast to some momentum implementations which ignore momentum + unless a variable slice was actually used). + """ + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="Adam", + **kwargs + ): + super().__init__( + name=name, + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + **kwargs + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.amsgrad = amsgrad + + def build(self, var_list): + """Initialize optimizer variables. + + Adam optimizer has 3 types of variables: momentums, velocities and + velocity_hat (only set when amsgrad is applied), + + Args: + var_list: list of model variables to build Adam variables on. 
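+
+        Note that calling `build` again after the first call is a no-op; the
+        `_built` flag below makes repeated invocations return early.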
+ """ + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._built = True + self._momentums = [] + self._velocities = [] + for var in var_list: + self._momentums.append( + self.add_variable_from_reference( + model_variable=var, variable_name="m" + ) + ) + self._velocities.append( + self.add_variable_from_reference( + model_variable=var, variable_name="v" + ) + ) + if self.amsgrad: + self._velocity_hats = [] + for var in var_list: + self._velocity_hats.append( + self.add_variable_from_reference( + model_variable=var, variable_name="vhat" + ) + ) + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + local_step = tf.cast(self.iterations + 1, variable.dtype) + beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step) + beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step) + + var_key = self._var_key(variable) + m = self._momentums[self._index_dict[var_key]] + v = self._velocities[self._index_dict[var_key]] + + alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power) + + if isinstance(gradient, tf.IndexedSlices): + # Sparse gradients. + m.assign_add(-m * (1 - self.beta_1)) + m.scatter_add( + tf.IndexedSlices( + gradient.values * (1 - self.beta_1), gradient.indices + ) + ) + v.assign_add(-v * (1 - self.beta_2)) + v.scatter_add( + tf.IndexedSlices( + tf.square(gradient.values) * (1 - self.beta_2), + gradient.indices, + ) + ) + if self.amsgrad: + v_hat = self._velocity_hats[self._index_dict[var_key]] + v_hat.assign(tf.maximum(v_hat, v)) + v = v_hat + variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) + else: + # Dense gradients. + m.assign_add((gradient - m) * (1 - self.beta_1)) + v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2)) + if self.amsgrad: + v_hat = self._velocity_hats[self._index_dict[var_key]] + v_hat.assign(tf.maximum(v_hat, v)) + v = v_hat + variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "beta_1": self.beta_1, + "beta_2": self.beta_2, + "epsilon": self.epsilon, + "amsgrad": self.amsgrad, + } + ) + return config + + +Adam.__doc__ = Adam.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/adamax.py b/keras/optimizers/adamax.py new file mode 100644 index 000000000000..9b542ee57860 --- /dev/null +++ b/keras/optimizers/adamax.py @@ -0,0 +1,188 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Adamax optimizer implementation."""
+
+import tensorflow.compat.v2 as tf
+
+from keras.optimizers import optimizer
+from keras.saving.object_registration import register_keras_serializable
+
+# isort: off
+from tensorflow.python.util.tf_export import keras_export
+
+
+@register_keras_serializable()
+@keras_export(
+    "keras.optimizers.experimental.Adamax", "keras.optimizers.Adamax", v1=[]
+)
+class Adamax(optimizer.Optimizer):
+    """Optimizer that implements the Adamax algorithm.
+
+    Adamax, a variant of Adam based on the infinity norm, is a first-order
+    gradient-based optimization method. Due to its capability of adjusting the
+    learning rate based on data characteristics, it is suited to learn
+    time-variant processes, e.g., speech data with dynamically changed noise
+    conditions. Default parameters follow those provided in the paper (see
+    references below).
+
+    Initialization:
+
+    ```python
+    m = 0  # Initialize the 1st moment vector
+    u = 0  # Initialize the exponentially weighted infinity norm
+    t = 0  # Initialize timestep
+    ```
+
+    The update rule for parameter `w` with gradient `g` is described at the end
+    of section 7.1 of the paper (see the reference section):
+
+    ```python
+    t += 1
+    m = beta1 * m + (1 - beta1) * g
+    u = max(beta2 * u, abs(g))
+    current_lr = learning_rate / (1 - beta1 ** t)
+    w = w - current_lr * m / (u + epsilon)
+    ```
+
+    Args:
+        learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
+            `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+            that takes no arguments and returns the actual value to use. The
+            learning rate. Defaults to `0.001`.
+        beta_1: A float value or a constant float tensor. The exponential decay
+            rate for the 1st moment estimates.
+        beta_2: A float value or a constant float tensor. The exponential decay
+            rate for the exponentially weighted infinity norm.
+        epsilon: A small constant for numerical stability.
+        {{base_optimizer_keyword_args}}
+
+    Reference:
+        - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+    """
+
+    def __init__(
+        self,
+        learning_rate=0.001,
+        beta_1=0.9,
+        beta_2=0.999,
+        epsilon=1e-7,
+        weight_decay=None,
+        clipnorm=None,
+        clipvalue=None,
+        global_clipnorm=None,
+        use_ema=False,
+        ema_momentum=0.99,
+        ema_overwrite_frequency=None,
+        jit_compile=True,
+        name="Adamax",
+        **kwargs
+    ):
+        super().__init__(
+            name=name,
+            weight_decay=weight_decay,
+            clipnorm=clipnorm,
+            clipvalue=clipvalue,
+            global_clipnorm=global_clipnorm,
+            use_ema=use_ema,
+            ema_momentum=ema_momentum,
+            ema_overwrite_frequency=ema_overwrite_frequency,
+            jit_compile=jit_compile,
+            **kwargs
+        )
+        self._learning_rate = self._build_learning_rate(learning_rate)
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+
+    def build(self, var_list):
+        """Initialize optimizer variables.
+
+        Adamax optimizer has 2 types of variables: momentums (denoted as m),
+        exponentially weighted infinity norm (denoted as u).
+
+        Args:
+            var_list: list of model variables to build Adamax variables on.
+ """ + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._built = True + self._m = [] + self._u = [] + for var in var_list: + self._m.append( + self.add_variable_from_reference( + model_variable=var, variable_name="m" + ) + ) + self._u.append( + self.add_variable_from_reference( + model_variable=var, variable_name="u" + ) + ) + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + local_step = tf.cast(self.iterations + 1, variable.dtype) + beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step) + + var_key = self._var_key(variable) + m = self._m[self._index_dict[var_key]] + u = self._u[self._index_dict[var_key]] + + if isinstance(gradient, tf.IndexedSlices): + # Sparse gradients. + indices = gradient.indices + m.assign_add(-m * (1 - self.beta_1)) + m.scatter_add( + tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices) + ) + u.assign(u * self.beta_2) + u_slice = tf.gather(u, indices) + u_slice_incremental = ( + tf.maximum(u_slice, tf.abs(gradient.values)) - u_slice + ) + u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices)) + variable.assign_sub( + (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)) + ) + else: + # Dense gradients. + m.assign_add((gradient - m) * (1 - self.beta_1)) + u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient))) + variable.assign_sub( + (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)) + ) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "beta_1": self.beta_1, + "beta_2": self.beta_2, + "epsilon": self.epsilon, + } + ) + return config + + +Adamax.__doc__ = Adamax.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/adamw.py b/keras/optimizers/adamw.py new file mode 100644 index 000000000000..8ae5195b5872 --- /dev/null +++ b/keras/optimizers/adamw.py @@ -0,0 +1,230 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""AdamW optimizer implementation.""" + + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.AdamW", + "keras.optimizers.experimental.AdamW", + "keras.dtensor.experimental.optimizers.AdamW", + v1=[], +) +class AdamW(optimizer.Optimizer): + r"""Optimizer that implements the AdamW algorithm. 
+
+    AdamW optimization is a stochastic gradient descent method that is based
+    on adaptive estimation of first-order and second-order moments with an
+    added method to decay weights per the techniques discussed in the paper,
+    'Decoupled Weight Decay Regularization' by
+    [Loshchilov & Hutter, 2019](https://arxiv.org/abs/1711.05101).
+
+    According to
+    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
+    the underlying Adam method is "*computationally
+    efficient, has little memory requirement, invariant to diagonal rescaling
+    of gradients, and is well suited for problems that are large in terms of
+    data/parameters*".
+
+    Args:
+        learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
+            `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+            that takes no arguments and returns the actual value to use. The
+            learning rate. Defaults to 0.001.
+        beta_1: A float value or a constant float tensor, or a callable
+            that takes no arguments and returns the actual value to use. The
+            exponential decay rate for the 1st moment estimates.
+            Defaults to 0.9.
+        beta_2: A float value or a constant float tensor, or a callable
+            that takes no arguments and returns the actual value to use. The
+            exponential decay rate for the 2nd moment estimates.
+            Defaults to 0.999.
+        epsilon: A small constant for numerical stability. This epsilon is
+            "epsilon hat" in the Kingma and Ba paper (in the formula just
+            before Section 2.1), not the epsilon in Algorithm 1 of the paper.
+            Defaults to 1e-7.
+        amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm
+            from the paper "On the Convergence of Adam and beyond".
+            Defaults to `False`.
+        {{base_optimizer_keyword_args}}
+
+    Reference:
+        - [Loshchilov et al., 2019](https://arxiv.org/abs/1711.05101)
+        - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) for `adam`
+        - [Reddi et al., 2018](
+            https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.
+
+    Notes:
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the
+    dense behavior (in contrast to some momentum implementations which ignore
+    momentum unless a variable slice was actually used).
+    """
+
+    def __init__(
+        self,
+        learning_rate=0.001,
+        weight_decay=0.004,
+        beta_1=0.9,
+        beta_2=0.999,
+        epsilon=1e-7,
+        amsgrad=False,
+        clipnorm=None,
+        clipvalue=None,
+        global_clipnorm=None,
+        use_ema=False,
+        ema_momentum=0.99,
+        ema_overwrite_frequency=None,
+        jit_compile=True,
+        name="AdamW",
+        **kwargs
+    ):
+        super().__init__(
+            name=name,
+            clipnorm=clipnorm,
+            clipvalue=clipvalue,
+            global_clipnorm=global_clipnorm,
+            use_ema=use_ema,
+            ema_momentum=ema_momentum,
+            ema_overwrite_frequency=ema_overwrite_frequency,
+            jit_compile=jit_compile,
+            **kwargs
+        )
+        self._learning_rate = self._build_learning_rate(learning_rate)
+        self.weight_decay = weight_decay
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+        self.amsgrad = amsgrad
+
+        if self.weight_decay is None:
+            raise ValueError(
+                "Missing value of `weight_decay` which is required and"
+                " must be a float value."
+            )
+
+    def build(self, var_list):
+        """Initialize optimizer variables.
+ + AdamW optimizer has 3 types of variables: momentums, velocities and + velocity_hat (only set when amsgrad is applied), + + Args: + var_list: list of model variables to build AdamW variables on. + """ + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._built = True + self._momentums = [] + self._velocities = [] + for var in var_list: + self._momentums.append( + self.add_variable_from_reference( + model_variable=var, variable_name="m" + ) + ) + self._velocities.append( + self.add_variable_from_reference( + model_variable=var, variable_name="v" + ) + ) + if self.amsgrad: + self._velocity_hats = [] + for var in var_list: + self._velocity_hats.append( + self.add_variable_from_reference( + model_variable=var, variable_name="vhat" + ) + ) + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + local_step = tf.cast(self.iterations + 1, variable.dtype) + beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step) + beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step) + + var_key = self._var_key(variable) + m = self._momentums[self._index_dict[var_key]] + v = self._velocities[self._index_dict[var_key]] + + alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power) + + if isinstance(gradient, tf.IndexedSlices): + # Sparse gradients. + m.assign_add(-m * (1 - self.beta_1)) + m.scatter_add( + tf.IndexedSlices( + gradient.values * (1 - self.beta_1), gradient.indices + ) + ) + v.assign_add(-v * (1 - self.beta_2)) + v.scatter_add( + tf.IndexedSlices( + tf.square(gradient.values) * (1 - self.beta_2), + gradient.indices, + ) + ) + if self.amsgrad: + v_hat = self._velocity_hats[self._index_dict[var_key]] + v_hat.assign(tf.maximum(v_hat, v)) + v = v_hat + variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) + else: + # Dense gradients. + m.assign_add((gradient - m) * (1 - self.beta_1)) + v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2)) + if self.amsgrad: + v_hat = self._velocity_hats[self._index_dict[var_key]] + v_hat.assign(tf.maximum(v_hat, v)) + v = v_hat + variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "weight_decay": self.weight_decay, + "beta_1": self.beta_1, + "beta_2": self.beta_2, + "epsilon": self.epsilon, + "amsgrad": self.amsgrad, + } + ) + return config + + +AdamW.__doc__ = AdamW.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/ftrl.py b/keras/optimizers/ftrl.py new file mode 100644 index 000000000000..30f4db99c928 --- /dev/null +++ b/keras/optimizers/ftrl.py @@ -0,0 +1,258 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
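+#
+# (Note: the moment updates in `update_step` above are identical to Adam's.
+# The "decoupled" part of AdamW is that `weight_decay` is handled by the base
+# `Optimizer` in its `apply_gradients` path, which, roughly and independently
+# of `m`/`v`, shrinks each weight as `w -= learning_rate * weight_decay * w`
+# before the moment update runs.)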
+# ============================================================================== +"""FTRL optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.experimental.Ftrl", "keras.optimizers.Ftrl", v1=[] +) +class Ftrl(optimizer.Optimizer): + r"""Optimizer that implements the FTRL algorithm. + + "Follow The Regularized Leader" (FTRL) is an optimization algorithm + developed at Google for click-through rate prediction in the early 2010s. It + is most suitable for shallow models with large and sparse feature spaces. + The algorithm is described by + [McMahan et al., 2013](https://research.google.com/pubs/archive/41159.pdf). + The Keras version has support for both online L2 regularization + (the L2 regularization described in the paper + above) and shrinkage-type L2 regularization + (which is the addition of an L2 penalty to the loss function). + + Initialization: + + ```python + n = 0 + sigma = 0 + z = 0 + ``` + + Update rule for one variable `w`: + + ```python + prev_n = n + n = n + g ** 2 + sigma = (n ** -lr_power - prev_n ** -lr_power) / lr + z = z + g - sigma * w + if abs(z) < lambda_1: + w = 0 + else: + w = (sgn(z) * lambda_1 - z) / ((beta + sqrt(n)) / alpha + lambda_2) + ``` + + Notation: + + - `lr` is the learning rate + - `g` is the gradient for the variable + - `lambda_1` is the L1 regularization strength + - `lambda_2` is the L2 regularization strength + - `lr_power` is the power to scale n. + + Check the documentation for the `l2_shrinkage_regularization_strength` + parameter for more details when shrinkage is enabled, in which case gradient + is replaced with a gradient with shrinkage. + + Args: + learning_rate: A `Tensor`, floating point value, a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to `0.001`. + learning_rate_power: A float value, must be less or equal to zero. + Controls how the learning rate decreases during training. Use zero + for a fixed learning rate. + initial_accumulator_value: The starting value for accumulators. Only + zero or positive values are allowed. + l1_regularization_strength: A float value, must be greater than or equal + to zero. Defaults to `0.0`. + l2_regularization_strength: A float value, must be greater than or equal + to zero. Defaults to `0.0`. + l2_shrinkage_regularization_strength: A float value, must be greater + than or equal to zero. This differs from L2 above in that the L2 + above is a stabilization penalty, whereas this L2 shrinkage is a + magnitude penalty. When input is sparse shrinkage will only happen + on the active weights. + beta: A float value, representing the beta value from the paper. + Defaults to 0.0. 
+ {{base_optimizer_keyword_args}} + """ + + def __init__( + self, + learning_rate=0.001, + learning_rate_power=-0.5, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + l2_shrinkage_regularization_strength=0.0, + beta=0.0, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="Ftrl", + **kwargs, + ): + super().__init__( + name=name, + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + **kwargs, + ) + + if initial_accumulator_value < 0.0: + raise ValueError( + "`initial_accumulator_value` needs to be positive or zero. " + "Received: initial_accumulator_value=" + f"{initial_accumulator_value}." + ) + if learning_rate_power > 0.0: + raise ValueError( + "`learning_rate_power` needs to be negative or zero. Received: " + f"learning_rate_power={learning_rate_power}." + ) + if l1_regularization_strength < 0.0: + raise ValueError( + "`l1_regularization_strength` needs to be positive or zero. " + "Received: l1_regularization_strength=" + f"{l1_regularization_strength}." + ) + if l2_regularization_strength < 0.0: + raise ValueError( + "`l2_regularization_strength` needs to be positive or zero. " + "Received: l2_regularization_strength=" + f"{l2_regularization_strength}." + ) + if l2_shrinkage_regularization_strength < 0.0: + raise ValueError( + "`l2_shrinkage_regularization_strength` needs to be positive " + "or zero. Received: l2_shrinkage_regularization_strength" + f"={l2_shrinkage_regularization_strength}." + ) + + self._learning_rate = self._build_learning_rate(learning_rate) + self.learning_rate_power = learning_rate_power + self.initial_accumulator_value = initial_accumulator_value + self.l1_regularization_strength = l1_regularization_strength + self.l2_regularization_strength = l2_regularization_strength + self.l2_shrinkage_regularization_strength = ( + l2_shrinkage_regularization_strength + ) + self.beta = beta + + def build(self, var_list): + """Initialize optimizer variables. + + Args: + var_list: list of model variables to build Ftrl variables on. + """ + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._accumulators = [] + self._linears = [] + for var in var_list: + self._accumulators.append( + self.add_variable_from_reference( + model_variable=var, + variable_name="accumulator", + initial_value=tf.cast( + tf.fill( + dims=var.shape, value=self.initial_accumulator_value + ), + dtype=var.dtype, + ), + ) + ) + self._linears.append( + self.add_variable_from_reference( + model_variable=var, variable_name="linear" + ) + ) + self._built = True + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + + lr = tf.cast(self.learning_rate, variable.dtype) + var_key = self._var_key(variable) + accum = self._accumulators[self._index_dict[var_key]] + linear = self._linears[self._index_dict[var_key]] + + lr_power = self.learning_rate_power + l2_reg = self.l2_regularization_strength + l2_reg = l2_reg + self.beta / (2.0 * lr) + + # Ftrl optimizer has the same implementation for sparse and dense + # gradients update. 
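+        # In the docstring's notation, `accum` is n and `linear` is z; the
+        # assignments below compute, with g_shrunk = g + 2 * l2_shrinkage * w:
+        #   n_new = n + g ** 2
+        #   z += g_shrunk - (n_new ** -lr_power - n ** -lr_power) / lr * w
+        #   w = (clip(z, +/-lambda_1) - z)
+        #       / (n_new ** -lr_power / lr + 2 * l2_reg)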
+ grad_to_use = ( + gradient + 2 * self.l2_shrinkage_regularization_strength * variable + ) + new_accum = accum + tf.pow(gradient, 2) + linear.assign_add( + grad_to_use + - (tf.pow(new_accum, -lr_power) - tf.pow(accum, -lr_power)) + / lr + * variable + ) + quadratic = tf.pow(new_accum, (-lr_power)) / lr + 2 * l2_reg + linear_clipped = tf.clip_by_value( + linear, + -self.l1_regularization_strength, + self.l1_regularization_strength, + ) + variable.assign((linear_clipped - linear) / quadratic) + accum.assign(new_accum) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "learning_rate_power": self.learning_rate_power, + "initial_accumulator_value": self.initial_accumulator_value, + "l1_regularization_strength": self.l1_regularization_strength, + "l2_regularization_strength": self.l2_regularization_strength, + "l2_shrinkage_regularization_strength": self.l2_shrinkage_regularization_strength, # noqa: E501 + "beta": self.beta, + } + ) + return config + + +Ftrl.__doc__ = Ftrl.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/legacy/BUILD b/keras/optimizers/legacy/BUILD index bc2d850fcfa1..ee714565e0ff 100644 --- a/keras/optimizers/legacy/BUILD +++ b/keras/optimizers/legacy/BUILD @@ -1,42 +1,163 @@ -# Legacy Keras optimizers. +# Description: +# Contains the Keras OptimizerV2 API (internal TensorFlow version). + +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], + # TODO(scottzhu): Remove non-keras deps from TF. default_visibility = [ "//keras:friends", + "//third_party/tensorflow/cc/saved_model:__pkg__", # For unit tests. + "//third_party/tensorflow/python/trackable:__pkg__", ], licenses = ["notice"], ) py_library( - name = "optimizer", + name = "optimizers", srcs = [ "adadelta.py", "adagrad.py", "adam.py", "adamax.py", "ftrl.py", + "gradient_descent.py", "nadam.py", - "optimizer.py", + "optimizer_v2.py", "rmsprop.py", - "sgd.py", ], srcs_version = "PY3", deps = [ "//:expect_tensorflow_installed", - "//keras/optimizers/optimizer_v2", + "//keras:backend", + "//keras:backend_config", + "//keras/engine:base_layer_utils", + "//keras/initializers", + "//keras/optimizers:utils", + "//keras/optimizers/schedules:learning_rate_schedule", + "//keras/utils:layer_utils", + "//keras/utils:tf_utils", + ], +) + +cuda_py_test( + name = "adagrad_test", + size = "medium", + srcs = ["adagrad_test.py"], + shard_count = 4, + deps = [ + ":optimizers", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", + ], +) + +cuda_py_test( + name = "adam_test", + size = "medium", + srcs = ["adam_test.py"], + shard_count = 4, + tags = [ + "no_rocm", + "no_windows", # TODO(b/171384138) + ], + deps = [ + ":optimizers", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", + ], +) + +cuda_py_test( + name = "adamax_test", + size = "medium", + srcs = ["adamax_test.py"], + shard_count = 4, + # TODO(b/168527439): invalid resource variable reference on GPU for TFRT. 
+ tags = ["no_rocm"], + deps = [ + ":optimizers", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", + ], +) + +cuda_py_test( + name = "adadelta_test", + size = "medium", + srcs = ["adadelta_test.py"], + shard_count = 4, + # TODO(b/168527439): invalid resource variable reference on GPU for TFRT. + deps = [ + ":optimizers", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", ], ) cuda_py_test( - name = "optimizer_test", + name = "ftrl_test", size = "medium", - srcs = ["optimizer_test.py"], + srcs = ["ftrl_test.py"], + shard_count = 4, + deps = [ + ":optimizers", + "//:expect_tensorflow_installed", + ], +) + +cuda_py_test( + name = "gradient_descent_test", + size = "medium", + srcs = ["gradient_descent_test.py"], + shard_count = 4, + deps = [ + ":optimizers", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", + ], +) + +cuda_py_test( + name = "nadam_test", + size = "medium", + srcs = ["nadam_test.py"], + shard_count = 4, + deps = [ + ":optimizers", + "//:expect_tensorflow_installed", + ], +) + +cuda_py_test( + name = "optimizer_v2_test", + size = "medium", + srcs = ["optimizer_v2_test.py"], shard_count = 8, + tags = [ + "no_windows", + ], deps = [ - ":optimizer", + ":optimizers", "//:expect_absl_installed", "//:expect_tensorflow_installed", "//keras", + "//keras/testing_infra:test_combinations", + ], +) + +cuda_py_test( + name = "rmsprop_test", + size = "medium", + srcs = ["rmsprop_test.py"], + shard_count = 2, + # TODO(b/168527439): invalid resource variable reference on GPU for TFRT. + deps = [ + ":optimizers", + "//:expect_absl_installed", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", ], ) diff --git a/keras/optimizers/legacy/__init__.py b/keras/optimizers/legacy/__init__.py index 144c69218e11..78cb171abbaf 100644 --- a/keras/optimizers/legacy/__init__.py +++ b/keras/optimizers/legacy/__init__.py @@ -12,13 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Legacy optimizer package.""" - -from keras.optimizers.legacy.adadelta import Adadelta -from keras.optimizers.legacy.adagrad import Adagrad -from keras.optimizers.legacy.adam import Adam -from keras.optimizers.legacy.adamax import Adamax -from keras.optimizers.legacy.ftrl import Ftrl -from keras.optimizers.legacy.nadam import Nadam -from keras.optimizers.legacy.rmsprop import RMSprop -from keras.optimizers.legacy.sgd import SGD diff --git a/keras/optimizers/legacy/adadelta.py b/keras/optimizers/legacy/adadelta.py index b803159d1fb9..9310a9bfcfd5 100644 --- a/keras/optimizers/legacy/adadelta.py +++ b/keras/optimizers/legacy/adadelta.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,153 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Legacy Adadelta optimizer implementation.""" +"""Adadelta optimizer implementation.""" -from keras.optimizers.optimizer_v2 import adadelta +import numpy as np +import tensorflow.compat.v2 as tf +from keras import backend_config +from keras.optimizers.legacy import optimizer_v2 + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.Adadelta') -class Adadelta(adadelta.Adadelta): - pass +@keras_export( + "keras.optimizers.legacy.Adadelta", + v1=["keras.optimizers.Adadelta", "keras.optimizers.legacy.Adadelta"], +) +class Adadelta(optimizer_v2.OptimizerV2): + r"""Optimizer that implements the Adadelta algorithm. + + Adadelta optimization is a stochastic gradient descent method that is based + on adaptive learning rate per dimension to address two drawbacks: + + - The continual decay of learning rates throughout training. + - The need for a manually selected global learning rate. + + Adadelta is a more robust extension of Adagrad that adapts learning rates + based on a moving window of gradient updates, instead of accumulating all + past gradients. This way, Adadelta continues learning even when many updates + have been done. Compared to Adagrad, in the original version of Adadelta you + don't have to set an initial learning rate. In this version, the initial + learning rate can be set, as in most other Keras optimizers. + + Args: + learning_rate: Initial value for the learning rate: + either a floating point value, + or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. + Note that `Adadelta` tends to benefit from higher initial learning rate + values compared to other optimizers. + To match the exact form in the original paper, use 1.0. + Defaults to `0.001`. + rho: A `Tensor` or a floating point value. The decay rate. + epsilon: Small floating point value used to maintain numerical stability. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"Adadelta"`. + **kwargs: keyword arguments. Allowed arguments are `clipvalue`, + `clipnorm`, `global_clipnorm`. + If `clipvalue` (float) is set, the gradient of each weight + is clipped to be no higher than this value. + If `clipnorm` (float) is set, the gradient of each weight + is individually clipped so that its norm is no higher than this value. + If `global_clipnorm` (float) is set the gradient of all weights is + clipped so that their global norm is no higher than this value. + + Reference: + - [Zeiler, 2012](http://arxiv.org/abs/1212.5701) + """ + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + learning_rate=0.001, + rho=0.95, + epsilon=1e-7, + name="Adadelta", + **kwargs + ): + super().__init__(name, **kwargs) + self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) + self._set_hyper("decay", self._initial_decay) + self._set_hyper("rho", rho) + self.epsilon = epsilon or backend_config.epsilon() + + def _create_slots(self, var_list): + # Separate for-loops to respect the ordering of slot variables from v1. 
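+        # (Slot creation order determines the layout of `optimizer.weights`,
+        # which `set_weights` below relies on when restoring V1 checkpoints.)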
+ for v in var_list: + self.add_slot(v, "accum_grad") + for v in var_list: + self.add_slot(v, "accum_var") + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + apply_state[(var_device, var_dtype)].update( + dict( + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + rho=tf.identity(self._get_hyper("rho", var_dtype)), + ) + ) + + def set_weights(self, weights): + params = self.weights + # Override set_weights for backward compatibility of Keras V1 optimizer + # since it does not include iteration at head of the weight list. Set + # iteration to 0. + if len(params) == len(weights) + 1: + weights = [np.array(0)] + weights + super().set_weights(weights) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + accum_grad = self.get_slot(var, "accum_grad") + accum_var = self.get_slot(var, "accum_var") + return tf.raw_ops.ResourceApplyAdadelta( + var=var.handle, + accum=accum_grad.handle, + accum_update=accum_var.handle, + lr=coefficients["lr_t"], + rho=coefficients["rho"], + epsilon=coefficients["epsilon"], + grad=grad, + use_locking=self._use_locking, + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + accum_grad = self.get_slot(var, "accum_grad") + accum_var = self.get_slot(var, "accum_var") + return tf.raw_ops.ResourceSparseApplyAdadelta( + var=var.handle, + accum=accum_grad.handle, + accum_update=accum_var.handle, + lr=coefficients["lr_t"], + rho=coefficients["rho"], + epsilon=coefficients["epsilon"], + grad=grad, + indices=indices, + use_locking=self._use_locking, + ) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "rho": self._serialize_hyperparameter("rho"), + "epsilon": self.epsilon, + } + ) + return config diff --git a/keras/optimizers/legacy/adadelta_test.py b/keras/optimizers/legacy/adadelta_test.py new file mode 100644 index 000000000000..b9d8937b266f --- /dev/null +++ b/keras/optimizers/legacy/adadelta_test.py @@ -0,0 +1,223 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Adadelta Optimizer.""" + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.optimizers.legacy import adadelta +from keras.testing_infra import test_combinations + +_DATA_TYPES = [tf.half, tf.float32, tf.float64, tf.complex64, tf.complex128] + + +class AdadeltaOptimizerTest(tf.test.TestCase, parameterized.TestCase): + def doTestBasic(self, use_resource=False, use_callable_params=False): + num_updates = 4 # number of ADADELTA steps to perform + for dtype in _DATA_TYPES: + for grad in [0.2, 0.1, 0.01]: + for lr in [1.0, 0.5, 0.1]: + var0_init = [1.0, 2.0] + var1_init = [3.0, 4.0] + if use_resource: + var0 = tf.Variable(var0_init, dtype=dtype) + var1 = tf.Variable(var1_init, dtype=dtype) + else: + var0 = tf.Variable(var0_init, dtype=dtype) + var1 = tf.Variable(var1_init, dtype=dtype) + + grads = tf.constant([grad, grad], dtype=dtype) + + accum = 0.0 + accum_update = 0.0 + + # ADADELTA gradient optimizer + rho = 0.95 + epsilon = 1e-8 + if use_callable_params: + adadelta_opt = adadelta.Adadelta( + learning_rate=lambda: lr, + rho=lambda: rho, + epsilon=epsilon, + ) + else: + adadelta_opt = adadelta.Adadelta( + learning_rate=lr, rho=rho, epsilon=epsilon + ) + if not tf.executing_eagerly(): + adadelta_update = adadelta_opt.apply_gradients( + zip([grads, grads], [var0, var1]) + ) + self.evaluate( + tf.compat.v1.global_variables_initializer() + ) + + # Assign slots + slot = [None] * 2 + slot_update = [None] * 2 + slot[0] = adadelta_opt.get_slot(var0, "accum_grad") + self.assertEqual(slot[0].shape, var0.shape) + + slot_update[0] = adadelta_opt.get_slot( + var0, "accum_var" + ) + self.assertEqual(slot_update[0].shape, var0.shape) + + slot[1] = adadelta_opt.get_slot(var1, "accum_grad") + self.assertEqual(slot[1].shape, var1.shape) + + slot_update[1] = adadelta_opt.get_slot( + var1, "accum_var" + ) + self.assertEqual(slot_update[1].shape, var1.shape) + + # Fetch params to validate initial values + self.assertAllClose(var0_init, self.evaluate(var0)) + self.assertAllClose(var1_init, self.evaluate(var1)) + + update = [None] * num_updates + tot_update = 0 + for step in range(num_updates): + # Run adadelta update for comparison + if not tf.executing_eagerly(): + self.evaluate(adadelta_update) + else: + adadelta_opt.apply_gradients( + zip([grads, grads], [var0, var1]) + ) + + # Perform initial update without previous accum values + accum = accum * rho + (grad**2) * (1 - rho) + update[step] = ( + np.sqrt(accum_update + epsilon) + * (1.0 / np.sqrt(accum + epsilon)) + * grad + ) + accum_update = accum_update * rho + ( + update[step] ** 2 + ) * (1.0 - rho) + tot_update += update[step] * lr + + if not tf.executing_eagerly(): + # Check that the accumulators have been updated + # TODO(lxuechen): This is hard to test in eager mode + for slot_idx in range(2): + self.assertAllCloseAccordingToType( + np.array( + [accum, accum], + dtype=dtype.as_numpy_dtype(0), + ), + self.evaluate(slot[slot_idx]), + rtol=1e-5, + ) + + self.assertAllCloseAccordingToType( + np.array( + [accum_update, accum_update], + dtype=dtype.as_numpy_dtype(0), + ), + self.evaluate(slot_update[slot_idx]), + rtol=1e-5, + ) + + # Check that the parameters have been updated + self.assertAllCloseAccordingToType( + np.array( + [ + var0_init[0] - tot_update, + var0_init[1] - tot_update, + ], + dtype=dtype.as_numpy_dtype(0), + ), + self.evaluate(var0), + rtol=1e-5, + ) + + self.assertAllCloseAccordingToType( 
+ np.array( + [ + var1_init[0] - tot_update, + var1_init[1] - tot_update, + ], + dtype=dtype.as_numpy_dtype(0), + ), + self.evaluate(var1), + rtol=1e-5, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testResourceBasic(self): + self.doTestBasic(use_resource=True) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testBasicCallableParams(self): + self.doTestBasic(use_resource=True, use_callable_params=True) + + def testMinimizeSparseResourceVariable(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + + def loss(): + pred = tf.matmul( + tf.compat.v1.nn.embedding_lookup([var0], [0]), x + ) + return pred * pred + + sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize( + loss, var_list=[var0] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllCloseAccordingToType( + [[1.0, 2.0]], self.evaluate(var0) + ) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[-111, -138]], self.evaluate(var0) + ) + + def testConstructAdadeltaWithLR(self): + opt = adadelta.Adadelta(lr=1.0, rho=0.9, epsilon=1.0) + opt_2 = adadelta.Adadelta( + learning_rate=0.1, rho=0.9, epsilon=1.0, lr=1.0 + ) + opt_3 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1.0) + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt_2.lr, tf.Variable) + self.assertIsInstance(opt_3.lr, tf.Variable) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) + + def testConstructAdadeltaWithEpsilonValues(self): + opt = adadelta.Adadelta(epsilon=None) + self.assertEqual(opt.epsilon, 1e-7) + + opt = adadelta.Adadelta(epsilon=1e-8) + self.assertEqual(opt.epsilon, 1e-8) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/adagrad.py b/keras/optimizers/legacy/adagrad.py index 895ed7d9aa7c..4b130051416d 100644 --- a/keras/optimizers/legacy/adagrad.py +++ b/keras/optimizers/legacy/adagrad.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,174 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Legacy Adagrad optimizer implementation.""" +"""Adagrad optimizer implementation.""" -from keras.optimizers.optimizer_v2 import adagrad +import numpy as np +import tensorflow.compat.v2 as tf +from keras import backend_config +from keras.optimizers.legacy import optimizer_v2 + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.Adagrad') -class Adagrad(adagrad.Adagrad): - pass +@keras_export( + "keras.optimizers.legacy.Adagrad", + v1=["keras.optimizers.Adagrad", "keras.optimizers.legacy.Adagrad"], +) +class Adagrad(optimizer_v2.OptimizerV2): + r"""Optimizer that implements the Adagrad algorithm. 
+
+    Adagrad is an optimizer with parameter-specific learning rates,
+    which are adapted relative to how frequently a parameter gets
+    updated during training. The more updates a parameter receives,
+    the smaller the updates.
+
+    Args:
+        learning_rate: Initial value for the learning rate:
+            either a floating point value,
+            or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
+            Note that `Adagrad` tends to benefit from higher initial learning
+            rate values compared to other optimizers.
+            To match the exact form in the original paper, use 1.0.
+            Defaults to `0.001`.
+        initial_accumulator_value: Floating point value.
+            Starting value for the accumulators (per-parameter momentum
+            values). Must be non-negative.
+        epsilon: Small floating point value used to maintain numerical
+            stability.
+        name: Optional name prefix for the operations created when applying
+            gradients. Defaults to `"Adagrad"`.
+        **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
+            `clipnorm`, `global_clipnorm`.
+            If `clipvalue` (float) is set, the gradient of each weight
+            is clipped to be no higher than this value.
+            If `clipnorm` (float) is set, the gradient of each weight
+            is individually clipped so that its norm is no higher than this
+            value. If `global_clipnorm` (float) is set the gradient of all
+            weights is clipped so that their global norm is no higher than
+            this value.
+
+    Reference:
+        - [Duchi et al., 2011](
+            http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
+    """
+
+    _HAS_AGGREGATE_GRAD = True
+
+    def __init__(
+        self,
+        learning_rate=0.001,
+        initial_accumulator_value=0.1,
+        epsilon=1e-7,
+        name="Adagrad",
+        **kwargs
+    ):
+        if initial_accumulator_value < 0.0:
+            raise ValueError(
+                "initial_accumulator_value must be non-negative: %s"
+                % initial_accumulator_value
+            )
+        if epsilon is None:
+            epsilon = backend_config.epsilon()
+        super().__init__(name, **kwargs)
+        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
+        self._set_hyper("decay", self._initial_decay)
+        self._initial_accumulator_value = initial_accumulator_value
+        self.epsilon = epsilon or backend_config.epsilon()
+
+    def _create_slots(self, var_list):
+        for var in var_list:
+            dtype = var.dtype.base_dtype
+            init = tf.compat.v1.constant_initializer(
+                self._initial_accumulator_value, dtype=dtype
+            )
+            self.add_slot(var, "accumulator", init)
+
+    def _prepare_local(self, var_device, var_dtype, apply_state):
+        super()._prepare_local(var_device, var_dtype, apply_state)
+        apply_state[(var_device, var_dtype)].update(
+            dict(
+                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
+                neg_lr_t=-apply_state[(var_device, var_dtype)]["lr_t"],
+                zero=tf.zeros((), dtype=tf.int64),
+            )
+        )
+
+    def set_weights(self, weights):
+        params = self.weights
+        # Override set_weights for backward compatibility of Keras V1 optimizer
+        # since it does not include iteration at head of the weight list. Set
+        # iteration to 0.
+        if len(params) == len(weights) + 1:
+            weights = [np.array(0)] + weights
+        super().set_weights(weights)
+
+    @classmethod
+    def from_config(cls, config, custom_objects=None):
+        """Creates an optimizer from its config.
+
+        This method is the reverse of `get_config`,
+        capable of instantiating the same optimizer from the config
+        dictionary.
+
+        Args:
+            config: A Python dictionary, typically the output of get_config.
+            custom_objects: A Python dictionary mapping names to additional
+                Python objects used to create this optimizer, such as a
+                function used for a hyperparameter.
+
+        Returns:
+            An optimizer instance.
+ """ + if "initial_accumulator_value" not in config: + config["initial_accumulator_value"] = 0.1 + if "lr" in config: + config["learning_rate"] = config.pop("lr") + return cls(**config) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + acc = self.get_slot(var, "accumulator") + return tf.raw_ops.ResourceApplyAdagradV2( + var=var.handle, + accum=acc.handle, + lr=coefficients["lr_t"], + epsilon=coefficients["epsilon"], + grad=grad, + use_locking=self._use_locking, + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + acc = self.get_slot(var, "accumulator") + return tf.raw_ops.ResourceSparseApplyAdagradV2( + var=var.handle, + accum=acc.handle, + lr=coefficients["lr_t"], + epsilon=coefficients["epsilon"], + grad=grad, + indices=indices, + use_locking=self._use_locking, + ) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "initial_accumulator_value": self._initial_accumulator_value, + "epsilon": self.epsilon, + } + ) + return config diff --git a/keras/optimizers/legacy/adagrad_test.py b/keras/optimizers/legacy/adagrad_test.py new file mode 100644 index 000000000000..221883aa3f49 --- /dev/null +++ b/keras/optimizers/legacy/adagrad_test.py @@ -0,0 +1,618 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for aggregate operations.""" + +import copy + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.optimizers.legacy import adagrad +from keras.optimizers.schedules import learning_rate_schedule +from keras.testing_infra import test_combinations + +_DATA_TYPES = [tf.half, tf.float32, tf.float64, tf.complex64, tf.complex128] + + +def adagrad_update_numpy(param, accum, g_t, lr=0.001, epsilon=1e-7): + accum_t = accum + g_t * g_t + param_t = param - lr * g_t / (np.sqrt(accum_t) + epsilon) + return param_t, accum_t + + +def sparse_adagrad_update_numpy( + param, accum, gindexs, gvalues, lr=0.001, epsilon=1e-7 +): + accum_t = copy.deepcopy(accum) + param_t = copy.deepcopy(param) + # first loop accumulates repeated indices if necessary. 
+ for i in range(len(gindexs)): + gindex = gindexs[i] + gvalue = gvalues[i] + accum_t[gindex] = accum_t[gindex] + gvalue * gvalue + for i in range(len(gindexs)): + gindex = gindexs[i] + gvalue = gvalues[i] + param_t[gindex] = param_t[gindex] - lr * gvalue / ( + np.sqrt(accum_t[gindex]) + epsilon + ) + return param_t, accum_t + + +class AdagradOptimizerTest(tf.test.TestCase, parameterized.TestCase): + def doTestBasic(self, use_callable_params=False): + for dtype in _DATA_TYPES: + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = lambda: 3.0 + if not use_callable_params: + learning_rate = learning_rate() + + ada_opt = adagrad.Adagrad(learning_rate) + + accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + + if not tf.executing_eagerly(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([3.0, 4.0], v1_val) + + # Run 3 steps of adagrad + for _ in range(3): + if not tf.executing_eagerly(): + self.evaluate(ada_update) + else: + ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + var0_np, accum0_np = adagrad_update_numpy( + var0_np, accum0_np, grads0_np, 3.0 + ) + var1_np, accum1_np = adagrad_update_numpy( + var1_np, accum1_np, grads1_np, 3.0 + ) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasic(self): + self.doTestBasic() + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testBasicCallableParams(self): + self.doTestBasic(use_callable_params=True) + + def testBasicWithLearningRateDecay(self): + for dtype in _DATA_TYPES: + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 3.0 + decay = 0.5 + + ada_opt = adagrad.Adagrad(learning_rate, decay=decay) + + accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + + if not tf.executing_eagerly(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([3.0, 4.0], v1_val) + + # Run 3 steps of adagrad + for t in range(3): + if not tf.executing_eagerly(): + self.evaluate(ada_update) + else: + ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + lr_np = learning_rate / (1 + decay * t) + var0_np, accum0_np = 
adagrad_update_numpy( + var0_np, accum0_np, grads0_np, lr_np + ) + var1_np, accum1_np = adagrad_update_numpy( + var1_np, accum1_np, grads1_np, lr_np + ) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + + def testBasicWithLargeEpsilon(self): + var0_np = np.array([1.0, 2.0]) + var1_np = np.array([3.0, 4.0]) + grads0_np = np.array([0.1, 0.1]) + grads1_np = np.array([0.01, 0.01]) + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 3.0 + + ada_opt = adagrad.Adagrad(learning_rate, epsilon=1.0) + + accum0_np = np.array([0.1, 0.1]) + accum1_np = np.array([0.1, 0.1]) + + if not tf.executing_eagerly(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([3.0, 4.0], v1_val) + + # Run 3 steps of adagrad + for _ in range(3): + if not tf.executing_eagerly(): + self.evaluate(ada_update) + else: + ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + var0_np, accum0_np = adagrad_update_numpy( + var0_np, accum0_np, grads0_np, 3.0, 1.0 + ) + var1_np, accum1_np = adagrad_update_numpy( + var1_np, accum1_np, grads1_np, 3.0, 1.0 + ) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + + def testBasicWithLearningRateInverseTimeDecay(self): + for dtype in _DATA_TYPES: + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 3.0 + decay = 0.5 + lr_schedule = learning_rate_schedule.InverseTimeDecay( + learning_rate, decay_steps=1.0, decay_rate=decay + ) + + ada_opt = adagrad.Adagrad(lr_schedule) + + accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + + if not tf.executing_eagerly(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllClose([1.0, 2.0], v0_val) + self.assertAllClose([3.0, 4.0], v1_val) + + # Run 3 steps of adagrad + for t in range(3): + if not tf.executing_eagerly(): + self.evaluate(ada_update) + else: + ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + lr_np = learning_rate / (1 + decay * t) + var0_np, accum0_np = adagrad_update_numpy( + var0_np, accum0_np, grads0_np, lr_np + ) + var1_np, accum1_np = adagrad_update_numpy( + var1_np, accum1_np, grads1_np, lr_np + ) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + + def testMinimizeSparseResourceVariable(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
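+ # (Graph mode is pinned below so that `minimize` returns an op the
+ # test can evaluate explicitly after checking the initial values.)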
+ with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0 = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + + def loss(): + pred = tf.matmul( + tf.compat.v1.nn.embedding_lookup([var0], [0]), x + ) + return pred * pred + + sgd_op = adagrad.Adagrad(1.0).minimize(loss, var_list=[var0]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllCloseAccordingToType( + [[1.0, 2.0], [3.0, 4.0]], self.evaluate(var0) + ) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[0, 1], [3, 4]], self.evaluate(var0), atol=0.01 + ) + + def testTensorLearningRate(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = tf.constant(3.0) + ada_opt = adagrad.Adagrad(learning_rate) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + # Run 3 steps of adagrad + for _ in range(3): + self.evaluate(ada_update) + var0_np, accum0_np = adagrad_update_numpy( + var0_np, accum0_np, grads0_np, learning_rate + ) + var1_np, accum1_np = adagrad_update_numpy( + var1_np, accum1_np, grads1_np, learning_rate + ) + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testSparseBasic(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
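+ # (Each `tf.IndexedSlices` below packs (values, indices,
+ # dense_shape); only the rows named in `indices` receive an
+ # Adagrad update.)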
+ with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array( + [0.01, 0, 0.01], dtype=dtype.as_numpy_dtype + ) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0_np_indices = np.array([0, 2], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np[grads0_np_indices]), + tf.constant(grads0_np_indices), + tf.constant([3]), + ) + grads1_np_indices = np.array([0, 2], dtype=np.int32) + grads1 = tf.IndexedSlices( + tf.constant(grads1_np[grads1_np_indices]), + tf.constant(grads1_np_indices), + tf.constant([3]), + ) + learning_rate = 3.0 + ada_opt = adagrad.Adagrad(learning_rate) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1)) + + accum0_np = np.array( + [0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype + ) + accum1_np = np.array( + [0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype + ) + + # Run 3 step of sgd + for _ in range(3): + self.evaluate(ada_update) + + var0_np, accum0_np = sparse_adagrad_update_numpy( + var0_np, + accum0_np, + grads0_np_indices, + grads0_np[grads0_np_indices], + learning_rate, + ) + var1_np, accum1_np = sparse_adagrad_update_numpy( + var1_np, + accum1_np, + grads1_np_indices, + grads1_np[grads1_np_indices], + learning_rate, + ) + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testSparseSingleVarDim(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0_np = np.array([1.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + grads0_np_indices = np.array([0], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np[grads0_np_indices]), + tf.constant(grads0_np_indices), + tf.constant([3]), + ) + learning_rate = 3.0 + ada_opt = adagrad.Adagrad(learning_rate, epsilon=1.0) + ada_update = ada_opt.apply_gradients(zip([grads0], [var0])) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0], self.evaluate(var0)) + + accum0_np = np.array([0.1], dtype=dtype.as_numpy_dtype) + + # Run 3 step of sgd + for _ in range(3): + self.evaluate(ada_update) + + var0_np, accum0_np = sparse_adagrad_update_numpy( + var0_np, + accum0_np, + grads0_np_indices, + grads0_np[grads0_np_indices], + learning_rate, + epsilon=1.0, + ) + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + + def testSparseRepeatedIndices(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
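+ # (Two 0.1 gradients at index 1 should act like a single
+ # aggregated 0.2 gradient at index 1; the assertions keep both
+ # variables in lockstep across steps.)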
+ with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype) + + repeated_index_update_var = tf.Variable(var_np, dtype=dtype) + aggregated_update_var = tf.Variable(var_np, dtype=dtype) + grad_repeated_index = tf.IndexedSlices( + tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + tf.constant([1, 1]), + tf.constant([2, 1]), + ) + grad_aggregated = tf.IndexedSlices( + tf.constant([0.2], shape=[1, 1], dtype=dtype), + tf.constant([1]), + tf.constant([2, 1]), + ) + repeated_update = adagrad.Adagrad(3.0).apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + aggregated_update = adagrad.Adagrad(3.0).apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose( + self.evaluate(aggregated_update_var), + self.evaluate(repeated_index_update_var), + ) + for _ in range(3): + self.evaluate(repeated_update) + self.evaluate(aggregated_update) + self.assertAllClose( + self.evaluate(aggregated_update_var), + self.evaluate(repeated_index_update_var), + ) + + def testSparseRepeatedIndicesByEmbeddingLookUp(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var_repeated = tf.Variable([1.0, 2.0], dtype=dtype) + loss_repeated = lambda: tf.reduce_sum( + tf.compat.v1.nn.embedding_lookup(var_repeated, [0, 0]) + ) + var_aggregated = tf.Variable([1.0, 2.0], dtype=dtype) + loss_aggregated = lambda: 2 * tf.reduce_sum( + tf.compat.v1.nn.embedding_lookup(var_aggregated, [0]) + ) + update_op_repeated = adagrad.Adagrad(2.0).minimize( + loss_repeated, var_list=[var_repeated] + ) + update_op_aggregated = adagrad.Adagrad(2.0).minimize( + loss_aggregated, var_list=[var_aggregated] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllCloseAccordingToType( + self.evaluate(var_repeated), self.evaluate(var_aggregated) + ) + for _ in range(3): + self.evaluate(update_op_repeated) + self.evaluate(update_op_aggregated) + self.assertAllCloseAccordingToType( + self.evaluate(var_repeated), + self.evaluate(var_aggregated), + ) + + def testSparseStability(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in [tf.half]: + shape = [1, 6] + var0_np = np.array( + [ + [ + 0.00872496, + -0.106952, + 0.110467, + 0.226505, + -0.0147257, + -0.0105945, + ] + ], + dtype=dtype.as_numpy_dtype, + ) + var0 = tf.Variable(var0_np) + grads0_np = np.array( + [ + [ + -5.91278e-05, + 5.31673e-05, + -2.5779e-06, + 4.29153e-05, + -8.4877e-05, + -9.48906e-05, + ] + ], + dtype=dtype.as_numpy_dtype, + ) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np), tf.constant([0]), tf.constant(shape) + ) + ada_opt = adagrad.Adagrad(1.0) + ada_update = ada_opt.apply_gradients(zip([grads0], [var0])) + slot0 = ada_opt.get_slot(var0, "accumulator") + init = tf.compat.v1.global_variables_initializer() + for _ in range(100): + self.evaluate(init) + self.evaluate(ada_update) + self.assertAllCloseAccordingToType( + np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), + self.evaluate(slot0), + ) + self.assertAllCloseAccordingToType( + np.array( + [ + [ + 0.00891194, + -0.10712013, + 0.11047515, + 0.22636929, + -0.0144573, + -0.01029443, + ] + ] + ), + self.evaluate(var0), + ) + + def testSharing(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
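+ # (Both apply_gradients calls below come from one optimizer
+ # instance, so the two update ops share the same "accumulator"
+ # slots.)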
+ with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 3.0 + ada_opt = adagrad.Adagrad(learning_rate) + # Apply the optimizer twice. Both applications will use + # the same accums. + ada_update1 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + ada_update2 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEqual(slot0.shape, var0.shape) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEqual(slot1.shape, var1.shape) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + # Mix the first and the second adagrad for 3 steps. + self.evaluate(ada_update1) + self.evaluate(ada_update2) + self.evaluate(ada_update1) + + accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + for _ in range(3): + var0_np, accum0_np = adagrad_update_numpy( + var0_np, accum0_np, grads0_np, learning_rate + ) + var1_np, accum1_np = adagrad_update_numpy( + var1_np, accum1_np, grads1_np, learning_rate + ) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + + def testConstructAdagradWithLR(self): + opt = adagrad.Adagrad(lr=1.0) + opt_2 = adagrad.Adagrad(learning_rate=0.1, lr=1.0) + opt_3 = adagrad.Adagrad(learning_rate=0.1) + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt_2.lr, tf.Variable) + self.assertIsInstance(opt_3.lr, tf.Variable) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/adam.py b/keras/optimizers/legacy/adam.py index 338470721b7f..fecc337c4c52 100644 --- a/keras/optimizers/legacy/adam.py +++ b/keras/optimizers/legacy/adam.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,515 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Legacy Adam optimizer implementation.""" +"""Adam optimizer implementation.""" -from keras.optimizers.optimizer_v2 import adam +import tensorflow.compat.v2 as tf +from keras import backend_config +from keras.optimizers.legacy import optimizer_v2 + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.Adam') -class Adam(adam.Adam): - pass +@keras_export( + "keras.optimizers.legacy.Adam", + v1=["keras.optimizers.Adam", "keras.optimizers.legacy.Adam"], +) +class Adam(optimizer_v2.OptimizerV2): + r"""Optimizer that implements the Adam algorithm. + + Adam optimization is a stochastic gradient descent method that is based on + adaptive estimation of first-order and second-order moments. + + According to + [Kingma et al., 2014](http://arxiv.org/abs/1412.6980), + the method is "*computationally + efficient, has little memory requirement, invariant to diagonal rescaling of + gradients, and is well suited for problems that are large in terms of + data/parameters*". + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use, The + learning rate. Defaults to `0.001`. + beta_1: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + exponential decay rate for the 1st moment estimates. Defaults to `0.9`. + beta_2: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use, The + exponential decay rate for the 2nd moment estimates. Defaults to + `0.999`. + epsilon: A small constant for numerical stability. This epsilon is + "epsilon hat" in the Kingma and Ba paper (in the formula just before + Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to + `1e-7`. + amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from + the paper "On the Convergence of Adam and beyond". Defaults to `False`. + name: Optional name for the operations created when applying gradients. + Defaults to `"Adam"`. + **kwargs: keyword arguments. Allowed arguments are `clipvalue`, + `clipnorm`, `global_clipnorm`. + If `clipvalue` (float) is set, the gradient of each weight + is clipped to be no higher than this value. + If `clipnorm` (float) is set, the gradient of each weight + is individually clipped so that its norm is no higher than this value. + If `global_clipnorm` (float) is set the gradient of all weights is + clipped so that their global norm is no higher than this value. + + Usage: + + >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1) + >>> var1 = tf.Variable(10.0) + >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) == var1 + >>> step_count = opt.minimize(loss, [var1]).numpy() + >>> # The first step is `-learning_rate*sign(grad)` + >>> var1.numpy() + 9.9 + + Reference: + - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) + - [Reddi et al., 2018]( + https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`. + + Notes: + + The default value of 1e-7 for epsilon might not be a good default in + general. For example, when training an Inception network on ImageNet a + current good choice is 1.0 or 0.1. 
Note that since Adam uses the + formulation just before Section 2.1 of the Kingma and Ba paper rather than + the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon + hat" in the paper. + + The sparse implementation of this algorithm (used when the gradient is an + IndexedSlices object, typically because of `tf.gather` or an embedding + lookup in the forward pass) does apply momentum to variable slices even if + they were not used in the forward pass (meaning they have a gradient equal + to zero). Momentum decay (beta1) is also applied to the entire momentum + accumulator. This means that the sparse behavior is equivalent to the dense + behavior (in contrast to some momentum implementations which ignore momentum + unless a variable slice was actually used). + """ + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + name="Adam", + **kwargs + ): + super().__init__(name, **kwargs) + self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) + self._set_hyper("decay", self._initial_decay) + self._set_hyper("beta_1", beta_1) + self._set_hyper("beta_2", beta_2) + self.epsilon = epsilon or backend_config.epsilon() + self.amsgrad = amsgrad + + def _create_slots(self, var_list): + # Create slots for the first and second moments. + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var, "m") + for var in var_list: + self.add_slot(var, "v") + if self.amsgrad: + for var in var_list: + self.add_slot(var, "vhat") + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype)) + beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype)) + beta_1_power = tf.pow(beta_1_t, local_step) + beta_2_power = tf.pow(beta_2_t, local_step) + lr = apply_state[(var_device, var_dtype)]["lr_t"] * ( + tf.sqrt(1 - beta_2_power) / (1 - beta_1_power) + ) + apply_state[(var_device, var_dtype)].update( + dict( + lr=lr, + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + beta_1_power=beta_1_power, + one_minus_beta_1_t=1 - beta_1_t, + beta_2_t=beta_2_t, + beta_2_power=beta_2_power, + one_minus_beta_2_t=1 - beta_2_t, + ) + ) + + def set_weights(self, weights): + params = self.weights + # If the weights are generated by Keras V1 optimizer, it includes vhats + # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2 + # optimizer has 2x + 1 variables. Filter vhats out for compatibility. 
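+ # E.g. with N variables, V2 holds 2N + 1 weights (iterations, m, v)
+ # while V1 shipped 3N + 1 (an extra vhat per variable): for N=3,
+ # 10 incoming weights are trimmed to the first 7.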
+ num_vars = int((len(params) - 1) / 2) + if len(weights) == 3 * num_vars + 1: + weights = weights[: len(params)] + super().set_weights(weights) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + + if not self.amsgrad: + return tf.raw_ops.ResourceApplyAdam( + var=var.handle, + m=m.handle, + v=v.handle, + beta1_power=coefficients["beta_1_power"], + beta2_power=coefficients["beta_2_power"], + lr=coefficients["lr_t"], + beta1=coefficients["beta_1_t"], + beta2=coefficients["beta_2_t"], + epsilon=coefficients["epsilon"], + grad=grad, + use_locking=self._use_locking, + ) + else: + vhat = self.get_slot(var, "vhat") + return tf.raw_ops.ResourceApplyAdamWithAmsgrad( + var=var.handle, + m=m.handle, + v=v.handle, + vhat=vhat.handle, + beta1_power=coefficients["beta_1_power"], + beta2_power=coefficients["beta_2_power"], + lr=coefficients["lr_t"], + beta1=coefficients["beta_1_t"], + beta2=coefficients["beta_2_t"], + epsilon=coefficients["epsilon"], + grad=grad, + use_locking=self._use_locking, + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, "m") + m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"] + m_t = tf.compat.v1.assign( + m, m * coefficients["beta_1_t"], use_locking=self._use_locking + ) + with tf.control_dependencies([m_t]): + m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) + + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v = self.get_slot(var, "v") + v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"] + v_t = tf.compat.v1.assign( + v, v * coefficients["beta_2_t"], use_locking=self._use_locking + ) + with tf.control_dependencies([v_t]): + v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) + + if not self.amsgrad: + v_sqrt = tf.sqrt(v_t) + var_update = tf.compat.v1.assign_sub( + var, + coefficients["lr"] * m_t / (v_sqrt + coefficients["epsilon"]), + use_locking=self._use_locking, + ) + return tf.group(*[var_update, m_t, v_t]) + else: + v_hat = self.get_slot(var, "vhat") + v_hat_t = tf.maximum(v_hat, v_t) + with tf.control_dependencies([v_hat_t]): + v_hat_t = tf.compat.v1.assign( + v_hat, v_hat_t, use_locking=self._use_locking + ) + v_hat_sqrt = tf.sqrt(v_hat_t) + var_update = tf.compat.v1.assign_sub( + var, + coefficients["lr"] + * m_t + / (v_hat_sqrt + coefficients["epsilon"]), + use_locking=self._use_locking, + ) + return tf.group(*[var_update, m_t, v_t, v_hat_t]) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "beta_1": self._serialize_hyperparameter("beta_1"), + "beta_2": self._serialize_hyperparameter("beta_2"), + "epsilon": self.epsilon, + "amsgrad": self.amsgrad, + } + ) + return config + + +class NonFusedAdam(optimizer_v2.OptimizerV2): + r"""Optimizer that implements the Adam algorithm without fused kernels. + + Adam optimization is a stochastic gradient descent method that is based on + adaptive estimation of first-order and second-order moments. 
+ According to the paper + [Adam: A Method for Stochastic Optimization. Kingma et al., + 2014](http://arxiv.org/abs/1412.6980), the method is "*computationally + efficient, has little memory requirement, invariant to diagonal rescaling of + gradients, and is well suited for problems that are large in terms of + data/parameters*". + + For AMSGrad see [On The Convergence Of Adam And Beyond. + Reddi et al., 5-8](https://openreview.net/pdf?id=ryQu7f-RZ). + + **If amsgrad = False**: + + initialize $m_0$ as 1st moment vector + initialize $v_0$ as 2nd moment vector + + The update rule for $\theta$ with gradient $g$ uses an optimization + described at the end of section 2 of the paper: + + $$lr_t = \mathrm{learning\_rate} * + \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$ + $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$ + $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$ + $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ + + **If amsgrad = True**: + + initialize $m_0$ as 1st moment vector + initialize $v_0$ as 2nd moment vector + initialize $\hat{v}_0$ as 2nd moment vector + + The update rule for $\theta$ with gradient $g$ uses an optimization + described at the end of section 2 of the paper: + + $$lr_t = \mathrm{learning\_rate} * + \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$ + + $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$ + $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$ + $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$ + $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$ + + The default value of 1e-7 for epsilon might not be a good default in + general. For example, when training an Inception network on ImageNet a + current good choice is 1.0 or 0.1. Note that since Adam uses the + formulation just before Section 2.1 of the Kingma and Ba paper rather than + the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon + hat" in the paper. + + The sparse implementation of this algorithm (used when the gradient is an + IndexedSlices object, typically because of `tf.gather` or an embedding + lookup in the forward pass) does apply momentum to variable slices even if + they were not used in the forward pass (meaning they have a gradient equal + to zero). Momentum decay (beta1) is also applied to the entire momentum + accumulator. This means that the sparse behavior is equivalent to the dense + behavior (in contrast to some momentum implementations which ignore momentum + unless a variable slice was actually used). + + Usage: + + >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1) + >>> var1 = tf.Variable(10.0) + >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) == var1 + >>> step_count = opt.minimize(loss, [var1]).numpy() + >>> # The first step is `-learning_rate*sign(grad)` + >>> var1.numpy() + 9.9 + """ + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + name="Adam", + **kwargs + ): + """Construct a new Adam optimizer. + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is + a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a + callable that takes no arguments and returns the actual value to + use, The learning rate. Defaults to `0.001`. + beta_1: A float value or a constant float tensor, or a callable that + takes no arguments and returns the actual value to use. The + exponential decay rate for the 1st moment estimates. Defaults to + `0.9`. 
+ beta_2: A float value or a constant float tensor, or a callable that + takes no arguments and returns the actual value to use, The + exponential decay rate for the 2nd moment estimates. Defaults to + `0.999`. + epsilon: A small constant for numerical stability. This epsilon is + "epsilon hat" in the Kingma and Ba paper (in the formula just before + Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults + to `1e-7`. + amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm + from the paper "On the Convergence of Adam and beyond". Defaults to + `False`. + name: Optional name for the operations created when applying + gradients. Defaults to "Adam". + **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, + `lr`, `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is + clip gradients by value, `decay` is included for backward + compatibility to allow time inverse decay of learning rate. `lr` is + included for backward compatibility, recommended to use + `learning_rate` instead. + """ + + super().__init__(name, **kwargs) + self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) + self._set_hyper("decay", self._initial_decay) + self._set_hyper("beta_1", beta_1) + self._set_hyper("beta_2", beta_2) + self.epsilon = epsilon or backend_config.epsilon() + self.amsgrad = amsgrad + + def _create_slots(self, var_list): + # Create slots for the first and second moments. + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var, "m") + for var in var_list: + self.add_slot(var, "v") + if self.amsgrad: + for var in var_list: + self.add_slot(var, "vhat") + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype)) + beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype)) + beta_1_power = tf.pow(beta_1_t, local_step) + beta_2_power = tf.pow(beta_2_t, local_step) + lr = apply_state[(var_device, var_dtype)]["lr_t"] * ( + tf.sqrt(1 - beta_2_power) / (1 - beta_1_power) + ) + apply_state[(var_device, var_dtype)].update( + dict( + lr=lr, + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + beta_1_power=beta_1_power, + one_minus_beta_1_t=1 - beta_1_t, + beta_2_t=beta_2_t, + beta_2_power=beta_2_power, + one_minus_beta_2_t=1 - beta_2_t, + ) + ) + + def set_weights(self, weights): + params = self.weights + # If the weights are generated by Keras V1 optimizer, it includes vhats + # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2 + # optimizer has 2x + 1 variables. Filter vhats out for compatibility. 
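+ # (Same V1-compatibility trimming as in `Adam.set_weights` above.)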
+ num_vars = int((len(params) - 1) / 2) + if len(weights) == 3 * num_vars + 1: + weights = weights[: len(params)] + super().set_weights(weights) + + @tf.function(jit_compile=True) + def _resource_apply_dense_impl(self, grad, var, apply_state): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + + alpha = ( + coefficients["lr_t"] + * tf.sqrt(1 - coefficients["beta_2_power"]) + / (1 - coefficients["beta_1_power"]) + ) + m.assign_add((grad - m) * (1 - coefficients["beta_1_t"])) + v.assign_add((tf.square(grad) - v) * (1 - coefficients["beta_2_t"])) + if self.amsgrad: + vhat = self.get_slot(var, "vhat") + vhat.assign(tf.maximum(vhat, v)) + v = vhat + var.assign_sub((m * alpha) / (tf.sqrt(v) + coefficients["epsilon"])) + + def _resource_apply_dense(self, grad, var, apply_state=None): + self._resource_apply_dense_impl(grad, var, apply_state) + if not tf.executing_eagerly(): + return tf.compat.v1.get_default_graph().get_operations()[-1] + + @tf.function(jit_compile=True) + def _resource_apply_sparse_impl(self, grad, var, indices, apply_state): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, "m") + m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"] + m.assign(m * coefficients["beta_1_t"]) + m.scatter_add(tf.IndexedSlices(m_scaled_g_values, indices)) + + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v = self.get_slot(var, "v") + v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"] + v.assign(v * coefficients["beta_2_t"]) + v.scatter_add(tf.IndexedSlices(v_scaled_g_values, indices)) + + if not self.amsgrad: + var.assign_sub( + coefficients["lr"] * m / (tf.sqrt(v) + coefficients["epsilon"]) + ) + else: + v_hat = self.get_slot(var, "vhat") + v_hat.assign(tf.maximum(v_hat, v)) + var.assign_sub( + coefficients["lr"] + * m + / (tf.sqrt(v_hat) + coefficients["epsilon"]) + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + self._resource_apply_sparse_impl(grad, var, indices, apply_state) + if not tf.executing_eagerly(): + return tf.compat.v1.get_default_graph().get_operations()[-1] + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "beta_1": self._serialize_hyperparameter("beta_1"), + "beta_2": self._serialize_hyperparameter("beta_2"), + "epsilon": self.epsilon, + "amsgrad": self.amsgrad, + } + ) + return config diff --git a/keras/optimizers/legacy/adam_test.py b/keras/optimizers/legacy/adam_test.py new file mode 100644 index 000000000000..f796b5a98e69 --- /dev/null +++ b/keras/optimizers/legacy/adam_test.py @@ -0,0 +1,1196 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Adam.""" + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.optimizers import optimizer_v1 +from keras.optimizers.legacy import adam +from keras.optimizers.schedules import learning_rate_schedule +from keras.testing_infra import test_combinations + + +def adam_update_numpy( + param, g_t, t, m, v, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7 +): + lr_t = lr * np.sqrt(1 - beta2 ** (t + 1)) / (1 - beta1 ** (t + 1)) + + m_t = beta1 * m + (1 - beta1) * g_t + v_t = beta2 * v + (1 - beta2) * g_t * g_t + + param_t = param - lr_t * m_t / (np.sqrt(v_t) + epsilon) + return param_t, m_t, v_t + + +def adam_update_numpy_amsgrad( + param, g_t, t, m, v, vhat, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7 +): + lr_t = lr * np.sqrt(1 - beta2 ** (t + 1)) / (1 - beta1 ** (t + 1)) + + m_t = beta1 * m + (1 - beta1) * g_t + v_t = beta2 * v + (1 - beta2) * g_t * g_t + vhat_t = np.maximum(vhat, v_t) + + param_t = param - lr_t * m_t / (np.sqrt(vhat_t) + epsilon) + return param_t, m_t, v_t, vhat_t + + +def adam_sparse_update_numpy_amsgrad( + param, + indices, + g_t, + t, + m, + v, + vhat, + lr=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-7, +): + m_t, v_t, vhat_t, param_t = ( + np.copy(m), + np.copy(v), + np.copy(vhat), + np.copy(param), + ) + lr_t = lr * np.sqrt(1 - beta2 ** (t + 1)) / (1 - beta1 ** (t + 1)) + m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t + v_t_slice = beta2 * v[indices] + (1 - beta2) * g_t * g_t + m_t[indices] = m_t_slice + v_t[indices] = v_t_slice + v_hat_t = np.maximum(vhat_t, v_t) + v_hat_t_slice = v_hat_t[indices] + param_t_slice = param[indices] - ( + lr_t * (m_t_slice / (np.sqrt(v_hat_t_slice) + epsilon)) + ) + param_t[indices] = param_t_slice + return param_t, m_t, v_t, vhat_t + + +def get_beta_accumulators(opt, dtype): + local_step = tf.cast(opt.iterations + 1, dtype) + beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype) + beta_1_power = tf.pow(beta_1_t, local_step) + beta_2_t = tf.cast(opt._get_hyper("beta_2"), dtype) + beta_2_power = tf.pow(beta_2_t, local_step) + return (beta_1_power, beta_2_power) + + +class AdamOptimizerTest(tf.test.TestCase, parameterized.TestCase): + def testSparse(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
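+ # (The scalar m/v states broadcast against the parameter
+ # arrays inside the numpy reference updates.)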
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array( + [0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype + ) + var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array( + [0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype + ) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0_np_indices = np.array([0, 2], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np[grads0_np_indices]), + tf.constant(grads0_np_indices), + tf.constant([3]), + ) + grads1_np_indices = np.array([0, 2], dtype=np.int32) + grads1 = tf.IndexedSlices( + tf.constant(grads1_np[grads1_np_indices]), + tf.constant(grads1_np_indices), + tf.constant([3]), + ) + opt = adam.Adam() + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1)) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + # Run 3 steps of Adam + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + update.run() + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testSparseDevicePlacement(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for index_dtype in [tf.int32, tf.int64]: + with tf.Graph().as_default(), self.cached_session( + force_gpu=tf.test.is_gpu_available() + ): + # If a GPU is available, tests that all optimizer ops can be + # placed on it (i.e. they have GPU kernels). + var = tf.Variable([[1.0], [2.0]]) + indices = tf.constant([0, 1], dtype=index_dtype) + g_sum = lambda: tf.reduce_sum(tf.gather(var, indices)) + optimizer = adam.Adam(3.0) + minimize_op = optimizer.minimize(g_sum, var_list=[var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + minimize_op.run() + + def testSparseRepeatedIndices(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
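+ # (Adam's sparse path decays m and v for every slot, so duplicate
+ # indices and their pre-aggregated equivalent must trace identical
+ # trajectories.)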
+ for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + repeated_index_update_var = tf.Variable( + [[1.0], [2.0]], dtype=dtype + ) + aggregated_update_var = tf.Variable([[1.0], [2.0]], dtype=dtype) + grad_repeated_index = tf.IndexedSlices( + tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + tf.constant([1, 1]), + tf.constant([2, 1]), + ) + grad_aggregated = tf.IndexedSlices( + tf.constant([0.2], shape=[1, 1], dtype=dtype), + tf.constant([1]), + tf.constant([2, 1]), + ) + repeated_update = adam.Adam().apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + aggregated_update = adam.Adam().apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose( + aggregated_update_var, + self.evaluate(repeated_index_update_var), + ) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose( + aggregated_update_var, + self.evaluate(repeated_index_update_var), + ) + + def doTestBasic(self, use_callable_params=False): + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = lambda: 0.001 + beta1 = lambda: 0.9 + beta2 = lambda: 0.999 + epsilon = lambda: 1e-8 + if not use_callable_params: + learning_rate = learning_rate() + beta1 = beta1() + beta2 = beta2() + epsilon = epsilon() + + opt = adam.Adam(learning_rate=learning_rate) + if not tf.executing_eagerly(): + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of Adam + for t in range(3): + beta_1_power, beta_2_power = get_beta_accumulators( + opt, dtype + ) + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + if not tf.executing_eagerly(): + self.evaluate(update) + else: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testResourceBasic(self): + self.doTestBasic() + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testBasicCallableParams(self): + self.doTestBasic(use_callable_params=True) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicWithAmsgrad(self): + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with self.cached_session(): + # Initialize variables for numpy implementation. 
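+ # (With amsgrad=True the reference keeps vhat = max(vhat, v)
+ # and divides by sqrt(vhat) rather than sqrt(v).)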
+ m0, v0, v0hat, m1, v1, v1hat = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + opt = adam.Adam(amsgrad=True) + if not tf.executing_eagerly(): + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of Adam + for t in range(3): + beta_1_power, beta_2_power = get_beta_accumulators( + opt, dtype + ) + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + if not tf.executing_eagerly(): + self.evaluate(update) + else: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0, v0hat = adam_update_numpy_amsgrad( + var0_np, grads0_np, t, m0, v0, v0hat + ) + var1_np, m1, v1, v1hat = adam_update_numpy_amsgrad( + var1_np, grads1_np, t, m1, v1, v1hat + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testSparseWithAmsgrad(self): + # dtypes.half does not work on gpu + eager. + for dtype in [tf.float32, tf.float64]: + with self.cached_session(): + m0 = np.array([[0.0], [0.0]]) + v0 = np.array([[0.0], [0.0]]) + v0hat = np.array([[0.0], [0.0]]) + indices_np = np.array([1]) + indices = tf.constant(indices_np, dtype=tf.int32) + var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype) + repeated_index_update_var = tf.Variable(var0_np, dtype=dtype) + aggregated_update_var = tf.Variable(var0_np, dtype=dtype) + grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype) + grad_repeated_index = tf.IndexedSlices( + tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + tf.constant([1, 1]), + tf.constant([2, 1]), + ) + grad_aggregated = tf.IndexedSlices( + grads0_np, indices, tf.constant([2, 1]) + ) + opt_repeated = adam.Adam(amsgrad=True) + opt_aggregated = adam.Adam(amsgrad=True) + if not tf.executing_eagerly(): + repeated_update = opt_repeated.apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + aggregated_update = opt_aggregated.apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose( + self.evaluate(aggregated_update_var), + self.evaluate(repeated_index_update_var), + ) + for t in range(3): + if not tf.executing_eagerly(): + self.evaluate(repeated_update) + self.evaluate(aggregated_update) + else: + opt_repeated.apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + opt_aggregated.apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + + var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad( + var0_np, indices_np, grads0_np, t, m0, v0, v0hat + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(aggregated_update_var) + ) + self.assertAllCloseAccordingToType( + self.evaluate(aggregated_update_var), + self.evaluate(repeated_index_update_var), + ) + + def 
testBasicWithLearningRateDecay(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 0.001 + beta_1 = 0.9 + beta_2 = 0.999 + epsilon = 1e-7 + decay = 0.5 + + opt = adam.Adam( + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + decay=decay, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of Adam + for t in range(3): + self.evaluate(update) + lr_np = learning_rate / (1 + decay * t) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0, lr=lr_np + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1, lr=lr_np + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testBasicWithLearningRateInverseTimeDecay(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 0.001 + decay = 0.5 + lr_schedule = learning_rate_schedule.InverseTimeDecay( + learning_rate, decay_steps=1.0, decay_rate=decay + ) + beta_1 = 0.9 + beta_2 = 0.999 + epsilon = 1e-7 + + opt = adam.Adam( + learning_rate=lr_schedule, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of Adam + for t in range(3): + self.evaluate(update) + + lr_np = learning_rate / (1 + decay * t) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0, lr=lr_np + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1, lr=lr_np + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testTensorLearningRate(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
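+ # (Passing the learning rate as a tf.constant exercises the
+ # tensor-hyperparameter path instead of a Python float.)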
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = adam.Adam(tf.constant(0.001)) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + # Run 3 steps of Adam + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + update.run() + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testSharing(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = adam.Adam() + update1 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + update2 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of intertwined Adam1 and Adam2. + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + if t % 2 == 0: + update1.run() + else: + update2.run() + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testSlotsUniqueEager(self): + v1 = tf.Variable(1.0) + v2 = tf.Variable(1.0) + opt = adam.Adam(1.0) + opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) + # There should be iteration, and two unique slot variables for v1 and + # v2. 
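+ # (5 = the iterations counter plus one "m" and one "v" slot for
+ # each of the two variables.)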
+ self.assertLen(set(v.ref() for v in opt.variables()), 5) + self.assertEqual( + self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations) + ) + + def testSetWeightsFromV1AdamWithoutMinimize(self): + keras_v1_adam = optimizer_v1.Adam() + keras_v2_adam = adam.Adam() + keras_v2_adam.set_weights(keras_v1_adam.get_weights()) + keras_v1_iteration = keras_v1_adam.iterations + keras_v2_iteration = keras_v2_adam.iterations + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual( + self.evaluate(keras_v1_iteration), self.evaluate(keras_v2_iteration) + ) + + def testConstructAdamWithLR(self): + opt = adam.Adam(lr=1.0) + opt_2 = adam.Adam(learning_rate=0.1, lr=1.0) + opt_3 = adam.Adam(learning_rate=0.1) + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt_2.lr, tf.Variable) + self.assertIsInstance(opt_3.lr, tf.Variable) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) + + +class NonFusedAdamOptimizerTest(tf.test.TestCase, parameterized.TestCase): + def testSparse(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array( + [0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype + ) + var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array( + [0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype + ) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0_np_indices = np.array([0, 2], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np[grads0_np_indices]), + tf.constant(grads0_np_indices), + tf.constant([3]), + ) + grads1_np_indices = np.array([0, 2], dtype=np.int32) + grads1 = tf.IndexedSlices( + tf.constant(grads1_np[grads1_np_indices]), + tf.constant(grads1_np_indices), + tf.constant([3]), + ) + opt = adam.NonFusedAdam() + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1)) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + # Run 3 steps of NonFusedAdam + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + update.run() + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testSparseDevicePlacement(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for index_dtype in [tf.int32, tf.int64]: + with tf.Graph().as_default(), self.cached_session( + force_gpu=tf.test.is_gpu_available() + ): + # If a GPU is available, tests that all optimizer ops can be + # placed on it (i.e. they have GPU kernels). 
+ var = tf.Variable([[1.0], [2.0]]) + indices = tf.constant([0, 1], dtype=index_dtype) + g_sum = lambda: tf.reduce_sum(tf.gather(var, indices)) + optimizer = adam.NonFusedAdam(3.0) + minimize_op = optimizer.minimize(g_sum, var_list=[var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + minimize_op.run() + + def testSparseRepeatedIndices(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + repeated_index_update_var = tf.Variable( + [[1.0], [2.0]], dtype=dtype + ) + aggregated_update_var = tf.Variable([[1.0], [2.0]], dtype=dtype) + grad_repeated_index = tf.IndexedSlices( + tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + tf.constant([1, 1]), + tf.constant([2, 1]), + ) + grad_aggregated = tf.IndexedSlices( + tf.constant([0.2], shape=[1, 1], dtype=dtype), + tf.constant([1]), + tf.constant([2, 1]), + ) + repeated_update = adam.NonFusedAdam().apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + aggregated_update = adam.NonFusedAdam().apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose( + aggregated_update_var, + self.evaluate(repeated_index_update_var), + ) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose( + aggregated_update_var, + self.evaluate(repeated_index_update_var), + ) + + def doTestBasic(self, use_callable_params=False): + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = lambda: 0.001 + beta1 = lambda: 0.9 + beta2 = lambda: 0.999 + epsilon = lambda: 1e-8 + if not use_callable_params: + learning_rate = learning_rate() + beta1 = beta1() + beta2 = beta2() + epsilon = epsilon() + + opt = adam.NonFusedAdam(learning_rate=learning_rate) + if not tf.executing_eagerly(): + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of NonFusedAdam + for t in range(3): + beta_1_power, beta_2_power = get_beta_accumulators( + opt, dtype + ) + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + if not tf.executing_eagerly(): + self.evaluate(update) + else: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0), rtol=1e-4, atol=1e-4 + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1), rtol=1e-4, atol=1e-4 + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testResourceBasic(self): + self.doTestBasic() + + 
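The callable-hyperparameter path (`doTestBasic(use_callable_params=True)`, driven by `testBasicCallableParams` just below) can be reproduced outside the test harness. A minimal standalone sketch, not part of this diff, assuming a TF build where the `tf.keras.optimizers.legacy` namespace introduced by this change is available:

```python
import tensorflow as tf

# Zero-argument callables are resolved each time gradients are applied,
# which is what the callable-params test below verifies.
lr = lambda: 0.001
opt = tf.keras.optimizers.legacy.Adam(learning_rate=lr)

var = tf.Variable(2.0)
opt.minimize(lambda: var * var, var_list=[var])  # one eager Adam step
print(var.numpy())  # slightly below 2.0 after the step
```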
@test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testBasicCallableParams(self): + self.doTestBasic(use_callable_params=True) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicWithAmsgrad(self): + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, v0hat, m1, v1, v1hat = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + opt = adam.NonFusedAdam(amsgrad=True) + if not tf.executing_eagerly(): + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of NonFusedAdam + for t in range(3): + beta_1_power, beta_2_power = get_beta_accumulators( + opt, dtype + ) + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + if not tf.executing_eagerly(): + self.evaluate(update) + else: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0, v0hat = adam_update_numpy_amsgrad( + var0_np, grads0_np, t, m0, v0, v0hat + ) + var1_np, m1, v1, v1hat = adam_update_numpy_amsgrad( + var1_np, grads1_np, t, m1, v1, v1hat + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0), rtol=1e-4, atol=1e-4 + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1), rtol=1e-4, atol=1e-4 + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testSparseWithAmsgrad(self): + # dtypes.half does not work on gpu + eager. 
+ for dtype in [tf.float32, tf.float64]: + with self.cached_session(): + m0 = np.array([[0.0], [0.0]]) + v0 = np.array([[0.0], [0.0]]) + v0hat = np.array([[0.0], [0.0]]) + indices_np = np.array([1]) + indices = tf.constant(indices_np, dtype=tf.int32) + var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype) + repeated_index_update_var = tf.Variable(var0_np, dtype=dtype) + aggregated_update_var = tf.Variable(var0_np, dtype=dtype) + grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype) + grad_repeated_index = tf.IndexedSlices( + tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + tf.constant([1, 1]), + tf.constant([2, 1]), + ) + grad_aggregated = tf.IndexedSlices( + grads0_np, indices, tf.constant([2, 1]) + ) + opt_repeated = adam.NonFusedAdam(amsgrad=True) + opt_aggregated = adam.NonFusedAdam(amsgrad=True) + if not tf.executing_eagerly(): + repeated_update = opt_repeated.apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + aggregated_update = opt_aggregated.apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose( + self.evaluate(aggregated_update_var), + self.evaluate(repeated_index_update_var), + ) + for t in range(3): + if not tf.executing_eagerly(): + self.evaluate(repeated_update) + self.evaluate(aggregated_update) + else: + opt_repeated.apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + opt_aggregated.apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + + var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad( + var0_np, indices_np, grads0_np, t, m0, v0, v0hat + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(aggregated_update_var) + ) + self.assertAllCloseAccordingToType( + self.evaluate(aggregated_update_var), + self.evaluate(repeated_index_update_var), + ) + + def testBasicWithLearningRateDecay(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 0.001 + beta_1 = 0.9 + beta_2 = 0.999 + epsilon = 1e-7 + decay = 0.5 + + opt = adam.NonFusedAdam( + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + decay=decay, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of NonFusedAdam + for t in range(3): + self.evaluate(update) + lr_np = learning_rate / (1 + decay * t) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0, lr=lr_np + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1, lr=lr_np + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testBasicWithLearningRateInverseTimeDecay(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 0.001 + decay = 0.5 + lr_schedule = learning_rate_schedule.InverseTimeDecay( + learning_rate, decay_steps=1.0, decay_rate=decay + ) + beta_1 = 0.9 + beta_2 = 0.999 + epsilon = 1e-7 + + opt = adam.NonFusedAdam( + learning_rate=lr_schedule, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 3 steps of NonFusedAdam + for t in range(3): + self.evaluate(update) + + lr_np = learning_rate / (1 + decay * t) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0, lr=lr_np + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1, lr=lr_np + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testTensorLearningRate(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = adam.NonFusedAdam(tf.constant(0.001)) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + # Run 3 steps of NonFusedAdam + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + update.run() + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testSharing(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = adam.NonFusedAdam() + update1 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + update2 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of intertwined NonFusedAdam1 and NonFusedAdam2. + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), self.evaluate(beta_2_power) + ) + if t % 2 == 0: + update1.run() + else: + update2.run() + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/adamax.py b/keras/optimizers/legacy/adamax.py index 016a2f172578..f89690fadb7a 100644 --- a/keras/optimizers/legacy/adamax.py +++ b/keras/optimizers/legacy/adamax.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,190 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Legacy Adamax optimizer implementation.""" +"""Adamax optimizer implementation.""" -from keras.optimizers.optimizer_v2 import adamax +import tensorflow.compat.v2 as tf +from keras import backend_config +from keras.optimizers.legacy import optimizer_v2 + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.Adamax') -class Adamax(adamax.Adamax): - pass +@keras_export( + "keras.optimizers.legacy.Adamax", + v1=["keras.optimizers.Adamax", "keras.optimizers.legacy.Adamax"], +) +class Adamax(optimizer_v2.OptimizerV2): + """Optimizer that implements the Adamax algorithm. + + It is a variant of Adam based on the infinity norm. + Default parameters follow those provided in the paper. + Adamax is sometimes superior to Adam, especially in models with embeddings. + + Initialization: + + ```python + m = 0 # Initialize the 1st moment vector + v = 0 # Initialize the exponentially weighted infinity norm + t = 0 # Initialize timestep + ``` + + The update rule for parameter `w` with gradient `g` is + described at the end of section 7.1 of the paper: + + ```python + t += 1 + m = beta1 * m + (1 - beta1) * g + v = max(beta2 * v, abs(g)) + current_lr = learning_rate / (1 - beta1 ** t) + w = w - current_lr * m / (v + epsilon) + ``` + + Similarly to `Adam`, the epsilon is added for numerical stability + (especially to get rid of division by zero when `v_t == 0`). + + In contrast to `Adam`, the sparse implementation of this algorithm + (used when the gradient is an IndexedSlices object, typically because of + `tf.gather` or an embedding lookup in the forward pass) only updates + variable slices and corresponding `m_t`, `v_t` terms when that part of + the variable was used in the forward pass. This means that the sparse + behavior is in contrast to the dense behavior (similar to some momentum + implementations which ignore momentum unless a variable slice was actually + used). + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. + beta_1: A float value or a constant float tensor. The exponential decay + rate for the 1st moment estimates. + beta_2: A float value or a constant float tensor. The exponential decay + rate for the exponentially weighted infinity norm. + epsilon: A small constant for numerical stability. + name: Optional name for the operations created when applying gradients. + Defaults to `"Adamax"`. + **kwargs: keyword arguments. Allowed arguments are `clipvalue`, + `clipnorm`, `global_clipnorm`. + If `clipvalue` (float) is set, the gradient of each weight + is clipped to be no higher than this value. + If `clipnorm` (float) is set, the gradient of each weight + is individually clipped so that its norm is no higher than this value. + If `global_clipnorm` (float) is set, the gradient of all weights is + clipped so that their global norm is no higher than this value.
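+
+    Usage (a brief, illustrative sketch; the loop simply applies the
+    update rule shown above for three steps):
+
+    ```python
+    opt = tf.keras.optimizers.legacy.Adamax(learning_rate=0.001)
+    var1 = tf.Variable(10.0)
+    loss = lambda: (var1 ** 2) / 2.0  # d(loss) / d(var1) == var1
+    for _ in range(3):
+        opt.minimize(loss, var_list=[var1])
+    ```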
+ + Reference: + - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) + """ + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + name="Adamax", + **kwargs + ): + super().__init__(name, **kwargs) + self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) + self._set_hyper("decay", self._initial_decay) + self._set_hyper("beta_1", beta_1) + self._set_hyper("beta_2", beta_2) + self.epsilon = epsilon or backend_config.epsilon() + + def _create_slots(self, var_list): + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var, "m") # Create slots for the first moments. + for var in var_list: + self.add_slot(var, "v") # Create slots for the second moments. + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype)) + beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype)) + beta_1_power = tf.pow(beta_1_t, local_step) + lr_t = apply_state[(var_device, var_dtype)]["lr_t"] + + apply_state[(var_device, var_dtype)].update( + dict( + neg_scaled_lr=-lr_t / (1 - beta_1_power), + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + beta_1_power=beta_1_power, + one_minus_beta_1_t=1 - beta_1_t, + beta_2_t=beta_2_t, + zero=tf.zeros((), dtype=tf.int64), + ) + ) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + return tf.raw_ops.ResourceApplyAdaMax( + var=var.handle, + m=m.handle, + v=v.handle, + beta1_power=coefficients["beta_1_power"], + lr=coefficients["lr_t"], + beta1=coefficients["beta_1_t"], + beta2=coefficients["beta_2_t"], + epsilon=coefficients["epsilon"], + grad=grad, + use_locking=self._use_locking, + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, "m") + m_slice = tf.gather(m, indices, axis=coefficients["zero"]) + m_t_slice = ( + m_slice * coefficients["beta_1_t"] + + grad * coefficients["one_minus_beta_1_t"] + ) + with tf.control_dependencies([m_t_slice]): + m_t = self._resource_scatter_update(m, indices, m_t_slice) + + # u_t = max(beta2 * u, abs(g_t)) + v = self.get_slot(var, "v") + v_slice = tf.gather(v, indices, axis=coefficients["zero"]) + v_t_slice = tf.maximum(v_slice * coefficients["beta_2_t"], tf.abs(grad)) + with tf.control_dependencies([v_t_slice]): + v_t = self._resource_scatter_update(v, indices, v_t_slice) + # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t + var_slice = coefficients["neg_scaled_lr"] * ( + m_t_slice / (v_t_slice + coefficients["epsilon"]) + ) + with tf.control_dependencies([var_slice]): + var_update = self._resource_scatter_add(var, indices, var_slice) + return tf.group(*[var_update, m_t, v_t]) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + 
"beta_1": self._serialize_hyperparameter("beta_1"), + "beta_2": self._serialize_hyperparameter("beta_2"), + "epsilon": self.epsilon, + } + ) + return config diff --git a/keras/optimizers/legacy/adamax_test.py b/keras/optimizers/legacy/adamax_test.py new file mode 100644 index 000000000000..b0a921dc03b6 --- /dev/null +++ b/keras/optimizers/legacy/adamax_test.py @@ -0,0 +1,421 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Adamax.""" + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.optimizers.legacy import adamax +from keras.testing_infra import test_combinations + + +def adamax_update_numpy( + param, g_t, t, m, v, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8 +): + m_t = beta1 * m + (1 - beta1) * g_t + v_t = np.maximum(beta2 * v, np.abs(g_t)) + param_t = param - (alpha / (1 - beta1 ** (t + 1))) * (m_t / (v_t + epsilon)) + return param_t, m_t, v_t + + +def adamax_sparse_update_numpy( + param, + indices, + g_t, + t, + m, + v, + alpha=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, +): + m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param) + m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t + v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t)) + param_t_slice = param[indices] - ( + (alpha / (1 - beta1 ** (t + 1))) * (m_t_slice / (v_t_slice + epsilon)) + ) + m_t[indices] = m_t_slice + v_t[indices] = v_t_slice + param_t[indices] = param_t_slice + return param_t, m_t, v_t + + +def get_beta_accumulators(opt, dtype): + local_step = tf.cast(opt.iterations + 1, dtype) + beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype) + beta_1_power = tf.pow(beta_1_t, local_step) + return beta_1_power + + +class AdamaxOptimizerTest(tf.test.TestCase, parameterized.TestCase): + def testResourceSparse(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
+ zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype) + m0, v0, m1, v1 = ( + zero_slots(), + zero_slots(), + zero_slots(), + zero_slots(), + ) + var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + + grads0_np_indices = np.array([0, 1], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np), + tf.constant(grads0_np_indices), + tf.constant([3]), + ) + grads1_np_indices = np.array([2, 1], dtype=np.int32) + grads1 = tf.IndexedSlices( + tf.constant(grads1_np), + tf.constant(grads1_np_indices), + tf.constant([3]), + ) + opt = adamax.Adamax() + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0, 3.0], var0) + self.assertAllClose([4.0, 5.0, 6.0], var1) + + beta1_power = get_beta_accumulators(opt, dtype) + + # Run 3 steps of Adamax + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), beta1_power + ) + update.run() + + var0_np, m0, v0 = adamax_sparse_update_numpy( + var0_np, grads0_np_indices, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adamax_sparse_update_numpy( + var1_np, grads1_np_indices, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0) + self.assertAllCloseAccordingToType(var1_np, var1) + + def testSparseDevicePlacement(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for index_dtype in [tf.int32, tf.int64]: + with tf.Graph().as_default(), self.cached_session( + force_gpu=tf.test.is_gpu_available() + ): + # If a GPU is available, tests that all optimizer ops can be + # placed on it (i.e. they have GPU kernels). + var = tf.Variable([[1.0], [2.0]]) + indices = tf.constant([0, 1], dtype=index_dtype) + g_sum = lambda: tf.reduce_sum(tf.gather(var, indices)) + optimizer = adamax.Adamax(3.0) + minimize_op = optimizer.minimize(g_sum, var_list=[var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + minimize_op.run() + + def testSparseRepeatedIndices(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
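+        # Illustrative note: an IndexedSlices gradient that lists index 1
+        # twice with value 0.1 each must update the variable exactly as a
+        # single pre-aggregated gradient of 0.2 at index 1 would; the
+        # assertions below check that both variables stay equal after
+        # every step.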
+ for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + repeated_index_update_var = tf.Variable( + [[1.0], [2.0]], dtype=dtype + ) + aggregated_update_var = tf.Variable([[1.0], [2.0]], dtype=dtype) + grad_repeated_index = tf.IndexedSlices( + tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + tf.constant([1, 1]), + tf.constant([2, 1]), + ) + grad_aggregated = tf.IndexedSlices( + tf.constant([0.2], shape=[1, 1], dtype=dtype), + tf.constant([1]), + tf.constant([2, 1]), + ) + repeated_update = adamax.Adamax().apply_gradients( + [(grad_repeated_index, repeated_index_update_var)] + ) + aggregated_update = adamax.Adamax().apply_gradients( + [(grad_aggregated, aggregated_update_var)] + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose( + aggregated_update_var, repeated_index_update_var.eval() + ) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose( + aggregated_update_var, repeated_index_update_var.eval() + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasic(self): + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with self.session(graph=tf.Graph(), use_gpu=True): + # Initialize variables for numpy implementation. + m0 = np.array([0.0, 0.0]) + v0 = np.array([0.0, 0.0]) + m1 = np.array([0.0, 0.0]) + v1 = np.array([0.0, 0.0]) + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + opt = adamax.Adamax() + if not tf.executing_eagerly(): + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of Adamax + for t in range(3): + beta_1_power = get_beta_accumulators(opt, dtype) + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + if not tf.executing_eagerly(): + self.evaluate(update) + else: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0 = adamax_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adamax_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0), rtol=1e-2 + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1), rtol=1e-2 + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicWithLearningRateDecay(self): + for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): + with self.session(graph=tf.Graph(), use_gpu=True): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, name="var0_%d" % i) + var1 = tf.Variable(var1_np, name="var1_%d" % i) + + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + + learning_rate = 0.001 + decay = 0.002 + opt = adamax.Adamax(learning_rate=learning_rate, decay=decay) + if not tf.executing_eagerly(): + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of Adamax + for t in range(3): + beta_1_power = get_beta_accumulators(opt, dtype) + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), self.evaluate(beta_1_power) + ) + if not tf.executing_eagerly(): + self.evaluate(update) + else: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + lr = learning_rate / (1 + decay * t) + + var0_np, m0, v0 = adamax_update_numpy( + var0_np, grads0_np, t, m0, v0, alpha=lr + ) + var1_np, m1, v1 = adamax_update_numpy( + var1_np, grads1_np, t, m1, v1, alpha=lr + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0), rtol=1e-2 + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1), rtol=1e-2 + ) + + def testTensorLearningRate(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = adamax.Adamax(tf.constant(0.001)) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0) + self.assertAllClose([3.0, 4.0], var1) + + beta1_power = get_beta_accumulators(opt, dtype) + + # Run 3 steps of Adamax + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), beta1_power + ) + update.run() + + var0_np, m0, v0 = adamax_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adamax_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0) + self.assertAllCloseAccordingToType(var1_np, var1) + + def testSharing(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = adamax.Adamax() + update1 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + update2 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + beta1_power = get_beta_accumulators(opt, dtype) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0) + self.assertAllClose([3.0, 4.0], var1) + + # Run 3 steps of intertwined Adamax1 and Adamax2. + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), beta1_power + ) + if t % 2 == 0: + update1.run() + else: + update2.run() + + var0_np, m0, v0 = adamax_update_numpy( + var0_np, grads0_np, t, m0, v0 + ) + var1_np, m1, v1 = adamax_update_numpy( + var1_np, grads1_np, t, m1, v1 + ) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0) + self.assertAllCloseAccordingToType(var1_np, var1) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testSlotsUniqueEager(self): + v1 = tf.Variable(1.0) + v2 = tf.Variable(1.0) + opt = adamax.Adamax(1.0) + opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) + # There should be iteration, and two unique slot variables for v1 and + # v2. + self.assertLen({id(v) for v in opt.variables()}, 5) + + def testConstructAdamaxWithLR(self): + opt = adamax.Adamax(lr=1.0) + opt_2 = adamax.Adamax(learning_rate=0.1, lr=1.0) + opt_3 = adamax.Adamax(learning_rate=0.1) + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt_2.lr, tf.Variable) + self.assertIsInstance(opt_3.lr, tf.Variable) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/ftrl.py b/keras/optimizers/legacy/ftrl.py index e8469a504e3f..0e592b268743 100644 --- a/keras/optimizers/legacy/ftrl.py +++ b/keras/optimizers/legacy/ftrl.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,298 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Legacy Ftrl optimizer implementation.""" +"""Ftrl-proximal optimizer implementation.""" -from keras.optimizers.optimizer_v2 import ftrl +import tensorflow.compat.v2 as tf + +from keras.optimizers.legacy import optimizer_v2 + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.Ftrl') -class Ftrl(ftrl.Ftrl): - pass +@keras_export( + "keras.optimizers.legacy.Ftrl", + v1=["keras.optimizers.Ftrl", "keras.optimizers.legacy.Ftrl"], +) +class Ftrl(optimizer_v2.OptimizerV2): + r"""Optimizer that implements the FTRL algorithm. 
+ + "Follow The Regularized Leader" (FTRL) is an optimization algorithm + developed at Google for click-through rate prediction in the early 2010s. It + is most suitable for shallow models with large and sparse feature spaces. + The algorithm is described by + [McMahan et al., 2013](https://research.google.com/pubs/archive/41159.pdf). + The Keras version has support for both online L2 regularization + (the L2 regularization described in the paper + above) and shrinkage-type L2 regularization + (which is the addition of an L2 penalty to the loss function). + + Initialization: + + ```python + n = 0 + sigma = 0 + z = 0 + ``` + + Update rule for one variable `w`: + + ```python + prev_n = n + n = n + g ** 2 + sigma = (sqrt(n) - sqrt(prev_n)) / lr + z = z + g - sigma * w + if abs(z) < lambda_1: + w = 0 + else: + w = (sgn(z) * lambda_1 - z) / ((beta + sqrt(n)) / alpha + lambda_2) + ``` + + Notation: + + - `lr` is the learning rate + - `g` is the gradient for the variable + - `lambda_1` is the L1 regularization strength + - `lambda_2` is the L2 regularization strength + + Check the documentation for the `l2_shrinkage_regularization_strength` + parameter for more details when shrinkage is enabled, in which case gradient + is replaced with a gradient with shrinkage. + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. + learning_rate_power: A float value, must be less or equal to zero. + Controls how the learning rate decreases during training. Use zero for + a fixed learning rate. + initial_accumulator_value: The starting value for accumulators. + Only zero or positive values are allowed. + l1_regularization_strength: A float value, must be greater than or + equal to zero. Defaults to `0.0`. + l2_regularization_strength: A float value, must be greater than or + equal to zero. Defaults to `0.0`. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"Ftrl"`. + l2_shrinkage_regularization_strength: A float value, must be greater than + or equal to zero. This differs from L2 above in that the L2 above is a + stabilization penalty, whereas this L2 shrinkage is a magnitude penalty. + When input is sparse shrinkage will only happen on the active weights. + beta: A float value, representing the beta value from the paper. + Defaults to `0.0`. + **kwargs: keyword arguments. Allowed arguments are `clipvalue`, + `clipnorm`, `global_clipnorm`. + If `clipvalue` (float) is set, the gradient of each weight + is clipped to be no higher than this value. + If `clipnorm` (float) is set, the gradient of each weight + is individually clipped so that its norm is no higher than this value. + If `global_clipnorm` (float) is set the gradient of all weights is + clipped so that their global norm is no higher than this value. + + Reference: + - [McMahan et al., 2013]( + https://research.google.com/pubs/archive/41159.pdf) + """ + + def __init__( + self, + learning_rate=0.001, + learning_rate_power=-0.5, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + name="Ftrl", + l2_shrinkage_regularization_strength=0.0, + beta=0.0, + **kwargs, + ): + super().__init__(name, **kwargs) + + if initial_accumulator_value < 0.0: + raise ValueError( + "`initial_accumulator_value` needs to be " + "positive or zero. Received: " + f"initial_accumulator_value={initial_accumulator_value}." 
+ ) + if learning_rate_power > 0.0: + raise ValueError( + "`learning_rate_power` needs to be " + "negative or zero. Received: " + f"learning_rate_power={learning_rate_power}." + ) + if l1_regularization_strength < 0.0: + raise ValueError( + "`l1_regularization_strength` needs to be positive or zero. " + "Received: l1_regularization_strength=" + f"{l1_regularization_strength}." + ) + if l2_regularization_strength < 0.0: + raise ValueError( + "`l2_regularization_strength` needs to be positive or zero. " + "Received: l2_regularization_strength=" + f"{l2_regularization_strength}." + ) + if l2_shrinkage_regularization_strength < 0.0: + raise ValueError( + "`l2_shrinkage_regularization_strength` needs to be positive " + "or zero. Received: l2_shrinkage_regularization_strength" + f"={l2_shrinkage_regularization_strength}." + ) + + self._set_hyper("learning_rate", learning_rate) + self._set_hyper("decay", self._initial_decay) + self._set_hyper("learning_rate_power", learning_rate_power) + self._set_hyper( + "l1_regularization_strength", l1_regularization_strength + ) + self._set_hyper( + "l2_regularization_strength", l2_regularization_strength + ) + self._set_hyper("beta", beta) + self._initial_accumulator_value = initial_accumulator_value + self._l2_shrinkage_regularization_strength = ( + l2_shrinkage_regularization_strength + ) + + def _create_slots(self, var_list): + # Create the "accum" and "linear" slots. + for var in var_list: + dtype = var.dtype.base_dtype + init = tf.compat.v1.constant_initializer( + self._initial_accumulator_value, dtype=dtype + ) + self.add_slot(var, "accumulator", init) + self.add_slot(var, "linear") + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + apply_state[(var_device, var_dtype)].update( + dict( + learning_rate_power=tf.identity( + self._get_hyper("learning_rate_power", var_dtype) + ), + l1_regularization_strength=tf.identity( + self._get_hyper("l1_regularization_strength", var_dtype) + ), + l2_regularization_strength=tf.identity( + self._get_hyper("l2_regularization_strength", var_dtype) + ), + beta=tf.identity(self._get_hyper("beta", var_dtype)), + l2_shrinkage_regularization_strength=tf.cast( + self._l2_shrinkage_regularization_strength, var_dtype + ), + ) + ) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + # Adjust L2 regularization strength to include beta to avoid the + # underlying TensorFlow ops needing to include it. 
+ adjusted_l2_regularization_strength = coefficients[ + "l2_regularization_strength" + ] + coefficients["beta"] / (2.0 * coefficients["lr_t"]) + + accum = self.get_slot(var, "accumulator") + linear = self.get_slot(var, "linear") + + if self._l2_shrinkage_regularization_strength <= 0.0: + return tf.raw_ops.ResourceApplyFtrl( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + else: + return tf.raw_ops.ResourceApplyFtrlV2( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + l2_shrinkage=coefficients[ + "l2_shrinkage_regularization_strength" + ], + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + # Adjust L2 regularization strength to include beta to avoid the + # underlying TensorFlow ops needing to include it. + adjusted_l2_regularization_strength = coefficients[ + "l2_regularization_strength" + ] + coefficients["beta"] / (2.0 * coefficients["lr_t"]) + + accum = self.get_slot(var, "accumulator") + linear = self.get_slot(var, "linear") + + if self._l2_shrinkage_regularization_strength <= 0.0: + return tf.raw_ops.ResourceSparseApplyFtrl( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + indices=indices, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + else: + return tf.raw_ops.ResourceSparseApplyFtrlV2( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + indices=indices, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + l2_shrinkage=coefficients[ + "l2_shrinkage_regularization_strength" + ], + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "initial_accumulator_value": self._initial_accumulator_value, + "learning_rate_power": self._serialize_hyperparameter( + "learning_rate_power" + ), + "l1_regularization_strength": self._serialize_hyperparameter( + "l1_regularization_strength" + ), + "l2_regularization_strength": self._serialize_hyperparameter( + "l2_regularization_strength" + ), + "beta": self._serialize_hyperparameter("beta"), + "l2_shrinkage_regularization_strength": self._l2_shrinkage_regularization_strength, # noqa: E501 + } + ) + return config diff --git a/keras/optimizers/legacy/ftrl_test.py b/keras/optimizers/legacy/ftrl_test.py new file mode 100644 index 000000000000..4c1caa941243 --- /dev/null +++ b/keras/optimizers/legacy/ftrl_test.py @@ -0,0 +1,558 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for Ftrl operations.""" + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras.optimizers.legacy import ftrl + + +class FtrlOptimizerTest(tf.test.TestCase): + def doTestFtrlwithoutRegularization(self, use_resource=False): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + if use_resource: + var0 = tf.Variable([0.0, 0.0], dtype=dtype) + var1 = tf.Variable([0.0, 0.0], dtype=dtype) + else: + var0 = tf.Variable([0.0, 0.0], dtype=dtype) + var1 = tf.Variable([0.0, 0.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.2], dtype=dtype) + grads1 = tf.constant([0.01, 0.02], dtype=dtype) + opt = ftrl.Ftrl( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllClose([0.0, 0.0], v0_val) + self.assertAllClose([0.0, 0.0], v1_val) + + # Run 3 steps FTRL + for _ in range(3): + update.run() + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + np.array([-2.60260963, -4.29698515]), v0_val + ) + self.assertAllCloseAccordingToType( + np.array([-0.28432083, -0.56694895]), v1_val + ) + + def testFtrlWithoutRegularization(self): + self.doTestFtrlwithoutRegularization(use_resource=False) + + def testResourceFtrlWithoutRegularization(self): + self.doTestFtrlwithoutRegularization(use_resource=True) + + def testFtrlwithoutRegularization2(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([4.0, 3.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.2], dtype=dtype) + grads1 = tf.constant([0.01, 0.02], dtype=dtype) + + opt = ftrl.Ftrl( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) + self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) + + # Run 3 steps FTRL + for _ in range(3): + update.run() + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + np.array([-2.55607247, -3.98729396]), v0_val + ) + self.assertAllCloseAccordingToType( + np.array([-0.28232238, -0.56096673]), v1_val + ) + + def testMinimizeSparseResourceVariable(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
+ for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + + def loss(): + pred = tf.matmul( + tf.compat.v1.nn.embedding_lookup([var0], [0]), x + ) + return pred * pred + + sgd_op = ftrl.Ftrl(1.0).minimize(loss, var_list=[var0]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllCloseAccordingToType( + [[1.0, 2.0]], self.evaluate(var0) + ) + # Run 1 step of sgd + sgd_op.run() + # Validate updated params + self.assertAllCloseAccordingToType( + [[0, 1]], self.evaluate(var0), atol=0.01 + ) + + def testFtrlWithL1(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([4.0, 3.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.2], dtype=dtype) + grads1 = tf.constant([0.01, 0.02], dtype=dtype) + + opt = ftrl.Ftrl( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=0.0, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) + self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) + + # Run 10 steps FTRL + for _ in range(10): + update.run() + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + np.array([-7.66718769, -10.91273689]), v0_val + ) + self.assertAllCloseAccordingToType( + np.array([-0.93460727, -1.86147261]), v1_val + ) + + def testFtrlWithBeta(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([4.0, 3.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.2], dtype=dtype) + grads1 = tf.constant([0.01, 0.02], dtype=dtype) + + opt = ftrl.Ftrl(3.0, initial_accumulator_value=0.1, beta=0.1) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) + self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) + + # Run 10 steps FTRL + for _ in range(10): + update.run() + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + np.array([-6.096838, -9.162214]), v0_val + ) + self.assertAllCloseAccordingToType( + np.array([-0.717741, -1.425132]), v1_val + ) + + def testFtrlWithL2_Beta(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
+ for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([4.0, 3.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.2], dtype=dtype) + grads1 = tf.constant([0.01, 0.02], dtype=dtype) + + opt = ftrl.Ftrl( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.1, + beta=0.1, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) + self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) + + # Run 10 steps FTRL + for _ in range(10): + update.run() + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + np.array([-2.735487, -4.704625]), v0_val + ) + self.assertAllCloseAccordingToType( + np.array([-0.294335, -0.586556]), v1_val + ) + + def testFtrlWithL1_L2(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([4.0, 3.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.2], dtype=dtype) + grads1 = tf.constant([0.01, 0.02], dtype=dtype) + + opt = ftrl.Ftrl( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) + self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) + + # Run 10 steps FTRL + for _ in range(10): + update.run() + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + np.array([-0.24059935, -0.46829352]), v0_val + ) + self.assertAllCloseAccordingToType( + np.array([-0.02406147, -0.04830509]), v1_val + ) + + def testFtrlWithL1_L2_L2Shrinkage(self): + """Test the new FTRL op with support for l2 shrinkage. + + The addition of this parameter, which places constant pressure on + weights towards the origin, causes the gradient descent trajectory to + differ. The weights will tend to have smaller magnitudes with this + parameter set. + """ + # TODO(tanzheny, omalleyt): Fix test in eager mode.
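+        # Illustrative note (per the ResourceApplyFtrlV2 op semantics this
+        # optimizer lowers to): the linear term is updated with a
+        # shrinkage-adjusted gradient,
+        #     grad_with_shrinkage = grad + 2 * l2_shrinkage * var,
+        # while the accumulator still adds the plain grad ** 2, so only the
+        # weight magnitudes shrink, not the adaptive learning rate.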
+ for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([4.0, 3.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.2], dtype=dtype) + grads1 = tf.constant([0.01, 0.02], dtype=dtype) + + opt = ftrl.Ftrl( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0, + l2_shrinkage_regularization_strength=0.1, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) + self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) + + # Run 10 steps FTRL + for _ in range(10): + update.run() + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + np.array([-0.22578995, -0.44345796]), v0_val + ) + self.assertAllCloseAccordingToType( + np.array([-0.14378493, -0.13229476]), v1_val + ) + + def testFtrlWithL1_L2_L2ShrinkageSparse(self): + """Tests the new FTRL op with support for l2 shrinkage on sparse + grads.""" + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + var0 = tf.Variable([[1.0], [2.0]], dtype=dtype) + var1 = tf.Variable([[4.0], [3.0]], dtype=dtype) + grads0 = tf.IndexedSlices( + tf.constant([0.1], shape=[1, 1], dtype=dtype), + tf.constant([0]), + tf.constant([2, 1]), + ) + grads1 = tf.IndexedSlices( + tf.constant([0.02], shape=[1, 1], dtype=dtype), + tf.constant([1]), + tf.constant([2, 1]), + ) + + opt = ftrl.Ftrl( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0, + l2_shrinkage_regularization_strength=0.1, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val) + self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val) + + # Run 10 steps FTRL + for _ in range(10): + update.run() + + v0_val, v1_val = self.evaluate([var0, var1]) + self.assertAllCloseAccordingToType( + [[-0.22578995], [2.0]], v0_val + ) + self.assertAllCloseAccordingToType( + [[4.0], [-0.13229476]], v1_val + ) + + def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self): + """Verifies that l2 shrinkage in FTRL does not change lr schedule.""" + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
+        for dtype in [tf.half, tf.float32]:
+            with tf.Graph().as_default(), self.cached_session() as sess:
+                var0 = tf.Variable([1.0, 2.0], dtype=dtype)
+                var1 = tf.Variable([1.0, 2.0], dtype=dtype)
+                grads0 = tf.constant([0.1, 0.2], dtype=dtype)
+                grads1 = tf.constant([0.1, 0.2], dtype=dtype)
+
+                opt0 = ftrl.Ftrl(
+                    3.0,
+                    initial_accumulator_value=0.1,
+                    l1_regularization_strength=0.001,
+                    l2_regularization_strength=2.0,
+                    l2_shrinkage_regularization_strength=0.1,
+                )
+                opt1 = ftrl.Ftrl(
+                    3.0,
+                    initial_accumulator_value=0.1,
+                    l1_regularization_strength=0.001,
+                    l2_regularization_strength=2.0,
+                )
+                update0 = opt0.apply_gradients([(grads0, var0)])
+                update1 = opt1.apply_gradients([(grads1, var1)])
+                self.evaluate(tf.compat.v1.global_variables_initializer())
+
+                v0_val, v1_val = self.evaluate([var0, var1])
+                self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+                self.assertAllCloseAccordingToType([1.0, 2.0], v1_val)
+
+                # Run 10 steps FTRL
+                for _ in range(10):
+                    update0.run()
+                    update1.run()
+
+                v0_val, v1_val = self.evaluate([var0, var1])
+                # var0 is experiencing L2 shrinkage, so it should be smaller
+                # than var1 in magnitude.
+                self.assertTrue((v0_val**2 < v1_val**2).all())
+                accum0 = sess.run(opt0.get_slot(var0, "accumulator"))
+                accum1 = sess.run(opt1.get_slot(var1, "accumulator"))
+                # L2 shrinkage should not change how we update the grad
+                # accumulator.
+                self.assertAllCloseAccordingToType(accum0, accum1)
+
+    def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
+        if is_sparse:
+            var0 = tf.Variable([[0.0], [0.0]], dtype=dtype)
+            var1 = tf.Variable([[0.0], [0.0]], dtype=dtype)
+            grads0 = tf.IndexedSlices(
+                tf.constant([0.1], shape=[1, 1], dtype=dtype),
+                tf.constant([0]),
+                tf.constant([2, 1]),
+            )
+            grads1 = tf.IndexedSlices(
+                tf.constant([0.02], shape=[1, 1], dtype=dtype),
+                tf.constant([1]),
+                tf.constant([2, 1]),
+            )
+        else:
+            var0 = tf.Variable([0.0, 0.0], dtype=dtype)
+            var1 = tf.Variable([0.0, 0.0], dtype=dtype)
+            grads0 = tf.constant([0.1, 0.2], dtype=dtype)
+            grads1 = tf.constant([0.01, 0.02], dtype=dtype)
+
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(tf.compat.v1.global_variables_initializer())
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        if is_sparse:
+            self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
+            self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
+        else:
+            self.assertAllCloseAccordingToType([0.0, 0.0], v0_val)
+            self.assertAllCloseAccordingToType([0.0, 0.0], v1_val)
+
+        # Run Ftrl for a few steps
+        for _ in range(steps):
+            update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        return v0_val, v1_val
+
+    # When variables are initialized to zero, FTRL-Proximal has two
+    # properties:
+    # 1. Without L1 & L2 but with a fixed learning rate, FTRL-Proximal is
+    # identical to GradientDescent.
+    # 2. Without L1 & L2 but with an adaptive learning rate, FTRL-Proximal is
+    # identical to Adagrad.
+    # So, based on these two properties, we test whether our implementation of
+    # FTRL-Proximal performs the same updates as Adagrad or GradientDescent.
+    def testEquivAdagradwithoutRegularization(self):
+        # TODO(tanzheny, omalleyt): Fix test in eager mode.
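The two equivalences stated in the comment above can also be checked directly in eager mode with the public optimizers; a minimal sketch, assuming TF 2.x semantics and the default (zero) regularization strengths:

```python
import tensorflow as tf

# With zero-initialized weights and no regularization, Ftrl with a fixed
# per-step learning rate (learning_rate_power=0.0) should walk the same
# trajectory as plain gradient descent.
v_ftrl = tf.Variable([0.0, 0.0])
v_sgd = tf.Variable([0.0, 0.0])
grads = [tf.constant([0.1, 0.2])]

ftrl_opt = tf.keras.optimizers.Ftrl(
    learning_rate=3.0, learning_rate_power=0.0, initial_accumulator_value=0.1
)
sgd_opt = tf.keras.optimizers.SGD(learning_rate=3.0)
for _ in range(5):
    ftrl_opt.apply_gradients(zip(grads, [v_ftrl]))
    sgd_opt.apply_gradients(zip(grads, [v_sgd]))

tf.debugging.assert_near(v_ftrl, v_sgd)  # identical updates
```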
+ for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + val0, val1 = self.applyOptimizer( + ftrl.Ftrl( + 3.0, + # Adagrad learning rate + learning_rate_power=-0.5, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + ), + dtype, + ) + + with tf.Graph().as_default(), self.cached_session(): + val2, val3 = self.applyOptimizer( + tf.compat.v1.train.AdagradOptimizer( + 3.0, initial_accumulator_value=0.1 + ), + dtype, + ) + + self.assertAllCloseAccordingToType(val0, val2) + self.assertAllCloseAccordingToType(val1, val3) + + def testEquivSparseAdagradwithoutRegularization(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + val0, val1 = self.applyOptimizer( + ftrl.Ftrl( + 3.0, + # Adagrad learning rate + learning_rate_power=-0.5, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + ), + dtype, + is_sparse=True, + ) + + with tf.Graph().as_default(), self.cached_session(): + val2, val3 = self.applyOptimizer( + tf.compat.v1.train.AdagradOptimizer( + 3.0, initial_accumulator_value=0.1 + ), + dtype, + is_sparse=True, + ) + + self.assertAllCloseAccordingToType(val0, val2) + self.assertAllCloseAccordingToType(val1, val3) + + def testEquivSparseGradientDescentwithoutRegularization(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + val0, val1 = self.applyOptimizer( + ftrl.Ftrl( + 3.0, + # Fixed learning rate + learning_rate_power=-0.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + ), + dtype, + is_sparse=True, + ) + + with tf.Graph().as_default(), self.cached_session(): + val2, val3 = self.applyOptimizer( + tf.compat.v1.train.GradientDescentOptimizer(3.0), + dtype, + is_sparse=True, + ) + + self.assertAllCloseAccordingToType(val0, val2) + self.assertAllCloseAccordingToType(val1, val3) + + def testEquivGradientDescentwithoutRegularization(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32]: + with tf.Graph().as_default(), self.cached_session(): + val0, val1 = self.applyOptimizer( + ftrl.Ftrl( + 3.0, + # Fixed learning rate + learning_rate_power=-0.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + ), + dtype, + ) + + with tf.Graph().as_default(), self.cached_session(): + val2, val3 = self.applyOptimizer( + tf.compat.v1.train.GradientDescentOptimizer(3.0), dtype + ) + + self.assertAllCloseAccordingToType(val0, val2) + self.assertAllCloseAccordingToType(val1, val3) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/gradient_descent.py b/keras/optimizers/legacy/gradient_descent.py new file mode 100644 index 000000000000..8d305f705e6e --- /dev/null +++ b/keras/optimizers/legacy/gradient_descent.py @@ -0,0 +1,222 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SGD optimizer implementation.""" + + +import tensorflow.compat.v2 as tf + +from keras.optimizers.legacy import optimizer_v2 + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@keras_export( + "keras.optimizers.legacy.SGD", + v1=["keras.optimizers.SGD", "keras.optimizers.legacy.SGD"], +) +class SGD(optimizer_v2.OptimizerV2): + r"""Gradient descent (with momentum) optimizer. + + Update rule for parameter `w` with gradient `g` when `momentum=0`: + + ```python + w = w - learning_rate * g + ``` + + Update rule when `momentum` is larger than 0: + + ```python + velocity = momentum * velocity - learning_rate * g + w = w + velocity + ``` + + When `nesterov=True`, this rule becomes: + + ```python + velocity = momentum * velocity - learning_rate * g + w = w + momentum * velocity - learning_rate * g + ``` + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to `0.01`. + momentum: float hyperparameter >= 0 that accelerates gradient descent in + the relevant direction and dampens oscillations. Vanilla gradient + descent means no momentum. Defaults to `0.`. + nesterov: boolean. Whether to apply Nesterov momentum. + Defaults to `False`. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"SGD"`. + **kwargs: keyword arguments. Allowed arguments are `clipvalue`, + `clipnorm`, `global_clipnorm`. + If `clipvalue` (float) is set, the gradient of each weight + is clipped to be no higher than this value. + If `clipnorm` (float) is set, the gradient of each weight + is individually clipped so that its norm is no higher than this value. + If `global_clipnorm` (float) is set the gradient of all weights is + clipped so that their global norm is no higher than this value. + + Usage: + + >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1) + >>> var = tf.Variable(1.0) + >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var1) = var1 + >>> step_count = opt.minimize(loss, [var]).numpy() + >>> # Step is `- learning_rate * grad` + >>> var.numpy() + 0.9 + + >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1, momentum=0.9) + >>> var = tf.Variable(1.0) + >>> val0 = var.value() + >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var1) = var1 + >>> # First step is `- learning_rate * grad` + >>> step_count = opt.minimize(loss, [var]).numpy() + >>> val1 = var.value() + >>> (val0 - val1).numpy() + 0.1 + >>> # On later steps, step-size increases because of momentum + >>> step_count = opt.minimize(loss, [var]).numpy() + >>> val2 = var.value() + >>> (val1 - val2).numpy() + 0.18 + + Reference: + - For `nesterov=True`, See [Sutskever et al., 2013]( + https://github.com/mlresearch/v28/blob/gh-pages/sutskever13.pdf). 
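The momentum example in the docstring above can be reproduced by hand from the stated update rules; a small numpy sketch, assuming only those rules, that recovers the quoted 0.1 and 0.18 step sizes:

```python
import numpy as np


def sgd_step(w, velocity, g, lr=0.1, momentum=0.9, nesterov=False):
    """One step following the update rules quoted in the docstring."""
    velocity = momentum * velocity - lr * g
    if nesterov:
        return w + momentum * velocity - lr * g, velocity
    return w + velocity, velocity


w, v = 1.0, 0.0                   # loss = w**2 / 2, so grad = w
w1, v = sgd_step(w, v, g=w)
assert np.isclose(w - w1, 0.1)    # first step: lr * grad
w2, v = sgd_step(w1, v, g=w1)
assert np.isclose(w1 - w2, 0.18)  # momentum enlarges the step
```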
+ """ + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + learning_rate=0.01, + momentum=0.0, + nesterov=False, + name="SGD", + **kwargs, + ): + super().__init__(name, **kwargs) + self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) + self._set_hyper("decay", self._initial_decay) + + self._momentum = False + if ( + isinstance(momentum, tf.Tensor) + or callable(momentum) + or momentum > 0 + ): + self._momentum = True + if isinstance(momentum, (int, float)) and ( + momentum < 0 or momentum > 1 + ): + raise ValueError( + "`momentum` must be between [0, 1]. Received: " + f"momentum={momentum} (of type {type(momentum)})." + ) + self._set_hyper("momentum", momentum) + + self.nesterov = nesterov + + def _create_slots(self, var_list): + if self._momentum: + for var in var_list: + self.add_slot(var, "momentum") + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + apply_state[(var_device, var_dtype)]["momentum"] = tf.identity( + self._get_hyper("momentum", var_dtype) + ) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + if self._momentum: + momentum_var = self.get_slot(var, "momentum") + return tf.raw_ops.ResourceApplyKerasMomentum( + var=var.handle, + accum=momentum_var.handle, + lr=coefficients["lr_t"], + grad=grad, + momentum=coefficients["momentum"], + use_locking=self._use_locking, + use_nesterov=self.nesterov, + ) + else: + return tf.raw_ops.ResourceApplyGradientDescent( + var=var.handle, + alpha=coefficients["lr_t"], + delta=grad, + use_locking=self._use_locking, + ) + + def _resource_apply_sparse_duplicate_indices( + self, grad, var, indices, **kwargs + ): + if self._momentum: + return super()._resource_apply_sparse_duplicate_indices( + grad, var, indices, **kwargs + ) + else: + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = kwargs.get("apply_state", {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + return tf.raw_ops.ResourceScatterAdd( + resource=var.handle, + indices=indices, + updates=-grad * coefficients["lr_t"], + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + # This method is only needed for momentum optimization. + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + momentum_var = self.get_slot(var, "momentum") + return tf.raw_ops.ResourceSparseApplyKerasMomentum( + var=var.handle, + accum=momentum_var.handle, + lr=coefficients["lr_t"], + grad=grad, + indices=indices, + momentum=coefficients["momentum"], + use_locking=self._use_locking, + use_nesterov=self.nesterov, + ) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "momentum": self._serialize_hyperparameter("momentum"), + "nesterov": self.nesterov, + } + ) + return config diff --git a/keras/optimizers/legacy/gradient_descent_test.py b/keras/optimizers/legacy/gradient_descent_test.py new file mode 100644 index 000000000000..ec5bc4e99bd7 --- /dev/null +++ b/keras/optimizers/legacy/gradient_descent_test.py @@ -0,0 +1,881 @@ +# Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional test for GradientDescent.""" + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.optimizers.legacy import gradient_descent +from keras.optimizers.schedules import learning_rate_schedule +from keras.testing_infra import test_combinations + + +class GradientDescentOptimizerTest(tf.test.TestCase, parameterized.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasic(self): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.1], dtype=dtype) + grads1 = tf.constant([0.01, 0.01], dtype=dtype) + sgd = gradient_descent.SGD(3.0) + sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], self.evaluate(var1) + ) + + def _test_basic_sgd_with_learning_rate_decay(self, sgd, dtype): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.1], dtype=dtype) + grads1 = tf.constant([0.01, 0.01], dtype=dtype) + if not tf.executing_eagerly(): + sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 2 steps of sgd + if not tf.executing_eagerly(): + self.evaluate(sgd_op) + else: + sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) + # Validate updated params + self.assertAllCloseAccordingToType( + [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], self.evaluate(var1) + ) + + if not tf.executing_eagerly(): + self.evaluate(sgd_op) + else: + sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) + # Validate updated params + self.assertAllCloseAccordingToType( + [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1], + self.evaluate(var0), + ) + self.assertAllCloseAccordingToType( + [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01], + self.evaluate(var1), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicWithLearningRateDecay(self): + for dtype in [tf.half, tf.float32, tf.float64]: + learning_rate = 3.0 + decay = 0.5 + sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay) + self._test_basic_sgd_with_learning_rate_decay(sgd, dtype) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicWithLearningRateInverseTimeDecay(self): + for 
dtype in [tf.half, tf.float32, tf.float64]: + learning_rate = learning_rate_schedule.InverseTimeDecay( + 3.0, decay_steps=1.0, decay_rate=0.5 + ) + sgd = gradient_descent.SGD(learning_rate=learning_rate) + self._test_basic_sgd_with_learning_rate_decay(sgd, dtype) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self): + for dtype in [tf.half, tf.float32, tf.float64]: + learning_rate = learning_rate_schedule.InverseTimeDecay( + 3.0, decay_steps=1.0, decay_rate=0.5 + ) + sgd = gradient_descent.SGD(learning_rate=learning_rate) + sgd = gradient_descent.SGD.from_config(sgd.get_config()) + self._test_basic_sgd_with_learning_rate_decay(sgd, dtype) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasicCallableParams(self): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.1], dtype=dtype) + grads1 = tf.constant([0.01, 0.01], dtype=dtype) + lr = lambda: 3.0 + sgd = gradient_descent.SGD(lr) + sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], self.evaluate(var1) + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testMinimizeResourceVariable(self): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) + var1 = tf.Variable([3.0], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + loss = lambda: tf.matmul(var0, x) + var1 + sgd = gradient_descent.SGD(1.0) + sgd_op = sgd.minimize(loss, [var0, var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[1.0 - 4.0, 2.0 - 5.0]], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1)) + + def testMinimizeSparseResourceVariable(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
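Both the `decay=0.5` test above and the `InverseTimeDecay` tests exercise the same schedule, which is why they share `_test_basic_sgd_with_learning_rate_decay` and its expected step sizes of 3.0 and then 2.0. A sketch of the formula, assuming the standard inverse-time-decay definition:

```python
def inverse_time_decay(initial_lr, step, decay_steps=1.0, decay_rate=0.5):
    # lr(t) = lr0 / (1 + decay_rate * t / decay_steps)
    return initial_lr / (1.0 + decay_rate * step / decay_steps)


assert inverse_time_decay(3.0, step=0) == 3.0  # first update:  w -= 3.0 * g
assert inverse_time_decay(3.0, step=1) == 2.0  # second update: w -= 2.0 * g
```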
+ with tf.Graph().as_default(): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) + var1 = tf.Variable([3.0], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + + def loss(): + pred = tf.matmul( + tf.compat.v1.nn.embedding_lookup([var0], [0]), x + ) + pred += var1 + return pred * pred + + sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0 + np_grad = 2 * np_pred + self.assertAllCloseAccordingToType( + [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], + self.evaluate(var0), + ) + self.assertAllCloseAccordingToType( + [3.0 - np_grad], self.evaluate(var1) + ) + + def testTensorLearningRate(self): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.1], dtype=dtype) + grads1 = tf.constant([0.01, 0.01], dtype=dtype) + lrate = tf.constant(3.0) + sgd_op = gradient_descent.SGD(lrate).apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], self.evaluate(var1) + ) + + def testGradWrtRef(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in [tf.half, tf.float32, tf.float64]: + opt = gradient_descent.SGD(3.0) + values = [1.0, 3.0] + vars_ = [tf.Variable([v], dtype=dtype) for v in values] + loss = lambda: vars_[0] + vars_[1] + grads_and_vars = opt._compute_gradients(loss, vars_) + self.evaluate(tf.compat.v1.global_variables_initializer()) + for grad, _ in grads_and_vars: + self.assertAllCloseAccordingToType( + [1.0], self.evaluate(grad) + ) + + def testSparseBasic(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([[1.0], [2.0]], dtype=dtype) + var1 = tf.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = tf.IndexedSlices( + tf.constant([0.1], shape=[1, 1], dtype=dtype), + tf.constant([0]), + tf.constant([2, 1]), + ) + grads1 = tf.IndexedSlices( + tf.constant([0.01], shape=[1, 1], dtype=dtype), + tf.constant([1]), + tf.constant([2, 1]), + ) + sgd_op = gradient_descent.SGD(3.0).apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[1.0 - 3.0 * 0.1], [2.0]], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + [[3.0], [4.0 - 3.0 * 0.01]], self.evaluate(var1) + ) + + def testSparseBasicWithLearningRateDecay(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
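`testSparseBasic` above feeds gradients as `tf.IndexedSlices`, so only the indexed rows are updated. A minimal sketch of the same behavior, assuming TF 2.x eager semantics:

```python
import tensorflow as tf

var = tf.Variable([[1.0], [2.0]])
grad = tf.IndexedSlices(
    values=tf.constant([[0.1]]),   # gradient for row 0 only
    indices=tf.constant([0]),
    dense_shape=tf.constant([2, 1]),
)
tf.keras.optimizers.SGD(3.0).apply_gradients([(grad, var)])
print(var.numpy())  # [[0.7], [2.0]] -- row 1 is untouched
```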
+ with tf.Graph().as_default(): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([[1.0], [2.0]], dtype=dtype) + var1 = tf.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = tf.IndexedSlices( + tf.constant([0.1], shape=[1, 1], dtype=dtype), + tf.constant([0]), + tf.constant([2, 1]), + ) + grads1 = tf.IndexedSlices( + tf.constant([0.01], shape=[1, 1], dtype=dtype), + tf.constant([1]), + tf.constant([2, 1]), + ) + sgd_op = gradient_descent.SGD(3.0, decay=0.5).apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Run 2 steps of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[1.0 - 3.0 * 0.1], [2.0]], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + [[3.0], [4.0 - 3.0 * 0.01]], self.evaluate(var1) + ) + + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], + self.evaluate(var1), + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testCapturingInFunctionWhileExecutingEagerly(self): + optimizer = gradient_descent.SGD(1.0) + + var_holder = {} + + def step(): + if not var_holder: + var_holder["var"] = tf.Variable(1.0) + else: + var_holder["var"].assign(1.0) + + with tf.GradientTape() as tape: + loss = var_holder["var"] ** 2 + grad = tape.gradient(loss, var_holder["var"]) + optimizer.apply_gradients([(grad, var_holder["var"])]) + return var_holder["var"].read_value() + + compiled_step = tf.function(step) + + self.assertEqual(float(step()), -1.0) + self.assertEqual(float(compiled_step()), -1.0) + # This shouldn't fail; in particular, the learning rate tensor should + # be an EagerTensor once again, not a graph Tensor. 
+            self.assertEqual(float(step()), -1.0)
+
+    def testConstructSGDWithLR(self):
+        opt = gradient_descent.SGD(lr=1.0)
+        opt_2 = gradient_descent.SGD(learning_rate=0.1, lr=1.0)
+        opt_3 = gradient_descent.SGD(learning_rate=0.1)
+        self.assertIsInstance(opt.lr, tf.Variable)
+        self.assertIsInstance(opt_2.lr, tf.Variable)
+        self.assertIsInstance(opt_3.lr, tf.Variable)
+
+        self.evaluate(tf.compat.v1.global_variables_initializer())
+        self.assertAllClose(self.evaluate(opt.lr), (1.0))
+        self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+        self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+
+class MomentumOptimizerTest(tf.test.TestCase, parameterized.TestCase):
+    def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+        accum = accum * momentum - g * lr
+        var += accum * momentum - g * lr
+        return var, accum
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["graph", "eager"])
+    )
+    def testBasic(self):
+        for _, dtype in enumerate([tf.half, tf.float32, tf.float64]):
+            var0 = tf.Variable([1.0, 2.0], dtype=dtype, name="var0")
+            var1 = tf.Variable([3.0, 4.0], dtype=dtype, name="var1")
+            grads0 = tf.constant([0.1, 0.1], dtype=dtype)
+            grads1 = tf.constant([0.01, 0.01], dtype=dtype)
+            learning_rate = 2.0
+            momentum = 0.9
+            mom_opt = gradient_descent.SGD(
+                learning_rate=learning_rate, momentum=momentum
+            )
+            # self.assertFalse(mom_opt._initial_decay)
+            mom_update = mom_opt.apply_gradients(
+                zip([grads0, grads1], [var0, var1])
+            )
+
+            # Check we have slots
+            slot0 = mom_opt.get_slot(var0, "momentum")
+            self.assertEqual(slot0.shape, var0.shape)
+            slot1 = mom_opt.get_slot(var1, "momentum")
+            self.assertEqual(slot1.shape, var1.shape)
+
+            # Step 1: the momentum accumulators were 0. So we should see a
+            # normal update: v -= grad * learning_rate
+            self.evaluate(tf.compat.v1.global_variables_initializer())
+            self.evaluate(mom_update)
+            # Check that the momentum accumulators have been updated.
+            self.assertAllCloseAccordingToType(
+                np.array([-0.2, -0.2]), self.evaluate(slot0)
+            )
+            self.assertAllCloseAccordingToType(
+                np.array([-0.02, -0.02]), self.evaluate(slot1)
+            )
+            # Check that the parameters have been updated.
+            self.assertAllCloseAccordingToType(
+                np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+                self.evaluate(var0),
+            )
+            self.assertAllCloseAccordingToType(
+                np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+                self.evaluate(var1),
+            )
+            # Step 2: the momentum accumulators contain the previous update.
+            self.evaluate(mom_update)
+            if tf.executing_eagerly():
+                mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+            # Check that the momentum accumulators have been updated.
+            self.assertAllCloseAccordingToType(
+                np.array(
+                    [(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]
+                ),
+                self.evaluate(slot0),
+            )
+            self.assertAllCloseAccordingToType(
+                np.array(
+                    [(0.9 * (-0.02) - 2.0 * 0.01), (0.9 * (-0.02) - 2.0 * 0.01)]
+                ),
+                self.evaluate(slot1),
+            )
+            # Check that the parameters have been updated.
+            self.assertAllCloseAccordingToType(
+                np.array(
+                    [
+                        1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                        2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                    ]
+                ),
+                self.evaluate(var0),
+            )
+            self.assertAllCloseAccordingToType(
+                np.array(
+                    [
+                        2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                        3.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                    ]
+                ),
+                self.evaluate(var1),
+            )
+
+    def testNesterovMomentum(self):
+        # TODO(tanzheny, omalleyt): Fix test in eager mode.
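The slot and parameter values asserted in `testBasic` above follow from two lines of arithmetic, since the Keras momentum slot is updated as `accum = momentum * accum - lr * g` and then added to the variable. A hedged numpy walk-through of the first two steps for `var0`:

```python
import numpy as np

lr, momentum = 2.0, 0.9
var, accum = np.array([1.0, 2.0]), np.zeros(2)
g = np.array([0.1, 0.1])

for _ in range(2):
    accum = momentum * accum - lr * g  # the "momentum" slot
    var = var + accum

# Step 1: accum == [-0.2, -0.2],   var == [0.8, 1.8]
# Step 2: accum == [-0.38, -0.38], var == [0.42, 1.42]
assert np.allclose(accum, 0.9 * (-0.2) - 2.0 * 0.1)
assert np.allclose(var, [0.42, 1.42])
```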
+ with tf.Graph().as_default(): + for dtype in [tf.float32, tf.float64]: + var0 = tf.Variable([1.0, 2.0], dtype=dtype, name="var0") + var1 = tf.Variable([3.0, 4.0], dtype=dtype, name="var1") + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + loss = lambda: 5 * var0 * var0 + 3 * var1 + mom_op = gradient_descent.SGD( + learning_rate=2.0, momentum=0.9, nesterov=True + ) + opt_op = mom_op.minimize(loss, [var0, var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(1, 5): + self.evaluate(opt_op) + var0_np, accum0_np = self._update_nesterov_momentum_numpy( + var0_np, accum0_np, var0_np * 10, 2.0, 0.9 + ) + var1_np, accum1_np = self._update_nesterov_momentum_numpy( + var1_np, accum1_np, 3, 2.0, 0.9 + ) + self.assertAllClose(var0_np, self.evaluate(var0)) + self.assertAllClose(var1_np, self.evaluate(var1)) + + def testSparseNesterovMomentum(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session() as sess: + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + grads = [] + for t in range(1, 5): + grads.append(var0_np * 10) + var0_np, accum0_np = self._update_nesterov_momentum_numpy( + var0_np, accum0_np, var0_np * 10, 2.0, 0.9 + ) + var1_np, accum1_np = self._update_nesterov_momentum_numpy( + var1_np, accum1_np, 3, 2.0, 0.9 + ) + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + var0 = tf.Variable(var0_np, dtype=dtype, name="var0") + var1 = tf.Variable(var1_np, dtype=dtype, name="var1") + mom_op = gradient_descent.SGD( + learning_rate=2.0, momentum=0.9, nesterov=True + ) + x_feed = tf.compat.v1.placeholder(dtype) + y_feed = tf.IndexedSlices( + x_feed, tf.constant([0, 1]), tf.constant([2]) + ) + grads_and_vars = [ + (y_feed, var0), + (tf.constant([3.0, 3.0], dtype=dtype), var1), + ] + opt_update = mom_op.apply_gradients(grads_and_vars) + self.evaluate(tf.compat.v1.global_variables_initializer()) + for t in range(1, 5): + sess.run(opt_update, feed_dict={x_feed: grads[t - 1]}) + var0_np, accum0_np = self._update_nesterov_momentum_numpy( + var0_np, accum0_np, var0_np * 10, 2.0, 0.9 + ) + var1_np, accum1_np = self._update_nesterov_momentum_numpy( + var1_np, accum1_np, 3, 2.0, 0.9 + ) + self.assertAllClose(var0_np, self.evaluate(var0)) + self.assertAllClose(var1_np, self.evaluate(var1)) + + def testMinimizeSparseResourceVariable(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
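`_update_nesterov_momentum_numpy` above is the same Nesterov rule quoted in the SGD docstring (`velocity = momentum * velocity - lr * g; w += momentum * velocity - lr * g`). One step by hand for `var0` of `testNesterovMomentum`, as a sanity check:

```python
lr, momentum = 2.0, 0.9
var, accum = 1.0, 0.0
g = 10.0 * var                         # loss = 5 * var**2 -> grad = 10 * var
accum = momentum * accum - g * lr      # -20.0
var = var + momentum * accum - g * lr  # 1.0 - 18.0 - 20.0 == -37.0
assert (var, accum) == (-37.0, -20.0)
```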
+        with tf.Graph().as_default():
+            for dtype in [tf.half, tf.float32, tf.float64]:
+                var0 = tf.Variable([[1.0, 2.0]], dtype=dtype)
+
+                def loss():
+                    x = tf.constant([[4.0], [5.0]], dtype=dtype)
+                    pred = tf.matmul(
+                        tf.compat.v1.nn.embedding_lookup([var0], [0]), x
+                    )
+                    return pred * pred
+
+                opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9)
+                sgd_op = opt.minimize(loss, [var0])
+                self.evaluate(tf.compat.v1.global_variables_initializer())
+                # Run 1 step of sgd
+                self.evaluate(sgd_op)
+                # Validate updated params
+                self.assertAllCloseAccordingToType(
+                    [[-111, -138]], self.evaluate(var0)
+                )
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["graph", "eager"])
+    )
+    def testMinimizeWith2DIndicesForEmbeddingLookup(self):
+        var0 = tf.Variable(tf.ones([2, 2]))
+
+        def loss():
+            return tf.reduce_sum(tf.compat.v1.nn.embedding_lookup(var0, [[1]]))
+
+        opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9)
+        sgd_op = opt.minimize(loss, [var0])
+        self.evaluate(tf.compat.v1.global_variables_initializer())
+        self.evaluate(sgd_op)
+        self.assertAllCloseAccordingToType(
+            [[1, 1], [0, 0]], self.evaluate(var0)
+        )
+
+    def testTensorLearningRateAndMomentum(self):
+        # TODO(tanzheny, omalleyt): Fix test in eager mode.
+        with tf.Graph().as_default():
+            for dtype in [tf.half, tf.float32, tf.float64]:
+                var0 = tf.Variable([1.0, 2.0], dtype=dtype)
+                var1 = tf.Variable([3.0, 4.0], dtype=dtype)
+                grads0 = tf.constant([0.1, 0.1], dtype=dtype)
+                grads1 = tf.constant([0.01, 0.01], dtype=dtype)
+                mom_opt = gradient_descent.SGD(
+                    learning_rate=tf.constant(2.0), momentum=tf.constant(0.9)
+                )
+                mom_update = mom_opt.apply_gradients(
+                    zip([grads0, grads1], [var0, var1])
+                )
+                self.evaluate(tf.compat.v1.global_variables_initializer())
+                # Check we have slots
+                slot0 = mom_opt.get_slot(var0, "momentum")
+                self.assertEqual(slot0.shape, var0.shape)
+                slot1 = mom_opt.get_slot(var1, "momentum")
+                self.assertEqual(slot1.shape, var1.shape)
+
+                # Fetch params to validate initial values
+                self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+                self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+                # Step 1: the momentum accumulators were 0. So we should see a
+                # normal update: v -= grad * learning_rate
+                self.evaluate(mom_update)
+                # Check that the momentum accumulators have been updated.
+                self.assertAllCloseAccordingToType(
+                    np.array([-0.2, -0.2]), self.evaluate(slot0)
+                )
+                self.assertAllCloseAccordingToType(
+                    np.array([-0.02, -0.02]), self.evaluate(slot1)
+                )
+                # Check that the parameters have been updated.
+                self.assertAllCloseAccordingToType(
+                    np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+                    self.evaluate(var0),
+                )
+                self.assertAllCloseAccordingToType(
+                    np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+                    self.evaluate(var1),
+                )
+                # Step 2: the momentum accumulators contain the previous update.
+                self.evaluate(mom_update)
+                # Check that the momentum accumulators have been updated.
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]
+                    ),
+                    self.evaluate(slot0),
+                )
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [
+                            (0.9 * (-0.02) - 2.0 * 0.01),
+                            (0.9 * (-0.02) - 2.0 * 0.01),
+                        ]
+                    ),
+                    self.evaluate(slot1),
+                )
+                # Check that the parameters have been updated.
+ self.assertAllCloseAccordingToType( + np.array( + [ + 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), + 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), + ] + ), + self.evaluate(var0), + ) + self.assertAllCloseAccordingToType( + np.array( + [ + 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), + 3.98 - ((0.9 * 0.01 + 0.01) * 2.0), + ] + ), + self.evaluate(var1), + ) + + def testSparse(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable(tf.zeros([4, 2], dtype=dtype)) + var1 = tf.Variable(tf.constant(1.0, dtype, [4, 2])) + grads0 = tf.IndexedSlices( + tf.constant([[0.1, 0.1]], dtype=dtype), + tf.constant([1]), + tf.constant([4, 2]), + ) + grads1 = tf.IndexedSlices( + tf.constant([[0.01, 0.01], [0.01, 0.01]], dtype=dtype), + tf.constant([2, 3]), + tf.constant([4, 2]), + ) + mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9) + mom_update = mom_opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Check we have slots + slot0 = mom_opt.get_slot(var0, "momentum") + self.assertEqual(slot0.shape, var0.shape) + slot1 = mom_opt.get_slot(var1, "momentum") + self.assertEqual(slot1.shape, var1.shape) + + # Fetch params to validate initial values + self.assertAllClose([0, 0], self.evaluate(var0)[0]) + self.assertAllClose([0, 0], self.evaluate(var0)[1]) + self.assertAllClose([1, 1], self.evaluate(var1)[2]) + + # Step 1: the momentum accumulators are 0. So we should see a + # normal update: v -= grad * learning_rate + self.evaluate(mom_update) + # Check that the momentum accumulators have been updated. + self.assertAllCloseAccordingToType( + np.array([0, 0]), self.evaluate(slot0)[0] + ) + self.assertAllCloseAccordingToType( + np.array([-2.0 * 0.1, -2.0 * 0.1]), self.evaluate(slot0)[1] + ) + self.assertAllCloseAccordingToType( + np.array([-2.0 * 0.01, -2.0 * 0.01]), + self.evaluate(slot1)[2], + ) + # Check that the parameters have been updated. + self.assertAllCloseAccordingToType( + np.array([0, 0]), self.evaluate(var0)[0] + ) + self.assertAllCloseAccordingToType( + np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), + self.evaluate(var0)[1], + ) + self.assertAllCloseAccordingToType( + np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), + self.evaluate(var1)[2], + ) + # Step 2: the momentum accumulators contain the previous update. + self.evaluate(mom_update) + # Check that the momentum accumulators have been updated. + self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0]) + self.assertAllCloseAccordingToType( + np.array( + [(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)] + ), + self.evaluate(slot0)[1], + ) + self.assertAllCloseAccordingToType( + np.array( + [ + (0.9 * (-0.02) - 2.0 * 0.01), + (0.9 * (-0.02) - 2.0 * 0.01), + ] + ), + self.evaluate(slot1)[2], + ) + # Check that the parameters have been updated. + self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0]) + self.assertAllCloseAccordingToType( + np.array( + [ + -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), + -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), + ] + ), + self.evaluate(var0)[1], + ) + self.assertAllCloseAccordingToType( + np.array( + [ + 0.98 - ((0.9 * 0.01 + 0.01) * 2.0), + 0.98 - ((0.9 * 0.01 + 0.01) * 2.0), + ] + ), + self.evaluate(var1)[2], + ) + + def testSharing(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
+        with tf.Graph().as_default():
+            for dtype in [tf.half, tf.float32, tf.float64]:
+                var0 = tf.Variable([1.0, 2.0], dtype=dtype)
+                var1 = tf.Variable([3.0, 4.0], dtype=dtype)
+                grads0 = tf.constant([0.1, 0.1], dtype=dtype)
+                grads1 = tf.constant([0.01, 0.01], dtype=dtype)
+                mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
+                mom_update1 = mom_opt.apply_gradients(
+                    zip([grads0, grads1], [var0, var1])
+                )
+                mom_update2 = mom_opt.apply_gradients(
+                    zip([grads0, grads1], [var0, var1])
+                )
+                self.evaluate(tf.compat.v1.global_variables_initializer())
+
+                slot0 = mom_opt.get_slot(var0, "momentum")
+                self.assertEqual(slot0.shape, var0.shape)
+                slot1 = mom_opt.get_slot(var1, "momentum")
+                self.assertEqual(slot1.shape, var1.shape)
+
+                # Fetch params to validate initial values
+                self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+                self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+                # Step 1: the momentum accumulators were 0. So we should see a
+                # normal update: v -= grad * learning_rate
+                self.evaluate(mom_update1)
+                # Check that the momentum accumulators have been updated.
+                self.assertAllCloseAccordingToType(
+                    np.array([-0.2, -0.2]), self.evaluate(slot0)
+                )
+                self.assertAllCloseAccordingToType(
+                    np.array([-0.02, -0.02]), self.evaluate(slot1)
+                )
+                # Check that the parameters have been updated.
+                self.assertAllCloseAccordingToType(
+                    np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+                    self.evaluate(var0),
+                )
+                self.assertAllCloseAccordingToType(
+                    np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+                    self.evaluate(var1),
+                )
+                # Step 2: the second momentum accumulators contain the previous
+                # update.
+                self.evaluate(mom_update2)
+                # Check that the momentum accumulators have been updated.
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]
+                    ),
+                    self.evaluate(slot0),
+                )
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [
+                            (0.9 * (-0.02) - 2.0 * 0.01),
+                            (0.9 * (-0.02) - 2.0 * 0.01),
+                        ]
+                    ),
+                    self.evaluate(slot1),
+                )
+                # Check that the parameters have been updated.
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [
+                            1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                            2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                        ]
+                    ),
+                    self.evaluate(var0),
+                )
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [
+                            2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                            3.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                        ]
+                    ),
+                    self.evaluate(var1),
+                )
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["graph", "eager"])
+    )
+    def testConfig(self):
+        opt = gradient_descent.SGD(
+            learning_rate=1.0, momentum=0.9, nesterov=True
+        )
+        config = opt.get_config()
+        opt2 = gradient_descent.SGD.from_config(config)
+        lr = opt.lr
+        lr2 = opt2.lr
+        self.evaluate(tf.compat.v1.global_variables_initializer())
+        self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
+        self.assertAllClose(
+            self.evaluate(opt._get_hyper("momentum")),
+            self.evaluate(opt2._get_hyper("momentum")),
+        )
+        self.assertAllClose(
+            self.evaluate(opt._get_hyper("decay")),
+            self.evaluate(opt2._get_hyper("decay")),
+        )
+        var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32)
+        loss = lambda: 3 * var0
+        # learning rate variable created when calling minimize.
+ opt.minimize(loss, [var0]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + config = opt.get_config() + opt3 = gradient_descent.SGD.from_config(config) + lr3 = opt3.lr + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(lr), self.evaluate(lr3)) + self.assertAllClose( + self.evaluate(opt._get_hyper("momentum")), + self.evaluate(opt3._get_hyper("momentum")), + ) + self.assertAllClose( + self.evaluate(opt._get_hyper("decay")), + self.evaluate(opt3._get_hyper("decay")), + ) + self.assertTrue(opt3.nesterov) + + def testNesterovWithoutMomentum(self): + with self.assertRaisesRegex(ValueError, "must be between"): + gradient_descent.SGD(learning_rate=1.0, momentum=2.0) + + def testConstructMomentumWithLR(self): + opt = gradient_descent.SGD(lr=1.0, momentum=0.9) + opt_2 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9, lr=1.0) + opt_3 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9) + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt_2.lr, tf.Variable) + self.assertIsInstance(opt_3.lr, tf.Variable) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testMinimizeLossTensor(self): + for dtype in [tf.half, tf.float32, tf.float64]: + var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) + var1 = tf.Variable([3.0], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + + tape = tf.GradientTape() + with tape: + loss = tf.matmul(var0, x) + var1 + sgd = gradient_descent.SGD(1.0) + with self.assertRaisesRegex(ValueError, "`tape` is required"): + sgd.minimize(loss, [var0, var1]) + sgd.minimize(loss, [var0, var1], tape=tape) + + self.assertAllCloseAccordingToType( + [[1.0 - 4.0, 2.0 - 5.0]], self.evaluate(var0) + ) + self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/nadam.py b/keras/optimizers/legacy/nadam.py index 6884e964e5c5..263ccca4a649 100644 --- a/keras/optimizers/legacy/nadam.py +++ b/keras/optimizers/legacy/nadam.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,243 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Legacy Nadam optimizer implementation.""" +"""Nadam optimizer implementation.""" -from keras.optimizers.optimizer_v2 import nadam +import tensorflow.compat.v2 as tf +from keras import backend_config +from keras.optimizers.legacy import optimizer_v2 +from keras.optimizers.schedules import learning_rate_schedule + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.Nadam') -class Nadam(nadam.Nadam): - pass +@keras_export( + "keras.optimizers.legacy.Nadam", + v1=["keras.optimizers.Nadam", "keras.optimizers.legacy.Nadam"], +) +class Nadam(optimizer_v2.OptimizerV2): + r"""Optimizer that implements the NAdam algorithm. + Much like Adam is essentially RMSprop with momentum, Nadam is Adam with + Nesterov momentum. 
+ + Args: + learning_rate: A Tensor or a floating point value. The learning rate. + beta_1: A float value or a constant float tensor. The exponential decay + rate for the 1st moment estimates. + beta_2: A float value or a constant float tensor. The exponential decay + rate for the exponentially weighted infinity norm. + epsilon: A small constant for numerical stability. + name: Optional name for the operations created when applying gradients. + Defaults to `"Nadam"`. + **kwargs: keyword arguments. Allowed arguments are `clipvalue`, + `clipnorm`, `global_clipnorm`. + If `clipvalue` (float) is set, the gradient of each weight + is clipped to be no higher than this value. + If `clipnorm` (float) is set, the gradient of each weight + is individually clipped so that its norm is no higher than this value. + If `global_clipnorm` (float) is set the gradient of all weights is + clipped so that their global norm is no higher than this value. + + Usage Example: + >>> opt = tf.keras.optimizers.legacy.Nadam(learning_rate=0.2) + >>> var1 = tf.Variable(10.0) + >>> loss = lambda: (var1 ** 2) / 2.0 + >>> step_count = opt.minimize(loss, [var1]).numpy() + >>> "{:.1f}".format(var1.numpy()) + 9.8 + + Reference: + - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf). + """ + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + name="Nadam", + **kwargs + ): + # Backwards compatibility with keras NAdam optimizer. + kwargs["decay"] = kwargs.pop("schedule_decay", 0.004) + learning_rate = kwargs.get("lr", learning_rate) + if isinstance( + learning_rate, learning_rate_schedule.LearningRateSchedule + ): + raise ValueError( + "The Nadam optimizer does not support " + "tf.keras.optimizers.LearningRateSchedules as the " + "learning rate." + ) + + super().__init__(name, **kwargs) + self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) + self._set_hyper("decay", self._initial_decay) + self._set_hyper("beta_1", beta_1) + self._set_hyper("beta_2", beta_2) + self.epsilon = epsilon or backend_config.epsilon() + self._m_cache = None + + def _create_slots(self, var_list): + var_dtype = var_list[0].dtype.base_dtype + if self._m_cache is None: + self._m_cache = self.add_weight( + "momentum_cache", + shape=[], + dtype=var_dtype, + initializer="ones", + trainable=False, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + ) + self._weights.append(self._m_cache) + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + # Create slots for the first moments. + self.add_slot(var, "m") + for var in var_list: + # Create slots for the second moments. 
+ self.add_slot(var, "v") + + def _prepare_local(self, var_device, var_dtype, apply_state): + lr_t = tf.identity(self._get_hyper("learning_rate", var_dtype)) + beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype)) + beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype)) + local_step = tf.cast(self.iterations + 1, var_dtype) + next_step = tf.cast(self.iterations + 2, var_dtype) + + decay_base = tf.cast(0.96, var_dtype) + + m_t = beta_1_t * ( + 1.0 - 0.5 * (tf.pow(decay_base, self._initial_decay * local_step)) + ) + m_t_1 = beta_1_t * ( + 1.0 - 0.5 * (tf.pow(decay_base, self._initial_decay * next_step)) + ) + + m_schedule_new = tf.cast(self._m_cache_read, var_dtype) * m_t + if var_dtype is self._m_cache.dtype: + m_schedule_new = tf.identity( + tf.compat.v1.assign( + self._m_cache, m_schedule_new, use_locking=self._use_locking + ) + ) + m_schedule_next = m_schedule_new * m_t_1 + + apply_state[(var_device, var_dtype)] = dict( + lr_t=lr_t, + neg_lr_t=-lr_t, + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + beta_2_t=beta_2_t, + m_t=m_t, + m_t_1=m_t_1, + one_minus_beta_1_t=1 - beta_1_t, + one_minus_beta_2_t=1 - beta_2_t, + one_minus_m_t=1.0 - m_t, + one_minus_m_schedule_new=1.0 - m_schedule_new, + one_minus_m_schedule_next=1.0 - m_schedule_next, + v_t_prime_denominator=1.0 - tf.pow(beta_2_t, local_step), + ) + + def _prepare(self, var_list): + # Get the value of the momentum cache before starting to apply + # gradients. + self._m_cache_read = tf.identity(self._m_cache) + return super()._prepare(var_list) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + + g_prime = grad / coefficients["one_minus_m_schedule_new"] + m_t = ( + coefficients["beta_1_t"] * m + + coefficients["one_minus_beta_1_t"] * grad + ) + m_t = tf.compat.v1.assign(m, m_t, use_locking=self._use_locking) + m_t_prime = m_t / coefficients["one_minus_m_schedule_next"] + v_t = coefficients["beta_2_t"] * v + coefficients[ + "one_minus_beta_2_t" + ] * tf.square(grad) + v_t = tf.compat.v1.assign(v, v_t, use_locking=self._use_locking) + v_t_prime = v_t / coefficients["v_t_prime_denominator"] + m_t_bar = ( + coefficients["one_minus_m_t"] * g_prime + + coefficients["m_t_1"] * m_t_prime + ) + var_t = var - coefficients["lr_t"] * m_t_bar / ( + tf.sqrt(v_t_prime) + coefficients["epsilon"] + ) + return tf.compat.v1.assign(var, var_t, use_locking=self._use_locking).op + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + + g_prime = grad / coefficients["one_minus_m_schedule_new"] + + # m_t = beta1 * m + (1 - beta1) * g_t + m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"] + m_t = tf.compat.v1.assign( + m, m * coefficients["beta_1_t"], use_locking=self._use_locking + ) + + with tf.control_dependencies([m_t]): + m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) + m_t_slice = tf.gather(m_t, indices) + + m_t_prime = m_t_slice / coefficients["one_minus_m_schedule_next"] + m_t_bar = ( + coefficients["one_minus_m_t"] * g_prime + + coefficients["m_t_1"] * m_t_prime 
+ ) + + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"] + v_t = tf.compat.v1.assign( + v, v * coefficients["beta_2_t"], use_locking=self._use_locking + ) + + with tf.control_dependencies([v_t]): + v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) + v_t_slice = tf.gather(v_t, indices) + + v_t_prime = v_t_slice / coefficients["v_t_prime_denominator"] + v_prime_sqrt_plus_eps = tf.sqrt(v_t_prime) + coefficients["epsilon"] + + var_update = self._resource_scatter_add( + var, + indices, + coefficients["neg_lr_t"] * m_t_bar / v_prime_sqrt_plus_eps, + ) + return tf.group(*[var_update, m_t_bar, v_t]) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "beta_1": self._serialize_hyperparameter("beta_1"), + "beta_2": self._serialize_hyperparameter("beta_2"), + "epsilon": self.epsilon, + } + ) + return config diff --git a/keras/optimizers/legacy/nadam_test.py b/keras/optimizers/legacy/nadam_test.py new file mode 100644 index 000000000000..aee3453c42f1 --- /dev/null +++ b/keras/optimizers/legacy/nadam_test.py @@ -0,0 +1,203 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Nadam.""" + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras.optimizers.legacy import nadam + + +def get_beta_accumulators(opt, dtype): + local_step = tf.cast(opt.iterations + 1, dtype) + beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype) + beta_1_power = tf.pow(beta_1_t, local_step) + beta_2_t = tf.cast(opt._get_hyper("beta_2"), dtype) + beta_2_power = tf.pow(beta_2_t, local_step) + return (beta_1_power, beta_2_power) + + +def update_m_cache(m_cache, t, beta1=0.9): + mu_t = beta1 * (1 - 0.5 * 0.96 ** (0.004 * (t + 1))) + m_cache_t = m_cache * mu_t + return m_cache_t + + +def nadam_update_numpy( + param, + g_t, + t, + m, + v, + m_cache, + alpha=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, +): + + mu_t = beta1 * (1 - 0.5 * 0.96 ** (0.004 * (t + 1))) + mu_t_1 = beta1 * (1 - 0.5 * 0.96 ** (0.004 * (t + 2))) + m_cache_t_1 = m_cache * mu_t_1 + g_prime_t = g_t / (1 - m_cache) + m_t = beta1 * m + (1 - beta1) * g_t + v_t = beta2 * v + (1 - beta2) * g_t * g_t + + m_prime_t = m_t / (1 - m_cache_t_1) + v_prime_t = v_t / (1 - beta2 ** (t + 1)) + m_bar_t = (1 - mu_t) * g_prime_t + mu_t_1 * m_prime_t + + param_t = param - alpha * m_bar_t / (np.sqrt(v_prime_t) + epsilon) + return param_t, m_t, v_t + + +class NadamOptimizerTest(tf.test.TestCase): + def testSparse(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + sparse_epsilon = 1e-7 + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0 + var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array( + [0.01, 0, 0.01], dtype=dtype.as_numpy_dtype + ) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0_np_indices = np.array([0, 2], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np[grads0_np_indices]), + tf.constant(grads0_np_indices), + tf.constant([3]), + ) + grads1_np_indices = np.array([0, 2], dtype=np.int32) + grads1 = tf.IndexedSlices( + tf.constant(grads1_np[grads1_np_indices]), + tf.constant(grads1_np_indices), + tf.constant([3]), + ) + opt = nadam.Nadam(epsilon=sparse_epsilon) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 1.0, 2.0], var0) + self.assertAllClose([3.0, 3.0, 4.0], var1) + + beta1_power, beta2_power = get_beta_accumulators(opt, dtype) + + # Run 3 steps of Nadam + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9 ** (t + 1), beta1_power + ) + self.assertAllCloseAccordingToType( + 0.999 ** (t + 1), beta2_power + ) + update.run() + + mcache = update_m_cache(mcache, t) + var0_np, m0, v0 = nadam_update_numpy( + var0_np, + grads0_np, + t, + m0, + v0, + mcache, + epsilon=sparse_epsilon, + ) + var1_np, m1, v1 = nadam_update_numpy( + var1_np, + grads1_np, + t, + m1, + v1, + mcache, + epsilon=sparse_epsilon, + ) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0) + self.assertAllCloseAccordingToType(var1_np, var1) + + def testBasic(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for dtype in [tf.half, tf.float32, tf.float64]: + with tf.Graph().as_default(), self.cached_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = nadam.Nadam() + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0) + self.assertAllClose([3.0, 4.0], var1) + + # Run 3 steps of Nadam + for t in range(3): + update.run() + + mcache = update_m_cache(mcache, t) + var0_np, m0, v0 = nadam_update_numpy( + var0_np, grads0_np, t, m0, v0, mcache + ) + var1_np, m1, v1 = nadam_update_numpy( + var1_np, grads1_np, t, m1, v1, mcache + ) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0) + self.assertAllCloseAccordingToType(var1_np, var1) + + def testConstructNAdamWithLR(self): + opt = nadam.Nadam(lr=1.0) + opt_2 = nadam.Nadam(learning_rate=0.1, lr=1.0) + opt_3 = nadam.Nadam(learning_rate=0.1) + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt_2.lr, tf.Variable) + self.assertIsInstance(opt_3.lr, tf.Variable) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) + + def testConstructNAdamWithScheduleDecay(self): + opt = nadam.Nadam(schedule_decay=0.2) + self.assertIsInstance(opt.decay, tf.Variable) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.decay), (0.2)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/optimizer_test.py b/keras/optimizers/legacy/optimizer_test.py deleted file mode 100644 index 9c8604509e29..000000000000 --- a/keras/optimizers/legacy/optimizer_test.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Tests for optimizer.""" - -from absl.testing import parameterized -import keras -from keras.optimizers.legacy import adadelta -from keras.optimizers.legacy import adagrad -from keras.optimizers.legacy import adam -from keras.optimizers.legacy import adamax -from keras.optimizers.legacy import ftrl -from keras.optimizers.legacy import nadam -from keras.optimizers.legacy import rmsprop -from keras.optimizers.legacy import sgd -import tensorflow.compat.v2 as tf - -adadelta_fn = tf.__internal__.test.combinations.NamedObject( - "adadelta", lambda: adadelta.Adadelta(0.002)) -adagrad_fn = tf.__internal__.test.combinations.NamedObject( - "adagrad", lambda: adagrad.Adagrad(0.002)) -adam_fn = tf.__internal__.test.combinations.NamedObject( - "adam", lambda: adam.Adam(0.002)) -adamax_fn = tf.__internal__.test.combinations.NamedObject( - "adamax", lambda: adamax.Adamax(0.002)) -ftrl_fn = tf.__internal__.test.combinations.NamedObject( - "ftrl", lambda: ftrl.Ftrl(0.002)) -gradient_descent_fn = tf.__internal__.test.combinations.NamedObject( - "sgd", lambda: sgd.SGD(0.002)) -nadam_fn = tf.__internal__.test.combinations.NamedObject( - "nadam", lambda: nadam.Nadam(0.002)) -rmsprop_fn = tf.__internal__.test.combinations.NamedObject( - "rmsprop", lambda: rmsprop.RMSprop(0.002)) - -OPTIMIZER_FN = [ - adadelta_fn, - adagrad_fn, - adam_fn, - adamax_fn, - ftrl_fn, - 
gradient_descent_fn, - nadam_fn, - rmsprop_fn, -] - - -class OptimizerFuntionalityTest(tf.test.TestCase, parameterized.TestCase): - """Test the functionality of optimizer.""" - - @parameterized.product(optimizer_fn=OPTIMIZER_FN) - def testModelFit(self, optimizer_fn): - model = keras.Sequential( - [keras.layers.Input(shape=(1,)), - keras.layers.Dense(1)]) - optimizer = optimizer_fn() - x = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) - y = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) - model.compile(loss="mse", optimizer=optimizer) - model.fit(x, y, epochs=1, steps_per_epoch=5) - - -if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/optimizers/legacy/optimizer_v2.py b/keras/optimizers/legacy/optimizer_v2.py new file mode 100644 index 000000000000..984d721f0b37 --- /dev/null +++ b/keras/optimizers/legacy/optimizer_v2.py @@ -0,0 +1,1727 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Version 2 of class Optimizer.""" + + +import abc +import contextlib +import functools +import warnings +from copy import deepcopy + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras import initializers +from keras.engine import base_layer_utils +from keras.optimizers import utils as optimizer_utils +from keras.optimizers.schedules import learning_rate_schedule +from keras.utils import generic_utils +from keras.utils import layer_utils +from keras.utils import tf_inspect +from keras.utils import tf_utils + +# isort: off +from tensorflow.python.util.tf_export import keras_export + +keras_optimizers_gauge = tf.__internal__.monitoring.BoolGauge( + "/tensorflow/api/keras/optimizers", "keras optimizer usage", "method" +) + +_DEFAULT_VALID_DTYPES = frozenset( + [ + tf.float16, + tf.bfloat16, + tf.float32, + tf.float64, + tf.complex64, + tf.complex128, + ] +) + + +def _deduplicate_indexed_slices(values, indices): + """Sums `values` associated with any non-unique `indices`. + + Args: + values: A `Tensor` with rank >= 1. + indices: A one-dimensional integer `Tensor`, indexing into the first + dimension of `values` (as in an IndexedSlices object). + + Returns: + A tuple of (`summed_values`, `unique_indices`) where `unique_indices` is a + de-duplicated version of `indices` and `summed_values` contains the sum of + `values` slices associated with each unique index. 
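Before the implementation that follows, a hypothetical numeric illustration of this contract: duplicate index 0 appears twice, so its two value rows are summed.

```python
# Illustration of the de-duplication contract (assumed toy shapes).
import tensorflow as tf

values = tf.constant([[1.0], [2.0], [3.0]])
indices = tf.constant([0, 0, 2])
unique_indices, positions = tf.unique(indices)  # [0, 2] and [0, 0, 1]
summed = tf.math.unsorted_segment_sum(
    values, positions, tf.shape(unique_indices)[0]
)  # [[3.0], [3.0]]
```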
+ """ + unique_indices, new_index_positions = tf.unique(indices) + summed_values = tf.math.unsorted_segment_sum( + values, new_index_positions, tf.shape(unique_indices)[0] + ) + return (summed_values, unique_indices) + + +class NullContextmanager: + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + pass + + def __exit__(self, type_arg, value_arg, traceback_arg): + return False # False values do not suppress exceptions + + +def name_scope_only_in_function_or_graph(name): + """Internal-only entry point for `name_scope*`. + + Enters a compat.v1.name_scope only when in a function or graph, + not when running fully eagerly. + + Args: + name: The name argument that is passed to the op function. + + Returns: + `name_scope*` context manager. + """ + if not tf.executing_eagerly(): + return tf.name_scope(name) + else: + return NullContextmanager() + + +@keras_export( + "keras.optimizers.legacy.Optimizer", + v1=["keras.optimizers.Optimizer", "keras.optimizers.legacy.Optimizer"], +) +class OptimizerV2(tf.__internal__.tracking.Trackable): + """Base class for legacy Keras optimizers. + + You should not use this class directly, but instead instantiate one of its + subclasses such as `tf.keras.optimizers.legacy.SGD`, + `tf.keras.optimizers.legacy.Adam`, etc. + + This is the default Keras optimizer base class until v2.10 (included). + In v2.11 and later, `tf.keras.optimizers.Optimizer` + points to a new base class implementation. The legacy class won't be + deleted in the future and will continue to be available at + `tf.keras.optimizers.legacy.Optimizer`. + + ### Usage + + ```python + # Create an optimizer with the desired parameters. + opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1) + # `loss` is a callable that takes no argument and returns the value + # to minimize. + var1 = tf.Variable(2.0) + var2 = tf.Variable(5.0) + loss = lambda: 3 * var1 * var1 + 2 * var2 * var2 + # In graph mode, returns op that minimizes the loss by updating the listed + # variables. + opt_op = opt.minimize(loss, var_list=[var1, var2]) + opt_op.run() + # In eager mode, simply call minimize to update the list of variables. + opt.minimize(loss, var_list=[var1, var2]) + ``` + + ### Usage in custom training loops + + In Keras models, sometimes variables are created when the model is first + called, instead of construction time. Examples include 1) sequential models + without input shape pre-defined, or 2) subclassed models. Pass var_list as + callable in these cases. + + Example: + + ```python + opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1) + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(num_hidden, activation='relu')) + model.add(tf.keras.layers.Dense(num_classes, activation='sigmoid')) + loss_fn = lambda: tf.keras.losses.mse(model(input), output) + var_list_fn = lambda: model.trainable_weights + for input, output in data: + opt.minimize(loss_fn, var_list_fn) + ``` + + ### Processing gradients before applying them + + Calling `minimize()` takes care of both computing the gradients and + applying them to the variables. If you want to process the gradients + before applying them you can instead use the optimizer in three steps: + + 1. Compute the gradients with `tf.GradientTape`. + 2. Process the gradients as you wish. + 3. Apply the processed gradients with `apply_gradients()`. + + Example: + + ```python + # Create an optimizer. + opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1) + + # Compute the gradients for a list of variables. 
+    with tf.GradientTape() as tape:
+      loss = <call_loss_function>
+    vars = <list_of_variables>
+    grads = tape.gradient(loss, vars)
+
+    # Process the gradients, for example cap them, etc.
+    # capped_grads = [MyCapper(g) for g in grads]
+    processed_grads = [process_gradient(g) for g in grads]
+
+    # Ask the optimizer to apply the processed gradients.
+    opt.apply_gradients(zip(processed_grads, var_list))
+    ```
+
+    ### Use with `tf.distribute.Strategy`
+
+    This optimizer class is `tf.distribute.Strategy` aware, which means it
+    automatically sums gradients across all replicas. To average gradients,
+    you divide your loss by the global batch size, which is done
+    automatically if you use `tf.keras` built-in training or evaluation
+    loops. See the `reduction` argument of your loss, which should be set to
+    `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` for averaging or
+    `tf.keras.losses.Reduction.SUM` for not averaging.
+
+    To aggregate gradients yourself, call `apply_gradients` with
+    `experimental_aggregate_gradients` set to False. This is useful if you
+    need to process aggregated gradients.
+
+    If you are not using these and you want to average gradients, you should
+    use `tf.math.reduce_sum` to add up your per-example losses and then
+    divide by the global batch size. Note that when using
+    `tf.distribute.Strategy`, the first component of a tensor's shape is the
+    *replica-local* batch size, which is off by a factor equal to the number
+    of replicas being used to compute a single step. As a result, using
+    `tf.math.reduce_mean` will give the wrong answer, resulting in gradients
+    that can be many times too big.
+
+    ### Variable Constraints
+
+    All Keras optimizers respect variable constraints. If a constraint
+    function is passed to any variable, the constraint will be applied to
+    the variable after the gradient has been applied to the variable.
+    Important: If the gradient is a sparse tensor, variable constraints are
+    not supported.
+
+    ### Thread Compatibility
+
+    The entire optimizer is currently thread compatible, not thread-safe.
+    The user needs to perform synchronization if necessary.
+
+    ### Slots
+
+    Many optimizer subclasses, such as `Adam` and `Adagrad`, allocate and
+    manage additional variables associated with the variables to train.
+    These are called Slots. Slots have names, and you can ask the optimizer
+    for the names of the slots that it uses. Once you have a slot name you
+    can ask the optimizer for the variable it created to hold the slot
+    value.
+
+    This can be useful if you want to log or debug a training algorithm,
+    report stats about the slots, etc.
+
+    ### Hyperparameters
+
+    These are arguments passed to the optimizer subclass constructor
+    (the `__init__` method), and then passed to `self._set_hyper()`.
+    They can be either regular Python values (like 1.0), tensors, or
+    callables. If they are callable, the callable will be called during
+    `apply_gradients()` to get the value for the hyperparameter.
+
+    Hyperparameters can be overwritten through user code:
+
+    Example:
+
+    ```python
+    # Create an optimizer with the desired parameters.
+    opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
+    # `loss` is a callable that takes no argument and returns the value
+    # to minimize.
+    loss = lambda: 3 * var1 + 2 * var2
+    # In eager mode, simply call minimize to update the list of variables.
+    opt.minimize(loss, var_list=[var1, var2])
+    # update learning rate
+    opt.learning_rate = 0.05
+    opt.minimize(loss, var_list=[var1, var2])
+    ```
+
+    ### Callable learning rate
+
+    Optimizer accepts a callable learning rate in two ways.
+    The first way is
+    through built-in or customized
+    `tf.keras.optimizers.schedules.LearningRateSchedule`. The schedule will
+    be called on each iteration with `schedule(iteration)`, a `tf.Variable`
+    owned by the optimizer.
+
+    Example:
+
+    >>> var = tf.Variable(np.random.random(size=(1,)))
+    >>> learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
+    ... initial_learning_rate=.01, decay_steps=20, decay_rate=.1)
+    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=learning_rate)
+    >>> loss = lambda: 3 * var
+    >>> opt.minimize(loss, var_list=[var])
+    <tf.Variable...
+
+    The second way is through a callable function that
+    does not accept any arguments.
+
+    Example:
+
+    >>> var = tf.Variable(np.random.random(size=(1,)))
+    >>> def lr_callable():
+    ...   return .1
+    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=lr_callable)
+    >>> loss = lambda: 3 * var
+    >>> opt.minimize(loss, var_list=[var])
+    <tf.Variable...
+
+    ### Creating a custom optimizer
+
+    If you intend to create your own optimization algorithm, simply inherit
+    from this class and override the following methods:
+
+      - `_resource_apply_dense` (update variable given gradient tensor is a
+        dense `tf.Tensor`)
+      - `_resource_apply_sparse` (update variable given gradient tensor is a
+        sparse `tf.IndexedSlices`. The most common way for this to happen is
+        if you are taking the gradient through a `tf.gather`.)
+      - `_create_slots`
+        (if your optimizer algorithm requires additional variables)
+      - `get_config`
+        (serialization of the optimizer, include all hyper parameters)
+    """
+
+    # Subclasses should set this to True unless they override
+    # `apply_gradients` with a version that does not have the
+    # `experimental_aggregate_gradients` argument. Older versions of Keras
+    # did not have this argument so custom optimizers may have overridden
+    # `apply_gradients` without the `experimental_aggregate_gradients`
+    # argument. Keras only passes `experimental_aggregate_gradients` if this
+    # attribute is True.
+    # Note: This attribute will likely be removed in an upcoming release.
+    _HAS_AGGREGATE_GRAD = False
+
+    def __init__(
+        self,
+        name,
+        gradient_aggregator=None,
+        gradient_transformers=None,
+        **kwargs,
+    ):
+        """Create a new Optimizer.
+
+        This must be called by the constructors of subclasses.
+        Note that Optimizer instances should not bind to a single graph,
+        and so shouldn't keep Tensors as attributes.
+
+        Args:
+          name: String. The name to use for momentum accumulator weights
+            created by the optimizer.
+          gradient_aggregator: The function to use to aggregate gradients
+            across devices (when using `tf.distribute.Strategy`). If `None`,
+            defaults to summing the gradients across devices. The function
+            should accept and return a list of `(gradient, variable)` tuples.
+          gradient_transformers: Optional. List of functions to use to
+            transform gradients before applying updates to Variables. The
+            functions are applied after `gradient_aggregator`. The functions
+            should accept and return a list of `(gradient, variable)` tuples.
+          **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
+            `clipnorm`, `global_clipnorm`. If `clipvalue` (float) is set, the
+            gradient of each weight is clipped to be no higher than this
+            value. If `clipnorm` (float) is set, the gradient of each weight
+            is individually clipped so that its norm is no higher than this
+            value. If `global_clipnorm` (float) is set the gradient of all
+            weights is clipped so that their global norm is no higher than
+            this value.
+
+        Raises:
+          ValueError: in case of any invalid argument.
+        """
+        # Instrument optimizer usages
+        keras_optimizers_gauge.get_cell(self.__class__.__name__).set(True)
+
+        allowed_kwargs = {
+            "clipnorm",
+            "clipvalue",
+            "lr",
+            "decay",
+            "global_clipnorm",
+        }
+        for k in kwargs:
+            if k not in allowed_kwargs:
+                raise TypeError(
+                    "Unexpected keyword argument passed to optimizer: "
+                    + str(k)
+                    + ". Allowed kwargs are {}.".format(allowed_kwargs)
+                )
+            # Checks that all keyword arguments are non-negative.
+            if kwargs[k] is not None and kwargs[k] < 0:
+                raise ValueError(
+                    f"Expected {k} >= 0, received: {kwargs[k]}"
+                )
+            if k == "lr":
+                warnings.warn(
+                    "The `lr` argument is deprecated, "
+                    "use `learning_rate` instead.",
+                    stacklevel=2,
+                )
+
+        self._use_locking = True
+        self._init_set_name(name)
+        self._hyper = {}
+        # dict: {variable name : {slot name : variable}}
+        self._slots = {}
+        self._slot_names = []
+        self._weights = []
+        self._iterations = None
+
+        # For implementing Trackable. Stores information about how to restore
+        # slot variables which have not yet been created
+        # (trackable._CheckpointPosition objects).
+        # {slot_name :
+        #     {_var_key(variable_to_train): [checkpoint_position, ...], ...},
+        #  ... }
+        self._deferred_slot_restorations = {}
+
+        decay = kwargs.pop("decay", 0.0)
+        if decay < 0.0:
+            raise ValueError(
+                f"decay cannot be less than 0. Received: decay={decay}."
+            )
+        self._initial_decay = decay
+
+        self._hypers_created = False
+        # Store the distribution strategy object if the optimizer is created
+        # inside strategy scope, so it could be used to create variables
+        # later.
+        if tf.distribute.has_strategy():
+            self._distribution_strategy = tf.distribute.get_strategy()
+        else:
+            self._distribution_strategy = None
+
+        # Configure gradient transformations.
+        if gradient_aggregator is None:
+            gradient_aggregator = optimizer_utils.all_reduce_sum_gradients
+        self.gradient_aggregator = gradient_aggregator
+        if gradient_transformers is None:
+            gradient_transformers = []
+        self.gradient_transformers = gradient_transformers
+        self.clipnorm = kwargs.pop("clipnorm", None)
+        self.global_clipnorm = kwargs.pop("global_clipnorm", None)
+        if self.clipnorm is not None and self.global_clipnorm is not None:
+            raise ValueError(
+                "Cannot accept both `clipnorm` and `global_clipnorm`. "
+                "Received: `clipnorm`={}, `global_clipnorm`={}.".format(
+                    self.clipnorm, self.global_clipnorm
+                )
+            )
+        self.clipvalue = kwargs.pop("clipvalue", None)
+
+    def __deepcopy__(self, memo):
+        cls = self.__class__
+        result = cls.__new__(cls)
+        memo[id(self)] = result
+        for k, v in self.__dict__.items():
+            # DistributionStrategy singleton cannot be serialized
+            if k == "_distribution_strategy":
+                continue
+            setattr(result, k, deepcopy(v, memo))
+        result._distribution_strategy = self._distribution_strategy
+        return result
+
+    @property
+    def clipnorm(self):
+        """`float` or `None`. If set, clips gradients to a maximum norm."""
+        return self._clipnorm
+
+    @property
+    def global_clipnorm(self):
+        """`float` or `None`.
+
+        If set, clips gradients to a maximum norm.
+
+        Check `tf.clip_by_global_norm` for more details.
+ """ + return self._global_clipnorm + + @clipnorm.setter + def clipnorm(self, val): + if val is not None and self.gradient_transformers: + raise ValueError( + "`clipnorm` cannot be set when `gradient_transformers` " + "is set. Instead, use the `gradient_transformers` to " + "specify clipping and other transformations. Received: " + f"val={val}, " + f"gradient_transformers={self.gradient_transformers}." + ) + self._clipnorm = val + self._clipnorm_fn = optimizer_utils.make_gradient_clipnorm_fn( + self._clipnorm + ) + + @global_clipnorm.setter + def global_clipnorm(self, val): + if val is not None and self.gradient_transformers: + raise ValueError( + "`global_clipnorm` cannot be set when " + "`gradient_transformers` " + "is set. Instead, use the `gradient_transformers` to " + "specify clipping and other transformations. Received: " + f"val={val}, " + f"gradient_transformers={self.gradient_transformers}." + ) + self._global_clipnorm = val + self._global_clipnorm_fn = ( + optimizer_utils.make_global_gradient_clipnorm_fn( + self._global_clipnorm + ) + ) + + @property + def clipvalue(self): + """`float` or `None`. If set, clips gradients to a maximum value.""" + return self._clipvalue + + @clipvalue.setter + def clipvalue(self, val): + if val is not None and self.gradient_transformers: + raise ValueError( + "`clipvalue` cannot be set when `gradient_transformers` " + "is set. Instead, use the `gradient_transformers` to " + "specify clipping and other transformations. Received: " + f"val={val}, " + f"gradient_transformers={self.gradient_transformers}." + ) + self._clipvalue = val + self._clipvalue_fn = optimizer_utils.make_gradient_clipvalue_fn( + self._clipvalue + ) + + def _transform_loss(self, loss): + """Called in `.minimize` to transform loss before computing + gradients.""" + return loss + + def _get_gradients(self, tape, loss, var_list, grad_loss=None): + """Called in `minimize` to compute gradients from loss.""" + grads = tape.gradient(loss, var_list, grad_loss) + return list(zip(grads, var_list)) + + def _transform_unaggregated_gradients(self, grads_and_vars): + """Called in `apply_gradients` before gradient aggregation.""" + return grads_and_vars + + def _aggregate_gradients(self, grads_and_vars): + """Called in `apply_gradients` to aggregate gradients across devices. + + Note that user subclasses may override this, so the interface should not + be changed. + + Args: + grads_and_vars: List of (gradient, variable) pairs. + + Returns: + A list of (aggregrated_gradient, variable) pairs. By default, this + calls `self.gradient_aggregator`. + """ + return self.gradient_aggregator(grads_and_vars) + + def _transform_gradients(self, grads_and_vars): + """Called in `apply_gradients` after aggregation.""" + if self._clipvalue is not None: + grads_and_vars = self._clipvalue_fn(grads_and_vars) + if self._clipnorm is not None: + grads_and_vars = self._clipnorm_fn(grads_and_vars) + if self._global_clipnorm is not None: + grads_and_vars = self._global_clipnorm_fn(grads_and_vars) + + for fn in self.gradient_transformers: + grads_and_vars = fn(grads_and_vars) + return grads_and_vars + + def minimize(self, loss, var_list, grad_loss=None, name=None, tape=None): + """Minimize `loss` by updating `var_list`. + + This method simply computes gradient using `tf.GradientTape` and calls + `apply_gradients()`. If you want to process the gradient before applying + then call `tf.GradientTape` and `apply_gradients()` explicitly instead + of using this function. + + Args: + loss: `Tensor` or callable. 
If a callable, `loss` should take no + arguments and return the value to minimize. If a `Tensor`, the + `tape` argument must be passed. + var_list: list or tuple of `Variable` objects to update to minimize + `loss`, or a callable returning the list or tuple of `Variable` + objects. Use callable when the variable list would otherwise be + incomplete before `minimize` since the variables are created at the + first time `loss` is called. + grad_loss: (Optional). A `Tensor` holding the gradient computed for + `loss`. + name: (Optional) str. Name for the returned operation. + tape: (Optional) `tf.GradientTape`. If `loss` is provided as a + `Tensor`, the tape that computed the `loss` must be provided. + + Returns: + An `Operation` that updates the variables in `var_list`. The + `iterations` will be automatically increased by 1. + + Raises: + ValueError: If some of the variables are not `Variable` objects. + + """ + grads_and_vars = self._compute_gradients( + loss, var_list=var_list, grad_loss=grad_loss, tape=tape + ) + return self.apply_gradients(grads_and_vars, name=name) + + def _compute_gradients(self, loss, var_list, grad_loss=None, tape=None): + """Compute gradients of `loss` for the variables in `var_list`. + + This is the first part of `minimize()`. It returns a list + of (gradient, variable) pairs where "gradient" is the gradient + for "variable". Note that "gradient" can be a `Tensor`, an + `IndexedSlices`, or `None` if there is no gradient for the + given variable. + + Args: + loss: `Tensor` or callable. If a callable, `loss` should take no + arguments and return the value to minimize. If a `Tensor`, the + `tape` argument must be passed. + var_list: list or tuple of `Variable` objects to update to minimize + `loss`, or a callable returning the list or tuple of `Variable` + objects. Use callable when the variable list would otherwise be + incomplete before `minimize` and the variables are created at the + first time when `loss` is called. + grad_loss: Optional. A `Tensor` holding the gradient computed for + `loss`. + tape: (Optional) `tf.GradientTape`. If `loss` is provided as a + `Tensor`, the tape that computed the `loss` must be provided. + + Returns: + A list of (gradient, variable) pairs. Variable is always present, but + gradient can be `None`. + + Raises: + TypeError: If `var_list` contains anything else than `Variable` + objects. + ValueError: If some arguments are invalid, or var_list is None. + """ + # TODO(joshl): Test that we handle weight decay in a reasonable way. + if not callable(loss) and tape is None: + raise ValueError( + "`tape` is required when a `Tensor` loss is passed. " + f"Received: loss={loss}, tape={tape}." + ) + tape = tape if tape is not None else tf.GradientTape() + + if callable(loss): + with tape: + if not callable(var_list): + tape.watch(var_list) + loss = loss() + if callable(var_list): + var_list = var_list() + + with tape: + loss = self._transform_loss(loss) + + var_list = tf.nest.flatten(var_list) + with tf.name_scope(self._name + "/gradients"): + grads_and_vars = self._get_gradients( + tape, loss, var_list, grad_loss + ) + + self._assert_valid_dtypes( + [ + v + for g, v in grads_and_vars + if g is not None and v.dtype != tf.resource + ] + ) + + return grads_and_vars + + def apply_gradients( + self, grads_and_vars, name=None, experimental_aggregate_gradients=True + ): + """Apply gradients to variables. + + This is the second part of `minimize()`. It returns an `Operation` that + applies gradients. 
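A hedged end-to-end sketch of the two-step split described here, using `tf.keras.optimizers.legacy.SGD` (available under this path in TF 2.11+):

```python
# Manual two-step equivalent of minimize(): tape, then apply_gradients.
import tensorflow as tf

var = tf.Variable(2.0)
opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
with tf.GradientTape() as tape:
    loss = var * var
grads = tape.gradient(loss, [var])      # step 1: compute gradients
opt.apply_gradients(zip(grads, [var]))  # step 2: apply them
print(var.numpy())  # 2.0 - 0.1 * 4.0 = 1.6
```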
+ + The method sums gradients from all replicas in the presence of + `tf.distribute.Strategy` by default. You can aggregate gradients + yourself by passing `experimental_aggregate_gradients=False`. + + Example: + + ```python + grads = tape.gradient(loss, vars) + grads = tf.distribute.get_replica_context().all_reduce('sum', grads) + # Processing aggregated gradients. + optimizer.apply_gradients(zip(grads, vars), + experimental_aggregate_gradients=False) + + ``` + + Args: + grads_and_vars: List of (gradient, variable) pairs. + name: Optional name for the returned operation. When `None`, uses the + name passed to the `Optimizer` constructor. Defaults to `None`. + experimental_aggregate_gradients: Whether to sum gradients from + different replicas in the presence of `tf.distribute.Strategy`. If + False, it's user responsibility to aggregate the gradients. Default + to `True`. + + Returns: + An `Operation` that applies the specified gradients. The `iterations` + will be automatically increased by 1. + + Raises: + TypeError: If `grads_and_vars` is malformed. + ValueError: If none of the variables have gradients. + RuntimeError: If called in a cross-replica context. + """ + grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars) + var_list = [v for (_, v) in grads_and_vars] + + with tf.name_scope(self._name): + # Create iteration if necessary. + with tf.init_scope(): + self._create_all_weights(var_list) + + if not grads_and_vars: + # Distribution strategy does not support reducing an empty list + # of gradients + return tf.no_op() + + if tf.distribute.in_cross_replica_context(): + raise RuntimeError( + "`apply_gradients() cannot be called in cross-replica " + "context. Use `tf.distribute.Strategy.run` to enter " + "replica context. For more information, please see the " + "docstring of `tf.distribute.get_replica_context`." + ) + + strategy = tf.distribute.get_strategy() + if ( + not experimental_aggregate_gradients + and strategy + and isinstance( + strategy, + ( + tf.compat.v1.distribute.experimental.ParameterServerStrategy, # noqa: E501 + tf.distribute.experimental.ParameterServerStrategy, + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, # noqa: E501 + ), + ) + ): + raise NotImplementedError( + "`experimental_aggregate_gradients=False is not supported " + "for ParameterServerStrategy and CentralStorageStrategy. " + f"Used: strategy={strategy}." + ) + + apply_state = self._prepare(var_list) + if experimental_aggregate_gradients: + grads_and_vars = self._transform_unaggregated_gradients( + grads_and_vars + ) + grads_and_vars = self._aggregate_gradients(grads_and_vars) + grads_and_vars = self._transform_gradients(grads_and_vars) + + return tf.__internal__.distribute.interim.maybe_merge_call( + functools.partial( + self._distributed_apply, apply_state=apply_state + ), + strategy, + grads_and_vars, + name=name, + ) + + def _distributed_apply( + self, distribution, grads_and_vars, apply_state, name + ): + """`apply_gradients` using a `DistributionStrategy`.""" + + def apply_grad_to_update_var(var, grad): + """Apply gradient to variable.""" + if isinstance(var, tf.Tensor): + raise NotImplementedError( + "Updating a `Tensor` is not implemented. " + f"Received: var={var}." + ) + + apply_kwargs = {} + if isinstance(grad, tf.IndexedSlices): + if var.constraint is not None: + raise RuntimeError( + "Cannot use a constraint function on a sparse " + f"variable. Received: grad={grad}, " + f"var.constraint={var.constraint}." 
+ ) + if "apply_state" in self._sparse_apply_args: + apply_kwargs["apply_state"] = apply_state + return self._resource_apply_sparse_duplicate_indices( + grad.values, var, grad.indices, **apply_kwargs + ) + + if "apply_state" in self._dense_apply_args: + apply_kwargs["apply_state"] = apply_state + update_op = self._resource_apply_dense(grad, var, **apply_kwargs) + if var.constraint is not None: + with tf.control_dependencies([update_op]): + return var.assign(var.constraint(var)) + else: + return update_op + + eagerly_outside_functions = ( + tf.compat.v1.executing_eagerly_outside_functions() + ) + update_ops = [] + with name_scope_only_in_function_or_graph(name or self._name): + for grad, var in grads_and_vars: + # Colocate the update with variables to avoid unnecessary + # communication delays. See b/136304694. + with distribution.extended.colocate_vars_with(var): + with name_scope_only_in_function_or_graph( + "update" + if eagerly_outside_functions + else "update_" + var.op.name + ): + update_op = distribution.extended.update( + var, + apply_grad_to_update_var, + args=(grad,), + group=False, + ) + if tf.distribute.in_cross_replica_context(): + # In cross-replica context, extended.update returns + # a list of update ops from all replicas + # (group=False). + update_ops.extend(update_op) + else: + # In replica context, extended.update return the + # single update op of current replica. + update_ops.append(update_op) + + any_symbolic = any( + isinstance(i, tf.Operation) or tf_utils.is_symbolic_tensor(i) + for i in update_ops + ) + if not tf.executing_eagerly() or any_symbolic: + # If the current context is graph mode or any of the update ops + # are symbolic then the step update should be carried out under + # a graph context. (eager updates execute immediately) + with backend._current_graph(update_ops).as_default(): + with tf.control_dependencies([tf.group(update_ops)]): + return self.iterations.assign_add(1, read_value=False) + + return self.iterations.assign_add(1) + + def get_gradients(self, loss, params): + """Returns gradients of `loss` with respect to `params`. + + Should be used only in legacy v1 graph mode. + + Args: + loss: Loss tensor. + params: List of variables. + + Returns: + List of gradient tensors. + + Raises: + ValueError: In case any gradient cannot be computed (e.g. if gradient + function not implemented). + """ + params = tf.nest.flatten(params) + with backend.get_graph().as_default(), backend.name_scope( + self._name + "/gradients" + ): + grads = tf.compat.v1.gradients(loss, params) + for grad, param in zip(grads, params): + if grad is None: + raise ValueError( + "Variable {} has `None` for gradient. " + "Please make sure that all of your ops have a " + "gradient defined (i.e. are differentiable). " + "Common ops without gradient: " + "K.argmax, K.round, K.eval.".format(param) + ) + return grads + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + grads_and_vars = list(zip(grads, params)) + self._assert_valid_dtypes( + [ + v + for g, v in grads_and_vars + if g is not None and v.dtype != tf.resource + ] + ) + return [self.apply_gradients(grads_and_vars)] + + def _set_hyper(self, name, value): + """set hyper `name` to value. 
value can be callable, tensor, numeric.""" + if isinstance(value, tf.__internal__.tracking.Trackable): + self._track_trackable(value, name, overwrite=True) + if name not in self._hyper: + self._hyper[name] = value + else: + prev_value = self._hyper[name] + if ( + callable(prev_value) + or isinstance( + prev_value, + ( + tf.Tensor, + int, + float, + learning_rate_schedule.LearningRateSchedule, + ), + ) + or isinstance( + value, learning_rate_schedule.LearningRateSchedule + ) + ): + self._hyper[name] = value + else: + backend.set_value(self._hyper[name], value) + + def _get_hyper(self, name, dtype=None): + if not self._hypers_created: + self._create_hypers() + value = self._hyper[name] + if isinstance(value, learning_rate_schedule.LearningRateSchedule): + return value + if callable(value): + value = value() + if dtype: + return tf.cast(value, dtype) + else: + return value + + def _create_slots(self, var_list): + pass + + def _create_slots_for_sharded_variables(self, var_list): + """Add ShardedVariables to slots to later reconstruct for checkpointing. + + ShardedVariables don't have slot variables created for them; their + shards do. This function allows users to call get_slot with a + ShardedVariable input and receive a ShardedVariable output containing + the appropriate slot vars. + + Iterate over the variables to find shards, and aggregate the sharded + containers in a set. Add these ShardedVariables to _slots so that + get_slot can retrieve the proper slot variables for their component + shards, and reconstruct those into a ShardedVariable. + + Args: + var_list: list or tuple of `Variable` objects that will be minimized + using this optimizer. + """ + sharded_vars = set() + for var in var_list: + if getattr(var, "_sharded_container", False): + sharded_vars.add(var._sharded_container()) + + for sharded_var in sharded_vars: + sharded_key = _var_key(sharded_var) + slot_dict = {} + for slot in self.get_slot_names(): + slot_dict[slot] = sharded_var + self._slots[sharded_key] = slot_dict + + def _create_all_weights(self, var_list): + """Creates all weights, including iterations, hyperparameters and slot + vars. + + This will add newly created variables to `optimizer.weights`. + + New variables are only created when this method is called the first + time, or when called with different variables in the var_list. + + Args: + var_list: list or tuple of `Variable` objects that will be minimized + using this optimizer. + """ + + _ = self.iterations + self._create_hypers() + self._create_slots(var_list) + self._create_slots_for_sharded_variables(var_list) + + def __getattribute__(self, name): + """Overridden to support hyperparameter access.""" + try: + return super().__getattribute__(name) + except AttributeError as e: + # Needed to avoid infinite recursion with __setattr__. + if name == "_hyper": + raise e + # Backwards compatibility with Keras optimizers. + if name == "lr": + name = "learning_rate" + if name in self._hyper: + return self._get_hyper(name) + raise e + + def __dir__(self): + result = set(super().__dir__()) + if "_hyper" in result: + result |= self._hyper.keys() + if "learning_rate" in self._hyper.keys(): + result.add("lr") + return list(result) + + def __setattr__(self, name, value): + """Override setattr to support dynamic hyperparameter setting.""" + # Backwards compatibility with Keras optimizers. 
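The `lr`/`learning_rate` aliasing handled in `__getattribute__` above (and continued in `__setattr__` just below) can be exercised directly; a minimal sketch, assuming eager mode:

```python
# Round-trip through the lr <-> learning_rate alias.
import tensorflow as tf

opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
opt.lr = 0.05  # routed to _set_hyper("learning_rate", 0.05)
print(float(opt.learning_rate))  # 0.05
print(float(opt.lr))             # 0.05 -- "lr" is mapped back on read
```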
+ if name == "lr": + name = "learning_rate" + if hasattr(self, "_hyper") and name in self._hyper: + self._set_hyper(name, value) + else: + super().__setattr__(name, value) + + def get_slot_names(self): + """A list of names for this optimizer's slots.""" + return self._slot_names + + def add_slot(self, var, slot_name, initializer="zeros", shape=None): + """Add a new slot variable for `var`. + + A slot variable is an additional variable associated with `var` to + train. It is allocated and managed by optimizers, e.g. `Adam`. + + Args: + var: a `Variable` object. + slot_name: name of the slot variable. + initializer: initializer of the slot variable + shape: (Optional) shape of the slot variable. If not set, it will + default to the shape of `var`. + + Returns: + A slot variable. + """ + if slot_name not in self._slot_names: + self._slot_names.append(slot_name) + var_key = _var_key(var) + slot_dict = self._slots.setdefault(var_key, {}) + weight = slot_dict.get(slot_name, None) + if weight is None: + if isinstance(initializer, str) or callable(initializer): + initializer = initializers.get(initializer) + if isinstance( + initializer, + tf.__internal__.tracking.CheckpointInitialValueCallable, + ) or (shape is not None): + slot_shape = shape + else: + slot_shape = var.shape + initial_value = functools.partial( + initializer, shape=slot_shape, dtype=var.dtype + ) + else: + initial_value = initializer + + with self._distribution_strategy_scope(): + strategy = tf.distribute.get_strategy() + if not strategy.extended.variable_created_in_scope(var): + raise ValueError( + "Trying to create optimizer slot variable under the " + "scope for tf.distribute.Strategy ({}), which is " + "different from the scope used for the original " + "variable ({}). Make sure the slot variables are " + "created under the same strategy scope. This may " + "happen if you're restoring from a checkpoint " + "outside the scope.".format(strategy, var) + ) + + with strategy.extended.colocate_vars_with(var): + weight = tf.Variable( + name=f"{var._shared_name}/{slot_name}", + dtype=var.dtype, + trainable=False, + initial_value=initial_value, + ) + backend.track_variable(weight) + slot_dict[slot_name] = weight + self._restore_slot_variable( + slot_name=slot_name, variable=var, slot_variable=weight + ) + self._weights.append(weight) + return weight + + def get_slot(self, var, slot_name): + var_key = _var_key(var) + slot_dict = self._slots[var_key] + slot_variable = slot_dict[slot_name] + if isinstance( + slot_variable, tf.__internal__.distribute.ShardedVariable + ): + # Construct a ShardedVariable that points to the input + # ShardedVariable's component shard's slot variables. 
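For the common non-sharded case, the `add_slot`/`get_slot` bookkeeping above can be observed with any slot-using optimizer; a hedged sketch with the legacy Adam:

```python
# Inspecting slot variables created for a trained variable.
import tensorflow as tf

var = tf.Variable([1.0, 2.0])
opt = tf.keras.optimizers.legacy.Adam()
opt.apply_gradients([(tf.constant([0.1, 0.1]), var)])  # creates the slots
print(opt.get_slot_names())      # ['m', 'v'] for Adam
m_slot = opt.get_slot(var, "m")  # first-moment accumulator paired with var
```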
+ shard_vars = [] + for shard in slot_variable.variables: + slot_shard = self.get_slot(shard, slot_name) + shard_vars.append(slot_shard) + slot_variable = tf.__internal__.distribute.ShardedVariable( + shard_vars, name=slot_variable.name + ) + return slot_variable + + def _prepare(self, var_list): + keys = set() + for var in var_list: + if isinstance(var, tf.distribute.DistributedValues): + var_devices = var._devices + else: + var_devices = [var.device] + var_dtype = var.dtype.base_dtype + for var_device in var_devices: + keys.add((var_device, var_dtype)) + + apply_state = {} + for var_device, var_dtype in keys: + apply_state[(var_device, var_dtype)] = {} + with tf.device(var_device): + self._prepare_local(var_device, var_dtype, apply_state) + + return apply_state + + def _prepare_local(self, var_device, var_dtype, apply_state): + if "learning_rate" in self._hyper: + lr_t = tf.identity(self._decayed_lr(var_dtype)) + apply_state[(var_device, var_dtype)]["lr_t"] = lr_t + + def _fallback_apply_state(self, var_device, var_dtype): + """Compatibility for subclasses that don't pass apply_state through.""" + apply_state = {(var_device, var_dtype): {}} + self._prepare_local(var_device, var_dtype, apply_state) + return apply_state[(var_device, var_dtype)] + + def _create_hypers(self): + if self._hypers_created: + return + with self._distribution_strategy_scope(): + # Iterate hyper values deterministically. + for name, value in sorted(self._hyper.items()): + if isinstance(value, (tf.Tensor, tf.Variable)) or callable( + value + ): + # The check for `callable` covers the usage when `value` is + # a `LearningRateSchedule`, in which case it does not need + # to create a variable. + continue + else: + self._hyper[name] = self.add_weight( + name, + shape=[], + trainable=False, + initializer=value, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + ) + self._hypers_created = True + + @property + def iterations(self): + """Variable. The number of training steps this Optimizer has run.""" + if self._iterations is None: + with self._distribution_strategy_scope(): + self._iterations = self.add_weight( + "iter", + shape=[], + dtype=tf.int64, + trainable=False, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + ) + self._weights.append(self._iterations) + return self._iterations + + @iterations.setter + def iterations(self, variable): + if self._iterations is not None: + raise RuntimeError( + "Cannot set `iterations` to a new Variable after " + "the Optimizer weights have been created. Here it is " + f"attempting to set `iterations` to {variable}." + ) + self._iterations = variable + self._weights.append(self._iterations) + + def _decayed_lr(self, var_dtype): + """Get decayed learning rate as a Tensor with dtype=var_dtype.""" + lr_t = self._get_hyper("learning_rate", var_dtype) + if isinstance(lr_t, learning_rate_schedule.LearningRateSchedule): + local_step = tf.cast(self.iterations, var_dtype) + lr_t = tf.cast(lr_t(local_step), var_dtype) + if self._initial_decay > 0.0: + local_step = tf.cast(self.iterations, var_dtype) + decay_t = tf.cast(self._initial_decay, var_dtype) + lr_t = lr_t / (1.0 + decay_t * local_step) + return lr_t + + @abc.abstractmethod + def get_config(self): + """Returns the config of the optimizer. + + An optimizer config is a Python dictionary (serializable) + containing the configuration of an optimizer. + The same optimizer can be reinstantiated later + (without any saved state) from this configuration. + + Returns: + Python dictionary. 
+ """ + config = {"name": self._name} + if self.clipnorm is not None: + config["clipnorm"] = self.clipnorm + if self.clipvalue is not None: + config["clipvalue"] = self.clipvalue + if self.global_clipnorm is not None: + config["global_clipnorm"] = self.global_clipnorm + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + """Creates an optimizer from its config. + + This method is the reverse of `get_config`, + capable of instantiating the same optimizer from the config + dictionary. + + Args: + config: A Python dictionary, typically the output of get_config. + custom_objects: A Python dictionary mapping names to additional + Python objects used to create this optimizer, such as a function + used for a hyperparameter. + + Returns: + An optimizer instance. + """ + if "lr" in config: + config["learning_rate"] = config.pop("lr") + if "learning_rate" in config: + if isinstance(config["learning_rate"], dict): + config["learning_rate"] = learning_rate_schedule.deserialize( + config["learning_rate"], custom_objects=custom_objects + ) + return cls(**config) + + def _serialize_hyperparameter(self, hyperparameter_name): + """Serialize a hyperparameter that can be a float, callable, or + Tensor.""" + value = self._hyper[hyperparameter_name] + if isinstance(value, learning_rate_schedule.LearningRateSchedule): + return learning_rate_schedule.serialize(value) + if callable(value): + return value() + if tf.is_tensor(value): + return backend.get_value(value) + return value + + def variables(self): + """Returns variables of this Optimizer based on the order created.""" + return self._weights + + @property + def weights(self): + """Returns variables of this Optimizer based on the order created.""" + return self._weights + + def get_weights(self): + """Returns the current weights of the optimizer. + + The weights of an optimizer are its state (ie, variables). + This function returns the weight values associated with this + optimizer as a list of Numpy arrays. The first value is always the + iterations count of the optimizer, followed by the optimizer's state + variables in the order they were created. The returned list can in turn + be used to load state into similarly parameterized optimizers. + + For example, the RMSprop optimizer for this simple model returns a list + of three values-- the iteration count, followed by the root-mean-square + value of the kernel and bias of the single Dense layer: + + >>> opt = tf.keras.optimizers.legacy.RMSprop() + >>> m = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + >>> m.compile(opt, loss='mse') + >>> data = np.arange(100).reshape(5, 20) + >>> labels = np.zeros(5) + >>> results = m.fit(data, labels) # Training. + >>> len(opt.get_weights()) + 3 + + Returns: + Weights values as a list of numpy arrays. + """ + params = self.weights + return backend.batch_get_value(params) + + # TODO(tanzheny): Maybe share this logic with base_layer. + def set_weights(self, weights): + """Set the weights of the optimizer. + + The weights of an optimizer are its state (ie, variables). + This function takes the weight values associated with this + optimizer as a list of Numpy arrays. The first value is always the + iterations count of the optimizer, followed by the optimizer's state + variables in the order they are created. The passed values are used to + set the new state of the optimizer. 
+
+        For example, the RMSprop optimizer for this simple model takes a
+        list of three values-- the iteration count, followed by the
+        root-mean-square value of the kernel and bias of the single Dense
+        layer:
+
+        >>> opt = tf.keras.optimizers.legacy.RMSprop()
+        >>> m = tf.keras.models.Sequential([tf.keras.layers.Dense(10)])
+        >>> m.compile(opt, loss='mse')
+        >>> data = np.arange(100).reshape(5, 20)
+        >>> labels = np.zeros(5)
+        >>> results = m.fit(data, labels)  # Training.
+        >>> new_weights = [np.array(10), np.ones([20, 10]), np.zeros([10])]
+        >>> opt.set_weights(new_weights)
+        >>> opt.iterations
+        <tf.Variable 'RMSprop/iter:0' shape=() dtype=int64, numpy=10>
+
+        Args:
+            weights: weight values as a list of numpy arrays.
+        """
+        params = self.weights
+        if len(params) != len(weights):
+            raise ValueError(
+                f"You called `set_weights(weights)` on optimizer "
+                f"{self._name} "
+                f"with a weight list of length {str(len(weights))}, "
+                f"but the optimizer was expecting {str(len(params))} "
+                f"weights. Provided weights: {str(weights)[:50]}..."
+            )
+        if not params:
+            return
+        weight_value_tuples = []
+        param_values = backend.batch_get_value(params)
+        for pv, p, w in zip(param_values, params, weights):
+            if pv.shape != w.shape:
+                raise ValueError(
+                    f"Optimizer weight shape {str(pv.shape)} "
+                    "not compatible with "
+                    f"provided weight shape {str(w.shape)}."
+                )
+            weight_value_tuples.append((p, w))
+        backend.batch_set_value(weight_value_tuples)
+
+    def add_weight(
+        self,
+        name,
+        shape,
+        dtype=None,
+        initializer="zeros",
+        trainable=None,
+        synchronization=tf.VariableSynchronization.AUTO,
+        aggregation=tf.VariableAggregation.NONE,
+    ):
+
+        if dtype is None:
+            dtype = tf.float32
+        if isinstance(initializer, str) or callable(initializer):
+            initializer = initializers.get(initializer)
+
+        if synchronization == tf.VariableSynchronization.ON_READ:
+            if trainable:
+                raise ValueError(
+                    "Synchronization value can be set to "
+                    "VariableSynchronization.ON_READ only for non-trainable "
+                    "variables. You have specified trainable=True and "
+                    "synchronization=VariableSynchronization.ON_READ."
+                )
+            else:
+                # Set trainable to be false when variable is to be synced on
+                # read.
+                trainable = False
+        elif trainable is None:
+            trainable = True
+
+        variable = self._add_variable_with_custom_getter(
+            name=name,
+            shape=shape,
+            getter=base_layer_utils.make_variable,
+            overwrite=True,
+            initializer=initializer,
+            dtype=dtype,
+            trainable=trainable,
+            use_resource=True,
+            synchronization=synchronization,
+            aggregation=aggregation,
+        )
+        backend.track_variable(variable)
+
+        return variable
+
+    def _init_set_name(self, name, zero_based=True):
+        if not name:
+            self._name = backend.unique_object_name(
+                generic_utils.to_snake_case(self.__class__.__name__),
+                zero_based=zero_based,
+            )
+        else:
+            self._name = name
+
+    def _assert_valid_dtypes(self, tensors):
+        """Asserts tensors are all valid types (see `_valid_dtypes`).
+
+        Args:
+            tensors: Tensors to check.
+
+        Raises:
+            ValueError: If any tensor is not a valid type.
+        """
+        valid_dtypes = self._valid_dtypes()
+        for t in tensors:
+            dtype = t.dtype.base_dtype
+            if dtype not in valid_dtypes:
+                raise ValueError(
+                    "Invalid type {} for {}, expected: {}.".format(
+                        dtype, t.name, [v for v in valid_dtypes]
+                    )
+                )
+
+    def _valid_dtypes(self):
+        """Valid types for loss, variables and gradients.
+
+        Subclasses should override to allow other float types.
+
+        Returns:
+            Valid types for loss, variables and gradients.
+ """ + return _DEFAULT_VALID_DTYPES + + def _call_if_callable(self, param): + """Call the function if param is callable.""" + return param() if callable(param) else param + + def _resource_apply_dense(self, grad, handle, apply_state): + """Add ops to apply dense gradients to the variable `handle`. + + Args: + grad: a `Tensor` representing the gradient. + handle: a `Tensor` of dtype `resource` which points to the variable to + be updated. + apply_state: A dict which is used across multiple apply calls. + + Returns: + An `Operation` which updates the value of the variable. + """ + raise NotImplementedError( + "`_resource_apply_dense` must be implemented in subclasses." + ) + + def _resource_apply_sparse_duplicate_indices( + self, grad, handle, indices, **kwargs + ): + """Add ops to apply sparse gradients to `handle`, with repeated indices. + + Optimizers which override this method must deal with repeated indices. + See the docstring of `_apply_sparse_duplicate_indices` for details. By + default the correct behavior, to sum non-unique indices and their + associated gradients, is enforced by first pre-processing `grad` and + `indices` and passing them on to `_resource_apply_sparse`. Optimizers + which deal correctly with duplicate indices may instead override this + method to avoid the overhead of summing. + + Args: + grad: a `Tensor` representing the gradient for the affected indices. + handle: a `Tensor` of dtype `resource` which points to the variable to + be updated. + indices: a `Tensor` of integral type representing the indices for + which the gradient is nonzero. Indices may be repeated. + **kwargs: May optionally contain `apply_state` + + Returns: + An `Operation` which updates the value of the variable. + """ + summed_grad, unique_indices = _deduplicate_indexed_slices( + values=grad, indices=indices + ) + return self._resource_apply_sparse( + summed_grad, handle, unique_indices, **kwargs + ) + + def _resource_apply_sparse(self, grad, handle, indices, apply_state): + """Add ops to apply sparse gradients to the variable `handle`. + + Similar to `_apply_sparse`, the `indices` argument to this method has + been de-duplicated. Optimizers which deal correctly with non-unique + indices may instead override `_resource_apply_sparse_duplicate_indices` + to avoid this overhead. + + Args: + grad: a `Tensor` representing the gradient for the affected indices. + handle: a `Tensor` of dtype `resource` which points to the variable to + be updated. + indices: a `Tensor` of integral type representing the indices for + which the gradient is nonzero. Indices are unique. + apply_state: A dict which is used across multiple apply calls. + + Returns: + An `Operation` which updates the value of the variable. + """ + raise NotImplementedError( + "`_resource_apply_sparse` Must be implemented in subclasses." 
+ ) + + def _resource_scatter_add(self, x, i, v): + with tf.control_dependencies( + [ + tf.raw_ops.ResourceScatterAdd( + resource=x.handle, indices=i, updates=v + ) + ] + ): + return x.value() + + def _resource_scatter_update(self, x, i, v): + with tf.control_dependencies( + [ + tf.raw_ops.ResourceScatterUpdate( + resource=x.handle, indices=i, updates=v + ) + ] + ): + return x.value() + + @property + @layer_utils.cached_per_instance + def _dense_apply_args(self): + return tf_inspect.getfullargspec(self._resource_apply_dense).args + + @property + @layer_utils.cached_per_instance + def _sparse_apply_args(self): + return tf_inspect.getfullargspec(self._resource_apply_sparse).args + + # --------------- + # For implementing the trackable interface + # --------------- + + def _restore_slot_variable(self, slot_name, variable, slot_variable): + """Restore a newly created slot variable's value.""" + variable_key = _var_key(variable) + deferred_restorations = self._deferred_slot_restorations.get( + slot_name, {} + ).pop(variable_key, []) + # Iterate over restores, highest restore UID first to minimize the + # number of assignments. + deferred_restorations.sort( + key=lambda position: position.restore_uid, reverse=True + ) + for checkpoint_position in deferred_restorations: + checkpoint_position.restore(slot_variable) + + def _create_or_restore_slot_variable( + self, slot_variable_position, slot_name, variable + ): + """Returns the slot variable that should have a value restored into it. + + It is up to the caller to restore the value into the slot variable if a + valid slot variable is returned. + + Called when a variable which has an associated slot variable is created + or restored. When executing eagerly, we create the slot variable with a + restoring initializer. + + No new variables are created when graph building. Instead, + _restore_slot_variable catches these after normal creation and adds + restore ops to the graph. This method is nonetheless important when + graph building for the case when a slot variable has already been + created but `variable` has just been added to a dependency graph + (causing us to realize that the slot variable needs to be restored). + + Args: + slot_variable_position: A `trackable._CheckpointPosition` object + indicating the slot variable `Trackable` object to be restored. + slot_name: The name of this `Optimizer`'s slot to restore into. + variable: The variable object this slot is being created for. + + Returns: + A slot variable that should have a value restored into it, or None if + a slot variable should not be restored at this time. + """ + variable_key = _var_key(variable) + slot_dict = self._slots.get(variable_key, {}) + slot_variable = slot_dict.get(slot_name, None) + if ( + slot_variable is None + and tf.executing_eagerly() + and slot_variable_position.is_simple_variable() + # Defer slot variable creation if there is an active variable + # creator scope. Generally we'd like to eagerly create/restore slot + # variables when possible, but this may mean that scopes intended to + # catch `variable` also catch its eagerly created slot variable + # unintentionally (specifically make_template would add a dependency + # on a slot variable if not for this case). Deferring is mostly + # harmless (aside from double initialization), and makes variable + # creator scopes behave the same way they do when graph building. 
+ # + # One notable case is with distribution strategy, which uses + # variable creator scope but always desires the `variable` and the + # slot to use the same scope, thus we can safely eagerly + # create/restore slot variables. + and ( + not tf.compat.v1.get_default_graph()._variable_creator_stack + or self._distribution_strategy + ) + ): + initializer = ( + tf.__internal__.tracking.CheckpointInitialValueCallable( + checkpoint_position=slot_variable_position + ) + ) + slot_variable = self.add_slot( + var=variable, + initializer=initializer, + slot_name=slot_name, + shape=slot_variable_position.value_shape(), + ) + # Slot variables are not owned by any one object (because we don't + # want to save the slot variable if the optimizer is saved without + # the non-slot variable, or if the non-slot variable is saved + # without the optimizer; it's a dependency hypergraph with edges of + # the form (optimizer, non-slot variable, variable)). So we don't + # _track_ slot variables anywhere, and instead special-case this + # dependency and otherwise pretend it's a normal graph. + if slot_variable is not None: + # For sharded variables, we need the logic in get_slot to combine + # slot variables for its shards + if (slot_variable is variable) and ( + isinstance(variable, tf.__internal__.distribute.ShardedVariable) + ): + return self.get_slot(variable, slot_name) + # If we've either made this slot variable, or if we've pulled out an + # existing slot variable, we should restore it. + return slot_variable + else: + # We didn't make the slot variable. Defer restoring until it gets + # created normally. We keep a list rather than the one with the + # highest restore UID in case slot variables have their own + # dependencies, in which case those could differ between restores. + self._deferred_slot_restorations.setdefault( + slot_name, {} + ).setdefault(variable_key, []).append(slot_variable_position) + return None + + @contextlib.contextmanager + def _distribution_strategy_scope(self): + """Returns the `tf.distribute.Strategy` this optimizer was created + under.""" + if self._distribution_strategy and not tf.distribute.has_strategy(): + with self._distribution_strategy.scope(): + yield self._distribution_strategy.scope() + else: + yield + + +def _var_key(var): + """Key for representing a primary variable, for looking up slots. + + In graph mode the name is derived from the var shared name. + In eager mode the name is derived from the var unique id. + If distribution strategy exists, get the primary variable first. + + Args: + var: the variable. + + Returns: + the unique name of the variable. + """ + + # Get the distributed variable if it exists. + if hasattr(var, "_distributed_container"): + var = var._distributed_container() + elif ( + tf_utils.is_extension_type(var) + and hasattr(var, "handle") + and hasattr(var.handle, "_distributed_container") + ): + # For ResourceVariables, the _distributed_container attribute + # is added to their handle tensors. + var = var.handle._distributed_container() + if getattr(var, "_in_graph_mode", False): + return var._shared_name + return var._unique_id + + +def _get_slot_key_from_var(var, slot_name): + """Get the slot key for the variable: var_name/slot_name.""" + + name = _var_key(var) + return name + "/" + slot_name + + +class RestoredOptimizer(OptimizerV2): + """A non-functional Optimizer implementation for checkpoint compatibility. + + Holds slot variables and hyperparameters when an optimizer is restored from + a SavedModel. 
These variables may be referenced in functions along with ops + created by the original optimizer, but currently we do not support using the + optimizer object itself (e.g. through `apply_gradients`). + """ + + # TODO(allenl): Make the restored optimizer functional by tracing its apply + # methods. + + def __init__(self): + super().__init__("RestoredOptimizer") + self._hypers_created = True + + def get_config(self): + # TODO(allenl): Save and restore the Optimizer's config + raise NotImplementedError( + "Restoring functional Optimizers from SavedModels is not currently " + "supported. Please file a feature request if this limitation " + "bothers you." + ) + + +tf.__internal__.saved_model.load.register_revived_type( + "optimizer", + lambda obj: isinstance(obj, OptimizerV2), + versions=[ + tf.__internal__.saved_model.load.VersionedTypeRegistration( + object_factory=lambda proto: RestoredOptimizer(), + version=2, + min_producer_version=1, + min_consumer_version=1, + setter=RestoredOptimizer._set_hyper, + ) + ], +) diff --git a/keras/optimizers/legacy/optimizer_v2_test.py b/keras/optimizers/legacy/optimizer_v2_test.py new file mode 100644 index 000000000000..47ffec24453f --- /dev/null +++ b/keras/optimizers/legacy/optimizer_v2_test.py @@ -0,0 +1,1474 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional test for OptimizerV2.""" + +import collections +from copy import deepcopy + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras import backend +from keras import callbacks +from keras import losses +from keras.engine import input_layer +from keras.engine import sequential +from keras.engine import training +from keras.layers import core +from keras.layers import regularization +from keras.optimizers import optimizer_v1 +from keras.optimizers.legacy import adadelta +from keras.optimizers.legacy import adagrad +from keras.optimizers.legacy import adam +from keras.optimizers.legacy import adamax +from keras.optimizers.legacy import ftrl +from keras.optimizers.legacy import gradient_descent +from keras.optimizers.legacy import nadam +from keras.optimizers.legacy import optimizer_v2 +from keras.optimizers.legacy import rmsprop +from keras.optimizers.schedules import learning_rate_schedule +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import np_utils + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + +_DATA_TYPES = [tf.half, tf.float32, tf.float64] +# TODO(b/141710709): complex support in NVCC and ROCM. 
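# For orientation (an inference from the guard below, not a new
# claim): on builds that are neither NVCC- nor ROCm-based, the sweep
# becomes
#   _DATA_TYPES == [tf.half, tf.float32, tf.float64,
#                   tf.complex64, tf.complex128]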
+if not tf_test_utils.IsBuiltWithNvcc() and not tf.test.is_built_with_rocm(): + _DATA_TYPES += [tf.complex64, tf.complex128] + + +class OptimizerTest(tf.test.TestCase, parameterized.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testBasic(self): + for dtype in _DATA_TYPES: + with test_utils.use_gpu(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + loss = lambda: 5 * var0 + 3 * var1 + sgd = gradient_descent.SGD(3.0) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + # Run 1 step of sgd through optimizer + opt_op = sgd.minimize(loss, var_list=[var0, var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + # Validate updated params + self.assertAllClose([-14.0, -13.0], self.evaluate(var0)) + self.assertAllClose([-6.0, -5.0], self.evaluate(var1)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testAdaptiveLearningRate(self): + for dtype in _DATA_TYPES: + with self.test_session(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + + def loss(): + return 5 * var0 + 3 * var1 + + sgd = gradient_descent.SGD(1.0) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + # Run 1 step of sgd through optimizer + opt_op = sgd.minimize(loss, [var0, var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + # Validate updated params + # var0 = [1., 2.] - 1.0 * [5, 5] + self.assertAllClose([-4.0, -3.0], self.evaluate(var0)) + # var1 = [3., 4.] - 1.0 * [3, 3] + self.assertAllClose([0.0, 1.0], self.evaluate(var1)) + + sgd.learning_rate = 0.5 + if tf.executing_eagerly(): + sgd.minimize(loss, [var0, var1]) + else: + self.evaluate(opt_op) + # Validate updated params + # var0 = [-4., -3.] - 0.5 * [5, 5] + self.assertAllClose([-6.5, -5.5], self.evaluate(var0)) + # var1 = [0., 1.] 
- 0.5 * [3, 3] + self.assertAllClose([-1.5, -0.5], self.evaluate(var1)) + + sgd.learning_rate = learning_rate_schedule.InverseTimeDecay( + 0.5, decay_steps=1.0, decay_rate=0.5 + ) + if tf.executing_eagerly(): + sgd.minimize(loss, [var0, var1]) + else: + self.evaluate(opt_op) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testPrecomputedGradient(self): + for dtype in _DATA_TYPES: + with test_utils.use_gpu(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + loss = lambda: 5 * var0 + 3 * var1 + grad_loss = tf.constant([42, -42], dtype=dtype) + sgd = gradient_descent.SGD(3.0) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + # Run 1 step of sgd through optimizer + opt_op = sgd.minimize( + loss, var_list=[var0, var1], grad_loss=grad_loss + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + # Validate updated params + self.assertAllClose( + [1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)], + self.evaluate(var0), + ) + self.assertAllClose( + [3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)], + self.evaluate(var1), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoGradients(self): + for dtype in _DATA_TYPES: + with test_utils.use_gpu(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + loss = lambda: 5 * var0 + sgd_op = gradient_descent.SGD(3.0) + with self.assertRaisesRegex(ValueError, "No gradients"): + # var1 has no gradient + sgd_op.minimize(loss, var_list=[var1]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoGradientsForAnyVariables_Minimize(self): + for dtype in _DATA_TYPES: + with test_utils.use_gpu(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + loss = lambda: tf.constant(5.0) + + sgd_op = gradient_descent.SGD(3.0) + with self.assertRaisesRegex( + ValueError, "No gradients provided for any variable" + ): + sgd_op.minimize(loss, var_list=[var0, var1]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoGradientsForAnyVariables_ApplyGradients(self): + for dtype in _DATA_TYPES: + with test_utils.use_gpu(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + sgd_op = gradient_descent.SGD(3.0) + with self.assertRaisesRegex( + ValueError, "No gradients provided for any variable" + ): + sgd_op.apply_gradients([(None, var0), (None, var1)]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testGradientsAsVariables(self): + for i, dtype in enumerate(_DATA_TYPES): + with test_utils.use_gpu(): + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + loss = lambda: 5 * var0 + 3 * var1 + + sgd = gradient_descent.SGD(3.0) + grads_and_vars = sgd._compute_gradients(loss, [var0, var1]) + # Convert gradients to tf.Variables + converted_grads = [ + tf.Variable(tf.zeros([2], dtype), name="c_%d_%d" % (i, j)) + for j, gv in enumerate(grads_and_vars) + ] + convert_ops = [ + tf.compat.v1.assign(converted_grads[j], gv[0]) + for j, gv in enumerate(grads_and_vars) + ] + + # Run convert_ops to populate the converted gradient variables + 
self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(convert_ops) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 1 step of sgd through optimizer + converted_grads_and_vars = list( + zip(converted_grads, [var0, var1]) + ) + opt_op = sgd.apply_gradients(converted_grads_and_vars) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(convert_ops) + self.evaluate(opt_op) + + # Validate updated params + self.assertAllClose([-14.0, -13.0], self.evaluate(var0)) + self.assertAllClose([-6.0, -5.0], self.evaluate(var1)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testComputeGradientsWithTensors(self): + with test_utils.use_gpu(): + x = tf.convert_to_tensor(1.0) + + def f(): + return x * x + + sgd = gradient_descent.SGD(3.0) + grads_and_vars = sgd._compute_gradients(f, [x]) + self.assertLen(grads_and_vars, 1) + grad, x_as_var = grads_and_vars[0] + self.assertIs(x, x_as_var) + self.assertEqual(2.0, self.evaluate(grad)) + + with self.assertRaises(NotImplementedError): + sgd.apply_gradients(grads_and_vars) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testConstraint(self): + constraint_01 = lambda x: tf.clip_by_value(x, -0.1, 0.0) + constraint_0 = lambda x: tf.clip_by_value(x, 0.0, 1.0) + with test_utils.use_gpu(): + var0 = tf.Variable([1.0, 2.0], constraint=constraint_01) + var1 = tf.Variable([3.0, 4.0], constraint=constraint_0) + loss = lambda: 5 * var0 + 3 * var1 + sgd = gradient_descent.SGD(3.0) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + # Run 1 step of sgd through optimizer + opt_op = sgd.minimize(loss, var_list=[var0, var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + # Validate updated params + self.assertAllClose([-0.1, -0.1], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testIterationWithoutMinimize(self): + with test_utils.use_gpu(): + sgd = gradient_descent.SGD(3.0) + self.evaluate(sgd.iterations.initializer) + self.assertEqual(0, self.evaluate(sgd.iterations)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testConfig(self): + with test_utils.use_gpu(): + opt = gradient_descent.SGD(learning_rate=1.0) + config = opt.get_config() + opt2 = gradient_descent.SGD.from_config(config) + lr = opt._get_hyper("learning_rate") + lr2 = opt2._get_hyper("learning_rate") + self.evaluate(tf.compat.v1.global_variables_initializer()) + # assert both are equal float values. + self.assertEqual(self.evaluate(lr), self.evaluate(lr2)) + var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32) + loss = lambda: 3 * var0 + # learning rate variable created when calling minimize. 
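# For reference, a rough sketch of what get_config() returns for this
# optimizer (a sketch only; keys can vary across versions):
#   {"name": "SGD", "learning_rate": 1.0, "decay": 0.0,
#    "momentum": 0.0, "nesterov": False}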
+ opt.minimize(loss, [var0]) + opt3 = gradient_descent.SGD.from_config(config) + lr3 = opt3._get_hyper("learning_rate") + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual(self.evaluate(lr), self.evaluate(lr3)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testConfigWithLearningRateDecay(self): + with test_utils.use_gpu(): + var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32) + for decay_schedule in [ + learning_rate_schedule.InverseTimeDecay( + 0.5, decay_steps=1.0, decay_rate=0.1 + ), + learning_rate_schedule.PiecewiseConstantDecay([5], [1.0, 0.5]), + ]: + step = 10 + opt = gradient_descent.SGD(decay_schedule) + config = opt.get_config() + opt2 = gradient_descent.SGD.from_config(config) + # assert both are equal float values. + self.assertAllEqual( + decay_schedule(step), opt._get_hyper("learning_rate")(step) + ) + self.assertAllEqual( + decay_schedule(step), opt2._get_hyper("learning_rate")(step) + ) + loss = lambda: 3 * var0 + # learning rate variable is created when calling minimize. + opt.minimize(loss, [var0]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + config = opt.get_config() + opt3 = gradient_descent.SGD.from_config(config) + self.assertAllEqual( + self.evaluate(opt._get_hyper("learning_rate")(step)), + opt3._get_hyper("learning_rate")(step), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testGradClipValue(self): + with test_utils.use_gpu(): + var = tf.Variable([1.0, 2.0]) + loss = lambda: 3 * var + opt = gradient_descent.SGD(learning_rate=1.0, clipvalue=1.0) + opt_op = opt.minimize(loss, [var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + self.assertAllClose([0.0, 1.0], self.evaluate(var)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testGradClipNorm(self): + with test_utils.use_gpu(): + var = tf.Variable([1.0]) + loss = lambda: 3 * var + opt = gradient_descent.SGD(learning_rate=1.0, clipnorm=1.0) + opt_op = opt.minimize(loss, [var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + self.assertAllClose([0.0], self.evaluate(var)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testGradGlobalClipNorm(self): + with test_utils.use_gpu(): + # l2 norm is 5.0 + var1 = tf.Variable([1.0]) + var2 = tf.Variable([2.0]) + loss = lambda: 3 * var1 + 4 * var2 + opt = gradient_descent.SGD(learning_rate=1.0, global_clipnorm=2.0) + opt_op = opt.minimize(loss, [var1, var2]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + # grad1 = 3.0 * 2.0 / 5.0 = 1.2 + self.assertAllClose([-0.2], self.evaluate(var1)) + # grad2 = 4.0 * 2.0 / 5.0 = 1.6 + self.assertAllClose([0.4], self.evaluate(var2)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testInvalidClipNorm(self): + with self.assertRaisesRegex(ValueError, ">= 0"): + gradient_descent.SGD(learning_rate=1.0, clipnorm=-1.0) + + @test_combinations.generate( + test_combinations.combine( + mode=["graph", "eager"], + clip_type=["clipnorm", "global_clipnorm", "clipvalue"], + ) + ) + def testConfigWithClipping(self, clip_type): + opt = gradient_descent.SGD(learning_rate=1.0, **{clip_type: 2.0}) + config = opt.get_config() + opt = gradient_descent.SGD.from_config(config) + self.assertEqual(getattr(opt, clip_type), 2.0) + + @test_combinations.generate( + 
test_combinations.combine(mode=["graph", "eager"]) + ) + def testInvalidKwargs(self): + with self.assertRaisesRegex(TypeError, "Unexpected keyword argument"): + gradient_descent.SGD(learning_rate=1.0, invalidkwargs=1.0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testWeights(self): + with test_utils.use_gpu(): + opt1 = adam.Adam(learning_rate=1.0) + var1 = tf.Variable([1.0, 2.0], dtype=tf.float32) + loss1 = lambda: 3 * var1 + opt_op_1 = opt1.minimize(loss1, [var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + config = opt1.get_config() + opt2 = adam.Adam.from_config(config) + var2 = tf.Variable([1.0, 2.0], dtype=tf.float32) + loss2 = lambda: 3 * var2 + opt_op_2 = opt2.minimize(loss2, [var2]) + weights = opt1.get_weights() + + # Assert set_weights and both variables get updated to same value. + self.evaluate(tf.compat.v1.global_variables_initializer()) + opt2.set_weights(weights) + self.evaluate([opt_op_1, opt_op_2]) + self.assertAllClose(self.evaluate(var1), self.evaluate(var2)) + self.assertEqual(1, self.evaluate(opt1.iterations)) + self.assertEqual(1, self.evaluate(opt2.iterations)) + + var3 = tf.Variable([1.0, 2.0, 3.0], dtype=tf.float32) + var4 = tf.Variable([4.0, 5.0, 6.0], dtype=tf.float32) + loss3 = lambda: 3 * var3 + 5 * var4 + opt_op_3 = opt1.minimize(loss3, [var3, var4]) + + # Assert set_weights with ValueError since weight list does not + # match. + self.evaluate(tf.compat.v1.global_variables_initializer()) + weights = opt1.get_weights() + with self.assertRaisesRegex(ValueError, "but the optimizer was"): + opt2.set_weights(weights) + + # Assert set_weights and variables get updated to same value. + var5 = tf.Variable([1.0, 2.0, 3.0], dtype=tf.float32) + var6 = tf.Variable([4.0, 5.0, 6.0], dtype=tf.float32) + loss4 = lambda: 3 * var5 + 5 * var6 + opt_op_4 = opt2.minimize(loss4, [var5, var6]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + opt2.set_weights(weights) + self.evaluate([opt_op_3, opt_op_4]) + self.assertAllClose( + self.evaluate([var3, var4]), self.evaluate([var5, var6]) + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testGettingHyperParameters(self): + with self.test_session(): + opt = adam.Adam(learning_rate=1.0) + var = tf.Variable([1.0, 2.0], dtype=tf.float32) + loss = lambda: 3 * var + opt_op = opt.minimize(loss, [var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + + lr = self.evaluate(opt.lr) + self.assertEqual(1.0, lr) + + opt.lr = 2.0 + lr = self.evaluate(opt.lr) + self.assertEqual(2.0, lr) + + self.evaluate(opt.lr.assign(3.0)) + lr = self.evaluate(opt.lr) + self.assertEqual(3.0, lr) + + with self.assertRaises(AttributeError): + opt.not_an_attr += 3 + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testGettingHyperParametersWithLrInConstructor(self): + with self.test_session(): + opt = gradient_descent.SGD(lr=3.0) + var = tf.Variable([1.0, 2.0], dtype=tf.float32) + loss = lambda: 3 * var + opt_op = opt.minimize(loss, [var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt.learning_rate, tf.Variable) + + lr = self.evaluate(opt.lr) + self.assertEqual(3.0, lr) + + opt.lr = 2.0 + lr = self.evaluate(opt.lr) + self.assertEqual(2.0, lr) + + self.evaluate(opt.lr.assign(4.0)) + lr = self.evaluate(opt.lr) + self.assertEqual(4.0, lr) + + 
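(Editor's aside.) The get_weights()/set_weights() round trip exercised above can be reproduced standalone. A minimal eager-mode sketch, with illustrative names that are not part of the test suite:

import tensorflow as tf

from keras.optimizers.legacy import adam

var = tf.Variable([1.0, 2.0])
opt = adam.Adam(learning_rate=1.0)
opt.minimize(lambda: 3 * var, [var])  # creates the iteration count and m/v slots

# A clone only accepts the weights once its own slots exist with
# matching shapes, hence the throwaway minimize() call.
clone = adam.Adam.from_config(opt.get_config())
clone.minimize(lambda: 3 * var, [var])
clone.set_weights(opt.get_weights())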
@test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDir(self): + opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.1) + dir_result = set(dir(opt)) + self.assertIn("learning_rate", dir_result) # Hyperparameter + self.assertIn("lr", dir_result) # Hyperparameter + self.assertIn("momentum", dir_result) # Hyperparameter + self.assertIn("nesterov", dir_result) # Attribute + self.assertIn("minimize", dir_result) # Attribute + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testOptimizerWithKerasModel(self): + a = input_layer.Input(shape=(3,), name="input_a") + b = input_layer.Input(shape=(3,), name="input_b") + + dense = core.Dense(4, name="dense") + c = dense(a) + d = dense(b) + e = regularization.Dropout(0.5, name="dropout")(c) + + model = training.Model([a, b], [d, e]) + + optimizer = gradient_descent.SGD(learning_rate=0.001) + loss = "mse" + model.compile(optimizer, loss, metrics=["mae"]) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + model.fit( + [input_a_np, input_b_np], + [output_d_np, output_e_np], + epochs=1, + batch_size=5, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testOptimizerWithCallbacks(self): + np.random.seed(1331) + input_np = np.random.random((10, 3)) + output_np = np.random.random((10, 4)) + a = input_layer.Input(shape=(3,), name="input_a") + model = sequential.Sequential() + model.add(core.Dense(4, kernel_initializer="zeros", name="dense")) + model.add(regularization.Dropout(0.5, name="dropout")) + model(a) + optimizer = gradient_descent.SGD(learning_rate=0.1) + model.compile(optimizer, loss="mse", metrics=["mae"]) + # This does not reduce the LR after the first epoch (due to low delta). + cbks = [ + callbacks.ReduceLROnPlateau( + monitor="val_loss", + factor=0.1, + min_delta=0, + patience=1, + cooldown=5, + ) + ] + model.fit( + input_np, + output_np, + batch_size=10, + validation_data=(input_np, output_np), + callbacks=cbks, + epochs=2, + verbose=0, + ) + self.assertAllClose( + float(backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4 + ) + + # This should reduce the LR after the first epoch (due to high delta). 
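# With an initial lr of 0.1 and factor=0.1, one plateau-triggered
# reduction gives lr = 0.1 * 0.1 = 0.01, which the assertion below
# expects.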
+ cbks = [ + callbacks.ReduceLROnPlateau( + monitor="val_loss", + factor=0.1, + min_delta=10, + patience=1, + cooldown=5, + ) + ] + model.fit( + input_np, + output_np, + batch_size=10, + validation_data=(input_np, output_np), + callbacks=cbks, + epochs=2, + verbose=2, + ) + self.assertAllClose( + float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4 + ) + + def testOptimizerSetIterations(self): + global_step = tf.compat.v1.train.get_or_create_global_step() + opt = adam.Adam(learning_rate=1.0) + opt.iterations = global_step + var = tf.Variable([1.0, 2.0], dtype=tf.float32) + self.evaluate(tf.compat.v1.global_variables_initializer()) + init_step_value = self.evaluate(global_step) + loss = lambda: 3 * var + opt_op = opt.minimize(loss, [var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + new_step_value = self.evaluate(global_step) + self.assertEqual(new_step_value, init_step_value + 1) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testOptimizerWithCallableVarList(self): + train_samples = 20 + input_dim = 1 + num_classes = 2 + (x, y), _ = test_utils.get_test_data( + train_samples=train_samples, + test_samples=10, + input_shape=(input_dim,), + num_classes=num_classes, + ) + y = np_utils.to_categorical(y) + + num_hidden = 1 + model = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, num_classes=num_classes + ) + opt = adam.Adam() + + loss = lambda: losses.mean_squared_error(model(x), y) + var_list = lambda: model.trainable_weights + + with self.assertRaisesRegex( + ValueError, "Weights for model .* have not yet been created" + ): + var_list() + train_op = opt.minimize(loss, var_list) + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertEqual( + [[0.0]], self.evaluate(opt.get_slot(var_list()[0], "m")) + ) + self.evaluate(train_op) + self.assertNotEqual( + [[0.0]], self.evaluate(opt.get_slot(var_list()[0], "m")) + ) + self.assertLen(var_list(), 4) + + def testVarKey(self): + with tf.compat.v1.get_default_graph().as_default(): + a = tf.Variable([1.0, 2.0], name="var") + b = tf.Variable([1.0], name="var") + self.assertTrue(a._in_graph_mode) + self.assertTrue(b._in_graph_mode) + var_key = optimizer_v2._var_key(a) + self.assertEqual("var", var_key) + var_key = optimizer_v2._var_key(b) + self.assertEqual("var_1", var_key) + + def testVarName(self): + with tf.compat.v1.get_default_graph().as_default(): + var = tf.Variable([1.0, 2.0], name="var") + loss = var + 1.0 + opt = adam.Adam() + opt.get_updates(loss, [var]) + opt_vars = opt.variables() + self.assertLen(opt_vars, 3) + self.assertEqual("Adam/iter:0", opt_vars[0].name) + self.assertEqual("Adam/var/m:0", opt_vars[1].name) + var_2 = tf.Variable([1.0, 2.0], name="var_2") + loss = var_2 + 1.0 + with backend.name_scope("outer"): + opt.get_updates(loss, [var_2]) + opt_vars = opt.variables() + self.assertLen(opt_vars, 5) + self.assertEqual("outer/Adam/var_2/m:0", opt_vars[3].name) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testEmptyVarList(self): + opt = gradient_descent.SGD(1.0) + opt.minimize(lambda: tf.constant(1.0), []) + opt.apply_gradients([]) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testAggregationTrue(self): + # Test that experimental_aggregate_gradients=True works without + # distributed strategy.
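# Expected arithmetic: var <- var - lr * grad
#   = [1.0, 2.0] - 3.0 * [0.1, 0.1] = [0.7, 1.7]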
+ var = tf.Variable([1.0, 2.0]) + opt = gradient_descent.SGD(3.0) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose([1.0, 2.0], self.evaluate(var)) + opt_op = opt.apply_gradients( + [([0.1, 0.1], var)], experimental_aggregate_gradients=True + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + self.assertAllClose([0.7, 1.7], self.evaluate(var)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testAggregationFalse(self): + # Test that experimental_aggregate_gradients=False works without + # distributed strategy. + var = tf.Variable([1.0, 2.0]) + opt = gradient_descent.SGD(3.0) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose([1.0, 2.0], self.evaluate(var)) + opt_op = opt.apply_gradients( + [([0.1, 0.1], var)], experimental_aggregate_gradients=False + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + self.assertAllClose([0.7, 1.7], self.evaluate(var)) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testRestoringIterationsWithoutAnOptimizer(self): + opt = gradient_descent.SGD(3.0) + opt.iterations.assign(5) + checkpoint = tf.train.Checkpoint(optimizer=opt) + path = checkpoint.save(self.get_temp_dir()) + + # Following verifies that the `iterations` can be restored with the + # absence of an `Optimizer` object (using a `Checkpoint` as a + # placeholder). + iterations_var = tf.Variable(0, dtype=tf.int64) + optimizer_checkpoint = tf.train.Checkpoint(iter=iterations_var) + checkpoint_to_restore = tf.train.Checkpoint( + optimizer=optimizer_checkpoint + ) + checkpoint_to_restore.restore(path) + + self.assertEqual(5, self.evaluate(iterations_var)) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testSlotWithNonstandardShapeRestoresBasedOnCheckpoint(self): + # First create an optimizer and a slot variable with a non-standard + # shape. + x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32) + slot_shape = [2, 1] + optimizer_1 = optimizer_v2.OptimizerV2(name="test") + optimizer_1.add_slot(x, "test_slot", "ones", shape=slot_shape) + + # Then save the variable and optimizer to a checkpoint. + checkpoint_1 = tf.train.Checkpoint(var=x, optimizer=optimizer_1) + checkpoint_path = checkpoint_1.save(self.get_temp_dir()) + + # Create a new optimizer and call restore on it (and x) + optimizer_2 = optimizer_v2.OptimizerV2(name="test") + checkpoint_2 = tf.train.Checkpoint(var=x, optimizer=optimizer_2) + checkpoint_2.restore(checkpoint_path) + + self.assertEqual( + slot_shape, optimizer_2.get_slot(x, "test_slot").shape.as_list() + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_gradient_aggregator(self): + def gradient_aggregator(grads_and_vars): + # Simulate an all-reduce where the other replica has zeros for + # gradients, by dividing each gradient by 2. 
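# Concretely: d(2 * var)/d(var) = 2.0 is halved to 1.0, so a single
# SGD step with lr=1.0 moves var from 2.0 to 1.0 (asserted below).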
+ grads = [g for g, _ in grads_and_vars] + vars = [v for _, v in grads_and_vars] + all_reduced_grads = [g / 2 for g in grads] + return list(zip(all_reduced_grads, vars)) + + var = tf.Variable(2.0) + sgd = gradient_descent.SGD(1.0, gradient_aggregator=gradient_aggregator) + loss = lambda: 2 * var + opt_op = sgd.minimize(loss, var_list=[var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + self.assertEqual(self.evaluate(var), 1.0) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_override_aggregate_gradients(self): + class MyOptimizer(gradient_descent.SGD): + def _aggregate_gradients(self, grads_and_vars): + # Simulate an all-reduce where the other replica has zeros for + # gradients, by dividing each gradient by 2. + grads = [g for g, _ in grads_and_vars] + vars = [v for _, v in grads_and_vars] + all_reduced_grads = [g / 2 for g in grads] + return list(zip(all_reduced_grads, vars)) + + var = tf.Variable(2.0) + sgd = MyOptimizer(1.0) + loss = lambda: 2 * var + opt_op = sgd.minimize(loss, var_list=[var]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + self.assertEqual(self.evaluate(var), 1.0) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_create_slots_for_sharded_variables(self): + # set names so that ShardedVariable is well-named for slot variable + # keying. + var_a = tf.Variable([1.0], name="part_0") + var_b = tf.Variable([2.0], name="part_1") + sharded_var = tf.__internal__.distribute.ShardedVariable([var_a, var_b]) + + opt = adagrad.Adagrad() + opt._create_slots(sharded_var.variables) + opt._create_slots_for_sharded_variables(sharded_var.variables) + + sharded_slot = opt.get_slot(sharded_var, "accumulator") + self.assertIsInstance( + sharded_slot, tf.__internal__.distribute.ShardedVariable + ) + + slot_a = opt.get_slot(var_a, "accumulator") + self.assertAllClose(sharded_slot.variables[0], slot_a) + slot_b = opt.get_slot(var_b, "accumulator") + self.assertAllClose(sharded_slot.variables[1], slot_b) + + +@test_combinations.run_all_keras_modes +class OptimizersCompatibilityTest(test_combinations.TestCase): + def _testOptimizersCompatibility(self, opt_v1, opt_v2, test_weights=True): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + np.random.seed(1331) + with test_utils.use_gpu(): + train_samples = 20 + input_dim = 3 + num_classes = 2 + (x, y), _ = test_utils.get_test_data( + train_samples=train_samples, + test_samples=10, + input_shape=(input_dim,), + num_classes=num_classes, + ) + y = np_utils.to_categorical(y) + + num_hidden = 5 + model_v1 = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, + num_classes=num_classes, + input_dim=input_dim, + ) + model_v1.compile( + opt_v1, + loss="categorical_crossentropy", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + model_v1.fit(x, y, batch_size=5, epochs=1) + + model_v2 = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, + num_classes=num_classes, + input_dim=input_dim, + ) + model_v2.set_weights(model_v1.get_weights()) + model_v2.compile( + opt_v2, + loss="categorical_crossentropy", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + if not tf.compat.v1.executing_eagerly_outside_functions(): + model_v2._make_train_function() + if test_weights: + opt_v2.set_weights(opt_v1.get_weights()) + + hist_1 = model_v1.fit(x, y, batch_size=5, epochs=1, shuffle=False) + hist_2 = model_v2.fit(x, y, 
batch_size=5, epochs=1, shuffle=False) + self.assertAllClose( + model_v1.get_weights(), + model_v2.get_weights(), + rtol=1e-5, + atol=1e-5, + ) + self.assertAllClose( + hist_1.history["loss"], + hist_2.history["loss"], + rtol=1e-5, + atol=1e-5, + ) + + def testAdadeltaCompatibility(self): + opt_v1 = optimizer_v1.Adadelta(lr=0.01) + opt_v2 = adadelta.Adadelta(learning_rate=0.01) + self._testOptimizersCompatibility(opt_v1, opt_v2) + + def testAdagradCompatibility(self): + opt_v1 = optimizer_v1.Adagrad(lr=0.01) + opt_v2 = adagrad.Adagrad(learning_rate=0.01) + self._testOptimizersCompatibility(opt_v1, opt_v2) + + def testAdamCompatibility(self): + opt_v1 = optimizer_v1.Adam() + opt_v2 = adam.Adam() + self._testOptimizersCompatibility(opt_v1, opt_v2) + + def testAdamaxCompatibility(self): + opt_v1 = optimizer_v1.Adamax(lr=0.01) + opt_v2 = adamax.Adamax(learning_rate=0.01) + self._testOptimizersCompatibility(opt_v1, opt_v2) + + def testNadamCompatibility(self): + opt_v1 = optimizer_v1.Nadam(lr=0.001) + opt_v2 = nadam.Nadam(learning_rate=0.001) + self._testOptimizersCompatibility(opt_v1, opt_v2) + + def testMomentumCompatibility(self): + opt_v1 = optimizer_v1.SGD(lr=0.01, momentum=0.9) + opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9) + self._testOptimizersCompatibility(opt_v1, opt_v2) + + def testRMSpropCompatibility(self): + opt_v1 = optimizer_v1.RMSprop() + opt_v2 = rmsprop.RMSprop() + self._testOptimizersCompatibility(opt_v1, opt_v2) + + def testSGDCompatibility(self): + opt_v1 = optimizer_v1.SGD(lr=0.01) + opt_v2 = gradient_descent.SGD(learning_rate=0.01) + self._testOptimizersCompatibility(opt_v1, opt_v2, False) + + def testNumericEquivalenceForNesterovMomentum(self): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + np.random.seed(1331) + with test_utils.use_gpu(): + train_samples = 20 + input_dim = 3 + num_classes = 2 + (x, y), _ = test_utils.get_test_data( + train_samples=train_samples, + test_samples=10, + input_shape=(input_dim,), + num_classes=num_classes, + ) + y = np_utils.to_categorical(y) + + num_hidden = 5 + model_k_v1 = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, + num_classes=num_classes, + input_dim=input_dim, + ) + model_k_v2 = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, + num_classes=num_classes, + input_dim=input_dim, + ) + model_k_v2.set_weights(model_k_v1.get_weights()) + model_tf = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, + num_classes=num_classes, + input_dim=input_dim, + ) + model_tf.set_weights(model_k_v2.get_weights()) + + opt_k_v1 = optimizer_v1.SGD(momentum=0.9, nesterov=True) + opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True) + opt_tf = tf.compat.v1.train.MomentumOptimizer( + learning_rate=0.01, momentum=0.9, use_nesterov=True + ) + + model_k_v1.compile( + opt_k_v1, + loss="categorical_crossentropy", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + model_k_v2.compile( + opt_k_v2, + loss="categorical_crossentropy", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + model_tf.compile( + opt_tf, + loss="categorical_crossentropy", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + + hist_k_v1 = model_k_v1.fit( + x, y, batch_size=5, epochs=10, shuffle=False + ) + hist_k_v2 = model_k_v2.fit( + x, y, batch_size=5, epochs=10, shuffle=False + ) + hist_tf = model_tf.fit(x, y, batch_size=5, epochs=10, shuffle=False) + + self.assertAllClose( + model_k_v1.get_weights(), model_tf.get_weights() + ) + 
self.assertAllClose( + model_k_v1.get_weights(), model_k_v2.get_weights() + ) + self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights()) + self.assertAllClose( + hist_k_v1.history["loss"], hist_tf.history["loss"] + ) + self.assertAllClose( + hist_k_v1.history["loss"], hist_k_v2.history["loss"] + ) + + def testNumericEquivalenceForAmsgrad(self): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + np.random.seed(1331) + with test_utils.use_gpu(): + train_samples = 20 + input_dim = 3 + num_classes = 2 + (x, y), _ = test_utils.get_test_data( + train_samples=train_samples, + test_samples=10, + input_shape=(input_dim,), + num_classes=num_classes, + ) + y = np_utils.to_categorical(y) + + num_hidden = 5 + model_k_v1 = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, + num_classes=num_classes, + input_dim=input_dim, + ) + model_k_v2 = test_utils.get_small_sequential_mlp( + num_hidden=num_hidden, + num_classes=num_classes, + input_dim=input_dim, + ) + model_k_v2.set_weights(model_k_v1.get_weights()) + + opt_k_v1 = optimizer_v1.Adam(amsgrad=True) + opt_k_v2 = adam.Adam(amsgrad=True) + + model_k_v1.compile( + opt_k_v1, + loss="categorical_crossentropy", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + model_k_v2.compile( + opt_k_v2, + loss="categorical_crossentropy", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + + hist_k_v1 = model_k_v1.fit( + x, y, batch_size=5, epochs=10, shuffle=False + ) + hist_k_v2 = model_k_v2.fit( + x, y, batch_size=5, epochs=10, shuffle=False + ) + + self.assertAllClose( + model_k_v1.get_weights(), model_k_v2.get_weights() + ) + self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights()) + self.assertAllClose( + hist_k_v1.history["loss"], hist_k_v2.history["loss"] + ) + + +# Note: These tests are kept in a separate class to avoid bugs in some +# distributions of Python that break AutoGraph which is used by tf.function. 
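For orientation before those tests, a minimal standalone sketch of an optimizer step traced inside `tf.function` (assuming TF2's default eager setup; names are illustrative):

import tensorflow as tf

from keras.optimizers.legacy import gradient_descent

var = tf.Variable(1.0)
opt = gradient_descent.SGD(0.1)

@tf.function
def train_step():
    # d(var**2)/d(var) = 2 * var, so one step is 1.0 - 0.1 * 2.0 = 0.8
    opt.minimize(lambda: var * var, [var])
    return var

train_step()  # var is now 0.8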
+@test_combinations.generate(test_combinations.combine(mode=["eager"])) +class OptimizerWithFunctionTest(tf.test.TestCase, parameterized.TestCase): + def testBasic(self): + var = tf.Variable([1.0, 2.0], dtype=tf.float32) + loss = lambda: 3 * var + opt = adam.Adam(learning_rate=1.0) + + @tf.function + def fn(): + opt.minimize(loss, [var]) + return var + + self.assertAllClose([0.0, 1.0], fn(), atol=1e-4) + self.assertAllClose([-1, 0.0], fn(), atol=1e-4) + + def testBasicWithConstantDecay(self): + var = tf.Variable([1.0, 2.0], dtype=tf.float32) + loss = lambda: 3 * var + opt = adam.Adam(learning_rate=1.0) + + @tf.function + def fn(): + opt.minimize(loss, [var]) + return var + + self.assertAllClose([0.0, 1.0], fn(), atol=1e-4) + self.assertAllClose([-1, 0.0], fn(), atol=1e-4) + + def testVarKeyWithVarCreatedInEager(self): + a = tf.Variable([1.0, 2.0], name="var") + b = tf.Variable([1.0], name="var") + + @tf_test_utils.also_run_as_tf_function + def var_key_test(): + self.assertFalse(a._in_graph_mode) + self.assertFalse(b._in_graph_mode) + var_key_a = optimizer_v2._var_key(a) + self.assertStartsWith(var_key_a, "var_") + var_key_b = optimizer_v2._var_key(b) + self.assertStartsWith(var_key_b, "var_") + self.assertNotEqual(var_key_a, var_key_b) + + var_key_test() + + def testLearningRateDecayUsedInTwoFunctions(self): + a = tf.Variable([1.0, 2.0], name="var") + b = tf.Variable([1.0], name="var") + + learning_rate_decay = learning_rate_schedule.InverseTimeDecay( + 0.5, decay_steps=1.0, decay_rate=0.5 + ) + opt = adam.Adam(learning_rate=learning_rate_decay) + loss_a = lambda: 3 * a + loss_b = lambda: 2 * b + + @tf.function + def fn_a(): + opt.minimize(loss_a, [a]) + return a + + @tf.function + def fn_b(): + opt.minimize(loss_b, [b]) + return b + + fn_a() + fn_b() + + +_NUM_LEARNERS = 50 +APPLY_SCOPE = "debug_apply" +ALLOWLIST = [ + # optimizer_v2._deduplicate_indexed_slices contains an indexed slice: + # array_ops.shape(unique_indices)[0] + # which winds up expanding to [0:1:1] thereby creating three constants + # to represent the indices. + ("embeddings/strided_slice/stack", "Const"), +] + + +def get_inputs(op): + op_inputs = list(op.inputs) + op.control_inputs + names = [i.name for i in op_inputs] + op_inputs = [getattr(i, "op", i) for i in op_inputs] + return op_inputs, names + + +def strip_name(node): + if "Placeholder" in node.op: + return + node.name = "" + + +def topological_sort(graph): + graph_ops = graph.get_operations() + + sources = [] + result = [] + + inputs = {} + outputs = collections.defaultdict(set) + for op in graph_ops: + op_inputs = get_inputs(op)[0] + if not op_inputs: + sources.append(op) + + inputs[op] = set(op_inputs) + for i in op_inputs: + outputs[i].add(op) + + while sources: + op = sources.pop() + for op_output in outputs[op]: + inputs[op_output].remove(op) + if not inputs[op_output]: + sources.append(op_output) + + result.append(op) + + # Check correctness. + if len(result) != len(graph_ops): + raise ValueError( + f"Sort result has {len(result)} ops, " + f"source graph has {len(graph_ops)}." + ) + + sort_check_seen = set() + for op in result: + sort_check_seen.add(op) + for i in get_inputs(op)[0]: + assert i in sort_check_seen + + return result + + +def identify_redundant_ops(graph): + """Implements basic common subexpression elimination. + + This is not intended to replicate the graph semantics of TensorFlow Graphs + (for instance it does not handle stateful op ordering), nor is it intended + to replace the common subexpression elimination Grappler pass. 
Rather, it + provides a high level sanity check that clearly redundant ops are not being + created. + + Args: + graph: The graph to be analyzed. + + Returns: + A count of the duplicate ops and a description of the structure of each. + """ + sorted_ops = topological_sort(graph) + duplicates = collections.defaultdict(list) + unified_node_defs = {} + name_map = {} + + for op in sorted_ops: + input_names = [] + for op_input, name in zip(*get_inputs(op)): + input_def = op_input.node_def + + # Operations can have multiple outputs. We track which is used to + # prevent overzealous elimination. + input_def.name = name + + input_def.input[:] = [name_map.get(i, i) for i in input_def.input] + strip_name(input_def) + + # NodeDef.SerializeToString() does not provide identical serialized + # representations for identical NodeDefs, so we instead use string + # representation as a dict key. + key = repr(input_def) + + if key in unified_node_defs: + input_names.append(unified_node_defs[key]) + + else: + unified_node_defs[key] = op_input.name + input_names.append(name) + + node_def = op.node_def + node_def.input[:] = input_names + strip_name(node_def) + + key = repr(node_def) + duplicates[key].append(op) + name_map[op.name] = duplicates[key][0].name + + num_duplicates = 0 + duplicate_types = [] + for standard_def, op_defs in duplicates.items(): + # We are only interested in testing the apply method of the optimizer + op_defs = [i for i in op_defs if APPLY_SCOPE in i.name] + + # We only check for per-apply redundant ops. + if len(op_defs) < _NUM_LEARNERS: + continue + + # Certain ops are simply not worth eliminating, and are instead simply + # ignored. + name, op_type = op_defs[0].name, op_defs[0].type + if any( + allowlisted_scope in name and op_type == allowlisted_type + for allowlisted_scope, allowlisted_type in ALLOWLIST + ): + continue + + num_duplicates += len(op_defs) + traceback = [] + for level in op_defs[0].traceback: + traceback.append(f" {level[0]} {level[2]}:{level[1]}") + + duplicate_types.append( + "# Example name: {}\n# Op creation stack:\n{}\n{}".format( + op_defs[0].name, "\n".join(traceback), standard_def + ) + ) + + return num_duplicates, duplicate_types + + +def make_model(): + r"""Constructs a simple ensemble of weak learners model. + + --------- --------- --------- --------- + | Input | | Input | ... | Input | | Input | + --------- --------- --------- --------- + | | | | + V V V V + --------- --------- --------- --------- + | Embed | | Embed | ... | Embed | | Embed | + --------- --------- --------- --------- + | | | | + V V V V + --------- --------- --------- --------- + | Dense | | Dense | ... | Dense | | Dense | + --------- --------- --------- --------- + \ | | / + \ | | / + --------------------------------------------- + | + --------- + | Dense | + --------- + + This topology is chosen because it exercises both dense and sparse update + paths. + + Returns: + A model for testing optimizer coefficient reuse. 
+ """ + inputs = [] + intermediates = [] + for _ in range(_NUM_LEARNERS): + inp = keras.layers.Input(shape=(1,), dtype=tf.int32) + layer = keras.layers.Embedding(1, 4)(inp) + layer = keras.layers.Dense(1)(layer) + + inputs.append(inp) + intermediates.append(layer) + + layer = keras.layers.Concatenate(axis=-1)(intermediates) + layer = keras.layers.Dense(1)(layer) + + return keras.models.Model(inputs, layer) + + +COEFFICIENT_PARAMS = ( + ("Adadelta", adadelta.Adadelta, None), + ("Adagrad", adagrad.Adagrad, None), + ("Adam", adam.Adam, None), + ("Adam_amdgrad", adam.Adam, dict(amsgrad=True)), + ("Adamax", adamax.Adamax, None), + ("Ftrl", ftrl.Ftrl, None), + ( + "Ftrl_l2_shrinkage", + ftrl.Ftrl, + dict(l2_shrinkage_regularization_strength=0.1), + ), + ("SGD", gradient_descent.SGD, None), + ("SGD_momentum", gradient_descent.SGD, dict(momentum=0.5)), + ("Nadam", nadam.Nadam, None), + ("RMSprop", rmsprop.RMSprop, None), + ("RMSprop_centered", rmsprop.RMSprop, dict(centered=True)), + ("RMSprop_momentum", rmsprop.RMSprop, dict(momentum=0.5)), + ( + "RMSprop_momentum_centered", + rmsprop.RMSprop, + dict(momentum=0.5, centered=True), + ), +) + + +class OptimizerCoefficientTest(test_combinations.TestCase): + @parameterized.named_parameters(*COEFFICIENT_PARAMS) + def test_duplicate_ops(self, optimizer_class, init_kwargs=None): + init_kwargs = init_kwargs or {} + optimizer = optimizer_class(**init_kwargs) + + graph = tf.Graph() + with graph.as_default(): + model = make_model() + trainable_variables = model.trainable_variables + grads = optimizer.get_gradients( + model.outputs[0], trainable_variables + ) + + with backend.name_scope(APPLY_SCOPE): + optimizer.apply_gradients(zip(grads, trainable_variables)) + + num_duplicates, duplicate_types = identify_redundant_ops(graph) + if num_duplicates: + # Avoid spamming logs. + if len(duplicate_types) > 3: + duplicate_types = duplicate_types[:3] + ["..."] + + num_total = len(graph.get_operations()) + raise ValueError( + "{} of {} ({:.1f}%) ops were duplicates:\n\n{}".format( + num_duplicates, + num_total, + num_duplicates / num_total * 100, + "\n".join(duplicate_types), + ) + ) + + @parameterized.named_parameters(*COEFFICIENT_PARAMS) + def test_subclass_compat(self, optimizer_class, init_kwargs=None): + """Ensure that subclassed optimizers without apply_state still work.""" + + class SubclassedOptimizer(optimizer_class): + def _resource_apply_dense(self, grad, var): + return super()._resource_apply_dense(grad, var) + + def _resource_apply_sparse(self, grad, var, indices): + return super()._resource_apply_sparse(grad, var, indices) + + init_kwargs = init_kwargs or {} + optimizer = SubclassedOptimizer(**init_kwargs) + + graph = tf.Graph() + with graph.as_default(): + model = make_model() + trainable_variables = model.trainable_variables + grads = optimizer.get_gradients( + model.outputs[0], trainable_variables + ) + + with backend.name_scope(APPLY_SCOPE): + optimizer.apply_gradients(zip(grads, trainable_variables)) + + +class DeepcopyTests(tf.test.TestCase): + def setUp(self): + self.optimizer = adam.Adam(0.42) + super().setUp() + + def test_deepcopy(self): + clone = deepcopy(self.optimizer) + assert clone.get_config()["learning_rate"] == 0.42, "wrong lr" + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/rmsprop.py b/keras/optimizers/legacy/rmsprop.py index fe1bf7ab1a33..5537de9cc8ab 100644 --- a/keras/optimizers/legacy/rmsprop.py +++ b/keras/optimizers/legacy/rmsprop.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. 
All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,338 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Legacy RMSprop optimizer implementation.""" +"""RMSprop optimizer implementation.""" -from keras.optimizers.optimizer_v2 import rmsprop +import numpy as np +import tensorflow.compat.v2 as tf +from keras import backend_config +from keras.optimizers.legacy import optimizer_v2 + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.optimizers.legacy.RMSprop') -class RMSprop(rmsprop.RMSprop): - pass +@keras_export( + "keras.optimizers.legacy.RMSprop", + v1=["keras.optimizers.RMSprop", "keras.optimizers.legacy.RMSprop"], +) +class RMSprop(optimizer_v2.OptimizerV2): + r"""Optimizer that implements the RMSprop algorithm. + + The gist of RMSprop is to: + + - Maintain a moving (discounted) average of the square of gradients + - Divide the gradient by the root of this average + + This implementation of RMSprop uses plain momentum, not Nesterov momentum. + + The centered version additionally maintains a moving average of the + gradients, and uses that average to estimate the variance. + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to `0.001`. + rho: Discounting factor for the moving average of past gradients. + Defaults to `0.9`. + momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`. + epsilon: A small constant for numerical stability, added to the + denominator to avoid division by zero. Defaults to `1e-7`. + centered: Boolean. If `True`, gradients are normalized by the estimated + variance of the gradient; if `False`, by the uncentered second moment. + Setting this to `True` may help with training, but is slightly more + expensive in terms of computation and memory. Defaults to `False`. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"RMSprop"`. + **kwargs: keyword arguments. Allowed arguments are `clipvalue`, + `clipnorm`, `global_clipnorm`. + If `clipvalue` (float) is set, the gradient of each weight + is clipped to be no higher than this value. + If `clipnorm` (float) is set, the gradient of each weight + is individually clipped so that its norm is no higher than this value. + If `global_clipnorm` (float) is set, the gradient of all weights is + clipped so that their global norm is no higher than this value. + + Note that in the dense implementation of this algorithm, variables and their + corresponding accumulators (momentum, gradient moving average, square + gradient moving average) will be updated even if the gradient is zero + (i.e. accumulators will decay, momentum will be applied).
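In update-rule form, the uncentered, zero-momentum dense path implemented below is (g_t is the gradient at step t):

    rms_t = rho * rms_{t-1} + (1 - rho) * g_t ** 2
    var_t = var_{t-1} - lr * g_t / (sqrt(rms_t) + epsilon)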
The sparse + implementation (used when the gradient is an `IndexedSlices` object, + typically because of `tf.gather` or an embedding lookup in the forward pass) + will not update variable slices or their accumulators unless those slices + were used in the forward pass (nor is there an "eventual" correction to + account for these omitted updates). This leads to more efficient updates for + large embedding lookup tables (where most of the slices are not accessed in + a particular graph execution), but differs from the published algorithm. + + Usage: + + >>> opt = tf.keras.optimizers.legacy.RMSprop(learning_rate=0.1) + >>> var1 = tf.Variable(10.0) + >>> loss = lambda: (var1 ** 2) / 2.0 # d(loss) / d(var1) = var1 + >>> step_count = opt.minimize(loss, [var1]).numpy() + >>> var1.numpy() + 9.683772 + + Reference: + - [Hinton, 2012]( + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) + """ + + _HAS_AGGREGATE_GRAD = True + + def __init__( + self, + learning_rate=0.001, + rho=0.9, + momentum=0.0, + epsilon=1e-7, + centered=False, + name="RMSprop", + **kwargs, + ): + """Construct a new RMSprop optimizer. + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is + a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a + callable that takes no arguments and returns the actual value to + use. The learning rate. Defaults to `0.001`. + rho: Discounting factor for the moving average of past gradients. + Defaults to `0.9`. + momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`. + epsilon: A small constant for numerical stability, added to the + denominator to avoid division by zero. Defaults to `1e-7`. + centered: Boolean. If `True`, gradients are normalized by the + estimated variance of the gradient; if `False`, by the uncentered + second moment. Setting this to `True` may help with training, but + is slightly more expensive in terms of computation and memory. + Defaults to `False`. + name: Optional name prefix for the operations created when applying + gradients. Defaults to "RMSprop". + **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, + `lr`, `decay`}. `clipnorm` clips gradients by norm; `clipvalue` + clips gradients by value; `decay` is included for backward + compatibility to allow time-inverse decay of the learning rate; + `lr` is included for backward compatibility, but using + `learning_rate` instead is recommended. + + @compatibility(eager) + When eager execution is enabled, `learning_rate`, `decay`, `momentum`, + and `epsilon` can each be a callable that takes no arguments and returns + the actual value to use. This can be useful for changing these values + across different invocations of optimizer functions. + @end_compatibility + """ + super().__init__(name, **kwargs) + self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) + self._set_hyper("decay", self._initial_decay) + self._set_hyper("rho", rho) + + self._momentum = False + if ( + isinstance(momentum, tf.Tensor) + or callable(momentum) + or momentum > 0 + ): + self._momentum = True + if isinstance(momentum, (int, float)) and ( + momentum < 0 or momentum > 1 + ): + raise ValueError( + "`momentum` must be in the range [0, 1]. Received: " + f"momentum={momentum} (of type {type(momentum)})."
+ ) + self._set_hyper("momentum", momentum) + + self.epsilon = epsilon or backend_config.epsilon() + self.centered = centered + + def _create_slots(self, var_list): + for var in var_list: + self.add_slot(var, "rms") + if self._momentum: + for var in var_list: + self.add_slot(var, "momentum") + if self.centered: + for var in var_list: + self.add_slot(var, "mg") + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + + rho = tf.identity(self._get_hyper("rho", var_dtype)) + apply_state[(var_device, var_dtype)].update( + dict( + neg_lr_t=-apply_state[(var_device, var_dtype)]["lr_t"], + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + rho=rho, + momentum=tf.identity(self._get_hyper("momentum", var_dtype)), + one_minus_rho=1.0 - rho, + ) + ) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + rms = self.get_slot(var, "rms") + if self._momentum: + mom = self.get_slot(var, "momentum") + if self.centered: + mg = self.get_slot(var, "mg") + return tf.raw_ops.ResourceApplyCenteredRMSProp( + var=var.handle, + mg=mg.handle, + ms=rms.handle, + mom=mom.handle, + lr=coefficients["lr_t"], + rho=coefficients["rho"], + momentum=coefficients["momentum"], + epsilon=coefficients["epsilon"], + grad=grad, + use_locking=self._use_locking, + ) + else: + return tf.raw_ops.ResourceApplyRMSProp( + var=var.handle, + ms=rms.handle, + mom=mom.handle, + lr=coefficients["lr_t"], + rho=coefficients["rho"], + momentum=coefficients["momentum"], + epsilon=coefficients["epsilon"], + grad=grad, + use_locking=self._use_locking, + ) + else: + rms_t = coefficients["rho"] * rms + coefficients[ + "one_minus_rho" + ] * tf.square(grad) + rms_t = tf.compat.v1.assign( + rms, rms_t, use_locking=self._use_locking + ) + denom_t = rms_t + if self.centered: + mg = self.get_slot(var, "mg") + mg_t = ( + coefficients["rho"] * mg + + coefficients["one_minus_rho"] * grad + ) + mg_t = tf.compat.v1.assign( + mg, mg_t, use_locking=self._use_locking + ) + denom_t = rms_t - tf.square(mg_t) + var_t = var - coefficients["lr_t"] * grad / ( + tf.sqrt(denom_t) + coefficients["epsilon"] + ) + return tf.compat.v1.assign( + var, var_t, use_locking=self._use_locking + ).op + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get( + (var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + + rms = self.get_slot(var, "rms") + if self._momentum: + mom = self.get_slot(var, "momentum") + if self.centered: + mg = self.get_slot(var, "mg") + return tf.raw_ops.ResourceSparseApplyCenteredRMSProp( + var=var.handle, + mg=mg.handle, + ms=rms.handle, + mom=mom.handle, + lr=coefficients["lr_t"], + rho=coefficients["rho"], + momentum=coefficients["momentum"], + epsilon=coefficients["epsilon"], + grad=grad, + indices=indices, + use_locking=self._use_locking, + ) + else: + return tf.raw_ops.ResourceSparseApplyRMSProp( + var=var.handle, + ms=rms.handle, + mom=mom.handle, + lr=coefficients["lr_t"], + rho=coefficients["rho"], + momentum=coefficients["momentum"], + epsilon=coefficients["epsilon"], + grad=grad, + indices=indices, + use_locking=self._use_locking, + ) + else: + rms_scaled_g_values = (grad * grad) * coefficients["one_minus_rho"] + rms_t = 
tf.compat.v1.assign( + rms, rms * coefficients["rho"], use_locking=self._use_locking + ) + with tf.control_dependencies([rms_t]): + rms_t = self._resource_scatter_add( + rms, indices, rms_scaled_g_values + ) + rms_slice = tf.gather(rms_t, indices) + denom_slice = rms_slice + if self.centered: + mg = self.get_slot(var, "mg") + mg_scaled_g_values = grad * coefficients["one_minus_rho"] + mg_t = tf.compat.v1.assign( + mg, mg * coefficients["rho"], use_locking=self._use_locking + ) + with tf.control_dependencies([mg_t]): + mg_t = self._resource_scatter_add( + mg, indices, mg_scaled_g_values + ) + mg_slice = tf.gather(mg_t, indices) + denom_slice = rms_slice - tf.square(mg_slice) + var_update = self._resource_scatter_add( + var, + indices, + coefficients["neg_lr_t"] + * grad + / (tf.sqrt(denom_slice) + coefficients["epsilon"]), + ) + if self.centered: + return tf.group(*[var_update, rms_t, mg_t]) + return tf.group(*[var_update, rms_t]) + + def set_weights(self, weights): + params = self.weights + # Override set_weights for backward compatibility of Keras V1 optimizer + # since it does not include iteration at head of the weight list. Set + # iteration to 0. + if len(params) == len(weights) + 1: + weights = [np.array(0)] + weights + super().set_weights(weights) + + def get_config(self): + config = super().get_config() + config.update( + { + "learning_rate": self._serialize_hyperparameter( + "learning_rate" + ), + "decay": self._initial_decay, + "rho": self._serialize_hyperparameter("rho"), + "momentum": self._serialize_hyperparameter("momentum"), + "epsilon": self.epsilon, + "centered": self.centered, + } + ) + return config + + +RMSProp = RMSprop diff --git a/keras/optimizers/legacy/rmsprop_test.py b/keras/optimizers/legacy/rmsprop_test.py new file mode 100644 index 000000000000..f47d3f6b6717 --- /dev/null +++ b/keras/optimizers/legacy/rmsprop_test.py @@ -0,0 +1,814 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
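Condensed to numpy, the dense update that `_resource_apply_dense` above implements (and that the test file below re-derives as its reference) is only a few lines. This is an illustrative paraphrase, not the shipped kernels; the two epsilon placements are deliberate and mirror the fused raw ops (momentum path, epsilon inside the square root) versus the manual assign path (epsilon outside):

```python
import numpy as np

def rmsprop_step(var, g, rms, mg, mom, lr, rho, momentum, eps, centered):
    """One dense RMSprop step, paraphrasing the reference helper below."""
    rms = rho * rms + (1 - rho) * g * g      # second-moment accumulator
    if centered:
        mg = rho * mg + (1 - rho) * g        # first-moment accumulator
        denom = rms - mg * mg                # estimated variance
    else:
        denom = rms                          # uncentered second moment
    if momentum > 0:
        # Fused momentum kernels add epsilon inside the sqrt.
        mom = momentum * mom + lr * g / np.sqrt(denom + eps)
        var = var - mom
    else:
        # The non-momentum path adds epsilon outside the sqrt.
        var = var - lr * g / (np.sqrt(denom) + eps)
    return var, rms, mg, mom
```
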
+# ============================================================================== +"""Tests for rmsprop.""" + +import copy +import itertools +import math + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +from keras.optimizers.legacy import rmsprop +from keras.optimizers.schedules import learning_rate_schedule +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) + +_DATA_TYPES = [tf.half, tf.float32, tf.float64, tf.complex64, tf.complex128] + +_TEST_PARAM_VALUES = [ + # learning_rate, rho, momentum, epsilon, centered + [0.05, 0.9, 0.0, 1e-3, True], + [0.05, 0.9, 0.0, 1e-3, False], + [0.1, 0.9, 0.0, 1e-3, True], + [0.01, 0.9, 0.0, 1e-5, True], + [0.01, 0.9, 0.9, 1e-5, True], +] + +_TESTPARAMS = [ + [data_type] + values + for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES) +] + + +class RMSpropOptimizerTest(tf.test.TestCase, parameterized.TestCase): + def _rmsprop_update_numpy( + self, var, g, mg, rms, mom, lr, rho, momentum, epsilon, centered + ): + rms_t = rms * rho + (1 - rho) * g * g + if centered: + mg_t = mg * rho + (1 - rho) * g + denom_t = rms_t - mg_t * mg_t + else: + mg_t = mg + denom_t = rms_t + if momentum > 0.0: + mom_t = momentum * mom + lr * g / (np.sqrt(denom_t + epsilon)) + var_t = var - mom_t + else: + mom_t = mom + var_t = var - lr * g / (np.sqrt(denom_t) + epsilon) + return var_t, mg_t, rms_t, mom_t + + def _sparse_rmsprop_update_numpy( + self, + var, + gindexs, + gvalues, + mg, + rms, + mom, + lr, + rho, + momentum, + epsilon, + centered, + ): + mg_t = copy.deepcopy(mg) + rms_t = copy.deepcopy(rms) + mom_t = copy.deepcopy(mom) + var_t = copy.deepcopy(var) + for i in range(len(gindexs)): + gindex = gindexs[i] + gvalue = gvalues[i] + rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue + if centered: + mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue + denom_t = rms_t[gindex] - mg_t[gindex] * mg_t[gindex] + else: + denom_t = rms_t[gindex] + if momentum > 0.0: + mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt( + denom_t + epsilon + ) + var_t[gindex] = var[gindex] - mom_t[gindex] + else: + mom_t[gindex] = mom[gindex] + var_t[gindex] = var[gindex] - lr * gvalue / ( + np.sqrt(denom_t) + epsilon + ) + return var_t, mg_t, rms_t, mom_t + + def testDense(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for ( + dtype, + learning_rate, + rho, + momentum, + epsilon, + centered, + ) in _TESTPARAMS: + with tf.compat.v1.get_default_graph().as_default(), test_utils.use_gpu(): # noqa: E501 + # Initialize variables for numpy implementation. 
+ var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np, dtype=dtype) + var1 = tf.Variable(var1_np, dtype=dtype) + grads0 = tf.constant(grads0_np, dtype=dtype) + grads1 = tf.constant(grads1_np, dtype=dtype) + opt = rmsprop.RMSprop( + learning_rate=learning_rate, + rho=rho, + momentum=momentum, + epsilon=epsilon, + centered=centered, + ) + + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + if centered: + mg0 = opt.get_slot(var0, "mg") + mg1 = opt.get_slot(var1, "mg") + else: + mg0 = None + mg1 = None + + if momentum > 0.0: + mom0 = opt.get_slot(var0, "momentum") + mom1 = opt.get_slot(var1, "momentum") + else: + mom0 = None + mom1 = None + + rms0 = opt.get_slot(var0, "rms") + self.assertIsNotNone(rms0) + rms1 = opt.get_slot(var1, "rms") + self.assertIsNotNone(rms1) + + mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of RMSprop + for _ in range(1, 4): + self.evaluate(update) + + ( + var0_np, + mg0_np, + rms0_np, + mom0_np, + ) = self._rmsprop_update_numpy( + var0_np, + grads0_np, + mg0_np, + rms0_np, + mom0_np, + learning_rate, + rho, + momentum, + epsilon, + centered, + ) + ( + var1_np, + mg1_np, + rms1_np, + mom1_np, + ) = self._rmsprop_update_numpy( + var1_np, + grads1_np, + mg1_np, + rms1_np, + mom1_np, + learning_rate, + rho, + momentum, + epsilon, + centered, + ) + + # Validate updated params + if centered: + self.assertAllCloseAccordingToType( + mg0_np, self.evaluate(mg0) + ) + self.assertAllCloseAccordingToType( + mg1_np, self.evaluate(mg1) + ) + if momentum > 0.0: + self.assertAllCloseAccordingToType( + mom0_np, self.evaluate(mom0) + ) + self.assertAllCloseAccordingToType( + mom1_np, self.evaluate(mom1) + ) + self.assertAllCloseAccordingToType( + rms0_np, self.evaluate(rms0) + ) + self.assertAllCloseAccordingToType( + rms1_np, self.evaluate(rms1) + ) + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + def testDenseWithLearningRateDecay(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
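The graph-mode loop above cross-checks three optimizer steps against `_rmsprop_update_numpy`. For a quick eager spot check of the same arithmetic, a one-step sketch (assuming the class is exported as `tf.keras.optimizers.legacy.RMSprop`, as its docstring shows) would be:

```python
import numpy as np
import tensorflow as tf

lr, rho, eps = 0.05, 0.9, 1e-3
var = tf.Variable([1.0, 2.0])
grad = tf.constant([0.1, 0.2])

opt = tf.keras.optimizers.legacy.RMSprop(
    learning_rate=lr, rho=rho, epsilon=eps
)
opt.apply_gradients([(grad, var)])

# Same step in numpy: the accumulator starts at zero.
g = np.array([0.1, 0.2])
rms = (1 - rho) * g * g
expected = np.array([1.0, 2.0]) - lr * g / (np.sqrt(rms) + eps)
np.testing.assert_allclose(var.numpy(), expected, rtol=1e-5)
```
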
+ with tf.Graph().as_default(): + var0_np = np.array([1.0, 2.0]) + grads0_np = np.array([0.1, 0.2]) + var1_np = np.array([3.0, 4.0]) + grads1_np = np.array([0.01, 0.2]) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + learning_rate = 0.01 + rho = 0.9 + momentum = 0.0 + epsilon = 1e-7 + centered = False + decay = 0.5 + opt = rmsprop.RMSprop( + learning_rate=learning_rate, + rho=rho, + momentum=momentum, + epsilon=epsilon, + centered=centered, + decay=decay, + ) + + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + rms0 = opt.get_slot(var0, "rms") + self.assertIsNotNone(rms0) + rms1 = opt.get_slot(var1, "rms") + self.assertIsNotNone(rms1) + if momentum > 0.0: + mom0 = opt.get_slot(var0, "momentum") + mom1 = opt.get_slot(var1, "momentum") + else: + mom0 = None + mom1 = None + + mg0_np = np.array([0.0, 0.0]) + mg1_np = np.array([0.0, 0.0]) + rms0_np = np.array([0.0, 0.0]) + rms1_np = np.array([0.0, 0.0]) + mom0_np = np.array([0.0, 0.0]) + mom1_np = np.array([0.0, 0.0]) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 2 steps of RMSprop + for t in range(2): + self.evaluate(update) + + lr = learning_rate / (1 + decay * t) + var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( + var0_np, + grads0_np, + mg0_np, + rms0_np, + mom0_np, + lr, + rho, + momentum, + epsilon, + centered, + ) + var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy( + var1_np, + grads1_np, + mg1_np, + rms1_np, + mom1_np, + lr, + rho, + momentum, + epsilon, + centered, + ) + + # Validate updated params + self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) + self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) + if momentum > 0.0: + self.assertAllCloseAccordingToType( + mom0_np, self.evaluate(mom0) + ) + self.assertAllCloseAccordingToType( + mom1_np, self.evaluate(mom1) + ) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + + def testDenseWithLearningRateInverseTimeDecay(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode.
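`testDenseWithLearningRateDecay` above models the legacy `decay` argument as `lr / (1 + decay * t)`; the `testDenseWithLearningRateInverseTimeDecay` body that follows drives the identical curve through an explicit `InverseTimeDecay` schedule with `decay_steps=1.0`. A sketch of the first few rates both tests feed into the numpy reference:

```python
learning_rate, decay = 0.01, 0.5
# legacy `decay=` argument:          lr / (1 + decay * t)
# InverseTimeDecay(decay_steps=1.0): lr / (1 + decay_rate * t / 1.0)
lrs = [learning_rate / (1 + decay * t) for t in range(3)]
print(lrs)  # [0.01, 0.00666..., 0.005]
```
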
+ with tf.Graph().as_default(): + var0_np = np.array([1.0, 2.0]) + grads0_np = np.array([0.1, 0.2]) + var1_np = np.array([3.0, 4.0]) + grads1_np = np.array([0.01, 0.2]) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + learning_rate = 0.01 + rho = 0.9 + momentum = 0.0 + epsilon = 1e-7 + centered = False + decay = 0.5 + lr_schedule = learning_rate_schedule.InverseTimeDecay( + learning_rate, decay_steps=1.0, decay_rate=decay + ) + opt = rmsprop.RMSprop( + learning_rate=lr_schedule, + rho=rho, + momentum=momentum, + epsilon=epsilon, + centered=centered, + ) + + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + rms0 = opt.get_slot(var0, "rms") + self.assertIsNotNone(rms0) + rms1 = opt.get_slot(var1, "rms") + self.assertIsNotNone(rms1) + if momentum > 0.0: + mom0 = opt.get_slot(var0, "momentum") + mom1 = opt.get_slot(var1, "momentum") + else: + mom0 = None + mom1 = None + + mg0_np = np.array([0.0, 0.0]) + mg1_np = np.array([0.0, 0.0]) + rms0_np = np.array([0.0, 0.0]) + rms1_np = np.array([0.0, 0.0]) + mom0_np = np.array([0.0, 0.0]) + mom1_np = np.array([0.0, 0.0]) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 2 steps of RMSprop + for t in range(2): + self.evaluate(update) + + lr = learning_rate / (1 + decay * t) + var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( + var0_np, + grads0_np, + mg0_np, + rms0_np, + mom0_np, + lr, + rho, + momentum, + epsilon, + centered, + ) + var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy( + var1_np, + grads1_np, + mg1_np, + rms1_np, + mom1_np, + lr, + rho, + momentum, + epsilon, + centered, + ) + + # Validate updated params + self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) + self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) + if momentum > 0.0: + self.assertAllCloseAccordingToType( + mom0_np, self.evaluate(mom0) + ) + self.assertAllCloseAccordingToType( + mom1_np, self.evaluate(mom1) + ) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + + def testMinimizeSparseResourceVariable(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + + def loss(): + pred = tf.matmul( + tf.compat.v1.nn.embedding_lookup([var0], [0]), x + ) + return pred * pred + + sgd_op = rmsprop.RMSprop( + learning_rate=1.0, + rho=0.0, + momentum=0.0, + epsilon=0.0, + centered=False, + ).minimize(loss, var_list=[var0]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllCloseAccordingToType( + [[1.0, 2.0]], self.evaluate(var0) + ) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[0.0, 1.0]], self.evaluate(var0), atol=0.01 + ) + + def testMinimizeSparseResourceVariableCentered(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode.
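The `[[0.0, 1.0]]` expectation in `testMinimizeSparseResourceVariable` above can be verified by hand: with `rho=0.0` and `epsilon=0.0` the RMSprop step collapses to a unit step in the direction of the gradient's sign. A scalar check of that arithmetic:

```python
import numpy as np

var0 = np.array([[1.0, 2.0]])
x = np.array([[4.0], [5.0]])
pred = var0 @ x               # [[14.]]
grad = 2 * pred * x.T         # d(pred**2)/d(var0) = [[112., 140.]]
# rho=0     -> rms_t = grad**2
# epsilon=0 -> step  = lr * grad / sqrt(grad**2) = sign(grad)
print(var0 - np.sign(grad))   # [[0., 1.]], the asserted value
```
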
+ with tf.Graph().as_default(): + for dtype in _DATA_TYPES: + var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) + x = tf.constant([[4.0], [5.0]], dtype=dtype) + + def loss(): + pred = tf.matmul( + tf.compat.v1.nn.embedding_lookup([var0], [0]), x + ) + return pred * pred + + # loss = lambda: pred * pred + # disable=cell-var-from-loop + sgd_op = rmsprop.RMSprop( + learning_rate=1.0, + rho=0.0, + momentum=0.0, + epsilon=1.0, + centered=True, + ).minimize(loss, var_list=[var0]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllCloseAccordingToType( + [[1.0, 2.0]], self.evaluate(var0) + ) + # Run 1 step of sgd + self.evaluate(sgd_op) + # Validate updated params + self.assertAllCloseAccordingToType( + [[-111, -138]], self.evaluate(var0), atol=0.01 + ) + + def testSparse(self): + # TODO(tanzheny, omalleyt): Fix test in eager mode. + for ( + dtype, + learning_rate, + rho, + momentum, + epsilon, + centered, + ) in _TESTPARAMS: + with tf.compat.v1.get_default_graph().as_default(), test_utils.use_gpu(): # noqa: E501 + # Initialize variables for numpy implementation. + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0_np_indices = np.array([0], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np), + tf.constant(grads0_np_indices), + tf.constant([1]), + ) + grads1_np_indices = np.array([1], dtype=np.int32) + grads1 = tf.IndexedSlices( + tf.constant(grads1_np), + tf.constant(grads1_np_indices), + tf.constant([1]), + ) + opt = rmsprop.RMSprop( + learning_rate=learning_rate, + rho=rho, + momentum=momentum, + epsilon=epsilon, + centered=centered, + ) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + + if centered: + mg0 = opt.get_slot(var0, "mg") + self.assertEqual(mg0 is not None, centered) + mg1 = opt.get_slot(var1, "mg") + self.assertEqual(mg1 is not None, centered) + else: + mg0 = None + mg1 = None + rms0 = opt.get_slot(var0, "rms") + self.assertIsNotNone(rms0) + rms1 = opt.get_slot(var1, "rms") + self.assertIsNotNone(rms1) + if momentum > 0.0: + mom0 = opt.get_slot(var0, "momentum") + mom1 = opt.get_slot(var1, "momentum") + else: + mom0 = None + mom1 = None + + mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of RMSprop + for _ in range(1, 4): + self.evaluate(update) + + ( + var0_np, + mg0_np, + rms0_np, + mom0_np, + ) = self._sparse_rmsprop_update_numpy( + var0_np, + grads0_np_indices, + grads0_np, + mg0_np, + rms0_np, + mom0_np, + learning_rate, + rho, + momentum, + epsilon, + centered, + ) + ( + var1_np, + mg1_np, + rms1_np, + mom1_np, + ) = self._sparse_rmsprop_update_numpy( + var1_np, + grads1_np_indices, + grads1_np, + mg1_np, + rms1_np, + mom1_np, + learning_rate, + rho, + momentum, + epsilon, + 
centered, + ) + + # Validate updated params + if centered: + self.assertAllCloseAccordingToType( + mg0_np, self.evaluate(mg0) + ) + self.assertAllCloseAccordingToType( + mg1_np, self.evaluate(mg1) + ) + self.assertAllCloseAccordingToType( + rms0_np, self.evaluate(rms0) + ) + self.assertAllCloseAccordingToType( + rms1_np, self.evaluate(rms1) + ) + if momentum > 0.0: + self.assertAllCloseAccordingToType( + mom0_np, self.evaluate(mom0) + ) + self.assertAllCloseAccordingToType( + mom1_np, self.evaluate(mom1) + ) + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0) + ) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1) + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testCallableParams(self): + for dtype in _DATA_TYPES: + var0 = tf.Variable([1.0, 2.0], dtype=dtype) + var1 = tf.Variable([3.0, 4.0], dtype=dtype) + grads0 = tf.constant([0.1, 0.1], dtype=dtype) + grads1 = tf.constant([0.01, 0.01], dtype=dtype) + + learning_rate = lambda: 2.0 + rho = lambda: 0.9 + momentum = lambda: 0.0 + epsilon = 1.0 + opt = rmsprop.RMSprop(learning_rate, rho, momentum, epsilon) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + # Step 1: the rms accumulators were 1. So we should see a normal + # update: v -= grad * learning_rate + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + # Check the parameters. + self.assertAllCloseAccordingToType( + np.array( + [ + 1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)), + 2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)), + ] + ), + self.evaluate(var0), + ) + self.assertAllCloseAccordingToType( + np.array( + [ + 3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)), + 4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)), + ] + ), + self.evaluate(var1), + ) + # Step 2: the root mean square accumulators contain the previous + # update. + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + # Check the parameters. + self.assertAllCloseAccordingToType( + np.array( + [ + 1.0 + - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) + - (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0)), + 2.0 + - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) + - (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0)), + ] + ), + self.evaluate(var0), + ) + self.assertAllCloseAccordingToType( + np.array( + [ + 3.0 + - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) + - (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0)), + 4.0 + - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) + - (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0)), + ] + ), + self.evaluate(var1), + ) + + def testConstructRMSpropWithLR(self): + opt = rmsprop.RMSprop(lr=1.0) + opt_2 = rmsprop.RMSprop(learning_rate=0.1, lr=1.0) + opt_3 = rmsprop.RMSprop(learning_rate=0.1) + self.assertIsInstance(opt.lr, tf.Variable) + self.assertIsInstance(opt_2.lr, tf.Variable) + self.assertIsInstance(opt_3.lr, tf.Variable) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(opt.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) + self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testSlotsUniqueEager(self): + v1 = tf.Variable(1.0) + v2 = tf.Variable(1.0) + + opt = rmsprop.RMSprop(1.0, momentum=0.0, centered=False) + opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) + # There should be iteration, and one unique slot variable for v1 and v2.
+ self.assertLen(set({id(v) for v in opt.variables()}), 3) + self.assertEqual( + self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations) + ) + + opt = rmsprop.RMSprop(learning_rate=1.0, momentum=0.2, centered=False) + opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) + # There should be iteration, and two unique slot variables for v1 and + # v2. + self.assertLen(set({id(v) for v in opt.variables()}), 5) + self.assertEqual( + self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations) + ) + + opt = rmsprop.RMSprop(learning_rate=1.0, momentum=0.2, centered=True) + opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) + # There should be iteration, and three unique slot variables for v1 and + # v2 + self.assertLen(set({id(v) for v in opt.variables()}), 7) + self.assertEqual( + self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations) + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testMomentumProperValue(self): + with self.assertRaisesRegex( + ValueError, + r"`momentum` must be between \[0, 1\]. " + r"Received: momentum=2.5 \(of type <class 'float'>\).", + ): + rmsprop.RMSprop(1.0, momentum=2.5, centered=False) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class SlotColocationTest(tf.test.TestCase, parameterized.TestCase): + @parameterized.parameters([True, False]) + @tf_test_utils.run_gpu_only + def testRunMinimizeOnGPUForCPUVariables(self, use_resource): + with tf.device("/device:CPU:0"): + if use_resource: + var0 = tf.Variable([1.0, 2.0], dtype=tf.float32) + var1 = tf.Variable([3.0, 4.0], dtype=tf.float32) + else: + var0 = tf.Variable([1.0, 2.0], dtype=tf.float32) + var1 = tf.Variable([3.0, 4.0], dtype=tf.float32) + + def loss(): + return 5 * var0 + 3 * var1 + + opt = rmsprop.RMSprop( + learning_rate=1.0, decay=0.9, momentum=0.5, epsilon=1.0 + ) + + # Fetch params to validate initial values + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 1 step through optimizer on GPU. + # Slot variables are created the first time optimizer is used on some + # variable. This tests that slot variables will be colocated with the + # base variable. + with tf.device("/device:GPU:0"): + # Note that for eager execution, minimize expects a function instead + # of a Tensor. + opt_op = opt.minimize(loss, [var0, var1]) + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.evaluate(opt_op) + + # Validate updated params. All variables should have decreased. + self.assertTrue( + all(v < 0.0 for v in self.evaluate(var0)), + msg=f"updated variables: {self.evaluate(var0)}", + ) + self.assertTrue( + all(v < 2.0 for v in self.evaluate(var1)), + msg=f"updated variables: {self.evaluate(var1)}", + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/legacy/sgd.py b/keras/optimizers/legacy/sgd.py deleted file mode 100644 index b53744adbc8e..000000000000 --- a/keras/optimizers/legacy/sgd.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
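The 3/5/7 counts asserted by `testSlotsUniqueEager` above follow from `_create_slots`: each trainable variable always gets an `rms` slot, plus a `momentum` slot when `momentum > 0` and an `mg` slot when `centered=True`, and the optimizer always owns a single `iterations` counter. A sketch of that arithmetic (the helper name is illustrative, not part of the API):

```python
def expected_optimizer_variables(n_vars, momentum, centered):
    # rms slot always, momentum/mg slots conditionally, per variable.
    slots_per_var = 1 + (momentum > 0) + bool(centered)
    return 1 + slots_per_var * n_vars  # +1 for `iterations`

assert expected_optimizer_variables(2, momentum=0.0, centered=False) == 3
assert expected_optimizer_variables(2, momentum=0.2, centered=False) == 5
assert expected_optimizer_variables(2, momentum=0.2, centered=True) == 7
```
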
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Legacy SGD optimizer implementation.""" - -from keras.optimizers.optimizer_v2 import gradient_descent - -from tensorflow.python.util.tf_export import keras_export - - -@keras_export('keras.optimizers.legacy.SGD') -class SGD(gradient_descent.SGD): - pass diff --git a/keras/optimizers/legacy_learning_rate_decay.py b/keras/optimizers/legacy_learning_rate_decay.py index 34afbd4f4c4c..8d8c217cecdf 100644 --- a/keras/optimizers/legacy_learning_rate_decay.py +++ b/keras/optimizers/legacy_learning_rate_decay.py @@ -14,754 +14,800 @@ # ============================================================================== """Various learning rate decay functions.""" +import functools + import tensorflow.compat.v2 as tf -import functools from keras.optimizers.schedules import learning_rate_schedule + +# isort: off from tensorflow.python.util.tf_export import tf_export @tf_export(v1=["train.exponential_decay"]) -def exponential_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies exponential decay to the learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies an exponential decay function - to a provided initial learning rate. It requires a `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - decayed_learning_rate = learning_rate * - decay_rate ^ (global_step / decay_steps) - ``` - - If the argument `staircase` is `True`, then `global_step / decay_steps` is an - integer division and the decayed learning rate follows a staircase function. - - Example: decay every 100000 steps with a base of 0.96: - - ```python - ... - global_step = tf.Variable(0, trainable=False) - starter_learning_rate = 0.1 - learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, - global_step, - 100000, 0.96, staircase=True) - # Passing global_step to minimize() will increment it at each step. - learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. Must not be negative. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must - be positive. See the decay computation above. - decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The decay rate. - staircase: Boolean. If `True` decay the learning rate at discrete intervals - name: String. Optional name of the operation. Defaults to - 'ExponentialDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. 
- - Raises: - ValueError: if `global_step` is not supplied. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.ExponentialDecay( - learning_rate, decay_steps, decay_rate, staircase=staircase, name=name) - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +def exponential_decay( + learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None, +): + """Applies exponential decay to the learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies an exponential decay function + to a provided initial learning rate. It requires a `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + + ```python + decayed_learning_rate = learning_rate * + decay_rate ^ (global_step / decay_steps) + ``` + + If the argument `staircase` is `True`, then `global_step / decay_steps` is + an integer division and the decayed learning rate follows a staircase + function. + + Example: decay every 100000 steps with a base of 0.96: + + ```python + ... + global_step = tf.Variable(0, trainable=False) + starter_learning_rate = 0.1 + learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, + global_step, + 100000, 0.96, staircase=True) + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python + number. The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. Must not be negative. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must + be positive. See the decay computation above. + decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number. + The decay rate. + staircase: Boolean. If `True` decay the learning rate at discrete + intervals + name: String. Optional name of the operation. Defaults to + 'ExponentialDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions. 
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.ExponentialDecay( + learning_rate, decay_steps, decay_rate, staircase=staircase, name=name + ) + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr @tf_export(v1=["train.piecewise_constant_decay", "train.piecewise_constant"]) def piecewise_constant(x, boundaries, values, name=None): - """Piecewise constant from boundaries and interval values. - - Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 - for the next 10000 steps, and 0.1 for any additional steps. - - ```python - global_step = tf.Variable(0, trainable=False) - boundaries = [100000, 110000] - values = [1.0, 0.5, 0.1] - learning_rate = tf.compat.v1.train.piecewise_constant(global_step, boundaries, - values) - - # Later, whenever we perform an optimization step, we increment global_step. - ``` - - Args: - x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, - `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. - boundaries: A list of `Tensor`s or `int`s or `float`s with strictly - increasing entries, and with all elements having the same type as `x`. - values: A list of `Tensor`s or `float`s or `int`s that specifies the values - for the intervals defined by `boundaries`. It should have one more element - than `boundaries`, and all elements should have the same type. - name: A string. Optional name of the operation. Defaults to - 'PiecewiseConstant'. - - Returns: - A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, - `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., - and values[-1] when `x > boundaries[-1]`. - - Raises: - ValueError: if types of `x` and `boundaries` do not match, or types of all - `values` do not match or - the number of elements in the lists does not match. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - boundaries = tf.nest.map_structure(tf.convert_to_tensor, - tf.nest.flatten(boundaries)) - values = tf.nest.map_structure(tf.convert_to_tensor, - tf.nest.flatten(values)) - x_recomp = tf.convert_to_tensor(x) - # Avoid explicit conversion to x's dtype. This could result in faulty - # comparisons, for example if floats are converted to integers. - for i, b in enumerate(boundaries): - if b.dtype.base_dtype != x_recomp.dtype.base_dtype: - # We can promote int32 boundaries to int64 without loss of precision. - # This covers the most common case where the user passes in boundaries - # as an array of Python integers. 
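Each rewritten v1 wrapper in this file follows the shape of `exponential_decay` above: construct a `LearningRateSchedule`, then either call it with `global_step` (graph mode) or defer the call via `functools.partial` (eager mode). A minimal sketch of the same dispatch against the public schedule class:

```python
import functools
import tensorflow.compat.v2 as tf

schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.1, decay_steps=100000, decay_rate=0.96, staircase=True
)
global_step = tf.Variable(0, trainable=False)

if tf.executing_eagerly():
    # A zero-argument callable; re-reads `global_step` on every call.
    decayed_lr = functools.partial(schedule, global_step)
    print(float(decayed_lr()))  # 0.1 at step 0
else:
    decayed_lr = schedule(global_step)  # a Tensor wired into the graph
```
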
- if (b.dtype.base_dtype == tf.int32 and - x_recomp.dtype.base_dtype == tf.int64): - b = tf.cast(b, x_recomp.dtype.base_dtype) - boundaries[i] = b - else: - raise ValueError( - f"`boundaries` ({b.dtype.base_dtype}) must have the same dtype as " - f"x ({x_recomp.dtype.base_dtype}).") - for v in values[1:]: - if v.dtype.base_dtype != values[0].dtype.base_dtype: - raise ValueError( - f"`values` must have elements all with the same dtype " - f"({values[0].dtype.base_dtype} vs {v.dtype.base_dtype}).") - decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( - boundaries, values, name=name) - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(x) - else: - decayed_lr = functools.partial(decayed_lr, x) - return decayed_lr + """Piecewise constant from boundaries and interval values. + + Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 + for the next 10000 steps, and 0.1 for any additional steps. + + ```python + global_step = tf.Variable(0, trainable=False) + boundaries = [100000, 110000] + values = [1.0, 0.5, 0.1] + learning_rate = tf.compat.v1.train.piecewise_constant( + global_step, boundaries, values) + + # Later, whenever we perform an optimization step, we increment global_step. + ``` + + Args: + x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, + `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. + boundaries: A list of `Tensor`s or `int`s or `float`s with strictly + increasing entries, and with all elements having the same type as `x`. + values: A list of `Tensor`s or `float`s or `int`s that specifies the + values for the intervals defined by `boundaries`. It should have one + more element than `boundaries`, and all elements should have the same + type. + name: A string. Optional name of the operation. Defaults to + 'PiecewiseConstant'. + + Returns: + A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, + `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., + and values[-1] when `x > boundaries[-1]`. + + Raises: + ValueError: if types of `x` and `boundaries` do not match, or types of all + `values` do not match or + the number of elements in the lists does not match. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions. + @end_compatibility + """ + boundaries = tf.nest.map_structure( + tf.convert_to_tensor, tf.nest.flatten(boundaries) + ) + values = tf.nest.map_structure( + tf.convert_to_tensor, tf.nest.flatten(values) + ) + x_recomp = tf.convert_to_tensor(x) + # Avoid explicit conversion to x's dtype. This could result in faulty + # comparisons, for example if floats are converted to integers. + for i, b in enumerate(boundaries): + if b.dtype.base_dtype != x_recomp.dtype.base_dtype: + # We can promote int32 boundaries to int64 without loss of + # precision. This covers the most common case where the user passes + # in boundaries as an array of Python integers. + if ( + b.dtype.base_dtype == tf.int32 + and x_recomp.dtype.base_dtype == tf.int64 + ): + b = tf.cast(b, x_recomp.dtype.base_dtype) + boundaries[i] = b + else: + raise ValueError( + f"`boundaries` ({b.dtype.base_dtype}) must have the same " + f"dtype as x ({x_recomp.dtype.base_dtype})." 
+ ) + for v in values[1:]: + if v.dtype.base_dtype != values[0].dtype.base_dtype: + raise ValueError( + "`values` must have elements all with the same dtype " + f"({values[0].dtype.base_dtype} vs {v.dtype.base_dtype})." + ) + decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( + boundaries, values, name=name + ) + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(x) + else: + decayed_lr = functools.partial(decayed_lr, x) + return decayed_lr @tf_export(v1=["train.polynomial_decay"]) -def polynomial_decay(learning_rate, - global_step, - decay_steps, - end_learning_rate=0.0001, - power=1.0, - cycle=False, - name=None): - """Applies a polynomial decay to the learning rate. - - It is commonly observed that a monotonically decreasing learning rate, whose - degree of change is carefully chosen, results in a better performing model. - This function applies a polynomial decay function to a provided initial - `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. - - It requires a `global_step` value to compute the decayed learning rate. You - can just pass a TensorFlow variable that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - global_step = min(global_step, decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ (power) + - end_learning_rate - - ``` - - If `cycle` is True then a multiple of `decay_steps` is used, the first one - that is bigger than `global_steps`. - - ```python - decay_steps = decay_steps * ceil(global_step / decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ (power) + - end_learning_rate - - ``` - - Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5): - - ```python - ... - global_step = tf.Variable(0, trainable=False) - starter_learning_rate = 0.1 - end_learning_rate = 0.01 - decay_steps = 10000 - learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, - global_step, - decay_steps, end_learning_rate, - power=0.5) - # Passing global_step to minimize() will increment it at each step. - learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. Must not be negative. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must - be positive. See the decay computation above. - end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python - number. The minimal end learning rate. - power: A scalar `float32` or `float64` `Tensor` or a Python number. The - power of the polynomial. Defaults to linear, 1.0. - cycle: A boolean, whether or not it should cycle beyond decay_steps. - name: String. Optional name of the operation. Defaults to - 'PolynomialDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - - Raises: - ValueError: if `global_step` is not supplied. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. 
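The boundary semantics documented for `piecewise_constant` above (`values[0]` up to and including the first boundary, `values[-1]` past the last) reduce to a few lines of plain Python, shown here purely as a mental model of the schedule:

```python
def piecewise(x, boundaries, values):
    # values[0] for x <= boundaries[0]; values[i+1] for
    # boundaries[i] < x <= boundaries[i+1]; values[-1] past the end.
    for boundary, value in zip(boundaries, values):
        if x <= boundary:
            return value
    return values[-1]

assert piecewise(100000, [100000, 110000], [1.0, 0.5, 0.1]) == 1.0
assert piecewise(100001, [100000, 110000], [1.0, 0.5, 0.1]) == 0.5
assert piecewise(120000, [100000, 110000], [1.0, 0.5, 0.1]) == 0.1
```
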
- @end_compatibility - """ - decayed_lr = learning_rate_schedule.PolynomialDecay( - learning_rate, - decay_steps, - end_learning_rate=end_learning_rate, - power=power, - cycle=cycle, - name=name) - - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +def polynomial_decay( + learning_rate, + global_step, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + name=None, +): + """Applies a polynomial decay to the learning rate. + + It is commonly observed that a monotonically decreasing learning rate, whose + degree of change is carefully chosen, results in a better performing model. + This function applies a polynomial decay function to a provided initial + `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. + + It requires a `global_step` value to compute the decayed learning rate. You + can just pass a TensorFlow variable that you increment at each training + step. + + The function returns the decayed learning rate. It is computed as: + + ```python + global_step = min(global_step, decay_steps) + decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + + ``` + + If `cycle` is True then a multiple of `decay_steps` is used, the first one + that is bigger than `global_steps`. + + ```python + decay_steps = decay_steps * ceil(global_step / decay_steps) + decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + + ``` + + Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5): + + ```python + ... + global_step = tf.Variable(0, trainable=False) + starter_learning_rate = 0.1 + end_learning_rate = 0.01 + decay_steps = 10000 + learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, + global_step, + decay_steps, end_learning_rate, + power=0.5) + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python + number. The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. Must not be negative. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must + be positive. See the decay computation above. + end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python + number. The minimal end learning rate. + power: A scalar `float32` or `float64` `Tensor` or a Python number. The + power of the polynomial. Defaults to `1.0`. + cycle: A boolean, whether it should cycle beyond decay_steps. Defaults to + `False`. + name: String. Optional name of the operation. Defaults to + 'PolynomialDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions. 
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.PolynomialDecay( + learning_rate, + decay_steps, + end_learning_rate=end_learning_rate, + power=power, + cycle=cycle, + name=name, + ) + + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr @tf_export(v1=["train.natural_exp_decay"]) -def natural_exp_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies natural exponential decay to the initial learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies an exponential decay function - to a provided initial learning rate. It requires an `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - decayed_learning_rate = learning_rate * exp(-decay_rate * global_step / - decay_step) - ``` - - or, if `staircase` is `True`, as: - - ```python - decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step / - decay_step)) - ``` - - Example: decay exponentially with a base of 0.96: - - ```python - ... - global_step = tf.Variable(0, trainable=False) - learning_rate = 0.1 - decay_steps = 5 - k = 0.5 - learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate, - global_step, - decay_steps, k) - - # Passing global_step to minimize() will increment it at each step. - learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A Python number. Global step to use for the decay computation. - Must not be negative. - decay_steps: How often to apply decay. - decay_rate: A Python number. The decay rate. - staircase: Whether to apply decay in a discrete staircase, as opposed to - continuous, fashion. - name: String. Optional name of the operation. Defaults to - 'ExponentialTimeDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - - Raises: - ValueError: if `global_step` is not supplied. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - natural_exp_rate = tf.exp(tf.negative(decay_rate)) - decayed_lr = learning_rate_schedule.ExponentialDecay( - learning_rate, - decay_steps, - natural_exp_rate, - staircase=staircase, - name=name) - - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +def natural_exp_decay( + learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None, +): + """Applies natural exponential decay to the initial learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies an exponential decay + function to a provided initial learning rate. It requires an `global_step` + value to compute the decayed learning rate. 
You can just pass a TensorFlow + variable that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + + ```python + decayed_learning_rate = learning_rate * exp(-decay_rate * global_step / + decay_step) + ``` + + or, if `staircase` is `True`, as: + + ```python + decayed_learning_rate = learning_rate * exp(-decay_rate * \ + floor(global_step / decay_step)) + ``` + + Example: decay exponentially with a base of 0.96: + + ```python + ... + global_step = tf.Variable(0, trainable=False) + learning_rate = 0.1 + decay_steps = 5 + k = 0.5 + learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate, + global_step, + decay_steps, k) + + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python + number. The initial learning rate. + global_step: A Python number. Global step to use for the decay + computation. Must not be negative. + decay_steps: How often to apply decay. + decay_rate: A Python number. The decay rate. + staircase: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. + name: String. Optional name of the operation. Defaults to + 'ExponentialTimeDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions. + @end_compatibility + """ + natural_exp_rate = tf.exp(tf.negative(decay_rate)) + decayed_lr = learning_rate_schedule.ExponentialDecay( + learning_rate, + decay_steps, + natural_exp_rate, + staircase=staircase, + name=name, + ) + + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr @tf_export(v1=["train.inverse_time_decay"]) -def inverse_time_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies inverse time decay to the initial learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies an inverse decay function - to a provided initial learning rate. It requires an `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / - decay_step) - ``` - - or, if `staircase` is `True`, as: - - ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / - decay_step)) - ``` - - Example: decay 1/t with a rate of 0.5: - - ```python - ... - global_step = tf.Variable(0, trainable=False) - learning_rate = 0.1 - decay_steps = 1.0 - decay_rate = 0.5 - learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate, - global_step, - decay_steps, decay_rate) - - # Passing global_step to minimize() will increment it at each step. 
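The one non-obvious step in `natural_exp_decay` above is `natural_exp_rate = tf.exp(tf.negative(decay_rate))`: reusing `ExponentialDecay` works because `exp(-k * t / decay_steps)` equals `rate ** (t / decay_steps)` with `rate = exp(-k)`. A quick numeric check of that identity:

```python
import math

lr0, k, decay_steps = 0.1, 0.5, 5
for t in (0, 3, 5, 10):
    natural = lr0 * math.exp(-k * t / decay_steps)
    via_exponential_decay = lr0 * math.exp(-k) ** (t / decay_steps)
    assert abs(natural - via_exponential_decay) < 1e-12
```
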
- learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A Python number. Global step to use for the decay computation. - Must not be negative. - decay_steps: How often to apply decay. - decay_rate: A Python number. The decay rate. - staircase: Whether to apply decay in a discrete staircase, as opposed to - continuous, fashion. - name: String. Optional name of the operation. Defaults to - 'InverseTimeDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - - Raises: - ValueError: if `global_step` is not supplied. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.InverseTimeDecay( - learning_rate, decay_steps, decay_rate, staircase=staircase, name=name) - - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +def inverse_time_decay( + learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None, +): + """Applies inverse time decay to the initial learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies an inverse decay function + to a provided initial learning rate. It requires an `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + + ```python + decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / + decay_step) + ``` + + or, if `staircase` is `True`, as: + + ```python + decayed_learning_rate = learning_rate / (1 + decay_rate * \ + floor(global_step / decay_step)) + ``` + + Example: decay 1/t with a rate of 0.5: + + ```python + ... + global_step = tf.Variable(0, trainable=False) + learning_rate = 0.1 + decay_steps = 1.0 + decay_rate = 0.5 + learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate, + global_step, + decay_steps, decay_rate) + + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python + number. The initial learning rate. + global_step: A Python number. Global step to use for the decay + computation. Must not be negative. + decay_steps: How often to apply decay. + decay_rate: A Python number. The decay rate. + staircase: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. + name: String. Optional name of the operation. Defaults to + 'InverseTimeDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. 
This can be useful for + changing the learning rate value across different invocations of optimizer + functions. + @end_compatibility + """ + decayed_lr = learning_rate_schedule.InverseTimeDecay( + learning_rate, decay_steps, decay_rate, staircase=staircase, name=name + ) + + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr @tf_export(v1=["train.cosine_decay"]) def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): - """Applies cosine decay to the learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a cosine decay function - to a provided initial learning rate. It requires a `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - ```python - global_step = min(global_step, decay_steps) - cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps)) - decayed = (1 - alpha) * cosine_decay + alpha - decayed_learning_rate = learning_rate * decayed - ``` - - Example usage: - ```python - decay_steps = 1000 - lr_decayed = cosine_decay(learning_rate, global_step, decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number - of steps to decay over. - alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum - learning rate value as a fraction of learning_rate. - name: String. Optional name of the operation. Defaults to 'CosineDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - Raises: - ValueError: if `global_step` is not supplied. - - References: - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.CosineDecay( - learning_rate, decay_steps, alpha=alpha, name=name) - - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr + """Applies cosine decay to the learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a cosine decay function + to a provided initial learning rate. It requires a `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate.
It is computed as: + ```python + global_step = min(global_step, decay_steps) + cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps)) + decayed = (1 - alpha) * cosine_decay + alpha + decayed_learning_rate = learning_rate * decayed + ``` + + Example usage: + ```python + decay_steps = 1000 + lr_decayed = cosine_decay(learning_rate, global_step, decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Number of steps to decay over. + alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum + learning rate value as a fraction of learning_rate. + name: String. Optional name of the operation. Defaults to 'CosineDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied. + + References: + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions. + @end_compatibility + """ + decayed_lr = learning_rate_schedule.CosineDecay( + learning_rate, decay_steps, alpha=alpha, name=name + ) + + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr @tf_export(v1=["train.cosine_decay_restarts"]) -def cosine_decay_restarts(learning_rate, - global_step, - first_decay_steps, - t_mul=2.0, - m_mul=1.0, - alpha=0.0, - name=None): - """Applies cosine decay with restarts to the learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a cosine decay function with - restarts to a provided initial learning rate. It requires a `global_step` - value to compute the decayed learning rate. You can just pass a TensorFlow - variable that you increment at each training step. - - The function returns the decayed learning rate while taking into account - possible warm restarts. The learning rate multiplier first decays - from 1 to `alpha` for `first_decay_steps` steps. Then, a warm - restart is performed. Each new warm restart runs for `t_mul` times more steps - and with `m_mul` times smaller initial learning rate. - - Example usage: - ```python - first_decay_steps = 1000 - lr_decayed = cosine_decay_restarts(learning_rate, global_step, - first_decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. - first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. - Number of steps to decay over. - t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. Used to - derive the number of iterations in the i-th period - m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
- Used to derive the initial learning rate of the i-th period: - alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum - learning rate value as a fraction of the learning_rate. - name: String. Optional name of the operation. Defaults to 'SGDRDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - Raises: - ValueError: if `global_step` is not supplied. - - References: - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.CosineDecayRestarts( - learning_rate, - first_decay_steps, - t_mul=t_mul, - m_mul=m_mul, - alpha=alpha, - name=name) - - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +def cosine_decay_restarts( + learning_rate, + global_step, + first_decay_steps, + t_mul=2.0, + m_mul=1.0, + alpha=0.0, + name=None, +): + """Applies cosine decay with restarts to the learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a cosine decay function with + restarts to a provided initial learning rate. It requires a `global_step` + value to compute the decayed learning rate. You can just pass a TensorFlow + variable that you increment at each training step. + + The function returns the decayed learning rate while taking into account + possible warm restarts. The learning rate multiplier first decays + from 1 to `alpha` for `first_decay_steps` steps. Then, a warm restart is + performed. Each new warm restart runs for `t_mul` times more steps and with + `m_mul` times smaller initial learning rate. + + Example usage: + ```python + first_decay_steps = 1000 + lr_decayed = cosine_decay_restarts(learning_rate, global_step, + first_decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. + first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python + number. Number of steps to decay over. + t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. Used + to derive the number of iterations in the i-th period. + m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. + Used to derive the initial learning rate of the i-th period. + alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum + learning rate value as a fraction of the learning_rate. + name: String. Optional name of the operation. Defaults to 'SGDRDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied.
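The restart bookkeeping is easiest to see in plain Python; this sketch mirrors the NumPy reference implementation used by `CosineDecayRestartsTest` further down in this diff (when no restart has occurred yet it reduces to plain cosine decay):

```python
import math


def cosine_decay_restarts_value(step, first_decay_steps, t_mul=2.0,
                                m_mul=1.0, alpha=0.0):
    # Illustrative helper, not part of the patch. Consume completed periods:
    # each new period is t_mul times longer, and each restart scales the
    # starting multiplier by m_mul.
    fac, decay_steps = 1.0, first_decay_steps
    while step >= decay_steps:
        step -= decay_steps
        decay_steps *= t_mul
        fac *= m_mul
    completed_fraction = step / decay_steps
    decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
    return (1.0 - alpha) * decay + alpha
```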
+ + References: + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions. + @end_compatibility + """ + decayed_lr = learning_rate_schedule.CosineDecayRestarts( + learning_rate, + first_decay_steps, + t_mul=t_mul, + m_mul=m_mul, + alpha=alpha, + name=name, + ) + + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr @tf_export(v1=["train.linear_cosine_decay"]) -def linear_cosine_decay(learning_rate, - global_step, - decay_steps, - num_periods=0.5, - alpha=0.0, - beta=0.001, - name=None): - """Applies linear cosine decay to the learning rate. - - Note that linear cosine decay is more aggressive than cosine decay and - larger initial learning rates can typically be used. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a linear cosine decay function - to a provided initial learning rate. It requires a `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - ```python - global_step = min(global_step, decay_steps) - linear_decay = (decay_steps - global_step) / decay_steps) - cosine_decay = 0.5 * ( - 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) - decayed = (alpha + linear_decay) * cosine_decay + beta - decayed_learning_rate = learning_rate * decayed - ``` - - Example usage: - ```python - decay_steps = 1000 - lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number - of steps to decay over. - num_periods: Number of periods in the cosine part of the decay. See - computation above. - alpha: See computation above. - beta: See computation above. - name: String. Optional name of the operation. Defaults to - 'LinearCosineDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - Raises: - ValueError: if `global_step` is not supplied. - - References: - Neural Optimizer Search with Reinforcement Learning: - [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) - ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions.
- @end_compatibility - """ - decayed_lr = learning_rate_schedule.LinearCosineDecay( - learning_rate, - decay_steps, - num_periods=num_periods, - alpha=alpha, - beta=beta, - name=name) - - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +def linear_cosine_decay( + learning_rate, + global_step, + decay_steps, + num_periods=0.5, + alpha=0.0, + beta=0.001, + name=None, +): + """Applies linear cosine decay to the learning rate. + + Note that linear cosine decay is more aggressive than cosine decay and + larger initial learning rates can typically be used. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a linear cosine decay + function to a provided initial learning rate. It requires a `global_step` + value to compute the decayed learning rate. You can just pass a TensorFlow + variable that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + ```python + global_step = min(global_step, decay_steps) + linear_decay = (decay_steps - global_step) / decay_steps + cosine_decay = 0.5 * ( + 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) + decayed = (alpha + linear_decay) * cosine_decay + beta + decayed_learning_rate = learning_rate * decayed + ``` + + Example usage: + ```python + decay_steps = 1000 + lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Number of steps to decay over. + num_periods: Number of periods in the cosine part of the decay. See + computation above. + alpha: See computation above. + beta: See computation above. + name: String. Optional name of the operation. Defaults to + 'LinearCosineDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied. + + References: + Neural Optimizer Search with Reinforcement Learning: + [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) + ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions.
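For a concrete reading of the formula, this plain-Python sketch mirrors `np_linear_cosine_decay` from the test file below (with the defaults, the value falls from roughly `1 + beta` at step 0 to `beta` at `decay_steps`):

```python
import math


def linear_cosine_decay_value(step, decay_steps, num_periods=0.5,
                              alpha=0.0, beta=0.001):
    # Illustrative helper, not part of the patch: a linear ramp down,
    # modulated by a cosine with num_periods oscillations, plus a floor beta.
    step = min(step, decay_steps)
    linear_decayed = (decay_steps - step) / decay_steps
    fraction = 2.0 * num_periods * step / decay_steps
    cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction))
    return (alpha + linear_decayed) * cosine_decayed + beta
```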
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.LinearCosineDecay( + learning_rate, + decay_steps, + num_periods=num_periods, + alpha=alpha, + beta=beta, + name=name, + ) + + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr @tf_export(v1=["train.noisy_linear_cosine_decay"]) -def noisy_linear_cosine_decay(learning_rate, - global_step, - decay_steps, - initial_variance=1.0, - variance_decay=0.55, - num_periods=0.5, - alpha=0.0, - beta=0.001, - name=None): - """Applies noisy linear cosine decay to the learning rate. - - Note that linear cosine decay is more aggressive than cosine decay and - larger initial learning rates can typically be used. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a noisy linear - cosine decay function to a provided initial learning rate. - It requires a `global_step` value to compute the decayed learning rate. - You can just pass a TensorFlow variable that you increment at each - training step. - - The function returns the decayed learning rate. It is computed as: - ```python - global_step = min(global_step, decay_steps) - linear_decay = (decay_steps - global_step) / decay_steps) - cosine_decay = 0.5 * ( - 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) - decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta - decayed_learning_rate = learning_rate * decayed - ``` - where eps_t is 0-centered gaussian noise with variance - initial_variance / (1 + global_step) ** variance_decay - - Example usage: - ```python - decay_steps = 1000 - lr_decayed = noisy_linear_cosine_decay( - learning_rate, global_step, decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number - of steps to decay over. - initial_variance: initial variance for the noise. See computation above. - variance_decay: decay for the noise's variance. See computation above. - num_periods: Number of periods in the cosine part of the decay. See - computation above. - alpha: See computation above. - beta: See computation above. - name: String. Optional name of the operation. Defaults to - 'NoisyLinearCosineDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - Raises: - ValueError: if `global_step` is not supplied. - - References: - Neural Optimizer Search with Reinforcement Learning: - [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) - ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx¬eId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. 
- @end_compatibility - """ - decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay( - learning_rate, - decay_steps, - initial_variance=initial_variance, - variance_decay=variance_decay, - num_periods=num_periods, - alpha=alpha, - beta=beta, - name=name) - - if not tf.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +def noisy_linear_cosine_decay( + learning_rate, + global_step, + decay_steps, + initial_variance=1.0, + variance_decay=0.55, + num_periods=0.5, + alpha=0.0, + beta=0.001, + name=None, +): + """Applies noisy linear cosine decay to the learning rate. + + Note that linear cosine decay is more aggressive than cosine decay and + larger initial learning rates can typically be used. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a noisy linear + cosine decay function to a provided initial learning rate. + It requires a `global_step` value to compute the decayed learning rate. + You can just pass a TensorFlow variable that you increment at each + training step. + + The function returns the decayed learning rate. It is computed as: + ```python + global_step = min(global_step, decay_steps) + linear_decay = (decay_steps - global_step) / decay_steps + cosine_decay = 0.5 * ( + 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) + decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta + decayed_learning_rate = learning_rate * decayed + ``` + where eps_t is 0-centered Gaussian noise with variance + initial_variance / (1 + global_step) ** variance_decay + + Example usage: + ```python + decay_steps = 1000 + lr_decayed = noisy_linear_cosine_decay( + learning_rate, global_step, decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Number of steps to decay over. + initial_variance: initial variance for the noise. See computation above. + variance_decay: decay for the noise's variance. See computation above. + num_periods: Number of periods in the cosine part of the decay. See + computation above. + alpha: See computation above. + beta: See computation above. + name: String. Optional name of the operation. Defaults to + 'NoisyLinearCosineDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied. + + References: + Neural Optimizer Search with Reinforcement Learning: + [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) + ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for + changing the learning rate value across different invocations of optimizer + functions.
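The noise term `eps_t` is the only difference from the deterministic linear cosine schedule above. A seeded NumPy sketch (illustrative only; the real implementation draws the noise at the op level):

```python
import math

import numpy as np

rng = np.random.default_rng(0)  # seeded only to keep the sketch repeatable


def noisy_linear_cosine_decay_value(step, decay_steps, initial_variance=1.0,
                                    variance_decay=0.55, num_periods=0.5,
                                    alpha=0.0, beta=0.001):
    # Illustrative helper, not part of the patch.
    step = min(step, decay_steps)
    variance = initial_variance / (1.0 + step) ** variance_decay
    eps_t = rng.normal(0.0, math.sqrt(variance))  # the docstring's eps_t
    linear_decayed = (decay_steps - step) / decay_steps
    fraction = 2.0 * num_periods * step / decay_steps
    cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction))
    return (alpha + linear_decayed + eps_t) * cosine_decayed + beta
```

Because of `eps_t`, the tests for this schedule below only check that it evaluates, not its numeric value.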
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay( + learning_rate, + decay_steps, + initial_variance=initial_variance, + variance_decay=variance_decay, + num_periods=num_periods, + alpha=alpha, + beta=beta, + name=name, + ) + + if not tf.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr diff --git a/keras/optimizers/legacy_learning_rate_decay_test.py b/keras/optimizers/legacy_learning_rate_decay_test.py index 7c93d1efeaea..d0322426560c 100644 --- a/keras/optimizers/legacy_learning_rate_decay_test.py +++ b/keras/optimizers/legacy_learning_rate_decay_test.py @@ -14,459 +14,479 @@ # ============================================================================== """Functional test for learning rate decay.""" +import math + import tensorflow.compat.v2 as tf -import math from keras.testing_infra import test_combinations @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LRDecayTest(test_combinations.TestCase): - - def testContinuous(self): - self.evaluate(tf.compat.v1.global_variables_initializer()) - step = 5 - decayed_lr = tf.compat.v1.train.exponential_decay(0.05, step, 10, 0.96) - expected = .05 * 0.96**(5.0 / 10.0) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testStaircase(self): - if tf.executing_eagerly(): - step = tf.Variable(0) - self.evaluate(tf.compat.v1.global_variables_initializer()) - decayed_lr = tf.compat.v1.train.exponential_decay( - .1, step, 3, 0.96, staircase=True) - - # No change to learning rate due to staircase - expected = .1 - self.evaluate(step.assign(1)) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - expected = .1 - self.evaluate(step.assign(2)) - self.assertAllClose(self.evaluate(decayed_lr), .1, 1e-6) - - # Decayed learning rate - expected = .1 * 0.96 ** (100 // 3) - self.evaluate(step.assign(100)) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testVariables(self): - step = tf.Variable(1) - - decayed_lr = tf.compat.v1.train.exponential_decay( - .1, step, 3, 0.96, staircase=True) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # No change to learning rate - assign_1 = step.assign(1) - if not tf.executing_eagerly(): - self.evaluate(assign_1.op) - self.assertAllClose(self.evaluate(decayed_lr), .1, 1e-6) - assign_2 = step.assign(2) - if not tf.executing_eagerly(): - self.evaluate(assign_2.op) - self.assertAllClose(self.evaluate(decayed_lr), .1, 1e-6) - # Decayed learning rate - assign_100 = step.assign(100) - if not tf.executing_eagerly(): - self.evaluate(assign_100.op) - expected = .1 * 0.96**(100 // 3) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testPiecewiseConstant(self): - x = tf.Variable(-999) - decayed_lr = tf.compat.v1.train.piecewise_constant( - x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001]) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - - self.assertAllClose(self.evaluate(decayed_lr), 1.0, 1e-6) - self.evaluate(x.assign(100)) - self.assertAllClose(self.evaluate(decayed_lr), 1.0, 1e-6) - self.evaluate(x.assign(105)) - self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6) - self.evaluate(x.assign(110)) - self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6) - self.evaluate(x.assign(120)) - self.assertAllClose(self.evaluate(decayed_lr), 0.01, 1e-6) - self.evaluate(x.assign(999)) - self.assertAllClose(self.evaluate(decayed_lr), 0.001, 1e-6) - - def 
testPiecewiseConstantEdgeCases(self): - x_int = tf.Variable(0, dtype=tf.int32) - boundaries, values = [-1.0, 1.0], [1, 2, 3] - with self.assertRaises(ValueError): - decayed_lr = tf.compat.v1.train.piecewise_constant( - x_int, boundaries, values) - if tf.executing_eagerly(): - decayed_lr() - - x = tf.Variable(0.0) - boundaries, values = [-1.0, 1.0], [1.0, 2, 3] - with self.assertRaises(ValueError): - decayed_lr = tf.compat.v1.train.piecewise_constant( - x, boundaries, values) - if tf.executing_eagerly(): - decayed_lr() - - # Test that ref types are valid. - if not tf.executing_eagerly(): - x = tf.compat.v1.Variable(0.0, use_resource=False) - x_ref = x.op.outputs[0] # float32_ref tensor should be accepted - boundaries, values = [1.0, 2.0], [1, 2, 3] - tf.compat.v1.train.piecewise_constant(x_ref, boundaries, values) - - # Test casting boundaries from int32 to int64. - x_int64 = tf.Variable(0, dtype=tf.int64) - boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7] - decayed_lr = tf.compat.v1.train.piecewise_constant( - x_int64, boundaries, values) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6) - self.evaluate(x_int64.assign(1)) - self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6) - self.evaluate(x_int64.assign(2)) - self.assertAllClose(self.evaluate(decayed_lr), 0.5, 1e-6) - self.evaluate(x_int64.assign(3)) - self.assertAllClose(self.evaluate(decayed_lr), 0.6, 1e-6) - self.evaluate(x_int64.assign(4)) - self.assertAllClose(self.evaluate(decayed_lr), 0.7, 1e-6) + def testContinuous(self): + self.evaluate(tf.compat.v1.global_variables_initializer()) + step = 5 + decayed_lr = tf.compat.v1.train.exponential_decay(0.05, step, 10, 0.96) + expected = 0.05 * 0.96 ** (5.0 / 10.0) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testStaircase(self): + if tf.executing_eagerly(): + step = tf.Variable(0) + self.evaluate(tf.compat.v1.global_variables_initializer()) + decayed_lr = tf.compat.v1.train.exponential_decay( + 0.1, step, 3, 0.96, staircase=True + ) + + # No change to learning rate due to staircase + expected = 0.1 + self.evaluate(step.assign(1)) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + expected = 0.1 + self.evaluate(step.assign(2)) + self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6) + + # Decayed learning rate + expected = 0.1 * 0.96 ** (100 // 3) + self.evaluate(step.assign(100)) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testVariables(self): + step = tf.Variable(1) + + decayed_lr = tf.compat.v1.train.exponential_decay( + 0.1, step, 3, 0.96, staircase=True + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + # No change to learning rate + assign_1 = step.assign(1) + if not tf.executing_eagerly(): + self.evaluate(assign_1.op) + self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6) + assign_2 = step.assign(2) + if not tf.executing_eagerly(): + self.evaluate(assign_2.op) + self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6) + # Decayed learning rate + assign_100 = step.assign(100) + if not tf.executing_eagerly(): + self.evaluate(assign_100.op) + expected = 0.1 * 0.96 ** (100 // 3) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testPiecewiseConstant(self): + x = tf.Variable(-999) + decayed_lr = tf.compat.v1.train.piecewise_constant( + x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001] + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + + 
self.assertAllClose(self.evaluate(decayed_lr), 1.0, 1e-6) + self.evaluate(x.assign(100)) + self.assertAllClose(self.evaluate(decayed_lr), 1.0, 1e-6) + self.evaluate(x.assign(105)) + self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6) + self.evaluate(x.assign(110)) + self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6) + self.evaluate(x.assign(120)) + self.assertAllClose(self.evaluate(decayed_lr), 0.01, 1e-6) + self.evaluate(x.assign(999)) + self.assertAllClose(self.evaluate(decayed_lr), 0.001, 1e-6) + + def testPiecewiseConstantEdgeCases(self): + x_int = tf.Variable(0, dtype=tf.int32) + boundaries, values = [-1.0, 1.0], [1, 2, 3] + with self.assertRaises(ValueError): + decayed_lr = tf.compat.v1.train.piecewise_constant( + x_int, boundaries, values + ) + if tf.executing_eagerly(): + decayed_lr() + + x = tf.Variable(0.0) + boundaries, values = [-1.0, 1.0], [1.0, 2, 3] + with self.assertRaises(ValueError): + decayed_lr = tf.compat.v1.train.piecewise_constant( + x, boundaries, values + ) + if tf.executing_eagerly(): + decayed_lr() + + # Test that ref types are valid. + if not tf.executing_eagerly(): + x = tf.compat.v1.Variable(0.0, use_resource=False) + x_ref = x.op.outputs[0] # float32_ref tensor should be accepted + boundaries, values = [1.0, 2.0], [1, 2, 3] + tf.compat.v1.train.piecewise_constant(x_ref, boundaries, values) + + # Test casting boundaries from int32 to int64. + x_int64 = tf.Variable(0, dtype=tf.int64) + boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7] + decayed_lr = tf.compat.v1.train.piecewise_constant( + x_int64, boundaries, values + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6) + self.evaluate(x_int64.assign(1)) + self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6) + self.evaluate(x_int64.assign(2)) + self.assertAllClose(self.evaluate(decayed_lr), 0.5, 1e-6) + self.evaluate(x_int64.assign(3)) + self.assertAllClose(self.evaluate(decayed_lr), 0.6, 1e-6) + self.evaluate(x_int64.assign(4)) + self.assertAllClose(self.evaluate(decayed_lr), 0.7, 1e-6) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class LinearDecayTest(test_combinations.TestCase): - - def testHalfWay(self): - step = 5 - lr = 0.05 - end_lr = 0.0 - decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, end_lr) - expected = lr * 0.5 - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testEnd(self): - step = 10 - lr = 0.05 - end_lr = 0.001 - decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, end_lr) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testHalfWayWithEnd(self): - step = 5 - lr = 0.05 - end_lr = 0.001 - decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, end_lr) - expected = (lr + end_lr) * 0.5 - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testBeyondEnd(self): - step = 15 - lr = 0.05 - end_lr = 0.001 - decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, end_lr) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testBeyondEndWithCycle(self): - step = 15 - lr = 0.05 - end_lr = 0.001 - decayed_lr = tf.compat.v1.train.polynomial_decay( - lr, step, 10, end_lr, cycle=True) - expected = (lr - end_lr) * 0.25 + end_lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + def testHalfWay(self): + step = 5 + lr = 0.05 + end_lr = 0.0 + decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, 
end_lr) + expected = lr * 0.5 + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testEnd(self): + step = 10 + lr = 0.05 + end_lr = 0.001 + decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, end_lr) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testHalfWayWithEnd(self): + step = 5 + lr = 0.05 + end_lr = 0.001 + decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, end_lr) + expected = (lr + end_lr) * 0.5 + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testBeyondEnd(self): + step = 15 + lr = 0.05 + end_lr = 0.001 + decayed_lr = tf.compat.v1.train.polynomial_decay(lr, step, 10, end_lr) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testBeyondEndWithCycle(self): + step = 15 + lr = 0.05 + end_lr = 0.001 + decayed_lr = tf.compat.v1.train.polynomial_decay( + lr, step, 10, end_lr, cycle=True + ) + expected = (lr - end_lr) * 0.25 + end_lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class SqrtDecayTest(test_combinations.TestCase): - - def testHalfWay(self): - step = 5 - lr = 0.05 - end_lr = 0.0 - power = 0.5 - decayed_lr = tf.compat.v1.train.polynomial_decay( - lr, step, 10, end_lr, power=power) - expected = lr * 0.5**power - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testEnd(self): - step = 10 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = tf.compat.v1.train.polynomial_decay( - lr, step, 10, end_lr, power=power) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testHalfWayWithEnd(self): - step = 5 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = tf.compat.v1.train.polynomial_decay( - lr, step, 10, end_lr, power=power) - expected = (lr - end_lr) * 0.5**power + end_lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testBeyondEnd(self): - step = 15 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = tf.compat.v1.train.polynomial_decay( - lr, step, 10, end_lr, power=power) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testBeyondEndWithCycle(self): - step = 15 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = tf.compat.v1.train.polynomial_decay( - lr, step, 10, end_lr, power=power, cycle=True) - expected = (lr - end_lr) * 0.25**power + end_lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + def testHalfWay(self): + step = 5 + lr = 0.05 + end_lr = 0.0 + power = 0.5 + decayed_lr = tf.compat.v1.train.polynomial_decay( + lr, step, 10, end_lr, power=power + ) + expected = lr * 0.5**power + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testEnd(self): + step = 10 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = tf.compat.v1.train.polynomial_decay( + lr, step, 10, end_lr, power=power + ) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testHalfWayWithEnd(self): + step = 5 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = tf.compat.v1.train.polynomial_decay( + lr, step, 10, end_lr, power=power + ) + expected = (lr - end_lr) * 0.5**power + end_lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testBeyondEnd(self): + step = 15 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = tf.compat.v1.train.polynomial_decay( + lr, step, 10, end_lr, power=power + 
) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testBeyondEndWithCycle(self): + step = 15 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = tf.compat.v1.train.polynomial_decay( + lr, step, 10, end_lr, power=power, cycle=True + ) + expected = (lr - end_lr) * 0.25**power + end_lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class PolynomialDecayTest(test_combinations.TestCase): - - def testBeginWithCycle(self): - lr = 0.001 - decay_steps = 10 - step = 0 - decayed_lr = tf.compat.v1.train.polynomial_decay( - lr, step, decay_steps, cycle=True) - expected = lr - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + def testBeginWithCycle(self): + lr = 0.001 + decay_steps = 10 + step = 0 + decayed_lr = tf.compat.v1.train.polynomial_decay( + lr, step, decay_steps, cycle=True + ) + expected = lr + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class ExponentialDecayTest(test_combinations.TestCase): - - def testDecay(self): - initial_lr = 0.1 - k = 10 - decay_rate = 0.96 - step = tf.Variable(0) - decayed_lr = tf.compat.v1.train.natural_exp_decay(initial_lr, step, k, - decay_rate) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - for i in range(k + 1): - expected = initial_lr * math.exp(-i / k * decay_rate) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - self.evaluate(step.assign_add(1)) - - def testStaircase(self): - initial_lr = 0.1 - k = 10 - decay_rate = 0.96 - step = tf.Variable(0) - decayed_lr = tf.compat.v1.train.natural_exp_decay( - initial_lr, step, k, decay_rate, staircase=True) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - for i in range(k + 1): - expected = initial_lr * math.exp(-decay_rate * (i // k)) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - self.evaluate(step.assign_add(1)) + def testDecay(self): + initial_lr = 0.1 + k = 10 + decay_rate = 0.96 + step = tf.Variable(0) + decayed_lr = tf.compat.v1.train.natural_exp_decay( + initial_lr, step, k, decay_rate + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + for i in range(k + 1): + expected = initial_lr * math.exp(-i / k * decay_rate) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + self.evaluate(step.assign_add(1)) + + def testStaircase(self): + initial_lr = 0.1 + k = 10 + decay_rate = 0.96 + step = tf.Variable(0) + decayed_lr = tf.compat.v1.train.natural_exp_decay( + initial_lr, step, k, decay_rate, staircase=True + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + for i in range(k + 1): + expected = initial_lr * math.exp(-decay_rate * (i // k)) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + self.evaluate(step.assign_add(1)) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class InverseDecayTest(test_combinations.TestCase): - - def testDecay(self): - initial_lr = 0.1 - k = 10 - decay_rate = 0.96 - step = tf.Variable(0) - decayed_lr = tf.compat.v1.train.inverse_time_decay(initial_lr, step, k, - decay_rate) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - for i in range(k + 1): - expected = initial_lr / (1 + i / k * decay_rate) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - self.evaluate(step.assign_add(1)) - - def testStaircase(self): - initial_lr = 0.1 - k = 10 - 
decay_rate = 0.96 - step = tf.Variable(0) - decayed_lr = tf.compat.v1.train.inverse_time_decay( - initial_lr, step, k, decay_rate, staircase=True) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - for i in range(k + 1): - expected = initial_lr / (1 + decay_rate * (i // k)) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - self.evaluate(step.assign_add(1)) + def testDecay(self): + initial_lr = 0.1 + k = 10 + decay_rate = 0.96 + step = tf.Variable(0) + decayed_lr = tf.compat.v1.train.inverse_time_decay( + initial_lr, step, k, decay_rate + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + for i in range(k + 1): + expected = initial_lr / (1 + i / k * decay_rate) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + self.evaluate(step.assign_add(1)) + + def testStaircase(self): + initial_lr = 0.1 + k = 10 + decay_rate = 0.96 + step = tf.Variable(0) + decayed_lr = tf.compat.v1.train.inverse_time_decay( + initial_lr, step, k, decay_rate, staircase=True + ) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + for i in range(k + 1): + expected = initial_lr / (1 + decay_rate * (i // k)) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + self.evaluate(step.assign_add(1)) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CosineDecayTest(test_combinations.TestCase): - - def np_cosine_decay(self, step, decay_steps, alpha=0.0): - step = min(step, decay_steps) - completed_fraction = step / decay_steps - decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) - return (1.0 - alpha) * decay + alpha - - def testDecay(self): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.cosine_decay(initial_lr, step, - num_training_steps) - expected = self.np_cosine_decay(step, num_training_steps) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testAlpha(self): - num_training_steps = 1000 - initial_lr = 1.0 - alpha = 0.1 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.cosine_decay(initial_lr, step, - num_training_steps, alpha) - expected = self.np_cosine_decay(step, num_training_steps, alpha) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + def np_cosine_decay(self, step, decay_steps, alpha=0.0): + step = min(step, decay_steps) + completed_fraction = step / decay_steps + decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) + return (1.0 - alpha) * decay + alpha + + def testDecay(self): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.cosine_decay( + initial_lr, step, num_training_steps + ) + expected = self.np_cosine_decay(step, num_training_steps) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testAlpha(self): + num_training_steps = 1000 + initial_lr = 1.0 + alpha = 0.1 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.cosine_decay( + initial_lr, step, num_training_steps, alpha + ) + expected = self.np_cosine_decay(step, num_training_steps, alpha) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CosineDecayRestartsTest(test_combinations.TestCase): - - def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0, - alpha=0.0): - fac = 1.0 - while step >= decay_steps: - step -= decay_steps - decay_steps *= t_mul - fac *= m_mul - - 
completed_fraction = step / decay_steps - decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) - return (1.0 - alpha) * decay + alpha - - def testDecay(self): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.cosine_decay_restarts( - initial_lr, step, num_training_steps) - expected = self.np_cosine_decay_restarts(step, num_training_steps) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testAlpha(self): - num_training_steps = 1000 - initial_lr = 1.0 - alpha = 0.1 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.cosine_decay_restarts( - initial_lr, step, num_training_steps, alpha=alpha) - expected = self.np_cosine_decay_restarts( - step, num_training_steps, alpha=alpha) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testMMul(self): - num_training_steps = 1000 - initial_lr = 1.0 - m_mul = 0.9 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.cosine_decay_restarts( - initial_lr, step, num_training_steps, m_mul=m_mul) - expected = self.np_cosine_decay_restarts( - step, num_training_steps, m_mul=m_mul) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testTMul(self): - num_training_steps = 1000 - initial_lr = 1.0 - t_mul = 1.0 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.cosine_decay_restarts( - initial_lr, step, num_training_steps, t_mul=t_mul) - expected = self.np_cosine_decay_restarts( - step, num_training_steps, t_mul=t_mul) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + def np_cosine_decay_restarts( + self, step, decay_steps, t_mul=2.0, m_mul=1.0, alpha=0.0 + ): + fac = 1.0 + while step >= decay_steps: + step -= decay_steps + decay_steps *= t_mul + fac *= m_mul + + completed_fraction = step / decay_steps + decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) + return (1.0 - alpha) * decay + alpha + + def testDecay(self): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.cosine_decay_restarts( + initial_lr, step, num_training_steps + ) + expected = self.np_cosine_decay_restarts(step, num_training_steps) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testAlpha(self): + num_training_steps = 1000 + initial_lr = 1.0 + alpha = 0.1 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.cosine_decay_restarts( + initial_lr, step, num_training_steps, alpha=alpha + ) + expected = self.np_cosine_decay_restarts( + step, num_training_steps, alpha=alpha + ) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testMMul(self): + num_training_steps = 1000 + initial_lr = 1.0 + m_mul = 0.9 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.cosine_decay_restarts( + initial_lr, step, num_training_steps, m_mul=m_mul + ) + expected = self.np_cosine_decay_restarts( + step, num_training_steps, m_mul=m_mul + ) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testTMul(self): + num_training_steps = 1000 + initial_lr = 1.0 + t_mul = 1.0 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.cosine_decay_restarts( + initial_lr, step, num_training_steps, t_mul=t_mul + ) + expected = self.np_cosine_decay_restarts( + step, num_training_steps, t_mul=t_mul + ) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) @test_combinations.generate(test_combinations.combine(mode=["graph", 
"eager"])) class LinearCosineDecayTest(test_combinations.TestCase): - - def np_linear_cosine_decay(self, - step, - decay_steps, - alpha=0.0, - beta=0.001, - num_periods=0.5): - step = min(step, decay_steps) - linear_decayed = float(decay_steps - step) / decay_steps - fraction = 2.0 * num_periods * step / float(decay_steps) - cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction)) - return (alpha + linear_decayed) * cosine_decayed + beta - - def testDefaultDecay(self): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.linear_cosine_decay( - initial_lr, step, num_training_steps) - expected = self.np_linear_cosine_decay(step, num_training_steps) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) - - def testNonDefaultDecay(self): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - decayed_lr = tf.compat.v1.train.linear_cosine_decay( - initial_lr, - step, - num_training_steps, - alpha=0.1, - beta=1e-4, - num_periods=5) - expected = self.np_linear_cosine_decay( - step, num_training_steps, alpha=0.1, beta=1e-4, num_periods=5) - self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + def np_linear_cosine_decay( + self, step, decay_steps, alpha=0.0, beta=0.001, num_periods=0.5 + ): + step = min(step, decay_steps) + linear_decayed = float(decay_steps - step) / decay_steps + fraction = 2.0 * num_periods * step / float(decay_steps) + cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction)) + return (alpha + linear_decayed) * cosine_decayed + beta + + def testDefaultDecay(self): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.linear_cosine_decay( + initial_lr, step, num_training_steps + ) + expected = self.np_linear_cosine_decay(step, num_training_steps) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) + + def testNonDefaultDecay(self): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + decayed_lr = tf.compat.v1.train.linear_cosine_decay( + initial_lr, + step, + num_training_steps, + alpha=0.1, + beta=1e-4, + num_periods=5, + ) + expected = self.np_linear_cosine_decay( + step, num_training_steps, alpha=0.1, beta=1e-4, num_periods=5 + ) + self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class NoisyLinearCosineDecayTest(test_combinations.TestCase): - - def testDefaultNoisyLinearCosine(self): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - # No numerical check because of noise - decayed_lr = tf.compat.v1.train.noisy_linear_cosine_decay( - initial_lr, step, num_training_steps) - # Cannot be deterministically tested - self.evaluate(decayed_lr) - - def testNonDefaultNoisyLinearCosine(self): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - # No numerical check because of noise - decayed_lr = tf.compat.v1.train.noisy_linear_cosine_decay( - initial_lr, - step, - num_training_steps, - initial_variance=0.5, - variance_decay=0.1, - alpha=0.1, - beta=1e-4, - num_periods=5) - # Cannot be deterministically tested - self.evaluate(decayed_lr) + def testDefaultNoisyLinearCosine(self): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + # No numerical check because of noise + decayed_lr = tf.compat.v1.train.noisy_linear_cosine_decay( + initial_lr, step, num_training_steps + ) + # Cannot 
be deterministically tested + self.evaluate(decayed_lr) + + def testNonDefaultNoisyLinearCosine(self): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + # No numerical check because of noise + decayed_lr = tf.compat.v1.train.noisy_linear_cosine_decay( + initial_lr, + step, + num_training_steps, + initial_variance=0.5, + variance_decay=0.1, + alpha=0.1, + beta=1e-4, + num_periods=5, + ) + # Cannot be deterministically tested + self.evaluate(decayed_lr) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/optimizers/lion.py b/keras/optimizers/lion.py new file mode 100644 index 000000000000..8c9084981018 --- /dev/null +++ b/keras/optimizers/lion.py @@ -0,0 +1,167 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Lion optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export("keras.optimizers.Lion", v1=[]) +class Lion(optimizer.Optimizer): + """Optimizer that implements the Lion algorithm. + + The Lion optimizer is a stochastic-gradient-descent method that uses the + sign operator to control the magnitude of the update, unlike other adaptive + optimizers such as Adam that rely on second-order moments. This makes + Lion more memory-efficient as it only keeps track of the momentum. According + to the authors (see reference), its performance gain over Adam grows with + the batch size. Because the update of Lion is produced through the sign + operation, resulting in a larger norm, a suitable learning rate for Lion is + typically 3-10x smaller than that for AdamW. The weight decay for Lion + should be in turn 3-10x larger than that for AdamW to maintain a + similar strength (lr * wd). + + Args: + learning_rate: A `tf.Tensor`, floating point value, a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to 0.0001. + beta_1: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + rate to combine the current gradient and the 1st moment estimate. + beta_2: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + exponential decay rate for the 1st moment estimate.
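In update-rule form, a single Lion step on dense gradients looks like the sketch below, which mirrors `lion_update_numpy` from the accompanying test file:

```python
import numpy as np


def lion_step(param, grad, momentum, lr=1e-4, beta_1=0.9, beta_2=0.99):
    # Illustrative helper, not part of the patch. The update direction is
    # only a sign, so each coordinate moves by exactly lr regardless of
    # gradient scale; the single momentum buffer is the only optimizer
    # state kept per parameter.
    param = param - lr * np.sign(beta_1 * momentum + (1.0 - beta_1) * grad)
    momentum = beta_2 * momentum + (1.0 - beta_2) * grad
    return param, momentum
```

That fixed per-coordinate step size is why the docstring recommends a 3-10x smaller learning rate (and correspondingly larger weight decay) than for AdamW.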
+ {{base_optimizer_keyword_args}} + + References: + - [Chen et al., 2023](http://arxiv.org/abs/2302.06675) + - [Authors' implementation]( + http://github.com/google/automl/tree/master/lion) + + """ + + def __init__( + self, + learning_rate=0.0001, + beta_1=0.9, + beta_2=0.99, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="Lion", + **kwargs, + ): + super().__init__( + name=name, + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + **kwargs, + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.beta_1 = beta_1 + self.beta_2 = beta_2 + if beta_1 <= 0 or beta_1 > 1: + raise ValueError( + f"`beta_1`={beta_1} must be between ]0, 1]. Otherwise, " + "the optimizer degenerates to SignSGD." + ) + + def build(self, var_list): + """Initialize optimizer variables. + + Lion optimizer has one variable `momentums`. + + Args: + var_list: list of model variables to build Lion variables on. + """ + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self.momentums = [] + for var in var_list: + self.momentums.append( + self.add_variable_from_reference( + model_variable=var, variable_name="m" + ) + ) + self._built = True + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + beta_1 = tf.cast(self.beta_1, variable.dtype) + beta_2 = tf.cast(self.beta_2, variable.dtype) + var_key = self._var_key(variable) + m = self.momentums[self._index_dict[var_key]] + + if isinstance(gradient, tf.IndexedSlices): + # Sparse gradients (use m as a buffer) + m.assign(m * beta_1) + m.scatter_add( + tf.IndexedSlices( + gradient.values * (1.0 - beta_1), gradient.indices + ) + ) + variable.assign_sub(lr * tf.math.sign(m)) + + m.assign(m * beta_2 / beta_1) + m.scatter_add( + tf.IndexedSlices( + gradient.values * (1.0 - beta_2 / beta_1), gradient.indices + ) + ) + else: + # Dense gradients + variable.assign_sub( + lr * tf.math.sign(m * beta_1 + gradient * (1.0 - beta_1)) + ) + m.assign(m * beta_2 + gradient * (1.0 - beta_2)) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "beta_1": self.beta_1, + "beta_2": self.beta_2, + } + ) + return config + + +Lion.__doc__ = Lion.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/lion_test.py b/keras/optimizers/lion_test.py new file mode 100644 index 000000000000..6cd44066fd6e --- /dev/null +++ b/keras/optimizers/lion_test.py @@ -0,0 +1,149 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Lion.""" + +import numpy as np +import tensorflow.compat.v2 as tf +from tensorflow.python.framework import dtypes + +from keras.optimizers.lion import Lion + + +def lion_update_numpy( + params, + grads, + momentums, + learning_rate=0.0001, + beta_1=0.9, + beta_2=0.99, +): + params = params - learning_rate * np.sign( + beta_1 * momentums + (1 - beta_1) * grads + ) + momentums = beta_2 * momentums + (1 - beta_2) * grads + return params, momentums + + +class LionOptimizerTest(tf.test.TestCase): + def testDense(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + learning_rate = 0.0001 + beta_1 = 0.9 + beta_2 = 0.99 + with self.cached_session(): + m0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + m1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.9, 0.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.1, 0.0], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + optimizer = Lion( + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + ) + + # Run 3 steps of Lion + for _ in range(3): + optimizer.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + var0_np, m0_np = lion_update_numpy( + var0_np, + grads0_np, + m0_np, + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + ) + var1_np, m1_np = lion_update_numpy( + var1_np, + grads1_np, + m1_np, + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + ) + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0) + self.assertAllCloseAccordingToType(var1_np, var1) + + def testSparse(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + learning_rate = 0.0001 + beta_1 = 0.9 + beta_2 = 0.99 + with self.cached_session(): + m0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + m1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.9, 0.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.1, 0.0], dtype=dtype.as_numpy_dtype) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0_np_indices = np.array([0], dtype=np.int32) + grads0 = tf.IndexedSlices( + tf.constant(grads0_np[grads0_np_indices]), + tf.constant(grads0_np_indices), + tf.constant([2]), + ) + grads1_np_indices = np.array([0], dtype=np.int32) + grads1 = tf.IndexedSlices( + tf.constant(grads1_np[grads1_np_indices]), + tf.constant(grads1_np_indices), + tf.constant([2]), + ) + + optimizer = Lion( + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + ) + + # Run 3 steps of Lion + for _ in range(3): + optimizer.apply_gradients( + zip([grads0, grads1], [var0, var1]) + ) + var0_np, m0_np = lion_update_numpy( + var0_np, + grads0_np, + m0_np, + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + ) + var1_np, m1_np = lion_update_numpy( + var1_np, + grads1_np, + m1_np, + learning_rate=learning_rate, + beta_1=beta_1, + beta_2=beta_2, + ) + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0) + 
self.assertAllCloseAccordingToType(var1_np, var1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/nadam.py b/keras/optimizers/nadam.py new file mode 100644 index 000000000000..c24de740410c --- /dev/null +++ b/keras/optimizers/nadam.py @@ -0,0 +1,207 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Nadam optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.experimental.Nadam", "keras.optimizers.Nadam", v1=[] +) +class Nadam(optimizer.Optimizer): + r"""Optimizer that implements the Nadam algorithm. + + Much like Adam is essentially RMSprop with momentum, Nadam is Adam with + Nesterov momentum. + + Args: + learning_rate: A `tf.Tensor`, floating point value, a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to `0.001`. + beta_1: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + exponential decay rate for the 1st moment estimates. + Defaults to `0.9`. + beta_2: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + exponential decay rate for the 2nd moment estimates. Defaults to + `0.999`. + epsilon: A small constant for numerical stability. This epsilon is + "epsilon hat" in the Kingma and Ba paper (in the formula just before + Section 2.1), not the epsilon in Algorithm 1 of the paper. + Defaults to `1e-7`. + {{base_optimizer_keyword_args}} + + Reference: + - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf). + + """ + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="Nadam", + **kwargs + ): + super().__init__( + name=name, + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + **kwargs + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + + def build(self, var_list): + """Initialize optimizer variables. + + Nadam optimizer has 2 types of variables: momentums and velocities. + + Args: + var_list: list of model variables to build Nadam variables on. 
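+ + A scalar `_u_product` variable is also created, caching the running product of the momentum-cache coefficients `u_t` used in `update_step`.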
+ """ + super().build(var_list) + if getattr(self, "_built", False): + return + self._built = True + self._momentums = [] + self._velocities = [] + self._u_product = tf.Variable(1.0, dtype=var_list[0].dtype) + # Keep a counter on how many times of _u_product has been computed to + # avoid duplicated computations. + self._u_product_counter = 1 + + for var in var_list: + self._momentums.append( + self.add_variable_from_reference( + model_variable=var, variable_name="m" + ) + ) + self._velocities.append( + self.add_variable_from_reference( + model_variable=var, variable_name="v" + ) + ) + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + var_dtype = variable.dtype + lr = tf.cast(self.learning_rate, var_dtype) + local_step = tf.cast(self.iterations + 1, var_dtype) + next_step = tf.cast(self.iterations + 2, var_dtype) + decay = tf.cast(0.96, var_dtype) + beta_1 = tf.cast(self.beta_1, var_dtype) + beta_2 = tf.cast(self.beta_2, var_dtype) + u_t = beta_1 * (1.0 - 0.5 * (tf.pow(decay, local_step))) + u_t_1 = beta_1 * (1.0 - 0.5 * (tf.pow(decay, next_step))) + + def get_cached_u_product(): + return self._u_product + + def compute_new_u_product(): + u_product_t = self._u_product * u_t + self._u_product.assign(u_product_t) + self._u_product_counter += 1 + return u_product_t + + u_product_t = tf.cond( + self._u_product_counter == (self.iterations + 2), + true_fn=get_cached_u_product, + false_fn=compute_new_u_product, + ) + u_product_t_1 = u_product_t * u_t_1 + beta_2_power = tf.pow(beta_2, local_step) + + var_key = self._var_key(variable) + m = self._momentums[self._index_dict[var_key]] + v = self._velocities[self._index_dict[var_key]] + + if isinstance(gradient, tf.IndexedSlices): + # Sparse gradients. + m.assign_add(-m * (1 - beta_1)) + m.scatter_add( + tf.IndexedSlices( + gradient.values * (1 - beta_1), gradient.indices + ) + ) + v.assign_add(-v * (1 - beta_2)) + v.scatter_add( + tf.IndexedSlices( + tf.square(gradient.values) * (1 - beta_2), gradient.indices + ) + ) + m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / ( + 1 - u_product_t + ) + v_hat = v / (1 - beta_2_power) + + variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon)) + else: + # Dense gradients. + m.assign_add((gradient - m) * (1 - beta_1)) + v.assign_add((tf.square(gradient) - v) * (1 - beta_2)) + m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / ( + 1 - u_product_t + ) + v_hat = v / (1 - beta_2_power) + + variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon)) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "beta_1": self.beta_1, + "beta_2": self.beta_2, + "epsilon": self.epsilon, + } + ) + return config + + +Nadam.__doc__ = Nadam.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/optimizer.py b/keras/optimizers/optimizer.py new file mode 100644 index 000000000000..59f343182ad7 --- /dev/null +++ b/keras/optimizers/optimizer.py @@ -0,0 +1,1403 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Base class of optimizer.""" + +import abc +import platform +import re + +import tensorflow.compat.v2 as tf +from absl import logging + +from keras import backend +from keras import initializers +from keras.dtensor import utils as dtensor_utils +from keras.optimizers import utils as optimizer_utils +from keras.optimizers.schedules import learning_rate_schedule +from keras.utils import tf_utils + +# isort: off +from tensorflow.python.util.tf_export import keras_export +from tensorflow.tools.docs import doc_controls + + +class _BaseOptimizer(tf.__internal__.tracking.AutoTrackable): + """Optimizer base class, which only supports non-distribute use case.""" + + def __init__( + self, + name, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + **kwargs, + ): + self.name = name + self.weight_decay = weight_decay + self.clipnorm = clipnorm + self.global_clipnorm = global_clipnorm + self.clipvalue = clipvalue + self.use_ema = use_ema + # Optimizer only benefits from XLA when training on GPU. So if no + # GPU is found, we turn off XLA. + if ( + jit_compile + and tf_utils.can_jit_compile() + and tf.config.list_physical_devices("GPU") + ): + self.jit_compile = True + else: + self.jit_compile = False + + if platform.system() == "Darwin" and platform.processor() == "arm": + logging.warning( + "At this time, the v2.11+ optimizer " + f"`tf.keras.optimizers.{self.__class__.__name__}` runs slowly " + "on M1/M2 Macs, please use the legacy Keras optimizer " + "instead, located at " + f"`tf.keras.optimizers.legacy.{self.__class__.__name__}`." + ) + + if use_ema: + # Verify the arguments related to EMA. + if ema_momentum > 1 or ema_momentum < 0: + raise ValueError( + "`ema_momentum` must be in the range [0, 1]. " + f"Received: ema_momentum={ema_momentum}" + ) + if ema_overwrite_frequency and ( + not isinstance(ema_overwrite_frequency, int) + or ema_overwrite_frequency < 1 + ): + raise ValueError( + "`ema_overwrite_frequency` must be an integer > 1 or None. " + "Received: ema_overwrite_frequency=" + f"{ema_overwrite_frequency}" + ) + self.ema_momentum = ema_momentum + self.ema_overwrite_frequency = ema_overwrite_frequency + + if self.clipnorm is not None and self.global_clipnorm is not None: + raise ValueError( + "At most one of `clipnorm` and `global_clipnorm` can " + f"be set. Received: clipnorm={self.clipnorm}, " + f"global_clipnorm={self.global_clipnorm}." + ) + + self._variables = [] + self._create_iteration_variable() + self._process_kwargs(kwargs) + + def _create_iteration_variable(self): + """Create the iterations counter variable.""" + with tf.init_scope(): + # Lift the variable creation to init scope to avoid environment + # issue. + self._iterations = tf.Variable( + 0, name="iteration", dtype=tf.int64, trainable=False + ) + self._variables.append(self._iterations) + + def _process_kwargs(self, kwargs): + # Remove the `is_legacy_optimizer` arg, which is for serialization only. 
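+ # (`get_config()` writes this key, so configs passed back through + # `from_config()` would otherwise trip the unknown-argument check below.)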
+ kwargs.pop("is_legacy_optimizer", None) + lr = kwargs.pop("lr", None) + if lr: + logging.warning( + "`lr` is deprecated in Keras optimizer, please use " + "`learning_rate` or use the legacy optimizer, e.g.," + f"tf.keras.optimizers.legacy.{self.__class__.__name__}." + ) + legacy_kwargs = { + "decay", + "gradient_aggregator", + "gradient_transformers", + } + for k in kwargs: + if k in legacy_kwargs: + raise ValueError( + f"{k} is deprecated in the new Keras optimizer, please " + "check the docstring for valid arguments, or use the " + "legacy optimizer, e.g., " + f"tf.keras.optimizers.legacy.{self.__class__.__name__}." + ) + else: + raise TypeError( + f"{k} is not a valid argument, kwargs should be empty " + " for `optimizer_experimental.Optimizer`." + ) + + def _create_or_restore_slot_variable(self, **kwargs): + raise ValueError( + "You are trying to restore a checkpoint from a legacy Keras " + "optimizer into a v2.11+ Optimizer, which can cause " + "errors. Please update the optimizer referenced in your code " + "to be an instance of " + "`tf.keras.optimizers.legacy.Optimizer`, e.g.: " + f"`tf.keras.optimizers.legacy.{self.__class__.__name__}`." + ) + + def _var_key(self, variable): + """Get a unique identifier of the given variable.""" + # Get the distributed variable if it exists. + # TODO(b/199214315): replace _unique_id with ref() after fixing ref() + # issues on AggregatingVariable. + return variable._unique_id + + def _deduplicate_sparse_grad(self, grads): + """Deduplicate sparse gradient. + + For sparse gradients, i.e., gradient is of type `tf.IndexedSlices`, + it is possible that `gradient.indices` has duplicated indices. + This function adds up values for the duplicated indices, and returns + a `tf.IndexedSlices` with indices of unique values. + """ + processed_grads = [] + for grad in grads: + if isinstance(grad, tf.IndexedSlices): + values = grad.values + indices = grad.indices + unique_indices, new_index_positions = tf.unique(indices) + summed_values = tf.math.unsorted_segment_sum( + values, new_index_positions, tf.shape(unique_indices)[0] + ) + processed_grads.append( + tf.IndexedSlices( + summed_values, unique_indices, grad.dense_shape + ) + ) + else: + processed_grads.append(grad) + + return processed_grads + + @abc.abstractmethod + def update_step(self, gradient, variable): + """Function to update variable value based on given gradients. + + This method must be implemented in customized optimizers. + + Args: + gradient: backpropagated gradient of the given variable. + variable: variable whose value needs to be updated. + + Returns: + An `Operation` that applies the specified gradients. + + """ + raise NotImplementedError + + @tf.function(jit_compile=True) + def _update_step_xla(self, gradient, variable, key): + """A wrapper of `update_step` to enable XLA acceleration. + + Due to `tf.function` tracing mechanism, for (gradient, variable) pairs + of the same shape and dtype, the execution graph always invoke the first + pair it has seen. Thus, we need a `key` argument to make each (gradient, + variable) pair unique. In additions, XLA cannot understand string input, + so the key is an integer. + + Args: + gradient: backpropagated gradient of the given variable. + variable: variable whose value needs to be updated. + key (int): a unique key that identifies the variable. + + Returns: + An `Operation` that applies the specified gradients. 
+ """ + return self._update_step(gradient, variable) + + def _update_step(self, gradient, variable): + if getattr(variable, "_unique_id", None) is None: + # Variable has no `_unique_id` if called during `model.save()`, in + # which case we do not want to update the variable. + return + if self._var_key(variable) not in self._index_dict: + raise KeyError( + f"The optimizer cannot recognize variable {variable.name}. " + "This usually means you are trying to call the optimizer to " + "update different parts of the model separately. Please call " + "`optimizer.build(variables)` with the full list of trainable " + "variables before the training loop or use legacy optimizer " + f"`tf.keras.optimizers.legacy.{self.__class__.__name__}." + ) + self.update_step(gradient, variable) + + def compute_gradients(self, loss, var_list, tape=None): + """Compute gradients of loss on trainable variables. + + Args: + loss: `Tensor` or callable. If a callable, `loss` should take no + arguments and return the value to minimize. + var_list: list or tuple of `Variable` objects to update to minimize + `loss`, or a callable returning the list or tuple of `Variable` + objects. Use callable when the variable list would otherwise be + incomplete before `minimize` since the variables are created at the + first time `loss` is called. + tape: (Optional) `tf.GradientTape`. If `loss` is provided as a + `Tensor`, the tape that computed the `loss` must be provided. + + Returns: + A list of (gradient, variable) pairs. Variable is always present, but + gradient can be `None`. + """ + if not callable(loss) and tape is None: + raise ValueError( + "`tape` is required when a `Tensor` loss is passed. " + f"Received: loss={loss}, tape={tape}." + ) + if tape is None: + tape = tf.GradientTape() + if callable(loss): + with tape: + if not callable(var_list): + tape.watch(var_list) + loss = loss() + if callable(var_list): + var_list = var_list() + + grads = tape.gradient(loss, var_list) + return list(zip(grads, var_list)) + + def _clip_gradients(self, grads): + clipped_grads = [] + if self.clipnorm and self.clipnorm > 0: + for g in grads: + if g is None: + clipped_grads.append(g) + else: + clipped_grads.append(tf.clip_by_norm(g, self.clipnorm)) + return clipped_grads + + if self.global_clipnorm and self.global_clipnorm > 0: + return tf.clip_by_global_norm(grads, self.global_clipnorm)[0] + + if self.clipvalue and self.clipvalue > 0: + for g in grads: + if g is None: + clipped_grads.append(g) + else: + clipped_grads.append( + tf.clip_by_value( + g, + clip_value_min=-self.clipvalue, + clip_value_max=self.clipvalue, + ) + ) + return clipped_grads + + return grads + + @property + def iterations(self): + """The number of training steps this `optimizer` has run. + + By default, iterations would be incremented by one every time + `apply_gradients()` is called. + """ + return self._iterations + + @iterations.setter + def iterations(self, variable): + if getattr(self, "_built", False): + raise RuntimeError( + "Cannot set `iterations` to a new Variable after " + "the Optimizer weights have been created. Here it is " + f"attempting to set `iterations` to {variable}." + "Usually this means you are trying to set `iterations`" + " after calling `apply_gradients()`. Please set " + "`iterations` before calling `apply_gradients()`." 
+ ) + self._iterations = variable + + @property + def learning_rate(self): + if not hasattr(self, "_learning_rate") or self._learning_rate is None: + raise ValueError( + "Missing learning rate, please set self.learning_rate at" + " optimizer creation time." + ) + lr = self._learning_rate + if isinstance(lr, learning_rate_schedule.LearningRateSchedule): + # If the optimizer takes in LearningRateSchedule, then each call to + # learning_rate would return `self._current_learning_rate`, which is + # updated at each call to `apply_gradients`. + return self._current_learning_rate + return lr + + @learning_rate.setter + def learning_rate(self, learning_rate): + if isinstance( + learning_rate, learning_rate_schedule.LearningRateSchedule + ): + self._learning_rate = learning_rate + else: + if isinstance( + self._learning_rate, learning_rate_schedule.LearningRateSchedule + ): + raise TypeError( + "This optimizer was created with a `LearningRateSchedule`" + " object as its `learning_rate` constructor argument, " + "hence its learning rate is not settable. If you need the" + " learning rate to be settable, you should instantiate " + "the optimizer with a float `learning_rate` argument." + ) + self._learning_rate.assign(learning_rate) + + @property + @doc_controls.do_not_generate_docs + def lr(self): + """Alias of `learning_rate()`. + + `lr()` is heavily called in workflows using `optimizer_v2.OptimizerV2`, + so we keep it for backward compatibility. + """ + return self.learning_rate + + @lr.setter + def lr(self, learning_rate): + self.learning_rate = learning_rate + + def _build_learning_rate(self, learning_rate): + with tf.init_scope(): + if isinstance( + learning_rate, learning_rate_schedule.LearningRateSchedule + ): + # Create a variable to hold the current learning rate. + current_learning_rate = tf.convert_to_tensor( + learning_rate(self.iterations) + ) + self._current_learning_rate = tf.Variable( + current_learning_rate, + name="current_learning_rate", + dtype=current_learning_rate.dtype, + trainable=False, + ) + return learning_rate + + return tf.Variable( + learning_rate, + name="learning_rate", + dtype=backend.floatx(), + trainable=False, + ) + + @abc.abstractmethod + def build(self, var_list): + """Initialize the optimizer's variables, such as momentum variables. + + This function has to be implemented by subclass optimizers, and subclass + optimizers need to call `super().build(var_list)`. + + Args: + var_list: List of model variables to build optimizers on. For example, + SGD optimizer with momentum will store one momentum variable + corresponding to each model variable. + """ + if getattr(self, "_built", False): + return + self._build_index_dict(var_list) + if self.use_ema: + self._model_variables_moving_average = [] + for var in var_list: + # Make a copy of the model variables, we will use the copy to + # store the moving average of model variables. + self._model_variables_moving_average.append( + self.add_variable_from_reference( + var, "average", initial_value=var + ) + ) + + def _build_index_dict(self, var_list): + """Build variable to index dictionary. + + Build a dictionary that maps each variable to its index in the given + var_list. + + Args: + var_list: List of variables to build index dict on. + + Returns: + None + """ + self._index_dict = {} + for i, var in enumerate(var_list): + var_key = self._var_key(var) + self._index_dict[var_key] = i + + def add_variable(self, shape, dtype=None, initializer="zeros", name=None): + """Create an optimizer variable.
+ + Args: + shape: A list of integers, a tuple of integers, or a 1-D Tensor of + type int32. Defaults to scalar if unspecified. + dtype: The DType of the optimizer variable to be created. Defaults to + `tf.keras.backend.floatx` if unspecified. + initializer: string or callable. Initializer instance. + name: The name of the optimizer variable to be created. + + Returns: + An optimizer variable, in the format of tf.Variable. + + """ + if isinstance(initializer, str): + initializer = initializers.get(initializer) + if dtype is None: + dtype = backend.floatx() + if shape is None: + shape = [] + variable = tf.Variable( + initial_value=initializer(shape, dtype), name=name, trainable=False + ) + self._variables.append(variable) + return variable + + def add_variable_from_reference( + self, model_variable, variable_name, shape=None, initial_value=None + ): + """Create an optimizer variable from a model variable. + + Create an optimizer variable based on the information of the model + variable. For example, in SGD with momentum, for each model variable, a + corresponding momentum variable is created of the same shape and dtype. + + Args: + model_variable: tf.Variable. The corresponding model variable to the + optimizer variable to be created. + variable_name: String. The name prefix of the optimizer variable to be + created. The created variable's name will follow the pattern + `{variable_name}/{model_variable.name}`, e.g., `momentum/dense_1`. + shape: List or Tuple, defaults to None. The shape of the optimizer + variable to be created. If None, the created variable will have the + same shape as `model_variable`. + initial_value: A Tensor, or Python object convertible to a Tensor, + defaults to None. The initial value of the optimizer variable, if + None, the initial value will default to 0. + + Returns: + An optimizer variable. + """ + if initial_value is None: + if shape is None: + if model_variable.shape.rank is None: + # When the rank is None, we cannot get a concrete + # `model_variable.shape`, so we use dynamic shape. + initial_value = tf.zeros_like( + model_variable, dtype=model_variable.dtype + ) + else: + # We cannot always use `zeros_like`, because in some cases + # the shape exists while the values don't. + initial_value = tf.zeros( + model_variable.shape, dtype=model_variable.dtype + ) + else: + initial_value = tf.zeros(shape, dtype=model_variable.dtype) + variable = tf.Variable( + initial_value=initial_value, + name=f"{variable_name}/{model_variable._shared_name}", + dtype=model_variable.dtype, + trainable=False, + ) + self._variables.append(variable) + return variable + + def minimize(self, loss, var_list, tape=None): + """Minimize `loss` by updating `var_list`. + + This method simply computes the gradients using `tf.GradientTape` and + calls `apply_gradients()`. If you want to process the gradients before + applying them, then call `tf.GradientTape` and `apply_gradients()` + explicitly instead of using this function. + + Args: + loss: `Tensor` or callable. If a callable, `loss` should take no + arguments and return the value to minimize. + var_list: list or tuple of `Variable` objects to update to minimize + `loss`, or a callable returning the list or tuple of `Variable` + objects. Use callable when the variable list would otherwise be + incomplete before `minimize` since the variables are created the + first time `loss` is called. + tape: (Optional) `tf.GradientTape`.
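+ If `loss` is provided as a `Tensor`, the tape that computed it must be provided.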
+ + Returns: + None + """ + grads_and_vars = self.compute_gradients(loss, var_list, tape) + self.apply_gradients(grads_and_vars) + + def _compute_current_learning_rate(self): + if isinstance( + self._learning_rate, learning_rate_schedule.LearningRateSchedule + ): + # Compute the current learning rate at the beginning of variable + # update. + if hasattr(self, "_current_learning_rate"): + self._current_learning_rate.assign( + self._learning_rate(self.iterations) + ) + else: + current_learning_rate = tf.convert_to_tensor( + self._learning_rate(self.iterations) + ) + self._current_learning_rate = tf.Variable( + current_learning_rate, + name="current_learning_rate", + dtype=current_learning_rate.dtype, + trainable=False, + ) + + def exclude_from_weight_decay(self, var_list=None, var_names=None): + """Exclude variables from weight decay. + + This method must be called before the optimizer's `build` method is + called. You can either pass specific variables to exclude, or pass a + list of strings as anchor words: if any of them appears in a variable's + name, the variable is excluded. + + Args: + var_list: A list of `tf.Variable`s to exclude from weight decay. + var_names: A list of strings. If any string in `var_names` appears + in the model variable's name, then this model variable is + excluded from weight decay. For example, `var_names=['bias']` + excludes all bias variables from weight decay. + """ + if hasattr(self, "_built") and self._built: + raise ValueError( + "`exclude_from_weight_decay()` can only be configured before " + "the optimizer is built." + ) + + if var_list: + self._exclude_from_weight_decay = [ + self._var_key(variable) for variable in var_list + ] + else: + self._exclude_from_weight_decay = [] + self._exclude_from_weight_decay_names = var_names or [] + + def _use_weight_decay(self, variable): + exclude_from_weight_decay = getattr( + self, "_exclude_from_weight_decay", [] + ) + exclude_from_weight_decay_names = getattr( + self, "_exclude_from_weight_decay_names", [] + ) + variable_id = self._var_key(variable) + for exclude_id in exclude_from_weight_decay: + if variable_id == exclude_id: + return False + for name in exclude_from_weight_decay_names: + if re.search(name, variable.name) is not None: + return False + return True + + def apply_gradients(self, grads_and_vars, name=None): + """Apply gradients to variables. + + Args: + grads_and_vars: List of `(gradient, variable)` pairs. + name: string, defaults to None. The name of the name scope to + use when creating variables. If None, `self.name` will be used. + + Returns: + A `tf.Variable`, representing the current iteration. + + Raises: + TypeError: If `grads_and_vars` is malformed. + """ + self._compute_current_learning_rate() + grads_and_vars = list(grads_and_vars) + if len(grads_and_vars) == 0: + # It is possible that the list of grads is empty. In this case, + # `apply_gradients` is a no-op. + return self._iterations + grads, trainable_variables = zip(*grads_and_vars) + scope_name = name or self.name or "optimizer" + with tf.name_scope(scope_name): + with tf.init_scope(): + # Lift variable creation to init scope to avoid environment + # issues. + self.build(trainable_variables) + grads_and_vars = optimizer_utils.filter_empty_gradients( + grads_and_vars + ) + if len(list(grads_and_vars)) == 0: + # Check again after filtering gradients.
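+ # All remaining gradients were `None`; the update is a no-op and the + # iteration counter is returned unchanged.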
+ return self._iterations + + grads, trainable_variables = zip(*grads_and_vars) + + grads = self._clip_gradients(grads) + grads = self._deduplicate_sparse_grad(grads) + self._apply_weight_decay(trainable_variables) + grads_and_vars = list(zip(grads, trainable_variables)) + iteration = self._internal_apply_gradients(grads_and_vars) + + # Apply variable constraints after applying gradients. + for variable in trainable_variables: + if variable.constraint is not None: + variable.assign(variable.constraint(variable)) + return iteration + + def _apply_weight_decay(self, variables): + if self.weight_decay is None: + return + for variable in variables: + if self._use_weight_decay(variable): + lr = tf.cast(self.learning_rate, variable.dtype) + wd = tf.cast(self.weight_decay, variable.dtype) + variable.assign_sub(variable * wd * lr) + + def _internal_apply_gradients(self, grads_and_vars): + """Helper function of apply gradients. + + This is required for separating out distributed training logic. + + Args: + grads_and_vars: List of (gradient, variable) pairs. + """ + if self.jit_compile: + for grad, var in grads_and_vars: + self._update_step_xla(grad, var, id(self._var_key(var))) + else: + for grad, var in grads_and_vars: + self._update_step(grad, var) + return self.iterations.assign_add(1) + + def _update_model_variables_moving_average(self, var_list): + """Update the stored moving average using the latest value.""" + if self.use_ema: + for var in var_list: + average = self._model_variables_moving_average[ + self._index_dict[self._var_key(var)] + ] + average.assign( + self.ema_momentum * average + (1 - self.ema_momentum) * var + ) + + def _overwrite_model_variables_with_average_value(self, var_list): + """Overwrite model variables with its moving average.""" + for var in var_list: + average = self._model_variables_moving_average[ + self._index_dict[self._var_key(var)] + ] + var.assign(average) + + def finalize_variable_values(self, var_list): + """Set the final value of model's trainable variables. + + Sometimes there are some extra steps before ending the variable updates, + such as overriding the model variables with its average value. + + Args: + var_list: list of model variables. + """ + if self.use_ema: + # If the optimizer uses EMA, then when finalizing, we replace the + # model variable value with its moving average stored inside + # optimizer. + self._overwrite_model_variables_with_average_value(var_list) + + def _serialize_hyperparameter(self, hyperparameter): + """Serialize a hyperparameter that can be a numeric or callable.""" + if isinstance( + hyperparameter, learning_rate_schedule.LearningRateSchedule + ): + return learning_rate_schedule.serialize(hyperparameter) + if isinstance(hyperparameter, tf.Variable): + return hyperparameter.numpy() + if callable(hyperparameter): + return hyperparameter() + return hyperparameter + + def get_config(self): + """Returns the config of the optimizer. + + An optimizer config is a Python dictionary (serializable) + containing the configuration of an optimizer. + The same optimizer can be reinstantiated later + (without any saved state) from this configuration. + + Subclass optimizer should override this method to include other + hyperparameters. + + Returns: + Python dictionary. 
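+ + The returned config also stores `is_legacy_optimizer=False`, which marks this as a new-style (non-legacy) optimizer when the config is reloaded.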
+ """ + config = { + "name": self.name, + "weight_decay": self.weight_decay, + "clipnorm": self.clipnorm, + "global_clipnorm": self.global_clipnorm, + "clipvalue": self.clipvalue, + "use_ema": self.use_ema, + "ema_momentum": self.ema_momentum, + "ema_overwrite_frequency": self.ema_overwrite_frequency, + "jit_compile": self.jit_compile, + "is_legacy_optimizer": False, + } + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + """Creates an optimizer from its config. + + This method is the reverse of `get_config`, capable of instantiating the + same optimizer from the config dictionary. + + Args: + config: A Python dictionary, typically the output of get_config. + custom_objects: A Python dictionary mapping names to additional + user-defined Python objects needed to recreate this optimizer. + + Returns: + An optimizer instance. + """ + if "learning_rate" in config: + if isinstance(config["learning_rate"], dict): + config["learning_rate"] = learning_rate_schedule.deserialize( + config["learning_rate"], custom_objects=custom_objects + ) + return cls(**config) + + @property + def variables(self): + """Returns variables of this optimizer.""" + return CallableList(self._variables) + + def set_weights(self, weights): + """Set the weights of the optimizer. + + Args: + weights: a list of `tf.Variable`s or numpy arrays, the target values + of optimizer variables. It should have the same order as + `self._variables`. + """ + if not getattr(self, "_built", False): + raise ValueError( + "You are calling `set_weights()` on an optimizer that has not " + "yet been built. Please call " + "`optimizer.build(trainable_variables)` to create the " + "optimizer weights before calling `set_weights()`." + ) + + for variable, weight in zip(self._variables, weights): + if variable.shape != weight.shape: + raise ValueError( + f"Optimizer variable {self._var_key(variable)} has shape " + f"{str(variable.shape)} not compatible with provided " + f"weight shape {str(weight.shape)}." + ) + variable.assign(weight) + + def save_own_variables(self, store): + """Get the state of this optimizer object.""" + for i, variable in enumerate(self.variables): + store[str(i)] = variable.numpy() + + def load_own_variables(self, store): + """Set the state of this optimizer object.""" + if len(store.keys()) != len(self.variables): + msg = ( + f"Skipping variable loading for optimizer '{self.name}', " + f"because it has {len(self.variables)} variables whereas " + f"the saved optimizer has {len(store.keys())} variables. " + ) + if len(self.variables) == 0: + msg += ( + "This is likely because the optimizer has not been " + "called/built yet." + ) + logging.warning(msg) + return + for i, variable in enumerate(self.variables): + variable.assign(store[str(i)]) + + +base_optimizer_keyword_args = """name: String. The name to use + for momentum accumulator weights created by + the optimizer. + weight_decay: Float, defaults to None. If set, weight decay is applied. + clipnorm: Float. If set, the gradient of each weight is individually + clipped so that its norm is no higher than this value. + clipvalue: Float. If set, the gradient of each weight is clipped to be no + higher than this value. + global_clipnorm: Float. If set, the gradient of all weights is clipped so + that their global norm is no higher than this value. + use_ema: Boolean, defaults to False. If True, exponential moving average + (EMA) is applied. 
EMA consists of computing an exponential moving + average of the weights of the model (as the weight values change after + each training batch), and periodically overwriting the weights with + their moving average. + ema_momentum: Float, defaults to 0.99. Only used if `use_ema=True`. + This is the momentum to use when computing + the EMA of the model's weights: + `new_average = ema_momentum * old_average + (1 - ema_momentum) * + current_variable_value`. + ema_overwrite_frequency: Int or None, defaults to None. Only used if + `use_ema=True`. Every `ema_overwrite_frequency` steps of iterations, + we overwrite the model variable by its moving average. + If None, the optimizer + does not overwrite model variables in the middle of training, and you + need to explicitly overwrite the variables at the end of training + by calling `optimizer.finalize_variable_values()` + (which updates the model + variables in-place). When using the built-in `fit()` training loop, + this happens automatically after the last epoch, + and you don't need to do anything. + jit_compile: Boolean, defaults to True. + If True, the optimizer will use XLA + compilation. If no GPU device is found, this flag will be ignored. + mesh: optional `tf.experimental.dtensor.Mesh` instance. When provided, + the optimizer will be run in DTensor mode, e.g. state + tracking variable will be a DVariable, and aggregation/reduction will + happen in the global DTensor context. + **kwargs: keyword arguments only used for backward compatibility.""" + + +@keras_export( + "keras.optimizers.Optimizer", + "keras.optimizers.experimental.Optimizer", + v1=[], +) +class Optimizer(_BaseOptimizer): + """Abstract optimizer base class. + + This class supports distributed training. If you want to implement your own + optimizer, please subclass this class instead of _BaseOptimizer. + + Args: + {{base_optimizer_keyword_args}} + + ### Usage + + ```python + # Create an optimizer with the desired parameters. + opt = keras.optimizers.SGD(learning_rate=0.1) + var1, var2 = tf.Variable(1.0), tf.Variable(2.0) + # `loss` is a callable that takes no argument and returns the value + # to minimize. + loss = lambda: 3 * var1 * var1 + 2 * var2 * var2 + # Call minimize to update the list of variables. + opt.minimize(loss, var_list=[var1, var2]) + ``` + + ### Processing gradients before applying them + + Calling `minimize()` takes care of both computing the gradients and + applying them to the variables. If you want to process the gradients + before applying them you can instead use the optimizer in three steps: + + 1. Compute the gradients with `tf.GradientTape`. + 2. Process the gradients as you wish. + 3. Apply the processed gradients with `apply_gradients()`. + + Example: + + ```python + # Create an optimizer. + opt = tf.keras.optimizers.experimental.SGD(learning_rate=0.1) + var1, var2 = tf.Variable(1.0), tf.Variable(2.0) + + # Compute the gradients for a list of variables. + with tf.GradientTape() as tape: + loss = 3 * var1 * var1 + 2 * var2 * var2 + grads = tape.gradient(loss, [var1, var2]) + + # Process the gradients. + grads[0] = grads[0] + 1 + + # Ask the optimizer to apply the gradients on variables. + opt.apply_gradients(zip(grads, [var1, var2])) + ``` + + ### Dynamic learning rate + + Dynamic learning rate can be achieved by setting learning rate as a built-in + or customized `tf.keras.optimizers.schedules.LearningRateSchedule`. 
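+ The schedule is evaluated at the optimizer's current `iterations` count on each `apply_gradients` call, so the learning rate automatically follows training progress.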
+ + Example: + + >>> var = tf.Variable(np.random.random(size=(1,))) + >>> learning_rate = tf.keras.optimizers.schedules.ExponentialDecay( + ... initial_learning_rate=.01, decay_steps=20, decay_rate=.1) + >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=learning_rate) + >>> loss = lambda: 3 * var + >>> opt.minimize(loss, var_list=[var]) + + ### Gradient clipping + + Users can clip the gradients before applying them to variables by setting + `clipnorm`, `clipvalue` and `global_clipnorm`. Note that at most one of + `clipnorm` and `global_clipnorm` can be set. + + Example: + + >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=1, clipvalue=1) + >>> var1, var2 = tf.Variable(2.0), tf.Variable(2.0) + >>> with tf.GradientTape() as tape: + ... loss = 2 * var1 + 2 * var2 + >>> grads = tape.gradient(loss, [var1, var2]) + >>> print([grads[0].numpy(), grads[1].numpy()]) + [2.0, 2.0] + >>> opt.apply_gradients(zip(grads, [var1, var2])) + >>> # Without clipping, we should get [0, 0], but as gradients are clipped + >>> # to have max value 1, we get [1.0, 1.0]. + >>> print([var1.numpy(), var2.numpy()]) + [1.0, 1.0] + + ### Using weight decay + + Weight decay in certain scenarios can boost the model's performance. Keras + has built-in support for weight decay in all optimizers. Users can apply + weight decay by setting the `weight_decay` argument. + + >>> opt = tf.keras.optimizers.experimental.SGD(1, weight_decay=0.004) + >>> grads, var1, var2 = tf.zeros(()), tf.Variable(2.0), tf.Variable(2.0) + >>> # You can exclude variables from weight decay, in this case we + >>> # exclude `var2`. + >>> opt.exclude_from_weight_decay(var_list=[var2]) + >>> opt.apply_gradients(zip([grads, grads], [var1, var2])) + >>> print([var1.numpy(), var2.numpy()]) + [1.992, 2.0] + + + ### Using exponential moving average + + Empirically it has been found that using the exponential moving average + (EMA) of the trained parameters of a deep network achieves better + performance than using the trained parameters directly. Keras optimizers + allow users to compute this moving average and overwrite the model + variables at the desired time. + + Example: + + ```python + # Create an SGD optimizer with EMA on. `ema_momentum` controls the decay + # rate of the moving average. `ema_momentum=1` means no decay and the stored + # moving average is always the model variable's initial value before + # training. Conversely, `ema_momentum=0` is equivalent to not using EMA. + # `ema_overwrite_frequency=3` means every 3 iterations, we overwrite the + # trainable variables with their moving average values. + opt = tf.keras.optimizers.experimental.SGD( + learning_rate=1, + use_ema=True, + ema_momentum=0.5, + ema_overwrite_frequency=3) + var1, var2 = tf.Variable(2.0), tf.Variable(2.0) + with tf.GradientTape() as tape: + loss = var1 + var2 + grads = tape.gradient(loss, [var1, var2]) + # First iteration: [var1, var2] = [1.0, 1.0] + opt.apply_gradients(zip(grads, [var1, var2])) + print([var1, var2]) + + # Second iteration: [var1, var2] = [0.0, 0.0] + opt.apply_gradients(zip(grads, [var1, var2])) + print([var1, var2]) + + # Third iteration, without EMA, we should see [var1, var2] = [-1.0, -1.0], + # but overwriting results in [var1, var2] = [-0.125, -0.125]. The full + # calculation for the moving average of var1 is: + # var1=2*0.5**3+1*(1-0.5)*0.5**2+0*(1-0.5)*0.5**1+(-1)*(1-0.5)=-0.125.
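+ # i.e., the EMA of the successive var1 values (initial 2, then 1, 0, -1) + # with momentum 0.5.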
+ opt.apply_gradients(zip(grads, [var1, var2])) + print([var1, var2]) + + ``` + When the optimizer is constructed with `use_ema=True`, in a custom training + loop users can explicitly call `finalize_variable_values()` to overwrite + trainable variables with their EMA values. `finalize_variable_values()` is + by default called at the end of `model.fit()`. + + ### Use with `tf.distribute.Strategy` + + This optimizer class is `tf.distribute.Strategy` aware, which means it + automatically sums gradients across all replicas. To aggregate gradients + yourself, call `apply_gradients` with `skip_gradients_aggregation` set to + True. This is useful if you need to process aggregated gradients. + + ```python + # This example is not runnable, it consists of dummy code for a simple + # tutorial. + strategy = tf.distribute.experimental.TPUStrategy() + + with strategy.scope(): + opt = tf.keras.optimizers.experimental.SGD() + model = magic_function_that_returns_model() + gradients = magic_function_that_returns_gradients() + # Custom logic to aggregate gradients. + gradients = strategy.reduce("SUM", gradients, axis=None) + opt.apply_gradients(zip(gradients, model.trainable_variables), + skip_gradients_aggregation=True) + ``` + + ### Creating a custom optimizer + + If you intend to create your own optimization algorithm, please inherit from + this class and override the following methods: + + - `build`: Create your optimizer-related variables, such as `momentums` in + the SGD optimizer. + - `update_step`: Implement your optimizer's updating logic. + - `get_config`: serialization of the optimizer; include all hyperparameters. + + Your optimizer would automatically be compatible with TensorFlow distributed + training if you subclass `optimizer_experimental.Optimizer`. + + """ + + def __init__( + self, + name, + weight_decay=0, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + **kwargs, + ): + """Create a new Optimizer.""" + mesh = kwargs.pop("mesh", None) + self._mesh = mesh + super().__init__( + name, + weight_decay, + clipnorm, + clipvalue, + global_clipnorm, + use_ema, + ema_momentum, + ema_overwrite_frequency, + jit_compile, + **kwargs, + ) + self._distribution_strategy = tf.distribute.get_strategy() + self._run_with_dtensor = dtensor_utils.running_with_dtensor_strategy() + + def add_variable_from_reference( + self, model_variable, variable_name, shape=None, initial_value=None + ): + if self._mesh: + if initial_value is None: + # Use tf.zeros_like which will propagate the layout information + # from the model weights if any.
+ initial_value = tf.zeros_like(model_variable) + elif isinstance(initial_value, tf.Tensor): + initial_value = tf.experimental.dtensor.copy_to_mesh( + initial_value, + tf.experimental.dtensor.Layout.replicated( + self._mesh, rank=initial_value.shape.rank + ), + ) + variable = tf.experimental.dtensor.DVariable( + initial_value=initial_value, + name=f"{variable_name}/{model_variable._shared_name}", + dtype=model_variable.dtype, + trainable=False, + ) + self._variables.append(variable) + return variable + else: + strategy = tf.distribute.get_strategy() + with strategy.extended.colocate_vars_with(model_variable): + return super().add_variable_from_reference( + model_variable, variable_name, shape, initial_value + ) + + def _create_iteration_variable(self): + if self._mesh: + init_val = tf.constant(0, dtype=tf.int64) + init_val = tf.experimental.dtensor.copy_to_mesh( + init_val, + tf.experimental.dtensor.Layout.replicated(self._mesh, rank=0), + ) + with tf.init_scope(): + # Lift the variable creation to init scope to avoid environment + # issue. + self._iterations = tf.experimental.dtensor.DVariable( + init_val, name="iteration" + ) + self._variables.append(self._iterations) + else: + super()._create_iteration_variable() + + def _var_key(self, variable): + """Get a unique identifier of the given variable.""" + + # Get the distributed variable if it exists. + # TODO(b/197554203): replace _distributed_container() with a public api. + if hasattr(variable, "_distributed_container"): + variable = variable._distributed_container() + elif ( + tf_utils.is_extension_type(variable) + and hasattr(variable, "handle") + and hasattr(variable.handle, "_distributed_container") + ): + # For ResourceVariables, the _distributed_container attribute + # is added to their handle tensors. + variable = variable.handle._distributed_container() + return super()._var_key(variable) + + def aggregate_gradients(self, grads_and_vars): + """Aggregate gradients on all devices. + + By default, we will perform reduce_sum of gradients across devices. + Users can implement their own aggregation logic by overriding this + method. + + Args: + grads_and_vars: List of (gradient, variable) pairs. + + Returns: + List of (gradient, variable) pairs. + """ + if self._mesh or self._run_with_dtensor: + logging.warning( + "Calling aggregate_gradients is unnecessary when the model " + "is used with DTensor, which includes aggregation of " + "replicated gradients as part of backward pass." + ) + return grads_and_vars + else: + return optimizer_utils.all_reduce_sum_gradients(grads_and_vars) + + def apply_gradients( + self, + grads_and_vars, + name=None, + skip_gradients_aggregation=False, + **kwargs, + ): + """Apply gradients to variables. + + Args: + grads_and_vars: List of `(gradient, variable)` pairs. + name: string, defaults to None. The name of the namescope to + use when creating variables. If None, `self.name` will be used. + skip_gradients_aggregation: If true, gradients aggregation will not be + performed inside optimizer. Usually this arg is set to True when you + write custom code aggregating gradients outside the optimizer. + **kwargs: keyword arguments only used for backward compatibility. + + Returns: + A `tf.Variable`, representing the current iteration. + + Raises: + TypeError: If `grads_and_vars` is malformed. + RuntimeError: If called in a cross-replica context. 
+ """ + if self._mesh or self._run_with_dtensor: + # Skip any usage of strategy logic for DTensor + return super().apply_gradients(grads_and_vars, name=name) + + # `experimental_aggregate_gradients` is an arg in `apply_gradients` of + # v2 optimizer -- the reverse of `skip_gradients_aggregation`. + # We read it from kwargs for backward compatibility. + experimental_aggregate_gradients = kwargs.pop( + "experimental_aggregate_gradients", True + ) + if not skip_gradients_aggregation and experimental_aggregate_gradients: + grads_and_vars = self.aggregate_gradients(grads_and_vars) + return super().apply_gradients(grads_and_vars, name=name) + + def _apply_weight_decay(self, variables): + # Apply weight decay in distributed setup. + if self.weight_decay is None: + return + + def distributed_apply_weight_decay(distribution, variables, **kwargs): + def weight_decay_fn(variable): + if self._use_weight_decay(variable): + lr = tf.cast(self.learning_rate, variable.dtype) + wd = tf.cast(self.weight_decay, variable.dtype) + variable.assign_sub(variable * wd * lr) + + for variable in variables: + distribution.extended.update( + variable, weight_decay_fn, group=False + ) + + tf.__internal__.distribute.interim.maybe_merge_call( + distributed_apply_weight_decay, + self._distribution_strategy, + variables, + ) + + def _internal_apply_gradients(self, grads_and_vars): + if self._mesh or self._run_with_dtensor: + # Skip any usage of strategy logic for DTensor + return super()._internal_apply_gradients(grads_and_vars) + + return tf.__internal__.distribute.interim.maybe_merge_call( + self._distributed_apply_gradients_fn, + self._distribution_strategy, + grads_and_vars, + ) + + def _overwrite_model_variables_with_average_value(self, var_list): + """Overwrite model variables with their moving average values. + + This function overwrites variables on each device. + Args: + var_list: list of model variables. + """ + if self._mesh or self._run_with_dtensor: + # Skip any usage of strategy logic for DTensor + super()._overwrite_model_variables_with_average_value(var_list) + + strategy = self._distribution_strategy + # Override model variable by the stored average value on all devices. + for var in var_list: + average = self._model_variables_moving_average[ + self._index_dict[self._var_key(var)] + ] + strategy.extended.update( + var, lambda a, b: a.assign(b), args=(average,) + ) + + def _build_learning_rate(self, learning_rate): + if not self._mesh: + return super()._build_learning_rate(learning_rate) + + # For DTensor + variable_creation = tf.experimental.dtensor.DVariable + init_value_convert_fn = lambda x: tf.experimental.dtensor.copy_to_mesh( + x, tf.experimental.dtensor.Layout.replicated(self._mesh, rank=0) + ) + if isinstance( + learning_rate, learning_rate_schedule.LearningRateSchedule + ): + current_learning_rate = tf.convert_to_tensor( + learning_rate(self.iterations) + ) + current_learning_rate = init_value_convert_fn(current_learning_rate) + # Create a variable to hold the current learning rate. + # Note that the init value `learning_rate(self.iterations)` should + # have the correct layout information from self.iterations. 
+ self._current_learning_rate = variable_creation( + current_learning_rate, + name="learning_rate", + dtype=tf.float32, + ) + return learning_rate + + init_val = init_value_convert_fn( + tf.constant(learning_rate, dtype=tf.float32) + ) + return variable_creation( + init_val, + name="learning_rate", + dtype=backend.floatx(), + trainable=False, + ) + + def _update_model_variables_moving_average(self, var_list): + """Update the stored moving average using the latest value.""" + if self.use_ema: + + def update_average(average, var): + average.assign( + self.ema_momentum * average + (1 - self.ema_momentum) * var + ) + + for var in var_list: + average = self._model_variables_moving_average[ + self._index_dict[self._var_key(var)] + ] + self._distribution_strategy.extended.update( + average, update_average, args=(var,), group=False + ) + + def _distributed_apply_gradients_fn( + self, distribution, grads_and_vars, **kwargs + ): + """`apply_gradients` using a `DistributionStrategy`.""" + + def apply_grad_to_update_var(var, grad): + if self.jit_compile: + return self._update_step_xla(grad, var, id(self._var_key(var))) + else: + return self._update_step(grad, var) + + for grad, var in grads_and_vars: + distribution.extended.update( + var, apply_grad_to_update_var, args=(grad,), group=False + ) + + if self.use_ema: + _, var_list = zip(*grads_and_vars) + self._update_model_variables_moving_average(var_list) + if self.ema_overwrite_frequency: + # Only when self.ema_overwrite_frequency is not None, we + # overwrite the model variables. + should_overwrite_model_vars = ( + self.iterations + 1 + ) % self.ema_overwrite_frequency == 0 + tf.cond( + tf.cast(should_overwrite_model_vars, tf.bool), + true_fn=lambda: self._overwrite_model_variables_with_average_value( # noqa: E501 + var_list + ), + false_fn=lambda: None, + ) + return self.iterations.assign_add(1) + + +class RestoredOptimizer(Optimizer): + def __init__(self): + super().__init__("RestoredOptimizer") + + def get_config(self): + raise NotImplementedError( + "Restoring functional Optimizers from SavedModels is not currently " + "supported. Please file a feature request if this limitation " + "bothers you." + ) + + +class CallableList(list): + """Temporary shim to support both `opt.variables()` and `opt.variables`.""" + + def __call__(self): + return self + + +# Register the optimizer for loading from saved_model purpose. +tf.__internal__.saved_model.load.register_revived_type( + "experimentalOptimizer", + lambda obj: isinstance(obj, Optimizer), + versions=[ + tf.__internal__.saved_model.load.VersionedTypeRegistration( + object_factory=lambda proto: RestoredOptimizer(), + version=2, + min_producer_version=1, + min_consumer_version=1, + ) + ], +) + +Optimizer.__doc__ = Optimizer.__doc__.replace( + "{{base_optimizer_keyword_args}}", base_optimizer_keyword_args +) diff --git a/keras/optimizers/optimizer_experimental/BUILD b/keras/optimizers/optimizer_experimental/BUILD deleted file mode 100644 index 834f3f5ff55f..000000000000 --- a/keras/optimizers/optimizer_experimental/BUILD +++ /dev/null @@ -1,75 +0,0 @@ -# Reworked keras optimizer. For more context, please refer to go/new-keras-optimizer. 
- -load("@org_keras//keras:keras.bzl", "distribute_py_test") - -package( - default_visibility = [ - "//keras:friends", - ], - licenses = ["notice"], -) - -py_library( - name = "optimizer", - srcs = [ - "__init__.py", - "adadelta.py", - "adagrad.py", - "adam.py", - "adamax.py", - "adamw.py", - "ftrl.py", - "nadam.py", - "optimizer.py", - "rmsprop.py", - "sgd.py", - ], - srcs_version = "PY3", - deps = [ - "//:expect_tensorflow_installed", - "//keras:backend", - "//keras/initializers", - "//keras/optimizers/optimizer_v2", - "//keras/optimizers/schedules:learning_rate_schedule", - ], -) - -distribute_py_test( - name = "optimizer_test", - size = "medium", - srcs = ["optimizer_test.py"], - shard_count = 8, - tags = [ - "multi_gpu", - "no_windows", - "nomultivm", # TODO(b/203558991): Re-enable. - ], - deps = [ - "//:expect_absl_installed", - "//:expect_tensorflow_installed", - "//keras", - "//keras/optimizers", - "//keras/testing_infra:test_combinations", - ], -) - -# TODO(b/228209527): Combine this test with optimizer_test after -# fixing the NCCL issue. -distribute_py_test( - name = "optimizer_pss_test", - size = "medium", - srcs = ["optimizer_pss_test.py"], - shard_count = 32, - tags = [ - "multi_gpu", - "no_oss", - "no_windows", - ], - deps = [ - "//:expect_absl_installed", - "//:expect_tensorflow_installed", - "//keras", - "//keras/optimizers", - "//keras/testing_infra:test_combinations", - ], -) diff --git a/keras/optimizers/optimizer_experimental/README.md b/keras/optimizers/optimizer_experimental/README.md deleted file mode 100644 index 1099d68727ff..000000000000 --- a/keras/optimizers/optimizer_experimental/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# Reworked Keras Optimizer - -This directory contains code for [reworked Keras optimizer](go/new-keras-optimizer). -Code in this directory is still under development. To check out production -optimizer code, please refer to directory optimizer_v2/. - -The optimizer rework is mainly about reducing the complexity, and is transparent - to users. Optimizer's public api will remain the same as today. diff --git a/keras/optimizers/optimizer_experimental/adadelta.py b/keras/optimizers/optimizer_experimental/adadelta.py deleted file mode 100644 index deb788eb5977..000000000000 --- a/keras/optimizers/optimizer_experimental/adadelta.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Adadelta optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.Adadelta', v1=[]) -class Adadelta(optimizer.Optimizer): - r"""Optimizer that implements the Adadelta algorithm. 
- - Adadelta optimization is a stochastic gradient descent method that is based on - adaptive learning rate per dimension to address two drawbacks: - - - The continual decay of learning rates throughout training. - - The need for a manually selected global learning rate. - - Adadelta is a more robust extension of Adagrad that adapts learning rates - based on a moving window of gradient updates, instead of accumulating all - past gradients. This way, Adadelta continues learning even when many updates - have been done. Compared to Adagrad, in the original version of Adadelta you - don't have to set an initial learning rate. In this version, the initial - learning rate can be set, as in most other Keras optimizers. - - Args: - learning_rate: Initial value for the learning rate: - either a floating point value, - or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. - Note that `Adadelta` tends to benefit from higher initial learning rate - values compared to other optimizers. - To match the exact form in the original paper, use 1.0. - rho: A `Tensor` or a floating point value. The decay rate. Defaults to 0.95. - epsilon: Small floating point value used to maintain numerical stability. - Defaults to 1e-7. - {{base_optimizer_keyword_args}} - - Reference: - - [Zeiler, 2012](http://arxiv.org/abs/1212.5701) - """ - - def __init__(self, - learning_rate=0.001, - rho=0.95, - epsilon=1e-7, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - name='Adadelta', - **kwargs): - super().__init__( - clipnorm=clipnorm, - clipvalue=clipvalue, - global_clipnorm=global_clipnorm, - use_ema=use_ema, - ema_momentum=ema_momentum, - ema_overwrite_frequency=ema_overwrite_frequency, - jit_compile=jit_compile, - name=name, - **kwargs) - self._learning_rate = self._build_learning_rate(learning_rate) - self.rho = rho - self.epsilon = epsilon - - def build(self, var_list): - super().build(var_list) - if hasattr(self, '_built') and self._built: - return - self._built = True - self._accumulated_grads = [] - self._accumulated_delta_vars = [] - for var in var_list: - self._accumulated_grads.append( - self.add_variable_from_reference(var, 'accumulated_grad')) - self._accumulated_delta_vars.append( - self.add_variable_from_reference(var, 'accumulated_delta_var')) - - def update_step(self, grad, variable): - """Update step given gradient and the associated model variable.""" - lr = tf.cast(self.learning_rate, variable.dtype) - - var_key = self._var_key(variable) - rho = self.rho - accumulated_grad = self._accumulated_grads[self._index_dict[var_key]] - accumulated_delta_var = self._accumulated_delta_vars[ - self._index_dict[var_key]] - - def rms(x): - return tf.sqrt(x + self.epsilon) - - if isinstance(grad, tf.IndexedSlices): - # Sparse gradients. - accumulated_grad.assign_add((rho - 1) * accumulated_grad) - accumulated_grad.scatter_add(tf.IndexedSlices( - (1 - rho) * tf.square(grad.values), grad.indices)) - delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad) - accumulated_delta_var.assign(rho * accumulated_delta_var + - (1 - rho) * delta_var * delta_var) - else: - # Dense gradients. 
- accumulated_grad.assign(rho * accumulated_grad + (1 - rho) * grad * grad) - delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad) - accumulated_delta_var.assign(rho * accumulated_delta_var + - (1 - rho) * delta_var * delta_var) - variable.assign_add(lr * delta_var) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'rho': self.rho, - 'epsilon': self.epsilon, - }) - return config - -Adadelta.__doc__ = Adadelta.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/adagrad.py b/keras/optimizers/optimizer_experimental/adagrad.py deleted file mode 100644 index a65bace9f185..000000000000 --- a/keras/optimizers/optimizer_experimental/adagrad.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Adagrad optimizer implementation.""" - -from keras import initializers -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.Adagrad', v1=[]) -class Adagrad(optimizer.Optimizer): - r"""Optimizer that implements the Adagrad algorithm. - - Adagrad is an optimizer with parameter-specific learning rates, - which are adapted relative to how frequently a parameter gets - updated during training. The more updates a parameter receives, - the smaller the updates. - - Args: - learning_rate: Initial value for the learning rate: - either a floating point value, - or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. - Note that `Adagrad` tends to benefit from higher initial learning rate - values compared to other optimizers. - To match the exact form in the original paper, use 1.0. - initial_accumulator_value: Floating point value. - Starting value for the accumulators (per-parameter momentum values). - Must be non-negative. - epsilon: Small floating point value used to maintain numerical stability. - {{base_optimizer_keyword_args}} - - Reference: - - [Duchi et al., 2011]( - http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf). 
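The parameter-specific adaptation described above boils down, in the dense case, to two assignments. A standalone sketch with illustrative values (plain TensorFlow, not the optimizer class itself):

```python
import tensorflow as tf

lr, epsilon = 0.001, 1e-7
w = tf.Variable([0.5, -0.3])
accumulator = tf.Variable(tf.fill([2], 0.1))  # initial_accumulator_value
g = tf.constant([0.2, -0.1])  # gradient of the loss w.r.t. w

# Accumulate squared gradients; frequently updated parameters get
# smaller steps because their accumulator grows faster.
accumulator.assign_add(g * g)
w.assign_sub(lr * g / tf.sqrt(accumulator + epsilon))
```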
- """ - - def __init__(self, - learning_rate=0.001, - initial_accumulator_value=0.1, - epsilon=1e-7, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - name='Adagrad', - **kwargs): - super().__init__( - clipnorm=clipnorm, - clipvalue=clipvalue, - global_clipnorm=global_clipnorm, - use_ema=use_ema, - ema_momentum=ema_momentum, - ema_overwrite_frequency=ema_overwrite_frequency, - jit_compile=jit_compile, - name=name, - **kwargs) - self._learning_rate = self._build_learning_rate(learning_rate) - self.initial_accumulator_value = initial_accumulator_value - self.epsilon = epsilon - - def build(self, var_list): - super().build(var_list) - if hasattr(self, '_built') and self._built: - return - self._built = True - self._accumulators = [] - initializer = initializers.Constant(self.initial_accumulator_value) - for var in var_list: - self._accumulators.append( - self.add_variable_from_reference( - var, - 'accumulator', - initial_value=initializer(shape=var.shape, dtype=var.dtype))) - - def update_step(self, grad, variable): - """Update step given gradient and the associated model variable.""" - lr = tf.cast(self.learning_rate, variable.dtype) - - var_key = self._var_key(variable) - accumulator = self._accumulators[self._index_dict[var_key]] - - if isinstance(grad, tf.IndexedSlices): - # Sparse gradients. - accumulator.scatter_add( - tf.IndexedSlices(grad.values * grad.values, grad.indices)) - else: - # Dense gradients. - accumulator.assign_add(grad * grad) - variable.assign_sub(lr * grad / tf.sqrt(accumulator + self.epsilon)) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'initial_accumulator_value': self.initial_accumulator_value, - 'epsilon': self.epsilon, - }) - return config - - -Adagrad.__doc__ = Adagrad.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/adam.py b/keras/optimizers/optimizer_experimental/adam.py deleted file mode 100644 index 5d7f271dc034..000000000000 --- a/keras/optimizers/optimizer_experimental/adam.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Adam optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.Adam', v1=[]) -class Adam(optimizer.Optimizer): - r"""Optimizer that implements the Adam algorithm. 
- - Adam optimization is a stochastic gradient descent method that is based on - adaptive estimation of first-order and second-order moments. - - According to - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980), - the method is "*computationally - efficient, has little memory requirement, invariant to diagonal rescaling of - gradients, and is well suited for problems that are large in terms of - data/parameters*". - - Args: - learning_rate: A `tf.Tensor`, floating point value, a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. - beta_1: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. - beta_2: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from - the paper "On the Convergence of Adam and beyond". Defaults to `False`. - {{base_optimizer_keyword_args}} - - Reference: - - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) - - [Reddi et al., 2018]( - https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`. - - Notes: - - The default value of 1e-7 for epsilon might not be a good default in - general. For example, when training an Inception network on ImageNet a - current good choice is 1.0 or 0.1. Note that since Adam uses the - formulation just before Section 2.1 of the Kingma and Ba paper rather than - the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon - hat" in the paper. - - The sparse implementation of this algorithm (used when the gradient is an - IndexedSlices object, typically because of `tf.gather` or an embedding - lookup in the forward pass) does apply momentum to variable slices even if - they were not used in the forward pass (meaning they have a gradient equal - to zero). Momentum decay (beta1) is also applied to the entire momentum - accumulator. This means that the sparse behavior is equivalent to the dense - behavior (in contrast to some momentum implementations which ignore momentum - unless a variable slice was actually used). - """ - - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - amsgrad=False, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - name='Adam', - **kwargs): - super().__init__( - name=name, - clipnorm=clipnorm, - clipvalue=clipvalue, - global_clipnorm=global_clipnorm, - use_ema=use_ema, - ema_momentum=ema_momentum, - ema_overwrite_frequency=ema_overwrite_frequency, - jit_compile=jit_compile, - **kwargs) - self._learning_rate = self._build_learning_rate(learning_rate) - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.amsgrad = amsgrad - - def build(self, var_list): - """Initialize optimizer variables. 
- - Adam optimizer has 3 types of variables: momentums, velocities and - velocity_hat (only set when amsgrad is applied), - - Args: - var_list: list of model variables to build Adam variables on. - """ - super().build(var_list) - if hasattr(self, '_built') and self._built: - return - self._built = True - self._momentums = [] - self._velocities = [] - for var in var_list: - self._momentums.append( - self.add_variable_from_reference( - model_variable=var, variable_name='m')) - self._velocities.append( - self.add_variable_from_reference( - model_variable=var, variable_name='v')) - if self.amsgrad: - self._velocity_hats = [] - for var in var_list: - self._velocity_hats.append( - self.add_variable_from_reference( - model_variable=var, variable_name='vhat')) - - def update_step(self, gradient, variable): - """Update step given gradient and the associated model variable.""" - beta_1_power = None - beta_2_power = None - lr = tf.cast(self.learning_rate, variable.dtype) - local_step = tf.cast(self.iterations + 1, variable.dtype) - beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step) - beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step) - - var_key = self._var_key(variable) - m = self._momentums[self._index_dict[var_key]] - v = self._velocities[self._index_dict[var_key]] - - alpha = (lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)) - - if isinstance(gradient, tf.IndexedSlices): - # Sparse gradients. - m.assign_add(-m * (1 - self.beta_1)) - m.scatter_add( - tf.IndexedSlices(gradient.values * (1 - self.beta_1), - gradient.indices)) - v.assign_add(-v * (1 - self.beta_2)) - v.scatter_add( - tf.IndexedSlices( - tf.square(gradient.values) * (1 - self.beta_2), gradient.indices)) - if self.amsgrad: - v_hat = self._velocity_hats[self._index_dict[var_key]] - v_hat.assign(tf.maximum(v_hat, v)) - v = v_hat - variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) - else: - # Dense gradients. - m.assign_add((gradient - m) * (1 - self.beta_1)) - v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2)) - if self.amsgrad: - v_hat = self._velocity_hats[self._index_dict[var_key]] - v_hat.assign(tf.maximum(v_hat, v)) - v = v_hat - variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'beta_1': self.beta_1, - 'beta_2': self.beta_2, - 'epsilon': self.epsilon, - 'amsgrad': self.amsgrad, - }) - return config - - -Adam.__doc__ = Adam.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/adamax.py b/keras/optimizers/optimizer_experimental/adamax.py deleted file mode 100644 index 2d4f89dc7c95..000000000000 --- a/keras/optimizers/optimizer_experimental/adamax.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
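For reference, the dense branch of Adam's `update_step` shown earlier reduces to the following self-contained arithmetic, with the bias correction folded into `alpha` exactly as in the code (values here are illustrative):

```python
import tensorflow as tf

lr, beta_1, beta_2, epsilon = 0.001, 0.9, 0.999, 1e-7
w = tf.Variable([0.5, -0.3])
m = tf.Variable(tf.zeros_like(w))  # first-moment estimate
v = tf.Variable(tf.zeros_like(w))  # second-moment estimate
g = tf.constant([0.2, -0.1])
t = 1  # local_step = iterations + 1

# Bias-corrected step size, as computed in update_step():
alpha = lr * (1 - beta_2**t) ** 0.5 / (1 - beta_1**t)
m.assign_add((g - m) * (1 - beta_1))
v.assign_add((tf.square(g) - v) * (1 - beta_2))
w.assign_sub(m * alpha / (tf.sqrt(v) + epsilon))
```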
-# ============================================================================== -"""Adamax optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.Adamax', v1=[]) -class Adamax(optimizer.Optimizer): - """Optimizer that implements the Adamax algorithm. - - Adamax, a variant of Adam based on the infinity norm, is a first-order - gradient-based optimization method. Due to its capability of adjusting the - learning rate based on data characteristics, it is suited to learning - time-variant processes, e.g., speech data with dynamically changing noise - conditions. Default parameters follow those provided in the paper (see - references below). - - Initialization: - - ```python - m = 0 # Initialize the 1st moment vector - u = 0 # Initialize the exponentially weighted infinity norm - t = 0 # Initialize timestep - ``` - - The update rule for parameter `w` with gradient `g` is - described at the end of section 7.1 of the paper (see the reference section): - - ```python - t += 1 - m = beta1 * m + (1 - beta1) * g - u = max(beta2 * u, abs(g)) - current_lr = learning_rate / (1 - beta1 ** t) - w = w - current_lr * m / (u + epsilon) - ``` - - Args: - learning_rate: A `tf.Tensor`, floating point value, a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. - beta_1: A float value or a constant float tensor. The exponential decay - rate for the 1st moment estimates. - beta_2: A float value or a constant float tensor. The exponential decay - rate for the exponentially weighted infinity norm. - epsilon: A small constant for numerical stability. - {{base_optimizer_keyword_args}} - - Reference: - - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) - """ - - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - name='Adamax', - **kwargs): - super().__init__( - name=name, - clipnorm=clipnorm, - clipvalue=clipvalue, - global_clipnorm=global_clipnorm, - use_ema=use_ema, - ema_momentum=ema_momentum, - ema_overwrite_frequency=ema_overwrite_frequency, - jit_compile=jit_compile, - **kwargs) - self._learning_rate = self._build_learning_rate(learning_rate) - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - - def build(self, var_list): - """Initialize optimizer variables. - - Adamax optimizer has 2 types of variables: momentums (denoted as m) and the - exponentially weighted infinity norm (denoted as u). - - Args: - var_list: list of model variables to build Adamax variables on.
- """ - super().build(var_list) - if hasattr(self, '_built') and self._built: - return - self._built = True - self._m = [] - self._u = [] - for var in var_list: - self._m.append( - self.add_variable_from_reference( - model_variable=var, variable_name='m')) - self._u.append( - self.add_variable_from_reference( - model_variable=var, variable_name='u')) - - def update_step(self, gradient, variable): - """Update step given gradient and the associated model variable.""" - lr = tf.cast(self.learning_rate, variable.dtype) - local_step = tf.cast(self.iterations + 1, variable.dtype) - beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step) - - var_key = self._var_key(variable) - m = self._m[self._index_dict[var_key]] - u = self._u[self._index_dict[var_key]] - - if isinstance(gradient, tf.IndexedSlices): - # Sparse gradients. - indices = gradient.indices - m.assign_add(-m * (1 - self.beta_1)) - m.scatter_add( - tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices)) - u.assign(u * self.beta_2) - u_slice = tf.gather(u, indices) - u_slice_incremental = tf.maximum( - u_slice, - tf.abs(gradient.values)) - u_slice - u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices)) - variable.assign_sub((lr * m) / ((1 - beta_1_power) * (u + self.epsilon))) - else: - # Dense gradients. - m.assign_add((gradient - m) * (1 - self.beta_1)) - u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient))) - variable.assign_sub((lr * m) / ((1 - beta_1_power) * (u + self.epsilon))) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'beta_1': self.beta_1, - 'beta_2': self.beta_2, - 'epsilon': self.epsilon, - }) - return config - - -Adamax.__doc__ = Adamax.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/adamw.py b/keras/optimizers/optimizer_experimental/adamw.py deleted file mode 100644 index 296fbcf8ca19..000000000000 --- a/keras/optimizers/optimizer_experimental/adamw.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""AdamW optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.AdamW', v1=[]) -class AdamW(optimizer.Optimizer): - r"""Optimizer that implements the AdamW algorithm. 
- - AdamW optimization is a stochastic gradient descent method that is based on - adaptive estimation of first-order and second-order moments with an added - method to decay weights per the techniques discussed in the paper, - 'Decoupled Weight Decay Regularization' by - [Loshchilov, Hutter et al., 2019](https://arxiv.org/abs/1711.05101). - - According to - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980), - the underlying Adam method is "*computationally - efficient, has little memory requirement, invariant to diagonal rescaling of - gradients, and is well suited for problems that are large in terms of - data/parameters*". - - Args: - learning_rate: A `tf.Tensor`, floating point value, a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. - weight_decay: A `tf.Tensor`, floating point value. The weight decay. - Defaults to 0.004. - beta_1: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. - beta_2: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm from - the paper "On the Convergence of Adam and beyond". Defaults to `False`. - {{base_optimizer_keyword_args}} - - Reference: - - [Loshchilov et al., 2019](https://arxiv.org/abs/1711.05101) - - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) for `adam` - - [Reddi et al., 2018]( - https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`. - - Notes: - - The default value of 1e-7 for epsilon might not be a good default in - general. For example, when training an Inception network on ImageNet a - current good choice is 1.0 or 0.1. Note that since Adam uses the - formulation just before Section 2.1 of the Kingma and Ba paper rather than - the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon - hat" in the paper. - - The sparse implementation of this algorithm (used when the gradient is an - IndexedSlices object, typically because of `tf.gather` or an embedding - lookup in the forward pass) does apply momentum to variable slices even if - they were not used in the forward pass (meaning they have a gradient equal - to zero). Momentum decay (beta1) is also applied to the entire momentum - accumulator. This means that the sparse behavior is equivalent to the dense - behavior (in contrast to some momentum implementations which ignore momentum - unless a variable slice was actually used).
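The word "decoupled" in the paper title is the key detail here: the decay is applied directly to the weights, outside the adaptive gradient step, rather than as an L2 term in the loss. A two-line sketch of what `update_step` does before the Adam update (illustrative values):

```python
import tensorflow as tf

weight_decay = 0.004
w = tf.Variable([0.5, -0.3])

# Decoupled weight decay: shrink the weights directly, so the decay is
# not rescaled by Adam's adaptive denominator (as an L2 loss term would be).
w.assign_sub(w * weight_decay)
# ...then apply the ordinary Adam update of `w` from its gradient.
```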
- """ - - def __init__(self, - learning_rate=0.001, - weight_decay=0.004, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - amsgrad=False, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - name='AdamW', - **kwargs): - super().__init__( - name=name, - clipnorm=clipnorm, - clipvalue=clipvalue, - global_clipnorm=global_clipnorm, - use_ema=use_ema, - ema_momentum=ema_momentum, - ema_overwrite_frequency=ema_overwrite_frequency, - jit_compile=jit_compile, - **kwargs) - self._learning_rate = self._build_learning_rate(learning_rate) - self.weight_decay = weight_decay - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.amsgrad = amsgrad - - if self.weight_decay is None: - raise ValueError('Missing value of `weight_decay` which is required and' - ' must be a float value.') - - def build(self, var_list, exclude_from_weight_decay=None): - """Initialize optimizer variables. - - AdamW optimizer has 3 types of variables: momentums, velocities and - velocity_hat (only set when amsgrad is applied), - - Args: - var_list: list of model variables to build AdamW variables on. - exclude_from_weight_decay: list of model variables that will be excluded - from weight decay. - """ - super().build(var_list) - if hasattr(self, '_built') and self._built: - return - self._built = True - if not hasattr(self, '_exclude_from_weight_decay'): - self._exclude_from_weight_decay = exclude_from_weight_decay or [] - self._momentums = [] - self._velocities = [] - for var in var_list: - self._momentums.append( - self.add_variable_from_reference( - model_variable=var, variable_name='m')) - self._velocities.append( - self.add_variable_from_reference( - model_variable=var, variable_name='v')) - if self.amsgrad: - self._velocity_hats = [] - for var in var_list: - self._velocity_hats.append( - self.add_variable_from_reference( - model_variable=var, variable_name='vhat')) - - def update_step(self, gradient, variable): - """Update step given gradient and the associated model variable.""" - beta_1_power = None - beta_2_power = None - lr = tf.cast(self.learning_rate, variable.dtype) - local_step = tf.cast(self.iterations + 1, variable.dtype) - beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step) - beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step) - - var_key = self._var_key(variable) - m = self._momentums[self._index_dict[var_key]] - v = self._velocities[self._index_dict[var_key]] - - alpha = (lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)) - - # Apply step weight decay - if (self.weight_decay != 0 and - variable not in self._exclude_from_weight_decay): - wd = tf.cast(self.weight_decay, variable.dtype) - variable.assign_sub(variable * wd) - - if isinstance(gradient, tf.IndexedSlices): - # Sparse gradients. - m.assign_add(-m * (1 - self.beta_1)) - m.scatter_add( - tf.IndexedSlices(gradient.values * (1 - self.beta_1), - gradient.indices)) - v.assign_add(-v * (1 - self.beta_2)) - v.scatter_add( - tf.IndexedSlices( - tf.square(gradient.values) * (1 - self.beta_2), gradient.indices)) - if self.amsgrad: - v_hat = self._velocity_hats[self._index_dict[var_key]] - v_hat.assign(tf.maximum(v_hat, v)) - v = v_hat - variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) - else: - # Dense gradients. 
- m.assign_add((gradient - m) * (1 - self.beta_1)) - v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2)) - if self.amsgrad: - v_hat = self._velocity_hats[self._index_dict[var_key]] - v_hat.assign(tf.maximum(v_hat, v)) - v = v_hat - variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon)) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'weight_decay': self.weight_decay, - 'beta_1': self.beta_1, - 'beta_2': self.beta_2, - 'epsilon': self.epsilon, - 'amsgrad': self.amsgrad, - }) - return config - - def exclude_from_weight_decay(self, var_list): - if hasattr(self, '_built') and self._built: - raise ValueError( - '`exclude_from_weight_decay()` can only be configured before ' - 'the optimizer is built.' - ) - - self._exclude_from_weight_decay = var_list or [] - - -AdamW.__doc__ = AdamW.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/ftrl.py b/keras/optimizers/optimizer_experimental/ftrl.py deleted file mode 100644 index aa7ffe3cc319..000000000000 --- a/keras/optimizers/optimizer_experimental/ftrl.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""FTRL optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.Ftrl', v1=[]) -class Ftrl(optimizer.Optimizer): - r"""Optimizer that implements the FTRL algorithm. - - "Follow The Regularized Leader" (FTRL) is an optimization algorithm developed - at Google for click-through rate prediction in the early 2010s. It is most - suitable for shallow models with large and sparse feature spaces. - The algorithm is described by - [McMahan et al., 2013](https://research.google.com/pubs/archive/41159.pdf). - The Keras version has support for both online L2 regularization - (the L2 regularization described in the paper - above) and shrinkage-type L2 regularization - (which is the addition of an L2 penalty to the loss function).
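Returning briefly to AdamW before the FTRL details: `exclude_from_weight_decay()` above must be called before the optimizer is built, i.e. before the first `apply_gradients()`. A hypothetical usage sketch, assuming the pre-removal import path of this module and illustrative variable names:

```python
import tensorflow as tf
from keras.optimizers.optimizer_experimental import adamw

# Illustrative variables; in practice these come from a model.
kernel = tf.Variable(tf.ones((3, 3)), name="kernel")
bias = tf.Variable(tf.zeros(3), name="bias")

opt = adamw.AdamW(learning_rate=1e-3, weight_decay=0.004)
# Configure exclusions before the optimizer builds its slot variables;
# calling this after build() raises a ValueError.
opt.exclude_from_weight_decay([bias])
```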
- - Initialization: - - ```python - n = 0 - sigma = 0 - z = 0 - ``` - - Update rule for one variable `w`: - - ```python - prev_n = n - n = n + g ** 2 - sigma = (n ** -lr_power - prev_n ** -lr_power) / lr - z = z + g - sigma * w - if abs(z) < lambda_1: - w = 0 - else: - w = (sgn(z) * lambda_1 - z) / ((beta + sqrt(n)) / alpha + lambda_2) - ``` - - Notation: - - - `lr` is the learning rate - - `g` is the gradient for the variable - - `lambda_1` is the L1 regularization strength - - `lambda_2` is the L2 regularization strength - - `lr_power` is the power to scale n. - - Check the documentation for the `l2_shrinkage_regularization_strength` - parameter for more details when shrinkage is enabled, in which case gradient - is replaced with a gradient with shrinkage. - - Args: - learning_rate: A `Tensor`, floating point value, a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that - takes no arguments and returns the actual value to use. The learning rate. - Defaults to 0.001. - learning_rate_power: A float value, must be less or equal to zero. Controls - how the learning rate decreases during training. Use zero for a fixed - learning rate. - initial_accumulator_value: The starting value for accumulators. Only zero or - positive values are allowed. - l1_regularization_strength: A float value, must be greater than or equal to - zero. Defaults to 0.0. - l2_regularization_strength: A float value, must be greater than or equal to - zero. Defaults to 0.0. - l2_shrinkage_regularization_strength: A float value, must be greater than or - equal to zero. This differs from L2 above in that the L2 above is a - stabilization penalty, whereas this L2 shrinkage is a magnitude penalty. - When input is sparse shrinkage will only happen on the active weights. - beta: A float value, representing the beta value from the paper. Defaults to - 0.0. - {{base_optimizer_keyword_args}} - """ - - def __init__(self, - learning_rate=0.001, - learning_rate_power=-0.5, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0, - l2_shrinkage_regularization_strength=0.0, - beta=0.0, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - name='Ftrl', - **kwargs): - super().__init__( - name=name, - clipnorm=clipnorm, - clipvalue=clipvalue, - global_clipnorm=global_clipnorm, - use_ema=use_ema, - ema_momentum=ema_momentum, - ema_overwrite_frequency=ema_overwrite_frequency, - jit_compile=jit_compile, - **kwargs) - - if initial_accumulator_value < 0.0: - raise ValueError( - '`initial_accumulator_value` needs to be positive or zero. Received: ' - f'initial_accumulator_value={initial_accumulator_value}.') - if learning_rate_power > 0.0: - raise ValueError( - '`learning_rate_power` needs to be negative or zero. Received: ' - f'learning_rate_power={learning_rate_power}.') - if l1_regularization_strength < 0.0: - raise ValueError( - '`l1_regularization_strength` needs to be positive or zero. ' - f'Received: l1_regularization_strength={l1_regularization_strength}.') - if l2_regularization_strength < 0.0: - raise ValueError( - '`l2_regularization_strength` needs to be positive or zero. ' - f'Received: l2_regularization_strength={l2_regularization_strength}.') - if l2_shrinkage_regularization_strength < 0.0: - raise ValueError( - '`l2_shrinkage_regularization_strength` needs to be positive or ' - 'zero. 
Received: l2_shrinkage_regularization_strength' - f'={l2_shrinkage_regularization_strength}.') - - self._learning_rate = self._build_learning_rate(learning_rate) - self.learning_rate_power = learning_rate_power - self.initial_accumulator_value = initial_accumulator_value - self.l1_regularization_strength = l1_regularization_strength - self.l2_regularization_strength = l2_regularization_strength - self.l2_shrinkage_regularization_strength = ( - l2_shrinkage_regularization_strength) - self.beta = beta - - def build(self, var_list): - """Initialize optimizer variables. - - Args: - var_list: list of model variables to build Ftrl variables on. - """ - super().build(var_list) - if hasattr(self, '_built') and self._built: - return - self._accumulators = [] - self._linears = [] - for var in var_list: - self._accumulators.append( - self.add_variable_from_reference( - model_variable=var, - variable_name='accumulator', - initial_value=tf.cast( - tf.fill(dims=var.shape, value=self.initial_accumulator_value), - dtype=var.dtype))) - self._linears.append( - self.add_variable_from_reference( - model_variable=var, variable_name='linear')) - self._built = True - - def update_step(self, gradient, variable): - """Update step given gradient and the associated model variable.""" - - lr = tf.cast(self.learning_rate, variable.dtype) - var_key = self._var_key(variable) - accum = self._accumulators[self._index_dict[var_key]] - linear = self._linears[self._index_dict[var_key]] - - lr_power = self.learning_rate_power - l2_reg = self.l2_regularization_strength - l2_reg = (l2_reg + self.beta / (2. * lr)) - - # Ftrl optimizer has the same implementation for sparse and dense - # gradients update. - grad_to_use = ( - gradient + 2 * self.l2_shrinkage_regularization_strength * variable) - new_accum = accum + tf.pow(gradient, 2) - linear.assign_add(grad_to_use - - (tf.pow(new_accum, -lr_power) - - tf.pow(accum, -lr_power)) / lr * variable) - quadratic = tf.pow(new_accum, - (-lr_power)) / lr + 2 * l2_reg - linear_clipped = tf.clip_by_value(linear, - -self.l1_regularization_strength, - self.l1_regularization_strength) - variable.assign((linear_clipped - linear) / quadratic) - accum.assign(new_accum) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': - self._serialize_hyperparameter(self._learning_rate), - 'learning_rate_power': - self.learning_rate_power, - 'initial_accumulator_value': - self.initial_accumulator_value, - 'l1_regularization_strength': - self.l1_regularization_strength, - 'l2_regularization_strength': - self.l2_regularization_strength, - 'l2_shrinkage_regularization_strength': - self.l2_shrinkage_regularization_strength, - 'beta': - self.beta, - }) - return config - - -Ftrl.__doc__ = Ftrl.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/nadam.py b/keras/optimizers/optimizer_experimental/nadam.py deleted file mode 100644 index b9557ad70da2..000000000000 --- a/keras/optimizers/optimizer_experimental/nadam.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
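Stripped of the sparse and shrinkage bookkeeping, the FTRL step in `update_step` above is a closed-form per-variable assignment. A plain-TensorFlow sketch with illustrative values:

```python
import tensorflow as tf

lr, lr_power, l1, l2, beta = 0.001, -0.5, 0.01, 0.01, 0.0
w = tf.Variable([0.5, -0.3])
accum = tf.Variable(tf.fill([2], 0.1))  # squared-gradient accumulator
linear = tf.Variable(tf.zeros_like(w))  # FTRL linear term
g = tf.constant([0.2, -0.1])

l2_eff = l2 + beta / (2.0 * lr)  # effective L2 term, as in update_step()
new_accum = accum + tf.square(g)
linear.assign_add(
    g - (tf.pow(new_accum, -lr_power) - tf.pow(accum, -lr_power)) / lr * w)
quadratic = tf.pow(new_accum, -lr_power) / lr + 2 * l2_eff
# Soft-thresholding: weights whose |linear| stays below l1 collapse to 0.
w.assign((tf.clip_by_value(linear, -l1, l1) - linear) / quadratic)
accum.assign(new_accum)
```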
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Nadam optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.Nadam', v1=[]) -class Nadam(optimizer.Optimizer): - r"""Optimizer that implements the Nadam algorithm. - - Much like Adam is essentially RMSprop with momentum, Nadam is Adam with - Nesterov momentum. - - Args: - learning_rate: A `tf.Tensor`, floating point value, a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. - beta_1: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. - beta_2: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - {{base_optimizer_keyword_args}} - - Reference: - - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf). - - """ - - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - name='Nadam', - **kwargs): - super().__init__( - name=name, - clipnorm=clipnorm, - clipvalue=clipvalue, - global_clipnorm=global_clipnorm, - use_ema=use_ema, - ema_momentum=ema_momentum, - ema_overwrite_frequency=ema_overwrite_frequency, - jit_compile=jit_compile, - **kwargs) - self._learning_rate = self._build_learning_rate(learning_rate) - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - - def build(self, var_list): - """Initialize optimizer variables. - - Nadam optimizer has 2 types of variables: momentums and velocities. - - Args: - var_list: list of model variables to build Nadam variables on. - """ - super().build(var_list) - if getattr(self, '_built', False): - return - self._built = True - self._momentums = [] - self._velocities = [] - self._u_product = tf.Variable(1.0, dtype=var_list[0].dtype) - # Keep a counter on how many times of _u_product has been computed to - # avoid duplicated computations. 
- self._u_product_counter = 1 - - for var in var_list: - self._momentums.append( - self.add_variable_from_reference( - model_variable=var, variable_name='m')) - self._velocities.append( - self.add_variable_from_reference( - model_variable=var, variable_name='v')) - - def update_step(self, gradient, variable): - """Update step given gradient and the associated model variable.""" - var_dtype = variable.dtype - lr = tf.cast(self.learning_rate, var_dtype) - local_step = tf.cast(self.iterations + 1, var_dtype) - next_step = tf.cast(self.iterations + 2, var_dtype) - decay = tf.cast(0.96, var_dtype) - beta_1 = tf.cast(self.beta_1, var_dtype) - beta_2 = tf.cast(self.beta_2, var_dtype) - u_t = beta_1 * (1. - 0.5 * (tf.pow(decay, local_step))) - u_t_1 = beta_1 * (1. - 0.5 * (tf.pow(decay, next_step))) - def get_cached_u_product(): - return self._u_product - - def compute_new_u_product(): - u_product_t = self._u_product * u_t - self._u_product.assign(u_product_t) - self._u_product_counter += 1 - return u_product_t - - u_product_t = tf.cond( - self._u_product_counter == (self.iterations + 2), - true_fn=get_cached_u_product, - false_fn=compute_new_u_product) - u_product_t_1 = u_product_t * u_t_1 - beta_2_power = tf.pow(beta_2, local_step) - - var_key = self._var_key(variable) - m = self._momentums[self._index_dict[var_key]] - v = self._velocities[self._index_dict[var_key]] - - if isinstance(gradient, tf.IndexedSlices): - # Sparse gradients. - m.assign_add(-m * (1 - beta_1)) - m.scatter_add( - tf.IndexedSlices(gradient.values * (1 - beta_1), - gradient.indices)) - v.assign_add(-v * (1 - beta_2)) - v.scatter_add( - tf.IndexedSlices( - tf.square(gradient.values) * (1 - beta_2), gradient.indices)) - m_hat = ( - u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / - (1 - u_product_t)) - v_hat = v / (1 - beta_2_power) - - variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon)) - else: - # Dense gradients. - m.assign_add((gradient - m) * (1 - beta_1)) - v.assign_add((tf.square(gradient) - v) * (1 - beta_2)) - m_hat = ( - u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / - (1 - u_product_t)) - v_hat = v / (1 - beta_2_power) - - variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon)) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'beta_1': self.beta_1, - 'beta_2': self.beta_2, - 'epsilon': self.epsilon, - }) - return config - -Nadam.__doc__ = Nadam.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/optimizer.py b/keras/optimizers/optimizer_experimental/optimizer.py deleted file mode 100644 index eed265b8d0f6..000000000000 --- a/keras/optimizers/optimizer_experimental/optimizer.py +++ /dev/null @@ -1,895 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
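The `_u_product` machinery above caches the running product of the momentum coefficients `u_t` so it is computed once per step. A condensed single-step sketch of the dense Nadam update (illustrative values):

```python
import tensorflow as tf

lr, beta_1, beta_2, epsilon, decay = 0.001, 0.9, 0.999, 1e-7, 0.96
w = tf.Variable([0.5, -0.3])
m = tf.Variable(tf.zeros_like(w))
v = tf.Variable(tf.zeros_like(w))
g = tf.constant([0.2, -0.1])
t = 1  # local_step

u_t = beta_1 * (1.0 - 0.5 * decay**t)          # momentum coefficient at t
u_t_1 = beta_1 * (1.0 - 0.5 * decay**(t + 1))  # and at t + 1
u_prod_t = u_t                # running product u_1 * ... * u_t (t == 1 here)
u_prod_t_1 = u_prod_t * u_t_1

m.assign_add((g - m) * (1 - beta_1))
v.assign_add((tf.square(g) - v) * (1 - beta_2))
# Nesterov look-ahead: mix the current gradient into the corrected momentum.
m_hat = u_t_1 * m / (1 - u_prod_t_1) + (1 - u_t) * g / (1 - u_prod_t)
v_hat = v / (1 - beta_2**t)
w.assign_sub(m_hat * lr / (tf.sqrt(v_hat) + epsilon))
```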
-# ============================================================================== -"""Base class of optimizer. - -This is under development, and subject to interface/implementation changes. -""" - -import abc -from absl import logging - -from keras import backend -from keras import initializers -from keras.optimizers.optimizer_v2 import utils as optimizer_utils -from keras.optimizers.schedules import learning_rate_schedule -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export -from tensorflow.tools.docs import doc_controls - - -class _BaseOptimizer(tf.Module): - """Optimizer base class, which only supports non-distribute use case.""" - - def __init__(self, - name, - clipnorm=None, - clipvalue=None, - global_clipnorm=None, - use_ema=False, - ema_momentum=0.99, - ema_overwrite_frequency=None, - jit_compile=True, - **kwargs): - self._name = name - self.clipnorm = clipnorm - self.global_clipnorm = global_clipnorm - self.clipvalue = clipvalue - self.use_ema = use_ema - self.jit_compile = jit_compile - if not tf.config.list_physical_devices("GPU"): - # Optimizer only benefits from XLA when training on GPU. So if no GPU is - # found, we turn off XLA. - self.jit_compile = False - if use_ema: - # Verify the arguments related to EMA. - if ema_momentum > 1 or ema_momentum < 0: - raise ValueError("`ema_momentum` must be in the range [0, 1]. " - f"Received: ema_momentum={ema_momentum}") - if ema_overwrite_frequency and (not isinstance( - ema_overwrite_frequency, int) or ema_overwrite_frequency < 1): - raise ValueError( - "`ema_overwrite_frequency` must be an integer > 1 or None. " - f"Received: ema_overwrite_frequency={ema_overwrite_frequency}") - self.ema_momentum = ema_momentum - self.ema_overwrite_frequency = ema_overwrite_frequency - - if self.clipnorm is not None and self.global_clipnorm is not None: - raise ValueError(f"At most one of `clipnorm` and `global_clipnorm` can " - f"be set. Received: clipnorm={self.clipnorm}, " - f"global_clipnorm={self.global_clipnorm}.") - - self._create_iteration_variable() - self._process_kwargs(kwargs) - - def _create_iteration_variable(self): - """Create the iterations counter variable.""" - with tf.init_scope(): - # Lift the variable creation to init scope to avoid environment issue. - self._iterations = tf.Variable( - 0, name="iteration", dtype=tf.int64, trainable=False) - - def _process_kwargs(self, kwargs): - legacy_kwargs = { - "lr", "decay", "gradient_transformers", "gradient_aggregator" - } - for k in kwargs: - if k in legacy_kwargs: - logging.warning( - "%s is deprecated in `optimizer_experimental.Optimizer`" - ", please check the docstring for valid arguments.", k) - else: - raise TypeError(f"{k} is not a valid argument, kwargs should be empty " - " for `optimizer_experimental.Optimizer`.") - - def _var_key(self, variable): - """Get a unique identifier of the given variable.""" - # Get the distributed variable if it exists. - # TODO(b/199214315): replace _unique_id with ref() after fixing ref() issues - # on AggregatingVariable. - return variable._unique_id # pylint: disable=protected-access - - @abc.abstractmethod - def update_step(self, gradient, variable): - """Function to update variable value based on given gradients. - - This method must be implemented in customized optimizers. - - Args: - gradient: backpropagated gradient of the given variable. - variable: variable whose value needs to be updated. - - Returns: - An `Operation` that applies the specified gradients. 
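To make the `update_step` contract concrete: a subclass supplies `build()` and `update_step()`, and the base class handles iteration counting, clipping, and variable bookkeeping. A hypothetical minimal subclass (plain gradient descent, assuming the pre-removal import path; not an optimizer that shipped with Keras):

```python
import tensorflow as tf
from keras.optimizers.optimizer_experimental import optimizer


class SimpleSGD(optimizer.Optimizer):
    """Hypothetical minimal optimizer: plain gradient descent."""

    def __init__(self, learning_rate=0.01, name="SimpleSGD", **kwargs):
        super().__init__(name=name, **kwargs)
        self._learning_rate = self._build_learning_rate(learning_rate)

    def build(self, var_list):
        # No slot variables needed; the base class builds the index dict.
        super().build(var_list)
        self._built = True

    def update_step(self, gradient, variable):
        lr = tf.cast(self.learning_rate, variable.dtype)
        variable.assign_sub(lr * gradient)
```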
- - """ - raise NotImplementedError - - @tf.function(jit_compile=True) - def _update_step_xla(self, gradient, variable, key): - """A wrapper of `update_step` to enable XLA acceleration. - - Due to `tf.function` tracing mechanism, for (gradient, variable) pairs of - the same shape and dtype, the execution graph always invoke the first - pair it has seen. Thus, we need a `key` argument to make each - (gradient, variable) pair unique. In additions, XLA cannot understand - string input, so the key is an integer. - - Args: - gradient: backpropagated gradient of the given variable. - variable: variable whose value needs to be updated. - key (int): a unique key that identifies the variable. - - Returns: - An `Operation` that applies the specified gradients. - """ - return self._update_step(gradient, variable) - - def _update_step(self, gradient, variable): - if getattr(variable, "_unique_id", None) is None: - # Variable has no `_unique_id` if called during `model.save()`, in which - # case we do not want to update the variable. - return - if self._var_key(variable) not in self._index_dict: - raise KeyError( - f"The optimizer cannot recognize variable {variable.name}. This " - f"usually means that you're reusing an optimizer previously created " - f"for a different model. Try creating a new optimizer instance.") - self.update_step(gradient, variable) - - def compute_gradients(self, loss, var_list, tape=None): - """Compute gradients of loss on trainable variables. - - Args: - loss: `Tensor` or callable. If a callable, `loss` should take no arguments - and return the value to minimize. - var_list: list or tuple of `Variable` objects to update to minimize - `loss`. - tape: (Optional) `tf.GradientTape`. If `loss` is provided as a `Tensor`, - the tape that computed the `loss` must be provided. - - Returns: - A list of (gradient, variable) pairs. Variable is always present, but - gradient can be `None`. - """ - if not callable(loss) and tape is None: - raise ValueError("`tape` is required when a `Tensor` loss is passed. " - f"Received: loss={loss}, tape={tape}.") - if tape is None: - tape = tf.GradientTape() - if callable(loss): - with tape: - tape.watch(var_list) - loss = loss() - grads = tape.gradient(loss, var_list) - return list(zip(grads, var_list)) - - def _clip_gradients(self, grads): - clipped_grads = [] - if self.clipnorm and self.clipnorm > 0: - for g in grads: - if g is None: - clipped_grads.append(g) - else: - clipped_grads.append(tf.clip_by_norm(g, self.clipnorm)) - return clipped_grads - - if self.global_clipnorm and self.global_clipnorm > 0: - return tf.clip_by_global_norm(grads, self.global_clipnorm)[0] - - if self.clipvalue and self.clipvalue > 0: - for g in grads: - if g is None: - clipped_grads.append(g) - else: - clipped_grads.append( - tf.clip_by_value( - g, - clip_value_min=-self.clipvalue, # pylint: disable=invalid-unary-operand-type - clip_value_max=self.clipvalue)) - return clipped_grads - - return grads - - @property - def iterations(self): - """The number of training steps this `optimizer` has run. - - By default, iterations would be incremented by one every time - `apply_gradients()` is called. - """ - return self._iterations - - @iterations.setter - def iterations(self, variable): - if getattr(self, "_built", False): - raise RuntimeError("Cannot set `iterations` to a new Variable after " - "the Optimizer weights have been created. Here it is " - f"attempting to set `iterations` to {variable}." 
- "Usually this means you are trying to set `iterations`" - " after calling `apply_gradients()`. Please set " - "`iterations` before calling `apply_gradients()`.") - self._iterations = variable - - @property - def learning_rate(self): - if not hasattr(self, "_learning_rate") or self._learning_rate is None: - raise ValueError("Missing learning rate, please set self.learning_rate at" - " optimizer creation time.") - lr = self._learning_rate - if isinstance(lr, learning_rate_schedule.LearningRateSchedule): - # If the optimizer takes in LearningRateSchedule, then each call to - # learning_rate would return `self._current_learning_rate`, which is - # updated at each call to `apply_gradients`. - return self._current_learning_rate - return lr - - @learning_rate.setter - def learning_rate(self, learning_rate): - if isinstance(self._learning_rate, - learning_rate_schedule.LearningRateSchedule): - raise TypeError("This optimizer was created with a `LearningRateSchedule`" - " object as its `learning_rate` constructor argument, " - "hence its learning rate is not settable. If you need the" - " learning rate to be settable, you should instantiate " - "the optimizer with a float `learning_rate` argument.") - self._learning_rate.assign(learning_rate) - - @property - @doc_controls.do_not_generate_docs - def lr(self): - """Alias of `learning_rate()`. - - `lr()` is heavily called in workflows using `optimizer_v2.OptimizerV2`, - so we keep it for backward compabitliy. - """ - return self.learning_rate - - @lr.setter - def lr(self, learning_rate): - self.learning_rate = learning_rate - - def _build_learning_rate(self, learning_rate): - if isinstance(learning_rate, learning_rate_schedule.LearningRateSchedule): - # Create a variable to hold the current learning rate. - self._current_learning_rate = tf.Variable( - learning_rate(self.iterations), - name="learning_rate", - dtype=tf.float32, - trainable=False) - return learning_rate - return tf.Variable( - learning_rate, - name="learning_rate", - dtype=backend.floatx(), - trainable=False) - - @abc.abstractmethod - def build(self, var_list): - """Initialize the optimizer's variables, such as momemtum variables. - - This function has to be implemented by subclass optimizers, and subclass - optimizers need to call `super().build(var_list)`. - - Args: - var_list: List of model variables to build optimizers on. For example, SGD - optimizer with momentum will store one momentum variable corresponding - to each model variable. - """ - if getattr(self, "_built", False): - return - self._build_index_dict(var_list) - if self.use_ema: - self._model_variables_moving_average = [] - for var in var_list: - # Make a copy of the model variables, we will use the copy to store the - # moving average of model variables. - self._model_variables_moving_average.append( - self.add_variable_from_reference(var, "average", initial_value=var)) - - def _build_index_dict(self, var_list): - """Build variable to index dictionary. - - Build a dictionary that maps variable to the index of it in the given - var_list. - - Args: - var_list: List of variables to build index dict on. - - Returns: - None - """ - self._index_dict = {} - for i, var in enumerate(var_list): - var_key = self._var_key(var) - self._index_dict[var_key] = i - - def add_variable(self, shape, dtype=None, initializer="zeros", name=None): - """Create an optimizer variable. - - Args: - shape: A list of integers, a tuple of integers, or a 1-D Tensor of type - int32. Defaults to scalar if unspecified. 
- dtype: The DType of the optimizer variable to be created. Defaults to - `tf.keras.backend.floatx` if unspecified. - initializer: string or callable. Initializer instance. - name: The name of the optimizer variable to be created. - - Returns: - An optimizer variable, in the format of tf.Variable. - - """ - if isinstance(initializer, str): - initializer = initializers.get(initializer) - if dtype is None: - dtype = backend.floatx() - if shape is None: - shape = [] - return tf.Variable( - initial_value=initializer(shape, dtype), name=name, trainable=False) - - def add_variable_from_reference(self, - model_variable, - variable_name, - shape=None, - initial_value=None): - """Create an optimizer variable from a model variable. - - Create an optimizer variable based on the information of a model variable. - For example, in the SGD optimizer with momentum, for each model variable, a - corresponding momentum variable is created of the same shape and dtype. - - Args: - model_variable: tf.Variable. The corresponding model variable to the - optimizer variable to be created. - variable_name: String. The name prefix of the optimizer variable to be - created. The created variable's name will follow the pattern - `{variable_name}/{model_variable.name}`, e.g., `momentum/dense_1`. - shape: List or Tuple, defaults to None. The shape of the optimizer - variable to be created. If None, the created variable will have the - same shape as `model_variable`. - initial_value: A Tensor, or Python object convertible to a Tensor, - defaults to None. The initial value of the optimizer variable; if None, - the initial value defaults to 0. - - Returns: - An optimizer variable. - """ - if initial_value is None: - if shape is None: - initial_value = tf.zeros( - shape=model_variable.shape, dtype=model_variable.dtype) - else: - initial_value = tf.zeros(shape, dtype=model_variable.dtype) - return tf.Variable( - initial_value=initial_value, - name=f"{variable_name}/{model_variable._shared_name}", # pylint: disable=protected-access - dtype=model_variable.dtype, - trainable=False) - - def minimize(self, loss, var_list, tape=None): - """Minimize `loss` by updating `var_list`. - - This method simply computes gradients using `tf.GradientTape` and calls - `apply_gradients()`. If you want to process the gradients before applying - them, call `tf.GradientTape` and `apply_gradients()` explicitly instead - of using this function. - - Args: - loss: `Tensor` or callable. If a callable, `loss` should take no arguments - and return the value to minimize. - var_list: list or tuple of `Variable` objects to update to minimize - `loss`. - tape: (Optional) `tf.GradientTape`. - - Returns: - None - """ - grads_and_vars = self.compute_gradients(loss, var_list, tape) - self.apply_gradients(grads_and_vars) - - def apply_gradients(self, grads_and_vars): - """Apply gradients to variables. - - Args: - grads_and_vars: List of (gradient, variable) pairs. - - Returns: - None - - Raises: - TypeError: If `grads_and_vars` is malformed. - """ - if isinstance(self._learning_rate, - learning_rate_schedule.LearningRateSchedule): - # Compute the current learning rate at the beginning of variable update. - self._current_learning_rate.assign(self._learning_rate(self.iterations)) - grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars) - grads, trainable_variables = zip(*grads_and_vars) - scope_name = self._name or "optimizer" - with tf.name_scope(scope_name): - with tf.init_scope(): - # Lift variable creation to init scope to avoid environment issues.
-        self.build(trainable_variables)
-      grads = self._clip_gradients(grads)
-      grads_and_vars = list(zip(grads, trainable_variables))
-      self._internal_apply_gradients(grads_and_vars)
-
-  def _internal_apply_gradients(self, grads_and_vars):
-    """Helper function for applying gradients.
-
-    This is required for separating out distributed training logic.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-    """
-    if self.jit_compile:
-      for grad, var in grads_and_vars:
-        self._update_step_xla(grad, var, id(self._var_key(var)))
-    else:
-      for grad, var in grads_and_vars:
-        self._update_step(grad, var)
-
-    self.iterations.assign_add(1)
-
-  def _update_model_variables_moving_average(self, var_list):
-    """Update the stored moving average using the latest value."""
-    if self.use_ema:
-      for (var, average) in zip(var_list, self._model_variables_moving_average):
-        average.assign(self.ema_momentum * average +
-                       (1 - self.ema_momentum) * var)
-
-  def _overwrite_model_variables_with_average_value(self, var_list):
-    """Overwrite model variables with their moving averages."""
-    if len(var_list) != len(self._model_variables_moving_average):
-      raise ValueError(f"The length of model variables ({len(var_list)}) to "
-                       f"override does not match the length of model variables "
-                       f"stored in the optimizer "
-                       f"({len(self._model_variables_moving_average)}). Please "
-                       f"check if the optimizer was called on your model.")
-    self._overwrite_model_variables_with_average_value_helper(var_list)
-
-  def _overwrite_model_variables_with_average_value_helper(self, var_list):
-    """Helper function that overwrites model variables."""
-    for var, average_var in zip(var_list, self._model_variables_moving_average):
-      var.assign(average_var)
-
-  def finalize_variable_values(self, var_list):
-    """Set the final value of the model's trainable variables.
-
-    Sometimes there are some extra steps before ending the variable updates,
-    such as overwriting the model variables with their average values.
-
-    Args:
-      var_list: list of model variables.
-    """
-    if self.use_ema:
-      # If the optimizer uses EMA, then when finalizing, we replace the model
-      # variable value with its moving average stored inside the optimizer.
-      self._overwrite_model_variables_with_average_value(var_list)
-
-  def _serialize_hyperparameter(self, hyperparameter):
-    """Serialize a hyperparameter that can be a numeric or callable."""
-    if isinstance(hyperparameter, learning_rate_schedule.LearningRateSchedule):
-      return learning_rate_schedule.serialize(hyperparameter)
-    if isinstance(hyperparameter, tf.Variable):
-      return hyperparameter.numpy()
-    if callable(hyperparameter):
-      return hyperparameter()
-    return hyperparameter
-
-  def get_config(self):
-    """Returns the config of the optimizer.
-
-    An optimizer config is a Python dictionary (serializable)
-    containing the configuration of an optimizer.
-    The same optimizer can be reinstantiated later
-    (without any saved state) from this configuration.
-
-    Subclass optimizers should override this method to include other
-    hyperparameters.
-
-    Returns:
-      Python dictionary.
-    """
-    config = {
-        "clipnorm": self.clipnorm,
-        "global_clipnorm": self.global_clipnorm,
-        "clipvalue": self.clipvalue,
-        "use_ema": self.use_ema,
-        "ema_momentum": self.ema_momentum,
-        "ema_overwrite_frequency": self.ema_overwrite_frequency,
-        "jit_compile": self.jit_compile,
-    }
-    return config
-
-  @classmethod
-  def from_config(cls, config):
-    """Creates an optimizer from its config.
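As a hedged sketch of the round trip these config methods provide (hyperparameters are reinstantiated, slot state is not; the values are illustrative):

```python
import tensorflow as tf

opt = tf.keras.optimizers.experimental.Adam(learning_rate=0.002, clipnorm=0.5)
config = opt.get_config()  # plain, serializable dict
restored = tf.keras.optimizers.experimental.Adam.from_config(config)
print(restored.clipnorm)   # 0.5: hyperparameters survive, slot variables do not
```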
- - This method is the reverse of `get_config`, capable of instantiating the - same optimizer from the config dictionary. - - Args: - config: A Python dictionary, typically the output of get_config. - - Returns: - An optimizer instance. - """ - if "learning_rate" in config: - if isinstance(config["learning_rate"], dict): - config["learning_rate"] = learning_rate_schedule.deserialize( - config["learning_rate"]) - return cls(**config) - - -base_optimizer_keyword_args = """name: String. The name to use - for momentum accumulator weights created by - the optimizer. - clipnorm: Float. If set, the gradient of each weight is individually - clipped so that its norm is no higher than this value. - clipvalue: Float. If set, the gradient of each weight is clipped to be no - higher than this value. - global_clipnorm: Float. If set, the gradient of all weights is clipped so - that their global norm is no higher than this value. - use_ema: Boolean, defaults to False. If True, exponential moving average - (EMA) is applied. EMA consists of computing an exponential moving - average of the weights of the model (as the weight values change after - each training batch), and periodically overwriting the weights with - their moving average. - ema_momentum: Float, defaults to 0.99. Only used if `use_ema=True`. This is - the momentum to use when computing the EMA of the model's weights: - `new_average = ema_momentum * old_average + (1 - ema_momentum) * - current_variable_value`. - ema_overwrite_frequency: Int or None, defaults to None. Only used if - `use_ema=True`. Every `ema_overwrite_frequency` steps of iterations, we - overwrite the model variable by its moving average. If None, the optimizer - does not overwrite model variables in the middle of training, and you - need to explicitly overwrite the variables at the end of training - by calling `optimizer.finalize_variable_values()` (which updates the model - variables in-place). When using the built-in `fit()` training loop, this - happens automatically after the last epoch, and you don't need to do - anything. - jit_compile: Boolean, defaults to True. If True, the optimizer will use XLA - compilation. If no GPU device is found, this flag will be ignored. - **kwargs: keyword arguments only used for backward compatibility.""" - - -# pylint: disable=g-classes-have-attributes -@keras_export("keras.optimizers.experimental.Optimizer", v1=[]) -class Optimizer(_BaseOptimizer): - """Abstract optimizer base class. - - This class supports distributed training. If you want to implement your own - optimizer, please subclass this class instead of _BaseOptimizer. - - Args: - {{base_optimizer_keyword_args}} - - ### Usage - - ```python - # Create an optimizer with the desired parameters. - opt = tf.keras.optimizers.experimental.SGD(learning_rate=0.1) - var1, var2 = tf.Variable(1.0), tf.Variable(2.0) - # `loss` is a callable that takes no argument and returns the value - # to minimize. - loss = lambda: 3 * var1 * var1 + 2 * var2 * var2 - # Call minimize to update the list of variables. - opt.minimize(loss, var_list=[var1, var2]) - ``` - - ### Processing gradients before applying them - - Calling `minimize()` takes care of both computing the gradients and - applying them to the variables. If you want to process the gradients - before applying them you can instead use the optimizer in three steps: - - 1. Compute the gradients with `tf.GradientTape`. - 2. Process the gradients as you wish. - 3. Apply the processed gradients with `apply_gradients()`. 
-
-  Example:
-
-  ```python
-  # Create an optimizer.
-  opt = tf.keras.optimizers.experimental.SGD(learning_rate=0.1)
-  var1, var2 = tf.Variable(1.0), tf.Variable(2.0)
-
-  # Compute the gradients for a list of variables.
-  with tf.GradientTape() as tape:
-    loss = 3 * var1 * var1 + 2 * var2 * var2
-  grads = tape.gradient(loss, [var1, var2])
-
-  # Process the gradients.
-  grads[0] = grads[0] + 1
-
-  # Ask the optimizer to apply the gradients on variables.
-  opt.apply_gradients(zip(grads, [var1, var2]))
-  ```
-
-  ### Dynamic learning rate
-
-  A dynamic learning rate can be achieved by setting the learning rate to a
-  built-in or customized `tf.keras.optimizers.schedules.LearningRateSchedule`.
-
-  Example:
-
-  >>> var = tf.Variable(np.random.random(size=(1,)))
-  >>> learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
-  ...   initial_learning_rate=.01, decay_steps=20, decay_rate=.1)
-  >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=learning_rate)
-  >>> loss = lambda: 3 * var
-  >>> opt.minimize(loss, var_list=[var])
-
-  ### Gradient clipping
-
-  Users can clip the gradients before applying them to variables by setting
-  `clipnorm`, `clipvalue` and `global_clipnorm`. Note that at most one of
-  `clipnorm` and `global_clipnorm` can be set.
-
-  Example:
-
-  >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=1, clipvalue=1)
-  >>> var1, var2 = tf.Variable(2.0), tf.Variable(2.0)
-  >>> with tf.GradientTape() as tape:
-  ...   loss = 2 * var1 + 2 * var2
-  >>> grads = tape.gradient(loss, [var1, var2])
-  >>> print([grads[0].numpy(), grads[1].numpy()])
-  [2.0, 2.0]
-  >>> opt.apply_gradients(zip(grads, [var1, var2]))
-  >>> # Without clipping, we should get [0, 0], but as gradients are clipped to
-  >>> # have max value 1, we get [1.0, 1.0].
-  >>> print([var1.numpy(), var2.numpy()])
-  [1.0, 1.0]
-
-  ### Using exponential moving average
-
-  Empirically it has been found that using the exponential moving average (EMA)
-  of the trained parameters of a deep network achieves better performance than
-  using the trained parameters directly. Keras optimizers allow users to
-  compute this moving average and overwrite the model variables at the desired
-  time.
-
-  Example:
-
-  ```python
-  # Create an SGD optimizer with EMA on. `ema_momentum` controls the decay rate
-  # of the moving average. `ema_momentum=1` means no decay and the stored moving
-  # average is always the model variable's initial value before training.
-  # Conversely, `ema_momentum=0` is equivalent to not using EMA.
-  # `ema_overwrite_frequency=3` means every 3 iterations, we overwrite the
-  # trainable variables with their moving average values.
-  opt = tf.keras.optimizers.experimental.SGD(
-      learning_rate=1,
-      use_ema=True,
-      ema_momentum=0.5,
-      ema_overwrite_frequency=3)
-  var1, var2 = tf.Variable(2.0), tf.Variable(2.0)
-  with tf.GradientTape() as tape:
-    loss = var1 + var2
-  grads = tape.gradient(loss, [var1, var2])
-  # First iteration: [var1, var2] = [1.0, 1.0]
-  opt.apply_gradients(zip(grads, [var1, var2]))
-  print([var1, var2])
-
-  # Second iteration: [var1, var2] = [0.0, 0.0]
-  opt.apply_gradients(zip(grads, [var1, var2]))
-  print([var1, var2])
-
-  # Third iteration, without EMA, we should see [var1, var2] = [-1.0, -1.0],
-  # but overwriting results in [var1, var2] = [-0.125, -0.125]. The full
-  # calculation for the moving average of var1 is:
-  # var1 = 2*0.5**3 + 1*(1-0.5)*0.5**2 + 0*(1-0.5)*0.5**1 + (-1)*(1-0.5) = -0.125.
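-  # Step by step: the average starts at var1's initial value 2.0; after
-  # iteration 1 it is 0.5*2.0 + 0.5*1.0 = 1.5; after iteration 2 it is
-  # 0.5*1.5 + 0.5*0.0 = 0.75; after iteration 3 it is
-  # 0.5*0.75 + 0.5*(-1.0) = -0.125, which then overwrites var1.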
-  opt.apply_gradients(zip(grads, [var1, var2]))
-  print([var1, var2])
-
-  ```
-  When the optimizer is constructed with `use_ema=True`, in a custom training
-  loop users can explicitly call `finalize_variable_values()` to overwrite
-  trainable variables with their EMA values. `finalize_variable_values()` is
-  by default called at the end of `model.fit()`.
-
-  ### Use with `tf.distribute.Strategy`
-
-  This optimizer class is `tf.distribute.Strategy` aware, which means it
-  automatically sums gradients across all replicas. To aggregate gradients
-  yourself, call `apply_gradients` with `skip_gradients_aggregation` set to
-  True. This is useful if you need to process aggregated gradients.
-
-  ```python
-  # This example is not runnable; it is dummy code for a simple tutorial.
-  strategy = tf.distribute.experimental.TPUStrategy()
-
-  with strategy.scope():
-    opt = tf.keras.optimizers.experimental.SGD()
-    model = magic_function_that_returns_model()
-    gradients = magic_function_that_returns_gradients()
-    # Custom logic to aggregate gradients.
-    gradients = strategy.reduce("SUM", gradients, axis=None)
-    opt.apply_gradients(zip(gradients, model.trainable_variables),
-                        skip_gradients_aggregation=True)
-  ```
-
-  ### Creating a custom optimizer
-
-  If you intend to create your own optimization algorithm, please inherit from
-  this class and override the following methods:
-
-    - `build`: Create your optimizer-related variables, such as `momentums` in
-      the SGD optimizer.
-    - `update_step`: Implement your optimizer's updating logic.
-    - `get_config`: serialization of the optimizer, including all
-      hyperparameters.
-
-  Your optimizer will automatically be compatible with TensorFlow distributed
-  training if you subclass `optimizer_experimental.Optimizer`.
-
-  """
-
-  def __init__(self,
-               name,
-               clipnorm=None,
-               clipvalue=None,
-               global_clipnorm=None,
-               use_ema=False,
-               ema_momentum=0.99,
-               ema_overwrite_frequency=None,
-               jit_compile=True,
-               **kwargs):
-    """Create a new Optimizer."""
-
-    super().__init__(name, clipnorm, clipvalue, global_clipnorm, use_ema,
-                     ema_momentum, ema_overwrite_frequency, jit_compile,
-                     **kwargs)
-    self._distribution_strategy = tf.distribute.get_strategy()
-
-  def add_variable_from_reference(self,
-                                  model_variable,
-                                  variable_name,
-                                  shape=None,
-                                  initial_value=None):
-    strategy = tf.distribute.get_strategy()
-    with strategy.extended.colocate_vars_with(model_variable):
-      return super().add_variable_from_reference(model_variable, variable_name,
-                                                 shape, initial_value)
-
-  def _var_key(self, variable):
-    """Get a unique identifier of the given variable."""
-    # pylint: disable=protected-access
-    # Get the distributed variable if it exists.
-    # TODO(b/197554203): replace _distributed_container() with a public api.
-    if hasattr(variable, "_distributed_container"):
-      variable = variable._distributed_container()
-    return super()._var_key(variable)
-
-  def aggregate_gradients(self, grads_and_vars):
-    """Aggregate gradients on all devices.
-
-    By default, we perform a reduce-sum of the gradients across devices. Users
-    can implement their own aggregation logic by overriding this method.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-
-    Returns:
-      List of (gradient, variable) pairs.
-    """
-    return optimizer_utils.all_reduce_sum_gradients(grads_and_vars)
-
-  def apply_gradients(self, grads_and_vars, skip_gradients_aggregation=False):
-    """Apply gradients to variables.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-      skip_gradients_aggregation: If True, gradient aggregation will not be
-        performed inside the optimizer. Usually this arg is set to True when
-        you write custom code that aggregates gradients outside the optimizer.
-
-    Returns:
-      None
-
-    Raises:
-      TypeError: If `grads_and_vars` is malformed.
-      RuntimeError: If called in a cross-replica context.
-    """
-    if not skip_gradients_aggregation:
-      grads_and_vars = self.aggregate_gradients(grads_and_vars)
-    super().apply_gradients(grads_and_vars)
-
-  def _internal_apply_gradients(self, grads_and_vars):
-    tf.__internal__.distribute.interim.maybe_merge_call(
-        self._distributed_apply_gradients_fn, self._distribution_strategy,
-        grads_and_vars)
-
-  def _overwrite_model_variables_with_average_value_helper(self, var_list):
-    """Helper function to _overwrite_model_variables_with_average_value.
-
-    This function overwrites variables on each device.
-
-    Args:
-      var_list: list of model variables.
-    """
-    strategy = self._distribution_strategy
-    # Overwrite model variables with the stored average values on all devices.
-    for var, average_var in zip(var_list, self._model_variables_moving_average):
-      strategy.extended.update(
-          var, lambda a, b: a.assign(b), args=(average_var,))
-
-  def _update_model_variables_moving_average(self, var_list):
-    """Update the stored moving average using the latest value."""
-    if self.use_ema:
-
-      def update_average(average, var):
-        average.assign(self.ema_momentum * average +
-                       (1 - self.ema_momentum) * var)
-
-      for (var, average) in zip(var_list, self._model_variables_moving_average):
-        self._distribution_strategy.extended.update(
-            average, update_average, args=(var,), group=False)
-
-  def _distributed_apply_gradients_fn(self, distribution, grads_and_vars,
-                                      **kwargs):
-    """`apply_gradients` using a `DistributionStrategy`."""
-
-    def apply_grad_to_update_var(var, grad):
-      if self.jit_compile:
-        return self._update_step_xla(grad, var, id(self._var_key(var)))
-      else:
-        return self._update_step(grad, var)
-
-    for grad, var in grads_and_vars:
-      distribution.extended.update(
-          var, apply_grad_to_update_var, args=(grad,), group=False)
-    self.iterations.assign_add(1)
-
-    if self.use_ema:
-      _, var_list = zip(*grads_and_vars)
-      self._update_model_variables_moving_average(var_list)
-      if self.ema_overwrite_frequency:
-        # We overwrite the model variables only when `ema_overwrite_frequency`
-        # is not None.
-        should_overwrite_model_vars = (
-            self.iterations % self.ema_overwrite_frequency == 0)
-        tf.cond(
-            tf.cast(should_overwrite_model_vars, tf.bool),
-            true_fn=lambda: self._overwrite_model_variables_with_average_value(  # pylint: disable=g-long-lambda
-                var_list),
-            false_fn=lambda: None)
-
-
-class RestoredOptimizer(Optimizer):
-
-  def __init__(self):
-    super().__init__("RestoredOptimizer")
-
-  def get_config(self):
-    raise NotImplementedError(
-        "Restoring functional Optimizers from SavedModels is not currently "
-        "supported. Please file a feature request if this limitation bothers "
-        "you.")
-
-
-# Register the optimizer for loading from saved_model purposes.
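A minimal single-replica sketch of the `skip_gradients_aggregation` flow documented above; the aggregation comment marks where custom cross-replica logic would go:

```python
import tensorflow as tf

opt = tf.keras.optimizers.experimental.SGD(learning_rate=0.1)
var = tf.Variable(1.0)
with tf.GradientTape() as tape:
    loss = var * var
grads = tape.gradient(loss, [var])
# Custom aggregation would happen here (e.g., a strategy.reduce per gradient).
opt.apply_gradients(zip(grads, [var]), skip_gradients_aggregation=True)
print(var.numpy())  # 0.8 = 1.0 - 0.1 * 2.0
```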
-tf.__internal__.saved_model.load.register_revived_type( - "experimentalOptimizer", - lambda obj: isinstance(obj, Optimizer), - versions=[ - tf.__internal__.saved_model.load.VersionedTypeRegistration( - object_factory=lambda proto: RestoredOptimizer(), - version=2, - min_producer_version=1, - min_consumer_version=1) - ]) - -Optimizer.__doc__ = Optimizer.__doc__.replace( - "{{base_optimizer_keyword_args}}", base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/optimizer_pss_test.py b/keras/optimizers/optimizer_experimental/optimizer_pss_test.py deleted file mode 100644 index 8cc1ba33f1ac..000000000000 --- a/keras/optimizers/optimizer_experimental/optimizer_pss_test.py +++ /dev/null @@ -1,148 +0,0 @@ -"""Tests for calling optimizer on ParameterServerStrategy.""" - -from absl.testing import parameterized -import keras -from keras.optimizers.optimizer_experimental import adadelta -from keras.optimizers.optimizer_experimental import adagrad -from keras.optimizers.optimizer_experimental import adam -from keras.optimizers.optimizer_experimental import adamax -from keras.optimizers.optimizer_experimental import adamw -from keras.optimizers.optimizer_experimental import ftrl -from keras.optimizers.optimizer_experimental import nadam -from keras.optimizers.optimizer_experimental import rmsprop -from keras.optimizers.optimizer_experimental import sgd -from keras.utils import dataset_creator -from keras.utils import losses_utils -import tensorflow.compat.v2 as tf - -ds_combinations = tf.__internal__.distribute.combinations - -STRATEGIES = [ - ds_combinations.parameter_server_strategy_3worker_2ps_cpu, - ds_combinations.parameter_server_strategy_3worker_2ps_1gpu, -] - -adadelta_fn = tf.__internal__.test.combinations.NamedObject( - "adadelta", - lambda: adadelta.Adadelta( # pylint: disable=g-long-lambda - 0.002, - use_ema=True, - ema_overwrite_frequency=None)) -adagrad_fn = tf.__internal__.test.combinations.NamedObject( - "adagrad", lambda: adagrad.Adagrad(0.002)) -adam_fn = tf.__internal__.test.combinations.NamedObject( - "adam", lambda: adam.Adam(0.002)) -adamax_fn = tf.__internal__.test.combinations.NamedObject( - "adamax", lambda: adamax.Adamax(0.002)) -adamw_fn = tf.__internal__.test.combinations.NamedObject( - "adamw", lambda: adamw.AdamW(0.002, weight_decay=0.004)) -ftrl_fn = tf.__internal__.test.combinations.NamedObject( - "ftrl", lambda: ftrl.Ftrl(0.002)) -nadam_fn = tf.__internal__.test.combinations.NamedObject( - "experimentnadam", lambda: nadam.Nadam(0.002)) -rmsprop_fn = tf.__internal__.test.combinations.NamedObject( - "rmsprop", lambda: rmsprop.RMSprop(0.002)) -sgd_fn = tf.__internal__.test.combinations.NamedObject( - "sgdaverage", - lambda: sgd.SGD( # pylint: disable=g-long-lambda - 0.002, - use_ema=True, - ema_overwrite_frequency=1)) - -OPTIMIZER_FN = [ - adadelta_fn, - adagrad_fn, - adam_fn, - adamax_fn, - adamw_fn, - ftrl_fn, - nadam_fn, - rmsprop_fn, - sgd_fn, -] - - -# TODO(b/228209527): Combine this test with optimizer_test after -# fixing the NCCL issue. 
-class OptimizerPssTest(tf.test.TestCase, parameterized.TestCase): - - def _get_model(self): - return keras.Sequential( - [keras.layers.Input(shape=(1,)), - keras.layers.Dense(1)]) - - def _get_dataset_fn(self): - - def dataset_fn(_): - x, y = [1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0] - ds = tf.data.Dataset.from_tensor_slices((x, y)) - ds = ds.repeat().batch(6) - return ds - - return dataset_fn - - def _verify_accumulators_updated(self, optimizer): - variables = optimizer.variables - for var in variables: - if "iteration" not in var.name and "learning_rate" not in var.name: - # Find a variable not iteration or learning_rate, and verify its value - # is updated (not 0). - self.assertNotAllEqual(var, 0) - - @ds_combinations.generate( - tf.__internal__.test.combinations.combine( - strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN)) - def testGetGradientsInModelPss(self, strategy, optimizer_fn): - with strategy.scope(): - model = self._get_model() - optimizer = optimizer_fn() - ds_fn = self._get_dataset_fn() - if isinstance(strategy, tf.distribute.ParameterServerStrategy): - ds = dataset_creator.DatasetCreator(ds_fn) - else: - ds = ds_fn(None) - model.compile(loss="mse", optimizer=optimizer) - model.fit(ds, epochs=1, steps_per_epoch=5) - - self._verify_accumulators_updated(optimizer) - - @ds_combinations.generate( - tf.__internal__.test.combinations.combine( - strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN)) - def testGetGradientsInCustomTrainingLoopPss(self, strategy, optimizer_fn): - coordinator = ( - tf.distribute.experimental.coordinator.ClusterCoordinator(strategy)) - - with strategy.scope(): - model = self._get_model() - optimizer = optimizer_fn() - - def per_worker_dataset_fn(): - return strategy.distribute_datasets_from_function( - self._get_dataset_fn()) - - ds = coordinator.create_per_worker_dataset(per_worker_dataset_fn) - - @tf.function - def train_step(iterator): - - def replica_fn(data): - features, labels = data - with tf.GradientTape() as tape: - output = model(tf.expand_dims(features, axis=1)) - loss = keras.losses.MeanSquaredError( - reduction=losses_utils.ReductionV2.NONE)(labels, output) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(grads, model.trainable_variables)) - - strategy.run(replica_fn, args=(next(iterator),)) - - for _ in range(3): - coordinator.schedule(train_step, args=(iter(ds),)) - coordinator.join() - self.assertEqual(self.evaluate(optimizer.iterations), 3) - self._verify_accumulators_updated(optimizer) - - -if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/optimizers/optimizer_experimental/optimizer_test.py b/keras/optimizers/optimizer_experimental/optimizer_test.py deleted file mode 100644 index d1998205bcfa..000000000000 --- a/keras/optimizers/optimizer_experimental/optimizer_test.py +++ /dev/null @@ -1,525 +0,0 @@ -"""Tests for the reworked optimizer. 
- -More context in go/new-keras-optimizer -""" - -import os -import re - -from absl import logging -from absl.testing import parameterized -import keras -from keras.optimizers.optimizer_experimental import adadelta as adadelta_new -from keras.optimizers.optimizer_experimental import adagrad as adagrad_new -from keras.optimizers.optimizer_experimental import adam as adam_new -from keras.optimizers.optimizer_experimental import adamax as adamax_new -from keras.optimizers.optimizer_experimental import adamw as adamw_new -from keras.optimizers.optimizer_experimental import ftrl as ftrl_new -from keras.optimizers.optimizer_experimental import nadam as nadam_new -from keras.optimizers.optimizer_experimental import rmsprop as rmsprop_new -from keras.optimizers.optimizer_experimental import sgd as sgd_new -from keras.optimizers.optimizer_v2 import adadelta as adadelta_old -from keras.optimizers.optimizer_v2 import adagrad as adagrad_old -from keras.optimizers.optimizer_v2 import adam as adam_old -from keras.optimizers.optimizer_v2 import ftrl as ftrl_old -from keras.optimizers.optimizer_v2 import gradient_descent as sgd_old -from keras.optimizers.optimizer_v2 import rmsprop as rmsprop_old -from keras.optimizers.schedules import learning_rate_schedule -from keras.utils import losses_utils -import numpy as np -import tensorflow.compat.v2 as tf - -ds_combinations = tf.__internal__.distribute.combinations - -STRATEGIES = [ - # TODO(b/202992598): Add PSS strategy once the XLA issues is resolved. - ds_combinations.one_device_strategy, - ds_combinations.mirrored_strategy_with_cpu_1_and_2, - ds_combinations.mirrored_strategy_with_two_gpus, - ds_combinations.tpu_strategy, - ds_combinations.cloud_tpu_strategy, - ds_combinations.multi_worker_mirrored_2x1_cpu, - ds_combinations.multi_worker_mirrored_2x2_gpu, - ds_combinations.central_storage_strategy_with_two_gpus, -] - -adadelta_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentaladadelta", - lambda: adadelta_new.Adadelta( # pylint: disable=g-long-lambda - 0.002, - use_ema=True, - ema_overwrite_frequency=None)) -adagrad_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentaladagrad", lambda: adagrad_new.Adagrad(0.002)) -adam_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentaladam", lambda: adam_new.Adam(0.002)) -adamax_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentaladamax", lambda: adamax_new.Adamax(0.002)) -adamw_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentaladamw", lambda: adamw_new.AdamW(0.002, weight_decay=0.004)) -ftrl_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentalftrl", lambda: ftrl_new.Ftrl(0.002)) -nadam_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentnadam", lambda: nadam_new.Nadam(0.002)) -rmsprop_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentalrmsprop", lambda: rmsprop_new.RMSprop(0.002)) -sgd_new_fn = tf.__internal__.test.combinations.NamedObject( - "experimentalsgdaverage", - lambda: sgd_new.SGD( # pylint: disable=g-long-lambda - 0.002, - use_ema=True, - ema_overwrite_frequency=1)) - -OPTIMIZER_FN = [ - adadelta_new_fn, - adagrad_new_fn, - adam_new_fn, - adamax_new_fn, - adamw_new_fn, - ftrl_new_fn, - nadam_new_fn, - rmsprop_new_fn, - sgd_new_fn, -] - - -class OptimizerFuntionalityTest(tf.test.TestCase, parameterized.TestCase): - """Test the functionality of optimizer.""" - - def testAddVariableFromReference(self): - optimizer = adam_new.Adam() - variable = 
optimizer.add_variable_from_reference( - tf.Variable(1.0, name="tmp"), "test") - self.assertEqual(variable._shared_name, "test/tmp") - self.assertEqual(self.evaluate(variable), 0) - - def testAddVarialeWithCustomShape(self): - optimizer = adam_new.Adam() - variable = optimizer.add_variable_from_reference( - tf.Variable([1.0, 2.0], name="tmp"), "test", shape=[]) - self.assertEqual(variable, tf.Variable(0.)) - - def testBuildIndexDict(self): - optimizer = adam_new.Adam() - var_list = [tf.Variable(0, name=f"var{i}") for i in range(10)] - optimizer._build_index_dict(var_list) - self.assertEqual(optimizer._index_dict[optimizer._var_key(var_list[7])], 7) - - def testClipNorm(self): - optimizer = adam_new.Adam(clipnorm=1) - grad = [tf.convert_to_tensor([100.0, 100.0])] - clipped_grad = optimizer._clip_gradients(grad) - self.assertAllClose(clipped_grad[0], [2**0.5 / 2, 2**0.5 / 2]) - - def testClipValue(self): - optimizer = adam_new.Adam(clipvalue=1) - grad = [tf.convert_to_tensor([100.0, 100.0])] - clipped_grad = optimizer._clip_gradients(grad) - self.assertAllEqual(clipped_grad[0], [1.0, 1.0]) - - def testWeightDecay(self): - grads, var1, var2, var3 = tf.zeros( - ()), tf.Variable(2.0), tf.Variable(2.0), tf.Variable(2.0) - optimizer_1 = adamw_new.AdamW(learning_rate=0.001, weight_decay=0.004) - optimizer_1.apply_gradients(zip([grads], [var1])) - - optimizer_2 = adamw_new.AdamW(learning_rate=0.001, weight_decay=0.004) - optimizer_2.exclude_from_weight_decay([var2]) - optimizer_2.apply_gradients(zip([grads], [var2])) - - optimizer_3 = adamw_new.AdamW(learning_rate=0.001, weight_decay=0.004) - optimizer_3.build([var3], exclude_from_weight_decay=[var3]) - optimizer_3.apply_gradients(zip([grads], [var3])) - - self.assertEqual(var1, 1.992) - self.assertEqual(var2, 2.0) - self.assertEqual(var3, 2.0) - - def testClipGlobalNorm(self): - optimizer = adam_new.Adam(global_clipnorm=1) - grad = [ - tf.cast([100.0, 100.0], dtype=tf.float32), - tf.cast([100.0, 100.0], dtype=tf.float32) - ] - clipped_grad = optimizer._clip_gradients(grad) - self.assertAllClose(clipped_grad[0], [0.5, 0.5]) - - def testPassingLegacyArgsRaiseWarning(self): - with self.assertLogs(level="WARNING") as log_output: - logging.set_verbosity(logging.WARNING) - _ = adam_new.Adam(clipnorm=1, decay=0.5) - expected_log = "decay is deprecated in" - output = log_output[0][0].message - - self.assertTrue(re.search(expected_log, output)) - - def testPassingLegacyClipnorm(self): - optimizer = adam_new.Adam(clipnorm=1) - self.assertEqual(optimizer.clipnorm, 1) - - def testReturnAllOptimizerVariables(self): - x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32) - optimizer = adam_new.Adam() - grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]]) - optimizer.apply_gradients(zip([grads], [x])) - optimizer_variables = optimizer.variables - all_names = [var._shared_name for var in optimizer_variables] - self.assertLen(optimizer_variables, 4) - self.assertCountEqual( - all_names, - ["iteration", "learning_rate", "Adam/m/Variable", "Adam/v/Variable"]) - - def testSetLearningRate(self): - optimizer = adam_new.Adam(learning_rate=1.0) - self.assertIsInstance(optimizer._learning_rate, tf.Variable) - self.assertEqual(self.evaluate(optimizer.learning_rate), 1.0) - optimizer.learning_rate = 2.0 - self.assertEqual(self.evaluate(optimizer.learning_rate), 2.0) - # Test the legacy setter. 
- optimizer.lr = 3.0 - self.assertEqual(self.evaluate(optimizer.learning_rate), 3.0) - - lr_schedule = learning_rate_schedule.ExponentialDecay( - initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9) - optimizer = adam_new.Adam(learning_rate=lr_schedule) - self.assertIsInstance(optimizer._learning_rate, - learning_rate_schedule.ExponentialDecay) - self.assertEqual(optimizer.learning_rate, 0.01) - # Test the legacy property. - self.assertEqual(optimizer.lr, 0.01) - - x = tf.Variable([1.0, 2.0], dtype=tf.float32) - grads = tf.convert_to_tensor([1.0, 2.0]) - for _ in range(2): - optimizer.apply_gradients(zip([grads], [x])) - self.assertTrue(optimizer.learning_rate < 0.01 and - optimizer.learning_rate > 0.00999) - with self.assertRaisesRegex(TypeError, "This optimizer was created with*"): - optimizer.learning_rate = 2.0 - - def testSetIterations(self): - optimizer = adam_new.Adam(jit_compile=False) - optimizer.iterations = tf.Variable(2, dtype=tf.int32) - self.assertEqual(optimizer.iterations, 2) - var_list = [tf.Variable(2.0), tf.Variable(2.0)] - grads = tf.convert_to_tensor([1.0, 1.0]) - optimizer.apply_gradients(zip(grads, var_list)) - self.assertEqual(optimizer.iterations, 3) - with self.assertRaisesRegex(RuntimeError, "Cannot set*"): - optimizer.iterations = 2 - - def testPassingMissingWDError(self): - with self.assertRaises(ValueError): - _ = adamw_new.AdamW(0.01, weight_decay=None) - - with self.assertRaisesRegex(ValueError, "Missing value of"): - _ = adamw_new.AdamW(0.01, weight_decay=None) - - def testMovingAverageOptimizer(self): - optimizer = sgd_new.SGD( - learning_rate=1, - use_ema=True, - ema_momentum=0.5, - ema_overwrite_frequency=3) - - var1, var2 = tf.Variable(2.0), tf.Variable(2.0) - with tf.GradientTape() as tape: - loss = var1 + var2 - grads = tape.gradient(loss, [var1, var2]) - # First iteration: [var1, var2] = [1.0, 1.0] - optimizer.apply_gradients(zip(grads, [var1, var2])) - self.assertAllEqual([var1.numpy(), var2.numpy()], [1.0, 1.0]) - - # Second iteration: [var1, var2] = [0.0, 0.0] - optimizer.apply_gradients(zip(grads, [var1, var2])) - self.assertAllEqual([var1.numpy(), var2.numpy()], [0.0, 0.0]) - - # Third iteration, without EMA, we should see [var1, var2] = [-1.0, -1.0], - # but overwriting results in [var1, var2] = [-0.125, -0.125]. 
- optimizer.apply_gradients(zip(grads, [var1, var2])) - self.assertAllEqual([var1.numpy(), var2.numpy()], [-0.125, -0.125]) - - def testGetAndFromConfig(self): - optimizer = adam_new.Adam( - learning_rate=np.float64(0.05), - beta_1=0.7, - beta_2=0.77, - amsgrad=True, - epsilon=0.001, - clipnorm=0.5, - use_ema=True, - ema_momentum=0.5, - ema_overwrite_frequency=50) - config = optimizer.get_config() - expected_config = { - "learning_rate": np.float32(0.05), - "beta_1": 0.7, - "beta_2": 0.77, - "epsilon": 0.001, - "amsgrad": True, - "clipnorm": 0.5, - "global_clipnorm": None, - "clipvalue": None, - "use_ema": True, - "ema_momentum": 0.5, - "ema_overwrite_frequency": 50, - } - self.assertDictContainsSubset(expected_config, config) - restored_optimizer = adam_new.Adam.from_config(config) - self.assertDictEqual(restored_optimizer.get_config(), - optimizer.get_config()) - - def testCheckpointOptimizer(self): - x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32) - lr_schedule = learning_rate_schedule.ExponentialDecay( - initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9) - optimizer_1 = adam_new.Adam( - learning_rate=lr_schedule, beta_1=0.8, beta_2=0.888) - grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]]) - - for _ in range(1): - optimizer_1.apply_gradients(zip([grads], [x])) - - # Then save the variable and optimizer to a checkpoint. - checkpoint_1 = tf.train.Checkpoint(var=x, optimizer=optimizer_1) - checkpoint_path = checkpoint_1.save(self.get_temp_dir()) - - # Create a new optimizer and call restore on it (and x) - x2 = tf.Variable([[0., 0.], [0., 0.]], dtype=x.dtype) - optimizer_2 = adam_new.Adam(learning_rate=0.02, beta_1=0.7, beta_2=0.777) - optimizer_2.build([x2]) - checkpoint_2 = tf.train.Checkpoint(var=x2, optimizer=optimizer_2) - checkpoint_2.restore(checkpoint_path) - - self.assertTrue( - (self.evaluate(optimizer_1._momentums._storage[0]) == self.evaluate( - optimizer_2._momentums._storage[0])).all()) - self.assertEqual( - self.evaluate(optimizer_1._iterations), - self.evaluate(optimizer_2._iterations)) - - @parameterized.product(optimizer_fn=OPTIMIZER_FN) - def testSaveAndLoadOptimizerWithModel(self, optimizer_fn): - model = keras.Sequential( - [keras.layers.Input(shape=(1,)), - keras.layers.Dense(1)]) - optimizer = optimizer_fn() - optimizer.clipnorm = 0.1 - x = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) - y = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) - model.compile(loss="mse", optimizer=optimizer) - model.fit(x, y) - - # Save in h5 format. - path = os.path.join(self.get_temp_dir(), "model.h5") - model.save(path) - loaded_model = keras.models.load_model(path) - loaded_model.load_weights(path) - loaded_optimizer = loaded_model.optimizer - self.assertEqual(type(optimizer), type(loaded_optimizer)) - self.assertEqual(loaded_optimizer.learning_rate, 0.002) - self.assertEqual(loaded_optimizer.clipnorm, 0.1) - - # Save in Keras SavedModel format. 
- model.fit(x, y) - path = os.path.join(self.get_temp_dir(), "model") - model.save(path) - loaded_model = keras.models.load_model(path) - loaded_model.load_weights(path) - loaded_optimizer = loaded_model.optimizer - self.assertEqual(type(optimizer), type(loaded_optimizer)) - self.assertEqual(loaded_optimizer.learning_rate, 0.002) - self.assertEqual(loaded_optimizer.clipnorm, 0.1) - - @parameterized.product(optimizer_fn=OPTIMIZER_FN) - def testSparseGradientsWorkAsExpected(self, optimizer_fn): - optimizer_1 = optimizer_fn() - optimizer_2 = optimizer_fn() - x1 = tf.Variable(np.ones([5]), dtype=tf.float64) - x2 = tf.Variable(np.ones([5]), dtype=tf.float64) - grads = tf.convert_to_tensor([0, 1., 1.5, 0, 0], dtype=tf.float64) - sparse_grads = tf.IndexedSlices( - tf.convert_to_tensor([1., 1.5], dtype=tf.float64), - tf.convert_to_tensor([1, 2]), - dense_shape=tf.convert_to_tensor([len(grads)])) - for _ in range(5): - optimizer_1.apply_gradients(zip([grads], [x1])) - optimizer_2.apply_gradients(zip([sparse_grads], [x2])) - self.assertAllClose(x1, x2) - - -class OptimizerRegressionTest(tf.test.TestCase, parameterized.TestCase): - """Test optimizer outputs the same numerical results as optimizer_v2.""" - - def _compare_numerical(self, old_optimizer, new_optimizer): - x1 = tf.Variable(np.ones([10]), dtype=tf.float64) - x2 = tf.Variable(np.ones([10]), dtype=tf.float64) - grads = tf.convert_to_tensor(np.arange(0.1, 1.1, 0.1)) - sparse_grads = tf.IndexedSlices( - tf.convert_to_tensor([0, 0.2, 0.4, 0.8], dtype=tf.float64), - tf.convert_to_tensor([0, 2, 4, 6]), - dense_shape=tf.convert_to_tensor([len(grads)])) - - for _ in range(5): - self.assertAllClose(x1, x2) - old_optimizer.apply_gradients(zip([grads], [x1])) - new_optimizer.apply_gradients(zip([grads], [x2])) - - for _ in range(5): - self.assertAllClose(x1, x2) - old_optimizer.apply_gradients(zip([sparse_grads], [x1])) - new_optimizer.apply_gradients(zip([sparse_grads], [x2])) - - def testAdam(self): - self._compare_numerical( - adam_old.Adam(amsgrad=True), adam_new.Adam(amsgrad=True)) - - def testAdadelta(self): - self._compare_numerical(adadelta_old.Adadelta(), adadelta_new.Adadelta()) - - def testAdagrad(self): - self._compare_numerical(adagrad_old.Adagrad(), adagrad_new.Adagrad()) - - def testFtrl(self): - self._compare_numerical(ftrl_old.Ftrl(), ftrl_new.Ftrl()) - - def testRMSprop(self): - self._compare_numerical(rmsprop_old.RMSprop(), rmsprop_new.RMSprop()) - - @parameterized.product(nesterov=[True, False]) - def testSgd(self, nesterov): - self._compare_numerical( - sgd_old.SGD(nesterov=nesterov), sgd_new.SGD(nesterov=nesterov)) - - -class DistributedTrainingTest(tf.test.TestCase, parameterized.TestCase): - - @ds_combinations.generate( - tf.__internal__.test.combinations.combine( - strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN)) - def testGetGradientsInModel(self, strategy, optimizer_fn): - with strategy.scope(): - model = keras.Sequential( - [keras.layers.Input(shape=(1,)), - keras.layers.Dense(1)]) - optimizer = optimizer_fn() - x = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) - y = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) - model.compile(loss="mse", optimizer=optimizer) - model.fit(x, y, epochs=1, steps_per_epoch=5) - if optimizer.name == "Adam": - # Assert the momentum variable is not 0. - self.assertNotEqual(self.evaluate(optimizer._momentums._storage[0]), 0) - elif optimizer.name == "Adadelta": - # Assert the accumulated variable is not 0. 
- self.assertNotEqual( - self.evaluate(optimizer._accumulated_grads._storage[0]), 0) - elif optimizer.name == "Adagrad": - # Assert the accumulated variable is not 0. - self.assertNotEqual(self.evaluate(optimizer._accumulators._storage[0]), 0) - - @ds_combinations.generate( - tf.__internal__.test.combinations.combine( - strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN)) - def testGetGradientsInCustomTrainingLoop(self, strategy, optimizer_fn): - with strategy.scope(): - model = keras.Sequential( - [keras.layers.Input(shape=(1,)), - keras.layers.Dense(1)]) - optimizer = optimizer_fn() - - def per_worker_dataset_fn(): - - def dataset_fn(_): - x, y = [1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0] - ds = tf.data.Dataset.from_tensor_slices((x, y)) - ds = ds.repeat().batch(6) - return ds - - return strategy.distribute_datasets_from_function(dataset_fn) - - ds = per_worker_dataset_fn() - - @tf.function - def train_step(ds): - - def replica_fn(data): - features, labels = data - with tf.GradientTape() as tape: - output = model(tf.expand_dims(features, axis=1)) - loss = keras.losses.MeanSquaredError( - reduction=losses_utils.ReductionV2.NONE)(labels, output) - grads = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(grads, model.trainable_variables)) - - strategy.run(replica_fn, args=(next(iter(ds)),)) - - for _ in range(3): - train_step(ds) - self.assertEqual(self.evaluate(optimizer.iterations), 3) - - @ds_combinations.generate( - tf.__internal__.test.combinations.combine(strategy=[ - ds_combinations.mirrored_strategy_with_two_gpus, - ds_combinations.tpu_strategy, - ds_combinations.multi_worker_mirrored_2x2_gpu, - ds_combinations.central_storage_strategy_with_two_gpus, - ])) - def testJitCompile(self, strategy): - # Test the optimizer yields same numerical results when jit_compile is - # on and off. 
- with strategy.scope(): - optimizer_1 = adam_new.Adam( - jit_compile=False, use_ema=True, ema_overwrite_frequency=1) - optimizer_2 = adam_new.Adam( - jit_compile=True, use_ema=True, ema_overwrite_frequency=1) - model_1 = keras.Sequential([ - keras.layers.Input(shape=(2,)), - keras.layers.Dense(5), - keras.layers.Dense(1) - ]) - model_2 = keras.models.clone_model(model_1) - model_2.set_weights(model_1.get_weights()) - - def per_worker_dataset_fn(): - - def dataset_fn(_): - x = np.random.rand(6, 2) - y = [1, 1, 1, 0, 0, 0] - ds = tf.data.Dataset.from_tensor_slices((x, y)) - ds = ds.repeat().batch(6) - return ds - - return strategy.distribute_datasets_from_function(dataset_fn) - - ds = per_worker_dataset_fn() - - @tf.function - def train_step(ds): - - def replica_fn(data): - features, labels = data - with tf.GradientTape() as tape: - output_1 = model_1(features) - loss_1 = keras.losses.MeanSquaredError( - reduction=losses_utils.ReductionV2.NONE)(labels, output_1) - grads_1 = tape.gradient(loss_1, model_1.trainable_variables) - optimizer_1.apply_gradients(zip(grads_1, model_1.trainable_variables)) - - with tf.GradientTape() as tape: - output_2 = model_2(features) - loss_2 = keras.losses.MeanSquaredError( - reduction=losses_utils.ReductionV2.NONE)(labels, output_2) - grads_2 = tape.gradient(loss_2, model_2.trainable_variables) - optimizer_2.apply_gradients(zip(grads_2, model_2.trainable_variables)) - - strategy.run(replica_fn, args=(next(iter(ds)),)) - - for _ in range(3): - train_step(ds) - self.assertAllClose(model_1.trainable_variables[0][0], - model_2.trainable_variables[0][0]) - - -if __name__ == "__main__": - tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/optimizers/optimizer_experimental/rmsprop.py b/keras/optimizers/optimizer_experimental/rmsprop.py deleted file mode 100644 index dbfbf1ba30b0..000000000000 --- a/keras/optimizers/optimizer_experimental/rmsprop.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""RMSprop optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.RMSprop', v1=[]) -class RMSprop(optimizer.Optimizer): - r"""Optimizer that implements the RMSprop algorithm. - - The gist of RMSprop is to: - - - Maintain a moving (discounted) average of the square of gradients - - Divide the gradient by the root of this average - - This implementation of RMSprop uses plain momentum, not Nesterov momentum. - - The centered version additionally maintains a moving average of the - gradients, and uses that average to estimate the variance. 
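The dense, momentum-free update described above can be sketched in plain NumPy; the names and sample values are illustrative, not part of this implementation:

```python
import numpy as np

def rmsprop_step(w, g, v, lr=0.001, rho=0.9, eps=1e-7):
    # v tracks a discounted average of squared gradients; the gradient is
    # then divided by the root of this average.
    v = rho * v + (1 - rho) * g ** 2
    return w - lr * g / (np.sqrt(v) + eps), v

w, v = np.array([1.0, 2.0]), np.zeros(2)
w, v = rmsprop_step(w, np.array([0.5, -0.5]), v)
print(w, v)
```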
-
-  Args:
-    learning_rate: Initial value for the learning rate:
-      either a floating point value,
-      or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
-      Defaults to 0.001.
-    rho: float, defaults to 0.9. Discounting factor for the old gradients.
-    momentum: float, defaults to 0.0. If not 0.0, the optimizer tracks the
-      momentum value, with a decay rate equal to `1 - momentum`.
-    epsilon: A small constant for numerical stability, added to the
-      denominator of the update to avoid division by zero. Defaults to 1e-7.
-    centered: Boolean. If `True`, gradients are normalized by the estimated
-      variance of the gradient; if False, by the uncentered second moment.
-      Setting this to `True` may help with training, but is slightly more
-      expensive in terms of computation and memory. Defaults to `False`.
-    {{base_optimizer_keyword_args}}
-
-  Usage:
-
-  >>> opt = tf.keras.optimizers.experimental.RMSprop(learning_rate=0.1)
-  >>> var1 = tf.Variable(10.0)
-  >>> loss = lambda: (var1 ** 2) / 2.0  # d(loss) / d(var1) = var1
-  >>> opt.minimize(loss, [var1])
-  >>> var1.numpy()
-  9.683772
-
-  Reference:
-    - [Hinton, 2012](
-      http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
-
-  """
-
-  def __init__(self,
-               learning_rate=0.001,
-               rho=0.9,
-               momentum=0.0,
-               epsilon=1e-7,
-               centered=False,
-               clipnorm=None,
-               clipvalue=None,
-               global_clipnorm=None,
-               use_ema=False,
-               ema_momentum=0.99,
-               ema_overwrite_frequency=100,
-               jit_compile=True,
-               name='RMSprop',
-               **kwargs):
-    super().__init__(
-        clipnorm=clipnorm,
-        clipvalue=clipvalue,
-        global_clipnorm=global_clipnorm,
-        use_ema=use_ema,
-        ema_momentum=ema_momentum,
-        ema_overwrite_frequency=ema_overwrite_frequency,
-        jit_compile=jit_compile,
-        name=name,
-        **kwargs)
-    self._learning_rate = self._build_learning_rate(learning_rate)
-    self.rho = rho
-    self.momentum = momentum
-    self.epsilon = epsilon
-    self.centered = centered
-
-  def build(self, var_list):
-    super().build(var_list)
-    if hasattr(self, '_built') and self._built:
-      return
-    self._built = True
-
-    self._velocities = []
-    for var in var_list:
-      self._velocities.append(
-          self.add_variable_from_reference(var, 'velocity'))
-
-    self._momentums = []
-    if self.momentum > 0:
-      for var in var_list:
-        self._momentums.append(
-            self.add_variable_from_reference(var, 'momentum'))
-
-    self._average_gradients = []
-    if self.centered:
-      for var in var_list:
-        self._average_gradients.append(
-            self.add_variable_from_reference(var, 'average_gradient'))
-
-  def update_step(self, gradient, variable):
-    """Update step given gradient and the associated model variable."""
-    lr = tf.cast(self.learning_rate, variable.dtype)
-
-    var_key = self._var_key(variable)
-    velocity = self._velocities[self._index_dict[var_key]]
-    momentum = None
-    if self.momentum > 0:
-      momentum = self._momentums[self._index_dict[var_key]]
-    average_grad = None
-    if self.centered:
-      average_grad = self._average_gradients[self._index_dict[var_key]]
-
-    rho = self.rho
-
-    if isinstance(gradient, tf.IndexedSlices):
-      # Sparse gradients.
- velocity.assign(rho * velocity) - velocity.scatter_add(tf.IndexedSlices( - tf.square(gradient.values) * (1 - rho), gradient.indices)) - if self.centered: - average_grad.assign(rho * average_grad) - average_grad.scatter_add( - tf.IndexedSlices( - tf.square(gradient.values) * (1 - rho), gradient.indices)) - velocity.assign_add(-tf.square(average_grad)) - velocity_value = tf.gather(velocity, gradient.indices) - transformed_grad = tf.IndexedSlices( - gradient.values / (tf.sqrt(velocity_value) + self.epsilon), - gradient.indices) - - if self.momentum > 0: - momentum.assign(self.momentum * momentum) - momentum.scatter_add(transformed_grad) - variable.assign_add(-lr * momentum) - else: - variable.scatter_add( - tf.IndexedSlices(-lr * transformed_grad.values, - transformed_grad.indices)) - else: - # Dense gradients. - velocity.assign(rho * velocity + (1 - rho) * tf.square(gradient)) - if self.centered: - average_grad.assign(rho * average_grad + - (1 - rho) * tf.square(gradient)) - velocity.assign_add(-tf.square(average_grad)) - transformed_grad = gradient / (tf.sqrt(velocity) + self.epsilon) - if self.momentum > 0: - momentum.assign(self.momentum * momentum + transformed_grad) - variable.assign_add(-lr * momentum) - else: - variable.assign_add(-lr * transformed_grad) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'rho': self.rho, - 'momentum': self.momentum, - 'epsilon': self.epsilon, - 'centered': self.centered, - }) - return config - - -RMSprop.__doc__ = RMSprop.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_experimental/sgd.py b/keras/optimizers/optimizer_experimental/sgd.py deleted file mode 100644 index c2bb7ce15210..000000000000 --- a/keras/optimizers/optimizer_experimental/sgd.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""SGD optimizer implementation.""" - -from keras.optimizers.optimizer_experimental import optimizer -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@generic_utils.register_keras_serializable() -@keras_export('keras.optimizers.experimental.SGD', v1=[]) -class SGD(optimizer.Optimizer): - r"""Gradient descent (with momentum) optimizer. 
-
-  Update rule for parameter `w` with gradient `g` when `momentum` is 0:
-
-  ```python
-  w = w - learning_rate * g
-  ```
-
-  Update rule when `momentum` is larger than 0:
-
-  ```python
-  velocity = momentum * velocity - learning_rate * g
-  w = w + velocity
-  ```
-
-  When `nesterov=True`, this rule becomes:
-
-  ```python
-  velocity = momentum * velocity - learning_rate * g
-  w = w + momentum * velocity - learning_rate * g
-  ```
-
-  Args:
-    learning_rate: A `Tensor`, floating point value, or a schedule that is a
-      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
-      that takes no arguments and returns the actual value to use. The
-      learning rate. Defaults to 0.01.
-    momentum: float hyperparameter >= 0 that accelerates gradient descent in
-      the relevant direction and dampens oscillations. Defaults to 0, i.e.,
-      vanilla gradient descent.
-    nesterov: boolean. Whether to apply Nesterov momentum.
-      Defaults to `False`.
-    {{base_optimizer_keyword_args}}
-
-  Usage:
-
-  >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=0.1)
-  >>> var = tf.Variable(1.0)
-  >>> loss = lambda: (var ** 2) / 2.0  # d(loss) / d(var) = var
-  >>> opt.minimize(loss, [var])
-  >>> # Step is `- learning_rate * grad`
-  >>> var.numpy()
-  0.9
-
-  >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=0.1, momentum=0.9)
-  >>> var = tf.Variable(1.0)
-  >>> val0 = var.value()
-  >>> loss = lambda: (var ** 2) / 2.0  # d(loss) / d(var) = var
-  >>> # First step is `- learning_rate * grad`
-  >>> opt.minimize(loss, [var])
-  >>> val1 = var.value()
-  >>> (val0 - val1).numpy()
-  0.1
-  >>> # On later steps, step-size increases because of momentum
-  >>> opt.minimize(loss, [var])
-  >>> val2 = var.value()
-  >>> (val1 - val2).numpy()
-  0.18
-
-  Reference:
-    - For `nesterov=True`, see [Sutskever et al., 2013](
-      http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
-  """
-
-  def __init__(self,
-               learning_rate=0.01,
-               momentum=0.0,
-               nesterov=False,
-               amsgrad=False,
-               clipnorm=None,
-               clipvalue=None,
-               global_clipnorm=None,
-               use_ema=False,
-               ema_momentum=0.99,
-               ema_overwrite_frequency=None,
-               jit_compile=True,
-               name='SGD',
-               **kwargs):
-    super().__init__(
-        name=name,
-        clipnorm=clipnorm,
-        clipvalue=clipvalue,
-        global_clipnorm=global_clipnorm,
-        use_ema=use_ema,
-        ema_momentum=ema_momentum,
-        ema_overwrite_frequency=ema_overwrite_frequency,
-        jit_compile=jit_compile,
-        **kwargs)
-    self._learning_rate = self._build_learning_rate(learning_rate)
-    self.momentum = momentum
-    self.nesterov = nesterov
-    if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
-      raise ValueError('`momentum` must be in the range [0, 1].')
-
-  def build(self, var_list):
-    """Initialize optimizer variables.
-
-    The SGD optimizer has one variable, `momentums`, which is only created
-    if `self.momentum` is not 0.
-
-    Args:
-      var_list: list of model variables to build SGD variables on.
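A quick plain-Python check of the momentum numbers in the docstring above (`learning_rate=0.1`, `momentum=0.9`, loss `var ** 2 / 2`):

```python
lr, momentum = 0.1, 0.9
w, velocity = 1.0, 0.0
deltas = []
for _ in range(2):
    grad = w                                   # d/dw of w**2 / 2
    velocity = momentum * velocity - lr * grad
    prev, w = w, w + velocity
    deltas.append(round(prev - w, 2))
print(deltas)  # [0.1, 0.18], matching the docstring
```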
- """ - super().build(var_list) - if hasattr(self, '_built') and self._built: - return - self.momentums = [] - if self.momentum != 0: - for var in var_list: - self.momentums.append( - self.add_variable_from_reference( - model_variable=var, variable_name='m')) - self._built = True - - def update_step(self, gradient, variable): - """Update step given gradient and the associated model variable.""" - lr = tf.cast(self.learning_rate, variable.dtype) - m = None - var_key = self._var_key(variable) - if self.momentum != 0: - momentum = tf.cast(self.momentum, variable.dtype) - m = self.momentums[self._index_dict[var_key]] - - # TODO(b/204321487): Add nesterov acceleration. - if isinstance(gradient, tf.IndexedSlices): - # Sparse gradients. - add_value = tf.IndexedSlices(-gradient.values * lr, gradient.indices) - if m is not None: - m.assign(m * momentum) - m.scatter_add(add_value) - if self.nesterov: - variable.scatter_add(add_value) - variable.assign_add(m * momentum) - else: - variable.assign_add(m) - else: - variable.scatter_add(add_value) - else: - # Dense gradients - if m is not None: - m.assign(-gradient * lr + m * momentum) - if self.nesterov: - variable.assign_add(-gradient * lr + m * momentum) - else: - variable.assign_add(m) - else: - variable.assign_add(-gradient * lr) - - def get_config(self): - config = super().get_config() - - config.update({ - 'learning_rate': self._serialize_hyperparameter(self._learning_rate), - 'momentum': self.momentum, - 'nesterov': self.nesterov, - }) - return config - - -SGD.__doc__ = SGD.__doc__.replace( - '{{base_optimizer_keyword_args}}', optimizer.base_optimizer_keyword_args) diff --git a/keras/optimizers/optimizer_pss_test.py b/keras/optimizers/optimizer_pss_test.py new file mode 100644 index 000000000000..f4ff19c98bb5 --- /dev/null +++ b/keras/optimizers/optimizer_pss_test.py @@ -0,0 +1,165 @@ +"""Tests for calling optimizer on ParameterServerStrategy.""" + +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras.optimizers import adadelta +from keras.optimizers import adagrad +from keras.optimizers import adam +from keras.optimizers import adamax +from keras.optimizers import adamw +from keras.optimizers import ftrl +from keras.optimizers import lion +from keras.optimizers import nadam +from keras.optimizers import rmsprop +from keras.optimizers import sgd +from keras.utils import dataset_creator +from keras.utils import losses_utils + +ds_combinations = tf.__internal__.distribute.combinations + +STRATEGIES = [ + ds_combinations.parameter_server_strategy_3worker_2ps_cpu, + ds_combinations.parameter_server_strategy_3worker_2ps_1gpu, +] + +adadelta_fn = tf.__internal__.test.combinations.NamedObject( + "adadelta", + lambda: adadelta.Adadelta( + 0.002, use_ema=True, ema_overwrite_frequency=None + ), +) +adagrad_fn = tf.__internal__.test.combinations.NamedObject( + "adagrad", lambda: adagrad.Adagrad(0.002) +) +adam_fn = tf.__internal__.test.combinations.NamedObject( + "adam", lambda: adam.Adam(0.002) +) +adamax_fn = tf.__internal__.test.combinations.NamedObject( + "adamax", lambda: adamax.Adamax(0.002) +) +adamw_fn = tf.__internal__.test.combinations.NamedObject( + "adamw", lambda: adamw.AdamW(0.002, weight_decay=0.004) +) +ftrl_fn = tf.__internal__.test.combinations.NamedObject( + "ftrl", lambda: ftrl.Ftrl(0.002) +) +lion_fn = tf.__internal__.test.combinations.NamedObject( + "lion", lambda: lion.Lion(0.002) +) +nadam_fn = tf.__internal__.test.combinations.NamedObject( + "experimentnadam", lambda: 
nadam.Nadam(0.002) +) +rmsprop_fn = tf.__internal__.test.combinations.NamedObject( + "rmsprop", lambda: rmsprop.RMSprop(0.002) +) +sgd_fn = tf.__internal__.test.combinations.NamedObject( + "sgdaverage", + lambda: sgd.SGD(0.002, use_ema=True, ema_overwrite_frequency=1), +) + +OPTIMIZER_FN = [ + adadelta_fn, + adagrad_fn, + adam_fn, + adamax_fn, + adamw_fn, + ftrl_fn, + lion_fn, + nadam_fn, + rmsprop_fn, + sgd_fn, +] + + +# TODO(b/228209527): Combine this test with optimizer_test after +# fixing the NCCL issue. +class OptimizerPssTest(tf.test.TestCase, parameterized.TestCase): + def _get_model(self): + return keras.Sequential( + [keras.layers.Input(shape=(1,)), keras.layers.Dense(1)] + ) + + def _get_dataset_fn(self): + def dataset_fn(_): + x, y = [1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0] + ds = tf.data.Dataset.from_tensor_slices((x, y)) + ds = ds.repeat().batch(6) + return ds + + return dataset_fn + + def _verify_accumulators_updated(self, optimizer): + variables = optimizer.variables + for var in variables: + if "iteration" not in var.name and "learning_rate" not in var.name: + # Find a variable not iteration or learning_rate, and verify its + # value is updated (not 0). + self.assertNotAllEqual(var, 0) + + @ds_combinations.generate( + tf.__internal__.test.combinations.combine( + strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN + ) + ) + def testGetGradientsInModelPss(self, strategy, optimizer_fn): + with strategy.scope(): + model = self._get_model() + optimizer = optimizer_fn() + ds_fn = self._get_dataset_fn() + if isinstance(strategy, tf.distribute.ParameterServerStrategy): + ds = dataset_creator.DatasetCreator(ds_fn) + else: + ds = ds_fn(None) + model.compile(loss="mse", optimizer=optimizer) + model.fit(ds, epochs=1, steps_per_epoch=5) + + self._verify_accumulators_updated(optimizer) + + @ds_combinations.generate( + tf.__internal__.test.combinations.combine( + strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN + ) + ) + def testGetGradientsInCustomTrainingLoopPss(self, strategy, optimizer_fn): + coordinator = tf.distribute.experimental.coordinator.ClusterCoordinator( + strategy + ) + + with strategy.scope(): + model = self._get_model() + optimizer = optimizer_fn() + + def per_worker_dataset_fn(): + return strategy.distribute_datasets_from_function( + self._get_dataset_fn() + ) + + ds = coordinator.create_per_worker_dataset(per_worker_dataset_fn) + + @tf.function + def train_step(iterator): + def replica_fn(data): + features, labels = data + with tf.GradientTape() as tape: + output = model(tf.expand_dims(features, axis=1)) + loss = keras.losses.MeanSquaredError( + reduction=losses_utils.ReductionV2.NONE + )(labels, output) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + zip(grads, model.trainable_variables) + ) + + strategy.run(replica_fn, args=(next(iterator),)) + + for _ in range(3): + coordinator.schedule(train_step, args=(iter(ds),)) + coordinator.join() + self.assertEqual(self.evaluate(optimizer.iterations), 3) + self._verify_accumulators_updated(optimizer) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/optimizers/optimizer_test.py b/keras/optimizers/optimizer_test.py new file mode 100644 index 000000000000..f501038a2cd1 --- /dev/null +++ b/keras/optimizers/optimizer_test.py @@ -0,0 +1,868 @@ +"""Tests for the reworked optimizer. 
+
+More context in go/new-keras-optimizer
+"""
+
+import os
+from unittest import mock
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
+
+import keras
+from keras.optimizers import adadelta as adadelta_new
+from keras.optimizers import adafactor as adafactor_new
+from keras.optimizers import adagrad as adagrad_new
+from keras.optimizers import adam as adam_new
+from keras.optimizers import adamax as adamax_new
+from keras.optimizers import adamw as adamw_new
+from keras.optimizers import ftrl as ftrl_new
+from keras.optimizers import lion as lion_new
+from keras.optimizers import nadam as nadam_new
+from keras.optimizers import rmsprop as rmsprop_new
+from keras.optimizers import sgd as sgd_new
+from keras.optimizers.legacy import adadelta as adadelta_old
+from keras.optimizers.legacy import adagrad as adagrad_old
+from keras.optimizers.legacy import adam as adam_old
+from keras.optimizers.legacy import ftrl as ftrl_old
+from keras.optimizers.legacy import gradient_descent as sgd_old
+from keras.optimizers.legacy import rmsprop as rmsprop_old
+from keras.optimizers.schedules import learning_rate_schedule
+from keras.testing_infra import test_utils
+from keras.utils import losses_utils
+
+ds_combinations = tf.__internal__.distribute.combinations
+
+STRATEGIES = [
+    # TODO(b/202992598): Add PSS strategy once the XLA issue is resolved.
+    ds_combinations.one_device_strategy,
+    ds_combinations.mirrored_strategy_with_two_cpus,
+    ds_combinations.mirrored_strategy_with_two_gpus,
+    ds_combinations.tpu_strategy,
+    ds_combinations.cloud_tpu_strategy,
+    ds_combinations.multi_worker_mirrored_2x1_cpu,
+    ds_combinations.multi_worker_mirrored_2x2_gpu,
+    ds_combinations.central_storage_strategy_with_two_gpus,
+]
+
+adadelta_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentaladadelta",
+    lambda: adadelta_new.Adadelta(
+        0.002, use_ema=True, ema_overwrite_frequency=None
+    ),
+)
+adagrad_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentaladagrad", lambda: adagrad_new.Adagrad(0.002)
+)
+adafactor_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "adafactor", lambda: adafactor_new.Adafactor(0.002)
+)
+adam_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentaladam", lambda: adam_new.Adam(0.002)
+)
+adamax_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentaladamax", lambda: adamax_new.Adamax(0.002)
+)
+adamw_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentaladamw", lambda: adamw_new.AdamW(0.002, weight_decay=0.004)
+)
+ftrl_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentalftrl", lambda: ftrl_new.Ftrl(0.002)
+)
+lion_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "lion", lambda: lion_new.Lion(0.002)
+)
+nadam_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentalnadam", lambda: nadam_new.Nadam(0.002)
+)
+rmsprop_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentalrmsprop", lambda: rmsprop_new.RMSprop(0.002)
+)
+sgd_new_fn = tf.__internal__.test.combinations.NamedObject(
+    "experimentalsgdaverage",
+    lambda: sgd_new.SGD(
+        0.002, weight_decay=0.004, use_ema=True, ema_overwrite_frequency=1
+    ),
+)
+
+OPTIMIZER_FN = [
+    adadelta_new_fn,
+    adagrad_new_fn,
+    adafactor_new_fn,
+    adam_new_fn,
+    adamax_new_fn,
+    adamw_new_fn,
+    ftrl_new_fn,
+    lion_new_fn,
+    nadam_new_fn,
+    rmsprop_new_fn,
+    sgd_new_fn,
+]
+
+
+class OptimizerFunctionalityTest(tf.test.TestCase, parameterized.TestCase):
+    """Test the functionality of the optimizer."""
+
+    def testAddVariableFromReference(self):
+        optimizer = adam_new.Adam()
+        variable = optimizer.add_variable_from_reference(
+            tf.Variable(1.0, name="tmp"), "test"
+        )
+        self.assertEqual(variable._shared_name, "test/tmp")
+        self.assertEqual(self.evaluate(variable), 0)
+
+    def testAddVariableWithCustomShape(self):
+        optimizer = adam_new.Adam()
+        variable = optimizer.add_variable_from_reference(
+            tf.Variable([1.0, 2.0], name="tmp"), "test", shape=[]
+        )
+        self.assertEqual(variable, tf.Variable(0.0))
+
+    def testBuildIndexDict(self):
+        optimizer = adam_new.Adam()
+        var_list = [tf.Variable(0, name=f"var{i}") for i in range(10)]
+        optimizer._build_index_dict(var_list)
+        self.assertEqual(
+            optimizer._index_dict[optimizer._var_key(var_list[7])], 7
+        )
+
+    def testComputeGradients(self):
+        optimizer = adam_new.Adam()
+        x = tf.Variable([1.0, 2.0], dtype=tf.float32)
+        loss_fn = lambda: x
+        # Test Tensor-type var_list.
+        var_list = [x]
+        grads_and_vars = optimizer.compute_gradients(loss_fn, var_list)
+        grads, _ = zip(*grads_and_vars)
+        self.assertAllEqual(grads[0], tf.constant([1.0, 1.0]))
+        # Test callable-type var_list, and create the variable in loss_fn.
+        x = []
+
+        def loss_fn():
+            variable = tf.Variable([1.0, 2.0], dtype=tf.float32)
+            x.append(variable)
+            return variable
+
+        var_list = lambda: x
+
+        grads_and_vars = optimizer.compute_gradients(loss_fn, var_list)
+        grads, _ = zip(*grads_and_vars)
+        self.assertAllEqual(grads[0], tf.constant([1.0, 1.0]))
+
+    def testClipNorm(self):
+        optimizer = adam_new.Adam(clipnorm=1)
+        grad = [tf.convert_to_tensor([100.0, 100.0])]
+        clipped_grad = optimizer._clip_gradients(grad)
+        self.assertAllClose(clipped_grad[0], [2**0.5 / 2, 2**0.5 / 2])
+
+    def testClipValue(self):
+        optimizer = adam_new.Adam(clipvalue=1)
+        grad = [tf.convert_to_tensor([100.0, 100.0])]
+        clipped_grad = optimizer._clip_gradients(grad)
+        self.assertAllEqual(clipped_grad[0], [1.0, 1.0])
+
+    def testWeightDecay(self):
+        grads, var1, var2, var3 = (
+            tf.zeros(()),
+            tf.Variable(2.0),
+            tf.Variable(2.0, name="exclude"),
+            tf.Variable(2.0),
+        )
+        optimizer_1 = adamw_new.AdamW(learning_rate=1, weight_decay=0.004)
+        optimizer_1.apply_gradients(zip([grads], [var1]))
+
+        optimizer_2 = adamw_new.AdamW(learning_rate=1, weight_decay=0.004)
+        optimizer_2.exclude_from_weight_decay(var_names=["exclude"])
+        optimizer_2.apply_gradients(zip([grads, grads], [var1, var2]))
+
+        optimizer_3 = adamw_new.AdamW(learning_rate=1, weight_decay=0.004)
+        optimizer_3.exclude_from_weight_decay(var_list=[var3])
+        optimizer_3.apply_gradients(zip([grads, grads], [var1, var3]))
+
+        self.assertEqual(var1, 1.9760959)
+        self.assertEqual(var2, 2.0)
+        self.assertEqual(var3, 2.0)
+
+        grads, var1, var2, var3 = (
+            tf.zeros(()),
+            tf.Variable(2.0),
+            tf.Variable(2.0, name="exclude"),
+            tf.Variable(2.0),
+        )
+        optimizer_1 = sgd_new.SGD(learning_rate=1, weight_decay=0.004)
+        optimizer_1.apply_gradients(zip([grads], [var1]))
+
+        optimizer_2 = sgd_new.SGD(learning_rate=1, weight_decay=0.004)
+        optimizer_2.exclude_from_weight_decay(var_names=["exclude"])
+        optimizer_2.apply_gradients(zip([grads, grads], [var1, var2]))
+
+        optimizer_3 = sgd_new.SGD(learning_rate=1, weight_decay=0.004)
+        optimizer_3.exclude_from_weight_decay(var_list=[var3])
+        optimizer_3.apply_gradients(zip([grads, grads], [var1, var3]))
+
+        self.assertEqual(var1, 1.9760959)
+        self.assertEqual(var2, 2.0)
+        self.assertEqual(var3, 2.0)
+
+    def testClipGlobalNorm(self):
+        optimizer = adam_new.Adam(global_clipnorm=1)
+        grad = [
+            tf.cast([100.0, 100.0], dtype=tf.float32),
+            tf.cast([100.0, 100.0], dtype=tf.float32),
+        ]
+        clipped_grad = optimizer._clip_gradients(grad)
+        self.assertAllClose(clipped_grad[0], [0.5, 0.5])
+
+    def testPassingLegacyArgsRaiseError(self):
+        with self.assertRaisesRegex(ValueError, "decay is deprecated*"):
+            _ = adam_new.Adam(clipnorm=1, decay=0.5)
+
+    def testPassingLegacyClipnorm(self):
+        optimizer = adam_new.Adam(clipnorm=1)
+        self.assertEqual(optimizer.clipnorm, 1)
+
+    def testReturnAllOptimizerVariables(self):
+        x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
+        optimizer = adam_new.Adam()
+        grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]])
+        optimizer.apply_gradients(zip([grads], [x]))
+        optimizer_variables = optimizer.variables
+        all_names = [var._shared_name for var in optimizer_variables]
+        self.assertLen(optimizer_variables, 3)
+        self.assertCountEqual(
+            all_names,
+            [
+                "iteration",
+                "Adam/m/Variable",
+                "Adam/v/Variable",
+            ],
+        )
+
+    def testSetWeights(self):
+        x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
+        optimizer_1 = adam_new.Adam()
+        grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]])
+        optimizer_1.apply_gradients(zip([grads], [x]))
+        optimizer_2 = adam_new.Adam()
+        with self.assertRaisesRegex(ValueError, "You are calling*"):
+            optimizer_2.set_weights(optimizer_1.variables)
+        optimizer_2.build([x])
+        optimizer_2.set_weights(optimizer_1.variables)
+        self.assertAllClose(optimizer_1.variables, optimizer_2.variables)
+
+    def testSetLearningRate(self):
+        optimizer = adam_new.Adam(learning_rate=1.0)
+        self.assertIsInstance(optimizer._learning_rate, tf.Variable)
+        self.assertEqual(self.evaluate(optimizer.learning_rate), 1.0)
+        optimizer.learning_rate = 2.0
+        self.assertEqual(self.evaluate(optimizer.learning_rate), 2.0)
+        # Test the legacy setter.
+        optimizer.lr = 3.0
+        self.assertEqual(self.evaluate(optimizer.learning_rate), 3.0)
+
+        lr_schedule = learning_rate_schedule.ExponentialDecay(
+            initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9
+        )
+        optimizer = adam_new.Adam(learning_rate=lr_schedule)
+        self.assertIsInstance(
+            optimizer._learning_rate, learning_rate_schedule.ExponentialDecay
+        )
+        self.assertEqual(optimizer.learning_rate, 0.01)
+        # Test the legacy property.
+        self.assertEqual(optimizer.lr, 0.01)
+
+        x = tf.Variable([1.0, 2.0], dtype=tf.float32)
+        grads = tf.convert_to_tensor([1.0, 2.0])
+        for _ in range(2):
+            optimizer.apply_gradients(zip([grads], [x]))
+        self.assertTrue(
+            optimizer.learning_rate < 0.01 and optimizer.learning_rate > 0.00999
+        )
+        # Check that setting `learning_rate` to a LearningRateSchedule
+        # instance does not raise an error.
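+        # (Assigning a plain float afterwards is expected to fail: once the
+        # optimizer holds a schedule, only another schedule is accepted, as
+        # the TypeError assertion below verifies.)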
+        optimizer.learning_rate = learning_rate_schedule.ExponentialDecay(
+            initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9
+        )
+        with self.assertRaisesRegex(
+            TypeError, "This optimizer was created with*"
+        ):
+            optimizer.learning_rate = 2.0
+
+    def testSetIterations(self):
+        optimizer = adam_new.Adam(jit_compile=False)
+        optimizer.iterations = tf.Variable(2, dtype=tf.int32)
+        self.assertEqual(optimizer.iterations, 2)
+        var_list = [tf.Variable(2.0), tf.Variable(2.0)]
+        grads = tf.convert_to_tensor([1.0, 1.0])
+        iterations = optimizer.apply_gradients(zip(grads, var_list))
+        self.assertEqual(iterations, 3)
+        self.assertEqual(optimizer.iterations, 3)
+        with self.assertRaisesRegex(RuntimeError, "Cannot set*"):
+            optimizer.iterations = 2
+
+    def testVariableConstraints(self):
+        optimizer = adam_new.Adam()
+        inputs = keras.layers.Input(shape=[1])
+        outputs = keras.layers.Dense(1, kernel_constraint="NonNeg")(inputs)
+        model = keras.models.Model(inputs=inputs, outputs=outputs)
+        model.trainable_variables[0] = -999999  # Set as a negative number.
+        grads = [tf.zeros(1, 1), tf.zeros(1)]
+        optimizer.apply_gradients(zip(grads, model.trainable_variables))
+        self.assertEqual(model.trainable_variables[0], 0.0)
+
+    def testNoGradients(self):
+        optimizer = adam_new.Adam(jit_compile=False)
+        optimizer.apply_gradients(zip([], []))
+
+    def testApplyGradientsNameArg(self):
+        optimizer = adam_new.Adam(jit_compile=False)
+        var_list = [tf.Variable(2.0), tf.Variable(2.0)]
+        grads = tf.convert_to_tensor([1.0, 1.0])
+        optimizer.apply_gradients(zip(grads, var_list), name="dummy")
+        self.assertIn("dummy", optimizer._velocities[0].name)
+
+    def testPassingMissingWDError(self):
+        with self.assertRaises(ValueError):
+            _ = adamw_new.AdamW(0.01, weight_decay=None)
+
+        with self.assertRaisesRegex(ValueError, "Missing value of"):
+            _ = adamw_new.AdamW(0.01, weight_decay=None)
+
+    def testMovingAverageOptimizer(self):
+        optimizer = sgd_new.SGD(
+            learning_rate=1,
+            use_ema=True,
+            ema_momentum=0.5,
+            ema_overwrite_frequency=3,
+        )
+
+        # `var2` does not produce gradients.
+        var1, var2, var3 = tf.Variable(2.0), tf.Variable(2.0), tf.Variable(2.0)
+        with tf.GradientTape() as tape:
+            loss = var1 + var3
+        grads = tape.gradient(loss, [var1, var2, var3])
+        # First iteration: [var1, var2, var3] = [1.0, 2.0, 1.0]
+        optimizer.apply_gradients(zip(grads, [var1, var2, var3]))
+        self.assertAllEqual(
+            [var1.numpy(), var2.numpy(), var3.numpy()],
+            [1.0, 2.0, 1.0],
+        )
+
+        # Second iteration: [var1, var2, var3] = [0.0, 2.0, 0.0]
+        optimizer.apply_gradients(zip(grads, [var1, var2, var3]))
+        self.assertAllEqual(
+            [var1.numpy(), var2.numpy(), var3.numpy()],
+            [0.0, 2.0, 0.0],
+        )
+
+        # Third iteration: without EMA we would see [var1, var2, var3] =
+        # [-1.0, 2.0, -1.0], but the EMA overwrite results in
+        # [var1, var2, var3] = [-0.125, 2.0, -0.125].
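+        # The -0.125 follows from the EMA arithmetic (the asserted values
+        # imply the average is seeded from the variable's initial 2.0):
+        #   step 1: var = 1.0,  average = 0.5 * 2.0  + 0.5 * 1.0    =  1.5
+        #   step 2: var = 0.0,  average = 0.5 * 1.5  + 0.5 * 0.0    =  0.75
+        #   step 3: var = -1.0, average = 0.5 * 0.75 + 0.5 * (-1.0) = -0.125
+        # and iteration 3 hits ema_overwrite_frequency=3, so var1 and var3
+        # are overwritten with the average.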
+ optimizer.apply_gradients(zip(grads, [var1, var2, var3])) + self.assertAllEqual( + [var1.numpy(), var2.numpy(), var3.numpy()], + [-0.125, 2.0, -0.125], + ) + + def testGetAndFromConfig(self): + class CustomLRSchedule(learning_rate_schedule.LearningRateSchedule): + def __init__(self, initial_learning_rate): + self.initial_learning_rate = initial_learning_rate + + def __call__(self, step): + step = tf.cast(step, tf.float32) + return self.initial_learning_rate / (step + 1) + + def get_config(self): + return {"initial_learning_rate": self.initial_learning_rate} + + learning_rate = CustomLRSchedule(0.05) + optimizer = adam_new.Adam( + learning_rate=learning_rate, + beta_1=0.7, + beta_2=0.77, + amsgrad=True, + epsilon=0.001, + clipnorm=0.5, + use_ema=True, + ema_momentum=0.5, + ema_overwrite_frequency=50, + name="custom_adam", + ) + config = optimizer.get_config() + expected_config = { + "name": "custom_adam", + "beta_1": 0.7, + "beta_2": 0.77, + "epsilon": 0.001, + "amsgrad": True, + "clipnorm": 0.5, + "global_clipnorm": None, + "clipvalue": None, + "use_ema": True, + "ema_momentum": 0.5, + "ema_overwrite_frequency": 50, + "is_legacy_optimizer": False, + } + expected_learning_rate = { + "class_name": "CustomLRSchedule", + "config": {"initial_learning_rate": 0.05}, + "module": None, + "registered_name": "CustomLRSchedule", + } + self.assertDictContainsSubset(expected_config, config) + self.assertDictEqual(expected_learning_rate, config["learning_rate"]) + + restored_optimizer = adam_new.Adam.from_config( + config, custom_objects={"CustomLRSchedule": CustomLRSchedule} + ) + self.assertDictEqual( + restored_optimizer.get_config(), optimizer.get_config() + ) + + def testCheckpointOptimizer(self): + x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32) + lr_schedule = learning_rate_schedule.ExponentialDecay( + initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9 + ) + optimizer_1 = adam_new.Adam( + learning_rate=lr_schedule, beta_1=0.8, beta_2=0.888 + ) + grads = tf.convert_to_tensor([[1.0, 2.0], [3.0, 4.0]]) + + for _ in range(1): + optimizer_1.apply_gradients(zip([grads], [x])) + + # Then save the variable and optimizer to a checkpoint. 
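+        # The checkpoint should capture both the optimizer slot variables
+        # and the iteration counter; after restoring, the two optimizers
+        # are stepped identically and their momentum slots and iteration
+        # counts are compared below.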
+ checkpoint_1 = tf.train.Checkpoint(var=x, optimizer=optimizer_1) + checkpoint_path = checkpoint_1.save(self.get_temp_dir()) + + # Create a new optimizer and call restore on it (and x) + x2 = tf.Variable([[0.0, 0.0], [0.0, 0.0]], dtype=x.dtype) + optimizer_2 = adam_new.Adam( + learning_rate=lr_schedule, beta_1=0.8, beta_2=0.888 + ) + checkpoint_2 = tf.train.Checkpoint(var=x2, optimizer=optimizer_2) + checkpoint_2.restore(checkpoint_path) + + for _ in range(2): + optimizer_1.apply_gradients(zip([grads], [x])) + optimizer_2.apply_gradients(zip([grads], [x])) + + self.assertTrue( + ( + self.evaluate(optimizer_1._momentums._storage[0]) + == self.evaluate(optimizer_2._momentums._storage[0]) + ).all() + ) + self.assertEqual( + self.evaluate(optimizer_1._iterations), + self.evaluate(optimizer_2._iterations), + ) + + def testCheckpointOptimizerWithModel(self): + inputs = keras.layers.Input(shape=(1,)) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + optimizer = adamax_new_fn() + x = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + y = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + model.compile(loss="mse", optimizer=optimizer) + path = os.path.join(self.get_temp_dir(), "ckpt") + checkpoint_callback = keras.callbacks.ModelCheckpoint(path) + model.fit(x, y, callbacks=[checkpoint_callback]) + + new_model = keras.Model(inputs=inputs, outputs=outputs) + new_optimizer = adamax_new_fn() + new_model.compile(loss="mse", optimizer=new_optimizer) + new_model.load_weights(path) + self.assertEqual( + new_model.optimizer.iterations.numpy(), + model.optimizer.iterations.numpy(), + ) + + def testRestoreOldOptimizerCheckpoint(self): + inputs = keras.layers.Input(shape=(1,)) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + optimizer = adam_old.Adam() + x = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + y = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + model.compile(loss="mse", optimizer=optimizer) + path = os.path.join(self.get_temp_dir(), "ckpt") + checkpoint_callback = keras.callbacks.ModelCheckpoint(path) + model.fit(x, y, callbacks=[checkpoint_callback]) + + new_model = keras.Model(inputs=inputs, outputs=outputs) + new_optimizer = adam_new.Adam() + new_model.compile(loss="mse", optimizer=new_optimizer) + with self.assertRaisesRegex( + ValueError, "You are trying to restore a checkpoint.*Adam.*" + ): + new_model.load_weights(path) + + @parameterized.product(optimizer_fn=OPTIMIZER_FN) + def testSaveAndLoadOptimizerWithModel(self, optimizer_fn): + inputs = keras.layers.Input(shape=(1,)) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs=inputs, outputs=outputs) + optimizer = optimizer_fn() + optimizer.clipnorm = 0.1 + x = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + y = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + model.compile(loss="mse", optimizer=optimizer) + model.fit(x, y) + + # Save in h5 format. + path = os.path.join(self.get_temp_dir(), "model.h5") + model.save(path) + loaded_model = keras.models.load_model(path) + loaded_model.load_weights(path) + loaded_optimizer = loaded_model.optimizer + self.assertEqual(type(optimizer), type(loaded_optimizer)) + self.assertEqual(loaded_optimizer.learning_rate, 0.002) + self.assertEqual(loaded_optimizer.clipnorm, 0.1) + self.assertAllClose(optimizer.variables, loaded_optimizer.variables) + + # Save in Keras SavedModel format. 
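+        # (A path with no file extension selects the directory-based TF
+        # SavedModel format; unlike the h5 branch above, the loaded
+        # optimizer has to be re-built on the trainable variables before
+        # its slot variables can be compared.)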
+ model.fit(x, y) + path = os.path.join(self.get_temp_dir(), "model") + model.save(path) + loaded_model = keras.models.load_model(path) + loaded_model.load_weights(path) + loaded_optimizer = loaded_model.optimizer + self.assertEqual(type(optimizer), type(loaded_optimizer)) + self.assertEqual(loaded_optimizer.learning_rate, 0.002) + self.assertEqual(loaded_optimizer.clipnorm, 0.1) + loaded_optimizer.build(loaded_model.trainable_variables) + self.assertAllClose(optimizer.variables, loaded_optimizer.variables) + + # Save in `.keras` format. + path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(path) + loaded_model = keras.models.load_model(path) + loaded_model.load_weights(path) + loaded_optimizer = loaded_model.optimizer + self.assertEqual(type(optimizer), type(loaded_optimizer)) + self.assertEqual(loaded_optimizer.learning_rate, 0.002) + self.assertEqual(loaded_optimizer.clipnorm, 0.1) + self.assertAllClose(optimizer.variables, loaded_optimizer.variables) + + @parameterized.product(optimizer_fn=OPTIMIZER_FN) + def testSparseGradientsWorkAsExpected(self, optimizer_fn): + optimizer_1 = optimizer_fn() + optimizer_2 = optimizer_fn() + x1 = tf.Variable(np.ones([5]), dtype=tf.float64) + x2 = tf.Variable(np.ones([5]), dtype=tf.float64) + grads = tf.convert_to_tensor([0, 1.0, 1.5, 0, 0], dtype=tf.float64) + sparse_grads = tf.IndexedSlices( + tf.convert_to_tensor([1.0, 1.5], dtype=tf.float64), + tf.convert_to_tensor([1, 2]), + dense_shape=tf.convert_to_tensor([len(grads)]), + ) + for _ in range(5): + optimizer_1.apply_gradients(zip([grads], [x1])) + optimizer_2.apply_gradients(zip([sparse_grads], [x2])) + self.assertAllClose(x1, x2) + + @test_utils.run_v2_only + def test_convert_to_legacy_optimizer(self): + if not tf.executing_eagerly(): + # The conversion could only happen in eager mode. + return + optimizer_list = [ + "adadelta", + "adagrad", + "adam", + "adamax", + "nadam", + "rmsprop", + "sgd", + "ftrl", + ] + # Test conversion does not throw errors. 
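+        # Each name is resolved twice (experimental and legacy); the
+        # converted optimizer must match the reference legacy optimizer in
+        # both class and config.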
+ for name in optimizer_list: + experimental_optimizer = keras.optimizers.get( + name, use_legacy_optimizer=False + ) + reference_legacy_optimizer = keras.optimizers.get( + name, use_legacy_optimizer=True + ) + converted_legacy_optimizer = ( + keras.optimizers.convert_to_legacy_optimizer( + experimental_optimizer + ) + ) + self.assertEqual( + type(reference_legacy_optimizer), + type(converted_legacy_optimizer), + ) + self.assertDictEqual( + reference_legacy_optimizer.get_config(), + converted_legacy_optimizer.get_config(), + ) + + lr_schedule = learning_rate_schedule.ExponentialDecay( + initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9 + ) + optimizer = adam_new.Adam(learning_rate=lr_schedule) + legacy_optimizer = keras.optimizers.convert_to_legacy_optimizer( + optimizer + ) + self.assertDictEqual( + optimizer.get_config()["learning_rate"], + legacy_optimizer.get_config()["learning_rate"], + ) + + class CustomLRSchedule(learning_rate_schedule.LearningRateSchedule): + def __init__(self, initial_learning_rate): + self.initial_learning_rate = initial_learning_rate + + def __call__(self, step): + step = tf.cast(step, tf.float32) + return self.initial_learning_rate / (step + 1) + + def get_config(self): + return {"initial_learning_rate": self.initial_learning_rate} + + lr_schedule = CustomLRSchedule(0.001) + optimizer = adam_new.Adam(learning_rate=lr_schedule) + legacy_optimizer = keras.optimizers.convert_to_legacy_optimizer( + optimizer + ) + self.assertDictEqual( + optimizer.get_config()["learning_rate"], + legacy_optimizer.get_config()["learning_rate"], + ) + + @test_utils.run_v2_only + def test_arm_mac_get_legacy_optimizer(self): + with mock.patch( + "platform.system", + mock.MagicMock(return_value="Darwin"), + ): + with mock.patch( + "platform.processor", + mock.MagicMock(return_value="arm"), + ): + optimizer = keras.optimizers.get("adam") + self.assertIsInstance(optimizer, adam_old.Adam) + + +class OptimizerRegressionTest(tf.test.TestCase, parameterized.TestCase): + """Test optimizer outputs the same numerical results as optimizer_v2.""" + + def _compare_numerical(self, old_optimizer, new_optimizer): + x1 = tf.Variable(np.ones([10]), dtype=tf.float64) + x2 = tf.Variable(np.ones([10]), dtype=tf.float64) + grads = tf.convert_to_tensor(np.arange(0.1, 1.1, 0.1)) + first_grads = tf.constant([0.01] * 10, dtype=tf.float64) + sparse_grads = tf.IndexedSlices( + tf.convert_to_tensor([0, 0.2, 0.4, 0.8, 0.8], dtype=tf.float64), + tf.convert_to_tensor([0, 2, 4, 6, 6]), + dense_shape=tf.convert_to_tensor([len(grads)]), + ) + + old_optimizer.apply_gradients(zip([first_grads], [x1])) + new_optimizer.apply_gradients(zip([first_grads], [x2])) + for _ in range(5): + self.assertAllClose(x1, x2, rtol=5e-4, atol=5e-4) + old_optimizer.apply_gradients(zip([grads], [x1])) + new_optimizer.apply_gradients(zip([grads], [x2])) + + for _ in range(5): + self.assertAllClose(x1, x2, rtol=5e-4, atol=5e-4) + old_optimizer.apply_gradients(zip([sparse_grads], [x1])) + new_optimizer.apply_gradients(zip([sparse_grads], [x2])) + + def testAdam(self): + self._compare_numerical( + adam_old.Adam(amsgrad=True), adam_new.Adam(amsgrad=True) + ) + + def testAdadelta(self): + self._compare_numerical( + adadelta_old.Adadelta(), adadelta_new.Adadelta() + ) + + def testAdagrad(self): + self._compare_numerical(adagrad_old.Adagrad(), adagrad_new.Adagrad()) + + def testFtrl(self): + self._compare_numerical(ftrl_old.Ftrl(), ftrl_new.Ftrl()) + + def testRMSprop(self): + self._compare_numerical( + rmsprop_old.RMSprop(centered=True), 
+ rmsprop_new.RMSprop(centered=True), + ) + + @parameterized.product(nesterov=[True, False]) + def testSgd(self, nesterov): + self._compare_numerical( + sgd_old.SGD(nesterov=nesterov), sgd_new.SGD(nesterov=nesterov) + ) + + def testWeightDecay(self): + self._compare_numerical( + adam_new.Adam(learning_rate=1, weight_decay=0.5, epsilon=0), + adamw_new.AdamW(learning_rate=1, weight_decay=0.5, epsilon=0), + ) + + +class DistributedTrainingTest(tf.test.TestCase, parameterized.TestCase): + @ds_combinations.generate( + tf.__internal__.test.combinations.combine( + strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN + ) + ) + def testGetGradientsInModel(self, strategy, optimizer_fn): + with strategy.scope(): + model = keras.Sequential( + [keras.layers.Input(shape=(1,)), keras.layers.Dense(1)] + ) + optimizer = optimizer_fn() + x = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + y = tf.expand_dims(tf.convert_to_tensor([1, 1, 1, 0, 0, 0]), axis=1) + model.compile(loss="mse", optimizer=optimizer) + model.fit(x, y, epochs=1, steps_per_epoch=5) + if optimizer.name == "Adam": + # Assert the momentum variable is not 0. + self.assertNotEqual( + self.evaluate(optimizer._momentums._storage[0]), 0 + ) + elif optimizer.name == "Adadelta": + # Assert the accumulated variable is not 0. + self.assertNotEqual( + self.evaluate(optimizer._accumulated_grads._storage[0]), 0 + ) + elif optimizer.name == "Adagrad": + # Assert the accumulated variable is not 0. + self.assertNotEqual( + self.evaluate(optimizer._accumulators._storage[0]), 0 + ) + + @ds_combinations.generate( + tf.__internal__.test.combinations.combine( + strategy=STRATEGIES, optimizer_fn=OPTIMIZER_FN + ) + ) + def testGetGradientsInCustomTrainingLoop(self, strategy, optimizer_fn): + with strategy.scope(): + model = keras.Sequential( + [keras.layers.Input(shape=(1,)), keras.layers.Dense(1)] + ) + optimizer = optimizer_fn() + + def per_worker_dataset_fn(): + def dataset_fn(_): + x, y = [1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0] + ds = tf.data.Dataset.from_tensor_slices((x, y)) + ds = ds.repeat().batch(6) + return ds + + return strategy.distribute_datasets_from_function(dataset_fn) + + ds = per_worker_dataset_fn() + + @tf.function + def train_step(ds): + def replica_fn(data): + features, labels = data + with tf.GradientTape() as tape: + output = model(tf.expand_dims(features, axis=1)) + loss = keras.losses.MeanSquaredError( + reduction=losses_utils.ReductionV2.NONE + )(labels, output) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients( + zip(grads, model.trainable_variables) + ) + + strategy.run(replica_fn, args=(next(iter(ds)),)) + + for _ in range(3): + train_step(ds) + self.assertEqual(self.evaluate(optimizer.iterations), 3) + + @ds_combinations.generate( + tf.__internal__.test.combinations.combine( + strategy=[ + ds_combinations.mirrored_strategy_with_two_gpus, + ds_combinations.tpu_strategy, + ds_combinations.multi_worker_mirrored_2x2_gpu, + ds_combinations.central_storage_strategy_with_two_gpus, + ] + ) + ) + def testJitCompile(self, strategy): + # Test the optimizer yields same numerical results when jit_compile is + # on and off. 
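+        # Two clones of the same model are updated in lock-step, one with
+        # an XLA-compiled optimizer and one without; after three steps
+        # their kernel weights must still agree.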
+ with strategy.scope(): + optimizer_1 = adam_new.Adam( + jit_compile=False, use_ema=True, ema_overwrite_frequency=1 + ) + optimizer_2 = adam_new.Adam( + jit_compile=True, use_ema=True, ema_overwrite_frequency=1 + ) + model_1 = keras.Sequential( + [ + keras.layers.Input(shape=(2,)), + keras.layers.Dense(5), + keras.layers.Dense(1), + ] + ) + model_2 = keras.models.clone_model(model_1) + model_2.set_weights(model_1.get_weights()) + + def per_worker_dataset_fn(): + def dataset_fn(_): + x = np.random.rand(6, 2) + y = [1, 1, 1, 0, 0, 0] + ds = tf.data.Dataset.from_tensor_slices((x, y)) + ds = ds.repeat().batch(6) + return ds + + return strategy.distribute_datasets_from_function(dataset_fn) + + ds = per_worker_dataset_fn() + + @tf.function + def train_step(ds): + def replica_fn(data): + features, labels = data + with tf.GradientTape() as tape: + output_1 = model_1(features) + loss_1 = keras.losses.MeanSquaredError( + reduction=losses_utils.ReductionV2.NONE + )(labels, output_1) + grads_1 = tape.gradient(loss_1, model_1.trainable_variables) + optimizer_1.apply_gradients( + zip(grads_1, model_1.trainable_variables), + skip_gradients_aggregation=False, + ) + + with tf.GradientTape() as tape: + output_2 = model_2(features) + loss_2 = keras.losses.MeanSquaredError( + reduction=losses_utils.ReductionV2.NONE + )(labels, output_2) + grads_2 = tape.gradient(loss_2, model_2.trainable_variables) + optimizer_2.apply_gradients( + zip(grads_2, model_2.trainable_variables), + experimental_aggregate_gradients=True, + ) + + strategy.run(replica_fn, args=(next(iter(ds)),)) + + for _ in range(3): + train_step(ds) + self.assertAllClose( + model_1.trainable_variables[0][0], + model_2.trainable_variables[0][0], + ) + + +if __name__ == "__main__": + tf.__internal__.distribute.multi_process_runner.test_main() diff --git a/keras/optimizers/optimizer_v1.py b/keras/optimizers/optimizer_v1.py index a366b2154d2e..5cb3544ecf9e 100644 --- a/keras/optimizers/optimizer_v1.py +++ b/keras/optimizers/optimizer_v1.py @@ -12,828 +12,913 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=g-classes-have-attributes + + """Legacy v1 optimizer classes. For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`. """ import tensorflow.compat.v2 as tf + from keras import backend class Optimizer: - """Abstract optimizer base class. - - Note: this is the parent class of all optimizers, not an actual optimizer - that can be used for training models. - - All Keras optimizers support the following keyword arguments: - - clipnorm: float >= 0. Gradients will be clipped - when their L2 norm exceeds this value. - clipvalue: float >= 0. Gradients will be clipped - when their absolute value exceeds this value. - """ - - def __init__(self, **kwargs): - allowed_kwargs = {'clipnorm', 'clipvalue'} - for k in kwargs: - if k not in allowed_kwargs: - raise TypeError('Unexpected keyword argument ' - 'passed to optimizer: ' + str(k)) - # checks that clipnorm >= 0 and clipvalue >= 0 - if kwargs[k] < 0: - raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k])) - self.__dict__.update(kwargs) - self.updates = [] - self.weights = [] - - # Set this to False, indicating `apply_gradients` does not take the - # `experimental_aggregate_gradients` argument. 
- _HAS_AGGREGATE_GRAD = False - - def _create_all_weights(self, params): - """Creates and sets all optimizer weights. + """Abstract optimizer base class. - Args: - params: list or tuple of `Variable` objects that will be minimized - using this optimizer. + Note: this is the parent class of all optimizers, not an actual optimizer + that can be used for training models. - Returns: - Specific weight values that are used in `get_updates` - """ - raise NotImplementedError + All Keras optimizers support the following keyword arguments: - def get_updates(self, loss, params): - raise NotImplementedError + clipnorm: float >= 0. Gradients will be clipped + when their L2 norm exceeds this value. + clipvalue: float >= 0. Gradients will be clipped + when their absolute value exceeds this value. + """ - def get_gradients(self, loss, params): - """Returns gradients of `loss` with respect to `params`. + def __init__(self, **kwargs): + allowed_kwargs = {"clipnorm", "clipvalue"} + for k in kwargs: + if k not in allowed_kwargs: + raise TypeError( + "Unexpected keyword argument passed to optimizer: " + str(k) + ) + # checks that clipnorm >= 0 and clipvalue >= 0 + if kwargs[k] < 0: + raise ValueError(f"Expected {k} >= 0, received: {kwargs[k]}") + self.__dict__.update(kwargs) + self.updates = [] + self.weights = [] + + # Set this to False, indicating `apply_gradients` does not take the + # `experimental_aggregate_gradients` argument. + _HAS_AGGREGATE_GRAD = False + + def _create_all_weights(self, params): + """Creates and sets all optimizer weights. + + Args: + params: list or tuple of `Variable` objects that will be minimized + using this optimizer. + + Returns: + Specific weight values that are used in `get_updates` + """ + raise NotImplementedError + + def get_updates(self, loss, params): + raise NotImplementedError + + def get_gradients(self, loss, params): + """Returns gradients of `loss` with respect to `params`. + + Args: + loss: Loss tensor. + params: List of variables. + + Returns: + List of gradient tensors. + + Raises: + ValueError: In case any gradient cannot be computed (e.g. if + gradient function not implemented). + """ + grads = backend.gradients(loss, params) + if any(g is None for g in grads): + raise ValueError( + "An operation has `None` for gradient. " + "Please make sure that all of your ops have a " + "gradient defined (i.e. are differentiable). " + "Common ops without gradient: " + "backend.argmax, backend.round, backend.eval." + ) + if hasattr(self, "clipnorm"): + grads = [tf.clip_by_norm(g, self.clipnorm) for g in grads] + if hasattr(self, "clipvalue"): + grads = [ + tf.clip_by_value(g, -self.clipvalue, self.clipvalue) + for g in grads + ] + return grads + + def set_weights(self, weights): + """Sets the weights of the optimizer, from Numpy arrays. + + Should only be called after computing the gradients + (otherwise the optimizer has no weights). + + Args: + weights: a list of Numpy arrays. The number of arrays and their + shape must match number of the dimensions of the weights of the + optimizer (i.e. it should match the output of `get_weights`). + + Raises: + ValueError: in case of incompatible weight shapes. 
+ """ + params = self.weights + if len(params) != len(weights): + raise ValueError( + "Length of the specified weight list (" + + str(len(weights)) + + ") does not match the number of weights of the optimizer (" + + str(len(params)) + + ")" + ) + weight_value_tuples = [] + param_values = backend.batch_get_value(params) + for pv, p, w in zip(param_values, params, weights): + if pv.shape != w.shape: + raise ValueError( + "Optimizer weight shape " + + str(pv.shape) + + " not compatible with provided weight shape " + + str(w.shape) + ) + weight_value_tuples.append((p, w)) + backend.batch_set_value(weight_value_tuples) + + def get_weights(self): + """Returns the current value of the weights of the optimizer. + + Returns: + A list of numpy arrays. + """ + return backend.batch_get_value(self.weights) + + def get_config(self): + config = {} + if hasattr(self, "clipnorm"): + config["clipnorm"] = self.clipnorm + if hasattr(self, "clipvalue"): + config["clipvalue"] = self.clipvalue + return config + + @classmethod + def from_config(cls, config): + return cls(**config) - Args: - loss: Loss tensor. - params: List of variables. - Returns: - List of gradient tensors. +class SGD(Optimizer): + """Stochastic gradient descent optimizer. - Raises: - ValueError: In case any gradient cannot be computed (e.g. if gradient - function not implemented). - """ - grads = backend.gradients(loss, params) - if any(g is None for g in grads): - raise ValueError('An operation has `None` for gradient. ' - 'Please make sure that all of your ops have a ' - 'gradient defined (i.e. are differentiable). ' - 'Common ops without gradient: ' - 'backend.argmax, backend.round, backend.eval.') - if hasattr(self, 'clipnorm'): - grads = [tf.clip_by_norm(g, self.clipnorm) for g in grads] - if hasattr(self, 'clipvalue'): - grads = [ - tf.clip_by_value(g, -self.clipvalue, self.clipvalue) - for g in grads - ] - return grads - - def set_weights(self, weights): - """Sets the weights of the optimizer, from Numpy arrays. - - Should only be called after computing the gradients - (otherwise the optimizer has no weights). + Includes support for momentum, + learning rate decay, and Nesterov momentum. Args: - weights: a list of Numpy arrays. The number of arrays and their shape - must match number of the dimensions of the weights of the optimizer - (i.e. it should match the output of `get_weights`). - - Raises: - ValueError: in case of incompatible weight shapes. + lr: float >= 0. Learning rate. + momentum: float >= 0. Parameter that accelerates SGD in the relevant + direction and dampens oscillations. + decay: float >= 0. Learning rate decay over each update. + nesterov: boolean. Whether to apply Nesterov momentum. """ - params = self.weights - if len(params) != len(weights): - raise ValueError('Length of the specified weight list (' + - str(len(weights)) + - ') does not match the number of weights ' - 'of the optimizer (' + str(len(params)) + ')') - weight_value_tuples = [] - param_values = backend.batch_get_value(params) - for pv, p, w in zip(param_values, params, weights): - if pv.shape != w.shape: - raise ValueError('Optimizer weight shape ' + str(pv.shape) + - ' not compatible with ' - 'provided weight shape ' + str(w.shape)) - weight_value_tuples.append((p, w)) - backend.batch_set_value(weight_value_tuples) - - def get_weights(self): - """Returns the current value of the weights of the optimizer. - - Returns: - A list of numpy arrays. 
- """ - return backend.batch_get_value(self.weights) - def get_config(self): - config = {} - if hasattr(self, 'clipnorm'): - config['clipnorm'] = self.clipnorm - if hasattr(self, 'clipvalue'): - config['clipvalue'] = self.clipvalue - return config + def __init__( + self, lr=0.01, momentum=0.0, decay=0.0, nesterov=False, **kwargs + ): + super().__init__(**kwargs) + with backend.name_scope(self.__class__.__name__): + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + self.lr = backend.variable(lr, name="lr") + self.momentum = backend.variable(momentum, name="momentum") + self.decay = backend.variable(decay, name="decay") + self.initial_decay = decay + self.nesterov = nesterov + + def _create_all_weights(self, params): + shapes = [backend.int_shape(p) for p in params] + moments = [backend.zeros(shape) for shape in shapes] + self.weights = [self.iterations] + moments + return moments + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( + 1.0 + / ( + 1.0 + + self.decay + * tf.cast(self.iterations, backend.dtype(self.decay)) + ) + ) + # momentum + moments = self._create_all_weights(params) + for p, g, m in zip(params, grads, moments): + v = self.momentum * m - lr * g # velocity + self.updates.append(tf.compat.v1.assign(m, v)) + + if self.nesterov: + new_p = p + self.momentum * v - lr * g + else: + new_p = p + v + + # Apply constraints. + if getattr(p, "constraint", None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(tf.compat.v1.assign(p, new_p)) + return self.updates - @classmethod - def from_config(cls, config): - return cls(**config) + def get_config(self): + config = { + "lr": float(backend.get_value(self.lr)), + "momentum": float(backend.get_value(self.momentum)), + "decay": float(backend.get_value(self.decay)), + "nesterov": self.nesterov, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) -class SGD(Optimizer): - """Stochastic gradient descent optimizer. +class RMSprop(Optimizer): + """RMSProp optimizer. - Includes support for momentum, - learning rate decay, and Nesterov momentum. + It is recommended to leave the parameters of this optimizer + at their default values + (except the learning rate, which can be freely tuned). - Args: + Args: lr: float >= 0. Learning rate. - momentum: float >= 0. Parameter that accelerates SGD in the relevant - direction and dampens oscillations. + rho: float >= 0. + epsilon: float >= 0. Fuzz factor. + If `None`, defaults to `backend.epsilon()`. decay: float >= 0. Learning rate decay over each update. - nesterov: boolean. Whether to apply Nesterov momentum. 
- """ - - def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs): - super().__init__(**kwargs) - with backend.name_scope(self.__class__.__name__): - self.iterations = backend.variable(0, dtype='int64', name='iterations') - self.lr = backend.variable(lr, name='lr') - self.momentum = backend.variable(momentum, name='momentum') - self.decay = backend.variable(decay, name='decay') - self.initial_decay = decay - self.nesterov = nesterov - - def _create_all_weights(self, params): - shapes = [backend.int_shape(p) for p in params] - moments = [backend.zeros(shape) for shape in shapes] - self.weights = [self.iterations] + moments - return moments - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( - 1. / - (1. + - self.decay * tf.cast(self.iterations, - backend.dtype(self.decay)))) - # momentum - moments = self._create_all_weights(params) - for p, g, m in zip(params, grads, moments): - v = self.momentum * m - lr * g # velocity - self.updates.append(tf.compat.v1.assign(m, v)) - - if self.nesterov: - new_p = p + self.momentum * v - lr * g - else: - new_p = p + v - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(tf.compat.v1.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(backend.get_value(self.lr)), - 'momentum': float(backend.get_value(self.momentum)), - 'decay': float(backend.get_value(self.decay)), - 'nesterov': self.nesterov - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """ + def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0.0, **kwargs): + super().__init__(**kwargs) + with backend.name_scope(self.__class__.__name__): + self.lr = backend.variable(lr, name="lr") + self.rho = backend.variable(rho, name="rho") + self.decay = backend.variable(decay, name="decay") + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + if epsilon is None: + epsilon = backend.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + accumulators = [ + backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) + for p in params + ] + self.weights = accumulators + return accumulators + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + accumulators = self._create_all_weights(params) + self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] -class RMSprop(Optimizer): - """RMSProp optimizer. - - It is recommended to leave the parameters of this optimizer - at their default values - (except the learning rate, which can be freely tuned). - - Args: - lr: float >= 0. Learning rate. - rho: float >= 0. - epsilon: float >= 0. Fuzz factor. - If `None`, defaults to `backend.epsilon()`. - decay: float >= 0. Learning rate decay over each update. 
- """ - - def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs): - super().__init__(**kwargs) - with backend.name_scope(self.__class__.__name__): - self.lr = backend.variable(lr, name='lr') - self.rho = backend.variable(rho, name='rho') - self.decay = backend.variable(decay, name='decay') - self.iterations = backend.variable(0, dtype='int64', name='iterations') - if epsilon is None: - epsilon = backend.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - accumulators = [ - backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) - for p in params] - self.weights = accumulators - return accumulators - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - accumulators = self._create_all_weights(params) - self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( - 1. / - (1. + - self.decay * tf.cast(self.iterations, - backend.dtype(self.decay)))) - - for p, g, a in zip(params, grads, accumulators): - # update accumulator - new_a = self.rho * a + (1. - self.rho) * tf.square(g) - self.updates.append(tf.compat.v1.assign(a, new_a)) - new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon) - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(tf.compat.v1.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(backend.get_value(self.lr)), - 'rho': float(backend.get_value(self.rho)), - 'decay': float(backend.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( + 1.0 + / ( + 1.0 + + self.decay + * tf.cast(self.iterations, backend.dtype(self.decay)) + ) + ) + + for p, g, a in zip(params, grads, accumulators): + # update accumulator + new_a = self.rho * a + (1.0 - self.rho) * tf.square(g) + self.updates.append(tf.compat.v1.assign(a, new_a)) + new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon) + + # Apply constraints. + if getattr(p, "constraint", None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(tf.compat.v1.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + "lr": float(backend.get_value(self.lr)), + "rho": float(backend.get_value(self.rho)), + "decay": float(backend.get_value(self.decay)), + "epsilon": self.epsilon, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) class Adagrad(Optimizer): - """Adagrad optimizer. + """Adagrad optimizer. - Adagrad is an optimizer with parameter-specific learning rates, - which are adapted relative to how frequently a parameter gets - updated during training. The more updates a parameter receives, - the smaller the updates. + Adagrad is an optimizer with parameter-specific learning rates, + which are adapted relative to how frequently a parameter gets + updated during training. The more updates a parameter receives, + the smaller the updates. - It is recommended to leave the parameters of this optimizer - at their default values. + It is recommended to leave the parameters of this optimizer + at their default values. - # Arguments - lr: float >= 0. Initial learning rate. - epsilon: float >= 0. If `None`, defaults to `backend.epsilon()`. - decay: float >= 0. Learning rate decay over each update. 
+ # Arguments + lr: float >= 0. Initial learning rate. + epsilon: float >= 0. If `None`, defaults to `backend.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + + # References + - [Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + """ + + def __init__(self, lr=0.01, epsilon=None, decay=0.0, **kwargs): + super().__init__(**kwargs) + with backend.name_scope(self.__class__.__name__): + self.lr = backend.variable(lr, name="lr") + self.decay = backend.variable(decay, name="decay") + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + if epsilon is None: + epsilon = backend.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + shapes = [backend.int_shape(p) for p in params] + accumulators = [backend.zeros(shape) for shape in shapes] + self.weights = accumulators + return accumulators + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + accumulators = self._create_all_weights(params) + + self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] - # References - - [Adaptive Subgradient Methods for Online Learning and Stochastic - Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - """ - - def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs): - super().__init__(**kwargs) - with backend.name_scope(self.__class__.__name__): - self.lr = backend.variable(lr, name='lr') - self.decay = backend.variable(decay, name='decay') - self.iterations = backend.variable(0, dtype='int64', name='iterations') - if epsilon is None: - epsilon = backend.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - shapes = [backend.int_shape(p) for p in params] - accumulators = [backend.zeros(shape) for shape in shapes] - self.weights = accumulators - return accumulators - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - accumulators = self._create_all_weights(params) - - self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( - 1. / - (1. + - self.decay * tf.cast(self.iterations, - backend.dtype(self.decay)))) - - for p, g, a in zip(params, grads, accumulators): - new_a = a + tf.square(g) # update accumulator - self.updates.append(tf.compat.v1.assign(a, new_a)) - new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon) - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(tf.compat.v1.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(backend.get_value(self.lr)), - 'decay': float(backend.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( + 1.0 + / ( + 1.0 + + self.decay + * tf.cast(self.iterations, backend.dtype(self.decay)) + ) + ) + + for p, g, a in zip(params, grads, accumulators): + new_a = a + tf.square(g) # update accumulator + self.updates.append(tf.compat.v1.assign(a, new_a)) + new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon) + + # Apply constraints. 
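+            # (A per-variable constraint such as `NonNeg` projects the
+            # updated parameter back into its feasible set before it is
+            # assigned below.)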
+ if getattr(p, "constraint", None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(tf.compat.v1.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + "lr": float(backend.get_value(self.lr)), + "decay": float(backend.get_value(self.decay)), + "epsilon": self.epsilon, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) class Adadelta(Optimizer): - """Adadelta optimizer. - - Adadelta is a more robust extension of Adagrad - that adapts learning rates based on a moving window of gradient updates, - instead of accumulating all past gradients. This way, Adadelta continues - learning even when many updates have been done. Compared to Adagrad, in the - original version of Adadelta you don't have to set an initial learning - rate. In this version, initial learning rate and decay factor can - be set, as in most other Keras optimizers. - - It is recommended to leave the parameters of this optimizer - at their default values. - - Arguments: - lr: float >= 0. Initial learning rate, defaults to 1. - It is recommended to leave it at the default value. - rho: float >= 0. Adadelta decay factor, corresponding to fraction of - gradient to keep at each time step. - epsilon: float >= 0. Fuzz factor. - If `None`, defaults to `backend.epsilon()`. - decay: float >= 0. Initial learning rate decay. - - References: - - [Adadelta - an adaptive learning rate - method](http://arxiv.org/abs/1212.5701) - """ - - def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs): - super().__init__(**kwargs) - with backend.name_scope(self.__class__.__name__): - self.lr = backend.variable(lr, name='lr') - self.decay = backend.variable(decay, name='decay') - self.iterations = backend.variable(0, dtype='int64', name='iterations') - if epsilon is None: - epsilon = backend.epsilon() - self.rho = rho - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - shapes = [backend.int_shape(p) for p in params] - accumulators = [backend.zeros(shape) for shape in shapes] - delta_accumulators = [backend.zeros(shape) for shape in shapes] - self.weights = accumulators + delta_accumulators - return accumulators, delta_accumulators - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] - accumulators, delta_accumulators = self._create_all_weights(params) - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( - 1. / - (1. + - self.decay * tf.cast(self.iterations, - backend.dtype(self.decay)))) - - for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): - # update accumulator - new_a = self.rho * a + (1. - self.rho) * tf.square(g) - self.updates.append(tf.compat.v1.assign(a, new_a)) - - # use the new accumulator and the *old* delta_accumulator - update = g * backend.sqrt(d_a + self.epsilon) / backend.sqrt( - new_a + self.epsilon) - new_p = p - lr * update - - # Apply constraints. 
- if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(tf.compat.v1.assign(p, new_p)) - - # update delta_accumulator - new_d_a = self.rho * d_a + (1 - self.rho) * tf.square(update) - self.updates.append(tf.compat.v1.assign(d_a, new_d_a)) - return self.updates - - def get_config(self): - config = { - 'lr': float(backend.get_value(self.lr)), - 'rho': self.rho, - 'decay': float(backend.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Adadelta optimizer. + + Adadelta is a more robust extension of Adagrad + that adapts learning rates based on a moving window of gradient updates, + instead of accumulating all past gradients. This way, Adadelta continues + learning even when many updates have been done. Compared to Adagrad, in the + original version of Adadelta you don't have to set an initial learning + rate. In this version, initial learning rate and decay factor can + be set, as in most other Keras optimizers. + + It is recommended to leave the parameters of this optimizer + at their default values. + + Arguments: + lr: float >= 0. Initial learning rate, defaults to 1. + It is recommended to leave it at the default value. + rho: float >= 0. Adadelta decay factor, corresponding to fraction of + gradient to keep at each time step. + epsilon: float >= 0. Fuzz factor. + If `None`, defaults to `backend.epsilon()`. + decay: float >= 0. Initial learning rate decay. + + References: + - [Adadelta - an adaptive learning rate + method](http://arxiv.org/abs/1212.5701) + """ + + def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0.0, **kwargs): + super().__init__(**kwargs) + with backend.name_scope(self.__class__.__name__): + self.lr = backend.variable(lr, name="lr") + self.decay = backend.variable(decay, name="decay") + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + if epsilon is None: + epsilon = backend.epsilon() + self.rho = rho + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + shapes = [backend.int_shape(p) for p in params] + accumulators = [backend.zeros(shape) for shape in shapes] + delta_accumulators = [backend.zeros(shape) for shape in shapes] + self.weights = accumulators + delta_accumulators + return accumulators, delta_accumulators + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] + accumulators, delta_accumulators = self._create_all_weights(params) + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( + 1.0 + / ( + 1.0 + + self.decay + * tf.cast(self.iterations, backend.dtype(self.decay)) + ) + ) + + for p, g, a, d_a in zip( + params, grads, accumulators, delta_accumulators + ): + # update accumulator + new_a = self.rho * a + (1.0 - self.rho) * tf.square(g) + self.updates.append(tf.compat.v1.assign(a, new_a)) + + # use the new accumulator and the *old* delta_accumulator + update = ( + g + * backend.sqrt(d_a + self.epsilon) + / backend.sqrt(new_a + self.epsilon) + ) + new_p = p - lr * update + + # Apply constraints. 
+ if getattr(p, "constraint", None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(tf.compat.v1.assign(p, new_p)) + + # update delta_accumulator + new_d_a = self.rho * d_a + (1 - self.rho) * tf.square(update) + self.updates.append(tf.compat.v1.assign(d_a, new_d_a)) + return self.updates + + def get_config(self): + config = { + "lr": float(backend.get_value(self.lr)), + "rho": self.rho, + "decay": float(backend.get_value(self.decay)), + "epsilon": self.epsilon, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) class Adam(Optimizer): - """Adam optimizer. - - Default parameters follow those provided in the original paper. - - Args: - lr: float >= 0. Learning rate. - beta_1: float, 0 < beta < 1. Generally close to 1. - beta_2: float, 0 < beta < 1. Generally close to 1. - epsilon: float >= 0. Fuzz factor. - If `None`, defaults to `backend.epsilon()`. - decay: float >= 0. Learning rate decay over each update. - amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm - from the paper "On the Convergence of Adam and Beyond". - """ - - def __init__(self, - lr=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=None, - decay=0., - amsgrad=False, - **kwargs): - super().__init__(**kwargs) - with backend.name_scope(self.__class__.__name__): - self.iterations = backend.variable(0, dtype='int64', name='iterations') - self.lr = backend.variable(lr, name='lr') - self.beta_1 = backend.variable(beta_1, name='beta_1') - self.beta_2 = backend.variable(beta_2, name='beta_2') - self.decay = backend.variable(decay, name='decay') - if epsilon is None: - epsilon = backend.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - self.amsgrad = amsgrad - - def _create_all_weights(self, params): - ms = [ - backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) - for p in params] - vs = [ - backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) - for p in params] - if self.amsgrad: - vhats = [ - backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) - for p in params] - else: - vhats = [backend.zeros(1) for _ in params] - self.weights = [self.iterations] + ms + vs + vhats - return ms, vs, vhats - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( - 1. / - (1. + - self.decay * tf.cast(self.iterations, - backend.dtype(self.decay)))) - - with tf.control_dependencies([tf.compat.v1.assign_add(self.iterations, 1)]): - t = tf.cast(self.iterations, backend.floatx()) - lr_t = lr * ( - backend.sqrt(1. - tf.pow(self.beta_2, t)) / - (1. - tf.pow(self.beta_1, t))) - - ms, vs, vhats = self._create_all_weights(params) - for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats): - m_t = (self.beta_1 * m) + (1. - self.beta_1) * g - v_t = (self.beta_2 * v) + (1. - self.beta_2) * tf.square(g) - if self.amsgrad: - vhat_t = tf.maximum(vhat, v_t) - p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon) - self.updates.append(tf.compat.v1.assign(vhat, vhat_t)) - else: - p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon) - - self.updates.append(tf.compat.v1.assign(m, m_t)) - self.updates.append(tf.compat.v1.assign(v, v_t)) - new_p = p_t - - # Apply constraints. 
- if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(tf.compat.v1.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(backend.get_value(self.lr)), - 'beta_1': float(backend.get_value(self.beta_1)), - 'beta_2': float(backend.get_value(self.beta_2)), - 'decay': float(backend.get_value(self.decay)), - 'epsilon': self.epsilon, - 'amsgrad': self.amsgrad - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Adam optimizer. + + Default parameters follow those provided in the original paper. + + Args: + lr: float >= 0. Learning rate. + beta_1: float, 0 < beta < 1. Generally close to 1. + beta_2: float, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. + If `None`, defaults to `backend.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm + from the paper "On the Convergence of Adam and Beyond". + """ + + def __init__( + self, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=None, + decay=0.0, + amsgrad=False, + **kwargs, + ): + super().__init__(**kwargs) + with backend.name_scope(self.__class__.__name__): + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + self.lr = backend.variable(lr, name="lr") + self.beta_1 = backend.variable(beta_1, name="beta_1") + self.beta_2 = backend.variable(beta_2, name="beta_2") + self.decay = backend.variable(decay, name="decay") + if epsilon is None: + epsilon = backend.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + self.amsgrad = amsgrad + + def _create_all_weights(self, params): + ms = [ + backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) + for p in params + ] + vs = [ + backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) + for p in params + ] + if self.amsgrad: + vhats = [ + backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) + for p in params + ] + else: + vhats = [backend.zeros(1) for _ in params] + self.weights = [self.iterations] + ms + vs + vhats + return ms, vs, vhats + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( + 1.0 + / ( + 1.0 + + self.decay + * tf.cast(self.iterations, backend.dtype(self.decay)) + ) + ) + + with tf.control_dependencies( + [tf.compat.v1.assign_add(self.iterations, 1)] + ): + t = tf.cast(self.iterations, backend.floatx()) + lr_t = lr * ( + backend.sqrt(1.0 - tf.pow(self.beta_2, t)) + / (1.0 - tf.pow(self.beta_1, t)) + ) + + ms, vs, vhats = self._create_all_weights(params) + for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats): + m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g + v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * tf.square(g) + if self.amsgrad: + vhat_t = tf.maximum(vhat, v_t) + p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon) + self.updates.append(tf.compat.v1.assign(vhat, vhat_t)) + else: + p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon) + + self.updates.append(tf.compat.v1.assign(m, m_t)) + self.updates.append(tf.compat.v1.assign(v, v_t)) + new_p = p_t + + # Apply constraints. 
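# For reference, one bias-corrected Adam step (the amsgrad=False branch above)
# in plain NumPy; all concrete values here are illustrative only:
import numpy as np

lr, beta_1, beta_2, epsilon = 0.001, 0.9, 0.999, 1e-7
p = np.array([1.0, 2.0])
g = np.array([0.1, 0.1])
m = np.zeros_like(p)   # 1st-moment estimate
v = np.zeros_like(p)   # 2nd-moment estimate
t = 1.0                # iteration count after the assign_add above

lr_t = lr * np.sqrt(1.0 - beta_2**t) / (1.0 - beta_1**t)  # bias correction
m = beta_1 * m + (1.0 - beta_1) * g
v = beta_2 * v + (1.0 - beta_2) * g**2
p = p - lr_t * m / (np.sqrt(v) + epsilon)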
+ if getattr(p, "constraint", None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(tf.compat.v1.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + "lr": float(backend.get_value(self.lr)), + "beta_1": float(backend.get_value(self.beta_1)), + "beta_2": float(backend.get_value(self.beta_2)), + "decay": float(backend.get_value(self.decay)), + "epsilon": self.epsilon, + "amsgrad": self.amsgrad, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) class Adamax(Optimizer): - """Adamax optimizer from Adam paper's Section 7. - - It is a variant of Adam based on the infinity norm. - Default parameters follow those provided in the paper. - - Args: - lr: float >= 0. Learning rate. - beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. - epsilon: float >= 0. Fuzz factor. - If `None`, defaults to `backend.epsilon()`. - decay: float >= 0. Learning rate decay over each update. - """ - - def __init__(self, - lr=0.002, - beta_1=0.9, - beta_2=0.999, - epsilon=None, - decay=0., - **kwargs): - super().__init__(**kwargs) - with backend.name_scope(self.__class__.__name__): - self.iterations = backend.variable(0, dtype='int64', name='iterations') - self.lr = backend.variable(lr, name='lr') - self.beta_1 = backend.variable(beta_1, name='beta_1') - self.beta_2 = backend.variable(beta_2, name='beta_2') - self.decay = backend.variable(decay, name='decay') - if epsilon is None: - epsilon = backend.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - - shapes = [backend.int_shape(p) for p in params] - # zero init of 1st moment - ms = [backend.zeros(shape) for shape in shapes] - # zero init of exponentially weighted infinity norm - us = [backend.zeros(shape) for shape in shapes] - self.weights = [self.iterations] + ms + us - return ms, us - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( - 1. / - (1. + - self.decay * tf.cast(self.iterations, - backend.dtype(self.decay)))) - - with tf.control_dependencies([tf.compat.v1.assign_add(self.iterations, 1)]): - t = tf.cast(self.iterations, backend.floatx()) - lr_t = lr / (1. - tf.pow(self.beta_1, t)) - - ms, us = self._create_all_weights(params) - - for p, g, m, u in zip(params, grads, ms, us): - - m_t = (self.beta_1 * m) + (1. - self.beta_1) * g - u_t = tf.maximum(self.beta_2 * u, tf.abs(g)) - p_t = p - lr_t * m_t / (u_t + self.epsilon) - - self.updates.append(tf.compat.v1.assign(m, m_t)) - self.updates.append(tf.compat.v1.assign(u, u_t)) - new_p = p_t - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(tf.compat.v1.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(backend.get_value(self.lr)), - 'beta_1': float(backend.get_value(self.beta_1)), - 'beta_2': float(backend.get_value(self.beta_2)), - 'decay': float(backend.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Adamax optimizer from Adam paper's Section 7. + + It is a variant of Adam based on the infinity norm. + Default parameters follow those provided in the paper. + + Args: + lr: float >= 0. Learning rate. + beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. 
+ If `None`, defaults to `backend.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + """ + + def __init__( + self, + lr=0.002, + beta_1=0.9, + beta_2=0.999, + epsilon=None, + decay=0.0, + **kwargs, + ): + super().__init__(**kwargs) + with backend.name_scope(self.__class__.__name__): + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + self.lr = backend.variable(lr, name="lr") + self.beta_1 = backend.variable(beta_1, name="beta_1") + self.beta_2 = backend.variable(beta_2, name="beta_2") + self.decay = backend.variable(decay, name="decay") + if epsilon is None: + epsilon = backend.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + + shapes = [backend.int_shape(p) for p in params] + # zero init of 1st moment + ms = [backend.zeros(shape) for shape in shapes] + # zero init of exponentially weighted infinity norm + us = [backend.zeros(shape) for shape in shapes] + self.weights = [self.iterations] + ms + us + return ms, us + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( + 1.0 + / ( + 1.0 + + self.decay + * tf.cast(self.iterations, backend.dtype(self.decay)) + ) + ) + + with tf.control_dependencies( + [tf.compat.v1.assign_add(self.iterations, 1)] + ): + t = tf.cast(self.iterations, backend.floatx()) + lr_t = lr / (1.0 - tf.pow(self.beta_1, t)) + + ms, us = self._create_all_weights(params) + + for p, g, m, u in zip(params, grads, ms, us): + + m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g + u_t = tf.maximum(self.beta_2 * u, tf.abs(g)) + p_t = p - lr_t * m_t / (u_t + self.epsilon) + + self.updates.append(tf.compat.v1.assign(m, m_t)) + self.updates.append(tf.compat.v1.assign(u, u_t)) + new_p = p_t + + # Apply constraints. + if getattr(p, "constraint", None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(tf.compat.v1.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + "lr": float(backend.get_value(self.lr)), + "beta_1": float(backend.get_value(self.beta_1)), + "beta_2": float(backend.get_value(self.beta_2)), + "decay": float(backend.get_value(self.decay)), + "epsilon": self.epsilon, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) class Nadam(Optimizer): - """Nesterov Adam optimizer. - - Much like Adam is essentially RMSprop with momentum, - Nadam is Adam RMSprop with Nesterov momentum. - - Default parameters follow those provided in the paper. - It is recommended to leave the parameters of this optimizer - at their default values. - - Args: - lr: float >= 0. Learning rate. - beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. - epsilon: float >= 0. Fuzz factor. - If `None`, defaults to `backend.epsilon()`. 
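# The Adamax loop above swaps Adam's second moment for an exponentially
# weighted infinity norm; a short NumPy sketch of one such step, with
# illustrative inputs:
import numpy as np

lr, beta_1, beta_2, epsilon = 0.002, 0.9, 0.999, 1e-7
p = np.array([1.0, 2.0])
g = np.array([0.1, -0.2])
m = np.zeros_like(p)   # 1st-moment estimate
u = np.zeros_like(p)   # weighted infinity norm
t = 1.0

lr_t = lr / (1.0 - beta_1**t)          # only the 1st moment needs correction
m = beta_1 * m + (1.0 - beta_1) * g
u = np.maximum(beta_2 * u, np.abs(g))  # infinity-norm accumulator
p = p - lr_t * m / (u + epsilon)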
- """ - - def __init__(self, - lr=0.002, - beta_1=0.9, - beta_2=0.999, - epsilon=None, - schedule_decay=0.004, - **kwargs): - super().__init__(**kwargs) - with backend.name_scope(self.__class__.__name__): - self.iterations = backend.variable(0, dtype='int64', name='iterations') - self.m_schedule = backend.variable(1., name='m_schedule') - self.lr = backend.variable(lr, name='lr') - self.beta_1 = backend.variable(beta_1, name='beta_1') - self.beta_2 = backend.variable(beta_2, name='beta_2') - if epsilon is None: - epsilon = backend.epsilon() - self.epsilon = epsilon - self.schedule_decay = schedule_decay - - def _create_all_weights(self, params): - shapes = [backend.int_shape(p) for p in params] - ms = [backend.zeros(shape) for shape in shapes] - vs = [backend.zeros(shape) for shape in shapes] - - self.weights = [self.iterations, self.m_schedule] + ms + vs - return ms, vs - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [] - - with tf.control_dependencies([tf.compat.v1.assign_add(self.iterations, 1)]): - t = tf.cast(self.iterations, backend.floatx()) - - # Due to the recommendations in [2], i.e. warming momentum schedule - momentum_cache_t = self.beta_1 * ( - 1. - 0.5 * - (tf.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay))) - momentum_cache_t_1 = self.beta_1 * ( - 1. - 0.5 * - (tf.pow(backend.cast_to_floatx(0.96), - (t + 1) * self.schedule_decay))) - m_schedule_new = self.m_schedule * momentum_cache_t - m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1 - self.updates.append((self.m_schedule, m_schedule_new)) - - ms, vs = self._create_all_weights(params) - - for p, g, m, v in zip(params, grads, ms, vs): - # the following equations given in [1] - g_prime = g / (1. - m_schedule_new) - m_t = self.beta_1 * m + (1. - self.beta_1) * g - m_t_prime = m_t / (1. - m_schedule_next) - v_t = self.beta_2 * v + (1. - self.beta_2) * tf.square(g) - v_t_prime = v_t / (1. - tf.pow(self.beta_2, t)) - m_t_bar = (1. - - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime - - self.updates.append(tf.compat.v1.assign(m, m_t)) - self.updates.append(tf.compat.v1.assign(v, v_t)) - - p_t = p - self.lr * m_t_bar / (backend.sqrt(v_t_prime) + self.epsilon) - new_p = p_t - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(tf.compat.v1.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(backend.get_value(self.lr)), - 'beta_1': float(backend.get_value(self.beta_1)), - 'beta_2': float(backend.get_value(self.beta_2)), - 'epsilon': self.epsilon, - 'schedule_decay': self.schedule_decay - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + """Nesterov Adam optimizer. + Much like Adam is essentially RMSprop with momentum, + Nadam is Adam RMSprop with Nesterov momentum. 
-class TFOptimizer(Optimizer, tf.__internal__.tracking.Trackable): - """Wrapper class for native TensorFlow optimizers.""" - - def __init__(self, optimizer, iterations=None): # pylint: disable=super-init-not-called - self.optimizer = optimizer - self._track_trackable(optimizer, name='optimizer') - if iterations is None: - with backend.name_scope(self.__class__.__name__): - self.iterations = backend.variable(0, dtype='int64', name='iterations') - else: - self.iterations = iterations - self._track_trackable(self.iterations, name='global_step') - - def _clip_gradients(self, grads): - """Clip gradients according to the clipnorm and clipvalue attributes.""" - # TFOptimizer wrapper has no gradient clipping options. - return grads - - def minimize(self, loss, var_list, grad_loss=None, tape=None): - """Mimics the `OptimizerV2.minimize` API.""" - if not callable(loss) and tape is None: - raise ValueError('`tape` is required when a `Tensor` loss is passed.') - tape = tape if tape is not None else tf.GradientTape() - - if callable(loss): - with tape: - if not callable(var_list): - tape.watch(var_list) - loss = loss() - if callable(var_list): - var_list = var_list() - - var_list = tf.nest.flatten(var_list) - if var_list: - grads = tape.gradient(loss, var_list, grad_loss) - grads_and_vars = list(zip(grads, var_list)) - self.apply_gradients(grads_and_vars) - - def apply_gradients(self, grads_and_vars): - self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations) - - def get_grads(self, loss, params): - return self.optimizer.compute_gradients(loss, params) - - def get_updates(self, loss, params): - if tf.distribute.has_strategy(): - self.updates = [] - - if not params: - # After the model vars have been created, the second call to get_updates - # is called with params as an empty list. This ensures that we call - # compute_gradients with params=None. - grads = self.optimizer.compute_gradients(loss) - else: - grads = self.optimizer.compute_gradients(loss, params) - global_step = tf.compat.v1.train.get_global_step() - opt_update = self.optimizer.apply_gradients(grads, global_step) - else: - if not params: - self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] + Default parameters follow those provided in the paper. + It is recommended to leave the parameters of this optimizer + at their default values. + + Args: + lr: float >= 0. Learning rate. + beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. + If `None`, defaults to `backend.epsilon()`. 
+ """ + + def __init__( + self, + lr=0.002, + beta_1=0.9, + beta_2=0.999, + epsilon=None, + schedule_decay=0.004, + **kwargs, + ): + super().__init__(**kwargs) + with backend.name_scope(self.__class__.__name__): + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + self.m_schedule = backend.variable(1.0, name="m_schedule") + self.lr = backend.variable(lr, name="lr") + self.beta_1 = backend.variable(beta_1, name="beta_1") + self.beta_2 = backend.variable(beta_2, name="beta_2") + if epsilon is None: + epsilon = backend.epsilon() + self.epsilon = epsilon + self.schedule_decay = schedule_decay + + def _create_all_weights(self, params): + shapes = [backend.int_shape(p) for p in params] + ms = [backend.zeros(shape) for shape in shapes] + vs = [backend.zeros(shape) for shape in shapes] + + self.weights = [self.iterations, self.m_schedule] + ms + vs + return ms, vs + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [] + + with tf.control_dependencies( + [tf.compat.v1.assign_add(self.iterations, 1)] + ): + t = tf.cast(self.iterations, backend.floatx()) + + # Due to the recommendations in [2], i.e. warming momentum schedule + momentum_cache_t = self.beta_1 * ( + 1.0 + - 0.5 + * (tf.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay)) + ) + momentum_cache_t_1 = self.beta_1 * ( + 1.0 + - 0.5 + * ( + tf.pow( + backend.cast_to_floatx(0.96), (t + 1) * self.schedule_decay + ) + ) + ) + m_schedule_new = self.m_schedule * momentum_cache_t + m_schedule_next = ( + self.m_schedule * momentum_cache_t * momentum_cache_t_1 + ) + self.updates.append((self.m_schedule, m_schedule_new)) + + ms, vs = self._create_all_weights(params) + + for p, g, m, v in zip(params, grads, ms, vs): + # the following equations given in [1] + g_prime = g / (1.0 - m_schedule_new) + m_t = self.beta_1 * m + (1.0 - self.beta_1) * g + m_t_prime = m_t / (1.0 - m_schedule_next) + v_t = self.beta_2 * v + (1.0 - self.beta_2) * tf.square(g) + v_t_prime = v_t / (1.0 - tf.pow(self.beta_2, t)) + m_t_bar = ( + 1.0 - momentum_cache_t + ) * g_prime + momentum_cache_t_1 * m_t_prime + + self.updates.append(tf.compat.v1.assign(m, m_t)) + self.updates.append(tf.compat.v1.assign(v, v_t)) + + p_t = p - self.lr * m_t_bar / ( + backend.sqrt(v_t_prime) + self.epsilon + ) + new_p = p_t + + # Apply constraints. 
+ if getattr(p, "constraint", None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(tf.compat.v1.assign(p, new_p)) return self.updates - # Updates list starts out empty because the iterations variable is - # incremented in optimizer.apply_gradients() - self.updates = [] - grads = self.optimizer.compute_gradients(loss, params) - opt_update = self.optimizer.apply_gradients( - grads, global_step=self.iterations) + def get_config(self): + config = { + "lr": float(backend.get_value(self.lr)), + "beta_1": float(backend.get_value(self.beta_1)), + "beta_2": float(backend.get_value(self.beta_2)), + "epsilon": self.epsilon, + "schedule_decay": self.schedule_decay, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - self.updates.append(opt_update) - return self.updates - @property - def weights(self): - raise NotImplementedError +class TFOptimizer(Optimizer, tf.__internal__.tracking.Trackable): + """Wrapper class for native TensorFlow optimizers.""" + + def __init__(self, optimizer, iterations=None): + self.optimizer = optimizer + self._track_trackable(optimizer, name="optimizer") + if iterations is None: + with backend.name_scope(self.__class__.__name__): + self.iterations = backend.variable( + 0, dtype="int64", name="iterations" + ) + else: + self.iterations = iterations + self._track_trackable(self.iterations, name="global_step") + + def _clip_gradients(self, grads): + """Clip gradients according to the clipnorm and clipvalue attributes.""" + # TFOptimizer wrapper has no gradient clipping options. + return grads + + def minimize(self, loss, var_list, grad_loss=None, tape=None): + """Mimics the `OptimizerV2.minimize` API.""" + if not callable(loss) and tape is None: + raise ValueError( + "`tape` is required when a `Tensor` loss is passed." + ) + tape = tape if tape is not None else tf.GradientTape() + + if callable(loss): + with tape: + if not callable(var_list): + tape.watch(var_list) + loss = loss() + if callable(var_list): + var_list = var_list() + + var_list = tf.nest.flatten(var_list) + if var_list: + grads = tape.gradient(loss, var_list, grad_loss) + grads_and_vars = list(zip(grads, var_list)) + self.apply_gradients(grads_and_vars) + + def apply_gradients(self, grads_and_vars): + self.optimizer.apply_gradients( + grads_and_vars, global_step=self.iterations + ) + + def get_grads(self, loss, params): + return self.optimizer.compute_gradients(loss, params) + + def get_updates(self, loss, params): + if tf.distribute.has_strategy(): + self.updates = [] + + if not params: + # After the model vars have been created, the second call to + # get_updates is called with params as an empty list. This + # ensures that we call compute_gradients with params=None. 
+ grads = self.optimizer.compute_gradients(loss) + else: + grads = self.optimizer.compute_gradients(loss, params) + global_step = tf.compat.v1.train.get_global_step() + opt_update = self.optimizer.apply_gradients(grads, global_step) + else: + if not params: + self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] + return self.updates + + # Updates list starts out empty because the iterations variable is + # incremented in optimizer.apply_gradients() + self.updates = [] + grads = self.optimizer.compute_gradients(loss, params) + opt_update = self.optimizer.apply_gradients( + grads, global_step=self.iterations + ) + + self.updates.append(opt_update) + return self.updates + + @property + def weights(self): + raise NotImplementedError - def get_config(self): - raise NotImplementedError + def get_config(self): + raise NotImplementedError - def from_config(self, config): - raise NotImplementedError + def from_config(self, config): + raise NotImplementedError # Aliases. diff --git a/keras/optimizers/optimizer_v1_test.py b/keras/optimizers/optimizer_v1_test.py new file mode 100644 index 000000000000..977d573ee5b6 --- /dev/null +++ b/keras/optimizers/optimizer_v1_test.py @@ -0,0 +1,304 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras optimizers.""" + +import gc +import weakref + +import numpy as np +import tensorflow.compat.v2 as tf + +import keras +from keras.optimizers import optimizer_v1 +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import np_utils + +# isort: off +from tensorflow.python.training.adam import AdamOptimizer +from tensorflow.python.training.experimental.loss_scale_optimizer import ( # noqa: E501 + MixedPrecisionLossScaleOptimizer, +) + + +def _get_model(input_dim, num_hidden, output_dim): + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + num_hidden, activation="relu", input_shape=(input_dim,) + ) + ) + model.add(keras.layers.Dense(output_dim, activation="softmax")) + return model + + +@test_combinations.run_all_keras_modes +class KerasOptimizersTest(test_combinations.TestCase): + def _test_optimizer(self, optimizer, target=0.75): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=1000, + test_samples=200, + input_shape=(10,), + num_classes=2, + ) + y_train = np_utils.to_categorical(y_train) + model = _get_model(x_train.shape[1], 20, y_train.shape[1]) + model.compile( + loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + np.testing.assert_equal( + keras.backend.get_value(model.optimizer.iterations), 0 + ) + history = model.fit( + x_train, y_train, epochs=2, batch_size=16, verbose=0 + ) + np.testing.assert_equal( + keras.backend.get_value(model.optimizer.iterations), 126 + ) # 63 steps per epoch + self.assertGreaterEqual(history.history["acc"][-1], target) + config = keras.optimizers.serialize(optimizer) + optim = keras.optimizers.deserialize(config) + new_config = keras.optimizers.serialize(optim) + new_config["class_name"] = new_config["class_name"].lower() + new_config["config"].pop("name", None) + if "amsgrad" not in config["config"]: + new_config["config"].pop("amsgrad", None) + if ( + "decay" in new_config["config"] + and "schedule_decay" in config["config"] + ): + new_config["config"]["schedule_decay"] = new_config["config"].pop( + "decay" + ) + if "momentum" not in config["config"]: + new_config["config"].pop("momentum", None) + if "centered" not in config["config"]: + new_config["config"].pop("centered", None) + self.assertDictEqual(config, new_config) + + # Test constraints. 
+ model = keras.models.Sequential() + dense = keras.layers.Dense( + 10, + input_shape=(x_train.shape[1],), + kernel_constraint=lambda x: 0.0 * x + 1.0, + bias_constraint=lambda x: 0.0 * x + 2.0, + activation="relu", + ) + model.add(dense) + model.add(keras.layers.Dense(y_train.shape[1], activation="softmax")) + model.compile( + loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + np.testing.assert_equal( + keras.backend.get_value(model.optimizer.iterations), 126 + ) # Using same optimizer from before + model.train_on_batch(x_train[:10], y_train[:10]) + np.testing.assert_equal( + keras.backend.get_value(model.optimizer.iterations), 127 + ) + kernel, bias = dense.get_weights() + np.testing.assert_allclose(kernel, 1.0, atol=1e-3) + np.testing.assert_allclose(bias, 2.0, atol=1e-3) + + def test_sgd(self): + with self.cached_session(): + self._test_optimizer(optimizer_v1.SGD()) + + def test_momentum(self): + with self.cached_session(): + self._test_optimizer( + optimizer_v1.SGD(lr=0.01, momentum=0.9, nesterov=True) + ) + + def test_rmsprop(self): + with self.cached_session(): + self._test_optimizer(optimizer_v1.RMSprop()) + self._test_optimizer(optimizer_v1.RMSprop(decay=1e-3)) + + def test_adagrad(self): + with self.cached_session(): + self._test_optimizer(optimizer_v1.Adagrad()) + self._test_optimizer(optimizer_v1.Adagrad(decay=1e-3)) + + def test_adadelta(self): + with self.cached_session(): + self._test_optimizer(optimizer_v1.Adadelta(), target=0.6) + # Accuracy seems dependent on the initialization. Even adding + # tf.compat.v1.Print nodes in the graph seemed to affect the + # initialization seed, and hence the accuracy. + self._test_optimizer(optimizer_v1.Adadelta(decay=1e-3), target=0.4) + + def test_adam(self): + with self.cached_session(): + self._test_optimizer(optimizer_v1.Adam()) + # Accuracy seems dependent on the seed initialization. + # TODO(b/121051441): fix test flakiness. 
+ self._test_optimizer(optimizer_v1.Adam(decay=1e-3), target=0.73) + self._test_optimizer(optimizer_v1.Adam(amsgrad=True)) + + def test_adamax(self): + with self.cached_session(): + self._test_optimizer(optimizer_v1.Adamax()) + self._test_optimizer(optimizer_v1.Adamax(decay=1e-3)) + + def test_nadam(self): + with self.cached_session(): + self._test_optimizer(optimizer_v1.Nadam()) + + def test_clipnorm(self): + with self.cached_session(): + self._test_optimizer( + optimizer_v1.SGD(lr=0.01, momentum=0.9, clipnorm=0.5) + ) + + def test_clipvalue(self): + with self.cached_session(): + self._test_optimizer( + optimizer_v1.SGD(lr=0.01, momentum=0.9, clipvalue=0.5) + ) + + def test_tf_optimizer(self): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + 2, + input_shape=(3,), + kernel_constraint=keras.constraints.MaxNorm(1), + ) + ) + # This is possible + model.compile( + loss="mean_squared_error", + optimizer=optimizer, + run_eagerly=test_utils.should_run_eagerly(), + ) + keras.backend.track_tf_optimizer(optimizer) + model.fit( + np.random.random((5, 3)), + np.random.random((5, 2)), + epochs=1, + batch_size=5, + verbose=0, + ) + # not supported + with self.assertRaises(NotImplementedError): + _ = optimizer.weights + with self.assertRaises(NotImplementedError): + optimizer.get_config() + with self.assertRaises(NotImplementedError): + optimizer.from_config(None) + + def test_optimizer_garbage_collection(self): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + graph = tf.Graph() + with graph.as_default(): + optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) + keras.backend.track_tf_optimizer(optimizer) + optimizer_weak = weakref.ref(optimizer) + graph_weak = weakref.ref(graph) + del graph, optimizer + gc.collect() + # Check that the weak references are dead now. 
+ self.assertIs(graph_weak(), None) + self.assertIs(optimizer_weak(), None) + + def test_tf_optimizer_iterations(self): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + with self.cached_session(): + optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + 2, + input_shape=(3,), + kernel_constraint=keras.constraints.MaxNorm(1), + ) + ) + model.compile( + loss="mean_squared_error", + optimizer=optimizer, + run_eagerly=test_utils.should_run_eagerly(), + ) + keras.backend.track_tf_optimizer(optimizer) + self.assertEqual( + keras.backend.get_value(model.optimizer.iterations), 0 + ) + + model.fit( + np.random.random((55, 3)), + np.random.random((55, 2)), + epochs=1, + batch_size=5, + verbose=0, + ) + self.assertEqual( + keras.backend.get_value(model.optimizer.iterations), 11 + ) + + def test_negative_clipvalue_or_clipnorm(self): + with self.assertRaises(ValueError): + _ = optimizer_v1.SGD(lr=0.01, clipvalue=-0.5) + with self.assertRaises(ValueError): + _ = optimizer_v1.Adam(clipnorm=-2.0) + + def test_mixed_precision_loss_scale_optimizer(self): + if tf.executing_eagerly(): + self.skipTest("v1 optimizer does not run in eager mode") + optimizer = MixedPrecisionLossScaleOptimizer(AdamOptimizer(), "dynamic") + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + 2, + input_shape=(3,), + kernel_constraint=keras.constraints.MaxNorm(1), + ) + ) + model.compile( + loss="mean_squared_error", + optimizer=optimizer, + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + np.random.random((5, 3)), + np.random.random((5, 2)), + epochs=1, + batch_size=5, + verbose=0, + ) + + def test_deserialization_error(self): + with self.assertRaisesRegex( + ValueError, "Could not interpret optimizer" + ): + keras.optimizers.get(0) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/optimizer_v2/BUILD b/keras/optimizers/optimizer_v2/BUILD deleted file mode 100644 index 2784f3a20ae3..000000000000 --- a/keras/optimizers/optimizer_v2/BUILD +++ /dev/null @@ -1,162 +0,0 @@ -# Description: -# Contains the Keras OptimizerV2 API (internal TensorFlow version). - -load("@org_keras//keras:keras.bzl", "cuda_py_test") - -package( - # TODO(scottzhu): Remove non-keras deps from TF. 
- default_visibility = [ - "//keras:friends", - "//third_party/tensorflow/python:__pkg__", - "//third_party/tensorflow/python/distribute:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", - ], - licenses = ["notice"], -) - -py_library( - name = "optimizer_v2", - srcs = [ - "adadelta.py", - "adagrad.py", - "adam.py", - "adamax.py", - "ftrl.py", - "gradient_descent.py", - "nadam.py", - "optimizer_v2.py", - "rmsprop.py", - "utils.py", - ], - srcs_version = "PY3", - deps = [ - "//:expect_tensorflow_installed", - "//keras:backend", - "//keras:backend_config", - "//keras/engine:base_layer_utils", - "//keras/initializers", - "//keras/optimizers/schedules:learning_rate_schedule", - "//keras/utils:layer_utils", - "//keras/utils:tf_utils", - ], -) - -cuda_py_test( - name = "adagrad_test", - size = "medium", - srcs = ["adagrad_test.py"], - shard_count = 4, - deps = [ - ":optimizer_v2", - "//:expect_tensorflow_installed", - "//keras/testing_infra:test_combinations", - ], -) - -cuda_py_test( - name = "adam_test", - size = "medium", - srcs = ["adam_test.py"], - shard_count = 4, - tags = [ - "no_rocm", - "no_windows", # TODO(b/171384138) - ], - deps = [ - ":optimizer_v2", - "//:expect_tensorflow_installed", - "//keras/testing_infra:test_combinations", - ], -) - -cuda_py_test( - name = "adamax_test", - size = "medium", - srcs = ["adamax_test.py"], - shard_count = 4, - # TODO(b/168527439): invalid resource variable reference on GPU for TFRT. - tags = ["no_rocm"], - deps = [ - ":optimizer_v2", - "//:expect_tensorflow_installed", - "//keras/testing_infra:test_combinations", - ], -) - -cuda_py_test( - name = "adadelta_test", - size = "medium", - srcs = ["adadelta_test.py"], - shard_count = 4, - # TODO(b/168527439): invalid resource variable reference on GPU for TFRT. - deps = [ - ":optimizer_v2", - "//:expect_tensorflow_installed", - "//keras/testing_infra:test_combinations", - ], -) - -cuda_py_test( - name = "ftrl_test", - size = "medium", - srcs = ["ftrl_test.py"], - shard_count = 4, - deps = [ - ":optimizer_v2", - "//:expect_tensorflow_installed", - ], -) - -cuda_py_test( - name = "gradient_descent_test", - size = "medium", - srcs = ["gradient_descent_test.py"], - shard_count = 4, - deps = [ - ":optimizer_v2", - "//:expect_tensorflow_installed", - "//keras/testing_infra:test_combinations", - ], -) - -cuda_py_test( - name = "nadam_test", - size = "medium", - srcs = ["nadam_test.py"], - shard_count = 4, - deps = [ - ":optimizer_v2", - "//:expect_tensorflow_installed", - ], -) - -cuda_py_test( - name = "optimizer_v2_test", - size = "medium", - srcs = ["optimizer_v2_test.py"], - shard_count = 8, - tags = [ - "no_windows", - ], - deps = [ - ":optimizer_v2", - "//:expect_absl_installed", - "//:expect_tensorflow_installed", - "//keras", - "//keras/testing_infra:test_combinations", - ], -) - -cuda_py_test( - name = "rmsprop_test", - size = "medium", - srcs = ["rmsprop_test.py"], - shard_count = 2, - # TODO(b/168527439): invalid resource variable reference on GPU for TFRT. - deps = [ - ":optimizer_v2", - "//:expect_absl_installed", - "//:expect_tensorflow_installed", - "//keras/testing_infra:test_combinations", - ], -) diff --git a/keras/optimizers/optimizer_v2/adadelta.py b/keras/optimizers/optimizer_v2/adadelta.py deleted file mode 100644 index 378e756ad050..000000000000 --- a/keras/optimizers/optimizer_v2/adadelta.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Adadelta optimizer implementation.""" - -import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes - -import numpy as np -from keras import backend_config -from keras.optimizers.optimizer_v2 import optimizer_v2 -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@keras_export('keras.optimizers.Adadelta') -class Adadelta(optimizer_v2.OptimizerV2): - r"""Optimizer that implements the Adadelta algorithm. - - Adadelta optimization is a stochastic gradient descent method that is based on - adaptive learning rate per dimension to address two drawbacks: - - - The continual decay of learning rates throughout training. - - The need for a manually selected global learning rate. - - Adadelta is a more robust extension of Adagrad that adapts learning rates - based on a moving window of gradient updates, instead of accumulating all - past gradients. This way, Adadelta continues learning even when many updates - have been done. Compared to Adagrad, in the original version of Adadelta you - don't have to set an initial learning rate. In this version, the initial - learning rate can be set, as in most other Keras optimizers. - - Args: - learning_rate: Initial value for the learning rate: - either a floating point value, - or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. - Note that `Adadelta` tends to benefit from higher initial learning rate - values compared to other optimizers. - To match the exact form in the original paper, use 1.0. - rho: A `Tensor` or a floating point value. The decay rate. - epsilon: Small floating point value used to maintain numerical stability. - name: Optional name prefix for the operations created when applying - gradients. Defaults to `"Adadelta"`. - **kwargs: keyword arguments. Allowed arguments are `clipvalue`, - `clipnorm`, `global_clipnorm`. - If `clipvalue` (float) is set, the gradient of each weight - is clipped to be no higher than this value. - If `clipnorm` (float) is set, the gradient of each weight - is individually clipped so that its norm is no higher than this value. - If `global_clipnorm` (float) is set the gradient of all weights is - clipped so that their global norm is no higher than this value. - - Reference: - - [Zeiler, 2012](http://arxiv.org/abs/1212.5701) - """ - - _HAS_AGGREGATE_GRAD = True - - def __init__(self, - learning_rate=0.001, - rho=0.95, - epsilon=1e-7, - name='Adadelta', - **kwargs): - super().__init__(name, **kwargs) - self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) - self._set_hyper('decay', self._initial_decay) - self._set_hyper('rho', rho) - self.epsilon = epsilon or backend_config.epsilon() - - def _create_slots(self, var_list): - # Separate for-loops to respect the ordering of slot variables from v1. 
- for v in var_list: - self.add_slot(v, 'accum_grad') - for v in var_list: - self.add_slot(v, 'accum_var') - - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - apply_state[(var_device, var_dtype)].update( - dict( - epsilon=tf.convert_to_tensor( - self.epsilon, var_dtype), - rho=tf.identity(self._get_hyper('rho', var_dtype)))) - - def set_weights(self, weights): - params = self.weights - # Override set_weights for backward compatibility of Keras V1 optimizer - # since it does not include iteration at head of the weight list. Set - # iteration to 0. - if len(params) == len(weights) + 1: - weights = [np.array(0)] + weights - super().set_weights(weights) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - accum_grad = self.get_slot(var, 'accum_grad') - accum_var = self.get_slot(var, 'accum_var') - return tf.raw_ops.ResourceApplyAdadelta( - var=var.handle, - accum=accum_grad.handle, - accum_update=accum_var.handle, - lr=coefficients['lr_t'], - rho=coefficients['rho'], - epsilon=coefficients['epsilon'], - grad=grad, - use_locking=self._use_locking) - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - accum_grad = self.get_slot(var, 'accum_grad') - accum_var = self.get_slot(var, 'accum_var') - return tf.raw_ops.ResourceSparseApplyAdadelta( - var=var.handle, - accum=accum_grad.handle, - accum_update=accum_var.handle, - lr=coefficients['lr_t'], - rho=coefficients['rho'], - epsilon=coefficients['epsilon'], - grad=grad, - indices=indices, - use_locking=self._use_locking) - - def get_config(self): - config = super().get_config() - config.update({ - 'learning_rate': self._serialize_hyperparameter('learning_rate'), - 'decay': self._initial_decay, - 'rho': self._serialize_hyperparameter('rho'), - 'epsilon': self.epsilon, - }) - return config diff --git a/keras/optimizers/optimizer_v2/adadelta_test.py b/keras/optimizers/optimizer_v2/adadelta_test.py deleted file mode 100644 index db768532e3a5..000000000000 --- a/keras/optimizers/optimizer_v2/adadelta_test.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for Adadelta Optimizer.""" - -import tensorflow.compat.v2 as tf - -from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations -from keras.optimizers.optimizer_v2 import adadelta - -_DATA_TYPES = [ - tf.half, tf.float32, tf.float64, tf.complex64, - tf.complex128 -] - - -class AdadeltaOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - def doTestBasic(self, use_resource=False, use_callable_params=False): - num_updates = 4 # number of ADADELTA steps to perform - for dtype in _DATA_TYPES: - for grad in [0.2, 0.1, 0.01]: - for lr in [1.0, 0.5, 0.1]: - var0_init = [1.0, 2.0] - var1_init = [3.0, 4.0] - if use_resource: - var0 = tf.Variable(var0_init, dtype=dtype) - var1 = tf.Variable(var1_init, dtype=dtype) - else: - var0 = tf.Variable(var0_init, dtype=dtype) - var1 = tf.Variable(var1_init, dtype=dtype) - - grads = tf.constant([grad, grad], dtype=dtype) - - accum = 0.0 - accum_update = 0.0 - - # ADADELTA gradient optimizer - rho = 0.95 - epsilon = 1e-8 - if use_callable_params: - adadelta_opt = adadelta.Adadelta( - learning_rate=lambda: lr, # pylint: disable=cell-var-from-loop - rho=lambda: rho, # pylint: disable=cell-var-from-loop - epsilon=epsilon) # pylint: disable=cell-var-from-loop - else: - adadelta_opt = adadelta.Adadelta( - learning_rate=lr, rho=rho, epsilon=epsilon) - if not tf.executing_eagerly(): - adadelta_update = adadelta_opt.apply_gradients( - zip([grads, grads], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Assign slots - slot = [None] * 2 - slot_update = [None] * 2 - slot[0] = adadelta_opt.get_slot(var0, "accum_grad") - self.assertEqual(slot[0].shape, var0.shape) - - slot_update[0] = adadelta_opt.get_slot(var0, "accum_var") - self.assertEqual(slot_update[0].shape, var0.shape) - - slot[1] = adadelta_opt.get_slot(var1, "accum_grad") - self.assertEqual(slot[1].shape, var1.shape) - - slot_update[1] = adadelta_opt.get_slot(var1, "accum_var") - self.assertEqual(slot_update[1].shape, var1.shape) - - # Fetch params to validate initial values - self.assertAllClose(var0_init, self.evaluate(var0)) - self.assertAllClose(var1_init, self.evaluate(var1)) - - update = [None] * num_updates - tot_update = 0 - for step in range(num_updates): - # Run adadelta update for comparison - if not tf.executing_eagerly(): - self.evaluate(adadelta_update) - else: - adadelta_opt.apply_gradients(zip([grads, grads], [var0, var1])) - - # Perform initial update without previous accum values - accum = accum * rho + (grad**2) * (1 - rho) - update[step] = ( - np.sqrt(accum_update + epsilon) * - (1. 
/ np.sqrt(accum + epsilon)) * grad) - accum_update = ( - accum_update * rho + (update[step]**2) * (1.0 - rho)) - tot_update += update[step] * lr - - if not tf.executing_eagerly(): - # Check that the accumulators have been updated - # TODO(lxuechen): This is hard to test in eager mode - for slot_idx in range(2): - self.assertAllCloseAccordingToType( - np.array([accum, accum], dtype=dtype.as_numpy_dtype(0)), - self.evaluate(slot[slot_idx]), - rtol=1e-5) - - self.assertAllCloseAccordingToType( - np.array( - [accum_update, accum_update], - dtype=dtype.as_numpy_dtype(0)), - self.evaluate(slot_update[slot_idx]), - rtol=1e-5) - - # Check that the parameters have been updated - self.assertAllCloseAccordingToType( - np.array( - [var0_init[0] - tot_update, var0_init[1] - tot_update], - dtype=dtype.as_numpy_dtype(0)), - self.evaluate(var0), - rtol=1e-5) - - self.assertAllCloseAccordingToType( - np.array( - [var1_init[0] - tot_update, var1_init[1] - tot_update], - dtype=dtype.as_numpy_dtype(0)), - self.evaluate(var1), - rtol=1e-5) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testResourceBasic(self): - self.doTestBasic(use_resource=True) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testBasicCallableParams(self): - self.doTestBasic(use_resource=True, use_callable_params=True) - - def testMinimizeSparseResourceVariable(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - - def loss(): - pred = tf.matmul(tf.compat.v1.nn.embedding_lookup([var0], [0]), x) # pylint: disable=cell-var-from-loop - return pred * pred - - sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize( - loss, var_list=[var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0)) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0)) - - def testConstructAdadeltaWithLR(self): - opt = adadelta.Adadelta(lr=1.0, rho=0.9, epsilon=1.) - opt_2 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1., lr=1.0) - opt_3 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1.) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - def testConstructAdadeltaWithEpsilonValues(self): - opt = adadelta.Adadelta(epsilon=None) - self.assertEqual(opt.epsilon, 1e-7) - - opt = adadelta.Adadelta(epsilon=1e-8) - self.assertEqual(opt.epsilon, 1e-8) - - -if __name__ == "__main__": - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/adagrad.py b/keras/optimizers/optimizer_v2/adagrad.py deleted file mode 100644 index c1fe8dba563b..000000000000 --- a/keras/optimizers/optimizer_v2/adagrad.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Adagrad optimizer implementation.""" - -import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes - -import numpy as np -from keras import backend_config -from keras.optimizers.optimizer_v2 import optimizer_v2 -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@keras_export('keras.optimizers.Adagrad') -class Adagrad(optimizer_v2.OptimizerV2): - r"""Optimizer that implements the Adagrad algorithm. - - Adagrad is an optimizer with parameter-specific learning rates, - which are adapted relative to how frequently a parameter gets - updated during training. The more updates a parameter receives, - the smaller the updates. - - Args: - learning_rate: Initial value for the learning rate: - either a floating point value, - or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. - Defaults to 0.001. - Note that `Adagrad` tends to benefit from higher initial learning rate - values compared to other optimizers. - To match the exact form in the original paper, use 1.0. - initial_accumulator_value: Floating point value. - Starting value for the accumulators (per-parameter momentum values). - Must be non-negative. - epsilon: Small floating point value used to maintain numerical stability. - name: Optional name prefix for the operations created when applying - gradients. Defaults to `"Adagrad"`. - **kwargs: keyword arguments. Allowed arguments are `clipvalue`, - `clipnorm`, `global_clipnorm`. - If `clipvalue` (float) is set, the gradient of each weight - is clipped to be no higher than this value. - If `clipnorm` (float) is set, the gradient of each weight - is individually clipped so that its norm is no higher than this value. - If `global_clipnorm` (float) is set the gradient of all weights is - clipped so that their global norm is no higher than this value.. - - Reference: - - [Duchi et al., 2011]( - http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf). 
- """ - - _HAS_AGGREGATE_GRAD = True - - def __init__(self, - learning_rate=0.001, - initial_accumulator_value=0.1, - epsilon=1e-7, - name='Adagrad', - **kwargs): - if initial_accumulator_value < 0.0: - raise ValueError('initial_accumulator_value must be non-negative: %s' % - initial_accumulator_value) - if epsilon is None: - epsilon = backend_config.epsilon() - super().__init__(name, **kwargs) - self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) - self._set_hyper('decay', self._initial_decay) - self._initial_accumulator_value = initial_accumulator_value - self.epsilon = epsilon or backend_config.epsilon() - - def _create_slots(self, var_list): - for var in var_list: - dtype = var.dtype.base_dtype - init = tf.compat.v1.constant_initializer( - self._initial_accumulator_value, dtype=dtype) - self.add_slot(var, 'accumulator', init) - - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - apply_state[(var_device, var_dtype)].update( - dict( - epsilon=tf.convert_to_tensor( - self.epsilon, var_dtype), - neg_lr_t=-apply_state[(var_device, var_dtype)]['lr_t'], - zero=tf.zeros((), dtype=tf.int64))) - - def set_weights(self, weights): - params = self.weights - # Override set_weights for backward compatibility of Keras V1 optimizer - # since it does not include iteration at head of the weight list. Set - # iteration to 0. - if len(params) == len(weights) + 1: - weights = [np.array(0)] + weights - super().set_weights(weights) - - @classmethod - def from_config(cls, config, custom_objects=None): - """Creates an optimizer from its config. - - This method is the reverse of `get_config`, - capable of instantiating the same optimizer from the config - dictionary. - - Args: - config: A Python dictionary, typically the output of get_config. - custom_objects: A Python dictionary mapping names to additional Python - objects used to create this optimizer, such as a function used for a - hyperparameter. - - Returns: - An optimizer instance. 
- """ - if 'initial_accumulator_value' not in config: - config['initial_accumulator_value'] = 0.1 - if 'lr' in config: - config['learning_rate'] = config.pop('lr') - return cls(**config) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - acc = self.get_slot(var, 'accumulator') - return tf.raw_ops.ResourceApplyAdagradV2( - var=var.handle, - accum=acc.handle, - lr=coefficients['lr_t'], - epsilon=coefficients['epsilon'], - grad=grad, - use_locking=self._use_locking) - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - acc = self.get_slot(var, 'accumulator') - return tf.raw_ops.ResourceSparseApplyAdagradV2( - var=var.handle, - accum=acc.handle, - lr=coefficients['lr_t'], - epsilon=coefficients['epsilon'], - grad=grad, - indices=indices, - use_locking=self._use_locking) - - def get_config(self): - config = super().get_config() - config.update({ - 'learning_rate': self._serialize_hyperparameter('learning_rate'), - 'decay': self._initial_decay, - 'initial_accumulator_value': self._initial_accumulator_value, - 'epsilon': self.epsilon, - }) - return config diff --git a/keras/optimizers/optimizer_v2/adagrad_test.py b/keras/optimizers/optimizer_v2/adagrad_test.py deleted file mode 100644 index 7db5a0c19a07..000000000000 --- a/keras/optimizers/optimizer_v2/adagrad_test.py +++ /dev/null @@ -1,526 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Functional tests for aggregate operations.""" - -import tensorflow.compat.v2 as tf - -import copy - -from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations -from keras.optimizers.optimizer_v2 import adagrad -from keras.optimizers.schedules import learning_rate_schedule - -_DATA_TYPES = [ - tf.half, tf.float32, tf.float64, tf.complex64, - tf.complex128 -] - - -def adagrad_update_numpy(param, accum, g_t, lr=0.001, epsilon=1e-7): - accum_t = accum + g_t * g_t - param_t = param - lr * g_t / (np.sqrt(accum_t) + epsilon) - return param_t, accum_t - - -def sparse_adagrad_update_numpy(param, - accum, - gindexs, - gvalues, - lr=0.001, - epsilon=1e-7): - accum_t = copy.deepcopy(accum) - param_t = copy.deepcopy(param) - # first loop accumulates repeated indices if necessary. 
- for i in range(len(gindexs)): - gindex = gindexs[i] - gvalue = gvalues[i] - accum_t[gindex] = accum_t[gindex] + gvalue * gvalue - for i in range(len(gindexs)): - gindex = gindexs[i] - gvalue = gvalues[i] - param_t[gindex] = param_t[gindex] - lr * gvalue / ( - np.sqrt(accum_t[gindex]) + epsilon) - return param_t, accum_t - - -class AdagradOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - def doTestBasic(self, use_callable_params=False): - for dtype in _DATA_TYPES: - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = lambda: 3.0 - if not use_callable_params: - learning_rate = learning_rate() - - ada_opt = adagrad.Adagrad(learning_rate) - - accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - - if not tf.executing_eagerly(): - ada_update = ada_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllClose([1.0, 2.0], v0_val) - self.assertAllClose([3.0, 4.0], v1_val) - - # Run 3 steps of adagrad - for _ in range(3): - if not tf.executing_eagerly(): - self.evaluate(ada_update) - else: - ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np, grads0_np, - 3.0) - var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np, grads1_np, - 3.0) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasic(self): - self.doTestBasic() - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testBasicCallableParams(self): - self.doTestBasic(use_callable_params=True) - - def testBasicWithLearningRateDecay(self): - for dtype in _DATA_TYPES: - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 3.0 - decay = 0.5 - - ada_opt = adagrad.Adagrad(learning_rate, decay=decay) - - accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - - if not tf.executing_eagerly(): - ada_update = ada_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllClose([1.0, 2.0], v0_val) - self.assertAllClose([3.0, 4.0], v1_val) - - # Run 3 steps of adagrad - for t in range(3): - if not tf.executing_eagerly(): - self.evaluate(ada_update) - else: - ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - lr_np = learning_rate / (1 + decay * t) - var0_np, accum0_np = adagrad_update_numpy(var0_np, 
accum0_np, grads0_np, - lr_np) - var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np, grads1_np, - lr_np) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testBasicWithLargeEpsilon(self): - var0_np = np.array([1.0, 2.0]) - var1_np = np.array([3.0, 4.0]) - grads0_np = np.array([0.1, 0.1]) - grads1_np = np.array([0.01, 0.01]) - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 3.0 - - ada_opt = adagrad.Adagrad(learning_rate, epsilon=1.0) - - accum0_np = np.array([0.1, 0.1]) - accum1_np = np.array([0.1, 0.1]) - - if not tf.executing_eagerly(): - ada_update = ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllClose([1.0, 2.0], v0_val) - self.assertAllClose([3.0, 4.0], v1_val) - - # Run 3 steps of adagrad - for _ in range(3): - if not tf.executing_eagerly(): - self.evaluate(ada_update) - else: - ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np, grads0_np, - 3.0, 1.0) - var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np, grads1_np, - 3.0, 1.0) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testBasicWithLearningRateInverseTimeDecay(self): - for dtype in _DATA_TYPES: - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 3.0 - decay = 0.5 - lr_schedule = learning_rate_schedule.InverseTimeDecay( - learning_rate, decay_steps=1.0, decay_rate=decay) - - ada_opt = adagrad.Adagrad(lr_schedule) - - accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - - if not tf.executing_eagerly(): - ada_update = ada_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllClose([1.0, 2.0], v0_val) - self.assertAllClose([3.0, 4.0], v1_val) - - # Run 3 steps of adagrad - for t in range(3): - if not tf.executing_eagerly(): - self.evaluate(ada_update) - else: - ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - lr_np = learning_rate / (1 + decay * t) - var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np, grads0_np, - lr_np) - var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np, grads1_np, - lr_np) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testMinimizeSparseResourceVariable(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
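For reference, the dense rule that the `adagrad_update_numpy` helper above encodes is the standard Adagrad step: accumulate squared gradients, then scale each coordinate's step by the square root of its accumulator. A minimal standalone sketch (plain NumPy; `adagrad_step` and the seed values are illustrative, chosen to match `doTestBasic`):

```python
import numpy as np

def adagrad_step(param, accum, grad, lr=0.001, epsilon=1e-7):
    # Accumulate squared gradients, then take a per-coordinate scaled step.
    accum = accum + grad * grad
    param = param - lr * grad / (np.sqrt(accum) + epsilon)
    return param, accum

param = np.array([1.0, 2.0])
accum = np.full_like(param, 0.1)  # Keras seeds the accumulator at 0.1
grad = np.array([0.1, 0.1])
for _ in range(3):
    param, accum = adagrad_step(param, accum, grad, lr=3.0)
print(param, accum)  # the trajectory doTestBasic checks var0 against
```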
- with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0 = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - - def loss(): - pred = tf.matmul(tf.compat.v1.nn.embedding_lookup([var0], [0]), x) # pylint: disable=cell-var-from-loop - return pred * pred - - sgd_op = adagrad.Adagrad(1.0).minimize(loss, var_list=[var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]], - self.evaluate(var0)) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[0, 1], [3, 4]], - self.evaluate(var0), - atol=0.01) - - def testTensorLearningRate(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = tf.constant(3.0) - ada_opt = adagrad.Adagrad(learning_rate) - ada_update = ada_opt.apply_gradients(zip([grads0, grads1], - [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - # Run 3 steps of adagrad - for _ in range(3): - self.evaluate(ada_update) - var0_np, accum0_np = adagrad_update_numpy( - var0_np, accum0_np, grads0_np, learning_rate) - var1_np, accum1_np = adagrad_update_numpy( - var1_np, accum1_np, grads1_np, learning_rate) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testSparseBasic(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
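The sparse tests that follow hand the optimizer its gradients as `tf.IndexedSlices` (values, row indices, dense shape) instead of dense tensors, so only the referenced rows of the variable and its accumulator are touched. A minimal eager-mode sketch, assuming the `tf.keras` Adagrad backed by the module being deleted here:

```python
import tensorflow as tf

var = tf.Variable([1.0, 1.0, 2.0])
grad = tf.IndexedSlices(
    values=tf.constant([0.1, 0.1]),   # gradient rows
    indices=tf.constant([0, 2]),      # rows they belong to
    dense_shape=tf.constant([3]))

opt = tf.keras.optimizers.Adagrad(learning_rate=3.0)
opt.apply_gradients([(grad, var)])
print(var.numpy())  # row 1 is left untouched
```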
- with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0_np_indices = np.array([0, 2], dtype=np.int32) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np[grads0_np_indices]), - tf.constant(grads0_np_indices), tf.constant([3])) - grads1_np_indices = np.array([0, 2], dtype=np.int32) - grads1 = tf.IndexedSlices( - tf.constant(grads1_np[grads1_np_indices]), - tf.constant(grads1_np_indices), tf.constant([3])) - learning_rate = 3.0 - ada_opt = adagrad.Adagrad(learning_rate) - ada_update = ada_opt.apply_gradients(zip([grads0, grads1], - [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1)) - - accum0_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype) - - # Run 3 step of sgd - for _ in range(3): - self.evaluate(ada_update) - - var0_np, accum0_np = sparse_adagrad_update_numpy( - var0_np, accum0_np, grads0_np_indices, - grads0_np[grads0_np_indices], learning_rate) - var1_np, accum1_np = sparse_adagrad_update_numpy( - var1_np, accum1_np, grads1_np_indices, - grads1_np[grads1_np_indices], learning_rate) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testSparseSingleVarDim(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0_np = np.array([1.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - grads0_np_indices = np.array([0], dtype=np.int32) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np[grads0_np_indices]), - tf.constant(grads0_np_indices), tf.constant([3])) - learning_rate = 3.0 - ada_opt = adagrad.Adagrad(learning_rate, epsilon=1.) - ada_update = ada_opt.apply_gradients(zip([grads0], [var0])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0], self.evaluate(var0)) - - accum0_np = np.array([0.1], dtype=dtype.as_numpy_dtype) - - # Run 3 step of sgd - for _ in range(3): - self.evaluate(ada_update) - - var0_np, accum0_np = sparse_adagrad_update_numpy( - var0_np, - accum0_np, - grads0_np_indices, - grads0_np[grads0_np_indices], - learning_rate, - epsilon=1.) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - - def testSparseRepeatedIndices(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
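`testSparseRepeatedIndices` (next) asserts that a gradient listing the same row twice matches one pre-aggregated gradient: the optimizer sums values at duplicate indices before the sparse apply. A rough NumPy illustration of that summation semantics, which is exactly the equivalence the test checks:

```python
import numpy as np

indices = np.array([1, 1])          # the same row appears twice
values = np.array([[0.1], [0.1]])

unique, inverse = np.unique(indices, return_inverse=True)
summed = np.zeros((unique.size, 1))
np.add.at(summed, inverse, values)  # unbuffered += handles duplicates
print(unique, summed)               # [1] [[0.2]], the aggregated gradient
```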
- with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype) - - repeated_index_update_var = tf.Variable( - var_np, dtype=dtype) - aggregated_update_var = tf.Variable( - var_np, dtype=dtype) - grad_repeated_index = tf.IndexedSlices( - tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), - tf.constant([1, 1]), tf.constant([2, 1])) - grad_aggregated = tf.IndexedSlices( - tf.constant([0.2], shape=[1, 1], dtype=dtype), - tf.constant([1]), tf.constant([2, 1])) - repeated_update = adagrad.Adagrad(3.0).apply_gradients([ - (grad_repeated_index, repeated_index_update_var) - ]) - aggregated_update = adagrad.Adagrad(3.0).apply_gradients([ - (grad_aggregated, aggregated_update_var) - ]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose( - self.evaluate(aggregated_update_var), - self.evaluate(repeated_index_update_var)) - for _ in range(3): - self.evaluate(repeated_update) - self.evaluate(aggregated_update) - self.assertAllClose( - self.evaluate(aggregated_update_var), - self.evaluate(repeated_index_update_var)) - - def testSparseRepeatedIndicesByEmbeddingLookUp(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var_repeated = tf.Variable([1.0, 2.0], dtype=dtype) - loss_repeated = lambda: tf.reduce_sum( # pylint: disable=g-long-lambda - tf.compat.v1.nn.embedding_lookup(var_repeated, [0, 0])) # pylint: disable=cell-var-from-loop - var_aggregated = tf.Variable([1.0, 2.0], dtype=dtype) - loss_aggregated = lambda: 2 * tf.reduce_sum( # pylint: disable=g-long-lambda - tf.compat.v1.nn.embedding_lookup(var_aggregated, [0])) # pylint: disable=cell-var-from-loop - update_op_repeated = adagrad.Adagrad(2.0).minimize( - loss_repeated, var_list=[var_repeated]) - update_op_aggregated = adagrad.Adagrad(2.0).minimize( - loss_aggregated, var_list=[var_aggregated]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllCloseAccordingToType( - self.evaluate(var_repeated), self.evaluate(var_aggregated)) - for _ in range(3): - self.evaluate(update_op_repeated) - self.evaluate(update_op_aggregated) - self.assertAllCloseAccordingToType( - self.evaluate(var_repeated), self.evaluate(var_aggregated)) - - def testSparseStability(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in [tf.half]: - shape = [1, 6] - var0_np = np.array([[0.00872496, -0.106952, 0.110467, - 0.226505, -0.0147257, -0.0105945]], - dtype=dtype.as_numpy_dtype) - var0 = tf.Variable(var0_np) - grads0_np = np.array([[ - -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05, -8.4877e-05, - -9.48906e-05 - ]], - dtype=dtype.as_numpy_dtype) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np), tf.constant([0]), - tf.constant(shape)) - ada_opt = adagrad.Adagrad(1.0) - ada_update = ada_opt.apply_gradients(zip([grads0], [var0])) - slot0 = ada_opt.get_slot(var0, "accumulator") - init = tf.compat.v1.global_variables_initializer() - for _ in range(100): - self.evaluate(init) - self.evaluate(ada_update) - self.assertAllCloseAccordingToType( - np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), self.evaluate(slot0)) - self.assertAllCloseAccordingToType( - np.array([[ - 0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573, - -0.01029443 - ]]), self.evaluate(var0)) - - def testSharing(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
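`testSharing` (next) applies a single optimizer object twice and relies on both update ops reading and writing the same per-variable `accumulator` slot. A short eager sketch of inspecting that slot, assuming the optimizer_v2 implementation deleted here (exposed as `tf.keras.optimizers.legacy.Adagrad` in later releases):

```python
import tensorflow as tf

var = tf.Variable([1.0, 2.0])
opt = tf.keras.optimizers.Adagrad(learning_rate=3.0)
opt.apply_gradients([(tf.constant([0.1, 0.1]), var)])

acc = opt.get_slot(var, "accumulator")  # created lazily, one per variable
print(acc.shape == var.shape)           # True
print(acc.numpy())                      # 0.1 (seed) + 0.1**2 -> [0.11 0.11]
```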
- with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 3.0 - ada_opt = adagrad.Adagrad(learning_rate) - # Apply the optimizer twice. Both applications will use - # the same accums. - ada_update1 = ada_opt.apply_gradients(zip([grads0, grads1], - [var0, var1])) - ada_update2 = ada_opt.apply_gradients(zip([grads0, grads1], - [var0, var1])) - slot0 = ada_opt.get_slot(var0, "accumulator") - self.assertEqual(slot0.shape, var0.shape) - slot1 = ada_opt.get_slot(var1, "accumulator") - self.assertEqual(slot1.shape, var1.shape) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values. - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Mix the first and the second adagrad for 3 steps. - self.evaluate(ada_update1) - self.evaluate(ada_update2) - self.evaluate(ada_update1) - - accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - for _ in range(3): - var0_np, accum0_np = adagrad_update_numpy( - var0_np, accum0_np, grads0_np, learning_rate) - var1_np, accum1_np = adagrad_update_numpy( - var1_np, accum1_np, grads1_np, learning_rate) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testConstructAdagradWithLR(self): - opt = adagrad.Adagrad(lr=1.0) - opt_2 = adagrad.Adagrad(learning_rate=0.1, lr=1.0) - opt_3 = adagrad.Adagrad(learning_rate=0.1) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/adam.py b/keras/optimizers/optimizer_v2/adam.py deleted file mode 100644 index b96bd69c499d..000000000000 --- a/keras/optimizers/optimizer_v2/adam.py +++ /dev/null @@ -1,472 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Adam optimizer implementation.""" - -import tensorflow.compat.v2 as tf -from keras import backend_config -from keras.optimizers.optimizer_v2 import optimizer_v2 -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@keras_export('keras.optimizers.Adam') -class Adam(optimizer_v2.OptimizerV2): - r"""Optimizer that implements the Adam algorithm. - - Adam optimization is a stochastic gradient descent method that is based on - adaptive estimation of first-order and second-order moments. - - According to - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980), - the method is "*computationally - efficient, has little memory requirement, invariant to diagonal rescaling of - gradients, and is well suited for problems that are large in terms of - data/parameters*". - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use, The - learning rate. Defaults to 0.001. - beta_1: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. - beta_2: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use, The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from - the paper "On the Convergence of Adam and beyond". Defaults to `False`. - name: Optional name for the operations created when applying gradients. - Defaults to `"Adam"`. - **kwargs: keyword arguments. Allowed arguments are `clipvalue`, - `clipnorm`, `global_clipnorm`. - If `clipvalue` (float) is set, the gradient of each weight - is clipped to be no higher than this value. - If `clipnorm` (float) is set, the gradient of each weight - is individually clipped so that its norm is no higher than this value. - If `global_clipnorm` (float) is set the gradient of all weights is - clipped so that their global norm is no higher than this value. - - Usage: - - >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1) - >>> var1 = tf.Variable(10.0) - >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) == var1 - >>> step_count = opt.minimize(loss, [var1]).numpy() - >>> # The first step is `-learning_rate*sign(grad)` - >>> var1.numpy() - 9.9 - - Reference: - - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) - - [Reddi et al., 2018]( - https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`. - - Notes: - - The default value of 1e-7 for epsilon might not be a good default in - general. For example, when training an Inception network on ImageNet a - current good choice is 1.0 or 0.1. Note that since Adam uses the - formulation just before Section 2.1 of the Kingma and Ba paper rather than - the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon - hat" in the paper. 
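The "epsilon hat" distinction above is concrete: this implementation folds the bias correction into a per-step learning rate and adds epsilon outside the square root, rather than bias-correcting `m` and `v` separately as in Algorithm 1. A standalone NumPy sketch of that exact update (`adam_step` is an illustrative name; it mirrors the `adam_update_numpy` helper in the test file deleted further below):

```python
import numpy as np

def adam_step(param, m, v, grad, t, lr=0.001, b1=0.9, b2=0.999, eps=1e-7):
    # t is the 1-based step count.
    m = b1 * m + (1 - b1) * grad
    v = b2 * v + (1 - b2) * grad * grad
    lr_t = lr * np.sqrt(1 - b2 ** t) / (1 - b1 ** t)  # folded bias correction
    return param - lr_t * m / (np.sqrt(v) + eps), m, v  # eps is 'epsilon hat'

param, m, v = np.array([10.0]), 0.0, 0.0
for t in range(1, 4):
    param, m, v = adam_step(param, m, v, grad=param, t=t, lr=0.1)
print(param)  # first step moved by ~ -lr * sign(grad), as the docstring says
```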
- - The sparse implementation of this algorithm (used when the gradient is an - IndexedSlices object, typically because of `tf.gather` or an embedding - lookup in the forward pass) does apply momentum to variable slices even if - they were not used in the forward pass (meaning they have a gradient equal - to zero). Momentum decay (beta1) is also applied to the entire momentum - accumulator. This means that the sparse behavior is equivalent to the dense - behavior (in contrast to some momentum implementations which ignore momentum - unless a variable slice was actually used). - """ - - _HAS_AGGREGATE_GRAD = True - - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - amsgrad=False, - name='Adam', - **kwargs): - super().__init__(name, **kwargs) - self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) - self._set_hyper('decay', self._initial_decay) - self._set_hyper('beta_1', beta_1) - self._set_hyper('beta_2', beta_2) - self.epsilon = epsilon or backend_config.epsilon() - self.amsgrad = amsgrad - - def _create_slots(self, var_list): - # Create slots for the first and second moments. - # Separate for-loops to respect the ordering of slot variables from v1. - for var in var_list: - self.add_slot(var, 'm') - for var in var_list: - self.add_slot(var, 'v') - if self.amsgrad: - for var in var_list: - self.add_slot(var, 'vhat') - - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - - local_step = tf.cast(self.iterations + 1, var_dtype) - beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) - beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) - beta_1_power = tf.pow(beta_1_t, local_step) - beta_2_power = tf.pow(beta_2_t, local_step) - lr = (apply_state[(var_device, var_dtype)]['lr_t'] * - (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))) - apply_state[(var_device, var_dtype)].update( - dict( - lr=lr, - epsilon=tf.convert_to_tensor( - self.epsilon, var_dtype), - beta_1_t=beta_1_t, - beta_1_power=beta_1_power, - one_minus_beta_1_t=1 - beta_1_t, - beta_2_t=beta_2_t, - beta_2_power=beta_2_power, - one_minus_beta_2_t=1 - beta_2_t)) - - def set_weights(self, weights): - params = self.weights - # If the weights are generated by Keras V1 optimizer, it includes vhats - # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2 - # optimizer has 2x + 1 variables. Filter vhats out for compatibility. 
- num_vars = int((len(params) - 1) / 2) - if len(weights) == 3 * num_vars + 1: - weights = weights[:len(params)] - super().set_weights(weights) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - m = self.get_slot(var, 'm') - v = self.get_slot(var, 'v') - - if not self.amsgrad: - return tf.raw_ops.ResourceApplyAdam( - var=var.handle, - m=m.handle, - v=v.handle, - beta1_power=coefficients['beta_1_power'], - beta2_power=coefficients['beta_2_power'], - lr=coefficients['lr_t'], - beta1=coefficients['beta_1_t'], - beta2=coefficients['beta_2_t'], - epsilon=coefficients['epsilon'], - grad=grad, - use_locking=self._use_locking) - else: - vhat = self.get_slot(var, 'vhat') - return tf.raw_ops.ResourceApplyAdamWithAmsgrad( - var=var.handle, - m=m.handle, - v=v.handle, - vhat=vhat.handle, - beta1_power=coefficients['beta_1_power'], - beta2_power=coefficients['beta_2_power'], - lr=coefficients['lr_t'], - beta1=coefficients['beta_1_t'], - beta2=coefficients['beta_2_t'], - epsilon=coefficients['epsilon'], - grad=grad, - use_locking=self._use_locking) - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - # m_t = beta1 * m + (1 - beta1) * g_t - m = self.get_slot(var, 'm') - m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] - m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'], - use_locking=self._use_locking) - with tf.control_dependencies([m_t]): - m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) - - # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) - v = self.get_slot(var, 'v') - v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t'] - v_t = tf.compat.v1.assign(v, v * coefficients['beta_2_t'], - use_locking=self._use_locking) - with tf.control_dependencies([v_t]): - v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) - - if not self.amsgrad: - v_sqrt = tf.sqrt(v_t) - var_update = tf.compat.v1.assign_sub( - var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']), - use_locking=self._use_locking) - return tf.group(*[var_update, m_t, v_t]) - else: - v_hat = self.get_slot(var, 'vhat') - v_hat_t = tf.maximum(v_hat, v_t) - with tf.control_dependencies([v_hat_t]): - v_hat_t = tf.compat.v1.assign( - v_hat, v_hat_t, use_locking=self._use_locking) - v_hat_sqrt = tf.sqrt(v_hat_t) - var_update = tf.compat.v1.assign_sub( - var, - coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']), - use_locking=self._use_locking) - return tf.group(*[var_update, m_t, v_t, v_hat_t]) - - def get_config(self): - config = super().get_config() - config.update({ - 'learning_rate': self._serialize_hyperparameter('learning_rate'), - 'decay': self._initial_decay, - 'beta_1': self._serialize_hyperparameter('beta_1'), - 'beta_2': self._serialize_hyperparameter('beta_2'), - 'epsilon': self.epsilon, - 'amsgrad': self.amsgrad, - }) - return config - - -class NonFusedAdam(optimizer_v2.OptimizerV2): - r"""Optimizer that implements the Adam algorithm without fused kernels. - - Adam optimization is a stochastic gradient descent method that is based on - adaptive estimation of first-order and second-order moments. 
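The fused `Adam` above hands each dense step to a single kernel (`tf.raw_ops.ResourceApplyAdam`), while `NonFusedAdam` below spells the same update out with elementwise ops inside `@tf.function(jit_compile=True)` so XLA can fuse them. The AMSGrad variant in both differs only by a running elementwise maximum of the second moment; a tiny NumPy sketch (illustrative numbers) of why that keeps the effective step size from growing:

```python
import numpy as np

vhat = np.zeros(3)                  # AMSGrad's extra accumulator
for v_t in (np.array([0.04, 0.25, 0.09]),
            np.array([0.10, 0.20, 0.09])):
    vhat = np.maximum(vhat, v_t)    # vhat never decreases...
print(vhat)                         # [0.1  0.25 0.09]
# ...so lr / (sqrt(vhat) + eps) is non-increasing per coordinate.
```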
- According to the paper
- [Adam: A Method for Stochastic Optimization. Kingma et al.,
- 2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
- efficient, has little memory requirement, invariant to diagonal rescaling of
- gradients, and is well suited for problems that are large in terms of
- data/parameters*".
-
- For AMSGrad see [On The Convergence Of Adam And Beyond.
- Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).
-
- **If amsgrad = False**:
-
- initialize $m_0$ as 1st moment vector
- initialize $v_0$ as 2nd moment vector
-
- The update rule for $\theta$ with gradient $g$ uses an optimization
- described at the end of section 2 of the paper:
-
- $$lr_t = \mathrm{learning\_rate} *
- \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
- $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
- $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
- $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-
- **If amsgrad = True**:
-
- initialize $m_0$ as 1st moment vector
- initialize $v_0$ as 2nd moment vector
- initialize $\hat{v}_0$ as 2nd moment vector
-
- The update rule for $\theta$ with gradient $g$ uses an optimization
- described at the end of section 2 of the paper:
-
- $$lr_t = \mathrm{learning\_rate} *
- \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
-
- $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
- $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
- $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
- $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
-
- The default value of 1e-7 for epsilon might not be a good default in
- general. For example, when training an Inception network on ImageNet a
- current good choice is 1.0 or 0.1. Note that since Adam uses the
- formulation just before Section 2.1 of the Kingma and Ba paper rather than
- the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
- hat" in the paper.
-
- The sparse implementation of this algorithm (used when the gradient is an
- IndexedSlices object, typically because of `tf.gather` or an embedding
- lookup in the forward pass) does apply momentum to variable slices even if
- they were not used in the forward pass (meaning they have a gradient equal
- to zero). Momentum decay (beta1) is also applied to the entire momentum
- accumulator. This means that the sparse behavior is equivalent to the dense
- behavior (in contrast to some momentum implementations which ignore momentum
- unless a variable slice was actually used).
-
- Usage:
-
- >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
- >>> var1 = tf.Variable(10.0)
- >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
- >>> step_count = opt.minimize(loss, [var1]).numpy()
- >>> # The first step is `-learning_rate*sign(grad)`
- >>> var1.numpy()
- 9.9
- """
-
- _HAS_AGGREGATE_GRAD = True
-
- def __init__(self,
- learning_rate=0.001,
- beta_1=0.9,
- beta_2=0.999,
- epsilon=1e-7,
- amsgrad=False,
- name='Adam',
- **kwargs):
- """Construct a new Adam optimizer.
-
- Args:
- learning_rate: A `Tensor`, floating point value, or a schedule that is a
- `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that
- takes no arguments and returns the actual value to use, the learning
- rate. Defaults to 0.001.
- beta_1: A float value or a constant float tensor, or a callable that takes
- no arguments and returns the actual value to use. The exponential decay
- rate for the 1st moment estimates. Defaults to 0.9.
- beta_2: A float value or a constant float tensor, or a callable that takes - no arguments and returns the actual value to use, The exponential decay - rate for the 2nd moment estimates. Defaults to 0.999. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from - the paper "On the Convergence of Adam and beyond". Defaults to `False`. - name: Optional name for the operations created when applying gradients. - Defaults to "Adam". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - """ - - super().__init__(name, **kwargs) - self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) - self._set_hyper('decay', self._initial_decay) - self._set_hyper('beta_1', beta_1) - self._set_hyper('beta_2', beta_2) - self.epsilon = epsilon or backend_config.epsilon() - self.amsgrad = amsgrad - - def _create_slots(self, var_list): - # Create slots for the first and second moments. - # Separate for-loops to respect the ordering of slot variables from v1. - for var in var_list: - self.add_slot(var, 'm') - for var in var_list: - self.add_slot(var, 'v') - if self.amsgrad: - for var in var_list: - self.add_slot(var, 'vhat') - - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - - local_step = tf.cast(self.iterations + 1, var_dtype) - beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) - beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) - beta_1_power = tf.pow(beta_1_t, local_step) - beta_2_power = tf.pow(beta_2_t, local_step) - lr = ( - apply_state[(var_device, var_dtype)]['lr_t'] * - (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))) - apply_state[(var_device, var_dtype)].update( - dict( - lr=lr, - epsilon=tf.convert_to_tensor( - self.epsilon, var_dtype), - beta_1_t=beta_1_t, - beta_1_power=beta_1_power, - one_minus_beta_1_t=1 - beta_1_t, - beta_2_t=beta_2_t, - beta_2_power=beta_2_power, - one_minus_beta_2_t=1 - beta_2_t)) - - def set_weights(self, weights): - params = self.weights - # If the weights are generated by Keras V1 optimizer, it includes vhats - # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2 - # optimizer has 2x + 1 variables. Filter vhats out for compatibility. 
- num_vars = int((len(params) - 1) / 2)
- if len(weights) == 3 * num_vars + 1:
- weights = weights[:len(params)]
- super().set_weights(weights)
-
- @tf.function(jit_compile=True)
- def _resource_apply_dense(self, grad, var, apply_state=None):
- var_device, var_dtype = var.device, var.dtype.base_dtype
- coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
- self._fallback_apply_state(var_device, var_dtype))
-
- m = self.get_slot(var, 'm')
- v = self.get_slot(var, 'v')
-
- alpha = (
- coefficients['lr_t'] * tf.sqrt(1 - coefficients['beta_2_power']) /
- (1 - coefficients['beta_1_power']))
- m.assign_add((grad - m) * (1 - coefficients['beta_1_t']))
- v.assign_add((tf.square(grad) - v) * (1 - coefficients['beta_2_t']))
- if self.amsgrad:
- vhat = self.get_slot(var, 'vhat')
- vhat.assign(tf.maximum(vhat, v))
- v = vhat
- var.assign_sub(
- (m * alpha) / (tf.sqrt(v) + coefficients['epsilon']))
-
- @tf.function(jit_compile=True)
- def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
- var_device, var_dtype = var.device, var.dtype.base_dtype
- coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
- self._fallback_apply_state(var_device, var_dtype))
-
- # m_t = beta1 * m + (1 - beta1) * g_t
- m = self.get_slot(var, 'm')
- m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
- m.assign(m * coefficients['beta_1_t'])
- m.scatter_add(tf.IndexedSlices(m_scaled_g_values, indices))
-
- # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
- v = self.get_slot(var, 'v')
- v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
- v.assign(v * coefficients['beta_2_t'])
- v.scatter_add(tf.IndexedSlices(v_scaled_g_values, indices))
-
- if not self.amsgrad:
- var.assign_sub(coefficients['lr'] * m /
- (tf.sqrt(v) + coefficients['epsilon']))
- else:
- v_hat = self.get_slot(var, 'vhat')
- v_hat.assign(tf.maximum(v_hat, v))
- var.assign_sub(coefficients['lr'] * m /
- (tf.sqrt(v_hat) + coefficients['epsilon']))
-
- def get_config(self):
- config = super().get_config()
- config.update({
- 'learning_rate': self._serialize_hyperparameter('learning_rate'),
- 'decay': self._initial_decay,
- 'beta_1': self._serialize_hyperparameter('beta_1'),
- 'beta_2': self._serialize_hyperparameter('beta_2'),
- 'epsilon': self.epsilon,
- 'amsgrad': self.amsgrad,
- })
- return config
diff --git a/keras/optimizers/optimizer_v2/adam_test.py b/keras/optimizers/optimizer_v2/adam_test.py
deleted file mode 100644
index 6384fa109596..000000000000
--- a/keras/optimizers/optimizer_v2/adam_test.py
+++ /dev/null
@@ -1,981 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================== -"""Tests for Adam.""" - -import tensorflow.compat.v2 as tf - -from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations -from keras.optimizers import optimizer_v1 -from keras.optimizers.optimizer_v2 import adam -from keras.optimizers.schedules import learning_rate_schedule - - -def adam_update_numpy(param, - g_t, - t, - m, - v, - lr=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-7): - lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1)) - - m_t = beta1 * m + (1 - beta1) * g_t - v_t = beta2 * v + (1 - beta2) * g_t * g_t - - param_t = param - lr_t * m_t / (np.sqrt(v_t) + epsilon) - return param_t, m_t, v_t - - -def adam_update_numpy_amsgrad(param, - g_t, - t, - m, - v, - vhat, - lr=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-7): - lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1)) - - m_t = beta1 * m + (1 - beta1) * g_t - v_t = beta2 * v + (1 - beta2) * g_t * g_t - vhat_t = np.maximum(vhat, v_t) - - param_t = param - lr_t * m_t / (np.sqrt(vhat_t) + epsilon) - return param_t, m_t, v_t, vhat_t - - -def adam_sparse_update_numpy_amsgrad(param, - indices, - g_t, - t, - m, - v, - vhat, - lr=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-7): - m_t, v_t, vhat_t, param_t = (np.copy(m), np.copy(v), np.copy(vhat), - np.copy(param)) - lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1)) - m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t - v_t_slice = beta2 * v[indices] + (1 - beta2) * g_t * g_t - m_t[indices] = m_t_slice - v_t[indices] = v_t_slice - v_hat_t = np.maximum(vhat_t, v_t) - v_hat_t_slice = v_hat_t[indices] - param_t_slice = param[indices] - ( - lr_t * (m_t_slice / (np.sqrt(v_hat_t_slice) + epsilon))) - param_t[indices] = param_t_slice - return param_t, m_t, v_t, vhat_t - - -def get_beta_accumulators(opt, dtype): - local_step = tf.cast(opt.iterations + 1, dtype) - beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype) - beta_1_power = tf.pow(beta_1_t, local_step) - beta_2_t = tf.cast(opt._get_hyper("beta_2"), dtype) - beta_2_power = tf.pow(beta_2_t, local_step) - return (beta_1_power, beta_2_power) - - -class AdamOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - def testSparse(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. 
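`get_beta_accumulators` above reconstructs the bias-correction terms from the optimizer's step counter: after `t` applied updates, the next step sees `beta ** (t + 1)`. A quick eager check of those powers, assuming the optimizer_v2 `tf.keras` Adam:

```python
import tensorflow as tf

opt = tf.keras.optimizers.Adam()
var = tf.Variable([1.0, 2.0])
for t in range(3):
    opt.apply_gradients([(tf.constant([0.1, 0.1]), var)])
    step = float(opt.iterations.numpy())      # == t + 1
    print(step, 0.9 ** step, 0.999 ** step)   # the powers the tests assert
```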
- m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0_np_indices = np.array([0, 2], dtype=np.int32) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np[grads0_np_indices]), - tf.constant(grads0_np_indices), tf.constant([3])) - grads1_np_indices = np.array([0, 2], dtype=np.int32) - grads1 = tf.IndexedSlices( - tf.constant(grads1_np[grads1_np_indices]), - tf.constant(grads1_np_indices), tf.constant([3])) - opt = adam.Adam() - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1)) - - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - # Run 3 steps of Adam - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - update.run() - - var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testSparseDevicePlacement(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for index_dtype in [tf.int32, tf.int64]: - with tf.Graph().as_default(), self.cached_session( - force_gpu=tf.test.is_gpu_available()): - # If a GPU is available, tests that all optimizer ops can be placed on - # it (i.e. they have GPU kernels). - var = tf.Variable([[1.0], [2.0]]) - indices = tf.constant([0, 1], dtype=index_dtype) - g_sum = lambda: tf.reduce_sum(tf.gather(var, indices)) # pylint: disable=cell-var-from-loop - optimizer = adam.Adam(3.0) - minimize_op = optimizer.minimize(g_sum, var_list=[var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - minimize_op.run() - - def testSparseRepeatedIndices(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - repeated_index_update_var = tf.Variable( - [[1.0], [2.0]], dtype=dtype) - aggregated_update_var = tf.Variable( - [[1.0], [2.0]], dtype=dtype) - grad_repeated_index = tf.IndexedSlices( - tf.constant( - [0.1, 0.1], shape=[2, 1], dtype=dtype), - tf.constant([1, 1]), - tf.constant([2, 1])) - grad_aggregated = tf.IndexedSlices( - tf.constant( - [0.2], shape=[1, 1], dtype=dtype), - tf.constant([1]), - tf.constant([2, 1])) - repeated_update = adam.Adam().apply_gradients( - [(grad_repeated_index, repeated_index_update_var)]) - aggregated_update = adam.Adam().apply_gradients( - [(grad_aggregated, aggregated_update_var)]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(aggregated_update_var, - self.evaluate(repeated_index_update_var)) - for _ in range(3): - repeated_update.run() - aggregated_update.run() - self.assertAllClose(aggregated_update_var, - self.evaluate(repeated_index_update_var)) - - def doTestBasic(self, use_callable_params=False): - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = lambda: 0.001 - beta1 = lambda: 0.9 - beta2 = lambda: 0.999 - epsilon = lambda: 1e-8 - if not use_callable_params: - learning_rate = learning_rate() - beta1 = beta1() - beta2 = beta2() - epsilon = epsilon() - - opt = adam.Adam(learning_rate=learning_rate) - if not tf.executing_eagerly(): - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of Adam - for t in range(3): - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - if not tf.executing_eagerly(): - self.evaluate(update) - else: - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testResourceBasic(self): - self.doTestBasic() - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testBasicCallableParams(self): - self.doTestBasic(use_callable_params=True) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasicWithAmsgrad(self): - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with self.cached_session(): - # Initialize variables for numpy implementation. 
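`doTestBasic` above also runs with `use_callable_params=True`: each hyperparameter may be a zero-argument callable, re-evaluated whenever the optimizer needs the value. A minimal eager sketch, assuming the optimizer_v2 implementation being deleted here:

```python
import tensorflow as tf

# The callable is invoked on each read, so it can be driven by external state.
opt = tf.keras.optimizers.Adam(learning_rate=lambda: 0.001)
var = tf.Variable([1.0])
opt.apply_gradients([(tf.constant([0.1]), var)])
print(var.numpy())
```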
- m0, v0, v0hat, m1, v1, v1hat = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - opt = adam.Adam(amsgrad=True) - if not tf.executing_eagerly(): - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of Adam - for t in range(3): - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - if not tf.executing_eagerly(): - self.evaluate(update) - else: - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - var0_np, m0, v0, v0hat = adam_update_numpy_amsgrad( - var0_np, grads0_np, t, m0, v0, v0hat) - var1_np, m1, v1, v1hat = adam_update_numpy_amsgrad( - var1_np, grads1_np, t, m1, v1, v1hat) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testSparseWithAmsgrad(self): - # dtypes.half does not work on gpu + eager. - for dtype in [tf.float32, tf.float64]: - with self.cached_session(): - m0 = np.array([[0.0], [0.0]]) - v0 = np.array([[0.0], [0.0]]) - v0hat = np.array([[0.0], [0.0]]) - indices_np = np.array([1]) - indices = tf.constant(indices_np, dtype=tf.int32) - var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype) - repeated_index_update_var = tf.Variable(var0_np, dtype=dtype) - aggregated_update_var = tf.Variable(var0_np, dtype=dtype) - grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype) - grad_repeated_index = tf.IndexedSlices( - tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), - tf.constant([1, 1]), tf.constant([2, 1])) - grad_aggregated = tf.IndexedSlices(grads0_np, indices, - tf.constant([2, 1])) - opt_repeated = adam.Adam(amsgrad=True) - opt_aggregated = adam.Adam(amsgrad=True) - if not tf.executing_eagerly(): - repeated_update = opt_repeated.apply_gradients( - [(grad_repeated_index, repeated_index_update_var)]) - aggregated_update = opt_aggregated.apply_gradients( - [(grad_aggregated, aggregated_update_var)]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose( - self.evaluate(aggregated_update_var), - self.evaluate(repeated_index_update_var)) - for t in range(3): - if not tf.executing_eagerly(): - self.evaluate(repeated_update) - self.evaluate(aggregated_update) - else: - opt_repeated.apply_gradients( - [(grad_repeated_index, repeated_index_update_var)]) - opt_aggregated.apply_gradients( - [(grad_aggregated, aggregated_update_var)]) - - var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad( - var0_np, indices_np, grads0_np, t, m0, v0, v0hat) - - # Validate updated params - self.assertAllCloseAccordingToType( - var0_np, self.evaluate(aggregated_update_var)) - self.assertAllCloseAccordingToType( - self.evaluate(aggregated_update_var), - self.evaluate(repeated_index_update_var)) - - def testBasicWithLearningRateDecay(self): - # TODO(tanzheny, omalleyt): Fix test in eager 
mode. - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 0.001 - beta_1 = 0.9 - beta_2 = 0.999 - epsilon = 1e-7 - decay = 0.5 - - opt = adam.Adam( - learning_rate=learning_rate, - beta_1=beta_1, - beta_2=beta_2, - epsilon=epsilon, - decay=decay) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of Adam - for t in range(3): - self.evaluate(update) - lr_np = learning_rate / (1 + decay * t) - - var0_np, m0, v0 = adam_update_numpy( - var0_np, grads0_np, t, m0, v0, lr=lr_np) - var1_np, m1, v1 = adam_update_numpy( - var1_np, grads1_np, t, m1, v1, lr=lr_np) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testBasicWithLearningRateInverseTimeDecay(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 0.001 - decay = 0.5 - lr_schedule = learning_rate_schedule.InverseTimeDecay( - learning_rate, decay_steps=1.0, decay_rate=decay) - beta_1 = 0.9 - beta_2 = 0.999 - epsilon = 1e-7 - - opt = adam.Adam( - learning_rate=lr_schedule, - beta_1=beta_1, - beta_2=beta_2, - epsilon=epsilon) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of Adam - for t in range(3): - self.evaluate(update) - - lr_np = learning_rate / (1 + decay * t) - - var0_np, m0, v0 = adam_update_numpy( - var0_np, grads0_np, t, m0, v0, lr=lr_np) - var1_np, m1, v1 = adam_update_numpy( - var1_np, grads1_np, t, m1, v1, lr=lr_np) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testTensorLearningRate(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. 
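Both decay tests above expect the same closed form: at step `t` the effective rate is `lr / (1 + decay * t)`, whether it comes from the legacy `decay` keyword or an explicit `InverseTimeDecay` schedule with `decay_steps=1.0`. A quick numeric check:

```python
import tensorflow as tf

lr0, decay = 0.001, 0.5
schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    lr0, decay_steps=1.0, decay_rate=decay)
for t in range(3):
    # Agree up to float32 precision: 0.001, 0.000666..., 0.0005
    print(float(schedule(t)), lr0 / (1 + decay * t))
```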
- m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - opt = adam.Adam(tf.constant(0.001)) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - # Run 3 steps of Adam - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - update.run() - - var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testSharing(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - opt = adam.Adam() - update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 3 steps of intertwined Adam1 and Adam2. - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - if t % 2 == 0: - update1.run() - else: - update2.run() - - var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testSlotsUniqueEager(self): - v1 = tf.Variable(1.) - v2 = tf.Variable(1.) - opt = adam.Adam(1.) - opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) - # There should be iteration, and two unique slot variables for v1 and v2. 
- self.assertLen(set(v.ref() for v in opt.variables()), 5) - self.assertEqual( - self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations)) - - def testSetWeightsFromV1AdamWithoutMinimize(self): - keras_v1_adam = optimizer_v1.Adam() - keras_v2_adam = adam.Adam() - keras_v2_adam.set_weights(keras_v1_adam.get_weights()) - keras_v1_iteration = keras_v1_adam.iterations - keras_v2_iteration = keras_v2_adam.iterations - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual( - self.evaluate(keras_v1_iteration), self.evaluate(keras_v2_iteration)) - - def testConstructAdamWithLR(self): - opt = adam.Adam(lr=1.0) - opt_2 = adam.Adam(learning_rate=0.1, lr=1.0) - opt_3 = adam.Adam(learning_rate=0.1) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - -class NonFusedAdamOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - def testSparse(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0_np_indices = np.array([0, 2], dtype=np.int32) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np[grads0_np_indices]), - tf.constant(grads0_np_indices), tf.constant([3])) - grads1_np_indices = np.array([0, 2], dtype=np.int32) - grads1 = tf.IndexedSlices( - tf.constant(grads1_np[grads1_np_indices]), - tf.constant(grads1_np_indices), tf.constant([3])) - opt = adam.NonFusedAdam() - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1)) - - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - # Run 3 steps of NonFusedAdam - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - update.run() - - var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testSparseDevicePlacement(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for index_dtype in [tf.int32, tf.int64]: - with tf.Graph().as_default(), self.cached_session( - force_gpu=tf.test.is_gpu_available()): - # If a GPU is available, tests that all optimizer ops can be placed on - # it (i.e. they have GPU kernels). 
- var = tf.Variable([[1.0], [2.0]]) - indices = tf.constant([0, 1], dtype=index_dtype) - g_sum = lambda: tf.reduce_sum(tf.gather(var, indices)) # pylint: disable=cell-var-from-loop - optimizer = adam.NonFusedAdam(3.0) - minimize_op = optimizer.minimize(g_sum, var_list=[var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - minimize_op.run() - - def testSparseRepeatedIndices(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - repeated_index_update_var = tf.Variable( - [[1.0], [2.0]], dtype=dtype) - aggregated_update_var = tf.Variable( - [[1.0], [2.0]], dtype=dtype) - grad_repeated_index = tf.IndexedSlices( - tf.constant( - [0.1, 0.1], shape=[2, 1], dtype=dtype), - tf.constant([1, 1]), - tf.constant([2, 1])) - grad_aggregated = tf.IndexedSlices( - tf.constant( - [0.2], shape=[1, 1], dtype=dtype), - tf.constant([1]), - tf.constant([2, 1])) - repeated_update = adam.NonFusedAdam().apply_gradients( - [(grad_repeated_index, repeated_index_update_var)]) - aggregated_update = adam.NonFusedAdam().apply_gradients( - [(grad_aggregated, aggregated_update_var)]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(aggregated_update_var, - self.evaluate(repeated_index_update_var)) - for _ in range(3): - repeated_update.run() - aggregated_update.run() - self.assertAllClose(aggregated_update_var, - self.evaluate(repeated_index_update_var)) - - def doTestBasic(self, use_callable_params=False): - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = lambda: 0.001 - beta1 = lambda: 0.9 - beta2 = lambda: 0.999 - epsilon = lambda: 1e-8 - if not use_callable_params: - learning_rate = learning_rate() - beta1 = beta1() - beta2 = beta2() - epsilon = epsilon() - - opt = adam.NonFusedAdam(learning_rate=learning_rate) - if not tf.executing_eagerly(): - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of NonFusedAdam - for t in range(3): - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - if not tf.executing_eagerly(): - self.evaluate(update) - else: - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType( - var0_np, self.evaluate(var0), rtol=1e-4, atol=1e-4) - self.assertAllCloseAccordingToType( - var1_np, self.evaluate(var1), rtol=1e-4, atol=1e-4) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testResourceBasic(self): - self.doTestBasic() - - 
@test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testBasicCallableParams(self): - self.doTestBasic(use_callable_params=True) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasicWithAmsgrad(self): - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, v0hat, m1, v1, v1hat = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - opt = adam.NonFusedAdam(amsgrad=True) - if not tf.executing_eagerly(): - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of NonFusedAdam - for t in range(3): - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - if not tf.executing_eagerly(): - self.evaluate(update) - else: - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - var0_np, m0, v0, v0hat = adam_update_numpy_amsgrad( - var0_np, grads0_np, t, m0, v0, v0hat) - var1_np, m1, v1, v1hat = adam_update_numpy_amsgrad( - var1_np, grads1_np, t, m1, v1, v1hat) - - # Validate updated params - self.assertAllCloseAccordingToType( - var0_np, self.evaluate(var0), rtol=1e-4, atol=1e-4) - self.assertAllCloseAccordingToType( - var1_np, self.evaluate(var1), rtol=1e-4, atol=1e-4) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testSparseWithAmsgrad(self): - # dtypes.half does not work on gpu + eager. 
- for dtype in [tf.float32, tf.float64]: - with self.cached_session(): - m0 = np.array([[0.0], [0.0]]) - v0 = np.array([[0.0], [0.0]]) - v0hat = np.array([[0.0], [0.0]]) - indices_np = np.array([1]) - indices = tf.constant(indices_np, dtype=tf.int32) - var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype) - repeated_index_update_var = tf.Variable(var0_np, dtype=dtype) - aggregated_update_var = tf.Variable(var0_np, dtype=dtype) - grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype) - grad_repeated_index = tf.IndexedSlices( - tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), - tf.constant([1, 1]), tf.constant([2, 1])) - grad_aggregated = tf.IndexedSlices(grads0_np, indices, - tf.constant([2, 1])) - opt_repeated = adam.NonFusedAdam(amsgrad=True) - opt_aggregated = adam.NonFusedAdam(amsgrad=True) - if not tf.executing_eagerly(): - repeated_update = opt_repeated.apply_gradients( - [(grad_repeated_index, repeated_index_update_var)]) - aggregated_update = opt_aggregated.apply_gradients( - [(grad_aggregated, aggregated_update_var)]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose( - self.evaluate(aggregated_update_var), - self.evaluate(repeated_index_update_var)) - for t in range(3): - if not tf.executing_eagerly(): - self.evaluate(repeated_update) - self.evaluate(aggregated_update) - else: - opt_repeated.apply_gradients( - [(grad_repeated_index, repeated_index_update_var)]) - opt_aggregated.apply_gradients( - [(grad_aggregated, aggregated_update_var)]) - - var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad( - var0_np, indices_np, grads0_np, t, m0, v0, v0hat) - - # Validate updated params - self.assertAllCloseAccordingToType( - var0_np, self.evaluate(aggregated_update_var)) - self.assertAllCloseAccordingToType( - self.evaluate(aggregated_update_var), - self.evaluate(repeated_index_update_var)) - - def testBasicWithLearningRateDecay(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 0.001 - beta_1 = 0.9 - beta_2 = 0.999 - epsilon = 1e-7 - decay = 0.5 - - opt = adam.NonFusedAdam( - learning_rate=learning_rate, - beta_1=beta_1, - beta_2=beta_2, - epsilon=epsilon, - decay=decay) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of NonFusedAdam - for t in range(3): - self.evaluate(update) - lr_np = learning_rate / (1 + decay * t) - - var0_np, m0, v0 = adam_update_numpy( - var0_np, grads0_np, t, m0, v0, lr=lr_np) - var1_np, m1, v1 = adam_update_numpy( - var1_np, grads1_np, t, m1, v1, lr=lr_np) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testBasicWithLearningRateInverseTimeDecay(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 0.001 - decay = 0.5 - lr_schedule = learning_rate_schedule.InverseTimeDecay( - learning_rate, decay_steps=1.0, decay_rate=decay) - beta_1 = 0.9 - beta_2 = 0.999 - epsilon = 1e-7 - - opt = adam.NonFusedAdam( - learning_rate=lr_schedule, - beta_1=beta_1, - beta_2=beta_2, - epsilon=epsilon) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 3 steps of NonFusedAdam - for t in range(3): - self.evaluate(update) - - lr_np = learning_rate / (1 + decay * t) - - var0_np, m0, v0 = adam_update_numpy( - var0_np, grads0_np, t, m0, v0, lr=lr_np) - var1_np, m1, v1 = adam_update_numpy( - var1_np, grads1_np, t, m1, v1, lr=lr_np) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testTensorLearningRate(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - opt = adam.NonFusedAdam(tf.constant(0.001)) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) - # Run 3 steps of NonFusedAdam - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta_2_power)) - update.run() - - var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testSharing(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. 
-      m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
-      var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-      grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
-      var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-      grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
-
-      var0 = tf.Variable(var0_np)
-      var1 = tf.Variable(var1_np)
-      grads0 = tf.constant(grads0_np)
-      grads1 = tf.constant(grads1_np)
-      opt = adam.NonFusedAdam()
-      update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-      update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-      self.evaluate(tf.compat.v1.global_variables_initializer())
-
-      beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
-
-      # Fetch params to validate initial values
-      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
-      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
-
-      # Run 3 steps of intertwined NonFusedAdam1 and NonFusedAdam2.
-      for t in range(3):
-        self.assertAllCloseAccordingToType(0.9**(t + 1),
-                                           self.evaluate(beta_1_power))
-        self.assertAllCloseAccordingToType(0.999**(t + 1),
-                                           self.evaluate(beta_2_power))
-        if t % 2 == 0:
-          update1.run()
-        else:
-          update2.run()
-
-        var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
-        var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
-
-        # Validate updated params
-        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
-        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/keras/optimizers/optimizer_v2/adamax.py b/keras/optimizers/optimizer_v2/adamax.py
deleted file mode 100644
index 972a08ed43bd..000000000000
--- a/keras/optimizers/optimizer_v2/adamax.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Adamax optimizer implementation."""
-
-import tensorflow.compat.v2 as tf
-from keras import backend_config
-from keras.optimizers.optimizer_v2 import optimizer_v2
-from tensorflow.python.util.tf_export import keras_export
-
-
-# pylint: disable=g-classes-have-attributes
-@keras_export('keras.optimizers.Adamax')
-class Adamax(optimizer_v2.OptimizerV2):
-  """Optimizer that implements the Adamax algorithm.
-
-  It is a variant of Adam based on the infinity norm.
-  Default parameters follow those provided in the paper.
-  Adamax is sometimes superior to Adam, especially in models with embeddings.
-
-  Initialization:
-
-  ```python
-  m = 0  # Initialize initial 1st moment vector
-  v = 0  # Initialize the exponentially weighted infinity norm
-  t = 0  # Initialize timestep
-  ```
-
-  The update rule for parameter `w` with gradient `g` is
-  described at the end of section 7.1 of the paper:
-
-  ```python
-  t += 1
-  m = beta1 * m + (1 - beta1) * g
-  v = max(beta2 * v, abs(g))
-  current_lr = learning_rate / (1 - beta1 ** t)
-  w = w - current_lr * m / (v + epsilon)
-  ```
-
-  Similarly to `Adam`, the epsilon is added for numerical stability
-  (especially to get rid of division by zero when `v_t == 0`).
-
-  In contrast to `Adam`, the sparse implementation of this algorithm
-  (used when the gradient is an IndexedSlices object, typically because of
-  `tf.gather` or an embedding lookup in the forward pass) only updates
-  variable slices and corresponding `m_t`, `v_t` terms when that part of
-  the variable was used in the forward pass. This means that the sparse
-  behavior is in contrast to the dense behavior (similar to some momentum
-  implementations which ignore momentum unless a variable slice was actually
-  used).
-
-  Args:
-    learning_rate: A `Tensor`, floating point value, or a schedule that is a
-      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-    beta_1: A float value or a constant float tensor. The exponential decay
-      rate for the 1st moment estimates.
-    beta_2: A float value or a constant float tensor. The exponential decay
-      rate for the exponentially weighted infinity norm.
-    epsilon: A small constant for numerical stability.
-    name: Optional name for the operations created when applying gradients.
-      Defaults to `"Adamax"`.
-    **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
-      `clipnorm`, `global_clipnorm`.
-      If `clipvalue` (float) is set, the gradient of each weight
-      is clipped to be no higher than this value.
-      If `clipnorm` (float) is set, the gradient of each weight
-      is individually clipped so that its norm is no higher than this value.
-      If `global_clipnorm` (float) is set, the gradient of all weights is
-      clipped so that their global norm is no higher than this value.
-
-  Reference:
-    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
-  """
-
-  _HAS_AGGREGATE_GRAD = True
-
-  def __init__(self,
-               learning_rate=0.001,
-               beta_1=0.9,
-               beta_2=0.999,
-               epsilon=1e-7,
-               name='Adamax',
-               **kwargs):
-    super().__init__(name, **kwargs)
-    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
-    self._set_hyper('decay', self._initial_decay)
-    self._set_hyper('beta_1', beta_1)
-    self._set_hyper('beta_2', beta_2)
-    self.epsilon = epsilon or backend_config.epsilon()
-
-  def _create_slots(self, var_list):
-    # Separate for-loops to respect the ordering of slot variables from v1.
-    for var in var_list:
-      self.add_slot(var, 'm')  # Create slots for the first moments.
-    for var in var_list:
-      self.add_slot(var, 'v')  # Create slots for the second moments.
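
As a reading aid for the docstring above, here is the Adamax update rule as a self-contained NumPy sketch; the function name and default values are illustrative, not part of the deleted module:

```python
import numpy as np

def adamax_step(w, g, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-7):
    """One Adamax step for parameter `w` with gradient `g`; `t` is 1-based."""
    m = beta1 * m + (1 - beta1) * g       # first-moment estimate
    v = np.maximum(beta2 * v, np.abs(g))  # exponentially weighted infinity norm
    current_lr = lr / (1 - beta1 ** t)    # bias correction for m
    return w - current_lr * m / (v + eps), m, v

# Three steps on a toy parameter, mirroring the 3-step loops in the tests.
w, m, v = np.array([1.0, 2.0]), np.zeros(2), np.zeros(2)
for t in range(1, 4):
    w, m, v = adamax_step(w, np.array([0.1, 0.1]), m, v, t)
```
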
- - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - - local_step = tf.cast(self.iterations + 1, var_dtype) - beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) - beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) - beta_1_power = tf.pow(beta_1_t, local_step) - lr_t = apply_state[(var_device, var_dtype)]['lr_t'] - - apply_state[(var_device, var_dtype)].update( - dict( - neg_scaled_lr=-lr_t / (1 - beta_1_power), - epsilon=tf.convert_to_tensor( - self.epsilon, var_dtype), - beta_1_t=beta_1_t, - beta_1_power=beta_1_power, - one_minus_beta_1_t=1 - beta_1_t, - beta_2_t=beta_2_t, - zero=tf.zeros((), dtype=tf.int64))) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - m = self.get_slot(var, 'm') - v = self.get_slot(var, 'v') - return tf.raw_ops.ResourceApplyAdaMax( - var=var.handle, - m=m.handle, - v=v.handle, - beta1_power=coefficients['beta_1_power'], - lr=coefficients['lr_t'], - beta1=coefficients['beta_1_t'], - beta2=coefficients['beta_2_t'], - epsilon=coefficients['epsilon'], - grad=grad, - use_locking=self._use_locking) - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - # m_t = beta1 * m + (1 - beta1) * g_t - m = self.get_slot(var, 'm') - m_slice = tf.gather(m, indices, axis=coefficients['zero']) - m_t_slice = (m_slice * coefficients['beta_1_t'] + - grad * coefficients['one_minus_beta_1_t']) - with tf.control_dependencies([m_t_slice]): - m_t = self._resource_scatter_update(m, indices, m_t_slice) - - # u_t = max(beta2 * u, abs(g_t)) - v = self.get_slot(var, 'v') - v_slice = tf.gather(v, indices, axis=coefficients['zero']) - v_t_slice = tf.maximum(v_slice * coefficients['beta_2_t'], - tf.abs(grad)) - with tf.control_dependencies([v_t_slice]): - v_t = self._resource_scatter_update(v, indices, v_t_slice) - # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t - var_slice = coefficients['neg_scaled_lr'] * ( - m_t_slice / (v_t_slice + coefficients['epsilon'])) - with tf.control_dependencies([var_slice]): - var_update = self._resource_scatter_add(var, indices, var_slice) - return tf.group(*[var_update, m_t, v_t]) - - def get_config(self): - config = super().get_config() - config.update({ - 'learning_rate': self._serialize_hyperparameter('learning_rate'), - 'decay': self._initial_decay, - 'beta_1': self._serialize_hyperparameter('beta_1'), - 'beta_2': self._serialize_hyperparameter('beta_2'), - 'epsilon': self.epsilon, - }) - return config diff --git a/keras/optimizers/optimizer_v2/adamax_test.py b/keras/optimizers/optimizer_v2/adamax_test.py deleted file mode 100644 index 5d5eb52bfd71..000000000000 --- a/keras/optimizers/optimizer_v2/adamax_test.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Adamax.""" - -import tensorflow.compat.v2 as tf - -from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations -from keras.optimizers.optimizer_v2 import adamax - - -def adamax_update_numpy(param, - g_t, - t, - m, - v, - alpha=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8): - m_t = beta1 * m + (1 - beta1) * g_t - v_t = np.maximum(beta2 * v, np.abs(g_t)) - param_t = param - (alpha / (1 - beta1**(t + 1))) * (m_t / (v_t + epsilon)) - return param_t, m_t, v_t - - -def adamax_sparse_update_numpy(param, - indices, - g_t, - t, - m, - v, - alpha=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8): - m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param) - m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t - v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t)) - param_t_slice = param[indices] - ( - (alpha / (1 - beta1**(t + 1))) * (m_t_slice / (v_t_slice + epsilon))) - m_t[indices] = m_t_slice - v_t[indices] = v_t_slice - param_t[indices] = param_t_slice - return param_t, m_t, v_t - - -def get_beta_accumulators(opt, dtype): - local_step = tf.cast(opt.iterations + 1, dtype) - beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype) - beta_1_power = tf.pow(beta_1_t, local_step) - return beta_1_power - - -class AdamaxOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - def testResourceSparse(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. 
- zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype) # pylint: disable=cell-var-from-loop - m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots() - var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - - grads0_np_indices = np.array([0, 1], dtype=np.int32) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np), - tf.constant(grads0_np_indices), tf.constant([3])) - grads1_np_indices = np.array([2, 1], dtype=np.int32) - grads1 = tf.IndexedSlices( - tf.constant(grads1_np), - tf.constant(grads1_np_indices), tf.constant([3])) - opt = adamax.Adamax() - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0, 3.0], var0) - self.assertAllClose([4.0, 5.0, 6.0], var1) - - beta1_power = get_beta_accumulators(opt, dtype) - - # Run 3 steps of Adamax - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power) - update.run() - - var0_np, m0, v0 = adamax_sparse_update_numpy( - var0_np, grads0_np_indices, grads0_np, t, m0, v0) - var1_np, m1, v1 = adamax_sparse_update_numpy( - var1_np, grads1_np_indices, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0) - self.assertAllCloseAccordingToType(var1_np, var1) - - def testSparseDevicePlacement(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for index_dtype in [tf.int32, tf.int64]: - with tf.Graph().as_default(), self.cached_session( - force_gpu=tf.test.is_gpu_available()): - # If a GPU is available, tests that all optimizer ops can be placed on - # it (i.e. they have GPU kernels). - var = tf.Variable([[1.0], [2.0]]) - indices = tf.constant([0, 1], dtype=index_dtype) - g_sum = lambda: tf.reduce_sum(tf.gather(var, indices)) # pylint: disable=cell-var-from-loop - optimizer = adamax.Adamax(3.0) - minimize_op = optimizer.minimize(g_sum, var_list=[var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - minimize_op.run() - - def testSparseRepeatedIndices(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - repeated_index_update_var = tf.Variable( - [[1.0], [2.0]], dtype=dtype) - aggregated_update_var = tf.Variable( - [[1.0], [2.0]], dtype=dtype) - grad_repeated_index = tf.IndexedSlices( - tf.constant( - [0.1, 0.1], shape=[2, 1], dtype=dtype), - tf.constant([1, 1]), - tf.constant([2, 1])) - grad_aggregated = tf.IndexedSlices( - tf.constant( - [0.2], shape=[1, 1], dtype=dtype), - tf.constant([1]), - tf.constant([2, 1])) - repeated_update = adamax.Adamax().apply_gradients( - [(grad_repeated_index, repeated_index_update_var)]) - aggregated_update = adamax.Adamax().apply_gradients( - [(grad_aggregated, aggregated_update_var)]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(aggregated_update_var, - repeated_index_update_var.eval()) - for _ in range(3): - repeated_update.run() - aggregated_update.run() - self.assertAllClose(aggregated_update_var, - repeated_index_update_var.eval()) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasic(self): - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with self.session(graph=tf.Graph(), use_gpu=True): - # Initialize variables for numpy implementation. - m0 = np.array([0.0, 0.0]) - v0 = np.array([0.0, 0.0]) - m1 = np.array([0.0, 0.0]) - v1 = np.array([0.0, 0.0]) - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - opt = adamax.Adamax() - if not tf.executing_eagerly(): - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 3 steps of Adamax - for t in range(3): - beta_1_power = get_beta_accumulators(opt, dtype) - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - if not tf.executing_eagerly(): - self.evaluate(update) - else: - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType( - var0_np, self.evaluate(var0), rtol=1e-2) - self.assertAllCloseAccordingToType( - var1_np, self.evaluate(var1), rtol=1e-2) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasicWithLearningRateDecay(self): - for i, dtype in enumerate([tf.half, tf.float32, tf.float64]): - with self.session(graph=tf.Graph(), use_gpu=True): - # Initialize variables for numpy implementation. 
- m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, name="var0_%d" % i) - var1 = tf.Variable(var1_np, name="var1_%d" % i) - - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - - learning_rate = 0.001 - decay = 0.002 - opt = adamax.Adamax(learning_rate=learning_rate, decay=decay) - if not tf.executing_eagerly(): - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 3 steps of Adamax - for t in range(3): - beta_1_power = get_beta_accumulators(opt, dtype) - self.assertAllCloseAccordingToType(0.9**(t + 1), - self.evaluate(beta_1_power)) - if not tf.executing_eagerly(): - self.evaluate(update) - else: - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - - lr = learning_rate / (1 + decay * t) - - var0_np, m0, v0 = adamax_update_numpy( - var0_np, grads0_np, t, m0, v0, alpha=lr) - var1_np, m1, v1 = adamax_update_numpy( - var1_np, grads1_np, t, m1, v1, alpha=lr) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), - rtol=1e-2) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), - rtol=1e-2) - - def testTensorLearningRate(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. - m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - opt = adamax.Adamax(tf.constant(0.001)) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0) - self.assertAllClose([3.0, 4.0], var1) - - beta1_power = get_beta_accumulators(opt, dtype) - - # Run 3 steps of Adamax - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power) - update.run() - - var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0) - self.assertAllCloseAccordingToType(var1_np, var1) - - def testSharing(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. 
- m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - opt = adamax.Adamax() - update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - beta1_power = get_beta_accumulators(opt, dtype) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0) - self.assertAllClose([3.0, 4.0], var1) - - # Run 3 steps of intertwined Adamax1 and Adamax2. - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power) - if t % 2 == 0: - update1.run() - else: - update2.run() - - var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0) - self.assertAllCloseAccordingToType(var1_np, var1) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testSlotsUniqueEager(self): - v1 = tf.Variable(1.) - v2 = tf.Variable(1.) - opt = adamax.Adamax(1.) - opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) - # There should be iteration, and two unique slot variables for v1 and v2. - self.assertLen({id(v) for v in opt.variables()}, 5) - - def testConstructAdamaxWithLR(self): - opt = adamax.Adamax(lr=1.0) - opt_2 = adamax.Adamax(learning_rate=0.1, lr=1.0) - opt_3 = adamax.Adamax(learning_rate=0.1) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/ftrl.py b/keras/optimizers/optimizer_v2/ftrl.py deleted file mode 100644 index 104f6c551952..000000000000 --- a/keras/optimizers/optimizer_v2/ftrl.py +++ /dev/null @@ -1,270 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ==============================================================================
-"""Ftrl-proximal optimizer implementation."""
-# pylint: disable=g-bad-import-order
-# pylint: disable=g-classes-have-attributes
-
-import tensorflow.compat.v2 as tf
-from keras.optimizers.optimizer_v2 import optimizer_v2
-from tensorflow.python.util.tf_export import keras_export
-
-
-# pylint: disable=g-classes-have-attributes
-@keras_export('keras.optimizers.Ftrl')
-class Ftrl(optimizer_v2.OptimizerV2):
-  r"""Optimizer that implements the FTRL algorithm.
-
-  "Follow The Regularized Leader" (FTRL) is an optimization algorithm developed
-  at Google for click-through rate prediction in the early 2010s. It is most
-  suitable for shallow models with large and sparse feature spaces.
-  The algorithm is described by
-  [McMahan et al., 2013](https://research.google.com/pubs/archive/41159.pdf).
-  The Keras version has support for both online L2 regularization
-  (the L2 regularization described in the paper
-  above) and shrinkage-type L2 regularization
-  (which is the addition of an L2 penalty to the loss function).
-
-  Initialization:
-
-  ```python
-  n = 0
-  sigma = 0
-  z = 0
-  ```
-
-  Update rule for one variable `w`:
-
-  ```python
-  prev_n = n
-  n = n + g ** 2
-  sigma = (sqrt(n) - sqrt(prev_n)) / lr
-  z = z + g - sigma * w
-  if abs(z) < lambda_1:
-    w = 0
-  else:
-    w = (sgn(z) * lambda_1 - z) / ((beta + sqrt(n)) / alpha + lambda_2)
-  ```
-
-  Notation:
-
-  - `lr` is the learning rate (written `alpha` in the closed-form update above)
-  - `g` is the gradient for the variable
-  - `lambda_1` is the L1 regularization strength
-  - `lambda_2` is the L2 regularization strength
-
-  Check the documentation for the `l2_shrinkage_regularization_strength`
-  parameter for more details when shrinkage is enabled, in which case the
-  gradient is replaced with a gradient with shrinkage.
-
-  Args:
-    learning_rate: A `Tensor`, floating point value, or a schedule that is a
-      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-    learning_rate_power: A float value, must be less than or equal to zero.
-      Controls how the learning rate decreases during training. Use zero for
-      a fixed learning rate.
-    initial_accumulator_value: The starting value for accumulators.
-      Only zero or positive values are allowed.
-    l1_regularization_strength: A float value, must be greater than or
-      equal to zero. Defaults to 0.0.
-    l2_regularization_strength: A float value, must be greater than or
-      equal to zero. Defaults to 0.0.
-    name: Optional name prefix for the operations created when applying
-      gradients. Defaults to `"Ftrl"`.
-    l2_shrinkage_regularization_strength: A float value, must be greater than
-      or equal to zero. This differs from L2 above in that the L2 above is a
-      stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
-      When input is sparse, shrinkage will only happen on the active weights.
-    beta: A float value, representing the beta value from the paper.
-      Defaults to 0.0.
-    **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
-      `clipnorm`, `global_clipnorm`.
-      If `clipvalue` (float) is set, the gradient of each weight
-      is clipped to be no higher than this value.
-      If `clipnorm` (float) is set, the gradient of each weight
-      is individually clipped so that its norm is no higher than this value.
-      If `global_clipnorm` (float) is set, the gradient of all weights is
-      clipped so that their global norm is no higher than this value.
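
As a reading aid for the update rule above, here is a self-contained NumPy sketch of one FTRL-Proximal step without L2 shrinkage; the names are illustrative, with `alpha` standing in for the learning rate as in the paper:

```python
import numpy as np

def ftrl_step(w, g, n, z, alpha=0.1, beta=0.0, lambda_1=0.0, lambda_2=0.0):
    """One FTRL-Proximal step for a vector parameter `w` with gradient `g`."""
    prev_n = n
    n = n + g ** 2
    sigma = (np.sqrt(n) - np.sqrt(prev_n)) / alpha
    z = z + g - sigma * w
    # Closed-form, per-coordinate solution of the proximal step: weights whose
    # |z| is below the L1 threshold are set exactly to zero (sparsity).
    w = np.where(
        np.abs(z) < lambda_1,
        0.0,
        (np.sign(z) * lambda_1 - z) / ((beta + np.sqrt(n)) / alpha + lambda_2))
    return w, n, z

w, n, z = np.zeros(2), np.zeros(2), np.zeros(2)
for _ in range(3):
    w, n, z = ftrl_step(w, np.array([0.1, 0.2]), n, z, alpha=3.0)
```
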
- - Reference: - - [McMahan et al., 2013]( - https://research.google.com/pubs/archive/41159.pdf) - """ - - def __init__(self, - learning_rate=0.001, - learning_rate_power=-0.5, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0, - name='Ftrl', - l2_shrinkage_regularization_strength=0.0, - beta=0.0, - **kwargs): - super().__init__(name, **kwargs) - - if initial_accumulator_value < 0.0: - raise ValueError( - '`initial_accumulator_value` needs to be positive or zero. Received: ' - f'initial_accumulator_value={initial_accumulator_value}.') - if learning_rate_power > 0.0: - raise ValueError( - '`learning_rate_power` needs to be negative or zero. Received: ' - f'learning_rate_power={learning_rate_power}.') - if l1_regularization_strength < 0.0: - raise ValueError( - '`l1_regularization_strength` needs to be positive or zero. ' - f'Received: l1_regularization_strength={l1_regularization_strength}.') - if l2_regularization_strength < 0.0: - raise ValueError( - '`l2_regularization_strength` needs to be positive or zero. ' - f'Received: l2_regularization_strength={l2_regularization_strength}.') - if l2_shrinkage_regularization_strength < 0.0: - raise ValueError( - '`l2_shrinkage_regularization_strength` needs to be positive or ' - 'zero. Received: l2_shrinkage_regularization_strength' - f'={l2_shrinkage_regularization_strength}.') - - self._set_hyper('learning_rate', learning_rate) - self._set_hyper('decay', self._initial_decay) - self._set_hyper('learning_rate_power', learning_rate_power) - self._set_hyper('l1_regularization_strength', l1_regularization_strength) - self._set_hyper('l2_regularization_strength', l2_regularization_strength) - self._set_hyper('beta', beta) - self._initial_accumulator_value = initial_accumulator_value - self._l2_shrinkage_regularization_strength = ( - l2_shrinkage_regularization_strength) - - def _create_slots(self, var_list): - # Create the "accum" and "linear" slots. - for var in var_list: - dtype = var.dtype.base_dtype - init = tf.compat.v1.constant_initializer( - self._initial_accumulator_value, dtype=dtype) - self.add_slot(var, 'accumulator', init) - self.add_slot(var, 'linear') - - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - apply_state[(var_device, var_dtype)].update( - dict( - learning_rate_power=tf.identity( - self._get_hyper('learning_rate_power', var_dtype)), - l1_regularization_strength=tf.identity( - self._get_hyper('l1_regularization_strength', var_dtype)), - l2_regularization_strength=tf.identity( - self._get_hyper('l2_regularization_strength', var_dtype)), - beta=tf.identity(self._get_hyper('beta', var_dtype)), - l2_shrinkage_regularization_strength=tf.cast( - self._l2_shrinkage_regularization_strength, var_dtype))) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - # Adjust L2 regularization strength to include beta to avoid the underlying - # TensorFlow ops needing to include it. - adjusted_l2_regularization_strength = ( - coefficients['l2_regularization_strength'] + coefficients['beta'] / - (2. 
* coefficients['lr_t'])) - - accum = self.get_slot(var, 'accumulator') - linear = self.get_slot(var, 'linear') - - if self._l2_shrinkage_regularization_strength <= 0.0: - return tf.raw_ops.ResourceApplyFtrl( - var=var.handle, - accum=accum.handle, - linear=linear.handle, - grad=grad, - lr=coefficients['lr_t'], - l1=coefficients['l1_regularization_strength'], - l2=adjusted_l2_regularization_strength, - lr_power=coefficients['learning_rate_power'], - use_locking=self._use_locking) - else: - return tf.raw_ops.ResourceApplyFtrlV2( - var=var.handle, - accum=accum.handle, - linear=linear.handle, - grad=grad, - lr=coefficients['lr_t'], - l1=coefficients['l1_regularization_strength'], - l2=adjusted_l2_regularization_strength, - l2_shrinkage=coefficients['l2_shrinkage_regularization_strength'], - lr_power=coefficients['learning_rate_power'], - use_locking=self._use_locking) - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - # Adjust L2 regularization strength to include beta to avoid the underlying - # TensorFlow ops needing to include it. - adjusted_l2_regularization_strength = ( - coefficients['l2_regularization_strength'] + coefficients['beta'] / - (2. * coefficients['lr_t'])) - - accum = self.get_slot(var, 'accumulator') - linear = self.get_slot(var, 'linear') - - if self._l2_shrinkage_regularization_strength <= 0.0: - return tf.raw_ops.ResourceSparseApplyFtrl( - var=var.handle, - accum=accum.handle, - linear=linear.handle, - grad=grad, - indices=indices, - lr=coefficients['lr_t'], - l1=coefficients['l1_regularization_strength'], - l2=adjusted_l2_regularization_strength, - lr_power=coefficients['learning_rate_power'], - use_locking=self._use_locking) - else: - return tf.raw_ops.ResourceSparseApplyFtrlV2( - var=var.handle, - accum=accum.handle, - linear=linear.handle, - grad=grad, - indices=indices, - lr=coefficients['lr_t'], - l1=coefficients['l1_regularization_strength'], - l2=adjusted_l2_regularization_strength, - l2_shrinkage=coefficients['l2_shrinkage_regularization_strength'], - lr_power=coefficients['learning_rate_power'], - use_locking=self._use_locking) - - def get_config(self): - config = super().get_config() - config.update({ - 'learning_rate': - self._serialize_hyperparameter('learning_rate'), - 'decay': - self._initial_decay, - 'initial_accumulator_value': - self._initial_accumulator_value, - 'learning_rate_power': - self._serialize_hyperparameter('learning_rate_power'), - 'l1_regularization_strength': - self._serialize_hyperparameter('l1_regularization_strength'), - 'l2_regularization_strength': - self._serialize_hyperparameter('l2_regularization_strength'), - 'beta': - self._serialize_hyperparameter('beta'), - 'l2_shrinkage_regularization_strength': - self._l2_shrinkage_regularization_strength, - }) - return config diff --git a/keras/optimizers/optimizer_v2/ftrl_test.py b/keras/optimizers/optimizer_v2/ftrl_test.py deleted file mode 100644 index 187e868c30d2..000000000000 --- a/keras/optimizers/optimizer_v2/ftrl_test.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Functional tests for Ftrl operations.""" - -import tensorflow.compat.v2 as tf - -import numpy as np -from keras.optimizers.optimizer_v2 import ftrl - - -class FtrlOptimizerTest(tf.test.TestCase): - - def doTestFtrlwithoutRegularization(self, use_resource=False): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - if use_resource: - var0 = tf.Variable([0.0, 0.0], dtype=dtype) - var1 = tf.Variable([0.0, 0.0], dtype=dtype) - else: - var0 = tf.Variable([0.0, 0.0], dtype=dtype) - var1 = tf.Variable([0.0, 0.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.2], dtype=dtype) - grads1 = tf.constant([0.01, 0.02], dtype=dtype) - opt = ftrl.Ftrl( - 3.0, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllClose([0.0, 0.0], v0_val) - self.assertAllClose([0.0, 0.0], v1_val) - - # Run 3 steps FTRL - for _ in range(3): - update.run() - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType( - np.array([-2.60260963, -4.29698515]), v0_val) - self.assertAllCloseAccordingToType( - np.array([-0.28432083, -0.56694895]), v1_val) - - def testFtrlWithoutRegularization(self): - self.doTestFtrlwithoutRegularization(use_resource=False) - - def testResourceFtrlWithoutRegularization(self): - self.doTestFtrlwithoutRegularization(use_resource=True) - - def testFtrlwithoutRegularization2(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([4.0, 3.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.2], dtype=dtype) - grads1 = tf.constant([0.01, 0.02], dtype=dtype) - - opt = ftrl.Ftrl( - 3.0, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) - self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) - - # Run 3 steps FTRL - for _ in range(3): - update.run() - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType( - np.array([-2.55607247, -3.98729396]), v0_val) - self.assertAllCloseAccordingToType( - np.array([-0.28232238, -0.56096673]), v1_val) - - def testMinimizeSparseResourceVariable(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - - def loss(): - pred = tf.matmul(tf.compat.v1.nn.embedding_lookup([var0], [0]), x) # pylint: disable=cell-var-from-loop - return pred * pred - - sgd_op = ftrl.Ftrl(1.0).minimize(loss, var_list=[var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0)) - # Run 1 step of sgd - sgd_op.run() - # Validate updated params - self.assertAllCloseAccordingToType([[0, 1]], - self.evaluate(var0), - atol=0.01) - - def testFtrlWithL1(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([4.0, 3.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.2], dtype=dtype) - grads1 = tf.constant([0.01, 0.02], dtype=dtype) - - opt = ftrl.Ftrl( - 3.0, - initial_accumulator_value=0.1, - l1_regularization_strength=0.001, - l2_regularization_strength=0.0) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) - self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) - - # Run 10 steps FTRL - for _ in range(10): - update.run() - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType( - np.array([-7.66718769, -10.91273689]), v0_val) - self.assertAllCloseAccordingToType( - np.array([-0.93460727, -1.86147261]), v1_val) - - def testFtrlWithBeta(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([4.0, 3.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.2], dtype=dtype) - grads1 = tf.constant([0.01, 0.02], dtype=dtype) - - opt = ftrl.Ftrl(3.0, initial_accumulator_value=0.1, beta=0.1) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) - self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) - - # Run 10 steps FTRL - for _ in range(10): - update.run() - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType( - np.array([-6.096838, -9.162214]), v0_val) - self.assertAllCloseAccordingToType( - np.array([-0.717741, -1.425132]), v1_val) - - def testFtrlWithL2_Beta(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
-    for dtype in [tf.half, tf.float32]:
-      with tf.Graph().as_default(), self.cached_session():
-        var0 = tf.Variable([1.0, 2.0], dtype=dtype)
-        var1 = tf.Variable([4.0, 3.0], dtype=dtype)
-        grads0 = tf.constant([0.1, 0.2], dtype=dtype)
-        grads1 = tf.constant([0.01, 0.02], dtype=dtype)
-
-        opt = ftrl.Ftrl(
-            3.0,
-            initial_accumulator_value=0.1,
-            l1_regularization_strength=0.0,
-            l2_regularization_strength=0.1,
-            beta=0.1)
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        self.evaluate(tf.compat.v1.global_variables_initializer())
-
-        v0_val, v1_val = self.evaluate([var0, var1])
-        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
-        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
-
-        # Run 10 steps FTRL
-        for _ in range(10):
-          update.run()
-        v0_val, v1_val = self.evaluate([var0, var1])
-        self.assertAllCloseAccordingToType(
-            np.array([-2.735487, -4.704625]), v0_val)
-        self.assertAllCloseAccordingToType(
-            np.array([-0.294335, -0.586556]), v1_val)
-
-  def testFtrlWithL1_L2(self):
-    # TODO(tanzheny, omalleyt): Fix test in eager mode.
-    for dtype in [tf.half, tf.float32]:
-      with tf.Graph().as_default(), self.cached_session():
-        var0 = tf.Variable([1.0, 2.0], dtype=dtype)
-        var1 = tf.Variable([4.0, 3.0], dtype=dtype)
-        grads0 = tf.constant([0.1, 0.2], dtype=dtype)
-        grads1 = tf.constant([0.01, 0.02], dtype=dtype)
-
-        opt = ftrl.Ftrl(
-            3.0,
-            initial_accumulator_value=0.1,
-            l1_regularization_strength=0.001,
-            l2_regularization_strength=2.0)
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        self.evaluate(tf.compat.v1.global_variables_initializer())
-
-        v0_val, v1_val = self.evaluate([var0, var1])
-        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
-        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
-
-        # Run 10 steps FTRL
-        for _ in range(10):
-          update.run()
-
-        v0_val, v1_val = self.evaluate([var0, var1])
-        self.assertAllCloseAccordingToType(
-            np.array([-0.24059935, -0.46829352]), v0_val)
-        self.assertAllCloseAccordingToType(
-            np.array([-0.02406147, -0.04830509]), v1_val)
-
-  def testFtrlWithL1_L2_L2Shrinkage(self):
-    """Test the new FTRL op with support for l2 shrinkage.
-
-    The addition of this parameter, which places a constant pressure on weights
-    towards the origin, causes the gradient descent trajectory to differ. The
-    weights will tend to have smaller magnitudes with this parameter set.
-    """
-    # TODO(tanzheny, omalleyt): Fix test in eager mode.
- for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([4.0, 3.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.2], dtype=dtype) - grads1 = tf.constant([0.01, 0.02], dtype=dtype) - - opt = ftrl.Ftrl( - 3.0, - initial_accumulator_value=0.1, - l1_regularization_strength=0.001, - l2_regularization_strength=2.0, - l2_shrinkage_regularization_strength=0.1) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType([1.0, 2.0], v0_val) - self.assertAllCloseAccordingToType([4.0, 3.0], v1_val) - - # Run 10 steps FTRL - for _ in range(10): - update.run() - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType( - np.array([-0.22578995, -0.44345796]), v0_val) - self.assertAllCloseAccordingToType( - np.array([-0.14378493, -0.13229476]), v1_val) - - def testFtrlWithL1_L2_L2ShrinkageSparse(self): - """Tests the new FTRL op with support for l2 shrinkage on sparse grads.""" - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - var0 = tf.Variable([[1.0], [2.0]], dtype=dtype) - var1 = tf.Variable([[4.0], [3.0]], dtype=dtype) - grads0 = tf.IndexedSlices( - tf.constant([0.1], shape=[1, 1], dtype=dtype), - tf.constant([0]), tf.constant([2, 1])) - grads1 = tf.IndexedSlices( - tf.constant([0.02], shape=[1, 1], dtype=dtype), - tf.constant([1]), tf.constant([2, 1])) - - opt = ftrl.Ftrl( - 3.0, - initial_accumulator_value=0.1, - l1_regularization_strength=0.001, - l2_regularization_strength=2.0, - l2_shrinkage_regularization_strength=0.1) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val) - self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val) - - # Run 10 steps FTRL - for _ in range(10): - update.run() - - v0_val, v1_val = self.evaluate([var0, var1]) - self.assertAllCloseAccordingToType([[-0.22578995], [2.]], v0_val) - self.assertAllCloseAccordingToType([[4.], [-0.13229476]], v1_val) - - def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self): - """Verifies that l2 shrinkage in FTRL does not change lr schedule.""" - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
-    for dtype in [tf.half, tf.float32]:
-      with tf.Graph().as_default(), self.cached_session() as sess:
-        var0 = tf.Variable([1.0, 2.0], dtype=dtype)
-        var1 = tf.Variable([1.0, 2.0], dtype=dtype)
-        grads0 = tf.constant([0.1, 0.2], dtype=dtype)
-        grads1 = tf.constant([0.1, 0.2], dtype=dtype)
-
-        opt0 = ftrl.Ftrl(
-            3.0,
-            initial_accumulator_value=0.1,
-            l1_regularization_strength=0.001,
-            l2_regularization_strength=2.0,
-            l2_shrinkage_regularization_strength=0.1)
-        opt1 = ftrl.Ftrl(
-            3.0,
-            initial_accumulator_value=0.1,
-            l1_regularization_strength=0.001,
-            l2_regularization_strength=2.0)
-        update0 = opt0.apply_gradients([(grads0, var0)])
-        update1 = opt1.apply_gradients([(grads1, var1)])
-        self.evaluate(tf.compat.v1.global_variables_initializer())
-
-        v0_val, v1_val = self.evaluate([var0, var1])
-        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
-        self.assertAllCloseAccordingToType([1.0, 2.0], v1_val)
-
-        # Run 10 steps FTRL
-        for _ in range(10):
-          update0.run()
-          update1.run()
-
-        v0_val, v1_val = self.evaluate([var0, var1])
-        # var0 is experiencing L2 shrinkage, so it should be smaller than var1
-        # in magnitude.
-        self.assertTrue((v0_val**2 < v1_val**2).all())
-        accum0 = sess.run(opt0.get_slot(var0, "accumulator"))
-        accum1 = sess.run(opt1.get_slot(var1, "accumulator"))
-        # L2 shrinkage should not change how we update the grad accumulator.
-        self.assertAllCloseAccordingToType(accum0, accum1)
-
-  def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
-    if is_sparse:
-      var0 = tf.Variable([[0.0], [0.0]], dtype=dtype)
-      var1 = tf.Variable([[0.0], [0.0]], dtype=dtype)
-      grads0 = tf.IndexedSlices(
-          tf.constant([0.1], shape=[1, 1], dtype=dtype),
-          tf.constant([0]), tf.constant([2, 1]))
-      grads1 = tf.IndexedSlices(
-          tf.constant([0.02], shape=[1, 1], dtype=dtype),
-          tf.constant([1]), tf.constant([2, 1]))
-    else:
-      var0 = tf.Variable([0.0, 0.0], dtype=dtype)
-      var1 = tf.Variable([0.0, 0.0], dtype=dtype)
-      grads0 = tf.constant([0.1, 0.2], dtype=dtype)
-      grads1 = tf.constant([0.01, 0.02], dtype=dtype)
-
-    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-    self.evaluate(tf.compat.v1.global_variables_initializer())
-
-    v0_val, v1_val = self.evaluate([var0, var1])
-    if is_sparse:
-      self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
-      self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
-    else:
-      self.assertAllCloseAccordingToType([0.0, 0.0], v0_val)
-      self.assertAllCloseAccordingToType([0.0, 0.0], v1_val)
-
-    # Run Ftrl for a few steps
-    for _ in range(steps):
-      update.run()
-
-    v0_val, v1_val = self.evaluate([var0, var1])
-    return v0_val, v1_val
-
-  # When variables are initialized to zero, FTRL-Proximal has two properties:
-  # 1. Without L1&L2 but with a fixed learning rate, FTRL-Proximal is identical
-  # to GradientDescent.
-  # 2. Without L1&L2 but with an adaptive learning rate, FTRL-Proximal is
-  # identical to Adagrad.
-  # So, based on these two properties, we test whether our implementation of
-  # FTRL-Proximal performs the same updates as Adagrad or GradientDescent.
-  def testEquivAdagradwithoutRegularization(self):
-    # TODO(tanzheny, omalleyt): Fix test in eager mode.
- for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - val0, val1 = self.applyOptimizer( - ftrl.Ftrl( - 3.0, - # Adagrad learning rate - learning_rate_power=-0.5, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0), - dtype) - - with tf.Graph().as_default(), self.cached_session(): - val2, val3 = self.applyOptimizer( - tf.compat.v1.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1), dtype) - - self.assertAllCloseAccordingToType(val0, val2) - self.assertAllCloseAccordingToType(val1, val3) - - def testEquivSparseAdagradwithoutRegularization(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - val0, val1 = self.applyOptimizer( - ftrl.Ftrl( - 3.0, - # Adagrad learning rate - learning_rate_power=-0.5, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0), - dtype, - is_sparse=True) - - with tf.Graph().as_default(), self.cached_session(): - val2, val3 = self.applyOptimizer( - tf.compat.v1.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1), - dtype, - is_sparse=True) - - self.assertAllCloseAccordingToType(val0, val2) - self.assertAllCloseAccordingToType(val1, val3) - - def testEquivSparseGradientDescentwithoutRegularization(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - val0, val1 = self.applyOptimizer( - ftrl.Ftrl( - 3.0, - # Fixed learning rate - learning_rate_power=-0.0, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0), - dtype, - is_sparse=True) - - with tf.Graph().as_default(), self.cached_session(): - val2, val3 = self.applyOptimizer( - tf.compat.v1.train.GradientDescentOptimizer(3.0), - dtype, - is_sparse=True) - - self.assertAllCloseAccordingToType(val0, val2) - self.assertAllCloseAccordingToType(val1, val3) - - def testEquivGradientDescentwithoutRegularization(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32]: - with tf.Graph().as_default(), self.cached_session(): - val0, val1 = self.applyOptimizer( - ftrl.Ftrl( - 3.0, - # Fixed learning rate - learning_rate_power=-0.0, - initial_accumulator_value=0.1, - l1_regularization_strength=0.0, - l2_regularization_strength=0.0), - dtype) - - with tf.Graph().as_default(), self.cached_session(): - val2, val3 = self.applyOptimizer( - tf.compat.v1.train.GradientDescentOptimizer(3.0), dtype) - - self.assertAllCloseAccordingToType(val0, val2) - self.assertAllCloseAccordingToType(val1, val3) - - -if __name__ == "__main__": - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/gradient_descent.py b/keras/optimizers/optimizer_v2/gradient_descent.py deleted file mode 100644 index 47c91d9a5756..000000000000 --- a/keras/optimizers/optimizer_v2/gradient_descent.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""SGD optimizer implementation.""" -# pylint: disable=g-bad-import-order -# pylint: disable=g-classes-have-attributes -import tensorflow.compat.v2 as tf -from keras.optimizers.optimizer_v2 import optimizer_v2 -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@keras_export("keras.optimizers.SGD") -class SGD(optimizer_v2.OptimizerV2): - r"""Gradient descent (with momentum) optimizer. - - Update rule for parameter `w` with gradient `g` when `momentum` is 0: - - ```python - w = w - learning_rate * g - ``` - - Update rule when `momentum` is larger than 0: - - ```python - velocity = momentum * velocity - learning_rate * g - w = w + velocity - ``` - - When `nesterov=True`, this rule becomes: - - ```python - velocity = momentum * velocity - learning_rate * g - w = w + momentum * velocity - learning_rate * g - ``` - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.01. - momentum: float hyperparameter >= 0 that accelerates gradient descent - in the relevant - direction and dampens oscillations. Defaults to 0, i.e., vanilla gradient - descent. - nesterov: boolean. Whether to apply Nesterov momentum. - Defaults to `False`. - name: Optional name prefix for the operations created when applying - gradients. Defaults to `"SGD"`. - **kwargs: keyword arguments. Allowed arguments are `clipvalue`, - `clipnorm`, `global_clipnorm`. - If `clipvalue` (float) is set, the gradient of each weight - is clipped to be no higher than this value. - If `clipnorm` (float) is set, the gradient of each weight - is individually clipped so that its norm is no higher than this value. - If `global_clipnorm` (float) is set the gradient of all weights is - clipped so that their global norm is no higher than this value. - - Usage: - - >>> opt = tf.keras.optimizers.SGD(learning_rate=0.1) - >>> var = tf.Variable(1.0) - >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var1) = var1 - >>> step_count = opt.minimize(loss, [var]).numpy() - >>> # Step is `- learning_rate * grad` - >>> var.numpy() - 0.9 - - >>> opt = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9) - >>> var = tf.Variable(1.0) - >>> val0 = var.value() - >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var1) = var1 - >>> # First step is `- learning_rate * grad` - >>> step_count = opt.minimize(loss, [var]).numpy() - >>> val1 = var.value() - >>> (val0 - val1).numpy() - 0.1 - >>> # On later steps, step-size increases because of momentum - >>> step_count = opt.minimize(loss, [var]).numpy() - >>> val2 = var.value() - >>> (val1 - val2).numpy() - 0.18 - - Reference: - - For `nesterov=True`, See [Sutskever et al., 2013]( - http://jmlr.org/proceedings/papers/v28/sutskever13.pdf). 
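A quick numeric sanity check of the three update rules quoted above (an editor's sketch, not part of the original file; the constants are illustrative):

```python
# One step of each rule for a scalar weight w with gradient g.
lr, momentum = 0.1, 0.9
w, velocity, g = 1.0, 0.0, 2.0

# Vanilla SGD: w = w - learning_rate * g
w_plain = w - lr * g                            # 1.0 - 0.1 * 2.0 = 0.8

# Momentum: velocity = momentum * velocity - learning_rate * g
velocity = momentum * velocity - lr * g         # 0.9 * 0.0 - 0.1 * 2.0 = -0.2
w_momentum = w + velocity                       # 0.8 (same as vanilla on step 1)

# Nesterov: w = w + momentum * velocity - learning_rate * g
w_nesterov = w + momentum * velocity - lr * g   # 1.0 - 0.18 - 0.2 = 0.62

print(w_plain, w_momentum, w_nesterov)
```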
- """ - - _HAS_AGGREGATE_GRAD = True - - def __init__(self, - learning_rate=0.01, - momentum=0.0, - nesterov=False, - name="SGD", - **kwargs): - super().__init__(name, **kwargs) - self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) - self._set_hyper("decay", self._initial_decay) - - self._momentum = False - if isinstance(momentum, tf.Tensor) or callable(momentum) or momentum > 0: - self._momentum = True - if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1): - raise ValueError(f"`momentum` must be between [0, 1]. Received: " - f"momentum={momentum} (of type {type(momentum)}).") - self._set_hyper("momentum", momentum) - - self.nesterov = nesterov - - def _create_slots(self, var_list): - if self._momentum: - for var in var_list: - self.add_slot(var, "momentum") - - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - apply_state[(var_device, var_dtype)]["momentum"] = tf.identity( - self._get_hyper("momentum", var_dtype)) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - if self._momentum: - momentum_var = self.get_slot(var, "momentum") - return tf.raw_ops.ResourceApplyKerasMomentum( - var=var.handle, - accum=momentum_var.handle, - lr=coefficients["lr_t"], - grad=grad, - momentum=coefficients["momentum"], - use_locking=self._use_locking, - use_nesterov=self.nesterov) - else: - return tf.raw_ops.ResourceApplyGradientDescent( - var=var.handle, - alpha=coefficients["lr_t"], - delta=grad, - use_locking=self._use_locking) - - def _resource_apply_sparse_duplicate_indices(self, grad, var, indices, - **kwargs): - if self._momentum: - return super()._resource_apply_sparse_duplicate_indices( - grad, var, indices, **kwargs) - else: - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = (kwargs.get("apply_state", {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - return tf.raw_ops.ResourceScatterAdd( - resource=var.handle, - indices=indices, - updates=-grad * coefficients["lr_t"]) - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - # This method is only needed for momentum optimization. - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - momentum_var = self.get_slot(var, "momentum") - return tf.raw_ops.ResourceSparseApplyKerasMomentum( - var=var.handle, - accum=momentum_var.handle, - lr=coefficients["lr_t"], - grad=grad, - indices=indices, - momentum=coefficients["momentum"], - use_locking=self._use_locking, - use_nesterov=self.nesterov) - - def get_config(self): - config = super().get_config() - config.update({ - "learning_rate": self._serialize_hyperparameter("learning_rate"), - "decay": self._initial_decay, - "momentum": self._serialize_hyperparameter("momentum"), - "nesterov": self.nesterov, - }) - return config diff --git a/keras/optimizers/optimizer_v2/gradient_descent_test.py b/keras/optimizers/optimizer_v2/gradient_descent_test.py deleted file mode 100644 index d97b341fb543..000000000000 --- a/keras/optimizers/optimizer_v2/gradient_descent_test.py +++ /dev/null @@ -1,726 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Functional test for GradientDescent.""" - -import tensorflow.compat.v2 as tf - -from absl.testing import parameterized -import numpy as np -from keras.testing_infra import test_combinations -from keras.optimizers.optimizer_v2 import gradient_descent -from keras.optimizers.schedules import learning_rate_schedule - - -class GradientDescentOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasic(self): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - sgd = gradient_descent.SGD(3.0) - sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], - self.evaluate(var1)) - - def _test_basic_sgd_with_learning_rate_decay(self, sgd, dtype): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - if not tf.executing_eagerly(): - sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 2 steps of sgd - if not tf.executing_eagerly(): - self.evaluate(sgd_op) - else: - sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) - # Validate updated params - self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], - self.evaluate(var1)) - - if not tf.executing_eagerly(): - self.evaluate(sgd_op) - else: - sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) - # Validate updated params - self.assertAllCloseAccordingToType( - [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1], - self.evaluate(var0)) - self.assertAllCloseAccordingToType( - [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01], - self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasicWithLearningRateDecay(self): - for dtype in [tf.half, tf.float32, tf.float64]: - learning_rate = 3.0 - decay = 0.5 - sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay) - self._test_basic_sgd_with_learning_rate_decay(sgd, dtype) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasicWithLearningRateInverseTimeDecay(self): - for dtype in [tf.half, tf.float32, tf.float64]: - 
learning_rate = learning_rate_schedule.InverseTimeDecay( - 3.0, decay_steps=1.0, decay_rate=0.5) - sgd = gradient_descent.SGD(learning_rate=learning_rate) - self._test_basic_sgd_with_learning_rate_decay(sgd, dtype) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self): - for dtype in [tf.half, tf.float32, tf.float64]: - learning_rate = learning_rate_schedule.InverseTimeDecay( - 3.0, decay_steps=1.0, decay_rate=0.5) - sgd = gradient_descent.SGD(learning_rate=learning_rate) - sgd = gradient_descent.SGD.from_config(sgd.get_config()) - self._test_basic_sgd_with_learning_rate_decay(sgd, dtype) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasicCallableParams(self): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - lr = lambda: 3.0 - sgd = gradient_descent.SGD(lr) - sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], - self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testMinimizeResourceVariable(self): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - var1 = tf.Variable([3.0], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - loss = lambda: tf.matmul(var0, x) + var1 # pylint: disable=cell-var-from-loop - sgd = gradient_descent.SGD(1.0) - sgd_op = sgd.minimize(loss, [var0, var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[1.0 - 4.0, 2.0 - 5.0]], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1)) - - def testMinimizeSparseResourceVariable(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - var1 = tf.Variable([3.0], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - - def loss(): - pred = tf.matmul(tf.compat.v1.nn.embedding_lookup([var0], [0]), x) # pylint: disable=cell-var-from-loop - pred += var1 # pylint: disable=cell-var-from-loop - return pred * pred - - sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0 - np_grad = 2 * np_pred - self.assertAllCloseAccordingToType( - [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0)) - self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1)) - - def testTensorLearningRate(self): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - lrate = tf.constant(3.0) - sgd_op = gradient_descent.SGD(lrate).apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], - self.evaluate(var1)) - - def testGradWrtRef(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - opt = gradient_descent.SGD(3.0) - values = [1.0, 3.0] - vars_ = [tf.Variable([v], dtype=dtype) for v in values] - loss = lambda: vars_[0] + vars_[1] # pylint: disable=cell-var-from-loop - grads_and_vars = opt._compute_gradients(loss, vars_) - self.evaluate(tf.compat.v1.global_variables_initializer()) - for grad, _ in grads_and_vars: - self.assertAllCloseAccordingToType([1.0], self.evaluate(grad)) - - def testSparseBasic(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([[1.0], [2.0]], dtype=dtype) - var1 = tf.Variable([[3.0], [4.0]], dtype=dtype) - grads0 = tf.IndexedSlices( - tf.constant([0.1], shape=[1, 1], dtype=dtype), - tf.constant([0]), tf.constant([2, 1])) - grads1 = tf.IndexedSlices( - tf.constant([0.01], shape=[1, 1], dtype=dtype), - tf.constant([1]), tf.constant([2, 1])) - sgd_op = gradient_descent.SGD(3.0).apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]], - self.evaluate(var1)) - - def testSparseBasicWithLearningRateDecay(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([[1.0], [2.0]], dtype=dtype) - var1 = tf.Variable([[3.0], [4.0]], dtype=dtype) - grads0 = tf.IndexedSlices( - tf.constant([0.1], shape=[1, 1], dtype=dtype), - tf.constant([0]), tf.constant([2, 1])) - grads1 = tf.IndexedSlices( - tf.constant([0.01], shape=[1, 1], dtype=dtype), - tf.constant([1]), tf.constant([2, 1])) - sgd_op = gradient_descent.SGD( - 3.0, decay=0.5).apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 2 steps of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]], - self.evaluate(var1)) - - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType( - [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0)) - self.assertAllCloseAccordingToType( - [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], self.evaluate(var1)) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testCapturingInFunctionWhileExecutingEagerly(self): - optimizer = gradient_descent.SGD(1.0) - - var_holder = {} - def step(): - if not var_holder: - var_holder["var"] = tf.Variable(1.0) - else: - var_holder["var"].assign(1.0) - - with tf.GradientTape() as tape: - loss = var_holder["var"]**2 - grad = tape.gradient(loss, var_holder["var"]) - optimizer.apply_gradients([(grad, var_holder["var"])]) - return var_holder["var"].read_value() - - compiled_step = tf.function(step) - - self.assertEqual(float(step()), -1.0) - self.assertEqual(float(compiled_step()), -1.0) - # This shouldn't fail; in particular, the learning rate tensor should - # be an EagerTensor once again, not a graph Tensor. 
- self.assertEqual(float(step()), -1.0) - - def testConstructSGDWithLR(self): - opt = gradient_descent.SGD(lr=1.0) - opt_2 = gradient_descent.SGD(learning_rate=0.1, lr=1.0) - opt_3 = gradient_descent.SGD(learning_rate=0.1) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - -class MomentumOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum): - accum = accum * momentum - g * lr - var += (accum * momentum - g * lr) - return var, accum - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testBasic(self): - for _, dtype in enumerate([tf.half, tf.float32, tf.float64]): - var0 = tf.Variable([1.0, 2.0], dtype=dtype, name="var0") - var1 = tf.Variable([3.0, 4.0], dtype=dtype, name="var1") - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - learning_rate = 2.0 - momentum = 0.9 - mom_opt = gradient_descent.SGD( - learning_rate=learning_rate, momentum=momentum) - # self.assertFalse(mom_opt._initial_decay) - mom_update = mom_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - - # Check we have slots - slot0 = mom_opt.get_slot(var0, "momentum") - self.assertEqual(slot0.shape, var0.shape) - slot1 = mom_opt.get_slot(var1, "momentum") - self.assertEqual(slot1.shape, var1.shape) - - # Step 1: the momentum accumulators were 0. So we should see a normal - # update: v -= grad * learning_rate - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(mom_update) - # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType( - np.array([-0.2, -0.2]), self.evaluate(slot0)) - self.assertAllCloseAccordingToType( - np.array([-0.02, -0.02]), self.evaluate(slot1)) - # Check that the parameters have been updated. - self.assertAllCloseAccordingToType( - np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), - self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), - self.evaluate(var1)) - # Step 2: the momentum accumulators contain the previous update. - self.evaluate(mom_update) - if tf.executing_eagerly(): - mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]), - self.evaluate(slot0)) - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.02) - 2.0 * 0.01), - (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1)) - # Check that the parameters have been updated. - self.assertAllCloseAccordingToType( - np.array([ - 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), - 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([ - 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), - 3.98 - ((0.9 * 0.01 + 0.01) * 2.0) - ]), self.evaluate(var1)) - - def testNesterovMomentum(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode.
- with tf.Graph().as_default(): - for dtype in [tf.float32, tf.float64]: - var0 = tf.Variable([1.0, 2.0], dtype=dtype, name="var0") - var1 = tf.Variable([3.0, 4.0], dtype=dtype, name="var1") - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - loss = lambda: 5 * var0 * var0 + 3 * var1 # pylint: disable=cell-var-from-loop - mom_op = gradient_descent.SGD( - learning_rate=2.0, momentum=0.9, nesterov=True) - opt_op = mom_op.minimize(loss, [var0, var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - for _ in range(1, 5): - self.evaluate(opt_op) - var0_np, accum0_np = self._update_nesterov_momentum_numpy( - var0_np, accum0_np, var0_np * 10, 2.0, 0.9) - var1_np, accum1_np = self._update_nesterov_momentum_numpy( - var1_np, accum1_np, 3, 2.0, 0.9) - self.assertAllClose(var0_np, self.evaluate(var0)) - self.assertAllClose(var1_np, self.evaluate(var1)) - - def testSparseNesterovMomentum(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session() as sess: - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - grads = [] - for t in range(1, 5): - grads.append(var0_np * 10) - var0_np, accum0_np = self._update_nesterov_momentum_numpy( - var0_np, accum0_np, var0_np * 10, 2.0, 0.9) - var1_np, accum1_np = self._update_nesterov_momentum_numpy( - var1_np, accum1_np, 3, 2.0, 0.9) - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - var0 = tf.Variable(var0_np, dtype=dtype, name="var0") - var1 = tf.Variable(var1_np, dtype=dtype, name="var1") - mom_op = gradient_descent.SGD( - learning_rate=2.0, momentum=0.9, nesterov=True) - x_feed = tf.compat.v1.placeholder(dtype) - y_feed = tf.IndexedSlices(x_feed, tf.constant([0, 1]), - tf.constant([2])) - grads_and_vars = [(y_feed, var0), - (tf.constant([3.0, 3.0], dtype=dtype), var1)] - opt_update = mom_op.apply_gradients(grads_and_vars) - self.evaluate(tf.compat.v1.global_variables_initializer()) - for t in range(1, 5): - sess.run(opt_update, feed_dict={x_feed: grads[t - 1]}) - var0_np, accum0_np = self._update_nesterov_momentum_numpy( - var0_np, accum0_np, var0_np * 10, 2.0, 0.9) - var1_np, accum1_np = self._update_nesterov_momentum_numpy( - var1_np, accum1_np, 3, 2.0, 0.9) - self.assertAllClose(var0_np, self.evaluate(var0)) - self.assertAllClose(var1_np, self.evaluate(var1)) - - def testMinimizeSparseResourceVariable(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - - # pylint: disable=cell-var-from-loop - def loss(): - x = tf.constant([[4.0], [5.0]], dtype=dtype) - pred = tf.matmul(tf.compat.v1.nn.embedding_lookup([var0], [0]), x) - return pred * pred - - # pylint: enable=cell-var-from-loop - - opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9) - sgd_op = opt.minimize(loss, [var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testMinimizeWith2DIndicesForEmbeddingLookup(self): - var0 = tf.Variable(tf.ones([2, 2])) - - def loss(): - return tf.reduce_sum(tf.compat.v1.nn.embedding_lookup(var0, [[1]])) - - opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9) - sgd_op = opt.minimize(loss, [var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(sgd_op) - self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0)) - - def testTensorLearningRateAndMomentum(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - mom_opt = gradient_descent.SGD( - learning_rate=tf.constant(2.0), - momentum=tf.constant(0.9)) - mom_update = mom_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Check we have slots - slot0 = mom_opt.get_slot(var0, "momentum") - self.assertEqual(slot0.shape, var0.shape) - slot1 = mom_opt.get_slot(var1, "momentum") - self.assertEqual(slot1.shape, var1.shape) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Step 1: the momentum accumulators were 0. So we should see a normal - # update: v -= grad * learning_rate - self.evaluate(mom_update) - # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType( - np.array([-0.2, -0.2]), self.evaluate(slot0)) - self.assertAllCloseAccordingToType( - np.array([-0.02, -0.02]), self.evaluate(slot1)) - # Check that the parameters have been updated. - self.assertAllCloseAccordingToType( - np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), - self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), - self.evaluate(var1)) - # Step 2: the momentum accumulators contain the previous update. - self.evaluate(mom_update) - # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]), - self.evaluate(slot0)) - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.02) - 2.0 * 0.01), - (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1)) - # Check that the parameters have been updated.
- self.assertAllCloseAccordingToType( - np.array([ - 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), - 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([ - 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), - 3.98 - ((0.9 * 0.01 + 0.01) * 2.0) - ]), self.evaluate(var1)) - - def testSparse(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable(tf.zeros([4, 2], dtype=dtype)) - var1 = tf.Variable(tf.constant(1.0, dtype, [4, 2])) - grads0 = tf.IndexedSlices( - tf.constant([[.1, .1]], dtype=dtype), - tf.constant([1]), tf.constant([4, 2])) - grads1 = tf.IndexedSlices( - tf.constant([[.01, .01], [.01, .01]], dtype=dtype), - tf.constant([2, 3]), tf.constant([4, 2])) - mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9) - mom_update = mom_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Check we have slots - slot0 = mom_opt.get_slot(var0, "momentum") - self.assertEqual(slot0.shape, var0.shape) - slot1 = mom_opt.get_slot(var1, "momentum") - self.assertEqual(slot1.shape, var1.shape) - - # Fetch params to validate initial values - self.assertAllClose([0, 0], self.evaluate(var0)[0]) - self.assertAllClose([0, 0], self.evaluate(var0)[1]) - self.assertAllClose([1, 1], self.evaluate(var1)[2]) - - # Step 1: the momentum accumulators are 0. So we should see a normal - # update: v -= grad * learning_rate - self.evaluate(mom_update) - # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType( - np.array([0, 0]), - self.evaluate(slot0)[0]) - self.assertAllCloseAccordingToType( - np.array([-2.0 * .1, -2.0 * .1]), - self.evaluate(slot0)[1]) - self.assertAllCloseAccordingToType( - np.array([-2.0 * .01, -2.0 * .01]), - self.evaluate(slot1)[2]) - # Check that the parameters have been updated. - self.assertAllCloseAccordingToType( - np.array([0, 0]), - self.evaluate(var0)[0]) - self.assertAllCloseAccordingToType( - np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), - self.evaluate(var0)[1]) - self.assertAllCloseAccordingToType( - np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), - self.evaluate(var1)[2]) - # Step 2: the momentum accumulators contain the previous update. - self.evaluate(mom_update) - # Check that the momentum accumulators have been updated. - self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0]) - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]), - self.evaluate(slot0)[1]) - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.02) - 2.0 * 0.01), - (0.9 * (-0.02) - 2.0 * 0.01)]), - self.evaluate(slot1)[2]) - # Check that the parameters have been updated. - self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0]) - self.assertAllCloseAccordingToType( - np.array([ - -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), - -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), - self.evaluate(var0)[1]) - self.assertAllCloseAccordingToType( - np.array([ - 0.98 - ((0.9 * 0.01 + 0.01) * 2.0), - 0.98 - ((0.9 * 0.01 + 0.01) * 2.0) - ]), - self.evaluate(var1)[2]) - - def testSharing(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- with tf.Graph().as_default(): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9) - mom_update1 = mom_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - mom_update2 = mom_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - slot0 = mom_opt.get_slot(var0, "momentum") - self.assertEqual(slot0.shape, var0.shape) - slot1 = mom_opt.get_slot(var1, "momentum") - self.assertEqual(slot1.shape, var1.shape) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Step 1: the momentum accumulators were 0. So we should see a normal - # update: v -= grad * learning_rate - self.evaluate(mom_update1) - # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType( - np.array([-0.2, -0.2]), self.evaluate(slot0)) - self.assertAllCloseAccordingToType( - np.array([-0.02, -0.02]), self.evaluate(slot1)) - # Check that the parameters have been updated. - self.assertAllCloseAccordingToType( - np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), - self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), - self.evaluate(var1)) - # Step 2: the second momentum accumulators contain the previous update. - self.evaluate(mom_update2) - # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]), - self.evaluate(slot0)) - self.assertAllCloseAccordingToType( - np.array([(0.9 * (-0.02) - 2.0 * 0.01), - (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1)) - # Check that the parameters have been updated. - self.assertAllCloseAccordingToType( - np.array([ - 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), - 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([ - 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), - 3.98 - ((0.9 * 0.01 + 0.01) * 2.0) - ]), self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testConfig(self): - opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True) - config = opt.get_config() - opt2 = gradient_descent.SGD.from_config(config) - lr = opt.lr - lr2 = opt2.lr - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(lr), self.evaluate(lr2)) - self.assertAllClose( - self.evaluate(opt._get_hyper("momentum")), - self.evaluate(opt2._get_hyper("momentum"))) - self.assertAllClose( - self.evaluate(opt._get_hyper("decay")), - self.evaluate(opt2._get_hyper("decay"))) - var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32) - loss = lambda: 3 * var0 - # learning rate variable created when calling minimize.
- opt.minimize(loss, [var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - config = opt.get_config() - opt3 = gradient_descent.SGD.from_config(config) - lr3 = opt3.lr - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(lr), self.evaluate(lr3)) - self.assertAllClose( - self.evaluate(opt._get_hyper("momentum")), - self.evaluate(opt3._get_hyper("momentum"))) - self.assertAllClose( - self.evaluate(opt._get_hyper("decay")), - self.evaluate(opt3._get_hyper("decay"))) - self.assertTrue(opt3.nesterov) - - def testNesterovWithoutMomentum(self): - with self.assertRaisesRegex(ValueError, "must be between"): - gradient_descent.SGD(learning_rate=1.0, momentum=2.0) - - def testConstructMomentumWithLR(self): - opt = gradient_descent.SGD(lr=1.0, momentum=0.9) - opt_2 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9, lr=1.0) - opt_3 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testMinimizeLossTensor(self): - for dtype in [tf.half, tf.float32, tf.float64]: - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - var1 = tf.Variable([3.0], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - - tape = tf.GradientTape() - with tape: - loss = tf.matmul(var0, x) + var1 - sgd = gradient_descent.SGD(1.0) - with self.assertRaisesRegex(ValueError, "`tape` is required"): - sgd.minimize(loss, [var0, var1]) - sgd.minimize(loss, [var0, var1], tape=tape) - - self.assertAllCloseAccordingToType([[1.0 - 4.0, 2.0 - 5.0]], - self.evaluate(var0)) - self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/nadam.py b/keras/optimizers/optimizer_v2/nadam.py deleted file mode 100644 index 96007cce1c01..000000000000 --- a/keras/optimizers/optimizer_v2/nadam.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Nadam optimizer implementation.""" - -import tensorflow.compat.v2 as tf -from keras import backend_config -from keras.optimizers.schedules import learning_rate_schedule -from keras.optimizers.optimizer_v2 import optimizer_v2 -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@keras_export('keras.optimizers.Nadam') -class Nadam(optimizer_v2.OptimizerV2): - r"""Optimizer that implements the NAdam algorithm. 
- Much like Adam is essentially RMSprop with momentum, Nadam is Adam with - Nesterov momentum. - - Args: - learning_rate: A Tensor or a floating point value. The learning rate. - beta_1: A float value or a constant float tensor. The exponential decay - rate for the 1st moment estimates. - beta_2: A float value or a constant float tensor. The exponential decay - rate for the 2nd moment estimates. - epsilon: A small constant for numerical stability. - name: Optional name for the operations created when applying gradients. - Defaults to `"Nadam"`. - **kwargs: keyword arguments. Allowed arguments are `clipvalue`, - `clipnorm`, `global_clipnorm`. - If `clipvalue` (float) is set, the gradient of each weight - is clipped to be no higher than this value. - If `clipnorm` (float) is set, the gradient of each weight - is individually clipped so that its norm is no higher than this value. - If `global_clipnorm` (float) is set the gradient of all weights is - clipped so that their global norm is no higher than this value. - - Usage Example: - >>> opt = tf.keras.optimizers.Nadam(learning_rate=0.2) - >>> var1 = tf.Variable(10.0) - >>> loss = lambda: (var1 ** 2) / 2.0 - >>> step_count = opt.minimize(loss, [var1]).numpy() - >>> "{:.1f}".format(var1.numpy()) - 9.8 - - Reference: - - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf). - """ - - _HAS_AGGREGATE_GRAD = True - - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - name='Nadam', - **kwargs): - # Backwards compatibility with keras NAdam optimizer. - kwargs['decay'] = kwargs.pop('schedule_decay', 0.004) - learning_rate = kwargs.get('lr', learning_rate) - if isinstance(learning_rate, learning_rate_schedule.LearningRateSchedule): - raise ValueError('The Nadam optimizer does not support ' - 'tf.keras.optimizers.LearningRateSchedules as the ' - 'learning rate.') - - super().__init__(name, **kwargs) - self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) - self._set_hyper('decay', self._initial_decay) - self._set_hyper('beta_1', beta_1) - self._set_hyper('beta_2', beta_2) - self.epsilon = epsilon or backend_config.epsilon() - self._m_cache = None - - def _create_slots(self, var_list): - var_dtype = var_list[0].dtype.base_dtype - if self._m_cache is None: - self._m_cache = self.add_weight( - 'momentum_cache', - shape=[], - dtype=var_dtype, - initializer='ones', - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) - self._weights.append(self._m_cache) - # Separate for-loops to respect the ordering of slot variables from v1. - for var in var_list: - # Create slots for the first moments. - self.add_slot(var, 'm') - for var in var_list: - # Create slots for the second moments. - self.add_slot(var, 'v') - - def _prepare_local(self, var_device, var_dtype, apply_state): - lr_t = tf.identity(self._get_hyper('learning_rate', var_dtype)) - beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) - beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) - local_step = tf.cast(self.iterations + 1, var_dtype) - next_step = tf.cast(self.iterations + 2, var_dtype) - - decay_base = tf.cast(0.96, var_dtype) - - m_t = beta_1_t * (1. - 0.5 * ( - tf.pow(decay_base, self._initial_decay * local_step))) - m_t_1 = beta_1_t * (1.
- 0.5 * ( - tf.pow(decay_base, self._initial_decay * next_step))) - - m_schedule_new = tf.cast(self._m_cache_read, var_dtype) * m_t - if var_dtype is self._m_cache.dtype: - m_schedule_new = tf.identity(tf.compat.v1.assign( - self._m_cache, m_schedule_new, use_locking=self._use_locking)) - m_schedule_next = m_schedule_new * m_t_1 - - apply_state[(var_device, var_dtype)] = dict( - lr_t=lr_t, - neg_lr_t=-lr_t, # pylint: disable=invalid-unary-operand-type - epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), - beta_1_t=beta_1_t, - beta_2_t=beta_2_t, - m_t=m_t, - m_t_1=m_t_1, - one_minus_beta_1_t=1 - beta_1_t, - one_minus_beta_2_t=1 - beta_2_t, - one_minus_m_t=1. - m_t, - one_minus_m_schedule_new=1. - m_schedule_new, - one_minus_m_schedule_next=1. - m_schedule_next, - v_t_prime_denominator=1. - tf.pow(beta_2_t, local_step), - ) - - def _prepare(self, var_list): - # Get the value of the momentum cache before starting to apply gradients. - self._m_cache_read = tf.identity(self._m_cache) - return super()._prepare(var_list) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - m = self.get_slot(var, 'm') - v = self.get_slot(var, 'v') - - g_prime = grad / coefficients['one_minus_m_schedule_new'] - m_t = (coefficients['beta_1_t'] * m + - coefficients['one_minus_beta_1_t'] * grad) - m_t = tf.compat.v1.assign(m, m_t, use_locking=self._use_locking) - m_t_prime = m_t / coefficients['one_minus_m_schedule_next'] - v_t = (coefficients['beta_2_t'] * v + - coefficients['one_minus_beta_2_t'] * tf.square(grad)) - v_t = tf.compat.v1.assign(v, v_t, use_locking=self._use_locking) - v_t_prime = v_t / coefficients['v_t_prime_denominator'] - m_t_bar = (coefficients['one_minus_m_t'] * g_prime + - coefficients['m_t_1'] * m_t_prime) - var_t = var - coefficients['lr_t'] * m_t_bar / ( - tf.sqrt(v_t_prime) + coefficients['epsilon']) - return tf.compat.v1.assign(var, var_t, use_locking=self._use_locking).op - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - m = self.get_slot(var, 'm') - v = self.get_slot(var, 'v') - - g_prime = grad / coefficients['one_minus_m_schedule_new'] - - # m_t = beta1 * m + (1 - beta1) * g_t - m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] - m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'], - use_locking=self._use_locking) - - with tf.control_dependencies([m_t]): - m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) - m_t_slice = tf.gather(m_t, indices) - - m_t_prime = m_t_slice / coefficients['one_minus_m_schedule_next'] - m_t_bar = (coefficients['one_minus_m_t'] * g_prime + - coefficients['m_t_1'] * m_t_prime) - - # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) - v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t'] - v_t = tf.compat.v1.assign(v, v * coefficients['beta_2_t'], - use_locking=self._use_locking) - - with tf.control_dependencies([v_t]): - v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) - v_t_slice = tf.gather(v_t, indices) - - v_t_prime = v_t_slice / coefficients['v_t_prime_denominator'] - v_prime_sqrt_plus_eps = tf.sqrt(v_t_prime) + coefficients['epsilon'] - - var_update = self._resource_scatter_add( - var, 
indices, - coefficients['neg_lr_t'] * m_t_bar / v_prime_sqrt_plus_eps) - return tf.group(*[var_update, m_t_bar, v_t]) - - def get_config(self): - config = super().get_config() - config.update({ - 'learning_rate': self._serialize_hyperparameter('learning_rate'), - 'decay': self._initial_decay, - 'beta_1': self._serialize_hyperparameter('beta_1'), - 'beta_2': self._serialize_hyperparameter('beta_2'), - 'epsilon': self.epsilon, - }) - return config diff --git a/keras/optimizers/optimizer_v2/nadam_test.py b/keras/optimizers/optimizer_v2/nadam_test.py deleted file mode 100644 index 2fd09df4e3a0..000000000000 --- a/keras/optimizers/optimizer_v2/nadam_test.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Nadam.""" - -import tensorflow.compat.v2 as tf - -import numpy as np -from keras.optimizers.optimizer_v2 import nadam - - -def get_beta_accumulators(opt, dtype): - local_step = tf.cast(opt.iterations + 1, dtype) - beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype) - beta_1_power = tf.pow(beta_1_t, local_step) - beta_2_t = tf.cast(opt._get_hyper("beta_2"), dtype) - beta_2_power = tf.pow(beta_2_t, local_step) - return (beta_1_power, beta_2_power) - - -def update_m_cache(m_cache, t, beta1=0.9): - mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1))) - m_cache_t = m_cache * mu_t - return m_cache_t - - -def nadam_update_numpy(param, - g_t, - t, - m, - v, - m_cache, - alpha=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8): - - mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1))) - mu_t_1 = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 2))) - m_cache_t_1 = m_cache * mu_t_1 - g_prime_t = g_t / (1 - m_cache) - m_t = beta1 * m + (1 - beta1) * g_t - v_t = beta2 * v + (1 - beta2) * g_t * g_t - - m_prime_t = m_t / (1 - m_cache_t_1) - v_prime_t = v_t / (1 - beta2**(t + 1)) - m_bar_t = (1 - mu_t) * g_prime_t + mu_t_1 * m_prime_t - - param_t = param - alpha * m_bar_t / (np.sqrt(v_prime_t) + epsilon) - return param_t, m_t, v_t - - -class NadamOptimizerTest(tf.test.TestCase): - - def testSparse(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - sparse_epsilon = 1e-7 - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. 
- m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0 - var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0_np_indices = np.array([0, 2], dtype=np.int32) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np[grads0_np_indices]), - tf.constant(grads0_np_indices), tf.constant([3])) - grads1_np_indices = np.array([0, 2], dtype=np.int32) - grads1 = tf.IndexedSlices( - tf.constant(grads1_np[grads1_np_indices]), - tf.constant(grads1_np_indices), tf.constant([3])) - opt = nadam.Nadam(epsilon=sparse_epsilon) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 1.0, 2.0], var0) - self.assertAllClose([3.0, 3.0, 4.0], var1) - - beta1_power, beta2_power = get_beta_accumulators(opt, dtype) - - # Run 3 steps of Nadam - for t in range(3): - self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power) - self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power) - update.run() - - mcache = update_m_cache(mcache, t) - var0_np, m0, v0 = nadam_update_numpy( - var0_np, grads0_np, t, m0, v0, mcache, epsilon=sparse_epsilon) - var1_np, m1, v1 = nadam_update_numpy( - var1_np, grads1_np, t, m1, v1, mcache, epsilon=sparse_epsilon) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0) - self.assertAllCloseAccordingToType(var1_np, var1) - - def testBasic(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for dtype in [tf.half, tf.float32, tf.float64]: - with tf.Graph().as_default(), self.cached_session(): - # Initialize variables for numpy implementation. 
- m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - opt = nadam.Nadam() - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0) - self.assertAllClose([3.0, 4.0], var1) - - # Run 3 steps of Nadam - for t in range(3): - update.run() - - mcache = update_m_cache(mcache, t) - var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0, - mcache) - var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1, - mcache) - - # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0) - self.assertAllCloseAccordingToType(var1_np, var1) - - def testConstructNAdamWithLR(self): - opt = nadam.Nadam(lr=1.0) - opt_2 = nadam.Nadam(learning_rate=0.1, lr=1.0) - opt_3 = nadam.Nadam(learning_rate=0.1) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - def testConstructNAdamWithScheduleDecay(self): - opt = nadam.Nadam(schedule_decay=0.2) - self.assertIsInstance(opt.decay, tf.Variable) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.decay), (0.2)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/optimizer_v2.py b/keras/optimizers/optimizer_v2/optimizer_v2.py deleted file mode 100644 index a9d37f21f50c..000000000000 --- a/keras/optimizers/optimizer_v2/optimizer_v2.py +++ /dev/null @@ -1,1542 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Version 2 of class Optimizer.""" -# pylint: disable=g-bad-name - - -import abc -import contextlib -import functools -import warnings -from keras import backend -from keras import initializers -from keras.engine import base_layer_utils -from keras.optimizers.optimizer_v2 import utils as optimizer_utils -from keras.optimizers.schedules import learning_rate_schedule -from keras.utils import generic_utils -from keras.utils import layer_utils -from keras.utils import tf_inspect -from keras.utils import tf_utils -import tensorflow.compat.v2 as tf -from tensorflow.python.util.tf_export import keras_export - - -keras_optimizers_gauge = tf.__internal__.monitoring.BoolGauge( - "/tensorflow/api/keras/optimizers", "keras optimizer usage", "method") - -_DEFAULT_VALID_DTYPES = frozenset([ - tf.float16, tf.bfloat16, tf.float32, tf.float64, - tf.complex64, tf.complex128 -]) - - -def _deduplicate_indexed_slices(values, indices): - """Sums `values` associated with any non-unique `indices`. - - Args: - values: A `Tensor` with rank >= 1. - indices: A one-dimensional integer `Tensor`, indexing into the first - dimension of `values` (as in an IndexedSlices object). - - Returns: - A tuple of (`summed_values`, `unique_indices`) where `unique_indices` is a - de-duplicated version of `indices` and `summed_values` contains the sum of - `values` slices associated with each unique index. - """ - unique_indices, new_index_positions = tf.unique(indices) - summed_values = tf.math.unsorted_segment_sum( - values, new_index_positions, - tf.shape(unique_indices)[0]) - return (summed_values, unique_indices) - - -class NullContextmanager: - - def __init__(self, *args, **kwargs): - pass - - def __enter__(self): - pass - - def __exit__(self, type_arg, value_arg, traceback_arg): - return False # False values do not suppress exceptions - - -def name_scope_only_in_function_or_graph(name): - """Internal-only entry point for `name_scope*`. - - Enters a compat.v1.name_scope only when in a function or graph, - not when running fully eagerly. - - Args: - name: The name argument that is passed to the op function. - - Returns: - `name_scope*` context manager. - """ - if not tf.executing_eagerly(): - return tf.name_scope(name) - else: - return NullContextmanager() - - -@keras_export( - "keras.optimizers.Optimizer", - metaclass=abc.ABCMeta) -class OptimizerV2(tf.__internal__.tracking.Trackable): - """Base class for Keras optimizers. - - You should not use this class directly, but instead instantiate one of its - subclasses such as `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`, etc. - - ### Usage - - ```python - # Create an optimizer with the desired parameters. - opt = tf.keras.optimizers.SGD(learning_rate=0.1) - # `loss` is a callable that takes no argument and returns the value - # to minimize. - loss = lambda: 3 * var1 * var1 + 2 * var2 * var2 - # In graph mode, returns op that minimizes the loss by updating the listed - # variables. - opt_op = opt.minimize(loss, var_list=[var1, var2]) - opt_op.run() - # In eager mode, simply call minimize to update the list of variables. - opt.minimize(loss, var_list=[var1, var2]) - ``` - - ### Usage in custom training loops - - In Keras models, sometimes variables are created when the model is first - called, instead of construction time. Examples include 1) sequential models - without input shape pre-defined, or 2) subclassed models. Pass var_list as - callable in these cases. 
- - Example: - - ```python - opt = tf.keras.optimizers.SGD(learning_rate=0.1) - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(num_hidden, activation='relu')) - model.add(tf.keras.layers.Dense(num_classes, activation='sigmoid')) - loss_fn = lambda: tf.keras.losses.mse(model(input), output) - var_list_fn = lambda: model.trainable_weights - for input, output in data: - opt.minimize(loss_fn, var_list_fn) - ``` - - ### Processing gradients before applying them - - Calling `minimize()` takes care of both computing the gradients and - applying them to the variables. If you want to process the gradients - before applying them you can instead use the optimizer in three steps: - - 1. Compute the gradients with `tf.GradientTape`. - 2. Process the gradients as you wish. - 3. Apply the processed gradients with `apply_gradients()`. - - Example: - - ```python - # Create an optimizer. - opt = tf.keras.optimizers.SGD(learning_rate=0.1) - - # Compute the gradients for a list of variables. - with tf.GradientTape() as tape: - loss = <call_function> - vars = <list_of_variables> - grads = tape.gradient(loss, vars) - - # Process the gradients, for example cap them, etc. - # capped_grads = [MyCapper(g) for g in grads] - processed_grads = [process_gradient(g) for g in grads] - - # Ask the optimizer to apply the processed gradients. - opt.apply_gradients(zip(processed_grads, var_list)) - ``` - - ### Use with `tf.distribute.Strategy` - - This optimizer class is `tf.distribute.Strategy` aware, which means it - automatically sums gradients across all replicas. To average gradients, - you divide your loss by the global batch size, which is done - automatically if you use `tf.keras` built-in training or evaluation loops. - See the `reduction` argument of your loss, which should be set to - `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` for averaging or - `tf.keras.losses.Reduction.SUM` for summing. - - To aggregate gradients yourself, call `apply_gradients` with - `experimental_aggregate_gradients` set to False. This is useful if you need to - process aggregated gradients. - - If you are not using these and you want to average gradients, you should use - `tf.math.reduce_sum` to add up your per-example losses and then divide by the - global batch size. Note that when using `tf.distribute.Strategy`, the first - component of a tensor's shape is the *replica-local* batch size, which is off - by a factor equal to the number of replicas being used to compute a single - step. As a result, using `tf.math.reduce_mean` will give the wrong answer, - resulting in gradients that can be many times too big. - - ### Variable Constraints - - All Keras optimizers respect variable constraints. If a constraint function is - passed to any variable, the constraint will be applied to the variable after - the gradient has been applied. - Important: If the gradient is a sparse tensor, variable constraints are not supported. - - ### Thread Compatibility - - The entire optimizer is currently thread compatible, not thread-safe. The user - needs to perform synchronization if necessary. - - ### Slots - - Many optimizer subclasses, such as `Adam` and `Adagrad`, allocate and manage - additional variables associated with the variables to train. These are called - Slots. Slots have names and you can ask the optimizer for the names of - the slots that it uses. Once you have a slot name you can ask the optimizer - for the variable it created to hold the slot value. - - This can be useful if you want to debug a training algorithm, report stats - about the slots, etc.
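As a concrete illustration of the slot API described above, here is an editor's sketch (not part of the original file) using the `get_slot_names()` and `get_slot()` methods this class defines; SGD with momentum allocates one "momentum" slot per trainable variable:

```python
import tensorflow as tf

# Assumes the legacy OptimizerV2-based SGD defined in this package.
var = tf.Variable([1.0, 2.0])
opt = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
opt.minimize(lambda: tf.reduce_sum(var ** 2), var_list=[var])

print(opt.get_slot_names())                   # ['momentum']
print(opt.get_slot(var, "momentum").numpy())  # the accumulated velocity
```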
-
-  ### Hyperparameters
-
-  These are arguments passed to the optimizer subclass constructor
-  (the `__init__` method), and then passed to `self._set_hyper()`.
-  They can be either regular Python values (like 1.0), tensors, or
-  callables. If they are callable, the callable will be called during
-  `apply_gradients()` to get the value for the hyper parameter.
-
-  Hyperparameters can be overwritten through user code:
-
-  Example:
-
-  ```python
-  # Create an optimizer with the desired parameters.
-  opt = tf.keras.optimizers.SGD(learning_rate=0.1)
-  # `loss` is a callable that takes no argument and returns the value
-  # to minimize.
-  loss = lambda: 3 * var1 + 2 * var2
-  # In eager mode, simply call minimize to update the list of variables.
-  opt.minimize(loss, var_list=[var1, var2])
-  # update learning rate
-  opt.learning_rate = 0.05
-  opt.minimize(loss, var_list=[var1, var2])
-  ```
-
-  ### Callable learning rate
-
-  Optimizer accepts a callable learning rate in two ways. The first way is
-  through built-in or customized
-  `tf.keras.optimizers.schedules.LearningRateSchedule`. The schedule will be
-  called on each iteration with `schedule(iteration)`, a `tf.Variable`
-  owned by the optimizer.
-
-  Example:
-
-  >>> var = tf.Variable(np.random.random(size=(1,)))
-  >>> learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
-  ... initial_learning_rate=.01, decay_steps=20, decay_rate=.1)
-  >>> opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)
-  >>> loss = lambda: 3 * var
-  >>> opt.minimize(loss, var_list=[var])
-
-  The second way is through a callable function that
-  does not accept any arguments.
-
-  Example:
-
-  >>> var = tf.Variable(np.random.random(size=(1,)))
-  >>> def lr_callable():
-  ...   return .1
-  >>> opt = tf.keras.optimizers.SGD(learning_rate=lr_callable)
-  >>> loss = lambda: 3 * var
-  >>> opt.minimize(loss, var_list=[var])
-
-  ### Creating a custom optimizer
-
-  If you intend to create your own optimization algorithm, simply inherit
-  from this class and override the following methods:
-
-    - `_resource_apply_dense` (update variable given gradient tensor is
-      dense)
-    - `_resource_apply_sparse` (update variable given gradient tensor is
-      sparse)
-    - `_create_slots`
-      (if your optimizer algorithm requires additional variables)
-    - `get_config`
-      (serialization of the optimizer, include all hyper parameters)
-  """
-
-  _HAS_AGGREGATE_GRAD = False
-
-  def __init__(self,
-               name,
-               gradient_aggregator=None,
-               gradient_transformers=None,
-               **kwargs):
-    allowed_kwargs = {"clipnorm", "clipvalue", "lr", "decay",
-                      "global_clipnorm"}
-    for k in kwargs:
-      if k not in allowed_kwargs:
-        raise TypeError("Unexpected keyword argument "
-                        "passed to optimizer: " + str(k))
-      # Checks that all keyword arguments are non-negative.
-      if kwargs[k] is not None and kwargs[k] < 0:
-        raise ValueError(
-            "Expected {} >= 0, received: {}".format(k, kwargs[k]))
-      if k == "lr":
-        warnings.warn(
-            "The `lr` argument is deprecated, use `learning_rate` instead.",
-            stacklevel=2)
-
-    self._use_locking = True
-    self._init_set_name(name)
-    self._hyper = {}
-    # dict: {variable name : {slot name : variable}}
-    self._slots = {}
-    self._slot_names = []
-    self._weights = []
-    self._iterations = None
-
-    # For implementing Trackable. Stores information about how to restore
-    # slot variables which have not yet been created
-    # (trackable._CheckpointPosition objects).
-    # {slot_name :
-    #     {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
-    #  ... }
-    self._deferred_slot_restorations = {}
-
-    decay = kwargs.pop("decay", 0.0)
-    if decay < 0.:
-      raise ValueError("decay cannot be less than 0. "
-                       "Received: decay={}.".format(decay))
-    self._initial_decay = decay
-
-    self._hypers_created = False
-    # Store the distribution strategy object if the optimizer is created
-    # inside strategy scope, so it could be used to create variables later.
-    if tf.distribute.has_strategy():
-      self._distribution_strategy = tf.distribute.get_strategy()
-    else:
-      self._distribution_strategy = None
-
-    # Configure gradient transformations.
-    if gradient_aggregator is None:
-      gradient_aggregator = optimizer_utils.all_reduce_sum_gradients
-    self.gradient_aggregator = gradient_aggregator
-    if gradient_transformers is None:
-      gradient_transformers = []
-    self.gradient_transformers = gradient_transformers
-    self.clipnorm = kwargs.pop("clipnorm", None)
-    self.global_clipnorm = kwargs.pop("global_clipnorm", None)
-    if self.clipnorm is not None and self.global_clipnorm is not None:
-      raise ValueError("Cannot accept both `clipnorm` and `global_clipnorm`. 
" - "Received: `clipnorm`={}, `global_clipnorm`={}.".format( - self.clipnorm, self.global_clipnorm)) - self.clipvalue = kwargs.pop("clipvalue", None) - - @property - def clipnorm(self): - """`float` or `None`. If set, clips gradients to a maximum norm.""" - return self._clipnorm - - @property - def global_clipnorm(self): - """`float` or `None`. - - If set, clips gradients to a maximum norm. - - Check `tf.clip_by_global_norm` for more details. - """ - return self._global_clipnorm - - @clipnorm.setter - def clipnorm(self, val): - if val is not None and self.gradient_transformers: - raise ValueError("`clipnorm` cannot be set when `gradient_transformers` " - "is set. Instead, use the `gradient_transformers` to " - "specify clipping and other transformations. Received: " - f"val={val}, " - f"gradient_transformers={self.gradient_transformers}.") - self._clipnorm = val - self._clipnorm_fn = optimizer_utils.make_gradient_clipnorm_fn( - self._clipnorm) - - @global_clipnorm.setter - def global_clipnorm(self, val): - if val is not None and self.gradient_transformers: - raise ValueError("`global_clipnorm` cannot be set when " - "`gradient_transformers` " - "is set. Instead, use the `gradient_transformers` to " - "specify clipping and other transformations. Received: " - f"val={val}, " - f"gradient_transformers={self.gradient_transformers}.") - self._global_clipnorm = val - self._global_clipnorm_fn = optimizer_utils.make_global_gradient_clipnorm_fn( - self._global_clipnorm) - - @property - def clipvalue(self): - """`float` or `None`. If set, clips gradients to a maximum value.""" - return self._clipvalue - - @clipvalue.setter - def clipvalue(self, val): - if val is not None and self.gradient_transformers: - raise ValueError("`clipvalue` cannot be set when `gradient_transformers` " - "is set. Instead, use the `gradient_transformers` to " - "specify clipping and other transformations. Received: " - f"val={val}, " - f"gradient_transformers={self.gradient_transformers}.") - self._clipvalue = val - self._clipvalue_fn = optimizer_utils.make_gradient_clipvalue_fn( - self._clipvalue) - - def _transform_loss(self, loss): - """Called in `.minimize` to transform loss before computing gradients.""" - return loss - - def _get_gradients(self, tape, loss, var_list, grad_loss=None): - """Called in `minimize` to compute gradients from loss.""" - grads = tape.gradient(loss, var_list, grad_loss) - return list(zip(grads, var_list)) - - def _transform_unaggregated_gradients(self, grads_and_vars): - """Called in `apply_gradients` before gradient aggregation.""" - return grads_and_vars - - def _aggregate_gradients(self, grads_and_vars): - """Called in `apply_gradients` to aggregate gradients across devices. - - Note that user subclasses may override this, so the interface should not be - changed. - - Args: - grads_and_vars: List of (gradient, variable) pairs. - - Returns: - A list of (aggregrated_gradient, variable) pairs. By default, this calls - `self.gradient_aggregator`. 
- """ - return self.gradient_aggregator(grads_and_vars) - - def _transform_gradients(self, grads_and_vars): - """Called in `apply_gradients` after aggregation.""" - if self._clipvalue is not None: - grads_and_vars = self._clipvalue_fn(grads_and_vars) - if self._clipnorm is not None: - grads_and_vars = self._clipnorm_fn(grads_and_vars) - if self._global_clipnorm is not None: - grads_and_vars = self._global_clipnorm_fn(grads_and_vars) - - for fn in self.gradient_transformers: - grads_and_vars = fn(grads_and_vars) - return grads_and_vars - - def minimize(self, loss, var_list, grad_loss=None, name=None, tape=None): - """Minimize `loss` by updating `var_list`. - - This method simply computes gradient using `tf.GradientTape` and calls - `apply_gradients()`. If you want to process the gradient before applying - then call `tf.GradientTape` and `apply_gradients()` explicitly instead - of using this function. - - Args: - loss: `Tensor` or callable. If a callable, `loss` should take no arguments - and return the value to minimize. If a `Tensor`, the `tape` argument - must be passed. - var_list: list or tuple of `Variable` objects to update to minimize - `loss`, or a callable returning the list or tuple of `Variable` objects. - Use callable when the variable list would otherwise be incomplete before - `minimize` since the variables are created at the first time `loss` is - called. - grad_loss: (Optional). A `Tensor` holding the gradient computed for - `loss`. - name: (Optional) str. Name for the returned operation. - tape: (Optional) `tf.GradientTape`. If `loss` is provided as a `Tensor`, - the tape that computed the `loss` must be provided. - - Returns: - An `Operation` that updates the variables in `var_list`. The `iterations` - will be automatically increased by 1. - - Raises: - ValueError: If some of the variables are not `Variable` objects. - - """ - grads_and_vars = self._compute_gradients( - loss, var_list=var_list, grad_loss=grad_loss, tape=tape) - return self.apply_gradients(grads_and_vars, name=name) - - def _compute_gradients(self, loss, var_list, grad_loss=None, tape=None): - """Compute gradients of `loss` for the variables in `var_list`. - - This is the first part of `minimize()`. It returns a list - of (gradient, variable) pairs where "gradient" is the gradient - for "variable". Note that "gradient" can be a `Tensor`, an - `IndexedSlices`, or `None` if there is no gradient for the - given variable. - - Args: - loss: `Tensor` or callable. If a callable, `loss` should take no - arguments and return the value to minimize. If a `Tensor`, the `tape` - argument must be passed. - var_list: list or tuple of `Variable` objects to update to minimize - `loss`, or a callable returning the list or tuple of `Variable` objects. - Use callable when the variable list would otherwise be incomplete before - `minimize` and the variables are created at the first time when `loss` - is called. - grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. - tape: (Optional) `tf.GradientTape`. If `loss` is provided as a `Tensor`, - the tape that computed the `loss` must be provided. - - Returns: - A list of (gradient, variable) pairs. Variable is always present, but - gradient can be `None`. - - Raises: - TypeError: If `var_list` contains anything else than `Variable` objects. - ValueError: If some arguments are invalid, or var_list is None. - """ - # TODO(joshl): Test that we handle weight decay in a reasonable way. 
- if not callable(loss) and tape is None: - raise ValueError("`tape` is required when a `Tensor` loss is passed. " - f"Received: loss={loss}, tape={tape}.") - tape = tape if tape is not None else tf.GradientTape() - - if callable(loss): - with tape: - if not callable(var_list): - tape.watch(var_list) - loss = loss() - if callable(var_list): - var_list = var_list() - - with tape: - loss = self._transform_loss(loss) - - var_list = tf.nest.flatten(var_list) - with tf.name_scope(self._name + "/gradients"): - grads_and_vars = self._get_gradients(tape, loss, var_list, grad_loss) - - self._assert_valid_dtypes([ - v for g, v in grads_and_vars - if g is not None and v.dtype != tf.resource - ]) - - return grads_and_vars - - def apply_gradients(self, - grads_and_vars, - name=None, - experimental_aggregate_gradients=True): - """Apply gradients to variables. - - This is the second part of `minimize()`. It returns an `Operation` that - applies gradients. - - The method sums gradients from all replicas in the presence of - `tf.distribute.Strategy` by default. You can aggregate gradients yourself by - passing `experimental_aggregate_gradients=False`. - - Example: - - ```python - grads = tape.gradient(loss, vars) - grads = tf.distribute.get_replica_context().all_reduce('sum', grads) - # Processing aggregated gradients. - optimizer.apply_gradients(zip(grads, vars), - experimental_aggregate_gradients=False) - - ``` - - Args: - grads_and_vars: List of (gradient, variable) pairs. - name: Optional name for the returned operation. Default to the name passed - to the `Optimizer` constructor. - experimental_aggregate_gradients: Whether to sum gradients from different - replicas in the presence of `tf.distribute.Strategy`. If False, it's - user responsibility to aggregate the gradients. Default to True. - - Returns: - An `Operation` that applies the specified gradients. The `iterations` - will be automatically increased by 1. - - Raises: - TypeError: If `grads_and_vars` is malformed. - ValueError: If none of the variables have gradients. - RuntimeError: If called in a cross-replica context. - """ - grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars) - var_list = [v for (_, v) in grads_and_vars] - - with tf.name_scope(self._name): - # Create iteration if necessary. - with tf.init_scope(): - self._create_all_weights(var_list) - - if not grads_and_vars: - # Distribution strategy does not support reducing an empty list of - # gradients - return tf.no_op() - - if tf.distribute.in_cross_replica_context(): - raise RuntimeError( - "`apply_gradients() cannot be called in cross-replica context. " - "Use `tf.distribute.Strategy.run` to enter replica " - "context. For more information, please see the docstring of " - "`tf.distribute.get_replica_context`.") - - strategy = tf.distribute.get_strategy() - if (not experimental_aggregate_gradients and strategy and isinstance( - strategy, - (tf.compat.v1.distribute.experimental.ParameterServerStrategy, - tf.distribute.experimental.ParameterServerStrategy, - tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy))): - raise NotImplementedError( - "`experimental_aggregate_gradients=False is not supported for " - "ParameterServerStrategy and CentralStorageStrategy. 
Used: " - f"strategy={strategy}.") - - apply_state = self._prepare(var_list) - if experimental_aggregate_gradients: - grads_and_vars = self._transform_unaggregated_gradients(grads_and_vars) - grads_and_vars = self._aggregate_gradients(grads_and_vars) - grads_and_vars = self._transform_gradients(grads_and_vars) - - return tf.__internal__.distribute.interim.maybe_merge_call( - functools.partial(self._distributed_apply, apply_state=apply_state), - strategy, - grads_and_vars, - name=name) - - def _distributed_apply(self, distribution, grads_and_vars, apply_state, name): - """`apply_gradients` using a `DistributionStrategy`.""" - - def apply_grad_to_update_var(var, grad): - """Apply gradient to variable.""" - if isinstance(var, tf.Tensor): - raise NotImplementedError( - f"Updating a `Tensor` is not implemented. Received: var={var}.") - - apply_kwargs = {} - if isinstance(grad, tf.IndexedSlices): - if var.constraint is not None: - raise RuntimeError( - "Cannot use a constraint function on a sparse variable. " - f"Received: grad={grad}, var.constraint={var.constraint}.") - if "apply_state" in self._sparse_apply_args: - apply_kwargs["apply_state"] = apply_state - return self._resource_apply_sparse_duplicate_indices( - grad.values, var, grad.indices, **apply_kwargs) - - if "apply_state" in self._dense_apply_args: - apply_kwargs["apply_state"] = apply_state - update_op = self._resource_apply_dense(grad, var, **apply_kwargs) - if var.constraint is not None: - with tf.control_dependencies([update_op]): - return var.assign(var.constraint(var)) - else: - return update_op - - eagerly_outside_functions = tf.compat.v1.executing_eagerly_outside_functions() - update_ops = [] - with name_scope_only_in_function_or_graph(name or self._name): - for grad, var in grads_and_vars: - # Colocate the update with variables to avoid unnecessary communication - # delays. See b/136304694. - with distribution.extended.colocate_vars_with(var): - with name_scope_only_in_function_or_graph( - "update" if eagerly_outside_functions else "update_" + - var.op.name): - update_op = distribution.extended.update( - var, apply_grad_to_update_var, args=(grad,), group=False) - if tf.distribute.in_cross_replica_context(): - # In cross-replica context, extended.update returns a list of - # update ops from all replicas (group=False). - update_ops.extend(update_op) - else: - # In replica context, extended.update return the single update op - # of current replica. - update_ops.append(update_op) - - any_symbolic = any(isinstance(i, tf.Operation) or - tf_utils.is_symbolic_tensor(i) for i in update_ops) - if not tf.executing_eagerly() or any_symbolic: - # If the current context is graph mode or any of the update ops are - # symbolic then the step update should be carried out under a graph - # context. (eager updates execute immediately) - with backend._current_graph(update_ops).as_default(): # pylint: disable=protected-access - with tf.control_dependencies([tf.group(update_ops)]): - return self.iterations.assign_add(1, read_value=False) - - return self.iterations.assign_add(1) - - def get_gradients(self, loss, params): - """Returns gradients of `loss` with respect to `params`. - - Should be used only in legacy v1 graph mode. - - Args: - loss: Loss tensor. - params: List of variables. - - Returns: - List of gradient tensors. - - Raises: - ValueError: In case any gradient cannot be computed (e.g. if gradient - function not implemented). 
- """ - params = tf.nest.flatten(params) - with backend.get_graph().as_default(), backend.name_scope(self._name + - "/gradients"): - grads = tf.compat.v1.gradients(loss, params) - for grad, param in zip(grads, params): - if grad is None: - raise ValueError("Variable {} has `None` for gradient. " - "Please make sure that all of your ops have a " - "gradient defined (i.e. are differentiable). " - "Common ops without gradient: " - "K.argmax, K.round, K.eval.".format(param)) - return grads - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - grads_and_vars = list(zip(grads, params)) - self._assert_valid_dtypes([ - v for g, v in grads_and_vars - if g is not None and v.dtype != tf.resource - ]) - return [self.apply_gradients(grads_and_vars)] - - def _set_hyper(self, name, value): - """set hyper `name` to value. value can be callable, tensor, numeric.""" - if isinstance(value, tf.__internal__.tracking.Trackable): - self._track_trackable(value, name, overwrite=True) - if name not in self._hyper: - self._hyper[name] = value - else: - prev_value = self._hyper[name] - if (callable(prev_value) - or isinstance(prev_value, - (tf.Tensor, int, float, - learning_rate_schedule.LearningRateSchedule)) - or isinstance(value, learning_rate_schedule.LearningRateSchedule)): - self._hyper[name] = value - else: - backend.set_value(self._hyper[name], value) - - def _get_hyper(self, name, dtype=None): - if not self._hypers_created: - self._create_hypers() - value = self._hyper[name] - if isinstance(value, learning_rate_schedule.LearningRateSchedule): - return value - if callable(value): - value = value() - if dtype: - return tf.cast(value, dtype) - else: - return value - - def _create_slots(self, var_list): - pass - - def _create_slots_for_sharded_variables(self, var_list): - """Add ShardedVariables to slots to later reconstruct for checkpointing. - - ShardedVariables don't have slot variables created for them; their shards - do. This function allows users to call get_slot with a ShardedVariable input - and receive a ShardedVariable output containing the appropriate slot vars. - - Iterate over the variables to find shards, and aggregate the sharded - containers in a set. Add these ShardedVariables to _slots so that get_slot - can retrieve the proper slot variables for their component shards, and - reconstruct those into a ShardedVariable. - - Args: - var_list: list or tuple of `Variable` objects that will be minimized - using this optimizer. - """ - sharded_vars = set() - for var in var_list: - if getattr(var, "_sharded_container", False): - sharded_vars.add(var._sharded_container()) # pylint: disable=protected-access - - for sharded_var in sharded_vars: - sharded_key = _var_key(sharded_var) - slot_dict = {} - for slot in self.get_slot_names(): - slot_dict[slot] = sharded_var - self._slots[sharded_key] = slot_dict - - def _create_all_weights(self, var_list): - """Creates all weights, including iterations, hyperparameters and slot vars. - - This will add newly created variables to `optimizer.weights`. - - New variables are only created when this method is called the first time, or - when called with different variables in the var_list. - - Args: - var_list: list or tuple of `Variable` objects that will be minimized - using this optimizer. 
- """ - - _ = self.iterations - self._create_hypers() - self._create_slots(var_list) - self._create_slots_for_sharded_variables(var_list) - - def __getattribute__(self, name): - """Overridden to support hyperparameter access.""" - try: - return super().__getattribute__(name) - except AttributeError as e: - # Needed to avoid infinite recursion with __setattr__. - if name == "_hyper": - raise e - # Backwards compatibility with Keras optimizers. - if name == "lr": - name = "learning_rate" - if name in self._hyper: - return self._get_hyper(name) - raise e - - def __dir__(self): - result = set(super().__dir__()) - if "_hyper" in result: - result |= self._hyper.keys() - if "learning_rate" in self._hyper.keys(): - result.add("lr") - return list(result) - - def __setattr__(self, name, value): - """Override setattr to support dynamic hyperparameter setting.""" - # Backwards compatibility with Keras optimizers. - if name == "lr": - name = "learning_rate" - if hasattr(self, "_hyper") and name in self._hyper: - self._set_hyper(name, value) - else: - super().__setattr__(name, value) - - def get_slot_names(self): - """A list of names for this optimizer's slots.""" - return self._slot_names - - def add_slot(self, var, slot_name, initializer="zeros", shape=None): - """Add a new slot variable for `var`. - - A slot variable is an additional variable associated with `var` to train. - It is allocated and managed by optimizers, e.g. `Adam`. - - Args: - var: a `Variable` object. - slot_name: name of the slot variable. - initializer: initializer of the slot variable - shape: (Optional) shape of the slot variable. If not set, it will default - to the shape of `var`. - - Returns: - A slot variable. - """ - if slot_name not in self._slot_names: - self._slot_names.append(slot_name) - var_key = _var_key(var) - slot_dict = self._slots.setdefault(var_key, {}) - weight = slot_dict.get(slot_name, None) - if weight is None: - if isinstance(initializer, str) or callable(initializer): - initializer = initializers.get(initializer) - if isinstance(initializer, tf.__internal__.tracking - .CheckpointInitialValueCallable) or (shape is not None): - slot_shape = shape - else: - slot_shape = var.shape - initial_value = functools.partial( - initializer, shape=slot_shape, dtype=var.dtype) - else: - initial_value = initializer - - with self._distribution_strategy_scope(): - strategy = tf.distribute.get_strategy() - if not strategy.extended.variable_created_in_scope(var): - raise ValueError( - "Trying to create optimizer slot variable under the scope for " - "tf.distribute.Strategy ({}), which is different from the scope " - "used for the original variable ({}). Make sure the slot " - "variables are created under the same strategy scope. This may " - "happen if you're restoring from a checkpoint outside the scope." 
- .format(strategy, var)) - - with strategy.extended.colocate_vars_with(var): - weight = tf.Variable( - name="%s/%s" % (var._shared_name, slot_name), # pylint: disable=protected-access - dtype=var.dtype, - trainable=False, - initial_value=initial_value) - backend.track_variable(weight) - slot_dict[slot_name] = weight - self._restore_slot_variable( - slot_name=slot_name, variable=var, - slot_variable=weight) - self._weights.append(weight) - return weight - - def get_slot(self, var, slot_name): - var_key = _var_key(var) - slot_dict = self._slots[var_key] - slot_variable = slot_dict[slot_name] - if isinstance(slot_variable, - tf.__internal__.distribute.ShardedVariable): - # Construct a ShardedVariable that points to the input ShardedVariable's - # component shard's slot variables. - shard_vars = [] - for shard in slot_variable.variables: - slot_shard = self.get_slot(shard, slot_name) - shard_vars.append(slot_shard) - slot_variable = ( - tf.__internal__.distribute.ShardedVariable( - shard_vars, name=slot_variable.name) - ) - return slot_variable - - def _prepare(self, var_list): - keys = set() - for var in var_list: - if isinstance(var, tf.distribute.DistributedValues): - var_devices = var._devices # pylint: disable=protected-access - else: - var_devices = [var.device] - var_dtype = var.dtype.base_dtype - for var_device in var_devices: - keys.add((var_device, var_dtype)) - - apply_state = {} - for var_device, var_dtype in keys: - apply_state[(var_device, var_dtype)] = {} - with tf.device(var_device): - self._prepare_local(var_device, var_dtype, apply_state) - - return apply_state - - def _prepare_local(self, var_device, var_dtype, apply_state): - if "learning_rate" in self._hyper: - lr_t = tf.identity(self._decayed_lr(var_dtype)) - apply_state[(var_device, var_dtype)]["lr_t"] = lr_t - - def _fallback_apply_state(self, var_device, var_dtype): - """Compatibility for subclasses that don't pass apply_state through.""" - apply_state = {(var_device, var_dtype): {}} - self._prepare_local(var_device, var_dtype, apply_state) - return apply_state[(var_device, var_dtype)] - - def _create_hypers(self): - if self._hypers_created: - return - with self._distribution_strategy_scope(): - # Iterate hyper values deterministically. - for name, value in sorted(self._hyper.items()): - if isinstance(value, - (tf.Tensor, tf.Variable)) or callable(value): - # The check for `callable` covers the usage when `value` is a - # `LearningRateSchedule`, in which case it does not need to create a - # variable. - continue - else: - self._hyper[name] = self.add_weight( - name, - shape=[], - trainable=False, - initializer=value, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) - self._hypers_created = True - - @property - def iterations(self): - """Variable. The number of training steps this Optimizer has run.""" - if self._iterations is None: - with self._distribution_strategy_scope(): - self._iterations = self.add_weight( - "iter", - shape=[], - dtype=tf.int64, - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) - self._weights.append(self._iterations) - return self._iterations - - @iterations.setter - def iterations(self, variable): - if self._iterations is not None: - raise RuntimeError("Cannot set `iterations` to a new Variable after " - "the Optimizer weights have been created. 
Here it is " - f"attempting to set `iterations` to {variable}.") - self._iterations = variable - self._weights.append(self._iterations) - - def _decayed_lr(self, var_dtype): - """Get decayed learning rate as a Tensor with dtype=var_dtype.""" - lr_t = self._get_hyper("learning_rate", var_dtype) - if isinstance(lr_t, learning_rate_schedule.LearningRateSchedule): - local_step = tf.cast(self.iterations, var_dtype) - lr_t = tf.cast(lr_t(local_step), var_dtype) - if self._initial_decay > 0.: - local_step = tf.cast(self.iterations, var_dtype) - decay_t = tf.cast(self._initial_decay, var_dtype) - lr_t = lr_t / (1. + decay_t * local_step) - return lr_t - - @abc.abstractmethod - def get_config(self): - """Returns the config of the optimizer. - - An optimizer config is a Python dictionary (serializable) - containing the configuration of an optimizer. - The same optimizer can be reinstantiated later - (without any saved state) from this configuration. - - Returns: - Python dictionary. - """ - config = {"name": self._name} - if self.clipnorm is not None: - config["clipnorm"] = self.clipnorm - if self.clipvalue is not None: - config["clipvalue"] = self.clipvalue - if self.global_clipnorm is not None: - config["global_clipnorm"] = self.global_clipnorm - return config - - @classmethod - def from_config(cls, config, custom_objects=None): - """Creates an optimizer from its config. - - This method is the reverse of `get_config`, - capable of instantiating the same optimizer from the config - dictionary. - - Args: - config: A Python dictionary, typically the output of get_config. - custom_objects: A Python dictionary mapping names to additional Python - objects used to create this optimizer, such as a function used for a - hyperparameter. - - Returns: - An optimizer instance. - """ - if "lr" in config: - config["learning_rate"] = config.pop("lr") - if "learning_rate" in config: - if isinstance(config["learning_rate"], dict): - config["learning_rate"] = learning_rate_schedule.deserialize( - config["learning_rate"], custom_objects=custom_objects) - return cls(**config) - - def _serialize_hyperparameter(self, hyperparameter_name): - """Serialize a hyperparameter that can be a float, callable, or Tensor.""" - value = self._hyper[hyperparameter_name] - if isinstance(value, learning_rate_schedule.LearningRateSchedule): - return learning_rate_schedule.serialize(value) - if callable(value): - return value() - if tf.is_tensor(value): - return backend.get_value(value) - return value - - def variables(self): - """Returns variables of this Optimizer based on the order created.""" - return self._weights - - @property - def weights(self): - """Returns variables of this Optimizer based on the order created.""" - return self._weights - - def get_weights(self): - """Returns the current weights of the optimizer. - - The weights of an optimizer are its state (ie, variables). - This function returns the weight values associated with this - optimizer as a list of Numpy arrays. The first value is always the - iterations count of the optimizer, followed by the optimizer's state - variables in the order they were created. The returned list can in turn - be used to load state into similarly parameterized optimizers. 
-
-    For example, the RMSprop optimizer for this simple model returns a list
-    of three values: the iteration count, followed by the root-mean-square
-    value of the kernel and bias of the single Dense layer:
-
-    >>> opt = tf.keras.optimizers.RMSprop()
-    >>> m = tf.keras.models.Sequential([tf.keras.layers.Dense(10)])
-    >>> m.compile(opt, loss='mse')
-    >>> data = np.arange(100).reshape(5, 20)
-    >>> labels = np.zeros(5)
-    >>> results = m.fit(data, labels)  # Training.
-    >>> len(opt.get_weights())
-    3
-
-    Returns:
-        Weights values as a list of numpy arrays.
-    """
-    params = self.weights
-    return backend.batch_get_value(params)
-
-  # TODO(tanzheny): Maybe share this logic with base_layer.
-  def set_weights(self, weights):
-    """Set the weights of the optimizer.
-
-    The weights of an optimizer are its state (ie, variables).
-    This function takes the weight values associated with this
-    optimizer as a list of Numpy arrays. The first value is always the
-    iterations count of the optimizer, followed by the optimizer's state
-    variables in the order they are created. The passed values are used to
-    set the new state of the optimizer.
-
-    For example, the RMSprop optimizer for this simple model takes a list of
-    three values: the iteration count, followed by the root-mean-square
-    value of the kernel and bias of the single Dense layer:
-
-    >>> opt = tf.keras.optimizers.RMSprop()
-    >>> m = tf.keras.models.Sequential([tf.keras.layers.Dense(10)])
-    >>> m.compile(opt, loss='mse')
-    >>> data = np.arange(100).reshape(5, 20)
-    >>> labels = np.zeros(5)
-    >>> results = m.fit(data, labels)  # Training.
-    >>> new_weights = [np.array(10), np.ones([20, 10]), np.zeros([10])]
-    >>> opt.set_weights(new_weights)
-    >>> opt.iterations
-    <tf.Variable 'RMSprop/iter:0' shape=() dtype=int64, numpy=10>
-
-    Args:
-        weights: weight values as a list of numpy arrays.
-    """
-    params = self.weights
-    if len(params) != len(weights):
-      raise ValueError(
-          f"You called `set_weights(weights)` on optimizer {self._name} "
-          f"with a weight list of length {str(len(weights))}, "
-          f"but the optimizer was expecting {str(len(params))} "
-          f"weights. Provided weights: {str(weights)[:50]}...")
-    if not params:
-      return
-    weight_value_tuples = []
-    param_values = backend.batch_get_value(params)
-    for pv, p, w in zip(param_values, params, weights):
-      if pv.shape != w.shape:
-        raise ValueError(f"Optimizer weight shape {str(pv.shape)} "
-                         "not compatible with "
-                         f"provided weight shape {str(w.shape)}.")
-      weight_value_tuples.append((p, w))
-    backend.batch_set_value(weight_value_tuples)
-
-  def add_weight(self,
-                 name,
-                 shape,
-                 dtype=None,
-                 initializer="zeros",
-                 trainable=None,
-                 synchronization=tf.VariableSynchronization.AUTO,
-                 aggregation=tf.VariableAggregation.NONE):
-
-    if dtype is None:
-      dtype = tf.float32
-    if isinstance(initializer, str) or callable(initializer):
-      initializer = initializers.get(initializer)
-
-    if synchronization == tf.VariableSynchronization.ON_READ:
-      if trainable:
-        raise ValueError(
-            "Synchronization value can be set to "
-            "VariableSynchronization.ON_READ only for non-trainable "
-            "variables. You have specified trainable=True and "
-            "synchronization=VariableSynchronization.ON_READ.")
-      else:
-        # Set trainable to be false when variable is to be synced on read.
- trainable = False - elif trainable is None: - trainable = True - - variable = self._add_variable_with_custom_getter( - name=name, - shape=shape, - getter=base_layer_utils.make_variable, - overwrite=True, - initializer=initializer, - dtype=dtype, - trainable=trainable, - use_resource=True, - synchronization=synchronization, - aggregation=aggregation) - backend.track_variable(variable) - - return variable - - def _init_set_name(self, name, zero_based=True): - if not name: - self._name = backend.unique_object_name( - generic_utils.to_snake_case(self.__class__.__name__), - zero_based=zero_based) - else: - self._name = name - - def _assert_valid_dtypes(self, tensors): - """Asserts tensors are all valid types (see `_valid_dtypes`). - - Args: - tensors: Tensors to check. - - Raises: - ValueError: If any tensor is not a valid type. - """ - valid_dtypes = self._valid_dtypes() - for t in tensors: - dtype = t.dtype.base_dtype - if dtype not in valid_dtypes: - raise ValueError("Invalid type {} for {}, expected: {}.".format( - dtype, t.name, [v for v in valid_dtypes])) - - def _valid_dtypes(self): - """Valid types for loss, variables and gradients. - - Subclasses should override to allow other float types. - - Returns: - Valid types for loss, variables and gradients. - """ - return _DEFAULT_VALID_DTYPES - - def _call_if_callable(self, param): - """Call the function if param is callable.""" - return param() if callable(param) else param - - def _resource_apply_dense(self, grad, handle, apply_state): - """Add ops to apply dense gradients to the variable `handle`. - - Args: - grad: a `Tensor` representing the gradient. - handle: a `Tensor` of dtype `resource` which points to the variable to be - updated. - apply_state: A dict which is used across multiple apply calls. - - Returns: - An `Operation` which updates the value of the variable. - """ - raise NotImplementedError("`_resource_apply_dense` must be implemented in " - "subclasses.") - - def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices, - **kwargs): - """Add ops to apply sparse gradients to `handle`, with repeated indices. - - Optimizers which override this method must deal with repeated indices. See - the docstring of `_apply_sparse_duplicate_indices` for details. By default - the correct behavior, to sum non-unique indices and their associated - gradients, is enforced by first pre-processing `grad` and `indices` and - passing them on to `_resource_apply_sparse`. Optimizers which deal correctly - with duplicate indices may instead override this method to avoid the - overhead of summing. - - Args: - grad: a `Tensor` representing the gradient for the affected indices. - handle: a `Tensor` of dtype `resource` which points to the variable to be - updated. - indices: a `Tensor` of integral type representing the indices for which - the gradient is nonzero. Indices may be repeated. - **kwargs: May optionally contain `apply_state` - - Returns: - An `Operation` which updates the value of the variable. - """ - summed_grad, unique_indices = _deduplicate_indexed_slices( - values=grad, indices=indices) - return self._resource_apply_sparse(summed_grad, handle, unique_indices, - **kwargs) - - def _resource_apply_sparse(self, grad, handle, indices, apply_state): - """Add ops to apply sparse gradients to the variable `handle`. - - Similar to `_apply_sparse`, the `indices` argument to this method has been - de-duplicated. 
Optimizers which deal correctly with non-unique indices may - instead override `_resource_apply_sparse_duplicate_indices` to avoid this - overhead. - - Args: - grad: a `Tensor` representing the gradient for the affected indices. - handle: a `Tensor` of dtype `resource` which points to the variable to be - updated. - indices: a `Tensor` of integral type representing the indices for which - the gradient is nonzero. Indices are unique. - apply_state: A dict which is used across multiple apply calls. - - Returns: - An `Operation` which updates the value of the variable. - """ - raise NotImplementedError("`_resource_apply_sparse` Must be implemented in " - "subclasses.") - - def _resource_scatter_add(self, x, i, v): - with tf.control_dependencies([ - tf.raw_ops.ResourceScatterAdd( - resource=x.handle, indices=i, updates=v) - ]): - return x.value() - - def _resource_scatter_update(self, x, i, v): - with tf.control_dependencies( - [tf.raw_ops.ResourceScatterUpdate( - resource=x.handle, indices=i, updates=v)]): - return x.value() - - @property - @layer_utils.cached_per_instance - def _dense_apply_args(self): - return tf_inspect.getfullargspec(self._resource_apply_dense).args - - @property - @layer_utils.cached_per_instance - def _sparse_apply_args(self): - return tf_inspect.getfullargspec(self._resource_apply_sparse).args - - # --------------- - # For implementing the trackable interface - # --------------- - - def _restore_slot_variable(self, slot_name, variable, slot_variable): - """Restore a newly created slot variable's value.""" - variable_key = _var_key(variable) - deferred_restorations = self._deferred_slot_restorations.get( - slot_name, {}).pop(variable_key, []) - # Iterate over restores, highest restore UID first to minimize the number - # of assignments. - deferred_restorations.sort(key=lambda position: position.restore_uid, - reverse=True) - for checkpoint_position in deferred_restorations: - checkpoint_position.restore(slot_variable) - - def _create_or_restore_slot_variable( - self, slot_variable_position, slot_name, variable): - """Returns the slot variable that should have a value restored into it. - - It is up to the caller to restore the value into the slot variable if a - valid slot variable is returned. - - Called when a variable which has an associated slot variable is created or - restored. When executing eagerly, we create the slot variable with a - restoring initializer. - - No new variables are created when graph building. Instead, - _restore_slot_variable catches these after normal creation and adds restore - ops to the graph. This method is nonetheless important when graph building - for the case when a slot variable has already been created but `variable` - has just been added to a dependency graph (causing us to realize that the - slot variable needs to be restored). - - Args: - slot_variable_position: A `trackable._CheckpointPosition` object - indicating the slot variable `Trackable` object to be restored. - slot_name: The name of this `Optimizer`'s slot to restore into. - variable: The variable object this slot is being created for. - - Returns: - A slot variable that should have a value restored into it, or None if a - slot variable should not be restored at this time. 
- """ - variable_key = _var_key(variable) - slot_dict = self._slots.get(variable_key, {}) - slot_variable = slot_dict.get(slot_name, None) - if (slot_variable is None and tf.executing_eagerly() and - slot_variable_position.is_simple_variable() - # Defer slot variable creation if there is an active variable creator - # scope. Generally we'd like to eagerly create/restore slot variables - # when possible, but this may mean that scopes intended to catch - # `variable` also catch its eagerly created slot variable - # unintentionally (specifically make_template would add a dependency on - # a slot variable if not for this case). Deferring is mostly harmless - # (aside from double initialization), and makes variable creator scopes - # behave the same way they do when graph building. - # - # One notable case is with distribution strategy, which uses variable - # creator scope but always desires the `variable` and the slot to use - # the same scope, thus we can safely eagerly create/restore slot - # variables. - and (not tf.compat.v1.get_default_graph()._variable_creator_stack or # pylint: disable=protected-access - self._distribution_strategy)): - initializer = tf.__internal__.tracking.CheckpointInitialValueCallable( - checkpoint_position=slot_variable_position) - slot_variable = self.add_slot( - var=variable, - initializer=initializer, - slot_name=slot_name, - shape=slot_variable_position.value_shape()) - # Slot variables are not owned by any one object (because we don't want to - # save the slot variable if the optimizer is saved without the non-slot - # variable, or if the non-slot variable is saved without the optimizer; - # it's a dependency hypergraph with edges of the form (optimizer, non-slot - # variable, variable)). So we don't _track_ slot variables anywhere, and - # instead special-case this dependency and otherwise pretend it's a normal - # graph. - if slot_variable is not None: - # For sharded variables, we need the logic in get_slot to combine slot - # variables for its shards - if (slot_variable is variable) and (isinstance( - variable, tf.__internal__.distribute.ShardedVariable)): - return self.get_slot(variable, slot_name) - # If we've either made this slot variable, or if we've pulled out an - # existing slot variable, we should restore it. - return slot_variable - else: - # We didn't make the slot variable. Defer restoring until it gets created - # normally. We keep a list rather than the one with the highest restore - # UID in case slot variables have their own dependencies, in which case - # those could differ between restores. - self._deferred_slot_restorations.setdefault( - slot_name, {}).setdefault(variable_key, []).append( - slot_variable_position) - return None - - @contextlib.contextmanager - def _distribution_strategy_scope(self): - """Returns the `tf.distribute.Strategy` this optimizer was created under.""" - if self._distribution_strategy and not tf.distribute.has_strategy(): - with self._distribution_strategy.scope(): - yield self._distribution_strategy.scope() - else: - yield - - -def _var_key(var): - """Key for representing a primary variable, for looking up slots. - - In graph mode the name is derived from the var shared name. - In eager mode the name is derived from the var unique id. - If distribution strategy exists, get the primary variable first. - - Args: - var: the variable. - - Returns: - the unique name of the variable. - """ - - # pylint: disable=protected-access - # Get the distributed variable if it exists. 
- if hasattr(var, "_distributed_container"): - var = var._distributed_container() - if getattr(var, "_in_graph_mode", False): - return var._shared_name - return var._unique_id - - -def _get_slot_key_from_var(var, slot_name): - """Get the slot key for the variable: var_name/slot_name.""" - - name = _var_key(var) - return name + "/" + slot_name - - -class RestoredOptimizer(OptimizerV2): - """A non-functional Optimizer implementation for checkpoint compatibility. - - Holds slot variables and hyperparameters when an optimizer is restored from a - SavedModel. These variables may be referenced in functions along with ops - created by the original optimizer, but currently we do not support using the - optimizer object itself (e.g. through `apply_gradients`). - """ - # TODO(allenl): Make the restored optimizer functional by tracing its apply - # methods. - - def __init__(self): - super().__init__("RestoredOptimizer") - self._hypers_created = True - - def get_config(self): - # TODO(allenl): Save and restore the Optimizer's config - raise NotImplementedError( - "Restoring functional Optimizers from SavedModels is not currently " - "supported. Please file a feature request if this limitation bothers " - "you.") - -tf.__internal__.saved_model.load.register_revived_type( - "optimizer", - lambda obj: isinstance(obj, OptimizerV2), - versions=[tf.__internal__.saved_model.load.VersionedTypeRegistration( - object_factory=lambda proto: RestoredOptimizer(), - version=2, - min_producer_version=1, - min_consumer_version=1, - setter=RestoredOptimizer._set_hyper # pylint: disable=protected-access - )]) diff --git a/keras/optimizers/optimizer_v2/optimizer_v2_test.py b/keras/optimizers/optimizer_v2/optimizer_v2_test.py deleted file mode 100644 index f22efb0050d0..000000000000 --- a/keras/optimizers/optimizer_v2/optimizer_v2_test.py +++ /dev/null @@ -1,1320 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Functional test for OptimizerV2.""" - -import collections - -from absl.testing import parameterized -import keras -from keras import backend -from keras import callbacks -from keras import losses -from keras.engine import input_layer -from keras.engine import sequential -from keras.engine import training -from keras.layers import core -from keras.layers import regularization -from keras.optimizers import optimizer_v1 -from keras.optimizers.optimizer_v2 import adadelta -from keras.optimizers.optimizer_v2 import adagrad -from keras.optimizers.optimizer_v2 import adam -from keras.optimizers.optimizer_v2 import adamax -from keras.optimizers.optimizer_v2 import ftrl -from keras.optimizers.optimizer_v2 import gradient_descent -from keras.optimizers.optimizer_v2 import nadam -from keras.optimizers.optimizer_v2 import optimizer_v2 -from keras.optimizers.optimizer_v2 import rmsprop -from keras.optimizers.schedules import learning_rate_schedule -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -from keras.utils import np_utils -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.python.framework import test_util as tf_test_utils - - -_DATA_TYPES = [tf.half, tf.float32, tf.float64] -# TODO(b/141710709): complex support in NVCC and ROCM. -if (not tf_test_utils.IsBuiltWithNvcc() and not tf.test.is_built_with_rocm()): - _DATA_TYPES += [tf.complex64, tf.complex128] - - -class OptimizerTest(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testBasic(self): - for dtype in _DATA_TYPES: - with test_utils.use_gpu(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - loss = lambda: 5 * var0 + 3 * var1 # pylint: disable=cell-var-from-loop - sgd = gradient_descent.SGD(3.0) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Run 1 step of sgd through optimizer - opt_op = sgd.minimize(loss, var_list=[var0, var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - # Validate updated params - self.assertAllClose([-14., -13.], self.evaluate(var0)) - self.assertAllClose([-6., -5.], self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testAdaptiveLearningRate(self): - for dtype in _DATA_TYPES: - with self.test_session(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - - def loss(): - return 5 * var0 + 3 * var1 # pylint: disable=cell-var-from-loop - - sgd = gradient_descent.SGD(1.0) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Run 1 step of sgd through optimizer - opt_op = sgd.minimize(loss, [var0, var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - # Validate updated params - # var0 = [1., 2.] - 1.0 * [5, 5] - self.assertAllClose([-4., -3.], self.evaluate(var0)) - # var1 = [3., 4.] 
- 1.0 * [3, 3] - self.assertAllClose([0., 1.], self.evaluate(var1)) - - sgd.learning_rate = 0.5 - if tf.executing_eagerly(): - sgd.minimize(loss, [var0, var1]) - else: - self.evaluate(opt_op) - # Validate updated params - # var0 = [-4., -3.] - 0.5 * [5, 5] - self.assertAllClose([-6.5, -5.5], self.evaluate(var0)) - # var1 = [0., 1.] - 0.5 * [3, 3] - self.assertAllClose([-1.5, -0.5], self.evaluate(var1)) - - sgd.learning_rate = learning_rate_schedule.InverseTimeDecay( - 0.5, decay_steps=1.0, decay_rate=0.5) - if tf.executing_eagerly(): - sgd.minimize(loss, [var0, var1]) - else: - self.evaluate(opt_op) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testPrecomputedGradient(self): - for dtype in _DATA_TYPES: - with test_utils.use_gpu(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - loss = lambda: 5 * var0 + 3 * var1 # pylint: disable=cell-var-from-loop - grad_loss = tf.constant([42, -42], dtype=dtype) - sgd = gradient_descent.SGD(3.0) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Run 1 step of sgd through optimizer - opt_op = sgd.minimize(loss, var_list=[var0, var1], grad_loss=grad_loss) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - # Validate updated params - self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)], - self.evaluate(var0)) - self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)], - self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoGradients(self): - for dtype in _DATA_TYPES: - with test_utils.use_gpu(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - loss = lambda: 5 * var0 # pylint: disable=cell-var-from-loop - sgd_op = gradient_descent.SGD(3.0) - with self.assertRaisesRegex(ValueError, 'No gradients'): - # var1 has no gradient - sgd_op.minimize(loss, var_list=[var1]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoGradientsForAnyVariables_Minimize(self): - for dtype in _DATA_TYPES: - with test_utils.use_gpu(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - loss = lambda: tf.constant(5.0) - - sgd_op = gradient_descent.SGD(3.0) - with self.assertRaisesRegex(ValueError, - 'No gradients provided for any variable'): - sgd_op.minimize(loss, var_list=[var0, var1]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testNoGradientsForAnyVariables_ApplyGradients(self): - for dtype in _DATA_TYPES: - with test_utils.use_gpu(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - sgd_op = gradient_descent.SGD(3.0) - with self.assertRaisesRegex(ValueError, - 'No gradients provided for any variable'): - sgd_op.apply_gradients([(None, var0), (None, var1)]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testGradientsAsVariables(self): - for i, dtype in enumerate(_DATA_TYPES): - with test_utils.use_gpu(): - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - loss = lambda: 5 * var0 + 3 * var1 # pylint: disable=cell-var-from-loop - - sgd = gradient_descent.SGD(3.0) - grads_and_vars = 
sgd._compute_gradients(loss, [var0, var1]) - # Convert gradients to tf.Variables - converted_grads = [ - tf.Variable( - tf.zeros([2], dtype), name='c_%d_%d' % (i, j)) - for j, gv in enumerate(grads_and_vars) - ] - convert_ops = [ - tf.compat.v1.assign(converted_grads[j], gv[0]) - for j, gv in enumerate(grads_and_vars) - ] - - # Run convert_ops to achieve the gradients converting - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(convert_ops) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 1 step of sgd through optimizer - converted_grads_and_vars = list(zip(converted_grads, [var0, var1])) - opt_op = sgd.apply_gradients(converted_grads_and_vars) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(convert_ops) - self.evaluate(opt_op) - - # Validate updated params - self.assertAllClose([-14., -13.], self.evaluate(var0)) - self.assertAllClose([-6., -5.], self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testComputeGradientsWithTensors(self): - with test_utils.use_gpu(): - x = tf.convert_to_tensor(1.0) - - def f(): - return x * x - - sgd = gradient_descent.SGD(3.0) - grads_and_vars = sgd._compute_gradients(f, [x]) - self.assertLen(grads_and_vars, 1) - grad, x_as_var = grads_and_vars[0] - self.assertIs(x, x_as_var) - self.assertEqual(2.0, self.evaluate(grad)) - - with self.assertRaises(NotImplementedError): - sgd.apply_gradients(grads_and_vars) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testConstraint(self): - constraint_01 = lambda x: tf.clip_by_value(x, -0.1, 0.) - constraint_0 = lambda x: tf.clip_by_value(x, 0., 1.) - with test_utils.use_gpu(): - var0 = tf.Variable([1.0, 2.0], constraint=constraint_01) - var1 = tf.Variable([3.0, 4.0], constraint=constraint_0) - loss = lambda: 5 * var0 + 3 * var1 - sgd = gradient_descent.SGD(3.0) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Run 1 step of sgd through optimizer - opt_op = sgd.minimize(loss, var_list=[var0, var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - # Validate updated params - self.assertAllClose([-0.1, -0.1], self.evaluate(var0)) - self.assertAllClose([0., 0.], self.evaluate(var1)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testIterationWithoutMinimize(self): - with test_utils.use_gpu(): - sgd = gradient_descent.SGD(3.0) - self.evaluate(sgd.iterations.initializer) - self.assertEqual(0, self.evaluate(sgd.iterations)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testConfig(self): - with test_utils.use_gpu(): - opt = gradient_descent.SGD(learning_rate=1.0) - config = opt.get_config() - opt2 = gradient_descent.SGD.from_config(config) - lr = opt._get_hyper('learning_rate') - lr2 = opt2._get_hyper('learning_rate') - self.evaluate(tf.compat.v1.global_variables_initializer()) - # assert both are equal float values. - self.assertEqual(self.evaluate(lr), self.evaluate(lr2)) - var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32) - loss = lambda: 3 * var0 - # learning rate variable created when calling minimize. 
- opt.minimize(loss, [var0]) - opt3 = gradient_descent.SGD.from_config(config) - lr3 = opt3._get_hyper('learning_rate') - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual(self.evaluate(lr), self.evaluate(lr3)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testConfigWithLearningRateDecay(self): - with test_utils.use_gpu(): - var0 = tf.Variable([[1.0], [2.0]], dtype=tf.float32) - for decay_schedule in [ - learning_rate_schedule.InverseTimeDecay( - 0.5, decay_steps=1.0, decay_rate=0.1), - learning_rate_schedule.PiecewiseConstantDecay( - [5], [1., .5]) - ]: - step = 10 - opt = gradient_descent.SGD(decay_schedule) - config = opt.get_config() - opt2 = gradient_descent.SGD.from_config(config) - # assert both are equal float values. - self.assertAllEqual( - decay_schedule(step), - opt._get_hyper('learning_rate')(step)) - self.assertAllEqual( - decay_schedule(step), - opt2._get_hyper('learning_rate')(step)) - loss = lambda: 3 * var0 - # learning rate variable is created when calling minimize. - opt.minimize(loss, [var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - config = opt.get_config() - opt3 = gradient_descent.SGD.from_config(config) - self.assertAllEqual( - self.evaluate(opt._get_hyper('learning_rate')(step)), - opt3._get_hyper('learning_rate')(step)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testGradClipValue(self): - with test_utils.use_gpu(): - var = tf.Variable([1.0, 2.0]) - loss = lambda: 3 * var - opt = gradient_descent.SGD(learning_rate=1.0, clipvalue=1.0) - opt_op = opt.minimize(loss, [var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - self.assertAllClose([0., 1.], self.evaluate(var)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testGradClipNorm(self): - with test_utils.use_gpu(): - var = tf.Variable([1.0]) - loss = lambda: 3 * var - opt = gradient_descent.SGD(learning_rate=1.0, clipnorm=1.0) - opt_op = opt.minimize(loss, [var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - self.assertAllClose([0.], self.evaluate(var)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testGradGlobalClipNorm(self): - with test_utils.use_gpu(): - # l2 norm is 5.0 - var1 = tf.Variable([1.0]) - var2 = tf.Variable([2.0]) - loss = lambda: 3 * var1 + 4 * var2 - opt = gradient_descent.SGD(learning_rate=1.0, global_clipnorm=2.0) - opt_op = opt.minimize(loss, [var1, var2]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - # grad1 = 3.0 * 2.0 / 5.0 = 1.2 - self.assertAllClose([-.2], self.evaluate(var1)) - # grad2 = 4.0 * 2.0 / 5.0 = 1.6 - self.assertAllClose([.4], self.evaluate(var2)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testInvalidClipNorm(self): - with self.assertRaisesRegex(ValueError, '>= 0'): - gradient_descent.SGD(learning_rate=1.0, clipnorm=-1.0) - - @test_combinations.generate( - test_combinations.combine( - mode=['graph', 'eager'], - clip_type=['clipnorm', 'global_clipnorm', 'clipvalue'])) - def testConfigWithCliping(self, clip_type): - opt = gradient_descent.SGD(learning_rate=1.0, **{clip_type: 2.0}) - config = opt.get_config() - opt = gradient_descent.SGD.from_config(config) - self.assertEqual(getattr(opt, clip_type), 2.0) - - @test_combinations.generate( - 
test_combinations.combine(mode=['graph', 'eager'])) - def testInvalidKwargs(self): - with self.assertRaisesRegex(TypeError, 'Unexpected keyword argument'): - gradient_descent.SGD(learning_rate=1.0, invalidkwargs=1.0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testWeights(self): - with test_utils.use_gpu(): - opt1 = adam.Adam(learning_rate=1.0) - var1 = tf.Variable([1.0, 2.0], dtype=tf.float32) - loss1 = lambda: 3 * var1 - opt_op_1 = opt1.minimize(loss1, [var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - config = opt1.get_config() - opt2 = adam.Adam.from_config(config) - var2 = tf.Variable([1.0, 2.0], dtype=tf.float32) - loss2 = lambda: 3 * var2 - opt_op_2 = opt2.minimize(loss2, [var2]) - weights = opt1.get_weights() - - # Assert set_weights and both variables get updated to same value. - self.evaluate(tf.compat.v1.global_variables_initializer()) - opt2.set_weights(weights) - self.evaluate([opt_op_1, opt_op_2]) - self.assertAllClose(self.evaluate(var1), self.evaluate(var2)) - self.assertEqual(1, self.evaluate(opt1.iterations)) - self.assertEqual(1, self.evaluate(opt2.iterations)) - - var3 = tf.Variable([1.0, 2.0, 3.0], dtype=tf.float32) - var4 = tf.Variable([4.0, 5.0, 6.0], dtype=tf.float32) - loss3 = lambda: 3 * var3 + 5 * var4 - opt_op_3 = opt1.minimize(loss3, [var3, var4]) - - # Assert set_weights with ValueError since weight list does not match. - self.evaluate(tf.compat.v1.global_variables_initializer()) - weights = opt1.get_weights() - with self.assertRaisesRegex(ValueError, 'but the optimizer was'): - opt2.set_weights(weights) - - # Assert set_weights and variables get updated to same value. - var5 = tf.Variable([1.0, 2.0, 3.0], dtype=tf.float32) - var6 = tf.Variable([4.0, 5.0, 6.0], dtype=tf.float32) - loss4 = lambda: 3 * var5 + 5 * var6 - opt_op_4 = opt2.minimize(loss4, [var5, var6]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - opt2.set_weights(weights) - self.evaluate([opt_op_3, opt_op_4]) - self.assertAllClose( - self.evaluate([var3, var4]), self.evaluate([var5, var6])) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testGettingHyperParameters(self): - with self.test_session(): - opt = adam.Adam(learning_rate=1.0) - var = tf.Variable([1.0, 2.0], dtype=tf.float32) - loss = lambda: 3 * var - opt_op = opt.minimize(loss, [var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - - lr = self.evaluate(opt.lr) - self.assertEqual(1.0, lr) - - opt.lr = 2.0 - lr = self.evaluate(opt.lr) - self.assertEqual(2.0, lr) - - self.evaluate(opt.lr.assign(3.0)) - lr = self.evaluate(opt.lr) - self.assertEqual(3.0, lr) - - with self.assertRaises(AttributeError): - opt.not_an_attr += 3 - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testGettingHyperParametersWithLrInConstructor(self): - with self.test_session(): - opt = gradient_descent.SGD(lr=3.0) - var = tf.Variable([1.0, 2.0], dtype=tf.float32) - loss = lambda: 3 * var - opt_op = opt.minimize(loss, [var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt.learning_rate, tf.Variable) - - lr = self.evaluate(opt.lr) - self.assertEqual(3.0, lr) - - opt.lr = 2.0 - lr = self.evaluate(opt.lr) - self.assertEqual(2.0, lr) - - self.evaluate(opt.lr.assign(4.0)) - lr = self.evaluate(opt.lr) - self.assertEqual(4.0, lr) - - 
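The clipping expectations above are plain arithmetic: SGD applies var -= lr * grad (which is also where the earlier [-14., -13.] values come from: 1 - 3 * 5 = -14), and `global_clipnorm` first rescales every gradient by clip / max(norm, clip). A minimal NumPy sketch reusing the constants from testGradGlobalClipNorm (just the math, not the Keras API):

import numpy as np

# Gradients of 3 * var1 + 4 * var2; their global l2 norm is sqrt(9 + 16) = 5.
lr, global_clipnorm = 1.0, 2.0
var1, var2 = np.array([1.0]), np.array([2.0])
grad1, grad2 = np.array([3.0]), np.array([4.0])

global_norm = np.sqrt((grad1 ** 2).sum() + (grad2 ** 2).sum())  # 5.0
scale = global_clipnorm / max(global_norm, global_clipnorm)     # 2 / 5 = 0.4
var1 -= lr * grad1 * scale   # 1.0 - 1.2 = -0.2
var2 -= lr * grad2 * scale   # 2.0 - 1.6 =  0.4
print(var1, var2)            # [-0.2] [0.4], the values the test asserts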
@test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testDir(self): - opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.1) - dir_result = set(dir(opt)) - self.assertIn('learning_rate', dir_result) # Hyperparameter - self.assertIn('lr', dir_result) # Hyperparameter - self.assertIn('momentum', dir_result) # Hyperparameter - self.assertIn('nesterov', dir_result) # Attribute - self.assertIn('minimize', dir_result) # Attribute - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testOptimizerWithKerasModel(self): - a = input_layer.Input(shape=(3,), name='input_a') - b = input_layer.Input(shape=(3,), name='input_b') - - dense = core.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = regularization.Dropout(0.5, name='dropout')(c) - - model = training.Model([a, b], [d, e]) - - optimizer = gradient_descent.SGD(learning_rate=0.001) - loss = 'mse' - model.compile(optimizer, loss, metrics=['mae']) - - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - - output_d_np = np.random.random((10, 4)) - output_e_np = np.random.random((10, 4)) - - model.fit([input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - batch_size=5) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testOptimizerWithCallbacks(self): - np.random.seed(1331) - input_np = np.random.random((10, 3)) - output_np = np.random.random((10, 4)) - a = input_layer.Input(shape=(3,), name='input_a') - model = sequential.Sequential() - model.add(core.Dense(4, kernel_initializer='zeros', name='dense')) - model.add(regularization.Dropout(0.5, name='dropout')) - model(a) - optimizer = gradient_descent.SGD(learning_rate=0.1) - model.compile(optimizer, loss='mse', metrics=['mae']) - # This does not reduce the LR after the first epoch (due to low delta). - cbks = [ - callbacks.ReduceLROnPlateau( - monitor='val_loss', factor=0.1, min_delta=0, patience=1, cooldown=5) - ] - model.fit( - input_np, - output_np, - batch_size=10, - validation_data=(input_np, output_np), - callbacks=cbks, - epochs=2, - verbose=0) - self.assertAllClose( - float(backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4) - - # This should reduce the LR after the first epoch (due to high delta). 
- cbks = [ - callbacks.ReduceLROnPlateau( - monitor='val_loss', - factor=0.1, - min_delta=10, - patience=1, - cooldown=5) - ] - model.fit( - input_np, - output_np, - batch_size=10, - validation_data=(input_np, output_np), - callbacks=cbks, - epochs=2, - verbose=2) - self.assertAllClose( - float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4) - - def testOptimizerSetIterations(self): - global_step = tf.compat.v1.train.get_or_create_global_step() - opt = adam.Adam(learning_rate=1.0) - opt.iterations = global_step - var = tf.Variable([1.0, 2.0], dtype=tf.float32) - self.evaluate(tf.compat.v1.global_variables_initializer()) - init_step_value = self.evaluate(global_step) - loss = lambda: 3 * var - opt_op = opt.minimize(loss, [var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - new_step_value = self.evaluate(global_step) - self.assertEqual(new_step_value, init_step_value + 1) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testOptimizerWithCallableVarList(self): - train_samples = 20 - input_dim = 1 - num_classes = 2 - (x, y), _ = test_utils.get_test_data( - train_samples=train_samples, - test_samples=10, - input_shape=(input_dim,), - num_classes=num_classes) - y = np_utils.to_categorical(y) - - num_hidden = 1 - model = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes) - opt = adam.Adam() - - loss = lambda: losses.mean_squared_error(model(x), y) - var_list = lambda: model.trainable_weights - - with self.assertRaisesRegex( - ValueError, 'Weights for model .* have not yet been created'): - var_list() - train_op = opt.minimize(loss, var_list) - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertEqual( - [[0.]], self.evaluate(opt.get_slot(var_list()[0], 'm'))) - self.evaluate(train_op) - self.assertNotEqual( - [[0.]], self.evaluate(opt.get_slot(var_list()[0], 'm'))) - self.assertLen(var_list(), 4) - - def testVarKey(self): - with tf.compat.v1.get_default_graph().as_default(): - a = tf.Variable([1., 2.], name='var') - b = tf.Variable([1.], name='var') - self.assertTrue(a._in_graph_mode) - self.assertTrue(b._in_graph_mode) - var_key = optimizer_v2._var_key(a) - self.assertEqual('var', var_key) - var_key = optimizer_v2._var_key(b) - self.assertEqual('var_1', var_key) - - def testVarName(self): - with tf.compat.v1.get_default_graph().as_default(): - var = tf.Variable([1., 2.], name='var') - loss = var + 1. - opt = adam.Adam() - opt.get_updates(loss, [var]) - opt_vars = opt.variables() - self.assertLen(opt_vars, 3) - self.assertEqual('Adam/iter:0', opt_vars[0].name) - self.assertEqual('Adam/var/m:0', opt_vars[1].name) - var_2 = tf.Variable([1., 2.], name='var_2') - loss = var_2 + 1. - with backend.name_scope('outter'): - opt.get_updates(loss, [var_2]) - opt_vars = opt.variables() - self.assertLen(opt_vars, 5) - self.assertEqual('outter/Adam/var_2/m:0', opt_vars[3].name) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testEmptyVarList(self): - opt = gradient_descent.SGD(1.) - opt.minimize(lambda: tf.constant(1.), []) - opt.apply_gradients([]) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testAggregationTrue(self): - # Test that experimental_aggregate_gradients=True works without distributed - # strategy. 
- var = tf.Variable([1., 2.]) - opt = gradient_descent.SGD(3.0) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose([1., 2.], self.evaluate(var)) - opt_op = opt.apply_gradients([([0.1, 0.1], var)], - experimental_aggregate_gradients=True) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - self.assertAllClose([0.7, 1.7], self.evaluate(var)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def testAggregationFalse(self): - # Test that experimental_aggregate_gradients=False works without distributed - # strategy. - var = tf.Variable([1., 2.]) - opt = gradient_descent.SGD(3.0) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose([1., 2.], self.evaluate(var)) - opt_op = opt.apply_gradients([([0.1, 0.1], var)], - experimental_aggregate_gradients=False) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - self.assertAllClose([0.7, 1.7], self.evaluate(var)) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testRestoringIterationsWithoutAnOptimizer(self): - opt = gradient_descent.SGD(3.0) - opt.iterations.assign(5) - checkpoint = tf.train.Checkpoint(optimizer=opt) - path = checkpoint.save(self.get_temp_dir()) - - # Following verifies that the `iterations` can be restored with the absence - # of an `Optimizer` object (using a `Checkpoint` as a placeholder). - iterations_var = tf.Variable(0, dtype=tf.int64) - optimizer_checkpoint = tf.train.Checkpoint(iter=iterations_var) - checkpoint_to_restore = tf.train.Checkpoint( - optimizer=optimizer_checkpoint) - checkpoint_to_restore.restore(path) - - self.assertEqual(5, self.evaluate(iterations_var)) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def testSlotWithNonstandardShapeRestoresBasedOnCheckpoint(self): - # First create an optimizer and a slot variable with a non-standard shape. - x = tf.Variable([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32) - slot_shape = [2, 1] - optimizer_1 = optimizer_v2.OptimizerV2(name='test') - optimizer_1.add_slot(x, 'test_slot', 'ones', shape=slot_shape) - - # Then save the variable and optimizer to a checkpoint. - checkpoint_1 = tf.train.Checkpoint(var=x, optimizer=optimizer_1) - checkpoint_path = checkpoint_1.save(self.get_temp_dir()) - - # Create a new optimizer and call restore on it (and x) - optimizer_2 = optimizer_v2.OptimizerV2(name='test') - checkpoint_2 = tf.train.Checkpoint(var=x, optimizer=optimizer_2) - checkpoint_2.restore(checkpoint_path) - - self.assertEqual(slot_shape, - optimizer_2.get_slot(x, 'test_slot').shape.as_list()) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_gradient_aggregator(self): - def gradient_aggregator(grads_and_vars): - # Simulate an all-reduce where the other replica has zeros for gradients, - # by dividing each gradient by 2. 
- grads = [g for g, _ in grads_and_vars] - vars = [v for _, v in grads_and_vars] # pylint: disable=redefined-builtin - all_reduced_grads = [g / 2 for g in grads] - return list(zip(all_reduced_grads, vars)) - - var = tf.Variable(2.0) - sgd = gradient_descent.SGD(1.0, gradient_aggregator=gradient_aggregator) - loss = lambda: 2 * var - opt_op = sgd.minimize(loss, var_list=[var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - self.assertEqual(self.evaluate(var), 1.0) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_override_aggregate_gradients(self): - class MyOptimizer(gradient_descent.SGD): - - def _aggregate_gradients(self, grads_and_vars): - # Simulate an all-reduce where the other replica has zeros for - # gradients, by dividing each gradient by 2. - grads = [g for g, _ in grads_and_vars] - vars = [v for _, v in grads_and_vars] # pylint: disable=redefined-builtin - all_reduced_grads = [g / 2 for g in grads] - return list(zip(all_reduced_grads, vars)) - - var = tf.Variable(2.0) - sgd = MyOptimizer(1.0) - loss = lambda: 2 * var - opt_op = sgd.minimize(loss, var_list=[var]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - self.assertEqual(self.evaluate(var), 1.0) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_create_slots_for_sharded_variables(self): - # set names so that ShardedVariable is well-named for slot variable keying. - var_a = tf.Variable([1.0], name='part_0') - var_b = tf.Variable([2.0], name='part_1') - sharded_var = tf.__internal__.distribute.ShardedVariable([var_a, var_b]) - - opt = adagrad.Adagrad() - opt._create_slots(sharded_var.variables) - opt._create_slots_for_sharded_variables(sharded_var.variables) - - sharded_slot = opt.get_slot(sharded_var, 'accumulator') - self.assertIsInstance( - sharded_slot, tf.__internal__.distribute.ShardedVariable) - - slot_a = opt.get_slot(var_a, 'accumulator') - self.assertAllClose(sharded_slot.variables[0], slot_a) - slot_b = opt.get_slot(var_b, 'accumulator') - self.assertAllClose(sharded_slot.variables[1], slot_b) - - -@test_combinations.run_all_keras_modes -class OptimizersCompatibilityTest(test_combinations.TestCase): - - def _testOptimizersCompatibility(self, opt_v1, opt_v2, test_weights=True): - if tf.executing_eagerly(): - self.skipTest( - 'v1 optimizer does not run in eager mode') - np.random.seed(1331) - with test_utils.use_gpu(): - train_samples = 20 - input_dim = 3 - num_classes = 2 - (x, y), _ = test_utils.get_test_data( - train_samples=train_samples, - test_samples=10, - input_shape=(input_dim,), - num_classes=num_classes) - y = np_utils.to_categorical(y) - - num_hidden = 5 - model_v1 = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_v1.compile( - opt_v1, - loss='categorical_crossentropy', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - model_v1.fit(x, y, batch_size=5, epochs=1) - - model_v2 = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_v2.set_weights(model_v1.get_weights()) - model_v2.compile( - opt_v2, - loss='categorical_crossentropy', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - if not tf.compat.v1.executing_eagerly_outside_functions(): - model_v2._make_train_function() - if test_weights: - opt_v2.set_weights(opt_v1.get_weights()) - - hist_1 = model_v1.fit(x, y, batch_size=5, epochs=1, 
shuffle=False) - hist_2 = model_v2.fit(x, y, batch_size=5, epochs=1, shuffle=False) - self.assertAllClose(model_v1.get_weights(), model_v2.get_weights(), - rtol=1e-5, atol=1e-5) - self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'], - rtol=1e-5, atol=1e-5) - - def testAdadeltaCompatibility(self): - opt_v1 = optimizer_v1.Adadelta(lr=0.01) - opt_v2 = adadelta.Adadelta(learning_rate=0.01) - self._testOptimizersCompatibility(opt_v1, opt_v2) - - def testAdagradCompatibility(self): - opt_v1 = optimizer_v1.Adagrad(lr=0.01) - opt_v2 = adagrad.Adagrad(learning_rate=0.01) - self._testOptimizersCompatibility(opt_v1, opt_v2) - - def testAdamCompatibility(self): - opt_v1 = optimizer_v1.Adam() - opt_v2 = adam.Adam() - self._testOptimizersCompatibility(opt_v1, opt_v2) - - def testAdamaxCompatibility(self): - opt_v1 = optimizer_v1.Adamax(lr=0.01) - opt_v2 = adamax.Adamax(learning_rate=0.01) - self._testOptimizersCompatibility(opt_v1, opt_v2) - - def testNadamCompatibility(self): - opt_v1 = optimizer_v1.Nadam(lr=0.001) - opt_v2 = nadam.Nadam(learning_rate=0.001) - self._testOptimizersCompatibility(opt_v1, opt_v2) - - def testMomentumCompatibility(self): - opt_v1 = optimizer_v1.SGD(lr=0.01, momentum=0.9) - opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9) - self._testOptimizersCompatibility(opt_v1, opt_v2) - - def testRMSpropCompatibility(self): - opt_v1 = optimizer_v1.RMSprop() - opt_v2 = rmsprop.RMSprop() - self._testOptimizersCompatibility(opt_v1, opt_v2) - - def testSGDCompatibility(self): - opt_v1 = optimizer_v1.SGD(lr=0.01) - opt_v2 = gradient_descent.SGD(learning_rate=0.01) - self._testOptimizersCompatibility(opt_v1, opt_v2, False) - - def testNumericEquivalenceForNesterovMomentum(self): - if tf.executing_eagerly(): - self.skipTest( - 'v1 optimizer does not run in eager mode') - np.random.seed(1331) - with test_utils.use_gpu(): - train_samples = 20 - input_dim = 3 - num_classes = 2 - (x, y), _ = test_utils.get_test_data( - train_samples=train_samples, - test_samples=10, - input_shape=(input_dim,), - num_classes=num_classes) - y = np_utils.to_categorical(y) - - num_hidden = 5 - model_k_v1 = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_k_v2 = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_k_v2.set_weights(model_k_v1.get_weights()) - model_tf = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_tf.set_weights(model_k_v2.get_weights()) - - opt_k_v1 = optimizer_v1.SGD(momentum=0.9, nesterov=True) - opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True) - opt_tf = tf.compat.v1.train.MomentumOptimizer( - learning_rate=0.01, momentum=0.9, use_nesterov=True) - - model_k_v1.compile( - opt_k_v1, - loss='categorical_crossentropy', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - model_k_v2.compile( - opt_k_v2, - loss='categorical_crossentropy', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - model_tf.compile( - opt_tf, - loss='categorical_crossentropy', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - - hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False) - hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False) - hist_tf = model_tf.fit(x, y, batch_size=5, epochs=10, shuffle=False) - - self.assertAllClose(model_k_v1.get_weights(), model_tf.get_weights()) - 
self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights()) - self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights()) - self.assertAllClose(hist_k_v1.history['loss'], hist_tf.history['loss']) - self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss']) - - def testNumericEquivalenceForAmsgrad(self): - if tf.executing_eagerly(): - self.skipTest( - 'v1 optimizer does not run in eager mode') - np.random.seed(1331) - with test_utils.use_gpu(): - train_samples = 20 - input_dim = 3 - num_classes = 2 - (x, y), _ = test_utils.get_test_data( - train_samples=train_samples, - test_samples=10, - input_shape=(input_dim,), - num_classes=num_classes) - y = np_utils.to_categorical(y) - - num_hidden = 5 - model_k_v1 = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_k_v2 = test_utils.get_small_sequential_mlp( - num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_k_v2.set_weights(model_k_v1.get_weights()) - - opt_k_v1 = optimizer_v1.Adam(amsgrad=True) - opt_k_v2 = adam.Adam(amsgrad=True) - - model_k_v1.compile( - opt_k_v1, - loss='categorical_crossentropy', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - model_k_v2.compile( - opt_k_v2, - loss='categorical_crossentropy', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - - hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False) - hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False) - - self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights()) - self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights()) - self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss']) - - -# Note: These tests are kept in a separate class to avoid bugs in some -# distributions of Python that break AutoGraph which is used by tf.function. 
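For testNumericEquivalenceForNesterovMomentum above, all three optimizers should realize the same update rule, which the Keras SGD documentation gives for nesterov=True as velocity = momentum * velocity - lr * g followed by w = w + momentum * velocity - lr * g. A one-step NumPy sketch with illustrative constants (not taken from the test):

import numpy as np

# One Nesterov-momentum step, following the rule documented for
# tf.keras SGD(nesterov=True). All constants here are illustrative.
lr, momentum = 0.01, 0.9
w = np.array([1.0, 2.0])
v = np.zeros_like(w)             # the "momentum" slot, initially zero
g = np.array([0.1, 0.2])         # a made-up gradient

v = momentum * v - lr * g        # velocity update: [-0.001, -0.002]
w = w + momentum * v - lr * g    # look-ahead step
print(w)                         # [0.9981 1.9962]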
-@test_combinations.generate(test_combinations.combine(mode=['eager'])) -class OptimizerWithFunctionTest(tf.test.TestCase, parameterized.TestCase): - - def testBasic(self): - var = tf.Variable([1.0, 2.0], dtype=tf.float32) - loss = lambda: 3 * var - opt = adam.Adam(learning_rate=1.0) - - @tf.function - def fn(): - opt.minimize(loss, [var]) - return var - - self.assertAllClose([0., 1.], fn(), atol=1e-4) - self.assertAllClose([-1, 0.], fn(), atol=1e-4) - - def testBasicWithConstantDecay(self): - var = tf.Variable([1.0, 2.0], dtype=tf.float32) - loss = lambda: 3 * var - opt = adam.Adam(learning_rate=1.0) - - @tf.function - def fn(): - opt.minimize(loss, [var]) - return var - - self.assertAllClose([0., 1.], fn(), atol=1e-4) - self.assertAllClose([-1, 0.], fn(), atol=1e-4) - - def testVarKeyWithVarCreatedInEager(self): - a = tf.Variable([1., 2.], name='var') - b = tf.Variable([1.], name='var') - - @tf_test_utils.also_run_as_tf_function - def var_key_test(): - self.assertFalse(a._in_graph_mode) - self.assertFalse(b._in_graph_mode) - var_key_a = optimizer_v2._var_key(a) - self.assertStartsWith(var_key_a, 'var_') - var_key_b = optimizer_v2._var_key(b) - self.assertStartsWith(var_key_b, 'var_') - self.assertNotEqual(var_key_a, var_key_b) - - var_key_test() - - def testLearningRateDecayUsedInTwoFunctions(self): - a = tf.Variable([1., 2.], name='var') - b = tf.Variable([1.], name='var') - - learning_rate_decay = learning_rate_schedule.InverseTimeDecay( - 0.5, decay_steps=1.0, decay_rate=0.5) - opt = adam.Adam(learning_rate=learning_rate_decay) - loss_a = lambda: 3 * a - loss_b = lambda: 2 * b - - @tf.function - def fn_a(): - opt.minimize(loss_a, [a]) - return a - - @tf.function - def fn_b(): - opt.minimize(loss_b, [b]) - return b - - fn_a() - fn_b() - - -_NUM_LEARNERS = 50 -APPLY_SCOPE = 'debug_apply' -ALLOWLIST = [ - # optimizer_v2._deduplicate_indexed_slices contains an indexed slice: - # array_ops.shape(unique_indices)[0] - # which winds up expanding to [0:1:1] thereby creating three constants - # to represent the indices. - ('embeddings/strided_slice/stack', 'Const'), -] - - -def get_inputs(op): - op_inputs = list(op.inputs) + op.control_inputs - names = [i.name for i in op_inputs] - op_inputs = [getattr(i, 'op', i) for i in op_inputs] - return op_inputs, names - - -def strip_name(node): - if 'Placeholder' in node.op: - return - node.name = '' - - -def topological_sort(graph): - graph_ops = graph.get_operations() - - sources = [] - result = [] - - inputs = {} - outputs = collections.defaultdict(set) - for op in graph_ops: - op_inputs = get_inputs(op)[0] - if not op_inputs: - sources.append(op) - - inputs[op] = set(op_inputs) - for i in op_inputs: - outputs[i].add(op) - - while sources: - op = sources.pop() - for op_output in outputs[op]: - inputs[op_output].remove(op) - if not inputs[op_output]: - sources.append(op_output) - - result.append(op) - - # Check correctness. - if len(result) != len(graph_ops): - raise ValueError('Sort result has {} ops, source graph has {}.' - .format(len(result), len(graph_ops))) - - sort_check_seen = set() - for op in result: - sort_check_seen.add(op) - for i in get_inputs(op)[0]: - assert i in sort_check_seen - - return result - - -def identify_redundant_ops(graph): - """Implements basic common subexpression elimination. - - This is not intended to replicate the graph semantics of TensorFlow Graphs - (for instance it does not handle stateful op ordering), nor is it intended to - replace the common subexpression elimination Grappler pass. 
Rather, it - provides a high level sanity check that clearly redundant ops are not being - created. - - Args: - graph: The graph to be analyzed. - - Returns: - A count of the duplicate ops and a description of the structure of each. - """ - sorted_ops = topological_sort(graph) - duplicates = collections.defaultdict(list) - unified_node_defs = {} - name_map = {} - - for op in sorted_ops: - input_names = [] - for op_input, name in zip(*get_inputs(op)): - input_def = op_input.node_def - - # Operations can have multiple outputs. We track which is used to prevent - # overzealous elimination. - input_def.name = name - - input_def.input[:] = [name_map.get(i, i) for i in input_def.input] - strip_name(input_def) - - # NodeDef.SerializeToString() does not provide identical serialized - # representations for identical NodeDefs, so we instead use string - # representation as a dict key. - key = repr(input_def) - - if key in unified_node_defs: - input_names.append(unified_node_defs[key]) - - else: - unified_node_defs[key] = op_input.name - input_names.append(name) - - node_def = op.node_def - node_def.input[:] = input_names - strip_name(node_def) - - key = repr(node_def) - duplicates[key].append(op) - name_map[op.name] = duplicates[key][0].name - - num_duplicates = 0 - duplicate_types = [] - for standard_def, op_defs in duplicates.items(): - # We are only interested in testing the apply method of the optimizer - op_defs = [i for i in op_defs if APPLY_SCOPE in i.name] - - # We only check for per-apply redundant ops. - if len(op_defs) < _NUM_LEARNERS: - continue - - # Certain ops are simply not worth eliminating, and are instead simply - # ignored. - name, op_type = op_defs[0].name, op_defs[0].type - if any(allowlisted_scope in name and op_type == allowlisted_type - for allowlisted_scope, allowlisted_type in ALLOWLIST): - continue - - num_duplicates += len(op_defs) - traceback = [] - for level in op_defs[0].traceback: - traceback.append(' {} {}:{}'.format(level[0], level[2], level[1])) - - duplicate_types.append( - '# Example name: {}\n# Op creation stack:\n{}\n{}'.format( - op_defs[0].name, - '\n'.join(traceback), - standard_def)) - - return num_duplicates, duplicate_types - - -def make_model(): - r"""Constructs a simple ensemble of weak learners model. - - --------- --------- --------- --------- - | Input | | Input | ... | Input | | Input | - --------- --------- --------- --------- - | | | | - V V V V - --------- --------- --------- --------- - | Embed | | Embed | ... | Embed | | Embed | - --------- --------- --------- --------- - | | | | - V V V V - --------- --------- --------- --------- - | Dense | | Dense | ... | Dense | | Dense | - --------- --------- --------- --------- - \ | | / - \ | | / - --------------------------------------------- - | - --------- - | Dense | - --------- - - This topology is chosen because it exercises both dense and sparse update - paths. - - Returns: - A model for testing optimizer coefficient reuse. 
- """ - inputs = [] - intermediates = [] - for _ in range(_NUM_LEARNERS): - inp = keras.layers.Input(shape=(1,), dtype=tf.int32) - layer = keras.layers.Embedding(1, 4)(inp) - layer = keras.layers.Dense(1)(layer) - - inputs.append(inp) - intermediates.append(layer) - - layer = keras.layers.Concatenate(axis=-1)(intermediates) - layer = keras.layers.Dense(1)(layer) - - return keras.models.Model(inputs, layer) - - -COEFFICIENT_PARAMS = ( - ('Adadelta', adadelta.Adadelta, None), - ('Adagrad', adagrad.Adagrad, None), - ('Adam', adam.Adam, None), - ('Adam_amdgrad', adam.Adam, dict(amsgrad=True)), - ('Adamax', adamax.Adamax, None), - ('Ftrl', ftrl.Ftrl, None), - ('Ftrl_l2_shrinkage', ftrl.Ftrl, - dict(l2_shrinkage_regularization_strength=0.1)), - ('SGD', gradient_descent.SGD, None), - ('SGD_momentum', gradient_descent.SGD, dict(momentum=0.5)), - ('Nadam', nadam.Nadam, None), - ('RMSprop', rmsprop.RMSprop, None), - ('RMSprop_centered', rmsprop.RMSprop, dict(centered=True)), - ('RMSprop_momentum', rmsprop.RMSprop, dict(momentum=0.5)), - ('RMSprop_momentum_centered', rmsprop.RMSprop, - dict(momentum=0.5, centered=True)), -) - - -class OptimizerCoefficientTest(test_combinations.TestCase): - - @parameterized.named_parameters(*COEFFICIENT_PARAMS) - def test_duplicate_ops(self, optimizer_class, init_kwargs=None): - init_kwargs = init_kwargs or {} - optimizer = optimizer_class(**init_kwargs) - - graph = tf.Graph() - with graph.as_default(): - model = make_model() - trainable_variables = model.trainable_variables - grads = optimizer.get_gradients(model.outputs[0], trainable_variables) - - with backend.name_scope(APPLY_SCOPE): - optimizer.apply_gradients(zip(grads, trainable_variables)) - - num_duplicates, duplicate_types = identify_redundant_ops(graph) - if num_duplicates: - # Avoid spamming logs. - if len(duplicate_types) > 3: - duplicate_types = duplicate_types[:3] + ['...'] - - num_total = len(graph.get_operations()) - raise ValueError('{} of {} ({:.1f}%) ops were duplicates:\n\n{}'.format( - num_duplicates, num_total, num_duplicates / num_total * 100, - '\n'.join(duplicate_types))) - - @parameterized.named_parameters(*COEFFICIENT_PARAMS) - def test_subclass_compat(self, optimizer_class, init_kwargs=None): - """Ensure that subclassed optimizers without apply_state still work.""" - - class SubclassedOptimizer(optimizer_class): - - def _resource_apply_dense(self, grad, var): # pylint: disable=useless-super-delegation - return super()._resource_apply_dense(grad, var) - - def _resource_apply_sparse(self, grad, var, indices): # pylint: disable=useless-super-delegation - return super()._resource_apply_sparse( - grad, var, indices) - - init_kwargs = init_kwargs or {} - optimizer = SubclassedOptimizer(**init_kwargs) - - graph = tf.Graph() - with graph.as_default(): - model = make_model() - trainable_variables = model.trainable_variables - grads = optimizer.get_gradients(model.outputs[0], trainable_variables) - - with backend.name_scope(APPLY_SCOPE): - optimizer.apply_gradients(zip(grads, trainable_variables)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/rmsprop.py b/keras/optimizers/optimizer_v2/rmsprop.py deleted file mode 100644 index c3c7fbd52bd9..000000000000 --- a/keras/optimizers/optimizer_v2/rmsprop.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
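identify_redundant_ops above flags duplicates by canonicalizing every NodeDef (names stripped, inputs rewritten to their canonical producers) and using the string form as a dictionary key. A toy sketch of the same keying idea on plain tuples, with hypothetical op names:

import collections

# Two "ops" count as duplicates when their canonicalized, serialized
# forms coincide; the same trick identify_redundant_ops applies to
# NodeDefs via repr(node_def).
ops = [
    ("mul", ("lr", "grad_a")),   # lr * grad_a
    ("mul", ("lr", "grad_a")),   # recomputed again -> redundant
    ("mul", ("lr", "grad_b")),   # different input -> not a duplicate
]

buckets = collections.defaultdict(list)
for i, op in enumerate(ops):
    buckets[repr(op)].append(i)  # stand-in for the stripped-NodeDef key

for key, ids in buckets.items():
    if len(ids) > 1:
        print("duplicate:", key, "at positions", ids)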
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""RMSprop optimizer implementation.""" - -import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes - -import numpy as np -from keras import backend_config -from keras.optimizers.optimizer_v2 import optimizer_v2 -from tensorflow.python.util.tf_export import keras_export - - -# pylint: disable=g-classes-have-attributes -@keras_export("keras.optimizers.RMSprop") -class RMSprop(optimizer_v2.OptimizerV2): - r"""Optimizer that implements the RMSprop algorithm. - - The gist of RMSprop is to: - - - Maintain a moving (discounted) average of the square of gradients - - Divide the gradient by the root of this average - - This implementation of RMSprop uses plain momentum, not Nesterov momentum. - - The centered version additionally maintains a moving average of the - gradients, and uses that average to estimate the variance. - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. - rho: Discounting factor for the history/coming gradient. Defaults to 0.9. - momentum: A scalar or a scalar `Tensor`. Defaults to 0.0. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - centered: Boolean. If `True`, gradients are normalized by the estimated - variance of the gradient; if False, by the uncentered second moment. - Setting this to `True` may help with training, but is slightly more - expensive in terms of computation and memory. Defaults to `False`. - name: Optional name prefix for the operations created when applying - gradients. Defaults to `"RMSprop"`. - **kwargs: keyword arguments. Allowed arguments are `clipvalue`, - `clipnorm`, `global_clipnorm`. - If `clipvalue` (float) is set, the gradient of each weight - is clipped to be no higher than this value. - If `clipnorm` (float) is set, the gradient of each weight - is individually clipped so that its norm is no higher than this value. - If `global_clipnorm` (float) is set the gradient of all weights is - clipped so that their global norm is no higher than this value. - - Note that in the dense implementation of this algorithm, variables and their - corresponding accumulators (momentum, gradient moving average, square - gradient moving average) will be updated even if the gradient is zero - (i.e. accumulators will decay, momentum will be applied). The sparse - implementation (used when the gradient is an `IndexedSlices` object, - typically because of `tf.gather` or an embedding lookup in the forward pass) - will not update variable slices or their accumulators unless those slices - were used in the forward pass (nor is there an "eventual" correction to - account for these omitted updates). 
This leads to more efficient updates for - large embedding lookup tables (where most of the slices are not accessed in - a particular graph execution), but differs from the published algorithm. - - Usage: - - >>> opt = tf.keras.optimizers.RMSprop(learning_rate=0.1) - >>> var1 = tf.Variable(10.0) - >>> loss = lambda: (var1 ** 2) / 2.0 # d(loss) / d(var1) = var1 - >>> step_count = opt.minimize(loss, [var1]).numpy() - >>> var1.numpy() - 9.683772 - - Reference: - - [Hinton, 2012]( - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) - """ - - _HAS_AGGREGATE_GRAD = True - - def __init__(self, - learning_rate=0.001, - rho=0.9, - momentum=0.0, - epsilon=1e-7, - centered=False, - name="RMSprop", - **kwargs): - """Construct a new RMSprop optimizer. - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.001. - rho: Discounting factor for the history/coming gradient. Defaults to 0.9. - momentum: A scalar or a scalar `Tensor`. Defaults to 0.0. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - centered: Boolean. If `True`, gradients are normalized by the estimated - variance of the gradient; if False, by the uncentered second moment. - Setting this to `True` may help with training, but is slightly more - expensive in terms of computation and memory. Defaults to `False`. - name: Optional name prefix for the operations created when applying - gradients. Defaults to "RMSprop". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - - @compatibility(eager) - When eager execution is enabled, `learning_rate`, `decay`, `momentum`, and - `epsilon` can each be a callable that takes no arguments and returns the - actual value to use. This can be useful for changing these values across - different invocations of optimizer functions. - @end_compatibility - """ - super().__init__(name, **kwargs) - self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) - self._set_hyper("decay", self._initial_decay) - self._set_hyper("rho", rho) - - self._momentum = False - if isinstance(momentum, tf.Tensor) or callable(momentum) or momentum > 0: - self._momentum = True - if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1): - raise ValueError(f"`momentum` must be between [0, 1]. 
Received: " - f"momentum={momentum} (of type {type(momentum)}).") - self._set_hyper("momentum", momentum) - - self.epsilon = epsilon or backend_config.epsilon() - self.centered = centered - - def _create_slots(self, var_list): - for var in var_list: - self.add_slot(var, "rms") - if self._momentum: - for var in var_list: - self.add_slot(var, "momentum") - if self.centered: - for var in var_list: - self.add_slot(var, "mg") - - def _prepare_local(self, var_device, var_dtype, apply_state): - super()._prepare_local(var_device, var_dtype, apply_state) - - rho = tf.identity(self._get_hyper("rho", var_dtype)) - apply_state[(var_device, var_dtype)].update( - dict( - neg_lr_t=-apply_state[(var_device, var_dtype)]["lr_t"], - epsilon=tf.convert_to_tensor( - self.epsilon, var_dtype), - rho=rho, - momentum=tf.identity(self._get_hyper("momentum", var_dtype)), - one_minus_rho=1. - rho)) - - def _resource_apply_dense(self, grad, var, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - rms = self.get_slot(var, "rms") - if self._momentum: - mom = self.get_slot(var, "momentum") - if self.centered: - mg = self.get_slot(var, "mg") - return tf.raw_ops.ResourceApplyCenteredRMSProp( - var=var.handle, - mg=mg.handle, - ms=rms.handle, - mom=mom.handle, - lr=coefficients["lr_t"], - rho=coefficients["rho"], - momentum=coefficients["momentum"], - epsilon=coefficients["epsilon"], - grad=grad, - use_locking=self._use_locking) - else: - return tf.raw_ops.ResourceApplyRMSProp( - var=var.handle, - ms=rms.handle, - mom=mom.handle, - lr=coefficients["lr_t"], - rho=coefficients["rho"], - momentum=coefficients["momentum"], - epsilon=coefficients["epsilon"], - grad=grad, - use_locking=self._use_locking) - else: - rms_t = (coefficients["rho"] * rms + - coefficients["one_minus_rho"] * tf.square(grad)) - rms_t = tf.compat.v1.assign(rms, rms_t, use_locking=self._use_locking) - denom_t = rms_t - if self.centered: - mg = self.get_slot(var, "mg") - mg_t = coefficients["rho"] * mg + coefficients["one_minus_rho"] * grad - mg_t = tf.compat.v1.assign(mg, mg_t, use_locking=self._use_locking) - denom_t = rms_t - tf.square(mg_t) - var_t = var - coefficients["lr_t"] * grad / ( - tf.sqrt(denom_t) + coefficients["epsilon"]) - return tf.compat.v1.assign(var, var_t, use_locking=self._use_locking).op - - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - var_device, var_dtype = var.device, var.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) - or self._fallback_apply_state(var_device, var_dtype)) - - rms = self.get_slot(var, "rms") - if self._momentum: - mom = self.get_slot(var, "momentum") - if self.centered: - mg = self.get_slot(var, "mg") - return tf.raw_ops.ResourceSparseApplyCenteredRMSProp( - var=var.handle, - mg=mg.handle, - ms=rms.handle, - mom=mom.handle, - lr=coefficients["lr_t"], - rho=coefficients["rho"], - momentum=coefficients["momentum"], - epsilon=coefficients["epsilon"], - grad=grad, - indices=indices, - use_locking=self._use_locking) - else: - return tf.raw_ops.ResourceSparseApplyRMSProp( - var=var.handle, - ms=rms.handle, - mom=mom.handle, - lr=coefficients["lr_t"], - rho=coefficients["rho"], - momentum=coefficients["momentum"], - epsilon=coefficients["epsilon"], - grad=grad, - indices=indices, - use_locking=self._use_locking) - else: - rms_scaled_g_values = (grad * grad) * coefficients["one_minus_rho"] - rms_t = 
tf.compat.v1.assign(rms, rms * coefficients["rho"], - use_locking=self._use_locking) - with tf.control_dependencies([rms_t]): - rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values) - rms_slice = tf.gather(rms_t, indices) - denom_slice = rms_slice - if self.centered: - mg = self.get_slot(var, "mg") - mg_scaled_g_values = grad * coefficients["one_minus_rho"] - mg_t = tf.compat.v1.assign(mg, mg * coefficients["rho"], - use_locking=self._use_locking) - with tf.control_dependencies([mg_t]): - mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values) - mg_slice = tf.gather(mg_t, indices) - denom_slice = rms_slice - tf.square(mg_slice) - var_update = self._resource_scatter_add( - var, indices, coefficients["neg_lr_t"] * grad / ( - tf.sqrt(denom_slice) + coefficients["epsilon"])) - if self.centered: - return tf.group(*[var_update, rms_t, mg_t]) - return tf.group(*[var_update, rms_t]) - - def set_weights(self, weights): - params = self.weights - # Override set_weights for backward compatibility of Keras V1 optimizer - # since it does not include iteration at head of the weight list. Set - # iteration to 0. - if len(params) == len(weights) + 1: - weights = [np.array(0)] + weights - super().set_weights(weights) - - def get_config(self): - config = super().get_config() - config.update({ - "learning_rate": self._serialize_hyperparameter("learning_rate"), - "decay": self._initial_decay, - "rho": self._serialize_hyperparameter("rho"), - "momentum": self._serialize_hyperparameter("momentum"), - "epsilon": self.epsilon, - "centered": self.centered, - }) - return config - - -RMSProp = RMSprop diff --git a/keras/optimizers/optimizer_v2/rmsprop_test.py b/keras/optimizers/optimizer_v2/rmsprop_test.py deleted file mode 100644 index 6175520576d5..000000000000 --- a/keras/optimizers/optimizer_v2/rmsprop_test.py +++ /dev/null @@ -1,589 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
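The docstring example in the file above (var1 going from 10.0 to 9.683772 in one step) follows directly from the non-momentum dense update implemented in _resource_apply_dense: rms_t = rho * rms + (1 - rho) * g**2, then var -= lr * g / (sqrt(rms_t) + epsilon). A NumPy check using the documented defaults rho=0.9, epsilon=1e-7:

import numpy as np

# One RMSprop step on the docstring example: loss = var ** 2 / 2, so grad = var.
lr, rho, eps = 0.1, 0.9, 1e-7
var, rms = 10.0, 0.0
grad = var

rms = rho * rms + (1 - rho) * grad ** 2   # 0.1 * 100 = 10.0
var -= lr * grad / (np.sqrt(rms) + eps)   # 10 - 1 / sqrt(10)
print(round(var, 6))                      # 9.683772, as documented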
-# ============================================================================== -"""Tests for rmsprop.""" - -import tensorflow.compat.v2 as tf - -import copy -import itertools -import math - -from absl.testing import parameterized -import numpy as np -from tensorflow.python.framework import test_util as tf_test_utils -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -from keras.optimizers.schedules import learning_rate_schedule -from keras.optimizers.optimizer_v2 import rmsprop - -_DATA_TYPES = [ - tf.half, tf.float32, tf.float64, tf.complex64, - tf.complex128 -] - -_TEST_PARAM_VALUES = [ - # learning_rate, rho, momentum, epsilon, centered - [0.05, 0.9, 0.0, 1e-3, True], - [0.05, 0.9, 0.0, 1e-3, False], - [0.1, 0.9, 0.0, 1e-3, True], - [0.01, 0.9, 0.0, 1e-5, True], - [0.01, 0.9, 0.9, 1e-5, True], -] - -_TESTPARAMS = [ - [data_type] + values - for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES) -] - - -class RMSpropOptimizerTest(tf.test.TestCase, parameterized.TestCase): - - def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum, - epsilon, centered): - rms_t = rms * rho + (1 - rho) * g * g - if centered: - mg_t = mg * rho + (1 - rho) * g - denom_t = rms_t - mg_t * mg_t - else: - mg_t = mg - denom_t = rms_t - if momentum > 0.: - mom_t = momentum * mom + lr * g / (np.sqrt(denom_t + epsilon)) - var_t = var - mom_t - else: - mom_t = mom - var_t = var - lr * g / (np.sqrt(denom_t) + epsilon) - return var_t, mg_t, rms_t, mom_t - - def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom, - lr, rho, momentum, epsilon, centered): - mg_t = copy.deepcopy(mg) - rms_t = copy.deepcopy(rms) - mom_t = copy.deepcopy(mom) - var_t = copy.deepcopy(var) - for i in range(len(gindexs)): - gindex = gindexs[i] - gvalue = gvalues[i] - rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue - if centered: - mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue - denom_t = rms_t[gindex] - mg_t[gindex] * mg_t[gindex] - else: - denom_t = rms_t[gindex] - if momentum > 0.: - mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t + - epsilon) - var_t[gindex] = var[gindex] - mom_t[gindex] - else: - mom_t[gindex] = mom[gindex] - var_t[gindex] = var[gindex] - lr * gvalue / (np.sqrt(denom_t) + epsilon) - return var_t, mg_t, rms_t, mom_t - - def testDense(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS: - with tf.compat.v1.get_default_graph().as_default(), test_utils.use_gpu(): - # Initialize variables for numpy implementation. 
- var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np, dtype=dtype) - var1 = tf.Variable(var1_np, dtype=dtype) - grads0 = tf.constant(grads0_np, dtype=dtype) - grads1 = tf.constant(grads1_np, dtype=dtype) - opt = rmsprop.RMSprop( - learning_rate=learning_rate, - rho=rho, - momentum=momentum, - epsilon=epsilon, - centered=centered) - - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - if centered: - mg0 = opt.get_slot(var0, "mg") - mg1 = opt.get_slot(var1, "mg") - else: - mg0 = None - mg1 = None - - if momentum > 0.: - mom0 = opt.get_slot(var0, "momentum") - mom1 = opt.get_slot(var1, "momentum") - else: - mom0 = None - mom1 = None - - rms0 = opt.get_slot(var0, "rms") - self.assertIsNotNone(rms0) - rms1 = opt.get_slot(var1, "rms") - self.assertIsNotNone(rms1) - - mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 3 steps of RMSprop - for _ in range(1, 4): - self.evaluate(update) - - var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( - var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate, rho, - momentum, epsilon, centered) - var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy( - var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate, rho, - momentum, epsilon, centered) - - # Validate updated params - if centered: - self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0)) - self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1)) - if momentum > 0.: - self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0)) - self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1)) - self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) - self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testDenseWithLearningRateDecay(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
- with tf.Graph().as_default(): - var0_np = np.array([1.0, 2.0]) - grads0_np = np.array([0.1, 0.2]) - var1_np = np.array([3.0, 4.0]) - grads1_np = np.array([0.01, 0.2]) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - learning_rate = 0.01 - rho = 0.9 - momentum = 0.0 - epsilon = 1e-7 - centered = False - decay = 0.5 - opt = rmsprop.RMSprop( - learning_rate=learning_rate, - rho=rho, - momentum=momentum, - epsilon=epsilon, - centered=centered, - decay=decay) - - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - rms0 = opt.get_slot(var0, "rms") - self.assertIsNotNone(rms0) - rms1 = opt.get_slot(var1, "rms") - self.assertIsNotNone(rms1) - if momentum > 0.: - mom0 = opt.get_slot(var0, "momentum") - mom1 = opt.get_slot(var1, "momentum") - else: - mom0 = None - mom1 = None - - mg0_np = np.array([0.0, 0.0]) - mg1_np = np.array([0.0, 0.0]) - rms0_np = np.array([0.0, 0.0]) - rms1_np = np.array([0.0, 0.0]) - mom0_np = np.array([0.0, 0.0]) - mom1_np = np.array([0.0, 0.0]) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 2 steps of RMSprop - for t in range(2): - self.evaluate(update) - - lr = learning_rate / (1 + decay * t) - var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( - var0_np, grads0_np, mg0_np, rms0_np, mom0_np, lr, rho, momentum, - epsilon, centered) - var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy( - var1_np, grads1_np, mg1_np, rms1_np, mom1_np, lr, rho, momentum, - epsilon, centered) - - # Validate updated params - self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) - self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) - if momentum > 0.: - self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0)) - self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1)) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testDenseWithLearningRateInverseTimeDecay(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode.
- with tf.Graph().as_default(): - var0_np = np.array([1.0, 2.0]) - grads0_np = np.array([0.1, 0.2]) - var1_np = np.array([3.0, 4.0]) - grads1_np = np.array([0.01, 0.2]) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0 = tf.constant(grads0_np) - grads1 = tf.constant(grads1_np) - learning_rate = 0.01 - rho = 0.9 - momentum = 0.0 - epsilon = 1e-7 - centered = False - decay = 0.5 - lr_schedule = learning_rate_schedule.InverseTimeDecay( - learning_rate, decay_steps=1.0, decay_rate=decay) - opt = rmsprop.RMSprop( - learning_rate=lr_schedule, - rho=rho, - momentum=momentum, - epsilon=epsilon, - centered=centered) - - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - rms0 = opt.get_slot(var0, "rms") - self.assertIsNotNone(rms0) - rms1 = opt.get_slot(var1, "rms") - self.assertIsNotNone(rms1) - if momentum > 0.: - mom0 = opt.get_slot(var0, "momentum") - mom1 = opt.get_slot(var1, "momentum") - else: - mom0 = None - mom1 = None - - mg0_np = np.array([0.0, 0.0]) - mg1_np = np.array([0.0, 0.0]) - rms0_np = np.array([0.0, 0.0]) - rms1_np = np.array([0.0, 0.0]) - mom0_np = np.array([0.0, 0.0]) - mom1_np = np.array([0.0, 0.0]) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 2 steps of RMSprop - for t in range(2): - self.evaluate(update) - - lr = learning_rate / (1 + decay * t) - var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( - var0_np, grads0_np, mg0_np, rms0_np, mom0_np, lr, rho, momentum, - epsilon, centered) - var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy( - var1_np, grads1_np, mg1_np, rms1_np, mom1_np, lr, rho, momentum, - epsilon, centered) - - # Validate updated params - self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) - self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) - if momentum > 0.: - self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0)) - self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1)) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - def testMinimizeSparseResourceVariable(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - - def loss(): - pred = tf.matmul(tf.compat.v1.nn.embedding_lookup([var0], [0]), x) # pylint: disable=cell-var-from-loop - return pred * pred - - sgd_op = rmsprop.RMSprop( - learning_rate=1.0, rho=0.0, momentum=0.0, epsilon=0.0, - centered=False).minimize( - loss, var_list=[var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0)) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[0., 1.]], - self.evaluate(var0), - atol=0.01) - - def testMinimizeSparseResourceVariableCentered(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode.
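Both decay tests above check the optimizer against the same closed form: with InverseTimeDecay (or the legacy `decay` kwarg) the effective rate is lr_t = learning_rate / (1 + decay_rate * t). A quick check with the tests' constants:

# Inverse-time decay schedule used by the two tests above.
learning_rate, decay_rate = 0.01, 0.5
for t in range(2):
    lr_t = learning_rate / (1 + decay_rate * t)
    print(t, lr_t)   # t=0 -> 0.01, t=1 -> 0.01 / 1.5 = 0.006667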
- with tf.Graph().as_default(): - for dtype in _DATA_TYPES: - var0 = tf.Variable([[1.0, 2.0]], dtype=dtype) - x = tf.constant([[4.0], [5.0]], dtype=dtype) - - def loss(): - pred = tf.matmul(tf.compat.v1.nn.embedding_lookup([var0], [0]), x) # pylint: disable=cell-var-from-loop - return pred * pred - - # loss = lambda: pred * pred # pylint: disable=cell-var-from-loop - sgd_op = rmsprop.RMSprop( - learning_rate=1.0, rho=0.0, momentum=0.0, epsilon=1.0, - centered=True).minimize( - loss, var_list=[var0]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - # Fetch params to validate initial values - self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0)) - # Run 1 step of sgd - self.evaluate(sgd_op) - # Validate updated params - self.assertAllCloseAccordingToType([[-111, -138]], - self.evaluate(var0), - atol=0.01) - - def testSparse(self): - # TODO(tanzheny, omalleyt): Fix test in eager mode. - for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS: - with tf.compat.v1.get_default_graph().as_default(), test_utils.use_gpu(): - # Initialize variables for numpy implementation. - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype) - - var0 = tf.Variable(var0_np) - var1 = tf.Variable(var1_np) - grads0_np_indices = np.array([0], dtype=np.int32) - grads0 = tf.IndexedSlices( - tf.constant(grads0_np), - tf.constant(grads0_np_indices), tf.constant([1])) - grads1_np_indices = np.array([1], dtype=np.int32) - grads1 = tf.IndexedSlices( - tf.constant(grads1_np), - tf.constant(grads1_np_indices), tf.constant([1])) - opt = rmsprop.RMSprop( - learning_rate=learning_rate, - rho=rho, - momentum=momentum, - epsilon=epsilon, - centered=centered) - update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - self.evaluate(tf.compat.v1.global_variables_initializer()) - - if centered: - mg0 = opt.get_slot(var0, "mg") - self.assertEqual(mg0 is not None, centered) - mg1 = opt.get_slot(var1, "mg") - self.assertEqual(mg1 is not None, centered) - else: - mg0 = None - mg1 = None - rms0 = opt.get_slot(var0, "rms") - self.assertIsNotNone(rms0) - rms1 = opt.get_slot(var1, "rms") - self.assertIsNotNone(rms1) - if momentum > 0.: - mom0 = opt.get_slot(var0, "momentum") - mom1 = opt.get_slot(var1, "momentum") - else: - mom0 = None - mom1 = None - - mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 3 steps of RMSprop - for _ in range(1, 4): - self.evaluate(update) - - var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy( - var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np, - learning_rate, rho, momentum, epsilon, centered) - var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy( - var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np, - learning_rate, rho, momentum, epsilon, centered) - - # Validate updated params - if centered: - self.assertAllCloseAccordingToType(mg0_np, 
self.evaluate(mg0)) - self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1)) - self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) - self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) - if momentum > 0.: - self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0)) - self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1)) - self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) - self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testCallableParams(self): - for dtype in _DATA_TYPES: - var0 = tf.Variable([1.0, 2.0], dtype=dtype) - var1 = tf.Variable([3.0, 4.0], dtype=dtype) - grads0 = tf.constant([0.1, 0.1], dtype=dtype) - grads1 = tf.constant([0.01, 0.01], dtype=dtype) - - learning_rate = lambda: 2.0 - rho = lambda: 0.9 - momentum = lambda: 0.0 - epsilon = 1.0 - opt = rmsprop.RMSprop(learning_rate, rho, momentum, epsilon) - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - # Step 1: the rms accumulators were 1. So we should see a normal - # update: v -= grad * learning_rate - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - # Check the parameters. - self.assertAllCloseAccordingToType( - np.array([ - 1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)), - 2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) - ]), self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([ - 3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)), - 4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) - ]), self.evaluate(var1)) - # Step 2: the root mean square accumulators contain the previous update. - opt.apply_gradients(zip([grads0, grads1], [var0, var1])) - # Check the parameters. - self.assertAllCloseAccordingToType( - np.array([ - 1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) - - (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0)), - 2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) - - (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0)) - ]), self.evaluate(var0)) - self.assertAllCloseAccordingToType( - np.array([ - 3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) - - (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0)), - 4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) - - (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0)) - ]), self.evaluate(var1)) - - def testConstructRMSpropWithLR(self): - opt = rmsprop.RMSprop(lr=1.0) - opt_2 = rmsprop.RMSprop(learning_rate=0.1, lr=1.0) - opt_3 = rmsprop.RMSprop(learning_rate=0.1) - self.assertIsInstance(opt.lr, tf.Variable) - self.assertIsInstance(opt_2.lr, tf.Variable) - self.assertIsInstance(opt_3.lr, tf.Variable) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(opt.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_2.lr), (1.0)) - self.assertAllClose(self.evaluate(opt_3.lr), (0.1)) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testSlotsUniqueEager(self): - v1 = tf.Variable(1.) - v2 = tf.Variable(1.) - - opt = rmsprop.RMSprop(1., momentum=0., centered=False) - opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) - # There should be iteration, and one unique slot variable for v1 and v2.
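The hand-written constants in `testCallableParams` above fall directly out of the update rule; a quick standalone check of the step-1 and step-2 expectations for `var0` (plain Python, approximate values in the comments):

```python
import math

lr, rho, eps, g = 2.0, 0.9, 1.0, 0.1

# Step 1: the accumulator picks up (1 - rho) * g**2.
rms = rho * 0.0 + (1 - rho) * g * g        # 0.001
step1 = lr * g / math.sqrt(rms + eps)      # ~0.1999
print(1.0 - step1)                         # ~0.8001, the first expected entry

# Step 2: the accumulator discounts its previous value.
rms = rho * rms + (1 - rho) * g * g        # 0.001 * 0.9 + 0.001 = 0.0019
step2 = lr * g / math.sqrt(rms + eps)
print(1.0 - step1 - step2)                 # the second expected entry
```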
- self.assertLen(set({id(v) for v in opt.variables()}), 3) - self.assertEqual( - self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations)) - - opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=False) - opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) - # There should be iteration, and two unique slot variables for v1 and v2. - self.assertLen(set({id(v) for v in opt.variables()}), 5) - self.assertEqual( - self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations)) - - opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=True) - opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) - # There should be iteration, and three unique slot variables for v1 and v2 - self.assertLen(set({id(v) for v in opt.variables()}), 7) - self.assertEqual( - self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations)) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testMomentumProperValue(self): - with self.assertRaisesRegex(ValueError, - r"`momentum` must be between \[0, 1\]. " - r"Received: momentum=2.5 \(of type \)."): - rmsprop.RMSprop(1., momentum=2.5, centered=False) - - -@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) -class SlotColocationTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters([True, False]) - @tf_test_utils.run_gpu_only - def testRunMinimizeOnGPUForCPUVariables(self, use_resource): - with tf.device("/device:CPU:0"): - if use_resource: - var0 = tf.Variable([1.0, 2.0], dtype=tf.float32) - var1 = tf.Variable([3.0, 4.0], dtype=tf.float32) - else: - var0 = tf.Variable([1.0, 2.0], dtype=tf.float32) - var1 = tf.Variable([3.0, 4.0], dtype=tf.float32) - - def loss(): - return 5 * var0 + 3 * var1 - - opt = rmsprop.RMSprop( - learning_rate=1.0, decay=0.9, momentum=0.5, epsilon=1.0) - - # Fetch params to validate initial values - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose([1.0, 2.0], self.evaluate(var0)) - self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - - # Run 1 step through optimizer on GPU. - # Slot variables are created the first time optimizer is used on some - # variable. This tests that slot variables will be colocated with the base - # variable. - with tf.device("/device:GPU:0"): - # Note that for eager execution, minimize expects a function instead of a - # Tensor. - opt_op = opt.minimize(loss, [var0, var1]) - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.evaluate(opt_op) - - # Validate updated params, All variables should have decreased. - self.assertTrue(all(v < 0.0 for v in self.evaluate(var0)), - msg="updated variables: %s" % self.evaluate(var0)) - self.assertTrue(all(v < 2.0 for v in self.evaluate(var1)), - msg="updated variables: %s" % self.evaluate(var1)) - -if __name__ == "__main__": - tf.test.main() diff --git a/keras/optimizers/optimizer_v2/utils.py b/keras/optimizers/optimizer_v2/utils.py deleted file mode 100644 index 52cee4124227..000000000000 --- a/keras/optimizers/optimizer_v2/utils.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Optimizer utilities.""" - -import tensorflow.compat.v2 as tf -from tensorflow.python.platform import tf_logging as logging - - -def all_reduce_sum_gradients(grads_and_vars): - """Returns all-reduced gradients aggregated via summation. - - Args: - grads_and_vars: List of (gradient, variable) pairs. - - Returns: - List of (gradient, variable) pairs where gradients have been all-reduced. - """ - grads_and_vars = list(grads_and_vars) - filtered_grads_and_vars = filter_empty_gradients(grads_and_vars) - if filtered_grads_and_vars: - if tf.__internal__.distribute.strategy_supports_no_merge_call(): - grads = [pair[0] for pair in filtered_grads_and_vars] - reduced = tf.distribute.get_replica_context().all_reduce( - tf.distribute.ReduceOp.SUM, grads) - else: - # TODO(b/183257003): Remove this branch - reduced = tf.distribute.get_replica_context().merge_call( - _all_reduce_sum_fn, args=(filtered_grads_and_vars,)) - else: - reduced = [] - # Copy 'reduced' but add None gradients back in - reduced_with_nones = [] - reduced_pos = 0 - for g, v in grads_and_vars: - if g is None: - reduced_with_nones.append((None, v)) - else: - reduced_with_nones.append((reduced[reduced_pos], v)) - reduced_pos += 1 - assert reduced_pos == len(reduced), "Failed to add all gradients" - return reduced_with_nones - - -def filter_empty_gradients(grads_and_vars): - """Filter out `(grad, var)` pairs that have a gradient equal to `None`.""" - grads_and_vars = tuple(grads_and_vars) - if not grads_and_vars: - return grads_and_vars - - filtered = [] - vars_with_empty_grads = [] - for grad, var in grads_and_vars: - if grad is None: - vars_with_empty_grads.append(var) - else: - filtered.append((grad, var)) - filtered = tuple(filtered) - - if not filtered: - variable = ([v.name for _, v in grads_and_vars],) - raise ValueError(f"No gradients provided for any variable: {variable}. " - f"Provided `grads_and_vars` is {grads_and_vars}.") - if vars_with_empty_grads: - logging.warning( - ("Gradients do not exist for variables %s when minimizing the loss. " - "If you're using `model.compile()`, did you forget to provide a `loss` " - "argument?"), - ([v.name for v in vars_with_empty_grads])) - return filtered - - -def make_gradient_clipnorm_fn(clipnorm): - """Creates a gradient transformation function for clipping by norm.""" - if clipnorm is None: - return lambda grads_and_vars: grads_and_vars - - def gradient_clipnorm_fn(grads_and_vars): - - if isinstance(tf.distribute.get_strategy(), - (tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - raise ValueError( - "`clipnorm` is not supported with `CentralStorageStrategy`. 
" - f"The strategy used is {tf.distribute.get_strategy()}.") - - clipped_grads_and_vars = [ - (tf.clip_by_norm(g, clipnorm), v) for g, v in grads_and_vars - ] - return clipped_grads_and_vars - - return gradient_clipnorm_fn - - -def make_global_gradient_clipnorm_fn(clipnorm): - """Creates a gradient transformation function for clipping by norm.""" - if clipnorm is None: - return lambda grads_and_vars: grads_and_vars - - def gradient_clipnorm_fn(grads_and_vars): - - if isinstance(tf.distribute.get_strategy(), - (tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - raise ValueError( - "`global_clipnorm` is not supported with `CenteralStorageStrategy`. " - f"The strategy used is {tf.distribute.get_strategy()}.") - - grads, variables = zip(*grads_and_vars) - clipped_grads, _ = tf.clip_by_global_norm(grads, clipnorm) - clipped_grads_and_vars = list(zip(clipped_grads, variables)) - return clipped_grads_and_vars - - return gradient_clipnorm_fn - - -def make_gradient_clipvalue_fn(clipvalue): - """Creates a gradient transformation function for clipping by value.""" - if clipvalue is None: - return lambda grads_and_vars: grads_and_vars - - def gradient_clipvalue_fn(grads_and_vars): - - if isinstance(tf.distribute.get_strategy(), - (tf.distribute.experimental.CentralStorageStrategy, - tf.compat.v1.distribute.experimental.CentralStorageStrategy)): - raise ValueError( - "`clipvalue` is not supported with `CenteralStorageStrategy`. " - f"The strategy used is {tf.distribute.get_strategy()}.") - - clipped_grads_and_vars = [(tf.clip_by_value(g, -clipvalue, - clipvalue), v) - for g, v in grads_and_vars] - return clipped_grads_and_vars - - return gradient_clipvalue_fn - - -def _all_reduce_sum_fn(distribution, grads_and_vars): - return distribution.extended.batch_reduce_to(tf.distribute.ReduceOp.SUM, - grads_and_vars) diff --git a/keras/optimizers/optimizers_test.py b/keras/optimizers/optimizers_test.py deleted file mode 100644 index ee08cb7eded3..000000000000 --- a/keras/optimizers/optimizers_test.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for Keras optimizers.""" - -import tensorflow.compat.v2 as tf - -import gc -import weakref - -import numpy as np - -import keras -from keras.optimizers import optimizer_v1 -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -from keras.utils import np_utils -from tensorflow.python.training.adam import AdamOptimizer -from tensorflow.python.training.experimental.loss_scale_optimizer import MixedPrecisionLossScaleOptimizer - - -def _get_model(input_dim, num_hidden, output_dim): - model = keras.models.Sequential() - model.add(keras.layers.Dense(num_hidden, - activation='relu', - input_shape=(input_dim,))) - model.add(keras.layers.Dense(output_dim, activation='softmax')) - return model - - -@test_combinations.run_all_keras_modes -class KerasOptimizersTest(test_combinations.TestCase): - - def _test_optimizer(self, optimizer, target=0.75): - if tf.executing_eagerly(): - self.skipTest( - 'v1 optimizer does not run in eager mode') - np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=1000, test_samples=200, input_shape=(10,), num_classes=2) - y_train = np_utils.to_categorical(y_train) - model = _get_model(x_train.shape[1], 20, y_train.shape[1]) - model.compile( - loss='categorical_crossentropy', - optimizer=optimizer, - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - np.testing.assert_equal( - keras.backend.get_value(model.optimizer.iterations), 0) - history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0) - np.testing.assert_equal( - keras.backend.get_value(model.optimizer.iterations), - 126) # 63 steps per epoch - self.assertGreaterEqual(history.history['acc'][-1], target) - config = keras.optimizers.serialize(optimizer) - optim = keras.optimizers.deserialize(config) - new_config = keras.optimizers.serialize(optim) - new_config['class_name'] = new_config['class_name'].lower() - new_config['config'].pop('name', None) - if 'amsgrad' not in config['config']: - new_config['config'].pop('amsgrad', None) - if 'decay' in new_config['config'] and 'schedule_decay' in config['config']: - new_config['config']['schedule_decay'] = new_config['config'].pop('decay') - if 'momentum' not in config['config']: - new_config['config'].pop('momentum', None) - if 'centered' not in config['config']: - new_config['config'].pop('centered', None) - self.assertDictEqual(config, new_config) - - # Test constraints. - model = keras.models.Sequential() - dense = keras.layers.Dense( - 10, - input_shape=(x_train.shape[1],), - kernel_constraint=lambda x: 0. * x + 1., - bias_constraint=lambda x: 0. 
* x + 2., - activation='relu') - model.add(dense) - model.add(keras.layers.Dense(y_train.shape[1], activation='softmax')) - model.compile( - loss='categorical_crossentropy', - optimizer=optimizer, - metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - np.testing.assert_equal( - keras.backend.get_value(model.optimizer.iterations), - 126) # Using same optimizer from before - model.train_on_batch(x_train[:10], y_train[:10]) - np.testing.assert_equal( - keras.backend.get_value(model.optimizer.iterations), 127) - kernel, bias = dense.get_weights() - np.testing.assert_allclose(kernel, 1., atol=1e-3) - np.testing.assert_allclose(bias, 2., atol=1e-3) - - def test_sgd(self): - with self.cached_session(): - self._test_optimizer(optimizer_v1.SGD()) - - def test_momentum(self): - with self.cached_session(): - self._test_optimizer( - optimizer_v1.SGD(lr=0.01, momentum=0.9, nesterov=True)) - - def test_rmsprop(self): - with self.cached_session(): - self._test_optimizer(optimizer_v1.RMSprop()) - self._test_optimizer(optimizer_v1.RMSprop(decay=1e-3)) - - def test_adagrad(self): - with self.cached_session(): - self._test_optimizer(optimizer_v1.Adagrad()) - self._test_optimizer(optimizer_v1.Adagrad(decay=1e-3)) - - def test_adadelta(self): - with self.cached_session(): - self._test_optimizer(optimizer_v1.Adadelta(), target=0.6) - # Accuracy seems dependent on the initialization. Even adding - # tf.compat.v1.Print nodes in the graph seemed to affect the - # initialization seed, and hence the accuracy. - self._test_optimizer(optimizer_v1.Adadelta(decay=1e-3), target=0.4) - - def test_adam(self): - with self.cached_session(): - self._test_optimizer(optimizer_v1.Adam()) - # Accuracy seems dependent on the seed initialization. - # TODO(b/121051441): fix test flakiness. 
- self._test_optimizer(optimizer_v1.Adam(decay=1e-3), target=0.73) - self._test_optimizer(optimizer_v1.Adam(amsgrad=True)) - - def test_adamax(self): - with self.cached_session(): - self._test_optimizer(optimizer_v1.Adamax()) - self._test_optimizer(optimizer_v1.Adamax(decay=1e-3)) - - def test_nadam(self): - with self.cached_session(): - self._test_optimizer(optimizer_v1.Nadam()) - - def test_clipnorm(self): - with self.cached_session(): - self._test_optimizer( - optimizer_v1.SGD(lr=0.01, momentum=0.9, clipnorm=0.5)) - - def test_clipvalue(self): - with self.cached_session(): - self._test_optimizer( - optimizer_v1.SGD(lr=0.01, momentum=0.9, clipvalue=0.5)) - - def test_tf_optimizer(self): - if tf.executing_eagerly(): - self.skipTest( - 'v1 optimizer does not run in eager mode') - optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) - model = keras.models.Sequential() - model.add(keras.layers.Dense( - 2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1))) - # This is possible - model.compile( - loss='mean_squared_error', - optimizer=optimizer, - run_eagerly=test_utils.should_run_eagerly()) - keras.backend.track_tf_optimizer(optimizer) - model.fit(np.random.random((5, 3)), - np.random.random((5, 2)), - epochs=1, - batch_size=5, - verbose=0) - # not supported - with self.assertRaises(NotImplementedError): - _ = optimizer.weights - with self.assertRaises(NotImplementedError): - optimizer.get_config() - with self.assertRaises(NotImplementedError): - optimizer.from_config(None) - - def test_optimizer_garbage_collection(self): - if tf.executing_eagerly(): - self.skipTest( - 'v1 optimizer does not run in eager mode') - graph = tf.Graph() - with graph.as_default(): - optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) - keras.backend.track_tf_optimizer(optimizer) - optimizer_weak = weakref.ref(optimizer) - graph_weak = weakref.ref(graph) - del graph, optimizer - gc.collect() - # Check that the weak references are dead now. 
- self.assertIs(graph_weak(), None) - self.assertIs(optimizer_weak(), None) - - def test_tf_optimizer_iterations(self): - if tf.executing_eagerly(): - self.skipTest( - 'v1 optimizer does not run in eager mode') - with self.cached_session(): - optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) - model = keras.models.Sequential() - model.add(keras.layers.Dense( - 2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1))) - model.compile( - loss='mean_squared_error', - optimizer=optimizer, - run_eagerly=test_utils.should_run_eagerly()) - keras.backend.track_tf_optimizer(optimizer) - self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 0) - - model.fit(np.random.random((55, 3)), - np.random.random((55, 2)), - epochs=1, - batch_size=5, - verbose=0) - self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 11) - - def test_negative_clipvalue_or_clipnorm(self): - with self.assertRaises(ValueError): - _ = optimizer_v1.SGD(lr=0.01, clipvalue=-0.5) - with self.assertRaises(ValueError): - _ = optimizer_v1.Adam(clipnorm=-2.0) - - def test_mixed_precision_loss_scale_optimizer(self): - if tf.executing_eagerly(): - self.skipTest('v1 optimizer does not run in eager mode') - optimizer = MixedPrecisionLossScaleOptimizer(AdamOptimizer(), 'dynamic') - model = keras.models.Sequential() - model.add( - keras.layers.Dense( - 2, input_shape=(3,), - kernel_constraint=keras.constraints.MaxNorm(1))) - model.compile( - loss='mean_squared_error', - optimizer=optimizer, - run_eagerly=test_utils.should_run_eagerly()) - model.fit( - np.random.random((5, 3)), - np.random.random((5, 2)), - epochs=1, - batch_size=5, - verbose=0) - - def test_deserialization_error(self): - with self.assertRaisesRegex(ValueError, 'Could not interpret optimizer'): - keras.optimizers.get(0) - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/optimizers/rmsprop.py b/keras/optimizers/rmsprop.py new file mode 100644 index 000000000000..c59a822ca55a --- /dev/null +++ b/keras/optimizers/rmsprop.py @@ -0,0 +1,218 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""RMSprop optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.experimental.RMSprop", + "keras.optimizers.RMSprop", + "keras.dtensor.experimental.optimizers.RMSprop", + v1=[], +) +class RMSprop(optimizer.Optimizer): + r"""Optimizer that implements the RMSprop algorithm. + + The gist of RMSprop is to: + + - Maintain a moving (discounted) average of the square of gradients + - Divide the gradient by the root of this average + + This implementation of RMSprop uses plain momentum, not Nesterov momentum. 
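In update-rule terms, the dense branch of the `update_step` method added below computes the following (Python-style pseudocode paraphrasing the code in this file):

```python
# rho, momentum, epsilon, centered are the constructor arguments.
velocity = rho * velocity + (1 - rho) * grad ** 2
if centered:
    average_grad = rho * average_grad + (1 - rho) * grad
    denominator = velocity - average_grad ** 2 + epsilon
else:
    denominator = velocity + epsilon
increment = lr * grad / sqrt(denominator)
if momentum > 0:
    momentum_buffer = momentum * momentum_buffer + increment
    variable -= momentum_buffer
else:
    variable -= increment
```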
+ + The centered version additionally maintains a moving average of the + gradients, and uses that average to estimate the variance. + + Args: + learning_rate: Initial value for the learning rate: + either a floating point value, + or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. + Defaults to 0.001. + rho: float, defaults to 0.9. Discounting factor for the old gradients. + momentum: float, defaults to 0.0. If not 0.0, the optimizer tracks the + momentum value, with a decay rate equal to `1 - momentum`. + epsilon: A small constant for numerical stability, added to the + accumulated squared-gradient term inside the square root. + Defaults to `1e-7`. + centered: Boolean. If `True`, gradients are normalized by the estimated + variance of the gradient; if `False`, by the uncentered second moment. + Setting this to `True` may help with training, but is slightly more + expensive in terms of computation and memory. Defaults to `False`. + {{base_optimizer_keyword_args}} + + Usage: + + >>> opt = tf.keras.optimizers.RMSprop(learning_rate=0.1) + >>> var1 = tf.Variable(10.0) + >>> loss = lambda: (var1 ** 2) / 2.0 # d(loss) / d(var1) = var1 + >>> opt.minimize(loss, [var1]) + >>> var1.numpy() + 9.683772 + + Reference: + - [Hinton, 2012](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) # noqa: E501 + """ + + def __init__( + self, + learning_rate=0.001, + rho=0.9, + momentum=0.0, + epsilon=1e-7, + centered=False, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=100, + jit_compile=True, + name="RMSprop", + **kwargs + ): + super().__init__( + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + name=name, + **kwargs + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.rho = rho + self.momentum = momentum + self.epsilon = epsilon + self.centered = centered + + def build(self, var_list): + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self._built = True + + self._velocities = [] + for var in var_list: + self._velocities.append( + self.add_variable_from_reference(var, "velocity") + ) + + self._momentums = [] + if self.momentum > 0: + for var in var_list: + self._momentums.append( + self.add_variable_from_reference(var, "momentum") + ) + + self._average_gradients = [] + if self.centered: + for var in var_list: + self._average_gradients.append( + self.add_variable_from_reference(var, "average_gradient") + ) + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + + var_key = self._var_key(variable) + velocity = self._velocities[self._index_dict[var_key]] + momentum = None + if self.momentum > 0: + momentum = self._momentums[self._index_dict[var_key]] + average_grad = None + if self.centered: + average_grad = self._average_gradients[self._index_dict[var_key]] + + rho = self.rho + + if isinstance(gradient, tf.IndexedSlices): + # Sparse gradients.
+ velocity.assign(rho * velocity) + velocity.scatter_add( + tf.IndexedSlices( + tf.square(gradient.values) * (1 - rho), gradient.indices + ) + ) + if self.centered: + average_grad.assign(rho * average_grad) + average_grad.scatter_add( + tf.IndexedSlices( + gradient.values * (1 - rho), gradient.indices + ) + ) + denominator = velocity - tf.square(average_grad) + self.epsilon + else: + denominator = velocity + self.epsilon + denominator_slices = tf.gather(denominator, gradient.indices) + increment = tf.IndexedSlices( + lr * gradient.values * tf.math.rsqrt(denominator_slices), + gradient.indices, + ) + + if self.momentum > 0: + momentum.assign(self.momentum * momentum) + momentum.scatter_add(increment) + variable.assign_add(-momentum) + else: + variable.scatter_add(-increment) + else: + # Dense gradients. + velocity.assign(rho * velocity + (1 - rho) * tf.square(gradient)) + if self.centered: + average_grad.assign(rho * average_grad + (1 - rho) * gradient) + denominator = velocity - tf.square(average_grad) + self.epsilon + else: + denominator = velocity + self.epsilon + increment = lr * gradient * tf.math.rsqrt(denominator) + if self.momentum > 0: + momentum.assign(self.momentum * momentum + increment) + variable.assign_add(-momentum) + else: + variable.assign_add(-increment) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "rho": self.rho, + "momentum": self.momentum, + "epsilon": self.epsilon, + "centered": self.centered, + } + ) + return config + + +RMSprop.__doc__ = RMSprop.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/schedules/BUILD b/keras/optimizers/schedules/BUILD index c0a313e338c5..a4854299cf40 100644 --- a/keras/optimizers/schedules/BUILD +++ b/keras/optimizers/schedules/BUILD @@ -1,14 +1,16 @@ # Description: # Contains the learning rate schedule API, +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "cuda_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/python:__pkg__", "//third_party/tensorflow/python/distribute:__pkg__", - "//third_party/tensorflow/python/training/tracking:__pkg__", + "//third_party/tensorflow/python/trackable:__pkg__", ], licenses = ["notice"], ) @@ -35,7 +37,7 @@ cuda_py_test( "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", ], ) diff --git a/keras/optimizers/schedules/__init__.py b/keras/optimizers/schedules/__init__.py index e5ffd337974e..cfa6e7a47ff2 100644 --- a/keras/optimizers/schedules/__init__.py +++ b/keras/optimizers/schedules/__init__.py @@ -14,7 +14,9 @@ # ============================================================================== """Learning rate schedule API.""" -from keras.optimizers.schedules.learning_rate_schedules import ExponentialDecay -from keras.optimizers.schedules.learning_rate_schedules import InverseTimeDecay -from keras.optimizers.schedules.learning_rate_schedules import PiecewiseConstantDecay -from keras.optimizers.schedules.learning_rate_schedules import PolynomialDecay +from keras.optimizers.schedules.learning_rate_schedule import ExponentialDecay +from keras.optimizers.schedules.learning_rate_schedule import InverseTimeDecay +from 
keras.optimizers.schedules.learning_rate_schedule import ( + PiecewiseConstantDecay, +) +from keras.optimizers.schedules.learning_rate_schedule import PolynomialDecay diff --git a/keras/optimizers/schedules/learning_rate_schedule.py b/keras/optimizers/schedules/learning_rate_schedule.py index 0aa8765dbb2c..c017a7d6d5f4 100644 --- a/keras/optimizers/schedules/learning_rate_schedule.py +++ b/keras/optimizers/schedules/learning_rate_schedule.py @@ -14,1071 +14,1246 @@ # ============================================================================== """Various learning rate schedule functions.""" -import tensorflow.compat.v2 as tf - import abc import math + +import tensorflow.compat.v2 as tf + from keras import backend -from keras.utils import generic_utils +from keras.saving import serialization_lib +from keras.saving.legacy import serialization as legacy_serialization + +# isort: off from tensorflow.python.util.tf_export import keras_export @keras_export("keras.optimizers.schedules.LearningRateSchedule") class LearningRateSchedule: - """The learning rate schedule base class. + """The learning rate schedule base class. - You can use a learning rate schedule to modulate how the learning rate - of your optimizer changes over time. + You can use a learning rate schedule to modulate how the learning rate + of your optimizer changes over time. - Several built-in learning rate schedules are available, such as - `tf.keras.optimizers.schedules.ExponentialDecay` or - `tf.keras.optimizers.schedules.PiecewiseConstantDecay`: + Several built-in learning rate schedules are available, such as + `tf.keras.optimizers.schedules.ExponentialDecay` or + `tf.keras.optimizers.schedules.PiecewiseConstantDecay`: - ```python - lr_schedule = keras.optimizers.schedules.ExponentialDecay( - initial_learning_rate=1e-2, - decay_steps=10000, - decay_rate=0.9) - optimizer = keras.optimizers.SGD(learning_rate=lr_schedule) - ``` + ```python + lr_schedule = keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=1e-2, + decay_steps=10000, + decay_rate=0.9) + optimizer = keras.optimizers.SGD(learning_rate=lr_schedule) + ``` - A `LearningRateSchedule` instance can be passed in as the `learning_rate` - argument of any optimizer. + A `LearningRateSchedule` instance can be passed in as the `learning_rate` + argument of any optimizer. - To implement your own schedule object, you should implement the `__call__` - method, which takes a `step` argument (scalar integer tensor, the - current training step count). - Like for any other Keras object, you can also optionally - make your object serializable by implementing the `get_config` - and `from_config` methods. + To implement your own schedule object, you should implement the `__call__` + method, which takes a `step` argument (scalar integer tensor, the + current training step count). + Like for any other Keras object, you can also optionally + make your object serializable by implementing the `get_config` + and `from_config` methods. 
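Building on the docstring example that follows: to make a custom schedule serializable, mirror the constructor arguments in `get_config`, since the inherited `from_config` simply calls `cls(**config)`. A minimal sketch:

```python
import tensorflow as tf

class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate):
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        # `step` is a scalar integer tensor; cast it before dividing.
        return self.initial_learning_rate / (tf.cast(step, tf.float32) + 1.0)

    def get_config(self):
        # Returning the constructor kwargs is all `from_config` needs.
        return {"initial_learning_rate": self.initial_learning_rate}
```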
- Example: + Example: - ```python - class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): + ```python + class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): - def __init__(self, initial_learning_rate): - self.initial_learning_rate = initial_learning_rate + def __init__(self, initial_learning_rate): + self.initial_learning_rate = initial_learning_rate - def __call__(self, step): - return self.initial_learning_rate / (step + 1) + def __call__(self, step): + return self.initial_learning_rate / (step + 1) - optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1)) - ``` - """ + optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1)) + ``` + """ - @abc.abstractmethod - def __call__(self, step): - raise NotImplementedError("Learning rate schedule must override __call__") + @abc.abstractmethod + def __call__(self, step): + raise NotImplementedError( + f"Learning rate schedule '{self.__class__.__name__}' " + "must override `__call__(self, step)`." + ) - @abc.abstractmethod - def get_config(self): - raise NotImplementedError("Learning rate schedule must override get_config") + @abc.abstractmethod + def get_config(self): + raise NotImplementedError( + f"Learning rate schedule '{self.__class__.__name__}' " + "must override `get_config()` in order to be serializable." + ) - @classmethod - def from_config(cls, config): - """Instantiates a `LearningRateSchedule` from its config. + @classmethod + def from_config(cls, config): + """Instantiates a `LearningRateSchedule` from its config. - Args: - config: Output of `get_config()`. + Args: + config: Output of `get_config()`. - Returns: - A `LearningRateSchedule` instance. - """ - return cls(**config) + Returns: + A `LearningRateSchedule` instance. + """ + return cls(**config) @keras_export("keras.optimizers.schedules.ExponentialDecay") class ExponentialDecay(LearningRateSchedule): - """A LearningRateSchedule that uses an exponential decay schedule. - - When training a model, it is often useful to lower the learning rate as - the training progresses. This schedule applies an exponential decay function - to an optimizer step, given a provided initial learning rate. - - The schedule is a 1-arg callable that produces a decayed learning - rate when passed the current optimizer step. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - It is computed as: - - ```python - def decayed_learning_rate(step): - return initial_learning_rate * decay_rate ^ (step / decay_steps) - ``` - - If the argument `staircase` is `True`, then `step / decay_steps` is - an integer division and the decayed learning rate follows a - staircase function. - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. - Example: When fitting a Keras model, decay every 100000 steps with a base - of 0.96: - - ```python - initial_learning_rate = 0.1 - lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( - initial_learning_rate, - decay_steps=100000, - decay_rate=0.96, - staircase=True) - - model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule), - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - model.fit(data, labels, epochs=5) - ``` - - The learning rate schedule is also serializable and deserializable using - `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. 
- - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as `initial_learning_rate`. - """ - - def __init__( - self, - initial_learning_rate, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies exponential decay to the learning rate. + """A LearningRateSchedule that uses an exponential decay schedule. + + When training a model, it is often useful to lower the learning rate as + the training progresses. This schedule applies an exponential decay function + to an optimizer step, given a provided initial learning rate. + + The schedule is a 1-arg callable that produces a decayed learning + rate when passed the current optimizer step. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + It is computed as: + + ```python + def decayed_learning_rate(step): + return initial_learning_rate * decay_rate ^ (step / decay_steps) + ``` + + If the argument `staircase` is `True`, then `step / decay_steps` is + an integer division and the decayed learning rate follows a + staircase function. + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. + Example: When fitting a Keras model, decay every 100000 steps with a base + of 0.96: + + ```python + initial_learning_rate = 0.1 + lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate, + decay_steps=100000, + decay_rate=0.96, + staircase=True) - Args: - initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a - Python number. The initial learning rate. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. - Must be positive. See the decay computation above. - decay_rate: A scalar `float32` or `float64` `Tensor` or a - Python number. The decay rate. - staircase: Boolean. If `True` decay the learning rate at discrete - intervals - name: String. Optional name of the operation. Defaults to - 'ExponentialDecay'. + model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + + model.fit(data, labels, epochs=5) + ``` + + The learning rate schedule is also serializable and deserializable using + `tf.keras.optimizers.schedules.serialize` and + `tf.keras.optimizers.schedules.deserialize`. + + Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as `initial_learning_rate`. 
""" - super().__init__() - self.initial_learning_rate = initial_learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or "ExponentialDecay") as name: - initial_learning_rate = tf.convert_to_tensor( - self.initial_learning_rate, name="initial_learning_rate") - dtype = initial_learning_rate.dtype - decay_steps = tf.cast(self.decay_steps, dtype) - decay_rate = tf.cast(self.decay_rate, dtype) - - global_step_recomp = tf.cast(step, dtype) - p = global_step_recomp / decay_steps - if self.staircase: - p = tf.floor(p) - return tf.multiply( - initial_learning_rate, tf.pow(decay_rate, p), name=name) - - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_steps": self.decay_steps, - "decay_rate": self.decay_rate, - "staircase": self.staircase, - "name": self.name - } + + def __init__( + self, + initial_learning_rate, + decay_steps, + decay_rate, + staircase=False, + name=None, + ): + """Applies exponential decay to the learning rate. + + Args: + initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Must be positive. See the decay computation above. + decay_rate: A scalar `float32` or `float64` `Tensor` or a + Python number. The decay rate. + staircase: Boolean. If `True` decay the learning rate at discrete + intervals + name: String. Optional name of the operation. Defaults to + 'ExponentialDecay'. + """ + super().__init__() + self.initial_learning_rate = initial_learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "ExponentialDecay") as name: + initial_learning_rate = tf.convert_to_tensor( + self.initial_learning_rate, name="initial_learning_rate" + ) + dtype = initial_learning_rate.dtype + decay_steps = tf.cast(self.decay_steps, dtype) + decay_rate = tf.cast(self.decay_rate, dtype) + + global_step_recomp = tf.cast(step, dtype) + p = global_step_recomp / decay_steps + if self.staircase: + p = tf.floor(p) + return tf.multiply( + initial_learning_rate, tf.pow(decay_rate, p), name=name + ) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_steps": self.decay_steps, + "decay_rate": self.decay_rate, + "staircase": self.staircase, + "name": self.name, + } @keras_export("keras.optimizers.schedules.PiecewiseConstantDecay") class PiecewiseConstantDecay(LearningRateSchedule): - """A LearningRateSchedule that uses a piecewise constant decay schedule. - - The function returns a 1-arg callable to compute the piecewise constant - when passed the current optimizer step. This can be useful for changing the - learning rate value across different invocations of optimizer functions. - - Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 - for the next 10000 steps, and 0.1 for any additional steps. - - ```python - step = tf.Variable(0, trainable=False) - boundaries = [100000, 110000] - values = [1.0, 0.5, 0.1] - learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay( - boundaries, values) - - # Later, whenever we perform an optimization step, we pass in the step. 
- learning_rate = learning_rate_fn(step) - ``` - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. The learning rate schedule is also serializable and - deserializable using `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. - - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as the boundary tensors. - - The output of the 1-arg function that takes the `step` - is `values[0]` when `step <= boundaries[0]`, - `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ..., - and values[-1] when `step > boundaries[-1]`. - """ - - def __init__( - self, - boundaries, - values, - name=None): - """Piecewise constant from boundaries and interval values. + """A LearningRateSchedule that uses a piecewise constant decay schedule. - Args: - boundaries: A list of `Tensor`s or `int`s or `float`s with strictly - increasing entries, and with all elements having the same type as the - optimizer step. - values: A list of `Tensor`s or `float`s or `int`s that specifies the - values for the intervals defined by `boundaries`. It should have one - more element than `boundaries`, and all elements should have the same - type. - name: A string. Optional name of the operation. Defaults to - 'PiecewiseConstant'. - - Raises: - ValueError: if the number of elements in the lists do not match. + The function returns a 1-arg callable to compute the piecewise constant + when passed the current optimizer step. This can be useful for changing the + learning rate value across different invocations of optimizer functions. + + Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 + for the next 10000 steps, and 0.1 for any additional steps. + + ```python + step = tf.Variable(0, trainable=False) + boundaries = [100000, 110000] + values = [1.0, 0.5, 0.1] + learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay( + boundaries, values) + + # Later, whenever we perform an optimization step, we pass in the step. + learning_rate = learning_rate_fn(step) + ``` + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. The learning rate schedule is also serializable and + deserializable using `tf.keras.optimizers.schedules.serialize` and + `tf.keras.optimizers.schedules.deserialize`. + + Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as the boundary tensors. + + The output of the 1-arg function that takes the `step` + is `values[0]` when `step <= boundaries[0]`, + `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ..., + and values[-1] when `step > boundaries[-1]`. """ - super().__init__() - - if len(boundaries) != len(values) - 1: - raise ValueError( - "The length of boundaries should be 1 less than the length of " - f"values. 
Received: boundaries={boundaries} of length " - f"{len(boundaries)}, and values={values} of length {len(values)}.") - - self.boundaries = boundaries - self.values = values - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or "PiecewiseConstant"): - boundaries = tf.nest.map_structure(tf.convert_to_tensor, - tf.nest.flatten(self.boundaries)) - values = tf.nest.map_structure(tf.convert_to_tensor, - tf.nest.flatten(self.values)) - x_recomp = tf.convert_to_tensor(step) - for i, b in enumerate(boundaries): - if b.dtype.base_dtype != x_recomp.dtype.base_dtype: - # We cast the boundaries to have the same type as the step - b = tf.cast(b, x_recomp.dtype.base_dtype) - boundaries[i] = b - pred_fn_pairs = [] - pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0])) - pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1])) - for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]): - # Need to bind v here; can do this with lambda v=v: ... - pred = (x_recomp > low) & (x_recomp <= high) - pred_fn_pairs.append((pred, lambda v=v: v)) - - # The default isn't needed here because our conditions are mutually - # exclusive and exhaustive, but tf.case requires it. - default = lambda: values[0] - return tf.case(pred_fn_pairs, default, exclusive=True) - - def get_config(self): - return { - "boundaries": self.boundaries, - "values": self.values, - "name": self.name - } + + def __init__(self, boundaries, values, name=None): + """Piecewise constant from boundaries and interval values. + + Args: + boundaries: A list of `Tensor`s or `int`s or `float`s with strictly + increasing entries, and with all elements having the same type as + the optimizer step. + values: A list of `Tensor`s or `float`s or `int`s that specifies the + values for the intervals defined by `boundaries`. It should have one + more element than `boundaries`, and all elements should have the + same type. + name: A string. Optional name of the operation. Defaults to + 'PiecewiseConstant'. + + Raises: + ValueError: if the number of elements in the lists do not match. + """ + super().__init__() + + if len(boundaries) != len(values) - 1: + raise ValueError( + "The length of boundaries should be 1 less than the length of " + f"values. Received: boundaries={boundaries} of length " + f"{len(boundaries)}, and values={values} " + f"of length {len(values)}." + ) + + self.boundaries = boundaries + self.values = values + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "PiecewiseConstant"): + boundaries = tf.nest.map_structure( + tf.convert_to_tensor, tf.nest.flatten(self.boundaries) + ) + values = tf.nest.map_structure( + tf.convert_to_tensor, tf.nest.flatten(self.values) + ) + x_recomp = tf.convert_to_tensor(step) + for i, b in enumerate(boundaries): + if b.dtype.base_dtype != x_recomp.dtype.base_dtype: + # We cast the boundaries to have the same type as the step + b = tf.cast(b, x_recomp.dtype.base_dtype) + boundaries[i] = b + pred_fn_pairs = [] + pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0])) + pred_fn_pairs.append( + (x_recomp > boundaries[-1], lambda: values[-1]) + ) + for low, high, v in zip( + boundaries[:-1], boundaries[1:], values[1:-1] + ): + # Need to bind v here; can do this with lambda v=v: ... + pred = (x_recomp > low) & (x_recomp <= high) + pred_fn_pairs.append((pred, lambda v=v: v)) + + # The default isn't needed here because our conditions are mutually + # exclusive and exhaustive, but tf.case requires it. 
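The `lambda v=v: v` default-argument binding in the loop above is load-bearing: without it, Python's late binding would make every branch return the last interval's value. A standalone illustration of the pitfall:

```python
# Late binding: every lambda closes over the same loop variable.
fns = [lambda: v for v in (1, 2, 3)]
print([f() for f in fns])  # [3, 3, 3]

# Default-argument binding captures v at definition time.
fns = [lambda v=v: v for v in (1, 2, 3)]
print([f() for f in fns])  # [1, 2, 3]
```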
+ default = lambda: values[0] + return tf.case(pred_fn_pairs, default, exclusive=True) + + def get_config(self): + return { + "boundaries": self.boundaries, + "values": self.values, + "name": self.name, + } @keras_export("keras.optimizers.schedules.PolynomialDecay") class PolynomialDecay(LearningRateSchedule): - """A LearningRateSchedule that uses a polynomial decay schedule. - - It is commonly observed that a monotonically decreasing learning rate, whose - degree of change is carefully chosen, results in a better performing model. - This schedule applies a polynomial decay function to an optimizer step, - given a provided `initial_learning_rate`, to reach an `end_learning_rate` - in the given `decay_steps`. - - It requires a `step` value to compute the decayed learning rate. You - can just pass a TensorFlow variable that you increment at each training - step. - - The schedule is a 1-arg callable that produces a decayed learning rate - when passed the current optimizer step. This can be useful for changing the - learning rate value across different invocations of optimizer functions. - It is computed as: - - ```python - def decayed_learning_rate(step): - step = min(step, decay_steps) - return ((initial_learning_rate - end_learning_rate) * - (1 - step / decay_steps) ^ (power) - ) + end_learning_rate - ``` - - If `cycle` is True then a multiple of `decay_steps` is used, the first one - that is bigger than `step`. - - ```python - def decayed_learning_rate(step): - decay_steps = decay_steps * ceil(step / decay_steps) - return ((initial_learning_rate - end_learning_rate) * - (1 - step / decay_steps) ^ (power) - ) + end_learning_rate - ``` - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. - Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using - sqrt (i.e. power=0.5): - - ```python - ... - starter_learning_rate = 0.1 - end_learning_rate = 0.01 - decay_steps = 10000 - learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( - starter_learning_rate, - decay_steps, - end_learning_rate, - power=0.5) - - model.compile(optimizer=tf.keras.optimizers.SGD( - learning_rate=learning_rate_fn), - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - model.fit(data, labels, epochs=5) - ``` - - The learning rate schedule is also serializable and deserializable using - `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. - - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as `initial_learning_rate`. - """ - - def __init__( - self, - initial_learning_rate, - decay_steps, - end_learning_rate=0.0001, - power=1.0, - cycle=False, - name=None): - """Applies a polynomial decay to the learning rate. + """A LearningRateSchedule that uses a polynomial decay schedule. + + It is commonly observed that a monotonically decreasing learning rate, whose + degree of change is carefully chosen, results in a better performing model. + This schedule applies a polynomial decay function to an optimizer step, + given a provided `initial_learning_rate`, to reach an `end_learning_rate` + in the given `decay_steps`. + + It requires a `step` value to compute the decayed learning rate. You + can just pass a TensorFlow variable that you increment at each training + step. + + The schedule is a 1-arg callable that produces a decayed learning rate + when passed the current optimizer step. 
This can be useful for changing the + learning rate value across different invocations of optimizer functions. + It is computed as: + + ```python + def decayed_learning_rate(step): + step = min(step, decay_steps) + return ((initial_learning_rate - end_learning_rate) * + (1 - step / decay_steps) ^ (power) + ) + end_learning_rate + ``` + + If `cycle` is True then a multiple of `decay_steps` is used, the first one + that is bigger than `step`. + + ```python + def decayed_learning_rate(step): + decay_steps = decay_steps * ceil(step / decay_steps) + return ((initial_learning_rate - end_learning_rate) * + (1 - step / decay_steps) ^ (power) + ) + end_learning_rate + ``` + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. + Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using + sqrt (i.e. power=0.5): + + ```python + ... + starter_learning_rate = 0.1 + end_learning_rate = 0.01 + decay_steps = 10000 + learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( + starter_learning_rate, + decay_steps, + end_learning_rate, + power=0.5) + + model.compile(optimizer=tf.keras.optimizers.SGD( + learning_rate=learning_rate_fn), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + + model.fit(data, labels, epochs=5) + ``` + + The learning rate schedule is also serializable and deserializable using + `tf.keras.optimizers.schedules.serialize` and + `tf.keras.optimizers.schedules.deserialize`. - Args: - initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a - Python number. The initial learning rate. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. - Must be positive. See the decay computation above. - end_learning_rate: A scalar `float32` or `float64` `Tensor` or a - Python number. The minimal end learning rate. - power: A scalar `float32` or `float64` `Tensor` or a - Python number. The power of the polynomial. Defaults to linear, 1.0. - cycle: A boolean, whether or not it should cycle beyond decay_steps. - name: String. Optional name of the operation. Defaults to - 'PolynomialDecay'. + Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as `initial_learning_rate`. """ - super().__init__() - - self.initial_learning_rate = initial_learning_rate - self.decay_steps = decay_steps - self.end_learning_rate = end_learning_rate - self.power = power - self.cycle = cycle - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or "PolynomialDecay") as name: - initial_learning_rate = tf.convert_to_tensor( - self.initial_learning_rate, name="initial_learning_rate") - dtype = initial_learning_rate.dtype - end_learning_rate = tf.cast(self.end_learning_rate, dtype) - power = tf.cast(self.power, dtype) - - global_step_recomp = tf.cast(step, dtype) - decay_steps_recomp = tf.cast(self.decay_steps, dtype) - if self.cycle: - # Find the first multiple of decay_steps that is bigger than - # global_step. If global_step is zero set the multiplier to 1 - multiplier = tf.where( - tf.equal(global_step_recomp, 0), 1.0, - tf.math.ceil(global_step_recomp / self.decay_steps)) - decay_steps_recomp = tf.multiply(decay_steps_recomp, multiplier) - else: - # Make sure that the global_step used is not bigger than decay_steps. 
- global_step_recomp = tf.minimum(global_step_recomp, - decay_steps_recomp) - - p = tf.divide(global_step_recomp, decay_steps_recomp) - return tf.add( - tf.multiply(initial_learning_rate - end_learning_rate, - tf.pow(1 - p, power)), - end_learning_rate, - name=name) - - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_steps": self.decay_steps, - "end_learning_rate": self.end_learning_rate, - "power": self.power, - "cycle": self.cycle, - "name": self.name - } + + def __init__( + self, + initial_learning_rate, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + name=None, + ): + """Applies a polynomial decay to the learning rate. + + Args: + initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Must be positive. See the decay computation above. + end_learning_rate: A scalar `float32` or `float64` `Tensor` or a + Python number. The minimal end learning rate. + power: A scalar `float32` or `float64` `Tensor` or a + Python number. The power of the polynomial. Defaults to `1.0`. + cycle: A boolean, whether it should cycle beyond decay_steps. + name: String. Optional name of the operation. Defaults to + 'PolynomialDecay'. + """ + super().__init__() + + self.initial_learning_rate = initial_learning_rate + self.decay_steps = decay_steps + self.end_learning_rate = end_learning_rate + self.power = power + self.cycle = cycle + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "PolynomialDecay") as name: + initial_learning_rate = tf.convert_to_tensor( + self.initial_learning_rate, name="initial_learning_rate" + ) + dtype = initial_learning_rate.dtype + end_learning_rate = tf.cast(self.end_learning_rate, dtype) + power = tf.cast(self.power, dtype) + + global_step_recomp = tf.cast(step, dtype) + decay_steps_recomp = tf.cast(self.decay_steps, dtype) + if self.cycle: + # Find the first multiple of decay_steps that is bigger than + # global_step. If global_step is zero set the multiplier to 1 + multiplier = tf.where( + tf.equal(global_step_recomp, 0), + 1.0, + tf.math.ceil(global_step_recomp / self.decay_steps), + ) + decay_steps_recomp = tf.multiply(decay_steps_recomp, multiplier) + else: + # Make sure that the global_step used is not bigger than + # decay_steps. + global_step_recomp = tf.minimum( + global_step_recomp, decay_steps_recomp + ) + + p = tf.divide(global_step_recomp, decay_steps_recomp) + return tf.add( + tf.multiply( + initial_learning_rate - end_learning_rate, + tf.pow(1 - p, power), + ), + end_learning_rate, + name=name, + ) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_steps": self.decay_steps, + "end_learning_rate": self.end_learning_rate, + "power": self.power, + "cycle": self.cycle, + "name": self.name, + } @keras_export("keras.optimizers.schedules.InverseTimeDecay") class InverseTimeDecay(LearningRateSchedule): - """A LearningRateSchedule that uses an inverse time decay schedule. - - When training a model, it is often useful to lower the learning rate as - the training progresses. This schedule applies the inverse decay function - to an optimizer step, given a provided initial learning rate. - It requires a `step` value to compute the decayed learning rate. You can - just pass a TensorFlow variable that you increment at each training step. 
- - The schedule is a 1-arg callable that produces a decayed learning - rate when passed the current optimizer step. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - It is computed as: - - ```python - def decayed_learning_rate(step): - return initial_learning_rate / (1 + decay_rate * step / decay_step) - ``` - - or, if `staircase` is `True`, as: - - ```python - def decayed_learning_rate(step): - return initial_learning_rate / (1 + decay_rate * floor(step / decay_step)) - ``` - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. - Example: Fit a Keras model when decaying 1/t with a rate of 0.5: - - ```python - ... - initial_learning_rate = 0.1 - decay_steps = 1.0 - decay_rate = 0.5 - learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay( - initial_learning_rate, decay_steps, decay_rate) - - model.compile(optimizer=tf.keras.optimizers.SGD( - learning_rate=learning_rate_fn), - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - model.fit(data, labels, epochs=5) - ``` - - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as `initial_learning_rate`. - """ - - def __init__( - self, - initial_learning_rate, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies inverse time decay to the initial learning rate. + """A LearningRateSchedule that uses an inverse time decay schedule. + + When training a model, it is often useful to lower the learning rate as + the training progresses. This schedule applies the inverse decay function + to an optimizer step, given a provided initial learning rate. + It requires a `step` value to compute the decayed learning rate. You can + just pass a TensorFlow variable that you increment at each training step. + + The schedule is a 1-arg callable that produces a decayed learning + rate when passed the current optimizer step. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + It is computed as: + + ```python + def decayed_learning_rate(step): + return initial_learning_rate / (1 + decay_rate * step / decay_steps) + ``` + + or, if `staircase` is `True`, as: + + ```python + def decayed_learning_rate(step): + return initial_learning_rate / (1 + decay_rate * floor(step / decay_steps)) + ``` + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. + Example: Fit a Keras model when decaying 1/t with a rate of 0.5: + + ```python + ... + initial_learning_rate = 0.1 + decay_steps = 1.0 + decay_rate = 0.5 + learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay( + initial_learning_rate, decay_steps, decay_rate) + + model.compile(optimizer=tf.keras.optimizers.SGD( + learning_rate=learning_rate_fn), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + + model.fit(data, labels, epochs=5) + ``` - Args: - initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a - Python number. The initial learning rate. - decay_steps: How often to apply decay. - decay_rate: A Python number. The decay rate. - staircase: Whether to apply decay in a discrete staircase, as opposed to - continuous, fashion. - name: String. Optional name of the operation. Defaults to - 'InverseTimeDecay'.
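The inverse time decay formula above is easy to verify by hand; a standalone sketch with hypothetical values matching the docstring example:

```python
# Standalone check of the inverse time decay formula
# (hypothetical values: 1/t decay with decay_rate=0.5, decay_steps=1.0).
initial_learning_rate, decay_steps, decay_rate = 0.1, 1.0, 0.5

def decayed_learning_rate(step):
    return initial_learning_rate / (1 + decay_rate * step / decay_steps)

print(decayed_learning_rate(0))  # 0.1
print(decayed_learning_rate(1))  # 0.1 / 1.5 ~= 0.0667
print(decayed_learning_rate(4))  # 0.1 / 3.0 ~= 0.0333
```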
+ Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as `initial_learning_rate`. """ - super().__init__() - - self.initial_learning_rate = initial_learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or "InverseTimeDecay") as name: - initial_learning_rate = tf.convert_to_tensor( - self.initial_learning_rate, name="initial_learning_rate") - dtype = initial_learning_rate.dtype - decay_steps = tf.cast(self.decay_steps, dtype) - decay_rate = tf.cast(self.decay_rate, dtype) - - global_step_recomp = tf.cast(step, dtype) - p = global_step_recomp / decay_steps - if self.staircase: - p = tf.floor(p) - const = tf.cast(tf.constant(1), dtype) - denom = tf.add(const, tf.multiply(decay_rate, p)) - return tf.divide(initial_learning_rate, denom, name=name) - - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_steps": self.decay_steps, - "decay_rate": self.decay_rate, - "staircase": self.staircase, - "name": self.name - } - - -@keras_export("keras.optimizers.schedules.CosineDecay", - "keras.experimental.CosineDecay") + + def __init__( + self, + initial_learning_rate, + decay_steps, + decay_rate, + staircase=False, + name=None, + ): + """Applies inverse time decay to the initial learning rate. + + Args: + initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. + decay_steps: How often to apply decay. + decay_rate: A Python number. The decay rate. + staircase: Whether to apply decay in a discrete staircase, as opposed + to continuous, fashion. + name: String. Optional name of the operation. Defaults to + 'InverseTimeDecay'. + """ + super().__init__() + + self.initial_learning_rate = initial_learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "InverseTimeDecay") as name: + initial_learning_rate = tf.convert_to_tensor( + self.initial_learning_rate, name="initial_learning_rate" + ) + dtype = initial_learning_rate.dtype + decay_steps = tf.cast(self.decay_steps, dtype) + decay_rate = tf.cast(self.decay_rate, dtype) + + global_step_recomp = tf.cast(step, dtype) + p = global_step_recomp / decay_steps + if self.staircase: + p = tf.floor(p) + const = tf.cast(tf.constant(1), dtype) + denom = tf.add(const, tf.multiply(decay_rate, p)) + return tf.divide(initial_learning_rate, denom, name=name) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_steps": self.decay_steps, + "decay_rate": self.decay_rate, + "staircase": self.staircase, + "name": self.name, + } + + +@keras_export( + "keras.optimizers.schedules.CosineDecay", "keras.experimental.CosineDecay" +) class CosineDecay(LearningRateSchedule): - """A LearningRateSchedule that uses a cosine decay schedule. - - See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), - SGDR: Stochastic Gradient Descent with Warm Restarts. - - When training a model, it is often useful to lower the learning rate as - the training progresses. This schedule applies a cosine decay function - to an optimizer step, given a provided initial learning rate. - It requires a `step` value to compute the decayed learning rate. 
You can - just pass a TensorFlow variable that you increment at each training step. - - The schedule is a 1-arg callable that produces a decayed learning - rate when passed the current optimizer step. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - It is computed as: - - ```python - def decayed_learning_rate(step): - step = min(step, decay_steps) - cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps)) - decayed = (1 - alpha) * cosine_decay + alpha - return initial_learning_rate * decayed - ``` - - Example usage: - ```python - decay_steps = 1000 - lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay( - initial_learning_rate, decay_steps) - ``` - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. The learning rate schedule is also serializable and - deserializable using `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. - - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as `initial_learning_rate`. - """ - - def __init__( - self, - initial_learning_rate, - decay_steps, - alpha=0.0, - name=None): - """Applies cosine decay to the learning rate. + """A LearningRateSchedule that uses a cosine decay with optional warmup. + + See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), + SGDR: Stochastic Gradient Descent with Warm Restarts. + + For the idea of a linear warmup of our learning rate, + see [Goyal et al.](https://arxiv.org/pdf/1706.02677.pdf). + + When we begin training a model, we often want an initial increase in our + learning rate followed by a decay. If `warmup_target` is not None, this + schedule applies a linear increase per optimizer step to our learning rate + from `initial_learning_rate` to `warmup_target` for a duration of + `warmup_steps`. Afterwards, it applies a cosine decay function taking our + learning rate from `warmup_target` to `alpha` for a duration of + `decay_steps`. If `warmup_target` is None we skip warmup and our decay + will take our learning rate from `initial_learning_rate` to `alpha`. + It requires a `step` value to compute the learning rate. You can + just pass a TensorFlow variable that you increment at each training step. + + The schedule is a 1-arg callable that produces a warmup followed by a + decayed learning rate when passed the current optimizer step. This can be + useful for changing the learning rate value across different invocations of + optimizer functions.
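A standalone numeric trace of the warmup-then-decay behaviour described above may help; this is a sketch with hypothetical values (warm up from 0 to 0.1 over 1000 steps, then cosine-decay back to 0 over the next 1000), not the class itself:

```python
import math

# Hypothetical parameters for tracing the warmup + cosine decay schedule.
initial_learning_rate, warmup_target = 0.0, 0.1
warmup_steps, decay_steps, alpha = 1000, 1000, 0.0

def lr(step):
    if step < warmup_steps:
        # Linear warmup from initial_learning_rate to warmup_target.
        fraction = step / warmup_steps
        return initial_learning_rate + (warmup_target - initial_learning_rate) * fraction
    # Cosine decay from warmup_target down to alpha * warmup_target.
    step = min(step - warmup_steps, decay_steps)
    cosine = 0.5 * (1 + math.cos(math.pi * step / decay_steps))
    return warmup_target * ((1 - alpha) * cosine + alpha)

print(lr(500))   # 0.05 -- halfway through warmup
print(lr(1000))  # 0.1  -- warmup done, decay begins
print(lr(1500))  # 0.05 -- halfway through the cosine decay
print(lr(2000))  # 0.0  -- fully decayed (alpha == 0)
```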
+ + Our warmup is computed as: + + ```python + def warmup_learning_rate(step): + completed_fraction = step / warmup_steps + total_delta = warmup_target - initial_learning_rate + return completed_fraction * total_delta + initial_learning_rate + ``` + + And our decay is computed as: + + ```python + if warmup_target is None: + initial_decay_lr = initial_learning_rate + else: + initial_decay_lr = warmup_target + + def decayed_learning_rate(step): + step = min(step, decay_steps) + cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps)) + decayed = (1 - alpha) * cosine_decay + alpha + return initial_decay_lr * decayed + ``` + + Example usage without warmup: + + ```python + decay_steps = 1000 + initial_learning_rate = 0.1 + lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay( + initial_learning_rate, decay_steps) + ``` + + Example usage with warmup: + + ```python + decay_steps = 1000 + initial_learning_rate = 0 + warmup_steps = 1000 + target_learning_rate = 0.1 + lr_warmup_decayed_fn = tf.keras.optimizers.schedules.CosineDecay( + initial_learning_rate, decay_steps, warmup_target=target_learning_rate, + warmup_steps=warmup_steps + ) + ``` + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. The learning rate schedule is also serializable and + deserializable using `tf.keras.optimizers.schedules.serialize` and + `tf.keras.optimizers.schedules.deserialize`. - Args: - initial_learning_rate: A scalar `float32` or `float64` Tensor or a - Python number. The initial learning rate. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. - Number of steps to decay over. - alpha: A scalar `float32` or `float64` Tensor or a Python number. - Minimum learning rate value as a fraction of initial_learning_rate. - name: String. Optional name of the operation. Defaults to 'CosineDecay'. + Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as `initial_learning_rate`. """ - super().__init__() - - self.initial_learning_rate = initial_learning_rate - self.decay_steps = decay_steps - self.alpha = alpha - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or "CosineDecay"): - initial_learning_rate = tf.convert_to_tensor( - self.initial_learning_rate, name="initial_learning_rate") - dtype = initial_learning_rate.dtype - decay_steps = tf.cast(self.decay_steps, dtype) - - global_step_recomp = tf.cast(step, dtype) - global_step_recomp = tf.minimum(global_step_recomp, decay_steps) - completed_fraction = global_step_recomp / decay_steps - cosine_decayed = 0.5 * (1.0 + tf.cos( - tf.constant(math.pi, dtype=dtype) * completed_fraction)) - - decayed = (1 - self.alpha) * cosine_decayed + self.alpha - return tf.multiply(initial_learning_rate, decayed) - - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_steps": self.decay_steps, - "alpha": self.alpha, - "name": self.name - } - - -@keras_export("keras.optimizers.schedules.CosineDecayRestarts", - "keras.experimental.CosineDecayRestarts") -class CosineDecayRestarts(LearningRateSchedule): - """A LearningRateSchedule that uses a cosine decay schedule with restarts. - - See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), - SGDR: Stochastic Gradient Descent with Warm Restarts. - - When training a model, it is often useful to lower the learning rate as - the training progresses.
This schedule applies a cosine decay function with - restarts to an optimizer step, given a provided initial learning rate. - It requires a `step` value to compute the decayed learning rate. You can - just pass a TensorFlow variable that you increment at each training step. - - The schedule is a 1-arg callable that produces a decayed learning - rate when passed the current optimizer step. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - - The learning rate multiplier first decays - from 1 to `alpha` for `first_decay_steps` steps. Then, a warm - restart is performed. Each new warm restart runs for `t_mul` times more - steps and with `m_mul` times initial learning rate as the new learning rate. - - Example usage: - ```python - first_decay_steps = 1000 - lr_decayed_fn = ( - tf.keras.optimizers.schedules.CosineDecayRestarts( + + def __init__( + self, initial_learning_rate, - first_decay_steps)) - ``` - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. The learning rate schedule is also serializable and - deserializable using `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. - - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as `initial_learning_rate`. - """ - - def __init__( - self, - initial_learning_rate, - first_decay_steps, - t_mul=2.0, - m_mul=1.0, - alpha=0.0, - name=None): - """Applies cosine decay with restarts to the learning rate. + decay_steps, + alpha=0.0, + name=None, + warmup_target=None, + warmup_steps=0, + ): + """Applies cosine decay to the learning rate. + + Args: + initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a + Python int. The initial learning rate. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python int. + Number of steps to decay over. + alpha: A scalar `float32` or `float64` `Tensor` or a Python int. + Minimum learning rate value for decay as a fraction of + `initial_learning_rate`. + name: String. Optional name of the operation. Defaults to + 'CosineDecay'. + warmup_target: None or a scalar `float32` or `float64` `Tensor` or a + Python int. The target learning rate for our warmup phase. Will be cast + to the `initial_learning_rate` datatype. Setting to None will skip + warmup and begin the decay phase from `initial_learning_rate`. + Otherwise the scheduler will warm up from `initial_learning_rate` to + `warmup_target`. + warmup_steps: A scalar `int32` or `int64` `Tensor` or a Python int. + Number of steps to warmup over.
+ """ + super().__init__() + + self.initial_learning_rate = initial_learning_rate + self.decay_steps = decay_steps + self.alpha = alpha + self.name = name + self.warmup_steps = warmup_steps + self.warmup_target = warmup_target + + def _decay_function(self, step, decay_steps, decay_from_lr, dtype): + with tf.name_scope(self.name or "CosineDecay"): + completed_fraction = step / decay_steps + tf_pi = tf.constant(math.pi, dtype=dtype) + cosine_decayed = 0.5 * (1.0 + tf.cos(tf_pi * completed_fraction)) + decayed = (1 - self.alpha) * cosine_decayed + self.alpha + return tf.multiply(decay_from_lr, decayed) + + def _warmup_function( + self, step, warmup_steps, warmup_target, initial_learning_rate + ): + with tf.name_scope(self.name or "CosineDecay"): + completed_fraction = step / warmup_steps + total_step_delta = warmup_target - initial_learning_rate + return total_step_delta * completed_fraction + initial_learning_rate - Args: - initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python - number. The initial learning rate. - first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python - number. Number of steps to decay over. - t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. - Used to derive the number of iterations in the i-th period. - m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. - Used to derive the initial learning rate of the i-th period. - alpha: A scalar `float32` or `float64` Tensor or a Python number. - Minimum learning rate value as a fraction of the initial_learning_rate. - name: String. Optional name of the operation. Defaults to 'SGDRDecay'. + def __call__(self, step): + with tf.name_scope(self.name or "CosineDecay"): + initial_learning_rate = tf.convert_to_tensor( + self.initial_learning_rate, name="initial_learning_rate" + ) + dtype = initial_learning_rate.dtype + decay_steps = tf.cast(self.decay_steps, dtype) + global_step_recomp = tf.cast(step, dtype) + + if self.warmup_target is None: + global_step_recomp = tf.minimum(global_step_recomp, decay_steps) + return self._decay_function( + global_step_recomp, + decay_steps, + initial_learning_rate, + dtype, + ) + + warmup_target = tf.cast(self.warmup_target, dtype) + warmup_steps = tf.cast(self.warmup_steps, dtype) + + global_step_recomp = tf.minimum( + global_step_recomp, decay_steps + warmup_steps + ) + + return tf.cond( + global_step_recomp < warmup_steps, + lambda: self._warmup_function( + global_step_recomp, + warmup_steps, + warmup_target, + initial_learning_rate, + ), + lambda: self._decay_function( + global_step_recomp - warmup_steps, + decay_steps, + warmup_target, + dtype, + ), + ) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_steps": self.decay_steps, + "alpha": self.alpha, + "name": self.name, + "warmup_target": self.warmup_target, + "warmup_steps": self.warmup_steps, + } + + +@keras_export( + "keras.optimizers.schedules.CosineDecayRestarts", + "keras.experimental.CosineDecayRestarts", +) +class CosineDecayRestarts(LearningRateSchedule): + """A LearningRateSchedule that uses a cosine decay schedule with restarts. + + See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), + SGDR: Stochastic Gradient Descent with Warm Restarts. + + When training a model, it is often useful to lower the learning rate as + the training progresses. This schedule applies a cosine decay function with + restarts to an optimizer step, given a provided initial learning rate. 
+ It requires a `step` value to compute the decayed learning rate. You can + just pass a TensorFlow variable that you increment at each training step. + + The schedule is a 1-arg callable that produces a decayed learning + rate when passed the current optimizer step. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + + The learning rate multiplier first decays + from 1 to `alpha` for `first_decay_steps` steps. Then, a warm + restart is performed. Each new warm restart runs for `t_mul` times more + steps and with `m_mul` times initial learning rate as the new learning rate. + + Example usage: + ```python + first_decay_steps = 1000 + lr_decayed_fn = ( + tf.keras.optimizers.schedules.CosineDecayRestarts( + initial_learning_rate, + first_decay_steps)) + ``` + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. The learning rate schedule is also serializable and + deserializable using `tf.keras.optimizers.schedules.serialize` and + `tf.keras.optimizers.schedules.deserialize`. + + Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as `initial_learning_rate`. """ - super().__init__() - - self.initial_learning_rate = initial_learning_rate - self.first_decay_steps = first_decay_steps - self._t_mul = t_mul - self._m_mul = m_mul - self.alpha = alpha - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or "SGDRDecay") as name: - initial_learning_rate = tf.convert_to_tensor( - self.initial_learning_rate, name="initial_learning_rate") - dtype = initial_learning_rate.dtype - first_decay_steps = tf.cast(self.first_decay_steps, dtype) - alpha = tf.cast(self.alpha, dtype) - t_mul = tf.cast(self._t_mul, dtype) - m_mul = tf.cast(self._m_mul, dtype) - - global_step_recomp = tf.cast(step, dtype) - completed_fraction = global_step_recomp / first_decay_steps - - def compute_step(completed_fraction, geometric=False): - """Helper for `cond` operation.""" - if geometric: - i_restart = tf.floor( - tf.math.log(1.0 - completed_fraction * (1.0 - t_mul)) / - tf.math.log(t_mul)) - - sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) - completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart - - else: - i_restart = tf.floor(completed_fraction) - completed_fraction -= i_restart - - return i_restart, completed_fraction - - i_restart, completed_fraction = tf.cond( - tf.equal(t_mul, 1.0), - lambda: compute_step(completed_fraction, geometric=False), - lambda: compute_step(completed_fraction, geometric=True)) - - m_fac = m_mul**i_restart - cosine_decayed = 0.5 * m_fac * (1.0 + tf.cos( - tf.constant(math.pi, dtype=dtype) * completed_fraction)) - decayed = (1 - alpha) * cosine_decayed + alpha - - return tf.multiply(initial_learning_rate, decayed, name=name) - - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "first_decay_steps": self.first_decay_steps, - "t_mul": self._t_mul, - "m_mul": self._m_mul, - "alpha": self.alpha, - "name": self.name - } + + def __init__( + self, + initial_learning_rate, + first_decay_steps, + t_mul=2.0, + m_mul=1.0, + alpha=0.0, + name=None, + ): + """Applies cosine decay with restarts to the learning rate. + + Args: + initial_learning_rate: A scalar `float32` or `float64` Tensor or a + Python number. The initial learning rate. 
+ first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python + number. Number of steps to decay over. + t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. + Used to derive the number of iterations in the i-th period. + m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. + Used to derive the initial learning rate of the i-th period. + alpha: A scalar `float32` or `float64` Tensor or a Python number. + Minimum learning rate value as a fraction of the + initial_learning_rate. + name: String. Optional name of the operation. Defaults to 'SGDRDecay'. + """ + super().__init__() + + self.initial_learning_rate = initial_learning_rate + self.first_decay_steps = first_decay_steps + self._t_mul = t_mul + self._m_mul = m_mul + self.alpha = alpha + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "SGDRDecay") as name: + initial_learning_rate = tf.convert_to_tensor( + self.initial_learning_rate, name="initial_learning_rate" + ) + dtype = initial_learning_rate.dtype + first_decay_steps = tf.cast(self.first_decay_steps, dtype) + alpha = tf.cast(self.alpha, dtype) + t_mul = tf.cast(self._t_mul, dtype) + m_mul = tf.cast(self._m_mul, dtype) + + global_step_recomp = tf.cast(step, dtype) + completed_fraction = global_step_recomp / first_decay_steps + + def compute_step(completed_fraction, geometric=False): + """Helper for `cond` operation.""" + if geometric: + i_restart = tf.floor( + tf.math.log(1.0 - completed_fraction * (1.0 - t_mul)) + / tf.math.log(t_mul) + ) + + sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) + completed_fraction = ( + completed_fraction - sum_r + ) / t_mul**i_restart + + else: + i_restart = tf.floor(completed_fraction) + completed_fraction -= i_restart + + return i_restart, completed_fraction + + i_restart, completed_fraction = tf.cond( + tf.equal(t_mul, 1.0), + lambda: compute_step(completed_fraction, geometric=False), + lambda: compute_step(completed_fraction, geometric=True), + ) + + m_fac = m_mul**i_restart + cosine_decayed = ( + 0.5 + * m_fac + * ( + 1.0 + + tf.cos( + tf.constant(math.pi, dtype=dtype) * completed_fraction + ) + ) + ) + decayed = (1 - alpha) * cosine_decayed + alpha + + return tf.multiply(initial_learning_rate, decayed, name=name) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "first_decay_steps": self.first_decay_steps, + "t_mul": self._t_mul, + "m_mul": self._m_mul, + "alpha": self.alpha, + "name": self.name, + } # Note: this code is still used by V1 APIs. class LinearCosineDecay(LearningRateSchedule): - """A LearningRateSchedule that uses a linear cosine decay schedule. - - See [Bello et al., ICML2017] Neural Optimizer Search with RL. - https://arxiv.org/abs/1709.07417 - - For the idea of warm starts here controlled by `num_periods`, - see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent - with Warm Restarts. https://arxiv.org/abs/1608.03983 - - Note that linear cosine decay is more aggressive than cosine decay and - larger initial learning rates can typically be used. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This schedule applies a linear cosine decay - function to an optimizer step, given a provided initial learning rate. - It requires a `step` value to compute the decayed learning rate. You can - just pass a TensorFlow variable that you increment at each training step. 
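Stepping back to the `CosineDecayRestarts` diff above: the restart geometry driven by `t_mul` and `m_mul` is easy to tabulate. A sketch under hypothetical parameters (first_decay_steps=1000, t_mul=2.0, m_mul=0.5):

```python
# Hypothetical illustration of CosineDecayRestarts restart periods: each
# restart runs t_mul times longer and restarts from m_mul times the
# previous peak learning rate.
first_decay_steps, t_mul, m_mul, lr0 = 1000, 2.0, 0.5, 0.1

steps, peak = first_decay_steps, lr0
for i in range(4):
    print(f"period {i}: {int(steps)} steps, peak lr {peak:.4f}")
    steps *= t_mul
    peak *= m_mul
# period 0: 1000 steps, peak lr 0.1000
# period 1: 2000 steps, peak lr 0.0500
# period 2: 4000 steps, peak lr 0.0250
# period 3: 8000 steps, peak lr 0.0125
```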
- - The schedule is a 1-arg callable that produces a decayed learning - rate when passed the current optimizer step. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - It is computed as: - - ```python - def decayed_learning_rate(step): - step = min(step, decay_steps) - linear_decay = (decay_steps - step) / decay_steps - cosine_decay = 0.5 * ( - 1 + cos(pi * 2 * num_periods * step / decay_steps)) - decayed = (alpha + linear_decay) * cosine_decay + beta - return initial_learning_rate * decayed - ``` - - Example usage: - ```python - decay_steps = 1000 - lr_decayed_fn = ( - tf.keras.experimental.LinearCosineDecay( - initial_learning_rate, decay_steps)) - ``` - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. The learning rate schedule is also serializable and - deserializable using `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. - - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as `initial_learning_rate`. - """ - - def __init__( - self, - initial_learning_rate, - decay_steps, - num_periods=0.5, - alpha=0.0, - beta=0.001, - name=None): - """Applies linear cosine decay to the learning rate. + """A LearningRateSchedule that uses a linear cosine decay schedule. + + See [Bello et al., ICML2017] Neural Optimizer Search with RL. + https://arxiv.org/abs/1709.07417 + + For the idea of warm starts here controlled by `num_periods`, + see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent + with Warm Restarts. https://arxiv.org/abs/1608.03983 + + Note that linear cosine decay is more aggressive than cosine decay and + larger initial learning rates can typically be used. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This schedule applies a linear cosine decay + function to an optimizer step, given a provided initial learning rate. + It requires a `step` value to compute the decayed learning rate. You can + just pass a TensorFlow variable that you increment at each training step. + + The schedule is a 1-arg callable that produces a decayed learning + rate when passed the current optimizer step. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + It is computed as: + + ```python + def decayed_learning_rate(step): + step = min(step, decay_steps) + linear_decay = (decay_steps - step) / decay_steps + cosine_decay = 0.5 * ( + 1 + cos(pi * 2 * num_periods * step / decay_steps)) + decayed = (alpha + linear_decay) * cosine_decay + beta + return initial_learning_rate * decayed + ``` + + Example usage: + ```python + decay_steps = 1000 + lr_decayed_fn = ( + tf.keras.experimental.LinearCosineDecay( + initial_learning_rate, decay_steps)) + ``` + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. The learning rate schedule is also serializable and + deserializable using `tf.keras.optimizers.schedules.serialize` and + `tf.keras.optimizers.schedules.deserialize`. - Args: - initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python - number. The initial learning rate. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. - Number of steps to decay over. - num_periods: Number of periods in the cosine part of the decay. 
- See computation above. - alpha: See computation above. - beta: See computation above. - name: String. Optional name of the operation. Defaults to - 'LinearCosineDecay'. + Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as `initial_learning_rate`. """ - super().__init__() - - self.initial_learning_rate = initial_learning_rate - self.decay_steps = decay_steps - self.num_periods = num_periods - self.alpha = alpha - self.beta = beta - self.name = name - - def __call__(self, step): - with tf.name_scope(self.name or "LinearCosineDecay") as name: - initial_learning_rate = tf.convert_to_tensor( - self.initial_learning_rate, name="initial_learning_rate") - dtype = initial_learning_rate.dtype - decay_steps = tf.cast(self.decay_steps, dtype) - num_periods = tf.cast(self.num_periods, dtype) - alpha = tf.cast(self.alpha, dtype) - beta = tf.cast(self.beta, dtype) - - global_step_recomp = tf.cast(step, dtype) - global_step_recomp = tf.minimum(global_step_recomp, decay_steps) - linear_decayed = (decay_steps - global_step_recomp) / decay_steps - completed_fraction = global_step_recomp / decay_steps - fraction = 2.0 * num_periods * completed_fraction - cosine_decayed = 0.5 * ( - 1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction)) - - linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta - return tf.multiply(initial_learning_rate, linear_cosine_decayed, - name=name) - - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_steps": self.decay_steps, - "num_periods": self.num_periods, - "alpha": self.alpha, - "beta": self.beta, - "name": self.name - } + + def __init__( + self, + initial_learning_rate, + decay_steps, + num_periods=0.5, + alpha=0.0, + beta=0.001, + name=None, + ): + """Applies linear cosine decay to the learning rate. + + Args: + initial_learning_rate: A scalar `float32` or `float64` Tensor or a + Python number. The initial learning rate. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Number of steps to decay over. + num_periods: Number of periods in the cosine part of the decay. + See computation above. + alpha: See computation above. + beta: See computation above. + name: String. Optional name of the operation. Defaults to + 'LinearCosineDecay'. 
+ """ + super().__init__() + + self.initial_learning_rate = initial_learning_rate + self.decay_steps = decay_steps + self.num_periods = num_periods + self.alpha = alpha + self.beta = beta + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "LinearCosineDecay") as name: + initial_learning_rate = tf.convert_to_tensor( + self.initial_learning_rate, name="initial_learning_rate" + ) + dtype = initial_learning_rate.dtype + decay_steps = tf.cast(self.decay_steps, dtype) + num_periods = tf.cast(self.num_periods, dtype) + alpha = tf.cast(self.alpha, dtype) + beta = tf.cast(self.beta, dtype) + + global_step_recomp = tf.cast(step, dtype) + global_step_recomp = tf.minimum(global_step_recomp, decay_steps) + linear_decayed = (decay_steps - global_step_recomp) / decay_steps + completed_fraction = global_step_recomp / decay_steps + fraction = 2.0 * num_periods * completed_fraction + cosine_decayed = 0.5 * ( + 1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction) + ) + + linear_cosine_decayed = ( + alpha + linear_decayed + ) * cosine_decayed + beta + return tf.multiply( + initial_learning_rate, linear_cosine_decayed, name=name + ) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_steps": self.decay_steps, + "num_periods": self.num_periods, + "alpha": self.alpha, + "beta": self.beta, + "name": self.name, + } # Note: this code is still used by V1 APIs. class NoisyLinearCosineDecay(LearningRateSchedule): - """A LearningRateSchedule that uses a noisy linear cosine decay schedule. - - See [Bello et al., ICML2017] Neural Optimizer Search with RL. - https://arxiv.org/abs/1709.07417 - - For the idea of warm starts here controlled by `num_periods`, - see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent - with Warm Restarts. https://arxiv.org/abs/1608.03983 - - Note that linear cosine decay is more aggressive than cosine decay and - larger initial learning rates can typically be used. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This schedule applies a noisy linear cosine decay - function to an optimizer step, given a provided initial learning rate. - It requires a `step` value to compute the decayed learning rate. You can - just pass a TensorFlow variable that you increment at each training step. - - The schedule is a 1-arg callable that produces a decayed learning - rate when passed the current optimizer step. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - It is computed as: - - ```python - def decayed_learning_rate(step): - step = min(step, decay_steps) - linear_decay = (decay_steps - step) / decay_steps) - cosine_decay = 0.5 * ( - 1 + cos(pi * 2 * num_periods * step / decay_steps)) - decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta - return initial_learning_rate * decayed - ``` - where eps_t is 0-centered gaussian noise with variance - initial_variance / (1 + global_step) ** variance_decay - - Example usage: - ```python - decay_steps = 1000 - lr_decayed_fn = ( - tf.keras.experimental.NoisyLinearCosineDecay( - initial_learning_rate, decay_steps)) - ``` - - You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` - as the learning rate. The learning rate schedule is also serializable and - deserializable using `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. 
- - Returns: - A 1-arg callable learning rate schedule that takes the current optimizer - step and outputs the decayed learning rate, a scalar `Tensor` of the same - type as `initial_learning_rate`. - """ - - def __init__( - self, - initial_learning_rate, - decay_steps, - initial_variance=1.0, - variance_decay=0.55, - num_periods=0.5, - alpha=0.0, - beta=0.001, - seed=None, - name=None): - """Applies noisy linear cosine decay to the learning rate. + """A LearningRateSchedule that uses a noisy linear cosine decay schedule. + + See [Bello et al., ICML2017] Neural Optimizer Search with RL. + https://arxiv.org/abs/1709.07417 + + For the idea of warm starts here controlled by `num_periods`, + see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent + with Warm Restarts. https://arxiv.org/abs/1608.03983 + + Note that linear cosine decay is more aggressive than cosine decay and + larger initial learning rates can typically be used. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This schedule applies a noisy linear cosine decay + function to an optimizer step, given a provided initial learning rate. + It requires a `step` value to compute the decayed learning rate. You can + just pass a TensorFlow variable that you increment at each training step. + + The schedule is a 1-arg callable that produces a decayed learning + rate when passed the current optimizer step. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + It is computed as: + + ```python + def decayed_learning_rate(step): + step = min(step, decay_steps) + linear_decay = (decay_steps - step) / decay_steps + cosine_decay = 0.5 * ( + 1 + cos(pi * 2 * num_periods * step / decay_steps)) + decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta + return initial_learning_rate * decayed + ``` + where eps_t is 0-centered Gaussian noise with variance + initial_variance / (1 + global_step) ** variance_decay + + Example usage: + ```python + decay_steps = 1000 + lr_decayed_fn = ( + tf.keras.experimental.NoisyLinearCosineDecay( + initial_learning_rate, decay_steps)) + ``` + + You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` + as the learning rate. The learning rate schedule is also serializable and + deserializable using `tf.keras.optimizers.schedules.serialize` and + `tf.keras.optimizers.schedules.deserialize`. - Args: - initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python - number. The initial learning rate. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. - Number of steps to decay over. - initial_variance: initial variance for the noise. See computation above. - variance_decay: decay for the noise's variance. See computation above. - num_periods: Number of periods in the cosine part of the decay. - See computation above. - alpha: See computation above. - beta: See computation above. - seed: Integer, optional random seed to enable deterministic behavior. - name: String. Optional name of the operation. Defaults to - 'NoisyLinearCosineDecay'. + Returns: + A 1-arg callable learning rate schedule that takes the current optimizer + step and outputs the decayed learning rate, a scalar `Tensor` of the same + type as `initial_learning_rate`.
""" - super().__init__() - - self.initial_learning_rate = initial_learning_rate - self.decay_steps = decay_steps - self.initial_variance = initial_variance - self.variance_decay = variance_decay - self.num_periods = num_periods - self.alpha = alpha - self.beta = beta - self.seed = seed - self.name = name - self._random_generator = backend.RandomGenerator(seed) - - def __call__(self, step): - with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name: - initial_learning_rate = tf.convert_to_tensor( - self.initial_learning_rate, name="initial_learning_rate") - dtype = initial_learning_rate.dtype - decay_steps = tf.cast(self.decay_steps, dtype) - initial_variance = tf.cast(self.initial_variance, dtype) - variance_decay = tf.cast(self.variance_decay, dtype) - num_periods = tf.cast(self.num_periods, dtype) - alpha = tf.cast(self.alpha, dtype) - beta = tf.cast(self.beta, dtype) - - global_step_recomp = tf.cast(step, dtype) - global_step_recomp = tf.minimum(global_step_recomp, decay_steps) - linear_decayed = (decay_steps - global_step_recomp) / decay_steps - variance = initial_variance / ( - tf.pow(1.0 + global_step_recomp, variance_decay)) - std = tf.sqrt(variance) - noisy_linear_decayed = ( - linear_decayed + self._random_generator.random_normal( - linear_decayed.shape, stddev=std)) - - completed_fraction = global_step_recomp / decay_steps - fraction = 2.0 * num_periods * completed_fraction - cosine_decayed = 0.5 * ( - 1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction)) - noisy_linear_cosine_decayed = ( - (alpha + noisy_linear_decayed) * cosine_decayed + beta) - - return tf.multiply( - initial_learning_rate, noisy_linear_cosine_decayed, name=name) - - def get_config(self): - return { - "initial_learning_rate": self.initial_learning_rate, - "decay_steps": self.decay_steps, - "initial_variance": self.initial_variance, - "variance_decay": self.variance_decay, - "num_periods": self.num_periods, - "alpha": self.alpha, - "beta": self.beta, - "seed": self.seed, - "name": self.name, - } + + def __init__( + self, + initial_learning_rate, + decay_steps, + initial_variance=1.0, + variance_decay=0.55, + num_periods=0.5, + alpha=0.0, + beta=0.001, + seed=None, + name=None, + ): + """Applies noisy linear cosine decay to the learning rate. + + Args: + initial_learning_rate: A scalar `float32` or `float64` Tensor or a + Python number. The initial learning rate. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Number of steps to decay over. + initial_variance: initial variance for the noise. See computation + above. + variance_decay: decay for the noise's variance. See computation above. + num_periods: Number of periods in the cosine part of the decay. + See computation above. + alpha: See computation above. + beta: See computation above. + seed: Integer, optional random seed to enable deterministic behavior. + name: String. Optional name of the operation. Defaults to + 'NoisyLinearCosineDecay'. 
+ """ + super().__init__() + + self.initial_learning_rate = initial_learning_rate + self.decay_steps = decay_steps + self.initial_variance = initial_variance + self.variance_decay = variance_decay + self.num_periods = num_periods + self.alpha = alpha + self.beta = beta + self.seed = seed + self.name = name + self._random_generator = backend.RandomGenerator(seed) + + def __call__(self, step): + with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name: + initial_learning_rate = tf.convert_to_tensor( + self.initial_learning_rate, name="initial_learning_rate" + ) + dtype = initial_learning_rate.dtype + decay_steps = tf.cast(self.decay_steps, dtype) + initial_variance = tf.cast(self.initial_variance, dtype) + variance_decay = tf.cast(self.variance_decay, dtype) + num_periods = tf.cast(self.num_periods, dtype) + alpha = tf.cast(self.alpha, dtype) + beta = tf.cast(self.beta, dtype) + + global_step_recomp = tf.cast(step, dtype) + global_step_recomp = tf.minimum(global_step_recomp, decay_steps) + linear_decayed = (decay_steps - global_step_recomp) / decay_steps + variance = initial_variance / ( + tf.pow(1.0 + global_step_recomp, variance_decay) + ) + std = tf.sqrt(variance) + noisy_linear_decayed = ( + linear_decayed + + self._random_generator.random_normal( + linear_decayed.shape, stddev=std + ) + ) + + completed_fraction = global_step_recomp / decay_steps + fraction = 2.0 * num_periods * completed_fraction + cosine_decayed = 0.5 * ( + 1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction) + ) + noisy_linear_cosine_decayed = ( + alpha + noisy_linear_decayed + ) * cosine_decayed + beta + + return tf.multiply( + initial_learning_rate, noisy_linear_cosine_decayed, name=name + ) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_steps": self.decay_steps, + "initial_variance": self.initial_variance, + "variance_decay": self.variance_decay, + "num_periods": self.num_periods, + "alpha": self.alpha, + "beta": self.beta, + "seed": self.seed, + "name": self.name, + } @keras_export("keras.optimizers.schedules.serialize") -def serialize(learning_rate_schedule): - """Serializes a `LearningRateSchedule` into a JSON-compatible representation. +def serialize(learning_rate_schedule, use_legacy_format=False): + """Serializes a `LearningRateSchedule` into a JSON-compatible dict. - Args: - learning_rate_schedule: The `LearningRateSchedule` object to serialize. + Args: + learning_rate_schedule: The `LearningRateSchedule` object to serialize. - Returns: - A JSON-serializable dict representing the object's config. + Returns: + A JSON-serializable dict representing the object's config. - Example: + Example: - >>> lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( - ... 0.1, decay_steps=100000, decay_rate=0.96, staircase=True) - >>> tf.keras.optimizers.schedules.serialize(lr_schedule) - {'class_name': 'ExponentialDecay', 'config': {...}} - """ - return generic_utils.serialize_keras_object(learning_rate_schedule) + >>> lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( + ... 
0.1, decay_steps=100000, decay_rate=0.96, staircase=True) + >>> tf.keras.optimizers.schedules.serialize(lr_schedule) + {'module': 'keras.optimizers.schedules', + 'class_name': 'ExponentialDecay', 'config': {...}, + 'registered_name': None} + """ + if use_legacy_format: + return legacy_serialization.serialize_keras_object( + learning_rate_schedule + ) + + return serialization_lib.serialize_keras_object(learning_rate_schedule) @keras_export("keras.optimizers.schedules.deserialize") -def deserialize(config, custom_objects=None): - """Instantiates a `LearningRateSchedule` object from a serialized form. - - Args: - config: The serialized form of the `LearningRateSchedule`. - Dictionary of the form {'class_name': str, 'config': dict}. - custom_objects: A dictionary mapping class names (or function names) of - custom (non-Keras) objects to class/functions. - - Returns: - A `LearningRateSchedule` object. - - Example: - - ```python - # Configuration for PolynomialDecay - config = { - 'class_name': 'PolynomialDecay', - 'config': {'cycle': False, - 'decay_steps': 10000, - 'end_learning_rate': 0.01, - 'initial_learning_rate': 0.1, - 'name': None, - 'power': 0.5}} - lr_schedule = tf.keras.optimizers.schedules.deserialize(config) - ``` - """ - return generic_utils.deserialize_keras_object( - config, - module_objects=globals(), - custom_objects=custom_objects, - printable_module_name="decay") +def deserialize(config, custom_objects=None, use_legacy_format=False): + """Instantiates a `LearningRateSchedule` object from a serialized form. + + Args: + config: The serialized form of the `LearningRateSchedule`. + Dictionary of the form {'class_name': str, 'config': dict}. + custom_objects: A dictionary mapping class names (or function names) of + custom (non-Keras) objects to class/functions. + + Returns: + A `LearningRateSchedule` object. 
+ + Example: + + ```python + # Configuration for PolynomialDecay + config = { + 'class_name': 'PolynomialDecay', + 'config': {'cycle': False, + 'decay_steps': 10000, + 'end_learning_rate': 0.01, + 'initial_learning_rate': 0.1, + 'name': None, + 'power': 0.5}} + lr_schedule = tf.keras.optimizers.schedules.deserialize(config) + ``` + """ + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="decay", + ) + + return serialization_lib.deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="decay", + ) diff --git a/keras/optimizers/schedules/learning_rate_schedule_test.py b/keras/optimizers/schedules/learning_rate_schedule_test.py index 4239da5894b4..e78709d9089a 100644 --- a/keras/optimizers/schedules/learning_rate_schedule_test.py +++ b/keras/optimizers/schedules/learning_rate_schedule_test.py @@ -16,433 +16,501 @@ import math +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent from keras.optimizers.schedules import learning_rate_schedule from keras.testing_infra import test_combinations -import numpy as np - -import tensorflow.compat.v2 as tf def _maybe_serialized(lr_decay, serialize_and_deserialize): - if serialize_and_deserialize: - serialized = learning_rate_schedule.serialize(lr_decay) - return learning_rate_schedule.deserialize(serialized) - else: - return lr_decay + if serialize_and_deserialize: + serialized = learning_rate_schedule.serialize(lr_decay) + return learning_rate_schedule.deserialize(serialized) + else: + return lr_decay -@test_combinations.generate(test_combinations.combine(serialize=[False, True], - mode=["graph", "eager"])) +@test_combinations.generate( + test_combinations.combine(serialize=[False, True], mode=["graph", "eager"]) +) class LRDecayTestV2(tf.test.TestCase, parameterized.TestCase): - - def testContinuous(self, serialize): - self.evaluate(tf.compat.v1.global_variables_initializer()) - step = 5 - decayed_lr = learning_rate_schedule.ExponentialDecay(0.05, 10, 0.96) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = .05 * 0.96**(5.0 / 10.0) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testStaircase(self, serialize): - if tf.executing_eagerly(): - step = tf.Variable(0) - self.evaluate(tf.compat.v1.global_variables_initializer()) - decayed_lr = learning_rate_schedule.ExponentialDecay( - .1, 3, 0.96, staircase=True) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - - # No change to learning rate due to staircase - expected = .1 - self.evaluate(step.assign(1)) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - expected = .1 - self.evaluate(step.assign(2)) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - # Decayed learning rate - expected = .1 * 0.96 ** (100 // 3) - self.evaluate(step.assign(100)) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testVariables(self, serialize): - # TODO(tanzheny, omalleyt): Fix test in eager mode. 
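Returning to the `serialize`/`deserialize` pair above: the two functions round-trip any built-in schedule. A minimal sketch using the public `tf.keras` API (values are arbitrary):

```python
import tensorflow as tf

# Minimal round-trip through serialize/deserialize (a sketch, assuming
# TF 2.x; not part of the patch).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.1, decay_steps=100000, decay_rate=0.96, staircase=True)
config = tf.keras.optimizers.schedules.serialize(lr_schedule)
restored = tf.keras.optimizers.schedules.deserialize(config)
assert float(restored(100)) == float(lr_schedule(100))
```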
- with tf.Graph().as_default(): - step = tf.Variable(1) - assign_1 = step.assign(1) - assign_2 = step.assign(2) - assign_100 = step.assign(100) - decayed_lr = learning_rate_schedule.ExponentialDecay( - .1, 3, 0.96, staircase=True) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - # No change to learning rate - self.evaluate(assign_1.op) - self.assertAllClose(self.evaluate(decayed_lr(step)), .1, 1e-6) - self.evaluate(assign_2.op) - self.assertAllClose(self.evaluate(decayed_lr(step)), .1, 1e-6) - # Decayed learning rate - self.evaluate(assign_100.op) - expected = .1 * 0.96**(100 // 3) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testPiecewiseConstant(self, serialize): - x = tf.Variable(-999) - decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( - [100, 110, 120], [1.0, 0.1, 0.01, 0.001]) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - - self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6) - self.evaluate(x.assign(100)) - self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6) - self.evaluate(x.assign(105)) - self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6) - self.evaluate(x.assign(110)) - self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6) - self.evaluate(x.assign(120)) - self.assertAllClose(self.evaluate(decayed_lr(x)), 0.01, 1e-6) - self.evaluate(x.assign(999)) - self.assertAllClose(self.evaluate(decayed_lr(x)), 0.001, 1e-6) - - def testPiecewiseFunction(self, serialize): - if not tf.executing_eagerly(): - self.skipTest("Run on eager mode only.") - - del serialize - v = tf.Variable(1.) - def loss_fn(): - return v * v - learning_rate = learning_rate_schedule.PiecewiseConstantDecay( - [1.], [1., 0.1]) - opt = gradient_descent.SGD(learning_rate=learning_rate) - - @tf.function - def minimize(): - with tf.GradientTape() as tape: - loss = loss_fn() - g = tape.gradient(loss, [v]) - opt.apply_gradients(list(zip(g, [v]))) - - minimize() - self.assertAllEqual(v.read_value(), -1.0) - - def testPiecewiseConstantEdgeCases(self, serialize): - # Test casting boundaries from int32 to int64. 
- x_int64 = tf.Variable(0, dtype=tf.int64) - boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7] - decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( - boundaries, values) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6) - self.evaluate(x_int64.assign(1)) - self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6) - self.evaluate(x_int64.assign(2)) - self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.5, 1e-6) - self.evaluate(x_int64.assign(3)) - self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.6, 1e-6) - self.evaluate(x_int64.assign(4)) - self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.7, 1e-6) + def testContinuous(self, serialize): + self.evaluate(tf.compat.v1.global_variables_initializer()) + step = 5 + decayed_lr = learning_rate_schedule.ExponentialDecay(0.05, 10, 0.96) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = 0.05 * 0.96 ** (5.0 / 10.0) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testStaircase(self, serialize): + if tf.executing_eagerly(): + step = tf.Variable(0) + self.evaluate(tf.compat.v1.global_variables_initializer()) + decayed_lr = learning_rate_schedule.ExponentialDecay( + 0.1, 3, 0.96, staircase=True + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + + # No change to learning rate due to staircase + expected = 0.1 + self.evaluate(step.assign(1)) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + expected = 0.1 + self.evaluate(step.assign(2)) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + # Decayed learning rate + expected = 0.1 * 0.96 ** (100 // 3) + self.evaluate(step.assign(100)) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testVariables(self, serialize): + # TODO(tanzheny, omalleyt): Fix test in eager mode. 
+ with tf.Graph().as_default(): + step = tf.Variable(1) + assign_1 = step.assign(1) + assign_2 = step.assign(2) + assign_100 = step.assign(100) + decayed_lr = learning_rate_schedule.ExponentialDecay( + 0.1, 3, 0.96, staircase=True + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + # No change to learning rate + self.evaluate(assign_1.op) + self.assertAllClose(self.evaluate(decayed_lr(step)), 0.1, 1e-6) + self.evaluate(assign_2.op) + self.assertAllClose(self.evaluate(decayed_lr(step)), 0.1, 1e-6) + # Decayed learning rate + self.evaluate(assign_100.op) + expected = 0.1 * 0.96 ** (100 // 3) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testPiecewiseConstant(self, serialize): + x = tf.Variable(-999) + decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( + [100, 110, 120], [1.0, 0.1, 0.01, 0.001] + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + + self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6) + self.evaluate(x.assign(100)) + self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6) + self.evaluate(x.assign(105)) + self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6) + self.evaluate(x.assign(110)) + self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6) + self.evaluate(x.assign(120)) + self.assertAllClose(self.evaluate(decayed_lr(x)), 0.01, 1e-6) + self.evaluate(x.assign(999)) + self.assertAllClose(self.evaluate(decayed_lr(x)), 0.001, 1e-6) + + def testPiecewiseFunction(self, serialize): + if not tf.executing_eagerly(): + self.skipTest("Run on eager mode only.") + + del serialize + v = tf.Variable(1.0) + + def loss_fn(): + return v * v + + learning_rate = learning_rate_schedule.PiecewiseConstantDecay( + [1.0], [1.0, 0.1] + ) + opt = gradient_descent.SGD(learning_rate=learning_rate) + + @tf.function + def minimize(): + with tf.GradientTape() as tape: + loss = loss_fn() + g = tape.gradient(loss, [v]) + opt.apply_gradients(list(zip(g, [v]))) + + minimize() + self.assertAllEqual(v.read_value(), -1.0) + + def testPiecewiseConstantEdgeCases(self, serialize): + # Test casting boundaries from int32 to int64. 
+ x_int64 = tf.Variable(0, dtype=tf.int64) + boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7] + decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( + boundaries, values + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6) + self.evaluate(x_int64.assign(1)) + self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6) + self.evaluate(x_int64.assign(2)) + self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.5, 1e-6) + self.evaluate(x_int64.assign(3)) + self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.6, 1e-6) + self.evaluate(x_int64.assign(4)) + self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.7, 1e-6) # @parameterized.named_parameters( # ("NotSerialized", False), # ("Serialized", True)) -@test_combinations.generate(test_combinations.combine(serialize=[False, True], - mode=["graph", "eager"])) +@test_combinations.generate( + test_combinations.combine(serialize=[False, True], mode=["graph", "eager"]) +) class LinearDecayTestV2(tf.test.TestCase, parameterized.TestCase): - - def testHalfWay(self, serialize): - step = 5 - lr = 0.05 - end_lr = 0.0 - decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = lr * 0.5 - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testEnd(self, serialize): - step = 10 - lr = 0.05 - end_lr = 0.001 - decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testHalfWayWithEnd(self, serialize): - step = 5 - lr = 0.05 - end_lr = 0.001 - decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = (lr + end_lr) * 0.5 - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testBeyondEnd(self, serialize): - step = 15 - lr = 0.05 - end_lr = 0.001 - decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testBeyondEndWithCycle(self, serialize): - step = 15 - lr = 0.05 - end_lr = 0.001 - decayed_lr = learning_rate_schedule.PolynomialDecay( - lr, 10, end_lr, cycle=True) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = (lr - end_lr) * 0.25 + end_lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + def testHalfWay(self, serialize): + step = 5 + lr = 0.05 + end_lr = 0.0 + decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = lr * 0.5 + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testEnd(self, serialize): + step = 10 + lr = 0.05 + end_lr = 0.001 + decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testHalfWayWithEnd(self, serialize): + step = 5 + lr = 0.05 + end_lr = 0.001 + decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = (lr + end_lr) * 0.5 + 
self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testBeyondEnd(self, serialize): + step = 15 + lr = 0.05 + end_lr = 0.001 + decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testBeyondEndWithCycle(self, serialize): + step = 15 + lr = 0.05 + end_lr = 0.001 + decayed_lr = learning_rate_schedule.PolynomialDecay( + lr, 10, end_lr, cycle=True + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = (lr - end_lr) * 0.25 + end_lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) # @parameterized.named_parameters( # ("NotSerialized", False), # ("Serialized", True)) -@test_combinations.generate(test_combinations.combine(serialize=[False, True], - mode=["graph", "eager"])) -class SqrtDecayTestV2(tf.test.TestCase, - parameterized.TestCase): - - def testHalfWay(self, serialize): - step = 5 - lr = 0.05 - end_lr = 0.0 - power = 0.5 - decayed_lr = learning_rate_schedule.PolynomialDecay( - lr, 10, end_lr, power=power) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = lr * 0.5**power - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testEnd(self, serialize): - step = 10 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = learning_rate_schedule.PolynomialDecay( - lr, 10, end_lr, power=power) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testHalfWayWithEnd(self, serialize): - step = 5 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = learning_rate_schedule.PolynomialDecay( - lr, 10, end_lr, power=power) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = (lr - end_lr) * 0.5**power + end_lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testBeyondEnd(self, serialize): - step = 15 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = learning_rate_schedule.PolynomialDecay( - lr, 10, end_lr, power=power) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = end_lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testBeyondEndWithCycle(self, serialize): - step = 15 - lr = 0.05 - end_lr = 0.001 - power = 0.5 - decayed_lr = learning_rate_schedule.PolynomialDecay( - lr, 10, end_lr, power=power, cycle=True) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = (lr - end_lr) * 0.25**power + end_lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) +@test_combinations.generate( + test_combinations.combine(serialize=[False, True], mode=["graph", "eager"]) +) +class SqrtDecayTestV2(tf.test.TestCase, parameterized.TestCase): + def testHalfWay(self, serialize): + step = 5 + lr = 0.05 + end_lr = 0.0 + power = 0.5 + decayed_lr = learning_rate_schedule.PolynomialDecay( + lr, 10, end_lr, power=power + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = lr * 0.5**power + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testEnd(self, serialize): + step = 10 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = learning_rate_schedule.PolynomialDecay( + lr, 10, end_lr, power=power + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def 
testHalfWayWithEnd(self, serialize): + step = 5 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = learning_rate_schedule.PolynomialDecay( + lr, 10, end_lr, power=power + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = (lr - end_lr) * 0.5**power + end_lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testBeyondEnd(self, serialize): + step = 15 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = learning_rate_schedule.PolynomialDecay( + lr, 10, end_lr, power=power + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = end_lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testBeyondEndWithCycle(self, serialize): + step = 15 + lr = 0.05 + end_lr = 0.001 + power = 0.5 + decayed_lr = learning_rate_schedule.PolynomialDecay( + lr, 10, end_lr, power=power, cycle=True + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = (lr - end_lr) * 0.25**power + end_lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) # @parameterized.named_parameters( # ("NotSerialized", False), # ("Serialized", True)) -@test_combinations.generate(test_combinations.combine(serialize=[False, True], - mode=["graph", "eager"])) -class PolynomialDecayTestV2(tf.test.TestCase, - parameterized.TestCase): - - def testBeginWithCycle(self, serialize): - lr = 0.001 - decay_steps = 10 - step = 0 - decayed_lr = learning_rate_schedule.PolynomialDecay( - lr, decay_steps, cycle=True) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = lr - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) +@test_combinations.generate( + test_combinations.combine(serialize=[False, True], mode=["graph", "eager"]) +) +class PolynomialDecayTestV2(tf.test.TestCase, parameterized.TestCase): + def testBeginWithCycle(self, serialize): + lr = 0.001 + decay_steps = 10 + step = 0 + decayed_lr = learning_rate_schedule.PolynomialDecay( + lr, decay_steps, cycle=True + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = lr + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) # @parameterized.named_parameters( # ("NotSerialized", False), # ("Serialized", True)) -@test_combinations.generate(test_combinations.combine(serialize=[False, True], - mode=["graph", "eager"])) +@test_combinations.generate( + test_combinations.combine(serialize=[False, True], mode=["graph", "eager"]) +) class InverseDecayTestV2(tf.test.TestCase, parameterized.TestCase): - - def testDecay(self, serialize): - initial_lr = 0.1 - k = 10 - decay_rate = 0.96 - step = tf.Variable(0) - decayed_lr = learning_rate_schedule.InverseTimeDecay(initial_lr, k, - decay_rate) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - for i in range(k + 1): - expected = initial_lr / (1 + i / k * decay_rate) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - self.evaluate(step.assign_add(1)) - - def testStaircase(self, serialize): - initial_lr = 0.1 - k = 10 - decay_rate = 0.96 - step = tf.Variable(0) - decayed_lr = learning_rate_schedule.InverseTimeDecay( - initial_lr, k, decay_rate, staircase=True) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - - self.evaluate(tf.compat.v1.global_variables_initializer()) - for i in range(k + 1): - expected = initial_lr / (1 + decay_rate * (i // k)) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - self.evaluate(step.assign_add(1)) - - 
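The polynomial-decay expectations in the tests above (including the `cycle=True` cases) all follow one closed form. A minimal sketch, assuming the stretch-to-the-next-multiple behavior for `cycle=True` that the tests imply; names are hypothetical:

```python
import math


def polynomial_decay(step, initial_lr, decay_steps, end_lr, power=1.0, cycle=False):
    if cycle:
        # Stretch decay_steps to the next multiple that contains `step`
        # (kept at 1x for step == 0, matching testBeginWithCycle).
        decay_steps = decay_steps * max(1.0, math.ceil(step / decay_steps))
    else:
        step = min(step, decay_steps)
    remaining = 1.0 - step / decay_steps
    return (initial_lr - end_lr) * remaining**power + end_lr


# Matches testHalfWayWithEnd, testBeyondEndWithCycle, and the sqrt variants.
assert math.isclose(polynomial_decay(5, 0.05, 10, 0.001), (0.05 + 0.001) * 0.5)
assert math.isclose(
    polynomial_decay(15, 0.05, 10, 0.001, cycle=True),
    (0.05 - 0.001) * 0.25 + 0.001,
)
assert math.isclose(
    polynomial_decay(5, 0.05, 10, 0.001, power=0.5),
    (0.05 - 0.001) * 0.5**0.5 + 0.001,
)
```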
-@test_combinations.generate(test_combinations.combine(serialize=[False, True], - mode=["graph", "eager"])) + def testDecay(self, serialize): + initial_lr = 0.1 + k = 10 + decay_rate = 0.96 + step = tf.Variable(0) + decayed_lr = learning_rate_schedule.InverseTimeDecay( + initial_lr, k, decay_rate + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + for i in range(k + 1): + expected = initial_lr / (1 + i / k * decay_rate) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + self.evaluate(step.assign_add(1)) + + def testStaircase(self, serialize): + initial_lr = 0.1 + k = 10 + decay_rate = 0.96 + step = tf.Variable(0) + decayed_lr = learning_rate_schedule.InverseTimeDecay( + initial_lr, k, decay_rate, staircase=True + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + + self.evaluate(tf.compat.v1.global_variables_initializer()) + for i in range(k + 1): + expected = initial_lr / (1 + decay_rate * (i // k)) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + self.evaluate(step.assign_add(1)) + + +@test_combinations.generate( + test_combinations.combine(serialize=[False, True], mode=["graph", "eager"]) +) class CosineDecayTestV2(tf.test.TestCase, parameterized.TestCase): - - def np_cosine_decay(self, step, decay_steps, alpha=0.0): - step = min(step, decay_steps) - completed_fraction = step / decay_steps - decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) - return (1.0 - alpha) * decay + alpha - - def testDecay(self, serialize): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecay(initial_lr, - num_training_steps) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay(step, num_training_steps) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testAlpha(self, serialize): - num_training_steps = 1000 - initial_lr = 1.0 - alpha = 0.1 - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecay(initial_lr, - num_training_steps, - alpha) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay(step, num_training_steps, alpha) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testFloat64InitLearningRate(self, serialize): - num_training_steps = 1000 - initial_lr = np.float64(1.0) - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecay(initial_lr, - num_training_steps) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay(step, num_training_steps) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - -@test_combinations.generate(test_combinations.combine(serialize=[False, True], - mode=["graph", "eager"])) -class CosineDecayRestartsTestV2(tf.test.TestCase, - parameterized.TestCase): - - def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0, - alpha=0.0): - fac = 1.0 - while step >= decay_steps: - step -= decay_steps - decay_steps *= t_mul - fac *= m_mul - - completed_fraction = step / decay_steps - decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) - return (1.0 - alpha) * decay + alpha - - def testDecay(self, serialize): - num_training_steps = 1000 - initial_lr = 1.0 - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecayRestarts( - initial_lr, num_training_steps) - decayed_lr = 
_maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay_restarts(step, num_training_steps) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testFloat64InitLearningRate(self, serialize): - num_training_steps = 1000 - initial_lr = np.float64(1.0) - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecayRestarts( - initial_lr, num_training_steps) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay_restarts(step, num_training_steps) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testAlpha(self, serialize): - num_training_steps = 1000 - initial_lr = 1.0 - alpha = 0.1 - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecayRestarts( - initial_lr, num_training_steps, alpha=alpha) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay_restarts( - step, num_training_steps, alpha=alpha) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testMMul(self, serialize): - num_training_steps = 1000 - initial_lr = 1.0 - m_mul = 0.9 - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecayRestarts( - initial_lr, num_training_steps, m_mul=m_mul) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay_restarts( - step, num_training_steps, m_mul=m_mul) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) - - def testTMul(self, serialize): - num_training_steps = 1000 - initial_lr = 1.0 - t_mul = 1.0 - for step in range(0, 1500, 250): - decayed_lr = learning_rate_schedule.CosineDecayRestarts( - initial_lr, num_training_steps, t_mul=t_mul) - decayed_lr = _maybe_serialized(decayed_lr, serialize) - expected = self.np_cosine_decay_restarts( - step, num_training_steps, t_mul=t_mul) - self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + def np_cosine_decay(self, step, decay_steps, alpha=0.0): + step = min(step, decay_steps) + completed_fraction = step / decay_steps + decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) + return (1.0 - alpha) * decay + alpha + + def testDecay(self, serialize): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + decayed_lr = learning_rate_schedule.CosineDecay( + initial_lr, num_training_steps + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay(step, num_training_steps) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def linear_warmup(self, step, warmup_steps, initial_lr, target_lr): + completed_fraction = step / warmup_steps + total_delta = target_lr - initial_lr + return initial_lr + completed_fraction * total_delta + + def testWarmup(self, serialize): + warmup_steps = 1500 + initial_lr = 0.0 + target_lr = 10.0 + for step in range(0, 1500, 250): + lr = learning_rate_schedule.CosineDecay( + initial_lr, + 0, + warmup_target=target_lr, + warmup_steps=warmup_steps, + ) + lr = _maybe_serialized(lr, serialize) + expected = self.linear_warmup( + step, warmup_steps, initial_lr, target_lr + ) + self.assertAllClose(self.evaluate(lr(step)), expected) + + def testAlpha(self, serialize): + num_training_steps = 1000 + initial_lr = 1.0 + alpha = 0.1 + for step in range(0, 1500, 250): + decayed_lr = learning_rate_schedule.CosineDecay( + initial_lr, num_training_steps, alpha + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay(step, 
num_training_steps, alpha) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testFloat64InitLearningRate(self, serialize): + num_training_steps = 1000 + initial_lr = np.float64(1.0) + for step in range(0, 1500, 250): + decayed_lr = learning_rate_schedule.CosineDecay( + initial_lr, num_training_steps + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay(step, num_training_steps) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testWarmupDecay(self, serialize): + warmup_steps = 2000 + decay_steps = 1000 + initial_lr = 0.0 + target_lr = 10.0 + for step in range(0, 3000, 250): + lr = learning_rate_schedule.CosineDecay( + initial_lr, + decay_steps, + warmup_target=target_lr, + warmup_steps=warmup_steps, + ) + lr = _maybe_serialized(lr, serialize) + if step < warmup_steps + 1: + expected = self.linear_warmup( + step, warmup_steps, initial_lr, target_lr + ) + else: + expected = target_lr * self.np_cosine_decay( + step - warmup_steps, decay_steps + ) + self.assertAllClose(self.evaluate(lr(step)), expected) + + +@test_combinations.generate( + test_combinations.combine(serialize=[False, True], mode=["graph", "eager"]) +) +class CosineDecayRestartsTestV2(tf.test.TestCase, parameterized.TestCase): + def np_cosine_decay_restarts( + self, step, decay_steps, t_mul=2.0, m_mul=1.0, alpha=0.0 + ): + fac = 1.0 + while step >= decay_steps: + step -= decay_steps + decay_steps *= t_mul + fac *= m_mul + + completed_fraction = step / decay_steps + decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) + return (1.0 - alpha) * decay + alpha + + def testDecay(self, serialize): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + decayed_lr = learning_rate_schedule.CosineDecayRestarts( + initial_lr, num_training_steps + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay_restarts(step, num_training_steps) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testFloat64InitLearningRate(self, serialize): + num_training_steps = 1000 + initial_lr = np.float64(1.0) + for step in range(0, 1500, 250): + decayed_lr = learning_rate_schedule.CosineDecayRestarts( + initial_lr, num_training_steps + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay_restarts(step, num_training_steps) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testAlpha(self, serialize): + num_training_steps = 1000 + initial_lr = 1.0 + alpha = 0.1 + for step in range(0, 1500, 250): + decayed_lr = learning_rate_schedule.CosineDecayRestarts( + initial_lr, num_training_steps, alpha=alpha + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay_restarts( + step, num_training_steps, alpha=alpha + ) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testMMul(self, serialize): + num_training_steps = 1000 + initial_lr = 1.0 + m_mul = 0.9 + for step in range(0, 1500, 250): + decayed_lr = learning_rate_schedule.CosineDecayRestarts( + initial_lr, num_training_steps, m_mul=m_mul + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay_restarts( + step, num_training_steps, m_mul=m_mul + ) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) + + def testTMul(self, serialize): + num_training_steps = 1000 + initial_lr = 1.0 + t_mul = 1.0 + for step in range(0, 1500, 250): + 
decayed_lr = learning_rate_schedule.CosineDecayRestarts( + initial_lr, num_training_steps, t_mul=t_mul + ) + decayed_lr = _maybe_serialized(decayed_lr, serialize) + expected = self.np_cosine_decay_restarts( + step, num_training_steps, t_mul=t_mul + ) + self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/optimizers/sgd.py b/keras/optimizers/sgd.py new file mode 100644 index 000000000000..c6f83e1eefa4 --- /dev/null +++ b/keras/optimizers/sgd.py @@ -0,0 +1,207 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SGD optimizer implementation.""" + +import tensorflow.compat.v2 as tf + +from keras.optimizers import optimizer +from keras.saving.object_registration import register_keras_serializable + +# isort: off +from tensorflow.python.util.tf_export import keras_export + + +@register_keras_serializable() +@keras_export( + "keras.optimizers.experimental.SGD", + "keras.optimizers.SGD", + "keras.dtensor.experimental.optimizers.SGD", + v1=[], +) +class SGD(optimizer.Optimizer): + r"""Gradient descent (with momentum) optimizer. + + Update rule for parameter `w` with gradient `g` when `momentum` is 0: + + ```python + w = w - learning_rate * g + ``` + + Update rule when `momentum` is larger than 0: + + ```python + velocity = momentum * velocity - learning_rate * g + w = w + velocity + ``` + + When `nesterov=True`, this rule becomes: + + ```python + velocity = momentum * velocity - learning_rate * g + w = w + momentum * velocity - learning_rate * g + ``` + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to 0.01. + momentum: float hyperparameter >= 0 that accelerates gradient descent in + the relevant direction and dampens oscillations. + Defaults to 0, i.e., vanilla gradient descent. + nesterov: boolean. Whether to apply Nesterov momentum. + Defaults to `False`. 
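The momentum rule stated in the docstring above can be hand-checked; the doctest values that follow (a first step of 0.1, then 0.18) fall out of it directly. A plain-Python sketch, not the optimizer code:

```python
# Hand-check of the momentum update rule stated above, with the same
# numbers as the docstring example: lr=0.1, momentum=0.9, loss = w**2 / 2.
lr, momentum = 0.1, 0.9
w, velocity = 1.0, 0.0

g = w  # d(loss)/dw = w
velocity = momentum * velocity - lr * g  # -0.1
w = w + velocity  # 0.9 -> first step size is 0.1

g = w
velocity = momentum * velocity - lr * g  # -0.09 - 0.09 = -0.18
w = w + velocity  # 0.72 -> second step size is 0.18, boosted by momentum
```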
+ {{base_optimizer_keyword_args}} + + Usage: + + >>> opt = tf.keras.optimizers.SGD(learning_rate=0.1) + >>> var = tf.Variable(1.0) + >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var) = var + >>> opt.minimize(loss, [var]) + >>> # Step is `- learning_rate * grad` + >>> var.numpy() + 0.9 + + >>> opt = tf.keras.optimizers.SGD(0.1, momentum=0.9) + >>> var = tf.Variable(1.0) + >>> val0 = var.value() + >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var) = var + >>> # First step is `- learning_rate * grad` + >>> opt.minimize(loss, [var]) + >>> val1 = var.value() + >>> (val0 - val1).numpy() + 0.1 + >>> # On later steps, step-size increases because of momentum + >>> opt.minimize(loss, [var]) + >>> val2 = var.value() + >>> (val1 - val2).numpy() + 0.18 + + Reference: + - For `nesterov=True`, see [Sutskever et al., 2013]( + http://proceedings.mlr.press/v28/sutskever13.pdf). + """ + + def __init__( + self, + learning_rate=0.01, + momentum=0.0, + nesterov=False, + weight_decay=None, + clipnorm=None, + clipvalue=None, + global_clipnorm=None, + use_ema=False, + ema_momentum=0.99, + ema_overwrite_frequency=None, + jit_compile=True, + name="SGD", + **kwargs + ): + super().__init__( + name=name, + weight_decay=weight_decay, + clipnorm=clipnorm, + clipvalue=clipvalue, + global_clipnorm=global_clipnorm, + use_ema=use_ema, + ema_momentum=ema_momentum, + ema_overwrite_frequency=ema_overwrite_frequency, + jit_compile=jit_compile, + **kwargs + ) + self._learning_rate = self._build_learning_rate(learning_rate) + self.momentum = momentum + self.nesterov = nesterov + if isinstance(momentum, (int, float)) and ( + momentum < 0 or momentum > 1 + ): + raise ValueError("`momentum` must be in the range [0, 1].") + + def build(self, var_list): + """Initialize optimizer variables. + + SGD optimizer has one variable `momentums`, only set if `self.momentum` + is not 0. + + Args: + var_list: list of model variables to build SGD variables on. + """ + super().build(var_list) + if hasattr(self, "_built") and self._built: + return + self.momentums = [] + for var in var_list: + self.momentums.append( + self.add_variable_from_reference( + model_variable=var, variable_name="m" + ) + ) + self._built = True + + def update_step(self, gradient, variable): + """Update step given gradient and the associated model variable.""" + lr = tf.cast(self.learning_rate, variable.dtype) + m = None + var_key = self._var_key(variable) + momentum = tf.cast(self.momentum, variable.dtype) + m = self.momentums[self._index_dict[var_key]] + + # TODO(b/204321487): Add nesterov acceleration. + if isinstance(gradient, tf.IndexedSlices): + # Sparse gradients. 
+ add_value = tf.IndexedSlices( + -gradient.values * lr, gradient.indices + ) + if m is not None: + m.assign(m * momentum) + m.scatter_add(add_value) + if self.nesterov: + variable.scatter_add(add_value) + variable.assign_add(m * momentum) + else: + variable.assign_add(m) + else: + variable.scatter_add(add_value) + else: + # Dense gradients + if m is not None: + m.assign(-gradient * lr + m * momentum) + if self.nesterov: + variable.assign_add(-gradient * lr + m * momentum) + else: + variable.assign_add(m) + else: + variable.assign_add(-gradient * lr) + + def get_config(self): + config = super().get_config() + + config.update( + { + "learning_rate": self._serialize_hyperparameter( + self._learning_rate + ), + "momentum": self.momentum, + "nesterov": self.nesterov, + } + ) + return config + + +SGD.__doc__ = SGD.__doc__.replace( + "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args +) diff --git a/keras/optimizers/utils.py b/keras/optimizers/utils.py new file mode 100644 index 000000000000..720ed64fd0a3 --- /dev/null +++ b/keras/optimizers/utils.py @@ -0,0 +1,177 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optimizer utilities.""" + +import tensorflow.compat.v2 as tf + +# isort: off +from tensorflow.python.platform import tf_logging as logging + + +def all_reduce_sum_gradients(grads_and_vars): + """Returns all-reduced gradients aggregated via summation. + + Args: + grads_and_vars: List of (gradient, variable) pairs. + + Returns: + List of (gradient, variable) pairs where gradients have been all-reduced. 
+ """ + grads_and_vars = list(grads_and_vars) + filtered_grads_and_vars = filter_empty_gradients(grads_and_vars) + if filtered_grads_and_vars: + if tf.__internal__.distribute.strategy_supports_no_merge_call(): + grads = [pair[0] for pair in filtered_grads_and_vars] + reduced = tf.distribute.get_replica_context().all_reduce( + tf.distribute.ReduceOp.SUM, grads + ) + else: + # TODO(b/183257003): Remove this branch + reduced = tf.distribute.get_replica_context().merge_call( + _all_reduce_sum_fn, args=(filtered_grads_and_vars,) + ) + else: + reduced = [] + # Copy 'reduced' but add None gradients back in + reduced_with_nones = [] + reduced_pos = 0 + for g, v in grads_and_vars: + if g is None: + reduced_with_nones.append((None, v)) + else: + reduced_with_nones.append((reduced[reduced_pos], v)) + reduced_pos += 1 + assert reduced_pos == len(reduced), "Failed to add all gradients" + return reduced_with_nones + + +def filter_empty_gradients(grads_and_vars): + """Filter out `(grad, var)` pairs that have a gradient equal to `None`.""" + grads_and_vars = tuple(grads_and_vars) + if not grads_and_vars: + return grads_and_vars + + filtered = [] + vars_with_empty_grads = [] + for grad, var in grads_and_vars: + if grad is None: + vars_with_empty_grads.append(var) + else: + filtered.append((grad, var)) + filtered = tuple(filtered) + + if not filtered: + variable = ([v.name for _, v in grads_and_vars],) + raise ValueError( + f"No gradients provided for any variable: {variable}. " + f"Provided `grads_and_vars` is {grads_and_vars}." + ) + if vars_with_empty_grads: + logging.warning( + "Gradients do not exist for variables %s when minimizing the " + "loss. If you're using `model.compile()`, did you forget to " + "provide a `loss` argument?", + ([v.name for v in vars_with_empty_grads]), + ) + return filtered + + +def make_gradient_clipnorm_fn(clipnorm): + """Creates a gradient transformation function for clipping by norm.""" + if clipnorm is None: + return lambda grads_and_vars: grads_and_vars + + def gradient_clipnorm_fn(grads_and_vars): + + if isinstance( + tf.distribute.get_strategy(), + ( + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + raise ValueError( + "`clipnorm` is not supported with `CentralStorageStrategy`. " + f"The strategy used is {tf.distribute.get_strategy()}." + ) + + clipped_grads_and_vars = [ + (tf.clip_by_norm(g, clipnorm), v) for g, v in grads_and_vars + ] + return clipped_grads_and_vars + + return gradient_clipnorm_fn + + +def make_global_gradient_clipnorm_fn(clipnorm): + """Creates a gradient transformation function for clipping by global norm.""" + if clipnorm is None: + return lambda grads_and_vars: grads_and_vars + + def gradient_clipnorm_fn(grads_and_vars): + + if isinstance( + tf.distribute.get_strategy(), + ( + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + raise ValueError( + "`global_clipnorm` is not supported with " + "`CentralStorageStrategy`. " + f"The strategy used is {tf.distribute.get_strategy()}." 
+ ) + + grads, variables = zip(*grads_and_vars) + clipped_grads, _ = tf.clip_by_global_norm(grads, clipnorm) + clipped_grads_and_vars = list(zip(clipped_grads, variables)) + return clipped_grads_and_vars + + return gradient_clipnorm_fn + + +def make_gradient_clipvalue_fn(clipvalue): + """Creates a gradient transformation function for clipping by value.""" + if clipvalue is None: + return lambda grads_and_vars: grads_and_vars + + def gradient_clipvalue_fn(grads_and_vars): + + if isinstance( + tf.distribute.get_strategy(), + ( + tf.distribute.experimental.CentralStorageStrategy, + tf.compat.v1.distribute.experimental.CentralStorageStrategy, + ), + ): + raise ValueError( + "`clipvalue` is not supported with `CentralStorageStrategy`. " + f"The strategy used is {tf.distribute.get_strategy()}." + ) + + clipped_grads_and_vars = [ + (tf.clip_by_value(g, -clipvalue, clipvalue), v) + for g, v in grads_and_vars + ] + return clipped_grads_and_vars + + return gradient_clipvalue_fn + + +def _all_reduce_sum_fn(distribution, grads_and_vars): + return distribution.extended.batch_reduce_to( + tf.distribute.ReduceOp.SUM, grads_and_vars + ) diff --git a/keras/premade_models/BUILD b/keras/premade_models/BUILD index 00286775da63..3441331df273 100644 --- a/keras/premade_models/BUILD +++ b/keras/premade_models/BUILD @@ -1,8 +1,11 @@ +# Placeholder: load unaliased py_library + # Description: # Contains the Keras Premade Models (internal TensorFlow version). load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], diff --git a/keras/premade_models/linear.py b/keras/premade_models/linear.py index a2518bf0d08c..e24236166955 100644 --- a/keras/premade_models/linear.py +++ b/keras/premade_models/linear.py @@ -15,6 +15,7 @@ """Built-in linear model classes.""" import tensorflow.compat.v2 as tf + from keras import activations from keras import initializers from keras import regularizers @@ -22,179 +23,196 @@ from keras.engine import input_spec from keras.engine import training from keras.layers import core + +# isort: off from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import keras_export @keras_export( - 'keras.experimental.LinearModel', - v1=['keras.experimental.LinearModel', 'keras.models.LinearModel']) -@deprecation.deprecated_endpoints('keras.experimental.LinearModel') + "keras.experimental.LinearModel", + v1=["keras.experimental.LinearModel", "keras.models.LinearModel"], +) +@deprecation.deprecated_endpoints("keras.experimental.LinearModel") class LinearModel(training.Model): - r"""Linear Model for regression and classification problems. - - This model approximates the following function: - $$y = \beta + \sum_{i=1}^{N} w_{i} * x_{i}$$ - where $$\beta$$ is the bias and $$w_{i}$$ is the weight for each feature. 
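To make the weighted-sum formula above concrete, here is a hypothetical end-to-end fit; the model should approximately recover the coefficients 0.3 and 0.2. This assumes only the `tf.keras.experimental.LinearModel` export declared above:

```python
import numpy as np
import tensorflow as tf

# Hypothetical toy regression: y = 0.3 * x0 + 0.2 * x1, zero bias.
x = np.random.uniform(-5.0, 5.0, size=(256, 2)).astype("float32")
y = 0.3 * x[:, 0] + 0.2 * x[:, 1]

model = tf.keras.experimental.LinearModel()
model.compile(optimizer="sgd", loss="mse")
model.fit(x, y, epochs=10, verbose=0)
# model.dense_layers[0].kernel should approach [[0.3], [0.2]].
```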
- - Example: - - ```python - model = LinearModel() - model.compile(optimizer='sgd', loss='mse') - model.fit(x, y, epochs=epochs) - ``` - - This model accepts sparse float inputs as well: - - Example: - ```python - model = LinearModel() - opt = tf.keras.optimizers.Adam() - loss_fn = tf.keras.losses.MeanSquaredError() - with tf.GradientTape() as tape: - output = model(sparse_input) - loss = tf.reduce_mean(loss_fn(target, output)) - grads = tape.gradient(loss, model.weights) - opt.apply_gradients(zip(grads, model.weights)) - ``` - - """ - - def __init__(self, - units=1, - activation=None, - use_bias=True, - kernel_initializer='zeros', - bias_initializer='zeros', - kernel_regularizer=None, - bias_regularizer=None, - **kwargs): - """Create a Linear Model. - - Args: - units: Positive integer, output dimension without the batch size. - activation: Activation function to use. - If you don't specify anything, no activation is applied. - use_bias: whether to calculate the bias/intercept for this model. If set - to False, no bias/intercept will be used in calculations, e.g., the data - is already centered. - kernel_initializer: Initializer for the `kernel` weights matrices. - bias_initializer: Initializer for the bias vector. - kernel_regularizer: regularizer for kernel vectors. - bias_regularizer: regularizer for bias vector. - **kwargs: The keyword arguments that are passed on to BaseLayer.__init__. + r"""Linear Model for regression and classification problems. + + This model approximates the following function: + $$y = \beta + \sum_{i=1}^{N} w_{i} * x_{i}$$ + where $$\beta$$ is the bias and $$w_{i}$$ is the weight for each feature. + + Example: + + ```python + model = LinearModel() + model.compile(optimizer='sgd', loss='mse') + model.fit(x, y, epochs=epochs) + ``` + + This model accepts sparse float inputs as well: + + Example: + ```python + model = LinearModel() + opt = tf.keras.optimizers.Adam() + loss_fn = tf.keras.losses.MeanSquaredError() + with tf.GradientTape() as tape: + output = model(sparse_input) + loss = tf.reduce_mean(loss_fn(target, output)) + grads = tape.gradient(loss, model.weights) + opt.apply_gradients(zip(grads, model.weights)) + ``` + """ - self.units = units - self.activation = activations.get(activation) - self.use_bias = use_bias - self.kernel_initializer = initializers.get(kernel_initializer) - self.bias_initializer = initializers.get(bias_initializer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - super().__init__(**kwargs) - base_layer.keras_premade_model_gauge.get_cell('Linear').set(True) - - def build(self, input_shape): - if isinstance(input_shape, dict): - names = sorted(list(input_shape.keys())) - self.input_specs = [] - self.dense_layers = [] - for name in names: - shape = input_shape[name] - layer = core.Dense( - units=self.units, - use_bias=False, - kernel_initializer=self.kernel_initializer, - kernel_regularizer=self.kernel_regularizer, - name=name) - layer.build(shape) - self.input_specs.append( - input_spec.InputSpec(shape=shape, name=name)) - self.dense_layers.append(layer) - elif isinstance(input_shape, (tuple, list)) and all( - isinstance(shape, tf.TensorShape) for shape in input_shape): - self.dense_layers = [] - for shape in input_shape: - layer = core.Dense( - units=self.units, - use_bias=False, - kernel_initializer=self.kernel_initializer, - kernel_regularizer=self.kernel_regularizer) - layer.build(shape) - self.dense_layers.append(layer) - else: - # input_shape can be a 
single TensorShape or a tuple of ints. - layer = core.Dense( - units=self.units, - use_bias=False, - kernel_initializer=self.kernel_initializer, - kernel_regularizer=self.kernel_regularizer) - layer.build(input_shape) - self.dense_layers = [layer] - - if self.use_bias: - self.bias = self.add_weight( - 'bias', - shape=self.units, - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - dtype=self.dtype, - trainable=True) - else: - self.bias = None - self.built = True - - def call(self, inputs): - result = None - if isinstance(inputs, dict): - names = [layer.name for layer in self.dense_layers] - different_keys = set(names) - set(inputs.keys()) - if different_keys: - raise ValueError( - 'The `inputs` dictionary does not match ' - 'the structure expected by the model.' - f'\n\tExpected keys: {set(names)}' - f'\n\tReceived keys: {set(inputs.keys())}' - f'\n\tMissing keys: {different_keys}') - inputs = [inputs[name] for name in names] - for inp, layer in zip(inputs, self.dense_layers): - output = layer(inp) - if result is None: - result = output + def __init__( + self, + units=1, + activation=None, + use_bias=True, + kernel_initializer="zeros", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + **kwargs, + ): + """Create a Linear Model. + + Args: + units: Positive integer, output dimension without the batch size. + activation: Activation function to use. + If you don't specify anything, no activation is applied. + use_bias: whether to calculate the bias/intercept for this model. If + set to False, no bias/intercept will be used in calculations, e.g., + the data is already centered. + kernel_initializer: Initializer for the `kernel` weights matrices. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: regularizer for kernel vectors. + bias_regularizer: regularizer for bias vector. + **kwargs: The keyword arguments that are passed on to + BaseLayer.__init__. + """ + + self.units = units + self.activation = activations.get(activation) + self.use_bias = use_bias + self.kernel_initializer = initializers.get(kernel_initializer) + self.bias_initializer = initializers.get(bias_initializer) + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + super().__init__(**kwargs) + base_layer.keras_premade_model_gauge.get_cell("Linear").set(True) + + def build(self, input_shape): + if isinstance(input_shape, dict): + names = sorted(list(input_shape.keys())) + self.input_specs = [] + self.dense_layers = [] + for name in names: + shape = input_shape[name] + layer = core.Dense( + units=self.units, + use_bias=False, + kernel_initializer=self.kernel_initializer, + kernel_regularizer=self.kernel_regularizer, + name=name, + ) + layer.build(shape) + self.input_specs.append( + input_spec.InputSpec(shape=shape, name=name) + ) + self.dense_layers.append(layer) + elif isinstance(input_shape, (tuple, list)) and all( + isinstance(shape, tf.TensorShape) for shape in input_shape + ): + self.dense_layers = [] + for shape in input_shape: + layer = core.Dense( + units=self.units, + use_bias=False, + kernel_initializer=self.kernel_initializer, + kernel_regularizer=self.kernel_regularizer, + ) + layer.build(shape) + self.dense_layers.append(layer) + else: + # input_shape can be a single TensorShape or a tuple of ints. 
+ layer = core.Dense( + units=self.units, + use_bias=False, + kernel_initializer=self.kernel_initializer, + kernel_regularizer=self.kernel_regularizer, + ) + layer.build(input_shape) + self.dense_layers = [layer] + + if self.use_bias: + self.bias = self.add_weight( + "bias", + shape=self.units, + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + dtype=self.dtype, + trainable=True, + ) else: - result += output - elif isinstance(inputs, (tuple, list)): - for inp, layer in zip(inputs, self.dense_layers): - output = layer(inp) - if result is None: - result = output + self.bias = None + self.built = True + + def call(self, inputs): + result = None + if isinstance(inputs, dict): + names = [layer.name for layer in self.dense_layers] + different_keys = set(names) - set(inputs.keys()) + if different_keys: + raise ValueError( + "The `inputs` dictionary does not match " + "the structure expected by the model." + f"\n\tExpected keys: {set(names)}" + f"\n\tReceived keys: {set(inputs.keys())}" + f"\n\tMissing keys: {different_keys}" + ) + inputs = [inputs[name] for name in names] + for inp, layer in zip(inputs, self.dense_layers): + output = layer(inp) + if result is None: + result = output + else: + result += output + elif isinstance(inputs, (tuple, list)): + for inp, layer in zip(inputs, self.dense_layers): + output = layer(inp) + if result is None: + result = output + else: + result += output else: - result += output - else: - result = self.dense_layers[0](inputs) - - if self.use_bias: - result = tf.nn.bias_add(result, self.bias) - if self.activation is not None: - return self.activation(result) # pylint: disable=not-callable - return result - - def get_config(self): - config = { - 'units': self.units, - 'activation': activations.serialize(self.activation), - 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - } - base_config = base_layer.Layer.get_config(self) - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - del custom_objects - return cls(**config) + result = self.dense_layers[0](inputs) + + if self.use_bias: + result = tf.nn.bias_add(result, self.bias) + if self.activation is not None: + return self.activation(result) + return result + + def get_config(self): + config = { + "units": self.units, + "activation": activations.serialize(self.activation), + "use_bias": self.use_bias, + "kernel_initializer": initializers.serialize( + self.kernel_initializer + ), + "bias_initializer": initializers.serialize(self.bias_initializer), + "kernel_regularizer": regularizers.serialize( + self.kernel_regularizer + ), + "bias_regularizer": regularizers.serialize(self.bias_regularizer), + } + base_config = base_layer.Layer.get_config(self) + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + del custom_objects + return cls(**config) diff --git a/keras/premade_models/linear_test.py b/keras/premade_models/linear_test.py index c31dda2e40b5..9d7d83b76b2a 100644 --- a/keras/premade_models/linear_test.py +++ b/keras/premade_models/linear_test.py @@ -14,160 +14,164 @@ # ============================================================================== """Tests for Keras Premade 
Linear models.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np from keras import backend -from keras.testing_infra import test_combinations from keras import losses from keras.engine import input_layer from keras.engine import sequential from keras.engine import training from keras.feature_column import dense_features_v2 from keras.layers import core -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent from keras.premade_models import linear +from keras.testing_infra import test_combinations @test_combinations.run_all_keras_modes(always_skip_v1=True) class LinearModelTest(test_combinations.TestCase): - - def test_linear_model_with_single_input(self): - model = linear.LinearModel() - inp = np.random.uniform(low=-5., high=5., size=(64, 2)) - output = .3 * inp[:, 0] + .2 * inp[:, 1] - model.compile('sgd', 'mse', []) - model.fit(inp, output, epochs=5) - self.assertTrue(model.built) - - def test_linear_model_with_list_input(self): - model = linear.LinearModel() - input_a = np.random.uniform(low=-5., high=5., size=(64, 1)) - input_b = np.random.uniform(low=-5., high=5., size=(64, 1)) - output = .3 * input_a + .2 * input_b - model.compile('sgd', 'mse', []) - model.fit([input_a, input_b], output, epochs=5) - - def test_linear_model_with_mismatched_dict_inputs(self): - model = linear.LinearModel() - input_a = np.random.uniform(low=-5., high=5., size=(64, 1)) - input_b = np.random.uniform(low=-5., high=5., size=(64, 1)) - output = .3 * input_a + .2 * input_b - model.compile('sgd', 'mse', []) - model.build({'a': tf.TensorShape([None, 1]), - 'b': tf.TensorShape([None, 1])}) - with self.assertRaisesRegex(ValueError, 'Missing keys'): - model.fit({'c': input_a, 'b': input_b}, output, epochs=5) - - def test_linear_model_with_dict_input(self): - model = linear.LinearModel() - input_a = np.random.uniform(low=-5., high=5., size=(64, 1)) - input_b = np.random.uniform(low=-5., high=5., size=(64, 1)) - output = .3 * input_a + .2 * input_b - model.compile('sgd', 'mse', []) - model.fit({'a': input_a, 'b': input_b}, output, epochs=5) - - def test_linear_model_as_layer(self): - input_a = input_layer.Input(shape=(1,), name='a') - output_a = linear.LinearModel()(input_a) - input_b = input_layer.Input(shape=(1,), name='b') - output_b = core.Dense(units=1)(input_b) - output = output_a + output_b - model = training.Model(inputs=[input_a, input_b], outputs=[output]) - input_a_np = np.random.uniform(low=-5., high=5., size=(64, 1)) - input_b_np = np.random.uniform(low=-5., high=5., size=(64, 1)) - output_np = .3 * input_a_np + .2 * input_b_np - model.compile('sgd', 'mse', []) - model.fit([input_a_np, input_b_np], output_np, epochs=5) - - def test_linear_model_with_sparse_input(self): - indices = tf.constant([[0, 0], [0, 2], [1, 0], [1, 1]], - dtype=tf.int64) - values = tf.constant([.4, .6, .8, .5]) - shape = tf.constant([2, 3], dtype=tf.int64) - model = linear.LinearModel() - inp = tf.SparseTensor(indices, values, shape) - output = model(inp) - self.evaluate(tf.compat.v1.global_variables_initializer()) - if tf.executing_eagerly(): - weights = model.get_weights() - weights[0] = np.ones((3, 1)) - model.set_weights(weights) - output = model(inp) - self.assertAllClose([[1.], [1.3]], self.evaluate(output)) - - def test_linear_model_with_sparse_input_and_custom_training(self): - batch_size = 64 - indices = [] - values = [] - target = np.zeros((batch_size, 1)) - for i in range(64): - rand_int = np.random.randint(3) - if rand_int == 0: - 
indices.append((i, 0)) - val = np.random.uniform(low=-5., high=5.) - values.append(val) - target[i] = 0.3 * val - elif rand_int == 1: - indices.append((i, 1)) - val = np.random.uniform(low=-5., high=5.) - values.append(val) - target[i] = 0.2 * val - else: - indices.append((i, 0)) - indices.append((i, 1)) - val_1 = np.random.uniform(low=-5., high=5.) - val_2 = np.random.uniform(low=-5., high=5.) - values.append(val_1) - values.append(val_2) - target[i] = 0.3 * val_1 + 0.2 * val_2 - - indices = np.asarray(indices) - values = np.asarray(values) - shape = tf.constant([batch_size, 2], dtype=tf.int64) - inp = tf.SparseTensor(indices, values, shape) - model = linear.LinearModel(use_bias=False) - opt = gradient_descent.SGD() - for _ in range(20): - with tf.GradientTape() as t: + def test_linear_model_with_single_input(self): + model = linear.LinearModel() + inp = np.random.uniform(low=-5.0, high=5.0, size=(64, 2)) + output = 0.3 * inp[:, 0] + 0.2 * inp[:, 1] + model.compile("sgd", "mse", []) + model.fit(inp, output, epochs=5) + self.assertTrue(model.built) + + def test_linear_model_with_list_input(self): + model = linear.LinearModel() + input_a = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + input_b = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + output = 0.3 * input_a + 0.2 * input_b + model.compile("sgd", "mse", []) + model.fit([input_a, input_b], output, epochs=5) + + def test_linear_model_with_mismatched_dict_inputs(self): + model = linear.LinearModel() + input_a = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + input_b = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + output = 0.3 * input_a + 0.2 * input_b + model.compile("sgd", "mse", []) + model.build( + {"a": tf.TensorShape([None, 1]), "b": tf.TensorShape([None, 1])} + ) + with self.assertRaisesRegex(ValueError, "Missing keys"): + model.fit({"c": input_a, "b": input_b}, output, epochs=5) + + def test_linear_model_with_dict_input(self): + model = linear.LinearModel() + input_a = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + input_b = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + output = 0.3 * input_a + 0.2 * input_b + model.compile("sgd", "mse", []) + model.fit({"a": input_a, "b": input_b}, output, epochs=5) + + def test_linear_model_as_layer(self): + input_a = input_layer.Input(shape=(1,), name="a") + output_a = linear.LinearModel()(input_a) + input_b = input_layer.Input(shape=(1,), name="b") + output_b = core.Dense(units=1)(input_b) + output = output_a + output_b + model = training.Model(inputs=[input_a, input_b], outputs=[output]) + input_a_np = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + input_b_np = np.random.uniform(low=-5.0, high=5.0, size=(64, 1)) + output_np = 0.3 * input_a_np + 0.2 * input_b_np + model.compile("sgd", "mse", []) + model.fit([input_a_np, input_b_np], output_np, epochs=5) + + def test_linear_model_with_sparse_input(self): + indices = tf.constant([[0, 0], [0, 2], [1, 0], [1, 1]], dtype=tf.int64) + values = tf.constant([0.4, 0.6, 0.8, 0.5]) + shape = tf.constant([2, 3], dtype=tf.int64) + model = linear.LinearModel() + inp = tf.SparseTensor(indices, values, shape) output = model(inp) - loss = backend.mean(losses.mean_squared_error(target, output)) - grads = t.gradient(loss, model.trainable_variables) - grads_and_vars = zip(grads, model.trainable_variables) - opt.apply_gradients(grads_and_vars) - - # This test is an example for a regression on categorical inputs, i.e., - # the output is 0.4, 0.6, 0.9 when input is 'alpha', 'beta', 'gamma' - # separately. 
- def test_linear_model_with_feature_column(self): - vocab_list = ['alpha', 'beta', 'gamma'] - vocab_val = [0.4, 0.6, 0.9] - data = np.random.choice(vocab_list, size=256) - y = np.zeros_like(data, dtype=np.float32) - for vocab, val in zip(vocab_list, vocab_val): - indices = np.where(data == vocab) - y[indices] = val + np.random.uniform( - low=-0.01, high=0.01, size=indices[0].shape) - cat_column = tf.feature_column.categorical_column_with_vocabulary_list( - key='symbol', vocabulary_list=vocab_list) - ind_column = tf.feature_column.indicator_column(cat_column) - dense_feature_layer = dense_features_v2.DenseFeatures([ind_column]) - linear_model = linear.LinearModel( - use_bias=False, kernel_initializer='zeros') - combined = sequential.Sequential([dense_feature_layer, linear_model]) - opt = gradient_descent.SGD(learning_rate=0.1) - combined.compile(opt, 'mse', []) - combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10) - self.assertAllClose([[0.4], [0.6], [0.9]], - combined.layers[1].dense_layers[0].kernel.numpy(), - atol=0.01) - - def test_config(self): - linear_model = linear.LinearModel(units=3, use_bias=True) - config = linear_model.get_config() - cloned_linear_model = linear.LinearModel.from_config(config) - self.assertEqual(linear_model.units, cloned_linear_model.units) - - -if __name__ == '__main__': - tf.test.main() + self.evaluate(tf.compat.v1.global_variables_initializer()) + if tf.executing_eagerly(): + weights = model.get_weights() + weights[0] = np.ones((3, 1)) + model.set_weights(weights) + output = model(inp) + self.assertAllClose([[1.0], [1.3]], self.evaluate(output)) + + def test_linear_model_with_sparse_input_and_custom_training(self): + batch_size = 64 + indices = [] + values = [] + target = np.zeros((batch_size, 1)) + for i in range(64): + rand_int = np.random.randint(3) + if rand_int == 0: + indices.append((i, 0)) + val = np.random.uniform(low=-5.0, high=5.0) + values.append(val) + target[i] = 0.3 * val + elif rand_int == 1: + indices.append((i, 1)) + val = np.random.uniform(low=-5.0, high=5.0) + values.append(val) + target[i] = 0.2 * val + else: + indices.append((i, 0)) + indices.append((i, 1)) + val_1 = np.random.uniform(low=-5.0, high=5.0) + val_2 = np.random.uniform(low=-5.0, high=5.0) + values.append(val_1) + values.append(val_2) + target[i] = 0.3 * val_1 + 0.2 * val_2 + + indices = np.asarray(indices) + values = np.asarray(values) + shape = tf.constant([batch_size, 2], dtype=tf.int64) + inp = tf.SparseTensor(indices, values, shape) + model = linear.LinearModel(use_bias=False) + opt = gradient_descent.SGD() + for _ in range(20): + with tf.GradientTape() as t: + output = model(inp) + loss = backend.mean(losses.mean_squared_error(target, output)) + grads = t.gradient(loss, model.trainable_variables) + grads_and_vars = zip(grads, model.trainable_variables) + opt.apply_gradients(grads_and_vars) + + # This test is an example for a regression on categorical inputs, i.e., + # the output is 0.4, 0.6, 0.9 when input is 'alpha', 'beta', 'gamma' + # separately. 
+ def test_linear_model_with_feature_column(self): + vocab_list = ["alpha", "beta", "gamma"] + vocab_val = [0.4, 0.6, 0.9] + data = np.random.choice(vocab_list, size=256) + y = np.zeros_like(data, dtype=np.float32) + for vocab, val in zip(vocab_list, vocab_val): + indices = np.where(data == vocab) + y[indices] = val + np.random.uniform( + low=-0.01, high=0.01, size=indices[0].shape + ) + cat_column = tf.feature_column.categorical_column_with_vocabulary_list( + key="symbol", vocabulary_list=vocab_list + ) + ind_column = tf.feature_column.indicator_column(cat_column) + dense_feature_layer = dense_features_v2.DenseFeatures([ind_column]) + linear_model = linear.LinearModel( + use_bias=False, kernel_initializer="zeros" + ) + combined = sequential.Sequential([dense_feature_layer, linear_model]) + opt = gradient_descent.SGD(learning_rate=0.1) + combined.compile(opt, "mse", []) + combined.fit(x={"symbol": data}, y=y, batch_size=32, epochs=10) + self.assertAllClose( + [[0.4], [0.6], [0.9]], + combined.layers[1].dense_layers[0].kernel.numpy(), + atol=0.01, + ) + + def test_config(self): + linear_model = linear.LinearModel(units=3, use_bias=True) + config = linear_model.get_config() + cloned_linear_model = linear.LinearModel.from_config(config) + self.assertEqual(linear_model.units, cloned_linear_model.units) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/premade_models/wide_deep.py b/keras/premade_models/wide_deep.py index 89f9fe0c538d..b06aa60cf729 100644 --- a/keras/premade_models/wide_deep.py +++ b/keras/premade_models/wide_deep.py @@ -15,203 +15,226 @@ """Built-in WideNDeep model classes.""" import tensorflow.compat.v2 as tf + from keras import activations from keras import backend from keras import layers as layer_module from keras.engine import base_layer from keras.engine import data_adapter from keras.engine import training as keras_training -from keras.utils import generic_utils +from keras.saving import serialization_lib + +# isort: off from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import keras_export @keras_export( - 'keras.experimental.WideDeepModel', - v1=['keras.experimental.WideDeepModel', 'keras.models.WideDeepModel']) -@deprecation.deprecated_endpoints('keras.experimental.WideDeepModel') + "keras.experimental.WideDeepModel", + v1=["keras.experimental.WideDeepModel", "keras.models.WideDeepModel"], +) +@deprecation.deprecated_endpoints("keras.experimental.WideDeepModel") class WideDeepModel(keras_training.Model): - r"""Wide & Deep Model for regression and classification problems. - - This model jointly train a linear and a dnn model. - - Example: - - ```python - linear_model = LinearModel() - dnn_model = keras.Sequential([keras.layers.Dense(units=64), - keras.layers.Dense(units=1)]) - combined_model = WideDeepModel(linear_model, dnn_model) - combined_model.compile(optimizer=['sgd', 'adam'], loss='mse', metrics=['mse']) - # define dnn_inputs and linear_inputs as separate numpy arrays or - # a single numpy array if dnn_inputs is same as linear_inputs. - combined_model.fit([linear_inputs, dnn_inputs], y, epochs) - # or define a single `tf.data.Dataset` that contains a single tensor or - # separate tensors for dnn_inputs and linear_inputs. 
- dataset = tf.data.Dataset.from_tensors(([linear_inputs, dnn_inputs], y)) - combined_model.fit(dataset, epochs) - ``` - - Both linear and dnn model can be pre-compiled and trained separately - before jointly training: - - Example: - ```python - linear_model = LinearModel() - linear_model.compile('adagrad', 'mse') - linear_model.fit(linear_inputs, y, epochs) - dnn_model = keras.Sequential([keras.layers.Dense(units=1)]) - dnn_model.compile('rmsprop', 'mse') - dnn_model.fit(dnn_inputs, y, epochs) - combined_model = WideDeepModel(linear_model, dnn_model) - combined_model.compile(optimizer=['sgd', 'adam'], loss='mse', metrics=['mse']) - combined_model.fit([linear_inputs, dnn_inputs], y, epochs) - ``` - - """ - - def __init__(self, linear_model, dnn_model, activation=None, **kwargs): - """Create a Wide & Deep Model. - - Args: - linear_model: a premade LinearModel, its output must match the output of - the dnn model. - dnn_model: a `tf.keras.Model`, its output must match the output of the - linear model. - activation: Activation function. Set it to None to maintain a linear - activation. - **kwargs: The keyword arguments that are passed on to BaseLayer.__init__. - Allowed keyword arguments include `name`. + r"""Wide & Deep Model for regression and classification problems. + + This model jointly trains a linear and a dnn model. + + Example: + + ```python + linear_model = LinearModel() + dnn_model = keras.Sequential([keras.layers.Dense(units=64), + keras.layers.Dense(units=1)]) + combined_model = WideDeepModel(linear_model, dnn_model) + combined_model.compile(optimizer=['sgd', 'adam'], + loss='mse', metrics=['mse']) + # define dnn_inputs and linear_inputs as separate numpy arrays or + # a single numpy array if dnn_inputs is same as linear_inputs. + combined_model.fit([linear_inputs, dnn_inputs], y, epochs) + # or define a single `tf.data.Dataset` that contains a single tensor or + # separate tensors for dnn_inputs and linear_inputs. 
+ dataset = tf.data.Dataset.from_tensors(([linear_inputs, dnn_inputs], y)) + combined_model.fit(dataset, epochs) + ``` + + Both linear and dnn model can be pre-compiled and trained separately + before jointly training: + + Example: + ```python + linear_model = LinearModel() + linear_model.compile('adagrad', 'mse') + linear_model.fit(linear_inputs, y, epochs) + dnn_model = keras.Sequential([keras.layers.Dense(units=1)]) + dnn_model.compile('rmsprop', 'mse') + dnn_model.fit(dnn_inputs, y, epochs) + combined_model = WideDeepModel(linear_model, dnn_model) + combined_model.compile(optimizer=['sgd', 'adam'], + loss='mse', metrics=['mse']) + combined_model.fit([linear_inputs, dnn_inputs], y, epochs) + ``` + """ - super().__init__(**kwargs) - base_layer.keras_premade_model_gauge.get_cell('WideDeep').set(True) - self.linear_model = linear_model - self.dnn_model = dnn_model - self.activation = activations.get(activation) - - def call(self, inputs, training=None): - if not isinstance(inputs, (tuple, list)) or len(inputs) != 2: - linear_inputs = dnn_inputs = inputs - else: - linear_inputs, dnn_inputs = inputs - linear_output = self.linear_model(linear_inputs) - # pylint: disable=protected-access - if self.dnn_model._expects_training_arg: - if training is None: - training = backend.learning_phase() - dnn_output = self.dnn_model(dnn_inputs, training=training) - else: - dnn_output = self.dnn_model(dnn_inputs) - output = tf.nest.map_structure( - lambda x, y: (x + y), linear_output, dnn_output) - if self.activation: - return tf.nest.map_structure(self.activation, output) - return output - - # This does not support gradient scaling and LossScaleOptimizer. - def train_step(self, data): - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - with tf.GradientTape() as tape: - y_pred = self(x, training=True) - loss = self.compiled_loss( - y, y_pred, sample_weight, regularization_losses=self.losses) - self.compiled_metrics.update_state(y, y_pred, sample_weight) - - if isinstance(self.optimizer, (list, tuple)): - linear_vars = self.linear_model.trainable_variables - dnn_vars = self.dnn_model.trainable_variables - linear_grads, dnn_grads = tape.gradient(loss, (linear_vars, dnn_vars)) - - linear_optimizer = self.optimizer[0] - dnn_optimizer = self.optimizer[1] - linear_optimizer.apply_gradients(zip(linear_grads, linear_vars)) - dnn_optimizer.apply_gradients(zip(dnn_grads, dnn_vars)) - else: - trainable_variables = self.trainable_variables - grads = tape.gradient(loss, trainable_variables) - self.optimizer.apply_gradients(zip(grads, trainable_variables)) - - return {m.name: m.result() for m in self.metrics} - - def _make_train_function(self): - # Only needed for graph mode and model_to_estimator. - has_recompiled = self._recompile_weights_loss_and_weighted_metrics() - self._check_trainable_weights_consistency() - # If we have re-compiled the loss/weighted metric sub-graphs then create - # train function even if one exists already. This is because - # `_feed_sample_weights` list has been updated on re-compile. - if getattr(self, 'train_function', None) is None or has_recompiled: - # Restore the compiled trainable state. 
- current_trainable_state = self._get_trainable_state() - self._set_trainable_state(self._compiled_trainable_state) - - inputs = ( - self._feed_inputs + self._feed_targets + self._feed_sample_weights) - if not isinstance(backend.symbolic_learning_phase(), int): - inputs += [backend.symbolic_learning_phase()] - - if isinstance(self.optimizer, (list, tuple)): - linear_optimizer = self.optimizer[0] - dnn_optimizer = self.optimizer[1] - else: - linear_optimizer = self.optimizer - dnn_optimizer = self.optimizer - - with backend.get_graph().as_default(): - with backend.name_scope('training'): - # Training updates - updates = [] - linear_updates = linear_optimizer.get_updates( - params=self.linear_model.trainable_weights, # pylint: disable=protected-access - loss=self.total_loss) - updates += linear_updates - dnn_updates = dnn_optimizer.get_updates( - params=self.dnn_model.trainable_weights, # pylint: disable=protected-access - loss=self.total_loss) - updates += dnn_updates - # Unconditional updates - updates += self.get_updates_for(None) - # Conditional updates relevant to this model - updates += self.get_updates_for(self.inputs) - - metrics = self._get_training_eval_metrics() - metrics_tensors = [ - m._call_result for m in metrics if hasattr(m, '_call_result') # pylint: disable=protected-access - ] - - with backend.name_scope('training'): - # Gets loss and metrics. Updates weights at each call. - fn = backend.function( - inputs, [self.total_loss] + metrics_tensors, - updates=updates, - name='train_function', - **self._function_kwargs) - setattr(self, 'train_function', fn) - - # Restore the current trainable state - self._set_trainable_state(current_trainable_state) - - def get_config(self): - linear_config = generic_utils.serialize_keras_object(self.linear_model) - dnn_config = generic_utils.serialize_keras_object(self.dnn_model) - config = { - 'linear_model': linear_config, - 'dnn_model': dnn_config, - 'activation': activations.serialize(self.activation), - } - base_config = base_layer.Layer.get_config(self) - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - linear_config = config.pop('linear_model') - linear_model = layer_module.deserialize(linear_config, custom_objects) - dnn_config = config.pop('dnn_model') - dnn_model = layer_module.deserialize(dnn_config, custom_objects) - activation = activations.deserialize( - config.pop('activation', None), custom_objects=custom_objects) - return cls( - linear_model=linear_model, - dnn_model=dnn_model, - activation=activation, - **config) + + def __init__(self, linear_model, dnn_model, activation=None, **kwargs): + """Create a Wide & Deep Model. + + Args: + linear_model: a premade LinearModel, its output must match the output + of the dnn model. + dnn_model: a `tf.keras.Model`, its output must match the output of the + linear model. + activation: Activation function. Set it to None to maintain a linear + activation. + **kwargs: The keyword arguments that are passed on to + BaseLayer.__init__. Allowed keyword arguments include `name`. 
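One point the Args above leave implicit: `activation` is applied to the elementwise sum of the two sub-model outputs, so `None` keeps the combined output linear. A tiny illustration with made-up values:

```python
import tensorflow as tf

# With made-up sub-model outputs, the combined output is wide + deep,
# then the optional activation.
wide_out, deep_out = tf.constant([[0.3]]), tf.constant([[-0.5]])
summed = wide_out + deep_out                       # [[-0.2]]
print(tf.keras.activations.relu(summed).numpy())   # [[0.]] with 'relu'
# With activation=None the model returns the raw sum, [[-0.2]].
```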
+ """ + super().__init__(**kwargs) + base_layer.keras_premade_model_gauge.get_cell("WideDeep").set(True) + self.linear_model = linear_model + self.dnn_model = dnn_model + self.activation = activations.get(activation) + + def call(self, inputs, training=None): + if not isinstance(inputs, (tuple, list)) or len(inputs) != 2: + linear_inputs = dnn_inputs = inputs + else: + linear_inputs, dnn_inputs = inputs + linear_output = self.linear_model(linear_inputs) + + if self.dnn_model._expects_training_arg: + if training is None: + training = backend.learning_phase() + dnn_output = self.dnn_model(dnn_inputs, training=training) + else: + dnn_output = self.dnn_model(dnn_inputs) + output = tf.nest.map_structure( + lambda x, y: (x + y), linear_output, dnn_output + ) + if self.activation: + return tf.nest.map_structure(self.activation, output) + return output + + # This does not support gradient scaling and LossScaleOptimizer. + def train_step(self, data): + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + with tf.GradientTape() as tape: + y_pred = self(x, training=True) + loss = self.compiled_loss( + y, y_pred, sample_weight, regularization_losses=self.losses + ) + self.compiled_metrics.update_state(y, y_pred, sample_weight) + + if isinstance(self.optimizer, (list, tuple)): + linear_vars = self.linear_model.trainable_variables + dnn_vars = self.dnn_model.trainable_variables + linear_grads, dnn_grads = tape.gradient( + loss, (linear_vars, dnn_vars) + ) + + linear_optimizer = self.optimizer[0] + dnn_optimizer = self.optimizer[1] + linear_optimizer.apply_gradients(zip(linear_grads, linear_vars)) + dnn_optimizer.apply_gradients(zip(dnn_grads, dnn_vars)) + else: + trainable_variables = self.trainable_variables + grads = tape.gradient(loss, trainable_variables) + self.optimizer.apply_gradients(zip(grads, trainable_variables)) + + return {m.name: m.result() for m in self.metrics} + + def _make_train_function(self): + # Only needed for graph mode and model_to_estimator. + has_recompiled = self._recompile_weights_loss_and_weighted_metrics() + self._check_trainable_weights_consistency() + # If we have re-compiled the loss/weighted metric sub-graphs then create + # train function even if one exists already. This is because + # `_feed_sample_weights` list has been updated on re-compile. + if getattr(self, "train_function", None) is None or has_recompiled: + # Restore the compiled trainable state. 
+ current_trainable_state = self._get_trainable_state() + self._set_trainable_state(self._compiled_trainable_state) + + inputs = ( + self._feed_inputs + + self._feed_targets + + self._feed_sample_weights + ) + if not isinstance(backend.symbolic_learning_phase(), int): + inputs += [backend.symbolic_learning_phase()] + + if isinstance(self.optimizer, (list, tuple)): + linear_optimizer = self.optimizer[0] + dnn_optimizer = self.optimizer[1] + else: + linear_optimizer = self.optimizer + dnn_optimizer = self.optimizer + + with backend.get_graph().as_default(): + with backend.name_scope("training"): + # Training updates + updates = [] + linear_updates = linear_optimizer.get_updates( + params=self.linear_model.trainable_weights, + loss=self.total_loss, + ) + updates += linear_updates + dnn_updates = dnn_optimizer.get_updates( + params=self.dnn_model.trainable_weights, + loss=self.total_loss, + ) + updates += dnn_updates + # Unconditional updates + updates += self.get_updates_for(None) + # Conditional updates relevant to this model + updates += self.get_updates_for(self.inputs) + + metrics = self._get_training_eval_metrics() + metrics_tensors = [ + m._call_result + for m in metrics + if hasattr(m, "_call_result") + ] + + with backend.name_scope("training"): + # Gets loss and metrics. Updates weights at each call. + fn = backend.function( + inputs, + [self.total_loss] + metrics_tensors, + updates=updates, + name="train_function", + **self._function_kwargs + ) + setattr(self, "train_function", fn) + + # Restore the current trainable state + self._set_trainable_state(current_trainable_state) + + def get_config(self): + linear_config = serialization_lib.serialize_keras_object( + self.linear_model + ) + dnn_config = serialization_lib.serialize_keras_object(self.dnn_model) + config = { + "linear_model": linear_config, + "dnn_model": dnn_config, + "activation": activations.serialize(self.activation), + } + base_config = base_layer.Layer.get_config(self) + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + linear_config = config.pop("linear_model") + linear_model = layer_module.deserialize(linear_config, custom_objects) + dnn_config = config.pop("dnn_model") + dnn_model = layer_module.deserialize(dnn_config, custom_objects) + activation = activations.deserialize( + config.pop("activation", None), custom_objects=custom_objects + ) + return cls( + linear_model=linear_model, + dnn_model=dnn_model, + activation=activation, + **config + ) diff --git a/keras/premade_models/wide_deep_test.py b/keras/premade_models/wide_deep_test.py index 5b0ec003f87b..076c12efb300 100644 --- a/keras/premade_models/wide_deep_test.py +++ b/keras/premade_models/wide_deep_test.py @@ -14,257 +14,304 @@ # ============================================================================== """Tests for Keras Premade WideNDeep models.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.engine import input_layer from keras.engine import sequential from keras.engine import training from keras.feature_column import dense_features_v2 from keras.layers import core -from keras.optimizers.optimizer_v2 import gradient_descent +from keras.optimizers.legacy import gradient_descent from keras.premade_models import linear from keras.premade_models import wide_deep +from keras.testing_infra import test_combinations +from keras.testing_infra 
import test_utils @test_combinations.run_all_keras_modes(always_skip_v1=True) class WideDeepModelTest(test_combinations.TestCase): + def test_wide_deep_model(self): + linear_model = linear.LinearModel(units=1) + dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + linear_inp = np.random.uniform(low=-5.0, high=5.0, size=(64, 2)) + dnn_inp = np.random.uniform(low=-5.0, high=5.0, size=(64, 3)) + inputs = [linear_inp, dnn_inp] + output = 0.3 * linear_inp[:, 0] + 0.2 * dnn_inp[:, 1] + wide_deep_model.compile( + optimizer=["sgd", "adam"], + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + wide_deep_model.fit(inputs, output, epochs=5) + self.assertTrue(wide_deep_model.built) + + def test_wide_deep_model_backprop(self): + with self.cached_session(): + linear_model = linear.LinearModel( + units=1, kernel_initializer="zeros" + ) + dnn_model = sequential.Sequential( + [core.Dense(units=1, kernel_initializer="zeros")] + ) + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + linear_inp = np.array([[1.0]]) + dnn_inp = np.array([[1.0]]) + inputs = [linear_inp, dnn_inp] + output = linear_inp + 2 * dnn_inp + linear_opt = gradient_descent.SGD(learning_rate=0.1) + dnn_opt = gradient_descent.SGD(learning_rate=0.3) + wide_deep_model.compile( + optimizer=[linear_opt, dnn_opt], + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + self.evaluate(tf.compat.v1.global_variables_initializer()) + wide_deep_model.fit(inputs, output, epochs=1) + self.assertAllClose( + [[0.6]], + self.evaluate( + wide_deep_model.linear_model.dense_layers[0].kernel + ), + ) + self.assertAllClose( + [[1.8]], + self.evaluate(wide_deep_model.dnn_model.layers[0].kernel), + ) + + def test_wide_deep_model_with_single_input(self): + linear_model = linear.LinearModel(units=1) + dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + inputs = np.random.uniform(low=-5.0, high=5.0, size=(64, 3)) + output = 0.3 * inputs[:, 0] + wide_deep_model.compile( + optimizer=["sgd", "adam"], + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + wide_deep_model.fit(inputs, output, epochs=5) - def test_wide_deep_model(self): - linear_model = linear.LinearModel(units=1) - dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - linear_inp = np.random.uniform(low=-5., high=5., size=(64, 2)) - dnn_inp = np.random.uniform(low=-5., high=5., size=(64, 3)) - inputs = [linear_inp, dnn_inp] - output = .3 * linear_inp[:, 0] + .2 * dnn_inp[:, 1] - wide_deep_model.compile( - optimizer=['sgd', 'adam'], - loss='mse', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - wide_deep_model.fit(inputs, output, epochs=5) - self.assertTrue(wide_deep_model.built) + def test_wide_deep_model_with_multi_outputs(self): + inp = input_layer.Input(shape=(1,), name="linear") + l = linear.LinearModel(units=2, use_bias=False)(inp) + l1, l2 = tf.split(l, num_or_size_splits=2, axis=1) + linear_model = training.Model(inp, [l1, l2]) + linear_model.set_weights([np.asarray([[0.5, 0.3]])]) + h = core.Dense(units=2, use_bias=False)(inp) + h1, h2 = tf.split(h, num_or_size_splits=2, axis=1) + dnn_model = training.Model(inp, [h1, h2]) + dnn_model.set_weights([np.asarray([[0.1, -0.5]])]) + wide_deep_model = wide_deep.WideDeepModel(linear_model, 
dnn_model) + inp_np = np.asarray([[1.0]]) + out1, out2 = wide_deep_model(inp_np) + # output should be (0.5 + 0.1), and (0.3 - 0.5) + self.assertAllClose([[0.6]], out1) + self.assertAllClose([[-0.2]], out2) - def test_wide_deep_model_backprop(self): - with self.cached_session(): - linear_model = linear.LinearModel(units=1, kernel_initializer='zeros') - dnn_model = sequential.Sequential( - [core.Dense(units=1, kernel_initializer='zeros')]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - linear_inp = np.array([[1.]]) - dnn_inp = np.array([[1.]]) - inputs = [linear_inp, dnn_inp] - output = linear_inp + 2 * dnn_inp - linear_opt = gradient_descent.SGD(learning_rate=.1) - dnn_opt = gradient_descent.SGD(learning_rate=.3) - wide_deep_model.compile( - optimizer=[linear_opt, dnn_opt], - loss='mse', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - self.evaluate(tf.compat.v1.global_variables_initializer()) - wide_deep_model.fit(inputs, output, epochs=1) - self.assertAllClose( - [[0.6]], - self.evaluate(wide_deep_model.linear_model.dense_layers[0].kernel)) - self.assertAllClose([[1.8]], - self.evaluate( - wide_deep_model.dnn_model.layers[0].kernel)) + wide_deep_model = wide_deep.WideDeepModel( + linear_model, dnn_model, activation="relu" + ) + out1, out2 = wide_deep_model(inp_np) + # output should be relu((0.5 + 0.1)), and relu((0.3 - 0.5)) + self.assertAllClose([[0.6]], out1) + self.assertAllClose([[0.0]], out2) - def test_wide_deep_model_with_single_input(self): - linear_model = linear.LinearModel(units=1) - dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - inputs = np.random.uniform(low=-5., high=5., size=(64, 3)) - output = .3 * inputs[:, 0] - wide_deep_model.compile( - optimizer=['sgd', 'adam'], - loss='mse', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - wide_deep_model.fit(inputs, output, epochs=5) + def test_wide_deep_model_with_single_optimizer(self): + linear_model = linear.LinearModel(units=1) + dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + linear_inp = np.random.uniform(low=-5.0, high=5.0, size=(64, 2)) + dnn_inp = np.random.uniform(low=-5.0, high=5.0, size=(64, 3)) + inputs = [linear_inp, dnn_inp] + output = 0.3 * linear_inp[:, 0] + 0.2 * dnn_inp[:, 1] + wide_deep_model.compile( + optimizer="sgd", + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + wide_deep_model.fit(inputs, output, epochs=5) + self.assertTrue(wide_deep_model.built) - def test_wide_deep_model_with_multi_outputs(self): - inp = input_layer.Input(shape=(1,), name='linear') - l = linear.LinearModel(units=2, use_bias=False)(inp) - l1, l2 = tf.split(l, num_or_size_splits=2, axis=1) - linear_model = training.Model(inp, [l1, l2]) - linear_model.set_weights([np.asarray([[0.5, 0.3]])]) - h = core.Dense(units=2, use_bias=False)(inp) - h1, h2 = tf.split(h, num_or_size_splits=2, axis=1) - dnn_model = training.Model(inp, [h1, h2]) - dnn_model.set_weights([np.asarray([[0.1, -0.5]])]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - inp_np = np.asarray([[1.]]) - out1, out2 = wide_deep_model(inp_np) - # output should be (0.5 + 0.1), and (0.3 - 0.5) - self.assertAllClose([[0.6]], out1) - self.assertAllClose([[-0.2]], out2) + def test_wide_deep_model_as_layer(self): + linear_model = linear.LinearModel(units=1) + dnn_model = 
sequential.Sequential([core.Dense(units=1)]) + linear_input = input_layer.Input(shape=(3,), name="linear") + dnn_input = input_layer.Input(shape=(5,), name="dnn") + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + wide_deep_output = wide_deep_model((linear_input, dnn_input)) + input_b = input_layer.Input(shape=(1,), name="b") + output_b = core.Dense(units=1)(input_b) + model = training.Model( + inputs=[linear_input, dnn_input, input_b], + outputs=[wide_deep_output + output_b], + ) + linear_input_np = np.random.uniform(low=-5.0, high=5.0, size=(64, 3)) + dnn_input_np = np.random.uniform(low=-5.0, high=5.0, size=(64, 5)) + input_b_np = np.random.uniform(low=-5.0, high=5.0, size=(64,)) + output_np = ( + linear_input_np[:, 0] + 0.2 * dnn_input_np[:, 1] + input_b_np + ) + model.compile( + optimizer="sgd", + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + [linear_input_np, dnn_input_np, input_b_np], output_np, epochs=5 + ) - wide_deep_model = wide_deep.WideDeepModel( - linear_model, dnn_model, activation='relu') - out1, out2 = wide_deep_model(inp_np) - # output should be relu((0.5 + 0.1)), and relu((0.3 - 0.5)) - self.assertAllClose([[0.6]], out1) - self.assertAllClose([[0.]], out2) + def test_wide_deep_model_with_sub_model_trained(self): + linear_model = linear.LinearModel(units=1) + dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) + wide_deep_model = wide_deep.WideDeepModel( + linear.LinearModel(units=1), + sequential.Sequential([core.Dense(units=1, input_dim=3)]), + ) + linear_inp = np.random.uniform(low=-5.0, high=5.0, size=(64, 2)) + dnn_inp = np.random.uniform(low=-5.0, high=5.0, size=(64, 3)) + inputs = [linear_inp, dnn_inp] + output = 0.3 * linear_inp[:, 0] + 0.2 * dnn_inp[:, 1] + linear_model.compile( + optimizer="sgd", + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + dnn_model.compile( + optimizer="adam", + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + linear_model.fit(linear_inp, output, epochs=50) + dnn_model.fit(dnn_inp, output, epochs=50) + wide_deep_model.compile( + optimizer=["sgd", "adam"], + loss="mse", + metrics=[], + run_eagerly=test_utils.should_run_eagerly(), + ) + wide_deep_model.fit(inputs, output, epochs=50) - def test_wide_deep_model_with_single_optimizer(self): - linear_model = linear.LinearModel(units=1) - dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - linear_inp = np.random.uniform(low=-5., high=5., size=(64, 2)) - dnn_inp = np.random.uniform(low=-5., high=5., size=(64, 3)) - inputs = [linear_inp, dnn_inp] - output = .3 * linear_inp[:, 0] + .2 * dnn_inp[:, 1] - wide_deep_model.compile( - optimizer='sgd', - loss='mse', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - wide_deep_model.fit(inputs, output, epochs=5) - self.assertTrue(wide_deep_model.built) + # This test is an example for cases where linear and dnn model accepts + # same raw input and same transformed inputs, i.e., the raw input is + # categorical, and both linear and dnn model accept one hot encoding. 
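Before the test that follows, a standalone look at what the shared one-hot transform produces. This uses the same (now legacy) `tf.feature_column` API as the tests; the batch values are made up:

```python
import tensorflow as tf

vocab_list = ["alpha", "beta", "gamma"]
cat_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key="symbol", vocabulary_list=vocab_list)
ind_column = tf.feature_column.indicator_column(cat_column)
dense_features = tf.keras.layers.DenseFeatures([ind_column])

# Two example rows; each becomes a one-hot vector over the vocabulary.
batch = {"symbol": tf.constant([["alpha"], ["gamma"]])}
print(dense_features(batch).numpy())
# [[1. 0. 0.]
#  [0. 0. 1.]]
```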
+    def test_wide_deep_model_with_single_feature_column(self):
+        vocab_list = ["alpha", "beta", "gamma"]
+        vocab_val = [0.4, 0.6, 0.9]
+        data = np.random.choice(vocab_list, size=256)
+        y = np.zeros_like(data, dtype=np.float32)
+        for vocab, val in zip(vocab_list, vocab_val):
+            indices = np.where(data == vocab)
+            y[indices] = val + np.random.uniform(
+                low=-0.01, high=0.01, size=indices[0].shape
+            )
+        cat_column = tf.feature_column.categorical_column_with_vocabulary_list(
+            key="symbol", vocabulary_list=vocab_list
+        )
+        ind_column = tf.feature_column.indicator_column(cat_column)
+        dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])
+        linear_model = linear.LinearModel(
+            use_bias=False, kernel_initializer="zeros"
+        )
+        dnn_model = sequential.Sequential([core.Dense(units=1)])
+        wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
+        combined = sequential.Sequential([dense_feature_layer, wide_deep_model])
+        opt = gradient_descent.SGD(learning_rate=0.1)
+        combined.compile(
+            opt, "mse", [], run_eagerly=test_utils.should_run_eagerly()
+        )
+        combined.fit(x={"symbol": data}, y=y, batch_size=32, epochs=10)
-  def test_wide_deep_model_as_layer(self):
-    linear_model = linear.LinearModel(units=1)
-    dnn_model = sequential.Sequential([core.Dense(units=1)])
-    linear_input = input_layer.Input(shape=(3,), name='linear')
-    dnn_input = input_layer.Input(shape=(5,), name='dnn')
-    wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
-    wide_deep_output = wide_deep_model((linear_input, dnn_input))
-    input_b = input_layer.Input(shape=(1,), name='b')
-    output_b = core.Dense(units=1)(input_b)
-    model = training.Model(
-        inputs=[linear_input, dnn_input, input_b],
-        outputs=[wide_deep_output + output_b])
-    linear_input_np = np.random.uniform(low=-5., high=5., size=(64, 3))
-    dnn_input_np = np.random.uniform(low=-5., high=5., size=(64, 5))
-    input_b_np = np.random.uniform(low=-5., high=5., size=(64,))
-    output_np = linear_input_np[:, 0] + .2 * dnn_input_np[:, 1] + input_b_np
-    model.compile(
-        optimizer='sgd',
-        loss='mse',
-        metrics=[],
-        run_eagerly=test_utils.should_run_eagerly())
-    model.fit([linear_input_np, dnn_input_np, input_b_np], output_np, epochs=5)
+    # This test is an example for cases where the linear and dnn models
+    # accept the same raw input but different transformed inputs, i.e., the
+    # raw input is categorical, and the linear model accepts one-hot
+    # encoding while the dnn model accepts embedding encoding.
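A standalone look at the two encodings the comment above contrasts, before the test that follows (shapes only; the embedding weights are randomly initialized, so only the dimensions are meaningful):

```python
import tensorflow as tf

vocab_list = ["alpha", "beta", "gamma"]
cat_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key="symbol", vocabulary_list=vocab_list)
# Wide path: sparse one-hot. Deep path: dense learned embedding.
wide_features = tf.keras.layers.DenseFeatures(
    [tf.feature_column.indicator_column(cat_column)])
deep_features = tf.keras.layers.DenseFeatures(
    [tf.feature_column.embedding_column(cat_column, dimension=5)])

batch = {"symbol": tf.constant([["beta"]])}
print(wide_features(batch).shape)  # (1, 3): one-hot, width == vocab size
print(deep_features(batch).shape)  # (1, 5): embedding dimension
```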
+ def test_wide_deep_model_with_two_feature_columns(self): + vocab_list = ["alpha", "beta", "gamma"] + vocab_val = [0.4, 0.6, 0.9] + data = np.random.choice(vocab_list, size=256) + y = np.zeros_like(data, dtype=np.float32) + for vocab, val in zip(vocab_list, vocab_val): + indices = np.where(data == vocab) + y[indices] = val + np.random.uniform( + low=-0.01, high=0.01, size=indices[0].shape + ) + cat_column = tf.feature_column.categorical_column_with_vocabulary_list( + key="symbol", vocabulary_list=vocab_list + ) + ind_column = tf.feature_column.indicator_column(cat_column) + emb_column = tf.feature_column.embedding_column(cat_column, dimension=5) + linear_feature_layer = dense_features_v2.DenseFeatures([ind_column]) + linear_model = linear.LinearModel( + use_bias=False, kernel_initializer="zeros" + ) + combined_linear = sequential.Sequential( + [linear_feature_layer, linear_model] + ) + dnn_model = sequential.Sequential([core.Dense(units=1)]) + dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column]) + combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model]) + wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn) + opt = gradient_descent.SGD(learning_rate=0.1) + wide_deep_model.compile( + opt, "mse", [], run_eagerly=test_utils.should_run_eagerly() + ) + wide_deep_model.fit(x={"symbol": data}, y=y, batch_size=32, epochs=10) - def test_wide_deep_model_with_sub_model_trained(self): - linear_model = linear.LinearModel(units=1) - dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) - wide_deep_model = wide_deep.WideDeepModel( - linear.LinearModel(units=1), - sequential.Sequential([core.Dense(units=1, input_dim=3)])) - linear_inp = np.random.uniform(low=-5., high=5., size=(64, 2)) - dnn_inp = np.random.uniform(low=-5., high=5., size=(64, 3)) - inputs = [linear_inp, dnn_inp] - output = .3 * linear_inp[:, 0] + .2 * dnn_inp[:, 1] - linear_model.compile( - optimizer='sgd', - loss='mse', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - dnn_model.compile( - optimizer='adam', - loss='mse', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - linear_model.fit(linear_inp, output, epochs=50) - dnn_model.fit(dnn_inp, output, epochs=50) - wide_deep_model.compile( - optimizer=['sgd', 'adam'], - loss='mse', - metrics=[], - run_eagerly=test_utils.should_run_eagerly()) - wide_deep_model.fit(inputs, output, epochs=50) + def test_config(self): + linear_model = linear.LinearModel(units=1) + dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + config = wide_deep_model.get_config() + cloned_wide_deep_model = wide_deep.WideDeepModel.from_config(config) + self.assertEqual( + linear_model.units, cloned_wide_deep_model.linear_model.units + ) + self.assertEqual( + dnn_model.layers[0].units, + cloned_wide_deep_model.dnn_model.layers[0].units, + ) - # This test is an example for cases where linear and dnn model accepts - # same raw input and same transformed inputs, i.e., the raw input is - # categorical, and both linear and dnn model accept one hot encoding. 
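The kernel values asserted in `test_wide_deep_model_backprop` earlier in this file can be hand-checked; a quick derivation as executable arithmetic (all numbers follow from that test's setup):

```python
# Both kernels start at zero, x = 1.0, and the target is y = 1 + 2*1 = 3,
# so the first prediction is 0 and d(MSE)/d(pred) = 2 * (0 - 3) = -6.
# With x = 1, each kernel receives gradient -6; one SGD step then gives:
grad = 2 * (0.0 - 3.0)       # -6.0
print(0.0 - 0.1 * grad)      # 0.6 -> linear kernel (learning_rate=0.1)
print(0.0 - 0.3 * grad)      # 1.8 -> dnn kernel (learning_rate=0.3)
```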
- def test_wide_deep_model_with_single_feature_column(self): - vocab_list = ['alpha', 'beta', 'gamma'] - vocab_val = [0.4, 0.6, 0.9] - data = np.random.choice(vocab_list, size=256) - y = np.zeros_like(data, dtype=np.float32) - for vocab, val in zip(vocab_list, vocab_val): - indices = np.where(data == vocab) - y[indices] = val + np.random.uniform( - low=-0.01, high=0.01, size=indices[0].shape) - cat_column = tf.feature_column.categorical_column_with_vocabulary_list( - key='symbol', vocabulary_list=vocab_list) - ind_column = tf.feature_column.indicator_column(cat_column) - dense_feature_layer = dense_features_v2.DenseFeatures([ind_column]) - linear_model = linear.LinearModel( - use_bias=False, kernel_initializer='zeros') - dnn_model = sequential.Sequential([core.Dense(units=1)]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - combined = sequential.Sequential([dense_feature_layer, wide_deep_model]) - opt = gradient_descent.SGD(learning_rate=0.1) - combined.compile( - opt, - 'mse', [], - run_eagerly=test_utils.should_run_eagerly()) - combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10) + def test_config_with_custom_objects(self): + def my_activation(x): + return x - # This test is an example for cases where linear and dnn model accepts - # same raw input but different transformed inputs, i.e,. the raw input is - # categorical, and linear model accepts one hot encoding, while dnn model - # accepts embedding encoding. - def test_wide_deep_model_with_two_feature_columns(self): - vocab_list = ['alpha', 'beta', 'gamma'] - vocab_val = [0.4, 0.6, 0.9] - data = np.random.choice(vocab_list, size=256) - y = np.zeros_like(data, dtype=np.float32) - for vocab, val in zip(vocab_list, vocab_val): - indices = np.where(data == vocab) - y[indices] = val + np.random.uniform( - low=-0.01, high=0.01, size=indices[0].shape) - cat_column = tf.feature_column.categorical_column_with_vocabulary_list( - key='symbol', vocabulary_list=vocab_list) - ind_column = tf.feature_column.indicator_column(cat_column) - emb_column = tf.feature_column.embedding_column(cat_column, dimension=5) - linear_feature_layer = dense_features_v2.DenseFeatures([ind_column]) - linear_model = linear.LinearModel( - use_bias=False, kernel_initializer='zeros') - combined_linear = sequential.Sequential( - [linear_feature_layer, linear_model]) - dnn_model = sequential.Sequential([core.Dense(units=1)]) - dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column]) - combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model]) - wide_deep_model = wide_deep.WideDeepModel(combined_linear, combined_dnn) - opt = gradient_descent.SGD(learning_rate=0.1) - wide_deep_model.compile( - opt, - 'mse', [], - run_eagerly=test_utils.should_run_eagerly()) - wide_deep_model.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10) + linear_model = linear.LinearModel(units=1) + dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) + wide_deep_model = wide_deep.WideDeepModel( + linear_model, dnn_model, activation=my_activation + ) + config = wide_deep_model.get_config() + cloned_wide_deep_model = wide_deep.WideDeepModel.from_config( + config, custom_objects={"my_activation": my_activation} + ) + self.assertEqual(cloned_wide_deep_model.activation, my_activation) - def test_config(self): - linear_model = linear.LinearModel(units=1) - dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) - wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) - config = wide_deep_model.get_config() - 
cloned_wide_deep_model = wide_deep.WideDeepModel.from_config(config) - self.assertEqual(linear_model.units, - cloned_wide_deep_model.linear_model.units) - self.assertEqual(dnn_model.layers[0].units, - cloned_wide_deep_model.dnn_model.layers[0].units) + def test_export(self): + input1 = input_layer.Input(shape=(1,)) + output1 = linear.LinearModel()(input1) + linear_model = training.Model(input1, output1) - def test_config_with_custom_objects(self): + input2 = input_layer.Input(shape=(1,)) + output2 = core.Dense(units=1)(input2) + dnn_model = training.Model(input2, output2) - def my_activation(x): - return x + wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model) + wide_deep_model.compile(optimizer=["adam", "adam"]) - linear_model = linear.LinearModel(units=1) - dnn_model = sequential.Sequential([core.Dense(units=1, input_dim=3)]) - wide_deep_model = wide_deep.WideDeepModel( - linear_model, dnn_model, activation=my_activation) - config = wide_deep_model.get_config() - cloned_wide_deep_model = wide_deep.WideDeepModel.from_config( - config, custom_objects={'my_activation': my_activation}) - self.assertEqual(cloned_wide_deep_model.activation, my_activation) + output = wide_deep_model([input1, input2]) + model = training.Model([input1, input2], output) + model.compile() + model.export(self.get_temp_dir()) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/preprocessing/BUILD b/keras/preprocessing/BUILD index 8cb88f6ecbbc..f4613447a258 100644 --- a/keras/preprocessing/BUILD +++ b/keras/preprocessing/BUILD @@ -1,9 +1,11 @@ # Description: # Contains the Keras preprocessing layers (internal TensorFlow version). +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", ], diff --git a/keras/preprocessing/image.py b/keras/preprocessing/image.py index 08ee76e0c949..2aec637f51b9 100644 --- a/keras/preprocessing/image.py +++ b/keras/preprocessing/image.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=invalid-name -# pylint: disable=g-import-not-at-top -# pylint: disable=g-classes-have-attributes + """Utilies for image preprocessing and augmentation. @@ -35,1518 +33,847 @@ import threading import warnings +import numpy as np + from keras import backend from keras.utils import data_utils from keras.utils import image_utils -import numpy as np +from keras.utils import io_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export try: - import scipy - from scipy import linalg # pylint: disable=unused-import - from scipy import ndimage # pylint: disable=unused-import + import scipy + from scipy import linalg # noqa: F401 + from scipy import ndimage # noqa: F401 except ImportError: - pass + pass try: - from PIL import ImageEnhance + from PIL import ImageEnhance except ImportError: - ImageEnhance = None + ImageEnhance = None -@keras_export('keras.preprocessing.image.Iterator') +@keras_export("keras.preprocessing.image.Iterator") class Iterator(data_utils.Sequence): - """Base class for image data iterators. - - Deprecated: `tf.keras.preprocessing.image.Iterator` is not recommended for - new code. 
Prefer loading images with - `tf.keras.utils.image_dataset_from_directory` and transforming the output - `tf.data.Dataset` with preprocessing layers. For more information, see the - tutorials for [loading images]( - https://www.tensorflow.org/tutorials/load_data/images) and - [augmenting images]( - https://www.tensorflow.org/tutorials/images/data_augmentation), as well as - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Every `Iterator` must implement the `_get_batches_of_transformed_samples` - method. - - Args: - n: Integer, total number of samples in the dataset to loop over. - batch_size: Integer, size of a batch. - shuffle: Boolean, whether to shuffle the data between epochs. - seed: Random seeding for data shuffling. - """ - white_list_formats = ('png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff') - - def __init__(self, n, batch_size, shuffle, seed): - self.n = n - self.batch_size = batch_size - self.seed = seed - self.shuffle = shuffle - self.batch_index = 0 - self.total_batches_seen = 0 - self.lock = threading.Lock() - self.index_array = None - self.index_generator = self._flow_index() - - def _set_index_array(self): - self.index_array = np.arange(self.n) - if self.shuffle: - self.index_array = np.random.permutation(self.n) - - def __getitem__(self, idx): - if idx >= len(self): - raise ValueError('Asked to retrieve element {idx}, ' - 'but the Sequence ' - 'has length {length}'.format(idx=idx, length=len(self))) - if self.seed is not None: - np.random.seed(self.seed + self.total_batches_seen) - self.total_batches_seen += 1 - if self.index_array is None: - self._set_index_array() - index_array = self.index_array[self.batch_size * idx:self.batch_size * - (idx + 1)] - return self._get_batches_of_transformed_samples(index_array) - - def __len__(self): - return (self.n + self.batch_size - 1) // self.batch_size # round up - - def on_epoch_end(self): - self._set_index_array() - - def reset(self): - self.batch_index = 0 - - def _flow_index(self): - # Ensure self.batch_index is 0. - self.reset() - while 1: - if self.seed is not None: - np.random.seed(self.seed + self.total_batches_seen) - if self.batch_index == 0: + """Base class for image data iterators. + + Deprecated: `tf.keras.preprocessing.image.Iterator` is not recommended for + new code. Prefer loading images with + `tf.keras.utils.image_dataset_from_directory` and transforming the output + `tf.data.Dataset` with preprocessing layers. For more information, see the + tutorials for [loading images]( + https://www.tensorflow.org/tutorials/load_data/images) and + [augmenting images]( + https://www.tensorflow.org/tutorials/images/data_augmentation), as well as + the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers). + + Every `Iterator` must implement the `_get_batches_of_transformed_samples` + method. + + Args: + n: Integer, total number of samples in the dataset to loop over. + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + seed: Random seeding for data shuffling. 
+ """ + + white_list_formats = ("png", "jpg", "jpeg", "bmp", "ppm", "tif", "tiff") + + def __init__(self, n, batch_size, shuffle, seed): + self.n = n + self.batch_size = batch_size + self.seed = seed + self.shuffle = shuffle + self.batch_index = 0 + self.total_batches_seen = 0 + self.lock = threading.Lock() + self.index_array = None + self.index_generator = self._flow_index() + + def _set_index_array(self): + self.index_array = np.arange(self.n) + if self.shuffle: + self.index_array = np.random.permutation(self.n) + + def __getitem__(self, idx): + if idx >= len(self): + raise ValueError( + "Asked to retrieve element {idx}, " + "but the Sequence " + "has length {length}".format(idx=idx, length=len(self)) + ) + if self.seed is not None: + np.random.seed(self.seed + self.total_batches_seen) + self.total_batches_seen += 1 + if self.index_array is None: + self._set_index_array() + index_array = self.index_array[ + self.batch_size * idx : self.batch_size * (idx + 1) + ] + return self._get_batches_of_transformed_samples(index_array) + + def __len__(self): + return (self.n + self.batch_size - 1) // self.batch_size # round up + + def on_epoch_end(self): self._set_index_array() - if self.n == 0: - # Avoiding modulo by zero error - current_index = 0 - else: - current_index = (self.batch_index * self.batch_size) % self.n - if self.n > current_index + self.batch_size: - self.batch_index += 1 - else: + def reset(self): self.batch_index = 0 - self.total_batches_seen += 1 - yield self.index_array[current_index:current_index + self.batch_size] - def __iter__(self): - # Needed if we want to do something like: - # for x, y in data_gen.flow(...): - return self + def _flow_index(self): + # Ensure self.batch_index is 0. + self.reset() + while 1: + if self.seed is not None: + np.random.seed(self.seed + self.total_batches_seen) + if self.batch_index == 0: + self._set_index_array() + + if self.n == 0: + # Avoiding modulo by zero error + current_index = 0 + else: + current_index = (self.batch_index * self.batch_size) % self.n + if self.n > current_index + self.batch_size: + self.batch_index += 1 + else: + self.batch_index = 0 + self.total_batches_seen += 1 + yield self.index_array[ + current_index : current_index + self.batch_size + ] + + def __iter__(self): + # Needed if we want to do something like: + # for x, y in data_gen.flow(...): + return self + + def __next__(self, *args, **kwargs): + return self.next(*args, **kwargs) + + def next(self): + """For python 2.x. + + Returns: + The next batch. + """ + with self.lock: + index_array = next(self.index_generator) + # The transformation of images is not under thread lock + # so it can be done in parallel + return self._get_batches_of_transformed_samples(index_array) + + def _get_batches_of_transformed_samples(self, index_array): + """Gets a batch of transformed samples. + + Args: + index_array: Array of sample indices to include in batch. + Returns: + A batch of transformed samples. + """ + raise NotImplementedError - def __next__(self, *args, **kwargs): - return self.next(*args, **kwargs) - def next(self): - """For python 2.x. +def _iter_valid_files(directory, white_list_formats, follow_links): + """Iterates on files with extension. - Returns: - The next batch. + Args: + directory: Absolute path to the directory + containing files to be counted + white_list_formats: Set of strings containing allowed extensions for + the files to be counted. + follow_links: Boolean, follow symbolic links to subdirectories. 
+ Yields: + Tuple of (root, filename) with extension in `white_list_formats`. """ - with self.lock: - index_array = next(self.index_generator) - # The transformation of images is not under thread lock - # so it can be done in parallel - return self._get_batches_of_transformed_samples(index_array) - def _get_batches_of_transformed_samples(self, index_array): - """Gets a batch of transformed samples. + def _recursive_list(subpath): + return sorted( + os.walk(subpath, followlinks=follow_links), key=lambda x: x[0] + ) + + for root, _, files in _recursive_list(directory): + for fname in sorted(files): + if fname.lower().endswith(".tiff"): + warnings.warn( + 'Using ".tiff" files with multiple bands ' + "will cause distortion. Please verify your output." + ) + if fname.lower().endswith(white_list_formats): + yield root, fname + + +def _list_valid_filenames_in_directory( + directory, white_list_formats, split, class_indices, follow_links +): + """Lists paths of files in `subdir` with extensions in `white_list_formats`. Args: - index_array: Array of sample indices to include in batch. + directory: absolute path to a directory containing the files to list. + The directory name is used as class label + and must be a key of `class_indices`. + white_list_formats: set of strings containing allowed extensions for + the files to be counted. + split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into + account a certain fraction of files in each directory. + E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent + of images in each directory. + class_indices: dictionary mapping a class name to its index. + follow_links: boolean, follow symbolic links to subdirectories. + Returns: - A batch of transformed samples. + classes: a list of class indices + filenames: the path of valid files in `directory`, relative from + `directory`'s parent (e.g., if `directory` is "dataset/class1", + the filenames will be + `["class1/file1.jpg", "class1/file2.jpg", ...]`). + """ + dirname = os.path.basename(directory) + if split: + all_files = list( + _iter_valid_files(directory, white_list_formats, follow_links) + ) + num_files = len(all_files) + start, stop = int(split[0] * num_files), int(split[1] * num_files) + valid_files = all_files[start:stop] + else: + valid_files = _iter_valid_files( + directory, white_list_formats, follow_links + ) + classes = [] + filenames = [] + for root, fname in valid_files: + classes.append(class_indices[dirname]) + absolute_path = os.path.join(root, fname) + relative_path = os.path.join( + dirname, os.path.relpath(absolute_path, directory) + ) + filenames.append(relative_path) + + return classes, filenames + + +class BatchFromFilesMixin: + """Adds methods related to getting batches from filenames. + + It includes the logic to transform image files to batches. """ - raise NotImplementedError - -def _iter_valid_files(directory, white_list_formats, follow_links): - """Iterates on files with extension. - - Args: - directory: Absolute path to the directory - containing files to be counted - white_list_formats: Set of strings containing allowed extensions for - the files to be counted. - follow_links: Boolean, follow symbolic links to subdirectories. - Yields: - Tuple of (root, filename) with extension in `white_list_formats`. 
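The `split` tuple consumed by `_list_valid_filenames_in_directory` above selects a contiguous fraction of the sorted file list; the same slicing in isolation, with made-up filenames:

```python
files = [f"img_{i}.png" for i in range(10)]  # made-up, already sorted
split = (0.2, 0.6)
start, stop = int(split[0] * len(files)), int(split[1] * len(files))
print(files[start:stop])
# ['img_2.png', 'img_3.png', 'img_4.png', 'img_5.png']  (40% of the files)
```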
- """ - - def _recursive_list(subpath): - return sorted( - os.walk(subpath, followlinks=follow_links), key=lambda x: x[0]) - - for root, _, files in _recursive_list(directory): - for fname in sorted(files): - if fname.lower().endswith('.tiff'): - warnings.warn('Using ".tiff" files with multiple bands ' - 'will cause distortion. Please verify your output.') - if fname.lower().endswith(white_list_formats): - yield root, fname - - -def _list_valid_filenames_in_directory(directory, white_list_formats, split, - class_indices, follow_links): - """Lists paths of files in `subdir` with extensions in `white_list_formats`. - - Args: - directory: absolute path to a directory containing the files to list. - The directory name is used as class label - and must be a key of `class_indices`. - white_list_formats: set of strings containing allowed extensions for - the files to be counted. - split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into - account a certain fraction of files in each directory. - E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent - of images in each directory. - class_indices: dictionary mapping a class name to its index. - follow_links: boolean, follow symbolic links to subdirectories. - - Returns: - classes: a list of class indices - filenames: the path of valid files in `directory`, relative from - `directory`'s parent (e.g., if `directory` is "dataset/class1", - the filenames will be - `["class1/file1.jpg", "class1/file2.jpg", ...]`). - """ - dirname = os.path.basename(directory) - if split: - all_files = list( - _iter_valid_files(directory, white_list_formats, follow_links)) - num_files = len(all_files) - start, stop = int(split[0] * num_files), int(split[1] * num_files) - valid_files = all_files[start:stop] - else: - valid_files = _iter_valid_files(directory, white_list_formats, follow_links) - classes = [] - filenames = [] - for root, fname in valid_files: - classes.append(class_indices[dirname]) - absolute_path = os.path.join(root, fname) - relative_path = os.path.join(dirname, - os.path.relpath(absolute_path, directory)) - filenames.append(relative_path) - - return classes, filenames - - -class BatchFromFilesMixin(): - """Adds methods related to getting batches from filenames. - - It includes the logic to transform image files to batches. - """ - - def set_processing_attrs(self, image_data_generator, target_size, color_mode, - data_format, save_to_dir, save_prefix, save_format, - subset, interpolation, keep_aspect_ratio): - """Sets attributes to use later for processing files into a batch. + def set_processing_attrs( + self, + image_data_generator, + target_size, + color_mode, + data_format, + save_to_dir, + save_prefix, + save_format, + subset, + interpolation, + keep_aspect_ratio, + ): + """Sets attributes to use later for processing files into a batch. + + Args: + image_data_generator: Instance of `ImageDataGenerator` + to use for random transformations and normalization. + target_size: tuple of integers, dimensions to resize input images + to. + color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. + Color mode to read images. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures + being yielded, in a viewable format. This is useful + for visualizing the random transformations being + applied, for debugging purposes. + save_prefix: String prefix to use for saving sample + images (if `save_to_dir` is set). 
+ save_format: Format to use for saving sample images + (if `save_to_dir` is set). + subset: Subset of data (`"training"` or `"validation"`) if + validation_split is set in ImageDataGenerator. + interpolation: Interpolation method used to resample the image if + the target size is different from that of the loaded image. + Supported methods are "nearest", "bilinear", and "bicubic". If + PIL version 1.1.3 or newer is installed, "lanczos" is also + supported. If PIL version 3.4.0 or newer is installed, "box" and + "hamming" are also supported. By default, "nearest" is used. + keep_aspect_ratio: Boolean, whether to resize images to a target + size without aspect ratio distortion. The image is cropped in + the center with target aspect ratio before resizing. + """ + self.image_data_generator = image_data_generator + self.target_size = tuple(target_size) + self.keep_aspect_ratio = keep_aspect_ratio + if color_mode not in {"rgb", "rgba", "grayscale"}: + raise ValueError( + "Invalid color mode:", + color_mode, + '; expected "rgb", "rgba", or "grayscale".', + ) + self.color_mode = color_mode + self.data_format = data_format + if self.color_mode == "rgba": + if self.data_format == "channels_last": + self.image_shape = self.target_size + (4,) + else: + self.image_shape = (4,) + self.target_size + elif self.color_mode == "rgb": + if self.data_format == "channels_last": + self.image_shape = self.target_size + (3,) + else: + self.image_shape = (3,) + self.target_size + else: + if self.data_format == "channels_last": + self.image_shape = self.target_size + (1,) + else: + self.image_shape = (1,) + self.target_size + self.save_to_dir = save_to_dir + self.save_prefix = save_prefix + self.save_format = save_format + self.interpolation = interpolation + if subset is not None: + validation_split = self.image_data_generator._validation_split + if subset == "validation": + split = (0, validation_split) + elif subset == "training": + split = (validation_split, 1) + else: + raise ValueError( + "Invalid subset name: %s;" + 'expected "training" or "validation"' % (subset,) + ) + else: + split = None + self.split = split + self.subset = subset + + def _get_batches_of_transformed_samples(self, index_array): + """Gets a batch of transformed samples. + + Args: + index_array: Array of sample indices to include in batch. + Returns: + A batch of transformed samples. + """ + batch_x = np.zeros( + (len(index_array),) + self.image_shape, dtype=self.dtype + ) + # build batch of image data + # self.filepaths is dynamic, is better to call it once outside the loop + filepaths = self.filepaths + for i, j in enumerate(index_array): + img = image_utils.load_img( + filepaths[j], + color_mode=self.color_mode, + target_size=self.target_size, + interpolation=self.interpolation, + keep_aspect_ratio=self.keep_aspect_ratio, + ) + x = image_utils.img_to_array(img, data_format=self.data_format) + # Pillow images should be closed after `load_img`, + # but not PIL images. 
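For reference, the `image_shape` branches in `set_processing_attrs` above reduce to a small table; a sketch that prints it for an assumed `target_size`:

```python
target_size = (224, 224)  # assumed (height, width)
# channels: 4 for "rgba", 3 for "rgb", 1 for "grayscale"; the channel
# axis goes last or first depending on data_format.
for color_mode, channels in {"rgba": 4, "rgb": 3, "grayscale": 1}.items():
    print(color_mode,
          "channels_last:", target_size + (channels,),
          "channels_first:", (channels,) + target_size)
```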
+ if hasattr(img, "close"): + img.close() + if self.image_data_generator: + params = self.image_data_generator.get_random_transform(x.shape) + x = self.image_data_generator.apply_transform(x, params) + x = self.image_data_generator.standardize(x) + batch_x[i] = x + # optionally save augmented images to disk for debugging purposes + if self.save_to_dir: + for i, j in enumerate(index_array): + img = image_utils.array_to_img( + batch_x[i], self.data_format, scale=True + ) + fname = "{prefix}_{index}_{hash}.{format}".format( + prefix=self.save_prefix, + index=j, + hash=np.random.randint(1e7), + format=self.save_format, + ) + img.save(os.path.join(self.save_to_dir, fname)) + # build batch of labels + if self.class_mode == "input": + batch_y = batch_x.copy() + elif self.class_mode in {"binary", "sparse"}: + batch_y = np.empty(len(batch_x), dtype=self.dtype) + for i, n_observation in enumerate(index_array): + batch_y[i] = self.classes[n_observation] + elif self.class_mode == "categorical": + batch_y = np.zeros( + (len(batch_x), len(self.class_indices)), dtype=self.dtype + ) + for i, n_observation in enumerate(index_array): + batch_y[i, self.classes[n_observation]] = 1.0 + elif self.class_mode == "multi_output": + batch_y = [output[index_array] for output in self.labels] + elif self.class_mode == "raw": + batch_y = self.labels[index_array] + else: + return batch_x + if self.sample_weight is None: + return batch_x, batch_y + else: + return batch_x, batch_y, self.sample_weight[index_array] + + @property + def filepaths(self): + """List of absolute paths to image files.""" + raise NotImplementedError( + "`filepaths` property method has not " + "been implemented in {}.".format(type(self).__name__) + ) + + @property + def labels(self): + """Class labels of every observation.""" + raise NotImplementedError( + "`labels` property method has not been implemented in {}.".format( + type(self).__name__ + ) + ) + + @property + def sample_weight(self): + raise NotImplementedError( + "`sample_weight` property method has not " + "been implemented in {}.".format(type(self).__name__) + ) + + +@keras_export("keras.preprocessing.image.DirectoryIterator") +class DirectoryIterator(BatchFromFilesMixin, Iterator): + """Iterator capable of reading images from a directory on disk. + + Deprecated: `tf.keras.preprocessing.image.DirectoryIterator` is not + recommended for new code. Prefer loading images with + `tf.keras.utils.image_dataset_from_directory` and transforming the output + `tf.data.Dataset` with preprocessing layers. For more information, see the + tutorials for [loading images]( + https://www.tensorflow.org/tutorials/load_data/images) and + [augmenting images]( + https://www.tensorflow.org/tutorials/images/data_augmentation), as well as + the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers). Args: - image_data_generator: Instance of `ImageDataGenerator` - to use for random transformations and normalization. - target_size: tuple of integers, dimensions to resize input images - to. - color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. - Color mode to read images. + directory: Path to the directory to read images from. Each subdirectory + in this directory will be considered to contain images from one class, + or alternatively you could specify class subdirectories via the + `classes` argument. + image_data_generator: Instance of `ImageDataGenerator` to use for random + transformations and normalization. 
+ target_size: tuple of integers, dimensions to resize input images to. + color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. Color mode to read + images. + classes: Optional list of strings, names of subdirectories containing + images from each class (e.g. `["dogs", "cats"]`). It will be computed + automatically if not set. + class_mode: Mode for yielding the targets: + - `"binary"`: binary targets (if there are only two classes), + - `"categorical"`: categorical targets, + - `"sparse"`: integer targets, + - `"input"`: targets are images identical to input images (mainly + used to work with autoencoders), + - `None`: no targets get yielded (only input images are yielded). + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + seed: Random seed for data shuffling. data_format: String, one of `channels_first`, `channels_last`. - save_to_dir: Optional directory where to save the pictures - being yielded, in a viewable format. This is useful - for visualizing the random transformations being - applied, for debugging purposes. - save_prefix: String prefix to use for saving sample - images (if `save_to_dir` is set). - save_format: Format to use for saving sample images - (if `save_to_dir` is set). + save_to_dir: Optional directory where to save the pictures being + yielded, in a viewable format. This is useful for visualizing the + random transformations being applied, for debugging purposes. + save_prefix: String prefix to use for saving sample images (if + `save_to_dir` is set). + save_format: Format to use for saving sample images (if `save_to_dir` is + set). subset: Subset of data (`"training"` or `"validation"`) if - validation_split is set in ImageDataGenerator. + validation_split is set in ImageDataGenerator. interpolation: Interpolation method used to resample the image if the - target size is different from that of the loaded image. - Supported methods are "nearest", "bilinear", and "bicubic". - If PIL version 1.1.3 or newer is installed, "lanczos" is also - supported. If PIL version 3.4.0 or newer is installed, "box" and - "hamming" are also supported. By default, "nearest" is used. + target size is different from that of the loaded image. Supported + methods are "nearest", "bilinear", and "bicubic". If PIL version 1.1.3 + or newer is installed, "lanczos" is also supported. If PIL version + 3.4.0 or newer is installed, "box" and "hamming" are also supported. + By default, "nearest" is used. keep_aspect_ratio: Boolean, whether to resize images to a target size without aspect ratio distortion. The image is cropped in the center with target aspect ratio before resizing. + dtype: Dtype to use for generated arrays. 
""" - self.image_data_generator = image_data_generator - self.target_size = tuple(target_size) - self.keep_aspect_ratio = keep_aspect_ratio - if color_mode not in {'rgb', 'rgba', 'grayscale'}: - raise ValueError('Invalid color mode:', color_mode, - '; expected "rgb", "rgba", or "grayscale".') - self.color_mode = color_mode - self.data_format = data_format - if self.color_mode == 'rgba': - if self.data_format == 'channels_last': - self.image_shape = self.target_size + (4,) - else: - self.image_shape = (4,) + self.target_size - elif self.color_mode == 'rgb': - if self.data_format == 'channels_last': - self.image_shape = self.target_size + (3,) - else: - self.image_shape = (3,) + self.target_size - else: - if self.data_format == 'channels_last': - self.image_shape = self.target_size + (1,) - else: - self.image_shape = (1,) + self.target_size - self.save_to_dir = save_to_dir - self.save_prefix = save_prefix - self.save_format = save_format - self.interpolation = interpolation - if subset is not None: - validation_split = self.image_data_generator._validation_split # pylint: disable=protected-access - if subset == 'validation': - split = (0, validation_split) - elif subset == 'training': - split = (validation_split, 1) - else: - raise ValueError('Invalid subset name: %s;' - 'expected "training" or "validation"' % (subset,)) - else: - split = None - self.split = split - self.subset = subset - def _get_batches_of_transformed_samples(self, index_array): - """Gets a batch of transformed samples. + allowed_class_modes = {"categorical", "binary", "sparse", "input", None} - Args: - index_array: Array of sample indices to include in batch. - Returns: - A batch of transformed samples. - """ - batch_x = np.zeros((len(index_array),) + self.image_shape, dtype=self.dtype) - # build batch of image data - # self.filepaths is dynamic, is better to call it once outside the loop - filepaths = self.filepaths - for i, j in enumerate(index_array): - img = image_utils.load_img( - filepaths[j], - color_mode=self.color_mode, - target_size=self.target_size, - interpolation=self.interpolation, - keep_aspect_ratio=self.keep_aspect_ratio) - x = image_utils.img_to_array(img, data_format=self.data_format) - # Pillow images should be closed after `load_img`, - # but not PIL images. - if hasattr(img, 'close'): - img.close() - if self.image_data_generator: - params = self.image_data_generator.get_random_transform(x.shape) - x = self.image_data_generator.apply_transform(x, params) - x = self.image_data_generator.standardize(x) - batch_x[i] = x - # optionally save augmented images to disk for debugging purposes - if self.save_to_dir: - for i, j in enumerate(index_array): - img = image_utils.array_to_img(batch_x[i], self.data_format, scale=True) - fname = '{prefix}_{index}_{hash}.{format}'.format( - prefix=self.save_prefix, - index=j, - hash=np.random.randint(1e7), - format=self.save_format) - img.save(os.path.join(self.save_to_dir, fname)) - # build batch of labels - if self.class_mode == 'input': - batch_y = batch_x.copy() - elif self.class_mode in {'binary', 'sparse'}: - batch_y = np.empty(len(batch_x), dtype=self.dtype) - for i, n_observation in enumerate(index_array): - batch_y[i] = self.classes[n_observation] - elif self.class_mode == 'categorical': - batch_y = np.zeros((len(batch_x), len(self.class_indices)), - dtype=self.dtype) - for i, n_observation in enumerate(index_array): - batch_y[i, self.classes[n_observation]] = 1. 
- elif self.class_mode == 'multi_output': - batch_y = [output[index_array] for output in self.labels] - elif self.class_mode == 'raw': - batch_y = self.labels[index_array] - else: - return batch_x - if self.sample_weight is None: - return batch_x, batch_y - else: - return batch_x, batch_y, self.sample_weight[index_array] - - @property - def filepaths(self): - """List of absolute paths to image files.""" - raise NotImplementedError( - '`filepaths` property method has not been implemented in {}.'.format( - type(self).__name__)) - - @property - def labels(self): - """Class labels of every observation.""" - raise NotImplementedError( - '`labels` property method has not been implemented in {}.'.format( - type(self).__name__)) - - @property - def sample_weight(self): - raise NotImplementedError( - '`sample_weight` property method has not been implemented in {}.' - .format(type(self).__name__)) - - -@keras_export('keras.preprocessing.image.DirectoryIterator') -class DirectoryIterator(BatchFromFilesMixin, Iterator): - """Iterator capable of reading images from a directory on disk. - - Deprecated: `tf.keras.preprocessing.image.DirectoryIterator` is not - recommended for new code. Prefer loading images with - `tf.keras.utils.image_dataset_from_directory` and transforming the output - `tf.data.Dataset` with preprocessing layers. For more information, see the - tutorials for [loading images]( - https://www.tensorflow.org/tutorials/load_data/images) and - [augmenting images]( - https://www.tensorflow.org/tutorials/images/data_augmentation), as well as - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - directory: Path to the directory to read images from. Each subdirectory in - this directory will be considered to contain images from one class, or - alternatively you could specify class subdirectories via the `classes` - argument. - image_data_generator: Instance of `ImageDataGenerator` to use for random - transformations and normalization. - target_size: tuple of integers, dimensions to resize input images to. - color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. Color mode to read - images. - classes: Optional list of strings, names of subdirectories containing - images from each class (e.g. `["dogs", "cats"]`). It will be computed - automatically if not set. - class_mode: Mode for yielding the targets: - - `"binary"`: binary targets (if there are only two classes), - - `"categorical"`: categorical targets, - - `"sparse"`: integer targets, - - `"input"`: targets are images identical to input images (mainly used - to work with autoencoders), - - `None`: no targets get yielded (only input images are yielded). - batch_size: Integer, size of a batch. - shuffle: Boolean, whether to shuffle the data between epochs. - seed: Random seed for data shuffling. - data_format: String, one of `channels_first`, `channels_last`. - save_to_dir: Optional directory where to save the pictures being yielded, - in a viewable format. This is useful for visualizing the random - transformations being applied, for debugging purposes. - save_prefix: String prefix to use for saving sample images (if - `save_to_dir` is set). - save_format: Format to use for saving sample images (if `save_to_dir` is - set). - subset: Subset of data (`"training"` or `"validation"`) if - validation_split is set in ImageDataGenerator. - interpolation: Interpolation method used to resample the image if the - target size is different from that of the loaded image. 
Supported - methods are "nearest", "bilinear", and "bicubic". If PIL version 1.1.3 - or newer is installed, "lanczos" is also supported. If PIL version 3.4.0 - or newer is installed, "box" and "hamming" are also supported. By - default, "nearest" is used. - keep_aspect_ratio: Boolean, whether to resize images to a target size - without aspect ratio distortion. The image is cropped in the center - with target aspect ratio before resizing. - dtype: Dtype to use for generated arrays. - """ - allowed_class_modes = {'categorical', 'binary', 'sparse', 'input', None} - - def __init__(self, - directory, - image_data_generator, - target_size=(256, 256), - color_mode='rgb', - classes=None, - class_mode='categorical', - batch_size=32, - shuffle=True, - seed=None, - data_format=None, - save_to_dir=None, - save_prefix='', - save_format='png', - follow_links=False, - subset=None, - interpolation='nearest', - keep_aspect_ratio=False, - dtype=None): - if data_format is None: - data_format = backend.image_data_format() - if dtype is None: - dtype = backend.floatx() - super().set_processing_attrs(image_data_generator, target_size, color_mode, - data_format, save_to_dir, save_prefix, - save_format, subset, interpolation, - keep_aspect_ratio) - self.directory = directory - self.classes = classes - if class_mode not in self.allowed_class_modes: - raise ValueError('Invalid class_mode: {}; expected one of: {}' - .format(class_mode, self.allowed_class_modes)) - self.class_mode = class_mode - self.dtype = dtype - # First, count the number of samples and classes. - self.samples = 0 - - if not classes: - classes = [] - for subdir in sorted(os.listdir(directory)): - if os.path.isdir(os.path.join(directory, subdir)): - classes.append(subdir) - self.num_classes = len(classes) - self.class_indices = dict(zip(classes, range(len(classes)))) - - pool = multiprocessing.pool.ThreadPool() - - # Second, build an index of the images - # in the different class subfolders. - results = [] - self.filenames = [] - i = 0 - for dirpath in (os.path.join(directory, subdir) for subdir in classes): - results.append( - pool.apply_async(_list_valid_filenames_in_directory, - (dirpath, self.white_list_formats, self.split, - self.class_indices, follow_links))) - classes_list = [] - for res in results: - classes, filenames = res.get() - classes_list.append(classes) - self.filenames += filenames - self.samples = len(self.filenames) - self.classes = np.zeros((self.samples,), dtype='int32') - for classes in classes_list: - self.classes[i:i + len(classes)] = classes - i += len(classes) - - print('Found %d images belonging to %d classes.' 
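# Sketch of the class discovery performed above: class names are the sorted
# subdirectory names, and each name's position in that ordering becomes its
# label index ("data/train" is a hypothetical path).
import os

directory = "data/train"
classes = sorted(
    d for d in os.listdir(directory)
    if os.path.isdir(os.path.join(directory, d))
)
class_indices = dict(zip(classes, range(len(classes))))
# e.g. {"cats": 0, "dogs": 1}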
% - (self.samples, self.num_classes)) - pool.close() - pool.join() - self._filepaths = [ - os.path.join(self.directory, fname) for fname in self.filenames - ] - super().__init__(self.samples, batch_size, shuffle, seed) - - @property - def filepaths(self): - return self._filepaths - - @property - def labels(self): - return self.classes - - @property # mixin needs this property to work - def sample_weight(self): - # no sample weights will be returned - return None - - -@keras_export('keras.preprocessing.image.NumpyArrayIterator') + def __init__( + self, + directory, + image_data_generator, + target_size=(256, 256), + color_mode="rgb", + classes=None, + class_mode="categorical", + batch_size=32, + shuffle=True, + seed=None, + data_format=None, + save_to_dir=None, + save_prefix="", + save_format="png", + follow_links=False, + subset=None, + interpolation="nearest", + keep_aspect_ratio=False, + dtype=None, + ): + if data_format is None: + data_format = backend.image_data_format() + if dtype is None: + dtype = backend.floatx() + super().set_processing_attrs( + image_data_generator, + target_size, + color_mode, + data_format, + save_to_dir, + save_prefix, + save_format, + subset, + interpolation, + keep_aspect_ratio, + ) + self.directory = directory + self.classes = classes + if class_mode not in self.allowed_class_modes: + raise ValueError( + "Invalid class_mode: {}; expected one of: {}".format( + class_mode, self.allowed_class_modes + ) + ) + self.class_mode = class_mode + self.dtype = dtype + # First, count the number of samples and classes. + self.samples = 0 + + if not classes: + classes = [] + for subdir in sorted(os.listdir(directory)): + if os.path.isdir(os.path.join(directory, subdir)): + classes.append(subdir) + self.num_classes = len(classes) + self.class_indices = dict(zip(classes, range(len(classes)))) + + pool = multiprocessing.pool.ThreadPool() + + # Second, build an index of the images + # in the different class subfolders. + results = [] + self.filenames = [] + i = 0 + for dirpath in (os.path.join(directory, subdir) for subdir in classes): + results.append( + pool.apply_async( + _list_valid_filenames_in_directory, + ( + dirpath, + self.white_list_formats, + self.split, + self.class_indices, + follow_links, + ), + ) + ) + classes_list = [] + for res in results: + classes, filenames = res.get() + classes_list.append(classes) + self.filenames += filenames + self.samples = len(self.filenames) + self.classes = np.zeros((self.samples,), dtype="int32") + for classes in classes_list: + self.classes[i : i + len(classes)] = classes + i += len(classes) + + io_utils.print_msg( + f"Found {self.samples} images belonging to " + f"{self.num_classes} classes." + ) + pool.close() + pool.join() + self._filepaths = [ + os.path.join(self.directory, fname) for fname in self.filenames + ] + super().__init__(self.samples, batch_size, shuffle, seed) + + @property + def filepaths(self): + return self._filepaths + + @property + def labels(self): + return self.classes + + @property # mixin needs this property to work + def sample_weight(self): + # no sample weights will be returned + return None + + +@keras_export("keras.preprocessing.image.NumpyArrayIterator") class NumpyArrayIterator(Iterator): - """Iterator yielding data from a Numpy array. - - Deprecated: `tf.keras.preprocessing.image.NumpyArrayIterator` is not - recommended for new code. Prefer loading images with - `tf.keras.utils.image_dataset_from_directory` and transforming the output - `tf.data.Dataset` with preprocessing layers. 
For more information, see the - tutorials for [loading images]( - https://www.tensorflow.org/tutorials/load_data/images) and - [augmenting images]( - https://www.tensorflow.org/tutorials/images/data_augmentation), as well as - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - x: Numpy array of input data or tuple. If tuple, the second elements is - either another numpy array or a list of numpy arrays, each of which gets - passed through as an output without any modifications. - y: Numpy array of targets data. - image_data_generator: Instance of `ImageDataGenerator` to use for random - transformations and normalization. - batch_size: Integer, size of a batch. - shuffle: Boolean, whether to shuffle the data between epochs. - sample_weight: Numpy array of sample weights. - seed: Random seed for data shuffling. - data_format: String, one of `channels_first`, `channels_last`. - save_to_dir: Optional directory where to save the pictures being yielded, - in a viewable format. This is useful for visualizing the random - transformations being applied, for debugging purposes. - save_prefix: String prefix to use for saving sample images (if - `save_to_dir` is set). - save_format: Format to use for saving sample images (if `save_to_dir` is - set). - subset: Subset of data (`"training"` or `"validation"`) if - validation_split is set in ImageDataGenerator. - ignore_class_split: Boolean (default: False), ignore difference - in number of classes in labels across train and validation - split (useful for non-classification tasks) - dtype: Dtype to use for the generated arrays. - """ - - def __init__(self, - x, - y, - image_data_generator, - batch_size=32, - shuffle=False, - sample_weight=None, - seed=None, - data_format=None, - save_to_dir=None, - save_prefix='', - save_format='png', - subset=None, - ignore_class_split=False, - dtype=None): - if data_format is None: - data_format = backend.image_data_format() - if dtype is None: - dtype = backend.floatx() - self.dtype = dtype - if isinstance(x, tuple) or isinstance(x, list): - if not isinstance(x[1], list): - x_misc = [np.asarray(x[1])] - else: - x_misc = [np.asarray(xx) for xx in x[1]] - x = x[0] - for xx in x_misc: - if len(x) != len(xx): - raise ValueError('All of the arrays in `x` ' - 'should have the same length. ' - 'Found a pair with: len(x[0]) = %s, len(x[?]) = %s' % - (len(x), len(xx))) - else: - x_misc = [] - - if y is not None and len(x) != len(y): - raise ValueError('`x` (images tensor) and `y` (labels) ' - 'should have the same length. ' - 'Found: x.shape = %s, y.shape = %s' % - (np.asarray(x).shape, np.asarray(y).shape)) - if sample_weight is not None and len(x) != len(sample_weight): - raise ValueError('`x` (images tensor) and `sample_weight` ' - 'should have the same length. ' - 'Found: x.shape = %s, sample_weight.shape = %s' % - (np.asarray(x).shape, np.asarray(sample_weight).shape)) - if subset is not None: - if subset not in {'training', 'validation'}: - raise ValueError('Invalid subset name:', subset, - '; expected "training" or "validation".') - split_idx = int(len(x) * image_data_generator._validation_split) - - if (y is not None and not ignore_class_split and not np.array_equal( - np.unique(y[:split_idx]), np.unique(y[split_idx:]))): - raise ValueError('Training and validation subsets ' - 'have different number of classes after ' - 'the split. 
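# Sketch of the subset arithmetic above: with validation_split=0.2 the first
# int(len(x) * 0.2) samples become the "validation" subset and the remainder
# the "training" subset (the numbers below are illustrative).
n, validation_split = 100, 0.2
split_idx = int(n * validation_split)   # 20
# subset == "validation" -> x[:split_idx]   (samples 0..19)
# subset == "training"   -> x[split_idx:]   (samples 20..99)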
If your numpy arrays are ' - 'sorted by the label, you might want ' - 'to shuffle them.') - - if subset == 'validation': - x = x[:split_idx] - x_misc = [np.asarray(xx[:split_idx]) for xx in x_misc] - if y is not None: - y = y[:split_idx] - else: - x = x[split_idx:] - x_misc = [np.asarray(xx[split_idx:]) for xx in x_misc] - if y is not None: - y = y[split_idx:] - - self.x = np.asarray(x, dtype=self.dtype) - self.x_misc = x_misc - if self.x.ndim != 4: - raise ValueError( - 'Input data in `NumpyArrayIterator` ' - 'should have rank 4. You passed an array ' - 'with shape', self.x.shape) - channels_axis = 3 if data_format == 'channels_last' else 1 - if self.x.shape[channels_axis] not in {1, 3, 4}: - warnings.warn('NumpyArrayIterator is set to use the ' - 'data format convention "' + data_format + '" ' - '(channels on axis ' + str(channels_axis) + - '), i.e. expected either 1, 3, or 4 ' - 'channels on axis ' + str(channels_axis) + '. ' - 'However, it was passed an array with shape ' + - str(self.x.shape) + ' (' + - str(self.x.shape[channels_axis]) + ' channels).') - if y is not None: - self.y = np.asarray(y) - else: - self.y = None - if sample_weight is not None: - self.sample_weight = np.asarray(sample_weight) - else: - self.sample_weight = None - self.image_data_generator = image_data_generator - self.data_format = data_format - self.save_to_dir = save_to_dir - self.save_prefix = save_prefix - self.save_format = save_format - super().__init__(x.shape[0], batch_size, shuffle, seed) - - def _get_batches_of_transformed_samples(self, index_array): - batch_x = np.zeros( - tuple([len(index_array)] + list(self.x.shape)[1:]), dtype=self.dtype) - for i, j in enumerate(index_array): - x = self.x[j] - params = self.image_data_generator.get_random_transform(x.shape) - x = self.image_data_generator.apply_transform( - x.astype(self.dtype), params) - x = self.image_data_generator.standardize(x) - batch_x[i] = x - - if self.save_to_dir: - for i, j in enumerate(index_array): - img = image_utils.array_to_img(batch_x[i], self.data_format, scale=True) - fname = '{prefix}_{index}_{hash}.{format}'.format( - prefix=self.save_prefix, - index=j, - hash=np.random.randint(1e4), - format=self.save_format) - img.save(os.path.join(self.save_to_dir, fname)) - batch_x_miscs = [xx[index_array] for xx in self.x_misc] - output = (batch_x if not batch_x_miscs else [batch_x] + batch_x_miscs,) - if self.y is None: - return output[0] - output += (self.y[index_array],) - if self.sample_weight is not None: - output += (self.sample_weight[index_array],) - return output - - -def validate_filename(filename, white_list_formats): - """Check if a filename refers to a valid file. - - Args: - filename: String, absolute path to a file - white_list_formats: Set, allowed file extensions - Returns: - A boolean value indicating if the filename is valid or not - """ - return (filename.lower().endswith(white_list_formats) and - os.path.isfile(filename)) - - -class DataFrameIterator(BatchFromFilesMixin, Iterator): - """Iterator capable of reading images from a directory on disk as a dataframe. - - Args: - dataframe: Pandas dataframe containing the filepaths relative to - `directory` (or absolute paths if `directory` is None) of the images in - a string column. It should include other column/s depending on the - `class_mode`: - if `class_mode` is `"categorical"` (default value) it - must include the `y_col` column with the class/es of each image. - Values in column can be string/list/tuple if a single class or - list/tuple if multiple classes. 
- if `class_mode` is `"binary"` or - `"sparse"` it must include the given `y_col` column with class values - as strings. - if `class_mode` is `"raw"` or `"multi_output"` it should - contain the columns specified in `y_col`. - if `class_mode` is - `"input"` or `None` no extra column is needed. - directory: string, path to the directory to read images from. If `None`, - data in `x_col` column should be absolute paths. - image_data_generator: Instance of `ImageDataGenerator` to use for random - transformations and normalization. If None, no transformations and - normalizations are made. - x_col: string, column in `dataframe` that contains the filenames (or - absolute paths if `directory` is `None`). - y_col: string or list, column/s in `dataframe` that has the target data. - weight_col: string, column in `dataframe` that contains the sample - weights. Default: `None`. - target_size: tuple of integers, dimensions to resize input images to. - color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. Color mode to read - images. - classes: Optional list of strings, classes to use (e.g. `["dogs", - "cats"]`). If None, all classes in `y_col` will be used. - class_mode: one of "binary", "categorical", "input", "multi_output", - "raw", "sparse" or None. Default: "categorical". - Mode for yielding the targets: - - `"binary"`: 1D numpy array of binary labels, - - `"categorical"`: 2D numpy array of one-hot encoded labels. Supports - multi-label output. - - `"input"`: images identical to input images (mainly used to work - with autoencoders), - - `"multi_output"`: list with the values of the different columns, - - `"raw"`: numpy array of values in `y_col` column(s), - - `"sparse"`: 1D numpy array of integer labels, - `None`, no targets - are returned (the generator will only yield batches of image data, - which is useful to use in `model.predict()`). - batch_size: Integer, size of a batch. - shuffle: Boolean, whether to shuffle the data between epochs. - seed: Random seed for data shuffling. - data_format: String, one of `channels_first`, `channels_last`. - save_to_dir: Optional directory where to save the pictures being yielded, - in a viewable format. This is useful for visualizing the random - transformations being applied, for debugging purposes. - save_prefix: String prefix to use for saving sample images (if - `save_to_dir` is set). - save_format: Format to use for saving sample images (if `save_to_dir` is - set). - subset: Subset of data (`"training"` or `"validation"`) if - validation_split is set in ImageDataGenerator. - interpolation: Interpolation method used to resample the image if the - target size is different from that of the loaded image. Supported - methods are "nearest", "bilinear", and "bicubic". If PIL version 1.1.3 - or newer is installed, "lanczos" is also supported. If PIL version 3.4.0 - or newer is installed, "box" and "hamming" are also supported. By - default, "nearest" is used. - keep_aspect_ratio: Boolean, whether to resize images to a target size - without aspect ratio distortion. The image is cropped in the center - with target aspect ratio before resizing. - dtype: Dtype to use for the generated arrays. - validate_filenames: Boolean, whether to validate image filenames in - `x_col`. If `True`, invalid images will be ignored. Disabling this - option can lead to speed-up in the instantiation of this class. Default: - `True`. 
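# A minimal sketch of feeding this iterator from a dataframe via
# `ImageDataGenerator.flow_from_dataframe`; the dataframe contents and the
# "data/images" path are hypothetical example data.
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator

df = pd.DataFrame(
    {"filename": ["cat1.jpg", "dog1.jpg"], "class": ["cat", "dog"]}
)
datagen = ImageDataGenerator(rescale=1.0 / 255)
it = datagen.flow_from_dataframe(
    df,
    directory="data/images",   # hypothetical path
    x_col="filename",
    y_col="class",
    class_mode="categorical",
    target_size=(150, 150),
)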
- """ - allowed_class_modes = { - 'binary', 'categorical', 'input', 'multi_output', 'raw', 'sparse', None - } - - def __init__(self, - dataframe, - directory=None, - image_data_generator=None, - x_col='filename', - y_col='class', - weight_col=None, - target_size=(256, 256), - color_mode='rgb', - classes=None, - class_mode='categorical', - batch_size=32, - shuffle=True, - seed=None, - data_format='channels_last', - save_to_dir=None, - save_prefix='', - save_format='png', - subset=None, - interpolation='nearest', - keep_aspect_ratio=False, - dtype='float32', - validate_filenames=True): - super().set_processing_attrs(image_data_generator, target_size, color_mode, - data_format, save_to_dir, save_prefix, - save_format, subset, interpolation, - keep_aspect_ratio) - df = dataframe.copy() - self.directory = directory or '' - self.class_mode = class_mode - self.dtype = dtype - # check that inputs match the required class_mode - self._check_params(df, x_col, y_col, weight_col, classes) - if validate_filenames: # check which image files are valid and keep them - df = self._filter_valid_filepaths(df, x_col) - if class_mode not in ['input', 'multi_output', 'raw', None]: - df, classes = self._filter_classes(df, y_col, classes) - num_classes = len(classes) - # build an index of all the unique classes - self.class_indices = dict(zip(classes, range(len(classes)))) - # retrieve only training or validation set - if self.split: - num_files = len(df) - start = int(self.split[0] * num_files) - stop = int(self.split[1] * num_files) - df = df.iloc[start:stop, :] - # get labels for each observation - if class_mode not in ['input', 'multi_output', 'raw', None]: - self.classes = self.get_classes(df, y_col) - self.filenames = df[x_col].tolist() - self._sample_weight = df[weight_col].values if weight_col else None - - if class_mode == 'multi_output': - self._targets = [np.array(df[col].tolist()) for col in y_col] - if class_mode == 'raw': - self._targets = df[y_col].values - self.samples = len(self.filenames) - validated_string = 'validated' if validate_filenames else 'non-validated' - if class_mode in ['input', 'multi_output', 'raw', None]: - print(f'Found {self.samples} {validated_string} image filenames.') - else: - print(f'Found {self.samples} {validated_string} image filenames ' - f'belonging to {num_classes} classes.') - self._filepaths = [ - os.path.join(self.directory, fname) for fname in self.filenames - ] - super().__init__(self.samples, batch_size, shuffle, seed) - - def _check_params(self, df, x_col, y_col, weight_col, classes): - # check class mode is one of the currently supported - if self.class_mode not in self.allowed_class_modes: - raise ValueError('Invalid class_mode: {}; expected one of: {}'.format( - self.class_mode, self.allowed_class_modes)) - # check that y_col has several column names if class_mode is multi_output - if (self.class_mode == 'multi_output') and not isinstance(y_col, list): - raise TypeError( - 'If class_mode="{}", y_col must be a list. 
Received {}.'.format( - self.class_mode, - type(y_col).__name__)) - # check that filenames/filepaths column values are all strings - if not all(df[x_col].apply(lambda x: isinstance(x, str))): - raise TypeError( - 'All values in column x_col={} must be strings.'.format(x_col)) - # check labels are string if class_mode is binary or sparse - if self.class_mode in {'binary', 'sparse'}: - if not all(df[y_col].apply(lambda x: isinstance(x, str))): - raise TypeError('If class_mode="{}", y_col="{}" column ' - 'values must be strings.'.format( - self.class_mode, y_col)) - # check that if binary there are only 2 different classes - if self.class_mode == 'binary': - if classes: - classes = set(classes) - if len(classes) != 2: - raise ValueError('If class_mode="binary" there must be 2 ' - 'classes. {} class/es were given.'.format( - len(classes))) - elif df[y_col].nunique() != 2: - raise ValueError('If class_mode="binary" there must be 2 classes. ' - 'Found {} classes.'.format(df[y_col].nunique())) - # check values are string, list or tuple if class_mode is categorical - if self.class_mode == 'categorical': - types = (str, list, tuple) - if not all(df[y_col].apply(lambda x: isinstance(x, types))): - raise TypeError('If class_mode="{}", y_col="{}" column ' - 'values must be type string, list or tuple.'.format( - self.class_mode, y_col)) - # raise warning if classes are given but will be unused - if classes and self.class_mode in {'input', 'multi_output', 'raw', None}: - warnings.warn( - '`classes` will be ignored given the class_mode="{}"'.format( - self.class_mode)) - # check that if weight column that the values are numerical - if weight_col and not issubclass(df[weight_col].dtype.type, np.number): - raise TypeError( - 'Column weight_col={} must be numeric.'.format(weight_col)) - - def get_classes(self, df, y_col): - labels = [] - for label in df[y_col]: - if isinstance(label, (list, tuple)): - labels.append([self.class_indices[lbl] for lbl in label]) - else: - labels.append(self.class_indices[label]) - return labels - - @staticmethod - def _filter_classes(df, y_col, classes): - df = df.copy() - - def remove_classes(labels, classes): - if isinstance(labels, (list, tuple)): - labels = [cls for cls in labels if cls in classes] - return labels or None - elif isinstance(labels, str): - return labels if labels in classes else None - else: - raise TypeError( - 'Expect string, list or tuple but found {} in {} column '.format( - type(labels), y_col)) - - if classes: - # prepare for membership lookup - classes = list(collections.OrderedDict.fromkeys(classes).keys()) - df[y_col] = df[y_col].apply(lambda x: remove_classes(x, classes)) - else: - classes = set() - for v in df[y_col]: - if isinstance(v, (list, tuple)): - classes.update(v) - else: - classes.add(v) - classes = sorted(classes) - return df.dropna(subset=[y_col]), classes - - def _filter_valid_filepaths(self, df, x_col): - """Keep only dataframe rows with valid filenames. + """Iterator yielding data from a Numpy array. + + Deprecated: `tf.keras.preprocessing.image.NumpyArrayIterator` is not + recommended for new code. Prefer loading images with + `tf.keras.utils.image_dataset_from_directory` and transforming the output + `tf.data.Dataset` with preprocessing layers. 
For more information, see the + tutorials for [loading images]( + https://www.tensorflow.org/tutorials/load_data/images) and + [augmenting images]( + https://www.tensorflow.org/tutorials/images/data_augmentation), as well as + the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers). Args: - df: Pandas dataframe containing filenames in a column - x_col: string, column in `df` that contains the filenames or filepaths - Returns: - absolute paths to image files - """ - filepaths = df[x_col].map(lambda fname: os.path.join(self.directory, fname)) - mask = filepaths.apply(validate_filename, args=(self.white_list_formats,)) - n_invalid = (~mask).sum() - if n_invalid: - warnings.warn('Found {} invalid image filename(s) in x_col="{}". ' - 'These filename(s) will be ignored.'.format( - n_invalid, x_col)) - return df[mask] - - @property - def filepaths(self): - return self._filepaths - - @property - def labels(self): - if self.class_mode in {'multi_output', 'raw'}: - return self._targets - else: - return self.classes - - @property - def sample_weight(self): - return self._sample_weight - - -def flip_axis(x, axis): - x = np.asarray(x).swapaxes(axis, 0) - x = x[::-1, ...] - x = x.swapaxes(0, axis) - return x - - -@keras_export('keras.preprocessing.image.ImageDataGenerator') -class ImageDataGenerator(): - """Generate batches of tensor image data with real-time data augmentation. - - Deprecated: `tf.keras.preprocessing.image.ImageDataGenerator` is not - recommended for new code. Prefer loading images with - `tf.keras.utils.image_dataset_from_directory` and transforming the output - `tf.data.Dataset` with preprocessing layers. For more information, see the - tutorials for [loading images]( - https://www.tensorflow.org/tutorials/load_data/images) and - [augmenting images]( - https://www.tensorflow.org/tutorials/images/data_augmentation), as well as - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers). - - The data will be looped over (in batches). - - Args: - featurewise_center: Boolean. Set input mean to 0 over the dataset, - feature-wise. - samplewise_center: Boolean. Set each sample mean to 0. - featurewise_std_normalization: Boolean. Divide inputs by std of the - dataset, feature-wise. - samplewise_std_normalization: Boolean. Divide each input by its std. - zca_epsilon: epsilon for ZCA whitening. Default is 1e-6. - zca_whitening: Boolean. Apply ZCA whitening. - rotation_range: Int. Degree range for random rotations. - width_shift_range: Float, 1-D array-like or int - - float: fraction of total width, if < 1, or pixels if >= 1. - - 1-D array-like: random elements from the array. - - int: integer number of pixels from interval `(-width_shift_range, - +width_shift_range)` - With `width_shift_range=2` possible values - are integers `[-1, 0, +1]`, same as with `width_shift_range=[-1, 0, - +1]`, while with `width_shift_range=1.0` possible values are floats - in the interval [-1.0, +1.0). - height_shift_range: Float, 1-D array-like or int - - float: fraction of total height, if < 1, or pixels if >= 1. - - 1-D array-like: random elements from the array. - - int: integer number of pixels from interval `(-height_shift_range, - +height_shift_range)` - With `height_shift_range=2` possible values - are integers `[-1, 0, +1]`, same as with `height_shift_range=[-1, 0, - +1]`, while with `height_shift_range=1.0` possible values are floats - in the interval [-1.0, +1.0). - brightness_range: Tuple or list of two floats. 
Range for picking a - brightness shift value from. - shear_range: Float. Shear Intensity (Shear angle in counter-clockwise - direction in degrees) - zoom_range: Float or [lower, upper]. Range for random zoom. If a float, - `[lower, upper] = [1-zoom_range, 1+zoom_range]`. - channel_shift_range: Float. Range for random channel shifts. - fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}. Default is - 'nearest'. Points outside the boundaries of the input are filled - according to the given mode: - - 'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k) - - 'nearest': aaaaaaaa|abcd|dddddddd - - 'reflect': abcddcba|abcd|dcbaabcd - - 'wrap': abcdabcd|abcd|abcdabcd - cval: Float or Int. Value used for points outside the boundaries when - `fill_mode = "constant"`. - horizontal_flip: Boolean. Randomly flip inputs horizontally. - vertical_flip: Boolean. Randomly flip inputs vertically. - rescale: rescaling factor. Defaults to None. If None or 0, no rescaling is - applied, otherwise we multiply the data by the value provided (after - applying all other transformations). - preprocessing_function: function that will be applied on each input. The - function will run after the image is resized and augmented. - The function should take one argument: one image (Numpy tensor with - rank 3), and should output a Numpy tensor with the same shape. - data_format: Image data format, either "channels_first" or - "channels_last". "channels_last" mode means that the images should have - shape `(samples, height, width, channels)`, "channels_first" mode means - that the images should have shape `(samples, channels, height, width)`. - It defaults to the `image_data_format` value found in your Keras config - file at `~/.keras/keras.json`. If you never set it, then it will be - "channels_last". - validation_split: Float. Fraction of images reserved for validation - (strictly between 0 and 1). - dtype: Dtype to use for the generated arrays. - - Raises: - ValueError: If the value of the argument, `data_format` is other than - `"channels_last"` or `"channels_first"`. - ValueError: If the value of the argument, `validation_split` > 1 - or `validation_split` < 0. 
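# Sketch of the fill_mode diagrams above, reproduced with numpy padding; note
# that np.pad uses different mode names ("edge" corresponds to "nearest", and
# "symmetric" matches the "reflect" diagram shown in the docstring).
import numpy as np

a = np.array([1, 2, 3, 4])                        # "abcd"
np.pad(a, 4, mode="constant", constant_values=0)  # kkkk|abcd|kkkk (cval=k=0)
np.pad(a, 4, mode="edge")                         # aaaa|abcd|dddd
np.pad(a, 4, mode="symmetric")                    # dcba|abcd|dcba
np.pad(a, 4, mode="wrap")                         # abcd|abcd|abcd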
- - Examples: - - Example of using `.flow(x, y)`: - - ```python - (x_train, y_train), (x_test, y_test) = cifar10.load_data() - y_train = utils.to_categorical(y_train, num_classes) - y_test = utils.to_categorical(y_test, num_classes) - datagen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - rotation_range=20, - width_shift_range=0.2, - height_shift_range=0.2, - horizontal_flip=True, - validation_split=0.2) - # compute quantities required for featurewise normalization - # (std, mean, and principal components if ZCA whitening is applied) - datagen.fit(x_train) - # fits the model on batches with real-time data augmentation: - model.fit(datagen.flow(x_train, y_train, batch_size=32, - subset='training'), - validation_data=datagen.flow(x_train, y_train, - batch_size=8, subset='validation'), - steps_per_epoch=len(x_train) / 32, epochs=epochs) - # here's a more "manual" example - for e in range(epochs): - print('Epoch', e) - batches = 0 - for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32): - model.fit(x_batch, y_batch) - batches += 1 - if batches >= len(x_train) / 32: - # we need to break the loop by hand because - # the generator loops indefinitely - break - ``` - - Example of using `.flow_from_directory(directory)`: - - ```python - train_datagen = ImageDataGenerator( - rescale=1./255, - shear_range=0.2, - zoom_range=0.2, - horizontal_flip=True) - test_datagen = ImageDataGenerator(rescale=1./255) - train_generator = train_datagen.flow_from_directory( - 'data/train', - target_size=(150, 150), - batch_size=32, - class_mode='binary') - validation_generator = test_datagen.flow_from_directory( - 'data/validation', - target_size=(150, 150), - batch_size=32, - class_mode='binary') - model.fit( - train_generator, - steps_per_epoch=2000, - epochs=50, - validation_data=validation_generator, - validation_steps=800) - ``` - - Example of transforming images and masks together. 
- - ```python - # we create two instances with the same arguments - data_gen_args = dict(featurewise_center=True, - featurewise_std_normalization=True, - rotation_range=90, - width_shift_range=0.1, - height_shift_range=0.1, - zoom_range=0.2) - image_datagen = ImageDataGenerator(**data_gen_args) - mask_datagen = ImageDataGenerator(**data_gen_args) - # Provide the same seed and keyword arguments to the fit and flow methods - seed = 1 - image_datagen.fit(images, augment=True, seed=seed) - mask_datagen.fit(masks, augment=True, seed=seed) - image_generator = image_datagen.flow_from_directory( - 'data/images', - class_mode=None, - seed=seed) - mask_generator = mask_datagen.flow_from_directory( - 'data/masks', - class_mode=None, - seed=seed) - # combine generators into one which yields image and masks - train_generator = zip(image_generator, mask_generator) - model.fit( - train_generator, - steps_per_epoch=2000, - epochs=50) - ``` - """ - - def __init__(self, - featurewise_center=False, - samplewise_center=False, - featurewise_std_normalization=False, - samplewise_std_normalization=False, - zca_whitening=False, - zca_epsilon=1e-6, - rotation_range=0, - width_shift_range=0., - height_shift_range=0., - brightness_range=None, - shear_range=0., - zoom_range=0., - channel_shift_range=0., - fill_mode='nearest', - cval=0., - horizontal_flip=False, - vertical_flip=False, - rescale=None, - preprocessing_function=None, - data_format=None, - validation_split=0.0, - interpolation_order=1, - dtype=None): - if data_format is None: - data_format = backend.image_data_format() - if dtype is None: - dtype = backend.floatx() - - self.featurewise_center = featurewise_center - self.samplewise_center = samplewise_center - self.featurewise_std_normalization = featurewise_std_normalization - self.samplewise_std_normalization = samplewise_std_normalization - self.zca_whitening = zca_whitening - self.zca_epsilon = zca_epsilon - self.rotation_range = rotation_range - self.width_shift_range = width_shift_range - self.height_shift_range = height_shift_range - self.shear_range = shear_range - self.zoom_range = zoom_range - self.channel_shift_range = channel_shift_range - self.fill_mode = fill_mode - self.cval = cval - self.horizontal_flip = horizontal_flip - self.vertical_flip = vertical_flip - self.rescale = rescale - self.preprocessing_function = preprocessing_function - self.dtype = dtype - self.interpolation_order = interpolation_order - - if data_format not in {'channels_last', 'channels_first'}: - raise ValueError('`data_format` should be `"channels_last"` ' - '(channel after row and column) or ' - '`"channels_first"` (channel before row and column). ' - 'Received: %s' % data_format) - self.data_format = data_format - if data_format == 'channels_first': - self.channel_axis = 1 - self.row_axis = 2 - self.col_axis = 3 - if data_format == 'channels_last': - self.channel_axis = 3 - self.row_axis = 1 - self.col_axis = 2 - if validation_split and not 0 < validation_split < 1: - raise ValueError('`validation_split` must be strictly between 0 and 1. 
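# Sketch of the axis bookkeeping above: the data_format string determines
# which axes of a 4D batch hold channels, rows and columns.
data_format = "channels_last"                     # or "channels_first"
if data_format == "channels_first":
    channel_axis, row_axis, col_axis = 1, 2, 3    # (N, C, H, W)
else:
    channel_axis, row_axis, col_axis = 3, 1, 2    # (N, H, W, C)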
' Received: %s' % validation_split)
- self._validation_split = validation_split
-
- self.mean = None
- self.std = None
- self.zca_whitening_matrix = None
-
- if isinstance(zoom_range, (float, int)):
- self.zoom_range = [1 - zoom_range, 1 + zoom_range]
- elif (len(zoom_range) == 2 and
- all(isinstance(val, (float, int)) for val in zoom_range)):
- self.zoom_range = [zoom_range[0], zoom_range[1]]
- else:
- raise ValueError('`zoom_range` should be a float or '
- 'a tuple or list of two floats. '
- 'Received: %s' % (zoom_range,))
- if zca_whitening:
- if not featurewise_center:
- self.featurewise_center = True
- warnings.warn('This ImageDataGenerator specifies '
- '`zca_whitening`, which overrides '
- 'setting of `featurewise_center`.')
- if featurewise_std_normalization:
- self.featurewise_std_normalization = False
- warnings.warn('This ImageDataGenerator specifies '
- '`zca_whitening` '
- 'which overrides setting of'
- '`featurewise_std_normalization`.')
- if featurewise_std_normalization:
- if not featurewise_center:
- self.featurewise_center = True
- warnings.warn('This ImageDataGenerator specifies '
- '`featurewise_std_normalization`, '
- 'which overrides setting of '
- '`featurewise_center`.')
- if samplewise_std_normalization:
- if not samplewise_center:
- self.samplewise_center = True
- warnings.warn('This ImageDataGenerator specifies '
- '`samplewise_std_normalization`, '
- 'which overrides setting of '
- '`samplewise_center`.')
- if brightness_range is not None:
- if (not isinstance(brightness_range, (tuple, list)) or
- len(brightness_range) != 2):
- raise ValueError(
- '`brightness_range should be tuple or list of two floats. '
- 'Received: %s' % (brightness_range,))
- self.brightness_range = brightness_range
-
- def flow(self,
- x,
- y=None,
- batch_size=32,
- shuffle=True,
- sample_weight=None,
- seed=None,
- save_to_dir=None,
- save_prefix='',
- save_format='png',
- ignore_class_split=False,
- subset=None):
- """Takes data & label arrays, generates batches of augmented data.
-
- Args:
- x: Input data. Numpy array of rank 4 or a tuple. If tuple, the first
- element should contain the images and the second element another numpy
- array or a list of numpy arrays that gets passed to the output without
- any modifications. Can be used to feed the model miscellaneous data
- along with the images. In case of grayscale data, the channels axis of
- the image array should have value 1, in case of RGB data, it should
- have value 3, and in case of RGBA data, it should have value 4.
- y: Labels.
- batch_size: Int (default: 32).
- shuffle: Boolean (default: True).
- sample_weight: Sample weights.
- seed: Int (default: None).
- save_to_dir: None or str (default: None). This allows you to optionally
- specify a directory to which to save the augmented pictures being
- generated (useful for visualizing what you are doing).
- save_prefix: Str (default: `''`). Prefix to use for filenames of saved
- pictures (only relevant if `save_to_dir` is set).
- save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif", "tif",
- "jpg" (only relevant if `save_to_dir` is set). Default: "png".
+ x: Numpy array of input data or tuple. If tuple, the second element is
+ either another numpy array or a list of numpy arrays, each of which
+ gets passed through as an output without any modifications.
+ y: Numpy array of target data.
+ image_data_generator: Instance of `ImageDataGenerator` to use for random
+ transformations and normalization.
+ batch_size: Integer, size of a batch.
+ shuffle: Boolean, whether to shuffle the data between epochs. + sample_weight: Numpy array of sample weights. + seed: Random seed for data shuffling. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures being + yielded, in a viewable format. This is useful for visualizing the + random transformations being applied, for debugging purposes. + save_prefix: String prefix to use for saving sample images (if + `save_to_dir` is set). + save_format: Format to use for saving sample images (if `save_to_dir` is + set). + subset: Subset of data (`"training"` or `"validation"`) if + validation_split is set in ImageDataGenerator. ignore_class_split: Boolean (default: False), ignore difference in number of classes in labels across train and validation split (useful for non-classification tasks) - subset: Subset of data (`"training"` or `"validation"`) if - `validation_split` is set in `ImageDataGenerator`. - - Returns: - An `Iterator` yielding tuples of `(x, y)` - where `x` is a numpy array of image data - (in the case of a single image input) or a list - of numpy arrays (in the case with - additional inputs) and `y` is a numpy array - of corresponding labels. If 'sample_weight' is not None, - the yielded tuples are of the form `(x, y, sample_weight)`. - If `y` is None, only the numpy array `x` is returned. - Raises: - ValueError: If the Value of the argument, `subset` is other than - "training" or "validation". - + dtype: Dtype to use for the generated arrays. """ - return NumpyArrayIterator( + + def __init__( + self, x, y, - self, - batch_size=batch_size, - shuffle=shuffle, - sample_weight=sample_weight, - seed=seed, - data_format=self.data_format, - save_to_dir=save_to_dir, - save_prefix=save_prefix, - save_format=save_format, - ignore_class_split=ignore_class_split, - subset=subset, - dtype=self.dtype) - - def flow_from_directory(self, - directory, - target_size=(256, 256), - color_mode='rgb', - classes=None, - class_mode='categorical', - batch_size=32, - shuffle=True, - seed=None, - save_to_dir=None, - save_prefix='', - save_format='png', - follow_links=False, - subset=None, - interpolation='nearest', - keep_aspect_ratio=False): - """Takes the path to a directory & generates batches of augmented data. + image_data_generator, + batch_size=32, + shuffle=False, + sample_weight=None, + seed=None, + data_format=None, + save_to_dir=None, + save_prefix="", + save_format="png", + subset=None, + ignore_class_split=False, + dtype=None, + ): + if data_format is None: + data_format = backend.image_data_format() + if dtype is None: + dtype = backend.floatx() + self.dtype = dtype + if isinstance(x, tuple) or isinstance(x, list): + if not isinstance(x[1], list): + x_misc = [np.asarray(x[1])] + else: + x_misc = [np.asarray(xx) for xx in x[1]] + x = x[0] + for xx in x_misc: + if len(x) != len(xx): + raise ValueError( + "All of the arrays in `x` " + "should have the same length. " + "Found a pair with: len(x[0]) = %s, len(x[?]) = %s" + % (len(x), len(xx)) + ) + else: + x_misc = [] + + if y is not None and len(x) != len(y): + raise ValueError( + "`x` (images tensor) and `y` (labels) " + "should have the same length. " + "Found: x.shape = %s, y.shape = %s" + % (np.asarray(x).shape, np.asarray(y).shape) + ) + if sample_weight is not None and len(x) != len(sample_weight): + raise ValueError( + "`x` (images tensor) and `sample_weight` " + "should have the same length. 
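# A minimal sketch of `ImageDataGenerator.flow` on in-memory arrays (random
# example data): `x` must be rank 4 and `len(x)` must equal `len(y)`,
# exactly the checks enforced above.
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

x = np.random.rand(8, 32, 32, 3).astype("float32")   # (samples, H, W, channels)
y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
datagen = ImageDataGenerator(horizontal_flip=True)
batch_x, batch_y = next(datagen.flow(x, y, batch_size=4, shuffle=False))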
" + "Found: x.shape = %s, sample_weight.shape = %s" + % (np.asarray(x).shape, np.asarray(sample_weight).shape) + ) + if subset is not None: + if subset not in {"training", "validation"}: + raise ValueError( + "Invalid subset name:", + subset, + '; expected "training" or "validation".', + ) + split_idx = int(len(x) * image_data_generator._validation_split) + + if ( + y is not None + and not ignore_class_split + and not np.array_equal( + np.unique(y[:split_idx]), np.unique(y[split_idx:]) + ) + ): + raise ValueError( + "Training and validation subsets " + "have different number of classes after " + "the split. If your numpy arrays are " + "sorted by the label, you might want " + "to shuffle them." + ) + + if subset == "validation": + x = x[:split_idx] + x_misc = [np.asarray(xx[:split_idx]) for xx in x_misc] + if y is not None: + y = y[:split_idx] + else: + x = x[split_idx:] + x_misc = [np.asarray(xx[split_idx:]) for xx in x_misc] + if y is not None: + y = y[split_idx:] + + self.x = np.asarray(x, dtype=self.dtype) + self.x_misc = x_misc + if self.x.ndim != 4: + raise ValueError( + "Input data in `NumpyArrayIterator` " + "should have rank 4. You passed an array " + "with shape", + self.x.shape, + ) + channels_axis = 3 if data_format == "channels_last" else 1 + if self.x.shape[channels_axis] not in {1, 3, 4}: + warnings.warn( + 'NumpyArrayIterator is set to use the data format convention "' + + data_format + + '" (channels on axis ' + + str(channels_axis) + + "), i.e. expected either 1, 3, or 4 channels on axis " + + str(channels_axis) + + ". However, it was passed an array with shape " + + str(self.x.shape) + + " (" + + str(self.x.shape[channels_axis]) + + " channels)." + ) + if y is not None: + self.y = np.asarray(y) + else: + self.y = None + if sample_weight is not None: + self.sample_weight = np.asarray(sample_weight) + else: + self.sample_weight = None + self.image_data_generator = image_data_generator + self.data_format = data_format + self.save_to_dir = save_to_dir + self.save_prefix = save_prefix + self.save_format = save_format + super().__init__(x.shape[0], batch_size, shuffle, seed) + + def _get_batches_of_transformed_samples(self, index_array): + batch_x = np.zeros( + tuple([len(index_array)] + list(self.x.shape)[1:]), dtype=self.dtype + ) + for i, j in enumerate(index_array): + x = self.x[j] + params = self.image_data_generator.get_random_transform(x.shape) + x = self.image_data_generator.apply_transform( + x.astype(self.dtype), params + ) + x = self.image_data_generator.standardize(x) + batch_x[i] = x + + if self.save_to_dir: + for i, j in enumerate(index_array): + img = image_utils.array_to_img( + batch_x[i], self.data_format, scale=True + ) + fname = "{prefix}_{index}_{hash}.{format}".format( + prefix=self.save_prefix, + index=j, + hash=np.random.randint(1e4), + format=self.save_format, + ) + img.save(os.path.join(self.save_to_dir, fname)) + batch_x_miscs = [xx[index_array] for xx in self.x_misc] + output = (batch_x if not batch_x_miscs else [batch_x] + batch_x_miscs,) + if self.y is None: + return output[0] + output += (self.y[index_array],) + if self.sample_weight is not None: + output += (self.sample_weight[index_array],) + return output - Args: - directory: string, path to the target directory. It should contain one - subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images inside - each of the subdirectories directory tree will be included in the - generator. See [this script]( - https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) - for more details. 
- target_size: Tuple of integers `(height, width)`, defaults to `(256, - 256)`. The dimensions to which all images found will be resized. - color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb". Whether - the images will be converted to have 1, 3, or 4 channels. - classes: Optional list of class subdirectories - (e.g. `['dogs', 'cats']`). Default: None. If not provided, the list - of classes will be automatically inferred from the subdirectory - names/structure under `directory`, where each subdirectory will be - treated as a different class (and the order of the classes, which - will map to the label indices, will be alphanumeric). The - dictionary containing the mapping from class names to class - indices can be obtained via the attribute `class_indices`. - class_mode: One of "categorical", "binary", "sparse", - "input", or None. Default: "categorical". - Determines the type of label arrays that are returned: - - "categorical" will be 2D one-hot encoded labels, - - "binary" will be 1D binary labels, - "sparse" will be 1D integer labels, - - "input" will be images identical - to input images (mainly used to work with autoencoders). - - If None, no labels are returned - (the generator will only yield batches of image data, - which is useful to use with `model.predict_generator()`). - Please note that in case of class_mode None, - the data still needs to reside in a subdirectory - of `directory` for it to work correctly. - batch_size: Size of the batches of data (default: 32). - shuffle: Whether to shuffle the data (default: True) If set to False, - sorts the data in alphanumeric order. - seed: Optional random seed for shuffling and transformations. - save_to_dir: None or str (default: None). This allows you to optionally - specify a directory to which to save the augmented pictures being - generated (useful for visualizing what you are doing). - save_prefix: Str. Prefix to use for filenames of saved pictures (only - relevant if `save_to_dir` is set). - save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif", "tif", - "jpg" - (only relevant if `save_to_dir` is set). Default: "png". - follow_links: Whether to follow symlinks inside - class subdirectories (default: False). - subset: Subset of data (`"training"` or `"validation"`) if - `validation_split` is set in `ImageDataGenerator`. - interpolation: Interpolation method used to resample the image if the - target size is different from that of the loaded image. Supported - methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL version - 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL - version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also - supported. By default, `"nearest"` is used. - keep_aspect_ratio: Boolean, whether to resize images to a target - size without aspect ratio distortion. The image is cropped in - the center with target aspect ratio before resizing. +def validate_filename(filename, white_list_formats): + """Check if a filename refers to a valid file. + + Args: + filename: String, absolute path to a file + white_list_formats: Set, allowed file extensions Returns: - A `DirectoryIterator` yielding tuples of `(x, y)` - where `x` is a numpy array containing a batch - of images with shape `(batch_size, *target_size, channels)` - and `y` is a numpy array of corresponding labels. 
+ A boolean value indicating if the filename is valid or not """ - return DirectoryIterator( - directory, - self, - target_size=target_size, - color_mode=color_mode, - keep_aspect_ratio=keep_aspect_ratio, - classes=classes, - class_mode=class_mode, - data_format=self.data_format, - batch_size=batch_size, - shuffle=shuffle, - seed=seed, - save_to_dir=save_to_dir, - save_prefix=save_prefix, - save_format=save_format, - follow_links=follow_links, - subset=subset, - interpolation=interpolation, - dtype=self.dtype) - - def flow_from_dataframe(self, - dataframe, - directory=None, - x_col='filename', - y_col='class', - weight_col=None, - target_size=(256, 256), - color_mode='rgb', - classes=None, - class_mode='categorical', - batch_size=32, - shuffle=True, - seed=None, - save_to_dir=None, - save_prefix='', - save_format='png', - subset=None, - interpolation='nearest', - validate_filenames=True, - **kwargs): - """Takes the dataframe and the path to a directory + generates batches. - - The generated batches contain augmented/normalized data. - - **A simple tutorial can be found **[here]( - http://bit.ly/keras_flow_from_dataframe). + return filename.lower().endswith(white_list_formats) and os.path.isfile( + filename + ) + + +class DataFrameIterator(BatchFromFilesMixin, Iterator): + """Iterator capable of reading images from a directory as a dataframe. Args: dataframe: Pandas dataframe containing the filepaths relative to - `directory` (or absolute paths if `directory` is None) of the - images in a string column. It should include other column/s - depending on the `class_mode`: - - if `class_mode` is `"categorical"` (default value) it must - include the `y_col` column with the class/es of each image. - Values in column can be string/list/tuple if a single class - or list/tuple if multiple classes. - - if `class_mode` is `"binary"` or `"sparse"` it must include - the given `y_col` column with class values as strings. + `directory` (or absolute paths if `directory` is None) of the images + in a string column. It should include other column/s depending on the + `class_mode`: - if `class_mode` is `"categorical"` (default value) it + must include the `y_col` column with the class/es of each image. + Values in column can be string/list/tuple if a single class or + list/tuple if multiple classes. + - if `class_mode` is `"binary"` or `"sparse"` it must include the + given `y_col` column with class values as strings. - if `class_mode` is `"raw"` or `"multi_output"` it should contain - the columns specified in `y_col`. + the columns specified in `y_col`. - if `class_mode` is `"input"` or `None` no extra column is needed. directory: string, path to the directory to read images from. If `None`, data in `x_col` column should be absolute paths. + image_data_generator: Instance of `ImageDataGenerator` to use for random + transformations and normalization. If None, no transformations and + normalizations are made. x_col: string, column in `dataframe` that contains the filenames (or absolute paths if `directory` is `None`). y_col: string or list, column/s in `dataframe` that has the target data. weight_col: string, column in `dataframe` that contains the sample weights. Default: `None`. - target_size: tuple of integers `(height, width)`, default: `(256, 256)`. - The dimensions to which all images found will be resized. - color_mode: one of "grayscale", "rgb", "rgba". Default: "rgb". Whether - the images will be converted to have 1 or 3 color channels. - classes: optional list of classes (e.g. `['dogs', 'cats']`). 
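# Sketch of `validate_filename` (defined above) on a hypothetical path: the
# extension check is case-insensitive and the file must actually exist.
white_list_formats = ("png", "jpg", "jpeg", "bmp", "ppm", "tif", "tiff")
ok = validate_filename("/data/images/cat1.PNG", white_list_formats)
# ok is True only when the file exists and its extension is whitelisted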
Default is - None. If not provided, the list of classes will be automatically - inferred from the `y_col`, which will map to the label indices, will - be alphanumeric). The dictionary containing the mapping from class - names to class indices can be obtained via the attribute - `class_indices`. + target_size: tuple of integers, dimensions to resize input images to. + color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. Color mode to read + images. + classes: Optional list of strings, classes to use (e.g. `["dogs", + "cats"]`). If None, all classes in `y_col` will be used. class_mode: one of "binary", "categorical", "input", "multi_output", - "raw", sparse" or None. Default: "categorical". - Mode for yielding the targets: + "raw", "sparse" or None. Default: "categorical". + Mode for yielding the targets: - `"binary"`: 1D numpy array of binary labels, - `"categorical"`: 2D numpy array of one-hot encoded labels. Supports multi-label output. @@ -1557,777 +884,1740 @@ def flow_from_dataframe(self, - `"sparse"`: 1D numpy array of integer labels, - `None`, no targets are returned (the generator will only yield batches of image data, which is useful to use in `model.predict()`). - batch_size: size of the batches of data (default: 32). - shuffle: whether to shuffle the data (default: True) - seed: optional random seed for shuffling and transformations. - save_to_dir: None or str (default: None). This allows you to optionally - specify a directory to which to save the augmented pictures being - generated (useful for visualizing what you are doing). - save_prefix: str. Prefix to use for filenames of saved pictures (only - relevant if `save_to_dir` is set). - save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif", "tif", - "jpg" (only relevant if `save_to_dir` is set). Default: "png". + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + seed: Random seed for data shuffling. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures being + yielded, in a viewable format. This is useful for visualizing the + random transformations being applied, for debugging purposes. + save_prefix: String prefix to use for saving sample images (if + `save_to_dir` is set). + save_format: Format to use for saving sample images (if `save_to_dir` is + set). subset: Subset of data (`"training"` or `"validation"`) if - `validation_split` is set in `ImageDataGenerator`. + validation_split is set in ImageDataGenerator. interpolation: Interpolation method used to resample the image if the target size is different from that of the loaded image. Supported - methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL version - 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL - version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also - supported. By default, `"nearest"` is used. + methods are "nearest", "bilinear", and "bicubic". If PIL version 1.1.3 + or newer is installed, "lanczos" is also supported. If PIL version + 3.4.0 or newer is installed, "box" and "hamming" are also supported. + By default, "nearest" is used. + keep_aspect_ratio: Boolean, whether to resize images to a target size + without aspect ratio distortion. The image is cropped in the center + with target aspect ratio before resizing. + dtype: Dtype to use for the generated arrays. validate_filenames: Boolean, whether to validate image filenames in `x_col`. If `True`, invalid images will be ignored. 
Disabling this - option can lead to speed-up in the execution of this function. - Defaults to `True`. - **kwargs: legacy arguments for raising deprecation warnings. - - Returns: - A `DataFrameIterator` yielding tuples of `(x, y)` - where `x` is a numpy array containing a batch - of images with shape `(batch_size, *target_size, channels)` - and `y` is a numpy array of corresponding labels. + option can lead to speed-up in the instantiation of this class. + Default: `True`. """ - if 'has_ext' in kwargs: - warnings.warn( - 'has_ext is deprecated, filenames in the dataframe have ' - 'to match the exact filenames in disk.', DeprecationWarning) - if 'sort' in kwargs: - warnings.warn( - 'sort is deprecated, batches will be created in the' - 'same order than the filenames provided if shuffle' - 'is set to False.', DeprecationWarning) - if class_mode == 'other': - warnings.warn( - '`class_mode` "other" is deprecated, please use ' - '`class_mode` "raw".', DeprecationWarning) - class_mode = 'raw' - if 'drop_duplicates' in kwargs: - warnings.warn( - 'drop_duplicates is deprecated, you can drop duplicates ' - 'by using the pandas.DataFrame.drop_duplicates method.', - DeprecationWarning) - - return DataFrameIterator( + + allowed_class_modes = { + "binary", + "categorical", + "input", + "multi_output", + "raw", + "sparse", + None, + } + + def __init__( + self, dataframe, + directory=None, + image_data_generator=None, + x_col="filename", + y_col="class", + weight_col=None, + target_size=(256, 256), + color_mode="rgb", + classes=None, + class_mode="categorical", + batch_size=32, + shuffle=True, + seed=None, + data_format="channels_last", + save_to_dir=None, + save_prefix="", + save_format="png", + subset=None, + interpolation="nearest", + keep_aspect_ratio=False, + dtype="float32", + validate_filenames=True, + ): + super().set_processing_attrs( + image_data_generator, + target_size, + color_mode, + data_format, + save_to_dir, + save_prefix, + save_format, + subset, + interpolation, + keep_aspect_ratio, + ) + df = dataframe.copy() + self.directory = directory or "" + self.class_mode = class_mode + self.dtype = dtype + # check that inputs match the required class_mode + self._check_params(df, x_col, y_col, weight_col, classes) + if ( + validate_filenames + ): # check which image files are valid and keep them + df = self._filter_valid_filepaths(df, x_col) + if class_mode not in ["input", "multi_output", "raw", None]: + df, classes = self._filter_classes(df, y_col, classes) + num_classes = len(classes) + # build an index of all the unique classes + self.class_indices = dict(zip(classes, range(len(classes)))) + # retrieve only training or validation set + if self.split: + num_files = len(df) + start = int(self.split[0] * num_files) + stop = int(self.split[1] * num_files) + df = df.iloc[start:stop, :] + # get labels for each observation + if class_mode not in ["input", "multi_output", "raw", None]: + self.classes = self.get_classes(df, y_col) + self.filenames = df[x_col].tolist() + self._sample_weight = df[weight_col].values if weight_col else None + + if class_mode == "multi_output": + self._targets = [np.array(df[col].tolist()) for col in y_col] + if class_mode == "raw": + self._targets = df[y_col].values + self.samples = len(self.filenames) + validated_string = ( + "validated" if validate_filenames else "non-validated" + ) + if class_mode in ["input", "multi_output", "raw", None]: + io_utils.print_msg( + f"Found {self.samples} {validated_string} image filenames." 
+ ) + else: + io_utils.print_msg( + f"Found {self.samples} {validated_string} image filenames " + f"belonging to {num_classes} classes." + ) + self._filepaths = [ + os.path.join(self.directory, fname) for fname in self.filenames + ] + super().__init__(self.samples, batch_size, shuffle, seed) + + def _check_params(self, df, x_col, y_col, weight_col, classes): + # check class mode is one of the currently supported + if self.class_mode not in self.allowed_class_modes: + raise ValueError( + "Invalid class_mode: {}; expected one of: {}".format( + self.class_mode, self.allowed_class_modes + ) + ) + # check that y_col has several column names if class_mode is + # multi_output + if (self.class_mode == "multi_output") and not isinstance(y_col, list): + raise TypeError( + 'If class_mode="{}", y_col must be a list. Received {}.'.format( + self.class_mode, type(y_col).__name__ + ) + ) + # check that filenames/filepaths column values are all strings + if not all(df[x_col].apply(lambda x: isinstance(x, str))): + raise TypeError( + f"All values in column x_col={x_col} must be strings." + ) + # check labels are string if class_mode is binary or sparse + if self.class_mode in {"binary", "sparse"}: + if not all(df[y_col].apply(lambda x: isinstance(x, str))): + raise TypeError( + 'If class_mode="{}", y_col="{}" column ' + "values must be strings.".format(self.class_mode, y_col) + ) + # check that if binary there are only 2 different classes + if self.class_mode == "binary": + if classes: + classes = set(classes) + if len(classes) != 2: + raise ValueError( + 'If class_mode="binary" there must be 2 ' + "classes. {} class/es were given.".format(len(classes)) + ) + elif df[y_col].nunique() != 2: + raise ValueError( + 'If class_mode="binary" there must be 2 classes. ' + "Found {} classes.".format(df[y_col].nunique()) + ) + # check values are string, list or tuple if class_mode is categorical + if self.class_mode == "categorical": + types = (str, list, tuple) + if not all(df[y_col].apply(lambda x: isinstance(x, types))): + raise TypeError( + 'If class_mode="{}", y_col="{}" column ' + "values must be type string, list or tuple.".format( + self.class_mode, y_col + ) + ) + # raise warning if classes are given but will be unused + if classes and self.class_mode in { + "input", + "multi_output", + "raw", + None, + }: + warnings.warn( + '`classes` will be ignored given the class_mode="{}"'.format( + self.class_mode + ) + ) + # check that if weight column that the values are numerical + if weight_col and not issubclass(df[weight_col].dtype.type, np.number): + raise TypeError(f"Column weight_col={weight_col} must be numeric.") + + def get_classes(self, df, y_col): + labels = [] + for label in df[y_col]: + if isinstance(label, (list, tuple)): + labels.append([self.class_indices[lbl] for lbl in label]) + else: + labels.append(self.class_indices[label]) + return labels + + @staticmethod + def _filter_classes(df, y_col, classes): + df = df.copy() + + def remove_classes(labels, classes): + if isinstance(labels, (list, tuple)): + labels = [cls for cls in labels if cls in classes] + return labels or None + elif isinstance(labels, str): + return labels if labels in classes else None + else: + raise TypeError( + "Expect string, list or tuple " + "but found {} in {} column ".format(type(labels), y_col) + ) + + if classes: + # prepare for membership lookup + classes = list(collections.OrderedDict.fromkeys(classes).keys()) + df[y_col] = df[y_col].apply(lambda x: remove_classes(x, classes)) + else: + classes = set() + for v in 
df[y_col]: + if isinstance(v, (list, tuple)): + classes.update(v) + else: + classes.add(v) + classes = sorted(classes) + return df.dropna(subset=[y_col]), classes + + def _filter_valid_filepaths(self, df, x_col): + """Keep only dataframe rows with valid filenames. + + Args: + df: Pandas dataframe containing filenames in a column + x_col: string, column in `df` that contains the filenames or + filepaths + Returns: + absolute paths to image files + """ + filepaths = df[x_col].map( + lambda fname: os.path.join(self.directory, fname) + ) + mask = filepaths.apply( + validate_filename, args=(self.white_list_formats,) + ) + n_invalid = (~mask).sum() + if n_invalid: + warnings.warn( + 'Found {} invalid image filename(s) in x_col="{}". ' + "These filename(s) will be ignored.".format(n_invalid, x_col) + ) + return df[mask] + + @property + def filepaths(self): + return self._filepaths + + @property + def labels(self): + if self.class_mode in {"multi_output", "raw"}: + return self._targets + else: + return self.classes + + @property + def sample_weight(self): + return self._sample_weight + + +def flip_axis(x, axis): + x = np.asarray(x).swapaxes(axis, 0) + x = x[::-1, ...] + x = x.swapaxes(0, axis) + return x + + +@keras_export("keras.preprocessing.image.ImageDataGenerator") +class ImageDataGenerator: + """Generate batches of tensor image data with real-time data augmentation. + + Deprecated: `tf.keras.preprocessing.image.ImageDataGenerator` is not + recommended for new code. Prefer loading images with + `tf.keras.utils.image_dataset_from_directory` and transforming the output + `tf.data.Dataset` with preprocessing layers. For more information, see the + tutorials for [loading images]( + https://www.tensorflow.org/tutorials/load_data/images) and + [augmenting images]( + https://www.tensorflow.org/tutorials/images/data_augmentation), as well as + the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers). + + The data will be looped over (in batches). + + Args: + featurewise_center: Boolean. Set input mean to 0 over the dataset, + feature-wise. + samplewise_center: Boolean. Set each sample mean to 0. + featurewise_std_normalization: Boolean. Divide inputs by std of the + dataset, feature-wise. + samplewise_std_normalization: Boolean. Divide each input by its std. + zca_epsilon: epsilon for ZCA whitening. Default is 1e-6. + zca_whitening: Boolean. Apply ZCA whitening. + rotation_range: Int. Degree range for random rotations. + width_shift_range: Float, 1-D array-like or int + - float: fraction of total width, if < 1, or pixels if >= 1. + - 1-D array-like: random elements from the array. + - int: integer number of pixels from interval `(-width_shift_range, + +width_shift_range)` - With `width_shift_range=2` possible values + are integers `[-1, 0, +1]`, same as with `width_shift_range=[-1, + 0, +1]`, while with `width_shift_range=1.0` possible values are + floats in the interval [-1.0, +1.0). + height_shift_range: Float, 1-D array-like or int + - float: fraction of total height, if < 1, or pixels if >= 1. + - 1-D array-like: random elements from the array. + - int: integer number of pixels from interval `(-height_shift_range, + +height_shift_range)` - With `height_shift_range=2` possible + values are integers `[-1, 0, +1]`, same as with + `height_shift_range=[-1, 0, +1]`, while with + `height_shift_range=1.0` possible values are floats in the + interval [-1.0, +1.0). + brightness_range: Tuple or list of two floats. Range for picking a + brightness shift value from. 
+ shear_range: Float. Shear Intensity (Shear angle in counter-clockwise + direction in degrees) + zoom_range: Float or [lower, upper]. Range for random zoom. If a float, + `[lower, upper] = [1-zoom_range, 1+zoom_range]`. + channel_shift_range: Float. Range for random channel shifts. + fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}. Default + is 'nearest'. Points outside the boundaries of the input are filled + according to the given mode: + - 'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k) + - 'nearest': aaaaaaaa|abcd|dddddddd + - 'reflect': abcddcba|abcd|dcbaabcd + - 'wrap': abcdabcd|abcd|abcdabcd + cval: Float or Int. Value used for points outside the boundaries when + `fill_mode = "constant"`. + horizontal_flip: Boolean. Randomly flip inputs horizontally. + vertical_flip: Boolean. Randomly flip inputs vertically. + rescale: rescaling factor. If None or 0, no rescaling + is applied, otherwise we multiply the data by the value provided + (after applying all other transformations). Defaults to `None`. + preprocessing_function: function that will be applied on each input. The + function will run after the image is resized and augmented. + The function should take one argument: one image (Numpy tensor with + rank 3), and should output a Numpy tensor with the same shape. + data_format: Image data format, either "channels_first" or + "channels_last". "channels_last" mode means that the images should + have shape `(samples, height, width, channels)`, "channels_first" mode + means that the images should have shape `(samples, channels, height, + width)`. When unspecified, uses `image_data_format` value found in + your Keras config file at `~/.keras/keras.json` (if exists) else + 'channels_last'. Defaults to "channels_last". + validation_split: Float. Fraction of images reserved for validation + (strictly between 0 and 1). + dtype: Dtype to use for the generated arrays. + + Raises: + ValueError: If the value of the argument, `data_format` is other than + `"channels_last"` or `"channels_first"`. + ValueError: If the value of the argument, `validation_split` > 1 + or `validation_split` < 0. 
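The shift-range semantics above (int vs. float vs. 1-D array-like) are easy to misread, so here is a minimal sketch that just prints the horizontal shift drawn for each accepted form. It is hedged: it assumes the standalone `keras` import path, the shapes and seed are made up, and it relies on `get_random_transform` (defined later in this hunk), where the horizontal shift is stored under the `ty` key. The library's own, fuller examples follow.

```python
# Minimal sketch of the width_shift_range semantics documented above.
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

img_shape = (100, 100, 3)  # (height, width, channels), made up
for wsr in (2, [-1, 0, +1], 0.2):
    gen = ImageDataGenerator(width_shift_range=wsr)
    params = gen.get_random_transform(img_shape, seed=1)
    # 2           -> integer pixels from (-2, +2), i.e. {-1, 0, +1}
    # [-1, 0, +1] -> a random element of the array (pixels here)
    # 0.2         -> uniform fraction of total width, scaled to pixels
    print(wsr, "->", params["ty"])
```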
+ + Examples: + + Example of using `.flow(x, y)`: + + ```python + (x_train, y_train), (x_test, y_test) = cifar10.load_data() + y_train = utils.to_categorical(y_train, num_classes) + y_test = utils.to_categorical(y_test, num_classes) + datagen = ImageDataGenerator( + featurewise_center=True, + featurewise_std_normalization=True, + rotation_range=20, + width_shift_range=0.2, + height_shift_range=0.2, + horizontal_flip=True, + validation_split=0.2) + # compute quantities required for featurewise normalization + # (std, mean, and principal components if ZCA whitening is applied) + datagen.fit(x_train) + # fits the model on batches with real-time data augmentation: + model.fit(datagen.flow(x_train, y_train, batch_size=32, + subset='training'), + validation_data=datagen.flow(x_train, y_train, + batch_size=8, subset='validation'), + steps_per_epoch=len(x_train) / 32, epochs=epochs) + # here's a more "manual" example + for e in range(epochs): + print('Epoch', e) + batches = 0 + for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32): + model.fit(x_batch, y_batch) + batches += 1 + if batches >= len(x_train) / 32: + # we need to break the loop by hand because + # the generator loops indefinitely + break + ``` + + Example of using `.flow_from_directory(directory)`: + + ```python + train_datagen = ImageDataGenerator( + rescale=1./255, + shear_range=0.2, + zoom_range=0.2, + horizontal_flip=True) + test_datagen = ImageDataGenerator(rescale=1./255) + train_generator = train_datagen.flow_from_directory( + 'data/train', + target_size=(150, 150), + batch_size=32, + class_mode='binary') + validation_generator = test_datagen.flow_from_directory( + 'data/validation', + target_size=(150, 150), + batch_size=32, + class_mode='binary') + model.fit( + train_generator, + steps_per_epoch=2000, + epochs=50, + validation_data=validation_generator, + validation_steps=800) + ``` + + Example of transforming images and masks together. 
+ + ```python + # we create two instances with the same arguments + data_gen_args = dict(featurewise_center=True, + featurewise_std_normalization=True, + rotation_range=90, + width_shift_range=0.1, + height_shift_range=0.1, + zoom_range=0.2) + image_datagen = ImageDataGenerator(**data_gen_args) + mask_datagen = ImageDataGenerator(**data_gen_args) + # Provide the same seed and keyword arguments to the fit and flow methods + seed = 1 + image_datagen.fit(images, augment=True, seed=seed) + mask_datagen.fit(masks, augment=True, seed=seed) + image_generator = image_datagen.flow_from_directory( + 'data/images', + class_mode=None, + seed=seed) + mask_generator = mask_datagen.flow_from_directory( + 'data/masks', + class_mode=None, + seed=seed) + # combine generators into one which yields image and masks + train_generator = zip(image_generator, mask_generator) + model.fit( + train_generator, + steps_per_epoch=2000, + epochs=50) + ``` + """ + + def __init__( + self, + featurewise_center=False, + samplewise_center=False, + featurewise_std_normalization=False, + samplewise_std_normalization=False, + zca_whitening=False, + zca_epsilon=1e-6, + rotation_range=0, + width_shift_range=0.0, + height_shift_range=0.0, + brightness_range=None, + shear_range=0.0, + zoom_range=0.0, + channel_shift_range=0.0, + fill_mode="nearest", + cval=0.0, + horizontal_flip=False, + vertical_flip=False, + rescale=None, + preprocessing_function=None, + data_format=None, + validation_split=0.0, + interpolation_order=1, + dtype=None, + ): + if data_format is None: + data_format = backend.image_data_format() + if dtype is None: + dtype = backend.floatx() + + self.featurewise_center = featurewise_center + self.samplewise_center = samplewise_center + self.featurewise_std_normalization = featurewise_std_normalization + self.samplewise_std_normalization = samplewise_std_normalization + self.zca_whitening = zca_whitening + self.zca_epsilon = zca_epsilon + self.rotation_range = rotation_range + self.width_shift_range = width_shift_range + self.height_shift_range = height_shift_range + self.shear_range = shear_range + self.zoom_range = zoom_range + self.channel_shift_range = channel_shift_range + self.fill_mode = fill_mode + self.cval = cval + self.horizontal_flip = horizontal_flip + self.vertical_flip = vertical_flip + self.rescale = rescale + self.preprocessing_function = preprocessing_function + self.dtype = dtype + self.interpolation_order = interpolation_order + + if data_format not in {"channels_last", "channels_first"}: + raise ValueError( + '`data_format` should be `"channels_last"` ' + "(channel after row and column) or " + '`"channels_first"` (channel before row and column). ' + "Received: %s" % data_format + ) + self.data_format = data_format + if data_format == "channels_first": + self.channel_axis = 1 + self.row_axis = 2 + self.col_axis = 3 + if data_format == "channels_last": + self.channel_axis = 3 + self.row_axis = 1 + self.col_axis = 2 + if validation_split and not 0 < validation_split < 1: + raise ValueError( + "`validation_split` must be strictly between 0 and 1. 
" + " Received: %s" % validation_split + ) + self._validation_split = validation_split + + self.mean = None + self.std = None + self.zca_whitening_matrix = None + + if isinstance(zoom_range, (float, int)): + self.zoom_range = [1 - zoom_range, 1 + zoom_range] + elif len(zoom_range) == 2 and all( + isinstance(val, (float, int)) for val in zoom_range + ): + self.zoom_range = [zoom_range[0], zoom_range[1]] + else: + raise ValueError( + "`zoom_range` should be a float or " + "a tuple or list of two floats. " + "Received: %s" % (zoom_range,) + ) + if zca_whitening: + if not featurewise_center: + self.featurewise_center = True + warnings.warn( + "This ImageDataGenerator specifies " + "`zca_whitening`, which overrides " + "setting of `featurewise_center`." + ) + if featurewise_std_normalization: + self.featurewise_std_normalization = False + warnings.warn( + "This ImageDataGenerator specifies " + "`zca_whitening` " + "which overrides setting of" + "`featurewise_std_normalization`." + ) + if featurewise_std_normalization: + if not featurewise_center: + self.featurewise_center = True + warnings.warn( + "This ImageDataGenerator specifies " + "`featurewise_std_normalization`, " + "which overrides setting of " + "`featurewise_center`." + ) + if samplewise_std_normalization: + if not samplewise_center: + self.samplewise_center = True + warnings.warn( + "This ImageDataGenerator specifies " + "`samplewise_std_normalization`, " + "which overrides setting of " + "`samplewise_center`." + ) + if brightness_range is not None: + if ( + not isinstance(brightness_range, (tuple, list)) + or len(brightness_range) != 2 + ): + raise ValueError( + "`brightness_range should be tuple or list of two floats. " + "Received: %s" % (brightness_range,) + ) + self.brightness_range = brightness_range + + def flow( + self, + x, + y=None, + batch_size=32, + shuffle=True, + sample_weight=None, + seed=None, + save_to_dir=None, + save_prefix="", + save_format="png", + ignore_class_split=False, + subset=None, + ): + """Takes data & label arrays, generates batches of augmented data. + + Args: + x: Input data. Numpy array of rank 4 or a tuple. If tuple, the first + element should contain the images and the second element another + numpy array or a list of numpy arrays that gets passed to the + output without any modifications. Can be used to feed the model + miscellaneous data along with the images. In case of grayscale + data, the channels axis of the image array should have value 1, in + case of RGB data, it should have value 3, and in case of RGBA + data, it should have value 4. + y: Labels. + batch_size: Int (default: 32). + shuffle: Boolean (default: True). + sample_weight: Sample weights. + seed: Int (default: None). + save_to_dir: None or str (default: None). This allows you to + optionally specify a directory to which to save the augmented + pictures being generated (useful for visualizing what you are + doing). + save_prefix: Str (default: `''`). Prefix to use for filenames of + saved pictures (only relevant if `save_to_dir` is set). + save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif", + "tif", "jpg" (only relevant if `save_to_dir` is set). Default: + "png". + ignore_class_split: Boolean (default: False), ignore difference + in number of classes in labels across train and validation + split (useful for non-classification tasks) + subset: Subset of data (`"training"` or `"validation"`) if + `validation_split` is set in `ImageDataGenerator`. 
+ + Returns: + An `Iterator` yielding tuples of `(x, y)` + where `x` is a numpy array of image data + (in the case of a single image input) or a list + of numpy arrays (in the case with + additional inputs) and `y` is a numpy array + of corresponding labels. If `sample_weight` is not None, + the yielded tuples are of the form `(x, y, sample_weight)`. + If `y` is None, only the numpy array `x` is returned. + Raises: + ValueError: If the value of `subset` is other than + `"training"` or `"validation"`. + + """ + return NumpyArrayIterator( + x, + y, + self, + batch_size=batch_size, + shuffle=shuffle, + sample_weight=sample_weight, + seed=seed, + data_format=self.data_format, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + ignore_class_split=ignore_class_split, + subset=subset, + dtype=self.dtype, + ) + + def flow_from_directory( + self, directory, + target_size=(256, 256), + color_mode="rgb", + classes=None, + class_mode="categorical", + batch_size=32, + shuffle=True, + seed=None, + save_to_dir=None, + save_prefix="", + save_format="png", + follow_links=False, + subset=None, + interpolation="nearest", + keep_aspect_ratio=False, + ): + """Takes the path to a directory & generates batches of augmented data. + + Args: + directory: string, path to the target directory. It should contain + one subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images + inside each of the subdirectories directory tree will be included + in the generator. See [this script]( + https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) + for more details. + target_size: Tuple of integers `(height, width)`. The dimensions to + which all images found will be resized. Defaults to `(256, 256)`. + color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb". + Whether the images will be converted to have 1, 3, or 4 channels. + classes: Optional list of class subdirectories (e.g. `['dogs', + 'cats']`). Default: None. If not provided, the list of classes + will be automatically inferred from the subdirectory + names/structure under `directory`, where each subdirectory will be + treated as a different class (and the order of the classes, which + will map to the label indices, will be alphanumeric). The + dictionary containing the mapping from class names to class + indices can be obtained via the attribute `class_indices`. + class_mode: One of "categorical", "binary", "sparse", + "input", or None. + Determines the type of label arrays that are returned: + - "categorical" will be 2D one-hot encoded labels, + - "binary" will be 1D binary labels, + - "sparse" will be 1D integer labels, + - "input" will be images identical + to input images (mainly used to work with autoencoders). + - If None, no labels are returned + (the generator will only yield batches of image data, + which is useful to use with `model.predict_generator()`). + Please note that in case of class_mode None, + the data still needs to reside in a subdirectory + of `directory` for it to work correctly. + Defaults to "categorical". + batch_size: Size of the batches of data. Defaults to `32`. + shuffle: Whether to shuffle the data. If `False`, sorts the + data in alphanumeric order. Defaults to `True`. + seed: Optional random seed for shuffling and transformations. + save_to_dir: None or str (default: None). This allows you to + optionally specify a directory to which to save the augmented + pictures being generated (useful for visualizing what you are + doing). + save_prefix: Str.
Prefix to use for filenames of saved pictures + (only relevant if `save_to_dir` is set). + save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif", + "tif", "jpg" (only relevant if `save_to_dir` is set). + Defaults to "png". + follow_links: Whether to follow symlinks inside + class subdirectories. Defaults to `False`. + subset: Subset of data (`"training"` or `"validation"`) if + `validation_split` is set in `ImageDataGenerator`. + interpolation: Interpolation method used to resample the image if + the target size is different from that of the loaded image. + Supported methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. + If PIL version 1.1.3 or newer is installed, `"lanczos"` is also + supported. If PIL version 3.4.0 or newer is installed, `"box"` and + `"hamming"` are also supported. Defaults to `"nearest"`. + keep_aspect_ratio: Boolean, whether to resize images to a target + size without aspect ratio distortion. The image is cropped in + the center with target aspect ratio before resizing. + + Returns: + A `DirectoryIterator` yielding tuples of `(x, y)` + where `x` is a numpy array containing a batch + of images with shape `(batch_size, *target_size, channels)` + and `y` is a numpy array of corresponding labels. + """ + return DirectoryIterator( + directory, + self, + target_size=target_size, + color_mode=color_mode, + keep_aspect_ratio=keep_aspect_ratio, + classes=classes, + class_mode=class_mode, + data_format=self.data_format, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + follow_links=follow_links, + subset=subset, + interpolation=interpolation, + dtype=self.dtype, + ) + + def flow_from_dataframe( self, - x_col=x_col, - y_col=y_col, - weight_col=weight_col, - target_size=target_size, - color_mode=color_mode, - classes=classes, - class_mode=class_mode, - data_format=self.data_format, - batch_size=batch_size, - shuffle=shuffle, - seed=seed, - save_to_dir=save_to_dir, - save_prefix=save_prefix, - save_format=save_format, - subset=subset, - interpolation=interpolation, - validate_filenames=validate_filenames, - dtype=self.dtype) - - def standardize(self, x): - """Applies the normalization configuration in-place to a batch of inputs. - - `x` is changed in-place since the function is mainly used internally - to standardize images and feed them to your network. If a copy of `x` - would be created instead it would have a significant performance cost. - If you want to apply this method without changing the input in-place - you can call the method creating a copy before: - - standardize(np.copy(x)) + dataframe, + directory=None, + x_col="filename", + y_col="class", + weight_col=None, + target_size=(256, 256), + color_mode="rgb", + classes=None, + class_mode="categorical", + batch_size=32, + shuffle=True, + seed=None, + save_to_dir=None, + save_prefix="", + save_format="png", + subset=None, + interpolation="nearest", + validate_filenames=True, + **kwargs, + ): + """Takes the dataframe and the path to a directory + generates batches. + + The generated batches contain augmented/normalized data. + + **A simple tutorial can be found **[here]( + http://bit.ly/keras_flow_from_dataframe). + + Args: + dataframe: Pandas dataframe containing the filepaths relative to + `directory` (or absolute paths if `directory` is None) of the + images in a string column. 
It should include other column/s + depending on the `class_mode`: + - if `class_mode` is `"categorical"` (default value) it must + include the `y_col` column with the class/es of each image. + Values in column can be string/list/tuple if a single class + or list/tuple if multiple classes. + - if `class_mode` is `"binary"` or `"sparse"` it must include + the given `y_col` column with class values as strings. + - if `class_mode` is `"raw"` or `"multi_output"` it should + contain the columns specified in `y_col`. + - if `class_mode` is `"input"` or `None` no extra column is + needed. + directory: string, path to the directory to read images from. If + `None`, data in `x_col` column should be absolute paths. + x_col: string, column in `dataframe` that contains the filenames (or + absolute paths if `directory` is `None`). + y_col: string or list, column/s in `dataframe` that has the target + data. + weight_col: string, column in `dataframe` that contains the sample + weights. Default: `None`. + target_size: tuple of integers `(height, width)`, default: `(256, + 256)`. The dimensions to which all images found will be resized. + color_mode: one of "grayscale", "rgb", "rgba". Default: "rgb". + Whether the images will be converted to have 1, 3, or 4 + channels. + classes: optional list of classes (e.g. `['dogs', 'cats']`). Default + is None. If not provided, the list of classes will be + automatically inferred from `y_col` (and the order of the classes, + which will map to the label indices, will be alphanumeric). The + dictionary containing the mapping from class names to class + indices can be obtained via the attribute `class_indices`. + class_mode: one of "binary", "categorical", "input", "multi_output", + "raw", "sparse" or None. Default: "categorical". + Mode for yielding the targets: + - `"binary"`: 1D numpy array of binary labels, + - `"categorical"`: 2D numpy array of one-hot encoded labels. + Supports multi-label output. + - `"input"`: images identical to input images (mainly used to + work with autoencoders), + - `"multi_output"`: list with the values of the different + columns, + - `"raw"`: numpy array of values in `y_col` column(s), + - `"sparse"`: 1D numpy array of integer labels, + - `None`, no targets are returned (the generator will only yield + batches of image data, which is useful to use in + `model.predict()`). + batch_size: size of the batches of data (default: 32). + shuffle: whether to shuffle the data (default: True). + seed: optional random seed for shuffling and transformations. + save_to_dir: None or str (default: None). This allows you to + optionally specify a directory to which to save the augmented + pictures being generated (useful for visualizing what you are + doing). + save_prefix: str. Prefix to use for filenames of saved pictures + (only relevant if `save_to_dir` is set). + save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif", + "tif", "jpg" (only relevant if `save_to_dir` is set). Default: + "png". + subset: Subset of data (`"training"` or `"validation"`) if + `validation_split` is set in `ImageDataGenerator`. + interpolation: Interpolation method used to resample the image if + the target size is different from that of the loaded image. + Supported methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. + If PIL version 1.1.3 or newer is installed, `"lanczos"` is also + supported. If PIL version 3.4.0 or newer is installed, `"box"` and + `"hamming"` are also supported. By default, `"nearest"` is used.
+ validate_filenames: Boolean, whether to validate image filenames in + `x_col`. If `True`, invalid images will be ignored. Disabling this + option can lead to speed-up in the execution of this function. + Defaults to `True`. + **kwargs: legacy arguments for raising deprecation warnings. + + Returns: + A `DataFrameIterator` yielding tuples of `(x, y)` + where `x` is a numpy array containing a batch + of images with shape `(batch_size, *target_size, channels)` + and `y` is a numpy array of corresponding labels. + """ + if "has_ext" in kwargs: + warnings.warn( + "has_ext is deprecated, filenames in the dataframe have " + "to match the exact filenames on disk.", + DeprecationWarning, + ) + if "sort" in kwargs: + warnings.warn( + "sort is deprecated, batches will be created in the " + "same order as the filenames provided if shuffle " + "is set to False.", + DeprecationWarning, + ) + if class_mode == "other": + warnings.warn( + '`class_mode` "other" is deprecated, please use ' + '`class_mode` "raw".', + DeprecationWarning, + ) + class_mode = "raw" + if "drop_duplicates" in kwargs: + warnings.warn( + "drop_duplicates is deprecated, you can drop duplicates " + "by using the pandas.DataFrame.drop_duplicates method.", + DeprecationWarning, + ) + + return DataFrameIterator( + dataframe, + directory, + self, + x_col=x_col, + y_col=y_col, + weight_col=weight_col, + target_size=target_size, + color_mode=color_mode, + classes=classes, + class_mode=class_mode, + data_format=self.data_format, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + subset=subset, + interpolation=interpolation, + validate_filenames=validate_filenames, + dtype=self.dtype, + ) + + def standardize(self, x): + """Applies the normalization configuration in-place to a batch of + inputs. + + `x` is changed in-place since the function is mainly used internally + to standardize images and feed them to your network. If a copy of `x` + would be created instead it would have a significant performance cost. + If you want to apply this method without changing the input in-place + you can call the method creating a copy before: + + standardize(np.copy(x)) + + Args: + x: Batch of inputs to be normalized. + + Returns: + The inputs, normalized. + """ + if self.preprocessing_function: + x = self.preprocessing_function(x) + if self.rescale: + x *= self.rescale + if self.samplewise_center: + x -= np.mean(x, keepdims=True) + if self.samplewise_std_normalization: + x /= np.std(x, keepdims=True) + 1e-6 + + if self.featurewise_center: + if self.mean is not None: + x -= self.mean + else: + warnings.warn( + "This ImageDataGenerator specifies " + "`featurewise_center`, but it hasn't " + "been fit on any training data. Fit it " + "first by calling `.fit(numpy_data)`." + ) + if self.featurewise_std_normalization: + if self.std is not None: + x /= self.std + 1e-6 + else: + warnings.warn( + "This ImageDataGenerator specifies " + "`featurewise_std_normalization`, " + "but it hasn't " + "been fit on any training data. Fit it " + "first by calling `.fit(numpy_data)`." + ) + if self.zca_whitening: + if self.zca_whitening_matrix is not None: + flat_x = x.reshape(-1, np.prod(x.shape[-3:])) + white_x = flat_x @ self.zca_whitening_matrix + x = np.reshape(white_x, x.shape) + else: + warnings.warn( + "This ImageDataGenerator specifies " + "`zca_whitening`, but it hasn't " + "been fit on any training data. Fit it " + "first by calling `.fit(numpy_data)`."
+ ) + return x + + def get_random_transform(self, img_shape, seed=None): + """Generates random parameters for a transformation. + + Args: + img_shape: Tuple of integers. + Shape of the image that is transformed. + seed: Random seed. + + Returns: + A dictionary containing randomly chosen parameters describing the + transformation. + """ + img_row_axis = self.row_axis - 1 + img_col_axis = self.col_axis - 1 + + if seed is not None: + np.random.seed(seed) + + if self.rotation_range: + theta = np.random.uniform(-self.rotation_range, self.rotation_range) + else: + theta = 0 + + if self.height_shift_range: + try: # 1-D array-like or int + tx = np.random.choice(self.height_shift_range) + tx *= np.random.choice([-1, 1]) + except ValueError: # floating point + tx = np.random.uniform( + -self.height_shift_range, self.height_shift_range + ) + if np.max(self.height_shift_range) < 1: + tx *= img_shape[img_row_axis] + else: + tx = 0 + + if self.width_shift_range: + try: # 1-D array-like or int + ty = np.random.choice(self.width_shift_range) + ty *= np.random.choice([-1, 1]) + except ValueError: # floating point + ty = np.random.uniform( + -self.width_shift_range, self.width_shift_range + ) + if np.max(self.width_shift_range) < 1: + ty *= img_shape[img_col_axis] + else: + ty = 0 + + if self.shear_range: + shear = np.random.uniform(-self.shear_range, self.shear_range) + else: + shear = 0 + + if self.zoom_range[0] == 1 and self.zoom_range[1] == 1: + zx, zy = 1, 1 + else: + zx, zy = np.random.uniform( + self.zoom_range[0], self.zoom_range[1], 2 + ) + + flip_horizontal = (np.random.random() < 0.5) * self.horizontal_flip + flip_vertical = (np.random.random() < 0.5) * self.vertical_flip + + channel_shift_intensity = None + if self.channel_shift_range != 0: + channel_shift_intensity = np.random.uniform( + -self.channel_shift_range, self.channel_shift_range + ) + + brightness = None + if self.brightness_range is not None: + brightness = np.random.uniform( + self.brightness_range[0], self.brightness_range[1] + ) + + transform_parameters = { + "theta": theta, + "tx": tx, + "ty": ty, + "shear": shear, + "zx": zx, + "zy": zy, + "flip_horizontal": flip_horizontal, + "flip_vertical": flip_vertical, + "channel_shift_intensity": channel_shift_intensity, + "brightness": brightness, + } + + return transform_parameters + + def apply_transform(self, x, transform_parameters): + """Applies a transformation to an image according to given parameters. + + Args: + x: 3D tensor, single image. + transform_parameters: Dictionary with string - parameter pairs + describing the transformation. + Currently, the following parameters + from the dictionary are used: + - `'theta'`: Float. Rotation angle in degrees. + - `'tx'`: Float. Shift in the x direction. + - `'ty'`: Float. Shift in the y direction. + - `'shear'`: Float. Shear angle in degrees. + - `'zx'`: Float. Zoom in the x direction. + - `'zy'`: Float. Zoom in the y direction. + - `'flip_horizontal'`: Boolean. Horizontal flip. + - `'flip_vertical'`: Boolean. Vertical flip. + - `'channel_shift_intensity'`: Float. Channel shift intensity. + - `'brightness'`: Float. Brightness shift intensity. + + Returns: + A transformed version of the input (same shape). 
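Because `get_random_transform` returns a plain parameter dictionary and `apply_transform` consumes one, the two can be decoupled: draw the parameters once, then apply them to several tensors. A hedged sketch of the image-and-mask pattern from the class docstring above (random data, default channels_last layout assumed):

```python
# Draw transform parameters once, then apply them to both an image
# and its mask so the two stay geometrically aligned.
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

gen = ImageDataGenerator(rotation_range=30, horizontal_flip=True)
image = np.random.rand(64, 64, 3)  # single image, rank 3
mask = np.random.rand(64, 64, 1)

params = gen.get_random_transform(image.shape, seed=7)
aug_image = gen.apply_transform(image, params)
aug_mask = gen.apply_transform(mask, params)  # identical geometry
print(params["theta"], aug_image.shape, aug_mask.shape)
```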
+ """ + # x is a single image, so it doesn't have image number at index 0 + img_row_axis = self.row_axis - 1 + img_col_axis = self.col_axis - 1 + img_channel_axis = self.channel_axis - 1 + + x = apply_affine_transform( + x, + transform_parameters.get("theta", 0), + transform_parameters.get("tx", 0), + transform_parameters.get("ty", 0), + transform_parameters.get("shear", 0), + transform_parameters.get("zx", 1), + transform_parameters.get("zy", 1), + row_axis=img_row_axis, + col_axis=img_col_axis, + channel_axis=img_channel_axis, + fill_mode=self.fill_mode, + cval=self.cval, + order=self.interpolation_order, + ) + + if transform_parameters.get("channel_shift_intensity") is not None: + x = apply_channel_shift( + x, + transform_parameters["channel_shift_intensity"], + img_channel_axis, + ) + + if transform_parameters.get("flip_horizontal", False): + x = flip_axis(x, img_col_axis) + + if transform_parameters.get("flip_vertical", False): + x = flip_axis(x, img_row_axis) + + if transform_parameters.get("brightness") is not None: + x = apply_brightness_shift( + x, transform_parameters["brightness"], False + ) + + return x + + def random_transform(self, x, seed=None): + """Applies a random transformation to an image. + + Args: + x: 3D tensor, single image. + seed: Random seed. + + Returns: + A randomly transformed version of the input (same shape). + """ + params = self.get_random_transform(x.shape, seed) + return self.apply_transform(x, params) + + def fit(self, x, augment=False, rounds=1, seed=None): + """Fits the data generator to some sample data. + + This computes the internal data stats related to the + data-dependent transformations, based on an array of sample data. + + Only required if `featurewise_center` or + `featurewise_std_normalization` or `zca_whitening` are set to True. + + When `rescale` is set to a value, rescaling is applied to + sample data before computing the internal data stats. + + Args: + x: Sample data. Should have rank 4. + In case of grayscale data, + the channels axis should have value 1, in case + of RGB data, it should have value 3, and in case + of RGBA data, it should have value 4. + augment: Boolean (default: False). + Whether to fit on randomly augmented samples. + rounds: Int (default: 1). + If using data augmentation (`augment=True`), + this is how many augmentation passes over the data to use. + seed: Int (default: None). Random seed. + """ + x = np.asarray(x, dtype=self.dtype) + if x.ndim != 4: + raise ValueError( + "Input to `.fit()` should have rank 4. Got array with shape: " + + str(x.shape) + ) + if x.shape[self.channel_axis] not in {1, 3, 4}: + warnings.warn( + "Expected input to be images (as Numpy array) " + 'following the data format convention "' + + self.data_format + + '" (channels on axis ' + + str(self.channel_axis) + + "), i.e. expected either 1, 3 or 4 channels on axis " + + str(self.channel_axis) + + ". However, it was passed an array with shape " + + str(x.shape) + + " (" + + str(x.shape[self.channel_axis]) + + " channels)." 
+ ) + + if seed is not None: + np.random.seed(seed) + + x = np.copy(x) + if self.rescale: + x *= self.rescale + + if augment: + ax = np.zeros( + tuple([rounds * x.shape[0]] + list(x.shape)[1:]), + dtype=self.dtype, + ) + for r in range(rounds): + for i in range(x.shape[0]): + ax[i + r * x.shape[0]] = self.random_transform(x[i]) + x = ax + + if self.featurewise_center: + self.mean = np.mean(x, axis=(0, self.row_axis, self.col_axis)) + broadcast_shape = [1, 1, 1] + broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis] + self.mean = np.reshape(self.mean, broadcast_shape) + x -= self.mean + + if self.featurewise_std_normalization: + self.std = np.std(x, axis=(0, self.row_axis, self.col_axis)) + broadcast_shape = [1, 1, 1] + broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis] + self.std = np.reshape(self.std, broadcast_shape) + x /= self.std + 1e-6 + + if self.zca_whitening: + n = len(x) + flat_x = np.reshape(x, (n, -1)) + + u, s, _ = np.linalg.svd(flat_x.T, full_matrices=False) + s_inv = np.sqrt(n) / (s + self.zca_epsilon) + self.zca_whitening_matrix = (u * s_inv).dot(u.T) + + +@keras_export("keras.preprocessing.image.random_rotation") +def random_rotation( + x, + rg, + row_axis=1, + col_axis=2, + channel_axis=0, + fill_mode="nearest", + cval=0.0, + interpolation_order=1, +): + """Performs a random rotation of a Numpy image tensor. + + Deprecated: `tf.keras.preprocessing.image.random_rotation` does not operate + on tensors and is not recommended for new code. Prefer + `tf.keras.layers.RandomRotation` which provides equivalent functionality as + a preprocessing layer. For more information, see the tutorial for + [augmenting images]( + https://www.tensorflow.org/tutorials/images/data_augmentation), as well as + the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers). Args: - x: Batch of inputs to be normalized. + x: Input tensor. Must be 3D. + rg: Rotation range, in degrees. + row_axis: Index of axis for rows in the input tensor. + col_axis: Index of axis for columns in the input tensor. + channel_axis: Index of axis for channels in the input tensor. + fill_mode: Points outside the boundaries of the input + are filled according to the given mode + (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). + cval: Value used for points outside the boundaries + of the input if `mode='constant'`. + interpolation_order: int, order of spline interpolation. + see `ndimage.interpolation.affine_transform` Returns: - The inputs, normalized. + Rotated Numpy image tensor. """ - if self.preprocessing_function: - x = self.preprocessing_function(x) - if self.rescale: - x *= self.rescale - if self.samplewise_center: - x -= np.mean(x, keepdims=True) - if self.samplewise_std_normalization: - x /= (np.std(x, keepdims=True) + 1e-6) - - if self.featurewise_center: - if self.mean is not None: - x -= self.mean - else: - warnings.warn('This ImageDataGenerator specifies ' - '`featurewise_center`, but it hasn\'t ' - 'been fit on any training data. Fit it ' - 'first by calling `.fit(numpy_data)`.') - if self.featurewise_std_normalization: - if self.std is not None: - x /= (self.std + 1e-6) - else: - warnings.warn('This ImageDataGenerator specifies ' - '`featurewise_std_normalization`, ' - 'but it hasn\'t ' - 'been fit on any training data. 
Fit it ' - 'first by calling `.fit(numpy_data)`.') - if self.zca_whitening: - if self.zca_whitening_matrix is not None: - flat_x = x.reshape(-1, np.prod(x.shape[-3:])) - white_x = flat_x @ self.zca_whitening_matrix - x = np.reshape(white_x, x.shape) - else: - warnings.warn('This ImageDataGenerator specifies ' - '`zca_whitening`, but it hasn\'t ' - 'been fit on any training data. Fit it ' - 'first by calling `.fit(numpy_data)`.') + theta = np.random.uniform(-rg, rg) + x = apply_affine_transform( + x, + theta=theta, + row_axis=row_axis, + col_axis=col_axis, + channel_axis=channel_axis, + fill_mode=fill_mode, + cval=cval, + order=interpolation_order, + ) return x - def get_random_transform(self, img_shape, seed=None): - """Generates random parameters for a transformation. + +@keras_export("keras.preprocessing.image.random_shift") +def random_shift( + x, + wrg, + hrg, + row_axis=1, + col_axis=2, + channel_axis=0, + fill_mode="nearest", + cval=0.0, + interpolation_order=1, +): + """Performs a random spatial shift of a Numpy image tensor. + + Deprecated: `tf.keras.preprocessing.image.random_shift` does not operate on + tensors and is not recommended for new code. Prefer + `tf.keras.layers.RandomTranslation` which provides equivalent functionality + as a preprocessing layer. For more information, see the tutorial for + [augmenting images]( + https://www.tensorflow.org/tutorials/images/data_augmentation), as well as + the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers). Args: - img_shape: Tuple of integers. - Shape of the image that is transformed. - seed: Random seed. + x: Input tensor. Must be 3D. + wrg: Width shift range, as a float fraction of the width. + hrg: Height shift range, as a float fraction of the height. + row_axis: Index of axis for rows in the input tensor. + col_axis: Index of axis for columns in the input tensor. + channel_axis: Index of axis for channels in the input tensor. + fill_mode: Points outside the boundaries of the input + are filled according to the given mode + (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). + cval: Value used for points outside the boundaries + of the input if `mode='constant'`. + interpolation_order: int, order of spline interpolation. + see `ndimage.interpolation.affine_transform` Returns: - A dictionary containing randomly chosen parameters describing the - transformation. + Shifted Numpy image tensor. 
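A usage sketch for these standalone helpers (hedged: random input, standalone `keras` import path). Note that the axis defaults (`row_axis=1`, `col_axis=2`, `channel_axis=0`) suit a channels_first single image, so a channels_last image needs explicit overrides:

```python
# Random single image in channels_last layout, hence the axis overrides.
import numpy as np
from keras.preprocessing.image import random_rotation, random_shift

img = np.random.rand(64, 64, 3)
rotated = random_rotation(
    img, rg=40, row_axis=0, col_axis=1, channel_axis=2
)
shifted = random_shift(
    img, wrg=0.1, hrg=0.2, row_axis=0, col_axis=1, channel_axis=2
)
print(rotated.shape, shifted.shape)  # (64, 64, 3) (64, 64, 3)
```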
""" - img_row_axis = self.row_axis - 1 - img_col_axis = self.col_axis - 1 + h, w = x.shape[row_axis], x.shape[col_axis] + tx = np.random.uniform(-hrg, hrg) * h + ty = np.random.uniform(-wrg, wrg) * w + x = apply_affine_transform( + x, + tx=tx, + ty=ty, + row_axis=row_axis, + col_axis=col_axis, + channel_axis=channel_axis, + fill_mode=fill_mode, + cval=cval, + order=interpolation_order, + ) + return x - if seed is not None: - np.random.seed(seed) - if self.rotation_range: - theta = np.random.uniform(-self.rotation_range, self.rotation_range) - else: - theta = 0 - - if self.height_shift_range: - try: # 1-D array-like or int - tx = np.random.choice(self.height_shift_range) - tx *= np.random.choice([-1, 1]) - except ValueError: # floating point - tx = np.random.uniform(-self.height_shift_range, - self.height_shift_range) - if np.max(self.height_shift_range) < 1: - tx *= img_shape[img_row_axis] - else: - tx = 0 - - if self.width_shift_range: - try: # 1-D array-like or int - ty = np.random.choice(self.width_shift_range) - ty *= np.random.choice([-1, 1]) - except ValueError: # floating point - ty = np.random.uniform(-self.width_shift_range, self.width_shift_range) - if np.max(self.width_shift_range) < 1: - ty *= img_shape[img_col_axis] - else: - ty = 0 +@keras_export("keras.preprocessing.image.random_shear") +def random_shear( + x, + intensity, + row_axis=1, + col_axis=2, + channel_axis=0, + fill_mode="nearest", + cval=0.0, + interpolation_order=1, +): + """Performs a random spatial shear of a Numpy image tensor. - if self.shear_range: - shear = np.random.uniform(-self.shear_range, self.shear_range) - else: - shear = 0 + Args: + x: Input tensor. Must be 3D. + intensity: Transformation intensity in degrees. + row_axis: Index of axis for rows in the input tensor. + col_axis: Index of axis for columns in the input tensor. + channel_axis: Index of axis for channels in the input tensor. + fill_mode: Points outside the boundaries of the input + are filled according to the given mode + (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). + cval: Value used for points outside the boundaries + of the input if `mode='constant'`. + interpolation_order: int, order of spline interpolation. + see `ndimage.interpolation.affine_transform` - if self.zoom_range[0] == 1 and self.zoom_range[1] == 1: - zx, zy = 1, 1 - else: - zx, zy = np.random.uniform(self.zoom_range[0], self.zoom_range[1], 2) - - flip_horizontal = (np.random.random() < 0.5) * self.horizontal_flip - flip_vertical = (np.random.random() < 0.5) * self.vertical_flip - - channel_shift_intensity = None - if self.channel_shift_range != 0: - channel_shift_intensity = np.random.uniform(-self.channel_shift_range, - self.channel_shift_range) - - brightness = None - if self.brightness_range is not None: - brightness = np.random.uniform(self.brightness_range[0], - self.brightness_range[1]) - - transform_parameters = { - 'theta': theta, - 'tx': tx, - 'ty': ty, - 'shear': shear, - 'zx': zx, - 'zy': zy, - 'flip_horizontal': flip_horizontal, - 'flip_vertical': flip_vertical, - 'channel_shift_intensity': channel_shift_intensity, - 'brightness': brightness - } + Returns: + Sheared Numpy image tensor. 
+ """ + shear = np.random.uniform(-intensity, intensity) + x = apply_affine_transform( + x, + shear=shear, + row_axis=row_axis, + col_axis=col_axis, + channel_axis=channel_axis, + fill_mode=fill_mode, + cval=cval, + order=interpolation_order, + ) + return x - return transform_parameters - def apply_transform(self, x, transform_parameters): - """Applies a transformation to an image according to given parameters. +@keras_export("keras.preprocessing.image.random_zoom") +def random_zoom( + x, + zoom_range, + row_axis=1, + col_axis=2, + channel_axis=0, + fill_mode="nearest", + cval=0.0, + interpolation_order=1, +): + """Performs a random spatial zoom of a Numpy image tensor. + + Deprecated: `tf.keras.preprocessing.image.random_zoom` does not operate on + tensors and is not recommended for new code. Prefer + `tf.keras.layers.RandomZoom` which provides equivalent functionality as + a preprocessing layer. For more information, see the tutorial for + [augmenting images]( + https://www.tensorflow.org/tutorials/images/data_augmentation), as well as + the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers). Args: - x: 3D tensor, single image. - transform_parameters: Dictionary with string - parameter pairs - describing the transformation. - Currently, the following parameters - from the dictionary are used: - - `'theta'`: Float. Rotation angle in degrees. - - `'tx'`: Float. Shift in the x direction. - - `'ty'`: Float. Shift in the y direction. - - `'shear'`: Float. Shear angle in degrees. - - `'zx'`: Float. Zoom in the x direction. - - `'zy'`: Float. Zoom in the y direction. - - `'flip_horizontal'`: Boolean. Horizontal flip. - - `'flip_vertical'`: Boolean. Vertical flip. - - `'channel_shift_intensity'`: Float. Channel shift intensity. - - `'brightness'`: Float. Brightness shift intensity. + x: Input tensor. Must be 3D. + zoom_range: Tuple of floats; zoom range for width and height. + row_axis: Index of axis for rows in the input tensor. + col_axis: Index of axis for columns in the input tensor. + channel_axis: Index of axis for channels in the input tensor. + fill_mode: Points outside the boundaries of the input + are filled according to the given mode + (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). + cval: Value used for points outside the boundaries + of the input if `mode='constant'`. + interpolation_order: int, order of spline interpolation. + see `ndimage.interpolation.affine_transform` Returns: - A transformed version of the input (same shape). + Zoomed Numpy image tensor. + + Raises: + ValueError: if `zoom_range` isn't a tuple. """ - # x is a single image, so it doesn't have image number at index 0 - img_row_axis = self.row_axis - 1 - img_col_axis = self.col_axis - 1 - img_channel_axis = self.channel_axis - 1 + if len(zoom_range) != 2: + raise ValueError( + "`zoom_range` should be a tuple or list of two floats. 
Received: %s" + % (zoom_range,) + ) + if zoom_range[0] == 1 and zoom_range[1] == 1: + zx, zy = 1, 1 + else: + zx, zy = np.random.uniform(zoom_range[0], zoom_range[1], 2) x = apply_affine_transform( x, - transform_parameters.get('theta', 0), - transform_parameters.get('tx', 0), - transform_parameters.get('ty', 0), - transform_parameters.get('shear', 0), - transform_parameters.get('zx', 1), - transform_parameters.get('zy', 1), - row_axis=img_row_axis, - col_axis=img_col_axis, - channel_axis=img_channel_axis, - fill_mode=self.fill_mode, - cval=self.cval, - order=self.interpolation_order) - - if transform_parameters.get('channel_shift_intensity') is not None: - x = apply_channel_shift(x, - transform_parameters['channel_shift_intensity'], - img_channel_axis) - - if transform_parameters.get('flip_horizontal', False): - x = flip_axis(x, img_col_axis) - - if transform_parameters.get('flip_vertical', False): - x = flip_axis(x, img_row_axis) - - if transform_parameters.get('brightness') is not None: - x = apply_brightness_shift(x, transform_parameters['brightness'], False) - + zx=zx, + zy=zy, + row_axis=row_axis, + col_axis=col_axis, + channel_axis=channel_axis, + fill_mode=fill_mode, + cval=cval, + order=interpolation_order, + ) return x - def random_transform(self, x, seed=None): - """Applies a random transformation to an image. + +@keras_export("keras.preprocessing.image.apply_channel_shift") +def apply_channel_shift(x, intensity, channel_axis=0): + """Performs a channel shift. Args: - x: 3D tensor, single image. - seed: Random seed. + x: Input tensor. Must be 3D. + intensity: Transformation intensity. + channel_axis: Index of axis for channels in the input tensor. Returns: - A randomly transformed version of the input (same shape). + Numpy image tensor. """ - params = self.get_random_transform(x.shape, seed) - return self.apply_transform(x, params) + x = np.rollaxis(x, channel_axis, 0) + min_x, max_x = np.min(x), np.max(x) + channel_images = [ + np.clip(x_channel + intensity, min_x, max_x) for x_channel in x + ] + x = np.stack(channel_images, axis=0) + x = np.rollaxis(x, 0, channel_axis + 1) + return x + + +@keras_export("keras.preprocessing.image.random_channel_shift") +def random_channel_shift(x, intensity_range, channel_axis=0): + """Performs a random channel shift. - def fit(self, x, augment=False, rounds=1, seed=None): - """Fits the data generator to some sample data. + Args: + x: Input tensor. Must be 3D. + intensity_range: Transformation intensity. + channel_axis: Index of axis for channels in the input tensor. - This computes the internal data stats related to the - data-dependent transformations, based on an array of sample data. + Returns: + Numpy image tensor. + """ + intensity = np.random.uniform(-intensity_range, intensity_range) + return apply_channel_shift(x, intensity, channel_axis=channel_axis) - Only required if `featurewise_center` or - `featurewise_std_normalization` or `zca_whitening` are set to True. - When `rescale` is set to a value, rescaling is applied to - sample data before computing the internal data stats. +@keras_export("keras.preprocessing.image.apply_brightness_shift") +def apply_brightness_shift(x, brightness, scale=True): + """Performs a brightness shift. Args: - x: Sample data. Should have rank 4. - In case of grayscale data, - the channels axis should have value 1, in case - of RGB data, it should have value 3, and in case - of RGBA data, it should have value 4. - augment: Boolean (default: False). - Whether to fit on randomly augmented samples. 
- rounds: Int (default: 1). - If using data augmentation (`augment=True`), - this is how many augmentation passes over the data to use. - seed: Int (default: None). Random seed. - """ - x = np.asarray(x, dtype=self.dtype) - if x.ndim != 4: - raise ValueError('Input to `.fit()` should have rank 4. ' - 'Got array with shape: ' + str(x.shape)) - if x.shape[self.channel_axis] not in {1, 3, 4}: - warnings.warn('Expected input to be images (as Numpy array) ' - 'following the data format convention "' + - self.data_format + '" (channels on axis ' + - str(self.channel_axis) + '), i.e. expected ' - 'either 1, 3 or 4 channels on axis ' + - str(self.channel_axis) + '. ' - 'However, it was passed an array with shape ' + - str(x.shape) + ' (' + str(x.shape[self.channel_axis]) + - ' channels).') - - if seed is not None: - np.random.seed(seed) - - x = np.copy(x) - if self.rescale: - x *= self.rescale - - if augment: - ax = np.zeros( - tuple([rounds * x.shape[0]] + list(x.shape)[1:]), dtype=self.dtype) - for r in range(rounds): - for i in range(x.shape[0]): - ax[i + r * x.shape[0]] = self.random_transform(x[i]) - x = ax - - if self.featurewise_center: - self.mean = np.mean(x, axis=(0, self.row_axis, self.col_axis)) - broadcast_shape = [1, 1, 1] - broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis] - self.mean = np.reshape(self.mean, broadcast_shape) - x -= self.mean - - if self.featurewise_std_normalization: - self.std = np.std(x, axis=(0, self.row_axis, self.col_axis)) - broadcast_shape = [1, 1, 1] - broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis] - self.std = np.reshape(self.std, broadcast_shape) - x /= (self.std + 1e-6) - - if self.zca_whitening: - n = len(x) - flat_x = np.reshape(x, (n, -1)) - - u, s, _ = np.linalg.svd(flat_x.T, full_matrices=False) - s_inv = np.sqrt(n) / (s + self.zca_epsilon) - self.zca_whitening_matrix = (u * s_inv).dot(u.T) - - -@keras_export('keras.preprocessing.image.random_rotation') -def random_rotation(x, rg, row_axis=1, col_axis=2, channel_axis=0, - fill_mode='nearest', cval=0., interpolation_order=1): - """Performs a random rotation of a Numpy image tensor. - - Deprecated: `tf.keras.preprocessing.image.random_rotation` does not operate on - tensors and is not recommended for new code. Prefer - `tf.keras.layers.RandomRotation` which provides equivalent functionality as a - preprocessing layer. For more information, see the tutorial for - [augmenting images]( - https://www.tensorflow.org/tutorials/images/data_augmentation), as well as - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - x: Input tensor. Must be 3D. - rg: Rotation range, in degrees. - row_axis: Index of axis for rows in the input tensor. - col_axis: Index of axis for columns in the input tensor. - channel_axis: Index of axis for channels in the input tensor. - fill_mode: Points outside the boundaries of the input - are filled according to the given mode - (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). - cval: Value used for points outside the boundaries - of the input if `mode='constant'`. - interpolation_order: int, order of spline interpolation. - see `ndimage.interpolation.affine_transform` - - Returns: - Rotated Numpy image tensor. 
- """ - theta = np.random.uniform(-rg, rg) - x = apply_affine_transform(x, - theta=theta, - row_axis=row_axis, - col_axis=col_axis, - channel_axis=channel_axis, - fill_mode=fill_mode, - cval=cval, - order=interpolation_order) - return x - - -@keras_export('keras.preprocessing.image.random_shift') -def random_shift(x, wrg, hrg, row_axis=1, col_axis=2, channel_axis=0, - fill_mode='nearest', cval=0., interpolation_order=1): - """Performs a random spatial shift of a Numpy image tensor. - - Deprecated: `tf.keras.preprocessing.image.random_shift` does not operate on - tensors and is not recommended for new code. Prefer - `tf.keras.layers.RandomTranslation` which provides equivalent functionality as - a preprocessing layer. For more information, see the tutorial for - [augmenting images]( - https://www.tensorflow.org/tutorials/images/data_augmentation), as well as - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - x: Input tensor. Must be 3D. - wrg: Width shift range, as a float fraction of the width. - hrg: Height shift range, as a float fraction of the height. - row_axis: Index of axis for rows in the input tensor. - col_axis: Index of axis for columns in the input tensor. - channel_axis: Index of axis for channels in the input tensor. - fill_mode: Points outside the boundaries of the input - are filled according to the given mode - (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). - cval: Value used for points outside the boundaries - of the input if `mode='constant'`. - interpolation_order: int, order of spline interpolation. - see `ndimage.interpolation.affine_transform` - - Returns: - Shifted Numpy image tensor. - """ - h, w = x.shape[row_axis], x.shape[col_axis] - tx = np.random.uniform(-hrg, hrg) * h - ty = np.random.uniform(-wrg, wrg) * w - x = apply_affine_transform(x, - tx=tx, - ty=ty, - row_axis=row_axis, - col_axis=col_axis, - channel_axis=channel_axis, - fill_mode=fill_mode, - cval=cval, - order=interpolation_order) - return x - - -@keras_export('keras.preprocessing.image.random_shear') -def random_shear(x, intensity, row_axis=1, col_axis=2, channel_axis=0, - fill_mode='nearest', cval=0., interpolation_order=1): - """Performs a random spatial shear of a Numpy image tensor. - - Args: - x: Input tensor. Must be 3D. - intensity: Transformation intensity in degrees. - row_axis: Index of axis for rows in the input tensor. - col_axis: Index of axis for columns in the input tensor. - channel_axis: Index of axis for channels in the input tensor. - fill_mode: Points outside the boundaries of the input - are filled according to the given mode - (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). - cval: Value used for points outside the boundaries - of the input if `mode='constant'`. - interpolation_order: int, order of spline interpolation. - see `ndimage.interpolation.affine_transform` - - Returns: - Sheared Numpy image tensor. - """ - shear = np.random.uniform(-intensity, intensity) - x = apply_affine_transform( - x, - shear=shear, - row_axis=row_axis, - col_axis=col_axis, - channel_axis=channel_axis, - fill_mode=fill_mode, - cval=cval, - order=interpolation_order) - return x - - -@keras_export('keras.preprocessing.image.random_zoom') -def random_zoom(x, zoom_range, row_axis=1, col_axis=2, channel_axis=0, - fill_mode='nearest', cval=0., interpolation_order=1): - """Performs a random spatial zoom of a Numpy image tensor. 
- - Deprecated: `tf.keras.preprocessing.image.random_zoom` does not operate on - tensors and is not recommended for new code. Prefer - `tf.keras.layers.RandomZoom` which provides equivalent functionality as - a preprocessing layer. For more information, see the tutorial for - [augmenting images]( - https://www.tensorflow.org/tutorials/images/data_augmentation), as well as - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers). - - Args: - x: Input tensor. Must be 3D. - zoom_range: Tuple of floats; zoom range for width and height. - row_axis: Index of axis for rows in the input tensor. - col_axis: Index of axis for columns in the input tensor. - channel_axis: Index of axis for channels in the input tensor. - fill_mode: Points outside the boundaries of the input - are filled according to the given mode - (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). - cval: Value used for points outside the boundaries - of the input if `mode='constant'`. - interpolation_order: int, order of spline interpolation. - see `ndimage.interpolation.affine_transform` - - Returns: - Zoomed Numpy image tensor. - - Raises: - ValueError: if `zoom_range` isn't a tuple. - """ - if len(zoom_range) != 2: - raise ValueError('`zoom_range` should be a tuple or list of two' - ' floats. Received: %s' % (zoom_range,)) - - if zoom_range[0] == 1 and zoom_range[1] == 1: - zx, zy = 1, 1 - else: - zx, zy = np.random.uniform(zoom_range[0], zoom_range[1], 2) - x = apply_affine_transform( - x, - zx=zx, - zy=zy, - row_axis=row_axis, - col_axis=col_axis, - channel_axis=channel_axis, - fill_mode=fill_mode, - cval=cval, - order=interpolation_order) - return x - - -@keras_export('keras.preprocessing.image.apply_channel_shift') -def apply_channel_shift(x, intensity, channel_axis=0): - """Performs a channel shift. - - Args: - x: Input tensor. Must be 3D. - intensity: Transformation intensity. - channel_axis: Index of axis for channels in the input tensor. - - Returns: - Numpy image tensor. - """ - x = np.rollaxis(x, channel_axis, 0) - min_x, max_x = np.min(x), np.max(x) - channel_images = [ - np.clip(x_channel + intensity, min_x, max_x) for x_channel in x] - x = np.stack(channel_images, axis=0) - x = np.rollaxis(x, 0, channel_axis + 1) - return x - - -@keras_export('keras.preprocessing.image.random_channel_shift') -def random_channel_shift(x, intensity_range, channel_axis=0): - """Performs a random channel shift. + x: Input tensor. Must be 3D. + brightness: Float. The new brightness value. + scale: Whether to rescale the image such that minimum and maximum values + are 0 and 255 respectively. Default: True. - Args: - x: Input tensor. Must be 3D. - intensity_range: Transformation intensity. - channel_axis: Index of axis for channels in the input tensor. + Returns: + Numpy image tensor. - Returns: - Numpy image tensor. - """ - intensity = np.random.uniform(-intensity_range, intensity_range) - return apply_channel_shift(x, intensity, channel_axis=channel_axis) + Raises: + ImportError: if PIL is not available. + """ + if ImageEnhance is None: + raise ImportError( + "Using brightness shifts requires PIL. Install PIL or Pillow." 
+    )
+    x_min, x_max = np.min(x), np.max(x)
+    local_scale = (x_min < 0) or (x_max > 255)
+    # Round-trip through PIL, rescaling to [0, 255] when the values fall
+    # outside that range or when `scale=True`.
+    x = image_utils.array_to_img(x, scale=local_scale or scale)
+    imgenhancer_Brightness = ImageEnhance.Brightness(x)
+    x = imgenhancer_Brightness.enhance(brightness)
+    x = image_utils.img_to_array(x)
+    if not scale and local_scale:
+        # Map the enhanced image back to the original value range.
+        x = x / 255 * (x_max - x_min) + x_min
+    return x


-@keras_export('keras.preprocessing.image.apply_brightness_shift')
-def apply_brightness_shift(x, brightness, scale=True):
-  """Performs a brightness shift.
-
-  Args:
-    x: Input tensor. Must be 3D.
-    brightness: Float. The new brightness value.
-    scale: Whether to rescale the image such that minimum and maximum values
-      are 0 and 255 respectively. Default: True.
-
-  Returns:
-    Numpy image tensor.
-
-  Raises:
-    ImportError: if PIL is not available.
-  """
-  if ImageEnhance is None:
-    raise ImportError('Using brightness shifts requires PIL. '
-                      'Install PIL or Pillow.')
-  x_min, x_max = np.min(x), np.max(x)
-  local_scale = (x_min < 0) or (x_max > 255)
-  x = image_utils.array_to_img(x, scale=local_scale or scale)
-  x = imgenhancer_Brightness = ImageEnhance.Brightness(x)
-  x = imgenhancer_Brightness.enhance(brightness)
-  x = image_utils.img_to_array(x)
-  if not scale and local_scale:
-    x = x / 255 * (x_max - x_min) + x_min
-  return x
-
-
-@keras_export('keras.preprocessing.image.random_brightness')
+@keras_export("keras.preprocessing.image.random_brightness")
 def random_brightness(x, brightness_range, scale=True):
-  """Performs a random brightness shift.
-
-  Deprecated: `tf.keras.preprocessing.image.random_brightness` does not operate
-  on tensors and is not recommended for new code. Prefer
-  `tf.keras.layers.RandomBrightness` which provides equivalent functionality as
-  a preprocessing layer. For more information, see the tutorial for
-  [augmenting images](
-  https://www.tensorflow.org/tutorials/images/data_augmentation), as well as
-  the [preprocessing layer guide](
-  https://www.tensorflow.org/guide/keras/preprocessing_layers).
-
-  Args:
-    x: Input tensor. Must be 3D.
-    brightness_range: Tuple of floats; brightness range.
-    scale: Whether to rescale the image such that minimum and maximum values
-      are 0 and 255 respectively. Default: True.
-
-  Returns:
-    Numpy image tensor.
-
-  Raises:
-    ValueError if `brightness_range` isn't a tuple.
-  """
-  if len(brightness_range) != 2:
-    raise ValueError(
-        '`brightness_range should be tuple or list of two floats. '
-        'Received: %s' % (brightness_range,))
-
-  u = np.random.uniform(brightness_range[0], brightness_range[1])
-  return apply_brightness_shift(x, u, scale)
+    """Performs a random brightness shift.
+
+    Deprecated: `tf.keras.preprocessing.image.random_brightness` does not
+    operate on tensors and is not recommended for new code. Prefer
+    `tf.keras.layers.RandomBrightness` which provides equivalent functionality
+    as a preprocessing layer. For more information, see the tutorial for
+    [augmenting images](
+    https://www.tensorflow.org/tutorials/images/data_augmentation), as well as
+    the [preprocessing layer guide](
+    https://www.tensorflow.org/guide/keras/preprocessing_layers).
+
+    Args:
+        x: Input tensor. Must be 3D.
+        brightness_range: Tuple of floats; brightness range.
+        scale: Whether to rescale the image such that minimum and maximum
+            values are 0 and 255 respectively. Default: True.
+
+    Returns:
+        Numpy image tensor.
+
+    Raises:
+        ValueError: if `brightness_range` isn't a tuple.
+ """ + if len(brightness_range) != 2: + raise ValueError( + "`brightness_range should be tuple or list of two floats. " + "Received: %s" % (brightness_range,) + ) + + u = np.random.uniform(brightness_range[0], brightness_range[1]) + return apply_brightness_shift(x, u, scale) def transform_matrix_offset_center(matrix, x, y): - o_x = float(x) / 2 - 0.5 - o_y = float(y) / 2 - 0.5 - offset_matrix = np.array([[1, 0, o_x], [0, 1, o_y], [0, 0, 1]]) - reset_matrix = np.array([[1, 0, -o_x], [0, 1, -o_y], [0, 0, 1]]) - transform_matrix = np.dot(np.dot(offset_matrix, matrix), reset_matrix) - return transform_matrix - - -@keras_export('keras.preprocessing.image.apply_affine_transform') -def apply_affine_transform(x, theta=0, tx=0, ty=0, shear=0, zx=1, zy=1, - row_axis=1, col_axis=2, channel_axis=0, - fill_mode='nearest', cval=0., order=1): - """Applies an affine transformation specified by the parameters given. - - Args: - x: 3D numpy array - a 2D image with one or more channels. - theta: Rotation angle in degrees. - tx: Width shift. - ty: Heigh shift. - shear: Shear angle in degrees. - zx: Zoom in x direction. - zy: Zoom in y direction - row_axis: Index of axis for rows (aka Y axis) in the input - image. Direction: left to right. - col_axis: Index of axis for columns (aka X axis) in the input - image. Direction: top to bottom. - channel_axis: Index of axis for channels in the input image. - fill_mode: Points outside the boundaries of the input - are filled according to the given mode - (one of `{'constant', 'nearest', 'reflect', 'wrap'}`). - cval: Value used for points outside the boundaries - of the input if `mode='constant'`. - order: int, order of interpolation - - Returns: - The transformed version of the input. - - Raises: - ImportError: if SciPy is not available. - """ - if scipy is None: - raise ImportError('Image transformations require SciPy. ' - 'Install SciPy.') - - # Input sanity checks: - # 1. x must 2D image with one or more channels (i.e., a 3D tensor) - # 2. channels must be either first or last dimension - if np.unique([row_axis, col_axis, channel_axis]).size != 3: - raise ValueError("'row_axis', 'col_axis', and 'channel_axis'" - " must be distinct") - - # shall we support negative indices? 
-  valid_indices = set([0, 1, 2])
-  actual_indices = set([row_axis, col_axis, channel_axis])
-  if actual_indices != valid_indices:
-    raise ValueError(
-        f'Invalid axis\' indices: {actual_indices - valid_indices}')
-
-  if x.ndim != 3:
-    raise ValueError('Input arrays must be multi-channel 2D images.')
-  if channel_axis not in [0, 2]:
-    raise ValueError('Channels are allowed and the first and last dimensions.')
-
-  transform_matrix = None
-  if theta != 0:
-    theta = np.deg2rad(theta)
-    rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
-                                [np.sin(theta), np.cos(theta), 0],
-                                [0, 0, 1]])
-    transform_matrix = rotation_matrix
-
-  if tx != 0 or ty != 0:
-    shift_matrix = np.array([[1, 0, tx],
-                             [0, 1, ty],
-                             [0, 0, 1]])
-    if transform_matrix is None:
-      transform_matrix = shift_matrix
-    else:
-      transform_matrix = np.dot(transform_matrix, shift_matrix)
-
-  if shear != 0:
-    shear = np.deg2rad(shear)
-    shear_matrix = np.array([[1, -np.sin(shear), 0],
-                             [0, np.cos(shear), 0],
-                             [0, 0, 1]])
-    if transform_matrix is None:
-      transform_matrix = shear_matrix
-    else:
-      transform_matrix = np.dot(transform_matrix, shear_matrix)
-
-  if zx != 1 or zy != 1:
-    zoom_matrix = np.array([[zx, 0, 0],
-                            [0, zy, 0],
-                            [0, 0, 1]])
-    if transform_matrix is None:
-      transform_matrix = zoom_matrix
-    else:
-      transform_matrix = np.dot(transform_matrix, zoom_matrix)
+    o_x = float(x) / 2 - 0.5
+    o_y = float(y) / 2 - 0.5
+    offset_matrix = np.array([[1, 0, o_x], [0, 1, o_y], [0, 0, 1]])
+    reset_matrix = np.array([[1, 0, -o_x], [0, 1, -o_y], [0, 0, 1]])
+    transform_matrix = np.dot(np.dot(offset_matrix, matrix), reset_matrix)
+    return transform_matrix
+
+
+@keras_export("keras.preprocessing.image.apply_affine_transform")
+def apply_affine_transform(
+    x,
+    theta=0,
+    tx=0,
+    ty=0,
+    shear=0,
+    zx=1,
+    zy=1,
+    row_axis=1,
+    col_axis=2,
+    channel_axis=0,
+    fill_mode="nearest",
+    cval=0.0,
+    order=1,
+):
+    """Applies an affine transformation specified by the parameters given.
-  if transform_matrix is not None:
-    h, w = x.shape[row_axis], x.shape[col_axis]
-    transform_matrix = transform_matrix_offset_center(
-        transform_matrix, h, w)
-    x = np.rollaxis(x, channel_axis, 0)
+    Args:
+        x: 3D numpy array - a 2D image with one or more channels.
+        theta: Rotation angle in degrees.
+        tx: Width shift.
+        ty: Height shift.
+        shear: Shear angle in degrees.
+        zx: Zoom in x direction.
+        zy: Zoom in y direction.
+        row_axis: Index of axis for rows (aka Y axis) in the input
+            image. Direction: left to right.
+        col_axis: Index of axis for columns (aka X axis) in the input
+            image. Direction: top to bottom.
+        channel_axis: Index of axis for channels in the input image.
+        fill_mode: Points outside the boundaries of the input
+            are filled according to the given mode
+            (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
+        cval: Value used for points outside the boundaries
+            of the input if `mode='constant'`.
+        order: int, order of interpolation
-
-  # Matrix construction assumes that coordinates are x, y (in that order).
-  # However, regular numpy arrays use y,x (aka i,j) indexing.
-  # Possible solution is:
-  # 1. Swap the x and y axes.
-  # 2. Apply transform.
-  # 3. Swap the x and y axes again to restore image-like data ordering.
-  # Mathematically, it is equivalent to the following transformation:
-  # M' = PMP, where P is the permutation matrix, M is the original
-  # transformation matrix.
-  if col_axis > row_axis:
-    transform_matrix[:, [0, 1]] = transform_matrix[:, [1, 0]]
-    transform_matrix[[0, 1]] = transform_matrix[[1, 0]]
-  final_affine_matrix = transform_matrix[:2, :2]
-  final_offset = transform_matrix[:2, 2]
-
-  channel_images = [ndimage.interpolation.affine_transform(  # pylint: disable=g-complex-comprehension
-      x_channel,
-      final_affine_matrix,
-      final_offset,
-      order=order,
-      mode=fill_mode,
-      cval=cval) for x_channel in x]
-  x = np.stack(channel_images, axis=0)
-  x = np.rollaxis(x, 0, channel_axis + 1)
-  return x
+    Returns:
+        The transformed version of the input.
+
+    Raises:
+        ImportError: if SciPy is not available.
+    """
+    if scipy is None:
+        raise ImportError("Image transformations require SciPy. Install SciPy.")
+
+    # Input sanity checks:
+    # 1. x must be a 2D image with one or more channels (i.e., a 3D tensor)
+    # 2. channels must be either first or last dimension
+    if np.unique([row_axis, col_axis, channel_axis]).size != 3:
+        raise ValueError(
+            "'row_axis', 'col_axis', and 'channel_axis' must be distinct"
+        )
+
+    # shall we support negative indices?
+    valid_indices = set([0, 1, 2])
+    actual_indices = set([row_axis, col_axis, channel_axis])
+    if actual_indices != valid_indices:
+        raise ValueError(
+            f"Invalid axis' indices: {actual_indices - valid_indices}"
+        )
+
+    if x.ndim != 3:
+        raise ValueError("Input arrays must be multi-channel 2D images.")
+    if channel_axis not in [0, 2]:
+        raise ValueError(
+            "Channels are allowed as the first and last dimensions."
+        )
+
+    transform_matrix = None
+    if theta != 0:
+        theta = np.deg2rad(theta)
+        rotation_matrix = np.array(
+            [
+                [np.cos(theta), -np.sin(theta), 0],
+                [np.sin(theta), np.cos(theta), 0],
+                [0, 0, 1],
+            ]
+        )
+        transform_matrix = rotation_matrix
+
+    if tx != 0 or ty != 0:
+        shift_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
+        if transform_matrix is None:
+            transform_matrix = shift_matrix
+        else:
+            transform_matrix = np.dot(transform_matrix, shift_matrix)
+
+    if shear != 0:
+        shear = np.deg2rad(shear)
+        shear_matrix = np.array(
+            [[1, -np.sin(shear), 0], [0, np.cos(shear), 0], [0, 0, 1]]
+        )
+        if transform_matrix is None:
+            transform_matrix = shear_matrix
+        else:
+            transform_matrix = np.dot(transform_matrix, shear_matrix)
+
+    if zx != 1 or zy != 1:
+        zoom_matrix = np.array([[zx, 0, 0], [0, zy, 0], [0, 0, 1]])
+        if transform_matrix is None:
+            transform_matrix = zoom_matrix
+        else:
+            transform_matrix = np.dot(transform_matrix, zoom_matrix)
+
+    if transform_matrix is not None:
+        h, w = x.shape[row_axis], x.shape[col_axis]
+        transform_matrix = transform_matrix_offset_center(
+            transform_matrix, h, w
+        )
+        x = np.rollaxis(x, channel_axis, 0)
+
+        # Matrix construction assumes that coordinates are x, y (in that order).
+        # However, regular numpy arrays use y,x (aka i,j) indexing.
+        # Possible solution is:
+        # 1. Swap the x and y axes.
+        # 2. Apply transform.
+        # 3. Swap the x and y axes again to restore image-like data ordering.
+        # Mathematically, it is equivalent to the following transformation:
+        # M' = PMP, where P is the permutation matrix, M is the original
+        # transformation matrix.
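+        # For example, with a pure shift tx=2, ty=3, swapping the first two
+        # rows and the first two columns of
+        #     [[1, 0, 2],             [[1, 0, 3],
+        #      [0, 1, 3],    yields    [0, 1, 2],
+        #      [0, 0, 1]]              [0, 0, 1]]
+        # i.e. tx and ty trade places, matching the swapped x/y coordinates.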
+ if col_axis > row_axis: + transform_matrix[:, [0, 1]] = transform_matrix[:, [1, 0]] + transform_matrix[[0, 1]] = transform_matrix[[1, 0]] + final_affine_matrix = transform_matrix[:2, :2] + final_offset = transform_matrix[:2, 2] + + channel_images = [ + ndimage.interpolation.affine_transform( + x_channel, + final_affine_matrix, + final_offset, + order=order, + mode=fill_mode, + cval=cval, + ) + for x_channel in x + ] + x = np.stack(channel_images, axis=0) + x = np.rollaxis(x, 0, channel_axis + 1) + return x diff --git a/keras/preprocessing/image_test.py b/keras/preprocessing/image_test.py index ac8515181f4b..90a379cc8d97 100644 --- a/keras/preprocessing/image_test.py +++ b/keras/preprocessing/image_test.py @@ -19,2057 +19,2345 @@ import shutil import tempfile +import numpy as np +import pandas as pd +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras import layers from keras.engine import sequential from keras.preprocessing import image from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import image_utils -import numpy as np -import pandas as pd -import tensorflow.compat.v2 as tf try: - import PIL # pylint:disable=g-import-not-at-top + import PIL except ImportError: - PIL = None - - -def _generate_test_images(include_rgba=False, - include_16bit=False, - include_32bit=False): - img_w = img_h = 20 - rgb_images = [] - rgba_images = [] - gray_images = [] - gray_images_16bit = [] - gray_images_32bit = [] - for _ in range(8): - bias = np.random.rand(img_w, img_h, 1) * 64 - variance = np.random.rand(img_w, img_h, 1) * (255 - 64) - # RGB - imarray = np.random.rand(img_w, img_h, 3) * variance + bias - im = PIL.Image.fromarray(imarray.astype('uint8')).convert('RGB') - rgb_images.append(im) - # RGBA - imarray = np.random.rand(img_w, img_h, 4) * variance + bias - im = PIL.Image.fromarray(imarray.astype('uint8')).convert('RGBA') - rgba_images.append(im) - # 8-bit grayscale - imarray = np.random.rand(img_w, img_h, 1) * variance + bias - im = PIL.Image.fromarray(imarray.astype('uint8').squeeze()).convert('L') - gray_images.append(im) - # 16-bit grayscale - imarray = np.array( - np.random.randint(-2147483648, 2147483647, (img_w, img_h))) - im = PIL.Image.fromarray(imarray.astype('uint16')) - gray_images_16bit.append(im) - # 32-bit grayscale - im = PIL.Image.fromarray(imarray.astype('uint32')) - gray_images_32bit.append(im) - - ret = [rgb_images, gray_images] - if include_rgba: - ret.append(rgba_images) - if include_16bit: - ret.append(gray_images_16bit) - if include_32bit: - ret.append(gray_images_32bit) - return ret + PIL = None + + +def _generate_test_images( + include_rgba=False, include_16bit=False, include_32bit=False +): + img_w = img_h = 20 + rgb_images = [] + rgba_images = [] + gray_images = [] + gray_images_16bit = [] + gray_images_32bit = [] + for _ in range(8): + bias = np.random.rand(img_w, img_h, 1) * 64 + variance = np.random.rand(img_w, img_h, 1) * (255 - 64) + # RGB + imarray = np.random.rand(img_w, img_h, 3) * variance + bias + im = PIL.Image.fromarray(imarray.astype("uint8")).convert("RGB") + rgb_images.append(im) + # RGBA + imarray = np.random.rand(img_w, img_h, 4) * variance + bias + im = PIL.Image.fromarray(imarray.astype("uint8")).convert("RGBA") + rgba_images.append(im) + # 8-bit grayscale + imarray = np.random.rand(img_w, img_h, 1) * variance + bias + im = PIL.Image.fromarray(imarray.astype("uint8").squeeze()).convert("L") + gray_images.append(im) + # 16-bit grayscale + imarray = 
np.array( + np.random.randint(-2147483648, 2147483647, (img_w, img_h)) + ) + im = PIL.Image.fromarray(imarray.astype("uint16")) + gray_images_16bit.append(im) + # 32-bit grayscale + im = PIL.Image.fromarray(imarray.astype("uint32")) + gray_images_32bit.append(im) + + ret = [rgb_images, gray_images] + if include_rgba: + ret.append(rgba_images) + if include_16bit: + ret.append(gray_images_16bit) + if include_32bit: + ret.append(gray_images_32bit) + return ret @test_utils.run_v2_only class TestImage(test_combinations.TestCase): - - def test_iterator_empty_directory(self): - # Testing with different batch sizes - for batch_size in [0, 32]: - data_iterator = image.Iterator(0, batch_size, False, 0) - ret = next(data_iterator.index_generator) - self.assertEqual(ret.size, 0) - - def test_image(self): - if PIL is None: - return # Skip test if PIL is not available. - - for test_images in _generate_test_images(): - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - - images = np.vstack(img_list) - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=0.2, - channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True) - # Basic test before fit - x = np.random.random((32, 10, 10, 3)) - generator.flow(x) - - # Fit - generator.fit(images, augment=True) - - for x, _ in generator.flow( - images, np.arange(images.shape[0]), shuffle=True): - self.assertEqual(x.shape[1:], images.shape[1:]) - break - - def test_image_with_split_value_error(self): - with self.assertRaises(ValueError): - image.ImageDataGenerator(validation_split=5) - - def test_image_invalid_data(self): - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - data_format='channels_last') - - # Test fit with invalid data - with self.assertRaises(ValueError): - x = np.random.random((3, 10, 10)) - generator.fit(x) - # Test flow with invalid data - with self.assertRaises(ValueError): - generator.flow(np.arange(5)) - # Invalid number of channels: will work but raise a warning - x = np.random.random((32, 10, 10, 5)) - generator.flow(x) - - with self.assertRaises(ValueError): - generator = image.ImageDataGenerator(data_format='unknown') - - generator = image.ImageDataGenerator(zoom_range=(2., 2.)) - - def test_image_fit(self): - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - data_format='channels_last') - # Test grayscale - x = np.random.random((32, 10, 10, 1)) - generator.fit(x) - # Test RBG - x = np.random.random((32, 10, 10, 3)) - generator.fit(x) - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - data_format='channels_first') - # Test grayscale - x = np.random.random((32, 1, 10, 10)) - generator.fit(x) - # Test RBG - x = np.random.random((32, 3, 10, 10)) - generator.fit(x) - - def test_directory_iterator(self): - if PIL is None: - return # Skip test if PIL is not available. 
- - num_classes = 2 - - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir) - - # create folders and subfolders - paths = [] - for cl in range(num_classes): - class_directory = 'class-{}'.format(cl) - classpaths = [ - class_directory, - os.path.join(class_directory, 'subfolder-1'), - os.path.join(class_directory, 'subfolder-2'), - os.path.join(class_directory, 'subfolder-1', 'sub-subfolder') - ] - for path in classpaths: - os.mkdir(os.path.join(temp_dir, path)) - paths.append(classpaths) - - # save the images in the paths - count = 0 - filenames = [] - for test_images in _generate_test_images(): - for im in test_images: - # rotate image class - im_class = count % num_classes - # rotate subfolders - classpaths = paths[im_class] - filename = os.path.join(classpaths[count % len(classpaths)], - 'image-{}.jpg'.format(count)) - filenames.append(filename) - im.save(os.path.join(temp_dir, filename)) - count += 1 - - # Test image loading util - fname = os.path.join(temp_dir, filenames[0]) - _ = image_utils.load_img(fname) - _ = image_utils.load_img(fname, grayscale=True) - _ = image_utils.load_img(fname, target_size=(10, 10)) - _ = image_utils.load_img( - fname, target_size=(10, 10), interpolation='bilinear') - - # create iterator - generator = image.ImageDataGenerator() - dir_iterator = generator.flow_from_directory(temp_dir) - - # check number of classes and images - self.assertEqual(len(dir_iterator.class_indices), num_classes) - self.assertEqual(len(dir_iterator.classes), count) - self.assertEqual(set(dir_iterator.filenames), set(filenames)) - - def preprocessing_function(x): - """This will fail if not provided by a Numpy array. - - Note: This is made to enforce backward compatibility. - - Args: - x: A numpy array. - - Returns: - An array of zeros with the same shape as the given array. - """ - self.assertEqual(x.shape, (26, 26, 3)) - self.assertIs(type(x), np.ndarray) - return np.zeros_like(x) - - # Test usage as Sequence - generator = image.ImageDataGenerator( - preprocessing_function=preprocessing_function) - dir_seq = generator.flow_from_directory( - str(temp_dir), - target_size=(26, 26), - color_mode='rgb', - batch_size=3, - class_mode='categorical') - self.assertEqual(len(dir_seq), count // 3 + 1) - x1, y1 = dir_seq[1] - self.assertEqual(x1.shape, (3, 26, 26, 3)) - self.assertEqual(y1.shape, (3, num_classes)) - x1, y1 = dir_seq[5] - self.assertTrue((x1 == 0).all()) - - def directory_iterator_with_validation_split_test_helper( - self, validation_split): - if PIL is None: - return # Skip test if PIL is not available. 
- - num_classes = 2 - tmp_folder = tempfile.mkdtemp(prefix='test_images') - - # create folders and subfolders - paths = [] - for cl in range(num_classes): - class_directory = 'class-{}'.format(cl) - classpaths = [ - class_directory, - os.path.join(class_directory, 'subfolder-1'), - os.path.join(class_directory, 'subfolder-2'), - os.path.join(class_directory, 'subfolder-1', 'sub-subfolder') - ] - for path in classpaths: - os.mkdir(os.path.join(tmp_folder, path)) - paths.append(classpaths) - - # save the images in the paths - count = 0 - filenames = [] - for test_images in _generate_test_images(): - for im in test_images: - # rotate image class - im_class = count % num_classes - # rotate subfolders - classpaths = paths[im_class] - filename = os.path.join(classpaths[count % len(classpaths)], - 'image-{}.jpg'.format(count)) - filenames.append(filename) - im.save(os.path.join(tmp_folder, filename)) - count += 1 - - # create iterator - generator = image.ImageDataGenerator(validation_split=validation_split) - - with self.assertRaises(ValueError): - generator.flow_from_directory(tmp_folder, subset='foo') - - num_validation = int(count * validation_split) - num_training = count - num_validation - train_iterator = generator.flow_from_directory( - tmp_folder, subset='training') - self.assertEqual(train_iterator.samples, num_training) - - valid_iterator = generator.flow_from_directory( - tmp_folder, subset='validation') - self.assertEqual(valid_iterator.samples, num_validation) - - # check number of classes and images - self.assertEqual(len(train_iterator.class_indices), num_classes) - self.assertEqual(len(train_iterator.classes), num_training) - self.assertEqual( - len(set(train_iterator.filenames) & set(filenames)), num_training) - - model = sequential.Sequential([layers.Flatten(), layers.Dense(2)]) - model.compile(optimizer='sgd', loss='mse') - model.fit(train_iterator, epochs=1) - - shutil.rmtree(tmp_folder) - - @test_combinations.run_all_keras_modes - def test_directory_iterator_with_validation_split_25_percent(self): - self.directory_iterator_with_validation_split_test_helper(0.25) - - @test_combinations.run_all_keras_modes - def test_directory_iterator_with_validation_split_40_percent(self): - self.directory_iterator_with_validation_split_test_helper(0.40) - - @test_combinations.run_all_keras_modes - def test_directory_iterator_with_validation_split_50_percent(self): - self.directory_iterator_with_validation_split_test_helper(0.50) - - def test_batch_standardize(self): - if PIL is None: - return # Skip test if PIL is not available. 
- - # ImageDataGenerator.standardize should work on batches - for test_images in _generate_test_images(): - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - - images = np.vstack(img_list) - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=0.2, - channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True) - generator.fit(images, augment=True) - - transformed = np.copy(images) - for i, im in enumerate(transformed): - transformed[i] = generator.random_transform(im) - transformed = generator.standardize(transformed) - - def test_img_transforms(self): - x = np.random.random((3, 200, 200)) - _ = image.random_rotation(x, 20) - _ = image.random_shift(x, 0.2, 0.2) - _ = image.random_shear(x, 2.) - _ = image.random_zoom(x, (0.5, 0.5)) - _ = image.apply_channel_shift(x, 2, 2) - _ = image.apply_affine_transform(x, 2) - with self.assertRaises(ValueError): - image.random_zoom(x, (0, 0, 0)) - _ = image.random_channel_shift(x, 2.) + def test_iterator_empty_directory(self): + # Testing with different batch sizes + for batch_size in [0, 32]: + data_iterator = image.Iterator(0, batch_size, False, 0) + ret = next(data_iterator.index_generator) + self.assertEqual(ret.size, 0) + + def test_image(self): + if PIL is None: + return # Skip test if PIL is not available. + + for test_images in _generate_test_images(): + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + + images = np.vstack(img_list) + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=0.2, + channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + ) + # Basic test before fit + x = np.random.random((32, 10, 10, 3)) + generator.flow(x) + + # Fit + generator.fit(images, augment=True) + + for x, _ in generator.flow( + images, np.arange(images.shape[0]), shuffle=True + ): + self.assertEqual(x.shape[1:], images.shape[1:]) + break + + def test_image_with_split_value_error(self): + with self.assertRaises(ValueError): + image.ImageDataGenerator(validation_split=5) + + def test_image_invalid_data(self): + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + data_format="channels_last", + ) + + # Test fit with invalid data + with self.assertRaises(ValueError): + x = np.random.random((3, 10, 10)) + generator.fit(x) + # Test flow with invalid data + with self.assertRaises(ValueError): + generator.flow(np.arange(5)) + # Invalid number of channels: will work but raise a warning + x = np.random.random((32, 10, 10, 5)) + generator.flow(x) + + with self.assertRaises(ValueError): + generator = image.ImageDataGenerator(data_format="unknown") + + generator = image.ImageDataGenerator(zoom_range=(2.0, 2.0)) + + def test_image_fit(self): + generator = image.ImageDataGenerator( + featurewise_center=True, + 
samplewise_center=True,
+            featurewise_std_normalization=True,
+            samplewise_std_normalization=True,
+            zca_whitening=True,
+            data_format="channels_last",
+        )
+        # Test grayscale
+        x = np.random.random((32, 10, 10, 1))
+        generator.fit(x)
+        # Test RGB
+        x = np.random.random((32, 10, 10, 3))
+        generator.fit(x)
+        generator = image.ImageDataGenerator(
+            featurewise_center=True,
+            samplewise_center=True,
+            featurewise_std_normalization=True,
+            samplewise_std_normalization=True,
+            zca_whitening=True,
+            data_format="channels_first",
+        )
+        # Test grayscale
+        x = np.random.random((32, 1, 10, 10))
+        generator.fit(x)
+        # Test RGB
+        x = np.random.random((32, 3, 10, 10))
+        generator.fit(x)
+
+    def test_directory_iterator(self):
+        if PIL is None:
+            return  # Skip test if PIL is not available.
+
+        num_classes = 2
+
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir)
+
+        # create folders and subfolders
+        paths = []
+        for cl in range(num_classes):
+            class_directory = f"class-{cl}"
+            classpaths = [
+                class_directory,
+                os.path.join(class_directory, "subfolder-1"),
+                os.path.join(class_directory, "subfolder-2"),
+                os.path.join(class_directory, "subfolder-1", "sub-subfolder"),
+            ]
+            for path in classpaths:
+                os.mkdir(os.path.join(temp_dir, path))
+            paths.append(classpaths)
+
+        # save the images in the paths
+        count = 0
+        filenames = []
+        for test_images in _generate_test_images():
+            for im in test_images:
+                # rotate image class
+                im_class = count % num_classes
+                # rotate subfolders
+                classpaths = paths[im_class]
+                filename = os.path.join(
+                    classpaths[count % len(classpaths)],
+                    f"image-{count}.jpg",
+                )
+                filenames.append(filename)
+                im.save(os.path.join(temp_dir, filename))
+                count += 1
+
+        # Test image loading util
+        fname = os.path.join(temp_dir, filenames[0])
+        _ = image_utils.load_img(fname)
+        _ = image_utils.load_img(fname, grayscale=True)
+        _ = image_utils.load_img(fname, target_size=(10, 10))
+        _ = image_utils.load_img(
+            fname, target_size=(10, 10), interpolation="bilinear"
+        )
+
+        # create iterator
+        generator = image.ImageDataGenerator()
+        dir_iterator = generator.flow_from_directory(temp_dir)
+
+        # check number of classes and images
+        self.assertEqual(len(dir_iterator.class_indices), num_classes)
+        self.assertEqual(len(dir_iterator.classes), count)
+        self.assertEqual(set(dir_iterator.filenames), set(filenames))
+
+        def preprocessing_function(x):
+            """This will fail if not provided with a Numpy array.
+
+            Note: This is made to enforce backward compatibility.
+
+            Args:
+                x: A numpy array.
+
+            Returns:
+                An array of zeros with the same shape as the given array.
+            """
+            self.assertEqual(x.shape, (26, 26, 3))
+            self.assertIs(type(x), np.ndarray)
+            return np.zeros_like(x)
+
+        # Test usage as Sequence
+        generator = image.ImageDataGenerator(
+            preprocessing_function=preprocessing_function
+        )
+        dir_seq = generator.flow_from_directory(
+            str(temp_dir),
+            target_size=(26, 26),
+            color_mode="rgb",
+            batch_size=3,
+            class_mode="categorical",
+        )
+        self.assertEqual(len(dir_seq), count // 3 + 1)
+        x1, y1 = dir_seq[1]
+        self.assertEqual(x1.shape, (3, 26, 26, 3))
+        self.assertEqual(y1.shape, (3, num_classes))
+        x1, y1 = dir_seq[5]
+        self.assertTrue((x1 == 0).all())
+
+    def directory_iterator_with_validation_split_test_helper(
+        self, validation_split
+    ):
+        if PIL is None:
+            return  # Skip test if PIL is not available.
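+
+        # Builds a small class-labelled image tree on disk, then checks that
+        # the "training" and "validation" subsets split the samples according
+        # to `validation_split`.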
+ + num_classes = 2 + tmp_folder = tempfile.mkdtemp(prefix="test_images") + + # create folders and subfolders + paths = [] + for cl in range(num_classes): + class_directory = f"class-{cl}" + classpaths = [ + class_directory, + os.path.join(class_directory, "subfolder-1"), + os.path.join(class_directory, "subfolder-2"), + os.path.join(class_directory, "subfolder-1", "sub-subfolder"), + ] + for path in classpaths: + os.mkdir(os.path.join(tmp_folder, path)) + paths.append(classpaths) + + # save the images in the paths + count = 0 + filenames = [] + for test_images in _generate_test_images(): + for im in test_images: + # rotate image class + im_class = count % num_classes + # rotate subfolders + classpaths = paths[im_class] + filename = os.path.join( + classpaths[count % len(classpaths)], + f"image-{count}.jpg", + ) + filenames.append(filename) + im.save(os.path.join(tmp_folder, filename)) + count += 1 + + # create iterator + generator = image.ImageDataGenerator(validation_split=validation_split) + + with self.assertRaises(ValueError): + generator.flow_from_directory(tmp_folder, subset="foo") + + num_validation = int(count * validation_split) + num_training = count - num_validation + train_iterator = generator.flow_from_directory( + tmp_folder, subset="training" + ) + self.assertEqual(train_iterator.samples, num_training) + + valid_iterator = generator.flow_from_directory( + tmp_folder, subset="validation" + ) + self.assertEqual(valid_iterator.samples, num_validation) + + # check number of classes and images + self.assertEqual(len(train_iterator.class_indices), num_classes) + self.assertEqual(len(train_iterator.classes), num_training) + self.assertEqual( + len(set(train_iterator.filenames) & set(filenames)), num_training + ) + + model = sequential.Sequential([layers.Flatten(), layers.Dense(2)]) + model.compile(optimizer="sgd", loss="mse") + model.fit(train_iterator, epochs=1) + + shutil.rmtree(tmp_folder) + + @test_combinations.run_all_keras_modes + def test_directory_iterator_with_validation_split_25_percent(self): + self.directory_iterator_with_validation_split_test_helper(0.25) + + @test_combinations.run_all_keras_modes + def test_directory_iterator_with_validation_split_40_percent(self): + self.directory_iterator_with_validation_split_test_helper(0.40) + + @test_combinations.run_all_keras_modes + def test_directory_iterator_with_validation_split_50_percent(self): + self.directory_iterator_with_validation_split_test_helper(0.50) + + def test_batch_standardize(self): + if PIL is None: + return # Skip test if PIL is not available. 
+ + # ImageDataGenerator.standardize should work on batches + for test_images in _generate_test_images(): + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + + images = np.vstack(img_list) + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=0.2, + channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + ) + generator.fit(images, augment=True) + + transformed = np.copy(images) + for i, im in enumerate(transformed): + transformed[i] = generator.random_transform(im) + transformed = generator.standardize(transformed) + + def test_img_transforms(self): + x = np.random.random((3, 200, 200)) + _ = image.random_rotation(x, 20) + _ = image.random_shift(x, 0.2, 0.2) + _ = image.random_shear(x, 2.0) + _ = image.random_zoom(x, (0.5, 0.5)) + _ = image.apply_channel_shift(x, 2, 2) + _ = image.apply_affine_transform(x, 2) + with self.assertRaises(ValueError): + image.random_zoom(x, (0, 0, 0)) + _ = image.random_channel_shift(x, 2.0) @test_utils.run_v2_only class TestDirectoryIterator(test_combinations.TestCase): - - def test_directory_iterator(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images( - include_rgba=True, include_16bit=True, include_32bit=True) - num_classes = 2 - - # create folders and subfolders - paths = [] - for cl in range(num_classes): - class_directory = 'class-{}'.format(cl) - classpaths = [ - class_directory, - os.path.join(class_directory, 'subfolder-1'), - os.path.join(class_directory, 'subfolder-2'), - os.path.join(class_directory, 'subfolder-1', 'sub-subfolder') - ] - for path in classpaths: - os.mkdir(os.path.join(tmpdir.full_path, path)) - paths.append(classpaths) - - # save the images in the paths - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - # rotate image class - im_class = count % num_classes - # rotate subfolders - classpaths = paths[im_class] - filename = os.path.join(classpaths[count % len(classpaths)], - 'image-{}.png'.format(count)) - filenames.append(filename) - im.save(os.path.join(tmpdir.full_path, filename)) - count += 1 - - # create iterator - generator = image.ImageDataGenerator() - dir_iterator = generator.flow_from_directory(tmpdir.full_path) - - # check number of classes and images - self.assertLen(dir_iterator.class_indices, num_classes) - self.assertLen(dir_iterator.classes, count) - self.assertEqual(set(dir_iterator.filenames), set(filenames)) - - # Test invalid use cases - with self.assertRaises(ValueError): - generator.flow_from_directory(tmpdir.full_path, color_mode='cmyk') - with self.assertRaises(ValueError): - generator.flow_from_directory(tmpdir.full_path, class_mode='output') - - def preprocessing_function(x): - # This will fail if not provided by a Numpy array. - # Note: This is made to enforce backward compatibility. 
- self.assertEqual(x.shape, (26, 26, 3)) - self.assertIsInstance(x, np.ndarray) - - return np.zeros_like(x) - - # Test usage as Sequence - generator = image.ImageDataGenerator( - preprocessing_function=preprocessing_function) - dir_seq = generator.flow_from_directory( - tmpdir.full_path, - target_size=(26, 26), - color_mode='rgb', - batch_size=3, - class_mode='categorical') - self.assertLen(dir_seq, np.ceil(count / 3.)) - x1, y1 = dir_seq[1] - self.assertEqual(x1.shape, (3, 26, 26, 3)) - self.assertEqual(y1.shape, (3, num_classes)) - x1, y1 = dir_seq[5] - self.assertTrue((x1 == 0).all()) - - with self.assertRaises(ValueError): - x1, y1 = dir_seq[14] # there are 40 images and batch size is 3 - - def test_directory_iterator_class_mode_input(self): - tmpdir = self.create_tempdir() - os.mkdir(os.path.join(tmpdir.full_path, 'class-1')) - all_test_images = _generate_test_images( - include_rgba=True, include_16bit=True, include_32bit=True) - - # save the images in the paths - count = 0 - for test_images in all_test_images: - for im in test_images: - filename = os.path.join(tmpdir, 'class-1', 'image-{}.png'.format(count)) - im.save(filename) - count += 1 - - # create iterator - generator = image.ImageDataGenerator() - dir_iterator = generator.flow_from_directory( - tmpdir.full_path, class_mode='input') - batch = next(dir_iterator) - - # check if input and output have the same shape - self.assertEqual(batch[0].shape, batch[1].shape) - # check if the input and output images are not the same numpy array - input_img = batch[0][0] - output_img = batch[1][0] - output_img[0][0][0] += 1 - self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) - - @parameterized.parameters([ - (0.25, 30), - (0.50, 20), - (0.75, 10), - ]) - def test_directory_iterator_with_validation_split(self, validation_split, - num_training): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images( - include_rgba=True, include_16bit=True, include_32bit=True) - num_classes = 2 - - # create folders and subfolders - paths = [] - for cl in range(num_classes): - class_directory = 'class-{}'.format(cl) - classpaths = [ - class_directory, - os.path.join(class_directory, 'subfolder-1'), - os.path.join(class_directory, 'subfolder-2'), - os.path.join(class_directory, 'subfolder-1', 'sub-subfolder') - ] - for path in classpaths: - os.mkdir(os.path.join(tmpdir.full_path, path)) - paths.append(classpaths) - - # save the images in the paths - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - # rotate image class - im_class = count % num_classes - # rotate subfolders - classpaths = paths[im_class] - filename = os.path.join(classpaths[count % len(classpaths)], - 'image-{}.png'.format(count)) - filenames.append(filename) - im.save(os.path.join(tmpdir.full_path, filename)) - count += 1 - - # create iterator - generator = image.ImageDataGenerator(validation_split=validation_split) - - with self.assertRaises(ValueError): - generator.flow_from_directory(tmpdir.full_path, subset='foo') - - train_iterator = generator.flow_from_directory( - tmpdir.full_path, subset='training') - self.assertEqual(train_iterator.samples, num_training) - - valid_iterator = generator.flow_from_directory( - tmpdir.full_path, subset='validation') - self.assertEqual(valid_iterator.samples, count - num_training) - - # check number of classes and images - self.assertLen(train_iterator.class_indices, num_classes) - self.assertLen(train_iterator.classes, num_training) - self.assertLen(set(train_iterator.filenames) & 
set(filenames), num_training)
+    def test_directory_iterator(self):
+        tmpdir = self.create_tempdir()
+        all_test_images = _generate_test_images(
+            include_rgba=True, include_16bit=True, include_32bit=True
+        )
+        num_classes = 2
+
+        # create folders and subfolders
+        paths = []
+        for cl in range(num_classes):
+            class_directory = f"class-{cl}"
+            classpaths = [
+                class_directory,
+                os.path.join(class_directory, "subfolder-1"),
+                os.path.join(class_directory, "subfolder-2"),
+                os.path.join(class_directory, "subfolder-1", "sub-subfolder"),
+            ]
+            for path in classpaths:
+                os.mkdir(os.path.join(tmpdir.full_path, path))
+            paths.append(classpaths)
+
+        # save the images in the paths
+        count = 0
+        filenames = []
+        for test_images in all_test_images:
+            for im in test_images:
+                # rotate image class
+                im_class = count % num_classes
+                # rotate subfolders
+                classpaths = paths[im_class]
+                filename = os.path.join(
+                    classpaths[count % len(classpaths)],
+                    f"image-{count}.png",
+                )
+                filenames.append(filename)
+                im.save(os.path.join(tmpdir.full_path, filename))
+                count += 1
+
+        # create iterator
+        generator = image.ImageDataGenerator()
+        dir_iterator = generator.flow_from_directory(tmpdir.full_path)
+
+        # check number of classes and images
+        self.assertLen(dir_iterator.class_indices, num_classes)
+        self.assertLen(dir_iterator.classes, count)
+        self.assertEqual(set(dir_iterator.filenames), set(filenames))
+
+        # Test invalid use cases
+        with self.assertRaises(ValueError):
+            generator.flow_from_directory(tmpdir.full_path, color_mode="cmyk")
+        with self.assertRaises(ValueError):
+            generator.flow_from_directory(tmpdir.full_path, class_mode="output")
+
+        def preprocessing_function(x):
+            # This will fail if not provided with a Numpy array.
+            # Note: This is made to enforce backward compatibility.
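+            # Returning zeros (below) lets the test assert that the function
+            # was applied to every generated batch.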
+ self.assertEqual(x.shape, (26, 26, 3)) + self.assertIsInstance(x, np.ndarray) + + return np.zeros_like(x) + + # Test usage as Sequence + generator = image.ImageDataGenerator( + preprocessing_function=preprocessing_function + ) + dir_seq = generator.flow_from_directory( + tmpdir.full_path, + target_size=(26, 26), + color_mode="rgb", + batch_size=3, + class_mode="categorical", + ) + self.assertLen(dir_seq, np.ceil(count / 3.0)) + x1, y1 = dir_seq[1] + self.assertEqual(x1.shape, (3, 26, 26, 3)) + self.assertEqual(y1.shape, (3, num_classes)) + x1, y1 = dir_seq[5] + self.assertTrue((x1 == 0).all()) + + with self.assertRaises(ValueError): + x1, y1 = dir_seq[14] # there are 40 images and batch size is 3 + + def test_directory_iterator_class_mode_input(self): + tmpdir = self.create_tempdir() + os.mkdir(os.path.join(tmpdir.full_path, "class-1")) + all_test_images = _generate_test_images( + include_rgba=True, include_16bit=True, include_32bit=True + ) + + # save the images in the paths + count = 0 + for test_images in all_test_images: + for im in test_images: + filename = os.path.join(tmpdir, "class-1", f"image-{count}.png") + im.save(filename) + count += 1 + + # create iterator + generator = image.ImageDataGenerator() + dir_iterator = generator.flow_from_directory( + tmpdir.full_path, class_mode="input" + ) + batch = next(dir_iterator) + + # check if input and output have the same shape + self.assertEqual(batch[0].shape, batch[1].shape) + # check if the input and output images are not the same numpy array + input_img = batch[0][0] + output_img = batch[1][0] + output_img[0][0][0] += 1 + self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) + + @parameterized.parameters( + [ + (0.25, 30), + (0.50, 20), + (0.75, 10), + ] + ) + def test_directory_iterator_with_validation_split( + self, validation_split, num_training + ): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images( + include_rgba=True, include_16bit=True, include_32bit=True + ) + num_classes = 2 + + # create folders and subfolders + paths = [] + for cl in range(num_classes): + class_directory = f"class-{cl}" + classpaths = [ + class_directory, + os.path.join(class_directory, "subfolder-1"), + os.path.join(class_directory, "subfolder-2"), + os.path.join(class_directory, "subfolder-1", "sub-subfolder"), + ] + for path in classpaths: + os.mkdir(os.path.join(tmpdir.full_path, path)) + paths.append(classpaths) + + # save the images in the paths + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + # rotate image class + im_class = count % num_classes + # rotate subfolders + classpaths = paths[im_class] + filename = os.path.join( + classpaths[count % len(classpaths)], + f"image-{count}.png", + ) + filenames.append(filename) + im.save(os.path.join(tmpdir.full_path, filename)) + count += 1 + + # create iterator + generator = image.ImageDataGenerator(validation_split=validation_split) + + with self.assertRaises(ValueError): + generator.flow_from_directory(tmpdir.full_path, subset="foo") + + train_iterator = generator.flow_from_directory( + tmpdir.full_path, subset="training" + ) + self.assertEqual(train_iterator.samples, num_training) + + valid_iterator = generator.flow_from_directory( + tmpdir.full_path, subset="validation" + ) + self.assertEqual(valid_iterator.samples, count - num_training) + + # check number of classes and images + self.assertLen(train_iterator.class_indices, num_classes) + self.assertLen(train_iterator.classes, num_training) + self.assertLen( + 
set(train_iterator.filenames) & set(filenames), num_training + ) @test_utils.run_v2_only class TestNumpyArrayIterator(test_combinations.TestCase): - - def test_numpy_array_iterator(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - - image_data_generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=0.2, - channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True, - interpolation_order=1) - - for test_images in all_test_images: - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - images = np.vstack(img_list) - dsize = images.shape[0] - - iterator = image.NumpyArrayIterator( - images, - np.arange(images.shape[0]), - image_data_generator, - shuffle=False, - save_to_dir=tmpdir.full_path, - batch_size=3) - x, y = next(iterator) - self.assertEqual(x.shape, images[:3].shape) - self.assertEqual(list(y), [0, 1, 2]) - - # Test with sample weights - iterator = image.NumpyArrayIterator( - images, - np.arange(images.shape[0]), - image_data_generator, - shuffle=False, - sample_weight=np.arange(images.shape[0]) + 1, - save_to_dir=tmpdir.full_path, - batch_size=3) - x, y, w = iterator.next() - self.assertEqual(x.shape, images[:3].shape) - self.assertEqual(list(y), [0, 1, 2]) - self.assertEqual(list(w), [1, 2, 3]) - - # Test with `shuffle=True` - iterator = image.NumpyArrayIterator( - images, - np.arange(images.shape[0]), - image_data_generator, - shuffle=True, - save_to_dir=tmpdir.full_path, - batch_size=3, - seed=42) - x, y = iterator.next() - self.assertEqual(x.shape, images[:3].shape) - # Check that the sequence is shuffled. 
- self.assertNotEqual(list(y), [0, 1, 2]) - - # Test without y - iterator = image.NumpyArrayIterator( - images, - None, - image_data_generator, - shuffle=True, - save_to_dir=tmpdir.full_path, - batch_size=3) - x = iterator.next() - self.assertIsInstance(x, np.ndarray) - self.assertEqual(x.shape, images[:3].shape) - - # Test with a single miscellaneous input data array - x_misc1 = np.random.random(dsize) - iterator = image.NumpyArrayIterator((images, x_misc1), - np.arange(dsize), - image_data_generator, - shuffle=False, - batch_size=2) - for i, (x, y) in enumerate(iterator): - self.assertEqual(x[0].shape, images[:2].shape) - self.assertTrue((x[1] == x_misc1[(i * 2):((i + 1) * 2)]).all()) - if i == 2: - break - - # Test with two miscellaneous inputs - x_misc2 = np.random.random((dsize, 3, 3)) - iterator = image.NumpyArrayIterator((images, [x_misc1, x_misc2]), - np.arange(dsize), - image_data_generator, - shuffle=False, - batch_size=2) - for i, (x, y) in enumerate(iterator): - self.assertEqual(x[0].shape, images[:2].shape) - self.assertTrue((x[1] == x_misc1[(i * 2):((i + 1) * 2)]).all()) - self.assertTrue((x[2] == x_misc2[(i * 2):((i + 1) * 2)]).all()) - if i == 2: - break - - # Test cases with `y = None` - iterator = image.NumpyArrayIterator( - images, None, image_data_generator, batch_size=3) - x = iterator.next() - self.assertIsInstance(x, np.ndarray) - self.assertEqual(x.shape, images[:3].shape) - - iterator = image.NumpyArrayIterator((images, x_misc1), - None, - image_data_generator, - batch_size=3, - shuffle=False) - x = iterator.next() - self.assertIsInstance(x, list) - self.assertEqual(x[0].shape, images[:3].shape) - self.assertTrue((x[1] == x_misc1[:3]).all()) - - iterator = image.NumpyArrayIterator((images, [x_misc1, x_misc2]), - None, - image_data_generator, - batch_size=3, - shuffle=False) - x = iterator.next() - self.assertIsInstance(x, list) - self.assertEqual(x[0].shape, images[:3].shape) - self.assertTrue((x[1] == x_misc1[:3]).all()) - self.assertTrue((x[2] == x_misc2[:3]).all()) - - # Test with validation split - generator = image.ImageDataGenerator(validation_split=0.2) - iterator = image.NumpyArrayIterator(images, None, generator, batch_size=3) - x = iterator.next() - self.assertIsInstance(x, np.ndarray) - self.assertEqual(x.shape, images[:3].shape) - - # Test some failure cases: - x_misc_err = np.random.random((dsize + 1, 3, 3)) - - with self.assertRaisesRegex(ValueError, 'All of the arrays in'): - image.NumpyArrayIterator((images, x_misc_err), - np.arange(dsize), - generator, - batch_size=3) - - with self.assertRaisesRegex(ValueError, - r'`x` \(images tensor\) and `y` \(labels\)'): - image.NumpyArrayIterator((images, x_misc1), - np.arange(dsize + 1), - generator, - batch_size=3) - - # Test `flow` behavior as Sequence - seq = image.NumpyArrayIterator( - images, - np.arange(images.shape[0]), - generator, - shuffle=False, - save_to_dir=tmpdir.full_path, - batch_size=3) - self.assertLen(seq, images.shape[0] // 3 + 1) - x, y = seq[0] - self.assertEqual(x.shape, images[:3].shape) - self.assertEqual(list(y), [0, 1, 2]) - - # Test with `shuffle=True` - seq = image.NumpyArrayIterator( - images, - np.arange(images.shape[0]), - generator, - shuffle=True, - save_to_dir=tmpdir.full_path, - batch_size=3, - seed=123) - x, y = seq[0] - # Check that the sequence is shuffled. - self.assertNotEqual(list(y), [0, 1, 2]) - # `on_epoch_end` should reshuffle the sequence. 
- seq.on_epoch_end() - _, y2 = seq[0] - self.assertNotEqual(list(y), list(y2)) - - # test order_interpolation - labels = np.array([[2, 2, 0, 2, 2], [1, 3, 2, 3, 1], [2, 1, 0, 1, 2], - [3, 1, 0, 2, 0], [3, 1, 3, 2, 1]]) - label_generator = image.ImageDataGenerator( - rotation_range=90., interpolation_order=0) - labels_gen = image.NumpyArrayIterator( - labels[np.newaxis, ..., np.newaxis], None, label_generator, seed=123) - self.assertTrue((np.unique(labels) == np.unique(next(labels_gen))).all()) + def test_numpy_array_iterator(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + + image_data_generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=0.2, + channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + interpolation_order=1, + ) + + for test_images in all_test_images: + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + images = np.vstack(img_list) + dsize = images.shape[0] + + iterator = image.NumpyArrayIterator( + images, + np.arange(images.shape[0]), + image_data_generator, + shuffle=False, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + x, y = next(iterator) + self.assertEqual(x.shape, images[:3].shape) + self.assertEqual(list(y), [0, 1, 2]) + + # Test with sample weights + iterator = image.NumpyArrayIterator( + images, + np.arange(images.shape[0]), + image_data_generator, + shuffle=False, + sample_weight=np.arange(images.shape[0]) + 1, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + x, y, w = iterator.next() + self.assertEqual(x.shape, images[:3].shape) + self.assertEqual(list(y), [0, 1, 2]) + self.assertEqual(list(w), [1, 2, 3]) + + # Test with `shuffle=True` + iterator = image.NumpyArrayIterator( + images, + np.arange(images.shape[0]), + image_data_generator, + shuffle=True, + save_to_dir=tmpdir.full_path, + batch_size=3, + seed=42, + ) + x, y = iterator.next() + self.assertEqual(x.shape, images[:3].shape) + # Check that the sequence is shuffled. 
+ self.assertNotEqual(list(y), [0, 1, 2]) + + # Test without y + iterator = image.NumpyArrayIterator( + images, + None, + image_data_generator, + shuffle=True, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + x = iterator.next() + self.assertIsInstance(x, np.ndarray) + self.assertEqual(x.shape, images[:3].shape) + + # Test with a single miscellaneous input data array + x_misc1 = np.random.random(dsize) + iterator = image.NumpyArrayIterator( + (images, x_misc1), + np.arange(dsize), + image_data_generator, + shuffle=False, + batch_size=2, + ) + for i, (x, y) in enumerate(iterator): + self.assertEqual(x[0].shape, images[:2].shape) + self.assertTrue( + (x[1] == x_misc1[(i * 2) : ((i + 1) * 2)]).all() + ) + if i == 2: + break + + # Test with two miscellaneous inputs + x_misc2 = np.random.random((dsize, 3, 3)) + iterator = image.NumpyArrayIterator( + (images, [x_misc1, x_misc2]), + np.arange(dsize), + image_data_generator, + shuffle=False, + batch_size=2, + ) + for i, (x, y) in enumerate(iterator): + self.assertEqual(x[0].shape, images[:2].shape) + self.assertTrue( + (x[1] == x_misc1[(i * 2) : ((i + 1) * 2)]).all() + ) + self.assertTrue( + (x[2] == x_misc2[(i * 2) : ((i + 1) * 2)]).all() + ) + if i == 2: + break + + # Test cases with `y = None` + iterator = image.NumpyArrayIterator( + images, None, image_data_generator, batch_size=3 + ) + x = iterator.next() + self.assertIsInstance(x, np.ndarray) + self.assertEqual(x.shape, images[:3].shape) + + iterator = image.NumpyArrayIterator( + (images, x_misc1), + None, + image_data_generator, + batch_size=3, + shuffle=False, + ) + x = iterator.next() + self.assertIsInstance(x, list) + self.assertEqual(x[0].shape, images[:3].shape) + self.assertTrue((x[1] == x_misc1[:3]).all()) + + iterator = image.NumpyArrayIterator( + (images, [x_misc1, x_misc2]), + None, + image_data_generator, + batch_size=3, + shuffle=False, + ) + x = iterator.next() + self.assertIsInstance(x, list) + self.assertEqual(x[0].shape, images[:3].shape) + self.assertTrue((x[1] == x_misc1[:3]).all()) + self.assertTrue((x[2] == x_misc2[:3]).all()) + + # Test with validation split + generator = image.ImageDataGenerator(validation_split=0.2) + iterator = image.NumpyArrayIterator( + images, None, generator, batch_size=3 + ) + x = iterator.next() + self.assertIsInstance(x, np.ndarray) + self.assertEqual(x.shape, images[:3].shape) + + # Test some failure cases: + x_misc_err = np.random.random((dsize + 1, 3, 3)) + + with self.assertRaisesRegex(ValueError, "All of the arrays in"): + image.NumpyArrayIterator( + (images, x_misc_err), + np.arange(dsize), + generator, + batch_size=3, + ) + + with self.assertRaisesRegex( + ValueError, r"`x` \(images tensor\) and `y` \(labels\)" + ): + image.NumpyArrayIterator( + (images, x_misc1), + np.arange(dsize + 1), + generator, + batch_size=3, + ) + + # Test `flow` behavior as Sequence + seq = image.NumpyArrayIterator( + images, + np.arange(images.shape[0]), + generator, + shuffle=False, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + self.assertLen(seq, images.shape[0] // 3 + 1) + x, y = seq[0] + self.assertEqual(x.shape, images[:3].shape) + self.assertEqual(list(y), [0, 1, 2]) + + # Test with `shuffle=True` + seq = image.NumpyArrayIterator( + images, + np.arange(images.shape[0]), + generator, + shuffle=True, + save_to_dir=tmpdir.full_path, + batch_size=3, + seed=123, + ) + x, y = seq[0] + # Check that the sequence is shuffled. + self.assertNotEqual(list(y), [0, 1, 2]) + # `on_epoch_end` should reshuffle the sequence. 
+ seq.on_epoch_end() + _, y2 = seq[0] + self.assertNotEqual(list(y), list(y2)) + + # test order_interpolation + labels = np.array( + [ + [2, 2, 0, 2, 2], + [1, 3, 2, 3, 1], + [2, 1, 0, 1, 2], + [3, 1, 0, 2, 0], + [3, 1, 3, 2, 1], + ] + ) + label_generator = image.ImageDataGenerator( + rotation_range=90.0, interpolation_order=0 + ) + labels_gen = image.NumpyArrayIterator( + labels[np.newaxis, ..., np.newaxis], None, label_generator, seed=123 + ) + self.assertTrue( + (np.unique(labels) == np.unique(next(labels_gen))).all() + ) @test_utils.run_v2_only class TestDataFrameIterator(test_combinations.TestCase): - - def test_dataframe_iterator(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - num_classes = 2 - - # save the images in the tmpdir - count = 0 - filenames = [] - filepaths = [] - filenames_without = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - filename_without = 'image-{}'.format(count) - filenames.append(filename) - filepaths.append(os.path.join(tmpdir.full_path, filename)) - filenames_without.append(filename_without) - im.save(os.path.join(tmpdir.full_path, filename)) - count += 1 - - df = pd.DataFrame({ - 'filename': filenames, - 'class': [str(random.randint(0, 1)) for _ in filenames], - 'filepaths': filepaths - }) - - # create iterator - iterator = image.DataFrameIterator(df, tmpdir.full_path) - batch = next(iterator) - self.assertLen(batch, 2) - self.assertIsInstance(batch[0], np.ndarray) - self.assertIsInstance(batch[1], np.ndarray) - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe(df, x_col='filepaths') - df_iterator_dir = generator.flow_from_dataframe(df, tmpdir.full_path) - df_sparse_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, class_mode='sparse') - self.assertFalse(np.isnan(df_sparse_iterator.classes).any()) - # check number of classes and images - self.assertLen(df_iterator.class_indices, num_classes) - self.assertLen(df_iterator.classes, count) - self.assertEqual(set(df_iterator.filenames), set(filepaths)) - self.assertLen(df_iterator_dir.class_indices, num_classes) - self.assertLen(df_iterator_dir.classes, count) - self.assertEqual(set(df_iterator_dir.filenames), set(filenames)) - # test without shuffle - _, batch_y = next( - generator.flow_from_dataframe( - df, tmpdir.full_path, shuffle=False, class_mode='sparse')) - self.assertTrue( - (batch_y == df['class'].astype('float')[:len(batch_y)]).all()) - # Test invalid use cases - with self.assertRaises(ValueError): - generator.flow_from_dataframe(df, tmpdir.full_path, color_mode='cmyk') - with self.assertRaises(ValueError): - generator.flow_from_dataframe(df, tmpdir.full_path, class_mode='output') - with self.assertWarns(DeprecationWarning): - generator.flow_from_dataframe(df, tmpdir.full_path, has_ext=True) - with self.assertWarns(DeprecationWarning): - generator.flow_from_dataframe(df, tmpdir.full_path, has_ext=False) - - def preprocessing_function(x): - # This will fail if not provided by a Numpy array. - # Note: This is made to enforce backward compatibility. 
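[Editor's note] The NumpyArrayIterator hunk above is a pure reformat (black-style quoting and wrapping); the behavior it tests is unchanged. As a reference for readers, a minimal sketch of that flow, with illustrative array names and sizes:

import numpy as np
from keras.preprocessing import image

images = np.random.random((12, 26, 26, 3))  # illustrative image batch
labels = np.arange(images.shape[0])

gen = image.ImageDataGenerator(rotation_range=90.0)

# flow() returns a NumpyArrayIterator; with sample_weight set,
# each batch is an (x, y, w) triple.
it = gen.flow(
    images,
    labels,
    sample_weight=labels + 1,
    shuffle=False,
    batch_size=3,
)
x, y, w = next(it)

# The iterator is a Sequence; with shuffle=True, on_epoch_end()
# reshuffles the index order, so seq[0] changes between epochs.
seq = gen.flow(images, labels, shuffle=True, batch_size=3, seed=42)
_, y1 = seq[0]
seq.on_epoch_end()
_, y2 = seq[0]  # generally differs from y1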
- - self.assertEqual(x.shape, (26, 26, 3)) - self.assertIsInstance(x, np.ndarray) - - return np.zeros_like(x) - - # Test usage as Sequence - generator = image.ImageDataGenerator( - preprocessing_function=preprocessing_function) - dir_seq = generator.flow_from_dataframe( - df, - tmpdir.full_path, - target_size=(26, 26), - color_mode='rgb', - batch_size=3, - class_mode='categorical') - self.assertLen(dir_seq, np.ceil(count / 3)) - x1, y1 = dir_seq[1] - self.assertEqual(x1.shape, (3, 26, 26, 3)) - self.assertEqual(y1.shape, (3, num_classes)) - x1, y1 = dir_seq[5] - self.assertTrue((x1 == 0).all()) - - with self.assertRaises(ValueError): - x1, y1 = dir_seq[9] - - def test_dataframe_iterator_validate_filenames(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - # save the images in the paths - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - im.save(os.path.join(tmpdir.full_path, filename)) - filenames.append(filename) - count += 1 - df = pd.DataFrame({'filename': filenames + ['test.jpp', 'test.jpg']}) - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, class_mode='input') - self.assertLen(df_iterator.filenames, len(df['filename']) - 2) - df_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, class_mode='input', validate_filenames=False) - self.assertLen(df_iterator.filenames, len(df['filename'])) - - def test_dataframe_iterator_sample_weights(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - # save the images in the paths - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - im.save(os.path.join(tmpdir.full_path, filename)) - filenames.append(filename) - count += 1 - df = pd.DataFrame({'filename': filenames}) - df['weight'] = ([2, 5] * len(df))[:len(df)] - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe( - df, - tmpdir.full_path, - x_col='filename', - y_col=None, - shuffle=False, - batch_size=5, - weight_col='weight', - class_mode='input') - - batch = next(df_iterator) - self.assertLen(batch, 3) # (x, y, weights) - # check if input and output have the same shape and they're the same - self.assertEqual(batch[0].all(), batch[1].all()) - # check if the input and output images are not the same numpy array - input_img = batch[0][0] - output_img = batch[1][0] - output_img[0][0][0] += 1 - self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) - self.assertAllEqual(np.array([2, 5, 2, 5, 2]), batch[2]) - - # fail - df['weight'] = (['2', '5'] * len(df))[:len(df)] - with self.assertRaises(TypeError): - image.ImageDataGenerator().flow_from_dataframe( - df, weight_col='weight', class_mode='input') - - def test_dataframe_iterator_class_mode_input(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - # save the images in the paths - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - im.save(os.path.join(tmpdir.full_path, filename)) - filenames.append(filename) - count += 1 - df = pd.DataFrame({'filename': filenames}) - generator = image.ImageDataGenerator() - df_autoencoder_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, x_col='filename', y_col=None, class_mode='input') - - batch 
= next(df_autoencoder_iterator) - - # check if input and output have the same shape and they're the same - self.assertAllClose(batch[0], batch[1]) - # check if the input and output images are not the same numpy array - input_img = batch[0][0] - output_img = batch[1][0] - output_img[0][0][0] += 1 - self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) - - df_autoencoder_iterator = generator.flow_from_dataframe( - df, - tmpdir.full_path, - x_col='filename', - y_col='class', - class_mode='input') - - batch = next(df_autoencoder_iterator) - - # check if input and output have the same shape and they're the same - self.assertEqual(batch[0].all(), batch[1].all()) - # check if the input and output images are not the same numpy array - input_img = batch[0][0] - output_img = batch[1][0] - output_img[0][0][0] += 1 - self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) - - def test_dataframe_iterator_class_mode_categorical_multi_label(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - # save the images in the paths - filenames = [] - count = 0 - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - im.save(os.path.join(tmpdir.full_path, filename)) - filenames.append(filename) - count += 1 - label_opt = ['a', 'b', ['a'], ['b'], ['a', 'b'], ['b', 'a']] - df = pd.DataFrame({ - 'filename': filenames, - 'class': [random.choice(label_opt) for _ in filenames[:-2]] + - ['b', 'a'] - }) - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe(df, tmpdir.full_path) - batch_x, batch_y = next(df_iterator) - self.assertIsInstance(batch_x, np.ndarray) - self.assertLen(batch_x.shape, 4) - self.assertIsInstance(batch_y, np.ndarray) - self.assertEqual(batch_y.shape, (len(batch_x), 2)) - for labels in batch_y: - self.assertTrue(all(label in {0, 1} for label in labels)) - - # on first 3 batches - df = pd.DataFrame({ - 'filename': - filenames, - 'class': [['b', 'a']] + ['b'] + [['c']] + - [random.choice(label_opt) for _ in filenames[:-3]] - }) - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, shuffle=False) - batch_x, batch_y = next(df_iterator) - self.assertIsInstance(batch_x, np.ndarray) - self.assertLen(batch_x.shape, 4) - self.assertIsInstance(batch_y, np.ndarray) - self.assertEqual(batch_y.shape, (len(batch_x), 3)) - for labels in batch_y: - self.assertTrue(all(label in {0, 1} for label in labels)) - self.assertTrue((batch_y[0] == np.array([1, 1, 0])).all()) - self.assertTrue((batch_y[1] == np.array([0, 1, 0])).all()) - self.assertTrue((batch_y[2] == np.array([0, 0, 1])).all()) - - def test_dataframe_iterator_class_mode_multi_output(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - # save the images in the paths - filenames = [] - count = 0 - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - im.save(os.path.join(tmpdir.full_path, filename)) - filenames.append(filename) - count += 1 - # fit both outputs are a single number - df = pd.DataFrame({ - 'filename': filenames - }).assign( - output_0=np.random.uniform(size=len(filenames)), - output_1=np.random.uniform(size=len(filenames))) - df_iterator = image.ImageDataGenerator().flow_from_dataframe( - df, - y_col=['output_0', 'output_1'], - directory=tmpdir.full_path, - batch_size=3, - shuffle=False, - class_mode='multi_output') - batch_x, batch_y = 
next(df_iterator) - self.assertIsInstance(batch_x, np.ndarray) - self.assertLen(batch_x.shape, 4) - self.assertIsInstance(batch_y, list) - self.assertLen(batch_y, 2) - self.assertAllEqual(batch_y[0], np.array(df['output_0'].tolist()[:3])) - self.assertAllEqual(batch_y[1], np.array(df['output_1'].tolist()[:3])) - # if one of the outputs is a 1D array - df['output_1'] = [ - np.random.uniform(size=(2, 2, 1)).flatten() for _ in range(len(df)) - ] - df_iterator = image.ImageDataGenerator().flow_from_dataframe( - df, - y_col=['output_0', 'output_1'], - directory=tmpdir.full_path, - batch_size=3, - shuffle=False, - class_mode='multi_output') - batch_x, batch_y = next(df_iterator) - self.assertIsInstance(batch_x, np.ndarray) - self.assertLen(batch_x.shape, 4) - self.assertIsInstance(batch_y, list) - self.assertLen(batch_y, 2) - self.assertAllEqual(batch_y[0], np.array(df['output_0'].tolist()[:3])) - self.assertAllEqual(batch_y[1], np.array(df['output_1'].tolist()[:3])) - # if one of the outputs is a 2D array - df['output_1'] = [np.random.uniform(size=(2, 2, 1)) for _ in range(len(df))] - df_iterator = image.ImageDataGenerator().flow_from_dataframe( - df, - y_col=['output_0', 'output_1'], - directory=tmpdir.full_path, - batch_size=3, - shuffle=False, - class_mode='multi_output') - batch_x, batch_y = next(df_iterator) - self.assertIsInstance(batch_x, np.ndarray) - self.assertLen(batch_x.shape, 4) - self.assertIsInstance(batch_y, list) - self.assertLen(batch_y, 2) - self.assertAllEqual(batch_y[0], np.array(df['output_0'].tolist()[:3])) - self.assertAllEqual(batch_y[1], np.array(df['output_1'].tolist()[:3])) - # fail if single column - with self.assertRaises(TypeError): - image.ImageDataGenerator().flow_from_dataframe( - df, - y_col='output_0', - directory=tmpdir.full_path, - class_mode='multi_output') - - def test_dataframe_iterator_class_mode_raw(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - # save the images in the paths - filenames = [] - count = 0 - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - im.save(os.path.join(tmpdir.full_path, filename)) - filenames.append(filename) - count += 1 - # case for 1D output - df = pd.DataFrame({ - 'filename': filenames - }).assign( - output_0=np.random.uniform(size=len(filenames)), - output_1=np.random.uniform(size=len(filenames))) - df_iterator = image.ImageDataGenerator().flow_from_dataframe( - df, - y_col='output_0', - directory=tmpdir.full_path, - batch_size=3, - shuffle=False, - class_mode='raw') - batch_x, batch_y = next(df_iterator) - self.assertIsInstance(batch_x, np.ndarray) - self.assertLen(batch_x.shape, 4) - self.assertIsInstance(batch_y, np.ndarray) - self.assertEqual(batch_y.shape, (3,)) - self.assertAllEqual(batch_y, df['output_0'].values[:3]) - # case with a 2D output - df_iterator = image.ImageDataGenerator().flow_from_dataframe( - df, - y_col=['output_0', 'output_1'], - directory=tmpdir.full_path, - batch_size=3, - shuffle=False, - class_mode='raw') - batch_x, batch_y = next(df_iterator) - self.assertIsInstance(batch_x, np.ndarray) - self.assertLen(batch_x.shape, 4) - self.assertIsInstance(batch_y, np.ndarray) - self.assertEqual(batch_y.shape, (3, 2)) - self.assertAllEqual(batch_y, df[['output_0', 'output_1']].values[:3]) - - @parameterized.parameters([ - (0.25, 18), - (0.50, 12), - (0.75, 6), - ]) - def test_dataframe_iterator_with_validation_split(self, validation_split, - num_training): - tmpdir = self.create_tempdir() - 
all_test_images = _generate_test_images(include_rgba=True) - num_classes = 2 - - # save the images in the tmpdir - count = 0 - filenames = [] - filenames_without = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - filename_without = 'image-{}'.format(count) - filenames.append(filename) - filenames_without.append(filename_without) - im.save(os.path.join(tmpdir.full_path, filename)) - count += 1 - - df = pd.DataFrame({ - 'filename': filenames, - 'class': [str(random.randint(0, 1)) for _ in filenames] - }) - # create iterator - generator = image.ImageDataGenerator(validation_split=validation_split) - df_sparse_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, class_mode='sparse') - if np.isnan(next(df_sparse_iterator)[:][1]).any(): - raise ValueError('Invalid values.') - - with self.assertRaises(ValueError): - generator.flow_from_dataframe(df, tmpdir.full_path, subset='foo') - - train_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, subset='training') - self.assertEqual(train_iterator.samples, num_training) - - valid_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, subset='validation') - self.assertEqual(valid_iterator.samples, count - num_training) - - # check number of classes and images - self.assertLen(train_iterator.class_indices, num_classes) - self.assertLen(train_iterator.classes, num_training) - self.assertLen(set(train_iterator.filenames) & set(filenames), num_training) - - def test_dataframe_iterator_with_custom_indexed_dataframe(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - num_classes = 2 - - # save the images in the tmpdir - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - filenames.append(filename) - im.save(os.path.join(tmpdir.full_path, filename)) - count += 1 - - # create dataframes - classes = np.random.randint(num_classes, size=len(filenames)) - classes = [str(c) for c in classes] - df = pd.DataFrame({'filename': filenames, 'class': classes}) - df2 = pd.DataFrame({ - 'filename': filenames, - 'class': classes - }, - index=np.arange(1, - len(filenames) + 1)) - df3 = pd.DataFrame({ - 'filename': filenames, - 'class': classes - }, - index=filenames) - - # create iterators - seed = 1 - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe(df, tmpdir.full_path, seed=seed) - df2_iterator = generator.flow_from_dataframe( - df2, tmpdir.full_path, seed=seed) - df3_iterator = generator.flow_from_dataframe( - df3, tmpdir.full_path, seed=seed) - - # Test all iterators return same pairs of arrays - for _ in range(len(filenames)): - a1, c1 = next(df_iterator) - a2, c2 = next(df2_iterator) - a3, c3 = next(df3_iterator) - self.assertAllEqual(a1, a2) - self.assertAllEqual(a1, a3) - self.assertAllEqual(c1, c2) - self.assertAllEqual(c1, c3) - - def test_dataframe_iterator_n(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - - # save the images in the tmpdir - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - filenames.append(filename) - im.save(os.path.join(tmpdir.full_path, filename)) - count += 1 - - # exclude first two items - n_files = len(filenames) - input_filenames = filenames[2:] - - # create dataframes - classes = np.random.randint(2, size=len(input_filenames)) - classes = 
[str(c) for c in classes] - df = pd.DataFrame({'filename': input_filenames}) - df2 = pd.DataFrame({'filename': input_filenames, 'class': classes}) - - # create iterators - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, class_mode=None) - df2_iterator = generator.flow_from_dataframe( - df2, tmpdir.full_path, class_mode='binary') - - # Test the number of items in iterators - self.assertEqual(df_iterator.n, n_files - 2) - self.assertEqual(df2_iterator.n, n_files - 2) - - def test_dataframe_iterator_absolute_path(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - - # save the images in the tmpdir - count = 0 - file_paths = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{:0>5}.png'.format(count) - file_path = os.path.join(tmpdir.full_path, filename) - file_paths.append(file_path) - im.save(file_path) - count += 1 - - # prepare an image with a forbidden extension. - file_path_fbd = os.path.join(tmpdir.full_path, 'image-forbid.fbd') - shutil.copy(file_path, file_path_fbd) - - # create dataframes - classes = np.random.randint(2, size=len(file_paths)) - classes = [str(c) for c in classes] - df = pd.DataFrame({'filename': file_paths}) - df2 = pd.DataFrame({'filename': file_paths, 'class': classes}) - df3 = pd.DataFrame({'filename': ['image-not-exist.png'] + file_paths}) - df4 = pd.DataFrame({'filename': file_paths + [file_path_fbd]}) - - # create iterators - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe( - df, None, class_mode=None, shuffle=False, batch_size=1) - df2_iterator = generator.flow_from_dataframe( - df2, None, class_mode='binary', shuffle=False, batch_size=1) - df3_iterator = generator.flow_from_dataframe( - df3, None, class_mode=None, shuffle=False, batch_size=1) - df4_iterator = generator.flow_from_dataframe( - df4, None, class_mode=None, shuffle=False, batch_size=1) - - validation_split = 0.2 - generator_split = image.ImageDataGenerator( - validation_split=validation_split) - df_train_iterator = generator_split.flow_from_dataframe( - df, - None, - class_mode=None, - shuffle=False, - subset='training', - batch_size=1) - df_val_iterator = generator_split.flow_from_dataframe( - df, - None, - class_mode=None, - shuffle=False, - subset='validation', - batch_size=1) - - # Test the number of items in iterators - self.assertLen(file_paths, df_iterator.n) - self.assertLen(file_paths, df2_iterator.n) - self.assertLen(file_paths, df3_iterator.n) - self.assertLen(file_paths, df4_iterator.n) - self.assertEqual(df_val_iterator.n, int(validation_split * len(file_paths))) - self.assertLen(file_paths, df_train_iterator.n + df_val_iterator.n) - - # Test flow_from_dataframe - for i in range(len(file_paths)): - a1 = next(df_iterator) - a2, _ = next(df2_iterator) - a3 = next(df3_iterator) - a4 = next(df4_iterator) - - if i < df_val_iterator.n: - a5 = next(df_val_iterator) - else: - a5 = next(df_train_iterator) - - self.assertAllEqual(a1, a2) - self.assertAllEqual(a1, a3) - self.assertAllEqual(a1, a4) - self.assertAllEqual(a1, a5) - - def test_dataframe_iterator_with_subdirs(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - num_classes = 2 - - # create folders and subfolders - paths = [] - for cl in range(num_classes): - class_directory = 'class-{}'.format(cl) - classpaths = [ - class_directory, - os.path.join(class_directory, 'subfolder-1'), - 
os.path.join(class_directory, 'subfolder-2'), - os.path.join(class_directory, 'subfolder-1', 'sub-subfolder') - ] - for path in classpaths: - os.mkdir(os.path.join(tmpdir, path)) - paths.append(classpaths) - - # save the images in the paths - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - # rotate image class - im_class = count % num_classes - # rotate subfolders - classpaths = paths[im_class] - filename = os.path.join(classpaths[count % len(classpaths)], - 'image-{}.png'.format(count)) - filenames.append(filename) - im.save(os.path.join(tmpdir.full_path, filename)) - count += 1 - - # create dataframe - classes = np.random.randint(num_classes, size=len(filenames)) - classes = [str(c) for c in classes] - df = pd.DataFrame({'filename': filenames, 'class': classes}) - - # create iterator - generator = image.ImageDataGenerator() - df_iterator = generator.flow_from_dataframe( - df, tmpdir.full_path, class_mode='binary') - - # Test the number of items in iterator - self.assertLen(filenames, df_iterator.n) - self.assertEqual(set(df_iterator.filenames), set(filenames)) - - def test_dataframe_iterator_classes_indices_order(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - # save the images in the paths - count = 0 - filenames = [] - for test_images in all_test_images: - for im in test_images: - filename = 'image-{}.png'.format(count) - im.save(os.path.join(tmpdir.full_path, filename)) - filenames.append(filename) - count += 1 - - # Test the class_indices without classes input - generator = image.ImageDataGenerator() - label_opt = ['a', 'b', ['a'], ['b'], ['a', 'b'], ['b', 'a']] - df_f = pd.DataFrame({ - 'filename': filenames, - 'class': ['a', 'b'] + - [random.choice(label_opt) for _ in filenames[:-2]] - }) - flow_forward_iter = generator.flow_from_dataframe(df_f, tmpdir.full_path) - label_rev = ['b', 'a', ['b'], ['a'], ['b', 'a'], ['a', 'b']] - df_r = pd.DataFrame({ - 'filename': filenames, - 'class': ['b', 'a'] + - [random.choice(label_rev) for _ in filenames[:-2]] - }) - flow_backward_iter = generator.flow_from_dataframe(df_r, tmpdir.full_path) - - # check class_indices - self.assertEqual(flow_forward_iter.class_indices, - flow_backward_iter.class_indices) - - # Test the class_indices with classes input - generator_2 = image.ImageDataGenerator() - df_f2 = pd.DataFrame([['data/A.jpg', 'A'], ['data/B.jpg', 'B']], - columns=['filename', 'class']) - flow_forward = generator_2.flow_from_dataframe(df_f2, classes=['A', 'B']) - df_b2 = pd.DataFrame([['data/A.jpg', 'A'], ['data/B.jpg', 'B']], - columns=['filename', 'class']) - flow_backward = generator_2.flow_from_dataframe(df_b2, classes=['B', 'A']) - - # check class_indices - self.assertNotEqual(flow_forward.class_indices, flow_backward.class_indices) + def test_dataframe_iterator(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + num_classes = 2 + + # save the images in the tmpdir + count = 0 + filenames = [] + filepaths = [] + filenames_without = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + filename_without = f"image-{count}" + filenames.append(filename) + filepaths.append(os.path.join(tmpdir.full_path, filename)) + filenames_without.append(filename_without) + im.save(os.path.join(tmpdir.full_path, filename)) + count += 1 + + df = pd.DataFrame( + { + "filename": filenames, + "class": [str(random.randint(0, 1)) for _ in filenames], + "filepaths": 
filepaths, + } + ) + + # create iterator + iterator = image.DataFrameIterator(df, tmpdir.full_path) + batch = next(iterator) + self.assertLen(batch, 2) + self.assertIsInstance(batch[0], np.ndarray) + self.assertIsInstance(batch[1], np.ndarray) + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe(df, x_col="filepaths") + df_iterator_dir = generator.flow_from_dataframe(df, tmpdir.full_path) + df_sparse_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, class_mode="sparse" + ) + self.assertFalse(np.isnan(df_sparse_iterator.classes).any()) + # check number of classes and images + self.assertLen(df_iterator.class_indices, num_classes) + self.assertLen(df_iterator.classes, count) + self.assertEqual(set(df_iterator.filenames), set(filepaths)) + self.assertLen(df_iterator_dir.class_indices, num_classes) + self.assertLen(df_iterator_dir.classes, count) + self.assertEqual(set(df_iterator_dir.filenames), set(filenames)) + # test without shuffle + _, batch_y = next( + generator.flow_from_dataframe( + df, tmpdir.full_path, shuffle=False, class_mode="sparse" + ) + ) + self.assertTrue( + (batch_y == df["class"].astype("float")[: len(batch_y)]).all() + ) + # Test invalid use cases + with self.assertRaises(ValueError): + generator.flow_from_dataframe( + df, tmpdir.full_path, color_mode="cmyk" + ) + with self.assertRaises(ValueError): + generator.flow_from_dataframe( + df, tmpdir.full_path, class_mode="output" + ) + with self.assertWarns(DeprecationWarning): + generator.flow_from_dataframe(df, tmpdir.full_path, has_ext=True) + with self.assertWarns(DeprecationWarning): + generator.flow_from_dataframe(df, tmpdir.full_path, has_ext=False) + + def preprocessing_function(x): + # This will fail if not provided by a Numpy array. + # Note: This is made to enforce backward compatibility. 
+ + self.assertEqual(x.shape, (26, 26, 3)) + self.assertIsInstance(x, np.ndarray) + + return np.zeros_like(x) + + # Test usage as Sequence + generator = image.ImageDataGenerator( + preprocessing_function=preprocessing_function + ) + dir_seq = generator.flow_from_dataframe( + df, + tmpdir.full_path, + target_size=(26, 26), + color_mode="rgb", + batch_size=3, + class_mode="categorical", + ) + self.assertLen(dir_seq, np.ceil(count / 3)) + x1, y1 = dir_seq[1] + self.assertEqual(x1.shape, (3, 26, 26, 3)) + self.assertEqual(y1.shape, (3, num_classes)) + x1, y1 = dir_seq[5] + self.assertTrue((x1 == 0).all()) + + with self.assertRaises(ValueError): + x1, y1 = dir_seq[9] + + def test_dataframe_iterator_validate_filenames(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + # save the images in the paths + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + im.save(os.path.join(tmpdir.full_path, filename)) + filenames.append(filename) + count += 1 + df = pd.DataFrame({"filename": filenames + ["test.jpp", "test.jpg"]}) + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, class_mode="input" + ) + self.assertLen(df_iterator.filenames, len(df["filename"]) - 2) + df_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, class_mode="input", validate_filenames=False + ) + self.assertLen(df_iterator.filenames, len(df["filename"])) + + def test_dataframe_iterator_sample_weights(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + # save the images in the paths + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + im.save(os.path.join(tmpdir.full_path, filename)) + filenames.append(filename) + count += 1 + df = pd.DataFrame({"filename": filenames}) + df["weight"] = ([2, 5] * len(df))[: len(df)] + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe( + df, + tmpdir.full_path, + x_col="filename", + y_col=None, + shuffle=False, + batch_size=5, + weight_col="weight", + class_mode="input", + ) + + batch = next(df_iterator) + self.assertLen(batch, 3) # (x, y, weights) + # check if input and output have the same shape and they're the same + self.assertEqual(batch[0].all(), batch[1].all()) + # check if the input and output images are not the same numpy array + input_img = batch[0][0] + output_img = batch[1][0] + output_img[0][0][0] += 1 + self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) + self.assertAllEqual(np.array([2, 5, 2, 5, 2]), batch[2]) + + # fail + df["weight"] = (["2", "5"] * len(df))[: len(df)] + with self.assertRaises(TypeError): + image.ImageDataGenerator().flow_from_dataframe( + df, weight_col="weight", class_mode="input" + ) + + def test_dataframe_iterator_class_mode_input(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + # save the images in the paths + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + im.save(os.path.join(tmpdir.full_path, filename)) + filenames.append(filename) + count += 1 + df = pd.DataFrame({"filename": filenames}) + generator = image.ImageDataGenerator() + df_autoencoder_iterator = generator.flow_from_dataframe( + df, + tmpdir.full_path, + x_col="filename", + y_col=None, + class_mode="input", + ) 
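[Editor's note] As a usage note for the dataframe tests: class_mode="input" makes flow_from_dataframe yield autoencoder-style (x, x) batches, which is what the assertions above and below check. A minimal sketch, with placeholder paths and filenames:

import pandas as pd
from keras.preprocessing import image

# Placeholder dataframe; the files are assumed to exist under directory.
df = pd.DataFrame({"filename": ["image-0.png", "image-1.png"]})

gen = image.ImageDataGenerator()
it = gen.flow_from_dataframe(
    df,
    directory="/tmp/data",  # placeholder
    x_col="filename",
    y_col=None,  # ignored when class_mode="input"
    class_mode="input",
)
x, y = next(it)
assert x.shape == y.shape  # the input is echoed as the target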
+ + batch = next(df_autoencoder_iterator) + + # check if input and output have the same shape and they're the same + self.assertAllClose(batch[0], batch[1]) + # check if the input and output images are not the same numpy array + input_img = batch[0][0] + output_img = batch[1][0] + output_img[0][0][0] += 1 + self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) + + df_autoencoder_iterator = generator.flow_from_dataframe( + df, + tmpdir.full_path, + x_col="filename", + y_col="class", + class_mode="input", + ) + + batch = next(df_autoencoder_iterator) + + # check if input and output have the same shape and they're the same + self.assertEqual(batch[0].all(), batch[1].all()) + # check if the input and output images are not the same numpy array + input_img = batch[0][0] + output_img = batch[1][0] + output_img[0][0][0] += 1 + self.assertNotEqual(input_img[0][0][0], output_img[0][0][0]) + + def test_dataframe_iterator_class_mode_categorical_multi_label(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + # save the images in the paths + filenames = [] + count = 0 + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + im.save(os.path.join(tmpdir.full_path, filename)) + filenames.append(filename) + count += 1 + label_opt = ["a", "b", ["a"], ["b"], ["a", "b"], ["b", "a"]] + df = pd.DataFrame( + { + "filename": filenames, + "class": [random.choice(label_opt) for _ in filenames[:-2]] + + ["b", "a"], + } + ) + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe(df, tmpdir.full_path) + batch_x, batch_y = next(df_iterator) + self.assertIsInstance(batch_x, np.ndarray) + self.assertLen(batch_x.shape, 4) + self.assertIsInstance(batch_y, np.ndarray) + self.assertEqual(batch_y.shape, (len(batch_x), 2)) + for labels in batch_y: + self.assertTrue(all(label in {0, 1} for label in labels)) + + # on first 3 batches + df = pd.DataFrame( + { + "filename": filenames, + "class": [["b", "a"]] + + ["b"] + + [["c"]] + + [random.choice(label_opt) for _ in filenames[:-3]], + } + ) + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, shuffle=False + ) + batch_x, batch_y = next(df_iterator) + self.assertIsInstance(batch_x, np.ndarray) + self.assertLen(batch_x.shape, 4) + self.assertIsInstance(batch_y, np.ndarray) + self.assertEqual(batch_y.shape, (len(batch_x), 3)) + for labels in batch_y: + self.assertTrue(all(label in {0, 1} for label in labels)) + self.assertTrue((batch_y[0] == np.array([1, 1, 0])).all()) + self.assertTrue((batch_y[1] == np.array([0, 1, 0])).all()) + self.assertTrue((batch_y[2] == np.array([0, 0, 1])).all()) + + def test_dataframe_iterator_class_mode_multi_output(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + # save the images in the paths + filenames = [] + count = 0 + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + im.save(os.path.join(tmpdir.full_path, filename)) + filenames.append(filename) + count += 1 + # fit both outputs are a single number + df = pd.DataFrame({"filename": filenames}).assign( + output_0=np.random.uniform(size=len(filenames)), + output_1=np.random.uniform(size=len(filenames)), + ) + df_iterator = image.ImageDataGenerator().flow_from_dataframe( + df, + y_col=["output_0", "output_1"], + directory=tmpdir.full_path, + batch_size=3, + shuffle=False, + class_mode="multi_output", + ) + batch_x, 
batch_y = next(df_iterator) + self.assertIsInstance(batch_x, np.ndarray) + self.assertLen(batch_x.shape, 4) + self.assertIsInstance(batch_y, list) + self.assertLen(batch_y, 2) + self.assertAllEqual(batch_y[0], np.array(df["output_0"].tolist()[:3])) + self.assertAllEqual(batch_y[1], np.array(df["output_1"].tolist()[:3])) + # if one of the outputs is a 1D array + df["output_1"] = [ + np.random.uniform(size=(2, 2, 1)).flatten() for _ in range(len(df)) + ] + df_iterator = image.ImageDataGenerator().flow_from_dataframe( + df, + y_col=["output_0", "output_1"], + directory=tmpdir.full_path, + batch_size=3, + shuffle=False, + class_mode="multi_output", + ) + batch_x, batch_y = next(df_iterator) + self.assertIsInstance(batch_x, np.ndarray) + self.assertLen(batch_x.shape, 4) + self.assertIsInstance(batch_y, list) + self.assertLen(batch_y, 2) + self.assertAllEqual(batch_y[0], np.array(df["output_0"].tolist()[:3])) + self.assertAllEqual(batch_y[1], np.array(df["output_1"].tolist()[:3])) + # if one of the outputs is a 2D array + df["output_1"] = [ + np.random.uniform(size=(2, 2, 1)) for _ in range(len(df)) + ] + df_iterator = image.ImageDataGenerator().flow_from_dataframe( + df, + y_col=["output_0", "output_1"], + directory=tmpdir.full_path, + batch_size=3, + shuffle=False, + class_mode="multi_output", + ) + batch_x, batch_y = next(df_iterator) + self.assertIsInstance(batch_x, np.ndarray) + self.assertLen(batch_x.shape, 4) + self.assertIsInstance(batch_y, list) + self.assertLen(batch_y, 2) + self.assertAllEqual(batch_y[0], np.array(df["output_0"].tolist()[:3])) + self.assertAllEqual(batch_y[1], np.array(df["output_1"].tolist()[:3])) + # fail if single column + with self.assertRaises(TypeError): + image.ImageDataGenerator().flow_from_dataframe( + df, + y_col="output_0", + directory=tmpdir.full_path, + class_mode="multi_output", + ) + + def test_dataframe_iterator_class_mode_raw(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + # save the images in the paths + filenames = [] + count = 0 + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + im.save(os.path.join(tmpdir.full_path, filename)) + filenames.append(filename) + count += 1 + # case for 1D output + df = pd.DataFrame({"filename": filenames}).assign( + output_0=np.random.uniform(size=len(filenames)), + output_1=np.random.uniform(size=len(filenames)), + ) + df_iterator = image.ImageDataGenerator().flow_from_dataframe( + df, + y_col="output_0", + directory=tmpdir.full_path, + batch_size=3, + shuffle=False, + class_mode="raw", + ) + batch_x, batch_y = next(df_iterator) + self.assertIsInstance(batch_x, np.ndarray) + self.assertLen(batch_x.shape, 4) + self.assertIsInstance(batch_y, np.ndarray) + self.assertEqual(batch_y.shape, (3,)) + self.assertAllEqual(batch_y, df["output_0"].values[:3]) + # case with a 2D output + df_iterator = image.ImageDataGenerator().flow_from_dataframe( + df, + y_col=["output_0", "output_1"], + directory=tmpdir.full_path, + batch_size=3, + shuffle=False, + class_mode="raw", + ) + batch_x, batch_y = next(df_iterator) + self.assertIsInstance(batch_x, np.ndarray) + self.assertLen(batch_x.shape, 4) + self.assertIsInstance(batch_y, np.ndarray) + self.assertEqual(batch_y.shape, (3, 2)) + self.assertAllEqual(batch_y, df[["output_0", "output_1"]].values[:3]) + + @parameterized.parameters( + [ + (0.25, 18), + (0.50, 12), + (0.75, 6), + ] + ) + def test_dataframe_iterator_with_validation_split( + self, validation_split, num_training + ): + 
tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + num_classes = 2 + + # save the images in the tmpdir + count = 0 + filenames = [] + filenames_without = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + filename_without = f"image-{count}" + filenames.append(filename) + filenames_without.append(filename_without) + im.save(os.path.join(tmpdir.full_path, filename)) + count += 1 + + df = pd.DataFrame( + { + "filename": filenames, + "class": [str(random.randint(0, 1)) for _ in filenames], + } + ) + # create iterator + generator = image.ImageDataGenerator(validation_split=validation_split) + df_sparse_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, class_mode="sparse" + ) + if np.isnan(next(df_sparse_iterator)[:][1]).any(): + raise ValueError("Invalid values.") + + with self.assertRaises(ValueError): + generator.flow_from_dataframe(df, tmpdir.full_path, subset="foo") + + train_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, subset="training" + ) + self.assertEqual(train_iterator.samples, num_training) + + valid_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, subset="validation" + ) + self.assertEqual(valid_iterator.samples, count - num_training) + + # check number of classes and images + self.assertLen(train_iterator.class_indices, num_classes) + self.assertLen(train_iterator.classes, num_training) + self.assertLen( + set(train_iterator.filenames) & set(filenames), num_training + ) + + def test_dataframe_iterator_with_custom_indexed_dataframe(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + num_classes = 2 + + # save the images in the tmpdir + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + filenames.append(filename) + im.save(os.path.join(tmpdir.full_path, filename)) + count += 1 + + # create dataframes + classes = np.random.randint(num_classes, size=len(filenames)) + classes = [str(c) for c in classes] + df = pd.DataFrame({"filename": filenames, "class": classes}) + df2 = pd.DataFrame( + {"filename": filenames, "class": classes}, + index=np.arange(1, len(filenames) + 1), + ) + df3 = pd.DataFrame( + {"filename": filenames, "class": classes}, index=filenames + ) + + # create iterators + seed = 1 + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, seed=seed + ) + df2_iterator = generator.flow_from_dataframe( + df2, tmpdir.full_path, seed=seed + ) + df3_iterator = generator.flow_from_dataframe( + df3, tmpdir.full_path, seed=seed + ) + + # Test all iterators return same pairs of arrays + for _ in range(len(filenames)): + a1, c1 = next(df_iterator) + a2, c2 = next(df2_iterator) + a3, c3 = next(df3_iterator) + self.assertAllEqual(a1, a2) + self.assertAllEqual(a1, a3) + self.assertAllEqual(c1, c2) + self.assertAllEqual(c1, c3) + + def test_dataframe_iterator_n(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + + # save the images in the tmpdir + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + filenames.append(filename) + im.save(os.path.join(tmpdir.full_path, filename)) + count += 1 + + # exclude first two items + n_files = len(filenames) + input_filenames = filenames[2:] + + # create dataframes + classes = np.random.randint(2, 
size=len(input_filenames)) + classes = [str(c) for c in classes] + df = pd.DataFrame({"filename": input_filenames}) + df2 = pd.DataFrame({"filename": input_filenames, "class": classes}) + + # create iterators + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, class_mode=None + ) + df2_iterator = generator.flow_from_dataframe( + df2, tmpdir.full_path, class_mode="binary" + ) + + # Test the number of items in iterators + self.assertEqual(df_iterator.n, n_files - 2) + self.assertEqual(df2_iterator.n, n_files - 2) + + def test_dataframe_iterator_absolute_path(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + + # save the images in the tmpdir + count = 0 + file_paths = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count:0>5}.png" + file_path = os.path.join(tmpdir.full_path, filename) + file_paths.append(file_path) + im.save(file_path) + count += 1 + + # prepare an image with a forbidden extension. + file_path_fbd = os.path.join(tmpdir.full_path, "image-forbid.fbd") + shutil.copy(file_path, file_path_fbd) + + # create dataframes + classes = np.random.randint(2, size=len(file_paths)) + classes = [str(c) for c in classes] + df = pd.DataFrame({"filename": file_paths}) + df2 = pd.DataFrame({"filename": file_paths, "class": classes}) + df3 = pd.DataFrame({"filename": ["image-not-exist.png"] + file_paths}) + df4 = pd.DataFrame({"filename": file_paths + [file_path_fbd]}) + + # create iterators + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe( + df, None, class_mode=None, shuffle=False, batch_size=1 + ) + df2_iterator = generator.flow_from_dataframe( + df2, None, class_mode="binary", shuffle=False, batch_size=1 + ) + df3_iterator = generator.flow_from_dataframe( + df3, None, class_mode=None, shuffle=False, batch_size=1 + ) + df4_iterator = generator.flow_from_dataframe( + df4, None, class_mode=None, shuffle=False, batch_size=1 + ) + + validation_split = 0.2 + generator_split = image.ImageDataGenerator( + validation_split=validation_split + ) + df_train_iterator = generator_split.flow_from_dataframe( + df, + None, + class_mode=None, + shuffle=False, + subset="training", + batch_size=1, + ) + df_val_iterator = generator_split.flow_from_dataframe( + df, + None, + class_mode=None, + shuffle=False, + subset="validation", + batch_size=1, + ) + + # Test the number of items in iterators + self.assertLen(file_paths, df_iterator.n) + self.assertLen(file_paths, df2_iterator.n) + self.assertLen(file_paths, df3_iterator.n) + self.assertLen(file_paths, df4_iterator.n) + self.assertEqual( + df_val_iterator.n, int(validation_split * len(file_paths)) + ) + self.assertLen(file_paths, df_train_iterator.n + df_val_iterator.n) + + # Test flow_from_dataframe + for i in range(len(file_paths)): + a1 = next(df_iterator) + a2, _ = next(df2_iterator) + a3 = next(df3_iterator) + a4 = next(df4_iterator) + + if i < df_val_iterator.n: + a5 = next(df_val_iterator) + else: + a5 = next(df_train_iterator) + + self.assertAllEqual(a1, a2) + self.assertAllEqual(a1, a3) + self.assertAllEqual(a1, a4) + self.assertAllEqual(a1, a5) + + def test_dataframe_iterator_with_subdirs(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + num_classes = 2 + + # create folders and subfolders + paths = [] + for cl in range(num_classes): + class_directory = f"class-{cl}" + classpaths = [ + class_directory, + 
os.path.join(class_directory, "subfolder-1"), + os.path.join(class_directory, "subfolder-2"), + os.path.join(class_directory, "subfolder-1", "sub-subfolder"), + ] + for path in classpaths: + os.mkdir(os.path.join(tmpdir, path)) + paths.append(classpaths) + + # save the images in the paths + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + # rotate image class + im_class = count % num_classes + # rotate subfolders + classpaths = paths[im_class] + filename = os.path.join( + classpaths[count % len(classpaths)], + f"image-{count}.png", + ) + filenames.append(filename) + im.save(os.path.join(tmpdir.full_path, filename)) + count += 1 + + # create dataframe + classes = np.random.randint(num_classes, size=len(filenames)) + classes = [str(c) for c in classes] + df = pd.DataFrame({"filename": filenames, "class": classes}) + + # create iterator + generator = image.ImageDataGenerator() + df_iterator = generator.flow_from_dataframe( + df, tmpdir.full_path, class_mode="binary" + ) + + # Test the number of items in iterator + self.assertLen(filenames, df_iterator.n) + self.assertEqual(set(df_iterator.filenames), set(filenames)) + + def test_dataframe_iterator_classes_indices_order(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + # save the images in the paths + count = 0 + filenames = [] + for test_images in all_test_images: + for im in test_images: + filename = f"image-{count}.png" + im.save(os.path.join(tmpdir.full_path, filename)) + filenames.append(filename) + count += 1 + + # Test the class_indices without classes input + generator = image.ImageDataGenerator() + label_opt = ["a", "b", ["a"], ["b"], ["a", "b"], ["b", "a"]] + df_f = pd.DataFrame( + { + "filename": filenames, + "class": ["a", "b"] + + [random.choice(label_opt) for _ in filenames[:-2]], + } + ) + flow_forward_iter = generator.flow_from_dataframe( + df_f, tmpdir.full_path + ) + label_rev = ["b", "a", ["b"], ["a"], ["b", "a"], ["a", "b"]] + df_r = pd.DataFrame( + { + "filename": filenames, + "class": ["b", "a"] + + [random.choice(label_rev) for _ in filenames[:-2]], + } + ) + flow_backward_iter = generator.flow_from_dataframe( + df_r, tmpdir.full_path + ) + + # check class_indices + self.assertEqual( + flow_forward_iter.class_indices, flow_backward_iter.class_indices + ) + + # Test the class_indices with classes input + generator_2 = image.ImageDataGenerator() + df_f2 = pd.DataFrame( + [["data/A.jpg", "A"], ["data/B.jpg", "B"]], + columns=["filename", "class"], + ) + flow_forward = generator_2.flow_from_dataframe( + df_f2, classes=["A", "B"] + ) + df_b2 = pd.DataFrame( + [["data/A.jpg", "A"], ["data/B.jpg", "B"]], + columns=["filename", "class"], + ) + flow_backward = generator_2.flow_from_dataframe( + df_b2, classes=["B", "A"] + ) + + # check class_indices + self.assertNotEqual( + flow_forward.class_indices, flow_backward.class_indices + ) @test_utils.run_v2_only class TestImageDataGenerator(test_combinations.TestCase): - - def test_image_data_generator(self): - all_test_images = _generate_test_images(include_rgba=True) - for test_images in all_test_images: - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - - image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=0.2, - 
channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True, - interpolation_order=1) - - def test_image_data_generator_with_validation_split(self): - all_test_images = _generate_test_images(include_rgba=True) - for test_images in all_test_images: - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - - images = np.vstack(img_list) - labels = np.concatenate( - [np.zeros((int(len(images) / 2),)), - np.ones((int(len(images) / 2),))]) - generator = image.ImageDataGenerator(validation_split=0.5) - - # training and validation sets would have different - # number of classes, because labels are sorted - with self.assertRaisesRegex( - ValueError, - 'Training and validation subsets have different number of classes'): - generator.flow( - images, labels, shuffle=False, batch_size=10, subset='validation') - - # test non categorical labels with validation split - generator.flow( - images, - labels, - shuffle=False, - batch_size=10, - ignore_class_split=True, - subset='validation') - - labels = np.concatenate([ - np.zeros((int(len(images) / 4),)), - np.ones((int(len(images) / 4),)), - np.zeros((int(len(images) / 4),)), - np.ones((int(len(images) / 4),)) - ]) - - seq = generator.flow( - images, labels, shuffle=False, batch_size=10, subset='validation') - - _, y = seq[0] - self.assertLen(np.unique(y), 2) - - seq = generator.flow( - images, labels, shuffle=False, batch_size=10, subset='training') - _, y2 = seq[0] - self.assertLen(np.unique(y2), 2) - - with self.assertRaises(ValueError): - generator.flow( - images, - np.arange(images.shape[0]), - shuffle=False, - batch_size=3, - subset='foo') - - def test_image_data_generator_with_split_value_error(self): - with self.assertRaises(ValueError): - image.ImageDataGenerator(validation_split=5) - - def test_image_data_generator_invalid_data(self): - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - data_format='channels_last') - # Test fit with invalid data - with self.assertRaises(ValueError): - x = np.random.random((3, 10, 10)) - generator.fit(x) - - # Test flow with invalid data - with self.assertRaises(ValueError): - x = np.random.random((32, 10, 10)) - generator.flow(np.arange(x.shape[0])) - - def test_image_data_generator_fit(self): - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=(0.2, 0.2), - channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True, - interpolation_order=1, - data_format='channels_last') - x = np.random.random((32, 10, 10, 3)) - generator.fit(x, augment=True) - # Test grayscale - x = np.random.random((32, 10, 10, 1)) - generator.fit(x) - # Test RBG - x = np.random.random((32, 10, 10, 3)) - generator.fit(x) - # Test more samples than dims - x = np.random.random((32, 4, 4, 1)) - generator.fit(x) - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - 
zoom_range=(0.2, 0.2), - channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True, - interpolation_order=1, - data_format='channels_first') - x = np.random.random((32, 10, 10, 3)) - generator.fit(x, augment=True) - # Test grayscale - x = np.random.random((32, 1, 10, 10)) - generator.fit(x) - # Test RBG - x = np.random.random((32, 3, 10, 10)) - generator.fit(x) - # Test more samples than dims - x = np.random.random((32, 1, 4, 4)) - generator.fit(x) - - def test_image_data_generator_flow(self): - tmpdir = self.create_tempdir() - all_test_images = _generate_test_images(include_rgba=True) - for test_images in all_test_images: - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - - images = np.vstack(img_list) - dsize = images.shape[0] - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=0.2, - channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True, - interpolation_order=1) - - generator.flow( - images, - np.arange(images.shape[0]), - shuffle=False, - save_to_dir=tmpdir.full_path, - batch_size=3) - - generator.flow( - images, - np.arange(images.shape[0]), - shuffle=False, - sample_weight=np.arange(images.shape[0]) + 1, - save_to_dir=tmpdir.full_path, - batch_size=3) - - # Test with `shuffle=True` - generator.flow( - images, - np.arange(images.shape[0]), - shuffle=True, - save_to_dir=tmpdir.full_path, - batch_size=3, - seed=42) - - # Test without y - generator.flow( - images, - None, - shuffle=True, - save_to_dir=tmpdir.full_path, - batch_size=3) - - # Test with a single miscellaneous input data array - x_misc1 = np.random.random(dsize) - generator.flow((images, x_misc1), - np.arange(dsize), - shuffle=False, - batch_size=2) - - # Test with two miscellaneous inputs - x_misc2 = np.random.random((dsize, 3, 3)) - generator.flow((images, [x_misc1, x_misc2]), - np.arange(dsize), - shuffle=False, - batch_size=2) - - # Test cases with `y = None` - generator.flow(images, None, batch_size=3) - generator.flow((images, x_misc1), None, batch_size=3, shuffle=False) - generator.flow((images, [x_misc1, x_misc2]), - None, - batch_size=3, - shuffle=False) - generator = image.ImageDataGenerator(validation_split=0.2) - generator.flow(images, batch_size=3) - - # Test some failure cases: - x_misc_err = np.random.random((dsize + 1, 3, 3)) - with self.assertRaisesRegex(ValueError, 'All of the arrays in'): - generator.flow((images, x_misc_err), np.arange(dsize), batch_size=3) - - with self.assertRaisesRegex(ValueError, - r'`x` \(images tensor\) and `y` \(labels\)'): - generator.flow((images, x_misc1), np.arange(dsize + 1), batch_size=3) - - # Test `flow` behavior as Sequence - generator.flow( - images, - np.arange(images.shape[0]), - shuffle=False, - save_to_dir=tmpdir.full_path, - batch_size=3) - - # Test with `shuffle=True` - generator.flow( - images, - np.arange(images.shape[0]), - shuffle=True, - save_to_dir=tmpdir.full_path, - batch_size=3, - seed=123) - - # test order_interpolation - labels = np.array([[2, 2, 0, 2, 2], [1, 3, 2, 3, 1], [2, 1, 0, 1, 2], - [3, 1, 0, 2, 0], [3, 1, 3, 2, 1]]) - - label_generator = image.ImageDataGenerator( - rotation_range=90., interpolation_order=0) - 
label_generator.flow(x=labels[np.newaxis, ..., np.newaxis], seed=123) - - def test_valid_args(self): - with self.assertRaises(ValueError): - image.ImageDataGenerator(brightness_range=0.1) - - def test_batch_standardize(self): - all_test_images = _generate_test_images(include_rgba=True) - # ImageDataGenerator.standardize should work on batches - for test_images in all_test_images: - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - - images = np.vstack(img_list) - generator = image.ImageDataGenerator( - featurewise_center=True, - samplewise_center=True, - featurewise_std_normalization=True, - samplewise_std_normalization=True, - zca_whitening=True, - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=0.2, - channel_shift_range=0., - brightness_range=(1, 5), - fill_mode='nearest', - cval=0.5, - horizontal_flip=True, - vertical_flip=True) - generator.fit(images, augment=True) - - transformed = np.copy(images) - for i, im in enumerate(transformed): - transformed[i] = generator.random_transform(im) - transformed = generator.standardize(transformed) - - def test_deterministic_transform(self): - x = np.ones((32, 32, 3)) - generator = image.ImageDataGenerator( - rotation_range=90, fill_mode='constant') - x = np.random.random((32, 32, 3)) - self.assertAllClose( - generator.apply_transform(x, {'flip_vertical': True}), x[::-1, :, :]) - self.assertAllClose( - generator.apply_transform(x, {'flip_horizontal': True}), x[:, ::-1, :]) - x = np.ones((3, 3, 3)) - x_rotated = np.array([[[0., 0., 0.], [1., 1., 1.], [0., 0., 0.]], - [[1., 1., 1.], [1., 1., 1.], [1., 1., 1.]], - [[0., 0., 0.], [1., 1., 1.], [0., 0., 0.]]]) - self.assertAllClose(generator.apply_transform(x, {'theta': 45}), x_rotated) - - def test_random_transforms(self): - x = np.random.random((2, 28, 28)) - # Test get_random_transform with predefined seed - seed = 1 - generator = image.ImageDataGenerator( - rotation_range=90., - width_shift_range=0.1, - height_shift_range=0.1, - shear_range=0.5, - zoom_range=0.2, - channel_shift_range=0.1, - brightness_range=(1, 5), - horizontal_flip=True, - vertical_flip=True) - transform_dict = generator.get_random_transform(x.shape, seed) - transform_dict2 = generator.get_random_transform(x.shape, seed * 2) - self.assertNotEqual(transform_dict['theta'], 0) - self.assertNotEqual(transform_dict['theta'], transform_dict2['theta']) - self.assertNotEqual(transform_dict['tx'], 0) - self.assertNotEqual(transform_dict['tx'], transform_dict2['tx']) - self.assertNotEqual(transform_dict['ty'], 0) - self.assertNotEqual(transform_dict['ty'], transform_dict2['ty']) - self.assertNotEqual(transform_dict['shear'], 0) - self.assertNotEqual(transform_dict['shear'], transform_dict2['shear']) - self.assertNotEqual(transform_dict['zx'], 0) - self.assertNotEqual(transform_dict['zx'], transform_dict2['zx']) - self.assertNotEqual(transform_dict['zy'], 0) - self.assertNotEqual(transform_dict['zy'], transform_dict2['zy']) - self.assertNotEqual(transform_dict['channel_shift_intensity'], 0) - self.assertNotEqual(transform_dict['channel_shift_intensity'], - transform_dict2['channel_shift_intensity']) - self.assertNotEqual(transform_dict['brightness'], 0) - self.assertNotEqual(transform_dict['brightness'], - transform_dict2['brightness']) - - # Test get_random_transform without any randomness - generator = image.ImageDataGenerator() - transform_dict = generator.get_random_transform(x.shape, seed) - self.assertEqual(transform_dict['theta'], 
0) - self.assertEqual(transform_dict['tx'], 0) - self.assertEqual(transform_dict['ty'], 0) - self.assertEqual(transform_dict['shear'], 0) - self.assertEqual(transform_dict['zx'], 1) - self.assertEqual(transform_dict['zy'], 1) - self.assertIsNone(transform_dict['channel_shift_intensity'], None) - self.assertIsNone(transform_dict['brightness'], None) - - def test_fit_rescale(self): - all_test_images = _generate_test_images(include_rgba=True) - rescale = 1. / 255 - - for test_images in all_test_images: - img_list = [] - for im in test_images: - img_list.append(image_utils.img_to_array(im)[None, ...]) - images = np.vstack(img_list) - - # featurewise_center test - generator = image.ImageDataGenerator( - rescale=rescale, featurewise_center=True, dtype='float64') - generator.fit(images) - batch = generator.flow(images, batch_size=8).next() - self.assertLess(abs(np.mean(batch)), 1e-6) - - # featurewise_std_normalization test - generator = image.ImageDataGenerator( - rescale=rescale, - featurewise_center=True, - featurewise_std_normalization=True, - dtype='float64') - generator.fit(images) - batch = generator.flow(images, batch_size=8).next() - self.assertLess(abs(np.mean(batch)), 1e-6) - self.assertLess(abs(1 - np.std(batch)), 1e-5) - - # zca_whitening test - generator = image.ImageDataGenerator( - rescale=rescale, - featurewise_center=True, - zca_whitening=True, - dtype='float64') - generator.fit(images) - batch = generator.flow(images, batch_size=8).next() - batch = np.reshape( - batch, - (batch.shape[0], batch.shape[1] * batch.shape[2] * batch.shape[3])) - # Y * Y_T = n * I, where Y = W * X - identity = np.dot(batch, batch.T) / batch.shape[0] - self.assertTrue( - ((np.abs(identity) - np.identity(identity.shape[0])) < 1e-6).all()) + def test_image_data_generator(self): + all_test_images = _generate_test_images(include_rgba=True) + for test_images in all_test_images: + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + + image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=0.2, + channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + interpolation_order=1, + ) + + def test_image_data_generator_with_validation_split(self): + all_test_images = _generate_test_images(include_rgba=True) + for test_images in all_test_images: + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + + images = np.vstack(img_list) + labels = np.concatenate( + [ + np.zeros((int(len(images) / 2),)), + np.ones((int(len(images) / 2),)), + ] + ) + generator = image.ImageDataGenerator(validation_split=0.5) + + # training and validation sets would have different + # number of classes, because labels are sorted + with self.assertRaisesRegex( + ValueError, + "Training and validation subsets have " + "different number of classes", + ): + generator.flow( + images, + labels, + shuffle=False, + batch_size=10, + subset="validation", + ) + + # test non categorical labels with validation split + generator.flow( + images, + labels, + shuffle=False, + batch_size=10, + ignore_class_split=True, + subset="validation", + ) + + labels = np.concatenate( + [ + np.zeros((int(len(images) / 4),)), + np.ones((int(len(images) / 4),)), + 
np.zeros((int(len(images) / 4),)), + np.ones((int(len(images) / 4),)), + ] + ) + + seq = generator.flow( + images, + labels, + shuffle=False, + batch_size=10, + subset="validation", + ) + + _, y = seq[0] + self.assertLen(np.unique(y), 2) + + seq = generator.flow( + images, labels, shuffle=False, batch_size=10, subset="training" + ) + _, y2 = seq[0] + self.assertLen(np.unique(y2), 2) + + with self.assertRaises(ValueError): + generator.flow( + images, + np.arange(images.shape[0]), + shuffle=False, + batch_size=3, + subset="foo", + ) + + def test_image_data_generator_with_split_value_error(self): + with self.assertRaises(ValueError): + image.ImageDataGenerator(validation_split=5) + + def test_image_data_generator_invalid_data(self): + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + data_format="channels_last", + ) + # Test fit with invalid data + with self.assertRaises(ValueError): + x = np.random.random((3, 10, 10)) + generator.fit(x) + + # Test flow with invalid data + with self.assertRaises(ValueError): + x = np.random.random((32, 10, 10)) + generator.flow(np.arange(x.shape[0])) + + def test_image_data_generator_fit(self): + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=(0.2, 0.2), + channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + interpolation_order=1, + data_format="channels_last", + ) + x = np.random.random((32, 10, 10, 3)) + generator.fit(x, augment=True) + # Test grayscale + x = np.random.random((32, 10, 10, 1)) + generator.fit(x) + # Test RGB + x = np.random.random((32, 10, 10, 3)) + generator.fit(x) + # Test more samples than dims + x = np.random.random((32, 4, 4, 1)) + generator.fit(x) + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=(0.2, 0.2), + channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + interpolation_order=1, + data_format="channels_first", + ) + x = np.random.random((32, 10, 10, 3)) + generator.fit(x, augment=True) + # Test grayscale + x = np.random.random((32, 1, 10, 10)) + generator.fit(x) + # Test RGB + x = np.random.random((32, 3, 10, 10)) + generator.fit(x) + # Test more samples than dims + x = np.random.random((32, 1, 4, 4)) + generator.fit(x) + + def test_image_data_generator_flow(self): + tmpdir = self.create_tempdir() + all_test_images = _generate_test_images(include_rgba=True) + for test_images in all_test_images: + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + + images = np.vstack(img_list) + dsize = images.shape[0] + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=0.2, +
channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + interpolation_order=1, + ) + + generator.flow( + images, + np.arange(images.shape[0]), + shuffle=False, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + + generator.flow( + images, + np.arange(images.shape[0]), + shuffle=False, + sample_weight=np.arange(images.shape[0]) + 1, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + + # Test with `shuffle=True` + generator.flow( + images, + np.arange(images.shape[0]), + shuffle=True, + save_to_dir=tmpdir.full_path, + batch_size=3, + seed=42, + ) + + # Test without y + generator.flow( + images, + None, + shuffle=True, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + + # Test with a single miscellaneous input data array + x_misc1 = np.random.random(dsize) + generator.flow( + (images, x_misc1), np.arange(dsize), shuffle=False, batch_size=2 + ) + + # Test with two miscellaneous inputs + x_misc2 = np.random.random((dsize, 3, 3)) + generator.flow( + (images, [x_misc1, x_misc2]), + np.arange(dsize), + shuffle=False, + batch_size=2, + ) + + # Test cases with `y = None` + generator.flow(images, None, batch_size=3) + generator.flow((images, x_misc1), None, batch_size=3, shuffle=False) + generator.flow( + (images, [x_misc1, x_misc2]), None, batch_size=3, shuffle=False + ) + generator = image.ImageDataGenerator(validation_split=0.2) + generator.flow(images, batch_size=3) + + # Test some failure cases: + x_misc_err = np.random.random((dsize + 1, 3, 3)) + with self.assertRaisesRegex(ValueError, "All of the arrays in"): + generator.flow( + (images, x_misc_err), np.arange(dsize), batch_size=3 + ) + + with self.assertRaisesRegex( + ValueError, r"`x` \(images tensor\) and `y` \(labels\)" + ): + generator.flow( + (images, x_misc1), np.arange(dsize + 1), batch_size=3 + ) + + # Test `flow` behavior as Sequence + generator.flow( + images, + np.arange(images.shape[0]), + shuffle=False, + save_to_dir=tmpdir.full_path, + batch_size=3, + ) + + # Test with `shuffle=True` + generator.flow( + images, + np.arange(images.shape[0]), + shuffle=True, + save_to_dir=tmpdir.full_path, + batch_size=3, + seed=123, + ) + + # test order_interpolation + labels = np.array( + [ + [2, 2, 0, 2, 2], + [1, 3, 2, 3, 1], + [2, 1, 0, 1, 2], + [3, 1, 0, 2, 0], + [3, 1, 3, 2, 1], + ] + ) + + label_generator = image.ImageDataGenerator( + rotation_range=90.0, interpolation_order=0 + ) + label_generator.flow(x=labels[np.newaxis, ..., np.newaxis], seed=123) + + def test_valid_args(self): + with self.assertRaises(ValueError): + image.ImageDataGenerator(brightness_range=0.1) + + def test_batch_standardize(self): + all_test_images = _generate_test_images(include_rgba=True) + # ImageDataGenerator.standardize should work on batches + for test_images in all_test_images: + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + + images = np.vstack(img_list) + generator = image.ImageDataGenerator( + featurewise_center=True, + samplewise_center=True, + featurewise_std_normalization=True, + samplewise_std_normalization=True, + zca_whitening=True, + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=0.2, + channel_shift_range=0.0, + brightness_range=(1, 5), + fill_mode="nearest", + cval=0.5, + horizontal_flip=True, + vertical_flip=True, + ) + generator.fit(images, augment=True) + + transformed = np.copy(images) + for i, im in enumerate(transformed): + transformed[i] = 
generator.random_transform(im) + transformed = generator.standardize(transformed) + + def test_deterministic_transform(self): + x = np.ones((32, 32, 3)) + generator = image.ImageDataGenerator( + rotation_range=90, fill_mode="constant" + ) + x = np.random.random((32, 32, 3)) + self.assertAllClose( + generator.apply_transform(x, {"flip_vertical": True}), x[::-1, :, :] + ) + self.assertAllClose( + generator.apply_transform(x, {"flip_horizontal": True}), + x[:, ::-1, :], + ) + x = np.ones((3, 3, 3)) + x_rotated = np.array( + [ + [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0]], + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0]], + ] + ) + self.assertAllClose( + generator.apply_transform(x, {"theta": 45}), x_rotated + ) + + def test_random_transforms(self): + x = np.random.random((2, 28, 28)) + # Test get_random_transform with predefined seed + seed = 1 + generator = image.ImageDataGenerator( + rotation_range=90.0, + width_shift_range=0.1, + height_shift_range=0.1, + shear_range=0.5, + zoom_range=0.2, + channel_shift_range=0.1, + brightness_range=(1, 5), + horizontal_flip=True, + vertical_flip=True, + ) + transform_dict = generator.get_random_transform(x.shape, seed) + transform_dict2 = generator.get_random_transform(x.shape, seed * 2) + self.assertNotEqual(transform_dict["theta"], 0) + self.assertNotEqual(transform_dict["theta"], transform_dict2["theta"]) + self.assertNotEqual(transform_dict["tx"], 0) + self.assertNotEqual(transform_dict["tx"], transform_dict2["tx"]) + self.assertNotEqual(transform_dict["ty"], 0) + self.assertNotEqual(transform_dict["ty"], transform_dict2["ty"]) + self.assertNotEqual(transform_dict["shear"], 0) + self.assertNotEqual(transform_dict["shear"], transform_dict2["shear"]) + self.assertNotEqual(transform_dict["zx"], 0) + self.assertNotEqual(transform_dict["zx"], transform_dict2["zx"]) + self.assertNotEqual(transform_dict["zy"], 0) + self.assertNotEqual(transform_dict["zy"], transform_dict2["zy"]) + self.assertNotEqual(transform_dict["channel_shift_intensity"], 0) + self.assertNotEqual( + transform_dict["channel_shift_intensity"], + transform_dict2["channel_shift_intensity"], + ) + self.assertNotEqual(transform_dict["brightness"], 0) + self.assertNotEqual( + transform_dict["brightness"], transform_dict2["brightness"] + ) + + # Test get_random_transform without any randomness + generator = image.ImageDataGenerator() + transform_dict = generator.get_random_transform(x.shape, seed) + self.assertEqual(transform_dict["theta"], 0) + self.assertEqual(transform_dict["tx"], 0) + self.assertEqual(transform_dict["ty"], 0) + self.assertEqual(transform_dict["shear"], 0) + self.assertEqual(transform_dict["zx"], 1) + self.assertEqual(transform_dict["zy"], 1) + self.assertIsNone(transform_dict["channel_shift_intensity"], None) + self.assertIsNone(transform_dict["brightness"], None) + + def test_fit_rescale(self): + all_test_images = _generate_test_images(include_rgba=True) + rescale = 1.0 / 255 + + for test_images in all_test_images: + img_list = [] + for im in test_images: + img_list.append(image_utils.img_to_array(im)[None, ...]) + images = np.vstack(img_list) + + # featurewise_center test + generator = image.ImageDataGenerator( + rescale=rescale, featurewise_center=True, dtype="float64" + ) + generator.fit(images) + batch = generator.flow(images, batch_size=8).next() + self.assertLess(abs(np.mean(batch)), 1e-6) + + # featurewise_std_normalization test + generator = image.ImageDataGenerator( + rescale=rescale, + 
featurewise_center=True, + featurewise_std_normalization=True, + dtype="float64", + ) + generator.fit(images) + batch = generator.flow(images, batch_size=8).next() + self.assertLess(abs(np.mean(batch)), 1e-6) + self.assertLess(abs(1 - np.std(batch)), 1e-5) + + # zca_whitening test + generator = image.ImageDataGenerator( + rescale=rescale, + featurewise_center=True, + zca_whitening=True, + dtype="float64", + ) + generator.fit(images) + batch = generator.flow(images, batch_size=8).next() + batch = np.reshape( + batch, + ( + batch.shape[0], + batch.shape[1] * batch.shape[2] * batch.shape[3], + ), + ) + # Y * Y_T = n * I, where Y = W * X + identity = np.dot(batch, batch.T) / batch.shape[0] + self.assertTrue( + ( + (np.abs(identity) - np.identity(identity.shape[0])) < 1e-6 + ).all() + ) @test_utils.run_v2_only class TestAffineTransformations(test_combinations.TestCase): - - def test_random_transforms(self): - x = np.random.random((2, 28, 28)) - self.assertEqual(image.random_rotation(x, 45).shape, (2, 28, 28)) - self.assertEqual(image.random_shift(x, 1, 1).shape, (2, 28, 28)) - self.assertEqual(image.random_shear(x, 20).shape, (2, 28, 28)) - self.assertEqual(image.random_channel_shift(x, 20).shape, (2, 28, 28)) - - def test_deterministic_transform(self): - x = np.ones((3, 3, 3)) - x_rotated = np.array([[[0., 0., 0.], [1., 1., 1.], [0., 0., 0.]], - [[1., 1., 1.], [1., 1., 1.], [1., 1., 1.]], - [[0., 0., 0.], [1., 1., 1.], [0., 0., 0.]]]) - self.assertAllClose( - image.apply_affine_transform( - x, - theta=45, - row_axis=0, - col_axis=1, - channel_axis=2, - fill_mode='constant'), x_rotated) - - def test_matrix_center(self): - x = np.expand_dims(np.array([ - [0, 1], - [0, 0], - ]), -1) - x_rotated90 = np.expand_dims(np.array([ - [1, 0], - [0, 0], - ]), -1) - - self.assertAllClose( - image.apply_affine_transform( - x, theta=90, row_axis=0, col_axis=1, channel_axis=2), x_rotated90) - - def test_translation(self): - x = np.array([ - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 0, 0], - ]) - x_up = np.array([ - [0, 1, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0], - ]) - x_dn = np.array([ - [0, 0, 0, 0], - [0, 0, 0, 0], - [0, 1, 0, 0], - ]) - x_left = np.array([ - [0, 0, 0, 0], - [1, 0, 0, 0], - [0, 0, 0, 0], - ]) - x_right = np.array([ - [0, 0, 0, 0], - [0, 0, 1, 0], - [0, 0, 0, 0], - ]) - - # Channels first - x_test = np.expand_dims(x, 0) - - # Horizontal translation - self.assertAllEqual(x_left, - np.squeeze(image.apply_affine_transform(x_test, tx=1))) - self.assertAllEqual(x_right, - np.squeeze(image.apply_affine_transform(x_test, tx=-1))) - - # change axes: x<->y - self.assertAllEqual( - x_left, - np.squeeze( - image.apply_affine_transform(x_test, ty=1, row_axis=2, col_axis=1))) - self.assertAllEqual( - x_right, - np.squeeze( - image.apply_affine_transform(x_test, ty=-1, row_axis=2, - col_axis=1))) - - # Vertical translation - self.assertAllEqual(x_up, - np.squeeze(image.apply_affine_transform(x_test, ty=1))) - self.assertAllEqual(x_dn, - np.squeeze(image.apply_affine_transform(x_test, ty=-1))) - - # change axes: x<->y - self.assertAllEqual( - x_up, - np.squeeze( - image.apply_affine_transform(x_test, tx=1, row_axis=2, col_axis=1))) - self.assertAllEqual( - x_dn, - np.squeeze( - image.apply_affine_transform(x_test, tx=-1, row_axis=2, - col_axis=1))) - - # Channels last - x_test = np.expand_dims(x, -1) - - # Horizontal translation - self.assertAllEqual( - x_left, - np.squeeze( - image.apply_affine_transform( - x_test, tx=1, row_axis=0, col_axis=1, channel_axis=2))) - self.assertAllEqual( - x_right, - np.squeeze( 
- image.apply_affine_transform( - x_test, tx=-1, row_axis=0, col_axis=1, channel_axis=2))) - - # change axes: x<->y - self.assertAllEqual( - x_left, - np.squeeze( - image.apply_affine_transform( - x_test, ty=1, row_axis=1, col_axis=0, channel_axis=2))) - self.assertAllEqual( - x_right, - np.squeeze( - image.apply_affine_transform( - x_test, ty=-1, row_axis=1, col_axis=0, channel_axis=2))) - - # Vertical translation - self.assertAllEqual( - x_up, - np.squeeze( - image.apply_affine_transform( - x_test, ty=1, row_axis=0, col_axis=1, channel_axis=2))) - self.assertAllEqual( - x_dn, - np.squeeze( - image.apply_affine_transform( - x_test, ty=-1, row_axis=0, col_axis=1, channel_axis=2))) - - # change axes: x<->y - self.assertAllEqual( - x_up, - np.squeeze( + def test_random_transforms(self): + x = np.random.random((2, 28, 28)) + self.assertEqual(image.random_rotation(x, 45).shape, (2, 28, 28)) + self.assertEqual(image.random_shift(x, 1, 1).shape, (2, 28, 28)) + self.assertEqual(image.random_shear(x, 20).shape, (2, 28, 28)) + self.assertEqual(image.random_channel_shift(x, 20).shape, (2, 28, 28)) + + def test_deterministic_transform(self): + x = np.ones((3, 3, 3)) + x_rotated = np.array( + [ + [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0]], + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0]], + ] + ) + self.assertAllClose( image.apply_affine_transform( - x_test, tx=1, row_axis=1, col_axis=0, channel_axis=2))) - self.assertAllEqual( - x_dn, - np.squeeze( + x, + theta=45, + row_axis=0, + col_axis=1, + channel_axis=2, + fill_mode="constant", + ), + x_rotated, + ) + + def test_matrix_center(self): + x = np.expand_dims( + np.array( + [ + [0, 1], + [0, 0], + ] + ), + -1, + ) + x_rotated90 = np.expand_dims( + np.array( + [ + [1, 0], + [0, 0], + ] + ), + -1, + ) + + self.assertAllClose( image.apply_affine_transform( - x_test, tx=-1, row_axis=1, col_axis=0, channel_axis=2))) - - def test_random_zoom(self): - x = np.random.random((2, 28, 28)) - self.assertEqual(image.random_zoom(x, (5, 5)).shape, (2, 28, 28)) - self.assertAllClose(x, image.random_zoom(x, (1, 1))) - - def test_random_zoom_error(self): - with self.assertRaises(ValueError): - image.random_zoom(0, zoom_range=[0]) - - def test_random_brightness_error(self): - with self.assertRaises(ValueError): - image.random_brightness(0, [0]) - - def test_random_brightness_scale(self): - img = np.ones((1, 1, 3)) * 128 - zeros = np.zeros((1, 1, 3)) - must_be_128 = image.random_brightness(img, [1, 1], False) - self.assertAllEqual(img, must_be_128) - must_be_0 = image.random_brightness(img, [1, 1], True) - self.assertAllEqual(zeros, must_be_0) - - def test_random_brightness_scale_outside_range_positive(self): - img = np.ones((1, 1, 3)) * 1024 - zeros = np.zeros((1, 1, 3)) - must_be_1024 = image.random_brightness(img, [1, 1], False) - self.assertAllEqual(img, must_be_1024) - must_be_0 = image.random_brightness(img, [1, 1], True) - self.assertAllEqual(zeros, must_be_0) - - def test_random_brightness_scale_outside_range_negative(self): - img = np.ones((1, 1, 3)) * -1024 - zeros = np.zeros((1, 1, 3)) - must_be_neg_1024 = image.random_brightness(img, [1, 1], False) - self.assertAllEqual(img, must_be_neg_1024) - must_be_0 = image.random_brightness(img, [1, 1], True) - self.assertAllEqual(zeros, must_be_0) - - -if __name__ == '__main__': - tf.test.main() + x, theta=90, row_axis=0, col_axis=1, channel_axis=2 + ), + x_rotated90, + ) + + def test_translation(self): + x = np.array( + [ + [0, 0, 0, 0], + [0, 1, 0, 
0], + [0, 0, 0, 0], + ] + ) + x_up = np.array( + [ + [0, 1, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + ] + ) + x_dn = np.array( + [ + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 1, 0, 0], + ] + ) + x_left = np.array( + [ + [0, 0, 0, 0], + [1, 0, 0, 0], + [0, 0, 0, 0], + ] + ) + x_right = np.array( + [ + [0, 0, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 0], + ] + ) + + # Channels first + x_test = np.expand_dims(x, 0) + + # Horizontal translation + self.assertAllEqual( + x_left, np.squeeze(image.apply_affine_transform(x_test, tx=1)) + ) + self.assertAllEqual( + x_right, np.squeeze(image.apply_affine_transform(x_test, tx=-1)) + ) + + # change axes: x<->y + self.assertAllEqual( + x_left, + np.squeeze( + image.apply_affine_transform( + x_test, ty=1, row_axis=2, col_axis=1 + ) + ), + ) + self.assertAllEqual( + x_right, + np.squeeze( + image.apply_affine_transform( + x_test, ty=-1, row_axis=2, col_axis=1 + ) + ), + ) + + # Vertical translation + self.assertAllEqual( + x_up, np.squeeze(image.apply_affine_transform(x_test, ty=1)) + ) + self.assertAllEqual( + x_dn, np.squeeze(image.apply_affine_transform(x_test, ty=-1)) + ) + + # change axes: x<->y + self.assertAllEqual( + x_up, + np.squeeze( + image.apply_affine_transform( + x_test, tx=1, row_axis=2, col_axis=1 + ) + ), + ) + self.assertAllEqual( + x_dn, + np.squeeze( + image.apply_affine_transform( + x_test, tx=-1, row_axis=2, col_axis=1 + ) + ), + ) + + # Channels last + x_test = np.expand_dims(x, -1) + + # Horizontal translation + self.assertAllEqual( + x_left, + np.squeeze( + image.apply_affine_transform( + x_test, tx=1, row_axis=0, col_axis=1, channel_axis=2 + ) + ), + ) + self.assertAllEqual( + x_right, + np.squeeze( + image.apply_affine_transform( + x_test, tx=-1, row_axis=0, col_axis=1, channel_axis=2 + ) + ), + ) + + # change axes: x<->y + self.assertAllEqual( + x_left, + np.squeeze( + image.apply_affine_transform( + x_test, ty=1, row_axis=1, col_axis=0, channel_axis=2 + ) + ), + ) + self.assertAllEqual( + x_right, + np.squeeze( + image.apply_affine_transform( + x_test, ty=-1, row_axis=1, col_axis=0, channel_axis=2 + ) + ), + ) + + # Vertical translation + self.assertAllEqual( + x_up, + np.squeeze( + image.apply_affine_transform( + x_test, ty=1, row_axis=0, col_axis=1, channel_axis=2 + ) + ), + ) + self.assertAllEqual( + x_dn, + np.squeeze( + image.apply_affine_transform( + x_test, ty=-1, row_axis=0, col_axis=1, channel_axis=2 + ) + ), + ) + + # change axes: x<->y + self.assertAllEqual( + x_up, + np.squeeze( + image.apply_affine_transform( + x_test, tx=1, row_axis=1, col_axis=0, channel_axis=2 + ) + ), + ) + self.assertAllEqual( + x_dn, + np.squeeze( + image.apply_affine_transform( + x_test, tx=-1, row_axis=1, col_axis=0, channel_axis=2 + ) + ), + ) + + def test_random_zoom(self): + x = np.random.random((2, 28, 28)) + self.assertEqual(image.random_zoom(x, (5, 5)).shape, (2, 28, 28)) + self.assertAllClose(x, image.random_zoom(x, (1, 1))) + + def test_random_zoom_error(self): + with self.assertRaises(ValueError): + image.random_zoom(0, zoom_range=[0]) + + def test_random_brightness_error(self): + with self.assertRaises(ValueError): + image.random_brightness(0, [0]) + + def test_random_brightness_scale(self): + img = np.ones((1, 1, 3)) * 128 + zeros = np.zeros((1, 1, 3)) + must_be_128 = image.random_brightness(img, [1, 1], False) + self.assertAllEqual(img, must_be_128) + must_be_0 = image.random_brightness(img, [1, 1], True) + self.assertAllEqual(zeros, must_be_0) + + def test_random_brightness_scale_outside_range_positive(self): + img = np.ones((1, 1, 3)) * 
1024 + zeros = np.zeros((1, 1, 3)) + must_be_1024 = image.random_brightness(img, [1, 1], False) + self.assertAllEqual(img, must_be_1024) + must_be_0 = image.random_brightness(img, [1, 1], True) + self.assertAllEqual(zeros, must_be_0) + + def test_random_brightness_scale_outside_range_negative(self): + img = np.ones((1, 1, 3)) * -1024 + zeros = np.zeros((1, 1, 3)) + must_be_neg_1024 = image.random_brightness(img, [1, 1], False) + self.assertAllEqual(img, must_be_neg_1024) + must_be_0 = image.random_brightness(img, [1, 1], True) + self.assertAllEqual(zeros, must_be_0) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/preprocessing/sequence.py b/keras/preprocessing/sequence.py index f5f686614a1f..25569118718b 100644 --- a/keras/preprocessing/sequence.py +++ b/keras/preprocessing/sequence.py @@ -20,349 +20,366 @@ with sequences. See the [tf.data guide](https://www.tensorflow.org/guide/data) for more details. """ -# pylint: disable=invalid-name -# pylint: disable=g-classes-have-attributes import json import random -from keras.utils import data_utils import numpy as np +from keras.utils import data_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export def _remove_long_seq(maxlen, seq, label): - """Removes sequences that exceed the maximum length. + """Removes sequences that exceed the maximum length. - Args: - maxlen: Int, maximum length of the output sequences. - seq: List of lists, where each sublist is a sequence. - label: List where each element is an integer. + Args: + maxlen: Int, maximum length of the output sequences. + seq: List of lists, where each sublist is a sequence. + label: List where each element is an integer. - Returns: - new_seq, new_label: shortened lists for `seq` and `label`. - """ - new_seq, new_label = [], [] - for x, y in zip(seq, label): - if len(x) < maxlen: - new_seq.append(x) - new_label.append(y) - return new_seq, new_label + Returns: + new_seq, new_label: shortened lists for `seq` and `label`. + """ + new_seq, new_label = [], [] + for x, y in zip(seq, label): + if len(x) < maxlen: + new_seq.append(x) + new_label.append(y) + return new_seq, new_label -@keras_export('keras.preprocessing.sequence.TimeseriesGenerator') +@keras_export("keras.preprocessing.sequence.TimeseriesGenerator") class TimeseriesGenerator(data_utils.Sequence): - """Utility class for generating batches of temporal data. - - Deprecated: `tf.keras.preprocessing.sequence.TimeseriesGenerator` does not - operate on tensors and is not recommended for new code. Prefer using a - `tf.data.Dataset` which provides a more efficient and flexible mechanism for - batching, shuffling, and windowing input. See the - [tf.data guide](https://www.tensorflow.org/guide/data) for more details. - - This class takes in a sequence of data-points gathered at - equal intervals, along with time series parameters such as - stride, length of history, etc., to produce batches for - training/validation. - - Arguments: - data: Indexable generator (such as list or Numpy array) - containing consecutive data points (timesteps). - The data should be at 2D, and axis 0 is expected - to be the time dimension. - targets: Targets corresponding to timesteps in `data`. - It should have same length as `data`. - length: Length of the output sequences (in number of timesteps). - sampling_rate: Period between successive individual timesteps - within sequences. For rate `r`, timesteps - `data[i]`, `data[i-r]`, ... `data[i - length]` - are used for create a sample sequence. 
- stride: Period between successive output sequences. - For stride `s`, consecutive output samples would - be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc. - start_index: Data points earlier than `start_index` will not be used - in the output sequences. This is useful to reserve part of the - data for test or validation. - end_index: Data points later than `end_index` will not be used - in the output sequences. This is useful to reserve part of the - data for test or validation. - shuffle: Whether to shuffle output samples, - or instead draw them in chronological order. - reverse: Boolean: if `true`, timesteps in each output sample will be - in reverse chronological order. - batch_size: Number of timeseries samples in each batch - (except maybe the last one). - - Returns: - A [Sequence]( - https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) - instance. - - Examples: - ```python - from keras.preprocessing.sequence import TimeseriesGenerator - import numpy as np - data = np.array([[i] for i in range(50)]) - targets = np.array([[i] for i in range(50)]) - data_gen = TimeseriesGenerator(data, targets, - length=10, sampling_rate=2, - batch_size=2) - assert len(data_gen) == 20 - batch_0 = data_gen[0] - x, y = batch_0 - assert np.array_equal(x, - np.array([[[0], [2], [4], [6], [8]], - [[1], [3], [5], [7], [9]]])) - assert np.array_equal(y, - np.array([[10], [11]])) - ``` - """ - - def __init__(self, - data, - targets, - length, - sampling_rate=1, - stride=1, - start_index=0, - end_index=None, - shuffle=False, - reverse=False, - batch_size=128): - - if len(data) != len(targets): - raise ValueError('Data and targets have to be' + ' of same length. ' - 'Data length is {}'.format(len(data)) + - ' while target length is {}'.format(len(targets))) - - self.data = data - self.targets = targets - self.length = length - self.sampling_rate = sampling_rate - self.stride = stride - self.start_index = start_index + length - if end_index is None: - end_index = len(data) - 1 - self.end_index = end_index - self.shuffle = shuffle - self.reverse = reverse - self.batch_size = batch_size - - if self.start_index > self.end_index: - raise ValueError('`start_index+length=%i > end_index=%i` ' - 'is disallowed, as no part of the sequence ' - 'would be left to be used as current step.' % - (self.start_index, self.end_index)) - - def __len__(self): - return (self.end_index - self.start_index + - self.batch_size * self.stride) // ( - self.batch_size * self.stride) - - def __getitem__(self, index): - if self.shuffle: - rows = np.random.randint( - self.start_index, self.end_index + 1, size=self.batch_size) - else: - i = self.start_index + self.batch_size * self.stride * index - rows = np.arange( - i, min(i + self.batch_size * self.stride, self.end_index + 1), - self.stride) - - samples = np.array( - [self.data[row - self.length:row:self.sampling_rate] for row in rows]) - targets = np.array([self.targets[row] for row in rows]) - - if self.reverse: - return samples[:, ::-1, ...], targets - return samples, targets - - def get_config(self): - """Returns the TimeseriesGenerator configuration as Python dictionary. + """Utility class for generating batches of temporal data. + + Deprecated: `tf.keras.preprocessing.sequence.TimeseriesGenerator` does not + operate on tensors and is not recommended for new code. Prefer using a + `tf.data.Dataset` which provides a more efficient and flexible mechanism for + batching, shuffling, and windowing input. 
See the + [tf.data guide](https://www.tensorflow.org/guide/data) for more details. + + This class takes in a sequence of data-points gathered at + equal intervals, along with time series parameters such as + stride, length of history, etc., to produce batches for + training/validation. + + Arguments: + data: Indexable generator (such as list or Numpy array) + containing consecutive data points (timesteps). + The data should be 2D, and axis 0 is expected + to be the time dimension. + targets: Targets corresponding to timesteps in `data`. + It should have the same length as `data`. + length: Length of the output sequences (in number of timesteps). + sampling_rate: Period between successive individual timesteps + within sequences. For rate `r`, timesteps + `data[i - length]`, `data[i - length + r]`, ... `data[i - r]` + are used to create a sample sequence. + stride: Period between successive output sequences. + For stride `s`, consecutive output samples would + be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc. + start_index: Data points earlier than `start_index` will not be used + in the output sequences. This is useful to reserve part of the + data for test or validation. + end_index: Data points later than `end_index` will not be used + in the output sequences. This is useful to reserve part of the + data for test or validation. + shuffle: Whether to shuffle output samples, + or instead draw them in chronological order. + reverse: Boolean: if `True`, timesteps in each output sample will be + in reverse chronological order. + batch_size: Number of timeseries samples in each batch + (except maybe the last one). Returns: - A Python dictionary with the TimeseriesGenerator configuration. + A [Sequence]( + https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) + instance. + + Examples: + ```python + from keras.preprocessing.sequence import TimeseriesGenerator + import numpy as np + data = np.array([[i] for i in range(50)]) + targets = np.array([[i] for i in range(50)]) + data_gen = TimeseriesGenerator(data, targets, + length=10, sampling_rate=2, + batch_size=2) + assert len(data_gen) == 20 + batch_0 = data_gen[0] + x, y = batch_0 + assert np.array_equal(x, + np.array([[[0], [2], [4], [6], [8]], + [[1], [3], [5], [7], [9]]])) + assert np.array_equal(y, + np.array([[10], [11]])) + ``` """ + + def __init__( + self, + data, + targets, + length, + sampling_rate=1, + stride=1, + start_index=0, + end_index=None, + shuffle=False, + reverse=False, + batch_size=128, + ): + + if len(data) != len(targets): + raise ValueError( + "Data and targets have to be" + + f" of same length. 
Data length is {len(data)}" + + f" while target length is {len(targets)}" + ) + + self.data = data + self.targets = targets + self.length = length + self.sampling_rate = sampling_rate + self.stride = stride + self.start_index = start_index + length + if end_index is None: + end_index = len(data) - 1 + self.end_index = end_index + self.shuffle = shuffle + self.reverse = reverse + self.batch_size = batch_size + + if self.start_index > self.end_index: + raise ValueError( + "`start_index+length=%i > end_index=%i` " + "is disallowed, as no part of the sequence " + "would be left to be used as current step." + % (self.start_index, self.end_index) + ) + + def __len__(self): + return ( + self.end_index - self.start_index + self.batch_size * self.stride + ) // (self.batch_size * self.stride) + + def __getitem__(self, index): + if self.shuffle: + rows = np.random.randint( + self.start_index, self.end_index + 1, size=self.batch_size + ) + else: + i = self.start_index + self.batch_size * self.stride * index + rows = np.arange( + i, + min(i + self.batch_size * self.stride, self.end_index + 1), + self.stride, + ) + + samples = np.array( + [ + self.data[row - self.length : row : self.sampling_rate] + for row in rows + ] + ) + targets = np.array([self.targets[row] for row in rows]) + + if self.reverse: + return samples[:, ::-1, ...], targets + return samples, targets + + def get_config(self): + """Returns the TimeseriesGenerator configuration as Python dictionary. + + Returns: + A Python dictionary with the TimeseriesGenerator configuration. + """ + data = self.data + if type(self.data).__module__ == np.__name__: + data = self.data.tolist() + try: + json_data = json.dumps(data) + except TypeError as e: + raise TypeError("Data not JSON Serializable:", data) from e + + targets = self.targets + if type(self.targets).__module__ == np.__name__: + targets = self.targets.tolist() + try: + json_targets = json.dumps(targets) + except TypeError as e: + raise TypeError("Targets not JSON Serializable:", targets) from e + + return { + "data": json_data, + "targets": json_targets, + "length": self.length, + "sampling_rate": self.sampling_rate, + "stride": self.stride, + "start_index": self.start_index, + "end_index": self.end_index, + "shuffle": self.shuffle, + "reverse": self.reverse, + "batch_size": self.batch_size, + } + + def to_json(self, **kwargs): + """Returns a JSON string containing the generator's configuration. + + Args: + **kwargs: Additional keyword arguments to be passed + to `json.dumps()`. + + Returns: + A JSON string containing the timeseries generator configuration. + """ + config = self.get_config() + timeseries_generator_config = { + "class_name": self.__class__.__name__, + "config": config, + } + return json.dumps(timeseries_generator_config, **kwargs) + + +@keras_export("keras.preprocessing.sequence.make_sampling_table") +def make_sampling_table(size, sampling_factor=1e-5): + """Generates a word rank-based probabilistic sampling table. + + Used for generating the `sampling_table` argument for `skipgrams`. + `sampling_table[i]` is the probability of sampling + the i-th most common word in a dataset + (more common words should be sampled less frequently, for balance).
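As a concrete check of those probabilities: the values below are the ones `sequence_test.py`, later in this patch, asserts for `make_sampling_table(3)` with `rtol=0.1`. A minimal sketch, assuming only the import path used throughout this diff:

```python
import numpy as np

from keras.preprocessing.sequence import make_sampling_table

# Rank 0 is clamped to rank 1 inside make_sampling_table, so the first
# two entries coincide; with the default sampling_factor=1e-5 the
# Zipf-based estimate gives roughly these keep-probabilities.
table = make_sampling_table(3)
np.testing.assert_allclose(
    table, [0.00315225, 0.00315225, 0.00547597], rtol=0.1
)
```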
+ + The sampling probabilities are generated according + to the sampling distribution used in word2vec: + + ``` + p(word) = (min(1, sqrt(word_frequency / sampling_factor) / + (word_frequency / sampling_factor))) + ``` + + We assume that the word frequencies follow Zipf's law (s=1) to derive + a numerical approximation of frequency(rank): + + `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))` + where `gamma` is the Euler-Mascheroni constant. Args: - **kwargs: Additional keyword arguments - to be passed to `json.dumps()`. + size: Int, number of possible words to sample. + sampling_factor: The sampling factor in the word2vec formula. + Returns: - A JSON string containing the tokenizer configuration. + A 1D Numpy array of length `size` where the ith entry + is the probability that a word of rank i should be sampled. """ - config = self.get_config() - timeseries_generator_config = { - 'class_name': self.__class__.__name__, - 'config': config - } - return json.dumps(timeseries_generator_config, **kwargs) + gamma = 0.577 + rank = np.arange(size) + rank[0] = 1 + inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1.0 / (12.0 * rank) + f = sampling_factor * inv_fq + + return np.minimum(1.0, f / np.sqrt(f)) + + +@keras_export("keras.preprocessing.sequence.skipgrams") +def skipgrams( + sequence, + vocabulary_size, + window_size=4, + negative_samples=1.0, + shuffle=True, + categorical=False, + sampling_table=None, + seed=None, +): + """Generates skipgram word pairs. + + This function transforms a sequence of word indexes (list of integers) + into tuples of words of the form: + + - (word, word in the same window), with label 1 (positive samples). + - (word, random word from the vocabulary), with label 0 (negative samples). + + Read more about Skipgram in this gnomic paper by Mikolov et al.: + [Efficient Estimation of Word Representations in + Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf) + Args: + sequence: A word sequence (sentence), encoded as a list + of word indices (integers). If using a `sampling_table`, + word indices are expected to match the rank + of the words in a reference dataset (e.g. 10 would encode + the 10-th most frequently occurring token). + Note that index 0 is expected to be a non-word and will be skipped. + vocabulary_size: Int, maximum possible word index + 1 + window_size: Int, size of sampling windows (technically half-window). + The window of a word `w_i` will be + `[i - window_size, i + window_size+1]`. + negative_samples: Float >= 0. 0 for no negative (i.e. random) samples. + 1 for same number as positive samples. + shuffle: Whether to shuffle the word couples before returning them. + categorical: bool. if False, labels will be + integers (eg. `[0, 1, 1 .. ]`), + if `True`, labels will be categorical, e.g. + `[[1,0],[0,1],[0,1] .. ]`. + sampling_table: 1D array of size `vocabulary_size` where the entry i + encodes the probability to sample a word of rank i. + seed: Random seed. -@keras_export('keras.preprocessing.sequence.make_sampling_table') -def make_sampling_table(size, sampling_factor=1e-5): - """Generates a word rank-based probabilistic sampling table. - - Used for generating the `sampling_table` argument for `skipgrams`. - `sampling_table[i]` is the probability of sampling - the word i-th most common word in a dataset - (more common words should be sampled less frequently, for balance). 
- - The sampling probabilities are generated according - to the sampling distribution used in word2vec: - - ``` - p(word) = (min(1, sqrt(word_frequency / sampling_factor) / - (word_frequency / sampling_factor))) - ``` - - We assume that the word frequencies follow Zipf's law (s=1) to derive - a numerical approximation of frequency(rank): - - `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))` - where `gamma` is the Euler-Mascheroni constant. - - Args: - size: Int, number of possible words to sample. - sampling_factor: The sampling factor in the word2vec formula. - - Returns: - A 1D Numpy array of length `size` where the ith entry - is the probability that a word of rank i should be sampled. - """ - gamma = 0.577 - rank = np.arange(size) - rank[0] = 1 - inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank) - f = sampling_factor * inv_fq - - return np.minimum(1., f / np.sqrt(f)) - - -@keras_export('keras.preprocessing.sequence.skipgrams') -def skipgrams(sequence, - vocabulary_size, - window_size=4, - negative_samples=1., - shuffle=True, - categorical=False, - sampling_table=None, - seed=None): - """Generates skipgram word pairs. - - This function transforms a sequence of word indexes (list of integers) - into tuples of words of the form: - - - (word, word in the same window), with label 1 (positive samples). - - (word, random word from the vocabulary), with label 0 (negative samples). - - Read more about Skipgram in this gnomic paper by Mikolov et al.: - [Efficient Estimation of Word Representations in - Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf) - - Args: - sequence: A word sequence (sentence), encoded as a list - of word indices (integers). If using a `sampling_table`, - word indices are expected to match the rank - of the words in a reference dataset (e.g. 10 would encode - the 10-th most frequently occurring token). - Note that index 0 is expected to be a non-word and will be skipped. - vocabulary_size: Int, maximum possible word index + 1 - window_size: Int, size of sampling windows (technically half-window). - The window of a word `w_i` will be - `[i - window_size, i + window_size+1]`. - negative_samples: Float >= 0. 0 for no negative (i.e. random) samples. - 1 for same number as positive samples. - shuffle: Whether to shuffle the word couples before returning them. - categorical: bool. if False, labels will be - integers (eg. `[0, 1, 1 .. ]`), - if `True`, labels will be categorical, e.g. - `[[1,0],[0,1],[0,1] .. ]`. - sampling_table: 1D array of size `vocabulary_size` where the entry i - encodes the probability to sample a word of rank i. - seed: Random seed. - - Returns: - couples, labels: where `couples` are int pairs and - `labels` are either 0 or 1. - - Note: - By convention, index 0 in the vocabulary is - a non-word and will be skipped. - """ - couples = [] - labels = [] - for i, wi in enumerate(sequence): - if not wi: - continue - if sampling_table is not None: - if sampling_table[wi] < random.random(): - continue - - window_start = max(0, i - window_size) - window_end = min(len(sequence), i + window_size + 1) - for j in range(window_start, window_end): - if j != i: - wj = sequence[j] - if not wj: - continue - couples.append([wi, wj]) + Returns: + couples, labels: where `couples` are int pairs and + `labels` are either 0 or 1. + + Note: + By convention, index 0 in the vocabulary is + a non-word and will be skipped. 
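Before the implementation, a short sketch of the couple/label bookkeeping described above (illustrative only, not part of the patch):

```python
from keras.preprocessing.sequence import skipgrams

# For [1, 2, 3] the default window_size=4 covers the whole sequence,
# so every ordered pair of distinct words is a positive couple:
# 3 * 2 = 6 of them. The default negative_samples=1.0 then appends an
# equal number of random label-0 couples drawn from the vocabulary.
couples, labels = skipgrams([1, 2, 3], vocabulary_size=4, shuffle=False)
assert len(couples) == len(labels) == 12
assert labels[:6] == [1] * 6 and labels[6:] == [0] * 6
```

With `shuffle=True` (the default) the function reuses one seed for two `random.shuffle` calls, which applies the same permutation to `couples` and `labels` and so keeps each couple aligned with its label.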
+ """ + couples = [] + labels = [] + for i, wi in enumerate(sequence): + if not wi: + continue + if sampling_table is not None: + if sampling_table[wi] < random.random(): + continue + + window_start = max(0, i - window_size) + window_end = min(len(sequence), i + window_size + 1) + for j in range(window_start, window_end): + if j != i: + wj = sequence[j] + if not wj: + continue + couples.append([wi, wj]) + if categorical: + labels.append([0, 1]) + else: + labels.append(1) + + if negative_samples > 0: + num_negative_samples = int(len(labels) * negative_samples) + words = [c[0] for c in couples] + random.shuffle(words) + + couples += [ + [words[i % len(words)], random.randint(1, vocabulary_size - 1)] + for i in range(num_negative_samples) + ] if categorical: - labels.append([0, 1]) + labels += [[1, 0]] * num_negative_samples else: - labels.append(1) - - if negative_samples > 0: - num_negative_samples = int(len(labels) * negative_samples) - words = [c[0] for c in couples] - random.shuffle(words) - - couples += [[words[i % len(words)], - random.randint(1, vocabulary_size - 1)] - for i in range(num_negative_samples)] - if categorical: - labels += [[1, 0]] * num_negative_samples - else: - labels += [0] * num_negative_samples - - if shuffle: - if seed is None: - seed = random.randint(0, 10e6) - random.seed(seed) - random.shuffle(couples) - random.seed(seed) - random.shuffle(labels) - - return couples, labels + labels += [0] * num_negative_samples + + if shuffle: + if seed is None: + seed = random.randint(0, 10e6) + random.seed(seed) + random.shuffle(couples) + random.seed(seed) + random.shuffle(labels) + + return couples, labels diff --git a/keras/preprocessing/sequence_test.py b/keras/preprocessing/sequence_test.py index b34fc082801e..a5b2637efcc8 100644 --- a/keras/preprocessing/sequence_test.py +++ b/keras/preprocessing/sequence_test.py @@ -16,178 +16,222 @@ import math -from keras.preprocessing import sequence import numpy as np import tensorflow.compat.v2 as tf +from keras.preprocessing import sequence + class TestSequence(tf.test.TestCase): - - def test_make_sampling_table(self): - a = sequence.make_sampling_table(3) - self.assertAllClose( - a, np.asarray([0.00315225, 0.00315225, 0.00547597]), rtol=.1) - - def test_skipgrams(self): - # test with no window size and binary labels - couples, labels = sequence.skipgrams(np.arange(3), vocabulary_size=3) - for couple in couples: - self.assertIn(couple[0], [0, 1, 2]) - self.assertIn(couple[1], [0, 1, 2]) - - # test window size and categorical labels - couples, labels = sequence.skipgrams( - np.arange(5), vocabulary_size=5, window_size=1, categorical=True) - for couple in couples: - self.assertLessEqual(couple[0] - couple[1], 3) - for label in labels: - self.assertLen(label, 2) - - def test_remove_long_seq(self): - maxlen = 5 - seq = [ - [1, 2, 3], - [1, 2, 3, 4, 5, 6], - ] - label = ['a', 'b'] - new_seq, new_label = sequence._remove_long_seq(maxlen, seq, label) - self.assertEqual(new_seq, [[1, 2, 3]]) - self.assertEqual(new_label, ['a']) - - def test_TimeseriesGenerator(self): - data = np.array([[i] for i in range(50)]) - targets = np.array([[i] for i in range(50)]) - - data_gen = sequence.TimeseriesGenerator( - data, targets, length=10, sampling_rate=2, batch_size=2) - self.assertLen(data_gen, 20) - self.assertAllClose( - data_gen[0][0], - np.array([[[0], [2], [4], [6], [8]], [[1], [3], [5], [7], [9]]])) - self.assertAllClose(data_gen[0][1], np.array([[10], [11]])) - self.assertAllClose( - data_gen[1][0], - np.array([[[2], [4], [6], [8], [10]], 
[[3], [5], [7], [9], [11]]])) - self.assertAllClose(data_gen[1][1], np.array([[12], [13]])) - - data_gen = sequence.TimeseriesGenerator( - data, targets, length=10, sampling_rate=2, reverse=True, batch_size=2) - self.assertLen(data_gen, 20) - self.assertAllClose( - data_gen[0][0], - np.array([[[8], [6], [4], [2], [0]], [[9], [7], [5], [3], [1]]])) - self.assertAllClose(data_gen[0][1], np.array([[10], [11]])) - - data_gen = sequence.TimeseriesGenerator( - data, targets, length=10, sampling_rate=2, shuffle=True, batch_size=1) - batch = data_gen[0] - r = batch[1][0][0] - self.assertAllClose( - batch[0], np.array([[[r - 10], [r - 8], [r - 6], [r - 4], [r - 2]]])) - self.assertAllClose(batch[1], np.array([ - [r], - ])) - - data_gen = sequence.TimeseriesGenerator( - data, targets, length=10, sampling_rate=2, stride=2, batch_size=2) - self.assertLen(data_gen, 10) - self.assertAllClose( - data_gen[1][0], - np.array([[[4], [6], [8], [10], [12]], [[6], [8], [10], [12], [14]]])) - self.assertAllClose(data_gen[1][1], np.array([[14], [16]])) - - data_gen = sequence.TimeseriesGenerator( - data, - targets, - length=10, - sampling_rate=2, - start_index=10, - end_index=30, - batch_size=2) - self.assertLen(data_gen, 6) - self.assertAllClose( - data_gen[0][0], - np.array([[[10], [12], [14], [16], [18]], [[11], [13], [15], [17], - [19]]])) - self.assertAllClose(data_gen[0][1], np.array([[20], [21]])) - - data = np.array([np.random.random_sample((1, 2, 3, 4)) for i in range(50)]) - targets = np.array([np.random.random_sample((3, 2, 1)) for i in range(50)]) - data_gen = sequence.TimeseriesGenerator( - data, - targets, - length=10, - sampling_rate=2, - start_index=10, - end_index=30, - batch_size=2) - self.assertLen(data_gen, 6) - self.assertAllClose( - data_gen[0][0], - np.array([np.array(data[10:19:2]), - np.array(data[11:20:2])])) - self.assertAllClose(data_gen[0][1], np.array([targets[20], targets[21]])) - - with self.assertRaisesRegex( - ValueError, r'`start_index\+length=50 > end_index=49` is disallowed'): - sequence.TimeseriesGenerator(data, targets, length=50) - - def test_TimeSeriesGenerator_doesnt_miss_any_sample(self): - x = np.array([[i] for i in range(10)]) - - for length in range(3, 10): - g = sequence.TimeseriesGenerator(x, x, length=length, batch_size=1) - expected = max(0, len(x) - length) - actual = len(g) - - self.assertEqual(expected, actual) - - if len(g) > 0: # pylint: disable=g-explicit-length-test - # All elements in range(length, 10) should be used as current step - expected = np.arange(length, 10).reshape(-1, 1) - - y = np.concatenate([g[ix][1] for ix in range(len(g))], axis=0) - self.assertAllClose(y, expected) - - x = np.array([[i] for i in range(23)]) - - strides = (1, 1, 5, 7, 3, 5, 3) - lengths = (3, 3, 4, 3, 1, 3, 7) - batch_sizes = (6, 6, 6, 5, 6, 6, 6) - shuffles = (False, True, True, False, False, False, False) - - for stride, length, batch_size, shuffle in zip(strides, lengths, - batch_sizes, shuffles): - g = sequence.TimeseriesGenerator( - x, - x, - length=length, - sampling_rate=1, - stride=stride, - start_index=0, - end_index=None, - shuffle=shuffle, - reverse=False, - batch_size=batch_size) - if shuffle: - # all batches have the same size when shuffle is True. - expected_sequences = math.ceil( - (23 - length) / float(batch_size * stride)) * batch_size - else: - # last batch will be different if `(samples - length) / stride` - # is not a multiple of `batch_size`. 
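Working one of these parameter tuples through by hand (stride=3, length=1, batch_size=6, shuffle=False, 23 samples) shows where the short final batch comes from; the arithmetic below mirrors the test's own formulas:

```python
import math

samples, length, stride, batch_size = 23, 1, 3, 6

# ceil((23 - 1) / 3) = 8 output sequences in total...
expected_sequences = math.ceil((samples - length) / float(stride))
# ...packed into ceil(8 / 6) = 2 batches; the last batch holds only 2.
expected_batches = math.ceil(expected_sequences / float(batch_size))

assert (expected_sequences, expected_batches) == (8, 2)
```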
- expected_sequences = math.ceil((23 - length) / float(stride)) - - expected_batches = math.ceil(expected_sequences / float(batch_size)) - - y = [g[ix][1] for ix in range(len(g))] - - actual_sequences = sum(len(y_) for y_ in y) - actual_batches = len(y) - - self.assertEqual(expected_sequences, actual_sequences) - self.assertEqual(expected_batches, actual_batches) - - -if __name__ == '__main__': - tf.test.main() + def test_make_sampling_table(self): + a = sequence.make_sampling_table(3) + self.assertAllClose( + a, np.asarray([0.00315225, 0.00315225, 0.00547597]), rtol=0.1 + ) + + def test_skipgrams(self): + # test with no window size and binary labels + couples, labels = sequence.skipgrams(np.arange(3), vocabulary_size=3) + for couple in couples: + self.assertIn(couple[0], [0, 1, 2]) + self.assertIn(couple[1], [0, 1, 2]) + + # test window size and categorical labels + couples, labels = sequence.skipgrams( + np.arange(5), vocabulary_size=5, window_size=1, categorical=True + ) + for couple in couples: + self.assertLessEqual(couple[0] - couple[1], 3) + for label in labels: + self.assertLen(label, 2) + + def test_remove_long_seq(self): + maxlen = 5 + seq = [ + [1, 2, 3], + [1, 2, 3, 4, 5, 6], + ] + label = ["a", "b"] + new_seq, new_label = sequence._remove_long_seq(maxlen, seq, label) + self.assertEqual(new_seq, [[1, 2, 3]]) + self.assertEqual(new_label, ["a"]) + + def test_TimeseriesGenerator(self): + data = np.array([[i] for i in range(50)]) + targets = np.array([[i] for i in range(50)]) + + data_gen = sequence.TimeseriesGenerator( + data, targets, length=10, sampling_rate=2, batch_size=2 + ) + self.assertLen(data_gen, 20) + self.assertAllClose( + data_gen[0][0], + np.array([[[0], [2], [4], [6], [8]], [[1], [3], [5], [7], [9]]]), + ) + self.assertAllClose(data_gen[0][1], np.array([[10], [11]])) + self.assertAllClose( + data_gen[1][0], + np.array([[[2], [4], [6], [8], [10]], [[3], [5], [7], [9], [11]]]), + ) + self.assertAllClose(data_gen[1][1], np.array([[12], [13]])) + + data_gen = sequence.TimeseriesGenerator( + data, + targets, + length=10, + sampling_rate=2, + reverse=True, + batch_size=2, + ) + self.assertLen(data_gen, 20) + self.assertAllClose( + data_gen[0][0], + np.array([[[8], [6], [4], [2], [0]], [[9], [7], [5], [3], [1]]]), + ) + self.assertAllClose(data_gen[0][1], np.array([[10], [11]])) + + data_gen = sequence.TimeseriesGenerator( + data, + targets, + length=10, + sampling_rate=2, + shuffle=True, + batch_size=1, + ) + batch = data_gen[0] + r = batch[1][0][0] + self.assertAllClose( + batch[0], np.array([[[r - 10], [r - 8], [r - 6], [r - 4], [r - 2]]]) + ) + self.assertAllClose( + batch[1], + np.array( + [ + [r], + ] + ), + ) + + data_gen = sequence.TimeseriesGenerator( + data, targets, length=10, sampling_rate=2, stride=2, batch_size=2 + ) + self.assertLen(data_gen, 10) + self.assertAllClose( + data_gen[1][0], + np.array( + [[[4], [6], [8], [10], [12]], [[6], [8], [10], [12], [14]]] + ), + ) + self.assertAllClose(data_gen[1][1], np.array([[14], [16]])) + + data_gen = sequence.TimeseriesGenerator( + data, + targets, + length=10, + sampling_rate=2, + start_index=10, + end_index=30, + batch_size=2, + ) + self.assertLen(data_gen, 6) + self.assertAllClose( + data_gen[0][0], + np.array( + [[[10], [12], [14], [16], [18]], [[11], [13], [15], [17], [19]]] + ), + ) + self.assertAllClose(data_gen[0][1], np.array([[20], [21]])) + + data = np.array( + [np.random.random_sample((1, 2, 3, 4)) for i in range(50)] + ) + targets = np.array( + [np.random.random_sample((3, 2, 1)) for i in range(50)] + 
) + data_gen = sequence.TimeseriesGenerator( + data, + targets, + length=10, + sampling_rate=2, + start_index=10, + end_index=30, + batch_size=2, + ) + self.assertLen(data_gen, 6) + self.assertAllClose( + data_gen[0][0], + np.array([np.array(data[10:19:2]), np.array(data[11:20:2])]), + ) + self.assertAllClose( + data_gen[0][1], np.array([targets[20], targets[21]]) + ) + + with self.assertRaisesRegex( + ValueError, r"`start_index\+length=50 > end_index=49` is disallowed" + ): + sequence.TimeseriesGenerator(data, targets, length=50) + + def test_TimeSeriesGenerator_doesnt_miss_any_sample(self): + x = np.array([[i] for i in range(10)]) + + for length in range(3, 10): + g = sequence.TimeseriesGenerator(x, x, length=length, batch_size=1) + expected = max(0, len(x) - length) + actual = len(g) + + self.assertEqual(expected, actual) + + if len(g) > 0: + # All elements in range(length, 10) should be used as current + # step + expected = np.arange(length, 10).reshape(-1, 1) + + y = np.concatenate([g[ix][1] for ix in range(len(g))], axis=0) + self.assertAllClose(y, expected) + + x = np.array([[i] for i in range(23)]) + + strides = (1, 1, 5, 7, 3, 5, 3) + lengths = (3, 3, 4, 3, 1, 3, 7) + batch_sizes = (6, 6, 6, 5, 6, 6, 6) + shuffles = (False, True, True, False, False, False, False) + + for stride, length, batch_size, shuffle in zip( + strides, lengths, batch_sizes, shuffles + ): + g = sequence.TimeseriesGenerator( + x, + x, + length=length, + sampling_rate=1, + stride=stride, + start_index=0, + end_index=None, + shuffle=shuffle, + reverse=False, + batch_size=batch_size, + ) + if shuffle: + # all batches have the same size when shuffle is True. + expected_sequences = ( + math.ceil((23 - length) / float(batch_size * stride)) + * batch_size + ) + else: + # last batch will be different if `(samples - length) / stride` + # is not a multiple of `batch_size`. + expected_sequences = math.ceil((23 - length) / float(stride)) + + expected_batches = math.ceil(expected_sequences / float(batch_size)) + + y = [g[ix][1] for ix in range(len(g))] + + actual_sequences = sum(len(y_) for y_ in y) + actual_batches = len(y) + + self.assertEqual(expected_sequences, actual_sequences) + self.assertEqual(expected_batches, actual_batches) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/preprocessing/text.py b/keras/preprocessing/text.py index ba7f626f09b3..a429fb4b56a8 100644 --- a/keras/preprocessing/text.py +++ b/keras/preprocessing/text.py @@ -23,8 +23,6 @@ and [preprocessing layer guide] (https://www.tensorflow.org/guide/keras/preprocessing_layers). """ -# pylint: disable=invalid-name -# pylint: disable=g-classes-have-attributes import collections @@ -33,549 +31,583 @@ import warnings import numpy as np -from tensorflow.python.util.tf_export import keras_export - - -@keras_export('keras.preprocessing.text.text_to_word_sequence') -def text_to_word_sequence(input_text, - filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', - lower=True, - split=' '): - r"""Converts a text to a sequence of words (or tokens). - - Deprecated: `tf.keras.preprocessing.text.text_to_word_sequence` does not - operate on tensors and is not recommended for new code. Prefer - `tf.strings.regex_replace` and `tf.strings.split` which provide equivalent - functionality and accept `tf.Tensor` input. For an overview of text handling - in Tensorflow, see the [text loading tutorial] - (https://www.tensorflow.org/tutorials/load_data/text). 
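Aside, not part of the patch: a minimal sketch of the tensor-based replacement this deprecation note recommends, assuming TensorFlow 2.x and the default filter set:

    import tensorflow as tf

    sample = tf.constant("This is a sample sentence.")
    # Replace filtered characters with spaces, lowercase, then split.
    stripped = tf.strings.regex_replace(
        sample, r"[!\"#$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~\t\n]", " "
    )
    tokens = tf.strings.split(tf.strings.lower(stripped))
    # tokens.numpy() -> [b'this', b'is', b'a', b'sample', b'sentence']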
- This function transforms a string of text into a list of words - while ignoring `filters` which include punctuations by default. - - >>> sample_text = 'This is a sample sentence.' - >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text) - ['this', 'is', 'a', 'sample', 'sentence'] - - Args: - input_text: Input text (string). - filters: list (or concatenation) of characters to filter out, such as - punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``, - includes basic punctuation, tabs, and newlines. - lower: boolean. Whether to convert the input to lowercase. - split: str. Separator for word splitting. - - Returns: - A list of words (or tokens). - """ - if lower: - input_text = input_text.lower() - - translate_dict = {c: split for c in filters} - translate_map = str.maketrans(translate_dict) - input_text = input_text.translate(translate_map) - - seq = input_text.split(split) - return [i for i in seq if i] - - -@keras_export('keras.preprocessing.text.one_hot') -def one_hot(input_text, - n, - filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', - lower=True, - split=' ', - analyzer=None): - r"""One-hot encodes a text into a list of word indexes of size `n`. - - Deprecated: `tf.keras.text.preprocessing.one_hot` does not operate on tensors - and is not recommended for new code. Prefer `tf.keras.layers.Hashing` with - `output_mode='one_hot'` which provides equivalent functionality through a - layer which accepts `tf.Tensor` input. See the [preprocessing layer guide] - (https://www.tensorflow.org/guide/keras/preprocessing_layers) - for an overview of preprocessing layers. - - This function receives as input a string of text and returns a - list of encoded integers each corresponding to a word (or token) - in the given input string. - - Args: - input_text: Input text (string). - n: int. Size of vocabulary. - filters: list (or concatenation) of characters to filter out, such as - punctuation. Default: - ``` - '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n - ```, - includes basic punctuation, tabs, and newlines. - lower: boolean. Whether to set the text to lowercase. - split: str. Separator for word splitting. - analyzer: function. Custom analyzer to split the text - - Returns: - List of integers in `[1, n]`. Each integer encodes a word - (unicity non-guaranteed). - """ - return hashing_trick( - input_text, - n, - hash_function=hash, - filters=filters, - lower=lower, - split=split, - analyzer=analyzer) - - -@keras_export('keras.preprocessing.text.hashing_trick') -def hashing_trick(text, - n, - hash_function=None, - filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', - lower=True, - split=' ', - analyzer=None): - r"""Converts a text to a sequence of indexes in a fixed-size hashing space. - - Deprecated: `tf.keras.text.preprocessing.hashing_trick` does not operate on - tensors and is not recommended for new code. Prefer `tf.keras.layers.Hashing` - which provides equivalent functionality through a layer which accepts - `tf.Tensor` input. See the [preprocessing layer guide] - (https://www.tensorflow.org/guide/keras/preprocessing_layers) - for an overview of preprocessing layers. - - Args: - text: Input text (string). - n: Dimension of the hashing space. - hash_function: defaults to python `hash` function, can be 'md5' or - any function that takes in input a string and returns a int. - Note that 'hash' is not a stable hashing function, so - it is not consistent across different runs, while 'md5' - is a stable hashing function. 
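Aside, not part of the patch: the stability point above in practice — Python's built-in hash() is salted per process for strings, while an md5 digest is deterministic. A sketch mirroring the md5 branch used by `hashing_trick`:

    import hashlib

    def stable_index(word, n):
        # Deterministic index in [1, n-1]; index 0 stays reserved.
        return int(hashlib.md5(word.encode()).hexdigest(), 16) % (n - 1) + 1

    assert stable_index("cat", 5) == stable_index("cat", 5)  # same every run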
- filters: list (or concatenation) of characters to filter out, such as - punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``, - includes basic punctuation, tabs, and newlines. - lower: boolean. Whether to set the text to lowercase. - split: str. Separator for word splitting. - analyzer: function. Custom analyzer to split the text - - Returns: - A list of integer word indices (unicity non-guaranteed). - `0` is a reserved index that won't be assigned to any word. - Two or more words may be assigned to the same index, due to possible - collisions by the hashing function. - The [probability]( - https://en.wikipedia.org/wiki/Birthday_problem#Probability_table) - of a collision is in relation to the dimension of the hashing space and - the number of distinct objects. - """ - if hash_function is None: - hash_function = hash - elif hash_function == 'md5': - hash_function = lambda w: int(hashlib.md5(w.encode()).hexdigest(), 16) - - if analyzer is None: - seq = text_to_word_sequence(text, filters=filters, lower=lower, split=split) - else: - seq = analyzer(text) - - return [(hash_function(w) % (n - 1) + 1) for w in seq] - - -@keras_export('keras.preprocessing.text.Tokenizer') -class Tokenizer(object): - """Text tokenization utility class. - - Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on - tensors and is not recommended for new code. Prefer - `tf.keras.layers.TextVectorization` which provides equivalent functionality - through a layer which accepts `tf.Tensor` input. See the - [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text) - for an overview of the layer and text handling in tensorflow. - - This class allows to vectorize a text corpus, by turning each - text into either a sequence of integers (each integer being the index - of a token in a dictionary) or into a vector where the coefficient - for each token could be binary, based on word count, based on tf-idf... - - By default, all punctuation is removed, turning the texts into - space-separated sequences of words - (words maybe include the `'` character). These sequences are then - split into lists of tokens. They will then be indexed or vectorized. - - `0` is a reserved index that won't be assigned to any word. - - Args: - num_words: the maximum number of words to keep, based - on word frequency. Only the most common `num_words-1` words will - be kept. - filters: a string where each element is a character that will be - filtered from the texts. The default is all punctuation, plus - tabs and line breaks, minus the `'` character. - lower: boolean. Whether to convert the texts to lowercase. - split: str. Separator for word splitting. - char_level: if True, every character will be treated as a token. - oov_token: if given, it will be added to word_index and used to - replace out-of-vocabulary words during text_to_sequence calls - analyzer: function. Custom analyzer to split the text. 
- The default analyzer is text_to_word_sequence - """ - - def __init__(self, - num_words=None, - filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', - lower=True, - split=' ', - char_level=False, - oov_token=None, - analyzer=None, - **kwargs): - # Legacy support - if 'nb_words' in kwargs: - warnings.warn('The `nb_words` argument in `Tokenizer` ' - 'has been renamed `num_words`.') - num_words = kwargs.pop('nb_words') - document_count = kwargs.pop('document_count', 0) - if kwargs: - raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) - - self.word_counts = collections.OrderedDict() - self.word_docs = collections.defaultdict(int) - self.filters = filters - self.split = split - self.lower = lower - self.num_words = num_words - self.document_count = document_count - self.char_level = char_level - self.oov_token = oov_token - self.index_docs = collections.defaultdict(int) - self.word_index = {} - self.index_word = {} - self.analyzer = analyzer - - def fit_on_texts(self, texts): - """Updates internal vocabulary based on a list of texts. - - In the case where texts contains lists, - we assume each entry of the lists to be a token. - - Required before using `texts_to_sequences` or `texts_to_matrix`. - - Args: - texts: can be a list of strings, - a generator of strings (for memory-efficiency), - or a list of list of strings. - """ - for text in texts: - self.document_count += 1 - if self.char_level or isinstance(text, list): - if self.lower: - if isinstance(text, list): - text = [text_elem.lower() for text_elem in text] - else: - text = text.lower() - seq = text - else: - if self.analyzer is None: - seq = text_to_word_sequence( - text, filters=self.filters, lower=self.lower, split=self.split) - else: - seq = self.analyzer(text) - for w in seq: - if w in self.word_counts: - self.word_counts[w] += 1 - else: - self.word_counts[w] = 1 - for w in set(seq): - # In how many documents each word occurs - self.word_docs[w] += 1 - - wcounts = list(self.word_counts.items()) - wcounts.sort(key=lambda x: x[1], reverse=True) - # forcing the oov_token to index 1 if it exists - if self.oov_token is None: - sorted_voc = [] - else: - sorted_voc = [self.oov_token] - sorted_voc.extend(wc[0] for wc in wcounts) - - # note that index 0 is reserved, never assigned to an existing word - self.word_index = dict(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))) - - self.index_word = {c: w for w, c in self.word_index.items()} - - for w, c in list(self.word_docs.items()): - self.index_docs[self.word_index[w]] = c +# isort: off +from tensorflow.python.util.tf_export import keras_export - def fit_on_sequences(self, sequences): - """Updates internal vocabulary based on a list of sequences. - Required before using `sequences_to_matrix` - (if `fit_on_texts` was never called). +@keras_export("keras.preprocessing.text.text_to_word_sequence") +def text_to_word_sequence( + input_text, + filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', + lower=True, + split=" ", +): + r"""Converts a text to a sequence of words (or tokens). - Args: - sequences: A list of sequence. - A "sequence" is a list of integer word indices. - """ - self.document_count += len(sequences) - for seq in sequences: - seq = set(seq) - for i in seq: - self.index_docs[i] += 1 + Deprecated: `tf.keras.preprocessing.text.text_to_word_sequence` does not + operate on tensors and is not recommended for new code. Prefer + `tf.strings.regex_replace` and `tf.strings.split` which provide equivalent + functionality and accept `tf.Tensor` input. 
For an overview of text handling
+    in TensorFlow, see the [text loading tutorial]
+    (https://www.tensorflow.org/tutorials/load_data/text).

-  def texts_to_sequences(self, texts):
-    """Transforms each text in texts to a sequence of integers.
+    This function transforms a string of text into a list of words
+    while ignoring `filters`, which includes punctuation by default.

-    Only top `num_words-1` most frequent words will be taken into account.
-    Only words known by the tokenizer will be taken into account.
+    >>> sample_text = 'This is a sample sentence.'
+    >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
+    ['this', 'is', 'a', 'sample', 'sentence']

     Args:
-      texts: A list of texts (strings).
+        input_text: Input text (string).
+        filters: list (or concatenation) of characters to filter out, such as
+            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
+            includes basic punctuation, tabs, and newlines.
+        lower: boolean. Whether to convert the input to lowercase.
+        split: str. Separator for word splitting.

     Returns:
-      A list of sequences.
+        A list of words (or tokens).
     """
-    return list(self.texts_to_sequences_generator(texts))
-
-  def texts_to_sequences_generator(self, texts):
-    """Transforms each text in `texts` to a sequence of integers.
-
-    Each item in texts can also be a list,
-    in which case we assume each item of that list to be a token.
-
-    Only top `num_words-1` most frequent words will be taken into account.
-    Only words known by the tokenizer will be taken into account.
+    if lower:
+        input_text = input_text.lower()
+
+    translate_dict = {c: split for c in filters}
+    translate_map = str.maketrans(translate_dict)
+    input_text = input_text.translate(translate_map)
+
+    seq = input_text.split(split)
+    return [i for i in seq if i]
+
+
+@keras_export("keras.preprocessing.text.one_hot")
+def one_hot(
+    input_text,
+    n,
+    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+    lower=True,
+    split=" ",
+    analyzer=None,
+):
+    r"""One-hot encodes a text into a list of word indexes of size `n`.
+
+    Deprecated: `tf.keras.preprocessing.text.one_hot` does not operate on
+    tensors and is not recommended for new code. Prefer
+    `tf.keras.layers.Hashing` with `output_mode='one_hot'` which provides
+    equivalent functionality through a layer which accepts `tf.Tensor` input.
+    See the [preprocessing layer guide]
+    (https://www.tensorflow.org/guide/keras/preprocessing_layers) for an
+    overview of preprocessing layers.
+
+    This function receives as input a string of text and returns a
+    list of encoded integers, each corresponding to a word (or token)
+    in the given input string.

     Args:
-      texts: A list of texts (strings).
+        input_text: Input text (string).
+        n: int. Size of vocabulary.
+        filters: list (or concatenation) of characters to filter out, such as
+            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
+            includes basic punctuation, tabs, and newlines.
+        lower: boolean. Whether to set the text to lowercase.
+        split: str. Separator for word splitting.
+        analyzer: function. Custom analyzer to split the text.

-    Yields:
-      Yields individual sequences.
+    Returns:
+        List of integers in `[1, n]`. Each integer encodes a word
+        (unicity non-guaranteed).
""" - num_words = self.num_words - oov_token_index = self.word_index.get(self.oov_token) - for text in texts: - if self.char_level or isinstance(text, list): - if self.lower: - if isinstance(text, list): - text = [text_elem.lower() for text_elem in text] - else: - text = text.lower() - seq = text - else: - if self.analyzer is None: - seq = text_to_word_sequence( - text, filters=self.filters, lower=self.lower, split=self.split) - else: - seq = self.analyzer(text) - vect = [] - for w in seq: - i = self.word_index.get(w) - if i is not None: - if num_words and i >= num_words: - if oov_token_index is not None: - vect.append(oov_token_index) - else: - vect.append(i) - elif self.oov_token is not None: - vect.append(oov_token_index) - yield vect - - def sequences_to_texts(self, sequences): - """Transforms each sequence into a list of text. - - Only top `num_words-1` most frequent words will be taken into account. - Only words known by the tokenizer will be taken into account. + return hashing_trick( + input_text, + n, + hash_function=hash, + filters=filters, + lower=lower, + split=split, + analyzer=analyzer, + ) + + +@keras_export("keras.preprocessing.text.hashing_trick") +def hashing_trick( + text, + n, + hash_function=None, + filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', + lower=True, + split=" ", + analyzer=None, +): + r"""Converts a text to a sequence of indexes in a fixed-size hashing space. + + Deprecated: `tf.keras.text.preprocessing.hashing_trick` does not operate on + tensors and is not recommended for new code. Prefer + `tf.keras.layers.Hashing` which provides equivalent functionality through a + layer which accepts `tf.Tensor` input. See the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers) for an + overview of preprocessing layers. Args: - sequences: A list of sequences (list of integers). + text: Input text (string). + n: Dimension of the hashing space. + hash_function: When `None` uses a python `hash` function. Can be 'md5' + or any function that takes in input a string and returns a int. + Note that 'hash' is not a stable hashing function, so + it is not consistent across different runs, while 'md5' + is a stable hashing function. Defaults to `None`. + filters: list (or concatenation) of characters to filter out, such as + punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``, + includes basic punctuation, tabs, and newlines. + lower: boolean. Whether to set the text to lowercase. + split: str. Separator for word splitting. + analyzer: function. Custom analyzer to split the text Returns: - A list of texts (strings) + A list of integer word indices (unicity non-guaranteed). + `0` is a reserved index that won't be assigned to any word. + Two or more words may be assigned to the same index, due to possible + collisions by the hashing function. + The [probability]( + https://en.wikipedia.org/wiki/Birthday_problem#Probability_table) + of a collision is in relation to the dimension of the hashing space and + the number of distinct objects. """ - return list(self.sequences_to_texts_generator(sequences)) + if hash_function is None: + hash_function = hash + elif hash_function == "md5": + hash_function = lambda w: int(hashlib.md5(w.encode()).hexdigest(), 16) + + if analyzer is None: + seq = text_to_word_sequence( + text, filters=filters, lower=lower, split=split + ) + else: + seq = analyzer(text) - def sequences_to_texts_generator(self, sequences): - """Transforms each sequence in `sequences` to a list of texts(strings). 
+ return [(hash_function(w) % (n - 1) + 1) for w in seq] - Each sequence has to a list of integers. - In other words, sequences should be a list of sequences - Only top `num_words-1` most frequent words will be taken into account. - Only words known by the tokenizer will be taken into account. +@keras_export("keras.preprocessing.text.Tokenizer") +class Tokenizer(object): + """Text tokenization utility class. - Args: - sequences: A list of sequences. + Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on + tensors and is not recommended for new code. Prefer + `tf.keras.layers.TextVectorization` which provides equivalent functionality + through a layer which accepts `tf.Tensor` input. See the + [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text) + for an overview of the layer and text handling in tensorflow. - Yields: - Yields individual texts. - """ - num_words = self.num_words - oov_token_index = self.word_index.get(self.oov_token) - for seq in sequences: - vect = [] - for num in seq: - word = self.index_word.get(num) - if word is not None: - if num_words and num >= num_words: - if oov_token_index is not None: - vect.append(self.index_word[oov_token_index]) - else: - vect.append(word) - elif self.oov_token is not None: - vect.append(self.index_word[oov_token_index]) - vect = ' '.join(vect) - yield vect - - def texts_to_matrix(self, texts, mode='binary'): - """Convert a list of texts to a Numpy matrix. + This class allows to vectorize a text corpus, by turning each + text into either a sequence of integers (each integer being the index + of a token in a dictionary) or into a vector where the coefficient + for each token could be binary, based on word count, based on tf-idf... - Args: - texts: list of strings. - mode: one of "binary", "count", "tfidf", "freq". - - Returns: - A Numpy matrix. - """ - sequences = self.texts_to_sequences(texts) - return self.sequences_to_matrix(sequences, mode=mode) + By default, all punctuation is removed, turning the texts into + space-separated sequences of words + (words may include the `'` character). These sequences are then + split into lists of tokens. They will then be indexed or vectorized. - def sequences_to_matrix(self, sequences, mode='binary'): - """Converts a list of sequences into a Numpy matrix. + `0` is a reserved index that won't be assigned to any word. Args: - sequences: list of sequences - (a sequence is a list of integer word indices). - mode: one of "binary", "count", "tfidf", "freq" - - Returns: - A Numpy matrix. - - Raises: - ValueError: In case of invalid `mode` argument, - or if the Tokenizer requires to be fit to sample data. + num_words: the maximum number of words to keep, based + on word frequency. Only the most common `num_words-1` words will + be kept. + filters: a string where each element is a character that will be + filtered from the texts. The default is all punctuation, plus + tabs and line breaks, minus the `'` character. + lower: boolean. Whether to convert the texts to lowercase. + split: str. Separator for word splitting. + char_level: if True, every character will be treated as a token. + oov_token: if given, it will be added to word_index and used to + replace out-of-vocabulary words during text_to_sequence calls + analyzer: function. Custom analyzer to split the text. 
+ The default analyzer is text_to_word_sequence """ - if not self.num_words: - if self.word_index: - num_words = len(self.word_index) + 1 - else: - raise ValueError('Specify a dimension (`num_words` argument), ' - 'or fit on some text data first.') - else: - num_words = self.num_words - - if mode == 'tfidf' and not self.document_count: - raise ValueError('Fit the Tokenizer on some data ' - 'before using tfidf mode.') - - x = np.zeros((len(sequences), num_words)) - for i, seq in enumerate(sequences): - if not seq: - continue - counts = collections.defaultdict(int) - for j in seq: - if j >= num_words: - continue - counts[j] += 1 - for j, c in list(counts.items()): - if mode == 'count': - x[i][j] = c - elif mode == 'freq': - x[i][j] = c / len(seq) - elif mode == 'binary': - x[i][j] = 1 - elif mode == 'tfidf': - # Use weighting scheme 2 in - # https://en.wikipedia.org/wiki/Tf%E2%80%93idf - tf = 1 + np.log(c) - idf = np.log(1 + self.document_count / - (1 + self.index_docs.get(j, 0))) - x[i][j] = tf * idf - else: - raise ValueError('Unknown vectorization mode:', mode) - return x - def get_config(self): - """Returns the tokenizer configuration as Python dictionary. - - The word count dictionaries used by the tokenizer get serialized - into plain JSON, so that the configuration can be read by other - projects. + def __init__( + self, + num_words=None, + filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', + lower=True, + split=" ", + char_level=False, + oov_token=None, + analyzer=None, + **kwargs + ): + # Legacy support + if "nb_words" in kwargs: + warnings.warn( + "The `nb_words` argument in `Tokenizer` " + "has been renamed `num_words`." + ) + num_words = kwargs.pop("nb_words") + document_count = kwargs.pop("document_count", 0) + if kwargs: + raise TypeError("Unrecognized keyword arguments: " + str(kwargs)) + + self.word_counts = collections.OrderedDict() + self.word_docs = collections.defaultdict(int) + self.filters = filters + self.split = split + self.lower = lower + self.num_words = num_words + self.document_count = document_count + self.char_level = char_level + self.oov_token = oov_token + self.index_docs = collections.defaultdict(int) + self.word_index = {} + self.index_word = {} + self.analyzer = analyzer + + def fit_on_texts(self, texts): + """Updates internal vocabulary based on a list of texts. + + In the case where texts contains lists, + we assume each entry of the lists to be a token. + + Required before using `texts_to_sequences` or `texts_to_matrix`. + + Args: + texts: can be a list of strings, + a generator of strings (for memory-efficiency), + or a list of list of strings. 
+ """ + for text in texts: + self.document_count += 1 + if self.char_level or isinstance(text, list): + if self.lower: + if isinstance(text, list): + text = [text_elem.lower() for text_elem in text] + else: + text = text.lower() + seq = text + else: + if self.analyzer is None: + seq = text_to_word_sequence( + text, + filters=self.filters, + lower=self.lower, + split=self.split, + ) + else: + seq = self.analyzer(text) + for w in seq: + if w in self.word_counts: + self.word_counts[w] += 1 + else: + self.word_counts[w] = 1 + for w in set(seq): + # In how many documents each word occurs + self.word_docs[w] += 1 + + wcounts = list(self.word_counts.items()) + wcounts.sort(key=lambda x: x[1], reverse=True) + # forcing the oov_token to index 1 if it exists + if self.oov_token is None: + sorted_voc = [] + else: + sorted_voc = [self.oov_token] + sorted_voc.extend(wc[0] for wc in wcounts) + + # note that index 0 is reserved, never assigned to an existing word + self.word_index = dict( + zip(sorted_voc, list(range(1, len(sorted_voc) + 1))) + ) + + self.index_word = {c: w for w, c in self.word_index.items()} + + for w, c in list(self.word_docs.items()): + self.index_docs[self.word_index[w]] = c + + def fit_on_sequences(self, sequences): + """Updates internal vocabulary based on a list of sequences. + + Required before using `sequences_to_matrix` + (if `fit_on_texts` was never called). + + Args: + sequences: A list of sequence. + A "sequence" is a list of integer word indices. + """ + self.document_count += len(sequences) + for seq in sequences: + seq = set(seq) + for i in seq: + self.index_docs[i] += 1 + + def texts_to_sequences(self, texts): + """Transforms each text in texts to a sequence of integers. + + Only top `num_words-1` most frequent words will be taken into account. + Only words known by the tokenizer will be taken into account. + + Args: + texts: A list of texts (strings). + + Returns: + A list of sequences. + """ + return list(self.texts_to_sequences_generator(texts)) + + def texts_to_sequences_generator(self, texts): + """Transforms each text in `texts` to a sequence of integers. + + Each item in texts can also be a list, + in which case we assume each item of that list to be a token. + + Only top `num_words-1` most frequent words will be taken into account. + Only words known by the tokenizer will be taken into account. + + Args: + texts: A list of texts (strings). + + Yields: + Yields individual sequences. + """ + num_words = self.num_words + oov_token_index = self.word_index.get(self.oov_token) + for text in texts: + if self.char_level or isinstance(text, list): + if self.lower: + if isinstance(text, list): + text = [text_elem.lower() for text_elem in text] + else: + text = text.lower() + seq = text + else: + if self.analyzer is None: + seq = text_to_word_sequence( + text, + filters=self.filters, + lower=self.lower, + split=self.split, + ) + else: + seq = self.analyzer(text) + vect = [] + for w in seq: + i = self.word_index.get(w) + if i is not None: + if num_words and i >= num_words: + if oov_token_index is not None: + vect.append(oov_token_index) + else: + vect.append(i) + elif self.oov_token is not None: + vect.append(oov_token_index) + yield vect + + def sequences_to_texts(self, sequences): + """Transforms each sequence into a list of text. + + Only top `num_words-1` most frequent words will be taken into account. + Only words known by the tokenizer will be taken into account. + + Args: + sequences: A list of sequences (list of integers). 
+
+        Returns:
+            A list of texts (strings).
+        """
+        return list(self.sequences_to_texts_generator(sequences))
+
+    def sequences_to_texts_generator(self, sequences):
+        """Transforms each sequence in `sequences` to a list of texts
+        (strings).
+
+        Each sequence has to be a list of integers.
+        In other words, `sequences` should be a list of sequences.
+
+        Only top `num_words-1` most frequent words will be taken into account.
+        Only words known by the tokenizer will be taken into account.
+
+        Args:
+            sequences: A list of sequences.
+
+        Yields:
+            Yields individual texts.
+        """
+        num_words = self.num_words
+        oov_token_index = self.word_index.get(self.oov_token)
+        for seq in sequences:
+            vect = []
+            for num in seq:
+                word = self.index_word.get(num)
+                if word is not None:
+                    if num_words and num >= num_words:
+                        if oov_token_index is not None:
+                            vect.append(self.index_word[oov_token_index])
+                    else:
+                        vect.append(word)
+                elif self.oov_token is not None:
+                    vect.append(self.index_word[oov_token_index])
+            vect = " ".join(vect)
+            yield vect
+
+    def texts_to_matrix(self, texts, mode="binary"):
+        """Converts a list of texts to a Numpy matrix.
+
+        Args:
+            texts: list of strings.
+            mode: one of "binary", "count", "tfidf", "freq".
+
+        Returns:
+            A Numpy matrix.
+        """
+        sequences = self.texts_to_sequences(texts)
+        return self.sequences_to_matrix(sequences, mode=mode)
+
+    def sequences_to_matrix(self, sequences, mode="binary"):
+        """Converts a list of sequences into a Numpy matrix.
+
+        Args:
+            sequences: list of sequences
+                (a sequence is a list of integer word indices).
+            mode: one of "binary", "count", "tfidf", "freq".
+
+        Returns:
+            A Numpy matrix.
+
+        Raises:
+            ValueError: In case of invalid `mode` argument,
+                or if the Tokenizer has not been fit on any sample data.
+        """
+        if not self.num_words:
+            if self.word_index:
+                num_words = len(self.word_index) + 1
+            else:
+                raise ValueError(
+                    "Specify a dimension (`num_words` argument), "
+                    "or fit on some text data first."
+                )
+        else:
+            num_words = self.num_words
+
+        if mode == "tfidf" and not self.document_count:
+            raise ValueError(
+                "Fit the Tokenizer on some data before using tfidf mode."
+            )
+
+        x = np.zeros((len(sequences), num_words))
+        for i, seq in enumerate(sequences):
+            if not seq:
+                continue
+            counts = collections.defaultdict(int)
+            for j in seq:
+                if j >= num_words:
+                    continue
+                counts[j] += 1
+            for j, c in list(counts.items()):
+                if mode == "count":
+                    x[i][j] = c
+                elif mode == "freq":
+                    x[i][j] = c / len(seq)
+                elif mode == "binary":
+                    x[i][j] = 1
+                elif mode == "tfidf":
+                    # Use weighting scheme 2 in
+                    # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+                    tf = 1 + np.log(c)
+                    idf = np.log(
+                        1
+                        + self.document_count / (1 + self.index_docs.get(j, 0))
+                    )
+                    x[i][j] = tf * idf
+                else:
+                    raise ValueError("Unknown vectorization mode:", mode)
+        return x
+
+    def get_config(self):
+        """Returns the tokenizer configuration as Python dictionary.
+
+        The word count dictionaries used by the tokenizer get serialized
+        into plain JSON, so that the configuration can be read by other
+        projects.
+
+        Returns:
+            A Python dictionary with the tokenizer configuration.
+ """ + json_word_counts = json.dumps(self.word_counts) + json_word_docs = json.dumps(self.word_docs) + json_index_docs = json.dumps(self.index_docs) + json_word_index = json.dumps(self.word_index) + json_index_word = json.dumps(self.index_word) + + return { + "num_words": self.num_words, + "filters": self.filters, + "lower": self.lower, + "split": self.split, + "char_level": self.char_level, + "oov_token": self.oov_token, + "document_count": self.document_count, + "word_counts": json_word_counts, + "word_docs": json_word_docs, + "index_docs": json_index_docs, + "index_word": json_index_word, + "word_index": json_word_index, + } + + def to_json(self, **kwargs): + """Returns a JSON string containing the tokenizer configuration. + + To load a tokenizer from a JSON string, use + `keras.preprocessing.text.tokenizer_from_json(json_string)`. + + Args: + **kwargs: Additional keyword arguments + to be passed to `json.dumps()`. + + Returns: + A JSON string containing the tokenizer configuration. + """ + config = self.get_config() + tokenizer_config = { + "class_name": self.__class__.__name__, + "config": config, + } + return json.dumps(tokenizer_config, **kwargs) + + +@keras_export("keras.preprocessing.text.tokenizer_from_json") +def tokenizer_from_json(json_string): + """Parses a JSON tokenizer configuration and returns a tokenizer instance. - Returns: - A Python dictionary with the tokenizer configuration. - """ - json_word_counts = json.dumps(self.word_counts) - json_word_docs = json.dumps(self.word_docs) - json_index_docs = json.dumps(self.index_docs) - json_word_index = json.dumps(self.word_index) - json_index_word = json.dumps(self.index_word) - - return { - 'num_words': self.num_words, - 'filters': self.filters, - 'lower': self.lower, - 'split': self.split, - 'char_level': self.char_level, - 'oov_token': self.oov_token, - 'document_count': self.document_count, - 'word_counts': json_word_counts, - 'word_docs': json_word_docs, - 'index_docs': json_index_docs, - 'index_word': json_index_word, - 'word_index': json_word_index - } - - def to_json(self, **kwargs): - """Returns a JSON string containing the tokenizer configuration. - - To load a tokenizer from a JSON string, use - `keras.preprocessing.text.tokenizer_from_json(json_string)`. + Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on + tensors and is not recommended for new code. Prefer + `tf.keras.layers.TextVectorization` which provides equivalent functionality + through a layer which accepts `tf.Tensor` input. See the + [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text) + for an overview of the layer and text handling in tensorflow. Args: - **kwargs: Additional keyword arguments - to be passed to `json.dumps()`. + json_string: JSON string encoding a tokenizer configuration. Returns: - A JSON string containing the tokenizer configuration. + A Keras Tokenizer instance """ - config = self.get_config() - tokenizer_config = {'class_name': self.__class__.__name__, 'config': config} - return json.dumps(tokenizer_config, **kwargs) - - -@keras_export('keras.preprocessing.text.tokenizer_from_json') -def tokenizer_from_json(json_string): - """Parses a JSON tokenizer configuration and returns a tokenizer instance. - - Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on - tensors and is not recommended for new code. Prefer - `tf.keras.layers.TextVectorization` which provides equivalent functionality - through a layer which accepts `tf.Tensor` input. 
See the - [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text) - for an overview of the layer and text handling in tensorflow. - - Args: - json_string: JSON string encoding a tokenizer configuration. - - Returns: - A Keras Tokenizer instance - """ - tokenizer_config = json.loads(json_string) - config = tokenizer_config.get('config') - - word_counts = json.loads(config.pop('word_counts')) - word_docs = json.loads(config.pop('word_docs')) - index_docs = json.loads(config.pop('index_docs')) - # Integer indexing gets converted to strings with json.dumps() - index_docs = {int(k): v for k, v in index_docs.items()} - index_word = json.loads(config.pop('index_word')) - index_word = {int(k): v for k, v in index_word.items()} - word_index = json.loads(config.pop('word_index')) - - tokenizer = Tokenizer(**config) - tokenizer.word_counts = word_counts - tokenizer.word_docs = word_docs - tokenizer.index_docs = index_docs - tokenizer.word_index = word_index - tokenizer.index_word = index_word - return tokenizer + tokenizer_config = json.loads(json_string) + config = tokenizer_config.get("config") + + word_counts = json.loads(config.pop("word_counts")) + word_docs = json.loads(config.pop("word_docs")) + index_docs = json.loads(config.pop("index_docs")) + # Integer indexing gets converted to strings with json.dumps() + index_docs = {int(k): v for k, v in index_docs.items()} + index_word = json.loads(config.pop("index_word")) + index_word = {int(k): v for k, v in index_word.items()} + word_index = json.loads(config.pop("word_index")) + + tokenizer = Tokenizer(**config) + tokenizer.word_counts = word_counts + tokenizer.word_docs = word_docs + tokenizer.index_docs = index_docs + tokenizer.word_index = word_index + tokenizer.index_word = index_word + return tokenizer diff --git a/keras/preprocessing/text_test.py b/keras/preprocessing/text_test.py index 7edbe05f4415..a73e81ccc620 100644 --- a/keras/preprocessing/text_test.py +++ b/keras/preprocessing/text_test.py @@ -17,281 +17,332 @@ import collections -from keras.preprocessing import text import numpy as np import tensorflow.compat.v2 as tf +from keras.preprocessing import text + class TestText(tf.test.TestCase): - - def test_one_hot(self): - sample_text = 'The cat sat on the mat.' - encoded = text.one_hot(sample_text, 5) - self.assertLen(encoded, 6) - self.assertLessEqual(np.max(encoded), 4) - self.assertGreaterEqual(np.min(encoded), 0) - - sample_text = 'The-cat-sat-on-the-mat' - encoded2 = text.one_hot( - sample_text, 5, analyzer=lambda t: t.lower().split('-')) - self.assertEqual(encoded, encoded2) - self.assertLen(encoded, 6) - self.assertLessEqual(np.max(encoded), 4) - self.assertGreaterEqual(np.min(encoded), 0) - - def test_hashing_trick_hash(self): - sample_text = 'The cat sat on the mat.' - encoded = text.hashing_trick(sample_text, 5) - self.assertLen(encoded, 6) - self.assertLessEqual(np.max(encoded), 4) - self.assertGreaterEqual(np.min(encoded), 1) - - def test_hashing_trick_md5(self): - sample_text = 'The cat sat on the mat.' - encoded = text.hashing_trick(sample_text, 5, hash_function='md5') - self.assertLen(encoded, 6) - self.assertLessEqual(np.max(encoded), 4) - self.assertGreaterEqual(np.min(encoded), 1) - - def test_tokenizer(self): - sample_texts = [ - 'The cat sat on the mat.', 'The dog sat on the log.', - 'Dogs and cats living together.' 
- ] - tokenizer = text.Tokenizer(num_words=10) - tokenizer.fit_on_texts(sample_texts) - - sequences = [] - for seq in tokenizer.texts_to_sequences_generator(sample_texts): - sequences.append(seq) - self.assertLess(np.max(np.max(sequences)), 10) - self.assertEqual(np.min(np.min(sequences)), 1) - - tokenizer.fit_on_sequences(sequences) - - for mode in ['binary', 'count', 'tfidf', 'freq']: - tokenizer.texts_to_matrix(sample_texts, mode) - - def test_tokenizer_serde_no_fitting(self): - tokenizer = text.Tokenizer(num_words=100) - - tokenizer_json = tokenizer.to_json() - recovered = text.tokenizer_from_json(tokenizer_json) - - self.assertEqual(tokenizer.get_config(), recovered.get_config()) - - self.assertEqual(tokenizer.word_docs, recovered.word_docs) - self.assertEqual(tokenizer.word_counts, recovered.word_counts) - self.assertEqual(tokenizer.word_index, recovered.word_index) - self.assertEqual(tokenizer.index_word, recovered.index_word) - self.assertEqual(tokenizer.index_docs, recovered.index_docs) - - def test_tokenizer_serde_fitting(self): - sample_texts = [ - 'There was a time that the pieces fit, but I watched them fall away', - 'Mildewed and smoldering, strangled by our coveting', - 'I\'ve done the math enough to know the dangers of our second guessing' - ] - tokenizer = text.Tokenizer(num_words=100) - tokenizer.fit_on_texts(sample_texts) - - seq_generator = tokenizer.texts_to_sequences_generator(sample_texts) - sequences = [seq for seq in seq_generator] - tokenizer.fit_on_sequences(sequences) - - tokenizer_json = tokenizer.to_json() - recovered = text.tokenizer_from_json(tokenizer_json) - - self.assertEqual(tokenizer.char_level, recovered.char_level) - self.assertEqual(tokenizer.document_count, recovered.document_count) - self.assertEqual(tokenizer.filters, recovered.filters) - self.assertEqual(tokenizer.lower, recovered.lower) - self.assertEqual(tokenizer.num_words, recovered.num_words) - self.assertEqual(tokenizer.oov_token, recovered.oov_token) - - self.assertEqual(tokenizer.word_docs, recovered.word_docs) - self.assertEqual(tokenizer.word_counts, recovered.word_counts) - self.assertEqual(tokenizer.word_index, recovered.word_index) - self.assertEqual(tokenizer.index_word, recovered.index_word) - self.assertEqual(tokenizer.index_docs, recovered.index_docs) - - def test_sequential_fit(self): - texts = [ - 'The cat sat on the mat.', 'The dog sat on the log.', - 'Dogs and cats living together.' - ] - word_sequences = [['The', 'cat', 'is', 'sitting'], - ['The', 'dog', 'is', 'standing']] - - tokenizer = text.Tokenizer() - tokenizer.fit_on_texts(texts) - tokenizer.fit_on_texts(word_sequences) - - self.assertEqual(tokenizer.document_count, 5) - - tokenizer.texts_to_matrix(texts) - tokenizer.texts_to_matrix(word_sequences) - - def test_text_to_word_sequence(self): - sample_text = 'hello! ? world!' - self.assertEqual( - text.text_to_word_sequence(sample_text), ['hello', 'world']) - - def test_text_to_word_sequence_multichar_split(self): - sample_text = 'hello!stop?world!' - self.assertEqual( - text.text_to_word_sequence(sample_text, split='stop'), - ['hello', 'world']) - - def test_text_to_word_sequence_unicode(self): - sample_text = u'ali! veli? 
kırk dokuz elli' - self.assertEqual( - text.text_to_word_sequence(sample_text), - [u'ali', u'veli', u'kırk', u'dokuz', u'elli']) - - def test_text_to_word_sequence_unicode_multichar_split(self): - sample_text = u'ali!stopveli?stopkırkstopdokuzstopelli' - self.assertEqual( - text.text_to_word_sequence(sample_text, split='stop'), - [u'ali', u'veli', u'kırk', u'dokuz', u'elli']) - - def test_tokenizer_unicode(self): - sample_texts = [ - u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz' - ] - tokenizer = text.Tokenizer(num_words=5) - tokenizer.fit_on_texts(sample_texts) - - self.assertLen(tokenizer.word_counts, 5) - - def test_tokenizer_oov_flag(self): - """Test of Out of Vocabulary (OOV) flag in text.Tokenizer.""" - x_train = ['This text has only known words'] - x_test = ['This text has some unknown words'] # 2 OOVs: some, unknown - - # Default, without OOV flag - tokenizer = text.Tokenizer() - tokenizer.fit_on_texts(x_train) - x_test_seq = tokenizer.texts_to_sequences(x_test) - self.assertLen(x_test_seq[0], 4) # discards 2 OOVs - - # With OOV feature - tokenizer = text.Tokenizer(oov_token='') - tokenizer.fit_on_texts(x_train) - x_test_seq = tokenizer.texts_to_sequences(x_test) - self.assertLen(x_test_seq[0], 6) # OOVs marked in place - - def test_tokenizer_oov_flag_and_num_words(self): - x_train = ['This text has only known words this text'] - x_test = ['This text has some unknown words'] - - tokenizer = text.Tokenizer(num_words=3, oov_token='') - tokenizer.fit_on_texts(x_train) - x_test_seq = tokenizer.texts_to_sequences(x_test) - trans_text = ' '.join(tokenizer.index_word[t] for t in x_test_seq[0]) - self.assertLen(x_test_seq[0], 6) - self.assertEqual(trans_text, 'this ') - - def test_sequences_to_texts_with_num_words_and_oov_token(self): - x_train = ['This text has only known words this text'] - x_test = ['This text has some unknown words'] - - tokenizer = text.Tokenizer(num_words=3, oov_token='') - - tokenizer.fit_on_texts(x_train) - x_test_seq = tokenizer.texts_to_sequences(x_test) - trans_text = tokenizer.sequences_to_texts(x_test_seq) - self.assertEqual(trans_text, ['this ']) - - def test_sequences_to_texts_no_num_words(self): - x_train = ['This text has only known words this text'] - x_test = ['This text has some unknown words'] - - tokenizer = text.Tokenizer(oov_token='') - - tokenizer.fit_on_texts(x_train) - x_test_seq = tokenizer.texts_to_sequences(x_test) - trans_text = tokenizer.sequences_to_texts(x_test_seq) - self.assertEqual(trans_text, ['this text has words']) - - def test_sequences_to_texts_no_oov_token(self): - x_train = ['This text has only known words this text'] - x_test = ['This text has some unknown words'] - - tokenizer = text.Tokenizer(num_words=3) - - tokenizer.fit_on_texts(x_train) - x_test_seq = tokenizer.texts_to_sequences(x_test) - trans_text = tokenizer.sequences_to_texts(x_test_seq) - self.assertEqual(trans_text, ['this text']) - - def test_sequences_to_texts_no_num_words_no_oov_token(self): - x_train = ['This text has only known words this text'] - x_test = ['This text has some unknown words'] - - tokenizer = text.Tokenizer() - - tokenizer.fit_on_texts(x_train) - x_test_seq = tokenizer.texts_to_sequences(x_test) - trans_text = tokenizer.sequences_to_texts(x_test_seq) - self.assertEqual(trans_text, ['this text has words']) - - def test_sequences_to_texts(self): - texts = [ - 'The cat sat on the mat.', 'The dog sat on the log.', - 'Dogs and cats living together.' 
- ] - tokenizer = text.Tokenizer(num_words=10, oov_token='') - tokenizer.fit_on_texts(texts) - tokenized_text = tokenizer.texts_to_sequences(texts) - trans_text = tokenizer.sequences_to_texts(tokenized_text) - self.assertEqual(trans_text, [ - 'the cat sat on the mat', 'the dog sat on the log', - 'dogs ' - ]) - - def test_tokenizer_lower_flag(self): - """Tests for `lower` flag in text.Tokenizer.""" - # word level tokenizer with sentences as texts - word_tokenizer = text.Tokenizer(lower=True) - texts = [ - 'The cat sat on the mat.', 'The dog sat on the log.', - 'Dog and Cat living Together.' - ] - word_tokenizer.fit_on_texts(texts) - expected_word_counts = collections.OrderedDict([('the', 4), ('cat', 2), - ('sat', 2), ('on', 2), - ('mat', 1), ('dog', 2), - ('log', 1), ('and', 1), - ('living', 1), - ('together', 1)]) - self.assertEqual(word_tokenizer.word_counts, expected_word_counts) - - # word level tokenizer with word_sequences as texts - word_tokenizer = text.Tokenizer(lower=True) - word_sequences = [['The', 'cat', 'is', 'sitting'], - ['The', 'dog', 'is', 'standing']] - word_tokenizer.fit_on_texts(word_sequences) - expected_word_counts = collections.OrderedDict([('the', 2), ('cat', 1), - ('is', 2), ('sitting', 1), - ('dog', 1), - ('standing', 1)]) - self.assertEqual(word_tokenizer.word_counts, expected_word_counts) - - # char level tokenizer with sentences as texts - char_tokenizer = text.Tokenizer(lower=True, char_level=True) - texts = [ - 'The cat sat on the mat.', 'The dog sat on the log.', - 'Dog and Cat living Together.' - ] - char_tokenizer.fit_on_texts(texts) - expected_word_counts = collections.OrderedDict([ - ('t', 11), ('h', 5), ('e', 6), (' ', 14), ('c', 2), ('a', 6), ('s', 2), - ('o', 6), ('n', 4), ('m', 1), ('.', 3), ('d', 3), ('g', 5), ('l', 2), - ('i', 2), ('v', 1), ('r', 1) - ]) - self.assertEqual(char_tokenizer.word_counts, expected_word_counts) - - -if __name__ == '__main__': - tf.test.main() + def test_one_hot(self): + sample_text = "The cat sat on the mat." + encoded = text.one_hot(sample_text, 5) + self.assertLen(encoded, 6) + self.assertLessEqual(np.max(encoded), 4) + self.assertGreaterEqual(np.min(encoded), 0) + + sample_text = "The-cat-sat-on-the-mat" + encoded2 = text.one_hot( + sample_text, 5, analyzer=lambda t: t.lower().split("-") + ) + self.assertEqual(encoded, encoded2) + self.assertLen(encoded, 6) + self.assertLessEqual(np.max(encoded), 4) + self.assertGreaterEqual(np.min(encoded), 0) + + def test_hashing_trick_hash(self): + sample_text = "The cat sat on the mat." + encoded = text.hashing_trick(sample_text, 5) + self.assertLen(encoded, 6) + self.assertLessEqual(np.max(encoded), 4) + self.assertGreaterEqual(np.min(encoded), 1) + + def test_hashing_trick_md5(self): + sample_text = "The cat sat on the mat." 
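+        # 'md5' selects the stable hashlib-based hash, so the indices checked
+        # below are reproducible across runs, unlike Python's per-process
+        # salted built-in hash() used by the default hash_function.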
+ encoded = text.hashing_trick(sample_text, 5, hash_function="md5") + self.assertLen(encoded, 6) + self.assertLessEqual(np.max(encoded), 4) + self.assertGreaterEqual(np.min(encoded), 1) + + def test_tokenizer(self): + sample_texts = [ + "The cat sat on the mat.", + "The dog sat on the log.", + "Dogs and cats living together.", + ] + tokenizer = text.Tokenizer(num_words=10) + tokenizer.fit_on_texts(sample_texts) + + sequences = [] + for seq in tokenizer.texts_to_sequences_generator(sample_texts): + sequences.append(seq) + self.assertLess(np.max(np.max(np.asarray(sequences, dtype=object))), 10) + self.assertEqual(np.min(np.min(np.asarray(sequences, dtype=object))), 1) + + tokenizer.fit_on_sequences(sequences) + + for mode in ["binary", "count", "tfidf", "freq"]: + tokenizer.texts_to_matrix(sample_texts, mode) + + def test_tokenizer_serde_no_fitting(self): + tokenizer = text.Tokenizer(num_words=100) + + tokenizer_json = tokenizer.to_json() + recovered = text.tokenizer_from_json(tokenizer_json) + + self.assertEqual(tokenizer.get_config(), recovered.get_config()) + + self.assertEqual(tokenizer.word_docs, recovered.word_docs) + self.assertEqual(tokenizer.word_counts, recovered.word_counts) + self.assertEqual(tokenizer.word_index, recovered.word_index) + self.assertEqual(tokenizer.index_word, recovered.index_word) + self.assertEqual(tokenizer.index_docs, recovered.index_docs) + + def test_tokenizer_serde_fitting(self): + sample_texts = [ + "There was a time that the pieces fit, but I watched " + "them fall away", + "Mildewed and smoldering, strangled by our coveting", + "I've done the math enough to know the dangers of our second " + "guessing", + ] + tokenizer = text.Tokenizer(num_words=100) + tokenizer.fit_on_texts(sample_texts) + + seq_generator = tokenizer.texts_to_sequences_generator(sample_texts) + sequences = [seq for seq in seq_generator] + tokenizer.fit_on_sequences(sequences) + + tokenizer_json = tokenizer.to_json() + recovered = text.tokenizer_from_json(tokenizer_json) + + self.assertEqual(tokenizer.char_level, recovered.char_level) + self.assertEqual(tokenizer.document_count, recovered.document_count) + self.assertEqual(tokenizer.filters, recovered.filters) + self.assertEqual(tokenizer.lower, recovered.lower) + self.assertEqual(tokenizer.num_words, recovered.num_words) + self.assertEqual(tokenizer.oov_token, recovered.oov_token) + + self.assertEqual(tokenizer.word_docs, recovered.word_docs) + self.assertEqual(tokenizer.word_counts, recovered.word_counts) + self.assertEqual(tokenizer.word_index, recovered.word_index) + self.assertEqual(tokenizer.index_word, recovered.index_word) + self.assertEqual(tokenizer.index_docs, recovered.index_docs) + + def test_sequential_fit(self): + texts = [ + "The cat sat on the mat.", + "The dog sat on the log.", + "Dogs and cats living together.", + ] + word_sequences = [ + ["The", "cat", "is", "sitting"], + ["The", "dog", "is", "standing"], + ] + + tokenizer = text.Tokenizer() + tokenizer.fit_on_texts(texts) + tokenizer.fit_on_texts(word_sequences) + + self.assertEqual(tokenizer.document_count, 5) + + tokenizer.texts_to_matrix(texts) + tokenizer.texts_to_matrix(word_sequences) + + def test_text_to_word_sequence(self): + sample_text = "hello! ? world!" + self.assertEqual( + text.text_to_word_sequence(sample_text), ["hello", "world"] + ) + + def test_text_to_word_sequence_multichar_split(self): + sample_text = "hello!stop?world!" 
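+        # Every character in `filters` (here '!' and '?') is first replaced
+        # by the `split` token, then the string is split on it and empty
+        # pieces are dropped, leaving ['hello', 'world'].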
+ self.assertEqual( + text.text_to_word_sequence(sample_text, split="stop"), + ["hello", "world"], + ) + + def test_text_to_word_sequence_unicode(self): + sample_text = "ali! veli? kırk dokuz elli" + self.assertEqual( + text.text_to_word_sequence(sample_text), + ["ali", "veli", "kırk", "dokuz", "elli"], + ) + + def test_text_to_word_sequence_unicode_multichar_split(self): + sample_text = "ali!stopveli?stopkırkstopdokuzstopelli" + self.assertEqual( + text.text_to_word_sequence(sample_text, split="stop"), + ["ali", "veli", "kırk", "dokuz", "elli"], + ) + + def test_tokenizer_unicode(self): + sample_texts = [ + "ali veli kırk dokuz elli", + "ali veli kırk dokuz elli veli kırk dokuz", + ] + tokenizer = text.Tokenizer(num_words=5) + tokenizer.fit_on_texts(sample_texts) + + self.assertLen(tokenizer.word_counts, 5) + + def test_tokenizer_oov_flag(self): + """Test of Out of Vocabulary (OOV) flag in text.Tokenizer.""" + x_train = ["This text has only known words"] + x_test = ["This text has some unknown words"] # 2 OOVs: some, unknown + + # Default, without OOV flag + tokenizer = text.Tokenizer() + tokenizer.fit_on_texts(x_train) + x_test_seq = tokenizer.texts_to_sequences(x_test) + self.assertLen(x_test_seq[0], 4) # discards 2 OOVs + + # With OOV feature + tokenizer = text.Tokenizer(oov_token="") + tokenizer.fit_on_texts(x_train) + x_test_seq = tokenizer.texts_to_sequences(x_test) + self.assertLen(x_test_seq[0], 6) # OOVs marked in place + + def test_tokenizer_oov_flag_and_num_words(self): + x_train = ["This text has only known words this text"] + x_test = ["This text has some unknown words"] + + tokenizer = text.Tokenizer(num_words=3, oov_token="") + tokenizer.fit_on_texts(x_train) + x_test_seq = tokenizer.texts_to_sequences(x_test) + trans_text = " ".join(tokenizer.index_word[t] for t in x_test_seq[0]) + self.assertLen(x_test_seq[0], 6) + self.assertEqual(trans_text, "this ") + + def test_sequences_to_texts_with_num_words_and_oov_token(self): + x_train = ["This text has only known words this text"] + x_test = ["This text has some unknown words"] + + tokenizer = text.Tokenizer(num_words=3, oov_token="") + + tokenizer.fit_on_texts(x_train) + x_test_seq = tokenizer.texts_to_sequences(x_test) + trans_text = tokenizer.sequences_to_texts(x_test_seq) + self.assertEqual(trans_text, ["this "]) + + def test_sequences_to_texts_no_num_words(self): + x_train = ["This text has only known words this text"] + x_test = ["This text has some unknown words"] + + tokenizer = text.Tokenizer(oov_token="") + + tokenizer.fit_on_texts(x_train) + x_test_seq = tokenizer.texts_to_sequences(x_test) + trans_text = tokenizer.sequences_to_texts(x_test_seq) + self.assertEqual(trans_text, ["this text has words"]) + + def test_sequences_to_texts_no_oov_token(self): + x_train = ["This text has only known words this text"] + x_test = ["This text has some unknown words"] + + tokenizer = text.Tokenizer(num_words=3) + + tokenizer.fit_on_texts(x_train) + x_test_seq = tokenizer.texts_to_sequences(x_test) + trans_text = tokenizer.sequences_to_texts(x_test_seq) + self.assertEqual(trans_text, ["this text"]) + + def test_sequences_to_texts_no_num_words_no_oov_token(self): + x_train = ["This text has only known words this text"] + x_test = ["This text has some unknown words"] + + tokenizer = text.Tokenizer() + + tokenizer.fit_on_texts(x_train) + x_test_seq = tokenizer.texts_to_sequences(x_test) + trans_text = tokenizer.sequences_to_texts(x_test_seq) + self.assertEqual(trans_text, ["this text has words"]) + + def test_sequences_to_texts(self): 
+    def test_sequences_to_texts(self):
+        texts = [
+            "The cat sat on the mat.",
+            "The dog sat on the log.",
+            "Dogs and cats living together.",
+        ]
+        tokenizer = text.Tokenizer(num_words=10, oov_token="<unk>")
+        tokenizer.fit_on_texts(texts)
+        tokenized_text = tokenizer.texts_to_sequences(texts)
+        trans_text = tokenizer.sequences_to_texts(tokenized_text)
+        self.assertEqual(
+            trans_text,
+            [
+                "the cat sat on the mat",
+                "the dog sat on the log",
+                "dogs <unk> <unk> <unk> <unk>",
+            ],
+        )
+
+    def test_tokenizer_lower_flag(self):
+        """Tests for `lower` flag in text.Tokenizer."""
+        # word level tokenizer with sentences as texts
+        word_tokenizer = text.Tokenizer(lower=True)
+        texts = [
+            "The cat sat on the mat.",
+            "The dog sat on the log.",
+            "Dog and Cat living Together.",
+        ]
+        word_tokenizer.fit_on_texts(texts)
+        expected_word_counts = collections.OrderedDict(
+            [
+                ("the", 4),
+                ("cat", 2),
+                ("sat", 2),
+                ("on", 2),
+                ("mat", 1),
+                ("dog", 2),
+                ("log", 1),
+                ("and", 1),
+                ("living", 1),
+                ("together", 1),
+            ]
+        )
+        self.assertEqual(word_tokenizer.word_counts, expected_word_counts)
+
+        # word level tokenizer with word_sequences as texts
+        word_tokenizer = text.Tokenizer(lower=True)
+        word_sequences = [
+            ["The", "cat", "is", "sitting"],
+            ["The", "dog", "is", "standing"],
+        ]
+        word_tokenizer.fit_on_texts(word_sequences)
+        expected_word_counts = collections.OrderedDict(
+            [
+                ("the", 2),
+                ("cat", 1),
+                ("is", 2),
+                ("sitting", 1),
+                ("dog", 1),
+                ("standing", 1),
+            ]
+        )
+        self.assertEqual(word_tokenizer.word_counts, expected_word_counts)
+
+        # char level tokenizer with sentences as texts
+        char_tokenizer = text.Tokenizer(lower=True, char_level=True)
+        texts = [
+            "The cat sat on the mat.",
+            "The dog sat on the log.",
+            "Dog and Cat living Together.",
+        ]
+        char_tokenizer.fit_on_texts(texts)
+        expected_word_counts = collections.OrderedDict(
+            [
+                ("t", 11),
+                ("h", 5),
+                ("e", 6),
+                (" ", 14),
+                ("c", 2),
+                ("a", 6),
+                ("s", 2),
+                ("o", 6),
+                ("n", 4),
+                ("m", 1),
+                (".", 3),
+                ("d", 3),
+                ("g", 5),
+                ("l", 2),
+                ("i", 2),
+                ("v", 1),
+                ("r", 1),
+            ]
+        )
+        self.assertEqual(char_tokenizer.word_counts, expected_word_counts)
+
+
+if __name__ == "__main__":
+    tf.test.main()
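Worth noting while reading the multichar-split tests above: `text_to_word_sequence` first maps every character in `filters` to the `split` string and only then splits, discarding empty fragments, which is why punctuation glued to the separator never leaks into tokens. A quick sketch with a hypothetical input:

```python
from keras.preprocessing import text

# '!' and '?' sit in the default `filters`, so each is rewritten to the
# split string "stop" before splitting; empty fragments are dropped.
print(text.text_to_word_sequence("hello!stopworld?", split="stop"))
# ['hello', 'world']
```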
diff --git a/keras/protobuf/BUILD b/keras/protobuf/BUILD
index 413dcb74d90b..e2f9c1f3ba70 100644
--- a/keras/protobuf/BUILD
+++ b/keras/protobuf/BUILD
@@ -4,6 +4,7 @@ load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
 package(
+    # copybara:uncomment default_applicable_licenses = ["//keras:license"],
     default_visibility = [
         "//keras:friends",
     ],
diff --git a/keras/regularizers.py b/keras/regularizers.py
index 627f481c3eb8..763b99097000 100644
--- a/keras/regularizers.py
+++ b/keras/regularizers.py
@@ -13,386 +13,403 @@
 # limitations under the License.
 # ==============================================================================
 """Built-in regularizers."""
-# pylint: disable=g-classes-have-attributes
-# pylint: disable=invalid-name
-import tensorflow.compat.v2 as tf
 import math
+import warnings
+
+import tensorflow.compat.v2 as tf

 from keras import backend
-from keras.utils.generic_utils import deserialize_keras_object
-from keras.utils.generic_utils import serialize_keras_object
+from keras.saving.legacy import serialization as legacy_serialization
+from keras.saving.serialization_lib import deserialize_keras_object
+from keras.saving.serialization_lib import serialize_keras_object
+
+# isort: off
 from tensorflow.python.util.tf_export import keras_export


 def _check_penalty_number(x):
-  """check penalty number availability, raise ValueError if failed."""
-  if not isinstance(x, (float, int)):
-    raise ValueError(
-        f'Value {x} is not a valid regularization penalty number, '
-        'expected an int or float value.')
+    """Check penalty number availability, raise ValueError if failed."""
+    if not isinstance(x, (float, int)):
+        raise ValueError(
+            f"Value {x} is not a valid regularization penalty number, "
+            "expected an int or float value."
+        )

-  if math.isinf(x) or math.isnan(x):
-    raise ValueError(
-        f'Value {x} is not a valid regularization penalty number, '
-        'an infinite number or NaN are not valid values.')
+    if math.isinf(x) or math.isnan(x):
+        raise ValueError(
+            f"Value {x} is not a valid regularization penalty number, "
+            "an infinite number or NaN are not valid values."
+        )


 def _none_to_default(inputs, default):
-  return default if inputs is None else default
+    return default if inputs is None else inputs


-@keras_export('keras.regularizers.Regularizer')
+@keras_export("keras.regularizers.Regularizer")
 class Regularizer:
-  """Regularizer base class.
-
-  Regularizers allow you to apply penalties on layer parameters or layer
-  activity during optimization. These penalties are summed into the loss
-  function that the network optimizes.
-
-  Regularization penalties are applied on a per-layer basis. The exact API will
-  depend on the layer, but many layers (e.g. `Dense`, `Conv1D`, `Conv2D` and
-  `Conv3D`) have a unified API.
-
-  These layers expose 3 keyword arguments:
-
-  - `kernel_regularizer`: Regularizer to apply a penalty on the layer's kernel
-  - `bias_regularizer`: Regularizer to apply a penalty on the layer's bias
-  - `activity_regularizer`: Regularizer to apply a penalty on the layer's output
-
-  All layers (including custom layers) expose `activity_regularizer` as a
-  settable property, whether or not it is in the constructor arguments.
-
-  The value returned by the `activity_regularizer` is divided by the input
-  batch size so that the relative weighting between the weight regularizers and
-  the activity regularizers does not change with the batch size.
-
-  You can access a layer's regularization penalties by calling `layer.losses`
-  after calling the layer on inputs.
-
-  ## Example
-
-  >>> layer = tf.keras.layers.Dense(
-  ...     5, input_dim=5,
-  ...     kernel_initializer='ones',
-  ...     kernel_regularizer=tf.keras.regularizers.L1(0.01),
-  ...
activity_regularizer=tf.keras.regularizers.L2(0.01)) - >>> tensor = tf.ones(shape=(5, 5)) * 2.0 - >>> out = layer(tensor) - - >>> # The kernel regularization term is 0.25 - >>> # The activity regularization term (after dividing by the batch size) is 5 - >>> tf.math.reduce_sum(layer.losses) - - - ## Available penalties - - ```python - tf.keras.regularizers.L1(0.3) # L1 Regularization Penalty - tf.keras.regularizers.L2(0.1) # L2 Regularization Penalty - tf.keras.regularizers.L1L2(l1=0.01, l2=0.01) # L1 + L2 penalties - ``` - - ## Directly calling a regularizer - - Compute a regularization loss on a tensor by directly calling a regularizer - as if it is a one-argument function. - - E.g. - >>> regularizer = tf.keras.regularizers.L2(2.) - >>> tensor = tf.ones(shape=(5, 5)) - >>> regularizer(tensor) - - - - ## Developing new regularizers - - Any function that takes in a weight matrix and returns a scalar - tensor can be used as a regularizer, e.g.: - - >>> @tf.keras.utils.register_keras_serializable(package='Custom', name='l1') - ... def l1_reg(weight_matrix): - ... return 0.01 * tf.math.reduce_sum(tf.math.abs(weight_matrix)) - ... - >>> layer = tf.keras.layers.Dense(5, input_dim=5, - ... kernel_initializer='ones', kernel_regularizer=l1_reg) - >>> tensor = tf.ones(shape=(5, 5)) - >>> out = layer(tensor) - >>> layer.losses - [] - - Alternatively, you can write your custom regularizers in an - object-oriented way by extending this regularizer base class, e.g.: - - >>> @tf.keras.utils.register_keras_serializable(package='Custom', name='l2') - ... class L2Regularizer(tf.keras.regularizers.Regularizer): - ... def __init__(self, l2=0.): # pylint: disable=redefined-outer-name - ... self.l2 = l2 - ... - ... def __call__(self, x): - ... return self.l2 * tf.math.reduce_sum(tf.math.square(x)) - ... - ... def get_config(self): - ... return {'l2': float(self.l2)} - ... - >>> layer = tf.keras.layers.Dense( - ... 5, input_dim=5, kernel_initializer='ones', - ... kernel_regularizer=L2Regularizer(l2=0.5)) - - >>> tensor = tf.ones(shape=(5, 5)) - >>> out = layer(tensor) - >>> layer.losses - [] - - ### A note on serialization and deserialization: - - Registering the regularizers as serializable is optional if you are just - training and executing models, exporting to and from SavedModels, or saving - and loading weight checkpoints. - - Registration is required for saving and - loading models to HDF5 format, Keras model cloning, some visualization - utilities, and exporting models to and from JSON. If using this functionality, - you must make sure any python process running your model has also defined - and registered your custom regularizer. - """ - - def __call__(self, x): - """Compute a regularization penalty from an input tensor.""" - return 0. - - @classmethod - def from_config(cls, config): - """Creates a regularizer from its config. - - This method is the reverse of `get_config`, - capable of instantiating the same regularizer from the config - dictionary. - - This method is used by Keras `model_to_estimator`, saving and - loading models to HDF5 formats, Keras model cloning, some visualization - utilities, and exporting models to and from JSON. + """Regularizer base class. + + Regularizers allow you to apply penalties on layer parameters or layer + activity during optimization. These penalties are summed into the loss + function that the network optimizes. + + Regularization penalties are applied on a per-layer basis. The exact API + will depend on the layer, but many layers (e.g. 
`Dense`, `Conv1D`, `Conv2D` + and `Conv3D`) have a unified API. + + These layers expose 3 keyword arguments: + + - `kernel_regularizer`: Regularizer to apply a penalty on the layer's kernel + - `bias_regularizer`: Regularizer to apply a penalty on the layer's bias + - `activity_regularizer`: Regularizer to apply a penalty on the layer's + output + + All layers (including custom layers) expose `activity_regularizer` as a + settable property, whether or not it is in the constructor arguments. + + The value returned by the `activity_regularizer` is divided by the input + batch size so that the relative weighting between the weight regularizers + and the activity regularizers does not change with the batch size. + + You can access a layer's regularization penalties by calling `layer.losses` + after calling the layer on inputs. + + ## Example + + >>> layer = tf.keras.layers.Dense( + ... 5, input_dim=5, + ... kernel_initializer='ones', + ... kernel_regularizer=tf.keras.regularizers.L1(0.01), + ... activity_regularizer=tf.keras.regularizers.L2(0.01)) + >>> tensor = tf.ones(shape=(5, 5)) * 2.0 + >>> out = layer(tensor) + + >>> # The kernel regularization term is 0.25 + >>> # The activity regularization term (after dividing by the batch size) + >>> # is 5 + >>> tf.math.reduce_sum(layer.losses) + + + ## Available penalties + + ```python + tf.keras.regularizers.L1(0.3) # L1 Regularization Penalty + tf.keras.regularizers.L2(0.1) # L2 Regularization Penalty + tf.keras.regularizers.L1L2(l1=0.01, l2=0.01) # L1 + L2 penalties + ``` + + ## Directly calling a regularizer + + Compute a regularization loss on a tensor by directly calling a regularizer + as if it is a one-argument function. + + E.g. + >>> regularizer = tf.keras.regularizers.L2(2.) + >>> tensor = tf.ones(shape=(5, 5)) + >>> regularizer(tensor) + + + + ## Developing new regularizers + + Any function that takes in a weight matrix and returns a scalar + tensor can be used as a regularizer, e.g.: + + >>> @tf.keras.utils.register_keras_serializable(package='Custom', name='l1') + ... def l1_reg(weight_matrix): + ... return 0.01 * tf.math.reduce_sum(tf.math.abs(weight_matrix)) + ... + >>> layer = tf.keras.layers.Dense(5, input_dim=5, + ... kernel_initializer='ones', kernel_regularizer=l1_reg) + >>> tensor = tf.ones(shape=(5, 5)) + >>> out = layer(tensor) + >>> layer.losses + [] + + Alternatively, you can write your custom regularizers in an + object-oriented way by extending this regularizer base class, e.g.: + + >>> @tf.keras.utils.register_keras_serializable(package='Custom', name='l2') + ... class L2Regularizer(tf.keras.regularizers.Regularizer): + ... def __init__(self, l2=0.): + ... self.l2 = l2 + ... + ... def __call__(self, x): + ... return self.l2 * tf.math.reduce_sum(tf.math.square(x)) + ... + ... def get_config(self): + ... return {'l2': float(self.l2)} + ... + >>> layer = tf.keras.layers.Dense( + ... 5, input_dim=5, kernel_initializer='ones', + ... kernel_regularizer=L2Regularizer(l2=0.5)) + + >>> tensor = tf.ones(shape=(5, 5)) + >>> out = layer(tensor) + >>> layer.losses + [] + + ### A note on serialization and deserialization: + + Registering the regularizers as serializable is optional if you are just + training and executing models, exporting to and from SavedModels, or saving + and loading weight checkpoints. + + Registration is required for saving and + loading models to HDF5 format, Keras model cloning, some visualization + utilities, and exporting models to and from JSON. 
If using this
+    functionality, you must make sure any python process running your model has
+    also defined and registered your custom regularizer.
+    """

-  def __call__(self, x):
-    """Compute a regularization penalty from an input tensor."""
-    return 0.
+    def __call__(self, x):
+        """Compute a regularization penalty from an input tensor."""
+        return 0.0

-  @classmethod
-  def from_config(cls, config):
-    """Creates a regularizer from its config.
-
-    This method is the reverse of `get_config`,
-    capable of instantiating the same regularizer from the config
-    dictionary.
-
-    This method is used by Keras `model_to_estimator`, saving and
-    loading models to HDF5 formats, Keras model cloning, some visualization
-    utilities, and exporting models to and from JSON.
-
-    Args:
-      config: A Python dictionary, typically the output of get_config.
-
-    Returns:
-      A regularizer instance.
-    """
-    return cls(**config)
+    @classmethod
+    def from_config(cls, config):
+        """Creates a regularizer from its config.
+
+        This method is the reverse of `get_config`,
+        capable of instantiating the same regularizer from the config
+        dictionary.
+
+        This method is used by Keras `model_to_estimator`, saving and
+        loading models to HDF5 formats, Keras model cloning, some visualization
+        utilities, and exporting models to and from JSON.
+
+        Args:
+            config: A Python dictionary, typically the output of get_config.
+
+        Returns:
+            A regularizer instance.
+        """
+        return cls(**config)

-  def get_config(self):
-    """Returns the config of the regularizer.
-
-    An regularizer config is a Python dictionary (serializable)
-    containing all configuration parameters of the regularizer.
-    The same regularizer can be reinstantiated later
-    (without any saved state) from this configuration.
-
-    This method is optional if you are just training and executing models,
-    exporting to and from SavedModels, or using weight checkpoints.
-
-    This method is required for Keras `model_to_estimator`, saving and
-    loading models to HDF5 formats, Keras model cloning, some visualization
-    utilities, and exporting models to and from JSON.
-
-    Returns:
-      Python dictionary.
-    """
-    raise NotImplementedError(f'{self} does not implement get_config()')
+    def get_config(self):
+        """Returns the config of the regularizer.
+
+        A regularizer config is a Python dictionary (serializable)
+        containing all configuration parameters of the regularizer.
+        The same regularizer can be reinstantiated later
+        (without any saved state) from this configuration.
+
+        This method is optional if you are just training and executing models,
+        exporting to and from SavedModels, or using weight checkpoints.
+
+        This method is required for Keras `model_to_estimator`, saving and
+        loading models to HDF5 formats, Keras model cloning, some visualization
+        utilities, and exporting models to and from JSON.
+
+        Returns:
+            Python dictionary.
+        """
+        raise NotImplementedError(f"{self} does not implement get_config()")


-@keras_export('keras.regularizers.L1L2')
+@keras_export("keras.regularizers.L1L2")
 class L1L2(Regularizer):
-  """A regularizer that applies both L1 and L2 regularization penalties.
-
-  The L1 regularization penalty is computed as:
-  `loss = l1 * reduce_sum(abs(x))`
-
-  The L2 regularization penalty is computed as
-  `loss = l2 * reduce_sum(square(x))`
-
-  L1L2 may be passed to a layer as a string identifier:
-
-  >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1_l2')
-
-  In this case, the default values used are `l1=0.01` and `l2=0.01`.
-
-  Arguments:
-    l1: Float; L1 regularization factor.
-    l2: Float; L2 regularization factor.
-  """
+    """A regularizer that applies both L1 and L2 regularization penalties.
+
+    The L1 regularization penalty is computed as:
+    `loss = l1 * reduce_sum(abs(x))`
+
+    The L2 regularization penalty is computed as
+    `loss = l2 * reduce_sum(square(x))`
+
+    L1L2 may be passed to a layer as a string identifier:
+
+    >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1_l2')
+
+    In this case, the default values used are `l1=0.01` and `l2=0.01`.
+
+    Arguments:
+        l1: Float; L1 regularization factor.
+        l2: Float; L2 regularization factor.
+    """

-  def __init__(self, l1=0., l2=0.):  # pylint: disable=redefined-outer-name
-    # The default value for l1 and l2 are different from the value in l1_l2
-    # for backward compatibility reason. Eg, L1L2(l2=0.1) will only have l2
-    # and no l1 penalty.
-    l1 = 0. if l1 is None else l1
-    l2 = 0. if l2 is None else l2
-    _check_penalty_number(l1)
-    _check_penalty_number(l2)
+    def __init__(self, l1=0.0, l2=0.0):
+        # The default values for l1 and l2 differ from those in l1_l2, for
+        # backward compatibility reasons. E.g., L1L2(l2=0.1) will only have
+        # an l2 penalty and no l1 penalty.
+        l1 = 0.0 if l1 is None else l1
+        l2 = 0.0 if l2 is None else l2
+        _check_penalty_number(l1)
+        _check_penalty_number(l2)

-    self.l1 = backend.cast_to_floatx(l1)
-    self.l2 = backend.cast_to_floatx(l2)
+        self.l1 = backend.cast_to_floatx(l1)
+        self.l2 = backend.cast_to_floatx(l2)

-  def __call__(self, x):
-    regularization = backend.constant(0., dtype=x.dtype)
-    if self.l1:
-      regularization += self.l1 * tf.reduce_sum(tf.abs(x))
-    if self.l2:
-      regularization += self.l2 * tf.reduce_sum(tf.square(x))
-    return regularization
+    def __call__(self, x):
+        regularization = backend.constant(0.0, dtype=x.dtype)
+        if self.l1:
+            regularization += self.l1 * tf.reduce_sum(tf.abs(x))
+        if self.l2:
+            # equivalent to "self.l2 * tf.reduce_sum(tf.square(x))"
+            regularization += 2.0 * self.l2 * tf.nn.l2_loss(x)
+        return regularization

-  def get_config(self):
-    return {'l1': float(self.l1), 'l2': float(self.l2)}
+    def get_config(self):
+        return {"l1": float(self.l1), "l2": float(self.l2)}


-@keras_export('keras.regularizers.L1', 'keras.regularizers.l1')
+@keras_export("keras.regularizers.L1", "keras.regularizers.l1")
 class L1(Regularizer):
-  """A regularizer that applies a L1 regularization penalty.
-
-  The L1 regularization penalty is computed as:
-  `loss = l1 * reduce_sum(abs(x))`
-
-  L1 may be passed to a layer as a string identifier:
-
-  >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1')
-
-  In this case, the default value used is `l1=0.01`.
-
-  Arguments:
-    l1: Float; L1 regularization factor.
-  """
+    """A regularizer that applies an L1 regularization penalty.
+
+    The L1 regularization penalty is computed as:
+    `loss = l1 * reduce_sum(abs(x))`
+
+    L1 may be passed to a layer as a string identifier:
+
+    >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1')
+
+    In this case, the default value used is `l1=0.01`.
+
+    Arguments:
+        l1: Float; L1 regularization factor.
+    """

-  def __init__(self, l1=0.01, **kwargs):  # pylint: disable=redefined-outer-name
-    l1 = kwargs.pop('l', l1)  # Backwards compatibility
-    if kwargs:
-      raise TypeError(f'Argument(s) not recognized: {kwargs}')
+    def __init__(self, l1=0.01, **kwargs):
+        l1 = kwargs.pop("l", l1)  # Backwards compatibility
+        if kwargs:
+            raise TypeError(f"Argument(s) not recognized: {kwargs}")

-    l1 = 0.01 if l1 is None else l1
-    _check_penalty_number(l1)
+        l1 = 0.01 if l1 is None else l1
+        _check_penalty_number(l1)

-    self.l1 = backend.cast_to_floatx(l1)
+        self.l1 = backend.cast_to_floatx(l1)

-  def __call__(self, x):
-    return self.l1 * tf.reduce_sum(tf.abs(x))
+    def __call__(self, x):
+        return self.l1 * tf.reduce_sum(tf.abs(x))

-  def get_config(self):
-    return {'l1': float(self.l1)}
+    def get_config(self):
+        return {"l1": float(self.l1)}

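The one behavioral nuance in this otherwise mechanical reformat is the switch in `L1L2.__call__` above (and in `L2.__call__` below) from `self.l2 * tf.reduce_sum(tf.square(x))` to `2.0 * self.l2 * tf.nn.l2_loss(x)`. Since `tf.nn.l2_loss(t)` computes `sum(t ** 2) / 2`, the two forms are mathematically identical; the fused op may simply be cheaper on some backends. A quick numeric check (a sketch, not part of the diff):

```python
import tensorflow as tf

x = tf.constant([[1.0, -2.0], [3.0, 4.0]])
l2 = 0.01

a = l2 * tf.reduce_sum(tf.square(x))  # original formulation
b = 2.0 * l2 * tf.nn.l2_loss(x)       # tf.nn.l2_loss(x) == sum(x**2) / 2

tf.debugging.assert_near(a, b)  # both are 0.01 * (1 + 4 + 9 + 16) = 0.3
```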
+ """A regularizer that applies a L2 regularization penalty. - The L2 regularization penalty is computed as: - `loss = l2 * reduce_sum(square(x))` + The L2 regularization penalty is computed as: + `loss = l2 * reduce_sum(square(x))` - L2 may be passed to a layer as a string identifier: + L2 may be passed to a layer as a string identifier: - >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l2') + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l2') - In this case, the default value used is `l2=0.01`. + In this case, the default value used is `l2=0.01`. - Arguments: - l2: Float; L2 regularization factor. - """ + Arguments: + l2: Float; L2 regularization factor. + """ - def __init__(self, l2=0.01, **kwargs): # pylint: disable=redefined-outer-name - l2 = kwargs.pop('l', l2) # Backwards compatibility - if kwargs: - raise TypeError(f'Argument(s) not recognized: {kwargs}') + def __init__(self, l2=0.01, **kwargs): + l2 = kwargs.pop("l", l2) # Backwards compatibility + if kwargs: + raise TypeError(f"Argument(s) not recognized: {kwargs}") - l2 = 0.01 if l2 is None else l2 - _check_penalty_number(l2) + l2 = 0.01 if l2 is None else l2 + _check_penalty_number(l2) - self.l2 = backend.cast_to_floatx(l2) + self.l2 = backend.cast_to_floatx(l2) - def __call__(self, x): - return self.l2 * tf.reduce_sum(tf.square(x)) + def __call__(self, x): + # equivalent to "self.l2 * tf.reduce_sum(tf.square(x))" + return 2.0 * self.l2 * tf.nn.l2_loss(x) - def get_config(self): - return {'l2': float(self.l2)} + def get_config(self): + return {"l2": float(self.l2)} @keras_export( - 'keras.regularizers.OrthogonalRegularizer', - 'keras.regularizers.orthogonal_regularizer', - v1=[]) + "keras.regularizers.OrthogonalRegularizer", + "keras.regularizers.orthogonal_regularizer", + v1=[], +) class OrthogonalRegularizer(Regularizer): - """A regularizer that encourages input vectors to be orthogonal to each other. - - It can be applied to either the rows of a matrix (`mode="rows"`) or its - columns (`mode="columns"`). When applied to a `Dense` kernel of shape - `(input_dim, units)`, rows mode will seek to make the feature vectors - (i.e. the basis of the output space) orthogonal to each other. - - Arguments: - factor: Float. The regularization factor. The regularization penalty will - be proportional to `factor` times the mean of the dot products between - the L2-normalized rows (if `mode="rows"`, or columns if `mode="columns"`) - of the inputs, excluding the product of each row/column with itself. - Defaults to 0.01. - mode: String, one of `{"rows", "columns"}`. Defaults to `"rows"`. In rows - mode, the regularization effect seeks to make the rows of the input - orthogonal to each other. In columns mode, it seeks to make the columns - of the input orthogonal to each other. - - Example: - - >>> regularizer = tf.keras.regularizers.OrthogonalRegularizer(factor=0.01) - >>> layer = tf.keras.layers.Dense(units=4, kernel_regularizer=regularizer) - """ - - def __init__(self, factor=0.01, mode='rows'): - _check_penalty_number(factor) - self.factor = backend.cast_to_floatx(factor) - if mode not in {'rows', 'columns'}: - raise ValueError('Invalid value for argument `mode`. Expected one of ' - f'{{"rows", "columns"}}. Received: mode={mode}') - self.mode = mode - - def __call__(self, inputs): - if inputs.shape.rank != 2: - raise ValueError( - 'Inputs to OrthogonalRegularizer must have rank 2. 
Received: ' - f'inputs.shape == {inputs.shape}') - if self.mode == 'rows': - inputs = tf.math.l2_normalize(inputs, axis=1) - product = tf.matmul(inputs, tf.transpose(inputs)) - size = inputs.shape[0] - else: - inputs = tf.math.l2_normalize(inputs, axis=0) - product = tf.matmul(tf.transpose(inputs), inputs) - size = inputs.shape[1] - product_no_diagonal = product * (1. - tf.eye(size, dtype=inputs.dtype)) - num_pairs = size * (size - 1.) / 2. - return self.factor * 0.5 * tf.reduce_sum( - tf.abs(product_no_diagonal)) / num_pairs - - def get_config(self): - return {'factor': float(self.factor), 'mode': self.mode} - - -@keras_export('keras.regularizers.l1_l2') -def l1_l2(l1=0.01, l2=0.01): # pylint: disable=redefined-outer-name - r"""Create a regularizer that applies both L1 and L2 penalties. - - The L1 regularization penalty is computed as: - `loss = l1 * reduce_sum(abs(x))` + """Regularizer that encourages input vectors to be orthogonal to each other. + + It can be applied to either the rows of a matrix (`mode="rows"`) or its + columns (`mode="columns"`). When applied to a `Dense` kernel of shape + `(input_dim, units)`, rows mode will seek to make the feature vectors + (i.e. the basis of the output space) orthogonal to each other. + + Arguments: + factor: Float. The regularization factor. The regularization penalty will + be proportional to `factor` times the mean of the dot products between + the L2-normalized rows (if `mode="rows"`, or columns if + `mode="columns"`) of the inputs, excluding the product of each + row/column with itself. Defaults to 0.01. + mode: String, one of `{"rows", "columns"}`. Defaults to `"rows"`. In rows + mode, the regularization effect seeks to make the rows of the input + orthogonal to each other. In columns mode, it seeks to make the columns + of the input orthogonal to each other. + + Example: + + >>> regularizer = tf.keras.regularizers.OrthogonalRegularizer(factor=0.01) + >>> layer = tf.keras.layers.Dense(units=4, kernel_regularizer=regularizer) + """ - The L2 regularization penalty is computed as: - `loss = l2 * reduce_sum(square(x))` + def __init__(self, factor=0.01, mode="rows"): + _check_penalty_number(factor) + self.factor = backend.cast_to_floatx(factor) + if mode not in {"rows", "columns"}: + raise ValueError( + "Invalid value for argument `mode`. Expected one of " + f'{{"rows", "columns"}}. Received: mode={mode}' + ) + self.mode = mode + + def __call__(self, inputs): + if inputs.shape.rank != 2: + raise ValueError( + "Inputs to OrthogonalRegularizer must have rank 2. Received: " + f"inputs.shape == {inputs.shape}" + ) + if self.mode == "rows": + inputs = tf.math.l2_normalize(inputs, axis=1) + product = tf.matmul(inputs, tf.transpose(inputs)) + size = inputs.shape[0] + else: + inputs = tf.math.l2_normalize(inputs, axis=0) + product = tf.matmul(tf.transpose(inputs), inputs) + size = inputs.shape[1] + product_no_diagonal = product * (1.0 - tf.eye(size, dtype=inputs.dtype)) + num_pairs = size * (size - 1.0) / 2.0 + return ( + self.factor + * 0.5 + * tf.reduce_sum(tf.abs(product_no_diagonal)) + / num_pairs + ) + + def get_config(self): + return {"factor": float(self.factor), "mode": self.mode} + + +@keras_export("keras.regularizers.l1_l2") +def l1_l2(l1=0.01, l2=0.01): + r"""Create a regularizer that applies both L1 and L2 penalties. + + The L1 regularization penalty is computed as: + `loss = l1 * reduce_sum(abs(x))` + + The L2 regularization penalty is computed as: + `loss = l2 * reduce_sum(square(x))` - Args: - l1: Float; L1 regularization factor. 
- l2: Float; L2 regularization factor. + Args: + l1: Float; L1 regularization factor. + l2: Float; L2 regularization factor. - Returns: - An L1L2 Regularizer with the given regularization factors. - """ - return L1L2(l1=l1, l2=l2) + Returns: + An L1L2 Regularizer with the given regularization factors. + """ + return L1L2(l1=l1, l2=l2) # Deserialization aliases. @@ -401,35 +418,56 @@ def l1_l2(l1=0.01, l2=0.01): # pylint: disable=redefined-outer-name orthogonal_regularizer = OrthogonalRegularizer -@keras_export('keras.regularizers.serialize') -def serialize(regularizer): - return serialize_keras_object(regularizer) - - -@keras_export('keras.regularizers.deserialize') -def deserialize(config, custom_objects=None): - if config == 'l1_l2': - # Special case necessary since the defaults used for "l1_l2" (string) - # differ from those of the L1L2 class. - return L1L2(l1=0.01, l2=0.01) - return deserialize_keras_object( - config, - module_objects=globals(), - custom_objects=custom_objects, - printable_module_name='regularizer') - - -@keras_export('keras.regularizers.get') +@keras_export("keras.regularizers.serialize") +def serialize(regularizer, use_legacy_format=False): + if regularizer is None: + return None + if not isinstance(regularizer, Regularizer): + warnings.warn( + "The `keras.regularizers.serialize()` API should only be used for " + "objects of type `keras.regularizers.Regularizer`. Found an " + f"instance of type {type(regularizer)}, which may lead to improper " + "serialization." + ) + if use_legacy_format: + return legacy_serialization.serialize_keras_object(regularizer) + return serialize_keras_object(regularizer) + + +@keras_export("keras.regularizers.deserialize") +def deserialize(config, custom_objects=None, use_legacy_format=False): + if config == "l1_l2": + # Special case necessary since the defaults used for "l1_l2" (string) + # differ from those of the L1L2 class. 
+ return L1L2(l1=0.01, l2=0.01) + if use_legacy_format: + return legacy_serialization.deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="regularizer", + ) + return deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="regularizer", + ) + + +@keras_export("keras.regularizers.get") def get(identifier): - """Retrieve a regularizer instance from a config or identifier.""" - if identifier is None: - return None - if isinstance(identifier, dict): - return deserialize(identifier) - elif isinstance(identifier, str): - return deserialize(str(identifier)) - elif callable(identifier): - return identifier - else: - raise ValueError( - f'Could not interpret regularizer identifier: {identifier}') + """Retrieve a regularizer instance from a config or identifier.""" + if identifier is None: + return None + if isinstance(identifier, dict): + use_legacy_format = "module" not in identifier + return deserialize(identifier, use_legacy_format=use_legacy_format) + elif isinstance(identifier, str): + return deserialize(str(identifier)) + elif callable(identifier): + return identifier + else: + raise ValueError( + f"Could not interpret regularizer identifier: {identifier}" + ) diff --git a/keras/regularizers_test.py b/keras/regularizers_test.py index 01e23092f56a..e8bc3606e12c 100644 --- a/keras/regularizers_test.py +++ b/keras/regularizers_test.py @@ -14,321 +14,369 @@ # ============================================================================== """Tests for Keras regularizers.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras -from keras.testing_infra import test_combinations from keras import regularizers +from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import np_utils - DATA_DIM = 5 NUM_CLASSES = 2 -class KerasRegularizersTest(test_combinations.TestCase, - parameterized.TestCase): - - def create_model(self, - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None): - model = keras.models.Sequential() - model.add(keras.layers.Dense(NUM_CLASSES, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - input_shape=(DATA_DIM,))) - return model - - def regularizer_fn_tensor(x): - return tf.constant(0.) - - def regularizer_fn_scalar(x): - return 0. - - class RegularizerTensor(regularizers.Regularizer): - def __call__(self, x): - return tf.constant(0.) - - class RegularizerScalar(regularizers.Regularizer): - def __call__(self, x): - return 0. 
- - def get_data(self): - (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( - train_samples=10, - test_samples=10, - input_shape=(DATA_DIM,), - num_classes=NUM_CLASSES) - y_train = np_utils.to_categorical(y_train, NUM_CLASSES) - y_test = np_utils.to_categorical(y_test, NUM_CLASSES) - return (x_train, y_train), (x_test, y_test) - - def create_multi_input_model_from(self, layer1, layer2): - input_1 = keras.layers.Input(shape=(DATA_DIM,)) - input_2 = keras.layers.Input(shape=(DATA_DIM,)) - out1 = layer1(input_1) - out2 = layer2(input_2) - out = keras.layers.Average()([out1, out2]) - model = keras.models.Model([input_1, input_2], out) - model.add_loss(keras.backend.mean(out2)) - model.add_loss(tf.reduce_sum(input_1)) - return model - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - ('l1', regularizers.l1()), - ('l2', regularizers.l2()), - ('l1_l2', regularizers.l1_l2()), - ('l2_zero', keras.regularizers.l2(0.)), - ('function_tensor', regularizer_fn_tensor), - ('function_scalar', regularizer_fn_scalar), - ('lambda_tensor', lambda x: tf.constant(0.)), - ('lambda_scalar', lambda x: 0.), - ('regularizer_base_class', regularizers.Regularizer()), - ('regularizer_custom_class_tensor', RegularizerTensor()), - ('regularizer_custom_class_scalar', RegularizerScalar()), - ]) - def test_kernel_regularization(self, regularizer): - (x_train, y_train), _ = self.get_data() - model = self.create_model(kernel_regularizer=regularizer) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.assertEqual(len(model.losses), 1) - model.fit(x_train, y_train, batch_size=10, epochs=1, verbose=0) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - ('l1', regularizers.l1()), - ('l2', regularizers.l2()), - ('l1_l2', regularizers.l1_l2()), - ('l2_zero', keras.regularizers.l2(0.)), - ('function_tensor', regularizer_fn_tensor), - ('function_scalar', regularizer_fn_scalar), - ('lambda_tensor', lambda x: tf.constant(0.)), - ('lambda_scalar', lambda x: 0.), - ('regularizer_base_class', regularizers.Regularizer()), - ('regularizer_custom_class_tensor', RegularizerTensor()), - ('regularizer_custom_class_scalar', RegularizerScalar()), - ]) - def test_bias_regularization(self, regularizer): - (x_train, y_train), _ = self.get_data() - model = self.create_model(bias_regularizer=regularizer) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.assertEqual(len(model.losses), 1) - model.fit(x_train, y_train, batch_size=10, epochs=1, verbose=0) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - ('l1', regularizers.l1()), - ('l2', regularizers.l2()), - ('l1_l2', regularizers.l1_l2()), - ('l2_zero', keras.regularizers.l2(0.)), - ('function_tensor', regularizer_fn_tensor), - ('function_scalar', regularizer_fn_scalar), - ('lambda_tensor', lambda x: tf.constant(0.)), - ('lambda_scalar', lambda x: 0.), - ('regularizer_base_class', regularizers.Regularizer()), - ('regularizer_custom_class_tensor', RegularizerTensor()), - ('regularizer_custom_class_scalar', RegularizerScalar()), - ]) - def test_activity_regularization(self, regularizer): - (x_train, y_train), _ = self.get_data() - model = self.create_model(activity_regularizer=regularizer) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.assertEqual(len(model.losses), 1 if 
tf.executing_eagerly() else 1) - model.fit(x_train, y_train, batch_size=10, epochs=1, verbose=0) - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_zero_regularization(self): - # Verifies that training with zero regularization works. - x, y = np.ones((10, 10)), np.ones((10, 3)) - model = test_utils.get_model_from_layers( - [keras.layers.Dense(3, kernel_regularizer=keras.regularizers.l2(0))], - input_shape=(10,)) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x, y, batch_size=5, epochs=1) - - def test_custom_regularizer_saving(self): - - def my_regularizer(weights): - return tf.reduce_sum(tf.abs(weights)) - - inputs = keras.Input((10,)) - outputs = keras.layers.Dense(1, kernel_regularizer=my_regularizer)(inputs) - model = keras.Model(inputs, outputs) - model2 = model.from_config( - model.get_config(), custom_objects={'my_regularizer': my_regularizer}) - self.assertEqual(model2.layers[1].kernel_regularizer, my_regularizer) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - ('l1', regularizers.l1()), - ('l2', regularizers.l2()), - ('l1_l2', regularizers.l1_l2()), - ]) - def test_regularization_shared_layer(self, regularizer): - dense_layer = keras.layers.Dense( - NUM_CLASSES, - kernel_regularizer=regularizer, - activity_regularizer=regularizer) - model = self.create_multi_input_model_from(dense_layer, dense_layer) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.assertLen(model.losses, 5) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - ('l1', regularizers.l1()), - ('l2', regularizers.l2()), - ('l1_l2', regularizers.l1_l2()), - ]) - def test_regularization_shared_model(self, regularizer): - dense_layer = keras.layers.Dense( - NUM_CLASSES, - kernel_regularizer=regularizer, - activity_regularizer=regularizer) - - input_tensor = keras.layers.Input(shape=(DATA_DIM,)) - dummy_model = keras.models.Model(input_tensor, dense_layer(input_tensor)) - - model = self.create_multi_input_model_from(dummy_model, dummy_model) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - self.assertLen(model.losses, 6) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters([ - ('l1', regularizers.l1()), - ('l2', regularizers.l2()), - ('l1_l2', regularizers.l1_l2()), - ]) - def test_regularization_shared_layer_in_different_models(self, regularizer): - shared_dense = keras.layers.Dense( - NUM_CLASSES, - kernel_regularizer=regularizer, - activity_regularizer=regularizer) - models = [] - for _ in range(2): - input_tensor = keras.layers.Input(shape=(DATA_DIM,)) - unshared_dense = keras.layers.Dense( - NUM_CLASSES, kernel_regularizer=regularizer) - out = unshared_dense(shared_dense(input_tensor)) - models.append(keras.models.Model(input_tensor, out)) - - model = self.create_multi_input_model_from( - layer1=models[0], layer2=models[1]) - model.compile( - loss='categorical_crossentropy', - optimizer='sgd', - run_eagerly=test_utils.should_run_eagerly()) - - # We expect to see 9 losses on the model: - # - 2 from the 2 add_loss calls on the outer model. - # - 3 from the weight regularizers on the shared_dense layer, unshared_dense - # in inner model 1, unshared_dense in inner model 2. - # - 4 from activity regularizers on the shared_dense layer. 
- self.assertLen(model.losses, 9) - - def test_deserialization_error(self): - with self.assertRaisesRegex(ValueError, 'Could not interpret regularizer'): - keras.regularizers.get(0) - - @parameterized.named_parameters([ - ('l1', regularizers.l1(l1=None), 0.01), - ('l2', regularizers.l2(l2=None), 0.01), - ('l1_l2', regularizers.l1_l2(l1=None, l2=None), 0.), - ]) - def test_default_value_when_init_with_none(self, regularizer, expected_value): - expected_value = np.asarray(expected_value) - if hasattr(regularizer, 'l1'): - self.assertAllClose(regularizer.l1, expected_value) - if hasattr(regularizer, 'l2'): - self.assertAllClose(regularizer.l2, expected_value) - - @test_utils.run_v2_only - def test_orthogonal_regularizer(self): - # Test correctness. - factor = 0.1 - reg_rows = regularizers.OrthogonalRegularizer(factor=factor, mode='rows') - reg_cols = regularizers.OrthogonalRegularizer(factor=factor, mode='columns') - - # Test with square matrix - inputs = tf.constant([[1, 1, 1, 1], - [2, 0, 0, 0], - [0, 0, 3, 1]], dtype='float32') - normalized_rows = tf.math.l2_normalize(inputs, axis=1) - normalized_cols = tf.math.l2_normalize(inputs, axis=0) - rows_pairs = [ - tf.reduce_sum(normalized_rows[0] * normalized_rows[1]), - tf.reduce_sum(normalized_rows[0] * normalized_rows[2]), - tf.reduce_sum(normalized_rows[1] * normalized_rows[2]), - ] - col_pairs = [ - tf.reduce_sum(normalized_cols[:, 0] * normalized_cols[:, 1]), - tf.reduce_sum(normalized_cols[:, 0] * normalized_cols[:, 2]), - tf.reduce_sum(normalized_cols[:, 0] * normalized_cols[:, 3]), - tf.reduce_sum(normalized_cols[:, 1] * normalized_cols[:, 2]), - tf.reduce_sum(normalized_cols[:, 1] * normalized_cols[:, 3]), - tf.reduce_sum(normalized_cols[:, 2] * normalized_cols[:, 3]), - ] - num_row_pairs = 3 - num_col_pairs = 6 - # Expected: factor * sum(pairwise_dot_products_of_rows) / num_row_pairs - self.assertAllClose(reg_rows(inputs), - factor * sum(rows_pairs) / num_row_pairs) - # Expected: factor * sum(pairwise_dot_products_of_columns) / num_col_pairs - self.assertAllClose(reg_cols(inputs), - factor * sum(col_pairs) / num_col_pairs) - - # Test incorrect usage. - with self.assertRaisesRegex(ValueError, 'must have rank 2'): - reg_rows(tf.constant([1, 1], dtype='float32')) - - # Test serialization - self.assertDictEqual(reg_cols.get_config(), - {'factor': factor, 'mode': 'columns'}) - - # Test usage in model. - model_inputs = keras.Input((3,)) - model_outputs = keras.layers.Dense( - 4, kernel_regularizer=reg_rows)(model_inputs) - model = keras.Model(model_inputs, model_outputs) - model.compile(optimizer='rmsprop', loss='mse') - model.fit(np.random.random((16, 3)), np.random.random((16, 4)), epochs=1) - - # Test serialization and deserialiation as part of model. 
- inputs = tf.constant([[1, 1, 1], - [2, 0, 0], - [0, 0, 3]], dtype='float32') - outputs = model(inputs) - config = model.get_config() - weights = model.get_weights() - model = keras.Model.from_config(config) - model.set_weights(weights) - self.assertAllClose(model(inputs), outputs, atol=1e-5) - - -if __name__ == '__main__': - tf.test.main() +class KerasRegularizersTest(test_combinations.TestCase, parameterized.TestCase): + def create_model( + self, + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + ): + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + NUM_CLASSES, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + input_shape=(DATA_DIM,), + ) + ) + return model + + def regularizer_fn_tensor(x): + return tf.constant(0.0) + + def regularizer_fn_scalar(x): + return 0.0 + + class RegularizerTensor(regularizers.Regularizer): + def __call__(self, x): + return tf.constant(0.0) + + class RegularizerScalar(regularizers.Regularizer): + def __call__(self, x): + return 0.0 + + def get_data(self): + (x_train, y_train), (x_test, y_test) = test_utils.get_test_data( + train_samples=10, + test_samples=10, + input_shape=(DATA_DIM,), + num_classes=NUM_CLASSES, + ) + y_train = np_utils.to_categorical(y_train, NUM_CLASSES) + y_test = np_utils.to_categorical(y_test, NUM_CLASSES) + return (x_train, y_train), (x_test, y_test) + + def create_multi_input_model_from(self, layer1, layer2): + input_1 = keras.layers.Input(shape=(DATA_DIM,)) + input_2 = keras.layers.Input(shape=(DATA_DIM,)) + out1 = layer1(input_1) + out2 = layer2(input_2) + out = keras.layers.Average()([out1, out2]) + model = keras.models.Model([input_1, input_2], out) + model.add_loss(keras.backend.mean(out2)) + model.add_loss(tf.reduce_sum(input_1)) + return model + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + ("l1", regularizers.l1()), + ("l2", regularizers.l2()), + ("l1_l2", regularizers.l1_l2()), + ("l2_zero", keras.regularizers.l2(0.0)), + ("function_tensor", regularizer_fn_tensor), + ("function_scalar", regularizer_fn_scalar), + ("lambda_tensor", lambda x: tf.constant(0.0)), + ("lambda_scalar", lambda x: 0.0), + ("regularizer_base_class", regularizers.Regularizer()), + ("regularizer_custom_class_tensor", RegularizerTensor()), + ("regularizer_custom_class_scalar", RegularizerScalar()), + ] + ) + def test_kernel_regularization(self, regularizer): + (x_train, y_train), _ = self.get_data() + model = self.create_model(kernel_regularizer=regularizer) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertEqual(len(model.losses), 1) + model.fit(x_train, y_train, batch_size=10, epochs=1, verbose=0) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + ("l1", regularizers.l1()), + ("l2", regularizers.l2()), + ("l1_l2", regularizers.l1_l2()), + ("l2_zero", keras.regularizers.l2(0.0)), + ("function_tensor", regularizer_fn_tensor), + ("function_scalar", regularizer_fn_scalar), + ("lambda_tensor", lambda x: tf.constant(0.0)), + ("lambda_scalar", lambda x: 0.0), + ("regularizer_base_class", regularizers.Regularizer()), + ("regularizer_custom_class_tensor", RegularizerTensor()), + ("regularizer_custom_class_scalar", RegularizerScalar()), + ] + ) + def test_bias_regularization(self, regularizer): + (x_train, y_train), _ = self.get_data() + model = 
self.create_model(bias_regularizer=regularizer) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertEqual(len(model.losses), 1) + model.fit(x_train, y_train, batch_size=10, epochs=1, verbose=0) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + ("l1", regularizers.l1()), + ("l2", regularizers.l2()), + ("l1_l2", regularizers.l1_l2()), + ("l2_zero", keras.regularizers.l2(0.0)), + ("function_tensor", regularizer_fn_tensor), + ("function_scalar", regularizer_fn_scalar), + ("lambda_tensor", lambda x: tf.constant(0.0)), + ("lambda_scalar", lambda x: 0.0), + ("regularizer_base_class", regularizers.Regularizer()), + ("regularizer_custom_class_tensor", RegularizerTensor()), + ("regularizer_custom_class_scalar", RegularizerScalar()), + ] + ) + def test_activity_regularization(self, regularizer): + (x_train, y_train), _ = self.get_data() + model = self.create_model(activity_regularizer=regularizer) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertEqual(len(model.losses), 1 if tf.executing_eagerly() else 1) + model.fit(x_train, y_train, batch_size=10, epochs=1, verbose=0) + + @test_combinations.run_all_keras_modes + @test_combinations.run_with_all_model_types + def test_zero_regularization(self): + # Verifies that training with zero regularization works. + x, y = np.ones((10, 10)), np.ones((10, 3)) + model = test_utils.get_model_from_layers( + [ + keras.layers.Dense( + 3, kernel_regularizer=keras.regularizers.l2(0) + ) + ], + input_shape=(10,), + ) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(x, y, batch_size=5, epochs=1) + + def test_custom_regularizer_saving(self): + def my_regularizer(weights): + return tf.reduce_sum(tf.abs(weights)) + + inputs = keras.Input((10,)) + outputs = keras.layers.Dense(1, kernel_regularizer=my_regularizer)( + inputs + ) + model = keras.Model(inputs, outputs) + model2 = model.from_config( + model.get_config(), + custom_objects={"my_regularizer": my_regularizer}, + ) + self.assertEqual(model2.layers[1].kernel_regularizer, my_regularizer) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + ("l1", regularizers.l1()), + ("l2", regularizers.l2()), + ("l1_l2", regularizers.l1_l2()), + ] + ) + def test_regularization_shared_layer(self, regularizer): + dense_layer = keras.layers.Dense( + NUM_CLASSES, + kernel_regularizer=regularizer, + activity_regularizer=regularizer, + ) + model = self.create_multi_input_model_from(dense_layer, dense_layer) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertLen(model.losses, 5) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + ("l1", regularizers.l1()), + ("l2", regularizers.l2()), + ("l1_l2", regularizers.l1_l2()), + ] + ) + def test_regularization_shared_model(self, regularizer): + dense_layer = keras.layers.Dense( + NUM_CLASSES, + kernel_regularizer=regularizer, + activity_regularizer=regularizer, + ) + + input_tensor = keras.layers.Input(shape=(DATA_DIM,)) + dummy_model = keras.models.Model( + input_tensor, dense_layer(input_tensor) + ) + + model = self.create_multi_input_model_from(dummy_model, dummy_model) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + 
self.assertLen(model.losses, 6) + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + [ + ("l1", regularizers.l1()), + ("l2", regularizers.l2()), + ("l1_l2", regularizers.l1_l2()), + ] + ) + def test_regularization_shared_layer_in_different_models(self, regularizer): + shared_dense = keras.layers.Dense( + NUM_CLASSES, + kernel_regularizer=regularizer, + activity_regularizer=regularizer, + ) + models = [] + for _ in range(2): + input_tensor = keras.layers.Input(shape=(DATA_DIM,)) + unshared_dense = keras.layers.Dense( + NUM_CLASSES, kernel_regularizer=regularizer + ) + out = unshared_dense(shared_dense(input_tensor)) + models.append(keras.models.Model(input_tensor, out)) + + model = self.create_multi_input_model_from( + layer1=models[0], layer2=models[1] + ) + model.compile( + loss="categorical_crossentropy", + optimizer="sgd", + run_eagerly=test_utils.should_run_eagerly(), + ) + + # We expect to see 9 losses on the model: + # - 2 from the 2 add_loss calls on the outer model. + # - 3 from the weight regularizers on the shared_dense layer, + # unshared_dense in inner model 1, unshared_dense in inner model 2. + # - 4 from activity regularizers on the shared_dense layer. + self.assertLen(model.losses, 9) + + def test_deserialization_error(self): + with self.assertRaisesRegex( + ValueError, "Could not interpret regularizer" + ): + keras.regularizers.get(0) + + @parameterized.named_parameters( + [ + ("l1", regularizers.l1(l1=None), 0.01), + ("l2", regularizers.l2(l2=None), 0.01), + ("l1_l2", regularizers.l1_l2(l1=None, l2=None), 0.0), + ] + ) + def test_default_value_when_init_with_none( + self, regularizer, expected_value + ): + expected_value = np.asarray(expected_value) + if hasattr(regularizer, "l1"): + self.assertAllClose(regularizer.l1, expected_value) + if hasattr(regularizer, "l2"): + self.assertAllClose(regularizer.l2, expected_value) + + @test_utils.run_v2_only + def test_orthogonal_regularizer(self): + # Test correctness. + factor = 0.1 + reg_rows = regularizers.OrthogonalRegularizer( + factor=factor, mode="rows" + ) + reg_cols = regularizers.OrthogonalRegularizer( + factor=factor, mode="columns" + ) + + # Test with square matrix + inputs = tf.constant( + [[1, 1, 1, 1], [2, 0, 0, 0], [0, 0, 3, 1]], dtype="float32" + ) + normalized_rows = tf.math.l2_normalize(inputs, axis=1) + normalized_cols = tf.math.l2_normalize(inputs, axis=0) + rows_pairs = [ + tf.reduce_sum(normalized_rows[0] * normalized_rows[1]), + tf.reduce_sum(normalized_rows[0] * normalized_rows[2]), + tf.reduce_sum(normalized_rows[1] * normalized_rows[2]), + ] + col_pairs = [ + tf.reduce_sum(normalized_cols[:, 0] * normalized_cols[:, 1]), + tf.reduce_sum(normalized_cols[:, 0] * normalized_cols[:, 2]), + tf.reduce_sum(normalized_cols[:, 0] * normalized_cols[:, 3]), + tf.reduce_sum(normalized_cols[:, 1] * normalized_cols[:, 2]), + tf.reduce_sum(normalized_cols[:, 1] * normalized_cols[:, 3]), + tf.reduce_sum(normalized_cols[:, 2] * normalized_cols[:, 3]), + ] + num_row_pairs = 3 + num_col_pairs = 6 + # Expected: factor * sum(pairwise_dot_products_of_rows) / num_row_pairs + self.assertAllClose( + reg_rows(inputs), factor * sum(rows_pairs) / num_row_pairs + ) + # Expected: factor * sum(pairwise_dot_products_of_columns) / + # num_col_pairs + self.assertAllClose( + reg_cols(inputs), factor * sum(col_pairs) / num_col_pairs + ) + + # Test incorrect usage. 
+        with self.assertRaisesRegex(ValueError, "must have rank 2"):
+            reg_rows(tf.constant([1, 1], dtype="float32"))
+
+        # Test serialization
+        self.assertDictEqual(
+            reg_cols.get_config(), {"factor": factor, "mode": "columns"}
+        )
+
+        # Test usage in model.
+        model_inputs = keras.Input((3,))
+        model_outputs = keras.layers.Dense(4, kernel_regularizer=reg_rows)(
+            model_inputs
+        )
+        model = keras.Model(model_inputs, model_outputs)
+        model.compile(optimizer="rmsprop", loss="mse")
+        model.fit(
+            np.random.random((16, 3)), np.random.random((16, 4)), epochs=1
+        )
+
+        # Test serialization and deserialization as part of model.
+        inputs = tf.constant([[1, 1, 1], [2, 0, 0], [0, 0, 3]], dtype="float32")
+        outputs = model(inputs)
+        config = model.get_config()
+        weights = model.get_weights()
+        model = keras.Model.from_config(config)
+        model.set_weights(weights)
+        self.assertAllClose(model(inputs), outputs, atol=1e-5)
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/saving/BUILD b/keras/saving/BUILD
index 826069278d0b..ab4a8830fd69 100644
--- a/keras/saving/BUILD
+++ b/keras/saving/BUILD
@@ -1,9 +1,11 @@
 # Description:
 #   Contains the Keras save model API (internal TensorFlow version).

+# Placeholder: load unaliased py_library
 load("@org_keras//keras:keras.bzl", "tf_py_test")

 package(
+    # copybara:uncomment default_applicable_licenses = ["//keras:license"],
     # TODO(scottzhu): Remove non-keras deps from TF.
     default_visibility = [
         "//keras:friends",
@@ -16,15 +18,18 @@ py_library(
     name = "saving",
     srcs = [
         "__init__.py",
-        "hdf5_format.py",
-        "model_config.py",
+        "legacy/hdf5_format.py",
+        "legacy/model_config.py",
+        "legacy/save.py",
+        "legacy/saving_utils.py",
         "pickle_utils.py",
-        "save.py",
-        "saved_model_experimental.py",
-        "saving_utils.py",
+        "saving_api.py",
     ],
     srcs_version = "PY3",
     deps = [
+        ":object_registration",
+        ":serialization",
+        ":serialization_lib",
         "//:expect_h5py_installed",
         "//:expect_tensorflow_installed",
         "//:expect_yaml_installed",
@@ -35,18 +40,94 @@ py_library(
         "//keras/mixed_precision:autocast_variable",
         "//keras/optimizers",
         "//keras/protobuf:saved_metadata_proto_py_pb2",
-        "//keras/saving/saved_model",
-        "//keras/saving/utils_v1",
+        "//keras/saving/legacy/saved_model",
         "//keras/utils:engine_utils",
         "//keras/utils:metrics_utils",
         "//keras/utils:mode_keys",
     ],
 )

+py_library(
+    name = "saving_lib",
+    srcs = [
+        "saving_lib.py",
+    ],
+    srcs_version = "PY3",
+    deps = [
+        ":serialization_lib",
+        "//:expect_tensorflow_installed",
+        "//keras/utils:generic_utils",
+        "//keras/utils:io_utils",
+    ],
+)
+
+tf_py_test(
+    name = "saving_lib_test",
+    size = "medium",
+    srcs = ["saving_lib_test.py"],
+    python_version = "PY3",
+    deps = [
+        "//:expect_absl_installed",
+        "//:expect_tensorflow_installed",
+        "//keras",
+        "//keras/testing_infra:test_combinations",
+        "//keras/utils:generic_utils",
+    ],
+)
+
+py_library(
+    name = "object_registration",
+    srcs = [
+        "object_registration.py",
+    ],
+    srcs_version = "PY3",
+)
+
+py_library(
+    name = "serialization_lib",
+    srcs = [
+        "serialization_lib.py",
+    ],
+    srcs_version = "PY3",
+    deps = [
+        ":object_registration",
+        ":serialization",
+        "//:expect_numpy_installed",
+        "//:expect_tensorflow_installed",
+        "//keras/saving/legacy/saved_model:utils",
+    ],
+)
+
+py_library(
+    name = "serialization",
+    srcs = [
+        "legacy/serialization.py",
+    ],
+    srcs_version = "PY3",
+    deps = [
+        ":object_registration",
+        "//:expect_numpy_installed",
+        "//:expect_tensorflow_installed",
+        "//keras/utils:tf_contextlib",
+        "//keras/utils:tf_inspect",
+    ],
+)
+
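The new `serialization`, `serialization_lib`, and `object_registration` targets above back the dual-path dispatch added to `keras.regularizers.serialize()`/`deserialize()` earlier in this diff: configs produced by the new `serialization_lib` carry a `"module"` key, and `regularizers.get()` treats its absence as the signal to fall back to legacy deserialization. A sketch of that heuristic (the config dicts are illustrative, and this assumes a build that includes these changes):

```python
from keras import regularizers

# Legacy-format config: no "module" key, so `get()` routes it through
# legacy_serialization.deserialize_keras_object.
legacy_config = {"class_name": "L2", "config": {"l2": 0.01}}

# New-format config: "module" is present, so the serialization_lib path
# is used instead.
new_config = {
    "module": "keras.regularizers",
    "class_name": "L2",
    "config": {"l2": 0.01},
    "registered_name": None,
}

# Both should resolve to equivalent L2 instances.
print(regularizers.get(legacy_config).get_config())  # {'l2': 0.01}
print(regularizers.get(new_config).get_config())     # {'l2': 0.01}
```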
+tf_py_test( + name = "object_registration_test", + size = "small", + srcs = ["object_registration_test.py"], + python_version = "PY3", + deps = [ + "//:expect_tensorflow_installed", + "//keras", + ], +) + tf_py_test( name = "metrics_serialization_test", size = "medium", - srcs = ["metrics_serialization_test.py"], + srcs = ["legacy/metrics_serialization_test.py"], python_version = "PY3", shard_count = 8, tags = [ @@ -64,7 +145,7 @@ tf_py_test( tf_py_test( name = "losses_serialization_test", size = "medium", - srcs = ["losses_serialization_test.py"], + srcs = ["legacy/losses_serialization_test.py"], python_version = "PY3", shard_count = 4, deps = [ @@ -94,7 +175,7 @@ tf_py_test( tf_py_test( name = "save_weights_test", size = "medium", - srcs = ["save_weights_test.py"], + srcs = ["legacy/save_weights_test.py"], python_version = "PY3", shard_count = 4, tags = [ @@ -114,7 +195,7 @@ tf_py_test( tf_py_test( name = "save_test", size = "medium", - srcs = ["save_test.py"], + srcs = ["legacy/save_test.py"], python_version = "PY3", shard_count = 4, tags = [ @@ -130,34 +211,30 @@ tf_py_test( ) tf_py_test( - name = "saved_model_experimental_test", + name = "saving_utils_test", size = "medium", - srcs = ["saved_model_experimental_test.py"], + srcs = ["legacy/saving_utils_test.py"], python_version = "PY3", - shard_count = 4, - tags = [ - "no_oss", # TODO(b/119349471): Re-enable - "no_windows", - ], + tags = ["notsan"], deps = [ "//:expect_absl_installed", "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras", + "//keras/testing_infra:test_combinations", ], ) tf_py_test( - name = "saving_utils_test", - size = "medium", - srcs = ["saving_utils_test.py"], + name = "serialization_lib_test", + size = "small", + srcs = ["serialization_lib_test.py"], python_version = "PY3", - tags = ["notsan"], deps = [ "//:expect_absl_installed", - "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras", + "//keras/saving:serialization", "//keras/testing_infra:test_combinations", ], ) diff --git a/keras/saving/experimental/saving_lib.py b/keras/saving/experimental/saving_lib.py deleted file mode 100644 index 7ccc0c8c9799..000000000000 --- a/keras/saving/experimental/saving_lib.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Keras python-based idempotent saving functions (experimental).""" -import importlib -import json -import os -import types -from keras.saving.saved_model import json_utils -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf -from tensorflow.python.util import tf_export - -_CONFIG_FILE = 'config.keras' - -# A temporary flag to enable the new idempotent saving framework. 
-_ENABLED = False - - -def load(dirpath): - """Load a saved python model.""" - file_path = os.path.join(dirpath, _CONFIG_FILE) - with tf.io.gfile.GFile(file_path, 'r') as f: - config_json = f.read() - config_dict = json_utils.decode(config_json) - return deserialize_keras_object(config_dict) - - -def save(model, dirpath): - """Save a saved python model.""" - if not tf.io.gfile.exists(dirpath): - tf.io.gfile.mkdir(dirpath) - file_path = os.path.join(dirpath, _CONFIG_FILE) - - # TODO(rchao): Save the model's metadata (e.g. Keras version) in a separate - # file in the archive. - # TODO(rchao): Save the model's state (e.g. layer weights/vocab) in a separate - # set of files in the archive. - # TODO(rchao): Write the config into a file in an archive. In this prototype - # we're temporarily settled on a standalone json file. - serialized_model_dict = serialize_keras_object(model) - config_json = json.dumps(serialized_model_dict, cls=json_utils.Encoder) - with tf.io.gfile.GFile(file_path, 'w') as f: - f.write(config_json) - - -# TODO(rchao): Replace the current Keras' `deserialize_keras_object` with this -# (as well as the reciprocal function). -def deserialize_keras_object(config_dict): - """Retrieve the object by deserializing the config dict. - - The config dict is a python dictionary that consists of a set of key-value - pairs, and represents a Keras object, such as an `Optimizer`, `Layer`, - `Metrics`, etc. The saving and loading library uses the following keys to - record information of a Keras object: - - - `class_name`: String. For classes that have an exported Keras namespace, - this is the full path that starts with "keras", such as - "keras.optimizers.Adam". For classes that do not have an exported Keras - namespace, this is the name of the class, as exactly defined in the source - code, such as "LossesContainer". - - `config`: Dict. Library-defined or user-defined key-value pairs that store - the configuration of the object, as obtained by `object.get_config()`. - - `module`: String. The path of the python module, such as - "keras.engine.compile_utils". Built-in Keras classes - expect to have prefix `keras`. For classes that have an exported Keras - namespace, this is `None` since the class can be fully identified by the - full Keras path. - - `registered_name`: String. The key the class is registered under via - `keras.utils.register_keras_serializable(package, name)` API. The key has - the format of '{package}>{name}', where `package` and `name` are the - arguments passed to `register_keras_serializable()`. If `name` is not - provided, it defaults to the class name. If `registered_name` successfully - resolves to a class (that was registered), `class_name` and `config` values - in the dict will not be used. `registered_name` is only used for - non-built-in classes. - - For example, the following dictionary represents the built-in Adam optimizer - with the relevant config. Note that for built-in (exported symbols that have - an exported Keras namespace) classes, the library tracks the class by the - the import location of the built-in object in the Keras namespace, e.g. 
- `"keras.optimizers.Adam"`, and this information is stored in `class_name`: - - ``` - dict_structure = { - "class_name": "keras.optimizers.Adam", - "config": { - "amsgrad": false, - "beta_1": 0.8999999761581421, - "beta_2": 0.9990000128746033, - "decay": 0.0, - "epsilon": 1e-07, - "learning_rate": 0.0010000000474974513, - "name": "Adam" - }, - "module": null, - "registered_name": "Adam" - } - # Returns an `Adam` instance identical to the original one. - deserialize_keras_object(dict_structure) - ``` - - If the class does not have an exported Keras namespace, the library tracks it - by its `module` and `class_name`. For example: - - ``` - dict_structure = { - "class_name": "LossesContainer", - "config": { - "losses": [...], - "total_loss_mean": {...}, - }, - "module": "keras.engine.compile_utils", - "registered_name": "LossesContainer" - } - - # Returns a `LossesContainer` instance identical to the original one. - deserialize_keras_object(dict_structure) - ``` - - And the following dictionary represents a user-customized `MeanSquaredError` - loss: - - ``` - @keras.utils.generic_utils.register_keras_serializable(package='my_package') - class ModifiedMeanSquaredError(keras.losses.MeanSquaredError): - ... - - dict_structure = { - "class_name": "ModifiedMeanSquaredError", - "config": { - "fn": "mean_squared_error", - "name": "mean_squared_error", - "reduction": "auto" - }, - "registered_name": "my_package>ModifiedMeanSquaredError" - } - # Gives `ModifiedMeanSquaredError` object - deserialize_keras_object(dict_structure) - ``` - - Args: - config_dict: the python dict structure to deserialize the Keras object from. - - Returns: - The Keras object that is deserialized from `config_dict`. - - """ - # TODO(rchao): Design a 'version' key for `config_dict` for defining versions - # for classes. - class_name = config_dict['class_name'] - config = config_dict['config'] - module = config_dict['module'] - registered_name = config_dict['registered_name'] - - # Strings and functions will have `builtins` as its module. - if module == 'builtins': - if class_name == 'str': - if not isinstance(config, str): - raise TypeError('Config of string is supposed to be a string. ' - f'Received: {config}.') - return config - - elif class_name == 'function': - custom_function = generic_utils.get_custom_objects_by_name( - registered_name) - if custom_function is not None: - # If there is a custom function registered (via - # `register_keras_serializable` API), that takes precedence. - return custom_function - - # Otherwise, attempt to import the tracked module, and find the function. - function_module = config.get('module', None) - try: - function_module = importlib.import_module(function_module) - except ImportError as e: - raise ImportError( - f'The function module {function_module} is not available. The ' - f'config dictionary provided is {config_dict}.') from e - return vars(function_module).get(config['function_name']) - - raise TypeError(f'Unrecognized type: {class_name}') - - custom_class = generic_utils.get_custom_objects_by_name(registered_name) - if custom_class is not None: - # For others (classes), see if there is a custom class registered (via - # `register_keras_serializable` API). If so, that takes precedence. - return custom_class.from_config(config) - else: - # Otherwise, attempt to retrieve the class object given the `module`, and - # `class_name`. 
- if module is None: - # In the case where `module` is not recorded, the `class_name` represents - # the full exported Keras namespace (used by `keras_export`) such as - # "keras.optimizers.Adam". - cls = tf_export.get_symbol_from_name(class_name) - else: - # In the case where `module` is available, the class does not have an - # Keras namespace (which is the case when the symbol is not exported via - # `keras_export`). Import the tracked module (that is used for the - # internal path), find the class, and use its config. - mod = importlib.import_module(module) - cls = vars(mod).get(class_name, None) - if not hasattr(cls, 'from_config'): - raise TypeError(f'Unable to reconstruct an instance of {cls}.') - return cls.from_config(config) - - -def serialize_keras_object(obj): - """Retrieve the config dict by serializing the Keras object. - - `serialize_keras_object()` serializes a Keras object to a python dictionary - that represents the object, and is a reciprocal function of - `deserialize_keras_object()`. See `deserialize_keras_object()` for more - information about the config format. - - Args: - obj: the Keras object to serialize. - - Returns: - A python dict that represents the object. The python dict can be - deserialized via `deserialize_keras_object()`. - """ - - # Note that in the case of the `obj` being a function, the module used will be - # "builtins", and the `class_name` used will be "function"; in the case of the - # `obj` being a string, the module used will be "builtins", and the - # `class_name` used will be "str" - module = None - - # This gets the `keras.*` exported name, such as "keras.optimizers.Adam". - class_name = tf_export.get_canonical_name_for_symbol( - obj.__class__, api_name='keras') - if class_name is None: - module = obj.__class__.__module__ - class_name = obj.__class__.__name__ - return { - 'module': module, - 'class_name': class_name, - 'config': _get_object_config(obj), - 'registered_name': _get_object_registered_name(obj) - } - - -def _get_object_registered_name(obj): - if isinstance(obj, types.FunctionType): - return generic_utils.get_registered_name(obj) - else: - return generic_utils.get_registered_name(obj.__class__) - - -def _get_object_config(obj): - """Return the object's config depending on string, function, or others.""" - if isinstance(obj, str): - # Use the content of the string as the config for string. - return obj - elif isinstance(obj, types.FunctionType): - # Keep track of the function's module and name in a dict as the config. - return { - 'module': obj.__module__, - 'function_name': obj.__name__, - } - if not hasattr(obj, 'get_config'): - raise TypeError(f'Unable to recognize the config of {obj}.') - return obj.get_config() diff --git a/keras/saving/experimental/saving_lib_test.py b/keras/saving/experimental/saving_lib_test.py deleted file mode 100644 index 4f289d8d9e8a..000000000000 --- a/keras/saving/experimental/saving_lib_test.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Keras python-based idempotent saving functions (experimental).""" -import os -import sys - -import keras -from keras import backend -from keras.saving.experimental import saving_lib -from keras.saving.saved_model import json_utils -from keras.utils import generic_utils -from keras.utils import io_utils -import numpy as np -import tensorflow.compat.v2 as tf - -train_step_message = 'This is my training step' - - -@keras.utils.generic_utils.register_keras_serializable( - package='my_custom_package') -class MyDense(keras.layers.Dense): - - def two(self): - return 2 - - -@keras.utils.generic_utils.register_keras_serializable( - package='my_custom_package') -class CustomModelX(keras.Model): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.dense1 = MyDense(1) - - def call(self, inputs): - return self.dense1(inputs) - - def train_step(self, data): - tf.print(train_step_message) - x, y = data - with tf.GradientTape() as tape: - y_pred = self(x) - loss = self.compiled_loss(y, y_pred) - - gradients = tape.gradient(loss, self.trainable_variables) - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - return {} - - def one(self): - return 1 - - -@keras.utils.generic_utils.register_keras_serializable( - package='my_custom_package') -def my_mean_squared_error(y_true, y_pred): - """Identical to built-in `mean_squared_error`, added here as a custom func.""" - return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1) - - -module_my_mean_squared_error = my_mean_squared_error - - -class NewSavingTest(tf.test.TestCase): - - def setUp(self): - super().setUp() - saving_lib._ENABLED = True - - def tearDown(self): - super().tearDown() - saving_lib._ENABLED = False - - def _get_subclassed_model(self): - subclassed_model = CustomModelX() - subclassed_model.compile( - optimizer='adam', - loss=[ - 'mse', keras.losses.mean_squared_error, - keras.losses.MeanSquaredError(), my_mean_squared_error - ]) - return subclassed_model - - def test_saving_after_compile_but_before_fit(self): - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - subclassed_model = self._get_subclassed_model() - subclassed_model._save_new(temp_dir) - - # This is so that we can register another function with the same custom - # object key, and make sure the newly registered function is used while - # loading. - del generic_utils._GLOBAL_CUSTOM_OBJECTS[ - 'my_custom_package>my_mean_squared_error'] - - @keras.utils.generic_utils.register_keras_serializable( - package='my_custom_package') - def my_mean_squared_error(y_true, y_pred): # pylint: disable=redefined-outer-name - """Function-local `mean_squared_error`.""" - return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1) - - loaded_model = saving_lib.load(temp_dir) - - # Everything should be the same class or function for the original model - # and the loaded model. 
- for model in [subclassed_model, loaded_model]: - self.assertIs(model.optimizer.__class__, - keras.optimizers.optimizer_v2.adam.Adam) - self.assertIs(model.compiled_loss.__class__, - keras.engine.compile_utils.LossesContainer) - self.assertEqual(model.compiled_loss._losses[0], 'mse') - self.assertIs(model.compiled_loss._losses[1], - keras.losses.mean_squared_error) - self.assertIs(model.compiled_loss._losses[2].__class__, - keras.losses.MeanSquaredError) - self.assertIs(model.compiled_loss._total_loss_mean.__class__, - keras.metrics.base_metric.Mean) - - # Except for a custom function used because the loaded model is supposed to - # be using the newly registered custom function. - self.assertIs(subclassed_model.compiled_loss._losses[3], - module_my_mean_squared_error) - self.assertIs(loaded_model.compiled_loss._losses[3], my_mean_squared_error) - self.assertIsNot(module_my_mean_squared_error, my_mean_squared_error) - - def test_saving_after_fit(self): - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - subclassed_model = self._get_subclassed_model() - - x = np.random.random((100, 32)) - y = np.random.random((100, 1)) - subclassed_model.fit(x, y, epochs=1) - subclassed_model._save_new(temp_dir) - loaded_model = saving_lib.load(temp_dir) - - io_utils.enable_interactive_logging() - # `tf.print` writes to stderr. This is to make sure the custom training step - # is used. - with self.captureWritesToStream(sys.stderr) as printed: - loaded_model.fit(x, y, epochs=1) - self.assertRegex(printed.contents(), train_step_message) - - # Check that the custom classes do get used. - self.assertIsInstance(loaded_model, CustomModelX) - self.assertIsInstance(loaded_model.dense1, MyDense) - # Check that the custom method is available. - self.assertEqual(loaded_model.one(), 1) - self.assertEqual(loaded_model.dense1.two(), 2) - - # Everything should be the same class or function for the original model - # and the loaded model. 
- for model in [subclassed_model, loaded_model]: - self.assertIs(model.optimizer.__class__, - keras.optimizers.optimizer_v2.adam.Adam) - self.assertIs(model.compiled_loss.__class__, - keras.engine.compile_utils.LossesContainer) - self.assertIs(model.compiled_loss._losses[0].__class__, - keras.losses.LossFunctionWrapper) - self.assertIs(model.compiled_loss._losses[1].__class__, - keras.losses.LossFunctionWrapper) - self.assertIs(model.compiled_loss._losses[2].__class__, - keras.losses.MeanSquaredError) - self.assertIs(model.compiled_loss._losses[3].__class__, - keras.losses.LossFunctionWrapper) - self.assertIs(model.compiled_loss._total_loss_mean.__class__, - keras.metrics.base_metric.Mean) - - def test_saving_preserve_unbuilt_state(self): - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - subclassed_model = CustomModelX() - subclassed_model._save_new(temp_dir) - loaded_model = saving_lib.load(temp_dir) - self.assertFalse(subclassed_model.built) - self.assertFalse(loaded_model.built) - - def test_saving_preserve_built_state(self): - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - subclassed_model = self._get_subclassed_model() - x = np.random.random((100, 32)) - y = np.random.random((100, 1)) - subclassed_model.fit(x, y, epochs=1) - subclassed_model._save_new(temp_dir) - loaded_model = saving_lib.load(temp_dir) - self.assertTrue(subclassed_model.built) - self.assertTrue(loaded_model.built) - self.assertEqual(subclassed_model._build_input_shape, - loaded_model._build_input_shape) - self.assertEqual( - tf.TensorShape([None, 32]), loaded_model._build_input_shape) - - def test_saved_module_paths_and_class_names(self): - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - subclassed_model = self._get_subclassed_model() - x = np.random.random((100, 32)) - y = np.random.random((100, 1)) - subclassed_model.fit(x, y, epochs=1) - subclassed_model._save_new(temp_dir) - - file_path = os.path.join(temp_dir, saving_lib._CONFIG_FILE) - with tf.io.gfile.GFile(file_path, 'r') as f: - config_json = f.read() - config_dict = json_utils.decode(config_json) - self.assertEqual(config_dict['registered_name'], - 'my_custom_package>CustomModelX') - self.assertIsNone(config_dict['config']['optimizer']['module']) - self.assertEqual(config_dict['config']['optimizer']['class_name'], - 'keras.optimizers.Adam') - self.assertEqual(config_dict['config']['loss']['module'], - 'keras.engine.compile_utils') - self.assertEqual(config_dict['config']['loss']['class_name'], - 'LossesContainer') - - - def test_functional_model_with_tf_op_lambda_layer(self): - - class ToString: - - def __init__(self): - self.contents = '' - - def __call__(self, msg): - self.contents += msg + '\n' - - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - - inputs = keras.layers.Input(shape=(32,)) - outputs = keras.layers.Dense(1)(inputs) - outputs = outputs + inputs - functional_model = keras.Model(inputs, outputs) - functional_to_string = ToString() - functional_model.summary(print_fn=functional_to_string) - functional_model.compile(optimizer='adam', loss='mse', metrics=['mae']) - - x = np.random.random((1000, 32)) - y = np.random.random((1000, 1)) - functional_model.fit(x, y, epochs=3) - functional_model._save_new(temp_dir) - loaded_model = saving_lib.load(temp_dir) - loaded_to_string = ToString() - loaded_model.summary(print_fn=loaded_to_string) - - self.assertEqual(functional_to_string.contents, loaded_to_string.contents) - - -if __name__ == '__main__': - if tf.__internal__.tf2.enabled(): - tf.test.main() diff 
--git a/keras/saving/hdf5_format.py b/keras/saving/hdf5_format.py deleted file mode 100644 index cb7ef4b36069..000000000000 --- a/keras/saving/hdf5_format.py +++ /dev/null @@ -1,992 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=protected-access -"""Functions for saving and loading a Keras Model from HDF5 format.""" - -import tensorflow.compat.v2 as tf - -import json -import os - -import numpy as np - -from keras import backend -from keras.optimizers import optimizer_v1 -from keras.optimizers.optimizer_experimental import optimizer as optimizer_experimental -from keras.saving import model_config as model_config_lib -from keras.saving import saving_utils -from keras.saving.saved_model import json_utils -from keras.utils.generic_utils import LazyLoader -from keras.utils.io_utils import ask_to_proceed_with_overwrite -from tensorflow.python.platform import tf_logging as logging - - -# pylint: disable=g-import-not-at-top -try: - import h5py - HDF5_OBJECT_HEADER_LIMIT = 64512 -except ImportError: - h5py = None -# pylint: enable=g-import-not-at-top - -# TODO(b/134426265): Switch back to single-quotes to match the rest of the file -# once the issue with copybara is fixed. -# pylint:disable=g-inconsistent-quotes -sequential_lib = LazyLoader( - "sequential_lib", globals(), - "keras.engine.sequential") -# pylint:enable=g-inconsistent-quotes - - -def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True): - """Saves a model to a HDF5 file. - - The saved model contains: - - the model's configuration (topology) - - the model's weights - - the model's optimizer's state (if any) - - Thus the saved model can be reinstantiated in - the exact same state, without any of the code - used for model definition or training. - - Args: - model: Keras model instance to be saved. - filepath: One of the following: - - String, path where to save the model - - `h5py.File` object where to save the model - overwrite: Whether we should overwrite any existing - model at the target location, or instead - ask the user with a manual prompt. - include_optimizer: If True, save optimizer's state together. - - Raises: - ImportError: if h5py is not available. - """ - - if h5py is None: - raise ImportError('`save_model()` using h5 format requires h5py. Could not ' - 'import h5py.') - - # TODO(psv) Add warning when we save models that contain non-serializable - # entities like metrics added using `add_metric` and losses added using - # `add_loss.` - if len(model.weights) != len(model._undeduplicated_weights): - logging.warning('Found duplicated `Variable`s in Model\'s `weights`. ' - 'This is usually caused by `Variable`s being shared by ' - 'Layers in the Model. These `Variable`s will be treated ' - 'as separate `Variable`s when the Model is restored. 
To ' - 'avoid this, please save with `save_format="tf"`.') - - if not isinstance(filepath, h5py.File): - # If file exists and should not be overwritten. - if not overwrite and os.path.isfile(filepath): - proceed = ask_to_proceed_with_overwrite(filepath) - if not proceed: - return - - # Try creating dir if not exist - dirpath = os.path.dirname(filepath) - if not os.path.exists(dirpath): - tf.io.gfile.makedirs(dirpath) - - f = h5py.File(filepath, mode='w') - opened_new_file = True - else: - f = filepath - opened_new_file = False - - try: - model_metadata = saving_utils.model_metadata(model, include_optimizer) - for k, v in model_metadata.items(): - if isinstance(v, (dict, list, tuple)): - f.attrs[k] = json.dumps( - v, default=json_utils.get_json_type).encode('utf8') - else: - f.attrs[k] = v - - model_weights_group = f.create_group('model_weights') - save_weights_to_hdf5_group(model_weights_group, model) - - # TODO(b/128683857): Add integration tests between tf.keras and external - # Keras, to avoid breaking TF.js users. - if isinstance(model.optimizer, optimizer_experimental.Optimizer): - logging.warning('HDF5 format does not save weights of' - ' `optimizer_experimental.Optimizer`, your optimizer will' - ' be recompiled at loading time.') - elif (include_optimizer and model.optimizer and - not isinstance(model.optimizer, optimizer_v1.TFOptimizer)): - save_optimizer_weights_to_hdf5_group(f, model.optimizer) - - f.flush() - finally: - if opened_new_file: - f.close() - - -def load_model_from_hdf5(filepath, custom_objects=None, compile=True): # pylint: disable=redefined-builtin - """Loads a model saved via `save_model_to_hdf5`. - - Args: - filepath: One of the following: - - String, path to the saved model - - `h5py.File` object from which to load the model - custom_objects: Optional dictionary mapping names - (strings) to custom classes or functions to be - considered during deserialization. - compile: Boolean, whether to compile the model - after loading. - - Returns: - A Keras model instance. If an optimizer was found - as part of the saved model, the model is already - compiled. Otherwise, the model is uncompiled and - a warning will be displayed. When `compile` is set - to False, the compilation is omitted without any - warning. - - Raises: - ImportError: if h5py is not available. - ValueError: In case of an invalid savefile. - """ - if h5py is None: - raise ImportError('`load_model()` using h5 format requires h5py. Could not ' - 'import h5py.') - - if not custom_objects: - custom_objects = {} - - opened_new_file = not isinstance(filepath, h5py.File) - if opened_new_file: - f = h5py.File(filepath, mode='r') - else: - f = filepath - - model = None - try: - # instantiate model - model_config = f.attrs.get('model_config') - if model_config is None: - raise ValueError(f'No model config found in the file at {filepath}.') - if hasattr(model_config, 'decode'): - model_config = model_config.decode('utf-8') - model_config = json_utils.decode(model_config) - model = model_config_lib.model_from_config(model_config, - custom_objects=custom_objects) - - # set weights - load_weights_from_hdf5_group(f['model_weights'], model) - - if compile: - # instantiate optimizer - training_config = f.attrs.get('training_config') - if hasattr(training_config, 'decode'): - training_config = training_config.decode('utf-8') - if training_config is None: - logging.warning('No training configuration found in the save file, so ' - 'the model was *not* compiled. 
Compile it manually.') - return model - training_config = json_utils.decode(training_config) - - # Compile model. - model.compile(**saving_utils.compile_args_from_training_config( - training_config, custom_objects), from_serialized=True) - saving_utils.try_build_compiled_arguments(model) - - # Set optimizer weights. - if isinstance(model.optimizer, optimizer_experimental.Optimizer): - logging.warning('Loading model from HDF5 will not restore the ' - 'optimizer\'s weights, since the optimizer is an ' - 'instance of `optimizer_experimental.Optimizer`') - elif 'optimizer_weights' in f: - try: - model.optimizer._create_all_weights(model.trainable_variables) - except (NotImplementedError, AttributeError): - logging.warning( - 'Error when creating the weights of optimizer {}, making it ' - 'impossible to restore the saved optimizer state. As a result, ' - 'your model is starting with a freshly initialized optimizer.') - - optimizer_weight_values = load_optimizer_weights_from_hdf5_group(f) - try: - model.optimizer.set_weights(optimizer_weight_values) - except ValueError: - logging.warning('Error in loading the saved optimizer ' - 'state. As a result, your model is ' - 'starting with a freshly initialized ' - 'optimizer.') - finally: - if opened_new_file: - f.close() - return model - - -def preprocess_weights_for_loading(layer, - weights, - original_keras_version=None, - original_backend=None): - """Preprocess layer weights between different Keras formats. - - Converts layers weights from Keras 1 format to Keras 2 and also weights of - cuDNN layers in Keras 2. - - Args: - layer: Layer instance. - weights: List of weights values (Numpy arrays). - original_keras_version: Keras version for the weights, as a string. - original_backend: Keras backend the weights were trained with, - as a string. - - Returns: - A list of weights values (Numpy arrays). - """ - def convert_nested_bidirectional(weights): - """Converts layers nested in `Bidirectional` wrapper. - - This function uses `preprocess_weights_for_loading()` for converting - layers. - - Args: - weights: List of weights values (Numpy arrays). - - Returns: - A list of weights values (Numpy arrays). - """ - num_weights_per_layer = len(weights) // 2 - forward_weights = preprocess_weights_for_loading( - layer.forward_layer, weights[:num_weights_per_layer], - original_keras_version, original_backend) - backward_weights = preprocess_weights_for_loading( - layer.backward_layer, weights[num_weights_per_layer:], - original_keras_version, original_backend) - return forward_weights + backward_weights - - def convert_nested_time_distributed(weights): - """Converts layers nested in `TimeDistributed` wrapper. - - This function uses `preprocess_weights_for_loading()` for converting nested - layers. - - Args: - weights: List of weights values (Numpy arrays). - - Returns: - A list of weights values (Numpy arrays). - """ - return preprocess_weights_for_loading( - layer.layer, weights, original_keras_version, original_backend) - - def convert_nested_model(weights): - """Converts layers nested in `Model` or `Sequential`. - - This function uses `preprocess_weights_for_loading()` for converting nested - layers. - - Args: - weights: List of weights values (Numpy arrays). - - Returns: - A list of weights values (Numpy arrays). 
- """ - trainable_weights = weights[:len(layer.trainable_weights)] - non_trainable_weights = weights[len(layer.trainable_weights):] - - new_trainable_weights = [] - new_non_trainable_weights = [] - - for sublayer in layer.layers: - num_trainable_weights = len(sublayer.trainable_weights) - num_non_trainable_weights = len(sublayer.non_trainable_weights) - if sublayer.weights: - preprocessed = preprocess_weights_for_loading( - layer=sublayer, - weights=(trainable_weights[:num_trainable_weights] + - non_trainable_weights[:num_non_trainable_weights]), - original_keras_version=original_keras_version, - original_backend=original_backend) - new_trainable_weights.extend(preprocessed[:num_trainable_weights]) - new_non_trainable_weights.extend(preprocessed[num_trainable_weights:]) - - trainable_weights = trainable_weights[num_trainable_weights:] - non_trainable_weights = non_trainable_weights[ - num_non_trainable_weights:] - new_trainable_weights += layer._trainable_weights - new_non_trainable_weights += layer._non_trainable_weights - return new_trainable_weights + new_non_trainable_weights - - # Convert layers nested in Bidirectional/Model/Sequential. - # Both transformation should be ran for both Keras 1->2 conversion - # and for conversion of cuDNN layers. - if layer.__class__.__name__ == 'Bidirectional': - weights = convert_nested_bidirectional(weights) - if layer.__class__.__name__ == 'TimeDistributed': - weights = convert_nested_time_distributed(weights) - elif layer.__class__.__name__ in ['Model', 'Sequential', 'Functional']: - weights = convert_nested_model(weights) - - if original_keras_version == '1': - if layer.__class__.__name__ == 'TimeDistributed': - weights = preprocess_weights_for_loading( - layer.layer, weights, original_keras_version, original_backend) - - if layer.__class__.__name__ == 'Conv1D': - shape = weights[0].shape - # Handle Keras 1.1 format - if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters: - # Legacy shape: - # (filters, input_dim, filter_length, 1) - assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0], - 1) - weights[0] = np.transpose(weights[0], (2, 3, 1, 0)) - weights[0] = weights[0][:, 0, :, :] - - if layer.__class__.__name__ == 'Conv2D': - if layer.data_format == 'channels_first': - # old: (filters, stack_size, kernel_rows, kernel_cols) - # new: (kernel_rows, kernel_cols, stack_size, filters) - weights[0] = np.transpose(weights[0], (2, 3, 1, 0)) - - if layer.__class__.__name__ == 'Conv2DTranspose': - if layer.data_format == 'channels_last': - # old: (kernel_rows, kernel_cols, stack_size, filters) - # new: (kernel_rows, kernel_cols, filters, stack_size) - weights[0] = np.transpose(weights[0], (0, 1, 3, 2)) - if layer.data_format == 'channels_first': - # old: (filters, stack_size, kernel_rows, kernel_cols) - # new: (kernel_rows, kernel_cols, filters, stack_size) - weights[0] = np.transpose(weights[0], (2, 3, 0, 1)) - - if layer.__class__.__name__ == 'Conv3D': - if layer.data_format == 'channels_first': - # old: (filters, stack_size, ...) 
- # new: (..., stack_size, filters) - weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0)) - - if layer.__class__.__name__ == 'GRU': - if len(weights) == 9: - kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1) - recurrent_kernel = np.concatenate( - [weights[1], weights[4], weights[7]], axis=-1) - bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1) - weights = [kernel, recurrent_kernel, bias] - - if layer.__class__.__name__ == 'LSTM': - if len(weights) == 12: - # old: i, c, f, o - # new: i, f, c, o - kernel = np.concatenate( - [weights[0], weights[6], weights[3], weights[9]], axis=-1) - recurrent_kernel = np.concatenate( - [weights[1], weights[7], weights[4], weights[10]], axis=-1) - bias = np.concatenate( - [weights[2], weights[8], weights[5], weights[11]], axis=-1) - weights = [kernel, recurrent_kernel, bias] - - if layer.__class__.__name__ == 'ConvLSTM2D': - if len(weights) == 12: - kernel = np.concatenate( - [weights[0], weights[6], weights[3], weights[9]], axis=-1) - recurrent_kernel = np.concatenate( - [weights[1], weights[7], weights[4], weights[10]], axis=-1) - bias = np.concatenate( - [weights[2], weights[8], weights[5], weights[11]], axis=-1) - if layer.data_format == 'channels_first': - # old: (filters, stack_size, kernel_rows, kernel_cols) - # new: (kernel_rows, kernel_cols, stack_size, filters) - kernel = np.transpose(kernel, (2, 3, 1, 0)) - recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0)) - weights = [kernel, recurrent_kernel, bias] - - conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D'] - if layer.__class__.__name__ in conv_layers: - if backend.int_shape(layer.weights[0]) != weights[0].shape: - weights[0] = np.transpose(weights[0], (3, 2, 0, 1)) - if layer.__class__.__name__ == 'ConvLSTM2D': - weights[1] = np.transpose(weights[1], (3, 2, 0, 1)) - - # convert cuDNN layers - return _convert_rnn_weights(layer, weights) - - -def _convert_rnn_weights(layer, weights): - """Converts weights for RNN layers between native and cuDNN format. - - Input kernels for each gate are transposed and converted between Fortran - and C layout, recurrent kernels are transposed. For LSTM biases are summed/ - split in half, for GRU biases are reshaped. - - Weights can be converted in both directions between `LSTM` and`CuDNNSLTM` - and between `CuDNNGRU` and `GRU(reset_after=True)`. Default `GRU` is not - compatible with `CuDNNGRU`. - - For missing biases in `LSTM`/`GRU` (`use_bias=False`) no conversion is made. - - Args: - layer: Target layer instance. - weights: List of source weights values (input kernels, recurrent kernels, - [biases]) (Numpy arrays). - - Returns: - A list of converted weights values (Numpy arrays). - - Raises: - ValueError: for incompatible GRU layer/weights or incompatible biases - """ - - def transform_kernels(kernels, func, n_gates): - """Transforms kernel for each gate separately using given function. - - Args: - kernels: Stacked array of kernels for individual gates. - func: Function applied to kernel of each gate. - n_gates: Number of gates (4 for LSTM, 3 for GRU). - - Returns: - Stacked array of transformed kernels. - """ - return np.hstack([func(k) for k in np.hsplit(kernels, n_gates)]) - - def transpose_input(from_cudnn): - """Makes a function that transforms input kernels from/to cuDNN format. - - It keeps the shape, but changes between the layout (Fortran/C). 
Eg.: - - ``` - Keras cuDNN - [[0, 1, 2], <---> [[0, 2, 4], - [3, 4, 5]] [1, 3, 5]] - ``` - - It can be passed to `transform_kernels()`. - - Args: - from_cudnn: `True` if source weights are in cuDNN format, `False` if - they're in plain Keras format. - - Returns: - Function that converts input kernel to the other format. - """ - order = 'F' if from_cudnn else 'C' - - def transform(kernel): - return kernel.T.reshape(kernel.shape, order=order) - - return transform - - target_class = layer.__class__.__name__ - - # convert the weights between CuDNNLSTM and LSTM - if target_class in ['LSTM', 'CuDNNLSTM'] and len(weights) == 3: - # determine if we're loading a CuDNNLSTM layer - # from the number of bias weights: - # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4) - # if there's no bias weight in the file, skip this conversion - units = weights[1].shape[0] - bias_shape = weights[2].shape - n_gates = 4 - - if bias_shape == (2 * units * n_gates,): - source = 'CuDNNLSTM' - elif bias_shape == (units * n_gates,): - source = 'LSTM' - else: - raise ValueError('Invalid bias shape: ' + str(bias_shape)) - - def convert_lstm_weights(weights, from_cudnn=True): - """Converts the weights between CuDNNLSTM and LSTM. - - Args: - weights: Original weights. - from_cudnn: Indicates whether original weights are from cuDNN layer. - - Returns: - Updated weights compatible with LSTM. - """ - - # Transpose (and reshape) input and recurrent kernels - kernels = transform_kernels(weights[0], transpose_input(from_cudnn), - n_gates) - recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates) - if from_cudnn: - # merge input and recurrent biases into a single set - biases = np.sum(np.split(weights[2], 2, axis=0), axis=0) - else: - # Split single set of biases evenly to two sets. The way of - # splitting doesn't matter as long as the two sets sum is kept. - biases = np.tile(0.5 * weights[2], 2) - return [kernels, recurrent_kernels, biases] - - if source != target_class: - weights = convert_lstm_weights(weights, from_cudnn=source == 'CuDNNLSTM') - - # convert the weights between CuDNNGRU and GRU(reset_after=True) - if target_class in ['GRU', 'CuDNNGRU'] and len(weights) == 3: - # We can determine the source of the weights from the shape of the bias. - # If there is no bias we skip the conversion since - # CuDNNGRU always has biases. - - units = weights[1].shape[0] - bias_shape = weights[2].shape - n_gates = 3 - - def convert_gru_weights(weights, from_cudnn=True): - """Converts the weights between CuDNNGRU and GRU. - - Args: - weights: Original weights. - from_cudnn: Indicates whether original weights are from cuDNN layer. - - Returns: - Updated weights compatible with GRU. 
- """ - - kernels = transform_kernels(weights[0], transpose_input(from_cudnn), - n_gates) - recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates) - biases = np.array(weights[2]).reshape((2, -1) if from_cudnn else -1) - return [kernels, recurrent_kernels, biases] - - if bias_shape == (2 * units * n_gates,): - source = 'CuDNNGRU' - elif bias_shape == (2, units * n_gates): - source = 'GRU(reset_after=True)' - elif bias_shape == (units * n_gates,): - source = 'GRU(reset_after=False)' - else: - raise ValueError('Invalid bias shape: ' + str(bias_shape)) - - if target_class == 'CuDNNGRU': - target = 'CuDNNGRU' - elif layer.reset_after: - target = 'GRU(reset_after=True)' - else: - target = 'GRU(reset_after=False)' - - # only convert between different types - if source != target: - types = (source, target) - if 'GRU(reset_after=False)' in types: - raise ValueError('%s is not compatible with %s' % types) - if source == 'CuDNNGRU': - weights = convert_gru_weights(weights, from_cudnn=True) - elif source == 'GRU(reset_after=True)': - weights = convert_gru_weights(weights, from_cudnn=False) - - return weights - - -def save_optimizer_weights_to_hdf5_group(hdf5_group, optimizer): - """Saves optimizer weights of a optimizer to a HDF5 group. - - Args: - hdf5_group: HDF5 group. - optimizer: optimizer instance. - """ - - symbolic_weights = getattr(optimizer, 'weights') - if symbolic_weights: - weights_group = hdf5_group.create_group('optimizer_weights') - weight_names = [str(w.name).encode('utf8') for w in symbolic_weights] - save_attributes_to_hdf5_group(weights_group, 'weight_names', weight_names) - weight_values = backend.batch_get_value(symbolic_weights) - for name, val in zip(weight_names, weight_values): - param_dset = weights_group.create_dataset( - name, val.shape, dtype=val.dtype) - if not val.shape: - # scalar - param_dset[()] = val - else: - param_dset[:] = val - - -def load_optimizer_weights_from_hdf5_group(hdf5_group): - """Load optimizer weights from a HDF5 group. - - Args: - hdf5_group: A pointer to a HDF5 group. - - Returns: - data: List of optimizer weight names. - """ - weights_group = hdf5_group['optimizer_weights'] - optimizer_weight_names = load_attributes_from_hdf5_group( - weights_group, 'weight_names') - return [weights_group[weight_name] for weight_name in optimizer_weight_names] - - -def save_subset_weights_to_hdf5_group(f, weights): - """Save top-level weights of a model to a HDF5 group. - - Args: - f: HDF5 group. - weights: List of weight variables. - """ - weight_values = backend.batch_get_value(weights) - weight_names = [w.name.encode('utf8') for w in weights] - save_attributes_to_hdf5_group(f, 'weight_names', weight_names) - for name, val in zip(weight_names, weight_values): - param_dset = f.create_dataset(name, val.shape, dtype=val.dtype) - if not val.shape: - # scalar - param_dset[()] = val - else: - param_dset[:] = val - - -def save_weights_to_hdf5_group(f, model): - """Saves the weights of a list of layers to a HDF5 group. - - Args: - f: HDF5 group. - model: Model instance. - """ - from keras import __version__ as keras_version # pylint: disable=g-import-not-at-top - save_attributes_to_hdf5_group( - f, 'layer_names', [layer.name.encode('utf8') for layer in model.layers]) - f.attrs['backend'] = backend.backend().encode('utf8') - f.attrs['keras_version'] = str(keras_version).encode('utf8') - - # Sort model layers by layer name to ensure that group names are strictly - # growing to avoid prefix issues. 
- for layer in sorted(model.layers, key=lambda x: x.name): - g = f.create_group(layer.name) - weights = _legacy_weights(layer) - save_subset_weights_to_hdf5_group(g, weights) - weights = model._trainable_weights + model._non_trainable_weights - g = f.create_group('top_level_model_weights') - save_subset_weights_to_hdf5_group(g, weights) - - -def load_subset_weights_from_hdf5_group(f): - """Load layer weights of a model from hdf5. - - Args: - f: A pointer to a HDF5 group. - - Returns: - List of NumPy arrays of the weight values. - - Raises: - ValueError: in case of mismatch between provided model - and weights file. - """ - weight_names = load_attributes_from_hdf5_group(f, 'weight_names') - return [np.asarray(f[weight_name]) for weight_name in weight_names] - - -def load_weights_from_hdf5_group(f, model): - """Implements topological (order-based) weight loading. - - Args: - f: A pointer to a HDF5 group. - model: Model instance. - - Raises: - ValueError: in case of mismatch between provided layers - and weights file. - """ - if 'keras_version' in f.attrs: - original_keras_version = f.attrs['keras_version'] - if hasattr(original_keras_version, 'decode'): - original_keras_version = original_keras_version.decode('utf8') - else: - original_keras_version = '1' - if 'backend' in f.attrs: - original_backend = f.attrs['backend'] - if hasattr(original_backend, 'decode'): - original_backend = original_backend.decode('utf8') - else: - original_backend = None - - filtered_layers = [] - for layer in model.layers: - weights = _legacy_weights(layer) - if weights: - filtered_layers.append(layer) - - layer_names = load_attributes_from_hdf5_group(f, 'layer_names') - filtered_layer_names = [] - for name in layer_names: - g = f[name] - weight_names = load_attributes_from_hdf5_group(g, 'weight_names') - if weight_names: - filtered_layer_names.append(name) - layer_names = filtered_layer_names - if len(layer_names) != len(filtered_layers): - raise ValueError( - f'Layer count mismatch when loading weights from file. ' - f'Model expected {len(filtered_layers)} layers, found ' - f'{len(layer_names)} saved layers.') - - # We batch weight value assignments in a single backend call - # which provides a speedup in TensorFlow. - weight_value_tuples = [] - for k, name in enumerate(layer_names): - g = f[name] - layer = filtered_layers[k] - symbolic_weights = _legacy_weights(layer) - weight_values = load_subset_weights_from_hdf5_group(g) - weight_values = preprocess_weights_for_loading(layer, weight_values, - original_keras_version, - original_backend) - if len(weight_values) != len(symbolic_weights): - raise ValueError( - f'Weight count mismatch for layer #{k} (named {layer.name} in the ' - f'current model, {name} in the save file). ' - f'Layer expects {len(symbolic_weights)} weight(s). Received ' - f'{len(weight_values)} saved weight(s)') - weight_value_tuples += zip(symbolic_weights, weight_values) - - if 'top_level_model_weights' in f: - symbolic_weights = model._trainable_weights + model._non_trainable_weights - weight_values = load_subset_weights_from_hdf5_group( - f['top_level_model_weights']) - if len(weight_values) != len(symbolic_weights): - raise ValueError( - f'Weight count mismatch for top-level weights when loading weights ' - f'from file. ' - f'Model expects {len(symbolic_weights)} top-level weight(s). 
' - f'Received {len(weight_values)} saved top-level weight(s)') - weight_value_tuples += zip(symbolic_weights, weight_values) - backend.batch_set_value(weight_value_tuples) - - # Perform any layer defined finalization of the layer state. - for layer in model._flatten_layers(): - layer.finalize_state() - - -def load_weights_from_hdf5_group_by_name(f, model, skip_mismatch=False): - """Implements name-based weight loading (instead of topological loading). - - Layers that have no matching name are skipped. - - Args: - f: A pointer to a HDF5 group. - model: Model instance. - skip_mismatch: Boolean, whether to skip loading of layers - where there is a mismatch in the number of weights, - or a mismatch in the shape of the weights. - - Raises: - ValueError: in case of mismatch between provided layers - and weights file and skip_match=False. - """ - if 'keras_version' in f.attrs: - original_keras_version = f.attrs['keras_version'] - if hasattr(original_keras_version, 'decode'): - original_keras_version = original_keras_version.decode('utf8') - else: - original_keras_version = '1' - if 'backend' in f.attrs: - original_backend = f.attrs['backend'] - if hasattr(original_backend, 'decode'): - original_backend = original_backend.decode('utf8') - else: - original_backend = None - - # New file format. - layer_names = load_attributes_from_hdf5_group(f, 'layer_names') - - # Reverse index of layer name to list of layers with name. - index = {} - for layer in model.layers: - if layer.name: - index.setdefault(layer.name, []).append(layer) - - # We batch weight value assignments in a single backend call - # which provides a speedup in TensorFlow. - weight_value_tuples = [] - for k, name in enumerate(layer_names): - g = f[name] - weight_values = load_subset_weights_from_hdf5_group(g) - for layer in index.get(name, []): - symbolic_weights = _legacy_weights(layer) - weight_values = preprocess_weights_for_loading( - layer, weight_values, original_keras_version, original_backend) - if len(weight_values) != len(symbolic_weights): - if skip_mismatch: - logging.warning( - f'Skipping loading of weights for layer #{k} (named ' - f'{layer.name}) due to mismatch in number of weights. ' - f'Layer expects {len(symbolic_weights)} weight(s). Received ' - f'{len(weight_values)} saved weight(s)') - continue - raise ValueError( - f'Weight count mismatch for layer #{k} (named {layer.name}). ' - f'Layer expects {len(symbolic_weights)} weight(s). Received ' - f'{len(weight_values)} saved weight(s)') - # Set values. - for i in range(len(weight_values)): - expected_shape = backend.int_shape(symbolic_weights[i]) - received_shape = weight_values[i].shape - if expected_shape != received_shape: - if skip_mismatch: - logging.warning( - f'Skipping loading weights for layer #{k} (named ' - f'{layer.name}) due to mismatch in shape for weight ' - f'{symbolic_weights[i].name}. ' - f'Weight expects shape {expected_shape}. Received saved weight ' - f'with shape {received_shape}') - continue - raise ValueError( - f'Shape mismatch in layer #{k} (named {layer.name}) for weight ' - f'{symbolic_weights[i].name}. ' - f'Weight expects shape {expected_shape}. 
Received saved weight ' - f'with shape {received_shape}') - else: - weight_value_tuples.append((symbolic_weights[i], weight_values[i])) - - if 'top_level_model_weights' in f: - symbolic_weights = model._trainable_weights + model._non_trainable_weights - weight_values = load_subset_weights_from_hdf5_group( - f['top_level_model_weights']) - - if len(weight_values) != len(symbolic_weights): - if skip_mismatch: - logging.warning( - f'Skipping loading top-level weights for model due to mismatch ' - f'in number of weights. ' - f'Model expects {len(symbolic_weights)} top-level weight(s). ' - f'Received {len(weight_values)} saved top-level weight(s)') - else: - raise ValueError( - f'Weight count mismatch for top-level weights of model. ' - f'Model expects {len(symbolic_weights)} top-level weight(s). ' - f'Received {len(weight_values)} saved top-level weight(s)') - else: - for i in range(len(weight_values)): - expected_shape = backend.int_shape(symbolic_weights[i]) - received_shape = weight_values[i].shape - if expected_shape != received_shape: - if skip_mismatch: - logging.warning( - f'Skipping loading top-level weight for model due to ' - f'mismatch in shape for weight {symbolic_weights[i].name}. ' - f'Weight expects shape {expected_shape}. Received saved weight ' - f'with shape {received_shape}') - else: - raise ValueError( - f'Shape mismatch in model for top-level weight ' - f'{symbolic_weights[i].name}. ' - f'Weight expects shape {expected_shape}. Received saved weight ' - f'with shape {received_shape}') - else: - weight_value_tuples.append((symbolic_weights[i], weight_values[i])) - - backend.batch_set_value(weight_value_tuples) - - # Perform any layer defined finalization of the layer state. - for layer in model._flatten_layers(): - layer.finalize_state() - - -def save_attributes_to_hdf5_group(group, name, data): - """Saves attributes (data) of the specified name into the HDF5 group. - - This method deals with an inherent problem of HDF5 file which is not - able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes. - - Args: - group: A pointer to a HDF5 group. - name: A name of the attributes to save. - data: Attributes data to store. - - Raises: - RuntimeError: If any single attribute is too large to be saved. - """ - # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT` - # because in that case even chunking the array would not make the saving - # possible. - bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT] - - # Expecting this to never be true. - if bad_attributes: - raise RuntimeError( - 'The following attributes cannot be saved to HDF5 file because they ' - f'are larger than {HDF5_OBJECT_HEADER_LIMIT} bytes: {bad_attributes}') - - data_npy = np.asarray(data) - - num_chunks = 1 - chunked_data = np.array_split(data_npy, num_chunks) - - # This will never loop forever thanks to the test above. - while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data): - num_chunks += 1 - chunked_data = np.array_split(data_npy, num_chunks) - - if num_chunks > 1: - for chunk_id, chunk_data in enumerate(chunked_data): - group.attrs['%s%d' % (name, chunk_id)] = chunk_data - else: - group.attrs[name] = data - - -def load_attributes_from_hdf5_group(group, name): - """Loads attributes of the specified name from the HDF5 group. - - This method deals with an inherent problem - of HDF5 file which is not able to store - data larger than HDF5_OBJECT_HEADER_LIMIT bytes. - - Args: - group: A pointer to a HDF5 group. - name: A name of the attributes to load. 
- - Returns: - data: Attributes data. - """ - if name in group.attrs: - data = [ - n.decode('utf8') if hasattr(n, 'decode') else n - for n in group.attrs[name] - ] - else: - data = [] - chunk_id = 0 - while '%s%d' % (name, chunk_id) in group.attrs: - data.extend([ - n.decode('utf8') if hasattr(n, 'decode') else n - for n in group.attrs['%s%d' % (name, chunk_id)] - ]) - chunk_id += 1 - return data - - -def _legacy_weights(layer): - """DO NOT USE. - - For legacy reason, the layer.weights was in the order of - [self.trainable_weights + self.non_trainable_weights], and this order was - used for preserving the weights in h5 format. The new order of layer.weights - are the same as layer.get_weights() which is more intuitive for user. To - keep supporting the existing saved h5 file, this method should be used to - save/load weights. In future version, we will delete this method and - introduce a breaking change for h5 and stay with the new order for weights. - - Args: - layer: a `tf.keras.Model` or `tf.keras.layers.Layer` instance. - - Returns: - A list of variables with the order of trainable_weights, followed by - non_trainable_weights. - """ - weights = layer.trainable_weights + layer.non_trainable_weights - if any(not isinstance(w, tf.Variable) for w in weights): - raise NotImplementedError( - f'Save or restore weights that is not an instance of `tf.Variable` is ' - f'not supported in h5, use `save_format=\'tf\'` instead. Received a ' - f'model or layer {layer.__class__.__name__} with weights {weights}') - return weights diff --git a/keras/saving/legacy/__init__.py b/keras/saving/legacy/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/keras/saving/legacy/hdf5_format.py b/keras/saving/legacy/hdf5_format.py new file mode 100644 index 000000000000..8d4a95eeaaa8 --- /dev/null +++ b/keras/saving/legacy/hdf5_format.py @@ -0,0 +1,1119 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Functions for saving and loading a Keras Model from HDF5 format.""" + +import json +import os + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.optimizers import optimizer as optimizer_base +from keras.optimizers import optimizer_v1 +from keras.saving import object_registration +from keras.saving.legacy import model_config as model_config_lib +from keras.saving.legacy import saving_utils +from keras.saving.legacy.saved_model import json_utils +from keras.utils.generic_utils import LazyLoader +from keras.utils.io_utils import ask_to_proceed_with_overwrite + +# isort: off +from tensorflow.python.platform import tf_logging as logging + +try: + import h5py + + HDF5_OBJECT_HEADER_LIMIT = 64512 +except ImportError: + h5py = None + +# TODO(b/134426265): Switch back to single-quotes to match the rest of the file +# once the issue with copybara is fixed. 
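The rewritten `legacy/hdf5_format.py` keeps the `LazyLoader` indirection (the `sequential_lib` assignment just below) so that importing the saving module does not eagerly import `keras.engine.sequential` and create an import cycle. A rough sketch of the idea behind such a shim, using only the standard library; the `_LazyModule` name is hypothetical and this is not Keras's actual implementation:

```python
import importlib
import types


class _LazyModule(types.ModuleType):
    """Stand-in that imports the real module on first attribute access."""

    def __init__(self, local_name, parent_globals, module_name):
        super().__init__(module_name)
        self._local_name = local_name
        self._parent_globals = parent_globals
        self._module_name = module_name

    def __getattr__(self, item):
        module = importlib.import_module(self._module_name)
        # Swap the real module into the caller's globals so subsequent
        # lookups bypass this shim entirely.
        self._parent_globals[self._local_name] = module
        return getattr(module, item)


json_mod = _LazyModule("json_mod", globals(), "json")
print(json_mod.dumps({"lazy": True}))  # The real import happens only here.
```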
+ +sequential_lib = LazyLoader( + "sequential_lib", globals(), "keras.engine.sequential" +) + + +def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True): + """Saves a model to a HDF5 file. + + The saved model contains: + - the model's configuration (topology) + - the model's weights + - the model's optimizer's state (if any) + + Thus the saved model can be reinstantiated in + the exact same state, without any of the code + used for model definition or training. + + Args: + model: Keras model instance to be saved. + filepath: One of the following: + - String, path where to save the model + - `h5py.File` object where to save the model + overwrite: Whether we should overwrite any existing + model at the target location, or instead + ask the user with a manual prompt. + include_optimizer: If True, save optimizer's state together. + + Raises: + ImportError: if h5py is not available. + """ + + if h5py is None: + raise ImportError( + "`save_model()` using h5 format requires h5py. Could not " + "import h5py." + ) + + # Ensures that all models saved in HDF5 format follow the old serialization + model.use_legacy_config = True + + # TODO(psv) Add warning when we save models that contain non-serializable + # entities like metrics added using `add_metric` and losses added using + # `add_loss.` + if len(model.weights) != len(model._undeduplicated_weights): + logging.warning( + "Found duplicated `Variable`s in Model's `weights`. " + "This is usually caused by `Variable`s being shared by " + "Layers in the Model. These `Variable`s will be treated " + "as separate `Variable`s when the Model is restored. To " + 'avoid this, please save with `save_format="tf"`.' + ) + + if not isinstance(filepath, h5py.File): + # If file exists and should not be overwritten. + if not overwrite and os.path.isfile(filepath): + proceed = ask_to_proceed_with_overwrite(filepath) + if not proceed: + return + + # Try creating dir if not exist + dirpath = os.path.dirname(filepath) + if not os.path.exists(dirpath): + tf.io.gfile.makedirs(dirpath) + + f = h5py.File(filepath, mode="w") + opened_new_file = True + else: + f = filepath + opened_new_file = False + + try: + model_metadata = saving_utils.model_metadata(model, include_optimizer) + for k, v in model_metadata.items(): + if isinstance(v, (dict, list, tuple)): + f.attrs[k] = json.dumps( + v, default=json_utils.get_json_type + ).encode("utf8") + else: + f.attrs[k] = v + + model_weights_group = f.create_group("model_weights") + save_weights_to_hdf5_group(model_weights_group, model) + + # TODO(b/128683857): Add integration tests between tf.keras and external + # Keras, to avoid breaking TF.js users. + if ( + include_optimizer + and model.optimizer + and not isinstance(model.optimizer, optimizer_v1.TFOptimizer) + ): + save_optimizer_weights_to_hdf5_group(f, model.optimizer) + + f.flush() + finally: + if opened_new_file: + f.close() + + # Remove legacy serialization attribute after H5 saving complete + delattr(model, "use_legacy_config") + + +def load_model_from_hdf5(filepath, custom_objects=None, compile=True): + """Loads a model saved via `save_model_to_hdf5`. + + Args: + filepath: One of the following: + - String, path to the saved model + - `h5py.File` object from which to load the model + custom_objects: Optional dictionary mapping names + (strings) to custom classes or functions to be + considered during deserialization. + compile: Boolean, whether to compile the model + after loading. + + Returns: + A Keras model instance. 
If an optimizer was found
+        as part of the saved model, the model is already
+        compiled. Otherwise, the model is uncompiled and
+        a warning will be displayed. When `compile` is set
+        to False, the compilation is omitted without any
+        warning.
+
+    Raises:
+        ImportError: if h5py is not available.
+        ValueError: In case of an invalid savefile.
+    """
+    if h5py is None:
+        raise ImportError(
+            "`load_model()` using h5 format requires h5py. Could not "
+            "import h5py."
+        )
+
+    if not custom_objects:
+        custom_objects = {}
+
+    tlco = object_registration._THREAD_LOCAL_CUSTOM_OBJECTS.__dict__
+    gco = object_registration._GLOBAL_CUSTOM_OBJECTS
+    custom_objects = {**custom_objects, **tlco, **gco}
+
+    opened_new_file = not isinstance(filepath, h5py.File)
+    if opened_new_file:
+        f = h5py.File(filepath, mode="r")
+    else:
+        f = filepath
+
+    model = None
+    try:
+        # instantiate model
+        model_config = f.attrs.get("model_config")
+        if model_config is None:
+            raise ValueError(
+                f"No model config found in the file at {filepath}."
+            )
+        if hasattr(model_config, "decode"):
+            model_config = model_config.decode("utf-8")
+        model_config = json_utils.decode(model_config)
+        model = model_config_lib.model_from_config(
+            model_config, custom_objects=custom_objects
+        )
+
+        # set weights
+        load_weights_from_hdf5_group(f["model_weights"], model)
+
+        if compile:
+            # instantiate optimizer
+            training_config = f.attrs.get("training_config")
+            if hasattr(training_config, "decode"):
+                training_config = training_config.decode("utf-8")
+            if training_config is None:
+                logging.warning(
+                    "No training configuration found in the save file, so "
+                    "the model was *not* compiled. Compile it manually."
+                )
+                return model
+            training_config = json_utils.decode(training_config)
+
+            # Compile model.
+            model.compile(
+                **saving_utils.compile_args_from_training_config(
+                    training_config, custom_objects
+                ),
+                from_serialized=True,
+            )
+            saving_utils.try_build_compiled_arguments(model)
+
+            # Set optimizer weights.
+            if "optimizer_weights" in f:
+                try:
+                    if isinstance(model.optimizer, optimizer_base.Optimizer):
+                        model.optimizer.build(model.trainable_variables)
+                    else:
+                        model.optimizer._create_all_weights(
+                            model.trainable_variables
+                        )
+                except (NotImplementedError, AttributeError):
+                    logging.warning(
+                        "Error when creating the weights of the optimizer, "
+                        "making it impossible to restore the saved optimizer "
+                        "state. As a result, your model is starting with "
+                        "a freshly initialized optimizer."
+                    )
+
+                optimizer_weight_values = (
+                    load_optimizer_weights_from_hdf5_group(f)
+                )
+                try:
+                    model.optimizer.set_weights(optimizer_weight_values)
+                except ValueError:
+                    logging.warning(
+                        "Error in loading the saved optimizer "
+                        "state. As a result, your model is "
+                        "starting with a freshly initialized "
+                        "optimizer."
+                    )
+    finally:
+        if opened_new_file:
+            f.close()
+    return model
+
+
+def preprocess_weights_for_loading(
+    layer, weights, original_keras_version=None, original_backend=None
+):
+    """Preprocess layer weights between different Keras formats.
+
+    Converts layer weights from the Keras 1 format to Keras 2, and also
+    converts the weights of cuDNN layers in Keras 2.
+
+    Args:
+        layer: Layer instance.
+        weights: List of weights values (Numpy arrays).
+        original_keras_version: Keras version for the weights, as a string.
+        original_backend: Keras backend the weights were trained with,
+            as a string.
+
+    Returns:
+        A list of weights values (Numpy arrays).
+    """
+
+    def convert_nested_bidirectional(weights):
+        """Converts layers nested in `Bidirectional` wrapper.
+
+        This function uses `preprocess_weights_for_loading()` for converting
+        layers.
+
+        Args:
+            weights: List of weights values (Numpy arrays).
+
+        Returns:
+            A list of weights values (Numpy arrays).
+        """
+        num_weights_per_layer = len(weights) // 2
+        forward_weights = preprocess_weights_for_loading(
+            layer.forward_layer,
+            weights[:num_weights_per_layer],
+            original_keras_version,
+            original_backend,
+        )
+        backward_weights = preprocess_weights_for_loading(
+            layer.backward_layer,
+            weights[num_weights_per_layer:],
+            original_keras_version,
+            original_backend,
+        )
+        return forward_weights + backward_weights
+
+    def convert_nested_time_distributed(weights):
+        """Converts layers nested in `TimeDistributed` wrapper.
+
+        This function uses `preprocess_weights_for_loading()` for converting
+        nested layers.
+
+        Args:
+            weights: List of weights values (Numpy arrays).
+
+        Returns:
+            A list of weights values (Numpy arrays).
+        """
+        return preprocess_weights_for_loading(
+            layer.layer, weights, original_keras_version, original_backend
+        )
+
+    def convert_nested_model(weights):
+        """Converts layers nested in `Model` or `Sequential`.
+
+        This function uses `preprocess_weights_for_loading()` for converting
+        nested layers.
+
+        Args:
+            weights: List of weights values (Numpy arrays).
+
+        Returns:
+            A list of weights values (Numpy arrays).
+        """
+        trainable_weights = weights[: len(layer.trainable_weights)]
+        non_trainable_weights = weights[len(layer.trainable_weights) :]
+
+        new_trainable_weights = []
+        new_non_trainable_weights = []
+
+        for sublayer in layer.layers:
+            num_trainable_weights = len(sublayer.trainable_weights)
+            num_non_trainable_weights = len(sublayer.non_trainable_weights)
+            if sublayer.weights:
+                preprocessed = preprocess_weights_for_loading(
+                    layer=sublayer,
+                    weights=(
+                        trainable_weights[:num_trainable_weights]
+                        + non_trainable_weights[:num_non_trainable_weights]
+                    ),
+                    original_keras_version=original_keras_version,
+                    original_backend=original_backend,
+                )
+                new_trainable_weights.extend(
+                    preprocessed[:num_trainable_weights]
+                )
+                new_non_trainable_weights.extend(
+                    preprocessed[num_trainable_weights:]
+                )
+
+                trainable_weights = trainable_weights[num_trainable_weights:]
+                non_trainable_weights = non_trainable_weights[
+                    num_non_trainable_weights:
+                ]
+        new_trainable_weights += layer._trainable_weights
+        new_non_trainable_weights += layer._non_trainable_weights
+        return new_trainable_weights + new_non_trainable_weights
+
+    # Convert layers nested in Bidirectional/Model/Sequential.
+    # Both transformations should be run for the Keras 1->2 conversion
+    # and for the conversion of cuDNN layers.
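+    # For example, a `Bidirectional` wrapper stores the forward layer's
+    # weights followed by the backward layer's, so the flat weight list is
+    # split in half and each half is converted recursively, while
+    # `TimeDistributed` simply delegates to its inner layer.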
+ if layer.__class__.__name__ == "Bidirectional": + weights = convert_nested_bidirectional(weights) + if layer.__class__.__name__ == "TimeDistributed": + weights = convert_nested_time_distributed(weights) + elif layer.__class__.__name__ in ["Model", "Sequential", "Functional"]: + weights = convert_nested_model(weights) + + if original_keras_version == "1": + if layer.__class__.__name__ == "TimeDistributed": + weights = preprocess_weights_for_loading( + layer.layer, weights, original_keras_version, original_backend + ) + + if layer.__class__.__name__ == "Conv1D": + shape = weights[0].shape + # Handle Keras 1.1 format + if ( + shape[:2] != (layer.kernel_size[0], 1) + or shape[3] != layer.filters + ): + # Legacy shape: + # (filters, input_dim, filter_length, 1) + assert shape[0] == layer.filters and shape[2:] == ( + layer.kernel_size[0], + 1, + ) + weights[0] = np.transpose(weights[0], (2, 3, 1, 0)) + weights[0] = weights[0][:, 0, :, :] + + if layer.__class__.__name__ == "Conv2D": + if layer.data_format == "channels_first": + # old: (filters, stack_size, kernel_rows, kernel_cols) + # new: (kernel_rows, kernel_cols, stack_size, filters) + weights[0] = np.transpose(weights[0], (2, 3, 1, 0)) + + if layer.__class__.__name__ == "Conv2DTranspose": + if layer.data_format == "channels_last": + # old: (kernel_rows, kernel_cols, stack_size, filters) + # new: (kernel_rows, kernel_cols, filters, stack_size) + weights[0] = np.transpose(weights[0], (0, 1, 3, 2)) + if layer.data_format == "channels_first": + # old: (filters, stack_size, kernel_rows, kernel_cols) + # new: (kernel_rows, kernel_cols, filters, stack_size) + weights[0] = np.transpose(weights[0], (2, 3, 0, 1)) + + if layer.__class__.__name__ == "Conv3D": + if layer.data_format == "channels_first": + # old: (filters, stack_size, ...) 
+                # new: (..., stack_size, filters)
+                weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
+
+        if layer.__class__.__name__ == "GRU":
+            if len(weights) == 9:
+                kernel = np.concatenate(
+                    [weights[0], weights[3], weights[6]], axis=-1
+                )
+                recurrent_kernel = np.concatenate(
+                    [weights[1], weights[4], weights[7]], axis=-1
+                )
+                bias = np.concatenate(
+                    [weights[2], weights[5], weights[8]], axis=-1
+                )
+                weights = [kernel, recurrent_kernel, bias]
+
+        if layer.__class__.__name__ == "LSTM":
+            if len(weights) == 12:
+                # old: i, c, f, o
+                # new: i, f, c, o
+                kernel = np.concatenate(
+                    [weights[0], weights[6], weights[3], weights[9]], axis=-1
+                )
+                recurrent_kernel = np.concatenate(
+                    [weights[1], weights[7], weights[4], weights[10]], axis=-1
+                )
+                bias = np.concatenate(
+                    [weights[2], weights[8], weights[5], weights[11]], axis=-1
+                )
+                weights = [kernel, recurrent_kernel, bias]
+
+        if layer.__class__.__name__ == "ConvLSTM2D":
+            if len(weights) == 12:
+                kernel = np.concatenate(
+                    [weights[0], weights[6], weights[3], weights[9]], axis=-1
+                )
+                recurrent_kernel = np.concatenate(
+                    [weights[1], weights[7], weights[4], weights[10]], axis=-1
+                )
+                bias = np.concatenate(
+                    [weights[2], weights[8], weights[5], weights[11]], axis=-1
+                )
+                if layer.data_format == "channels_first":
+                    # old: (filters, stack_size, kernel_rows, kernel_cols)
+                    # new: (kernel_rows, kernel_cols, stack_size, filters)
+                    kernel = np.transpose(kernel, (2, 3, 1, 0))
+                    recurrent_kernel = np.transpose(
+                        recurrent_kernel, (2, 3, 1, 0)
+                    )
+                weights = [kernel, recurrent_kernel, bias]
+
+    conv_layers = [
+        "Conv1D",
+        "Conv2D",
+        "Conv3D",
+        "Conv2DTranspose",
+        "ConvLSTM2D",
+    ]
+    if layer.__class__.__name__ in conv_layers:
+        if backend.int_shape(layer.weights[0]) != weights[0].shape:
+            weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
+            if layer.__class__.__name__ == "ConvLSTM2D":
+                weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
+
+    # convert cuDNN layers
+    return _convert_rnn_weights(layer, weights)
+
+
+def _convert_rnn_weights(layer, weights):
+    """Converts weights for RNN layers between native and cuDNN format.
+
+    Input kernels for each gate are transposed and converted between Fortran
+    and C layout, recurrent kernels are transposed. For LSTM biases are
+    summed/split in half, for GRU biases are reshaped.
+
+    Weights can be converted in both directions between `LSTM` and
+    `CuDNNLSTM` and between `CuDNNGRU` and `GRU(reset_after=True)`. Default
+    `GRU` is not compatible with `CuDNNGRU`.
+
+    For missing biases in `LSTM`/`GRU` (`use_bias=False`), no conversion is
+    made.
+
+    Args:
+        layer: Target layer instance.
+        weights: List of source weights values (input kernels, recurrent
+            kernels, [biases]) (Numpy arrays).
+
+    Returns:
+        A list of converted weights values (Numpy arrays).
+
+    Raises:
+        ValueError: for incompatible GRU layer/weights or incompatible
+            biases.
+    """
+
+    def transform_kernels(kernels, func, n_gates):
+        """Transforms kernel for each gate separately using given function.
+
+        Args:
+            kernels: Stacked array of kernels for individual gates.
+            func: Function applied to kernel of each gate.
+            n_gates: Number of gates (4 for LSTM, 3 for GRU).
+
+        Returns:
+            Stacked array of transformed kernels.
+        """
+        return np.hstack([func(k) for k in np.hsplit(kernels, n_gates)])
+
+    def transpose_input(from_cudnn):
+        """Makes a function that transforms input kernels from/to cuDNN format.
+
+        It keeps the shape, but changes between the layout (Fortran/C).
Eg.: + + ``` + Keras cuDNN + [[0, 1, 2], <---> [[0, 2, 4], + [3, 4, 5]] [1, 3, 5]] + ``` + + It can be passed to `transform_kernels()`. + + Args: + from_cudnn: `True` if source weights are in cuDNN format, `False` if + they're in plain Keras format. + + Returns: + Function that converts input kernel to the other format. + """ + order = "F" if from_cudnn else "C" + + def transform(kernel): + return kernel.T.reshape(kernel.shape, order=order) + + return transform + + target_class = layer.__class__.__name__ + + # convert the weights between CuDNNLSTM and LSTM + if target_class in ["LSTM", "CuDNNLSTM"] and len(weights) == 3: + # determine if we're loading a CuDNNLSTM layer + # from the number of bias weights: + # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4) + # if there's no bias weight in the file, skip this conversion + units = weights[1].shape[0] + bias_shape = weights[2].shape + n_gates = 4 + + if bias_shape == (2 * units * n_gates,): + source = "CuDNNLSTM" + elif bias_shape == (units * n_gates,): + source = "LSTM" + else: + raise ValueError("Invalid bias shape: " + str(bias_shape)) + + def convert_lstm_weights(weights, from_cudnn=True): + """Converts the weights between CuDNNLSTM and LSTM. + + Args: + weights: Original weights. + from_cudnn: Indicates whether original weights are from cuDNN + layer. + + Returns: + Updated weights compatible with LSTM. + """ + + # Transpose (and reshape) input and recurrent kernels + kernels = transform_kernels( + weights[0], transpose_input(from_cudnn), n_gates + ) + recurrent_kernels = transform_kernels( + weights[1], lambda k: k.T, n_gates + ) + if from_cudnn: + # merge input and recurrent biases into a single set + biases = np.sum(np.split(weights[2], 2, axis=0), axis=0) + else: + # Split single set of biases evenly to two sets. The way of + # splitting doesn't matter as long as the two sets sum is kept. + biases = np.tile(0.5 * weights[2], 2) + return [kernels, recurrent_kernels, biases] + + if source != target_class: + weights = convert_lstm_weights( + weights, from_cudnn=source == "CuDNNLSTM" + ) + + # convert the weights between CuDNNGRU and GRU(reset_after=True) + if target_class in ["GRU", "CuDNNGRU"] and len(weights) == 3: + # We can determine the source of the weights from the shape of the bias. + # If there is no bias we skip the conversion since + # CuDNNGRU always has biases. + + units = weights[1].shape[0] + bias_shape = weights[2].shape + n_gates = 3 + + def convert_gru_weights(weights, from_cudnn=True): + """Converts the weights between CuDNNGRU and GRU. + + Args: + weights: Original weights. + from_cudnn: Indicates whether original weights are from cuDNN + layer. + + Returns: + Updated weights compatible with GRU. 
+            """
+
+            kernels = transform_kernels(
+                weights[0], transpose_input(from_cudnn), n_gates
+            )
+            recurrent_kernels = transform_kernels(
+                weights[1], lambda k: k.T, n_gates
+            )
+            biases = np.array(weights[2]).reshape((2, -1) if from_cudnn else -1)
+            return [kernels, recurrent_kernels, biases]
+
+        if bias_shape == (2 * units * n_gates,):
+            source = "CuDNNGRU"
+        elif bias_shape == (2, units * n_gates):
+            source = "GRU(reset_after=True)"
+        elif bias_shape == (units * n_gates,):
+            source = "GRU(reset_after=False)"
+        else:
+            raise ValueError("Invalid bias shape: " + str(bias_shape))
+
+        if target_class == "CuDNNGRU":
+            target = "CuDNNGRU"
+        elif layer.reset_after:
+            target = "GRU(reset_after=True)"
+        else:
+            target = "GRU(reset_after=False)"
+
+        # only convert between different types
+        if source != target:
+            types = (source, target)
+            if "GRU(reset_after=False)" in types:
+                raise ValueError("%s is not compatible with %s" % types)
+            if source == "CuDNNGRU":
+                weights = convert_gru_weights(weights, from_cudnn=True)
+            elif source == "GRU(reset_after=True)":
+                weights = convert_gru_weights(weights, from_cudnn=False)
+
+    return weights
+
+
+def save_optimizer_weights_to_hdf5_group(hdf5_group, optimizer):
+    """Saves the weights of an optimizer to a HDF5 group.
+
+    Args:
+        hdf5_group: HDF5 group.
+        optimizer: optimizer instance.
+    """
+    if isinstance(optimizer, optimizer_base.Optimizer):
+        symbolic_weights = optimizer.variables
+    else:
+        symbolic_weights = getattr(optimizer, "weights")
+    if symbolic_weights:
+        weights_group = hdf5_group.create_group("optimizer_weights")
+        weight_names = [str(w.name).encode("utf8") for w in symbolic_weights]
+        save_attributes_to_hdf5_group(
+            weights_group, "weight_names", weight_names
+        )
+        weight_values = backend.batch_get_value(symbolic_weights)
+        for name, val in zip(weight_names, weight_values):
+            param_dset = weights_group.create_dataset(
+                name, val.shape, dtype=val.dtype
+            )
+            if not val.shape:
+                # scalar
+                param_dset[()] = val
+            else:
+                param_dset[:] = val
+
+
+def load_optimizer_weights_from_hdf5_group(hdf5_group):
+    """Load optimizer weights from a HDF5 group.
+
+    Args:
+        hdf5_group: A pointer to a HDF5 group.
+
+    Returns:
+        data: List of optimizer weight values (as HDF5 datasets), in the
+            order given by the saved `weight_names` attribute.
+    """
+    weights_group = hdf5_group["optimizer_weights"]
+    optimizer_weight_names = load_attributes_from_hdf5_group(
+        weights_group, "weight_names"
+    )
+    return [
+        weights_group[weight_name] for weight_name in optimizer_weight_names
+    ]
+
+
+def save_subset_weights_to_hdf5_group(f, weights):
+    """Save a subset of a model's weights to a HDF5 group.
+
+    Args:
+        f: HDF5 group.
+        weights: List of weight variables.
+    """
+    weight_values = backend.batch_get_value(weights)
+    weight_names = [w.name.encode("utf8") for w in weights]
+    save_attributes_to_hdf5_group(f, "weight_names", weight_names)
+    for name, val in zip(weight_names, weight_values):
+        param_dset = f.create_dataset(name, val.shape, dtype=val.dtype)
+        if not val.shape:
+            # scalar
+            param_dset[()] = val
+        else:
+            param_dset[:] = val
+
+
+def save_weights_to_hdf5_group(f, model):
+    """Saves the weights of a model's layers to a HDF5 group.
+
+    Args:
+        f: HDF5 group.
+        model: Model instance.
+ """ + from keras import __version__ as keras_version + + save_attributes_to_hdf5_group( + f, "layer_names", [layer.name.encode("utf8") for layer in model.layers] + ) + f.attrs["backend"] = backend.backend().encode("utf8") + f.attrs["keras_version"] = str(keras_version).encode("utf8") + + # Sort model layers by layer name to ensure that group names are strictly + # growing to avoid prefix issues. + for layer in sorted(model.layers, key=lambda x: x.name): + g = f.create_group(layer.name) + weights = _legacy_weights(layer) + save_subset_weights_to_hdf5_group(g, weights) + weights = model._trainable_weights + model._non_trainable_weights + g = f.create_group("top_level_model_weights") + save_subset_weights_to_hdf5_group(g, weights) + + +def load_subset_weights_from_hdf5_group(f): + """Load layer weights of a model from hdf5. + + Args: + f: A pointer to a HDF5 group. + + Returns: + List of NumPy arrays of the weight values. + + Raises: + ValueError: in case of mismatch between provided model + and weights file. + """ + weight_names = load_attributes_from_hdf5_group(f, "weight_names") + return [np.asarray(f[weight_name]) for weight_name in weight_names] + + +def load_weights_from_hdf5_group(f, model): + """Implements topological (order-based) weight loading. + + Args: + f: A pointer to a HDF5 group. + model: Model instance. + + Raises: + ValueError: in case of mismatch between provided layers + and weights file. + """ + if "keras_version" in f.attrs: + original_keras_version = f.attrs["keras_version"] + if hasattr(original_keras_version, "decode"): + original_keras_version = original_keras_version.decode("utf8") + else: + original_keras_version = "1" + if "backend" in f.attrs: + original_backend = f.attrs["backend"] + if hasattr(original_backend, "decode"): + original_backend = original_backend.decode("utf8") + else: + original_backend = None + + filtered_layers = [] + for layer in model.layers: + weights = _legacy_weights(layer) + if weights: + filtered_layers.append(layer) + + layer_names = load_attributes_from_hdf5_group(f, "layer_names") + filtered_layer_names = [] + for name in layer_names: + g = f[name] + weight_names = load_attributes_from_hdf5_group(g, "weight_names") + if weight_names: + filtered_layer_names.append(name) + layer_names = filtered_layer_names + if len(layer_names) != len(filtered_layers): + raise ValueError( + "Layer count mismatch when loading weights from file. " + f"Model expected {len(filtered_layers)} layers, found " + f"{len(layer_names)} saved layers." + ) + + # We batch weight value assignments in a single backend call + # which provides a speedup in TensorFlow. + weight_value_tuples = [] + for k, name in enumerate(layer_names): + g = f[name] + layer = filtered_layers[k] + symbolic_weights = _legacy_weights(layer) + weight_values = load_subset_weights_from_hdf5_group(g) + weight_values = preprocess_weights_for_loading( + layer, weight_values, original_keras_version, original_backend + ) + if len(weight_values) != len(symbolic_weights): + raise ValueError( + f"Weight count mismatch for layer #{k} (named {layer.name} in " + f"the current model, {name} in the save file). " + f"Layer expects {len(symbolic_weights)} weight(s). 
Received "
+            f"{len(weight_values)} saved weight(s)"
+        )
+        weight_value_tuples += zip(symbolic_weights, weight_values)
+
+    if "top_level_model_weights" in f:
+        symbolic_weights = (
+            model._trainable_weights + model._non_trainable_weights
+        )
+        weight_values = load_subset_weights_from_hdf5_group(
+            f["top_level_model_weights"]
+        )
+        if len(weight_values) != len(symbolic_weights):
+            raise ValueError(
+                "Weight count mismatch for top-level weights when loading "
+                "weights from file. "
+                f"Model expects {len(symbolic_weights)} top-level weight(s). "
+                f"Received {len(weight_values)} saved top-level weight(s)"
+            )
+        weight_value_tuples += zip(symbolic_weights, weight_values)
+    backend.batch_set_value(weight_value_tuples)
+
+    # Perform any layer defined finalization of the layer state.
+    for layer in model._flatten_layers():
+        layer.finalize_state()
+
+
+def load_weights_from_hdf5_group_by_name(f, model, skip_mismatch=False):
+    """Implements name-based weight loading (instead of topological loading).
+
+    Layers that have no matching name are skipped.
+
+    Args:
+        f: A pointer to a HDF5 group.
+        model: Model instance.
+        skip_mismatch: Boolean, whether to skip loading of layers
+            where there is a mismatch in the number of weights,
+            or a mismatch in the shape of the weights.
+
+    Raises:
+        ValueError: in case of mismatch between provided layers
+            and weights file and `skip_mismatch=False`.
+    """
+    if "keras_version" in f.attrs:
+        original_keras_version = f.attrs["keras_version"]
+        if hasattr(original_keras_version, "decode"):
+            original_keras_version = original_keras_version.decode("utf8")
+    else:
+        original_keras_version = "1"
+    if "backend" in f.attrs:
+        original_backend = f.attrs["backend"]
+        if hasattr(original_backend, "decode"):
+            original_backend = original_backend.decode("utf8")
+    else:
+        original_backend = None
+
+    # New file format.
+    layer_names = load_attributes_from_hdf5_group(f, "layer_names")
+
+    # Reverse index of layer name to list of layers with name.
+    index = {}
+    for layer in model.layers:
+        if layer.name:
+            index.setdefault(layer.name, []).append(layer)
+
+    # We batch weight value assignments in a single backend call
+    # which provides a speedup in TensorFlow.
+    weight_value_tuples = []
+    for k, name in enumerate(layer_names):
+        g = f[name]
+        weight_values = load_subset_weights_from_hdf5_group(g)
+        for layer in index.get(name, []):
+            symbolic_weights = _legacy_weights(layer)
+            weight_values = preprocess_weights_for_loading(
+                layer, weight_values, original_keras_version, original_backend
+            )
+            if len(weight_values) != len(symbolic_weights):
+                if skip_mismatch:
+                    logging.warning(
+                        f"Skipping loading of weights for layer #{k} (named "
+                        f"{layer.name}) due to mismatch in number of weights. "
+                        f"Layer expects {len(symbolic_weights)} weight(s). "
+                        f"Received {len(weight_values)} saved weight(s)"
+                    )
+                    continue
+                raise ValueError(
+                    f"Weight count mismatch for layer #{k} "
+                    f"(named {layer.name}). "
+                    f"Layer expects {len(symbolic_weights)} weight(s). "
+                    f"Received {len(weight_values)} saved weight(s)"
+                )
+            # Set values.
+            for i in range(len(weight_values)):
+                expected_shape = backend.int_shape(symbolic_weights[i])
+                received_shape = weight_values[i].shape
+                if expected_shape != received_shape:
+                    if skip_mismatch:
+                        logging.warning(
+                            f"Skipping loading weights for layer #{k} (named "
+                            f"{layer.name}) due to mismatch in shape for "
+                            f"weight {symbolic_weights[i].name}. "
+                            f"Weight expects shape {expected_shape}.
" + "Received saved weight " + f"with shape {received_shape}" + ) + continue + raise ValueError( + f"Shape mismatch in layer #{k} (named {layer.name}) " + f"for weight {symbolic_weights[i].name}. " + f"Weight expects shape {expected_shape}. " + "Received saved weight " + f"with shape {received_shape}" + ) + else: + weight_value_tuples.append( + (symbolic_weights[i], weight_values[i]) + ) + + if "top_level_model_weights" in f: + symbolic_weights = ( + model._trainable_weights + model._non_trainable_weights + ) + weight_values = load_subset_weights_from_hdf5_group( + f["top_level_model_weights"] + ) + + if len(weight_values) != len(symbolic_weights): + if skip_mismatch: + logging.warning( + "Skipping loading top-level weights for model due to " + "mismatch in number of weights. " + f"Model expects {len(symbolic_weights)} " + "top-level weight(s). " + f"Received {len(weight_values)} saved top-level weight(s)" + ) + else: + raise ValueError( + "Weight count mismatch for top-level weights of model. " + f"Model expects {len(symbolic_weights)} " + "top-level weight(s). " + f"Received {len(weight_values)} saved top-level weight(s)" + ) + else: + for i in range(len(weight_values)): + expected_shape = backend.int_shape(symbolic_weights[i]) + received_shape = weight_values[i].shape + if expected_shape != received_shape: + if skip_mismatch: + logging.warning( + "Skipping loading top-level weight for model due " + "to mismatch in shape for " + f"weight {symbolic_weights[i].name}. " + f"Weight expects shape {expected_shape}. " + "Received saved weight " + f"with shape {received_shape}" + ) + else: + raise ValueError( + "Shape mismatch in model for top-level weight " + f"{symbolic_weights[i].name}. " + f"Weight expects shape {expected_shape}. " + "Received saved weight " + f"with shape {received_shape}" + ) + else: + weight_value_tuples.append( + (symbolic_weights[i], weight_values[i]) + ) + + backend.batch_set_value(weight_value_tuples) + + # Perform any layer defined finalization of the layer state. + for layer in model._flatten_layers(): + layer.finalize_state() + + +def save_attributes_to_hdf5_group(group, name, data): + """Saves attributes (data) of the specified name into the HDF5 group. + + This method deals with an inherent problem of HDF5 file which is not + able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes. + + Args: + group: A pointer to a HDF5 group. + name: A name of the attributes to save. + data: Attributes data to store. + + Raises: + RuntimeError: If any single attribute is too large to be saved. + """ + # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT` + # because in that case even chunking the array would not make the saving + # possible. + bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT] + + # Expecting this to never be true. + if bad_attributes: + raise RuntimeError( + "The following attributes cannot be saved to HDF5 file because " + f"they are larger than {HDF5_OBJECT_HEADER_LIMIT} " + f"bytes: {bad_attributes}" + ) + + data_npy = np.asarray(data) + + num_chunks = 1 + chunked_data = np.array_split(data_npy, num_chunks) + + # This will never loop forever thanks to the test above. 
+    while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
+        num_chunks += 1
+        chunked_data = np.array_split(data_npy, num_chunks)
+
+    if num_chunks > 1:
+        for chunk_id, chunk_data in enumerate(chunked_data):
+            group.attrs["%s%d" % (name, chunk_id)] = chunk_data
+    else:
+        group.attrs[name] = data
+
+
+def load_attributes_from_hdf5_group(group, name):
+    """Loads attributes of the specified name from the HDF5 group.
+
+    This method deals with an inherent problem
+    of HDF5 file which is not able to store
+    data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+    Args:
+        group: A pointer to a HDF5 group.
+        name: A name of the attributes to load.
+
+    Returns:
+        data: Attributes data.
+    """
+    if name in group.attrs:
+        data = [
+            n.decode("utf8") if hasattr(n, "decode") else n
+            for n in group.attrs[name]
+        ]
+    else:
+        data = []
+        chunk_id = 0
+        while "%s%d" % (name, chunk_id) in group.attrs:
+            data.extend(
+                [
+                    n.decode("utf8") if hasattr(n, "decode") else n
+                    for n in group.attrs["%s%d" % (name, chunk_id)]
+                ]
+            )
+            chunk_id += 1
+    return data
+
+
+def _legacy_weights(layer):
+    """DO NOT USE.
+
+    For legacy reasons, `layer.weights` was in the order of
+    `[self.trainable_weights + self.non_trainable_weights]`, and this order
+    was used for preserving the weights in h5 format. The new order of
+    `layer.weights` is the same as `layer.get_weights()`, which is more
+    intuitive for users. To keep supporting existing saved h5 files, this
+    method should be used to save/load weights. In a future version, we will
+    delete this method and introduce a breaking change for h5, staying with
+    the new order for weights.
+
+    Args:
+        layer: a `tf.keras.Model` or `tf.keras.layers.Layer` instance.
+
+    Returns:
+        A list of variables with the order of trainable_weights, followed by
+        non_trainable_weights.
+    """
+    weights = layer.trainable_weights + layer.non_trainable_weights
+    if any(not isinstance(w, tf.Variable) for w in weights):
+        raise NotImplementedError(
+            "Saving or restoring weights that are not instances of "
+            "`tf.Variable` is not supported in h5; use `save_format='tf'` "
+            f"instead. Received a model or layer {layer.__class__.__name__} "
+            f"with weights {weights}"
+        )
+    return weights
diff --git a/keras/saving/legacy/losses_serialization_test.py b/keras/saving/legacy/losses_serialization_test.py
new file mode 100644
index 000000000000..3a4df6ad84b5
--- /dev/null
+++ b/keras/saving/legacy/losses_serialization_test.py
@@ -0,0 +1,213 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for Keras losses serialization.""" + +import os +import shutil + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras import layers +from keras import losses +from keras.optimizers import legacy as optimizer_legacy +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import losses_utils + +try: + import h5py +except ImportError: + h5py = None + + +# Custom loss class +class MyMeanAbsoluteError(losses.LossFunctionWrapper): + def __init__( + self, + reduction=losses_utils.ReductionV2.AUTO, + name="mean_absolute_error", + ): + super().__init__(my_mae, name=name, reduction=reduction) + + +# Custom loss function +def my_mae(y_true, y_pred): + return keras.backend.mean(tf.abs(y_pred - y_true), axis=-1) + + +def _get_multi_io_model(): + inp_1 = layers.Input(shape=(1,), name="input_1") + inp_2 = layers.Input(shape=(1,), name="input_2") + d = test_utils.Bias(name="output") + out_1 = d(inp_1) + out_2 = d(inp_2) + return keras.Model([inp_1, inp_2], [out_1, out_2]) + + +@test_combinations.run_all_keras_modes +@parameterized.named_parameters( + [ + dict(testcase_name="string", value="mae"), + dict(testcase_name="built_in_fn", value=losses.mae), + dict(testcase_name="built_in_class", value=losses.MeanAbsoluteError()), + dict(testcase_name="custom_fn", value=my_mae), + dict(testcase_name="custom_class", value=MyMeanAbsoluteError()), + dict(testcase_name="list_of_strings", value=["mae", "mae"]), + dict( + testcase_name="list_of_built_in_fns", value=[losses.mae, losses.mae] + ), + dict( + testcase_name="list_of_built_in_classes", + value=[losses.MeanAbsoluteError(), losses.MeanAbsoluteError()], + ), + dict(testcase_name="list_of_custom_fns", value=[my_mae, my_mae]), + dict( + testcase_name="list_of_custom_classes", + value=[MyMeanAbsoluteError(), MyMeanAbsoluteError()], + ), + dict( + testcase_name="dict_of_string", + value={ + "output": "mae", + "output_1": "mae", + }, + ), + dict( + testcase_name="dict_of_built_in_fn", + value={ + "output": losses.mae, + "output_1": losses.mae, + }, + ), + dict( + testcase_name="dict_of_built_in_class", + value={ + "output": losses.MeanAbsoluteError(), + "output_1": losses.MeanAbsoluteError(), + }, + ), + dict( + testcase_name="dict_of_custom_fn", + value={"output": my_mae, "output_1": my_mae}, + ), + dict( + testcase_name="dict_of_custom_class", + value={ + "output": MyMeanAbsoluteError(), + "output_1": MyMeanAbsoluteError(), + }, + ), + ] +) +class LossesSerialization(test_combinations.TestCase): + def setUp(self): + super(LossesSerialization, self).setUp() + tmpdir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, tmpdir) + self.model_filename = os.path.join(tmpdir, "tmp_model_loss.h5") + self.x = np.array([[0.0], [1.0], [2.0]], dtype="float32") + self.y = np.array([[0.5], [2.0], [3.5]], dtype="float32") + self.w = np.array([1.25, 0.5, 1.25], dtype="float32") + + def test_serializing_model_with_loss_with_custom_object_scope(self, value): + with keras.utils.custom_object_scope( + { + "MyMeanAbsoluteError": MyMeanAbsoluteError, + "my_mae": my_mae, + "Bias": test_utils.Bias, + } + ): + model = _get_multi_io_model() + model.compile( + optimizer_legacy.gradient_descent.SGD(0.1), + loss=value, + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + [self.x, self.x], + [self.y, self.y], + batch_size=3, + epochs=3, + 
sample_weight=[self.w, self.w], + ) + + # Assert training. + self.assertAllClose(history.history["loss"], [2.0, 1.6, 1.2], 1e-3) + eval_results = model.evaluate( + [self.x, self.x], + [self.y, self.y], + sample_weight=[self.w, self.w], + ) + + if h5py is None: + return + model.save(self.model_filename) + loaded_model = keras.models.load_model(self.model_filename) + loaded_model.predict([self.x, self.x]) + loaded_eval_results = loaded_model.evaluate( + [self.x, self.x], + [self.y, self.y], + sample_weight=[self.w, self.w], + ) + + # Assert all evaluation results are the same. + self.assertAllClose(eval_results, loaded_eval_results, 1e-9) + + def test_serializing_model_with_loss_with_custom_objects(self, value): + model = _get_multi_io_model() + model.compile( + optimizer_legacy.gradient_descent.SGD(0.1), + loss=value, + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + [self.x, self.x], + [self.y, self.y], + batch_size=3, + epochs=3, + sample_weight=[self.w, self.w], + ) + + # Assert training. + self.assertAllClose(history.history["loss"], [2.0, 1.6, 1.2], 1e-3) + eval_results = model.evaluate( + [self.x, self.x], [self.y, self.y], sample_weight=[self.w, self.w] + ) + + if h5py is None: + return + model.save(self.model_filename) + loaded_model = keras.models.load_model( + self.model_filename, + custom_objects={ + "MyMeanAbsoluteError": MyMeanAbsoluteError, + "my_mae": my_mae, + "Bias": test_utils.Bias, + }, + ) + loaded_model.predict([self.x, self.x]) + loaded_eval_results = loaded_model.evaluate( + [self.x, self.x], [self.y, self.y], sample_weight=[self.w, self.w] + ) + + # Assert all evaluation results are the same. + self.assertAllClose(eval_results, loaded_eval_results, 1e-9) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/saving/legacy/metrics_serialization_test.py b/keras/saving/legacy/metrics_serialization_test.py new file mode 100644 index 000000000000..9956657d0440 --- /dev/null +++ b/keras/saving/legacy/metrics_serialization_test.py @@ -0,0 +1,278 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras metrics serialization.""" + +import os +import shutil + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras import layers +from keras import metrics +from keras.optimizers import legacy as optimizer_legacy +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import custom_object_scope + +try: + import h5py +except ImportError: + h5py = None + + +# Custom metric +class MyMeanAbsoluteError(metrics.MeanMetricWrapper): + def __init__(self, name="my_mae", dtype=None): + super().__init__(_my_mae, name, dtype=dtype) + + +# Custom metric function +def _my_mae(y_true, y_pred): + return keras.backend.mean(tf.abs(y_pred - y_true), axis=-1) + + +def _get_multi_io_model(): + inp_1 = layers.Input(shape=(1,), name="input_1") + inp_2 = layers.Input(shape=(1,), name="input_2") + d = test_utils.Bias(name="output") + out_1 = d(inp_1) + out_2 = d(inp_2) + return keras.Model([inp_1, inp_2], [out_1, out_2]) + + +@test_combinations.run_all_keras_modes +@parameterized.named_parameters( + dict(testcase_name="string", value=["mae"]), + dict(testcase_name="built_in_fn", value=[metrics.mae]), + dict(testcase_name="built_in_class", value=[metrics.MeanAbsoluteError]), + dict(testcase_name="custom_fn", value=[_my_mae]), + dict(testcase_name="custom_class", value=[MyMeanAbsoluteError]), + dict( + testcase_name="list_of_built_in_fn_and_list", + value=[metrics.mae, [metrics.mae]], + ), + dict( + testcase_name="list_of_built_in_class_and_list", + value=[metrics.MeanAbsoluteError, [metrics.MeanAbsoluteError]], + ), + dict( + testcase_name="list_of_custom_fn_and_list", value=[_my_mae, [_my_mae]] + ), + dict( + testcase_name="list_of_custom_class_and_list", + value=[MyMeanAbsoluteError, [MyMeanAbsoluteError]], + ), + dict( + testcase_name="list_of_lists_of_custom_fns", + value=[[_my_mae], [_my_mae, "mae"]], + ), + dict( + testcase_name="list_of_lists_of_custom_classes", + value=[[MyMeanAbsoluteError], [MyMeanAbsoluteError, "mae"]], + ), + dict( + testcase_name="dict_of_list_of_string", + value={ + "output": ["mae"], + "output_1": ["mae"], + }, + ), + dict( + testcase_name="dict_of_list_of_built_in_fn", + value={ + "output": [metrics.mae], + "output_1": [metrics.mae], + }, + ), + dict( + testcase_name="dict_of_list_of_built_in_class", + value={ + "output": [metrics.MeanAbsoluteError], + "output_1": [metrics.MeanAbsoluteError], + }, + ), + dict( + testcase_name="dict_of_list_of_custom_fn", + value={ + "output": [_my_mae], + "output_1": [_my_mae], + }, + ), + dict( + testcase_name="dict_of_list_of_custom_class", + value={ + "output": [MyMeanAbsoluteError], + "output_1": [MyMeanAbsoluteError], + }, + ), + dict( + testcase_name="dict_of_string", + value={ + "output": "mae", + "output_1": "mae", + }, + ), + dict( + testcase_name="dict_of_built_in_fn", + value={ + "output": metrics.mae, + "output_1": metrics.mae, + }, + ), + dict( + testcase_name="dict_of_built_in_class", + value={ + "output": metrics.MeanAbsoluteError, + "output_1": metrics.MeanAbsoluteError, + }, + ), + dict( + testcase_name="dict_of_custom_fn", + value={"output": _my_mae, "output_1": _my_mae}, + ), + dict( + testcase_name="dict_of_custom_class", + value={ + "output": MyMeanAbsoluteError, + "output_1": MyMeanAbsoluteError, + }, + ), +) +class MetricsSerialization(test_combinations.TestCase): + def setUp(self): + 
super(MetricsSerialization, self).setUp() + tmpdir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, tmpdir) + self.model_filename = os.path.join(tmpdir, "tmp_model_metric.h5") + self.x = np.array([[0.0], [1.0], [2.0]], dtype="float32") + self.y = np.array([[0.5], [2.0], [3.5]], dtype="float32") + self.w = np.array([1.25, 0.5, 1.25], dtype="float32") + + def test_serializing_model_with_metric_with_custom_object_scope( + self, value + ): + def get_instance(x): + if isinstance(x, str): + return x + if isinstance(x, type) and issubclass(x, metrics.Metric): + return x() + return x + + metric_input = tf.nest.map_structure(get_instance, value) + weighted_metric_input = tf.nest.map_structure(get_instance, value) + + with custom_object_scope( + { + "MyMeanAbsoluteError": MyMeanAbsoluteError, + "_my_mae": _my_mae, + "Bias": test_utils.Bias, + } + ): + model = _get_multi_io_model() + model.compile( + optimizer_legacy.gradient_descent.SGD(0.1), + "mae", + metrics=metric_input, + weighted_metrics=weighted_metric_input, + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + [self.x, self.x], + [self.y, self.y], + batch_size=3, + epochs=3, + sample_weight=[self.w, self.w], + ) + + # Assert training. + self.assertAllClose(history.history["loss"], [2.0, 1.6, 1.2], 1e-3) + eval_results = model.evaluate( + [self.x, self.x], + [self.y, self.y], + sample_weight=[self.w, self.w], + ) + + if h5py is None: + return + model.save(self.model_filename) + loaded_model = keras.models.load_model(self.model_filename) + loaded_model.predict([self.x, self.x]) + loaded_eval_results = loaded_model.evaluate( + [self.x, self.x], + [self.y, self.y], + sample_weight=[self.w, self.w], + ) + + # Assert all evaluation results are the same. + self.assertAllClose(eval_results, loaded_eval_results, 1e-9) + + def test_serializing_model_with_metric_with_custom_objects(self, value): + def get_instance(x): + if isinstance(x, str): + return x + if isinstance(x, type) and issubclass(x, metrics.Metric): + return x() + return x + + metric_input = tf.nest.map_structure(get_instance, value) + weighted_metric_input = tf.nest.map_structure(get_instance, value) + + model = _get_multi_io_model() + model.compile( + optimizer_legacy.gradient_descent.SGD(0.1), + "mae", + metrics=metric_input, + weighted_metrics=weighted_metric_input, + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + [self.x, self.x], + [self.y, self.y], + batch_size=3, + epochs=3, + sample_weight=[self.w, self.w], + ) + + # Assert training. + self.assertAllClose(history.history["loss"], [2.0, 1.6, 1.2], 1e-3) + eval_results = model.evaluate( + [self.x, self.x], [self.y, self.y], sample_weight=[self.w, self.w] + ) + + if h5py is None: + return + model.save(self.model_filename) + loaded_model = keras.models.load_model( + self.model_filename, + custom_objects={ + "MyMeanAbsoluteError": MyMeanAbsoluteError, + "_my_mae": _my_mae, + "Bias": test_utils.Bias, + }, + ) + loaded_model.predict([self.x, self.x]) + loaded_eval_results = loaded_model.evaluate( + [self.x, self.x], [self.y, self.y], sample_weight=[self.w, self.w] + ) + + # Assert all evaluation results are the same. + self.assertAllClose(eval_results, loaded_eval_results, 1e-9) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/saving/legacy/model_config.py b/keras/saving/legacy/model_config.py new file mode 100644 index 000000000000..a916289b3ab6 --- /dev/null +++ b/keras/saving/legacy/model_config.py @@ -0,0 +1,125 @@ +# Copyright 2018 The TensorFlow Authors. 
All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functions that save the model's config into different formats."""
+
+# isort: off
+
+import threading
+from tensorflow.python.util.tf_export import keras_export
+from keras.saving.legacy import serialization
+
+MODULE_OBJECTS = threading.local()
+
+
+@keras_export("keras.models.model_from_config")
+def model_from_config(config, custom_objects=None):
+    """Instantiates a Keras model from its config.
+
+    Usage:
+    ```
+    # for a Functional API model
+    tf.keras.Model().from_config(model.get_config())
+
+    # for a Sequential model
+    tf.keras.Sequential().from_config(model.get_config())
+    ```
+
+    Args:
+        config: Configuration dictionary.
+        custom_objects: Optional dictionary mapping names
+            (strings) to custom classes or functions to be
+            considered during deserialization.
+
+    Returns:
+        A Keras model instance (uncompiled).
+
+    Raises:
+        TypeError: if `config` is not a dictionary.
+    """
+    if isinstance(config, list):
+        raise TypeError(
+            "`model_from_config` expects a dictionary, not a list. "
+            f"Received: config={config}. Did you mean to use "
+            "`Sequential.from_config(config)`?"
+        )
+    from keras import layers
+
+    global MODULE_OBJECTS
+
+    if not hasattr(MODULE_OBJECTS, "ALL_OBJECTS"):
+        layers.serialization.populate_deserializable_objects()
+        MODULE_OBJECTS.ALL_OBJECTS = layers.serialization.LOCAL.ALL_OBJECTS
+
+    return serialization.deserialize_keras_object(
+        config,
+        module_objects=MODULE_OBJECTS.ALL_OBJECTS,
+        custom_objects=custom_objects,
+        printable_module_name="layer",
+    )
+
+
+@keras_export("keras.models.model_from_yaml")
+def model_from_yaml(yaml_string, custom_objects=None):
+    """Parses a yaml model configuration file and returns a model instance.
+
+    Note: Since TF 2.6, this method is no longer supported and will raise a
+    RuntimeError.
+
+    Args:
+        yaml_string: YAML string or open file encoding a model configuration.
+        custom_objects: Optional dictionary mapping names
+            (strings) to custom classes or functions to be
+            considered during deserialization.
+
+    Returns:
+        A Keras model instance (uncompiled).
+
+    Raises:
+        RuntimeError: always, since this method has been removed because it
+            poses a security risk.
+    """
+    raise RuntimeError(
+        "Method `model_from_yaml()` has been removed due to security risk of "
+        "arbitrary code execution. Please use `Model.to_json()` and "
+        "`model_from_json()` instead."
+    )
+
+
+@keras_export("keras.models.model_from_json")
+def model_from_json(json_string, custom_objects=None):
+    """Parses a JSON model configuration string and returns a model instance.
+
+    Usage:
+
+    >>> model = tf.keras.Sequential([
+    ...     tf.keras.layers.Dense(5, input_shape=(3,)),
+    ...     tf.keras.layers.Softmax()])
+    >>> config = model.to_json()
+    >>> loaded_model = tf.keras.models.model_from_json(config)
+
+    Args:
+        json_string: JSON string encoding a model configuration.
+ custom_objects: Optional dictionary mapping names + (strings) to custom classes or functions to be + considered during deserialization. + + Returns: + A Keras model instance (uncompiled). + """ + from keras.layers import ( + deserialize_from_json, + ) + + return deserialize_from_json(json_string, custom_objects=custom_objects) diff --git a/keras/saving/legacy/save.py b/keras/saving/legacy/save.py new file mode 100644 index 000000000000..4c6a3825308f --- /dev/null +++ b/keras/saving/legacy/save.py @@ -0,0 +1,547 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras model saving code.""" + +import os + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.saving import object_registration +from keras.saving.legacy import hdf5_format +from keras.saving.legacy import saving_utils +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model import load as saved_model_load +from keras.saving.legacy.saved_model import load_context +from keras.saving.legacy.saved_model import save as saved_model_save +from keras.saving.legacy.saved_model.utils import keras_option_scope +from keras.utils import io_utils +from keras.utils import traceback_utils + +try: + import h5py +except ImportError: + h5py = None + + +@traceback_utils.filter_traceback +def save_model( + model, + filepath, + overwrite=True, + include_optimizer=True, + save_format=None, + signatures=None, + options=None, + save_traces=True, +): + """Saves a model as a TensorFlow SavedModel or HDF5 file. + + See the [Serialization and Saving + guide](https://keras.io/guides/serialization_and_saving/) for details. + + Usage: + + >>> model = tf.keras.Sequential([ + ... tf.keras.layers.Dense(5, input_shape=(3,)), + ... tf.keras.layers.Softmax()]) + >>> model.save('/tmp/model') + >>> loaded_model = tf.keras.models.load_model('/tmp/model') + >>> x = tf.random.uniform((10, 3)) + >>> assert np.allclose(model.predict(x), loaded_model.predict(x)) + + Note that `model.save()` is an alias for `tf.keras.models.save_model()`. + + The SavedModel and HDF5 file contains: + + - the model's configuration (topology) + - the model's weights + - the model's optimizer's state (if any) + + Thus models can be reinstantiated in the exact same state, without any of + the code used for model definition or training. + + Note that the model weights may have different scoped names after being + loaded. Scoped names include the model/layer names, such as + `"dense_1/kernel:0"`. It is recommended that you use the layer properties to + access specific variables, e.g. `model.get_layer("dense_1").kernel`. + + __SavedModel serialization format__ + + Keras SavedModel uses `tf.saved_model.save` to save the model and all + trackable objects attached to the model (e.g. layers and variables). The + model config, weights, and optimizer are saved in the SavedModel. 
+ Additionally, for every Keras layer attached to the model, the SavedModel + stores: + + * the config and metadata -- e.g. name, dtype, trainable status + * traced call and loss functions, which are stored as TensorFlow + subgraphs. + + The traced functions allow the SavedModel format to save and load custom + layers without the original class definition. + + You can choose to not save the traced functions by disabling the + `save_traces` option. This will decrease the time it takes to save the model + and the amount of disk space occupied by the output SavedModel. If you + enable this option, then you _must_ provide all custom class definitions + when loading the model. See the `custom_objects` argument in + `tf.keras.models.load_model`. + + Args: + model: Keras model instance to be saved. + filepath: One of the following: + - String or `pathlib.Path` object, path where to save the model + - `h5py.File` object where to save the model + overwrite: Whether we should overwrite any existing model at the target + location, or instead ask the user with a manual prompt. + include_optimizer: If True, save optimizer's state together. + save_format: Either 'tf' or 'h5', indicating whether to save the model + to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, and 'h5' + in TF 1.X. + signatures: Signatures to save with the SavedModel. Applicable to the + 'tf' format only. Please see the `signatures` argument in + `tf.saved_model.save` for details. + options: (only applies to SavedModel format) + `tf.saved_model.SaveOptions` object that specifies options for saving + to SavedModel. + save_traces: (only applies to SavedModel format) When enabled, the + SavedModel will store the function traces for each layer. This + can be disabled, so that only the configs of each layer are stored. + Defaults to `True`. Disabling this will decrease serialization time + and reduce file size, but it requires that all custom layers/models + implement a `get_config()` method. + + Raises: + ImportError: If save format is hdf5, and h5py is not available. + """ + + from keras.engine import sequential + + default_format = "tf" if tf.__internal__.tf2.enabled() else "h5" + save_format = save_format or default_format + + filepath = io_utils.path_to_string(filepath) + + # If the user has not already called fit or built the underlying metrics, we + # should do that before saving to ensure the metric names have all + # appropriate name transformations applied. + saving_utils.try_build_compiled_arguments(model) + + if ( + save_format == "h5" + or (h5py is not None and isinstance(filepath, h5py.File)) + or saving_utils.is_hdf5_filepath(filepath) + ): + # TODO(b/130258301): add utility method for detecting model type. + if not model._is_graph_network and not isinstance( + model, sequential.Sequential + ): + raise NotImplementedError( + "Saving the model to HDF5 format requires the model to be a " + "Functional model or a Sequential model. It does not work for " + "subclassed models, because such models are defined via the " + "body of a Python method, which isn't safely serializable. " + "Consider saving to the Tensorflow SavedModel format (by " + 'setting save_format="tf") or using `save_weights`.' 
+            )
+        hdf5_format.save_model_to_hdf5(
+            model, filepath, overwrite, include_optimizer
+        )
+    else:
+        with serialization.SharedObjectSavingScope():
+            with keras_option_scope(
+                save_traces=save_traces, in_tf_saved_model_scope=True
+            ):
+                saved_model_save.save(
+                    model,
+                    filepath,
+                    overwrite,
+                    include_optimizer,
+                    signatures,
+                    options,
+                    save_traces,
+                )
+
+
+@traceback_utils.filter_traceback
+def load_model(filepath, custom_objects=None, compile=True, options=None):
+    """Loads a model saved via `model.save()`.
+
+    Usage:
+
+    >>> model = tf.keras.Sequential([
+    ...     tf.keras.layers.Dense(5, input_shape=(3,)),
+    ...     tf.keras.layers.Softmax()])
+    >>> model.save('/tmp/model')
+    >>> loaded_model = tf.keras.models.load_model('/tmp/model')
+    >>> x = tf.random.uniform((10, 3))
+    >>> assert np.allclose(model.predict(x), loaded_model.predict(x))
+
+    Note that the model weights may have different scoped names after being
+    loaded. Scoped names include the model/layer names, such as
+    `"dense_1/kernel:0"`. It is recommended that you use the layer properties to
+    access specific variables, e.g. `model.get_layer("dense_1").kernel`.
+
+    Args:
+        filepath: One of the following:
+            - String or `pathlib.Path` object, path to the saved model
+            - `h5py.File` object from which to load the model
+        custom_objects: Optional dictionary mapping names
+            (strings) to custom classes or functions to be
+            considered during deserialization.
+        compile: Boolean, whether to compile the model
+            after loading.
+        options: Optional `tf.saved_model.LoadOptions` object that specifies
+            options for loading from SavedModel.
+
+    Returns:
+        A Keras model instance. If the original model was compiled, and saved
+        with the optimizer, then the returned model will be compiled.
+        Otherwise, the model will be left uncompiled. In the case that an
+        uncompiled model is returned, a warning is displayed if the `compile`
+        argument is set to `True`.
+
+    Raises:
+        ImportError: if loading from an hdf5 file and h5py is not available.
+        IOError: In case of an invalid savefile.
+    """
+    with serialization.SharedObjectLoadingScope():
+        custom_objects = custom_objects or {}
+        tlco = object_registration._THREAD_LOCAL_CUSTOM_OBJECTS.__dict__
+        gco = object_registration._GLOBAL_CUSTOM_OBJECTS
+        custom_objects = {**custom_objects, **tlco, **gco}
+        with object_registration.CustomObjectScope(custom_objects):
+            with keras_option_scope(
+                save_traces=False, in_tf_saved_model_scope=True
+            ):
+                with load_context.load_context(options):
+                    filepath_str = io_utils.path_to_string(filepath)
+                    if isinstance(filepath_str, str):
+                        if not tf.io.gfile.exists(filepath_str):
+                            raise IOError(
                                f"No file or directory found at {filepath_str}"
+                            )
+
+                        if tf.io.gfile.isdir(filepath_str):
+                            return saved_model_load.load(
+                                filepath_str, compile, options
+                            )
+                        else:
+                            if h5py is None:
+                                raise ImportError(
+                                    "Filepath looks like a hdf5 file but "
+                                    "h5py is not available."
+                                    f" filepath={filepath_str}"
+                                )
+                            return hdf5_format.load_model_from_hdf5(
+                                tf.io.gfile.GFile(filepath_str, mode="rb"),
+                                custom_objects,
+                                compile,
+                            )
+                    elif h5py is not None and isinstance(filepath, h5py.File):
+                        return hdf5_format.load_model_from_hdf5(
+                            filepath, custom_objects, compile
+                        )
+
+    raise IOError(
+        "Unable to load model. Filepath is not an hdf5 file (or h5py is not "
+        f"available) or SavedModel. Received: filepath={filepath}"
+    )
+
+
+def save_weights(
+    model, filepath, overwrite=True, save_format=None, options=None
+):
+    """Saves all layer weights.
+ + Either saves in HDF5 or in TensorFlow format based on the `save_format` + argument. + + When saving in HDF5 format, the weight file has: + - `layer_names` (attribute), a list of strings + (ordered names of model layers). + - For every layer, a `group` named `layer.name` + - For every such layer group, a group attribute `weight_names`, + a list of strings + (ordered names of weights tensor of the layer). + - For every weight in the layer, a dataset + storing the weight value, named after the weight tensor. + + When saving in TensorFlow format, all objects referenced by the network + are saved in the same format as `tf.train.Checkpoint`, including any + `Layer` instances or `Optimizer` instances assigned to object + attributes. For networks constructed from inputs and outputs using + `tf.keras.Model(inputs, outputs)`, `Layer` instances used by the network + are tracked/saved automatically. For user-defined classes which inherit + from `tf.keras.Model`, `Layer` instances must be assigned to object + attributes, typically in the constructor. See the documentation of + `tf.train.Checkpoint` and `tf.keras.Model` for details. + + While the formats are the same, do not mix `save_weights` and + `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should + be loaded using `Model.load_weights`. Checkpoints saved using + `tf.train.Checkpoint.save` should be restored using the corresponding + `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over + `save_weights` for training checkpoints. + + The TensorFlow format matches objects and variables by starting at a + root object, `self` for `save_weights`, and greedily matching attribute + names. For `Model.save` this is the `Model`, and for `Checkpoint.save` + this is the `Checkpoint` even if the `Checkpoint` has a model attached. + This means saving a `tf.keras.Model` using `save_weights` and loading + into a `tf.train.Checkpoint` with a `Model` attached (or vice versa) + will not match the `Model`'s variables. See the + [guide to training checkpoints]( + https://www.tensorflow.org/guide/checkpoint) for details on + the TensorFlow format. + + Args: + filepath: String or PathLike, path to the file to save the weights + to. When saving in TensorFlow format, this is the prefix used + for checkpoint files (multiple files are generated). Note that + the '.h5' suffix causes weights to be saved in HDF5 format. + overwrite: Whether to silently overwrite any existing file at the + target location, or provide the user with a manual prompt. + save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or + '.keras' will default to HDF5 if `save_format` is `None`. + Otherwise `None` defaults to 'tf'. + options: Optional `tf.train.CheckpointOptions` object that specifies + options for saving weights. + + Raises: + ImportError: If `h5py` is not available when attempting to save in + HDF5 format. + """ + model._assert_weights_created() + filepath = io_utils.path_to_string(filepath) + filepath_is_h5 = saving_utils.is_hdf5_filepath(filepath) + if save_format is None: + if filepath_is_h5: + save_format = "h5" + else: + save_format = "tf" + else: + user_format = save_format.lower().strip() + if user_format in ("tensorflow", "tf"): + save_format = "tf" + elif user_format in ("hdf5", "h5", "keras"): + save_format = "h5" + else: + raise ValueError( + f"Unknown format. Received: `save_format`={save_format}. " + 'Was expecting one of {"tf", "h5"}.' 
+ ) + if save_format == "tf" and filepath_is_h5: + raise ValueError( + 'save_weights got save_format="tf"/"tensorflow", but the ' + f"filepath ({filepath}) looks like an HDF5 file. " + 'Omit the ".h5"/".keras" when saving in TensorFlow format.' + ) + + if save_format == "h5" and h5py is None: + raise ImportError( + "`save_weights` requires h5py when saving in hdf5, but h5py is " + "not available. Try installing h5py package." + ) + if save_format == "tf": + check_filepath = filepath + ".index" + else: + check_filepath = filepath + # If file exists and should not be overwritten: + if not overwrite and os.path.isfile(check_filepath): + proceed = io_utils.ask_to_proceed_with_overwrite(check_filepath) + if not proceed: + return + if save_format == "h5": + with h5py.File(filepath, "w") as f: + hdf5_format.save_weights_to_hdf5_group(f, model) + else: + if not tf.executing_eagerly(): + # Call `get_session` to initialize any uninitialized variables. + backend.get_session() + model._checkpoint.write(filepath, options=options) + + # Record this checkpoint so it's visible from + # tf.train.latest_checkpoint. + tf.__internal__.train.update_checkpoint_state( + save_dir=os.path.dirname(filepath), + model_checkpoint_path=filepath, + save_relative_paths=True, + all_model_checkpoint_paths=[filepath], + ) + + +def load_weights( + model, filepath, by_name=False, skip_mismatch=False, options=None +): + """Loads all layer weights, either from a SavedModel or H5 weights file. + + If `by_name` is False weights are loaded based on the network's + topology. This means the architecture should be the same as when the + weights were saved. Note that layers that don't have weights are not + taken into account in the topological ordering, so adding or removing + layers is fine as long as they don't have weights. + + If `by_name` is True, weights are loaded into layers only if they share + the same name. This is useful for fine-tuning or transfer-learning + models where some of the layers have changed. + + Only topological loading (`by_name=False`) is supported when loading + weights from the TensorFlow format. Note that topological loading + differs slightly between TensorFlow and HDF5 formats for user-defined + classes inheriting from `tf.keras.Model`: HDF5 loads based on a + flattened list of weights, while the TensorFlow format loads based on + the object-local names of attributes to which layers are assigned in the + `Model`'s constructor. + + Args: + filepath: String, path to the weights file to load. For weight files + in TensorFlow format, this is the file prefix (the same as was + passed to `save_weights`). This can also be a path to a + SavedModel saved from `model.save`. + by_name: Boolean, whether to load weights by name or by topological + order. Only topological loading is supported for weight files in + TensorFlow format. + skip_mismatch: Boolean, whether to skip loading of layers where + there is a mismatch in the number of weights, or a mismatch in + the shape of the weight (only valid when `by_name=True`). + options: Optional `tf.train.CheckpointOptions` object that specifies + options for loading weights. + + Returns: + When loading a weight file in TensorFlow format, returns the same + status object as `tf.train.Checkpoint.restore`. When graph building, + restore ops are run automatically as soon as the network is built + (on first call for user-defined classes inheriting from `Model`, + immediately if it is already built). + + When loading weights in HDF5 format, returns `None`. 
+ + Raises: + ImportError: If `h5py` is not available and the weight file is in + HDF5 format. + ValueError: If `skip_mismatch` is set to `True` when `by_name` is + `False`. + """ + if backend.is_tpu_strategy(model._distribution_strategy): + if model._distribution_strategy.extended.steps_per_run > 1 and ( + not saving_utils.is_hdf5_filepath(filepath) + ): + spr = model._distribution_strategy.extended.steps_per_run + raise ValueError( + "Load weights is not implemented with TPUStrategy " + "with `steps_per_run` greater than 1. The " + f"`steps_per_run` is {spr}" + ) + if skip_mismatch and not by_name: + raise ValueError( + "When calling model.load_weights, skip_mismatch can only be " + "set to True when by_name is True." + ) + + filepath, save_format = _detect_save_format(filepath) + if save_format == "tf": + status = model._checkpoint.read(filepath, options) + if by_name: + raise NotImplementedError( + "Weights may only be loaded based on topology into Models " + "when loading TensorFlow-formatted weights " + "(got by_name=True to load_weights)." + ) + if not tf.executing_eagerly(): + session = backend.get_session() + # Restore existing variables (if any) immediately, and set up a + # streaming restore for any variables created in the future. + tf.__internal__.tracking.streaming_restore( + status=status, session=session + ) + status.assert_nontrivial_match() + else: + status = None + if h5py is None: + raise ImportError( + "`load_weights` requires h5py package when loading weights " + "from HDF5. Try installing h5py." + ) + if not model._is_graph_network and not model.built: + raise ValueError( + "Unable to load weights saved in HDF5 format into a " + "subclassed Model which has not created its variables yet. " + "Call the Model first, then load the weights." + ) + model._assert_weights_created() + with h5py.File(filepath, "r") as f: + if "layer_names" not in f.attrs and "model_weights" in f: + f = f["model_weights"] + if by_name: + hdf5_format.load_weights_from_hdf5_group_by_name( + f, model, skip_mismatch + ) + else: + hdf5_format.load_weights_from_hdf5_group(f, model) + + # Perform any layer defined finalization of the layer state. + for layer in model.layers: + layer.finalize_state() + return status + + +def _detect_save_format(filepath): + """Returns path to weights file and save format.""" + + filepath = io_utils.path_to_string(filepath) + if saving_utils.is_hdf5_filepath(filepath): + return filepath, "h5" + + # Filepath could be a TensorFlow checkpoint file prefix or SavedModel + # directory. It's possible for filepath to be both a prefix and directory. + # Prioritize checkpoint over SavedModel. + if _is_readable_tf_checkpoint(filepath): + save_format = "tf" + elif tf.saved_model.contains_saved_model(filepath): + ckpt_path = os.path.join( + filepath, + tf.saved_model.VARIABLES_DIRECTORY, + tf.saved_model.VARIABLES_FILENAME, + ) + if _is_readable_tf_checkpoint(ckpt_path): + filepath = ckpt_path + save_format = "tf" + else: + raise ValueError( + "Unable to load weights. filepath {} appears to be a " + "SavedModel directory, but checkpoint either doesn't " + "exist, or is incorrectly formatted.".format(filepath) + ) + else: + # Not a TensorFlow checkpoint. This filepath is likely an H5 file that + # doesn't have the hdf5/keras extensions. 
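+        # To illustrate the three outcomes on hypothetical inputs:
+        #   "weights.h5"   -> ("weights.h5", "h5") via the extension check
+        #   "ckpt/weights" -> ("ckpt/weights", "tf") via the checkpoint reader
+        #   "saved_model/" -> ("saved_model/variables/variables", "tf")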
+ save_format = "h5" + return filepath, save_format + + +def _is_readable_tf_checkpoint(filepath): + try: + tf.compat.v1.train.NewCheckpointReader(filepath) + return True + except tf.errors.DataLossError: + # The checkpoint is not readable in TensorFlow format. + return False + + +# Inject the load_model function to keras_deps to remove the dependency +# from TFLite to Keras. +tf.__internal__.register_load_model_function(load_model) diff --git a/keras/saving/legacy/save_test.py b/keras/saving/legacy/save_test.py new file mode 100644 index 000000000000..b9ec7d5d749f --- /dev/null +++ b/keras/saving/legacy/save_test.py @@ -0,0 +1,1555 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras model saving code.""" + +import collections +import os +import pathlib +import shutil +import tempfile + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras import losses +from keras import optimizers +from keras.engine import functional +from keras.engine import sequential +from keras.feature_column import dense_features +from keras.feature_column import sequence_feature_column as ksfc +from keras.layers import core +from keras.optimizers import optimizer_v1 +from keras.premade_models.linear import LinearModel +from keras.saving import object_registration +from keras.saving.legacy import model_config +from keras.saving.legacy import save +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model import utils as saved_model_utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + +try: + import h5py +except ImportError: + h5py = None + + +class TestSaveModel(tf.test.TestCase, parameterized.TestCase): + def setUp(self): + super().setUp() + self.model = test_utils.get_small_sequential_mlp(1, 2, 3) + self.subclassed_model = test_utils.get_small_subclass_mlp(1, 2) + + def assert_h5_format(self, path): + if h5py is not None: + self.assertTrue( + h5py.is_hdf5(path), + f"Model saved at path {path} is not a valid hdf5 file.", + ) + + def assert_saved_model(self, path): + tf.__internal__.saved_model.parse_saved_model(path) + + @test_utils.run_v2_only + def test_load_file_not_found(self): + path = pathlib.Path(self.get_temp_dir()) / "does_not_exist" + with self.assertRaisesRegex(IOError, "No file or directory found at"): + save.load_model(path) + + @test_utils.run_v2_only + def test_save_format_defaults(self): + path = os.path.join(self.get_temp_dir(), "model_path") + save.save_model(self.model, path) + self.assert_saved_model(path) + + @test_utils.run_v2_only + def test_save_format_defaults_pathlib(self): + path = pathlib.Path(self.get_temp_dir()) / "model_path" + save.save_model(self.model, path) + self.assert_saved_model(path) + + @test_utils.run_v2_only + def test_save_hdf5(self): + path = 
os.path.join(self.get_temp_dir(), "model") + save.save_model(self.model, path, save_format="h5") + self.assert_h5_format(path) + with self.assertRaisesRegex( + NotImplementedError, + "requires the model to be a Functional model " + "or a Sequential model.", + ): + save.save_model(self.subclassed_model, path, save_format="h5") + + @test_utils.run_v2_only + def test_save_load_hdf5_pathlib(self): + path = pathlib.Path(self.get_temp_dir()) / "model" + save.save_model(self.model, path, save_format="h5") + save.load_model(path) + + @test_utils.run_v2_only + def test_save_tf(self): + path = os.path.join(self.get_temp_dir(), "model") + save.save_model(self.model, path, save_format="tf") + self.assert_saved_model(path) + with self.assertRaisesRegex( + ValueError, + r"Model.*cannot be saved.*as opposed to `model.call\(\).*", + ): + save.save_model(self.subclassed_model, path, save_format="tf") + self.subclassed_model.predict(np.random.random((3, 5))) + save.save_model(self.subclassed_model, path, save_format="tf") + self.assert_saved_model(path) + + @test_utils.run_v2_only + def test_save_load_tf_string(self): + path = os.path.join(self.get_temp_dir(), "model") + save.save_model(self.model, path, save_format="tf") + save.load_model(path) + + @test_utils.run_v2_only + def test_save_load_tf_pathlib(self): + path = pathlib.Path(self.get_temp_dir()) / "model" + save.save_model(self.model, path, save_format="tf") + save.load_model(path) + + @test_utils.run_v2_only + def test_save_load_weights_tf_pathlib(self): + path = pathlib.Path(self.get_temp_dir()) / "model" + self.model.save_weights(path, save_format="tf") + self.model.load_weights(path) + + @test_utils.run_v2_only + def test_save_load_weights_hdf5_pathlib(self): + path = pathlib.Path(self.get_temp_dir()) / "model" + self.model.save_weights(path, save_format="h5") + self.model.load_weights(path) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_saving_h5_for_rnn_layers(self): + # See https://github.com/tensorflow/tensorflow/issues/35731 for details. + inputs = keras.Input([10, 91], name="train_input") + rnn_layers = [ + keras.layers.LSTMCell( + size, recurrent_dropout=0, name="rnn_cell%d" % i + ) + for i, size in enumerate([512, 512]) + ] + rnn_output = keras.layers.RNN( + rnn_layers, return_sequences=True, name="rnn_layer" + )(inputs) + pred_feat = keras.layers.Dense(91, name="prediction_features")( + rnn_output + ) + pred = keras.layers.Softmax()(pred_feat) + model = keras.Model(inputs=[inputs], outputs=[pred, pred_feat]) + path = os.path.join(self.get_temp_dir(), "model_path.h5") + model.save(path) + + # Make sure the variable name is unique. 
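+        # (A sketch of what unique names look like here: the stacked cells
+        # should yield e.g. ".../rnn_cell0/kernel:0" vs
+        # ".../rnn_cell1/kernel:0"; duplicated names previously broke HDF5
+        # saving, per the issue referenced above.)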
+        self.assertNotEqual(
+            rnn_layers[0].kernel.name, rnn_layers[1].kernel.name
+        )
+        self.assertIn("rnn_cell1", rnn_layers[1].kernel.name)
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["graph", "eager"])
+    )
+    def test_saving_optimizer_weights(self):
+        class MyModel(keras.Model):
+            def __init__(self):
+                super().__init__()
+                self.layer = keras.layers.Dense(1)
+
+            def call(self, x):
+                return self.layer(x)
+
+        path = os.path.join(self.get_temp_dir(), "weights_path")
+        x, y = np.ones((10, 10)), np.ones((10, 1))
+
+        model = MyModel()
+        model.compile("rmsprop", loss="bce")
+        model.train_on_batch(x, y)
+        model.reset_metrics()
+        model.save_weights(path, save_format="tf")
+
+        batch_loss = model.train_on_batch(x, y)
+
+        new_model = MyModel()
+        new_model.compile("rmsprop", loss="bce")
+        new_model.train_on_batch(x, y)
+        new_model.reset_metrics()
+
+        new_model.load_weights(path)
+        new_batch_loss = new_model.train_on_batch(x, y)
+
+        self.assertAllClose(batch_loss, new_batch_loss)
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["eager", "graph"])
+    )
+    def test_save_include_optimizer_false(self):
+        def get_variables(file_name):
+            reader = tf.train.load_checkpoint(
+                os.path.join(file_name, "variables/variables")
+            )
+            shape_from_key = reader.get_variable_to_shape_map()
+            return sorted(shape_from_key.keys())
+
+        path = os.path.join(self.get_temp_dir(), "no_optimizer")
+        x, y = np.ones((10, 10)), np.ones((10, 1))
+
+        model = keras.models.Sequential()
+        model.add(keras.layers.Dense(1))
+        model.compile("adam", loss="mse")
+        model.train_on_batch(x, y)
+        model.save(path, save_format="tf", include_optimizer=False)
+        variables = get_variables(path)
+
+        for v in variables:
+            self.assertNotIn("optimizer", v)
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["graph", "eager"])
+    )
+    def test_saving_model_with_custom_object(self):
+        with object_registration.custom_object_scope(), self.cached_session():
+
+            @object_registration.register_keras_serializable()
+            class CustomLoss(losses.MeanSquaredError):
+                pass
+
+            model = sequential.Sequential(
+                [core.Dense(units=1, input_shape=(1,))]
+            )
+            model.compile(optimizer="sgd", loss=CustomLoss())
+            model.fit(np.zeros([10, 1]), np.zeros([10, 1]))
+
+            temp_dir = self.get_temp_dir()
+            filepath = os.path.join(temp_dir, "saving")
+            model.save(filepath)
+
+            # Make sure the model can be correctly loaded back.
+            _ = save.load_model(filepath, compile=True)
+
+    def test_saving_model_with_name_conflict(self):
+        class Sequential(keras.Model):
+            def __init__(self):
+                super().__init__()
+                self.layer = keras.layers.Dense(1)
+
+            def call(self, x):
+                return self.layer(x)
+
+        model = Sequential()
+        model(tf.ones((10, 10)))
+        temp_dir = self.get_temp_dir()
+        filepath = os.path.join(temp_dir, "Sequential")
+
+        with self.assertLogs() as logs:
+            model.save(filepath, save_format="tf")
+
+        expected_substring = (
+            "has the same name 'Sequential' as a built-in Keras"
+        )
+        matched = [log for log in logs.output if expected_substring in log]
+        self.assertNotEmpty(matched)
+
+    def test_saving_built_in_model(self):
+        model = LinearModel()
+        model(tf.constant([[5.0]]))
+        temp_dir = self.get_temp_dir()
+        filepath = os.path.join(temp_dir, "LinearModel")
+        with self.assertLogs() as logs:
+            model.save(filepath, save_format="tf")
+
+        expected_substring = (
+            "has the same name 'LinearModel' as a built-in Keras"
+        )
+        matched = [log for log in logs.output if expected_substring in log]
+        # Check that a warning is *not* logged for a premade model.
+ self.assertEmpty(matched) + + +@object_registration.register_keras_serializable(package="Foo") +class RegisteredSubLayer(keras.layers.Layer): + pass + + +class TestJson(test_combinations.TestCase): + """Tests to_json()/from_json().""" + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_saving_with_dense_features(self): + cols = [ + tf.feature_column.numeric_column("a"), + tf.feature_column.indicator_column( + tf.feature_column.categorical_column_with_vocabulary_list( + "b", ["one", "two"] + ) + ), + ] + input_layers = { + "a": keras.layers.Input(shape=(1,), name="a"), + "b": keras.layers.Input(shape=(1,), name="b", dtype="string"), + } + + fc_layer = dense_features.DenseFeatures(cols)(input_layers) + output = keras.layers.Dense(10)(fc_layer) + + model = keras.models.Model(input_layers, output) + + model.compile( + loss=keras.losses.MSE, + optimizer="rmsprop", + metrics=[keras.metrics.categorical_accuracy], + ) + + config = model.to_json() + loaded_model = model_config.model_from_json(config) + + inputs_a = np.arange(10).reshape(10, 1) + inputs_b = np.arange(10).reshape(10, 1).astype("str") + + with self.cached_session(): + # Initialize tables for V1 lookup. + if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertLen( + loaded_model.predict({"a": inputs_a, "b": inputs_b}), 10 + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_saving_with_sequence_features(self): + cols = [ + tf.feature_column.sequence_numeric_column("a"), + tf.feature_column.indicator_column( + tf.feature_column.sequence_categorical_column_with_vocabulary_list( # noqa: E501 + "b", ["one", "two"] + ) + ), + ] + input_layers = { + "a": keras.layers.Input(shape=(None, 1), sparse=True, name="a"), + "b": keras.layers.Input( + shape=(None, 1), sparse=True, name="b", dtype="string" + ), + } + + fc_layer, _ = ksfc.SequenceFeatures(cols)(input_layers) + # TODO(tibell): Figure out the right dtype and apply masking. + # sequence_length_mask = array_ops.sequence_mask(sequence_length) + # x = keras.layers.GRU(32)(fc_layer, mask=sequence_length_mask) + x = keras.layers.GRU(32)(fc_layer) + output = keras.layers.Dense(10)(x) + + model = keras.models.Model(input_layers, output) + + model.compile( + loss=keras.losses.MSE, + optimizer="rmsprop", + metrics=[keras.metrics.categorical_accuracy], + ) + + config = model.to_json() + loaded_model = model_config.model_from_json(config) + + batch_size = 10 + timesteps = 1 + + values_a = np.arange(10, dtype=np.float32) + indices_a = np.zeros((10, 3), dtype=np.int64) + indices_a[:, 0] = np.arange(10) + inputs_a = tf.SparseTensor( + indices_a, values_a, (batch_size, timesteps, 1) + ) + + values_b = np.zeros(10, dtype=str) + indices_b = np.zeros((10, 3), dtype=np.int64) + indices_b[:, 0] = np.arange(10) + inputs_b = tf.SparseTensor( + indices_b, values_b, (batch_size, timesteps, 1) + ) + + with self.cached_session(): + # Initialize tables for V1 lookup. 
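+            # (In graph mode, the vocabulary lookup tables created by the
+            # feature columns are not initialized automatically, hence the
+            # explicit `tables_initializer` below.)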
+ if not tf.executing_eagerly(): + self.evaluate(tf.compat.v1.tables_initializer()) + + self.assertLen( + loaded_model.predict({"a": inputs_a, "b": inputs_b}, steps=1), + batch_size, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_nested_layers(self): + class MyLayer(keras.layers.Layer): + def __init__(self, sublayers, **kwargs): + super().__init__(**kwargs) + self.sublayers = sublayers + + def get_config(self): + config = super().get_config() + config["sublayers"] = self.sublayers + return config + + layer = MyLayer( + [ + keras.layers.Dense(2, name="MyDense"), + RegisteredSubLayer(name="MySubLayer"), + ] + ) + model = keras.Sequential([keras.Input([None]), layer]) + model_json = model.to_json() + + self.assertIn("Foo>RegisteredSubLayer", model_json) + + loaded_model = model_config.model_from_json( + model_json, custom_objects={"MyLayer": MyLayer} + ) + loaded_layer = loaded_model.layers[0] + self.assertIsInstance(loaded_layer.sublayers[0], keras.layers.Dense) + self.assertEqual(loaded_layer.sublayers[0].name, "MyDense") + self.assertIsInstance(loaded_layer.sublayers[1], RegisteredSubLayer) + self.assertEqual(loaded_layer.sublayers[1].name, "MySubLayer") + + +class MaskedTensor(tf.experimental.ExtensionType): + __name__ = "MaskedTensor_save_test" + values: tf.Tensor + mask: tf.Tensor + + class Spec(tf.TypeSpec): + @property + def shape(self): + return self.values.shape + + @property + def dtype(self): + return self.values.dtype + + def with_shape(self, shape): + values_spec = tf.TensorSpec( + shape, dtype=self.values.dtype, name=self.values.name + ) + mask_spec = tf.TensorSpec( + shape, dtype=self.mask.dtype, name=self.mask.name + ) + return MaskedTensor.Spec(values_spec, mask_spec) + + +@test_combinations.run_with_all_saved_model_formats +class TestWholeModelSaving(test_combinations.TestCase): + def _save_model_dir(self, dirname="saved_model"): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + return os.path.join(temp_dir, dirname) + + def _assert_same_weights_and_metrics(self, model, loaded_model): + """Checks that loaded weights & metrics are the same as the original. + + Args: + model: original model + loaded_model: loaded model + """ + self.assertAllClose(model.weights, loaded_model.weights) + + if loaded_model.optimizer: + if test_utils.get_save_format() == "tf": + # TODO(b/153110928): Keras TF format doesn't restore optimizer + # weights currently. + return + if isinstance( + loaded_model.optimizer, + keras.optimizers.optimizer.Optimizer, + ): + loaded_model.optimizer.build(loaded_model.trainable_variables) + self.assertAllClose( + model.optimizer.variables, + loaded_model.optimizer.variables, + ) + else: + self.assertAllClose( + model.optimizer.weights, loaded_model.optimizer.weights + ) + + # In V1/Graph mode, the model isn't built, so the metrics are not loaded + # immediately (requires model to be called on some data before building + # metrics). 
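+        # E.g. in V1 graph mode `loaded_model.metrics` stays empty until the
+        # model is called on data, so the name comparison below would be
+        # vacuous there.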
+ check_metrics = tf.__internal__.tf2.enabled() and tf.executing_eagerly() + + if check_metrics: + self.assertAllEqual( + [m.name for m in model.metrics], + [m.name for m in loaded_model.metrics], + ) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_save_and_load(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + save_kwargs = test_utils.get_save_kwargs() + + if ( + save_format == "h5" or not save_kwargs.get("save_traces", True) + ) and test_utils.get_model_type() == "subclass": + # HDF5 format currently does not allow saving subclassed models. + # When saving with `save_traces=False`, the subclassed model must + # have a get_config/from_config, which the autogenerated model does + # not have. + return + + with self.cached_session(): + model = test_utils.get_model_from_layers( + [ + keras.layers.Dense(2), + keras.layers.RepeatVector(3), + keras.layers.TimeDistributed(keras.layers.Dense(3)), + ], + input_shape=(3,), + ) + model.compile( + loss=keras.losses.MSE, + optimizer=keras.optimizers.legacy.rmsprop.RMSprop(lr=0.0001), + metrics=[ + keras.metrics.categorical_accuracy, + keras.metrics.CategoricalCrossentropy( + name="cce", label_smoothing=tf.constant(0.2) + ), + ], + weighted_metrics=[ + keras.metrics.categorical_crossentropy, + keras.metrics.CategoricalCrossentropy( + name="cce", label_smoothing=tf.constant(0.2) + ), + ], + sample_weight_mode="temporal", + ) + + x = np.random.random((1, 3)) + y = np.random.random((1, 3, 3)) + model.train_on_batch(x, y) + + out = model.predict(x) + keras.models.save_model( + model, saved_model_dir, save_format=save_format, **save_kwargs + ) + + loaded_model = keras.models.load_model(saved_model_dir) + self._assert_same_weights_and_metrics(model, loaded_model) + + out2 = loaded_model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + eval_out = model.evaluate(x, y) + eval_out2 = loaded_model.evaluate(x, y) + self.assertArrayNear(eval_out, eval_out2, 0.001) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sequential_model_saving_without_input_shape(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + with self.cached_session(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(2)) + model.add(keras.layers.RepeatVector(3)) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) + model.compile( + loss=keras.losses.MSE, + optimizer="rmsprop", + metrics=[ + keras.metrics.categorical_accuracy, + keras.metrics.CategoricalAccuracy(name="cat_acc"), + ], + weighted_metrics=[ + keras.metrics.categorical_accuracy, + keras.metrics.CategoricalAccuracy(name="cat_acc2"), + ], + sample_weight_mode="temporal", + ) + x = np.random.random((1, 3)) + y = np.random.random((1, 3, 3)) + model.train_on_batch(x, y) + + out = model.predict(x) + model.save(saved_model_dir, save_format=save_format) + + new_model = keras.models.load_model(saved_model_dir) + + self._assert_same_weights_and_metrics(model, new_model) + + out2 = new_model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sequential_model_saving_without_compile(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + with self.cached_session(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + 
model.add(keras.layers.RepeatVector(3)) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) + + x = np.random.random((1, 3)) + out = model.predict(x) + + # Save the model without any compilation or training. + keras.models.save_model( + model, saved_model_dir, save_format=save_format + ) + + new_model = keras.models.load_model(saved_model_dir) + self._assert_same_weights_and_metrics(model, new_model) + + out2 = new_model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + def test_sequential_model_saving_2(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + + with tf.Graph().as_default(), self.cached_session(): + # test with custom optimizer, loss + + class CustomOp(optimizer_v1.RMSprop): + pass + + def custom_loss(y_true, y_pred): + return keras.losses.mse(y_true, y_pred) + + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.Dense(3)) + model.compile( + loss=custom_loss, optimizer=CustomOp(), metrics=["acc"] + ) + + x = np.random.random((1, 3)) + y = np.random.random((1, 3)) + model.train_on_batch(x, y) + + out = model.predict(x) + keras.models.save_model( + model, saved_model_dir, save_format=save_format + ) + + new_model = keras.models.load_model( + saved_model_dir, + custom_objects={ + "CustomOp": CustomOp, + "custom_loss": custom_loss, + }, + ) + self._assert_same_weights_and_metrics(model, new_model) + + out2 = new_model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + def test_saving_without_compilation(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.Dense(3)) + model.compile(loss="mse", optimizer="sgd", metrics=["acc"]) + + keras.models.save_model(model, saved_model_dir, save_format=save_format) + model = keras.models.load_model(saved_model_dir) + + def test_saving_with_tf_optimizer(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.Dense(3)) + model.compile( + loss="mse", + optimizer=tf.compat.v1.train.AdadeltaOptimizer(0.1), + metrics=["acc"], + ) + + keras.models.save_model(model, saved_model_dir, save_format=save_format) + model = keras.models.load_model(saved_model_dir) + + def test_saving_right_after_compilation(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + with self.cached_session(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.Dense(3)) + model.compile(loss="mse", optimizer="sgd", metrics=["acc"]) + if not tf.compat.v1.executing_eagerly_outside_functions(): + model._make_train_function() + keras.models.save_model( + model, saved_model_dir, save_format=save_format + ) + model = keras.models.load_model(saved_model_dir) + + def test_saving_lambda_numpy_array_arguments(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + + if h5py is None: + self.skipTest("h5py required to run this test") + + mean = np.random.random((4, 2, 3)) + std = np.abs(np.random.random((4, 2, 3))) + 1e-5 + inputs = keras.layers.Input(shape=(4, 2, 3)) + output = keras.layers.Lambda( + lambda image, mu, std: (image - mu) / std, + arguments={"mu": mean, "std": std}, + )(inputs) + model = 
keras.models.Model(inputs, output)
+        model.compile(loss="mse", optimizer="sgd", metrics=["acc"])
+
+        keras.models.save_model(model, saved_model_dir, save_format=save_format)
+
+        model = keras.models.load_model(saved_model_dir)
+
+        self.assertAllClose(mean, model.layers[1].arguments["mu"])
+        self.assertAllClose(std, model.layers[1].arguments["std"])
+
+    def test_saving_model_with_long_layer_names(self):
+        saved_model_dir = self._save_model_dir()
+        save_format = test_utils.get_save_format()
+        with self.cached_session():
+            # This layer name will make the `layer_names` HDF5 attribute blow
+            # out of proportion. Note that it fits into the internal HDF5
+            # attribute memory limit on its own but because h5py converts
+            # the list of layer names into numpy array, which uses the same
+            # amount of memory for every item, it increases the memory
+            # requirements substantially.
+            x = keras.Input(shape=(2,), name="input_" + ("x" * (2**15)))
+            f = x
+            for i in range(4):
+                f = keras.layers.Dense(2, name="dense_%d" % (i,))(f)
+            model = keras.Model(inputs=[x], outputs=[f])
+            model.compile(
+                "adam", loss=keras.losses.MeanSquaredError(), metrics=["acc"]
+            )
+
+            x = np.random.random((1, 2))
+            y = np.random.random((1, 2))
+            model.train_on_batch(x, y)
+            out = model.predict(x)
+
+            keras.models.save_model(
+                model, saved_model_dir, save_format=save_format
+            )
+            model = keras.models.load_model(saved_model_dir)
+
+            if save_format in ["tf", "tensorflow"]:
+                return
+            # Check that the HDF5 file contains a chunked array
+            # of layer names.
+            with h5py.File(saved_model_dir, "r") as h5file:
+                num_names_arrays = len(
+                    [
+                        attr
+                        for attr in h5file["model_weights"].attrs
+                        if attr.startswith("layer_names")
+                    ]
+                )
+            # The chunking of the layer names array should have happened.
+            self.assertGreater(num_names_arrays, 0)
+            out2 = model.predict(x)
+            self.assertAllClose(out, out2, atol=1e-05)
+
+    def test_saving_model_with_long_weights_names(self):
+        saved_model_dir = self._save_model_dir()
+        save_format = test_utils.get_save_format()
+
+        with self.cached_session():
+            x = keras.Input(shape=(2,), name="nested_model_input")
+            f = x
+            for i in range(4):
+                f = keras.layers.Dense(2, name="nested_model_dense_%d" % (i,))(
+                    f
+                )
+            # This layer name will make the `weight_names`
+            # HDF5 attribute blow out of proportion.
+            f = keras.layers.Dense(
+                2, name="nested_model_output" + ("x" * (2**14))
+            )(f)
+            nested_model = keras.Model(
+                inputs=[x], outputs=[f], name="nested_model"
+            )
+
+            x = keras.Input(shape=(2,), name="outer_model_input")
+            f = nested_model(x)
+            f = keras.layers.Dense(2, name="outer_model_output")(f)
+
+            model = keras.Model(inputs=[x], outputs=[f])
+            model.compile(loss="mse", optimizer="adam", metrics=["acc"])
+
+            x = np.random.random((1, 2))
+            y = np.random.random((1, 2))
+            model.train_on_batch(x, y)
+            out = model.predict(x)
+
+            keras.models.save_model(
+                model, saved_model_dir, save_format=save_format
+            )
+            model = keras.models.load_model(saved_model_dir)
+
+            if save_format in ["h5", "hdf5", "keras"]:
+                # Check that the HDF5 file contains a chunked array
+                # of weight names.
+                with h5py.File(saved_model_dir, "r") as h5file:
+                    num_weight_arrays = len(
+                        [
+                            attr
+                            for attr in h5file["model_weights"][
+                                "nested_model"
+                            ].attrs
+                            if attr.startswith("weight_names")
+                        ]
+                    )
+                # The chunking of the weight names array should have happened.
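+                # (If chunking occurred, the group carries attributes named
+                # e.g. "weight_names0", "weight_names1", ..., which is why
+                # the comprehension above matches with `startswith`.)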
+ self.assertGreater(num_weight_arrays, 0) + out2 = model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + def test_model_saving_to_pre_created_h5py_file(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + with tf.Graph().as_default(), self.cached_session(): + inputs = keras.Input(shape=(3,)) + x = keras.layers.Dense(2)(inputs) + outputs = keras.layers.Dense(3)(x) + + model = keras.Model(inputs, outputs) + model.compile( + loss=keras.losses.MSE, + optimizer=optimizer_v1.Adam(), + metrics=[ + keras.metrics.categorical_accuracy, + keras.metrics.CategoricalAccuracy(), + ], + ) + x = np.random.random((1, 3)) + y = np.random.random((1, 3)) + model.train_on_batch(x, y) + + out = model.predict(x) + + keras.models.save_model( + model, saved_model_dir, save_format=save_format + ) + loaded_model = keras.models.load_model(saved_model_dir) + out1 = loaded_model.predict(x) + self.assertAllClose(out, out1, atol=1e-05) + if save_format in ["tf", "tensorflow"]: + return + + # Test h5 format specifically + fd, fname = tempfile.mkstemp(".h5") + with h5py.File(fname, mode="r+") as h5file: + keras.models.save_model(model, h5file) + loaded_model = keras.models.load_model(h5file) + out2 = loaded_model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + # Test non-default options in h5 + with h5py.File( + "_", driver="core", mode="w", backing_store=False + ) as h5file: + keras.models.save_model(model, h5file) + loaded_model = keras.models.load_model(h5file) + out2 = loaded_model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + # Cleanup + os.close(fd) + os.remove(fname) + + def test_model_saving_to_new_dir_path(self): + saved_model_dir = os.path.join( + self._save_model_dir(), "newdir", "saved_model" + ) + save_format = test_utils.get_save_format() + + with self.cached_session(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.RepeatVector(3)) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) + + x = np.random.random((1, 3)) + out = model.predict(x) + + keras.models.save_model( + model, saved_model_dir, save_format=save_format + ) + + new_model = keras.models.load_model(saved_model_dir) + self._assert_same_weights_and_metrics(model, new_model) + + out2 = new_model.predict(x) + self.assertAllClose(out, out2, atol=1e-05) + + def test_model_raise_exception_with_failed_saving(self): + if h5py is None: + self.skipTest("h5py required to run this test") + + saved_model_dir = self._save_model_dir() + saved_model_path = os.path.join(saved_model_dir, "saved_model.h5") + + with self.cached_session(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.RepeatVector(3)) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) + + with self.assertRaisesRegex(OSError, "Unable to create file"): + with h5py.File(saved_model_path, "w"): + keras.models.save_model(model, saved_model_path) + + def test_saving_constant_initializer_with_numpy(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + 2, + input_shape=(3,), + kernel_initializer=keras.initializers.Constant(np.ones((3, 2))), + ) + ) + model.add(keras.layers.Dense(3)) + model.compile(loss="mse", optimizer="sgd", metrics=["acc"]) + keras.models.save_model(model, saved_model_dir, save_format=save_format) + model = 
keras.models.load_model(saved_model_dir)
+
+    def test_saving_group_naming_h5py(self):
+        # Test saving a model with a layer whose name is a prefix of another
+        # layer's name.
+
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir)
+        h5_path = os.path.join(temp_dir, "test.h5")
+
+        input_layer = keras.layers.Input((None, None, 3), name="test_input")
+        x = keras.layers.Conv2D(1, 1, name="conv1/conv")(input_layer)
+        x = keras.layers.Activation("relu", name="conv1")(x)
+        model = keras.models.Model(inputs=input_layer, outputs=x)
+
+        model.save_weights(h5_path)
+        model.load_weights(h5_path)
+
+    def test_primitive_attrs_contain_no_extraneous_strings(self):
+        if h5py is None:
+            self.skipTest("h5py required to run this test")
+
+        saved_model_dir = self._save_model_dir()
+        save_format = test_utils.get_save_format()
+        model = keras.models.Sequential()
+        model.add(keras.layers.Dense(1, input_shape=[2]))
+        model.save(saved_model_dir, save_format=save_format)
+        if save_format in ["tf", "tensorflow"]:
+            return
+
+        h5file = h5py.File(saved_model_dir, "r")
+        self.assertRegex(
+            h5file.attrs["keras_version"], r"^[\d]+\.[\d]+\.[\S]+$"
+        )
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["graph", "eager"])
+    )
+    def test_functional_model_with_custom_loss_and_metric(self):
+        def _make_model():
+            inputs = keras.Input(shape=(4,))
+            x = keras.layers.Dense(8, activation="relu")(inputs)
+            outputs = keras.layers.Dense(3, activation="softmax")(x)
+            model = keras.Model(inputs=inputs, outputs=outputs)
+            custom_loss = keras.layers.Lambda(
+                lambda x: keras.backend.sum(x * x)
+            )(x)
+            model.add_loss(custom_loss)
+            model.add_metric(
+                custom_loss, aggregation="mean", name="custom_loss"
+            )
+            return model
+
+        saved_model_dir = self._save_model_dir()
+        save_format = test_utils.get_save_format()
+
+        with self.cached_session():
+            model = _make_model()
+            model.compile(
+                loss=keras.losses.SparseCategoricalCrossentropy(),
+                optimizer=optimizers.gradient_descent_legacy.SGD(),
+                metrics=[keras.metrics.SparseCategoricalCrossentropy()],
+            )
+            x = np.random.normal(size=(32, 4))
+            y = np.random.randint(0, 3, size=32)
+            model.train_on_batch(x, y)
+            evaluation_results = model.evaluate(x, y)
+            # Save and reload model.
+            model.save(saved_model_dir, save_format=save_format)
+            del model  # Prevent misuse.
+            loaded_model = keras.models.load_model(saved_model_dir)
+            loaded_model_eval_results = loaded_model.evaluate(x, y)
+            # Assert all evaluation results are the same.
+            self.assertAllClose(
+                evaluation_results, loaded_model_eval_results, 1e-9
+            )
+            # Check correctness of the loss calculation.
+            self.assertAllGreater(evaluation_results, 0.0)
+            evaluation_results = dict(
+                zip(loaded_model.metrics_names, evaluation_results)
+            )
+            self.assertNear(
+                evaluation_results["sparse_categorical_crossentropy"]
+                + evaluation_results["custom_loss"],
+                evaluation_results["loss"],
+                1e-6,
+            )
+
+    @test_combinations.generate(
+        test_combinations.combine(mode=["graph", "eager"])
+    )
+    def test_save_uncompiled_model_with_optimizer(self):
+        with self.cached_session() as session:
+            saved_model_dir = self._save_model_dir()
+            save_format = test_utils.get_save_format()
+            model = keras.models.Sequential(
+                [keras.layers.Dense(1, input_shape=(3,))]
+            )
+            # Set the model's optimizer but don't compile. This can happen if
+            # the model is trained with a custom training loop.
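+            # A rough sketch of such a loop (`loss_fn`, `x`, and `y` are
+            # placeholders here):
+            #   with tf.GradientTape() as tape:
+            #       loss = loss_fn(y, model(x))
+            #   grads = tape.gradient(loss, model.trainable_variables)
+            #   model.optimizer.apply_gradients(
+            #       zip(grads, model.trainable_variables)
+            #   )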
+ model.optimizer = keras.optimizers.legacy.rmsprop.RMSprop(lr=0.0001) + if not tf.executing_eagerly(): + session.run([v.initializer for v in model.variables]) + model.save(saved_model_dir, save_format=save_format) + + if save_format in ["tf", "tensorflow"]: + loaded = keras.models.load_model(saved_model_dir) + self.assertIsInstance( + loaded.optimizer, + keras.optimizers.legacy.optimizer_v2.OptimizerV2, + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_functional_model_with_getitem_op_layer(self): + inp = keras.Input(shape=(8)) + + out = inp[:] + model = keras.Model(inputs=[inp], outputs=out) + batch_size = 7 + x = tf.stack([tf.range(8) for _ in range(batch_size)]) + args = [x] + expected = x[:] + + self.assertAllEqual(model(args), expected) + self.assertAllEqual( + model.predict(args, batch_size=batch_size), expected + ) + + # Make sure it can be successfully saved and loaded. + save_format = test_utils.get_save_format() + saved_model_dir = self._save_model_dir() + keras.models.save_model(model, saved_model_dir, save_format=save_format) + + loaded_model = keras.models.load_model(saved_model_dir) + + self.assertAllEqual(loaded_model(args), expected) + self.assertAllEqual( + loaded_model.predict(args, batch_size=batch_size), expected + ) + + @test_combinations.generate( + test_combinations.combine(mode=["eager", "graph"]) + ) + def test_custom_functional_registered(self): + def _get_cls_definition(): + class CustomModel(keras.Model): + def c(self): + return "c" + + return CustomModel + + cls = _get_cls_definition() + self.assertEqual(cls.__bases__[0], keras.Model) + + with self.cached_session() as sess: + input_ = keras.layers.Input(shape=(1,)) + output = keras.layers.Dense(1)(input_) + model = cls(input_, output) + # `cls` now inherits from `Functional` class. + self.assertEqual(cls.__bases__[0], functional.Functional) + + if not tf.executing_eagerly(): + sess.run([v.initializer for v in model.variables]) + + save_format = test_utils.get_save_format() + saved_model_dir = self._save_model_dir() + keras.models.save_model( + model, saved_model_dir, save_format=save_format + ) + + loaded_model = keras.models.load_model( + saved_model_dir, custom_objects={"CustomModel": cls} + ) + self.assertIsInstance(loaded_model, cls) + + # Check with "new" `CustomModel` class definition. + new_cls = _get_cls_definition() + # The new `CustomModel` class is *not* derived from `Functional`. 
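+            # (Instantiating `cls(input_, output)` above routed through the
+            # functional API, which swapped the base class in place;
+            # `new_cls` has not been instantiated that way, so it still
+            # derives from `keras.Model`.)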
+ self.assertEqual(new_cls.__bases__[0], keras.Model) + reloaded_model = keras.models.load_model( + saved_model_dir, custom_objects={"CustomModel": new_cls} + ) + self.assertIsInstance(reloaded_model, new_cls) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_custom_sequential_registered_no_scope(self): + @object_registration.register_keras_serializable(package="my_package") + class MyDense(keras.layers.Dense): + def __init__(self, units, **kwargs): + super().__init__(units, **kwargs) + + input_shape = [1] + inputs = keras.Input(shape=input_shape) + custom_layer = MyDense(1) + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + + model = keras.Sequential(layers=[inputs, custom_layer]) + model.save(saved_model_dir, save_format=save_format) + loaded_model = keras.models.load_model(saved_model_dir) + + x = tf.constant([5]) + self.assertAllEqual(model(x), loaded_model(x)) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_custom_functional_registered_no_scope(self): + @object_registration.register_keras_serializable(package="my_package") + class MyDense(keras.layers.Dense): + def __init__(self, units, **kwargs): + super().__init__(units, **kwargs) + + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + input_shape = [1] + inputs = keras.Input(shape=input_shape) + outputs = MyDense(1)(inputs) + model = keras.Model(inputs, outputs) + + model.save(saved_model_dir, save_format=save_format) + loaded_model = keras.models.load_model(saved_model_dir) + + x = tf.constant([5]) + self.assertAllEqual(model(x), loaded_model(x)) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_shared_objects(self): + class OuterLayer(keras.layers.Layer): + def __init__(self, inner_layer): + super().__init__() + self.inner_layer = inner_layer + + def call(self, inputs): + return self.inner_layer(inputs) + + def get_config(self): + return { + "inner_layer": serialization.serialize_keras_object( + self.inner_layer + ) + } + + @classmethod + def from_config(cls, config): + return cls( + serialization.deserialize_keras_object( + config["inner_layer"] + ) + ) + + class InnerLayer(keras.layers.Layer): + def __init__(self): + super().__init__() + self.v = self.add_weight(name="v", shape=[], dtype=tf.float32) + + def call(self, inputs): + return self.v + inputs + + @classmethod + def from_config(cls, config): + return cls() + + # Create a model with 2 output layers that share the same inner layer. + inner_layer = InnerLayer() + outer_layer_1 = OuterLayer(inner_layer) + outer_layer_2 = OuterLayer(inner_layer) + input_ = keras.Input(shape=(1,)) + model = keras.Model( + inputs=input_, + outputs=[outer_layer_1(input_), outer_layer_2(input_)], + ) + + # Changes to the shared layer should affect both outputs. + model.layers[1].inner_layer.v.assign(5) + self.assertAllEqual(model(1), [6.0, 6.0]) + model.layers[1].inner_layer.v.assign(3) + self.assertAllEqual(model(1), [4.0, 4.0]) + + # After loading, changes to the shared layer should still affect both + # outputs. 
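+        # Reuse is recorded in the serialized config with a shared-object
+        # ID; the InnerLayer entry looks roughly like (a sketch):
+        #   {"class_name": "InnerLayer", "config": {...},
+        #    "shared_object_id": 1}
+        # so both OuterLayers can be wired back to a single instance.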
+ def _do_assertions(loaded): + loaded.layers[1].inner_layer.v.assign(5) + self.assertAllEqual(loaded(1), [6.0, 6.0]) + loaded.layers[1].inner_layer.v.assign(3) + self.assertAllEqual(loaded(1), [4.0, 4.0]) + loaded.layers[2].inner_layer.v.assign(5) + self.assertAllEqual(loaded(1), [6.0, 6.0]) + loaded.layers[2].inner_layer.v.assign(3) + self.assertAllEqual(loaded(1), [4.0, 4.0]) + + # We'd like to make sure we only attach shared object IDs when strictly + # necessary, so we'll recursively traverse the generated config to count + # whether we have the exact number we expect. + def _get_all_keys_recursive(dict_or_iterable): + if isinstance(dict_or_iterable, dict): + for key in dict_or_iterable.keys(): + yield key + for key in _get_all_keys_recursive(dict_or_iterable.values()): + yield key + elif isinstance(dict_or_iterable, str): + return + else: + try: + for item in dict_or_iterable: + for key in _get_all_keys_recursive(item): + yield key + # Not an iterable or dictionary + except TypeError: + return + + with object_registration.CustomObjectScope( + {"OuterLayer": OuterLayer, "InnerLayer": InnerLayer} + ): + # Test saving and loading to disk + save_format = test_utils.get_save_format() + saved_model_dir = self._save_model_dir() + keras.models.save_model( + model, saved_model_dir, save_format=save_format + ) + loaded = keras.models.load_model(saved_model_dir) + _do_assertions(loaded) + + # Test recreating directly from config + config = model.get_config() + key_count = collections.Counter(_get_all_keys_recursive(config)) + self.assertEqual(key_count[serialization.SHARED_OBJECT_KEY], 2) + loaded = keras.Model.from_config(config) + _do_assertions(loaded) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def test_shared_objects_wrapper(self): + """Tests that shared layers wrapped with `Wrapper` restore correctly.""" + input_ = keras.Input(shape=(1,)) + unwrapped = keras.layers.Layer(name="unwrapped") + wrapped = keras.layers.Wrapper(unwrapped, name="wrapped") + model = keras.Model( + inputs=input_, outputs=[unwrapped(input_), wrapped(input_)] + ) + + # Test recreating directly from config + config = model.get_config() + loaded = keras.Model.from_config(config) + self.assertIs(loaded.layers[1], loaded.layers[2].layer) + + # Test saving and loading to disk + save_format = test_utils.get_save_format() + saved_model_dir = self._save_model_dir() + keras.models.save_model(model, saved_model_dir, save_format=save_format) + loaded = keras.models.load_model(saved_model_dir) + self.assertIs(loaded.layers[1], loaded.layers[2].layer) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"], fit=[True, False]) + ) + def test_multi_output_metrics_name_stay_same(self, fit): + """Tests that metric names don't change with each save/load cycle. + + e.g. "head_0_accuracy" should not become "head_0_head_0_accuracy" after + saving and loading a model. + + Arguments: + fit: Whether the model should be fit before saving. + """ + # This doesn't work at all, so we can't check whether metric names are + # correct. 
+        if not tf.executing_eagerly() and not fit:
+            self.skipTest("b/181767784")
+
+        input_ = keras.Input((4,))
+        model = keras.Model(
+            input_,
+            [
+                keras.layers.Softmax(name="head_0")(
+                    keras.layers.Dense(3)(input_)
+                ),
+                keras.layers.Softmax(name="head_1")(
+                    keras.layers.Dense(5)(input_)
+                ),
+            ],
+        )
+        metric = keras.metrics.BinaryAccuracy()
+        model.compile(
+            optimizer="rmsprop",
+            loss="mse",
+            metrics={"head_0": [metric, "accuracy"]},
+        )
+
+        x = np.random.rand(2, 4)
+        y = {
+            "head_0": np.random.randint(2, size=(2, 3)),
+            "head_1": np.random.randint(2, size=(2, 5)),
+        }
+
+        # Make sure metric prefixing works the same regardless of whether the
+        # user has fit the model before saving.
+        if fit:
+            model.fit(x, y, verbose=0)
+
+        # Save and reload.
+        save_format = test_utils.get_save_format()
+        saved_model_dir = self._save_model_dir()
+        keras.models.save_model(model, saved_model_dir, save_format=save_format)
+        loaded = keras.models.load_model(saved_model_dir)
+
+        # Make sure the metrics names from the model before saving match the
+        # loaded model.
+        self.assertSequenceEqual(model.metrics_names, loaded.metrics_names)
+
+    # Test only in eager mode because ragged tensor inputs
+    # cannot be used in graph mode.
+    @test_combinations.generate(test_combinations.combine(mode=["eager"]))
+    @test_utils.run_v2_only
+    def test_save_functional_with_ragged_constant_input(self):
+        input1 = keras.Input(shape=[])
+        input2 = tf.ragged.constant([[1.0, 2.0], [3.0]])
+        outputs = keras.layers.Add()([input1, input2])
+        model = keras.Model(input1, outputs)
+        saved_model_dir = self._save_model_dir()
+        model.save(saved_model_dir)
+        keras.models.load_model(saved_model_dir)
+
+    @test_combinations.generate(test_combinations.combine(mode=["eager"]))
+    @test_utils.run_v2_only
+    def test_save_functional_with_constant_input(self):
+        input1 = keras.Input(shape=[2])
+        input2 = tf.constant([[1.0, 2.0]])
+        outputs = keras.layers.Add()([input1, input2])
+        model = keras.Model(input1, outputs)
+        saved_model_dir = self._save_model_dir()
+        model.save(saved_model_dir)
+        keras.models.load_model(saved_model_dir)
+
+    @test_combinations.generate(test_combinations.combine(mode=["eager"]))
+    @test_utils.run_v2_only
+    def test_save_functional_with_constant_string_input(self):
+        input1 = keras.Input(shape=[2], dtype=tf.string)
+        input2 = tf.constant([["単", "に"]])
+        outputs = keras.layers.Concatenate()([input1, input2])
+        model = keras.Model(input1, outputs)
+        saved_model_dir = self._save_model_dir()
+        model.save(saved_model_dir)
+        loaded_model = keras.models.load_model(saved_model_dir)
+        x = tf.constant([["a", "b"]])
+        self.assertAllEqual(model(x), loaded_model(x))
+
+    @test_combinations.generate(test_combinations.combine(mode=["eager"]))
+    @test_utils.run_v2_only
+    def test_save_functional_with_ragged_constant_string_input(self):
+        input1 = keras.Input(shape=[1], dtype=tf.string)
+        input2 = tf.ragged.constant([["単", "に"], ["単"]])
+        outputs = keras.layers.Concatenate(axis=0)([input1, input2])
+        model = keras.Model(input1, outputs)
+        saved_model_dir = self._save_model_dir()
+        model.save(saved_model_dir)
+        loaded_model = keras.models.load_model(saved_model_dir)
+        x = tf.constant([["a"]])
+        self.assertAllEqual(model(x), loaded_model(x))
+
+    @test_combinations.generate(test_combinations.combine(mode=["eager"]))
+    @test_utils.run_v2_only
+    def test_save_inputs_spec_with_composite_tensor_names(self):
+        class KerasModel(keras.Model):
+            def call(self, inputs):
+                return inputs
+
+        spec = MaskedTensor.Spec(
+            tf.TensorSpec([None],
name="x__values"), + tf.TensorSpec([None], dtype=tf.bool, name="x__mask"), + ) + km1 = KerasModel() + inputs = keras.Input(type_spec=spec) + km1(inputs) + self.assertEqual(km1.save_spec()[0][0].mask.name, "x__mask") + + +# Factory functions to create models that will be serialized inside a Network. +def _make_graph_network(input_size, output_size): + inputs = keras.Input(input_size) + x = keras.layers.Dense(8, activation="relu")(inputs) + y = keras.layers.Dense(output_size)(x) + return keras.Model(inputs=inputs, outputs=y) + + +def _make_sequential(input_size, output_size): + del input_size + return keras.Sequential( + [ + keras.layers.Dense(8, activation="relu"), + keras.layers.Dense(output_size), + ] + ) + + +def _make_sequential_built(input_size, output_size): + model = _make_sequential(input_size, output_size) + model.build((None, input_size)) + return model + + +def _make_sequential_graph_network(input_size, output_size): + return keras.Sequential( + [ + keras.layers.InputLayer(input_size), + keras.layers.Dense(8, activation="relu"), + keras.layers.Dense(output_size), + ] + ) + + +def _make_sequential_input_shape(input_size, output_size): + return keras.Sequential( + [ + keras.layers.Dense(8, activation="relu", input_shape=(input_size,)), + keras.layers.Dense(output_size), + ] + ) + + +class _make_subclassed(keras.Model): + def __init__(self, input_size, output_size): + super().__init__() + self._config = {"input_size": input_size, "output_size": output_size} + self._hidden_layer = keras.layers.Dense( + 8, activation="relu", name="hidden" + ) + self._logits_layer = keras.layers.Dense(output_size, name="logits") + + def call(self, inputs): + x = self._hidden_layer(inputs) + return self._logits_layer(x) + + def get_config(self): + return self._config + + @classmethod + def from_config(cls, config): + return cls(**config) + + +class _make_subclassed_built(_make_subclassed): + def __init__(self, input_size, output_size): + super().__init__(input_size, output_size) + self.build((None, input_size)) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class TestWholeModelSavingWithNesting(tf.test.TestCase, parameterized.TestCase): + """Tests saving a whole model that contains other models.""" + + @parameterized.named_parameters( + [ + ("graph_network", _make_graph_network), + ("sequential", _make_sequential), + ("sequential_built", _make_sequential_built), + ("sequential_graph_network", _make_sequential_graph_network), + ("sequential_input_shape", _make_sequential_input_shape), + ("subclassed", _make_subclassed), + ("subclassed_built", _make_subclassed_built), + ] + ) + def test_functional(self, model_fn): + """Tests serializing a model that uses a nested model to share + weights.""" + if h5py is None: + self.skipTest("h5py required to run this test") + + def _make_model(): + inputs = ( + keras.Input(shape=(4,), name="examples"), + keras.Input(shape=(4,), name="neighbors"), + ) + base_model = model_fn(inputs[0].shape.as_list()[-1], 2) + outputs = keras.layers.add( + [base_model(inputs[0]), base_model(inputs[1])] + ) + return keras.Model(inputs=inputs, outputs=outputs) + + with self.cached_session(): + x = ( + np.random.normal(size=(16, 4)).astype(np.float32), + np.random.normal(size=(16, 4)).astype(np.float32), + ) + model = _make_model() + predictions = model(x) + # Save and reload. 
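+            # (The subclassed variants survive the HDF5 round trip only
+            # because they implement get_config/from_config and are supplied
+            # via `custom_objects` in the load_model call below.)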
+ model_path = os.path.join(self.get_temp_dir(), "model.h5") + model.save(model_path) + del model + loaded_model = keras.models.load_model( + model_path, + custom_objects={ + "_make_subclassed": _make_subclassed, + "_make_subclassed_built": _make_subclassed_built, + }, + compile=False, + ) + self.assertAllClose(loaded_model(x), predictions, 1e-9) + + +if __name__ == "__main__": + with saved_model_utils.keras_option_scope( + save_traces=False, in_tf_saved_model_scope=True + ): + tf.test.main() diff --git a/keras/saving/legacy/save_weights_test.py b/keras/saving/legacy/save_weights_test.py new file mode 100644 index 000000000000..fbfcea017116 --- /dev/null +++ b/keras/saving/legacy/save_weights_test.py @@ -0,0 +1,764 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ,============================================================================ +"""Tests for model saving in the HDF5 format.""" + +import os +import shutil +import uuid + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras.engine import training +from keras.optimizers import optimizer_v1 +from keras.saving.legacy import hdf5_format +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + +try: + import h5py +except ImportError: + h5py = None + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class TestWeightSavingAndLoading(tf.test.TestCase, parameterized.TestCase): + def _save_model_dir(self, dirname="saved_model"): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + return os.path.join(temp_dir, dirname) + + @test_combinations.run_with_all_weight_formats + def test_weight_loading(self): + saved_model_dir = self._save_model_dir() + save_format = test_utils.get_save_format() + with self.cached_session(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3)(a) + b = keras.layers.Dense(1)(x) + model = keras.models.Model(a, b) + + x = np.random.random((3, 2)) + ref_y = model.predict(x) + weights = model.get_weights() + model.set_weights(weights) + y = model.predict(x) + self.assertAllClose(ref_y, y) + + with self.assertRaises(ValueError): + model.set_weights(weights[1:]) + with self.assertRaises(ValueError): + model.set_weights(weights[::-1]) + + model.save_weights(saved_model_dir, save_format=save_format) + model.load_weights(saved_model_dir) + y = model.predict(x) + self.assertAllClose(ref_y, y) + + def test_weight_preprocessing(self): + input_dim = 3 + output_dim = 3 + size = 2 + cases = [ + [ + (keras.layers.Bidirectional(keras.layers.SimpleRNN(2))), + [np.random.random((2, 1)), np.random.random((2, 1))], + (None, 3, 2), + ], + [ + (keras.layers.TimeDistributed(keras.layers.Dense(1))), + [np.random.random((2, 1)), np.random.random((1,))], + (None, 3, 2), + ], + [ + (keras.layers.Conv1D(output_dim, size, use_bias=False)), + [np.random.random((output_dim, input_dim, 
size, 1))], + (None, 4, input_dim), + ], + [ + ( + keras.layers.Conv2D( + output_dim, + size, + use_bias=False, + data_format="channels_first", + ) + ), + [np.random.random((output_dim, input_dim, size, size))], + (None, input_dim, 4, 4), + ], + [ + ( + keras.layers.Conv2DTranspose( + output_dim, + size, + use_bias=False, + data_format="channels_first", + ) + ), + [np.random.random((output_dim, input_dim, size, size))], + (None, input_dim, 4, 4), + ], + [ + ( + keras.layers.Conv2DTranspose( + output_dim, + size, + use_bias=False, + data_format="channels_last", + ) + ), + [np.random.random((size, size, input_dim, output_dim))], + (None, 4, 4, input_dim), + ], + [ + ( + keras.layers.Conv3D( + output_dim, + size, + use_bias=False, + data_format="channels_first", + ) + ), + [np.random.random((output_dim, input_dim, size, size, size))], + (None, input_dim, 4, 4, 4), + ], + [ + (keras.layers.GRUV1(output_dim)), + [ + np.random.random((input_dim, output_dim)), + np.random.random((output_dim, output_dim)), + np.random.random((output_dim,)), + np.random.random((input_dim, output_dim)), + np.random.random((output_dim, output_dim)), + np.random.random((output_dim,)), + np.random.random((input_dim, output_dim)), + np.random.random((output_dim, output_dim)), + np.random.random((output_dim,)), + ], + (None, 4, input_dim), + ], + [ + (keras.layers.LSTMV1(output_dim)), + [ + np.random.random((input_dim, output_dim)), + np.random.random((output_dim, output_dim)), + np.random.random((output_dim,)), + np.random.random((input_dim, output_dim)), + np.random.random((output_dim, output_dim)), + np.random.random((output_dim,)), + np.random.random((input_dim, output_dim)), + np.random.random((output_dim, output_dim)), + np.random.random((output_dim,)), + np.random.random((input_dim, output_dim)), + np.random.random((output_dim, output_dim)), + np.random.random((output_dim,)), + ], + (None, 4, input_dim), + ], + ] + for layer, weights, input_shape in cases: + layer.build(input_shape) + _ = hdf5_format.preprocess_weights_for_loading( + layer, weights, original_keras_version="1" + ) + + model = keras.models.Sequential([keras.layers.Dense(2, input_dim=2)]) + _ = hdf5_format.preprocess_weights_for_loading( + model, model.weights, original_keras_version="1" + ) + + x = keras.Input((2,)) + y = keras.layers.Dense(2)(x) + model = keras.models.Model(x, y) + _ = hdf5_format.preprocess_weights_for_loading( + model, model.weights, original_keras_version="1" + ) + + @parameterized.named_parameters( + ("gru", keras.layers.GRU, {"units": 2, "input_shape": (3, 5)}), + ( + "gru_with_reset_after", + keras.layers.GRU, + {"units": 2, "input_shape": (3, 5), "reset_after": True}, + ), + ("lstm", keras.layers.LSTM, {"units": 2, "input_shape": (3, 5)}), + ( + "cudnngru", + keras.layers.CuDNNGRU, + {"units": 2, "input_shape": (3, 5)}, + ), + ( + "cudnnlstm", + keras.layers.CuDNNLSTM, + {"units": 2, "input_shape": (3, 5)}, + ), + ) + def test_preprocess_weights_for_loading_rnn_should_be_idempotent( + self, layer_class, layer_args + ): + with self.cached_session(): + layer = layer_class(**layer_args) + layer.build(input_shape=layer_args.get("input_shape")) + weights1 = layer.get_weights() + weights2 = hdf5_format.preprocess_weights_for_loading( + layer, weights1 + ) + _ = [ + self.assertAllClose(x, y, rtol=1e-05) + for (x, y) in zip(weights1, weights2) + ] + + def test_sequential_weight_loading(self): + if h5py is None: + return + + h5_path = self._save_model_dir("test.h5") + + num_hidden = 5 + input_dim = 3 + batch_size = 5 + num_classes = 
2 + + with self.cached_session(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(num_hidden, input_dim=input_dim)) + model.add(keras.layers.Dense(num_classes)) + + x = np.random.random((batch_size, input_dim)) + ref_y = model.predict(x) + + model.save_weights(h5_path) + + model = keras.models.Sequential() + model.add(keras.layers.Dense(num_hidden, input_dim=input_dim)) + model.add(keras.layers.Dense(num_classes)) + model.load_weights(h5_path) + y = model.predict(x) + + self.assertAllClose(y, ref_y) + + @test_combinations.run_with_all_saved_model_formats( + exclude_formats=["tf_no_traces"] + ) + def test_nested_model_weight_loading(self): + save_format = test_utils.get_save_format() + saved_model_dir = self._save_model_dir() + + batch_size = 5 + shape = (None, None, 3) + + with self.cached_session(): + + def gen_model(): + def seq_model(): + model = keras.models.Sequential( + [ + keras.layers.Conv2D(3, 1, input_shape=shape), + keras.layers.BatchNormalization(), + ] + ) + return model + + x = inner_inputs = keras.layers.Input((None, None, 3)) + x = seq_model()(x) + x = seq_model()(x) + inner_model = keras.models.Model(inner_inputs, x) + + inputs = keras.layers.Input(shape) + return keras.models.Model(inputs, inner_model(inputs)) + + model = gen_model() + x = np.random.random((batch_size, 1, 1, 3)) + ref_y = model.predict(x) + + model.save_weights(saved_model_dir, save_format=save_format) + + model = gen_model() + model.load_weights(saved_model_dir) + y = model.predict(x) + + self.assertAllClose(y, ref_y) + + def test_sequential_weight_loading_group_name_with_incorrect_length(self): + if h5py is None: + return + + h5_path = self._save_model_dir("test.h5") + + num_hidden = 5 + input_dim = 3 + num_classes = 2 + with self.cached_session(): + ref_model = keras.models.Sequential() + ref_model.add( + keras.layers.Dense(num_hidden, input_dim=input_dim, name="d1") + ) + ref_model.add(keras.layers.Dense(num_classes, name="d2")) + ref_model.compile( + loss=keras.losses.MSE, + optimizer="rmsprop", + metrics=[keras.metrics.categorical_accuracy], + ) + + f_ref_model = h5py.File(h5_path, "w") + hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model) + + f_model = h5py.File(h5_path, "r") + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + num_hidden, use_bias=False, input_dim=input_dim, name="d1" + ) + ) + model.add(keras.layers.Dense(num_classes, name="d2")) + model.compile( + loss=keras.losses.MSE, + optimizer="rmsprop", + metrics=[keras.metrics.categorical_accuracy], + ) + with self.assertRaises( + ValueError, + msg=( + "Weight count mismatch for layer #0 (named d1). " + "Layer expects 1 weight(s). 
Received 2 saved weight(s)" + ), + ): + hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model) + + hdf5_format.load_weights_from_hdf5_group_by_name( + f_model, model, skip_mismatch=True + ) + self.assertAllClose( + keras.backend.get_value(ref_model.layers[1].kernel), + keras.backend.get_value(model.layers[1].kernel), + ) + + def test_sequential_weight_loading_group_name_with_incorrect_shape(self): + if h5py is None: + return + + h5_path = self._save_model_dir("test.h5") + + num_hidden = 5 + input_dim = 3 + num_classes = 2 + with tf.Graph().as_default(), self.cached_session(): + ref_model = keras.models.Sequential() + ref_model.add( + keras.layers.Dense(num_hidden, input_dim=input_dim, name="d1") + ) + ref_model.add(keras.layers.Dense(num_classes, name="d2")) + ref_model.compile( + loss=keras.losses.MSE, + optimizer=optimizer_v1.RMSprop(lr=0.0001), + metrics=[keras.metrics.categorical_accuracy], + ) + + f_ref_model = h5py.File(h5_path, "w") + keras.backend.set_value( + ref_model.layers[1].bias, [3.5] * num_classes + ) + hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model) + + f_model = h5py.File(h5_path, "r") + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + num_hidden + 5, input_dim=input_dim, name="d1" + ) + ) + model.add(keras.layers.Dense(num_classes, name="d2")) + model.compile( + loss=keras.losses.MSE, + optimizer=optimizer_v1.RMSprop(lr=0.0001), + metrics=[keras.metrics.categorical_accuracy], + ) + with self.assertRaises( + ValueError, + msg=( + "Shape mismatch in layer #0 (named d1) for weight " + "d1_1/kernel:0. Weight expects shape (3, 10). " + "Received saved weight with shape (3, 5)" + ), + ): + hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model) + + hdf5_format.load_weights_from_hdf5_group_by_name( + f_model, model, skip_mismatch=True + ) + self.assertAllClose( + [3.5] * num_classes, + keras.backend.get_value(model.layers[1].bias), + ) + + @test_combinations.run_with_all_saved_model_formats( + exclude_formats=["tf_no_traces"] + ) + @test_combinations.run_with_all_model_types + def test_load_weights_from_saved_model(self): + save_path = self._save_model_dir() + save_format = test_utils.get_save_format() + + if save_format == "h5" and test_utils.get_model_type() == "subclass": + # TODO(b/173646281): HDF5 format currently does not allow saving + # subclassed models. + return + + with self.cached_session(): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + data = np.random.random((1, 3)) + labels = np.random.random((1, 4)) + model.compile(loss="mse", optimizer="rmsprop") + model.fit(data, labels) + model.save(save_path, save_format=save_format) + new_model = test_utils.get_small_mlp(1, 4, input_dim=3) + if test_utils.get_model_type() == "subclass": + # Call on test data to build the model. 
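(Editorial aside: the `skip_mismatch` behavior exercised above through `hdf5_format` is also reachable from the public `Model.load_weights` API. A sketch, assuming a writable `/tmp` and hypothetical layer names:)

```
import tensorflow as tf

src = tf.keras.Sequential([
    tf.keras.layers.Dense(5, input_shape=(3,), name="d1"),
    tf.keras.layers.Dense(2, name="d2"),
])
src.save_weights("/tmp/demo_weights.h5")

dst = tf.keras.Sequential([
    tf.keras.layers.Dense(10, input_shape=(3,), name="d1"),  # shape mismatch
    tf.keras.layers.Dense(2, name="d2"),
])
# skip_mismatch=True logs a warning and skips "d1" instead of raising;
# "d2" is still restored by name.
dst.load_weights("/tmp/demo_weights.h5", by_name=True, skip_mismatch=True)
```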
+ new_model.predict(data) + new_model.load_weights(save_path) + self.assertAllClose(model.weights, new_model.weights) + + +class SubclassedModel(training.Model): + def __init__(self): + super().__init__() + self.x_layer = keras.layers.Dense(3) + self.b_layer = keras.layers.Dense(1) + + def call(self, a): + return self.b_layer(self.x_layer(a)) + + +class TestWeightSavingAndLoadingTFFormat( + tf.test.TestCase, parameterized.TestCase +): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_tensorflow_format_overwrite(self): + with self.cached_session() as session: + model = SubclassedModel() + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, "ckpt") + + x = tf.constant(np.random.random((3, 2)), dtype=tf.float32) + executing_eagerly = tf.executing_eagerly() + model(x) + if not executing_eagerly: + session.run([v.initializer for v in model.variables]) + model.save_weights(prefix, save_format="tensorflow") + model.save_weights(prefix, save_format="tensorflow", overwrite=True) + with self.assertRaises(EOFError): + # Indirectly tests that the user is prompted + model.save_weights( + prefix, save_format="tensorflow", overwrite=False + ) + + def test_no_default_session(self): + with tf.Graph().as_default(): + self.assertFalse(tf.compat.v1.get_default_session()) + data = np.random.random((1000, 32)).astype(np.float32) + labels = np.random.random((1000, 10)).astype(np.float32) + + model = keras.models.Sequential( + [ + keras.layers.Dense(10, activation="softmax"), + keras.layers.Dense(10, activation="softmax"), + ] + ) + + model.compile( + optimizer=tf.compat.v1.train.RMSPropOptimizer(0.001), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + model.fit(data, labels) + fname = os.path.join(self.get_temp_dir(), "weights", "ckpt") + model.save_weights(fname) + model.load_weights(fname) + + def test_no_graph_pollution(self): + with tf.compat.v1.get_default_graph().as_default(): + graph = tf.Graph() + with graph.as_default(), self.session(graph) as session: + model = SubclassedModel() + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, "ckpt") + + x = tf.constant(np.random.random((3, 2)), dtype=tf.float32) + model(x) + session.run([v.initializer for v in model.variables]) + model.save_weights(prefix, save_format="tensorflow") + op_count = len(graph.get_operations()) + model.save_weights(prefix, save_format="tensorflow") + self.assertLen(graph.get_operations(), op_count) + + model.load_weights(prefix) + op_count = len(graph.get_operations()) + model.load_weights(prefix) + self.assertLen(graph.get_operations(), op_count) + + def _weight_loading_test_template(self, make_model_fn): + with self.cached_session(): + model = make_model_fn() + model.compile( + loss="mse", + optimizer=tf.compat.v1.train.RMSPropOptimizer(0.1), + metrics=["acc", keras.metrics.CategoricalAccuracy()], + ) + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, "ckpt") + train_x = np.random.random((3, 2)) + train_y = np.random.random((3,)) + x = tf.constant(train_x, dtype=tf.float32) + + model.train_on_batch(train_x, train_y) + model.save_weights(prefix, save_format="tf") + ref_y_before_train = model.predict(train_x) + model.train_on_batch(train_x, train_y) + ref_y_after_train = model.predict(train_x) + for v in model.variables: + self.evaluate(v.assign(tf.random.normal(shape=tf.shape(v)))) + + self.addCleanup(shutil.rmtree, temp_dir) + + model.load_weights(prefix) + self.assertAllClose(ref_y_before_train, self.evaluate(model(x))) + + 
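(Editorial sketch: stripped of the test harness, the TF-format round trip that `_weight_loading_test_template` exercises here reduces to a few public API calls. `TinyModel` is a hypothetical stand-in for `SubclassedModel`; eager TF 2.x assumed:)

```
import os
import tempfile

import numpy as np
import tensorflow as tf


class TinyModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(3)

    def call(self, x):
        return self.dense(x)


model = TinyModel()
x = tf.constant(np.random.random((3, 2)), dtype=tf.float32)
ref_y = model(x)  # the first call builds the variables

prefix = os.path.join(tempfile.mkdtemp(), "ckpt")
model.save_weights(prefix, save_format="tf")

# Restore-on-create: weights are matched lazily as the fresh model builds.
restored = TinyModel()
status = restored.load_weights(prefix)
np.testing.assert_allclose(ref_y.numpy(), restored(x).numpy())
status.assert_existing_objects_matched()
```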
# Test restore-on-create if this is a subclassed Model (graph + # Networks will have already created their variables). + load_model = make_model_fn() + load_model.load_weights(prefix) + self.assertAllClose( + ref_y_before_train, self.evaluate(load_model(x)) + ) + load_model = make_model_fn() + load_model.load_weights(prefix) + # We need to run some of the restore ops for predict(), but not all + # variables have been created yet (optimizer slot variables). Tests + # incremental restore. + load_model.predict(train_x) + load_model.compile( + loss="mse", + optimizer=tf.compat.v1.train.RMSPropOptimizer(0.1), + metrics=["acc", keras.metrics.CategoricalAccuracy()], + ) + load_model.train_on_batch(train_x, train_y) + self.assertAllClose(ref_y_after_train, self.evaluate(load_model(x))) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_weight_loading_graph_model(self): + def _make_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3)(a) + b = keras.layers.Dense(1)(x) + return keras.models.Model(a, b) + + self._weight_loading_test_template(_make_graph_model) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_weight_loading_subclassed_model(self): + self._weight_loading_test_template(SubclassedModel) + + def _new_layer_weight_loading_test_template( + self, first_model_fn, second_model_fn + ): + with self.cached_session() as session: + model = first_model_fn() + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, "ckpt") + + x = tf.constant(np.random.random((3, 2)), dtype=tf.float32) + executing_eagerly = tf.executing_eagerly() + ref_y_tensor = model(x) + if not executing_eagerly: + session.run([v.initializer for v in model.variables]) + ref_y = self.evaluate(ref_y_tensor) + model.save_weights(prefix) + self.assertEqual(prefix, tf.train.latest_checkpoint(temp_dir)) + for v in model.variables: + self.evaluate(v.assign(tf.random.normal(shape=tf.shape(v)))) + + self.addCleanup(shutil.rmtree, temp_dir) + + second_model = second_model_fn() + status = second_model.load_weights(prefix) + second_model(x) + status.run_restore_ops() + second_model.save_weights(prefix) + # Check that the second model's checkpoint loads into the original + # model + status = model.load_weights(prefix) + status.run_restore_ops(session) + y = self.evaluate(model(x)) + self.assertAllClose(ref_y, y) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_weight_loading_graph_model_added_layer(self): + def _save_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, name="first")(a) + b = keras.layers.Dense(1, name="second")(x) + return keras.models.Model(a, b) + + def _restore_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, name="first")(a) + y = keras.layers.Dense(1, name="second")(x) + b = keras.layers.Dense(3, name="secondjr")(y) + return keras.models.Model(a, b) + + self._new_layer_weight_loading_test_template( + _save_graph_model, _restore_graph_model + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_weight_loading_graph_model_added_no_weight_layer(self): + def _save_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, name="first")(a) + b = keras.layers.Dense(1, name="second")(x) + return keras.models.Model(a, b) + + def _restore_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, 
name="first")(a) + b = keras.layers.Dense(1, name="second")(x) + y = keras.layers.Dropout(rate=0.1)(b) + return keras.models.Model(a, y) + + self._new_layer_weight_loading_test_template( + _save_graph_model, _restore_graph_model + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_weight_loading_subclassed_model_added_layer(self): + class SubclassedModelRestore(training.Model): + def __init__(self): + super().__init__() + self.x_layer = keras.layers.Dense(3) + self.y_layer = keras.layers.Dense(3) + self.b_layer = keras.layers.Dense(1) + + def call(self, a): + return self.b_layer(self.y_layer(self.x_layer(a))) + + self._new_layer_weight_loading_test_template( + SubclassedModel, SubclassedModelRestore + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_incompatible_checkpoint(self): + save_path = tf.train.Checkpoint().save( + os.path.join(self.get_temp_dir(), "ckpt") + ) + m = DummySubclassModel() + with self.assertRaisesRegex(AssertionError, "Nothing to load"): + m.load_weights(save_path) + m.dense = keras.layers.Dense(2) + m.dense(tf.constant([[1.0]])) + with self.assertRaisesRegex( + AssertionError, "Nothing except the root object matched" + ): + m.load_weights(save_path) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_directory_passed(self): + with self.cached_session(): + m = DummySubclassModel() + v = m.add_weight(name="v", shape=[]) + self.evaluate(v.assign(42.0)) + prefix = os.path.join( + self.get_temp_dir(), str(uuid.uuid4()), "ckpt/" + ) + m.save_weights(prefix) + self.evaluate(v.assign(2.0)) + m.load_weights(prefix) + self.assertEqual(42.0, self.evaluate(v)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_relative_path(self): + with self.cached_session(): + m = DummySubclassModel() + v = m.add_weight(name="v", shape=[]) + os.chdir(self.get_temp_dir()) + + prefix = "ackpt" + self.evaluate(v.assign(42.0)) + m.save_weights(prefix) + self.assertTrue(tf.io.gfile.exists("ackpt.index")) + self.evaluate(v.assign(1.0)) + m.load_weights(prefix) + self.assertEqual(42.0, self.evaluate(v)) + + prefix = "subdir/ackpt" + self.evaluate(v.assign(43.0)) + m.save_weights(prefix) + self.assertTrue(tf.io.gfile.exists("subdir/ackpt.index")) + self.evaluate(v.assign(2.0)) + m.load_weights(prefix) + self.assertEqual(43.0, self.evaluate(v)) + + prefix = "ackpt/" + self.evaluate(v.assign(44.0)) + m.save_weights(prefix) + self.assertTrue(tf.io.gfile.exists("ackpt/.index")) + self.evaluate(v.assign(3.0)) + m.load_weights(prefix) + self.assertEqual(44.0, self.evaluate(v)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_nonexistent_prefix_directory(self): + with self.cached_session(): + m = DummySubclassModel() + v = m.add_weight(name="v", shape=[]) + self.evaluate(v.assign(42.0)) + prefix = os.path.join( + self.get_temp_dir(), str(uuid.uuid4()), "bckpt" + ) + m.save_weights(prefix) + self.evaluate(v.assign(2.0)) + m.load_weights(prefix) + self.assertEqual(42.0, self.evaluate(v)) + + +class DummySubclassModel(training.Model): + pass + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/saving/saved_model/BUILD b/keras/saving/legacy/saved_model/BUILD similarity index 89% rename from keras/saving/saved_model/BUILD rename to keras/saving/legacy/saved_model/BUILD index 58672e0776d1..ac954f803596 100644 --- a/keras/saving/saved_model/BUILD 
+++ b/keras/saving/legacy/saved_model/BUILD @@ -18,9 +18,12 @@ # buildifier: disable=same-origin-load +# Placeholder: load unaliased py_library +# Placeholder: load unaliased py_binary load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras/layers/rnn:__pkg__", "//keras/saving:__subpackages__", @@ -39,6 +42,16 @@ py_library( visibility = ["//visibility:private"], ) +py_library( + name = "utils", + srcs = ["utils.py"], + deps = [ + "//:expect_tensorflow_installed", + "//keras/engine:base_layer_utils", + "//keras/utils:layer_utils", + ], +) + py_library( name = "saved_model", srcs = [ @@ -54,11 +67,11 @@ py_library( "save.py", "save_impl.py", "serialized_attributes.py", - "utils.py", ], srcs_version = "PY3", deps = [ ":order_preserving_set", + ":utils", "//:expect_tensorflow_installed", "//keras/utils:generic_utils", ], @@ -104,6 +117,7 @@ tf_py_test( python_version = "PY3", shard_count = 4, tags = [ + "no_oss", # TODO(b/296236267) "no_pip", # TODO(b/202022379) "no_rocm", "no_windows", diff --git a/keras/saving/saved_model/README.md b/keras/saving/legacy/saved_model/README.md similarity index 100% rename from keras/saving/saved_model/README.md rename to keras/saving/legacy/saved_model/README.md diff --git a/keras/saving/legacy/saved_model/__init__.py b/keras/saving/legacy/saved_model/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/keras/saving/legacy/saved_model/base_serialization.py b/keras/saving/legacy/saved_model/base_serialization.py new file mode 100644 index 000000000000..51057c084dd7 --- /dev/null +++ b/keras/saving/legacy/saved_model/base_serialization.py @@ -0,0 +1,141 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Helper classes that list&validate all attributes to serialize to +SavedModel.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +from keras.saving.legacy.saved_model import json_utils +from keras.saving.legacy.saved_model import utils + + +class SavedModelSaver(object, metaclass=abc.ABCMeta): + """Saver defining the methods and properties used to serialize Keras + objects.""" + + def __init__(self, obj): + self.obj = obj + + @abc.abstractproperty + def object_identifier(self): + """String stored in object identifier field in the SavedModel proto. + + Returns: + A string with the object identifier, which is used at load time. + """ + raise NotImplementedError + + @property + def tracking_metadata(self): + """String stored in metadata field in the SavedModel proto. + + Returns: + A serialized JSON storing information necessary for recreating this + layer. 
+ """ + # TODO(kathywu): check that serialized JSON can be loaded (e.g., if an + # object is in the python property) + return json_utils.Encoder().encode(self.python_properties) + + def trackable_children(self, serialization_cache): + """Lists all Trackable children connected to this object.""" + if not utils.should_save_traces(): + return {} + + children = self.objects_to_serialize(serialization_cache) + children.update(self.functions_to_serialize(serialization_cache)) + return children + + @abc.abstractproperty + def python_properties(self): + """Returns dictionary of python properties to save in the metadata. + + This dictionary must be serializable and deserializable to/from JSON. + + When loading, the items in this dict are used to initialize the object + and define attributes in the revived object. + """ + raise NotImplementedError + + @abc.abstractmethod + def objects_to_serialize(self, serialization_cache): + """Returns dictionary of extra checkpointable objects to serialize. + + See `functions_to_serialize` for an explanation of this function's + effects. + + Args: + serialization_cache: Dictionary passed to all objects in the same + object graph during serialization. + + Returns: + A dictionary mapping attribute names to checkpointable objects. + """ + raise NotImplementedError + + @abc.abstractmethod + def functions_to_serialize(self, serialization_cache): + """Returns extra functions to include when serializing a Keras object. + + Normally, when calling exporting an object to SavedModel, only the + functions and objects defined by the user are saved. For example: + + ``` + obj = tf.Module() + obj.v = tf.Variable(1.) + + @tf.function + def foo(...): ... + + obj.foo = foo + + w = tf.Variable(1.) + + tf.saved_model.save(obj, 'path/to/saved/model') + loaded = tf.saved_model.load('path/to/saved/model') + + loaded.v # Variable with the same value as obj.v + loaded.foo # Equivalent to obj.foo + loaded.w # AttributeError + ``` + + Assigning trackable objects to attributes creates a graph, which is used + for both checkpointing and SavedModel serialization. + + When the graph generated from attribute tracking is insufficient, extra + objects and functions may be added at serialization time. For example, + most models do not have their call function wrapped with a @tf.function + decorator. This results in `model.call` not being saved. Since Keras + objects should be revivable from the SavedModel format, the call + function is added as an extra function to serialize. + + This function and `objects_to_serialize` is called multiple times when + exporting to SavedModel. Please use the cache to avoid generating new + functions and objects. A fresh cache is created for each SavedModel + export. + + Args: + serialization_cache: Dictionary passed to all objects in the same + object graph during serialization. + + Returns: + A dictionary mapping attribute names to `Function` or + `ConcreteFunction`. + """ + raise NotImplementedError diff --git a/keras/saving/saved_model/constants.py b/keras/saving/legacy/saved_model/constants.py similarity index 76% rename from keras/saving/saved_model/constants.py rename to keras/saving/legacy/saved_model/constants.py index fae2c1bd07bc..c505586310c1 100644 --- a/keras/saving/saved_model/constants.py +++ b/keras/saving/legacy/saved_model/constants.py @@ -17,24 +17,24 @@ # Namespace used to store all attributes added during serialization. # e.g. the list of layers can be accessed using `loaded.keras_api.layers`, in an # object loaded from `tf.saved_model.load()`. 
-KERAS_ATTR = 'keras_api' +KERAS_ATTR = "keras_api" # Keys for the serialization cache. # Maps to the keras serialization dict {Layer --> SerializedAttributes object} -KERAS_CACHE_KEY = 'keras_serialized_attributes' +KERAS_CACHE_KEY = "keras_serialized_attributes" # Name of Keras metadata file stored in the SavedModel. -SAVED_METADATA_PATH = 'keras_metadata.pb' +SAVED_METADATA_PATH = "keras_metadata.pb" # Names of SavedObject Keras identifiers. -INPUT_LAYER_IDENTIFIER = '_tf_keras_input_layer' -LAYER_IDENTIFIER = '_tf_keras_layer' -METRIC_IDENTIFIER = '_tf_keras_metric' -MODEL_IDENTIFIER = '_tf_keras_model' -NETWORK_IDENTIFIER = '_tf_keras_network' -RNN_LAYER_IDENTIFIER = '_tf_keras_rnn_layer' -SEQUENTIAL_IDENTIFIER = '_tf_keras_sequential' +INPUT_LAYER_IDENTIFIER = "_tf_keras_input_layer" +LAYER_IDENTIFIER = "_tf_keras_layer" +METRIC_IDENTIFIER = "_tf_keras_metric" +MODEL_IDENTIFIER = "_tf_keras_model" +NETWORK_IDENTIFIER = "_tf_keras_network" +RNN_LAYER_IDENTIFIER = "_tf_keras_rnn_layer" +SEQUENTIAL_IDENTIFIER = "_tf_keras_sequential" KERAS_OBJECT_IDENTIFIERS = ( INPUT_LAYER_IDENTIFIER, diff --git a/keras/saving/legacy/saved_model/create_test_saved_model.py b/keras/saving/legacy/saved_model/create_test_saved_model.py new file mode 100644 index 000000000000..5a281df9c41d --- /dev/null +++ b/keras/saving/legacy/saved_model/create_test_saved_model.py @@ -0,0 +1,36 @@ +"""A binary that creates a serialized SavedModel from a keras model. + +This is used in tests to ensure that model serialization is deterministic across +different processes. +""" + +import tensorflow.compat.v2 as tf +from absl import app +from absl import flags + +from keras import regularizers +from keras.testing_infra import test_utils + +flags.DEFINE_string("output_path", "", "The path to write the SavedModel at.") + +FLAGS = flags.FLAGS + + +def main(_) -> None: + with test_utils.model_type_scope("functional"): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.layers[-1].activity_regularizer = regularizers.get("l2") + model.activity_regularizer = regularizers.get("l2") + model.compile(loss="mse", optimizer="rmsprop") + + def callable_loss(): + return tf.reduce_sum(model.weights[0]) + + model.add_loss(callable_loss) + + print(f"_____Writing saved model to: {FLAGS.output_path}") + model.save(FLAGS.output_path) + + +if __name__ == "__main__": + app.run(main) diff --git a/keras/saving/legacy/saved_model/determinism_test.py b/keras/saving/legacy/saved_model/determinism_test.py new file mode 100755 index 000000000000..dc9d8835d857 --- /dev/null +++ b/keras/saving/legacy/saved_model/determinism_test.py @@ -0,0 +1,33 @@ +"""Saves the same model twice and ensures that they are serialized the same.""" + +import subprocess + +import tensorflow.compat.v2 as tf +from absl import flags +from tensorflow.core.protobuf import saved_model_pb2 + +FLAGS = flags.FLAGS + + +class DeterminismTest(tf.test.TestCase): + def test_saving_is_deterministic(self): + create_saved_model = f"{FLAGS.test_srcdir}/create_test_saved_model.par" + saved_model_a_path = f"{FLAGS.test_tmpdir}/a" + saved_model_b_path = f"{FLAGS.test_tmpdir}/b" + + save_a = subprocess.Popen( + [create_saved_model, "--output_path", saved_model_a_path] + ) + save_b = subprocess.Popen( + [create_saved_model, "--output_path", saved_model_b_path] + ) + save_a.wait() + save_b.wait() + saved_model_a = saved_model_pb2.SavedModel() + with tf.io.gfile.GFile(f"{saved_model_a_path}/saved_model.pb", "rb") as f: + saved_model_a.MergeFromString(f.read()) + saved_model_b = saved_model_pb2.SavedModel() + with tf.io.gfile.GFile(f"{saved_model_b_path}/saved_model.pb", "rb") as f: + saved_model_b.MergeFromString(f.read()) + + self.assertProtoEquals(saved_model_a, saved_model_b)
diff --git a/keras/saving/legacy/saved_model/json_utils.py b/keras/saving/legacy/saved_model/json_utils.py new file mode 100644 index 000000000000..05b0e285be75 --- /dev/null +++ b/keras/saving/legacy/saved_model/json_utils.py @@ -0,0 +1,237 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utils for creating and loading the Layer metadata for SavedModel. + +These are required to retain the original format of the build input shape, since +layers and models may have different build behaviors depending on whether the +shape is a list, tuple, or TensorShape. For example, Network.build() will create +separate inputs if the given input_shape is a list, and will create a single +input if the given shape is a tuple. +""" + +import collections +import enum +import functools +import json + +import numpy as np +import tensorflow.compat.v2 as tf +import wrapt + +from keras.saving import serialization_lib +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model.utils import in_tf_saved_model_scope + +# isort: off +from tensorflow.python.framework import type_spec_registry + +_EXTENSION_TYPE_SPEC = "_EXTENSION_TYPE_SPEC" + + +class Encoder(json.JSONEncoder): + """JSON encoder and decoder that handles TensorShapes and tuples.""" + + def default(self, obj): + """Encodes objects for types that aren't handled by the default + encoder.""" + if isinstance(obj, tf.TensorShape): + items = obj.as_list() if obj.rank is not None else None + return {"class_name": "TensorShape", "items": items} + return get_json_type(obj) + + def encode(self, obj): + return super().encode(_encode_tuple(obj)) + + +def _encode_tuple(x): + if isinstance(x, tuple): + return { + "class_name": "__tuple__", + "items": tuple(_encode_tuple(i) for i in x), + } + elif isinstance(x, list): + return [_encode_tuple(i) for i in x] + elif isinstance(x, dict): + return {key: _encode_tuple(value) for key, value in x.items()} + else: + return x + + +def decode(json_string): + return json.loads(json_string, object_hook=_decode_helper) + + +def decode_and_deserialize( + json_string, module_objects=None, custom_objects=None +): + """Decodes the JSON and deserializes any Keras objects found in the dict.""" + return json.loads( + json_string, + object_hook=functools.partial( + _decode_helper, + deserialize=True, + module_objects=module_objects, + custom_objects=custom_objects, + ), + ) + + +def _decode_helper( + obj, deserialize=False, module_objects=None, custom_objects=None +): + """A decoding helper that is TF-object aware. + + Args: + obj: A decoded dictionary that may represent an object. + deserialize: Boolean. When True, deserializes any Keras + objects found in `obj`.
Defaults to `False`. + module_objects: A dictionary of built-in objects to look the name up in. + Generally, `module_objects` is provided by midlevel library + implementers. + custom_objects: A dictionary of custom objects to look the name up in. + Generally, `custom_objects` is provided by the end user. + + Returns: + The decoded object. + """ + if isinstance(obj, dict) and "class_name" in obj: + if obj["class_name"] == "TensorShape": + return tf.TensorShape(obj["items"]) + elif obj["class_name"] == "TypeSpec": + return type_spec_registry.lookup(obj["type_spec"])._deserialize( + _decode_helper(obj["serialized"]) + ) + elif obj["class_name"] == "CompositeTensor": + spec = obj["spec"] + tensors = [] + for dtype, tensor in obj["tensors"]: + tensors.append( + tf.constant(tensor, dtype=tf.dtypes.as_dtype(dtype)) + ) + return tf.nest.pack_sequence_as( + _decode_helper(spec), tensors, expand_composites=True + ) + elif obj["class_name"] == "__tuple__": + return tuple(_decode_helper(i) for i in obj["items"]) + elif obj["class_name"] == "__ellipsis__": + return Ellipsis + elif deserialize and "__passive_serialization__" in obj: + # __passive_serialization__ is added by the JSON encoder when + # encoding an object that has a `get_config()` method. + try: + if in_tf_saved_model_scope() or "module" not in obj: + return serialization.deserialize_keras_object( + obj, + module_objects=module_objects, + custom_objects=custom_objects, + ) + else: + return serialization_lib.deserialize_keras_object( + obj, + module_objects=module_objects, + custom_objects=custom_objects, + ) + except ValueError: + pass + elif obj["class_name"] == "__bytes__": + return obj["value"].encode("utf-8") + return obj + + +def get_json_type(obj): + """Serializes any object to a JSON-serializable structure. + + Args: + obj: the object to serialize + + Returns: + JSON-serializable structure representing `obj`. + + Raises: + TypeError: if `obj` cannot be serialized. + """ + # if obj is a serializable Keras class instance + # e.g. optimizer, layer + if hasattr(obj, "get_config"): + serialized = serialization.serialize_keras_object(obj) + serialized["__passive_serialization__"] = True + return serialized + + # if obj is any numpy type + if type(obj).__module__ == np.__name__: + if isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj.item() + + # misc functions (e.g. loss function) + if callable(obj): + return obj.__name__ + + # if obj is a python 'type' + if type(obj).__name__ == type.__name__: + return obj.__name__ + + if isinstance(obj, tf.compat.v1.Dimension): + return obj.value + + if isinstance(obj, tf.TensorShape): + return obj.as_list() + + if isinstance(obj, tf.DType): + return obj.name + + if isinstance(obj, collections.abc.Mapping): + return dict(obj) + + if obj is Ellipsis: + return {"class_name": "__ellipsis__"} + + if isinstance(obj, wrapt.ObjectProxy): + return obj.__wrapped__ + + if isinstance(obj, tf.TypeSpec): + try: + type_spec_name = type_spec_registry.get_name(type(obj)) + return { + "class_name": "TypeSpec", + "type_spec": type_spec_name, + "serialized": obj._serialize(), + } + except ValueError: + raise ValueError( + f"Unable to serialize {obj} to JSON, because the TypeSpec " + f"class {type(obj)} has not been registered." 
+ ) + if isinstance(obj, tf.__internal__.CompositeTensor): + spec = tf.type_spec_from_value(obj) + tensors = [] + for tensor in tf.nest.flatten(obj, expand_composites=True): + tensors.append((tensor.dtype.name, tensor.numpy().tolist())) + return { + "class_name": "CompositeTensor", + "spec": get_json_type(spec), + "tensors": tensors, + } + + if isinstance(obj, enum.Enum): + return obj.value + + if isinstance(obj, bytes): + return {"class_name": "__bytes__", "value": obj.decode("utf-8")} + + raise TypeError( + f"Unable to serialize {obj} to JSON. Unrecognized type {type(obj)}." + ) diff --git a/keras/saving/legacy/saved_model/json_utils_test.py b/keras/saving/legacy/saved_model/json_utils_test.py new file mode 100644 index 000000000000..3a86aad31520 --- /dev/null +++ b/keras/saving/legacy/saved_model/json_utils_test.py @@ -0,0 +1,107 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests the JSON encoder and decoder.""" + +import enum + +import tensorflow.compat.v2 as tf + +from keras.saving.legacy.saved_model import json_utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + + +class JsonUtilsTest(test_combinations.TestCase): + def test_encode_decode_tensor_shape(self): + metadata = { + "key1": tf.TensorShape(None), + "key2": [tf.TensorShape([None]), tf.TensorShape([3, None, 5])], + } + string = json_utils.Encoder().encode(metadata) + loaded = json_utils.decode(string) + + self.assertEqual(set(loaded.keys()), {"key1", "key2"}) + self.assertAllEqual(loaded["key1"].rank, None) + self.assertAllEqual(loaded["key2"][0].as_list(), [None]) + self.assertAllEqual(loaded["key2"][1].as_list(), [3, None, 5]) + + def test_encode_decode_tuple(self): + metadata = {"key1": (3, 5), "key2": [(1, (3, 4)), (1,)]} + string = json_utils.Encoder().encode(metadata) + loaded = json_utils.decode(string) + + self.assertEqual(set(loaded.keys()), {"key1", "key2"}) + self.assertAllEqual(loaded["key1"], (3, 5)) + self.assertAllEqual(loaded["key2"], [(1, (3, 4)), (1,)]) + + def test_encode_decode_type_spec(self): + spec = tf.TensorSpec((1, 5), tf.float32) + string = json_utils.Encoder().encode(spec) + loaded = json_utils.decode(string) + self.assertEqual(spec, loaded) + + invalid_type_spec = { + "class_name": "TypeSpec", + "type_spec": "Invalid Type", + "serialized": None, + } + string = json_utils.Encoder().encode(invalid_type_spec) + with self.assertRaisesRegexp( + ValueError, "No TypeSpec has been registered" + ): + loaded = json_utils.decode(string) + + def test_encode_decode_enum(self): + class Enum(enum.Enum): + CLASS_A = "a" + CLASS_B = "b" + + config = {"key": Enum.CLASS_A, "key2": Enum.CLASS_B} + string = json_utils.Encoder().encode(config) + loaded = json_utils.decode(string) + self.assertAllEqual({"key": "a", "key2": "b"}, loaded) + + @test_utils.run_v2_only + def test_encode_decode_ragged_tensor(self): + x = 
tf.ragged.constant([[1.0, 2.0], [3.0]]) + string = json_utils.Encoder().encode(x) + loaded = json_utils.decode(string) + self.assertAllEqual(loaded, x) + + @test_utils.run_v2_only + def test_encode_decode_extension_type_tensor(self): + class MaskedTensor(tf.experimental.ExtensionType): + __name__ = "MaskedTensor" + values: tf.Tensor + mask: tf.Tensor + + x = MaskedTensor( + values=[[1, 2, 3], [4, 5, 6]], + mask=[[True, True, False], [True, False, True]], + ) + string = json_utils.Encoder().encode(x) + loaded = json_utils.decode(string) + self.assertAllEqual(loaded, x) + + def test_encode_decode_bytes(self): + b_string = b"abc" + json_string = json_utils.Encoder().encode(b_string) + loaded = json_utils.decode(json_string) + self.assertAllEqual(b_string, loaded) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/saving/legacy/saved_model/layer_serialization.py b/keras/saving/legacy/saved_model/layer_serialization.py new file mode 100644 index 000000000000..ae7e320a0198 --- /dev/null +++ b/keras/saving/legacy/saved_model/layer_serialization.py @@ -0,0 +1,211 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Classes and functions implementing Layer SavedModel serialization.""" + +import tensorflow.compat.v2 as tf + +from keras.mixed_precision import policy +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model import base_serialization +from keras.saving.legacy.saved_model import constants +from keras.saving.legacy.saved_model import save_impl +from keras.saving.legacy.saved_model import serialized_attributes + + +class LayerSavedModelSaver(base_serialization.SavedModelSaver): + """Implements Layer SavedModel serialization.""" + + @property + def object_identifier(self): + return constants.LAYER_IDENTIFIER + + @property + def python_properties(self): + # TODO(kathywu): Add python property validator + return self._python_properties_internal() + + def _python_properties_internal(self): + """Returns dictionary of all python properties.""" + # TODO(kathywu): Add support for metrics serialization. + # TODO(kathywu): Synchronize with the keras spec (go/keras-json-spec) + # once the python config serialization has caught up. + metadata = dict( + name=self.obj.name, + trainable=self.obj.trainable, + expects_training_arg=self.obj._expects_training_arg, + dtype=policy.serialize(self.obj._dtype_policy), + batch_input_shape=getattr(self.obj, "_batch_input_shape", None), + stateful=self.obj.stateful, + must_restore_from_config=self.obj._must_restore_from_config, + preserve_input_structure_in_config=self.obj._preserve_input_structure_in_config, # noqa: E501 + autocast=self.obj._autocast, + ) + + metadata.update(get_serialized(self.obj)) + if self.obj.input_spec is not None: + # Layer's input_spec has already been type-checked in the property + # setter. 
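(Stepping out of the diff for a moment: the `json_utils` encoder and decoder shown earlier are what serialize metadata dictionaries like the one being assembled here. A quick round trip, assuming this patch's module layout is importable:)

```
import tensorflow as tf

from keras.saving.legacy.saved_model import json_utils

metadata = {
    "build_input_shape": tf.TensorShape([None, 3]),
    "kernel_size": (3, 3),  # plain json.dumps would flatten this to a list
}
blob = json_utils.Encoder().encode(metadata)
loaded = json_utils.decode(blob)
assert loaded["kernel_size"] == (3, 3)                     # still a tuple
assert loaded["build_input_shape"].as_list() == [None, 3]  # a TensorShape
```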
+ metadata["input_spec"] = tf.nest.map_structure( + lambda x: serialization.serialize_keras_object(x) + if x + else None, + self.obj.input_spec, + ) + if self.obj.activity_regularizer is not None and hasattr( + self.obj.activity_regularizer, "get_config" + ): + metadata[ + "activity_regularizer" + ] = serialization.serialize_keras_object( + self.obj.activity_regularizer + ) + if self.obj._build_input_shape is not None: + metadata["build_input_shape"] = self.obj._build_input_shape + return metadata + + def objects_to_serialize(self, serialization_cache): + return self._get_serialized_attributes( + serialization_cache + ).objects_to_serialize + + def functions_to_serialize(self, serialization_cache): + return self._get_serialized_attributes( + serialization_cache + ).functions_to_serialize + + def _get_serialized_attributes(self, serialization_cache): + """Generates or retrieves serialized attributes from cache.""" + keras_cache = serialization_cache.setdefault( + constants.KERAS_CACHE_KEY, {} + ) + if self.obj in keras_cache: + return keras_cache[self.obj] + + serialized_attr = keras_cache[ + self.obj + ] = serialized_attributes.SerializedAttributes.new(self.obj) + + if ( + save_impl.should_skip_serialization(self.obj) + or self.obj._must_restore_from_config + ): + return serialized_attr + + object_dict, function_dict = self._get_serialized_attributes_internal( + serialization_cache + ) + + serialized_attr.set_and_validate_objects(object_dict) + serialized_attr.set_and_validate_functions(function_dict) + return serialized_attr + + def _get_serialized_attributes_internal(self, serialization_cache): + """Returns dictionary of serialized attributes.""" + objects = save_impl.wrap_layer_objects(self.obj, serialization_cache) + functions = save_impl.wrap_layer_functions( + self.obj, serialization_cache + ) + # Attribute validator requires that the default save signature is added + # to function dict, even if the value is None. + functions["_default_save_signature"] = None + return objects, functions + + +# TODO(kathywu): Move serialization utils (and related utils from +# generic_utils.py) to a separate file. +def get_serialized(obj): + with serialization.skip_failed_serialization(): + # Store the config dictionary, which may be used when reviving the + # object. When loading, the program will attempt to revive the object + # from config, and if that fails, the object will be revived from the + # SavedModel. 
+ return serialization.serialize_keras_object(obj) + + +class InputLayerSavedModelSaver(base_serialization.SavedModelSaver): + """InputLayer serialization.""" + + @property + def object_identifier(self): + return constants.INPUT_LAYER_IDENTIFIER + + @property + def python_properties(self): + + return dict( + class_name=type(self.obj).__name__, + name=self.obj.name, + dtype=self.obj.dtype, + sparse=self.obj.sparse, + ragged=self.obj.ragged, + batch_input_shape=self.obj._batch_input_shape, + config=self.obj.get_config(), + ) + + def objects_to_serialize(self, serialization_cache): + return {} + + def functions_to_serialize(self, serialization_cache): + return {} + + +class RNNSavedModelSaver(LayerSavedModelSaver): + """RNN layer serialization.""" + + @property + def object_identifier(self): + return constants.RNN_LAYER_IDENTIFIER + + def _get_serialized_attributes_internal(self, serialization_cache): + objects, functions = super()._get_serialized_attributes_internal( + serialization_cache + ) + states = tf.__internal__.tracking.wrap(self.obj.states) + # SavedModel requires all the objects to be Trackable when saving. If + # `states` is still a tuple after wrap_or_unwrap, it means it doesn't + # contain any trackable item within it, e.g. an empty tuple or + # (None, None) for stateless ConvLSTM2D. We convert them to a list so + # that wrap_or_unwrap can make it a Trackable again for saving. When + # loaded, ConvLSTM2D is able to handle the tuple/list conversion. + if isinstance(states, tuple): + states = tf.__internal__.tracking.wrap(list(states)) + objects["states"] = states + return objects, functions + + +class VocabularySavedModelSaver(LayerSavedModelSaver): + """Handles vocabulary layer serialization. + + This class is needed for StringLookup, IntegerLookup, and TextVectorization, + which all have a vocabulary as part of the config. Currently, we keep this + vocab as part of the config until saving, when we need to clear it to avoid + initializing a StaticHashTable twice (once when restoring the config and + once when restoring module resources). After clearing the vocab, + we persist a property to the layer indicating it was constructed with a + vocab. + """ + + @property + def python_properties(self): + # TODO(kathywu): Add python property validator + metadata = self._python_properties_internal() + # Clear the vocabulary from the config during saving. + metadata["config"]["vocabulary"] = None + # Persist a property to track that a vocabulary was passed on + # construction. + metadata["config"][ + "has_input_vocabulary" + ] = self.obj._has_input_vocabulary + return metadata diff --git a/keras/saving/legacy/saved_model/load.py b/keras/saving/legacy/saved_model/load.py new file mode 100644 index 000000000000..ffc4bad14d5d --- /dev/null +++ b/keras/saving/legacy/saved_model/load.py @@ -0,0 +1,1384 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
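(Before the loader: the double-initialization problem that `VocabularySavedModelSaver` above works around is visible from the public API, since a lookup layer built with a literal vocabulary keeps it in its config. Illustrative sketch:)

```
import tensorflow as tf

layer = tf.keras.layers.StringLookup(vocabulary=["a", "b", "c"])
config = layer.get_config()
print(config["vocabulary"])  # ["a", "b", "c"]
# Restoring from this config AND from the saved table resource would
# initialize the StaticHashTable twice; the saver therefore nulls out
# metadata["config"]["vocabulary"] and records has_input_vocabulary=True.
```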
+# ============================================================================== +"""Keras SavedModel deserialization.""" + +import re +import types +import warnings + +import tensorflow.compat.v1.logging as logging +import tensorflow.compat.v2 as tf +from google.protobuf import message + +from keras import backend +from keras import regularizers +from keras.engine import input_spec +from keras.optimizers.legacy import optimizer_v2 +from keras.protobuf import saved_metadata_pb2 +from keras.protobuf import versions_pb2 +from keras.saving import object_registration +from keras.saving.legacy import model_config +from keras.saving.legacy import saving_utils +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model import constants +from keras.saving.legacy.saved_model import json_utils +from keras.saving.legacy.saved_model import utils +from keras.saving.legacy.saved_model.serialized_attributes import ( + CommonEndpoints, +) +from keras.utils import layer_utils +from keras.utils import metrics_utils +from keras.utils import tf_inspect +from keras.utils.generic_utils import LazyLoader + +# To avoid circular dependencies between keras/engine and keras/saving, +# code in keras/saving must delay imports. + +# TODO(b/134426265): Switch back to single-quotes to match the rest of the file +# once the issue with copybara is fixed. + +models_lib = LazyLoader("models_lib", globals(), "keras.models") +base_layer = LazyLoader("base_layer", globals(), "keras.engine.base_layer") +layers_module = LazyLoader("layers_module", globals(), "keras.layers") +input_layer = LazyLoader("input_layer", globals(), "keras.engine.input_layer") +functional_lib = LazyLoader( + "functional_lib", globals(), "keras.engine.functional" +) +training_lib = LazyLoader("training_lib", globals(), "keras.engine.training") +training_lib_v1 = LazyLoader( + "training_lib_v1", globals(), "keras.engine.training_v1" +) +metrics = LazyLoader("metrics", globals(), "keras.metrics") +base_rnn = LazyLoader("base_rnn", globals(), "keras.layers.rnn.base_rnn") + + +PUBLIC_ATTRIBUTES = CommonEndpoints.all_functions.union( + CommonEndpoints.all_checkpointable_objects +) +PUBLIC_ATTRIBUTES.add(constants.KERAS_ATTR) + + +def load(path, compile=True, options=None): + """Loads Keras objects from a SavedModel. + + Any Keras layer or model saved to the SavedModel will be loaded back + as Keras objects. Other objects are loaded as regular trackable objects + (same as `tf.saved_model.load`). + + Currently, Keras saving/loading only retains the Keras object's weights, + losses, and call function. + + The loaded model can be re-compiled, but the original optimizer, compiled + loss functions, and metrics are not retained. This is temporary, and + `model.save` will soon be able to serialize compiled models. + + Args: + path: Path to SavedModel. + compile: If true, compile the model after loading it. + options: Optional `tf.saved_model.LoadOptions` object that specifies + options for loading from SavedModel. + + Returns: + Object loaded from SavedModel. + """ + # TODO(kathywu): Add saving/loading of optimizer, compiled losses and + # metrics. 
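(Usage-wise, this legacy `load()` is what `keras.models.load_model` reaches for a TF SavedModel directory, as the tests earlier in this patch do. The compile behavior described in the docstring, as a sketch assuming a writable `/tmp`:)

```
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="rmsprop", loss="mse")
x, y = np.random.rand(8, 4), np.random.rand(8, 1)
model.fit(x, y, verbose=0)
model.save("/tmp/demo_savedmodel")  # writes keras_metadata.pb with the graph

restored = tf.keras.models.load_model("/tmp/demo_savedmodel")
# Loss and metrics come back via the stored training_config; optimizer
# state (slots) does not, per the docstring above.
restored.fit(x, y, verbose=0)
```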
+ # TODO(kathywu): Add code to load from objects that contain all endpoints + + # Look for metadata file or parse the SavedModel + metadata = saved_metadata_pb2.SavedMetadata() + meta_graph_def = tf.__internal__.saved_model.parse_saved_model( + path + ).meta_graphs[0] + object_graph_def = meta_graph_def.object_graph_def + path_to_metadata_pb = tf.io.gfile.join(path, constants.SAVED_METADATA_PATH) + if tf.compat.v1.gfile.Exists(path_to_metadata_pb): + try: + with tf.io.gfile.GFile(path_to_metadata_pb, "rb") as f: + file_content = f.read() + metadata.ParseFromString(file_content) + except message.DecodeError as e: + raise IOError( + f"Cannot parse keras metadata at path {path_to_metadata_pb}: " + f"Received error: {e}" + ) + else: + logging.warning( + "SavedModel saved prior to TF 2.5 detected when loading " + "Keras model. Please ensure that you are saving the model " + "with model.save() or tf.keras.models.save_model(), *NOT* " + "tf.saved_model.save(). To confirm, there should be a file " + 'named "keras_metadata.pb" in the SavedModel directory.' + ) + _read_legacy_metadata(object_graph_def, metadata, path) + + if not metadata.nodes: + # When there are no Keras objects, return the results from the core + # loader + return tf.saved_model.load(path, options=options) + + metadata = _update_to_current_version(metadata) + # Recreate layers and metrics using the info stored in the metadata. + keras_loader = KerasObjectLoader(metadata, object_graph_def) + keras_loader.load_layers(compile=compile) + + # Generate a dictionary of all loaded nodes. + nodes_to_load = {"root": None} + for node_id, loaded_node in keras_loader.loaded_nodes.items(): + nodes_to_load[keras_loader.get_path(node_id)] = loaded_node + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="Trying to load ShardedVariables" + ) + loaded = tf.__internal__.saved_model.load_partial( + path, nodes_to_load, options=options + ) + + # Finalize the loaded layers and remove the extra tracked dependencies. + keras_loader.finalize_objects() + keras_loader.del_tracking() + + model = loaded["root"] + + if isinstance(model, training_lib.Model) and compile: + # TODO(kathywu): Use compiled objects from SavedModel, instead of + # creating new objects from the training config. + training_config = model._serialized_attributes["metadata"].get( + "training_config", None + ) + if training_config is not None: + model.compile( + **saving_utils.compile_args_from_training_config( + training_config + ), + from_serialized=True, + ) + saving_utils.try_build_compiled_arguments(model) + if isinstance(model.optimizer, optimizer_v2.OptimizerV2): + if model.optimizer.get_slot_names(): + logging.warning( + "Your optimizer uses slots. " + "Slots cannot be restored from saved_model, " + "as a result, your model is starting with " + "a new initialized optimizer." + ) + else: + logging.warning( + "No training configuration found in save file, so the " + "model was *not* compiled. Compile it manually." + ) + + # Force variables and resources to initialize. + if not tf.executing_eagerly(): + sess = backend.get_session() # Variables are initialized by this call. 
+        sess.run(
+            tf.compat.v1.get_collection(
+                tf.compat.v1.GraphKeys.TABLE_INITIALIZERS
+            )
+        )
+
+    return model
+
+
+def _update_to_current_version(metadata):
+    """Applies version updates to the metadata proto for backwards compat."""
+    for node in metadata.nodes:
+        if node.version.producer == 1 and node.identifier in [
+            constants.MODEL_IDENTIFIER,
+            constants.SEQUENTIAL_IDENTIFIER,
+            constants.NETWORK_IDENTIFIER,
+        ]:
+            node_metadata = json_utils.decode(node.metadata)
+            save_spec = node_metadata.get("save_spec")
+
+            if save_spec is not None:
+                node_metadata["full_save_spec"] = ([save_spec], {})
+                node.metadata = json_utils.Encoder().encode(node_metadata)
+    return metadata
+
+
+def _read_legacy_metadata(object_graph_def, metadata, path):
+    """Builds a KerasMetadata proto from the SavedModel ObjectGraphDef."""
+    # Older SavedModels store the metadata directly in the proto instead of the
+    # separate pb file.
+    node_paths = _generate_object_paths(object_graph_def)
+    for node_id, proto in enumerate(object_graph_def.nodes):
+        if (
+            proto.WhichOneof("kind") == "user_object"
+            and proto.user_object.identifier
+            in constants.KERAS_OBJECT_IDENTIFIERS
+        ):
+            if not proto.user_object.metadata:
+                raise ValueError(
+                    "Unable to create a Keras model from SavedModel at "
+                    f"{path}. This SavedModel was exported with "
+                    "`tf.saved_model.save`, and lacks the Keras metadata file. "
+                    "Please save your Keras model by calling `model.save` "
+                    "or `tf.keras.models.save_model`. Note that "
+                    "you can still load this SavedModel with "
+                    "`tf.saved_model.load`."
+                )
+            metadata.nodes.add(
+                node_id=node_id,
+                node_path=node_paths[node_id],
+                version=versions_pb2.VersionDef(
+                    producer=1, min_consumer=1, bad_consumers=[]
+                ),
+                identifier=proto.user_object.identifier,
+                metadata=proto.user_object.metadata,
+            )
+
+
+def _generate_object_paths(object_graph_def):
+    """Traverses through an ObjectGraphDef and builds a map of all node
+    paths."""
+    paths = {0: "root"}
+    nodes_to_visit = [0]
+
+    while nodes_to_visit:
+        current_node = nodes_to_visit.pop()
+        current_path = paths[current_node]
+        for reference in object_graph_def.nodes[current_node].children:
+            if reference.node_id in paths:
+                continue
+            paths[reference.node_id] = f"{current_path}.{reference.local_name}"
+            nodes_to_visit.append(reference.node_id)
+
+    return paths
+
+
+def _is_graph_network(layer):
+    """Determines whether the layer is a graph network."""
+
+    if isinstance(layer, RevivedNetwork):
+        return False
+    elif isinstance(layer, functional_lib.Functional):
+        return layer._is_graph_network or isinstance(
+            layer, models_lib.Sequential
+        )
+    return False
+
+
+class KerasObjectLoader:
+    """Loader that recreates Keras objects (e.g. layers, models).
+
+    Layers and models are revived from either the config or SavedModel
+    following these rules:
+    1. If the object is a graph network (i.e. Sequential or Functional), it is
+       initialized using the structure from the config only after its child
+       layers have been created. Graph networks must be initialized with
+       inputs and outputs, so all child layers must be created beforehand.
+    2. If the object's config exists and the class can be found, then revive
+       from the config.
+    3. The object may have already been created if its parent was revived
+       from the config. In this case, do nothing.
+    4. If none of the above applies, compose the various artifacts from the
+       SavedModel to create a subclassed layer or model. At this time, custom
+       metrics are not supported.
+
+    """
+
+    def __init__(self, metadata, object_graph_def):
+        self._metadata = {x.node_id: x for x in metadata.nodes}
+        self._proto = object_graph_def
+
+        self._node_paths = {
+            node_data.node_id: node_data.node_path
+            for node_data in metadata.nodes
+        }
+        self.loaded_nodes = {}  # Maps node id -> (loaded node, setter)
+
+        # Store all node ids that have already been traversed when tracking
+        # nodes that were recreated from the config.
+        self._traversed_nodes_from_config = set()
+
+        # Maps model id -> (blank model obj, list of child layers or their
+        # node ids). This tracks all layers in functional and sequential
+        # models. These models are only reconstructed after all of their
+        # child layers have been created.
+        self.model_layer_dependencies = {}
+        self._models_to_reconstruct = []
+
+    def del_tracking(self):
+        """Removes tracked references that are only used when loading the
+        model."""
+        # Now that the node object has been fully loaded, and the checkpoint has
+        # been restored, the object no longer needs to track objects added from
+        # SerializedAttributes. (Note that saving a training checkpoint still
+        # functions correctly, because layers and variables are tracked
+        # separately by the Layer object.)
+        # TODO(kathywu): Instead of outright deleting these nodes (which would
+        # make restoring from a different checkpoint tricky), mark them as extra
+        # dependencies that are OK to overwrite.
+        for node in self.loaded_nodes.values():
+            node = node[0]
+            if not isinstance(node, base_layer.Layer):
+                # Loaded nodes can contain other trackable objects created when
+                # loading layers from the config, such as variables.
+                continue
+            for name in PUBLIC_ATTRIBUTES:
+                node._delete_tracking(name)
+
+            if isinstance(node, functional_lib.Functional):
+                # Delete the temporary layer dependencies, which were used to
+                # restore the checkpointed values. When the model is live, the
+                # user can delete or add layers to the model at any time, so
+                # these layer dependencies may be obsolete.
+                dependencies = list(node._self_unconditional_dependency_names)
+                for name in dependencies:
+                    if (
+                        re.match(r"^layer(_with_weights)?-[\d+]", name)
+                        is not None
+                    ):
+                        node._delete_tracking(name)
+
+    def _add_children_recreated_from_config(self, obj, proto, node_id):
+        """Recursively records objects recreated from config."""
+
+        if node_id in self._traversed_nodes_from_config:
+            return
+
+        parent_path = self._node_paths[node_id]
+        self._traversed_nodes_from_config.add(node_id)
+        obj._maybe_initialize_trackable()
+        if isinstance(obj, base_layer.Layer) and not obj.built:
+            metadata = json_utils.decode(self._metadata[node_id].metadata)
+            self._try_build_layer(
+                obj, node_id, metadata.get("build_input_shape")
+            )
+
+        # Create list of all possible children
+        children = []
+        # Look for direct children
+        for reference in proto.children:
+            obj_child = obj._lookup_dependency(reference.local_name)
+            children.append(
+                (obj_child, reference.node_id, reference.local_name)
+            )
+
+        # Add metrics that may have been added to the layer._metrics list.
+        # This is stored in the SavedModel as layer.keras_api.layer_metrics in
+        # SavedModels created after TF 2.2.
+ metric_list_node_id = self._search_for_child_node( + node_id, [constants.KERAS_ATTR, "layer_metrics"] + ) + if metric_list_node_id is not None and hasattr(obj, "_metrics"): + obj_metrics = {m.name: m for m in obj._metrics} + for reference in self._proto.nodes[metric_list_node_id].children: + metric = obj_metrics.get(reference.local_name) + if metric is not None: + metric_path = "{}.layer_metrics.{}".format( + constants.KERAS_ATTR, reference.local_name + ) + children.append((metric, reference.node_id, metric_path)) + + for obj_child, child_id, child_name in children: + child_proto = self._proto.nodes[child_id] + + if not isinstance(obj_child, tf.__internal__.tracking.Trackable): + continue + if ( + child_proto.user_object.identifier + in tf.__internal__.saved_model.load.registered_identifiers() + ): + setter = tf.__internal__.saved_model.load.get_setter( + child_proto.user_object + ) + elif ( + obj_child._object_identifier + in constants.KERAS_OBJECT_IDENTIFIERS + ): + setter = _revive_setter + else: + setter = setattr + + if child_id in self.loaded_nodes: + if self.loaded_nodes[child_id][0] is not obj_child: + # This means that the same trackable object is referenced by + # two different objects that were recreated from the config. + logging.warning( + "Looks like there is an object (perhaps variable or " + "layer) that is shared between different " + "layers/models. This may cause issues when restoring " + "the variable values. Object: {}".format(obj_child) + ) + continue + + # Overwrite variable names with the ones saved in the SavedModel. + if ( + child_proto.WhichOneof("kind") == "variable" + and child_proto.variable.name + ): + obj_child._handle_name = child_proto.variable.name + ":0" + + if isinstance( + obj_child, tf.__internal__.tracking.TrackableDataStructure + ): + setter = lambda *args: None + + child_path = f"{parent_path}.{child_name}" + self._node_paths[child_id] = child_path + self._add_children_recreated_from_config( + obj_child, child_proto, child_id + ) + self.loaded_nodes[child_id] = obj_child, setter + + def load_layers(self, compile=True): + """Load all layer nodes from the metadata.""" + # Load metrics after models and layers, since it's likely that models + # and layers will create the metric when initialized (this avoids + # wasting time by creating objects multiple times). + metric_list = [] + for node_metadata in self._metadata.values(): + if node_metadata.identifier == constants.METRIC_IDENTIFIER: + metric_list.append(node_metadata) + continue + + self.loaded_nodes[node_metadata.node_id] = self._load_layer( + node_metadata.node_id, + node_metadata.identifier, + node_metadata.metadata, + ) + + for node_metadata in metric_list: + try: + self.loaded_nodes[node_metadata.node_id] = self._load_layer( + node_metadata.node_id, + node_metadata.identifier, + node_metadata.metadata, + ) + except ValueError as e: + # Metrics are only needed when the model is compiled later. We + # ignore errors when trying to load custom metrics when + # `compile=False` until custom metrics are serialized properly + # (b/135550038). + if compile: + raise e + logging.warning( + "Unable to restore custom metric. Please ensure that " + "the layer implements `get_config` and `from_config` " + "when saving. In addition, please use the " + "`custom_objects` arg when calling `load_model()`." 
+ ) + + def _load_layer(self, node_id, identifier, metadata): + """Load a single layer from a SavedUserObject proto.""" + metadata = json_utils.decode(metadata) + + # If node was already created + if node_id in self.loaded_nodes: + node, setter = self.loaded_nodes[node_id] + + # Revive setter requires the object to have a + # `_serialized_attributes` property. Add it here. + _maybe_add_serialized_attributes(node, metadata) + + config = metadata.get("config") + if _is_graph_network(node) and serialization.validate_config( + config + ): + child_nodes = self._get_child_layer_node_ids(node_id) + self.model_layer_dependencies[node_id] = (node, child_nodes) + if not child_nodes: + self._models_to_reconstruct.append(node_id) + return node, setter + + # Detect whether this object can be revived from the config. If not, + # then revive from the SavedModel instead. + obj, setter = self._revive_from_config(identifier, metadata, node_id) + if obj is None: + obj, setter = revive_custom_object(identifier, metadata) + + # Add an attribute that stores the extra functions/objects saved in the + # SavedModel. Most of these functions/objects are ignored, but some are + # used later in the loading process (e.g. the list of regularization + # losses, or the training config of compiled models). + _maybe_add_serialized_attributes(obj, metadata) + return obj, setter + + def _revive_from_config(self, identifier, metadata, node_id): + """Revives a layer/model from config, or returns None.""" + if identifier == constants.METRIC_IDENTIFIER: + obj = self._revive_metric_from_config(metadata) + else: + obj = self._revive_graph_network( + identifier, metadata, node_id + ) or self._revive_layer_or_model_from_config(metadata, node_id) + + if obj is None: + return None, None + + setter = self._config_node_setter(_revive_setter) + self._add_children_recreated_from_config( + obj, self._proto.nodes[node_id], node_id + ) + return obj, setter + + def _revive_graph_network(self, identifier, metadata, node_id): + """Revives a graph network from config.""" + # Determine whether the metadata contains information for reviving a + # functional or Sequential model. + config = metadata.get("config") + if not serialization.validate_config(config): + return None + + class_name = tf.compat.as_str(metadata["class_name"]) + if object_registration.get_registered_object(class_name) is not None: + return None + model_is_functional_or_sequential = ( + metadata.get("is_graph_network", False) + or class_name == "Sequential" + or class_name == "Functional" + ) + if not model_is_functional_or_sequential: + return None + + # Revive functional and sequential models as blank model objects for now + # ( must be initialized to enable setattr tracking and attribute + # caching). Reconstruction of the network is deferred until all of the + # model's layers have been revived. + if class_name == "Sequential": + model = models_lib.Sequential(name=config["name"]) + # The model is a custom Sequential model. + elif identifier == constants.SEQUENTIAL_IDENTIFIER: + # Uses the custom class name, since the config does not have one. + model = models_lib.Sequential(name=class_name) + else: + model = models_lib.Functional( + inputs=[], outputs=[], name=config["name"] + ) + + # Record this model and its layers. This will later be used to + # reconstruct the model. 
+        layers = self._get_child_layer_node_ids(node_id)
+        self.model_layer_dependencies[node_id] = (model, layers)
+        if not layers:
+            self._models_to_reconstruct.append(node_id)
+        return model
+
+    def _revive_layer_or_model_from_config(self, metadata, node_id):
+        """Revives a layer/custom model from config; returns None if
+        infeasible."""
+        # Check that the following requirements are met for reviving from
+        # config:
+        #    1. Object can be deserialized from config.
+        #    2. If the object needs to be built, then the build input shape can
+        #       be found.
+        class_name = metadata.get("class_name")
+        config = metadata.get("config")
+        shared_object_id = metadata.get("shared_object_id")
+        must_restore_from_config = metadata.get("must_restore_from_config")
+        if not serialization.validate_config(config):
+            return None
+
+        try:
+            try:
+                obj = model_config.model_from_config(
+                    serialization.serialize_keras_class_and_config(
+                        class_name, config, shared_object_id=shared_object_id
+                    )
+                )
+            except (TypeError, KeyError) as e:
+                # A name conflict has occurred. The `class_name` is in the
+                # Keras native framework; however, the value in the framework
+                # is different from the user's class definition, which
+                # confuses the KerasObjectLoader.
+                builtin_layer = layers_module.get_builtin_layer(class_name)
+                if builtin_layer:
+                    raise RuntimeError(
+                        f"Unable to restore object of class '{class_name}'. "
+                        "One of several possible causes could be "
+                        "a missing custom object. "
+                        "Decorate your custom object with "
+                        "`@keras.utils.register_keras_serializable()` and "
+                        "include that file in your program, "
+                        "or pass your class in a "
+                        "`keras.utils.CustomObjectScope` "
+                        "that wraps this load call. "
+                        f"\n\nException: {e}"
+                    ) from e
+                else:
+                    raise
+        except Exception as e:
+            if must_restore_from_config:
+                raise e
+            else:
+                return None
+
+        # Use the dtype, name, and trainable status. Often these are not
+        # specified in custom configs, so retrieve their values from the
+        # metadata.
+
+        obj._name = metadata["name"]
+        if metadata.get("trainable") is not None:
+            obj.trainable = metadata["trainable"]
+        if metadata.get("dtype") is not None:
+            obj._set_dtype_policy(metadata["dtype"])
+        if metadata.get("stateful") is not None:
+            obj.stateful = metadata["stateful"]
+        if metadata.get("autocast") is not None:
+            obj._autocast = metadata["autocast"]
+        # Restore model save spec for subclassed models. (Layers do not store
+        # a SaveSpec.)
+        if isinstance(obj, training_lib.Model):
+            full_save_spec = metadata.get("full_save_spec")
+            if full_save_spec is not None:
+                args_spec, kwargs_spec = full_save_spec
+                inputs_spec = args_spec.pop(0)
+                obj._set_save_spec(inputs_spec, args_spec, kwargs_spec)
+
+        build_input_shape = metadata.get("build_input_shape")
+        built = self._try_build_layer(obj, node_id, build_input_shape)
+
+        if not built:
+            # If the layer cannot be built, revive a custom layer instead.
+ return None + return obj + + def _revive_metric_from_config(self, metadata): + """Revives a metric object using the config saved in the metadata.""" + class_name = tf.compat.as_str(metadata["class_name"]) + config = metadata.get("config") + + if not serialization.validate_config(config): + return None + + try: + obj = metrics.deserialize( + serialization.serialize_keras_class_and_config( + class_name, config + ) + ) + except ValueError: + return None + + build_input_shape = metadata.get("build_input_shape") + if build_input_shape is not None and hasattr(obj, "_build"): + obj._build(build_input_shape) + + return obj + + def _try_build_layer(self, obj, node_id, build_input_shape): + """Attempts to build the layer.""" + if obj.built or hasattr(obj.build, "_is_default"): + obj.built = True + return True + + if build_input_shape is None: + build_input_shape = self._infer_inputs( + node_id, convert_to_shapes=True + ) + + if build_input_shape is not None: + obj.build(build_input_shape) + base_layer.Layer.build(obj, build_input_shape) + return True + + return False + + def get_path(self, node_id): + return self._node_paths[node_id] + + def finalize_objects(self): + """Finish setting up Keras objects. + + This function is executed after all objects and functions have been + created. Call functions and losses are attached to each layer, and once + all layers have been fully set up, graph networks are initialized. + + Subclassed models that are revived from the SavedModel are treated like + layers, and have their call/loss functions attached here. + """ + # Finish setting up layers and subclassed models. This step attaches + # call functions and losses to each object, and sets model + # inputs/outputs. + layers_revived_from_config = [] + layers_revived_from_saved_model = [] + for node_id, (node, _) in self.loaded_nodes.items(): + if ( + not isinstance(node, base_layer.Layer) + # Don't finalize models until all layers have finished loading. + or node_id in self.model_layer_dependencies + ): + continue + + self._unblock_model_reconstruction(node_id, node) + + if isinstance(node, input_layer.InputLayer): + continue + elif isinstance(node, metrics.Metric): + continue + + if isinstance(node, (RevivedLayer, RevivedInputLayer)): + layers_revived_from_saved_model.append(node) + else: + layers_revived_from_config.append(node) + + _finalize_saved_model_layers(layers_revived_from_saved_model) + _finalize_config_layers(layers_revived_from_config) + + # Initialize graph networks, now that layer dependencies have been + # resolved. + self._reconstruct_all_models() + + def _unblock_model_reconstruction(self, layer_id, layer): + """Removes layer from blocking model reconstruction.""" + for model_id, v in self.model_layer_dependencies.items(): + _, layers = v + if layer_id not in layers: + continue + layers[layers.index(layer_id)] = layer + if all(isinstance(x, base_layer.Layer) for x in layers): + self._models_to_reconstruct.append(model_id) + + def _reconstruct_all_models(self): + """Reconstructs the network structure of all models.""" + all_initialized_models = set() + while self._models_to_reconstruct: + model_id = self._models_to_reconstruct.pop(0) + all_initialized_models.add(model_id) + model, layers = self.model_layer_dependencies[model_id] + self._reconstruct_model(model_id, model, layers) + _finalize_config_layers([model]) + + if all_initialized_models != set(self.model_layer_dependencies.keys()): + # This should not happen. 
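+            # Gather the names of the models that were never reconstructed so
+            # they can be reported in the error below.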
+ uninitialized_model_ids = ( + set(self.model_layer_dependencies.keys()) + - all_initialized_models + ) + uninitialized_model_names = [ + self.model_layer_dependencies[model_id][0].name + for model_id in uninitialized_model_ids + ] + raise ValueError( + "Error loading model(s) in the SavedModel format. " + "The following model(s) could not be initialized: " + f"{uninitialized_model_names}" + ) + + def _reconstruct_model(self, model_id, model, layers): + """Reconstructs the network structure.""" + config = json_utils.decode(self._metadata[model_id].metadata)["config"] + + # Set up model inputs + if model.inputs: + # Inputs may already be created if the model is instantiated in + # another object's __init__. + pass + elif isinstance(model, models_lib.Sequential): + if not layers or not isinstance(layers[0], input_layer.InputLayer): + if config["layers"][0]["class_name"] == "InputLayer": + layers.insert( + 0, + input_layer.InputLayer.from_config( + config["layers"][0]["config"] + ), + ) + elif "batch_input_shape" in config["layers"][0]["config"]: + batch_input_shape = config["layers"][0]["config"][ + "batch_input_shape" + ] + layers.insert( + 0, + input_layer.InputLayer( + input_shape=batch_input_shape[1:], + batch_size=batch_input_shape[0], + dtype=layers[0].dtype, + name=layers[0].name + "_input", + ), + ) + model.__init__(layers, name=config["name"]) + if not model.inputs: + first_layer = self._get_child_layer_node_ids(model_id)[0] + input_specs = self._infer_inputs(first_layer) + input_shapes = self._infer_inputs( + first_layer, convert_to_shapes=True + ) + model._set_inputs(input_specs) + if not model.built and not isinstance(input_specs, dict): + model.build(input_shapes) + else: # Reconstruct functional model + ( + inputs, + outputs, + created_layers, + ) = functional_lib.reconstruct_from_config( + config, created_layers={layer.name: layer for layer in layers} + ) + model.__init__(inputs, outputs, name=config["name"]) + functional_lib.connect_ancillary_layers(model, created_layers) + + # Set model dtype. + _set_network_attributes_from_metadata(model) + + # Unblock models that are dependent on this model. + self._unblock_model_reconstruction(model_id, model) + + def _get_child_layer_node_ids(self, node_id): + """Returns the node ids of each layer in a Sequential/Functional + model.""" + # Sequential and Functional track layers with names following the format + # "layer-N". Use this to generate the list of layers. + num_layers = 0 + child_layers = {} + pattern = re.compile("layer-(\\d+)") + + for child in self._proto.nodes[node_id].children: + m = pattern.match(child.local_name) + if m is None: + continue + layer_n = int(m.group(1)) + num_layers = max(layer_n + 1, num_layers) + child_layers[layer_n] = child.node_id + + ordered = [] + for n in range(num_layers): + child = child_layers.get(n) + if child is None: + break + ordered.append(child) + return ordered + + def _search_for_child_node(self, parent_id, path_to_child): + """Returns node id of child node. + + A helper method for traversing the object graph proto. + + As an example, say that the object graph proto in the SavedModel + contains an object with the following child and grandchild attributes: + + `parent.child_a.child_b` + + This method can be used to retrieve the node id of `child_b` using the + parent's node id by calling: + + `_search_for_child_node(parent_id, ['child_a', 'child_b'])`. + + Args: + parent_id: node id of parent node + path_to_child: list of children names. 
+ + Returns: + node_id of child, or None if child isn't found. + """ + if not path_to_child: + return parent_id + + for child in self._proto.nodes[parent_id].children: + if child.local_name == path_to_child[0]: + return self._search_for_child_node( + child.node_id, path_to_child[1:] + ) + return None + + def _infer_inputs(self, layer_node_id, convert_to_shapes=False): + """Infers input shape of layer from SavedModel functions.""" + call_fn_id = self._search_for_child_node( + layer_node_id, ["call_and_return_all_conditional_losses"] + ) + if call_fn_id is None: + return None + + concrete_functions = self._proto.nodes[ + call_fn_id + ].function.concrete_functions + if not concrete_functions: + return None + call_fn_name = concrete_functions[0] + call_fn_proto = self._proto.concrete_functions[call_fn_name] + structured_input_signature = tf.__internal__.saved_model.decode_proto( + call_fn_proto.canonicalized_input_signature + ) + inputs = structured_input_signature[0][0] + if convert_to_shapes: + return tf.nest.map_structure(lambda spec: spec.shape, inputs) + else: + return inputs + + def _config_node_setter(self, setter): + """Creates edges for nodes that are recreated from config.""" + + def setattr_wrapper(obj, name, value): + # Avoid overwriting attributes of objects recreated from the config. + if obj._lookup_dependency(name) is None: + setter(obj, name, value) + + return setattr_wrapper + + +def _finalize_saved_model_layers(layers): + """Runs the final steps of loading Keras Layers from SavedModel.""" + + # 1. Set up call functions for all layers initialized from the SavedModel ( + # and not the config) + for layer in layers: + layer.built = True + layer_call = getattr( + _get_keras_attr(layer), "call_and_return_conditional_losses", None + ) + if layer_call and layer_call.concrete_functions: + call_spec = layer_utils.CallFunctionSpec( + tf_inspect.getfullargspec(layer_call) + ) + layer.call = utils.use_wrapped_call( + layer, layer_call, call_spec, return_method=True + ) + expects_training_arg = layer._serialized_attributes["metadata"][ + "expects_training_arg" + ] + if "training" in layer_call.function_spec.arg_names: + # This could change the value of `expects_training_arg` if this + # layer doesn't expect a training arg, but has a child layer + # that does. + expects_training_arg = True + layer._init_call_fn_args(expects_training_arg) + else: + layer.call = types.MethodType( + _unable_to_call_layer_due_to_serialization_issue, layer + ) + + for layer in layers: + # 2. Set model inputs and outputs. + if isinstance(layer, RevivedNetwork): + _set_network_attributes_from_metadata(layer) + + if hasattr( + _get_keras_attr(layer), "call_and_return_conditional_losses" + ): + call_fn = _get_keras_attr( + layer + ).call_and_return_conditional_losses + if not call_fn.concrete_functions: + continue + if call_fn.input_signature is None: + args, kwargs = infer_inputs_from_restored_call_function( + call_fn + ) + args = list(args) + inputs = args.pop(0) + else: + args = call_fn.input_signature + args = list(args) + inputs = args.pop(0) + kwargs = None + layer._set_save_spec(inputs, args, kwargs) + + # V1 models require calling _set_inputs to set the `.inputs` + # attr. Skip this step when there are multiple tensor inputs + # (this behavior is not well supported in V1 models). + if not any( + isinstance(x, tf.TensorSpec) + for x in tf.nest.flatten([args, kwargs]) + ): + layer._set_inputs(inputs) + + # 3. Add losses that aren't generated by the layer.call function. 
+        _restore_layer_unconditional_losses(layer)
+        _restore_layer_activation_loss(layer)
+
+        # 4. Restore metrics list
+        _restore_layer_metrics(layer)
+
+
+def _unable_to_call_layer_due_to_serialization_issue(
+    layer, *unused_args, **unused_kwargs
+):
+    """Replaces the `layer.call` if the layer was not fully serialized.
+
+    Keras Model/Layer serialization is relatively relaxed because SavedModels
+    are not always loaded back as Keras models. Thus, when there is an issue
+    tracing a non-signature function, a warning is logged instead of raising an
+    error. This results in a SavedModel where the model's call function is
+    saved, but the internal layer call functions are not.
+
+    When deserialized with `tf.keras.models.load_model`, the internal layers
+    which do not have serialized call functions should raise an error when
+    called.
+
+    Args:
+      layer: Layer without the serialized call function.
+
+    Raises:
+      ValueError
+    """
+
+    raise ValueError(
+        f"Cannot call custom layer {layer.name} of type {type(layer)}, because "
+        "the call function was not serialized to the SavedModel. "
+        "Please try one of the following methods to fix this issue:"
+        "\n\n(1) Implement `get_config` and `from_config` in the layer/model "
+        "class, and pass the object to the `custom_objects` argument when "
+        "loading the model. For more details, see: "
+        "https://www.tensorflow.org/guide/keras/save_and_serialize"
+        "\n\n(2) Ensure that the subclassed model or layer overwrites `call` "
+        "and not `__call__`. The input shape and dtype will be automatically "
+        "recorded when the object is called, and used when saving. To manually "
+        "specify the input shape/dtype, decorate the call function with "
+        "`@tf.function(input_signature=...)`."
+    )
+
+
+def _finalize_config_layers(layers):
+    """Runs the final steps of loading Keras Layers from config."""
+    for layer in layers:
+        # It is assumed that layers define their unconditional losses after
+        # being recreated from the config and built. The exceptions to this are
+        # Functional and Sequential models, which only store conditional losses
+        # (losses dependent on the inputs) in the config. Unconditional losses
+        # like weight regularization must be revived from the SavedModel.
+        if _is_graph_network(layer):
+            _restore_layer_unconditional_losses(layer)
+
+        # Some layers, like Dense, record their activation loss function in the
+        # config. However, not all layers do this, so the activation loss may be
+        # missing when restored from the config/hdf5.
+        # TODO(kathywu): Investigate ways to improve the config to ensure
+        # consistent loading behavior between HDF5 and SavedModel.
+        _restore_layer_activation_loss(layer)
+
+        # Restore metrics list.
+        _restore_layer_metrics(layer)
+
+        # Restore RNN layer states.
+        if (
+            isinstance(layer, base_rnn.RNN)
+            and layer.stateful
+            and hasattr(_get_keras_attr(layer), "states")
+        ):
+            layer.states = getattr(_get_keras_attr(layer), "states", None)
+            for variable in tf.nest.flatten(layer.states):
+                backend.track_variable(variable)
+
+        # Perform any layer-defined finalization of the layer state.
+        layer.finalize_state()
+
+
+def _finalize_metric(metric):
+    metric.update_state = types.MethodType(
+        metrics_utils.update_state_wrapper(metric.keras_api.update_state),
+        metric,
+    )
+    metric.result = metric.keras_api.result
+
+
+def _restore_layer_unconditional_losses(layer):
+    """Restore unconditional losses from SavedModel."""
+    if hasattr(_get_keras_attr(layer), "layer_regularization_losses"):
+        losses = getattr(
+            _get_keras_attr(layer), "layer_regularization_losses", []
+        )
+    else:
+        # Some earlier SavedModels may not have layer_regularization_losses
+        # serialized separately. Fall back to the regularization_losses list
+        # when the separate attribute is missing.
+        losses = layer._serialized_attributes.get("regularization_losses", [])
+    for loss in losses:
+        layer.add_loss(loss)
+
+
+def _restore_layer_activation_loss(layer):
+    """Restore activation loss from SavedModel."""
+    # Use wrapped activity regularizer function if the layer's activity
+    # regularizer wasn't created during initialization.
+    activity_regularizer = getattr(
+        _get_keras_attr(layer), "activity_regularizer_fn", None
+    )
+    if activity_regularizer and not layer.activity_regularizer:
+        try:
+            layer.activity_regularizer = activity_regularizer
+        except AttributeError:
+            # This may happen if a layer wrapper is saved with an activity
+            # regularizer. The wrapper object's activity regularizer is
+            # unsettable.
+            pass
+
+
+def revive_custom_object(identifier, metadata):
+    """Revives object from SavedModel."""
+    if tf.compat.v1.executing_eagerly_outside_functions():
+        model_class = training_lib.Model
+    else:
+        model_class = training_lib_v1.Model
+
+    revived_classes = {
+        constants.INPUT_LAYER_IDENTIFIER: (
+            RevivedInputLayer,
+            input_layer.InputLayer,
+        ),
+        constants.LAYER_IDENTIFIER: (RevivedLayer, base_layer.Layer),
+        constants.MODEL_IDENTIFIER: (RevivedNetwork, model_class),
+        constants.NETWORK_IDENTIFIER: (
+            RevivedNetwork,
+            functional_lib.Functional,
+        ),
+        constants.SEQUENTIAL_IDENTIFIER: (
+            RevivedNetwork,
+            models_lib.Sequential,
+        ),
+    }
+    parent_classes = revived_classes.get(identifier, None)
+
+    class_name = tf.compat.as_str(metadata["class_name"])
+    if parent_classes is not None:
+        parent_classes = revived_classes[identifier]
+        revived_cls = type(class_name, parent_classes, {})
+        return revived_cls._init_from_metadata(metadata)
+    else:
+        raise ValueError(
+            f'Unable to restore custom object of class "{class_name}" '
+            f"(type {identifier}). Please make sure that this class is "
+            "included in the `custom_objects` arg when calling `load_model()`. "
+            "Also, check that the class implements `get_config` and "
+            f"`from_config`.\n\nComplete metadata: {metadata}"
+        )
+
+
+def _restore_layer_metrics(layer):
+    metrics_list = getattr(_get_keras_attr(layer), "layer_metrics", {})
+    layer_metrics = {m.name: m for m in layer._metrics}
+    for name, metric in metrics_list.items():
+        if name not in layer_metrics:
+            # Metrics may be added during initialization/building of custom
+            # layers.
+            layer._metrics.append(metric)
+
+
+# TODO(kathywu): Centrally define keys and functions for both serialization and
+# deserialization.
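+# The Revived* classes below are combined with the matching Keras base class
+# by `revive_custom_object`, which creates a new class via `type()` for each
+# saved object; the Revived* mixin restores the saved config, call spec, and
+# serialized attributes when the original Python class is unavailable.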
+class RevivedLayer:
+    """Keras layer loaded from a SavedModel."""
+
+    @classmethod
+    def _init_from_metadata(cls, metadata):
+        """Create revived layer from metadata stored in the SavedModel proto."""
+        init_args = dict(name=metadata["name"], trainable=metadata["trainable"])
+        if metadata.get("dtype") is not None:
+            init_args["dtype"] = metadata["dtype"]
+        if metadata.get("batch_input_shape") is not None:
+            init_args["batch_input_shape"] = metadata["batch_input_shape"]
+
+        revived_obj = cls(**init_args)
+
+        with utils.no_automatic_dependency_tracking_scope(revived_obj):
+
+            revived_obj._call_spec.expects_training_arg = metadata[
+                "expects_training_arg"
+            ]
+            config = metadata.get("config")
+            if serialization.validate_config(config):
+                revived_obj._config = config
+            if metadata.get("input_spec") is not None:
+                revived_obj.input_spec = recursively_deserialize_keras_object(
+                    metadata["input_spec"],
+                    module_objects={"InputSpec": input_spec.InputSpec},
+                )
+            if metadata.get("activity_regularizer") is not None:
+                revived_obj.activity_regularizer = regularizers.deserialize(
+                    metadata["activity_regularizer"]
+                )
+            if metadata.get("_is_feature_layer") is not None:
+                revived_obj._is_feature_layer = metadata["_is_feature_layer"]
+            if metadata.get("stateful") is not None:
+                revived_obj.stateful = metadata["stateful"]
+            if metadata.get("autocast") is not None:
+                revived_obj._autocast = metadata["autocast"]
+            if metadata.get("preserve_input_structure_in_config") is not None:
+                revived_obj._preserve_input_structure_in_config = metadata[
+                    "preserve_input_structure_in_config"
+                ]
+
+        return revived_obj, _revive_setter
+
+    @property
+    def keras_api(self):
+        return self._serialized_attributes.get(constants.KERAS_ATTR, None)
+
+    def get_config(self):
+        if hasattr(self, "_config"):
+            return self._config
+        else:
+            raise NotImplementedError
+
+
+def _revive_setter(layer, name, value):
+    """Setter function that saves some attributes to a separate dictionary."""
+    # Many attributes in the SavedModel conflict with properties defined in
+    # Layer and Model. Save these attributes to a separate dictionary.
+    if name in PUBLIC_ATTRIBUTES:
+
+        if isinstance(value, tf.__internal__.tracking.Trackable):
+            layer._track_trackable(value, name=name)
+        layer._serialized_attributes[name] = value
+
+    elif (
+        isinstance(layer, functional_lib.Functional)
+        and re.match(r"^layer(_with_weights)?-[\d+]", name) is not None
+    ):
+        # Edges named "layer-n" or "layer_with_weights-n", which are tracked in
+        # network._track_layers, should not be added as an attribute. They
+        # should be temporarily added as a dependency so that checkpointed
+        # values can be restored. These dependencies are manually deleted in
+        # KerasObjectLoader.del_tracking.
+
+        # Set `overwrite=True` in the case that `layer` already tracks a
+        # different layer-n. This may cause variable values to not be loaded
+        # properly in the original layer-n, but we already warn the users about
+        # this (ctrl-f "shared between different layers/models").
+        layer._track_trackable(value, name, overwrite=True)
+    elif getattr(layer, name, None) is not None:
+        # Don't overwrite already defined attributes.
+        pass
+    else:
+        setattr(layer, name, value)
+
+
+class RevivedInputLayer:
+    """InputLayer loaded from a SavedModel."""
+
+    @classmethod
+    def _init_from_metadata(cls, metadata):
+        """Revives the saved InputLayer from the Metadata."""
+        init_args = dict(
+            name=metadata["name"],
+            dtype=metadata["dtype"],
+            sparse=metadata["sparse"],
+            ragged=metadata["ragged"],
+            batch_input_shape=metadata["batch_input_shape"],
+        )
+        revived_obj = cls(**init_args)
+        with utils.no_automatic_dependency_tracking_scope(revived_obj):
+            revived_obj._config = metadata["config"]
+
+        return revived_obj, setattr
+
+    def get_config(self):
+        return self._config
+
+
+def recursively_deserialize_keras_object(config, module_objects=None):
+    """Deserialize Keras object from a nested structure."""
+    if isinstance(config, dict):
+        if "class_name" in config:
+            return serialization.deserialize_keras_object(
+                config, module_objects=module_objects
+            )
+        else:
+            return {
+                key: recursively_deserialize_keras_object(
+                    config[key], module_objects
+                )
+                for key in config
+            }
+    elif isinstance(config, (tuple, list)):
+        return [
+            recursively_deserialize_keras_object(x, module_objects)
+            for x in config
+        ]
+    else:
+        raise ValueError(
+            "Unable to decode Keras layer config. Config should be a "
+            f"dictionary, tuple or list. Received: config={config}"
+        )
+
+
+def infer_inputs_from_restored_call_function(fn):
+    """Returns TypeSpec of inputs from a restored call function.
+
+    Args:
+      fn: Restored layer call function. It is assumed that `fn` has at least one
+        concrete function and that the inputs are in the first argument.
+
+    Returns:
+      TypeSpec of call function inputs in the form of (args, kwargs)
+    """
+
+    def common_spec(x, y):
+        if not isinstance(x, tf.TypeSpec):
+            # Doesn't particularly matter what is returned in this case because
+            # the result will be filtered out in _set_input_shape.
+            return x
+
+        result = x._without_tensor_names().most_specific_common_supertype(
+            [y._without_tensor_names()]
+        )
+        if result is None:
+            # Please file a bug if you are being hindered by this error.
+            raise TypeError(f"No common supertype of {x} and {y}.")
+        return result
+
+    spec = fn.concrete_functions[0].structured_input_signature
+    for concrete in fn.concrete_functions[1:]:
+        spec2 = concrete.structured_input_signature
+        spec = tf.nest.map_structure(common_spec, spec, spec2)
+    return spec
+
+
+class RevivedNetwork(RevivedLayer):
+    """Keras network of layers loaded from a SavedModel."""
+
+    @classmethod
+    def _init_from_metadata(cls, metadata):
+        """Create revived network from metadata stored in the SavedModel
+        proto."""
+        revived_obj = cls(name=metadata["name"])
+
+        # Store attributes revived from SerializedAttributes in an untracked
+        # dictionary. The attributes are the ones listed in CommonEndpoints or
+        # "keras_api" for keras-specific attributes.
+        with utils.no_automatic_dependency_tracking_scope(revived_obj):
+
+            revived_obj._call_spec.expects_training_arg = metadata[
+                "expects_training_arg"
+            ]
+            config = metadata.get("config")
+            if serialization.validate_config(config):
+                revived_obj._config = config
+
+            if metadata.get("activity_regularizer") is not None:
+                revived_obj.activity_regularizer = regularizers.deserialize(
+                    metadata["activity_regularizer"]
+                )
+            if metadata.get("autocast") is not None:
+                revived_obj._autocast = metadata["autocast"]
+
+        return revived_obj, _revive_setter
+
+
+def _set_network_attributes_from_metadata(revived_obj):
+    """Sets attributes recorded in the metadata."""
+    with utils.no_automatic_dependency_tracking_scope(revived_obj):
+
+        metadata = revived_obj._serialized_attributes["metadata"]
+        if metadata.get("dtype") is not None:
+            revived_obj._set_dtype_policy(metadata["dtype"])
+        revived_obj._trainable = metadata["trainable"]
+
+
+def _maybe_add_serialized_attributes(layer, metadata):
+    # Store attributes revived from SerializedAttributes in an untracked
+    # dictionary. The attributes are the ones listed in CommonEndpoints or
+    # "keras_api" for keras-specific attributes.
+    if not hasattr(layer, "_serialized_attributes"):
+        with utils.no_automatic_dependency_tracking_scope(layer):
+            layer._serialized_attributes = {"metadata": metadata}
+
+
+def _get_keras_attr(layer):
+    return getattr(layer, "_serialized_attributes", {}).get(
+        constants.KERAS_ATTR, None
+    )
diff --git a/keras/saving/saved_model/load_context.py b/keras/saving/legacy/saved_model/load_context.py
similarity index 50%
rename from keras/saving/saved_model/load_context.py
rename to keras/saving/legacy/saved_model/load_context.py
index dd9d06c443d5..7e4d1d1b74e8 100644
--- a/keras/saving/saved_model/load_context.py
+++ b/keras/saving/legacy/saved_model/load_context.py
@@ -17,28 +17,30 @@
 import contextlib
 import threading
 
+import tensorflow.compat.v2 as tf
+
 
 class LoadContext(threading.local):
-  """A context for loading a model."""
+    """A context for loading a model."""
 
-  def __init__(self):
-    super().__init__()
-    self._entered_load_context = []
-    self._load_options = None
+    def __init__(self):
+        super().__init__()
+        self._entered_load_context = []
+        self._load_options = None
 
-  def set_load_options(self, load_options):
-    self._load_options = load_options
-    self._entered_load_context.append(True)
+    def set_load_options(self, load_options):
+        self._load_options = load_options
+        self._entered_load_context.append(True)
 
-  def clear_load_options(self):
-    self._load_options = None
-    self._entered_load_context.pop()
+    def clear_load_options(self):
+        self._load_options = None
+        self._entered_load_context.pop()
 
-  def load_options(self):
-    return self._load_options
+    def load_options(self):
+        return self._load_options
 
-  def in_load_context(self):
-    return self._entered_load_context
+    def in_load_context(self):
+        return self._entered_load_context
 
 
 _load_context = LoadContext()
@@ -46,18 +48,21 @@ def in_load_context(self):
 
 
 @contextlib.contextmanager
 def load_context(load_options):
-  _load_context.set_load_options(load_options)
-  try:
-    yield
-  finally:
-    _load_context.clear_load_options()
+    _load_context.set_load_options(load_options)
+    try:
+        yield
+    finally:
+        _load_context.clear_load_options()
 
 
 def get_load_options():
-  """Returns the load options under a load context."""
-  return _load_context.load_options()
+    """Returns the load options under a load context."""
+    return _load_context.load_options()
 
 
 def in_load_context():
-  """Returns whether under 
a load context.""" - return _load_context.in_load_context() + """Returns whether under a load context.""" + return _load_context.in_load_context() + + +tf.__internal__.register_load_context_function(in_load_context) diff --git a/keras/saving/legacy/saved_model/metric_serialization.py b/keras/saving/legacy/saved_model/metric_serialization.py new file mode 100644 index 000000000000..4d032ca28cab --- /dev/null +++ b/keras/saving/legacy/saved_model/metric_serialization.py @@ -0,0 +1,47 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Classes and functions implementing Metrics SavedModel serialization.""" + +import tensorflow.compat.v2 as tf + +from keras.saving import object_registration +from keras.saving.legacy.saved_model import constants +from keras.saving.legacy.saved_model import layer_serialization + + +class MetricSavedModelSaver(layer_serialization.LayerSavedModelSaver): + """Metric serialization.""" + + @property + def object_identifier(self): + return constants.METRIC_IDENTIFIER + + def _python_properties_internal(self): + metadata = dict( + class_name=object_registration.get_registered_name(type(self.obj)), + name=self.obj.name, + dtype=self.obj.dtype, + ) + metadata.update(layer_serialization.get_serialized(self.obj)) + if self.obj._build_input_shape is not None: + metadata["build_input_shape"] = self.obj._build_input_shape + return metadata + + def _get_serialized_attributes_internal(self, unused_serialization_cache): + return ( + dict(variables=tf.__internal__.tracking.wrap(self.obj.variables)), + # TODO(b/135550038): save functions to enable saving custom metrics. + {}, + ) diff --git a/keras/saving/legacy/saved_model/model_serialization.py b/keras/saving/legacy/saved_model/model_serialization.py new file mode 100644 index 000000000000..991b92d92350 --- /dev/null +++ b/keras/saving/legacy/saved_model/model_serialization.py @@ -0,0 +1,67 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Classes and functions implementing Model SavedModel serialization."""
+
+from keras.saving.legacy import saving_utils
+from keras.saving.legacy.saved_model import constants
+from keras.saving.legacy.saved_model import layer_serialization
+from keras.saving.legacy.saved_model import save_impl
+
+
+class ModelSavedModelSaver(layer_serialization.LayerSavedModelSaver):
+    """Model SavedModel serialization."""
+
+    @property
+    def object_identifier(self):
+        return constants.MODEL_IDENTIFIER
+
+    def _python_properties_internal(self):
+        metadata = super()._python_properties_internal()
+        # Network stateful property is dependent on the child layers.
+        metadata.pop("stateful")
+        metadata["is_graph_network"] = self.obj._is_graph_network
+        spec = self.obj.save_spec(dynamic_batch=False)
+        metadata["full_save_spec"] = spec
+        # save_spec is saved for forward compatibility on older TF versions.
+        metadata["save_spec"] = None if spec is None else spec[0][0]
+
+        metadata.update(
+            saving_utils.model_metadata(
+                self.obj, include_optimizer=True, require_config=False
+            )
+        )
+        return metadata
+
+    def _get_serialized_attributes_internal(self, serialization_cache):
+        default_signature = None
+
+        # Create a default signature function if this is the only object in the
+        # cache (i.e. this is the root level object).
+        if len(serialization_cache[constants.KERAS_CACHE_KEY]) == 1:
+            default_signature = save_impl.default_save_signature(self.obj)
+
+        # Other than the default signature function, all other attributes match
+        # with the ones serialized by Layer.
+        objects, functions = super()._get_serialized_attributes_internal(
+            serialization_cache
+        )
+        functions["_default_save_signature"] = default_signature
+        return objects, functions
+
+
+class SequentialSavedModelSaver(ModelSavedModelSaver):
+    @property
+    def object_identifier(self):
+        return constants.SEQUENTIAL_IDENTIFIER
diff --git a/keras/saving/saved_model/network_serialization.py b/keras/saving/legacy/saved_model/network_serialization.py
similarity index 79%
rename from keras/saving/saved_model/network_serialization.py
rename to keras/saving/legacy/saved_model/network_serialization.py
index 6e8e12e8168a..dfc2ba33531f 100644
--- a/keras/saving/saved_model/network_serialization.py
+++ b/keras/saving/legacy/saved_model/network_serialization.py
@@ -14,14 +14,14 @@
 # ==============================================================================
 """Classes and functions implementing Network SavedModel serialization."""
 
-from keras.saving.saved_model import constants
-from keras.saving.saved_model import model_serialization
+from keras.saving.legacy.saved_model import constants
+from keras.saving.legacy.saved_model import model_serialization
 
 
 # FunctionalModel serialization is pretty much the same as Model serialization.
 class NetworkSavedModelSaver(model_serialization.ModelSavedModelSaver):
-  """Network serialization."""
+    """Network serialization."""
 
-  @property
-  def object_identifier(self):
-    return constants.NETWORK_IDENTIFIER
+    @property
+    def object_identifier(self):
+        return constants.NETWORK_IDENTIFIER
diff --git a/keras/saving/legacy/saved_model/order_preserving_set.py b/keras/saving/legacy/saved_model/order_preserving_set.py
new file mode 100644
index 000000000000..f2479381534a
--- /dev/null
+++ b/keras/saving/legacy/saved_model/order_preserving_set.py
@@ -0,0 +1,93 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A set based on dict so that it preserves key insertion order. + +Python Dicts are order-preserving since 3.6 +(https://mail.python.org/pipermail/python-dev/2017-December/151283.html), +but sets are not. This class implements a set on top of a dict so that we get +deterministic iteration order across runs. +""" + +import collections.abc + + +class OrderPreservingSet(collections.abc.MutableSet): + """A set based on dict so that it preserves key insertion order.""" + + def __init__(self, iterable=None): + self._dict = {item: None for item in (iterable or [])} + + # abstract from collections.MutableSet + def __len__(self): + return len(self._dict) + + # abstract from collections.MutableSet + def __contains__(self, value): + return value in self._dict + + # override from collections.MutableSet + def __iter__(self): + return iter(self._dict) + + # abstract from collections.MutableSet + def add(self, item): + self._dict[item] = None + + # abstract from collections.MutableSet + def discard(self, value): + del self._dict[value] + + # override from collections.MutableSet + def clear(self): + self._dict = {} + + # override from collections.Set + def __eq__(self, other): + if not isinstance(other, OrderPreservingSet): + return NotImplemented + return self._dict.keys() == other._dict.keys() + + # override from collections.Set + def __le__(self, other): + if not isinstance(other, OrderPreservingSet): + return NotImplemented + return self._dict.keys() <= other._dict.keys() + + # override from collections.Set + def __ge__(self, other): + if not isinstance(other, OrderPreservingSet): + return NotImplemented + return self._dict.keys() >= other._dict.keys() + + # override from collections.Set + def __and__(self, other): + # collections.Set defaults to the ordering in other, we want to use self + return self._from_iterable(value for value in self if value in other) + + # override from collections.Set + def __or__(self, other): + # ensure that other is ordered before performing __or__ + if not isinstance(other, OrderPreservingSet): + raise TypeError( + "cannot union an 'OrderPreservingSet' with an " + "unordered iterable." + ) + result = self._from_iterable(value for value in self) + for value in other: + result._dict[value] = None + return result + + def union(self, other): + return self | other diff --git a/keras/saving/legacy/saved_model/revive_test.py b/keras/saving/legacy/saved_model/revive_test.py new file mode 100644 index 000000000000..4a134fc82fdc --- /dev/null +++ b/keras/saving/legacy/saved_model/revive_test.py @@ -0,0 +1,458 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests reviving models from config and SavedModel.
+
+These tests ensure that a model revived from a combination of config and
+SavedModel has the expected structure.
+"""
+
+# TODO(kathywu): Move relevant tests from saved_model_test to
+import shutil
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
+
+import keras
+from keras import backend
+from keras.saving.legacy.saved_model import load as keras_load
+from keras.testing_infra import test_combinations
+from keras.testing_infra import test_utils
+from keras.utils import CustomObjectScope
+
+
+class SubclassedModelNoConfig(keras.Model):
+    def __init__(self, a, b):
+        super().__init__()
+
+        self.a = a
+        self.b = b
+        self.shared = CustomLayerNoConfig(a, b)
+        self.all_layers = []
+
+    def build(self, input_shape):
+        self.all_layers.extend(
+            [
+                self.shared,
+                CustomLayerWithConfig(self.a + 1, self.b + 2),
+                CustomLayerNoConfig(self.a + 3, self.b + 4),
+                keras.Sequential(
+                    [
+                        # TODO(b/145029112): Bug with losses when there are
+                        # shared layers. self.shared, <-- Enable when bug is
+                        # fixed.
+                        CustomLayerNoConfig(self.a + 5, self.b + 6)
+                    ]
+                ),
+            ]
+        )
+        super().build(input_shape)
+
+    def call(self, inputs):
+        x = inputs
+        for layer in self.all_layers:
+            x = layer(x)
+        return x
+
+
+class SparseDense(keras.layers.Dense):
+    def call(self, inputs):
+        input_shape = tf.stack(
+            (tf.reduce_prod(tf.shape(inputs)[:-1]), self.kernel.shape[0])
+        )
+        output_shape = tf.concat(
+            (tf.shape(inputs)[:-1], [self.kernel.shape[1]]), -1
+        )
+        x = tf.sparse.reshape(inputs, input_shape)
+        return tf.reshape(
+            self.activation(
+                tf.sparse.sparse_dense_matmul(x, self.kernel) + self.bias
+            ),
+            output_shape,
+        )
+
+
+class SubclassedSparseModelNoConfig(keras.Model):
+    def __init__(self, a, b):
+        super().__init__()
+        self.a = a
+        self.shared = CustomLayerNoConfig(a, b)
+        self.all_layers = [SparseDense(4)]
+
+    def call(self, inputs):
+        x = inputs
+        for layer in self.all_layers:
+            x = layer(x)
+        return self.shared(x + self.a)
+
+
+class SubclassedModelWithConfig(SubclassedModelNoConfig):
+    def get_config(self):
+        return {"a": self.a, "b": self.b}
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(**config)
+
+
+class CustomLayerNoConfig(keras.layers.Layer):
+    def __init__(self, a, b, name=None):
+        super().__init__(name=name)
+        self.a = tf.Variable(a, name="a")
+        self.b = b
+
+        def a_regularizer():
+            return self.a * 2
+
+        self.add_loss(a_regularizer)
+        self.sum_metric = keras.metrics.Sum(name="inputs_sum")
+        self.unused_metric = keras.metrics.Sum(name="not_added_to_metrics")
+
+    def build(self, input_shape):
+        self.c = tf.Variable(
+            tf.constant(1.0, shape=input_shape[1:]), name=self.name + "_c"
+        )
+
+    def call(self, inputs):
+        self.add_loss(tf.reduce_sum(inputs))
+        self.add_metric(self.sum_metric(inputs))
+        self.add_metric(inputs, aggregation="mean", name="mean")
+
+        return inputs + self.c
+
+
+class CustomLayerWithConfig(CustomLayerNoConfig):
+    def get_config(self):
+        return {"a": backend.get_value(self.a), "b": 
self.b, "name": self.name} + + +class CustomNetworkDefaultConfig(keras.Model): + def __init__(self, num_classes, name=None): + inputs = keras.Input((2, 3), name="inputs") + x = keras.layers.Flatten(name="flatten")(inputs) + y = keras.layers.Dense(num_classes, name="outputs")(x) + super().__init__(inputs, y, name=name) + + +class CustomNetworkWithConfig(CustomNetworkDefaultConfig): + def __init__(self, num_classes, name=None): + super().__init__(num_classes, name=name) + self._config_dict = dict(num_classes=num_classes) + + def get_config(self): + return self._config_dict + + @classmethod + def from_config(cls, config): + return cls(config["num_classes"], name=config.get("name")) + + +class CustomNetworkWithConfigName(CustomNetworkWithConfig): + def __init__(self, num_classes, name=None): + super().__init__(num_classes, name=name) + self._config_dict["name"] = self.name + + +class UnregisteredCustomSequentialModel(keras.Sequential): + # This class is *not* registered in the CustomObjectScope. + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.add(keras.layers.InputLayer(input_shape=(2, 3))) + + +class FunctionalSubclassModel(keras.Model): + def __init__(self, units): + self.units = units + my_input = keras.Input(shape=(2, 3), name="inputs") + dense = keras.layers.Dense(self.units, activation="relu", name="dense") + output = dense(my_input) + outputs = {"output": output} + super().__init__(inputs=[my_input], outputs=outputs) + + def get_config(self): + return {"units": self.units} + + +class FunctionalSubclassModelWrongConfig(FunctionalSubclassModel): + def get_config(self): + return {} + + +# The WideDeepModel, whose name conflicts with a Keras built-in model, is +# registered in these tests. +class WideDeepModel(SubclassedModelWithConfig): + pass + + +class ReviveTestBase(test_combinations.TestCase): + def setUp(self): + super().setUp() + self.path = self.get_temp_dir() + self.addCleanup(shutil.rmtree, self.path, ignore_errors=True) + + def _assert_revived_correctness(self, model, revived): + self.assertAllEqual(model.input_names, revived.input_names) + self.assertAllEqual(model.output_names, revived.output_names) + if model.inputs is not None: + self.assertTrue( + all( + [ + i.shape.as_list() == r.shape.as_list() + and i.dtype == r.dtype + for (i, r) in zip(model.inputs, revived.inputs) + ] + ) + ) + self.assertTrue( + all( + [ + i.shape.as_list() == r.shape.as_list() + and i.dtype == r.dtype + for (i, r) in zip(model.outputs, revived.outputs) + ] + ) + ) + + self.assertAllClose( + self.evaluate(model.weights), self.evaluate(revived.weights) + ) + input_arr = tf.constant(np.random.random((2, 2, 3)).astype(np.float32)) + if isinstance(revived.save_spec()[0][0], tf.SparseTensorSpec): + input_arr = tf.sparse.from_dense(input_arr) + + self.assertAllClose(model(input_arr), revived(input_arr)) + self.assertAllClose(sum(model.losses), sum(revived.losses)) + self.assertAllClose(len(model.losses), len(revived.losses)) + self.assertEqual(len(model.metrics), len(revived.metrics)) + # TODO(b/150403085): Investigate why the metric order changes when + # running this test in tf-nightly. 
+        self.assertAllClose(
+            sorted([m.result() for m in model.metrics]),
+            sorted([m.result() for m in revived.metrics]),
+        )
+        model_layers = {layer.name: layer for layer in model.layers}
+        revived_layers = {layer.name: layer for layer in revived.layers}
+        self.assertAllEqual(model_layers.keys(), revived_layers.keys())
+
+        for name in model_layers:
+            model_layer = model_layers[name]
+            revived_layer = revived_layers[name]
+            self.assertEqual(model_layer.name, revived_layer.name)
+            self.assertEqual(model_layer.dtype, revived_layer.dtype)
+            self.assertEqual(model_layer.trainable, revived_layer.trainable)
+            if "WithConfig" in type(model_layer).__name__:
+                self.assertEqual(type(model_layer), type(revived_layer))
+            else:
+                # When loading layers from SavedModel, a new class is
+                # dynamically created with the same name.
+                self.assertEqual(
+                    type(model_layer).__name__, type(revived_layer).__name__
+                )
+
+
+# These tests take a while to run, so each should run in a separate shard
+# (hence they are split into separate TestCase classes).
+class TestBigModelRevive(ReviveTestBase):
+    @test_combinations.run_with_all_model_types
+    def test_revive(self):
+        input_shape = None
+        if test_utils.get_model_type() == "functional":
+            input_shape = (2, 3)
+
+        layer_with_config = CustomLayerWithConfig(1.0, 2)
+        layer_without_config = CustomLayerNoConfig(3.0, 4)
+        subclassed_with_config = SubclassedModelWithConfig(4.0, 6.0)
+        subclassed_without_config = SubclassedModelNoConfig(7.0, 8.0)
+
+        inputs = keras.Input((2, 3))
+        x = CustomLayerWithConfig(1.0, 2)(inputs)
+        x = CustomLayerNoConfig(3.0, 4)(x)
+        x = SubclassedModelWithConfig(4.0, 6.0)(x)
+        x = SubclassedModelNoConfig(7.0, 8.0)(x)
+        inner_model_functional = keras.Model(inputs, x)
+
+        inner_model_sequential = keras.Sequential(
+            [
+                CustomLayerWithConfig(1.0, 2),
+                CustomLayerNoConfig(3.0, 4),
+                SubclassedModelWithConfig(4.0, 6.0),
+                SubclassedModelNoConfig(7.0, 8.0),
+            ]
+        )
+
+        class SubclassedModel(keras.Model):
+            def __init__(self):
+                super().__init__()
+                self.all_layers = [
+                    CustomLayerWithConfig(1.0, 2),
+                    CustomLayerNoConfig(3.0, 4),
+                    SubclassedModelWithConfig(4.0, 6.0),
+                    SubclassedModelNoConfig(7.0, 8.0),
+                ]
+
+            def call(self, inputs):
+                x = inputs
+                for layer in self.all_layers:
+                    x = layer(x)
+                return x
+
+        inner_model_subclassed = SubclassedModel()
+
+        layers = [
+            layer_with_config,
+            layer_without_config,
+            subclassed_with_config,
+            subclassed_without_config,
+            inner_model_functional,
+            inner_model_sequential,
+            inner_model_subclassed,
+        ]
+        model = test_utils.get_model_from_layers(
+            layers, input_shape=input_shape
+        )
+        # Run data through the Model to create save spec and weights.
+        model.predict(np.ones((10, 2, 3)), batch_size=10)
+
+        # Test that the correct checkpointed values are loaded, whether the
+        # layer is created from the config or SavedModel.
+        layer_with_config.c.assign(2 * layer_with_config.c)
+        layer_without_config.c.assign(3 * layer_without_config.c)
+
+        model.save(self.path, save_format="tf")
+        revived = keras_load.load(self.path)
+        self._assert_revived_correctness(model, revived)
+
+
+class TestModelRevive(ReviveTestBase):
+    def test_revive_subclassed_with_nested_model(self):
+        model = SubclassedModelNoConfig(1.0, 2.0)
+        # Run data through the Model to create save spec and weights.
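+        # A subclassed model has no static input signature, so without this
+        # forward pass the TF-format save below would raise a ValueError
+        # (the model's inputs have not been defined).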
+ model.predict(np.ones((10, 2, 3)), batch_size=10) + model.save(self.path, save_format="tf") + revived = keras_load.load(self.path) + self._assert_revived_correctness(model, revived) + + def test_revive_subclassed_with_sparse_model(self): + model = SubclassedSparseModelNoConfig(1.0, 2.0) + # Run data through the Model to create save spec and weights. + x = tf.sparse.from_dense(np.ones((10, 2, 3), dtype=np.float32)) + model.predict(x, batch_size=10) + model.save(self.path, save_format="tf") + revived = keras_load.load(self.path) + self._assert_revived_correctness(model, revived) + + def test_revive_unregistered_sequential(self): + model = UnregisteredCustomSequentialModel() + x = np.random.random((2, 2, 3)).astype(np.float32) + model(x) + model.save(self.path, save_format="tf") + revived = keras_load.load(self.path) + self._assert_revived_correctness(model, revived) + + def test_revive_sequential_inputs(self): + model = keras.models.Sequential( + [ + keras.Input((None,), dtype=tf.string), + keras.layers.Lambda(tf.strings.lower), + ] + ) + model.save(self.path, save_format="tf") + revived = keras_load.load(self.path) + revived_layers = list( + revived._flatten_layers(include_self=False, recursive=False) + ) + self.assertEqual(tf.string, revived_layers[0].dtype) + + @parameterized.named_parameters( + ("default_config", CustomNetworkDefaultConfig), + ("with_config", CustomNetworkWithConfig), + ("with_config_name", CustomNetworkWithConfigName), + ) + def test_revive_network(self, model_cls): + model = model_cls(8) + model.save(self.path, include_optimizer=False, save_format="tf") + revived = keras_load.load(self.path, compile=False) + self._assert_revived_correctness(model, revived) + + def test_functional_subclass(self): + model = FunctionalSubclassModel(32) + model.save(self.path, save_format="tf") + revived = keras_load.load(self.path, compile=False) + self._assert_revived_correctness(model, revived) + + def test_functional_subclass_wrong_config(self): + model = FunctionalSubclassModelWrongConfig(32) + model.save(self.path, save_format="tf") + with self.assertRaisesRegex(TypeError, "required positional arguments"): + keras_load.load(self.path, compile=False) + + def test_load_compiled_metrics(self): + model = test_utils.get_small_sequential_mlp(1, 3) + + # Compile with dense categorical accuracy + model.compile("rmsprop", "mse", "acc") + x = np.random.random((5, 10)).astype(np.float32) + y_true = np.random.random((5, 3)).astype(np.float32) + model.train_on_batch(x, y_true) + + model.save(self.path, include_optimizer=True, save_format="tf") + revived = keras_load.load(self.path, compile=True) + self.assertAllClose( + model.test_on_batch(x, y_true), revived.test_on_batch(x, y_true) + ) + + # Compile with sparse categorical accuracy + model.compile("rmsprop", "mse", "acc") + y_true = np.random.randint(0, 3, (5, 1)).astype(np.float32) + model.train_on_batch(x, y_true) + model.save(self.path, include_optimizer=True, save_format="tf") + revived = keras_load.load(self.path, compile=True) + self.assertAllClose( + model.test_on_batch(x, y_true), revived.test_on_batch(x, y_true) + ) + + def test_revived_model_has_save_spec(self): + model = SubclassedModelWithConfig(2, 3) + model.predict(np.random.random((5, 10)).astype(np.float32)) + model.save(self.path, save_format="tf") + revived = keras_load.load(self.path, compile=True) + self.assertAllEqual( + model._get_save_spec(dynamic_batch=False), + revived._get_save_spec(dynamic_batch=False), + ) + + def 
test_load_model_with_name_conflict_registered_works(self): + model = WideDeepModel(2, 3) + model(np.random.random((5, 10)).astype(np.float32)) + model.save(self.path, save_format="tf") + keras_load.load(self.path, compile=True) + + +if __name__ == "__main__": + tf.compat.v1.enable_eager_execution() + with CustomObjectScope( + { + "CustomLayerWithConfig": CustomLayerWithConfig, + "CustomNetworkWithConfig": CustomNetworkWithConfig, + "CustomNetworkWithConfigName": CustomNetworkWithConfigName, + "SubclassedModelWithConfig": SubclassedModelWithConfig, + "FunctionalSubclassModel": FunctionalSubclassModel, + "FunctionalSubclassModelWrongConfig": FunctionalSubclassModelWrongConfig, # noqa: E501 + "WideDeepModel": WideDeepModel, + } + ): + tf.test.main() diff --git a/keras/saving/legacy/saved_model/save.py b/keras/saving/legacy/saved_model/save.py new file mode 100644 index 000000000000..9126275cf3b3 --- /dev/null +++ b/keras/saving/legacy/saved_model/save.py @@ -0,0 +1,157 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras legacy SavedModel saving.""" + +import os + +import tensorflow.compat.v2 as tf +from absl import logging + +from keras import backend +from keras.protobuf import saved_metadata_pb2 +from keras.protobuf import versions_pb2 +from keras.saving.legacy import saving_utils +from keras.saving.legacy import serialization +from keras.saving.legacy.saved_model import constants +from keras.saving.legacy.saved_model import save_impl +from keras.saving.legacy.saved_model import utils +from keras.utils.generic_utils import LazyLoader +from keras.utils.io_utils import ask_to_proceed_with_overwrite + +# isort: off +from tensorflow.python.saved_model import save as save_lib + +# To avoid circular dependencies between keras/engine and keras/saving, +# code in keras/saving must delay imports. + +base_layer = LazyLoader("base_layer", globals(), "keras.engine.base_layer") +training_lib = LazyLoader("training_lib", globals(), "keras.engine.training") + + +def save( + model, + filepath, + overwrite, + include_optimizer, + signatures=None, + options=None, + save_traces=True, +): + """Saves a model as a SavedModel to the filepath. + + Args: + model: Keras model instance to be saved. + filepath: String path to save the model. + overwrite: whether to overwrite the existing filepath. + include_optimizer: If True, save the model's optimizer state. + signatures: Signatures to save with the SavedModel. Applicable to the 'tf' + format only. Please see the `signatures` argument in + `tf.saved_model.save` for details. + options: (only applies to SavedModel format) `tf.saved_model.SaveOptions` + object that specifies options for saving to SavedModel. + save_traces: (only applies to SavedModel format) When enabled, the + SavedModel will store the function traces for each layer. This + can be disabled, so that only the configs of each layer are stored. 
+ Disabling this will decrease serialization time and file size, but + it requires that all custom layers/models implement a + `get_config()` method. Defaults to `True`. + + Raises: + ValueError: if the model's inputs have not been defined. + """ + # If file exists and should not be overwritten. + if not overwrite and os.path.exists(filepath): + proceed = ask_to_proceed_with_overwrite(filepath) + if not proceed: + return + + if save_traces: + if save_impl.should_skip_serialization(model): + saving_utils.raise_model_input_error(model) + + if not include_optimizer: + orig_optimizer = model.optimizer + model.optimizer = None + # TODO(b/180760306) Change to del model.optimizer if Layer's __delattr__ + # calls AutoTrackable's __delattr__. + model._delete_tracking("optimizer") + + # Trace all functions and signatures with `training=0` instead of using an + # already-set learning phase placeholder. + # This is needed for compatibility reasons until learning phase setting + # is removed from the public apis. + with serialization.SharedObjectSavingScope(): + with backend.deprecated_internal_learning_phase_scope(0): + with utils.keras_option_scope(save_traces): + saved_nodes, node_paths = save_lib.save_and_return_nodes( + model, filepath, signatures, options + ) + + # Save all metadata to a separate file in the SavedModel directory. + metadata = generate_keras_metadata(saved_nodes, node_paths) + + with tf.io.gfile.GFile( + tf.io.gfile.join(filepath, constants.SAVED_METADATA_PATH), "wb" + ) as w: + w.write(metadata.SerializeToString(deterministic=True)) + + if not include_optimizer: + model.optimizer = orig_optimizer + + +def generate_keras_metadata(saved_nodes, node_paths): + """Constructs a KerasMetadata proto with the metadata of each object.""" + metadata = saved_metadata_pb2.SavedMetadata() + for node_id, node in enumerate(saved_nodes): + if isinstance(node, base_layer.Layer): + path = node_paths[node] + if not path: + node_path = "root" + else: + node_path = f"root.{'.'.join([ref.name for ref in path])}" + + metadata.nodes.add( + node_id=node_id, + node_path=node_path, + version=versions_pb2.VersionDef( + producer=2, min_consumer=1, bad_consumers=[] + ), + identifier=node._object_identifier, + metadata=node._tracking_metadata, + ) + + # Log warning if the node's class name conflicts with a Keras + # built-in object. + class_name = node.__class__.__name__ + from keras.layers import serialization as layers_serialization + + builtin_layer = layers_serialization.get_builtin_layer(class_name) + if builtin_layer: + if not isinstance(node, builtin_layer): + logging.warning( + "%s has the same name '%s' as a built-in Keras " + "object. Consider renaming %s to avoid naming " + "conflicts when loading with " + "`tf.keras.models.load_model`. " + "If renaming is not possible, pass " + "the object in the `custom_objects` " + "parameter of the load " + "function.", + node, + class_name, + node.__class__, + ) + + return metadata diff --git a/keras/saving/legacy/saved_model/save_impl.py b/keras/saving/legacy/saved_model/save_impl.py new file mode 100644 index 000000000000..a3e769c47618 --- /dev/null +++ b/keras/saving/legacy/saved_model/save_impl.py @@ -0,0 +1,781 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras SavedModel serialization. + +TODO (kathywu): Move to layer_serialization.py. Some model-specific logic should +go to model_serialization.py. +""" + +import functools +import threading +import weakref + +import tensorflow.compat.v1.logging as logging +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.engine import base_layer_utils +from keras.engine import input_spec +from keras.mixed_precision import autocast_variable +from keras.saving.legacy import saving_utils +from keras.saving.legacy.saved_model import constants +from keras.saving.legacy.saved_model import load as keras_load +from keras.saving.legacy.saved_model import serialized_attributes +from keras.saving.legacy.saved_model import utils +from keras.utils import layer_utils +from keras.utils import tf_contextlib +from keras.utils import tf_utils +from keras.utils import version_utils +from keras.utils.generic_utils import LazyLoader + +# To avoid circular dependencies between keras/engine and keras/saving, +# code in keras/saving must delay imports. + +# TODO(b/134426265): Switch back to single-quotes to match the rest of the file +# once the issue with copybara is fixed. + +base_layer = LazyLoader("base_layer", globals(), "keras.engine.base_layer") +metrics = LazyLoader("metrics", globals(), "keras.metrics") +input_layer = LazyLoader("input_layer", globals(), "keras.engine.input_layer") +training_lib = LazyLoader("training_lib", globals(), "keras.engine.training") +sequential_lib = LazyLoader( + "sequential_lib", globals(), "keras.engine.sequential" +) + + +def should_skip_serialization(layer): + """Skip serializing extra objects and functions if layer inputs aren't + set.""" + saved_model_input_spec_set = ( + isinstance(layer, training_lib.Model) + and layer._saved_model_inputs_spec is not None + ) + if not layer.built and not saved_model_input_spec_set: + logging.warning( + "Skipping full serialization of Keras layer {}, because " + "it is not built.".format(layer) + ) + return True + return False + + +def _filter_shards(variables): + return [var for var in variables if not hasattr(var, "_sharded_container")] + + +def wrap_layer_objects(layer, serialization_cache): + """Returns extra trackable objects to attach to the serialized layer. + + Args: + layer: Keras Layer object. + serialization_cache: Dictionary shared between all objects during + serialization. + + Returns: + A dictionary containing all checkpointable objects from a + SerializedAttributes object. See LayerAttributes and ModelAttributes for + entire list of objects + """ + # Wrap all regularization losses as tf.functions. + # First, generate list of all regularization losses in this layer and + # sublayers. + all_losses = layer._callable_losses[:] + for child_layer in utils.list_all_layers(layer): + all_losses.extend(child_layer._callable_losses) + # Next, wrap all loss functions as tf.functions. Use the serialization cache + # to store already-wrapped functions. 
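+    # The cache is keyed by the original loss callable, so a function shared
+    # between layers is wrapped exactly once and both layers end up pointing
+    # at the same tf.function in the SavedModel.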
+ keras_loss_cache = serialization_cache.setdefault("keras_losses", {}) + wrapped_loss_functions = [] + for loss_fn in all_losses: + if loss_fn in keras_loss_cache: + wrapped_loss_functions.append(keras_loss_cache[loss_fn]) + else: + wrapped_loss = _wrap_unconditional_loss( + loss_fn, len(keras_loss_cache) + ) + keras_loss_cache[loss_fn] = wrapped_loss + wrapped_loss_functions.append(wrapped_loss) + wrapped_layer_losses = [ + keras_loss_cache[fn] for fn in layer._callable_losses[:] + ] + + layer_metrics = tf.__internal__.tracking.wrap( + {m.name: m for m in layer._metrics} + ) + + # Avoid duplicate creation of shard Variables on loading. + # `layer.variables` will return the shard Variables rather than the + # ShardedVariables (b/224541446), but Keras loading will create new + # ShardedVariables (and thus shard Variables) from Keras metadata if needed. + # There's no need to also save the shard Variables here, so filter them out. + variables = _filter_shards(layer.variables) + trainable_variables = _filter_shards(layer.trainable_variables) + non_trainable_variables = _filter_shards(layer.non_trainable_variables) + return dict( + variables=tf.__internal__.tracking.wrap(variables), + trainable_variables=tf.__internal__.tracking.wrap(trainable_variables), + non_trainable_variables=tf.__internal__.tracking.wrap( + non_trainable_variables + ), + layers=tf.__internal__.tracking.wrap(utils.list_all_layers(layer)), + metrics=tf.__internal__.tracking.wrap(layer.metrics), + regularization_losses=tf.__internal__.tracking.wrap( + wrapped_loss_functions + ), + layer_regularization_losses=tf.__internal__.tracking.wrap( + wrapped_layer_losses + ), + layer_metrics=layer_metrics, + ) + + +def wrap_layer_functions(layer, serialization_cache): + """Returns dict of wrapped layer call function and losses in tf.functions. + + Args: + layer: Keras Layer object. + serialization_cache: Dictionary shared between all objects during + serialization. + + Returns: + A dictionary containing all keras tf.functions to serialize. See + LayerAttributes and ModelAttributes for the list of all attributes. + """ + # Since Sequential models may be modified in place using model.add() or + # model.pop(), don't use saved functions. + if isinstance(layer, keras_load.RevivedLayer) and not isinstance( + layer, sequential_lib.Sequential + ): + return { + fn_name: getattr(layer.keras_api, fn_name, None) + for fn_name in serialized_attributes.LayerAttributes.all_functions + } + + # Reset the losses of the layer and its children. The call function in each + # child layer is replaced with tf.functions. + original_fns = _replace_child_layer_functions(layer, serialization_cache) + original_losses = _reset_layer_losses(layer) + + # Wrap all the layer call and activity regularizer functions. + + # Use LayerCallCollection to ensure that all layer call functions (__call__, + # call with losses) are traced with the same inputs. + call_collection = LayerCallCollection(layer) + call_fn_with_losses = call_collection.add_function( + _wrap_call_and_conditional_losses(layer), + f"{layer.name}_layer_call_and_return_conditional_losses", + # If any of this layer's child layers use the training arg, the traced + # call functions of this layer will have a training keyword argument. If + # the original layer does not expect the training arg, then it will have + # to be removed (by setting `match_layer_training_arg`). 
+ match_layer_training_arg=True, + ) + call_fn = call_collection.add_function( + _extract_outputs_from_fn(layer, call_fn_with_losses), + f"{layer.name}_layer_call_fn", + # Since `call_fn` wraps call_fn_with_losses and not the original call + # function, `match_layer_training_arg` should be set to False. + match_layer_training_arg=False, + ) + + fns = { + "call_and_return_conditional_losses": call_fn_with_losses, + "__call__": call_fn, + } + + if layer._activity_regularizer is not None: + fns["activity_regularizer_fn"] = _wrap_activity_regularizer(layer) + fns[ + "call_and_return_all_conditional_losses" + ] = call_collection.add_function( + _append_activity_regularizer_loss( + layer, call_fn_with_losses, fns["activity_regularizer_fn"] + ), + f"{layer.name}_layer_call_and_return_all_conditional_losses", + match_layer_training_arg=False, + ) + else: + fns["activity_regularizer_fn"] = None + fns["call_and_return_all_conditional_losses"] = call_fn_with_losses + + # Manually trigger traces before restoring the overwritten functions. The + # functions are traced within the layer call context to ensure that layer + # functions (e.g. add_loss) behave as though running in graph mode. + with tracing_scope(): + call_collection.trace_with_input_signature() + with base_layer_utils.call_context().enter( + layer, inputs=None, build_graph=True, training=None, saving=True + ): + for fn in fns.values(): + if fn is not None and not isinstance(fn, LayerCall): + fn.get_concrete_function() + + # Restore overwritten functions and losses + _restore_child_layer_functions(original_fns) + _restore_layer_losses(original_losses) + + return fns + + +def default_save_signature(layer): + original_losses = _reset_layer_losses(layer) + fn = saving_utils.trace_model_call(layer) + _restore_layer_losses(original_losses) + return fn + + +def _replace_child_layer_functions(layer, serialization_cache): + """Replaces functions in the children layers with wrapped tf.functions. + + This step allows functions from parent layers to reference the wrapped + functions from their children layers instead of retracing the ops. + + This function also resets all losses stored in the layer. These are stored + in the returned dictionary. Use `_restore_child_layer_functions` to restore + the original attributes. + + Args: + layer: Keras Layer object. + serialization_cache: Dictionary shared between all objects during + serialization. + + Returns: + Dictionary mapping layer objects -> original functions and losses: + { Child layer 1: { + 'losses': Original losses, + 'call': Original call function + '_activity_regularizer': Original activity regularizer}, + Child layer 2: ... + } + """ + + original_fns = {} + + def replace_layer_functions(child_layer, serialized_fns): + """Replaces layer call and activity regularizer with wrapped + functions.""" + original_fns[child_layer] = { + "call": child_layer.call, + "_activity_regularizer": child_layer._activity_regularizer, + } + with utils.no_automatic_dependency_tracking_scope(child_layer): + try: + child_layer._activity_regularizer = serialized_fns.get( + "activity_regularizer_fn" + ) + except AttributeError: + # Some layers have an unsettable activity regularizer. 
+ pass + child_layer.call = utils.use_wrapped_call( + child_layer, + serialized_fns["call_and_return_conditional_losses"], + child_layer._call_spec, + default_training_value=False, + ) + + def replace_metric_functions(child_layer, serialized_fns): + """Replaces metric functions with wrapped functions.""" + original_fns[child_layer] = { + "__call__": child_layer.__call__, + "result": child_layer.result, + "update_state": child_layer.update_state, + } + with utils.no_automatic_dependency_tracking_scope(child_layer): + child_layer.__call__ = serialized_fns["__call__"] + child_layer.result = serialized_fns["result"] + child_layer.update_state = serialized_fns["update_state"] + + for child_layer in utils.list_all_layers(layer): + if isinstance(child_layer, input_layer.InputLayer): + continue + + if child_layer not in serialization_cache[constants.KERAS_CACHE_KEY]: + serialized_functions = child_layer._trackable_saved_model_saver._get_serialized_attributes( # noqa: E501 + serialization_cache + ).functions + else: + serialized_functions = serialization_cache[ + constants.KERAS_CACHE_KEY + ][child_layer].functions + if not serialized_functions: + # This indicates either: + # - circular dependency, which means the current layer's functions + # should be wrapped first. + # - Child layer's inputs are not defined, so its functions have + # not been wrapped. In this case, no replacement is necessary so + # move on to the next child. + continue + + if isinstance(child_layer, metrics.Metric): + replace_metric_functions(child_layer, serialized_functions) + else: + replace_layer_functions(child_layer, serialized_functions) + + return original_fns + + +def _restore_child_layer_functions(original_fns): + """Restores attributes replaced with `_replace_child_layer_functions`.""" + for child_layer, fns in original_fns.items(): + with utils.no_automatic_dependency_tracking_scope(child_layer): + for fn_name, fn in fns.items(): + try: + setattr(child_layer, fn_name, fn) + except AttributeError: + # In the case of _activity_regularizer, setting the + # attribute may be disallowed. + pass + + +def _reset_layer_losses(parent_layer): + """Resets losses of layer and its sublayers, and returns original losses.""" + losses_dict = {} + for layer in utils.list_all_layers_and_sublayers(parent_layer): + losses_dict[layer] = { + "losses": layer._losses[:], + "eager_losses": layer._eager_losses[:], + } + with utils.no_automatic_dependency_tracking_scope(layer): + layer._losses = [] + layer._eager_losses = [] + return losses_dict + + +def _restore_layer_losses(losses_dict): + for layer in losses_dict: + with utils.no_automatic_dependency_tracking_scope(layer): + layer._losses = losses_dict[layer]["losses"] + layer._eager_losses = losses_dict[layer]["eager_losses"] + + +class LayerTracingContext(threading.local): + def __init__(self): + super().__init__() + self.enable_call_tracing = False + self.trace_queue = [] + + +_thread_local_data = LayerTracingContext() + + +@tf_contextlib.contextmanager +def tracing_scope(): + """Enables tracing scope.""" + # This enables the LayerCallCollection's tracing mechanism to trace all call + # functions in the collection. + previous_value = _thread_local_data.enable_call_tracing + previous_queue = _thread_local_data.trace_queue + try: + _thread_local_data.enable_call_tracing = True + _thread_local_data.trace_queue = [] + yield + finally: + # Run traces from the queue. 
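+        # The queue is drained FIFO; entries recorded with an explicit
+        # training value are traced under the matching learning-phase scope.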
+ while _thread_local_data.trace_queue: + fn, args, kwargs, training = _thread_local_data.trace_queue.pop(0) + if training is not None: + with backend.deprecated_internal_learning_phase_scope(training): + fn.get_concrete_function(*args, **kwargs) + else: + fn.get_concrete_function(*args, **kwargs) + _thread_local_data.trace_queue = previous_queue + _thread_local_data.enable_call_tracing = previous_value + + +def add_trace_to_queue(fn, args, kwargs, training=None): + if tracing_enabled(): + _thread_local_data.trace_queue.append( + (fn, args[:], kwargs.copy(), training) + ) + + +def tracing_enabled(): + """Whether to add extra traces to the queue.""" + return _thread_local_data.enable_call_tracing + + +class LayerCallCollection: + """Groups wrapped layer call functions. + + This is used to ensure that all layer call functions are traced with the + same inputs- + - call + - call_and_return_conditional_losses + - call_and_return_all_conditional_losses + """ + + def __init__(self, layer): + self.layer = layer + + self.layer_call_method = _get_layer_call_method(layer) + self._expects_training_arg = utils.layer_uses_training_bool(layer) + self._call_spec = layer._call_spec + + # Create new call spec if the layer itself does not accept a training + # arg, but one of its child layers does. When this layer's call + # functions are traced, they will be traced with an added `training` + # keyword argument. + if not self.layer._expects_training_arg and self._expects_training_arg: + arg_spec = utils.set_training_arg_spec( + self._call_spec.full_argspec, False + ) + self._call_spec = layer_utils.CallFunctionSpec(arg_spec) + + self._layer_inputs = self._get_layer_inputs(layer) + self._functions = weakref.WeakValueDictionary() + + # Get the input argument name from the args. + if self._call_spec.arg_names: + self._input_arg_name = self._call_spec.arg_names[0] + else: + # Layer could be defined with only varargs, in which case use a + # default name. + self._input_arg_name = "inputs" + + def _get_layer_inputs(self, layer): + """Inspects layer object and returns the inferred input signature. + + Args: + layer: Layer object. + + Returns: + List of possibly nested TensorSpecs of the layer call function inputs + in the form of `(args, kwargs)` + """ + if ( + isinstance(layer.call, tf.__internal__.function.Function) + and layer.call.input_signature is not None + ): + return layer.call.input_signature, {} + elif isinstance(layer, training_lib.Model): + return saving_utils.model_call_inputs(layer) + elif ( + layer.input_spec is not None + and layer._use_input_spec_as_call_signature + ): + + def to_tensor_spec_or_none(x): + spec = input_spec.to_tensor_spec(x, layer._compute_dtype) + # If the shape is too general (e.g. multiple dimensions are + # allowed), return None so that separate functions can be + # generated for each inferred input signature. + # TODO(b/134962016): currently partial signatures are not + # supported. + if spec.shape == tf.TensorShape(None): + return None, None + return spec + + input_signature = [ + tf.nest.map_structure(to_tensor_spec_or_none, layer.input_spec) + ] + + return input_signature, {} + else: + return None, None + + def add_trace(self, *args, **kwargs): + """Traces all functions with the same args and kwargs. + + Args: + *args: Positional args passed to the original function. + **kwargs: Keyword args passed to the original function. 
+ """ + args = list(args) + kwargs = kwargs.copy() + + for fn in self._functions.values(): + # TODO(kathywu): Replace arguments with broader shapes defined in + # the input signature. + if self._expects_training_arg: + + def trace_with_training(value, fn=fn): + nonlocal args, kwargs + (args, kwargs,) = self._call_spec.set_arg_value( + "training", value, args, kwargs, inputs_in_args=True + ) + add_trace_to_queue(fn, args, kwargs, value) + + trace_with_training(True) + trace_with_training(False) + else: + add_trace_to_queue(fn, args, kwargs) + + def training_arg_was_passed(self, args, kwargs): + return self._call_spec.arg_was_passed( + "training", args, kwargs, inputs_in_args=True + ) + + def get_training_arg_value(self, args, kwargs): + try: + return self._call_spec.get_arg_value( + "training", args, kwargs, inputs_in_args=True + ) + except KeyError: # Training is not in args or kwargs. + return None + + def get_input_arg_value(self, args, kwargs): + return self._call_spec.get_arg_value( + self._input_arg_name, args, kwargs, inputs_in_args=True + ) + + def _maybe_wrap_with_training_arg(self, call_fn, match_layer_training_arg): + """Wraps call function with added training argument if necessary.""" + if not self.layer._expects_training_arg and self._expects_training_arg: + # Add training arg to wrapper function. + def wrap_with_training_arg(*args, **kwargs): + if match_layer_training_arg: + # Remove the training value, since the original call_fn does + # not expect a training arg. Instead, the training value + # will be propagated using the call context created in + # LayerCall. + args = list(args) + kwargs = kwargs.copy() + (args, kwargs,) = self._call_spec.set_arg_value( + "training", + None, + args, + kwargs, + inputs_in_args=True, + pop_kwarg_if_none=True, + ) + return call_fn(*args, **kwargs) + + return tf.__internal__.decorator.make_decorator( + target=call_fn, + decorator_func=wrap_with_training_arg, + decorator_argspec=self._call_spec.full_argspec, + ) + + return call_fn + + def add_function(self, call_fn, name, match_layer_training_arg): + """Adds a layer call function to the collection. + + Args: + call_fn: a python function + name: Name of call function + match_layer_training_arg: If True, removes the `training` from the + function arguments when calling `call_fn`. + + Returns: + LayerCall (tf.function) + """ + fn = LayerCall( + self, + self._maybe_wrap_with_training_arg( + call_fn, match_layer_training_arg + ), + name, + ) + self._functions[name] = fn.wrapped_call + return fn + + def trace_with_input_signature(self): + """Trace with the layer/models inferred input signature if possible.""" + if self._layer_inputs[0] is None: + return + + args, kwargs = self._layer_inputs + if self._expects_training_arg: + args, kwargs = self._call_spec.set_arg_value( + "training", False, args, kwargs, inputs_in_args=True + ) + if None not in tf.nest.flatten([args, kwargs]): + # Manually add traces for layers that have keyword arguments and + # have a fully defined input signature. + self.add_trace(*args, **kwargs) + + +def _filtered_inputs(inputs): + return list(filter(tf_utils.is_tensor_or_variable, tf.nest.flatten(inputs))) + + +def layer_call_wrapper(call_collection, method, name): + """Ensures layer losses are kept the same, and runs method in call + context.""" + + # Create wrapper that deals with losses and call context. 
+    def wrapper(*args, **kwargs):
+        """Calls method within call context."""
+        layer = call_collection.layer
+        training = None
+        inputs = _filtered_inputs([args, kwargs])
+
+        if (args or kwargs) and call_collection.training_arg_was_passed(
+            args, kwargs
+        ):
+            training = call_collection.get_training_arg_value(args, kwargs)
+
+        original_losses = _reset_layer_losses(layer)
+        with base_layer_utils.call_context().enter(
+            layer,
+            inputs=inputs,
+            build_graph=False,
+            training=training,
+            saving=True,
+        ):
+            with autocast_variable.enable_auto_cast_variables(
+                layer._compute_dtype_object
+            ):
+                ret = method(*args, **kwargs)
+        _restore_layer_losses(original_losses)
+        return ret
+
+    # Rename to `name`, since tf.function doesn't have a name argument. Without
+    # this, all functions returned by this method will be named "call", which
+    # would be a nightmare to debug.
+    fn = tf.__internal__.decorator.make_decorator(
+        target=method, decorator_func=wrapper
+    )
+    fn.__name__ = name
+    return fn
+
+
+class LayerCall:
+    """Function that triggers traces of other functions in the same
+    collection."""
+
+    def __init__(self, call_collection, call_fn, name):
+        """Initializes a LayerCall object.
+
+        Args:
+            call_collection: a LayerCallCollection, which contains the other
+                layer call functions (e.g. call_with_conditional_losses,
+                call). These functions should be traced with the same
+                arguments.
+            call_fn: A call function.
+            name: Name of the call function.
+        """
+        self.call_collection = call_collection
+        self.wrapped_call = tf.function(
+            layer_call_wrapper(call_collection, call_fn, name)
+        )
+
+    def _maybe_trace(self, args, kwargs):
+        # Trigger traces of other call functions + extra training-arg traces.
+        if tracing_enabled():
+            self.call_collection.add_trace(*args, **kwargs)
+
+    def __call__(self, *args, **kwargs):
+        self._maybe_trace(args, kwargs)
+        return self.wrapped_call(*args, **kwargs)
+
+    def get_concrete_function(self, *args, **kwargs):
+        self._maybe_trace(args, kwargs)
+        return self.wrapped_call.get_concrete_function(*args, **kwargs)
+
+
+def _wrap_call_and_conditional_losses(layer):
+    """Wraps call function that returns a tuple of (outputs, losses).
+
+    The losses returned are conditional on the inputs passed to the call
+    function. Unconditional losses (e.g. weight regularization) are wrapped
+    separately.
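+
+    For example (illustrative): a loss added via
+    `layer.add_loss(tf.reduce_sum(inputs))` inside `call` depends on the
+    inputs and is returned by the wrapped function, while a zero-argument
+    callable such as `layer.add_loss(lambda: tf.reduce_sum(w))` is
+    unconditional and is handled by `_wrap_unconditional_loss`.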
+ + Args: + layer: a Keras layer object + + Returns: + python call function that returns outputs and conditional losses -- + excludes activity regularizer + """ + # Create function that generates both outputs and losses + layer_call = _get_layer_call_method(layer) + + def call_and_return_conditional_losses(*args, **kwargs): + """Returns layer (call_output, conditional losses) tuple.""" + call_output = layer_call(*args, **kwargs) + if version_utils.is_v1_layer_or_model(layer): + conditional_losses = layer.get_losses_for( + _filtered_inputs([args, kwargs]) + ) + else: + conditional_losses = [ + l for l in layer.losses if not hasattr(l, "_unconditional_loss") + ] + return call_output, conditional_losses + + return _create_call_fn_decorator(layer, call_and_return_conditional_losses) + + +def _extract_outputs_from_fn(layer, call_and_return_conditional_losses): + """Returns a function that returns only call function outputs.""" + if isinstance(layer, keras_load.RevivedLayer): + return layer.keras_api.__call__ + + def call(inputs, *args, **kwargs): + return call_and_return_conditional_losses(inputs, *args, **kwargs)[0] + + return _create_call_fn_decorator(layer, call) + + +def _append_activity_regularizer_loss( + layer, call_fn_with_losses, activity_regularizer_fn +): + """Appends activity regularizer loss to losses returned by the wrapped + fn.""" + + def fn(inputs, *args, **kwargs): + outputs, losses = call_fn_with_losses(inputs, *args, **kwargs) + losses.append(activity_regularizer_fn(outputs)) + return outputs, losses + + return _create_call_fn_decorator(layer, fn) + + +def _create_call_fn_decorator(layer, wrapped_call): + call_fn = _get_layer_call_method(layer) + fn, arg_spec = utils.maybe_add_training_arg( + layer._call_spec, + wrapped_call, + layer._expects_training_arg, + default_training_value=False, + ) + return tf.__internal__.decorator.make_decorator( + target=call_fn, decorator_func=fn, decorator_argspec=arg_spec + ) + + +def _wrap_unconditional_loss(loss_fn, index): + """Wraps callable/unconditional loss, returning a serializable function.""" + # Extract original loss function from partial function + fn = loss_fn.args[0] if isinstance(loss_fn, functools.partial) else loss_fn + if isinstance(fn, tf.__internal__.function.Function): + return fn + else: + return tf.__internal__.function.Function( + fn, f"loss_fn_{index}", input_signature=[] + ) + + +def _wrap_activity_regularizer(layer): + """Wraps the activity regularizer.""" + + if isinstance( + layer._activity_regularizer, tf.__internal__.function.Function + ): + return layer._activity_regularizer + return tf.__internal__.function.Function( + layer._activity_regularizer, + f"{layer.name}_activity_regularizer", + input_signature=[ + tf.TensorSpec(None, layer._compute_dtype or backend.floatx()) + ], + ) + + +def _get_layer_call_method(layer): + if isinstance(layer.call, (tf.__internal__.function.Function)): + return layer.call.python_function + return layer.call diff --git a/keras/saving/legacy/saved_model/saved_model_test.py b/keras/saving/legacy/saved_model/saved_model_test.py new file mode 100644 index 000000000000..7ae94743645d --- /dev/null +++ b/keras/saving/legacy/saved_model/saved_model_test.py @@ -0,0 +1,1630 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for saving and loading Keras models and layers from SavedModel. + +These should ensure that all layer properties are correctly assigned after +loading from the SavedModel. + +Tests that focus on the model structure should go in revive_test.py +""" + +import os +import shutil +import sys + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 + +import keras +from keras import regularizers +from keras.feature_column.dense_features import DenseFeatures +from keras.protobuf import saved_metadata_pb2 +from keras.protobuf import versions_pb2 +from keras.saving import object_registration +from keras.saving.legacy.saved_model import json_utils +from keras.saving.legacy.saved_model import load as keras_load +from keras.saving.legacy.saved_model import save_impl as keras_save +from keras.saving.legacy.saved_model import utils as saved_model_utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import control_flow_util +from keras.utils import tf_contextlib +from keras.utils import tf_inspect + + +class LayerWithLearningPhase(keras.engine.base_layer.Layer): + def build(self, input_shape): + self.input_spec = keras.layers.InputSpec( + shape=[None] * len(input_shape) + ) + self.built = True + + def call(self, x, training=None): + if training is None: + training = keras.backend.learning_phase() + output = control_flow_util.smart_cond( + training, lambda: x * 0, lambda: tf.identity(x) + ) + if not tf.executing_eagerly(): + output._uses_learning_phase = True + return output + + def compute_output_shape(self, input_shape): + return input_shape + + @property + def _use_input_spec_as_call_signature(self): + return True + + +class LayerWithLoss(keras.layers.Layer): + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + return inputs * 2 + + +class LayerWithUpdate(keras.layers.Layer): + def build(self, _): + self.v = self.add_weight( + "v", + shape=[], + initializer=keras.initializers.zeros, + trainable=False, + dtype=tf.float32, + ) + + def call(self, inputs, training=True): + if training: + self.add_update(self.v.assign_add(1.0)) + return inputs * 2.0 + + +@object_registration.register_keras_serializable("Testing") +class GlobalLayerThatShouldFailIfNotAdded(keras.layers.Layer): + _must_restore_from_config = True + + +@test_combinations.run_all_keras_modes +class TestSavedModelFormatAllModes(test_combinations.TestCase): + def _save_model_dir(self, dirname="saved_model"): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + return os.path.join(temp_dir, dirname) + + def _get_model(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.layers[-1].activity_regularizer = regularizers.get("l2") + model.activity_regularizer = regularizers.get("l2") + model.compile(loss="mse", optimizer="rmsprop") + + def callable_loss(): + return 
tf.reduce_sum(model.weights[0])
+
+        model.add_loss(callable_loss)
+        return model
+
+    def _train_model(self, model, use_dataset=False):
+        x = np.random.random((1, 3))
+        y = np.random.random((1, 4))
+
+        if not tf.__internal__.tf2.enabled():
+            # The layer autocast behavior only runs when autocast is enabled,
+            # so in V1, the numpy inputs still need to be cast to float32.
+            x = x.astype(np.float32)
+            y = y.astype(np.float32)
+
+        if use_dataset:
+            dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(1)
+            model.fit(dataset)
+        else:
+            model.train_on_batch(x, y)
+
+    def _save_and_load(self, model):
+        saved_model_dir = self._save_model_dir()
+        model.save(saved_model_dir, save_format="tf")
+        loaded = keras_load.load(saved_model_dir)
+        return loaded
+
+    def _test_evaluation(self, model, loaded):
+        # Assert that original and loaded models have the same results when
+        # called.
+        self.evaluate(tf.compat.v1.variables_initializer(loaded.variables))
+        self.assertAllClose(
+            self.evaluate(model.weights), self.evaluate(loaded.weights)
+        )
+
+        input_arr = tf.constant(np.random.random((1, 3)).astype(np.float32))
+        self.assertAllClose(
+            self.evaluate(model(input_arr)), self.evaluate(loaded(input_arr))
+        )
+        # Validate losses. The order of conditional losses may change between
+        # the model and loaded model, so sort the losses first.
+        if tf.executing_eagerly():
+            self.assertAllClose(
+                sorted(self.evaluate(model.losses)),
+                sorted(self.evaluate(loaded.losses)),
+            )
+
+    @test_combinations.run_with_all_model_types
+    def test_model_save_and_load(self):
+        model = self._get_model()
+        self._train_model(model, use_dataset=False)
+        loaded = self._save_and_load(model)
+        self._test_evaluation(model, loaded)
+
+    @test_combinations.run_with_all_model_types
+    def test_model_save_and_load_dataset(self):
+        model = self._get_model()
+        self._train_model(model, use_dataset=True)
+        loaded = self._save_and_load(model)
+        self._test_evaluation(model, loaded)
+
+    def test_trainable_weights(self):
+        """Tests that trainable status of individual weights is preserved."""
+        layer = keras.layers.Dense(4, name="custom_layer")
+        layer.build([None, 3])
+        layer.add_weight(
+            "extra_weight",
+            shape=[],
+            initializer=tf.compat.v1.constant_initializer(11),
+            trainable=True,
+        )
+        layer.add_weight(
+            "extra_weight_2",
+            shape=[],
+            initializer=tf.compat.v1.constant_initializer(12),
+            trainable=False,
+        )
+        model = keras.Sequential(
+            [
+                keras.Input(
+                    [
+                        3,
+                    ]
+                ),
+                layer,
+            ]
+        )
+
+        saved_model_dir = self._save_model_dir()
+        self.evaluate(tf.compat.v1.variables_initializer(layer.variables))
+        model.save(saved_model_dir, save_format="tf")
+        loaded_model = keras_load.load(saved_model_dir)
+        self.evaluate(
+            tf.compat.v1.variables_initializer(loaded_model.variables)
+        )
+
+        loaded = loaded_model.layers[-1]
+
+        equal_attrs = ["name", "_expects_training_arg", "trainable"]
+        for attr in equal_attrs:
+            self.assertEqual(getattr(layer, attr), getattr(loaded, attr))
+
+        all_close = ["weights", "trainable_weights", "non_trainable_weights"]
+        for attr in all_close:
+            self.assertAllClose(
+                self.evaluate(getattr(layer, attr)),
+                self.evaluate(getattr(loaded, attr)),
+            )
+
+    @test_combinations.run_with_all_model_types
+    def test_trainable_layers(self):
+        """Tests that trainable status of individual layers is preserved."""
+        model = self._get_model()
+        # Set the last layer to *not* be trainable.
+ model.layers[-1].trainable = False + self._train_model(model, use_dataset=True) + loaded = self._save_and_load(model) + + self._test_evaluation(model, loaded) + self.assertFalse(model.layers[-1].trainable) + self.assertFalse(loaded.layers[-1].trainable) + + def test_trainable_custom_model_false(self): + """Tests that overall False trainable status of Model is preserved.""" + # Set all layers to *not* be trainable. + model = test_utils.SmallSubclassMLP(1, 4, trainable=False) + model.compile(loss="mse", optimizer="rmsprop") + self._train_model(model, use_dataset=False) + loaded = self._save_and_load(model) + + self._test_evaluation(model, loaded) + self.assertEmpty(model.trainable_variables) + self.assertEmpty(loaded.trainable_variables) + + def test_maintains_losses(self): + """Tests that the layer losses do not change before and after export.""" + model = keras.models.Sequential([LayerWithLoss()]) + model.compile(loss="mse", optimizer="rmsprop") + input_arr = np.random.random((1, 3)) + target_arr = np.random.random((1, 3)) + + # Test that symbolic losses are maintained (train_on_batch saves + # symbolic losses.) + model.train_on_batch(input_arr, target_arr) + previous_losses = model.losses[:] + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + with previous_losses[0].graph.as_default(): + # If we try to compare symbolic Tensors in eager mode assertAllEqual + # will return False even if they are the same Tensor. + self.assertEqual(previous_losses, model.losses) + + if tf.executing_eagerly(): + # Test that eager losses are maintained. + model(input_arr) # Calls model eagerly, creating eager losses. + previous_losses = model.losses[:] + model.save(saved_model_dir, save_format="tf") + self.assertAllEqual(previous_losses, model.losses) + + def test_layer_with_learning_phase(self): + layer = LayerWithLearningPhase() + layer.build([None, None]) + saved_model_dir = self._save_model_dir() + model = test_utils.get_model_from_layers( + [layer], input_shape=[None], model_type="functional" + ) + model.save(saved_model_dir, save_format="tf") + loaded_model = keras_load.load(saved_model_dir) + loaded = loaded_model.layers[-1] + input_arr = tf.ones((4, 3)) + + # Run the layer, and use the keras backend learning phase + keras.backend.set_learning_phase(0) + self.assertAllEqual(input_arr, loaded(input_arr)) + keras.backend.set_learning_phase(1) + self.assertAllEqual(tf.zeros((4, 3)), loaded(input_arr)) + + # Run the layer while explicitly setting the training argument + self.assertAllEqual( + input_arr, loaded(input_arr, training=tf.constant(False)) + ) + self.assertAllEqual( + tf.zeros((4, 3)), loaded(input_arr, training=tf.constant(True)) + ) + + @test_combinations.run_with_all_model_types + def test_standard_loader(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.activity_regularizer = regularizers.get("l2") + + def eager_loss(): + return tf.reduce_sum(model.weights[0]) + + model.add_loss(eager_loss) + + # Call predict to ensure that all layers are built and inputs are set. 
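+        # For the subclassed model type this also records the save spec;
+        # without it, the TF-format save below would raise a ValueError
+        # because the model's inputs have not been defined.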
+ model.predict(np.random.random((1, 3)).astype(np.float32)) + saved_model_dir = self._save_model_dir() + + model.save(saved_model_dir, save_format="tf") + + loaded = tf.saved_model.load(saved_model_dir) + self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) + all_close = [ + "variables", + "trainable_variables", + "non_trainable_variables", + ] + for attr in all_close: + self.assertAllClose( + self.evaluate(getattr(model, attr)), + self.evaluate(getattr(loaded.keras_api, attr)), + ) + self.assertLen(loaded.regularization_losses, 1) + expected_layers = len(model.layers) + self.assertEqual(expected_layers, len(loaded.keras_api.layers)) + input_arr = tf.ones((4, 3)) + self.assertAllClose( + self.evaluate(model(input_arr)), + self.evaluate(loaded(input_arr, training=False)), + ) + + @test_combinations.run_with_all_model_types + def test_compiled_model(self): + # TODO(b/134519980): Issue with model.fit if the model call function + # uses a tf.function (Graph mode only). + if not tf.executing_eagerly(): + return + + input_arr = np.random.random((1, 3)) + target_arr = np.random.random((1, 4)) + + model = test_utils.get_small_mlp(1, 4, input_dim=3) + expected_predict = model.predict(input_arr) + + # Compile and save model. + model.compile("rmsprop", "mse") + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + loaded = keras_load.load(saved_model_dir) + actual_predict = loaded.predict(input_arr) + self.assertAllClose(expected_predict, actual_predict) + + loss_before = loaded.evaluate(input_arr, target_arr) + loaded.fit(input_arr, target_arr) + loss_after = loaded.evaluate(input_arr, target_arr) + self.assertLess(loss_after, loss_before) + predict = loaded.predict(input_arr) + + ckpt_path = os.path.join(self.get_temp_dir(), "weights") + loaded.save_weights(ckpt_path) + + # Ensure that the checkpoint is compatible with the original model. 
+ model.load_weights(ckpt_path) + self.assertAllClose(predict, model.predict(input_arr)) + + def test_metadata_input_spec(self): + class LayerWithNestedSpec(keras.layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = { + "a": keras.layers.InputSpec(max_ndim=3, axes={-1: 2}), + "b": keras.layers.InputSpec( + shape=(None, 2, 3), dtype="int32" + ), + } + + @property + def _use_input_spec_as_call_signature(self): + return True + + layer = LayerWithNestedSpec() + saved_model_dir = self._save_model_dir() + model = test_utils.get_model_from_layers([layer], model_type="subclass") + model( + { + "a": tf.constant([[2, 4]]), + "b": tf.ones([1, 2, 3], dtype=tf.int32), + } + ) + model.save(saved_model_dir, save_format="tf") + loaded_model = keras_load.load(saved_model_dir) + loaded = loaded_model.layers[-1] + self.assertEqual(3, loaded.input_spec["a"].max_ndim) + self.assertEqual({-1: 2}, loaded.input_spec["a"].axes) + self.assertAllEqual([None, 2, 3], loaded.input_spec["b"].shape) + self.assertEqual("int32", loaded.input_spec["b"].dtype) + + def test_must_restore_from_config_fails_if_layer_is_not_in_scope(self): + class LayerThatShouldFailIfNotAdded(keras.layers.Layer): + _must_restore_from_config = True + + layer = LayerThatShouldFailIfNotAdded() + saved_model_dir = self._save_model_dir() + model = test_utils.get_model_from_layers( + [layer], input_shape=[3], model_type="functional" + ) + model.save(saved_model_dir, save_format="tf") + with self.assertRaisesRegex( + ValueError, "Unknown layer: 'LayerThatShouldFailIfNotAdded'" + ): + _ = keras_load.load(saved_model_dir) + + def test_must_restore_from_config_custom_object_scope(self): + class LayerThatShouldFailIfNotAdded(keras.layers.Layer): + _must_restore_from_config = True + + layer = LayerThatShouldFailIfNotAdded() + model = test_utils.get_model_from_layers( + [layer], input_shape=[3], model_type="functional" + ) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + with object_registration.CustomObjectScope( + {"LayerThatShouldFailIfNotAdded": LayerThatShouldFailIfNotAdded} + ): + _ = keras_load.load(saved_model_dir) + + def test_must_restore_from_config_registration(self): + layer = GlobalLayerThatShouldFailIfNotAdded() + saved_model_dir = self._save_model_dir() + model = test_utils.get_model_from_layers( + [layer], input_shape=[3], model_type="functional" + ) + model.save(saved_model_dir, save_format="tf") + _ = keras_load.load(saved_model_dir) + + def test_multi_input_model(self): + input_1 = keras.layers.Input(shape=(3,)) + input_2 = keras.layers.Input(shape=(5,)) + model = keras.Model([input_1, input_2], [input_1, input_2]) + saved_model_dir = self._save_model_dir() + + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + input_arr_1 = np.random.random((1, 3)).astype("float32") + input_arr_2 = np.random.random((1, 5)).astype("float32") + + outputs = loaded([input_arr_1, input_arr_2]) + self.assertAllEqual(input_arr_1, outputs[0]) + self.assertAllEqual(input_arr_2, outputs[1]) + + def test_revived_sequential(self): + model = keras.models.Sequential() + model.add( + keras.layers.Dense( + 5, input_shape=(3,), kernel_regularizer=regularizers.get("l2") + ) + ) + model.add( + keras.layers.Dense(2, kernel_regularizer=regularizers.get("l2")) + ) + + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) 
+ + self.assertLen(loaded.layers, 2) + self.assertLen(loaded.losses, 2) + + loaded.pop() + + self.assertLen(loaded.layers, 1) + self.assertLen(loaded.losses, 1) + + loaded.add( + keras.layers.Dense(2, kernel_regularizer=regularizers.get("l2")) + ) + + self.assertLen(loaded.layers, 2) + self.assertLen(loaded.losses, 2) + + def testBatchNormUpdates(self): + model = keras.models.Sequential( + keras.layers.BatchNormalization(input_shape=(1,)) + ) + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + + with self.captureWritesToStream(sys.stderr) as captured_logs: + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + + # Assert that saving does not log deprecation warnings + # (even if it needs to set learning phase for compat reasons) + if tf.executing_eagerly(): + self.assertNotIn("deprecated", captured_logs.contents()) + + input_arr = tf.constant([[11], [12], [13]], dtype=tf.float32) + input_arr2 = tf.constant([[14], [15], [16]], dtype=tf.float32) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0]) + + self.evaluate(loaded(input_arr, training=True)) + if not tf.executing_eagerly(): + self.evaluate(loaded.get_updates_for(input_arr)) + self.assertAllClose( + self.evaluate(loaded.layers[-1].moving_mean), [0.12] + ) + + self.evaluate(loaded(input_arr2, training=False)) + if not tf.executing_eagerly(): + self.evaluate(loaded.get_updates_for(input_arr2)) + self.assertAllClose( + self.evaluate(loaded.layers[-1].moving_mean), [0.12] + ) + + def testDisablingBatchNormTrainableBeforeSaving(self): + # We disable trainable on the batchnorm layers before saving + model = keras.models.Sequential( + keras.layers.BatchNormalization(input_shape=(1,)) + ) + model.trainable = False + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) + input_arr = tf.constant([[11], [12], [13]], dtype=tf.float32) + input_arr2 = tf.constant([[14], [15], [16]], dtype=tf.float32) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0]) + + # Trainable should still be disabled after loading + self.evaluate(loaded(input_arr, training=True)) + if not tf.executing_eagerly(): + self.evaluate(loaded.get_updates_for(input_arr)) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.0]) + + # Re-enabling trainable on the loaded model should cause the batchnorm + # layer to start training again. + # Note: this only works in v2. 
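+        # (Likely because in v1 graph mode the loaded update ops were traced
+        # with the frozen `trainable` value baked in, so flipping the
+        # attribute afterwards has no effect.)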
+        if tf.executing_eagerly():
+            loaded.trainable = True
+            self.evaluate(loaded(input_arr, training=True))
+            self.assertAllClose(
+                self.evaluate(loaded.layers[-1].moving_mean), [0.12]
+            )
+
+            self.evaluate(loaded(input_arr2, training=False))
+            self.assertAllClose(
+                self.evaluate(loaded.layers[-1].moving_mean), [0.12]
+            )
+
+    def testSaveWithSignatures(self):
+        model = keras.models.Sequential()
+        model.add(
+            keras.layers.Dense(
+                5, input_shape=(3,), kernel_regularizer=regularizers.get("l2")
+            )
+        )
+        model.add(keras.layers.Dropout(0.5))
+        model.add(
+            keras.layers.Dense(4, kernel_regularizer=regularizers.get("l2"))
+        )
+
+        input_arr = np.random.random((2, 3))
+        target_arr = np.random.random((2, 4))
+
+        model.compile(loss="mse", optimizer="rmsprop")
+        model.train_on_batch(input_arr, target_arr)
+
+        @tf.function(input_signature=[tf.TensorSpec((None, 3))])
+        def predict(inputs):
+            return {"predictions": model(inputs)}
+
+        feature_configs = {
+            "inputs": tf.io.FixedLenFeature(shape=[2, 3], dtype=tf.float32)
+        }
+
+        @tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
+        def parse_and_predict(examples):
+            features = tf.compat.v1.parse_single_example(
+                examples[0], feature_configs
+            )
+            return {
+                "predictions": model(features["inputs"]),
+                "layer_1_outputs": model.layers[0](features["inputs"]),
+            }
+
+        saved_model_dir = self._save_model_dir()
+        model.save(
+            saved_model_dir,
+            save_format="tf",
+            signatures={
+                "predict": predict,
+                "parse_and_predict": parse_and_predict,
+            },
+        )
+
+        loaded = keras_load.load(saved_model_dir)
+
+        self.assertAllClose(
+            model.predict(input_arr),
+            loaded.signatures["predict"](
+                tf.convert_to_tensor(input_arr.astype("float32"))
+            )["predictions"],
+        )
+
+        feature = {
+            "inputs": feature_pb2.Feature(
+                float_list=feature_pb2.FloatList(
+                    value=input_arr.astype("float32").flatten()
+                )
+            )
+        }
+        example = example_pb2.Example(
+            features=feature_pb2.Features(feature=feature)
+        )
+        outputs = loaded.signatures["parse_and_predict"](
+            tf.convert_to_tensor([example.SerializeToString()])
+        )
+        self.assertAllClose(model.predict(input_arr), outputs["predictions"])
+        self.assertAllClose(
+            model.layers[0](input_arr), outputs["layer_1_outputs"]
+        )
+
+    def testTrainingDefaults(self):
+        def assert_training_default(fn, default_value):
+            arg_spec = tf_inspect.getfullargspec(fn)
+            fn_defaults = arg_spec.defaults or []
+            defaults = dict()
+            # The call arg defaults are an n-tuple of the last n elements of
+            # the args list. (n = # of elements that have a default argument)
+            for i in range(-1 * len(fn_defaults), 0):
+                defaults[arg_spec.args[i]] = fn_defaults[i]
+            # The default training arg will be any (non-None) default
+            # specified in the method signature, or None if no value is
+            # specified.
+ defaults.update(arg_spec.kwonlydefaults or {}) + self.assertEqual(defaults["training"], default_value) + + class LayerWithTrainingRequiredArg(keras.engine.base_layer.Layer): + def call(self, inputs, training): + return control_flow_util.smart_cond( + training, lambda: inputs * 0, lambda: tf.identity(inputs) + ) + + class LayerWithTrainingDefaultTrue(keras.engine.base_layer.Layer): + def call(self, inputs, training=True): + return control_flow_util.smart_cond( + training, lambda: inputs * 0, lambda: tf.identity(inputs) + ) + + class Model(keras.models.Model): + def __init__(self): + super().__init__() + self.layer_with_training_default_none = LayerWithLearningPhase() + self.layer_with_training_default_true = ( + LayerWithTrainingDefaultTrue() + ) + self.layer_with_required_training_arg = ( + LayerWithTrainingRequiredArg() + ) + + def call(self, inputs): + x = self.layer_with_training_default_none(inputs) + x += self.layer_with_training_default_true(inputs) + x += self.layer_with_required_training_arg(inputs, False) + return x + + model = Model() + # Build and set model inputs + model.predict(np.ones([1, 3]).astype("float32")) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + load = tf.saved_model.load(saved_model_dir) + + # Ensure that the Keras loader is able to load and build the model. + _ = keras_load.load(saved_model_dir) + + assert_training_default(load.__call__, False) + assert_training_default( + load.layer_with_training_default_none.__call__, False + ) + assert_training_default( + load.layer_with_training_default_true.__call__, True + ) + + # Assert that there are no defaults for layer with required training arg + arg_spec = tf_inspect.getfullargspec( + load.layer_with_required_training_arg.__call__ + ) + self.assertFalse(arg_spec.defaults) # defaults is None or empty + + def testTraceModelWithKwarg(self): + class Model(keras.models.Model): + def call(self, inputs, keyword=None): + return tf.identity(inputs) + + model = Model() + prediction = model.predict(np.ones([1, 3]).astype("float32")) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + with object_registration.custom_object_scope({"Model": Model}): + loaded = keras_load.load(saved_model_dir) + self.assertAllClose( + prediction, loaded.predict(np.ones([1, 3]).astype("float32")) + ) + + loaded_without_scope = keras_load.load(saved_model_dir) + if tf.__internal__.tf2.enabled(): + with self.assertRaises(NotImplementedError): + loaded_without_scope.predict(np.ones([1, 3]).astype("float32")) + + def testFeatureColumns(self): + # TODO(b/120099662): Error with table initialization with Keras models + # in graph mode. 
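The default-pairing logic in the `assert_training_default` helper of `testTrainingDefaults` above is easy to get wrong, so here is a small standalone sketch of how `getfullargspec` defaults line up with argument names (standard-library `inspect` stands in for `tf_inspect`; the `call` signature is illustrative):

```python
import inspect

def call(self, inputs, training=False, mask=None):
    return inputs

spec = inspect.getfullargspec(call)
# `defaults` is an n-tuple aligned with the *last* n positional args.
n = len(spec.defaults or ())
defaults = dict(zip(spec.args[-n:], spec.defaults or ())) if n else {}
# Keyword-only defaults are reported separately.
defaults.update(spec.kwonlydefaults or {})
print(defaults)  # {'training': False, 'mask': None}
```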
+ if tf.executing_eagerly(): + numeric = tf.feature_column.numeric_column("a") + bucketized = tf.feature_column.bucketized_column( + numeric, boundaries=[5, 10, 15] + ) + cat_vocab = ( + tf.feature_column.categorical_column_with_vocabulary_list( + "b", ["1", "2", "3"] + ) + ) + one_hot = tf.feature_column.indicator_column(cat_vocab) + embedding = tf.feature_column.embedding_column( + cat_vocab, dimension=8 + ) + feature_layer = DenseFeatures([bucketized, one_hot, embedding]) + model = keras.models.Sequential(feature_layer) + + features = {"a": np.array([13, 15]), "b": np.array(["1", "2"])} + predictions = model.predict(features) + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + loaded_predictions = loaded.predict(features) + self.assertAllClose(predictions, loaded_predictions) + + def testSaveTensorKwarg(self): + class LayerWithTensorKwarg(keras.layers.Layer): + def call(self, inputs, tensor=None): + if tensor is not None: + return inputs * tf.cast(tensor, tf.float32) + else: + return inputs + + t = self.evaluate(tf.sequence_mask(1)) + inputs = keras.layers.Input(shape=(3)) + model = keras.models.Model(inputs, LayerWithTensorKwarg()(inputs, t)) + + input_arr = np.random.random((1, 3)) + predictions = model.predict(input_arr) + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + loaded_predictions = loaded.predict(input_arr) + self.assertAllClose(predictions, loaded_predictions) + + def testModelWithTfFunctionCall(self): + class Subclass(keras.models.Model): + @tf.function + def call(self, inputs, training=False): + return inputs * tf.cast(training, tf.float32) + + model = Subclass() + model.predict(tf.ones((1, 2)), steps=1) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + self.assertAllEqual( + [[1, 5]], + self.evaluate(loaded(tf.constant([[1, 5.0]]), training=True)), + ) + self.assertAllEqual( + [[0, 0]], + self.evaluate(loaded(tf.constant([[1, 5.0]]), training=False)), + ) + + def testReviveFunctionalModel(self): + class CustomAdd(keras.layers.Add): + def build(self, input_shape): + self.w = self.add_weight("w", shape=[]) + super().build(input_shape) + + def call(self, inputs): + outputs = super().call(inputs) + return outputs * self.w + + input1 = keras.layers.Input(shape=(None, 3), name="input_1") + input2 = keras.layers.Input(shape=(None, 3), name="input_2") + + d = keras.layers.Dense(4, name="dense_with_two_inbound_nodes") + output1 = d(input1) + output2 = d(input2) + + # Use a custom layer in this model to ensure that layers aren't being + # recreated directly from the config. 
+ outputs = CustomAdd(name="custom")([output1, output2]) + model = keras.models.Model([input1, input2], outputs, name="save_model") + + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + loaded = keras_load.load(saved_model_dir) + self.assertEqual("save_model", loaded.name) + self.assertLen( + loaded.get_layer("dense_with_two_inbound_nodes")._inbound_nodes, 2 + ) + self.assertEqual("CustomAdd", type(loaded.get_layer("custom")).__name__) + self.assertLen(loaded.get_layer("custom").weights, 1) + + def _testAddUpdate(self, scope): + with scope: + layer_with_update = LayerWithUpdate() + model = test_utils.get_model_from_layers( + [layer_with_update], input_shape=(3,) + ) + + x = np.ones((10, 3)) + if test_utils.get_model_type() == "subclass": + model.predict(x, batch_size=10) + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + loaded = keras_load.load(saved_model_dir) + loaded_layer = loaded.layers[-1] + self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) + self.assertEqual(self.evaluate(loaded_layer.v), 0.0) + + loaded.compile("sgd", "mse") + loaded.fit(x, x, batch_size=10) + self.assertEqual(self.evaluate(loaded_layer.v), 1.0) + + @test_combinations.run_with_all_model_types + def testSaveLayerWithUpdates(self): + @tf_contextlib.contextmanager + def nullcontextmanager(): + yield + + self._testAddUpdate(nullcontextmanager()) + + @test_combinations.run_with_all_model_types + def testSaveInStrategyScope(self): + self._testAddUpdate(tf.distribute.MirroredStrategy().scope()) + + def testSaveTimeDistributedLayer(self): + model = keras.Sequential( + [ + keras.layers.TimeDistributed( + keras.layers.Dense( + 1, kernel_regularizer=regularizers.get("l2") + ), + input_shape=(None, 1), + ) + ] + ) + predictions = model.predict_on_batch(tf.ones((3, 2, 1))) + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + loaded = keras_load.load(saved_model_dir) + self.assertAllClose( + loaded.predict_on_batch(tf.ones((3, 2, 1))), predictions + ) + + @parameterized.named_parameters( + [("with_unrolling", True), ("no_unrolling", False)] + ) + def testSaveStatefulRNN(self, unroll): + batch = 12 + timesteps = 10 + input_dim = 8 + input_arr = np.ones((batch, timesteps, input_dim)).astype("float32") + + cells = [keras.layers.LSTMCell(32), keras.layers.LSTMCell(64)] + if unroll: + x = keras.Input(batch_shape=(batch, timesteps, input_dim)) + else: + x = keras.Input(batch_shape=(batch, None, input_dim)) + layer = keras.layers.RNN(cells, stateful=True, unroll=unroll) + y = layer(x) + + model = keras.Model(x, y) + model.compile( + "rmsprop", "mse", run_eagerly=test_utils.should_run_eagerly() + ) + model.train_on_batch( + np.zeros((batch, timesteps, input_dim)).astype("float32"), + np.zeros((batch, 64)).astype("float32"), + ) + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + loaded = keras_load.load(saved_model_dir) + loaded_layer = loaded.layers[1] + + if not tf.executing_eagerly(): + keras.backend.get_session() # force variable initialization + + self.assertAllClose(layer.states, loaded_layer.states) + self.assertAllClose(model(input_arr), loaded(input_arr)) + + def testSaveBidirectionalLSTM(self): + # Make sure that the input spec of an unrolled RNN is not used when + # wrapped in a Bidirectional layer. 
+ # https://github.com/keras-team/keras/issues/15454 + input_layer = keras.Input( + batch_input_shape=(1, 15, 128), name="input", dtype=tf.float32 + ) + lstm_layer = keras.layers.Bidirectional( + keras.layers.LSTM( + units=64, + name="lstm", + dropout=0.2, + trainable=False, + unroll=True, + ) + ) + output_layer = lstm_layer(input_layer) + model = keras.Model(input_layer, output_layer) + saved_model_dir = self._save_model_dir() + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + input_arr = np.random.random((1, 15, 128)).astype("float32") + self.assertAllClose(model(input_arr), loaded(input_arr)) + + @parameterized.named_parameters([("stateful", True), ("stateless", False)]) + def testSaveConvLSTM2D(self, stateful): + data_format = "channels_first" + batch, timesteps, channels, rows, cols = 12, 10, 8, 4, 4 + input_arr = np.ones((batch, timesteps, channels, rows, cols)).astype( + "float32" + ) + layer = keras.layers.ConvLSTM2D( + filters=16, + kernel_size=(1, 1), + data_format=data_format, + stateful=stateful, + ) + x = keras.Input(batch_shape=(batch, timesteps, channels, rows, cols)) + y = layer(x) + model = keras.Model(x, y) + + predict_1 = model(input_arr) + self.evaluate([v.initializer for v in model.variables]) + saved_model_dir = self._save_model_dir() + + model.save(saved_model_dir, save_format="tf") + del model + + loaded = keras_load.load(saved_model_dir) + self.evaluate([v.initializer for v in loaded.variables]) + if stateful: + loaded.reset_states() + predict_2 = loaded(input_arr) + self.assertAllClose(predict_1, predict_2) + + def testSaveWithRaggedInputs(self): + class EmbeddingMerger(keras.layers.Layer): + def __init__(self, list_features, **kwargs): + super().__init__(**kwargs) + self._supports_ragged_inputs = True + self.embeddings = { + feature: keras.layers.Embedding(10, 3) + for feature in list_features + } + self.mean = keras.layers.Lambda( + tf.reduce_mean, arguments=dict(axis=1) + ) + + def call(self, inputs): + tensors = [self.embeddings[col](inputs[col]) for col in inputs] + tensors = [self.mean(inp) for inp in tensors] + return keras.layers.Add()(tensors) + + list_features = ["feature_1", "feature_2"] + feature_1 = tf.ragged.constant([[0.0], [1, 3]]) + feature_2 = tf.ragged.constant([[1.0, 2], [4]]) + f = {"feature_1": feature_1, "feature_2": feature_2} + f_inputs = { + "feature_1": keras.Input( + shape=(None,), name="feature_1", ragged=True + ), + "feature_2": keras.Input( + shape=(None,), name="feature_2", ragged=True + ), + } + + out = EmbeddingMerger(list_features)(f_inputs) + model = keras.Model(f_inputs, out) + self.evaluate(tf.compat.v1.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + loaded = keras_load.load(saved_model_dir) + self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) + self.assertAllClose(model.predict(f), loaded.predict(f)) + + def testSaveMultipleInputs(self): + class CustomLayer(keras.layers.Layer): + def call(self, *input_list): + self.add_loss(input_list[-2] * 2) + return sum( + input_list[:-1] + ) # The test's last input is a non-tensor arg + + class CustomModel(keras.Model): + def build(self, _): + self.layer = CustomLayer() + + def call(self, *inputs): + inputs = list(inputs) + inputs.append( + object() + ) # Test that the layer handles non-tensor inputs + return self.layer(*inputs) + + model = CustomModel() + inp = [ + 
tf.constant(i, shape=[1, 1], dtype=tf.float32) for i in range(1, 5) + ] + expected = model(*inp) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + actual = loaded(*inp) + self.assertAllEqual(self.evaluate(expected), self.evaluate(actual)) + + def testSaveMultipleInputsWithTraining(self): + class CustomModel(keras.Model): + def call(self, input_1, training, input_2): + if training: + return input_1 + else: + return input_2 + + inp1 = tf.constant(1.0, shape=[1]) + inp2 = tf.constant(2.0, shape=[1]) + + model = CustomModel() + self.assertEqual(self.evaluate(model(inp1, True, inp2)), 1.0) + self.assertEqual(self.evaluate(model(inp1, False, inp2)), 2.0) + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + self.assertEqual(self.evaluate(loaded(inp1, True, inp2)), 1.0) + self.assertEqual(self.evaluate(loaded(inp1, False, inp2)), 2.0) + + def test_wrapped_layer_training(self): + class Custom(keras.models.Model): + def __init__(self): + super().__init__() + self.layer = LayerWithLearningPhase() + + def call(self, inputs): + return self.layer(inputs) + + model = Custom() + x = tf.constant(1.0, shape=[1, 1]) + expected_default = model(x) + expected_training_true = model(x, training=True) + expected_training_false = model(x, training=False) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + def assert_loaded_model(loaded): + actual_default = loaded(x) + actual_training_true = loaded(x, training=True) + actual_training_false = loaded(x, training=False) + self.assertAllClose( + [ + expected_default, + expected_training_true, + expected_training_false, + ], + [actual_default, actual_training_true, actual_training_false], + ) + + assert_loaded_model(keras_load.load(saved_model_dir)) + assert_loaded_model(tf.saved_model.load(saved_model_dir)) + + @parameterized.named_parameters([("true", True), ("false", False)]) + def test_save_layer_autocast(self, autocast): + class CustomLayer(keras.layers.Layer): + def __init__(self): + super().__init__(autocast=autocast) + + class CustomModel(keras.Model): + def __init__(self): + super().__init__(autocast=autocast) + + def call(self, inputs): + return inputs + + x = tf.constant([3], dtype=tf.float64) + + x_in = keras.Input((1,)) + output = CustomLayer()(x_in) + output = CustomModel()(output) + model = keras.Model(inputs=x_in, outputs=output) + + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + loaded = keras_load.load(saved_model_dir) + self.assertEqual(autocast, loaded.layers[-1]._autocast) + self.assertEqual(autocast, loaded.layers[-2]._autocast) + self.assertEqual(self.evaluate(model(x)), self.evaluate(loaded(x))) + + +class TestSavedModelFormat(tf.test.TestCase): + def _save_model_dir(self, dirname="saved_model"): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + return os.path.join(temp_dir, dirname) + + def test_load_with_custom_model_and_layer(self): + class CustomLayer(keras.layers.Layer): + def __call__(self, inputs): + return inputs + + class Model(keras.models.Model): + def __init__(self): + super().__init__() + self.layer = CustomLayer() # noqa: F821 + + @tf.function(input_signature=[tf.TensorSpec([None, 1])]) + def call(self, inputs): + return self.layer(inputs) + + model = Model() + inp = tf.constant([[1.0]]) + model(inp) + saved_model_dir = 
self._save_model_dir()
+        model.save(saved_model_dir, save_format="tf")
+
+        # Even if the `CustomLayer` is not provided in `custom_object_scope`,
+        # `Model` still has that reference.
+        with object_registration.custom_object_scope({"Model": Model}):
+            loaded = keras_load.load(saved_model_dir)
+            self.assertAllEqual([[1.0]], self.evaluate(loaded(inp)))
+            self.assertAllEqual([[1.0]], self.evaluate(loaded.layer(inp)))
+            self.assertIsInstance(loaded.layer, CustomLayer)
+
+        # If `CustomLayer` is provided in `custom_object_scope`, it should of
+        # course use that custom class.
+        with object_registration.custom_object_scope(
+            {"Model": Model, "CustomLayer": CustomLayer}
+        ):
+            loaded = keras_load.load(saved_model_dir)
+            self.assertAllEqual([[1.0]], self.evaluate(loaded(inp)))
+            self.assertAllEqual([[1.0]], self.evaluate(loaded.layer(inp)))
+            self.assertIsInstance(loaded.layer, CustomLayer)
+
+    def test_save_without_tracing(self):
+        class DoNotTrace(keras.layers.Layer):
+            def __init__(self):
+                super().__init__()
+                self.input_spec = keras.layers.InputSpec(shape=[None])
+                self.built = True
+
+            def call(self, inputs):
+                raise ValueError("I said do not trace")
+
+            def get_config(self):
+                return {}
+
+            @property
+            def _use_input_spec_as_call_signature(self):
+                return True
+
+        root = keras.models.Sequential()
+        root.add(keras.layers.Input(shape=(3,)))
+        root.attached_layer = DoNotTrace()
+
+        saved_model_dir = self._save_model_dir()
+
+        # With the default settings, the call function is traced.
+        with self.assertRaisesRegex(ValueError, "do not trace"):
+            root.save(saved_model_dir, save_format="tf")
+
+        # When saving the config only, the layer call function should not be
+        # traced.
+        root.save(saved_model_dir, save_format="tf", save_traces=False)
+        loaded = tf.saved_model.load(saved_model_dir)
+        self.assertTrue(hasattr(loaded, "attached_layer"))
+
+        # This should raise an error when loaded without the custom object
+        loaded = keras_load.load(saved_model_dir)
+        with self.assertRaisesRegex(ValueError, "Cannot call custom layer"):
+            loaded.attached_layer(tf.constant([1.0]))
+
+        # Try loading with the custom objects
+        with object_registration.CustomObjectScope({"DoNotTrace": DoNotTrace}):
+            loaded = keras_load.load(saved_model_dir)
+            with self.assertRaisesRegex(ValueError, "I said do not trace"):
+                loaded.attached_layer(tf.constant([1.0]))
+
+    def test_load_non_keras_saved_model(self):
+        model = test_utils.get_small_functional_mlp(1, 4, input_dim=3)
+        saved_model_dir = self._save_model_dir()
+        tf.saved_model.save(model, saved_model_dir)
+        with self.assertRaisesRegex(
+            ValueError, "Unable to create a Keras model"
+        ):
+            keras_load.load(saved_model_dir)
+
+    def test_random_generator_custom_layer(self):
+        class CustomDropout(keras.layers.Layer):
+            def __init__(self, dropout_rate=0.1, **kwargs):
+                super().__init__(**kwargs)
+                self.dropout_rate = dropout_rate
+                self.dropout = keras.layers.Dropout(
+                    dropout_rate, rng_type="stateful"
+                )
+
+            def call(self, inputs, training=False):
+                return self.dropout(inputs, training=training)
+
+        root = keras.models.Sequential(
+            [keras.layers.Input(shape=(3,)), CustomDropout()]
+        )
+        saved_model_dir = self._save_model_dir()
+        root.save(saved_model_dir, save_format="tf")
+
+        loaded = keras_load.load(saved_model_dir)
+
+        output = loaded(tf.random.uniform([1, 3]), training=True)
+        self.assertAllEqual([1, 3], output.shape)
+
+    def test_random_generator_with_tracing(self):
+        # This test is to ensure we trace the training = True function first,
+        # otherwise tf.function will
raise error about creating variables in the + # non-first call. + class LayerWithDropout(keras.layers.Layer): + def __init__(self, dropout_rate): + super().__init__() + self.dropout_rate = dropout_rate + self.dropout_layer = keras.layers.Dropout(self.dropout_rate) + + def call(self, inputs, training=None): + if not training: + return inputs + else: + return self.dropout_layer(inputs, training=training) + + root = keras.models.Sequential( + [keras.layers.Input(shape=(3,)), LayerWithDropout(0.1)] + ) + saved_model_dir = self._save_model_dir() + root.save(saved_model_dir, save_format="tf") + + loaded = keras_load.load(saved_model_dir) + + output = loaded(tf.random.uniform([1, 3]), training=True) + self.assertAllEqual([1, 3], output.shape) + + +class TestLayerCallTracing(tf.test.TestCase, parameterized.TestCase): + def test_functions_have_same_trace(self): + class Layer(keras.engine.base_layer.Layer): + def call(self, inputs): + return inputs + + def call2(self, inputs): + return inputs * 2 + + layer = Layer() + + call_collection = keras_save.LayerCallCollection(layer) + fn = call_collection.add_function(layer.call, "call", True) + fn2 = call_collection.add_function(layer.call2, "call2", True) + + with keras_save.tracing_scope(): + fn(np.ones((2, 3))) + fn(np.ones((4, 5))) + + self.assertLen( + fn.wrapped_call._list_all_concrete_functions_for_serialization(), 2 + ) + self.assertLen( + fn2.wrapped_call._list_all_concrete_functions_for_serialization(), 2 + ) + + # Check that the shapes are correct + self.assertEqual( + {(2, 3), (4, 5)}, + set( + tuple(c.structured_input_signature[0][0].shape.as_list()) + for c in fn2.wrapped_call._list_all_concrete_functions_for_serialization() # noqa: E501 + ), + ) + + def test_training_arg_replacement(self): + def assert_num_traces(layer_cls, training_keyword): + layer = layer_cls() + call_collection = keras_save.LayerCallCollection(layer) + fn = call_collection.add_function(layer.call, "call", True) + + with keras_save.tracing_scope(): + fn(np.ones((2, 3)), training=True) + self.assertLen( + fn.wrapped_call._list_all_concrete_functions_for_serialization(), # noqa: E501 + 2, + ) + with keras_save.tracing_scope(): + fn(np.ones((2, 4)), training=False) + self.assertLen( + fn.wrapped_call._list_all_concrete_functions_for_serialization(), # noqa: E501 + 4, + ) + + if training_keyword: + with keras_save.tracing_scope(): + fn(np.ones((2, 5)), True) + self.assertLen( + fn.wrapped_call._list_all_concrete_functions_for_serialization(), # noqa: E501 + 6, + ) + with keras_save.tracing_scope(): + fn(np.ones((2, 6))) + self.assertLen( + fn.wrapped_call._list_all_concrete_functions_for_serialization(), # noqa: E501 + 8, + ) + + class LayerWithTrainingKeyword(keras.engine.base_layer.Layer): + def call(self, inputs, training=False): + return inputs * training + + assert_num_traces(LayerWithTrainingKeyword, training_keyword=True) + + class LayerWithKwargs(keras.engine.base_layer.Layer): + def call(self, inputs, **kwargs): + return inputs * kwargs["training"] + + assert_num_traces(LayerWithKwargs, training_keyword=False) + + class LayerWithChildLayer(keras.engine.base_layer.Layer): + def __init__(self): + self.child = LayerWithKwargs() + super().__init__() + + def call(self, inputs): + return self.child(inputs) + + assert_num_traces(LayerWithChildLayer, training_keyword=False) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_maintains_losses(self): + layer = LayerWithLoss() + layer(np.ones((2, 3))) + previous_losses = 
layer.losses[:] + + call_collection = keras_save.LayerCallCollection(layer) + fn = call_collection.add_function(layer.call, "call", True) + fn(np.ones((2, 3))) + + self.assertAllEqual( + self.evaluate(previous_losses), self.evaluate(layer.losses) + ) + + +@object_registration.register_keras_serializable("Testing") +class CustomMeanMetric(keras.metrics.Mean): + def update_state(self, *args): + # Sometimes built-in metrics return an op in update_state. Custom + # metrics don't support returning ops, so wrap the update_state method + # while returning nothing. + super().update_state(*args) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) +class MetricTest(tf.test.TestCase, parameterized.TestCase): + def _save_model_dir(self, dirname="saved_model"): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + return os.path.join(temp_dir, dirname) + + def generate_inputs(self, num_tensor_args, shape=(1, 5)): + return [ + np.random.uniform(0, 1, shape).astype("float32") + for _ in range(num_tensor_args) + ] + + def _test_metric_save_and_load( + self, + metric, + save_dir, + num_tensor_args, + shape=(1, 5), + test_sample_weight=True, + ): + with self.cached_session(): + model = test_utils.get_model_from_layers( + [keras.layers.Layer()], input_shape=[3], model_type="functional" + ) + model.saved_metric = metric + model.save(save_dir, save_format="tf") + loaded_model = keras_load.load(save_dir) + loaded = loaded_model.saved_metric + self.evaluate([v.initializer for v in loaded.variables]) + self.assertEqual(metric.name, loaded.name) + self.assertEqual(metric.dtype, loaded.dtype) + + inputs = self.generate_inputs(num_tensor_args, shape) + actual = self.evaluate(metric(*inputs)) + self.assertAllClose(actual, loaded(*inputs)) + self.assertAllClose(metric.variables, loaded.variables) + + # Test with separate calls to update state and result. + inputs = self.generate_inputs(num_tensor_args, shape) + self.evaluate(metric.update_state(*inputs)) + self.evaluate(loaded.update_state(*inputs)) + actual = self.evaluate(metric.result()) + self.assertAllClose(actual, loaded.result()) + + if test_sample_weight: + # Test with sample weights input. 
+ inputs = self.generate_inputs(num_tensor_args, shape) + sample_weight = self.generate_inputs(1, [])[0] + inputs.append(sample_weight) + + actual = self.evaluate(metric(*inputs)) + self.assertAllClose(actual, loaded(*inputs)) + return loaded + + @parameterized.named_parameters( + [ + ("mean", keras.metrics.Mean, 1, (1, 5)), + ("false_positives", keras.metrics.FalsePositives, 2, (1, 5)), + ( + "precision_at_top_k", + keras.metrics.Precision, + 2, + (2, 3, 4), + {"top_k": 2, "class_id": 1}, + ), + ( + "precision_at_recall", + keras.metrics.PrecisionAtRecall, + 2, + (1, 5), + {"recall": 0.8}, + ), + ("auc", keras.metrics.AUC, 2, (1, 5), {"multi_label": True}), + ("cosine_similarity", keras.metrics.CosineSimilarity, 2, (2, 3, 1)), + ] + ) + def test_metric(self, metric_cls, num_tensor_args, shape, init_kwargs=None): + init_kwargs = init_kwargs or {} + metric = metric_cls(**init_kwargs) + metric(*self.generate_inputs(num_tensor_args, shape)) + self.evaluate([v.initializer for v in metric.variables]) + loaded = self._test_metric_save_and_load( + metric, self._save_model_dir(), num_tensor_args, shape + ) + self.assertEqual(type(loaded), type(metric)) + + @parameterized.named_parameters( + [ + ("mean", keras.metrics.Mean, 1, False), + ("auc", keras.metrics.AUC, 2, False), + ("mean_tensor", keras.metrics.MeanTensor, 1, True), + ] + ) + def test_custom_metric(self, base_cls, num_tensor_args, requires_build): + class CustomMetric(base_cls): + def update_state(self, *args): + # Sometimes built-in metrics return an op in update_state. + # Custom metrics don't support returning ops, so wrap the + # update_state method while returning nothing. + super().update_state(*args) + + with self.cached_session(): + metric = CustomMetric() + save_dir = self._save_model_dir("first_save") + + if requires_build: + metric(*self.generate_inputs(num_tensor_args)) + + self.evaluate([v.initializer for v in metric.variables]) + + with self.assertRaisesRegex( + ValueError, "Unable to restore custom object" + ): + self._test_metric_save_and_load( + metric, save_dir, num_tensor_args + ) + with object_registration.CustomObjectScope( + {"CustomMetric": CustomMetric} + ): + loaded = self._test_metric_save_and_load( + metric, save_dir, num_tensor_args, test_sample_weight=False + ) + + self._test_metric_save_and_load( + loaded, + self._save_model_dir("second_save"), + num_tensor_args, + test_sample_weight=False, + ) + + def test_registered_custom_metric(self): + + with self.cached_session(): + metric = CustomMeanMetric() + save_dir = self._save_model_dir("first_save") + self.evaluate([v.initializer for v in metric.variables]) + loaded = self._test_metric_save_and_load( + metric, save_dir, num_tensor_args=1, test_sample_weight=False + ) + + self._test_metric_save_and_load( + loaded, + self._save_model_dir("second_save"), + num_tensor_args=1, + test_sample_weight=False, + ) + + def test_custom_metric_wrapped_call(self): + class NegativeMean(keras.metrics.Mean): + @tf.function(input_signature=[tf.TensorSpec(None, tf.float32)]) + def update_state(self, value): + super().update_state(-value) + + metric = NegativeMean() + self.evaluate([v.initializer for v in metric.variables]) + with object_registration.CustomObjectScope( + {"NegativeMean": NegativeMean} + ): + self._test_metric_save_and_load( + metric, self._save_model_dir(), 1, test_sample_weight=False + ) + + @test_combinations.run_with_all_model_types + def test_custom_metric_model(self): + # TODO(b/134519980): Issue with `model.fit` if the model call function + # uses a 
`tf.function` in graph mode. + if not tf.executing_eagerly(): + return + + x = np.random.random((1, 3)) + y = np.random.random((1, 4)) + + class CustomMetric(keras.metrics.MeanSquaredError): + pass + + def zero_metric(y_true, y_pred): + del y_true, y_pred + return 0 + + model = test_utils.get_small_mlp(1, 4, input_dim=3) + model.compile( + loss="mse", optimizer="SGD", metrics=[CustomMetric(), zero_metric] + ) + model.fit(x, y) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format="tf") + + with self.assertRaisesRegex(ValueError, "custom_objects"): + keras_load.load(saved_model_dir) + + with object_registration.CustomObjectScope( + {"CustomMetric": CustomMetric, "zero_metric": zero_metric} + ): + loaded = keras_load.load(saved_model_dir) + + self.evaluate([v.initializer for v in loaded.variables]) + loaded.fit(x, y) + + +class TestUpdateMetadata(tf.test.TestCase): + def testAddFullSaveSpec(self): + save_spec = tf.TensorSpec([3, 5], dtype=tf.int32) + node_metadata = json_utils.Encoder().encode({"save_spec": save_spec}) + + metadata = saved_metadata_pb2.SavedMetadata() + metadata.nodes.add( + version=versions_pb2.VersionDef( + producer=1, min_consumer=1, bad_consumers=[] + ), + identifier="_tf_keras_model", + metadata=node_metadata, + ) + + new_metadata = keras_load._update_to_current_version(metadata) + node_metadata = json_utils.decode(new_metadata.nodes[0].metadata) + expected_full_spec = ([tf.TensorSpec(shape=(3, 5), dtype=tf.int32)], {}) + self.assertAllEqual( + expected_full_spec, node_metadata.get("full_save_spec") + ) + + +if __name__ == "__main__": + with saved_model_utils.keras_option_scope( + save_traces=False, in_tf_saved_model_scope=True + ): + tf.test.main() diff --git a/keras/saving/legacy/saved_model/serialized_attributes.py b/keras/saving/legacy/saved_model/serialized_attributes.py new file mode 100644 index 000000000000..6780ad669b94 --- /dev/null +++ b/keras/saving/legacy/saved_model/serialized_attributes.py @@ -0,0 +1,376 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Helper classes that list&validate all attributes to serialize to SavedModel. +""" + +import tensorflow.compat.v2 as tf + +from keras.saving.legacy.saved_model import constants +from keras.saving.legacy.saved_model import order_preserving_set as ops +from keras.saving.legacy.saved_model import save_impl +from keras.utils.generic_utils import LazyLoader + +# TODO(b/134426265): Switch back to single-quotes to match the rest of the file +# once the issue with copybara is fixed. 
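The `LazyLoader` imports just below exist to defer the import of `keras.engine` modules and so break the import cycle with the saving code. A minimal stand-in showing the idea (the `LazyModule` class here is hypothetical, not the real `keras.utils.generic_utils.LazyLoader`):

```python
import importlib

class LazyModule:
    """Defers the import until the first attribute access."""

    def __init__(self, name):
        self._name = name
        self._module = None

    def __getattr__(self, attr):
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)

json = LazyModule("json")    # nothing is imported yet, so no cycle
print(json.dumps({"a": 1}))  # the real import happens here, on first use
```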
+
+base_layer = LazyLoader("base_layer", globals(), "keras.engine.base_layer")
+training_lib = LazyLoader("training_lib", globals(), "keras.engine.training")
+metrics = LazyLoader("metrics", globals(), "keras.metrics")
+base_rnn = LazyLoader("base_rnn", globals(), "keras.layers.rnn.base_rnn")
+
+
+class SerializedAttributes:
+    """Class that tracks and validates all serialization attributes.
+
+    Keras models contain many Python-defined components. For example, the
+    trainable_variable property lists the model's trainable variables by
+    recursively retrieving the trainable variables from each of the child
+    layers. Another example is model.call, a python function that calls child
+    layers and adds ops to the backend graph.
+
+    Only TensorFlow checkpointable objects and functions can be serialized to
+    SavedModel. Serializing a Keras model as-is results in a checkpointable
+    object that does not resemble a Keras model at all. Thus, extra
+    checkpointable objects and functions must be created during serialization.
+
+    **Defining new serialized attributes**
+    Child classes should be defined using:
+    SerializedAttributes.with_attributes(
+        'name', checkpointable_objects=[...],
+        functions=[...], copy_from=[...])
+    This class is used to cache generated checkpointable objects and
+    functions, ensuring that new objects and functions are generated a single
+    time.
+
+    **Usage during serialization**
+    Each Layer/Model object should have a corresponding instance of
+    SerializedAttributes. Create a new instance by calling
+    `SerializedAttributes.new(obj)`. Objects and functions may be saved using
+    `.set_and_validate_objects`/`.set_and_validate_functions`.
+    The properties `.checkpointable_objects` and `.functions` return the
+    cached values.
+
+    **Adding/changing attributes to save to SavedModel**
+    1. Change the call to `SerializedAttributes.with_attributes` in the
+       correct class:
+       - CommonEndpoints: Base attributes to be added during serialization. If
+         these attributes are present in a Trackable object, it can be
+         deserialized to a Keras Model.
+       - LayerAttributes: Attributes to serialize for Layer objects.
+       - ModelAttributes: Attributes to serialize for Model objects.
+    2. Update the class docstring.
+    3. Update arguments to any calls to `set_and_validate_*`. For example, if
+       `call_raw_tensors` is added to the ModelAttributes function list, then
+       a `call_raw_tensors` function should be passed to
+       `set_and_validate_functions`.
+
+    **Common endpoints vs other attributes**
+    Only common endpoints are attached directly to the root object.
+    Keras-specific attributes are saved to a separate trackable object with
+    the name "keras_api". The number of objects attached to the root is
+    limited because any naming conflicts will cause user code to break.
+
+    Another reason is that this will only affect users who call
+    `tf.saved_model.load` instead of `tf.keras.models.load_model`. These are
+    advanced users who are likely to have defined their own tf.functions and
+    trackable objects. The added Keras-specific attributes are kept out of the
+    way in the "keras_api" namespace.
+
+    Properties defined in this class may be used to filter out keras-specific
+    attributes:
+    - `functions_to_serialize`: Returns dict of functions to attach to the
+      root object.
+ - `checkpointable_objects_to_serialize`: Returns dict of objects to attach + to the root object (including separate trackable object containing + keras-specific attributes) + + All changes to the serialized attributes must be backwards-compatible, so + attributes should not be removed or modified without sufficient + justification. + """ + + @staticmethod + def with_attributes( + name, checkpointable_objects=None, functions=None, copy_from=None + ): + """Creates a subclass with all attributes as specified in the arguments. + + Args: + name: Name of subclass + checkpointable_objects: List of checkpointable objects to be + serialized in the SavedModel. + functions: List of functions to be serialized in the SavedModel. + copy_from: List of other SerializedAttributes subclasses. The returned + class will copy checkpoint objects/functions from each subclass. + + Returns: + Child class with attributes as defined in the `checkpointable_objects` + and `functions` lists. + """ + checkpointable_objects = checkpointable_objects or [] + functions = functions or [] + + if copy_from is not None: + for cls in copy_from: + checkpointable_objects.extend(cls.all_checkpointable_objects) + functions.extend(cls.all_functions) + + # OrderPreservingSets are used here to guarantee serialization + # determinism of Keras objects. + classdict = { + "all_checkpointable_objects": ops.OrderPreservingSet( + checkpointable_objects + ), + "all_functions": ops.OrderPreservingSet(functions), + } + return type(name, (SerializedAttributes,), classdict) + + @staticmethod + def new(obj): + """Returns a new SerializedAttribute object.""" + if isinstance(obj, training_lib.Model): + return ModelAttributes() + elif isinstance(obj, metrics.Metric): + return MetricAttributes() + elif isinstance(obj, base_rnn.RNN): + return RNNAttributes() + elif isinstance(obj, base_layer.Layer): + return LayerAttributes() + else: + raise TypeError( + "Internal error during serialization. Expected Keras " + f"Layer object. 
Received: {obj} "
+                f"(of type {type(obj)})"
+            )
+
+    def __init__(self):
+        self._object_dict = {}
+        self._function_dict = {}
+        self._keras_trackable = tf.__internal__.tracking.AutoTrackable()
+
+    @property
+    def functions(self):
+        """Returns dictionary of all functions."""
+        return {
+            key: value
+            for key, value in self._function_dict.items()
+            if value is not None
+        }
+
+    @property
+    def checkpointable_objects(self):
+        """Returns dictionary of all checkpointable objects."""
+        return {
+            key: value
+            for key, value in self._object_dict.items()
+            if value is not None
+        }
+
+    @property
+    def functions_to_serialize(self):
+        """Returns functions to attach to the root object during
+        serialization."""
+        functions = {}
+        for key, v in self.functions.items():
+            if key in CommonEndpoints.all_functions:
+                functions[key] = (
+                    v.wrapped_call if isinstance(v, save_impl.LayerCall) else v
+                )
+        return functions
+
+    @property
+    def objects_to_serialize(self):
+        """Returns objects to attach to the root object during
+        serialization."""
+        objects = {
+            key: value
+            for key, value in self.checkpointable_objects.items()
+            if key in CommonEndpoints.all_checkpointable_objects
+        }
+        objects[constants.KERAS_ATTR] = self._keras_trackable
+        return objects
+
+    def set_and_validate_functions(self, function_dict):
+        """Saves function dictionary, and validates dictionary values."""
+        for key in self.all_functions:
+            if key in function_dict:
+                # Not all functions are required.
+                if function_dict[key] is not None and not isinstance(
+                    function_dict[key],
+                    (
+                        tf.__internal__.function.Function,
+                        tf.types.experimental.ConcreteFunction,
+                        save_impl.LayerCall,
+                    ),
+                ):
+                    raise ValueError(
+                        "The tf.function dictionary contained a non-function "
+                        f"object: {function_dict[key]} (for key {key}). Only "
+                        "tf.function instances or ConcreteFunction instances "
+                        "should be passed."
+                    )
+                fn = function_dict[key]
+                self._function_dict[key] = fn
+
+                # Extract TensorFlow `Function` from LayerCall.
+                tf_fn = (
+                    fn.wrapped_call
+                    if isinstance(fn, save_impl.LayerCall)
+                    else fn
+                )
+                setattr(self._keras_trackable, key, tf_fn)
+            else:
+                raise ValueError(
+                    f"Function {key} missing from serialized "
+                    "tf.function dictionary."
+                )
+        return self.functions
+
+    def set_and_validate_objects(self, object_dict):
+        """Saves objects to a dictionary, and validates the values."""
+        for key in self.all_checkpointable_objects:
+            if key in object_dict:
+                if not isinstance(
+                    object_dict[key], tf.__internal__.tracking.Trackable
+                ):
+                    raise ValueError(
+                        "The object dictionary contained a non-trackable "
+                        f"object: {object_dict[key]} (for key {key}). "
+                        "Only trackable objects are "
+                        "allowed, such as Keras layers/models or "
+                        "tf.Module instances."
+                    )
+                self._object_dict[key] = object_dict[key]
+                setattr(self._keras_trackable, key, object_dict[key])
+            else:
+                raise ValueError(
+                    f"Object {key} missing from serialized object dictionary."
+                )
+        return self.checkpointable_objects
+
+
+class CommonEndpoints(
+    SerializedAttributes.with_attributes(
+        "CommonEndpoints",
+        checkpointable_objects=[
+            "variables",
+            "trainable_variables",
+            "regularization_losses",
+        ],
+        functions=[
+            "__call__",
+            "call_and_return_all_conditional_losses",
+            "_default_save_signature",
+        ],
+    )
+):
+    """Common endpoints shared by all models loadable by Keras.
+
+    List of all attributes:
+      variables: List of all variables in the model and its sublayers.
+      trainable_variables: List of all trainable variables in the model and
+        its sublayers.
+ regularization_losses: List of all unconditional losses (losses not + dependent on the inputs) in the model and its sublayers. + __call__: Function that takes inputs and returns the outputs of the model + call function. + call_and_return_all_conditional_losses: Function that returns a tuple of + (call function outputs, list of all losses that depend on the inputs). + _default_save_signature: Traced model call function. This is only included + if the top level exported object is a Keras model. + """ + + +class LayerAttributes( + SerializedAttributes.with_attributes( + "LayerAttributes", + checkpointable_objects=[ + "non_trainable_variables", + "layers", + "metrics", + "layer_regularization_losses", + "layer_metrics", + ], + functions=[ + "call_and_return_conditional_losses", + "activity_regularizer_fn", + ], + copy_from=[CommonEndpoints], + ) +): + """Layer checkpointable objects + functions saved to the SavedModel. + + List of all attributes: + All attributes from CommonEndpoints + non_trainable_variables: List of non-trainable variables in the layer and + its sublayers. + layers: List of all sublayers. + metrics: List of all metrics in the layer and its sublayers. + call_and_return_conditional_losses: Function that takes inputs and returns + a tuple of (outputs of the call function, list of input-dependent + losses). The list of losses excludes the activity regularizer function, + which is separate to allow the deserialized Layer object to define a + different activity regularizer. + activity_regularizer_fn: Callable that returns the activity regularizer + loss + layer_regularization_losses: List of losses owned only by this layer. + layer_metrics: List of metrics owned by this layer. + """ + + +class ModelAttributes( + SerializedAttributes.with_attributes( + "ModelAttributes", copy_from=[LayerAttributes] + ) +): + """Model checkpointable objects + functions saved to the SavedModel. + + List of all attributes: + All attributes from LayerAttributes (including CommonEndpoints) + """ + + # TODO(kathywu): Add attributes `compile_losses` and `compile_metrics`, + # which list all losses and metrics defined by `model.compile`. + + +class MetricAttributes( + SerializedAttributes.with_attributes( + "MetricAttributes", + checkpointable_objects=["variables"], + functions=[], + ) +): + """Attributes that are added to Metric objects when saved to SavedModel. + + List of all attributes: + variables: list of all variables + """ + + pass + + +class RNNAttributes( + SerializedAttributes.with_attributes( + "RNNAttributes", + checkpointable_objects=["states"], + copy_from=[LayerAttributes], + ) +): + """RNN checkpointable objects + functions that are saved to the SavedModel. + + List of all attributes: + All attributes from LayerAttributes (including CommonEndpoints) + states: List of state variables + """ diff --git a/keras/saving/legacy/saved_model/utils.py b/keras/saving/legacy/saved_model/utils.py new file mode 100644 index 000000000000..62c49f7785b1 --- /dev/null +++ b/keras/saving/legacy/saved_model/utils.py @@ -0,0 +1,289 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility functions shared between SavedModel saving/loading +implementations.""" + +import copy +import itertools +import threading +import types + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.engine import base_layer_utils +from keras.utils import control_flow_util +from keras.utils import tf_contextlib +from keras.utils.generic_utils import LazyLoader +from keras.utils.layer_utils import CallFunctionSpec + +training_lib = LazyLoader("training_lib", globals(), "keras.engine.training") + + +def use_wrapped_call( + layer, call_fn, call_spec, default_training_value=None, return_method=False +): + """Creates fn that adds losses returned by call_fn & returns the outputs. + + Args: + layer: A Keras layer object + call_fn: tf.function that takes layer inputs (and possibly a training + arg), and returns a tuple of (outputs, list of losses). + call_spec: The `CallFunctionSpec` for the layer's call function. + default_training_value: Default value of the training kwarg. If `None`, + the default is `tf.keras.backend.learning_phase()`. + return_method: Whether to return a method bound to the layer. + + Returns: + function that calls call_fn and returns the outputs. Losses returned by + call_fn are added to the layer losses. + """ + expects_training_arg = layer_uses_training_bool(layer) + + fn, arg_spec = maybe_add_training_arg( + call_spec, call_fn, expects_training_arg, default_training_value + ) + + def return_outputs_and_add_losses(*args, **kwargs): + """Returns the outputs from the layer call function, and adds the + losses.""" + if return_method: + args = args[1:] + + outputs, losses = fn(*args, **kwargs) + layer.add_loss(losses) + + # TODO(kathywu): This is a temporary hack. When a network of layers is + # revived from SavedModel, only the top-level layer will have losses. + # This causes issues in eager mode because the child layers may have + # graph losses (thus model.losses returns a mix of Eager and graph + # tensors). To fix this, whenever eager losses are added to one layer, + # add eager losses to all child layers. This causes `.losses` to only + # return eager losses. 
+
+        if tf.executing_eagerly():
+            for i in layer._flatten_layers():
+                if i is not layer:
+                    i._eager_losses = [
+                        base_layer_utils.REVIVED_LOSS_PLACEHOLDER
+                    ]
+
+        return outputs
+
+    decorated = tf.__internal__.decorator.make_decorator(
+        target=call_fn,
+        decorator_func=return_outputs_and_add_losses,
+        decorator_argspec=arg_spec,
+    )
+
+    if return_method:
+        return types.MethodType(decorated, layer)
+    else:
+        return decorated
+
+
+def layer_uses_training_bool(layer):
+    """Returns whether this layer or any of its children uses the training
+    arg."""
+    if layer._expects_training_arg:
+        return True
+    visited = {layer}
+    to_visit = list_all_layers(layer)
+    while to_visit:
+        layer = to_visit.pop()
+        if layer in visited:
+            continue
+        if getattr(layer, "_expects_training_arg", True):
+            return True
+        visited.add(layer)
+        to_visit.extend(list_all_layers(layer))
+    return False
+
+
+def list_all_layers(obj):
+    if isinstance(obj, training_lib.Model):
+        # Handle special case of Sequential, which doesn't return
+        # the `Input` layer.
+        return obj.layers
+    else:
+        return list(obj._flatten_layers(include_self=False, recursive=False))
+
+
+def list_all_layers_and_sublayers(obj):
+    s = set([obj])
+    s.update(
+        itertools.chain.from_iterable(
+            list_all_layers_and_sublayers(layer)
+            for layer in list_all_layers(obj)
+        )
+    )
+    return s
+
+
+def maybe_add_training_arg(
+    call_spec, wrapped_call, expects_training_arg, default_training_value
+):
+    """Decorates `call` and optionally adds a training argument.
+
+    If a layer expects a training argument, this function ensures that
+    'training' is present in the layer args or kwonly args, with the default
+    training value.
+
+    Args:
+        call_spec: CallFunctionSpec of the layer.
+        wrapped_call: Wrapped call function.
+        expects_training_arg: Whether to include 'training' argument.
+        default_training_value: Default value of the training kwarg to include
+            in the arg spec. If `None`, the default is
+            `tf.keras.backend.learning_phase()`.
+
+    Returns:
+        Tuple of (
+            function that calls `wrapped_call` and sets the training arg,
+            Argspec of returned function or `None` if the argspec is
+            unchanged)
+    """
+    if not expects_training_arg:
+        return wrapped_call, None
+
+    arg_spec = set_training_arg_spec(
+        call_spec.full_argspec, default_training_value
+    )
+    call_spec = CallFunctionSpec(arg_spec)
+
+    def wrap_with_training_arg(*args, **kwargs):
+        """Wraps the `wrapped_call` function, and sets the training
+        argument."""
+        try:
+            training = call_spec.get_arg_value(
+                "training", args, kwargs, inputs_in_args=True
+            )
+        except KeyError:
+            training = None
+
+        if training is None:
+            training = (
+                default_training_value
+                or base_layer_utils.call_context().training
+                or backend.learning_phase()
+            )
+
+        args = list(args)
+        kwargs = kwargs.copy()
+
+        def replace_training_and_call(training):
+            new_args, new_kwargs = call_spec.set_arg_value(
+                "training", training, args, kwargs, inputs_in_args=True
+            )
+            return wrapped_call(*new_args, **new_kwargs)
+
+        return control_flow_util.smart_cond(
+            training,
+            lambda: replace_training_and_call(True),
+            lambda: replace_training_and_call(False),
+        )
+
+    return wrap_with_training_arg, arg_spec
+
+
+def set_training_arg_spec(arg_spec, default_training_value):
+    """Sets the `training=DEFAULT` argument in an ArgSpec."""
+    if "training" in arg_spec.args:
+        # If `training` is already in the args list, try to set the default
+        # value.
+        index = arg_spec.args.index("training")
+        training_default_index = len(arg_spec.args) - index
+        defaults = (
+            list(arg_spec.defaults) if arg_spec.defaults is not None else []
+        )
+        if (
+            arg_spec.defaults
+            and len(arg_spec.defaults) >= training_default_index
+            and defaults[-training_default_index] is None
+        ):
+            defaults[-training_default_index] = default_training_value
+            return arg_spec._replace(defaults=defaults)
+    elif "training" not in arg_spec.kwonlyargs:
+        kwonlyargs = arg_spec.kwonlyargs + ["training"]
+        kwonlydefaults = copy.copy(arg_spec.kwonlydefaults) or {}
+        kwonlydefaults["training"] = default_training_value
+        return arg_spec._replace(
+            kwonlyargs=kwonlyargs, kwonlydefaults=kwonlydefaults
+        )
+
+    return arg_spec
+
+
+class SaveOptionsContext(threading.local):
+    def __init__(self):
+        super().__init__()
+        self.save_traces = True
+        self.in_tf_saved_model_scope = False
+
+
+_save_options_context = SaveOptionsContext()
+
+
+@tf_contextlib.contextmanager
+def keras_option_scope(save_traces, in_tf_saved_model_scope=True):
+    save_traces_previous_value = _save_options_context.save_traces
+    in_scope_previous_value = _save_options_context.in_tf_saved_model_scope
+    try:
+        _save_options_context.save_traces = save_traces
+        _save_options_context.in_tf_saved_model_scope = in_tf_saved_model_scope
+        yield
+    finally:
+        _save_options_context.save_traces = save_traces_previous_value
+        _save_options_context.in_tf_saved_model_scope = in_scope_previous_value
+
+
+def should_save_traces():
+    """Whether to trace layer functions. Can be disabled with the
+    `save_traces` arg."""
+    return _save_options_context.save_traces
+
+
+def in_tf_saved_model_scope():
+    return _save_options_context.in_tf_saved_model_scope
+
+
+@tf_contextlib.contextmanager
+def no_automatic_dependency_tracking_scope(obj):
+    """Context that disables automatic dependency tracking when assigning
+    attrs.
+
+    Objects that inherit from Autotrackable automatically create dependencies
+    on trackable objects through attribute assignments, and wrap data
+    structures (lists or dicts) with trackable classes. This scope may be used
+    to temporarily disable this behavior. This works similarly to the
+    decorator `no_automatic_dependency_tracking`.
+
+    Example usage:
+    ```
+    model = tf.keras.Model()
+    model.arr1 = []  # Creates a ListWrapper object
+    with no_automatic_dependency_tracking_scope(model):
+        model.arr2 = []  # Creates a regular, untracked python list
+    ```
+
+    Args:
+        obj: A trackable object.
+
+    Yields:
+        a scope in which the object doesn't track dependencies.
+    """
+    previous_value = getattr(obj, "_setattr_tracking", True)
+    obj._setattr_tracking = False
+    try:
+        yield
+    finally:
+        obj._setattr_tracking = previous_value
diff --git a/keras/saving/legacy/saving_utils.py b/keras/saving/legacy/saving_utils.py
new file mode 100644
index 000000000000..3522f2214bef
--- /dev/null
+++ b/keras/saving/legacy/saving_utils.py
@@ -0,0 +1,371 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# ============================================================================== +"""Utils related to keras model saving.""" + +import copy +import os + +import tensorflow.compat.v2 as tf + +import keras +from keras import backend +from keras import losses +from keras import optimizers +from keras.engine import base_layer_utils +from keras.optimizers import optimizer_v1 +from keras.saving.legacy import serialization +from keras.utils import version_utils +from keras.utils.io_utils import ask_to_proceed_with_overwrite + +# isort: off +from tensorflow.python.platform import tf_logging as logging + + +def extract_model_metrics(model): + """Convert metrics from a Keras model `compile` API to dictionary. + + This is used for converting Keras models to Estimators and SavedModels. + + Args: + model: A `tf.keras.Model` object. + + Returns: + Dictionary mapping metric names to metric instances. May return `None` if + the model does not contain any metrics. + """ + if getattr(model, "_compile_metrics", None): + # TODO(psv/kathywu): use this implementation in model to estimator flow. + # We are not using model.metrics here because we want to exclude the + # metrics added using `add_metric` API. + return {m.name: m for m in model._compile_metric_functions} + return None + + +def model_call_inputs(model, keep_original_batch_size=False): + """Inspect model to get its input signature. + + The model's input signature is a list with a single (possibly-nested) + object. This is due to the Keras-enforced restriction that tensor inputs + must be passed in as the first argument. + + For example, a model with input {'feature1': , 'feature2': } + will have input signature: + [{'feature1': TensorSpec, 'feature2': TensorSpec}] + + Args: + model: Keras Model object. + keep_original_batch_size: A boolean indicating whether we want to keep + using the original batch size or set it to None. Default is `False`, + which means that the batch dim of the returned input signature will + always be set to `None`. + + Returns: + A tuple containing `(args, kwargs)` TensorSpecs of the model call function + inputs. + `kwargs` does not contain the `training` argument. + """ + input_specs = model.save_spec(dynamic_batch=not keep_original_batch_size) + if input_specs is None: + return None, None + input_specs = _enforce_names_consistency(input_specs) + return input_specs + + +def raise_model_input_error(model): + if isinstance(model, keras.models.Sequential): + raise ValueError( + f"Model {model} cannot be saved because the input shape is not " + "available. Please specify an input shape either by calling " + "`build(input_shape)` directly, or by calling the model on actual " + "data using `Model()`, `Model.fit()`, or `Model.predict()`." + ) + + # If the model is not a `Sequential`, it is intended to be a subclassed + # model. + raise ValueError( + f"Model {model} cannot be saved either because the input shape is not " + "available or because the forward pass of the model is not defined." + "To define a forward pass, please override `Model.call()`. To specify " + "an input shape, either call `build(input_shape)` directly, or call " + "the model on actual data using `Model()`, `Model.fit()`, or " + "`Model.predict()`. If you have a custom training step, please make " + "sure to invoke the forward pass in train step through " + "`Model.__call__`, i.e. `model(inputs)`, as opposed to `model.call()`." 
+ ) + + +def trace_model_call(model, input_signature=None): + """Trace the model call to create a tf.function for exporting a Keras model. + + Args: + model: A Keras model. + input_signature: optional, a list of tf.TensorSpec objects specifying the + inputs to the model. + + Returns: + A tf.function wrapping the model's call function with input signatures + set. + + Raises: + ValueError: if input signature cannot be inferred from the model. + """ + if input_signature is None: + if isinstance(model.call, tf.__internal__.function.Function): + input_signature = model.call.input_signature + + if input_signature: + model_args = input_signature + model_kwargs = {} + else: + model_args, model_kwargs = model_call_inputs(model) + + if model_args is None: + raise_model_input_error(model) + + @tf.function + def _wrapped_model(*args, **kwargs): + """A concrete tf.function that wraps the model's call function.""" + (args, kwargs,) = model._call_spec.set_arg_value( + "training", False, args, kwargs, inputs_in_args=True + ) + + with base_layer_utils.call_context().enter( + model, inputs=None, build_graph=False, training=False, saving=True + ): + outputs = model(*args, **kwargs) + + # Outputs always have to be a flat dict. + output_names = model.output_names # Functional Model. + if output_names is None: # Subclassed Model. + from keras.engine import compile_utils + + output_names = compile_utils.create_pseudo_output_names(outputs) + outputs = tf.nest.flatten(outputs) + return {name: output for name, output in zip(output_names, outputs)} + + return _wrapped_model.get_concrete_function(*model_args, **model_kwargs) + + +def model_metadata(model, include_optimizer=True, require_config=True): + """Returns a dictionary containing the model metadata.""" + from keras import __version__ as keras_version + from keras.optimizers.legacy import optimizer_v2 + + model_config = {"class_name": model.__class__.__name__} + try: + model_config["config"] = model.get_config() + except NotImplementedError as e: + if require_config: + raise e + + metadata = dict( + keras_version=str(keras_version), + backend=backend.backend(), + model_config=model_config, + ) + if model.optimizer and include_optimizer: + if isinstance(model.optimizer, optimizer_v1.TFOptimizer): + logging.warning( + "TensorFlow optimizers do not " + "make it possible to access " + "optimizer attributes or optimizer state " + "after instantiation. " + "As a result, we cannot save the optimizer " + "as part of the model save file. " + "You will have to compile your model again after loading it. " + "Prefer using a Keras optimizer instead " + "(see keras.io/optimizers)." + ) + elif model._compile_was_called: + training_config = model._get_compile_args(user_metrics=False) + training_config.pop("optimizer", None) # Handled separately. + metadata["training_config"] = _serialize_nested_config( + training_config + ) + if isinstance(model.optimizer, optimizer_v2.RestoredOptimizer): + raise NotImplementedError( + "Optimizers loaded from a SavedModel cannot be saved. " + "If you are calling `model.save` or " + "`tf.keras.models.save_model`, " + "please set the `include_optimizer` option to `False`. For " + "`tf.saved_model.save`, " + "delete the optimizer from the model."
+ ) + else: + optimizer_config = { + "class_name": keras.utils.get_registered_name( + model.optimizer.__class__ + ), + "config": model.optimizer.get_config(), + } + metadata["training_config"]["optimizer_config"] = optimizer_config + return metadata + + +def should_overwrite(filepath, overwrite): + """Returns whether the filepath should be overwritten.""" + # If file exists and should not be overwritten. + if not overwrite and os.path.isfile(filepath): + return ask_to_proceed_with_overwrite(filepath) + return True + + +def compile_args_from_training_config(training_config, custom_objects=None): + """Return model.compile arguments from training config.""" + if custom_objects is None: + custom_objects = {} + + with keras.utils.CustomObjectScope(custom_objects): + optimizer_config = training_config["optimizer_config"] + optimizer = optimizers.deserialize(optimizer_config) + + # Recover losses. + loss = None + loss_config = training_config.get("loss", None) + if loss_config is not None: + loss = _deserialize_nested_config(losses.deserialize, loss_config) + + # Recover metrics. + metrics = None + metrics_config = training_config.get("metrics", None) + if metrics_config is not None: + metrics = _deserialize_nested_config( + _deserialize_metric, metrics_config + ) + + # Recover weighted metrics. + weighted_metrics = None + weighted_metrics_config = training_config.get("weighted_metrics", None) + if weighted_metrics_config is not None: + weighted_metrics = _deserialize_nested_config( + _deserialize_metric, weighted_metrics_config + ) + + sample_weight_mode = ( + training_config["sample_weight_mode"] + if "sample_weight_mode" in training_config + else None + ) + loss_weights = training_config["loss_weights"] + + return dict( + optimizer=optimizer, + loss=loss, + metrics=metrics, + weighted_metrics=weighted_metrics, + loss_weights=loss_weights, + sample_weight_mode=sample_weight_mode, + ) + + +def _deserialize_nested_config(deserialize_fn, config): + """Deserializes arbitrary Keras `config` using `deserialize_fn`.""" + + def _is_single_object(obj): + if isinstance(obj, dict) and "class_name" in obj: + return True # Serialized Keras object. + if isinstance(obj, str): + return True # Serialized function or string. + return False + + if config is None: + return None + if _is_single_object(config): + return deserialize_fn(config) + elif isinstance(config, dict): + return { + k: _deserialize_nested_config(deserialize_fn, v) + for k, v in config.items() + } + elif isinstance(config, (tuple, list)): + return [ + _deserialize_nested_config(deserialize_fn, obj) for obj in config + ] + + raise ValueError( + "Saved configuration not understood. Configuration should be a " + f"dictionary, string, tuple or list. Received: config={config}." + ) + + +def _serialize_nested_config(config): + """Serializes a nested structure of Keras objects.""" + + def _serialize_fn(obj): + if callable(obj): + return serialization.serialize_keras_object(obj) + return obj + + return tf.nest.map_structure(_serialize_fn, config) + + +def _deserialize_metric(metric_config): + """Deserialize metrics, leaving special strings untouched.""" + from keras import metrics as metrics_module + + if metric_config in ["accuracy", "acc", "crossentropy", "ce"]: + # Do not deserialize accuracy and cross-entropy strings as we have + # special case handling for these in compile, based on model output + # shape.
+ return metric_config + return metrics_module.deserialize(metric_config) + + +def _enforce_names_consistency(specs): + """Enforces that either all specs have names or none do.""" + + def _has_name(spec): + return spec is None or (hasattr(spec, "name") and spec.name is not None) + + def _clear_name(spec): + spec = copy.deepcopy(spec) + if hasattr(spec, "name"): + spec._name = None + return spec + + flat_specs = tf.nest.flatten(specs) + name_inconsistency = any(_has_name(s) for s in flat_specs) and not all( + _has_name(s) for s in flat_specs + ) + + if name_inconsistency: + specs = tf.nest.map_structure(_clear_name, specs) + return specs + + +def try_build_compiled_arguments(model): + if ( + not version_utils.is_v1_layer_or_model(model) + and model.outputs is not None + ): + try: + if not model.compiled_loss.built: + model.compiled_loss.build(model.outputs) + if not model.compiled_metrics.built: + model.compiled_metrics.build(model.outputs, model.outputs) + except: # noqa: E722 + logging.warning( + "Compiled the loaded model, but the compiled metrics have " + "yet to be built. `model.compile_metrics` will be empty " + "until you train or evaluate the model." + ) + + +def is_hdf5_filepath(filepath): + return ( + filepath.endswith(".h5") + or filepath.endswith(".keras") + or filepath.endswith(".hdf5") + ) diff --git a/keras/saving/legacy/saving_utils_test.py b/keras/saving/legacy/saving_utils_test.py new file mode 100644 index 000000000000..3a34783f45e5 --- /dev/null +++ b/keras/saving/legacy/saving_utils_test.py @@ -0,0 +1,553 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
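A brief sketch of how the `saving_utils` helpers above compose; the model, shapes, and printed names are illustrative:

```python
import tensorflow as tf
from keras.saving.legacy import saving_utils

# A small functional model; functional models always carry a save spec.
inp = tf.keras.Input(shape=(5,))
out = tf.keras.layers.Dense(3, name="dense")(inp)
model = tf.keras.Model(inp, out)
model.compile(optimizer="sgd", loss="mse")

# trace_model_call returns a ConcreteFunction whose outputs form a flat
# dict keyed by output name, with `training` pinned to False.
fn = saving_utils.trace_model_call(model)
print(fn(tf.ones((2, 5))))  # e.g. {'dense': <tf.Tensor shape=(2, 3) ...>}

# model_metadata records keras_version, backend, and model_config, plus a
# training_config because `compile` was called above.
meta = saving_utils.model_metadata(model)
print(sorted(meta.keys()))
```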
+# ============================================================================== +"""Tests for saving utility functions.""" + +import os + +import numpy as np +import tensorflow.compat.v2 as tf + +import keras +from keras import backend +from keras.engine import sequential +from keras.feature_column import dense_features +from keras.optimizers.legacy import gradient_descent +from keras.saving.legacy import saving_utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + + +class TraceModelCallTest(test_combinations.TestCase): + def _assert_all_close(self, expected, actual): + if not tf.executing_eagerly(): + with self.cached_session() as sess: + backend._initialize_variables(sess) + self.assertAllClose(expected, actual) + else: + self.assertAllClose(expected, actual) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_trace_model_outputs(self): + input_dim = 5 if test_utils.get_model_type() == "functional" else None + model = test_utils.get_small_mlp(10, 3, input_dim) + inputs = tf.ones((8, 5)) + + if input_dim is None: + with self.assertRaisesRegex( + ValueError, ".*input shape is not availabl*" + ): + saving_utils.trace_model_call(model) + model._set_inputs(inputs) + + fn = saving_utils.trace_model_call(model) + signature_outputs = fn(inputs) + if model.output_names: + expected_outputs = {model.output_names[0]: model(inputs)} + else: + expected_outputs = {"output_1": model(inputs)} + + self._assert_all_close(expected_outputs, signature_outputs) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_trace_model_outputs_after_fitting(self): + input_dim = 5 if test_utils.get_model_type() == "functional" else None + model = test_utils.get_small_mlp(10, 3, input_dim) + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + x=np.random.random((8, 5)).astype(np.float32), + y=np.random.random((8, 3)).astype(np.float32), + epochs=2, + ) + + inputs = tf.ones((8, 5)) + + fn = saving_utils.trace_model_call(model) + signature_outputs = fn(inputs) + if model.output_names: + expected_outputs = {model.output_names[0]: model(inputs)} + else: + expected_outputs = {"output_1": model(inputs)} + + self._assert_all_close(expected_outputs, signature_outputs) + + @test_combinations.run_with_all_model_types(exclude_models="sequential") + @test_combinations.run_all_keras_modes + def test_trace_multi_io_model_outputs(self): + input_dim = 5 + num_classes = 3 + num_classes_b = 4 + input_a = keras.layers.Input(shape=(input_dim,), name="input_a") + input_b = keras.layers.Input(shape=(input_dim,), name="input_b") + + dense = keras.layers.Dense(num_classes, name="dense") + dense2 = keras.layers.Dense(num_classes_b, name="dense2") + dropout = keras.layers.Dropout(0.5, name="dropout") + branch_a = [input_a, dense] + branch_b = [input_b, dense, dense2, dropout] + + model = test_utils.get_multi_io_model(branch_a, branch_b) + + input_a_ts = tf.constant( + np.random.random((10, input_dim)).astype(np.float32) + ) + input_b_ts = tf.constant( + np.random.random((10, input_dim)).astype(np.float32) + ) + + if test_utils.get_model_type() == "subclass": + with self.assertRaisesRegex( + ValueError, ".*input shape is not availabl*" + ): + saving_utils.trace_model_call(model) + + model.compile( + optimizer="sgd", + loss="mse", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + x=[ + np.random.random((8, 
input_dim)).astype(np.float32), + np.random.random((8, input_dim)).astype(np.float32), + ], + y=[ + np.random.random((8, num_classes)).astype(np.float32), + np.random.random((8, num_classes_b)).astype(np.float32), + ], + epochs=2, + ) + + fn = saving_utils.trace_model_call(model) + # tf.function requires that the input structures match when calling a + # ConcreteFunction. For some reason V1 models define the inputs as a + # list, while V2 models set the inputs as a tuple. + if ( + not tf.executing_eagerly() + and test_utils.get_model_type() != "functional" + ): + signature_outputs = fn([input_a_ts, input_b_ts]) + else: + signature_outputs = fn((input_a_ts, input_b_ts)) + outputs = model([input_a_ts, input_b_ts]) + if model.output_names: + expected_outputs = { + model.output_names[0]: outputs[0], + model.output_names[1]: outputs[1], + } + else: + expected_outputs = {"output_1": outputs[0], "output_2": outputs[1]} + self._assert_all_close(expected_outputs, signature_outputs) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_trace_features_layer(self): + columns = [tf.feature_column.numeric_column("x")] + model = sequential.Sequential([dense_features.DenseFeatures(columns)]) + model_input = {"x": tf.constant([[1.0]])} + model.predict(model_input, steps=1) + fn = saving_utils.trace_model_call(model) + self.assertAllClose({"output_1": [[1.0]]}, fn(model_input)) + + columns = [ + tf.feature_column.numeric_column("x"), + tf.feature_column.numeric_column("y"), + ] + model = sequential.Sequential([dense_features.DenseFeatures(columns)]) + model_input = {"x": tf.constant([[1.0]]), "y": tf.constant([[2.0]])} + model.predict(model_input, steps=1) + fn = saving_utils.trace_model_call(model) + self.assertAllClose({"output_1": [[1.0, 2.0]]}, fn(model_input)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_specify_input_signature(self): + model = test_utils.get_small_sequential_mlp(10, 3, None) + inputs = tf.ones((8, 5)) + + with self.assertRaisesRegex( + ValueError, ".*input shape is not availabl*" + ): + saving_utils.trace_model_call(model) + + fn = saving_utils.trace_model_call( + model, [tf.TensorSpec(shape=[None, 5], dtype=tf.float32)] + ) + signature_outputs = fn(inputs) + if model.output_names: + expected_outputs = {model.output_names[0]: model(inputs)} + else: + expected_outputs = {"output_1": model(inputs)} + self._assert_all_close(expected_outputs, signature_outputs) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_subclassed_model_with_input_signature(self): + class Model(keras.Model): + def __init__(self): + super().__init__() + self.dense = keras.layers.Dense(3, name="dense") + + @tf.function( + input_signature=[ + [ + tf.TensorSpec([None, 5], tf.float32), + tf.TensorSpec([None], tf.float32), + ] + ], + ) + def call(self, inputs, *args): + x, y = inputs + return self.dense(x) + y + + model = Model() + fn = saving_utils.trace_model_call(model) + x = tf.ones((8, 5), dtype=tf.float32) + y = tf.ones((3,), dtype=tf.float32) + expected_outputs = {"output_1": model([x, y])} + signature_outputs = fn([x, y]) + self._assert_all_close(expected_outputs, signature_outputs) + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def test_model_with_fixed_input_dim(self): + """Ensure that the batch_dim is removed when saving. + + When serving or retraining, it is important to reset the batch dim.
+ This can be an issue inside a tf.function. See b/132783590 for context. + """ + model = test_utils.get_small_mlp(10, 3, 5) + + loss_object = keras.losses.MeanSquaredError() + optimizer = gradient_descent.SGD() + + @tf.function + def train_step(data, labels): + with tf.GradientTape() as tape: + predictions = model(data) + loss = loss_object(labels, predictions) + gradients = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(gradients, model.trainable_variables)) + + x = np.random.random((8, 5)) + y = np.random.random((8, 3)) + + train_step(x, y) + + fn = saving_utils.trace_model_call(model) + self.assertEqual( + fn.structured_input_signature[0][0].shape.as_list(), + tf.TensorShape([None, 5]).as_list(), + ) + + +def _import_and_infer(save_dir, inputs): + """Import a SavedModel into a TF 1.x-style graph and run its serving + signature.""" + graph = tf.Graph() + with graph.as_default(), tf.compat.v1.Session() as session: + model = tf.compat.v1.saved_model.load( + session, [tf.saved_model.SERVING], save_dir + ) + signature = model.signature_def[ + tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY + ] + assert set(inputs.keys()) == set( + signature.inputs.keys() + ), f"expected {signature.inputs.keys()}, found {inputs.keys()}" + feed_dict = {} + for arg_name in inputs.keys(): + feed_dict[ + graph.get_tensor_by_name(signature.inputs[arg_name].name) + ] = inputs[arg_name] + output_dict = {} + for output_name, output_tensor_info in signature.outputs.items(): + output_dict[output_name] = graph.get_tensor_by_name( + output_tensor_info.name + ) + return session.run(output_dict, feed_dict=feed_dict) + + +class AutographedMetric(keras.metrics.Metric): + def build(self, input_shape): + pass + + def update_state(self, values): + if tf.constant(False): + x = 1 + else: + x = 2 + return x + + def reset_states(self): + pass + + def result(self): + return tf.constant(0) + + def GetMean(self): + return tf.constant(0) + + def GetCount(self): + return tf.constant(0) + + +class BasicAutographedMetricLayer(keras.layers.Layer): + def build(self, input_shape): + self._metric = AutographedMetric() + + def call(self, inp): + self._metric.update_state(inp) + # TODO(b/172853147): Test control flow here. + return inp + + +class BasicAutographedMetricModel(keras.models.Model): + def __init__(self): + super().__init__(name="test_model") + self._layer = BasicAutographedMetricLayer() + + def call(self, inputs, **kwargs): + return self._layer(inputs) + + +@test_combinations.run_with_all_model_types +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class ModelSaveTest(test_combinations.TestCase): + def test_model_save_preserves_autograph(self): + model = BasicAutographedMetricModel() + inputs = tf.ones((8, 5)) + model._set_inputs(inputs) + + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + tf.saved_model.save(model, save_dir) + + if model.output_names: + output_name = model.output_names[0] + input_name = model.input_names[0] + else: + output_name = "output_1" + input_name = "input_1" + + self.assertAllClose( + {output_name: model.predict_on_batch(inputs)}, + _import_and_infer(save_dir, {input_name: np.ones((8, 5))}), + ) + + # Test v2 loading. + # TODO(mdan): tests using _import_and_infer should uniformly do this.
+ self.assertAllClose( + model.predict_on_batch(inputs), + tf.saved_model.load(save_dir)(inputs), + ) + + def test_model_save(self): + input_dim = 5 + model = test_utils.get_small_mlp(10, 3, input_dim) + inputs = tf.ones((8, 5)) + + if test_utils.get_model_type() == "subclass": + model._set_inputs(inputs) + + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + tf.saved_model.save(model, save_dir) + + if model.output_names: + output_name = model.output_names[0] + input_name = model.input_names[0] + else: + output_name = "output_1" + input_name = "input_1" + + self.assertAllClose( + {output_name: model.predict_on_batch(inputs)}, + _import_and_infer(save_dir, {input_name: np.ones((8, 5))}), + ) + + +class ExtractModelMetricsTest(test_combinations.TestCase): + def test_extract_model_metrics(self): + # saving_utils.extract_model_metrics is used in V1 only API + # keras.experimental.export_saved_model. + with tf.Graph().as_default(): + a = keras.layers.Input(shape=(3,), name="input_a") + b = keras.layers.Input(shape=(3,), name="input_b") + + dense = keras.layers.Dense(4, name="dense") + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name="dropout")(c) + + model = keras.models.Model([a, b], [d, e]) + extract_metrics = saving_utils.extract_model_metrics(model) + self.assertEqual(None, extract_metrics) + + extract_metric_names = [ + "dense_binary_accuracy", + "dropout_binary_accuracy", + "dense_mean_squared_error", + "dropout_mean_squared_error", + ] + if tf.__internal__.tf2.enabled(): + extract_metric_names.extend(["dense_mae", "dropout_mae"]) + else: + extract_metric_names.extend( + ["dense_mean_absolute_error", "dropout_mean_absolute_error"] + ) + + model_metric_names = [ + "loss", + "dense_loss", + "dropout_loss", + ] + extract_metric_names + model.compile( + loss="mae", + metrics=[ + keras.metrics.BinaryAccuracy(), + "mae", + keras.metrics.mean_squared_error, + ], + optimizer=tf.compat.v1.train.RMSPropOptimizer( + learning_rate=0.01 + ), + ) + extract_metrics = saving_utils.extract_model_metrics(model) + self.assertEqual(set(model_metric_names), set(model.metrics_names)) + self.assertEqual( + set(extract_metric_names), set(extract_metrics.keys()) + ) + + +class UnbuiltModelSavingErrorMessageTest(test_combinations.TestCase): + def setUp(self): + super().setUp() + if not tf.__internal__.tf2.enabled(): + self.skipTest("The test does not intend to cover TF1.") + + def test_sequential(self): + model = sequential.Sequential([keras.layers.Dense(10)]) + optimizer = gradient_descent.SGD() + model.compile(optimizer, loss="mse", steps_per_execution=10) + + # Forward pass not called yet. Input shape not available and thus error. + with self.assertRaisesRegex( + ValueError, + "Model.*cannot be saved." + "*specify an input shape either by calling.*", + ): + model.save(os.path.join(self.get_temp_dir(), "my_saved_model")) + + def test_functional(self): + inputs = keras.Input(shape=(32,)) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer="adam", loss="mse", metrics=["mae"]) + + x = np.random.random((1000, 32)) + y = np.random.random((1000, 1)) + model.fit(x, y, epochs=3) + + # Functional model always has an input shape, so should save just fine. 
+ model.save(os.path.join(self.get_temp_dir(), "my_saved_model")) + + def test_subclass_forward_pass_by_layer_underscore_call(self): + class CustomModel(keras.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dense1 = keras.layers.Dense(1) + + def train_step(self, data): + x, y = data + with tf.GradientTape() as tape: + y_pred = self.dense1(x, training=True) + loss = self.compiled_loss(y, y_pred) + + gradients = tape.gradient(loss, self.trainable_variables) + self.optimizer.apply_gradients( + zip(gradients, self.trainable_variables) + ) + return {} + + subclassed_model = CustomModel() + subclassed_model.compile(optimizer="adam", loss="mse") + + x = np.random.random((1000, 32)) + y = np.random.random((1000, 1)) + subclassed_model.fit(x, y, epochs=1) + + # Saving of this subclassed model is supposed to raise an error, even if + # `fit` has been called. This is because the model does not have + # `call()` overridden. Forward pass using `layer.__call__` works for + # training, but saving requires that `call()` be used. + with self.assertRaisesRegex( + ValueError, + r"Model.*cannot be saved.*as opposed to `model.call\(\).*", + ): + subclassed_model.save( + os.path.join(self.get_temp_dir(), "my_saved_model") + ) + + def test_subclass_forward_pass_by_model_call(self): + class CustomModel(keras.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dense1 = keras.layers.Dense(1) + + def call(self, inputs): + return self.dense1(inputs) + + def train_step(self, data): + x, y = data + with tf.GradientTape() as tape: + y_pred = self.call(x) + loss = self.compiled_loss(y, y_pred) + + gradients = tape.gradient(loss, self.trainable_variables) + self.optimizer.apply_gradients( + zip(gradients, self.trainable_variables) + ) + return {} + + subclassed_model = CustomModel() + subclassed_model.compile(optimizer="adam", loss="mse") + + x = np.random.random((1000, 32)) + y = np.random.random((1000, 1)) + subclassed_model.fit(x, y, epochs=1) + + # Saving of this subclassed model is supposed to raise an error, even if + # `fit` has been called. This is because the model has `call()` + # overridden, but the forward pass uses `Model.call` as opposed to + # `Model.__call__`, and as a result the `Model` is not really built. The + # error message hints the user to use `Model.__call__`, i.e., + # `Model(inputs)` instead. + with self.assertRaisesRegex( + ValueError, + r"Model.*cannot be saved.*as opposed to `model.call\(\).*", + ): + subclassed_model.save( + os.path.join(self.get_temp_dir(), "my_saved_model") + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/saving/legacy/serialization.py b/keras/saving/legacy/serialization.py new file mode 100644 index 000000000000..7d55d92f58ca --- /dev/null +++ b/keras/saving/legacy/serialization.py @@ -0,0 +1,570 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
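The error-path tests above boil down to the following behavior; a sketch with illustrative paths:

```python
import os
import tempfile

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(10)])

# No forward pass has run, so the input shape is unknown and saving fails.
try:
    model.save(os.path.join(tempfile.mkdtemp(), "unbuilt_model"))
except ValueError as e:
    print(e)  # "... cannot be saved ... specify an input shape ..."

# A single call on real data (or build/fit/predict) makes it saveable.
model(tf.ones((1, 4)))
model.save(os.path.join(tempfile.mkdtemp(), "built_model"))
```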
+# ============================================================================== +"""Legacy serialization logic for Keras models.""" + +import threading +import weakref + +import tensorflow.compat.v2 as tf + +from keras.utils import tf_contextlib +from keras.utils import tf_inspect + +# isort: off +from tensorflow.python.util.tf_export import keras_export + +# Flag that determines whether to skip the NotImplementedError when calling +# get_config in custom models and layers. This is only enabled when saving to +# SavedModel, when the config isn't required. +_SKIP_FAILED_SERIALIZATION = False +# If a layer does not have a defined config, then the returned config will be a +# dictionary with the below key. +_LAYER_UNDEFINED_CONFIG_KEY = "layer was saved without config" + +# Store a unique, per-object ID for shared objects. +# +# We store a unique ID for each object so that we may, at loading time, +# re-create the network properly. Without this ID, we would have no way of +# determining whether a config is a description of a new object that +# should be created or is merely a reference to an already-created object. +SHARED_OBJECT_KEY = "shared_object_id" + +SHARED_OBJECT_DISABLED = threading.local() +SHARED_OBJECT_LOADING = threading.local() +SHARED_OBJECT_SAVING = threading.local() + + +# Attributes on the threadlocal variable must be set per-thread, thus we +# cannot initialize these globally. Instead, we have accessor functions with +# default values. +def _shared_object_disabled(): + """Get whether shared object handling is disabled in a threadsafe manner.""" + return getattr(SHARED_OBJECT_DISABLED, "disabled", False) + + +def _shared_object_loading_scope(): + """Get the current shared object loading scope in a threadsafe manner.""" + return getattr(SHARED_OBJECT_LOADING, "scope", NoopLoadingScope()) + + +def _shared_object_saving_scope(): + """Get the current shared object saving scope in a threadsafe manner.""" + return getattr(SHARED_OBJECT_SAVING, "scope", None) + + +class DisableSharedObjectScope: + """A context manager for disabling handling of shared objects. + + Disables shared object handling for both saving and loading. + + Created primarily for use with `clone_model`, which does extra surgery that + is incompatible with shared objects. + """ + + def __enter__(self): + SHARED_OBJECT_DISABLED.disabled = True + self._orig_loading_scope = _shared_object_loading_scope() + self._orig_saving_scope = _shared_object_saving_scope() + + def __exit__(self, *args, **kwargs): + SHARED_OBJECT_DISABLED.disabled = False + SHARED_OBJECT_LOADING.scope = self._orig_loading_scope + SHARED_OBJECT_SAVING.scope = self._orig_saving_scope + + +class NoopLoadingScope: + """The default shared object loading scope. It does nothing. + + Created to simplify serialization code that doesn't care about shared + objects (e.g. when serializing a single object). + """ + + def get(self, unused_object_id): + return None + + def set(self, object_id, obj): + pass + + +class SharedObjectLoadingScope: + """A context manager for keeping track of loaded objects. + + During the deserialization process, we may come across objects that are + shared across multiple layers. In order to accurately restore the network + structure to its original state, `SharedObjectLoadingScope` allows us to + re-use shared objects rather than cloning them.
+ """ + + def __enter__(self): + if _shared_object_disabled(): + return NoopLoadingScope() + + global SHARED_OBJECT_LOADING + SHARED_OBJECT_LOADING.scope = self + self._obj_ids_to_obj = {} + return self + + def get(self, object_id): + """Given a shared object ID, returns a previously instantiated object. + + Args: + object_id: shared object ID to use when attempting to find + already-loaded object. + + Returns: + The object, if we've seen this ID before. Else, `None`. + """ + # Explicitly check for `None` internally to make external calling code a + # bit cleaner. + if object_id is None: + return + return self._obj_ids_to_obj.get(object_id) + + def set(self, object_id, obj): + """Stores an instantiated object for future lookup and sharing.""" + if object_id is None: + return + self._obj_ids_to_obj[object_id] = obj + + def __exit__(self, *args, **kwargs): + global SHARED_OBJECT_LOADING + SHARED_OBJECT_LOADING.scope = NoopLoadingScope() + + +class SharedObjectConfig(dict): + """A configuration container that keeps track of references. + + `SharedObjectConfig` will automatically attach a shared object ID to any + configs which are referenced more than once, allowing for proper shared + object reconstruction at load time. + + In most cases, it would be more proper to subclass something like + `collections.UserDict` or `collections.Mapping` rather than `dict` directly. + Unfortunately, python's json encoder does not support `Mapping`s. This is + important functionality to retain, since we are dealing with serialization. + + We should be safe to subclass `dict` here, since we aren't actually + overriding any core methods, only augmenting with a new one for reference + counting. + """ + + def __init__(self, base_config, object_id, **kwargs): + self.ref_count = 1 + self.object_id = object_id + super().__init__(base_config, **kwargs) + + def increment_ref_count(self): + # As soon as we've seen the object more than once, we want to attach the + # shared object ID. This allows us to only attach the shared object ID + # when it's strictly necessary, making backwards compatibility breakage + # less likely. + if self.ref_count == 1: + self[SHARED_OBJECT_KEY] = self.object_id + self.ref_count += 1 + + +class SharedObjectSavingScope: + """Keeps track of shared object configs when serializing.""" + + def __enter__(self): + if _shared_object_disabled(): + return None + + global SHARED_OBJECT_SAVING + + # Serialization can happen at a number of layers for a number of + # reasons. We may end up with a case where we're opening a saving scope + # within another saving scope. In that case, we'd like to use the + # outermost scope available and ignore inner scopes, since there is not + # (yet) a reasonable use case for having these nested and distinct. + if _shared_object_saving_scope() is not None: + self._passthrough = True + return _shared_object_saving_scope() + else: + self._passthrough = False + + SHARED_OBJECT_SAVING.scope = self + self._shared_objects_config = weakref.WeakKeyDictionary() + self._next_id = 0 + return self + + def get_config(self, obj): + """Gets a `SharedObjectConfig` if one has already been seen for `obj`. + + Args: + obj: The object for which to retrieve the `SharedObjectConfig`. + + Returns: + The SharedObjectConfig for a given object, if already seen. Else, + `None`. + """ + try: + shared_object_config = self._shared_objects_config[obj] + except (TypeError, KeyError): + # If the object is unhashable (e.g. 
a subclass of + # `AbstractBaseClass` that has not overridden `__hash__`), a + # `TypeError` will be thrown. We'll just continue on without shared + # object support. + return None + shared_object_config.increment_ref_count() + return shared_object_config + + def create_config(self, base_config, obj): + """Create a new SharedObjectConfig for a given object.""" + shared_object_config = SharedObjectConfig(base_config, self._next_id) + self._next_id += 1 + try: + self._shared_objects_config[obj] = shared_object_config + except TypeError: + # If the object is unhashable (e.g. a subclass of + # `AbstractBaseClass` that has not overridden `__hash__`), a + # `TypeError` will be thrown. We'll just continue on without shared + # object support. + pass + return shared_object_config + + def __exit__(self, *args, **kwargs): + if not getattr(self, "_passthrough", False): + global SHARED_OBJECT_SAVING + SHARED_OBJECT_SAVING.scope = None + + +def serialize_keras_class_and_config( + cls_name, cls_config, obj=None, shared_object_id=None +): + """Returns the serialization of the class with the given config.""" + base_config = {"class_name": cls_name, "config": cls_config} + + # We call `serialize_keras_class_and_config` for some branches of the load + # path. In that case, we may already have a shared object ID we'd like to + # retain. + if shared_object_id is not None: + base_config[SHARED_OBJECT_KEY] = shared_object_id + + # If we have an active `SharedObjectSavingScope`, check whether we've + # already serialized this config. If so, just use that config. This will + # store an extra ID field in the config, allowing us to re-create the shared + # object relationship at load time. + if _shared_object_saving_scope() is not None and obj is not None: + shared_object_config = _shared_object_saving_scope().get_config(obj) + if shared_object_config is None: + return _shared_object_saving_scope().create_config(base_config, obj) + return shared_object_config + + return base_config + + +@tf_contextlib.contextmanager +def skip_failed_serialization(): + global _SKIP_FAILED_SERIALIZATION + prev = _SKIP_FAILED_SERIALIZATION + try: + _SKIP_FAILED_SERIALIZATION = True + yield + finally: + _SKIP_FAILED_SERIALIZATION = prev + + +@keras_export("keras.utils.legacy.serialize_keras_object") +def serialize_keras_object(instance): + """Serialize a Keras object into a JSON-compatible representation. + + Calls to `serialize_keras_object` while underneath the + `SharedObjectSavingScope` context manager will cause any objects re-used + across multiple layers to be saved with a special shared object ID. This + allows the network to be re-created properly during deserialization. + + Args: + instance: The object to serialize. + + Returns: + A dict-like, JSON-compatible representation of the object's config. + """ + from keras.saving import object_registration + + _, instance = tf.__internal__.decorator.unwrap(instance) + if instance is None: + return None + + if hasattr(instance, "get_config"): + name = object_registration.get_registered_name(instance.__class__) + try: + config = instance.get_config() + except NotImplementedError as e: + if _SKIP_FAILED_SERIALIZATION: + return serialize_keras_class_and_config( + name, {_LAYER_UNDEFINED_CONFIG_KEY: True} + ) + raise e + serialization_config = {} + for key, item in config.items(): + if isinstance(item, str): + serialization_config[key] = item + continue + + # Any object of a different type needs to be converted to string or + # dict for serialization (e.g. 
custom functions, custom classes) + try: + serialized_item = serialize_keras_object(item) + if isinstance(serialized_item, dict) and not isinstance( + item, dict + ): + serialized_item["__passive_serialization__"] = True + serialization_config[key] = serialized_item + except ValueError: + serialization_config[key] = item + + name = object_registration.get_registered_name(instance.__class__) + return serialize_keras_class_and_config( + name, serialization_config, instance + ) + if hasattr(instance, "__name__"): + return object_registration.get_registered_name(instance) + raise ValueError( + f"Cannot serialize {instance} because it doesn't implement " + "`get_config()`." + ) + + +def class_and_config_for_serialized_keras_object( + config, + module_objects=None, + custom_objects=None, + printable_module_name="object", +): + """Returns the class name and config for a serialized Keras object.""" + from keras.saving import object_registration + + if ( + not isinstance(config, dict) + or "class_name" not in config + or "config" not in config + ): + raise ValueError( + f"Improper config format for {config}. " + "Expecting a Python dict that contains `class_name` and `config` " + "as keys." + ) + + class_name = config["class_name"] + cls = object_registration.get_registered_object( + class_name, custom_objects, module_objects + ) + if cls is None: + raise ValueError( + f"Unknown {printable_module_name}: '{class_name}'. " + "Please ensure you are using a `keras.utils.custom_object_scope` " + "and that this object is included in the scope. See " + "https://www.tensorflow.org/guide/keras/save_and_serialize" + "#registering_the_custom_object for details." + ) + + cls_config = config["config"] + # Check if `cls_config` is a list. If it is a list, return the class and the + # associated class configs for recursive deserialization. This case will + # happen with old versions of the Sequential model (e.g. `keras_version` == + # "2.0.6"), which is serialized in a different structure, for example + # "{'class_name': 'Sequential', + # 'config': [{'class_name': 'Embedding', 'config': ...}, {}, ...]}". + if isinstance(cls_config, list): + return (cls, cls_config) + + deserialized_objects = {} + for key, item in cls_config.items(): + if key == "name": + # Assume that the value of 'name' is a string that should not be + # deserialized as a function. This avoids the corner case where + # cls_config['name'] has an identical name to a custom function and + # gets converted into that function. + deserialized_objects[key] = item + elif isinstance(item, dict) and "__passive_serialization__" in item: + deserialized_objects[key] = deserialize_keras_object( + item, + module_objects=module_objects, + custom_objects=custom_objects, + printable_module_name="config_item", + ) + # TODO(momernick): Should this also have 'module_objects'? + elif isinstance(item, str) and tf_inspect.isfunction( + object_registration.get_registered_object(item, custom_objects) + ): + # Handle custom functions here. When saving functions, we only save + # the function's name as a string. If we find a matching string in + # the custom objects during deserialization, we convert the string + # back to the original function. + # Note that a potential issue is that a string field could have a + # naming conflict with a custom function name, but this should be a + # rare case. This issue does not occur if a string field has a + # naming conflict with a custom object, since the config of an + # object will always be a dict.
+ deserialized_objects[ + key + ] = object_registration.get_registered_object(item, custom_objects) + for key, item in deserialized_objects.items(): + cls_config[key] = deserialized_objects[key] + + return (cls, cls_config) + + +@keras_export("keras.utils.legacy.deserialize_keras_object") +def deserialize_keras_object( + identifier, + module_objects=None, + custom_objects=None, + printable_module_name="object", +): + """Turns the serialized form of a Keras object back into an actual object. + + This function is for mid-level library implementers rather than end users. + + Importantly, this utility requires you to provide the dict of + `module_objects` to use for looking up the object config; this is not + populated by default. If you need a deserialization utility that has + preexisting knowledge of built-in Keras objects, use e.g. + `keras.layers.deserialize(config)`, `keras.metrics.deserialize(config)`, + etc. + + Calling `deserialize_keras_object` while underneath the + `SharedObjectLoadingScope` context manager will cause any already-seen + shared objects to be returned as-is rather than creating a new object. + + Args: + identifier: the serialized form of the object. + module_objects: A dictionary of built-in objects to look the name up in. + Generally, `module_objects` is provided by mid-level library + implementers. + custom_objects: A dictionary of custom objects to look the name up in. + Generally, `custom_objects` is provided by the end user. + printable_module_name: A human-readable string representing the type of + the object. Printed in case of exception. + + Returns: + The deserialized object. + + Example: + + A mid-level library implementer might want to implement a utility for + retrieving an object from its config, as such: + + ```python + def deserialize(config, custom_objects=None): + return deserialize_keras_object( + config, + module_objects=globals(), + custom_objects=custom_objects, + printable_module_name="MyObjectType", + ) + ``` + + This is how e.g. `keras.layers.deserialize()` is implemented. + """ + from keras.saving import object_registration + + if identifier is None: + return None + + if isinstance(identifier, dict): + # In this case we are dealing with a Keras config dictionary. + config = identifier + (cls, cls_config) = class_and_config_for_serialized_keras_object( + config, module_objects, custom_objects, printable_module_name + ) + + # If this object has already been loaded (i.e. it's shared between + # multiple objects), return the already-loaded object. + shared_object_id = config.get(SHARED_OBJECT_KEY) + shared_object = _shared_object_loading_scope().get(shared_object_id) + if shared_object is not None: + return shared_object + + if hasattr(cls, "from_config"): + arg_spec = tf_inspect.getfullargspec(cls.from_config) + custom_objects = custom_objects or {} + + if "custom_objects" in arg_spec.args: + tlco = object_registration._THREAD_LOCAL_CUSTOM_OBJECTS.__dict__ + deserialized_obj = cls.from_config( + cls_config, + custom_objects={ + **object_registration._GLOBAL_CUSTOM_OBJECTS, + **tlco, + **custom_objects, + }, + ) + else: + with object_registration.CustomObjectScope(custom_objects): + deserialized_obj = cls.from_config(cls_config) + else: + # Then `cls` may be a function returning a class. + # In this case, by convention, `config` holds + # the kwargs of the function.
+ custom_objects = custom_objects or {} + with object_registration.CustomObjectScope(custom_objects): + deserialized_obj = cls(**cls_config) + + # Add object to shared objects, in case we find it referenced again. + _shared_object_loading_scope().set(shared_object_id, deserialized_obj) + + return deserialized_obj + + elif isinstance(identifier, str): + object_name = identifier + if custom_objects and object_name in custom_objects: + obj = custom_objects.get(object_name) + elif ( + object_name + in object_registration._THREAD_LOCAL_CUSTOM_OBJECTS.__dict__ + ): + obj = object_registration._THREAD_LOCAL_CUSTOM_OBJECTS.__dict__[ + object_name + ] + elif object_name in object_registration._GLOBAL_CUSTOM_OBJECTS: + obj = object_registration._GLOBAL_CUSTOM_OBJECTS[object_name] + else: + obj = module_objects.get(object_name) + if obj is None: + raise ValueError( + f"Unknown {printable_module_name}: '{object_name}'. " + "Please ensure you are using a " + "`keras.utils.custom_object_scope` " + "and that this object is included in the scope. See " + "https://www.tensorflow.org/guide/keras/save_and_serialize" + "#registering_the_custom_object for details." + ) + + # Classes passed by name are instantiated with no args, functions are + # returned as-is. + if tf_inspect.isclass(obj): + return obj() + return obj + elif tf_inspect.isfunction(identifier): + # If a function has already been deserialized, return as is. + return identifier + else: + raise ValueError( + "Could not interpret serialized " + f"{printable_module_name}: {identifier}" + ) + + +def validate_config(config): + """Determines whether config appears to be a valid layer config.""" + return ( + isinstance(config, dict) and _LAYER_UNDEFINED_CONFIG_KEY not in config + ) + + +def is_default(method): + """Check if a method is decorated with the `default` wrapper.""" + return getattr(method, "_is_default", False) diff --git a/keras/saving/losses_serialization_test.py b/keras/saving/losses_serialization_test.py deleted file mode 100644 index 354e67bf735d..000000000000 --- a/keras/saving/losses_serialization_test.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
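For the shared-object machinery above, the saving and loading scopes round-trip roughly as follows; a sketch (the regularizer is shared deliberately, and internal names like `SHARED_OBJECT_KEY` are taken from the module above):

```python
import tensorflow as tf
from keras.saving.legacy import serialization

reg = tf.keras.regularizers.L2(0.01)  # one instance, referenced twice

with serialization.SharedObjectSavingScope():
    cfg_a = serialization.serialize_keras_object(reg)
    cfg_b = serialization.serialize_keras_object(reg)

# The second serialization returns the same SharedObjectConfig and attaches
# the shared object id, since the object was seen more than once.
assert cfg_a is cfg_b
assert serialization.SHARED_OBJECT_KEY in cfg_b

with serialization.SharedObjectLoadingScope():
    reg_a = serialization.deserialize_keras_object(
        cfg_a, module_objects={"L2": tf.keras.regularizers.L2}
    )
    reg_b = serialization.deserialize_keras_object(
        cfg_b, module_objects={"L2": tf.keras.regularizers.L2}
    )
assert reg_a is reg_b  # restored once, then re-used
```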
-# ============================================================================== -"""Tests for Keras losses serialization.""" - -import tensorflow.compat.v2 as tf - -import os -import shutil - -from absl.testing import parameterized -import numpy as np - -import keras -from keras.testing_infra import test_combinations -from keras import layers -from keras import losses -from keras.optimizers import optimizer_v2 -from keras.testing_infra import test_utils -from keras.utils import generic_utils -from keras.utils import losses_utils - -try: - import h5py # pylint:disable=g-import-not-at-top -except ImportError: - h5py = None - - -# Custom loss class -class MyMeanAbsoluteError(losses.LossFunctionWrapper): - - def __init__(self, - reduction=losses_utils.ReductionV2.AUTO, - name='mean_absolute_error'): - super().__init__( - my_mae, name=name, reduction=reduction) - - -# Custom loss function -def my_mae(y_true, y_pred): - return keras.backend.mean(tf.abs(y_pred - y_true), axis=-1) - - -def _get_multi_io_model(): - inp_1 = layers.Input(shape=(1,), name='input_1') - inp_2 = layers.Input(shape=(1,), name='input_2') - d = test_utils.Bias(name='output') - out_1 = d(inp_1) - out_2 = d(inp_2) - return keras.Model([inp_1, inp_2], [out_1, out_2]) - - -@test_combinations.run_all_keras_modes -@parameterized.named_parameters([ - dict(testcase_name='string', value='mae'), - dict(testcase_name='built_in_fn', value=losses.mae), - dict(testcase_name='built_in_class', value=losses.MeanAbsoluteError()), - dict(testcase_name='custom_fn', value=my_mae), - dict(testcase_name='custom_class', value=MyMeanAbsoluteError()), - dict(testcase_name='list_of_strings', value=['mae', 'mae']), - dict(testcase_name='list_of_built_in_fns', value=[losses.mae, losses.mae]), - dict( - testcase_name='list_of_built_in_classes', - value=[losses.MeanAbsoluteError(), - losses.MeanAbsoluteError()]), - dict(testcase_name='list_of_custom_fns', value=[my_mae, my_mae]), - dict( - testcase_name='list_of_custom_classes', - value=[MyMeanAbsoluteError(), - MyMeanAbsoluteError()]), - dict( - testcase_name='dict_of_string', - value={ - 'output': 'mae', - 'output_1': 'mae', - }), - dict( - testcase_name='dict_of_built_in_fn', - value={ - 'output': losses.mae, - 'output_1': losses.mae, - }), - dict( - testcase_name='dict_of_built_in_class', - value={ - 'output': losses.MeanAbsoluteError(), - 'output_1': losses.MeanAbsoluteError(), - }), - dict( - testcase_name='dict_of_custom_fn', - value={ - 'output': my_mae, - 'output_1': my_mae - }), - dict( - testcase_name='dict_of_custom_class', - value={ - 'output': MyMeanAbsoluteError(), - 'output_1': MyMeanAbsoluteError(), - }), -]) -class LossesSerialization(test_combinations.TestCase): - - def setUp(self): - super(LossesSerialization, self).setUp() - tmpdir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, tmpdir) - self.model_filename = os.path.join(tmpdir, 'tmp_model_loss.h5') - self.x = np.array([[0.], [1.], [2.]], dtype='float32') - self.y = np.array([[0.5], [2.], [3.5]], dtype='float32') - self.w = np.array([1.25, 0.5, 1.25], dtype='float32') - - def test_serializing_model_with_loss_with_custom_object_scope(self, value): - with generic_utils.custom_object_scope({ - 'MyMeanAbsoluteError': MyMeanAbsoluteError, - 'my_mae': my_mae, - 'Bias': test_utils.Bias, - }): - model = _get_multi_io_model() - model.compile( - optimizer_v2.gradient_descent.SGD(0.1), - loss=value, - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit([self.x, self.x], [self.y, self.y], - batch_size=3, - epochs=3, - 
sample_weight=[self.w, self.w]) - - # Assert training. - self.assertAllClose(history.history['loss'], [2., 1.6, 1.2], 1e-3) - eval_results = model.evaluate([self.x, self.x], [self.y, self.y], - sample_weight=[self.w, self.w]) - - if h5py is None: - return - model.save(self.model_filename) - loaded_model = keras.models.load_model(self.model_filename) - loaded_model.predict([self.x, self.x]) - loaded_eval_results = loaded_model.evaluate( - [self.x, self.x], [self.y, self.y], sample_weight=[self.w, self.w]) - - # Assert all evaluation results are the same. - self.assertAllClose(eval_results, loaded_eval_results, 1e-9) - - def test_serializing_model_with_loss_with_custom_objects(self, value): - model = _get_multi_io_model() - model.compile( - optimizer_v2.gradient_descent.SGD(0.1), - loss=value, - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit([self.x, self.x], [self.y, self.y], - batch_size=3, - epochs=3, - sample_weight=[self.w, self.w]) - - # Assert training. - self.assertAllClose(history.history['loss'], [2., 1.6, 1.2], 1e-3) - eval_results = model.evaluate([self.x, self.x], [self.y, self.y], - sample_weight=[self.w, self.w]) - - if h5py is None: - return - model.save(self.model_filename) - loaded_model = keras.models.load_model( - self.model_filename, - custom_objects={ - 'MyMeanAbsoluteError': MyMeanAbsoluteError, - 'my_mae': my_mae, - 'Bias': test_utils.Bias, - }) - loaded_model.predict([self.x, self.x]) - loaded_eval_results = loaded_model.evaluate([self.x, self.x], - [self.y, self.y], - sample_weight=[self.w, self.w]) - - # Assert all evaluation results are the same. - self.assertAllClose(eval_results, loaded_eval_results, 1e-9) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/metrics_serialization_test.py b/keras/saving/metrics_serialization_test.py deleted file mode 100644 index abbe99d122f9..000000000000 --- a/keras/saving/metrics_serialization_test.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
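The deleted losses test above and the metrics test that follows both exercise the same save/load round trip for custom objects; for reference, the pattern is (names and path illustrative):

```python
import numpy as np
import tensorflow as tf
import keras


def my_mae(y_true, y_pred):  # custom loss, stored by name in the config
    return keras.backend.mean(tf.abs(y_pred - y_true), axis=-1)


inp = keras.Input(shape=(1,))
model = keras.Model(inp, keras.layers.Dense(1)(inp))
model.compile("sgd", loss=my_mae)
model.fit(np.zeros((4, 1)), np.zeros((4, 1)), verbose=0)
model.save("/tmp/custom_loss_model.h5")

# Without the mapping below, load_model cannot resolve the "my_mae" string.
with keras.utils.custom_object_scope({"my_mae": my_mae}):
    restored = keras.models.load_model("/tmp/custom_loss_model.h5")
```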
-# ============================================================================== -"""Tests for Keras metrics serialization.""" - -import tensorflow.compat.v2 as tf - -import os -import shutil - -from absl.testing import parameterized -import numpy as np - -import keras -from keras.testing_infra import test_combinations -from keras import layers -from keras import metrics -from keras.optimizers import optimizer_v2 -from keras.testing_infra import test_utils -from keras.utils import generic_utils - -try: - import h5py # pylint:disable=g-import-not-at-top -except ImportError: - h5py = None - - -# Custom metric -class MyMeanAbsoluteError(metrics.MeanMetricWrapper): - - def __init__(self, name='my_mae', dtype=None): - super().__init__(_my_mae, name, dtype=dtype) - - -# Custom metric function -def _my_mae(y_true, y_pred): - return keras.backend.mean(tf.abs(y_pred - y_true), axis=-1) - - -def _get_multi_io_model(): - inp_1 = layers.Input(shape=(1,), name='input_1') - inp_2 = layers.Input(shape=(1,), name='input_2') - d = test_utils.Bias(name='output') - out_1 = d(inp_1) - out_2 = d(inp_2) - return keras.Model([inp_1, inp_2], [out_1, out_2]) - - -@test_combinations.run_all_keras_modes -@parameterized.named_parameters( - dict(testcase_name='string', value=['mae']), - dict(testcase_name='built_in_fn', value=[metrics.mae]), - dict(testcase_name='built_in_class', value=[metrics.MeanAbsoluteError]), - dict(testcase_name='custom_fn', value=[_my_mae]), - dict(testcase_name='custom_class', value=[MyMeanAbsoluteError]), - dict( - testcase_name='list_of_built_in_fn_and_list', - value=[metrics.mae, [metrics.mae]]), - dict( - testcase_name='list_of_built_in_class_and_list', - value=[metrics.MeanAbsoluteError, [metrics.MeanAbsoluteError]]), - dict( - testcase_name='list_of_custom_fn_and_list', value=[_my_mae, [_my_mae]]), - dict( - testcase_name='list_of_custom_class_and_list', - value=[MyMeanAbsoluteError, [MyMeanAbsoluteError]]), - dict( - testcase_name='list_of_lists_of_custom_fns', - value=[[_my_mae], [_my_mae, 'mae']]), - dict( - testcase_name='list_of_lists_of_custom_classes', - value=[[MyMeanAbsoluteError], [MyMeanAbsoluteError, 'mae']]), - dict( - testcase_name='dict_of_list_of_string', - value={ - 'output': ['mae'], - 'output_1': ['mae'], - }), - dict( - testcase_name='dict_of_list_of_built_in_fn', - value={ - 'output': [metrics.mae], - 'output_1': [metrics.mae], - }), - dict( - testcase_name='dict_of_list_of_built_in_class', - value={ - 'output': [metrics.MeanAbsoluteError], - 'output_1': [metrics.MeanAbsoluteError], - }), - dict( - testcase_name='dict_of_list_of_custom_fn', - value={ - 'output': [_my_mae], - 'output_1': [_my_mae], - }), - dict( - testcase_name='dict_of_list_of_custom_class', - value={ - 'output': [MyMeanAbsoluteError], - 'output_1': [MyMeanAbsoluteError], - }), - dict( - testcase_name='dict_of_string', - value={ - 'output': 'mae', - 'output_1': 'mae', - }), - dict( - testcase_name='dict_of_built_in_fn', - value={ - 'output': metrics.mae, - 'output_1': metrics.mae, - }), - dict( - testcase_name='dict_of_built_in_class', - value={ - 'output': metrics.MeanAbsoluteError, - 'output_1': metrics.MeanAbsoluteError, - }), - dict( - testcase_name='dict_of_custom_fn', - value={ - 'output': _my_mae, - 'output_1': _my_mae - }), - dict( - testcase_name='dict_of_custom_class', - value={ - 'output': MyMeanAbsoluteError, - 'output_1': MyMeanAbsoluteError, - }), -) -class MetricsSerialization(test_combinations.TestCase): - - def setUp(self): - super(MetricsSerialization, self).setUp() - tmpdir = 
self.get_temp_dir() - self.addCleanup(shutil.rmtree, tmpdir) - self.model_filename = os.path.join(tmpdir, 'tmp_model_metric.h5') - self.x = np.array([[0.], [1.], [2.]], dtype='float32') - self.y = np.array([[0.5], [2.], [3.5]], dtype='float32') - self.w = np.array([1.25, 0.5, 1.25], dtype='float32') - - def test_serializing_model_with_metric_with_custom_object_scope(self, value): - - def get_instance(x): - if isinstance(x, str): - return x - if isinstance(x, type) and issubclass(x, metrics.Metric): - return x() - return x - - metric_input = tf.nest.map_structure(get_instance, value) - weighted_metric_input = tf.nest.map_structure(get_instance, value) - - with generic_utils.custom_object_scope({ - 'MyMeanAbsoluteError': MyMeanAbsoluteError, - '_my_mae': _my_mae, - 'Bias': test_utils.Bias, - }): - model = _get_multi_io_model() - model.compile( - optimizer_v2.gradient_descent.SGD(0.1), - 'mae', - metrics=metric_input, - weighted_metrics=weighted_metric_input, - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit([self.x, self.x], [self.y, self.y], - batch_size=3, - epochs=3, - sample_weight=[self.w, self.w]) - - # Assert training. - self.assertAllClose(history.history['loss'], [2., 1.6, 1.2], 1e-3) - eval_results = model.evaluate([self.x, self.x], [self.y, self.y], - sample_weight=[self.w, self.w]) - - if h5py is None: - return - model.save(self.model_filename) - loaded_model = keras.models.load_model(self.model_filename) - loaded_model.predict([self.x, self.x]) - loaded_eval_results = loaded_model.evaluate( - [self.x, self.x], [self.y, self.y], sample_weight=[self.w, self.w]) - - # Assert all evaluation results are the same. - self.assertAllClose(eval_results, loaded_eval_results, 1e-9) - - def test_serializing_model_with_metric_with_custom_objects(self, value): - - def get_instance(x): - if isinstance(x, str): - return x - if isinstance(x, type) and issubclass(x, metrics.Metric): - return x() - return x - - metric_input = tf.nest.map_structure(get_instance, value) - weighted_metric_input = tf.nest.map_structure(get_instance, value) - - model = _get_multi_io_model() - model.compile( - optimizer_v2.gradient_descent.SGD(0.1), - 'mae', - metrics=metric_input, - weighted_metrics=weighted_metric_input, - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit([self.x, self.x], [self.y, self.y], - batch_size=3, - epochs=3, - sample_weight=[self.w, self.w]) - - # Assert training. - self.assertAllClose(history.history['loss'], [2., 1.6, 1.2], 1e-3) - eval_results = model.evaluate([self.x, self.x], [self.y, self.y], - sample_weight=[self.w, self.w]) - - if h5py is None: - return - model.save(self.model_filename) - loaded_model = keras.models.load_model( - self.model_filename, - custom_objects={ - 'MyMeanAbsoluteError': MyMeanAbsoluteError, - '_my_mae': _my_mae, - 'Bias': test_utils.Bias, - }) - loaded_model.predict([self.x, self.x]) - loaded_eval_results = loaded_model.evaluate([self.x, self.x], - [self.y, self.y], - sample_weight=[self.w, self.w]) - - # Assert all evaluation results are the same. - self.assertAllClose(eval_results, loaded_eval_results, 1e-9) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/model_config.py b/keras/saving/model_config.py deleted file mode 100644 index c0590cce79b0..000000000000 --- a/keras/saving/model_config.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=protected-access -"""Functions that save the model's config into different formats.""" - -from tensorflow.python.util.tf_export import keras_export - - -@keras_export('keras.models.model_from_config') -def model_from_config(config, custom_objects=None): - """Instantiates a Keras model from its config. - - Usage: - ``` - # for a Functional API model - tf.keras.Model().from_config(model.get_config()) - - # for a Sequential model - tf.keras.Sequential().from_config(model.get_config()) - ``` - - Args: - config: Configuration dictionary. - custom_objects: Optional dictionary mapping names - (strings) to custom classes or functions to be - considered during deserialization. - - Returns: - A Keras model instance (uncompiled). - - Raises: - TypeError: if `config` is not a dictionary. - """ - if isinstance(config, list): - raise TypeError('`model_from_config` expects a dictionary, not a list. ' - f'Received: config={config}. Did you meant to use ' - '`Sequential.from_config(config)`?') - from keras.layers import deserialize # pylint: disable=g-import-not-at-top - return deserialize(config, custom_objects=custom_objects) - - -@keras_export('keras.models.model_from_yaml') -def model_from_yaml(yaml_string, custom_objects=None): - """Parses a yaml model configuration file and returns a model instance. - - Note: Since TF 2.6, this method is no longer supported and will raise a - RuntimeError. - - Args: - yaml_string: YAML string or open file encoding a model configuration. - custom_objects: Optional dictionary mapping names - (strings) to custom classes or functions to be - considered during deserialization. - - Returns: - A Keras model instance (uncompiled). - - Raises: - RuntimeError: announces that the method poses a security risk - """ - raise RuntimeError( - 'Method `model_from_yaml()` has been removed due to security risk of ' - 'arbitrary code execution. Please use `Model.to_json()` and ' - '`model_from_json()` instead.' - ) - - -@keras_export('keras.models.model_from_json') -def model_from_json(json_string, custom_objects=None): - """Parses a JSON model configuration string and returns a model instance. - - Usage: - - >>> model = tf.keras.Sequential([ - ... tf.keras.layers.Dense(5, input_shape=(3,)), - ... tf.keras.layers.Softmax()]) - >>> config = model.to_json() - >>> loaded_model = tf.keras.models.model_from_json(config) - - Args: - json_string: JSON string encoding a model configuration. - custom_objects: Optional dictionary mapping names - (strings) to custom classes or functions to be - considered during deserialization. - - Returns: - A Keras model instance (uncompiled). 
- """ - from keras.layers import deserialize_from_json # pylint: disable=g-import-not-at-top - return deserialize_from_json(json_string, custom_objects=custom_objects) diff --git a/keras/saving/object_registration.py b/keras/saving/object_registration.py new file mode 100644 index 000000000000..a64b21f3313f --- /dev/null +++ b/keras/saving/object_registration.py @@ -0,0 +1,226 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python utilities required by Keras.""" + +import inspect +import threading + +# isort: off +from tensorflow.python.util.tf_export import keras_export + +_GLOBAL_CUSTOM_OBJECTS = {} +_GLOBAL_CUSTOM_NAMES = {} +# Thread-local custom objects set by custom_object_scope. +_THREAD_LOCAL_CUSTOM_OBJECTS = threading.local() + + +@keras_export( + "keras.saving.custom_object_scope", + "keras.utils.custom_object_scope", + "keras.utils.CustomObjectScope", +) +class CustomObjectScope: + """Exposes custom classes/functions to Keras deserialization internals. + + Under a scope `with custom_object_scope(objects_dict)`, Keras methods such + as `tf.keras.models.load_model` or `tf.keras.models.model_from_config` + will be able to deserialize any custom object referenced by a + saved config (e.g. a custom layer or metric). + + Example: + + Consider a custom regularizer `my_regularizer`: + + ```python + layer = Dense(3, kernel_regularizer=my_regularizer) + # Config contains a reference to `my_regularizer` + config = layer.get_config() + ... + # Later: + with custom_object_scope({'my_regularizer': my_regularizer}): + layer = Dense.from_config(config) + ``` + + Args: + *args: Dictionary or dictionaries of `{name: object}` pairs. + """ + + def __init__(self, *args): + self.custom_objects = args + self.backup = None + + def __enter__(self): + self.backup = _THREAD_LOCAL_CUSTOM_OBJECTS.__dict__.copy() + for objects in self.custom_objects: + _THREAD_LOCAL_CUSTOM_OBJECTS.__dict__.update(objects) + return self + + def __exit__(self, *args, **kwargs): + _THREAD_LOCAL_CUSTOM_OBJECTS.__dict__.clear() + _THREAD_LOCAL_CUSTOM_OBJECTS.__dict__.update(self.backup) + + +@keras_export( + "keras.saving.get_custom_objects", "keras.utils.get_custom_objects" +) +def get_custom_objects(): + """Retrieves a live reference to the global dictionary of custom objects. + + Custom objects set using using `custom_object_scope` are not added to the + global dictionary of custom objects, and will not appear in the returned + dictionary. + + Example: + + ```python + get_custom_objects().clear() + get_custom_objects()['MyObject'] = MyObject + ``` + + Returns: + Global dictionary mapping registered class names to classes. 
+ """ + return _GLOBAL_CUSTOM_OBJECTS + + +@keras_export( + "keras.saving.register_keras_serializable", + "keras.utils.register_keras_serializable", +) +def register_keras_serializable(package="Custom", name=None): + """Registers an object with the Keras serialization framework. + + This decorator injects the decorated class or function into the Keras custom + object dictionary, so that it can be serialized and deserialized without + needing an entry in the user-provided custom object dict. It also injects a + function that Keras will call to get the object's serializable string key. + + Note that to be serialized and deserialized, classes must implement the + `get_config()` method. Functions do not have this requirement. + + The object will be registered under the key 'package>name' where `name`, + defaults to the object name if not passed. + + Example: + + ```python + # Note that `'my_package'` is used as the `package` argument here, and since + # the `name` argument is not provided, `'MyDense'` is used as the `name`. + @keras.saving.register_keras_serializable('my_package') + class MyDense(keras.layers.Dense): + pass + + assert keras.saving.get_registered_object('my_package>MyDense') == MyDense + assert keras.saving.get_registered_name(MyDense) == 'my_package>MyDense' + ``` + + Args: + package: The package that this class belongs to. This is used for the + `key` (which is `"package>name"`) to idenfify the class. Note that this + is the first argument passed into the decorator. + name: The name to serialize this class under in this package. If not + provided or `None`, the class' name will be used (note that this is the + case when the decorator is used with only one argument, which becomes + the `package`). + + Returns: + A decorator that registers the decorated class with the passed names. + """ + + def decorator(arg): + """Registers a class with the Keras serialization framework.""" + class_name = name if name is not None else arg.__name__ + registered_name = package + ">" + class_name + + if inspect.isclass(arg) and not hasattr(arg, "get_config"): + raise ValueError( + "Cannot register a class that does not have a " + "get_config() method." + ) + + _GLOBAL_CUSTOM_OBJECTS[registered_name] = arg + _GLOBAL_CUSTOM_NAMES[arg] = registered_name + + return arg + + return decorator + + +@keras_export( + "keras.saving.get_registered_name", "keras.utils.get_registered_name" +) +def get_registered_name(obj): + """Returns the name registered to an object within the Keras framework. + + This function is part of the Keras serialization and deserialization + framework. It maps objects to the string names associated with those objects + for serialization/deserialization. + + Args: + obj: The object to look up. + + Returns: + The name associated with the object, or the default Python name if the + object is not registered. + """ + if obj in _GLOBAL_CUSTOM_NAMES: + return _GLOBAL_CUSTOM_NAMES[obj] + else: + return obj.__name__ + + +@keras_export( + "keras.saving.get_registered_object", "keras.utils.get_registered_object" +) +def get_registered_object(name, custom_objects=None, module_objects=None): + """Returns the class associated with `name` if it is registered with Keras. + + This function is part of the Keras serialization and deserialization + framework. It maps strings to the objects associated with them for + serialization/deserialization. 
+ + Example: + + ```python + def from_config(cls, config, custom_objects=None): + if 'my_custom_object_name' in config: + config['hidden_cls'] = tf.keras.saving.get_registered_object( + config['my_custom_object_name'], custom_objects=custom_objects) + ``` + + Args: + name: The name to look up. + custom_objects: A dictionary of custom objects to look the name up in. + Generally, custom_objects is provided by the user. + module_objects: A dictionary of custom objects to look the name up in. + Generally, module_objects is provided by midlevel library implementers. + + Returns: + An instantiable class associated with `name`, or `None` if no such class + exists. + """ + if name in _THREAD_LOCAL_CUSTOM_OBJECTS.__dict__: + return _THREAD_LOCAL_CUSTOM_OBJECTS.__dict__[name] + elif name in _GLOBAL_CUSTOM_OBJECTS: + return _GLOBAL_CUSTOM_OBJECTS[name] + elif custom_objects and name in custom_objects: + return custom_objects[name] + elif module_objects and name in module_objects: + return module_objects[name] + return None + + +# Aliases +custom_object_scope = CustomObjectScope diff --git a/keras/saving/object_registration_test.py b/keras/saving/object_registration_test.py new file mode 100644 index 000000000000..3b1a95ca57a7 --- /dev/null +++ b/keras/saving/object_registration_test.py @@ -0,0 +1,144 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
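
The body of `get_registered_object` above establishes a precedence order: an active `custom_object_scope` wins over the global registry, which in turn wins over the caller-supplied `custom_objects` and `module_objects` dicts. A small sketch against this module, with hypothetical names:

```python
from keras.saving import object_registration


def from_caller():  # hypothetical object passed via custom_objects
    pass


def from_scope():  # hypothetical object installed by a scope
    pass


# No scope active: the caller-supplied dict is consulted and wins here.
found = object_registration.get_registered_object(
    "demo>fn", custom_objects={"demo>fn": from_caller}
)
assert found is from_caller

# With a scope active, the thread-local entry shadows custom_objects.
with object_registration.custom_object_scope({"demo>fn": from_scope}):
    found = object_registration.get_registered_object(
        "demo>fn", custom_objects={"demo>fn": from_caller}
    )
    assert found is from_scope
```
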
+# ============================================================================== +"""Tests for Keras serializable object registration functionality.""" + +import tensorflow.compat.v2 as tf + +import keras +from keras.saving import object_registration +from keras.saving import serialization_lib + + +class TestObjectRegistration(tf.test.TestCase): + def test_custom_object_scope(self): + def custom_fn(): + pass + + class CustomClass: + pass + + def check_get_in_thread(): + with object_registration.custom_object_scope( + {"CustomClass": CustomClass, "custom_fn": custom_fn} + ): + actual_custom_fn = keras.activations.get("custom_fn") + self.assertEqual(actual_custom_fn, custom_fn) + actual_custom_class = keras.regularizers.get("CustomClass") + self.assertEqual(actual_custom_class.__class__, CustomClass) + + with object_registration.custom_object_scope( + {"CustomClass": CustomClass, "custom_fn": custom_fn} + ): + actual_custom_fn = keras.activations.get("custom_fn") + self.assertEqual(actual_custom_fn, custom_fn) + actual_custom_class = keras.regularizers.get("CustomClass") + self.assertEqual(actual_custom_class.__class__, CustomClass) + checked_thread = self.checkedThread(check_get_in_thread) + checked_thread.start() + checked_thread.join() + + def test_serialize_custom_class_with_default_name(self): + @object_registration.register_keras_serializable() + class TestClass: + def __init__(self, value): + self._value = value + + def get_config(self): + return {"value": self._value} + + serialized_name = "Custom>TestClass" + inst = TestClass(value=10) + class_name = object_registration._GLOBAL_CUSTOM_NAMES[TestClass] + self.assertEqual(serialized_name, class_name) + config = serialization_lib.serialize_keras_object(inst) + self.assertEqual(class_name, config["class_name"]) + new_inst = serialization_lib.deserialize_keras_object(config) + self.assertIsNot(inst, new_inst) + self.assertIsInstance(new_inst, TestClass) + self.assertEqual(10, new_inst._value) + + # Make sure registering a new class with same name will fail. 
+ with self.assertRaisesRegex( + ValueError, ".*has already been registered.*" + ): + + @object_registration.register_keras_serializable() + class TestClass: + def __init__(self, value): + self._value = value + + def get_config(self): + return {"value": self._value} + + def test_serialize_custom_class_with_custom_name(self): + @object_registration.register_keras_serializable( + "TestPackage", "CustomName" + ) + class OtherTestClass: + def __init__(self, val): + self._val = val + + def get_config(self): + return {"val": self._val} + + serialized_name = "TestPackage>CustomName" + inst = OtherTestClass(val=5) + class_name = object_registration._GLOBAL_CUSTOM_NAMES[OtherTestClass] + self.assertEqual(serialized_name, class_name) + fn_class_name = object_registration.get_registered_name(OtherTestClass) + self.assertEqual(fn_class_name, class_name) + + cls = object_registration.get_registered_object(fn_class_name) + self.assertEqual(OtherTestClass, cls) + + config = keras.utils.serialization.serialize_keras_object(inst) + self.assertEqual(class_name, config["class_name"]) + new_inst = keras.utils.serialization.deserialize_keras_object(config) + self.assertIsNot(inst, new_inst) + self.assertIsInstance(new_inst, OtherTestClass) + self.assertEqual(5, new_inst._val) + + def test_serialize_custom_function(self): + @object_registration.register_keras_serializable() + def my_fn(): + return 42 + + serialized_name = "Custom>my_fn" + class_name = object_registration._GLOBAL_CUSTOM_NAMES[my_fn] + self.assertEqual(serialized_name, class_name) + fn_class_name = object_registration.get_registered_name(my_fn) + self.assertEqual(fn_class_name, class_name) + + config = keras.utils.serialization.serialize_keras_object(my_fn) + self.assertEqual(class_name, config) + fn = keras.utils.serialization.deserialize_keras_object(config) + self.assertEqual(42, fn()) + + fn_2 = object_registration.get_registered_object(fn_class_name) + self.assertEqual(42, fn_2()) + + def test_serialize_custom_class_without_get_config_fails(self): + + with self.assertRaisesRegex( + ValueError, + "Cannot register a class that does not have a get_config.*", + ): + + @object_registration.register_keras_serializable( + "TestPackage", "TestClass" + ) + class TestClass: + def __init__(self, value): + self._value = value diff --git a/keras/saving/pickle_utils.py b/keras/saving/pickle_utils.py index 93931a92e481..fe84b548f154 100644 --- a/keras/saving/pickle_utils.py +++ b/keras/saving/pickle_utils.py @@ -13,69 +13,65 @@ # limitations under the License. # ============================================================================== """Saving utilities to support Python's Pickle protocol.""" -# pylint: disable=g-bad-import-order -import tensorflow.compat.v2 as tf - import os -import tarfile -import io -import uuid -import numpy +import tempfile + +import tensorflow.compat.v2 as tf -from keras.saving import save as save_module +from keras.saving import saving_lib def deserialize_model_from_bytecode(serialized_model): - """Reconstruct a Model from the output of `serialize_model_as_bytecode`. + """Reconstruct a Model from the output of `serialize_model_as_bytecode`. - Args: - serialized_model: (np.array) return value from - `serialize_model_as_bytecode`. + Args: + serialized_model: (bytes) return value from + `serialize_model_as_bytecode`. - Returns: - keras.Model: Keras Model instance. 
- """ - temp_dir = f"ram://{uuid.uuid4()}" - b = io.BytesIO(serialized_model) - with tarfile.open(fileobj=b, mode="r") as archive: - for name in archive.getnames(): - dest_path = tf.io.gfile.join(temp_dir, name) - member = archive.getmember(name) - tf.io.gfile.makedirs(os.path.dirname(dest_path)) - if member.isfile(): - with tf.io.gfile.GFile(dest_path, "wb") as f: - f.write(archive.extractfile(name).read()) - model = save_module.load_model(temp_dir) - tf.io.gfile.rmtree(temp_dir) - return model + Returns: + Keras Model instance. + """ + # Note: we don't use a RAM path for this because zipfile cannot write + # to such paths. + temp_dir = tempfile.mkdtemp() + try: + filepath = os.path.join(temp_dir, "model.keras") + with open(filepath, "wb") as f: + f.write(serialized_model) + # When loading, direct import will work for most custom objects + # though it will require get_config() to be implemented. + # Some custom objects (e.g. an activation in a Dense layer, + # serialized as a string by Dense.get_config()) will require + # a custom_object_scope. + model = saving_lib.load_model(filepath, safe_mode=False) + except Exception as e: + raise e + else: + return model + finally: + tf.io.gfile.rmtree(temp_dir) def serialize_model_as_bytecode(model): - """Convert a Keras Model into a bytecode representation for pickling. + """Convert a Keras Model into a bytecode representation for pickling. - Args: - model: (tf.keras.Model) Keras Model instance. + Args: + model: Keras Model instance. - Returns: - tuple: tuple of arguments that can be sent to - `deserialize_from_bytecode`. - """ - temp_dir = f"ram://{uuid.uuid4()}" - model.save(temp_dir) - b = io.BytesIO() - with tarfile.open(fileobj=b, mode="w") as archive: - for root, dirs, filenames in tf.io.gfile.walk(temp_dir): - for dirname in dirs: - dest_path = tf.io.gfile.join(root, dirname) - t = tarfile.TarInfo(dest_path) - t.type = tarfile.DIRTYPE - archive.addfile(t) - for filename in filenames: - dest_path = tf.io.gfile.join(root, filename) - with tf.io.gfile.GFile(dest_path, "rb") as f: - info = tarfile.TarInfo(name=os.path.relpath(dest_path, temp_dir)) - info.size = f.size() - archive.addfile(tarinfo=info, fileobj=f) - tf.io.gfile.rmtree(temp_dir) - b.seek(0) - return (numpy.asarray(memoryview(b.read())),) + Returns: + Tuple that can be read by `deserialize_from_bytecode`. + """ + # Note: we don't use a RAM path for this because zipfile cannot write + # to such paths. + temp_dir = tempfile.mkdtemp() + try: + filepath = os.path.join(temp_dir, "model.keras") + saving_lib.save_model(model, filepath) + with open(filepath, "rb") as f: + data = f.read() + except Exception as e: + raise e + else: + return data + finally: + tf.io.gfile.rmtree(temp_dir) diff --git a/keras/saving/pickle_utils_test.py b/keras/saving/pickle_utils_test.py index c4f06d39b37b..0d487ea8422f 100644 --- a/keras/saving/pickle_utils_test.py +++ b/keras/saving/pickle_utils_test.py @@ -13,68 +13,86 @@ # limitations under the License. 
# ============================================================================== """Tests for pickling / deepcopying of Keras Models.""" -# pylint: disable=g-bad-import-order -import tensorflow.compat.v2 as tf - import copy import pickle + import numpy as np +import tensorflow.compat.v2 as tf from keras.testing_infra import test_combinations from keras.testing_infra import test_utils +@test_utils.run_v2_only class TestPickleProtocol(test_combinations.TestCase): - """Tests pickle protoocol support.""" + """Tests pickle protocol support.""" - @test_combinations.run_with_all_model_types - @test_combinations.parameterized.named_parameters( - ('copy', copy.copy), ('deepcopy', copy.deepcopy), - *((f'pickle_protocol_level_{protocol}', - lambda model: pickle.loads(pickle.dumps(model, protocol=protocol))) # pylint: disable=cell-var-from-loop - for protocol in range(pickle.HIGHEST_PROTOCOL + 1))) - def test_built_models(self, serializer): - """Built models should be copyable and picklable for all model types.""" - if not tf.__internal__.tf2.enabled(): - self.skipTest('pickle model only available in v2 when tf format is used.') - model = test_utils.get_small_mlp( - num_hidden=1, num_classes=2, input_dim=3) - model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy') + @test_combinations.run_with_all_model_types + @test_combinations.parameterized.named_parameters( + ("copy", copy.copy), + ("deepcopy", copy.deepcopy), + *( + ( + f"pickle_protocol_level_{protocol}", + lambda model: pickle.loads( + pickle.dumps(model, protocol=protocol) + ), + ) + for protocol in range(pickle.HIGHEST_PROTOCOL + 1) + ), + ) + def test_built_models(self, serializer): + """Built models should be copyable and pickleable for all model + types.""" + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "pickle model only available in v2 when tf format is used." 
+ ) + model = test_utils.get_small_mlp( + num_hidden=1, num_classes=2, input_dim=3 + ) + model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy") - # train - x = np.random.random(size=(1000, 3)) - y = np.random.randint(low=0, high=2, size=(1000,)) - model.fit(x, y) # builds model - y1 = model.predict(x) - # roundtrip with training - model = serializer(model) - y2 = model.predict(x) - # check that the predictions are the same - self.assertAllClose(y1, y2) - # and that we can continue training - model.fit(x, y) - y3 = model.predict(x) - # check that the predictions are the same - self.assertNotAllClose(y2, y3) + # train + x = np.random.random(size=(10, 3)) + y = np.random.randint(low=0, high=2, size=(10,)) + model.fit(x, y) # builds model + y1 = model.predict(x) + # roundtrip with training + model = serializer(model) + y2 = model.predict(x) + # check that the predictions are the same + self.assertAllClose(y1, y2) + # and that we can continue training + model.fit(x, y) + y3 = model.predict(x) + # check that the predictions are the same + self.assertNotAllClose(y2, y3) - @test_combinations.run_with_all_model_types - @test_combinations.parameterized.named_parameters( - ('copy', copy.copy), - ('deepcopy', copy.deepcopy), - ) - def test_unbuilt_models(self, serializer): - """Unbuilt models should be copyable & deepcopyable for all model types.""" - if not tf.__internal__.tf2.enabled(): - self.skipTest('pickle model only available in v2 when tf format is used.') - original_model = test_utils.get_small_mlp( - num_hidden=1, num_classes=2, input_dim=3) - # roundtrip without compiling or training - model = serializer(original_model) - # compile - model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy') - # roundtrip compiled but not trained - model = serializer(model) + @test_combinations.run_with_all_model_types + @test_combinations.parameterized.named_parameters( + ("copy", copy.copy), + ("deepcopy", copy.deepcopy), + ) + def test_unbuilt_models(self, serializer): + """Unbuilt models should be copyable & deepcopyable for all model + types.""" + if not tf.__internal__.tf2.enabled(): + self.skipTest( + "pickle model only available in v2 when tf format is used." + ) + original_model = test_utils.get_small_mlp( + num_hidden=1, num_classes=2, input_dim=3 + ) + # roundtrip without compiling or training + model = serializer(original_model) + # compile + model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy") + if hasattr(model.optimizer, "_distribution_strategy"): + model.optimizer._distribution_strategy = None + # roundtrip compiled but not trained + model = serializer(model) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/saving/save.py b/keras/saving/save.py deleted file mode 100644 index 270a6cdca8b4..000000000000 --- a/keras/saving/save.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
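
The tests above exercise `copy`, `deepcopy`, and every available pickle protocol; the essence of what they assert can be sketched standalone (shapes and sizes here are arbitrary):

```python
import copy

import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])
model.compile(optimizer="sgd", loss="mse")

x = np.random.random((8, 3)).astype("float32")
y_before = model.predict(x)

# deepcopy routes through the same pickle machinery under test.
clone = copy.deepcopy(model)
y_after = clone.predict(x)

# A faithful copy must reproduce the original predictions.
np.testing.assert_allclose(y_before, y_after, atol=1e-5)
```
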
-# ============================================================================== -"""Keras model saving code.""" - -import tensorflow.compat.v2 as tf -from keras.saving import hdf5_format -from keras.saving import saving_utils -from keras.saving.saved_model import load as saved_model_load -from keras.saving.saved_model import load_context -from keras.saving.saved_model import save as saved_model_save -from keras.utils import generic_utils -from keras.utils import traceback_utils -from keras.utils.io_utils import path_to_string -from tensorflow.python.util.tf_export import keras_export - -# pylint: disable=g-import-not-at-top -try: - import h5py -except ImportError: - h5py = None -# pylint: enable=g-import-not-at-top - - -@keras_export('keras.models.save_model') -@traceback_utils.filter_traceback -def save_model(model, - filepath, - overwrite=True, - include_optimizer=True, - save_format=None, - signatures=None, - options=None, - save_traces=True): - # pylint: disable=line-too-long - """Saves a model as a TensorFlow SavedModel or HDF5 file. - - See the [Serialization and Saving guide](https://keras.io/guides/serialization_and_saving/) - for details. - - Usage: - - >>> model = tf.keras.Sequential([ - ... tf.keras.layers.Dense(5, input_shape=(3,)), - ... tf.keras.layers.Softmax()]) - >>> model.save('/tmp/model') - >>> loaded_model = tf.keras.models.load_model('/tmp/model') - >>> x = tf.random.uniform((10, 3)) - >>> assert np.allclose(model.predict(x), loaded_model.predict(x)) - - Note that `model.save()` is an alias for `tf.keras.models.save_model()`. - - The SavedModel and HDF5 file contains: - - - the model's configuration (topology) - - the model's weights - - the model's optimizer's state (if any) - - Thus models can be reinstantiated in the exact same state, without any of the - code used for model definition or training. - - Note that the model weights may have different scoped names after being - loaded. Scoped names include the model/layer names, such as - `"dense_1/kernel:0"`. It is recommended that you use the layer properties to - access specific variables, e.g. `model.get_layer("dense_1").kernel`. - - __SavedModel serialization format__ - - Keras SavedModel uses `tf.saved_model.save` to save the model and all - trackable objects attached to the model (e.g. layers and variables). The model - config, weights, and optimizer are saved in the SavedModel. Additionally, for - every Keras layer attached to the model, the SavedModel stores: - - * the config and metadata -- e.g. name, dtype, trainable status - * traced call and loss functions, which are stored as TensorFlow subgraphs. - - The traced functions allow the SavedModel format to save and load custom - layers without the original class definition. - - You can choose to not save the traced functions by disabling the `save_traces` - option. This will decrease the time it takes to save the model and the - amount of disk space occupied by the output SavedModel. If you enable this - option, then you _must_ provide all custom class definitions when loading - the model. See the `custom_objects` argument in `tf.keras.models.load_model`. - - Args: - model: Keras model instance to be saved. - filepath: One of the following: - - String or `pathlib.Path` object, path where to save the model - - `h5py.File` object where to save the model - overwrite: Whether we should overwrite any existing model at the target - location, or instead ask the user with a manual prompt. - include_optimizer: If True, save optimizer's state together. 
- save_format: Either 'tf' or 'h5', indicating whether to save the model - to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, and 'h5' - in TF 1.X. - signatures: Signatures to save with the SavedModel. Applicable to the 'tf' - format only. Please see the `signatures` argument in - `tf.saved_model.save` for details. - options: (only applies to SavedModel format) `tf.saved_model.SaveOptions` - object that specifies options for saving to SavedModel. - save_traces: (only applies to SavedModel format) When enabled, the - SavedModel will store the function traces for each layer. This - can be disabled, so that only the configs of each layer are stored. - Defaults to `True`. Disabling this will decrease serialization time and - reduce file size, but it requires that all custom layers/models - implement a `get_config()` method. - - Raises: - ImportError: If save format is hdf5, and h5py is not available. - """ - # pylint: enable=line-too-long - from keras.engine import sequential # pylint: disable=g-import-not-at-top - - default_format = 'tf' if tf.__internal__.tf2.enabled() else 'h5' - save_format = save_format or default_format - - filepath = path_to_string(filepath) - - # If the user has not already called fit or built the underlying metrics, we - # should do that before saving to ensure the metric names have all - # appropriate name transformations applied. - saving_utils.try_build_compiled_arguments(model) - - if (save_format == 'h5' or - (h5py is not None and isinstance(filepath, h5py.File)) or - saving_utils.is_hdf5_filepath(filepath)): - # TODO(b/130258301): add utility method for detecting model type. - if (not model._is_graph_network and # pylint:disable=protected-access - not isinstance(model, sequential.Sequential)): - raise NotImplementedError( - 'Saving the model to HDF5 format requires the model to be a ' - 'Functional model or a Sequential model. It does not work for ' - 'subclassed models, because such models are defined via the body of ' - 'a Python method, which isn\'t safely serializable. Consider saving ' - 'to the Tensorflow SavedModel format (by setting save_format="tf") ' - 'or using `save_weights`.') - hdf5_format.save_model_to_hdf5( - model, filepath, overwrite, include_optimizer) - else: - with generic_utils.SharedObjectSavingScope(): - saved_model_save.save(model, filepath, overwrite, include_optimizer, - signatures, options, save_traces) - - -@keras_export('keras.models.load_model') -@traceback_utils.filter_traceback -def load_model(filepath, custom_objects=None, compile=True, options=None): # pylint: disable=redefined-builtin - """Loads a model saved via `model.save()`. - - Usage: - - >>> model = tf.keras.Sequential([ - ... tf.keras.layers.Dense(5, input_shape=(3,)), - ... tf.keras.layers.Softmax()]) - >>> model.save('/tmp/model') - >>> loaded_model = tf.keras.models.load_model('/tmp/model') - >>> x = tf.random.uniform((10, 3)) - >>> assert np.allclose(model.predict(x), loaded_model.predict(x)) - - Note that the model weights may have different scoped names after being - loaded. Scoped names include the model/layer names, such as - `"dense_1/kernel:0"`. It is recommended that you use the layer properties to - access specific variables, e.g. `model.get_layer("dense_1").kernel`. 
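
For reference, the behavior documented by the `save.py` being removed here: the format is chosen by `save_format` or inferred from the filepath, with a directory path producing a SavedModel and an `.h5` suffix producing a single HDF5 file. A sketch with invented paths:

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential(
    [tf.keras.layers.Dense(5, input_shape=(3,)), tf.keras.layers.Softmax()]
)

model.save("/tmp/model_tf")  # directory path -> TensorFlow SavedModel
model.save("/tmp/model.h5")  # .h5 suffix -> single HDF5 file

x = np.random.random((10, 3)).astype("float32")
for path in ("/tmp/model_tf", "/tmp/model.h5"):
    loaded = tf.keras.models.load_model(path)
    np.testing.assert_allclose(model.predict(x), loaded.predict(x), atol=1e-6)
```
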
- - Args: - filepath: One of the following: - - String or `pathlib.Path` object, path to the saved model - - `h5py.File` object from which to load the model - custom_objects: Optional dictionary mapping names - (strings) to custom classes or functions to be - considered during deserialization. - compile: Boolean, whether to compile the model - after loading. - options: Optional `tf.saved_model.LoadOptions` object that specifies - options for loading from SavedModel. - - Returns: - A Keras model instance. If the original model was compiled, and saved with - the optimizer, then the returned model will be compiled. Otherwise, the - model will be left uncompiled. In the case that an uncompiled model is - returned, a warning is displayed if the `compile` argument is set to - `True`. - - Raises: - ImportError: if loading from an hdf5 file and h5py is not available. - IOError: In case of an invalid savefile. - """ - with generic_utils.SharedObjectLoadingScope(): - with generic_utils.CustomObjectScope(custom_objects or {}): - with load_context.load_context(options): - filepath_str = path_to_string(filepath) - if isinstance(filepath_str, str): - if not tf.io.gfile.exists(filepath_str): - raise IOError(f'No file or directory found at {filepath_str}') - - if tf.io.gfile.isdir(filepath_str): - return saved_model_load.load(filepath_str, compile, options) - else: - if h5py is None: - raise ImportError( - 'Filepath looks like a hdf5 file but h5py is not available.' - f' filepath={filepath_str}') - return hdf5_format.load_model_from_hdf5( - tf.io.gfile.GFile(filepath_str, mode='rb'), custom_objects, - compile) - elif h5py is not None and isinstance(filepath, h5py.File): - return hdf5_format.load_model_from_hdf5(filepath, custom_objects, - compile) - - raise IOError( - 'Unable to load model. Filepath is not an hdf5 file (or h5py is not ' - f'available) or SavedModel. Received: filepath={filepath}') - -# Inject the load_model function to keras_deps to remove the dependency -# from TFLite to Keras. -tf.__internal__.register_load_model_function(load_model) diff --git a/keras/saving/save_test.py b/keras/saving/save_test.py deleted file mode 100644 index 27fde3a312f5..000000000000 --- a/keras/saving/save_test.py +++ /dev/null @@ -1,1385 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for Keras model saving code.""" - -import tensorflow.compat.v2 as tf - -import collections -import os -import pathlib -import shutil -import tempfile -import warnings - -from absl.testing import parameterized -import numpy as np - -import keras -from keras import losses -from keras.optimizers import optimizer_v1 -from keras import optimizers -from keras.engine import functional -from keras.engine import sequential -from keras.feature_column import dense_features -from keras.feature_column import sequence_feature_column as ksfc -from keras.layers import core -from keras.premade_models.linear import LinearModel -from keras.saving import model_config -from keras.saving import save -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -from keras.utils import generic_utils - - -try: - import h5py # pylint:disable=g-import-not-at-top -except ImportError: - h5py = None - - -class TestSaveModel(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super().setUp() - self.model = test_utils.get_small_sequential_mlp(1, 2, 3) - self.subclassed_model = test_utils.get_small_subclass_mlp(1, 2) - - def assert_h5_format(self, path): - if h5py is not None: - self.assertTrue(h5py.is_hdf5(path), - 'Model saved at path {} is not a valid hdf5 file.' - .format(path)) - - def assert_saved_model(self, path): - tf.__internal__.saved_model.parse_saved_model(path) - - @test_utils.run_v2_only - def test_load_file_not_found(self): - path = pathlib.Path(self.get_temp_dir()) / 'does_not_exist' - with self.assertRaisesRegex(IOError, 'No file or directory found at'): - save.load_model(path) - - @test_utils.run_v2_only - def test_save_format_defaults(self): - path = os.path.join(self.get_temp_dir(), 'model_path') - save.save_model(self.model, path) - self.assert_saved_model(path) - - @test_utils.run_v2_only - def test_save_format_defaults_pathlib(self): - path = pathlib.Path(self.get_temp_dir()) / 'model_path' - save.save_model(self.model, path) - self.assert_saved_model(path) - - @test_utils.run_v2_only - def test_save_hdf5(self): - path = os.path.join(self.get_temp_dir(), 'model') - save.save_model(self.model, path, save_format='h5') - self.assert_h5_format(path) - with self.assertRaisesRegex( - NotImplementedError, - 'requires the model to be a Functional model or a Sequential model.'): - save.save_model(self.subclassed_model, path, save_format='h5') - - @test_utils.run_v2_only - def test_save_load_hdf5_pathlib(self): - path = pathlib.Path(self.get_temp_dir()) / 'model' - save.save_model(self.model, path, save_format='h5') - save.load_model(path) - - @test_utils.run_v2_only - def test_save_tf(self): - path = os.path.join(self.get_temp_dir(), 'model') - save.save_model(self.model, path, save_format='tf') - self.assert_saved_model(path) - with self.assertRaisesRegex( - ValueError, r'Model.*cannot be saved.*as opposed to `model.call\(\).*'): - save.save_model(self.subclassed_model, path, save_format='tf') - self.subclassed_model.predict(np.random.random((3, 5))) - save.save_model(self.subclassed_model, path, save_format='tf') - self.assert_saved_model(path) - - @test_utils.run_v2_only - def test_save_load_tf_string(self): - path = os.path.join(self.get_temp_dir(), 'model') - save.save_model(self.model, path, save_format='tf') - save.load_model(path) - - @test_utils.run_v2_only - def test_save_load_tf_pathlib(self): - path = pathlib.Path(self.get_temp_dir()) / 'model' - 
save.save_model(self.model, path, save_format='tf') - save.load_model(path) - - @test_utils.run_v2_only - def test_save_load_weights_tf_pathlib(self): - path = pathlib.Path(self.get_temp_dir()) / 'model' - self.model.save_weights(path, save_format='tf') - self.model.load_weights(path) - - @test_utils.run_v2_only - def test_save_load_weights_hdf5_pathlib(self): - path = pathlib.Path(self.get_temp_dir()) / 'model' - self.model.save_weights(path, save_format='h5') - self.model.load_weights(path) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_saving_h5_for_rnn_layers(self): - # See https://github.com/tensorflow/tensorflow/issues/35731 for details. - inputs = keras.Input([10, 91], name='train_input') - rnn_layers = [ - keras.layers.LSTMCell(size, recurrent_dropout=0, name='rnn_cell%d' % i) - for i, size in enumerate([512, 512]) - ] - rnn_output = keras.layers.RNN( - rnn_layers, return_sequences=True, name='rnn_layer')(inputs) - pred_feat = keras.layers.Dense(91, name='prediction_features')(rnn_output) - pred = keras.layers.Softmax()(pred_feat) - model = keras.Model(inputs=[inputs], outputs=[pred, pred_feat]) - path = os.path.join(self.get_temp_dir(), 'model_path.h5') - model.save(path) - - # Make sure the variable name is unique. - self.assertNotEqual(rnn_layers[0].kernel.name, - rnn_layers[1].kernel.name) - self.assertIn('rnn_cell1', rnn_layers[1].kernel.name) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_saving_optimizer_weights(self): - - class MyModel(keras.Model): - - def __init__(self): - super().__init__() - self.layer = keras.layers.Dense(1) - - def call(self, x): - return self.layer(x) - - path = os.path.join(self.get_temp_dir(), 'weights_path') - x, y = np.ones((10, 10)), np.ones((10, 1)) - - model = MyModel() - model.compile('rmsprop', loss='bce') - model.train_on_batch(x, y) - model.reset_metrics() - model.save_weights(path, save_format='tf') - - batch_loss = model.train_on_batch(x, y) - - new_model = MyModel() - new_model.compile('rmsprop', loss='bce') - new_model.train_on_batch(x, y) - new_model.reset_metrics() - - new_model.load_weights(path) - new_batch_loss = new_model.train_on_batch(x, y) - - self.assertAllClose(batch_loss, new_batch_loss) - - @test_combinations.generate( - test_combinations.combine(mode=['eager', 'graph'])) - def test_save_include_optimizer_false(self): - - def get_variables(file_name): - reader = tf.train.load_checkpoint( - os.path.join(file_name, 'variables/variables')) - shape_from_key = reader.get_variable_to_shape_map() - return sorted(shape_from_key.keys()) - - path = os.path.join(self.get_temp_dir(), 'no_optimizer') - x, y = np.ones((10, 10)), np.ones((10, 1)) - - model = keras.models.Sequential() - model.add(keras.layers.Dense(1)) - model.compile('adam', loss='mse') - model.train_on_batch(x, y) - model.save(path, save_format='tf', include_optimizer=False) - variables = get_variables(path) - - for v in variables: - self.assertNotIn('optimizer', v) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_saving_model_with_custom_object(self): - with generic_utils.custom_object_scope(), self.cached_session(): - - @generic_utils.register_keras_serializable() - class CustomLoss(losses.MeanSquaredError): - pass - - model = sequential.Sequential( - [core.Dense(units=1, input_shape=(1,))]) - model.compile(optimizer='sgd', loss=CustomLoss()) - model.fit(np.zeros([10, 1]), np.zeros([10, 1])) - - temp_dir = 
self.get_temp_dir() - filepath = os.path.join(temp_dir, 'saving') - model.save(filepath) - - # Make sure the model can be correctly load back. - _ = save.load_model(filepath, compile=True) - - def test_saving_model_with_name_conflict(self): - - class Sequential(keras.Model): - - def __init__(self): - super().__init__() - self.layer = keras.layers.Dense(1) - - def call(self, x): - return self.layer(x) - - model = Sequential() - model(tf.ones((10, 10))) - temp_dir = self.get_temp_dir() - filepath = os.path.join(temp_dir, 'Sequential') - - with self.assertLogs() as logs: - model.save(filepath, save_format='tf') - - expected_substring = 'has the same name \'Sequential\' as a built-in Keras' - matched = [log for log in logs.output if expected_substring in log] - self.assertNotEmpty(matched) - - def test_saving_built_in_model(self): - model = LinearModel() - model(tf.constant([[5.]])) - temp_dir = self.get_temp_dir() - filepath = os.path.join(temp_dir, 'LinearModel') - with self.assertLogs() as logs: - model.save(filepath, save_format='tf') - - expected_substring = 'has the same name \'LinearModel\' as a built-in Keras' - matched = [log for log in logs.output if expected_substring in log] - # Check that a warning is *not* logged for a premade model. - self.assertEmpty(matched) - - -@generic_utils.register_keras_serializable(package='Foo') -class RegisteredSubLayer(keras.layers.Layer): - pass - - -class TestJson(test_combinations.TestCase): - """Tests to_json()/from_json().""" - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_saving_with_dense_features(self): - cols = [ - tf.feature_column.numeric_column('a'), - tf.feature_column.indicator_column( - tf.feature_column.categorical_column_with_vocabulary_list( - 'b', ['one', 'two'])) - ] - input_layers = { - 'a': keras.layers.Input(shape=(1,), name='a'), - 'b': keras.layers.Input(shape=(1,), name='b', dtype='string') - } - - fc_layer = dense_features.DenseFeatures(cols)(input_layers) - output = keras.layers.Dense(10)(fc_layer) - - model = keras.models.Model(input_layers, output) - - model.compile( - loss=keras.losses.MSE, - optimizer='rmsprop', - metrics=[keras.metrics.categorical_accuracy]) - - config = model.to_json() - loaded_model = model_config.model_from_json(config) - - inputs_a = np.arange(10).reshape(10, 1) - inputs_b = np.arange(10).reshape(10, 1).astype('str') - - with self.cached_session(): - # Initialize tables for V1 lookup. - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertLen(loaded_model.predict({'a': inputs_a, 'b': inputs_b}), 10) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_saving_with_sequence_features(self): - cols = [ - tf.feature_column.sequence_numeric_column('a'), - tf.feature_column.indicator_column( - tf.feature_column.sequence_categorical_column_with_vocabulary_list( - 'b', ['one', 'two'])) - ] - input_layers = { - 'a': - keras.layers.Input(shape=(None, 1), sparse=True, name='a'), - 'b': - keras.layers.Input( - shape=(None, 1), sparse=True, name='b', dtype='string') - } - - fc_layer, _ = ksfc.SequenceFeatures(cols)(input_layers) - # TODO(tibell): Figure out the right dtype and apply masking. 
- # sequence_length_mask = array_ops.sequence_mask(sequence_length) - # x = keras.layers.GRU(32)(fc_layer, mask=sequence_length_mask) - x = keras.layers.GRU(32)(fc_layer) - output = keras.layers.Dense(10)(x) - - model = keras.models.Model(input_layers, output) - - model.compile( - loss=keras.losses.MSE, - optimizer='rmsprop', - metrics=[keras.metrics.categorical_accuracy]) - - config = model.to_json() - loaded_model = model_config.model_from_json(config) - - batch_size = 10 - timesteps = 1 - - values_a = np.arange(10, dtype=np.float32) - indices_a = np.zeros((10, 3), dtype=np.int64) - indices_a[:, 0] = np.arange(10) - inputs_a = tf.SparseTensor(indices_a, values_a, - (batch_size, timesteps, 1)) - - values_b = np.zeros(10, dtype=np.str) - indices_b = np.zeros((10, 3), dtype=np.int64) - indices_b[:, 0] = np.arange(10) - inputs_b = tf.SparseTensor(indices_b, values_b, - (batch_size, timesteps, 1)) - - with self.cached_session(): - # Initialize tables for V1 lookup. - if not tf.executing_eagerly(): - self.evaluate(tf.compat.v1.tables_initializer()) - - self.assertLen( - loaded_model.predict({ - 'a': inputs_a, - 'b': inputs_b - }, steps=1), batch_size) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_nested_layers(self): - - class MyLayer(keras.layers.Layer): - - def __init__(self, sublayers, **kwargs): - super().__init__(**kwargs) - self.sublayers = sublayers - - def get_config(self): - config = super().get_config() - config['sublayers'] = self.sublayers - return config - - layer = MyLayer([keras.layers.Dense(2, name='MyDense'), - RegisteredSubLayer(name='MySubLayer')]) - model = keras.Sequential([keras.Input([None]), layer]) - model_json = model.to_json() - - self.assertIn('Foo>RegisteredSubLayer', model_json) - - loaded_model = model_config.model_from_json( - model_json, custom_objects={'MyLayer': MyLayer}) - loaded_layer = loaded_model.layers[0] - self.assertIsInstance(loaded_layer.sublayers[0], keras.layers.Dense) - self.assertEqual(loaded_layer.sublayers[0].name, 'MyDense') - self.assertIsInstance(loaded_layer.sublayers[1], RegisteredSubLayer) - self.assertEqual(loaded_layer.sublayers[1].name, 'MySubLayer') - - -class MaskedTensor(tf.experimental.ExtensionType): - __name__ = 'MaskedTensor_save_test' - values: tf.Tensor - mask: tf.Tensor - class Spec(tf.TypeSpec): - - @property - def shape(self): - return self.values.shape - - @property - def dtype(self): - return self.values.dtype - - def with_shape(self, shape): - values_spec = tf.TensorSpec( - shape, dtype=self.values.dtype, name=self.values.name) - mask_spec = tf.TensorSpec( - shape, dtype=self.mask.dtype, name=self.mask.name) - return MaskedTensor.Spec(values_spec, mask_spec) - - -@test_combinations.run_with_all_saved_model_formats -class TestWholeModelSaving(test_combinations.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - def _assert_same_weights_and_metrics(self, model, loaded_model): - """Checks that the loaded weights and metrics are the same as the original. - - Args: - model: original model - loaded_model: loaded model - """ - self.assertAllClose(model.weights, loaded_model.weights) - - if loaded_model.optimizer: - if test_utils.get_save_format() == 'tf': - # TODO(b/153110928): Keras TF format doesn't restore optimizer weights - # currently. 
- return - self.assertAllClose(model.optimizer.weights, - loaded_model.optimizer.weights) - - # In V1/Graph mode, the model isn't built, so the metrics are not loaded - # immediately (requires model to be called on some data before building - # metrics). - check_metrics = tf.__internal__.tf2.enabled() and tf.executing_eagerly() - - if check_metrics: - self.assertAllEqual([m.name for m in model.metrics], - [m.name for m in loaded_model.metrics]) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_save_and_load(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - save_kwargs = test_utils.get_save_kwargs() - - if ((save_format == 'h5' or not save_kwargs.get('save_traces', True)) and - test_utils.get_model_type() == 'subclass'): - # HDF5 format currently does not allow saving subclassed models. - # When saving with `save_traces=False`, the subclassed model must have a - # get_config/from_config, which the autogenerated model does not have. - return - - with self.cached_session(): - model = test_utils.get_model_from_layers( - [keras.layers.Dense(2), - keras.layers.RepeatVector(3), - keras.layers.TimeDistributed(keras.layers.Dense(3))], - input_shape=(3,)) - model.compile( - loss=keras.losses.MSE, - optimizer=keras.optimizers.optimizer_v2.rmsprop.RMSprop(lr=0.0001), - metrics=[ - keras.metrics.categorical_accuracy, - keras.metrics.CategoricalCrossentropy( - name='cce', label_smoothing=tf.constant(0.2)), - ], - weighted_metrics=[ - keras.metrics.categorical_crossentropy, - keras.metrics.CategoricalCrossentropy( - name='cce', label_smoothing=tf.constant(0.2)), - ], - sample_weight_mode='temporal') - - x = np.random.random((1, 3)) - y = np.random.random((1, 3, 3)) - model.train_on_batch(x, y) - - out = model.predict(x) - keras.models.save_model( - model, saved_model_dir, save_format=save_format, - **save_kwargs) - - loaded_model = keras.models.load_model(saved_model_dir) - self._assert_same_weights_and_metrics(model, loaded_model) - - out2 = loaded_model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - eval_out = model.evaluate(x, y) - eval_out2 = loaded_model.evaluate(x, y) - self.assertArrayNear(eval_out, eval_out2, 0.001) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_sequential_model_saving_without_input_shape(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2)) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - model.compile( - loss=keras.losses.MSE, - optimizer='rmsprop', - metrics=[ - keras.metrics.categorical_accuracy, - keras.metrics.CategoricalAccuracy(name='cat_acc') - ], - weighted_metrics=[ - keras.metrics.categorical_accuracy, - keras.metrics.CategoricalAccuracy(name='cat_acc2') - ], - sample_weight_mode='temporal') - x = np.random.random((1, 3)) - y = np.random.random((1, 3, 3)) - model.train_on_batch(x, y) - - out = model.predict(x) - model.save(saved_model_dir, save_format=save_format) - - new_model = keras.models.load_model(saved_model_dir) - - self._assert_same_weights_and_metrics(model, new_model) - - out2 = new_model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_sequential_model_saving_without_compile(self): - 
saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - - x = np.random.random((1, 3)) - out = model.predict(x) - - # Save the model without any compilation or training. - keras.models.save_model(model, saved_model_dir, save_format=save_format) - - new_model = keras.models.load_model(saved_model_dir) - self._assert_same_weights_and_metrics(model, new_model) - - out2 = new_model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - def test_sequential_model_saving_2(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - - with tf.Graph().as_default(), self.cached_session(): - # test with custom optimizer, loss - - class CustomOp(optimizer_v1.RMSprop): - pass - - def custom_loss(y_true, y_pred): - return keras.losses.mse(y_true, y_pred) - - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - model.compile(loss=custom_loss, optimizer=CustomOp(), metrics=['acc']) - - x = np.random.random((1, 3)) - y = np.random.random((1, 3)) - model.train_on_batch(x, y) - - out = model.predict(x) - keras.models.save_model(model, saved_model_dir, save_format=save_format) - - new_model = keras.models.load_model( - saved_model_dir, - custom_objects={'CustomOp': CustomOp, - 'custom_loss': custom_loss}) - self._assert_same_weights_and_metrics(model, new_model) - - out2 = new_model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - def test_saving_without_compilation(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - model.compile(loss='mse', optimizer='sgd', metrics=['acc']) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = keras.models.load_model(saved_model_dir) - - def test_saving_with_tf_optimizer(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - model.compile(loss='mse', - optimizer=tf.compat.v1.train.AdadeltaOptimizer(0.1), - metrics=['acc']) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = keras.models.load_model(saved_model_dir) - - def test_saving_right_after_compilation(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - model.compile(loss='mse', optimizer='sgd', metrics=['acc']) - if not tf.compat.v1.executing_eagerly_outside_functions(): - model._make_train_function() - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = keras.models.load_model(saved_model_dir) - - def test_saving_lambda_numpy_array_arguments(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - - if h5py is None: - self.skipTest('h5py required to run this test') - - mean = np.random.random((4, 2, 3)) - std = np.abs(np.random.random((4, 2, 3))) + 1e-5 - inputs = 
keras.layers.Input(shape=(4, 2, 3)) - output = keras.layers.Lambda(lambda image, mu, std: (image - mu) / std, - arguments={'mu': mean, 'std': std})(inputs) - model = keras.models.Model(inputs, output) - model.compile(loss='mse', optimizer='sgd', metrics=['acc']) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - - model = keras.models.load_model(saved_model_dir) - - self.assertAllClose(mean, model.layers[1].arguments['mu']) - self.assertAllClose(std, model.layers[1].arguments['std']) - - def test_saving_model_with_long_layer_names(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - with self.cached_session(): - # This layer name will make the `layers_name` HDF5 attribute blow - # out of proportion. Note that it fits into the internal HDF5 - # attribute memory limit on its own but because h5py converts - # the list of layer names into numpy array, which uses the same - # amount of memory for every item, it increases the memory - # requirements substantially. - x = keras.Input(shape=(2,), name='input_' + ('x' * (2**15))) - f = x - for i in range(4): - f = keras.layers.Dense(2, name='dense_%d' % (i,))(f) - model = keras.Model(inputs=[x], outputs=[f]) - model.compile( - 'adam', loss=keras.losses.MeanSquaredError(), metrics=['acc']) - - x = np.random.random((1, 2)) - y = np.random.random((1, 2)) - model.train_on_batch(x, y) - out = model.predict(x) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = keras.models.load_model(saved_model_dir) - - if save_format in ['tf', 'tensorflow']: - return - # Check that the HDF5 files contains chunked array - # of layer names. - with h5py.File(saved_model_dir, 'r') as h5file: - num_names_arrays = len([attr for attr in h5file['model_weights'].attrs - if attr.startswith('layer_names')]) - # The chunking of layer names array should have happened. - self.assertGreater(num_names_arrays, 0) - out2 = model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - def test_saving_model_with_long_weights_names(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - - with self.cached_session(): - x = keras.Input(shape=(2,), name='nested_model_input') - f = x - for i in range(4): - f = keras.layers.Dense(2, name='nested_model_dense_%d' % (i,))(f) - # This layer name will make the `weights_name` - # HDF5 attribute blow out of proportion. - f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**14)))(f) - nested_model = keras.Model(inputs=[x], outputs=[f], name='nested_model') - - x = keras.Input(shape=(2,), name='outer_model_input') - f = nested_model(x) - f = keras.layers.Dense(2, name='outer_model_output')(f) - - model = keras.Model(inputs=[x], outputs=[f]) - model.compile(loss='mse', optimizer='adam', metrics=['acc']) - - x = np.random.random((1, 2)) - y = np.random.random((1, 2)) - model.train_on_batch(x, y) - out = model.predict(x) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = keras.models.load_model(saved_model_dir) - - if save_format in ['h5', 'hdf5', 'keras']: - # Check that the HDF5 files contains chunked array - # of weight names. - with h5py.File(saved_model_dir, 'r') as h5file: - num_weight_arrays = len( - [attr for attr in h5file['model_weights']['nested_model'].attrs - if attr.startswith('weight_names')]) - # The chunking of layer names array should have happened. 
- self.assertGreater(num_weight_arrays, 0) - out2 = model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - def test_model_saving_to_pre_created_h5py_file(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - with tf.Graph().as_default(), self.cached_session(): - inputs = keras.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - outputs = keras.layers.Dense(3)(x) - - model = keras.Model(inputs, outputs) - model.compile( - loss=keras.losses.MSE, - optimizer=optimizer_v1.Adam(), - metrics=[ - keras.metrics.categorical_accuracy, - keras.metrics.CategoricalAccuracy() - ]) - x = np.random.random((1, 3)) - y = np.random.random((1, 3)) - model.train_on_batch(x, y) - - out = model.predict(x) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - loaded_model = keras.models.load_model(saved_model_dir) - out1 = loaded_model.predict(x) - self.assertAllClose(out, out1, atol=1e-05) - if save_format in ['tf', 'tensorflow']: - return - - # Test h5 format specifically - fd, fname = tempfile.mkstemp('.h5') - with h5py.File(fname, mode='r+') as h5file: - keras.models.save_model(model, h5file) - loaded_model = keras.models.load_model(h5file) - out2 = loaded_model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - # Test non-default options in h5 - with h5py.File( - '_', driver='core', mode='w', backing_store=False) as h5file: - keras.models.save_model(model, h5file) - loaded_model = keras.models.load_model(h5file) - out2 = loaded_model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - # Cleanup - os.close(fd) - os.remove(fname) - - def test_model_saving_to_new_dir_path(self): - saved_model_dir = os.path.join(self._save_model_dir(), 'newdir', - 'saved_model') - save_format = test_utils.get_save_format() - - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - - x = np.random.random((1, 3)) - out = model.predict(x) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - - new_model = keras.models.load_model(saved_model_dir) - self._assert_same_weights_and_metrics(model, new_model) - - out2 = new_model.predict(x) - self.assertAllClose(out, out2, atol=1e-05) - - def test_model_raise_exception_with_failed_saving(self): - if h5py is None: - self.skipTest('h5py required to run this test') - - saved_model_dir = self._save_model_dir() - saved_model_path = os.path.join(saved_model_dir, 'saved_model.h5') - - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - - with self.assertRaisesRegex(OSError, 'Unable to create file'): - with h5py.File(saved_model_path, 'w'): - keras.models.save_model(model, saved_model_path) - - def test_saving_constant_initializer_with_numpy(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - - model = keras.models.Sequential() - model.add( - keras.layers.Dense( - 2, - input_shape=(3,), - kernel_initializer=keras.initializers.Constant(np.ones((3, 2))))) - model.add(keras.layers.Dense(3)) - model.compile(loss='mse', optimizer='sgd', metrics=['acc']) - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = keras.models.load_model(saved_model_dir) - - def 
test_saving_group_naming_h5py(self): - # Test saving a model with a layer whose name is a prefix of a - # previous layer's name. - - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir) - h5_path = os.path.join(temp_dir, 'test.h5') - - input_layer = keras.layers.Input((None, None, 3), name='test_input') - x = keras.layers.Conv2D(1, 1, name='conv1/conv')(input_layer) - x = keras.layers.Activation('relu', name='conv1')(x) - model = keras.models.Model(inputs=input_layer, outputs=x) - - model.save_weights(h5_path) - model.load_weights(h5_path) - - def test_primitive_attrs_contain_no_extraneous_strings(self): - if h5py is None: - self.skipTest('h5py required to run this test') - - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - model = keras.models.Sequential() - model.add(keras.layers.Dense(1, input_shape=[2])) - model.save(saved_model_dir, save_format=save_format) - if save_format in ['tf', 'tensorflow']: - return - - h5file = h5py.File(saved_model_dir, 'r') - self.assertRegex(h5file.attrs['keras_version'], r'^[\d]+\.[\d]+\.[\S]+$') - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_functional_model_with_custom_loss_and_metric(self): - def _make_model(): - inputs = keras.Input(shape=(4,)) - x = keras.layers.Dense(8, activation='relu')(inputs) - outputs = keras.layers.Dense(3, activation='softmax')(x) - model = keras.Model(inputs=inputs, outputs=outputs) - custom_loss = keras.layers.Lambda(lambda x: keras.backend.sum(x * x))(x) - model.add_loss(custom_loss) - model.add_metric(custom_loss, aggregation='mean', name='custom_loss') - return model - - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - - with self.cached_session(): - model = _make_model() - model.compile( - loss=keras.losses.SparseCategoricalCrossentropy(), - optimizer=optimizers.gradient_descent_v2.SGD(), - metrics=[keras.metrics.SparseCategoricalCrossentropy()]) - x = np.random.normal(size=(32, 4)) - y = np.random.randint(0, 3, size=32) - model.train_on_batch(x, y) - evaluation_results = model.evaluate(x, y) - # Save and reload model. - model.save(saved_model_dir, save_format=save_format) - del model # Prevent misuse. - loaded_model = keras.models.load_model(saved_model_dir) - loaded_model_eval_results = loaded_model.evaluate(x, y) - # Assert all evaluation results are the same. - self.assertAllClose(evaluation_results, loaded_model_eval_results, 1e-9) - # Check correctness of the loss calculation. - self.assertAllGreater(evaluation_results, 0.) - evaluation_results = dict( - zip(loaded_model.metrics_names, evaluation_results)) - self.assertNear( - evaluation_results['sparse_categorical_crossentropy'] + - evaluation_results['custom_loss'], evaluation_results['loss'], 1e-6) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_save_uncompiled_model_with_optimizer(self): - with self.cached_session() as session: - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - model = keras.models.Sequential([keras.layers.Dense(1, input_shape=(3,))]) - # Set the model's optimizer but don't compile. This can happen if the - # model is trained with a custom training loop.
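For context, a minimal sketch of the custom-training-loop scenario this comment describes (shapes, data, and the optimizer choice are all illustrative): the optimizer is attached by plain assignment and `compile()` is never called.

```
# Illustrative only: train manually, so the model never gets compiled.
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(3,))])
model.optimizer = keras.optimizers.RMSprop(learning_rate=1e-4)  # no compile()

x = tf.random.normal((8, 3))
y = tf.random.normal((8, 1))
with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
grads = tape.gradient(loss, model.trainable_variables)
model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
# Saving still works; the test below checks that the TF format also
# restores the optimizer on load.
```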
- model.optimizer = keras.optimizers.optimizer_v2.rmsprop.RMSprop(lr=0.0001) - if not tf.executing_eagerly(): - session.run([v.initializer for v in model.variables]) - model.save(saved_model_dir, save_format=save_format) - - if save_format in ['tf', 'tensorflow']: - loaded = keras.models.load_model(saved_model_dir) - self.assertIsInstance( - loaded.optimizer, - keras.optimizers.optimizer_v2.optimizer_v2.OptimizerV2) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_functional_model_with_getitem_op_layer(self): - inp = keras.Input(shape=(8)) - - out = inp[:] - model = keras.Model( - inputs=[inp], - outputs=out) - batch_size = 7 - x = tf.stack([ - tf.range(8) for _ in range(batch_size)]) - args = [x] - expected = x[:] - - self.assertAllEqual(model(args), expected) - self.assertAllEqual(model.predict(args, batch_size=batch_size), expected) - - # Make sure it can be successfully saved and loaded. - save_format = test_utils.get_save_format() - saved_model_dir = self._save_model_dir() - keras.models.save_model(model, saved_model_dir, save_format=save_format) - - loaded_model = keras.models.load_model(saved_model_dir) - - self.assertAllEqual(loaded_model(args), expected) - self.assertAllEqual(loaded_model.predict(args, batch_size=batch_size), - expected) - - @test_combinations.generate(test_combinations.combine( - mode=['eager', 'graph'])) - def test_custom_functional_registered(self): - - def _get_cls_definition(): - class CustomModel(keras.Model): - - def c(self): - return 'c' - - return CustomModel - - cls = _get_cls_definition() - self.assertEqual(cls.__bases__[0], keras.Model) - - with self.cached_session() as sess: - input_ = keras.layers.Input(shape=(1,)) - output = keras.layers.Dense(1)(input_) - model = cls(input_, output) - # `cls` now inherits from `Functional` class. - self.assertEqual(cls.__bases__[0], functional.Functional) - - if not tf.executing_eagerly(): - sess.run([v.initializer for v in model.variables]) - - save_format = test_utils.get_save_format() - saved_model_dir = self._save_model_dir() - keras.models.save_model(model, saved_model_dir, save_format=save_format) - - loaded_model = keras.models.load_model( - saved_model_dir, custom_objects={'CustomModel': cls}) - self.assertIsInstance(loaded_model, cls) - - # Check with "new" `CustomModel` class definition. - new_cls = _get_cls_definition() - # The new `CustomModel` class is *not* derived from `Functional`. - self.assertEqual(new_cls.__bases__[0], keras.Model) - reloaded_model = keras.models.load_model( - saved_model_dir, custom_objects={'CustomModel': new_cls}) - self.assertIsInstance(reloaded_model, new_cls) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_shared_objects(self): - class OuterLayer(keras.layers.Layer): - - def __init__(self, inner_layer): - super().__init__() - self.inner_layer = inner_layer - - def call(self, inputs): - return self.inner_layer(inputs) - - def get_config(self): - return { - 'inner_layer': generic_utils.serialize_keras_object( - self.inner_layer) - } - - @classmethod - def from_config(cls, config): - return cls(generic_utils.deserialize_keras_object( - config['inner_layer'])) - - class InnerLayer(keras.layers.Layer): - - def __init__(self): - super().__init__() - self.v = self.add_weight(name='v', shape=[], dtype=tf.float32) - - def call(self, inputs): - return self.v + inputs - - @classmethod - def from_config(cls, config): - return cls() - - # Create a model with 2 output layers that share the same inner layer. 
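Before the shared-layer model is assembled below, note what the `get_config`/`from_config` pair above buys: the nested layer is serialized by value through `generic_utils`. A hedged sketch of that round-trip in isolation (it assumes the custom classes are exposed through a custom-object scope):

```
# Illustrative round-trip of the nested-layer serialization used above.
from keras.utils import generic_utils

inner = InnerLayer()
config = generic_utils.serialize_keras_object(inner)
# config is roughly {'class_name': 'InnerLayer', 'config': {...}}
with generic_utils.CustomObjectScope({'InnerLayer': InnerLayer}):
    revived = generic_utils.deserialize_keras_object(config)
```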
- inner_layer = InnerLayer() - outer_layer_1 = OuterLayer(inner_layer) - outer_layer_2 = OuterLayer(inner_layer) - input_ = keras.Input(shape=(1,)) - model = keras.Model( - inputs=input_, outputs=[outer_layer_1(input_), outer_layer_2(input_)]) - - # Changes to the shared layer should affect both outputs. - model.layers[1].inner_layer.v.assign(5) - self.assertAllEqual(model(1), [6.0, 6.0]) - model.layers[1].inner_layer.v.assign(3) - self.assertAllEqual(model(1), [4.0, 4.0]) - - # After loading, changes to the shared layer should still affect both - # outputs. - def _do_assertions(loaded): - loaded.layers[1].inner_layer.v.assign(5) - self.assertAllEqual(loaded(1), [6.0, 6.0]) - loaded.layers[1].inner_layer.v.assign(3) - self.assertAllEqual(loaded(1), [4.0, 4.0]) - loaded.layers[2].inner_layer.v.assign(5) - self.assertAllEqual(loaded(1), [6.0, 6.0]) - loaded.layers[2].inner_layer.v.assign(3) - self.assertAllEqual(loaded(1), [4.0, 4.0]) - - # We'd like to make sure we only attach shared object IDs when strictly - # necessary, so we'll recursively traverse the generated config to count - # whether we have the exact number we expect. - def _get_all_keys_recursive(dict_or_iterable): - if isinstance(dict_or_iterable, dict): - for key in dict_or_iterable.keys(): - yield key - for key in _get_all_keys_recursive(dict_or_iterable.values()): - yield key - elif isinstance(dict_or_iterable, str): - return - else: - try: - for item in dict_or_iterable: - for key in _get_all_keys_recursive(item): - yield key - # Not an iterable or dictionary - except TypeError: - return - - with generic_utils.CustomObjectScope({ - 'OuterLayer': OuterLayer, 'InnerLayer': InnerLayer}): - - # Test saving and loading to disk - save_format = test_utils.get_save_format() - saved_model_dir = self._save_model_dir() - keras.models.save_model(model, saved_model_dir, save_format=save_format) - loaded = keras.models.load_model(saved_model_dir) - _do_assertions(loaded) - - # Test recreating directly from config - config = model.get_config() - key_count = collections.Counter(_get_all_keys_recursive(config)) - self.assertEqual(key_count[generic_utils.SHARED_OBJECT_KEY], 2) - loaded = keras.Model.from_config(config) - _do_assertions(loaded) - - @test_combinations.generate(test_combinations.combine(mode=['eager'])) - def test_shared_objects_wrapper(self): - """Tests that shared layers wrapped with `Wrapper` restore correctly.""" - input_ = keras.Input(shape=(1,)) - unwrapped = keras.layers.Layer(name='unwrapped') - wrapped = keras.layers.Wrapper(unwrapped, name='wrapped') - model = keras.Model(inputs=input_, - outputs=[unwrapped(input_), wrapped(input_)]) - - # Test recreating directly from config - config = model.get_config() - loaded = keras.Model.from_config(config) - self.assertIs(loaded.layers[1], loaded.layers[2].layer) - - # Test saving and loading to disk - save_format = test_utils.get_save_format() - saved_model_dir = self._save_model_dir() - keras.models.save_model(model, saved_model_dir, save_format=save_format) - loaded = keras.models.load_model(saved_model_dir) - self.assertIs(loaded.layers[1], loaded.layers[2].layer) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'], fit=[True, False])) - def test_multi_output_metrics_name_stay_same(self, fit): - """Tests that metric names don't change with each save/load cycle. - - e.g. "head_0_accuracy" should not become "head_0_head_0_accuracy" after - saving and loading a model. - - Arguments: - fit: Whether the model should be fit before saving. 
- """ - # This combination (graph mode without fitting) doesn't work at all, so we - # can't check whether the metric names are correct. - if not tf.executing_eagerly() and not fit: - self.skipTest('b/181767784') - - input_ = keras.Input((4,)) - model = keras.Model( - input_, - [keras.layers.Softmax(name='head_0')(keras.layers.Dense(3)(input_)), - keras.layers.Softmax(name='head_1')(keras.layers.Dense(5)(input_))]) - metric = keras.metrics.BinaryAccuracy() - model.compile(optimizer='rmsprop', - loss='mse', - metrics={'head_0': [metric, 'accuracy']}) - - x = np.random.rand(2, 4) - y = {'head_0': np.random.randint(2, size=(2, 3)), - 'head_1': np.random.randint(2, size=(2, 5))} - - # Make sure metric prefixing works the same regardless of whether the user - # has fit the model before saving. - if fit: - model.fit(x, y, verbose=0) - - # Save and reload. - save_format = test_utils.get_save_format() - saved_model_dir = self._save_model_dir() - keras.models.save_model(model, saved_model_dir, save_format=save_format) - loaded = keras.models.load_model(saved_model_dir) - - # Make sure the metric names from the model before saving match those of - # the loaded model. - self.assertSequenceEqual(model.metrics_names, loaded.metrics_names) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_warning_when_saving_invalid_custom_mask_layer(self): - - class MyMasking(keras.layers.Layer): - - def call(self, inputs): - return inputs - - def compute_mask(self, inputs, mask=None): - mask = tf.not_equal(inputs, 0) - return mask - - class MyLayer(keras.layers.Layer): - - def call(self, inputs, mask=None): - return tf.identity(inputs) - - samples = np.random.random((2, 2)) - model = keras.Sequential([MyMasking(), MyLayer()]) - model.predict(samples) - with warnings.catch_warnings(record=True) as w: - model.save(self._save_model_dir(), test_utils.get_save_format()) - self.assertIn(generic_utils.CustomMaskWarning, - {warning.category for warning in w}) - - # Test that setting up a custom mask correctly does not issue a warning. - class MyCorrectMasking(keras.layers.Layer): - - def call(self, inputs): - return inputs - - def compute_mask(self, inputs, mask=None): - mask = tf.not_equal(inputs, 0) - return mask - - # This get_config doesn't actually do anything because our mask is - # static and doesn't need any external information to work. We do need a - # dummy get_config method to prevent the warning from appearing, however. - def get_config(self, *args, **kwargs): - return {} - - model = keras.Sequential([MyCorrectMasking(), MyLayer()]) - model.predict(samples) - with warnings.catch_warnings(record=True) as w: - model.save(self._save_model_dir(), test_utils.get_save_format()) - self.assertNotIn(generic_utils.CustomMaskWarning, - {warning.category for warning in w}) - - # Test only in eager mode because ragged tensor inputs - # cannot be used in graph mode.
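As a quick illustration of the eager-only restriction mentioned above, the ragged input used in the next test has rows of unequal length, and inspecting its values relies on eager execution:

```
import tensorflow as tf

rt = tf.ragged.constant([[1., 2.], [3.]])
print(rt.shape)                   # (2, None): the second dimension is ragged
print(rt.row_lengths().numpy())   # [2 1]; .numpy() itself requires eager mode
```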
- @test_combinations.generate( - test_combinations.combine(mode=['eager'])) - @test_utils.run_v2_only - def test_save_functional_with_ragged_constant_input(self): - input1 = keras.Input(shape=[]) - input2 = tf.ragged.constant([[1., 2.], [3.]]) - outputs = keras.layers.Add()([input1, input2]) - model = keras.Model(input1, outputs) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir) - keras.models.load_model(saved_model_dir) - - @test_combinations.generate( - test_combinations.combine(mode=['eager'])) - @test_utils.run_v2_only - def test_save_functional_with_constant_input(self): - input1 = keras.Input(shape=[2]) - input2 = tf.constant([[1., 2.]]) - outputs = keras.layers.Add()([input1, input2]) - model = keras.Model(input1, outputs) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir) - keras.models.load_model(saved_model_dir) - - @test_combinations.generate( - test_combinations.combine(mode=['eager'])) - @test_utils.run_v2_only - def test_save_inputs_spec_with_composite_tensor_names(self): - - class KerasModel(keras.Model): - - def call(self, inputs): - return inputs - - spec = MaskedTensor.Spec( - tf.TensorSpec([None], name='x__values'), - tf.TensorSpec([None], dtype=tf.bool, name='x__mask') - ) - km1 = KerasModel() - inputs = keras.Input(type_spec=spec) - km1(inputs) - self.assertEqual(km1.save_spec()[0][0].mask.name, 'x__mask') - - -# Factory functions to create models that will be serialized inside a Network. -def _make_graph_network(input_size, output_size): - inputs = keras.Input(input_size) - x = keras.layers.Dense(8, activation='relu')(inputs) - y = keras.layers.Dense(output_size)(x) - return keras.Model(inputs=inputs, outputs=y) - - -def _make_sequential(input_size, output_size): - del input_size - return keras.Sequential([ - keras.layers.Dense(8, activation='relu'), - keras.layers.Dense(output_size), - ]) - - -def _make_sequential_built(input_size, output_size): - model = _make_sequential(input_size, output_size) - model.build((None, input_size)) - return model - - -def _make_sequential_graph_network(input_size, output_size): - return keras.Sequential([ - keras.layers.InputLayer(input_size), - keras.layers.Dense(8, activation='relu'), - keras.layers.Dense(output_size), - ]) - - -def _make_sequential_input_shape(input_size, output_size): - return keras.Sequential([ - keras.layers.Dense(8, activation='relu', input_shape=(input_size,)), - keras.layers.Dense(output_size), - ]) - - -class _make_subclassed(keras.Model): # pylint: disable=invalid-name - - def __init__(self, input_size, output_size): - super().__init__() - self._config = {'input_size': input_size, 'output_size': output_size} - self._hidden_layer = keras.layers.Dense(8, activation='relu', name='hidden') - self._logits_layer = keras.layers.Dense(output_size, name='logits') - - def call(self, inputs): - x = self._hidden_layer(inputs) - return self._logits_layer(x) - - def get_config(self): - return self._config - - @classmethod - def from_config(cls, config): - return cls(**config) - - -class _make_subclassed_built(_make_subclassed): # pylint: disable=invalid-name - - def __init__(self, input_size, output_size): - super().__init__(input_size, output_size) - self.build((None, input_size)) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class TestWholeModelSavingWithNesting(tf.test.TestCase, parameterized.TestCase): - """Tests saving a whole model that contains other models.""" - - @parameterized.named_parameters([ - ('graph_network', 
_make_graph_network), - ('sequential', _make_sequential), - ('sequential_built', _make_sequential_built), - ('sequential_graph_network', _make_sequential_graph_network), - ('sequential_input_shape', _make_sequential_input_shape), - ('subclassed', _make_subclassed), - ('subclassed_built', _make_subclassed_built), - ]) - def test_functional(self, model_fn): - """Tests serializing a model that uses a nested model to share weights.""" - if h5py is None: - self.skipTest('h5py required to run this test') - - def _make_model(): - inputs = (keras.Input(shape=(4,), name='examples'), - keras.Input(shape=(4,), name='neighbors')) - base_model = model_fn(inputs[0].shape.as_list()[-1], 2) - outputs = keras.layers.add([base_model(inputs[0]), base_model(inputs[1])]) - return keras.Model(inputs=inputs, outputs=outputs) - - with self.cached_session(): - x = (np.random.normal(size=(16, 4)).astype(np.float32), - np.random.normal(size=(16, 4)).astype(np.float32)) - model = _make_model() - predictions = model(x) - # Save and reload. - model_path = os.path.join(self.get_temp_dir(), 'model.h5') - model.save(model_path) - del model - loaded_model = keras.models.load_model( - model_path, - custom_objects={ - '_make_subclassed': _make_subclassed, - '_make_subclassed_built': _make_subclassed_built, - }, - compile=False) - self.assertAllClose(loaded_model(x), predictions, 1e-9) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/save_weights_test.py b/keras/saving/save_weights_test.py deleted file mode 100644 index ba7a2703d95d..000000000000 --- a/keras/saving/save_weights_test.py +++ /dev/null @@ -1,677 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#,============================================================================ -"""Tests for model saving in the HDF5 format.""" - -import tensorflow.compat.v2 as tf - -import os -import shutil -import uuid - -from absl.testing import parameterized -import numpy as np - -import keras -from keras.testing_infra import test_combinations -from keras.optimizers import optimizer_v1 -from keras.testing_infra import test_utils -from keras.engine import training -from keras.saving import hdf5_format - -try: - import h5py # pylint:disable=g-import-not-at-top -except ImportError: - h5py = None - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class TestWeightSavingAndLoading(tf.test.TestCase, parameterized.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - @test_combinations.run_with_all_weight_formats - def test_weight_loading(self): - saved_model_dir = self._save_model_dir() - save_format = test_utils.get_save_format() - with self.cached_session(): - a = keras.layers.Input(shape=(2,)) - x = keras.layers.Dense(3)(a) - b = keras.layers.Dense(1)(x) - model = keras.models.Model(a, b) - - x = np.random.random((3, 2)) - ref_y = model.predict(x) - weights = model.get_weights() - model.set_weights(weights) - y = model.predict(x) - self.assertAllClose(ref_y, y) - - with self.assertRaises(ValueError): - model.set_weights(weights[1:]) - with self.assertRaises(ValueError): - model.set_weights(weights[::-1]) - - model.save_weights(saved_model_dir, save_format=save_format) - model.load_weights(saved_model_dir) - y = model.predict(x) - self.assertAllClose(ref_y, y) - - def test_weight_preprocessing(self): - input_dim = 3 - output_dim = 3 - size = 2 - cases = [ - [ - (keras.layers.Bidirectional(keras.layers.SimpleRNN(2))), - [np.random.random((2, 1)), np.random.random((2, 1))], - (None, 3, 2), - ], - [ - (keras.layers.TimeDistributed(keras.layers.Dense(1))), - [np.random.random((2, 1)), np.random.random((1,))], - (None, 3, 2), - ], - [ - (keras.layers.Conv1D(output_dim, size, use_bias=False)), - [np.random.random((output_dim, input_dim, size, 1))], - (None, 4, input_dim), - ], - [ - (keras.layers.Conv2D(output_dim, size, - use_bias=False, data_format='channels_first')), - [np.random.random((output_dim, input_dim, size, size))], - (None, input_dim, 4, 4), - ], - [ - (keras.layers.Conv2DTranspose(output_dim, size, - use_bias=False, - data_format='channels_first')), - [np.random.random((output_dim, input_dim, size, size))], - (None, input_dim, 4, 4), - ], - [ - (keras.layers.Conv2DTranspose(output_dim, size, - use_bias=False, - data_format='channels_last')), - [np.random.random((size, size, input_dim, output_dim))], - (None, 4, 4, input_dim), - ], - [ - (keras.layers.Conv3D(output_dim, size, - use_bias=False, data_format='channels_first')), - [np.random.random((output_dim, input_dim, size, size, size))], - (None, input_dim, 4, 4, 4), - ], - [ - (keras.layers.GRUV1(output_dim)), - [np.random.random((input_dim, output_dim)), - np.random.random((output_dim, output_dim)), - np.random.random((output_dim,)), - np.random.random((input_dim, output_dim)), - np.random.random((output_dim, output_dim)), - np.random.random((output_dim,)), - np.random.random((input_dim, output_dim)), - np.random.random((output_dim, output_dim)), - np.random.random((output_dim,))], - (None, 4, input_dim), - ], - [ - (keras.layers.LSTMV1(output_dim)), - 
[np.random.random((input_dim, output_dim)), - np.random.random((output_dim, output_dim)), - np.random.random((output_dim,)), - np.random.random((input_dim, output_dim)), - np.random.random((output_dim, output_dim)), - np.random.random((output_dim,)), - np.random.random((input_dim, output_dim)), - np.random.random((output_dim, output_dim)), - np.random.random((output_dim,)), - np.random.random((input_dim, output_dim)), - np.random.random((output_dim, output_dim)), - np.random.random((output_dim,))], - (None, 4, input_dim), - ], - ] - for layer, weights, input_shape in cases: - layer.build(input_shape) - _ = hdf5_format.preprocess_weights_for_loading( - layer, weights, original_keras_version='1') - - model = keras.models.Sequential([keras.layers.Dense(2, input_dim=2)]) - _ = hdf5_format.preprocess_weights_for_loading( - model, model.weights, original_keras_version='1') - - x = keras.Input((2,)) - y = keras.layers.Dense(2)(x) - model = keras.models.Model(x, y) - _ = hdf5_format.preprocess_weights_for_loading( - model, model.weights, original_keras_version='1') - - @parameterized.named_parameters( - ('gru', keras.layers.GRU, { - 'units': 2, - 'input_shape': (3, 5) - }), - ('gru_with_reset_after', keras.layers.GRU, { - 'units': 2, - 'input_shape': (3, 5), - 'reset_after': True - }), - ('lstm', keras.layers.LSTM, { - 'units': 2, - 'input_shape': (3, 5) - }), - ('cudnngru', keras.layers.CuDNNGRU, { - 'units': 2, - 'input_shape': (3, 5) - }), - ('cudnnlstm', keras.layers.CuDNNLSTM, { - 'units': 2, - 'input_shape': (3, 5) - })) - def test_preprocess_weights_for_loading_rnn_should_be_idempotent( - self, layer_class, layer_args): - with self.cached_session(): - layer = layer_class(**layer_args) - layer.build(input_shape=layer_args.get('input_shape')) - weights1 = layer.get_weights() - weights2 = hdf5_format.preprocess_weights_for_loading( - layer, weights1) - _ = [ - self.assertAllClose(x, y, rtol=1e-05) - for (x, y) in zip(weights1, weights2) - ] - - def test_sequential_weight_loading(self): - if h5py is None: - return - - h5_path = self._save_model_dir('test.h5') - - num_hidden = 5 - input_dim = 3 - batch_size = 5 - num_classes = 2 - - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(num_hidden, input_dim=input_dim)) - model.add(keras.layers.Dense(num_classes)) - - x = np.random.random((batch_size, input_dim)) - ref_y = model.predict(x) - - model.save_weights(h5_path) - - model = keras.models.Sequential() - model.add(keras.layers.Dense(num_hidden, input_dim=input_dim)) - model.add(keras.layers.Dense(num_classes)) - model.load_weights(h5_path) - y = model.predict(x) - - self.assertAllClose(y, ref_y) - - @test_combinations.run_with_all_saved_model_formats( - exclude_formats=['tf_no_traces']) - def test_nested_model_weight_loading(self): - save_format = test_utils.get_save_format() - saved_model_dir = self._save_model_dir() - - batch_size = 5 - shape = (None, None, 3) - - with self.cached_session(): - def gen_model(): - - def seq_model(): - model = keras.models.Sequential([ - keras.layers.Conv2D(3, 1, input_shape=shape), - keras.layers.BatchNormalization()]) - return model - - x = inner_inputs = keras.layers.Input((None, None, 3)) - x = seq_model()(x) - x = seq_model()(x) - inner_model = keras.models.Model(inner_inputs, x) - - inputs = keras.layers.Input(shape) - return keras.models.Model(inputs, inner_model(inputs)) - - model = gen_model() - x = np.random.random((batch_size, 1, 1, 3)) - ref_y = model.predict(x) - - model.save_weights(saved_model_dir, 
save_format=save_format) - - model = gen_model() - model.load_weights(saved_model_dir) - y = model.predict(x) - - self.assertAllClose(y, ref_y) - - def test_sequential_weight_loading_group_name_with_incorrect_length(self): - if h5py is None: - return - - h5_path = self._save_model_dir('test.h5') - - num_hidden = 5 - input_dim = 3 - num_classes = 2 - with self.cached_session(): - ref_model = keras.models.Sequential() - ref_model.add(keras.layers.Dense(num_hidden, input_dim=input_dim, - name='d1')) - ref_model.add(keras.layers.Dense(num_classes, name='d2')) - ref_model.compile(loss=keras.losses.MSE, - optimizer='rmsprop', - metrics=[keras.metrics.categorical_accuracy]) - - f_ref_model = h5py.File(h5_path, 'w') - hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model) - - f_model = h5py.File(h5_path, 'r') - model = keras.models.Sequential() - model.add(keras.layers.Dense(num_hidden, use_bias=False, - input_dim=input_dim, name='d1')) - model.add(keras.layers.Dense(num_classes, name='d2')) - model.compile(loss=keras.losses.MSE, - optimizer='rmsprop', - metrics=[keras.metrics.categorical_accuracy]) - with self.assertRaises( - ValueError, - msg='Weight count mismatch for layer #0 (named d1). ' - 'Layer expects 1 weight(s). Received 2 saved weight(s)'): - hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model) - - hdf5_format.load_weights_from_hdf5_group_by_name( - f_model, model, skip_mismatch=True) - self.assertAllClose(keras.backend.get_value(ref_model.layers[1].kernel), - keras.backend.get_value(model.layers[1].kernel)) - - def test_sequential_weight_loading_group_name_with_incorrect_shape(self): - if h5py is None: - return - - h5_path = self._save_model_dir('test.h5') - - num_hidden = 5 - input_dim = 3 - num_classes = 2 - with tf.Graph().as_default(), self.cached_session(): - ref_model = keras.models.Sequential() - ref_model.add(keras.layers.Dense(num_hidden, input_dim=input_dim, - name='d1')) - ref_model.add(keras.layers.Dense(num_classes, name='d2')) - ref_model.compile(loss=keras.losses.MSE, - optimizer=optimizer_v1.RMSprop(lr=0.0001), - metrics=[keras.metrics.categorical_accuracy]) - - f_ref_model = h5py.File(h5_path, 'w') - keras.backend.set_value(ref_model.layers[1].bias, [3.5] * num_classes) - hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model) - - f_model = h5py.File(h5_path, 'r') - model = keras.models.Sequential() - model.add(keras.layers.Dense(num_hidden + 5, input_dim=input_dim, - name='d1')) - model.add(keras.layers.Dense(num_classes, name='d2')) - model.compile(loss=keras.losses.MSE, - optimizer=optimizer_v1.RMSprop(lr=0.0001), - metrics=[keras.metrics.categorical_accuracy]) - with self.assertRaises( - ValueError, - msg='Shape mismatch in layer #0 (named d1) for weight d1_1/kernel:0. ' - 'Weight expects shape (3, 10). ' - 'Received saved weight with shape (3, 5)'): - hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model) - - hdf5_format.load_weights_from_hdf5_group_by_name( - f_model, model, skip_mismatch=True) - self.assertAllClose([3.5] * num_classes, - keras.backend.get_value(model.layers[1].bias)) - - @test_combinations.run_with_all_saved_model_formats( - exclude_formats=['tf_no_traces']) - @test_combinations.run_with_all_model_types - def test_load_weights_from_saved_model(self): - save_path = self._save_model_dir() - save_format = test_utils.get_save_format() - - if save_format == 'h5' and test_utils.get_model_type() == 'subclass': - # TODO(b/173646281): HDF5 format currently does not allow saving - # subclassed models. 
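A hedged sketch of the limitation that TODO refers to (the model class and paths here are hypothetical stand-ins): a subclassed model carries no serializable layer-graph config, so the HDF5 format rejects it while the TF SavedModel format works.

```
import numpy as np
from tensorflow import keras

class MySubclassedModel(keras.Model):  # hypothetical stand-in model
    def __init__(self):
        super().__init__()
        self.dense = keras.layers.Dense(2)

    def call(self, inputs):
        return self.dense(inputs)

model = MySubclassedModel()
model.predict(np.zeros((1, 3)))                  # build the variables first
model.save('/tmp/model_dir', save_format='tf')   # OK: TF SavedModel format
# model.save('/tmp/model.h5', save_format='h5')  # raises: subclassed models
#                                                # have no get_config()
```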
- return - - with self.cached_session(): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - data = np.random.random((1, 3)) - labels = np.random.random((1, 4)) - model.compile(loss='mse', optimizer='rmsprop') - model.fit(data, labels) - model.save(save_path, save_format=save_format) - new_model = test_utils.get_small_mlp(1, 4, input_dim=3) - if test_utils.get_model_type() == 'subclass': - # Call on test data to build the model. - new_model.predict(data) - new_model.load_weights(save_path) - self.assertAllClose(model.weights, new_model.weights) - - -class SubclassedModel(training.Model): - - def __init__(self): - super().__init__() - self.x_layer = keras.layers.Dense(3) - self.b_layer = keras.layers.Dense(1) - - def call(self, a): - return self.b_layer(self.x_layer(a)) - - -class TestWeightSavingAndLoadingTFFormat(tf.test.TestCase, parameterized.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_tensorflow_format_overwrite(self): - with self.cached_session() as session: - model = SubclassedModel() - temp_dir = self.get_temp_dir() - prefix = os.path.join(temp_dir, 'ckpt') - - x = tf.constant(np.random.random((3, 2)), dtype=tf.float32) - executing_eagerly = tf.executing_eagerly() - model(x) # pylint: disable=not-callable - if not executing_eagerly: - session.run([v.initializer for v in model.variables]) - model.save_weights(prefix, save_format='tensorflow') - model.save_weights(prefix, save_format='tensorflow', overwrite=True) - with self.assertRaises(EOFError): - # Indirectly tests that the user is prompted - model.save_weights(prefix, save_format='tensorflow', overwrite=False) - - def test_no_default_session(self): - with tf.Graph().as_default(): - self.assertFalse(tf.compat.v1.get_default_session()) - data = np.random.random((1000, 32)).astype(np.float32) - labels = np.random.random((1000, 10)).astype(np.float32) - - model = keras.models.Sequential([ - keras.layers.Dense(10, activation='softmax'), - keras.layers.Dense(10, activation='softmax')]) - - model.compile(optimizer=tf.compat.v1.train.RMSPropOptimizer(0.001), - loss='categorical_crossentropy', - metrics=['accuracy']) - - model.fit(data, labels) - fname = os.path.join(self.get_temp_dir(), 'weights', 'ckpt') - model.save_weights(fname) - model.load_weights(fname) - - def test_no_graph_pollution(self): - with tf.compat.v1.get_default_graph().as_default(): - graph = tf.Graph() - with graph.as_default(), self.session(graph) as session: - model = SubclassedModel() - temp_dir = self.get_temp_dir() - prefix = os.path.join(temp_dir, 'ckpt') - - x = tf.constant(np.random.random((3, 2)), dtype=tf.float32) - model(x) # pylint: disable=not-callable - session.run([v.initializer for v in model.variables]) - model.save_weights(prefix, save_format='tensorflow') - op_count = len(graph.get_operations()) - model.save_weights(prefix, save_format='tensorflow') - self.assertLen(graph.get_operations(), op_count) - - model.load_weights(prefix) - op_count = len(graph.get_operations()) - model.load_weights(prefix) - self.assertLen(graph.get_operations(), op_count) - - def _weight_loading_test_template(self, make_model_fn): - with self.cached_session(): - model = make_model_fn() - model.compile( - loss='mse', - optimizer=tf.compat.v1.train.RMSPropOptimizer(0.1), - metrics=['acc', keras.metrics.CategoricalAccuracy()]) - temp_dir = self.get_temp_dir() - prefix = os.path.join(temp_dir, 'ckpt') - train_x = np.random.random((3, 2)) - train_y = np.random.random((3,)) - x = tf.constant(train_x, 
dtype=tf.float32) - - model.train_on_batch(train_x, train_y) - model.save_weights(prefix, save_format='tf') - ref_y_before_train = model.predict(train_x) - model.train_on_batch(train_x, train_y) - ref_y_after_train = model.predict(train_x) - for v in model.variables: - self.evaluate( - v.assign(tf.random.normal(shape=tf.shape(v)))) - - self.addCleanup(shutil.rmtree, temp_dir) - - model.load_weights(prefix) - self.assertAllClose(ref_y_before_train, self.evaluate(model(x))) - - # Test restore-on-create if this is a subclassed Model (graph Networks - # will have already created their variables). - load_model = make_model_fn() - load_model.load_weights(prefix) - self.assertAllClose( - ref_y_before_train, - self.evaluate(load_model(x))) - load_model = make_model_fn() - load_model.load_weights(prefix) - # We need to run some of the restore ops for predict(), but not all - # variables have been created yet (optimizer slot variables). Tests - # incremental restore. - load_model.predict(train_x) - load_model.compile( - loss='mse', - optimizer=tf.compat.v1.train.RMSPropOptimizer(0.1), - metrics=['acc', keras.metrics.CategoricalAccuracy()]) - load_model.train_on_batch(train_x, train_y) - self.assertAllClose(ref_y_after_train, self.evaluate(load_model(x))) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_weight_loading_graph_model(self): - def _make_graph_model(): - a = keras.layers.Input(shape=(2,)) - x = keras.layers.Dense(3)(a) - b = keras.layers.Dense(1)(x) - return keras.models.Model(a, b) - - self._weight_loading_test_template(_make_graph_model) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_weight_loading_subclassed_model(self): - self._weight_loading_test_template(SubclassedModel) - - def _new_layer_weight_loading_test_template( - self, first_model_fn, second_model_fn): - with self.cached_session() as session: - model = first_model_fn() - temp_dir = self.get_temp_dir() - prefix = os.path.join(temp_dir, 'ckpt') - - x = tf.constant(np.random.random((3, 2)), dtype=tf.float32) - executing_eagerly = tf.executing_eagerly() - ref_y_tensor = model(x) - if not executing_eagerly: - session.run([v.initializer for v in model.variables]) - ref_y = self.evaluate(ref_y_tensor) - model.save_weights(prefix) - self.assertEqual( - prefix, - tf.train.latest_checkpoint(temp_dir)) - for v in model.variables: - self.evaluate( - v.assign(tf.random.normal(shape=tf.shape(v)))) - - self.addCleanup(shutil.rmtree, temp_dir) - - second_model = second_model_fn() - status = second_model.load_weights(prefix) - second_model(x) - status.run_restore_ops() - second_model.save_weights(prefix) - # Check that the second model's checkpoint loads into the original model - status = model.load_weights(prefix) - status.run_restore_ops(session) - y = self.evaluate(model(x)) - self.assertAllClose(ref_y, y) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_weight_loading_graph_model_added_layer(self): - def _save_graph_model(): - a = keras.layers.Input(shape=(2,)) - x = keras.layers.Dense(3, name='first')(a) - b = keras.layers.Dense(1, name='second')(x) - return keras.models.Model(a, b) - def _restore_graph_model(): - a = keras.layers.Input(shape=(2,)) - x = keras.layers.Dense(3, name='first')(a) - y = keras.layers.Dense(1, name='second')(x) - b = keras.layers.Dense(3, name='secondjr')(y) - return keras.models.Model(a, b) - - self._new_layer_weight_loading_test_template( - _save_graph_model, 
_restore_graph_model) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_weight_loading_graph_model_added_no_weight_layer(self): - def _save_graph_model(): - a = keras.layers.Input(shape=(2,)) - x = keras.layers.Dense(3, name='first')(a) - b = keras.layers.Dense(1, name='second')(x) - return keras.models.Model(a, b) - def _restore_graph_model(): - a = keras.layers.Input(shape=(2,)) - x = keras.layers.Dense(3, name='first')(a) - b = keras.layers.Dense(1, name='second')(x) - y = keras.layers.Dropout(rate=0.1)(b) - return keras.models.Model(a, y) - - self._new_layer_weight_loading_test_template( - _save_graph_model, _restore_graph_model) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_weight_loading_subclassed_model_added_layer(self): - - class SubclassedModelRestore(training.Model): - - def __init__(self): - super().__init__() - self.x_layer = keras.layers.Dense(3) - self.y_layer = keras.layers.Dense(3) - self.b_layer = keras.layers.Dense(1) - - def call(self, a): - return self.b_layer(self.y_layer(self.x_layer(a))) - - self._new_layer_weight_loading_test_template( - SubclassedModel, SubclassedModelRestore) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_incompatible_checkpoint(self): - save_path = tf.train.Checkpoint().save( - os.path.join(self.get_temp_dir(), 'ckpt')) - m = DummySubclassModel() - with self.assertRaisesRegex(AssertionError, 'Nothing to load'): - m.load_weights(save_path) - m.dense = keras.layers.Dense(2) - m.dense(tf.constant([[1.]])) - with self.assertRaisesRegex(AssertionError, - 'Nothing except the root object matched'): - m.load_weights(save_path) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_directory_passed(self): - with self.cached_session(): - m = DummySubclassModel() - v = m.add_weight(name='v', shape=[]) - self.evaluate(v.assign(42.)) - prefix = os.path.join(self.get_temp_dir(), str(uuid.uuid4()), 'ckpt/') - m.save_weights(prefix) - self.evaluate(v.assign(2.)) - m.load_weights(prefix) - self.assertEqual(42., self.evaluate(v)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_relative_path(self): - with self.cached_session(): - m = DummySubclassModel() - v = m.add_weight(name='v', shape=[]) - os.chdir(self.get_temp_dir()) - - prefix = 'ackpt' - self.evaluate(v.assign(42.)) - m.save_weights(prefix) - self.assertTrue(tf.io.gfile.exists('ackpt.index')) - self.evaluate(v.assign(1.)) - m.load_weights(prefix) - self.assertEqual(42., self.evaluate(v)) - - prefix = 'subdir/ackpt' - self.evaluate(v.assign(43.)) - m.save_weights(prefix) - self.assertTrue(tf.io.gfile.exists('subdir/ackpt.index')) - self.evaluate(v.assign(2.)) - m.load_weights(prefix) - self.assertEqual(43., self.evaluate(v)) - - prefix = 'ackpt/' - self.evaluate(v.assign(44.)) - m.save_weights(prefix) - self.assertTrue(tf.io.gfile.exists('ackpt/.index')) - self.evaluate(v.assign(3.)) - m.load_weights(prefix) - self.assertEqual(44., self.evaluate(v)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_nonexistent_prefix_directory(self): - with self.cached_session(): - m = DummySubclassModel() - v = m.add_weight(name='v', shape=[]) - self.evaluate(v.assign(42.)) - prefix = os.path.join(self.get_temp_dir(), str(uuid.uuid4()), 'bckpt') - m.save_weights(prefix) - self.evaluate(v.assign(2.)) - m.load_weights(prefix) - 
self.assertEqual(42., self.evaluate(v)) - - -class DummySubclassModel(training.Model): - pass - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/saved_model/base_serialization.py b/keras/saving/saved_model/base_serialization.py deleted file mode 100644 index 97b7c67ae8c1..000000000000 --- a/keras/saving/saved_model/base_serialization.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Helper classes that list and validate all attributes to serialize to SavedModel.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import abc - -from keras.saving.saved_model import json_utils -from keras.saving.saved_model import utils - - -class SavedModelSaver(object, metaclass=abc.ABCMeta): - """Saver defining the methods and properties used to serialize Keras objects. - """ - - def __init__(self, obj): - self.obj = obj - - @abc.abstractproperty - def object_identifier(self): - """String stored in object identifier field in the SavedModel proto. - - Returns: - A string with the object identifier, which is used at load time. - """ - raise NotImplementedError - - @property - def tracking_metadata(self): - """String stored in metadata field in the SavedModel proto. - - Returns: - A serialized JSON storing information necessary for recreating this layer. - """ - # TODO(kathywu): check that serialized JSON can be loaded (e.g., if an - # object is in the python property) - return json_utils.Encoder().encode(self.python_properties) - - def trackable_children(self, serialization_cache): - """Lists all Trackable children connected to this object.""" - if not utils.should_save_traces(): - return {} - - children = self.objects_to_serialize(serialization_cache) - children.update(self.functions_to_serialize(serialization_cache)) - return children - - @abc.abstractproperty - def python_properties(self): - """Returns dictionary of python properties to save in the metadata. - - This dictionary must be serializable and deserializable to/from JSON. - - When loading, the items in this dict are used to initialize the object and - define attributes in the revived object. - """ - raise NotImplementedError - - @abc.abstractmethod - def objects_to_serialize(self, serialization_cache): - """Returns dictionary of extra checkpointable objects to serialize. - - See `functions_to_serialize` for an explanation of this function's - effects. - - Args: - serialization_cache: Dictionary passed to all objects in the same object - graph during serialization. - - Returns: - A dictionary mapping attribute names to checkpointable objects. - """ - raise NotImplementedError - - @abc.abstractmethod - def functions_to_serialize(self, serialization_cache): - """Returns extra functions to include when serializing a Keras object.
- - Normally, when exporting an object to SavedModel, only the - functions and objects defined by the user are saved. For example: - - ``` - obj = tf.Module() - obj.v = tf.Variable(1.) - - @tf.function - def foo(...): ... - - obj.foo = foo - - w = tf.Variable(1.) - - tf.saved_model.save(obj, 'path/to/saved/model') - loaded = tf.saved_model.load('path/to/saved/model') - - loaded.v # Variable with the same value as obj.v - loaded.foo # Equivalent to obj.foo - loaded.w # AttributeError - ``` - - Assigning trackable objects to attributes creates a graph, which is used for - both checkpointing and SavedModel serialization. - - When the graph generated from attribute tracking is insufficient, extra - objects and functions may be added at serialization time. For example, - most models do not have their call function wrapped with a @tf.function - decorator. This results in `model.call` not being saved. Since Keras objects - should be revivable from the SavedModel format, the call function is added - as an extra function to serialize. - - This function and `objects_to_serialize` are called multiple times when - exporting to SavedModel. Please use the cache to avoid generating new - functions and objects. A fresh cache is created for each SavedModel export. - - Args: - serialization_cache: Dictionary passed to all objects in the same object - graph during serialization. - - Returns: - A dictionary mapping attribute names to `Function` or - `ConcreteFunction`. - """ - raise NotImplementedError diff --git a/keras/saving/saved_model/create_test_saved_model.py b/keras/saving/saved_model/create_test_saved_model.py deleted file mode 100644 index 832da70ac1b1..000000000000 --- a/keras/saving/saved_model/create_test_saved_model.py +++ /dev/null @@ -1,36 +0,0 @@ -"""A binary that creates a serialized SavedModel from a Keras model. - -This is used in tests to ensure that model serialization is deterministic across -different processes.
-""" - -from absl import app -from absl import flags -from keras import regularizers -from keras.testing_infra import test_utils - -import tensorflow.compat.v2 as tf - -flags.DEFINE_string('output_path', '', 'The path to write the SavedModel at.') - -FLAGS = flags.FLAGS - - -def main(_) -> None: - with test_utils.model_type_scope('functional'): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.layers[-1].activity_regularizer = regularizers.get('l2') - model.activity_regularizer = regularizers.get('l2') - model.compile( - loss='mse', - optimizer='rmsprop') - def callable_loss(): - return tf.reduce_sum(model.weights[0]) - model.add_loss(callable_loss) - - print(f'_____Writing saved model to: {FLAGS.output_path}') - model.save(FLAGS.output_path) - - -if __name__ == '__main__': - app.run(main) diff --git a/keras/saving/saved_model/determinism_test.py b/keras/saving/saved_model/determinism_test.py deleted file mode 100755 index 9f9ee2e499a7..000000000000 --- a/keras/saving/saved_model/determinism_test.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Saves the same model twice and ensures that the two copies are serialized identically.""" - -import subprocess - -from absl import flags -import tensorflow.compat.v2 as tf - - -from tensorflow.core.protobuf import saved_model_pb2 - - -FLAGS = flags.FLAGS - - -class DeterminismTest(tf.test.TestCase): - - def test_saving_is_deterministic(self): - create_saved_model = f'{FLAGS.test_srcdir}/create_test_saved_model.par' - saved_model_a_path = f'{FLAGS.test_tmpdir}/a' - saved_model_b_path = f'{FLAGS.test_tmpdir}/b' - - save_a = subprocess.Popen( - [create_saved_model, '--output_path', saved_model_a_path]) - save_b = subprocess.Popen( - [create_saved_model, '--output_path', saved_model_b_path]) - save_a.wait() - save_b.wait() - saved_model_a = saved_model_pb2.SavedModel() - with tf.io.gfile.GFile(f'{saved_model_a_path}/saved_model.pb') as f: - saved_model_a.MergeFromString(f.read()) - saved_model_b = saved_model_pb2.SavedModel() - with tf.io.gfile.GFile(f'{saved_model_b_path}/saved_model.pb') as f: - saved_model_b.MergeFromString(f.read()) - - self.assertProtoEquals(saved_model_a, saved_model_b) diff --git a/keras/saving/saved_model/json_utils.py b/keras/saving/saved_model/json_utils.py deleted file mode 100644 index 7b81c2da26ce..000000000000 --- a/keras/saving/saved_model/json_utils.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utils for creating and loading the Layer metadata for SavedModel. - -These are required to retain the original format of the build input shape, since -layers and models may have different build behaviors depending on whether the shape -is a list, tuple, or TensorShape. For example, Network.build() will create -separate inputs if the given input_shape is a list, and will create a single -input if the given shape is a tuple.
-""" - -import tensorflow.compat.v2 as tf - -import collections -import functools -import enum -import json -import numpy as np -import wrapt - -from keras.utils import generic_utils - - -from tensorflow.python.framework import type_spec - - -_EXTENSION_TYPE_SPEC = '_EXTENSION_TYPE_SPEC' - - -class Encoder(json.JSONEncoder): - """JSON encoder and decoder that handles TensorShapes and tuples.""" - - def default(self, obj): # pylint: disable=method-hidden - """Encodes objects for types that aren't handled by the default encoder.""" - if isinstance(obj, tf.TensorShape): - items = obj.as_list() if obj.rank is not None else None - return {'class_name': 'TensorShape', 'items': items} - return get_json_type(obj) - - def encode(self, obj): - return super().encode(_encode_tuple(obj)) - - -def _encode_tuple(x): - if isinstance(x, tuple): - return {'class_name': '__tuple__', - 'items': tuple(_encode_tuple(i) for i in x)} - elif isinstance(x, list): - return [_encode_tuple(i) for i in x] - elif isinstance(x, dict): - return {key: _encode_tuple(value) for key, value in x.items()} - else: - return x - - -def decode(json_string): - return json.loads(json_string, object_hook=_decode_helper) - - -def decode_and_deserialize(json_string, module_objects=None, - custom_objects=None): - """Decodes the JSON and deserializes any Keras objects found in the dict.""" - return json.loads(json_string, - object_hook=functools.partial( - _decode_helper, - deserialize=True, - module_objects=module_objects, - custom_objects=custom_objects)) - - -def _decode_helper(obj, deserialize=False, module_objects=None, - custom_objects=None): - """A decoding helper that is TF-object aware. - - Args: - obj: A decoded dictionary that may represent an object. - deserialize: Boolean, defaults to False. When True, deserializes any Keras - objects found in `obj`. - module_objects: A dictionary of built-in objects to look the name up in. - Generally, `module_objects` is provided by midlevel library implementers. - custom_objects: A dictionary of custom objects to look the name up in. - Generally, `custom_objects` is provided by the end user. - - Returns: - The decoded object. - """ - if isinstance(obj, dict) and 'class_name' in obj: - if obj['class_name'] == 'TensorShape': - return tf.TensorShape(obj['items']) - elif obj['class_name'] == 'TypeSpec': - return type_spec.lookup(obj['type_spec'])._deserialize( # pylint: disable=protected-access - _decode_helper(obj['serialized'])) - elif obj['class_name'] == 'CompositeTensor': - spec = obj['spec'] - tensors = [] - for dtype, tensor in obj['tensors']: - tensors.append(tf.constant(tensor, dtype=tf.dtypes.as_dtype(dtype))) - return tf.nest.pack_sequence_as( - _decode_helper(spec), - tensors, - expand_composites=True) - elif obj['class_name'] == '__tuple__': - return tuple(_decode_helper(i) for i in obj['items']) - elif obj['class_name'] == '__ellipsis__': - return Ellipsis - elif deserialize and '__passive_serialization__' in obj: - # __passive_serialization__ is added by the JSON encoder when encoding - # an object that has a `get_config()` method. - try: - return generic_utils.deserialize_keras_object( - obj, - module_objects=module_objects, - custom_objects=custom_objects) - except ValueError: - pass - return obj - - -def get_json_type(obj): - """Serializes any object to a JSON-serializable structure. - - Args: - obj: the object to serialize - - Returns: - JSON-serializable structure representing `obj`. - - Raises: - TypeError: if `obj` cannot be serialized. 
- """ - # if obj is a serializable Keras class instance - # e.g. optimizer, layer - if hasattr(obj, 'get_config'): - serialized = generic_utils.serialize_keras_object(obj) - serialized['__passive_serialization__'] = True - return serialized - - # if obj is any numpy type - if type(obj).__module__ == np.__name__: - if isinstance(obj, np.ndarray): - return obj.tolist() - else: - return obj.item() - - # misc functions (e.g. loss function) - if callable(obj): - return obj.__name__ - - # if obj is a python 'type' - if type(obj).__name__ == type.__name__: - return obj.__name__ - - if isinstance(obj, tf.compat.v1.Dimension): - return obj.value - - if isinstance(obj, tf.TensorShape): - return obj.as_list() - - if isinstance(obj, tf.DType): - return obj.name - - if isinstance(obj, collections.abc.Mapping): - return dict(obj) - - if obj is Ellipsis: - return {'class_name': '__ellipsis__'} - - if isinstance(obj, wrapt.ObjectProxy): - return obj.__wrapped__ - - if isinstance(obj, tf.TypeSpec): - try: - type_spec_name = type_spec.get_name(type(obj)) - return {'class_name': 'TypeSpec', 'type_spec': type_spec_name, - 'serialized': obj._serialize()} # pylint: disable=protected-access - except ValueError: - raise ValueError( - f'Unable to serialize {obj} to JSON, because the TypeSpec ' - f'class {type(obj)} has not been registered.') - if isinstance(obj, tf.__internal__.CompositeTensor): - spec = tf.type_spec_from_value(obj) - tensors = [] - for tensor in tf.nest.flatten(obj, expand_composites=True): - tensors.append((tensor.dtype.name, tensor.numpy().tolist())) - return {'class_name': 'CompositeTensor', - 'spec': get_json_type(spec), - 'tensors': tensors} - - if isinstance(obj, enum.Enum): - return obj.value - - raise TypeError( - f'Unable to serialize {obj} to JSON. Unrecognized type {type(obj)}.') diff --git a/keras/saving/saved_model/json_utils_test.py b/keras/saving/saved_model/json_utils_test.py deleted file mode 100644 index 4f1e01447b9a..000000000000 --- a/keras/saving/saved_model/json_utils_test.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -# pylint: disable=protected-access -"""Tests the JSON encoder and decoder.""" - -import tensorflow.compat.v2 as tf - -import enum -from keras.saving.saved_model import json_utils -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils - - -class JsonUtilsTest(test_combinations.TestCase): - - def test_encode_decode_tensor_shape(self): - metadata = { - 'key1': tf.TensorShape(None), - 'key2': [tf.TensorShape([None]), - tf.TensorShape([3, None, 5])]} - string = json_utils.Encoder().encode(metadata) - loaded = json_utils.decode(string) - - self.assertEqual(set(loaded.keys()), {'key1', 'key2'}) - self.assertAllEqual(loaded['key1'].rank, None) - self.assertAllEqual(loaded['key2'][0].as_list(), [None]) - self.assertAllEqual(loaded['key2'][1].as_list(), [3, None, 5]) - - def test_encode_decode_tuple(self): - metadata = { - 'key1': (3, 5), - 'key2': [(1, (3, 4)), (1,)]} - string = json_utils.Encoder().encode(metadata) - loaded = json_utils.decode(string) - - self.assertEqual(set(loaded.keys()), {'key1', 'key2'}) - self.assertAllEqual(loaded['key1'], (3, 5)) - self.assertAllEqual(loaded['key2'], [(1, (3, 4)), (1,)]) - - def test_encode_decode_type_spec(self): - spec = tf.TensorSpec((1, 5), tf.float32) - string = json_utils.Encoder().encode(spec) - loaded = json_utils.decode(string) - self.assertEqual(spec, loaded) - - invalid_type_spec = {'class_name': 'TypeSpec', 'type_spec': 'Invalid Type', - 'serialized': None} - string = json_utils.Encoder().encode(invalid_type_spec) - with self.assertRaisesRegexp(ValueError, 'No TypeSpec has been registered'): - loaded = json_utils.decode(string) - - def test_encode_decode_enum(self): - class Enum(enum.Enum): - CLASS_A = 'a' - CLASS_B = 'b' - config = {'key': Enum.CLASS_A, 'key2': Enum.CLASS_B} - string = json_utils.Encoder().encode(config) - loaded = json_utils.decode(string) - self.assertAllEqual({'key': 'a', 'key2': 'b'}, loaded) - - @test_utils.run_v2_only - def test_encode_decode_ragged_tensor(self): - x = tf.ragged.constant([[1., 2.], [3.]]) - string = json_utils.Encoder().encode(x) - loaded = json_utils.decode(string) - self.assertAllEqual(loaded, x) - - @test_utils.run_v2_only - def test_encode_decode_extension_type_tensor(self): - class MaskedTensor(tf.experimental.ExtensionType): - __name__ = 'MaskedTensor' - values: tf.Tensor - mask: tf.Tensor - x = MaskedTensor(values=[[1, 2, 3], [4, 5, 6]], - mask=[[True, True, False], [True, False, True]]) - string = json_utils.Encoder().encode(x) - loaded = json_utils.decode(string) - self.assertAllEqual(loaded, x) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/saved_model/layer_serialization.py b/keras/saving/saved_model/layer_serialization.py deleted file mode 100644 index a4945c0b012e..000000000000 --- a/keras/saving/saved_model/layer_serialization.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Classes and functions implementing Layer SavedModel serialization.""" - -from keras.mixed_precision import policy -from keras.saving.saved_model import base_serialization -from keras.saving.saved_model import constants -from keras.saving.saved_model import save_impl -from keras.saving.saved_model import serialized_attributes -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - - -class LayerSavedModelSaver(base_serialization.SavedModelSaver): - """Implements Layer SavedModel serialization.""" - - @property - def object_identifier(self): - return constants.LAYER_IDENTIFIER - - @property - def python_properties(self): - # TODO(kathywu): Add python property validator - return self._python_properties_internal() - - def _python_properties_internal(self): - """Returns dictionary of all python properties.""" - # TODO(kathywu): Add support for metrics serialization. - # TODO(kathywu): Synchronize with the keras spec (go/keras-json-spec) once - # the python config serialization has caught up. - metadata = dict( - name=self.obj.name, - trainable=self.obj.trainable, - expects_training_arg=self.obj._expects_training_arg, # pylint: disable=protected-access - dtype=policy.serialize(self.obj._dtype_policy), # pylint: disable=protected-access - batch_input_shape=getattr(self.obj, '_batch_input_shape', None), - stateful=self.obj.stateful, - must_restore_from_config=self.obj._must_restore_from_config, # pylint: disable=protected-access - ) - - metadata.update(get_serialized(self.obj)) - if self.obj.input_spec is not None: - # Layer's input_spec has already been type-checked in the property setter. 
- metadata['input_spec'] = tf.nest.map_structure( - lambda x: generic_utils.serialize_keras_object(x) if x else None, - self.obj.input_spec) - if (self.obj.activity_regularizer is not None and - hasattr(self.obj.activity_regularizer, 'get_config')): - metadata['activity_regularizer'] = generic_utils.serialize_keras_object( - self.obj.activity_regularizer) - if self.obj._build_input_shape is not None: # pylint: disable=protected-access - metadata['build_input_shape'] = self.obj._build_input_shape # pylint: disable=protected-access - return metadata - - def objects_to_serialize(self, serialization_cache): - return (self._get_serialized_attributes( - serialization_cache).objects_to_serialize) - - def functions_to_serialize(self, serialization_cache): - return (self._get_serialized_attributes( - serialization_cache).functions_to_serialize) - - def _get_serialized_attributes(self, serialization_cache): - """Generates or retrieves serialized attributes from cache.""" - keras_cache = serialization_cache.setdefault(constants.KERAS_CACHE_KEY, {}) - if self.obj in keras_cache: - return keras_cache[self.obj] - - serialized_attr = keras_cache[self.obj] = ( - serialized_attributes.SerializedAttributes.new(self.obj)) - - if (save_impl.should_skip_serialization(self.obj) or - self.obj._must_restore_from_config): # pylint: disable=protected-access - return serialized_attr - - object_dict, function_dict = self._get_serialized_attributes_internal( - serialization_cache) - - serialized_attr.set_and_validate_objects(object_dict) - serialized_attr.set_and_validate_functions(function_dict) - return serialized_attr - - def _get_serialized_attributes_internal(self, serialization_cache): - """Returns dictionary of serialized attributes.""" - objects = save_impl.wrap_layer_objects(self.obj, serialization_cache) - functions = save_impl.wrap_layer_functions(self.obj, serialization_cache) - # Attribute validator requires that the default save signature is added to - # function dict, even if the value is None. - functions['_default_save_signature'] = None - return objects, functions - - -# TODO(kathywu): Move serialization utils (and related utils from -# generic_utils.py) to a separate file. -def get_serialized(obj): - with generic_utils.skip_failed_serialization(): - # Store the config dictionary, which may be used when reviving the object. - # When loading, the program will attempt to revive the object from config, - # and if that fails, the object will be revived from the SavedModel. 
-    return generic_utils.serialize_keras_object(obj)
-
-
-class InputLayerSavedModelSaver(base_serialization.SavedModelSaver):
-  """InputLayer serialization."""
-
-  @property
-  def object_identifier(self):
-    return constants.INPUT_LAYER_IDENTIFIER
-
-  @property
-  def python_properties(self):
-
-    return dict(
-        class_name=type(self.obj).__name__,
-        name=self.obj.name,
-        dtype=self.obj.dtype,
-        sparse=self.obj.sparse,
-        ragged=self.obj.ragged,
-        batch_input_shape=self.obj._batch_input_shape,  # pylint: disable=protected-access
-        config=self.obj.get_config())
-
-  def objects_to_serialize(self, serialization_cache):
-    return {}
-
-  def functions_to_serialize(self, serialization_cache):
-    return {}
-
-
-class RNNSavedModelSaver(LayerSavedModelSaver):
-  """RNN layer serialization."""
-
-  @property
-  def object_identifier(self):
-    return constants.RNN_LAYER_IDENTIFIER
-
-  def _get_serialized_attributes_internal(self, serialization_cache):
-    objects, functions = (
-        super()._get_serialized_attributes_internal(
-            serialization_cache))
-    states = tf.__internal__.tracking.wrap(self.obj.states)
-    # SavedModel requires all the objects to be Trackable when saving.
-    # If the states is still a tuple after wrap_or_unwrap, it means it doesn't
-    # contain any trackable item within it, e.g. an empty tuple or (None, None)
-    # for stateless ConvLSTM2D. We convert them to a list so that
-    # wrap_or_unwrap can make it a Trackable again for saving. When loaded,
-    # ConvLSTM2D is able to handle the tuple/list conversion.
-    if isinstance(states, tuple):
-      states = tf.__internal__.tracking.wrap(list(states))
-    objects['states'] = states
-    return objects, functions
-
-
-class VocabularySavedModelSaver(LayerSavedModelSaver):
-  """Handles vocabulary layer serialization.
-
-  This class is needed for StringLookup, IntegerLookup, and TextVectorization,
-  which all have a vocabulary as part of the config. Currently, we keep this
-  vocab as part of the config until saving, when we need to clear it to avoid
-  initializing a StaticHashTable twice (once when restoring the config and once
-  when restoring module resources). After clearing the vocab, we
-  persist a property to the layer indicating it was constructed with a vocab.
-  """
-
-  @property
-  def python_properties(self):
-    # TODO(kathywu): Add python property validator
-    metadata = self._python_properties_internal()
-    # Clear the vocabulary from the config during saving.
-    metadata['config']['vocabulary'] = None
-    # Persist a property to track that a vocabulary was passed on construction.
-    metadata['config']['has_input_vocabulary'] = self.obj._has_input_vocabulary  # pylint: disable=protected-access
-    return metadata
diff --git a/keras/saving/saved_model/load.py b/keras/saving/saved_model/load.py
deleted file mode 100644
index a36b5c3305fb..000000000000
--- a/keras/saving/saved_model/load.py
+++ /dev/null
@@ -1,1210 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================== -"""Keras SavedModel deserialization.""" - -import re -import types -import warnings - -from keras import backend -from keras import regularizers -from keras.engine import input_spec -from keras.optimizers.optimizer_v2 import optimizer_v2 -from keras.protobuf import saved_metadata_pb2 -from keras.protobuf import versions_pb2 -from keras.saving import saving_utils -from keras.saving.saved_model import constants -from keras.saving.saved_model import json_utils -from keras.saving.saved_model import utils -from keras.saving.saved_model.serialized_attributes import CommonEndpoints -from keras.utils import layer_utils -from keras.utils import generic_utils -from keras.utils import metrics_utils -from keras.utils import tf_inspect -from keras.utils.generic_utils import LazyLoader -import tensorflow.compat.v1.logging as logging -import tensorflow.compat.v2 as tf - -from google.protobuf import message - -# To avoid circular dependencies between keras/engine and keras/saving, -# code in keras/saving must delay imports. - -# TODO(b/134426265): Switch back to single-quotes to match the rest of the file -# once the issue with copybara is fixed. -# pylint:disable=g-inconsistent-quotes -models_lib = LazyLoader('models_lib', globals(), 'keras.models') -base_layer = LazyLoader('base_layer', globals(), 'keras.engine.base_layer') -layers_module = LazyLoader('layers_module', globals(), 'keras.layers') -input_layer = LazyLoader('input_layer', globals(), 'keras.engine.input_layer') -functional_lib = LazyLoader('functional_lib', globals(), - 'keras.engine.functional') -training_lib = LazyLoader('training_lib', globals(), 'keras.engine.training') -training_lib_v1 = LazyLoader('training_lib_v1', globals(), - 'keras.engine.training_v1') -metrics = LazyLoader('metrics', globals(), 'keras.metrics') -base_rnn = LazyLoader('base_rnn', globals(), 'keras.layers.rnn.base_rnn') -# pylint:enable=g-inconsistent-quotes - -PUBLIC_ATTRIBUTES = CommonEndpoints.all_functions.union( - CommonEndpoints.all_checkpointable_objects) -PUBLIC_ATTRIBUTES.add(constants.KERAS_ATTR) - - -def load(path, compile=True, options=None): # pylint: disable=redefined-builtin - """Loads Keras objects from a SavedModel. - - Any Keras layer or model saved to the SavedModel will be loaded back - as Keras objects. Other objects are loaded as regular trackable objects (same - as `tf.saved_model.load`). - - Currently, Keras saving/loading only retains the Keras object's weights, - losses, and call function. - - The loaded model can be re-compiled, but the original optimizer, compiled loss - functions, and metrics are not retained. This is temporary, and `model.save` - will soon be able to serialize compiled models. - - Args: - path: Path to SavedModel. - compile: If true, compile the model after loading it. - options: Optional `tf.saved_model.LoadOptions` object that specifies options - for loading from SavedModel. - - Returns: - Object loaded from SavedModel. - """ - # TODO(kathywu): Add saving/loading of optimizer, compiled losses and metrics. 
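The `LazyLoader` aliases above exist purely to break import cycles: the real module is imported only on first attribute access. A rough sketch of the idea (this `LazyModule` is illustrative, not the actual `keras.utils.generic_utils.LazyLoader` implementation):

import importlib

class LazyModule:
    """Defers the real import until an attribute is first accessed."""

    def __init__(self, module_name):
        self._module_name = module_name
        self._module = None

    def __getattr__(self, attr):
        # Only called when normal lookup fails, i.e. for the target's names.
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, attr)

json_mod = LazyModule('json')    # nothing imported yet
print(json_mod.dumps({'a': 1}))  # first access triggers the import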
- # TODO(kathywu): Add code to load from objects that contain all endpoints - - # Look for metadata file or parse the SavedModel - metadata = saved_metadata_pb2.SavedMetadata() - meta_graph_def = tf.__internal__.saved_model.parse_saved_model( - path).meta_graphs[0] - object_graph_def = meta_graph_def.object_graph_def - path_to_metadata_pb = tf.io.gfile.join(path, constants.SAVED_METADATA_PATH) - if tf.compat.v1.gfile.Exists(path_to_metadata_pb): - try: - with tf.io.gfile.GFile(path_to_metadata_pb, 'rb') as f: - file_content = f.read() - metadata.ParseFromString(file_content) - except message.DecodeError as e: - raise IOError( - f'Cannot parse keras metadata at path {path_to_metadata_pb}: ' - f'Received error: {e}') - else: - logging.warning('SavedModel saved prior to TF 2.5 detected when loading ' - 'Keras model. Please ensure that you are saving the model ' - 'with model.save() or tf.keras.models.save_model(), *NOT* ' - 'tf.saved_model.save(). To confirm, there should be a file ' - 'named "keras_metadata.pb" in the SavedModel directory.') - _read_legacy_metadata(object_graph_def, metadata, path) - - if not metadata.nodes: - # When there are no Keras objects, return the results from the core loader - return tf.saved_model.load(path, options=options) - - metadata = _update_to_current_version(metadata) - # Recreate layers and metrics using the info stored in the metadata. - keras_loader = KerasObjectLoader(metadata, object_graph_def) - keras_loader.load_layers(compile=compile) - - # Generate a dictionary of all loaded nodes. - nodes_to_load = {'root': None} - for node_id, loaded_node in keras_loader.loaded_nodes.items(): - nodes_to_load[keras_loader.get_path(node_id)] = loaded_node - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', message='Trying to load ShardedVariables') - loaded = tf.__internal__.saved_model.load_partial( - path, nodes_to_load, options=options) - - # Finalize the loaded layers and remove the extra tracked dependencies. - keras_loader.finalize_objects() - keras_loader.del_tracking() - - model = loaded['root'] - - # pylint: disable=protected-access - if isinstance(model, training_lib.Model) and compile: - # TODO(kathywu): Use compiled objects from SavedModel, instead of - # creating new objects from the training config. - training_config = model._serialized_attributes['metadata'].get( - 'training_config', None) - if training_config is not None: - model.compile( - **saving_utils.compile_args_from_training_config(training_config), - from_serialized=True) - saving_utils.try_build_compiled_arguments(model) - if isinstance(model.optimizer, optimizer_v2.OptimizerV2): - if model.optimizer.get_slot_names(): - logging.warning('Your optimizer uses slots. ' - 'Slots cannot be restored from saved_model, ' - 'as a result, your model is starting with ' - 'a new initialized optimizer.') - else: - logging.warning('No training configuration found in save file, so the ' - 'model was *not* compiled. Compile it manually.') - # pylint: enable=protected-access - - # Force variables and resources to initialize. - if not tf.executing_eagerly(): - sess = backend.get_session() # Variables are initialized by this call. 
-    sess.run(
-        tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TABLE_INITIALIZERS))
-
-  return model
-
-
-def _update_to_current_version(metadata):
-  """Applies version updates to the metadata proto for backwards compat."""
-  for node in metadata.nodes:
-    if node.version.producer == 1 and node.identifier in [
-        constants.MODEL_IDENTIFIER, constants.SEQUENTIAL_IDENTIFIER,
-        constants.NETWORK_IDENTIFIER
-    ]:
-      node_metadata = json_utils.decode(node.metadata)
-      save_spec = node_metadata.get('save_spec')
-
-      if save_spec is not None:
-        node_metadata['full_save_spec'] = ([save_spec], {})
-        node.metadata = json_utils.Encoder().encode(node_metadata)
-  return metadata
-
-
-def _read_legacy_metadata(object_graph_def, metadata, path):
-  """Builds a KerasMetadata proto from the SavedModel ObjectGraphDef."""
-  # Older SavedModels store the metadata directly in the proto instead of the
-  # separate pb file.
-  node_paths = _generate_object_paths(object_graph_def)
-  for node_id, proto in enumerate(object_graph_def.nodes):
-    if (proto.WhichOneof('kind') == 'user_object' and
-        proto.user_object.identifier in constants.KERAS_OBJECT_IDENTIFIERS):
-      if not proto.user_object.metadata:
-        raise ValueError(
-            f'Unable to create a Keras model from SavedModel at {path}. '
-            'This SavedModel was exported with `tf.saved_model.save`, and '
-            'lacks the Keras metadata file. Please save your Keras model by '
-            'calling `model.save` or `tf.keras.models.save_model`. Note that '
-            'you can still load this SavedModel with `tf.saved_model.load`.')
-      metadata.nodes.add(
-          node_id=node_id,
-          node_path=node_paths[node_id],
-          version=versions_pb2.VersionDef(
-              producer=1, min_consumer=1, bad_consumers=[]),
-          identifier=proto.user_object.identifier,
-          metadata=proto.user_object.metadata)
-
-
-def _generate_object_paths(object_graph_def):
-  """Traverses an ObjectGraphDef and builds a map of all node paths."""
-  paths = {0: 'root'}
-  nodes_to_visit = [0]
-
-  while nodes_to_visit:
-    current_node = nodes_to_visit.pop()
-    current_path = paths[current_node]
-    for reference in object_graph_def.nodes[current_node].children:
-      if reference.node_id in paths:
-        continue
-      paths[reference.node_id] = '{}.{}'.format(current_path,
-                                                reference.local_name)
-      nodes_to_visit.append(reference.node_id)
-
-  return paths
-
-
-def _is_graph_network(layer):
-  """Determines whether the layer is a graph network."""
-  # pylint: disable=protected-access
-  if isinstance(layer, RevivedNetwork):
-    return False
-  elif isinstance(layer, functional_lib.Functional):
-    return layer._is_graph_network or isinstance(layer, models_lib.Sequential)
-  return False
-
-
-class KerasObjectLoader:
-  """Loader that recreates Keras objects (e.g. layers, models).
-
-  Layers and models are revived from either the config or SavedModel following
-  these rules:
-  1. If object is a graph network (i.e. Sequential or Functional) then it will
-     be initialized using the structure from the config only after the children
-     layers have been created. Graph networks must be initialized with inputs
-     and outputs, so all child layers must be created beforehand.
-  2. If object's config exists and the class can be found, then revive from
-     config.
-  3. Object may have already been created if its parent was revived from config.
-     In this case, do nothing.
-  4. If none of the above applies, compose the various artifacts from the
-     SavedModel to create a subclassed layer or model. At this time, custom
-     metrics are not supported.
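`_generate_object_paths` above performs a simple worklist traversal over the ObjectGraphDef, assigning each reachable node a dotted path from the root. A toy version with a plain dict standing in for the proto (node ids and edge names are made up):

# children: node id -> [(edge name, child node id)]
children = {0: [('model', 1)], 1: [('layer-0', 2), ('layer-1', 3)], 2: [], 3: []}

paths = {0: 'root'}
nodes_to_visit = [0]
while nodes_to_visit:
    current = nodes_to_visit.pop()
    for name, child in children[current]:
        if child in paths:
            continue  # already reached via another edge
        paths[child] = '{}.{}'.format(paths[current], name)
        nodes_to_visit.append(child)

print(paths)
# {0: 'root', 1: 'root.model', 2: 'root.model.layer-0', 3: 'root.model.layer-1'}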
- - """ - - def __init__(self, metadata, object_graph_def): - self._metadata = {x.node_id: x for x in metadata.nodes} - self._proto = object_graph_def - - self._node_paths = { - node_data.node_id: node_data.node_path for node_data in metadata.nodes - } - self.loaded_nodes = {} # Maps node path -> loaded node - - # Store all node ids that have already been traversed when tracking nodes - # that were recreated from the config. - self._traversed_nodes_from_config = set() - - # Maps model id -> (blank model obj, list of child layer or their node ids) - # This tracks all layers in functional and sequential models. These models - # are only reconstructed after all of their child layers have been created. - self.model_layer_dependencies = {} - self._models_to_reconstruct = [] - - def del_tracking(self): - """Removes tracked references that are only used when loading the model.""" - # Now that the node object has been fully loaded, and the checkpoint has - # been restored, the object no longer needs to track objects added from - # SerializedAttributes. (Note that saving a training checkpoint still - # functions correctly, because layers and variables are tracked separately - # by the Layer object.) - # TODO(kathywu): Instead of outright deleting these nodes (which would - # make restoring from a different checkpoint tricky), mark them as extra - # dependencies that are OK to overwrite. - for node in self.loaded_nodes.values(): - node = node[0] - if not isinstance(node, base_layer.Layer): - # Loaded nodes can contain other trackable objects created when - # loading layers from the config, such as variables. - continue - for name in PUBLIC_ATTRIBUTES: - node._delete_tracking(name) # pylint: disable=protected-access - - if isinstance(node, functional_lib.Functional): - # Delete the temporary layer dependencies, which were used to restore - # the checkpointed values. When the model is live, the user can delete - # or add layers to the model at any time, so these layer dependencies - # may be obsolete. - dependencies = list(node._self_unconditional_dependency_names) # pylint: disable=protected-access - for name in dependencies: - if re.match(r'^layer(_with_weights)?-[\d+]', name) is not None: - node._delete_tracking(name) # pylint: disable=protected-access - - def _add_children_recreated_from_config(self, obj, proto, node_id): - """Recursively records objects recreated from config.""" - # pylint: disable=protected-access - if node_id in self._traversed_nodes_from_config: - return - - parent_path = self._node_paths[node_id] - self._traversed_nodes_from_config.add(node_id) - obj._maybe_initialize_trackable() - if isinstance(obj, base_layer.Layer) and not obj.built: - metadata = json_utils.decode(self._metadata[node_id].metadata) - self._try_build_layer(obj, node_id, metadata.get('build_input_shape')) - - # Create list of all possible children - children = [] - # Look for direct children - for reference in proto.children: - obj_child = obj._lookup_dependency(reference.local_name) - children.append((obj_child, reference.node_id, reference.local_name)) - - # Add metrics that may have been added to the layer._metrics list. - # This is stored in the SavedModel as layer.keras_api.layer_metrics in - # SavedModels created after Tf 2.2. 
- metric_list_node_id = self._search_for_child_node( - node_id, [constants.KERAS_ATTR, 'layer_metrics']) - if metric_list_node_id is not None and hasattr(obj, '_metrics'): - obj_metrics = {m.name: m for m in obj._metrics} - for reference in self._proto.nodes[metric_list_node_id].children: - metric = obj_metrics.get(reference.local_name) - if metric is not None: - metric_path = '{}.layer_metrics.{}'.format(constants.KERAS_ATTR, - reference.local_name) - children.append((metric, reference.node_id, metric_path)) - - for (obj_child, child_id, child_name) in children: - child_proto = self._proto.nodes[child_id] - - if not isinstance(obj_child, tf.__internal__.tracking.Trackable): - continue - if (child_proto.user_object.identifier - in tf.__internal__.saved_model.load.registered_identifiers()): - setter = tf.__internal__.saved_model.load.get_setter( - child_proto.user_object) - elif obj_child._object_identifier in constants.KERAS_OBJECT_IDENTIFIERS: - setter = _revive_setter - else: - setter = setattr - # pylint: enable=protected-access - - if child_id in self.loaded_nodes: - if self.loaded_nodes[child_id][0] is not obj_child: - # This means that the same trackable object is referenced by two - # different objects that were recreated from the config. - logging.warning( - 'Looks like there is an object (perhaps variable or ' - 'layer) that is shared between different layers/models. ' - 'This may cause issues when restoring the variable ' - 'values. Object: {}'.format(obj_child)) - continue - - # Overwrite variable names with the ones saved in the SavedModel. - if (child_proto.WhichOneof('kind') == 'variable' and - child_proto.variable.name): - obj_child._handle_name = child_proto.variable.name + ':0' # pylint: disable=protected-access - - if isinstance(obj_child, tf.__internal__.tracking.TrackableDataStructure): - setter = lambda *args: None - - child_path = '{}.{}'.format(parent_path, child_name) - self._node_paths[child_id] = child_path - self._add_children_recreated_from_config(obj_child, child_proto, child_id) - self.loaded_nodes[child_id] = obj_child, setter - - def load_layers(self, compile=True): # pylint: disable=redefined-builtin - """Load all layer nodes from the metadata.""" - # Load metrics after models and layers, since it's likely that models - # and layers will create the metric when initialized (this avoids wasting - # time by creating objects multiple times). - metric_list = [] - for node_metadata in self._metadata.values(): - if node_metadata.identifier == constants.METRIC_IDENTIFIER: - metric_list.append(node_metadata) - continue - - self.loaded_nodes[node_metadata.node_id] = self._load_layer( - node_metadata.node_id, node_metadata.identifier, - node_metadata.metadata) - - for node_metadata in metric_list: - try: - self.loaded_nodes[node_metadata.node_id] = self._load_layer( - node_metadata.node_id, node_metadata.identifier, - node_metadata.metadata) - except ValueError as e: - # Metrics are only needed when the model is compiled later. We ignore - # errors when trying to load custom metrics when `compile=False` until - # custom metrics are serialized properly (b/135550038). - if compile: - raise e - logging.warning('Unable to restore custom metric. Please ensure that ' - 'the layer implements `get_config` and `from_config` ' - 'when saving. 
In addition, please use the ' - '`custom_objects` arg when calling `load_model()`.') - - def _load_layer(self, node_id, identifier, metadata): - """Load a single layer from a SavedUserObject proto.""" - metadata = json_utils.decode(metadata) - - # If node was already created - if node_id in self.loaded_nodes: - node, setter = self.loaded_nodes[node_id] - - # Revive setter requires the object to have a `_serialized_attributes` - # property. Add it here. - _maybe_add_serialized_attributes(node, metadata) - - config = metadata.get('config') - if _is_graph_network(node) and generic_utils.validate_config(config): - child_nodes = self._get_child_layer_node_ids(node_id) - self.model_layer_dependencies[node_id] = (node, child_nodes) - if not child_nodes: - self._models_to_reconstruct.append(node_id) - return node, setter - - # Detect whether this object can be revived from the config. If not, then - # revive from the SavedModel instead. - obj, setter = self._revive_from_config(identifier, metadata, node_id) - if obj is None: - obj, setter = revive_custom_object(identifier, metadata) - - # Add an attribute that stores the extra functions/objects saved in the - # SavedModel. Most of these functions/objects are ignored, but some are - # used later in the loading process (e.g. the list of regularization - # losses, or the training config of compiled models). - _maybe_add_serialized_attributes(obj, metadata) - return obj, setter - - def _revive_from_config(self, identifier, metadata, node_id): - """Revives a layer/model from config, or returns None.""" - if identifier == constants.METRIC_IDENTIFIER: - obj = self._revive_metric_from_config(metadata) - else: - obj = ( - self._revive_graph_network(identifier, metadata, node_id) or - self._revive_layer_or_model_from_config(metadata, node_id)) - - if obj is None: - return None, None - - setter = self._config_node_setter(_revive_setter) - self._add_children_recreated_from_config(obj, self._proto.nodes[node_id], - node_id) - return obj, setter - - def _revive_graph_network(self, identifier, metadata, node_id): - """Revives a graph network from config.""" - # Determine whether the metadata contains information for reviving a - # functional or Sequential model. - config = metadata.get('config') - if not generic_utils.validate_config(config): - return None - - class_name = tf.compat.as_str(metadata['class_name']) - if generic_utils.get_registered_object(class_name) is not None: - return None - model_is_functional_or_sequential = ( - metadata.get('is_graph_network', False) or class_name == 'Sequential' or - class_name == 'Functional') - if not model_is_functional_or_sequential: - return None - - # Revive functional and sequential models as blank model objects for now ( - # must be initialized to enable setattr tracking and attribute caching). - # Reconstruction of the network is deferred until all of the model's layers - # have been revived. - if class_name == 'Sequential': - model = models_lib.Sequential(name=config['name']) - # The model is a custom Sequential model. - elif identifier == constants.SEQUENTIAL_IDENTIFIER: - # Uses the custom class name, since the config does not have one. - model = models_lib.Sequential(name=class_name) - else: - model = models_lib.Functional(inputs=[], outputs=[], name=config['name']) - - # Record this model and its layers. This will later be used to reconstruct - # the model. 
- layers = self._get_child_layer_node_ids(node_id) - self.model_layer_dependencies[node_id] = (model, layers) - if not layers: - self._models_to_reconstruct.append(node_id) - return model - - def _revive_layer_or_model_from_config(self, metadata, node_id): - """Revives a layer/custom model from config; returns None if infeasible.""" - # Check that the following requirements are met for reviving from config: - # 1. Object can be deserialized from config. - # 2. If the object needs to be built, then the build input shape can be - # found. - class_name = metadata.get('class_name') - config = metadata.get('config') - shared_object_id = metadata.get('shared_object_id') - must_restore_from_config = metadata.get('must_restore_from_config') - if not generic_utils.validate_config(config): - return None - - try: - obj = layers_module.deserialize( - generic_utils.serialize_keras_class_and_config( - class_name, config, shared_object_id=shared_object_id)) - except (TypeError, KeyError) as e: - # A name conflict has occurred. The `class_name` is in the Keras native - # framework; however, the value in the framework is different from the - # user's class definition which confuses the KerasObjectLoader. - builtin_layer = layers_module.get_builtin_layer(class_name) - if builtin_layer: - raise RuntimeError( - f'Unable to restore object of class \'{class_name}\' likely due to ' - f'name conflict with built-in Keras class \'{builtin_layer}\'. To ' - 'override the built-in Keras definition of the object, decorate ' - 'your class with `@keras.utils.register_keras_serializable` and ' - 'include that file in your program, or pass your class in a ' - '`keras.utils.CustomObjectScope` that wraps this load call.') from e - else: - raise - except ValueError as e: - if must_restore_from_config: - raise e - else: - return None - - # Use the dtype, name, and trainable status. Often times these are not - # specified in custom configs, so retrieve their values from the metadata. - # pylint: disable=protected-access - obj._name = metadata['name'] - if metadata.get('trainable') is not None: - obj.trainable = metadata['trainable'] - if metadata.get('dtype') is not None: - obj._set_dtype_policy(metadata['dtype']) - if metadata.get('stateful') is not None: - obj.stateful = metadata['stateful'] - # Restore model save spec for subclassed models. (layers do not store a - # SaveSpec) - if isinstance(obj, training_lib.Model): - full_save_spec = metadata.get('full_save_spec') - if full_save_spec is not None: - args_spec, kwargs_spec = full_save_spec - inputs_spec = args_spec.pop(0) - obj._set_save_spec(inputs_spec, args_spec, kwargs_spec) - # pylint: enable=protected-access - - build_input_shape = metadata.get('build_input_shape') - built = self._try_build_layer(obj, node_id, build_input_shape) - - if not built: - # If the layer cannot be built, revive a custom layer instead. 
- return None - return obj - - def _revive_metric_from_config(self, metadata): - """Revives a metric object using the config saved in the metadata.""" - class_name = tf.compat.as_str(metadata['class_name']) - config = metadata.get('config') - - if not generic_utils.validate_config(config): - return None - - try: - obj = metrics.deserialize( - generic_utils.serialize_keras_class_and_config(class_name, config)) - except ValueError: - return None - - build_input_shape = metadata.get('build_input_shape') - if build_input_shape is not None and hasattr(obj, '_build'): - obj._build(build_input_shape) # pylint: disable=protected-access - - return obj - - def _try_build_layer(self, obj, node_id, build_input_shape): - """Attempts to build the layer.""" - if obj.built or hasattr(obj.build, '_is_default'): - obj.built = True - return True - - if build_input_shape is None: - build_input_shape = self._infer_inputs(node_id, convert_to_shapes=True) - - if build_input_shape is not None: - obj.build(build_input_shape) - base_layer.Layer.build(obj, build_input_shape) - return True - - return False - - def get_path(self, node_id): - return self._node_paths[node_id] - - def finalize_objects(self): - """Finish setting up Keras objects. - - This function is executed after all objects and functions have been created. - Call functions and losses are attached to each layer, and once all layers - have been fully set up, graph networks are initialized. - - Subclassed models that are revived from the SavedModel are treated like - layers, and have their call/loss functions attached here. - """ - # Finish setting up layers and subclassed models. This step attaches call - # functions and losses to each object, and sets model inputs/outputs. - layers_revived_from_config = [] - layers_revived_from_saved_model = [] - for node_id, (node, _) in self.loaded_nodes.items(): - if (not isinstance(node, base_layer.Layer) or - # Don't finalize models until all layers have finished loading. - node_id in self.model_layer_dependencies): - continue - - self._unblock_model_reconstruction(node_id, node) - - if isinstance(node, input_layer.InputLayer): - continue - elif isinstance(node, metrics.Metric): - continue - - if isinstance(node, (RevivedLayer, RevivedInputLayer)): - layers_revived_from_saved_model.append(node) - else: - layers_revived_from_config.append(node) - - _finalize_saved_model_layers(layers_revived_from_saved_model) - _finalize_config_layers(layers_revived_from_config) - - # Initialize graph networks, now that layer dependencies have been resolved. - self._reconstruct_all_models() - - def _unblock_model_reconstruction(self, layer_id, layer): - """Removes layer from blocking model reconstruction.""" - for model_id, v in self.model_layer_dependencies.items(): - _, layers = v - if layer_id not in layers: - continue - layers[layers.index(layer_id)] = layer - if all(isinstance(x, base_layer.Layer) for x in layers): - self._models_to_reconstruct.append(model_id) - - def _reconstruct_all_models(self): - """Reconstructs the network structure of all models.""" - all_initialized_models = set() - while self._models_to_reconstruct: - model_id = self._models_to_reconstruct.pop(0) - all_initialized_models.add(model_id) - model, layers = self.model_layer_dependencies[model_id] - self._reconstruct_model(model_id, model, layers) - _finalize_config_layers([model]) - - if all_initialized_models != set(self.model_layer_dependencies.keys()): - # This should not happen. 
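`_unblock_model_reconstruction` above swaps node ids for real layer objects in each model's dependency list and queues the model for reconstruction once no ids remain. A toy version of that bookkeeping (the class and all names are illustrative only):

class Layer:  # stand-in for a revived Keras layer
    def __init__(self, name):
        self.name = name

model_deps = {'model': ('<blank model>', [10, 11])}  # model id -> (obj, children)
to_reconstruct = []

def unblock(layer_id, layer):
    for model_id, (_, slots) in model_deps.items():
        if layer_id in slots:
            # Replace the pending node id with the finished layer object.
            slots[slots.index(layer_id)] = layer
            if all(isinstance(s, Layer) for s in slots):
                to_reconstruct.append(model_id)

unblock(10, Layer('dense'))
unblock(11, Layer('softmax'))
print(to_reconstruct)  # ['model']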
- uninitialized_model_ids = ( - set(self.model_layer_dependencies.keys()) - all_initialized_models) - uninitialized_model_names = [ - self.model_layer_dependencies[model_id][0].name - for model_id in uninitialized_model_ids - ] - raise ValueError(f'Error loading model(s) in the SavedModel format. ' - f'The following model(s) could not be initialized: ' - f'{uninitialized_model_names}') - - def _reconstruct_model(self, model_id, model, layers): - """Reconstructs the network structure.""" - config = json_utils.decode(self._metadata[model_id].metadata)['config'] - - # Set up model inputs - if model.inputs: - # Inputs may already be created if the model is instantiated in another - # object's __init__. - pass - elif isinstance(model, models_lib.Sequential): - if not layers or not isinstance(layers[0], input_layer.InputLayer): - if config['layers'][0]['class_name'] == 'InputLayer': - layers.insert( - 0, - input_layer.InputLayer.from_config(config['layers'][0]['config'])) - elif 'batch_input_shape' in config['layers'][0]['config']: - batch_input_shape = config['layers'][0]['config']['batch_input_shape'] - layers.insert( - 0, - input_layer.InputLayer( - input_shape=batch_input_shape[1:], - batch_size=batch_input_shape[0], - dtype=layers[0].dtype, - name=layers[0].name + '_input')) - model.__init__(layers, name=config['name']) - if not model.inputs: - first_layer = self._get_child_layer_node_ids(model_id)[0] - input_specs = self._infer_inputs(first_layer) - input_shapes = self._infer_inputs(first_layer, convert_to_shapes=True) - model._set_inputs(input_specs) # pylint: disable=protected-access - if not model.built and not isinstance(input_specs, dict): - model.build(input_shapes) - else: # Reconstruct functional model - (inputs, outputs, - created_layers) = functional_lib.reconstruct_from_config( - config, created_layers={layer.name: layer for layer in layers}) - model.__init__(inputs, outputs, name=config['name']) - functional_lib.connect_ancillary_layers(model, created_layers) - - # Set model dtype. - _set_network_attributes_from_metadata(model) - - # Unblock models that are dependent on this model. - self._unblock_model_reconstruction(model_id, model) - - def _get_child_layer_node_ids(self, node_id): - """Returns the node ids of each layer in a Sequential/Functional model.""" - # Sequential and Functional track layers with names following the format - # "layer-N". Use this to generate the list of layers. - num_layers = 0 - child_layers = {} - pattern = re.compile('layer-(\\d+)') - - for child in self._proto.nodes[node_id].children: - m = pattern.match(child.local_name) - if m is None: - continue - layer_n = int(m.group(1)) - num_layers = max(layer_n + 1, num_layers) - child_layers[layer_n] = child.node_id - - ordered = [] - for n in range(num_layers): - child = child_layers.get(n) - if child is None: - break - ordered.append(child) - return ordered - - def _search_for_child_node(self, parent_id, path_to_child): - """Returns node id of child node. - - A helper method for traversing the object graph proto. - - As an example, say that the object graph proto in the SavedModel contains an - object with the following child and grandchild attributes: - - `parent.child_a.child_b` - - This method can be used to retrieve the node id of `child_b` using the - parent's node id by calling: - - `_search_for_child_node(parent_id, ['child_a', 'child_b'])`. - - Args: - parent_id: node id of parent node - path_to_child: list of children names. - - Returns: - node_id of child, or None if child isn't found. 
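`_get_child_layer_node_ids` above recovers layer order from child edges named `layer-<n>`, stopping at the first gap in the numbering. A standalone sketch of that logic (the child list is made up):

import re

children = [('layer-1', 14), ('keras_api', 9), ('layer-0', 11)]

pattern = re.compile(r'layer-(\d+)')
by_index = {}
for local_name, node_id in children:
    m = pattern.match(local_name)
    if m:
        by_index[int(m.group(1))] = node_id

ordered = []
for n in range(max(by_index, default=-1) + 1):
    if n not in by_index:
        break  # stop at the first gap, as the loader does
    ordered.append(by_index[n])

print(ordered)  # [11, 14]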
- """ - if not path_to_child: - return parent_id - - for child in self._proto.nodes[parent_id].children: - if child.local_name == path_to_child[0]: - return self._search_for_child_node(child.node_id, path_to_child[1:]) - return None - - def _infer_inputs(self, layer_node_id, convert_to_shapes=False): - """Infers input shape of layer from SavedModel functions.""" - call_fn_id = self._search_for_child_node( - layer_node_id, ['call_and_return_all_conditional_losses']) - if call_fn_id is None: - return None - - concrete_functions = ( - self._proto.nodes[call_fn_id].function.concrete_functions) - if not concrete_functions: - return None - call_fn_name = concrete_functions[0] - call_fn_proto = self._proto.concrete_functions[call_fn_name] - structured_input_signature = tf.__internal__.saved_model.decode_proto( - call_fn_proto.canonicalized_input_signature) - inputs = structured_input_signature[0][0] - if convert_to_shapes: - return tf.nest.map_structure(lambda spec: spec.shape, inputs) - else: - return inputs - - def _config_node_setter(self, setter): - """Creates edges for nodes that are recreated from config.""" - - def setattr_wrapper(obj, name, value): - # Avoid overwriting attributes of objects recreated from the config. - if obj._lookup_dependency(name) is None: # pylint: disable=protected-access - setter(obj, name, value) - - return setattr_wrapper - - -def _finalize_saved_model_layers(layers): - """Runs the final steps of loading Keras Layers from SavedModel.""" - # pylint: disable=protected-access - # 1. Set up call functions for all layers initialized from the SavedModel ( - # and not the config) - for layer in layers: - layer.built = True - layer_call = getattr( - _get_keras_attr(layer), 'call_and_return_conditional_losses', None) - if layer_call and layer_call.concrete_functions: - call_spec = layer_utils.CallFunctionSpec( - tf_inspect.getfullargspec(layer_call)) - layer.call = utils.use_wrapped_call(layer, layer_call, call_spec, - return_method=True) - expects_training_arg = layer._serialized_attributes['metadata'][ - 'expects_training_arg'] - if 'training' in layer_call.function_spec.arg_names: - # This could change the value of `expects_training_arg` if this layer - # doesn't expect a training arg, but has a child layer that does. - expects_training_arg = True - layer._init_call_fn_args(expects_training_arg) - else: - layer.call = types.MethodType( - _unable_to_call_layer_due_to_serialization_issue, layer) - - for layer in layers: - # 2. Set model inputs and outputs. - if isinstance(layer, RevivedNetwork): - _set_network_attributes_from_metadata(layer) - - if hasattr(_get_keras_attr(layer), 'call_and_return_conditional_losses'): - call_fn = _get_keras_attr(layer).call_and_return_conditional_losses - if not call_fn.concrete_functions: - continue - if call_fn.input_signature is None: - args, kwargs = infer_inputs_from_restored_call_function(call_fn) - args = list(args) - inputs = args.pop(0) - else: - args = call_fn.input_signature - args = list(args) - inputs = args.pop(0) - kwargs = None - layer._set_save_spec(inputs, args, kwargs) # pylint: disable=protected-access - - # V1 models require calling _set_inputs to set the `.inputs` attr. - # Skip this step when there are multiple tensor inputs (this behavior - # is not well supported in V1 models). - if not any( - isinstance(x, tf.TensorSpec) - for x in tf.nest.flatten([args, kwargs])): - layer._set_inputs(inputs) - - # 3. Add losses that aren't generated by the layer.call function. 
-    _restore_layer_unconditional_losses(layer)
-    _restore_layer_activation_loss(layer)
-
-    # 4. Restore metrics list
-    _restore_layer_metrics(layer)
-
-  # pylint: enable=protected-access
-
-
-def _unable_to_call_layer_due_to_serialization_issue(layer, *unused_args,
-                                                     **unused_kwargs):
-  """Replaces `layer.call` if the layer was not fully serialized.
-
-  Keras Model/Layer serialization is relatively relaxed because SavedModels
-  are not always loaded back as Keras models. Thus, when there is an issue
-  tracing a non-signature function, a warning is logged instead of raising an
-  error. This results in a SavedModel where the model's call function is saved,
-  but the internal layer call functions are not.
-
-  When deserialized with `tf.keras.models.load_model`, the internal layers
-  which do not have serialized call functions should raise an error when called.
-
-  Args:
-    layer: Layer without the serialized call function.
-
-  Raises:
-    ValueError: Always raised; explains how to make the layer callable again.
-  """
-
-  raise ValueError(
-      f'Cannot call custom layer {layer.name} of type {type(layer)}, because '
-      'the call function was not serialized to the SavedModel. '
-      'Please try one of the following methods to fix this issue:'
-      '\n\n(1) Implement `get_config` and `from_config` in the layer/model '
-      'class, and pass the object to the `custom_objects` argument when '
-      'loading the model. For more details, see: '
-      'https://www.tensorflow.org/guide/keras/save_and_serialize'
-      '\n\n(2) Ensure that the subclassed model or layer overrides `call` '
-      'and not `__call__`. The input shape and dtype will be automatically '
-      'recorded when the object is called, and used when saving. To manually '
-      'specify the input shape/dtype, decorate the call function with '
-      '`@tf.function(input_signature=...)`.')
-
-
-def _finalize_config_layers(layers):
-  """Runs the final steps of loading Keras Layers from config."""
-  for layer in layers:
-    # It is assumed that layers define their unconditional losses after being
-    # recreated from the config and built. The exceptions to this
-    # are Functional and Sequential models, which only store conditional losses
-    # (losses dependent on the inputs) in the config. Unconditional losses like
-    # weight regularization must be revived from the SavedModel.
-    if _is_graph_network(layer):
-      _restore_layer_unconditional_losses(layer)
-
-    # Some layers, like Dense, record their activation loss function in the
-    # config. However, not all layers do this, so the activation loss may be
-    # missing when restored from the config/HDF5.
-    # TODO(kathywu): Investigate ways to improve the config to ensure consistent
-    # loading behavior between HDF5 and SavedModel.
-    _restore_layer_activation_loss(layer)
-
-    # Restore metrics list.
-    _restore_layer_metrics(layer)
-
-    # Restore RNN layer states.
-    if (isinstance(layer, base_rnn.RNN) and layer.stateful and
-        hasattr(_get_keras_attr(layer), 'states')):
-      layer.states = getattr(_get_keras_attr(layer), 'states', None)
-      for variable in tf.nest.flatten(layer.states):
-        backend.track_variable(variable)
-
-    # Perform any layer-defined finalization of the layer state.
-    layer.finalize_state()
-
-
-def _finalize_metric(metric):
-  metric.update_state = types.MethodType(
-      metrics_utils.update_state_wrapper(metric.keras_api.update_state), metric)
-  metric.result = metric.keras_api.result
-
-
-def _restore_layer_unconditional_losses(layer):
-  """Restore unconditional losses from SavedModel."""
-  if hasattr(_get_keras_attr(layer), 'layer_regularization_losses'):
-    losses = getattr(_get_keras_attr(layer), 'layer_regularization_losses', [])
-  else:
-    # Some earlier SavedModels may not have layer_regularization_losses
-    # serialized separately. Fall back to using the regularization_losses
-    # list if it does not exist.
-    losses = layer._serialized_attributes.get('regularization_losses', [])  # pylint: disable=protected-access
-  for loss in losses:
-    layer.add_loss(loss)
-
-
-def _restore_layer_activation_loss(layer):
-  """Restore activation loss from SavedModel."""
-  # Use wrapped activity regularizer function if the layer's activity
-  # regularizer wasn't created during initialization.
-  activity_regularizer = getattr(
-      _get_keras_attr(layer), 'activity_regularizer_fn', None)
-  if activity_regularizer and not layer.activity_regularizer:
-    try:
-      layer.activity_regularizer = activity_regularizer
-    except AttributeError:
-      # This may happen if a layer wrapper is saved with an activity
-      # regularizer. The wrapper object's activity regularizer is unsettable.
-      pass
-
-
-def revive_custom_object(identifier, metadata):
-  """Revives object from SavedModel."""
-  if tf.compat.v1.executing_eagerly_outside_functions():
-    model_class = training_lib.Model
-  else:
-    model_class = training_lib_v1.Model
-
-  revived_classes = {
-      constants.INPUT_LAYER_IDENTIFIER:
-          (RevivedInputLayer, input_layer.InputLayer),
-      constants.LAYER_IDENTIFIER: (RevivedLayer, base_layer.Layer),
-      constants.MODEL_IDENTIFIER: (RevivedNetwork, model_class),
-      constants.NETWORK_IDENTIFIER: (RevivedNetwork, functional_lib.Functional),
-      constants.SEQUENTIAL_IDENTIFIER: (RevivedNetwork, models_lib.Sequential),
-  }
-  parent_classes = revived_classes.get(identifier, None)
-
-  if parent_classes is not None:
-    parent_classes = revived_classes[identifier]
-    revived_cls = type(
-        tf.compat.as_str(metadata['class_name']), parent_classes, {})
-    return revived_cls._init_from_metadata(metadata)  # pylint: disable=protected-access
-  else:
-    raise ValueError(
-        f'Unable to restore custom object of type {identifier}. '
-        f'Please make sure that any custom layers are included in the '
-        f'`custom_objects` arg when calling `load_model()` and make sure that '
-        f'all layers implement `get_config` and `from_config`.')
-
-
-def _restore_layer_metrics(layer):
-  metrics_list = getattr(_get_keras_attr(layer), 'layer_metrics', {})
-  layer_metrics = {m.name: m for m in layer._metrics}  # pylint: disable=protected-access
-  for name, metric in metrics_list.items():
-    if name not in layer_metrics:
-      # Metrics may be added during initialization/building of custom layers.
-      layer._metrics.append(metric)  # pylint: disable=protected-access
-
-
-# TODO(kathywu): Centrally define keys and functions for both serialization and
-# deserialization.
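`revive_custom_object` above leans on Python's three-argument `type()` to mint a class at runtime, so a revived object reports the user's original class name while inheriting the revived behavior. A minimal demonstration (the `Layerish` base is a stand-in, not a Keras class):

class Layerish:  # plays the role of (RevivedLayer, base_layer.Layer)
    def __init__(self, name):
        self.name = name

# type(name, bases, namespace) builds a new class dynamically.
RevivedMyLayer = type('MyLayer', (Layerish,), {})

obj = RevivedMyLayer(name='my_layer')
print(type(obj).__name__)         # MyLayer
print(isinstance(obj, Layerish))  # True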
-class RevivedLayer: - """Keras layer loaded from a SavedModel.""" - - @classmethod - def _init_from_metadata(cls, metadata): - """Create revived layer from metadata stored in the SavedModel proto.""" - init_args = dict(name=metadata['name'], trainable=metadata['trainable']) - if metadata.get('dtype') is not None: - init_args['dtype'] = metadata['dtype'] - if metadata.get('batch_input_shape') is not None: - init_args['batch_input_shape'] = metadata['batch_input_shape'] - - revived_obj = cls(**init_args) - - with utils.no_automatic_dependency_tracking_scope(revived_obj): - # pylint:disable=protected-access - revived_obj._call_spec.expects_training_arg = metadata[ - 'expects_training_arg'] - config = metadata.get('config') - if generic_utils.validate_config(config): - revived_obj._config = config - if metadata.get('input_spec') is not None: - revived_obj.input_spec = recursively_deserialize_keras_object( - metadata['input_spec'], - module_objects={'InputSpec': input_spec.InputSpec}) - if metadata.get('activity_regularizer') is not None: - revived_obj.activity_regularizer = regularizers.deserialize( - metadata['activity_regularizer']) - if metadata.get('_is_feature_layer') is not None: - revived_obj._is_feature_layer = metadata['_is_feature_layer'] - if metadata.get('stateful') is not None: - revived_obj.stateful = metadata['stateful'] - # pylint:enable=protected-access - - return revived_obj, _revive_setter - - @property - def keras_api(self): - return self._serialized_attributes.get(constants.KERAS_ATTR, None) - - def get_config(self): - if hasattr(self, '_config'): - return self._config - else: - raise NotImplementedError - - -def _revive_setter(layer, name, value): - """Setter function that saves some attributes to separate dictionary.""" - # Many attributes in the SavedModel conflict with properties defined in - # Layer and Model. Save these attributes to a separate dictionary. - if name in PUBLIC_ATTRIBUTES: - # pylint: disable=protected-access - if isinstance(value, tf.__internal__.tracking.Trackable): - layer._track_trackable(value, name=name) - layer._serialized_attributes[name] = value - # pylint: enable=protected-access - elif (isinstance(layer, functional_lib.Functional) and - re.match(r'^layer(_with_weights)?-[\d+]', name) is not None): - # Edges named "layer-n" or "layer_with_weights-n", which are tracked in - # network._track_layers, should not be added as an attribute. They should - # be temporarily added as a dependency so that checkpointed values can be - # restored. These dependencies are manually deleted in - # KerasObjectLoader.del_tracking. - - # Set `overwrite=True` in the case that `layer` already tracks a different - # layer-n. This may cause variable values to not be loaded properly in the - # original layer-n, but we already warn the users about this - # (ctrl-f "shared between different layers/models"). - layer._track_trackable(value, name, overwrite=True) # pylint: disable=protected-access - elif getattr(layer, name, None) is not None: - # Don't overwrite already defined attributes. 
-    pass
-  else:
-    setattr(layer, name, value)
-
-
-class RevivedInputLayer:
-  """InputLayer loaded from a SavedModel."""
-
-  @classmethod
-  def _init_from_metadata(cls, metadata):
-    """Revives the saved InputLayer from the metadata."""
-    init_args = dict(
-        name=metadata['name'],
-        dtype=metadata['dtype'],
-        sparse=metadata['sparse'],
-        ragged=metadata['ragged'],
-        batch_input_shape=metadata['batch_input_shape'])
-    revived_obj = cls(**init_args)
-    with utils.no_automatic_dependency_tracking_scope(revived_obj):
-      revived_obj._config = metadata['config']  # pylint:disable=protected-access
-
-    return revived_obj, setattr
-
-  def get_config(self):
-    return self._config
-
-
-def recursively_deserialize_keras_object(config, module_objects=None):
-  """Deserialize Keras object from a nested structure."""
-  if isinstance(config, dict):
-    if 'class_name' in config:
-      return generic_utils.deserialize_keras_object(
-          config, module_objects=module_objects)
-    else:
-      return {
-          key: recursively_deserialize_keras_object(config[key], module_objects)
-          for key in config
-      }
-  elif isinstance(config, (tuple, list)):
-    return [
-        recursively_deserialize_keras_object(x, module_objects) for x in config
-    ]
-  else:
-    raise ValueError(
-        f'Unable to decode Keras layer config. Config should be a dictionary, '
-        f'tuple or list. Received: config={config}')
-
-
-def infer_inputs_from_restored_call_function(fn):
-  """Returns TypeSpec of inputs from a restored call function.
-
-  Args:
-    fn: Restored layer call function. It is assumed that `fn` has at least one
-      concrete function and that the inputs are in the first argument.
-
-  Returns:
-    TypeSpec of call function inputs in the form of (args, kwargs).
-  """
-
-  def common_spec(x, y):
-    if not isinstance(x, tf.TypeSpec):
-      # Doesn't particularly matter what is returned in this case because the
-      # result will be filtered out in _set_input_shape.
-      return x
-    # pylint:disable=protected-access
-    result = x._without_tensor_names().most_specific_common_supertype(
-        [y._without_tensor_names()])
-    if result is None:
-      # Please file a bug if you are being hindered by this error.
-      raise TypeError(f'No common supertype of {x} and {y}.')
-    return result
-
-  spec = fn.concrete_functions[0].structured_input_signature
-  for concrete in fn.concrete_functions[1:]:
-    spec2 = concrete.structured_input_signature
-    spec = tf.nest.map_structure(common_spec, spec, spec2)
-  return spec
-
-
-class RevivedNetwork(RevivedLayer):
-  """Keras network of layers loaded from a SavedModel."""
-
-  @classmethod
-  def _init_from_metadata(cls, metadata):
-    """Create revived network from metadata stored in the SavedModel proto."""
-    revived_obj = cls(name=metadata['name'])
-
-    # Store attributes revived from SerializedAttributes in an untracked
-    # dictionary. The attributes are the ones listed in CommonEndpoints or
-    # "keras_api" for Keras-specific attributes.
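The `common_spec` helper above merges the input signatures of all concrete functions via `TypeSpec.most_specific_common_supertype`, which relaxes any dimensions the specs disagree on. A small illustration, assuming a recent TF 2.x release where this method is public:

import tensorflow as tf

a = tf.TensorSpec([1, 5], tf.float32)
b = tf.TensorSpec([8, 5], tf.float32)
# The differing batch dimension (1 vs. 8) is relaxed to None; the rest stays.
print(a.most_specific_common_supertype([b]))
# TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)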
-    with utils.no_automatic_dependency_tracking_scope(revived_obj):
-      # pylint:disable=protected-access
-      revived_obj._call_spec.expects_training_arg = metadata[
-          'expects_training_arg']
-      config = metadata.get('config')
-      if generic_utils.validate_config(config):
-        revived_obj._config = config
-
-      if metadata.get('activity_regularizer') is not None:
-        revived_obj.activity_regularizer = regularizers.deserialize(
-            metadata['activity_regularizer'])
-      # pylint:enable=protected-access
-
-    return revived_obj, _revive_setter  # pylint:disable=protected-access
-
-
-def _set_network_attributes_from_metadata(revived_obj):
-  """Sets attributes recorded in the metadata."""
-  with utils.no_automatic_dependency_tracking_scope(revived_obj):
-    # pylint:disable=protected-access
-    metadata = revived_obj._serialized_attributes['metadata']
-    if metadata.get('dtype') is not None:
-      revived_obj._set_dtype_policy(metadata['dtype'])
-    revived_obj._trainable = metadata['trainable']
-    # pylint:enable=protected-access
-
-
-def _maybe_add_serialized_attributes(layer, metadata):
-  # Store attributes revived from SerializedAttributes in an untracked
-  # dictionary. The attributes are the ones listed in CommonEndpoints or
-  # "keras_api" for Keras-specific attributes.
-  if not hasattr(layer, '_serialized_attributes'):
-    with utils.no_automatic_dependency_tracking_scope(layer):
-      layer._serialized_attributes = {'metadata': metadata}  # pylint: disable=protected-access
-
-
-def _get_keras_attr(layer):
-  return getattr(layer, '_serialized_attributes',
-                 {}).get(constants.KERAS_ATTR, None)
diff --git a/keras/saving/saved_model/metric_serialization.py b/keras/saving/saved_model/metric_serialization.py
deleted file mode 100644
index 88f060b3a46d..000000000000
--- a/keras/saving/saved_model/metric_serialization.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================== -"""Classes and functions implementing Metrics SavedModel serialization.""" - -from keras.saving.saved_model import constants -from keras.saving.saved_model import layer_serialization -from keras.utils import generic_utils -import tensorflow.compat.v2 as tf - - -class MetricSavedModelSaver(layer_serialization.LayerSavedModelSaver): - """Metric serialization.""" - - @property - def object_identifier(self): - return constants.METRIC_IDENTIFIER - - def _python_properties_internal(self): - metadata = dict( - class_name=generic_utils.get_registered_name(type(self.obj)), - name=self.obj.name, - dtype=self.obj.dtype) - metadata.update(layer_serialization.get_serialized(self.obj)) - if self.obj._build_input_shape is not None: # pylint: disable=protected-access - metadata['build_input_shape'] = self.obj._build_input_shape # pylint: disable=protected-access - return metadata - - def _get_serialized_attributes_internal(self, unused_serialization_cache): - return ( - dict(variables=tf.__internal__.tracking.wrap(self.obj.variables)), - # TODO(b/135550038): save functions to enable saving custom metrics. - {}, - ) diff --git a/keras/saving/saved_model/model_serialization.py b/keras/saving/saved_model/model_serialization.py deleted file mode 100644 index d43d6fae6268..000000000000 --- a/keras/saving/saved_model/model_serialization.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Classes and functions implementing Model SavedModel serialization.""" - -from keras.saving import saving_utils -from keras.saving.saved_model import constants -from keras.saving.saved_model import layer_serialization -from keras.saving.saved_model import save_impl - - -class ModelSavedModelSaver(layer_serialization.LayerSavedModelSaver): - """Model SavedModel serialization.""" - - @property - def object_identifier(self): - return constants.MODEL_IDENTIFIER - - def _python_properties_internal(self): - metadata = super()._python_properties_internal() - # Network stateful property is dependent on the child layers. - metadata.pop('stateful') - metadata['is_graph_network'] = self.obj._is_graph_network # pylint: disable=protected-access - spec = self.obj.save_spec(dynamic_batch=False) - metadata['full_save_spec'] = spec - # save_spec is saved for forward compatibility on older TF versions. - metadata['save_spec'] = None if spec is None else spec[0][0] - - metadata.update( - saving_utils.model_metadata( - self.obj, include_optimizer=True, require_config=False)) - return metadata - - def _get_serialized_attributes_internal(self, serialization_cache): - default_signature = None - - # Create a default signature function if this is the only object in the - # cache (i.e. this is the root level object).
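The `full_save_spec` metadata above comes from `Model.save_spec`, which is only populated once the model has been called on concrete data. A sketch of what it captures (shapes are illustrative):

import numpy as np
from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(2)])
model.predict(np.ones((1, 4)))  # calling the model records its input spec
args_spec, kwargs_spec = model.save_spec(dynamic_batch=False)
print(args_spec[0])  # TensorSpec of the traced input, batch dimension kept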
- if len(serialization_cache[constants.KERAS_CACHE_KEY]) == 1: - default_signature = save_impl.default_save_signature(self.obj) - - # Other than the default signature function, all other attributes match with - # the ones serialized by Layer. - objects, functions = ( - super()._get_serialized_attributes_internal( - serialization_cache)) - functions['_default_save_signature'] = default_signature - return objects, functions - - -class SequentialSavedModelSaver(ModelSavedModelSaver): - - @property - def object_identifier(self): - return constants.SEQUENTIAL_IDENTIFIER diff --git a/keras/saving/saved_model/order_preserving_set.py b/keras/saving/saved_model/order_preserving_set.py deleted file mode 100644 index 9f02b6152ebc..000000000000 --- a/keras/saving/saved_model/order_preserving_set.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A set based on dict so that it preserves key insertion order. - -Python Dicts are order-preserving since 3.6 -(https://mail.python.org/pipermail/python-dev/2017-December/151283.html), -but sets are not. This class implements a set on top of a dict so that we get -deterministic iteration order across runs. 
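The guarantee this docstring relies on is easy to demonstrate, as is the resulting behavior of the class defined just below (a quick sketch):

d = dict.fromkeys(["b", "a", "c"])
print(list(d))  # ['b', 'a', 'c'], the insertion order, on every run
print({"b", "a", "c"})  # a built-in set's iteration order is unspecified

s1 = OrderPreservingSet(["c", "a"])
s2 = OrderPreservingSet(["b", "a"])
print(list(s1 | s2))  # ['c', 'a', 'b']: the left operand's order comes first
print(list(s1 & s2))  # ['a']: intersection iterates in s1's order, not s2's
# s1 | ["b"] raises TypeError, since a plain list has no defined order.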
-""" - -import collections.abc - - -class OrderPreservingSet(collections.abc.MutableSet): - """A set based on dict so that it preserves key insertion order.""" - - def __init__(self, iterable=None): - self._dict = {item: None for item in (iterable or [])} - - # abstract from collections.MutableSet - def __len__(self): - return len(self._dict) - - # abstract from collections.MutableSet - def __contains__(self, value): - return value in self._dict - - # override from collections.MutableSet - def __iter__(self): - return iter(self._dict) - - # abstract from collections.MutableSet - def add(self, item): - self._dict[item] = None - - # abstract from collections.MutableSet - def discard(self, value): - del self._dict[value] - - # override from collections.MutableSet - def clear(self): - self._dict = {} - - # override from collections.Set - def __eq__(self, other): - if not isinstance(other, OrderPreservingSet): - return NotImplemented - return self._dict.keys() == other._dict.keys() - - # override from collections.Set - def __le__(self, other): - if not isinstance(other, OrderPreservingSet): - return NotImplemented - return self._dict.keys() <= other._dict.keys() - - # override from collections.Set - def __ge__(self, other): - if not isinstance(other, OrderPreservingSet): - return NotImplemented - return self._dict.keys() >= other._dict.keys() - - # override from collections.Set - def __and__(self, other): - # collections.Set defaults to the ordering in other, we want to use self - return self._from_iterable(value for value in self if value in other) - - # override from collections.Set - def __or__(self, other): - # ensure that other is ordered before performing __or__ - if not isinstance(other, OrderPreservingSet): - raise TypeError( - "cannot union an 'OrderPreservingSet' with an unordered iterable.") - result = self._from_iterable(value for value in self) - for value in other: - result._dict[value] = None - return result - - def union(self, other): - return self | other diff --git a/keras/saving/saved_model/revive_test.py b/keras/saving/saved_model/revive_test.py deleted file mode 100644 index 21659a9d746f..000000000000 --- a/keras/saving/saved_model/revive_test.py +++ /dev/null @@ -1,448 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=protected-access -"""Tests reviving models from config and SavedModel. - -These tests ensure that a model revived from a combination of config and -SavedModel have the expected structure. 
-""" - -import tensorflow.compat.v2 as tf -# TODO(kathywu): Move relevant tests from saved_model_test to -import shutil - -from absl.testing import parameterized -import numpy as np - -import keras -from keras import backend -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -from keras.saving.saved_model import load as keras_load -from keras.utils import generic_utils - - -class SubclassedModelNoConfig(keras.Model): - - def __init__(self, a, b): - super().__init__() - - self.a = a - self.b = b - self.shared = CustomLayerNoConfig(a, b) - self.all_layers = [] - - def build(self, input_shape): - self.all_layers.extend([ - self.shared, - CustomLayerWithConfig(self.a + 1, self.b + 2), - CustomLayerNoConfig(self.a + 3, self.b + 4), - keras.Sequential([ - # TODO(b/145029112): Bug with losses when there are shared layers. - # self.shared, <-- Enable when bug is fixed. - CustomLayerNoConfig(self.a + 5, self.b + 6)])]) - super().build(input_shape) - - def call(self, inputs): - x = inputs - for layer in self.all_layers: - x = layer(x) - return x - - -class SparseDense(keras.layers.Dense): - - def call(self, inputs): - input_shape = tf.stack( - (tf.reduce_prod(tf.shape(inputs)[:-1]), - self.kernel.shape[0])) - output_shape = tf.concat( - (tf.shape(inputs)[:-1], [self.kernel.shape[1]]), -1) - x = tf.sparse.reshape(inputs, input_shape) - return tf.reshape( - self.activation( - tf.sparse.sparse_dense_matmul(x, self.kernel) + self.bias), - output_shape) - - -class SubclassedSparseModelNoConfig(keras.Model): - - def __init__(self, a, b): - super().__init__() - self.a = a - self.shared = CustomLayerNoConfig(a, b) - self.all_layers = [SparseDense(4)] - - def call(self, inputs): - x = inputs - for layer in self.all_layers: - x = layer(x) - return self.shared(x + self.a) - - -class SubclassedModelWithConfig(SubclassedModelNoConfig): - - def get_config(self): - return {'a': self.a, - 'b': self.b} - - @classmethod - def from_config(cls, config): - return cls(**config) - - -class CustomLayerNoConfig(keras.layers.Layer): - - def __init__(self, a, b, name=None): - super().__init__(name=name) - self.a = tf.Variable(a, name='a') - self.b = b - def a_regularizer(): - return self.a * 2 - self.add_loss(a_regularizer) - self.sum_metric = keras.metrics.Sum(name='inputs_sum') - self.unused_metric = keras.metrics.Sum(name='not_added_to_metrics') - - def build(self, input_shape): - self.c = tf.Variable( - tf.constant(1.0, shape=input_shape[1:]), name=self.name+'_c') - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - self.add_metric(self.sum_metric(inputs)) - self.add_metric(inputs, aggregation='mean', name='mean') - - return inputs + self.c - - -class CustomLayerWithConfig(CustomLayerNoConfig): - - def get_config(self): - return {'a': backend.get_value(self.a), - 'b': self.b, - 'name': self.name} - - -class CustomNetworkDefaultConfig(keras.Model): - - def __init__(self, num_classes, name=None): - inputs = keras.Input((2, 3), name='inputs') - x = keras.layers.Flatten(name='flatten')(inputs) - y = keras.layers.Dense(num_classes, name='outputs')(x) - super().__init__(inputs, y, name=name) - - -class CustomNetworkWithConfig(CustomNetworkDefaultConfig): - - def __init__(self, num_classes, name=None): - super().__init__(num_classes, name=name) - self._config_dict = dict(num_classes=num_classes) - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(config['num_classes'], name=config.get('name')) - - -class 
CustomNetworkWithConfigName(CustomNetworkWithConfig): - - def __init__(self, num_classes, name=None): - super().__init__(num_classes, name=name) - self._config_dict['name'] = self.name - - -class UnregisteredCustomSequentialModel(keras.Sequential): - # This class is *not* registered in the CustomObjectScope. - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.add(keras.layers.InputLayer(input_shape=(2, 3))) - - -class FunctionalSubclassModel(keras.Model): - - def __init__(self, units): - self.units = units - my_input = keras.Input(shape=(2, 3), name='inputs') - dense = keras.layers.Dense(self.units, activation='relu', name='dense') - output = dense(my_input) - outputs = {'output': output} - super().__init__(inputs=[my_input], outputs=outputs) - - def get_config(self): - return {'units': self.units} - - -class FunctionalSubclassModelWrongConfig(FunctionalSubclassModel): - - def get_config(self): - return {} - - -# The WideDeepModel, whose name conflicts with a Keras built-in model, is -# registered in these tests. -class WideDeepModel(SubclassedModelWithConfig): - pass - - -class ReviveTestBase(test_combinations.TestCase): - - def setUp(self): - super().setUp() - self.path = self.get_temp_dir() - self.addCleanup(shutil.rmtree, self.path, ignore_errors=True) - - def _assert_revived_correctness(self, model, revived): - self.assertAllEqual(model.input_names, revived.input_names) - self.assertAllEqual(model.output_names, revived.output_names) - if model.inputs is not None: - self.assertTrue( - all([ - i.shape.as_list() == r.shape.as_list() and i.dtype == r.dtype - for (i, r) in zip(model.inputs, revived.inputs) - ])) - self.assertTrue( - all([ - i.shape.as_list() == r.shape.as_list() and i.dtype == r.dtype - for (i, r) in zip(model.outputs, revived.outputs) - ])) - - self.assertAllClose(self.evaluate(model.weights), - self.evaluate(revived.weights)) - input_arr = tf.constant( - np.random.random((2, 2, 3)).astype(np.float32)) - if isinstance(revived.save_spec()[0][0], - tf.SparseTensorSpec): - input_arr = tf.sparse.from_dense(input_arr) - - self.assertAllClose(model(input_arr), revived(input_arr)) - self.assertAllClose(sum(model.losses), sum(revived.losses)) - self.assertAllClose(len(model.losses), len(revived.losses)) - self.assertEqual(len(model.metrics), len(revived.metrics)) - # TODO(b/150403085): Investigate why the metric order changes when running - # this test in tf-nightly. - self.assertAllClose(sorted([m.result() for m in model.metrics]), - sorted([m.result() for m in revived.metrics])) - model_layers = {layer.name: layer for layer in model.layers} - revived_layers = {layer.name: layer for layer in revived.layers} - self.assertAllEqual(model_layers.keys(), revived_layers.keys()) - - for name in model_layers: - model_layer = model_layers[name] - revived_layer = revived_layers[name] - self.assertEqual(model_layer.name, revived_layer.name) - self.assertEqual(model_layer.dtype, revived_layer.dtype) - self.assertEqual(model_layer.trainable, revived_layer.trainable) - if 'WithConfig' in type(model_layer).__name__: - self.assertEqual(type(model_layer), type(revived_layer)) - else: - # When loading layers from SavedModel, a new class is dynamically - # created with the same name. - self.assertEqual(type(model_layer).__name__, - type(revived_layer).__name__) - - -# These tests take a while to run, so each should run in a separate shard -# (putting them in the same TestCase resolves this). 
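The pattern these fixtures exercise, reduced to a standalone sketch (`TinyFunctionalSubclass` is an illustrative name): a `keras.Model` subclass that builds a functional graph in `__init__` and round-trips its constructor arguments through `get_config`.

from tensorflow import keras

class TinyFunctionalSubclass(keras.Model):

    def __init__(self, units, name=None):
        inputs = keras.Input((4,), name='inputs')
        outputs = keras.layers.Dense(units, name='outputs')(inputs)
        super().__init__(inputs, outputs, name=name)
        self.units = units

    def get_config(self):
        return {'units': self.units, 'name': self.name}

    @classmethod
    def from_config(cls, config):
        return cls(**config)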
-class TestBigModelRevive(ReviveTestBase): - - @test_combinations.run_with_all_model_types - def test_revive(self): - input_shape = None - if test_utils.get_model_type() == 'functional': - input_shape = (2, 3) - - layer_with_config = CustomLayerWithConfig(1., 2) - layer_without_config = CustomLayerNoConfig(3., 4) - subclassed_with_config = SubclassedModelWithConfig(4., 6.) - subclassed_without_config = SubclassedModelNoConfig(7., 8.) - - inputs = keras.Input((2, 3)) - x = CustomLayerWithConfig(1., 2)(inputs) - x = CustomLayerNoConfig(3., 4)(x) - x = SubclassedModelWithConfig(4., 6.)(x) - x = SubclassedModelNoConfig(7., 8.)(x) - inner_model_functional = keras.Model(inputs, x) - - inner_model_sequential = keras.Sequential( - [CustomLayerWithConfig(1., 2), - CustomLayerNoConfig(3., 4), - SubclassedModelWithConfig(4., 6.), - SubclassedModelNoConfig(7., 8.)]) - - class SubclassedModel(keras.Model): - - def __init__(self): - super().__init__() - self.all_layers = [CustomLayerWithConfig(1., 2), - CustomLayerNoConfig(3., 4), - SubclassedModelWithConfig(4., 6.), - SubclassedModelNoConfig(7., 8.)] - - def call(self, inputs): - x = inputs - for layer in self.all_layers: - x = layer(x) - return x - - inner_model_subclassed = SubclassedModel() - - layers = [layer_with_config, - layer_without_config, - subclassed_with_config, - subclassed_without_config, - inner_model_functional, - inner_model_sequential, - inner_model_subclassed] - model = test_utils.get_model_from_layers( - layers, input_shape=input_shape) - # Run data through the Model to create save spec and weights. - model.predict(np.ones((10, 2, 3)), batch_size=10) - - # Test that the correct checkpointed values are loaded, whether the layer is - # created from the config or SavedModel. - layer_with_config.c.assign(2 * layer_with_config.c) - layer_without_config.c.assign(3 * layer_without_config.c) - - model.save(self.path, save_format='tf') - revived = keras_load.load(self.path) - self._assert_revived_correctness(model, revived) - - -class TestModelRevive(ReviveTestBase): - - def test_revive_subclassed_with_nested_model(self): - model = SubclassedModelNoConfig(1., 2.) - # Run data through the Model to create save spec and weights. - model.predict(np.ones((10, 2, 3)), batch_size=10) - model.save(self.path, save_format='tf') - revived = keras_load.load(self.path) - self._assert_revived_correctness(model, revived) - - def test_revive_subclassed_with_sparse_model(self): - model = SubclassedSparseModelNoConfig(1., 2.) - # Run data through the Model to create save spec and weights. 
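A compressed version of the round trip that every test in these classes performs (a sketch; `path` is any writable directory, as provided by `ReviveTestBase.setUp`):

import numpy as np

model = SubclassedModelNoConfig(1., 2.)
model.predict(np.ones((10, 2, 3)), batch_size=10)  # creates weights and the save spec
model.save(path, save_format='tf')
revived = keras_load.load(path)
x = np.ones((2, 2, 3), dtype=np.float32)
np.testing.assert_allclose(model(x), revived(x))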
- x = tf.sparse.from_dense(np.ones((10, 2, 3), dtype=np.float32)) - model.predict(x, batch_size=10) - model.save(self.path, save_format='tf') - revived = keras_load.load(self.path) - self._assert_revived_correctness(model, revived) - - def test_revive_unregistered_sequential(self): - model = UnregisteredCustomSequentialModel() - x = np.random.random((2, 2, 3)).astype(np.float32) - model(x) - model.save(self.path, save_format='tf') - revived = keras_load.load(self.path) - self._assert_revived_correctness(model, revived) - - def test_revive_sequential_inputs(self): - model = keras.models.Sequential([ - keras.Input((None,), dtype=tf.string), - keras.layers.Lambda(tf.strings.lower) - ]) - model.save(self.path, save_format='tf') - revived = keras_load.load(self.path) - revived_layers = list( - revived._flatten_layers(include_self=False, recursive=False)) - self.assertEqual(tf.string, revived_layers[0].dtype) - - @parameterized.named_parameters( - ('default_config', CustomNetworkDefaultConfig), - ('with_config', CustomNetworkWithConfig), - ('with_config_name', CustomNetworkWithConfigName)) - def test_revive_network(self, model_cls): - model = model_cls(8) - model.save(self.path, include_optimizer=False, save_format='tf') - revived = keras_load.load(self.path, compile=False) - self._assert_revived_correctness(model, revived) - - def test_functional_subclass(self): - model = FunctionalSubclassModel(32) - model.save(self.path, save_format='tf') - revived = keras_load.load(self.path, compile=False) - self._assert_revived_correctness(model, revived) - - def test_functional_subclass_wrong_config(self): - model = FunctionalSubclassModelWrongConfig(32) - model.save(self.path, save_format='tf') - with self.assertRaisesRegex(TypeError, 'Unable to revive model'): - keras_load.load(self.path, compile=False) - - def test_load_compiled_metrics(self): - model = test_utils.get_small_sequential_mlp(1, 3) - - # Compile with dense categorical accuracy - model.compile('rmsprop', 'mse', 'acc') - x = np.random.random((5, 10)).astype(np.float32) - y_true = np.random.random((5, 3)).astype(np.float32) - model.train_on_batch(x, y_true) - - model.save(self.path, include_optimizer=True, save_format='tf') - revived = keras_load.load(self.path, compile=True) - self.assertAllClose(model.test_on_batch(x, y_true), - revived.test_on_batch(x, y_true)) - - # Compile with sparse categorical accuracy - model.compile('rmsprop', 'mse', 'acc') - y_true = np.random.randint(0, 3, (5, 1)).astype(np.float32) - model.train_on_batch(x, y_true) - model.save(self.path, include_optimizer=True, save_format='tf') - revived = keras_load.load(self.path, compile=True) - self.assertAllClose(model.test_on_batch(x, y_true), - revived.test_on_batch(x, y_true)) - - def test_revived_model_has_save_spec(self): - model = SubclassedModelWithConfig(2, 3) - model.predict(np.random.random((5, 10)).astype(np.float32)) - model.save(self.path, save_format='tf') - revived = keras_load.load(self.path, compile=True) - self.assertAllEqual( - model._get_save_spec(dynamic_batch=False), - revived._get_save_spec(dynamic_batch=False)) - - def test_load_model_with_name_conflict_raises_error(self): - - class LinearModel(SubclassedModelWithConfig): - pass - - model = LinearModel(2, 3) - model(np.random.random((5, 10)).astype(np.float32)) - model.save(self.path, save_format='tf') - with self.assertRaisesRegex( - RuntimeError, 'Unable to restore object of class \'LinearModel\''): - keras_load.load(self.path, compile=True) - - def 
test_load_model_with_name_conflict_registered_works(self): - model = WideDeepModel(2, 3) - model(np.random.random((5, 10)).astype(np.float32)) - model.save(self.path, save_format='tf') - keras_load.load(self.path, compile=True) - - -if __name__ == '__main__': - tf.compat.v1.enable_eager_execution() - with generic_utils.CustomObjectScope({ - 'CustomLayerWithConfig': CustomLayerWithConfig, - 'CustomNetworkWithConfig': CustomNetworkWithConfig, - 'CustomNetworkWithConfigName': CustomNetworkWithConfigName, - 'SubclassedModelWithConfig': SubclassedModelWithConfig, - 'FunctionalSubclassModel': FunctionalSubclassModel, - 'FunctionalSubclassModelWrongConfig': FunctionalSubclassModelWrongConfig, - 'WideDeepModel': WideDeepModel - }): - tf.test.main() diff --git a/keras/saving/saved_model/save.py b/keras/saving/saved_model/save.py deleted file mode 100644 index 5c916c31da62..000000000000 --- a/keras/saving/saved_model/save.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Keras SavedModel serialization.""" - -import os -from absl import logging - -from keras import backend -from keras.layers import serialization -from keras.protobuf import saved_metadata_pb2 -from keras.protobuf import versions_pb2 -from keras.saving import saving_utils -from keras.saving.saved_model import constants -from keras.saving.saved_model import save_impl -from keras.saving.saved_model import utils -from keras.utils.generic_utils import LazyLoader -from keras.utils.io_utils import ask_to_proceed_with_overwrite -import tensorflow.compat.v2 as tf - -from tensorflow.python.saved_model import save as save_lib - - -# To avoid circular dependencies between keras/engine and keras/saving, -# code in keras/saving must delay imports. - -base_layer = LazyLoader( - "base_layer", globals(), - "keras.engine.base_layer") -training_lib = LazyLoader( - "training_lib", globals(), - "keras.engine.training") - - -def save(model, filepath, overwrite, include_optimizer, signatures=None, - options=None, save_traces=True): - """Saves a model as a SavedModel to the filepath. - - Args: - model: Keras model instance to be saved. - filepath: String path to save the model. - overwrite: whether to overwrite the existing filepath. - include_optimizer: If True, save the model's optimizer state. - signatures: Signatures to save with the SavedModel. Applicable to the 'tf' - format only. Please see the `signatures` argument in `tf.saved_model.save` - for details. - options: (only applies to SavedModel format) `tf.saved_model.SaveOptions` - object that specifies options for saving to SavedModel. - save_traces: (only applies to SavedModel format) When enabled, the - SavedModel will store the function traces for each layer. This - can be disabled, so that only the configs of each layer are stored. - Defaults to `True`. 
Disabling this will decrease serialization time - and reduce file size, but it requires that all custom layers/models - implement a `get_config()` method. - - Raises: - ValueError: if the model's inputs have not been defined. - """ - # If file exists and should not be overwritten. - if not overwrite and os.path.exists(filepath): - proceed = ask_to_proceed_with_overwrite(filepath) - if not proceed: - return - - if save_traces: - if save_impl.should_skip_serialization(model): - saving_utils.raise_model_input_error(model) - - if not include_optimizer: - orig_optimizer = model.optimizer - model.optimizer = None - # TODO(b/180760306) Change to del model.optimizer if Layer's __delattr__ - # calls AutoTrackable's __delattr__. - model._delete_tracking("optimizer") # pylint: disable=protected-access - - # Trace all functions and signatures with `training=0` instead of using an - # already-set learning phase placeholder. - # This is needed for compatibility reasons until learning phase setting - # is removed from the public apis. - with backend.deprecated_internal_learning_phase_scope(0): - with utils.keras_option_scope(save_traces): - saved_nodes, node_paths = save_lib.save_and_return_nodes( - model, filepath, signatures, options) - - # Save all metadata to a separate file in the SavedModel directory. - metadata = generate_keras_metadata(saved_nodes, node_paths) - - with tf.io.gfile.GFile( - tf.io.gfile.join(filepath, constants.SAVED_METADATA_PATH), "wb") as w: - w.write(metadata.SerializeToString(deterministic=True)) - - if not include_optimizer: - model.optimizer = orig_optimizer - - -def generate_keras_metadata(saved_nodes, node_paths): - """Constructs a KerasMetadata proto with the metadata of each keras object.""" - metadata = saved_metadata_pb2.SavedMetadata() - for node_id, node in enumerate(saved_nodes): - if isinstance(node, base_layer.Layer): - path = node_paths[node] - if not path: - node_path = "root" - else: - node_path = "root.{}".format( - ".".join([ref.name for ref in path])) - - metadata.nodes.add( - node_id=node_id, - node_path=node_path, - version=versions_pb2.VersionDef( - producer=2, min_consumer=1, bad_consumers=[]), - identifier=node._object_identifier, # pylint: disable=protected-access - metadata=node._tracking_metadata) # pylint: disable=protected-access - - # Log warning if the node's class name conflicts with a Keras built-in - # object. - class_name = node.__class__.__name__ - builtin_layer = serialization.get_builtin_layer(class_name) - if builtin_layer: - if not isinstance(node, builtin_layer): - logging.warning( - "%s has the same name '%s' as a built-in Keras " - "object. Consider renaming %s to avoid naming " - "conflicts when loading with " - "`tf.keras.models.load_model`. If renaming is not possible, pass " - "the object in the `custom_objects` parameter of the load " - "function.", node, class_name, node.__class__) - - return metadata diff --git a/keras/saving/saved_model/save_impl.py b/keras/saving/saved_model/save_impl.py deleted file mode 100644 index ac980ef4253a..000000000000 --- a/keras/saving/saved_model/save_impl.py +++ /dev/null @@ -1,731 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Keras SavedModel serialization. - -TODO (kathywu): Move to layer_serialization.py. Some model-specific logic should -go to model_serialization.py. -""" - -import functools -import threading -import weakref - -from keras import backend -from keras.engine import base_layer_utils -from keras.engine import input_spec -from keras.mixed_precision import autocast_variable -from keras.saving import saving_utils -from keras.saving.saved_model import constants -from keras.saving.saved_model import load as keras_load -from keras.saving.saved_model import serialized_attributes -from keras.saving.saved_model import utils -from keras.utils import layer_utils -from keras.utils import tf_contextlib -from keras.utils import tf_utils -from keras.utils import version_utils -from keras.utils.generic_utils import LazyLoader -import tensorflow.compat.v1.logging as logging -import tensorflow.compat.v2 as tf - -# To avoid circular dependencies between keras/engine and keras/saving, -# code in keras/saving must delay imports. - -# TODO(b/134426265): Switch back to single-quotes to match the rest of the file -# once the issue with copybara is fixed. -# pylint:disable=g-inconsistent-quotes -base_layer = LazyLoader('base_layer', globals(), 'keras.engine.base_layer') -metrics = LazyLoader('metrics', globals(), 'keras.metrics') -input_layer = LazyLoader('input_layer', globals(), 'keras.engine.input_layer') -training_lib = LazyLoader('training_lib', globals(), 'keras.engine.training') -sequential_lib = LazyLoader('sequential_lib', globals(), - 'keras.engine.sequential') -# pylint:enable=g-inconsistent-quotes - - -def should_skip_serialization(layer): - """Skip serializing extra objects and functions if layer inputs aren't set.""" - saved_model_input_spec_set = (isinstance(layer, training_lib.Model) and - layer._saved_model_inputs_spec is not None) # pylint: disable=protected-access - if not layer.built and not saved_model_input_spec_set: - logging.warning('Skipping full serialization of Keras layer {}, because ' - 'it is not built.'.format(layer)) - return True - return False - - -def _filter_shards(variables): - return [var for var in variables if not hasattr(var, '_sharded_container')] - - -def wrap_layer_objects(layer, serialization_cache): - """Returns extra trackable objects to attach to the serialized layer. - - Args: - layer: Keras Layer object. - serialization_cache: Dictionary shared between all objects during - serialization. - - Returns: - A dictionary containing all checkpointable objects from a - SerializedAttributes object. See LayerAttributes and ModelAttributes for - entire list of objects - """ - # Wrap all regularization losses as tf.functions. - # First, generate list of all regularization losses in this layer and - # sublayers. - all_losses = layer._callable_losses[:] # pylint: disable=protected-access - for child_layer in utils.list_all_layers(layer): - all_losses.extend(child_layer._callable_losses) # pylint: disable=protected-access - # Next, wrap all loss functions as tf.functions. 
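The wrapping step described above, sketched with the public `tf.function` API (the internal helper also assigns a stable per-index name; names below are illustrative):

import tensorflow as tf

example_loss_fn = lambda: 0.01 * tf.reduce_sum(tf.ones((3, 3)))
wrapped_loss = tf.function(example_loss_fn, input_signature=[])
wrapped_loss.get_concrete_function()  # nullary trace, serializable with the layer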
Use the serialization cache - # to store already-wrapped functions. - keras_loss_cache = serialization_cache.setdefault('keras_losses', {}) - wrapped_loss_functions = [] - for loss_fn in all_losses: - if loss_fn in keras_loss_cache: - wrapped_loss_functions.append(keras_loss_cache[loss_fn]) - else: - wrapped_loss = _wrap_unconditional_loss(loss_fn, len(keras_loss_cache)) - keras_loss_cache[loss_fn] = wrapped_loss - wrapped_loss_functions.append(wrapped_loss) - wrapped_layer_losses = [ - keras_loss_cache[fn] for fn in layer._callable_losses[:] # pylint: disable=protected-access - ] - - layer_metrics = tf.__internal__.tracking.wrap( - {m.name: m for m in layer._metrics}) # pylint: disable=protected-access - - # Avoid duplicate creation of shard Variables on loading. - # `layer.variables` will return the shard Variables rather than the - # ShardedVariables (b/224541446), but Keras loading will create new - # ShardedVariables (and thus shard Variables) from Keras metadata if needed. - # There's no need to also save the shard Variables here, so filter them out. - variables = _filter_shards(layer.variables) - trainable_variables = _filter_shards(layer.trainable_variables) - non_trainable_variables = _filter_shards(layer.non_trainable_variables) - return dict( - variables=tf.__internal__.tracking.wrap(variables), - trainable_variables=tf.__internal__.tracking.wrap(trainable_variables), - non_trainable_variables=tf.__internal__.tracking.wrap( - non_trainable_variables), - layers=tf.__internal__.tracking.wrap(utils.list_all_layers(layer)), - metrics=tf.__internal__.tracking.wrap(layer.metrics), - regularization_losses=tf.__internal__.tracking.wrap( - wrapped_loss_functions), - layer_regularization_losses=tf.__internal__.tracking.wrap( - wrapped_layer_losses), - layer_metrics=layer_metrics) - # pylint: disable=protected-access - - -def wrap_layer_functions(layer, serialization_cache): - """Returns dict of wrapped layer call function and losses in tf.functions. - - Args: - layer: Keras Layer object. - serialization_cache: Dictionary shared between all objects during - serialization. - - Returns: - A dictionary containing all keras tf.functions to serialize. See - LayerAttributes and ModelAttributes for the list of all attributes. - """ - # Since Sequential models may be modified in place using model.add() or - # model.pop(), don't use saved functions. - if (isinstance(layer, keras_load.RevivedLayer) and - not isinstance(layer, sequential_lib.Sequential)): - return { - fn_name: getattr(layer.keras_api, fn_name, None) - for fn_name in serialized_attributes.LayerAttributes.all_functions - } - - # Reset the losses of the layer and its children. The call function in each - # child layer is replaced with tf.functions. - original_fns = _replace_child_layer_functions(layer, serialization_cache) - original_losses = _reset_layer_losses(layer) - - # Wrap all the layer call and activity regularizer functions. - - # Use LayerCallCollection to ensure that all layer call functions (__call__, - # call with losses) are traced with the same inputs. - call_collection = LayerCallCollection(layer) - call_fn_with_losses = call_collection.add_function( - _wrap_call_and_conditional_losses(layer), - '{}_layer_call_and_return_conditional_losses'.format(layer.name), - # If any of this layer's child layers use the training arg, the traced - # call functions of this layer will have a training keyword argument. 
If - # the original layer does not expect the training arg, then it will have - # to be removed (by setting `match_layer_training_arg`). - match_layer_training_arg=True) - call_fn = call_collection.add_function( - _extract_outputs_from_fn(layer, call_fn_with_losses), - '{}_layer_call_fn'.format(layer.name), - # Since `call_fn` wraps call_fn_with_losses and not the original call - # function, `match_layer_training_arg` should be set to False. - match_layer_training_arg=False) - - fns = { - 'call_and_return_conditional_losses': call_fn_with_losses, - '__call__': call_fn - } - - if layer._activity_regularizer is not None: # pylint: disable=protected-access - fns['activity_regularizer_fn'] = _wrap_activity_regularizer(layer) - fns['call_and_return_all_conditional_losses'] = ( - call_collection.add_function( - _append_activity_regularizer_loss(layer, call_fn_with_losses, - fns['activity_regularizer_fn']), - '{}_layer_call_and_return_all_conditional_losses'.format( - layer.name), - match_layer_training_arg=False)) - else: - fns['activity_regularizer_fn'] = None - fns['call_and_return_all_conditional_losses'] = call_fn_with_losses - - # Manually trigger traces before restoring the overwritten functions. The - # functions are traced within the layer call context to ensure that layer - # functions (e.g. add_loss) behave as though running in graph mode. - with tracing_scope(): - call_collection.trace_with_input_signature() - with base_layer_utils.call_context().enter( - layer, inputs=None, build_graph=True, training=None, saving=True): - for fn in fns.values(): - if fn is not None and not isinstance(fn, LayerCall): - fn.get_concrete_function() - - # Restore overwritten functions and losses - _restore_child_layer_functions(original_fns) - _restore_layer_losses(original_losses) - - return fns - - -def default_save_signature(layer): - original_losses = _reset_layer_losses(layer) - fn = saving_utils.trace_model_call(layer) - _restore_layer_losses(original_losses) - return fn - - -def _replace_child_layer_functions(layer, serialization_cache): - """Replaces functions in the children layers with wrapped tf.functions. - - This step allows functions from parent layers to reference the wrapped - functions from their children layers instead of retracing the ops. - - This function also resets all losses stored in the layer. These are stored in - the returned dictionary. Use `_restore_child_layer_functions` to restore - the original attributes. - - Args: - layer: Keras Layer object. - serialization_cache: Dictionary shared between all objects during - serialization. - - Returns: - Dictionary mapping layer objects -> original functions and losses: - { Child layer 1: { - 'losses': Original losses, - 'call': Original call function - '_activity_regularizer': Original activity regularizer}, - Child layer 2: ... - } - """ - # pylint: disable=protected-access - original_fns = {} - - def replace_layer_functions(child_layer, serialized_fns): - """Replaces layer call and activity regularizer with wrapped functions.""" - original_fns[child_layer] = { - 'call': child_layer.call, - '_activity_regularizer': child_layer._activity_regularizer - } - with utils.no_automatic_dependency_tracking_scope(child_layer): - try: - child_layer._activity_regularizer = serialized_fns.get( - 'activity_regularizer_fn') - except AttributeError: - # Some layers have an unsettable activity regularizer. 
- pass - child_layer.call = utils.use_wrapped_call( - child_layer, - serialized_fns['call_and_return_conditional_losses'], - child_layer._call_spec, - default_training_value=False) - - def replace_metric_functions(child_layer, serialized_fns): - """Replaces metric functions with wrapped functions.""" - original_fns[child_layer] = { - '__call__': child_layer.__call__, - 'result': child_layer.result, - 'update_state': child_layer.update_state - } - with utils.no_automatic_dependency_tracking_scope(child_layer): - child_layer.__call__ = serialized_fns['__call__'] - child_layer.result = serialized_fns['result'] - child_layer.update_state = serialized_fns['update_state'] - - for child_layer in utils.list_all_layers(layer): - if isinstance(child_layer, input_layer.InputLayer): - continue - - if child_layer not in serialization_cache[constants.KERAS_CACHE_KEY]: - serialized_functions = ( - child_layer._trackable_saved_model_saver._get_serialized_attributes( - serialization_cache).functions) - else: - serialized_functions = ( - serialization_cache[constants.KERAS_CACHE_KEY][child_layer].functions) - if not serialized_functions: - # This indicates either: - # - circular dependency, which means the current layer's functions - # should be wrapped first. - # - Child layer's inputs are not defined, so its functions have not been - # wrapped. In this case, no replacement is necessary so move on to the - # next child. - continue - - if isinstance(child_layer, metrics.Metric): - replace_metric_functions(child_layer, serialized_functions) - else: - replace_layer_functions(child_layer, serialized_functions) - - return original_fns - # pylint: enable=protected-access - - -def _restore_child_layer_functions(original_fns): - """Restores attributes replaced with `_replace_child_layer_functions`.""" - for child_layer, fns in original_fns.items(): - with utils.no_automatic_dependency_tracking_scope(child_layer): - for fn_name, fn in fns.items(): - try: - setattr(child_layer, fn_name, fn) # pylint: disable=protected-access - except AttributeError: - pass # In the case of _activity_regularizer, setting the attribute - # may be disallowed. - - -# pylint: disable=protected-access -def _reset_layer_losses(parent_layer): - """Resets losses of layer and its sublayers, and returns original losses.""" - losses_dict = {} - for layer in utils.list_all_layers_and_sublayers(parent_layer): - losses_dict[layer] = { - 'losses': layer._losses[:], - 'eager_losses': layer._eager_losses[:] - } - with utils.no_automatic_dependency_tracking_scope(layer): - layer._losses = [] - layer._eager_losses = [] - return losses_dict - - -def _restore_layer_losses(losses_dict): - for layer in losses_dict: - with utils.no_automatic_dependency_tracking_scope(layer): - layer._losses = losses_dict[layer]['losses'] - layer._eager_losses = losses_dict[layer]['eager_losses'] - - -# pylint: enable=protected-access - - -class LayerTracingContext(threading.local): - - def __init__(self): - super().__init__() - self.enable_call_tracing = False - self.trace_queue = [] - - -_thread_local_data = LayerTracingContext() - - -@tf_contextlib.contextmanager -def tracing_scope(): - """Enables tracing scope.""" - # This enables the LayerCallCollection's tracing mechanism to trace all call - # functions in the collection. - previous_value = _thread_local_data.enable_call_tracing - previous_queue = _thread_local_data.trace_queue - try: - _thread_local_data.enable_call_tracing = True - _thread_local_data.trace_queue = [] - yield - finally: - # Run traces from the queue. 
- while _thread_local_data.trace_queue: - fn, args, kwargs, training = _thread_local_data.trace_queue.pop() - if training is not None: - with backend.deprecated_internal_learning_phase_scope(training): - fn.get_concrete_function(*args, **kwargs) - else: - fn.get_concrete_function(*args, **kwargs) - _thread_local_data.trace_queue = previous_queue - _thread_local_data.enable_call_tracing = previous_value - - -def add_trace_to_queue(fn, args, kwargs, training=None): - if tracing_enabled(): - _thread_local_data.trace_queue.append( - (fn, args[:], kwargs.copy(), training)) - - -def tracing_enabled(): - """Whether to add extra traces to the queue.""" - return _thread_local_data.enable_call_tracing - - -class LayerCallCollection: - """Groups wrapped layer call functions. - - This is used to ensure that all layer call functions are traced with the same - inputs- - - call - - call_and_return_conditional_losses - - call_and_return_all_conditional_losses - """ - - def __init__(self, layer): - self.layer = layer - - self.layer_call_method = _get_layer_call_method(layer) - self._expects_training_arg = utils.layer_uses_training_bool(layer) - self._call_spec = layer._call_spec # pylint: disable=protected-access - - # Create new call spec if the layer itself does not accept a training arg, - # but one of its child layers does. When this layer's call functions are - # traced, they will be traced with an added `training` keyword argument. - if not self.layer._expects_training_arg and self._expects_training_arg: # pylint: disable=protected-access - arg_spec = utils.set_training_arg_spec(self._call_spec.full_argspec, - False) - self._call_spec = layer_utils.CallFunctionSpec(arg_spec) - - self._layer_inputs = self._get_layer_inputs(layer) - self._functions = weakref.WeakValueDictionary() - - # Get the input argument name from the args. - if self._call_spec.arg_names: - self._input_arg_name = self._call_spec.arg_names[0] - else: - # Layer could be defined with only varargs, in which case use a default - # name. - self._input_arg_name = 'inputs' - - def _get_layer_inputs(self, layer): - """Inspects layer object and returns the inferred input signature. - - Args: - layer: Layer object. - - Returns: - List of possibly nested TensorSpecs of the layer call function inputs in - the form of `(args, kwargs)` - """ - if (isinstance(layer.call, tf.__internal__.function.Function) and - layer.call.input_signature is not None): - return layer.call.input_signature, {} - elif isinstance(layer, training_lib.Model): - return saving_utils.model_call_inputs(layer) - elif (layer.input_spec is not None and - layer._use_input_spec_as_call_signature): # pylint: disable=protected-access - - def to_tensor_spec_or_none(x): - spec = input_spec.to_tensor_spec(x, layer._compute_dtype) # pylint: disable=protected-access - # If the shape is too general (e.g. multiple dimensions are allowed), - # return None so that separate functions can be generated for each - # inferred input signature. - # TODO(b/134962016): currently partial signatures are not supported. - if spec.shape == tf.TensorShape(None): - return None, None - return spec - - input_signature = [ - tf.nest.map_structure(to_tensor_spec_or_none, layer.input_spec) - ] - - return input_signature, {} - else: - return None, None - - def add_trace(self, *args, **kwargs): - """Traces all functions with the same args and kwargs. - - Args: - *args: Positional args passed to the original function. - **kwargs: Keyword args passed to the original function. 
- """ - args = list(args) - kwargs = kwargs.copy() - - for fn in self._functions.values(): - # TODO(kathywu): Replace arguments with broader shapes defined in the - # input signature. - if self._expects_training_arg: - - def trace_with_training(value, fn=fn): - nonlocal args, kwargs - args, kwargs = self._call_spec.set_arg_value( # pylint: disable=protected-access - 'training', value, args, kwargs, inputs_in_args=True) - add_trace_to_queue(fn, args, kwargs, value) - - trace_with_training(True) - trace_with_training(False) - else: - add_trace_to_queue(fn, args, kwargs) - - def training_arg_was_passed(self, args, kwargs): - return self._call_spec.arg_was_passed( # pylint: disable=protected-access - 'training', - args, - kwargs, - inputs_in_args=True) - - def get_training_arg_value(self, args, kwargs): - try: - return self._call_spec.get_arg_value( # pylint: disable=protected-access - 'training', - args, - kwargs, - inputs_in_args=True) - except KeyError: # Training is not in args or kwargs. - return None - - def get_input_arg_value(self, args, kwargs): - return self._call_spec.get_arg_value( # pylint: disable=protected-access - self._input_arg_name, - args, - kwargs, - inputs_in_args=True) - - def _maybe_wrap_with_training_arg(self, call_fn, match_layer_training_arg): - """Wraps call function with added training argument if necessary.""" - if not self.layer._expects_training_arg and self._expects_training_arg: # pylint: disable=protected-access - # Add training arg to wrapper function. # pylint: disable=protected-access - def wrap_with_training_arg(*args, **kwargs): - if match_layer_training_arg: - # Remove the training value, since the original call_fn does not - # expect a training arg. Instead, the training value will be - # propagated using the call context created in LayerCall. - args = list(args) - kwargs = kwargs.copy() - args, kwargs = self._call_spec.set_arg_value( # pylint: disable=protected-access - 'training', None, args, kwargs, inputs_in_args=True, - pop_kwarg_if_none=True) - return call_fn(*args, **kwargs) - - return tf.__internal__.decorator.make_decorator( - target=call_fn, - decorator_func=wrap_with_training_arg, - decorator_argspec=self._call_spec.full_argspec) - - return call_fn - - def add_function(self, call_fn, name, match_layer_training_arg): - """Adds a layer call function to the collection. - - Args: - call_fn: a python function - name: Name of call function - match_layer_training_arg: If True, removes the `training` from the - function arguments when calling `call_fn`. - - Returns: - LayerCall (tf.function) - """ - fn = LayerCall( - self, - self._maybe_wrap_with_training_arg(call_fn, match_layer_training_arg), - name) - self._functions[name] = fn.wrapped_call - return fn - - def trace_with_input_signature(self): - """Trace with the layer/models inferred input signature if possible.""" - if self._layer_inputs[0] is None: - return - - args, kwargs = self._layer_inputs - if self._expects_training_arg: - args, kwargs = self._call_spec.set_arg_value('training', False, args, - kwargs, inputs_in_args=True) - if None not in tf.nest.flatten([args, kwargs]): - # Manually add traces for layers that have keyword arguments and have - # a fully defined input signature. 
- self.add_trace(*args, **kwargs) - - -def _filtered_inputs(inputs): - return list(filter(tf_utils.is_tensor_or_variable, tf.nest.flatten(inputs))) - - -def layer_call_wrapper(call_collection, method, name): - """Ensures layer losses are kept the same, and runs method in call context.""" - - # Create wrapper that deals with losses and call context. - def wrapper(*args, **kwargs): - """Calls method within call context.""" - layer = call_collection.layer - training = None - inputs = _filtered_inputs([args, kwargs]) - # pylint: disable=protected-access - if (args or kwargs) and call_collection.training_arg_was_passed( - args, kwargs): - training = call_collection.get_training_arg_value(args, kwargs) - # pylint: enable=protected-access - original_losses = _reset_layer_losses(layer) - with base_layer_utils.call_context().enter( - layer, inputs=inputs, build_graph=False, training=training, - saving=True): - with autocast_variable.enable_auto_cast_variables( - layer._compute_dtype_object): # pylint: disable=protected-access - ret = method(*args, **kwargs) - _restore_layer_losses(original_losses) - return ret - - # Rename to `name`, since tf.function doesn't have a name argument. Without - # this, all functions returned by this method will be named "call", which - # would be a nightmare to debug. - fn = tf.__internal__.decorator.make_decorator( - target=method, decorator_func=wrapper) - fn.__name__ = name - return fn - - -class LayerCall: - """Function that triggers traces of other functions in the same collection.""" - - def __init__(self, call_collection, call_fn, name): - """Initializes a LayerCall object. - - Args: - call_collection: a LayerCallCollection, which contains the other layer - call functions (e.g. call_with_conditional_losses, call). These - functions should be traced with the same arguments. - call_fn: A call function. - name: Name of the call function. - """ - self.call_collection = call_collection - self.wrapped_call = tf.function( - layer_call_wrapper(call_collection, call_fn, name)) - - def _maybe_trace(self, args, kwargs): - # Trigger traces of other call functions + extra training-arg traces. - if tracing_enabled(): - self.call_collection.add_trace(*args, **kwargs) - - def __call__(self, *args, **kwargs): - self._maybe_trace(args, kwargs) - return self.wrapped_call(*args, **kwargs) - - def get_concrete_function(self, *args, **kwargs): - self._maybe_trace(args, kwargs) - return self.wrapped_call.get_concrete_function(*args, **kwargs) - - -def _wrap_call_and_conditional_losses(layer): - """Wraps call function that returns a tuple of (outputs, losses). - - The losses returned are conditional on the inputs passed to the call function. - Unconditional losses (e.g. weight regularization) are wrapped separately.
- - Args: - layer: a Keras layer object - - Returns: - python call function that returns outputs and conditional losses -- excludes - activity regularizer - """ - # Create function that generates both outputs and losses - layer_call = _get_layer_call_method(layer) - - def call_and_return_conditional_losses(*args, **kwargs): - """Returns layer (call_output, conditional losses) tuple.""" - call_output = layer_call(*args, **kwargs) - if version_utils.is_v1_layer_or_model(layer): - conditional_losses = layer.get_losses_for( - _filtered_inputs([args, kwargs])) - else: - conditional_losses = [ - l for l in layer.losses if not hasattr(l, '_unconditional_loss') - ] - return call_output, conditional_losses - - return _create_call_fn_decorator(layer, call_and_return_conditional_losses) - - -def _extract_outputs_from_fn(layer, call_and_return_conditional_losses): - """Returns a function that returns only call function outputs.""" - if isinstance(layer, keras_load.RevivedLayer): - return layer.keras_api.__call__ # pylint: disable=protected-access - - def call(inputs, *args, **kwargs): - return call_and_return_conditional_losses(inputs, *args, **kwargs)[0] - - return _create_call_fn_decorator(layer, call) - - -def _append_activity_regularizer_loss(layer, call_fn_with_losses, - activity_regularizer_fn): - """Appends activity regularizer loss to losses returned by the wrapped fn.""" - - def fn(inputs, *args, **kwargs): - outputs, losses = call_fn_with_losses(inputs, *args, **kwargs) - losses.append(activity_regularizer_fn(outputs)) - return outputs, losses - - return _create_call_fn_decorator(layer, fn) - - -def _create_call_fn_decorator(layer, wrapped_call): - call_fn = _get_layer_call_method(layer) - fn, arg_spec = utils.maybe_add_training_arg( - layer._call_spec, # pylint: disable=protected-access - wrapped_call, - layer._expects_training_arg, # pylint: disable=protected-access - default_training_value=False) - return tf.__internal__.decorator.make_decorator( - target=call_fn, decorator_func=fn, decorator_argspec=arg_spec) - - -def _wrap_unconditional_loss(loss_fn, index): - """Wraps callable/unconditional loss, returning a serializable function.""" - # Extract original loss function from partial function - fn = loss_fn.args[0] if isinstance(loss_fn, functools.partial) else loss_fn - if isinstance(fn, tf.__internal__.function.Function): - return fn - else: - return tf.__internal__.function.Function( - fn, 'loss_fn_{}'.format(index), input_signature=[]) - - -def _wrap_activity_regularizer(layer): - """Wraps the activity regularizer.""" - # pylint: disable=protected-access - if isinstance(layer._activity_regularizer, tf.__internal__.function.Function): - return layer._activity_regularizer - return tf.__internal__.function.Function( - layer._activity_regularizer, - '{}_activity_regularizer'.format(layer.name), - input_signature=[ - tf.TensorSpec(None, layer._compute_dtype or backend.floatx()) - ]) - # pylint: enable=protected-access - - -def _get_layer_call_method(layer): - if isinstance(layer.call, (tf.__internal__.function.Function)): - return layer.call.python_function - return layer.call diff --git a/keras/saving/saved_model/saved_model_test.py b/keras/saving/saved_model/saved_model_test.py deleted file mode 100644 index 972126691d85..000000000000 --- a/keras/saving/saved_model/saved_model_test.py +++ /dev/null @@ -1,1418 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=protected-access -"""Tests for saving and loading Keras models and layers from SavedModel. - -These should ensure that all layer properties are correctly assigned after -loading from the SavedModel. - -Tests that focus on the model structure should go in revive_test.py -""" - -import os -import shutil -import sys - -from absl.testing import parameterized -import keras -from keras import regularizers -from keras.feature_column.dense_features import DenseFeatures -from keras.protobuf import saved_metadata_pb2 -from keras.protobuf import versions_pb2 -from keras.saving.saved_model import json_utils -from keras.saving.saved_model import load as keras_load -from keras.saving.saved_model import save_impl as keras_save -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -from keras.utils import control_flow_util -from keras.utils import generic_utils -from keras.utils import tf_contextlib -from keras.utils import tf_inspect -import numpy as np -import tensorflow.compat.v2 as tf - -from tensorflow.core.example import example_pb2 -from tensorflow.core.example import feature_pb2 - - -class LayerWithLearningPhase(keras.engine.base_layer.Layer): - - def build(self, input_shape): - self.input_spec = keras.layers.InputSpec(shape=[None] * len(input_shape)) - self.built = True - - def call(self, x, training=None): - if training is None: - training = keras.backend.learning_phase() - output = control_flow_util.smart_cond(training, lambda: x * 0, - lambda: tf.identity(x)) - if not tf.executing_eagerly(): - output._uses_learning_phase = True # pylint: disable=protected-access - return output - - def compute_output_shape(self, input_shape): - return input_shape - - @property - def _use_input_spec_as_call_signature(self): - return True - - -class LayerWithLoss(keras.layers.Layer): - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - return inputs * 2 - - -class LayerWithUpdate(keras.layers.Layer): - - def build(self, _): - self.v = self.add_weight( - 'v', - shape=[], - initializer=keras.initializers.zeros, - trainable=False, - dtype=tf.float32) - - def call(self, inputs, training=True): - if training: - self.add_update(self.v.assign_add(1.)) - return inputs * 2. 
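The registration decorator used just below has a public counterpart; a minimal sketch of the same mechanism (`Doubler` is an illustrative name):

from tensorflow import keras

@keras.utils.register_keras_serializable(package='Testing')
class Doubler(keras.layers.Layer):

    def call(self, inputs):
        # Once registered, instances reload without passing custom_objects.
        return inputs * 2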
- - -@generic_utils.register_keras_serializable('Testing') -class GlobalLayerThatShouldFailIfNotAdded(keras.layers.Layer): - _must_restore_from_config = True - - -@test_combinations.run_all_keras_modes -class TestSavedModelFormatAllModes(test_combinations.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - def _get_model(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.layers[-1].activity_regularizer = regularizers.get('l2') - model.activity_regularizer = regularizers.get('l2') - model.compile( - loss='mse', - optimizer='rmsprop') - def callable_loss(): - return tf.reduce_sum(model.weights[0]) - model.add_loss(callable_loss) - return model - - def _train_model(self, model, use_dataset=False): - x = np.random.random((1, 3)) - y = np.random.random((1, 4)) - - if not tf.__internal__.tf2.enabled(): - # The layer autocast behavior only runs when autocast is enabled, so - # in V1, the numpy inputs still need to be cast to float32. - x = x.astype(np.float32) - y = y.astype(np.float32) - - if use_dataset: - dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(1) - model.fit(dataset) - else: - model.train_on_batch(x, y) - - def _save_and_load(self, model): - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - return loaded - - def _test_evaluation(self, model, loaded): - # Assert that original and loaded models have the same results when called. - self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) - self.assertAllClose(self.evaluate(model.weights), - self.evaluate(loaded.weights)) - - input_arr = tf.constant( - np.random.random((1, 3)).astype(np.float32)) - self.assertAllClose(self.evaluate(model(input_arr)), - self.evaluate(loaded(input_arr))) - # Validate losses. The order of conditional losses may change between the - # model and loaded model, so sort the losses first. 
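An aside on the terminology used throughout this file: a loss added as a zero-argument callable is input-independent ("unconditional"), while a loss tensor computed inside `call` depends on the inputs ("conditional"); the save_impl helpers earlier in this diff separate the two via the `_unconditional_loss` marker. A minimal sketch with a hypothetical layer:

    import tensorflow as tf

    class TwoLossLayer(tf.keras.layers.Layer):
        def build(self, input_shape):
            self.w = self.add_weight('w', shape=[])
            # Zero-argument callable -> unconditional (input-independent) loss.
            self.add_loss(lambda: 0.01 * tf.reduce_sum(self.w))

        def call(self, inputs):
            # Tensor computed from the inputs -> conditional loss.
            self.add_loss(tf.reduce_sum(inputs))
            return inputs

    layer = TwoLossLayer()
    layer(tf.ones((2, 3)))
    print(len(layer.losses))  # 2: one conditional, one unconditional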
- if tf.executing_eagerly(): - self.assertAllClose(sorted(self.evaluate(model.losses)), - sorted(self.evaluate(loaded.losses))) - - @test_combinations.run_with_all_model_types - def test_model_save_and_load(self): - model = self._get_model() - self._train_model(model, use_dataset=False) - loaded = self._save_and_load(model) - self._test_evaluation(model, loaded) - - @test_combinations.run_with_all_model_types - def test_model_save_and_load_dataset(self): - model = self._get_model() - self._train_model(model, use_dataset=True) - loaded = self._save_and_load(model) - self._test_evaluation(model, loaded) - - def test_trainable_weights(self): - """Tests that trainable status of individual weights is preserved.""" - layer = keras.layers.Dense(4, name='custom_layer') - layer.build([None, 3]) - layer.add_weight( - 'extra_weight', shape=[], - initializer=tf.compat.v1.constant_initializer(11), - trainable=True) - layer.add_weight( - 'extra_weight_2', shape=[], - initializer=tf.compat.v1.constant_initializer(12), - trainable=False) - model = keras.Sequential([keras.Input([3,]), layer]) - - saved_model_dir = self._save_model_dir() - self.evaluate(tf.compat.v1.variables_initializer(layer.variables)) - model.save(saved_model_dir, save_format='tf') - loaded_model = keras_load.load(saved_model_dir) - self.evaluate(tf.compat.v1.variables_initializer(loaded_model.variables)) - - loaded = loaded_model.layers[-1] - - equal_attrs = ['name', '_expects_training_arg', 'trainable'] - for attr in equal_attrs: - self.assertEqual(getattr(layer, attr), getattr(loaded, attr)) - - all_close = ['weights', 'trainable_weights', 'non_trainable_weights'] - for attr in all_close: - self.assertAllClose(self.evaluate(getattr(layer, attr)), - self.evaluate(getattr(loaded, attr))) - - @test_combinations.run_with_all_model_types - def test_trainable_layers(self): - """Tests that trainable status of individual layers is preserved.""" - model = self._get_model() - # Set the last layer to *not* be trainable. - model.layers[-1].trainable = False - self._train_model(model, use_dataset=True) - loaded = self._save_and_load(model) - - self._test_evaluation(model, loaded) - self.assertFalse(model.layers[-1].trainable) - self.assertFalse(loaded.layers[-1].trainable) - - def test_trainable_custom_model_false(self): - """Tests that overall False trainable status of Model is preserved.""" - # Set all layers to *not* be trainable. - model = test_utils.SmallSubclassMLP(1, 4, trainable=False) - model.compile(loss='mse', optimizer='rmsprop') - self._train_model(model, use_dataset=False) - loaded = self._save_and_load(model) - - self._test_evaluation(model, loaded) - self.assertEmpty(model.trainable_variables) - self.assertEmpty(loaded.trainable_variables) - - def test_maintains_losses(self): - """Tests that the layer losses do not change before and after export.""" - model = keras.models.Sequential([LayerWithLoss()]) - model.compile( - loss='mse', - optimizer='rmsprop') - input_arr = np.random.random((1, 3)) - target_arr = np.random.random((1, 3)) - - # Test that symbolic losses are maintained (train_on_batch saves symbolic - # losses.) - model.train_on_batch(input_arr, target_arr) - previous_losses = model.losses[:] - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - with previous_losses[0].graph.as_default(): - # If we try to compare symbolic Tensors in eager mode, assertAllEqual will - # return False even if they are the same Tensor.
- self.assertEqual(previous_losses, model.losses) - - if tf.executing_eagerly(): - # Test that eager losses are maintained. - model(input_arr) # Calls model eagerly, creating eager losses. - previous_losses = model.losses[:] - model.save(saved_model_dir, save_format='tf') - self.assertAllEqual(previous_losses, model.losses) - - def test_layer_with_learning_phase(self): - layer = LayerWithLearningPhase() - layer.build([None, None]) - saved_model_dir = self._save_model_dir() - model = test_utils.get_model_from_layers( - [layer], input_shape=[None], model_type='functional') - model.save(saved_model_dir, save_format='tf') - loaded_model = keras_load.load(saved_model_dir) - loaded = loaded_model.layers[-1] - input_arr = tf.ones((4, 3)) - - # Run the layer, and use the keras backend learning phase - keras.backend.set_learning_phase(0) - self.assertAllEqual(input_arr, loaded(input_arr)) - keras.backend.set_learning_phase(1) - self.assertAllEqual(tf.zeros((4, 3)), loaded(input_arr)) - - # Run the layer while explicitly setting the training argument - self.assertAllEqual( - input_arr, loaded(input_arr, training=tf.constant(False))) - self.assertAllEqual( - tf.zeros((4, 3)), - loaded(input_arr, training=tf.constant(True))) - - @test_combinations.run_with_all_model_types - def test_standard_loader(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.activity_regularizer = regularizers.get('l2') - def eager_loss(): - return tf.reduce_sum(model.weights[0]) - model.add_loss(eager_loss) - - # Call predict to ensure that all layers are built and inputs are set. - model.predict(np.random.random((1, 3)).astype(np.float32)) - saved_model_dir = self._save_model_dir() - - model.save(saved_model_dir, save_format='tf') - - loaded = tf.saved_model.load(saved_model_dir) - self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) - all_close = ['variables', 'trainable_variables', - 'non_trainable_variables'] - for attr in all_close: - self.assertAllClose(self.evaluate(getattr(model, attr)), - self.evaluate(getattr(loaded.keras_api, attr))) - self.assertLen(loaded.regularization_losses, 1) - expected_layers = len(model.layers) - self.assertEqual(expected_layers, len(loaded.keras_api.layers)) - input_arr = tf.ones((4, 3)) - self.assertAllClose(self.evaluate(model(input_arr)), - self.evaluate(loaded(input_arr, training=False))) - - @test_combinations.run_with_all_model_types - def test_compiled_model(self): - # TODO(b/134519980): Issue with model.fit if the model call function uses - # a tf.function (Graph mode only). - if not tf.executing_eagerly(): - return - - input_arr = np.random.random((1, 3)) - target_arr = np.random.random((1, 4)) - - model = test_utils.get_small_mlp(1, 4, input_dim=3) - expected_predict = model.predict(input_arr) - - # Compile and save model. - model.compile('rmsprop', 'mse') - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - loaded = keras_load.load(saved_model_dir) - actual_predict = loaded.predict(input_arr) - self.assertAllClose(expected_predict, actual_predict) - - loss_before = loaded.evaluate(input_arr, target_arr) - loaded.fit(input_arr, target_arr) - loss_after = loaded.evaluate(input_arr, target_arr) - self.assertLess(loss_after, loss_before) - predict = loaded.predict(input_arr) - - ckpt_path = os.path.join(self.get_temp_dir(), 'weights') - loaded.save_weights(ckpt_path) - - # Ensure that the checkpoint is compatible with the original model. 
- model.load_weights(ckpt_path) - self.assertAllClose(predict, model.predict(input_arr)) - - def test_metadata_input_spec(self): - class LayerWithNestedSpec(keras.layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = { - 'a': keras.layers.InputSpec(max_ndim=3, axes={-1: 2}), - 'b': keras.layers.InputSpec(shape=(None, 2, 3), dtype='int32')} - - @property - def _use_input_spec_as_call_signature(self): - return True - - layer = LayerWithNestedSpec() - saved_model_dir = self._save_model_dir() - model = test_utils.get_model_from_layers( - [layer], model_type='subclass') - model({'a': tf.constant([[2, 4]]), - 'b': tf.ones([1, 2, 3], dtype=tf.int32)}) - model.save(saved_model_dir, save_format='tf') - loaded_model = keras_load.load(saved_model_dir) - loaded = loaded_model.layers[-1] - self.assertEqual(3, loaded.input_spec['a'].max_ndim) - self.assertEqual({-1: 2}, loaded.input_spec['a'].axes) - self.assertAllEqual([None, 2, 3], loaded.input_spec['b'].shape) - self.assertEqual('int32', loaded.input_spec['b'].dtype) - - def test_must_restore_from_config_fails_if_layer_is_not_in_scope(self): - - class LayerThatShouldFailIfNotAdded(keras.layers.Layer): - _must_restore_from_config = True - - layer = LayerThatShouldFailIfNotAdded() - saved_model_dir = self._save_model_dir() - model = test_utils.get_model_from_layers( - [layer], input_shape=[3], model_type='functional') - model.save(saved_model_dir, save_format='tf') - with self.assertRaisesRegex(ValueError, - 'Unknown layer: LayerThatShouldFailIfNotAdded'): - _ = keras_load.load(saved_model_dir) - - def test_must_restore_from_config_custom_object_scope(self): - - class LayerThatShouldFailIfNotAdded(keras.layers.Layer): - _must_restore_from_config = True - - layer = LayerThatShouldFailIfNotAdded() - model = test_utils.get_model_from_layers( - [layer], input_shape=[3], model_type='functional') - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - with generic_utils.CustomObjectScope( - {'LayerThatShouldFailIfNotAdded': LayerThatShouldFailIfNotAdded}): - _ = keras_load.load(saved_model_dir) - - def test_must_restore_from_config_registration(self): - layer = GlobalLayerThatShouldFailIfNotAdded() - saved_model_dir = self._save_model_dir() - model = test_utils.get_model_from_layers( - [layer], input_shape=[3], model_type='functional') - model.save(saved_model_dir, save_format='tf') - _ = keras_load.load(saved_model_dir) - - def test_multi_input_model(self): - input_1 = keras.layers.Input(shape=(3,)) - input_2 = keras.layers.Input(shape=(5,)) - model = keras.Model([input_1, input_2], [input_1, input_2]) - saved_model_dir = self._save_model_dir() - - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - input_arr_1 = np.random.random((1, 3)).astype('float32') - input_arr_2 = np.random.random((1, 5)).astype('float32') - - outputs = loaded([input_arr_1, input_arr_2]) - self.assertAllEqual(input_arr_1, outputs[0]) - self.assertAllEqual(input_arr_2, outputs[1]) - - def test_revived_sequential(self): - model = keras.models.Sequential() - model.add(keras.layers.Dense(5, input_shape=(3,), - kernel_regularizer=regularizers.get('l2'))) - model.add(keras.layers.Dense(2, kernel_regularizer=regularizers.get('l2'))) - - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - - self.assertLen(loaded.layers, 2) - 
self.assertLen(loaded.losses, 2) - - loaded.pop() - - self.assertLen(loaded.layers, 1) - self.assertLen(loaded.losses, 1) - - loaded.add(keras.layers.Dense(2, kernel_regularizer=regularizers.get('l2'))) - - self.assertLen(loaded.layers, 2) - self.assertLen(loaded.losses, 2) - - def testBatchNormUpdates(self): - model = keras.models.Sequential( - keras.layers.BatchNormalization(input_shape=(1,))) - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) - saved_model_dir = self._save_model_dir() - - with self.captureWritesToStream(sys.stderr) as captured_logs: - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - - # Assert that saving does not log deprecation warnings - # (even if it needs to set learning phase for compat reasons) - if tf.executing_eagerly(): - self.assertNotIn('deprecated', captured_logs.contents()) - - input_arr = tf.constant([[11], [12], [13]], dtype=tf.float32) - input_arr2 = tf.constant([[14], [15], [16]], dtype=tf.float32) - self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0]) - - self.evaluate(loaded(input_arr, training=True)) - if not tf.executing_eagerly(): - self.evaluate(loaded.get_updates_for(input_arr)) - self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) - - self.evaluate(loaded(input_arr2, training=False)) - if not tf.executing_eagerly(): - self.evaluate(loaded.get_updates_for(input_arr2)) - self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) - - def testDisablingBatchNormTrainableBeforeSaving(self): - # We disable trainable on the batchnorm layers before saving - model = keras.models.Sequential( - keras.layers.BatchNormalization(input_shape=(1,))) - model.trainable = False - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) - input_arr = tf.constant([[11], [12], [13]], dtype=tf.float32) - input_arr2 = tf.constant([[14], [15], [16]], dtype=tf.float32) - self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0]) - - # Trainable should still be disabled after loading - self.evaluate(loaded(input_arr, training=True)) - if not tf.executing_eagerly(): - self.evaluate(loaded.get_updates_for(input_arr)) - self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.0]) - - # Re-enabling trainable on the loaded model should cause the batchnorm - # layer to start training again. - # Note: this only works in v2. 
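To make the note above concrete, a standalone sketch of the v2 trainable/BatchNormalization interaction the surrounding assertions rely on (the 0.12 follows from the default momentum of 0.99 and a batch mean of 12):

    import tensorflow as tf

    bn = tf.keras.layers.BatchNormalization(input_shape=(1,))
    model = tf.keras.Sequential([bn])
    model.trainable = False

    x = tf.constant([[11.], [12.], [13.]])
    model(x, training=True)        # trainable=False: runs in inference mode
    print(bn.moving_mean.numpy())  # [0.] -- statistics were not updated

    model.trainable = True
    model(x, training=True)        # updates resume: 0.99 * 0 + 0.01 * 12
    print(bn.moving_mean.numpy())  # ~[0.12]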
- if tf.executing_eagerly(): - loaded.trainable = True - self.evaluate(loaded(input_arr, training=True)) - self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) - - self.evaluate(loaded(input_arr2, training=False)) - self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) - - def testSaveWithSignatures(self): - model = keras.models.Sequential() - model.add(keras.layers.Dense(5, input_shape=(3,), - kernel_regularizer=regularizers.get('l2'))) - model.add(keras.layers.Dropout(0.5)) - model.add(keras.layers.Dense(4, kernel_regularizer=regularizers.get('l2'))) - - input_arr = np.random.random((2, 3)) - target_arr = np.random.random((2, 4)) - - model.compile( - loss='mse', - optimizer='rmsprop') - model.train_on_batch(input_arr, target_arr) - - @tf.function(input_signature=[tf.TensorSpec((None, 3))]) - def predict(inputs): - return {'predictions': model(inputs)} - - feature_configs = { - 'inputs': tf.io.FixedLenFeature( - shape=[2, 3], dtype=tf.float32)} - - @tf.function( - input_signature=[tf.TensorSpec([None], tf.string)]) - def parse_and_predict(examples): - features = tf.compat.v1.parse_single_example(examples[0], feature_configs) - return {'predictions': model(features['inputs']), - 'layer_1_outputs': model.layers[0](features['inputs'])} - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf', signatures={ - 'predict': predict, - 'parse_and_predict': parse_and_predict}) - - loaded = keras_load.load(saved_model_dir) - - self.assertAllClose( - model.predict(input_arr), - loaded.signatures['predict'](tf.convert_to_tensor( - input_arr.astype('float32')))['predictions']) - - feature = { - 'inputs': feature_pb2.Feature( - float_list=feature_pb2.FloatList( - value=input_arr.astype('float32').flatten()))} - example = example_pb2.Example( - features=feature_pb2.Features(feature=feature)) - outputs = loaded.signatures['parse_and_predict']( - tf.convert_to_tensor([example.SerializeToString()])) - self.assertAllClose(model.predict(input_arr), outputs['predictions']) - self.assertAllClose(model.layers[0](input_arr), outputs['layer_1_outputs']) - - def testTrainingDefaults(self): - def assert_training_default(fn, default_value): - arg_spec = tf_inspect.getfullargspec(fn) - fn_defaults = arg_spec.defaults or [] - defaults = dict() - # The call arg defaults are an n-tuple of the last n elements of the args - # list. (n = # of elements that have a default argument) - for i in range(-1 * len(fn_defaults), 0): - defaults[arg_spec.args[i]] = fn_defaults[i] - # The default training arg will be any (non-None) default specified in the - # method signature, or None if no value is specified.
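To make the argspec bookkeeping in `assert_training_default` concrete, a small self-contained illustration with a hypothetical call signature (positional defaults only; the keyword-only defaults are merged in the line that follows):

    import inspect

    def call(self, inputs, training=False, mask=None):
        pass

    spec = inspect.getfullargspec(call)
    # spec.args = ['self', 'inputs', 'training', 'mask']
    # spec.defaults = (False, None): the n defaults pair with the last n args.
    defaults = dict(zip(spec.args[-len(spec.defaults):], spec.defaults))
    print(defaults)  # {'training': False, 'mask': None}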
- defaults.update(arg_spec.kwonlydefaults or {}) - self.assertEqual(defaults['training'], default_value) - - class LayerWithTrainingRequiredArg(keras.engine.base_layer.Layer): - - def call(self, inputs, training): - return control_flow_util.smart_cond(training, lambda: inputs * 0, - lambda: tf.identity(inputs)) - - class LayerWithTrainingDefaultTrue(keras.engine.base_layer.Layer): - - def call(self, inputs, training=True): - return control_flow_util.smart_cond(training, lambda: inputs * 0, - lambda: tf.identity(inputs)) - - class Model(keras.models.Model): - - def __init__(self): - super().__init__() - self.layer_with_training_default_none = LayerWithLearningPhase() - self.layer_with_training_default_true = LayerWithTrainingDefaultTrue() - self.layer_with_required_training_arg = LayerWithTrainingRequiredArg() - - def call(self, inputs): - x = self.layer_with_training_default_none(inputs) - x += self.layer_with_training_default_true(inputs) - x += self.layer_with_required_training_arg(inputs, False) - return x - - model = Model() - # Build and set model inputs - model.predict(np.ones([1, 3]).astype('float32')) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - load = tf.saved_model.load(saved_model_dir) - - # Ensure that the Keras loader is able to load and build the model. - _ = keras_load.load(saved_model_dir) - - assert_training_default(load.__call__, False) - assert_training_default( - load.layer_with_training_default_none.__call__, False) - assert_training_default( - load.layer_with_training_default_true.__call__, True) - - # Assert that there are no defaults for layer with required training arg - arg_spec = tf_inspect.getfullargspec( - load.layer_with_required_training_arg.__call__) - self.assertFalse(arg_spec.defaults) # defaults is None or empty - - def testTraceModelWithKwarg(self): - class Model(keras.models.Model): - - def call(self, inputs, keyword=None): - return tf.identity(inputs) - - model = Model() - prediction = model.predict(np.ones([1, 3]).astype('float32')) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - with keras.utils.generic_utils.custom_object_scope({'Model': Model}): - loaded = keras_load.load(saved_model_dir) - self.assertAllClose(prediction, - loaded.predict(np.ones([1, 3]).astype('float32'))) - - loaded_without_scope = keras_load.load(saved_model_dir) - if tf.__internal__.tf2.enabled(): - with self.assertRaises(NotImplementedError): - loaded_without_scope.predict(np.ones([1, 3]).astype('float32')) - - def testFeatureColumns(self): - # TODO(b/120099662): Error with table initialization with Keras models in - # graph mode. 
- if tf.executing_eagerly(): - numeric = tf.feature_column.numeric_column('a') - bucketized = tf.feature_column.bucketized_column( - numeric, boundaries=[5, 10, 15]) - cat_vocab = tf.feature_column.categorical_column_with_vocabulary_list( - 'b', ['1', '2', '3']) - one_hot = tf.feature_column.indicator_column(cat_vocab) - embedding = tf.feature_column.embedding_column(cat_vocab, dimension=8) - feature_layer = DenseFeatures([bucketized, one_hot, embedding]) - model = keras.models.Sequential(feature_layer) - - features = {'a': np.array([13, 15]), 'b': np.array(['1', '2'])} - predictions = model.predict(features) - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - loaded_predictions = loaded.predict(features) - self.assertAllClose(predictions, loaded_predictions) - - def testSaveTensorKwarg(self): - - class LayerWithTensorKwarg(keras.layers.Layer): - - def call(self, inputs, tensor=None): - if tensor is not None: - return inputs * tf.cast(tensor, tf.float32) - else: - return inputs - - t = self.evaluate(tf.sequence_mask(1)) - inputs = keras.layers.Input(shape=(3)) - model = keras.models.Model(inputs, LayerWithTensorKwarg()(inputs, t)) - - input_arr = np.random.random((1, 3)) - predictions = model.predict(input_arr) - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - loaded_predictions = loaded.predict(input_arr) - self.assertAllClose(predictions, loaded_predictions) - - def testModelWithTfFunctionCall(self): - class Subclass(keras.models.Model): - - @tf.function - def call(self, inputs, training=False): - return inputs * tf.cast(training, tf.float32) - - model = Subclass() - model.predict(tf.ones((1, 2)), steps=1) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - self.assertAllEqual( - [[1, 5]], - self.evaluate(loaded(tf.constant([[1, 5.]]), training=True))) - self.assertAllEqual( - [[0, 0]], - self.evaluate(loaded(tf.constant([[1, 5.]]), training=False))) - - def testReviveFunctionalModel(self): - - class CustomAdd(keras.layers.Add): - - def build(self, input_shape): - self.w = self.add_weight('w', shape=[]) - super().build(input_shape) - - def call(self, inputs): - outputs = super().call(inputs) - return outputs * self.w - - input1 = keras.layers.Input(shape=(None, 3), name='input_1') - input2 = keras.layers.Input(shape=(None, 3), name='input_2') - - d = keras.layers.Dense(4, name='dense_with_two_inbound_nodes') - output1 = d(input1) - output2 = d(input2) - - # Use a custom layer in this model to ensure that layers aren't being - # recreated directly from the config. 
- outputs = CustomAdd(name='custom')([output1, output2]) - model = keras.models.Model([input1, input2], outputs, name='save_model') - - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - loaded = keras_load.load(saved_model_dir) - self.assertEqual('save_model', loaded.name) - self.assertLen( - loaded.get_layer('dense_with_two_inbound_nodes')._inbound_nodes, 2) - self.assertEqual('CustomAdd', type(loaded.get_layer('custom')).__name__) - self.assertLen(loaded.get_layer('custom').weights, 1) - - def _testAddUpdate(self, scope): - with scope: - layer_with_update = LayerWithUpdate() - model = test_utils.get_model_from_layers([layer_with_update], - input_shape=(3,)) - - x = np.ones((10, 3)) - if test_utils.get_model_type() == 'subclass': - model.predict(x, batch_size=10) - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - loaded = keras_load.load(saved_model_dir) - loaded_layer = loaded.layers[-1] - self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) - self.assertEqual(self.evaluate(loaded_layer.v), 0.) - - loaded.compile('sgd', 'mse') - loaded.fit(x, x, batch_size=10) - self.assertEqual(self.evaluate(loaded_layer.v), 1.) - - @test_combinations.run_with_all_model_types - def testSaveLayerWithUpdates(self): - @tf_contextlib.contextmanager - def nullcontextmanager(): - yield - self._testAddUpdate(nullcontextmanager()) - - @test_combinations.run_with_all_model_types - def testSaveInStrategyScope(self): - self._testAddUpdate(tf.distribute.MirroredStrategy().scope()) - - def testSaveTimeDistributedLayer(self): - model = keras.Sequential([ - keras.layers.TimeDistributed( - keras.layers.Dense(1, kernel_regularizer=regularizers.get('l2')), - input_shape=(None, 1))]) - predictions = model.predict_on_batch(tf.ones((3, 2, 1))) - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - loaded = keras_load.load(saved_model_dir) - self.assertAllClose(loaded.predict_on_batch(tf.ones((3, 2, 1))), - predictions) - - @parameterized.named_parameters([ - ('with_unrolling', True), - ('no_unrolling', False) - ]) - def testSaveStatefulRNN(self, unroll): - batch = 12 - timesteps = 10 - input_dim = 8 - input_arr = np.ones((batch, timesteps, input_dim)).astype('float32') - - cells = [keras.layers.LSTMCell(32), keras.layers.LSTMCell(64)] - if unroll: - x = keras.Input(batch_shape=(batch, timesteps, input_dim)) - else: - x = keras.Input(batch_shape=(batch, None, input_dim)) - layer = keras.layers.RNN(cells, stateful=True, unroll=unroll) - y = layer(x) - - model = keras.Model(x, y) - model.compile('rmsprop', 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch( - np.zeros((batch, timesteps, input_dim)).astype('float32'), - np.zeros((batch, 64)).astype('float32')) - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - loaded = keras_load.load(saved_model_dir) - loaded_layer = loaded.layers[1] - - if not tf.executing_eagerly(): - keras.backend.get_session() # force variable initialization - - self.assertAllClose(layer.states, loaded_layer.states) - self.assertAllClose(model(input_arr), loaded(input_arr)) - - def testSaveBidirectionalLSTM(self): - # Make sure that the input spec of an unrolled RNN is not used when wrapped - # in a Bidirectional layer. 
https://github.com/keras-team/keras/issues/15454 - input_layer = keras.Input( - batch_input_shape=(1, 15, 128), name='input', dtype=tf.float32) - lstm_layer = keras.layers.Bidirectional( - keras.layers.LSTM( - units=64, - name='lstm', - dropout=0.2, - trainable=False, - unroll=True, - ) - ) - output_layer = lstm_layer(input_layer) - model = keras.Model(input_layer, output_layer) - saved_model_dir = self._save_model_dir() - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - input_arr = np.random.random((1, 15, 128)).astype('float32') - self.assertAllClose(model(input_arr), loaded(input_arr)) - - @parameterized.named_parameters([('stateful', True), ('stateless', False)]) - def testSaveConvLSTM2D(self, stateful): - data_format = 'channels_first' - batch, timesteps, channels, rows, cols = 12, 10, 8, 4, 4 - input_arr = np.ones( - (batch, timesteps, channels, rows, cols)).astype('float32') - layer = keras.layers.ConvLSTM2D( - filters=16, kernel_size=(1, 1), data_format=data_format, - stateful=stateful) - x = keras.Input(batch_shape=(batch, timesteps, channels, rows, cols)) - y = layer(x) - model = keras.Model(x, y) - - predict_1 = model(input_arr) - self.evaluate([v.initializer for v in model.variables]) - saved_model_dir = self._save_model_dir() - - model.save(saved_model_dir, save_format='tf') - del model - - loaded = keras_load.load(saved_model_dir) - self.evaluate([v.initializer for v in loaded.variables]) - if stateful: - loaded.reset_states() - predict_2 = loaded(input_arr) - self.assertAllClose(predict_1, predict_2) - - def testSaveWithRaggedInputs(self): - - class EmbeddingMerger(keras.layers.Layer): - - def __init__(self, list_features, **kwargs): - super().__init__(**kwargs) - self._supports_ragged_inputs = True - self.embeddings = { - feature: keras.layers.Embedding(10, 3) for feature in list_features} - self.mean = keras.layers.Lambda( - tf.reduce_mean, arguments=dict(axis=1)) - - def call(self, inputs): - tensors = [self.embeddings[col](inputs[col]) for col in inputs] - tensors = [self.mean(inp) for inp in tensors] - return keras.layers.Add()(tensors) - - list_features = ['feature_1', 'feature_2'] - feature_1 = tf.ragged.constant([[0.], [1, 3]]) - feature_2 = tf.ragged.constant([[1., 2], [4]]) - f = {'feature_1': feature_1, - 'feature_2': feature_2} - f_inputs = { - 'feature_1': keras.Input(shape=(None,), name='feature_1', ragged=True), - 'feature_2': keras.Input(shape=(None,), name='feature_2', ragged=True)} - - out = EmbeddingMerger(list_features)(f_inputs) - model = keras.Model(f_inputs, out) - self.evaluate(tf.compat.v1.variables_initializer(model.variables)) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - loaded = keras_load.load(saved_model_dir) - self.evaluate(tf.compat.v1.variables_initializer(loaded.variables)) - self.assertAllClose(model.predict(f), loaded.predict(f)) - - def testSaveMultipleInputs(self): - class CustomLayer(keras.layers.Layer): - - def call(self, *input_list): - self.add_loss(input_list[-2] * 2) - return sum(input_list[:-1]) # The test's last input is a non-tensor arg - - class CustomModel(keras.Model): - - def build(self, _): - self.layer = CustomLayer() - - def call(self, *inputs): - inputs = list(inputs) - inputs.append(object()) # Test that the layer handles non-tensor inputs - return self.layer(*inputs) - - model = CustomModel() - inp = [tf.constant(i, shape=[1, 1], dtype=tf.float32) - for i in 
range(1, 5)] - expected = model(*inp) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - actual = loaded(*inp) - self.assertAllEqual(self.evaluate(expected), - self.evaluate(actual)) - - def testSaveMultipleInputsWithTraining(self): - - class CustomModel(keras.Model): - def call(self, input_1, training, input_2): - if training: - return input_1 - else: - return input_2 - - inp1 = tf.constant(1., shape=[1]) - inp2 = tf.constant(2., shape=[1]) - - model = CustomModel() - self.assertEqual(self.evaluate(model(inp1, True, inp2)), 1.) - self.assertEqual(self.evaluate(model(inp1, False, inp2)), 2.) - - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - loaded = keras_load.load(saved_model_dir) - self.assertEqual(self.evaluate(loaded(inp1, True, inp2)), 1.) - self.assertEqual(self.evaluate(loaded(inp1, False, inp2)), 2.) - - def test_wrapped_layer_training(self): - class Custom(keras.models.Model): - - def __init__(self): - super().__init__() - self.layer = LayerWithLearningPhase() - - def call(self, inputs): - return self.layer(inputs) - model = Custom() - x = tf.constant(1., shape=[1, 1]) - expected_default = model(x) - expected_training_true = model(x, training=True) - expected_training_false = model(x, training=False) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - def assert_loaded_model(loaded): - actual_default = loaded(x) - actual_training_true = loaded(x, training=True) - actual_training_false = loaded(x, training=False) - self.assertAllClose( - [expected_default, expected_training_true, expected_training_false], - [actual_default, actual_training_true, actual_training_false]) - - assert_loaded_model(keras_load.load(saved_model_dir)) - assert_loaded_model(tf.saved_model.load(saved_model_dir)) - - -class TestSavedModelFormat(tf.test.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - def test_load_with_custom_model_and_layer(self): - - class CustomLayer(keras.layers.Layer): - - def __call__(self, inputs): - return inputs - - class Model(keras.models.Model): - - def __init__(self): - super().__init__() - self.layer = CustomLayer() - - @tf.function( - input_signature=[tf.TensorSpec([None, 1])]) - def call(self, inputs): - return self.layer(inputs) - - model = Model() - inp = tf.constant([[1.0]]) - model(inp) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - # Even if the `CustomLayer` is not provided in `custom_object_scope`, - # `Model` still has that reference. - with keras.utils.generic_utils.custom_object_scope({'Model': Model}): - loaded = keras_load.load(saved_model_dir) - self.assertAllEqual([[1.0]], self.evaluate(loaded(inp))) - self.assertAllEqual([[1.0]], self.evaluate(loaded.layer(inp))) - self.assertIsInstance(loaded.layer, CustomLayer) - - # If `CustomLayer` is provided in `custom_object_scope`, it should of - # course use that custom class. 
- with keras.utils.generic_utils.custom_object_scope({ - 'Model': Model, - 'CustomLayer': CustomLayer - }): - loaded = keras_load.load(saved_model_dir) - self.assertAllEqual([[1.0]], self.evaluate(loaded(inp))) - self.assertAllEqual([[1.0]], self.evaluate(loaded.layer(inp))) - self.assertIsInstance(loaded.layer, CustomLayer) - - # If the symbol is no longer available, loading should raise an error. - del CustomLayer - with keras.utils.generic_utils.custom_object_scope({'Model': Model}): - with self.assertRaisesRegex( - NameError, 'free variable \'CustomLayer\' referenced ' - 'before assignment in enclosing scope'): - loaded = keras_load.load(saved_model_dir) - - def test_save_without_tracing(self): - - class DoNotTrace(keras.layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = keras.layers.InputSpec(shape=[None]) - self.built = True - - def call(self, inputs): - raise ValueError('I said do not trace') - - def get_config(self): - return {} - - @property - def _use_input_spec_as_call_signature(self): - return True - - root = keras.models.Sequential() - root.add(keras.layers.Input(shape=(3,))) - root.attached_layer = DoNotTrace() - - saved_model_dir = self._save_model_dir() - - # With the default settings, the call function is traced. - with self.assertRaisesRegex(ValueError, 'do not trace'): - root.save(saved_model_dir, save_format='tf') - - # When saving the config only, the layer call function should not be - traced. - root.save(saved_model_dir, save_format='tf', save_traces=False) - loaded = tf.saved_model.load(saved_model_dir) - self.assertTrue(hasattr(loaded, 'attached_layer')) - - # This should raise an error when loaded without the custom object - loaded = keras_load.load(saved_model_dir) - with self.assertRaisesRegex(ValueError, 'Cannot call custom layer'): - loaded.attached_layer(tf.constant([1.])) - - # Try loading with the custom objects - with generic_utils.CustomObjectScope({'DoNotTrace': DoNotTrace}): - loaded = keras_load.load(saved_model_dir) - with self.assertRaisesRegex(ValueError, 'I said do not trace'): - loaded.attached_layer(tf.constant([1.])) - - def test_load_non_keras_saved_model(self): - model = test_utils.get_small_functional_mlp(1, 4, input_dim=3) - saved_model_dir = self._save_model_dir() - tf.saved_model.save(model, saved_model_dir) - with self.assertRaisesRegex(ValueError, 'Unable to create a Keras model'): - keras_load.load(saved_model_dir) - - -class TestLayerCallTracing(tf.test.TestCase, parameterized.TestCase): - - def test_functions_have_same_trace(self): - - class Layer(keras.engine.base_layer.Layer): - - def call(self, inputs): - return inputs - - def call2(self, inputs): - return inputs * 2 - - layer = Layer() - - call_collection = keras_save.LayerCallCollection(layer) - fn = call_collection.add_function(layer.call, 'call', True) - fn2 = call_collection.add_function(layer.call2, 'call2', True) - - with keras_save.tracing_scope(): - fn(np.ones((2, 3))) - fn(np.ones((4, 5))) - - self.assertLen( - fn.wrapped_call._list_all_concrete_functions_for_serialization(), 2) - self.assertLen( - fn2.wrapped_call._list_all_concrete_functions_for_serialization(), 2) - - # Check that the shapes are correct - self.assertEqual( - {(2, 3), (4, 5)}, - set(tuple(c.structured_input_signature[0][0].shape.as_list()) for c in - fn2.wrapped_call._list_all_concrete_functions_for_serialization())) - - def test_training_arg_replacement(self): - - def assert_num_traces(layer_cls, training_keyword): - layer = layer_cls() - call_collection =
keras_save.LayerCallCollection(layer) - fn = call_collection.add_function(layer.call, 'call', True) - - with keras_save.tracing_scope(): - fn(np.ones((2, 3)), training=True) - self.assertLen( - fn.wrapped_call._list_all_concrete_functions_for_serialization(), 2) - with keras_save.tracing_scope(): - fn(np.ones((2, 4)), training=False) - self.assertLen( - fn.wrapped_call._list_all_concrete_functions_for_serialization(), 4) - - if training_keyword: - with keras_save.tracing_scope(): - fn(np.ones((2, 5)), True) - self.assertLen( - fn.wrapped_call._list_all_concrete_functions_for_serialization(), 6) - with keras_save.tracing_scope(): - fn(np.ones((2, 6))) - self.assertLen( - fn.wrapped_call._list_all_concrete_functions_for_serialization(), 8) - - class LayerWithTrainingKeyword(keras.engine.base_layer.Layer): - - def call(self, inputs, training=False): - return inputs * training - - assert_num_traces(LayerWithTrainingKeyword, training_keyword=True) - - class LayerWithKwargs(keras.engine.base_layer.Layer): - - def call(self, inputs, **kwargs): - return inputs * kwargs['training'] - - assert_num_traces(LayerWithKwargs, training_keyword=False) - - class LayerWithChildLayer(keras.engine.base_layer.Layer): - - def __init__(self): - self.child = LayerWithKwargs() - super().__init__() - - def call(self, inputs): - return self.child(inputs) - - assert_num_traces(LayerWithChildLayer, training_keyword=False) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_maintains_losses(self): - layer = LayerWithLoss() - layer(np.ones((2, 3))) - previous_losses = layer.losses[:] - - call_collection = keras_save.LayerCallCollection(layer) - fn = call_collection.add_function(layer.call, 'call', True) - fn(np.ones((2, 3))) - - self.assertAllEqual(self.evaluate(previous_losses), - self.evaluate(layer.losses)) - - -@generic_utils.register_keras_serializable('Testing') -class CustomMeanMetric(keras.metrics.Mean): - - def update_state(self, *args): # pylint: disable=useless-super-delegation - # Sometimes built-in metrics return an op in update_state. Custom - # metrics don't support returning ops, so wrap the update_state method - # while returning nothing. 
- super().update_state(*args) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) -class MetricTest(tf.test.TestCase, parameterized.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - def generate_inputs(self, num_tensor_args, shape=(1, 5)): - return [ - np.random.uniform(0, 1, shape).astype('float32') - for _ in range(num_tensor_args) - ] - - def _test_metric_save_and_load(self, - metric, - save_dir, - num_tensor_args, - shape=(1, 5), - test_sample_weight=True): - with self.cached_session(): - model = test_utils.get_model_from_layers( - [keras.layers.Layer()], input_shape=[3], model_type='functional') - model.saved_metric = metric - model.save(save_dir, save_format='tf') - loaded_model = keras_load.load(save_dir) - loaded = loaded_model.saved_metric - self.evaluate([v.initializer for v in loaded.variables]) - self.assertEqual(metric.name, loaded.name) - self.assertEqual(metric.dtype, loaded.dtype) - - inputs = self.generate_inputs(num_tensor_args, shape) - actual = self.evaluate(metric(*inputs)) - self.assertAllClose(actual, loaded(*inputs)) - self.assertAllClose(metric.variables, loaded.variables) - - # Test with separate calls to update state and result. - inputs = self.generate_inputs(num_tensor_args, shape) - self.evaluate(metric.update_state(*inputs)) - self.evaluate(loaded.update_state(*inputs)) - actual = self.evaluate(metric.result()) - self.assertAllClose(actual, loaded.result()) - - if test_sample_weight: - # Test with sample weights input. - inputs = self.generate_inputs(num_tensor_args, shape) - sample_weight = self.generate_inputs(1, [])[0] - inputs.append(sample_weight) - - actual = self.evaluate(metric(*inputs)) - self.assertAllClose(actual, loaded(*inputs)) - return loaded - - @parameterized.named_parameters([ - ('mean', keras.metrics.Mean, 1, (1, 5)), - ('false_positives', keras.metrics.FalsePositives, 2, (1, 5)), - ('precision_at_top_k', keras.metrics.Precision, 2, (2, 3, 4), { - 'top_k': 2, - 'class_id': 1 - }), - ('precision_at_recall', keras.metrics.PrecisionAtRecall, 2, (1, 5), { - 'recall': .8 - }), ('auc', keras.metrics.AUC, 2, (1, 5), { - 'multi_label': True - }), ('cosine_similarity', keras.metrics.CosineSimilarity, 2, (2, 3, 1)) - ]) - def test_metric(self, metric_cls, num_tensor_args, shape, init_kwargs=None): - init_kwargs = init_kwargs or {} - metric = metric_cls(**init_kwargs) - metric(*self.generate_inputs(num_tensor_args, shape)) - self.evaluate([v.initializer for v in metric.variables]) - loaded = self._test_metric_save_and_load(metric, self._save_model_dir(), - num_tensor_args, shape) - self.assertEqual(type(loaded), type(metric)) - - @parameterized.named_parameters([ - ('mean', keras.metrics.Mean, 1, False), - ('auc', keras.metrics.AUC, 2, False), - ('mean_tensor', keras.metrics.MeanTensor, 1, True)]) - def test_custom_metric(self, base_cls, num_tensor_args, requires_build): - - class CustomMetric(base_cls): - - def update_state(self, *args): # pylint: disable=useless-super-delegation - # Sometimes built-in metrics return an op in update_state. Custom - # metrics don't support returning ops, so wrap the update_state method - # while returning nothing. 
- super().update_state(*args) - - with self.cached_session(): - metric = CustomMetric() - save_dir = self._save_model_dir('first_save') - - if requires_build: - metric(*self.generate_inputs(num_tensor_args)) # pylint: disable=not-callable - - self.evaluate([v.initializer for v in metric.variables]) - - with self.assertRaisesRegex(ValueError, - 'Unable to restore custom object'): - self._test_metric_save_and_load(metric, save_dir, num_tensor_args) - with generic_utils.CustomObjectScope({'CustomMetric': CustomMetric}): - loaded = self._test_metric_save_and_load( - metric, - save_dir, - num_tensor_args, - test_sample_weight=False) - - self._test_metric_save_and_load( - loaded, - self._save_model_dir('second_save'), - num_tensor_args, - test_sample_weight=False) - - def test_registered_custom_metric(self): - - with self.cached_session(): - metric = CustomMeanMetric() - save_dir = self._save_model_dir('first_save') - self.evaluate([v.initializer for v in metric.variables]) - loaded = self._test_metric_save_and_load( - metric, - save_dir, - num_tensor_args=1, - test_sample_weight=False) - - self._test_metric_save_and_load( - loaded, - self._save_model_dir('second_save'), - num_tensor_args=1, - test_sample_weight=False) - - def test_custom_metric_wrapped_call(self): - - class NegativeMean(keras.metrics.Mean): - - @tf.function( - input_signature=[tf.TensorSpec(None, tf.float32)]) - def update_state(self, value): - super().update_state(-value) - - metric = NegativeMean() - self.evaluate([v.initializer for v in metric.variables]) - with generic_utils.CustomObjectScope({'NegativeMean': NegativeMean}): - self._test_metric_save_and_load( - metric, self._save_model_dir(), 1, test_sample_weight=False) - - @test_combinations.run_with_all_model_types - def test_custom_metric_model(self): - # TODO(b/134519980): Issue with `model.fit` if the model call function uses - # a `tf.function` in graph mode. 
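(The guard that follows implements the TODO above by skipping graph mode.) For eager mode, the compile-with-custom-metric round trip being tested reduces to this sketch; the class name and path are illustrative only:

    import numpy as np
    import tensorflow as tf

    class CustomMSE(tf.keras.metrics.MeanSquaredError):
        pass

    model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
    model.compile('sgd', 'mse', metrics=[CustomMSE()])
    model.fit(np.random.random((2, 3)), np.random.random((2, 4)), verbose=0)
    model.save('/tmp/metric_model', save_format='tf')

    # Loading without the custom class raises a ValueError mentioning
    # `custom_objects`; supplying the class revives the metric.
    loaded = tf.keras.models.load_model(
        '/tmp/metric_model', custom_objects={'CustomMSE': CustomMSE})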
- if not tf.executing_eagerly(): - return - - x = np.random.random((1, 3)) - y = np.random.random((1, 4)) - - class CustomMetric(keras.metrics.MeanSquaredError): - pass - - def zero_metric(y_true, y_pred): - del y_true, y_pred - return 0 - - model = test_utils.get_small_mlp(1, 4, input_dim=3) - model.compile(loss='mse', optimizer='SGD', - metrics=[CustomMetric(), zero_metric]) - model.fit(x, y) - saved_model_dir = self._save_model_dir() - model.save(saved_model_dir, save_format='tf') - - with self.assertRaisesRegex(ValueError, 'custom_objects'): - keras_load.load(saved_model_dir) - - with generic_utils.CustomObjectScope( - {'CustomMetric': CustomMetric, 'zero_metric': zero_metric}): - loaded = keras_load.load(saved_model_dir) - - self.evaluate([v.initializer for v in loaded.variables]) - loaded.fit(x, y) - - -class TestUpdateMetadata(tf.test.TestCase): - - def testAddFullSaveSpec(self): - save_spec = tf.TensorSpec([3, 5], dtype=tf.int32) - node_metadata = json_utils.Encoder().encode({'save_spec': save_spec}) - - metadata = saved_metadata_pb2.SavedMetadata() - metadata.nodes.add( - version=versions_pb2.VersionDef( - producer=1, min_consumer=1, bad_consumers=[]), - identifier='_tf_keras_model', - metadata=node_metadata) # pylint: disable=protected-access - - new_metadata = keras_load._update_to_current_version(metadata) - node_metadata = json_utils.decode(new_metadata.nodes[0].metadata) - expected_full_spec = ([tf.TensorSpec(shape=(3, 5), dtype=tf.int32)], {}) - self.assertAllEqual(expected_full_spec, node_metadata.get('full_save_spec')) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/saved_model/serialized_attributes.py b/keras/saving/saved_model/serialized_attributes.py deleted file mode 100644 index 1431a33b4283..000000000000 --- a/keras/saving/saved_model/serialized_attributes.py +++ /dev/null @@ -1,319 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Helper classes that list&validate all attributes to serialize to SavedModel. -""" - -from keras.saving.saved_model import constants -from keras.saving.saved_model import order_preserving_set as ops -from keras.saving.saved_model import save_impl -from keras.utils.generic_utils import LazyLoader -import tensorflow.compat.v2 as tf - -# TODO(b/134426265): Switch back to single-quotes to match the rest of the file -# once the issue with copybara is fixed. -# pylint:disable=g-inconsistent-quotes -base_layer = LazyLoader( - "base_layer", globals(), - "keras.engine.base_layer") -training_lib = LazyLoader( - "training_lib", globals(), - "keras.engine.training") -metrics = LazyLoader("metrics", globals(), - "keras.metrics") -base_rnn = LazyLoader( - "base_rnn", globals(), - "keras.layers.rnn.base_rnn") -# pylint:enable=g-inconsistent-quotes - - -class SerializedAttributes: - """Class that tracks and validates all serialization attributes. 
- - Keras models contain many Python-defined components. For example, the - trainable_variables property lists the model's trainable variables by - recursively retrieving the trainable variables from each of the child layers. - Another example is model.call, a Python function that calls child layers and - adds ops to the backend graph. - - Only TensorFlow checkpointable objects and functions can be serialized to - SavedModel. Serializing a Keras model as-is results in a checkpointable object - that does not resemble a Keras model at all. Thus, extra checkpointable - objects and functions must be created during serialization. - - **Defining new serialized attributes** - Child classes should be defined using: - SerializedAttributes.with_attributes( - 'name', checkpointable_objects=[...], functions=[...], copy_from=[...]) - This class is used to cache generated checkpointable objects and functions, - ensuring that new objects and functions are generated a single time. - - **Usage during serialization** - Each Layer/Model object should have a corresponding instance of - SerializedAttributes. Create a new instance by calling - `SerializedAttributes.new(obj)`. Objects and functions may be saved using - `.set_and_validate_objects`/`.set_and_validate_functions`. - The properties `.checkpointable_objects` and `.functions` return the cached - values. - - **Adding/changing attributes to save to SavedModel** - 1. Change the call to `SerializedAttributes.with_attributes` in the correct - class: - - CommonEndpoints: Base attributes to be added during serialization. If - these attributes are present in a Trackable object, it can be - deserialized to a Keras Model. - - LayerAttributes: Attributes to serialize for Layer objects. - - ModelAttributes: Attributes to serialize for Model objects. - 2. Update the class docstring. - 3. Update arguments to any calls to `set_and_validate_*`. For example, if - `call_raw_tensors` is added to the ModelAttributes function list, then - a `call_raw_tensors` function should be passed to - `set_and_validate_functions`. - - **Common endpoints vs other attributes** - Only common endpoints are attached directly to the root object. Keras-specific - attributes are saved to a separate trackable object with the name "keras_api". - The number of objects attached to the root is limited because any naming - conflicts will cause user code to break. - - Another reason is that this will only affect users who call - `tf.saved_model.load` instead of `tf.keras.models.load_model`. These are - advanced users who are likely to have defined their own tf.functions and - trackable objects. The added Keras-specific attributes are kept out of the way - in the "keras_api" namespace. - - Properties defined in this class may be used to filter out keras-specific - attributes: - - `functions_to_serialize`: Returns dict of functions to attach to the root - object. - - `objects_to_serialize`: Returns dict of objects to attach to - the root object (including the separate trackable object containing - keras-specific attributes). - - All changes to the serialized attributes must be backwards-compatible, so - attributes should not be removed or modified without sufficient justification. - """ - - @staticmethod - def with_attributes( - name, checkpointable_objects=None, functions=None, copy_from=None): - """Creates a subclass with all attributes as specified in the arguments.
- - Args: - name: Name of subclass - checkpointable_objects: List of checkpointable objects to be serialized - in the SavedModel. - functions: List of functions to be serialized in the SavedModel. - copy_from: List of other SerializedAttributes subclasses. The returned - class will copy checkpoint objects/functions from each subclass. - - Returns: - Child class with attributes as defined in the `checkpointable_objects` - and `functions` lists. - """ - checkpointable_objects = checkpointable_objects or [] - functions = functions or [] - - if copy_from is not None: - for cls in copy_from: - checkpointable_objects.extend(cls.all_checkpointable_objects) - functions.extend(cls.all_functions) - - # OrderPreservingSets are used here to guarantee serialization determinism - # of Keras objects. - classdict = { - 'all_checkpointable_objects': - ops.OrderPreservingSet(checkpointable_objects), - 'all_functions': - ops.OrderPreservingSet(functions), - } - return type(name, (SerializedAttributes,), classdict) - - @staticmethod - def new(obj): - """Returns a new SerializedAttribute object.""" - if isinstance(obj, training_lib.Model): - return ModelAttributes() - elif isinstance(obj, metrics.Metric): - return MetricAttributes() - elif isinstance(obj, base_rnn.RNN): - return RNNAttributes() - elif isinstance(obj, base_layer.Layer): - return LayerAttributes() - else: - raise TypeError('Internal error during serialization. Expected Keras ' - f'Layer object. Received: {obj} ' - f'(of type {type(obj)})') - - def __init__(self): - self._object_dict = {} - self._function_dict = {} - self._keras_trackable = tf.__internal__.tracking.AutoTrackable() - - @property - def functions(self): - """Returns dictionary of all functions.""" - return {key: value for key, value in self._function_dict.items() - if value is not None} - - @property - def checkpointable_objects(self): - """Returns dictionary of all checkpointable objects.""" - return {key: value for key, value in self._object_dict.items() - if value is not None} - - @property - def functions_to_serialize(self): - """Returns functions to attach to the root object during serialization.""" - functions = {} - for key, v in self.functions.items(): - if key in CommonEndpoints.all_functions: - functions[key] = (v.wrapped_call if isinstance(v, save_impl.LayerCall) - else v) - return functions - - @property - def objects_to_serialize(self): - """Returns objects to attach to the root object during serialization.""" - objects = {key: value for key, value in self.checkpointable_objects.items() - if key in CommonEndpoints.all_checkpointable_objects} - objects[constants.KERAS_ATTR] = self._keras_trackable - return objects - - def set_and_validate_functions(self, function_dict): - """Saves function dictionary, and validates dictionary values.""" - for key in self.all_functions: - if key in function_dict: - if (function_dict[key] is not None and # Not all functions are required - not isinstance(function_dict[key], - (tf.__internal__.function.Function, - tf.types.experimental.ConcreteFunction, - save_impl.LayerCall))): - raise ValueError( - 'The tf.function dictionary contained a non-function object: ' - f'{function_dict[key]} (for key {key}). Only tf.function ' - 'instances or ConcreteFunction instances should be passed.') - fn = function_dict[key] - self._function_dict[key] = fn - - # Extract TensorFlow `Function` from LayerCall. 
- tf_fn = fn.wrapped_call if isinstance(fn, save_impl.LayerCall) else fn - setattr(self._keras_trackable, key, tf_fn) - else: - raise ValueError( - f'Function {key} missing from serialized tf.function dictionary.') - return self.functions - - def set_and_validate_objects(self, object_dict): - """Saves objects to a dictionary, and validates the values.""" - for key in self.all_checkpointable_objects: - if key in object_dict: - if not isinstance(object_dict[key], tf.__internal__.tracking.Trackable): - raise ValueError( - 'The object dictionary contained a non-trackable object: ' - f'{object_dict[key]} (for key {key}). Only trackable objects are ' - f'allowed, such as Keras layers/models or tf.Module instances.') - self._object_dict[key] = object_dict[key] - setattr(self._keras_trackable, key, object_dict[key]) - else: - raise ValueError( - f'Object {key} missing from serialized object dictionary.') - return self.checkpointable_objects - - -class CommonEndpoints(SerializedAttributes.with_attributes( - 'CommonEndpoints', - checkpointable_objects=['variables', 'trainable_variables', - 'regularization_losses'], - functions=['__call__', 'call_and_return_all_conditional_losses', - '_default_save_signature'])): - """Common endpoints shared by all models loadable by Keras. - - List of all attributes: - variables: List of all variables in the model and its sublayers. - trainable_variables: List of all trainable variables in the model and its - sublayers. - regularization_losses: List of all unconditional losses (losses not - dependent on the inputs) in the model and its sublayers. - __call__: Function that takes inputs and returns the outputs of the model - call function. - call_and_return_all_conditional_losses: Function that returns a tuple of - (call function outputs, list of all losses that depend on the inputs). - _default_save_signature: Traced model call function. This is only included - if the top level exported object is a Keras model. - """ - - -class LayerAttributes(SerializedAttributes.with_attributes( - 'LayerAttributes', - checkpointable_objects=['non_trainable_variables', 'layers', 'metrics', - 'layer_regularization_losses', 'layer_metrics'], - functions=['call_and_return_conditional_losses', 'activity_regularizer_fn'], - copy_from=[CommonEndpoints] - )): - """Layer checkpointable objects + functions that are saved to the SavedModel. - - List of all attributes: - All attributes from CommonEndpoints - non_trainable_variables: List of non-trainable variables in the layer and - its sublayers. - layers: List of all sublayers. - metrics: List of all metrics in the layer and its sublayers. - call_and_return_conditional_losses: Function that takes inputs and returns a - tuple of (outputs of the call function, list of input-dependent losses). - The list of losses excludes the activity regularizer function, which is - separate to allow the deserialized Layer object to define a different - activity regularizer. - activity_regularizer_fn: Callable that returns the activity regularizer loss - layer_regularization_losses: List of losses owned only by this layer. - layer_metrics: List of metrics owned by this layer. - """ - - -class ModelAttributes(SerializedAttributes.with_attributes( - 'ModelAttributes', - copy_from=[LayerAttributes])): - """Model checkpointable objects + functions that are saved to the SavedModel. 
- - List of all attributes: - All attributes from LayerAttributes (including CommonEndpoints) - """ - # TODO(kathywu): Add attributes `compile_losses` and `compile_metrics`, which - # list all losses and metrics defined by `model.compile`. - - -class MetricAttributes( - SerializedAttributes.with_attributes( - 'MetricAttributes', - checkpointable_objects=['variables'], - functions=[], - )): - """Attributes that are added to Metric objects when saved to SavedModel. - - List of all attributes: - variables: list of all variables - """ - pass - - -class RNNAttributes(SerializedAttributes.with_attributes( - 'RNNAttributes', - checkpointable_objects=['states'], - copy_from=[LayerAttributes])): - """RNN checkpointable objects + functions that are saved to the SavedModel. - - List of all attributes: - All attributes from LayerAttributes (including CommonEndpoints) - states: List of state variables - """ diff --git a/keras/saving/saved_model/utils.py b/keras/saving/saved_model/utils.py deleted file mode 100644 index 1ea0ac916284..000000000000 --- a/keras/saving/saved_model/utils.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utility functions shared between SavedModel saving/loading implementations.""" - -import copy -import inspect as _inspect -import itertools -import threading -import types - -from keras import backend -from keras.engine import base_layer_utils -from keras.utils import control_flow_util -from keras.utils import layer_utils -from keras.utils import tf_contextlib -from keras.utils.generic_utils import LazyLoader - -import tensorflow.compat.v2 as tf - - -# pylint:disable=g-inconsistent-quotes -training_lib = LazyLoader( - "training_lib", globals(), - "keras.engine.training") -# pylint:enable=g-inconsistent-quotes - - -def use_wrapped_call(layer, call_fn, call_spec, - default_training_value=None, - return_method=False): - """Creates fn that adds the losses returned by call_fn & returns the outputs. - - Args: - layer: A Keras layer object - call_fn: tf.function that takes layer inputs (and possibly a training arg), - and returns a tuple of (outputs, list of losses). - call_spec: The `CallFunctionSpec` for the layer's call function. - default_training_value: Default value of the training kwarg. If `None`, the - default is `tf.keras.backend.learning_phase()`. - return_method: Whether to return a method bound to the layer. - - Returns: - function that calls call_fn and returns the outputs. Losses returned by - call_fn are added to the layer losses. 
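In plain terms, the wrapper splits the `(outputs, losses)` pair returned by the traced function: losses are re-registered on the live layer, and only the outputs are handed back to the caller. A minimal sketch of that shape; `FakeLayer` is a hypothetical stand-in for a layer's loss-tracking surface:

```python
class FakeLayer:
    """Stand-in for a Keras layer's loss-tracking surface (hypothetical)."""

    def __init__(self):
        self.losses = []

    def add_loss(self, loss):
        self.losses.append(loss)


def wrap_call(layer, fn):
    # `fn` returns (outputs, losses); register the losses, return outputs.
    def call_and_collect(*args, **kwargs):
        outputs, losses = fn(*args, **kwargs)
        for loss in losses:
            layer.add_loss(loss)
        return outputs

    return call_and_collect


layer = FakeLayer()
wrapped = wrap_call(layer, lambda x: (x * 2, [0.1]))
assert wrapped(3) == 6
assert layer.losses == [0.1]
```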
- """ - expects_training_arg = layer_uses_training_bool(layer) - - fn, arg_spec = maybe_add_training_arg( - call_spec, - call_fn, expects_training_arg, default_training_value) - - def return_outputs_and_add_losses(*args, **kwargs): - """Returns the outputs from the layer call function, and adds the losses.""" - if return_method: - args = args[1:] - - outputs, losses = fn(*args, **kwargs) - layer.add_loss(losses) - - # TODO(kathywu): This is a temporary hack. When a network of layers is - # revived from SavedModel, only the top-level layer will have losses. This - # causes issues in eager mode because the child layers may have graph losses - # (thus model.losses returns a mix of Eager and graph tensors). To fix this, - # whenever eager losses are added to one layer, add eager losses to all - # child layers. This causes `.losses` to only return eager losses. - # pylint: disable=protected-access - if tf.executing_eagerly(): - for i in layer._flatten_layers(): - if i is not layer: - i._eager_losses = [base_layer_utils.REVIVED_LOSS_PLACEHOLDER] - # pylint: enable=protected-access - return outputs - - decorated = tf.__internal__.decorator.make_decorator( - target=call_fn, - decorator_func=return_outputs_and_add_losses, - decorator_argspec=arg_spec) - - if return_method: - return types.MethodType(decorated, layer) - else: - return decorated - - -def layer_uses_training_bool(layer): - """Returns whether this layer or any of its children uses the training arg.""" - if layer._expects_training_arg: # pylint: disable=protected-access - return True - visited = {layer} - to_visit = list_all_layers(layer) - while to_visit: - layer = to_visit.pop() - if layer in visited: - continue - if getattr(layer, '_expects_training_arg', True): - return True - visited.add(layer) - to_visit.extend(list_all_layers(layer)) - return False - - -def list_all_layers(obj): - if isinstance(obj, training_lib.Model): - # Handle special case of Sequential, which doesn't return - # the `Input` layer. - return obj.layers - else: - return list(obj._flatten_layers(include_self=False, recursive=False)) # pylint: disable=protected-access - - -def list_all_layers_and_sublayers(obj): - s = set([obj]) - s.update(itertools.chain.from_iterable( - list_all_layers_and_sublayers(layer) for layer in list_all_layers(obj))) - return s - - -def maybe_add_training_arg( - call_spec, wrapped_call, expects_training_arg, - default_training_value): - """Decorate call and optionally adds training argument. - - If a layer expects a training argument, this function ensures that 'training' - is present in the layer args or kwonly args, with the default training value. - - Args: - call_spec: CallFunctionSpec of the layer. - wrapped_call: Wrapped call function. - expects_training_arg: Whether to include 'training' argument. - default_training_value: Default value of the training kwarg to include in - the arg spec. If `None`, the default is - `tf.keras.backend.learning_phase()`. 
- - Returns: - Tuple of ( - function that calls `wrapped_call` and sets the training arg, - Argspec of returned function or `None` if the argspec is unchanged) - """ - if not expects_training_arg: - return wrapped_call, None - - arg_spec = set_training_arg_spec(call_spec.full_argspec, - default_training_value) - call_spec = layer_utils.CallFunctionSpec(arg_spec) - - def wrap_with_training_arg(*args, **kwargs): - """Wrap the `wrapped_call` function, and set training argument.""" - try: - training = call_spec.get_arg_value('training', args, kwargs, - inputs_in_args=True) - except KeyError: - training = None - - if training is None: - training = (default_training_value or - base_layer_utils.call_context().training or - backend.learning_phase()) - - args = list(args) - kwargs = kwargs.copy() - - def replace_training_and_call(training): - new_args, new_kwargs = call_spec.set_arg_value('training', training, args, kwargs, inputs_in_args=True) - return wrapped_call(*new_args, **new_kwargs) - - return control_flow_util.smart_cond( - training, lambda: replace_training_and_call(True), - lambda: replace_training_and_call(False)) - - return wrap_with_training_arg, arg_spec - - -def set_training_arg_spec(arg_spec, default_training_value): - """Set `training=DEFAULT` argument in an ArgSpec.""" - if 'training' in arg_spec.args: - # If `training` is already in the args list, try to set the default value. - index = arg_spec.args.index('training') - training_default_index = len(arg_spec.args) - index - defaults = list(arg_spec.defaults) if arg_spec.defaults is not None else [] - if (arg_spec.defaults and - len(arg_spec.defaults) >= training_default_index and - defaults[-training_default_index] is None): - defaults[-training_default_index] = default_training_value - return arg_spec._replace(defaults=defaults) - elif 'training' not in arg_spec.kwonlyargs: - kwonlyargs = arg_spec.kwonlyargs + ['training'] - kwonlydefaults = copy.copy(arg_spec.kwonlydefaults) or {} - kwonlydefaults['training'] = default_training_value - return arg_spec._replace(kwonlyargs=kwonlyargs, - kwonlydefaults=kwonlydefaults) - - return arg_spec - - -class SaveOptionsContext(threading.local): - - def __init__(self): - super().__init__() - self.save_traces = True - - -_save_options_context = SaveOptionsContext() - - -@tf_contextlib.contextmanager -def keras_option_scope(save_traces): - previous_value = _save_options_context.save_traces - try: - _save_options_context.save_traces = save_traces - yield - finally: - _save_options_context.save_traces = previous_value - - -def should_save_traces(): - """Whether to trace layer functions-can be disabled in the save_traces arg.""" - return _save_options_context.save_traces - - -@tf_contextlib.contextmanager -def no_automatic_dependency_tracking_scope(obj): - """A context that disables automatic dependency tracking when assigning attrs. - - Objects that inherit from Autotrackable automatically creates dependencies - to trackable objects through attribute assignments, and wraps data structures - (lists or dicts) with trackable classes. This scope may be used to temporarily - disable this behavior. This works similar to the decorator - `no_automatic_dependency_tracking`. - - Example usage: - ``` - model = tf.keras.Model() - model.arr1 = [] # Creates a ListWrapper object - with no_automatic_dependency_tracking_scope(model): - model.arr2 = [] # Creates a regular, untracked python list - ``` - - Args: - obj: A trackable object. - - Yields: - a scope in which the object doesn't track dependencies. 
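`keras_option_scope` above follows a common pattern: a module-level `threading.local` holds the flag, and a context manager sets and restores it. A self-contained sketch with illustrative names:

```python
import threading
from contextlib import contextmanager


class _SaveOptions(threading.local):
    def __init__(self):
        super().__init__()
        self.save_traces = True


_options = _SaveOptions()


@contextmanager
def option_scope(save_traces):
    # Flip the per-thread flag, and restore it even if the body raises.
    previous = _options.save_traces
    _options.save_traces = save_traces
    try:
        yield
    finally:
        _options.save_traces = previous


with option_scope(False):
    assert _options.save_traces is False
assert _options.save_traces is True
```

Keeping the flag in `threading.local` storage means concurrent saves on different threads cannot clobber each other's setting.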
- """ - previous_value = getattr(obj, '_setattr_tracking', True) - obj._setattr_tracking = False # pylint: disable=protected-access - try: - yield - finally: - obj._setattr_tracking = previous_value # pylint: disable=protected-access diff --git a/keras/saving/saved_model_experimental.py b/keras/saving/saved_model_experimental.py deleted file mode 100644 index df3d86813baa..000000000000 --- a/keras/saving/saved_model_experimental.py +++ /dev/null @@ -1,465 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Deprecated experimental Keras SavedModel implementation.""" - -import warnings - -from keras import backend -from keras.optimizers import optimizer_v1 -from keras.optimizers.optimizer_v2 import optimizer_v2 -from keras.saving import model_config -from keras.saving import saving_utils -from keras.saving import utils_v1 as model_utils -from keras.utils import mode_keys -from keras.utils.generic_utils import LazyLoader - -import tensorflow.compat.v2 as tf - -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.util.tf_export import keras_export - -# To avoid circular dependencies between keras/engine and keras/saving, -# code in keras/saving must delay imports. - -# TODO(b/134426265): Switch back to single-quotes to match the rest of the file -# once the issue with copybara is fixed. -# pylint:disable=g-inconsistent-quotes -metrics_lib = LazyLoader("metrics_lib", globals(), - "keras.metrics") -models_lib = LazyLoader("models_lib", globals(), - "keras.models") -sequential = LazyLoader( - "sequential", globals(), - "keras.engine.sequential") -# pylint:enable=g-inconsistent-quotes - - -# File name for json format of SavedModel. -SAVED_MODEL_FILENAME_JSON = 'saved_model.json' - - -@keras_export(v1=['keras.experimental.export_saved_model']) -def export_saved_model(model, - saved_model_path, - custom_objects=None, - as_text=False, - input_signature=None, - serving_only=False): - """Exports a `tf.keras.Model` as a Tensorflow SavedModel. - - Note that at this time, subclassed models can only be saved using - `serving_only=True`. - - The exported `SavedModel` is a standalone serialization of Tensorflow objects, - and is supported by TF language APIs and the Tensorflow Serving system. - To load the model, use the function - `tf.keras.experimental.load_from_saved_model`. - - The `SavedModel` contains: - - 1. a checkpoint containing the model weights. - 2. a `SavedModel` proto containing the Tensorflow backend graph. Separate - graphs are saved for prediction (serving), train, and evaluation. If - the model has not been compiled, then only the graph computing predictions - will be exported. - 3. the model's json config. If the model is subclassed, this will only be - included if the model's `get_config()` method is overwritten. - - Example: - - ```python - import tensorflow as tf - - # Create a tf.keras model. 
- model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(1, input_shape=[10])) - model.summary() - - # Save the tf.keras model in the SavedModel format. - path = '/tmp/simple_keras_model' - tf.keras.experimental.export_saved_model(model, path) - - # Load the saved keras model back. - new_model = tf.keras.experimental.load_from_saved_model(path) - new_model.summary() - ``` - - Args: - model: A `tf.keras.Model` to be saved. If the model is subclassed, the flag - `serving_only` must be set to True. - saved_model_path: a string specifying the path to the SavedModel directory. - custom_objects: Optional dictionary mapping string names to custom classes - or functions (e.g. custom loss functions). - as_text: bool, `False` by default. Whether to write the `SavedModel` proto - in text format. Currently unavailable in serving-only mode. - input_signature: A possibly nested sequence of `tf.TensorSpec` objects, used - to specify the expected model inputs. See `tf.function` for more details. - serving_only: bool, `False` by default. When this is true, only the - prediction graph is saved. - - Raises: - NotImplementedError: If the model is a subclassed model, and serving_only is - False. - ValueError: If the input signature cannot be inferred from the model. - AssertionError: If the SavedModel directory already exists and isn't empty. - """ - warnings.warn( - '`tf.keras.experimental.export_saved_model` is deprecated' - 'and will be removed in a future version. ' - 'Please use `model.save(..., save_format="tf")` or ' - '`tf.keras.models.save_model(..., save_format="tf")`.', - stacklevel=2) - if serving_only: - tf.saved_model.save( - model, - saved_model_path, - signatures=saving_utils.trace_model_call(model, input_signature)) - else: - _save_v1_format(model, saved_model_path, custom_objects, as_text, - input_signature) - - try: - _export_model_json(model, saved_model_path) - except NotImplementedError: - logging.warning('Skipped saving model JSON, subclassed model does not have ' - 'get_config() defined.') - - -def _export_model_json(model, saved_model_path): - """Saves model configuration as a json string under assets folder.""" - model_json = model.to_json() - model_json_filepath = tf.io.gfile.join( - _get_or_create_assets_dir(saved_model_path), - tf.compat.as_text(SAVED_MODEL_FILENAME_JSON)) - with tf.io.gfile.GFile(model_json_filepath, 'w') as f: - f.write(model_json) - - -def _export_model_variables(model, saved_model_path): - """Saves model weights in checkpoint format under variables folder.""" - _get_or_create_variables_dir(saved_model_path) - checkpoint_prefix = _get_variables_path(saved_model_path) - model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True) - return checkpoint_prefix - - -def _save_v1_format(model, path, custom_objects, as_text, input_signature): - """Exports model to v1 SavedModel format.""" - if not model._is_graph_network: # pylint: disable=protected-access - if isinstance(model, sequential.Sequential): - # If input shape is not directly set in the model, the exported model - # will infer the expected shapes of the input from the model. - if not model.built: - raise ValueError('Weights for sequential model have not yet been ' - 'created. Weights are created when the Model is first ' - 'called on inputs or `build()` is called with an ' - '`input_shape`, or the first layer in the model has ' - '`input_shape` during construction.') - # TODO(kathywu): Build the model with input_signature to create the - # weights before _export_model_variables(). 
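`_export_model_json` above reduces to writing the architecture string into the SavedModel's assets folder. A rough standard-library equivalent, assuming a local filesystem rather than `tf.io.gfile`; names and paths are illustrative:

```python
import os


def export_model_json(model_json, export_dir):
    # Write the architecture JSON under <export_dir>/assets, next to the
    # graph and variables the builder emits.
    assets_dir = os.path.join(export_dir, "assets")
    os.makedirs(assets_dir, exist_ok=True)
    path = os.path.join(assets_dir, "saved_model.json")
    with open(path, "w") as f:
        f.write(model_json)
    return path
```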
- else: - raise NotImplementedError( - 'Subclassed models can only be exported for serving. Please set ' - 'argument serving_only=True.') - - builder = tf.__internal__.saved_model.SavedModelBuilder(path) # pylint: disable=protected-access - - # Manually save variables to export them in an object-based checkpoint. This - # skips the `builder.add_meta_graph_and_variables()` step, which saves a - # named-based checkpoint. - # TODO(b/113134168): Add fn to Builder to save with object-based saver. - # TODO(b/113178242): This should only export the model json structure. Only - # one save is needed once the weights can be copied from the model to clone. - checkpoint_path = _export_model_variables(model, path) - - # Export each mode. Use ModeKeys enums defined for `Estimator` to ensure that - # Keras models and `Estimator`s are exported with the same format. - # Every time a mode is exported, the code checks to see if new variables have - # been created (e.g. optimizer slot variables). If that is the case, the - # checkpoint is re-saved to include the new variables. - export_args = {'builder': builder, - 'model': model, - 'custom_objects': custom_objects, - 'checkpoint_path': checkpoint_path, - 'input_signature': input_signature} - - has_saved_vars = False - if model.optimizer: - if isinstance(model.optimizer, (optimizer_v1.TFOptimizer, - optimizer_v2.OptimizerV2)): - _export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args) - has_saved_vars = True - _export_mode(mode_keys.ModeKeys.TEST, has_saved_vars, **export_args) - else: - logging.warning( - 'Model was compiled with an optimizer, but the optimizer is not from ' - '`tf.train` (e.g. `tf.train.AdagradOptimizer`). Only the serving ' - 'graph was exported. The train and evaluate graphs were not added to ' - 'the SavedModel.') - _export_mode(mode_keys.ModeKeys.PREDICT, has_saved_vars, **export_args) - - builder.save(as_text) - - -def _get_var_list(model): - """Returns list of all checkpointed saveable objects in the model.""" - var_list, _, _ = tf.__internal__.tracking.ObjectGraphView(model).serialize_object_graph() - return var_list - - -def create_placeholder(spec): - return backend.placeholder(shape=spec.shape, dtype=spec.dtype, name=spec.name) - - -def _export_mode( - mode, has_saved_vars, builder, model, custom_objects, checkpoint_path, - input_signature): - """Exports a model, and optionally saves new vars from the clone model. - - Args: - mode: A `tf.estimator.ModeKeys` string. - has_saved_vars: A `boolean` indicating whether the SavedModel has already - exported variables. - builder: A `SavedModelBuilder` object. - model: A `tf.keras.Model` object. - custom_objects: A dictionary mapping string names to custom classes - or functions. - checkpoint_path: String path to checkpoint. - input_signature: Nested TensorSpec containing the expected inputs. Can be - `None`, in which case the signature will be inferred from the model. - - Raises: - ValueError: If the train/eval mode is being exported, but the model does - not have an optimizer. - """ - compile_clone = (mode != mode_keys.ModeKeys.PREDICT) - if compile_clone and not model.optimizer: - raise ValueError( - f'Model {model.name} does not have an optimizer. 
' - f'Cannot export mode {mode}.') - - model_graph = tf.compat.v1.get_default_graph() - with tf.Graph().as_default() as g, backend.learning_phase_scope( - mode == mode_keys.ModeKeys.TRAIN): - - if input_signature is None: - input_tensors = None - else: - input_tensors = tf.nest.map_structure(create_placeholder, input_signature) - - # Clone the model into blank graph. This will create placeholders for inputs - # and targets. - clone = models_lib.clone_and_build_model( - model, input_tensors=input_tensors, custom_objects=custom_objects, - compile_clone=compile_clone) - - # Make sure that iterations variable is added to the global step collection, - # to ensure that, when the SavedModel graph is loaded, the iterations - # variable is returned by `tf.compat.v1.train.get_global_step()`. This is - # required for compatibility with the SavedModelEstimator. - if compile_clone: - g.add_to_collection(tf.compat.v1.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations) - - # Extract update and train ops from train/test/predict functions. - train_op = None - if mode == mode_keys.ModeKeys.TRAIN: - clone._make_train_function() # pylint: disable=protected-access - train_op = clone.train_function.updates_op - elif mode == mode_keys.ModeKeys.TEST: - clone._make_test_function() # pylint: disable=protected-access - else: - clone._make_predict_function() # pylint: disable=protected-access - g.get_collection_ref(tf.compat.v1.GraphKeys.UPDATE_OPS).extend(clone.state_updates) - - with tf.compat.v1.Session().as_default(): - clone_var_list = _get_var_list(clone) - if has_saved_vars: - # Confirm all variables in the clone have an entry in the checkpoint. - status = clone.load_weights(checkpoint_path) - status.assert_existing_objects_matched() - else: - # Confirm that variables between the clone and model match up exactly, - # not counting optimizer objects. Optimizer objects are ignored because - # if the model has not trained, the slot variables will not have been - # created yet. - # TODO(b/113179535): Replace with trackable equivalence. - _assert_same_non_optimizer_objects(model, model_graph, clone, g) - - # TODO(b/113178242): Use value transfer for trackable objects. - clone.load_weights(checkpoint_path) - - # Add graph and variables to SavedModel. - # TODO(b/113134168): Switch to add_meta_graph_and_variables. - clone.save_weights(checkpoint_path, save_format='tf', overwrite=True) - builder._has_saved_variables = True # pylint: disable=protected-access - - # Add graph to the SavedModel builder. - builder.add_meta_graph( - model_utils.EXPORT_TAG_MAP[mode], - signature_def_map=_create_signature_def_map(clone, mode), - saver=tf.compat.v1.train.Saver( - clone_var_list, - # Allow saving Models with no variables. This is somewhat odd, but - # it's not necessarily a bug. - allow_empty=True), - init_op=tf.compat.v1.local_variables_initializer(), - train_op=train_op) - return None - - -def _create_signature_def_map(model, mode): - """Creates a SignatureDef map from a Keras model.""" - inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)} - if model.optimizer: - targets_dict = {x.name.split(':')[0]: x - for x in model._targets if x is not None} # pylint: disable=protected-access - inputs_dict.update(targets_dict) - outputs_dict = {name: x - for name, x in zip(model.output_names, model.outputs)} - metrics = saving_utils.extract_model_metrics(model) - - # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables - # are by default not added to any collections. 
We are doing this here, so - # that metric variables get initialized. - local_vars = set(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.LOCAL_VARIABLES)) - vars_to_add = set() - if metrics is not None: - for key, value in metrics.items(): - if isinstance(value, metrics_lib.Metric): - vars_to_add.update(value.variables) - # Convert Metric instances to (value_tensor, update_op) tuple. - metrics[key] = (value.result(), value.updates[0]) - # Remove variables that are in the local variables collection already. - vars_to_add = vars_to_add.difference(local_vars) - for v in vars_to_add: - tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.LOCAL_VARIABLES, v) - - export_outputs = model_utils.export_outputs_for_mode( - mode, - predictions=outputs_dict, - loss=model.total_loss if model.optimizer else None, - metrics=metrics) - return model_utils.build_all_signature_defs( - inputs_dict, - export_outputs=export_outputs, - serving_only=(mode == mode_keys.ModeKeys.PREDICT)) - - -def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph): # pylint: disable=unused-argument - """Asserts model and clone contain the same trackable objects.""" - - # TODO(fchollet, kathywu): make sure this works in eager mode. - return True - - -@keras_export(v1=['keras.experimental.load_from_saved_model']) -def load_from_saved_model(saved_model_path, custom_objects=None): - """Loads a keras Model from a SavedModel created by `export_saved_model()`. - - This function reinstantiates model state by: - 1) loading model topology from json (this will eventually come - from metagraph). - 2) loading model weights from checkpoint. - - Example: - - ```python - import tensorflow as tf - - # Create a tf.keras model. - model = tf.keras.Sequential() - model.add(tf.keras.layers.Dense(1, input_shape=[10])) - model.summary() - - # Save the tf.keras model in the SavedModel format. - path = '/tmp/simple_keras_model' - tf.keras.experimental.export_saved_model(model, path) - - # Load the saved keras model back. - new_model = tf.keras.experimental.load_from_saved_model(path) - new_model.summary() - ``` - - Args: - saved_model_path: a string specifying the path to an existing SavedModel. - custom_objects: Optional dictionary mapping names - (strings) to custom classes or functions to be - considered during deserialization. - - Returns: - a keras.Model instance. - """ - warnings.warn( - '`tf.keras.experimental.load_from_saved_model` is deprecated' - 'and will be removed in a future version. 
' - 'Please switch to `tf.keras.models.load_model`.', - stacklevel=2) - # restore model topology from json string - model_json_filepath = tf.io.gfile.join( - tf.compat.as_bytes(saved_model_path), - tf.compat.as_bytes(tf.saved_model.ASSETS_DIRECTORY), - tf.compat.as_bytes(SAVED_MODEL_FILENAME_JSON)) - with tf.io.gfile.GFile(model_json_filepath, 'r') as f: - model_json = f.read() - model = model_config.model_from_json( - model_json, custom_objects=custom_objects) - - # restore model weights - checkpoint_prefix = tf.io.gfile.join( - tf.compat.as_text(saved_model_path), - tf.compat.as_text(tf.saved_model.VARIABLES_DIRECTORY), - tf.compat.as_text(tf.saved_model.VARIABLES_FILENAME)) - model.load_weights(checkpoint_prefix) - return model - - -#### Directory / path helpers - - -def _get_or_create_variables_dir(export_dir): - """Return variables sub-directory, or create one if it doesn't exist.""" - variables_dir = _get_variables_dir(export_dir) - tf.io.gfile.makedirs(variables_dir) - return variables_dir - - -def _get_variables_dir(export_dir): - """Return variables sub-directory in the SavedModel.""" - return tf.io.gfile.join( - tf.compat.as_text(export_dir), - tf.compat.as_text(tf.saved_model.VARIABLES_DIRECTORY)) - - -def _get_variables_path(export_dir): - """Return the variables path, used as the prefix for checkpoint files.""" - return tf.io.gfile.join( - tf.compat.as_text(_get_variables_dir(export_dir)), - tf.compat.as_text(tf.saved_model.VARIABLES_FILENAME)) - - -def _get_or_create_assets_dir(export_dir): - """Return assets sub-directory, or create one if it doesn't exist.""" - assets_destination_dir = _get_assets_dir(export_dir) - - tf.io.gfile.makedirs(assets_destination_dir) - - return assets_destination_dir - - -def _get_assets_dir(export_dir): - """Return path to asset directory in the SavedModel.""" - return tf.io.gfile.join( - tf.compat.as_text(export_dir), - tf.compat.as_text(tf.saved_model.ASSETS_DIRECTORY)) diff --git a/keras/saving/saved_model_experimental_test.py b/keras/saving/saved_model_experimental_test.py deleted file mode 100644 index 4b42076ee085..000000000000 --- a/keras/saving/saved_model_experimental_test.py +++ /dev/null @@ -1,540 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
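The deleted test module that begins here leaned on a single pattern throughout: predictions taken before a save/load cycle must match predictions taken afterwards within tolerance. Its skeleton, with the exporter and loader passed in as callables rather than the deleted helpers, is roughly:

```python
import numpy as np


def assert_round_trip(model, export_fn, load_fn, path, x, atol=1e-5):
    # Predictions must survive a save/load cycle within tolerance.
    ref = model.predict(x)
    export_fn(model, path)
    restored = load_fn(path)
    np.testing.assert_allclose(ref, restored.predict(x), atol=atol)
```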
-# ============================================================================== -# pylint: disable=protected-access -"""Tests for saving/loading function for keras Model.""" - -import tensorflow.compat.v2 as tf - -import os -import shutil - -from absl.testing import parameterized -import numpy as np - -import keras -from keras.optimizers import optimizer_v1 -from keras.engine import training as model_lib -from keras.optimizers.optimizer_v2 import adadelta -from keras.optimizers.optimizer_v2 import rmsprop -from keras.saving import saved_model_experimental as keras_saved_model -from keras.saving import utils_v1 as model_utils -from keras.utils import control_flow_util -from keras.utils import mode_keys - - -class TestModelSavingandLoading(parameterized.TestCase, tf.test.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - def test_saving_sequential_model(self): - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - model.compile( - loss=keras.losses.MSE, - optimizer=rmsprop.RMSprop(lr=0.0001), - metrics=[keras.metrics.categorical_accuracy], - sample_weight_mode='temporal') - x = np.random.random((1, 3)) - y = np.random.random((1, 3, 3)) - model.train_on_batch(x, y) - - ref_y = model.predict(x) - - saved_model_dir = self._save_model_dir() - keras_saved_model.export_saved_model(model, saved_model_dir) - - loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir) - y = loaded_model.predict(x) - self.assertAllClose(ref_y, y, atol=1e-05) - - def test_saving_sequential_model_without_compile(self): - with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - - x = np.random.random((1, 3)) - ref_y = model.predict(x) - - saved_model_dir = self._save_model_dir() - keras_saved_model.export_saved_model(model, saved_model_dir) - loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir) - - y = loaded_model.predict(x) - self.assertAllClose(ref_y, y, atol=1e-05) - - def test_saving_functional_model(self): - with self.cached_session(): - inputs = keras.layers.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - output = keras.layers.Dense(3)(x) - - model = keras.models.Model(inputs, output) - model.compile( - loss=keras.losses.MSE, - optimizer=rmsprop.RMSprop(lr=0.0001), - metrics=[keras.metrics.categorical_accuracy]) - x = np.random.random((1, 3)) - y = np.random.random((1, 3)) - model.train_on_batch(x, y) - - ref_y = model.predict(x) - - saved_model_dir = self._save_model_dir() - keras_saved_model.export_saved_model(model, saved_model_dir) - loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir) - - y = loaded_model.predict(x) - self.assertAllClose(ref_y, y, atol=1e-05) - - def test_saving_functional_model_without_compile(self): - with self.cached_session(): - inputs = keras.layers.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - output = keras.layers.Dense(3)(x) - - model = keras.models.Model(inputs, output) - - x = np.random.random((1, 3)) - y = np.random.random((1, 3)) - - ref_y = model.predict(x) - - saved_model_dir = self._save_model_dir() - 
keras_saved_model.export_saved_model(model, saved_model_dir) - loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir) - - y = loaded_model.predict(x) - self.assertAllClose(ref_y, y, atol=1e-05) - - def test_saving_with_tf_optimizer(self): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - model.compile( - loss='mse', - optimizer=tf.compat.v1.train.RMSPropOptimizer(0.1), - metrics=['acc']) - - x = np.random.random((1, 3)) - y = np.random.random((1, 3)) - model.train_on_batch(x, y) - ref_y = model.predict(x) - - saved_model_dir = self._save_model_dir() - keras_saved_model.export_saved_model(model, saved_model_dir) - loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir) - loaded_model.compile( - loss='mse', - optimizer=tf.compat.v1.train.RMSPropOptimizer(0.1), - metrics=['acc']) - y = loaded_model.predict(x) - self.assertAllClose(ref_y, y, atol=1e-05) - - # test that new updates are the same with both models - x = np.random.random((1, 3)) - y = np.random.random((1, 3)) - - ref_loss = model.train_on_batch(x, y) - loss = loaded_model.train_on_batch(x, y) - self.assertAllClose(ref_loss, loss, atol=1e-05) - - ref_y = model.predict(x) - y = loaded_model.predict(x) - self.assertAllClose(ref_y, y, atol=1e-05) - - # test saving/loading again - saved_model_dir2 = self._save_model_dir('saved_model_2') - keras_saved_model.export_saved_model(loaded_model, saved_model_dir2) - loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir2) - y = loaded_model.predict(x) - self.assertAllClose(ref_y, y, atol=1e-05) - - def test_saving_subclassed_model_raise_error(self): - # For now, saving subclassed model should raise an error. It should be - # avoided later with loading from SavedModel.pb. 
- - class SubclassedModel(model_lib.Model): - - def __init__(self): - super().__init__() - self.layer1 = keras.layers.Dense(3) - self.layer2 = keras.layers.Dense(1) - - def call(self, inp): - return self.layer2(self.layer1(inp)) - - model = SubclassedModel() - - saved_model_dir = self._save_model_dir() - with self.assertRaises(NotImplementedError): - keras_saved_model.export_saved_model(model, saved_model_dir) - - -class LayerWithLearningPhase(keras.engine.base_layer.Layer): - - def build(self, input_shape): - self.input_spec = keras.layers.InputSpec(shape=[None] * len(input_shape)) - self.built = True - - def call(self, x, training=None): - if training is None: - training = keras.backend.learning_phase() - output = control_flow_util.smart_cond(training, lambda: x * 0, - lambda: tf.identity(x)) - if not tf.executing_eagerly(): - output._uses_learning_phase = True # pylint: disable=protected-access - return output - - def compute_output_shape(self, input_shape): - return input_shape - - -def functional_model(uses_learning_phase=True): - inputs = keras.layers.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - x = keras.layers.Dense(3)(x) - if uses_learning_phase: - x = LayerWithLearningPhase()(x) - return keras.models.Model(inputs, x) - - -def sequential_model(uses_learning_phase=True): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - if uses_learning_phase: - model.add(LayerWithLearningPhase()) - return model - - -def sequential_model_without_input_shape(uses_learning_phase=True): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2)) - model.add(keras.layers.Dense(3)) - if uses_learning_phase: - model.add(LayerWithLearningPhase()) - return model - - -class Subclassed(keras.models.Model): - - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(2) - self.dense2 = keras.layers.Dense(3) - - def call(self, inputs): - x = self.dense1(inputs) - x = self.dense2(x) - return x - - -def subclassed_model(): - return Subclassed() - - -def load_model(sess, path, mode): - tags = model_utils.EXPORT_TAG_MAP[mode] - sig_def_key = model_utils.SIGNATURE_KEY_MAP[mode] - - meta_graph_def = tf.compat.v1.saved_model.load(sess, tags, path) - inputs = { - k: sess.graph.get_tensor_by_name(v.name) - for k, v in meta_graph_def.signature_def[sig_def_key].inputs.items()} - outputs = { - k: sess.graph.get_tensor_by_name(v.name) - for k, v in meta_graph_def.signature_def[sig_def_key].outputs.items()} - return inputs, outputs, meta_graph_def - - -def get_train_op(meta_graph_def): - graph = tf.compat.v1.get_default_graph() - signature_def = meta_graph_def.signature_def['__saved_model_train_op'] - op_name = signature_def.outputs['__saved_model_train_op'].name - return graph.as_graph_element(op_name) - - -class TestModelSavedModelExport(tf.test.TestCase, parameterized.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - @parameterized.parameters( - { - 'model_builder': functional_model, - 'uses_learning_phase': True, - 'optimizer_cls': adadelta.Adadelta, - 'train_before_export': True}, - { - 'model_builder': functional_model, - 'uses_learning_phase': True, - 'optimizer_cls': tf.compat.v1.train.AdadeltaOptimizer, - 'train_before_export': False}, - { - 'model_builder': functional_model, - 'uses_learning_phase': False, - 'optimizer_cls': None, - 
'train_before_export': False}, - { - 'model_builder': sequential_model, - 'uses_learning_phase': True, - 'optimizer_cls': tf.compat.v1.train.AdadeltaOptimizer, - 'train_before_export': True}, - { - 'model_builder': sequential_model, - 'uses_learning_phase': True, - 'optimizer_cls': adadelta.Adadelta, - 'train_before_export': False}, - { - 'model_builder': sequential_model, - 'uses_learning_phase': False, - 'optimizer_cls': None, - 'train_before_export': False}, - { - 'model_builder': sequential_model_without_input_shape, - 'uses_learning_phase': True, - 'optimizer_cls': tf.compat.v1.train.AdadeltaOptimizer, - 'train_before_export': False}) - def testSaveAndLoadSavedModelExport( - self, model_builder, uses_learning_phase, optimizer_cls, - train_before_export): - optimizer = None if optimizer_cls is None else optimizer_cls() - - saved_model_dir = self._save_model_dir() - - np.random.seed(130) - input_arr = np.random.random((1, 3)) - target_arr = np.random.random((1, 3)) - - model = model_builder(uses_learning_phase) - if optimizer is not None: - model.compile( - loss='mse', - optimizer=optimizer, - metrics=['mae']) - if train_before_export: - model.train_on_batch(input_arr, target_arr) - - ref_loss, ref_mae = model.evaluate(input_arr, target_arr) - - ref_predict = model.predict(input_arr) - - # Export SavedModel - keras_saved_model.export_saved_model(model, saved_model_dir) - - input_name = model.input_names[0] - output_name = model.output_names[0] - target_name = output_name + '_target' - - # Load predict graph, and test predictions - with tf.compat.v1.Session(graph=tf.Graph()) as sess: - inputs, outputs, _ = load_model(sess, saved_model_dir, - mode_keys.ModeKeys.PREDICT) - - predictions = sess.run(outputs[output_name], - {inputs[input_name]: input_arr}) - self.assertAllClose(ref_predict, predictions, atol=1e-05) - - if optimizer: - # Load eval graph, and test predictions, loss and metric values - with tf.compat.v1.Session(graph=tf.Graph()) as sess: - inputs, outputs, _ = load_model(sess, saved_model_dir, - mode_keys.ModeKeys.TEST) - - # First obtain the loss and predictions, and run the metric update op by - # feeding in the inputs and targets. - metrics_name = 'mae' if tf.__internal__.tf2.enabled() else 'mean_absolute_error' - metrics_update_op_key = 'metrics/' + metrics_name + '/update_op' - metrics_value_op_key = 'metrics/' + metrics_name + '/value' - - loss, predictions, _ = sess.run( - (outputs['loss'], outputs['predictions/' + output_name], - outputs[metrics_update_op_key]), { - inputs[input_name]: input_arr, - inputs[target_name]: target_arr - }) - - # The metric value should be run after the update op, to ensure that it - # reflects the correct value. 
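The ordering that comment insists on reflects TF1 metric semantics: the update op mutates accumulator state, while the value op only reads it, so reading first returns a stale number. A toy model of that contract in plain Python, with no TF involved:

```python
class ToyMetric:
    """Toy (value, update) pair mimicking TF1 metric semantics."""

    def __init__(self):
        self.total = 0.0
        self.count = 0

    def update(self, x):  # analogous to running the update_op
        self.total += x
        self.count += 1

    def value(self):  # analogous to reading the value tensor
        return self.total / max(self.count, 1)


m = ToyMetric()
m.update(4.0)
assert m.value() == 4.0  # read only after the update has run
```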
- metric_value = sess.run(outputs[metrics_value_op_key]) - - self.assertEqual(int(train_before_export), - sess.run(tf.compat.v1.train.get_global_step())) - self.assertAllClose(ref_loss, loss, atol=1e-05) - self.assertAllClose(ref_mae, metric_value, atol=1e-05) - self.assertAllClose(ref_predict, predictions, atol=1e-05) - - # Load train graph, and check for the train op, and prediction values - with tf.compat.v1.Session(graph=tf.Graph()) as sess: - inputs, outputs, meta_graph_def = load_model( - sess, saved_model_dir, mode_keys.ModeKeys.TRAIN) - self.assertEqual(int(train_before_export), - sess.run(tf.compat.v1.train.get_global_step())) - self.assertIn('loss', outputs) - self.assertIn(metrics_update_op_key, outputs) - self.assertIn(metrics_value_op_key, outputs) - self.assertIn('predictions/' + output_name, outputs) - - # Train for a step - train_op = get_train_op(meta_graph_def) - train_outputs, _ = sess.run( - [outputs, train_op], {inputs[input_name]: input_arr, - inputs[target_name]: target_arr}) - self.assertEqual(int(train_before_export) + 1, - sess.run(tf.compat.v1.train.get_global_step())) - - if uses_learning_phase: - self.assertAllClose( - [[0, 0, 0]], train_outputs['predictions/' + output_name], - atol=1e-05) - else: - self.assertNotAllClose( - [[0, 0, 0]], train_outputs['predictions/' + output_name], - atol=1e-05) - - def testSaveAndLoadSavedModelWithCustomObject(self): - saved_model_dir = self._save_model_dir() - with tf.compat.v1.Session(graph=tf.Graph()) as sess: - def relu6(x): - return keras.backend.relu(x, max_value=6) - inputs = keras.layers.Input(shape=(1,)) - outputs = keras.layers.Activation(relu6)(inputs) - model = keras.models.Model(inputs, outputs) - keras_saved_model.export_saved_model( - model, saved_model_dir, custom_objects={'relu6': relu6}) - with tf.compat.v1.Session(graph=tf.Graph()) as sess: - inputs, outputs, _ = load_model(sess, saved_model_dir, - mode_keys.ModeKeys.PREDICT) - input_name = model.input_names[0] - output_name = model.output_names[0] - predictions = sess.run( - outputs[output_name], {inputs[input_name]: [[7], [-3], [4]]}) - self.assertAllEqual([[6], [0], [4]], predictions) - - def testAssertModelCloneSameObjectsIgnoreOptimizer(self): - input_arr = np.random.random((1, 3)) - target_arr = np.random.random((1, 3)) - - model_graph = tf.Graph() - clone_graph = tf.Graph() - - # Create two models with the same layers but different optimizers. - with tf.compat.v1.Session(graph=model_graph): - inputs = keras.layers.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - x = keras.layers.Dense(3)(x) - model = keras.models.Model(inputs, x) - - model.compile(loss='mse', optimizer=tf.compat.v1.train.AdadeltaOptimizer()) - model.train_on_batch(input_arr, target_arr) - - with tf.compat.v1.Session(graph=clone_graph): - inputs = keras.layers.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - x = keras.layers.Dense(3)(x) - clone = keras.models.Model(inputs, x) - clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001)) - clone.train_on_batch(input_arr, target_arr) - - keras_saved_model._assert_same_non_optimizer_objects( - model, model_graph, clone, clone_graph) - - def testAssertModelCloneSameObjectsThrowError(self): - input_arr = np.random.random((1, 3)) - target_arr = np.random.random((1, 3)) - - model_graph = tf.Graph() - clone_graph = tf.Graph() - - # Create two models with the same layers but different optimizers. 
- with tf.compat.v1.Session(graph=model_graph): - inputs = keras.layers.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - x = keras.layers.Dense(3)(x) - model = keras.models.Model(inputs, x) - - model.compile(loss='mse', optimizer=tf.compat.v1.train.AdadeltaOptimizer()) - model.train_on_batch(input_arr, target_arr) - - with tf.compat.v1.Session(graph=clone_graph): - inputs = keras.layers.Input(shape=(3,)) - x = keras.layers.Dense(2)(inputs) - x = keras.layers.Dense(4)(x) - x = keras.layers.Dense(3)(x) - clone = keras.models.Model(inputs, x) - clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001)) - clone.train_on_batch(input_arr, target_arr) - - def testSaveSequentialModelWithoutInputShapes(self): - model = sequential_model_without_input_shape(True) - # A Sequential model that hasn't been built should raise an error. - with self.assertRaisesRegex( - ValueError, 'Weights for sequential model have not yet been created'): - keras_saved_model.export_saved_model(model, '') - - # Even with input_signature, the model's weights has not been created. - with self.assertRaisesRegex( - ValueError, 'Weights for sequential model have not yet been created'): - saved_model_dir = self._save_model_dir() - keras_saved_model.export_saved_model( - model, - saved_model_dir, - input_signature=tf.TensorSpec( - shape=(10, 11, 12, 13, 14), dtype=tf.float32, - name='spec_input')) - - @parameterized.parameters( - { - 'model_builder': sequential_model_without_input_shape, - 'input_signature': [tf.TensorSpec(shape=[None, 3], - dtype=tf.float32)]}, - { - 'model_builder': subclassed_model, - 'input_signature': [tf.TensorSpec(shape=[None, 3], - dtype=tf.float32)]}) - def testServingOnly(self, model_builder, input_signature): - if tf.executing_eagerly(): - saved_model_dir = self._save_model_dir() - input_arr = np.random.random((5, 3)).astype(np.float32) - model = model_builder() - ref_predict = model.predict(input_arr) - - keras_saved_model.export_saved_model( - model, - saved_model_dir, - serving_only=True, - input_signature=input_signature) - - # Load predict graph, and test predictions - with tf.compat.v1.Session(graph=tf.Graph()) as sess: - inputs, outputs, _ = load_model(sess, saved_model_dir, - mode_keys.ModeKeys.PREDICT) - predictions = sess.run(outputs[next(iter(outputs.keys()))], - {inputs[next(iter(inputs.keys()))]: input_arr}) - self.assertAllClose(ref_predict, predictions, atol=1e-05) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/saving_api.py b/keras/saving/saving_api.py new file mode 100644 index 000000000000..32c01da30558 --- /dev/null +++ b/keras/saving/saving_api.py @@ -0,0 +1,349 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Public API surface for saving APIs.""" + +import os +import warnings +import zipfile + +import tensorflow.compat.v2 as tf +from tensorflow.python.util.tf_export import keras_export + +from keras.saving import saving_lib +from keras.saving.legacy import save as legacy_sm_saving_lib +from keras.utils import io_utils + +try: + import h5py +except ImportError: + h5py = None + +is_oss = True + + +def _support_gcs_uri(filepath, save_format, is_oss): + """Supports GCS URIs through bigstore via a temporary file.""" + gs_filepath = None + if str(filepath).startswith("gs://") and save_format != "tf": + gs_filepath = filepath + if not is_oss: + gs_filepath = filepath.replace("gs://", "/bigstore/") + filepath = os.path.join( + saving_lib.get_temp_dir(), os.path.basename(gs_filepath) + ) + return gs_filepath, filepath + + +@keras_export("keras.saving.save_model", "keras.models.save_model") +def save_model(model, filepath, overwrite=True, save_format=None, **kwargs): + """Saves a model as a TensorFlow SavedModel or HDF5 file. + + See the [Serialization and Saving guide]( + https://keras.io/guides/serialization_and_saving/) for details. + + Args: + model: Keras model instance to be saved. + filepath: `str` or `pathlib.Path` object. Path where to save the model. + overwrite: Whether we should overwrite any existing model at the target + location, or instead ask the user via an interactive prompt. + save_format: Either `"keras"`, `"tf"`, `"h5"`, + indicating whether to save the model + in the native Keras format (`.keras`), + in the TensorFlow SavedModel format (referred to as "SavedModel" + below), or in the legacy HDF5 format (`.h5`). + Defaults to `"tf"` in TF 2.X, and `"h5"` in TF 1.X. + + SavedModel format arguments: + include_optimizer: Only applied to SavedModel and legacy HDF5 formats. + If False, do not save the optimizer state. Defaults to True. + signatures: Only applies to SavedModel format. Signatures to save + with the SavedModel. See the `signatures` argument in + `tf.saved_model.save` for details. + options: Only applies to SavedModel format. + `tf.saved_model.SaveOptions` object that specifies SavedModel + saving options. + save_traces: Only applies to SavedModel format. When enabled, the + SavedModel will store the function traces for each layer. This + can be disabled, so that only the configs of each layer are stored. + Defaults to `True`. Disabling this will decrease serialization time + and reduce file size, but it requires that all custom layers/models + implement a `get_config()` method. + + Example: + + ```python + model = tf.keras.Sequential([ + tf.keras.layers.Dense(5, input_shape=(3,)), + tf.keras.layers.Softmax()]) + model.save("model.keras") + loaded_model = tf.keras.saving.load_model("model.keras") + x = tf.random.uniform((10, 3)) + assert np.allclose(model.predict(x), loaded_model.predict(x)) + ``` + + Note that `model.save()` is an alias for `tf.keras.saving.save_model()`. + + The SavedModel or HDF5 file contains: + + - The model's configuration (architecture) + - The model's weights + - The model's optimizer's state (if any) + + Thus models can be reinstantiated in the exact same state, without any of + the code used for model definition or training. + + Note that the model weights may have different scoped names after being + loaded. Scoped names include the model/layer names, such as + `"dense_1/kernel:0"`. 
It is recommended that you use the layer properties to
+    access specific variables, e.g. `model.get_layer("dense_1").kernel`.
+
+    __SavedModel serialization format__
+
+    With `save_format="tf"`, the model and all trackable objects attached
+    to it (e.g. layers and variables) are saved as a TensorFlow SavedModel.
+    The model config, weights, and optimizer are included in the SavedModel.
+    Additionally, for every Keras layer attached to the model, the SavedModel
+    stores:
+
+    * The config and metadata -- e.g. name, dtype, trainable status
+    * Traced call and loss functions, which are stored as TensorFlow
+      subgraphs.
+
+    The traced functions allow the SavedModel format to save and load custom
+    layers without the original class definition.
+
+    You can choose to not save the traced functions by disabling the
+    `save_traces` option. This will decrease the time it takes to save the model
+    and the amount of disk space occupied by the output SavedModel. If you
+    enable this option, then you _must_ provide all custom class definitions
+    when loading the model. See the `custom_objects` argument in
+    `tf.keras.saving.load_model`.
+    """
+    save_format = get_save_format(filepath, save_format)
+
+    # Supports GCS URIs through bigstore via a temporary file
+    gs_filepath, filepath = _support_gcs_uri(filepath, save_format, is_oss)
+
+    # Deprecation warnings
+    if save_format == "h5":
+        warnings.warn(
+            "You are saving your model as an HDF5 file via `model.save()`. "
+            "This file format is considered legacy. "
+            "We recommend using instead the native Keras format, "
+            "e.g. `model.save('my_model.keras')`.",
+            stacklevel=2,
+        )
+
+    if save_format == "keras":
+        # If file exists and should not be overwritten.
+        try:
+            exists = os.path.exists(filepath)
+        except TypeError:
+            exists = False
+        if exists and not overwrite:
+            proceed = io_utils.ask_to_proceed_with_overwrite(filepath)
+            if not proceed:
+                return
+        if kwargs:
+            raise ValueError(
+                "The following argument(s) are not supported "
+                f"with the native Keras format: {list(kwargs.keys())}"
+            )
+        saving_lib.save_model(model, filepath)
+    else:
+        # Legacy case
+        return legacy_sm_saving_lib.save_model(
+            model,
+            filepath,
+            overwrite=overwrite,
+            save_format=save_format,
+            **kwargs,
+        )
+
+
+@keras_export("keras.saving.load_model", "keras.models.load_model")
+def load_model(
+    filepath, custom_objects=None, compile=True, safe_mode=True, **kwargs
+):
+    """Loads a model saved via `model.save()`.
+
+    Args:
+        filepath: `str` or `pathlib.Path` object, path to the saved model file.
+        custom_objects: Optional dictionary mapping names
+            (strings) to custom classes or functions to be
+            considered during deserialization.
+        compile: Boolean, whether to compile the model after loading.
+        safe_mode: Boolean, whether to disallow unsafe `lambda` deserialization.
+            When `safe_mode=False`, loading an object has the potential to
+            trigger arbitrary code execution. This argument is only
+            applicable to the Keras v3 model format. Defaults to True.
+
+    SavedModel format arguments:
+        options: Only applies to SavedModel format.
+            Optional `tf.saved_model.LoadOptions` object that specifies
+            SavedModel loading options.
+
+    Returns:
+        A Keras model instance. If the original model was compiled,
+        and the argument `compile=True` is set, then the returned model
+        will be compiled. Otherwise, the model will be left uncompiled.
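The dispatch between the native v3 loader and the legacy path, visible in the function body that follows, reduces to one predicate: a native Keras model file is a zip archive with a `.keras` suffix. In standalone form (a sketch, not the module's own helper):

```python
import zipfile


def is_keras_v3_archive(filepath):
    # A native Keras v3 model file is simply a zip archive named *.keras.
    filepath = str(filepath)
    return filepath.endswith(".keras") and zipfile.is_zipfile(filepath)
```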
+ + Example: + + ```python + model = tf.keras.Sequential([ + tf.keras.layers.Dense(5, input_shape=(3,)), + tf.keras.layers.Softmax()]) + model.save("model.keras") + loaded_model = tf.keras.saving.load_model("model.keras") + x = tf.random.uniform((10, 3)) + assert np.allclose(model.predict(x), loaded_model.predict(x)) + ``` + + Note that the model variables may have different name values + (`var.name` property, e.g. `"dense_1/kernel:0"`) after being reloaded. + It is recommended that you use layer attributes to + access specific variables, e.g. `model.get_layer("dense_1").kernel`. + """ + # Supports GCS URIs by copying data to temporary file + save_format = get_save_format(filepath, save_format=None) + gs_filepath, filepath = _support_gcs_uri(filepath, save_format, is_oss) + if gs_filepath is not None: + tf.io.gfile.copy(gs_filepath, filepath, overwrite=True) + + is_keras_zip = str(filepath).endswith(".keras") and zipfile.is_zipfile( + filepath + ) + + # Support for remote zip files + if ( + saving_lib.is_remote_path(filepath) + and not tf.io.gfile.isdir(filepath) + and not is_keras_zip + ): + local_path = os.path.join( + saving_lib.get_temp_dir(), os.path.basename(filepath) + ) + + # Copy from remote to temporary local directory + tf.io.gfile.copy(filepath, local_path, overwrite=True) + + # Switch filepath to local zipfile for loading model + if zipfile.is_zipfile(local_path): + filepath = local_path + is_keras_zip = True + + if is_keras_zip: + if kwargs: + raise ValueError( + "The following argument(s) are not supported " + f"with the native Keras format: {list(kwargs.keys())}" + ) + return saving_lib.load_model( + filepath, + custom_objects=custom_objects, + compile=compile, + safe_mode=safe_mode, + ) + + # Legacy case. + return legacy_sm_saving_lib.load_model( + filepath, custom_objects=custom_objects, compile=compile, **kwargs + ) + + +def save_weights(model, filepath, overwrite=True, **kwargs): + # Supports GCS URIs through bigstore via a temporary file + save_format = get_save_format(filepath, save_format=None) + gs_filepath, filepath = _support_gcs_uri(filepath, save_format, is_oss) + + if str(filepath).endswith(".weights.h5"): + # If file exists and should not be overwritten. 
+        try:
+            exists = os.path.exists(filepath)
+        except TypeError:
+            exists = False
+        if exists and not overwrite:
+            proceed = io_utils.ask_to_proceed_with_overwrite(filepath)
+            if not proceed:
+                return
+        saving_lib.save_weights_only(model, filepath)
+    else:
+        legacy_sm_saving_lib.save_weights(
+            model, filepath, overwrite=overwrite, **kwargs
+        )
+
+
+def load_weights(model, filepath, skip_mismatch=False, **kwargs):
+    # Supports GCS URIs by copying data to temporary file
+    save_format = get_save_format(filepath, save_format=None)
+    gs_filepath, filepath = _support_gcs_uri(filepath, save_format, is_oss)
+    if gs_filepath is not None:
+        tf.io.gfile.copy(gs_filepath, filepath, overwrite=True)
+
+    if str(filepath).endswith(".keras") and zipfile.is_zipfile(filepath):
+        saving_lib.load_weights_only(
+            model, filepath, skip_mismatch=skip_mismatch
+        )
+    elif str(filepath).endswith(".weights.h5"):
+        saving_lib.load_weights_only(
+            model, filepath, skip_mismatch=skip_mismatch
+        )
+    else:
+        return legacy_sm_saving_lib.load_weights(
+            model, filepath, skip_mismatch=skip_mismatch, **kwargs
+        )
+
+
+def get_save_format(filepath, save_format):
+    if save_format:
+        if save_format == "keras_v3":
+            return "keras"
+        if save_format == "keras":
+            if saving_lib.saving_v3_enabled():
+                return "keras"
+            else:
+                return "h5"
+        if save_format in ("h5", "hdf5"):
+            return "h5"
+        if save_format in ("tf", "tensorflow"):
+            return "tf"
+
+        raise ValueError(
+            "Unknown `save_format` argument. Expected one of "
+            "'keras', 'tf', or 'h5'. "
+            f"Received: save_format={save_format}"
+        )
+
+    # No save format specified: infer from filepath.
+
+    if str(filepath).endswith(".keras"):
+        if saving_lib.saving_v3_enabled():
+            return "keras"
+        else:
+            return "h5"
+
+    if str(filepath).endswith((".h5", ".hdf5")):
+        return "h5"
+
+    if h5py is not None and isinstance(filepath, h5py.File):
+        return "h5"
+
+    # No recognizable file format: default to TF in TF2 and h5 in TF1.
+
+    if tf.__internal__.tf2.enabled():
+        return "tf"
+    else:
+        return "h5"
diff --git a/keras/saving/saving_lib.py b/keras/saving/saving_lib.py
new file mode 100644
index 000000000000..a50dd1998ee1
--- /dev/null
+++ b/keras/saving/saving_lib.py
@@ -0,0 +1,743 @@
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
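The resolution order in `get_save_format` above is: an explicit `save_format` argument wins, then the file extension, then a TF1/TF2-dependent default. A pure-Python restatement for illustration; the boolean flags stand in for `saving_lib.saving_v3_enabled()` and `tf.__internal__.tf2.enabled()`:

```python
def infer_format(filepath, save_format=None, v3_enabled=True, tf2=True):
    # Explicit argument wins over extension, extension over the default.
    if save_format:
        mapping = {
            "keras_v3": "keras",
            "keras": "keras" if v3_enabled else "h5",
            "h5": "h5",
            "hdf5": "h5",
            "tf": "tf",
            "tensorflow": "tf",
        }
        return mapping[save_format]
    fp = str(filepath)
    if fp.endswith(".keras"):
        return "keras" if v3_enabled else "h5"
    if fp.endswith((".h5", ".hdf5")):
        return "h5"
    return "tf" if tf2 else "h5"


assert infer_format("model.keras") == "keras"
assert infer_format("model.h5") == "h5"
assert infer_format("some_dir") == "tf"
assert infer_format("model.keras", save_format="keras", v3_enabled=False) == "h5"
```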
+# ==============================================================================
+"""Python-based idempotent model-saving functionality."""
+
+import datetime
+import io
+import json
+import os
+import re
+import tempfile
+import threading
+import warnings
+import zipfile
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+
+import keras
+from keras import losses
+from keras.engine import base_layer
+from keras.optimizers import optimizer
+from keras.saving.serialization_lib import ObjectSharingScope
+from keras.saving.serialization_lib import deserialize_keras_object
+from keras.saving.serialization_lib import serialize_keras_object
+from keras.utils import generic_utils
+from keras.utils import io_utils
+
+try:
+    import h5py
+except ImportError:
+    h5py = None
+
+keras_saving_gauge = tf.__internal__.monitoring.BoolGauge(
+    "/tensorflow/api/keras/saving", "keras saving usage", "method"
+)
+
+# isort: off
+
+_CONFIG_FILENAME = "config.json"
+_METADATA_FILENAME = "metadata.json"
+_VARS_FNAME = "model.weights"  # Will become e.g. "model.weights.h5"
+_ASSETS_DIRNAME = "assets"
+
+# A temporary flag to enable the new idempotent saving framework.
+_SAVING_V3_ENABLED = threading.local()
+_SAVING_V3_ENABLED.value = True
+
+ATTR_SKIPLIST = frozenset(
+    {
+        "_callable_losses",
+        "_captured_weight_regularizer",
+        "_checkpoint_dependencies",
+        "_deferred_dependencies",
+        "_eager_losses",
+        "_inbound_nodes",
+        "_inbound_nodes_value",
+        "_output_layers",
+        "_input_layers",
+        "_keras_api_names",
+        "_keras_api_names_v1",
+        "_name_based_restores",
+        "_non_trainable_weights",
+        "_outbound_nodes",
+        "_outbound_nodes_value",
+        "_saved_model_arg_spec",
+        "_self_name_based_restores",
+        "_self_saveable_object_factories",
+        "_self_tracked_trackables",
+        "_saved_model_inputs_spec",
+        "_self_unconditional_checkpoint_dependencies",
+        "_self_unconditional_deferred_dependencies",
+        "_self_unconditional_dependency_names",
+        "_tf_api_names",
+        "_tf_api_names_v1",
+        "_trainable_weights",
+        "_unconditional_checkpoint_dependencies",
+        "_unconditional_dependency_names",
+        "_updates",
+        "_layer_call_argspecs",
+        "inbound_nodes",
+        "outbound_nodes",
+        "input_shape",
+        "output_shape",
+        "submodules",
+        "weights",
+        "non_trainable_weights",
+        "trainable_weights",
+        "variables",
+        "non_trainable_variables",
+        "trainable_variables",
+        "updates",  # Would raise a warning if visited.
+        "state_updates",  # Would raise a warning if visited.
+    }
+)
+
+
+def save_model(model, filepath, weights_format="h5"):
+    """Save a zip-archive representing a Keras model to the given filepath.
+
+    The zip-based archive contains the following structure:
+
+    - JSON-based configuration file (config.json): Records of model, layer,
+    and other trackables' configuration.
+    - H5-based or NPZ-based trackable state file (e.g. model.weights.h5),
+    depending on `weights_format`.
+    - Metadata file (metadata.json).
+
+    The states of Keras trackables (layers, optimizers, losses, and metrics)
+    are automatically saved as long as they can be discovered through the
+    attributes returned by `dir(Model)`. Typically, the state includes the
+    variables associated with the trackable, but some special-purpose layers
+    hold more, such as vocabularies stored in hash maps. Trackables define
+    how their state is saved by exposing the `save_own_variables()` /
+    `load_own_variables()` and `save_assets()` / `load_assets()` APIs.
+ + For the case of layer states, the variables will be visited as long as + they are either 1) referenced via layer attributes, or 2) referenced via a + container (list, tuple, or dict), and the container is referenced via a + layer attribute. + """ + + # API usage tracking for Keras V3 saving + keras_saving_gauge.get_cell("save_model_v3").set(True) + + filepath = str(filepath) + if not filepath.endswith(".keras"): + raise ValueError( + "Invalid `filepath` argument: expected a `.keras` extension. " + f"Received: filepath={filepath}" + ) + if weights_format == "h5" and h5py is None: + raise ImportError("h5py must be installed in order to save a model.") + + if not model.built: + warnings.warn( + "You are saving a model that has not yet been built. " + "It might not contain any weights yet. " + "Consider building the model first by calling it " + "on some data.", + stacklevel=2, + ) + saving_v3_enabled_value = getattr(_SAVING_V3_ENABLED, "value", False) + _SAVING_V3_ENABLED.value = True + + with ObjectSharingScope(): + serialized_model_dict = serialize_keras_object(model) + config_json = json.dumps(serialized_model_dict) + metadata_json = json.dumps( + { + "keras_version": keras.__version__, + "date_saved": datetime.datetime.now().strftime("%Y-%m-%d@%H:%M:%S"), + } + ) + # TODO(rameshsampath): Need a better logic for local vs remote path + if is_remote_path(filepath): + # Remote path. Zip to local drive and copy to remote + zip_filepath = os.path.join(get_temp_dir(), "tmp_model.keras") + else: + zip_filepath = filepath + try: + with zipfile.ZipFile(zip_filepath, "w") as zf: + with zf.open(_METADATA_FILENAME, "w") as f: + f.write(metadata_json.encode()) + with zf.open(_CONFIG_FILENAME, "w") as f: + f.write(config_json.encode()) + + if weights_format == "h5": + weights_store = H5IOStore( + _VARS_FNAME + ".h5", archive=zf, mode="w" + ) + elif weights_format == "npz": + weights_store = NpzIOStore( + _VARS_FNAME + ".npz", archive=zf, mode="w" + ) + else: + raise ValueError( + "Unknown `weights_format` argument. " + "Expected 'h5' or 'npz'. " + f"Received: weights_format={weights_format}" + ) + + asset_store = DiskIOStore(_ASSETS_DIRNAME, archive=zf, mode="w") + + _save_state( + model, + weights_store=weights_store, + assets_store=asset_store, + inner_path="", + visited_trackables=set(), + ) + weights_store.close() + asset_store.close() + + if is_remote_path(filepath): + # Using tf.io.gfile context manager doesn't close zip file when + # writing to GCS. Hence writing to local and copying to filepath. + tf.io.gfile.copy(zip_filepath, filepath, overwrite=True) + os.remove(zip_filepath) + except Exception as e: + raise e + finally: + _SAVING_V3_ENABLED.value = saving_v3_enabled_value + + +def load_model(filepath, custom_objects=None, compile=True, safe_mode=True): + """Load a zip archive representing a Keras model.""" + + filepath = str(filepath) + if not filepath.endswith(".keras"): + raise ValueError( + "Invalid filename: expected a `.keras` extension. " + f"Received: filepath={filepath}" + ) + + saving_v3_enabled_value = getattr(_SAVING_V3_ENABLED, "value", False) + _SAVING_V3_ENABLED.value = True + + try: + with tf.io.gfile.GFile( + filepath, mode="r+b" + ) as gfile_handle, zipfile.ZipFile(gfile_handle, "r") as zf: + with zf.open(_CONFIG_FILENAME, "r") as f: + config_json = f.read() + + # Note: we should NOT use a custom JSON decoder. Anything that + # needs custom decoding must be handled in deserialize_keras_object. 
+ config_dict = json.loads(config_json) + if not compile: + # Disable compilation + config_dict["compile_config"] = None + # Construct the model from the configuration file in the archive. + with ObjectSharingScope(): + model = deserialize_keras_object( + config_dict, custom_objects, safe_mode=safe_mode + ) + + all_filenames = zf.namelist() + if _VARS_FNAME + ".h5" in all_filenames: + weights_store = H5IOStore( + _VARS_FNAME + ".h5", archive=zf, mode="r" + ) + elif _VARS_FNAME + ".npz" in all_filenames: + weights_store = NpzIOStore( + _VARS_FNAME + ".npz", archive=zf, mode="r" + ) + else: + raise ValueError( + f"Expected a {_VARS_FNAME}.h5 or {_VARS_FNAME}.npz file." + ) + + if len(all_filenames) > 3: + asset_store = DiskIOStore(_ASSETS_DIRNAME, archive=zf, mode="r") + else: + asset_store = None + + _load_state( + model, + weights_store=weights_store, + assets_store=asset_store, + inner_path="", + visited_trackables=set(), + ) + weights_store.close() + if asset_store: + asset_store.close() + + except Exception as e: + raise e + else: + return model + finally: + _SAVING_V3_ENABLED.value = saving_v3_enabled_value + + +def save_weights_only(model, filepath): + """Save only the weights of a model to a target filepath (.weights.h5). + + Note: only supports h5 for now. + """ + # TODO: if h5 filepath is remote, create the file in a temporary directory + # then upload it + + # API usage tracking for Keras V3 saving + keras_saving_gauge.get_cell("save_weights_v3").set(True) + + filepath = str(filepath) + if not filepath.endswith(".weights.h5"): + raise ValueError( + "Invalid `filepath` argument: expected a `.weights.h5` extension. " + f"Received: filepath={filepath}" + ) + weights_store = H5IOStore(filepath, mode="w") + _save_state( + model, + weights_store=weights_store, + assets_store=None, + inner_path="", + visited_trackables=set(), + ) + weights_store.close() + + +def load_weights_only(model, filepath, skip_mismatch=False): + """Load the weights of a model from a filepath (.keras or .weights.h5). + + Note: only supports h5 for now. 
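A usage sketch for the two weights-only entry points above: the `.weights.h5` suffix is mandatory, and loading requires an already-built model of matching structure. Model shapes and file names here are illustrative.

```python
import numpy as np

import keras
from keras.saving import saving_lib

model = keras.Sequential([keras.layers.Dense(4, input_shape=(3,))])
saving_lib.save_weights_only(model, "model.weights.h5")

# Rebuild an identical architecture and restore its weights.
clone = keras.Sequential([keras.layers.Dense(4, input_shape=(3,))])
saving_lib.load_weights_only(clone, "model.weights.h5")

x = np.random.random((2, 3)).astype("float32")
np.testing.assert_allclose(model.predict(x), clone.predict(x))
```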
+ """ + temp_dir = None + archive = None + filepath = str(filepath) + if filepath.endswith(".weights.h5"): + # TODO: download file if h5 filepath is remote + weights_store = H5IOStore(filepath, mode="r") + elif filepath.endswith(".keras"): + archive = zipfile.ZipFile(filepath, "r") + weights_store = H5IOStore( + _VARS_FNAME + ".h5", archive=archive, mode="r" + ) + + _load_state( + model, + weights_store=weights_store, + assets_store=None, + inner_path="", + skip_mismatch=skip_mismatch, + visited_trackables=set(), + ) + weights_store.close() + if temp_dir and tf.io.gfile.exists(temp_dir): + tf.io.gfile.rmtree(temp_dir) + if archive: + archive.close() + + +def is_remote_path(filepath): + if re.match(r"^(/cns|/cfs|/gcs|.*://).*$", str(filepath)): + return True + return False + + +def _write_to_zip_recursively(zipfile_to_save, system_path, zip_path): + if not tf.io.gfile.isdir(system_path): + zipfile_to_save.write(system_path, zip_path) + else: + for file_name in tf.io.gfile.listdir(system_path): + system_file_path = tf.io.gfile.join(system_path, file_name) + zip_file_path = tf.io.gfile.join(zip_path, file_name) + _write_to_zip_recursively( + zipfile_to_save, system_file_path, zip_file_path + ) + + +def _walk_trackable(trackable): + for child_attr in dir(trackable): + if child_attr.startswith("__") or child_attr in ATTR_SKIPLIST: + continue + try: + child_obj = getattr(trackable, child_attr) + except Exception: + # Avoid raising the exception when visiting the attributes. + continue + yield child_attr, child_obj + + +def _save_state( + trackable, weights_store, assets_store, inner_path, visited_trackables +): + # If the trackable has already been saved, skip it. + if id(trackable) in visited_trackables: + return + + if hasattr(trackable, "save_own_variables") and weights_store: + trackable.save_own_variables(weights_store.make(inner_path)) + if hasattr(trackable, "save_assets") and assets_store: + trackable.save_assets(assets_store.make(inner_path)) + + visited_trackables.add(id(trackable)) + + # Recursively save state of children trackables (layers, optimizers, etc.) + for child_attr, child_obj in _walk_trackable(trackable): + if _is_keras_trackable(child_obj): + _save_state( + child_obj, + weights_store, + assets_store, + inner_path=tf.io.gfile.join(inner_path, child_attr), + visited_trackables=visited_trackables, + ) + elif isinstance(child_obj, (list, dict, tuple, set)): + _save_container_state( + child_obj, + weights_store, + assets_store, + inner_path=tf.io.gfile.join(inner_path, child_attr), + visited_trackables=visited_trackables, + ) + + +def _load_state( + trackable, + weights_store, + assets_store, + inner_path, + skip_mismatch=False, + visited_trackables=None, +): + if visited_trackables and id(trackable) in visited_trackables: + return + + if hasattr(trackable, "load_own_variables") and weights_store: + if skip_mismatch: + try: + trackable.load_own_variables(weights_store.get(inner_path)) + except Exception as e: + warnings.warn( + f"Could not load weights in object {trackable}. " + "Skipping object. " + f"Exception encountered: {e}", + stacklevel=2, + ) + else: + trackable.load_own_variables(weights_store.get(inner_path)) + + if hasattr(trackable, "load_assets") and assets_store: + if skip_mismatch: + try: + trackable.load_assets(assets_store.get(inner_path)) + except Exception as e: + warnings.warn( + f"Could not load assets in object {trackable}. " + "Skipping object. 
" + f"Exception encountered: {e}", + stacklevel=2, + ) + else: + trackable.load_assets(assets_store.get(inner_path)) + + if visited_trackables is not None: + visited_trackables.add(id(trackable)) + + # Recursively load states for Keras trackables such as layers/optimizers. + for child_attr, child_obj in _walk_trackable(trackable): + if _is_keras_trackable(child_obj): + _load_state( + child_obj, + weights_store, + assets_store, + inner_path=tf.io.gfile.join(inner_path, child_attr), + skip_mismatch=skip_mismatch, + visited_trackables=visited_trackables, + ) + elif isinstance(child_obj, (list, dict, tuple, set)): + _load_container_state( + child_obj, + weights_store, + assets_store, + inner_path=tf.io.gfile.join(inner_path, child_attr), + skip_mismatch=skip_mismatch, + visited_trackables=visited_trackables, + ) + + +def _save_container_state( + container, weights_store, assets_store, inner_path, visited_trackables +): + used_names = {} + if isinstance(container, dict): + container = list(container.values()) + + for trackable in container: + if _is_keras_trackable(trackable): + # Keeps layer name indexing in proper order + # when duplicate layers are in container. + if id(trackable) in visited_trackables: + continue + # Do NOT address the trackable via `trackable.name`, since + # names are usually autogenerated and thus not reproducible + # (i.e. they may vary across two instances of the same model). + name = generic_utils.to_snake_case(trackable.__class__.__name__) + if name in used_names: + used_names[name] += 1 + name = f"{name}_{used_names[name]}" + else: + used_names[name] = 0 + _save_state( + trackable, + weights_store, + assets_store, + inner_path=tf.io.gfile.join(inner_path, name), + visited_trackables=visited_trackables, + ) + + +def _load_container_state( + container, + weights_store, + assets_store, + inner_path, + skip_mismatch, + visited_trackables, +): + used_names = {} + if isinstance(container, dict): + container = list(container.values()) + + for trackable in container: + if _is_keras_trackable(trackable): + # Keeps layer name indexing in proper order + # when duplicate layers are in container. + if visited_trackables and id(trackable) in visited_trackables: + continue + # Do NOT address the trackable via `trackable.name`, since + # names are usually autogenerated and thus not reproducible + # (i.e. they may vary across two instances of the same model). + name = generic_utils.to_snake_case(trackable.__class__.__name__) + if name in used_names: + used_names[name] += 1 + name = f"{name}_{used_names[name]}" + else: + used_names[name] = 0 + _load_state( + trackable, + weights_store, + assets_store, + inner_path=tf.io.gfile.join(inner_path, name), + skip_mismatch=skip_mismatch, + visited_trackables=visited_trackables, + ) + + +class DiskIOStore: + """Asset store backed by disk storage. + + If `archive` is specified, then `root_path` refers to the filename + inside the archive. + + If `archive` is not specified, then `root_path` refers to the full path of + the target directory. 
+ """ + + def __init__(self, root_path, archive=None, mode=None): + self.mode = mode + self.root_path = root_path + self.archive = archive + self.tmp_dir = None + if self.archive: + self.tmp_dir = get_temp_dir() + if self.mode == "r": + self.archive.extractall(path=self.tmp_dir) + self.working_dir = tf.io.gfile.join(self.tmp_dir, self.root_path) + if self.mode == "w": + tf.io.gfile.makedirs(self.working_dir) + else: + if mode == "r": + self.working_dir = root_path + else: + self.tmp_dir = get_temp_dir() + self.working_dir = tf.io.gfile.join( + self.tmp_dir, self.root_path + ) + tf.io.gfile.makedirs(self.working_dir) + + def make(self, path): + if not path: + return self.working_dir + path = tf.io.gfile.join(self.working_dir, path) + if not tf.io.gfile.exists(path): + tf.io.gfile.makedirs(path) + return path + + def get(self, path): + if not path: + return self.working_dir + path = tf.io.gfile.join(self.working_dir, path) + if tf.io.gfile.exists(path): + return path + return None + + def close(self): + if self.mode == "w" and self.archive: + _write_to_zip_recursively( + self.archive, self.working_dir, self.root_path + ) + if self.tmp_dir and tf.io.gfile.exists(self.tmp_dir): + tf.io.gfile.rmtree(self.tmp_dir) + + +class H5IOStore: + def __init__(self, root_path, archive=None, mode="r"): + """Numerical variable store backed by HDF5. + + If `archive` is specified, then `root_path` refers to the filename + inside the archive. + + If `archive` is not specified, then `root_path` refers to the path of + the h5 file on disk. + """ + self.root_path = root_path + self.mode = mode + self.archive = archive + self.io_file = None + + if self.archive: + if self.mode == "w": + self.io_file = io.BytesIO() + else: + self.io_file = self.archive.open(self.root_path, "r") + self.h5_file = h5py.File(self.io_file, mode=self.mode) + else: + self.h5_file = h5py.File(root_path, mode=self.mode) + + def make(self, path): + if not path: + return self.h5_file.create_group("vars") + return self.h5_file.create_group(path).create_group("vars") + + def get(self, path): + if not path: + return self.h5_file["vars"] + if path in self.h5_file and "vars" in self.h5_file[path]: + return self.h5_file[path]["vars"] + return {} + + def close(self): + self.h5_file.close() + if self.mode == "w" and self.archive: + self.archive.writestr(self.root_path, self.io_file.getvalue()) + if self.io_file: + self.io_file.close() + + +class NpzIOStore: + def __init__(self, root_path, archive=None, mode="r"): + """Numerical variable store backed by NumPy.savez/load. + + If `archive` is specified, then `root_path` refers to the filename + inside the archive. + + If `archive` is not specified, then `root_path` refers to the path of + the npz file on disk. 
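The write path of `H5IOStore` above buffers the HDF5 file in an in-memory `BytesIO` and only flushes the bytes into the zip archive on `close()`. The same pattern in isolation (assumes `h5py` is installed; paths and group names are illustrative):

```python
import io
import zipfile

import h5py
import numpy as np

# Build the HDF5 file entirely in memory.
buf = io.BytesIO()
with h5py.File(buf, mode="w") as f:
    grp = f.create_group("dense/vars")
    grp["0"] = np.ones((3,))

# Flush the buffered bytes into the archive, as H5IOStore.close() does.
with zipfile.ZipFile("model.keras", "w") as zf:
    zf.writestr("model.weights.h5", buf.getvalue())
```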
+ """ + self.root_path = root_path + self.mode = mode + self.archive = archive + if mode == "w": + self.contents = {} + else: + if self.archive: + self.f = archive.open(root_path, mode="r") + else: + self.f = open(root_path, mode="rb") + self.contents = np.load(self.f, allow_pickle=True) + + def make(self, path): + if not path: + self.contents["__root__"] = {} + return self.contents["__root__"] + self.contents[path] = {} + return self.contents[path] + + def get(self, path): + if not path: + if "__root__" in self.contents: + return dict(self.contents["__root__"]) + return {} + if path in self.contents: + return self.contents[path].tolist() + return {} + + def close(self): + if self.mode == "w": + if self.archive: + self.f = self.archive.open( + self.root_path, mode="w", force_zip64=True + ) + else: + self.f = open(self.root_path, mode="wb") + np.savez(self.f, **self.contents) + self.f.close() + + +def get_temp_dir(): + temp_dir = tempfile.mkdtemp() + testfile = tempfile.TemporaryFile(dir=temp_dir) + testfile.close() + return temp_dir + + +def _is_keras_trackable(obj): + from keras.metrics import base_metric # To avoid circular import + + return isinstance( + obj, + ( + base_layer.Layer, + optimizer.Optimizer, + base_metric.Metric, + losses.Loss, + ), + ) + + +def saving_v3_enabled(): + return getattr(_SAVING_V3_ENABLED, "value", True) + + +# Some debugging utilities. + + +def _print_h5_file(h5_file, prefix="", action=None): + if not prefix: + print(f"Keras weights file ({h5_file}) {action}:") + if not hasattr(h5_file, "keys"): + return + for key in h5_file.keys(): + print(f"...{prefix}{key}") + _print_h5_file(h5_file[key], prefix=prefix + "...") + + +def _print_zip_file(zipfile, action): + io_utils.print_msg(f"Keras model archive {action}:") + # Same as `ZipFile.printdir()` except for using Keras' printing utility. + io_utils.print_msg( + "%-46s %19s %12s" % ("File Name", "Modified ", "Size") + ) + for zinfo in zipfile.filelist: + date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] + io_utils.print_msg( + "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size) + ) diff --git a/keras/saving/saving_lib_test.py b/keras/saving/saving_lib_test.py new file mode 100644 index 000000000000..d13c3457a59f --- /dev/null +++ b/keras/saving/saving_lib_test.py @@ -0,0 +1,886 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras python-based idempotent saving functions.""" +import os +import sys +import zipfile +from pathlib import Path +from unittest import mock + +import h5py +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized +from tensorflow.python.platform import tf_logging as logging + +import keras +from keras import backend +from keras.optimizers import adam +from keras.saving import object_registration +from keras.saving import saving_lib +from keras.saving.legacy.saved_model import json_utils +from keras.testing_infra import test_utils +from keras.utils import io_utils + +train_step_message = "This is my training step" +assets_data = "These are my assets" +variables_data = np.random.random((10,)) + + +@keras.utils.register_keras_serializable(package="my_custom_package") +class MyDense(keras.layers.Dense): + def build(self, input_shape): + self.additional_weights = [ + self.add_weight( + "my_additional_weight", + initializer="ones", + trainable=True, + ), + self.add_weight( + "my_additional_weight_2", + initializer="ones", + trainable=True, + ), + ] + self.weights_in_dict = { + "my_weight": self.add_weight( + "my_dict_weight", + initializer="ones", + trainable=True, + ), + } + self.nested_layer = keras.layers.Dense(1) + return super().build(input_shape) + + def call(self, inputs): + call_result = super().call(inputs) + return self.nested_layer(call_result) + + def two(self): + return 2 + + +@keras.utils.register_keras_serializable(package="my_custom_package") +class LayerWithCustomSaving(MyDense): + def build(self, input_shape): + self.assets = assets_data + self.stored_variables = variables_data + return super().build(input_shape) + + def save_assets(self, inner_path): + with open(os.path.join(inner_path, "assets.txt"), "w") as f: + f.write(self.assets) + + def save_own_variables(self, store): + store["variables"] = self.stored_variables + + def load_assets(self, inner_path): + with open(os.path.join(inner_path, "assets.txt"), "r") as f: + text = f.read() + self.assets = text + + def load_own_variables(self, store): + self.stored_variables = np.array(store["variables"]) + + +@keras.utils.register_keras_serializable(package="my_custom_package") +class CustomModelX(keras.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dense1 = MyDense(1) + self.dense2 = MyDense(1) + + def call(self, inputs): + out = self.dense1(inputs) + return self.dense2(out) + + def train_step(self, data): + tf.print(train_step_message) + x, y = data + with tf.GradientTape() as tape: + y_pred = self(x) + loss = self.compiled_loss(y, y_pred) + + gradients = tape.gradient(loss, self.trainable_variables) + self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) + return {} + + def one(self): + return 1 + + +@keras.utils.register_keras_serializable(package="my_custom_package") +class ModelWithCustomSaving(keras.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.custom_dense = LayerWithCustomSaving(1) + + def call(self, inputs): + return self.custom_dense(inputs) + + +@keras.utils.register_keras_serializable(package="my_custom_package") +class CompileOverridingModel(keras.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dense1 = MyDense(1) + + def compile(self, *args, **kwargs): + super().compile(*args, **kwargs) + + def call(self, inputs): + return 
self.dense1(inputs) + + +@keras.utils.register_keras_serializable(package="my_custom_package") +class CompileOverridingSequential(keras.Sequential): + def compile(self, *args, **kwargs): + super().compile(*args, **kwargs) + + +@keras.utils.register_keras_serializable(package="my_custom_package") +def my_mean_squared_error(y_true, y_pred): + """Identical to built-in `mean_squared_error`, added here as a custom + + func. + """ + return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1) + + +module_my_mean_squared_error = my_mean_squared_error + + +@test_utils.run_v2_only +class SavingV3Test(tf.test.TestCase, parameterized.TestCase): + def _get_subclassed_model(self): + subclassed_model = CustomModelX() + subclassed_model.compile( + optimizer=adam.Adam(), + loss=[ + "mse", + keras.losses.mean_squared_error, + keras.losses.MeanSquaredError(), + my_mean_squared_error, + ], + ) + return subclassed_model + + def _get_sequential_model(self): + sequential_model = keras.Sequential([MyDense(1), MyDense(1)]) + sequential_model.compile( + optimizer="adam", loss=["mse", keras.losses.mean_squared_error] + ) + return sequential_model + + def _get_functional_model(self): + inputs = keras.Input(shape=(32,)) + x = MyDense(1, name="first_dense")(inputs) + outputs = MyDense(1, name="second_dense")(x) + functional_model = keras.Model(inputs, outputs) + functional_model.compile( + optimizer="adam", loss=["mse", keras.losses.mean_squared_error] + ) + return functional_model + + def test_saving_after_compile_but_before_fit(self): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + subclassed_model = self._get_subclassed_model() + subclassed_model._save_experimental(temp_filepath) + + # This is so that we can register another function with the same custom + # object key, and make sure the newly registered function is used while + # loading. + del object_registration._GLOBAL_CUSTOM_OBJECTS[ + "my_custom_package>my_mean_squared_error" + ] + + @keras.utils.register_keras_serializable(package="my_custom_package") + def my_mean_squared_error(y_true, y_pred): + """Function-local `mean_squared_error`.""" + return backend.mean( + tf.math.squared_difference(y_pred, y_true), axis=-1 + ) + + loaded_model = saving_lib.load_model(temp_filepath) + self.assertEqual( + subclassed_model._is_compiled, loaded_model._is_compiled + ) + + # Everything should be the same class or function for the original model + # and the loaded model. + for model in [subclassed_model, loaded_model]: + self.assertIs( + model.optimizer.__class__, + adam.Adam, + ) + self.assertIs( + model.compiled_loss.__class__, + keras.engine.compile_utils.LossesContainer, + ) + self.assertEqual(model.compiled_loss._losses[0], "mse") + self.assertIs( + model.compiled_loss._losses[1], keras.losses.mean_squared_error + ) + self.assertIs( + model.compiled_loss._losses[2].__class__, + keras.losses.MeanSquaredError, + ) + self.assertIs( + model.compiled_loss._total_loss_mean.__class__, + keras.metrics.base_metric.Mean, + ) + + # Except for a custom function used because the loaded model is supposed + # to be using the newly registered custom function. 
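The deletion/re-registration trick in the test above keys off the registry entry `"my_custom_package>my_mean_squared_error"`; registration keys take the form `package>name`. A sketch, assuming `object_registration.get_registered_object` behaves as in this codebase (the function and package names below are illustrative):

```python
import keras
from keras.saving import object_registration


@keras.utils.register_keras_serializable(package="my_pkg")
def scaled_mse(y_true, y_pred):
    return keras.losses.mean_squared_error(y_true, y_pred) * 0.5


# The test deletes and re-registers a key like this to prove that custom
# object lookup happens at load time rather than at save time.
assert (
    object_registration.get_registered_object("my_pkg>scaled_mse")
    is scaled_mse
)
```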
+ self.assertIs( + subclassed_model.compiled_loss._losses[3], + module_my_mean_squared_error, + ) + self.assertIs( + loaded_model.compiled_loss._losses[3], my_mean_squared_error + ) + self.assertIsNot(module_my_mean_squared_error, my_mean_squared_error) + + def test_saving_after_fit(self): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + subclassed_model = self._get_subclassed_model() + + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + subclassed_model.fit(x, y, epochs=1) + subclassed_model._save_experimental(temp_filepath) + loaded_model = saving_lib.load_model(temp_filepath) + self.assertEqual( + subclassed_model._is_compiled, loaded_model._is_compiled + ) + + io_utils.enable_interactive_logging() + # `tf.print` writes to stderr. This is to make sure the custom training + # step is used. + with self.captureWritesToStream(sys.stderr) as printed: + loaded_model.fit(x, y, epochs=1) + self.assertRegex(printed.contents(), train_step_message) + + # Check that the custom classes do get used. + self.assertIsInstance(loaded_model, CustomModelX) + self.assertIsInstance(loaded_model.dense1, MyDense) + # Check that the custom method is available. + self.assertEqual(loaded_model.one(), 1) + self.assertEqual(loaded_model.dense1.two(), 2) + + # Everything should be the same class or function for the original model + # and the loaded model. + for model in [subclassed_model, loaded_model]: + self.assertIs( + model.optimizer.__class__, + adam.Adam, + ) + self.assertIs( + model.compiled_loss.__class__, + keras.engine.compile_utils.LossesContainer, + ) + self.assertIs( + model.compiled_loss._losses[0].__class__, + keras.losses.LossFunctionWrapper, + ) + self.assertIs( + model.compiled_loss._losses[1].__class__, + keras.losses.LossFunctionWrapper, + ) + self.assertIs( + model.compiled_loss._losses[2].__class__, + keras.losses.MeanSquaredError, + ) + self.assertIs( + model.compiled_loss._losses[3].__class__, + keras.losses.LossFunctionWrapper, + ) + self.assertIs( + model.compiled_loss._total_loss_mean.__class__, + keras.metrics.base_metric.Mean, + ) + + def test_saving_preserve_unbuilt_state(self): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + subclassed_model = CustomModelX() + subclassed_model._save_experimental(temp_filepath) + loaded_model = saving_lib.load_model(temp_filepath) + self.assertEqual( + subclassed_model._is_compiled, loaded_model._is_compiled + ) + self.assertFalse(subclassed_model.built) + self.assertFalse(loaded_model.built) + + def test_saving_preserve_built_state(self): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + model = self._get_subclassed_model() + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + model.fit(x, y, epochs=1) + model._save_experimental(temp_filepath) + loaded_model = saving_lib.load_model(temp_filepath) + self.assertEqual(model._is_compiled, loaded_model._is_compiled) + self.assertTrue(model.built) + self.assertTrue(loaded_model.built) + self.assertEqual( + model._build_input_shape, loaded_model._build_input_shape + ) + self.assertEqual( + tf.TensorShape([None, 32]), loaded_model._build_input_shape + ) + + def test_saved_module_paths_and_class_names(self): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + subclassed_model = self._get_subclassed_model() + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + subclassed_model.fit(x, y, epochs=1) + subclassed_model._save_experimental(temp_filepath) + + with 
zipfile.ZipFile(temp_filepath, "r") as z: + with z.open(saving_lib._CONFIG_FILENAME, "r") as c: + config_json = c.read() + config_dict = json_utils.decode(config_json) + self.assertEqual( + config_dict["registered_name"], "my_custom_package>CustomModelX" + ) + self.assertEqual( + config_dict["compile_config"]["optimizer"]["config"][ + "is_legacy_optimizer" + ], + False, + ) + self.assertEqual( + config_dict["compile_config"]["optimizer"]["class_name"], + "Adam", + ) + self.assertLen(config_dict["compile_config"]["loss"], 4) + self.assertEqual( + config_dict["compile_config"]["loss"][0], + "mse", + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + layer=["tf_op_lambda", "lambda"], + ) + ) + def test_functional_model_with_tf_op_lambda_layer(self, layer): + class ToString: + def __init__(self): + self.contents = "" + + def __call__(self, msg): + self.contents += msg + "\n" + + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + + if layer == "lambda": + func = tf.function(lambda x: tf.math.cos(x) + tf.math.sin(x)) + inputs = keras.layers.Input(shape=(32,)) + outputs = keras.layers.Dense(1)(inputs) + outputs = keras.layers.Lambda(func._python_function)(outputs) + + elif layer == "tf_op_lambda": + inputs = keras.layers.Input(shape=(32,)) + outputs = keras.layers.Dense(1)(inputs) + outputs = outputs + inputs + + functional_model = keras.Model(inputs, outputs) + functional_to_string = ToString() + functional_model.summary(print_fn=functional_to_string) + functional_model.compile(optimizer="adam", loss="mse", metrics=["mae"]) + + x = np.random.random((1000, 32)) + y = np.random.random((1000, 1)) + functional_model.fit(x, y, epochs=3) + functional_model._save_experimental(temp_filepath) + loaded_model = saving_lib.load_model(temp_filepath, safe_mode=False) + self.assertEqual( + functional_model._is_compiled, loaded_model._is_compiled + ) + + loaded_model.fit(x, y, epochs=3) + loaded_to_string = ToString() + loaded_model.summary(print_fn=loaded_to_string) + + # Confirming the original and saved/loaded model have same structure. + self.assertEqual( + functional_to_string.contents, loaded_to_string.contents + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + model_type=["sequential", "functional", "subclassed"], + ) + ) + def test_saving_model_state(self, model_type): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + model = getattr(self, f"_get_{model_type}_model")() + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + model.fit(x, y, epochs=1) + + # Assert that the archive has not been saved. + self.assertFalse(os.path.exists(temp_filepath)) + + # Mutate the `Dense` layer custom weights to ensure that list and + # dict-contained weights get restored. + model.layers[1].additional_weights[0].assign(2) + model.layers[1].weights_in_dict["my_weight"].assign(2) + model.layers[1].nested_layer.kernel.assign([[1]]) + + model._save_experimental(temp_filepath) + + # Assert that the archive has been saved. + self.assertTrue(os.path.exists(temp_filepath)) + loaded_model = saving_lib.load_model(temp_filepath) + self.assertEqual(model._is_compiled, loaded_model._is_compiled) + + # The weights are supposed to be the same (between original and loaded + # models). 
+ for original_weights, loaded_weights in zip( + model.get_weights(), loaded_model.get_weights() + ): + np.testing.assert_allclose(original_weights, loaded_weights) + + # The optimizer variables are supposed to be the same (between original + # and loaded models). + for original_weights, loaded_weights in zip( + model.optimizer.variables, loaded_model.optimizer.variables + ): + np.testing.assert_allclose(original_weights, loaded_weights) + + def test_saving_custom_assets_and_variables(self): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + model = ModelWithCustomSaving() + model.compile( + optimizer=adam.Adam(), + loss=[ + "mse", + keras.losses.mean_squared_error, + keras.losses.MeanSquaredError(), + my_mean_squared_error, + ], + ) + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + model.fit(x, y, epochs=1) + + # Assert that the archive has not been saved. + self.assertFalse(os.path.exists(temp_filepath)) + + model._save_experimental(temp_filepath) + + loaded_model = saving_lib.load_model(temp_filepath) + self.assertEqual(loaded_model.custom_dense.assets, assets_data) + self.assertEqual( + loaded_model.custom_dense.stored_variables.tolist(), + variables_data.tolist(), + ) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + model_type=["subclassed", "sequential"], + ) + ) + def test_compile_overridden_model_raises_if_no_from_config_overridden( + self, model_type + ): + temp_filepath = os.path.join(self.get_temp_dir(), "my_model.keras") + model = ( + CompileOverridingModel() + if model_type == "subclassed" + else CompileOverridingSequential( + [keras.layers.Embedding(4, 1), MyDense(1), MyDense(1)] + ) + ) + model.compile("rmsprop", "mse") + model._save_experimental(temp_filepath) + + with mock.patch.object(logging, "warning") as mock_warn: + saving_lib.load_model(temp_filepath) + if not mock_warn.call_args_list: + raise AssertionError("Did not warn.") + self.assertIn( + "`compile()` was not called as part of model loading " + "because the model's `compile()` method is custom. 
", + mock_warn.call_args_list[0][0][0], + ) + + def test_metadata(self): + temp_filepath = Path( + os.path.join(self.get_temp_dir(), "my_model.keras") + ) + model = CompileOverridingModel() + model._save_experimental(temp_filepath) + with zipfile.ZipFile(temp_filepath, "r") as z: + with z.open(saving_lib._METADATA_FILENAME, "r") as c: + metadata_json = c.read() + metadata = json_utils.decode(metadata_json) + self.assertIn("keras_version", metadata) + self.assertIn("date_saved", metadata) + + def test_gfile_copy_local_called(self): + temp_filepath = Path( + os.path.join(self.get_temp_dir(), "my_model.keras") + ) + model = CompileOverridingModel() + with mock.patch("re.match", autospec=True) as mock_re_match, mock.patch( + "tensorflow.compat.v2.io.gfile.copy", autospec=True + ) as mock_copy: + # Mock Remote Path check to true to test gfile copy logic + mock_re_match.return_value = True + model._save_experimental(temp_filepath) + mock_re_match.assert_called() + mock_copy.assert_called() + self.assertIn(str(temp_filepath), mock_re_match.call_args.args) + self.assertIn(str(temp_filepath), mock_copy.call_args.args) + + def test_load_model_api_endpoint(self): + temp_filepath = Path(os.path.join(self.get_temp_dir(), "mymodel.keras")) + model = self._get_functional_model() + ref_input = np.random.random((10, 32)) + ref_output = model.predict(ref_input) + model.save(temp_filepath, save_format="keras_v3") + model = keras.models.load_model(temp_filepath) + self.assertAllClose(model.predict(ref_input), ref_output, atol=1e-6) + + def test_save_load_weights_only(self): + temp_filepath = Path( + os.path.join(self.get_temp_dir(), "mymodel.weights.h5") + ) + model = self._get_functional_model() + ref_input = np.random.random((10, 32)) + ref_output = model.predict(ref_input) + saving_lib.save_weights_only(model, temp_filepath) + model = self._get_functional_model() + saving_lib.load_weights_only(model, temp_filepath) + self.assertAllClose(model.predict(ref_input), ref_output, atol=1e-6) + # Test with Model method + model = self._get_functional_model() + model.load_weights(temp_filepath) + self.assertAllClose(model.predict(ref_input), ref_output, atol=1e-6) + + def test_load_weights_only_with_keras_file(self): + # Test loading weights from whole saved model + temp_filepath = Path(os.path.join(self.get_temp_dir(), "mymodel.keras")) + model = self._get_functional_model() + ref_input = np.random.random((10, 32)) + ref_output = model.predict(ref_input) + saving_lib.save_model(model, temp_filepath) + model = self._get_functional_model() + saving_lib.load_weights_only(model, temp_filepath) + self.assertAllClose(model.predict(ref_input), ref_output, atol=1e-6) + # Test with Model method + model = self._get_functional_model() + model.load_weights(temp_filepath) + self.assertAllClose(model.predict(ref_input), ref_output, atol=1e-6) + + def test_compile_arg(self): + temp_filepath = os.path.join(self.get_temp_dir(), "mymodel.keras") + model = self._get_functional_model() + model.compile("rmsprop", "mse") + model.fit(np.random.random((10, 32)), np.random.random((10, 1))) + saving_lib.save_model(model, temp_filepath) + + model = saving_lib.load_model(temp_filepath) + self.assertEqual(model._is_compiled, True) + model = saving_lib.load_model(temp_filepath, compile=False) + self.assertEqual(model._is_compiled, False) + + def test_overwrite(self): + temp_filepath = os.path.join(self.get_temp_dir(), "mymodel.keras") + model = self._get_functional_model() + model.save(temp_filepath, save_format="keras_v3") + 
model.save(temp_filepath, save_format="keras_v3", overwrite=True) + with self.assertRaises(EOFError): + model.save(temp_filepath, save_format="keras_v3", overwrite=False) + + temp_filepath = os.path.join(self.get_temp_dir(), "mymodel.weights.h5") + model = self._get_functional_model() + model.save_weights(temp_filepath) + model.save_weights(temp_filepath, overwrite=True) + with self.assertRaises(EOFError): + model.save_weights(temp_filepath, overwrite=False) + + def test_partial_load(self): + temp_filepath = os.path.join(self.get_temp_dir(), "mymodel.keras") + original_model = keras.Sequential( + [ + keras.Input(shape=(3,)), + keras.layers.Dense(4), + keras.layers.Dense(5), + ] + ) + original_model.save(temp_filepath, save_format="keras_v3") + + # Test with a model that has a differently shaped layer + new_model = keras.Sequential( + [ + keras.Input(shape=(3,)), + keras.layers.Dense(4), + keras.layers.Dense(6), + ] + ) + new_layer_kernel_value = new_model.layers[1].kernel.numpy() + with self.assertRaisesRegex(ValueError, "Shape mismatch"): + # Doesn't work by default + new_model.load_weights(temp_filepath) + # Now it works + new_model.load_weights(temp_filepath, skip_mismatch=True) + self.assertAllClose( + original_model.layers[0].get_weights(), + new_model.layers[0].get_weights(), + ) + self.assertAllClose( + new_model.layers[1].kernel.numpy(), new_layer_kernel_value + ) + + # Test with a model that has a new layer + new_model = keras.Sequential( + [ + keras.Input(shape=(3,)), + keras.layers.Dense(4), + keras.layers.Dense(5), + keras.layers.Dense(5), + ] + ) + new_layer_kernel_value = new_model.layers[2].kernel.numpy() + with self.assertRaisesRegex(ValueError, "received 0 variables"): + # Doesn't work by default + new_model.load_weights(temp_filepath) + # Now it works + new_model.load_weights(temp_filepath, skip_mismatch=True) + self.assertAllClose( + original_model.layers[0].get_weights(), + new_model.layers[0].get_weights(), + ) + self.assertAllClose( + original_model.layers[1].get_weights(), + new_model.layers[1].get_weights(), + ) + self.assertAllClose( + new_model.layers[2].kernel.numpy(), new_layer_kernel_value + ) + + def test_api_errors(self): + temp_filepath = os.path.join(self.get_temp_dir(), "mymodel.notkeras") + model = self._get_functional_model() + with self.assertRaisesRegex(ValueError, "Unknown `save_format`"): + model.save(temp_filepath, save_format="invalid") + with self.assertRaisesRegex(ValueError, "Invalid `filepath` argument"): + model.save(temp_filepath, save_format="keras_v3") + + temp_filepath = os.path.join(self.get_temp_dir(), "mymodel.keras") + with self.assertRaisesRegex(ValueError, "not supported"): + model.save( + temp_filepath, include_optimizer=False, save_format="keras_v3" + ) + + def test_safe_mode(self): + temp_filepath = os.path.join(self.get_temp_dir(), "unsafe_model.keras") + model = keras.Sequential( + [ + keras.Input(shape=(3,)), + keras.layers.Lambda(lambda x: x * 2), + ] + ) + model.save(temp_filepath, save_format="keras_v3") + with self.assertRaisesRegex(ValueError, "arbitrary code execution"): + model = saving_lib.load_model(temp_filepath) + model = saving_lib.load_model(temp_filepath, safe_mode=False) + + def test_normalization_kpl(self): + # With adapt + temp_filepath = os.path.join(self.get_temp_dir(), "norm_model.keras") + model = keras.Sequential( + [ + keras.Input(shape=(3,)), + keras.layers.Normalization(), + ] + ) + data = np.random.random((3, 3)) + model.layers[0].adapt(data) + ref_out = model(data) + model.save(temp_filepath, 
save_format="keras_v3") + model = saving_lib.load_model(temp_filepath) + out = model(data) + self.assertAllClose(ref_out, out, atol=1e-6) + + # Without adapt + model = keras.Sequential( + [ + keras.Input(shape=(3,)), + keras.layers.Normalization( + mean=np.random.random((3,)), variance=np.random.random((3,)) + ), + ] + ) + ref_out = model(data) + model.save(temp_filepath, save_format="keras_v3") + model = saving_lib.load_model(temp_filepath) + out = model(data) + self.assertAllClose(ref_out, out, atol=1e-6) + + def test_layer_index_naming(self): + weights_filepath = os.path.join(self.get_temp_dir(), "model.weights.h5") + model = keras.Sequential( + [ + keras.layers.Dense(10), + keras.layers.Dense(10), + keras.layers.Dense(10), + keras.layers.Dense(10), + ] + ) + model.build([1, 20]) + model.save_weights(weights_filepath) + with h5py.File(weights_filepath, "r") as f: + self.assertAllEqual( + list(f["_layer_checkpoint_dependencies"].keys()), + ["dense", "dense_1", "dense_2", "dense_3"], + ) + + +# This custom class lacks custom object registration. +class CustomRNN(keras.layers.Layer): + def __init__(self, units): + super(CustomRNN, self).__init__() + self.units = units + self.projection_1 = keras.layers.Dense(units=units, activation="tanh") + self.projection_2 = keras.layers.Dense(units=units, activation="tanh") + self.classifier = keras.layers.Dense(1) + + def call(self, inputs): + outputs = [] + state = tf.zeros(shape=(inputs.shape[0], self.units)) + for t in range(inputs.shape[1]): + x = inputs[:, t, :] + h = self.projection_1(x) + y = h + self.projection_2(state) + state = y + outputs.append(y) + features = tf.stack(outputs, axis=1) + return self.classifier(features) + + +# This class is properly registered with a `get_config()` method. +# However, since it does not subclass keras.layers.Layer, it lacks +# `from_config()` for deserialization. +@keras.utils.register_keras_serializable() +class GrowthFactor: + def __init__(self, factor): + self.factor = factor + + def __call__(self, inputs): + return inputs * self.factor + + def get_config(self): + return {"factor": self.factor} + + +@keras.utils.register_keras_serializable(package="Complex") +class FactorLayer(keras.layers.Layer): + def __init__(self, factor): + super().__init__() + self.factor = factor + + def call(self, x): + return x * self.factor + + def get_config(self): + return {"factor": self.factor} + + +# This custom model does not explicitly deserialize the layers it includes +# in its `get_config`. Explicit deserialization in a `from_config` override +# or `__init__` is needed here, or an error will be thrown at loading time. 
+@keras.utils.register_keras_serializable(package="Complex") +class ComplexModel(keras.layers.Layer): + def __init__(self, first_layer, second_layer=None, **kwargs): + super().__init__(**kwargs) + self.first_layer = first_layer + if second_layer is not None: + self.second_layer = second_layer + else: + self.second_layer = keras.layers.Dense(8) + + def get_config(self): + config = super().get_config() + config.update( + { + "first_layer": self.first_layer, + "second_layer": self.second_layer, + } + ) + return config + + def call(self, inputs): + return self.first_layer(self.second_layer(inputs)) + + +@test_utils.run_v2_only +class SavingV3BattleTest(tf.test.TestCase, parameterized.TestCase): + def test_custom_model_without_registration_error(self): + temp_filepath = os.path.join( + self.get_temp_dir(), "my_custom_model.keras" + ) + timesteps = 10 + input_dim = 5 + batch_size = 16 + + inputs = keras.Input(batch_shape=(batch_size, timesteps, input_dim)) + x = keras.layers.Conv1D(32, 3)(inputs) + outputs = CustomRNN(32)(x) + + model = keras.Model(inputs, outputs) + + with self.assertRaisesRegex( + TypeError, "is a custom class, please register it" + ): + model.save(temp_filepath) + _ = keras.models.load_model(temp_filepath) + + def test_custom_object_without_from_config(self): + temp_filepath = os.path.join( + self.get_temp_dir(), "custom_fn_model.keras" + ) + + inputs = keras.Input(shape=(4, 4)) + outputs = keras.layers.Dense(1, activation=GrowthFactor(0.5))(inputs) + model = keras.Model(inputs, outputs) + + model.save(temp_filepath) + + with self.assertRaisesRegex( + TypeError, "Unable to reconstruct an instance" + ): + _ = keras.models.load_model(temp_filepath) + + def test_complex_model_without_explicit_deserialization(self): + temp_filepath = os.path.join(self.get_temp_dir(), "complex_model.keras") + + inputs = keras.Input((32,)) + outputs = ComplexModel(first_layer=FactorLayer(0.5))(inputs) + model = keras.Model(inputs, outputs) + + model.save(temp_filepath) + + with self.assertRaisesRegex(TypeError, "are explicitly deserialized"): + _ = keras.models.load_model(temp_filepath) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/saving/saving_utils.py b/keras/saving/saving_utils.py deleted file mode 100644 index 9dd5e4290698..000000000000 --- a/keras/saving/saving_utils.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
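`ComplexModel` above is intentionally broken: the battle test asserts that nested layers passed through `get_config` must be explicitly deserialized. A hypothetical corrected variant (not part of this change) showing the `__init__`-based fix that the error message asks for:

```python
import keras


@keras.utils.register_keras_serializable(package="Complex")
class ComplexModelFixed(keras.layers.Layer):
    """Hypothetical corrected variant of ComplexModel above."""

    def __init__(self, first_layer, second_layer=None, **kwargs):
        super().__init__(**kwargs)
        # Nested layers may arrive as serialized config dicts when the
        # model is reloaded; deserialize them explicitly.
        if isinstance(first_layer, dict):
            first_layer = keras.layers.deserialize(first_layer)
        if isinstance(second_layer, dict):
            second_layer = keras.layers.deserialize(second_layer)
        self.first_layer = first_layer
        self.second_layer = (
            second_layer
            if second_layer is not None
            else keras.layers.Dense(8)
        )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "first_layer": self.first_layer,
                "second_layer": self.second_layer,
            }
        )
        return config

    def call(self, inputs):
        return self.first_layer(self.second_layer(inputs))
```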
-# ============================================================================== -"""Utils related to keras model saving.""" - -# pylint: disable=g-bad-import-order, g-direct-tensorflow-import -import tensorflow.compat.v2 as tf -import keras - -import copy -import os -from keras import backend -from keras import losses -from keras.optimizers import optimizer_v1 -from keras import optimizers -from keras.engine import base_layer_utils -from keras.utils import generic_utils -from keras.utils import version_utils -from keras.utils.io_utils import ask_to_proceed_with_overwrite -from tensorflow.python.platform import tf_logging as logging -# pylint: enable=g-bad-import-order, g-direct-tensorflow-import - - -def extract_model_metrics(model): - """Convert metrics from a Keras model `compile` API to dictionary. - - This is used for converting Keras models to Estimators and SavedModels. - - Args: - model: A `tf.keras.Model` object. - - Returns: - Dictionary mapping metric names to metric instances. May return `None` if - the model does not contain any metrics. - """ - if getattr(model, '_compile_metrics', None): - # TODO(psv/kathywu): use this implementation in model to estimator flow. - # We are not using model.metrics here because we want to exclude the metrics - # added using `add_metric` API. - return {m.name: m for m in model._compile_metric_functions} # pylint: disable=protected-access - return None - - -def model_call_inputs(model, keep_original_batch_size=False): - """Inspect model to get its input signature. - - The model's input signature is a list with a single (possibly-nested) object. - This is due to the Keras-enforced restriction that tensor inputs must be - passed in as the first argument. - - For example, a model with input {'feature1': , 'feature2': } - will have input signature: [{'feature1': TensorSpec, 'feature2': TensorSpec}] - - Args: - model: Keras Model object. - keep_original_batch_size: A boolean indicating whether we want to keep using - the original batch size or set it to None. Default is `False`, which means - that the batch dim of the returned input signature will always be set to - `None`. - - Returns: - A tuple containing `(args, kwargs)` TensorSpecs of the model call function - inputs. - `kwargs` does not contain the `training` argument. - """ - input_specs = model.save_spec(dynamic_batch=not keep_original_batch_size) - if input_specs is None: - return None, None - input_specs = _enforce_names_consistency(input_specs) - return input_specs - - -def raise_model_input_error(model): - if isinstance(model, keras.models.Sequential): - raise ValueError( - f'Model {model} cannot be saved because the input shape is not ' - 'available. Please specify an input shape either by calling ' - '`build(input_shape)` directly, or by calling the model on actual ' - 'data using `Model()`, `Model.fit()`, or `Model.predict()`.') - - # If the model is not a `Sequential`, it is intended to be a subclassed model. - raise ValueError( - f'Model {model} cannot be saved either because the input shape is not ' - 'available or because the forward pass of the model is not defined.' - 'To define a forward pass, please override `Model.call()`. To specify ' - 'an input shape, either call `build(input_shape)` directly, or call ' - 'the model on actual data using `Model()`, `Model.fit()`, or ' - '`Model.predict()`. If you have a custom training step, please make ' - 'sure to invoke the forward pass in train step through ' - '`Model.__call__`, i.e. 
`model(inputs)`, as opposed to `model.call()`.') - - -def trace_model_call(model, input_signature=None): - """Trace the model call to create a tf.function for exporting a Keras model. - - Args: - model: A Keras model. - input_signature: optional, a list of tf.TensorSpec objects specifying the - inputs to the model. - - Returns: - A tf.function wrapping the model's call function with input signatures set. - - Raises: - ValueError: if input signature cannot be inferred from the model. - """ - if input_signature is None: - if isinstance(model.call, tf.__internal__.function.Function): - input_signature = model.call.input_signature - - if input_signature: - model_args = input_signature - model_kwargs = {} - else: - model_args, model_kwargs = model_call_inputs(model) - - if model_args is None: - raise_model_input_error(model) - - @tf.function - def _wrapped_model(*args, **kwargs): - """A concrete tf.function that wraps the model's call function.""" - args, kwargs = model._call_spec.set_arg_value( # pylint: disable=protected-access - 'training', False, args, kwargs, inputs_in_args=True) - - with base_layer_utils.call_context().enter( - model, inputs=None, build_graph=False, training=False, saving=True): - outputs = model(*args, **kwargs) - - # Outputs always has to be a flat dict. - output_names = model.output_names # Functional Model. - if output_names is None: # Subclassed Model. - from keras.engine import compile_utils # pylint: disable=g-import-not-at-top - output_names = compile_utils.create_pseudo_output_names(outputs) - outputs = tf.nest.flatten(outputs) - return {name: output for name, output in zip(output_names, outputs)} - - return _wrapped_model.get_concrete_function(*model_args, **model_kwargs) - - -def model_metadata(model, include_optimizer=True, require_config=True): - """Returns a dictionary containing the model metadata.""" - from keras import __version__ as keras_version # pylint: disable=g-import-not-at-top - from keras.optimizers.optimizer_v2 import optimizer_v2 # pylint: disable=g-import-not-at-top - - model_config = {'class_name': model.__class__.__name__} - try: - model_config['config'] = model.get_config() - except NotImplementedError as e: - if require_config: - raise e - - metadata = dict( - keras_version=str(keras_version), - backend=backend.backend(), - model_config=model_config) - if model.optimizer and include_optimizer: - if isinstance(model.optimizer, optimizer_v1.TFOptimizer): - logging.warning( - 'TensorFlow optimizers do not ' - 'make it possible to access ' - 'optimizer attributes or optimizer state ' - 'after instantiation. ' - 'As a result, we cannot save the optimizer ' - 'as part of the model save file. ' - 'You will have to compile your model again after loading it. ' - 'Prefer using a Keras optimizer instead ' - '(see keras.io/optimizers).') - elif model._compile_was_called: # pylint: disable=protected-access - training_config = model._get_compile_args(user_metrics=False) # pylint: disable=protected-access - training_config.pop('optimizer', None) # Handled separately. - metadata['training_config'] = _serialize_nested_config(training_config) - if isinstance(model.optimizer, optimizer_v2.RestoredOptimizer): - raise NotImplementedError( - 'Optimizers loaded from a SavedModel cannot be saved. ' - 'If you are calling `model.save` or `tf.keras.models.save_model`, ' - 'please set the `include_optimizer` option to `False`. 
For ' - '`tf.saved_model.save`, delete the optimizer from the model.') - else: - optimizer_config = { - 'class_name': - generic_utils.get_registered_name(model.optimizer.__class__), - 'config': - model.optimizer.get_config() - } - metadata['training_config']['optimizer_config'] = optimizer_config - return metadata - - -def should_overwrite(filepath, overwrite): - """Returns whether the filepath should be overwritten.""" - # If file exists and should not be overwritten. - if not overwrite and os.path.isfile(filepath): - return ask_to_proceed_with_overwrite(filepath) - return True - - -def compile_args_from_training_config(training_config, custom_objects=None): - """Return model.compile arguments from training config.""" - if custom_objects is None: - custom_objects = {} - - with generic_utils.CustomObjectScope(custom_objects): - optimizer_config = training_config['optimizer_config'] - optimizer = optimizers.deserialize(optimizer_config) - - # Recover losses. - loss = None - loss_config = training_config.get('loss', None) - if loss_config is not None: - loss = _deserialize_nested_config(losses.deserialize, loss_config) - - # Recover metrics. - metrics = None - metrics_config = training_config.get('metrics', None) - if metrics_config is not None: - metrics = _deserialize_nested_config(_deserialize_metric, metrics_config) - - # Recover weighted metrics. - weighted_metrics = None - weighted_metrics_config = training_config.get('weighted_metrics', None) - if weighted_metrics_config is not None: - weighted_metrics = _deserialize_nested_config(_deserialize_metric, - weighted_metrics_config) - - sample_weight_mode = training_config['sample_weight_mode'] if hasattr( - training_config, 'sample_weight_mode') else None - loss_weights = training_config['loss_weights'] - - return dict( - optimizer=optimizer, - loss=loss, - metrics=metrics, - weighted_metrics=weighted_metrics, - loss_weights=loss_weights, - sample_weight_mode=sample_weight_mode) - - -def _deserialize_nested_config(deserialize_fn, config): - """Deserializes arbitrary Keras `config` using `deserialize_fn`.""" - - def _is_single_object(obj): - if isinstance(obj, dict) and 'class_name' in obj: - return True # Serialized Keras object. - if isinstance(obj, str): - return True # Serialized function or string. - return False - - if config is None: - return None - if _is_single_object(config): - return deserialize_fn(config) - elif isinstance(config, dict): - return { - k: _deserialize_nested_config(deserialize_fn, v) - for k, v in config.items() - } - elif isinstance(config, (tuple, list)): - return [_deserialize_nested_config(deserialize_fn, obj) for obj in config] - - raise ValueError( - 'Saved configuration not understood. Configuration should be a ' - f'dictionary, string, tuple or list. Received: config={config}.') - - -def _serialize_nested_config(config): - """Serialized a nested structure of Keras objects.""" - - def _serialize_fn(obj): - if callable(obj): - return generic_utils.serialize_keras_object(obj) - return obj - - return tf.nest.map_structure(_serialize_fn, config) - - -def _deserialize_metric(metric_config): - """Deserialize metrics, leaving special strings untouched.""" - from keras import metrics as metrics_module # pylint:disable=g-import-not-at-top - if metric_config in ['accuracy', 'acc', 'crossentropy', 'ce']: - # Do not deserialize accuracy and cross-entropy strings as we have special - # case handling for these in compile, based on model output shape. 
- return metric_config - return metrics_module.deserialize(metric_config) - - -def _enforce_names_consistency(specs): - """Enforces that either all specs have names or none do.""" - - def _has_name(spec): - return spec is None or (hasattr(spec, 'name') and spec.name is not None) - - def _clear_name(spec): - spec = copy.deepcopy(spec) - if hasattr(spec, 'name'): - spec._name = None # pylint:disable=protected-access - return spec - - flat_specs = tf.nest.flatten(specs) - name_inconsistency = ( - any(_has_name(s) for s in flat_specs) and - not all(_has_name(s) for s in flat_specs)) - - if name_inconsistency: - specs = tf.nest.map_structure(_clear_name, specs) - return specs - - -def try_build_compiled_arguments(model): - if (not version_utils.is_v1_layer_or_model(model) and - model.outputs is not None): - try: - if not model.compiled_loss.built: - model.compiled_loss.build(model.outputs) - if not model.compiled_metrics.built: - model.compiled_metrics.build(model.outputs, model.outputs) - except: # pylint: disable=bare-except - logging.warning( - 'Compiled the loaded model, but the compiled metrics have yet to ' - 'be built. `model.compile_metrics` will be empty until you train ' - 'or evaluate the model.') - - -def is_hdf5_filepath(filepath): - return (filepath.endswith('.h5') or filepath.endswith('.keras') or - filepath.endswith('.hdf5')) diff --git a/keras/saving/saving_utils_test.py b/keras/saving/saving_utils_test.py deleted file mode 100644 index f9bb9939db35..000000000000 --- a/keras/saving/saving_utils_test.py +++ /dev/null @@ -1,502 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for saving utility functions.""" - -import tensorflow.compat.v2 as tf - -import os - -import numpy as np - -import keras -from keras import backend -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils -from keras.engine import sequential -from keras.feature_column import dense_features -from keras.optimizers.optimizer_v2 import gradient_descent -from keras.saving import saving_utils - - -class TraceModelCallTest(test_combinations.TestCase): - - def _assert_all_close(self, expected, actual): - if not tf.executing_eagerly(): - with self.cached_session() as sess: - backend._initialize_variables(sess) - self.assertAllClose(expected, actual) - else: - self.assertAllClose(expected, actual) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_trace_model_outputs(self): - input_dim = 5 if test_utils.get_model_type() == 'functional' else None - model = test_utils.get_small_mlp(10, 3, input_dim) - inputs = tf.ones((8, 5)) - - if input_dim is None: - with self.assertRaisesRegex(ValueError, '.*input shape is not availabl*'): - saving_utils.trace_model_call(model) - model._set_inputs(inputs) - - fn = saving_utils.trace_model_call(model) - signature_outputs = fn(inputs) - if model.output_names: - expected_outputs = {model.output_names[0]: model(inputs)} - else: - expected_outputs = {'output_1': model(inputs)} - - self._assert_all_close(expected_outputs, signature_outputs) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_trace_model_outputs_after_fitting(self): - input_dim = 5 if test_utils.get_model_type() == 'functional' else None - model = test_utils.get_small_mlp(10, 3, input_dim) - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit( - x=np.random.random((8, 5)).astype(np.float32), - y=np.random.random((8, 3)).astype(np.float32), - epochs=2) - - inputs = tf.ones((8, 5)) - - fn = saving_utils.trace_model_call(model) - signature_outputs = fn(inputs) - if model.output_names: - expected_outputs = {model.output_names[0]: model(inputs)} - else: - expected_outputs = {'output_1': model(inputs)} - - self._assert_all_close(expected_outputs, signature_outputs) - - @test_combinations.run_with_all_model_types(exclude_models='sequential') - @test_combinations.run_all_keras_modes - def test_trace_multi_io_model_outputs(self): - input_dim = 5 - num_classes = 3 - num_classes_b = 4 - input_a = keras.layers.Input(shape=(input_dim,), name='input_a') - input_b = keras.layers.Input(shape=(input_dim,), name='input_b') - - dense = keras.layers.Dense(num_classes, name='dense') - dense2 = keras.layers.Dense(num_classes_b, name='dense2') - dropout = keras.layers.Dropout(0.5, name='dropout') - branch_a = [input_a, dense] - branch_b = [input_b, dense, dense2, dropout] - - model = test_utils.get_multi_io_model(branch_a, branch_b) - - input_a_ts = tf.constant( - np.random.random((10, input_dim)).astype(np.float32)) - input_b_ts = tf.constant( - np.random.random((10, input_dim)).astype(np.float32)) - - if test_utils.get_model_type() == 'subclass': - with self.assertRaisesRegex(ValueError, '.*input shape is not availabl*'): - saving_utils.trace_model_call(model) - - model.compile( - optimizer='sgd', - loss='mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x=[np.random.random((8, input_dim)).astype(np.float32), - np.random.random((8, 
input_dim)).astype(np.float32)], - y=[np.random.random((8, num_classes)).astype(np.float32), - np.random.random((8, num_classes_b)).astype(np.float32)], - epochs=2) - - fn = saving_utils.trace_model_call(model) - # tf.function requires that the input structures match when calling a - # ConcreteFunction. For some reason V1 models define the inputs as a list, - # while V2 models set the inputs as a tuple. - if (not tf.executing_eagerly() and - test_utils.get_model_type() != 'functional'): - signature_outputs = fn([input_a_ts, input_b_ts]) - else: - signature_outputs = fn((input_a_ts, input_b_ts)) - outputs = model([input_a_ts, input_b_ts]) - if model.output_names: - expected_outputs = { - model.output_names[0]: outputs[0], - model.output_names[1]: outputs[1] - } - else: - expected_outputs = {'output_1': outputs[0], 'output_2': outputs[1]} - self._assert_all_close(expected_outputs, signature_outputs) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_trace_features_layer(self): - columns = [tf.feature_column.numeric_column('x')] - model = sequential.Sequential([dense_features.DenseFeatures(columns)]) - model_input = {'x': tf.constant([[1.]])} - model.predict(model_input, steps=1) - fn = saving_utils.trace_model_call(model) - self.assertAllClose({'output_1': [[1.]]}, fn(model_input)) - - columns = [ - tf.feature_column.numeric_column('x'), - tf.feature_column.numeric_column('y') - ] - model = sequential.Sequential([dense_features.DenseFeatures(columns)]) - model_input = {'x': tf.constant([[1.]]), - 'y': tf.constant([[2.]])} - model.predict(model_input, steps=1) - fn = saving_utils.trace_model_call(model) - self.assertAllClose({'output_1': [[1., 2.]]}, fn(model_input)) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_specify_input_signature(self): - model = test_utils.get_small_sequential_mlp(10, 3, None) - inputs = tf.ones((8, 5)) - - with self.assertRaisesRegex(ValueError, '.*input shape is not availabl*'): - saving_utils.trace_model_call(model) - - fn = saving_utils.trace_model_call( - model, [tf.TensorSpec(shape=[None, 5], dtype=tf.float32)]) - signature_outputs = fn(inputs) - if model.output_names: - expected_outputs = {model.output_names[0]: model(inputs)} - else: - expected_outputs = {'output_1': model(inputs)} - self._assert_all_close(expected_outputs, signature_outputs) - - @test_combinations.generate( - test_combinations.combine(mode=['graph', 'eager'])) - def test_subclassed_model_with_input_signature(self): - - class Model(keras.Model): - - def __init__(self): - super().__init__() - self.dense = keras.layers.Dense(3, name='dense') - - @tf.function( - input_signature=[[tf.TensorSpec([None, 5], tf.float32), - tf.TensorSpec([None], tf.float32)]],) - def call(self, inputs, *args): - x, y = inputs - return self.dense(x) + y - - model = Model() - fn = saving_utils.trace_model_call(model) - x = tf.ones((8, 5), dtype=tf.float32) - y = tf.ones((3,), dtype=tf.float32) - expected_outputs = {'output_1': model([x, y])} - signature_outputs = fn([x, y]) - self._assert_all_close(expected_outputs, signature_outputs) - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def test_model_with_fixed_input_dim(self): - """Ensure that the batch_dim is removed when saving. - - When serving or retraining, it is important to reset the batch dim. - This can be an issue inside of tf.function. See b/132783590 for context.
- """ - model = test_utils.get_small_mlp(10, 3, 5) - - loss_object = keras.losses.MeanSquaredError() - optimizer = gradient_descent.SGD() - - @tf.function - def train_step(data, labels): - with tf.GradientTape() as tape: - predictions = model(data) - loss = loss_object(labels, predictions) - gradients = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - - x = np.random.random((8, 5)) - y = np.random.random((8, 3)) - - train_step(x, y) - - fn = saving_utils.trace_model_call(model) - self.assertEqual(fn.structured_input_signature[0][0].shape.as_list(), - tf.TensorShape([None, 5]).as_list()) - - -def _import_and_infer(save_dir, inputs): - """Import a SavedModel into a TF 1.x-style graph and run `signature_key`.""" - graph = tf.Graph() - with graph.as_default(), tf.compat.v1.Session() as session: - model = tf.compat.v1.saved_model.load(session, [tf.saved_model.SERVING], save_dir) - signature = model.signature_def[ - tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY] - assert set(inputs.keys()) == set( - signature.inputs.keys()), ('expected {}, found {}'.format( - signature.inputs.keys(), inputs.keys())) - feed_dict = {} - for arg_name in inputs.keys(): - feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = ( - inputs[arg_name]) - output_dict = {} - for output_name, output_tensor_info in signature.outputs.items(): - output_dict[output_name] = graph.get_tensor_by_name( - output_tensor_info.name) - return session.run(output_dict, feed_dict=feed_dict) - - -class AutographedMetric(keras.metrics.Metric): - - def build(self, input_shape): - pass - - def update_state(self, values): - if tf.constant(False): - x = 1 - else: - x = 2 - return x - - def reset_states(self): - pass - - def result(self): - return tf.constant(0) - - def GetMean(self): - return tf.constant(0) - - def GetCount(self): - return tf.constant(0) - - -class BasicAutographedMetricLayer(keras.layers.Layer): - - def build(self, input_shape): - self._metric = AutographedMetric() - - def call(self, inp): - self._metric.update_state(inp) - # TODO(b/172853147): Test control flow here. - return inp - - -class BasicAutographedMetricModel(keras.models.Model): - - def __init__(self): - super().__init__(name='test_model') - self._layer = BasicAutographedMetricLayer() - - def call(self, inputs, **kwargs): - return self._layer(inputs) - - -@test_combinations.run_with_all_model_types -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class ModelSaveTest(test_combinations.TestCase): - - def test_model_save_preserves_autograph(self): - model = BasicAutographedMetricModel() - inputs = tf.ones((8, 5)) - model._set_inputs(inputs) - - save_dir = os.path.join(self.get_temp_dir(), 'saved_model') - tf.saved_model.save(model, save_dir) - - if model.output_names: - output_name = model.output_names[0] - input_name = model.input_names[0] - else: - output_name = 'output_1' - input_name = 'input_1' - - self.assertAllClose({output_name: model.predict_on_batch(inputs)}, - _import_and_infer(save_dir, - {input_name: np.ones((8, 5))})) - - # Test v2 loading. - # TODO(mdan): tests using _import_and_infer should uniformly do this. 
- self.assertAllClose(model.predict_on_batch(inputs), - tf.saved_model.load(save_dir)(inputs)) - - def test_model_save(self): - input_dim = 5 - model = test_utils.get_small_mlp(10, 3, input_dim) - inputs = tf.ones((8, 5)) - - if test_utils.get_model_type() == 'subclass': - model._set_inputs(inputs) - - save_dir = os.path.join(self.get_temp_dir(), 'saved_model') - tf.saved_model.save(model, save_dir) - - if model.output_names: - output_name = model.output_names[0] - input_name = model.input_names[0] - else: - output_name = 'output_1' - input_name = 'input_1' - - self.assertAllClose({output_name: model.predict_on_batch(inputs)}, - _import_and_infer(save_dir, - {input_name: np.ones((8, 5))})) - - -class ExtractModelMetricsTest(test_combinations.TestCase): - - def test_extract_model_metrics(self): - # saving_utils.extract_model_metrics is used in V1 only API - # keras.experimental.export_saved_model. - with tf.Graph().as_default(): - a = keras.layers.Input(shape=(3,), name='input_a') - b = keras.layers.Input(shape=(3,), name='input_b') - - dense = keras.layers.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = keras.layers.Dropout(0.5, name='dropout')(c) - - model = keras.models.Model([a, b], [d, e]) - extract_metrics = saving_utils.extract_model_metrics(model) - self.assertEqual(None, extract_metrics) - - extract_metric_names = [ - 'dense_binary_accuracy', 'dropout_binary_accuracy', - 'dense_mean_squared_error', 'dropout_mean_squared_error' - ] - if tf.__internal__.tf2.enabled(): - extract_metric_names.extend(['dense_mae', 'dropout_mae']) - else: - extract_metric_names.extend( - ['dense_mean_absolute_error', 'dropout_mean_absolute_error']) - - model_metric_names = ['loss', 'dense_loss', 'dropout_loss' - ] + extract_metric_names - model.compile( - loss='mae', - metrics=[ - keras.metrics.BinaryAccuracy(), 'mae', - keras.metrics.mean_squared_error - ], - optimizer=tf.compat.v1.train.RMSPropOptimizer(learning_rate=0.01)) - extract_metrics = saving_utils.extract_model_metrics(model) - self.assertEqual(set(model_metric_names), set(model.metrics_names)) - self.assertEqual(set(extract_metric_names), set(extract_metrics.keys())) - - -class UnbuiltModelSavingErrorMessageTest(test_combinations.TestCase): - - def setUp(self): - super().setUp() - if not tf.__internal__.tf2.enabled(): - self.skipTest('The test does not intend to cover TF1.') - - def test_sequential(self): - model = sequential.Sequential([keras.layers.Dense(10)]) - optimizer = gradient_descent.SGD() - model.compile(optimizer, loss='mse', steps_per_execution=10) - - # Forward pass not called yet. Input shape not available and thus error. - with self.assertRaisesRegex( - ValueError, - 'Model.*cannot be saved.*specify an input shape either by calling.*'): - model.save(os.path.join(self.get_temp_dir(), 'my_saved_model')) - - def test_functional(self): - inputs = keras.Input(shape=(32,)) - outputs = keras.layers.Dense(1)(inputs) - model = keras.Model(inputs, outputs) - model.compile(optimizer='adam', loss='mse', metrics=['mae']) - - x = np.random.random((1000, 32)) - y = np.random.random((1000, 1)) - model.fit(x, y, epochs=3) - - # Functional model always has an input shape, so should save just fine. 
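-    # (Contrast with test_sequential above, where the unbuilt model raises
-    # until a forward pass gives it an input shape.)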
- model.save(os.path.join(self.get_temp_dir(), 'my_saved_model')) - - def test_subclass_forward_pass_by_layer_underscore_call(self): - - class CustomModel(keras.Model): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.dense1 = keras.layers.Dense(1) - - def train_step(self, data): - x, y = data - with tf.GradientTape() as tape: - y_pred = self.dense1(x, training=True) - loss = self.compiled_loss(y, y_pred) - - gradients = tape.gradient(loss, self.trainable_variables) - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - return {} - - subclassed_model = CustomModel() - subclassed_model.compile(optimizer='adam', loss='mse') - - x = np.random.random((1000, 32)) - y = np.random.random((1000, 1)) - subclassed_model.fit(x, y, epochs=1) - - # Saving of this subclassed model is supposed to raise an error, even if - # `fit` has been called. This is because the model does not have `call()` - # overridden. Forward pass using `layer.__call__` works for training, but - # saving requires that `call()` be used. - with self.assertRaisesRegex( - ValueError, r'Model.*cannot be saved.*as opposed to `model.call\(\).*'): - subclassed_model.save(os.path.join(self.get_temp_dir(), 'my_saved_model')) - - def test_subclass_forward_pass_by_model_call(self): - - class CustomModel(keras.Model): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.dense1 = keras.layers.Dense(1) - - def call(self, inputs): - return self.dense1(inputs) - - def train_step(self, data): - x, y = data - with tf.GradientTape() as tape: - y_pred = self.call(x) - loss = self.compiled_loss(y, y_pred) - - gradients = tape.gradient(loss, self.trainable_variables) - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - return {} - - subclassed_model = CustomModel() - subclassed_model.compile(optimizer='adam', loss='mse') - - x = np.random.random((1000, 32)) - y = np.random.random((1000, 1)) - subclassed_model.fit(x, y, epochs=1) - - # Saving of this subclassed model is supposed to raise an error, even if - # `fit` has been called. This is because the model has `call()` overridden, - # but the forward pass uses `Model.call` as opposed to `Model.__call__`, and - # as a result the `Model` is not really built. The error message hints the - # user to use `Model.__call__`, i.e., `Model(inputs)` instead. - with self.assertRaisesRegex( - ValueError, r'Model.*cannot be saved.*as opposed to `model.call\(\).*'): - subclassed_model.save(os.path.join(self.get_temp_dir(), 'my_saved_model')) - - -if __name__ == '__main__': - tf.test.main() diff --git a/keras/saving/serialization_lib.py b/keras/saving/serialization_lib.py new file mode 100644 index 000000000000..6f72af9f64b7 --- /dev/null +++ b/keras/saving/serialization_lib.py @@ -0,0 +1,832 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Object config serialization and deserialization logic.""" + +import importlib +import inspect +import threading +import types +import warnings + +import numpy as np +import tensorflow.compat.v2 as tf + +from keras.saving import object_registration +from keras.saving.legacy import serialization as legacy_serialization +from keras.saving.legacy.saved_model.utils import in_tf_saved_model_scope +from keras.utils import generic_utils + +# isort: off +from tensorflow.python.util import tf_export +from tensorflow.python.util.tf_export import keras_export + +PLAIN_TYPES = (str, int, float, bool) +SHARED_OBJECTS = threading.local() +SAFE_MODE = threading.local() +# TODO(nkovela): Debug serialization of decorated functions inside lambdas +# to allow for serialization of custom_gradient. +NON_SERIALIZABLE_CLASS_MODULES = ("tensorflow.python.ops.custom_gradient",) + +# List of Keras modules with built-in string representations for Keras defaults +BUILTIN_MODULES = ( + "activations", + "constraints", + "initializers", + "losses", + "metrics", + "optimizers", + "regularizers", +) + + +class Config: + def __init__(self, **config): + self.config = config + + def serialize(self): + return serialize_keras_object(self.config) + + +class SafeModeScope: + """Scope to propagate safe mode flag to nested deserialization calls.""" + + def __init__(self, safe_mode=True): + self.safe_mode = safe_mode + + def __enter__(self): + self.original_value = in_safe_mode() + SAFE_MODE.safe_mode = self.safe_mode + + def __exit__(self, *args, **kwargs): + SAFE_MODE.safe_mode = self.original_value + + +@keras_export("keras.__internal__.enable_unsafe_deserialization") +def enable_unsafe_deserialization(): + """Disables safe mode globally, allowing deserialization of lambdas.""" + SAFE_MODE.safe_mode = False + + +def in_safe_mode(): + return getattr(SAFE_MODE, "safe_mode", None) + + +class ObjectSharingScope: + """Scope to enable detection and reuse of previously seen objects.""" + + def __enter__(self): + SHARED_OBJECTS.enabled = True + SHARED_OBJECTS.id_to_obj_map = {} + SHARED_OBJECTS.id_to_config_map = {} + + def __exit__(self, *args, **kwargs): + SHARED_OBJECTS.enabled = False + SHARED_OBJECTS.id_to_obj_map = {} + SHARED_OBJECTS.id_to_config_map = {} + + +def get_shared_object(obj_id): + """Retrieve an object previously seen during deserialization.""" + if getattr(SHARED_OBJECTS, "enabled", False): + return SHARED_OBJECTS.id_to_obj_map.get(obj_id, None) + + +def record_object_after_serialization(obj, config): + """Call after serializing an object, to keep track of its config.""" + if config["module"] == "__main__": + config["module"] = None # Ensures module is None when no module found + if not getattr(SHARED_OBJECTS, "enabled", False): + return # Not in a sharing scope + obj_id = int(id(obj)) + if obj_id not in SHARED_OBJECTS.id_to_config_map: + SHARED_OBJECTS.id_to_config_map[obj_id] = config + else: + config["shared_object_id"] = obj_id + prev_config = SHARED_OBJECTS.id_to_config_map[obj_id] + prev_config["shared_object_id"] = obj_id + + +def record_object_after_deserialization(obj, obj_id): + """Call after deserializing an object, to keep track of it in the future.""" + if not getattr(SHARED_OBJECTS, "enabled", False): + return # Not in a sharing scope + SHARED_OBJECTS.id_to_obj_map[obj_id] = obj + + +@keras_export( + "keras.saving.serialize_keras_object", "keras.utils.serialize_keras_object" +) +def serialize_keras_object(obj): + 
"""Retrieve the config dict by serializing the Keras object. + + `serialize_keras_object()` serializes a Keras object to a python dictionary + that represents the object, and is a reciprocal function of + `deserialize_keras_object()`. See `deserialize_keras_object()` for more + information about the config format. + + Args: + obj: the Keras object to serialize. + + Returns: + A python dict that represents the object. The python dict can be + deserialized via `deserialize_keras_object()`. + """ + # Fall back to legacy serialization for all TF1 users or if + # wrapped by in_tf_saved_model_scope() to explicitly use legacy + # saved_model logic. + if not tf.__internal__.tf2.enabled() or in_tf_saved_model_scope(): + return legacy_serialization.serialize_keras_object(obj) + + if obj is None: + return obj + + if isinstance(obj, PLAIN_TYPES): + return obj + + if isinstance(obj, (list, tuple)): + config_arr = [serialize_keras_object(x) for x in obj] + return tuple(config_arr) if isinstance(obj, tuple) else config_arr + if isinstance(obj, dict): + return serialize_dict(obj) + + # Special cases: + if isinstance(obj, bytes): + return { + "class_name": "__bytes__", + "config": {"value": obj.decode("utf-8")}, + } + if isinstance(obj, tf.TensorShape): + return obj.as_list() if obj._dims is not None else None + if isinstance(obj, tf.Tensor): + return { + "class_name": "__tensor__", + "config": { + "value": obj.numpy().tolist(), + "dtype": obj.dtype.name, + }, + } + if type(obj).__module__ == np.__name__: + if isinstance(obj, np.ndarray) and obj.ndim > 0: + return { + "class_name": "__numpy__", + "config": { + "value": obj.tolist(), + "dtype": obj.dtype.name, + }, + } + else: + # Treat numpy floats / etc as plain types. + return obj.item() + if isinstance(obj, tf.DType): + return obj.name + if isinstance(obj, tf.compat.v1.Dimension): + return obj.value + if isinstance(obj, types.FunctionType) and obj.__name__ == "": + warnings.warn( + "The object being serialized includes a `lambda`. This is unsafe. " + "In order to reload the object, you will have to pass " + "`safe_mode=False` to the loading function. " + "Please avoid using `lambda` in the " + "future, and use named Python functions instead. " + f"This is the `lambda` being serialized: {inspect.getsource(obj)}", + stacklevel=2, + ) + return { + "class_name": "__lambda__", + "config": { + "value": generic_utils.func_dump(obj), + }, + } + if isinstance(obj, tf.TypeSpec): + ts_config = obj._serialize() + # TensorShape and tf.DType conversion + ts_config = list( + map( + lambda x: x.as_list() + if isinstance(x, tf.TensorShape) + else (x.name if isinstance(x, tf.DType) else x), + ts_config, + ) + ) + spec_name = obj.__class__.__name__ + registered_name = None + if hasattr(obj, "_tf_extension_type_fields"): + # Special casing for ExtensionType + ts_config = tf.experimental.extension_type.as_dict(obj) + ts_config = serialize_dict(ts_config) + registered_name = object_registration.get_registered_name( + obj.__class__ + ) + return { + "class_name": "__typespec__", + "spec_name": spec_name, + "module": obj.__class__.__module__, + "config": ts_config, + "registered_name": registered_name, + } + + inner_config = _get_class_or_fn_config(obj) + config_with_public_class = serialize_with_public_class( + obj.__class__, inner_config + ) + + # TODO(nkovela): Add TF ops dispatch handler serialization for + # ops.EagerTensor that contains nested numpy array. 
+ # Target: NetworkConstructionTest.test_constant_initializer_with_numpy + if isinstance(inner_config, str) and inner_config == "op_dispatch_handler": + return obj + + if config_with_public_class is not None: + + # Special case for non-serializable class modules + if any( + mod in config_with_public_class["module"] + for mod in NON_SERIALIZABLE_CLASS_MODULES + ): + return obj + + get_build_and_compile_config(obj, config_with_public_class) + record_object_after_serialization(obj, config_with_public_class) + return config_with_public_class + + # Any custom object or otherwise non-exported object + if isinstance(obj, types.FunctionType): + module = obj.__module__ + else: + module = obj.__class__.__module__ + class_name = obj.__class__.__name__ + + if module == "builtins": + registered_name = None + else: + if isinstance(obj, types.FunctionType): + registered_name = object_registration.get_registered_name(obj) + else: + registered_name = object_registration.get_registered_name( + obj.__class__ + ) + + config = { + "module": module, + "class_name": class_name, + "config": inner_config, + "registered_name": registered_name, + } + get_build_and_compile_config(obj, config) + record_object_after_serialization(obj, config) + return config + + +def get_build_and_compile_config(obj, config): + if hasattr(obj, "get_build_config"): + build_config = obj.get_build_config() + if build_config is not None: + config["build_config"] = serialize_dict(build_config) + if hasattr(obj, "get_compile_config"): + compile_config = obj.get_compile_config() + if compile_config is not None: + config["compile_config"] = serialize_dict(compile_config) + return + + +def serialize_with_public_class(cls, inner_config=None): + """Serializes classes from public Keras API or object registration. + + Called to check and retrieve the config of any class that has a public + Keras API or has been registered as serializable via + `keras.saving.register_keras_serializable()`. + """ + # This gets the `keras.*` exported name, such as "keras.optimizers.Adam". + keras_api_name = tf_export.get_canonical_name_for_symbol( + cls, api_name="keras" + ) + + # Case of custom or unknown class object + if keras_api_name is None: + registered_name = object_registration.get_registered_name(cls) + if registered_name is None: + return None + + # Return custom object config with corresponding registration name + return { + "module": cls.__module__, + "class_name": cls.__name__, + "config": inner_config, + "registered_name": registered_name, + } + + # Split the canonical Keras API name into a Keras module and class name. + parts = keras_api_name.split(".") + return { + "module": ".".join(parts[:-1]), + "class_name": parts[-1], + "config": inner_config, + "registered_name": None, + } + + +def serialize_with_public_fn(fn, config, fn_module_name=None): + """Serializes functions from public Keras API or object registration. + + Called to check and retrieve the config of any function that has a public + Keras API or has been registered as serializable via + `keras.saving.register_keras_serializable()`. If function's module name is + already known, returns corresponding config. 
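+
+    For example, a public built-in function such as `keras.activations.relu`
+    round-trips through a config of roughly this form (an illustrative
+    sketch; the exact module path depends on the installed Keras version):
+
+    ```python
+    {
+        "module": "keras.activations",
+        "class_name": "function",
+        "config": "relu",
+        "registered_name": "relu",
+    }
+    ```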
+ """ + if fn_module_name: + return { + "module": fn_module_name, + "class_name": "function", + "config": config, + "registered_name": config, + } + keras_api_name = tf_export.get_canonical_name_for_symbol( + fn, api_name="keras" + ) + if keras_api_name: + parts = keras_api_name.split(".") + return { + "module": ".".join(parts[:-1]), + "class_name": "function", + "config": config, + "registered_name": config, + } + else: + registered_name = object_registration.get_registered_name(fn) + if not registered_name and not fn.__module__ == "builtins": + return None + return { + "module": fn.__module__, + "class_name": "function", + "config": config, + "registered_name": registered_name, + } + + +def _get_class_or_fn_config(obj): + """Return the object's config depending on its type.""" + # Functions / lambdas: + if isinstance(obj, types.FunctionType): + return obj.__name__ + # All classes: + if hasattr(obj, "get_config"): + config = obj.get_config() + if not isinstance(config, dict): + raise TypeError( + f"The `get_config()` method of {obj} should return " + f"a dict. It returned: {config}" + ) + return serialize_dict(config) + elif hasattr(obj, "__name__"): + return object_registration.get_registered_name(obj) + else: + raise TypeError( + f"Cannot serialize object {obj} of type {type(obj)}. " + "To be serializable, " + "a class must implement the `get_config()` method." + ) + + +def serialize_dict(obj): + return {key: serialize_keras_object(value) for key, value in obj.items()} + + +@keras_export( + "keras.saving.deserialize_keras_object", + "keras.utils.deserialize_keras_object", +) +def deserialize_keras_object( + config, custom_objects=None, safe_mode=True, **kwargs +): + """Retrieve the object by deserializing the config dict. + + The config dict is a Python dictionary that consists of a set of key-value + pairs, and represents a Keras object, such as an `Optimizer`, `Layer`, + `Metrics`, etc. The saving and loading library uses the following keys to + record information of a Keras object: + + - `class_name`: String. This is the name of the class, + as exactly defined in the source + code, such as "LossesContainer". + - `config`: Dict. Library-defined or user-defined key-value pairs that store + the configuration of the object, as obtained by `object.get_config()`. + - `module`: String. The path of the python module, such as + "keras.engine.compile_utils". Built-in Keras classes + expect to have prefix `keras`. + - `registered_name`: String. The key the class is registered under via + `keras.saving.register_keras_serializable(package, name)` API. The key has + the format of '{package}>{name}', where `package` and `name` are the + arguments passed to `register_keras_serializable()`. If `name` is not + provided, it uses the class name. If `registered_name` successfully + resolves to a class (that was registered), the `class_name` and `config` + values in the dict will not be used. `registered_name` is only used for + non-built-in classes. + + For example, the following dictionary represents the built-in Adam optimizer + with the relevant config: + + ```python + dict_structure = { + "class_name": "Adam", + "config": { + "amsgrad": false, + "beta_1": 0.8999999761581421, + "beta_2": 0.9990000128746033, + "decay": 0.0, + "epsilon": 1e-07, + "learning_rate": 0.0010000000474974513, + "name": "Adam" + }, + "module": "keras.optimizers", + "registered_name": None + } + # Returns an `Adam` instance identical to the original one. 
+ deserialize_keras_object(dict_structure) + ``` + + If the class does not have an exported Keras namespace, the library tracks + it by its `module` and `class_name`. For example: + + ```python + dict_structure = { + "class_name": "LossesContainer", + "config": { + "losses": [...], + "total_loss_mean": {...}, + }, + "module": "keras.engine.compile_utils", + "registered_name": "LossesContainer" + } + + # Returns a `LossesContainer` instance identical to the original one. + deserialize_keras_object(dict_structure) + ``` + + And the following dictionary represents a user-customized `MeanSquaredError` + loss: + + ```python + @keras.saving.register_keras_serializable(package='my_package') + class ModifiedMeanSquaredError(keras.losses.MeanSquaredError): + ... + + dict_structure = { + "class_name": "ModifiedMeanSquaredError", + "config": { + "fn": "mean_squared_error", + "name": "mean_squared_error", + "reduction": "auto" + }, + "registered_name": "my_package>ModifiedMeanSquaredError" + } + # Returns the `ModifiedMeanSquaredError` object + deserialize_keras_object(dict_structure) + ``` + + Args: + config: Python dict describing the object. + custom_objects: Python dict containing a mapping between custom + object names and the corresponding classes or functions. + safe_mode: Boolean, whether to disallow unsafe `lambda` deserialization. + When `safe_mode=False`, loading an object has the potential to + trigger arbitrary code execution. This argument is only + applicable to the Keras v3 model format. Defaults to `True`. + + Returns: + The object described by the `config` dictionary. + + """ + safe_scope_arg = in_safe_mode() # Enforces SafeModeScope + safe_mode = safe_scope_arg if safe_scope_arg is not None else safe_mode + + module_objects = kwargs.pop("module_objects", None) + custom_objects = custom_objects or {} + tlco = object_registration._THREAD_LOCAL_CUSTOM_OBJECTS.__dict__ + gco = object_registration._GLOBAL_CUSTOM_OBJECTS + custom_objects = {**custom_objects, **tlco, **gco} + + # Optional deprecated argument for legacy deserialization call + printable_module_name = kwargs.pop("printable_module_name", "object") + if kwargs: + raise ValueError( + "The following argument(s) are not supported: " + f"{list(kwargs.keys())}" + ) + + # Fall back to legacy deserialization for all TF1 users or if + # wrapped by in_tf_saved_model_scope() to explicitly use legacy + # saved_model logic. + if not tf.__internal__.tf2.enabled() or in_tf_saved_model_scope(): + return legacy_serialization.deserialize_keras_object( + config, module_objects, custom_objects, printable_module_name + ) + + if config is None: + return None + + if ( + isinstance(config, str) + and custom_objects + and custom_objects.get(config) is not None + ): + # This is to deserialize plain functions which are serialized as + # string names by legacy saving formats.
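+        # For example, a bare config string "custom_fn" paired with
+        # custom_objects={"custom_fn": custom_fn} resolves directly here.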
+ return custom_objects[config] + + if isinstance(config, (list, tuple)): + return [ + deserialize_keras_object( + x, custom_objects=custom_objects, safe_mode=safe_mode + ) + for x in config + ] + + if module_objects is not None: + inner_config, fn_module_name, has_custom_object = None, None, False + if isinstance(config, dict): + if "config" in config: + inner_config = config["config"] + if "class_name" not in config: + raise ValueError( + f"Unknown `config` as a `dict`, config={config}" + ) + + # Check case where config is function or class and in custom objects + if custom_objects and ( + config["class_name"] in custom_objects + or config.get("registered_name") in custom_objects + or ( + isinstance(inner_config, str) + and inner_config in custom_objects + ) + ): + has_custom_object = True + + # Case where config is function but not in custom objects + elif config["class_name"] == "function": + fn_module_name = config["module"] + if fn_module_name == "builtins": + config = config["config"] + else: + config = config["registered_name"] + + # Case where config is class but not in custom objects + else: + if config.get("module", "_") is None: + raise TypeError( + "Cannot deserialize object of type " + f"`{config['class_name']}`. If " + f"`{config['class_name']}` is a custom class, please " + "register it using the " + "`@keras.saving.register_keras_serializable()` " + "decorator." + ) + config = config["class_name"] + if not has_custom_object: + # Return if not found in either module objects or custom objects + if config not in module_objects: + # Object has already been deserialized + return config + if isinstance(module_objects[config], types.FunctionType): + return deserialize_keras_object( + serialize_with_public_fn( + module_objects[config], config, fn_module_name + ), + custom_objects=custom_objects, + ) + return deserialize_keras_object( + serialize_with_public_class( + module_objects[config], inner_config=inner_config + ), + custom_objects=custom_objects, + ) + + if isinstance(config, PLAIN_TYPES): + return config + if not isinstance(config, dict): + raise TypeError(f"Could not parse config: {config}") + + if "class_name" not in config or "config" not in config: + return { + key: deserialize_keras_object( + value, custom_objects=custom_objects, safe_mode=safe_mode + ) + for key, value in config.items() + } + + class_name = config["class_name"] + inner_config = config["config"] or {} + custom_objects = custom_objects or {} + + # Special cases: + if class_name == "__tensor__": + return tf.constant(inner_config["value"], dtype=inner_config["dtype"]) + if class_name == "__numpy__": + return np.array(inner_config["value"], dtype=inner_config["dtype"]) + if config["class_name"] == "__bytes__": + return inner_config["value"].encode("utf-8") + if config["class_name"] == "__lambda__": + if safe_mode: + raise ValueError( + "Requested the deserialization of a `lambda` object. " + "This carries a potential risk of arbitrary code execution " + "and thus it is disallowed by default. If you trust the " + "source of the saved model, you can pass `safe_mode=False` to " + "the loading function in order to allow `lambda` loading." 
+ ) + return generic_utils.func_load(inner_config["value"]) + + if config["class_name"] == "__typespec__": + cls = _retrieve_class_or_fn( + config["spec_name"], + config["registered_name"], + config["module"], + obj_type="class", + full_config=config, + custom_objects=custom_objects, + ) + + # Special casing for ExtensionType.Spec + if hasattr(cls, "_tf_extension_type_fields"): + inner_config = { + key: deserialize_keras_object( + value, custom_objects=custom_objects, safe_mode=safe_mode + ) + for key, value in inner_config.items() + } # Deserialization of dict created by ExtensionType.as_dict() + return cls(**inner_config) # Instantiate ExtensionType.Spec + + if config["registered_name"] is not None: + return cls.from_config(inner_config) + + # Conversion to TensorShape and tf.DType + inner_config = map( + lambda x: tf.TensorShape(x) + if isinstance(x, list) + else (getattr(tf, x) if hasattr(tf.dtypes, str(x)) else x), + inner_config, + ) + return cls._deserialize(tuple(inner_config)) + + # Below: classes and functions. + module = config.get("module", None) + registered_name = config.get("registered_name", class_name) + + if class_name == "function": + fn_name = inner_config + return _retrieve_class_or_fn( + fn_name, + registered_name, + module, + obj_type="function", + full_config=config, + custom_objects=custom_objects, + ) + + # Below, handling of all classes. + # First, is it a shared object? + if "shared_object_id" in config: + obj = get_shared_object(config["shared_object_id"]) + if obj is not None: + return obj + + cls = _retrieve_class_or_fn( + class_name, + registered_name, + module, + obj_type="class", + full_config=config, + custom_objects=custom_objects, + ) + + if isinstance(cls, types.FunctionType): + return cls + if not hasattr(cls, "from_config"): + raise TypeError( + f"Unable to reconstruct an instance of '{class_name}' because " + f"the class is missing a `from_config()` method. " + f"Full object config: {config}" + ) + + # Instantiate the class from its config inside a custom object scope + # so that we can catch any custom objects that the config refers to. + custom_obj_scope = object_registration.custom_object_scope(custom_objects) + safe_mode_scope = SafeModeScope(safe_mode) + with custom_obj_scope, safe_mode_scope: + instance = cls.from_config(inner_config) + build_config = config.get("build_config", None) + if build_config: + instance.build_from_config(build_config) + compile_config = config.get("compile_config", None) + if compile_config: + instance.compile_from_config(compile_config) + + if "shared_object_id" in config: + record_object_after_deserialization( + instance, config["shared_object_id"] + ) + return instance + + +def _retrieve_class_or_fn( + name, registered_name, module, obj_type, full_config, custom_objects=None +): + # If there is a custom object registered via + # `register_keras_serializable()`, that takes precedence. + if obj_type == "function": + custom_obj = object_registration.get_registered_object( + name, custom_objects=custom_objects + ) + else: + custom_obj = object_registration.get_registered_object( + registered_name, custom_objects=custom_objects + ) + if custom_obj is not None: + return custom_obj + + if module: + # If it's a Keras built-in object, + # we cannot always use direct import, because the exported + # module name might not match the package structure + # (e.g. experimental symbols). + if module == "keras" or module.startswith("keras."): + api_name = module + "." 
+ name + + # Legacy internal APIs are stored in TF API naming dict + # with `compat.v1` prefix + if "__internal__.legacy" in api_name: + api_name = "compat.v1." + api_name + + obj = tf_export.get_symbol_from_name(api_name) + if obj is not None: + return obj + + # Configs of Keras built-in functions do not contain identifying + # information other than their name (e.g. 'acc' or 'tanh'). This special + # case searches the Keras modules that contain built-ins to retrieve + # the corresponding function from the identifying string. + if obj_type == "function" and module == "builtins": + for mod in BUILTIN_MODULES: + obj = tf_export.get_symbol_from_name( + "keras." + mod + "." + name + ) + if obj is not None: + return obj + + # Retrieval of registered custom function in a package + filtered_dict = { + k: v + for k, v in custom_objects.items() + if k.endswith(full_config["config"]) + } + if filtered_dict: + return next(iter(filtered_dict.values())) + + # Otherwise, attempt to retrieve the class object given the `module` + # and `class_name`. Import the module, find the class. + try: + mod = importlib.import_module(module) + except ModuleNotFoundError: + raise TypeError( + f"Could not deserialize {obj_type} '{name}' because " + f"its parent module {module} cannot be imported. " + f"Full object config: {full_config}" + ) + obj = vars(mod).get(name, None) + + if obj is None: + # Special case for keras.metrics.metrics + if registered_name is not None: + obj = vars(mod).get(registered_name, None) + + # Support for `__qualname__` + if name.count(".") == 1: + outer_name, inner_name = name.split(".") + outer_obj = vars(mod).get(outer_name, None) + obj = ( + getattr(outer_obj, inner_name, None) + if outer_obj is not None + else None + ) + + if obj is not None: + return obj + + raise TypeError( + f"Could not locate {obj_type} '{name}'. " + "Make sure custom classes are decorated with " + "`@keras.saving.register_keras_serializable()`. " + f"Full object config: {full_config}" + ) diff --git a/keras/saving/serialization_lib_test.py b/keras/saving/serialization_lib_test.py new file mode 100644 index 000000000000..6645ee9b777f --- /dev/null +++ b/keras/saving/serialization_lib_test.py @@ -0,0 +1,488 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for serialization_lib.""" + +import json + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras.saving import serialization_lib +from keras.saving.legacy import serialization as legacy_serialization +from keras.testing_infra import test_utils + + +def custom_fn(x): + return x**2 + + +class CustomLayer(keras.layers.Layer): + def __init__(self, factor): + super().__init__() + self.factor = factor + + def call(self, x): + return x * self.factor + + def get_config(self): + return {"factor": self.factor} + + +class NestedCustomLayer(keras.layers.Layer): + def __init__(self, factor, dense=None, activation=None): + super().__init__() + self.factor = factor + + if dense is None: + self.dense = keras.layers.Dense(1, activation=custom_fn) + else: + self.dense = serialization_lib.deserialize_keras_object(dense) + if activation is None: + self.activation = keras.layers.Activation("relu") + else: + self.activation = serialization_lib.deserialize_keras_object( + activation + ) + + def call(self, x): + return self.dense(x * self.factor) + + def get_config(self): + return { + "factor": self.factor, + "dense": self.dense, + "activation": self.activation, + } + + +class WrapperLayer(keras.layers.Layer): + def __init__(self, layer, **kwargs): + super().__init__(**kwargs) + self.layer = layer + + def call(self, x): + return self.layer(x) + + def get_config(self): + config = super().get_config() + return {"layer": self.layer, **config} + + +@test_utils.run_v2_only +class SerializationLibTest(tf.test.TestCase, parameterized.TestCase): + def roundtrip(self, obj, custom_objects=None, safe_mode=True): + serialized = serialization_lib.serialize_keras_object(obj) + json_data = json.dumps(serialized) + json_data = json.loads(json_data) + deserialized = serialization_lib.deserialize_keras_object( + json_data, custom_objects=custom_objects, safe_mode=safe_mode + ) + reserialized = serialization_lib.serialize_keras_object(deserialized) + return serialized, deserialized, reserialized + + @parameterized.named_parameters( + ("str", "hello"), + ("bytes", b"hello"), + ("nparray_int", np.array([0, 1])), + ("nparray_float", np.array([0.0, 1.0])), + ("nparray_item", np.float32(1.0)), + ("plain_types_list", ["hello", 0, "world", 1.0, True]), + ("plain_types_dict", {"1": "hello", "2": 0, "3": True}), + ("plain_types_nested_dict", {"1": "hello", "2": [True, False]}), + ) + def test_simple_objects(self, obj): + serialized, _, reserialized = self.roundtrip(obj) + self.assertEqual(serialized, reserialized) + + def test_builtin_layers(self): + serialized, _, reserialized = self.roundtrip(keras.layers.Dense(3)) + self.assertEqual(serialized, reserialized) + + def test_tensors_and_tensorshape(self): + x = tf.random.normal((2, 2), dtype="float64") + obj = {"x": x} + _, new_obj, _ = self.roundtrip(obj) + self.assertAllClose(x, new_obj["x"], atol=1e-5) + + obj = {"x.shape": x.shape} + _, new_obj, _ = self.roundtrip(obj) + self.assertListEqual(x.shape.as_list(), new_obj["x.shape"]) + + def test_custom_fn(self): + obj = {"activation": custom_fn} + serialized, _, reserialized = self.roundtrip( + obj, custom_objects={"custom_fn": custom_fn} + ) + self.assertEqual(serialized, reserialized) + + # Test inside layer + dense = keras.layers.Dense(1, activation=custom_fn) + dense.build((None, 2)) + _, new_dense, _ = self.roundtrip( + dense, custom_objects={"custom_fn": custom_fn} + ) + x = 
tf.random.normal((2, 2)) + y1 = dense(x) + _ = new_dense(x) + new_dense.set_weights(dense.get_weights()) + y2 = new_dense(x) + self.assertAllClose(y1, y2, atol=1e-5) + + def test_custom_layer(self): + layer = CustomLayer(factor=2) + x = tf.random.normal((2, 2)) + y1 = layer(x) + _, new_layer, _ = self.roundtrip( + layer, custom_objects={"CustomLayer": CustomLayer} + ) + y2 = new_layer(x) + self.assertAllClose(y1, y2, atol=1e-5) + + layer = NestedCustomLayer(factor=2) + x = tf.random.normal((2, 2)) + y1 = layer(x) + _, new_layer, _ = self.roundtrip( + layer, + custom_objects={ + "NestedCustomLayer": NestedCustomLayer, + "custom_fn": custom_fn, + }, + ) + _ = new_layer(x) + new_layer.set_weights(layer.get_weights()) + y2 = new_layer(x) + self.assertAllClose(y1, y2, atol=1e-5) + + def test_lambda_fn(self): + obj = {"activation": lambda x: x**2} + with self.assertRaisesRegex(ValueError, "arbitrary code execution"): + self.roundtrip(obj, safe_mode=True) + + _, new_obj, _ = self.roundtrip(obj, safe_mode=False) + self.assertEqual(obj["activation"](3), new_obj["activation"](3)) + + def test_lambda_layer(self): + lmbda = keras.layers.Lambda(lambda x: x**2) + with self.assertRaisesRegex(ValueError, "arbitrary code execution"): + self.roundtrip(lmbda, safe_mode=True) + + _, new_lmbda, _ = self.roundtrip(lmbda, safe_mode=False) + x = tf.random.normal((2, 2)) + y1 = lmbda(x) + y2 = new_lmbda(x) + self.assertAllClose(y1, y2, atol=1e-5) + + def test_safe_mode_scope(self): + lmbda = keras.layers.Lambda(lambda x: x**2) + with serialization_lib.SafeModeScope(safe_mode=True): + with self.assertRaisesRegex(ValueError, "arbitrary code execution"): + self.roundtrip(lmbda) + with serialization_lib.SafeModeScope(safe_mode=False): + _, new_lmbda, _ = self.roundtrip(lmbda) + x = tf.random.normal((2, 2)) + y1 = lmbda(x) + y2 = new_lmbda(x) + self.assertAllClose(y1, y2, atol=1e-5) + + def test_tensorspec(self): + inputs = keras.Input(type_spec=tf.TensorSpec((2, 2), tf.float32)) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + _, new_model, _ = self.roundtrip(model) + x = tf.random.normal((2, 2)) + y1 = model(x) + new_model.set_weights(model.get_weights()) + y2 = new_model(x) + self.assertAllClose(y1, y2, atol=1e-5) + + def test_shared_inner_layer(self): + input_1 = keras.Input((2,)) + input_2 = keras.Input((2,)) + shared_layer = keras.layers.Dense(1) + output_1 = shared_layer(input_1) + wrapper_layer = WrapperLayer(shared_layer) + output_2 = wrapper_layer(input_2) + model = keras.Model([input_1, input_2], [output_1, output_2]) + _, new_model, _ = self.roundtrip( + model, custom_objects={"WrapperLayer": WrapperLayer} + ) + + self.assertIs(model.layers[2], model.layers[3].layer) + self.assertIs(new_model.layers[2], new_model.layers[3].layer) + + def test_functional_subclass(self): + class PlainFunctionalSubclass(keras.Model): + pass + + inputs = keras.Input((2,)) + outputs = keras.layers.Dense(1)(inputs) + model = PlainFunctionalSubclass(inputs, outputs) + x = tf.random.normal((2, 2)) + y1 = model(x) + _, new_model, _ = self.roundtrip( + model, + custom_objects={"PlainFunctionalSubclass": PlainFunctionalSubclass}, + ) + new_model.set_weights(model.get_weights()) + y2 = new_model(x) + self.assertAllClose(y1, y2, atol=1e-5) + self.assertIsInstance(new_model, PlainFunctionalSubclass) + + class FunctionalSubclassWCustomInit(keras.Model): + def __init__(self, num_units=1, **kwargs): + inputs = keras.Input((2,)) + outputs = keras.layers.Dense(num_units)(inputs) + super().__init__(inputs, outputs)
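+
+        # The functional graph is built inside __init__, so the revived
+        # instance should match the original numerically and keep the
+        # subclass type, as asserted below.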
+ + model = FunctionalSubclassWCustomInit(num_units=2) + x = tf.random.normal((2, 2)) + y1 = model(x) + _, new_model, _ = self.roundtrip( + model, + custom_objects={ + "FunctionalSubclassWCustomInit": FunctionalSubclassWCustomInit + }, + ) + new_model.set_weights(model.get_weights()) + y2 = new_model(x) + self.assertAllClose(y1, y2, atol=1e-5) + self.assertIsInstance(new_model, FunctionalSubclassWCustomInit) + + def test_shared_object(self): + class MyLayer(keras.layers.Layer): + def __init__(self, activation, **kwargs): + super().__init__(**kwargs) + if isinstance(activation, dict): + self.activation = ( + serialization_lib.deserialize_keras_object(activation) + ) + else: + self.activation = activation + + def call(self, x): + return self.activation(x) + + def get_config(self): + config = super().get_config() + config["activation"] = self.activation + return config + + class SharedActivation: + def __call__(self, x): + return x**2 + + def get_config(self): + return {} + + @classmethod + def from_config(cls, config): + return cls() + + shared_act = SharedActivation() + layer_1 = MyLayer(activation=shared_act) + layer_2 = MyLayer(activation=shared_act) + layers = [layer_1, layer_2] + + with serialization_lib.ObjectSharingScope(): + serialized, new_layers, reserialized = self.roundtrip( + layers, + custom_objects={ + "MyLayer": MyLayer, + "SharedActivation": SharedActivation, + }, + ) + self.assertIn("shared_object_id", serialized[0]["config"]["activation"]) + obj_id = serialized[0]["config"]["activation"] + self.assertIn("shared_object_id", serialized[1]["config"]["activation"]) + self.assertEqual(obj_id, serialized[1]["config"]["activation"]) + self.assertIs(layers[0].activation, layers[1].activation) + self.assertIs(new_layers[0].activation, new_layers[1].activation) + + def test_legacy_internal_object(self): + from keras.layers.rnn.legacy_cells import ( + LSTMCell, # pylint: disable=C6204 + ) + + # tf.nn.rnn_cell.LSTMCell belongs to keras.__internal__.legacy namespace + cell = LSTMCell(32) + x = keras.Input((None, 5)) + layer = keras.layers.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer="rmsprop", loss="mse") + + x_in = np.random.random((3, 5, 5)) + y_out_1 = model.predict(x_in) + weights = model.get_weights() + + # serialize and deserialize + config = serialization_lib.serialize_keras_object(layer) + layer = serialization_lib.deserialize_keras_object( + config, + custom_objects={"LSTMCell": LSTMCell}, + ) + + # Restore RNN cell into model with weights + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_out_2 = model.predict(x_in) + + self.assertAllClose(y_out_1, y_out_2, atol=1e-5) + + +@keras.utils.register_keras_serializable() +class MyDense(keras.layers.Layer): + def __init__( + self, + units, + *, + kernel_regularizer=None, + kernel_initializer=None, + **kwargs + ): + super().__init__(**kwargs) + self._units = units + self._kernel_regularizer = kernel_regularizer + self._kernel_initializer = kernel_initializer + + def get_config(self): + return dict( + units=self._units, + kernel_initializer=self._kernel_initializer, + kernel_regularizer=self._kernel_regularizer, + **super().get_config() + ) + + def build(self, input_shape): + unused_batch_size, input_units = input_shape + self._kernel = self.add_weight( + "kernel", + [input_units, self._units], + dtype=tf.float32, + regularizer=self._kernel_regularizer, + initializer=self._kernel_initializer, + ) + + def call(self, inputs): + return tf.matmul(inputs, 
self._kernel) + + +@keras.utils.register_keras_serializable() +class MyWrapper(keras.layers.Layer): + def __init__(self, wrapped, **kwargs): + super().__init__(**kwargs) + self._wrapped = wrapped + + def get_config(self): + return dict(wrapped=self._wrapped, **super().get_config()) + + @classmethod + def from_config(cls, config): + config["wrapped"] = keras.utils.deserialize_keras_object( + config["wrapped"] + ) + return cls(**config) + + def call(self, inputs): + return self._wrapped(inputs) + + +@test_utils.run_v2_only +class JsonSerializationTest(tf.test.TestCase, parameterized.TestCase): + def test_serialize_deserialize_custom_layer_json(self): + reg = keras.regularizers.L2(0.101) + ini = keras.initializers.Constant(1.0) + dense = MyDense(4, kernel_regularizer=reg, kernel_initializer=ini) + inputs = keras.layers.Input(shape=[3]) + outputs = dense(inputs) + model = keras.Model(inputs, outputs) + + model_json = model.to_json() + model2 = keras.models.model_from_json(model_json) + + self.assertEqual(model_json, model2.to_json()) + + def test_serialize_deserialize_custom_layer_with_wrapper_json(self): + reg = keras.regularizers.L2(0.101) + ini = keras.initializers.Constant(1.0) + dense = MyDense(4, kernel_regularizer=reg, kernel_initializer=ini) + wrapper = MyWrapper(dense) + inputs = keras.layers.Input(shape=[3]) + outputs = wrapper(inputs) + model = keras.Model(inputs, outputs) + + model_json = model.to_json() + model2 = keras.models.model_from_json(model_json) + + self.assertEqual(model_json, model2.to_json()) + + +@test_utils.run_v2_only +class BackwardsCompatibilityTest(tf.test.TestCase, parameterized.TestCase): + def assert_old_format_can_be_deserialized(self, obj, custom_objects=None): + old_config = legacy_serialization.serialize_keras_object(obj) + revived = serialization_lib.deserialize_keras_object( + old_config, custom_objects=custom_objects + ) + new_config_1 = serialization_lib.serialize_keras_object(obj) + new_config_2 = serialization_lib.serialize_keras_object(revived) + self.assertEqual(new_config_1, new_config_2) + + def test_backwards_compatibility_with_old_serialized_format(self): + optimizer = keras.optimizers.Adam(learning_rate=0.1) + self.assert_old_format_can_be_deserialized( + optimizer, custom_objects=vars(keras.optimizers) + ) + activation = keras.activations.relu + self.assert_old_format_can_be_deserialized( + activation, custom_objects=vars(keras.activations) + ) + initializer = keras.initializers.VarianceScaling(scale=2.0) + self.assert_old_format_can_be_deserialized( + initializer, custom_objects=vars(keras.initializers) + ) + regularizer = keras.regularizers.L2(0.3) + self.assert_old_format_can_be_deserialized( + regularizer, custom_objects=vars(keras.regularizers) + ) + constraint = keras.constraints.UnitNorm() + self.assert_old_format_can_be_deserialized( + constraint, custom_objects=vars(keras.constraints) + ) + layer = keras.layers.Dense(2) + self.assert_old_format_can_be_deserialized( + layer, custom_objects=vars(keras.layers) + ) + layer = keras.layers.MultiHeadAttention(2, 4) + self.assert_old_format_can_be_deserialized( + layer, custom_objects=vars(keras.layers) + ) + + # Custom objects + layer = CustomLayer(2) + self.assert_old_format_can_be_deserialized( + layer, custom_objects={"CustomLayer": CustomLayer} + ) + layer = keras.layers.Dense(1, activation=custom_fn) + self.assert_old_format_can_be_deserialized( + layer, custom_objects={**vars(keras.layers), "custom_fn": custom_fn} + ) + + +if __name__ == "__main__": + tf.test.main() diff --git 
a/keras/saving/utils_v1/BUILD b/keras/saving/utils_v1/BUILD deleted file mode 100644 index 3af65e18274d..000000000000 --- a/keras/saving/utils_v1/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Description: -# Keras saving and loading libraries. - -# buildifier: disable=same-origin-load - -package( - default_visibility = [ - "//keras:friends", - ], - licenses = ["notice"], -) - -py_library( - name = "utils_v1", - srcs = [ - "__init__.py", - "export_output.py", - "export_utils.py", - "mode_keys.py", - "signature_def_utils.py", - "unexported_constants.py", - ], - srcs_version = "PY3", - deps = [ - "//:expect_tensorflow_installed", - ], -) diff --git a/keras/saving/utils_v1/__init__.py b/keras/saving/utils_v1/__init__.py deleted file mode 100644 index 12a1cafa1c0d..000000000000 --- a/keras/saving/utils_v1/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# LINT.IfChange -"""Utils for saving a Keras Model or Estimator to the SavedModel format.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# pylint: disable=wildcard-import -from keras.saving.utils_v1.export_output import * -from keras.saving.utils_v1.export_utils import build_all_signature_defs -from keras.saving.utils_v1.export_utils import export_outputs_for_mode -from keras.saving.utils_v1.export_utils import EXPORT_TAG_MAP -from keras.saving.utils_v1.export_utils import get_export_outputs -from keras.saving.utils_v1.export_utils import get_temp_export_dir -from keras.saving.utils_v1.export_utils import get_timestamped_export_dir -from keras.saving.utils_v1.export_utils import SIGNATURE_KEY_MAP -# pylint: enable=wildcard-import -# LINT.ThenChange(//tensorflow/python/saved_model/model_utils/__init__.py) diff --git a/keras/saving/utils_v1/export_output.py b/keras/saving/utils_v1/export_output.py deleted file mode 100644 index efcf20ef11e6..000000000000 --- a/keras/saving/utils_v1/export_output.py +++ /dev/null @@ -1,421 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
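The `MyWrapper` fixture above illustrates the pattern this test suite keeps exercising: a layer that owns another Keras object must revive it explicitly in `from_config`, because the nested object arrives as a serialized dict. A minimal self-contained sketch of that pattern (the `ScaledLayer` name and the scaling behavior are illustrative assumptions, not code from this diff):

import keras

@keras.utils.register_keras_serializable(package="example")
class ScaledLayer(keras.layers.Layer):
    """Hypothetical wrapper that rescales the output of an inner layer."""

    def __init__(self, inner, factor=2.0, **kwargs):
        super().__init__(**kwargs)
        self.inner = inner
        self.factor = factor

    def get_config(self):
        # Nested Keras objects placed in the config are serialized to dicts.
        return dict(
            inner=self.inner, factor=self.factor, **super().get_config()
        )

    @classmethod
    def from_config(cls, config):
        # The nested dict must be revived before calling __init__.
        config["inner"] = keras.utils.deserialize_keras_object(config["inner"])
        return cls(**config)

    def call(self, inputs):
        return self.factor * self.inner(inputs)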
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# LINT.IfChange -"""Classes for different types of export output.""" - -import tensorflow.compat.v2 as tf - -import abc -from keras.saving.utils_v1 import signature_def_utils as unexported_signature_utils - - -class ExportOutput: - """Represents an output of a model that can be served. - - These typically correspond to model heads. - """ - - __metaclass__ = abc.ABCMeta - - _SEPARATOR_CHAR = '/' - - @abc.abstractmethod - def as_signature_def(self, receiver_tensors): - """Generate a SignatureDef proto for inclusion in a MetaGraphDef. - - The SignatureDef will specify outputs as described in this ExportOutput, - and will use the provided receiver_tensors as inputs. - - Args: - receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying - input nodes that will be fed. - """ - pass - - def _check_output_key(self, key, error_label): - # For multi-head models, the key can be a tuple. - if isinstance(key, tuple): - key = self._SEPARATOR_CHAR.join(key) - - if not isinstance(key, str): - raise ValueError( - '{} output key must be a string; got {}.'.format(error_label, key)) - return key - - def _wrap_and_check_outputs( - self, outputs, single_output_default_name, error_label=None): - """Wraps raw tensors as dicts and checks type. - - Note that we create a new dict here so that we can overwrite the keys - if necessary. - - Args: - outputs: A `Tensor` or a dict of string to `Tensor`. - single_output_default_name: A string key for use in the output dict - if the provided `outputs` is a raw tensor. - error_label: descriptive string for use in error messages. If none, - single_output_default_name will be used. - - Returns: - A dict of tensors - - Raises: - ValueError: if the outputs dict keys are not strings or tuples of strings - or the values are not Tensors. - """ - if not isinstance(outputs, dict): - outputs = {single_output_default_name: outputs} - - output_dict = {} - for key, value in outputs.items(): - error_name = error_label or single_output_default_name - key = self._check_output_key(key, error_name) - if not isinstance(value, tf.Tensor): - raise ValueError( - '{} output value must be a Tensor; got {}.'.format( - error_name, value)) - - output_dict[key] = value - return output_dict - - -class ClassificationOutput(ExportOutput): - """Represents the output of a classification head. - - Either classes or scores or both must be set. - - The classes `Tensor` must provide string labels, not integer class IDs. - - If only classes is set, it is interpreted as providing top-k results in - descending order. - - If only scores is set, it is interpreted as providing a score for every class - in order of class ID. - - If both classes and scores are set, they are interpreted as zipped, so each - score corresponds to the class at the same index. Clients should not depend - on the order of the entries. 
- """ - - def __init__(self, scores=None, classes=None): - """Constructor for `ClassificationOutput`. - - Args: - scores: A float `Tensor` giving scores (sometimes but not always - interpretable as probabilities) for each class. May be `None`, but - only if `classes` is set. Interpretation varies-- see class doc. - classes: A string `Tensor` giving predicted class labels. May be `None`, - but only if `scores` is set. Interpretation varies-- see class doc. - - Raises: - ValueError: if neither classes nor scores is set, or one of them is not a - `Tensor` with the correct dtype. - """ - if (scores is not None - and not (isinstance(scores, tf.Tensor) - and scores.dtype.is_floating)): - raise ValueError('Classification scores must be a float32 Tensor; ' - 'got {}'.format(scores)) - if (classes is not None - and not (isinstance(classes, tf.Tensor) - and tf.as_dtype(classes.dtype) == tf.string)): - raise ValueError('Classification classes must be a string Tensor; ' - 'got {}'.format(classes)) - if scores is None and classes is None: - raise ValueError('Cannot create a ClassificationOutput with empty ' - 'arguments. At least one of `scores` and `classes` ' - 'must be defined.') - self._scores = scores - self._classes = classes - - @property - def scores(self): - return self._scores - - @property - def classes(self): - return self._classes - - def as_signature_def(self, receiver_tensors): - if len(receiver_tensors) != 1: - raise ValueError( - 'Classification signatures can only accept a single tensor input of ' - 'type tf.string. Please check to make sure that you have structured ' - 'the serving_input_receiver_fn so that it creates a single string ' - 'placeholder. If your model function expects multiple inputs, then ' - 'use `tf.io.parse_example()` to parse the string into multiple ' - f'tensors.\n Received: {receiver_tensors}') - (_, examples), = receiver_tensors.items() - if tf.as_dtype(examples.dtype) != tf.string: - raise ValueError( - 'Classification signatures can only accept a single tensor input of ' - 'type tf.string. Please check to make sure that you have structured ' - 'the serving_input_receiver_fn so that it creates a single string ' - 'placeholder. If your model function expects multiple inputs, then ' - 'use `tf.io.parse_example()` to parse the string into multiple ' - f'tensors.\n Received: {receiver_tensors}') - return tf.compat.v1.saved_model.classification_signature_def( - examples, self.classes, self.scores) - - -class RegressionOutput(ExportOutput): - """Represents the output of a regression head.""" - - def __init__(self, value): - """Constructor for `RegressionOutput`. - - Args: - value: a float `Tensor` giving the predicted values. Required. - - Raises: - ValueError: if the value is not a `Tensor` with dtype tf.float32. - """ - if not (isinstance(value, tf.Tensor) and value.dtype.is_floating): - raise ValueError('Regression output value must be a float32 Tensor; ' - 'got {}'.format(value)) - self._value = value - - @property - def value(self): - return self._value - - def as_signature_def(self, receiver_tensors): - if len(receiver_tensors) != 1: - raise ValueError( - 'Regression signatures can only accept a single tensor input of ' - 'type tf.string. Please check to make sure that you have structured ' - 'the serving_input_receiver_fn so that it creates a single string ' - 'placeholder. 
If your model function expects multiple inputs, then ' - 'use `tf.io.parse_example()` to parse the string into multiple ' - f'tensors.\n Received: {receiver_tensors}') - (_, examples), = receiver_tensors.items() - if tf.as_dtype(examples.dtype) != tf.string: - raise ValueError( - 'Regression signatures can only accept a single tensor input of ' - 'type tf.string. Please check to make sure that you have structured ' - 'the serving_input_receiver_fn so that it creates a single string ' - 'placeholder. If your model function expects multiple inputs, then ' - 'use `tf.io.parse_example()` to parse the string into multiple ' - f'tensors.\n Received: {receiver_tensors}') - return tf.compat.v1.saved_model.regression_signature_def(examples, self.value) - - -class PredictOutput(ExportOutput): - """Represents the output of a generic prediction head. - - A generic prediction need not be either a classification or a regression. - - Named outputs must be provided as a dict from string to `Tensor`, - """ - _SINGLE_OUTPUT_DEFAULT_NAME = 'output' - - def __init__(self, outputs): - """Constructor for PredictOutput. - - Args: - outputs: A `Tensor` or a dict of string to `Tensor` representing the - predictions. - - Raises: - ValueError: if the outputs is not dict, or any of its keys are not - strings, or any of its values are not `Tensor`s. - """ - - self._outputs = self._wrap_and_check_outputs( - outputs, self._SINGLE_OUTPUT_DEFAULT_NAME, error_label='Prediction') - - @property - def outputs(self): - return self._outputs - - def as_signature_def(self, receiver_tensors): - return tf.compat.v1.saved_model.predict_signature_def(receiver_tensors, - self.outputs) - - -class _SupervisedOutput(ExportOutput): - """Represents the output of a supervised training or eval process.""" - __metaclass__ = abc.ABCMeta - - LOSS_NAME = 'loss' - PREDICTIONS_NAME = 'predictions' - METRICS_NAME = 'metrics' - - METRIC_VALUE_SUFFIX = 'value' - METRIC_UPDATE_SUFFIX = 'update_op' - - _loss = None - _predictions = None - _metrics = None - - def __init__(self, loss=None, predictions=None, metrics=None): - """Constructor for SupervisedOutput (ie, Train or Eval output). - - Args: - loss: dict of Tensors or single Tensor representing calculated loss. - predictions: dict of Tensors or single Tensor representing model - predictions. - metrics: Dict of metric results keyed by name. - The values of the dict can be one of the following: - (1) instance of `Metric` class. - (2) (metric_value, update_op) tuples, or a single tuple. - metric_value must be a Tensor, and update_op must be a Tensor or Op. - - Raises: - ValueError: if any of the outputs' dict keys are not strings or tuples of - strings or the values are not Tensors (or Operations in the case of - update_op). - """ - - if loss is not None: - loss_dict = self._wrap_and_check_outputs(loss, self.LOSS_NAME) - self._loss = self._prefix_output_keys(loss_dict, self.LOSS_NAME) - if predictions is not None: - pred_dict = self._wrap_and_check_outputs( - predictions, self.PREDICTIONS_NAME) - self._predictions = self._prefix_output_keys( - pred_dict, self.PREDICTIONS_NAME) - if metrics is not None: - self._metrics = self._wrap_and_check_metrics(metrics) - - def _prefix_output_keys(self, output_dict, output_name): - """Prepend output_name to the output_dict keys if it doesn't exist. - - This produces predictable prefixes for the pre-determined outputs - of SupervisedOutput. - - Args: - output_dict: dict of string to Tensor, assumed valid. - output_name: prefix string to prepend to existing keys. 
- - Returns: - dict with updated keys and existing values. - """ - - new_outputs = {} - for key, val in output_dict.items(): - key = self._prefix_key(key, output_name) - new_outputs[key] = val - return new_outputs - - def _prefix_key(self, key, output_name): - if key.find(output_name) != 0: - key = output_name + self._SEPARATOR_CHAR + key - return key - - def _wrap_and_check_metrics(self, metrics): - """Handle the saving of metrics. - - Metrics is either a tuple of (value, update_op), or a dict of such tuples. - Here, we separate out the tuples and create a dict with names to tensors. - - Args: - metrics: Dict of metric results keyed by name. - The values of the dict can be one of the following: - (1) instance of `Metric` class. - (2) (metric_value, update_op) tuples, or a single tuple. - metric_value must be a Tensor, and update_op must be a Tensor or Op. - - Returns: - dict of output_names to tensors - - Raises: - ValueError: if the dict key is not a string, or the metric values or ops - are not tensors. - """ - if not isinstance(metrics, dict): - metrics = {self.METRICS_NAME: metrics} - - outputs = {} - for key, value in metrics.items(): - if isinstance(value, tuple): - metric_val, metric_op = value - else: # value is a keras.Metrics object - metric_val = value.result() - assert len(value.updates) == 1 # We expect only one update op. - metric_op = value.updates[0] - key = self._check_output_key(key, self.METRICS_NAME) - key = self._prefix_key(key, self.METRICS_NAME) - - val_name = key + self._SEPARATOR_CHAR + self.METRIC_VALUE_SUFFIX - op_name = key + self._SEPARATOR_CHAR + self.METRIC_UPDATE_SUFFIX - if not isinstance(metric_val, tf.Tensor): - raise ValueError( - '{} output value must be a Tensor; got {}.'.format( - key, metric_val)) - if not (tf.is_tensor(metric_op) or - isinstance(metric_op, tf.Operation)): - raise ValueError( - '{} update_op must be a Tensor or Operation; got {}.'.format( - key, metric_op)) - - # We must wrap any ops (or variables) in a Tensor before export, as the - # SignatureDef proto expects tensors only. See b/109740581 - metric_op_tensor = metric_op - if not isinstance(metric_op, tf.Tensor): - with tf.control_dependencies([metric_op]): - metric_op_tensor = tf.constant([], name='metric_op_wrapper') - - outputs[val_name] = metric_val - outputs[op_name] = metric_op_tensor - - return outputs - - @property - def loss(self): - return self._loss - - @property - def predictions(self): - return self._predictions - - @property - def metrics(self): - return self._metrics - - @abc.abstractmethod - def _get_signature_def_fn(self): - """Returns a function that produces a SignatureDef given desired outputs.""" - pass - - def as_signature_def(self, receiver_tensors): - signature_def_fn = self._get_signature_def_fn() - return signature_def_fn( - receiver_tensors, self.loss, self.predictions, self.metrics) - - -class TrainOutput(_SupervisedOutput): - """Represents the output of a supervised training process. - - This class generates the appropriate signature def for exporting - training output by type-checking and wrapping loss, predictions, and metrics - values. - """ - - def _get_signature_def_fn(self): - return unexported_signature_utils.supervised_train_signature_def - - -class EvalOutput(_SupervisedOutput): - """Represents the output of a supervised eval process. - - This class generates the appropriate signature def for exporting - eval output by type-checking and wrapping loss, predictions, and metrics - values. 
- """ - - def _get_signature_def_fn(self): - return unexported_signature_utils.supervised_eval_signature_def -# LINT.ThenChange(//tensorflow/python/saved_model/model_utils/export_output.py) diff --git a/keras/saving/utils_v1/export_utils.py b/keras/saving/utils_v1/export_utils.py deleted file mode 100644 index ceb1cf91df93..000000000000 --- a/keras/saving/utils_v1/export_utils.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# LINT.IfChange -"""Utilities for creating SavedModels.""" - -import collections -import os -import time - -from keras.saving.utils_v1 import export_output as export_output_lib -from keras.saving.utils_v1 import mode_keys -from keras.saving.utils_v1 import unexported_constants -from keras.saving.utils_v1.mode_keys import KerasModeKeys as ModeKeys -import tensorflow.compat.v2 as tf - -from tensorflow.python.platform import tf_logging as logging - - -# Mapping of the modes to appropriate MetaGraph tags in the SavedModel. -EXPORT_TAG_MAP = mode_keys.ModeKeyMap(**{ - ModeKeys.PREDICT: [tf.saved_model.SERVING], - ModeKeys.TRAIN: [tf.saved_model.TRAINING], - ModeKeys.TEST: [unexported_constants.EVAL]}) - -# For every exported mode, a SignatureDef map should be created using the -# functions `export_outputs_for_mode` and `build_all_signature_defs`. By -# default, this map will contain a single Signature that defines the input -# tensors and output predictions, losses, and/or metrics (depending on the mode) -# The default keys used in the SignatureDef map are defined below. -SIGNATURE_KEY_MAP = mode_keys.ModeKeyMap(**{ - ModeKeys.PREDICT: tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY, - ModeKeys.TRAIN: unexported_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY, - ModeKeys.TEST: unexported_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY}) - -# Default names used in the SignatureDef input map, which maps strings to -# TensorInfo protos. -SINGLE_FEATURE_DEFAULT_NAME = 'feature' -SINGLE_RECEIVER_DEFAULT_NAME = 'input' -SINGLE_LABEL_DEFAULT_NAME = 'label' - -### Below utilities are specific to SavedModel exports. - - -def build_all_signature_defs(receiver_tensors, - export_outputs, - receiver_tensors_alternatives=None, - serving_only=True): - """Build `SignatureDef`s for all export outputs. - - Args: - receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying - input nodes where this receiver expects to be fed by default. Typically, - this is a single placeholder expecting serialized `tf.Example` protos. - export_outputs: a dict of ExportOutput instances, each of which has - an as_signature_def instance method that will be called to retrieve - the signature_def for all export output tensors. - receiver_tensors_alternatives: a dict of string to additional - groups of receiver tensors, each of which may be a `Tensor` or a dict of - string to `Tensor`. 
These named receiver tensor alternatives generate - additional serving signatures, which may be used to feed inputs at - different points within the input receiver subgraph. A typical usage is - to allow feeding raw feature `Tensor`s *downstream* of the - tf.io.parse_example() op. Defaults to None. - serving_only: boolean; if true, resulting signature defs will only include - valid serving signatures. If false, all requested signatures will be - returned. - - Returns: - signature_def representing all passed args. - - Raises: - ValueError: if export_outputs is not a dict - """ - if not isinstance(receiver_tensors, dict): - receiver_tensors = {SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors} - if export_outputs is None or not isinstance(export_outputs, dict): - raise ValueError('`export_outputs` must be a dict. Received ' - f'{export_outputs} with type ' - f'{type(export_outputs).__name__}.') - - signature_def_map = {} - excluded_signatures = {} - for output_key, export_output in export_outputs.items(): - signature_name = '{}'.format(output_key or 'None') - try: - signature = export_output.as_signature_def(receiver_tensors) - signature_def_map[signature_name] = signature - except ValueError as e: - excluded_signatures[signature_name] = str(e) - - if receiver_tensors_alternatives: - for receiver_name, receiver_tensors_alt in ( - receiver_tensors_alternatives.items()): - if not isinstance(receiver_tensors_alt, dict): - receiver_tensors_alt = { - SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt - } - for output_key, export_output in export_outputs.items(): - signature_name = '{}:{}'.format(receiver_name or 'None', output_key or - 'None') - try: - signature = export_output.as_signature_def(receiver_tensors_alt) - signature_def_map[signature_name] = signature - except ValueError as e: - excluded_signatures[signature_name] = str(e) - - _log_signature_report(signature_def_map, excluded_signatures) - - # The above calls to export_output_lib.as_signature_def should return only - # valid signatures; if there is a validity problem, they raise a ValueError, - # in which case we exclude that signature from signature_def_map above. - # The is_valid_signature check ensures that the signatures produced are - # valid for serving, and acts as an additional sanity check for export - # signatures produced for serving. We skip this check for training and eval - # signatures, which are not intended for serving. - if serving_only: - signature_def_map = { - k: v - for k, v in signature_def_map.items() - if tf.compat.v1.saved_model.is_valid_signature(v) - } - return signature_def_map - - -_FRIENDLY_METHOD_NAMES = { - tf.saved_model.CLASSIFY_METHOD_NAME: 'Classify', - tf.saved_model.REGRESS_METHOD_NAME: 'Regress', - tf.saved_model.PREDICT_METHOD_NAME: 'Predict', - unexported_constants.SUPERVISED_TRAIN_METHOD_NAME: 'Train', - unexported_constants.SUPERVISED_EVAL_METHOD_NAME: 'Eval', -} - - -def _log_signature_report(signature_def_map, excluded_signatures): - """Log a report of which signatures were produced.""" - sig_names_by_method_name = collections.defaultdict(list) - - # We'll collect whatever method_names are present, but also we want to make - # sure to output a line for each of the three standard methods even if they - # have no signatures. 
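For orientation, this is roughly how the `build_all_signature_defs` helper above was driven by the v1 SavedModel export path. It is a sketch using the removed module's own names; `scores` and `serialized_tf_example` are assumed pre-built graph-mode tensors, not code from this diff:

# Build one SignatureDef per export output (sketch; graph mode assumed).
receiver_tensors = {"examples": serialized_tf_example}
export_outputs = {
    "classification": ClassificationOutput(scores=scores),
    "serving_default": PredictOutput({"scores": scores}),
}
signature_def_map = build_all_signature_defs(receiver_tensors, export_outputs)
# With serving_only=True (the default), signatures that fail
# tf.compat.v1.saved_model.is_valid_signature are filtered out of the map.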
- for method_name in _FRIENDLY_METHOD_NAMES: - sig_names_by_method_name[method_name] = [] - - for signature_name, sig in signature_def_map.items(): - sig_names_by_method_name[sig.method_name].append(signature_name) - - # TODO(b/67733540): consider printing the full signatures, not just names - for method_name, sig_names in sig_names_by_method_name.items(): - if method_name in _FRIENDLY_METHOD_NAMES: - method_name = _FRIENDLY_METHOD_NAMES[method_name] - logging.info('Signatures INCLUDED in export for {}: {}'.format( - method_name, sig_names if sig_names else 'None')) - - if excluded_signatures: - logging.info('Signatures EXCLUDED from export because they cannot be ' - 'be served via TensorFlow Serving APIs:') - for signature_name, message in excluded_signatures.items(): - logging.info('\'{}\' : {}'.format(signature_name, message)) - - if not signature_def_map: - logging.warning('Export includes no signatures!') - elif (tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY not in - signature_def_map): - logging.warning('Export includes no default signature!') - - -# When we create a timestamped directory, there is a small chance that the -# directory already exists because another process is also creating these -# directories. In this case we just wait one second to get a new timestamp and -# try again. If this fails several times in a row, then something is seriously -# wrong. -MAX_DIRECTORY_CREATION_ATTEMPTS = 10 - - -def get_timestamped_export_dir(export_dir_base): - """Builds a path to a new subdirectory within the base directory. - - Each export is written into a new subdirectory named using the - current time. This guarantees monotonically increasing version - numbers even across multiple runs of the pipeline. - The timestamp used is the number of seconds since epoch UTC. - - Args: - export_dir_base: A string containing a directory to write the exported - graph and checkpoints. - Returns: - The full path of the new subdirectory (which is not actually created yet). - - Raises: - RuntimeError: if repeated attempts fail to obtain a unique timestamped - directory name. - """ - attempts = 0 - while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS: - timestamp = int(time.time()) - - result_dir = tf.io.gfile.join( - tf.compat.as_bytes(export_dir_base), tf.compat.as_bytes(str(timestamp))) - if not tf.compat.v1.gfile.Exists(result_dir): - # Collisions are still possible (though extremely unlikely): this - # directory is not actually created yet, but it will be almost - # instantly on return from this function. - return result_dir - time.sleep(1) - attempts += 1 - logging.warning( - 'Directory {} already exists; retrying (attempt {}/{})'.format( - tf.compat.as_str(result_dir), attempts, - MAX_DIRECTORY_CREATION_ATTEMPTS)) - raise RuntimeError('Failed to obtain a unique export directory name after ' - f'{MAX_DIRECTORY_CREATION_ATTEMPTS} attempts.') - - -def get_temp_export_dir(timestamped_export_dir): - """Builds a directory name based on the argument but starting with 'temp-'. - - This relies on the fact that TensorFlow Serving ignores subdirectories of - the base directory that can't be parsed as integers. - - Args: - timestamped_export_dir: the name of the eventual export directory, e.g. - /foo/bar/ - - Returns: - A sister directory prefixed with 'temp-', e.g. /foo/bar/temp-. 
- """ - (dirname, basename) = os.path.split(timestamped_export_dir) - if isinstance(basename, bytes): - str_name = basename.decode('utf-8') - else: - str_name = str(basename) - temp_export_dir = tf.io.gfile.join( - tf.compat.as_bytes(dirname), - tf.compat.as_bytes('temp-{}'.format(str_name))) - return temp_export_dir - - -def export_outputs_for_mode( - mode, serving_export_outputs=None, predictions=None, loss=None, - metrics=None): - """Util function for constructing a `ExportOutput` dict given a mode. - - The returned dict can be directly passed to `build_all_signature_defs` helper - function as the `export_outputs` argument, used for generating a SignatureDef - map. - - Args: - mode: A `ModeKeys` specifying the mode. - serving_export_outputs: Describes the output signatures to be exported to - `SavedModel` and used during serving. Should be a dict or None. - predictions: A dict of Tensors or single Tensor representing model - predictions. This argument is only used if serving_export_outputs is not - set. - loss: A dict of Tensors or single Tensor representing calculated loss. - metrics: A dict of (metric_value, update_op) tuples, or a single tuple. - metric_value must be a Tensor, and update_op must be a Tensor or Op - - Returns: - Dictionary mapping the a key to an `tf.estimator.export.ExportOutput` object - The key is the expected SignatureDef key for the mode. - - Raises: - ValueError: if an appropriate ExportOutput cannot be found for the mode. - """ - if mode not in SIGNATURE_KEY_MAP: - raise ValueError( - f'Export output type not found for `mode`: {mode}. Expected one of: ' - f'{list(SIGNATURE_KEY_MAP.keys())}.\n' - 'One likely error is that V1 Estimator Modekeys were somehow passed to ' - 'this function. Please ensure that you are using the new ModeKeys.') - signature_key = SIGNATURE_KEY_MAP[mode] - if mode_keys.is_predict(mode): - return get_export_outputs(serving_export_outputs, predictions) - elif mode_keys.is_train(mode): - return {signature_key: export_output_lib.TrainOutput( - loss=loss, predictions=predictions, metrics=metrics)} - else: - return {signature_key: export_output_lib.EvalOutput( - loss=loss, predictions=predictions, metrics=metrics)} - - -def get_export_outputs(export_outputs, predictions): - """Validate export_outputs or create default export_outputs. - - Args: - export_outputs: Describes the output signatures to be exported to - `SavedModel` and used during serving. Should be a dict or None. - predictions: Predictions `Tensor` or dict of `Tensor`. - - Returns: - Valid export_outputs dict - - Raises: - TypeError: if export_outputs is not a dict or its values are not - ExportOutput instances. - """ - if export_outputs is None: - default_output = export_output_lib.PredictOutput(predictions) - export_outputs = { - tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY: default_output} - - if not isinstance(export_outputs, dict): - raise TypeError( - f'`export_outputs` must be dict, received: {export_outputs}.') - for v in export_outputs.values(): - if not isinstance(v, export_output_lib.ExportOutput): - raise TypeError( - 'Values in `export_outputs` must be ExportOutput objects, ' - f'received: {export_outputs}.') - - _maybe_add_default_serving_output(export_outputs) - - return export_outputs - - -def _maybe_add_default_serving_output(export_outputs): - """Add a default serving output to the export_outputs if not present. - - Args: - export_outputs: Describes the output signatures to be exported to - `SavedModel` and used during serving. Should be a dict. 
- - Returns: - export_outputs dict with default serving signature added if necessary - - Raises: - ValueError: if multiple export_outputs were provided without a default - serving key. - """ - if len(export_outputs) == 1: - (key, value), = export_outputs.items() - if key != tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY: - export_outputs[ - tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = value - if len(export_outputs) > 1: - if (tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY - not in export_outputs): - raise ValueError( - 'Multiple `export_outputs` were provided, but none of them are ' - 'specified as the default. Use' - '`tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY` to ' - 'specify a default.') - - return export_outputs -# LINT.ThenChange(//tensorflow/python/saved_model/model_utils/export_utils.py) diff --git a/keras/saving/utils_v1/mode_keys.py b/keras/saving/utils_v1/mode_keys.py deleted file mode 100644 index d777cc562962..000000000000 --- a/keras/saving/utils_v1/mode_keys.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# LINT.IfChange -"""Utils for managing different mode strings used by Keras and Estimator models. -""" - -import collections - - -class KerasModeKeys: - """Standard names for model modes. - - The following standard keys are defined: - - * `TRAIN`: training/fitting mode. - * `TEST`: testing/evaluation mode. - * `PREDICT`: prediction/inference mode. - """ - - TRAIN = 'train' - TEST = 'test' - PREDICT = 'predict' - - -# TODO(kathywu): Remove copy in Estimator after nightlies -class EstimatorModeKeys: - """Standard names for Estimator model modes. - - The following standard keys are defined: - - * `TRAIN`: training/fitting mode. - * `EVAL`: testing/evaluation mode. - * `PREDICT`: predication/inference mode. - """ - - TRAIN = 'train' - EVAL = 'eval' - PREDICT = 'infer' - - -def is_predict(mode): - return mode in [KerasModeKeys.PREDICT, EstimatorModeKeys.PREDICT] - - -def is_eval(mode): - return mode in [KerasModeKeys.TEST, EstimatorModeKeys.EVAL] - - -def is_train(mode): - return mode in [KerasModeKeys.TRAIN, EstimatorModeKeys.TRAIN] - - -class ModeKeyMap(collections.abc.Mapping): - """Map using ModeKeys as keys. - - This class creates an immutable mapping from modes to values. For example, - SavedModel export of Keras and Estimator models use this to map modes to their - corresponding MetaGraph tags/SignatureDef keys. - - Since this class uses modes, rather than strings, as keys, both "predict" - (Keras's PREDICT ModeKey) and "infer" (Estimator's PREDICT ModeKey) map to the - same value. - """ - - def __init__(self, **kwargs): - self._internal_dict = {} - self._keys = [] - for key in kwargs: - self._keys.append(key) - dict_key = self._get_internal_key(key) - if dict_key in self._internal_dict: - raise ValueError( - 'Error creating ModeKeyMap. Multiple keys/values found for {} mode.' 
- .format(dict_key)) - self._internal_dict[dict_key] = kwargs[key] - - def _get_internal_key(self, key): - """Return keys used for the internal dictionary.""" - if is_train(key): - return KerasModeKeys.TRAIN - if is_eval(key): - return KerasModeKeys.TEST - if is_predict(key): - return KerasModeKeys.PREDICT - raise ValueError('Invalid mode key: {}.'.format(key)) - - def __getitem__(self, key): - return self._internal_dict[self._get_internal_key(key)] - - def __iter__(self): - return iter(self._keys) - - def __len__(self): - return len(self._keys) -# LINT.ThenChange(//tensorflow/python/saved_model/model_utils/mode_keys.py) diff --git a/keras/saving/utils_v1/signature_def_utils.py b/keras/saving/utils_v1/signature_def_utils.py deleted file mode 100644 index b91d2097b76b..000000000000 --- a/keras/saving/utils_v1/signature_def_utils.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""SignatureDef utility functions implementation.""" - -import tensorflow.compat.v2 as tf - -from keras.saving.utils_v1 import unexported_constants - - -# LINT.IfChange -def supervised_train_signature_def( - inputs, loss, predictions=None, metrics=None): - return _supervised_signature_def( - unexported_constants.SUPERVISED_TRAIN_METHOD_NAME, inputs, loss=loss, - predictions=predictions, metrics=metrics) - - -def supervised_eval_signature_def( - inputs, loss, predictions=None, metrics=None): - return _supervised_signature_def( - unexported_constants.SUPERVISED_EVAL_METHOD_NAME, inputs, loss=loss, - predictions=predictions, metrics=metrics) - - -def _supervised_signature_def( - method_name, inputs, loss=None, predictions=None, - metrics=None): - """Creates a signature for training and eval data. - - This function produces signatures that describe the inputs and outputs - of a supervised process, such as training or evaluation, that - results in loss, metrics, and the like. Note that this function only requires - inputs to be not None. - - Args: - method_name: Method name of the SignatureDef as a string. - inputs: dict of string to `Tensor`. - loss: dict of string to `Tensor` representing computed loss. - predictions: dict of string to `Tensor` representing the output predictions. - metrics: dict of string to `Tensor` representing metric ops. - - Returns: - A train- or eval-flavored signature_def. - - Raises: - ValueError: If inputs or outputs is `None`. 
- """ - if inputs is None or not inputs: - raise ValueError(f'{method_name} `inputs` cannot be None or empty.') - - signature_inputs = {key: tf.compat.v1.saved_model.build_tensor_info(tensor) - for key, tensor in inputs.items()} - - signature_outputs = {} - for output_set in (loss, predictions, metrics): - if output_set is not None: - sig_out = {key: tf.compat.v1.saved_model.build_tensor_info(tensor) - for key, tensor in output_set.items()} - signature_outputs.update(sig_out) - - signature_def = tf.compat.v1.saved_model.build_signature_def( - signature_inputs, signature_outputs, method_name) - - return signature_def -# LINT.ThenChange(//keras/saving/utils_v1/signature_def_utils.py) diff --git a/keras/saving/utils_v1/unexported_constants.py b/keras/saving/utils_v1/unexported_constants.py deleted file mode 100644 index 9936f095df88..000000000000 --- a/keras/saving/utils_v1/unexported_constants.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Signature constants for SavedModel save and restore operations. - -These are the private constants that have not been exported. -""" - -# LINT.IfChange -DEFAULT_TRAIN_SIGNATURE_DEF_KEY = "train" - -DEFAULT_EVAL_SIGNATURE_DEF_KEY = "eval" - -SUPERVISED_TRAIN_METHOD_NAME = "tensorflow/supervised/training" - -SUPERVISED_EVAL_METHOD_NAME = "tensorflow/supervised/eval" -# LINT.ThenChange(//tensorflow/python/saved_model/signature_constants.py) - -# LINT.IfChange -EVAL = "eval" -# LINT.ThenChange(//tensorflow/python/saved_model/tag_constants.py) diff --git a/keras/testing_infra/BUILD b/keras/testing_infra/BUILD index 8f5f1f29eab2..caee29ae0216 100644 --- a/keras/testing_infra/BUILD +++ b/keras/testing_infra/BUILD @@ -1,15 +1,13 @@ # Description: # Contains the Keras testing infrastructure. +# Placeholder: load unaliased py_library +# Placeholder: load unaliased py_test load("@org_keras//keras:keras.bzl", "tf_py_test") package( - default_visibility = [ - "//keras:friends", - "//third_party/py/language/common/layers:__subpackages__", - "//third_party/py/tensorflow_probability:__subpackages__", - "//third_party/tensorflow_text:__subpackages__", - ], + # copybara:uncomment default_applicable_licenses = ["//keras:license"], + default_visibility = ["//keras:friends"], licenses = ["notice"], ) @@ -40,7 +38,7 @@ py_library( "//keras/engine:base_layer_utils", "//keras/layers", "//keras/models", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/utils:tf_contextlib", "//keras/utils:tf_inspect", ], diff --git a/keras/testing_infra/keras_doctest_lib.py b/keras/testing_infra/keras_doctest_lib.py index 0aaa67d039f8..101eb2394854 100644 --- a/keras/testing_infra/keras_doctest_lib.py +++ b/keras/testing_infra/keras_doctest_lib.py @@ -22,21 +22,21 @@ class _FloatExtractor(object): - """Class for extracting floats from a string. 
+ """Class for extracting floats from a string. - For example: + For example: - >>> text_parts, floats = _FloatExtractor()("Text 1.0 Text") - >>> text_parts - ['Text ', ' Text'] - >>> floats - array([1.]) - """ + >>> text_parts, floats = _FloatExtractor()("Text 1.0 Text") + >>> text_parts + ['Text ', ' Text'] + >>> floats + array([1.]) + """ - # Note: non-capturing groups "(?" are not returned in matched groups, or by - # re.split. - _FLOAT_RE = re.compile( - r""" + # Note: non-capturing groups "(?" are not returned in matched groups, or by + # re.split. + _FLOAT_RE = re.compile( + r""" ( # Captures the float value. (?: [-+]| # Start with a sign is okay anywhere. @@ -58,154 +58,166 @@ class _FloatExtractor(object): [^\w.] # * Next char is not a word char or "." ) """.format( - # Digits, a "." and optional more digits: "1.1". - digits_dot_maybe_digits=r'(?:[0-9]+\.(?:[0-9]*))', - # A "." with trailing digits ".23" - dot_digits=r'(?:\.[0-9]+)', - # digits: "12" - digits=r'(?:[0-9]+)', - # The exponent: An "e" or "E", optional sign, and at least one digit. - # "e-123", "E+12", "e12" - exponent=r'(?:[eE][-+]?[0-9]+)'), - re.VERBOSE) - - def __call__(self, string): - """Extracts floats from a string. - - >>> text_parts, floats = _FloatExtractor()("Text 1.0 Text") - >>> text_parts - ['Text ', ' Text'] - >>> floats - array([1.]) - - Args: - string: the string to extract floats from. - - Returns: - A (string, array) pair, where `string` has each float replaced by "..." - and `array` is a `float32` `numpy.array` containing the extracted floats. - """ - texts = [] - floats = [] - for i, part in enumerate(self._FLOAT_RE.split(string)): - if i % 2 == 0: - texts.append(part) - else: - floats.append(float(part)) - - return texts, np.array(floats) + # Digits, a "." and optional more digits: "1.1". + digits_dot_maybe_digits=r"(?:[0-9]+\.(?:[0-9]*))", + # A "." with trailing digits ".23" + dot_digits=r"(?:\.[0-9]+)", + # digits: "12" + digits=r"(?:[0-9]+)", + # The exponent: An "e" or "E", optional sign, and at least one + # digit. "e-123", "E+12", "e12" + exponent=r"(?:[eE][-+]?[0-9]+)", + ), + re.VERBOSE, + ) + + def __call__(self, string): + """Extracts floats from a string. + + >>> text_parts, floats = _FloatExtractor()("Text 1.0 Text") + >>> text_parts + ['Text ', ' Text'] + >>> floats + array([1.]) + + Args: + string: the string to extract floats from. + + Returns: + A (string, array) pair, where `string` has each float replaced by + "..." and `array` is a `float32` `numpy.array` containing the + extracted floats. + """ + texts = [] + floats = [] + for i, part in enumerate(self._FLOAT_RE.split(string)): + if i % 2 == 0: + texts.append(part) + else: + floats.append(float(part)) + + return texts, np.array(floats) class KerasDoctestOutputChecker(doctest.OutputChecker, object): - """Customizes how `want` and `got` are compared, see `check_output`.""" + """Customizes how `want` and `got` are compared, see `check_output`.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.extract_floats = _FloatExtractor() - self.text_good = None - self.float_size_good = None + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.extract_floats = _FloatExtractor() + self.text_good = None + self.float_size_good = None - _ADDRESS_RE = re.compile(r'\bat 0x[0-9a-f]*?>') - # TODO(yashkatariya): Add other tensor's string substitutions too. - # tf.RaggedTensor doesn't need one. 
-  _NUMPY_OUTPUT_RE = re.compile(r'<tf.Tensor.*?numpy=(.*?)>', re.DOTALL)
+    _ADDRESS_RE = re.compile(r"\bat 0x[0-9a-f]*?>")
+    # TODO(yashkatariya): Add other tensor's string substitutions too.
+    # tf.RaggedTensor doesn't need one.
+    _NUMPY_OUTPUT_RE = re.compile(r"<tf.Tensor.*?numpy=(.*?)>", re.DOTALL)
-  def _allclose(self, want, got, rtol=1e-3, atol=1e-3):
-    return np.allclose(want, got, rtol=rtol, atol=atol)
+    def _allclose(self, want, got, rtol=1e-3, atol=1e-3):
+        return np.allclose(want, got, rtol=rtol, atol=atol)
-  def _tf_tensor_numpy_output(self, string):
-    modified_string = self._NUMPY_OUTPUT_RE.sub(r'\1', string)
-    return modified_string, modified_string != string
+    def _tf_tensor_numpy_output(self, string):
+        modified_string = self._NUMPY_OUTPUT_RE.sub(r"\1", string)
+        return modified_string, modified_string != string
-  MESSAGE = textwrap.dedent("""\n
+    MESSAGE = textwrap.dedent(
+        """\n
         #############################################################
         Check the documentation (go/testable-docstrings) on how to
         write testable docstrings.
-        #############################################################""")
+        #############################################################"""
+    )
-  def check_output(self, want, got, optionflags):
-    """Compares the docstring output to the output gotten by running the code.
+    def check_output(self, want, got, optionflags):
+        """Compares the docstring output to the output gotten by running the
+        code.
-    Python addresses in the output are replaced with wildcards.
+        Python addresses in the output are replaced with wildcards.
-    Float values in the output compared as using `np.allclose`:
+        Float values in the output are compared using `np.allclose`:
-    * Float values are extracted from the text and replaced with wildcards.
-    * The wildcard text is compared to the actual output.
-    * The float values are compared using `np.allclose`.
+        * Float values are extracted from the text and replaced with
+          wildcards.
+        * The wildcard text is compared to the actual output.
+        * The float values are compared using `np.allclose`.
-    The method returns `True` if both the text comparison and the numeric
-    comparison are successful.
+        The method returns `True` if both the text comparison and the numeric
+        comparison are successful.
-    The numeric comparison will fail if either:
+        The numeric comparison will fail if either:
-    * The wrong number of floats are found.
-    * The float values are not within tolerence.
+        * The wrong number of floats are found.
+        * The float values are not within tolerance.
-    Args:
-      want: The output in the docstring.
-      got: The output generated after running the snippet.
-      optionflags: Flags passed to the doctest.
+        Args:
+            want: The output in the docstring.
+            got: The output generated after running the snippet.
+            optionflags: Flags passed to the doctest.
-    Returns:
-      A bool, indicating if the check was successful or not.
-    """
+        Returns:
+            A bool, indicating if the check was successful or not.
+        """
+
+        # If the docstring's output is empty and there is some output generated
+        # after running the snippet, return True. This is because if the user
+        # doesn't want to display output, respect that over what the doctest
+        # wants.
+        if got and not want:
+            return True
-    # If the docstring's output is empty and there is some output generated
-    # after running the snippet, return True. This is because if the user
-    # doesn't want to display output, respect that over what the doctest wants.
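In effect, `check_output` runs a two-stage comparison: an ELLIPSIS text match with the floats wildcarded out, then an `np.allclose` check on the extracted floats. A runnable sketch of the two stages, using hypothetical values:

import doctest
import numpy as np

checker = doctest.OutputChecker()
# Stage 1: match the float-free text with the standard ELLIPSIS wildcard.
text_ok = checker.check_output(
    want="loss: ...", got="loss: 0.12001", optionflags=doctest.ELLIPSIS
)
# Stage 2: compare the extracted floats numerically.
floats_ok = np.allclose([0.12], [0.12001], rtol=1e-3, atol=1e-3)
print(text_ok and floats_ok)  # True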
-    if got and not want:
-      return True
-
-    if want is None:
-      want = ''
-
-    # Replace python's addresses with ellipsis (`...`) since it can change on
-    # each execution.
-    want = self._ADDRESS_RE.sub('at ...>', want)
-
-    # Replace tf.Tensor strings with only their numpy field values.
-    want, want_changed = self._tf_tensor_numpy_output(want)
-    if want_changed:
-      got, _ = self._tf_tensor_numpy_output(got)
-
-    # Separate out the floats, and replace `want` with the wild-card version
-    # "result=7.0" => "result=..."
-    want_text_parts, self.want_floats = self.extract_floats(want)
-    want_text_wild = '...'.join(want_text_parts)
-
-    # Find the floats in the string returned by the test
-    _, self.got_floats = self.extract_floats(got)
-
-    self.text_good = super().check_output(
-        want=want_text_wild, got=got, optionflags=optionflags)
-    if not self.text_good:
-      return False
-
-    if self.want_floats.size == 0:
-      # If there are no floats in the "want" string, ignore all the floats in
-      # the result. "np.array([ ... ])" matches "np.array([ 1.0, 2.0 ])"
-      return True
-
-    self.float_size_good = (self.want_floats.size == self.got_floats.size)
-
-    if self.float_size_good:
-      return self._allclose(self.want_floats, self.got_floats)
-    else:
-      return False
-
-  def output_difference(self, example, got, optionflags):
-    got = [got]
-
-    # If the some of the float output is hidden with `...`, `float_size_good`
-    # will be False. This is because the floats extracted from the string is
-    # converted into a 1-D numpy array. Hence hidding floats is not allowed
-    # anymore.
-    if self.text_good:
-      if not self.float_size_good:
-        got.append("\n\nCAUTION: tf_doctest doesn't work if *some* of the "
-                   "*float output* is hidden with a \"...\".")
-
-    got.append(self.MESSAGE)
-    got = '\n'.join(got)
-    return super().output_difference(example, got, optionflags)
+        if want is None:
+            want = ""
+
+        # Replace python's addresses with ellipsis (`...`) since it can change
+        # on each execution.
+        want = self._ADDRESS_RE.sub("at ...>", want)
+
+        # Replace tf.Tensor strings with only their numpy field values.
+        want, want_changed = self._tf_tensor_numpy_output(want)
+        if want_changed:
+            got, _ = self._tf_tensor_numpy_output(got)
+
+        # Separate out the floats, and replace `want` with the wild-card version
+        # "result=7.0" => "result=..."
+        want_text_parts, self.want_floats = self.extract_floats(want)
+        want_text_wild = "...".join(want_text_parts)
+
+        # Find the floats in the string returned by the test
+        _, self.got_floats = self.extract_floats(got)
+
+        self.text_good = super().check_output(
+            want=want_text_wild, got=got, optionflags=optionflags
+        )
+        if not self.text_good:
+            return False
+
+        if self.want_floats.size == 0:
+            # If there are no floats in the "want" string, ignore all the floats
+            # in the result. "np.array([ ... ])" matches "np.array([ 1.0, 2.0
+            # ])"
+            return True
+
+        self.float_size_good = self.want_floats.size == self.got_floats.size
+
+        if self.float_size_good:
+            return self._allclose(self.want_floats, self.got_floats)
+        else:
+            return False
+
+    def output_difference(self, example, got, optionflags):
+        got = [got]
+
+        # If some of the float output is hidden with `...`,
+        # `float_size_good` will be False. This is because the floats extracted
+        # from the string are converted into a 1-D numpy array. Hence hiding
+        # floats is not allowed anymore.
+        if self.text_good:
+            if not self.float_size_good:
+                got.append(
+                    "\n\nCAUTION: tf_doctest doesn't work if *some* of the "
+                    '*float output* is hidden with a "...".'
+                )
+
+        got.append(self.MESSAGE)
+        got = "\n".join(got)
+        return super().output_difference(example, got, optionflags)
diff --git a/keras/testing_infra/keras_doctest_lib_test.py b/keras/testing_infra/keras_doctest_lib_test.py
index ede34e3deebc..c31f8f05fe15 100644
--- a/keras/testing_infra/keras_doctest_lib_test.py
+++ b/keras/testing_infra/keras_doctest_lib_test.py
@@ -16,188 +16,210 @@
 import doctest
+import tensorflow.compat.v2 as tf
 from absl.testing import parameterized
+
 from keras.testing_infra import keras_doctest_lib
-import tensorflow.compat.v2 as tf
 
 class KerasDoctestOutputCheckerTest(parameterized.TestCase):
+    @parameterized.parameters(
+        # Don't match ints.
+        ["result = 1", []],
+        # Match floats.
+        ["0.0", [0.0]],
+        ["text 1.0 text", [1.0]],
+        ["text 1. text", [1.0]],
+        ["text .1 text", [0.1]],
+        ["text 1e3 text", [1000.0]],
+        ["text 1.e3 text", [1000.0]],
+        ["text +1. text", [1.0]],
+        ["text -1. text", [-1.0]],
+        ["text 1e+3 text", [1000.0]],
+        ["text 1e-3 text", [0.001]],
+        ["text +1E3 text", [1000.0]],
+        ["text -1E3 text", [-1000.0]],
+        ["text +1e-3 text", [0.001]],
+        ["text -1e+3 text", [-1000.0]],
+        # Match at the start and end of a string.
+        [".1", [0.1]],
+        [".1 text", [0.1]],
+        ["text .1", [0.1]],
+        ["0.1 text", [0.1]],
+        ["text 0.1", [0.1]],
+        ["0. text", [0.0]],
+        ["text 0.", [0.0]],
+        ["1e-1 text", [0.1]],
+        ["text 1e-1", [0.1]],
+        # Don't match floats mixed into text
+        ["text1.0 text", []],
+        ["text 1.0text", []],
+        ["text1.0text", []],
+        ["0x12e4", []],  # not 12000
+        ["TensorBoard: http://128.0.0.1:8888", []],
+        # With a newline
+        ["1.0 text\n 2.0 3.0 text", [1.0, 2.0, 3.0]],
+        # With ints and a float.
+        ["shape (1,2,3) value -1e9", [-1e9]],
+        # "." after a float.
+        ["No floats at end of sentence: 1.0.", []],
+        ["No floats with ellipsis: 1.0...", []],
+        # A numpy array
+        [
+            """array([[1., 2., 3.],
+                      [4., 5., 6.]], dtype=float32)""",
+            [1, 2, 3, 4, 5, 6],
+        ],
+        # Match both parts of a complex number
+        # python style
+        ["(0.0002+30000j)", [0.0002, 30000]],
+        ["(2.3e-10-3.34e+9j)", [2.3e-10, -3.34e9]],
+        # numpy style
+        ["array([1.27+5.j])", [1.27, 5]],
+        ["(2.3e-10+3.34e+9j)", [2.3e-10, 3.34e9]],
+        [
+            """array([1.27e-09+5.e+00j,
+                      2.30e+01-1.e-03j])""",
+            [1.27e-09, 5.0e00, 2.30e01, -1.0e-03],
+        ],
+        # Check examples in tolerance.
+        ["1e-6", [0]],
+        ["0.0", [1e-6]],
+        ["1.000001e9", [1e9]],
+        ["1e9", [1.000001e9]],
+    )
+    def test_extract_floats(self, text, expected_floats):
+        extract_floats = keras_doctest_lib._FloatExtractor()
+        output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
+
+        (text_parts, extracted_floats) = extract_floats(text)
+        text_with_wildcards = "...".join(text_parts)
+
+        # Check that the lengths match before doing anything else.
+        try:
+            self.assertLen(extracted_floats, len(expected_floats))
+        except AssertionError as e:
+            msg = "\n\n expected: {}\n found: {}".format(
+                expected_floats, extracted_floats
+            )
+            e.args = (e.args[0] + msg,)
+            raise e
+
+        # The floats should match according to allclose
+        try:
+            self.assertTrue(
+                output_checker._allclose(expected_floats, extracted_floats)
+            )
+        except AssertionError as e:
+            msg = "\n\nexpected: {}\nfound: {}".format(
+                expected_floats, extracted_floats
+            )
+            e.args = (e.args[0] + msg,)
+            raise e
+
+        # The wildcard text should match the input text, according to the
+        # OutputChecker base class.
+        try:
+            self.assertTrue(
+                doctest.OutputChecker().check_output(
+                    want=text_with_wildcards,
+                    got=text,
+                    optionflags=doctest.ELLIPSIS,
+                )
+            )
+        except AssertionError as e:
+            msg = f"\n\n expected: {text_with_wildcards}\n found: {text}"
+            e.args = (e.args[0] + msg,)
+            raise e
+
+    @parameterized.parameters(
+        # Check examples out of tolerance.
+        ["1.001e-2", [0]],
+        ["0.0", [1.001e-3]],
+    )
+    def test_fail_tolerences(self, text, expected_floats):
+        extract_floats = keras_doctest_lib._FloatExtractor()
+        output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
+
+        (_, extracted_floats) = extract_floats(text)
+
+        # These floats should not match according to allclose
+        try:
+            self.assertFalse(
+                output_checker._allclose(expected_floats, extracted_floats)
+            )
+        except AssertionError as e:
+            msg = (
+                "\n\nThese matched! They should not have.\n"
+                "\n\n Expected: {}\n found: {}".format(
+                    expected_floats, extracted_floats
+                )
+            )
+            e.args = (e.args[0] + msg,)
+            raise e
+
+    def test_no_floats(self):
+        want = "text ... text"
+        got = "text 1.0 1.2 1.9 text"
+        output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
+        self.assertTrue(
+            output_checker.check_output(
+                want=want, got=got, optionflags=doctest.ELLIPSIS
+            )
+        )
+
+    @parameterized.parameters(
+        ["1.0, ..., 1.0", "1.0, 1.0, 1.0"],
+        ["1.0, 1.0..., 1.0", "1.0, 1.002, 1.0"],
+    )
+    def test_warning_messages(self, want, got):
+        output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
-  @parameterized.parameters(
-      # Don't match ints.
-      ['result = 1', []],
-      # Match floats.
-      ['0.0', [0.]],
-      ['text 1.0 text', [1.]],
-      ['text 1. text', [1.]],
-      ['text .1 text', [.1]],
-      ['text 1e3 text', [1000.]],
-      ['text 1.e3 text', [1000.]],
-      ['text +1. text', [1.]],
-      ['text -1. text', [-1.]],
-      ['text 1e+3 text', [1000.]],
-      ['text 1e-3 text', [0.001]],
-      ['text +1E3 text', [1000.]],
-      ['text -1E3 text', [-1000.]],
-      ['text +1e-3 text', [0.001]],
-      ['text -1e+3 text', [-1000.]],
-      # Match at the start and end of a string.
-      ['.1', [.1]],
-      ['.1 text', [.1]],
-      ['text .1', [.1]],
-      ['0.1 text', [.1]],
-      ['text 0.1', [.1]],
-      ['0. text', [0.]],
-      ['text 0.', [0.]],
-      ['1e-1 text', [.1]],
-      ['text 1e-1', [.1]],
-      # Don't match floats mixed into text
-      ['text1.0 text', []],
-      ['text 1.0text', []],
-      ['text1.0text', []],
-      ['0x12e4', []],  # not 12000
-      ['TensorBoard: http://128.0.0.1:8888', []],
-      # With a newline
-      ['1.0 text\n 2.0 3.0 text', [1., 2., 3.]],
-      # With ints and a float.
-      ['shape (1,2,3) value -1e9', [-1e9]],
-      # "." after a float.
-      ['No floats at end of sentence: 1.0.', []],
-      ['No floats with ellipsis: 1.0...', []],
-      # A numpy array
-      [
-          """array([[1., 2., 3.],
-                    [4., 5., 6.]], dtype=float32)""", [1, 2, 3, 4, 5, 6]
-      ],
-      # Match both parts of a complex number
-      # python style
-      ['(0.0002+30000j)', [0.0002, 30000]],
-      ['(2.3e-10-3.34e+9j)', [2.3e-10, -3.34e+9]],
-      # numpy style
-      ['array([1.27+5.j])', [1.27, 5]],
-      ['(2.3e-10+3.34e+9j)', [2.3e-10, 3.34e+9]],
-      [
-          """array([1.27e-09+5.e+00j,
-                    2.30e+01-1.e-03j])""", [1.27e-09, 5.e+00, 2.30e+01, -1.e-03]
-      ],
-      # Check examples in tolerence.
-      ['1e-6', [0]],
-      ['0.0', [1e-6]],
-      ['1.000001e9', [1e9]],
-      ['1e9', [1.000001e9]],
-  )
-  def test_extract_floats(self, text, expected_floats):
-    extract_floats = keras_doctest_lib._FloatExtractor()
-    output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
-
-    (text_parts, extracted_floats) = extract_floats(text)
-    text_with_wildcards = '...'.join(text_parts)
-
-    # Check that the lengths match before doing anything else.
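The complex-number cases in the parameter lists above assert that both the real and imaginary parts are pulled out as separate floats. A quick runnable check of that behavior, using one of the suite's own inputs (illustrative only):

from keras.testing_infra import keras_doctest_lib

texts, floats = keras_doctest_lib._FloatExtractor()("(0.0002+30000j)")
print(list(floats))  # [0.0002, 30000.0] -- both components extracted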
-    try:
-      self.assertLen(extracted_floats, len(expected_floats))
-    except AssertionError as e:
-      msg = '\n\n expected: {}\n found: {}'.format(
-          expected_floats, extracted_floats)
-      e.args = (e.args[0] + msg,)
-      raise e
-
-    # The floats should match according to allclose
-    try:
-      self.assertTrue(
-          output_checker._allclose(expected_floats, extracted_floats))
-    except AssertionError as e:
-      msg = '\n\nexpected: {}\nfound: {}'.format(expected_floats,
-                                                 extracted_floats)
-      e.args = (e.args[0] + msg,)
-      raise e
-
-    # The wildcard text should match the input text, according to the
-    # OutputChecker base class.
-    try:
-      self.assertTrue(doctest.OutputChecker().check_output(
-          want=text_with_wildcards, got=text, optionflags=doctest.ELLIPSIS))
-    except AssertionError as e:
-      msg = '\n\n expected: {}\n found: {}'.format(
-          text_with_wildcards, text)
-      e.args = (e.args[0] + msg,)
-      raise e
-
-  @parameterized.parameters(
-      # CHeck examples out of tolerence.
-      ['1.001e-2', [0]],
-      ['0.0', [1.001e-3]],
-  )
-  def test_fail_tolerences(self, text, expected_floats):
-    extract_floats = keras_doctest_lib._FloatExtractor()
-    output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
-
-    (_, extracted_floats) = extract_floats(text)
-
-    # These floats should not match according to allclose
-    try:
-      self.assertFalse(
-          output_checker._allclose(expected_floats, extracted_floats))
-    except AssertionError as e:
-      msg = ('\n\nThese matched! They should not have.\n'
-             '\n\n Expected: {}\n found: {}'.format(
-                 expected_floats, extracted_floats))
-      e.args = (e.args[0] + msg,)
-      raise e
-
-  def test_no_floats(self):
-    want = 'text ... text'
-    got = 'text 1.0 1.2 1.9 text'
-    output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
-    self.assertTrue(
-        output_checker.check_output(
-            want=want, got=got, optionflags=doctest.ELLIPSIS))
-
-  @parameterized.parameters(['1.0, ..., 1.0', '1.0, 1.0, 1.0'],
-                            ['1.0, 1.0..., 1.0', '1.0, 1.002, 1.0'])
-  def test_warning_messages(self, want, got):
-    output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
-
-    output_checker.check_output(
-        want=want, got=got, optionflags=doctest.ELLIPSIS)
-
-    example = doctest.Example('None', want=want)
-    result = output_checker.output_difference(
-        example=example, got=got, optionflags=doctest.ELLIPSIS)
-    self.assertIn("doesn't work if *some* of the", result)
-
-  @parameterized.parameters(
-      ['<...>', ('<...>', False)],
-      ['TensorFlow', ('TensorFlow', False)],
-      [
-          'tf.Variable([[1, 2], [3, 4]])',
-          ('tf.Variable([[1, 2], [3, 4]])', False)
-      ],
-      ['<tf.Tensor: shape=(), dtype=float32, numpy=inf>', ('inf', True)],
-      [
-          '<tf.RaggedTensor [[1, 2, 3], [4, 5]]>',
-          ('<tf.RaggedTensor [[1, 2, 3], [4, 5]]>', False)
-      ],
-      [
-          """<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
- array([[2, 2],
- [3, 5]], dtype=int32)>""",
-          ('\n array([[2, 2],\n [3, 5]], ' +
-           'dtype=int32)', True)
-      ],
-      [
-          '[<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>, ' +
-          '<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 4], dtype=int32)>]',
-          ('[array([1, 2], dtype=int32), array([3, 4], dtype=int32)]', True)
-      ],
-  )
-  def test_tf_tensor_numpy_output(self, string, expected_output):
-    output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
-    output = output_checker._tf_tensor_numpy_output(string)
-    self.assertEqual(expected_output, output)
-
-
-if __name__ == '__main__':
-  tf.test.main()
+    @parameterized.parameters(
+        ["<...>", ("<...>", False)],
+        ["TensorFlow", ("TensorFlow", False)],
+        [
+            "tf.Variable([[1, 2], [3, 4]])",
+            ("tf.Variable([[1, 2], [3, 4]])", False),
+        ],
+        ["<tf.Tensor: shape=(), dtype=float32, numpy=inf>", ("inf", True)],
+        [
+            "<tf.RaggedTensor [[1, 2, 3], [4, 5]]>",
+            ("<tf.RaggedTensor [[1, 2, 3], [4, 5]]>", False),
+        ],
+        [
+            """<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
+ array([[2, 2],
+ [3, 5]], dtype=int32)>""",
+            (
+                "\n array([[2, 2],\n [3, 5]], "
+                + "dtype=int32)",
+                True,
+            ),
+        ],
+        [
+            "[<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>, "
+            + "<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 4], dtype=int32)>]",
+            ("[array([1, 2], dtype=int32), array([3, 4], dtype=int32)]", True),
+        ],
+    )
+    def test_tf_tensor_numpy_output(self, string, expected_output):
+        output_checker = keras_doctest_lib.KerasDoctestOutputChecker()
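+        # `_tf_tensor_numpy_output` rewrites `<tf.Tensor ... numpy=X>` reprs
+        # down to just `X`; the boolean in each expected output above records
+        # whether such a rewrite happened.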
+        output = output_checker._tf_tensor_numpy_output(string)
+        self.assertEqual(expected_output, output)
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/testing_infra/test_combinations.py b/keras/testing_infra/test_combinations.py
index 0e9fc2a0689f..2f29e1e3d5fa 100644
--- a/keras/testing_infra/test_combinations.py
+++ b/keras/testing_infra/test_combinations.py
@@ -13,548 +13,578 @@
 # limitations under the License.
 # ==============================================================================
 """Utilities for unit-testing Keras."""
-# pylint: disable=g-bad-import-order
-import tensorflow.compat.v2 as tf
 import collections
 import functools
 import itertools
 import unittest
 
+import tensorflow.compat.v2 as tf
 from absl.testing import parameterized
 
 import keras
 from keras.testing_infra import test_utils
 
 try:
-  import h5py  # pylint:disable=g-import-not-at-top
+    import h5py
 except ImportError:
-  h5py = None
+    h5py = None
 
-KERAS_MODEL_TYPES = ['functional', 'subclass', 'sequential']
+KERAS_MODEL_TYPES = ["functional", "subclass", "sequential"]
 
 
 class TestCase(tf.test.TestCase, parameterized.TestCase):
+    def tearDown(self):
+        keras.backend.clear_session()
+        super().tearDown()
 
-  def tearDown(self):
-    keras.backend.clear_session()
-    super().tearDown()
 
+def run_with_all_saved_model_formats(test_or_class=None, exclude_formats=None):
+    """Execute the decorated test with all Keras saved model formats.
 
-def run_with_all_saved_model_formats(
-    test_or_class=None,
-    exclude_formats=None):
-  """Execute the decorated test with all Keras saved model formats).
+    This decorator is intended to be applied either to individual test methods
+    in a `test_combinations.TestCase` class, or directly to a test class that
+    extends it. Doing so will cause the contents of the individual test method
+    (or all test methods in the class) to be executed multiple times - once for
+    each Keras saved model format.
+
+    The Keras saved model formats include:
+    1. HDF5: 'h5'
+    2. SavedModel: 'tf'
+
+    Note: if stacking this decorator with absl.testing's parameterized
+    decorators, those should be at the bottom of the stack.
 
-  This decorator is intended to be applied either to individual test methods in
-  a `test_combinations.TestCase` class, or directly to a test class that
-  extends it. Doing so will cause the contents of the individual test
-  method (or all test methods in the class) to be executed multiple times - once
-  for each Keras saved model format.
+    Various methods in `testing_utils` to get file path for saved models will
+    auto-generate a string of the two saved model formats. This allows unittests
+    to confirm the equivalence between the two Keras saved model formats.
 
-  The Keras saved model formats include:
-  1. HDF5: 'h5'
-  2. SavedModel: 'tf'
+    For example, consider the following unittest:
 
-  Note: if stacking this decorator with absl.testing's parameterized decorators,
-  those should be at the bottom of the stack.
+    ```python
+    class MyTests(test_utils.KerasTestCase):
 
-  Various methods in `testing_utils` to get file path for saved models will
-  auto-generate a string of the two saved model formats. This allows unittests
-  to confirm the equivalence between the two Keras saved model formats.
+ @test_utils.run_with_all_saved_model_formats + def test_foo(self): + save_format = test_utils.get_save_format() + saved_model_dir = '/tmp/saved_model/' + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.Dense(3)) + model.compile(loss='mse', optimizer='sgd', metrics=['acc']) - For example, consider the following unittest: + keras.models.save_model(model, saved_model_dir, save_format=save_format) + model = keras.models.load_model(saved_model_dir) - ```python - class MyTests(test_utils.KerasTestCase): + if __name__ == "__main__": + tf.test.main() + ``` + This test tries to save the model into the formats of 'hdf5', 'h5', 'keras', + 'tensorflow', and 'tf'. + + We can also annotate the whole class if we want this to apply to all tests + in the class: + ```python @test_utils.run_with_all_saved_model_formats - def test_foo(self): - save_format = test_utils.get_save_format() - saved_model_dir = '/tmp/saved_model/' - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - model.compile(loss='mse', optimizer='sgd', metrics=['acc']) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = keras.models.load_model(saved_model_dir) - - if __name__ == "__main__": - tf.test.main() - ``` - - This test tries to save the model into the formats of 'hdf5', 'h5', 'keras', - 'tensorflow', and 'tf'. - - We can also annotate the whole class if we want this to apply to all tests in - the class: - ```python - @test_utils.run_with_all_saved_model_formats - class MyTests(test_utils.KerasTestCase): - - def test_foo(self): - save_format = test_utils.get_save_format() - saved_model_dir = '/tmp/saved_model/' - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.Dense(3)) - model.compile(loss='mse', optimizer='sgd', metrics=['acc']) - - keras.models.save_model(model, saved_model_dir, save_format=save_format) - model = tf.keras.models.load_model(saved_model_dir) - - if __name__ == "__main__": - tf.test.main() - ``` - - Args: - test_or_class: test method or class to be annotated. If None, - this method returns a decorator that can be applied to a test method or - test class. If it is not None this returns the decorator applied to the - test or class. - exclude_formats: A collection of Keras saved model formats to not run. - (May also be a single format not wrapped in a collection). - Defaults to None. - - Returns: - Returns a decorator that will run the decorated test method multiple times: - once for each desired Keras saved model format. - - Raises: - ImportError: If abseil parameterized is not installed or not included as - a target dependency. - """ - # Exclude h5 save format if H5py isn't available. 
- if h5py is None: - exclude_formats.append(['h5']) - saved_model_formats = ['h5', 'tf', 'tf_no_traces'] - params = [('_%s' % saved_format, saved_format) - for saved_format in saved_model_formats - if saved_format not in tf.nest.flatten(exclude_formats)] - - def single_method_decorator(f): - """Decorator that constructs the test cases.""" - # Use named_parameters so it can be individually run from the command line - @parameterized.named_parameters(*params) - @functools.wraps(f) - def decorated(self, saved_format, *args, **kwargs): - """A run of a single test case w/ the specified model type.""" - if saved_format == 'h5': - _test_h5_saved_model_format(f, self, *args, **kwargs) - elif saved_format == 'tf': - _test_tf_saved_model_format(f, self, *args, **kwargs) - elif saved_format == 'tf_no_traces': - _test_tf_saved_model_format_no_traces(f, self, *args, **kwargs) - else: - raise ValueError('Unknown model type: %s' % (saved_format,)) - return decorated - - return _test_or_class_decorator(test_or_class, single_method_decorator) + class MyTests(test_utils.KerasTestCase): + + def test_foo(self): + save_format = test_utils.get_save_format() + saved_model_dir = '/tmp/saved_model/' + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.Dense(3)) + model.compile(loss='mse', optimizer='sgd', metrics=['acc']) + + keras.models.save_model(model, saved_model_dir, save_format=save_format) + model = tf.keras.models.load_model(saved_model_dir) + + if __name__ == "__main__": + tf.test.main() + ``` + + Args: + test_or_class: test method or class to be annotated. If None, + this method returns a decorator that can be applied to a test method or + test class. If it is not None this returns the decorator applied to the + test or class. + exclude_formats: A collection of Keras saved model formats to not run. + (May also be a single format not wrapped in a collection). + Defaults to `None`. + + Returns: + Returns a decorator that will run the decorated test method multiple + times: once for each desired Keras saved model format. + + Raises: + ImportError: If abseil parameterized is not installed or not included as + a target dependency. + """ + # Exclude h5 save format if H5py isn't available. 
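+    # `exclude_formats` may be None, a bare format string, or a collection,
+    # so flatten it to a list before "h5" is appended below.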
+    if h5py is None:
+        exclude_formats = tf.nest.flatten(exclude_formats or [])
+        exclude_formats.append("h5")
+    saved_model_formats = ["h5", "tf", "tf_no_traces"]
+    params = [
+        (f"_{saved_format}", saved_format)
+        for saved_format in saved_model_formats
+        if saved_format not in tf.nest.flatten(exclude_formats)
+    ]
+
+    def single_method_decorator(f):
+        """Decorator that constructs the test cases."""
+
+        # Use named_parameters so it can be individually run from the command
+        # line.
+        @parameterized.named_parameters(*params)
+        @functools.wraps(f)
+        def decorated(self, saved_format, *args, **kwargs):
+            """A run of a single test case w/ the specified save format."""
+            if saved_format == "h5":
+                _test_h5_saved_model_format(f, self, *args, **kwargs)
+            elif saved_format == "tf":
+                _test_tf_saved_model_format(f, self, *args, **kwargs)
+            elif saved_format == "tf_no_traces":
+                _test_tf_saved_model_format_no_traces(f, self, *args, **kwargs)
+            else:
+                raise ValueError(f"Unknown save format: {saved_format}")
+
+        return decorated
+
+    return _test_or_class_decorator(test_or_class, single_method_decorator)
 
 
 def _test_h5_saved_model_format(f, test_or_class, *args, **kwargs):
-  with test_utils.saved_model_format_scope('h5'):
-    f(test_or_class, *args, **kwargs)
+    with test_utils.saved_model_format_scope("h5"):
+        f(test_or_class, *args, **kwargs)
 
 
 def _test_tf_saved_model_format(f, test_or_class, *args, **kwargs):
-  with test_utils.saved_model_format_scope('tf'):
-    f(test_or_class, *args, **kwargs)
+    with test_utils.saved_model_format_scope("tf"):
+        f(test_or_class, *args, **kwargs)
 
 
 def _test_tf_saved_model_format_no_traces(f, test_or_class, *args, **kwargs):
-  with test_utils.saved_model_format_scope('tf', save_traces=False):
-    f(test_or_class, *args, **kwargs)
+    with test_utils.saved_model_format_scope("tf", save_traces=False):
+        f(test_or_class, *args, **kwargs)
 
 
 def run_with_all_weight_formats(test_or_class=None, exclude_formats=None):
-  """Runs all tests with the supported formats for saving weights."""
-  exclude_formats = exclude_formats or []
-  exclude_formats.append('tf_no_traces')  # Only applies to saving models
-  return run_with_all_saved_model_formats(test_or_class, exclude_formats)
+    """Runs all tests with the supported formats for saving weights."""
+    exclude_formats = exclude_formats or []
+    exclude_formats.append("tf_no_traces")  # Only applies to saving models
+    return run_with_all_saved_model_formats(test_or_class, exclude_formats)
 
 
 # TODO(kaftan): Possibly enable 'subclass_custom_build' when tests begin to pass
 # it. Or perhaps make 'subclass' always use a custom build method.
-def run_with_all_model_types(
-    test_or_class=None,
-    exclude_models=None):
-  """Execute the decorated test with all Keras model types.
-
-  This decorator is intended to be applied either to individual test methods in
-  a `test_combinations.TestCase` class, or directly to a test class that
-  extends it. Doing so will cause the contents of the individual test
-  method (or all test methods in the class) to be executed multiple times - once
-  for each Keras model type.
-
-  The Keras model types are: ['functional', 'subclass', 'sequential']
-
-  Note: if stacking this decorator with absl.testing's parameterized decorators,
-  those should be at the bottom of the stack.
-
-  Various methods in `testing_utils` to get models will auto-generate a model
-  of the currently active Keras model type. This allows unittests to confirm
-  the equivalence between different Keras models.
- - For example, consider the following unittest: - - ```python - class MyTests(test_utils.KerasTestCase): - - @test_utils.run_with_all_model_types( - exclude_models = ['sequential']) - def test_foo(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics) - - inputs = np.zeros((10, 3)) - targets = np.zeros((10, 4)) - dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - - if __name__ == "__main__": - tf.test.main() - ``` - - This test tries building a small mlp as both a functional model and as a - subclass model. - - We can also annotate the whole class if we want this to apply to all tests in - the class: - ```python - @test_utils.run_with_all_model_types(exclude_models = ['sequential']) - class MyTests(test_utils.KerasTestCase): - - def test_foo(self): - model = test_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics) - - inputs = np.zeros((10, 3)) - targets = np.zeros((10, 4)) - dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - - if __name__ == "__main__": - tf.test.main() - ``` - - - Args: - test_or_class: test method or class to be annotated. If None, - this method returns a decorator that can be applied to a test method or - test class. If it is not None this returns the decorator applied to the - test or class. - exclude_models: A collection of Keras model types to not run. - (May also be a single model type not wrapped in a collection). - Defaults to None. - - Returns: - Returns a decorator that will run the decorated test method multiple times: - once for each desired Keras model type. - - Raises: - ImportError: If abseil parameterized is not installed or not included as - a target dependency. - """ - model_types = ['functional', 'subclass', 'sequential'] - params = [('_%s' % model, model) for model in model_types - if model not in tf.nest.flatten(exclude_models)] - - def single_method_decorator(f): - """Decorator that constructs the test cases.""" - # Use named_parameters so it can be individually run from the command line - @parameterized.named_parameters(*params) - @functools.wraps(f) - def decorated(self, model_type, *args, **kwargs): - """A run of a single test case w/ the specified model type.""" - if model_type == 'functional': - _test_functional_model_type(f, self, *args, **kwargs) - elif model_type == 'subclass': - _test_subclass_model_type(f, self, *args, **kwargs) - elif model_type == 'sequential': - _test_sequential_model_type(f, self, *args, **kwargs) - else: - raise ValueError('Unknown model type: %s' % (model_type,)) - return decorated - - return _test_or_class_decorator(test_or_class, single_method_decorator) +def run_with_all_model_types(test_or_class=None, exclude_models=None): + """Execute the decorated test with all Keras model types. + + This decorator is intended to be applied either to individual test methods + in a `test_combinations.TestCase` class, or directly to a test class that + extends it. 
Doing so will cause the contents of the individual test method + (or all test methods in the class) to be executed multiple times - once for + each Keras model type. + + The Keras model types are: ['functional', 'subclass', 'sequential'] + + Note: if stacking this decorator with absl.testing's parameterized + decorators, those should be at the bottom of the stack. + + Various methods in `testing_utils` to get models will auto-generate a model + of the currently active Keras model type. This allows unittests to confirm + the equivalence between different Keras models. + + For example, consider the following unittest: + + ```python + class MyTests(test_utils.KerasTestCase): + + @test_utils.run_with_all_model_types( + exclude_models = ['sequential']) + def test_foo(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + metrics = ['mae'] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((10, 3)) + targets = np.zeros((10, 4)) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + + if __name__ == "__main__": + tf.test.main() + ``` + + This test tries building a small mlp as both a functional model and as a + subclass model. + + We can also annotate the whole class if we want this to apply to all tests + in the class: + ```python + @test_utils.run_with_all_model_types(exclude_models = ['sequential']) + class MyTests(test_utils.KerasTestCase): + + def test_foo(self): + model = test_utils.get_small_mlp(1, 4, input_dim=3) + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + metrics = ['mae'] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((10, 3)) + targets = np.zeros((10, 4)) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + + if __name__ == "__main__": + tf.test.main() + ``` + + + Args: + test_or_class: test method or class to be annotated. If None, + this method returns a decorator that can be applied to a test method or + test class. If it is not None this returns the decorator applied to the + test or class. + exclude_models: A collection of Keras model types to not run. + (May also be a single model type not wrapped in a collection). + Defaults to `None`. + + Returns: + Returns a decorator that will run the decorated test method multiple + times: once for each desired Keras model type. + + Raises: + ImportError: If abseil parameterized is not installed or not included as + a target dependency. 
+ """ + model_types = ["functional", "subclass", "sequential"] + params = [ + (f"_{model}", model) + for model in model_types + if model not in tf.nest.flatten(exclude_models) + ] + + def single_method_decorator(f): + """Decorator that constructs the test cases.""" + # Use named_parameters so it can be individually run from the command + # line + @parameterized.named_parameters(*params) + @functools.wraps(f) + def decorated(self, model_type, *args, **kwargs): + """A run of a single test case w/ the specified model type.""" + if model_type == "functional": + _test_functional_model_type(f, self, *args, **kwargs) + elif model_type == "subclass": + _test_subclass_model_type(f, self, *args, **kwargs) + elif model_type == "sequential": + _test_sequential_model_type(f, self, *args, **kwargs) + else: + raise ValueError(f"Unknown model type: {model_type}") + + return decorated + + return _test_or_class_decorator(test_or_class, single_method_decorator) def _test_functional_model_type(f, test_or_class, *args, **kwargs): - with test_utils.model_type_scope('functional'): - f(test_or_class, *args, **kwargs) + with test_utils.model_type_scope("functional"): + f(test_or_class, *args, **kwargs) def _test_subclass_model_type(f, test_or_class, *args, **kwargs): - with test_utils.model_type_scope('subclass'): - f(test_or_class, *args, **kwargs) + with test_utils.model_type_scope("subclass"): + f(test_or_class, *args, **kwargs) def _test_sequential_model_type(f, test_or_class, *args, **kwargs): - with test_utils.model_type_scope('sequential'): - f(test_or_class, *args, **kwargs) - - -def run_all_keras_modes(test_or_class=None, - config=None, - always_skip_v1=False, - always_skip_eager=False, - **kwargs): - """Execute the decorated test with all keras execution modes. - - This decorator is intended to be applied either to individual test methods in - a `test_combinations.TestCase` class, or directly to a test class that - extends it. Doing so will cause the contents of the individual test - method (or all test methods in the class) to be executed multiple times - - once executing in legacy graph mode, once running eagerly and with - `should_run_eagerly` returning True, and once running eagerly with - `should_run_eagerly` returning False. - - If Tensorflow v2 behavior is enabled, legacy graph mode will be skipped, and - the test will only run twice. - - Note: if stacking this decorator with absl.testing's parameterized decorators, - those should be at the bottom of the stack. - - For example, consider the following unittest: - - ```python - class MyTests(test_utils.KerasTestCase): - - @test_utils.run_all_keras_modes - def test_foo(self): - model = test_utils.get_small_functional_mlp(1, 4, input_dim=3) - optimizer = RMSPropOptimizer(learning_rate=0.001) - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, loss, metrics=metrics, - run_eagerly=test_utils.should_run_eagerly()) - - inputs = np.zeros((10, 3)) - targets = np.zeros((10, 4)) - dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) - - if __name__ == "__main__": - tf.test.main() - ``` - - This test will try compiling & fitting the small functional mlp using all - three Keras execution modes. - - Args: - test_or_class: test method or class to be annotated. If None, - this method returns a decorator that can be applied to a test method or - test class. 
If it is not None this returns the decorator applied to the - test or class. - config: An optional config_pb2.ConfigProto to use to configure the - session when executing graphs. - always_skip_v1: If True, does not try running the legacy graph mode even - when Tensorflow v2 behavior is not enabled. - always_skip_eager: If True, does not execute the decorated test - with eager execution modes. - **kwargs: Additional kwargs for configuring tests for - in-progress Keras behaviors/ refactorings that we haven't fully - rolled out yet - - Returns: - Returns a decorator that will run the decorated test method multiple times. - - Raises: - ImportError: If abseil parameterized is not installed or not included as - a target dependency. - """ - if kwargs: - raise ValueError('Unrecognized keyword args: {}'.format(kwargs)) - - params = [('_v2_function', 'v2_function')] - if not always_skip_eager: - params.append(('_v2_eager', 'v2_eager')) - if not (always_skip_v1 or tf.__internal__.tf2.enabled()): - params.append(('_v1_session', 'v1_session')) - - def single_method_decorator(f): - """Decorator that constructs the test cases.""" - - # Use named_parameters so it can be individually run from the command line - @parameterized.named_parameters(*params) - @functools.wraps(f) - def decorated(self, run_mode, *args, **kwargs): - """A run of a single test case w/ specified run mode.""" - if run_mode == 'v1_session': - _v1_session_test(f, self, config, *args, **kwargs) - elif run_mode == 'v2_eager': - _v2_eager_test(f, self, *args, **kwargs) - elif run_mode == 'v2_function': - _v2_function_test(f, self, *args, **kwargs) - else: - return ValueError('Unknown run mode %s' % run_mode) - - return decorated - - return _test_or_class_decorator(test_or_class, single_method_decorator) + with test_utils.model_type_scope("sequential"): + f(test_or_class, *args, **kwargs) + + +def run_all_keras_modes( + test_or_class=None, + config=None, + always_skip_v1=False, + always_skip_eager=False, + **kwargs, +): + """Execute the decorated test with all keras execution modes. + + This decorator is intended to be applied either to individual test methods + in a `test_combinations.TestCase` class, or directly to a test class that + extends it. Doing so will cause the contents of the individual test method + (or all test methods in the class) to be executed multiple times - once + executing in legacy graph mode, once running eagerly and with + `should_run_eagerly` returning True, and once running eagerly with + `should_run_eagerly` returning False. + + If Tensorflow v2 behavior is enabled, legacy graph mode will be skipped, and + the test will only run twice. + + Note: if stacking this decorator with absl.testing's parameterized + decorators, those should be at the bottom of the stack. 
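+
+    For instance, a correctly ordered stack looks like this (a minimal
+    sketch; the test body and the extra parameter are placeholders):
+
+    ```python
+    @test_combinations.run_all_keras_modes
+    @parameterized.named_parameters(dict(testcase_name="_arg", arg=True))
+    def test_foo(self, arg):
+        ...
+    ```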
+
+    For example, consider the following unittest:
+
+    ```python
+    class MyTests(test_utils.KerasTestCase):
+
+      @test_utils.run_all_keras_modes
+      def test_foo(self):
+        model = test_utils.get_small_functional_mlp(1, 4, input_dim=3)
+        optimizer = RMSPropOptimizer(learning_rate=0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(
+            optimizer, loss, metrics=metrics,
+            run_eagerly=test_utils.should_run_eagerly())
+
+        inputs = np.zeros((10, 3))
+        targets = np.zeros((10, 4))
+        dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+        dataset = dataset.repeat(100)
+        dataset = dataset.batch(10)
+
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+    if __name__ == "__main__":
+      tf.test.main()
+    ```
+
+    This test will try compiling & fitting the small functional mlp using all
+    three Keras execution modes.
+
+    Args:
+      test_or_class: test method or class to be annotated. If None,
+        this method returns a decorator that can be applied to a test method or
+        test class. If it is not None this returns the decorator applied to the
+        test or class.
+      config: An optional config_pb2.ConfigProto to use to configure the
+        session when executing graphs.
+      always_skip_v1: If True, does not try running the legacy graph mode even
+        when Tensorflow v2 behavior is not enabled.
+      always_skip_eager: If True, does not execute the decorated test
+        with eager execution modes.
+      **kwargs: Additional kwargs for configuring tests for in-progress Keras
+        behaviors/refactorings that we haven't fully rolled out yet.
+
+    Returns:
+      Returns a decorator that will run the decorated test method multiple
+      times.
+
+    Raises:
+      ImportError: If abseil parameterized is not installed or not included as
+      a target dependency.
+    """
+    if kwargs:
+        raise ValueError(f"Unrecognized keyword args: {kwargs}")
+
+    params = [("_v2_function", "v2_function")]
+    if not always_skip_eager:
+        params.append(("_v2_eager", "v2_eager"))
+    if not (always_skip_v1 or tf.__internal__.tf2.enabled()):
+        params.append(("_v1_session", "v1_session"))
+
+    def single_method_decorator(f):
+        """Decorator that constructs the test cases."""
+
+        # Use named_parameters so it can be individually run from the command
+        # line.
+        @parameterized.named_parameters(*params)
+        @functools.wraps(f)
+        def decorated(self, run_mode, *args, **kwargs):
+            """A run of a single test case w/ specified run mode."""
+            if run_mode == "v1_session":
+                _v1_session_test(f, self, config, *args, **kwargs)
+            elif run_mode == "v2_eager":
+                _v2_eager_test(f, self, *args, **kwargs)
+            elif run_mode == "v2_function":
+                _v2_function_test(f, self, *args, **kwargs)
+            else:
+                raise ValueError(f"Unknown run mode {run_mode}")
+
+        return decorated
+
+    return _test_or_class_decorator(test_or_class, single_method_decorator)
 
 
 def _v1_session_test(f, test_or_class, config, *args, **kwargs):
-  with tf.compat.v1.get_default_graph().as_default():
-    with test_utils.run_eagerly_scope(False):
-      with test_or_class.test_session(config=config):
-        f(test_or_class, *args, **kwargs)
+    with tf.compat.v1.get_default_graph().as_default():
+        with test_utils.run_eagerly_scope(False):
+            with test_or_class.test_session(config=config):
+                f(test_or_class, *args, **kwargs)
 
 
 def _v2_eager_test(f, test_or_class, *args, **kwargs):
-  with tf.__internal__.eager_context.eager_mode():
-    with test_utils.run_eagerly_scope(True):
-      f(test_or_class, *args, **kwargs)
+    with tf.__internal__.eager_context.eager_mode():
+        with test_utils.run_eagerly_scope(True):
+            f(test_or_class, *args, **kwargs)
 
 
 def _v2_function_test(f, test_or_class, *args, **kwargs):
-  with tf.__internal__.eager_context.eager_mode():
-    with test_utils.run_eagerly_scope(False):
-      f(test_or_class, *args, **kwargs)
+    with tf.__internal__.eager_context.eager_mode():
+        with test_utils.run_eagerly_scope(False):
+            f(test_or_class, *args, **kwargs)
 
 
 def _test_or_class_decorator(test_or_class, single_method_decorator):
-  """Decorate a test or class with a decorator intended for one method.
-
-  If the test_or_class is a class:
-    This will apply the decorator to all test methods in the class.
-
-  If the test_or_class is an iterable of already-parameterized test cases:
-    This will apply the decorator to all the cases, and then flatten the
-    resulting cross-product of test cases. This allows stacking the Keras
-    parameterized decorators w/ each other, and to apply them to test methods
-    that have already been marked with an absl parameterized decorator.
-
-  Otherwise, treat the obj as a single method and apply the decorator directly.
-
-  Args:
-    test_or_class: A test method (that may have already been decorated with a
-      parameterized decorator, or a test class that extends
-      test_combinations.TestCase
-    single_method_decorator:
-      A parameterized decorator intended for a single test method.
-  Returns:
-    The decorated result.
-  """
-  def _decorate_test_or_class(obj):
-    if isinstance(obj, collections.abc.Iterable):
-      return itertools.chain.from_iterable(
-          single_method_decorator(method) for method in obj)
-    if isinstance(obj, type):
-      cls = obj
-      for name, value in cls.__dict__.copy().items():
-        if callable(value) and name.startswith(
-            unittest.TestLoader.testMethodPrefix):
-          setattr(cls, name, single_method_decorator(value))
-
-      cls = type(cls).__new__(type(cls), cls.__name__, cls.__bases__,
-                              cls.__dict__.copy())
-      return cls
-
-    return single_method_decorator(obj)
-
-  if test_or_class is not None:
-    return _decorate_test_or_class(test_or_class)
-
-  return _decorate_test_or_class
+    """Decorate a test or class with a decorator intended for one method.
+
+    If the test_or_class is a class:
+      This will apply the decorator to all test methods in the class.
+
+    If the test_or_class is an iterable of already-parameterized test cases:
+      This will apply the decorator to all the cases, and then flatten the
+      resulting cross-product of test cases. This allows stacking the Keras
+      parameterized decorators w/ each other, and to apply them to test methods
+      that have already been marked with an absl parameterized decorator.
+
+    Otherwise, treat the obj as a single method and apply the decorator
+    directly.
+
+    Args:
+      test_or_class: A test method (that may have already been decorated
+        with a parameterized decorator), or a test class that extends
+        test_combinations.TestCase.
+      single_method_decorator:
+        A parameterized decorator intended for a single test method.
+    Returns:
+      The decorated result.
+    """
+
+    def _decorate_test_or_class(obj):
+        if isinstance(obj, collections.abc.Iterable):
+            return itertools.chain.from_iterable(
+                single_method_decorator(method) for method in obj
+            )
+        if isinstance(obj, type):
+            cls = obj
+            for name, value in cls.__dict__.copy().items():
+                if callable(value) and name.startswith(
+                    unittest.TestLoader.testMethodPrefix
+                ):
+                    setattr(cls, name, single_method_decorator(value))
+
+            cls = type(cls).__new__(
+                type(cls), cls.__name__, cls.__bases__, cls.__dict__.copy()
+            )
+            return cls
+
+        return single_method_decorator(obj)
+
+    if test_or_class is not None:
+        return _decorate_test_or_class(test_or_class)
+
+    return _decorate_test_or_class
 
 
 def keras_mode_combinations(mode=None, run_eagerly=None):
-  """Returns the default test combinations for tf.keras tests.
-
-  Note that if tf2 is enabled, then v1 session test will be skipped.
-
-  Args:
-    mode: List of modes to run the tests. The valid options are 'graph' and
-      'eager'. Default to ['graph', 'eager'] if not specified. If a empty list
-      is provide, then the test will run under the context based on tf's
-      version, eg graph for v1 and eager for v2.
-    run_eagerly: List of `run_eagerly` value to be run with the tests.
-      Default to [True, False] if not specified. Note that for `graph` mode,
-      run_eagerly value will only be False.
-
-  Returns:
-    A list contains all the combinations to be used to generate test cases.
-  """
-  if mode is None:
-    mode = ['eager'] if tf.__internal__.tf2.enabled() else ['graph', 'eager']
-  if run_eagerly is None:
-    run_eagerly = [True, False]
-  result = []
-  if 'eager' in mode:
-    result += tf.__internal__.test.combinations.combine(mode=['eager'], run_eagerly=run_eagerly)
-  if 'graph' in mode:
-    result += tf.__internal__.test.combinations.combine(mode=['graph'], run_eagerly=[False])
-  return result
+    """Returns the default test combinations for tf.keras tests.
+
+    Note that if tf2 is enabled, then v1 session test will be skipped.
+
+    Args:
+      mode: List of modes to run the tests. The valid options are 'graph' and
+        'eager'. If None, uses ['graph', 'eager']. If an empty
+        list is provided, then the test will run under the context based on
+        tensorflow's version, e.g., graph for v1 and eager for v2. Defaults to
+        `None`.
+      run_eagerly: List of `run_eagerly` value to be run with the tests.
+        When None, uses [True, False]. Note that for `graph` mode,
+        run_eagerly value will only be False. Defaults to `None`.
+
+    Returns:
+      A list containing all the combinations to be used to generate test
+      cases.
+    """
+    if mode is None:
+        mode = (
+            ["eager"] if tf.__internal__.tf2.enabled() else ["graph", "eager"]
+        )
+    if run_eagerly is None:
+        run_eagerly = [True, False]
+    result = []
+    if "eager" in mode:
+        result += tf.__internal__.test.combinations.combine(
+            mode=["eager"], run_eagerly=run_eagerly
+        )
+    if "graph" in mode:
+        result += tf.__internal__.test.combinations.combine(
+            mode=["graph"], run_eagerly=[False]
+        )
+    return result
 
 
 def keras_model_type_combinations():
-  return tf.__internal__.test.combinations.combine(model_type=KERAS_MODEL_TYPES)
+    return tf.__internal__.test.combinations.combine(
+        model_type=KERAS_MODEL_TYPES
+    )
 
 
 class KerasModeCombination(tf.__internal__.test.combinations.TestCombination):
-  """Combination for Keras test mode.
+    """Combination for Keras test mode.
 
-  It by default includes v1_session, v2_eager and v2_tf_function.
-  """
+
+    It by default includes v1_session, v2_eager and v2_tf_function.
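+
+    For example, a minimal sketch using the `generate` and `combine` aliases
+    defined at the bottom of this module:
+
+    ```python
+    @generate(combine(mode=["eager"], run_eagerly=[True, False]))
+    def test_foo(self):
+        ...
+    ```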
+ """ - def context_managers(self, kwargs): - run_eagerly = kwargs.pop('run_eagerly', None) + def context_managers(self, kwargs): + run_eagerly = kwargs.pop("run_eagerly", None) - if run_eagerly is not None: - return [test_utils.run_eagerly_scope(run_eagerly)] - else: - return [] + if run_eagerly is not None: + return [test_utils.run_eagerly_scope(run_eagerly)] + else: + return [] - def parameter_modifiers(self): - return [tf.__internal__.test.combinations.OptionalParameter('run_eagerly')] + def parameter_modifiers(self): + return [ + tf.__internal__.test.combinations.OptionalParameter("run_eagerly") + ] -class KerasModelTypeCombination(tf.__internal__.test.combinations.TestCombination): - """Combination for Keras model types when doing model test. +class KerasModelTypeCombination( + tf.__internal__.test.combinations.TestCombination +): + """Combination for Keras model types when doing model test. - It by default includes 'functional', 'subclass', 'sequential'. + It by default includes 'functional', 'subclass', 'sequential'. - Various methods in `testing_utils` to get models will auto-generate a model - of the currently active Keras model type. This allows unittests to confirm - the equivalence between different Keras models. - """ + Various methods in `testing_utils` to get models will auto-generate a model + of the currently active Keras model type. This allows unittests to confirm + the equivalence between different Keras models. + """ - def context_managers(self, kwargs): - model_type = kwargs.pop('model_type', None) - if model_type in KERAS_MODEL_TYPES: - return [test_utils.model_type_scope(model_type)] - else: - return [] + def context_managers(self, kwargs): + model_type = kwargs.pop("model_type", None) + if model_type in KERAS_MODEL_TYPES: + return [test_utils.model_type_scope(model_type)] + else: + return [] - def parameter_modifiers(self): - return [tf.__internal__.test.combinations.OptionalParameter('model_type')] + def parameter_modifiers(self): + return [ + tf.__internal__.test.combinations.OptionalParameter("model_type") + ] -_defaults = tf.__internal__.test.combinations.generate.keywords['test_combinations'] +_defaults = tf.__internal__.test.combinations.generate.keywords[ + "test_combinations" +] generate = functools.partial( tf.__internal__.test.combinations.generate, - test_combinations=_defaults + - (KerasModeCombination(), KerasModelTypeCombination())) + test_combinations=_defaults + + (KerasModeCombination(), KerasModelTypeCombination()), +) combine = tf.__internal__.test.combinations.combine times = tf.__internal__.test.combinations.times NamedObject = tf.__internal__.test.combinations.NamedObject diff --git a/keras/testing_infra/test_combinations_test.py b/keras/testing_infra/test_combinations_test.py index e835152873e2..30493842b873 100644 --- a/keras/testing_infra/test_combinations_test.py +++ b/keras/testing_infra/test_combinations_test.py @@ -14,680 +14,714 @@ # ============================================================================== """Tests for Keras test_utils.""" -import tensorflow.compat.v2 as tf - import unittest + +import tensorflow.compat.v2 as tf from absl.testing import parameterized import keras from keras import models as keras_models -from keras.testing_infra import test_utils from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils class CombinationsTest(tf.test.TestCase): - - def test_run_all_keras_modes(self): - test_params = [] - - class ExampleTest(parameterized.TestCase): - - def runTest(self): - pass 
- - @test_combinations.generate(test_combinations.keras_mode_combinations()) - def testBody(self): - mode = "eager" if tf.executing_eagerly() else "graph" - should_run_eagerly = test_utils.should_run_eagerly() - test_params.append((mode, should_run_eagerly)) - - e = ExampleTest() - if not tf.__internal__.tf2.enabled(): - e.testBody_test_mode_graph_runeagerly_False() - e.testBody_test_mode_eager_runeagerly_True() - e.testBody_test_mode_eager_runeagerly_False() - - if not tf.__internal__.tf2.enabled(): - self.assertLen(test_params, 3) - self.assertAllEqual(test_params, [ - ("graph", False), - ("eager", True), - ("eager", False), - ]) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - self.assertLen(test_params, 6) - else: - self.assertLen(test_params, 2) - self.assertAllEqual(test_params, [ - ("eager", True), - ("eager", False), - ]) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - self.assertLen(test_params, 4) - - def test_generate_keras_mode_eager_only(self): - result = test_combinations.keras_mode_combinations(mode=["eager"]) - self.assertLen(result, 2) - self.assertEqual(result[0], {"mode": "eager", "run_eagerly": True}) - self.assertEqual(result[1], {"mode": "eager", "run_eagerly": False}) - - def test_generate_keras_mode_skip_run_eagerly(self): - result = test_combinations.keras_mode_combinations(run_eagerly=[False]) - if tf.__internal__.tf2.enabled(): - self.assertLen(result, 1) - self.assertEqual(result[0], {"mode": "eager", "run_eagerly": False}) - else: - self.assertLen(result, 2) - self.assertEqual(result[0], {"mode": "eager", "run_eagerly": False}) - self.assertEqual(result[1], {"mode": "graph", "run_eagerly": False}) - - def test_run_all_keras_model_types(self): - model_types = [] - models = [] - - class ExampleTest(parameterized.TestCase): - - def runTest(self): - pass - - @test_combinations.generate( - test_combinations.keras_model_type_combinations()) - def testBody(self): - model_types.append(test_utils.get_model_type()) - models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) - - e = ExampleTest() - e.testBody_test_modeltype_functional() - e.testBody_test_modeltype_subclass() - e.testBody_test_modeltype_sequential() - - self.assertLen(model_types, 3) - self.assertAllEqual(model_types, [ - "functional", - "subclass", - "sequential" - ]) - - # Validate that the models are what they should be - self.assertTrue(models[0]._is_graph_network) - self.assertFalse(models[1]._is_graph_network) - self.assertNotIsInstance(models[0], keras_models.Sequential) - self.assertNotIsInstance(models[1], keras_models.Sequential) - self.assertIsInstance(models[2], keras_models.Sequential) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(model_types, 6) - - def test_combine_combinations(self): - test_cases = [] - - @test_combinations.generate(test_combinations.times( - test_combinations.keras_mode_combinations(), - test_combinations.keras_model_type_combinations())) - class ExampleTest(parameterized.TestCase): - - def runTest(self): - pass - - @parameterized.named_parameters(dict(testcase_name="_arg", - arg=True)) - def testBody(self, arg): - del arg - mode = "eager" if tf.executing_eagerly() else "graph" - should_run_eagerly = test_utils.should_run_eagerly() - test_cases.append((mode, should_run_eagerly, - test_utils.get_model_type())) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - expected_combinations = [ - ("eager", 
False, "functional"), - ("eager", False, "sequential"), - ("eager", False, "subclass"), - ("eager", True, "functional"), - ("eager", True, "sequential"), - ("eager", True, "subclass"), - ] - - if not tf.__internal__.tf2.enabled(): - expected_combinations.extend([ - ("graph", False, "functional"), - ("graph", False, "sequential"), - ("graph", False, "subclass"), - ]) - - self.assertAllEqual(sorted(test_cases), expected_combinations) + def test_run_all_keras_modes(self): + test_params = [] + + class ExampleTest(parameterized.TestCase): + def runTest(self): + pass + + @test_combinations.generate( + test_combinations.keras_mode_combinations() + ) + def testBody(self): + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + test_params.append((mode, should_run_eagerly)) + + e = ExampleTest() + if not tf.__internal__.tf2.enabled(): + e.testBody_test_mode_graph_runeagerly_False() + e.testBody_test_mode_eager_runeagerly_True() + e.testBody_test_mode_eager_runeagerly_False() + + if not tf.__internal__.tf2.enabled(): + self.assertLen(test_params, 3) + self.assertAllEqual( + test_params, + [ + ("graph", False), + ("eager", True), + ("eager", False), + ], + ) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + self.assertLen(test_params, 6) + else: + self.assertLen(test_params, 2) + self.assertAllEqual( + test_params, + [ + ("eager", True), + ("eager", False), + ], + ) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + self.assertLen(test_params, 4) + + def test_generate_keras_mode_eager_only(self): + result = test_combinations.keras_mode_combinations(mode=["eager"]) + self.assertLen(result, 2) + self.assertEqual(result[0], {"mode": "eager", "run_eagerly": True}) + self.assertEqual(result[1], {"mode": "eager", "run_eagerly": False}) + + def test_generate_keras_mode_skip_run_eagerly(self): + result = test_combinations.keras_mode_combinations(run_eagerly=[False]) + if tf.__internal__.tf2.enabled(): + self.assertLen(result, 1) + self.assertEqual(result[0], {"mode": "eager", "run_eagerly": False}) + else: + self.assertLen(result, 2) + self.assertEqual(result[0], {"mode": "eager", "run_eagerly": False}) + self.assertEqual(result[1], {"mode": "graph", "run_eagerly": False}) + + def test_run_all_keras_model_types(self): + model_types = [] + models = [] + + class ExampleTest(parameterized.TestCase): + def runTest(self): + pass + + @test_combinations.generate( + test_combinations.keras_model_type_combinations() + ) + def testBody(self): + model_types.append(test_utils.get_model_type()) + models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) + + e = ExampleTest() + e.testBody_test_modeltype_functional() + e.testBody_test_modeltype_subclass() + e.testBody_test_modeltype_sequential() + + self.assertLen(model_types, 3) + self.assertAllEqual( + model_types, ["functional", "subclass", "sequential"] + ) + + # Validate that the models are what they should be + self.assertTrue(models[0]._is_graph_network) + self.assertFalse(models[1]._is_graph_network) + self.assertNotIsInstance(models[0], keras_models.Sequential) + self.assertNotIsInstance(models[1], keras_models.Sequential) + self.assertIsInstance(models[2], keras_models.Sequential) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(model_types, 6) + + def test_combine_combinations(self): + test_cases = [] + + @test_combinations.generate( + test_combinations.times( + 
test_combinations.keras_mode_combinations(), + test_combinations.keras_model_type_combinations(), + ) + ) + class ExampleTest(parameterized.TestCase): + def runTest(self): + pass + + @parameterized.named_parameters( + dict(testcase_name="_arg", arg=True) + ) + def testBody(self, arg): + del arg + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + test_cases.append( + (mode, should_run_eagerly, test_utils.get_model_type()) + ) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + expected_combinations = [ + ("eager", False, "functional"), + ("eager", False, "sequential"), + ("eager", False, "subclass"), + ("eager", True, "functional"), + ("eager", True, "sequential"), + ("eager", True, "subclass"), + ] + + if not tf.__internal__.tf2.enabled(): + expected_combinations.extend( + [ + ("graph", False, "functional"), + ("graph", False, "sequential"), + ("graph", False, "subclass"), + ] + ) + + self.assertAllEqual(sorted(test_cases), expected_combinations) class KerasParameterizedTest(test_combinations.TestCase): + def test_run_with_all_model_types(self): + model_types = [] + models = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_with_all_model_types + def testBody(self): + model_types.append(test_utils.get_model_type()) + models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) + + e = ExampleTest() + e.testBody_functional() + e.testBody_subclass() + e.testBody_sequential() + + self.assertLen(model_types, 3) + self.assertAllEqual( + model_types, ["functional", "subclass", "sequential"] + ) + + # Validate that the models are what they should be + self.assertTrue(models[0]._is_graph_network) + self.assertFalse(models[1]._is_graph_network) + self.assertNotIsInstance(models[0], keras.models.Sequential) + self.assertNotIsInstance(models[1], keras.models.Sequential) + self.assertIsInstance(models[2], keras.models.Sequential) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(model_types, 6) + + def test_run_with_all_model_types_and_extra_params(self): + model_types = [] + models = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_with_all_model_types + @parameterized.named_parameters( + [ + dict(testcase_name="_0", with_brackets=True), + dict(testcase_name="_1", with_brackets=False), + ] + ) + def testBody(self, with_brackets): + with_brackets = ( + "with_brackets" if with_brackets else "without_brackets" + ) + model_types.append((with_brackets, test_utils.get_model_type())) + models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) + + e = ExampleTest() + e.testBody_0_functional() + e.testBody_0_subclass() + e.testBody_0_sequential() + e.testBody_1_functional() + e.testBody_1_subclass() + e.testBody_1_sequential() + + self.assertLen(model_types, 6) + self.assertAllEqual( + model_types, + [ + ("with_brackets", "functional"), + ("with_brackets", "subclass"), + ("with_brackets", "sequential"), + ("without_brackets", "functional"), + ("without_brackets", "subclass"), + ("without_brackets", "sequential"), + ], + ) + + # Validate that the models are what they should be + self.assertTrue(models[0]._is_graph_network) + self.assertFalse(models[1]._is_graph_network) + self.assertNotIsInstance(models[0], keras.models.Sequential) + self.assertNotIsInstance(models[1], keras.models.Sequential) + self.assertIsInstance(models[2], 
keras.models.Sequential) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(model_types, 12) + + def test_run_with_all_model_types_exclude_one(self): + model_types = [] + models = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_with_all_model_types( + exclude_models="sequential" + ) + def testBody(self): + model_types.append(test_utils.get_model_type()) + models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) + + e = ExampleTest() + if hasattr(e, "testBody_functional"): + e.testBody_functional() + if hasattr(e, "testBody_subclass"): + e.testBody_subclass() + if hasattr(e, "testBody_sequential"): + e.testBody_sequential() + + self.assertLen(model_types, 2) + self.assertAllEqual(model_types, ["functional", "subclass"]) + + # Validate that the models are what they should be + self.assertTrue(models[0]._is_graph_network) + self.assertFalse(models[1]._is_graph_network) + self.assertNotIsInstance(models[0], keras.models.Sequential) + self.assertNotIsInstance(models[1], keras.models.Sequential) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(model_types, 4) + + def test_run_with_all_model_types_exclude_multiple(self): + model_types = [] + models = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_with_all_model_types( + exclude_models=["sequential", "functional"] + ) + def testBody(self): + model_types.append(test_utils.get_model_type()) + models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) + + e = ExampleTest() + if hasattr(e, "testBody_functional"): + e.testBody_functional() + if hasattr(e, "testBody_subclass"): + e.testBody_subclass() + if hasattr(e, "testBody_sequential"): + e.testBody_sequential() + + self.assertLen(model_types, 1) + self.assertAllEqual(model_types, ["subclass"]) + + # Validate that the models are what they should be + self.assertFalse(models[0]._is_graph_network) + self.assertNotIsInstance(models[0], keras.models.Sequential) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(model_types, 2) + + def test_run_all_keras_modes(self): + l = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_all_keras_modes() + def testBody(self): + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + l.append((mode, should_run_eagerly)) + + e = ExampleTest() + if not tf.__internal__.tf2.enabled(): + e.testBody_v1_session() + e.testBody_v2_eager() + e.testBody_v2_function() + + if not tf.__internal__.tf2.enabled(): + self.assertLen(l, 3) + self.assertAllEqual( + l, + [ + ("graph", False), + ("eager", True), + ("eager", False), + ], + ) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + self.assertLen(l, 6) + else: + self.assertLen(l, 2) + self.assertAllEqual( + l, + [ + ("eager", True), + ("eager", False), + ], + ) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + self.assertLen(l, 4) + + def test_run_all_keras_modes_extra_params(self): + l = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_all_keras_modes() + @parameterized.named_parameters( + [ + dict(testcase_name="_0", with_brackets=True), + dict(testcase_name="_1", with_brackets=False), + ] + ) + def 
testBody(self, with_brackets): + mode = "eager" if tf.executing_eagerly() else "graph" + with_brackets = ( + "with_brackets" if with_brackets else "without_brackets" + ) + should_run_eagerly = test_utils.should_run_eagerly() + l.append((with_brackets, mode, should_run_eagerly)) + + e = ExampleTest() + if not tf.__internal__.tf2.enabled(): + e.testBody_0_v1_session() + e.testBody_1_v1_session() + + e.testBody_0_v2_eager() + e.testBody_0_v2_function() + e.testBody_1_v2_eager() + e.testBody_1_v2_function() + + expected_combinations = { + ("with_brackets", "eager", True), + ("with_brackets", "eager", False), + ("without_brackets", "eager", True), + ("without_brackets", "eager", False), + } + + if not tf.__internal__.tf2.enabled(): + expected_combinations = expected_combinations.union( + { + ("with_brackets", "graph", False), + ("without_brackets", "graph", False), + } + ) + + self.assertLen(l, len(expected_combinations)) + self.assertEqual(set(l), expected_combinations) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(l, len(expected_combinations) * 2) + + def test_run_all_keras_modes_always_skip_v1(self): + l = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def testBody(self): + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + l.append((mode, should_run_eagerly)) + + e = ExampleTest() + if hasattr(e, "testBody_v1_session"): + e.testBody_v1_session() + if hasattr(e, "testBody_v2_eager"): + e.testBody_v2_eager() + if hasattr(e, "testBody_v2_function"): + e.testBody_v2_function() + + self.assertLen(l, 2) + self.assertEqual( + set(l), + { + ("eager", True), + ("eager", False), + }, + ) + + def test_run_all_keras_modes_with_all_model_types(self): + l = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + def testBody(self): + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + l.append( + (mode, should_run_eagerly, test_utils.get_model_type()) + ) + + e = ExampleTest() + e.testBody_v2_eager_functional() + e.testBody_v2_function_functional() + e.testBody_v2_eager_sequential() + e.testBody_v2_function_sequential() + e.testBody_v2_eager_subclass() + e.testBody_v2_function_subclass() + + if not tf.__internal__.tf2.enabled(): + e.testBody_v1_session_functional() + e.testBody_v1_session_sequential() + e.testBody_v1_session_subclass() + + expected_combinations = { + ("eager", True, "functional"), + ("eager", False, "functional"), + ("eager", True, "sequential"), + ("eager", False, "sequential"), + ("eager", True, "subclass"), + ("eager", False, "subclass"), + } + + if not tf.__internal__.tf2.enabled(): + expected_combinations = expected_combinations.union( + { + ("graph", False, "functional"), + ("graph", False, "sequential"), + ("graph", False, "subclass"), + } + ) + + self.assertLen(l, len(expected_combinations)) + self.assertEqual(set(l), expected_combinations) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(l, len(expected_combinations) * 2) + + def test_run_all_model_types_with_all_keras_modes(self): + l = [] + + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_all_keras_modes + 
@test_combinations.run_with_all_model_types + def testBody(self): + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + l.append( + (mode, should_run_eagerly, test_utils.get_model_type()) + ) + + e = ExampleTest() + e.testBody_functional_v2_eager() + e.testBody_functional_v2_function() + e.testBody_sequential_v2_eager() + e.testBody_sequential_v2_function() + e.testBody_subclass_v2_eager() + e.testBody_subclass_v2_function() + + if not tf.__internal__.tf2.enabled(): + e.testBody_functional_v1_session() + e.testBody_sequential_v1_session() + e.testBody_subclass_v1_session() + + expected_combinations = { + ("eager", True, "functional"), + ("eager", False, "functional"), + ("eager", True, "sequential"), + ("eager", False, "sequential"), + ("eager", True, "subclass"), + ("eager", False, "subclass"), + } + + if not tf.__internal__.tf2.enabled(): + expected_combinations = expected_combinations.union( + { + ("graph", False, "functional"), + ("graph", False, "sequential"), + ("graph", False, "subclass"), + } + ) + + self.assertLen(l, len(expected_combinations)) + self.assertEqual(set(l), expected_combinations) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(l, len(expected_combinations) * 2) + + def test_run_all_keras_modes_with_all_model_types_annotate_class(self): + l = [] + + @test_combinations.run_with_all_model_types + @test_combinations.run_all_keras_modes + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @parameterized.named_parameters( + dict(testcase_name="_arg", arg=True) + ) + def testBody(self, arg): + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + l.append( + (mode, should_run_eagerly, test_utils.get_model_type()) + ) + + e = ExampleTest() + e.testBody_arg_v2_eager_functional() + e.testBody_arg_v2_function_functional() + e.testBody_arg_v2_eager_sequential() + e.testBody_arg_v2_function_sequential() + e.testBody_arg_v2_eager_subclass() + e.testBody_arg_v2_function_subclass() + + if not tf.__internal__.tf2.enabled(): + e.testBody_arg_v1_session_functional() + e.testBody_arg_v1_session_sequential() + e.testBody_arg_v1_session_subclass() + + expected_combinations = { + ("eager", True, "functional"), + ("eager", False, "functional"), + ("eager", True, "sequential"), + ("eager", False, "sequential"), + ("eager", True, "subclass"), + ("eager", False, "subclass"), + } + + if not tf.__internal__.tf2.enabled(): + expected_combinations = expected_combinations.union( + { + ("graph", False, "functional"), + ("graph", False, "sequential"), + ("graph", False, "subclass"), + } + ) + + self.assertLen(l, len(expected_combinations)) + self.assertEqual(set(l), expected_combinations) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(l, len(expected_combinations) * 2) + + def test_run_all_keras_modes_with_all_model_types_annotate_class_2(self): + l = [] + + @test_combinations.run_with_all_model_types + class ExampleTest(test_combinations.TestCase): + def runTest(self): + pass + + @test_combinations.run_all_keras_modes + @parameterized.named_parameters( + dict(testcase_name="_arg", arg=True) + ) + def testBody(self, arg): + mode = "eager" if tf.executing_eagerly() else "graph" + should_run_eagerly = test_utils.should_run_eagerly() + l.append( + (mode, should_run_eagerly, test_utils.get_model_type()) + ) + + e = ExampleTest() + 
e.testBody_arg_v2_eager_functional() + e.testBody_arg_v2_function_functional() + e.testBody_arg_v2_eager_sequential() + e.testBody_arg_v2_function_sequential() + e.testBody_arg_v2_eager_subclass() + e.testBody_arg_v2_function_subclass() + + if not tf.__internal__.tf2.enabled(): + e.testBody_arg_v1_session_functional() + e.testBody_arg_v1_session_sequential() + e.testBody_arg_v1_session_subclass() + + expected_combinations = { + ("eager", True, "functional"), + ("eager", False, "functional"), + ("eager", True, "sequential"), + ("eager", False, "sequential"), + ("eager", True, "subclass"), + ("eager", False, "subclass"), + } + + if not tf.__internal__.tf2.enabled(): + expected_combinations = expected_combinations.union( + { + ("graph", False, "functional"), + ("graph", False, "sequential"), + ("graph", False, "subclass"), + } + ) + + self.assertLen(l, len(expected_combinations)) + self.assertEqual(set(l), expected_combinations) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + + self.assertLen(l, len(expected_combinations) * 2) - def test_run_with_all_model_types(self): - model_types = [] - models = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_with_all_model_types - def testBody(self): - model_types.append(test_utils.get_model_type()) - models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) - - e = ExampleTest() - e.testBody_functional() - e.testBody_subclass() - e.testBody_sequential() - - self.assertLen(model_types, 3) - self.assertAllEqual(model_types, [ - "functional", - "subclass", - "sequential" - ]) - - # Validate that the models are what they should be - self.assertTrue(models[0]._is_graph_network) - self.assertFalse(models[1]._is_graph_network) - self.assertNotIsInstance(models[0], keras.models.Sequential) - self.assertNotIsInstance(models[1], keras.models.Sequential) - self.assertIsInstance(models[2], keras.models.Sequential) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(model_types, 6) - - def test_run_with_all_model_types_and_extra_params(self): - model_types = [] - models = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_with_all_model_types - @parameterized.named_parameters( - [dict(testcase_name="_0", with_brackets=True), - dict(testcase_name="_1", with_brackets=False)]) - def testBody(self, with_brackets): - with_brackets = "with_brackets" if with_brackets else "without_brackets" - model_types.append((with_brackets, test_utils.get_model_type())) - models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) - - e = ExampleTest() - e.testBody_0_functional() - e.testBody_0_subclass() - e.testBody_0_sequential() - e.testBody_1_functional() - e.testBody_1_subclass() - e.testBody_1_sequential() - - self.assertLen(model_types, 6) - self.assertAllEqual(model_types, [ - ("with_brackets", "functional"), - ("with_brackets", "subclass"), - ("with_brackets", "sequential"), - ("without_brackets", "functional"), - ("without_brackets", "subclass"), - ("without_brackets", "sequential"), - ]) - - # Validate that the models are what they should be - self.assertTrue(models[0]._is_graph_network) - self.assertFalse(models[1]._is_graph_network) - self.assertNotIsInstance(models[0], keras.models.Sequential) - self.assertNotIsInstance(models[1], keras.models.Sequential) - self.assertIsInstance(models[2], keras.models.Sequential) - - ts = unittest.makeSuite(ExampleTest) - res = 
unittest.TestResult() - ts.run(res) - - self.assertLen(model_types, 12) - - def test_run_with_all_model_types_exclude_one(self): - model_types = [] - models = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_with_all_model_types(exclude_models="sequential") - def testBody(self): - model_types.append(test_utils.get_model_type()) - models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) - - e = ExampleTest() - if hasattr(e, "testBody_functional"): - e.testBody_functional() - if hasattr(e, "testBody_subclass"): - e.testBody_subclass() - if hasattr(e, "testBody_sequential"): - e.testBody_sequential() - - self.assertLen(model_types, 2) - self.assertAllEqual(model_types, [ - "functional", - "subclass" - ]) - - # Validate that the models are what they should be - self.assertTrue(models[0]._is_graph_network) - self.assertFalse(models[1]._is_graph_network) - self.assertNotIsInstance(models[0], keras.models.Sequential) - self.assertNotIsInstance(models[1], keras.models.Sequential) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(model_types, 4) - - def test_run_with_all_model_types_exclude_multiple(self): - model_types = [] - models = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_with_all_model_types( - exclude_models=["sequential", "functional"]) - def testBody(self): - model_types.append(test_utils.get_model_type()) - models.append(test_utils.get_small_mlp(1, 4, input_dim=3)) - - e = ExampleTest() - if hasattr(e, "testBody_functional"): - e.testBody_functional() - if hasattr(e, "testBody_subclass"): - e.testBody_subclass() - if hasattr(e, "testBody_sequential"): - e.testBody_sequential() - - self.assertLen(model_types, 1) - self.assertAllEqual(model_types, [ - "subclass" - ]) - - # Validate that the models are what they should be - self.assertFalse(models[0]._is_graph_network) - self.assertNotIsInstance(models[0], keras.models.Sequential) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(model_types, 2) - - def test_run_all_keras_modes(self): - l = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_all_keras_modes() - def testBody(self): - mode = "eager" if tf.executing_eagerly() else "graph" - should_run_eagerly = test_utils.should_run_eagerly() - l.append((mode, should_run_eagerly)) - - e = ExampleTest() - if not tf.__internal__.tf2.enabled(): - e.testBody_v1_session() - e.testBody_v2_eager() - e.testBody_v2_function() - - if not tf.__internal__.tf2.enabled(): - self.assertLen(l, 3) - self.assertAllEqual(l, [ - ("graph", False), - ("eager", True), - ("eager", False), - ]) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - self.assertLen(l, 6) - else: - self.assertLen(l, 2) - self.assertAllEqual(l, [ - ("eager", True), - ("eager", False), - ]) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - self.assertLen(l, 4) - - def test_run_all_keras_modes_extra_params(self): - l = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_all_keras_modes() - @parameterized.named_parameters( - [dict(testcase_name="_0", with_brackets=True), - dict(testcase_name="_1", with_brackets=False)]) - def testBody(self, with_brackets): - mode = "eager" if tf.executing_eagerly() else "graph" - 
with_brackets = "with_brackets" if with_brackets else "without_brackets" - should_run_eagerly = test_utils.should_run_eagerly() - l.append((with_brackets, mode, should_run_eagerly)) - - e = ExampleTest() - if not tf.__internal__.tf2.enabled(): - e.testBody_0_v1_session() - e.testBody_1_v1_session() - - e.testBody_0_v2_eager() - e.testBody_0_v2_function() - e.testBody_1_v2_eager() - e.testBody_1_v2_function() - - expected_combinations = { - ("with_brackets", "eager", True), - ("with_brackets", "eager", False), - ("without_brackets", "eager", True), - ("without_brackets", "eager", False), - } - - if not tf.__internal__.tf2.enabled(): - expected_combinations = expected_combinations.union({ - ("with_brackets", "graph", False), - ("without_brackets", "graph", False), - }) - - self.assertLen(l, len(expected_combinations)) - self.assertEqual(set(l), expected_combinations) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(l, len(expected_combinations) * 2) - - def test_run_all_keras_modes_always_skip_v1(self): - l = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def testBody(self): - mode = "eager" if tf.executing_eagerly() else "graph" - should_run_eagerly = test_utils.should_run_eagerly() - l.append((mode, should_run_eagerly)) - - e = ExampleTest() - if hasattr(e, "testBody_v1_session"): - e.testBody_v1_session() - if hasattr(e, "testBody_v2_eager"): - e.testBody_v2_eager() - if hasattr(e, "testBody_v2_function"): - e.testBody_v2_function() - - self.assertLen(l, 2) - self.assertEqual( - set(l), { - ("eager", True), - ("eager", False), - }) - - def test_run_all_keras_modes_with_all_model_types(self): - l = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_with_all_model_types - @test_combinations.run_all_keras_modes - def testBody(self): - mode = "eager" if tf.executing_eagerly() else "graph" - should_run_eagerly = test_utils.should_run_eagerly() - l.append((mode, should_run_eagerly, test_utils.get_model_type())) - - e = ExampleTest() - e.testBody_v2_eager_functional() - e.testBody_v2_function_functional() - e.testBody_v2_eager_sequential() - e.testBody_v2_function_sequential() - e.testBody_v2_eager_subclass() - e.testBody_v2_function_subclass() - - if not tf.__internal__.tf2.enabled(): - e.testBody_v1_session_functional() - e.testBody_v1_session_sequential() - e.testBody_v1_session_subclass() - - expected_combinations = { - ("eager", True, "functional"), - ("eager", False, "functional"), - ("eager", True, "sequential"), - ("eager", False, "sequential"), - ("eager", True, "subclass"), - ("eager", False, "subclass"), - } - - if not tf.__internal__.tf2.enabled(): - expected_combinations = expected_combinations.union({ - ("graph", False, "functional"), - ("graph", False, "sequential"), - ("graph", False, "subclass"), - }) - - self.assertLen(l, len(expected_combinations)) - self.assertEqual(set(l), expected_combinations) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(l, len(expected_combinations) * 2) - - def test_run_all_model_types_with_all_keras_modes(self): - l = [] - - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def testBody(self): - mode = "eager" if tf.executing_eagerly() else "graph" - 
should_run_eagerly = test_utils.should_run_eagerly() - l.append((mode, should_run_eagerly, test_utils.get_model_type())) - - e = ExampleTest() - e.testBody_functional_v2_eager() - e.testBody_functional_v2_function() - e.testBody_sequential_v2_eager() - e.testBody_sequential_v2_function() - e.testBody_subclass_v2_eager() - e.testBody_subclass_v2_function() - - if not tf.__internal__.tf2.enabled(): - e.testBody_functional_v1_session() - e.testBody_sequential_v1_session() - e.testBody_subclass_v1_session() - - expected_combinations = { - ("eager", True, "functional"), - ("eager", False, "functional"), - ("eager", True, "sequential"), - ("eager", False, "sequential"), - ("eager", True, "subclass"), - ("eager", False, "subclass"), - } - - if not tf.__internal__.tf2.enabled(): - expected_combinations = expected_combinations.union({ - ("graph", False, "functional"), - ("graph", False, "sequential"), - ("graph", False, "subclass"), - }) - - self.assertLen(l, len(expected_combinations)) - self.assertEqual(set(l), expected_combinations) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(l, len(expected_combinations) * 2) - - def test_run_all_keras_modes_with_all_model_types_annotate_class(self): - l = [] - - @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @parameterized.named_parameters(dict(testcase_name="_arg", - arg=True)) - def testBody(self, arg): - mode = "eager" if tf.executing_eagerly() else "graph" - should_run_eagerly = test_utils.should_run_eagerly() - l.append((mode, should_run_eagerly, test_utils.get_model_type())) - - e = ExampleTest() - e.testBody_arg_v2_eager_functional() - e.testBody_arg_v2_function_functional() - e.testBody_arg_v2_eager_sequential() - e.testBody_arg_v2_function_sequential() - e.testBody_arg_v2_eager_subclass() - e.testBody_arg_v2_function_subclass() - - if not tf.__internal__.tf2.enabled(): - e.testBody_arg_v1_session_functional() - e.testBody_arg_v1_session_sequential() - e.testBody_arg_v1_session_subclass() - - expected_combinations = { - ("eager", True, "functional"), - ("eager", False, "functional"), - ("eager", True, "sequential"), - ("eager", False, "sequential"), - ("eager", True, "subclass"), - ("eager", False, "subclass"), - } - - if not tf.__internal__.tf2.enabled(): - expected_combinations = expected_combinations.union({ - ("graph", False, "functional"), - ("graph", False, "sequential"), - ("graph", False, "subclass"), - }) - - self.assertLen(l, len(expected_combinations)) - self.assertEqual(set(l), expected_combinations) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(l, len(expected_combinations) * 2) - - def test_run_all_keras_modes_with_all_model_types_annotate_class_2(self): - l = [] + @parameterized.named_parameters(dict(testcase_name="argument", arg=True)) + def test_run_all_keras_modes_extra_params_2(self, arg): + self.assertEqual(arg, True) @test_combinations.run_with_all_model_types - class ExampleTest(test_combinations.TestCase): - - def runTest(self): - pass - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters(dict(testcase_name="_arg", - arg=True)) - def testBody(self, arg): - mode = "eager" if tf.executing_eagerly() else "graph" - should_run_eagerly = test_utils.should_run_eagerly() - l.append((mode, should_run_eagerly, test_utils.get_model_type())) - - e = ExampleTest() - 
e.testBody_arg_v2_eager_functional() - e.testBody_arg_v2_function_functional() - e.testBody_arg_v2_eager_sequential() - e.testBody_arg_v2_function_sequential() - e.testBody_arg_v2_eager_subclass() - e.testBody_arg_v2_function_subclass() - - if not tf.__internal__.tf2.enabled(): - e.testBody_arg_v1_session_functional() - e.testBody_arg_v1_session_sequential() - e.testBody_arg_v1_session_subclass() - - expected_combinations = { - ("eager", True, "functional"), - ("eager", False, "functional"), - ("eager", True, "sequential"), - ("eager", False, "sequential"), - ("eager", True, "subclass"), - ("eager", False, "subclass"), - } - - if not tf.__internal__.tf2.enabled(): - expected_combinations = expected_combinations.union({ - ("graph", False, "functional"), - ("graph", False, "sequential"), - ("graph", False, "subclass"), - }) - - self.assertLen(l, len(expected_combinations)) - self.assertEqual(set(l), expected_combinations) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - - self.assertLen(l, len(expected_combinations) * 2) - - @test_combinations.run_all_keras_modes - @parameterized.named_parameters(dict(testcase_name="argument", - arg=True)) - def test_run_all_keras_modes_extra_params_2(self, arg): - self.assertEqual(arg, True) - - @test_combinations.run_with_all_model_types - @parameterized.named_parameters(dict(testcase_name="argument", - arg=True)) - def test_run_with_all_model_types_extra_params_2(self, arg): - self.assertEqual(arg, True) + @parameterized.named_parameters(dict(testcase_name="argument", arg=True)) + def test_run_with_all_model_types_extra_params_2(self, arg): + self.assertEqual(arg, True) + if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/testing_infra/test_utils.py b/keras/testing_infra/test_utils.py index bb4441855460..0c138c1aea80 100644 --- a/keras/testing_infra/test_utils.py +++ b/keras/testing_infra/test_utils.py @@ -21,283 +21,327 @@ import itertools import threading import unittest + +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras import layers from keras import models from keras.engine import base_layer_utils -from keras.optimizers.optimizer_v2 import adadelta as adadelta_v2 -from keras.optimizers.optimizer_v2 import adagrad as adagrad_v2 -from keras.optimizers.optimizer_v2 import adam as adam_v2 -from keras.optimizers.optimizer_v2 import adamax as adamax_v2 -from keras.optimizers.optimizer_v2 import gradient_descent as gradient_descent_v2 -from keras.optimizers.optimizer_v2 import nadam as nadam_v2 -from keras.optimizers.optimizer_v2 import rmsprop as rmsprop_v2 +from keras.optimizers.legacy import adadelta as adadelta_v2 +from keras.optimizers.legacy import adagrad as adagrad_v2 +from keras.optimizers.legacy import adam as adam_v2 +from keras.optimizers.legacy import adamax as adamax_v2 +from keras.optimizers.legacy import gradient_descent as gradient_descent_v2 +from keras.optimizers.legacy import nadam as nadam_v2 +from keras.optimizers.legacy import rmsprop as rmsprop_v2 from keras.utils import tf_contextlib from keras.utils import tf_inspect -import numpy as np -import tensorflow.compat.v2 as tf -from tensorflow.python.framework import test_util as tf_test_utils + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) from tensorflow.python.util.tf_export import keras_export def string_test(actual, expected): - np.testing.assert_array_equal(actual, expected) + np.testing.assert_array_equal(actual, expected) def 
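
For reference, the decorators exercised in the test file above compose as follows in an ordinary test; a minimal sketch, assuming the usual keras.testing_infra import paths (the test class, method name, and MLP sizes here are illustrative only, not part of this patch):

    from keras.testing_infra import test_combinations, test_utils

    @test_combinations.run_with_all_model_types
    @test_combinations.run_all_keras_modes
    class MyLayerTest(test_combinations.TestCase):
        def test_small_mlp_compiles(self):
            # One generated copy of this test runs per
            # (model type, keras mode) combination.
            model = test_utils.get_small_mlp(1, 4, input_dim=3)
            model.compile(
                "rmsprop",
                "mse",
                run_eagerly=test_utils.should_run_eagerly(),
            )
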
numeric_test(actual, expected): - np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=1e-6) - - -def get_test_data(train_samples, - test_samples, - input_shape, - num_classes, - random_seed=None): - """Generates test data to train a model on. - - Args: - train_samples: Integer, how many training samples to generate. - test_samples: Integer, how many test samples to generate. - input_shape: Tuple of integers, shape of the inputs. - num_classes: Integer, number of classes for the data and targets. - random_seed: Integer, random seed used by numpy to generate data. - - Returns: - A tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. - """ - if random_seed is not None: - np.random.seed(random_seed) - num_sample = train_samples + test_samples - templates = 2 * num_classes * np.random.random((num_classes,) + input_shape) - y = np.random.randint(0, num_classes, size=(num_sample,)) - x = np.zeros((num_sample,) + input_shape, dtype=np.float32) - for i in range(num_sample): - x[i] = templates[y[i]] + np.random.normal(loc=0, scale=1., size=input_shape) - return ((x[:train_samples], y[:train_samples]), - (x[train_samples:], y[train_samples:])) - - -@keras_export('keras.__internal__.utils.layer_test', v1=[]) + np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=1e-6) + + +def get_test_data( + train_samples, test_samples, input_shape, num_classes, random_seed=None +): + """Generates test data to train a model on. + + Args: + train_samples: Integer, how many training samples to generate. + test_samples: Integer, how many test samples to generate. + input_shape: Tuple of integers, shape of the inputs. + num_classes: Integer, number of classes for the data and targets. + random_seed: Integer, random seed used by numpy to generate data. + + Returns: + A tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. + """ + if random_seed is not None: + np.random.seed(random_seed) + num_sample = train_samples + test_samples + templates = 2 * num_classes * np.random.random((num_classes,) + input_shape) + y = np.random.randint(0, num_classes, size=(num_sample,)) + x = np.zeros((num_sample,) + input_shape, dtype=np.float32) + for i in range(num_sample): + x[i] = templates[y[i]] + np.random.normal( + loc=0, scale=1.0, size=input_shape + ) + return ( + (x[:train_samples], y[:train_samples]), + (x[train_samples:], y[train_samples:]), + ) + + +@keras_export("keras.__internal__.utils.layer_test", v1=[]) @tf_test_utils.disable_cudnn_autotune -def layer_test(layer_cls, - kwargs=None, - input_shape=None, - input_dtype=None, - input_data=None, - expected_output=None, - expected_output_dtype=None, - expected_output_shape=None, - validate_training=True, - adapt_data=None, - custom_objects=None, - test_harness=None, - supports_masking=None): - """Test routine for a layer with a single input and single output. - - Args: - layer_cls: Layer class object. - kwargs: Optional dictionary of keyword arguments for instantiating the - layer. - input_shape: Input shape tuple. - input_dtype: Data type of the input data. - input_data: Numpy array of input data. - expected_output: Numpy array of the expected output. - expected_output_dtype: Data type expected for the output. - expected_output_shape: Shape tuple for the expected shape of the output. - validate_training: Whether to attempt to validate training on this layer. - This might be set to False for non-differentiable layers that output - string or integer values. - adapt_data: Optional data for an 'adapt' call. 
If None, adapt() will not - be tested for this layer. This is only relevant for PreprocessingLayers. - custom_objects: Optional dictionary mapping name strings to custom objects - in the layer class. This is helpful for testing custom layers. - test_harness: The Tensorflow test, if any, that this function is being - called in. - supports_masking: Optional boolean to check the `supports_masking` property - of the layer. If None, the check will not be performed. - - Returns: - The output data (Numpy array) returned by the layer, for additional - checks to be done by the calling code. - - Raises: - ValueError: if `input_shape is None`. - """ - if input_data is None: - if input_shape is None: - raise ValueError('input_shape is None') - if not input_dtype: - input_dtype = 'float32' - input_data_shape = list(input_shape) - for i, e in enumerate(input_data_shape): - if e is None: - input_data_shape[i] = np.random.randint(1, 4) - input_data = 10 * np.random.random(input_data_shape) - if input_dtype[:5] == 'float': - input_data -= 0.5 - input_data = input_data.astype(input_dtype) - elif input_shape is None: - input_shape = input_data.shape - if input_dtype is None: - input_dtype = input_data.dtype - if expected_output_dtype is None: - expected_output_dtype = input_dtype - - if tf.as_dtype(expected_output_dtype) == tf.string: - if test_harness: - assert_equal = test_harness.assertAllEqual - else: - assert_equal = string_test - else: - if test_harness: - assert_equal = test_harness.assertAllClose +def layer_test( + layer_cls, + kwargs=None, + input_shape=None, + input_dtype=None, + input_data=None, + expected_output=None, + expected_output_dtype=None, + expected_output_shape=None, + validate_training=True, + adapt_data=None, + custom_objects=None, + test_harness=None, + supports_masking=None, +): + """Test routine for a layer with a single input and single output. + + Args: + layer_cls: Layer class object. + kwargs: Optional dictionary of keyword arguments for instantiating the + layer. + input_shape: Input shape tuple. + input_dtype: Data type of the input data. + input_data: Numpy array of input data. + expected_output: Numpy array of the expected output. + expected_output_dtype: Data type expected for the output. + expected_output_shape: Shape tuple for the expected shape of the output. + validate_training: Whether to attempt to validate training on this layer. + This might be set to False for non-differentiable layers that output + string or integer values. + adapt_data: Optional data for an 'adapt' call. If None, adapt() will not + be tested for this layer. This is only relevant for PreprocessingLayers. + custom_objects: Optional dictionary mapping name strings to custom objects + in the layer class. This is helpful for testing custom layers. + test_harness: The Tensorflow test, if any, that this function is being + called in. + supports_masking: Optional boolean to check the `supports_masking` + property of the layer. If None, the check will not be performed. + + Returns: + The output data (Numpy array) returned by the layer, for additional + checks to be done by the calling code. + + Raises: + ValueError: if `input_shape is None`. 
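
A hedged sketch of a typical call into this routine, matching the docstring above (the layer choice and shapes are illustrative, not taken from this patch):

    from keras import layers
    from keras.testing_infra import test_utils

    # Exercises shape/dtype inference, serialization round-trips, and one
    # train_on_batch step for a Dense layer; input_shape includes the
    # batch dimension.
    test_utils.layer_test(
        layers.Dense,
        kwargs={"units": 3},
        input_shape=(2, 4),
        expected_output_shape=(None, 3),
    )
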
+ """ + if input_data is None: + if input_shape is None: + raise ValueError("input_shape is None") + if not input_dtype: + input_dtype = "float32" + input_data_shape = list(input_shape) + for i, e in enumerate(input_data_shape): + if e is None: + input_data_shape[i] = np.random.randint(1, 4) + input_data = 10 * np.random.random(input_data_shape) + if input_dtype[:5] == "float": + input_data -= 0.5 + input_data = input_data.astype(input_dtype) + elif input_shape is None: + input_shape = input_data.shape + if input_dtype is None: + input_dtype = input_data.dtype + if expected_output_dtype is None: + expected_output_dtype = input_dtype + + if tf.as_dtype(expected_output_dtype) == tf.string: + if test_harness: + assert_equal = test_harness.assertAllEqual + else: + assert_equal = string_test else: - assert_equal = numeric_test - - # instantiation - kwargs = kwargs or {} - layer = layer_cls(**kwargs) - - if (supports_masking is not None - and layer.supports_masking != supports_masking): - raise AssertionError( - 'When testing layer %s, the `supports_masking` property is %r' - 'but expected to be %r.\nFull kwargs: %s' % - (layer_cls.__name__, layer.supports_masking, supports_masking, kwargs)) - - # Test adapt, if data was passed. - if adapt_data is not None: - layer.adapt(adapt_data) - - # test get_weights , set_weights at layer level - weights = layer.get_weights() - layer.set_weights(weights) - - # test and instantiation from weights - if 'weights' in tf_inspect.getargspec(layer_cls.__init__): - kwargs['weights'] = weights + if test_harness: + assert_equal = test_harness.assertAllClose + else: + assert_equal = numeric_test + + # instantiation + kwargs = kwargs or {} layer = layer_cls(**kwargs) - # test in functional API - x = layers.Input(shape=input_shape[1:], dtype=input_dtype) - y = layer(x) - if backend.dtype(y) != expected_output_dtype: - raise AssertionError('When testing layer %s, for input %s, found output ' - 'dtype=%s but expected to find %s.\nFull kwargs: %s' % - (layer_cls.__name__, x, backend.dtype(y), - expected_output_dtype, kwargs)) - - def assert_shapes_equal(expected, actual): - """Asserts that the output shape from the layer matches the actual shape.""" - if len(expected) != len(actual): - raise AssertionError( - 'When testing layer %s, for input %s, found output_shape=' - '%s but expected to find %s.\nFull kwargs: %s' % - (layer_cls.__name__, x, actual, expected, kwargs)) - - for expected_dim, actual_dim in zip(expected, actual): - if isinstance(expected_dim, tf.compat.v1.Dimension): - expected_dim = expected_dim.value - if isinstance(actual_dim, tf.compat.v1.Dimension): - actual_dim = actual_dim.value - if expected_dim is not None and expected_dim != actual_dim: + if ( + supports_masking is not None + and layer.supports_masking != supports_masking + ): raise AssertionError( - 'When testing layer %s, for input %s, found output_shape=' - '%s but expected to find %s.\nFull kwargs: %s' % - (layer_cls.__name__, x, actual, expected, kwargs)) - - if expected_output_shape is not None: - assert_shapes_equal(tf.TensorShape(expected_output_shape), - y.shape) - - # check shape inference - model = models.Model(x, y) - computed_output_shape = tuple( - layer.compute_output_shape( - tf.TensorShape(input_shape)).as_list()) - computed_output_signature = layer.compute_output_signature( - tf.TensorSpec(shape=input_shape, dtype=input_dtype)) - actual_output = model.predict(input_data) - actual_output_shape = actual_output.shape - assert_shapes_equal(computed_output_shape, actual_output_shape) - 
assert_shapes_equal(computed_output_signature.shape, actual_output_shape) - if computed_output_signature.dtype != actual_output.dtype: - raise AssertionError( - 'When testing layer %s, for input %s, found output_dtype=' - '%s but expected to find %s.\nFull kwargs: %s' % - (layer_cls.__name__, x, actual_output.dtype, - computed_output_signature.dtype, kwargs)) - if expected_output is not None: - assert_equal(actual_output, expected_output) - - # test serialization, weight setting at model level - model_config = model.get_config() - recovered_model = models.Model.from_config(model_config, custom_objects) - if model.weights: - weights = model.get_weights() - recovered_model.set_weights(weights) - output = recovered_model.predict(input_data) - assert_equal(output, actual_output) - - # test training mode (e.g. useful for dropout tests) - # Rebuild the model to avoid the graph being reused between predict() and - # See b/120160788 for more details. This should be mitigated after 2.0. - layer_weights = layer.get_weights() # Get the layer weights BEFORE training. - if validate_training: - model = models.Model(x, layer(x)) - if _thread_local_data.run_eagerly is not None: - model.compile( - 'rmsprop', - 'mse', - weighted_metrics=['acc'], - run_eagerly=should_run_eagerly()) - else: - model.compile('rmsprop', 'mse', weighted_metrics=['acc']) - model.train_on_batch(input_data, actual_output) - - # test as first layer in Sequential API - layer_config = layer.get_config() - layer_config['batch_input_shape'] = input_shape - layer = layer.__class__.from_config(layer_config) - - # Test adapt, if data was passed. - if adapt_data is not None: - layer.adapt(adapt_data) - - model = models.Sequential() - model.add(layers.Input(shape=input_shape[1:], dtype=input_dtype)) - model.add(layer) - - layer.set_weights(layer_weights) - actual_output = model.predict(input_data) - actual_output_shape = actual_output.shape - for expected_dim, actual_dim in zip(computed_output_shape, - actual_output_shape): - if expected_dim is not None: - if expected_dim != actual_dim: + "When testing layer %s, the `supports_masking` property is %r" + "but expected to be %r.\nFull kwargs: %s" + % ( + layer_cls.__name__, + layer.supports_masking, + supports_masking, + kwargs, + ) + ) + + # Test adapt, if data was passed. 
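
The adapt branch below is only relevant for preprocessing layers; a hedged illustration of what passing adapt_data exercises (the layer and values are illustrative):

    import numpy as np
    from keras import layers

    # adapt() computes layer state from data before the layer is tested,
    # e.g. the mean and variance that Normalization later applies.
    norm = layers.Normalization()
    norm.adapt(np.array([[0.0], [2.0], [4.0]]))
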
+ if adapt_data is not None: + layer.adapt(adapt_data) + + # test get_weights, set_weights at layer level + weights = layer.get_weights() + layer.set_weights(weights) + + # test instantiation from weights + if "weights" in tf_inspect.getargspec(layer_cls.__init__): + kwargs["weights"] = weights + layer = layer_cls(**kwargs) + + # test in functional API + x = layers.Input(shape=input_shape[1:], dtype=input_dtype) + y = layer(x) + if backend.dtype(y) != expected_output_dtype: + raise AssertionError( + "When testing layer %s, for input %s, found output " + "dtype=%s but expected to find %s.\nFull kwargs: %s" + % ( + layer_cls.__name__, + x, + backend.dtype(y), + expected_output_dtype, + kwargs, + ) + ) + + def assert_shapes_equal(expected, actual): + """Asserts that the output shape from the layer matches the actual + shape.""" + if len(expected) != len(actual): + raise AssertionError( + "When testing layer %s, for input %s, found output_shape=" + "%s but expected to find %s.\nFull kwargs: %s" + % (layer_cls.__name__, x, actual, expected, kwargs) + ) + + for expected_dim, actual_dim in zip(expected, actual): + if isinstance(expected_dim, tf.compat.v1.Dimension): + expected_dim = expected_dim.value + if isinstance(actual_dim, tf.compat.v1.Dimension): + actual_dim = actual_dim.value + if expected_dim is not None and expected_dim != actual_dim: + raise AssertionError( + "When testing layer %s, for input %s, found output_shape=" + "%s but expected to find %s.\nFull kwargs: %s" + % (layer_cls.__name__, x, actual, expected, kwargs) + ) + + if expected_output_shape is not None: + assert_shapes_equal(tf.TensorShape(expected_output_shape), y.shape) + + # check shape inference + model = models.Model(x, y) + computed_output_shape = tuple( + layer.compute_output_shape(tf.TensorShape(input_shape)).as_list() + ) + computed_output_signature = layer.compute_output_signature( + tf.TensorSpec(shape=input_shape, dtype=input_dtype) + ) + actual_output = model.predict(input_data) + actual_output_shape = actual_output.shape + assert_shapes_equal(computed_output_shape, actual_output_shape) + assert_shapes_equal(computed_output_signature.shape, actual_output_shape) + if computed_output_signature.dtype != actual_output.dtype: + raise AssertionError( + "When testing layer %s, for input %s, found output_dtype=" + "%s but expected to find %s.\nFull kwargs: %s" + % ( + layer_cls.__name__, + x, + actual_output.dtype, + computed_output_signature.dtype, + kwargs, + ) + ) + if expected_output is not None: + assert_equal(actual_output, expected_output) + + # test serialization, weight setting at model level + model_config = model.get_config() + recovered_model = models.Model.from_config(model_config, custom_objects) + if model.weights: + weights = model.get_weights() + 
recovered_model.set_weights(weights) + output = recovered_model.predict(input_data) + assert_equal(output, actual_output) + + # test training mode (e.g. useful for dropout tests) + # Rebuild the model to avoid the graph being reused between predict() and + # train_on_batch(). See b/120160788 for more details. This should be mitigated after 2.0. + layer_weights = ( + layer.get_weights() + ) # Get the layer weights BEFORE training. + if validate_training: + model = models.Model(x, layer(x)) + if _thread_local_data.run_eagerly is not None: + model.compile( + "rmsprop", + "mse", + weighted_metrics=["acc"], + run_eagerly=should_run_eagerly(), + ) + else: + model.compile("rmsprop", "mse", weighted_metrics=["acc"]) + model.train_on_batch(input_data, actual_output) + + # test as first layer in Sequential API + layer_config = layer.get_config() + layer_config["batch_input_shape"] = input_shape + layer = layer.__class__.from_config(layer_config) + + # Test adapt, if data was passed. + if adapt_data is not None: + layer.adapt(adapt_data) + + model = models.Sequential() + model.add(layers.Input(shape=input_shape[1:], dtype=input_dtype)) + model.add(layer) + + layer.set_weights(layer_weights) + actual_output = model.predict(input_data) + actual_output_shape = actual_output.shape + for expected_dim, actual_dim in zip( + computed_output_shape, actual_output_shape + ): + if expected_dim is not None: + if expected_dim != actual_dim: + raise AssertionError( + "When testing layer %s **after deserialization**, " + "for input %s, found output_shape=" + "%s but expected to find inferred shape %s.\n" + "Full kwargs: %s" + % ( + layer_cls.__name__, + x, + actual_output_shape, + computed_output_shape, + kwargs, + ) + ) + if expected_output is not None: + assert_equal(actual_output, expected_output) + + # test serialization, weight setting at model level + model_config = model.get_config() + recovered_model = models.Sequential.from_config( + model_config, custom_objects + ) + if model.weights: + weights = model.get_weights() + recovered_model.set_weights(weights) + output = recovered_model.predict(input_data) + assert_equal(output, actual_output) + + # for further checks in the caller function + return actual_output _thread_local_data = threading.local() @@ -309,781 +353,836 @@ def assert_shapes_equal(expected, actual): @tf_contextlib.contextmanager def model_type_scope(value): - """Provides a scope within which the model type to test is equal to `value`. + """Provides a scope within which the model type to test is equal to `value`. - The model type gets restored to its original value upon exiting the scope. + The model type gets restored to its original value upon exiting the scope. - Args: - value: model type value + Args: + value: model type value - Yields: - The provided value. - """ - previous_value = _thread_local_data.model_type - try: - _thread_local_data.model_type = value - yield value - finally: - # Restore model type to initial value. - _thread_local_data.model_type = previous_value + Yields: + The provided value. + """ + previous_value = _thread_local_data.model_type + try: + _thread_local_data.model_type = value + yield value + finally: + # Restore model type to initial value. + _thread_local_data.model_type = previous_value @tf_contextlib.contextmanager def run_eagerly_scope(value): - """Provides a scope within which we compile models to run eagerly or not. + """Provides a scope within which we compile models to run eagerly or not. - The boolean gets restored to its original value upon exiting the scope. 
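
These thread-local scopes are the primitives the test decorators build on; a minimal sketch of driving them directly (the values are illustrative):

    from keras.testing_infra import test_utils

    # Inside the scope, get_model_type() reports the pushed value; the
    # previous value is restored on exit, even if the body raises.
    with test_utils.model_type_scope("functional"):
        assert test_utils.get_model_type() == "functional"

    # Likewise for the run-eagerly flag consumed by should_run_eagerly().
    with test_utils.run_eagerly_scope(True):
        flag = test_utils.should_run_eagerly()
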
+ The boolean gets restored to its original value upon exiting the scope. - Args: - value: Bool specifying if we should run models eagerly in the active test. - Should be True or False. + Args: + value: Bool specifying if we should run models eagerly in the active + test. Should be True or False. - Yields: - The provided value. - """ - previous_value = _thread_local_data.run_eagerly - try: - _thread_local_data.run_eagerly = value - yield value - finally: - # Restore model type to initial value. - _thread_local_data.run_eagerly = previous_value + Yields: + The provided value. + """ + previous_value = _thread_local_data.run_eagerly + try: + _thread_local_data.run_eagerly = value + yield value + finally: + # Restore run_eagerly to initial value. + _thread_local_data.run_eagerly = previous_value def should_run_eagerly(): - """Returns whether the models we are testing should be run eagerly.""" - if _thread_local_data.run_eagerly is None: - raise ValueError('Cannot call `should_run_eagerly()` outside of a ' - '`run_eagerly_scope()` or `run_all_keras_modes` ' - 'decorator.') + """Returns whether the models we are testing should be run eagerly.""" + if _thread_local_data.run_eagerly is None: + raise ValueError( + "Cannot call `should_run_eagerly()` outside of a " + "`run_eagerly_scope()` or `run_all_keras_modes` " + "decorator." + ) - return _thread_local_data.run_eagerly and tf.executing_eagerly() + return _thread_local_data.run_eagerly and tf.executing_eagerly() @tf_contextlib.contextmanager def saved_model_format_scope(value, **kwargs): - """Provides a scope within which the savde model format to test is `value`. - - The saved model format gets restored to its original value upon exiting the - scope. - - Args: - value: saved model format value - **kwargs: optional kwargs to pass to the save function. - - Yields: - The provided value. - """ - previous_format = _thread_local_data.saved_model_format - previous_kwargs = _thread_local_data.save_kwargs - try: - _thread_local_data.saved_model_format = value - _thread_local_data.save_kwargs = kwargs - yield - finally: - # Restore saved model format to initial value. - _thread_local_data.saved_model_format = previous_format - _thread_local_data.save_kwargs = previous_kwargs + """Provides a scope within which the saved model format to test is `value`. + + The saved model format gets restored to its original value upon exiting the + scope. + + Args: + value: saved model format value + **kwargs: optional kwargs to pass to the save function. + + Yields: + The provided value. + """ + previous_format = _thread_local_data.saved_model_format + previous_kwargs = _thread_local_data.save_kwargs + try: + _thread_local_data.saved_model_format = value + _thread_local_data.save_kwargs = kwargs + yield + finally: + # Restore saved model format to initial value. + _thread_local_data.saved_model_format = previous_format + _thread_local_data.save_kwargs = previous_kwargs def get_save_format(): - if _thread_local_data.saved_model_format is None: - raise ValueError( - 'Cannot call `get_save_format()` outside of a ' - '`saved_model_format_scope()` or `run_with_all_saved_model_formats` ' - 'decorator.') - return _thread_local_data.saved_model_format + if _thread_local_data.saved_model_format is None: + raise ValueError( + "Cannot call `get_save_format()` outside of a " + "`saved_model_format_scope()` or " + "`run_with_all_saved_model_formats` decorator."
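
As with the other scopes, these getters are only valid inside the matching context manager; a hedged sketch (the "h5" format and the extra kwarg are illustrative assumptions, not taken from this patch):

    from keras.testing_infra import test_utils

    # Outside the scope both getters raise the ValueError built above.
    with test_utils.saved_model_format_scope("h5", save_format="h5"):
        fmt = test_utils.get_save_format()     # "h5"
        kwargs = test_utils.get_save_kwargs()  # {"save_format": "h5"}
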
+ ) + return _thread_local_data.saved_model_format def get_save_kwargs(): - if _thread_local_data.save_kwargs is None: - raise ValueError( - 'Cannot call `get_save_kwargs()` outside of a ' - '`saved_model_format_scope()` or `run_with_all_saved_model_formats` ' - 'decorator.') - return _thread_local_data.save_kwargs or {} + if _thread_local_data.save_kwargs is None: + raise ValueError( + "Cannot call `get_save_kwargs()` outside of a " + "`saved_model_format_scope()` or " + "`run_with_all_saved_model_formats` decorator." + ) + return _thread_local_data.save_kwargs or {} def get_model_type(): - """Gets the model type that should be tested.""" - if _thread_local_data.model_type is None: - raise ValueError('Cannot call `get_model_type()` outside of a ' - '`model_type_scope()` or `run_with_all_model_types` ' - 'decorator.') + """Gets the model type that should be tested.""" + if _thread_local_data.model_type is None: + raise ValueError( + "Cannot call `get_model_type()` outside of a " + "`model_type_scope()` or `run_with_all_model_types` " + "decorator." + ) - return _thread_local_data.model_type + return _thread_local_data.model_type def get_small_sequential_mlp(num_hidden, num_classes, input_dim=None): - model = models.Sequential() - if input_dim: - model.add(layers.Dense(num_hidden, activation='relu', input_dim=input_dim)) - else: - model.add(layers.Dense(num_hidden, activation='relu')) - activation = 'sigmoid' if num_classes == 1 else 'softmax' - model.add(layers.Dense(num_classes, activation=activation)) - return model + model = models.Sequential() + if input_dim: + model.add( + layers.Dense(num_hidden, activation="relu", input_dim=input_dim) + ) + else: + model.add(layers.Dense(num_hidden, activation="relu")) + activation = "sigmoid" if num_classes == 1 else "softmax" + model.add(layers.Dense(num_classes, activation=activation)) + return model def get_small_functional_mlp(num_hidden, num_classes, input_dim): - inputs = layers.Input(shape=(input_dim,)) - outputs = layers.Dense(num_hidden, activation='relu')(inputs) - activation = 'sigmoid' if num_classes == 1 else 'softmax' - outputs = layers.Dense(num_classes, activation=activation)(outputs) - return models.Model(inputs, outputs) + inputs = layers.Input(shape=(input_dim,)) + outputs = layers.Dense(num_hidden, activation="relu")(inputs) + activation = "sigmoid" if num_classes == 1 else "softmax" + outputs = layers.Dense(num_classes, activation=activation)(outputs) + return models.Model(inputs, outputs) class SmallSubclassMLP(models.Model): - """A subclass model based small MLP.""" - - def __init__(self, - num_hidden, - num_classes, - use_bn=False, - use_dp=False, - **kwargs): - super().__init__(name='test_model', **kwargs) - self.use_bn = use_bn - self.use_dp = use_dp - - self.layer_a = layers.Dense(num_hidden, activation='relu') - activation = 'sigmoid' if num_classes == 1 else 'softmax' - self.layer_b = layers.Dense(num_classes, activation=activation) - if self.use_dp: - self.dp = layers.Dropout(0.5) - if self.use_bn: - self.bn = layers.BatchNormalization(axis=-1) - - def call(self, inputs, **kwargs): - x = self.layer_a(inputs) - if self.use_dp: - x = self.dp(x) - if self.use_bn: - x = self.bn(x) - return self.layer_b(x) + """A subclass model based small MLP.""" + + def __init__( + self, num_hidden, num_classes, use_bn=False, use_dp=False, **kwargs + ): + super().__init__(name="test_model", **kwargs) + self.num_hidden = num_hidden + self.num_classes = num_classes + self.use_bn = use_bn + self.use_dp = use_dp + + self.layer_a = 
layers.Dense(num_hidden, activation="relu") + activation = "sigmoid" if num_classes == 1 else "softmax" + self.layer_b = layers.Dense(num_classes, activation=activation) + if self.use_dp: + self.dp = layers.Dropout(0.5) + if self.use_bn: + self.bn = layers.BatchNormalization(axis=-1) + + def call(self, inputs, **kwargs): + x = self.layer_a(inputs) + if self.use_dp: + x = self.dp(x) + if self.use_bn: + x = self.bn(x) + return self.layer_b(x) + + def get_config(self): + config = super().get_config() + config.update( + { + "num_hidden": self.num_hidden, + "num_classes": self.num_classes, + "use_bn": self.use_bn, + "use_dp": self.use_dp, + } + ) + return config class _SmallSubclassMLPCustomBuild(models.Model): - """A subclass model small MLP that uses a custom build method.""" + """A subclass model small MLP that uses a custom build method.""" - def __init__(self, num_hidden, num_classes): - super().__init__() - self.layer_a = None - self.layer_b = None - self.num_hidden = num_hidden - self.num_classes = num_classes + def __init__(self, num_hidden, num_classes): + super().__init__() + self.layer_a = None + self.layer_b = None + self.num_hidden = num_hidden + self.num_classes = num_classes - def build(self, input_shape): - self.layer_a = layers.Dense(self.num_hidden, activation='relu') - activation = 'sigmoid' if self.num_classes == 1 else 'softmax' - self.layer_b = layers.Dense(self.num_classes, activation=activation) + def build(self, input_shape): + self.layer_a = layers.Dense(self.num_hidden, activation="relu") + activation = "sigmoid" if self.num_classes == 1 else "softmax" + self.layer_b = layers.Dense(self.num_classes, activation=activation) - def call(self, inputs, **kwargs): - x = self.layer_a(inputs) - return self.layer_b(x) + def call(self, inputs, **kwargs): + x = self.layer_a(inputs) + return self.layer_b(x) def get_small_subclass_mlp(num_hidden, num_classes): - return SmallSubclassMLP(num_hidden, num_classes) + return SmallSubclassMLP(num_hidden, num_classes) def get_small_subclass_mlp_with_custom_build(num_hidden, num_classes): - return _SmallSubclassMLPCustomBuild(num_hidden, num_classes) + return _SmallSubclassMLPCustomBuild(num_hidden, num_classes) def get_small_mlp(num_hidden, num_classes, input_dim): - """Get a small mlp of the model type specified by `get_model_type`.""" - model_type = get_model_type() - if model_type == 'subclass': - return get_small_subclass_mlp(num_hidden, num_classes) - if model_type == 'subclass_custom_build': - return get_small_subclass_mlp_with_custom_build(num_hidden, num_classes) - if model_type == 'sequential': - return get_small_sequential_mlp(num_hidden, num_classes, input_dim) - if model_type == 'functional': - return get_small_functional_mlp(num_hidden, num_classes, input_dim) - raise ValueError('Unknown model type {}'.format(model_type)) + """Get a small mlp of the model type specified by `get_model_type`.""" + model_type = get_model_type() + if model_type == "subclass": + return get_small_subclass_mlp(num_hidden, num_classes) + if model_type == "subclass_custom_build": + return get_small_subclass_mlp_with_custom_build(num_hidden, num_classes) + if model_type == "sequential": + return get_small_sequential_mlp(num_hidden, num_classes, input_dim) + if model_type == "functional": + return get_small_functional_mlp(num_hidden, num_classes, input_dim) + raise ValueError(f"Unknown model type {model_type}") class _SubclassModel(models.Model): - """A Keras subclass model.""" + """A Keras subclass model.""" - def __init__(self, model_layers, *args, 
**kwargs): - """Instantiate a model. + def __init__(self, model_layers, *args, **kwargs): + """Instantiate a model. - Args: - model_layers: a list of layers to be added to the model. - *args: Model's args - **kwargs: Model's keyword args, at most one of input_tensor -> the input - tensor required for ragged/sparse input. - """ + Args: + model_layers: a list of layers to be added to the model. + *args: Model's args + **kwargs: Model's keyword args, at most one of input_tensor -> the + input tensor required for ragged/sparse input. + """ - inputs = kwargs.pop('input_tensor', None) - super().__init__(*args, **kwargs) - # Note that clone and build doesn't support lists of layers in subclassed - # models. Adding each layer directly here. - for i, layer in enumerate(model_layers): - setattr(self, self._layer_name_for_i(i), layer) + inputs = kwargs.pop("input_tensor", None) + super().__init__(*args, **kwargs) + # Note that clone and build doesn't support lists of layers in + # subclassed models. Adding each layer directly here. + for i, layer in enumerate(model_layers): + setattr(self, self._layer_name_for_i(i), layer) - self.num_layers = len(model_layers) + self.num_layers = len(model_layers) - if inputs is not None: - self._set_inputs(inputs) + if inputs is not None: + self._set_inputs(inputs) - def _layer_name_for_i(self, i): - return 'layer{}'.format(i) + def _layer_name_for_i(self, i): + return f"layer{i}" - def call(self, inputs, **kwargs): - x = inputs - for i in range(self.num_layers): - layer = getattr(self, self._layer_name_for_i(i)) - x = layer(x) - return x + def call(self, inputs, **kwargs): + x = inputs + for i in range(self.num_layers): + layer = getattr(self, self._layer_name_for_i(i)) + x = layer(x) + return x - def get_config(self): - # This test model relies on the default Keras serialization of a model, - # rather than providing the details of `model_layers`. - raise NotImplementedError + def get_config(self): + # This test model relies on the default Keras serialization of a model, + # rather than providing the details of `model_layers`. + raise NotImplementedError class _SubclassModelCustomBuild(models.Model): - """A Keras subclass model that uses a custom build method.""" - - def __init__(self, layer_generating_func, *args, **kwargs): - super().__init__(*args, **kwargs) - self.all_layers = None - self._layer_generating_func = layer_generating_func - - def build(self, input_shape): - model_layers = [] - for layer in self._layer_generating_func(): - model_layers.append(layer) - self.all_layers = model_layers - - def call(self, inputs, **kwargs): - x = inputs - for layer in self.all_layers: - x = layer(x) - return x - - -def get_model_from_layers(model_layers, - input_shape=None, - input_dtype=None, - name=None, - input_ragged=None, - input_sparse=None, - model_type=None): - """Builds a model from a sequence of layers. - - Args: - model_layers: The layers used to build the network. - input_shape: Shape tuple of the input or 'TensorShape' instance. - input_dtype: Datatype of the input. - name: Name for the model. - input_ragged: Boolean, whether the input data is a ragged tensor. - input_sparse: Boolean, whether the input data is a sparse tensor. - model_type: One of "subclass", "subclass_custom_build", "sequential", or - "functional". When None, defaults to `get_model_type`. - - Returns: - A Keras model. 
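
A hedged sketch of building one layer stack as a specific model type through this helper (the layer sizes are illustrative):

    from keras import layers
    from keras.testing_infra import test_utils

    # model_type is given explicitly here instead of being read from the
    # ambient model_type_scope(); any of the four supported types works.
    model = test_utils.get_model_from_layers(
        [layers.Dense(4, activation="relu"), layers.Dense(1)],
        input_shape=(3,),
        model_type="sequential",
    )
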
- """ - if model_type is None: - model_type = get_model_type() - if model_type == 'subclass': - inputs = None - if input_ragged or input_sparse: - inputs = layers.Input( - shape=input_shape, - dtype=input_dtype, - ragged=input_ragged, - sparse=input_sparse) - return _SubclassModel(model_layers, name=name, input_tensor=inputs) - - if model_type == 'subclass_custom_build': - layer_generating_func = lambda: model_layers - return _SubclassModelCustomBuild(layer_generating_func, name=name) - - if model_type == 'sequential': - model = models.Sequential(name=name) - if input_shape: - model.add( - layers.InputLayer( - input_shape=input_shape, - dtype=input_dtype, - ragged=input_ragged, - sparse=input_sparse)) - for layer in model_layers: - model.add(layer) - return model - - if model_type == 'functional': - if not input_shape: - raise ValueError('Cannot create a functional model from layers with no ' - 'input shape.') - inputs = layers.Input( - shape=input_shape, - dtype=input_dtype, - ragged=input_ragged, - sparse=input_sparse) - outputs = inputs - for layer in model_layers: - outputs = layer(outputs) - return models.Model(inputs, outputs, name=name) + """A Keras subclass model that uses a custom build method.""" + + def __init__(self, layer_generating_func, *args, **kwargs): + super().__init__(*args, **kwargs) + self.all_layers = None + self._layer_generating_func = layer_generating_func + + def build(self, input_shape): + model_layers = [] + for layer in self._layer_generating_func(): + model_layers.append(layer) + self.all_layers = model_layers + + def call(self, inputs, **kwargs): + x = inputs + for layer in self.all_layers: + x = layer(x) + return x + + +def get_model_from_layers( + model_layers, + input_shape=None, + input_dtype=None, + name=None, + input_ragged=None, + input_sparse=None, + model_type=None, +): + """Builds a model from a sequence of layers. - raise ValueError('Unknown model type {}'.format(model_type)) + Args: + model_layers: The layers used to build the network. + input_shape: Shape tuple of the input or 'TensorShape' instance. + input_dtype: Datatype of the input. + name: Name for the model. + input_ragged: Boolean, whether the input data is a ragged tensor. + input_sparse: Boolean, whether the input data is a sparse tensor. + model_type: One of "subclass", "subclass_custom_build", "sequential", or + "functional". When None, defaults to `get_model_type`. + + Returns: + A Keras model. + """ + if model_type is None: + model_type = get_model_type() + if model_type == "subclass": + inputs = None + if input_ragged or input_sparse: + inputs = layers.Input( + shape=input_shape, + dtype=input_dtype, + ragged=input_ragged, + sparse=input_sparse, + ) + return _SubclassModel(model_layers, name=name, input_tensor=inputs) + + if model_type == "subclass_custom_build": + layer_generating_func = lambda: model_layers + return _SubclassModelCustomBuild(layer_generating_func, name=name) + + if model_type == "sequential": + model = models.Sequential(name=name) + if input_shape: + model.add( + layers.InputLayer( + input_shape=input_shape, + dtype=input_dtype, + ragged=input_ragged, + sparse=input_sparse, + ) + ) + for layer in model_layers: + model.add(layer) + return model + + if model_type == "functional": + if not input_shape: + raise ValueError( + "Cannot create a functional model from layers with no " + "input shape." 
+ ) + inputs = layers.Input( + shape=input_shape, + dtype=input_dtype, + ragged=input_ragged, + sparse=input_sparse, + ) + outputs = inputs + for layer in model_layers: + outputs = layer(outputs) + return models.Model(inputs, outputs, name=name) + + raise ValueError(f"Unknown model type {model_type}") class Bias(layers.Layer): + def build(self, input_shape): + self.bias = self.add_weight("bias", (1,), initializer="zeros") - def build(self, input_shape): - self.bias = self.add_weight('bias', (1,), initializer='zeros') - - def call(self, inputs): - return inputs + self.bias + def call(self, inputs): + return inputs + self.bias class _MultiIOSubclassModel(models.Model): - """Multi IO Keras subclass model.""" - - def __init__(self, branch_a, branch_b, shared_input_branch=None, - shared_output_branch=None, name=None): - super().__init__(name=name) - self._shared_input_branch = shared_input_branch - self._branch_a = branch_a - self._branch_b = branch_b - self._shared_output_branch = shared_output_branch - - def call(self, inputs, **kwargs): - if self._shared_input_branch: - for layer in self._shared_input_branch: - inputs = layer(inputs) - a = inputs - b = inputs - elif isinstance(inputs, dict): - a = inputs['input_1'] - b = inputs['input_2'] - else: - a, b = inputs - - for layer in self._branch_a: - a = layer(a) - for layer in self._branch_b: - b = layer(b) - outs = [a, b] - - if self._shared_output_branch: - for layer in self._shared_output_branch: - outs = layer(outs) - - return outs + """Multi IO Keras subclass model.""" + + def __init__( + self, + branch_a, + branch_b, + shared_input_branch=None, + shared_output_branch=None, + name=None, + ): + super().__init__(name=name) + self._shared_input_branch = shared_input_branch + self._branch_a = branch_a + self._branch_b = branch_b + self._shared_output_branch = shared_output_branch + + def call(self, inputs, **kwargs): + if self._shared_input_branch: + for layer in self._shared_input_branch: + inputs = layer(inputs) + a = inputs + b = inputs + elif isinstance(inputs, dict): + a = inputs["input_1"] + b = inputs["input_2"] + else: + a, b = inputs + + for layer in self._branch_a: + a = layer(a) + for layer in self._branch_b: + b = layer(b) + outs = [a, b] + + if self._shared_output_branch: + for layer in self._shared_output_branch: + outs = layer(outs) + + return outs class _MultiIOSubclassModelCustomBuild(models.Model): - """Multi IO Keras subclass model that uses a custom build method.""" - - def __init__(self, branch_a_func, branch_b_func, - shared_input_branch_func=None, - shared_output_branch_func=None): - super().__init__() - self._shared_input_branch_func = shared_input_branch_func - self._branch_a_func = branch_a_func - self._branch_b_func = branch_b_func - self._shared_output_branch_func = shared_output_branch_func - - self._shared_input_branch = None - self._branch_a = None - self._branch_b = None - self._shared_output_branch = None - - def build(self, input_shape): - if self._shared_input_branch_func(): - self._shared_input_branch = self._shared_input_branch_func() - self._branch_a = self._branch_a_func() - self._branch_b = self._branch_b_func() - - if self._shared_output_branch_func(): - self._shared_output_branch = self._shared_output_branch_func() - - def call(self, inputs, **kwargs): - if self._shared_input_branch: - for layer in self._shared_input_branch: - inputs = layer(inputs) - a = inputs - b = inputs - else: - a, b = inputs - - for layer in self._branch_a: - a = layer(a) - for layer in self._branch_b: - b = layer(b) - outs = a, 
b - - if self._shared_output_branch: - for layer in self._shared_output_branch: - outs = layer(outs) - - return outs + """Multi IO Keras subclass model that uses a custom build method.""" + + def __init__( + self, + branch_a_func, + branch_b_func, + shared_input_branch_func=None, + shared_output_branch_func=None, + ): + super().__init__() + self._shared_input_branch_func = shared_input_branch_func + self._branch_a_func = branch_a_func + self._branch_b_func = branch_b_func + self._shared_output_branch_func = shared_output_branch_func + + self._shared_input_branch = None + self._branch_a = None + self._branch_b = None + self._shared_output_branch = None + + def build(self, input_shape): + if self._shared_input_branch_func(): + self._shared_input_branch = self._shared_input_branch_func() + self._branch_a = self._branch_a_func() + self._branch_b = self._branch_b_func() + + if self._shared_output_branch_func(): + self._shared_output_branch = self._shared_output_branch_func() + + def call(self, inputs, **kwargs): + if self._shared_input_branch: + for layer in self._shared_input_branch: + inputs = layer(inputs) + a = inputs + b = inputs + else: + a, b = inputs + + for layer in self._branch_a: + a = layer(a) + for layer in self._branch_b: + b = layer(b) + outs = a, b + + if self._shared_output_branch: + for layer in self._shared_output_branch: + outs = layer(outs) + + return outs def get_multi_io_model( - branch_a, - branch_b, - shared_input_branch=None, - shared_output_branch=None): - """Builds a multi-io model that contains two branches. - - The produced model will be of the type specified by `get_model_type`. - - To build a two-input, two-output model: - Specify a list of layers for branch a and branch b, but do not specify any - shared input branch or shared output branch. The resulting model will apply - each branch to a different input, to produce two outputs. - - The first value in branch_a must be the Keras 'Input' layer for branch a, - and the first value in branch_b must be the Keras 'Input' layer for - branch b. - - example usage: - ``` - branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()] - branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()] - - model = get_multi_io_model(branch_a, branch_b) - ``` - - To build a two-input, one-output model: - Specify a list of layers for branch a and branch b, and specify a - shared output branch. The resulting model will apply - each branch to a different input. It will then apply the shared output - branch to a tuple containing the intermediate outputs of each branch, - to produce a single output. The first layer in the shared_output_branch - must be able to merge a tuple of two tensors. - - The first value in branch_a must be the Keras 'Input' layer for branch a, - and the first value in branch_b must be the Keras 'Input' layer for - branch b. - - example usage: - ``` - input_branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()] - input_branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()] - shared_output_branch = [Concatenate(), Dense(), Dense()] - - model = get_multi_io_model(input_branch_a, input_branch_b, - shared_output_branch=shared_output_branch) - ``` - To build a one-input, two-output model: - Specify a list of layers for branch a and branch b, and specify a - shared input branch. The resulting model will take one input, and apply - the shared input branch to it. It will then respectively apply each branch - to that intermediate result in parallel, to produce two outputs. 
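# A minimal usage sketch for `get_model_from_layers` defined earlier: the same
# two-layer stack is built as a subclass, sequential, or functional model
# depending on `model_type`. Hypothetical snippet; it assumes the Keras
# testing_infra package is importable in the test environment.
import numpy as np

from keras import layers
from keras.testing_infra import test_utils

model = test_utils.get_model_from_layers(
    [layers.Dense(4, activation="relu"), layers.Dense(1)],
    input_shape=(8,),
    # One of "subclass", "subclass_custom_build", "sequential", "functional".
    model_type="functional",
)
model.compile("sgd", "mse")
model.fit(np.ones((2, 8)), np.zeros((2, 1)), epochs=1, verbose=0)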
- - The first value in the shared_input_branch must be the Keras 'Input' layer - for the whole model. Branch a and branch b should not contain any Input - layers. - - example usage: - ``` - shared_input_branch = [Input(shape=(2,), name='in'), Dense(), Dense()] - output_branch_a = [Dense(), Dense()] - output_branch_b = [Dense(), Dense()] - - - model = get_multi_io_model(output__branch_a, output_branch_b, - shared_input_branch=shared_input_branch) - ``` - - Args: - branch_a: A sequence of layers for branch a of the model. - branch_b: A sequence of layers for branch b of the model. - shared_input_branch: An optional sequence of layers to apply to a single - input, before applying both branches to that intermediate result. If set, - the model will take only one input instead of two. Defaults to None. - shared_output_branch: An optional sequence of layers to merge the - intermediate results produced by branch a and branch b. If set, - the model will produce only one output instead of two. Defaults to None. - - Returns: - A multi-io model of the type specified by `get_model_type`, specified - by the different branches. - """ - # Extract the functional inputs from the layer lists - if shared_input_branch: - inputs = shared_input_branch[0] - shared_input_branch = shared_input_branch[1:] - else: - inputs = branch_a[0], branch_b[0] - branch_a = branch_a[1:] - branch_b = branch_b[1:] - - model_type = get_model_type() - if model_type == 'subclass': - return _MultiIOSubclassModel(branch_a, branch_b, shared_input_branch, - shared_output_branch) - - if model_type == 'subclass_custom_build': - return _MultiIOSubclassModelCustomBuild((lambda: branch_a), - (lambda: branch_b), - (lambda: shared_input_branch), - (lambda: shared_output_branch)) - - if model_type == 'sequential': - raise ValueError('Cannot use `get_multi_io_model` to construct ' - 'sequential models') - - if model_type == 'functional': + branch_a, branch_b, shared_input_branch=None, shared_output_branch=None +): + """Builds a multi-io model that contains two branches. + + The produced model will be of the type specified by `get_model_type`. + + To build a two-input, two-output model: + Specify a list of layers for branch a and branch b, but do not specify any + shared input branch or shared output branch. The resulting model will + apply each branch to a different input, to produce two outputs. + + The first value in branch_a must be the Keras 'Input' layer for branch a, + and the first value in branch_b must be the Keras 'Input' layer for + branch b. + + example usage: + ``` + branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()] + branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()] + + model = get_multi_io_model(branch_a, branch_b) + ``` + + To build a two-input, one-output model: + Specify a list of layers for branch a and branch b, and specify a + shared output branch. The resulting model will apply + each branch to a different input. It will then apply the shared output + branch to a tuple containing the intermediate outputs of each branch, + to produce a single output. The first layer in the shared_output_branch + must be able to merge a tuple of two tensors. + + The first value in branch_a must be the Keras 'Input' layer for branch a, + and the first value in branch_b must be the Keras 'Input' layer for + branch b. 
+
+    example usage:
+    ```
+    input_branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()]
+    input_branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()]
+    shared_output_branch = [Concatenate(), Dense(), Dense()]
+
+    model = get_multi_io_model(input_branch_a, input_branch_b,
+                               shared_output_branch=shared_output_branch)
+    ```
+    To build a one-input, two-output model:
+    Specify a list of layers for branch a and branch b, and specify a
+    shared input branch. The resulting model will take one input, and apply
+    the shared input branch to it. It will then respectively apply each branch
+    to that intermediate result in parallel, to produce two outputs.
+
+    The first value in the shared_input_branch must be the Keras 'Input' layer
+    for the whole model. Branch a and branch b should not contain any Input
+    layers.
+
+    example usage:
+    ```
+    shared_input_branch = [Input(shape=(2,), name='in'), Dense(), Dense()]
+    output_branch_a = [Dense(), Dense()]
+    output_branch_b = [Dense(), Dense()]
+
+
+    model = get_multi_io_model(output_branch_a, output_branch_b,
+                               shared_input_branch=shared_input_branch)
+    ```
+
+    Args:
+        branch_a: A sequence of layers for branch a of the model.
+        branch_b: A sequence of layers for branch b of the model.
+        shared_input_branch: An optional sequence of layers to apply to a single
+            input, before applying both branches to that intermediate result. If
+            set, the model will take only one input instead of two. Defaults to
+            `None`.
+        shared_output_branch: An optional sequence of layers to merge the
+            intermediate results produced by branch a and branch b. If set,
+            the model will produce only one output instead of two.
+            Defaults to `None`.
+
+    Returns:
+        A multi-io model of the type specified by `get_model_type`, specified
+        by the different branches.
+ """ + # Extract the functional inputs from the layer lists if shared_input_branch: - a_and_b = inputs - for layer in shared_input_branch: - a_and_b = layer(a_and_b) - a = a_and_b - b = a_and_b + inputs = shared_input_branch[0] + shared_input_branch = shared_input_branch[1:] else: - a, b = inputs + inputs = branch_a[0], branch_b[0] + branch_a = branch_a[1:] + branch_b = branch_b[1:] - for layer in branch_a: - a = layer(a) - for layer in branch_b: - b = layer(b) - outputs = a, b - - if shared_output_branch: - for layer in shared_output_branch: - outputs = layer(outputs) - - return models.Model(inputs, outputs) - - raise ValueError('Unknown model type {}'.format(model_type)) + model_type = get_model_type() + if model_type == "subclass": + return _MultiIOSubclassModel( + branch_a, branch_b, shared_input_branch, shared_output_branch + ) + + if model_type == "subclass_custom_build": + return _MultiIOSubclassModelCustomBuild( + (lambda: branch_a), + (lambda: branch_b), + (lambda: shared_input_branch), + (lambda: shared_output_branch), + ) + + if model_type == "sequential": + raise ValueError( + "Cannot use `get_multi_io_model` to construct sequential models" + ) + + if model_type == "functional": + if shared_input_branch: + a_and_b = inputs + for layer in shared_input_branch: + a_and_b = layer(a_and_b) + a = a_and_b + b = a_and_b + else: + a, b = inputs + + for layer in branch_a: + a = layer(a) + for layer in branch_b: + b = layer(b) + outputs = a, b + + if shared_output_branch: + for layer in shared_output_branch: + outputs = layer(outputs) + + return models.Model(inputs, outputs) + + raise ValueError(f"Unknown model type {model_type}") _V2_OPTIMIZER_MAP = { - 'adadelta': adadelta_v2.Adadelta, - 'adagrad': adagrad_v2.Adagrad, - 'adam': adam_v2.Adam, - 'adamax': adamax_v2.Adamax, - 'nadam': nadam_v2.Nadam, - 'rmsprop': rmsprop_v2.RMSprop, - 'sgd': gradient_descent_v2.SGD + "adadelta": adadelta_v2.Adadelta, + "adagrad": adagrad_v2.Adagrad, + "adam": adam_v2.Adam, + "adamax": adamax_v2.Adamax, + "nadam": nadam_v2.Nadam, + "rmsprop": rmsprop_v2.RMSprop, + "sgd": gradient_descent_v2.SGD, } def get_v2_optimizer(name, **kwargs): - """Get the v2 optimizer requested. + """Get the v2 optimizer requested. - This is only necessary until v2 are the default, as we are testing in Eager, - and Eager + v1 optimizers fail tests. When we are in v2, the strings alone - should be sufficient, and this mapping can theoretically be removed. + This is only necessary until v2 are the default, as we are testing in Eager, + and Eager + v1 optimizers fail tests. When we are in v2, the strings alone + should be sufficient, and this mapping can theoretically be removed. - Args: - name: string name of Keras v2 optimizer. - **kwargs: any kwargs to pass to the optimizer constructor. + Args: + name: string name of Keras v2 optimizer. + **kwargs: any kwargs to pass to the optimizer constructor. - Returns: - Initialized Keras v2 optimizer. + Returns: + Initialized Keras v2 optimizer. - Raises: - ValueError: if an unknown name was passed. - """ - try: - return _V2_OPTIMIZER_MAP[name](**kwargs) - except KeyError: - raise ValueError( - 'Could not find requested v2 optimizer: {}\nValid choices: {}'.format( - name, list(_V2_OPTIMIZER_MAP.keys()))) + Raises: + ValueError: if an unknown name was passed. 
+ """ + try: + return _V2_OPTIMIZER_MAP[name](**kwargs) + except KeyError: + raise ValueError( + "Could not find requested v2 optimizer: " + "{}\nValid choices: {}".format(name, list(_V2_OPTIMIZER_MAP.keys())) + ) -def get_expected_metric_variable_names(var_names, name_suffix=''): - """Returns expected metric variable names given names and prefix/suffix.""" - if tf.__internal__.tf2.enabled() or tf.executing_eagerly(): - # In V1 eager mode and V2 variable names are not made unique. - return [n + ':0' for n in var_names] - # In V1 graph mode variable names are made unique using a suffix. - return [n + name_suffix + ':0' for n in var_names] +def get_expected_metric_variable_names(var_names, name_suffix=""): + """Returns expected metric variable names given names and prefix/suffix.""" + if tf.__internal__.tf2.enabled() or tf.executing_eagerly(): + # In V1 eager mode and V2 variable names are not made unique. + return [n + ":0" for n in var_names] + # In V1 graph mode variable names are made unique using a suffix. + return [n + name_suffix + ":0" for n in var_names] def enable_v2_dtype_behavior(fn): - """Decorator for enabling the layer V2 dtype behavior on a test.""" - return _set_v2_dtype_behavior(fn, True) + """Decorator for enabling the layer V2 dtype behavior on a test.""" + return _set_v2_dtype_behavior(fn, True) def disable_v2_dtype_behavior(fn): - """Decorator for disabling the layer V2 dtype behavior on a test.""" - return _set_v2_dtype_behavior(fn, False) + """Decorator for disabling the layer V2 dtype behavior on a test.""" + return _set_v2_dtype_behavior(fn, False) def _set_v2_dtype_behavior(fn, enabled): - """Returns version of 'fn' that runs with v2 dtype behavior on or off.""" - @functools.wraps(fn) - def wrapper(*args, **kwargs): - v2_dtype_behavior = base_layer_utils.V2_DTYPE_BEHAVIOR - base_layer_utils.V2_DTYPE_BEHAVIOR = enabled - try: - return fn(*args, **kwargs) - finally: - base_layer_utils.V2_DTYPE_BEHAVIOR = v2_dtype_behavior + """Returns version of 'fn' that runs with v2 dtype behavior on or off.""" - return tf.__internal__.decorator.make_decorator(fn, wrapper) + @functools.wraps(fn) + def wrapper(*args, **kwargs): + v2_dtype_behavior = base_layer_utils.V2_DTYPE_BEHAVIOR + base_layer_utils.V2_DTYPE_BEHAVIOR = enabled + try: + return fn(*args, **kwargs) + finally: + base_layer_utils.V2_DTYPE_BEHAVIOR = v2_dtype_behavior + + return tf.__internal__.decorator.make_decorator(fn, wrapper) @contextlib.contextmanager def device(should_use_gpu): - """Uses gpu when requested and available.""" - if should_use_gpu and tf.test.is_gpu_available(): - dev = '/device:GPU:0' - else: - dev = '/device:CPU:0' - with tf.device(dev): - yield + """Uses gpu when requested and available.""" + if should_use_gpu and tf.test.is_gpu_available(): + dev = "/device:GPU:0" + else: + dev = "/device:CPU:0" + with tf.device(dev): + yield @contextlib.contextmanager def use_gpu(): - """Uses gpu when requested and available.""" - with device(should_use_gpu=True): - yield + """Uses gpu when requested and available.""" + with device(should_use_gpu=True): + yield def for_all_test_methods(decorator, *args, **kwargs): - """Generate class-level decorator from given method-level decorator. + """Generate class-level decorator from given method-level decorator. - It is expected for the given decorator to take some arguments and return - a method that is then called on the test method to produce a decorated - method. 
+    It is expected for the given decorator to take some arguments and return
+    a method that is then called on the test method to produce a decorated
+    method.
 
-  Args:
-    decorator: The decorator to apply.
-    *args: Positional arguments
-    **kwargs: Keyword arguments
-  Returns: Function that will decorate a given classes test methods with the
-    decorator.
-  """
+    Args:
+        decorator: The decorator to apply.
+        *args: Positional arguments
+        **kwargs: Keyword arguments
+    Returns: Function that will decorate a given class's test methods with the
+        decorator.
+    """
 
-  def all_test_methods_impl(cls):
-    """Apply decorator to all test methods in class."""
-    for name in dir(cls):
-      value = getattr(cls, name)
-      if callable(value) and name.startswith('test') and (name !=
-                                                          'test_session'):
-        setattr(cls, name, decorator(*args, **kwargs)(value))
-    return cls
+    def all_test_methods_impl(cls):
+        """Apply decorator to all test methods in class."""
+        for name in dir(cls):
+            value = getattr(cls, name)
+            if (
+                callable(value)
+                and name.startswith("test")
+                and (name != "test_session")
+            ):
+                setattr(cls, name, decorator(*args, **kwargs)(value))
+        return cls
 
-  return all_test_methods_impl
+    return all_test_methods_impl
 
 
 # The description is just for documentation purposes.
-def run_without_tensor_float_32(description):  # pylint: disable=unused-argument
-  """Execute test with TensorFloat-32 disabled.
-
-  While almost every real-world deep learning model runs fine with
-  TensorFloat-32, many tests use assertAllClose or similar methods.
-  TensorFloat-32 matmuls typically will cause such methods to fail with the
-  default tolerances.
+def run_without_tensor_float_32(description):
+    """Execute test with TensorFloat-32 disabled.
 
-  Args:
-    description: A description used for documentation purposes, describing why
-      the test requires TensorFloat-32 to be disabled.
+    While almost every real-world deep learning model runs fine with
+    TensorFloat-32, many tests use assertAllClose or similar methods.
+    TensorFloat-32 matmuls typically will cause such methods to fail with the
+    default tolerances.
 
-  Returns:
-    Decorator which runs a test with TensorFloat-32 disabled.
-  """
+    Args:
+        description: A description used for documentation purposes, describing why
+            the test requires TensorFloat-32 to be disabled.
 
-  def decorator(f):
+    Returns:
+        Decorator which runs a test with TensorFloat-32 disabled.
+    """
 
-    @functools.wraps(f)
-    def decorated(self, *args, **kwargs):
-      allowed = tf.config.experimental.tensor_float_32_execution_enabled()
-      try:
-        tf.config.experimental.enable_tensor_float_32_execution(False)
-        f(self, *args, **kwargs)
-      finally:
-        tf.config.experimental.enable_tensor_float_32_execution(allowed)
+    def decorator(f):
+        @functools.wraps(f)
+        def decorated(self, *args, **kwargs):
+            allowed = tf.config.experimental.tensor_float_32_execution_enabled()
+            try:
+                tf.config.experimental.enable_tensor_float_32_execution(False)
+                f(self, *args, **kwargs)
+            finally:
+                tf.config.experimental.enable_tensor_float_32_execution(allowed)
 
-    return decorated
+        return decorated
 
-  return decorator
+    return decorator
 
 
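# A minimal sketch of how the TensorFloat-32 decorator above is applied; a
# hypothetical test case, assuming hardware where TF32 would otherwise round
# float32 matmuls (e.g. Ampere GPUs).
import tensorflow.compat.v2 as tf

from keras.testing_infra import test_utils


class MatmulPrecisionTest(tf.test.TestCase):
    @test_utils.run_without_tensor_float_32(
        "assertAllClose needs full float32 matmul precision"
    )
    def test_matmul_is_precise(self):
        x = tf.fill((4, 4), 1.0001)
        # TF32 execution is disabled for the duration of this test and
        # restored afterwards, so a tight tolerance is safe here.
        self.assertAllClose(tf.matmul(x, x), 4 * 1.0001**2 * tf.ones((4, 4)))


# The description is just for documentation purposes.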
-def run_all_without_tensor_float_32(description):  # pylint: disable=unused-argument
-  """Execute all tests in a class with TensorFloat-32 disabled."""
-  return for_all_test_methods(run_without_tensor_float_32, description)
+def run_all_without_tensor_float_32(
+    description,
+):
+    """Execute all tests in a class with TensorFloat-32 disabled."""
+    return for_all_test_methods(run_without_tensor_float_32, description)
 
 
 def run_v2_only(obj=None):
-  """Execute the decorated test only if running in v2 mode.
+    """Execute the decorated test only if running in v2 mode.
 
-  This function is intended to be applied to tests that exercise v2 only
-  functionality. If the test is run in v1 mode it will simply be skipped.
+    This function is intended to be applied to tests that exercise v2 only
+    functionality. If the test is run in v1 mode it will simply be skipped.
 
-  See go/tf-test-decorator-cheatsheet for the decorators to use in different
-  v1/v2/eager/graph combinations.
+    See go/tf-test-decorator-cheatsheet for the decorators to use in different
+    v1/v2/eager/graph combinations.
 
-  Args:
-    obj: function to be annotated. If None, return a
-      decorator the can be applied to a function or class. If `obj` is not None,
-      return the decorator applied to `obj`.
+    Args:
+        obj: function to be annotated. If None, return a
+            decorator that can be applied to a function or class. If `obj` is not
+            None, return the decorator applied to `obj`.
+
+    Returns:
+        Returns a decorator that will conditionally skip the decorated test
+        method.
+    """
+    condition = not tf.__internal__.tf2.enabled()
+    reason = "Test is only compatible with TF v2."
 
-  Returns:
-    Returns a decorator that will conditionally skip the decorated test method.
-  """
-  condition = not tf.__internal__.tf2.enabled()
-  reason = 'Test is only compatible with TF v2.'
+    def decorator(f):
+        if tf_inspect.isclass(f):
+            return unittest.skipIf(condition=condition, reason=reason)(obj)
 
-  def decorator(f):
-    if tf_inspect.isclass(f):
-      return unittest.skipIf(condition=condition, reason=reason)(obj)
+        def decorated(self, *args, **kwargs):
+            if condition:
+                self.skipTest(reason)
+            return f(self, *args, **kwargs)
 
-    def decorated(self, *args, **kwargs):
-      if condition:
-        self.skipTest(reason)
-      return f(self, *args, **kwargs)
-    return decorated
+        return decorated
 
-  if obj is not None:
-    return decorator(obj)
+    if obj is not None:
+        return decorator(obj)
 
-  return decorator
+    return decorator
 
 
 def generate_combinations_with_testcase_name(**kwargs):
-  """Generate combinations based on its keyword arguments using combine().
-
-  This function calls combine() and appends a testcase name to the list of
-  dictionaries returned. The 'testcase_name' key is a required for named
-  parameterized tests.
-
-  Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]` or
-      `option=the_only_possibility`.
-
-  Returns:
-    a list of dictionaries for each combination. Keys in the dictionaries are
-    the keyword argument names. Each key has one value - one of the
-    corresponding keyword argument values.
- """ - sort_by_key = lambda k: k[0] - combinations = [] - for key, values in sorted(kwargs.items(), key=sort_by_key): - if not isinstance(values, list): - values = [values] - combinations.append([(key, value) for value in values]) - - combinations = [collections.OrderedDict(result) - for result in itertools.product(*combinations)] - named_combinations = [] - for combination in combinations: - assert isinstance(combination, collections.OrderedDict) - name = ''.join([ - '_{}_{}'.format(''.join(filter(str.isalnum, key)), - ''.join(filter(str.isalnum, str(value)))) - for key, value in combination.items() - ]) - named_combinations.append( - collections.OrderedDict( - list(combination.items()) + - [('testcase_name', '_test{}'.format(name))])) - - return named_combinations + """Generate combinations based on its keyword arguments using combine(). + + This function calls combine() and appends a testcase name to the list of + dictionaries returned. The 'testcase_name' key is a required for named + parameterized tests. + + Args: + **kwargs: keyword arguments of form `option=[possibilities, ...]` or + `option=the_only_possibility`. + + Returns: + a list of dictionaries for each combination. Keys in the dictionaries are + the keyword argument names. Each key has one value - one of the + corresponding keyword argument values. + """ + sort_by_key = lambda k: k[0] + combinations = [] + for key, values in sorted(kwargs.items(), key=sort_by_key): + if not isinstance(values, list): + values = [values] + combinations.append([(key, value) for value in values]) + + combinations = [ + collections.OrderedDict(result) + for result in itertools.product(*combinations) + ] + named_combinations = [] + for combination in combinations: + assert isinstance(combination, collections.OrderedDict) + name = "".join( + [ + "_{}_{}".format( + "".join(filter(str.isalnum, key)), + "".join(filter(str.isalnum, str(value))), + ) + for key, value in combination.items() + ] + ) + named_combinations.append( + collections.OrderedDict( + list(combination.items()) + [("testcase_name", f"_test{name}")] + ) + ) + + return named_combinations diff --git a/keras/tests/BUILD b/keras/tests/BUILD index 6e782cad7492..62681c407b38 100644 --- a/keras/tests/BUILD +++ b/keras/tests/BUILD @@ -1,6 +1,9 @@ # Description: # Contains Keras test utils and integration tests. 
+# Placeholder: load unaliased py_library +# Placeholder: load unaliased py_test + # buildifier: disable=same-origin-load load("@org_keras//keras:keras.bzl", "cuda_py_test") @@ -9,6 +12,7 @@ load("@org_keras//keras:keras.bzl", "tf_py_test") load("@org_keras//keras:keras.bzl", "tpu_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], default_visibility = [ "//keras:friends", "//third_party/tensorflow/tools/pip_package:__pkg__", @@ -256,7 +260,7 @@ tf_py_test( "//keras/api:keras_api", "//keras/layers/core", "//keras/metrics", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", ], ) @@ -334,7 +338,7 @@ tf_py_test( "//keras/api:keras_api", "//keras/engine", "//keras/layers/core", - "//keras/optimizers/optimizer_v2", + "//keras/optimizers/legacy:optimizers", "//keras/testing_infra:test_combinations", ], ) diff --git a/keras/tests/__init__.py b/keras/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/keras/tests/add_loss_correctness_test.py b/keras/tests/add_loss_correctness_test.py index 62aa6d50e763..5bf87c9ce670 100644 --- a/keras/tests/add_loss_correctness_test.py +++ b/keras/tests/add_loss_correctness_test.py @@ -14,39 +14,43 @@ # ============================================================================== """Tests add_loss API correctness.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np from keras import Input -from keras.testing_infra import test_combinations -from keras import layers -from keras import losses from keras import Model -from keras.optimizers import optimizer_v2 from keras import Sequential +from keras import layers +from keras import losses +from keras.optimizers import legacy as optimizer_legacy +from keras.testing_infra import test_combinations from keras.testing_infra import test_utils + +# isort: off from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training.rmsprop import RMSPropOptimizer +from tensorflow.python.training.rmsprop import ( + RMSPropOptimizer, +) MAE = losses.MeanAbsoluteError mae = losses.mean_absolute_error def get_ctl_train_step(model): - optimizer = optimizer_v2.gradient_descent.SGD(0.05) + optimizer = optimizer_legacy.gradient_descent.SGD(0.05) - def train_step(x, y, w=None): - with tf.GradientTape() as tape: - if w is not None: - model([x, y, w]) - else: - model([x, y]) - loss = tf.reduce_sum(model.losses) - gradients = tape.gradient(loss, model.trainable_weights) - optimizer.apply_gradients(zip(gradients, model.trainable_weights)) - return loss + def train_step(x, y, w=None): + with tf.GradientTape() as tape: + if w is not None: + model([x, y, w]) + else: + model([x, y]) + loss = tf.reduce_sum(model.losses) + gradients = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(gradients, model.trainable_weights)) + return loss - return train_step + return train_step # TODO(psv): Add tests cases where a model is used in loss function but is @@ -54,402 +58,411 @@ def train_step(x, y, w=None): class TestAddLossCorrectness(test_combinations.TestCase): - - def setUp(self): - super().setUp() - self.x = np.array([[0.], [1.], [2.]], dtype='float32') - self.y = np.array([[0.5], [2.], [3.5]], dtype='float32') - self.w = np.array([[1.25], [0.5], [1.25]], dtype='float32') - - @test_combinations.run_all_keras_modes - def test_loss_on_model_fit(self): - inputs = Input(shape=(1,)) - targets = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model([inputs, 
targets], outputs) - model.add_loss(MAE()(targets, outputs)) - model.add_loss(tf.reduce_mean(mae(targets, outputs))) - model.compile( - optimizer_v2.gradient_descent.SGD(0.05), - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit([self.x, self.y], batch_size=3, epochs=5) - self.assertAllClose(history.history['loss'], [2., 1.8, 1.6, 1.4, 1.2], 1e-3) - - @test_combinations.run_with_all_model_types(exclude_models=['sequential']) - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_loss_callable_on_model_fit(self): - model = test_utils.get_model_from_layers([test_utils.Bias()], - input_shape=(1,)) - - def callable_loss(): - return tf.reduce_sum(model.weights) - - model.add_loss(callable_loss) - model.compile( - optimizer_v2.gradient_descent.SGD(0.1), - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit(self.x, batch_size=3, epochs=5) - self.assertAllClose(history.history['loss'], [0., -.1, -.2, -.3, -.4], 1e-3) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_loss_on_model_ctl(self): - def get_model_and_train_step(): - inputs = Input(shape=(1,)) - targets = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model([inputs, targets], outputs) - model.add_loss(MAE()(targets, outputs)) - model.add_loss(tf.reduce_mean(mae(targets, outputs))) - return get_ctl_train_step(model) - - train_step = get_model_and_train_step() - loss = [train_step(self.x, self.y) for _ in range(5)] - self.assertAllClose(loss, [2., 1.8, 1.6, 1.4, 1.2], 1e-3) - - train_step = tf.function(get_model_and_train_step()) - loss = [train_step(self.x, self.y) for _ in range(5)] - self.assertAllClose(loss, [2., 1.8, 1.6, 1.4, 1.2], 1e-3) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_loss_callable_on_model_ctl(self): - def get_model_and_train_step(): - inputs = Input(shape=(1,)) - targets = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model([inputs, targets], outputs) - - def callable_loss(): - return tf.reduce_sum(model.weights) - - model.add_loss(callable_loss) - return get_ctl_train_step(model) - - train_step = get_model_and_train_step() - loss = [train_step(self.x, self.y) for _ in range(5)] - self.assertAllClose(loss, [0., -0.05, -0.1, -0.15, -0.2], 1e-3) - - train_step = tf.function(get_model_and_train_step()) - loss = [train_step(self.x, self.y) for _ in range(5)] - self.assertAllClose(loss, [0., -0.05, -0.1, -0.15, -0.2], 1e-3) - - @test_combinations.run_all_keras_modes - def test_loss_with_sample_weight_on_model_fit(self): - inputs = Input(shape=(1,)) - targets = Input(shape=(1,)) - sw = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model([inputs, targets, sw], outputs) - model.add_loss(MAE()(targets, outputs, sw)) - model.add_loss(3 * tf.reduce_mean(sw * mae(targets, outputs))) - model.compile( - optimizer_v2.gradient_descent.SGD(0.025), - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit([self.x, self.y, self.w], batch_size=3, epochs=5) - self.assertAllClose(history.history['loss'], [4., 3.6, 3.2, 2.8, 2.4], 1e-3) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_loss_with_sample_weight_on_model_ctl(self): - def get_model_and_train_step(): - inputs = Input(shape=(1,)) - targets = Input(shape=(1,)) - sw = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model([inputs, targets, sw], outputs) - model.add_loss(MAE()(targets, outputs, sw)) - model.add_loss(tf.reduce_mean(sw * mae(targets, outputs))) - 
return get_ctl_train_step(model) - - train_step = get_model_and_train_step() - loss = [train_step(self.x, self.y, self.w) for _ in range(5)] - self.assertAllClose(loss, [2., 1.8, 1.6, 1.4, 1.2], 1e-3) - - train_step = tf.function(get_model_and_train_step()) - loss = [train_step(self.x, self.y, self.w) for _ in range(5)] - self.assertAllClose(loss, [2., 1.8, 1.6, 1.4, 1.2], 1e-3) - - @test_combinations.run_all_keras_modes - def test_loss_with_sample_weight_in_model_call(self): - - class MyModel(Model): - - def __init__(self): - super().__init__() - self.bias = test_utils.Bias() - - def call(self, inputs): - outputs = self.bias(inputs[0]) - self.add_loss(MAE()(inputs[1], outputs, inputs[2])) - self.add_loss(tf.reduce_mean(inputs[2] * mae(inputs[1], outputs))) - return outputs - - model = MyModel() - model.predict([self.x, self.y, self.w]) - model.compile( - optimizer_v2.gradient_descent.SGD(0.05), - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit([self.x, self.y, self.w], batch_size=3, epochs=5) - self.assertEqual(len(model.losses), 2) - self.assertAllClose(history.history['loss'], [2., 1.8, 1.6, 1.4, 1.2], 1e-3) - - eval_out = model.evaluate([self.x, self.y, self.w]) - self.assertAlmostEqual(eval_out, 1.0, 3) - - @test_combinations.run_all_keras_modes - def test_loss_with_sample_weight_in_layer_call(self): - - class MyLayer(layers.Layer): - - def __init__(self): - super().__init__() - self.bias = test_utils.Bias() - - def call(self, inputs): - out = self.bias(inputs[0]) - self.add_loss(MAE()(inputs[1], out, inputs[2])) - self.add_loss(tf.reduce_mean(inputs[2] * mae(inputs[1], out))) - return out - - inputs = Input(shape=(1,)) - targets = Input(shape=(1,)) - sw = Input(shape=(1,)) - - outputs = MyLayer()([inputs, targets, sw]) - model = Model([inputs, targets, sw], outputs) - model.predict([self.x, self.y, self.w]) - model.compile( - optimizer_v2.gradient_descent.SGD(0.05), - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit([self.x, self.y, self.w], batch_size=3, epochs=5) - self.assertAllClose(history.history['loss'], [2., 1.8, 1.6, 1.4, 1.2], 1e-3) - - output = model.evaluate([self.x, self.y, self.w]) - self.assertAlmostEqual(output, 1.0, 3) - - output = model.test_on_batch([self.x, self.y, self.w]) - self.assertAlmostEqual(output, 1.0, 3) - - @test_combinations.run_all_keras_modes - def test_loss_on_layer(self): - - class MyLayer(layers.Layer): - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - return inputs - - inputs = Input((3,)) - layer = MyLayer() - outputs = layer(inputs) - model = Model(inputs, outputs) - self.assertEqual(len(model.losses), 1) - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) - self.assertEqual(loss, 2 * 3) - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_activity_regularizer(self): - loss = {} - for reg in [None, 'l2']: - model_layers = [ - layers.Dense( - 10, - activation='relu', - activity_regularizer=reg, - kernel_initializer='ones', - use_bias=False), - layers.Dense( - 1, - activation='sigmoid', - kernel_initializer='ones', - use_bias=False), - ] - - model = test_utils.get_model_from_layers( - model_layers, input_shape=(10,)) - - x = np.ones((10, 10), 'float32') - y = np.zeros((10, 1), 'float32') - - optimizer = RMSPropOptimizer(learning_rate=0.001) - model.compile( - optimizer, - 'binary_crossentropy', - run_eagerly=test_utils.should_run_eagerly()) - 
model.fit(x, y, batch_size=2, epochs=5) - loss[reg] = model.evaluate(x, y) - self.assertLess(loss[None], loss['l2']) - - @test_combinations.run_all_keras_modes - @test_combinations.run_with_all_model_types - def test_activity_regularizer_loss_value(self): - layer = layers.Dense( - 1, - kernel_initializer='zeros', - bias_initializer='ones', - activity_regularizer='l2') - - model = test_utils.get_model_from_layers([layer], input_shape=(10,)) - - x = np.ones((10, 10), 'float32') - optimizer = RMSPropOptimizer(learning_rate=0.001) - model.compile( - optimizer, - run_eagerly=test_utils.should_run_eagerly()) - loss = model.test_on_batch(x) - self.assertAlmostEqual(0.01, loss, places=4) - - @test_combinations.run_all_keras_modes - def test_activity_regularizer_batch_independent(self): - inputs = layers.Input(shape=(10,)) - x = layers.Dense(10, activation='relu', activity_regularizer='l2')(inputs) - outputs = layers.Dense(1, activation='sigmoid')(x) - model = Model(inputs, outputs) - - optimizer = RMSPropOptimizer(learning_rate=0.001) - model.compile( - optimizer, - run_eagerly=test_utils.should_run_eagerly()) - - loss_small_batch = model.test_on_batch(np.ones((10, 10), 'float32')) - loss_big_batch = model.test_on_batch(np.ones((20, 10), 'float32')) - self.assertAlmostEqual(loss_small_batch, loss_big_batch, places=4) - - @test_combinations.run_all_keras_modes - def test_with_shared_layer(self): - - class LayerWithLoss(layers.Layer): - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - return inputs * 2 - - shared_layer = LayerWithLoss() - - m = Sequential([shared_layer]) - m2 = Sequential([shared_layer, m]) - m2(tf.constant([1, 2, 3])) - self.assertEqual(len(m2.losses), 2) - self.assertAllClose(m2.losses, [6, 12]) - - @test_combinations.run_all_keras_modes - def test_with_shared_nested_layer(self): - - class LayerWithLoss(layers.Layer): - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - return inputs * 2 - - class LayerWithNestedLayerWithLoss(layers.Layer): - - def __init__(self): - super().__init__() - self.loss_layer = LayerWithLoss() - - def call(self, inputs): - return self.loss_layer(inputs) - - shared_layer = LayerWithNestedLayerWithLoss() - - m = Sequential([shared_layer]) - m2 = Sequential([shared_layer, m]) - m2(tf.constant([1, 2, 3])) - self.assertEqual(len(m2.losses), 2) - self.assertAllClose(m2.losses, [6, 12]) - - @test_combinations.run_all_keras_modes - def test_clear_losses(self): - - class LayerWithSharedNestedLossLayer(layers.Layer): - - def __init__(self): - super().__init__() - self.loss_layer = layers.ActivityRegularization(l2=0.001) - self.add_weight(shape=(1,), regularizer='l2') - - def call(self, x): - x = self.loss_layer(x) - return self.loss_layer(x) - - inputs = Input(shape=(1,)) - l = LayerWithSharedNestedLossLayer() # Weight loss + 2 activity losses. 
- - x1 = tf.ones((1, 1)) - _ = l(x1) - if not tf.executing_eagerly(): - self.assertEqual(len(l.get_losses_for(x1)), 2) - self.assertEqual(len(l.get_losses_for(None)), 1) - - x2 = tf.ones((1, 1)) - _ = l(x2) - if not tf.executing_eagerly(): - self.assertEqual(len(l.get_losses_for(x1)), 2) - self.assertEqual(len(l.get_losses_for(x2)), 2) - self.assertEqual(len(l.get_losses_for(None)), 1) - - outputs = l(inputs) - model = Model(inputs, outputs) - if not tf.executing_eagerly(): - self.assertEqual(len(model.losses), 7) - self.assertEqual(len(l.get_losses_for(x1)), 2) - self.assertEqual(len(l.get_losses_for(x2)), 2) - self.assertEqual(len(l.get_losses_for(None)), 1) - - x3 = tf.ones((1, 1)) - model(x3) - x4 = tf.ones((1, 1)) - model(x4) - if tf.executing_eagerly(): - # Eager losses are cleared every `__call__`. - self.assertEqual(len(model.losses), 3) - else: - self.assertEqual(len(model.losses), 11) - self.assertEqual(len(model.get_losses_for(x3)), 2) - self.assertEqual(len(model.get_losses_for(x4)), 2) - self.assertEqual(len(model.get_losses_for(None)), 1) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_invalid_constant_input(self): - inputs = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model(inputs, outputs) - with self.assertRaisesRegex( - ValueError, - 'Expected a symbolic Tensors or a callable for the loss value'): - model.add_loss(1.) - - @test_combinations.run_all_keras_modes(always_skip_v1=True) - def test_invalid_variable_input(self): - inputs = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model(inputs, outputs) - with self.assertRaisesRegex( - ValueError, - 'Expected a symbolic Tensors or a callable for the loss value'): - model.add_loss(model.weights[0]) - - @test_combinations.run_all_keras_modes - def test_add_entropy_loss_on_functional_model(self): - inputs = Input(shape=(1,)) - targets = Input(shape=(1,)) - outputs = test_utils.Bias()(inputs) - model = Model([inputs, targets], outputs) - model.add_loss(losses.binary_crossentropy(targets, outputs)) - model.compile('sgd', run_eagerly=test_utils.should_run_eagerly()) - with tf.compat.v1.test.mock.patch.object(logging, 'warning') as mock_log: - model.fit([self.x, self.y], batch_size=3, epochs=5) - self.assertNotIn('Gradients do not exist for variables', - str(mock_log.call_args)) - - -if __name__ == '__main__': - tf.test.main() + def setUp(self): + super().setUp() + self.x = np.array([[0.0], [1.0], [2.0]], dtype="float32") + self.y = np.array([[0.5], [2.0], [3.5]], dtype="float32") + self.w = np.array([[1.25], [0.5], [1.25]], dtype="float32") + + @test_combinations.run_all_keras_modes + def test_loss_on_model_fit(self): + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model([inputs, targets], outputs) + model.add_loss(MAE()(targets, outputs)) + model.add_loss(tf.reduce_mean(mae(targets, outputs))) + model.compile( + optimizer_legacy.gradient_descent.SGD(0.05), + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit([self.x, self.y], batch_size=3, epochs=5) + self.assertAllClose( + history.history["loss"], [2.0, 1.8, 1.6, 1.4, 1.2], 1e-3 + ) + + @test_combinations.run_with_all_model_types(exclude_models=["sequential"]) + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_loss_callable_on_model_fit(self): + model = test_utils.get_model_from_layers( + [test_utils.Bias()], input_shape=(1,) + ) + + def callable_loss(): + return tf.reduce_sum(model.weights) + + 
model.add_loss(callable_loss) + model.compile( + optimizer_legacy.gradient_descent.SGD(0.1), + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit(self.x, batch_size=3, epochs=5) + self.assertAllClose( + history.history["loss"], [0.0, -0.1, -0.2, -0.3, -0.4], 1e-3 + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_loss_on_model_ctl(self): + def get_model_and_train_step(): + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model([inputs, targets], outputs) + model.add_loss(MAE()(targets, outputs)) + model.add_loss(tf.reduce_mean(mae(targets, outputs))) + return get_ctl_train_step(model) + + train_step = get_model_and_train_step() + loss = [train_step(self.x, self.y) for _ in range(5)] + self.assertAllClose(loss, [2.0, 1.8, 1.6, 1.4, 1.2], 1e-3) + + train_step = tf.function(get_model_and_train_step()) + loss = [train_step(self.x, self.y) for _ in range(5)] + self.assertAllClose(loss, [2.0, 1.8, 1.6, 1.4, 1.2], 1e-3) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_loss_callable_on_model_ctl(self): + def get_model_and_train_step(): + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model([inputs, targets], outputs) + + def callable_loss(): + return tf.reduce_sum(model.weights) + + model.add_loss(callable_loss) + return get_ctl_train_step(model) + + train_step = get_model_and_train_step() + loss = [train_step(self.x, self.y) for _ in range(5)] + self.assertAllClose(loss, [0.0, -0.05, -0.1, -0.15, -0.2], 1e-3) + + train_step = tf.function(get_model_and_train_step()) + loss = [train_step(self.x, self.y) for _ in range(5)] + self.assertAllClose(loss, [0.0, -0.05, -0.1, -0.15, -0.2], 1e-3) + + @test_combinations.run_all_keras_modes + def test_loss_with_sample_weight_on_model_fit(self): + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + sw = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model([inputs, targets, sw], outputs) + model.add_loss(MAE()(targets, outputs, sw)) + model.add_loss(3 * tf.reduce_mean(sw * mae(targets, outputs))) + model.compile( + optimizer_legacy.gradient_descent.SGD(0.025), + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit([self.x, self.y, self.w], batch_size=3, epochs=5) + self.assertAllClose( + history.history["loss"], [4.0, 3.6, 3.2, 2.8, 2.4], 1e-3 + ) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_loss_with_sample_weight_on_model_ctl(self): + def get_model_and_train_step(): + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + sw = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model([inputs, targets, sw], outputs) + model.add_loss(MAE()(targets, outputs, sw)) + model.add_loss(tf.reduce_mean(sw * mae(targets, outputs))) + return get_ctl_train_step(model) + + train_step = get_model_and_train_step() + loss = [train_step(self.x, self.y, self.w) for _ in range(5)] + self.assertAllClose(loss, [2.0, 1.8, 1.6, 1.4, 1.2], 1e-3) + + train_step = tf.function(get_model_and_train_step()) + loss = [train_step(self.x, self.y, self.w) for _ in range(5)] + self.assertAllClose(loss, [2.0, 1.8, 1.6, 1.4, 1.2], 1e-3) + + @test_combinations.run_all_keras_modes + def test_loss_with_sample_weight_in_model_call(self): + class MyModel(Model): + def __init__(self): + super().__init__() + self.bias = test_utils.Bias() + + def call(self, inputs): + outputs = self.bias(inputs[0]) + 
self.add_loss(MAE()(inputs[1], outputs, inputs[2])) + self.add_loss( + tf.reduce_mean(inputs[2] * mae(inputs[1], outputs)) + ) + return outputs + + model = MyModel() + model.predict([self.x, self.y, self.w]) + model.compile( + optimizer_legacy.gradient_descent.SGD(0.05), + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit([self.x, self.y, self.w], batch_size=3, epochs=5) + self.assertEqual(len(model.losses), 2) + self.assertAllClose( + history.history["loss"], [2.0, 1.8, 1.6, 1.4, 1.2], 1e-3 + ) + + eval_out = model.evaluate([self.x, self.y, self.w]) + self.assertAlmostEqual(eval_out, 1.0, 3) + + @test_combinations.run_all_keras_modes + def test_loss_with_sample_weight_in_layer_call(self): + class MyLayer(layers.Layer): + def __init__(self): + super().__init__() + self.bias = test_utils.Bias() + + def call(self, inputs): + out = self.bias(inputs[0]) + self.add_loss(MAE()(inputs[1], out, inputs[2])) + self.add_loss(tf.reduce_mean(inputs[2] * mae(inputs[1], out))) + return out + + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + sw = Input(shape=(1,)) + + outputs = MyLayer()([inputs, targets, sw]) + model = Model([inputs, targets, sw], outputs) + model.predict([self.x, self.y, self.w]) + model.compile( + optimizer_legacy.gradient_descent.SGD(0.05), + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit([self.x, self.y, self.w], batch_size=3, epochs=5) + self.assertAllClose( + history.history["loss"], [2.0, 1.8, 1.6, 1.4, 1.2], 1e-3 + ) + + output = model.evaluate([self.x, self.y, self.w]) + self.assertAlmostEqual(output, 1.0, 3) + + output = model.test_on_batch([self.x, self.y, self.w]) + self.assertAlmostEqual(output, 1.0, 3) + + @test_combinations.run_all_keras_modes + def test_loss_on_layer(self): + class MyLayer(layers.Layer): + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + return inputs + + inputs = Input((3,)) + layer = MyLayer() + outputs = layer(inputs) + model = Model(inputs, outputs) + self.assertLen(model.losses, 1) + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + loss = model.train_on_batch(np.ones((2, 3)), np.ones((2, 3))) + self.assertEqual(loss, 2 * 3) + + @test_combinations.run_all_keras_modes + @test_combinations.run_with_all_model_types + def test_activity_regularizer(self): + loss = {} + for reg in [None, "l2"]: + model_layers = [ + layers.Dense( + 10, + activation="relu", + activity_regularizer=reg, + kernel_initializer="ones", + use_bias=False, + ), + layers.Dense( + 1, + activation="sigmoid", + kernel_initializer="ones", + use_bias=False, + ), + ] + + model = test_utils.get_model_from_layers( + model_layers, input_shape=(10,) + ) + + x = np.ones((10, 10), "float32") + y = np.zeros((10, 1), "float32") + + optimizer = RMSPropOptimizer(learning_rate=0.001) + model.compile( + optimizer, + "binary_crossentropy", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit(x, y, batch_size=2, epochs=5) + loss[reg] = model.evaluate(x, y) + self.assertLess(loss[None], loss["l2"]) + + @test_combinations.run_all_keras_modes + @test_combinations.run_with_all_model_types + def test_activity_regularizer_loss_value(self): + layer = layers.Dense( + 1, + kernel_initializer="zeros", + bias_initializer="ones", + activity_regularizer="l2", + ) + + model = test_utils.get_model_from_layers([layer], input_shape=(10,)) + + x = np.ones((10, 10), "float32") + optimizer = RMSPropOptimizer(learning_rate=0.001) + model.compile(optimizer, run_eagerly=test_utils.should_run_eagerly()) + loss = 
model.test_on_batch(x) + self.assertAlmostEqual(0.01, loss, places=4) + + @test_combinations.run_all_keras_modes + def test_activity_regularizer_batch_independent(self): + inputs = layers.Input(shape=(10,)) + x = layers.Dense(10, activation="relu", activity_regularizer="l2")( + inputs + ) + outputs = layers.Dense(1, activation="sigmoid")(x) + model = Model(inputs, outputs) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + model.compile(optimizer, run_eagerly=test_utils.should_run_eagerly()) + + loss_small_batch = model.test_on_batch(np.ones((10, 10), "float32")) + loss_big_batch = model.test_on_batch(np.ones((20, 10), "float32")) + self.assertAlmostEqual(loss_small_batch, loss_big_batch, places=4) + + @test_combinations.run_all_keras_modes + def test_with_shared_layer(self): + class LayerWithLoss(layers.Layer): + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + return inputs * 2 + + shared_layer = LayerWithLoss() + + m = Sequential([shared_layer]) + m2 = Sequential([shared_layer, m]) + m2(tf.constant([1, 2, 3])) + self.assertEqual(len(m2.losses), 2) + self.assertAllClose(m2.losses, [6, 12]) + + @test_combinations.run_all_keras_modes + def test_with_shared_nested_layer(self): + class LayerWithLoss(layers.Layer): + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + return inputs * 2 + + class LayerWithNestedLayerWithLoss(layers.Layer): + def __init__(self): + super().__init__() + self.loss_layer = LayerWithLoss() + + def call(self, inputs): + return self.loss_layer(inputs) + + shared_layer = LayerWithNestedLayerWithLoss() + + m = Sequential([shared_layer]) + m2 = Sequential([shared_layer, m]) + m2(tf.constant([1, 2, 3])) + self.assertLen(m2.losses, 2) + self.assertAllClose(m2.losses, [6, 12]) + + @test_combinations.run_all_keras_modes + def test_clear_losses(self): + class LayerWithSharedNestedLossLayer(layers.Layer): + def __init__(self): + super().__init__() + self.loss_layer = layers.ActivityRegularization(l2=0.001) + self.add_weight(shape=(1,), regularizer="l2") + + def call(self, x): + x = self.loss_layer(x) + return self.loss_layer(x) + + inputs = Input(shape=(1,)) + l = LayerWithSharedNestedLossLayer() # Weight loss + 2 activity losses. + + x1 = tf.ones((1, 1)) + _ = l(x1) + if not tf.executing_eagerly(): + self.assertLen(l.get_losses_for(x1), 2) + self.assertLen(l.get_losses_for(None), 1) + + x2 = tf.ones((1, 1)) + _ = l(x2) + if not tf.executing_eagerly(): + self.assertLen(l.get_losses_for(x1), 2) + self.assertLen(l.get_losses_for(x2), 2) + self.assertLen(l.get_losses_for(None), 1) + + outputs = l(inputs) + model = Model(inputs, outputs) + if not tf.executing_eagerly(): + self.assertLen(model.losses, 7) + self.assertLen(l.get_losses_for(x1), 2) + self.assertLen(l.get_losses_for(x2), 2) + self.assertLen(l.get_losses_for(None), 1) + + x3 = tf.ones((1, 1)) + model(x3) + x4 = tf.ones((1, 1)) + model(x4) + if tf.executing_eagerly(): + # Eager losses are cleared every `__call__`. 
+ self.assertLen(model.losses, 3) + else: + self.assertLen(model.losses, 11) + self.assertLen(l.get_losses_for(x3), 2) + self.assertLen(l.get_losses_for(x4), 2) + self.assertLen(l.get_losses_for(None), 1) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_invalid_constant_input(self): + inputs = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model(inputs, outputs) + with self.assertRaisesRegex( + ValueError, + "Expected a symbolic Tensors or a callable for the loss value", + ): + model.add_loss(1.0) + + @test_combinations.run_all_keras_modes(always_skip_v1=True) + def test_invalid_variable_input(self): + inputs = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model(inputs, outputs) + with self.assertRaisesRegex( + ValueError, + "Expected a symbolic Tensors or a callable for the loss value", + ): + model.add_loss(model.weights[0]) + + @test_combinations.run_all_keras_modes + def test_add_entropy_loss_on_functional_model(self): + inputs = Input(shape=(1,)) + targets = Input(shape=(1,)) + outputs = test_utils.Bias()(inputs) + model = Model([inputs, targets], outputs) + model.add_loss(losses.binary_crossentropy(targets, outputs)) + model.compile("sgd", run_eagerly=test_utils.should_run_eagerly()) + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_log: + model.fit([self.x, self.y], batch_size=3, epochs=5) + self.assertNotIn( + "Gradients do not exist for variables", str(mock_log.call_args) + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/automatic_outside_compilation_test.py b/keras/tests/automatic_outside_compilation_test.py index be09248516fd..254679be8913 100644 --- a/keras/tests/automatic_outside_compilation_test.py +++ b/keras/tests/automatic_outside_compilation_test.py @@ -17,7 +17,10 @@ import collections import os +import numpy as np +import tensorflow.compat.v2 as tf from absl import flags + from keras import callbacks from keras.distribute import distribute_strategy_test from keras.engine import base_layer @@ -29,262 +32,305 @@ from keras.layers import regularization as regularization_layer_lib from keras.layers import reshaping as reshaping_layer_lib from keras.testing_infra import test_utils -import numpy as np -import tensorflow.compat.v2 as tf -from tensorboard.plugins.histogram import summary_v2 as histogram_summary_v2 -from tensorboard.plugins.image import summary_v2 as image_summary_v2 -from tensorboard.plugins.scalar import summary_v2 as scalar_summary_v2 -from tensorflow.python.eager.context import set_soft_device_placement -from tensorflow.python.framework import test_util as tf_test_utils +# isort: off +from tensorboard.plugins.histogram import ( + summary_v2 as histogram_summary_v2, +) +from tensorboard.plugins.image import ( + summary_v2 as image_summary_v2, +) +from tensorboard.plugins.scalar import ( + summary_v2 as scalar_summary_v2, +) +from tensorflow.python.eager.context import ( + set_soft_device_placement, +) +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) NUM_CLASSES = 4 FLAGS = flags.FLAGS -flags.DEFINE_string('tpu', '', 'Name of TPU to connect to.') -flags.DEFINE_string('project', None, 'Name of GCP project with TPU.') -flags.DEFINE_string('zone', None, 'Name of GCP zone with TPU.') +flags.DEFINE_string("tpu", "", "Name of TPU to connect to.") +flags.DEFINE_string("project", None, "Name of GCP project with TPU.") +flags.DEFINE_string("zone", None, "Name of GCP zone with TPU.") def get_tpu_cluster_resolver(): - resolver 
= tf.distribute.cluster_resolver.TPUClusterResolver( - tpu=FLAGS.tpu, - zone=FLAGS.zone, - project=FLAGS.project, - ) - return resolver + resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, + zone=FLAGS.zone, + project=FLAGS.project, + ) + return resolver def get_tpu_strategy(): - resolver = get_tpu_cluster_resolver() - tf.config.experimental_connect_to_cluster(resolver) - tf.tpu.experimental.initialize_tpu_system(resolver) - return tf.distribute.experimental.TPUStrategy(resolver) + resolver = get_tpu_cluster_resolver() + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + return tf.distribute.experimental.TPUStrategy(resolver) class LayerForScalarSummary(base_layer.Layer): - """A pass-through layer that only records scalar values to summary.""" + """A pass-through layer that only records scalar values to summary.""" - def call(self, x): - # Add summary scalar using compat v2 implementation. - scalar_summary_v2.scalar('custom_scalar_summary_v2', tf.reduce_sum(x)) - return x + def call(self, x): + # Add summary scalar using compat v2 implementation. + scalar_summary_v2.scalar("custom_scalar_summary_v2", tf.reduce_sum(x)) + return x class LayerForImageSummary(base_layer.Layer): - """A pass-through layer that only records image values to summary.""" + """A pass-through layer that only records image values to summary.""" - def call(self, x): - # Add summary image using compat v2 implementation. - image_summary_v2.image('custom_image_summary_v2', x) + def call(self, x): + # Add summary image using compat v2 implementation. + image_summary_v2.image("custom_image_summary_v2", x) - return x + return x class LayerForHistogramSummary(base_layer.Layer): - """A pass-through layer that records histogram values to summary.""" + """A pass-through layer that records histogram values to summary.""" - def call(self, x): - # Add summary histogram using compat v2 implementation. - histogram_summary_v2.histogram('custom_histogram_summary_v2', x) + def call(self, x): + # Add summary histogram using compat v2 implementation. 
+ histogram_summary_v2.histogram("custom_histogram_summary_v2", x) - return x + return x class CustomModel(training.Model): - """Custom model with summary ops in model call definition.""" - - def __init__(self, name=None, enable_histograms=True): - super().__init__() - self._my_layers = [ - layer_lib.Dense( - 4096, - name='dense1', - kernel_initializer=tf.compat.v1.glorot_normal_initializer(seed=0), - use_bias=False), - layer_lib.Dense( - 4, - name='dense2', - kernel_initializer=tf.compat.v1.glorot_normal_initializer(seed=0), - use_bias=False), - ] - if enable_histograms: - self.histogram_summary_layer = LayerForHistogramSummary() - else: - self.histogram_summary_layer = base_layer.Layer() # no-op pass through - self.scalar_summary_layer = LayerForScalarSummary() - - def call(self, x): - for layer in self._my_layers: - x = layer(x) - x = self.scalar_summary_layer(x) - return self.histogram_summary_layer(x) + """Custom model with summary ops in model call definition.""" + + def __init__(self, name=None, enable_histograms=True): + super().__init__() + self._my_layers = [ + layer_lib.Dense( + 4096, + name="dense1", + kernel_initializer=tf.compat.v1.glorot_normal_initializer( + seed=0 + ), + use_bias=False, + ), + layer_lib.Dense( + 4, + name="dense2", + kernel_initializer=tf.compat.v1.glorot_normal_initializer( + seed=0 + ), + use_bias=False, + ), + ] + if enable_histograms: + self.histogram_summary_layer = LayerForHistogramSummary() + else: + self.histogram_summary_layer = ( + base_layer.Layer() + ) # no-op pass through + self.scalar_summary_layer = LayerForScalarSummary() + + def call(self, x): + for layer in self._my_layers: + x = layer(x) + x = self.scalar_summary_layer(x) + return self.histogram_summary_layer(x) def get_image_dataset(): - inputs = np.zeros((10, 28, 28, 3), dtype=np.float32) - targets = np.zeros((10, NUM_CLASSES), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10, drop_remainder=True) - return dataset + inputs = np.zeros((10, 28, 28, 3), dtype=np.float32) + targets = np.zeros((10, NUM_CLASSES), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10, drop_remainder=True) + return dataset def mnist_model(input_shape, enable_histograms=True): - """Creates a MNIST model.""" - model = sequential_model_lib.Sequential() - - # Adding custom pass-through layer to visualize input images. - model.add(LayerForImageSummary()) - - model.add( - conv_layer_lib.Conv2D( - 32, kernel_size=(3, 3), activation='relu', input_shape=input_shape)) - model.add(conv_layer_lib.Conv2D(64, (3, 3), activation='relu')) - model.add(pool_layer_lib.MaxPooling2D(pool_size=(2, 2))) - model.add(regularization_layer_lib.Dropout(0.25)) - model.add(reshaping_layer_lib.Flatten()) - model.add(layer_lib.Dense(128, activation='relu')) - model.add(regularization_layer_lib.Dropout(0.5)) - model.add(layer_lib.Dense(NUM_CLASSES, activation='softmax')) - - # Adding custom pass-through layer for summary recording. - if enable_histograms: - model.add(LayerForHistogramSummary()) - return model + """Creates a MNIST model.""" + model = sequential_model_lib.Sequential() + + # Adding custom pass-through layer to visualize input images. 
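The summary layers in this hunk are plain pass-throughs: they return their input unchanged and only emit a summary as a side effect, which is the pattern the TPU tests use to exercise automatic outside compilation of summary ops. A minimal sketch of the same pattern using the public tf.summary API rather than the TensorBoard plugin modules; the layer name and log directory are hypothetical:

import tensorflow.compat.v2 as tf

class PassThroughScalarSummary(tf.keras.layers.Layer):
    """Returns its input unchanged; records the batch sum as a scalar."""

    def call(self, x):
        # An explicit step avoids depending on a default summary step.
        tf.summary.scalar("batch_sum", tf.reduce_sum(x), step=0)
        return x

writer = tf.summary.create_file_writer("/tmp/summary_demo")
with writer.as_default():
    out = PassThroughScalarSummary()(tf.ones((2, 3)))
writer.close()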
+ model.add(LayerForImageSummary()) + + model.add( + conv_layer_lib.Conv2D( + 32, kernel_size=(3, 3), activation="relu", input_shape=input_shape + ) + ) + model.add(conv_layer_lib.Conv2D(64, (3, 3), activation="relu")) + model.add(pool_layer_lib.MaxPooling2D(pool_size=(2, 2))) + model.add(regularization_layer_lib.Dropout(0.25)) + model.add(reshaping_layer_lib.Flatten()) + model.add(layer_lib.Dense(128, activation="relu")) + model.add(regularization_layer_lib.Dropout(0.5)) + model.add(layer_lib.Dense(NUM_CLASSES, activation="softmax")) + + # Adding custom pass-through layer for summary recording. + if enable_histograms: + model.add(LayerForHistogramSummary()) + return model @test_utils.run_v2_only class AutoOutsideCompilationWithKerasTest(tf.test.TestCase): - - def setUp(self): - super().setUp() - set_soft_device_placement(True) - self.summary_dir = self.get_temp_dir() - - def validate_recorded_sumary_file(self, event_files, expected_event_counts): - event_counts = collections.defaultdict(int) - for event_file in event_files: - for e in tf.compat.v1.train.summary_iterator(event_file): - for v in e.summary.value: - event_counts[v.tag] += 1 - - event_counts = dict(event_counts) # Avoid defaultdict type in repr below. - # Populate a count of 0 for tags that were expected but not found. - actual_event_counts = { - tag: event_counts.get(tag, 0) for tag in expected_event_counts - } - self.assertEqual( - expected_event_counts, - actual_event_counts, - msg='expected counts not found; all event counts: %r' % event_counts) - - def testV2SummaryWithKerasSequentialModel(self): - # Histogram summaries require the MLIR bridge; see b/178826597#comment107. - # TODO(https://github.com/tensorflow/tensorboard/issues/2885): remove this - # if histogram summaries are supported fully on non-MLIR bridge or - # non-MLIR bridge is no longer run. - enable_histograms = tf_test_utils.is_mlir_bridge_enabled() - strategy = get_tpu_strategy() - - with strategy.scope(): - model = mnist_model((28, 28, 3), enable_histograms=enable_histograms) - model.compile('sgd', 'mse') - - dataset = get_image_dataset() - tensorboard_callback = callbacks.TensorBoard( - self.summary_dir, update_freq=2) - model.fit( - dataset, - steps_per_epoch=10, - epochs=1, - callbacks=[tensorboard_callback]) - - event_files = tf.io.gfile.glob( - os.path.join(self.summary_dir, 'train', 'event*')) - # Since total of 10 steps are ran and summary ops should be invoked - # every 2 batches, we should see total of 5 event logs for each summary. - expected_event_counts = { - 'sequential/layer_for_histogram_summary/custom_histogram_summary_v2': - 5 if enable_histograms else 0, - 'sequential/layer_for_image_summary/custom_image_summary_v2': - 5, - } - self.validate_recorded_sumary_file(event_files, expected_event_counts) - - def testV2SummaryWithKerasSubclassedModel(self): - # Histogram summaries require the MLIR bridge; see b/178826597#comment107. - # TODO(https://github.com/tensorflow/tensorboard/issues/2885): remove this - # if histogram summaries are supported fully on non-MLIR bridge or - # non-MLIR bridge is no longer run. 
- enable_histograms = tf_test_utils.is_mlir_bridge_enabled() - strategy = get_tpu_strategy() - with strategy.scope(): - model = CustomModel(enable_histograms=enable_histograms) - model.compile('sgd', 'mse') - - dataset = distribute_strategy_test.get_dataset(strategy) - tensorboard_callback = callbacks.TensorBoard( - self.summary_dir, update_freq=2) - model.fit( - dataset, - steps_per_epoch=10, - epochs=1, - callbacks=[tensorboard_callback]) - - event_files = tf.io.gfile.glob( - os.path.join(self.summary_dir, 'train', 'event*')) - # Since total of 10 steps are ran and summary ops should be invoked - # every 2 batches, we should see total of 5 event logs for each summary. - expected_event_counts = { - ('custom_model/layer_for_scalar_summary/' - 'custom_scalar_summary_v2'): - 5, - ('custom_model/layer_for_histogram_summary/' - 'custom_histogram_summary_v2'): - 5 if enable_histograms else 0, - } - self.validate_recorded_sumary_file(event_files, expected_event_counts) - - def testSummaryWithCustomTrainingLoop(self): - strategy = get_tpu_strategy() - - writer = tf.summary.create_file_writer(self.summary_dir) - with strategy.scope(): - model = distribute_strategy_test.get_model() - model.compile('sgd', 'mse') - - @tf.function - def custom_function(dataset): - - def _custom_step(features, labels): - del labels - logits = model(features) - with tf.summary.record_if(True), writer.as_default(): - scalar_summary_v2.scalar( - 'logits', - tf.reduce_sum(logits), - step=model.optimizer.iterations) - return logits - - iterator = iter(dataset) - output = strategy.unwrap( - strategy.run(_custom_step, args=(next(iterator)))) - return output - - dataset = strategy.experimental_distribute_dataset( - distribute_strategy_test.get_dataset(strategy)) - - custom_function(dataset) - writer.close() - - event_files = tf.io.gfile.glob( - os.path.join(self.summary_dir, 'event*')) - expected_event_counts = { - 'logits': 1, - } - self.validate_recorded_sumary_file(event_files, expected_event_counts) - - -if __name__ == '__main__': - tf.test.main() + def setUp(self): + super().setUp() + set_soft_device_placement(True) + self.summary_dir = self.get_temp_dir() + + def validate_recorded_sumary_file(self, event_files, expected_event_counts): + event_counts = collections.defaultdict(int) + for event_file in event_files: + for e in tf.compat.v1.train.summary_iterator(event_file): + for v in e.summary.value: + event_counts[v.tag] += 1 + + event_counts = dict( + event_counts + ) # Avoid defaultdict type in repr below. + # Populate a count of 0 for tags that were expected but not found. + actual_event_counts = { + tag: event_counts.get(tag, 0) for tag in expected_event_counts + } + self.assertEqual( + expected_event_counts, + actual_event_counts, + msg="expected counts not found; all event counts: %r" + % event_counts, + ) + + def testV2SummaryWithKerasSequentialModel(self): + # Histogram summaries require the MLIR bridge; see + # b/178826597#comment107. + # TODO(https://github.com/tensorflow/tensorboard/issues/2885): remove + # this if histogram summaries are supported fully on non-MLIR bridge or + # non-MLIR bridge is no longer run. 
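The validate_recorded_sumary_file helper added above tallies how many times each tag occurs across the event files written by the TensorBoard callback. A standalone sketch of that event-counting idea, assuming a hypothetical event file path:

import collections
import tensorflow.compat.v2 as tf

def count_summary_tags(event_file):
    # Walk every Event proto in the file and tally each summary tag.
    counts = collections.defaultdict(int)
    for event in tf.compat.v1.train.summary_iterator(event_file):
        for value in event.summary.value:
            counts[value.tag] += 1
    return dict(counts)

# With update_freq=2 and 10 training steps, each summary op fires every
# other batch, so each tag should appear 5 times:
# count_summary_tags("/tmp/logs/train/events.out.tfevents.123.host")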
+ enable_histograms = tf_test_utils.is_mlir_bridge_enabled() + strategy = get_tpu_strategy() + + with strategy.scope(): + model = mnist_model( + (28, 28, 3), enable_histograms=enable_histograms + ) + model.compile("sgd", "mse") + + dataset = get_image_dataset() + tensorboard_callback = callbacks.TensorBoard( + self.summary_dir, update_freq=2 + ) + model.fit( + dataset, + steps_per_epoch=10, + epochs=1, + callbacks=[tensorboard_callback], + ) + + event_files = tf.io.gfile.glob( + os.path.join(self.summary_dir, "train", "event*") + ) + # Since total of 10 steps are ran and summary ops should be invoked + # every 2 batches, we should see total of 5 event logs for each + # summary. + expected_event_counts = { + "sequential/layer_for_histogram_summary/custom_histogram_summary_v2": 5 # noqa: E501 + if enable_histograms + else 0, + "sequential/layer_for_image_summary/custom_image_summary_v2": 5, + } + self.validate_recorded_sumary_file( + event_files, expected_event_counts + ) + + def testV2SummaryWithKerasSubclassedModel(self): + # Histogram summaries require the MLIR bridge; see + # b/178826597#comment107. + # TODO(https://github.com/tensorflow/tensorboard/issues/2885): remove + # this if histogram summaries are supported fully on non-MLIR bridge or + # non-MLIR bridge is no longer run. + enable_histograms = tf_test_utils.is_mlir_bridge_enabled() + strategy = get_tpu_strategy() + with strategy.scope(): + model = CustomModel(enable_histograms=enable_histograms) + model.compile("sgd", "mse") + + dataset = distribute_strategy_test.get_dataset(strategy) + tensorboard_callback = callbacks.TensorBoard( + self.summary_dir, update_freq=2 + ) + model.fit( + dataset, + steps_per_epoch=10, + epochs=1, + callbacks=[tensorboard_callback], + ) + + event_files = tf.io.gfile.glob( + os.path.join(self.summary_dir, "train", "event*") + ) + # Since total of 10 steps are ran and summary ops should be invoked + # every 2 batches, we should see total of 5 event logs for each + # summary. 
+ expected_event_counts = { + ( + "custom_model/layer_for_scalar_summary/" + "custom_scalar_summary_v2" + ): 5, + ( + "custom_model/layer_for_histogram_summary/" + "custom_histogram_summary_v2" + ): 5 + if enable_histograms + else 0, + } + self.validate_recorded_sumary_file( + event_files, expected_event_counts + ) + + def testSummaryWithCustomTrainingLoop(self): + strategy = get_tpu_strategy() + + writer = tf.summary.create_file_writer(self.summary_dir) + with strategy.scope(): + model = distribute_strategy_test.get_model() + model.compile("sgd", "mse") + + @tf.function + def custom_function(dataset): + def _custom_step(features, labels): + del labels + logits = model(features) + with tf.summary.record_if(True), writer.as_default(): + scalar_summary_v2.scalar( + "logits", + tf.reduce_sum(logits), + step=model.optimizer.iterations, + ) + return logits + + iterator = iter(dataset) + output = strategy.unwrap( + strategy.run(_custom_step, args=(next(iterator))) + ) + return output + + dataset = strategy.experimental_distribute_dataset( + distribute_strategy_test.get_dataset(strategy) + ) + + custom_function(dataset) + writer.close() + + event_files = tf.io.gfile.glob(os.path.join(self.summary_dir, "event*")) + expected_event_counts = { + "logits": 1, + } + self.validate_recorded_sumary_file(event_files, expected_event_counts) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/convert_to_constants_test.py b/keras/tests/convert_to_constants_test.py index 8e5a6425f0bc..bb743c84103b 100644 --- a/keras/tests/convert_to_constants_test.py +++ b/keras/tests/convert_to_constants_test.py @@ -14,157 +14,167 @@ # ============================================================================== """Tests for convert_to_constants.py.""" -import tensorflow.compat.v2 as tf - import os import numpy as np +import tensorflow.compat.v2 as tf import keras -from tensorflow.python.framework import convert_to_constants from keras.testing_infra import test_utils + +# isort: off +from tensorflow.python.framework import convert_to_constants from tensorflow.python.saved_model.load import load from tensorflow.python.saved_model.save import save class VariablesToConstantsTest(tf.test.TestCase): - - def _freezeModel(self, model): - """Freezes the model. - - Args: - model: Function. - - Returns: - root: AutoTrackable object with original ConcreteFunction. - output_func: frozen ConcreteFunction. - """ - root = tf.Module() - root.f = model - input_func = root.f.get_concrete_function() - - output_func = convert_to_constants.convert_variables_to_constants_v2( - input_func, lower_control_flow=False) - return root, output_func - - def _hasStatefulPartitionedCallOp(self, graph_def): - """Determines if a StatefulPartitionedCall op exists in the graph.""" - for node in graph_def.node: - if node.op == "StatefulPartitionedCall": - return True - return False - - def _getNumVariables(self, graph_def): - """Returns the number of ReadVariableOp in the graph.""" - return sum(node.op == "ReadVariableOp" for node in graph_def.node) - - def _testConvertedFunction(self, obj, func, converted_concrete_func, - input_data): - # Ensure the converted graph has no variables and no function calls. - constant_graph_def = converted_concrete_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - - # Check that the converted ConcreteFunction produces the same result as the - # original Function. 
- expected_value = tf.nest.flatten(func(**input_data)) - actual_value = tf.nest.flatten(converted_concrete_func(**input_data)) - - for expected, actual in zip(expected_value, actual_value): - np.testing.assert_almost_equal(expected.numpy(), actual.numpy()) - - # Ensure the shape is retained. - for tensor in converted_concrete_func.inputs: - actual_shape = input_data[tensor.name.split(":")[0]].shape - self.assertEqual(tensor.shape, actual_shape) - - # Save the converted ConcreteFunction as a signature. - save_dir = os.path.join(self.get_temp_dir(), "frozen_saved_model") - root = tf.Module() - root.f = converted_concrete_func - save(root, save_dir, {"mykey": converted_concrete_func}) - - # Load it back and make sure it works. - loaded_obj = load(save_dir) - actual_value = tf.nest.flatten(loaded_obj.signatures["mykey"](**input_data)) - for expected, actual in zip(expected_value, actual_value): - np.testing.assert_almost_equal(expected.numpy(), actual.numpy()) - - @test_utils.run_v2_only - def testKerasModel(self): - """Test a basic Keras model with Variables.""" - input_data = {"x": tf.constant(1., shape=[1, 1])} - - # Create a simple Keras model. - x = [-1, 0, 1, 2, 3, 4] - y = [-3, -1, 1, 3, 5, 7] - - model = keras.models.Sequential( - [keras.layers.Dense(units=1, input_shape=[1])]) - model.compile(optimizer="sgd", loss="mean_squared_error") - model.fit(x, y, epochs=1) - - @tf.function(input_signature=[ - tf.TensorSpec(shape=[1, 1], dtype=tf.float32) - ]) - def to_save(x): - return model(x) - - root, output_func = self._freezeModel(to_save) - self._testConvertedFunction(root, root.f, output_func, input_data) - - @test_utils.run_v2_only - def testKerasLSTM(self): - """Test a Keras LSTM containing dynamic_rnn ops.""" - input_data = { - "x": - tf.constant( + def _freezeModel(self, model): + """Freezes the model. + + Args: + model: Function. + + Returns: + root: AutoTrackable object with original ConcreteFunction. + output_func: frozen ConcreteFunction. + """ + root = tf.Module() + root.f = model + input_func = root.f.get_concrete_function() + + output_func = convert_to_constants.convert_variables_to_constants_v2( + input_func, lower_control_flow=False + ) + return root, output_func + + def _hasStatefulPartitionedCallOp(self, graph_def): + """Determines if a StatefulPartitionedCall op exists in the graph.""" + for node in graph_def.node: + if node.op == "StatefulPartitionedCall": + return True + return False + + def _getNumVariables(self, graph_def): + """Returns the number of ReadVariableOp in the graph.""" + return sum(node.op == "ReadVariableOp" for node in graph_def.node) + + def _testConvertedFunction( + self, obj, func, converted_concrete_func, input_data + ): + # Ensure the converted graph has no variables and no function calls. + constant_graph_def = converted_concrete_func.graph.as_graph_def() + self.assertEqual(0, self._getNumVariables(constant_graph_def)) + self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) + + # Check that the converted ConcreteFunction produces the same result as + # the original Function. + expected_value = tf.nest.flatten(func(**input_data)) + actual_value = tf.nest.flatten(converted_concrete_func(**input_data)) + + for expected, actual in zip(expected_value, actual_value): + np.testing.assert_almost_equal(expected.numpy(), actual.numpy()) + + # Ensure the shape is retained. 
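The _freezeModel helper wraps convert_variables_to_constants_v2, which rewrites a traced ConcreteFunction so that every variable read becomes an embedded constant. A self-contained sketch of that conversion on a one-variable function; the module and values are illustrative:

import tensorflow.compat.v2 as tf
from tensorflow.python.framework import convert_to_constants

root = tf.Module()
root.v = tf.Variable(3.0)
root.f = tf.function(
    lambda x: root.v * x,
    input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)],
)
concrete_func = root.f.get_concrete_function()

# After conversion the graph contains no ReadVariableOp nodes; the
# variable's current value is baked in as a Const, which is what the
# _getNumVariables and _hasStatefulPartitionedCallOp assertions check.
frozen_func = convert_to_constants.convert_variables_to_constants_v2(
    concrete_func, lower_control_flow=False
)
print(frozen_func(tf.constant([1.0, 2.0])))  # -> [3.0, 6.0]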
+ for tensor in converted_concrete_func.inputs: + actual_shape = input_data[tensor.name.split(":")[0]].shape + self.assertEqual(tensor.shape, actual_shape) + + # Save the converted ConcreteFunction as a signature. + save_dir = os.path.join(self.get_temp_dir(), "frozen_saved_model") + root = tf.Module() + root.f = converted_concrete_func + save(root, save_dir, {"mykey": converted_concrete_func}) + + # Load it back and make sure it works. + loaded_obj = load(save_dir) + actual_value = tf.nest.flatten( + loaded_obj.signatures["mykey"](**input_data) + ) + for expected, actual in zip(expected_value, actual_value): + np.testing.assert_almost_equal(expected.numpy(), actual.numpy()) + + @test_utils.run_v2_only + def testKerasModel(self): + """Test a basic Keras model with Variables.""" + input_data = {"x": tf.constant(1.0, shape=[1, 1])} + + # Create a simple Keras model. + x = [-1, 0, 1, 2, 3, 4] + y = [-3, -1, 1, 3, 5, 7] + + model = keras.models.Sequential( + [keras.layers.Dense(units=1, input_shape=[1])] + ) + model.compile(optimizer="sgd", loss="mean_squared_error") + model.fit(x, y, epochs=1) + + @tf.function( + input_signature=[tf.TensorSpec(shape=[1, 1], dtype=tf.float32)] + ) + def to_save(x): + return model(x) + + root, output_func = self._freezeModel(to_save) + self._testConvertedFunction(root, root.f, output_func, input_data) + + @test_utils.run_v2_only + def testKerasLSTM(self): + """Test a Keras LSTM containing dynamic_rnn ops.""" + input_data = { + "x": tf.constant( np.array( - np.random.random_sample((10, 10, 10)), dtype=np.float32)) - } - - model = keras.models.Sequential( - [keras.layers.LSTM(units=10, input_shape=(10, 10))]) - - @tf.function(input_signature=[ - tf.TensorSpec(shape=[10, 10, 10], dtype=tf.float32) - ]) - def to_save(x): - return model(x) - - root, output_func = self._freezeModel(to_save) - self._testConvertedFunction(root, root.f, output_func, input_data) - - @test_utils.run_v2_only - def testEmbeddings(self): - """Test model with embeddings.""" - input_data = { - "x": - tf.constant( - np.array(np.random.random_sample((20)), dtype=np.int32)) - } - - class EmbeddingModel(keras.Model): - - def __init__(self): - super().__init__() - self.shared_weights = self.add_weight( - "weights", - shape=(2000, 300), - dtype=tf.float32, - initializer=tf.compat.v1.random_normal_initializer( - mean=0.0, stddev=300**(-0.5))) - - @tf.function(input_signature=[ - tf.TensorSpec(shape=(20), dtype=tf.int32) - ]) - def func(self, x): - return tf.gather(self.shared_weights, x) - - model = EmbeddingModel() - root, output_func = self._freezeModel(model.func) - self._testConvertedFunction(root, root.f, output_func, input_data) + np.random.random_sample((10, 10, 10)), dtype=np.float32 + ) + ) + } + + model = keras.models.Sequential( + [keras.layers.LSTM(units=10, input_shape=(10, 10))] + ) + + @tf.function( + input_signature=[ + tf.TensorSpec(shape=[10, 10, 10], dtype=tf.float32) + ] + ) + def to_save(x): + return model(x) + + root, output_func = self._freezeModel(to_save) + self._testConvertedFunction(root, root.f, output_func, input_data) + + @test_utils.run_v2_only + def testEmbeddings(self): + """Test model with embeddings.""" + input_data = { + "x": tf.constant( + np.array(np.random.random_sample((20)), dtype=np.int32) + ) + } + + class EmbeddingModel(keras.Model): + def __init__(self): + super().__init__() + self.shared_weights = self.add_weight( + "weights", + shape=(2000, 300), + dtype=tf.float32, + initializer=tf.compat.v1.random_normal_initializer( + mean=0.0, stddev=300 ** (-0.5) + 
), + ) + + @tf.function( + input_signature=[tf.TensorSpec(shape=(20), dtype=tf.int32)] + ) + def func(self, x): + return tf.gather(self.shared_weights, x) + + model = EmbeddingModel() + root, output_func = self._freezeModel(model.func) + self._testConvertedFunction(root, root.f, output_func, input_data) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/tests/custom_training_loop_test.py b/keras/tests/custom_training_loop_test.py index 891633cd4dd7..c9be92dbf2ea 100644 --- a/keras/tests/custom_training_loop_test.py +++ b/keras/tests/custom_training_loop_test.py @@ -14,10 +14,9 @@ # ============================================================================== """Tests for custom training loops.""" +import numpy as np import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy as np import keras from keras.testing_infra import test_combinations @@ -25,211 +24,220 @@ class LayerWithLosses(keras.layers.Layer): + def build(self, input_shape): + self.v = self.add_weight( + name="hey", + shape=(), + initializer="ones", + regularizer=keras.regularizers.l1(100), + ) - def build(self, input_shape): - self.v = self.add_weight( - name='hey', - shape=(), - initializer='ones', - regularizer=keras.regularizers.l1(100)) - - def call(self, inputs): - self.add_loss(tf.reduce_sum(inputs)) - return self.v * inputs + def call(self, inputs): + self.add_loss(tf.reduce_sum(inputs)) + return self.v * inputs class LayerWithMetrics(keras.layers.Layer): + def build(self, input_shape): + self.mean = keras.metrics.Mean(name="mean_object") - def build(self, input_shape): - self.mean = keras.metrics.Mean(name='mean_object') - - def call(self, inputs): - self.add_metric( - tf.reduce_mean(inputs), name='mean_tensor', aggregation='mean') - self.add_metric(self.mean(inputs)) - return inputs + def call(self, inputs): + self.add_metric( + tf.reduce_mean(inputs), name="mean_tensor", aggregation="mean" + ) + self.add_metric(self.mean(inputs)) + return inputs class LayerWithTrainingArg(keras.layers.Layer): - - def call(self, inputs, training=None): - self.training = training - if training: - return inputs - else: - return 0. 
* inputs + def call(self, inputs, training=None): + self.training = training + if training: + return inputs + else: + return 0.0 * inputs def add_loss_step(defun): - optimizer = keras.optimizers.optimizer_v2.adam.Adam() - model = test_utils.get_model_from_layers([LayerWithLosses()], - input_shape=(10,)) - - def train_step(x): - with tf.GradientTape() as tape: - model(x) - assert len(model.losses) == 2 - loss = tf.reduce_sum(model.losses) - gradients = tape.gradient(loss, model.trainable_weights) - optimizer.apply_gradients(zip(gradients, model.trainable_weights)) - return loss - - if defun: - train_step = tf.function(train_step) - - x = tf.ones((10, 10)) - return train_step(x) - - -def batch_norm_step(defun): - optimizer = keras.optimizers.optimizer_v2.adadelta.Adadelta() - model = test_utils.get_model_from_layers([ - keras.layers.BatchNormalization(momentum=0.9), - keras.layers.Dense(1, kernel_initializer='zeros', activation='softmax') - ], input_shape=(10,)) - - def train_step(x, y): - with tf.GradientTape() as tape: - y_pred = model(x, training=True) - loss = keras.losses.binary_crossentropy(y, y_pred) - gradients = tape.gradient(loss, model.trainable_weights) - optimizer.apply_gradients(zip(gradients, model.trainable_weights)) - return loss, model(x, training=False) - - if defun: - train_step = tf.function(train_step) - - x, y = tf.ones((10, 10)), tf.ones((10, 1)) - return train_step(x, y) - - -def add_metric_step(defun): - optimizer = keras.optimizers.optimizer_v2.rmsprop.RMSprop() - model = test_utils.get_model_from_layers([ - LayerWithMetrics(), - keras.layers.Dense(1, kernel_initializer='zeros', activation='softmax') - ], input_shape=(10,)) - - def train_step(x, y): - with tf.GradientTape() as tape: - y_pred_1 = model(x) - y_pred_2 = model(2 * x) - y_pred = y_pred_1 + y_pred_2 - loss = keras.losses.mean_squared_error(y, y_pred) - gradients = tape.gradient(loss, model.trainable_weights) - optimizer.apply_gradients(zip(gradients, model.trainable_weights)) - assert len(model.metrics) == 2 - return [m.result() for m in model.metrics] - - if defun: - train_step = tf.function(train_step) - - x, y = tf.ones((10, 10)), tf.zeros((10, 1)) - metrics = train_step(x, y) - assert np.allclose(metrics[0], 1.5) - assert np.allclose(metrics[1], 1.5) - return metrics - - -@test_combinations.run_with_all_model_types -class CustomTrainingLoopTest(test_combinations.TestCase): - - @parameterized.named_parameters(('add_loss_step', add_loss_step), - ('add_metric_step', add_metric_step), - ('batch_norm_step', batch_norm_step)) - def test_eager_and_tf_function(self, train_step): - eager_result = train_step(defun=False) - fn_result = train_step(defun=True) - self.assertAllClose(eager_result, fn_result) - - @parameterized.named_parameters(('eager', False), ('defun', True)) - def test_training_arg_propagation(self, defun): - - model = test_utils.get_model_from_layers([LayerWithTrainingArg()], - input_shape=(1,)) + optimizer = keras.optimizers.legacy.adam.Adam() + model = test_utils.get_model_from_layers( + [LayerWithLosses()], input_shape=(10,) + ) def train_step(x): - return model(x), model(x, training=False), model(x, training=True) + with tf.GradientTape() as tape: + model(x) + assert len(model.losses) == 2 + loss = tf.reduce_sum(model.losses) + gradients = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(gradients, model.trainable_weights)) + return loss if defun: - train_step = tf.function(train_step) + train_step = tf.function(train_step) - x = tf.ones((1, 1)) - results = 
train_step(x) - self.assertAllClose(results[0], tf.zeros((1, 1))) - self.assertAllClose(results[1], tf.zeros((1, 1))) - self.assertAllClose(results[2], tf.ones((1, 1))) + x = tf.ones((10, 10)) + return train_step(x) - @parameterized.named_parameters(('eager', False), ('defun', True)) - def test_learning_phase_propagation(self, defun): - class MyModel(keras.layers.Layer): - - def __init__(self): - super().__init__() - self.layer = LayerWithTrainingArg() - - def call(self, inputs): - return self.layer(inputs) - - model = MyModel() - - def train_step(x): - no_learning_phase_out = model(x) - self.assertFalse(model.layer.training) - with keras.backend.learning_phase_scope(0): - inf_learning_phase_out = model(x) - self.assertEqual(model.layer.training, 0) - with keras.backend.learning_phase_scope(1): - train_learning_phase_out = model(x) - self.assertEqual(model.layer.training, 1) - return [ - no_learning_phase_out, inf_learning_phase_out, - train_learning_phase_out - ] +def batch_norm_step(defun): + optimizer = keras.optimizers.legacy.adadelta.Adadelta() + model = test_utils.get_model_from_layers( + [ + keras.layers.BatchNormalization(momentum=0.9), + keras.layers.Dense( + 1, kernel_initializer="zeros", activation="softmax" + ), + ], + input_shape=(10,), + ) + + def train_step(x, y): + with tf.GradientTape() as tape: + y_pred = model(x, training=True) + loss = keras.losses.binary_crossentropy(y, y_pred) + gradients = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(gradients, model.trainable_weights)) + return loss, model(x, training=False) if defun: - train_step = tf.function(train_step) - - x = tf.ones((1, 1)) - results = train_step(x) - self.assertAllClose(results[0], tf.zeros((1, 1))) - self.assertAllClose(results[1], tf.zeros((1, 1))) - self.assertAllClose(results[2], tf.ones((1, 1))) - - @parameterized.named_parameters(('eager', False), ('defun', True)) - def test_training_arg_priorities(self, defun): - - class MyModel(keras.layers.Layer): - - def __init__(self): - super().__init__() - self.layer = LayerWithTrainingArg() + train_step = tf.function(train_step) - def call(self, inputs, training=False): - return self.layer(inputs) + x, y = tf.ones((10, 10)), tf.ones((10, 1)) + return train_step(x, y) - model = MyModel() - def train_step(x): - explicit_out = model(x, training=True) - default_out = model(x) - with keras.backend.learning_phase_scope(1): - parent_out = model(x, training=False) - lr_out = model(x) - return [explicit_out, default_out, parent_out, lr_out] +def add_metric_step(defun): + optimizer = keras.optimizers.legacy.rmsprop.RMSprop() + model = test_utils.get_model_from_layers( + [ + LayerWithMetrics(), + keras.layers.Dense( + 1, kernel_initializer="zeros", activation="softmax" + ), + ], + input_shape=(10,), + ) + + def train_step(x, y): + with tf.GradientTape() as tape: + y_pred_1 = model(x) + y_pred_2 = model(2 * x) + y_pred = y_pred_1 + y_pred_2 + loss = keras.losses.mean_squared_error(y, y_pred) + gradients = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(gradients, model.trainable_weights)) + assert len(model.metrics) == 2 + return [m.result() for m in model.metrics] if defun: - train_step = tf.function(train_step) + train_step = tf.function(train_step) - x = tf.ones((1, 1)) - results = train_step(x) - self.assertAllClose(results[0], tf.ones((1, 1))) - self.assertAllClose(results[1], tf.zeros((1, 1))) - self.assertAllClose(results[2], tf.zeros((1, 1))) - self.assertAllClose(results[3], tf.ones((1, 1))) + x, y = 
tf.ones((10, 10)), tf.zeros((10, 1)) + metrics = train_step(x, y) + assert np.allclose(metrics[0], 1.5) + assert np.allclose(metrics[1], 1.5) + return metrics -if __name__ == '__main__': - tf.compat.v1.enable_eager_execution() - tf.test.main() +@test_combinations.run_with_all_model_types +class CustomTrainingLoopTest(test_combinations.TestCase): + @parameterized.named_parameters( + ("add_loss_step", add_loss_step), + ("add_metric_step", add_metric_step), + ("batch_norm_step", batch_norm_step), + ) + def test_eager_and_tf_function(self, train_step): + eager_result = train_step(defun=False) + fn_result = train_step(defun=True) + self.assertAllClose(eager_result, fn_result) + + @parameterized.named_parameters(("eager", False), ("defun", True)) + def test_training_arg_propagation(self, defun): + + model = test_utils.get_model_from_layers( + [LayerWithTrainingArg()], input_shape=(1,) + ) + + def train_step(x): + return model(x), model(x, training=False), model(x, training=True) + + if defun: + train_step = tf.function(train_step) + + x = tf.ones((1, 1)) + results = train_step(x) + self.assertAllClose(results[0], tf.zeros((1, 1))) + self.assertAllClose(results[1], tf.zeros((1, 1))) + self.assertAllClose(results[2], tf.ones((1, 1))) + + @parameterized.named_parameters(("eager", False), ("defun", True)) + def test_learning_phase_propagation(self, defun): + class MyModel(keras.layers.Layer): + def __init__(self): + super().__init__() + self.layer = LayerWithTrainingArg() + + def call(self, inputs): + return self.layer(inputs) + + model = MyModel() + + def train_step(x): + no_learning_phase_out = model(x) + self.assertFalse(model.layer.training) + with keras.backend.learning_phase_scope(0): + inf_learning_phase_out = model(x) + self.assertEqual(model.layer.training, 0) + with keras.backend.learning_phase_scope(1): + train_learning_phase_out = model(x) + self.assertEqual(model.layer.training, 1) + return [ + no_learning_phase_out, + inf_learning_phase_out, + train_learning_phase_out, + ] + + if defun: + train_step = tf.function(train_step) + + x = tf.ones((1, 1)) + results = train_step(x) + self.assertAllClose(results[0], tf.zeros((1, 1))) + self.assertAllClose(results[1], tf.zeros((1, 1))) + self.assertAllClose(results[2], tf.ones((1, 1))) + + @parameterized.named_parameters(("eager", False), ("defun", True)) + def test_training_arg_priorities(self, defun): + class MyModel(keras.layers.Layer): + def __init__(self): + super().__init__() + self.layer = LayerWithTrainingArg() + + def call(self, inputs, training=False): + return self.layer(inputs) + + model = MyModel() + + def train_step(x): + explicit_out = model(x, training=True) + default_out = model(x) + with keras.backend.learning_phase_scope(1): + parent_out = model(x, training=False) + lr_out = model(x) + return [explicit_out, default_out, parent_out, lr_out] + + if defun: + train_step = tf.function(train_step) + + x = tf.ones((1, 1)) + results = train_step(x) + self.assertAllClose(results[0], tf.ones((1, 1))) + self.assertAllClose(results[1], tf.zeros((1, 1))) + self.assertAllClose(results[2], tf.zeros((1, 1))) + self.assertAllClose(results[3], tf.ones((1, 1))) + + +if __name__ == "__main__": + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/tests/get_config_samples.py b/keras/tests/get_config_samples.py index 3ef1b630264c..12f9f7df84ed 100644 --- a/keras/tests/get_config_samples.py +++ b/keras/tests/get_config_samples.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # 
limitations under the License. # ============================================================================== -# pylint: disable=protected-access + """Sample `get_config` results for testing backwards compatibility.""" # inputs = tf.keras.Input(10) @@ -20,75 +20,69 @@ # outputs = tf.keras.layers.Dense(1)(x) # model = tf.keras.Model(inputs, outputs) FUNCTIONAL_DNN = { - 'input_layers': [['input_1', 0, 0]], - 'layers': [{ - 'class_name': 'InputLayer', - 'config': { - 'batch_input_shape': (None, 10), - 'dtype': 'float32', - 'name': 'input_1', - 'ragged': False, - 'sparse': False - }, - 'inbound_nodes': [], - 'name': 'input_1' - }, { - 'class_name': 'Dense', - 'config': { - 'activation': 'relu', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} + "input_layers": [["input_1", 0, 0]], + "layers": [ + { + "class_name": "InputLayer", + "config": { + "batch_input_shape": (None, 10), + "dtype": "float32", + "name": "input_1", + "ragged": False, + "sparse": False, }, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } - }, - 'kernel_regularizer': None, - 'name': 'dense', - 'trainable': True, - 'units': 10, - 'use_bias': True + "inbound_nodes": [], + "name": "input_1", }, - 'inbound_nodes': [[['input_1', 0, 0, {}]]], - 'name': 'dense' - }, { - 'class_name': 'Dense', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} + { + "class_name": "Dense", + "config": { + "activation": "relu", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense", + "trainable": True, + "units": 10, + "use_bias": True, }, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + "inbound_nodes": [[["input_1", 0, 0, {}]]], + "name": "dense", + }, + { + "class_name": "Dense", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense_1", + "trainable": True, + "units": 1, + "use_bias": True, }, - 'kernel_regularizer': None, - 'name': 'dense_1', - 'trainable': True, - 'units': 1, - 'use_bias': True + "inbound_nodes": [[["dense", 0, 0, {}]]], + "name": "dense_1", }, - 'inbound_nodes': [[['dense', 0, 0, {}]]], - 'name': 'dense_1' - }], - 'name': 'model', - 'output_layers': [['dense_1', 0, 0]] + ], + "name": "model", + "output_layers": [["dense_1", 0, 0]], } # inputs = tf.keras.Input((256, 256, 3)) @@ -97,90 +91,85 @@ # outputs = tf.keras.layers.Dense(1)(x) # model = tf.keras.Model(inputs, outputs) FUNCTIONAL_CNN = { - 'input_layers': [['input_2', 0, 0]], - 'layers': [{ - 'class_name': 'InputLayer', - 'config': { - 'batch_input_shape': (None, 256, 256, 3), - 'dtype': 'float32', - 'name': 'input_2', - 'ragged': False, - 'sparse': 
False - }, - 'inbound_nodes': [], - 'name': 'input_2' - }, { - 'class_name': 'Conv2D', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} - }, - 'bias_regularizer': None, - 'data_format': 'channels_last', - 'dilation_rate': (1, 1), - 'dtype': 'float32', - 'filters': 3, - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + "input_layers": [["input_2", 0, 0]], + "layers": [ + { + "class_name": "InputLayer", + "config": { + "batch_input_shape": (None, 256, 256, 3), + "dtype": "float32", + "name": "input_2", + "ragged": False, + "sparse": False, }, - 'kernel_regularizer': None, - 'kernel_size': (3, 3), - 'name': 'conv2d', - 'padding': 'valid', - 'strides': (1, 1), - 'trainable': True, - 'use_bias': True + "inbound_nodes": [], + "name": "input_2", }, - 'inbound_nodes': [[['input_2', 0, 0, {}]]], - 'name': 'conv2d' - }, { - 'class_name': 'Flatten', - 'config': { - 'data_format': 'channels_last', - 'dtype': 'float32', - 'name': 'flatten', - 'trainable': True + { + "class_name": "Conv2D", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "data_format": "channels_last", + "dilation_rate": (1, 1), + "dtype": "float32", + "filters": 3, + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "kernel_size": (3, 3), + "name": "conv2d", + "padding": "valid", + "strides": (1, 1), + "trainable": True, + "use_bias": True, + }, + "inbound_nodes": [[["input_2", 0, 0, {}]]], + "name": "conv2d", }, - 'inbound_nodes': [[['conv2d', 0, 0, {}]]], - 'name': 'flatten' - }, { - 'class_name': 'Dense', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} + { + "class_name": "Flatten", + "config": { + "data_format": "channels_last", + "dtype": "float32", + "name": "flatten", + "trainable": True, }, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + "inbound_nodes": [[["conv2d", 0, 0, {}]]], + "name": "flatten", + }, + { + "class_name": "Dense", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense_2", + "trainable": True, + "units": 1, + "use_bias": True, }, - 'kernel_regularizer': None, - 'name': 'dense_2', - 'trainable': True, - 'units': 1, - 'use_bias': True + "inbound_nodes": [[["flatten", 0, 0, {}]]], + "name": "dense_2", }, - 'inbound_nodes': [[['flatten', 0, 0, {}]]], - 'name': 'dense_2' - }], - 'name': 'model_1', - 'output_layers': [['dense_2', 0, 0]] + ], + "name": "model_1", + "output_layers": [["dense_2", 0, 0]], } # inputs = tf.keras.Input((10, 3)) @@ -188,153 +177,137 @@ # outputs = tf.keras.layers.Dense(1)(x) # model = tf.keras.Model(inputs, outputs) FUNCTIONAL_LSTM = { - 'input_layers': [['input_5', 0, 0]], - 'layers': [{ - 'class_name': 'InputLayer', 
- 'config': { - 'batch_input_shape': (None, 10, 3), - 'dtype': 'float32', - 'name': 'input_5', - 'ragged': False, - 'sparse': False - }, - 'inbound_nodes': [], - 'name': 'input_5' - }, { - 'class_name': 'LSTM', - 'config': { - 'activation': 'tanh', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} - }, - 'bias_regularizer': None, - 'dropout': 0.0, - 'dtype': 'float32', - 'go_backwards': False, - 'implementation': 2, - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + "input_layers": [["input_5", 0, 0]], + "layers": [ + { + "class_name": "InputLayer", + "config": { + "batch_input_shape": (None, 10, 3), + "dtype": "float32", + "name": "input_5", + "ragged": False, + "sparse": False, }, - 'kernel_regularizer': None, - 'name': 'lstm_2', - 'recurrent_activation': 'sigmoid', - 'recurrent_constraint': None, - 'recurrent_dropout': 0.0, - 'recurrent_initializer': { - 'class_name': 'Orthogonal', - 'config': { - 'gain': 1.0, - 'seed': None - } - }, - 'recurrent_regularizer': None, - 'return_sequences': False, - 'return_state': False, - 'stateful': False, - 'time_major': False, - 'trainable': True, - 'unit_forget_bias': True, - 'units': 10, - 'unroll': False, - 'use_bias': True + "inbound_nodes": [], + "name": "input_5", }, - 'inbound_nodes': [[['input_5', 0, 0, {}]]], - 'name': 'lstm_2' - }, { - 'class_name': 'Dense', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} + { + "class_name": "LSTM", + "config": { + "activation": "tanh", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dropout": 0.0, + "dtype": "float32", + "go_backwards": False, + "implementation": 2, + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "lstm_2", + "recurrent_activation": "sigmoid", + "recurrent_constraint": None, + "recurrent_dropout": 0.0, + "recurrent_initializer": { + "class_name": "Orthogonal", + "config": {"gain": 1.0, "seed": None}, + }, + "recurrent_regularizer": None, + "return_sequences": False, + "return_state": False, + "stateful": False, + "time_major": False, + "trainable": True, + "unit_forget_bias": True, + "units": 10, + "unroll": False, + "use_bias": True, }, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + "inbound_nodes": [[["input_5", 0, 0, {}]]], + "name": "lstm_2", + }, + { + "class_name": "Dense", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense_4", + "trainable": True, + "units": 1, + "use_bias": True, }, - 'kernel_regularizer': None, - 'name': 'dense_4', - 'trainable': True, - 'units': 1, - 'use_bias': True + "inbound_nodes": [[["lstm_2", 0, 0, {}]]], + "name": "dense_4", }, - 'inbound_nodes': [[['lstm_2', 0, 0, {}]]], - 'name': 'dense_4' - }], - 'name': 'model_3', - 'output_layers': 
[['dense_4', 0, 0]] + ], + "name": "model_3", + "output_layers": [["dense_4", 0, 0]], } # model = tf.keras.Sequential() # model.add(tf.keras.layers.Dense(10)) # model.add(tf.keras.layers.Dense(1)) SEQUENTIAL_DNN = { - 'layers': [{ - 'class_name': 'Dense', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} - }, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + "layers": [ + { + "class_name": "Dense", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense_2", + "trainable": True, + "units": 10, + "use_bias": True, }, - 'kernel_regularizer': None, - 'name': 'dense_2', - 'trainable': True, - 'units': 10, - 'use_bias': True - } - }, { - 'class_name': 'Dense', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} - }, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + }, + { + "class_name": "Dense", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense_3", + "trainable": True, + "units": 1, + "use_bias": True, }, - 'kernel_regularizer': None, - 'name': 'dense_3', - 'trainable': True, - 'units': 1, - 'use_bias': True - } - }], - 'name': 'sequential_1' + }, + ], + "name": "sequential_1", } # model = tf.keras.Sequential() @@ -342,147 +315,131 @@ # model.add(tf.keras.layers.Flatten()) # model.add(tf.keras.layers.Dense(1)) SEQUENTIAL_CNN = { - 'layers': [{ - 'class_name': 'Conv2D', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} + "layers": [ + { + "class_name": "Conv2D", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "data_format": "channels_last", + "dilation_rate": (1, 1), + "dtype": "float32", + "filters": 32, + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "kernel_size": (3, 3), + "name": "conv2d_1", + "padding": "valid", + "strides": (1, 1), + "trainable": True, + "use_bias": True, }, - 'bias_regularizer': None, - 'data_format': 'channels_last', - 'dilation_rate': (1, 1), - 'dtype': 'float32', - 'filters': 32, - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } - }, - 'kernel_regularizer': None, - 'kernel_size': (3, 3), - 'name': 'conv2d_1', - 'padding': 'valid', - 'strides': (1, 
1), - 'trainable': True, - 'use_bias': True - } - }, { - 'class_name': 'Flatten', - 'config': { - 'data_format': 'channels_last', - 'dtype': 'float32', - 'name': 'flatten_1', - 'trainable': True - } - }, { - 'class_name': 'Dense', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} + }, + { + "class_name": "Flatten", + "config": { + "data_format": "channels_last", + "dtype": "float32", + "name": "flatten_1", + "trainable": True, }, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + }, + { + "class_name": "Dense", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense_6", + "trainable": True, + "units": 1, + "use_bias": True, }, - 'kernel_regularizer': None, - 'name': 'dense_6', - 'trainable': True, - 'units': 1, - 'use_bias': True - } - }], - 'name': 'sequential_4' + }, + ], + "name": "sequential_4", } # model = tf.keras.Sequential() # model.add(tf.keras.layers.LSTM(10)) # model.add(tf.keras.layers.Dense(1)) SEQUENTIAL_LSTM = { - 'layers': [{ - 'class_name': 'LSTM', - 'config': { - 'activation': 'tanh', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} - }, - 'bias_regularizer': None, - 'dropout': 0.0, - 'dtype': 'float32', - 'go_backwards': False, - 'implementation': 2, - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } - }, - 'kernel_regularizer': None, - 'name': 'lstm', - 'recurrent_activation': 'sigmoid', - 'recurrent_constraint': None, - 'recurrent_dropout': 0.0, - 'recurrent_initializer': { - 'class_name': 'Orthogonal', - 'config': { - 'gain': 1.0, - 'seed': None - } + "layers": [ + { + "class_name": "LSTM", + "config": { + "activation": "tanh", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dropout": 0.0, + "dtype": "float32", + "go_backwards": False, + "implementation": 2, + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "lstm", + "recurrent_activation": "sigmoid", + "recurrent_constraint": None, + "recurrent_dropout": 0.0, + "recurrent_initializer": { + "class_name": "Orthogonal", + "config": {"gain": 1.0, "seed": None}, + }, + "recurrent_regularizer": None, + "return_sequences": False, + "return_state": False, + "stateful": False, + "time_major": False, + "trainable": True, + "unit_forget_bias": True, + "units": 10, + "unroll": False, + "use_bias": True, }, - 'recurrent_regularizer': None, - 'return_sequences': False, - 'return_state': False, - 'stateful': False, - 'time_major': False, - 'trainable': True, - 'unit_forget_bias': True, - 'units': 10, - 'unroll': False, - 'use_bias': True - } - }, { - 'class_name': 'Dense', - 'config': { - 'activation': 'linear', - 'activity_regularizer': None, - 'bias_constraint': None, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': {} - 
}, - 'bias_regularizer': None, - 'dtype': 'float32', - 'kernel_constraint': None, - 'kernel_initializer': { - 'class_name': 'GlorotUniform', - 'config': { - 'seed': None - } + }, + { + "class_name": "Dense", + "config": { + "activation": "linear", + "activity_regularizer": None, + "bias_constraint": None, + "bias_initializer": {"class_name": "Zeros", "config": {}}, + "bias_regularizer": None, + "dtype": "float32", + "kernel_constraint": None, + "kernel_initializer": { + "class_name": "GlorotUniform", + "config": {"seed": None}, + }, + "kernel_regularizer": None, + "name": "dense_4", + "trainable": True, + "units": 1, + "use_bias": True, }, - 'kernel_regularizer': None, - 'name': 'dense_4', - 'trainable': True, - 'units': 1, - 'use_bias': True - } - }], - 'name': 'sequential_2' + }, + ], + "name": "sequential_2", } diff --git a/keras/tests/get_config_test.py b/keras/tests/get_config_test.py index b5d42a589913..73c24a920e4b 100644 --- a/keras/tests/get_config_test.py +++ b/keras/tests/get_config_test.py @@ -11,44 +11,49 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#,============================================================================ +# ,============================================================================ """Tests for `get_config` backwards compatibility.""" +import tensorflow.compat.v2 as tf + from keras.engine import sequential from keras.engine import training from keras.testing_infra import test_combinations from keras.tests import get_config_samples -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class TestGetConfigBackwardsCompatible(test_combinations.TestCase): - - def test_functional_dnn(self): - model = training.Model.from_config(get_config_samples.FUNCTIONAL_DNN) - self.assertLen(model.layers, 3) - - def test_functional_cnn(self): - model = training.Model.from_config(get_config_samples.FUNCTIONAL_CNN) - self.assertLen(model.layers, 4) - - def test_functional_lstm(self): - model = training.Model.from_config(get_config_samples.FUNCTIONAL_LSTM) - self.assertLen(model.layers, 3) - - def test_sequential_dnn(self): - model = sequential.Sequential.from_config(get_config_samples.SEQUENTIAL_DNN) - self.assertLen(model.layers, 2) - - def test_sequential_cnn(self): - model = sequential.Sequential.from_config(get_config_samples.SEQUENTIAL_CNN) - self.assertLen(model.layers, 3) - - def test_sequential_lstm(self): - model = sequential.Sequential.from_config( - get_config_samples.SEQUENTIAL_LSTM) - self.assertLen(model.layers, 2) - - -if __name__ == '__main__': - tf.test.main() + def test_functional_dnn(self): + model = training.Model.from_config(get_config_samples.FUNCTIONAL_DNN) + self.assertLen(model.layers, 3) + + def test_functional_cnn(self): + model = training.Model.from_config(get_config_samples.FUNCTIONAL_CNN) + self.assertLen(model.layers, 4) + + def test_functional_lstm(self): + model = training.Model.from_config(get_config_samples.FUNCTIONAL_LSTM) + self.assertLen(model.layers, 3) + + def test_sequential_dnn(self): + model = sequential.Sequential.from_config( + get_config_samples.SEQUENTIAL_DNN + ) + self.assertLen(model.layers, 2) + + def test_sequential_cnn(self): + model = sequential.Sequential.from_config( + get_config_samples.SEQUENTIAL_CNN + ) + self.assertLen(model.layers, 3) + + def test_sequential_lstm(self): + model = sequential.Sequential.from_config( + get_config_samples.SEQUENTIAL_LSTM + ) + 
self.assertLen(model.layers, 2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/graph_util_test.py b/keras/tests/graph_util_test.py index 6ebbcc72a08d..40884cf9d880 100644 --- a/keras/tests/graph_util_test.py +++ b/keras/tests/graph_util_test.py @@ -14,133 +14,164 @@ # ============================================================================== """Tests for tensorflow.python.client.graph_util.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np -from tensorflow.core.protobuf import meta_graph_pb2 import keras + +# isort: off +from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.python.grappler import tf_optimizer -from tensorflow.python.training.saver import export_meta_graph +from tensorflow.python.training.saver import ( + export_meta_graph, +) class ConvertVariablesToConstantsTest(tf.test.TestCase): - - def _get_tensors(self, sess, tensor_list): - """Returns a list of Tensor objects from the Session.""" - return [ - sess.graph.get_tensor_by_name(tensor.name) for tensor in tensor_list - ] - - def _get_tensor_names(self, tensors): - """Returns a list of string names for the tensors specified.""" - return [tensor.name.split(":")[0] for tensor in tensors] - - def _evaluate_graph_def(self, graph_def, inputs, outputs, input_data): - """Evaluates the GraphDef using Sessions.""" - with tf.Graph().as_default() as graph: - tf.import_graph_def(graph_def, name="") - sess = tf.compat.v1.Session(graph=graph) - - input_tensors = self._get_tensors(sess, inputs) - output_tensors = self._get_tensors(sess, outputs) - return sess.run( - output_tensors, feed_dict=dict(zip(input_tensors, input_data))) - - def _ensure_no_variables_in_graph(self, graph_def): - """Ensures there are no variables in the graph.""" - for node in graph_def.node: - self.assertNotIn( - node.op, ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"]) - - def _test_converted_keras_model(self, model, constant_graph_def, input_data): - """Compares the converted Keras model.""" - expected_value = model.predict(input_data) - actual_value = self._evaluate_graph_def(constant_graph_def, model.inputs, - model.outputs, [input_data]) - np.testing.assert_almost_equal(np.array([expected_value]), actual_value, 5) - - def _inline_functions(self, graph_def, arrays): - meta_graph = export_meta_graph(graph_def=graph_def) - fetch_collection = meta_graph_pb2.CollectionDef() - for name in arrays: - fetch_collection.node_list.value.append(name) - meta_graph.collection_def["train_op"].CopyFrom(fetch_collection) - - # Initialize RewriterConfig with everything disabled except function - # inlining. - config = tf.compat.v1.ConfigProto() - rewrite_options = config.graph_options.rewrite_options - rewrite_options.optimizers.append("function") - return tf_optimizer.OptimizeGraph(config, meta_graph) - - def testWithEmbeddings(self): - """Freezes a graph with embeddings.""" - state_input = keras.layers.Input( - shape=(1,), name="state_input", dtype="int32") - output = keras.layers.Embedding( - output_dim=16, input_dim=100, input_length=1, name="state")( - state_input) - model = keras.models.Model(inputs=[state_input], outputs=[output]) - model.compile( - loss={"state": "sparse_categorical_crossentropy"}, optimizer="adam") - - # Freeze the graph. 
- sess = keras.backend.get_session() - variable_graph_def = sess.graph_def - output_tensor = self._get_tensor_names(model.outputs) - constant_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants( - sess, variable_graph_def, output_tensor) - - # Validate converted graph. - input_data = np.array(np.random.random_sample([1, 1]), dtype=np.int32) - self._ensure_no_variables_in_graph(constant_graph_def) - self._test_converted_keras_model(model, constant_graph_def, input_data) - - def testKerasBatchNorm(self): - """Freezes a graph with Keras batch norm.""" - inputs = keras.layers.Input(shape=(128, 128, 1)) - batch_norm = keras.layers.BatchNormalization()(inputs) - model = keras.models.Model(inputs, batch_norm, name="test") - model.compile( - optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) - tensor_names = [tensor.name for tensor in model.inputs + model.outputs] - - # Freeze the graph. - sess = keras.backend.get_session() - variable_graph_def = sess.graph_def - variable_graph_def = self._inline_functions(variable_graph_def, - tensor_names) - output_tensor = self._get_tensor_names(model.outputs) - constant_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants( - sess, variable_graph_def, output_tensor) - - # Validate converted graph. - input_data = np.array( - np.random.random_sample([1, 128, 128, 1]), dtype=np.int32) - self._ensure_no_variables_in_graph(constant_graph_def) - self._test_converted_keras_model(model, constant_graph_def, input_data) - - def testLSTM(self): - """Freezes a Keras LSTM.""" - model = keras.models.Sequential( - [keras.layers.LSTM(units=10, input_shape=(10, 10))]) - tensor_names = [tensor.name for tensor in model.inputs + model.outputs] - - # Freeze the model. - sess = keras.backend.get_session() - variable_graph_def = sess.graph_def - variable_graph_def = self._inline_functions(variable_graph_def, - tensor_names) - output_tensor = self._get_tensor_names(model.outputs) - constant_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants( - sess, variable_graph_def, output_tensor) - - # Validate converted graph. 
- input_data = np.array(np.random.random_sample([10, 10, 10]), dtype=np.int32) - self._ensure_no_variables_in_graph(constant_graph_def) - self._test_converted_keras_model(model, constant_graph_def, input_data) + def _get_tensors(self, sess, tensor_list): + """Returns a list of Tensor objects from the Session.""" + return [ + sess.graph.get_tensor_by_name(tensor.name) for tensor in tensor_list + ] + + def _get_tensor_names(self, tensors): + """Returns a list of string names for the tensors specified.""" + return [tensor.name.split(":")[0] for tensor in tensors] + + def _evaluate_graph_def(self, graph_def, inputs, outputs, input_data): + """Evaluates the GraphDef using Sessions.""" + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name="") + sess = tf.compat.v1.Session(graph=graph) + + input_tensors = self._get_tensors(sess, inputs) + output_tensors = self._get_tensors(sess, outputs) + return sess.run( + output_tensors, feed_dict=dict(zip(input_tensors, input_data)) + ) + + def _ensure_no_variables_in_graph(self, graph_def): + """Ensures there are no variables in the graph.""" + for node in graph_def.node: + self.assertNotIn( + node.op, + ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"], + ) + + def _test_converted_keras_model( + self, model, constant_graph_def, input_data + ): + """Compares the converted Keras model.""" + expected_value = model.predict(input_data) + actual_value = self._evaluate_graph_def( + constant_graph_def, model.inputs, model.outputs, [input_data] + ) + np.testing.assert_almost_equal( + np.array([expected_value]), actual_value, 5 + ) + + def _inline_functions(self, graph_def, arrays): + meta_graph = export_meta_graph(graph_def=graph_def) + fetch_collection = meta_graph_pb2.CollectionDef() + for name in arrays: + fetch_collection.node_list.value.append(name) + meta_graph.collection_def["train_op"].CopyFrom(fetch_collection) + + # Initialize RewriterConfig with everything disabled except function + # inlining. + config = tf.compat.v1.ConfigProto() + rewrite_options = config.graph_options.rewrite_options + rewrite_options.optimizers.append("function") + return tf_optimizer.OptimizeGraph(config, meta_graph) + + def testWithEmbeddings(self): + """Freezes a graph with embeddings.""" + state_input = keras.layers.Input( + shape=(1,), name="state_input", dtype="int32" + ) + output = keras.layers.Embedding( + output_dim=16, input_dim=100, input_length=1, name="state" + )(state_input) + model = keras.models.Model(inputs=[state_input], outputs=[output]) + model.compile( + loss={"state": "sparse_categorical_crossentropy"}, optimizer="adam" + ) + + # Freeze the graph. + sess = keras.backend.get_session() + variable_graph_def = sess.graph_def + output_tensor = self._get_tensor_names(model.outputs) + constant_graph_def = ( + tf.compat.v1.graph_util.convert_variables_to_constants( + sess, variable_graph_def, output_tensor + ) + ) + + # Validate converted graph. 
+ input_data = np.array(np.random.random_sample([1, 1]), dtype=np.int32) + self._ensure_no_variables_in_graph(constant_graph_def) + self._test_converted_keras_model(model, constant_graph_def, input_data) + + def testKerasBatchNorm(self): + """Freezes a graph with Keras batch norm.""" + inputs = keras.layers.Input(shape=(128, 128, 1)) + batch_norm = keras.layers.BatchNormalization()(inputs) + model = keras.models.Model(inputs, batch_norm, name="test") + model.compile( + optimizer="adam", + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + tensor_names = [tensor.name for tensor in model.inputs + model.outputs] + + # Freeze the graph. + sess = keras.backend.get_session() + variable_graph_def = sess.graph_def + variable_graph_def = self._inline_functions( + variable_graph_def, tensor_names + ) + output_tensor = self._get_tensor_names(model.outputs) + constant_graph_def = ( + tf.compat.v1.graph_util.convert_variables_to_constants( + sess, variable_graph_def, output_tensor + ) + ) + + # Validate converted graph. + input_data = np.array( + np.random.random_sample([1, 128, 128, 1]), dtype=np.int32 + ) + self._ensure_no_variables_in_graph(constant_graph_def) + self._test_converted_keras_model(model, constant_graph_def, input_data) + + def testLSTM(self): + """Freezes a Keras LSTM.""" + model = keras.models.Sequential( + [keras.layers.LSTM(units=10, input_shape=(10, 10))] + ) + tensor_names = [tensor.name for tensor in model.inputs + model.outputs] + + # Freeze the model. + sess = keras.backend.get_session() + variable_graph_def = sess.graph_def + variable_graph_def = self._inline_functions( + variable_graph_def, tensor_names + ) + output_tensor = self._get_tensor_names(model.outputs) + constant_graph_def = ( + tf.compat.v1.graph_util.convert_variables_to_constants( + sess, variable_graph_def, output_tensor + ) + ) + + # Validate converted graph. 
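
# A minimal, self-contained sketch of the freeze-then-check recipe the tests
# in this class share, assuming TF1 graph mode (the module calls
# tf.compat.v1.disable_eager_execution() before tf.test.main()). The toy
# model and shapes are placeholders.
import tensorflow.compat.v2 as tf
import keras

tf.compat.v1.disable_eager_execution()
model = keras.Sequential([keras.layers.Dense(1, input_shape=(4,))])
sess = keras.backend.get_session()
output_names = [t.name.split(":")[0] for t in model.outputs]
frozen = tf.compat.v1.graph_util.convert_variables_to_constants(
    sess, sess.graph_def, output_names
)
# Every variable read is now baked into the GraphDef as a Const node.
assert not any(
    n.op in ("Variable", "VariableV2", "VarHandleOp", "ReadVariableOp")
    for n in frozen.node
)
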
+ input_data = np.array( + np.random.random_sample([10, 10, 10]), dtype=np.int32 + ) + self._ensure_no_variables_in_graph(constant_graph_def) + self._test_converted_keras_model(model, constant_graph_def, input_data) if __name__ == "__main__": - tf.compat.v1.disable_eager_execution() - tf.test.main() + tf.compat.v1.disable_eager_execution() + tf.test.main() diff --git a/keras/tests/integration_test.py b/keras/tests/integration_test.py index cc9c577c7ac6..1ccfa02ae2b1 100644 --- a/keras/tests/integration_test.py +++ b/keras/tests/integration_test.py @@ -14,361 +14,435 @@ # ============================================================================== """Integration tests for Keras.""" -import tensorflow.compat.v2 as tf - import os import random import numpy as np +import tensorflow.compat.v2 as tf import keras -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils +from keras import utils from keras.layers.rnn import legacy_cells from keras.legacy_tf_layers import base as base_layer -from keras import utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils class KerasIntegrationTest(test_combinations.TestCase): - - def _save_and_reload_model(self, model): - self.temp_dir = self.get_temp_dir() - fpath = os.path.join(self.temp_dir, - 'test_model_%s' % (random.randint(0, 1e7),)) - if tf.executing_eagerly(): - save_format = 'tf' - else: - if (not isinstance(model, keras.Sequential) and - not model._is_graph_network): - return model # Not supported - save_format = 'h5' - model.save(fpath, save_format=save_format) - model = keras.models.load_model(fpath) - return model + def _save_and_reload_model(self, model): + self.temp_dir = self.get_temp_dir() + fpath = os.path.join( + self.temp_dir, f"test_model_{random.randint(0, 10000000.0)}" + ) + if tf.executing_eagerly(): + save_format = "tf" + else: + if ( + not isinstance(model, keras.Sequential) + and not model._is_graph_network + ): + return model # Not supported + save_format = "h5" + model.save(fpath, save_format=save_format) + model = keras.models.load_model(fpath) + return model @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes class VectorClassificationIntegrationTest(test_combinations.TestCase): - - def test_vector_classification(self): - np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=100, - test_samples=0, - input_shape=(10,), - num_classes=2) - y_train = utils.to_categorical(y_train) - - model = test_utils.get_model_from_layers( - [keras.layers.Dense(16, activation='relu'), - keras.layers.Dropout(0.1), - keras.layers.Dense(y_train.shape[-1], activation='softmax')], - input_shape=x_train.shape[1:]) - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(x_train, y_train, epochs=10, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - self.assertGreater(history.history['val_acc'][-1], 0.7) - _, val_acc = model.evaluate(x_train, y_train) - self.assertAlmostEqual(history.history['val_acc'][-1], val_acc) - predictions = model.predict(x_train) - self.assertEqual(predictions.shape, (x_train.shape[0], 2)) - - def test_vector_classification_shared_model(self): - # Test that Sequential models that feature internal updates - # and internal losses can be shared. 
- np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=100, - test_samples=0, - input_shape=(10,), - num_classes=2) - y_train = utils.to_categorical(y_train) - - base_model = test_utils.get_model_from_layers( - [keras.layers.Dense(16, - activation='relu', - kernel_regularizer=keras.regularizers.l2(1e-5), - bias_regularizer=keras.regularizers.l2(1e-5)), - keras.layers.BatchNormalization()], - input_shape=x_train.shape[1:]) - x = keras.layers.Input(x_train.shape[1:]) - y = base_model(x) - y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y) - model = keras.models.Model(x, y) - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - self.assertLen(model.losses, 2) - if not tf.executing_eagerly(): - self.assertLen(model.get_updates_for(x), 2) - history = model.fit(x_train, y_train, epochs=10, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - self.assertGreater(history.history['val_acc'][-1], 0.7) - _, val_acc = model.evaluate(x_train, y_train) - self.assertAlmostEqual(history.history['val_acc'][-1], val_acc) - predictions = model.predict(x_train) - self.assertEqual(predictions.shape, (x_train.shape[0], 2)) + def test_vector_classification(self): + np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=100, test_samples=0, input_shape=(10,), num_classes=2 + ) + y_train = utils.to_categorical(y_train) + + model = test_utils.get_model_from_layers( + [ + keras.layers.Dense(16, activation="relu"), + keras.layers.Dropout(0.1), + keras.layers.Dense(y_train.shape[-1], activation="softmax"), + ], + input_shape=x_train.shape[1:], + ) + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + x_train, + y_train, + epochs=10, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + self.assertGreater(history.history["val_acc"][-1], 0.7) + _, val_acc = model.evaluate(x_train, y_train) + self.assertAlmostEqual(history.history["val_acc"][-1], val_acc) + predictions = model.predict(x_train) + self.assertEqual(predictions.shape, (x_train.shape[0], 2)) + + def test_vector_classification_shared_model(self): + # Test that Sequential models that feature internal updates + # and internal losses can be shared. 
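
# A hedged illustration of the two collections asserted on below: a Dense
# layer with both kernel and bias L2 regularizers adds two entries to
# `model.losses`, and BatchNormalization contributes two moving-statistics
# updates in graph mode. Layer sizes here are arbitrary.
import keras

block = keras.Sequential([
    keras.layers.Dense(
        4,
        kernel_regularizer=keras.regularizers.l2(1e-5),
        bias_regularizer=keras.regularizers.l2(1e-5),
        input_shape=(3,),
    ),
    keras.layers.BatchNormalization(),
])
assert len(block.losses) == 2  # one L2 penalty per regularized weight
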
+ np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=100, test_samples=0, input_shape=(10,), num_classes=2 + ) + y_train = utils.to_categorical(y_train) + + base_model = test_utils.get_model_from_layers( + [ + keras.layers.Dense( + 16, + activation="relu", + kernel_regularizer=keras.regularizers.l2(1e-5), + bias_regularizer=keras.regularizers.l2(1e-5), + ), + keras.layers.BatchNormalization(), + ], + input_shape=x_train.shape[1:], + ) + x = keras.layers.Input(x_train.shape[1:]) + y = base_model(x) + y = keras.layers.Dense(y_train.shape[-1], activation="softmax")(y) + model = keras.models.Model(x, y) + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + self.assertLen(model.losses, 2) + if not tf.executing_eagerly(): + self.assertLen(model.get_updates_for(x), 2) + history = model.fit( + x_train, + y_train, + epochs=10, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + self.assertGreater(history.history["val_acc"][-1], 0.7) + _, val_acc = model.evaluate(x_train, y_train) + self.assertAlmostEqual(history.history["val_acc"][-1], val_acc) + predictions = model.predict(x_train) + self.assertEqual(predictions.shape, (x_train.shape[0], 2)) @test_combinations.run_all_keras_modes class SequentialIntegrationTest(KerasIntegrationTest): - - def test_sequential_save_and_pop(self): - # Test the following sequence of actions: - # - construct a Sequential model and train it - # - save it - # - load it - # - pop its last layer and add a new layer instead - # - continue training - np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=100, - test_samples=0, - input_shape=(10,), - num_classes=2) - y_train = utils.to_categorical(y_train) - model = keras.Sequential([ - keras.layers.Dense(16, activation='relu'), - keras.layers.Dropout(0.1), - keras.layers.Dense(y_train.shape[-1], activation='softmax') - ]) - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x_train, y_train, epochs=1, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - model = self._save_and_reload_model(model) - - model.pop() - model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax')) - - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(x_train, y_train, epochs=10, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - self.assertGreater(history.history['val_acc'][-1], 0.7) - model = self._save_and_reload_model(model) - _, val_acc = model.evaluate(x_train, y_train) - self.assertAlmostEqual(history.history['val_acc'][-1], val_acc) - predictions = model.predict(x_train) - self.assertEqual(predictions.shape, (x_train.shape[0], 2)) + def test_sequential_save_and_pop(self): + # Test the following sequence of actions: + # - construct a Sequential model and train it + # - save it + # - load it + # - pop its last layer and add a new layer instead + # - continue training + np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=100, test_samples=0, input_shape=(10,), num_classes=2 + ) + y_train = utils.to_categorical(y_train) + model = keras.Sequential( + [ + keras.layers.Dense(16, 
activation="relu"), + keras.layers.Dropout(0.1), + keras.layers.Dense(y_train.shape[-1], activation="softmax"), + ] + ) + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + x_train, + y_train, + epochs=1, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + model = self._save_and_reload_model(model) + + model.pop() + model.add(keras.layers.Dense(y_train.shape[-1], activation="softmax")) + + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + x_train, + y_train, + epochs=10, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + self.assertGreater(history.history["val_acc"][-1], 0.7) + model = self._save_and_reload_model(model) + _, val_acc = model.evaluate(x_train, y_train) + self.assertAlmostEqual(history.history["val_acc"][-1], val_acc) + predictions = model.predict(x_train) + self.assertEqual(predictions.shape, (x_train.shape[0], 2)) # See b/122473407 @test_combinations.run_all_keras_modes(always_skip_v1=True) class TimeseriesClassificationIntegrationTest(test_combinations.TestCase): - - @test_combinations.run_with_all_model_types - def test_timeseries_classification(self): - np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=100, - test_samples=0, - input_shape=(4, 10), - num_classes=2) - y_train = utils.to_categorical(y_train) - - layers = [ - keras.layers.LSTM(5, return_sequences=True), - keras.layers.GRU(y_train.shape[-1], activation='softmax') - ] - model = test_utils.get_model_from_layers( - layers, input_shape=x_train.shape[1:]) - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(x_train, y_train, epochs=15, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - self.assertGreater(history.history['val_acc'][-1], 0.7) - _, val_acc = model.evaluate(x_train, y_train) - self.assertAlmostEqual(history.history['val_acc'][-1], val_acc) - predictions = model.predict(x_train) - self.assertEqual(predictions.shape, (x_train.shape[0], 2)) - - def test_timeseries_classification_sequential_tf_rnn(self): - np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=100, - test_samples=0, - input_shape=(4, 10), - num_classes=2) - y_train = utils.to_categorical(y_train) - - with base_layer.keras_style_scope(): - model = keras.models.Sequential() - model.add(keras.layers.RNN(legacy_cells.LSTMCell(5), - return_sequences=True, - input_shape=x_train.shape[1:])) - model.add(keras.layers.RNN(legacy_cells.GRUCell(y_train.shape[-1], - activation='softmax', - dtype=tf.float32))) - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - - history = model.fit(x_train, y_train, epochs=15, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - self.assertGreater(history.history['val_acc'][-1], 0.7) - _, val_acc = model.evaluate(x_train, y_train) - self.assertAlmostEqual(history.history['val_acc'][-1], val_acc) - predictions = model.predict(x_train) - self.assertEqual(predictions.shape, (x_train.shape[0], 2)) + 
@test_combinations.run_with_all_model_types + def test_timeseries_classification(self): + np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=100, + test_samples=0, + input_shape=(4, 10), + num_classes=2, + ) + y_train = utils.to_categorical(y_train) + + layers = [ + keras.layers.LSTM(5, return_sequences=True), + keras.layers.GRU(y_train.shape[-1], activation="softmax"), + ] + model = test_utils.get_model_from_layers( + layers, input_shape=x_train.shape[1:] + ) + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + x_train, + y_train, + epochs=15, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + self.assertGreater(history.history["val_acc"][-1], 0.7) + _, val_acc = model.evaluate(x_train, y_train) + self.assertAlmostEqual(history.history["val_acc"][-1], val_acc) + predictions = model.predict(x_train) + self.assertEqual(predictions.shape, (x_train.shape[0], 2)) + + def test_timeseries_classification_sequential_tf_rnn(self): + np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=100, + test_samples=0, + input_shape=(4, 10), + num_classes=2, + ) + y_train = utils.to_categorical(y_train) + + with base_layer.keras_style_scope(): + model = keras.models.Sequential() + model.add( + keras.layers.RNN( + legacy_cells.LSTMCell(5), + return_sequences=True, + input_shape=x_train.shape[1:], + ) + ) + model.add( + keras.layers.RNN( + legacy_cells.GRUCell( + y_train.shape[-1], + activation="softmax", + dtype=tf.float32, + ) + ) + ) + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + history = model.fit( + x_train, + y_train, + epochs=15, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + self.assertGreater(history.history["val_acc"][-1], 0.7) + _, val_acc = model.evaluate(x_train, y_train) + self.assertAlmostEqual(history.history["val_acc"][-1], val_acc) + predictions = model.predict(x_train) + self.assertEqual(predictions.shape, (x_train.shape[0], 2)) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes class ImageClassificationIntegrationTest(test_combinations.TestCase): - - def test_image_classification(self): - np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=100, - test_samples=0, - input_shape=(10, 10, 3), - num_classes=2) - y_train = utils.to_categorical(y_train) - - layers = [ - keras.layers.Conv2D(4, 3, padding='same', activation='relu'), - keras.layers.Conv2D(8, 3, padding='same'), - keras.layers.BatchNormalization(), - keras.layers.Conv2D(8, 3, padding='same'), - keras.layers.Flatten(), - keras.layers.Dense(y_train.shape[-1], activation='softmax') - ] - model = test_utils.get_model_from_layers( - layers, input_shape=x_train.shape[1:]) - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - history = model.fit(x_train, y_train, epochs=10, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - self.assertGreater(history.history['val_acc'][-1], 0.7) - _, val_acc = model.evaluate(x_train, y_train) - self.assertAlmostEqual(history.history['val_acc'][-1], val_acc) - predictions = model.predict(x_train) - 
self.assertEqual(predictions.shape, (x_train.shape[0], 2)) + def test_image_classification(self): + np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=100, + test_samples=0, + input_shape=(10, 10, 3), + num_classes=2, + ) + y_train = utils.to_categorical(y_train) + + layers = [ + keras.layers.Conv2D(4, 3, padding="same", activation="relu"), + keras.layers.Conv2D(8, 3, padding="same"), + keras.layers.BatchNormalization(), + keras.layers.Conv2D(8, 3, padding="same"), + keras.layers.Flatten(), + keras.layers.Dense(y_train.shape[-1], activation="softmax"), + ] + model = test_utils.get_model_from_layers( + layers, input_shape=x_train.shape[1:] + ) + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + history = model.fit( + x_train, + y_train, + epochs=10, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + self.assertGreater(history.history["val_acc"][-1], 0.7) + _, val_acc = model.evaluate(x_train, y_train) + self.assertAlmostEqual(history.history["val_acc"][-1], val_acc) + predictions = model.predict(x_train) + self.assertEqual(predictions.shape, (x_train.shape[0], 2)) @test_combinations.run_all_keras_modes class ActivationV2IntegrationTest(test_combinations.TestCase): - """Tests activation function V2 in model exporting and loading. - - This test is to verify in TF 2.x, when 'tf.nn.softmax' is used as an - activation function, its model exporting and loading work as expected. - Check b/123041942 for details. - """ - - def test_serialization_v2_model(self): - np.random.seed(1337) - (x_train, y_train), _ = test_utils.get_test_data( - train_samples=100, - test_samples=0, - input_shape=(10,), - num_classes=2) - y_train = utils.to_categorical(y_train) - - model = keras.Sequential([ - keras.layers.Flatten(input_shape=x_train.shape[1:]), - keras.layers.Dense(10, activation=tf.nn.relu), - # To mimic 'tf.nn.softmax' used in TF 2.x. - keras.layers.Dense(y_train.shape[-1], activation=tf.math.softmax), - ]) - - # Check if 'softmax' is in model.get_config(). - last_layer_activation = model.get_layer(index=2).get_config()['activation'] - self.assertEqual(last_layer_activation, 'softmax') - - model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.optimizer_v2.adam.Adam(0.005), - metrics=['accuracy'], - run_eagerly=test_utils.should_run_eagerly()) - model.fit(x_train, y_train, epochs=2, batch_size=10, - validation_data=(x_train, y_train), - verbose=2) - - output_path = os.path.join(self.get_temp_dir(), 'tf_keras_saved_model') - model.save(output_path, save_format='tf') - loaded_model = keras.models.load_model(output_path) - self.assertEqual(model.summary(), loaded_model.summary()) + """Tests activation function V2 in model exporting and loading. + + This test is to verify in TF 2.x, when 'tf.nn.softmax' is used as an + activation function, its model exporting and loading work as expected. + Check b/123041942 for details. + """ + + def test_serialization_v2_model(self): + np.random.seed(1337) + (x_train, y_train), _ = test_utils.get_test_data( + train_samples=100, test_samples=0, input_shape=(10,), num_classes=2 + ) + y_train = utils.to_categorical(y_train) + + model = keras.Sequential( + [ + keras.layers.Flatten(input_shape=x_train.shape[1:]), + keras.layers.Dense(10, activation=tf.nn.relu), + # To mimic 'tf.nn.softmax' used in TF 2.x. 
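
# An aside expanding on the comment above, grounded in the assertion this
# test makes: passing the raw callable tf.math.softmax as an activation
# serializes to the plain string "softmax" in the layer config. Unit counts
# are arbitrary.
import tensorflow.compat.v2 as tf
import keras

layer = keras.layers.Dense(2, activation=tf.math.softmax)
assert layer.get_config()["activation"] == "softmax"
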
+ keras.layers.Dense( + y_train.shape[-1], activation=tf.math.softmax + ), + ] + ) + + # Check if 'softmax' is in model.get_config(). + last_layer_activation = model.get_layer(index=2).get_config()[ + "activation" + ] + self.assertEqual(last_layer_activation, "softmax") + + model.compile( + loss="categorical_crossentropy", + optimizer=keras.optimizers.legacy.adam.Adam(0.005), + metrics=["accuracy"], + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit( + x_train, + y_train, + epochs=2, + batch_size=10, + validation_data=(x_train, y_train), + verbose=2, + ) + + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + loaded_model = keras.models.load_model(output_path) + self.assertEqual(model.summary(), loaded_model.summary()) @test_combinations.run_with_all_model_types @test_utils.run_v2_only class TokenClassificationIntegrationTest(test_combinations.TestCase): - """Tests a very simple token classification model. - - The main purpose of this test is to verify that everything works as expected - when input sequences have variable length, and batches are padded only to the - maximum length of each batch. This is very common in NLP, and results in the - sequence dimension varying with each batch step for both the features - and the labels. - """ - - def test_token_classification(self): - - def densify(x, y): - return x.to_tensor(), y.to_tensor() - - utils.set_random_seed(1337) - data = tf.ragged.stack([ - np.random.randint(low=0, high=16, size=random.randint(4, 16)) - for _ in range(100) - ]) - labels = tf.ragged.stack( - [np.random.randint(low=0, high=3, size=len(arr)) for arr in data]) - features_dataset = tf.data.Dataset.from_tensor_slices(data) - labels_dataset = tf.data.Dataset.from_tensor_slices(labels) - dataset = tf.data.Dataset.zip((features_dataset, labels_dataset)) - dataset = dataset.batch(batch_size=10) - dataset = dataset.map(densify) # Pads with 0 values by default - - layers = [ - keras.layers.Embedding(16, 4), - keras.layers.Conv1D(4, 5, padding='same', activation='relu'), - keras.layers.Conv1D(8, 5, padding='same'), - keras.layers.BatchNormalization(), - keras.layers.Conv1D(3, 5, padding='same', activation='softmax'), - ] - model = test_utils.get_model_from_layers(layers, input_shape=(None,)) - model.compile( - loss='sparse_categorical_crossentropy', - optimizer='adam', - metrics=['acc']) - history = model.fit(dataset, epochs=10, validation_data=dataset, verbose=2) - self.assertGreater(history.history['val_acc'][-1], 0.5) - _, val_acc = model.evaluate(dataset) - self.assertAlmostEqual(history.history['val_acc'][-1], val_acc) - predictions = model.predict(dataset) - self.assertIsInstance(predictions, tf.RaggedTensor) - self.assertEqual(predictions.shape[0], len(dataset) * 10) - self.assertEqual(predictions.shape[-1], 3) - -if __name__ == '__main__': - tf.test.main() + """Tests a very simple token classification model. + + The main purpose of this test is to verify that everything works as expected + when input sequences have variable length, and batches are padded only to + the maximum length of each batch. This is very common in NLP, and results in + the sequence dimension varying with each batch step for both the features + and the labels. 
+ """ + + def test_token_classification(self): + def densify(x, y): + return x.to_tensor(), y.to_tensor() + + utils.set_random_seed(1337) + data = tf.ragged.stack( + [ + np.random.randint(low=0, high=16, size=random.randint(4, 16)) + for _ in range(100) + ] + ) + labels = tf.ragged.stack( + [np.random.randint(low=0, high=3, size=len(arr)) for arr in data] + ) + features_dataset = tf.data.Dataset.from_tensor_slices(data) + labels_dataset = tf.data.Dataset.from_tensor_slices(labels) + dataset = tf.data.Dataset.zip((features_dataset, labels_dataset)) + dataset = dataset.batch(batch_size=10) + dataset = dataset.map(densify) # Pads with 0 values by default + + layers = [ + keras.layers.Embedding(16, 4), + keras.layers.Conv1D(4, 5, padding="same", activation="relu"), + keras.layers.Conv1D(8, 5, padding="same"), + keras.layers.BatchNormalization(), + keras.layers.Conv1D(3, 5, padding="same", activation="softmax"), + ] + model = test_utils.get_model_from_layers(layers, input_shape=(None,)) + model.compile( + loss="sparse_categorical_crossentropy", + optimizer="adam", + metrics=["acc"], + ) + history = model.fit( + dataset, epochs=10, validation_data=dataset, verbose=2 + ) + self.assertGreater(history.history["val_acc"][-1], 0.5) + _, val_acc = model.evaluate(dataset) + self.assertAlmostEqual(history.history["val_acc"][-1], val_acc) + predictions = model.predict(dataset) + self.assertIsInstance(predictions, tf.RaggedTensor) + self.assertEqual(predictions.shape[0], len(dataset) * 10) + self.assertEqual(predictions.shape[-1], 3) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/keras_doctest.py b/keras/tests/keras_doctest.py index 139432849685..90f2c66b6d4e 100644 --- a/keras/tests/keras_doctest.py +++ b/keras/tests/keras_doctest.py @@ -21,138 +21,138 @@ import os import sys +import numpy as np +import tensorflow.compat.v2 as tf from absl import flags from absl.testing import absltest + from keras.testing_infra import keras_doctest_lib -import numpy as np -import tensorflow as tf -import tensorflow.compat.v2 as tf tf.compat.v1.enable_v2_behavior() # We put doctest after absltest so that it picks up the unittest monkeypatch. # Otherwise doctest tests aren't runnable at all. -import doctest # pylint: disable=g-import-not-at-top,g-bad-import-order +import doctest # noqa: E402 FLAGS = flags.FLAGS -flags.DEFINE_string('module', None, 'A specific module to run doctest on.') -flags.DEFINE_boolean('list', None, - 'List all the modules in the core package imported.') -flags.DEFINE_string('file', None, 'A specific file to run doctest on.') +flags.DEFINE_string("module", None, "A specific module to run doctest on.") +flags.DEFINE_boolean( + "list", None, "List all the modules in the core package imported." +) +flags.DEFINE_string("file", None, "A specific file to run doctest on.") -flags.mark_flags_as_mutual_exclusive(['module', 'file']) -flags.mark_flags_as_mutual_exclusive(['list', 'file']) +flags.mark_flags_as_mutual_exclusive(["module", "file"]) +flags.mark_flags_as_mutual_exclusive(["list", "file"]) -PACKAGE = 'keras.' +PACKAGE = "keras." def find_modules(): - """Finds all the modules in the core package imported. + """Finds all the modules in the core package imported. - Returns: - A list containing all the modules in tensorflow.python. - """ + Returns: + A list containing all the modules in tensorflow.python. 
+ """ - tf_modules = [] - for name, module in sys.modules.items(): - if name.startswith(PACKAGE): - tf_modules.append(module) + tf_modules = [] + for name, module in sys.modules.items(): + if name.startswith(PACKAGE): + tf_modules.append(module) - return tf_modules + return tf_modules def filter_on_submodules(all_modules, submodule): - """Filters all the modules based on the module flag. + """Filters all the modules based on the module flag. - The module flag has to be relative to the core package imported. - For example, if `submodule=keras.layers` then, this function will return - all the modules in the submodule. + The module flag has to be relative to the core package imported. + For example, if `submodule=keras.layers` then, this function will return + all the modules in the submodule. - Args: - all_modules: All the modules in the core package. - submodule: Submodule to filter from all the modules. + Args: + all_modules: All the modules in the core package. + submodule: Submodule to filter from all the modules. - Returns: - All the modules in the submodule. - """ + Returns: + All the modules in the submodule. + """ - filtered_modules = [ - mod for mod in all_modules if PACKAGE + submodule in mod.__name__ - ] - return filtered_modules + filtered_modules = [ + mod for mod in all_modules if PACKAGE + submodule in mod.__name__ + ] + return filtered_modules def get_module_and_inject_docstring(file_path): - """Replaces the docstring of the module with the changed file's content. + """Replaces the docstring of the module with the changed file's content. - Args: - file_path: Path to the file + Args: + file_path: Path to the file - Returns: - A list containing the module changed by the file. - """ + Returns: + A list containing the module changed by the file. 
+ """ - file_path = os.path.abspath(file_path) - mod_index = file_path.find(PACKAGE.replace('.', os.sep)) - file_mod_name, _ = os.path.splitext(file_path[mod_index:]) - file_module = sys.modules[file_mod_name.replace(os.sep, '.')] + file_path = os.path.abspath(file_path) + mod_index = file_path.find(PACKAGE.replace(".", os.sep)) + file_mod_name, _ = os.path.splitext(file_path[mod_index:]) + file_module = sys.modules[file_mod_name.replace(os.sep, ".")] - with open(file_path, 'r') as f: - content = f.read() + with open(file_path, "r") as f: + content = f.read() - file_module.__doc__ = content + file_module.__doc__ = content - return [file_module] + return [file_module] class TfTestCase(tf.test.TestCase): + def set_up(self, _): + self.setUp() - def set_up(self, _): - self.setUp() - - def tear_down(self, _): - self.tearDown() + def tear_down(self, _): + self.tearDown() def load_tests(unused_loader, tests, unused_ignore): - """Loads all the tests in the docstrings and runs them.""" - - tf_modules = find_modules() - - if FLAGS.module: - tf_modules = filter_on_submodules(tf_modules, FLAGS.module) - - if FLAGS.list: - print('**************************************************') - for mod in tf_modules: - print(mod.__name__) - print('**************************************************') + """Loads all the tests in the docstrings and runs them.""" + + tf_modules = find_modules() + + if FLAGS.module: + tf_modules = filter_on_submodules(tf_modules, FLAGS.module) + + if FLAGS.list: + print("**************************************************") + for mod in tf_modules: + print(mod.__name__) + print("**************************************************") + return tests + + if FLAGS.file: + tf_modules = get_module_and_inject_docstring(FLAGS.file) + + for module in tf_modules: + testcase = TfTestCase() + tests.addTests( + doctest.DocTestSuite( + module, + test_finder=doctest.DocTestFinder(exclude_empty=False), + extraglobs={"tf": tf, "np": np, "os": os}, + setUp=testcase.set_up, + tearDown=testcase.tear_down, + checker=keras_doctest_lib.KerasDoctestOutputChecker(), + optionflags=( + doctest.ELLIPSIS + | doctest.NORMALIZE_WHITESPACE + | doctest.IGNORE_EXCEPTION_DETAIL + | doctest.DONT_ACCEPT_BLANKLINE + ), + ) + ) return tests - if FLAGS.file: - tf_modules = get_module_and_inject_docstring(FLAGS.file) - - for module in tf_modules: - testcase = TfTestCase() - tests.addTests( - doctest.DocTestSuite( - module, - test_finder=doctest.DocTestFinder(exclude_empty=False), - extraglobs={ - 'tf': tf, - 'np': np, - 'os': os - }, - setUp=testcase.set_up, - tearDown=testcase.tear_down, - checker=keras_doctest_lib.KerasDoctestOutputChecker(), - optionflags=(doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE - | doctest.IGNORE_EXCEPTION_DETAIL - | doctest.DONT_ACCEPT_BLANKLINE), - )) - return tests - - -if __name__ == '__main__': - absltest.main() + +if __name__ == "__main__": + absltest.main() diff --git a/keras/tests/memory_checker_test.py b/keras/tests/memory_checker_test.py index 429aee5f2d8a..23373a20a7d3 100644 --- a/keras/tests/memory_checker_test.py +++ b/keras/tests/memory_checker_test.py @@ -13,64 +13,70 @@ # limitations under the License. 
# ============================================================================= +import tensorflow.compat.v2 as tf + import keras -import tensorflow.compat.v2 as tf -from tensorflow.python.framework.memory_checker import MemoryChecker +# isort: off +from tensorflow.python.framework.memory_checker import ( + MemoryChecker, +) class MemoryCheckerTest(tf.test.TestCase): - - def testKerasBasic(self): - # TODO(kkb): Fix the slowness on Forge. - self.skipTest('This test is too slow on Forge so disabled for now.') - - x = tf.zeros([1, 1]) - y = tf.constant([[3]]) - model = keras.models.Sequential() - model.add(keras.layers.Dense(1, input_dim=1)) - model.compile(loss='mean_squared_error') - - with MemoryChecker() as memory_checker: - for _ in range(10): - model.fit(x, y) - model.evaluate(x, y) - memory_checker.record_snapshot() - - memory_checker.report() - memory_checker.assert_no_leak_if_all_possibly_except_one() - - def testKerasAdvanced(self): - # TODO(kkb): Fix the slowness on Forge. - self.skipTest('This test is too slow on Forge so disabled for now.') - - # A real world example taken from the following. - # https://github.com/tensorflow/tensorflow/issues/32500 - # b/142150794 - - with MemoryChecker() as memory_checker: - rows = 6 - columns = 7 - model = keras.Sequential([ - keras.layers.Flatten(input_shape=[rows * columns, 3]), - keras.layers.Dense(7, input_shape=[rows * columns * 3]), - ]) - - model.compile( - optimizer=keras.optimizers.optimizer_v2.gradient_descent.SGD(lr=0.01), - loss='mean_squared_error', - metrics=['accuracy']) - states = [[1] * rows * columns for _ in range(20)] - f = tf.one_hot(states, dtype='float32', depth=3) - - for _ in range(20): - model.predict(f, steps=10) - memory_checker.record_snapshot() - - memory_checker.report() - memory_checker.assert_no_leak_if_all_possibly_except_one() - - -if __name__ == '__main__': - tf.compat.v1.enable_eager_execution() - tf.test.main() + def testKerasBasic(self): + # TODO(kkb): Fix the slowness on Forge. + self.skipTest("This test is too slow on Forge so disabled for now.") + + x = tf.zeros([1, 1]) + y = tf.constant([[3]]) + model = keras.models.Sequential() + model.add(keras.layers.Dense(1, input_dim=1)) + model.compile(loss="mean_squared_error") + + with MemoryChecker() as memory_checker: + for _ in range(10): + model.fit(x, y) + model.evaluate(x, y) + memory_checker.record_snapshot() + + memory_checker.report() + memory_checker.assert_no_leak_if_all_possibly_except_one() + + def testKerasAdvanced(self): + # TODO(kkb): Fix the slowness on Forge. + self.skipTest("This test is too slow on Forge so disabled for now.") + + # A real world example taken from the following. 
+ # https://github.com/tensorflow/tensorflow/issues/32500 + # b/142150794 + + with MemoryChecker() as memory_checker: + rows = 6 + columns = 7 + model = keras.Sequential( + [ + keras.layers.Flatten(input_shape=[rows * columns, 3]), + keras.layers.Dense(7, input_shape=[rows * columns * 3]), + ] + ) + + model.compile( + optimizer=keras.optimizers.legacy.gradient_descent.SGD(lr=0.01), + loss="mean_squared_error", + metrics=["accuracy"], + ) + states = [[1] * rows * columns for _ in range(20)] + f = tf.one_hot(states, dtype="float32", depth=3) + + for _ in range(20): + model.predict(f, steps=10) + memory_checker.record_snapshot() + + memory_checker.report() + memory_checker.assert_no_leak_if_all_possibly_except_one() + + +if __name__ == "__main__": + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/tests/memory_test.py b/keras/tests/memory_test.py index ffba441cafe3..4f3cb4f9cea3 100644 --- a/keras/tests/memory_test.py +++ b/keras/tests/memory_test.py @@ -23,52 +23,55 @@ import tensorflow.compat.v2 as tf import keras -from tensorflow.python.eager.memory_tests import memory_test_util + +# isort: off +from tensorflow.python.eager.memory_tests import ( + memory_test_util, +) class SingleLayerNet(keras.Model): - """Simple keras model used to ensure that there are no leaks.""" + """Simple keras model used to ensure that there are no leaks.""" - def __init__(self): - super().__init__() - self.fc1 = keras.layers.Dense(5) + def __init__(self): + super().__init__() + self.fc1 = keras.layers.Dense(5) - def call(self, x): - return self.fc1(x) + def call(self, x): + return self.fc1(x) class MemoryTest(tf.test.TestCase): + def testMemoryLeakInSimpleModelForwardOnly(self): + if not memory_test_util.memory_profiler_is_available(): + self.skipTest("memory_profiler required to run this test") - def testMemoryLeakInSimpleModelForwardOnly(self): - if not memory_test_util.memory_profiler_is_available(): - self.skipTest("memory_profiler required to run this test") - - inputs = tf.zeros([32, 100], tf.float32) - net = SingleLayerNet() + inputs = tf.zeros([1000, 1000], tf.float32) + net = SingleLayerNet() - def f(): - with tf.GradientTape(): - net(inputs) + def f(): + with tf.GradientTape(): + net(inputs) - memory_test_util.assert_no_leak(f) + memory_test_util.assert_no_leak(f, num_iters=1000) - def testMemoryLeakInSimpleModelForwardAndBackward(self): - if not memory_test_util.memory_profiler_is_available(): - self.skipTest("memory_profiler required to run this test") + def testMemoryLeakInSimpleModelForwardAndBackward(self): + if not memory_test_util.memory_profiler_is_available(): + self.skipTest("memory_profiler required to run this test") - inputs = tf.zeros([32, 100], tf.float32) - net = SingleLayerNet() + inputs = tf.zeros([1000, 1000], tf.float32) + net = SingleLayerNet() - def f(): - with tf.GradientTape() as tape: - result = net(inputs) + def f(): + with tf.GradientTape() as tape: + result = net(inputs) - tape.gradient(result, net.variables) + tape.gradient(result, net.variables) - del tape + del tape - memory_test_util.assert_no_leak(f) + memory_test_util.assert_no_leak(f, num_iters=1000) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/tests/model_architectures.py b/keras/tests/model_architectures.py index e6237dfe4ec8..b3bd88641990 100644 --- a/keras/tests/model_architectures.py +++ b/keras/tests/model_architectures.py @@ -19,275 +19,297 @@ import keras # Declaring namedtuple() -ModelFn = collections.namedtuple('ModelFn', - ['model', 'input_shape', 
'target_shape']) +ModelFn = collections.namedtuple( + "ModelFn", ["model", "input_shape", "target_shape"] +) def basic_sequential(): - """Basic sequential model.""" - model = keras.Sequential([ - keras.layers.Dense(3, activation='relu', input_shape=(3,)), - keras.layers.Dense(2, activation='softmax'), - ]) - return ModelFn(model, (None, 3), (None, 2)) + """Basic sequential model.""" + model = keras.Sequential( + [ + keras.layers.Dense(3, activation="relu", input_shape=(3,)), + keras.layers.Dense(2, activation="softmax"), + ] + ) + return ModelFn(model, (None, 3), (None, 2)) def basic_sequential_deferred(): - """Sequential model with deferred input shape.""" - model = keras.Sequential([ - keras.layers.Dense(3, activation='relu'), - keras.layers.Dense(2, activation='softmax'), - ]) - return ModelFn(model, (None, 3), (None, 2)) + """Sequential model with deferred input shape.""" + model = keras.Sequential( + [ + keras.layers.Dense(3, activation="relu"), + keras.layers.Dense(2, activation="softmax"), + ] + ) + return ModelFn(model, (None, 3), (None, 2)) def stacked_rnn(): - """Stacked RNN model.""" - inputs = keras.Input((None, 3)) - layer = keras.layers.RNN([keras.layers.LSTMCell(2) for _ in range(3)]) - x = layer(inputs) - outputs = keras.layers.Dense(2)(x) - model = keras.Model(inputs, outputs) - return ModelFn(model, (None, 4, 3), (None, 2)) + """Stacked RNN model.""" + inputs = keras.Input((None, 3)) + layer = keras.layers.RNN([keras.layers.LSTMCell(2) for _ in range(3)]) + x = layer(inputs) + outputs = keras.layers.Dense(2)(x) + model = keras.Model(inputs, outputs) + return ModelFn(model, (None, 4, 3), (None, 2)) def lstm(): - """LSTM model.""" - inputs = keras.Input((None, 3)) - x = keras.layers.LSTM(4, return_sequences=True)(inputs) - x = keras.layers.LSTM(3, return_sequences=True)(x) - x = keras.layers.LSTM(2, return_sequences=False)(x) - outputs = keras.layers.Dense(2)(x) - model = keras.Model(inputs, outputs) - return ModelFn(model, (None, 4, 3), (None, 2)) + """LSTM model.""" + inputs = keras.Input((None, 3)) + x = keras.layers.LSTM(4, return_sequences=True)(inputs) + x = keras.layers.LSTM(3, return_sequences=True)(x) + x = keras.layers.LSTM(2, return_sequences=False)(x) + outputs = keras.layers.Dense(2)(x) + model = keras.Model(inputs, outputs) + return ModelFn(model, (None, 4, 3), (None, 2)) def multi_input_multi_output(): - """Multi-input Multi-output model.""" - body_input = keras.Input(shape=(None,), name='body') - tags_input = keras.Input(shape=(2,), name='tags') + """Multi-input Multi-output model.""" + body_input = keras.Input(shape=(None,), name="body") + tags_input = keras.Input(shape=(2,), name="tags") - x = keras.layers.Embedding(10, 4)(body_input) - body_features = keras.layers.LSTM(5)(x) - x = keras.layers.concatenate([body_features, tags_input]) + x = keras.layers.Embedding(10, 4)(body_input) + body_features = keras.layers.LSTM(5)(x) + x = keras.layers.concatenate([body_features, tags_input]) - pred_1 = keras.layers.Dense(2, activation='sigmoid', name='priority')(x) - pred_2 = keras.layers.Dense(3, activation='softmax', name='department')(x) + pred_1 = keras.layers.Dense(2, activation="sigmoid", name="priority")(x) + pred_2 = keras.layers.Dense(3, activation="softmax", name="department")(x) - model = keras.Model( - inputs=[body_input, tags_input], outputs=[pred_1, pred_2]) - return ModelFn(model, [(None, 1), (None, 2)], [(None, 2), (None, 3)]) + model = keras.Model( + inputs=[body_input, tags_input], outputs=[pred_1, pred_2] + ) + return ModelFn(model, [(None, 1), 
(None, 2)], [(None, 2), (None, 3)]) def nested_sequential_in_functional(): - """A sequential model nested in a functional model.""" - inner_model = keras.Sequential([ - keras.layers.Dense(3, activation='relu', input_shape=(3,)), - keras.layers.Dense(2, activation='relu'), - ]) + """A sequential model nested in a functional model.""" + inner_model = keras.Sequential( + [ + keras.layers.Dense(3, activation="relu", input_shape=(3,)), + keras.layers.Dense(2, activation="relu"), + ] + ) - inputs = keras.Input(shape=(3,)) - x = inner_model(inputs) - outputs = keras.layers.Dense(2, activation='softmax')(x) - model = keras.Model(inputs, outputs) - return ModelFn(model, (None, 3), (None, 2)) + inputs = keras.Input(shape=(3,)) + x = inner_model(inputs) + outputs = keras.layers.Dense(2, activation="softmax")(x) + model = keras.Model(inputs, outputs) + return ModelFn(model, (None, 3), (None, 2)) def seq_to_seq(): - """Sequence to sequence model.""" - num_encoder_tokens = 3 - num_decoder_tokens = 3 - latent_dim = 2 - encoder_inputs = keras.Input(shape=(None, num_encoder_tokens)) - encoder = keras.layers.LSTM(latent_dim, return_state=True) - _, state_h, state_c = encoder(encoder_inputs) - encoder_states = [state_h, state_c] - decoder_inputs = keras.Input(shape=(None, num_decoder_tokens)) - decoder_lstm = keras.layers.LSTM( - latent_dim, return_sequences=True, return_state=True) - decoder_outputs, _, _ = decoder_lstm( - decoder_inputs, initial_state=encoder_states) - decoder_dense = keras.layers.Dense(num_decoder_tokens, activation='softmax') - decoder_outputs = decoder_dense(decoder_outputs) - model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs) - return ModelFn( - model, [(None, 2, num_encoder_tokens), (None, 2, num_decoder_tokens)], - (None, 2, num_decoder_tokens)) + """Sequence to sequence model.""" + num_encoder_tokens = 3 + num_decoder_tokens = 3 + latent_dim = 2 + encoder_inputs = keras.Input(shape=(None, num_encoder_tokens)) + encoder = keras.layers.LSTM(latent_dim, return_state=True) + _, state_h, state_c = encoder(encoder_inputs) + encoder_states = [state_h, state_c] + decoder_inputs = keras.Input(shape=(None, num_decoder_tokens)) + decoder_lstm = keras.layers.LSTM( + latent_dim, return_sequences=True, return_state=True + ) + decoder_outputs, _, _ = decoder_lstm( + decoder_inputs, initial_state=encoder_states + ) + decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax") + decoder_outputs = decoder_dense(decoder_outputs) + model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs) + return ModelFn( + model, + [(None, 2, num_encoder_tokens), (None, 2, num_decoder_tokens)], + (None, 2, num_decoder_tokens), + ) def shared_layer_functional(): - """Shared layer in a functional model.""" - main_input = keras.Input(shape=(10,), dtype='int32', name='main_input') - x = keras.layers.Embedding( - output_dim=5, input_dim=4, input_length=10)(main_input) - lstm_out = keras.layers.LSTM(3)(x) - auxiliary_output = keras.layers.Dense( - 1, activation='sigmoid', name='aux_output')(lstm_out) - auxiliary_input = keras.Input(shape=(5,), name='aux_input') - x = keras.layers.concatenate([lstm_out, auxiliary_input]) - x = keras.layers.Dense(2, activation='relu')(x) - main_output = keras.layers.Dense( - 1, activation='sigmoid', name='main_output')(x) - model = keras.Model( - inputs=[main_input, auxiliary_input], - outputs=[main_output, auxiliary_output]) - return ModelFn(model, [(None, 10), (None, 5)], [(None, 1), (None, 1)]) + """Shared layer in a functional model.""" + 
main_input = keras.Input(shape=(10,), dtype="int32", name="main_input") + x = keras.layers.Embedding(output_dim=5, input_dim=4, input_length=10)( + main_input + ) + lstm_out = keras.layers.LSTM(3)(x) + auxiliary_output = keras.layers.Dense( + 1, activation="sigmoid", name="aux_output" + )(lstm_out) + auxiliary_input = keras.Input(shape=(5,), name="aux_input") + x = keras.layers.concatenate([lstm_out, auxiliary_input]) + x = keras.layers.Dense(2, activation="relu")(x) + main_output = keras.layers.Dense( + 1, activation="sigmoid", name="main_output" + )(x) + model = keras.Model( + inputs=[main_input, auxiliary_input], + outputs=[main_output, auxiliary_output], + ) + return ModelFn(model, [(None, 10), (None, 5)], [(None, 1), (None, 1)]) def shared_sequential(): - """Shared sequential model in a functional model.""" - inner_model = keras.Sequential([ - keras.layers.Conv2D(2, 3, activation='relu'), - keras.layers.Conv2D(2, 3, activation='relu'), - ]) - inputs_1 = keras.Input((5, 5, 3)) - inputs_2 = keras.Input((5, 5, 3)) - x1 = inner_model(inputs_1) - x2 = inner_model(inputs_2) - x = keras.layers.concatenate([x1, x2]) - outputs = keras.layers.GlobalAveragePooling2D()(x) - model = keras.Model([inputs_1, inputs_2], outputs) - return ModelFn(model, [(None, 5, 5, 3), (None, 5, 5, 3)], (None, 4)) + """Shared sequential model in a functional model.""" + inner_model = keras.Sequential( + [ + keras.layers.Conv2D(2, 3, activation="relu"), + keras.layers.Conv2D(2, 3, activation="relu"), + ] + ) + inputs_1 = keras.Input((5, 5, 3)) + inputs_2 = keras.Input((5, 5, 3)) + x1 = inner_model(inputs_1) + x2 = inner_model(inputs_2) + x = keras.layers.concatenate([x1, x2]) + outputs = keras.layers.GlobalAveragePooling2D()(x) + model = keras.Model([inputs_1, inputs_2], outputs) + return ModelFn(model, [(None, 5, 5, 3), (None, 5, 5, 3)], (None, 4)) class MySubclassModel(keras.Model): - """A subclass model.""" + """A subclass model.""" - def __init__(self, input_dim=3): - super().__init__(name='my_subclass_model') - self._config = {'input_dim': input_dim} - self.dense1 = keras.layers.Dense(8, activation='relu') - self.dense2 = keras.layers.Dense(2, activation='softmax') - self.bn = keras.layers.BatchNormalization() - self.dp = keras.layers.Dropout(0.5) + def __init__(self, input_dim=3): + super().__init__(name="my_subclass_model") + self._config = {"input_dim": input_dim} + self.dense1 = keras.layers.Dense(8, activation="relu") + self.dense2 = keras.layers.Dense(2, activation="softmax") + self.bn = keras.layers.BatchNormalization() + self.dp = keras.layers.Dropout(0.5) - def call(self, inputs, **kwargs): - x = self.dense1(inputs) - x = self.dp(x) - x = self.bn(x) - return self.dense2(x) + def call(self, inputs, **kwargs): + x = self.dense1(inputs) + x = self.dp(x) + x = self.bn(x) + return self.dense2(x) - def get_config(self): - return self._config + def get_config(self): + return self._config - @classmethod - def from_config(cls, config): - return cls(**config) + @classmethod + def from_config(cls, config): + return cls(**config) def nested_subclassed_model(): - """A subclass model nested in another subclass model.""" + """A subclass model nested in another subclass model.""" - class NestedSubclassModel(keras.Model): - """A nested subclass model.""" + class NestedSubclassModel(keras.Model): + """A nested subclass model.""" - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(4, activation='relu') - self.dense2 = keras.layers.Dense(2, activation='relu') - self.bn = 
keras.layers.BatchNormalization() - self.inner_subclass_model = MySubclassModel() + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(4, activation="relu") + self.dense2 = keras.layers.Dense(2, activation="relu") + self.bn = keras.layers.BatchNormalization() + self.inner_subclass_model = MySubclassModel() - def call(self, inputs): - x = self.dense1(inputs) - x = self.bn(x) - x = self.inner_subclass_model(x) - return self.dense2(x) + def call(self, inputs): + x = self.dense1(inputs) + x = self.bn(x) + x = self.inner_subclass_model(x) + return self.dense2(x) - return ModelFn(NestedSubclassModel(), (None, 3), (None, 2)) + return ModelFn(NestedSubclassModel(), (None, 3), (None, 2)) def nested_subclassed_in_functional_model(): - """A subclass model nested in a functional model.""" - inner_subclass_model = MySubclassModel() - inputs = keras.Input(shape=(3,)) - x = inner_subclass_model(inputs) - x = keras.layers.BatchNormalization()(x) - outputs = keras.layers.Dense(2, activation='softmax')(x) - model = keras.Model(inputs, outputs) - return ModelFn(model, (None, 3), (None, 2)) + """A subclass model nested in a functional model.""" + inner_subclass_model = MySubclassModel() + inputs = keras.Input(shape=(3,)) + x = inner_subclass_model(inputs) + x = keras.layers.BatchNormalization()(x) + outputs = keras.layers.Dense(2, activation="softmax")(x) + model = keras.Model(inputs, outputs) + return ModelFn(model, (None, 3), (None, 2)) def nested_functional_in_subclassed_model(): - """A functional model nested in a subclass model.""" - def get_functional_model(): - inputs = keras.Input(shape=(4,)) - x = keras.layers.Dense(4, activation='relu')(inputs) - x = keras.layers.BatchNormalization()(x) - outputs = keras.layers.Dense(2)(x) - return keras.Model(inputs, outputs) + """A functional model nested in a subclass model.""" - class NestedFunctionalInSubclassModel(keras.Model): - """A functional nested in subclass model.""" + def get_functional_model(): + inputs = keras.Input(shape=(4,)) + x = keras.layers.Dense(4, activation="relu")(inputs) + x = keras.layers.BatchNormalization()(x) + outputs = keras.layers.Dense(2)(x) + return keras.Model(inputs, outputs) - def __init__(self): - super().__init__( - name='nested_functional_in_subclassed_model') - self.dense1 = keras.layers.Dense(4, activation='relu') - self.dense2 = keras.layers.Dense(2, activation='relu') - self.inner_functional_model = get_functional_model() + class NestedFunctionalInSubclassModel(keras.Model): + """A functional model nested in a subclass model.""" - def call(self, inputs): - x = self.dense1(inputs) - x = self.inner_functional_model(x) - return self.dense2(x) - return ModelFn(NestedFunctionalInSubclassModel(), (None, 3), (None, 2)) + def __init__(self): + super().__init__(name="nested_functional_in_subclassed_model") + self.dense1 = keras.layers.Dense(4, activation="relu") + self.dense2 = keras.layers.Dense(2, activation="relu") + self.inner_functional_model = get_functional_model() + + def call(self, inputs): + x = self.dense1(inputs) + x = self.inner_functional_model(x) + return self.dense2(x) + + return ModelFn(NestedFunctionalInSubclassModel(), (None, 3), (None, 2)) def shared_layer_subclassed_model(): - """Shared layer in a subclass model.""" + """Shared layer in a subclass model.""" + + class SharedLayerSubclassModel(keras.Model): + """A subclass model with shared layers.""" - class SharedLayerSubclassModel(keras.Model): - """A subclass model with shared layers.""" + def __init__(self): +
super().__init__(name="shared_layer_subclass_model") + self.dense = keras.layers.Dense(3, activation="relu") + self.dp = keras.layers.Dropout(0.5) + self.bn = keras.layers.BatchNormalization() - def __init__(self): - super().__init__( - name='shared_layer_subclass_model') - self.dense = keras.layers.Dense(3, activation='relu') - self.dp = keras.layers.Dropout(0.5) - self.bn = keras.layers.BatchNormalization() + def call(self, inputs): + x = self.dense(inputs) + x = self.dp(x) + x = self.bn(x) + return self.dense(x) - def call(self, inputs): - x = self.dense(inputs) - x = self.dp(x) - x = self.bn(x) - return self.dense(x) - return ModelFn(SharedLayerSubclassModel(), (None, 3), (None, 3)) + return ModelFn(SharedLayerSubclassModel(), (None, 3), (None, 3)) def functional_with_keyword_args(): - """A functional model with keyword args.""" - inputs = keras.Input(shape=(3,)) - x = keras.layers.Dense(4)(inputs) - x = keras.layers.BatchNormalization()(x) - outputs = keras.layers.Dense(2)(x) + """A functional model with keyword args.""" + inputs = keras.Input(shape=(3,)) + x = keras.layers.Dense(4)(inputs) + x = keras.layers.BatchNormalization()(x) + outputs = keras.layers.Dense(2)(x) - model = keras.Model(inputs, outputs, name='m', trainable=False) - return ModelFn(model, (None, 3), (None, 2)) + model = keras.Model(inputs, outputs, name="m", trainable=False) + return ModelFn(model, (None, 3), (None, 2)) ALL_MODELS = [ - ('basic_sequential', basic_sequential), - ('basic_sequential_deferred', basic_sequential_deferred), - ('stacked_rnn', stacked_rnn), - ('lstm', lstm), - ('multi_input_multi_output', multi_input_multi_output), - ('nested_sequential_in_functional', nested_sequential_in_functional), - ('seq_to_seq', seq_to_seq), - ('shared_layer_functional', shared_layer_functional), - ('shared_sequential', shared_sequential), - ('nested_subclassed_model', nested_subclassed_model), - ('nested_subclassed_in_functional_model', - nested_subclassed_in_functional_model), - ('nested_functional_in_subclassed_model', - nested_functional_in_subclassed_model), - ('shared_layer_subclassed_model', shared_layer_subclassed_model), - ('functional_with_keyword_args', functional_with_keyword_args) + ("basic_sequential", basic_sequential), + ("basic_sequential_deferred", basic_sequential_deferred), + ("stacked_rnn", stacked_rnn), + ("lstm", lstm), + ("multi_input_multi_output", multi_input_multi_output), + ("nested_sequential_in_functional", nested_sequential_in_functional), + ("seq_to_seq", seq_to_seq), + ("shared_layer_functional", shared_layer_functional), + ("shared_sequential", shared_sequential), + ("nested_subclassed_model", nested_subclassed_model), + ( + "nested_subclassed_in_functional_model", + nested_subclassed_in_functional_model, + ), + ( + "nested_functional_in_subclassed_model", + nested_functional_in_subclassed_model, + ), + ("shared_layer_subclassed_model", shared_layer_subclassed_model), + ("functional_with_keyword_args", functional_with_keyword_args), ] def get_models(exclude_models=None): - """Get all models excluding the specified ones.""" - models = [model for model in ALL_MODELS - if model[0] not in exclude_models] - return models + """Get all models excluding the specified ones.""" + models = [model for model in ALL_MODELS if model[0] not in exclude_models] + return models diff --git a/keras/tests/model_architectures_test.py b/keras/tests/model_architectures_test.py index b8f4637d7430..73193c3b1117 100644 --- a/keras/tests/model_architectures_test.py +++ b/keras/tests/model_architectures_test.py 
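The hunks that follow belong to keras/tests/model_architectures_test.py, which drives every entry in the ALL_MODELS registry defined above. As a rough sketch of how those registry entries can be consumed on their own (assuming ModelFn is a namedtuple with fields model, input_shape, and target_shape, defined earlier in model_architectures.py outside this excerpt; an explicit empty sequence is passed for exclude_models because the None default would fail the membership test in get_models):

    import numpy as np

    from keras.tests import model_architectures

    for name, model_fn in model_architectures.get_models(exclude_models=()):
        data = model_fn()
        # input_shape is a list for multi-input models, a tuple otherwise.
        if isinstance(data.input_shape, list):
            x = [np.random.random((2,) + s[1:]) for s in data.input_shape]
        else:
            x = np.random.random((2,) + data.input_shape[1:])
        out = data.model.predict(x)  # shape(s) should line up with data.target_shape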
@@ -12,97 +12,96 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access -"""Tests for saving/loading function for keras Model.""" -import tensorflow.compat.v2 as tf +"""Tests for saving/loading function for keras Model.""" import os import shutil -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from keras.testing_infra import test_combinations from keras.optimizers import optimizer_v1 +from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.tests import model_architectures @test_combinations.run_with_all_saved_model_formats class TestModelArchitectures(test_combinations.TestCase): - - def _save_model_dir(self, dirname='saved_model'): - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - return os.path.join(temp_dir, dirname) - - def get_test_data(self, input_shape, target_shape): - """Generate test dataset for testing.""" - if isinstance(input_shape, list): - x = [ - np.random.random((2,) + input_shape[i][1:]) - for i in range(len(input_shape)) - ] - else: - x = np.random.random((2,) + input_shape[1:]) - - if isinstance(target_shape, list): - y = [ - np.random.random((2,) + target_shape[i][1:]) - for i in range(len(target_shape)) - ] - else: - y = np.random.random((2,) + target_shape[1:]) - - return x, y - - def get_custom_objects(self): - """Define custom_objects.""" - - class CustomOpt(optimizer_v1.SGD): - pass - - def custom_loss(y_true, y_pred): - return keras.losses.mse(y_true, y_pred) - - return {'CustomOpt': CustomOpt, - 'custom_loss': custom_loss} - - @parameterized.named_parameters(*model_architectures.ALL_MODELS) - def test_basic_saving_and_loading(self, model_fn): - save_format = test_utils.get_save_format() - custom_objects = self.get_custom_objects() - if 'subclassed_in_functional' in model_fn.__name__: - subclass_custom_objects = { - 'MySubclassModel': - model_architectures.MySubclassModel, - } - custom_objects.update(subclass_custom_objects) - elif ('subclassed' in model_fn.__name__ and save_format == 'h5'): - self.skipTest('Saving the model to HDF5 format requires the model to be ' - 'a Functional model or a Sequential model.') - - saved_model_dir = self._save_model_dir() - model_data = model_fn() - model = model_data.model - x_test, y_test = self.get_test_data( - model_data.input_shape, model_data.target_shape) - model.compile('rmsprop', 'mse') - model.train_on_batch(x_test, y_test) - - # Save model. - out1 = model.predict(x_test) - keras.models.save_model(model, saved_model_dir, save_format=save_format) - # Load model. 
- loaded_model = keras.models.load_model( - saved_model_dir, - custom_objects=custom_objects) - out2 = loaded_model.predict(x_test) - - self.assertAllClose(out1, out2, atol=1e-05) - - -if __name__ == '__main__': - tf.test.main() + def _save_model_dir(self, dirname="saved_model"): + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + return os.path.join(temp_dir, dirname) + + def get_test_data(self, input_shape, target_shape): + """Generate test dataset for testing.""" + if isinstance(input_shape, list): + x = [ + np.random.random((2,) + input_shape[i][1:]) + for i in range(len(input_shape)) + ] + else: + x = np.random.random((2,) + input_shape[1:]) + + if isinstance(target_shape, list): + y = [ + np.random.random((2,) + target_shape[i][1:]) + for i in range(len(target_shape)) + ] + else: + y = np.random.random((2,) + target_shape[1:]) + + return x, y + + def get_custom_objects(self): + """Define custom_objects.""" + + class CustomOpt(optimizer_v1.SGD): + pass + + def custom_loss(y_true, y_pred): + return keras.losses.mse(y_true, y_pred) + + return {"CustomOpt": CustomOpt, "custom_loss": custom_loss} + + @parameterized.named_parameters(*model_architectures.ALL_MODELS) + def test_basic_saving_and_loading(self, model_fn): + save_format = test_utils.get_save_format() + custom_objects = self.get_custom_objects() + if "subclassed_in_functional" in model_fn.__name__: + subclass_custom_objects = { + "MySubclassModel": model_architectures.MySubclassModel, + } + custom_objects.update(subclass_custom_objects) + elif "subclassed" in model_fn.__name__ and save_format == "h5": + self.skipTest( + "Saving the model to HDF5 format requires the model to be " + "a Functional model or a Sequential model." + ) + + saved_model_dir = self._save_model_dir() + model_data = model_fn() + model = model_data.model + x_test, y_test = self.get_test_data( + model_data.input_shape, model_data.target_shape + ) + model.compile("rmsprop", "mse") + model.train_on_batch(x_test, y_test) + + # Save model. + out1 = model.predict(x_test) + keras.models.save_model(model, saved_model_dir, save_format=save_format) + # Load model. 
+ loaded_model = keras.models.load_model( + saved_model_dir, custom_objects=custom_objects + ) + out2 = loaded_model.predict(x_test) + + self.assertAllClose(out1, out2, atol=1e-05) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/model_subclassing_compiled_test.py b/keras/tests/model_subclassing_compiled_test.py index fd60b326d9fb..1a93734f4f20 100644 --- a/keras/tests/model_subclassing_compiled_test.py +++ b/keras/tests/model_subclassing_compiled_test.py @@ -14,11 +14,10 @@ # ============================================================================== """Tests for compiled Model subclassing.""" -import tensorflow.compat.v2 as tf - import os import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.testing_infra import test_combinations @@ -26,413 +25,455 @@ from keras.tests import model_subclassing_test_util as model_util try: - import h5py # pylint:disable=g-import-not-at-top + import h5py except ImportError: - h5py = None + h5py = None @test_combinations.run_all_keras_modes class ModelSubclassCompiledTest(test_combinations.TestCase): - - def test_single_io_workflow_with_np_arrays(self): - num_classes = 2 - num_samples = 100 - input_dim = 50 - - model = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=['acc', keras.metrics.CategoricalAccuracy()], - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((num_samples, input_dim)) - y = np.zeros((num_samples, num_classes)) - - model.fit(x, y, epochs=2, batch_size=32, verbose=0) - _ = model.evaluate(x, y, verbose=0) - - def test_multi_io_workflow_with_np_arrays(self): - num_classes = (2, 3) - num_samples = 1000 - input_dim = 50 - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_dp=True, use_bn=True) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - - x1 = np.ones((num_samples, input_dim)) - x2 = np.ones((num_samples, input_dim)) - y1 = np.zeros((num_samples, num_classes[0])) - y2 = np.zeros((num_samples, num_classes[1])) - - model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) - _ = model.evaluate([x1, x2], [y1, y2], verbose=0) - - def test_single_io_workflow_with_datasets(self): - num_classes = 2 - num_samples = 10 - input_dim = 50 - - with self.cached_session(): - model = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((num_samples, input_dim), dtype=np.float32) - y = np.zeros((num_samples, num_classes), dtype=np.float32) - dataset = tf.data.Dataset.from_tensor_slices((x, y)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - - model.fit(dataset, epochs=2, steps_per_epoch=10, verbose=0) - _ = model.evaluate(dataset, steps=10, verbose=0) - - def test_attributes(self): - # layers, weights, trainable_weights, non_trainable_weights, inputs, outputs - - num_classes = (2, 3) - num_samples = 100 - input_dim = 50 - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - - x1 = np.ones((num_samples, input_dim)) - x2 = np.ones((num_samples, input_dim)) - y1 = np.zeros((num_samples, num_classes[0])) - y2 = np.zeros((num_samples, num_classes[1])) - - self.assertEqual(model.name, 'test_model') - self.assertEqual(model.built, False) - 
self.assertEqual(len(model.weights), 0) - - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch([x1, x2], [y1, y2]) - - self.assertEqual(model.built, True) - self.assertEqual(len(model.layers), 4) - self.assertEqual(len(model.weights), 10) - self.assertEqual(len(model.trainable_weights), 8) - self.assertEqual(len(model.non_trainable_weights), 2) - - def test_updates(self): - # test that updates get run during training - num_samples = 100 - input_dim = 50 - - class BNNet(keras.Model): - - def __init__(self): - super().__init__() - self.bn = keras.layers.BatchNormalization(beta_initializer='ones', - gamma_initializer='ones') - - def call(self, inputs): - return self.bn(inputs) - - x = np.ones((num_samples, input_dim)) - y = np.ones((num_samples, input_dim)) - - model = BNNet() - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - y_ref = model.predict(x) - - model.train_on_batch(x, y) - y_new = model.predict(x) - self.assertGreater(np.sum(np.abs(y_ref - y_new)), 0.1) - - def test_training_and_inference_behavior(self): - # test that dropout is applied in training and not inference - - num_samples = 100 - input_dim = 50 - - class DPNet(keras.Model): - - def __init__(self): - super().__init__() - self.dp = keras.layers.Dropout(0.5) - self.dense = keras.layers.Dense(1, - use_bias=False, - kernel_initializer='ones') - - def call(self, inputs): - x = self.dp(inputs) - return self.dense(x) - - model = DPNet() - x = np.ones((num_samples, input_dim)) - y = model.predict(x) - self.assertEqual(np.sum(y), np.sum(x)) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(x, y) - self.assertGreater(loss, 0.1) - - def test_training_methods(self): - # test fit, train_on_batch - # on different input types: list, dict - - num_classes = (2, 3) - num_samples = 100 - input_dim = 50 - - x1 = np.ones((num_samples, input_dim)) - x2 = np.ones((num_samples, input_dim)) - y1 = np.zeros((num_samples, num_classes[0])) - y2 = np.zeros((num_samples, num_classes[1])) - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) - model.fit({'input_1': x1, 'input_2': x2}, - {'output_1': y1, 'output_2': y2}, - epochs=2, batch_size=32) - model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0, - validation_data=([x1, x2], [y1, y2])) - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.train_on_batch([x1, x2], [y1, y2]) - model.train_on_batch({'input_1': x1, 'input_2': x2}, - {'output_1': y1, 'output_2': y2}) - - def test_inference_methods(self): - # test predict, evaluate, test_on_batch, predict_on_batch - # on different input types: list, dict - num_classes = (2, 3) - num_samples = 100 - input_dim = 50 - - x1 = np.ones((num_samples, input_dim)) - x2 = np.ones((num_samples, input_dim)) - y1 = np.zeros((num_samples, num_classes[0])) - y2 = np.zeros((num_samples, num_classes[1])) - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - 
model.evaluate([x1, x2], [y1, y2]) - model.test_on_batch([x1, x2], [y1, y2]) - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - model.predict([x1, x2]) - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - model.predict_on_batch([x1, x2]) - - def test_saving(self): - num_classes = (2, 3) - num_samples = 100 - input_dim = 50 - - x1 = np.ones((num_samples, input_dim)) - x2 = np.ones((num_samples, input_dim)) - y1 = np.zeros((num_samples, num_classes[0])) - y2 = np.zeros((num_samples, num_classes[1])) - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) - y_ref_1, y_ref_2 = model.predict([x1, x2]) - - tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt') - model.save_weights(tf_format_name) - if h5py is not None: - hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5') - model.save_weights(hdf5_format_name) - - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_bn=True) - - if h5py is not None: - with self.assertRaises(ValueError): - model.load_weights(hdf5_format_name) - - model.load_weights(tf_format_name) - - y1, y2 = model.predict([x1, x2]) - self.assertAllClose(y_ref_1, y1, atol=1e-5) - self.assertAllClose(y_ref_2, y2, atol=1e-5) - - if h5py is not None: - model.load_weights(hdf5_format_name) - - y1, y2 = model.predict([x1, x2]) - self.assertAllClose(y_ref_1, y1, atol=1e-5) - self.assertAllClose(y_ref_2, y2, atol=1e-5) - - def test_subclass_nested_in_subclass(self): - num_classes = 2 - num_samples = 100 - input_dim = 50 - - model = model_util.NestedTestModel1(num_classes=num_classes) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((num_samples, input_dim)) - y = np.zeros((num_samples, num_classes)) - - model.fit(x, y, epochs=2, batch_size=32, verbose=0) - _ = model.evaluate(x, y, verbose=0) - - self.assertEqual(len(model.weights), 8 + len(model.test_net.weights)) - self.assertEqual(len(model.non_trainable_weights), - 2 + len(model.test_net.non_trainable_weights)) - self.assertEqual(len(model.trainable_weights), - 6 + len(model.test_net.trainable_weights)) - - def test_graph_nested_in_subclass(self): - num_classes = 2 - num_samples = 100 - input_dim = 50 - - model = model_util.NestedTestModel2(num_classes=num_classes) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((num_samples, input_dim)) - y = np.zeros((num_samples, num_classes)) - - model.fit(x, y, epochs=2, batch_size=32, verbose=0) - _ = model.evaluate(x, y, verbose=0) - - self.assertEqual(len(model.weights), 8 + len(model.test_net.weights)) - self.assertEqual(len(model.non_trainable_weights), - 2 + len(model.test_net.non_trainable_weights)) - self.assertEqual(len(model.trainable_weights), - 6 + len(model.test_net.trainable_weights)) - - def test_subclass_nested_in_graph(self): - num_classes = 2 - num_samples = 100 - input_dim = 50 - - model = model_util.get_nested_model_3( - input_dim=input_dim, num_classes=num_classes) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((num_samples, input_dim)) - y = np.zeros((num_samples, num_classes)) - - 
model.fit(x, y, epochs=2, batch_size=32, verbose=0) - _ = model.evaluate(x, y, verbose=0) - - self.assertEqual(len(model.weights), 16) - self.assertEqual(len(model.non_trainable_weights), 4) - self.assertEqual(len(model.trainable_weights), 12) - - def test_subclass_nested_in_sequential(self): - num_classes = 2 - num_samples = 100 - input_dim = 50 - - class Inner(keras.Model): - - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(32, activation='relu') - self.dense2 = keras.layers.Dense(num_classes, activation='relu') - self.bn = keras.layers.BatchNormalization() - - def call(self, inputs): - x = self.dense1(inputs) - x = self.dense2(x) - return self.bn(x) - - model = keras.Sequential([Inner()]) - model.compile( - loss='mse', - optimizer='rmsprop', - metrics=['acc'], - run_eagerly=test_utils.should_run_eagerly()) - - x = np.ones((num_samples, input_dim)) - y = np.zeros((num_samples, num_classes)) - model.fit(x, y, epochs=2, batch_size=32, verbose=0) - _ = model.evaluate(x, y, verbose=0) - - self.assertEqual(len(model.weights), 8) - self.assertEqual(len(model.non_trainable_weights), 2) - self.assertEqual(len(model.trainable_weights), 6) - - def test_support_for_manual_training_arg(self): - # In most cases, the `training` argument is left unspecified, in which - # case it defaults to value corresponding to the Model method being used - # (fit -> True, predict -> False, etc). - # If the user writes their model `call` method to take - # an explicit `training` argument, we must check that the correct value - # is being passed to the model for each method call. - - class DPNet(keras.Model): - - def __init__(self): - super().__init__() - self.dp = keras.layers.Dropout(0.5) - self.dense = keras.layers.Dense(1, - use_bias=False, - kernel_initializer='ones') - - def call(self, inputs, training=False): - x = self.dp(inputs, training=training) - return self.dense(x) - - model = DPNet() - x = np.ones((10, 10)) - y = model.predict(x) - self.assertEqual(np.sum(y), np.sum(x)) - model.compile( - loss='mse', - optimizer='rmsprop', - run_eagerly=test_utils.should_run_eagerly()) - loss = model.train_on_batch(x, y) - self.assertGreater(loss, 0.1) - - -if __name__ == '__main__': - tf.test.main() + def test_single_io_workflow_with_np_arrays(self): + num_classes = 2 + num_samples = 100 + input_dim = 50 + + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True + ) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=["acc", keras.metrics.CategoricalAccuracy()], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((num_samples, input_dim)) + y = np.zeros((num_samples, num_classes)) + + model.fit(x, y, epochs=2, batch_size=32, verbose=0) + _ = model.evaluate(x, y, verbose=0) + + def test_multi_io_workflow_with_np_arrays(self): + num_classes = (2, 3) + num_samples = 1000 + input_dim = 50 + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_dp=True, use_bn=True + ) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x1 = np.ones((num_samples, input_dim)) + x2 = np.ones((num_samples, input_dim)) + y1 = np.zeros((num_samples, num_classes[0])) + y2 = np.zeros((num_samples, num_classes[1])) + + model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) + _ = model.evaluate([x1, x2], [y1, y2], verbose=0) + + def test_single_io_workflow_with_datasets(self): + num_classes = 2 + num_samples = 10 + input_dim = 50 + + 
with self.cached_session(): + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True + ) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((num_samples, input_dim), dtype=np.float32) + y = np.zeros((num_samples, num_classes), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + model.fit(dataset, epochs=2, steps_per_epoch=10, verbose=0) + _ = model.evaluate(dataset, steps=10, verbose=0) + + def test_attributes(self): + # layers, weights, trainable_weights, non_trainable_weights, inputs, + # outputs + + num_classes = (2, 3) + num_samples = 100 + input_dim = 50 + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + + x1 = np.ones((num_samples, input_dim)) + x2 = np.ones((num_samples, input_dim)) + y1 = np.zeros((num_samples, num_classes[0])) + y2 = np.zeros((num_samples, num_classes[1])) + + self.assertEqual(model.name, "test_model") + self.assertEqual(model.built, False) + self.assertEqual(len(model.weights), 0) + + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch([x1, x2], [y1, y2]) + + self.assertEqual(model.built, True) + self.assertEqual(len(model.layers), 4) + self.assertEqual(len(model.weights), 10) + self.assertEqual(len(model.trainable_weights), 8) + self.assertEqual(len(model.non_trainable_weights), 2) + + def test_updates(self): + # test that updates get run during training + num_samples = 100 + input_dim = 50 + + class BNNet(keras.Model): + def __init__(self): + super().__init__() + self.bn = keras.layers.BatchNormalization( + beta_initializer="ones", gamma_initializer="ones" + ) + + def call(self, inputs): + return self.bn(inputs) + + x = np.ones((num_samples, input_dim)) + y = np.ones((num_samples, input_dim)) + + model = BNNet() + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + y_ref = model.predict(x) + + model.train_on_batch(x, y) + y_new = model.predict(x) + self.assertGreater(np.sum(np.abs(y_ref - y_new)), 0.1) + + def test_training_and_inference_behavior(self): + # test that dropout is applied in training and not inference + + num_samples = 100 + input_dim = 50 + + class DPNet(keras.Model): + def __init__(self): + super().__init__() + self.dp = keras.layers.Dropout(0.5) + self.dense = keras.layers.Dense( + 1, use_bias=False, kernel_initializer="ones" + ) + + def call(self, inputs): + x = self.dp(inputs) + return self.dense(x) + + model = DPNet() + x = np.ones((num_samples, input_dim)) + y = model.predict(x) + self.assertEqual(np.sum(y), np.sum(x)) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + loss = model.train_on_batch(x, y) + self.assertGreater(loss, 0.1) + + def test_training_methods(self): + # test fit, train_on_batch + # on different input types: list, dict + + num_classes = (2, 3) + num_samples = 100 + input_dim = 50 + + x1 = np.ones((num_samples, input_dim)) + x2 = np.ones((num_samples, input_dim)) + y1 = np.zeros((num_samples, num_classes[0])) + y2 = np.zeros((num_samples, num_classes[1])) + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit([x1, x2], [y1, y2], epochs=2, 
batch_size=32, verbose=0) + model.fit( + {"input_1": x1, "input_2": x2}, + {"output_1": y1, "output_2": y2}, + epochs=2, + batch_size=32, + ) + model.fit( + [x1, x2], + [y1, y2], + epochs=2, + batch_size=32, + verbose=0, + validation_data=([x1, x2], [y1, y2]), + ) + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.train_on_batch([x1, x2], [y1, y2]) + model.train_on_batch( + {"input_1": x1, "input_2": x2}, {"output_1": y1, "output_2": y2} + ) + + def test_inference_methods(self): + # test predict, evaluate, test_on_batch, predict_on_batch + # on different input types: list, dict + num_classes = (2, 3) + num_samples = 100 + input_dim = 50 + + x1 = np.ones((num_samples, input_dim)) + x2 = np.ones((num_samples, input_dim)) + y1 = np.zeros((num_samples, num_classes[0])) + y2 = np.zeros((num_samples, num_classes[1])) + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.evaluate([x1, x2], [y1, y2]) + model.test_on_batch([x1, x2], [y1, y2]) + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + model.predict([x1, x2]) + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + model.predict_on_batch([x1, x2]) + + def test_saving(self): + num_classes = (2, 3) + num_samples = 100 + input_dim = 50 + + x1 = np.ones((num_samples, input_dim)) + x2 = np.ones((num_samples, input_dim)) + y1 = np.zeros((num_samples, num_classes[0])) + y2 = np.zeros((num_samples, num_classes[1])) + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) + y_ref_1, y_ref_2 = model.predict([x1, x2]) + + tf_format_name = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(tf_format_name) + if h5py is not None: + hdf5_format_name = os.path.join(self.get_temp_dir(), "weights.h5") + model.save_weights(hdf5_format_name) + + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_bn=True + ) + + if h5py is not None: + with self.assertRaises(ValueError): + model.load_weights(hdf5_format_name) + + model.load_weights(tf_format_name) + + y1, y2 = model.predict([x1, x2]) + self.assertAllClose(y_ref_1, y1, atol=1e-5) + self.assertAllClose(y_ref_2, y2, atol=1e-5) + + if h5py is not None: + model.load_weights(hdf5_format_name) + + y1, y2 = model.predict([x1, x2]) + self.assertAllClose(y_ref_1, y1, atol=1e-5) + self.assertAllClose(y_ref_2, y2, atol=1e-5) + + def test_subclass_nested_in_subclass(self): + num_classes = 2 + num_samples = 100 + input_dim = 50 + + model = model_util.NestedTestModel1(num_classes=num_classes) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((num_samples, input_dim)) + y = np.zeros((num_samples, num_classes)) + + model.fit(x, y, epochs=2, batch_size=32, verbose=0) + _ = model.evaluate(x, y, verbose=0) + + self.assertEqual(len(model.weights), 8 + len(model.test_net.weights)) + self.assertEqual( + len(model.non_trainable_weights), + 2 + len(model.test_net.non_trainable_weights), + ) + self.assertEqual( + 
len(model.trainable_weights), + 6 + len(model.test_net.trainable_weights), + ) + + def test_graph_nested_in_subclass(self): + num_classes = 2 + num_samples = 100 + input_dim = 50 + + model = model_util.NestedTestModel2(num_classes=num_classes) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((num_samples, input_dim)) + y = np.zeros((num_samples, num_classes)) + + model.fit(x, y, epochs=2, batch_size=32, verbose=0) + _ = model.evaluate(x, y, verbose=0) + + self.assertEqual(len(model.weights), 8 + len(model.test_net.weights)) + self.assertEqual( + len(model.non_trainable_weights), + 2 + len(model.test_net.non_trainable_weights), + ) + self.assertEqual( + len(model.trainable_weights), + 6 + len(model.test_net.trainable_weights), + ) + + def test_subclass_nested_in_graph(self): + num_classes = 2 + num_samples = 100 + input_dim = 50 + + model = model_util.get_nested_model_3( + input_dim=input_dim, num_classes=num_classes + ) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((num_samples, input_dim)) + y = np.zeros((num_samples, num_classes)) + + model.fit(x, y, epochs=2, batch_size=32, verbose=0) + _ = model.evaluate(x, y, verbose=0) + + self.assertEqual(len(model.weights), 16) + self.assertEqual(len(model.non_trainable_weights), 4) + self.assertEqual(len(model.trainable_weights), 12) + + def test_subclass_nested_in_sequential(self): + num_classes = 2 + num_samples = 100 + input_dim = 50 + + class Inner(keras.Model): + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(32, activation="relu") + self.dense2 = keras.layers.Dense(num_classes, activation="relu") + self.bn = keras.layers.BatchNormalization() + + def call(self, inputs): + x = self.dense1(inputs) + x = self.dense2(x) + return self.bn(x) + + model = keras.Sequential([Inner()]) + model.compile( + loss="mse", + optimizer="rmsprop", + metrics=["acc"], + run_eagerly=test_utils.should_run_eagerly(), + ) + + x = np.ones((num_samples, input_dim)) + y = np.zeros((num_samples, num_classes)) + model.fit(x, y, epochs=2, batch_size=32, verbose=0) + _ = model.evaluate(x, y, verbose=0) + + self.assertEqual(len(model.weights), 8) + self.assertEqual(len(model.non_trainable_weights), 2) + self.assertEqual(len(model.trainable_weights), 6) + + def test_support_for_manual_training_arg(self): + # In most cases, the `training` argument is left unspecified, in which + # case it defaults to the value corresponding to the Model method being + # used (fit -> True, predict -> False, etc). + # If the user writes their model `call` method to take + # an explicit `training` argument, we must check that the correct value + # is being passed to the model for each method call.
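Before the DPNet test below, the defaulting described in the comment above can be made concrete with a hypothetical probe layer (not part of this suite) that records the `training` value Keras hands it — a minimal sketch, assuming standard TF2 eager execution:

    import numpy as np

    import keras

    class TrainingProbe(keras.layers.Layer):
        def call(self, inputs, training=None):
            # Stores the `training` value received on the most recent call.
            self.last_training = training
            return inputs

    probe = TrainingProbe()
    model = keras.Sequential([probe, keras.layers.Dense(1)])
    model.compile("sgd", "mse")

    x = np.ones((4, 3))
    y = np.ones((4, 1))
    model.fit(x, y, epochs=1, verbose=0)  # probe.last_training -> True
    model.predict(x)                      # probe.last_training -> False

The DPNet test that follows checks the same propagation through an explicit `training` parameter on `call`.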
+ + class DPNet(keras.Model): + def __init__(self): + super().__init__() + self.dp = keras.layers.Dropout(0.5) + self.dense = keras.layers.Dense( + 1, use_bias=False, kernel_initializer="ones" + ) + + def call(self, inputs, training=False): + x = self.dp(inputs, training=training) + return self.dense(x) + + model = DPNet() + x = np.ones((10, 10)) + y = model.predict(x) + self.assertEqual(np.sum(y), np.sum(x)) + model.compile( + loss="mse", + optimizer="rmsprop", + run_eagerly=test_utils.should_run_eagerly(), + ) + loss = model.train_on_batch(x, y) + self.assertGreater(loss, 0.1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/model_subclassing_test.py b/keras/tests/model_subclassing_test.py index 8f86af2e11b9..dc56912e187b 100644 --- a/keras/tests/model_subclassing_test.py +++ b/keras/tests/model_subclassing_test.py @@ -14,739 +14,800 @@ # ============================================================================== """Tests for Model subclassing.""" -import tensorflow.compat.v2 as tf - import copy import os -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from tensorflow.python.framework import test_util as tf_test_utils from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.tests import model_subclassing_test_util as model_util -from tensorflow.python.training.tracking import data_structures + +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) +from tensorflow.python.trackable import data_structures try: - import h5py # pylint:disable=g-import-not-at-top + import h5py except ImportError: - h5py = None + h5py = None @test_combinations.run_all_keras_modes class ModelSubclassingTest(test_combinations.TestCase): - - def test_custom_build(self): - class DummyModel(keras.Model): - - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(32, activation='relu') - self.uses_custom_build = False - - def call(self, inputs): - return self.dense1(inputs) - - def build(self, input_shape): - self.uses_custom_build = True - - test_model = DummyModel() - dummy_data = tf.ones((32, 50)) - test_model(dummy_data) - self.assertTrue(test_model.uses_custom_build, 'Model should use user ' - 'defined build when called.') - - def test_attribute_conflict_error(self): - - class ModelWithProperty(keras.Model): - - @property - def read_only(self): - return 1. - - m = ModelWithProperty() - with self.assertRaisesRegex(AttributeError, 'read_only'): - m.read_only = 2. 
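One functional change hides in the import block of keras/tests/model_subclassing_test.py above: data_structures now comes from tensorflow.python.trackable rather than tensorflow.python.training.tracking, and the private-API imports sit behind an `# isort: off` marker so the formatter leaves their order alone. Code that must run against both module layouts can guard the import — a sketch, with the exact release where the module moved left as an assumption:

    try:
        # Newer TF releases expose the trackable helpers here.
        from tensorflow.python.trackable import data_structures
    except ImportError:
        # Older releases keep them under training.tracking.
        from tensorflow.python.training.tracking import data_structures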
- - def test_custom_build_with_fit(self): - - class DummyModel(keras.Model): - - def __init__(self): - super().__init__() - self.layer1 = keras.layers.Dense(10, activation='relu') - - def build(self, input_shape): - self.layer2 = keras.layers.Dense(1, activation='relu') - - def call(self, inputs): - return self.layer2(self.layer1(inputs)) - - model = DummyModel() - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, epochs=2) - self.assertLen(model.layers, 2) - self.assertLen(model.trainable_variables, 4) - - def test_dataset_dict_with_fit(self): - - class MyModel(keras.Model): - - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(1) - self.dense2 = keras.layers.Dense(1) - self.add = keras.layers.Add() - - def call(self, x): - return self.add([self.dense1(x['a']), self.dense2(x['b'])]) - - model = MyModel() - model.compile( - 'sgd', - 'mse', - run_eagerly=test_utils.should_run_eagerly()) - - data = tf.data.Dataset.from_tensor_slices(({ - 'a': np.ones((32, 10)), - 'b': np.ones((32, 20)) - }, np.ones((32, 1)))).batch(2) - model.fit(data, epochs=2) - - def test_invalid_input_shape_build(self): - num_classes = 2 - input_dim = 50 - - model = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) - - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - with self.assertRaisesRegex(ValueError, - 'input shape is not one of the valid types'): - model.build(input_shape=tf.compat.v1.Dimension(input_dim)) - - def test_embed_dtype_with_subclass_build(self): - class Embedding(keras.layers.Layer): - """An Embedding layer.""" - - def __init__(self, vocab_size, embedding_dim, **kwargs): - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.embedding_dim = embedding_dim - - def build(self, _): - self.embedding = self.add_weight( - 'embedding_kernel', - shape=[self.vocab_size, self.embedding_dim], - dtype=np.float32, - initializer=tf.compat.v1.random_uniform_initializer(-0.1, 0.1), - trainable=True) - - def call(self, x): - return tf.compat.v1.nn.embedding_lookup(self.embedding, x) - - class EmbedModel(keras.Model): - - def __init__(self, vocab_size, embed_size): - super().__init__() - self.embed1 = Embedding(vocab_size, embed_size) - - def call(self, inputs): - return self.embed1(inputs) - - model = EmbedModel(100, 20) - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - with self.assertRaisesRegex( - ValueError, 'if your layers do not support float type inputs'): - model.build(input_shape=(35, 20)) - - def test_single_time_step_rnn_build(self): - dim = 4 - timesteps = 1 - batch_input_shape = (None, timesteps, dim) - units = 3 - - class SimpleRNNModel(keras.Model): - - def __init__(self): - super().__init__() - self.lstm = keras.layers.LSTM(units) - - def call(self, inputs): - return self.lstm(inputs) - - model = SimpleRNNModel() - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - model.build(batch_input_shape) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling 
`build`.') - model(tf.ones((32, timesteps, dim))) - - def test_single_io_subclass_build(self): - num_classes = 2 - input_dim = 50 - batch_size = None - - model = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) - - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - model.build(input_shape=(batch_size, input_dim)) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - model(tf.ones((32, input_dim))) - - def test_single_io_dimension_subclass_build(self): - num_classes = 2 - input_dim = tf.compat.v1.Dimension(50) - batch_size = tf.compat.v1.Dimension(None) - - model = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) - - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - model.build(input_shape=(batch_size, input_dim)) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - model(tf.ones((32, input_dim))) - - def test_multidim_io_subclass_build(self): - num_classes = 10 - # Input size, e.g. image - batch_size = 32 - input_shape = (32, 32, 3) - - model = model_util.SimpleConvTestModel(num_classes) - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - batch_input_shape = (batch_size,) + input_shape - model.build(input_shape=batch_input_shape) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - - model(tf.ones(batch_input_shape)) - - def test_tensorshape_io_subclass_build(self): - num_classes = 10 - # Input size, e.g. image - batch_size = None - input_shape = (32, 32, 3) - - model = model_util.SimpleConvTestModel(num_classes) - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - model.build( - input_shape=tf.TensorShape((batch_size,) + input_shape)) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - - model(tf.ones((32,) + input_shape)) - - def test_subclass_save_model(self): - num_classes = 10 - # Input size, e.g. 
image - batch_size = None - input_shape = (32, 32, 3) - - model = model_util.SimpleConvTestModel(num_classes) - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - model.build( - input_shape=tf.TensorShape((batch_size,) + input_shape)) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - weights = model.get_weights() - - tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt') - model.save_weights(tf_format_name) - if h5py is not None: - hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5') - model.save_weights(hdf5_format_name) - - model = model_util.SimpleConvTestModel(num_classes) - model.build( - input_shape=tf.TensorShape((batch_size,) + input_shape)) - if h5py is not None: - model.load_weights(hdf5_format_name) - self.assertAllClose(weights, model.get_weights()) - model.load_weights(tf_format_name) - self.assertAllClose(weights, model.get_weights()) - - def test_multi_io_subclass_build(self): - batch_size = None - num_samples = 1000 - input_dim = 50 - model = model_util.get_multi_io_subclass_model() - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - batch_input_shape = tf.TensorShape((batch_size, input_dim)) - model.build( - input_shape=[batch_input_shape, batch_input_shape]) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - x1 = tf.ones((num_samples, input_dim)) - x2 = tf.ones((num_samples, input_dim)) - model([x1, x2]) - - def test_summary(self): - - class ToString: - - def __init__(self): - self.contents = '' - - def __call__(self, msg): - self.contents += msg + '\n' - - # Single-io - model = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=4, use_bn=True, use_dp=True) - model(np.ones((3, 4))) # need to build model first - print_fn = ToString() - model.summary(print_fn=print_fn) - self.assertIn('Trainable params: 356', print_fn.contents) - - # Multi-io - model = model_util.get_multi_io_subclass_model( - num_classes=(5, 6), use_bn=True, use_dp=True) - model([np.ones((3, 4)), np.ones((3, 4))]) # need to build model first - print_fn = ToString() - model.summary(print_fn=print_fn) - self.assertIn('Trainable params: 587', print_fn.contents) - - # Single-io with unused layer - model = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=4, use_bn=True, use_dp=True) - model.unused_layer = keras.layers.Dense(10) - model(np.ones((3, 4))) # need to build model first - print_fn = ToString() - model.summary(print_fn=print_fn) - self.assertIn('Trainable params: 356', print_fn.contents) - self.assertIn('0 (unused)', print_fn.contents) - - def test_no_dependency(self): - class Foo(keras.Model): - - def __init__(self): - super().__init__() - self.isdep = keras.layers.Dense(1) - self.notdep = data_structures.NoDependency(keras.layers.Dense(2)) - self.notdep_var = data_structures.NoDependency( - tf.Variable(1., name='notdep_var')) - - m = Foo() - self.assertEqual([m.isdep, m.notdep], m.layers) - self.assertEqual(1, len(m._trackable_children())) - self.assertIs(m.isdep, m._trackable_children()['isdep']) - self.assertEqual('notdep_var:0', m.notdep_var.name) - - def 
test_extra_variable(self): - - class ExtraVar(keras.Model): - - def __init__(self): - super().__init__() - self.dense = keras.layers.Dense(1) - self.var = tf.Variable(1.) - self.not_trainable_var = tf.Variable(2., trainable=False) - - def call(self, inputs): - return self.dense(inputs + self.var) - - m = ExtraVar() - self.assertTrue(m.trainable) - self.assertEqual([m.dense], m.layers) - self.assertEqual([m.var, m.not_trainable_var], m.variables) - self.assertEqual([m.var], m.trainable_variables) - self.assertEqual([m.not_trainable_var], m.non_trainable_variables) - self.assertLen(m.get_weights(), 2) - m.trainable = False - self.assertEqual([m.var, m.not_trainable_var], m.variables) - self.assertEqual([], m.trainable_variables) - self.assertEqual([m.var, m.not_trainable_var], m.non_trainable_variables) - self.assertLen(m.get_weights(), 2) - m.trainable = True - - m(tf.ones([1, 1])) - - self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.variables) - self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.weights) - - self.assertLen(m.get_weights(), 4) - self.assertEqual([m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var], - m.variables) - self.assertEqual([m.dense.kernel, m.dense.bias, m.var], - m.trainable_variables) - self.assertEqual([m.not_trainable_var], m.non_trainable_variables) - - m.dense.trainable = False - self.assertEqual( - [m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var], - m.variables) - self.assertEqual([m.var], m.trainable_variables) - self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var], - m.non_trainable_variables) - self.assertLen(m.get_weights(), 4) - - def test_add_weight_in_model(self): - - class MyModel(keras.Model): - - def __init__(self): - super().__init__() - self.b = self.add_weight('bias', (10,)) - self.c = self.add_weight('bias2', (10,), trainable=False) - - def call(self, inputs): - return inputs + self.b + self.c - - x = tf.convert_to_tensor(np.ones((10, 10), 'float32')) - model = MyModel() - model(x) - self.assertEqual(1, len(model.trainable_weights)) - self.assertEqual(1, len(model.non_trainable_weights)) - self.assertEqual(2, len(model.weights)) - - class MyModelCustomBuild(keras.Model): - - def build(self, input_shape): - self.b = self.add_weight('bias', (10,)) - self.c = self.add_weight('bias2', (10,), trainable=False) - - def call(self, inputs): - return inputs + self.b + self.c - - x = tf.convert_to_tensor(np.ones((10, 10), 'float32')) - model = MyModelCustomBuild() - model(x) - self.assertEqual(1, len(model.trainable_weights)) - self.assertEqual(1, len(model.non_trainable_weights)) - self.assertEqual(2, len(model.weights)) - - def test_add_update_in_model(self): - - class MyModel(keras.Model): - - def __init__(self): - super().__init__() - self.b = self.add_weight('bias', (10,)) - self.c = self.add_weight('bias2', (10,)) - - def call(self, inputs): - # Unconditional - self.add_update(self.b.assign(self.b * 2)) - # Conditional - self.add_update(self.c.assign(inputs[1, :])) - return inputs + self.b + self.c - - x = tf.convert_to_tensor(np.ones((10, 10), 'float32')) - model = MyModel() - model(x) - - if tf.executing_eagerly(): - self.assertEqual(0, len(model.updates)) - else: - self.assertEqual(2, len(model.updates)) - - -class GraphSpecificModelSubclassingTests(tf.test.TestCase): - - def test_single_io_workflow_with_tensors(self): - num_classes = 2 - num_samples = 10 - input_dim = 50 - - with tf.Graph().as_default(), self.cached_session(): - model = test_utils.SmallSubclassMLP( - num_hidden=32, 
num_classes=num_classes, use_dp=True, use_bn=True) - model.compile(loss='mse', optimizer='rmsprop') - - x = tf.ones((num_samples, input_dim)) - y = tf.zeros((num_samples, num_classes)) - - model.fit(x, y, epochs=2, steps_per_epoch=10, verbose=0) - _ = model.evaluate(steps=10, verbose=0) - - def test_multi_io_workflow_with_tensors(self): - num_classes = (2, 3) - num_samples = 10 - input_dim = 50 - - with tf.Graph().as_default(), self.cached_session(): - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_dp=True, use_bn=True) - model.compile(loss='mse', optimizer='rmsprop') - - x1 = tf.ones((num_samples, input_dim)) - x2 = tf.ones((num_samples, input_dim)) - y1 = tf.zeros((num_samples, num_classes[0])) - y2 = tf.zeros((num_samples, num_classes[1])) - - model.fit([x1, x2], [y1, y2], epochs=2, steps_per_epoch=10, verbose=0) - _ = model.evaluate(steps=10, verbose=0) - - def test_updates_and_losses_for_nested_models_in_subclassed_model(self): - - # Case 1: deferred-build sequential nested in subclass. - class TestModel1(keras.Model): - - def __init__(self): - super().__init__() - self.fc = keras.layers.Dense(10, input_shape=(784,), - activity_regularizer='l1') - self.bn = keras.Sequential([keras.layers.BatchNormalization(axis=1)]) - - def call(self, x): - return self.bn(self.fc(x)) - - with tf.compat.v1.get_default_graph().as_default(), self.cached_session(): - model = TestModel1() - - x = tf.ones(shape=[100, 784], dtype='float32') - model(x) - self.assertLen(model.updates, 2) - self.assertLen(model.losses, 1) - - # Case 2: placeholder-sequential nested in subclass. - class TestModel2(keras.Model): - - def __init__(self): - super().__init__() - self.fc = keras.layers.Dense(10, input_shape=(784,), - activity_regularizer='l1') - self.bn = keras.Sequential( - [keras.layers.BatchNormalization(axis=1, input_shape=(10,))]) - - def call(self, x): - return self.bn(self.fc(x)) - - with tf.compat.v1.get_default_graph().as_default(), self.cached_session(): - model = TestModel2() - - x = tf.ones(shape=[100, 784], dtype='float32') - model(x) - self.assertEqual(len(model.get_updates_for(x)), 2) - self.assertEqual(len(model.get_losses_for(x)), 1) - - # Case 3: functional-API model nested in subclass. 
- with tf.compat.v1.get_default_graph().as_default(): - inputs = keras.Input((10,)) - outputs = keras.layers.BatchNormalization(axis=1)(inputs) - bn = keras.Model(inputs, outputs) - - class TestModel3(keras.Model): - - def __init__(self): - super().__init__() - self.fc = keras.layers.Dense(10, input_shape=(784,), - activity_regularizer='l1') - self.bn = bn - - def call(self, x): - return self.bn(self.fc(x)) - - with self.cached_session(): - model = TestModel3() - - x = tf.ones(shape=[100, 784], dtype='float32') + def test_custom_build(self): + class DummyModel(keras.Model): + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(32, activation="relu") + self.uses_custom_build = False + + def call(self, inputs): + return self.dense1(inputs) + + def build(self, input_shape): + self.uses_custom_build = True + + test_model = DummyModel() + dummy_data = tf.ones((32, 50)) + test_model(dummy_data) + self.assertTrue( + test_model.uses_custom_build, + "Model should use user defined build when called.", + ) + + def test_attribute_conflict_error(self): + class ModelWithProperty(keras.Model): + @property + def read_only(self): + return 1.0 + + m = ModelWithProperty() + with self.assertRaisesRegex(AttributeError, "read_only"): + m.read_only = 2.0 + + def test_custom_build_with_fit(self): + class DummyModel(keras.Model): + def __init__(self): + super().__init__() + self.layer1 = keras.layers.Dense(10, activation="relu") + + def build(self, input_shape): + self.layer2 = keras.layers.Dense(1, activation="relu") + + def call(self, inputs): + return self.layer2(self.layer1(inputs)) + + model = DummyModel() + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, epochs=2) + self.assertLen(model.layers, 2) + self.assertLen(model.trainable_variables, 4) + + def test_dataset_dict_with_fit(self): + class MyModel(keras.Model): + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(1) + self.dense2 = keras.layers.Dense(1) + self.add = keras.layers.Add() + + def call(self, x): + return self.add([self.dense1(x["a"]), self.dense2(x["b"])]) + + model = MyModel() + model.compile("sgd", "mse", run_eagerly=test_utils.should_run_eagerly()) + + data = tf.data.Dataset.from_tensor_slices( + ({"a": np.ones((32, 10)), "b": np.ones((32, 20))}, np.ones((32, 1))) + ).batch(2) + model.fit(data, epochs=2) + + def test_invalid_input_shape_build(self): + num_classes = 2 + input_dim = 50 + + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True + ) + + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + with self.assertRaisesRegex( + ValueError, "input shape is not one of the valid types" + ): + model.build(input_shape=tf.compat.v1.Dimension(input_dim)) + + def test_embed_dtype_with_subclass_build(self): + class Embedding(keras.layers.Layer): + """An Embedding layer.""" + + def __init__(self, vocab_size, embedding_dim, **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + + def build(self, _): + self.embedding = self.add_weight( + "embedding_kernel", + shape=[self.vocab_size, self.embedding_dim], + dtype=np.float32, + initializer=tf.compat.v1.random_uniform_initializer( + -0.1, 0.1 + ), + trainable=True, + ) + + def call(self, x): + return tf.compat.v1.nn.embedding_lookup(self.embedding, 
x) + + class EmbedModel(keras.Model): + def __init__(self, vocab_size, embed_size): + super().__init__() + self.embed1 = Embedding(vocab_size, embed_size) + + def call(self, inputs): + return self.embed1(inputs) + + model = EmbedModel(100, 20) + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + with self.assertRaisesRegex( + ValueError, "if your layers do not support float type inputs" + ): + model.build(input_shape=(35, 20)) + + def test_single_time_step_rnn_build(self): + dim = 4 + timesteps = 1 + batch_input_shape = (None, timesteps, dim) + units = 3 + + class SimpleRNNModel(keras.Model): + def __init__(self): + super().__init__() + self.lstm = keras.layers.LSTM(units) + + def call(self, inputs): + return self.lstm(inputs) + + model = SimpleRNNModel() + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + model.build(batch_input_shape) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." + ) + model(tf.ones((32, timesteps, dim))) + + def test_single_io_subclass_build(self): + num_classes = 2 + input_dim = 50 + batch_size = None + + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True + ) + + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + model.build(input_shape=(batch_size, input_dim)) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." + ) + model(tf.ones((32, input_dim))) + + def test_single_io_dimension_subclass_build(self): + num_classes = 2 + input_dim = tf.compat.v1.Dimension(50) + batch_size = tf.compat.v1.Dimension(None) + + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True + ) + + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + model.build(input_shape=(batch_size, input_dim)) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." + ) + model(tf.ones((32, input_dim))) + + def test_multidim_io_subclass_build(self): + num_classes = 10 + # Input size, e.g. image + batch_size = 32 + input_shape = (32, 32, 3) + + model = model_util.SimpleConvTestModel(num_classes) + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + batch_input_shape = (batch_size,) + input_shape + model.build(input_shape=batch_input_shape) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." + ) + + model(tf.ones(batch_input_shape)) + + def test_tensorshape_io_subclass_build(self): + num_classes = 10 + # Input size, e.g. 
image + batch_size = None + input_shape = (32, 32, 3) + + model = model_util.SimpleConvTestModel(num_classes) + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + model.build(input_shape=tf.TensorShape((batch_size,) + input_shape)) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." + ) + + model(tf.ones((32,) + input_shape)) + + def test_subclass_save_model(self): + num_classes = 10 + # Input size, e.g. image + batch_size = None + input_shape = (32, 32, 3) + + model = model_util.SimpleConvTestModel(num_classes) + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + model.build(input_shape=tf.TensorShape((batch_size,) + input_shape)) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." + ) + weights = model.get_weights() + + tf_format_name = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(tf_format_name) + if h5py is not None: + hdf5_format_name = os.path.join(self.get_temp_dir(), "weights.h5") + model.save_weights(hdf5_format_name) + + model = model_util.SimpleConvTestModel(num_classes) + model.build(input_shape=tf.TensorShape((batch_size,) + input_shape)) + if h5py is not None: + model.load_weights(hdf5_format_name) + self.assertAllClose(weights, model.get_weights()) + model.load_weights(tf_format_name) + self.assertAllClose(weights, model.get_weights()) + + def test_multi_io_subclass_build(self): + batch_size = None + num_samples = 1000 + input_dim = 50 + model = model_util.get_multi_io_subclass_model() + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + batch_input_shape = tf.TensorShape((batch_size, input_dim)) + model.build(input_shape=[batch_input_shape, batch_input_shape]) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." 
+ ) + x1 = tf.ones((num_samples, input_dim)) + x2 = tf.ones((num_samples, input_dim)) + model([x1, x2]) + + def test_summary(self): + class ToString: + def __init__(self): + self.contents = "" + + def __call__(self, msg): + self.contents += msg + "\n" + + # Single-io + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=4, use_bn=True, use_dp=True + ) + model(np.ones((3, 4))) # need to build model first + print_fn = ToString() + model.summary(print_fn=print_fn) + self.assertIn("Trainable params: 356", print_fn.contents) + + # Multi-io + model = model_util.get_multi_io_subclass_model( + num_classes=(5, 6), use_bn=True, use_dp=True + ) + model([np.ones((3, 4)), np.ones((3, 4))]) # need to build model first + print_fn = ToString() + model.summary(print_fn=print_fn) + self.assertIn("Trainable params: 587", print_fn.contents) + + # Single-io with unused layer + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=4, use_bn=True, use_dp=True + ) + model.unused_layer = keras.layers.Dense(10) + model(np.ones((3, 4))) # need to build model first + print_fn = ToString() + model.summary(print_fn=print_fn) + self.assertIn("Trainable params: 356", print_fn.contents) + self.assertIn("0 (unused)", print_fn.contents) + + def test_no_dependency(self): + class Foo(keras.Model): + def __init__(self): + super().__init__() + self.isdep = keras.layers.Dense(1) + self.notdep = data_structures.NoDependency( + keras.layers.Dense(2) + ) + self.notdep_var = data_structures.NoDependency( + tf.Variable(1.0, name="notdep_var") + ) + + m = Foo() + self.assertEqual([m.isdep, m.notdep], m.layers) + self.assertEqual(1, len(m._trackable_children())) + self.assertIs(m.isdep, m._trackable_children()["isdep"]) + self.assertEqual("notdep_var:0", m.notdep_var.name) + + def test_extra_variable(self): + class ExtraVar(keras.Model): + def __init__(self): + super().__init__() + self.dense = keras.layers.Dense(1) + self.var = tf.Variable(1.0) + self.not_trainable_var = tf.Variable(2.0, trainable=False) + + def call(self, inputs): + return self.dense(inputs + self.var) + + m = ExtraVar() + self.assertTrue(m.trainable) + self.assertEqual([m.dense], m.layers) + self.assertEqual([m.var, m.not_trainable_var], m.variables) + self.assertEqual([m.var], m.trainable_variables) + self.assertEqual([m.not_trainable_var], m.non_trainable_variables) + self.assertLen(m.get_weights(), 2) + m.trainable = False + self.assertEqual([m.var, m.not_trainable_var], m.variables) + self.assertEqual([], m.trainable_variables) + self.assertEqual( + [m.var, m.not_trainable_var], m.non_trainable_variables + ) + self.assertLen(m.get_weights(), 2) + m.trainable = True + + m(tf.ones([1, 1])) + + self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.variables) + self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.weights) + + self.assertLen(m.get_weights(), 4) + self.assertEqual( + [m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var], + m.variables, + ) + self.assertEqual( + [m.dense.kernel, m.dense.bias, m.var], m.trainable_variables + ) + self.assertEqual([m.not_trainable_var], m.non_trainable_variables) + + m.dense.trainable = False + self.assertEqual( + [m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var], + m.variables, + ) + self.assertEqual([m.var], m.trainable_variables) + self.assertEqual( + [m.dense.kernel, m.dense.bias, m.not_trainable_var], + m.non_trainable_variables, + ) + self.assertLen(m.get_weights(), 4) + + def test_add_weight_in_model(self): + class MyModel(keras.Model): + def __init__(self): + 
super().__init__() + self.b = self.add_weight("bias", (10,)) + self.c = self.add_weight("bias2", (10,), trainable=False) + + def call(self, inputs): + return inputs + self.b + self.c + + x = tf.convert_to_tensor(np.ones((10, 10), "float32")) + model = MyModel() model(x) - self.assertEqual(len(model.get_updates_for(x)), 2) - self.assertEqual(len(model.get_losses_for(x)), 1) - - def test_multi_io_workflow_with_numpy_arrays_and_custom_placeholders(self): - num_classes = (2, 3) - num_samples = 1000 - input_dim = 50 + self.assertEqual(1, len(model.trainable_weights)) + self.assertEqual(1, len(model.non_trainable_weights)) + self.assertEqual(2, len(model.weights)) - with tf.Graph().as_default(), self.cached_session(): - model = model_util.get_multi_io_subclass_model( - num_classes=num_classes, use_dp=True, use_bn=True) - model.compile(loss='mse', optimizer='rmsprop') + class MyModelCustomBuild(keras.Model): + def build(self, input_shape): + self.b = self.add_weight("bias", (10,)) + self.c = self.add_weight("bias2", (10,), trainable=False) - x1 = np.ones((num_samples, input_dim)) - x2 = np.ones((num_samples, input_dim)) - y1 = np.zeros((num_samples, num_classes[0])) - y2 = np.zeros((num_samples, num_classes[1])) + def call(self, inputs): + return inputs + self.b + self.c - x2_placeholder = tf.compat.v1.placeholder( - dtype='float32', shape=(None, input_dim)) - model._set_inputs([x1, x2_placeholder]) + x = tf.convert_to_tensor(np.ones((10, 10), "float32")) + model = MyModelCustomBuild() + model(x) + self.assertEqual(1, len(model.trainable_weights)) + self.assertEqual(1, len(model.non_trainable_weights)) + self.assertEqual(2, len(model.weights)) + + def test_add_update_in_model(self): + class MyModel(keras.Model): + def __init__(self): + super().__init__() + self.b = self.add_weight("bias", (10,)) + self.c = self.add_weight("bias2", (10,)) + + def call(self, inputs): + # Unconditional + self.add_update(self.b.assign(self.b * 2)) + # Conditional + self.add_update(self.c.assign(inputs[1, :])) + return inputs + self.b + self.c + + x = tf.convert_to_tensor(np.ones((10, 10), "float32")) + model = MyModel() + model(x) - model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) - _ = model.evaluate([x1, x2], [y1, y2], verbose=0) + if tf.executing_eagerly(): + self.assertLen(model.updates, 0) + else: + self.assertLen(model.updates, 2) -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +class GraphSpecificModelSubclassingTests(tf.test.TestCase): + def test_single_io_workflow_with_tensors(self): + num_classes = 2 + num_samples = 10 + input_dim = 50 + + with tf.Graph().as_default(), self.cached_session(): + model = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True + ) + model.compile(loss="mse", optimizer="rmsprop") + + x = tf.ones((num_samples, input_dim)) + y = tf.zeros((num_samples, num_classes)) + + model.fit(x, y, epochs=2, steps_per_epoch=10, verbose=0) + _ = model.evaluate(steps=10, verbose=0) + + def test_multi_io_workflow_with_tensors(self): + num_classes = (2, 3) + num_samples = 10 + input_dim = 50 + + with tf.Graph().as_default(), self.cached_session(): + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_dp=True, use_bn=True + ) + model.compile(loss="mse", optimizer="rmsprop") + + x1 = tf.ones((num_samples, input_dim)) + x2 = tf.ones((num_samples, input_dim)) + y1 = tf.zeros((num_samples, num_classes[0])) + y2 = tf.zeros((num_samples, num_classes[1])) + + model.fit( + [x1, x2], [y1, y2], 
epochs=2, steps_per_epoch=10, verbose=0 + ) + _ = model.evaluate(steps=10, verbose=0) + + def test_updates_and_losses_for_nested_models_in_subclassed_model(self): + # Case 1: deferred-build sequential nested in subclass. + class TestModel1(keras.Model): + def __init__(self): + super().__init__() + self.fc = keras.layers.Dense( + 10, input_shape=(784,), activity_regularizer="l1" + ) + self.bn = keras.Sequential( + [keras.layers.BatchNormalization(axis=1)] + ) + + def call(self, x): + return self.bn(self.fc(x)) + + with tf.compat.v1.get_default_graph().as_default(), self.cached_session(): # noqa: E501 + model = TestModel1() + + x = tf.ones(shape=[100, 784], dtype="float32") + model(x) + self.assertLen(model.updates, 2) + self.assertLen(model.losses, 1) + + # Case 2: placeholder-sequential nested in subclass. + class TestModel2(keras.Model): + def __init__(self): + super().__init__() + self.fc = keras.layers.Dense( + 10, input_shape=(784,), activity_regularizer="l1" + ) + self.bn = keras.Sequential( + [keras.layers.BatchNormalization(axis=1, input_shape=(10,))] + ) + + def call(self, x): + return self.bn(self.fc(x)) + + with tf.compat.v1.get_default_graph().as_default(), self.cached_session(): # noqa: E501 + model = TestModel2() + + x = tf.ones(shape=[100, 784], dtype="float32") + model(x) + self.assertEqual(len(model.get_updates_for(x)), 2) + self.assertEqual(len(model.get_losses_for(x)), 1) + + # Case 3: functional-API model nested in subclass. + with tf.compat.v1.get_default_graph().as_default(): + inputs = keras.Input((10,)) + outputs = keras.layers.BatchNormalization(axis=1)(inputs) + bn = keras.Model(inputs, outputs) + + class TestModel3(keras.Model): + def __init__(self): + super().__init__() + self.fc = keras.layers.Dense( + 10, input_shape=(784,), activity_regularizer="l1" + ) + self.bn = bn + + def call(self, x): + return self.bn(self.fc(x)) + + with self.cached_session(): + model = TestModel3() + + x = tf.ones(shape=[100, 784], dtype="float32") + model(x) + self.assertEqual(len(model.get_updates_for(x)), 2) + self.assertEqual(len(model.get_losses_for(x)), 1) + + def test_multi_io_workflow_with_numpy_arrays_and_custom_placeholders(self): + num_classes = (2, 3) + num_samples = 1000 + input_dim = 50 + + with tf.Graph().as_default(), self.cached_session(): + model = model_util.get_multi_io_subclass_model( + num_classes=num_classes, use_dp=True, use_bn=True + ) + model.compile(loss="mse", optimizer="rmsprop") + + x1 = np.ones((num_samples, input_dim)) + x2 = np.ones((num_samples, input_dim)) + y1 = np.zeros((num_samples, num_classes[0])) + y2 = np.zeros((num_samples, num_classes[1])) + + x2_placeholder = tf.compat.v1.placeholder( + dtype="float32", shape=(None, input_dim) + ) + model._set_inputs([x1, x2_placeholder]) + + model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) + _ = model.evaluate([x1, x2], [y1, y2], verbose=0) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class CustomCallSignatureTests(tf.test.TestCase, parameterized.TestCase): - - def test_no_inputs_in_signature(self): - model = model_util.CustomCallModel() - first = tf.ones([2, 3]) - second = tf.ones([2, 5]) - output = model(first, second) - self.evaluate([v.initializer for v in model.variables]) - expected_output = self.evaluate(model.dense1(first) + model.dense2(second)) - self.assertAllClose(expected_output, self.evaluate(output)) - output = model(first, second, fiddle_with_output='yes') - self.assertAllClose(10. 
* expected_output, self.evaluate(output)) - output = model(first, second=second, training=False) - self.assertAllClose(expected_output, self.evaluate(output)) - - def test_training_args_call_build(self): - input_dim = 2 - - model = model_util.TrainingNoDefaultModel() - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - model.build((None, input_dim)) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - - def test_training_and_mask_args_call_build(self): - input_dim = 2 - - model = model_util.TrainingMaskingModel() - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - model.build((None, input_dim)) - self.assertTrue(model.weights, ('Model should have weights now that it ' - 'has been properly built.')) - self.assertTrue(model.built, 'Model should be built after calling `build`.') - - def test_custom_call_kwargs_and_build(self): - first_input_shape = (2, 3) - second_input_shape = (2, 5) - - model = model_util.CustomCallModel() - self.assertFalse(model.built, 'Model should not have been built') - self.assertFalse(model.weights, ('Model should have no weights since it ' - 'has not been built.')) - with self.assertRaisesRegex(ValueError, - 'cannot build your model if it has positional'): - model.build(input_shape=[first_input_shape, second_input_shape]) - - def test_kwargs_in_signature(self): - - class HasKwargs(keras.Model): - - def call(self, x, y=3, **kwargs): - return x - - model = HasKwargs() - arg = tf.ones([1]) - model(arg, a=3) - if not tf.executing_eagerly(): - self.assertLen(model.inputs, 1) - - @tf_test_utils.assert_no_new_tensors - @tf_test_utils.assert_no_garbage_created - def test_training_no_default(self): - if not tf.executing_eagerly(): - return - model = model_util.TrainingNoDefaultModel() - arg = tf.ones([1, 1]) - model(arg, True) - - def test_positional_arg_in_call(self): - - class ModelWithPositionalArgs(keras.Model): - - def call(self, x, x2, x3=None): - return x + x2 - - x = np.ones((10, 1)) - y = np.ones((10, 1)) - m = ModelWithPositionalArgs() - m.compile('sgd', 'mse') - with self.assertRaisesRegex(ValueError, r'Models passed to `fit`'): - m.fit(x, y, batch_size=2) - with self.assertRaisesRegex(ValueError, r'Models passed to `evaluate`'): - m.evaluate(x, y, batch_size=2) - with self.assertRaisesRegex(ValueError, r'Models passed to `predict`'): - m.predict(x, batch_size=2) - with self.assertRaisesRegex(ValueError, - r'Models passed to `train_on_batch`'): - m.train_on_batch(x, y) - with self.assertRaisesRegex(ValueError, - r'Models passed to `test_on_batch`'): - m.test_on_batch(x, y) - with self.assertRaisesRegex(ValueError, - r'Models passed to `predict_on_batch`'): - m.predict_on_batch(x) - - def test_deepcopy(self): - if not tf.executing_eagerly(): - self.skipTest('Run in eager mode only.') - - class MyModel(keras.Model): - - def __init__(self): - super().__init__() - self.my_variable = tf.Variable(0.0, trainable=False) - self.layer = keras.layers.Dense(4) - - def call(self, obs): - return self.layer(obs) - - model = MyModel() - model.my_variable.assign_add(1.0) - - new_model = copy.deepcopy(model) - self.assertEqual(model.my_variable.numpy(), 1.0) - self.assertEqual(new_model.my_variable.numpy(), 1.0) - - 
model.my_variable.assign_add(1.0) - self.assertEqual(model.my_variable.numpy(), 2.0) - self.assertEqual(new_model.my_variable.numpy(), 1.0) - - # Check that Trackable logic still works. - self.assertLen(new_model.variables, 1) - self.assertLen(new_model.layers, 1) - - def test_batch_counters_not_in_variables(self): - - class MyModel(keras.Model): - - def __init__(self): - super().__init__() - self.layer = keras.layers.Dense(4) - - def call(self, obs): - return self.layer(obs) - - model = MyModel() - model(np.ones((10, 10))) - self.assertLen(model.variables, 2) - - -if __name__ == '__main__': - tf.test.main() + def test_no_inputs_in_signature(self): + model = model_util.CustomCallModel() + first = tf.ones([2, 3]) + second = tf.ones([2, 5]) + output = model(first, second) + self.evaluate([v.initializer for v in model.variables]) + expected_output = self.evaluate( + model.dense1(first) + model.dense2(second) + ) + self.assertAllClose(expected_output, self.evaluate(output)) + output = model(first, second, fiddle_with_output="yes") + self.assertAllClose(10.0 * expected_output, self.evaluate(output)) + output = model(first, second=second, training=False) + self.assertAllClose(expected_output, self.evaluate(output)) + + def test_training_args_call_build(self): + input_dim = 2 + + model = model_util.TrainingNoDefaultModel() + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + model.build((None, input_dim)) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." + ) + + def test_training_and_mask_args_call_build(self): + input_dim = 2 + + model = model_util.TrainingMaskingModel() + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + model.build((None, input_dim)) + self.assertTrue( + model.weights, + "Model should have weights now that it has been properly built.", + ) + self.assertTrue( + model.built, "Model should be built after calling `build`." 
+ ) + + def test_custom_call_kwargs_and_build(self): + first_input_shape = (2, 3) + second_input_shape = (2, 5) + + model = model_util.CustomCallModel() + self.assertFalse(model.built, "Model should not have been built") + self.assertFalse( + model.weights, + "Model should have no weights since it has not been built.", + ) + with self.assertRaisesRegex( + ValueError, "cannot build your model if it has positional" + ): + model.build(input_shape=[first_input_shape, second_input_shape]) + + def test_kwargs_in_signature(self): + class HasKwargs(keras.Model): + def call(self, x, y=3, **kwargs): + return x + + model = HasKwargs() + arg = tf.ones([1]) + model(arg, a=3) + if not tf.executing_eagerly(): + self.assertLen(model.inputs, 1) + + @tf_test_utils.assert_no_new_tensors + @tf_test_utils.assert_no_garbage_created + def test_training_no_default(self): + if not tf.executing_eagerly(): + return + model = model_util.TrainingNoDefaultModel() + arg = tf.ones([1, 1]) + model(arg, True) + + def test_positional_arg_in_call(self): + class ModelWithPositionalArgs(keras.Model): + def call(self, x, x2, x3=None): + return x + x2 + + x = np.ones((10, 1)) + y = np.ones((10, 1)) + m = ModelWithPositionalArgs() + m.compile("sgd", "mse") + with self.assertRaisesRegex(ValueError, r"Models passed to `fit`"): + m.fit(x, y, batch_size=2) + with self.assertRaisesRegex(ValueError, r"Models passed to `evaluate`"): + m.evaluate(x, y, batch_size=2) + with self.assertRaisesRegex(ValueError, r"Models passed to `predict`"): + m.predict(x, batch_size=2) + with self.assertRaisesRegex( + ValueError, r"Models passed to `train_on_batch`" + ): + m.train_on_batch(x, y) + with self.assertRaisesRegex( + ValueError, r"Models passed to `test_on_batch`" + ): + m.test_on_batch(x, y) + with self.assertRaisesRegex( + ValueError, r"Models passed to `predict_on_batch`" + ): + m.predict_on_batch(x) + + def test_deepcopy(self): + if not tf.executing_eagerly(): + self.skipTest("Run in eager mode only.") + + class MyModel(keras.Model): + def __init__(self): + super().__init__() + self.my_variable = tf.Variable(0.0, trainable=False) + self.layer = keras.layers.Dense(4) + + def call(self, obs): + return self.layer(obs) + + model = MyModel() + model.my_variable.assign_add(1.0) + + new_model = copy.deepcopy(model) + self.assertEqual(model.my_variable.numpy(), 1.0) + self.assertEqual(new_model.my_variable.numpy(), 1.0) + + model.my_variable.assign_add(1.0) + self.assertEqual(model.my_variable.numpy(), 2.0) + self.assertEqual(new_model.my_variable.numpy(), 1.0) + + # Check that Trackable logic still works. 
+ self.assertLen(new_model.variables, 1) + self.assertLen(new_model.layers, 1) + + def test_batch_counters_not_in_variables(self): + class MyModel(keras.Model): + def __init__(self): + super().__init__() + self.layer = keras.layers.Dense(4) + + def call(self, obs): + return self.layer(obs) + + model = MyModel() + model(np.ones((10, 10))) + self.assertLen(model.variables, 2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/model_subclassing_test_util.py b/keras/tests/model_subclassing_test_util.py index 2fd2dcf073e1..5d06f6c4540a 100644 --- a/keras/tests/model_subclassing_test_util.py +++ b/keras/tests/model_subclassing_test_util.py @@ -18,147 +18,141 @@ from keras.testing_infra import test_utils -# pylint: disable=missing-docstring,not-callable class SimpleConvTestModel(keras.Model): + def __init__(self, num_classes=10): + super().__init__(name="test_model") + self.num_classes = num_classes - def __init__(self, num_classes=10): - super().__init__(name='test_model') - self.num_classes = num_classes + self.conv1 = keras.layers.Conv2D(32, (3, 3), activation="relu") + self.flatten = keras.layers.Flatten() + self.dense1 = keras.layers.Dense(num_classes, activation="softmax") - self.conv1 = keras.layers.Conv2D(32, (3, 3), activation='relu') - self.flatten = keras.layers.Flatten() - self.dense1 = keras.layers.Dense(num_classes, activation='softmax') - - def call(self, x): - x = self.conv1(x) - x = self.flatten(x) - return self.dense1(x) + def call(self, x): + x = self.conv1(x) + x = self.flatten(x) + return self.dense1(x) def get_multi_io_subclass_model(use_bn=False, use_dp=False, num_classes=(2, 3)): - """Creates MultiIOModel for the tests of subclass model.""" - shared_layer = keras.layers.Dense(32, activation='relu') - branch_a = [shared_layer] - if use_dp: - branch_a.append(keras.layers.Dropout(0.5)) - branch_a.append(keras.layers.Dense(num_classes[0], activation='softmax')) + """Creates MultiIOModel for the tests of subclass model.""" + shared_layer = keras.layers.Dense(32, activation="relu") + branch_a = [shared_layer] + if use_dp: + branch_a.append(keras.layers.Dropout(0.5)) + branch_a.append(keras.layers.Dense(num_classes[0], activation="softmax")) - branch_b = [shared_layer] - if use_bn: - branch_b.append(keras.layers.BatchNormalization()) - branch_b.append(keras.layers.Dense(num_classes[1], activation='softmax')) + branch_b = [shared_layer] + if use_bn: + branch_b.append(keras.layers.BatchNormalization()) + branch_b.append(keras.layers.Dense(num_classes[1], activation="softmax")) - model = ( - test_utils._MultiIOSubclassModel( # pylint: disable=protected-access - branch_a, branch_b, name='test_model')) - return model + model = test_utils._MultiIOSubclassModel( + branch_a, branch_b, name="test_model" + ) + return model class NestedTestModel1(keras.Model): - """A model subclass nested inside a model subclass. 
- """ - - def __init__(self, num_classes=2): - super().__init__(name='nested_model_1') - self.num_classes = num_classes - self.dense1 = keras.layers.Dense(32, activation='relu') - self.dense2 = keras.layers.Dense(num_classes, activation='relu') - self.bn = keras.layers.BatchNormalization() - self.test_net = test_utils.SmallSubclassMLP( - num_hidden=32, num_classes=4, use_bn=True, use_dp=True) + """A model subclass nested inside a model subclass.""" + + def __init__(self, num_classes=2): + super().__init__(name="nested_model_1") + self.num_classes = num_classes + self.dense1 = keras.layers.Dense(32, activation="relu") + self.dense2 = keras.layers.Dense(num_classes, activation="relu") + self.bn = keras.layers.BatchNormalization() + self.test_net = test_utils.SmallSubclassMLP( + num_hidden=32, num_classes=4, use_bn=True, use_dp=True + ) - def call(self, inputs): - x = self.dense1(inputs) - x = self.bn(x) - x = self.test_net(x) - return self.dense2(x) + def call(self, inputs): + x = self.dense1(inputs) + x = self.bn(x) + x = self.test_net(x) + return self.dense2(x) class NestedTestModel2(keras.Model): - """A model subclass with a functional-API graph network inside. - """ - - def __init__(self, num_classes=2): - super().__init__(name='nested_model_2') - self.num_classes = num_classes - self.dense1 = keras.layers.Dense(32, activation='relu') - self.dense2 = keras.layers.Dense(num_classes, activation='relu') - self.bn = self.bn = keras.layers.BatchNormalization() - self.test_net = self.get_functional_graph_model(32, 4) - - @staticmethod - def get_functional_graph_model(input_dim, num_classes): - # A simple functional-API model (a.k.a. graph network) - inputs = keras.Input(shape=(input_dim,)) - x = keras.layers.Dense(32, activation='relu')(inputs) - x = keras.layers.BatchNormalization()(x) - outputs = keras.layers.Dense(num_classes)(x) - return keras.Model(inputs, outputs) + """A model subclass with a functional-API graph network inside.""" + + def __init__(self, num_classes=2): + super().__init__(name="nested_model_2") + self.num_classes = num_classes + self.dense1 = keras.layers.Dense(32, activation="relu") + self.dense2 = keras.layers.Dense(num_classes, activation="relu") + self.bn = self.bn = keras.layers.BatchNormalization() + self.test_net = self.get_functional_graph_model(32, 4) + + @staticmethod + def get_functional_graph_model(input_dim, num_classes): + # A simple functional-API model (a.k.a. graph network) + inputs = keras.Input(shape=(input_dim,)) + x = keras.layers.Dense(32, activation="relu")(inputs) + x = keras.layers.BatchNormalization()(x) + outputs = keras.layers.Dense(num_classes)(x) + return keras.Model(inputs, outputs) - def call(self, inputs): - x = self.dense1(inputs) - x = self.bn(x) - x = self.test_net(x) - return self.dense2(x) + def call(self, inputs): + x = self.dense1(inputs) + x = self.bn(x) + x = self.test_net(x) + return self.dense2(x) def get_nested_model_3(input_dim, num_classes): - # A functional-API model with a subclassed model inside. - # NOTE: this requires the inner subclass to implement `compute_output_shape`. + # A functional-API model with a subclassed model inside. + # NOTE: this requires the inner subclass to implement + # `compute_output_shape`. 
- inputs = keras.Input(shape=(input_dim,)) - x = keras.layers.Dense(32, activation='relu')(inputs) - x = keras.layers.BatchNormalization()(x) + inputs = keras.Input(shape=(input_dim,)) + x = keras.layers.Dense(32, activation="relu")(inputs) + x = keras.layers.BatchNormalization()(x) - class Inner(keras.Model): + class Inner(keras.Model): + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(32, activation="relu") + self.dense2 = keras.layers.Dense(5, activation="relu") + self.bn = keras.layers.BatchNormalization() - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(32, activation='relu') - self.dense2 = keras.layers.Dense(5, activation='relu') - self.bn = keras.layers.BatchNormalization() + def call(self, inputs): + x = self.dense1(inputs) + x = self.dense2(x) + return self.bn(x) - def call(self, inputs): - x = self.dense1(inputs) - x = self.dense2(x) - return self.bn(x) - - test_model = Inner() - x = test_model(x) - outputs = keras.layers.Dense(num_classes)(x) - return keras.Model(inputs, outputs, name='nested_model_3') + test_model = Inner() + x = test_model(x) + outputs = keras.layers.Dense(num_classes)(x) + return keras.Model(inputs, outputs, name="nested_model_3") class CustomCallModel(keras.Model): + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(1, activation="relu") + self.dense2 = keras.layers.Dense(1, activation="softmax") - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(1, activation='relu') - self.dense2 = keras.layers.Dense(1, activation='softmax') - - def call(self, first, second, fiddle_with_output='no', training=True): - combined = self.dense1(first) + self.dense2(second) - if fiddle_with_output == 'yes': - return 10. * combined - else: - return combined + def call(self, first, second, fiddle_with_output="no", training=True): + combined = self.dense1(first) + self.dense2(second) + if fiddle_with_output == "yes": + return 10.0 * combined + else: + return combined class TrainingNoDefaultModel(keras.Model): + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(1) - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(1) - - def call(self, x, training): - return self.dense1(x) + def call(self, x, training): + return self.dense1(x) class TrainingMaskingModel(keras.Model): + def __init__(self): + super().__init__() + self.dense1 = keras.layers.Dense(1) - def __init__(self): - super().__init__() - self.dense1 = keras.layers.Dense(1) - - def call(self, x, training=False, mask=None): - return self.dense1(x) + def call(self, x, training=False, mask=None): + return self.dense1(x) diff --git a/keras/tests/saved_model_test.py b/keras/tests/saved_model_test.py index f20a34c8b46a..dd80c7d007c0 100644 --- a/keras/tests/saved_model_test.py +++ b/keras/tests/saved_model_test.py @@ -14,47 +14,52 @@ # ============================================================================== """Tests for trackable object SavedModel save.""" +import os + import tensorflow.compat.v2 as tf -import os -from tensorflow.python.framework import test_util as tf_test_utils from keras.layers import core -from keras.optimizers.optimizer_v2 import adam +from keras.optimizers.legacy import adam +# isort: off +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) -class _ModelWithOptimizerUsingDefun(tf.train.Checkpoint): - def __init__(self): - self.dense = core.Dense(1) - self.optimizer = adam.Adam(0.01) +class 
_ModelWithOptimizerUsingDefun(tf.train.Checkpoint): + def __init__(self): + self.dense = core.Dense(1) + self.optimizer = adam.Adam(0.01) - @tf.function( - input_signature=(tf.TensorSpec([None, 2], tf.float32), - tf.TensorSpec([None], tf.float32)), - ) - def call(self, x, y): - with tf.GradientTape() as tape: - loss = tf.reduce_mean((self.dense(x) - y) ** 2.) - trainable_variables = self.dense.trainable_variables - gradients = tape.gradient(loss, trainable_variables) - self.optimizer.apply_gradients(zip(gradients, trainable_variables)) - return {"loss": loss} + @tf.function( + input_signature=( + tf.TensorSpec([None, 2], tf.float32), + tf.TensorSpec([None], tf.float32), + ), + ) + def call(self, x, y): + with tf.GradientTape() as tape: + loss = tf.reduce_mean((self.dense(x) - y) ** 2.0) + trainable_variables = self.dense.trainable_variables + gradients = tape.gradient(loss, trainable_variables) + self.optimizer.apply_gradients(zip(gradients, trainable_variables)) + return {"loss": loss} class MemoryTests(tf.test.TestCase): + def setUp(self): + super().setUp() + self._model = _ModelWithOptimizerUsingDefun() - def setUp(self): - super().setUp() - self._model = _ModelWithOptimizerUsingDefun() - - @tf_test_utils.assert_no_garbage_created - def DISABLED_test_no_reference_cycles(self): - x = tf.constant([[3., 4.]]) - y = tf.constant([2.]) - self._model.call(x, y) - save_dir = os.path.join(self.get_temp_dir(), "saved_model") - tf.saved_model.save(self._model, save_dir, self._model.call) + @tf_test_utils.assert_no_garbage_created + def DISABLED_test_no_reference_cycles(self): + x = tf.constant([[3.0, 4.0]]) + y = tf.constant([2.0]) + self._model.call(x, y) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + tf.saved_model.save(self._model, save_dir, self._model.call) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/tests/saver_test.py b/keras/tests/saver_test.py index 922662553c05..bed83b35bdcb 100644 --- a/keras/tests/saver_test.py +++ b/keras/tests/saver_test.py @@ -14,132 +14,151 @@ # ============================================================================= """Tests for tensorflow.python.training.saver.py.""" -import tensorflow.compat.v2 as tf - import functools import os + +import tensorflow.compat.v2 as tf + from keras.engine import training from keras.layers import core -from tensorflow.python.training.tracking import util as trackable_utils +# isort: off +from tensorflow.python.checkpoint import ( + checkpoint as trackable_utils, +) -class NonLayerTrackable(tf.Module): - def __init__(self): - super().__init__() - self.a_variable = trackable_utils.add_variable( - self, name="a_variable", shape=[]) +class NonLayerTrackable(tf.Module): + def __init__(self): + super().__init__() + self.a_variable = trackable_utils.add_variable( + self, name="a_variable", shape=[] + ) class MyModel(training.Model): - """A concrete Model for testing.""" + """A concrete Model for testing.""" - def __init__(self): - super().__init__() - self._named_dense = core.Dense(1, use_bias=True) - self._second = core.Dense(1, use_bias=False) - # We can still track Trackables which aren't Layers. - self._non_layer = NonLayerTrackable() + def __init__(self): + super().__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + # We can still track Trackables which aren't Layers. 
+ self._non_layer = NonLayerTrackable() - def call(self, values): - ret = self._second(self._named_dense(values)) - return ret + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret class TrackableCompatibilityTests(tf.test.TestCase): - - def _initialized_model(self): - input_value = tf.constant([[3.]]) - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - optimizer_step = tf.compat.v1.train.get_or_create_global_step() - root_trackable = tf.train.Checkpoint( - optimizer=optimizer, model=model, optimizer_step=optimizer_step) - train_op = optimizer.minimize( - functools.partial(model, input_value), - global_step=optimizer_step) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - # A regular variable, a slot variable, and a non-slot Optimizer variable - # with known values to check when loading. - self.evaluate(model._named_dense.bias.assign([1.])) - self.evaluate(optimizer.get_slot( - var=model._named_dense.bias, name="m").assign([2.])) - beta1_power, _ = optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(3.)) - return root_trackable - - def _set_sentinels(self, root_trackable): - self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) - self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m") - .assign([102.])) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(103.)) - - def _check_sentinels(self, root_trackable): - self.assertAllEqual( - [1.], self.evaluate(root_trackable.model._named_dense.bias)) - self.assertAllEqual([2.], self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m"))) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.assertAllEqual(3., self.evaluate(beta1_power)) - - def testLoadFromObjectBasedGraph(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - save_graph = tf.Graph() - with save_graph.as_default(), self.session(graph=save_graph) as sess: - root = self._initialized_model() - object_saver = tf.train.Checkpoint(root=root) - save_path = object_saver.save(file_prefix=checkpoint_prefix) - - # An incompatible object-based checkpoint to check error messages - var = tf.Variable(1., name="a") - self.evaluate(var.initializer) - second_saver = tf.train.Checkpoint(v=var) - second_path = second_saver.save(file_prefix=os.path.join( - checkpoint_directory, "second")) - - restore_graph = tf.Graph() - with restore_graph.as_default(), self.session( - graph=restore_graph) as sess: - root = self._initialized_model() - self._set_sentinels(root) - saver = tf.compat.v1.train.Saver() - saver.restore(sess=sess, save_path=save_path) - self._check_sentinels(root) - before_second_restore_ops = restore_graph.get_operations() - # Test that multiple restores do not pollute the graph - saver.restore(sess=sess, save_path=save_path) - self.assertEqual(before_second_restore_ops, - restore_graph.get_operations()) - with self.assertRaisesRegex(tf.errors.NotFoundError, - "Could not find some variables"): - saver.restore(sess=sess, save_path=second_path) - - def testLoadFromObjectBasedEager(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - save_graph = tf.Graph() - with save_graph.as_default(), self.session(graph=save_graph): - root = self._initialized_model() 
- object_saver = tf.train.Checkpoint(root=root) - save_path = object_saver.save(file_prefix=checkpoint_prefix) - - with tf.__internal__.eager_context.eager_mode(): - root = self._initialized_model() - self._set_sentinels(root) - saver = tf.compat.v1.train.Saver( - root.model.variables + root.optimizer.variables()) - saver.restore(sess=None, save_path=save_path) - self._check_sentinels(root) + def _initialized_model(self): + input_value = tf.constant([[3.0]]) + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + optimizer_step = tf.compat.v1.train.get_or_create_global_step() + root_trackable = tf.train.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step + ) + train_op = optimizer.minimize( + functools.partial(model, input_value), global_step=optimizer_step + ) + self.evaluate(trackable_utils.gather_initializers(root_trackable)) + self.evaluate(train_op) + # A regular variable, a slot variable, and a non-slot Optimizer variable + # with known values to check when loading. + self.evaluate(model._named_dense.bias.assign([1.0])) + self.evaluate( + optimizer.get_slot(var=model._named_dense.bias, name="m").assign( + [2.0] + ) + ) + beta1_power, _ = optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(3.0)) + return root_trackable + + def _set_sentinels(self, root_trackable): + self.evaluate(root_trackable.model._named_dense.bias.assign([101.0])) + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m" + ).assign([102.0]) + ) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(103.0)) + + def _check_sentinels(self, root_trackable): + self.assertAllEqual( + [1.0], self.evaluate(root_trackable.model._named_dense.bias) + ) + self.assertAllEqual( + [2.0], + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m" + ) + ), + ) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.assertAllEqual(3.0, self.evaluate(beta1_power)) + + def testLoadFromObjectBasedGraph(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + save_graph = tf.Graph() + with save_graph.as_default(), self.session(graph=save_graph) as sess: + root = self._initialized_model() + object_saver = tf.train.Checkpoint(root=root) + save_path = object_saver.save(file_prefix=checkpoint_prefix) + + # An incompatible object-based checkpoint to check error messages + var = tf.Variable(1.0, name="a") + self.evaluate(var.initializer) + second_saver = tf.train.Checkpoint(v=var) + second_path = second_saver.save( + file_prefix=os.path.join(checkpoint_directory, "second") + ) + + restore_graph = tf.Graph() + with restore_graph.as_default(), self.session( + graph=restore_graph + ) as sess: + root = self._initialized_model() + self._set_sentinels(root) + saver = tf.compat.v1.train.Saver() + saver.restore(sess=sess, save_path=save_path) + self._check_sentinels(root) + before_second_restore_ops = restore_graph.get_operations() + # Test that multiple restores do not pollute the graph + saver.restore(sess=sess, save_path=save_path) + self.assertEqual( + before_second_restore_ops, restore_graph.get_operations() + ) + with self.assertRaisesRegex( + tf.errors.NotFoundError, "Could not find some variables" + ): + saver.restore(sess=sess, save_path=second_path) + + def testLoadFromObjectBasedEager(self): + checkpoint_directory = self.get_temp_dir() + 
checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + save_graph = tf.Graph() + with save_graph.as_default(), self.session(graph=save_graph): + root = self._initialized_model() + object_saver = tf.train.Checkpoint(root=root) + save_path = object_saver.save(file_prefix=checkpoint_prefix) + + with tf.__internal__.eager_context.eager_mode(): + root = self._initialized_model() + self._set_sentinels(root) + saver = tf.compat.v1.train.Saver( + root.model.variables + root.optimizer.variables() + ) + saver.restore(sess=None, save_path=save_path) + self._check_sentinels(root) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/tests/serialization_util_test.py b/keras/tests/serialization_util_test.py index a50373f34c4f..71652e63e5db 100644 --- a/keras/tests/serialization_util_test.py +++ b/keras/tests/serialization_util_test.py @@ -14,48 +14,55 @@ # ============================================================================== """Tests for serialization functions.""" +import json + import tensorflow.compat.v2 as tf -import json -from keras.testing_infra import test_combinations from keras.engine import input_layer from keras.engine import sequential from keras.engine import training from keras.layers import core -from keras.saving.saved_model import json_utils +from keras.saving.legacy.saved_model import json_utils +from keras.testing_infra import test_combinations @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class SerializationTests(test_combinations.TestCase): + def test_serialize_dense(self): + dense = core.Dense(3) + dense(tf.constant([[4.0]])) + round_trip = json.loads( + json.dumps(dense, default=json_utils.get_json_type) + ) + self.assertEqual(3, round_trip["config"]["units"]) + + def test_serialize_sequential(self): + model = sequential.Sequential() + model.add(core.Dense(4)) + model.add(core.Dense(5)) + model(tf.constant([[1.0]])) + sequential_round_trip = json.loads( + json.dumps(model, default=json_utils.get_json_type) + ) + self.assertEqual( + # Note that `config['layers'][0]` will be an InputLayer in V2 + # (but not in V1) + 5, + sequential_round_trip["config"]["layers"][-1]["config"]["units"], + ) + + def test_serialize_model(self): + x = input_layer.Input(shape=[3]) + y = core.Dense(10)(x) + model = training.Model(x, y) + model(tf.constant([[1.0, 1.0, 1.0]])) + model_round_trip = json.loads( + json.dumps(model, default=json_utils.get_json_type) + ) + self.assertEqual( + 10, model_round_trip["config"]["layers"][1]["config"]["units"] + ) - def test_serialize_dense(self): - dense = core.Dense(3) - dense(tf.constant([[4.]])) - round_trip = json.loads(json.dumps( - dense, default=json_utils.get_json_type)) - self.assertEqual(3, round_trip["config"]["units"]) - - def test_serialize_sequential(self): - model = sequential.Sequential() - model.add(core.Dense(4)) - model.add(core.Dense(5)) - model(tf.constant([[1.]])) - sequential_round_trip = json.loads( - json.dumps(model, default=json_utils.get_json_type)) - self.assertEqual( - # Note that `config['layers'][0]` will be an InputLayer in V2 - # (but not in V1) - 5, sequential_round_trip["config"]["layers"][-1]["config"]["units"]) - - def test_serialize_model(self): - x = input_layer.Input(shape=[3]) - y = core.Dense(10)(x) - model = training.Model(x, y) - model(tf.constant([[1., 1., 1.]])) - model_round_trip = json.loads( - json.dumps(model, default=json_utils.get_json_type)) - self.assertEqual( - 10, model_round_trip["config"]["layers"][1]["config"]["units"]) if 
__name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/tests/temporal_sample_weights_correctness_test.py b/keras/tests/temporal_sample_weights_correctness_test.py index c5d758766b4d..f6efd8117c2d 100644 --- a/keras/tests/temporal_sample_weights_correctness_test.py +++ b/keras/tests/temporal_sample_weights_correctness_test.py @@ -14,502 +14,582 @@ # ============================================================================== """Tests temporal sample weights correctness using Keras model.""" -import tensorflow.compat.v2 as tf - import numpy as np +import tensorflow.compat.v2 as tf -from keras.testing_infra import test_combinations from keras import layers from keras import metrics -from keras.optimizers import optimizer_v2 +from keras.optimizers import legacy as optimizer_legacy +from keras.testing_infra import test_combinations from keras.testing_infra import test_utils class Bias(layers.Layer): - """Layer that add a bias to its inputs.""" + """Layer that add a bias to its inputs.""" - def build(self, input_shape): - self.bias = self.add_weight('bias', (1,), initializer='zeros') + def build(self, input_shape): + self.bias = self.add_weight("bias", (1,), initializer="zeros") - def call(self, inputs): - return inputs + self.bias + def call(self, inputs): + return inputs + self.bias - def compute_output_shape(self, input_shape): - return input_shape + def compute_output_shape(self, input_shape): + return input_shape def get_multi_io_temporal_model(): - timesteps = 2 - inp_1 = layers.Input(shape=(1,), name='input_1') - inp_2 = layers.Input(shape=(1,), name='input_2') - x = layers.RepeatVector(timesteps) - out_1 = layers.TimeDistributed(Bias(), name='output_1') - out_2 = layers.TimeDistributed(Bias(), name='output_2') + timesteps = 2 + inp_1 = layers.Input(shape=(1,), name="input_1") + inp_2 = layers.Input(shape=(1,), name="input_2") + x = layers.RepeatVector(timesteps) + out_1 = layers.TimeDistributed(Bias(), name="output_1") + out_2 = layers.TimeDistributed(Bias(), name="output_2") - branch_a = [inp_1, x, out_1] - branch_b = [inp_2, x, out_2] - return test_utils.get_multi_io_model(branch_a, branch_b) + branch_a = [inp_1, x, out_1] + branch_b = [inp_2, x, out_2] + return test_utils.get_multi_io_model(branch_a, branch_b) def get_compiled_multi_io_model_temporal(sample_weight_mode): - model = get_multi_io_temporal_model() - model.compile( - optimizer=optimizer_v2.gradient_descent.SGD(0.1), - loss='mae', - metrics=[metrics.MeanAbsoluteError(name='mae')], - weighted_metrics=[metrics.MeanAbsoluteError(name='mae_2')], - sample_weight_mode=sample_weight_mode, - run_eagerly=test_utils.should_run_eagerly()) - return model + model = get_multi_io_temporal_model() + model.compile( + optimizer=optimizer_legacy.gradient_descent.SGD(0.1), + loss="mae", + metrics=[metrics.MeanAbsoluteError(name="mae")], + weighted_metrics=[metrics.MeanAbsoluteError(name="mae_2")], + sample_weight_mode=sample_weight_mode, + run_eagerly=test_utils.should_run_eagerly(), + ) + return model def run_with_different_sample_weight_mode_inputs(fn, partial_sw=True): - """Executes the given function with different sample weight mode inputs. - - Args: - fn: Training or eval function to execute. - partial_sw: Boolean flag to indicate whether temporal sample weight mode - should be set partially just for one output. 
- """ - model = get_compiled_multi_io_model_temporal(sample_weight_mode='temporal') - fn(model) - - model = get_compiled_multi_io_model_temporal( - sample_weight_mode=['temporal', 'temporal']) - fn(model) - - model = get_compiled_multi_io_model_temporal(sample_weight_mode={ - 'output_1': 'temporal', - 'output_2': 'temporal' - }) - fn(model) - - if partial_sw: - model = get_compiled_multi_io_model_temporal( - sample_weight_mode=[None, 'temporal']) + """Executes the given function with different sample weight mode inputs. + + Args: + fn: Training or eval function to execute. + partial_sw: Boolean flag to indicate whether temporal sample weight mode + should be set partially just for one output. + """ + model = get_compiled_multi_io_model_temporal(sample_weight_mode="temporal") fn(model) - # TODO(b/129700800): Enable after bug is fixed. - # model = get_compiled_multi_io_model_temporal(sample_weight_mode={ - # 'output_2': 'temporal' - # }) - # fn(model) + model = get_compiled_multi_io_model_temporal( + sample_weight_mode=["temporal", "temporal"] + ) + fn(model) + model = get_compiled_multi_io_model_temporal( + sample_weight_mode={"output_1": "temporal", "output_2": "temporal"} + ) + fn(model) -@test_combinations.run_with_all_model_types(exclude_models=['sequential']) -@test_combinations.run_all_keras_modes(always_skip_v1=True) -class TestMetricsCorrectnessMultiIOTemporal(test_combinations.TestCase): + if partial_sw: + model = get_compiled_multi_io_model_temporal( + sample_weight_mode=[None, "temporal"] + ) + fn(model) - def custom_generator_multi_io_temporal(self, sample_weights=None): - """Generator for getting data for temporal multi io model. + # TODO(b/129700800): Enable after bug is fixed. + # model = get_compiled_multi_io_model_temporal(sample_weight_mode={ + # 'output_2': 'temporal' + # }) + # fn(model) - Args: - sample_weights: List of sample_weights. - Yields: - Tuple of inputs, label, sample weights data. 
- """ - batch_size = 3 - num_samples = 3 - iteration = 0 - while True: - batch_index = iteration * batch_size % num_samples - iteration += 1 - start = batch_index - end = start + batch_size - x = [self.x[start:end], self.x[start:end]] - y = [self.y1[start:end], self.y2[start:end]] - if sample_weights: - sw = tf.nest.map_structure(lambda w: w[start:end], sample_weights) - else: - sw = None - yield x, y, sw - - def setUp(self): - super(TestMetricsCorrectnessMultiIOTemporal, self).setUp() - - self.x = np.asarray([[0.], [1.], [2.]]) - self.y1 = np.asarray([[[.5], [1.]], [[2.], [2.5]], [[3.5], [2.5]]]) - self.y2 = np.asarray([[[.5], [1.5]], [[2.], [1.5]], [[3.5], [3.]]]) - - # Without weights: - # Epoch 1 - bias = 0 - # y_pred_1 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] - # y_pred_2 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] - # mae (y1 - y_pred_1) = [[[.5], [1.]], [[1.], [1.5]], [[1.5], [.5]]] - # mae = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 - # mae_2 (y2 - y_pred_2) = [[[.5], [1.5]], [[1.], [.5]], [[1.5], [1.]]] - # mae_2 = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 - - # Epoch 2 - bias = 0.1 (2/2 * 0.1) - # y_pred_1 = [[[.1], [.1]], [[1.1], [1.1]], [[2.1], [2.1]]] - # y_pred_2 = [[[.1], [.1]], [[1.1], [1.1]], [[2.1], [2.1]]] - # mae (y1 - y_pred_1) = [[[.4], [.9]], [[.9], [1.4]], [[1.4], [.4]]] - # mae = [[2.7/3, 2.7/3]] = [[0.9, 0.9]] = 1.8/2 = 0.9 - # mae_2 (y2 - y_pred_2) = [[[.4], [1.4]], [[.9], [.4]], [[1.4], [.9]]] - # mae_2 = [[2.7/3, 2.7/3]] = [[0.9, 0.9]] = 1.8/2 = 0.9 - - self.expected_fit_result = { - 'output_1_mae': [1, 0.9], - 'output_2_mae': [1, 0.9], - 'output_1_mae_2': [1, 0.9], - 'output_2_mae_2': [1, 0.9], - 'loss': [2., 1.8], - 'output_1_loss': [1, 0.9], - 'output_2_loss': [1, 0.9], - } - - self.sample_weight_1 = np.asarray([[.5, 2.], [.5, 2.], [.5, 2.]]) - self.sample_weight_2 = np.asarray([[2., .5], [2., .5], [2., .5]]) - - # With weights: - # Epoch 1 - # y_pred_1 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] - # y_pred_2 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] - # mae (y1 - y_pred_1) = [[[.5], [1.]], [[1.], [1.5]], [[1.5], [.5]]] - # with weights = [[[.5 * .5], [1 * 2]], - # [[1 * .5], [1.5 * 2]], - # [[1.5 * .5], [.5 * 2]]] - # mae (w/o weights) = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 - # mae (weighted mean) = [[1.5/1.5, 6/6]] = [[1, 1]] = 2/2 = 1 - # mae (sum over bs) = [[1.5/3, 6/3]] = [[.5, 2]] = 2.5/2 = 1.25 - - # mae_2 (y2 - y_pred_2) = [[[.5], [1.5]], [[1.], [.5]], [[1.5], [1.]]] - # with weights = [[[.5 * 2], [1.5 * .5]], - # [[1. * 2], [.5 * .5]], - # [[1.5 * 2], [1. 
* .5]]] - # mae_2 (w/o weights) = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 - # mae_2 (weighted mean) = [[6/6, 1.5/1.5]] = [[1, 1]] = 2/2 = 1 - # mae_2 (sum over bs) = [[6/3, 1.5/3]] = [[2, .5]] = 2.5/2 = 1.25 - - # Epoch 2 - bias = 0.125 (2.5/2 * 0.1) - # y_pred_1 = [[[0.125], [0.125]], [[1.125], [1.125]], [[2.125], [2.125]]] - # y_pred_2 = [[[0.125], [0.125]], [[1.125], [1.125]], [[2.125], [2.125]]] - - # mae (y1 - y_pred_1) = [[[.375], [.875]], - # [[.875], [1.375]], - # [[1.375], [.375]]] - # with weights = [[[.375 * .5], [.875 * 2.]], - # [[.875 * .5], [1.375 * 2.]], - # [[1.375 * .5], [.375 * 2.]]] - # mae (w/o weights) = [[2.625/3, 2.625/3]] = (.875+.875)/2 = .875 - # mae (weighted mean) = [[1.3125/1.5, 5.25/6]] = (.875+.875)/2 = .875 - # mae (sum over bs) = [[1.3125/3, 5.25/3]] = (0.4375+1.75)/2 = 1.09375 - - # mae_2 (y2 - y_pred_2) = [[[.375], [1.375]], - # [[.875], [.375]], - # [[1.375], [.875]]] - # with weights = [[[.375 * 2.], [1.375 * .5]], - # [[.875 * 2.], [.375 * .5]], - # [[1.375 * 2.], [.875 * .5]]] - # mae_2 (w/o weights) = [[2.625/3, 2.625/3]] = (.875+.875)/2 = .875 - # mae_2 (weighted mean) = [[5.25/6, 1.3125/1.5]] = (.875+.875)/2 = .875 - # mae_2 (sum over bs) = [[5.25/3, 1.3125/3]] = (1.75+0.4375)/2 = 1.09375 - - self.expected_fit_result_with_weights = { - 'output_1_mae': [1, 0.875], - 'output_2_mae': [1, 0.875], - 'output_1_mae_2': [1, 0.875], - 'output_2_mae_2': [1, 0.875], - 'loss': [2.5, 2.1875], - 'output_1_loss': [1.25, 1.09375], - 'output_2_loss': [1.25, 1.09375], - } - - self.expected_fit_result_with_weights_output_2 = { - 'output_1_mae': [1., 0.9], - 'output_2_mae': [1, 0.875], - 'output_1_mae_2': [1., 0.9], - 'output_2_mae_2': [1., 0.875], - 'loss': [2.25, 1.99375], - 'output_1_loss': [1., 0.9], - 'output_2_loss': [1.25, 1.09375], - } - - # In the order: 'loss', 'output_1_loss', 'output_2_loss', - # 'output_1_mae', 'output_1_mae_2', - # 'output_2_mae', 'output_2_mae_2' - self.expected_batch_result_with_weights = [ - 2.1875, 1.09375, 1.09375, 0.875, 0.875, 0.875, 0.875 - ] - self.expected_batch_result_with_weights_output_2 = [ - 1.99375, 0.9, 1.09375, 0.9, 0.9, 0.875, 0.875 - ] - self.expected_batch_result = [1.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9] - - def test_fit(self): - - def _train_and_assert(model): - history = model.fit([self.x, self.x], [self.y1, self.y2], - batch_size=3, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - run_with_different_sample_weight_mode_inputs(_train_and_assert) - - def test_fit_with_sample_weight(self): - - def _train_and_assert(model): - history = model.fit([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }, - batch_size=3, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - run_with_different_sample_weight_mode_inputs( - _train_and_assert, partial_sw=False) - - def test_fit_with_partial_sample_weight(self): - - def _train_and_assert(model): - history = model.fit([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }, - batch_size=3, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result_with_weights_output_2.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - run_with_different_sample_weight_mode_inputs(_train_and_assert) - - def test_eval(self): - - def _eval_and_assert(model): - 
model.train_on_batch([self.x, self.x], [self.y1, self.y2]) - eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], - batch_size=3) - self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) - - run_with_different_sample_weight_mode_inputs(_eval_and_assert) - - def test_eval_with_sample_weight(self): - - def _eval_and_assert(model): - model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], - batch_size=3, - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(eval_result, self.expected_batch_result_with_weights, - 1e-3) - - run_with_different_sample_weight_mode_inputs( - _eval_and_assert, partial_sw=False) - - def test_eval_with_partial_sample_weight(self): - - def _eval_and_assert(model): - model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }) - eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], - batch_size=3, - sample_weight={ - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(eval_result, - self.expected_batch_result_with_weights_output_2, - 1e-3) - - run_with_different_sample_weight_mode_inputs(_eval_and_assert) - - def test_train_on_batch(self): - - def _train_and_assert(model): - for _ in range(2): - result = model.train_on_batch([self.x, self.x], [self.y1, self.y2]) - self.assertAllClose(result, self.expected_batch_result, 1e-3) - - run_with_different_sample_weight_mode_inputs(_train_and_assert) - - def test_train_on_batch_with_sample_weight(self): - - def _train_and_assert(model): - for _ in range(2): - result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - run_with_different_sample_weight_mode_inputs( - _train_and_assert, partial_sw=False) - - def test_train_on_batch_with_partial_sample_weight(self): - - def _train_and_assert(model): - for _ in range(2): - result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, - self.expected_batch_result_with_weights_output_2, - 1e-3) - - run_with_different_sample_weight_mode_inputs(_train_and_assert) - - def test_test_on_batch(self): - - def _test_and_assert(model): - model.train_on_batch([self.x, self.x], [self.y1, self.y2]) - result = model.test_on_batch([self.x, self.x], [self.y1, self.y2]) - self.assertAllClose(result, self.expected_batch_result, 1e-3) - - run_with_different_sample_weight_mode_inputs(_test_and_assert) - - def test_test_on_batch_with_sample_weight(self): - - def _test_and_assert(model): - model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - result = model.test_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - run_with_different_sample_weight_mode_inputs( - _test_and_assert, partial_sw=False) - - def test_test_on_batch_with_partial_sample_weight(self): - - def _test_and_assert(model): - model.train_on_batch([self.x, self.x], 
[self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }) - result = model.test_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }) - self.assertAllClose(result, - self.expected_batch_result_with_weights_output_2, - 1e-3) - - run_with_different_sample_weight_mode_inputs(_test_and_assert) - - def test_fit_generator(self): - - def _train_and_assert(model): - history = model.fit_generator( - self.custom_generator_multi_io_temporal(), - steps_per_epoch=1, - epochs=2) - for key, value in self.expected_fit_result.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - run_with_different_sample_weight_mode_inputs(_train_and_assert) - - def test_fit_generator_with_sample_weight(self): - - def _train_and_assert(model): - history = model.fit_generator( - self.custom_generator_multi_io_temporal( - sample_weights=[self.sample_weight_1, self.sample_weight_2]), - steps_per_epoch=1, - epochs=2) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - run_with_different_sample_weight_mode_inputs( - _train_and_assert, partial_sw=False) - - def test_fit_generator_with_partial_sample_weight(self): - - def _train_and_assert(model): - history = model.fit_generator( - self.custom_generator_multi_io_temporal( - sample_weights={'output_2': self.sample_weight_2}), - steps_per_epoch=1, - epochs=2) - for key, value in self.expected_fit_result_with_weights_output_2.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - run_with_different_sample_weight_mode_inputs(_train_and_assert) - - def test_eval_generator(self): - - def _test_and_assert(model): - model.train_on_batch([self.x, self.x], [self.y1, self.y2]) - eval_result = model.evaluate_generator( - self.custom_generator_multi_io_temporal(), steps=1) - self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) - - run_with_different_sample_weight_mode_inputs(_test_and_assert) - - def test_eval_generator_with_sample_weight(self): - - def _test_and_assert(model): - model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_1': self.sample_weight_1, - 'output_2': self.sample_weight_2, - }) - eval_result = model.evaluate_generator( - self.custom_generator_multi_io_temporal( - sample_weights=[self.sample_weight_1, self.sample_weight_2]), - steps=2) - self.assertAllClose(eval_result, self.expected_batch_result_with_weights, - 1e-3) - - run_with_different_sample_weight_mode_inputs( - _test_and_assert, partial_sw=False) - - def test_eval_generator_with_partial_sample_weight(self): - - def _test_and_assert(model): - model.train_on_batch([self.x, self.x], [self.y1, self.y2], - sample_weight={ - 'output_2': self.sample_weight_2, - }) - eval_result = model.evaluate_generator( - self.custom_generator_multi_io_temporal( - sample_weights={'output_2': self.sample_weight_2}), - steps=2) - self.assertAllClose(eval_result, - self.expected_batch_result_with_weights_output_2, - 1e-3) - - run_with_different_sample_weight_mode_inputs(_test_and_assert) - - def test_error_on_fit_with_class_weight(self): - - def _train_and_assert(model): - with self.assertRaises(ValueError): - model.fit([self.x, self.x], [self.y1, self.y2], - class_weight={'output_1': { - .5: .5, - 2.: .5, - 3.5: .5 - }}, - batch_size=3, - epochs=2, - shuffle=False) - - run_with_different_sample_weight_mode_inputs(_train_and_assert) - - -if __name__ == '__main__': - tf.test.main() 
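The expected values hard-coded into the setUp comments can be reproduced by hand. A minimal NumPy sketch of the epoch-1 arithmetic for output_1 (assuming the predictions shown in the comments, i.e. a model that is effectively a zero-initialized bias); note also that with batch_size == num_samples == 3, batch_index is always 0, so the generator yields the full dataset on every step:

import numpy as np

# Epoch 1, output_1: |y1 - y_pred| per sample and time step (last dim squeezed).
abs_err = np.array([[0.5, 1.0], [1.0, 1.5], [1.5, 0.5]])
w = np.array([[0.5, 2.0], [0.5, 2.0], [0.5, 2.0]])  # self.sample_weight_1

unweighted_mae = abs_err.mean(axis=0).mean()  # 1.0 -> 'output_1_mae'[0]
# Metric reduction: weighted mean, i.e. divide by the sum of the weights.
weighted_mae = ((abs_err * w).sum(axis=0) / w.sum(axis=0)).mean()  # 1.0
# Loss reduction: sum over batch size, i.e. divide by the batch size.
weighted_loss = ((abs_err * w).sum(axis=0) / len(abs_err)).mean()  # 1.25

The 1.0 / 1.25 split is exactly the "weighted mean" vs. "sum over bs" distinction the comments track, and it is why 'output_1_mae' starts at 1 while 'output_1_loss' starts at 1.25 in expected_fit_result_with_weights.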
+@test_combinations.run_with_all_model_types(exclude_models=["sequential"]) +@test_combinations.run_all_keras_modes(always_skip_v1=True) +class TestMetricsCorrectnessMultiIOTemporal(test_combinations.TestCase): + def custom_generator_multi_io_temporal(self, sample_weights=None): + """Generator for getting data for temporal multi io model. + + Args: + sample_weights: List of sample_weights. + + Yields: + Tuple of inputs, label, sample weights data. + """ + batch_size = 3 + num_samples = 3 + iteration = 0 + while True: + batch_index = iteration * batch_size % num_samples + iteration += 1 + start = batch_index + end = start + batch_size + x = [self.x[start:end], self.x[start:end]] + y = [self.y1[start:end], self.y2[start:end]] + if sample_weights: + sw = tf.nest.map_structure( + lambda w: w[start:end], sample_weights + ) + else: + sw = None + yield x, y, sw + + def setUp(self): + super(TestMetricsCorrectnessMultiIOTemporal, self).setUp() + + self.x = np.asarray([[0.0], [1.0], [2.0]]) + self.y1 = np.asarray([[[0.5], [1.0]], [[2.0], [2.5]], [[3.5], [2.5]]]) + self.y2 = np.asarray([[[0.5], [1.5]], [[2.0], [1.5]], [[3.5], [3.0]]]) + + # Without weights: + # Epoch 1 - bias = 0 + # y_pred_1 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # y_pred_2 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # mae (y1 - y_pred_1) = [[[.5], [1.]], [[1.], [1.5]], [[1.5], [.5]]] + # mae = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + # mae_2 (y2 - y_pred_2) = [[[.5], [1.5]], [[1.], [.5]], [[1.5], [1.]]] + # mae_2 = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + + # Epoch 2 - bias = 0.1 (2/2 * 0.1) + # y_pred_1 = [[[.1], [.1]], [[1.1], [1.1]], [[2.1], [2.1]]] + # y_pred_2 = [[[.1], [.1]], [[1.1], [1.1]], [[2.1], [2.1]]] + # mae (y1 - y_pred_1) = [[[.4], [.9]], [[.9], [1.4]], [[1.4], [.4]]] + # mae = [[2.7/3, 2.7/3]] = [[0.9, 0.9]] = 1.8/2 = 0.9 + # mae_2 (y2 - y_pred_2) = [[[.4], [1.4]], [[.9], [.4]], [[1.4], [.9]]] + # mae_2 = [[2.7/3, 2.7/3]] = [[0.9, 0.9]] = 1.8/2 = + # 0.9 + + self.expected_fit_result = { + "output_1_mae": [1, 0.9], + "output_2_mae": [1, 0.9], + "output_1_mae_2": [1, 0.9], + "output_2_mae_2": [1, 0.9], + "loss": [2.0, 1.8], + "output_1_loss": [1, 0.9], + "output_2_loss": [1, 0.9], + } + + self.sample_weight_1 = np.asarray([[0.5, 2.0], [0.5, 2.0], [0.5, 2.0]]) + self.sample_weight_2 = np.asarray([[2.0, 0.5], [2.0, 0.5], [2.0, 0.5]]) + + # With weights: + # Epoch 1 + # y_pred_1 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # y_pred_2 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # mae (y1 - y_pred_1) = [[[.5], [1.]], [[1.], [1.5]], [[1.5], [.5]]] + # with weights = [[[.5 * .5], [1 * 2]], + # [[1 * .5], [1.5 * 2]], + # [[1.5 * .5], [.5 * 2]]] + # mae (w/o weights) = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + # mae (weighted mean) = [[1.5/1.5, 6/6]] = [[1, 1]] = 2/2 = 1 + # mae (sum over bs) = [[1.5/3, 6/3]] = [[.5, 2]] = 2.5/2 = 1.25 + + # mae_2 (y2 - y_pred_2) = [[[.5], [1.5]], [[1.], [.5]], [[1.5], [1.]]] + # with weights = [[[.5 * 2], [1.5 * .5]], + # [[1. * 2], [.5 * .5]], + # [[1.5 * 2], [1. 
* .5]]] + # mae_2 (w/o weights) = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + # mae_2 (weighted mean) = [[6/6, 1.5/1.5]] = [[1, 1]] = 2/2 = 1 + # mae_2 (sum over bs) = [[6/3, 1.5/3]] = [[2, .5]] = 2.5/2 = 1.25 + + # Epoch 2 - bias = 0.125 (2.5/2 * 0.1) + # y_pred_1 = [[[0.125], [0.125]], [[1.125], [1.125]], [[2.125], + # [2.125]]] + # y_pred_2 = [[[0.125], [0.125]], [[1.125], [1.125]], [[2.125], + # [2.125]]] + + # mae (y1 - y_pred_1) = [[[.375], [.875]], + # [[.875], [1.375]], + # [[1.375], [.375]]] + # with weights = [[[.375 * .5], [.875 * 2.]], + # [[.875 * .5], [1.375 * 2.]], + # [[1.375 * .5], [.375 * 2.]]] + # mae (w/o weights) = [[2.625/3, 2.625/3]] = (.875+.875)/2 = .875 + # mae (weighted mean) = [[1.3125/1.5, 5.25/6]] = (.875+.875)/2 = .875 + # mae (sum over bs) = [[1.3125/3, 5.25/3]] = (0.4375+1.75)/2 = + # 1.09375 + + # mae_2 (y2 - y_pred_2) = [[[.375], [1.375]], + # [[.875], [.375]], + # [[1.375], [.875]]] + # with weights = [[[.375 * 2.], [1.375 * .5]], + # [[.875 * 2.], [.375 * .5]], + # [[1.375 * 2.], [.875 * .5]]] + # mae_2 (w/o weights) = [[2.625/3, 2.625/3]] = (.875+.875)/2 = .875 + # mae_2 (weighted mean) = [[5.25/6, 1.3125/1.5]] = (.875+.875)/2 = + # .875 + # mae_2 (sum over bs) = [[5.25/3, 1.3125/3]] = (1.75+0.4375)/2 = + # 1.09375 + + self.expected_fit_result_with_weights = { + "output_1_mae": [1, 0.875], + "output_2_mae": [1, 0.875], + "output_1_mae_2": [1, 0.875], + "output_2_mae_2": [1, 0.875], + "loss": [2.5, 2.1875], + "output_1_loss": [1.25, 1.09375], + "output_2_loss": [1.25, 1.09375], + } + + self.expected_fit_result_with_weights_output_2 = { + "output_1_mae": [1.0, 0.9], + "output_2_mae": [1, 0.875], + "output_1_mae_2": [1.0, 0.9], + "output_2_mae_2": [1.0, 0.875], + "loss": [2.25, 1.99375], + "output_1_loss": [1.0, 0.9], + "output_2_loss": [1.25, 1.09375], + } + + # In the order: 'loss', 'output_1_loss', 'output_2_loss', + # 'output_1_mae', 'output_1_mae_2', + # 'output_2_mae', 'output_2_mae_2' + self.expected_batch_result_with_weights = [ + 2.1875, + 1.09375, + 1.09375, + 0.875, + 0.875, + 0.875, + 0.875, + ] + self.expected_batch_result_with_weights_output_2 = [ + 1.99375, + 0.9, + 1.09375, + 0.9, + 0.9, + 0.875, + 0.875, + ] + self.expected_batch_result = [1.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9] + + def test_fit(self): + def _train_and_assert(model): + history = model.fit( + [self.x, self.x], + [self.y1, self.y2], + batch_size=3, + epochs=2, + shuffle=False, + ) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_fit_with_sample_weight(self): + def _train_and_assert(model): + history = model.fit( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + batch_size=3, + epochs=2, + shuffle=False, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs( + _train_and_assert, partial_sw=False + ) + + def test_fit_with_partial_sample_weight(self): + def _train_and_assert(model): + history = model.fit( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + batch_size=3, + epochs=2, + shuffle=False, + ) + for ( + key, + value, + ) in self.expected_fit_result_with_weights_output_2.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + 
run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_eval(self): + def _eval_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + eval_result = model.evaluate( + [self.x, self.x], [self.y1, self.y2], batch_size=3 + ) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_eval_and_assert) + + def test_eval_with_sample_weight(self): + def _eval_and_assert(model): + model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + eval_result = model.evaluate( + [self.x, self.x], + [self.y1, self.y2], + batch_size=3, + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + eval_result, self.expected_batch_result_with_weights, 1e-3 + ) + + run_with_different_sample_weight_mode_inputs( + _eval_and_assert, partial_sw=False + ) + + def test_eval_with_partial_sample_weight(self): + def _eval_and_assert(model): + model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + eval_result = model.evaluate( + [self.x, self.x], + [self.y1, self.y2], + batch_size=3, + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + eval_result, + self.expected_batch_result_with_weights_output_2, + 1e-3, + ) + + run_with_different_sample_weight_mode_inputs(_eval_and_assert) + + def test_train_on_batch(self): + def _train_and_assert(model): + for _ in range(2): + result = model.train_on_batch( + [self.x, self.x], [self.y1, self.y2] + ) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_train_on_batch_with_sample_weight(self): + def _train_and_assert(model): + for _ in range(2): + result = model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights, 1e-3 + ) + + run_with_different_sample_weight_mode_inputs( + _train_and_assert, partial_sw=False + ) + + def test_train_on_batch_with_partial_sample_weight(self): + def _train_and_assert(model): + for _ in range(2): + result = model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights_output_2, 1e-3 + ) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_test_on_batch(self): + def _test_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + result = model.test_on_batch([self.x, self.x], [self.y1, self.y2]) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_test_on_batch_with_sample_weight(self): + def _test_and_assert(model): + model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + result = model.test_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights, 1e-3 + ) + + 
run_with_different_sample_weight_mode_inputs( + _test_and_assert, partial_sw=False + ) + + def test_test_on_batch_with_partial_sample_weight(self): + def _test_and_assert(model): + model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + result = model.test_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + self.assertAllClose( + result, self.expected_batch_result_with_weights_output_2, 1e-3 + ) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_fit_generator(self): + def _train_and_assert(model): + history = model.fit_generator( + self.custom_generator_multi_io_temporal(), + steps_per_epoch=1, + epochs=2, + ) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_fit_generator_with_sample_weight(self): + def _train_and_assert(model): + history = model.fit_generator( + self.custom_generator_multi_io_temporal( + sample_weights=[self.sample_weight_1, self.sample_weight_2] + ), + steps_per_epoch=1, + epochs=2, + ) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs( + _train_and_assert, partial_sw=False + ) + + def test_fit_generator_with_partial_sample_weight(self): + def _train_and_assert(model): + history = model.fit_generator( + self.custom_generator_multi_io_temporal( + sample_weights={"output_2": self.sample_weight_2} + ), + steps_per_epoch=1, + epochs=2, + ) + for ( + key, + value, + ) in self.expected_fit_result_with_weights_output_2.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_eval_generator(self): + def _test_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + eval_result = model.evaluate_generator( + self.custom_generator_multi_io_temporal(), steps=1 + ) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_eval_generator_with_sample_weight(self): + def _test_and_assert(model): + model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_1": self.sample_weight_1, + "output_2": self.sample_weight_2, + }, + ) + eval_result = model.evaluate_generator( + self.custom_generator_multi_io_temporal( + sample_weights=[self.sample_weight_1, self.sample_weight_2] + ), + steps=2, + ) + self.assertAllClose( + eval_result, self.expected_batch_result_with_weights, 1e-3 + ) + + run_with_different_sample_weight_mode_inputs( + _test_and_assert, partial_sw=False + ) + + def test_eval_generator_with_partial_sample_weight(self): + def _test_and_assert(model): + model.train_on_batch( + [self.x, self.x], + [self.y1, self.y2], + sample_weight={ + "output_2": self.sample_weight_2, + }, + ) + eval_result = model.evaluate_generator( + self.custom_generator_multi_io_temporal( + sample_weights={"output_2": self.sample_weight_2} + ), + steps=2, + ) + self.assertAllClose( + eval_result, + self.expected_batch_result_with_weights_output_2, + 1e-3, + ) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_error_on_fit_with_class_weight(self): + def _train_and_assert(model): + with self.assertRaises(ValueError): + 
model.fit( + [self.x, self.x], + [self.y1, self.y2], + class_weight={"output_1": {0.5: 0.5, 2.0: 0.5, 3.5: 0.5}}, + batch_size=3, + epochs=2, + shuffle=False, + ) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/tests/tracking_test.py b/keras/tests/tracking_test.py index de6b8ba56512..c8c639dcd360 100644 --- a/keras/tests/tracking_test.py +++ b/keras/tests/tracking_test.py @@ -15,586 +15,625 @@ import os +import numpy import tensorflow.compat.v2 as tf - from absl.testing import parameterized -import numpy -from keras.testing_infra import test_combinations + from keras.engine import sequential from keras.engine import training from keras.layers import core from keras.layers.normalization import batch_normalization_v1 -from tensorflow.python.training.tracking import data_structures -from tensorflow.python.training.tracking import util - - -class HasList(training.Model): - - def __init__(self): - super().__init__() - self.layer_list = tf.__internal__.tracking.wrap([core.Dense(3)]) - self.layer_list.append(core.Dense(4)) - self.layer_list.extend( - [core.Dense(5), - core.Dense(6, kernel_regularizer=tf.reduce_sum)]) - self.layer_list += [ - core.Dense(7, bias_regularizer=tf.reduce_sum), - core.Dense(8) - ] - self.layer_list += ( - tf.__internal__.tracking.wrap([core.Dense(9)]) + - tf.__internal__.tracking.wrap([core.Dense(10)])) - self.layer_list.extend( - tf.__internal__.tracking.wrap( - list([core.Dense(11)]) + [core.Dense(12)])) - self.layers_with_updates = tf.__internal__.tracking.wrap( - [batch_normalization_v1.BatchNormalization()]) - - def call(self, x): - aggregation = 0. - for l in self.layer_list: - x = l(x) - aggregation += tf.reduce_sum(x) - bn, = self.layers_with_updates - return bn(x) / aggregation - - -class ListTests(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testTracking(self): - with self.test_session(): - model = HasList() - output = model(tf.ones([32, 2])) - self.assertAllEqual([32, 12], output.shape) - self.assertEqual(11, len(model.layers)) - self.assertEqual(10, len(model.layer_list.layers)) - self.assertEqual( - len(model.layers), - len(model.layer_list.layers + model.layers_with_updates)) - for index in range(10): - self.assertEqual(3 + index, model.layer_list.layers[index].units) - children = model._trackable_children() - self.assertLen(children, 2) - self.assertIs(model.layer_list, children["layer_list"]) - self.assertIs(model.layers_with_updates, - children["layers_with_updates"]) - self.assertLen( - children["layer_list"]._trackable_children(), 10) - self.evaluate([v.initializer for v in model.variables]) - self.evaluate(model.variables[0].assign([[1., 2., 3.], [4., 5., 6.]])) - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - self.evaluate(model.variables[0].assign(tf.zeros([2, 3]))) - model.load_weights(save_path) - self.assertAllEqual([[1., 2., 3.], [4., 5., 6.]], - self.evaluate(model.variables[0])) - v = tf.Variable(1.) - model.var_list = [v] - self.assertTrue(any(v is t for t in model.variables)) - self.assertTrue(any(v is t for t in model.trainable_variables)) - self.assertFalse(any(v is t for t in model.non_trainable_variables)) - self.assertTrue(any(model.layer_list[0].trainable_weights[0] - is t for t in model.trainable_weights)) - - def testSubModelTracking(self): - model = training.Model() - model.v = tf.Variable(1.) 
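The HasList fixture in this file (removed above in the old style, re-added in black style) exercises Keras's automatic attribute tracking: assigning a list of layers to a Model attribute wraps the list, so layers added later through append/extend are tracked as well. A self-contained sketch of that behavior using the same modules this test imports (Small is a made-up name; runnable under TF2 eager):

import tensorflow.compat.v2 as tf
from keras.engine import training
from keras.layers import core

class Small(training.Model):
    def __init__(self):
        super().__init__()
        self.blocks = [core.Dense(3)]      # the list is auto-wrapped for tracking
        self.blocks.append(core.Dense(4))  # appended layers are tracked too

    def call(self, x):
        for block in self.blocks:
            x = block(x)
        return x

m = Small()
m(tf.ones([1, 2]))
assert len(m.layers) == 2
assert len(m.trainable_weights) == 4  # one kernel and one bias per Dense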
- self.assertIn(model.v, model.trainable_weights) - model2 = training.Model() - model2.m = [model] - self.assertIn(model.v, model2.trainable_weights) - - def testSubSequentialTracking(self): - - class _Subclassed(training.Model): - - def __init__(self, wrapped): - super().__init__() - self._wrapped = wrapped - - def call(self, x): - return self._wrapped(x) - - model = sequential.Sequential() - layer = core.Dense(1) - model.add(layer) - model2 = _Subclassed(model) - model2(tf.ones([1, 2])) - model2.m = [model] - self.assertIn(layer.kernel, model2.trainable_weights) - - def testLayerTrackedThroughSequential(self): - class AttrDict(dict): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.__dict__ = self - - def ffnet(layer_sizes, name): - ff = sequential.Sequential(name=name) - for i, width in enumerate(layer_sizes): - ff.add(core.Dense( - width, - activation=("relu" if i < len(layer_sizes)-1 else None))) - return ff - - class MyModel2(training.Model): - - def __init__(self, config, name="my_model_2"): - super().__init__(name=name) - self._num_tokens = config.num_tokens - - # list of sub-models - self._ffnet = [ffnet(config.module_layers + (self._num_tokens,), "ff")] - - def null_input(self): - return tf.zeros([1, self._num_tokens], dtype=tf.float32) - - def call(self, input_, module_index=None): - return self._ffnet[0](input_) - - m2 = MyModel2(AttrDict( - num_tokens=5, - module_layers=(50, 30))) - - # Construct - m2(m2.null_input()) - self.assertLen(m2.trainable_variables, 6) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testUpdatesForwarded(self): - model = HasList() - model_input = tf.ones([32, 2]) - model(model_input) - if tf.executing_eagerly(): - self.assertEqual(0, len(model.updates)) - else: - self.assertGreater(len(model.layers_with_updates[0].updates), 0) - self.assertEqual(set(model.layers_with_updates[0].updates), - set(model.updates)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testLossesForwarded(self): - model = HasList() - model_input = tf.ones([32, 2]) - model(model_input) - self.assertEqual(2, len(model.losses)) - - def testModelContainersCompareEqual(self): - class HasEqualContainers(training.Model): - - def __init__(self): - super().__init__() - self.l1 = [] - self.l2 = [] - - model = HasEqualContainers() - first_layer = HasEqualContainers() - model.l1.append(first_layer) - second_layer = HasEqualContainers() - model.l2.append(second_layer) - self.assertEqual([first_layer, second_layer], model.layers) +from keras.testing_infra import test_combinations - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testTensorConversion(self): +# isort: off +from tensorflow.python.trackable import data_structures +from tensorflow.python.checkpoint import checkpoint as util - class ListToTensor(training.Model): - def __init__(self): +class HasList(training.Model): + def __init__(self): super().__init__() - self.l = [1., 2., 3.] 
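The new "# isort: off" block above exists because these are TF-internal modules whose paths changed in this migration: data_structures moved from tensorflow.python.training.tracking to tensorflow.python.trackable, and the checkpoint utilities moved to tensorflow.python.checkpoint. For code that must run against both layouts, a hypothetical compatibility shim (not part of this diff) could look like:

# Internal TF paths are not API-stable; prefer public tf.train/tf.Module APIs.
try:
    from tensorflow.python.trackable import data_structures
    from tensorflow.python.checkpoint import checkpoint as util
except ImportError:  # pre-reorganization TensorFlow
    from tensorflow.python.training.tracking import data_structures
    from tensorflow.python.training.tracking import util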
+ self.layer_list = tf.__internal__.tracking.wrap([core.Dense(3)]) + self.layer_list.append(core.Dense(4)) + self.layer_list.extend( + [core.Dense(5), core.Dense(6, kernel_regularizer=tf.reduce_sum)] + ) + self.layer_list += [ + core.Dense(7, bias_regularizer=tf.reduce_sum), + core.Dense(8), + ] + self.layer_list += tf.__internal__.tracking.wrap( + [core.Dense(9)] + ) + tf.__internal__.tracking.wrap([core.Dense(10)]) + self.layer_list.extend( + tf.__internal__.tracking.wrap( + list([core.Dense(11)]) + [core.Dense(12)] + ) + ) + self.layers_with_updates = tf.__internal__.tracking.wrap( + [batch_normalization_v1.BatchNormalization()] + ) + + def call(self, x): + aggregation = 0.0 + for l in self.layer_list: + x = l(x) + aggregation += tf.reduce_sum(x) + (bn,) = self.layers_with_updates + return bn(x) / aggregation - self.assertAllEqual( - [1., 2., 3.], - self.evaluate(tf.constant(ListToTensor().l))) - self.assertAllEqual( - [1., 2., 3.], - self.evaluate(tf.raw_ops.Pack(values=ListToTensor().l))) +class ListTests(test_combinations.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTracking(self): + with self.test_session(): + model = HasList() + output = model(tf.ones([32, 2])) + self.assertAllEqual([32, 12], output.shape) + self.assertEqual(11, len(model.layers)) + self.assertEqual(10, len(model.layer_list.layers)) + self.assertEqual( + len(model.layers), + len(model.layer_list.layers + model.layers_with_updates), + ) + for index in range(10): + self.assertEqual( + 3 + index, model.layer_list.layers[index].units + ) + children = model._trackable_children() + self.assertLen(children, 2) + self.assertIs(model.layer_list, children["layer_list"]) + self.assertIs( + model.layers_with_updates, children["layers_with_updates"] + ) + self.assertLen(children["layer_list"]._trackable_children(), 10) + self.evaluate([v.initializer for v in model.variables]) + self.evaluate( + model.variables[0].assign([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + ) + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + self.evaluate(model.variables[0].assign(tf.zeros([2, 3]))) + model.load_weights(save_path) + self.assertAllEqual( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], + self.evaluate(model.variables[0]), + ) + v = tf.Variable(1.0) + model.var_list = [v] + self.assertTrue(any(v is t for t in model.variables)) + self.assertTrue(any(v is t for t in model.trainable_variables)) + self.assertFalse(any(v is t for t in model.non_trainable_variables)) + self.assertTrue( + any( + model.layer_list[0].trainable_weights[0] is t + for t in model.trainable_weights + ) + ) + + def testSubModelTracking(self): + model = training.Model() + model.v = tf.Variable(1.0) + self.assertIn(model.v, model.trainable_weights) + model2 = training.Model() + model2.m = [model] + self.assertIn(model.v, model2.trainable_weights) + + def testSubSequentialTracking(self): + class _Subclassed(training.Model): + def __init__(self, wrapped): + super().__init__() + self._wrapped = wrapped + + def call(self, x): + return self._wrapped(x) + + model = sequential.Sequential() + layer = core.Dense(1) + model.add(layer) + model2 = _Subclassed(model) + model2(tf.ones([1, 2])) + model2.m = [model] + self.assertIn(layer.kernel, model2.trainable_weights) + + def testLayerTrackedThroughSequential(self): + class AttrDict(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__dict__ = self + + def ffnet(layer_sizes, name): + ff = 
sequential.Sequential(name=name) + for i, width in enumerate(layer_sizes): + ff.add( + core.Dense( + width, + activation=( + "relu" if i < len(layer_sizes) - 1 else None + ), + ) + ) + return ff + + class MyModel2(training.Model): + def __init__(self, config, name="my_model_2"): + super().__init__(name=name) + self._num_tokens = config.num_tokens + + # list of sub-models + self._ffnet = [ + ffnet(config.module_layers + (self._num_tokens,), "ff") + ] + + def null_input(self): + return tf.zeros([1, self._num_tokens], dtype=tf.float32) + + def call(self, input_, module_index=None): + return self._ffnet[0](input_) + + m2 = MyModel2(AttrDict(num_tokens=5, module_layers=(50, 30))) + + # Construct + m2(m2.null_input()) + self.assertLen(m2.trainable_variables, 6) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testUpdatesForwarded(self): + model = HasList() + model_input = tf.ones([32, 2]) + model(model_input) + if tf.executing_eagerly(): + self.assertEqual(0, len(model.updates)) + else: + self.assertGreater(len(model.layers_with_updates[0].updates), 0) + self.assertEqual( + set(model.layers_with_updates[0].updates), set(model.updates) + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testLossesForwarded(self): + model = HasList() + model_input = tf.ones([32, 2]) + model(model_input) + self.assertEqual(2, len(model.losses)) + + def testModelContainersCompareEqual(self): + class HasEqualContainers(training.Model): + def __init__(self): + super().__init__() + self.l1 = [] + self.l2 = [] + + model = HasEqualContainers() + first_layer = HasEqualContainers() + model.l1.append(first_layer) + second_layer = HasEqualContainers() + model.l2.append(second_layer) + self.assertEqual([first_layer, second_layer], model.layers) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTensorConversion(self): + class ListToTensor(training.Model): + def __init__(self): + super().__init__() + self.l = [1.0, 2.0, 3.0] + + self.assertAllEqual( + [1.0, 2.0, 3.0], self.evaluate(tf.constant(ListToTensor().l)) + ) + + self.assertAllEqual( + [1.0, 2.0, 3.0], + self.evaluate(tf.raw_ops.Pack(values=ListToTensor().l)), + ) class ListWrapperTest(tf.test.TestCase): - - def testLayerCollectionWithExternalMutation(self): - l = [] - l_wrapper = tf.__internal__.tracking.wrap(l) - layer = core.Dense(1) - l.append(layer) - self.assertEqual([layer], l_wrapper.layers) + def testLayerCollectionWithExternalMutation(self): + l = [] + l_wrapper = tf.__internal__.tracking.wrap(l) + layer = core.Dense(1) + l.append(layer) + self.assertEqual([layer], l_wrapper.layers) class HasMapping(training.Model): - - def __init__(self): - super().__init__() - self.layer_dict = tf.__internal__.tracking.wrap(dict(output=core.Dense(7))) - self.layer_dict["norm"] = tf.__internal__.tracking.wrap([]) - self.layer_dict["dense"] = tf.__internal__.tracking.wrap([]) - self.layer_dict["dense"].extend( - [core.Dense(5), - core.Dense(6, kernel_regularizer=tf.reduce_sum)]) - self.layer_dict["norm"].append( - batch_normalization_v1.BatchNormalization()) - self.layer_dict["norm"].append( - batch_normalization_v1.BatchNormalization()) - - def call(self, x): - aggregation = 0. 
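The assertLen(m2.trainable_variables, 6) check above follows directly from the configuration: config.module_layers + (num_tokens,) is (50, 30, 5), so ffnet builds three Dense layers, each contributing a kernel and a bias. The same count reproduced with the public Sequential API (sizes copied from the test; this sketch is not part of the diff):

import tensorflow as tf

ff = tf.keras.Sequential([
    tf.keras.layers.Dense(50, activation="relu"),
    tf.keras.layers.Dense(30, activation="relu"),
    tf.keras.layers.Dense(5),  # output width == num_tokens
])
ff.build([1, 5])  # input width == num_tokens, per null_input()
assert len(ff.trainable_variables) == 6  # (kernel, bias) x 3 layers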
- for norm, dense in zip(self.layer_dict["norm"], self.layer_dict["dense"]): - x = norm(dense(x)) - aggregation += tf.reduce_sum(x) - return self.layer_dict["output"](x) / aggregation + def __init__(self): + super().__init__() + self.layer_dict = tf.__internal__.tracking.wrap( + dict(output=core.Dense(7)) + ) + self.layer_dict["norm"] = tf.__internal__.tracking.wrap([]) + self.layer_dict["dense"] = tf.__internal__.tracking.wrap([]) + self.layer_dict["dense"].extend( + [core.Dense(5), core.Dense(6, kernel_regularizer=tf.reduce_sum)] + ) + self.layer_dict["norm"].append( + batch_normalization_v1.BatchNormalization() + ) + self.layer_dict["norm"].append( + batch_normalization_v1.BatchNormalization() + ) + + def call(self, x): + aggregation = 0.0 + for norm, dense in zip( + self.layer_dict["norm"], self.layer_dict["dense"] + ): + x = norm(dense(x)) + aggregation += tf.reduce_sum(x) + return self.layer_dict["output"](x) / aggregation class MappingTests(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testTracking(self): - with self.test_session(): - model = HasMapping() - output = model(tf.ones([32, 2])) - self.assertAllEqual([32, 7], output.shape.as_list()) - self.assertEqual(5, len(model.layers)) - self.assertEqual(len(model.layers), len(model.layer_dict.layers)) - self.assertLen(model._trackable_children(), 1) - self.assertIs(model.layer_dict, model._trackable_children()["layer_dict"]) - self.evaluate([v.initializer for v in model.variables]) - test_var = model.layer_dict["output"].kernel - self.evaluate(test_var.assign(tf.ones([6, 7]))) - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - self.evaluate(test_var.assign(tf.zeros([6, 7]))) - model.load_weights(save_path) - self.assertAllEqual(numpy.ones([6, 7]), - self.evaluate(test_var)) - - def testLayerCollectionWithExternalMutation(self): - d = {} - root = tf.Module() - root.wrapper = d - self.assertEqual([], root.wrapper.layers) - self.assertEqual([], root.wrapper.trainable_weights) - layer1 = core.Dense(1) - layer2 = core.Dense(1) - d["a"] = layer1 - d["b"] = layer2 - self.assertEqual([layer1, layer2], root.wrapper.layers) - # The layers have still not created variables - self.assertEqual([], root.wrapper.trainable_weights) - - def testDictWrapperBadKeys(self): - a = tf.Module() - a.d = {} - a.d[1] = tf.__internal__.tracking.wrap([]) - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegex(ValueError, "non-string key"): - model.save_weights(save_path) - - def testDictWrapperNoDependency(self): - a = tf.Module() - a.d = data_structures.NoDependency({}) - a.d[1] = [3] - self.assertEqual([a], util.list_objects(a)) - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - model.load_weights(save_path) - - def testNonStringKeyNotTrackableValue(self): - a = tf.Module() - a.d = {} - a.d["a"] = [3] - a.d[1] = data_structures.NoDependency([3]) - self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - model.load_weights(save_path) - - def testNonAppendNotTrackable(self): - # Non-append mutations (deleting or overwriting values) are OK when the - # values aren't tracked. 
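data_structures.NoDependency, used in several tests here, stores an attribute without creating a checkpoint dependency on it; that is what lets otherwise-forbidden patterns (non-string dict keys, overwritten values) save cleanly. A minimal sketch mirroring testNoDependency further down, with made-up attribute names and the post-migration import path:

import tensorflow as tf
from tensorflow.python.trackable import data_structures

root = tf.Module()
root.tracked = tf.Module()
root.untracked = data_structures.NoDependency(tf.Module())

# Only 'tracked' becomes a checkpoint dependency of root; 'untracked' is a
# plain Python attribute.
assert list(root._trackable_children()) == ["tracked"]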
- a = tf.Module() - a.d = {} - a.d["a"] = [3] - a.d[1] = 3 - a.d[1] = 2 - self.assertEqual(2, a.d[1]) - del a.d[1] - a.d[2] = data_structures.NoDependency(tf.Module()) - second = tf.Module() - a.d[2] = data_structures.NoDependency(second) - self.assertIs(second, a.d[2]) - self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) - model = training.Model() - model.sub = a - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - model.load_weights(save_path) - - def testPopNoSave(self): - model = training.Model() - model.d = {} - model.d["a"] = [] - model.d.pop("a") - save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegex(ValueError, "Unable to save"): - model.save_weights(save_path) - - def testExternalModificationNoSave(self): - model = training.Model() - external_reference = {} - model.d = external_reference - external_reference["a"] = [] - save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegex(ValueError, "modified outside the wrapper"): - model.save_weights(save_path) - - def testOverwriteCanStillSave(self): - model = training.Model() - model.d = {} - model.d["a"] = {} - model.d["a"] = {} - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - - def testIter(self): - model = training.Model() - model.d = {1: 3} - model.d[1] = 3 - self.assertEqual([1], list(model.d)) - new_dict = {} - # This update() is super tricky. If the dict wrapper subclasses dict, - # CPython will access its storage directly instead of calling any - # methods/properties on the object. So the options are either not to - # subclass dict (in which case update will call normal iter methods, but the - # object won't pass isinstance checks) or to subclass dict and keep that - # storage updated (no shadowing all its methods like ListWrapper). 
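The CPython caveat spelled out in the comment above can be demonstrated without Keras. A dict subclass that keeps its "real" contents on the side, instead of mirroring them into the built-in dict storage, is silently bypassed by dict.update (a standalone sketch of the failure mode the wrapper has to avoid):

class ShadowDict(dict):
    # Keeps the data elsewhere; the built-in dict storage stays empty.
    def __init__(self, data):
        super().__init__()
        self._data = dict(data)

    def items(self):
        return self._data.items()

    def __getitem__(self, key):
        return self._data[key]

plain = {}
plain.update(ShadowDict({1: 3}))
# On CPython, update() read the (empty) built-in storage directly, ignoring
# the overridden items()/__getitem__:
assert plain == {}

This is why the tracked dict wrapper subclasses dict and mirrors every mutation into the real dict storage instead of shadowing the accessor methods.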
- new_dict.update(model.d) - self.assertEqual({1: 3}, new_dict) + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTracking(self): + with self.test_session(): + model = HasMapping() + output = model(tf.ones([32, 2])) + self.assertAllEqual([32, 7], output.shape.as_list()) + self.assertEqual(5, len(model.layers)) + self.assertEqual(len(model.layers), len(model.layer_dict.layers)) + self.assertLen(model._trackable_children(), 1) + self.assertIs( + model.layer_dict, model._trackable_children()["layer_dict"] + ) + self.evaluate([v.initializer for v in model.variables]) + test_var = model.layer_dict["output"].kernel + self.evaluate(test_var.assign(tf.ones([6, 7]))) + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + self.evaluate(test_var.assign(tf.zeros([6, 7]))) + model.load_weights(save_path) + self.assertAllEqual(numpy.ones([6, 7]), self.evaluate(test_var)) + + def testLayerCollectionWithExternalMutation(self): + d = {} + root = tf.Module() + root.wrapper = d + self.assertEqual([], root.wrapper.layers) + self.assertEqual([], root.wrapper.trainable_weights) + layer1 = core.Dense(1) + layer2 = core.Dense(1) + d["a"] = layer1 + d["b"] = layer2 + self.assertEqual([layer1, layer2], root.wrapper.layers) + # The layers have still not created variables + self.assertEqual([], root.wrapper.trainable_weights) + + def testDictWrapperBadKeys(self): + a = tf.Module() + a.d = {} + a.d[1] = tf.__internal__.tracking.wrap([]) + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + with self.assertRaisesRegex(ValueError, "non-string key"): + model.save_weights(save_path) + + def testDictWrapperNoDependency(self): + a = tf.Module() + a.d = data_structures.NoDependency({}) + a.d[1] = [3] + self.assertEqual([a], util.list_objects(a)) + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + model.load_weights(save_path) + + def testNonStringKeyNotTrackableValue(self): + a = tf.Module() + a.d = {} + a.d["a"] = [3] + a.d[1] = data_structures.NoDependency([3]) + self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + model.load_weights(save_path) + + def testNonAppendNotTrackable(self): + # Non-append mutations (deleting or overwriting values) are OK when the + # values aren't tracked. 
+ a = tf.Module() + a.d = {} + a.d["a"] = [3] + a.d[1] = 3 + a.d[1] = 2 + self.assertEqual(2, a.d[1]) + del a.d[1] + a.d[2] = data_structures.NoDependency(tf.Module()) + second = tf.Module() + a.d[2] = data_structures.NoDependency(second) + self.assertIs(second, a.d[2]) + self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a)) + model = training.Model() + model.sub = a + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + model.load_weights(save_path) + + def testPopNoSave(self): + model = training.Model() + model.d = {} + model.d["a"] = [] + model.d.pop("a") + save_path = os.path.join(self.get_temp_dir(), "ckpt") + with self.assertRaisesRegex(ValueError, "Unable to save"): + model.save_weights(save_path) + + def testExternalModificationNoSave(self): + model = training.Model() + external_reference = {} + model.d = external_reference + external_reference["a"] = [] + save_path = os.path.join(self.get_temp_dir(), "ckpt") + with self.assertRaisesRegex(ValueError, "modified outside the wrapper"): + model.save_weights(save_path) + + def testOverwriteCanStillSave(self): + model = training.Model() + model.d = {} + model.d["a"] = {} + model.d["a"] = {} + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + + def testIter(self): + model = training.Model() + model.d = {1: 3} + model.d[1] = 3 + self.assertEqual([1], list(model.d)) + new_dict = {} + # This update() is super tricky. If the dict wrapper subclasses dict, + # CPython will access its storage directly instead of calling any + # methods/properties on the object. So the options are either not to + # subclass dict (in which case update will call normal iter methods, but + # the object won't pass isinstance checks) or to subclass dict and keep + # that storage updated (no shadowing all its methods like ListWrapper). + new_dict.update(model.d) + self.assertEqual({1: 3}, new_dict) class HasTuple(training.Model): - - def __init__(self): - super().__init__() - self.layer_list = ( - core.Dense(3), core.Dense(4), - core.Dense(5, kernel_regularizer=tf.reduce_sum)) - self.layers_with_updates = (batch_normalization_v1.BatchNormalization(),) - - def call(self, x): - aggregation = 0. 
- for l in self.layer_list: - x = l(x) - aggregation += tf.reduce_sum(x) - bn, = self.layers_with_updates - return bn(x) / aggregation - - -class TupleTests(test_combinations.TestCase): - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testTracking(self): - with self.test_session(): - model = HasTuple() - output = model(tf.ones([32, 2])) - self.assertAllEqual([32, 5], output.shape.as_list()) - self.assertLen(model.layers, 4) - self.assertLen(model.layer_list.layers, 3) - self.assertEqual( - len(model.layers), - len(tuple(model.layer_list.layers) + model.layers_with_updates)) - self.assertEqual(3, model.layer_list.layers[0].units) - self.assertEqual(4, model.layer_list.layers[1].units) - self.assertEqual(5, model.layer_list.layers[2].units) - self.assertLen(model._trackable_children(), 2) - self.assertIs(model.layer_list, model._trackable_children()["layer_list"]) - self.assertIs(model.layers_with_updates, - model._trackable_children()["layers_with_updates"]) - self.assertLen(model.layer_list._trackable_children(), 3) - self.evaluate([v.initializer for v in model.variables]) - self.evaluate(model.variables[0].assign([[1., 2., 3.], [4., 5., 6.]])) - save_path = os.path.join(self.get_temp_dir(), "ckpt") - model.save_weights(save_path) - self.evaluate(model.variables[0].assign(tf.zeros([2, 3]))) - model.load_weights(save_path) - self.assertAllEqual([[1., 2., 3.], [4., 5., 6.]], - self.evaluate(model.variables[0])) - v = tf.Variable(1.) - model.var_list = (v,) - self.assertIn(id(v), [id(obj) for obj in model.variables]) - self.assertIn(id(v), [id(obj) for obj in model.trainable_variables]) - self.assertNotIn(id(v), - [id(obj) for obj in model.non_trainable_variables]) - self.assertIn(id(model.layer_list[0].trainable_weights[0]), - [id(obj) for obj in model.trainable_weights]) - - @parameterized.named_parameters( - ("Module", tf.Module), - ("Model", training.Model), - ) - def testSubModelTracking(self, module_subclass): - model = module_subclass() - model.v = tf.Variable(1.) 
- self.assertIn(model.v, model.trainable_variables) - model2 = module_subclass() - model2.m = (model,) - self.assertIn(model.v, model2.trainable_variables) - - def testSubSequentialTracking(self): - - class _Subclassed(training.Model): - - def __init__(self, wrapped): - super().__init__() - self._wrapped = wrapped - - def call(self, x): - return self._wrapped(x) - - model = sequential.Sequential() - layer = core.Dense(1) - model.add(layer) - model2 = _Subclassed(model) - model2(tf.ones([1, 2])) - model2.m = (model,) - self.assertIn(layer.kernel, model2.trainable_weights) - - def testUpdatesForwarded(self): - with tf.Graph().as_default(): - model = HasTuple() - model_input = tf.ones([32, 2]) - model(model_input) - self.assertNotEmpty(model.layers_with_updates[0].updates) - self.assertEqual(set(model.layers_with_updates[0].updates), - set(model.updates)) - - model = HasTuple() - model_input = tf.ones([32, 2]) - model(model_input) - self.assertEmpty(model.updates) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testLossesForwarded(self): - model = HasTuple() - model_input = tf.ones([32, 2]) - model(model_input) - self.assertLen(model.losses, 1) - - def testModelContainersCompareEqual(self): - class HasEqualContainers(training.Model): - - def __init__(self): + def __init__(self): super().__init__() - self.l1 = () - self.l2 = () - - model = HasEqualContainers() - first_layer = HasEqualContainers() - model.l1 = (first_layer,) - second_layer = HasEqualContainers() - model.l2 = (second_layer,) - self.assertEqual((first_layer,), model.l1) - d = {model.l1: 1, model.l2: 2} - self.assertEqual(1, d[model.l1]) - self.assertEqual(1, d[(first_layer,)]) - self.assertEqual(2, d[model.l2]) - self.assertEqual(2, d[(second_layer,)]) - self.assertEqual([first_layer, second_layer], model.layers) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testTensorConversion(self): - - class TupleToTensor(training.Model): - - def __init__(self): - super().__init__() - self.l = (1., 2., 3.) 
+ self.layer_list = ( + core.Dense(3), + core.Dense(4), + core.Dense(5, kernel_regularizer=tf.reduce_sum), + ) + self.layers_with_updates = ( + batch_normalization_v1.BatchNormalization(), + ) + + def call(self, x): + aggregation = 0.0 + for l in self.layer_list: + x = l(x) + aggregation += tf.reduce_sum(x) + (bn,) = self.layers_with_updates + return bn(x) / aggregation - self.assertAllEqual( - (1., 2., 3.), - self.evaluate(tf.constant(TupleToTensor().l))) - self.assertAllEqual( - (1., 2., 3.), - self.evaluate(tf.raw_ops.Pack(values=TupleToTensor().l))) +class TupleTests(test_combinations.TestCase): + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTracking(self): + with self.test_session(): + model = HasTuple() + output = model(tf.ones([32, 2])) + self.assertAllEqual([32, 5], output.shape.as_list()) + self.assertLen(model.layers, 4) + self.assertLen(model.layer_list.layers, 3) + self.assertEqual( + len(model.layers), + len(tuple(model.layer_list.layers) + model.layers_with_updates), + ) + self.assertEqual(3, model.layer_list.layers[0].units) + self.assertEqual(4, model.layer_list.layers[1].units) + self.assertEqual(5, model.layer_list.layers[2].units) + self.assertLen(model._trackable_children(), 2) + self.assertIs( + model.layer_list, model._trackable_children()["layer_list"] + ) + self.assertIs( + model.layers_with_updates, + model._trackable_children()["layers_with_updates"], + ) + self.assertLen(model.layer_list._trackable_children(), 3) + self.evaluate([v.initializer for v in model.variables]) + self.evaluate( + model.variables[0].assign([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + ) + save_path = os.path.join(self.get_temp_dir(), "ckpt") + model.save_weights(save_path) + self.evaluate(model.variables[0].assign(tf.zeros([2, 3]))) + model.load_weights(save_path) + self.assertAllEqual( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], + self.evaluate(model.variables[0]), + ) + v = tf.Variable(1.0) + model.var_list = (v,) + self.assertIn(id(v), [id(obj) for obj in model.variables]) + self.assertIn(id(v), [id(obj) for obj in model.trainable_variables]) + self.assertNotIn( + id(v), [id(obj) for obj in model.non_trainable_variables] + ) + self.assertIn( + id(model.layer_list[0].trainable_weights[0]), + [id(obj) for obj in model.trainable_weights], + ) + + @parameterized.named_parameters( + ("Module", tf.Module), + ("Model", training.Model), + ) + def testSubModelTracking(self, module_subclass): + model = module_subclass() + model.v = tf.Variable(1.0) + self.assertIn(model.v, model.trainable_variables) + model2 = module_subclass() + model2.m = (model,) + self.assertIn(model.v, model2.trainable_variables) + + def testSubSequentialTracking(self): + class _Subclassed(training.Model): + def __init__(self, wrapped): + super().__init__() + self._wrapped = wrapped + + def call(self, x): + return self._wrapped(x) + + model = sequential.Sequential() + layer = core.Dense(1) + model.add(layer) + model2 = _Subclassed(model) + model2(tf.ones([1, 2])) + model2.m = (model,) + self.assertIn(layer.kernel, model2.trainable_weights) + + def testUpdatesForwarded(self): + with tf.Graph().as_default(): + model = HasTuple() + model_input = tf.ones([32, 2]) + model(model_input) + self.assertNotEmpty(model.layers_with_updates[0].updates) + self.assertEqual( + set(model.layers_with_updates[0].updates), set(model.updates) + ) + + model = HasTuple() + model_input = tf.ones([32, 2]) + model(model_input) + self.assertEmpty(model.updates) + + @test_combinations.generate( + 
test_combinations.combine(mode=["graph", "eager"]) + ) + def testLossesForwarded(self): + model = HasTuple() + model_input = tf.ones([32, 2]) + model(model_input) + self.assertLen(model.losses, 1) + + def testModelContainersCompareEqual(self): + class HasEqualContainers(training.Model): + def __init__(self): + super().__init__() + self.l1 = () + self.l2 = () + + model = HasEqualContainers() + first_layer = HasEqualContainers() + model.l1 = (first_layer,) + second_layer = HasEqualContainers() + model.l2 = (second_layer,) + self.assertEqual((first_layer,), model.l1) + d = {model.l1: 1, model.l2: 2} + self.assertEqual(1, d[model.l1]) + self.assertEqual(1, d[(first_layer,)]) + self.assertEqual(2, d[model.l2]) + self.assertEqual(2, d[(second_layer,)]) + self.assertEqual([first_layer, second_layer], model.layers) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testTensorConversion(self): + class TupleToTensor(training.Model): + def __init__(self): + super().__init__() + self.l = (1.0, 2.0, 3.0) + + self.assertAllEqual( + (1.0, 2.0, 3.0), self.evaluate(tf.constant(TupleToTensor().l)) + ) + + self.assertAllEqual( + (1.0, 2.0, 3.0), + self.evaluate(tf.raw_ops.Pack(values=TupleToTensor().l)), + ) class InterfaceTests(test_combinations.TestCase): - - def testNoDependency(self): - root = tf.Module() - hasdep = tf.Module() - root.hasdep = hasdep - nodep = tf.Module() - root.nodep = data_structures.NoDependency(nodep) - self.assertLen(root._trackable_children(), 1) - self.assertIs(root._trackable_children()["hasdep"], root.hasdep) - self.assertIs(root.hasdep, hasdep) - self.assertIs(root.nodep, nodep) - - class NoDependencyModel(training.Model): - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def __init__(self): - super().__init__() - self.a = [] - self.b = tf.Module() - - nodeps = NoDependencyModel() - self.assertEqual([nodeps], util.list_objects(nodeps)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testDictionariesBasic(self): - a = training.Model() - b = training.Model() - a.attribute = {"b": b} - c = training.Model() - a.attribute["c"] = [] - a.attribute["c"].append(c) - a_deps = util.list_objects(a) - self.assertIn(b, a_deps) - self.assertIn(c, a_deps) - self.assertIs(b, a.attribute["b"]) - self.assertEqual({"b", "c"}, a.attribute._trackable_children().keys()) - self.assertEqual([b, c], a.layers) - self.assertEqual([b, c], a.attribute.layers) - self.assertEqual([c], a.attribute["c"].layers) - checkpoint = tf.train.Checkpoint(a=a) - save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) - with self.cached_session(): - checkpoint.restore(save_path).assert_consumed().initialize_or_restore() - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testNoDepList(self): - a = training.Model() - a.l1 = data_structures.NoDependency([]) - a.l1.insert(1, 0) - self.assertIsInstance(a.l1, list) - checkpoint = tf.train.Checkpoint(a=a) - checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) - a.l2 = [] - a.l2.insert(1, tf.Module()) - with self.assertRaisesRegex(ValueError, "A list element was replaced"): - checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + def testNoDependency(self): + root = tf.Module() + hasdep = tf.Module() + root.hasdep = hasdep + nodep = tf.Module() + root.nodep = data_structures.NoDependency(nodep) + self.assertLen(root._trackable_children(), 1) + self.assertIs(root._trackable_children()["hasdep"], root.hasdep) + 
self.assertIs(root.hasdep, hasdep) + self.assertIs(root.nodep, nodep) + + class NoDependencyModel(training.Model): + @tf.__internal__.tracking.no_automatic_dependency_tracking + def __init__(self): + super().__init__() + self.a = [] + self.b = tf.Module() + + nodeps = NoDependencyModel() + self.assertEqual([nodeps], util.list_objects(nodeps)) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDictionariesBasic(self): + a = training.Model() + b = training.Model() + a.attribute = {"b": b} + c = training.Model() + a.attribute["c"] = [] + a.attribute["c"].append(c) + a_deps = util.list_objects(a) + self.assertIn(b, a_deps) + self.assertIn(c, a_deps) + self.assertIs(b, a.attribute["b"]) + self.assertEqual({"b", "c"}, a.attribute._trackable_children().keys()) + self.assertEqual([b, c], a.layers) + self.assertEqual([b, c], a.attribute.layers) + self.assertEqual([c], a.attribute["c"].layers) + checkpoint = tf.train.Checkpoint(a=a) + save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + with self.cached_session(): + checkpoint.restore( + save_path + ).assert_consumed().initialize_or_restore() + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testNoDepList(self): + a = training.Model() + a.l1 = data_structures.NoDependency([]) + a.l1.insert(1, 0) + self.assertIsInstance(a.l1, list) + checkpoint = tf.train.Checkpoint(a=a) + checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + a.l2 = [] + a.l2.insert(1, tf.Module()) + with self.assertRaisesRegex(ValueError, "A list element was replaced"): + checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) if __name__ == "__main__": - tf.compat.v1.enable_eager_execution() - tf.test.main() + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/tests/tracking_util_test.py b/keras/tests/tracking_util_test.py index a48d5e736b05..4ee3cbdf9733 100644 --- a/keras/tests/tracking_util_test.py +++ b/keras/tests/tracking_util_test.py @@ -14,886 +14,1029 @@ # ============================================================================== import functools - -import tensorflow.compat.v2 as tf import os import weakref -from tensorflow.python.eager import context -from tensorflow.python.framework import test_util as tf_test_utils -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils + +import tensorflow.compat.v2 as tf + from keras.engine import input_layer from keras.engine import sequential from keras.engine import training from keras.layers import core from keras.layers import reshaping -from keras.optimizers.optimizer_v2 import adam +from keras.optimizers.legacy import adam +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils + +# isort: off +from tensorflow.python.checkpoint import ( + checkpoint as trackable_utils, +) +from tensorflow.python.eager import context +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training.tracking import util as trackable_utils -# pylint: disable=not-callable class MyModel(training.Model): - """A concrete Model for testing.""" + """A concrete Model for testing.""" - def __init__(self): - super().__init__() - self._named_dense = core.Dense(1, use_bias=True) - self._second = core.Dense(1, use_bias=False) - # We can still track Trackables which aren't Layers. 
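The comment closing the chunk above ("We can still track Trackables which aren't Layers") is the reason NonLayerTrackable exists: checkpoint dependencies follow the object graph, not the Layer hierarchy, so any tf.Module hung off a model is saved with it. A public-API sketch of the same idea (Holder, the attribute name non_layer, and the save path are made up):

import tensorflow as tf

class Holder(tf.Module):  # a Trackable, but not a Keras Layer
    def __init__(self):
        super().__init__()
        self.a_variable = tf.Variable(0.0)

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.non_layer = Holder()

ckpt = tf.train.Checkpoint(model=model)
path = ckpt.save("/tmp/non_layer_demo")
# The non-Layer variable shows up under the model's object path:
names = [name for name, _ in tf.train.list_variables(path)]
assert any("non_layer" in name for name in names)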
- self._non_layer = NonLayerTrackable() + def __init__(self): + super().__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + # We can still track Trackables which aren't Layers. + self._non_layer = NonLayerTrackable() - def call(self, values): - ret = self._second(self._named_dense(values)) - return ret + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret class NonLayerTrackable(tf.Module): - - def __init__(self): - super().__init__() - self.a_variable = trackable_utils.add_variable( - self, name="a_variable", shape=[]) + def __init__(self): + super().__init__() + self.a_variable = trackable_utils.add_variable( + self, name="a_variable", shape=[] + ) class InterfaceTests(tf.test.TestCase): - - def testLayerDeduplication(self): - model = training.Model() - layer_one = core.Dense(1) - layer_two = core.Dense(1) - model.other_path = [layer_one, layer_two] - model.l2 = layer_two - model.l1 = layer_one - self.assertEqual([layer_one, layer_two], model.layers) - - def testSaveWithOnlyKerasSession(self): - - with tf.Graph().as_default(), self.cached_session(): - inp = input_layer.Input([1]) - dense = core.Dense(1)(inp) - model = training.Model(inp, dense) - model.compile(optimizer="sgd", loss="mse") - model.fit([1.], [2.]) - checkpoint = tf.train.Checkpoint(model=model) - checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) + def testLayerDeduplication(self): + model = training.Model() + layer_one = core.Dense(1) + layer_two = core.Dense(1) + model.other_path = [layer_one, layer_two] + model.l2 = layer_two + model.l1 = layer_one + self.assertEqual([layer_one, layer_two], model.layers) + + def testSaveWithOnlyKerasSession(self): + + with tf.Graph().as_default(), self.cached_session(): + inp = input_layer.Input([1]) + dense = core.Dense(1)(inp) + model = training.Model(inp, dense) + model.compile(optimizer="sgd", loss="mse") + model.fit([1.0], [2.0]) + checkpoint = tf.train.Checkpoint(model=model) + checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt")) class CheckpointingTests(test_combinations.TestCase): + @tf_test_utils.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + def testNamingWithOptimizer(self): + input_value = tf.constant([[3.0]]) + model = MyModel() + # A nuisance Model using the same optimizer. Its slot variables should + # not go in the checkpoint, since it is never depended on. + other_model = MyModel() + optimizer = adam.Adam(0.001) + step = tf.compat.v1.train.get_or_create_global_step() + root_trackable = tf.train.Checkpoint( + optimizer=optimizer, model=model, step=step + ) - @tf_test_utils.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) - def testNamingWithOptimizer(self): - input_value = tf.constant([[3.]]) - model = MyModel() - # A nuisance Model using the same optimizer. Its slot variables should not - # go in the checkpoint, since it is never depended on. 
- other_model = MyModel() - optimizer = adam.Adam(0.001) - step = tf.compat.v1.train.get_or_create_global_step() - root_trackable = tf.train.Checkpoint( - optimizer=optimizer, model=model, step=step) - - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = tf.group( - optimizer.apply_gradients(zip(gradients, variables)), - step.assign_add(1)) - - with tf.GradientTape() as tape: - loss = other_model(input_value) - variables = other_model.trainable_variables - gradients = tape.gradient(loss, variables) - optimizer.apply_gradients(zip(gradients, variables)) - - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - named_variables, serialized_graph, _ = tf.__internal__.tracking.ObjectGraphView( - root_trackable).serialize_object_graph() - expected_slot_keys = ( - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", - ) - expected_checkpoint_names = ( - # Created in the root node, so no prefix. - "step", - "model/_second/kernel", - "model/_named_dense/kernel", - "model/_named_dense/bias", - # non-Layer dependency of the model - "model/_non_layer/a_variable", - "optimizer/learning_rate", - "optimizer/beta_1", - "optimizer/beta_2", - "optimizer/iter", - "optimizer/decay", - ) + expected_slot_keys - suffix = "/.ATTRIBUTES/VARIABLE_VALUE" - expected_checkpoint_names = [ - name + suffix for name in expected_checkpoint_names] - named_variables = {v.name: v for v in named_variables} - self.assertEqual(len(expected_checkpoint_names), - len(named_variables.keys())) - # Check that we've mapped to the right variable objects (not exhaustive) - self.assertEqual( - "global_step", - named_variables["step" + suffix].full_name) - self.assertEqual( - "my_model/dense_1/kernel", - named_variables["model/_second/kernel" + suffix].full_name) - self.assertEqual( - "my_model/dense/kernel", - named_variables["model/_named_dense/kernel" + suffix].full_name) - self.assertEqual("Adam/beta_1", - named_variables["optimizer/beta_1" + suffix].full_name) - self.assertEqual("Adam/beta_2", - named_variables["optimizer/beta_2" + suffix].full_name) - # Spot check the generated protocol buffers. 
- self.assertEqual("optimizer", - serialized_graph.nodes[0].children[1].local_name) - optimizer_node = serialized_graph.nodes[ - serialized_graph.nodes[0].children[1].node_id] - children = [node.local_name for node in optimizer_node.children] - self.assertEqual( - # hyper variable dependencies - len(["beta_1", "beta_2", "iter", "decay", "learning_rate"]), - len(children)) - serialized_slot_keys = [] - for slot in optimizer_node.slot_variables: - for attribute in ( - serialized_graph.nodes[slot.slot_variable_node_id].attributes): - serialized_slot_keys.append(attribute.checkpoint_key) - self.assertEqual( - len([key + suffix for key in expected_slot_keys]), - len(serialized_slot_keys)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testSaveRestore(self): - with self.test_session(): - model = MyModel() - optimizer = adam.Adam(0.001) - root_trackable = tf.train.Checkpoint( - optimizer=optimizer, model=model) - input_value = tf.constant([[3.]]) - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = optimizer.apply_gradients(zip(gradients, variables)) - self.assertFalse(root_trackable.save_counter.trainable) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - self.evaluate(tf.compat.v1.assign(model._named_dense.variables[1], [42.])) - m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") - self.evaluate(tf.compat.v1.assign(m_bias_slot, [1.5])) - save_path = root_trackable.save(file_prefix=prefix) - self.evaluate(tf.compat.v1.assign(model._named_dense.variables[1], [43.])) - self.evaluate(tf.compat.v1.assign(root_trackable.save_counter, 3)) - optimizer_variables = self.evaluate( - sorted(optimizer.variables(), key=lambda v: v.name)) - self.evaluate(tf.compat.v1.assign(m_bias_slot, [-2.])) - # Immediate restoration - status = root_trackable.restore(save_path=save_path).assert_consumed() - status.run_restore_ops() - self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1])) - self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) - self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) - if not tf.executing_eagerly(): - return # Restore-on-create is only supported when executing eagerly - on_create_model = MyModel() - on_create_optimizer = adam.Adam(0.001) - on_create_root = tf.train.Checkpoint( - optimizer=on_create_optimizer, model=on_create_model) - # Deferred restoration - status = on_create_root.restore(save_path=save_path) - status.assert_nontrivial_match() - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - on_create_model(tf.constant([[3.]])) # create variables - self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) - self.assertAllEqual([42.], - self.evaluate( - on_create_model._named_dense.variables[1])) - on_create_m_bias_slot = on_create_optimizer.get_slot( - on_create_model._named_dense.variables[1], "m") - status.assert_existing_objects_matched() - if not tf.executing_eagerly(): - with self.assertRaises(AssertionError): - status.assert_consumed() - # Optimizer slot variables are created when the original variable is - # restored. 
- self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) - dummy_var = tf.Variable([1.]) - on_create_optimizer.minimize(loss=dummy_var.read_value, - var_list=[dummy_var]) - status.assert_existing_objects_matched() - status.assert_consumed() - self.assertAllEqual( - optimizer_variables, - # Creation order is different, so .variables() needs to be re-sorted. - self.evaluate(sorted(optimizer.variables(), key=lambda v: v.name))) - - # TODO(allenl): Debug garbage created by this test in python3. - def testDeferredRestorationUsageEager(self): - """An idiomatic eager execution example.""" - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - model = MyModel() - optimizer = adam.Adam(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model) - root.restore(tf.train.latest_checkpoint( - checkpoint_directory)) - for _ in range(num_training_steps): - # TODO(allenl): Use a Dataset and serialize/checkpoint it. - input_value = tf.constant([[3.]]) with tf.GradientTape() as tape: - loss = model(input_value) + loss = model(input_value) variables = model.trainable_variables gradients = tape.gradient(loss, variables) - optimizer.apply_gradients(zip(gradients, variables)) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer.iterations.numpy()) - - def testUsageGraph(self): - """Expected usage when graph building.""" - with context.graph_mode(): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with tf.Graph().as_default(): - model = MyModel() - optimizer = adam.Adam(0.001) - root = tf.compat.v1.train.Checkpoint( - optimizer=optimizer, model=model) - input_value = tf.constant([[3.]]) - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = optimizer.apply_gradients(zip(gradients, variables)) - - checkpoint_path = tf.train.latest_checkpoint( - checkpoint_directory) - with self.session(graph=tf.compat.v1.get_default_graph()) as session: - status = root.restore(save_path=checkpoint_path) - status.initialize_or_restore(session=session) - if checkpoint_path is None: - self.assertEqual(0, training_continuation) - with self.assertRaises(AssertionError): - status.assert_consumed() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - else: - status.assert_consumed() - status.assert_existing_objects_matched() - for _ in range(num_training_steps): - session.run(train_op) - root.save(file_prefix=checkpoint_prefix, session=session) - self.assertEqual((training_continuation + 1) * num_training_steps, - session.run(root.optimizer.iterations)) - self.assertEqual(training_continuation + 1, - session.run(root.save_counter)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testAgnosticUsage(self): - """Graph/eager agnostic usage.""" - # Does create garbage when executing eagerly due to ops.Graph() creation. 
- with self.test_session(): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - optimizer = adam.Adam(0.001) - def _train_fn(model, input_value): + train_op = tf.group( + optimizer.apply_gradients(zip(gradients, variables)), + step.assign_add(1), + ) + with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables + loss = other_model(input_value) + variables = other_model.trainable_variables gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - for training_continuation in range(3): - with test_utils.device(should_use_gpu=True): - model = MyModel() - root = tf.train.Checkpoint( - optimizer=optimizer, model=model) - manager = tf.train.CheckpointManager( - root, checkpoint_directory, max_to_keep=1) - status = root.restore(save_path=manager.latest_checkpoint) - input_value = tf.constant([[3.]]) - train_fn = functools.partial(_train_fn, model, input_value) - if not tf.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - manager.save() - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(root.optimizer.iterations)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testPartialRestoreWarningObject(self): - optimizer = adam.Adam(0.0) - original_root = tf.train.Checkpoint(v1=tf.Variable(2.), - v2=tf.Variable(3.), - optimizer=optimizer) - # Create a slot variable to save - optimizer.minimize(original_root.v1.read_value, [original_root.v1]) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - save_path = original_root.save(prefix) - partial_root = tf.train.Checkpoint(v1=tf.Variable(0.)) - weak_partial_root = weakref.ref(partial_root) - weak_v1 = weakref.ref(partial_root.v1) - partial_root.restore(save_path) - self.assertEqual(2., partial_root.v1.numpy()) - with tf.compat.v1.test.mock.patch.object(logging, "warning") as mock_log: - del partial_root - self.assertIsNone(weak_partial_root()) - self.assertIsNone(weak_v1()) - messages = str(mock_log.call_args_list) - self.assertIn("(root).v2'", messages) - self.assertIn("(root).optimizer's state 'm' for (root).v1", messages) - self.assertNotIn("(root).v1'", messages) - self.assertIn("expect_partial()", messages) - - # pylint: disable=cell-var-from-loop - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testWithDefun(self): - with self.test_session(): - num_training_steps = 2 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with test_utils.device(should_use_gpu=True): - model = MyModel() - # Don't actually train so we can test variable values - optimizer = adam.Adam(0.) 
- root = tf.train.Checkpoint( - optimizer=optimizer, model=model) - checkpoint_path = tf.train.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - def train_fn(): - @tf.function - def _call_model(x): - return model(x) + optimizer.apply_gradients(zip(gradients, variables)) + + self.evaluate(trackable_utils.gather_initializers(root_trackable)) + self.evaluate(train_op) + ( + named_variables, + serialized_graph, + _, + ) = tf.__internal__.tracking.ObjectGraphView( + root_trackable + ).serialize_object_graph() + expected_slot_keys = ( + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", + ) + expected_checkpoint_names = ( + # Created in the root node, so no prefix. + "step", + "model/_second/kernel", + "model/_named_dense/kernel", + "model/_named_dense/bias", + # non-Layer dependency of the model + "model/_non_layer/a_variable", + "optimizer/learning_rate", + "optimizer/beta_1", + "optimizer/beta_2", + "optimizer/iter", + "optimizer/decay", + ) + expected_slot_keys + suffix = "/.ATTRIBUTES/VARIABLE_VALUE" + expected_checkpoint_names = [ + name + suffix for name in expected_checkpoint_names + ] + named_variables = {v.name: v for v in named_variables} + self.assertEqual( + len(expected_checkpoint_names), len(named_variables.keys()) + ) + # Check that we've created the right full_names of objects (not + # exhaustive) + expected_names = { + "step" + suffix: "global_step", + "model/_second/kernel" + suffix: "my_model/dense_1/kernel", + "model/_named_dense/kernel" + suffix: "my_model/dense/kernel", + "optimizer/beta_1" + suffix: "Adam/beta_1", + "optimizer/beta_2" + suffix: "Adam/beta_2", + } + for nodes in serialized_graph.nodes: + for attribute in nodes.attributes: + expected_name = expected_names.pop( + attribute.checkpoint_key, None + ) + if expected_name is not None: + self.assertEqual(expected_name, attribute.full_name) + self.assertEmpty(expected_names) + # Spot check the generated protocol buffers. 
+ self.assertEqual( + "optimizer", serialized_graph.nodes[0].children[1].local_name + ) + optimizer_node = serialized_graph.nodes[ + serialized_graph.nodes[0].children[1].node_id + ] + children = [node.local_name for node in optimizer_node.children] + self.assertEqual( + # hyper variable dependencies + len(["beta_1", "beta_2", "iter", "decay", "learning_rate"]), + len(children), + ) + serialized_slot_keys = [] + for slot in optimizer_node.slot_variables: + for attribute in serialized_graph.nodes[ + slot.slot_variable_node_id + ].attributes: + serialized_slot_keys.append(attribute.checkpoint_key) + self.assertEqual( + len([key + suffix for key in expected_slot_keys]), + len(serialized_slot_keys), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testSaveRestore(self): + with self.test_session(): + model = MyModel() + optimizer = adam.Adam(0.001) + root_trackable = tf.train.Checkpoint( + optimizer=optimizer, model=model + ) + input_value = tf.constant([[3.0]]) with tf.GradientTape() as tape: - loss = _call_model(tf.constant([[3.]])) - gradients = tape.gradient(loss, model.variables) - return optimizer.apply_gradients(zip(gradients, model.variables)) - if not tf.executing_eagerly(): - train_fn = functools.partial( - self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - if training_continuation > 0: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + train_op = optimizer.apply_gradients(zip(gradients, variables)) + self.assertFalse(root_trackable.save_counter.trainable) + self.evaluate(trackable_utils.gather_initializers(root_trackable)) + self.evaluate(train_op) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + self.evaluate( + tf.compat.v1.assign(model._named_dense.variables[1], [42.0]) + ) + m_bias_slot = optimizer.get_slot( + model._named_dense.variables[1], "m" + ) + self.evaluate(tf.compat.v1.assign(m_bias_slot, [1.5])) + save_path = root_trackable.save(file_prefix=prefix) + self.evaluate( + tf.compat.v1.assign(model._named_dense.variables[1], [43.0]) + ) + self.evaluate(tf.compat.v1.assign(root_trackable.save_counter, 3)) + optimizer_variables = self.evaluate( + sorted(optimizer.variables(), key=lambda v: v.name) + ) + self.evaluate(tf.compat.v1.assign(m_bias_slot, [-2.0])) + # Immediate restoration + status = root_trackable.restore( + save_path=save_path + ).assert_consumed() + status.run_restore_ops() + self.assertAllEqual( + [42.0], self.evaluate(model._named_dense.variables[1]) + ) + self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) + self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) + if not tf.executing_eagerly(): + # Restore-on-create is only supported when executing eagerly + return + on_create_model = MyModel() + on_create_optimizer = adam.Adam(0.001) + on_create_root = tf.train.Checkpoint( + optimizer=on_create_optimizer, model=on_create_model + ) + # Deferred restoration + status = on_create_root.restore(save_path=save_path) + status.assert_nontrivial_match() + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + on_create_model(tf.constant([[3.0]])) # create variables + self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) + self.assertAllEqual( + [42.0], self.evaluate(on_create_model._named_dense.variables[1]) + ) + on_create_m_bias_slot = on_create_optimizer.get_slot( + 
on_create_model._named_dense.variables[1], "m" + ) + status.assert_existing_objects_matched() + if not tf.executing_eagerly(): + with self.assertRaises(AssertionError): + status.assert_consumed() + # Optimizer slot variables are created when the original variable is + # restored. + self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) + dummy_var = tf.Variable([1.0]) + on_create_optimizer.minimize( + loss=dummy_var.read_value, var_list=[dummy_var] + ) + status.assert_existing_objects_matched() status.assert_consumed() - self.assertAllClose([[42.]], self.evaluate(model.variables[0])) - else: - self.evaluate(model.variables[0].assign([[42.]])) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(optimizer.iterations)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - # pylint: enable=cell-var-from-loop - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testAnonymousVarsInInit(self): - - class Model(training.Model): - - def __init__(self): - super().__init__() - self.w = tf.Variable(0.0) - self.b = tf.Variable(0.0) - self.vars = [self.w, self.b] - - def call(self, x): - return x * self.w + self.b - - model = Model() - optimizer = adam.Adam(learning_rate=0.05) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - checkpoint = tf.train.Checkpoint( - model=model, optimizer=optimizer) - for _ in range(2): - checkpoint.save(checkpoint_prefix) - with tf.GradientTape() as tape: - loss = (tf.constant(1.) - - model(tf.constant(1.))) ** 2 - grad = tape.gradient(loss, model.vars) - optimizer.apply_gradients( - [(g, v) for g, v in zip(grad, model.vars)]) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testDeferredSlotRestoration(self): - with self.test_session(): - checkpoint_directory = self.get_temp_dir() - - root = tf.train.Checkpoint() - root.var = trackable_utils.add_variable( - root, name="var", initializer=0.) - optimizer = adam.Adam(0.1) - variables = [root.var] - gradients = [1.] - train_op = optimizer.apply_gradients(zip(gradients, variables)) - # Note that `optimizer` has not been added as a dependency of - # `root`. Create a one-off grouping so that slot variables for `root.var` - # get initialized too. - self.evaluate(trackable_utils.gather_initializers( - tf.train.Checkpoint(root=root, optimizer=optimizer))) - self.evaluate(train_op) - self.evaluate(tf.compat.v1.assign(root.var, 12.)) - no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots")) - root.optimizer = optimizer - self.evaluate(tf.compat.v1.assign(root.var, 13.)) - self.evaluate(tf.compat.v1.assign( - optimizer.get_slot(slot_name="m", var=root.var), - 14.)) - slots_path = root.save(os.path.join(checkpoint_directory, "with_slots")) - new_root = tf.train.Checkpoint() - # Load the slot-containing checkpoint (deferred), then immediately - # overwrite the non-slot variable (also deferred). 
- slot_status = new_root.restore(slots_path) - no_slot_status = new_root.restore(no_slots_path) - with self.assertRaises(AssertionError): - no_slot_status.assert_consumed() - new_root.var = trackable_utils.add_variable( - new_root, name="var", shape=[]) - no_slot_status.assert_consumed() - no_slot_status.run_restore_ops() - self.assertEqual(12., self.evaluate(new_root.var)) - new_root.optimizer = adam.Adam(0.1) - slot_status.assert_existing_objects_matched() - if not tf.executing_eagerly(): - with self.assertRaisesRegex(AssertionError, "Unresolved object"): - slot_status.assert_consumed() - self.assertEqual(12., self.evaluate(new_root.var)) - if tf.executing_eagerly(): - # Slot variables are only created with restoring initializers when - # executing eagerly. - self.assertEqual(14., self.evaluate( - new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) - else: - # Slot variables are not created eagerly when graph building. - with self.assertRaises(KeyError): - new_root.optimizer.get_slot(slot_name="m", var=new_root.var) - variables = [new_root.var] - gradients = [1.] - train_op = new_root.optimizer.apply_gradients(zip(gradients, variables)) - # The slot variable now exists; restore() didn't create it, but we should - # now have a restore op for it. - slot_status.run_restore_ops() - if not tf.executing_eagerly(): - # The train op hasn't run when graph building, so the slot variable has - # its restored value. It has run in eager, so the value will - # be different. - self.assertEqual(14., self.evaluate( - new_root.optimizer.get_slot(slot_name="m", var=new_root.var))) - self.evaluate(train_op) - slot_status.assert_consumed() - - def testManySavesGraph(self): - """Saves after the first should not modify the graph.""" - with context.graph_mode(): - graph = tf.Graph() - with graph.as_default(), self.session(graph): + self.assertAllEqual( + optimizer_variables, + # Creation order is different, so .variables() needs to be + # re-sorted. + self.evaluate( + sorted(optimizer.variables(), key=lambda v: v.name) + ), + ) + + # TODO(allenl): Debug garbage created by this test in python3. + def testDeferredRestorationUsageEager(self): + """An idiomatic eager execution example.""" + num_training_steps = 10 checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - obj = tf.train.Checkpoint() - obj.var = tf.Variable(0., name="v") - obj.opt = adam.Adam(0.1) - variables = [obj.var] - gradients = [1.] - obj.opt.apply_gradients(zip(gradients, variables)) - self.evaluate(trackable_utils.gather_initializers(obj)) - obj.save(checkpoint_prefix) - graph.finalize() - obj.save(checkpoint_prefix) - - def testManyRestoresGraph(self): - """Restores after the first should not modify the graph.""" - with context.graph_mode(): - graph = tf.Graph() - with graph.as_default(), self.session(graph): + for training_continuation in range(3): + model = MyModel() + optimizer = adam.Adam(0.001) + root = tf.train.Checkpoint(optimizer=optimizer, model=model) + root.restore(tf.train.latest_checkpoint(checkpoint_directory)) + for _ in range(num_training_steps): + # TODO(allenl): Use a Dataset and serialize/checkpoint it. 
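+                # Until then, a fixed constant stands in for real input data.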
+ input_value = tf.constant([[3.0]]) + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + optimizer.apply_gradients(zip(gradients, variables)) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual( + (training_continuation + 1) * num_training_steps, + root.optimizer.iterations.numpy(), + ) + + def testUsageGraph(self): + """Expected usage when graph building.""" + with context.graph_mode(): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with tf.Graph().as_default(): + model = MyModel() + optimizer = adam.Adam(0.001) + root = tf.compat.v1.train.Checkpoint( + optimizer=optimizer, model=model + ) + input_value = tf.constant([[3.0]]) + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + train_op = optimizer.apply_gradients( + zip(gradients, variables) + ) + + checkpoint_path = tf.train.latest_checkpoint( + checkpoint_directory + ) + with self.session( + graph=tf.compat.v1.get_default_graph() + ) as session: + status = root.restore(save_path=checkpoint_path) + status.initialize_or_restore(session=session) + if checkpoint_path is None: + self.assertEqual(0, training_continuation) + with self.assertRaises(AssertionError): + status.assert_consumed() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + else: + status.assert_consumed() + status.assert_existing_objects_matched() + for _ in range(num_training_steps): + session.run(train_op) + root.save( + file_prefix=checkpoint_prefix, session=session + ) + self.assertEqual( + (training_continuation + 1) * num_training_steps, + session.run(root.optimizer.iterations), + ) + self.assertEqual( + training_continuation + 1, + session.run(root.save_counter), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testAgnosticUsage(self): + """Graph/eager agnostic usage.""" + # Does create garbage when executing eagerly due to ops.Graph() + # creation. 
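+        # (Hence no assert_no_eager_garbage check on this test.)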
+ with self.test_session(): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + optimizer = adam.Adam(0.001) + + def _train_fn(model, input_value): + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + + for training_continuation in range(3): + with test_utils.device(should_use_gpu=True): + model = MyModel() + root = tf.train.Checkpoint(optimizer=optimizer, model=model) + manager = tf.train.CheckpointManager( + root, checkpoint_directory, max_to_keep=1 + ) + status = root.restore(save_path=manager.latest_checkpoint) + input_value = tf.constant([[3.0]]) + train_fn = functools.partial(_train_fn, model, input_value) + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + manager.save() + self.assertEqual( + (training_continuation + 1) * num_training_steps, + self.evaluate(root.optimizer.iterations), + ) + self.assertEqual( + training_continuation + 1, + self.evaluate(root.save_counter), + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testPartialRestoreWarningObject(self): + optimizer = adam.Adam(0.0) + original_root = tf.train.Checkpoint( + v1=tf.Variable(2.0), v2=tf.Variable(3.0), optimizer=optimizer + ) + # Create a slot variable to save + optimizer.minimize(original_root.v1.read_value, [original_root.v1]) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + save_path = original_root.save(prefix) + partial_root = tf.train.Checkpoint(v1=tf.Variable(0.0)) + weak_partial_root = weakref.ref(partial_root) + weak_v1 = weakref.ref(partial_root.v1) + partial_root.restore(save_path) + self.assertEqual(2.0, partial_root.v1.numpy()) + with tf.compat.v1.test.mock.patch.object( + logging, "warning" + ) as mock_log: + del partial_root + self.assertIsNone(weak_partial_root()) + self.assertIsNone(weak_v1()) + messages = str(mock_log.call_args_list) + self.assertIn("(root).v2'", messages) + self.assertIn("(root).optimizer's state 'm' for (root).v1", messages) + self.assertNotIn("(root).v1'", messages) + self.assertIn("expect_partial()", messages) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testWithDefun(self): + with self.test_session(): + num_training_steps = 2 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with test_utils.device(should_use_gpu=True): + model = MyModel() + # Don't actually train so we can test variable values + optimizer = adam.Adam(0.0) + root = tf.train.Checkpoint(optimizer=optimizer, model=model) + checkpoint_path = tf.train.latest_checkpoint( + checkpoint_directory + ) + status = root.restore(save_path=checkpoint_path) + + def train_fn(): + @tf.function + def _call_model(x): + return model(x) + + with tf.GradientTape() as tape: + loss = _call_model(tf.constant([[3.0]])) + gradients = tape.gradient(loss, model.variables) + return optimizer.apply_gradients( + zip(gradients, model.variables) + ) + + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + if training_continuation > 0: + status.assert_consumed() + self.assertAllClose( + [[42.0]], self.evaluate(model.variables[0]) + ) + 
else: + self.evaluate(model.variables[0].assign([[42.0]])) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual( + (training_continuation + 1) * num_training_steps, + self.evaluate(optimizer.iterations), + ) + self.assertEqual( + training_continuation + 1, + self.evaluate(root.save_counter), + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testAnonymousVarsInInit(self): + class Model(training.Model): + def __init__(self): + super().__init__() + self.w = tf.Variable(0.0) + self.b = tf.Variable(0.0) + self.vars = [self.w, self.b] + + def call(self, x): + return x * self.w + self.b + + model = Model() + optimizer = adam.Adam(learning_rate=0.05) checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - obj = tf.train.Checkpoint() - obj.var = tf.Variable(0., name="v") - obj.opt = adam.Adam(0.1) - variables = [obj.var] - gradients = [1.] - obj.opt.apply_gradients(zip(gradients, variables)) - self.evaluate(trackable_utils.gather_initializers(obj)) - save_path = obj.save(checkpoint_prefix) - obj.restore(save_path) - graph.finalize() - obj.restore(save_path) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def test_sequential(self): - with self.test_session(): - model = sequential.Sequential() - checkpoint = tf.train.Checkpoint(model=model) - model.add(core.Dense(4)) - second_dense = core.Dense(5) - model.add(second_dense) - model(tf.constant([[1.]])) - checkpoint.restore(None).initialize_or_restore() - self.evaluate(second_dense.bias.assign( - tf.constant([1., 2., 3., 4., 5.]))) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - save_path = checkpoint.save(checkpoint_prefix) - self.evaluate(second_dense.bias.assign( - tf.constant([5., 6., 7., 8., 9.]))) - checkpoint.restore(save_path).assert_consumed().run_restore_ops() - self.assertAllEqual([1., 2., 3., 4., 5.], - self.evaluate(second_dense.bias)) - - deferred_sequential = sequential.Sequential() - deferred_sequential_checkpoint = tf.train.Checkpoint( - model=deferred_sequential) - status = deferred_sequential_checkpoint.restore(save_path) - deferred_sequential.add(core.Dense(4)) - deferred_second_dense = core.Dense(5) - deferred_sequential.add(deferred_second_dense) - deferred_sequential(tf.constant([[1.]])) - status.run_restore_ops() - self.assertAllEqual([1., 2., 3., 4., 5.], - self.evaluate(deferred_second_dense.bias)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def test_initialize_if_not_restoring(self): - with self.test_session(): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") - with test_utils.device(should_use_gpu=True): - model = MyModel() - optimizer = adam.Adam(0.001) - root = tf.train.Checkpoint( - model=model) # Do not save the optimizer with the checkpoint. 
- optimizer_checkpoint = tf.train.Checkpoint( - optimizer=optimizer) - - checkpoint_path = tf.train.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - input_value = tf.constant([[3.]]) - def train_fn(): - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - if not tf.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - # TODO(tanzheny): Add hyper variables to .variables(), and set them with - # set_weights etc. - variables_not_in_the_variables_property = [ - obj for obj in optimizer._hyper.values() - if isinstance(obj, tf.Variable)] - self.evaluate([v.initializer for v - in optimizer.variables() - + variables_not_in_the_variables_property]) - train_fn() - model_save_path = root.save(file_prefix=checkpoint_prefix) - self.evaluate(optimizer.beta_1.assign(42.)) - optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix) - del train_fn - - # Restore into a graph with the optimizer - with test_utils.device(should_use_gpu=True): - model = MyModel() - optimizer = adam.Adam(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model) - status = root.restore(save_path=model_save_path) - input_value = tf.constant([[3.]]) - def train_fn1(): - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - if not tf.executing_eagerly(): - train_fn1 = functools.partial(self.evaluate, train_fn1()) - status.initialize_or_restore() - train_fn1() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - del train_fn1 - - # Make sure initialization doesn't clobber later restores - with test_utils.device(should_use_gpu=True): - model = MyModel() - optimizer = adam.Adam(0.001, beta_1=1.0) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model) - opt_root = tf.train.Checkpoint( - optimizer=optimizer) - status = root.restore(save_path=model_save_path) - init_only_optimizer_status = opt_root.restore(save_path=None) - optimizer_status = opt_root.restore(save_path=optimizer_save_path) - input_value = tf.constant([[3.]]) - def train_fn2(): - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - return optimizer.apply_gradients(zip(gradients, variables)) - if not tf.executing_eagerly(): - train_fn2 = functools.partial(self.evaluate, train_fn2()) - optimizer_status.run_restore_ops() - status.initialize_or_restore() - init_only_optimizer_status.initialize_or_restore() - train_fn2() - self.assertEqual(42., self.evaluate(optimizer.beta_1)) + checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) + for _ in range(2): + checkpoint.save(checkpoint_prefix) + with tf.GradientTape() as tape: + loss = (tf.constant(1.0) - model(tf.constant(1.0))) ** 2 + grad = tape.gradient(loss, model.vars) + optimizer.apply_gradients( + [(g, v) for g, v in zip(grad, model.vars)] + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testDeferredSlotRestoration(self): + with self.test_session(): + checkpoint_directory = self.get_temp_dir() + + root = 
tf.train.Checkpoint() + root.var = trackable_utils.add_variable( + root, name="var", initializer=0.0 + ) + optimizer = adam.Adam(0.1) + variables = [root.var] + gradients = [1.0] + train_op = optimizer.apply_gradients(zip(gradients, variables)) + # Note that `optimizer` has not been added as a dependency of + # `root`. Create a one-off grouping so that slot variables for + # `root.var` get initialized too. + self.evaluate( + trackable_utils.gather_initializers( + tf.train.Checkpoint(root=root, optimizer=optimizer) + ) + ) + self.evaluate(train_op) + self.evaluate(tf.compat.v1.assign(root.var, 12.0)) + no_slots_path = root.save( + os.path.join(checkpoint_directory, "no_slots") + ) + root.optimizer = optimizer + self.evaluate(tf.compat.v1.assign(root.var, 13.0)) + self.evaluate( + tf.compat.v1.assign( + optimizer.get_slot(slot_name="m", var=root.var), 14.0 + ) + ) + slots_path = root.save( + os.path.join(checkpoint_directory, "with_slots") + ) + new_root = tf.train.Checkpoint() + # Load the slot-containing checkpoint (deferred), then immediately + # overwrite the non-slot variable (also deferred). + slot_status = new_root.restore(slots_path) + no_slot_status = new_root.restore(no_slots_path) + with self.assertRaises(AssertionError): + no_slot_status.assert_consumed() + new_root.var = trackable_utils.add_variable( + new_root, name="var", shape=[] + ) + no_slot_status.assert_consumed() + no_slot_status.run_restore_ops() + self.assertEqual(12.0, self.evaluate(new_root.var)) + new_root.optimizer = adam.Adam(0.1) + slot_status.assert_existing_objects_matched() + if not tf.executing_eagerly(): + with self.assertRaisesRegex( + AssertionError, "Unresolved object" + ): + slot_status.assert_consumed() + self.assertEqual(12.0, self.evaluate(new_root.var)) + if tf.executing_eagerly(): + # Slot variables are only created with restoring initializers + # when executing eagerly. + self.assertEqual( + 14.0, + self.evaluate( + new_root.optimizer.get_slot( + slot_name="m", var=new_root.var + ) + ), + ) + else: + # Slot variables are not created eagerly when graph building. + with self.assertRaises(KeyError): + new_root.optimizer.get_slot(slot_name="m", var=new_root.var) + variables = [new_root.var] + gradients = [1.0] + train_op = new_root.optimizer.apply_gradients( + zip(gradients, variables) + ) + # The slot variable now exists; restore() didn't create it, but we + # should now have a restore op for it. + slot_status.run_restore_ops() + if not tf.executing_eagerly(): + # The train op hasn't run when graph building, so the slot + # variable has its restored value. It has run in eager, so the + # value will be different. 
+ self.assertEqual( + 14.0, + self.evaluate( + new_root.optimizer.get_slot( + slot_name="m", var=new_root.var + ) + ), + ) + self.evaluate(train_op) + slot_status.assert_consumed() + + def testManySavesGraph(self): + """Saves after the first should not modify the graph.""" + with context.graph_mode(): + graph = tf.Graph() + with graph.as_default(), self.session(graph): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + obj = tf.train.Checkpoint() + obj.var = tf.Variable(0.0, name="v") + obj.opt = adam.Adam(0.1) + variables = [obj.var] + gradients = [1.0] + obj.opt.apply_gradients(zip(gradients, variables)) + self.evaluate(trackable_utils.gather_initializers(obj)) + obj.save(checkpoint_prefix) + graph.finalize() + obj.save(checkpoint_prefix) + + def testManyRestoresGraph(self): + """Restores after the first should not modify the graph.""" + with context.graph_mode(): + graph = tf.Graph() + with graph.as_default(), self.session(graph): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + obj = tf.train.Checkpoint() + obj.var = tf.Variable(0.0, name="v") + obj.opt = adam.Adam(0.1) + variables = [obj.var] + gradients = [1.0] + obj.opt.apply_gradients(zip(gradients, variables)) + self.evaluate(trackable_utils.gather_initializers(obj)) + save_path = obj.save(checkpoint_prefix) + obj.restore(save_path) + graph.finalize() + obj.restore(save_path) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_sequential(self): + with self.test_session(): + model = sequential.Sequential() + checkpoint = tf.train.Checkpoint(model=model) + model.add(core.Dense(4)) + second_dense = core.Dense(5) + model.add(second_dense) + model(tf.constant([[1.0]])) + checkpoint.restore(None).initialize_or_restore() + self.evaluate( + second_dense.bias.assign(tf.constant([1.0, 2.0, 3.0, 4.0, 5.0])) + ) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = checkpoint.save(checkpoint_prefix) + self.evaluate( + second_dense.bias.assign(tf.constant([5.0, 6.0, 7.0, 8.0, 9.0])) + ) + checkpoint.restore(save_path).assert_consumed().run_restore_ops() + self.assertAllEqual( + [1.0, 2.0, 3.0, 4.0, 5.0], self.evaluate(second_dense.bias) + ) + + deferred_sequential = sequential.Sequential() + deferred_sequential_checkpoint = tf.train.Checkpoint( + model=deferred_sequential + ) + status = deferred_sequential_checkpoint.restore(save_path) + deferred_sequential.add(core.Dense(4)) + deferred_second_dense = core.Dense(5) + deferred_sequential.add(deferred_second_dense) + deferred_sequential(tf.constant([[1.0]])) + status.run_restore_ops() + self.assertAllEqual( + [1.0, 2.0, 3.0, 4.0, 5.0], + self.evaluate(deferred_second_dense.bias), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_initialize_if_not_restoring(self): + with self.test_session(): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") + with test_utils.device(should_use_gpu=True): + model = MyModel() + optimizer = adam.Adam(0.001) + root = tf.train.Checkpoint( + model=model + ) # Do not save the optimizer with the checkpoint. 
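+                # A second Checkpoint tracks only the optimizer, so model
+                # and optimizer state can be saved to (and restored from)
+                # separate files below.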
+ optimizer_checkpoint = tf.train.Checkpoint(optimizer=optimizer) + + checkpoint_path = tf.train.latest_checkpoint( + checkpoint_directory + ) + status = root.restore(save_path=checkpoint_path) + input_value = tf.constant([[3.0]]) + + def train_fn(): + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + # TODO(tanzheny): Add hyper variables to .variables(), and set + # them with set_weights etc. + variables_not_in_the_variables_property = [ + obj + for obj in optimizer._hyper.values() + if isinstance(obj, tf.Variable) + ] + self.evaluate( + [ + v.initializer + for v in optimizer.variables() + + variables_not_in_the_variables_property + ] + ) + train_fn() + model_save_path = root.save(file_prefix=checkpoint_prefix) + self.evaluate(optimizer.beta_1.assign(42.0)) + optimizer_save_path = optimizer_checkpoint.save( + optimizer_only_prefix + ) + del train_fn + + # Restore into a graph with the optimizer + with test_utils.device(should_use_gpu=True): + model = MyModel() + optimizer = adam.Adam(0.001) + root = tf.train.Checkpoint(optimizer=optimizer, model=model) + status = root.restore(save_path=model_save_path) + input_value = tf.constant([[3.0]]) + + def train_fn1(): + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + + if not tf.executing_eagerly(): + train_fn1 = functools.partial(self.evaluate, train_fn1()) + status.initialize_or_restore() + train_fn1() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + del train_fn1 + + # Make sure initialization doesn't clobber later restores + with test_utils.device(should_use_gpu=True): + model = MyModel() + optimizer = adam.Adam(0.001, beta_1=1.0) + root = tf.train.Checkpoint(optimizer=optimizer, model=model) + opt_root = tf.train.Checkpoint(optimizer=optimizer) + status = root.restore(save_path=model_save_path) + init_only_optimizer_status = opt_root.restore(save_path=None) + optimizer_status = opt_root.restore( + save_path=optimizer_save_path + ) + input_value = tf.constant([[3.0]]) + + def train_fn2(): + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + return optimizer.apply_gradients(zip(gradients, variables)) + + if not tf.executing_eagerly(): + train_fn2 = functools.partial(self.evaluate, train_fn2()) + optimizer_status.run_restore_ops() + status.initialize_or_restore() + init_only_optimizer_status.initialize_or_restore() + train_fn2() + self.assertEqual(42.0, self.evaluate(optimizer.beta_1)) class _ManualScope(tf.Module): + def __call__(self): + with tf.compat.v1.variable_scope("ManualScope") as vs: + self.variable_scope = vs + with trackable_utils.capture_dependencies(template=self): + return self._build() - def __call__(self): - with tf.compat.v1.variable_scope("ManualScope") as vs: - self.variable_scope = vs - with trackable_utils.capture_dependencies(template=self): - return self._build() - - def _build(self): - return tf.compat.v1.get_variable(name="in_manual_scope", shape=[]) + def _build(self): + return 
tf.compat.v1.get_variable(name="in_manual_scope", shape=[]) @test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class TemplateTests(test_combinations.TestCase): - - def test_trackable_save_restore(self): - with self.test_session(): - def _templated(): - v = tf.compat.v1.get_variable( - "v", shape=[1], initializer=tf.compat.v1.zeros_initializer(), - use_resource=True) - v2 = tf.compat.v1.get_variable( - "v2", shape=[1], initializer=tf.compat.v1.zeros_initializer(), - use_resource=True) - manual = _ManualScope() - return v, v + 1., v2, manual, manual() - - save_template = tf.compat.v1.make_template("s1", _templated) - v1_save, _, v2_save, manual_scope, manual_scope_v = save_template() - self.assertEqual( - set([id(v1_save), id(v2_save), id(manual_scope), - id(manual_scope_v), id(save_template)]), - set(map(id, trackable_utils.list_objects(save_template)))) - self.assertDictEqual({"in_manual_scope": manual_scope_v}, - manual_scope._trackable_children()) - optimizer = adam.Adam(0.0) - save_root = tf.train.Checkpoint( - my_template=save_template, optimizer=optimizer) - optimizer.minimize(v1_save.read_value, - var_list=[v1_save]) - self.evaluate([v.initializer for v in save_template.variables]) - optimizer_variables = optimizer.variables() + list( - optimizer._hyper.values()) - self.evaluate([v.initializer for v in optimizer_variables]) - self.evaluate(v1_save.assign([12.])) - self.evaluate(v2_save.assign([14.])) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - save_path = save_root.save(checkpoint_prefix) - - load_template = tf.compat.v1.make_template("s2", _templated) - load_optimizer = adam.Adam(0.0) - load_root = tf.train.Checkpoint( - my_template=load_template, optimizer=load_optimizer) - status = load_root.restore(save_path) - var, var_plus_one, var2, _, _ = load_template() - load_optimizer.minimize(var.read_value, var_list=[var]) - - children = load_template._trackable_children() - self.assertEqual({"v", "v2", "ManualScope"}, children.keys()) - status.assert_consumed().run_restore_ops() - self.assertAllEqual([12.], self.evaluate(var)) - self.assertAllEqual([13.], self.evaluate(var_plus_one)) - self.assertAllEqual([14.], self.evaluate(var2)) + def test_trackable_save_restore(self): + with self.test_session(): + + def _templated(): + v = tf.compat.v1.get_variable( + "v", + shape=[1], + initializer=tf.compat.v1.zeros_initializer(), + use_resource=True, + ) + v2 = tf.compat.v1.get_variable( + "v2", + shape=[1], + initializer=tf.compat.v1.zeros_initializer(), + use_resource=True, + ) + manual = _ManualScope() + return v, v + 1.0, v2, manual, manual() + + save_template = tf.compat.v1.make_template("s1", _templated) + v1_save, _, v2_save, manual_scope, manual_scope_v = save_template() + self.assertEqual( + set( + [ + id(v1_save), + id(v2_save), + id(manual_scope), + id(manual_scope_v), + id(save_template), + ] + ), + set(map(id, trackable_utils.list_objects(save_template))), + ) + self.assertDictEqual( + {"in_manual_scope": manual_scope_v}, + manual_scope._trackable_children(), + ) + optimizer = adam.Adam(0.0) + save_root = tf.train.Checkpoint( + my_template=save_template, optimizer=optimizer + ) + optimizer.minimize(v1_save.read_value, var_list=[v1_save]) + self.evaluate([v.initializer for v in save_template.variables]) + optimizer_variables = optimizer.variables() + list( + optimizer._hyper.values() + ) + self.evaluate([v.initializer for v in optimizer_variables]) + self.evaluate(v1_save.assign([12.0])) 
+ self.evaluate(v2_save.assign([14.0])) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = save_root.save(checkpoint_prefix) + + load_template = tf.compat.v1.make_template("s2", _templated) + load_optimizer = adam.Adam(0.0) + load_root = tf.train.Checkpoint( + my_template=load_template, optimizer=load_optimizer + ) + status = load_root.restore(save_path) + var, var_plus_one, var2, _, _ = load_template() + load_optimizer.minimize(var.read_value, var_list=[var]) + + children = load_template._trackable_children() + self.assertEqual({"v", "v2", "ManualScope"}, children.keys()) + status.assert_consumed().run_restore_ops() + self.assertAllEqual([12.0], self.evaluate(var)) + self.assertAllEqual([13.0], self.evaluate(var_plus_one)) + self.assertAllEqual([14.0], self.evaluate(var2)) class CheckpointCompatibilityTests(test_combinations.TestCase): - - def _initialized_model(self): - input_value = tf.constant([[3.]]) - model = MyModel() - optimizer = adam.Adam(0.001) - root_trackable = tf.train.Checkpoint( - optimizer=optimizer, model=model) - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - train_op = optimizer.apply_gradients(zip(gradients, variables)) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - # A regular variable, a slot variable, and a non-slot Optimizer variable - # with known values to check when loading. - self.evaluate(model._named_dense.bias.assign([1.])) - self.evaluate(optimizer.get_slot( - var=model._named_dense.bias, slot_name="m").assign([2.])) - self.evaluate(optimizer.beta_1.assign(3.)) - return root_trackable - - def _set_sentinels(self, root_trackable): - self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) - self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, slot_name="m") - .assign([102.])) - self.evaluate(root_trackable.optimizer.beta_1.assign(103.)) - - def _check_sentinels(self, root_trackable): - self.assertAllEqual( - [1.], self.evaluate(root_trackable.model._named_dense.bias)) - self.assertAllEqual([2.], self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, slot_name="m"))) - self.assertAllEqual(3., - self.evaluate(root_trackable.optimizer.beta_1)) - - def _write_name_based_checkpoint(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = tf.Graph() - with save_graph.as_default(), self.session( - graph=save_graph) as session: - root = self._initialized_model() - name_saver = tf.compat.v1.train.Saver() - return name_saver.save( - sess=session, - save_path=checkpoint_prefix, - global_step=root.optimizer.iterations) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testLoadFromNameBasedSaver(self): - """Save a name-based checkpoint, load it using the object-based API.""" - with test_utils.device(should_use_gpu=True): - with self.test_session(): - save_path = self._write_name_based_checkpoint() - root = self._initialized_model() - self._set_sentinels(root) - with self.assertRaises(AssertionError): - self._check_sentinels(root) - object_saver = tf.train.Checkpoint(root=root) - self._set_sentinels(root) - status = object_saver.read(save_path) - if tf.executing_eagerly(): - self._check_sentinels(root) - if 
tf.executing_eagerly(): - status.assert_consumed() - status.assert_existing_objects_matched() - status.assert_nontrivial_match() - else: - # When graph building, we haven't read any keys, so we don't know - # whether the restore will be complete. - with self.assertRaisesRegex(AssertionError, "not restored"): - status.assert_consumed() - with self.assertRaisesRegex(AssertionError, "not restored"): + def _initialized_model(self): + input_value = tf.constant([[3.0]]) + model = MyModel() + optimizer = adam.Adam(0.001) + root_trackable = tf.train.Checkpoint(optimizer=optimizer, model=model) + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + train_op = optimizer.apply_gradients(zip(gradients, variables)) + self.evaluate(trackable_utils.gather_initializers(root_trackable)) + self.evaluate(train_op) + # A regular variable, a slot variable, and a non-slot Optimizer variable + # with known values to check when loading. + self.evaluate(model._named_dense.bias.assign([1.0])) + self.evaluate( + optimizer.get_slot( + var=model._named_dense.bias, slot_name="m" + ).assign([2.0]) + ) + self.evaluate(optimizer.beta_1.assign(3.0)) + return root_trackable + + def _set_sentinels(self, root_trackable): + self.evaluate(root_trackable.model._named_dense.bias.assign([101.0])) + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, slot_name="m" + ).assign([102.0]) + ) + self.evaluate(root_trackable.optimizer.beta_1.assign(103.0)) + + def _check_sentinels(self, root_trackable): + self.assertAllEqual( + [1.0], self.evaluate(root_trackable.model._named_dense.bias) + ) + self.assertAllEqual( + [2.0], + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, slot_name="m" + ) + ), + ) + self.assertAllEqual(3.0, self.evaluate(root_trackable.optimizer.beta_1)) + + def _write_name_based_checkpoint(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = tf.Graph() + with save_graph.as_default(), self.session( + graph=save_graph + ) as session: + root = self._initialized_model() + name_saver = tf.compat.v1.train.Saver() + return name_saver.save( + sess=session, + save_path=checkpoint_prefix, + global_step=root.optimizer.iterations, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testLoadFromNameBasedSaver(self): + """Save a name-based checkpoint, load it using the object-based API.""" + with test_utils.device(should_use_gpu=True): + with self.test_session(): + save_path = self._write_name_based_checkpoint() + root = self._initialized_model() + self._set_sentinels(root) + with self.assertRaises(AssertionError): + self._check_sentinels(root) + object_saver = tf.train.Checkpoint(root=root) + self._set_sentinels(root) + status = object_saver.read(save_path) + if tf.executing_eagerly(): + self._check_sentinels(root) + if tf.executing_eagerly(): + status.assert_consumed() + status.assert_existing_objects_matched() + status.assert_nontrivial_match() + else: + # When graph building, we haven't read any keys, so we don't + # know whether the restore will be complete. 
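+                    # Every status assertion below raises until the restore
+                    # ops have actually been run.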
+ with self.assertRaisesRegex(AssertionError, "not restored"): + status.assert_consumed() + with self.assertRaisesRegex(AssertionError, "not restored"): + status.assert_existing_objects_matched() + with self.assertRaisesRegex(AssertionError, "not restored"): + status.assert_nontrivial_match() + status.run_restore_ops() + self._check_sentinels(root) + self._set_sentinels(root) + status = object_saver.read(save_path) + status.initialize_or_restore() + status.assert_nontrivial_match() + self._check_sentinels(root) + # Check that there is no error when keys are missing from the + # name-based checkpoint. + root.not_in_name_checkpoint = tf.Variable([1.0]) + status = object_saver.read(save_path) + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + + def testSaveGraphLoadEager(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = tf.Graph() + with save_graph.as_default(), self.session(graph=save_graph): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with tf.__internal__.eager_context.eager_mode(): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed() + self._check_sentinels(root) + + def testSaveEagerLoadGraph(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with tf.__internal__.eager_context.eager_mode(): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with context.graph_mode(): + save_graph = tf.Graph() + with save_graph.as_default(), self.session(graph=save_graph): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed().run_restore_ops() + self._check_sentinels(root) + + def testIgnoreSaveCounter(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with self.cached_session() as session: + # Create and save a model using Saver() before using a Checkpoint. + # This generates a snapshot without the Checkpoint's `save_counter`. + model = sequential.Sequential() + model.add(reshaping.Flatten(input_shape=(1,))) + model.add(core.Dense(1)) + name_saver = tf.compat.v1.train.Saver(model.trainable_variables) + save_path = name_saver.save( + sess=session, save_path=checkpoint_prefix, global_step=1 + ) + # Checkpoint.restore must successfully load that checkpoint. + ckpt = tf.train.Checkpoint(model=model) + status = ckpt.restore(save_path) status.assert_existing_objects_matched() - with self.assertRaisesRegex(AssertionError, "not restored"): - status.assert_nontrivial_match() - status.run_restore_ops() - self._check_sentinels(root) - self._set_sentinels(root) - status = object_saver.read(save_path) - status.initialize_or_restore() - status.assert_nontrivial_match() - self._check_sentinels(root) - # Check that there is no error when keys are missing from the name-based - # checkpoint. 
- root.not_in_name_checkpoint = tf.Variable([1.]) - status = object_saver.read(save_path) - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - - def testSaveGraphLoadEager(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = tf.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with tf.__internal__.eager_context.eager_mode(): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed() - self._check_sentinels(root) - - def testSaveEagerLoadGraph(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with tf.__internal__.eager_context.eager_mode(): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with context.graph_mode(): - save_graph = tf.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed().run_restore_ops() - self._check_sentinels(root) - - def testIgnoreSaveCounter(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with self.cached_session() as session: - # Create and save a model using Saver() before using a Checkpoint. This - # generates a snapshot without the Checkpoint's `save_counter`. - model = sequential.Sequential() - model.add(reshaping.Flatten(input_shape=(1,))) - model.add(core.Dense(1)) - name_saver = tf.compat.v1.train.Saver(model.trainable_variables) - save_path = name_saver.save( - sess=session, save_path=checkpoint_prefix, global_step=1) - # Checkpoint.restore must successfully load that checkpoint. - ckpt = tf.train.Checkpoint(model=model) - status = ckpt.restore(save_path) - status.assert_existing_objects_matched() - # It should, however, refuse to load a checkpoint where an unrelated - # `save_counter` variable is missing. - model.layers[1].var = tf.Variable(0., name="save_counter") - status = ckpt.restore(save_path) - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() + # It should, however, refuse to load a checkpoint where an unrelated + # `save_counter` variable is missing. 
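The matching rule described in the comment above applies to any tracked object with no counterpart in the file, not just `save_counter`. A minimal sketch of the failure mode (names and paths hypothetical):

```python
import tensorflow.compat.v2 as tf

ckpt = tf.train.Checkpoint(v=tf.Variable(1.0))
path = ckpt.save("/tmp/match_demo")   # hypothetical prefix
ckpt.extra = tf.Variable(2.0)         # tracked now, but absent from the file
status = ckpt.restore(path)
try:
    status.assert_existing_objects_matched()
except AssertionError:
    # `extra` exists as a Python object but matched nothing in the checkpoint.
    pass
```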
+ model.layers[1].var = tf.Variable(0.0, name="save_counter") + status = ckpt.restore(save_path) + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() if __name__ == "__main__": - tf.compat.v1.enable_eager_execution() - tf.test.main() + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/tests/tracking_util_with_v1_optimizers_test.py b/keras/tests/tracking_util_with_v1_optimizers_test.py index c750ce177fd9..bf1d85ed7bba 100644 --- a/keras/tests/tracking_util_with_v1_optimizers_test.py +++ b/keras/tests/tracking_util_with_v1_optimizers_test.py @@ -14,673 +14,799 @@ # ============================================================================== """Tests for object-based saving which use tf.train.* optimizers.""" -import tensorflow.compat.v2 as tf - import functools import os -from tensorflow.python.eager import context -from tensorflow.python.framework import test_util as tf_test_utils -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils + +import tensorflow.compat.v2 as tf + from keras.engine import training from keras.layers import core -from tensorflow.python.training.tracking import util as trackable_utils +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +# isort: off +from tensorflow.python.checkpoint import ( + checkpoint as trackable_utils, +) +from tensorflow.python.eager import context +from tensorflow.python.framework import ( + test_util as tf_test_utils, +) -class NonLayerTrackable(tf.Module): - def __init__(self): - super().__init__() - self.a_variable = trackable_utils.add_variable( - self, name="a_variable", shape=[]) +class NonLayerTrackable(tf.Module): + def __init__(self): + super().__init__() + self.a_variable = trackable_utils.add_variable( + self, name="a_variable", shape=[] + ) -# pylint: disable=not-callable class MyModel(training.Model): - """A concrete Model for testing.""" + """A concrete Model for testing.""" - def __init__(self): - super().__init__() - self._named_dense = core.Dense(1, use_bias=True) - self._second = core.Dense(1, use_bias=False) - # We can still track Trackables which aren't Layers. - self._non_layer = NonLayerTrackable() + def __init__(self): + super().__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + # We can still track Trackables which aren't Layers. + self._non_layer = NonLayerTrackable() - def call(self, values): - ret = self._second(self._named_dense(values)) - return ret + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret class CheckpointingTests(test_combinations.TestCase): - - @tf_test_utils.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) - def testNamingWithOptimizer(self): - input_value = tf.constant([[3.]]) - model = MyModel() - # A nuisance Model using the same optimizer. Its slot variables should not - # go in the checkpoint, since it is never depended on. 
- other_model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - optimizer_step = tf.compat.v1.train.get_or_create_global_step() - root_trackable = tf.train.Checkpoint( - optimizer=optimizer, model=model, optimizer_step=optimizer_step) - if tf.executing_eagerly(): - optimizer.minimize( - lambda: model(input_value), - global_step=optimizer_step) - optimizer.minimize( - lambda: other_model(input_value), - global_step=optimizer_step) - else: - train_op = optimizer.minimize( - model(input_value), global_step=optimizer_step) - optimizer.minimize( - other_model(input_value), - global_step=optimizer_step) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - named_variables, serialized_graph, _ = tf.__internal__.tracking.ObjectGraphView( - root_trackable).serialize_object_graph() - expected_checkpoint_names = ( - # Created in the root node, so no prefix. - "optimizer_step", - "model/_second/kernel", - "model/_named_dense/kernel", - "model/_named_dense/bias", - # non-Layer dependency of the model - "model/_non_layer/a_variable", - # The optimizer creates two non-slot variables - "optimizer/beta1_power", - "optimizer/beta2_power", - # Slot variables - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", - "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", - ) - suffix = "/.ATTRIBUTES/VARIABLE_VALUE" - expected_checkpoint_names = [ - name + suffix for name in expected_checkpoint_names] - named_variables = {v.name: v for v in named_variables} - self.assertEqual(len(expected_checkpoint_names), - len(named_variables.keys())) - # Check that we've mapped to the right variable objects (not exhaustive) - self.assertEqual( - "global_step", - named_variables["optimizer_step" + suffix].full_name) - self.assertEqual( - "my_model/dense_1/kernel", - named_variables["model/_second/kernel" + suffix].full_name) - self.assertEqual( - "my_model/dense/kernel", - named_variables["model/_named_dense/kernel" + suffix].full_name) - self.assertEqual( - "beta1_power", - named_variables["optimizer/beta1_power" + suffix].full_name) - self.assertEqual( - "beta2_power", - named_variables["optimizer/beta2_power" + suffix].full_name) - # Spot check the generated protocol buffers. - self.assertEqual("optimizer", - serialized_graph.nodes[0].children[1].local_name) - optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[ - 1].node_id] - self.assertEqual("beta1_power", - optimizer_node.children[0].local_name) - self.assertEqual("beta1_power", - serialized_graph.nodes[optimizer_node.children[0].node_id] - .attributes[0].full_name) - self.assertEqual( - "my_model/dense/kernel", - serialized_graph.nodes[optimizer_node.slot_variables[0] - .original_variable_node_id] - .attributes[0].full_name) - # We strip off the :0 suffix, as variable.name-based saving does. 
- self.assertEqual( - "my_model/dense/kernel/Adam", - serialized_graph.nodes[optimizer_node.slot_variables[0] - .slot_variable_node_id] - .attributes[0].full_name) - self.assertEqual( - "my_model/dense/kernel/Adam:0", - optimizer.get_slot( - var=model._named_dense.kernel, - name="m").name) - self.assertEqual( - "model/_named_dense/kernel" + suffix, - serialized_graph.nodes[ - optimizer_node.slot_variables[0] - .original_variable_node_id].attributes[0].checkpoint_key) - self.assertEqual("m", optimizer_node.slot_variables[0].slot_name) - self.assertEqual( - "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix, - serialized_graph.nodes[ - optimizer_node.slot_variables[0] - .slot_variable_node_id].attributes[0].checkpoint_key) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testSaveRestore(self): - with self.test_session(): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root_trackable = tf.train.Checkpoint( - optimizer=optimizer, model=model) - input_value = tf.constant([[3.]]) - if tf.executing_eagerly(): - optimizer.minimize( - lambda: model(input_value)) - else: - train_op = optimizer.minimize(model(input_value)) - # TODO(allenl): Make initialization more pleasant when graph building. - root_trackable.save_counter # pylint: disable=pointless-statement - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - prefix = os.path.join(self.get_temp_dir(), "ckpt") - self.evaluate(tf.compat.v1.assign(model._named_dense.variables[1], [42.])) - m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m") - self.evaluate(tf.compat.v1.assign(m_bias_slot, [1.5])) - save_path = root_trackable.save(file_prefix=prefix) - self.evaluate(tf.compat.v1.assign(model._named_dense.variables[1], [43.])) - self.evaluate(tf.compat.v1.assign(root_trackable.save_counter, 3)) - optimizer_variables = self.evaluate(optimizer.variables()) - self.evaluate(tf.compat.v1.assign(m_bias_slot, [-2.])) - # Immediate restoration - status = root_trackable.restore(save_path=save_path).assert_consumed() - status.run_restore_ops() - self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1])) - self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) - self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) - if not tf.executing_eagerly(): - return # Restore-on-create is only supported when executing eagerly - on_create_model = MyModel() - on_create_optimizer = tf.compat.v1.train.AdamOptimizer( - 0.001, - # Preserve beta1_power and beta2_power when applying gradients - # so we can test that they've been restored correctly. - beta1=1.0, - beta2=1.0) - on_create_root = tf.train.Checkpoint( - optimizer=on_create_optimizer, model=on_create_model) - # Deferred restoration - status = on_create_root.restore(save_path=save_path) - status.assert_nontrivial_match() - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - on_create_model(tf.constant([[3.]])) # create variables - self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) - self.assertAllEqual([42.], - self.evaluate( - on_create_model._named_dense.variables[1])) - on_create_m_bias_slot = on_create_optimizer.get_slot( - on_create_model._named_dense.variables[1], "m") - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - # Optimizer slot variables are created when the original variable is - # restored. 
- self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) - self.assertAllEqual(optimizer_variables[2:], - self.evaluate(on_create_optimizer.variables())) - dummy_var = tf.Variable([1.]) - on_create_optimizer.minimize(loss=dummy_var.read_value) - status.assert_existing_objects_matched() - status.assert_consumed() - beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators() - self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power)) - self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power)) - - # TODO(allenl): Debug garbage created by this test in python3. - def testDeferredRestorationUsageEager(self): - """An idiomatic eager execution example.""" - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model, - optimizer_step=tf.compat.v1.train.get_or_create_global_step()) - root.restore(tf.train.latest_checkpoint( - checkpoint_directory)) - for _ in range(num_training_steps): - # TODO(allenl): Use a Dataset and serialize/checkpoint it. - input_value = tf.constant([[3.]]) - optimizer.minimize( - lambda: model(input_value), # pylint: disable=cell-var-from-loop - global_step=root.optimizer_step) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer_step.numpy()) - - def testEagerDistributionStrategy(self): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - def _train_fn(optimizer, model, root): - input_value = tf.constant([[3.]]) - optimizer.minimize( - functools.partial(model, input_value), - global_step=root.optimizer_step) - - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - for training_continuation in range(3): + @tf_test_utils.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + def testNamingWithOptimizer(self): + input_value = tf.constant([[3.0]]) model = MyModel() + # A nuisance Model using the same optimizer. Its slot variables should + # not go in the checkpoint, since it is never depended on. 
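The "nuisance model" comment above (the hunk continues below with its setup) rests on saving being dependency-driven: only objects reachable from the `Checkpoint` root are written, so a model that merely shares the optimizer never reaches the file. A sketch, assuming a hypothetical path:

```python
import tensorflow.compat.v2 as tf

tracked = tf.Variable(1.0)
orphan = tf.Variable(2.0)           # never attached to the checkpoint root
ckpt = tf.train.Checkpoint(v=tracked)
path = ckpt.save("/tmp/dep_demo")   # hypothetical prefix
# Only keys reachable from the root are written (plus bookkeeping entries):
print([name for name, _ in tf.train.list_variables(path)])
```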
+ other_model = MyModel() optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, - model=model, - optimizer_step=tf.compat.v1.train.get_or_create_global_step()) - root.restore( - tf.train.latest_checkpoint(checkpoint_directory)) - - for _ in range(num_training_steps): - strategy.extended.call_for_each_replica( - functools.partial(_train_fn, optimizer, model, root)) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer_step.numpy()) - - def testGraphDistributionStrategy(self): - self.skipTest("b/121381184") - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - - def _train_fn(optimizer, model, root): - input_value = tf.constant([[3.]]) - return optimizer.minimize( - functools.partial(model, input_value), - global_step=root.optimizer_step) - - for training_continuation in range(3): - with tf.Graph().as_default(): - strategy = tf.distribute.MirroredStrategy() - with strategy.scope(): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model, - optimizer_step=tf.compat.v1.train.get_or_create_global_step()) - status = root.restore(tf.train.latest_checkpoint( - checkpoint_directory)) - train_op = strategy.extended.call_for_each_replica( - functools.partial(_train_fn, optimizer, model, root)) - with self.session() as session: - if training_continuation > 0: - status.assert_consumed() - status.initialize_or_restore() - for _ in range(num_training_steps): - session.run(train_op) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer_step.numpy()) - - def testUsageGraph(self): - """Expected usage when graph building.""" - with context.graph_mode(): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with tf.Graph().as_default(): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root = tf.compat.v1.train.Checkpoint( - optimizer=optimizer, model=model, - global_step=tf.compat.v1.train.get_or_create_global_step()) - input_value = tf.constant([[3.]]) - train_op = optimizer.minimize( - model(input_value), - global_step=root.global_step) - checkpoint_path = tf.train.latest_checkpoint( - checkpoint_directory) - with self.session(graph=tf.compat.v1.get_default_graph()) as session: - status = root.restore(save_path=checkpoint_path) - status.initialize_or_restore(session=session) - if checkpoint_path is None: - self.assertEqual(0, training_continuation) - with self.assertRaises(AssertionError): - status.assert_consumed() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() + optimizer_step = tf.compat.v1.train.get_or_create_global_step() + root_trackable = tf.train.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step + ) + if tf.executing_eagerly(): + optimizer.minimize( + lambda: model(input_value), global_step=optimizer_step + ) + optimizer.minimize( + lambda: other_model(input_value), global_step=optimizer_step + ) + else: + train_op = optimizer.minimize( + model(input_value), global_step=optimizer_step + ) + optimizer.minimize( + other_model(input_value), global_step=optimizer_step + ) + 
self.evaluate(trackable_utils.gather_initializers(root_trackable)) + self.evaluate(train_op) + ( + named_variables, + serialized_graph, + _, + ) = tf.__internal__.tracking.ObjectGraphView( + root_trackable + ).serialize_object_graph() + expected_checkpoint_names = ( + # Created in the root node, so no prefix. + "optimizer_step", + "model/_second/kernel", + "model/_named_dense/kernel", + "model/_named_dense/bias", + # non-Layer dependency of the model + "model/_non_layer/a_variable", + # The optimizer creates two non-slot variables + "optimizer/beta1_power", + "optimizer/beta2_power", + # Slot variables + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m", + "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v", + ) + suffix = "/.ATTRIBUTES/VARIABLE_VALUE" + expected_checkpoint_names = [ + name + suffix for name in expected_checkpoint_names + ] + named_variables = {v.name: v for v in named_variables} + self.assertEqual( + len(expected_checkpoint_names), len(named_variables.keys()) + ) + # Check that we've created the right full_names of objects (not + # exhaustive) + expected_names = { + "optimizer_step" + suffix: "global_step", + "model/_second/kernel" + suffix: "my_model/dense_1/kernel", + "model/_named_dense/kernel" + suffix: "my_model/dense/kernel", + "optimizer/beta1_power" + suffix: "beta1_power", + "optimizer/beta2_power" + suffix: "beta2_power", + } + for nodes in serialized_graph.nodes: + for attribute in nodes.attributes: + expected_name = expected_names.pop( + attribute.checkpoint_key, None + ) + if expected_name is not None: + self.assertEqual(expected_name, attribute.full_name) + self.assertEmpty(expected_names) + + # Spot check the generated protocol buffers. + self.assertEqual( + "optimizer", serialized_graph.nodes[0].children[1].local_name + ) + optimizer_node = serialized_graph.nodes[ + serialized_graph.nodes[0].children[1].node_id + ] + self.assertEqual("beta1_power", optimizer_node.children[0].local_name) + self.assertEqual( + "beta1_power", + serialized_graph.nodes[optimizer_node.children[0].node_id] + .attributes[0] + .full_name, + ) + self.assertEqual( + "my_model/dense/kernel", + serialized_graph.nodes[ + optimizer_node.slot_variables[0].original_variable_node_id + ] + .attributes[0] + .full_name, + ) + + # We strip off the :0 suffix, as variable.name-based saving does. 
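The `:0` being stripped is the output-index suffix TensorFlow appends to `Variable.name`; the checkpoint's `full_name` records the op name without it. For instance:

```python
import tensorflow.compat.v2 as tf

v = tf.Variable(1.0, name="my_model/dense/kernel")
print(v.name)  # "my_model/dense/kernel:0" -- ":0" marks the op's first output
```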
+ self.assertEqual( + "my_model/dense/kernel/Adam", + serialized_graph.nodes[ + optimizer_node.slot_variables[0].slot_variable_node_id + ] + .attributes[0] + .full_name, + ) + self.assertEqual( + "my_model/dense/kernel/Adam:0", + optimizer.get_slot(var=model._named_dense.kernel, name="m").name, + ) + self.assertEqual( + "model/_named_dense/kernel" + suffix, + serialized_graph.nodes[ + optimizer_node.slot_variables[0].original_variable_node_id + ] + .attributes[0] + .checkpoint_key, + ) + self.assertEqual("m", optimizer_node.slot_variables[0].slot_name) + self.assertEqual( + "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix, + serialized_graph.nodes[ + optimizer_node.slot_variables[0].slot_variable_node_id + ] + .attributes[0] + .checkpoint_key, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testSaveRestore(self): + with self.test_session(): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root_trackable = tf.train.Checkpoint( + optimizer=optimizer, model=model + ) + input_value = tf.constant([[3.0]]) + if tf.executing_eagerly(): + optimizer.minimize(lambda: model(input_value)) else: - status.assert_consumed() - status.assert_existing_objects_matched() + train_op = optimizer.minimize(model(input_value)) + # TODO(allenl): Make initialization more pleasant when graph + # building. + root_trackable.save_counter + self.evaluate( + trackable_utils.gather_initializers(root_trackable) + ) + self.evaluate(train_op) + prefix = os.path.join(self.get_temp_dir(), "ckpt") + self.evaluate( + tf.compat.v1.assign(model._named_dense.variables[1], [42.0]) + ) + m_bias_slot = optimizer.get_slot( + model._named_dense.variables[1], "m" + ) + self.evaluate(tf.compat.v1.assign(m_bias_slot, [1.5])) + save_path = root_trackable.save(file_prefix=prefix) + self.evaluate( + tf.compat.v1.assign(model._named_dense.variables[1], [43.0]) + ) + self.evaluate(tf.compat.v1.assign(root_trackable.save_counter, 3)) + optimizer_variables = self.evaluate(optimizer.variables()) + self.evaluate(tf.compat.v1.assign(m_bias_slot, [-2.0])) + # Immediate restoration + status = root_trackable.restore( + save_path=save_path + ).assert_consumed() + status.run_restore_ops() + self.assertAllEqual( + [42.0], self.evaluate(model._named_dense.variables[1]) + ) + self.assertAllEqual(1, self.evaluate(root_trackable.save_counter)) + self.assertAllEqual([1.5], self.evaluate(m_bias_slot)) + if not tf.executing_eagerly(): + # Restore-on-create is only supported when executing eagerly + return + on_create_model = MyModel() + on_create_optimizer = tf.compat.v1.train.AdamOptimizer( + 0.001, + # Preserve beta1_power and beta2_power when applying gradients + # so we can test that they've been restored correctly. 
+ beta1=1.0, + beta2=1.0, + ) + on_create_root = tf.train.Checkpoint( + optimizer=on_create_optimizer, model=on_create_model + ) + # Deferred restoration + status = on_create_root.restore(save_path=save_path) + status.assert_nontrivial_match() + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + on_create_model(tf.constant([[3.0]])) # create variables + self.assertAllEqual(1, self.evaluate(on_create_root.save_counter)) + self.assertAllEqual( + [42.0], self.evaluate(on_create_model._named_dense.variables[1]) + ) + on_create_m_bias_slot = on_create_optimizer.get_slot( + on_create_model._named_dense.variables[1], "m" + ) + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + # Optimizer slot variables are created when the original variable is + # restored. + self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot)) + self.assertAllEqual( + optimizer_variables[2:], + self.evaluate(on_create_optimizer.variables()), + ) + dummy_var = tf.Variable([1.0]) + on_create_optimizer.minimize(loss=dummy_var.read_value) + status.assert_existing_objects_matched() + status.assert_consumed() + ( + beta1_power, + beta2_power, + ) = on_create_optimizer._get_beta_accumulators() + self.assertAllEqual( + optimizer_variables[0], self.evaluate(beta1_power) + ) + self.assertAllEqual( + optimizer_variables[1], self.evaluate(beta2_power) + ) + + # TODO(allenl): Debug garbage created by this test in python3. + def testDeferredRestorationUsageEager(self): + """An idiomatic eager execution example.""" + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root = tf.train.Checkpoint( + optimizer=optimizer, + model=model, + optimizer_step=tf.compat.v1.train.get_or_create_global_step(), + ) + root.restore(tf.train.latest_checkpoint(checkpoint_directory)) for _ in range(num_training_steps): - session.run(train_op) - root.save(file_prefix=checkpoint_prefix, session=session) - self.assertEqual((training_continuation + 1) * num_training_steps, - session.run(root.global_step)) - self.assertEqual(training_continuation + 1, - session.run(root.save_counter)) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testAgnosticUsage(self): - """Graph/eager agnostic usage.""" - # Does create garbage when executing eagerly due to ops.Graph() creation. 
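The TODO above is feasible with the same API: `tf.train.Checkpoint` can track a `tf.data` iterator, so resuming a run also resumes the input pipeline. A sketch under the assumption of a simple range dataset and a hypothetical path:

```python
import tensorflow.compat.v2 as tf

iterator = iter(tf.data.Dataset.range(5))
ckpt = tf.train.Checkpoint(iterator=iterator)
next(iterator)                      # consume element 0
path = ckpt.save("/tmp/iter_demo")  # the iterator's position is checkpointed
next(iterator)                      # the iterator has now produced 0 and 1
ckpt.restore(path)                  # rewinds to just after element 0
assert next(iterator).numpy() == 1
```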
- with self.test_session(): - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - for training_continuation in range(3): - with test_utils.device(should_use_gpu=True): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model, - global_step=tf.compat.v1.train.get_or_create_global_step()) - manager = tf.train.CheckpointManager( - root, checkpoint_directory, max_to_keep=1) - status = root.restore(save_path=manager.latest_checkpoint) - input_value = tf.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not tf.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - manager.save() - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(root.global_step)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - - # pylint: disable=cell-var-from-loop - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testWithDefun(self): - with self.test_session(): - num_training_steps = 2 - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - for training_continuation in range(3): - with test_utils.device(should_use_gpu=True): - model = MyModel() - # Don't actually train so we can test variable values - optimizer = tf.compat.v1.train.AdamOptimizer(0.) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model, - global_step=tf.compat.v1.train.get_or_create_global_step()) - checkpoint_path = tf.train.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - def train_fn(): - @tf.function - def _call_model(x): - return model(x) + # TODO(allenl): Use a Dataset and serialize/checkpoint it. 
+ input_value = tf.constant([[3.0]]) + optimizer.minimize( + lambda: model(input_value), + global_step=root.optimizer_step, + ) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual( + (training_continuation + 1) * num_training_steps, + root.optimizer_step.numpy(), + ) + + def testEagerDistributionStrategy(self): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + def _train_fn(optimizer, model, root): + input_value = tf.constant([[3.0]]) + optimizer.minimize( + functools.partial(model, input_value), + global_step=root.optimizer_step, + ) + + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + for training_continuation in range(3): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root = tf.train.Checkpoint( + optimizer=optimizer, + model=model, + optimizer_step=tf.compat.v1.train.get_or_create_global_step(), # noqa: E501 + ) + root.restore(tf.train.latest_checkpoint(checkpoint_directory)) + + for _ in range(num_training_steps): + strategy.extended.call_for_each_replica( + functools.partial(_train_fn, optimizer, model, root) + ) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual( + (training_continuation + 1) * num_training_steps, + root.optimizer_step.numpy(), + ) + + def testGraphDistributionStrategy(self): + self.skipTest("b/121381184") + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + def _train_fn(optimizer, model, root): + input_value = tf.constant([[3.0]]) + return optimizer.minimize( + functools.partial(model, input_value), + global_step=root.optimizer_step, + ) + + for training_continuation in range(3): + with tf.Graph().as_default(): + strategy = tf.distribute.MirroredStrategy() + with strategy.scope(): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root = tf.train.Checkpoint( + optimizer=optimizer, + model=model, + optimizer_step=tf.compat.v1.train.get_or_create_global_step(), # noqa: E501 + ) + status = root.restore( + tf.train.latest_checkpoint(checkpoint_directory) + ) + train_op = strategy.extended.call_for_each_replica( + functools.partial(_train_fn, optimizer, model, root) + ) + with self.session() as session: + if training_continuation > 0: + status.assert_consumed() + status.initialize_or_restore() + for _ in range(num_training_steps): + session.run(train_op) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual( + (training_continuation + 1) * num_training_steps, + root.optimizer_step.numpy(), + ) + + def testUsageGraph(self): + """Expected usage when graph building.""" + with context.graph_mode(): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with tf.Graph().as_default(): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root = tf.compat.v1.train.Checkpoint( + optimizer=optimizer, + model=model, + global_step=tf.compat.v1.train.get_or_create_global_step(), # noqa: E501 + ) + input_value = tf.constant([[3.0]]) + train_op = optimizer.minimize( + model(input_value), global_step=root.global_step + ) + checkpoint_path = tf.train.latest_checkpoint( + checkpoint_directory + ) + with self.session( + graph=tf.compat.v1.get_default_graph() + ) as session: + status = root.restore(save_path=checkpoint_path) + 
status.initialize_or_restore(session=session) + if checkpoint_path is None: + self.assertEqual(0, training_continuation) + with self.assertRaises(AssertionError): + status.assert_consumed() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + else: + status.assert_consumed() + status.assert_existing_objects_matched() + for _ in range(num_training_steps): + session.run(train_op) + root.save( + file_prefix=checkpoint_prefix, session=session + ) + self.assertEqual( + (training_continuation + 1) * num_training_steps, + session.run(root.global_step), + ) + self.assertEqual( + training_continuation + 1, + session.run(root.save_counter), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testAgnosticUsage(self): + """Graph/eager agnostic usage.""" + # Does create garbage when executing eagerly due to ops.Graph() + # creation. + with self.test_session(): + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + for training_continuation in range(3): + with test_utils.device(should_use_gpu=True): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root = tf.train.Checkpoint( + optimizer=optimizer, + model=model, + global_step=tf.compat.v1.train.get_or_create_global_step(), # noqa: E501 + ) + manager = tf.train.CheckpointManager( + root, checkpoint_directory, max_to_keep=1 + ) + status = root.restore(save_path=manager.latest_checkpoint) + input_value = tf.constant([[3.0]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step, + ) + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + manager.save() + self.assertEqual( + (training_continuation + 1) * num_training_steps, + self.evaluate(root.global_step), + ) + self.assertEqual( + training_continuation + 1, + self.evaluate(root.save_counter), + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testWithDefun(self): + with self.test_session(): + num_training_steps = 2 + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + for training_continuation in range(3): + with test_utils.device(should_use_gpu=True): + model = MyModel() + # Don't actually train so we can test variable values + optimizer = tf.compat.v1.train.AdamOptimizer(0.0) + root = tf.train.Checkpoint( + optimizer=optimizer, + model=model, + global_step=tf.compat.v1.train.get_or_create_global_step(), # noqa: E501 + ) + checkpoint_path = tf.train.latest_checkpoint( + checkpoint_directory + ) + status = root.restore(save_path=checkpoint_path) + + def train_fn(): + @tf.function + def _call_model(x): + return model(x) + + with tf.GradientTape() as tape: + loss = _call_model(tf.constant([[3.0]])) + gradients = tape.gradient(loss, model.variables) + return optimizer.apply_gradients( + zip(gradients, model.variables), + global_step=root.global_step, + ) + + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + for _ in range(num_training_steps): + train_fn() + if training_continuation > 0: + status.assert_consumed() + self.assertAllClose( + [[42.0]], self.evaluate(model.variables[0]) + ) + else: + self.evaluate(model.variables[0].assign([[42.0]])) + root.save(file_prefix=checkpoint_prefix) + self.assertEqual( + 
(training_continuation + 1) * num_training_steps, + self.evaluate(root.global_step), + ) + self.assertEqual( + training_continuation + 1, + self.evaluate(root.save_counter), + ) + + @test_combinations.generate(test_combinations.combine(mode=["eager"])) + def testAnonymousVarsInInit(self): + class Model(training.Model): + def __init__(self): + super().__init__() + self.w = tf.Variable(0.0) + self.b = tf.Variable(0.0) + self.vars = [self.w, self.b] + + def call(self, x): + return x * self.w + self.b + + model = Model() + optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.05) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) + for _ in range(2): + checkpoint.save(checkpoint_prefix) with tf.GradientTape() as tape: - loss = _call_model(tf.constant([[3.]])) - gradients = tape.gradient(loss, model.variables) - return optimizer.apply_gradients(zip(gradients, model.variables), - global_step=root.global_step) - if not tf.executing_eagerly(): - train_fn = functools.partial( - self.evaluate, train_fn()) - status.initialize_or_restore() - for _ in range(num_training_steps): - train_fn() - if training_continuation > 0: - status.assert_consumed() - self.assertAllClose([[42.]], self.evaluate(model.variables[0])) - else: - self.evaluate(model.variables[0].assign([[42.]])) - root.save(file_prefix=checkpoint_prefix) - self.assertEqual((training_continuation + 1) * num_training_steps, - self.evaluate(root.global_step)) - self.assertEqual(training_continuation + 1, - self.evaluate(root.save_counter)) - # pylint: enable=cell-var-from-loop - - @test_combinations.generate(test_combinations.combine(mode=["eager"])) - def testAnonymousVarsInInit(self): - - class Model(training.Model): - - def __init__(self): - super().__init__() - self.w = tf.Variable(0.0) - self.b = tf.Variable(0.0) - self.vars = [self.w, self.b] - - def call(self, x): - return x * self.w + self.b - - model = Model() - optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.05) - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - checkpoint = tf.train.Checkpoint( - model=model, optimizer=optimizer) - for _ in range(2): - checkpoint.save(checkpoint_prefix) - with tf.GradientTape() as tape: - loss = (tf.constant(1.) - - model(tf.constant(1.))) ** 2 - grad = tape.gradient(loss, model.vars) - optimizer.apply_gradients( - [(g, v) for g, v in zip(grad, model.vars)]) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def test_initialize_if_not_restoring(self): - with self.test_session(): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") - with test_utils.device(should_use_gpu=True): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root = tf.train.Checkpoint( - model=model, # Do not save the optimizer with the checkpoint. 
- global_step=tf.compat.v1.train.get_or_create_global_step()) - optimizer_checkpoint = tf.train.Checkpoint( - optimizer=optimizer) - - checkpoint_path = tf.train.latest_checkpoint( - checkpoint_directory) - status = root.restore(save_path=checkpoint_path) - input_value = tf.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not tf.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - self.evaluate([v.initializer for v in optimizer.variables()]) - train_fn() - model_save_path = root.save(file_prefix=checkpoint_prefix) - self.evaluate(optimizer.variables()[0].assign(42.)) - optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix) - - # Restore into a graph with the optimizer - with test_utils.device(should_use_gpu=True): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model, - global_step=tf.compat.v1.train.get_or_create_global_step()) - status = root.restore(save_path=model_save_path) - input_value = tf.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not tf.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - status.initialize_or_restore() - train_fn() - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - with self.assertRaises(AssertionError): - status.assert_consumed() - - # Make sure initialization doesn't clobber later restores - with test_utils.device(should_use_gpu=True): - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001, beta1=1.0) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model, - global_step=tf.compat.v1.train.get_or_create_global_step()) - opt_root = tf.train.Checkpoint( - optimizer=optimizer) - status = root.restore(save_path=model_save_path) - init_only_optimizer_status = opt_root.restore(save_path=None) - optimizer_status = opt_root.restore(save_path=optimizer_save_path) - input_value = tf.constant([[3.]]) - train_fn = functools.partial( - optimizer.minimize, - functools.partial(model, input_value), - global_step=root.global_step) - if not tf.executing_eagerly(): - train_fn = functools.partial(self.evaluate, train_fn()) - optimizer_status.run_restore_ops() - status.initialize_or_restore() - init_only_optimizer_status.initialize_or_restore() - train_fn() - self.assertEqual(42., self.evaluate(optimizer.variables()[0])) + loss = (tf.constant(1.0) - model(tf.constant(1.0))) ** 2 + grad = tape.gradient(loss, model.vars) + optimizer.apply_gradients( + [(g, v) for g, v in zip(grad, model.vars)] + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def test_initialize_if_not_restoring(self): + with self.test_session(): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + optimizer_only_prefix = os.path.join(checkpoint_directory, "opt") + with test_utils.device(should_use_gpu=True): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root = tf.train.Checkpoint( + # Do not save the optimizer with the checkpoint. 
+ model=model, + global_step=tf.compat.v1.train.get_or_create_global_step(), + ) + optimizer_checkpoint = tf.train.Checkpoint(optimizer=optimizer) + + checkpoint_path = tf.train.latest_checkpoint( + checkpoint_directory + ) + status = root.restore(save_path=checkpoint_path) + input_value = tf.constant([[3.0]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step, + ) + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + self.evaluate([v.initializer for v in optimizer.variables()]) + train_fn() + model_save_path = root.save(file_prefix=checkpoint_prefix) + self.evaluate(optimizer.variables()[0].assign(42.0)) + optimizer_save_path = optimizer_checkpoint.save( + optimizer_only_prefix + ) + + # Restore into a graph with the optimizer + with test_utils.device(should_use_gpu=True): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + root = tf.train.Checkpoint( + optimizer=optimizer, + model=model, + global_step=tf.compat.v1.train.get_or_create_global_step(), + ) + status = root.restore(save_path=model_save_path) + input_value = tf.constant([[3.0]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step, + ) + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + status.initialize_or_restore() + train_fn() + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + with self.assertRaises(AssertionError): + status.assert_consumed() + + # Make sure initialization doesn't clobber later restores + with test_utils.device(should_use_gpu=True): + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001, beta1=1.0) + root = tf.train.Checkpoint( + optimizer=optimizer, + model=model, + global_step=tf.compat.v1.train.get_or_create_global_step(), + ) + opt_root = tf.train.Checkpoint(optimizer=optimizer) + status = root.restore(save_path=model_save_path) + init_only_optimizer_status = opt_root.restore(save_path=None) + optimizer_status = opt_root.restore( + save_path=optimizer_save_path + ) + input_value = tf.constant([[3.0]]) + train_fn = functools.partial( + optimizer.minimize, + functools.partial(model, input_value), + global_step=root.global_step, + ) + if not tf.executing_eagerly(): + train_fn = functools.partial(self.evaluate, train_fn()) + optimizer_status.run_restore_ops() + status.initialize_or_restore() + init_only_optimizer_status.initialize_or_restore() + train_fn() + self.assertEqual(42.0, self.evaluate(optimizer.variables()[0])) class CheckpointCompatibilityTests(test_combinations.TestCase): - - def _initialized_model(self): - input_value = tf.constant([[3.]]) - model = MyModel() - optimizer = tf.compat.v1.train.AdamOptimizer(0.001) - optimizer_step = tf.compat.v1.train.get_or_create_global_step() - root_trackable = tf.train.Checkpoint( - optimizer=optimizer, model=model, optimizer_step=optimizer_step) - train_op = optimizer.minimize( - functools.partial(model, input_value), - global_step=optimizer_step) - self.evaluate(trackable_utils.gather_initializers( - root_trackable)) - self.evaluate(train_op) - # A regular variable, a slot variable, and a non-slot Optimizer variable - # with known values to check when loading. 
- self.evaluate(model._named_dense.bias.assign([1.])) - self.evaluate(optimizer.get_slot( - var=model._named_dense.bias, name="m").assign([2.])) - beta1_power, _ = optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(3.)) - return root_trackable - - def _set_sentinels(self, root_trackable): - self.evaluate(root_trackable.model._named_dense.bias.assign([101.])) - self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m") - .assign([102.])) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.evaluate(beta1_power.assign(103.)) - - def _check_sentinels(self, root_trackable): - self.assertAllEqual( - [1.], self.evaluate(root_trackable.model._named_dense.bias)) - self.assertAllEqual([2.], self.evaluate( - root_trackable.optimizer.get_slot( - var=root_trackable.model._named_dense.bias, name="m"))) - beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() - self.assertAllEqual(3., self.evaluate(beta1_power)) - - def _write_name_based_checkpoint(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = tf.Graph() - with save_graph.as_default(), self.session( - graph=save_graph) as session: - root = self._initialized_model() - name_saver = tf.compat.v1.train.Saver() - return name_saver.save( - sess=session, save_path=checkpoint_prefix, - global_step=root.optimizer_step) - - @test_combinations.generate( - test_combinations.combine(mode=["graph", "eager"])) - def testLoadFromNameBasedSaver(self): - """Save a name-based checkpoint, load it using the object-based API.""" - with test_utils.device(should_use_gpu=True): - with self.test_session(): - save_path = self._write_name_based_checkpoint() - root = self._initialized_model() - self._set_sentinels(root) - with self.assertRaises(AssertionError): - self._check_sentinels(root) - object_saver = tf.train.Checkpoint(root=root) - self._set_sentinels(root) - status = object_saver.read(save_path) - if tf.executing_eagerly(): - self._check_sentinels(root) - if tf.executing_eagerly(): - status.assert_consumed() - status.assert_existing_objects_matched() - status.assert_nontrivial_match() - else: - # When graph building, we haven't read any keys, so we don't know - # whether the restore will be complete. - with self.assertRaisesRegex(AssertionError, "not restored"): - status.assert_consumed() - with self.assertRaisesRegex(AssertionError, "not restored"): - status.assert_existing_objects_matched() - with self.assertRaisesRegex(AssertionError, "not restored"): - status.assert_nontrivial_match() - status.run_restore_ops() - self._check_sentinels(root) - self._set_sentinels(root) - status = object_saver.read(save_path) - status.initialize_or_restore() - self._check_sentinels(root) - # Check that there is no error when keys are missing from the name-based - # checkpoint. 
- root.not_in_name_checkpoint = tf.Variable([1.]) - status = object_saver.read(save_path) - with self.assertRaises(AssertionError): - status.assert_existing_objects_matched() - - def testSaveGraphLoadEager(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with context.graph_mode(): - save_graph = tf.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with tf.__internal__.eager_context.eager_mode(): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed() - self._check_sentinels(root) - - def testSaveEagerLoadGraph(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - with tf.__internal__.eager_context.eager_mode(): - root = self._initialized_model() - save_path = root.save(file_prefix=checkpoint_prefix) - with context.graph_mode(): - save_graph = tf.Graph() - with save_graph.as_default(), self.session( - graph=save_graph): - root = self._initialized_model() - self._set_sentinels(root) - root.restore(save_path).assert_consumed().run_restore_ops() - self._check_sentinels(root) + def _initialized_model(self): + input_value = tf.constant([[3.0]]) + model = MyModel() + optimizer = tf.compat.v1.train.AdamOptimizer(0.001) + optimizer_step = tf.compat.v1.train.get_or_create_global_step() + root_trackable = tf.train.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step + ) + train_op = optimizer.minimize( + functools.partial(model, input_value), global_step=optimizer_step + ) + self.evaluate(trackable_utils.gather_initializers(root_trackable)) + self.evaluate(train_op) + # A regular variable, a slot variable, and a non-slot Optimizer variable + # with known values to check when loading. 
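Three distinct kinds of optimizer state are being pinned by these sentinels. With the legacy `AdamOptimizer` they are reachable as follows; this is a sketch only, and `_get_beta_accumulators` is the same private helper the tests rely on:

```python
import tensorflow.compat.v2 as tf

v = tf.Variable([1.0])                                    # a regular variable
opt = tf.compat.v1.train.AdamOptimizer(0.001)
opt.minimize(lambda: tf.reduce_sum(v * v), var_list=[v])  # creates the slots
m_slot = opt.get_slot(var=v, name="m")          # per-variable slot state
beta1_power, _ = opt._get_beta_accumulators()   # non-slot optimizer state
print(v.numpy(), m_slot.numpy(), beta1_power.numpy())
```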
+ self.evaluate(model._named_dense.bias.assign([1.0])) + self.evaluate( + optimizer.get_slot(var=model._named_dense.bias, name="m").assign( + [2.0] + ) + ) + beta1_power, _ = optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(3.0)) + return root_trackable + + def _set_sentinels(self, root_trackable): + self.evaluate(root_trackable.model._named_dense.bias.assign([101.0])) + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m" + ).assign([102.0]) + ) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(103.0)) + + def _check_sentinels(self, root_trackable): + self.assertAllEqual( + [1.0], self.evaluate(root_trackable.model._named_dense.bias) + ) + self.assertAllEqual( + [2.0], + self.evaluate( + root_trackable.optimizer.get_slot( + var=root_trackable.model._named_dense.bias, name="m" + ) + ), + ) + beta1_power, _ = root_trackable.optimizer._get_beta_accumulators() + self.assertAllEqual(3.0, self.evaluate(beta1_power)) + + def _write_name_based_checkpoint(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = tf.Graph() + with save_graph.as_default(), self.session( + graph=save_graph + ) as session: + root = self._initialized_model() + name_saver = tf.compat.v1.train.Saver() + return name_saver.save( + sess=session, + save_path=checkpoint_prefix, + global_step=root.optimizer_step, + ) + + @test_combinations.generate( + test_combinations.combine(mode=["graph", "eager"]) + ) + def testLoadFromNameBasedSaver(self): + """Save a name-based checkpoint, load it using the object-based API.""" + with test_utils.device(should_use_gpu=True): + with self.test_session(): + save_path = self._write_name_based_checkpoint() + root = self._initialized_model() + self._set_sentinels(root) + with self.assertRaises(AssertionError): + self._check_sentinels(root) + object_saver = tf.train.Checkpoint(root=root) + self._set_sentinels(root) + status = object_saver.read(save_path) + if tf.executing_eagerly(): + self._check_sentinels(root) + if tf.executing_eagerly(): + status.assert_consumed() + status.assert_existing_objects_matched() + status.assert_nontrivial_match() + else: + # When graph building, we haven't read any keys, so we don't + # know whether the restore will be complete. + with self.assertRaisesRegex(AssertionError, "not restored"): + status.assert_consumed() + with self.assertRaisesRegex(AssertionError, "not restored"): + status.assert_existing_objects_matched() + with self.assertRaisesRegex(AssertionError, "not restored"): + status.assert_nontrivial_match() + status.run_restore_ops() + self._check_sentinels(root) + self._set_sentinels(root) + status = object_saver.read(save_path) + status.initialize_or_restore() + self._check_sentinels(root) + # Check that there is no error when keys are missing from the + # name-based checkpoint. 
+ root.not_in_name_checkpoint = tf.Variable([1.0]) + status = object_saver.read(save_path) + with self.assertRaises(AssertionError): + status.assert_existing_objects_matched() + + def testSaveGraphLoadEager(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = tf.Graph() + with save_graph.as_default(), self.session(graph=save_graph): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with tf.__internal__.eager_context.eager_mode(): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed() + self._check_sentinels(root) + + def testSaveEagerLoadGraph(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with tf.__internal__.eager_context.eager_mode(): + root = self._initialized_model() + save_path = root.save(file_prefix=checkpoint_prefix) + with context.graph_mode(): + save_graph = tf.Graph() + with save_graph.as_default(), self.session(graph=save_graph): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed().run_restore_ops() + self._check_sentinels(root) if __name__ == "__main__": - tf.compat.v1.enable_eager_execution() - tf.test.main() + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/tests/tracking_util_xla_test.py b/keras/tests/tracking_util_xla_test.py index 7fb0ddbf607c..4867ab5f20d0 100644 --- a/keras/tests/tracking_util_xla_test.py +++ b/keras/tests/tracking_util_xla_test.py @@ -13,65 +13,70 @@ # limitations under the License. # ============================================================================== -from tensorflow.compiler.tests import xla_test - import tensorflow.compat.v2 as tf + from keras.engine import training from keras.layers import core -from keras.optimizers.optimizer_v2 import adam -from tensorflow.python.training.tracking import util as trackable_utils +from keras.optimizers.legacy import adam +# isort: off +from tensorflow.compiler.tests import xla_test +from tensorflow.python.checkpoint import ( + checkpoint as trackable_utils, +) -class NonLayerTrackable(tf.Module): - def __init__(self): - super().__init__() - self.a_variable = trackable_utils.add_variable( - self, name="a_variable", shape=[]) +class NonLayerTrackable(tf.Module): + def __init__(self): + super().__init__() + self.a_variable = trackable_utils.add_variable( + self, name="a_variable", shape=[] + ) class Subclassed(training.Model): - """A concrete Model for testing.""" + """A concrete Model for testing.""" - def __init__(self): - super().__init__() - self._named_dense = core.Dense(1, use_bias=True) - self._second = core.Dense(1, use_bias=False) - # We can still track Trackables which aren't Layers. - self._non_layer = NonLayerTrackable() + def __init__(self): + super().__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + # We can still track Trackables which aren't Layers. 
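As the comment above notes, tracking is not limited to Keras Layers: any `Trackable` (here a `tf.Module`) hanging off a checkpointed object gets its variables saved too. A minimal sketch with a hypothetical module and path:

```python
import tensorflow.compat.v2 as tf

class Counter(tf.Module):             # hypothetical non-Layer trackable
    def __init__(self):
        super().__init__()
        self.count = tf.Variable(0.0)

root = tf.train.Checkpoint(counter=Counter())
path = root.save("/tmp/module_demo")  # writes counter/count alongside the rest
```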
+ self._non_layer = NonLayerTrackable() - def call(self, values): - ret = self._second(self._named_dense(values)) - return ret + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret class CheckpointingTests(xla_test.XLATestCase): - - def testDeferredRestorationUsageEager(self): - """An idiomatic eager execution example.""" - num_training_steps = 10 - checkpoint_directory = self.get_temp_dir() - for training_continuation in range(3): - with self.test_scope(): - model = Subclassed() - optimizer = adam.Adam(0.001) - root = tf.train.Checkpoint( - optimizer=optimizer, model=model) - manager = tf.train.CheckpointManager( - root, checkpoint_directory, max_to_keep=2) - root.restore(manager.latest_checkpoint) - for _ in range(num_training_steps): - input_value = tf.constant([[3.]]) - with tf.GradientTape() as tape: - loss = model(input_value) - variables = model.trainable_variables - gradients = tape.gradient(loss, variables) - optimizer.apply_gradients(zip(gradients, variables)) - manager.save() - self.assertEqual((training_continuation + 1) * num_training_steps, - root.optimizer.iterations.numpy()) + def testDeferredRestorationUsageEager(self): + """An idiomatic eager execution example.""" + num_training_steps = 10 + checkpoint_directory = self.get_temp_dir() + for training_continuation in range(3): + with self.test_scope(): + model = Subclassed() + optimizer = adam.Adam(0.001) + root = tf.train.Checkpoint(optimizer=optimizer, model=model) + manager = tf.train.CheckpointManager( + root, checkpoint_directory, max_to_keep=2 + ) + root.restore(manager.latest_checkpoint) + for _ in range(num_training_steps): + input_value = tf.constant([[3.0]]) + with tf.GradientTape() as tape: + loss = model(input_value) + variables = model.trainable_variables + gradients = tape.gradient(loss, variables) + optimizer.apply_gradients(zip(gradients, variables)) + manager.save() + self.assertEqual( + (training_continuation + 1) * num_training_steps, + root.optimizer.iterations.numpy(), + ) if __name__ == "__main__": - tf.compat.v1.enable_eager_execution() - tf.test.main() + tf.compat.v1.enable_eager_execution() + tf.test.main() diff --git a/keras/tools/bazel_build.sh b/keras/tools/bazel_build.sh new file mode 100644 index 000000000000..f58233646514 --- /dev/null +++ b/keras/tools/bazel_build.sh @@ -0,0 +1,21 @@ +#!/bin/bash +BAZEL_VERSION=5.4.0 +rm -rf ~/bazel +mkdir ~/bazel + +pushd ~/bazel +wget https://github.com/bazelbuild/bazel/releases/download/"${BAZEL_VERSION}"/bazel-"${BAZEL_VERSION}"-installer-linux-x86_64.sh +chmod +x bazel-*.sh +./bazel-"${BAZEL_VERSION}"-installer-linux-x86_64.sh --user +rm bazel-"${BAZEL_VERSION}"-installer-linux-x86_64.sh +popd + +PATH="/home/kbuilder/bin:$PATH" +which bazel +bazel version + +TAG_FILTERS="-no_oss,-oss_excluded,-oss_serial,-gpu,-benchmark-test,-no_oss_py3,-no_pip,-nopip" +bazel build \ + --define=use_fast_cpp_protos=false \ + --build_tag_filters="${TAG_FILTERS}" \ + -- //keras/... 
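For readers reproducing this build outside CI, a minimal, hypothetical Python equivalent of the script's final invocation (assuming `bazel` is already installed and on `PATH`, and the working directory is the repository root):

    import subprocess

    # Tag filters copied verbatim from keras/tools/bazel_build.sh above.
    TAG_FILTERS = (
        "-no_oss,-oss_excluded,-oss_serial,-gpu,-benchmark-test,"
        "-no_oss_py3,-no_pip,-nopip"
    )

    # Run the same `bazel build` command the shell script runs.
    subprocess.run(
        [
            "bazel",
            "build",
            "--define=use_fast_cpp_protos=false",
            f"--build_tag_filters={TAG_FILTERS}",
            "--",
            "//keras/...",
        ],
        check=True,  # raise CalledProcessError if the build fails
    )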
diff --git a/keras/tools/pip_package/BUILD b/keras/tools/pip_package/BUILD index 33d7bc2415a2..5b086a4f01cc 100644 --- a/keras/tools/pip_package/BUILD +++ b/keras/tools/pip_package/BUILD @@ -27,6 +27,7 @@ COMMON_PIP_DEPS = [ "//keras/dtensor:test_util", "//keras/distribute:distribute_test_lib_pip", "//keras/integration_test:preprocessing_test_utils", + "//keras/integration_test/models:models", "//keras/layers/preprocessing:preprocessing_test_utils", "//keras/layers/preprocessing/benchmarks:feature_column_benchmark", "//keras/mixed_precision:test_util", diff --git a/keras/tools/pip_package/create_pip_helper.py b/keras/tools/pip_package/create_pip_helper.py index dd576e663852..02f380e78799 100644 --- a/keras/tools/pip_package/create_pip_helper.py +++ b/keras/tools/pip_package/create_pip_helper.py @@ -22,107 +22,122 @@ import fnmatch import os -PIP_EXCLUDED_FILES = frozenset([ - 'keras/api/create_python_api_wrapper.py', - 'keras/applications/efficientnet_weight_update_util.py', - 'keras/distribute/tpu_strategy_test_utils.py', - 'keras/saving/saved_model/create_test_saved_model.py', - 'keras/tools/pip_package/setup.py', - 'keras/tools/pip_package/create_pip_helper.py', -]) - -PIP_EXCLUDED_DIRS = frozenset([ - 'keras/benchmarks', - 'keras/integration_tests', - 'keras/tests', -]) +PIP_EXCLUDED_FILES = frozenset( + [ + "keras/api/create_python_api_wrapper.py", + "keras/applications/efficientnet_weight_update_util.py", + "keras/distribute/tpu_strategy_test_utils.py", + "keras/saving/legacy/saved_model/create_test_saved_model.py", + "keras/tools/pip_package/setup.py", + "keras/tools/pip_package/create_pip_helper.py", + ] +) + +PIP_EXCLUDED_DIRS = frozenset( + [ + "keras/benchmarks", + "keras/tests", + ] +) # Directories that should not have __init__.py files generated within them. -EXCLUDED_INIT_FILE_DIRECTORIES = frozenset([ - 'keras/benchmarks', - 'keras/tools', -]) +EXCLUDED_INIT_FILE_DIRECTORIES = frozenset( + [ + "keras/benchmarks", + "keras/tools", + ] +) class PipPackagingError(Exception): - pass + pass def create_init_files(pip_root): - """Create __init__.py in pip directory tree. - - These files are auto-generated by Bazel when doing typical build/test, but - do not get auto-generated by the pip build process. Currently, the entire - directory tree is just python files, so its fine to just create all of the - init files. - - Args: - pip_root: Root directory of code being packaged into pip. - """ - for path, subdirs, _ in os.walk(pip_root): - for subdir in subdirs: - init_file_path = os.path.join(path, subdir, '__init__.py') - if any(excluded_path in init_file_path - for excluded_path in EXCLUDED_INIT_FILE_DIRECTORIES): - continue - if not os.path.exists(init_file_path): - # Create empty file - open(init_file_path, 'w').close() + """Create __init__.py in pip directory tree. + + These files are auto-generated by Bazel when doing typical build/test, but + do not get auto-generated by the pip build process. Currently, the entire + directory tree is just Python files, so it's fine to just create all of the + init files. + + Args: + pip_root: Root directory of code being packaged into pip. 
+ """ + for path, subdirs, _ in os.walk(pip_root): + for subdir in subdirs: + init_file_path = os.path.join(path, subdir, "__init__.py") + if any( + excluded_path in init_file_path + for excluded_path in EXCLUDED_INIT_FILE_DIRECTORIES + ): + continue + if not os.path.exists(init_file_path): + # Create empty file + open(init_file_path, "w").close() def verify_python_files_in_pip(pip_root, bazel_root): - """Verifies all expected files are packaged into Pip. - - Args: - pip_root: Root directory of code being packaged into pip. - bazel_root: Root directory of Keras Bazel workspace. - - Raises: - PipPackagingError: Missing file in pip. - """ - for path, _, files in os.walk(bazel_root): - if any(d for d in PIP_EXCLUDED_DIRS if d in path): - # Skip any directories that are exclude from PIP, eg tests. - continue - - python_files = set(fnmatch.filter(files, '*.py')) - python_test_files = set(fnmatch.filter(files, '*test.py')) - python_benchmark_files = set(fnmatch.filter(files, '*benchmark.py')) - # We only care about python files in the pip package, see create_init_files. - files = python_files - python_test_files - python_benchmark_files - for f in files: - pip_path = os.path.join(pip_root, os.path.relpath(path, bazel_root), f) - file_name = os.path.join(path, f) - path_exists = os.path.exists(pip_path) - file_excluded = file_name.lstrip('./') in PIP_EXCLUDED_FILES - if not path_exists and not file_excluded: - raise PipPackagingError( - ('Pip package missing the file %s. If this is expected, add it ' - 'to PIP_EXCLUDED_FILES in create_pip_helper.py. Otherwise, ' - 'make sure it is a build dependency of the pip package') % - file_name) - if path_exists and file_excluded: - raise PipPackagingError( - ('File in PIP_EXCLUDED_FILES included in pip. %s' % file_name)) + """Verifies all expected files are packaged into Pip. + + Args: + pip_root: Root directory of code being packaged into pip. + bazel_root: Root directory of Keras Bazel workspace. + + Raises: + PipPackagingError: Missing file in pip. + """ + for path, _, files in os.walk(bazel_root): + if any(d for d in PIP_EXCLUDED_DIRS if d in path): + # Skip any directories that are exclude from PIP, eg tests. + continue + + python_files = set(fnmatch.filter(files, "*.py")) + python_test_files = set(fnmatch.filter(files, "*test.py")) + python_benchmark_files = set(fnmatch.filter(files, "*benchmark.py")) + # We only care about python files in the pip package, see + # create_init_files. + files = python_files - python_test_files - python_benchmark_files + for f in files: + pip_path = os.path.join( + pip_root, os.path.relpath(path, bazel_root), f + ) + file_name = os.path.join(path, f) + path_exists = os.path.exists(pip_path) + file_excluded = file_name.lstrip("./") in PIP_EXCLUDED_FILES + if not path_exists and not file_excluded: + raise PipPackagingError( + "Pip package missing the file %s. If this is expected, " + "add it to PIP_EXCLUDED_FILES in " + "create_pip_helper.py. Otherwise, " + "make sure it is a build dependency of the pip package" + % file_name + ) + if path_exists and file_excluded: + raise PipPackagingError( + f"File in PIP_EXCLUDED_FILES included in pip. 
{file_name}" + ) def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - '--bazel-root', - type=str, - required=True, - help='Root directory of Keras Bazel workspace.') - parser.add_argument( - '--pip-root', - type=str, - required=True, - help='Root directory of code being packaged into pip.') - - args = parser.parse_args() - create_init_files(args.pip_root) - verify_python_files_in_pip(args.pip_root, args.bazel_root) - - -if __name__ == '__main__': - main() + parser = argparse.ArgumentParser() + parser.add_argument( + "--bazel-root", + type=str, + required=True, + help="Root directory of Keras Bazel workspace.", + ) + parser.add_argument( + "--pip-root", + type=str, + required=True, + help="Root directory of code being packaged into pip.", + ) + + args = parser.parse_args() + create_init_files(args.pip_root) + verify_python_files_in_pip(args.pip_root, args.bazel_root) + + +if __name__ == "__main__": + main() diff --git a/keras/tools/pip_package/setup.py b/keras/tools/pip_package/setup.py index 3c4eb033712c..b47a0b91acbc 100644 --- a/keras/tools/pip_package/setup.py +++ b/keras/tools/pip_package/setup.py @@ -23,58 +23,61 @@ from __future__ import print_function import sys + import setuptools -DOCLINES = __doc__.split('\n') +DOCLINES = __doc__.split("\n") # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '2.10.0' +_VERSION = "2.15.0" REQUIRED_PACKAGES = [ # We depend on TensorFlow's declared pip dependencies. # Add a new dep there if one is needed. ] -project_name = 'keras' -if '--project_name' in sys.argv: - project_name_idx = sys.argv.index('--project_name') - project_name = sys.argv[project_name_idx + 1] - sys.argv.remove('--project_name') - sys.argv.pop(project_name_idx) +project_name = "keras" +if "--project_name" in sys.argv: + project_name_idx = sys.argv.index("--project_name") + project_name = sys.argv[project_name_idx + 1] + sys.argv.remove("--project_name") + sys.argv.pop(project_name_idx) setuptools.setup( name=project_name, - version=_VERSION.replace('-', ''), - description='Deep learning for humans.', - long_description='\n'.join(DOCLINES[2:]), - url='https://keras.io/', - download_url='https://github.com/keras-team/keras/tags', - author='Keras team', - author_email='keras-users@googlegroups.com', + version=_VERSION.replace("-", ""), + description="Deep learning for humans.", + long_description="\n".join(DOCLINES[2:]), + url="https://keras.io/", + download_url="https://github.com/keras-team/keras/tags", + author="Keras team", + author_email="keras-users@googlegroups.com", packages=setuptools.find_packages(), install_requires=REQUIRED_PACKAGES, + # Supported Python versions + python_requires=">=3.9", # PyPI package information. 
classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3 :: Only', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'Topic :: Software Development', - 'Topic :: Software Development :: Libraries', - 'Topic :: Software Development :: Libraries :: Python Modules', + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", ], - license='Apache 2.0', - keywords=['keras', 'tensorflow', 'machine learning', 'deep learning'], + license="Apache 2.0", + keywords=["keras", "tensorflow", "machine learning", "deep learning"], ) diff --git a/keras/utils/BUILD b/keras/utils/BUILD index 50aaf6452996..034f587f1e5f 100644 --- a/keras/utils/BUILD +++ b/keras/utils/BUILD @@ -1,15 +1,13 @@ # Description: # Contains the Keras Utilities (internal TensorFlow version). +# Placeholder: load unaliased py_library load("@org_keras//keras:keras.bzl", "tf_py_test") package( + # copybara:uncomment default_applicable_licenses = ["//keras:license"], # TODO(scottzhu): Remove non-keras deps from TF. 
- default_visibility = [ - "//keras:friends", - "//third_party/tensorflow/python/feature_column:__pkg__", - "//third_party/tensorflow/tools/pip_package:__pkg__", - ], + default_visibility = ["//keras:friends"], licenses = ["notice"], ) @@ -17,17 +15,21 @@ py_library( name = "utils", srcs = [ "__init__.py", + "legacy/__init__.py", ], srcs_version = "PY3", deps = [ ":audio_dataset", ":data_utils", + ":feature_space", ":generic_utils", ":image_dataset", ":image_utils", ":layer_utils", ":np_utils", + ":sidecar_evaluator", ":text_dataset", + ":timed_threads", ":timeseries_dataset", ":vis_utils", ], @@ -299,6 +301,68 @@ py_library( ], ) +py_library( + name = "sidecar_evaluator", + srcs = ["sidecar_evaluator.py"], + srcs_version = "PY3", + deps = [ + "//:expect_tensorboard_installed", + "//:expect_tensorflow_installed", + ], +) + +py_library( + name = "feature_space", + srcs = ["feature_space.py"], + srcs_version = "PY3", + deps = [ + "//:expect_tensorflow_installed", + "//keras:backend", + "//keras/layers", + ], +) + +py_library( + name = "timed_threads", + srcs = ["timed_threads.py"], + srcs_version = "PY3", +) + +py_library( + name = "steps_per_execution_tuning", + srcs = ["steps_per_execution_tuning.py"], + srcs_version = "PY3", + deps = [ + "//:expect_numpy_installed", + ], +) + +tf_py_test( + name = "steps_per_execution_tuning_test", + srcs = ["steps_per_execution_tuning_test.py"], + python_version = "PY3", + deps = [ + ":steps_per_execution_tuning", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", + ], +) + +tf_py_test( + name = "sidecar_evaluator_test", + size = "medium", + srcs = ["sidecar_evaluator_test.py"], + python_version = "PY3", + deps = [ + ":sidecar_evaluator", + "//:expect_absl_installed", + "//:expect_tensorflow_installed", + "//keras", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) + tf_py_test( name = "dataset_creator_test", srcs = ["dataset_creator_test.py"], @@ -431,9 +495,14 @@ tf_py_test( python_version = "PY3", deps = [ ":layer_utils", + ":tf_utils", "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras", + "//keras:backend", + "//keras/dtensor", + "//keras/dtensor:layout_map", + "//keras/dtensor:test_util", ], ) @@ -447,6 +516,7 @@ tf_py_test( "//:expect_numpy_installed", "//:expect_tensorflow_installed", "//keras", + "//keras/testing_infra:test_combinations", ], ) @@ -598,3 +668,45 @@ tf_py_test( "//keras/testing_infra:test_utils", ], ) + +tf_py_test( + name = "audio_dataset_with_tfio_test", + size = "small", + srcs = ["audio_dataset_with_tfio_test.py"], + python_version = "PY3", + deps = [ + ":audio_dataset", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//:expect_tensorflow_io_installed", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) + +tf_py_test( + name = "feature_space_test", + size = "medium", + srcs = ["feature_space_test.py"], + python_version = "PY3", + deps = [ + ":feature_space", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) + +tf_py_test( + name = "timed_threads_test", + size = "small", + srcs = ["timed_threads_test.py"], + deps = [ + ":timed_threads", + "//:expect_tensorflow_installed", + "//keras", + "//keras/testing_infra:test_combinations", + "//keras/testing_infra:test_utils", + ], +) diff --git a/keras/utils/__init__.py b/keras/utils/__init__.py index 23509cfd2b16..db2063432e6d 
100644 --- a/keras/utils/__init__.py +++ b/keras/utils/__init__.py @@ -13,19 +13,30 @@ # limitations under the License. # ============================================================================== """Public Keras utilities.""" -# pylint: disable=g-bad-import-order -from keras.utils.data_utils import get_file -from keras.utils.dataset_utils import split_dataset -from keras.utils.generic_utils import Progbar -from keras.utils.image_dataset import image_dataset_from_directory +# isort: off + +# Serialization related +from keras.saving.serialization_lib import deserialize_keras_object +from keras.saving.serialization_lib import serialize_keras_object +from keras.saving.object_registration import CustomObjectScope +from keras.saving.object_registration import custom_object_scope +from keras.saving.object_registration import get_custom_objects +from keras.saving.object_registration import get_registered_name +from keras.saving.object_registration import register_keras_serializable + +# Dataset related +from keras.utils.audio_dataset import audio_dataset_from_directory from keras.utils.text_dataset import text_dataset_from_directory -from keras.utils.tf_utils import set_random_seed from keras.utils.timeseries_dataset import timeseries_dataset_from_array -from keras.utils.vis_utils import model_to_dot -from keras.utils.vis_utils import plot_model -from keras.utils.np_utils import normalize -from keras.utils.np_utils import to_categorical +from keras.utils.image_dataset import image_dataset_from_directory +from keras.utils.dataset_utils import split_dataset + +# Sequence related +from keras.utils.data_utils import GeneratorEnqueuer +from keras.utils.data_utils import OrderedEnqueuer +from keras.utils.data_utils import Sequence +from keras.utils.data_utils import SequenceEnqueuer # Image related from keras.utils.image_utils import array_to_img @@ -33,22 +44,31 @@ from keras.utils.image_utils import load_img from keras.utils.image_utils import save_img -# Sequence related -from keras.utils.data_utils import Sequence -from keras.utils.data_utils import GeneratorEnqueuer -from keras.utils.data_utils import OrderedEnqueuer -from keras.utils.data_utils import SequenceEnqueuer -from keras.utils.data_utils import pad_sequences +# Python utils +from keras.utils.tf_utils import set_random_seed +from keras.utils.generic_utils import Progbar +from keras.utils.data_utils import get_file -# Serialization related -from keras.utils.generic_utils import custom_object_scope -from keras.utils.generic_utils import CustomObjectScope -from keras.utils.generic_utils import deserialize_keras_object -from keras.utils.generic_utils import get_custom_objects -from keras.utils.generic_utils import serialize_keras_object - -# Audio related -from keras.utils.audio_dataset import audio_dataset_from_directory +# Preprocessing utils +from keras.utils.feature_space import FeatureSpace # Internal from keras.utils.layer_utils import get_source_inputs +from keras.utils.layer_utils import warmstart_embedding_matrix + +# Deprecated +from keras.utils.np_utils import normalize +from keras.utils.np_utils import to_categorical +from keras.utils.np_utils import to_ordinal +from keras.utils.data_utils import pad_sequences + +# Evaluation related +from keras.utils.sidecar_evaluator import SidecarEvaluator +from keras.utils.sidecar_evaluator import SidecarEvaluatorModelExport + +# Timed Thread +from keras.utils.timed_threads import TimedThread + +# Visualization related +from keras.utils.vis_utils import model_to_dot +from 
keras.utils.vis_utils import plot_model diff --git a/keras/utils/audio_dataset.py b/keras/utils/audio_dataset.py index a9d821afcf31..60d2ec422769 100644 --- a/keras/utils/audio_dataset.py +++ b/keras/utils/audio_dataset.py @@ -14,19 +14,16 @@ # ============================================================================== """Keras audio dataset loading utilities.""" -import tensorflow.compat.v2 as tf - -# pylint: disable=g-classes-have-attributes - import numpy as np +import tensorflow.compat.v2 as tf from keras.utils import dataset_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export -try: - import tensorflow_io as tfio -except ImportError: - tfio = None + +tfio = None # Import as-needed. ALLOWED_FORMATS = (".wav",) @@ -47,7 +44,7 @@ def audio_dataset_from_directory( subset=None, follow_links=False, ): - """Generates a `tf.data.Dataset` from audio files in a directory. + """Generates a `tf.data.Dataset` from audio files in a directory. If your directory structure is: @@ -70,202 +67,237 @@ def audio_dataset_from_directory( Only `.wav` files are supported at this time. Args: - directory: Directory where the data is located. If `labels` is "inferred", - it should contain subdirectories, each containing audio files for a - class. Otherwise, the directory structure is ignored. - labels: Either "inferred" (labels are generated from the directory - structure), None (no labels), or a list/tuple of integer labels of the - same size as the number of audio files found in the directory. Labels - should be sorted according to the alphanumeric order of the audio file - paths (obtained via `os.walk(directory)` in Python). - label_mode: String describing the encoding of `labels`. Options are: - - 'int': means that the labels are encoded as integers (e.g. for - `sparse_categorical_crossentropy` loss). - 'categorical' means that - the labels are encoded as a categorical vector (e.g. for - `categorical_crossentropy` loss). - 'binary' means that the labels - (there can be only 2) are encoded as `float32` scalars with values 0 - or 1 (e.g. for `binary_crossentropy`). - None (no labels). - class_names: Only valid if "labels" is "inferred". This is the explicit - list of class names (must match names of subdirectories). Used to - control the order of the classes (otherwise alphanumerical order is - used). - batch_size: Size of the batches of data. Default: 32. If `None`, the data - will not be batched (the dataset will yield individual samples). - sampling_rate: Audio sampling rate (in samples per second). - output_sequence_length: Maximum length of an audio sequence. Audio files - longer than this will be truncated to `output_sequence_length`. If set - to `None`, then all sequences in the same batch will be padded to the - length of the longest sequence in the batch. - ragged: Whether to return a Ragged dataset (where each sequence has its - own length). Default: False. - shuffle: Whether to shuffle the data. Default: True. If set to False, - sorts the data in alphanumeric order. - seed: Optional random seed for shuffling and transformations. - validation_split: Optional float between 0 and 1, fraction of data to - reserve for validation. - subset: Subset of the data to return. One of "training", "validation" or - "both". Only used if `validation_split` is set. - follow_links: Whether to visits subdirectories pointed to by symlinks. - Defaults to False. + directory: Directory where the data is located. 
+ If `labels` is `"inferred"`, it should contain subdirectories, + each containing audio files for a class. Otherwise, the directory + structure is ignored. + labels: Either "inferred" (labels are generated from the directory + structure), `None` (no labels), or a list/tuple of integer labels + of the same size as the number of audio files found in + the directory. Labels should be sorted according to the + alphanumeric order of the audio file paths + (obtained via `os.walk(directory)` in Python). + label_mode: String describing the encoding of `labels`. Options are: + - `"int"`: means that the labels are encoded as integers (e.g. for + `sparse_categorical_crossentropy` loss). + - `"categorical"` means that the labels are encoded as a categorical + vector (e.g. for `categorical_crossentropy` loss). + - `"binary"` means that the labels (there can be only 2) + are encoded as `float32` scalars with values 0 + or 1 (e.g. for `binary_crossentropy`). + - `None` (no labels). + class_names: Only valid if "labels" is `"inferred"`. + This is the explicit list of class names + (must match names of subdirectories). Used to control the order + of the classes (otherwise alphanumerical order is used). + batch_size: Size of the batches of data. Default: 32. If `None`, + the data will not be batched + (the dataset will yield individual samples). + sampling_rate: Audio sampling rate (in samples per second). + output_sequence_length: Maximum length of an audio sequence. Audio files + longer than this will be truncated to `output_sequence_length`. + If set to `None`, then all sequences in the same batch will + be padded to the + length of the longest sequence in the batch. + ragged: Whether to return a Ragged dataset (where each sequence has its + own length). Defaults to `False`. + shuffle: Whether to shuffle the data. Defaults to `True`. + If set to `False`, sorts the data in alphanumeric order. + seed: Optional random seed for shuffling and transformations. + validation_split: Optional float between 0 and 1, fraction of data to + reserve for validation. + subset: Subset of the data to return. One of `"training"`, + `"validation"` or `"both"`. Only used if `validation_split` is set. + follow_links: Whether to visit subdirectories pointed to by symlinks. + Defaults to `False`. Returns: - A `tf.data.Dataset` object. - - If `label_mode` is None, it yields `string` tensors of shape - `(batch_size,)`, containing the contents of a batch of audio files. - - Otherwise, it yields a tuple `(audio, labels)`, where `audio` - has shape `(batch_size, sequence_length, num_channels)` and `labels` - follows the format described - below. + + A `tf.data.Dataset` object. + + - If `label_mode` is `None`, it yields `string` tensors of shape + `(batch_size,)`, containing the contents of a batch of audio files. + - Otherwise, it yields a tuple `(audio, labels)`, where `audio` + has shape `(batch_size, sequence_length, num_channels)` and `labels` + follows the format described + below. Rules regarding labels format: - - if `label_mode` is `int`, the labels are an `int32` tensor of shape - `(batch_size,)`. 
+ - if `label_mode` is `binary`, the labels are a `float32` tensor of + 1s and 0s of shape `(batch_size, 1)`. + - if `label_mode` is `categorical`, the labels are a `float32` tensor + of shape `(batch_size, num_classes)`, representing a one-hot + encoding of the class index. """ - if labels not in ("inferred", None): - if not isinstance(labels, (list, tuple)): - raise ValueError( - "The `labels` argument should be a list/tuple of integer labels, of " - "the same size as the number of audio files in the target " - "directory. If you wish to infer the labels from the subdirectory " - 'names in the target directory, pass `labels="inferred"`. ' - "If you wish to get a dataset that only contains audio samples " - f"(no labels), pass `labels=None`. Received: labels={labels}") - if class_names: - raise ValueError("You can only pass `class_names` if " - f'`labels="inferred"`. Received: labels={labels}, and ' - f"class_names={class_names}") - if label_mode not in {"int", "categorical", "binary", None}: - raise ValueError( - '`label_mode` argument must be one of "int", "categorical", "binary", ' - f'or None. Received: label_mode={label_mode}' + if labels not in ("inferred", None): + if not isinstance(labels, (list, tuple)): + raise ValueError( + "The `labels` argument should be a list/tuple of integer " + "labels, of the same size as the number of audio files in " + "the target directory. If you wish to infer the labels from " + "the subdirectory names in the target directory," + ' pass `labels="inferred"`. ' + "If you wish to get a dataset that only contains audio samples " + f"(no labels), pass `labels=None`. Received: labels={labels}" + ) + if class_names: + raise ValueError( + "You can only pass `class_names` if " + f'`labels="inferred"`. Received: labels={labels}, and ' + f"class_names={class_names}" + ) + if label_mode not in {"int", "categorical", "binary", None}: + raise ValueError( + '`label_mode` argument must be one of "int", "categorical", ' + '"binary", ' + f"or None. Received: label_mode={label_mode}" + ) + + if ragged and output_sequence_length is not None: + raise ValueError( + "Cannot set both `ragged` and `output_sequence_length`" + ) + + if sampling_rate is not None: + if not isinstance(sampling_rate, int): + raise ValueError( + "`sampling_rate` should have an integer value. " + f"Received: sampling_rate={sampling_rate}" + ) + + if sampling_rate <= 0: + raise ValueError( + "`sampling_rate` should be higher than 0. " + f"Received: sampling_rate={sampling_rate}" + ) + + global tfio + if tfio is None: + try: + import tensorflow_io as tfio + except ImportError: + raise ImportError( + "To use the argument `sampling_rate`, you should install " + "tensorflow_io. You can install it via `pip install " + "tensorflow-io`." + ) + + if labels is None or label_mode is None: + labels = None + label_mode = None + + dataset_utils.check_validation_split_arg( + validation_split, subset, shuffle, seed ) - if ragged and output_sequence_length is not None: - raise ValueError("Cannot set both `ragged` and `output_sequence_length`") - - if sampling_rate is not None: - if not isinstance(sampling_rate, int): - raise ValueError('`sampling_rate` should have an integer value. ' - f'Received: sampling_rate={sampling_rate}') - - if sampling_rate <= 0: - raise ValueError(f'`sampling_rate` should be higher than 0. ' - f'Received: sampling_rate={sampling_rate}') - - if tfio is None: - raise ImportError( - 'To use the argument `sampling_rate`, you should install ' - 'tensorflow_io. 
You can install it via `pip install tensorflow-io`.' - ) - - if labels is None or label_mode is None: - labels = None - label_mode = None - - dataset_utils.check_validation_split_arg(validation_split, subset, shuffle, - seed) - - if seed is None: - seed = np.random.randint(1e6) - - file_paths, labels, class_names = dataset_utils.index_directory( - directory, - labels, - formats=ALLOWED_FORMATS, - class_names=class_names, - shuffle=shuffle, - seed=seed, - follow_links=follow_links, - ) - - if label_mode == "binary" and len(class_names) != 2: - raise ValueError( - f'When passing `label_mode="binary"`, there must be exactly 2 ' - f"class_names. Received: class_names={class_names}") - - if subset == "both": - train_dataset, val_dataset = get_training_and_validation_dataset( - file_paths=file_paths, - labels=labels, - validation_split=validation_split, - directory=directory, - label_mode=label_mode, - class_names=class_names, - sampling_rate=sampling_rate, - output_sequence_length=output_sequence_length, - ragged=ragged, - ) + if seed is None: + seed = np.random.randint(1e6) - train_dataset = prepare_dataset( - dataset=train_dataset, - batch_size=batch_size, - shuffle=shuffle, - seed=seed, + file_paths, labels, class_names = dataset_utils.index_directory( + directory, + labels, + formats=ALLOWED_FORMATS, class_names=class_names, - output_sequence_length=output_sequence_length, - ragged=ragged, - ) - val_dataset = prepare_dataset( - dataset=val_dataset, - batch_size=batch_size, - shuffle=False, - seed=seed, - class_names=class_names, - output_sequence_length=output_sequence_length, - ragged=ragged, - ) - return train_dataset, val_dataset - - else: - dataset = get_dataset( - file_paths=file_paths, - labels=labels, - directory=directory, - validation_split=validation_split, - subset=subset, - label_mode=label_mode, - class_names=class_names, - sampling_rate=sampling_rate, - output_sequence_length=output_sequence_length, - ragged=ragged, - ) - - dataset = prepare_dataset( - dataset=dataset, - batch_size=batch_size, shuffle=shuffle, seed=seed, - class_names=class_names, - output_sequence_length=output_sequence_length, - ragged=ragged, + follow_links=follow_links, ) - return dataset - -def prepare_dataset(dataset, batch_size, shuffle, seed, class_names, - output_sequence_length, ragged): - dataset = dataset.prefetch(tf.data.AUTOTUNE) - if batch_size is not None: - if shuffle: - dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) + if label_mode == "binary" and len(class_names) != 2: + raise ValueError( + 'When passing `label_mode="binary"`, there must be exactly 2 ' + f"class_names. 
Received: class_names={class_names}" + ) + + if subset == "both": + train_dataset, val_dataset = get_training_and_validation_dataset( + file_paths=file_paths, + labels=labels, + validation_split=validation_split, + directory=directory, + label_mode=label_mode, + class_names=class_names, + sampling_rate=sampling_rate, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + + train_dataset = prepare_dataset( + dataset=train_dataset, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + class_names=class_names, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + val_dataset = prepare_dataset( + dataset=val_dataset, + batch_size=batch_size, + shuffle=False, + seed=seed, + class_names=class_names, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + return train_dataset, val_dataset - if output_sequence_length is None and not ragged: - dataset = dataset.padded_batch( - batch_size, padded_shapes=([None, None], [])) else: - dataset = dataset.batch(batch_size) - else: - if shuffle: - dataset = dataset.shuffle(buffer_size=1024, seed=seed) + dataset = get_dataset( + file_paths=file_paths, + labels=labels, + directory=directory, + validation_split=validation_split, + subset=subset, + label_mode=label_mode, + class_names=class_names, + sampling_rate=sampling_rate, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + + dataset = prepare_dataset( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + class_names=class_names, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + return dataset + + +def prepare_dataset( + dataset, + batch_size, + shuffle, + seed, + class_names, + output_sequence_length, + ragged, +): + dataset = dataset.prefetch(tf.data.AUTOTUNE) + if batch_size is not None: + if shuffle: + dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) + + if output_sequence_length is None and not ragged: + dataset = dataset.padded_batch( + batch_size, padded_shapes=([None, None], []) + ) + else: + dataset = dataset.batch(batch_size) + else: + if shuffle: + dataset = dataset.shuffle(buffer_size=1024, seed=seed) - # Users may need to reference `class_names`. - dataset.class_names = class_names - return dataset + # Users may need to reference `class_names`. + dataset.class_names = class_names + return dataset def get_training_and_validation_dataset( @@ -279,40 +311,48 @@ def get_training_and_validation_dataset( output_sequence_length, ragged, ): - file_paths_train, labels_train = dataset_utils.get_training_or_validation_split( - file_paths, labels, validation_split, "training") - if not file_paths_train: - raise ValueError(f"No training audio files found in directory {directory}. " - f"Allowed format(s): {ALLOWED_FORMATS}") - - file_paths_val, labels_val = dataset_utils.get_training_or_validation_split( - file_paths, labels, validation_split, "validation") - if not file_paths_val: - raise ValueError( - f"No validation audio files found in directory {directory}. 
" - f"Allowed format(s): {ALLOWED_FORMATS}") - - train_dataset = paths_and_labels_to_dataset( - file_paths=file_paths_train, - labels=labels_train, - label_mode=label_mode, - num_classes=len(class_names), - sampling_rate=sampling_rate, - output_sequence_length=output_sequence_length, - ragged=ragged, - ) - - val_dataset = paths_and_labels_to_dataset( - file_paths=file_paths_val, - labels=labels_val, - label_mode=label_mode, - num_classes=len(class_names), - sampling_rate=sampling_rate, - output_sequence_length=output_sequence_length, - ragged=ragged, - ) - - return train_dataset, val_dataset + ( + file_paths_train, + labels_train, + ) = dataset_utils.get_training_or_validation_split( + file_paths, labels, validation_split, "training" + ) + if not file_paths_train: + raise ValueError( + f"No training audio files found in directory {directory}. " + f"Allowed format(s): {ALLOWED_FORMATS}" + ) + + file_paths_val, labels_val = dataset_utils.get_training_or_validation_split( + file_paths, labels, validation_split, "validation" + ) + if not file_paths_val: + raise ValueError( + f"No validation audio files found in directory {directory}. " + f"Allowed format(s): {ALLOWED_FORMATS}" + ) + + train_dataset = paths_and_labels_to_dataset( + file_paths=file_paths_train, + labels=labels_train, + label_mode=label_mode, + num_classes=len(class_names), + sampling_rate=sampling_rate, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + + val_dataset = paths_and_labels_to_dataset( + file_paths=file_paths_val, + labels=labels_val, + label_mode=label_mode, + num_classes=len(class_names), + sampling_rate=sampling_rate, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + + return train_dataset, val_dataset def get_dataset( @@ -327,42 +367,47 @@ def get_dataset( output_sequence_length, ragged, ): - file_paths, labels = dataset_utils.get_training_or_validation_split( - file_paths, labels, validation_split, subset) - if not file_paths: - raise ValueError(f"No audio files found in directory {directory}. " - f"Allowed format(s): {ALLOWED_FORMATS}") - - dataset = paths_and_labels_to_dataset( - file_paths=file_paths, - labels=labels, - label_mode=label_mode, - num_classes=len(class_names), - sampling_rate=sampling_rate, - output_sequence_length=output_sequence_length, - ragged=ragged, - ) - - return dataset - - -def read_and_decode_audio(path, - sampling_rate=None, - output_sequence_length=None): - """Reads and decodes audio file.""" - audio = tf.io.read_file(path) - - if output_sequence_length is None: - output_sequence_length = -1 - - audio, default_audio_rate = tf.audio.decode_wav( - contents=audio, desired_samples=output_sequence_length) - if sampling_rate is not None: - # default_audio_rate should have dtype=int64 - default_audio_rate = tf.cast(default_audio_rate, tf.int64) - audio = tfio.audio.resample( - input=audio, rate_in=default_audio_rate, rate_out=sampling_rate) - return audio + file_paths, labels = dataset_utils.get_training_or_validation_split( + file_paths, labels, validation_split, subset + ) + if not file_paths: + raise ValueError( + f"No audio files found in directory {directory}. 
" + f"Allowed format(s): {ALLOWED_FORMATS}" + ) + + dataset = paths_and_labels_to_dataset( + file_paths=file_paths, + labels=labels, + label_mode=label_mode, + num_classes=len(class_names), + sampling_rate=sampling_rate, + output_sequence_length=output_sequence_length, + ragged=ragged, + ) + + return dataset + + +def read_and_decode_audio( + path, sampling_rate=None, output_sequence_length=None +): + """Reads and decodes audio file.""" + audio = tf.io.read_file(path) + + if output_sequence_length is None: + output_sequence_length = -1 + + audio, default_audio_rate = tf.audio.decode_wav( + contents=audio, desired_samples=output_sequence_length + ) + if sampling_rate is not None: + # default_audio_rate should have dtype=int64 + default_audio_rate = tf.cast(default_audio_rate, tf.int64) + audio = tfio.audio.resample( + input=audio, rate_in=default_audio_rate, rate_out=sampling_rate + ) + return audio def paths_and_labels_to_dataset( @@ -374,20 +419,24 @@ def paths_and_labels_to_dataset( output_sequence_length, ragged, ): - """Constructs a fixed-size dataset of audio and labels.""" - path_ds = tf.data.Dataset.from_tensor_slices(file_paths) - audio_ds = path_ds.map( - lambda x: read_and_decode_audio(x, sampling_rate, output_sequence_length), - num_parallel_calls=tf.data.AUTOTUNE, - ) - - if ragged: - audio_ds = audio_ds.map( - lambda x: tf.RaggedTensor.from_tensor(x), + """Constructs a fixed-size dataset of audio and labels.""" + path_ds = tf.data.Dataset.from_tensor_slices(file_paths) + audio_ds = path_ds.map( + lambda x: read_and_decode_audio( + x, sampling_rate, output_sequence_length + ), num_parallel_calls=tf.data.AUTOTUNE, ) - if label_mode: - label_ds = dataset_utils.labels_to_dataset(labels, label_mode, num_classes) - audio_ds = tf.data.Dataset.zip((audio_ds, label_ds)) - return audio_ds + if ragged: + audio_ds = audio_ds.map( + lambda x: tf.RaggedTensor.from_tensor(x), + num_parallel_calls=tf.data.AUTOTUNE, + ) + + if label_mode: + label_ds = dataset_utils.labels_to_dataset( + labels, label_mode, num_classes + ) + audio_ds = tf.data.Dataset.zip((audio_ds, label_ds)) + return audio_ds diff --git a/keras/utils/audio_dataset_test.py b/keras/utils/audio_dataset_test.py index ed314a2202c3..c32dda318a2e 100644 --- a/keras/utils/audio_dataset_test.py +++ b/keras/utils/audio_dataset_test.py @@ -14,12 +14,12 @@ # ============================================================================== """Tests for audio_dataset.""" -import tensorflow.compat.v2 as tf - import os import shutil import numpy as np +import tensorflow.compat.v2 as tf + from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import audio_dataset @@ -27,356 +27,434 @@ @test_utils.run_v2_only class AudioDatasetFromDirectoryTest(test_combinations.TestCase): - - def _get_audio_samples(self, count=16, different_sequence_lengths=False): - sequence_length = 30 - num_channels = 1 - audio_samples = [] - for _ in range(count): - if different_sequence_lengths: - random_sequence_length = np.random.randint(10, sequence_length + 1) - audio = np.random.random((random_sequence_length, num_channels)) - else: - audio = np.random.random((sequence_length, num_channels)) - audio_samples.append(tf.audio.encode_wav(audio, 1000)) - return audio_samples - - def _prepare_directory( - self, - num_classes=2, - nested_dirs=False, - count=16, - different_sequence_lengths=False, - ): - # Get a unique temp directory - temp_dir = os.path.join(self.get_temp_dir(), str(np.random.randint(1e6))) - 
os.mkdir(temp_dir) - self.addCleanup(shutil.rmtree, temp_dir) - - # Generate paths to class subdirectories - paths = [] - for class_index in range(num_classes): - class_directory = "class_%s" % (class_index,) - if nested_dirs: - class_paths = [ - class_directory, - os.path.join(class_directory, "subfolder_1"), - os.path.join(class_directory, "subfolder_2"), - os.path.join(class_directory, "subfolder_1", "sub-subfolder"), + def _get_audio_samples(self, count=16, different_sequence_lengths=False): + sequence_length = 30 + num_channels = 1 + audio_samples = [] + for _ in range(count): + if different_sequence_lengths: + random_sequence_length = np.random.randint( + 10, sequence_length + 1 + ) + audio = np.random.random((random_sequence_length, num_channels)) + else: + audio = np.random.random((sequence_length, num_channels)) + audio_samples.append(tf.audio.encode_wav(audio, 1000)) + return audio_samples + + def _prepare_directory( + self, + num_classes=2, + nested_dirs=False, + count=16, + different_sequence_lengths=False, + ): + # Get a unique temp directory + temp_dir = os.path.join( + self.get_temp_dir(), str(np.random.randint(1e6)) + ) + os.mkdir(temp_dir) + self.addCleanup(shutil.rmtree, temp_dir) + + # Generate paths to class subdirectories + paths = [] + for class_index in range(num_classes): + class_directory = f"class_{class_index}" + if nested_dirs: + class_paths = [ + class_directory, + os.path.join(class_directory, "subfolder_1"), + os.path.join(class_directory, "subfolder_2"), + os.path.join( + class_directory, "subfolder_1", "sub-subfolder" + ), + ] + else: + class_paths = [class_directory] + for path in class_paths: + os.mkdir(os.path.join(temp_dir, path)) + paths += class_paths + + # Save audio samples to the paths + i = 0 + for audio in self._get_audio_samples( + count=count, different_sequence_lengths=different_sequence_lengths + ): + path = paths[i % len(paths)] + ext = "wav" + filename = os.path.join(path, f"audio_{i}.{ext}") + with open(os.path.join(temp_dir, filename), "wb") as f: + f.write(audio.numpy()) + i += 1 + return temp_dir + + def test_audio_dataset_from_directory_standalone(self): + # Test retrieving audio samples without labels from a directory and its + # subdirs. + # Save a few extra audio files in the parent directory. 
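+ # (With labels=None, files located directly under `directory` are
+ # indexed as well, so the 3 extra samples here join the 7 under the
+ # class folders: 10 samples in total, as asserted below.)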
+ directory = self._prepare_directory(count=7, num_classes=2) + for i, audio in enumerate(self._get_audio_samples(3)): + filename = f"audio_{i}.wav" + with open(os.path.join(directory, filename), "wb") as f: + f.write(audio.numpy()) + + dataset = audio_dataset.audio_dataset_from_directory( + directory, batch_size=5, output_sequence_length=30, labels=None + ) + batch = next(iter(dataset)) + # We return plain audio + self.assertEqual(batch.shape, (5, 30, 1)) + self.assertEqual(batch.dtype.name, "float32") + # Count samples + batch_count = 0 + sample_count = 0 + for batch in dataset: + batch_count += 1 + sample_count += batch.shape[0] + self.assertEqual(batch_count, 2) + self.assertEqual(sample_count, 10) + + def test_audio_dataset_from_directory_binary(self): + directory = self._prepare_directory(num_classes=2) + dataset = audio_dataset.audio_dataset_from_directory( + directory, batch_size=8, output_sequence_length=30, label_mode="int" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 30, 1)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8,)) + self.assertEqual(batch[1].dtype.name, "int32") + + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=8, + output_sequence_length=30, + label_mode="binary", + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 30, 1)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8, 1)) + self.assertEqual(batch[1].dtype.name, "float32") + + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=8, + output_sequence_length=30, + label_mode="categorical", + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 30, 1)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8, 2)) + self.assertEqual(batch[1].dtype.name, "float32") + + def test_static_shape_in_graph(self): + directory = self._prepare_directory(num_classes=2) + dataset = audio_dataset.audio_dataset_from_directory( + directory, batch_size=8, output_sequence_length=30, label_mode="int" + ) + test_case = self + + @tf.function + def symbolic_fn(ds): + for x, _ in ds.take(1): + test_case.assertListEqual(x.shape.as_list(), [None, 30, None]) + + symbolic_fn(dataset) + + def test_sample_count(self): + directory = self._prepare_directory(num_classes=4, count=15) + dataset = audio_dataset.audio_dataset_from_directory( + directory, batch_size=8, output_sequence_length=30, label_mode=None + ) + sample_count = 0 + for batch in dataset: + sample_count += batch.shape[0] + self.assertEqual(sample_count, 15) + + def test_audio_dataset_from_directory_multiclass(self): + directory = self._prepare_directory(num_classes=4, count=15) + + dataset = audio_dataset.audio_dataset_from_directory( + directory, batch_size=8, output_sequence_length=30, label_mode=None + ) + batch = next(iter(dataset)) + self.assertEqual(batch.shape, (8, 30, 1)) + + dataset = audio_dataset.audio_dataset_from_directory( + directory, batch_size=8, output_sequence_length=30, label_mode=None + ) + sample_count = 0 + iterator = iter(dataset) + for batch in dataset: + sample_count += next(iterator).shape[0] + self.assertEqual(sample_count, 15) + + dataset = audio_dataset.audio_dataset_from_directory( + directory, batch_size=8, output_sequence_length=30, label_mode="int" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + 
self.assertEqual(batch[0].shape, (8, 30, 1)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8,)) + self.assertEqual(batch[1].dtype.name, "int32") + + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=8, + output_sequence_length=30, + label_mode="categorical", + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 30, 1)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8, 4)) + self.assertEqual(batch[1].dtype.name, "float32") + + def test_audio_dataset_from_directory_validation_split(self): + directory = self._prepare_directory(num_classes=2, count=10) + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=10, + output_sequence_length=30, + validation_split=0.2, + subset="training", + seed=1337, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 30, 1)) + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=10, + output_sequence_length=30, + validation_split=0.2, + subset="validation", + seed=1337, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (2, 30, 1)) + + def test_audio_dataset_from_directory_manual_labels(self): + directory = self._prepare_directory(num_classes=2, count=2) + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=8, + output_sequence_length=30, + labels=[0, 1], + shuffle=False, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertAllClose(batch[1], [0, 1]) + + def test_audio_dataset_from_directory_follow_links(self): + directory = self._prepare_directory( + num_classes=2, count=25, nested_dirs=True + ) + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=8, + output_sequence_length=30, + label_mode=None, + follow_links=True, + ) + sample_count = 0 + for batch in dataset: + sample_count += batch.shape[0] + self.assertEqual(sample_count, 25) + + def test_audio_dataset_from_directory_no_audio(self): + directory = self._prepare_directory(num_classes=2, count=0) + with self.assertRaisesRegex( + ValueError, "No audio files found in directory" + ): + _ = audio_dataset.audio_dataset_from_directory(directory) + + def test_audio_dataset_from_directory_ragged(self): + directory = self._prepare_directory( + num_classes=2, count=16, different_sequence_lengths=True + ) + dataset = audio_dataset.audio_dataset_from_directory( + directory, ragged=True, batch_size=8 + ) + batch = next(iter(dataset)) + + self.assertEqual(batch[0].shape.as_list(), [8, None, None]) + + def test_audio_dataset_from_directory_no_output_sequence_length_no_ragged( + self, + ): + # This test case tests `audio_dataset_from_directory` when `ragged` and + # `output_sequence_length` are not passed while the input sequence + # lengths are different. 
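+ # (With batch_size=2 and per-file lengths drawn from [10, 30],
+ # padded_batch pads each batch to its longest element, so every
+ # observed batch sequence length must fall within [10, 30].)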
+ directory = self._prepare_directory( + num_classes=2, count=16, different_sequence_lengths=True + ) + # The tensor shapes are different and output_sequence_length is None; + # batching should work fine and pad each sequence to the length of the + # longest sequence in its batch + min_sequence_length, max_sequence_length = 10, 30 + possible_sequence_lengths = [ + i for i in range(min_sequence_length, max_sequence_length + 1) ] - else: - class_paths = [class_directory] - for path in class_paths: - os.mkdir(os.path.join(temp_dir, path)) - paths += class_paths - - # Save audio samples to the paths - i = 0 - for audio in self._get_audio_samples( - count=count, different_sequence_lengths=different_sequence_lengths): - path = paths[i % len(paths)] - ext = "wav" - filename = os.path.join(path, "audio_%s.%s" % (i, ext)) - with open(os.path.join(temp_dir, filename), "wb") as f: - f.write(audio.numpy()) - i += 1 - return temp_dir - - def test_audio_dataset_from_directory_standalone(self): - # Test retrieving audio samples withouts labels from a directory and its subdirs. - - # Save a few extra audio in the parent directory. - directory = self._prepare_directory(count=7, num_classes=2) - for i, audio in enumerate(self._get_audio_samples(3)): - filename = "audio_%s.wav" % (i,) - with open(os.path.join(directory, filename), "wb") as f: - f.write(audio.numpy()) - - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=5, output_sequence_length=30, labels=None) - batch = next(iter(dataset)) - # We return plain audio - self.assertEqual(batch.shape, (5, 30, 1)) - self.assertEqual(batch.dtype.name, "float32") - # Count samples - batch_count = 0 - sample_count = 0 - for batch in dataset: - batch_count += 1 - sample_count += batch.shape[0] - self.assertEqual(batch_count, 2) - self.assertEqual(sample_count, 10) - - def test_audio_dataset_from_directory_binary(self): - directory = self._prepare_directory(num_classes=2) - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=8, output_sequence_length=30, label_mode="int") - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 30, 1)) - self.assertEqual(batch[0].dtype.name, "float32") - self.assertEqual(batch[1].shape, (8,)) - self.assertEqual(batch[1].dtype.name, "int32") - - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=8, output_sequence_length=30, label_mode="binary") - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 30, 1)) - self.assertEqual(batch[0].dtype.name, "float32") - self.assertEqual(batch[1].shape, (8, 1)) - self.assertEqual(batch[1].dtype.name, "float32") - - dataset = audio_dataset.audio_dataset_from_directory( - directory, - batch_size=8, - output_sequence_length=30, - label_mode="categorical") - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 30, 1)) - self.assertEqual(batch[0].dtype.name, "float32") - self.assertEqual(batch[1].shape, (8, 2)) - self.assertEqual(batch[1].dtype.name, "float32") - - def test_static_shape_in_graph(self): - directory = self._prepare_directory(num_classes=2) - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=8, output_sequence_length=30, label_mode="int") - test_case = self - - @tf.function - def symbolic_fn(ds): - for x, _ in ds.take(1): - test_case.assertListEqual(x.shape.as_list(), [None, 30, None]) - - symbolic_fn(dataset) - - def test_sample_count(self): - directory = 
self._prepare_directory(num_classes=4, count=15) - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=8, output_sequence_length=30, label_mode=None) - sample_count = 0 - for batch in dataset: - sample_count += batch.shape[0] - self.assertEqual(sample_count, 15) - - def test_audio_dataset_from_directory_multiclass(self): - directory = self._prepare_directory(num_classes=4, count=15) - - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=8, output_sequence_length=30, label_mode=None) - batch = next(iter(dataset)) - self.assertEqual(batch.shape, (8, 30, 1)) - - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=8, output_sequence_length=30, label_mode=None) - sample_count = 0 - iterator = iter(dataset) - for batch in dataset: - sample_count += next(iterator).shape[0] - self.assertEqual(sample_count, 15) - - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=8, output_sequence_length=30, label_mode="int") - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 30, 1)) - self.assertEqual(batch[0].dtype.name, "float32") - self.assertEqual(batch[1].shape, (8,)) - self.assertEqual(batch[1].dtype.name, "int32") - - dataset = audio_dataset.audio_dataset_from_directory( - directory, - batch_size=8, - output_sequence_length=30, - label_mode="categorical") - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 30, 1)) - self.assertEqual(batch[0].dtype.name, "float32") - self.assertEqual(batch[1].shape, (8, 4)) - self.assertEqual(batch[1].dtype.name, "float32") - - def test_audio_dataset_from_directory_validation_split(self): - directory = self._prepare_directory(num_classes=2, count=10) - dataset = audio_dataset.audio_dataset_from_directory( - directory, - batch_size=10, - output_sequence_length=30, - validation_split=0.2, - subset="training", - seed=1337, - ) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 30, 1)) - dataset = audio_dataset.audio_dataset_from_directory( - directory, - batch_size=10, - output_sequence_length=30, - validation_split=0.2, - subset="validation", - seed=1337, - ) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (2, 30, 1)) - - def test_audio_dataset_from_directory_manual_labels(self): - directory = self._prepare_directory(num_classes=2, count=2) - dataset = audio_dataset.audio_dataset_from_directory( - directory, - batch_size=8, - output_sequence_length=30, - labels=[0, 1], - shuffle=False, - ) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertAllClose(batch[1], [0, 1]) - - def test_audio_dataset_from_directory_follow_links(self): - directory = self._prepare_directory( - num_classes=2, count=25, nested_dirs=True) - dataset = audio_dataset.audio_dataset_from_directory( - directory, - batch_size=8, - output_sequence_length=30, - label_mode=None, - follow_links=True, - ) - sample_count = 0 - for batch in dataset: - sample_count += batch.shape[0] - self.assertEqual(sample_count, 25) - - def test_audio_dataset_from_directory_no_audio(self): - directory = self._prepare_directory(num_classes=2, count=0) - with self.assertRaisesRegex(ValueError, - "No audio files found in directory"): - _ = audio_dataset.audio_dataset_from_directory(directory) - - def test_audio_dataset_from_directory_ragged(self): - directory = self._prepare_directory( - num_classes=2, count=16, 
different_sequence_lengths=True) - dataset = audio_dataset.audio_dataset_from_directory( - directory, ragged=True, batch_size=8) - batch = next(iter(dataset)) - - self.assertEqual(batch[0].shape.as_list(), [8, None, None]) - - def test_audio_dataset_from_directory_no_output_sequence_length_no_ragged( - self): - # This test case tests `audio_dataset_from_directory` when `ragged` and `output_sequence_length` - # are not passed while the input sequence lengths are different. - directory = self._prepare_directory( - num_classes=2, count=16, different_sequence_lengths=True) - # The tensor shapes are different and output_sequence_length is None - # should work fine and pad each sequence to the length of the longest sequence - # in it's batch - min_sequence_length, max_sequence_length = 10, 30 - possible_sequence_lengths = [ - i for i in range(min_sequence_length, max_sequence_length + 1) - ] - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=2) - sequence_lengths = list(set([b.shape[1] for b, _ in dataset])) - for seq_len in sequence_lengths: - self.assertIn(seq_len, possible_sequence_lengths) - - def test_audio_dataset_from_directory_no_output_sequence_length_same_lengths( - self): - # This test case tests `audio_dataset_from_directory` when `ragged` and `output_sequence_length` - # are not passed while the input sequence lengths are the same - directory = self._prepare_directory( - num_classes=2, count=16, different_sequence_lengths=False) - # The tensor shapes are different and output_sequence_length is None - # should work fine and pad each sequence to the length of the longest sequence - # in it's batch - dataset = audio_dataset.audio_dataset_from_directory( - directory, batch_size=2) - sequence_lengths = list(set([batch[0].shape[1] for batch in dataset])) - self.assertEqual(len(sequence_lengths), 1) - - def test_audio_dataset_from_directory_errors(self): - directory = self._prepare_directory(num_classes=3, count=5) - - with self.assertRaisesRegex( - ValueError, "`sampling_rate` should be higher than 0. Received:"): - _ = audio_dataset.audio_dataset_from_directory( - directory, ragged=False, output_sequence_length=10, sampling_rate=-1) - - with self.assertRaisesRegex( - ValueError, "`sampling_rate` should have an integer value. 
Received:"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, ragged=False, output_sequence_length=10, sampling_rate=1.2)
-
-    with self.assertRaisesRegex(
-        ValueError, "Cannot set both `ragged` and `output_sequence_length`"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, ragged=True, output_sequence_length=30)
-
-    with self.assertRaisesRegex(ValueError, "`labels` argument should be"):
-      _ = audio_dataset.audio_dataset_from_directory(directory, labels="other")
-
-    with self.assertRaisesRegex(ValueError, "`label_mode` argument must be"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, label_mode="other")
-
-    with self.assertRaisesRegex(
-        ValueError, 'only pass `class_names` if `labels="inferred"`'):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory,
-          labels=[0, 0, 1, 1, 1],
-          class_names=["class_0", "class_1", "class_2"],
-      )
-
-    with self.assertRaisesRegex(
-        ValueError,
-        "Expected the lengths of `labels` to match the number of files"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, labels=[0, 0, 1, 1])
-
-    with self.assertRaisesRegex(ValueError,
-                                "`class_names` passed did not match"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, class_names=["class_0", "class_2"])
-
-    with self.assertRaisesRegex(ValueError, "there must be exactly 2"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, label_mode="binary")
-
-    with self.assertRaisesRegex(ValueError,
-                                "`validation_split` must be between 0 and 1"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, validation_split=2)
-
-    with self.assertRaisesRegex(ValueError,
-                                '`subset` must be either "training",'):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, validation_split=0.2, subset="other")
-
-    with self.assertRaisesRegex(ValueError, "`validation_split` must be set"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, validation_split=0, subset="training")
-
-    with self.assertRaisesRegex(ValueError, "must provide a `seed`"):
-      _ = audio_dataset.audio_dataset_from_directory(
-          directory, validation_split=0.2, subset="training")
-
-  def test_audio_dataset_from_directory_not_batched(self):
-    directory = self._prepare_directory(num_classes=2, count=2)
-    dataset = audio_dataset.audio_dataset_from_directory(
-        directory,
-        batch_size=None,
-        output_sequence_length=30,
-        label_mode=None,
-        shuffle=False,
-    )
-    sample = next(iter(dataset))
-    self.assertEqual(len(sample.shape), 2)
+        dataset = audio_dataset.audio_dataset_from_directory(
+            directory, batch_size=2
+        )
+        sequence_lengths = list(set([b.shape[1] for b, _ in dataset]))
+        for seq_len in sequence_lengths:
+            self.assertIn(seq_len, possible_sequence_lengths)
+
+    def test_audio_dataset_from_directory_no_output_sequence_length_same_lengths(  # noqa: E501
+        self,
+    ):
+        # This test case tests `audio_dataset_from_directory` when `ragged` and
+        # `output_sequence_length` are not passed while the input sequence
+        # lengths are the same
+        directory = self._prepare_directory(
+            num_classes=2, count=16, different_sequence_lengths=False
+        )
+        # All sequences have the same length here, so leaving
+        # output_sequence_length as None should work fine and batch the
+        # sequences without any padding.
+        dataset = audio_dataset.audio_dataset_from_directory(
+            directory, batch_size=2
+        )
+        sequence_lengths = list(set([batch[0].shape[1] for batch in dataset]))
+        self.assertEqual(len(sequence_lengths), 1)
+
+    def test_audio_dataset_from_directory_errors(self):
+        
directory = self._prepare_directory(num_classes=3, count=5) + + with self.assertRaisesRegex( + ValueError, "`sampling_rate` should be higher than 0. Received:" + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, + ragged=False, + output_sequence_length=10, + sampling_rate=-1, + ) + + with self.assertRaisesRegex( + ValueError, + "`sampling_rate` should have an integer value. Received:", + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, + ragged=False, + output_sequence_length=10, + sampling_rate=1.2, + ) + + # Only run this test case when we don't have tensorflow_io. + try: + import tensorflow_io # noqa: F401 + except ImportError: + with self.assertRaisesRegex( + ImportError, + "To use the argument `sampling_rate`.*tensorflow_io.*", + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, + ragged=False, + output_sequence_length=10, + sampling_rate=44100, + ) + + with self.assertRaisesRegex( + ValueError, "Cannot set both `ragged` and `output_sequence_length`" + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, ragged=True, output_sequence_length=30 + ) + + with self.assertRaisesRegex(ValueError, "`labels` argument should be"): + _ = audio_dataset.audio_dataset_from_directory( + directory, labels="other" + ) + + with self.assertRaisesRegex( + ValueError, "`label_mode` argument must be" + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, label_mode="other" + ) + + with self.assertRaisesRegex( + ValueError, 'only pass `class_names` if `labels="inferred"`' + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, + labels=[0, 0, 1, 1, 1], + class_names=["class_0", "class_1", "class_2"], + ) + + with self.assertRaisesRegex( + ValueError, + "Expected the lengths of `labels` to match the number of files", + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, labels=[0, 0, 1, 1] + ) + + with self.assertRaisesRegex( + ValueError, "`class_names` passed did not match" + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, class_names=["class_0", "class_2"] + ) + + with self.assertRaisesRegex(ValueError, "there must be exactly 2"): + _ = audio_dataset.audio_dataset_from_directory( + directory, label_mode="binary" + ) + + with self.assertRaisesRegex( + ValueError, "`validation_split` must be between 0 and 1" + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, validation_split=2 + ) + + with self.assertRaisesRegex( + ValueError, '`subset` must be either "training",' + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, validation_split=0.2, subset="other" + ) + + with self.assertRaisesRegex( + ValueError, "`validation_split` must be set" + ): + _ = audio_dataset.audio_dataset_from_directory( + directory, validation_split=0, subset="training" + ) + + with self.assertRaisesRegex(ValueError, "must provide a `seed`"): + _ = audio_dataset.audio_dataset_from_directory( + directory, validation_split=0.2, subset="training" + ) + + def test_audio_dataset_from_directory_not_batched(self): + directory = self._prepare_directory(num_classes=2, count=2) + dataset = audio_dataset.audio_dataset_from_directory( + directory, + batch_size=None, + output_sequence_length=30, + label_mode=None, + shuffle=False, + ) + sample = next(iter(dataset)) + self.assertEqual(len(sample.shape), 2) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/utils/audio_dataset_with_tfio_test.py b/keras/utils/audio_dataset_with_tfio_test.py new file mode 100644 index 
000000000000..75689d29c7ac
--- /dev/null
+++ b/keras/utils/audio_dataset_with_tfio_test.py
@@ -0,0 +1,129 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for audio_dataset when tfio is available."""
+
+import os
+import shutil
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+
+from keras.testing_infra import test_combinations
+from keras.testing_infra import test_utils
+from keras.utils import audio_dataset
+
+
+@test_utils.run_v2_only
+class AudioDatasetFromDirectoryWithTfioTest(test_combinations.TestCase):
+    def _get_audio_samples(self, count=16, different_sequence_lengths=False):
+        sequence_length = 30
+        num_channels = 1
+        audio_samples = []
+        for _ in range(count):
+            if different_sequence_lengths:
+                random_sequence_length = np.random.randint(
+                    10, sequence_length + 1
+                )
+                audio = np.random.random((random_sequence_length, num_channels))
+            else:
+                audio = np.random.random((sequence_length, num_channels))
+            audio_samples.append(tf.audio.encode_wav(audio, 1000))
+        return audio_samples
+
+    def _prepare_directory(
+        self,
+        num_classes=2,
+        nested_dirs=False,
+        count=16,
+        different_sequence_lengths=False,
+    ):
+        # Get a unique temp directory
+        temp_dir = os.path.join(
+            self.get_temp_dir(), str(np.random.randint(1e6))
+        )
+        os.mkdir(temp_dir)
+        self.addCleanup(shutil.rmtree, temp_dir)
+
+        # Generate paths to class subdirectories
+        paths = []
+        for class_index in range(num_classes):
+            class_directory = f"class_{class_index}"
+            if nested_dirs:
+                class_paths = [
+                    class_directory,
+                    os.path.join(class_directory, "subfolder_1"),
+                    os.path.join(class_directory, "subfolder_2"),
+                    os.path.join(
+                        class_directory, "subfolder_1", "sub-subfolder"
+                    ),
+                ]
+            else:
+                class_paths = [class_directory]
+            for path in class_paths:
+                os.mkdir(os.path.join(temp_dir, path))
+            paths += class_paths
+
+        # Save audio samples to the paths
+        i = 0
+        for audio in self._get_audio_samples(
+            count=count, different_sequence_lengths=different_sequence_lengths
+        ):
+            path = paths[i % len(paths)]
+            ext = "wav"
+            filename = os.path.join(path, f"audio_{i}.{ext}")
+            with open(os.path.join(temp_dir, filename), "wb") as f:
+                f.write(audio.numpy())
+            i += 1
+        return temp_dir
+
+    def test_audio_dataset_from_directory_standalone_with_resampling(self):
+        # Test retrieving audio samples without labels from a directory and
+        # its subdirs where we double the sampling rate.
+        # Save a few extra audio files in the parent directory.
+        directory = self._prepare_directory(count=7, num_classes=2)
+        for i, audio in enumerate(self._get_audio_samples(3)):
+            filename = f"audio_{i}.wav"
+            with open(os.path.join(directory, filename), "wb") as f:
+                f.write(audio.numpy())
+
+        dataset = audio_dataset.audio_dataset_from_directory(
+            directory,
+            batch_size=5,
+            output_sequence_length=30,
+            labels=None,
+            sampling_rate=2000,  # Twice the original sample rate.
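+            # With tensorflow_io installed, resampling rescales each clip by
+            # sampling_rate / file rate: the 30-sample clips encoded at
+            # 1000 Hz above come back as 60 samples at 2000 Hz.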
+ ) + batch = next(iter(dataset)) + # We return plain audio. Expect twice as many samples now. + self.assertEqual(batch.shape, (5, 60, 1)) + self.assertEqual(batch.dtype.name, "float32") + # Count samples + batch_count = 0 + sample_count = 0 + for batch in dataset: + batch_count += 1 + sample_count += batch.shape[0] + self.assertEqual(batch_count, 2) + self.assertEqual(sample_count, 10) + + +if __name__ == "__main__": + try: + import tensorflow_io # noqa: F401 + + # Only run these tests if tensorflow_io is installed. + tf.test.main() + except ImportError: + pass diff --git a/keras/utils/composite_tensor_support_test.py b/keras/utils/composite_tensor_support_test.py index ae2e8f6f1f3e..25ce0cfd5451 100644 --- a/keras/utils/composite_tensor_support_test.py +++ b/keras/utils/composite_tensor_support_test.py @@ -14,296 +14,303 @@ # ============================================================================== """Tests for Keras composite tensor support.""" -import tensorflow.compat.v2 as tf - -from absl.testing import parameterized - import numpy as np import scipy.sparse +import tensorflow.compat.v2 as tf +from absl.testing import parameterized import keras -from keras.testing_infra import test_combinations -from keras.testing_infra import test_utils from keras.engine import input_layer -from keras.layers import core from keras.layers import Dense from keras.layers import Embedding from keras.layers import Layer +from keras.layers import core +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils # Define test-only Layer classes to validate passing Sparse and Ragged tensors # between layers. class ToDense(Layer): - """Create a dense (standard) tensor from the given input tensor.""" - - def __init__(self, default_value, **kwargs): - super().__init__(**kwargs) - self._default_value = default_value - - def call(self, inputs): - if isinstance(inputs, dict): # Dicts are no longer flattened. - # Always a single element in these tests. - inputs = tf.nest.flatten(inputs)[0] - - if isinstance(inputs, tf.RaggedTensor): - output = inputs.to_tensor(default_value=self._default_value) - elif isinstance(inputs, tf.SparseTensor): - output = tf.sparse.to_dense( - inputs, default_value=self._default_value) - elif isinstance(inputs, tf.Tensor): - output = inputs - else: - raise TypeError("Unexpected tensor type %s" % type(inputs).__name__) + """Create a dense (standard) tensor from the given input tensor.""" + + def __init__(self, default_value, **kwargs): + super().__init__(**kwargs) + self._default_value = default_value - # Return a float so that we can compile models with this as the final layer. - return tf.cast(output, tf.float32) + def call(self, inputs): + if isinstance(inputs, dict): # Dicts are no longer flattened. + # Always a single element in these tests. + inputs = tf.nest.flatten(inputs)[0] + + if isinstance(inputs, tf.RaggedTensor): + output = inputs.to_tensor(default_value=self._default_value) + elif isinstance(inputs, tf.SparseTensor): + output = tf.sparse.to_dense( + inputs, default_value=self._default_value + ) + elif isinstance(inputs, tf.Tensor): + output = inputs + else: + raise TypeError(f"Unexpected tensor type {type(inputs).__name__}") + + # Return a float so that we can compile models with this as the final + # layer. 
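+        # For example, a ragged [[1], [2, 3]] with default_value=-1 becomes
+        # the dense tensor [[1., -1.], [2., 3.]].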
+ return tf.cast(output, tf.float32) class ToRagged(Layer): - """Create a ragged tensor based on a given dense tensor.""" + """Create a ragged tensor based on a given dense tensor.""" - def __init__(self, padding, ragged_rank=1, **kwargs): - super().__init__(**kwargs) - self._padding = padding - self._ragged_rank = ragged_rank + def __init__(self, padding, ragged_rank=1, **kwargs): + super().__init__(**kwargs) + self._padding = padding + self._ragged_rank = ragged_rank - def call(self, inputs): - return tf.RaggedTensor.from_tensor( - inputs, padding=self._padding, ragged_rank=self._ragged_rank) + def call(self, inputs): + return tf.RaggedTensor.from_tensor( + inputs, padding=self._padding, ragged_rank=self._ragged_rank + ) class ToSparse(Layer): - """Create a sparse tensor based on a given dense tensor.""" + """Create a sparse tensor based on a given dense tensor.""" - def call(self, inputs): - indices = tf.where(tf.not_equal(inputs, 0)) - values = tf.gather_nd(inputs, indices) - shape = tf.shape(inputs, out_type=tf.int64) - return tf.SparseTensor(indices, values, dense_shape=shape) + def call(self, inputs): + indices = tf.where(tf.not_equal(inputs, 0)) + values = tf.gather_nd(inputs, indices) + shape = tf.shape(inputs, out_type=tf.int64) + return tf.SparseTensor(indices, values, dense_shape=shape) class _SubclassModel(keras.Model): - """A Keras subclass model.""" - - def __init__(self, layers, i_layer=None): - super().__init__() - # Note that clone and build doesn't support lists of layers in subclassed - # models. Adding each layer directly here. - for i, layer in enumerate(layers): - setattr(self, self._layer_name_for_i(i), layer) - self.num_layers = len(layers) - if i_layer is not None: - self._set_inputs(i_layer) - - def _layer_name_for_i(self, i): - return "layer{}".format(i) - - def call(self, inputs, **kwargs): - x = inputs - for i in range(self.num_layers): - layer = getattr(self, self._layer_name_for_i(i)) - x = layer(x) - return x - - -def get_model_from_layers_with_input(layers, - input_shape=None, - input_dtype=None, - model_input=None): - """Builds a model from a sequence of layers.""" - if model_input is not None and input_shape is not None: - raise ValueError("Cannot specify a model_input and an input shape.") - - model_type = test_utils.get_model_type() - if model_type == "subclass": - return _SubclassModel(layers, model_input) - - if model_type == "sequential": - model = keras.models.Sequential() - if model_input is not None: - model.add(model_input) - elif input_shape is not None: - model.add(keras.Input(shape=input_shape, dtype=input_dtype)) - for layer in layers: - model.add(layer) - return model - - if model_type == "functional": - if model_input is not None: - inputs = model_input - else: - if not input_shape: - raise ValueError("Cannot create a functional model from layers with no " - "input shape.") - inputs = keras.Input(shape=input_shape, dtype=input_dtype) - outputs = inputs - for layer in layers: - outputs = layer(outputs) - return keras.Model(inputs, outputs) - - raise ValueError("Unknown model type {}".format(model_type)) + """A Keras subclass model.""" + + def __init__(self, layers, i_layer=None): + super().__init__() + # Note that clone and build doesn't support lists of layers in + # subclassed models. Adding each layer directly here. 
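+        # e.g. layers[0] is stored as self.layer0 and layers[1] as
+        # self.layer1 (see _layer_name_for_i below).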
+ for i, layer in enumerate(layers): + setattr(self, self._layer_name_for_i(i), layer) + self.num_layers = len(layers) + if i_layer is not None: + self._set_inputs(i_layer) + + def _layer_name_for_i(self, i): + return f"layer{i}" + + def call(self, inputs, **kwargs): + x = inputs + for i in range(self.num_layers): + layer = getattr(self, self._layer_name_for_i(i)) + x = layer(x) + return x + + +def get_model_from_layers_with_input( + layers, input_shape=None, input_dtype=None, model_input=None +): + """Builds a model from a sequence of layers.""" + if model_input is not None and input_shape is not None: + raise ValueError("Cannot specify a model_input and an input shape.") + + model_type = test_utils.get_model_type() + if model_type == "subclass": + return _SubclassModel(layers, model_input) + + if model_type == "sequential": + model = keras.models.Sequential() + if model_input is not None: + model.add(model_input) + elif input_shape is not None: + model.add(keras.Input(shape=input_shape, dtype=input_dtype)) + for layer in layers: + model.add(layer) + return model + + if model_type == "functional": + if model_input is not None: + inputs = model_input + else: + if not input_shape: + raise ValueError( + "Cannot create a functional model from layers with no " + "input shape." + ) + inputs = keras.Input(shape=input_shape, dtype=input_dtype) + outputs = inputs + for layer in layers: + outputs = layer(outputs) + return keras.Model(inputs, outputs) + + raise ValueError(f"Unknown model type {model_type}") def get_test_mode_kwargs(): - run_eagerly = test_utils.should_run_eagerly() - return { - "run_eagerly": run_eagerly, - } + run_eagerly = test_utils.should_run_eagerly() + return { + "run_eagerly": run_eagerly, + } @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes class CompositeTensorInternalTest(test_combinations.TestCase): - - def test_internal_ragged_tensors(self): - # Create a model that accepts an input, converts it to Ragged, and - # converts the ragged tensor back to a dense tensor. - layers = [ToRagged(padding=0), ToDense(default_value=-1)] - model = test_utils.get_model_from_layers(layers, input_shape=(None,)) - - # Define some input data with additional padding. - input_data = np.array([[1, 0, 0], [2, 3, 0]]) - expected_output = np.array([[1, -1], [2, 3]]) - output = model.predict(input_data) - self.assertAllEqual(expected_output, output) - - def test_internal_sparse_tensors(self): - # Create a model that accepts an input, converts it to Sparse, and - # converts the sparse tensor back to a dense tensor. - layers = [ToSparse(), ToDense(default_value=-1)] - model = test_utils.get_model_from_layers(layers, input_shape=(None,)) - - # Define some input data with additional padding. - input_data = np.array([[1, 0, 0], [2, 3, 0]]) - expected_output = np.array([[1, -1, -1], [2, 3, -1]]) - output = model.predict(input_data) - self.assertAllEqual(expected_output, output) - - def test_training_internal_ragged_tensors(self): - # Create a model that implements y=Mx. This is easy to learn and will - # demonstrate appropriate gradient passing. (We have to use RaggedTensors - # for this test, as ToSparse() doesn't support gradient propagation through - # the layer.) TODO(b/124796939): Investigate this. 
- layers = [core.Dense(2), ToRagged(padding=0), ToDense(default_value=-1)] - model = test_utils.get_model_from_layers(layers, input_shape=(1,)) - - input_data = np.random.rand(1024, 1) - expected_data = np.concatenate((input_data * 3, input_data * .5), axis=-1) - - model.compile(loss="mse", optimizer="adam", **get_test_mode_kwargs()) - history = model.fit(input_data, expected_data, epochs=10, verbose=0) - - # If the model trained, the loss stored at history[0] should be different - # than the one stored at history[-1]. - self.assertNotEqual(history.history["loss"][-1], history.history["loss"][0]) + def test_internal_ragged_tensors(self): + # Create a model that accepts an input, converts it to Ragged, and + # converts the ragged tensor back to a dense tensor. + layers = [ToRagged(padding=0), ToDense(default_value=-1)] + model = test_utils.get_model_from_layers(layers, input_shape=(None,)) + + # Define some input data with additional padding. + input_data = np.array([[1, 0, 0], [2, 3, 0]]) + expected_output = np.array([[1, -1], [2, 3]]) + output = model.predict(input_data) + self.assertAllEqual(expected_output, output) + + def test_internal_sparse_tensors(self): + # Create a model that accepts an input, converts it to Sparse, and + # converts the sparse tensor back to a dense tensor. + layers = [ToSparse(), ToDense(default_value=-1)] + model = test_utils.get_model_from_layers(layers, input_shape=(None,)) + + # Define some input data with additional padding. + input_data = np.array([[1, 0, 0], [2, 3, 0]]) + expected_output = np.array([[1, -1, -1], [2, 3, -1]]) + output = model.predict(input_data) + self.assertAllEqual(expected_output, output) + + def test_training_internal_ragged_tensors(self): + # Create a model that implements y=Mx. This is easy to learn and will + # demonstrate appropriate gradient passing. (We have to use + # RaggedTensors for this test, as ToSparse() doesn't support gradient + # propagation through the layer.) TODO(b/124796939): Investigate this. + layers = [core.Dense(2), ToRagged(padding=0), ToDense(default_value=-1)] + model = test_utils.get_model_from_layers(layers, input_shape=(1,)) + + input_data = np.random.rand(1024, 1) + expected_data = np.concatenate( + (input_data * 3, input_data * 0.5), axis=-1 + ) + + model.compile(loss="mse", optimizer="adam", **get_test_mode_kwargs()) + history = model.fit(input_data, expected_data, epochs=10, verbose=0) + + # If the model trained, the loss stored at history[0] should be + # different than the one stored at history[-1]. + self.assertNotEqual( + history.history["loss"][-1], history.history["loss"][0] + ) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes class CompositeTensorOutputTest(test_combinations.TestCase): - - def test_ragged_tensor_outputs(self): - # Create a model that accepts an input, converts it to Ragged, and - # converts the ragged tensor back to a dense tensor. - layers = [ToRagged(padding=0)] - model = test_utils.get_model_from_layers(layers, input_shape=(None,)) - model._run_eagerly = test_utils.should_run_eagerly() - - # Define some input data with additional padding. - input_data = np.array([[1, 0, 0], [2, 3, 0]]) - output = model.predict(input_data) - - expected_values = [[1], [2, 3]] - self.assertAllEqual(expected_values, output) - - def test_ragged_tensor_rebatched_outputs(self): - # Create a model that accepts an input, converts it to Ragged, and - # converts the ragged tensor back to a dense tensor. 
-    layers = [ToRagged(padding=0)]
-    model = test_utils.get_model_from_layers(layers, input_shape=(None,))
-    model._run_eagerly = test_utils.should_run_eagerly()
-
-    # Define some input data with additional padding.
-    input_data = np.array([[1, 0, 0], [2, 3, 0], [4, 0, 0], [5, 6, 0]])
-    output = model.predict(input_data, batch_size=2)
-
-    expected_values = [[1], [2, 3], [4], [5, 6]]
-    self.assertAllEqual(expected_values, output)
-
-  def test_sparse_tensor_outputs(self):
-    # Create a model that accepts an input, converts it to Ragged, and
-    # converts the ragged tensor back to a dense tensor.
-    layers = [ToSparse()]
-    model = test_utils.get_model_from_layers(layers, input_shape=(None,))
-    model._run_eagerly = test_utils.should_run_eagerly()
-
-    # Define some input data with additional padding.
-    input_data = np.array([[1, 0, 0], [2, 3, 0]])
-    output = model.predict(input_data)
-
-    expected_indices = np.array([[0, 0], [1, 0], [1, 1]])
-    expected_values = np.array([1, 2, 3])
-    expected_dense_shape = np.array([2, 3])
-
-    self.assertAllEqual(output.indices, expected_indices)
-    self.assertAllEqual(output.values, expected_values)
-    self.assertAllEqual(output.dense_shape, expected_dense_shape)
-
-  def test_sparse_tensor_rebatched_outputs(self):
-    # Create a model that accepts an input, converts it to Ragged, and
-    # converts the ragged tensor back to a dense tensor.
-    layers = [ToSparse()]
-    model = test_utils.get_model_from_layers(layers, input_shape=(None,))
-    model._run_eagerly = test_utils.should_run_eagerly()
-
-    # Define some input data with additional padding.
-    input_data = np.array([[1, 0, 0], [2, 3, 0], [4, 0, 0], [5, 6, 0]])
-    output = model.predict(input_data, batch_size=2)
-
-    expected_indices = np.array([[0, 0], [1, 0], [1, 1], [2, 0], [3, 0], [3,
-                                                                          1]])
-    expected_values = np.array([1, 2, 3, 4, 5, 6])
-    expected_dense_shape = np.array([4, 3])
-
-    self.assertAllEqual(output.indices, expected_indices)
-    self.assertAllEqual(output.values, expected_values)
-    self.assertAllEqual(output.dense_shape, expected_dense_shape)
+    def test_ragged_tensor_outputs(self):
+        # Create a model that accepts an input and converts it to a ragged
+        # tensor; the model output is the ragged tensor itself.
+        layers = [ToRagged(padding=0)]
+        model = test_utils.get_model_from_layers(layers, input_shape=(None,))
+        model._run_eagerly = test_utils.should_run_eagerly()
+
+        # Define some input data with additional padding.
+        input_data = np.array([[1, 0, 0], [2, 3, 0]])
+        output = model.predict(input_data)
+
+        expected_values = [[1], [2, 3]]
+        self.assertAllEqual(expected_values, output)
+
+    def test_ragged_tensor_rebatched_outputs(self):
+        # Create a model that accepts an input and converts it to a ragged
+        # tensor; the model output is the ragged tensor itself.
+        layers = [ToRagged(padding=0)]
+        model = test_utils.get_model_from_layers(layers, input_shape=(None,))
+        model._run_eagerly = test_utils.should_run_eagerly()
+
+        # Define some input data with additional padding.
+        input_data = np.array([[1, 0, 0], [2, 3, 0], [4, 0, 0], [5, 6, 0]])
+        output = model.predict(input_data, batch_size=2)
+
+        expected_values = [[1], [2, 3], [4], [5, 6]]
+        self.assertAllEqual(expected_values, output)
+
+    def test_sparse_tensor_outputs(self):
+        # Create a model that accepts an input and converts it to a sparse
+        # tensor; the model output is the sparse tensor itself.
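+        # e.g. [[1, 0, 0], [2, 3, 0]] comes back as indices
+        # [[0, 0], [1, 0], [1, 1]], values [1, 2, 3], dense_shape [2, 3].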
+        layers = [ToSparse()]
+        model = test_utils.get_model_from_layers(layers, input_shape=(None,))
+        model._run_eagerly = test_utils.should_run_eagerly()
+
+        # Define some input data with additional padding.
+        input_data = np.array([[1, 0, 0], [2, 3, 0]])
+        output = model.predict(input_data)
+
+        expected_indices = np.array([[0, 0], [1, 0], [1, 1]])
+        expected_values = np.array([1, 2, 3])
+        expected_dense_shape = np.array([2, 3])
+
+        self.assertAllEqual(output.indices, expected_indices)
+        self.assertAllEqual(output.values, expected_values)
+        self.assertAllEqual(output.dense_shape, expected_dense_shape)
+
+    def test_sparse_tensor_rebatched_outputs(self):
+        # Create a model that accepts an input and converts it to a sparse
+        # tensor; the model output is the sparse tensor itself.
+        layers = [ToSparse()]
+        model = test_utils.get_model_from_layers(layers, input_shape=(None,))
+        model._run_eagerly = test_utils.should_run_eagerly()
+
+        # Define some input data with additional padding.
+        input_data = np.array([[1, 0, 0], [2, 3, 0], [4, 0, 0], [5, 6, 0]])
+        output = model.predict(input_data, batch_size=2)
+
+        expected_indices = np.array(
+            [[0, 0], [1, 0], [1, 1], [2, 0], [3, 0], [3, 1]]
+        )
+        expected_values = np.array([1, 2, 3, 4, 5, 6])
+        expected_dense_shape = np.array([4, 3])
+
+        self.assertAllEqual(output.indices, expected_indices)
+        self.assertAllEqual(output.values, expected_values)
+        self.assertAllEqual(output.dense_shape, expected_dense_shape)


 def get_input_name(use_dict):
-  # Define the input name.
-  if not use_dict:
-    return None  # This is the same as not setting 'name'.
-  elif test_utils.get_model_type() == "subclass":
-    return "input_1"  # Subclass models don"t support input names.
-  else:
-    return "test_input_name"
+    # Define the input name.
+    if not use_dict:
+        return None  # This is the same as not setting 'name'.
+    elif test_utils.get_model_type() == "subclass":
+        return "input_1"  # Subclass models don't support input names.
+    else:
+        return "test_input_name"


 def get_kwargs(use_dataset, action="predict"):
-  if use_dataset or not tf.executing_eagerly():
-    if action == "fit":
-      return {"steps_per_epoch": 1}
-    return {"steps": 1}
-  else:
-    return {"batch_size": 2}
+    if use_dataset or not tf.executing_eagerly():
+        if action == "fit":
+            return {"steps_per_epoch": 1}
+        return {"steps": 1}
+    else:
+        return {"batch_size": 2}


 def prepare_inputs(data, use_dict, use_dataset, action, input_name):
-  input_data, expected_output = data
-  batch_size = input_data.shape[0]
-  # Prepare the input data.
-  if use_dict:
-    input_data = {input_name: input_data}
-  if use_dataset:
-    if action == "predict":
-      input_data = tf.data.Dataset.from_tensor_slices(input_data).batch(
-          batch_size)
-    else:
-      input_data = tf.data.Dataset.from_tensor_slices(
-          (input_data, expected_output)).batch(batch_size)
-      expected_output = None
-  return (input_data, expected_output)
+    input_data, expected_output = data
+    batch_size = input_data.shape[0]
+    # Prepare the input data.
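+    # When use_dict is set, the inputs are keyed by the input layer's name;
+    # when use_dataset is set, they are wrapped in a batched tf.data.Dataset.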
+ if use_dict: + input_data = {input_name: input_data} + if use_dataset: + if action == "predict": + input_data = tf.data.Dataset.from_tensor_slices(input_data).batch( + batch_size + ) + else: + input_data = tf.data.Dataset.from_tensor_slices( + (input_data, expected_output) + ).batch(batch_size) + expected_output = None + return (input_data, expected_output) @test_combinations.run_with_all_model_types @@ -312,163 +319,191 @@ def prepare_inputs(data, use_dict, use_dataset, action, input_name): *test_utils.generate_combinations_with_testcase_name( use_dict=[True, False], use_dataset=[True, False], - action=["predict", "evaluate", "fit"])) + action=["predict", "evaluate", "fit"], + ) +) class SparseTensorInputTest(test_combinations.TestCase): - - def test_sparse_tensors(self, use_dict, use_dataset, action): - data = [(tf.SparseTensor([[0, 0, 0], [1, 0, 0], [1, 0, 1]], - [1, 2, 3], [2, 1, 3]), - np.array([[[1, -1, -1]], [[2, 3, -1]]])), - (tf.SparseTensor( - [[0, 0, 0], [1, 0, 0], [1, 0, 1], [2, 0, 1]], [5, 6, 7, 8], - [3, 1, 4]), - np.array([[[5, -1, -1, -1]], [[6, 7, -1, -1]], [[-1, 8, -1, - -1]]]))] - # Prepare the model to test. - input_name = get_input_name(use_dict) - model_input = input_layer.Input( - shape=(1, None), sparse=True, name=input_name, dtype=tf.int32) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile( - optimizer="sgd", - loss="mse", - metrics=["accuracy"], - **get_test_mode_kwargs()) - kwargs = get_kwargs(use_dataset, action) - - # Prepare the input data - for data_element in data: - input_data, expected_output = prepare_inputs(data_element, use_dict, - use_dataset, action, - input_name) - # Perform the action. - if action == "predict": - result = model.predict(input_data, **kwargs) - self.assertAllEqual(expected_output, result) - if action == "evaluate": - result = model.evaluate(input_data, expected_output, **kwargs) - self.assertAllEqual(1.0, result[-1]) - if action == "fit": - # TODO(momernick): What's the best way of validating that fit happened? - _ = model.fit(input_data, expected_output, shuffle=False, **kwargs) + def test_sparse_tensors(self, use_dict, use_dataset, action): + data = [ + ( + tf.SparseTensor( + [[0, 0, 0], [1, 0, 0], [1, 0, 1]], [1, 2, 3], [2, 1, 3] + ), + np.array([[[1, -1, -1]], [[2, 3, -1]]]), + ), + ( + tf.SparseTensor( + [[0, 0, 0], [1, 0, 0], [1, 0, 1], [2, 0, 1]], + [5, 6, 7, 8], + [3, 1, 4], + ), + np.array( + [[[5, -1, -1, -1]], [[6, 7, -1, -1]], [[-1, 8, -1, -1]]] + ), + ), + ] + # Prepare the model to test. + input_name = get_input_name(use_dict) + model_input = input_layer.Input( + shape=(1, None), sparse=True, name=input_name, dtype=tf.int32 + ) + layers = [ToDense(default_value=-1)] + model = get_model_from_layers_with_input( + layers, model_input=model_input + ) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs(), + ) + kwargs = get_kwargs(use_dataset, action) + + # Prepare the input data + for data_element in data: + input_data, expected_output = prepare_inputs( + data_element, use_dict, use_dataset, action, input_name + ) + # Perform the action. + if action == "predict": + result = model.predict(input_data, **kwargs) + self.assertAllEqual(expected_output, result) + if action == "evaluate": + result = model.evaluate(input_data, expected_output, **kwargs) + self.assertAllEqual(1.0, result[-1]) + if action == "fit": + # TODO(momernick): What's the best way of validating that fit + # happened? 
+ _ = model.fit( + input_data, expected_output, shuffle=False, **kwargs + ) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes -class ScipySparseTensorInputTest(test_combinations.TestCase, - tf.test.TestCase): - - def test_sparse_scipy_predict_inputs_via_input_layer_args(self): - # Create a model that accepts a sparse input and converts the sparse tensor - # back to a dense tensor. Scipy sparse matrices are limited to 2D, so use - # a one-dimensional shape; note also that scipy's default dtype is int64. - model_input = input_layer.Input(shape=(3,), sparse=True, dtype=tf.int64) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - - input_data = scipy.sparse.coo_matrix(([1, 2, 3], ([0, 1, 1], [0, 0, 1])), - shape=[2, 3]) - expected_output = np.array([[1, -1, -1], [2, 3, -1]]) - output = model.predict(input_data, steps=1) - self.assertAllEqual(expected_output, output) - - input_data_2 = scipy.sparse.coo_matrix( - ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3]) - expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]]) - output_2 = model.predict(input_data_2, steps=1) - self.assertAllEqual(expected_output_2, output_2) - - def test_sparse_scipy_eval_inputs(self): - # Create a model that accepts a sparse input and converts the sparse tensor - # back to a dense tensor. Scipy sparse matrices are limited to 2D, so use - # a one-dimensional shape; note also that scipy's default dtype is int64. - model_input = input_layer.Input(shape=(3,), sparse=True, dtype=tf.int64) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile( - optimizer="sgd", - loss="mse", - metrics=["accuracy"]) - - input_data = scipy.sparse.coo_matrix(([1, 2, 3], ([0, 1, 1], [0, 0, 1])), - shape=[2, 3]) - expected_output = np.array([[1, -1, -1], [2, 3, -1]]) - - output = model.evaluate(input_data, expected_output, steps=1) - self.assertAllEqual(1.0, output[-1]) - - input_data_2 = scipy.sparse.coo_matrix( - ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3]) - expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]]) - output_2 = model.evaluate(input_data_2, expected_output_2, steps=1) - self.assertAllEqual(1.0, output_2[-1]) - - def test_sparse_scipy_predict_input_dicts_via_input_layer_args(self): - # Create a model that accepts a sparse input and converts the sparse tensor - # back to a dense tensor. Scipy sparse matrices are limited to 2D, so use - # a one-dimensional shape; note also that scipy's default dtype is int64. - if test_utils.get_model_type() == "subclass": - input_name = "input_1" # Subclass models don"t support input names. 
- else: - input_name = "test_input_name" - model_input = input_layer.Input( - shape=(3,), sparse=True, name=input_name, dtype=tf.int64) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - - input_data = { - input_name: - scipy.sparse.coo_matrix(([1, 2, 3], ([0, 1, 1], [0, 0, 1])), - shape=[2, 3]) - } - expected_output = np.array([[1, -1, -1], [2, 3, -1]]) - output = model.predict(input_data, steps=1) - self.assertAllEqual(expected_output, output) - - input_data_2 = { - input_name: - scipy.sparse.coo_matrix( - ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3]) - } - expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]]) - output_2 = model.predict(input_data_2, steps=1) - self.assertAllEqual(expected_output_2, output_2) - - def test_sparse_scipy_eval_input_dicts(self): - # Create a model that accepts a sparse input and converts the sparse tensor - # back to a dense tensor. Scipy sparse matrices are limited to 2D, so use - # a one-dimensional shape; note also that scipy's default dtype is int64. - if test_utils.get_model_type() == "subclass": - input_name = "input_1" # Subclass models don"t support input names. - else: - input_name = "test_input_name" - model_input = input_layer.Input( - shape=(3,), sparse=True, name=input_name, dtype=tf.int64) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile( - optimizer="sgd", - loss="mse", - metrics=["accuracy"]) - - input_data = { - input_name: - scipy.sparse.coo_matrix(([1, 2, 3], ([0, 1, 1], [0, 0, 1])), - shape=[2, 3]) - } - expected_output = np.array([[1, -1, -1], [2, 3, -1]]) - output = model.evaluate(input_data, expected_output, steps=1) - self.assertAllEqual(1.0, output[-1]) - - input_data_2 = { - input_name: - scipy.sparse.coo_matrix( - ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3]) - } - expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]]) - output_2 = model.evaluate(input_data_2, expected_output_2, steps=1) - self.assertAllEqual(1.0, output_2[-1]) +class ScipySparseTensorInputTest(test_combinations.TestCase, tf.test.TestCase): + def test_sparse_scipy_predict_inputs_via_input_layer_args(self): + # Create a model that accepts a sparse input and converts the sparse + # tensor back to a dense tensor. Scipy sparse matrices are limited to + # 2D, so use a one-dimensional shape; note also that scipy's default + # dtype is int64. + model_input = input_layer.Input(shape=(3,), sparse=True, dtype=tf.int64) + layers = [ToDense(default_value=-1)] + model = get_model_from_layers_with_input( + layers, model_input=model_input + ) + + input_data = scipy.sparse.coo_matrix( + ([1, 2, 3], ([0, 1, 1], [0, 0, 1])), shape=[2, 3] + ) + expected_output = np.array([[1, -1, -1], [2, 3, -1]]) + output = model.predict(input_data, steps=1) + self.assertAllEqual(expected_output, output) + + input_data_2 = scipy.sparse.coo_matrix( + ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3] + ) + expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]]) + output_2 = model.predict(input_data_2, steps=1) + self.assertAllEqual(expected_output_2, output_2) + + def test_sparse_scipy_eval_inputs(self): + # Create a model that accepts a sparse input and converts the sparse + # tensor back to a dense tensor. Scipy sparse matrices are limited to + # 2D, so use a one-dimensional shape; note also that scipy's default + # dtype is int64. 
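+        # (The scipy COO matrices below are accepted directly; Keras converts
+        # them to sparse tensor batches during input preparation.)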
+        model_input = input_layer.Input(shape=(3,), sparse=True, dtype=tf.int64)
+        layers = [ToDense(default_value=-1)]
+        model = get_model_from_layers_with_input(
+            layers, model_input=model_input
+        )
+        model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"])
+
+        input_data = scipy.sparse.coo_matrix(
+            ([1, 2, 3], ([0, 1, 1], [0, 0, 1])), shape=[2, 3]
+        )
+        expected_output = np.array([[1, -1, -1], [2, 3, -1]])
+
+        output = model.evaluate(input_data, expected_output, steps=1)
+        self.assertAllEqual(1.0, output[-1])
+
+        input_data_2 = scipy.sparse.coo_matrix(
+            ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3]
+        )
+        expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]])
+        output_2 = model.evaluate(input_data_2, expected_output_2, steps=1)
+        self.assertAllEqual(1.0, output_2[-1])
+
+    def test_sparse_scipy_predict_input_dicts_via_input_layer_args(self):
+        # Create a model that accepts a sparse input and converts the sparse
+        # tensor back to a dense tensor. Scipy sparse matrices are limited to
+        # 2D, so use a one-dimensional shape; note also that scipy's default
+        # dtype is int64.
+        if test_utils.get_model_type() == "subclass":
+            input_name = "input_1"  # Subclass models don't support input names.
+        else:
+            input_name = "test_input_name"
+        model_input = input_layer.Input(
+            shape=(3,), sparse=True, name=input_name, dtype=tf.int64
+        )
+        layers = [ToDense(default_value=-1)]
+        model = get_model_from_layers_with_input(
+            layers, model_input=model_input
+        )
+
+        input_data = {
+            input_name: scipy.sparse.coo_matrix(
+                ([1, 2, 3], ([0, 1, 1], [0, 0, 1])), shape=[2, 3]
+            )
+        }
+        expected_output = np.array([[1, -1, -1], [2, 3, -1]])
+        output = model.predict(input_data, steps=1)
+        self.assertAllEqual(expected_output, output)
+
+        input_data_2 = {
+            input_name: scipy.sparse.coo_matrix(
+                ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3]
+            )
+        }
+        expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]])
+        output_2 = model.predict(input_data_2, steps=1)
+        self.assertAllEqual(expected_output_2, output_2)
+
+    def test_sparse_scipy_eval_input_dicts(self):
+        # Create a model that accepts a sparse input and converts the sparse
+        # tensor back to a dense tensor. Scipy sparse matrices are limited to
+        # 2D, so use a one-dimensional shape; note also that scipy's default
+        # dtype is int64.
+        if test_utils.get_model_type() == "subclass":
+            input_name = "input_1"  # Subclass models don't support input names.
+ else: + input_name = "test_input_name" + model_input = input_layer.Input( + shape=(3,), sparse=True, name=input_name, dtype=tf.int64 + ) + layers = [ToDense(default_value=-1)] + model = get_model_from_layers_with_input( + layers, model_input=model_input + ) + model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) + + input_data = { + input_name: scipy.sparse.coo_matrix( + ([1, 2, 3], ([0, 1, 1], [0, 0, 1])), shape=[2, 3] + ) + } + expected_output = np.array([[1, -1, -1], [2, 3, -1]]) + output = model.evaluate(input_data, expected_output, steps=1) + self.assertAllEqual(1.0, output[-1]) + + input_data_2 = { + input_name: scipy.sparse.coo_matrix( + ([5, 6, 7, 8], ([0, 1, 1, 2], [0, 0, 1, 1])), shape=[3, 3] + ) + } + expected_output_2 = np.array([[5, -1, -1], [6, 7, -1], [-1, 8, -1]]) + output_2 = model.evaluate(input_data_2, expected_output_2, steps=1) + self.assertAllEqual(1.0, output_2[-1]) @test_combinations.run_with_all_model_types @@ -477,165 +512,207 @@ def test_sparse_scipy_eval_input_dicts(self): *test_utils.generate_combinations_with_testcase_name( use_dict=[True, False], use_dataset=[True, False], - action=["predict", "evaluate", "fit"])) -class RaggedTensorInputTest(test_combinations.TestCase, - tf.test.TestCase): - - def test_ragged_input(self, use_dict, use_dataset, action): - data = [(tf.ragged.constant([[[1]], [[2, 3]]]), - np.array([[[1, -1]], [[2, 3]]]))] - - # Prepare the model to test. - input_name = get_input_name(use_dict) - model_input = input_layer.Input( - shape=(None, None), ragged=True, name=input_name, dtype=tf.int32, - batch_size=2) - self.assertIsInstance(model_input._type_spec, - tf.RaggedTensorSpec) - self.assertEqual(model_input.shape.as_list(), [2, None, None]) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile( - optimizer="sgd", - loss="mse", - metrics=["accuracy"], - **get_test_mode_kwargs()) - - # Prepare the input data - for data_element in data: - input_data, expected_output = prepare_inputs(data_element, use_dict, - use_dataset, action, - input_name) - # Perform the action. - if action == "predict": - result = model.predict(input_data) - self.assertAllEqual(expected_output, result) - if action == "evaluate": - result = model.evaluate(input_data, expected_output) - self.assertAllEqual(1.0, result[-1]) - if action == "fit": - # TODO(momernick): What's the best way of validating that fit happened? - _ = model.fit(input_data, expected_output, shuffle=False) + action=["predict", "evaluate", "fit"], + ) +) +class RaggedTensorInputTest(test_combinations.TestCase, tf.test.TestCase): + def test_ragged_input(self, use_dict, use_dataset, action): + data = [ + ( + tf.ragged.constant([[[1]], [[2, 3]]]), + np.array([[[1, -1]], [[2, 3]]]), + ) + ] + + # Prepare the model to test. 
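+        # batch_size=2 pins the outermost dimension of the ragged input
+        # signature, which is why the shape below checks as [2, None, None].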
+ input_name = get_input_name(use_dict) + model_input = input_layer.Input( + shape=(None, None), + ragged=True, + name=input_name, + dtype=tf.int32, + batch_size=2, + ) + self.assertIsInstance(model_input._type_spec, tf.RaggedTensorSpec) + self.assertEqual(model_input.shape.as_list(), [2, None, None]) + layers = [ToDense(default_value=-1)] + model = get_model_from_layers_with_input( + layers, model_input=model_input + ) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs(), + ) + + # Prepare the input data + for data_element in data: + input_data, expected_output = prepare_inputs( + data_element, use_dict, use_dataset, action, input_name + ) + # Perform the action. + if action == "predict": + result = model.predict(input_data) + self.assertAllEqual(expected_output, result) + if action == "evaluate": + result = model.evaluate(input_data, expected_output) + self.assertAllEqual(1.0, result[-1]) + if action == "fit": + # TODO(momernick): What's the best way of validating that fit + # happened? + _ = model.fit(input_data, expected_output, shuffle=False) @test_combinations.run_with_all_model_types @test_combinations.run_all_keras_modes @parameterized.named_parameters( *test_utils.generate_combinations_with_testcase_name( - use_dict=[True, False], use_dataset=[True, False])) -class RaggedTensorInputValidationTest(test_combinations.TestCase, - tf.test.TestCase): - - def test_ragged_tensor_input_with_one_none_dimension(self, use_dict, - use_dataset): - # Define some input data. - data = [(tf.ragged.constant([[[1, 0]], [[2, 3]]], ragged_rank=1), - np.array([[[1, 0]], [[2, 3]]]))] - - # Prepare the model to test. - input_shape = (None, 2) # RaggedTensorInputTest uses (None, None). - input_name = get_input_name(use_dict) - model_input = input_layer.Input( - shape=input_shape, ragged=True, name=input_name, dtype=tf.int32) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile( - optimizer="sgd", - loss="mse", - metrics=["accuracy"], - **get_test_mode_kwargs()) - - for data_element in data: - input_data, expected_output = prepare_inputs( - data_element, - use_dict, - use_dataset, - action="predict", - input_name=input_name) - result = model.predict(input_data) - self.assertAllEqual(expected_output, result) - - def test_ragged_tensor_input_with_no_none_dimension(self, use_dict, - use_dataset): - # Define some input data. - data = [(tf.ragged.constant([[[1, 0]], [[2, 3]]], ragged_rank=0), - np.array([[[1, 0]], [[2, 3]]]))] - - # Prepare the model to test. - input_shape = (1, 2) # RaggedTensorInputTest uses (None, None). 
- input_name = get_input_name(use_dict) - model_input = input_layer.Input( - shape=input_shape, ragged=True, name=input_name, dtype=tf.int32) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile( - optimizer="sgd", - loss="mse", - metrics=["accuracy"], - **get_test_mode_kwargs()) - kwargs = get_kwargs(use_dataset) - - for data_element in data: - input_data, expected_output = prepare_inputs( - data_element, - use_dict, - use_dataset, - action="predict", - input_name=input_name) - result = model.predict(input_data, **kwargs) - self.assertAllEqual(expected_output, result) + use_dict=[True, False], use_dataset=[True, False] + ) +) +class RaggedTensorInputValidationTest( + test_combinations.TestCase, tf.test.TestCase +): + def test_ragged_tensor_input_with_one_none_dimension( + self, use_dict, use_dataset + ): + # Define some input data. + data = [ + ( + tf.ragged.constant([[[1, 0]], [[2, 3]]], ragged_rank=1), + np.array([[[1, 0]], [[2, 3]]]), + ) + ] + + # Prepare the model to test. + input_shape = (None, 2) # RaggedTensorInputTest uses (None, None). + input_name = get_input_name(use_dict) + model_input = input_layer.Input( + shape=input_shape, ragged=True, name=input_name, dtype=tf.int32 + ) + layers = [ToDense(default_value=-1)] + model = get_model_from_layers_with_input( + layers, model_input=model_input + ) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs(), + ) + + for data_element in data: + input_data, expected_output = prepare_inputs( + data_element, + use_dict, + use_dataset, + action="predict", + input_name=input_name, + ) + result = model.predict(input_data) + self.assertAllEqual(expected_output, result) + + def test_ragged_tensor_input_with_no_none_dimension( + self, use_dict, use_dataset + ): + # Define some input data. + data = [ + ( + tf.ragged.constant([[[1, 0]], [[2, 3]]], ragged_rank=0), + np.array([[[1, 0]], [[2, 3]]]), + ) + ] + + # Prepare the model to test. + input_shape = (1, 2) # RaggedTensorInputTest uses (None, None). + input_name = get_input_name(use_dict) + model_input = input_layer.Input( + shape=input_shape, ragged=True, name=input_name, dtype=tf.int32 + ) + layers = [ToDense(default_value=-1)] + model = get_model_from_layers_with_input( + layers, model_input=model_input + ) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs(), + ) + kwargs = get_kwargs(use_dataset) + + for data_element in data: + input_data, expected_output = prepare_inputs( + data_element, + use_dict, + use_dataset, + action="predict", + input_name=input_name, + ) + result = model.predict(input_data, **kwargs) + self.assertAllEqual(expected_output, result) @test_combinations.run_with_all_model_types() @test_combinations.run_all_keras_modes(always_skip_v1=True) class CompositeTensorModelPredictTest(test_combinations.TestCase): - - def _normalize_shape(self, shape): - if not isinstance(shape, tuple): - shape = tuple(shape.as_list()) - return shape - - def test_sparse_tensor_model_predict(self): - # Create a model that accepts a sparse input and runs a "Dense" layer on it. 
-    model_input = input_layer.Input(
-        shape=(3,), sparse=True, dtype=tf.float32)
-
-    self.assertEqual([None, 3], model_input.shape.as_list())
-
-    layers = [Dense(2)]
-    model = get_model_from_layers_with_input(layers, model_input=model_input)
-
-    sparse_input = tf.SparseTensor(
-        # A two-row matrix
-        indices=[(0, 0), (0, 1), (0, 2), (5, 0), (5, 1), (5, 2)],
-        values=[1., 1., 1., 1., 1., 1.],
-        dense_shape=(6, 3))
-
-    shape = model(sparse_input).shape
-    self.assertEqual((6, 2), self._normalize_shape(shape))
-
-    shape = model.predict(sparse_input, steps=1).shape
-    self.assertEqual((6, 2), self._normalize_shape(shape))
-
-  def test_ragged_tensor_model_predict(self):
-    # Create a model that accepts a sparse input and runs a "Dense" layer on it.
-    model_input = input_layer.Input(shape=(None,), ragged=True)
-    self.assertEqual([None, None], model_input.shape.as_list())
-
-    layers = [Embedding(input_dim=7, output_dim=5)]
-    model = get_model_from_layers_with_input(layers, model_input=model_input)
-
-    ragged_input = tf.ragged.constant([
-        [1, 2, 3, 4, 5],
-        [2, 4],
-    ])
-
-    shape = model(ragged_input).shape
-    self.assertEqual((2, None, 5), self._normalize_shape(shape))
-
-    shape = model.predict(ragged_input, steps=1).shape
-    self.assertEqual((2, None, 5), self._normalize_shape(shape))
+    def _normalize_shape(self, shape):
+        if not isinstance(shape, tuple):
+            shape = tuple(shape.as_list())
+        return shape
+
+    def test_sparse_tensor_model_predict(self):
+        # Create a model that accepts a sparse input and runs a "Dense" layer on
+        # it.
+        model_input = input_layer.Input(
+            shape=(3,), sparse=True, dtype=tf.float32
+        )
+
+        self.assertEqual([None, 3], model_input.shape.as_list())
+
+        layers = [Dense(2)]
+        model = get_model_from_layers_with_input(
+            layers, model_input=model_input
+        )
+
+        sparse_input = tf.SparseTensor(
+            # A two-row matrix
+            indices=[(0, 0), (0, 1), (0, 2), (5, 0), (5, 1), (5, 2)],
+            values=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            dense_shape=(6, 3),
+        )
+
+        shape = model(sparse_input).shape
+        self.assertEqual((6, 2), self._normalize_shape(shape))
+
+        shape = model.predict(sparse_input, steps=1).shape
+        self.assertEqual((6, 2), self._normalize_shape(shape))
+
+    def test_ragged_tensor_model_predict(self):
+        # Create a model that accepts a ragged input and runs an "Embedding"
+        # layer on it.
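+        # Rows keep their ragged lengths, so the predicted shape below is
+        # (2, None, 5): two rows, a ragged time dimension, embedding size 5.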
+ model_input = input_layer.Input(shape=(None,), ragged=True) + self.assertEqual([None, None], model_input.shape.as_list()) + + layers = [Embedding(input_dim=7, output_dim=5)] + model = get_model_from_layers_with_input( + layers, model_input=model_input + ) + + ragged_input = tf.ragged.constant( + [ + [1, 2, 3, 4, 5], + [2, 4], + ] + ) + + shape = model(ragged_input).shape + self.assertEqual((2, None, 5), self._normalize_shape(shape)) + + shape = model.predict(ragged_input, steps=1).shape + self.assertEqual((2, None, 5), self._normalize_shape(shape)) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/keras/utils/control_flow_util.py b/keras/utils/control_flow_util.py index 1d43c1221cbe..d895e93da68e 100644 --- a/keras/utils/control_flow_util.py +++ b/keras/utils/control_flow_util.py @@ -21,112 +21,118 @@ def InXlaContext(graph): - ctxt = graph._get_control_flow_context() # pylint: disable=protected-access - return GetContainingXLAContext(ctxt) is not None + ctxt = graph._get_control_flow_context() + return GetContainingXLAContext(ctxt) is not None def GraphOrParentsInXlaContext(graph): - while True: - if InXlaContext(graph): return True - try: - graph = graph.outer_graph - except AttributeError: - return False + while True: + if InXlaContext(graph): + return True + try: + graph = graph.outer_graph + except AttributeError: + return False def IsInWhileLoop(op): - ctxt = op._get_control_flow_context() # pylint: disable=protected-access - return GetContainingWhileContext(ctxt) is not None + ctxt = op._get_control_flow_context() + return GetContainingWhileContext(ctxt) is not None def GetContainingWhileContext(ctxt, stop_ctxt=None): - """Returns the first ancestor WhileContext of `ctxt`. - - Returns `ctxt` if `ctxt` is a WhileContext, or None if `ctxt` is not in a - while loop. - - Args: - ctxt: ControlFlowContext - stop_ctxt: ControlFlowContext, optional. If provided, the search will end - if it sees stop_ctxt. - - Returns: - `ctxt` if `ctxt` is a WhileContext, the most nested WhileContext containing - `ctxt`, or None if `ctxt` is not in a while loop. If `stop_ctxt` is not - `None`, this returns `ctxt` if it matches `stop_ctxt` in its traversal. - """ - while ctxt: - if ctxt.IsWhileContext() or ctxt == stop_ctxt: return ctxt - ctxt = ctxt.outer_context - return None + """Returns the first ancestor WhileContext of `ctxt`. + + Returns `ctxt` if `ctxt` is a WhileContext, or None if `ctxt` is not in a + while loop. + + Args: + ctxt: ControlFlowContext + stop_ctxt: ControlFlowContext, optional. If provided, the search will end + if it sees stop_ctxt. + + Returns: + `ctxt` if `ctxt` is a WhileContext, the most nested WhileContext + containing `ctxt`, or None if `ctxt` is not in a while loop. If + `stop_ctxt` is not `None`, this returns `ctxt` if it matches `stop_ctxt` + in its traversal. + """ + while ctxt: + if ctxt.IsWhileContext() or ctxt == stop_ctxt: + return ctxt + ctxt = ctxt.outer_context + return None def GetContainingXLAContext(ctxt): - """Returns the first ancestor XLAContext of `ctxt`. - - Returns `ctxt` if `ctxt` is a XLAContext, or None if `ctxt` is not in a - while loop. - - Args: - ctxt: ControlFlowContext - - Returns: - `ctxt` if `ctxt` is a XLAContext, the most nested XLAContext containing - `ctxt`, or None if `ctxt` is not in a while loop. 
- """ - while ctxt: - if ctxt.IsXLAContext(): return ctxt - ctxt = ctxt.outer_context - return None - - -def smart_cond(pred, true_fn=None, false_fn=None, name=None): # pylint: disable=invalid-name - """Return either `true_fn()` if predicate `pred` is true else `false_fn()`. - - If `pred` is a bool or has a constant value, we return either `true_fn()` - or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both. - - Args: - pred: A scalar determining whether to return the result of `true_fn` or - `false_fn`. - true_fn: The callable to be performed if pred is true. - false_fn: The callable to be performed if pred is false. - name: Optional name prefix when using `tf.cond`. - - Returns: - Tensors returned by the call to either `true_fn` or `false_fn`. - - Raises: - TypeError: If `true_fn` or `false_fn` is not callable. - """ - if isinstance(pred, tf.Variable): - return tf.cond( - pred, true_fn=true_fn, false_fn=false_fn, name=name) - return tf.__internal__.smart_cond.smart_cond( - pred, true_fn=true_fn, false_fn=false_fn, name=name) - - -def constant_value(pred): # pylint: disable=invalid-name - """Return the bool value for `pred`, or None if `pred` had a dynamic value. - - Args: - pred: A scalar, either a Python bool or a TensorFlow boolean variable - or tensor, or the Python integer 1 or 0. - - Returns: - True or False if `pred` has a constant boolean value, None otherwise. - - Raises: - TypeError: If `pred` is not a Variable, Tensor or bool, or Python - integer 1 or 0. - """ - if isinstance(pred, tf.Tensor): - return tf.get_static_value(pred) - if pred in {0, 1}: # Accept 1/0 as valid boolean values - return bool(pred) - if isinstance(pred, bool): - return pred - if isinstance(pred, tf.Variable): + """Returns the first ancestor XLAContext of `ctxt`. + + Returns `ctxt` if `ctxt` is a XLAContext, or None if `ctxt` is not in a + while loop. + + Args: + ctxt: ControlFlowContext + + Returns: + `ctxt` if `ctxt` is a XLAContext, the most nested XLAContext containing + `ctxt`, or None if `ctxt` is not in a while loop. + """ + while ctxt: + if ctxt.IsXLAContext(): + return ctxt + ctxt = ctxt.outer_context return None - raise TypeError("`pred` must be a Tensor, or a Python bool, or 1 or 0. " - f"Received: {type(pred)}") + + +def smart_cond(pred, true_fn=None, false_fn=None, name=None): + """Return either `true_fn()` if predicate `pred` is true else `false_fn()`. + + If `pred` is a bool or has a constant value, we return either `true_fn()` + or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both. + + Args: + pred: A scalar determining whether to return the result of `true_fn` or + `false_fn`. + true_fn: The callable to be performed if pred is true. + false_fn: The callable to be performed if pred is false. + name: Optional name prefix when using `tf.cond`. + + Returns: + Tensors returned by the call to either `true_fn` or `false_fn`. + + Raises: + TypeError: If `true_fn` or `false_fn` is not callable. + """ + if isinstance(pred, tf.Variable): + return tf.cond(pred, true_fn=true_fn, false_fn=false_fn, name=name) + return tf.__internal__.smart_cond.smart_cond( + pred, true_fn=true_fn, false_fn=false_fn, name=name + ) + + +def constant_value(pred): + """Return the bool value for `pred`, or None if `pred` had a dynamic value. + + Args: + pred: A scalar, either a Python bool or a TensorFlow boolean variable + or tensor, or the Python integer 1 or 0. + + Returns: + True or False if `pred` has a constant boolean value, None otherwise. 
+
+    Raises:
+      TypeError: If `pred` is not a Variable, Tensor or bool, or Python
+        integer 1 or 0.
+    """
+    if isinstance(pred, tf.Tensor):
+        return tf.get_static_value(pred)
+    if pred in {0, 1}:  # Accept 1/0 as valid boolean values
+        return bool(pred)
+    if isinstance(pred, bool):
+        return pred
+    if isinstance(pred, tf.Variable):
+        return None
+    raise TypeError(
+        "`pred` must be a Tensor, or a Python bool, or 1 or 0. "
+        f"Received: {type(pred)}"
+    )
diff --git a/keras/utils/conv_utils.py b/keras/utils/conv_utils.py
index 5940653999e0..930bbaf9fef9 100644
--- a/keras/utils/conv_utils.py
+++ b/keras/utils/conv_utils.py
@@ -14,517 +14,568 @@
 # ==============================================================================
 """Utilities used by convolution layers."""

-import tensorflow.compat.v2 as tf
-
 import itertools

 import numpy as np
+import tensorflow.compat.v2 as tf
+
 from keras import backend


 def convert_data_format(data_format, ndim):
-  if data_format == 'channels_last':
-    if ndim == 3:
-      return 'NWC'
-    elif ndim == 4:
-      return 'NHWC'
-    elif ndim == 5:
-      return 'NDHWC'
-    else:
-      raise ValueError(
-          f'Input rank not supported: {ndim}. Expected values are [3, 4, 5]')
-  elif data_format == 'channels_first':
-    if ndim == 3:
-      return 'NCW'
-    elif ndim == 4:
-      return 'NCHW'
-    elif ndim == 5:
-      return 'NCDHW'
+    if data_format == "channels_last":
+        if ndim == 3:
+            return "NWC"
+        elif ndim == 4:
+            return "NHWC"
+        elif ndim == 5:
+            return "NDHWC"
+        else:
+            raise ValueError(
+                f"Input rank not supported: {ndim}. "
+                "Expected values are [3, 4, 5]"
+            )
+    elif data_format == "channels_first":
+        if ndim == 3:
+            return "NCW"
+        elif ndim == 4:
+            return "NCHW"
+        elif ndim == 5:
+            return "NCDHW"
+        else:
+            raise ValueError(
+                f"Input rank not supported: {ndim}. "
+                "Expected values are [3, 4, 5]"
+            )
     else:
-      raise ValueError(
-          f'Input rank not supported: {ndim}. Expected values are [3, 4, 5]')
-  else:
-    raise ValueError(
-        f'Invalid data_format: {data_format}. '
-        'Expected values are ["channels_first", "channels_last"]')
+        raise ValueError(
+            f"Invalid data_format: {data_format}. "
+            'Expected values are ["channels_first", "channels_last"]'
+        )


 def normalize_tuple(value, n, name, allow_zero=False):
-  """Transforms non-negative/positive integer/integers into an integer tuple.
-
-  Args:
-    value: The value to validate and convert. Could an int, or any iterable of
-      ints.
-    n: The size of the tuple to be returned.
-    name: The name of the argument being validated, e.g. "strides" or
-      "kernel_size". This is only used to format error messages.
-    allow_zero: Default to False. A ValueError will raised if zero is received
-      and this param is False.
-
-  Returns:
-    A tuple of n integers.
-
-  Raises:
-    ValueError: If something else than an int/long or iterable thereof or a
-    negative value is
-      passed.
-  """
-  error_msg = (f'The `{name}` argument must be a tuple of {n} '
-               f'integers. Received: {value}')
-
-  if isinstance(value, int):
-    value_tuple = (value,) * n
-  else:
-    try:
-      value_tuple = tuple(value)
-    except TypeError:
-      raise ValueError(error_msg)
-    if len(value_tuple) != n:
-      raise ValueError(error_msg)
-    for single_value in value_tuple:
-      try:
-        int(single_value)
-      except (ValueError, TypeError):
-        error_msg += (f'including element {single_value} of '
-                      f'type {type(single_value)}')
+    """Transforms non-negative/positive integer/integers into an integer tuple.
+
+    Args:
+      value: The value to validate and convert. Could be an int, or any
+        iterable of ints.
+      n: The size of the tuple to be returned.
+      name: The name of the argument being validated, e.g. "strides" or
+        "kernel_size". This is only used to format error messages.
+      allow_zero: A ValueError will be raised if zero is received
+        and this param is False. Defaults to `False`.
+
+    Returns:
+      A tuple of n integers.
+
+    Raises:
+      ValueError: If something other than an int/long or an iterable
+        thereof, or a negative value, is passed.
+    """
+    error_msg = (
+        f"The `{name}` argument must be a tuple of {n} "
+        f"integers. Received: {value}"
+    )
+
+    if isinstance(value, int):
+        value_tuple = (value,) * n
+    else:
+        try:
+            value_tuple = tuple(value)
+        except TypeError:
+            raise ValueError(error_msg)
+        if len(value_tuple) != n:
+            raise ValueError(error_msg)
+        for single_value in value_tuple:
+            try:
+                int(single_value)
+            except (ValueError, TypeError):
+                error_msg += (
+                    f"including element {single_value} of "
+                    f"type {type(single_value)}"
+                )
+                raise ValueError(error_msg)
+
+    if allow_zero:
+        unqualified_values = {v for v in value_tuple if v < 0}
+        req_msg = ">= 0"
+    else:
+        unqualified_values = {v for v in value_tuple if v <= 0}
+        req_msg = "> 0"
+
+    if unqualified_values:
+        error_msg += (
+            f" including {unqualified_values}"
+            f" that does not satisfy the requirement `{req_msg}`."
+        )
+        raise ValueError(error_msg)
-  if allow_zero:
-    unqualified_values = {v for v in value_tuple if v < 0}
-    req_msg = '>= 0'
-  else:
-    unqualified_values = {v for v in value_tuple if v <= 0}
-    req_msg = '> 0'
-
-  if unqualified_values:
-    error_msg += (f' including {unqualified_values}'
-                  f' that does not satisfy the requirement `{req_msg}`.')
-    raise ValueError(error_msg)
-
-  return value_tuple
+    return value_tuple


 def conv_output_length(input_length, filter_size, padding, stride, dilation=1):
-  """Determines output length of a convolution given input length.
-
-  Args:
-    input_length: integer.
-    filter_size: integer.
-    padding: one of "same", "valid", "full", "causal"
-    stride: integer.
-    dilation: dilation rate, integer.
-
-  Returns:
-    The output length (integer).
-  """
-  if input_length is None:
-    return None
-  assert padding in {'same', 'valid', 'full', 'causal'}
-  dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
-  if padding in ['same', 'causal']:
-    output_length = input_length
-  elif padding == 'valid':
-    output_length = input_length - dilated_filter_size + 1
-  elif padding == 'full':
-    output_length = input_length + dilated_filter_size - 1
-  return (output_length + stride - 1) // stride
+    """Determines output length of a convolution given input length.
+
+    Args:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full", "causal"
+      stride: integer.
+      dilation: dilation rate, integer.
+
+    Returns:
+      The output length (integer).
+    """
+    if input_length is None:
+        return None
+    assert padding in {"same", "valid", "full", "causal"}
+    dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+    if padding in ["same", "causal"]:
+        output_length = input_length
+    elif padding == "valid":
+        output_length = input_length - dilated_filter_size + 1
+    elif padding == "full":
+        output_length = input_length + dilated_filter_size - 1
+    return (output_length + stride - 1) // stride


 def conv_input_length(output_length, filter_size, padding, stride):
-  """Determines input length of a convolution given output length.
-
-  Args:
-    output_length: integer.
-    filter_size: integer.
-    padding: one of "same", "valid", "full".
-    stride: integer.
-
-  Returns:
-    The input length (integer).
- """ - if output_length is None: - return None - assert padding in {'same', 'valid', 'full'} - if padding == 'same': - pad = filter_size // 2 - elif padding == 'valid': - pad = 0 - elif padding == 'full': - pad = filter_size - 1 - return (output_length - 1) * stride - 2 * pad + filter_size - - -def deconv_output_length(input_length, - filter_size, - padding, - output_padding=None, - stride=0, - dilation=1): - """Determines output length of a transposed convolution given input length. - - Args: - input_length: Integer. - filter_size: Integer. - padding: one of `"same"`, `"valid"`, `"full"`. - output_padding: Integer, amount of padding along the output dimension. Can - be set to `None` in which case the output length is inferred. - stride: Integer. - dilation: Integer. - - Returns: - The output length (integer). - """ - assert padding in {'same', 'valid', 'full'} - if input_length is None: - return None - - # Get the dilated kernel size - filter_size = filter_size + (filter_size - 1) * (dilation - 1) - - # Infer length if output padding is None, else compute the exact length - if output_padding is None: - if padding == 'valid': - length = input_length * stride + max(filter_size - stride, 0) - elif padding == 'full': - length = input_length * stride - (stride + filter_size - 2) - elif padding == 'same': - length = input_length * stride - - else: - if padding == 'same': - pad = filter_size // 2 - elif padding == 'valid': - pad = 0 - elif padding == 'full': - pad = filter_size - 1 - - length = ((input_length - 1) * stride + filter_size - 2 * pad + - output_padding) - return length + """Determines input length of a convolution given output length. + + Args: + output_length: integer. + filter_size: integer. + padding: one of "same", "valid", "full". + stride: integer. + + Returns: + The input length (integer). + """ + if output_length is None: + return None + assert padding in {"same", "valid", "full"} + if padding == "same": + pad = filter_size // 2 + elif padding == "valid": + pad = 0 + elif padding == "full": + pad = filter_size - 1 + return (output_length - 1) * stride - 2 * pad + filter_size + + +def deconv_output_length( + input_length, + filter_size, + padding, + output_padding=None, + stride=0, + dilation=1, +): + """Determines output length of a transposed convolution given input length. + + Args: + input_length: Integer. + filter_size: Integer. + padding: one of `"same"`, `"valid"`, `"full"`. + output_padding: Integer, amount of padding along the output dimension. + Can be set to `None` in which case the output length is inferred. + stride: Integer. + dilation: Integer. + + Returns: + The output length (integer). 
+ """ + assert padding in {"same", "valid", "full"} + if input_length is None: + return None + + # Get the dilated kernel size + filter_size = filter_size + (filter_size - 1) * (dilation - 1) + + # Infer length if output padding is None, else compute the exact length + if output_padding is None: + if padding == "valid": + length = input_length * stride + max(filter_size - stride, 0) + elif padding == "full": + length = input_length * stride - (stride + filter_size - 2) + elif padding == "same": + length = input_length * stride + + else: + if padding == "same": + pad = filter_size // 2 + elif padding == "valid": + pad = 0 + elif padding == "full": + pad = filter_size - 1 + + length = ( + (input_length - 1) * stride + filter_size - 2 * pad + output_padding + ) + return length def normalize_data_format(value): - if value is None: - value = backend.image_data_format() - data_format = value.lower() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('The `data_format` argument must be one of ' - f'"channels_first", "channels_last". Received: {value}') - return data_format + if value is None: + value = backend.image_data_format() + data_format = value.lower() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError( + "The `data_format` argument must be one of " + f'"channels_first", "channels_last". Received: {value}' + ) + return data_format def normalize_padding(value): - if isinstance(value, (list, tuple)): - return value - padding = value.lower() - if padding not in {'valid', 'same', 'causal'}: - raise ValueError('The `padding` argument must be a list/tuple or one of ' - '"valid", "same" (or "causal", only for `Conv1D). ' - f'Received: {padding}') - return padding + if isinstance(value, (list, tuple)): + return value + padding = value.lower() + if padding not in {"valid", "same", "causal"}: + raise ValueError( + "The `padding` argument must be a list/tuple or one of " + '"valid", "same" (or "causal", only for `Conv1D). ' + f"Received: {padding}" + ) + return padding def conv_kernel_mask(input_shape, kernel_shape, strides, padding): - """Compute a mask representing the connectivity of a convolution operation. - - Assume a convolution with given parameters is applied to an input having N - spatial dimensions with `input_shape = (d_in1, ..., d_inN)` to produce an - output with shape `(d_out1, ..., d_outN)`. This method returns a boolean array - of shape `(d_in1, ..., d_inN, d_out1, ..., d_outN)` with `True` entries - indicating pairs of input and output locations that are connected by a weight. - - Example: - - >>> input_shape = (4,) - >>> kernel_shape = (2,) - >>> strides = (1,) - >>> padding = "valid" - >>> conv_kernel_mask(input_shape, kernel_shape, strides, padding) - array([[ True, False, False], - [ True, True, False], - [False, True, True], - [False, False, True]]) - - where rows and columns correspond to inputs and outputs respectively. - - - Args: - input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the - input. - kernel_shape: tuple of size N, spatial shape of the convolutional kernel / - receptive field. - strides: tuple of size N, strides along each spatial dimension. - padding: type of padding, string `"same"` or `"valid"`. - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. 
- - Returns: - A boolean 2N-D `np.ndarray` of shape - `(d_in1, ..., d_inN, d_out1, ..., d_outN)`, where `(d_out1, ..., d_outN)` - is the spatial shape of the output. `True` entries in the mask represent - pairs of input-output locations that are connected by a weight. - - Raises: - ValueError: if `input_shape`, `kernel_shape` and `strides` don't have the - same number of dimensions. - NotImplementedError: if `padding` is not in {`"same"`, `"valid"`}. - """ - if padding not in {'same', 'valid'}: - raise NotImplementedError(f'Padding type {padding} not supported. ' - 'Only "valid" and "same" are implemented.') - - in_dims = len(input_shape) - if isinstance(kernel_shape, int): - kernel_shape = (kernel_shape,) * in_dims - if isinstance(strides, int): - strides = (strides,) * in_dims - - kernel_dims = len(kernel_shape) - stride_dims = len(strides) - if kernel_dims != in_dims or stride_dims != in_dims: - raise ValueError('Number of strides, input and kernel dimensions must all ' - f'match. Received: stride_dims={stride_dims}, ' - f'in_dims={in_dims}, kernel_dims={kernel_dims}') - - output_shape = conv_output_shape(input_shape, kernel_shape, strides, padding) - - mask_shape = input_shape + output_shape - mask = np.zeros(mask_shape, np.bool) - - output_axes_ticks = [range(dim) for dim in output_shape] - for output_position in itertools.product(*output_axes_ticks): - input_axes_ticks = conv_connected_inputs(input_shape, kernel_shape, - output_position, strides, padding) - for input_position in itertools.product(*input_axes_ticks): - mask[input_position + output_position] = True - - return mask - - -def conv_kernel_idxs(input_shape, kernel_shape, strides, padding, filters_in, - filters_out, data_format): - """Yields output-input tuples of indices in a CNN layer. - - The generator iterates over all `(output_idx, input_idx)` tuples, where + """Compute a mask representing the connectivity of a convolution operation. + + Assume a convolution with given parameters is applied to an input having N + spatial dimensions with `input_shape = (d_in1, ..., d_inN)` to produce an + output with shape `(d_out1, ..., d_outN)`. This method returns a boolean + array of shape `(d_in1, ..., d_inN, d_out1, ..., d_outN)` with `True` + entries indicating pairs of input and output locations that are connected by + a weight. + + Example: + + >>> input_shape = (4,) + >>> kernel_shape = (2,) + >>> strides = (1,) + >>> padding = "valid" + >>> conv_kernel_mask(input_shape, kernel_shape, strides, padding) + array([[ True, False, False], + [ True, True, False], + [False, True, True], + [False, False, True]]) + + where rows and columns correspond to inputs and outputs respectively. + + + Args: + input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the + input. + kernel_shape: tuple of size N, spatial shape of the convolutional kernel / + receptive field. + strides: tuple of size N, strides along each spatial dimension. + padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + + Returns: + A boolean 2N-D `np.ndarray` of shape + `(d_in1, ..., d_inN, d_out1, ..., d_outN)`, where `(d_out1, ..., d_outN)` + is the spatial shape of the output. `True` entries in the mask represent + pairs of input-output locations that are connected by a weight. 
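+
+    (Editor's note, illustrative: with `padding="same"`, `kernel_shape=(2,)`
+    and `strides=(1,)`, each output position connects to at most two input
+    positions, so every column of the mask holds at most two `True` entries.)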
+ + Raises: + ValueError: if `input_shape`, `kernel_shape` and `strides` don't have the + same number of dimensions. + NotImplementedError: if `padding` is not in {`"same"`, `"valid"`}. + """ + if padding not in {"same", "valid"}: + raise NotImplementedError( + f"Padding type {padding} not supported. " + 'Only "valid" and "same" are implemented.' + ) + + in_dims = len(input_shape) + if isinstance(kernel_shape, int): + kernel_shape = (kernel_shape,) * in_dims + if isinstance(strides, int): + strides = (strides,) * in_dims + + kernel_dims = len(kernel_shape) + stride_dims = len(strides) + if kernel_dims != in_dims or stride_dims != in_dims: + raise ValueError( + "Number of strides, input and kernel dimensions must all " + f"match. Received: stride_dims={stride_dims}, " + f"in_dims={in_dims}, kernel_dims={kernel_dims}" + ) + + output_shape = conv_output_shape( + input_shape, kernel_shape, strides, padding + ) + + mask_shape = input_shape + output_shape + mask = np.zeros(mask_shape, bool) + + output_axes_ticks = [range(dim) for dim in output_shape] + for output_position in itertools.product(*output_axes_ticks): + input_axes_ticks = conv_connected_inputs( + input_shape, kernel_shape, output_position, strides, padding + ) + for input_position in itertools.product(*input_axes_ticks): + mask[input_position + output_position] = True + + return mask + + +def conv_kernel_idxs( + input_shape, + kernel_shape, + strides, + padding, + filters_in, + filters_out, + data_format, +): + """Yields output-input tuples of indices in a CNN layer. + + The generator iterates over all `(output_idx, input_idx)` tuples, where `output_idx` is an integer index in a flattened tensor representing a single output image of a convolutional layer that is connected (via the layer weights) to the respective single input image at `input_idx` - Example: - - >>> input_shape = (2, 2) - >>> kernel_shape = (2, 1) - >>> strides = (1, 1) - >>> padding = "valid" - >>> filters_in = 1 - >>> filters_out = 1 - >>> data_format = "channels_last" - >>> list(conv_kernel_idxs(input_shape, kernel_shape, strides, padding, - ... filters_in, filters_out, data_format)) - [(0, 0), (0, 2), (1, 1), (1, 3)] - - Args: - input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the - input. - kernel_shape: tuple of size N, spatial shape of the convolutional kernel / - receptive field. - strides: tuple of size N, strides along each spatial dimension. - padding: type of padding, string `"same"` or `"valid"`. - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - filters_in: `int`, number if filters in the input to the layer. - filters_out: `int', number if filters in the output of the layer. - data_format: string, "channels_first" or "channels_last". - - Yields: - The next tuple `(output_idx, input_idx)`, where - `output_idx` is an integer index in a flattened tensor representing a single - output image of a convolutional layer that is connected (via the layer - weights) to the respective single input image at `input_idx`. - - Raises: - ValueError: if `data_format` is neither - `"channels_last"` nor `"channels_first"`, or if number of strides, input, - and kernel number of dimensions do not match. - - NotImplementedError: if `padding` is neither `"same"` nor `"valid"`. - """ - if padding not in ('same', 'valid'): - raise NotImplementedError(f'Padding type {padding} not supported. 
' - 'Only "valid" and "same" are implemented.') - - in_dims = len(input_shape) - if isinstance(kernel_shape, int): - kernel_shape = (kernel_shape,) * in_dims - if isinstance(strides, int): - strides = (strides,) * in_dims - - kernel_dims = len(kernel_shape) - stride_dims = len(strides) - if kernel_dims != in_dims or stride_dims != in_dims: - raise ValueError('Number of strides, input and kernel dimensions must all ' - f'match. Received: stride_dims={stride_dims}, ' - f'in_dims={in_dims}, kernel_dims={kernel_dims}') - - output_shape = conv_output_shape(input_shape, kernel_shape, strides, padding) - output_axes_ticks = [range(dim) for dim in output_shape] - - if data_format == 'channels_first': - concat_idxs = lambda spatial_idx, filter_idx: (filter_idx,) + spatial_idx - elif data_format == 'channels_last': - concat_idxs = lambda spatial_idx, filter_idx: spatial_idx + (filter_idx,) - else: - raise ValueError( - f'Data format `{data_format}` not recognized.' - '`data_format` must be "channels_first" or "channels_last".') - - for output_position in itertools.product(*output_axes_ticks): - input_axes_ticks = conv_connected_inputs(input_shape, kernel_shape, - output_position, strides, padding) - for input_position in itertools.product(*input_axes_ticks): - for f_in in range(filters_in): - for f_out in range(filters_out): - out_idx = np.ravel_multi_index( - multi_index=concat_idxs(output_position, f_out), - dims=concat_idxs(output_shape, filters_out)) - in_idx = np.ravel_multi_index( - multi_index=concat_idxs(input_position, f_in), - dims=concat_idxs(input_shape, filters_in)) - yield (out_idx, in_idx) - - -def conv_connected_inputs(input_shape, kernel_shape, output_position, strides, - padding): - """Return locations of the input connected to an output position. - - Assume a convolution with given parameters is applied to an input having N - spatial dimensions with `input_shape = (d_in1, ..., d_inN)`. This method - returns N ranges specifying the input region that was convolved with the - kernel to produce the output at position - `output_position = (p_out1, ..., p_outN)`. - - Example: - - >>> input_shape = (4, 4) - >>> kernel_shape = (2, 1) - >>> output_position = (1, 1) - >>> strides = (1, 1) - >>> padding = "valid" - >>> conv_connected_inputs(input_shape, kernel_shape, output_position, - ... strides, padding) - [range(1, 3), range(1, 2)] - - Args: - input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the - input. - kernel_shape: tuple of size N, spatial shape of the convolutional kernel / - receptive field. - output_position: tuple of size N: `(p_out1, ..., p_outN)`, a single position - in the output of the convolution. - strides: tuple of size N, strides along each spatial dimension. - padding: type of padding, string `"same"` or `"valid"`. - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - - Returns: - N ranges `[[p_in_left1, ..., p_in_right1], ..., - [p_in_leftN, ..., p_in_rightN]]` specifying the region in the - input connected to output_position. 
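+
+    Flat indices are computed with `np.ravel_multi_index`; the filter
+    dimension is placed before or after the spatial dimensions according to
+    `data_format` (an editor's summary of the implementation below).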
- """ - ranges = [] - - ndims = len(input_shape) - for d in range(ndims): - left_shift = int(kernel_shape[d] / 2) - right_shift = kernel_shape[d] - left_shift - - center = output_position[d] * strides[d] - - if padding == 'valid': - center += left_shift - - start = max(0, center - left_shift) - end = min(input_shape[d], center + right_shift) - - ranges.append(range(start, end)) - - return ranges + Example: + + >>> input_shape = (2, 2) + >>> kernel_shape = (2, 1) + >>> strides = (1, 1) + >>> padding = "valid" + >>> filters_in = 1 + >>> filters_out = 1 + >>> data_format = "channels_last" + >>> list(conv_kernel_idxs(input_shape, kernel_shape, strides, padding, + ... filters_in, filters_out, data_format)) + [(0, 0), (0, 2), (1, 1), (1, 3)] + + Args: + input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the + input. + kernel_shape: tuple of size N, spatial shape of the convolutional kernel / + receptive field. + strides: tuple of size N, strides along each spatial dimension. + padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + filters_in: `int`, number if filters in the input to the layer. + filters_out: `int', number if filters in the output of the layer. + data_format: string, "channels_first" or "channels_last". + + Yields: + The next tuple `(output_idx, input_idx)`, where `output_idx` is an integer + index in a flattened tensor representing a single output image of a + convolutional layer that is connected (via the layer weights) to the + respective single input image at `input_idx`. + + Raises: + ValueError: if `data_format` is neither `"channels_last"` nor + `"channels_first"`, or if number of strides, input, and kernel number + of dimensions do not match. + + NotImplementedError: if `padding` is neither `"same"` nor `"valid"`. + """ + if padding not in ("same", "valid"): + raise NotImplementedError( + f"Padding type {padding} not supported. " + 'Only "valid" and "same" are implemented.' + ) + + in_dims = len(input_shape) + if isinstance(kernel_shape, int): + kernel_shape = (kernel_shape,) * in_dims + if isinstance(strides, int): + strides = (strides,) * in_dims + + kernel_dims = len(kernel_shape) + stride_dims = len(strides) + if kernel_dims != in_dims or stride_dims != in_dims: + raise ValueError( + "Number of strides, input and kernel dimensions must all " + f"match. Received: stride_dims={stride_dims}, " + f"in_dims={in_dims}, kernel_dims={kernel_dims}" + ) + + output_shape = conv_output_shape( + input_shape, kernel_shape, strides, padding + ) + output_axes_ticks = [range(dim) for dim in output_shape] + + if data_format == "channels_first": + concat_idxs = ( + lambda spatial_idx, filter_idx: (filter_idx,) + spatial_idx + ) + elif data_format == "channels_last": + concat_idxs = lambda spatial_idx, filter_idx: spatial_idx + ( + filter_idx, + ) + else: + raise ValueError( + f"Data format `{data_format}` not recognized." + '`data_format` must be "channels_first" or "channels_last".' 
+ ) + + for output_position in itertools.product(*output_axes_ticks): + input_axes_ticks = conv_connected_inputs( + input_shape, kernel_shape, output_position, strides, padding + ) + for input_position in itertools.product(*input_axes_ticks): + for f_in in range(filters_in): + for f_out in range(filters_out): + out_idx = np.ravel_multi_index( + multi_index=concat_idxs(output_position, f_out), + dims=concat_idxs(output_shape, filters_out), + ) + in_idx = np.ravel_multi_index( + multi_index=concat_idxs(input_position, f_in), + dims=concat_idxs(input_shape, filters_in), + ) + yield (out_idx, in_idx) + + +def conv_connected_inputs( + input_shape, kernel_shape, output_position, strides, padding +): + """Return locations of the input connected to an output position. + + Assume a convolution with given parameters is applied to an input having N + spatial dimensions with `input_shape = (d_in1, ..., d_inN)`. This method + returns N ranges specifying the input region that was convolved with the + kernel to produce the output at position + `output_position = (p_out1, ..., p_outN)`. + + Example: + + >>> input_shape = (4, 4) + >>> kernel_shape = (2, 1) + >>> output_position = (1, 1) + >>> strides = (1, 1) + >>> padding = "valid" + >>> conv_connected_inputs(input_shape, kernel_shape, output_position, + ... strides, padding) + [range(1, 3), range(1, 2)] + + Args: + input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the + input. + kernel_shape: tuple of size N, spatial shape of the convolutional kernel / + receptive field. + output_position: tuple of size N: `(p_out1, ..., p_outN)`, a single + position in the output of the convolution. + strides: tuple of size N, strides along each spatial dimension. + padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + + Returns: + N ranges `[[p_in_left1, ..., p_in_right1], ..., + [p_in_leftN, ..., p_in_rightN]]` specifying the region in the + input connected to output_position. + """ + ranges = [] + + ndims = len(input_shape) + for d in range(ndims): + left_shift = int(kernel_shape[d] / 2) + right_shift = kernel_shape[d] - left_shift + + center = output_position[d] * strides[d] + + if padding == "valid": + center += left_shift + + start = max(0, center - left_shift) + end = min(input_shape[d], center + right_shift) + + ranges.append(range(start, end)) + + return ranges def conv_output_shape(input_shape, kernel_shape, strides, padding): - """Return the output shape of an N-D convolution. - - Forces dimensions where input is empty (size 0) to remain empty. - - Args: - input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the - input. - kernel_shape: tuple of size N, spatial shape of the convolutional kernel / - receptive field. - strides: tuple of size N, strides along each spatial dimension. - padding: type of padding, string `"same"` or `"valid"`. - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same - height/width dimension as the input. - - Returns: - tuple of size N: `(d_out1, ..., d_outN)`, spatial shape of the output. 
- """ - dims = range(len(kernel_shape)) - output_shape = [ - conv_output_length(input_shape[d], kernel_shape[d], padding, strides[d]) - for d in dims - ] - output_shape = tuple( - [0 if input_shape[d] == 0 else output_shape[d] for d in dims]) - return output_shape + """Return the output shape of an N-D convolution. + + Forces dimensions where input is empty (size 0) to remain empty. + + Args: + input_shape: tuple of size N: `(d_in1, ..., d_inN)`, spatial shape of the + input. + kernel_shape: tuple of size N, spatial shape of the convolutional kernel / + receptive field. + strides: tuple of size N, strides along each spatial dimension. + padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. + + Returns: + tuple of size N: `(d_out1, ..., d_outN)`, spatial shape of the output. + """ + dims = range(len(kernel_shape)) + output_shape = [ + conv_output_length(input_shape[d], kernel_shape[d], padding, strides[d]) + for d in dims + ] + output_shape = tuple( + [0 if input_shape[d] == 0 else output_shape[d] for d in dims] + ) + return output_shape def squeeze_batch_dims(inp, op, inner_rank): - """Returns `unsqueeze_batch(op(squeeze_batch(inp)))`. - - Where `squeeze_batch` reshapes `inp` to shape - `[prod(inp.shape[:-inner_rank])] + inp.shape[-inner_rank:]` - and `unsqueeze_batch` does the reverse reshape but on the output. - - Args: - inp: A tensor with dims `batch_shape + inner_shape` where `inner_shape` - is length `inner_rank`. - op: A callable that takes a single input tensor and returns a single. - output tensor. - inner_rank: A python integer. - - Returns: - `unsqueeze_batch_op(squeeze_batch(inp))`. - """ - with tf.name_scope('squeeze_batch_dims'): - shape = inp.shape - - inner_shape = shape[-inner_rank:] - if not inner_shape.is_fully_defined(): - inner_shape = tf.shape(inp)[-inner_rank:] - - batch_shape = shape[:-inner_rank] - if not batch_shape.is_fully_defined(): - batch_shape = tf.shape(inp)[:-inner_rank] - - if isinstance(inner_shape, tf.TensorShape): - inp_reshaped = tf.reshape(inp, [-1] + inner_shape.as_list()) - else: - inp_reshaped = tf.reshape( - inp, tf.concat(([-1], inner_shape), axis=-1)) - - out_reshaped = op(inp_reshaped) - - out_inner_shape = out_reshaped.shape[-inner_rank:] - if not out_inner_shape.is_fully_defined(): - out_inner_shape = tf.shape(out_reshaped)[-inner_rank:] - - out = tf.reshape( - out_reshaped, tf.concat((batch_shape, out_inner_shape), axis=-1)) - - out.set_shape(inp.shape[:-inner_rank] + out.shape[-inner_rank:]) - return out + """Returns `unsqueeze_batch(op(squeeze_batch(inp)))`. + + Where `squeeze_batch` reshapes `inp` to shape + `[prod(inp.shape[:-inner_rank])] + inp.shape[-inner_rank:]` + and `unsqueeze_batch` does the reverse reshape but on the output. + + Args: + inp: A tensor with dims `batch_shape + inner_shape` where `inner_shape` + is length `inner_rank`. + op: A callable that takes a single input tensor and returns a single. + output tensor. + inner_rank: A python integer. + + Returns: + `unsqueeze_batch_op(squeeze_batch(inp))`. 
+ """ + with tf.name_scope("squeeze_batch_dims"): + shape = inp.shape + + inner_shape = shape[-inner_rank:] + if not inner_shape.is_fully_defined(): + inner_shape = tf.shape(inp)[-inner_rank:] + + batch_shape = shape[:-inner_rank] + if not batch_shape.is_fully_defined(): + batch_shape = tf.shape(inp)[:-inner_rank] + + if isinstance(inner_shape, tf.TensorShape): + inp_reshaped = tf.reshape(inp, [-1] + inner_shape.as_list()) + else: + inp_reshaped = tf.reshape( + inp, tf.concat(([-1], inner_shape), axis=-1) + ) + + out_reshaped = op(inp_reshaped) + + out_inner_shape = out_reshaped.shape[-inner_rank:] + if not out_inner_shape.is_fully_defined(): + out_inner_shape = tf.shape(out_reshaped)[-inner_rank:] + + out = tf.reshape( + out_reshaped, tf.concat((batch_shape, out_inner_shape), axis=-1) + ) + + out.set_shape(inp.shape[:-inner_rank] + out.shape[-inner_rank:]) + return out diff --git a/keras/utils/conv_utils_test.py b/keras/utils/conv_utils_test.py index cc4b66eed11b..f7a11ad0842f 100644 --- a/keras/utils/conv_utils_test.py +++ b/keras/utils/conv_utils_test.py @@ -14,18 +14,17 @@ # ============================================================================== """Tests for conv_utils.""" -import tensorflow.compat.v2 as tf - import itertools -from absl.testing import parameterized import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized from keras.utils import conv_utils def _get_const_output_shape(input_shape, dim): - return tuple([min(d, dim) for d in input_shape]) + return tuple([min(d, dim) for d in input_shape]) input_shapes = [ @@ -50,316 +49,354 @@ def _get_const_output_shape(input_shape, dim): class TestBasicConvUtilsTest(tf.test.TestCase): + def test_convert_data_format(self): + self.assertEqual( + "NCDHW", conv_utils.convert_data_format("channels_first", 5) + ) + self.assertEqual( + "NCHW", conv_utils.convert_data_format("channels_first", 4) + ) + self.assertEqual( + "NCW", conv_utils.convert_data_format("channels_first", 3) + ) + self.assertEqual( + "NHWC", conv_utils.convert_data_format("channels_last", 4) + ) + self.assertEqual( + "NWC", conv_utils.convert_data_format("channels_last", 3) + ) + self.assertEqual( + "NDHWC", conv_utils.convert_data_format("channels_last", 5) + ) + + with self.assertRaises(ValueError): + conv_utils.convert_data_format("invalid", 2) + + def test_normalize_tuple(self): + self.assertEqual( + (2, 2, 2), + conv_utils.normalize_tuple(2, n=3, name="strides", allow_zero=True), + ) + self.assertEqual( + (2, 1, 2), + conv_utils.normalize_tuple( + (2, 1, 2), n=3, name="strides", allow_zero=True + ), + ) + self.assertEqual( + ( + 1, + 2, + 3, + ), + conv_utils.normalize_tuple((1, 2, 3), n=3, name="pool_size"), + ) + self.assertEqual( + (3, 3, 3), conv_utils.normalize_tuple(3, n=3, name="pool_size") + ) - def test_convert_data_format(self): - self.assertEqual('NCDHW', conv_utils.convert_data_format( - 'channels_first', 5)) - self.assertEqual('NCHW', conv_utils.convert_data_format( - 'channels_first', 4)) - self.assertEqual('NCW', conv_utils.convert_data_format('channels_first', 3)) - self.assertEqual('NHWC', conv_utils.convert_data_format('channels_last', 4)) - self.assertEqual('NWC', conv_utils.convert_data_format('channels_last', 3)) - self.assertEqual('NDHWC', conv_utils.convert_data_format( - 'channels_last', 5)) - - with self.assertRaises(ValueError): - conv_utils.convert_data_format('invalid', 2) - - def test_normalize_tuple(self): - self.assertEqual( - (2, 2, 2), - conv_utils.normalize_tuple(2, n=3, name='strides', 
allow_zero=True)) - self.assertEqual((2, 1, 2), - conv_utils.normalize_tuple((2, 1, 2), - n=3, - name='strides', - allow_zero=True)) - self.assertEqual(( - 1, - 2, - 3, - ), conv_utils.normalize_tuple((1, 2, 3), n=3, name='pool_size')) - self.assertEqual((3, 3, 3), - conv_utils.normalize_tuple(3, n=3, name='pool_size')) - - with self.assertRaisesRegex( - ValueError, - r'including \{-1\} that does not satisfy the requirement `> 0`'): - conv_utils.normalize_tuple((3, -1, 3), n=3, name='negative_size') - - with self.assertRaisesRegex( - ValueError, - r'The `strides` argument .* a tuple of 3 integers.* \(2, 1\)$'): - conv_utils.normalize_tuple((2, 1), n=3, name='strides', allow_zero=True) - - with self.assertRaisesRegex( - ValueError, - r'The `kernel_size` argument .* tuple of 3 integers.* None$'): - conv_utils.normalize_tuple(None, n=3, name='kernel_size') - - with self.assertRaisesRegex(ValueError, - r'including \{-4\} that does not .* `>= 0`'): - conv_utils.normalize_tuple(-4, n=3, name='strides', allow_zero=True) - - with self.assertRaisesRegex(ValueError, - r'including \{0\} that does not .* `> 0`'): - conv_utils.normalize_tuple((0, 1, 2), n=3, name='pool_size') - - def test_normalize_data_format(self): - self.assertEqual('channels_last', - conv_utils.normalize_data_format('Channels_Last')) - self.assertEqual('channels_first', - conv_utils.normalize_data_format('CHANNELS_FIRST')) - - with self.assertRaises(ValueError): - conv_utils.normalize_data_format('invalid') - - def test_normalize_padding(self): - self.assertEqual('same', conv_utils.normalize_padding('SAME')) - self.assertEqual('valid', conv_utils.normalize_padding('VALID')) - - with self.assertRaises(ValueError): - conv_utils.normalize_padding('invalid') - - def test_conv_output_length(self): - self.assertEqual(4, conv_utils.conv_output_length(4, 2, 'same', 1, 1)) - self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'same', 2, 1)) - self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'valid', 1, 1)) - self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'valid', 2, 1)) - self.assertEqual(5, conv_utils.conv_output_length(4, 2, 'full', 1, 1)) - self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'full', 2, 1)) - self.assertEqual(2, conv_utils.conv_output_length(5, 2, 'valid', 2, 2)) - - def test_conv_input_length(self): - self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'same', 1)) - self.assertEqual(2, conv_utils.conv_input_length(2, 2, 'same', 2)) - self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'valid', 1)) - self.assertEqual(4, conv_utils.conv_input_length(2, 2, 'valid', 2)) - self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'full', 1)) - self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'full', 2)) - - def test_deconv_output_length(self): - self.assertEqual(4, conv_utils.deconv_output_length(4, 2, 'same', stride=1)) - self.assertEqual(8, conv_utils.deconv_output_length(4, 2, 'same', stride=2)) - self.assertEqual(5, conv_utils.deconv_output_length( - 4, 2, 'valid', stride=1)) - self.assertEqual(8, conv_utils.deconv_output_length( - 4, 2, 'valid', stride=2)) - self.assertEqual(3, conv_utils.deconv_output_length(4, 2, 'full', stride=1)) - self.assertEqual(6, conv_utils.deconv_output_length(4, 2, 'full', stride=2)) - self.assertEqual( - 5, - conv_utils.deconv_output_length( - 4, 2, 'same', output_padding=2, stride=1)) - self.assertEqual( - 7, - conv_utils.deconv_output_length( - 4, 2, 'same', output_padding=1, stride=2)) - self.assertEqual( - 7, - conv_utils.deconv_output_length( - 4, 2, 
'valid', output_padding=2, stride=1)) - self.assertEqual( - 9, - conv_utils.deconv_output_length( - 4, 2, 'valid', output_padding=1, stride=2)) - self.assertEqual( - 5, - conv_utils.deconv_output_length( - 4, 2, 'full', output_padding=2, stride=1)) - self.assertEqual( - 7, - conv_utils.deconv_output_length( - 4, 2, 'full', output_padding=1, stride=2)) - self.assertEqual( - 5, - conv_utils.deconv_output_length( - 4, 2, 'same', output_padding=1, stride=1, dilation=2)) - self.assertEqual( - 12, - conv_utils.deconv_output_length( - 4, 2, 'valid', output_padding=2, stride=2, dilation=3)) - self.assertEqual( - 6, - conv_utils.deconv_output_length( - 4, 2, 'full', output_padding=2, stride=2, dilation=3)) + with self.assertRaisesRegex( + ValueError, + r"including \{-1\} that does not satisfy the requirement `> 0`", + ): + conv_utils.normalize_tuple((3, -1, 3), n=3, name="negative_size") + + with self.assertRaisesRegex( + ValueError, + r"The `strides` argument .* a tuple of 3 integers.* \(2, 1\)$", + ): + conv_utils.normalize_tuple( + (2, 1), n=3, name="strides", allow_zero=True + ) + + with self.assertRaisesRegex( + ValueError, + r"The `kernel_size` argument .* tuple of 3 integers.* None$", + ): + conv_utils.normalize_tuple(None, n=3, name="kernel_size") + + with self.assertRaisesRegex( + ValueError, r"including \{-4\} that does not .* `>= 0`" + ): + conv_utils.normalize_tuple(-4, n=3, name="strides", allow_zero=True) + + with self.assertRaisesRegex( + ValueError, r"including \{0\} that does not .* `> 0`" + ): + conv_utils.normalize_tuple((0, 1, 2), n=3, name="pool_size") + + def test_normalize_data_format(self): + self.assertEqual( + "channels_last", conv_utils.normalize_data_format("Channels_Last") + ) + self.assertEqual( + "channels_first", conv_utils.normalize_data_format("CHANNELS_FIRST") + ) + + with self.assertRaises(ValueError): + conv_utils.normalize_data_format("invalid") + + def test_normalize_padding(self): + self.assertEqual("same", conv_utils.normalize_padding("SAME")) + self.assertEqual("valid", conv_utils.normalize_padding("VALID")) + + with self.assertRaises(ValueError): + conv_utils.normalize_padding("invalid") + + def test_conv_output_length(self): + self.assertEqual(4, conv_utils.conv_output_length(4, 2, "same", 1, 1)) + self.assertEqual(2, conv_utils.conv_output_length(4, 2, "same", 2, 1)) + self.assertEqual(3, conv_utils.conv_output_length(4, 2, "valid", 1, 1)) + self.assertEqual(2, conv_utils.conv_output_length(4, 2, "valid", 2, 1)) + self.assertEqual(5, conv_utils.conv_output_length(4, 2, "full", 1, 1)) + self.assertEqual(3, conv_utils.conv_output_length(4, 2, "full", 2, 1)) + self.assertEqual(2, conv_utils.conv_output_length(5, 2, "valid", 2, 2)) + + def test_conv_input_length(self): + self.assertEqual(3, conv_utils.conv_input_length(4, 2, "same", 1)) + self.assertEqual(2, conv_utils.conv_input_length(2, 2, "same", 2)) + self.assertEqual(4, conv_utils.conv_input_length(3, 2, "valid", 1)) + self.assertEqual(4, conv_utils.conv_input_length(2, 2, "valid", 2)) + self.assertEqual(3, conv_utils.conv_input_length(4, 2, "full", 1)) + self.assertEqual(4, conv_utils.conv_input_length(3, 2, "full", 2)) + + def test_deconv_output_length(self): + self.assertEqual( + 4, conv_utils.deconv_output_length(4, 2, "same", stride=1) + ) + self.assertEqual( + 8, conv_utils.deconv_output_length(4, 2, "same", stride=2) + ) + self.assertEqual( + 5, conv_utils.deconv_output_length(4, 2, "valid", stride=1) + ) + self.assertEqual( + 8, conv_utils.deconv_output_length(4, 2, "valid", stride=2) + ) + 
self.assertEqual( + 3, conv_utils.deconv_output_length(4, 2, "full", stride=1) + ) + self.assertEqual( + 6, conv_utils.deconv_output_length(4, 2, "full", stride=2) + ) + self.assertEqual( + 5, + conv_utils.deconv_output_length( + 4, 2, "same", output_padding=2, stride=1 + ), + ) + self.assertEqual( + 7, + conv_utils.deconv_output_length( + 4, 2, "same", output_padding=1, stride=2 + ), + ) + self.assertEqual( + 7, + conv_utils.deconv_output_length( + 4, 2, "valid", output_padding=2, stride=1 + ), + ) + self.assertEqual( + 9, + conv_utils.deconv_output_length( + 4, 2, "valid", output_padding=1, stride=2 + ), + ) + self.assertEqual( + 5, + conv_utils.deconv_output_length( + 4, 2, "full", output_padding=2, stride=1 + ), + ) + self.assertEqual( + 7, + conv_utils.deconv_output_length( + 4, 2, "full", output_padding=1, stride=2 + ), + ) + self.assertEqual( + 5, + conv_utils.deconv_output_length( + 4, 2, "same", output_padding=1, stride=1, dilation=2 + ), + ) + self.assertEqual( + 12, + conv_utils.deconv_output_length( + 4, 2, "valid", output_padding=2, stride=2, dilation=3 + ), + ) + self.assertEqual( + 6, + conv_utils.deconv_output_length( + 4, 2, "full", output_padding=2, stride=2, dilation=3 + ), + ) @parameterized.parameters(input_shapes) class TestConvUtils(tf.test.TestCase, parameterized.TestCase): + def test_conv_kernel_mask_fc(self, *input_shape): + padding = "valid" + kernel_shape = input_shape + ndims = len(input_shape) + strides = (1,) * ndims + output_shape = _get_const_output_shape(input_shape, dim=1) + mask = np.ones(input_shape + output_shape, bool) + self.assertAllEqual( + mask, + conv_utils.conv_kernel_mask( + input_shape, kernel_shape, strides, padding + ), + ) + + def test_conv_kernel_mask_diag(self, *input_shape): + ndims = len(input_shape) + kernel_shape = (1,) * ndims + strides = (1,) * ndims + + for padding in ["valid", "same"]: + mask = np.identity(int(np.prod(input_shape)), bool) + mask = np.reshape(mask, input_shape * 2) + self.assertAllEqual( + mask, + conv_utils.conv_kernel_mask( + input_shape, kernel_shape, strides, padding + ), + ) + + def test_conv_kernel_mask_full_stride(self, *input_shape): + padding = "valid" + ndims = len(input_shape) + kernel_shape = (1,) * ndims + strides = tuple([max(d, 1) for d in input_shape]) + output_shape = _get_const_output_shape(input_shape, dim=1) + + mask = np.zeros(input_shape + output_shape, bool) + if all(d > 0 for d in mask.shape): + mask[(0,) * len(output_shape)] = True + + self.assertAllEqual( + mask, + conv_utils.conv_kernel_mask( + input_shape, kernel_shape, strides, padding + ), + ) + + def test_conv_kernel_mask_almost_full_stride(self, *input_shape): + padding = "valid" + ndims = len(input_shape) + kernel_shape = (1,) * ndims + strides = tuple([max(d - 1, 1) for d in input_shape]) + output_shape = _get_const_output_shape(input_shape, dim=2) + + mask = np.zeros(input_shape + output_shape, bool) + if all(d > 0 for d in mask.shape): + for in_position in itertools.product( + *[[0, d - 1] for d in input_shape] + ): + out_position = tuple([min(p, 1) for p in in_position]) + mask[in_position + out_position] = True + + self.assertAllEqual( + mask, + conv_utils.conv_kernel_mask( + input_shape, kernel_shape, strides, padding + ), + ) + + def test_conv_kernel_mask_rect_kernel(self, *input_shape): + padding = "valid" + ndims = len(input_shape) + strides = (1,) * ndims + + for d in range(ndims): + kernel_shape = [1] * ndims + kernel_shape[d] = input_shape[d] + + output_shape = list(input_shape) + output_shape[d] = min(1, input_shape[d]) 
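+
+            # Editor's note: a kernel spanning the full extent of axis d
+            # connects every input position along d to the single output
+            # position along d, which the slice assignment below encodes.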
+ + mask = np.identity(int(np.prod(input_shape)), bool) + mask = np.reshape(mask, input_shape * 2) + + for p in itertools.product( + *[range(input_shape[dim]) for dim in range(ndims)] + ): + p = list(p) + p[d] = slice(None) + mask[tuple(p * 2)] = True + + mask = np.take(mask, range(0, min(1, input_shape[d])), ndims + d) + + self.assertAllEqual( + mask, + conv_utils.conv_kernel_mask( + input_shape, kernel_shape, strides, padding + ), + ) + + def test_conv_kernel_mask_wrong_padding(self, *input_shape): + ndims = len(input_shape) + kernel_shape = (1,) * ndims + strides = (1,) * ndims + + conv_utils.conv_kernel_mask(input_shape, kernel_shape, strides, "valid") + + conv_utils.conv_kernel_mask(input_shape, kernel_shape, strides, "same") - def test_conv_kernel_mask_fc(self, *input_shape): - padding = 'valid' - kernel_shape = input_shape - ndims = len(input_shape) - strides = (1,) * ndims - output_shape = _get_const_output_shape(input_shape, dim=1) - mask = np.ones(input_shape + output_shape, np.bool) - self.assertAllEqual( - mask, - conv_utils.conv_kernel_mask( + self.assertRaises( + NotImplementedError, + conv_utils.conv_kernel_mask, input_shape, kernel_shape, strides, - padding - ) - ) - - def test_conv_kernel_mask_diag(self, *input_shape): - ndims = len(input_shape) - kernel_shape = (1,) * ndims - strides = (1,) * ndims - - for padding in ['valid', 'same']: - mask = np.identity(int(np.prod(input_shape)), np.bool) - mask = np.reshape(mask, input_shape * 2) - self.assertAllEqual( - mask, - conv_utils.conv_kernel_mask( - input_shape, - kernel_shape, - strides, - padding - ) - ) - - def test_conv_kernel_mask_full_stride(self, *input_shape): - padding = 'valid' - ndims = len(input_shape) - kernel_shape = (1,) * ndims - strides = tuple([max(d, 1) for d in input_shape]) - output_shape = _get_const_output_shape(input_shape, dim=1) - - mask = np.zeros(input_shape + output_shape, np.bool) - if all(d > 0 for d in mask.shape): # pylint: disable=not-an-iterable - mask[(0,) * len(output_shape)] = True - - self.assertAllEqual( - mask, - conv_utils.conv_kernel_mask( + "full", + ) + + def test_conv_kernel_mask_wrong_dims(self, *input_shape): + kernel_shape = 1 + strides = 1 + + conv_utils.conv_kernel_mask(input_shape, kernel_shape, strides, "valid") + + ndims = len(input_shape) + + kernel_shape = (2,) * (ndims + 1) + self.assertRaises( + ValueError, + conv_utils.conv_kernel_mask, + input_shape, + kernel_shape, + strides, + "same", + ) + + strides = (1,) * ndims + self.assertRaises( + ValueError, + conv_utils.conv_kernel_mask, input_shape, kernel_shape, strides, - padding - ) - ) - - def test_conv_kernel_mask_almost_full_stride(self, *input_shape): - padding = 'valid' - ndims = len(input_shape) - kernel_shape = (1,) * ndims - strides = tuple([max(d - 1, 1) for d in input_shape]) - output_shape = _get_const_output_shape(input_shape, dim=2) - - mask = np.zeros(input_shape + output_shape, np.bool) - if all(d > 0 for d in mask.shape): # pylint: disable=not-an-iterable - for in_position in itertools.product(*[[0, d - 1] for d in input_shape]): - out_position = tuple([min(p, 1) for p in in_position]) - mask[in_position + out_position] = True - - self.assertAllEqual( - mask, - conv_utils.conv_kernel_mask( + "valid", + ) + + kernel_shape = (1,) * ndims + strides = (2,) * (ndims - 1) + self.assertRaises( + ValueError, + conv_utils.conv_kernel_mask, input_shape, kernel_shape, strides, - padding - ) - ) - - def test_conv_kernel_mask_rect_kernel(self, *input_shape): - padding = 'valid' - ndims = len(input_shape) - strides 
= (1,) * ndims - - for d in range(ndims): - kernel_shape = [1] * ndims - kernel_shape[d] = input_shape[d] - - output_shape = list(input_shape) - output_shape[d] = min(1, input_shape[d]) - - mask = np.identity(int(np.prod(input_shape)), np.bool) - mask = np.reshape(mask, input_shape * 2) - - for p in itertools.product(*[range(input_shape[dim]) - for dim in range(ndims)]): - p = list(p) - p[d] = slice(None) - mask[p * 2] = True - - mask = np.take(mask, range(0, min(1, input_shape[d])), ndims + d) - - self.assertAllEqual( - mask, - conv_utils.conv_kernel_mask( - input_shape, - kernel_shape, - strides, - padding - ) - ) - - def test_conv_kernel_mask_wrong_padding(self, *input_shape): - ndims = len(input_shape) - kernel_shape = (1,) * ndims - strides = (1,) * ndims - - conv_utils.conv_kernel_mask( - input_shape, - kernel_shape, - strides, - 'valid' - ) - - conv_utils.conv_kernel_mask( - input_shape, - kernel_shape, - strides, - 'same' - ) - - self.assertRaises(NotImplementedError, - conv_utils.conv_kernel_mask, - input_shape, kernel_shape, strides, 'full') - - def test_conv_kernel_mask_wrong_dims(self, *input_shape): - kernel_shape = 1 - strides = 1 - - conv_utils.conv_kernel_mask( - input_shape, - kernel_shape, - strides, - 'valid' - ) - - ndims = len(input_shape) - - kernel_shape = (2,) * (ndims + 1) - self.assertRaises(ValueError, - conv_utils.conv_kernel_mask, - input_shape, kernel_shape, strides, 'same') - - strides = (1,) * ndims - self.assertRaises(ValueError, - conv_utils.conv_kernel_mask, - input_shape, kernel_shape, strides, 'valid') - - kernel_shape = (1,) * ndims - strides = (2,) * (ndims - 1) - self.assertRaises(ValueError, - conv_utils.conv_kernel_mask, - input_shape, kernel_shape, strides, 'valid') - - strides = (2,) * ndims - conv_utils.conv_kernel_mask( - input_shape, - kernel_shape, - strides, - 'valid' - ) - - -if __name__ == '__main__': - tf.test.main() + "valid", + ) + + strides = (2,) * ndims + conv_utils.conv_kernel_mask(input_shape, kernel_shape, strides, "valid") + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/data_utils.py b/keras/utils/data_utils.py index a281c740766b..21f48cb8c237 100644 --- a/keras/utils/data_utils.py +++ b/keras/utils/data_utils.py @@ -12,13 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-import-not-at-top -"""Utilities for file download and caching.""" -import tensorflow.compat.v2 as tf +"""Utilities for file download and caching.""" -from abc import abstractmethod -from contextlib import closing import functools import hashlib import multiprocessing.dummy @@ -32,486 +28,558 @@ import time import typing import urllib +import warnings import weakref import zipfile -from six.moves.urllib.parse import urlsplit +from abc import abstractmethod +from contextlib import closing import numpy as np -from six.moves.urllib.request import urlopen +import tensorflow.compat.v2 as tf +from six.moves.urllib.parse import urlsplit + +from keras.utils import io_utils from keras.utils import tf_inspect from keras.utils.generic_utils import Progbar -from keras.utils import io_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export +from six.moves.urllib.request import urlopen # Required to support google internal urlretrieve -if True: # This gets transformed to `if sys.version_info[0] == 2:` in OSS. 
# pylint: disable=using-constant-test +if True: # This gets transformed to `if sys.version_info[0] == 2:` in OSS. + + def urlretrieve(url, filename, reporthook=None, data=None): + """Replacement for `urlretrieve` for Python 2. + + Under Python 2, `urlretrieve` relies on `FancyURLopener` from legacy + `urllib` module, known to have issues with proxy management. + + Args: + url: url to retrieve. + filename: where to store the retrieved data locally. + reporthook: a hook function that will be called once on + establishment of the network connection and once after each block + read thereafter. The hook will be passed three arguments; a count + of blocks transferred so far, a block size in bytes, and the total + size of the file. + data: `data` argument passed to `urlopen`. + """ + + def chunk_read(response, chunk_size=8192, reporthook=None): + content_type = response.info().get("Content-Length") + total_size = -1 + if content_type is not None: + total_size = int(content_type.strip()) + count = 0 + while True: + chunk = response.read(chunk_size) + count += 1 + if reporthook is not None: + reporthook(count, chunk_size, total_size) + if chunk: + yield chunk + else: + break + + response = urlopen(url, data) + with open(filename, "wb") as fd: + for chunk in chunk_read(response, reporthook=reporthook): + fd.write(chunk) - def urlretrieve(url, filename, reporthook=None, data=None): - """Replacement for `urlretrieve` for Python 2. +else: + from urllib.request import urlretrieve - Under Python 2, `urlretrieve` relies on `FancyURLopener` from legacy - `urllib` module, known to have issues with proxy management. + +def is_generator_or_sequence(x): + """Check if `x` is a Keras generator type.""" + builtin_iterators = (str, list, tuple, dict, set, frozenset) + if isinstance(x, (tf.Tensor, np.ndarray) + builtin_iterators): + return False + return ( + tf_inspect.isgenerator(x) + or isinstance(x, Sequence) + or isinstance(x, typing.Iterator) + ) + + +def _resolve_path(path): + return os.path.realpath(os.path.abspath(path)) + + +def _is_path_in_dir(path, base_dir): + return _resolve_path(os.path.join(base_dir, path)).startswith(base_dir) + + +def _is_link_in_dir(info, base): + tip = _resolve_path(os.path.join(base, os.path.dirname(info.name))) + return _is_path_in_dir(info.linkname, base_dir=tip) + + +def _filter_safe_paths(members): + base_dir = _resolve_path(".") + for finfo in members: + valid_path = False + if _is_path_in_dir(finfo.name, base_dir): + valid_path = True + yield finfo + elif finfo.issym() or finfo.islnk(): + if _is_link_in_dir(finfo, base_dir): + valid_path = True + yield finfo + if not valid_path: + warnings.warn( + "Skipping invalid path during archive extraction: " + f"'{finfo.name}'." + ) + + +def _extract_archive(file_path, path=".", archive_format="auto"): + """Extracts an archive if it matches tar, tar.gz, tar.bz, or zip formats. Args: - url: url to retrieve. - filename: where to store the retrieved data locally. - reporthook: a hook function that will be called once on establishment of - the network connection and once after each block read thereafter. The - hook will be passed three arguments; a count of blocks transferred so - far, a block size in bytes, and the total size of the file. - data: `data` argument passed to `urlopen`. + file_path: Path to the archive file. + path: Where to extract the archive file. + archive_format: Archive format to try for extracting the file. + Options are `'auto'`, `'tar'`, `'zip'`, and `None`. + `'tar'` includes tar, tar.gz, and tar.bz files. 
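
The new `_filter_safe_paths` helper above guards `tarfile` extraction against path traversal. A minimal standalone sketch of the underlying check, with a hypothetical `is_path_in_dir` name and standard-library calls only:

```python
import os

def is_path_in_dir(path, base_dir):
    # Resolve symlinks and ".." segments, then require the result to
    # remain under base_dir -- the same realpath-prefix test the new
    # `_is_path_in_dir` helper applies to every archive member.
    base_dir = os.path.realpath(os.path.abspath(base_dir))
    target = os.path.realpath(os.path.abspath(os.path.join(base_dir, path)))
    return target.startswith(base_dir)

print(is_path_in_dir("data/train.csv", "/tmp/extract"))    # True
print(is_path_in_dir("../../etc/passwd", "/tmp/extract"))  # False
```

Members failing the check are skipped with a warning rather than aborting the whole extraction.
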
+            The default 'auto' is `['tar', 'zip']`.
+            `None` or an empty list will return no matches found.
+
+    Returns:
+        True if a match was found and an archive extraction was completed,
+        False otherwise.
     """
+    if archive_format is None:
+        return False
+    if archive_format == "auto":
+        archive_format = ["tar", "zip"]
+    if isinstance(archive_format, str):
+        archive_format = [archive_format]
+
+    file_path = io_utils.path_to_string(file_path)
+    path = io_utils.path_to_string(path)
+
+    for archive_type in archive_format:
+        if archive_type == "tar":
+            open_fn = tarfile.open
+            is_match_fn = tarfile.is_tarfile
+        if archive_type == "zip":
+            open_fn = zipfile.ZipFile
+            is_match_fn = zipfile.is_zipfile
+
+        if is_match_fn(file_path):
+            with open_fn(file_path) as archive:
+                try:
+                    if zipfile.is_zipfile(file_path):
+                        # Zip archive.
+                        archive.extractall(path)
+                    else:
+                        # Tar archive, perhaps unsafe. Filter paths.
+                        archive.extractall(
+                            path, members=_filter_safe_paths(archive)
+                        )
+                except (tarfile.TarError, RuntimeError, KeyboardInterrupt):
+                    if os.path.exists(path):
+                        if os.path.isfile(path):
+                            os.remove(path)
+                        else:
+                            shutil.rmtree(path)
+                    raise
+            return True
+    return False
-    def chunk_read(response, chunk_size=8192, reporthook=None):
-      content_type = response.info().get('Content-Length')
-      total_size = -1
-      if content_type is not None:
-        total_size = int(content_type.strip())
-      count = 0
-      while True:
-        chunk = response.read(chunk_size)
-        count += 1
-        if reporthook is not None:
-          reporthook(count, chunk_size, total_size)
-        if chunk:
-          yield chunk
-        else:
-          break
-    response = urlopen(url, data)
-    with open(filename, 'wb') as fd:
-      for chunk in chunk_read(response, reporthook=reporthook):
-        fd.write(chunk)
-else:
-  from urllib.request import urlretrieve  # pylint: disable=g-importing-member
+@keras_export("keras.utils.get_file")
+def get_file(
+    fname=None,
+    origin=None,
+    untar=False,
+    md5_hash=None,
+    file_hash=None,
+    cache_subdir="datasets",
+    hash_algorithm="auto",
+    extract=False,
+    archive_format="auto",
+    cache_dir=None,
+):
+    """Downloads a file from a URL if it is not already in the cache.
+
+    By default the file at the url `origin` is downloaded to the
+    cache_dir `~/.keras`, placed in the cache_subdir `datasets`,
+    and given the filename `fname`. The final location of a file
+    `example.txt` would therefore be `~/.keras/datasets/example.txt`.
+
+    Files in tar, tar.gz, tar.bz, and zip formats can also be extracted.
+    Passing a hash will verify the file after download. The command line
+    programs `shasum` and `sha256sum` can compute the hash.
+
+    Example:
+
+    ```python
+    path_to_downloaded_file = tf.keras.utils.get_file(
+        origin="https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz",
+        extract=True,
+    )
+    ```
+
+    Args:
+        fname: Name of the file. If an absolute path `/path/to/file.txt` is
+            specified the file will be saved at that location. If `None`, the
+            name of the file at `origin` will be used.
+        origin: Original URL of the file.
+        untar: Deprecated in favor of `extract` argument.
+            boolean, whether the file should be decompressed
+        md5_hash: Deprecated in favor of `file_hash` argument.
+            md5 hash of the file for verification
+        file_hash: The expected hash string of the file after download.
+            The sha256 and md5 hash algorithms are both supported.
+        cache_subdir: Subdirectory under the Keras cache dir where the file is
+            saved. If an absolute path `/path/to/folder` is
+            specified the file will be saved at that location.
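
Beyond the docstring example above, `file_hash` is the argument that makes the cache trustworthy: an existing cached file is re-verified, and a mismatching download raises. A sketch with a placeholder URL and digest (both hypothetical):

```python
import tensorflow as tf

# Both the URL and the digest below are placeholders -- substitute the
# real location and the hash obtained via `shasum -a 256`.
path = tf.keras.utils.get_file(
    fname="corpus.txt",
    origin="https://example.com/data/corpus.txt",
    file_hash="9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08",
)
```
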
+ hash_algorithm: Select the hash algorithm to verify the file. + options are `'md5'`, `'sha256'`, and `'auto'`. + The default 'auto' detects the hash algorithm in use. + extract: True tries extracting the file as an Archive, like tar or zip. + archive_format: Archive format to try for extracting the file. + Options are `'auto'`, `'tar'`, `'zip'`, and `None`. + `'tar'` includes tar, tar.gz, and tar.bz files. + The default `'auto'` corresponds to `['tar', 'zip']`. + None or an empty list will return no matches found. + cache_dir: Location to store cached files, when None it + defaults to `~/.keras/`. -def is_generator_or_sequence(x): - """Check if `x` is a Keras generator type.""" - builtin_iterators = (str, list, tuple, dict, set, frozenset) - if isinstance(x, (tf.Tensor, np.ndarray) + builtin_iterators): - return False - return (tf_inspect.isgenerator(x) or - isinstance(x, Sequence) or - isinstance(x, typing.Iterator)) - - -def _extract_archive(file_path, path='.', archive_format='auto'): - """Extracts an archive if it matches tar, tar.gz, tar.bz, or zip formats. - - Args: - file_path: path to the archive file - path: path to extract the archive file - archive_format: Archive format to try for extracting the file. - Options are 'auto', 'tar', 'zip', and None. - 'tar' includes tar, tar.gz, and tar.bz files. - The default 'auto' is ['tar', 'zip']. - None or an empty list will return no matches found. - - Returns: - True if a match was found and an archive extraction was completed, - False otherwise. - """ - if archive_format is None: - return False - if archive_format == 'auto': - archive_format = ['tar', 'zip'] - if isinstance(archive_format, str): - archive_format = [archive_format] - - file_path = io_utils.path_to_string(file_path) - path = io_utils.path_to_string(path) - - for archive_type in archive_format: - if archive_type == 'tar': - open_fn = tarfile.open - is_match_fn = tarfile.is_tarfile - if archive_type == 'zip': - open_fn = zipfile.ZipFile - is_match_fn = zipfile.is_zipfile - - if is_match_fn(file_path): - with open_fn(file_path) as archive: - try: - archive.extractall(path) - except (tarfile.TarError, RuntimeError, KeyboardInterrupt): - if os.path.exists(path): - if os.path.isfile(path): - os.remove(path) - else: - shutil.rmtree(path) - raise - return True - return False - - -@keras_export('keras.utils.get_file') -def get_file(fname=None, - origin=None, - untar=False, - md5_hash=None, - file_hash=None, - cache_subdir='datasets', - hash_algorithm='auto', - extract=False, - archive_format='auto', - cache_dir=None): - """Downloads a file from a URL if it not already in the cache. - - By default the file at the url `origin` is downloaded to the - cache_dir `~/.keras`, placed in the cache_subdir `datasets`, - and given the filename `fname`. The final location of a file - `example.txt` would therefore be `~/.keras/datasets/example.txt`. - - Files in tar, tar.gz, tar.bz, and zip formats can also be extracted. - Passing a hash will verify the file after download. The command line - programs `shasum` and `sha256sum` can compute the hash. - - Example: - - ```python - path_to_downloaded_file = tf.keras.utils.get_file( - "flower_photos", - "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz", - untar=True) - ``` - - Args: - fname: Name of the file. If an absolute path `/path/to/file.txt` is - specified the file will be saved at that location. If `None`, the - name of the file at `origin` will be used. - origin: Original URL of the file. 
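
Per the warning above, the safest workflow is to compute the expected digest over a trusted channel and pass it in. A chunked sha256 helper in the spirit of this module's `_hash_file` (the function name here is illustrative):

```python
import hashlib

def sha256_of(fpath, chunk_size=65535):
    # Stream the file in chunks so arbitrarily large downloads can be
    # hashed without loading them into memory.
    hasher = hashlib.sha256()
    with open(fpath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
```

The result is what `file_hash=` expects; a 64-character digest is auto-detected as sha256.
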
- untar: Deprecated in favor of `extract` argument. - boolean, whether the file should be decompressed - md5_hash: Deprecated in favor of `file_hash` argument. - md5 hash of the file for verification - file_hash: The expected hash string of the file after download. - The sha256 and md5 hash algorithms are both supported. - cache_subdir: Subdirectory under the Keras cache dir where the file is - saved. If an absolute path `/path/to/folder` is - specified the file will be saved at that location. - hash_algorithm: Select the hash algorithm to verify the file. - options are `'md5'`, `'sha256'`, and `'auto'`. - The default 'auto' detects the hash algorithm in use. - extract: True tries extracting the file as an Archive, like tar or zip. - archive_format: Archive format to try for extracting the file. - Options are `'auto'`, `'tar'`, `'zip'`, and `None`. - `'tar'` includes tar, tar.gz, and tar.bz files. - The default `'auto'` corresponds to `['tar', 'zip']`. - None or an empty list will return no matches found. - cache_dir: Location to store cached files, when None it - defaults to the default directory `~/.keras/`. - - Returns: - Path to the downloaded file - """ - if origin is None: - raise ValueError('Please specify the "origin" argument (URL of the file ' - 'to download).') - - if cache_dir is None: - cache_dir = os.path.join(os.path.expanduser('~'), '.keras') - if md5_hash is not None and file_hash is None: - file_hash = md5_hash - hash_algorithm = 'md5' - datadir_base = os.path.expanduser(cache_dir) - if not os.access(datadir_base, os.W_OK): - datadir_base = os.path.join('/tmp', '.keras') - datadir = os.path.join(datadir_base, cache_subdir) - _makedirs_exist_ok(datadir) - - fname = io_utils.path_to_string(fname) - if not fname: - fname = os.path.basename(urlsplit(origin).path) - if not fname: - raise ValueError( - f"Can't parse the file name from the origin provided: '{origin}'." - "Please specify the `fname` as the input param.") - - if untar: - if fname.endswith('.tar.gz'): - fname = pathlib.Path(fname) - # The 2 `.with_suffix()` are because of `.tar.gz` as pathlib - # considers it as 2 suffixes. - fname = fname.with_suffix('').with_suffix('') - fname = str(fname) - untar_fpath = os.path.join(datadir, fname) - fpath = untar_fpath + '.tar.gz' - else: - fpath = os.path.join(datadir, fname) - - download = False - if os.path.exists(fpath): - # File found; verify integrity if a hash was provided. 
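
The `untar` branch above strips a `.tar.gz` name with two chained `.with_suffix("")` calls because `pathlib` treats the extension as two separate suffixes:

```python
import pathlib

fname = pathlib.Path("dataset.tar.gz")
print(fname.suffixes)                         # ['.tar', '.gz']
print(fname.with_suffix("").with_suffix(""))  # dataset
```
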
- if file_hash is not None: - if not validate_file(fpath, file_hash, algorithm=hash_algorithm): - io_utils.print_msg( - 'A local file was found, but it seems to be ' - f'incomplete or outdated because the {hash_algorithm} ' - f'file hash does not match the original value of {file_hash} ' - 'so we will re-download the data.') - download = True - else: - download = True - - if download: - io_utils.print_msg(f'Downloading data from {origin}') - - class DLProgbar: - """Manage progress bar state for use in urlretrieve.""" - - def __init__(self): - self.progbar = None - self.finished = False - - def __call__(self, block_num, block_size, total_size): - if not self.progbar: - if total_size == -1: - total_size = None - self.progbar = Progbar(total_size) - current = block_num * block_size - if current < total_size: - self.progbar.update(current) - elif not self.finished: - self.progbar.update(self.progbar.target) - self.finished = True - - error_msg = 'URL fetch failure on {}: {} -- {}' - try: - try: - urlretrieve(origin, fpath, DLProgbar()) - except urllib.error.HTTPError as e: - raise Exception(error_msg.format(origin, e.code, e.msg)) - except urllib.error.URLError as e: - raise Exception(error_msg.format(origin, e.errno, e.reason)) - except (Exception, KeyboardInterrupt) as e: - if os.path.exists(fpath): - os.remove(fpath) - raise - - # Validate download if succeeded and user provided an expected hash - # Security conscious users would get the hash of the file from a separate - # channel and pass it to this API to prevent MITM / corruption: - if os.path.exists(fpath) and file_hash is not None: - if not validate_file(fpath, file_hash, algorithm=hash_algorithm): - raise ValueError( - f'Incomplete or corrupted file detected. The {hash_algorithm} ' - f'file hash does not match the provided value of {file_hash}.') + Returns: + Path to the downloaded file. - if untar: - if not os.path.exists(untar_fpath): - _extract_archive(fpath, datadir, archive_format='tar') - return untar_fpath + ⚠️ **Warning on malicious downloads** ⚠️ - if extract: - _extract_archive(fpath, datadir, archive_format) + Downloading something from the Internet carries a risk. + NEVER download a file/archive if you do not trust the source. + We recommend that you specify the `file_hash` argument + (if the hash of the source file is known) to make sure that the file you + are getting is the one you expect. + """ + if origin is None: + raise ValueError( + 'Please specify the "origin" argument (URL of the file ' + "to download)." + ) + + if cache_dir is None: + cache_dir = os.path.join(os.path.expanduser("~"), ".keras") + if md5_hash is not None and file_hash is None: + file_hash = md5_hash + hash_algorithm = "md5" + datadir_base = os.path.expanduser(cache_dir) + if not os.access(datadir_base, os.W_OK): + datadir_base = os.path.join("/tmp", ".keras") + datadir = os.path.join(datadir_base, cache_subdir) + _makedirs_exist_ok(datadir) + + fname = io_utils.path_to_string(fname) + if not fname: + fname = os.path.basename(urlsplit(origin).path) + if not fname: + raise ValueError( + "Can't parse the file name from the origin provided: " + f"'{origin}'." + "Please specify the `fname` as the input param." + ) + + if untar: + if fname.endswith(".tar.gz"): + fname = pathlib.Path(fname) + # The 2 `.with_suffix()` are because of `.tar.gz` as pathlib + # considers it as 2 suffixes. 
+ fname = fname.with_suffix("").with_suffix("") + fname = str(fname) + untar_fpath = os.path.join(datadir, fname) + fpath = untar_fpath + ".tar.gz" + else: + fpath = os.path.join(datadir, fname) + + download = False + if os.path.exists(fpath): + # File found; verify integrity if a hash was provided. + if file_hash is not None: + if not validate_file(fpath, file_hash, algorithm=hash_algorithm): + io_utils.print_msg( + "A local file was found, but it seems to be " + f"incomplete or outdated because the {hash_algorithm} " + "file hash does not match the original value of " + f"{file_hash} " + "so we will re-download the data." + ) + download = True + else: + download = True - return fpath + if download: + io_utils.print_msg(f"Downloading data from {origin}") + + class DLProgbar: + """Manage progress bar state for use in urlretrieve.""" + + def __init__(self): + self.progbar = None + self.finished = False + + def __call__(self, block_num, block_size, total_size): + if not self.progbar: + if total_size == -1: + total_size = None + self.progbar = Progbar(total_size) + current = block_num * block_size + + if total_size is None: + self.progbar.update(current) + else: + if current < total_size: + self.progbar.update(current) + elif not self.finished: + self.progbar.update(self.progbar.target) + self.finished = True + + error_msg = "URL fetch failure on {}: {} -- {}" + try: + try: + urlretrieve(origin, fpath, DLProgbar()) + except urllib.error.HTTPError as e: + raise Exception(error_msg.format(origin, e.code, e.msg)) + except urllib.error.URLError as e: + raise Exception(error_msg.format(origin, e.errno, e.reason)) + except (Exception, KeyboardInterrupt): + if os.path.exists(fpath): + os.remove(fpath) + raise + + # Validate download if succeeded and user provided an expected hash + # Security conscious users would get the hash of the file from a + # separate channel and pass it to this API to prevent MITM / corruption: + if os.path.exists(fpath) and file_hash is not None: + if not validate_file(fpath, file_hash, algorithm=hash_algorithm): + raise ValueError( + "Incomplete or corrupted file detected. " + f"The {hash_algorithm} " + "file hash does not match the provided value " + f"of {file_hash}." + ) + + if untar: + if not os.path.exists(untar_fpath): + _extract_archive(fpath, datadir, archive_format="tar") + return untar_fpath + + if extract: + _extract_archive(fpath, datadir, archive_format) + + return fpath def _makedirs_exist_ok(datadir): - os.makedirs(datadir, exist_ok=True) # pylint: disable=unexpected-keyword-arg + os.makedirs(datadir, exist_ok=True) def _resolve_hasher(algorithm, file_hash=None): - """Returns hash algorithm as hashlib function.""" - if algorithm == 'sha256': - return hashlib.sha256() + """Returns hash algorithm as hashlib function.""" + if algorithm == "sha256": + return hashlib.sha256() - if algorithm == 'auto' and file_hash is not None and len(file_hash) == 64: - return hashlib.sha256() + if algorithm == "auto" and file_hash is not None and len(file_hash) == 64: + return hashlib.sha256() - # This is used only for legacy purposes. - return hashlib.md5() + # This is used only for legacy purposes. + return hashlib.md5() -def _hash_file(fpath, algorithm='sha256', chunk_size=65535): - """Calculates a file sha256 or md5 hash. +def _hash_file(fpath, algorithm="sha256", chunk_size=65535): + """Calculates a file sha256 or md5 hash. 
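
`_resolve_hasher` above encodes the `'auto'` rule: a 64-character digest can only be sha256, while shorter legacy digests fall back to md5. A self-contained restatement (the standalone function name is illustrative):

```python
import hashlib

def resolve_hasher(algorithm, file_hash=None):
    # "sha256" explicitly, or "auto" with a 64-character digest,
    # selects sha256; everything else is treated as legacy md5.
    if algorithm == "sha256":
        return hashlib.sha256()
    if algorithm == "auto" and file_hash is not None and len(file_hash) == 64:
        return hashlib.sha256()
    return hashlib.md5()

print(resolve_hasher("auto", "a" * 64).name)  # sha256
print(resolve_hasher("auto", "a" * 32).name)  # md5
```
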
- Example: + Example: - ```python - _hash_file('/path/to/file.zip') - 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855' - ``` + ```python + _hash_file('/path/to/file.zip') + 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855' + ``` - Args: - fpath: path to the file being validated - algorithm: hash algorithm, one of `'auto'`, `'sha256'`, or `'md5'`. - The default `'auto'` detects the hash algorithm in use. - chunk_size: Bytes to read at a time, important for large files. + Args: + fpath: Path to the file being validated. + algorithm: Hash algorithm, one of `'auto'`, `'sha256'`, or `'md5'`. + The default `'auto'` detects the hash algorithm in use. + chunk_size: Bytes to read at a time, important for large files. - Returns: - The file hash - """ - if isinstance(algorithm, str): - hasher = _resolve_hasher(algorithm) - else: - hasher = algorithm + Returns: + The file hash. + """ + if isinstance(algorithm, str): + hasher = _resolve_hasher(algorithm) + else: + hasher = algorithm - with open(fpath, 'rb') as fpath_file: - for chunk in iter(lambda: fpath_file.read(chunk_size), b''): - hasher.update(chunk) + with open(fpath, "rb") as fpath_file: + for chunk in iter(lambda: fpath_file.read(chunk_size), b""): + hasher.update(chunk) - return hasher.hexdigest() + return hasher.hexdigest() -def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535): - """Validates a file against a sha256 or md5 hash. +def validate_file(fpath, file_hash, algorithm="auto", chunk_size=65535): + """Validates a file against a sha256 or md5 hash. - Args: - fpath: path to the file being validated - file_hash: The expected hash string of the file. - The sha256 and md5 hash algorithms are both supported. - algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'. - The default 'auto' detects the hash algorithm in use. - chunk_size: Bytes to read at a time, important for large files. + Args: + fpath: path to the file being validated + file_hash: The expected hash string of the file. + The sha256 and md5 hash algorithms are both supported. + algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'. + The default 'auto' detects the hash algorithm in use. + chunk_size: Bytes to read at a time, important for large files. - Returns: - Whether the file is valid - """ - hasher = _resolve_hasher(algorithm, file_hash) + Returns: + Whether the file is valid + """ + hasher = _resolve_hasher(algorithm, file_hash) - if str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash): - return True - else: - return False + if str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash): + return True + else: + return False class ThreadsafeIter: - """Wrap an iterator with a lock and propagate exceptions to all threads.""" + """Wrap an iterator with a lock and propagate exceptions to all threads.""" - def __init__(self, it): - self.it = it - self.lock = threading.Lock() + def __init__(self, it): + self.it = it + self.lock = threading.Lock() - # After a generator throws an exception all subsequent next() calls raise a - # StopIteration Exception. This, however, presents an issue when mixing - # generators and threading because it means the order of retrieval need not - # match the order in which the generator was called. This can make it appear - # that a generator exited normally when in fact the terminating exception is - # just in a different thread. In order to provide thread safety, once - # self.it has thrown an exception we continue to throw the same exception. 
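
`ThreadsafeIter` serializes `next()` calls behind a lock and, as the comment block explains, keeps re-raising the first exception so no thread mistakes a crashed generator for a clean stop. A small demonstration, assuming the `keras.utils.data_utils` import path used by the tests in this patch:

```python
import threading
from keras.utils import data_utils

def counter():
    for i in range(100):
        yield i

gen = data_utils.ThreadsafeIter(counter())
results = []

def consume():
    # Four threads pull from the same wrapped generator; the lock
    # guarantees each value is handed out exactly once.
    for _ in range(10):
        results.append(next(gen))

threads = [threading.Thread(target=consume) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(sorted(results) == list(range(40)))  # True: no value lost or duplicated
```
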
- self._exception = None + # After a generator throws an exception all subsequent next() calls + # raise a StopIteration Exception. This, however, presents an issue when + # mixing generators and threading because it means the order of + # retrieval need not match the order in which the generator was called. + # This can make it appear that a generator exited normally when in fact + # the terminating exception is just in a different thread. In order to + # provide thread safety, once self.it has thrown an exception we + # continue to throw the same exception. + self._exception = None - def __iter__(self): - return self + def __iter__(self): + return self - def next(self): - return self.__next__() + def next(self): + return self.__next__() - def __next__(self): - with self.lock: - if self._exception: - raise self._exception # pylint: disable=raising-bad-type + def __next__(self): + with self.lock: + if self._exception: + raise self._exception - try: - return next(self.it) - except Exception as e: - self._exception = e - raise + try: + return next(self.it) + except Exception as e: + self._exception = e + raise def threadsafe_generator(f): + @functools.wraps(f) + def g(*a, **kw): + return ThreadsafeIter(f(*a, **kw)) - @functools.wraps(f) - def g(*a, **kw): - return ThreadsafeIter(f(*a, **kw)) + return g - return g - -@keras_export('keras.utils.Sequence') +@keras_export("keras.utils.Sequence") class Sequence: - """Base object for fitting to a sequence of data, such as a dataset. + """Base object for fitting to a sequence of data, such as a dataset. - Every `Sequence` must implement the `__getitem__` and the `__len__` methods. - If you want to modify your dataset between epochs you may implement - `on_epoch_end`. - The method `__getitem__` should return a complete batch. + Every `Sequence` must implement the `__getitem__` and the `__len__` methods. + If you want to modify your dataset between epochs, you may implement + `on_epoch_end`. The method `__getitem__` should return a complete batch. - Notes: + Notes: - `Sequence` are a safer way to do multiprocessing. This structure guarantees - that the network will only train once - on each sample per epoch which is not the case with generators. + `Sequence` is a safer way to do multiprocessing. This structure guarantees + that the network will only train once on each sample per epoch, which is not + the case with generators. - Examples: + Examples: - ```python - from skimage.io import imread - from skimage.transform import resize - import numpy as np - import math + ```python + from skimage.io import imread + from skimage.transform import resize + import numpy as np + import math - # Here, `x_set` is list of path to the images - # and `y_set` are the associated classes. + # Here, `x_set` is list of path to the images + # and `y_set` are the associated classes. 
-  class CIFAR10Sequence(tf.keras.utils.Sequence):
+    class CIFAR10Sequence(tf.keras.utils.Sequence):

-      def __init__(self, x_set, y_set, batch_size):
-          self.x, self.y = x_set, y_set
-          self.batch_size = batch_size
+        def __init__(self, x_set, y_set, batch_size):
+            self.x, self.y = x_set, y_set
+            self.batch_size = batch_size

-      def __len__(self):
-          return math.ceil(len(self.x) / self.batch_size)
+        def __len__(self):
+            return math.ceil(len(self.x) / self.batch_size)

-      def __getitem__(self, idx):
-          batch_x = self.x[idx * self.batch_size:(idx + 1) *
-          self.batch_size]
-          batch_y = self.y[idx * self.batch_size:(idx + 1) *
-          self.batch_size]
+        def __getitem__(self, idx):
+            low = idx * self.batch_size
+            # Cap upper bound at array length; the last batch may be smaller
+            # if the total number of items is not a multiple of batch size.
+            high = min(low + self.batch_size, len(self.x))
+            batch_x = self.x[low:high]
+            batch_y = self.y[low:high]

-          return np.array([
-              resize(imread(file_name), (200, 200))
-                 for file_name in batch_x]), np.array(batch_y)
-  ```
-  """
+            return np.array([
+                resize(imread(file_name), (200, 200))
+                   for file_name in batch_x]), np.array(batch_y)
+    ```
+    """

-  @abstractmethod
-  def __getitem__(self, index):
-    """Gets batch at position `index`.
+    @abstractmethod
+    def __getitem__(self, index):
+        """Gets batch at position `index`.

-    Args:
-      index: position of the batch in the Sequence.
+        Args:
+            index: position of the batch in the Sequence.

-    Returns:
-      A batch
-    """
-    raise NotImplementedError
+        Returns:
+            A batch
+        """
+        raise NotImplementedError

-  @abstractmethod
-  def __len__(self):
-    """Number of batch in the Sequence.
+    @abstractmethod
+    def __len__(self):
+        """Number of batches in the Sequence.

-    Returns:
-      The number of batches in the Sequence.
-    """
-    raise NotImplementedError
+        Returns:
+            The number of batches in the Sequence.
+        """
+        raise NotImplementedError

-  def on_epoch_end(self):
-    """Method called at the end of every epoch.
-    """
-    pass
+    def on_epoch_end(self):
+        """Method called at the end of every epoch."""
+        pass

-  def __iter__(self):
-    """Create a generator that iterate over the Sequence."""
-    for item in (self[i] for i in range(len(self))):
-      yield item
+    def __iter__(self):
+        """Create a generator that iterates over the Sequence."""
+        for item in (self[i] for i in range(len(self))):
+            yield item


 def iter_sequence_infinite(seq):
-  """Iterates indefinitely over a Sequence.
+    """Iterates indefinitely over a Sequence.

-  Args:
-    seq: `Sequence` instance.
+    Args:
+        seq: `Sequence` instance.

-  Yields:
-    Batches of data from the `Sequence`.
+    Yields:
+        Batches of data from the `Sequence`.
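
The docstring's CIFAR10Sequence relies on scikit-image; a dependency-free variant over in-memory arrays shows the same contract, including the shorter final batch the new `min()` bound allows:

```python
import math
import numpy as np
import tensorflow as tf

class ArraySequence(tf.keras.utils.Sequence):
    def __init__(self, x, y, batch_size):
        self.x, self.y, self.batch_size = x, y, batch_size

    def __len__(self):
        # ceil() so the final, possibly smaller batch is still served.
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        low = idx * self.batch_size
        high = min(low + self.batch_size, len(self.x))
        return self.x[low:high], self.y[low:high]

seq = ArraySequence(np.random.rand(10, 4), np.random.randint(2, size=10), 3)
print(len(seq))         # 4 batches: 3 + 3 + 3 + 1
print(seq[3][0].shape)  # (1, 4) -- the short final batch
```
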
+ """ + while True: + for item in seq: + yield item # Global variables to be shared across processes @@ -531,522 +599,548 @@ def iter_sequence_infinite(seq): def dont_use_multiprocessing_pool(f): - @functools.wraps(f) - def wrapped(*args, **kwargs): - with _FORCE_THREADPOOL_LOCK: - global _FORCE_THREADPOOL - old_force_threadpool, _FORCE_THREADPOOL = _FORCE_THREADPOOL, True - out = f(*args, **kwargs) - _FORCE_THREADPOOL = old_force_threadpool - return out - return wrapped + @functools.wraps(f) + def wrapped(*args, **kwargs): + with _FORCE_THREADPOOL_LOCK: + global _FORCE_THREADPOOL + old_force_threadpool, _FORCE_THREADPOOL = _FORCE_THREADPOOL, True + out = f(*args, **kwargs) + _FORCE_THREADPOOL = old_force_threadpool + return out + + return wrapped def get_pool_class(use_multiprocessing): - global _FORCE_THREADPOOL - if not use_multiprocessing or _FORCE_THREADPOOL: - return multiprocessing.dummy.Pool # ThreadPool - return multiprocessing.Pool + global _FORCE_THREADPOOL + if not use_multiprocessing or _FORCE_THREADPOOL: + return multiprocessing.dummy.Pool # ThreadPool + return multiprocessing.Pool def get_worker_id_queue(): - """Lazily create the queue to track worker ids.""" - global _WORKER_ID_QUEUE - if _WORKER_ID_QUEUE is None: - _WORKER_ID_QUEUE = multiprocessing.Queue() - return _WORKER_ID_QUEUE + """Lazily create the queue to track worker ids.""" + global _WORKER_ID_QUEUE + if _WORKER_ID_QUEUE is None: + _WORKER_ID_QUEUE = multiprocessing.Queue() + return _WORKER_ID_QUEUE def init_pool(seqs): - global _SHARED_SEQUENCES - _SHARED_SEQUENCES = seqs + global _SHARED_SEQUENCES + _SHARED_SEQUENCES = seqs def get_index(uid, i): - """Get the value from the Sequence `uid` at index `i`. + """Get the value from the Sequence `uid` at index `i`. - To allow multiple Sequences to be used at the same time, we use `uid` to - get a specific one. A single Sequence would cause the validation to - overwrite the training Sequence. + To allow multiple Sequences to be used at the same time, we use `uid` to + get a specific one. A single Sequence would cause the validation to + overwrite the training Sequence. - Args: - uid: int, Sequence identifier - i: index + Args: + uid: int, Sequence identifier + i: index - Returns: - The value at index `i`. - """ - return _SHARED_SEQUENCES[uid][i] + Returns: + The value at index `i`. + """ + return _SHARED_SEQUENCES[uid][i] -@keras_export('keras.utils.SequenceEnqueuer') +@keras_export("keras.utils.SequenceEnqueuer") class SequenceEnqueuer: - """Base class to enqueue inputs. - - The task of an Enqueuer is to use parallelism to speed up preprocessing. - This is done with processes or threads. - - Example: - - ```python - enqueuer = SequenceEnqueuer(...) - enqueuer.start() - datas = enqueuer.get() - for data in datas: - # Use the inputs; training, evaluating, predicting. - # ... stop sometime. - enqueuer.stop() - ``` - - The `enqueuer.get()` should be an infinite stream of data. - """ - - def __init__(self, sequence, - use_multiprocessing=False): - self.sequence = sequence - self.use_multiprocessing = use_multiprocessing - - global _SEQUENCE_COUNTER - if _SEQUENCE_COUNTER is None: - try: - _SEQUENCE_COUNTER = multiprocessing.Value('i', 0) - except OSError: - # In this case the OS does not allow us to use - # multiprocessing. We resort to an int - # for enqueuer indexing. - _SEQUENCE_COUNTER = 0 - - if isinstance(_SEQUENCE_COUNTER, int): - self.uid = _SEQUENCE_COUNTER - _SEQUENCE_COUNTER += 1 - else: - # Doing Multiprocessing.Value += x is not process-safe. 
- with _SEQUENCE_COUNTER.get_lock(): - self.uid = _SEQUENCE_COUNTER.value - _SEQUENCE_COUNTER.value += 1 + """Base class to enqueue inputs. - self.workers = 0 - self.executor_fn = None - self.queue = None - self.run_thread = None - self.stop_signal = None + The task of an Enqueuer is to use parallelism to speed up preprocessing. + This is done with processes or threads. - def is_running(self): - return self.stop_signal is not None and not self.stop_signal.is_set() + Example: - def start(self, workers=1, max_queue_size=10): - """Starts the handler's workers. + ```python + enqueuer = SequenceEnqueuer(...) + enqueuer.start() + datas = enqueuer.get() + for data in datas: + # Use the inputs; training, evaluating, predicting. + # ... stop sometime. + enqueuer.stop() + ``` - Args: - workers: Number of workers. - max_queue_size: queue size - (when full, workers could block on `put()`) + The `enqueuer.get()` should be an infinite stream of data. """ - if self.use_multiprocessing: - self.executor_fn = self._get_executor_init(workers) - else: - # We do not need the init since it's threads. - self.executor_fn = lambda _: get_pool_class(False)(workers) - self.workers = workers - self.queue = queue.Queue(max_queue_size) - self.stop_signal = threading.Event() - self.run_thread = threading.Thread(target=self._run) - self.run_thread.daemon = True - self.run_thread.start() - def _send_sequence(self): - """Sends current Iterable to all workers.""" - # For new processes that may spawn - _SHARED_SEQUENCES[self.uid] = self.sequence - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. + def __init__(self, sequence, use_multiprocessing=False): + self.sequence = sequence + self.use_multiprocessing = use_multiprocessing + + global _SEQUENCE_COUNTER + if _SEQUENCE_COUNTER is None: + try: + _SEQUENCE_COUNTER = multiprocessing.Value("i", 0) + except OSError: + # In this case the OS does not allow us to use + # multiprocessing. We resort to an int + # for enqueuer indexing. + _SEQUENCE_COUNTER = 0 + + if isinstance(_SEQUENCE_COUNTER, int): + self.uid = _SEQUENCE_COUNTER + _SEQUENCE_COUNTER += 1 + else: + # Doing Multiprocessing.Value += x is not process-safe. + with _SEQUENCE_COUNTER.get_lock(): + self.uid = _SEQUENCE_COUNTER.value + _SEQUENCE_COUNTER.value += 1 + + self.workers = 0 + self.executor_fn = None + self.queue = None + self.run_thread = None + self.stop_signal = None + + def is_running(self): + return self.stop_signal is not None and not self.stop_signal.is_set() + + def start(self, workers=1, max_queue_size=10): + """Starts the handler's workers. + + Args: + workers: Number of workers. + max_queue_size: queue size + (when full, workers could block on `put()`) + """ + if self.use_multiprocessing: + self.executor_fn = self._get_executor_init(workers) + else: + # We do not need the init since it's threads. + self.executor_fn = lambda _: get_pool_class(False)(workers) + self.workers = workers + self.queue = queue.Queue(max_queue_size) + self.stop_signal = threading.Event() + self.run_thread = threading.Thread(target=self._run) + self.run_thread.daemon = True + self.run_thread.start() + + def _send_sequence(self): + """Sends current Iterable to all workers.""" + # For new processes that may spawn + _SHARED_SEQUENCES[self.uid] = self.sequence + + def stop(self, timeout=None): + """Stops running threads and wait for them to exit, if necessary. + + Should be called by the same thread which called `start()`. 
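
The uid allocation above is the textbook use of `multiprocessing.Value`: plain `+=` is a read-modify-write race across processes, so the increment happens under the Value's own lock. In isolation:

```python
import multiprocessing

counter = multiprocessing.Value("i", 0)

def next_uid(counter):
    # `counter.value += 1` alone is not process-safe; holding the
    # Value's lock makes the read-increment-write atomic.
    with counter.get_lock():
        uid = counter.value
        counter.value += 1
    return uid

print(next_uid(counter), next_uid(counter))  # 0 1
```
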
+ + Args: + timeout: maximum time to wait on `thread.join()` + """ + self.stop_signal.set() + with self.queue.mutex: + self.queue.queue.clear() + self.queue.unfinished_tasks = 0 + self.queue.not_full.notify() + self.run_thread.join(timeout) + _SHARED_SEQUENCES[self.uid] = None + + def __del__(self): + if self.is_running(): + self.stop() - Should be called by the same thread which called `start()`. + @abstractmethod + def _run(self): + """Submits request to the executor and queue the `Future` objects.""" + raise NotImplementedError - Args: - timeout: maximum time to wait on `thread.join()` - """ - self.stop_signal.set() - with self.queue.mutex: - self.queue.queue.clear() - self.queue.unfinished_tasks = 0 - self.queue.not_full.notify() - self.run_thread.join(timeout) - _SHARED_SEQUENCES[self.uid] = None - - def __del__(self): - if self.is_running(): - self.stop() - - @abstractmethod - def _run(self): - """Submits request to the executor and queue the `Future` objects.""" - raise NotImplementedError - - @abstractmethod - def _get_executor_init(self, workers): - """Gets the Pool initializer for multiprocessing. + @abstractmethod + def _get_executor_init(self, workers): + """Gets the Pool initializer for multiprocessing. - Args: - workers: Number of workers. + Args: + workers: Number of workers. - Returns: - Function, a Function to initialize the pool - """ - raise NotImplementedError + Returns: + Function, a Function to initialize the pool + """ + raise NotImplementedError - @abstractmethod - def get(self): - """Creates a generator to extract data from the queue. + @abstractmethod + def get(self): + """Creates a generator to extract data from the queue. - Skip the data if it is `None`. - # Returns - Generator yielding tuples `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - """ - raise NotImplementedError + Skip the data if it is `None`. + # Returns + Generator yielding tuples `(inputs, targets)` + or `(inputs, targets, sample_weights)`. + """ + raise NotImplementedError -@keras_export('keras.utils.OrderedEnqueuer') +@keras_export("keras.utils.OrderedEnqueuer") class OrderedEnqueuer(SequenceEnqueuer): - """Builds a Enqueuer from a Sequence. - - Args: - sequence: A `tf.keras.utils.data_utils.Sequence` object. - use_multiprocessing: use multiprocessing if True, otherwise threading - shuffle: whether to shuffle the data at the beginning of each epoch - """ - - def __init__(self, sequence, use_multiprocessing=False, shuffle=False): - super().__init__(sequence, use_multiprocessing) - self.shuffle = shuffle - - def _get_executor_init(self, workers): - """Gets the Pool initializer for multiprocessing. + """Builds a Enqueuer from a Sequence. Args: - workers: Number of workers. - - Returns: - Function, a Function to initialize the pool + sequence: A `tf.keras.utils.data_utils.Sequence` object. 
+ use_multiprocessing: use multiprocessing if True, otherwise threading + shuffle: whether to shuffle the data at the beginning of each epoch """ - def pool_fn(seqs): - pool = get_pool_class(True)( - workers, initializer=init_pool_generator, - initargs=(seqs, None, get_worker_id_queue())) - _DATA_POOLS.add(pool) - return pool - - return pool_fn - - def _wait_queue(self): - """Wait for the queue to be empty.""" - while True: - time.sleep(0.1) - if self.queue.unfinished_tasks == 0 or self.stop_signal.is_set(): - return - - def _run(self): - """Submits request to the executor and queue the `Future` objects.""" - sequence = list(range(len(self.sequence))) - self._send_sequence() # Share the initial sequence - while True: - if self.shuffle: - random.shuffle(sequence) - - with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor: - for i in sequence: - if self.stop_signal.is_set(): - return - - self.queue.put( - executor.apply_async(get_index, (self.uid, i)), block=True) - - # Done with the current epoch, waiting for the final batches - self._wait_queue() - - if self.stop_signal.is_set(): - # We're done - return - - # Call the internal on epoch end. - self.sequence.on_epoch_end() - self._send_sequence() # Update the pool - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - Yields: - The next element in the queue, i.e. a tuple - `(inputs, targets)` or - `(inputs, targets, sample_weights)`. - """ - while self.is_running(): - try: - inputs = self.queue.get(block=True, timeout=5).get() - if self.is_running(): - self.queue.task_done() - if inputs is not None: - yield inputs - except queue.Empty: - pass - except Exception as e: # pylint: disable=broad-except - self.stop() - raise e + def __init__(self, sequence, use_multiprocessing=False, shuffle=False): + super().__init__(sequence, use_multiprocessing) + self.shuffle = shuffle + + def _get_executor_init(self, workers): + """Gets the Pool initializer for multiprocessing. + + Args: + workers: Number of workers. + + Returns: + Function, a Function to initialize the pool + """ + + def pool_fn(seqs): + pool = get_pool_class(True)( + workers, + initializer=init_pool_generator, + initargs=(seqs, None, get_worker_id_queue()), + ) + _DATA_POOLS.add(pool) + return pool + + return pool_fn + + def _wait_queue(self): + """Wait for the queue to be empty.""" + while True: + time.sleep(0.1) + if self.queue.unfinished_tasks == 0 or self.stop_signal.is_set(): + return + + def _run(self): + """Submits request to the executor and queue the `Future` objects.""" + sequence = list(range(len(self.sequence))) + self._send_sequence() # Share the initial sequence + while True: + if self.shuffle: + random.shuffle(sequence) + + with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor: + for i in sequence: + if self.stop_signal.is_set(): + return + + self.queue.put( + executor.apply_async(get_index, (self.uid, i)), + block=True, + ) + + # Done with the current epoch, waiting for the final batches + self._wait_queue() + + if self.stop_signal.is_set(): + # We're done + return + + # Call the internal on epoch end. + self.sequence.on_epoch_end() + self._send_sequence() # Update the pool + + def get(self): + """Creates a generator to extract data from the queue. + + Skip the data if it is `None`. + + Yields: + The next element in the queue, i.e. a tuple + `(inputs, targets)` or + `(inputs, targets, sample_weights)`. 
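
End to end, `OrderedEnqueuer` preserves batch order even with several workers, because futures are enqueued and consumed in submission order. A thread-based sketch with a toy `Sequence` (process workers would behave the same way):

```python
import numpy as np
import tensorflow as tf

class ToySequence(tf.keras.utils.Sequence):
    def __init__(self, n):
        self.n = n

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        return np.full((2, 2), idx)

# Threads rather than processes keep the sketch portable.
enqueuer = tf.keras.utils.OrderedEnqueuer(
    ToySequence(8), use_multiprocessing=False, shuffle=False
)
enqueuer.start(workers=2, max_queue_size=4)
stream = enqueuer.get()  # infinite, ordered stream of batches
print([int(next(stream)[0, 0]) for _ in range(8)])  # [0, 1, ..., 7]
enqueuer.stop()
```
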
+ """ + while self.is_running(): + try: + inputs = self.queue.get(block=True, timeout=5).get() + if self.is_running(): + self.queue.task_done() + if inputs is not None: + yield inputs + except queue.Empty: + pass + except Exception as e: + self.stop() + raise e def init_pool_generator(gens, random_seed=None, id_queue=None): - """Initializer function for pool workers. + """Initializer function for pool workers. - Args: - gens: State which should be made available to worker processes. - random_seed: An optional value with which to seed child processes. - id_queue: A multiprocessing Queue of worker ids. This is used to indicate - that a worker process was created by Keras and can be terminated using - the cleanup_all_keras_forkpools utility. - """ - global _SHARED_SEQUENCES - _SHARED_SEQUENCES = gens + Args: + gens: State which should be made available to worker processes. + random_seed: An optional value with which to seed child processes. + id_queue: A multiprocessing Queue of worker ids. This is used to indicate + that a worker process was created by Keras and can be terminated using + the cleanup_all_keras_forkpools utility. + """ + global _SHARED_SEQUENCES + _SHARED_SEQUENCES = gens - worker_proc = multiprocessing.current_process() + worker_proc = multiprocessing.current_process() - # name isn't used for anything, but setting a more descriptive name is helpful - # when diagnosing orphaned processes. - worker_proc.name = 'Keras_worker_{}'.format(worker_proc.name) + # name isn't used for anything, but setting a more descriptive name is + # helpful when diagnosing orphaned processes. + worker_proc.name = f"Keras_worker_{worker_proc.name}" - if random_seed is not None: - np.random.seed(random_seed + worker_proc.ident) + if random_seed is not None: + np.random.seed(random_seed + worker_proc.ident) - if id_queue is not None: - # If a worker dies during init, the pool will just create a replacement. - id_queue.put(worker_proc.ident, block=True, timeout=0.1) + if id_queue is not None: + # If a worker dies during init, the pool will just create a replacement. + id_queue.put(worker_proc.ident, block=True, timeout=0.1) def next_sample(uid): - """Gets the next value from the generator `uid`. + """Gets the next value from the generator `uid`. - To allow multiple generators to be used at the same time, we use `uid` to - get a specific one. A single generator would cause the validation to - overwrite the training generator. + To allow multiple generators to be used at the same time, we use `uid` to + get a specific one. A single generator would cause the validation to + overwrite the training generator. - Args: - uid: int, generator identifier + Args: + uid: int, generator identifier - Returns: - The next value of generator `uid`. - """ - return next(_SHARED_SEQUENCES[uid]) + Returns: + The next value of generator `uid`. + """ + return next(_SHARED_SEQUENCES[uid]) -@keras_export('keras.utils.GeneratorEnqueuer') +@keras_export("keras.utils.GeneratorEnqueuer") class GeneratorEnqueuer(SequenceEnqueuer): - """Builds a queue out of a data generator. - - The provided generator can be finite in which case the class will throw - a `StopIteration` exception. + """Builds a queue out of a data generator. - Args: - generator: a generator function which yields data - use_multiprocessing: use multiprocessing if True, otherwise threading - random_seed: Initial seed for workers, - will be incremented by one for each worker. 
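
`GeneratorEnqueuer` is the generator-shaped counterpart; with a single thread worker no extra thread-safety wrapper is needed, and `random_seed` only matters for process workers. A minimal sketch with an infinite toy generator (which sidesteps the finite-generator `StopIteration` drain path shown below):

```python
import tensorflow as tf

def sample_gen():
    # Infinite generator: the enqueuer keeps requesting the next value
    # until stop() is called.
    i = 0
    while True:
        yield i
        i += 1

enq = tf.keras.utils.GeneratorEnqueuer(sample_gen(), use_multiprocessing=False)
enq.start(workers=1, max_queue_size=4)
stream = enq.get()
print([next(stream) for _ in range(5)])  # [0, 1, 2, 3, 4]
enq.stop()
```
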
- """ + The provided generator can be finite in which case the class will throw + a `StopIteration` exception. - def __init__(self, generator, - use_multiprocessing=False, - random_seed=None): - super().__init__(generator, use_multiprocessing) - self.random_seed = random_seed + Args: + generator: a generator function which yields data + use_multiprocessing: use multiprocessing if True, otherwise threading + random_seed: Initial seed for workers, + will be incremented by one for each worker. + """ - def _get_executor_init(self, workers): - """Gets the Pool initializer for multiprocessing. + def __init__(self, generator, use_multiprocessing=False, random_seed=None): + super().__init__(generator, use_multiprocessing) + self.random_seed = random_seed + + def _get_executor_init(self, workers): + """Gets the Pool initializer for multiprocessing. + + Args: + workers: Number of works. + + Returns: + A Function to initialize the pool + """ + + def pool_fn(seqs): + pool = get_pool_class(True)( + workers, + initializer=init_pool_generator, + initargs=(seqs, self.random_seed, get_worker_id_queue()), + ) + _DATA_POOLS.add(pool) + return pool + + return pool_fn + + def _run(self): + """Submits request to the executor and queue the `Future` objects.""" + self._send_sequence() # Share the initial generator + with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor: + while True: + if self.stop_signal.is_set(): + return + + self.queue.put( + executor.apply_async(next_sample, (self.uid,)), block=True + ) + + def get(self): + """Creates a generator to extract data from the queue. + + Skip the data if it is `None`. + + Yields: + The next element in the queue, i.e. a tuple + `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + """ + try: + while self.is_running(): + inputs = self.queue.get(block=True).get() + self.queue.task_done() + if inputs is not None: + yield inputs + except StopIteration: + # Special case for finite generators + last_ones = [] + while self.queue.qsize() > 0: + last_ones.append(self.queue.get(block=True)) + # Wait for them to complete + for f in last_ones: + f.wait() + # Keep the good ones + last_ones = [ + future.get() for future in last_ones if future.successful() + ] + for inputs in last_ones: + if inputs is not None: + yield inputs + except Exception as e: + self.stop() + if "generator already executing" in str(e): + raise RuntimeError( + "Your generator is NOT thread-safe. " + "Keras requires a thread-safe generator when " + "`use_multiprocessing=False, workers > 1`. " + ) + raise e + + +@keras_export( + "keras.utils.pad_sequences", "keras.preprocessing.sequence.pad_sequences" +) +def pad_sequences( + sequences, + maxlen=None, + dtype="int32", + padding="pre", + truncating="pre", + value=0.0, +): + """Pads sequences to the same length. + + This function transforms a list (of length `num_samples`) + of sequences (lists of integers) + into a 2D Numpy array of shape `(num_samples, num_timesteps)`. + `num_timesteps` is either the `maxlen` argument if provided, + or the length of the longest sequence in the list. + + Sequences that are shorter than `num_timesteps` + are padded with `value` until they are `num_timesteps` long. + + Sequences longer than `num_timesteps` are truncated + so that they fit the desired length. + + The position where padding or truncation happens is determined by + the arguments `padding` and `truncating`, respectively. + Pre-padding or removing values from the beginning of the sequence is the + default. 
+ + >>> sequence = [[1], [2, 3], [4, 5, 6]] + >>> tf.keras.utils.pad_sequences(sequence) + array([[0, 0, 1], + [0, 2, 3], + [4, 5, 6]], dtype=int32) + + >>> tf.keras.utils.pad_sequences(sequence, value=-1) + array([[-1, -1, 1], + [-1, 2, 3], + [ 4, 5, 6]], dtype=int32) + + >>> tf.keras.utils.pad_sequences(sequence, padding='post') + array([[1, 0, 0], + [2, 3, 0], + [4, 5, 6]], dtype=int32) + + >>> tf.keras.utils.pad_sequences(sequence, maxlen=2) + array([[0, 1], + [2, 3], + [5, 6]], dtype=int32) Args: - workers: Number of works. + sequences: List of sequences (each sequence is a list of integers). + maxlen: Optional Int, maximum length of all sequences. If not provided, + sequences will be padded to the length of the longest individual + sequence. + dtype: (Optional). Type of the output sequences. + To pad sequences with variable length strings, you can use `object`. + Defaults to `"int32"`. + padding: String, "pre" or "post" (optional): + pad either before or after each sequence. Defaults to `"pre"`. + truncating: String, "pre" or "post" (optional): + remove values from sequences larger than + `maxlen`, either at the beginning or at the end of the sequences. + Defaults to `"pre"`. + value: Float or String, padding value. (Optional). Defaults to `0.`. Returns: - A Function to initialize the pool - """ - def pool_fn(seqs): - pool = get_pool_class(True)( - workers, initializer=init_pool_generator, - initargs=(seqs, self.random_seed, get_worker_id_queue())) - _DATA_POOLS.add(pool) - return pool - return pool_fn - - def _run(self): - """Submits request to the executor and queue the `Future` objects.""" - self._send_sequence() # Share the initial generator - with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor: - while True: - if self.stop_signal.is_set(): - return - - self.queue.put( - executor.apply_async(next_sample, (self.uid,)), block=True) - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. + Numpy array with shape `(len(sequences), maxlen)` - Yields: - The next element in the queue, i.e. a tuple - `(inputs, targets)` or - `(inputs, targets, sample_weights)`. + Raises: + ValueError: In case of invalid values for `truncating` or `padding`, + or in case of invalid shape for a `sequences` entry. """ - try: - while self.is_running(): - inputs = self.queue.get(block=True).get() - self.queue.task_done() - if inputs is not None: - yield inputs - except StopIteration: - # Special case for finite generators - last_ones = [] - while self.queue.qsize() > 0: - last_ones.append(self.queue.get(block=True)) - # Wait for them to complete - for f in last_ones: - f.wait() - # Keep the good ones - last_ones = [future.get() for future in last_ones if future.successful()] - for inputs in last_ones: - if inputs is not None: - yield inputs - except Exception as e: # pylint: disable=broad-except - self.stop() - if 'generator already executing' in str(e): - raise RuntimeError( - 'Your generator is NOT thread-safe. ' - 'Keras requires a thread-safe generator when ' - '`use_multiprocessing=False, workers > 1`. ') - raise e - - -@keras_export('keras.utils.pad_sequences', - 'keras.preprocessing.sequence.pad_sequences') -def pad_sequences(sequences, maxlen=None, dtype='int32', - padding='pre', truncating='pre', value=0.): - """Pads sequences to the same length. - - This function transforms a list (of length `num_samples`) - of sequences (lists of integers) - into a 2D Numpy array of shape `(num_samples, num_timesteps)`. 
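
The doctests above all use the defaults; `maxlen`, `truncating` and `padding` compose, and strings need `dtype=object` as the Args note says. Two further examples, assuming a TF build that already re-exports `keras.utils.pad_sequences`:

```python
import tensorflow as tf

seqs = [[1, 2, 3, 4, 5], [6, 7]]
# Keep the END of long sequences and pad short ones at the end.
print(tf.keras.utils.pad_sequences(
    seqs, maxlen=3, truncating="pre", padding="post"))
# [[3 4 5]
#  [6 7 0]]

# Variable-length strings need dtype=object, per the Args note above.
print(tf.keras.utils.pad_sequences(
    [["a"], ["b", "c"]], value="", dtype=object))
# [['' 'a']
#  ['b' 'c']]
```
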
- `num_timesteps` is either the `maxlen` argument if provided, - or the length of the longest sequence in the list. - - Sequences that are shorter than `num_timesteps` - are padded with `value` until they are `num_timesteps` long. - - Sequences longer than `num_timesteps` are truncated - so that they fit the desired length. - - The position where padding or truncation happens is determined by - the arguments `padding` and `truncating`, respectively. - Pre-padding or removing values from the beginning of the sequence is the - default. - - >>> sequence = [[1], [2, 3], [4, 5, 6]] - >>> tf.keras.preprocessing.sequence.pad_sequences(sequence) - array([[0, 0, 1], - [0, 2, 3], - [4, 5, 6]], dtype=int32) - - >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1) - array([[-1, -1, 1], - [-1, 2, 3], - [ 4, 5, 6]], dtype=int32) - - >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post') - array([[1, 0, 0], - [2, 3, 0], - [4, 5, 6]], dtype=int32) - - >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2) - array([[0, 1], - [2, 3], - [5, 6]], dtype=int32) - - Args: - sequences: List of sequences (each sequence is a list of integers). - maxlen: Optional Int, maximum length of all sequences. If not provided, - sequences will be padded to the length of the longest individual - sequence. - dtype: (Optional, defaults to `"int32"`). Type of the output sequences. - To pad sequences with variable length strings, you can use `object`. - padding: String, "pre" or "post" (optional, defaults to `"pre"`): - pad either before or after each sequence. - truncating: String, "pre" or "post" (optional, defaults to `"pre"`): - remove values from sequences larger than - `maxlen`, either at the beginning or at the end of the sequences. - value: Float or String, padding value. (Optional, defaults to 0.) - - Returns: - Numpy array with shape `(len(sequences), maxlen)` - - Raises: - ValueError: In case of invalid values for `truncating` or `padding`, - or in case of invalid shape for a `sequences` entry. - """ - if not hasattr(sequences, '__len__'): - raise ValueError('`sequences` must be iterable.') - num_samples = len(sequences) - - lengths = [] - sample_shape = () - flag = True - - # take the sample shape from the first non empty sequence - # checking for consistency in the main loop below. - - for x in sequences: - try: - lengths.append(len(x)) - if flag and len(x): - sample_shape = np.asarray(x).shape[1:] - flag = False - except TypeError as e: - raise ValueError('`sequences` must be a list of iterables. 
' - f'Found non-iterable: {str(x)}') from e - - if maxlen is None: - maxlen = np.max(lengths) - - is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype( - dtype, np.unicode_) - if isinstance(value, str) and dtype != object and not is_dtype_str: - raise ValueError( - f'`dtype` {dtype} is not compatible with `value`\'s type: ' - f'{type(value)}\nYou should set `dtype=object` for variable length ' - 'strings.') - - x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype) - for idx, s in enumerate(sequences): - if not len(s): # pylint: disable=g-explicit-length-test - continue # empty list/array was found - if truncating == 'pre': - trunc = s[-maxlen:] # pylint: disable=invalid-unary-operand-type - elif truncating == 'post': - trunc = s[:maxlen] - else: - raise ValueError(f'Truncating type "{truncating}" not understood') - - # check `trunc` has expected shape - trunc = np.asarray(trunc, dtype=dtype) - if trunc.shape[1:] != sample_shape: - raise ValueError(f'Shape of sample {trunc.shape[1:]} of sequence at ' - f'position {idx} is different from expected shape ' - f'{sample_shape}') - - if padding == 'post': - x[idx, :len(trunc)] = trunc - elif padding == 'pre': - x[idx, -len(trunc):] = trunc - else: - raise ValueError(f'Padding type "{padding}" not understood') - return x + if not hasattr(sequences, "__len__"): + raise ValueError("`sequences` must be iterable.") + num_samples = len(sequences) + + lengths = [] + sample_shape = () + flag = True + + # take the sample shape from the first non empty sequence + # checking for consistency in the main loop below. + + for x in sequences: + try: + lengths.append(len(x)) + if flag and len(x): + sample_shape = np.asarray(x).shape[1:] + flag = False + except TypeError as e: + raise ValueError( + "`sequences` must be a list of iterables. " + f"Found non-iterable: {str(x)}" + ) from e + + if maxlen is None: + maxlen = np.max(lengths) + + is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype( + dtype, np.unicode_ + ) + if isinstance(value, str) and dtype != object and not is_dtype_str: + raise ValueError( + f"`dtype` {dtype} is not compatible with `value`'s type: " + f"{type(value)}\nYou should set `dtype=object` for variable length " + "strings." 
+ ) + + x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype) + for idx, s in enumerate(sequences): + if not len(s): + continue # empty list/array was found + if truncating == "pre": + trunc = s[-maxlen:] + elif truncating == "post": + trunc = s[:maxlen] + else: + raise ValueError(f'Truncating type "{truncating}" not understood') + + # check `trunc` has expected shape + trunc = np.asarray(trunc, dtype=dtype) + if trunc.shape[1:] != sample_shape: + raise ValueError( + f"Shape of sample {trunc.shape[1:]} of sequence at " + f"position {idx} is different from expected shape " + f"{sample_shape}" + ) + + if padding == "post": + x[idx, : len(trunc)] = trunc + elif padding == "pre": + x[idx, -len(trunc) :] = trunc + else: + raise ValueError(f'Padding type "{padding}" not understood') + return x diff --git a/keras/utils/data_utils_test.py b/keras/utils/data_utils_test.py index 7374311a7437..093281cda85c 100644 --- a/keras/utils/data_utils_test.py +++ b/keras/utils/data_utils_test.py @@ -14,417 +14,503 @@ # ============================================================================== """Tests for data_utils.""" -import tensorflow.compat.v2 as tf - -from itertools import cycle import os import tarfile import urllib import zipfile +from itertools import cycle import numpy as np +import tensorflow.compat.v2 as tf import keras from keras.utils import data_utils class TestGetFile(tf.test.TestCase): - - def test_get_file_and_validate_it(self): - """Tests get_file from a url, plus extraction and validation. - """ - dest_dir = self.get_temp_dir() - orig_dir = self.get_temp_dir() - - text_file_path = os.path.join(orig_dir, 'test.txt') - zip_file_path = os.path.join(orig_dir, 'test.zip') - tar_file_path = os.path.join(orig_dir, 'test.tar.gz') - - with open(text_file_path, 'w') as text_file: - text_file.write('Float like a butterfly, sting like a bee.') - - with tarfile.open(tar_file_path, 'w:gz') as tar_file: - tar_file.add(text_file_path) - - with zipfile.ZipFile(zip_file_path, 'w') as zip_file: - zip_file.write(text_file_path) - - origin = urllib.parse.urljoin( - 'file://', urllib.request.pathname2url(os.path.abspath(tar_file_path))) - - path = keras.utils.data_utils.get_file('test.txt', origin, - untar=True, cache_subdir=dest_dir) - filepath = path + '.tar.gz' - hashval_sha256 = keras.utils.data_utils._hash_file(filepath) - hashval_md5 = keras.utils.data_utils._hash_file(filepath, algorithm='md5') - path = keras.utils.data_utils.get_file( - 'test.txt', origin, md5_hash=hashval_md5, - untar=True, cache_subdir=dest_dir) - path = keras.utils.data_utils.get_file( - filepath, origin, file_hash=hashval_sha256, - extract=True, cache_subdir=dest_dir) - self.assertTrue(os.path.exists(filepath)) - self.assertTrue(keras.utils.data_utils.validate_file(filepath, - hashval_sha256)) - self.assertTrue(keras.utils.data_utils.validate_file(filepath, hashval_md5)) - os.remove(filepath) - - origin = urllib.parse.urljoin( - 'file://', urllib.request.pathname2url(os.path.abspath(zip_file_path))) - - hashval_sha256 = keras.utils.data_utils._hash_file(zip_file_path) - hashval_md5 = keras.utils.data_utils._hash_file(zip_file_path, - algorithm='md5') - path = keras.utils.data_utils.get_file( - 'test', origin, md5_hash=hashval_md5, - extract=True, cache_subdir=dest_dir) - path = keras.utils.data_utils.get_file( - 'test', origin, file_hash=hashval_sha256, - extract=True, cache_subdir=dest_dir) - self.assertTrue(os.path.exists(path)) - self.assertTrue(keras.utils.data_utils.validate_file(path, hashval_sha256)) - 
self.assertTrue(keras.utils.data_utils.validate_file(path, hashval_md5)) - os.remove(path) - - for file_path, extract in [(text_file_path, False), (tar_file_path, True), - (zip_file_path, True)]: - origin = urllib.parse.urljoin( - 'file://', urllib.request.pathname2url(os.path.abspath(file_path))) - hashval_sha256 = keras.utils.data_utils._hash_file(file_path) - path = keras.utils.data_utils.get_file( - origin=origin, - file_hash=hashval_sha256, - extract=extract, - cache_subdir=dest_dir) - self.assertTrue(os.path.exists(path)) - self.assertTrue( - keras.utils.data_utils.validate_file(path, hashval_sha256)) - os.remove(path) - - with self.assertRaisesRegexp(ValueError, 'Please specify the "origin".*'): - _ = keras.utils.data_utils.get_file() - - def test_get_file_with_tgz_extension(self): - """Tests get_file from a url, plus extraction and validation.""" - dest_dir = self.get_temp_dir() - orig_dir = self.get_temp_dir() - - text_file_path = os.path.join(orig_dir, 'test.txt') - tar_file_path = os.path.join(orig_dir, 'test.tar.gz') - - with open(text_file_path, 'w') as text_file: - text_file.write('Float like a butterfly, sting like a bee.') - - with tarfile.open(tar_file_path, 'w:gz') as tar_file: - tar_file.add(text_file_path) - - origin = urllib.parse.urljoin( - 'file://', urllib.request.pathname2url(os.path.abspath(tar_file_path))) - - path = keras.utils.data_utils.get_file( - 'test.txt.tar.gz', origin, untar=True, cache_subdir=dest_dir) - self.assertEndsWith(path, '.txt') - self.assertTrue(os.path.exists(path)) - - def test_get_file_with_integrity_check(self): - """Tests get_file with validation before download.""" - orig_dir = self.get_temp_dir() - file_path = os.path.join(orig_dir, 'test.txt') - - with open(file_path, 'w') as text_file: - text_file.write('Float like a butterfly, sting like a bee.') - - hashval = keras.utils.data_utils._hash_file(file_path) - - origin = urllib.parse.urljoin( - 'file://', urllib.request.pathname2url(os.path.abspath(file_path))) - - path = keras.utils.data_utils.get_file( - 'test.txt', origin, file_hash=hashval) - self.assertTrue(os.path.exists(path)) - - def test_get_file_with_failed_integrity_check(self): - """Tests get_file with validation before download.""" - orig_dir = self.get_temp_dir() - file_path = os.path.join(orig_dir, 'test.txt') - - with open(file_path, 'w') as text_file: - text_file.write('Float like a butterfly, sting like a bee.') - - hashval = '0' * 64 - - origin = urllib.parse.urljoin( - 'file://', urllib.request.pathname2url(os.path.abspath(file_path))) - - with self.assertRaisesRegex(ValueError, 'Incomplete or corrupted file.*'): - _ = keras.utils.data_utils.get_file('test.txt', origin, file_hash=hashval) + def test_get_file_and_validate_it(self): + """Tests get_file from a url, plus extraction and validation.""" + dest_dir = self.get_temp_dir() + orig_dir = self.get_temp_dir() + + text_file_path = os.path.join(orig_dir, "test.txt") + zip_file_path = os.path.join(orig_dir, "test.zip") + tar_file_path = os.path.join(orig_dir, "test.tar.gz") + + with open(text_file_path, "w") as text_file: + text_file.write("Float like a butterfly, sting like a bee.") + + with tarfile.open(tar_file_path, "w:gz") as tar_file: + tar_file.add(text_file_path) + + with zipfile.ZipFile(zip_file_path, "w") as zip_file: + zip_file.write(text_file_path) + + origin = urllib.parse.urljoin( + "file://", + urllib.request.pathname2url(os.path.abspath(tar_file_path)), + ) + + path = keras.utils.data_utils.get_file( + "test.txt", origin, untar=True, 
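The download-validate-extract flow these assertions exercise can be sketched as follows (the origin URL and hash below are placeholders, not values from the test):

import keras

# Placeholder origin and hash: `get_file` downloads into the Keras cache,
# checks the file against `file_hash`, and extracts the archive. A hash
# mismatch on a cached file triggers a fresh download.
path = keras.utils.data_utils.get_file(
    "sample.tar.gz",
    origin="https://example.com/sample.tar.gz",  # placeholder URL
    file_hash="<expected-sha256>",  # placeholder hash
    extract=True,
)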
cache_subdir=dest_dir + ) + filepath = path + ".tar.gz" + hashval_sha256 = keras.utils.data_utils._hash_file(filepath) + hashval_md5 = keras.utils.data_utils._hash_file( + filepath, algorithm="md5" + ) + path = keras.utils.data_utils.get_file( + "test.txt", + origin, + md5_hash=hashval_md5, + untar=True, + cache_subdir=dest_dir, + ) + path = keras.utils.data_utils.get_file( + filepath, + origin, + file_hash=hashval_sha256, + extract=True, + cache_subdir=dest_dir, + ) + self.assertTrue(os.path.exists(filepath)) + self.assertTrue( + keras.utils.data_utils.validate_file(filepath, hashval_sha256) + ) + self.assertTrue( + keras.utils.data_utils.validate_file(filepath, hashval_md5) + ) + os.remove(filepath) + + origin = urllib.parse.urljoin( + "file://", + urllib.request.pathname2url(os.path.abspath(zip_file_path)), + ) + + hashval_sha256 = keras.utils.data_utils._hash_file(zip_file_path) + hashval_md5 = keras.utils.data_utils._hash_file( + zip_file_path, algorithm="md5" + ) + path = keras.utils.data_utils.get_file( + "test", + origin, + md5_hash=hashval_md5, + extract=True, + cache_subdir=dest_dir, + ) + path = keras.utils.data_utils.get_file( + "test", + origin, + file_hash=hashval_sha256, + extract=True, + cache_subdir=dest_dir, + ) + self.assertTrue(os.path.exists(path)) + self.assertTrue( + keras.utils.data_utils.validate_file(path, hashval_sha256) + ) + self.assertTrue(keras.utils.data_utils.validate_file(path, hashval_md5)) + os.remove(path) + + for file_path, extract in [ + (text_file_path, False), + (tar_file_path, True), + (zip_file_path, True), + ]: + origin = urllib.parse.urljoin( + "file://", + urllib.request.pathname2url(os.path.abspath(file_path)), + ) + hashval_sha256 = keras.utils.data_utils._hash_file(file_path) + path = keras.utils.data_utils.get_file( + origin=origin, + file_hash=hashval_sha256, + extract=extract, + cache_subdir=dest_dir, + ) + self.assertTrue(os.path.exists(path)) + self.assertTrue( + keras.utils.data_utils.validate_file(path, hashval_sha256) + ) + os.remove(path) + + with self.assertRaisesRegexp( + ValueError, 'Please specify the "origin".*' + ): + _ = keras.utils.data_utils.get_file() + + def test_get_file_with_tgz_extension(self): + """Tests get_file from a url, plus extraction and validation.""" + dest_dir = self.get_temp_dir() + orig_dir = self.get_temp_dir() + + text_file_path = os.path.join(orig_dir, "test.txt") + tar_file_path = os.path.join(orig_dir, "test.tar.gz") + + with open(text_file_path, "w") as text_file: + text_file.write("Float like a butterfly, sting like a bee.") + + with tarfile.open(tar_file_path, "w:gz") as tar_file: + tar_file.add(text_file_path) + + origin = urllib.parse.urljoin( + "file://", + urllib.request.pathname2url(os.path.abspath(tar_file_path)), + ) + + path = keras.utils.data_utils.get_file( + "test.txt.tar.gz", origin, untar=True, cache_subdir=dest_dir + ) + self.assertEndsWith(path, ".txt") + self.assertTrue(os.path.exists(path)) + + def test_get_file_with_integrity_check(self): + """Tests get_file with validation before download.""" + orig_dir = self.get_temp_dir() + file_path = os.path.join(orig_dir, "test.txt") + + with open(file_path, "w") as text_file: + text_file.write("Float like a butterfly, sting like a bee.") + + hashval = keras.utils.data_utils._hash_file(file_path) + + origin = urllib.parse.urljoin( + "file://", urllib.request.pathname2url(os.path.abspath(file_path)) + ) + + path = keras.utils.data_utils.get_file( + "test.txt", origin, file_hash=hashval + ) + self.assertTrue(os.path.exists(path)) + + def 
test_get_file_with_failed_integrity_check(self): + """Tests get_file with validation before download.""" + orig_dir = self.get_temp_dir() + file_path = os.path.join(orig_dir, "test.txt") + + with open(file_path, "w") as text_file: + text_file.write("Float like a butterfly, sting like a bee.") + + hashval = "0" * 64 + + origin = urllib.parse.urljoin( + "file://", urllib.request.pathname2url(os.path.abspath(file_path)) + ) + + with self.assertRaisesRegex( + ValueError, "Incomplete or corrupted file.*" + ): + _ = keras.utils.data_utils.get_file( + "test.txt", origin, file_hash=hashval + ) class TestSequence(keras.utils.data_utils.Sequence): + def __init__(self, shape, value=1.0): + self.shape = shape + self.inner = value - def __init__(self, shape, value=1.): - self.shape = shape - self.inner = value - - def __getitem__(self, item): - return np.ones(self.shape, dtype=np.uint32) * item * self.inner + def __getitem__(self, item): + return np.ones(self.shape, dtype=np.uint32) * item * self.inner - def __len__(self): - return 100 + def __len__(self): + return 100 - def on_epoch_end(self): - self.inner *= 5.0 + def on_epoch_end(self): + self.inner *= 5.0 class FaultSequence(keras.utils.data_utils.Sequence): + def __getitem__(self, item): + raise IndexError(item, "item is not present") - def __getitem__(self, item): - raise IndexError(item, 'item is not present') - - def __len__(self): - return 100 + def __len__(self): + return 100 @data_utils.threadsafe_generator def create_generator_from_sequence_threads(ds): - for i in cycle(range(len(ds))): - yield ds[i] + for i in cycle(range(len(ds))): + yield ds[i] def create_generator_from_sequence_pcs(ds): - for i in cycle(range(len(ds))): - yield ds[i] + for i in cycle(range(len(ds))): + yield ds[i] class TestEnqueuers(tf.test.TestCase): - - def test_generator_enqueuer_threads(self): - enqueuer = keras.utils.data_utils.GeneratorEnqueuer( - create_generator_from_sequence_threads(TestSequence([3, 200, 200, 3])), - use_multiprocessing=False) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - acc = [] - for _ in range(100): - acc.append(int(next(gen_output)[0, 0, 0, 0])) - - self.assertEqual(len(set(acc) - set(range(100))), 0) - enqueuer.stop() - - @data_utils.dont_use_multiprocessing_pool - def test_generator_enqueuer_processes(self): - enqueuer = keras.utils.data_utils.GeneratorEnqueuer( - create_generator_from_sequence_threads(TestSequence([3, 200, 200, 3])), - use_multiprocessing=True) - enqueuer.start(4, 10) - gen_output = enqueuer.get() - acc = [] - for _ in range(300): - acc.append(int(next(gen_output)[0, 0, 0, 0])) - self.assertNotEqual(acc, list(range(100))) - enqueuer.stop() - - def test_generator_enqueuer_fail_threads(self): - enqueuer = keras.utils.data_utils.GeneratorEnqueuer( - create_generator_from_sequence_threads(FaultSequence()), - use_multiprocessing=False) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - with self.assertRaises(IndexError): - next(gen_output) - - @data_utils.dont_use_multiprocessing_pool - def test_generator_enqueuer_fail_processes(self): - enqueuer = keras.utils.data_utils.GeneratorEnqueuer( - create_generator_from_sequence_threads(FaultSequence()), - use_multiprocessing=True) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - with self.assertRaises(IndexError): - next(gen_output) - - def test_ordered_enqueuer_threads(self): - enqueuer = keras.utils.data_utils.OrderedEnqueuer( - TestSequence([3, 200, 200, 3]), use_multiprocessing=False) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - acc = [] - for 
_ in range(100): - acc.append(next(gen_output)[0, 0, 0, 0]) - self.assertEqual(acc, list(range(100))) - enqueuer.stop() - - @data_utils.dont_use_multiprocessing_pool - def test_ordered_enqueuer_processes(self): - enqueuer = keras.utils.data_utils.OrderedEnqueuer( - TestSequence([3, 200, 200, 3]), use_multiprocessing=True) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - acc = [] - for _ in range(100): - acc.append(next(gen_output)[0, 0, 0, 0]) - self.assertEqual(acc, list(range(100))) - enqueuer.stop() - - def test_ordered_enqueuer_fail_threads(self): - enqueuer = keras.utils.data_utils.OrderedEnqueuer( - FaultSequence(), use_multiprocessing=False) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - with self.assertRaises(IndexError): - next(gen_output) - - @data_utils.dont_use_multiprocessing_pool - def test_ordered_enqueuer_fail_processes(self): - enqueuer = keras.utils.data_utils.OrderedEnqueuer( - FaultSequence(), use_multiprocessing=True) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - with self.assertRaises(IndexError): - next(gen_output) - - @data_utils.dont_use_multiprocessing_pool - def test_on_epoch_end_processes(self): - enqueuer = keras.utils.data_utils.OrderedEnqueuer( - TestSequence([3, 200, 200, 3]), use_multiprocessing=True) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - acc = [] - for _ in range(200): - acc.append(next(gen_output)[0, 0, 0, 0]) - # Check that order was keep in GeneratorEnqueuer with processes - self.assertEqual(acc[100:], list([k * 5 for k in range(100)])) - enqueuer.stop() - - @data_utils.dont_use_multiprocessing_pool - def test_context_switch(self): - enqueuer = keras.utils.data_utils.OrderedEnqueuer( - TestSequence([3, 200, 200, 3]), use_multiprocessing=True) - enqueuer2 = keras.utils.data_utils.OrderedEnqueuer( - TestSequence([3, 200, 200, 3], value=15), use_multiprocessing=True) - enqueuer.start(3, 10) - enqueuer2.start(3, 10) - gen_output = enqueuer.get() - gen_output2 = enqueuer2.get() - acc = [] - for _ in range(100): - acc.append(next(gen_output)[0, 0, 0, 0]) - self.assertEqual(acc[-1], 99) - # One epoch is completed so enqueuer will switch the Sequence - - acc = [] - self.skipTest('b/145555807 flakily timing out.') - for _ in range(100): - acc.append(next(gen_output2)[0, 0, 0, 0]) - self.assertEqual(acc[-1], 99 * 15) - # One epoch has been completed so enqueuer2 will switch - - # Be sure that both Sequence were updated - self.assertEqual(next(gen_output)[0, 0, 0, 0], 0) - self.assertEqual(next(gen_output)[0, 0, 0, 0], 5) - self.assertEqual(next(gen_output2)[0, 0, 0, 0], 0) - self.assertEqual(next(gen_output2)[0, 0, 0, 0], 15 * 5) - - # Tear down everything - enqueuer.stop() - enqueuer2.stop() - - def test_on_epoch_end_threads(self): - enqueuer = keras.utils.data_utils.OrderedEnqueuer( - TestSequence([3, 200, 200, 3]), use_multiprocessing=False) - enqueuer.start(3, 10) - gen_output = enqueuer.get() - acc = [] - for _ in range(100): - acc.append(next(gen_output)[0, 0, 0, 0]) - acc = [] - for _ in range(100): - acc.append(next(gen_output)[0, 0, 0, 0]) - # Check that order was keep in GeneratorEnqueuer with processes - self.assertEqual(acc, list([k * 5 for k in range(100)])) - enqueuer.stop() + def test_generator_enqueuer_threads(self): + enqueuer = keras.utils.data_utils.GeneratorEnqueuer( + create_generator_from_sequence_threads( + TestSequence([3, 200, 200, 3]) + ), + use_multiprocessing=False, + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + acc = [] + for _ in range(100): + acc.append(int(next(gen_output)[0, 
0, 0, 0])) + + self.assertEqual(len(set(acc) - set(range(100))), 0) + enqueuer.stop() + + @data_utils.dont_use_multiprocessing_pool + def test_generator_enqueuer_processes(self): + enqueuer = keras.utils.data_utils.GeneratorEnqueuer( + create_generator_from_sequence_threads( + TestSequence([3, 200, 200, 3]) + ), + use_multiprocessing=True, + ) + enqueuer.start(4, 10) + gen_output = enqueuer.get() + acc = [] + for _ in range(300): + acc.append(int(next(gen_output)[0, 0, 0, 0])) + self.assertNotEqual(acc, list(range(100))) + enqueuer.stop() + + def test_generator_enqueuer_fail_threads(self): + enqueuer = keras.utils.data_utils.GeneratorEnqueuer( + create_generator_from_sequence_threads(FaultSequence()), + use_multiprocessing=False, + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + with self.assertRaises(IndexError): + next(gen_output) + + @data_utils.dont_use_multiprocessing_pool + def test_generator_enqueuer_fail_processes(self): + enqueuer = keras.utils.data_utils.GeneratorEnqueuer( + create_generator_from_sequence_threads(FaultSequence()), + use_multiprocessing=True, + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + with self.assertRaises(IndexError): + next(gen_output) + + def test_ordered_enqueuer_threads(self): + enqueuer = keras.utils.data_utils.OrderedEnqueuer( + TestSequence([3, 200, 200, 3]), use_multiprocessing=False + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + acc = [] + for _ in range(100): + acc.append(next(gen_output)[0, 0, 0, 0]) + self.assertEqual(acc, list(range(100))) + enqueuer.stop() + + @data_utils.dont_use_multiprocessing_pool + def test_ordered_enqueuer_processes(self): + enqueuer = keras.utils.data_utils.OrderedEnqueuer( + TestSequence([3, 200, 200, 3]), use_multiprocessing=True + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + acc = [] + for _ in range(100): + acc.append(next(gen_output)[0, 0, 0, 0]) + self.assertEqual(acc, list(range(100))) + enqueuer.stop() + + def test_ordered_enqueuer_fail_threads(self): + enqueuer = keras.utils.data_utils.OrderedEnqueuer( + FaultSequence(), use_multiprocessing=False + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + with self.assertRaises(IndexError): + next(gen_output) + + @data_utils.dont_use_multiprocessing_pool + def test_ordered_enqueuer_fail_processes(self): + enqueuer = keras.utils.data_utils.OrderedEnqueuer( + FaultSequence(), use_multiprocessing=True + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + with self.assertRaises(IndexError): + next(gen_output) + + @data_utils.dont_use_multiprocessing_pool + def test_on_epoch_end_processes(self): + enqueuer = keras.utils.data_utils.OrderedEnqueuer( + TestSequence([3, 200, 200, 3]), use_multiprocessing=True + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + acc = [] + for _ in range(200): + acc.append(next(gen_output)[0, 0, 0, 0]) + # Check that order was keep in GeneratorEnqueuer with processes + self.assertEqual(acc[100:], list([k * 5 for k in range(100)])) + enqueuer.stop() + + @data_utils.dont_use_multiprocessing_pool + def test_context_switch(self): + enqueuer = keras.utils.data_utils.OrderedEnqueuer( + TestSequence([3, 200, 200, 3]), use_multiprocessing=True + ) + enqueuer2 = keras.utils.data_utils.OrderedEnqueuer( + TestSequence([3, 200, 200, 3], value=15), use_multiprocessing=True + ) + enqueuer.start(3, 10) + enqueuer2.start(3, 10) + gen_output = enqueuer.get() + gen_output2 = enqueuer2.get() + acc = [] + for _ in range(100): + acc.append(next(gen_output)[0, 0, 0, 0]) + self.assertEqual(acc[-1], 
99) + # One epoch is completed so enqueuer will switch the Sequence + + acc = [] + self.skipTest("b/145555807 flakily timing out.") + for _ in range(100): + acc.append(next(gen_output2)[0, 0, 0, 0]) + self.assertEqual(acc[-1], 99 * 15) + # One epoch has been completed so enqueuer2 will switch + + # Be sure that both Sequence were updated + self.assertEqual(next(gen_output)[0, 0, 0, 0], 0) + self.assertEqual(next(gen_output)[0, 0, 0, 0], 5) + self.assertEqual(next(gen_output2)[0, 0, 0, 0], 0) + self.assertEqual(next(gen_output2)[0, 0, 0, 0], 15 * 5) + + # Tear down everything + enqueuer.stop() + enqueuer2.stop() + + def test_on_epoch_end_threads(self): + enqueuer = keras.utils.data_utils.OrderedEnqueuer( + TestSequence([3, 200, 200, 3]), use_multiprocessing=False + ) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + acc = [] + for _ in range(100): + acc.append(next(gen_output)[0, 0, 0, 0]) + acc = [] + for _ in range(100): + acc.append(next(gen_output)[0, 0, 0, 0]) + # Check that order was keep in GeneratorEnqueuer with processes + self.assertEqual(acc, list([k * 5 for k in range(100)])) + enqueuer.stop() class PadSequencesTest(tf.test.TestCase): - - def test_pad_sequences(self): - a = [[1], [1, 2], [1, 2, 3]] - - # test padding - b = data_utils.pad_sequences(a, maxlen=3, padding='pre') - self.assertAllClose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]]) - b = data_utils.pad_sequences(a, maxlen=3, padding='post') - self.assertAllClose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]]) - - # test truncating - b = data_utils.pad_sequences(a, maxlen=2, truncating='pre') - self.assertAllClose(b, [[0, 1], [1, 2], [2, 3]]) - b = data_utils.pad_sequences(a, maxlen=2, truncating='post') - self.assertAllClose(b, [[0, 1], [1, 2], [1, 2]]) - - # test value - b = data_utils.pad_sequences(a, maxlen=3, value=1) - self.assertAllClose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]]) - - def test_pad_sequences_str(self): - a = [['1'], ['1', '2'], ['1', '2', '3']] - - # test padding - b = data_utils.pad_sequences( - a, maxlen=3, padding='pre', value='pad', dtype=object) - self.assertAllEqual( - b, [['pad', 'pad', '1'], ['pad', '1', '2'], ['1', '2', '3']]) - b = data_utils.pad_sequences( - a, maxlen=3, padding='post', value='pad', dtype='>> data = np.random.random(size=(1000, 4)) + >>> left_ds, right_ds = tf.keras.utils.split_dataset(data, left_size=0.8) + >>> int(left_ds.cardinality()) + 800 + >>> int(right_ds.cardinality()) + 200 + + """ + dataset_type_spec = _get_type_spec(dataset) + + if dataset_type_spec not in [tf.data.Dataset, list, tuple, np.ndarray]: + raise TypeError( + "The `dataset` argument must be either a `tf.data.Dataset` " + "object or a list/tuple of arrays. " + f"Received: dataset={dataset} of type {type(dataset)}" + ) + + if right_size is None and left_size is None: + raise ValueError( + "At least one of the `left_size` or `right_size` " + "must be specified. 
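Beyond the doctest above, the tuple-of-arrays path accepts `(features, labels)` pairs directly; a minimal sketch (shapes and seed are illustrative):

import numpy as np
import tensorflow as tf

# Both arrays must share their first dimension; left_size=0.8 keeps 80%
# of the samples in the first returned dataset.
features = np.random.random((100, 4))
labels = np.arange(100)
train_ds, val_ds = tf.keras.utils.split_dataset(
    (features, labels), left_size=0.8, shuffle=True, seed=42
)
print(int(train_ds.cardinality()), int(val_ds.cardinality()))  # 80 20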
Received: left_size=None and " + "right_size=None" + ) + + dataset_as_list = _convert_dataset_to_list(dataset, dataset_type_spec) + + if shuffle: + if seed is None: + seed = random.randint(0, int(1e6)) + random.seed(seed) + random.shuffle(dataset_as_list) + + total_length = len(dataset_as_list) + + left_size, right_size = _rescale_dataset_split_sizes( + left_size, right_size, total_length + ) + left_split = list(dataset_as_list[:left_size]) + right_split = list(dataset_as_list[-right_size:]) + + left_split = _restore_dataset_from_list( + left_split, dataset_type_spec, dataset + ) + right_split = _restore_dataset_from_list( + right_split, dataset_type_spec, dataset + ) + + left_split = tf.data.Dataset.from_tensor_slices(left_split) + right_split = tf.data.Dataset.from_tensor_slices(right_split) + + # apply batching to the splits if the dataset is batched + if dataset_type_spec is tf.data.Dataset and is_batched(dataset): + batch_size = get_batch_size(dataset) + if batch_size is not None: + left_split = left_split.batch(batch_size) + right_split = right_split.batch(batch_size) + + left_split = left_split.prefetch(tf.data.AUTOTUNE) + right_split = right_split.prefetch(tf.data.AUTOTUNE) + + return left_split, right_split + + +def _convert_dataset_to_list( + dataset, + dataset_type_spec, + data_size_warning_flag=True, + ensure_shape_similarity=True, +): + """Convert `tf.data.Dataset` object or list/tuple of NumPy arrays to a list. + + Args: + dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset_type_spec : the type of the dataset + data_size_warning_flag (bool, optional): If set to True, a warning will + be issued if the dataset takes longer than 10 seconds to iterate. + Defaults to `True`. + ensure_shape_similarity (bool, optional): If set to True, the shape of + the first sample will be used to validate the shape of rest of the + samples. Defaults to `True`. + + Returns: + List: A list of tuples/NumPy arrays. + """ + dataset_iterator = _get_data_iterator_from_dataset( + dataset, dataset_type_spec + ) + dataset_as_list = [] + + start_time = time.time() + for sample in _get_next_sample( + dataset_iterator, + ensure_shape_similarity, + data_size_warning_flag, + start_time, + ): + if dataset_type_spec in [tuple, list]: + # The try-except here is for NumPy 1.24 compatibility, see: + # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html + try: + arr = np.array(sample) + except ValueError: + arr = np.array(sample, dtype=object) + dataset_as_list.append(arr) + else: + dataset_as_list.append(sample) + + return dataset_as_list def _get_data_iterator_from_dataset(dataset, dataset_type_spec): - """Get the iterator from a dataset. - - Args: - dataset : A `tf.data.Dataset` object or a list/tuple of arrays. - dataset_type_spec : the type of the dataset - - Raises: - ValueError: - - If the dataset is empty. - - If the dataset is not a `tf.data.Dataset` object - or a list/tuple of arrays. - - If the dataset is a list/tuple of arrays and the - length of the list/tuple is not equal to the number - - Returns: - iterator: An `iterator` object. - """ - if dataset_type_spec == list: - if len(dataset) == 0: - raise ValueError('Received an empty list dataset. ' - 'Please provide a non-empty list of arrays.') - - if _get_type_spec(dataset[0]) is np.ndarray: - expected_shape = dataset[0].shape - for i, element in enumerate(dataset): - if np.array(element).shape[0] != expected_shape[0]: - raise ValueError('Received a list of NumPy arrays with different ' - f'lengths. 
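The list/tuple branches here boil down to pairing the input arrays sample-wise; a few lines make that concrete (the arrays are illustrative):

import numpy as np

# `iter(zip(*dataset))` yields one (x_i, y_i) pair per sample, which is
# why every array must have the same length along axis 0.
x = np.zeros((4, 2))
y = np.ones((4,))
samples = list(zip(x, y))
assert len(samples) == 4 and samples[0][0].shape == (2,)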
Mismatch found at index {i}, ' - f'Expected shape={expected_shape} ' - f'Received shape={np.array(element).shape}.' - f'Please provide a list of NumPy arrays with ' - f'the same length.') - else: - raise ValueError('Expected a list of `numpy.ndarray` objects,' - f'Received: {type(dataset[0])}') - - return iter(zip(*dataset)) - elif dataset_type_spec == tuple: - if len(dataset) == 0: - raise ValueError('Received an empty list dataset.' - 'Please provide a non-empty tuple of arrays.') - - if _get_type_spec(dataset[0]) is np.ndarray: - expected_shape = dataset[0].shape - for i, element in enumerate(dataset): - if np.array(element).shape[0] != expected_shape[0]: - raise ValueError('Received a tuple of NumPy arrays with different ' - f'lengths. Mismatch found at index {i}, ' - f'Expected shape={expected_shape} ' - f'Received shape={np.array(element).shape}.' - f'Please provide a tuple of NumPy arrays with ' - 'the same length.') - else: - raise ValueError('Expected a tuple of `numpy.ndarray` objects, ' - f'Received: {type(dataset[0])}') - - return iter(zip(*dataset)) - elif dataset_type_spec == tf.data.Dataset: - if is_batched(dataset): - dataset = dataset.unbatch() - return iter(dataset) - elif dataset_type_spec == np.ndarray: - return iter(dataset) - - -def _get_next_sample(dataset_iterator, ensure_shape_similarity, - data_size_warning_flag, start_time): - """"Yield data samples from the `dataset_iterator`. - - Args: - dataset_iterator : An `iterator` object. - ensure_shape_similarity (bool, optional): If set to True, the shape of - the first sample will be used to validate the shape of rest of the - samples. Defaults to True. - data_size_warning_flag (bool, optional): If set to True, a warning will be - issued if the dataset takes longer than 10 seconds to iterate. Defaults - to True. - start_time (float): the start time of the dataset iteration. this is used - only if `data_size_warning_flag` is set to true. - - Raises: - ValueError: - If the dataset is empty. - - If `ensure_shape_similarity` is set to True and the - shape of the first sample is not equal to the shape of - atleast one of the rest of the samples. - - Yields: - data_sample: A tuple/list of numpy arrays. - """ - try: - dataset_iterator = iter(dataset_iterator) - first_sample = next(dataset_iterator) - if isinstance(first_sample, (tf.Tensor, np.ndarray)): - first_sample_shape = np.array(first_sample).shape - else: - first_sample_shape = None - ensure_shape_similarity = False - yield first_sample - except StopIteration: - raise ValueError('Received an empty Dataset. `dataset` must ' - 'be a non-empty list/tuple of `numpy.ndarray` objects ' - 'or `tf.data.Dataset` objects.') - - for i, sample in enumerate(dataset_iterator): - if ensure_shape_similarity: - if first_sample_shape != np.array(sample).shape: - raise ValueError('All `dataset` samples must have same shape, ' - f'Expected shape: {np.array(first_sample).shape} ' - f'Received shape: {np.array(sample).shape} at index ' - f'{i}.') - if data_size_warning_flag: - if i % 10 == 0: - cur_time = time.time() - # warns user if the dataset is too large to iterate within 10s - if int(cur_time - start_time) > 10 and data_size_warning_flag: - warnings.warn( - 'The dataset is taking longer than 10 seconds to ' - 'iterate over. This may be due to the size of the dataset. ' - 'Keep in mind that the `split_dataset` utility is only for ' - 'small in-memory dataset (e.g. 
< 10,000 samples).', - category=ResourceWarning, - source='split_dataset') - data_size_warning_flag = False - yield sample - - -def _restore_dataset_from_list(dataset_as_list, dataset_type_spec, - original_dataset): - """Restore the dataset from the list of arrays.""" - if dataset_type_spec in [tuple, list]: - return tuple(np.array(sample) for sample in zip(*dataset_as_list)) - elif dataset_type_spec == tf.data.Dataset: - if isinstance(original_dataset.element_spec, dict): - restored_dataset = {} - for d in dataset_as_list: - for k, v in d.items(): - if k not in restored_dataset: - restored_dataset[k] = [v] - else: - restored_dataset[k].append(v) - return restored_dataset - else: - return tuple(np.array(sample) for sample in zip(*dataset_as_list)) - return dataset_as_list + """Get the iterator from a dataset. + + Args: + dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset_type_spec : the type of the dataset + + Raises: + ValueError: + - If the dataset is empty. + - If the dataset is not a `tf.data.Dataset` object + or a list/tuple of arrays. + - If the dataset is a list/tuple of arrays and the + length of the list/tuple is not equal to the number + + Returns: + iterator: An `iterator` object. + """ + if dataset_type_spec == list: + if len(dataset) == 0: + raise ValueError( + "Received an empty list dataset. " + "Please provide a non-empty list of arrays." + ) + + if _get_type_spec(dataset[0]) is np.ndarray: + expected_shape = dataset[0].shape + for i, element in enumerate(dataset): + if np.array(element).shape[0] != expected_shape[0]: + raise ValueError( + "Received a list of NumPy arrays with different " + f"lengths. Mismatch found at index {i}, " + f"Expected shape={expected_shape} " + f"Received shape={np.array(element).shape}." + "Please provide a list of NumPy arrays with " + "the same length." + ) + else: + raise ValueError( + "Expected a list of `numpy.ndarray` objects," + f"Received: {type(dataset[0])}" + ) + + return iter(zip(*dataset)) + elif dataset_type_spec == tuple: + if len(dataset) == 0: + raise ValueError( + "Received an empty list dataset." + "Please provide a non-empty tuple of arrays." + ) + + if _get_type_spec(dataset[0]) is np.ndarray: + expected_shape = dataset[0].shape + for i, element in enumerate(dataset): + if np.array(element).shape[0] != expected_shape[0]: + raise ValueError( + "Received a tuple of NumPy arrays with different " + f"lengths. Mismatch found at index {i}, " + f"Expected shape={expected_shape} " + f"Received shape={np.array(element).shape}." + "Please provide a tuple of NumPy arrays with " + "the same length." + ) + else: + raise ValueError( + "Expected a tuple of `numpy.ndarray` objects, " + f"Received: {type(dataset[0])}" + ) + + return iter(zip(*dataset)) + elif dataset_type_spec == tf.data.Dataset: + if is_batched(dataset): + dataset = dataset.unbatch() + return iter(dataset) + elif dataset_type_spec == np.ndarray: + return iter(dataset) + + +def _get_next_sample( + dataset_iterator, + ensure_shape_similarity, + data_size_warning_flag, + start_time, +): + """ "Yield data samples from the `dataset_iterator`. + + Args: + dataset_iterator : An `iterator` object. + ensure_shape_similarity (bool, optional): If set to True, the shape of + the first sample will be used to validate the shape of rest of the + samples. Defaults to `True`. + data_size_warning_flag (bool, optional): If set to True, a warning will + be issued if the dataset takes longer than 10 seconds to iterate. + Defaults to `True`. 
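For `tf.data.Dataset` inputs, this shape check is what rejects ragged samples; a sketch under the assumption that the dataset yields tensors of varying length:

import numpy as np
import tensorflow as tf

# Samples of shape (2,) and (3,) trip the "must have same shape" error.
ragged = tf.data.Dataset.from_generator(
    lambda: iter([np.ones((2,)), np.ones((3,))]),
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.float64),
)
try:
    tf.keras.utils.split_dataset(ragged, left_size=0.5)
except ValueError as err:
    print(err)  # All `dataset` samples must have same shape, ...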
+ start_time (float): the start time of the dataset iteration. this is + used only if `data_size_warning_flag` is set to true. + + Raises: + ValueError: - If the dataset is empty. + - If `ensure_shape_similarity` is set to True and the + shape of the first sample is not equal to the shape of + atleast one of the rest of the samples. + + Yields: + data_sample: A tuple/list of numpy arrays. + """ + try: + dataset_iterator = iter(dataset_iterator) + first_sample = next(dataset_iterator) + if isinstance(first_sample, (tf.Tensor, np.ndarray)): + first_sample_shape = np.array(first_sample).shape + else: + first_sample_shape = None + ensure_shape_similarity = False + yield first_sample + except StopIteration: + raise ValueError( + "Received an empty Dataset. `dataset` must " + "be a non-empty list/tuple of `numpy.ndarray` objects " + "or `tf.data.Dataset` objects." + ) + + for i, sample in enumerate(dataset_iterator): + if ensure_shape_similarity: + if first_sample_shape != np.array(sample).shape: + raise ValueError( + "All `dataset` samples must have same shape, " + f"Expected shape: {np.array(first_sample).shape} " + f"Received shape: {np.array(sample).shape} at index " + f"{i}." + ) + if data_size_warning_flag: + if i % 10 == 0: + cur_time = time.time() + # warns user if the dataset is too large to iterate within 10s + if int(cur_time - start_time) > 10 and data_size_warning_flag: + warnings.warn( + "The dataset is taking longer than 10 seconds to " + "iterate over. This may be due to the size of the " + "dataset. Keep in mind that the `split_dataset` " + "utility is only for small in-memory dataset " + "(e.g. < 10,000 samples).", + category=ResourceWarning, + source="split_dataset", + ) + data_size_warning_flag = False + yield sample + + +def _restore_dataset_from_list( + dataset_as_list, dataset_type_spec, original_dataset +): + """Restore the dataset from the list of arrays.""" + if dataset_type_spec in [tuple, list]: + return tuple(np.array(sample) for sample in zip(*dataset_as_list)) + elif dataset_type_spec == tf.data.Dataset: + if isinstance(original_dataset.element_spec, dict): + restored_dataset = {} + for d in dataset_as_list: + for k, v in d.items(): + if k not in restored_dataset: + restored_dataset[k] = [v] + else: + restored_dataset[k].append(v) + return restored_dataset + else: + return tuple(np.array(sample) for sample in zip(*dataset_as_list)) + return dataset_as_list def _rescale_dataset_split_sizes(left_size, right_size, total_length): - """Rescale the dataset split sizes. - - We want to ensure that the sum of - the split sizes is equal to the total length of the dataset. - - Args: - left_size : The size of the left dataset split. - right_size : The size of the right dataset split. - total_length : The total length of the dataset. - - Raises: - TypeError: - If `left_size` or `right_size` is not an integer or float. - ValueError: - If `left_size` or `right_size` is negative or greater - than 1 or greater than `total_length`. - - Returns: - tuple: A tuple of rescaled left_size and right_size - """ - left_size_type = type(left_size) - right_size_type = type(right_size) - - # check both left_size and right_size are integers or floats - if ((left_size is not None and left_size_type not in [int, float]) and - (right_size is not None and right_size_type not in [int, float])): - raise TypeError('Invalid `left_size` and `right_size` Types. 
Expected: ' - 'integer or float or None, Received: type(left_size)=' - f'{left_size_type} and type(right_size)={right_size_type}') - - # check left_size is a integer or float - if left_size is not None and left_size_type not in [int, float]: - raise TypeError('Invalid `left_size` Type. Expected: int or float or None, ' - f'Received: type(left_size)={left_size_type}. ') - - # check right_size is a integer or float - if right_size is not None and right_size_type not in [int, float]: - raise TypeError(f'Invalid `right_size` Type. ' - 'Expected: int or float or None,' - f'Received: type(right_size)={right_size_type}.') - - # check left_size and right_size are non-zero - if left_size == 0 and right_size == 0: - raise ValueError('Both `left_size` and `right_size` are zero. ' - 'At least one of the split sizes must be non-zero.') - - # check left_size is non-negative and less than 1 and less than total_length - if (left_size_type == int and (left_size <= 0 or left_size >= total_length) or - left_size_type == float and (left_size <= 0 or left_size >= 1)): - raise ValueError('`left_size` should be either a positive integer ' - f'smaller than {total_length}, or a float ' - 'within the range `[0, 1]`. Received: left_size=' - f'{left_size}') - - # check right_size is non-negative and less than 1 and less than total_length - if (right_size_type == int and - (right_size <= 0 or right_size >= total_length) or - right_size_type == float and (right_size <= 0 or right_size >= 1)): - raise ValueError('`right_size` should be either a positive integer ' - f'and smaller than {total_length} or a float ' - 'within the range `[0, 1]`. Received: right_size=' - f'{right_size}') - - # check sum of left_size and right_size is less than or equal to total_length - if right_size_type == left_size_type == float and right_size + left_size > 1: - raise ValueError('The sum of `left_size` and `right_size` is greater ' - 'than 1. It must be less than or equal to 1.') - - if left_size_type == float: - left_size = round(left_size * total_length) - elif left_size_type == int: - left_size = float(left_size) - - if right_size_type == float: - right_size = round(right_size * total_length) - elif right_size_type == int: - right_size = float(right_size) - - if left_size is None: - left_size = total_length - right_size - elif right_size is None: - right_size = total_length - left_size - - if left_size + right_size > total_length: - raise ValueError( - 'The sum of `left_size` and `right_size` should ' - 'be smaller than the {total_length}. ' - f'Received: left_size + right_size = {left_size+right_size}' - f'and total_length = {total_length}') - - for split, side in [(left_size, 'left'), (right_size, 'right')]: - if split == 0: - raise ValueError(f'With `dataset` of length={total_length}, `left_size`=' - f'{left_size} and `right_size`={right_size}.' - f'Resulting {side} side dataset split will be empty. ' - 'Adjust any of the aforementioned parameters') - - left_size, right_size = int(left_size), int(right_size) - return left_size, right_size + """Rescale the dataset split sizes. + + We want to ensure that the sum of + the split sizes is equal to the total length of the dataset. + + Args: + left_size : The size of the left dataset split. + right_size : The size of the right dataset split. + total_length : The total length of the dataset. + + Raises: + TypeError: - If `left_size` or `right_size` is not an integer or float. + ValueError: - If `left_size` or `right_size` is negative or greater + than 1 or greater than `total_length`. 
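A worked example of this rescaling contract, calling the private helper directly (module path as in this patch; values are illustrative):

from keras.utils import dataset_utils

# With total_length=10: a float size is rounded, an int size is kept,
# and a missing side receives the complement.
assert dataset_utils._rescale_dataset_split_sizes(0.8, None, 10) == (8, 2)
assert dataset_utils._rescale_dataset_split_sizes(None, 3, 10) == (7, 3)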
+ + Returns: + tuple: A tuple of rescaled left_size and right_size + """ + left_size_type = type(left_size) + right_size_type = type(right_size) + + # check both left_size and right_size are integers or floats + if (left_size is not None and left_size_type not in [int, float]) and ( + right_size is not None and right_size_type not in [int, float] + ): + raise TypeError( + "Invalid `left_size` and `right_size` Types. Expected: " + "integer or float or None, Received: type(left_size)=" + f"{left_size_type} and type(right_size)={right_size_type}" + ) + + # check left_size is a integer or float + if left_size is not None and left_size_type not in [int, float]: + raise TypeError( + "Invalid `left_size` Type. Expected: int or float or None, " + f"Received: type(left_size)={left_size_type}. " + ) + + # check right_size is a integer or float + if right_size is not None and right_size_type not in [int, float]: + raise TypeError( + "Invalid `right_size` Type. " + "Expected: int or float or None," + f"Received: type(right_size)={right_size_type}." + ) + + # check left_size and right_size are non-zero + if left_size == 0 and right_size == 0: + raise ValueError( + "Both `left_size` and `right_size` are zero. " + "At least one of the split sizes must be non-zero." + ) + + # check left_size is non-negative and less than 1 and less than total_length + if ( + left_size_type == int + and (left_size <= 0 or left_size >= total_length) + or left_size_type == float + and (left_size <= 0 or left_size >= 1) + ): + raise ValueError( + "`left_size` should be either a positive integer " + f"smaller than {total_length}, or a float " + "within the range `[0, 1]`. Received: left_size=" + f"{left_size}" + ) + + # check right_size is non-negative and less than 1 and less than + # total_length + if ( + right_size_type == int + and (right_size <= 0 or right_size >= total_length) + or right_size_type == float + and (right_size <= 0 or right_size >= 1) + ): + raise ValueError( + "`right_size` should be either a positive integer " + f"and smaller than {total_length} or a float " + "within the range `[0, 1]`. Received: right_size=" + f"{right_size}" + ) + + # check sum of left_size and right_size is less than or equal to + # total_length + if ( + right_size_type == left_size_type == float + and right_size + left_size > 1 + ): + raise ValueError( + "The sum of `left_size` and `right_size` is greater " + "than 1. It must be less than or equal to 1." + ) + + if left_size_type == float: + left_size = round(left_size * total_length) + elif left_size_type == int: + left_size = float(left_size) + + if right_size_type == float: + right_size = round(right_size * total_length) + elif right_size_type == int: + right_size = float(right_size) + + if left_size is None: + left_size = total_length - right_size + elif right_size is None: + right_size = total_length - left_size + + if left_size + right_size > total_length: + raise ValueError( + "The sum of `left_size` and `right_size` should " + "be smaller than the {total_length}. " + f"Received: left_size + right_size = {left_size+right_size}" + f"and total_length = {total_length}" + ) + + for split, side in [(left_size, "left"), (right_size, "right")]: + if split == 0: + raise ValueError( + f"With `dataset` of length={total_length}, `left_size`=" + f"{left_size} and `right_size`={right_size}." + f"Resulting {side} side dataset split will be empty. 
" + "Adjust any of the aforementioned parameters" + ) + + left_size, right_size = int(left_size), int(right_size) + return left_size, right_size def _get_type_spec(dataset): - """Get the type spec of the dataset.""" - if isinstance(dataset, tuple): - return tuple - elif isinstance(dataset, list): - return list - elif isinstance(dataset, np.ndarray): - return np.ndarray - elif isinstance(dataset, dict): - return dict - elif isinstance(dataset, tf.data.Dataset): - return tf.data.Dataset - else: - return None + """Get the type spec of the dataset.""" + if isinstance(dataset, tuple): + return tuple + elif isinstance(dataset, list): + return list + elif isinstance(dataset, np.ndarray): + return np.ndarray + elif isinstance(dataset, dict): + return dict + elif isinstance(dataset, tf.data.Dataset): + return tf.data.Dataset + else: + return None def is_batched(tf_dataset): - """"Check if the `tf.data.Dataset` is batched.""" - try: - return tf_dataset.__class__.__name__ == 'BatchDataset' - except AttributeError: - return False + """ "Check if the `tf.data.Dataset` is batched.""" + return hasattr(tf_dataset, "_batch_size") def get_batch_size(tf_dataset): - """Get the batch size of the dataset.""" - if is_batched(tf_dataset): - return tf_dataset._batch_size # pylint: disable=protected-access - else: - return None - - -def index_directory(directory, - labels, - formats, - class_names=None, - shuffle=True, - seed=None, - follow_links=False): - """Make list of all files in the subdirs of `directory`, with their labels. - - Args: - directory: The target directory (string). - labels: Either "inferred" - (labels are generated from the directory structure), - None (no labels), - or a list/tuple of integer labels of the same size as the number of - valid files found in the directory. Labels should be sorted according - to the alphanumeric order of the image file paths - (obtained via `os.walk(directory)` in Python). - formats: Allowlist of file extensions to index (e.g. ".jpg", ".txt"). - class_names: Only valid if "labels" is "inferred". This is the explicit - list of class names (must match names of subdirectories). Used - to control the order of the classes - (otherwise alphanumerical order is used). - shuffle: Whether to shuffle the data. Default: True. - If set to False, sorts the data in alphanumeric order. - seed: Optional random seed for shuffling. - follow_links: Whether to visits subdirectories pointed to by symlinks. - - Returns: - tuple (file_paths, labels, class_names). - file_paths: list of file paths (strings). - labels: list of matching integer labels (same length as file_paths) - class_names: names of the classes corresponding to these labels, in order. - """ - if labels is None: - # in the no-label case, index from the parent directory down. - subdirs = [''] - class_names = subdirs - else: - subdirs = [] - for subdir in sorted(tf.io.gfile.listdir(directory)): - if tf.io.gfile.isdir(tf.io.gfile.join(directory, subdir)): - if subdir.endswith('/'): - subdir = subdir[:-1] - subdirs.append(subdir) - if not class_names: - class_names = subdirs + """Get the batch size of the dataset.""" + if is_batched(tf_dataset): + return tf_dataset._batch_size else: - if set(class_names) != set(subdirs): - raise ValueError( - 'The `class_names` passed did not match the ' - 'names of the subdirectories of the target directory. 
' - f'Expected: {subdirs}, but received: {class_names}') - class_indices = dict(zip(class_names, range(len(class_names)))) - - # Build an index of the files - # in the different class subfolders. - pool = multiprocessing.pool.ThreadPool() - results = [] - filenames = [] - - for dirpath in (tf.io.gfile.join(directory, subdir) for subdir in subdirs): - results.append( - pool.apply_async(index_subdirectory, - (dirpath, class_indices, follow_links, formats))) - labels_list = [] - for res in results: - partial_filenames, partial_labels = res.get() - labels_list.append(partial_labels) - filenames += partial_filenames - if labels not in ('inferred', None): - if len(labels) != len(filenames): - raise ValueError('Expected the lengths of `labels` to match the number ' - 'of files in the target directory. len(labels) is ' - f'{len(labels)} while we found {len(filenames)} files ' - f'in directory {directory}.') - else: - i = 0 - labels = np.zeros((len(filenames),), dtype='int32') - for partial_labels in labels_list: - labels[i:i + len(partial_labels)] = partial_labels - i += len(partial_labels) - - if labels is None: - print(f'Found {len(filenames)} files.') - else: - print(f'Found {len(filenames)} files belonging ' - f'to {len(class_names)} classes.') - pool.close() - pool.join() - file_paths = [tf.io.gfile.join(directory, fname) for fname in filenames] - - if shuffle: - # Shuffle globally to erase macro-structure - if seed is None: - seed = np.random.randint(1e6) - rng = np.random.RandomState(seed) - rng.shuffle(file_paths) - rng = np.random.RandomState(seed) - rng.shuffle(labels) - return file_paths, labels, class_names + return None + + +def index_directory( + directory, + labels, + formats, + class_names=None, + shuffle=True, + seed=None, + follow_links=False, +): + """Make list of all files in `directory`, with their labels. + + Args: + directory: Directory where the data is located. + If `labels` is "inferred", it should contain + subdirectories, each containing files for a class. + Otherwise, the directory structure is ignored. + labels: Either "inferred" + (labels are generated from the directory structure), + None (no labels), + or a list/tuple of integer labels of the same size as the number of + valid files found in the directory. Labels should be sorted according + to the alphanumeric order of the image file paths + (obtained via `os.walk(directory)` in Python). + formats: Allowlist of file extensions to index (e.g. ".jpg", ".txt"). + class_names: Only valid if "labels" is "inferred". This is the explicit + list of class names (must match names of subdirectories). Used + to control the order of the classes + (otherwise alphanumerical order is used). + shuffle: Whether to shuffle the data. Default: True. + If set to False, sorts the data in alphanumeric order. + seed: Optional random seed for shuffling. + follow_links: Whether to visits subdirectories pointed to by symlinks. + + Returns: + tuple (file_paths, labels, class_names). + file_paths: list of file paths (strings). + labels: list of matching integer labels (same length as file_paths) + class_names: names of the classes corresponding to these labels, in + order. + """ + if labels != "inferred": + # in the explicit/no-label cases, index from the parent directory down. 
+ subdirs = [""] + class_names = subdirs + else: + subdirs = [] + for subdir in sorted(tf.io.gfile.listdir(directory)): + if tf.io.gfile.isdir(tf.io.gfile.join(directory, subdir)): + if not subdir.startswith("."): + if subdir.endswith("/"): + subdir = subdir[:-1] + subdirs.append(subdir) + if not class_names: + class_names = subdirs + else: + if set(class_names) != set(subdirs): + raise ValueError( + "The `class_names` passed did not match the " + "names of the subdirectories of the target directory. " + f"Expected: {subdirs}, but received: {class_names}" + ) + class_indices = dict(zip(class_names, range(len(class_names)))) + + # Build an index of the files + # in the different class subfolders. + pool = multiprocessing.pool.ThreadPool() + results = [] + filenames = [] + + for dirpath in (tf.io.gfile.join(directory, subdir) for subdir in subdirs): + results.append( + pool.apply_async( + index_subdirectory, + (dirpath, class_indices, follow_links, formats), + ) + ) + labels_list = [] + for res in results: + partial_filenames, partial_labels = res.get() + labels_list.append(partial_labels) + filenames += partial_filenames + if labels not in ("inferred", None): + if len(labels) != len(filenames): + raise ValueError( + "Expected the lengths of `labels` to match the number " + "of files in the target directory. len(labels) is " + f"{len(labels)} while we found {len(filenames)} files " + f"in directory {directory}." + ) + class_names = sorted(set(labels)) + else: + i = 0 + labels = np.zeros((len(filenames),), dtype="int32") + for partial_labels in labels_list: + labels[i : i + len(partial_labels)] = partial_labels + i += len(partial_labels) + + if labels is None: + io_utils.print_msg(f"Found {len(filenames)} files.") + else: + io_utils.print_msg( + f"Found {len(filenames)} files belonging " + f"to {len(class_names)} classes." + ) + pool.close() + pool.join() + file_paths = [tf.io.gfile.join(directory, fname) for fname in filenames] + + if shuffle: + # Shuffle globally to erase macro-structure + if seed is None: + seed = np.random.randint(1e6) + rng = np.random.RandomState(seed) + rng.shuffle(file_paths) + rng = np.random.RandomState(seed) + rng.shuffle(labels) + return file_paths, labels, class_names def iter_valid_files(directory, follow_links, formats): - if not follow_links: - walk = tf.io.gfile.walk(directory) - else: - walk = os.walk(directory, followlinks=follow_links) - for root, _, files in sorted(walk, key=lambda x: x[0]): - for fname in sorted(files): - if fname.lower().endswith(formats): - yield root, fname + if not follow_links: + walk = tf.io.gfile.walk(directory) + else: + walk = os.walk(directory, followlinks=follow_links) + for root, _, files in sorted(walk, key=lambda x: x[0]): + for fname in sorted(files): + if fname.lower().endswith(formats): + yield root, fname def index_subdirectory(directory, class_indices, follow_links, formats): - """Recursively walks directory and list image paths and their class index. - - Args: - directory: string, target directory. - class_indices: dict mapping class names to their index. - follow_links: boolean, whether to recursively follow subdirectories - (if False, we only list top-level images in `directory`). - formats: Allowlist of file extensions to index (e.g. ".jpg", ".txt"). - - Returns: - tuple `(filenames, labels)`. `filenames` is a list of relative file - paths, and `labels` is a list of integer labels corresponding to these - files. 
- """ - dirname = os.path.basename(directory) - valid_files = iter_valid_files(directory, follow_links, formats) - labels = [] - filenames = [] - for root, fname in valid_files: - labels.append(class_indices[dirname]) - absolute_path = tf.io.gfile.join(root, fname) - relative_path = tf.io.gfile.join( - dirname, os.path.relpath(absolute_path, directory)) - filenames.append(relative_path) - return filenames, labels + """Recursively walks directory and list image paths and their class index. + + Args: + directory: string, target directory. + class_indices: dict mapping class names to their index. + follow_links: boolean, whether to recursively follow subdirectories + (if False, we only list top-level images in `directory`). + formats: Allowlist of file extensions to index (e.g. ".jpg", ".txt"). + + Returns: + tuple `(filenames, labels)`. `filenames` is a list of relative file + paths, and `labels` is a list of integer labels corresponding to these + files. + """ + dirname = os.path.basename(directory) + valid_files = iter_valid_files(directory, follow_links, formats) + labels = [] + filenames = [] + for root, fname in valid_files: + labels.append(class_indices[dirname]) + absolute_path = tf.io.gfile.join(root, fname) + relative_path = tf.io.gfile.join( + dirname, os.path.relpath(absolute_path, directory) + ) + filenames.append(relative_path) + return filenames, labels def get_training_or_validation_split(samples, labels, validation_split, subset): - """Potentially restict samples & labels to a training or validation split. - - Args: - samples: List of elements. - labels: List of corresponding labels. - validation_split: Float, fraction of data to reserve for validation. - subset: Subset of the data to return. - Either "training", "validation", or None. If None, we return all of the - data. - - Returns: - tuple (samples, labels), potentially restricted to the specified subset. - """ - if not validation_split: + """Potentially restict samples & labels to a training or validation split. + + Args: + samples: List of elements. + labels: List of corresponding labels. + validation_split: Float, fraction of data to reserve for validation. + subset: Subset of the data to return. + Either "training", "validation", or None. If None, we return all of the + data. + + Returns: + tuple (samples, labels), potentially restricted to the specified subset. + """ + if not validation_split: + return samples, labels + + num_val_samples = int(validation_split * len(samples)) + if subset == "training": + io_utils.print_msg( + f"Using {len(samples) - num_val_samples} " f"files for training." 
+ ) + samples = samples[:-num_val_samples] + labels = labels[:-num_val_samples] + elif subset == "validation": + io_utils.print_msg(f"Using {num_val_samples} files for validation.") + samples = samples[-num_val_samples:] + labels = labels[-num_val_samples:] + else: + raise ValueError( + '`subset` must be either "training" ' + f'or "validation", received: {subset}' + ) return samples, labels - num_val_samples = int(validation_split * len(samples)) - if subset == 'training': - print(f'Using {len(samples) - num_val_samples} files for training.') - samples = samples[:-num_val_samples] - labels = labels[:-num_val_samples] - elif subset == 'validation': - print(f'Using {num_val_samples} files for validation.') - samples = samples[-num_val_samples:] - labels = labels[-num_val_samples:] - else: - raise ValueError('`subset` must be either "training" ' - f'or "validation", received: {subset}') - return samples, labels - def labels_to_dataset(labels, label_mode, num_classes): - """Create a tf.data.Dataset from the list/tuple of labels. - - Args: - labels: list/tuple of labels to be converted into a tf.data.Dataset. - label_mode: String describing the encoding of `labels`. Options are: - - 'binary' indicates that the labels (there can be only 2) are encoded as - `float32` scalars with values 0 or 1 (e.g. for `binary_crossentropy`). - - 'categorical' means that the labels are mapped into a categorical vector. - (e.g. for `categorical_crossentropy` loss). - num_classes: number of classes of labels. - - Returns: - A `Dataset` instance. - """ - label_ds = tf.data.Dataset.from_tensor_slices(labels) - if label_mode == 'binary': - label_ds = label_ds.map( - lambda x: tf.expand_dims(tf.cast(x, 'float32'), axis=-1), - num_parallel_calls=tf.data.AUTOTUNE) - elif label_mode == 'categorical': - label_ds = label_ds.map(lambda x: tf.one_hot(x, num_classes), - num_parallel_calls=tf.data.AUTOTUNE) - return label_ds + """Create a tf.data.Dataset from the list/tuple of labels. + + Args: + labels: list/tuple of labels to be converted into a tf.data.Dataset. + label_mode: String describing the encoding of `labels`. Options are: + - 'binary' indicates that the labels (there can be only 2) are encoded as + `float32` scalars with values 0 or 1 (e.g. for `binary_crossentropy`). + - 'categorical' means that the labels are mapped into a categorical + vector. (e.g. for `categorical_crossentropy` loss). + num_classes: number of classes of labels. + + Returns: + A `Dataset` instance. + """ + label_ds = tf.data.Dataset.from_tensor_slices(labels) + if label_mode == "binary": + label_ds = label_ds.map( + lambda x: tf.expand_dims(tf.cast(x, "float32"), axis=-1), + num_parallel_calls=tf.data.AUTOTUNE, + ) + elif label_mode == "categorical": + label_ds = label_ds.map( + lambda x: tf.one_hot(x, num_classes), + num_parallel_calls=tf.data.AUTOTUNE, + ) + return label_ds def check_validation_split_arg(validation_split, subset, shuffle, seed): - """Raise errors in case of invalid argument values. - - Args: - validation_split: float between 0 and 1, fraction of data to reserve for - validation. - subset: One of "training", "validation" or "both". Only used if - `validation_split` is set. - shuffle: Whether to shuffle the data. Either True or False. - seed: random seed for shuffling and transformations. 
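For context, `labels_to_dataset` above maps integer labels to model-ready tensors: `'binary'` yields float32 values of shape `(1,)` and `'categorical'` yields one-hot vectors. A small sketch of the equivalent transformations (illustrative only):

```python
import tensorflow as tf

labels = [0, 2, 1]
num_classes = 3

# label_mode="categorical": integer labels become one-hot vectors.
categorical = tf.data.Dataset.from_tensor_slices(labels).map(
    lambda x: tf.one_hot(x, num_classes)
)

# label_mode="binary": 0/1 labels become float32 tensors of shape (1,).
binary = tf.data.Dataset.from_tensor_slices([0, 1, 1]).map(
    lambda x: tf.expand_dims(tf.cast(x, "float32"), axis=-1)
)

print([y.numpy().tolist() for y in categorical])  # [[1,0,0], [0,0,1], [0,1,0]]
print([y.numpy().tolist() for y in binary])  # [[0.0], [1.0], [1.0]]
```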
- """ - if validation_split and not 0 < validation_split < 1: - raise ValueError( - '`validation_split` must be between 0 and 1, ' - f'received: {validation_split}') - if (validation_split or subset) and not (validation_split and subset): - raise ValueError( - 'If `subset` is set, `validation_split` must be set, and inversely.') - if subset not in ('training', 'validation', 'both', None): - raise ValueError('`subset` must be either "training", ' - f'"validation" or "both", received: {subset}') - if validation_split and shuffle and seed is None: - raise ValueError( - 'If using `validation_split` and shuffling the data, you must provide ' - 'a `seed` argument, to make sure that there is no overlap between the ' - 'training and validation subset.') + """Raise errors in case of invalid argument values. + + Args: + validation_split: float between 0 and 1, fraction of data to reserve for + validation. + subset: One of "training", "validation" or "both". Only used if + `validation_split` is set. + shuffle: Whether to shuffle the data. Either True or False. + seed: random seed for shuffling and transformations. + """ + if validation_split and not 0 < validation_split < 1: + raise ValueError( + "`validation_split` must be between 0 and 1, " + f"received: {validation_split}" + ) + if (validation_split or subset) and not (validation_split and subset): + raise ValueError( + "If `subset` is set, `validation_split` must be set, and inversely." + ) + if subset not in ("training", "validation", "both", None): + raise ValueError( + '`subset` must be either "training", ' + f'"validation" or "both", received: {subset}' + ) + if validation_split and shuffle and seed is None: + raise ValueError( + "If using `validation_split` and shuffling the data, you must " + "provide a `seed` argument, to make sure that there is no " + "overlap between the training and validation subset." 
+ ) diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 43bfc3fad263..1de07df756bc 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -1,457 +1,593 @@ """Tests for Dataset Utils""" -import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes +import os +import shutil import numpy as np -from keras.datasets import mnist +import tensorflow.compat.v2 as tf + from keras.testing_infra import test_utils from keras.utils import dataset_utils @test_utils.run_v2_only class SplitDatasetTest(tf.test.TestCase): + def test_numpy_array(self): + dataset = np.ones(shape=(200, 32)) + res = dataset_utils.split_dataset( + dataset, left_size=0.8, right_size=0.2 + ) + + self.assertLen(res, 2) + left_split, right_split = res + + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertLen(left_split, 160) + self.assertLen(right_split, 40) + + self.assertAllEqual(dataset[:160], list(left_split)) + self.assertAllEqual(dataset[-40:], list(right_split)) + + def test_list_of_numpy_arrays(self): + # test with list of np arrays with same shapes + dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] + res = dataset_utils.split_dataset(dataset, left_size=4) + + self.assertLen(res, 2) + left_split, right_split = res + + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(np.array(list(left_split)).shape, (4, 2, 32)) + self.assertEqual(np.array(list(right_split)).shape, (196, 2, 32)) + + # test with different shapes + dataset = [np.ones(shape=(5, 3)), np.ones(shape=(5,))] + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=0.3 + ) + + self.assertEqual(np.array(list(left_split), dtype=object).shape, (2, 2)) + self.assertEqual( + np.array(list(right_split), dtype=object).shape, (3, 2) + ) + + self.assertEqual( + np.array(list(left_split)[0], dtype=object).shape, (2,) + ) + self.assertEqual(np.array(list(left_split)[0][0]).shape, (3,)) + self.assertEqual(np.array(list(left_split)[0][1]).shape, ()) + + self.assertEqual( + np.array(list(right_split)[0], dtype=object).shape, (2,) + ) + self.assertEqual(np.array(list(right_split)[0][0]).shape, (3,)) + self.assertEqual(np.array(list(right_split)[0][1]).shape, ()) + + def test_dataset_with_invalid_shape(self): + with self.assertRaisesRegex( + ValueError, + "Received a list of NumPy arrays with different lengths", + ): + dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] + dataset_utils.split_dataset(dataset, left_size=4) + + with self.assertRaisesRegex( + ValueError, + "Received a tuple of NumPy arrays with different lengths", + ): + dataset = (np.ones(shape=(200, 32)), np.zeros(shape=(201, 32))) + dataset_utils.split_dataset(dataset, left_size=4) + + def test_tuple_of_numpy_arrays(self): + dataset = (np.random.rand(4, 3), np.random.rand(4, 3)) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=2 + ) + + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(len(left_split), 2) + self.assertEqual(len(right_split), 2) + + self.assertEqual(np.array(list(left_split)[0]).shape, (2, 3)) + self.assertEqual(np.array(list(left_split)[1]).shape, (2, 3)) + + # test with fractional size + dataset = (np.random.rand(5, 32, 32), np.random.rand(5, 32, 32)) + left_split, right_split = dataset_utils.split_dataset( + dataset, 
right_size=0.4 + ) + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(np.array(list(left_split)).shape, (3, 2, 32, 32)) + self.assertEqual(np.array(list(right_split)).shape, (2, 2, 32, 32)) + + self.assertEqual(np.array(list(left_split))[0].shape, (2, 32, 32)) + self.assertEqual(np.array(list(left_split))[1].shape, (2, 32, 32)) + + self.assertEqual(np.array(list(right_split))[0].shape, (2, 32, 32)) + self.assertEqual(np.array(list(right_split))[1].shape, (2, 32, 32)) + + # test with tuple of np arrays with different shapes + dataset = ( + np.random.rand(5, 32, 32), + np.random.rand( + 5, + ), + ) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=2, right_size=3 + ) + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(np.array(list(left_split), dtype=object).shape, (2, 2)) + self.assertEqual( + np.array(list(right_split), dtype=object).shape, (3, 2) + ) + + self.assertEqual( + np.array(list(left_split)[0], dtype=object).shape, (2,) + ) + self.assertEqual(np.array(list(left_split)[0][0]).shape, (32, 32)) + self.assertEqual(np.array(list(left_split)[0][1]).shape, ()) + + self.assertEqual( + np.array(list(right_split)[0], dtype=object).shape, (2,) + ) + self.assertEqual(np.array(list(right_split)[0][0]).shape, (32, 32)) + self.assertEqual(np.array(list(right_split)[0][1]).shape, ()) + + def test_batched_tf_dataset_of_vectors(self): + vectors = np.ones(shape=(100, 32, 32, 1)) + dataset = tf.data.Dataset.from_tensor_slices(vectors) + dataset = dataset.batch(10) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=2 + ) + + # Ensure that the splits are batched + self.assertEqual(len(list(right_split)), 10) + + left_split, right_split = left_split.unbatch(), right_split.unbatch() + self.assertAllEqual(np.array(list(left_split)).shape, (2, 32, 32, 1)) + self.assertAllEqual(np.array(list(right_split)).shape, (98, 32, 32, 1)) + dataset = dataset.unbatch() + self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) + + def test_batched_tf_dataset_of_tuple_of_vectors(self): + tuple_of_vectors = ( + np.random.rand(10, 32, 32), + np.random.rand(10, 32, 32), + ) + dataset = tf.data.Dataset.from_tensor_slices(tuple_of_vectors) + dataset = dataset.batch(2) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=4 + ) + + # Ensure that the splits are batched + self.assertEqual(np.array(list(right_split)).shape, (3, 2, 2, 32, 32)) + self.assertEqual(np.array(list(left_split)).shape, (2, 2, 2, 32, 32)) + + left_split, right_split = left_split.unbatch(), right_split.unbatch() + self.assertAllEqual(np.array(list(left_split)).shape, (4, 2, 32, 32)) + self.assertAllEqual(np.array(list(right_split)).shape, (6, 2, 32, 32)) + + dataset = dataset.unbatch() + self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) + + def test_batched_tf_dataset_of_dict_of_vectors(self): + dict_samples = {"X": np.random.rand(10, 3), "Y": np.random.rand(10, 3)} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + dataset = dataset.batch(2) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=2 + ) + + self.assertAllEqual(np.array(list(left_split)).shape, (1,)) + self.assertAllEqual(np.array(list(right_split)).shape, (4,)) + + left_split, right_split = left_split.unbatch(), right_split.unbatch() + self.assertEqual(len(list(left_split)), 2) + 
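The batched-dataset tests above rely on `split_dataset` counting samples rather than batches: a 100-sample dataset batched by 10 and split with `left_size=2` yields 2 samples on the left and 98 on the right, and both splits come back batched. A hedged sketch of that behavior:

```python
import numpy as np
import tensorflow as tf
from keras.utils import dataset_utils

samples = np.ones((100, 32, 32, 1))
dataset = tf.data.Dataset.from_tensor_slices(samples).batch(10)

left, right = dataset_utils.split_dataset(dataset, left_size=2)

# Sizes are counted in samples, not batches, and the splits stay batched.
assert len(list(left.unbatch())) == 2
assert len(list(right.unbatch())) == 98
```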
self.assertEqual(len(list(right_split)), 8) + for i in range(10): + if i < 2: + self.assertEqual( + list(left_split)[i], list(dataset.unbatch())[i] + ) + else: + self.assertEqual( + list(right_split)[i - 2], list(dataset.unbatch())[i] + ) + + # test with dict of np arrays with different shapes + dict_samples = { + "images": np.random.rand(10, 16, 16, 3), + "labels": np.random.rand( + 10, + ), + } + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + dataset = dataset.batch(1) + left_split, right_split = dataset_utils.split_dataset( + dataset, right_size=0.3 + ) + + self.assertAllEqual(np.array(list(left_split)).shape, (7,)) + self.assertAllEqual(np.array(list(right_split)).shape, (3,)) + + dataset = dataset.unbatch() + left_split, right_split = left_split.unbatch(), right_split.unbatch() + self.assertEqual(len(list(left_split)), 7) + self.assertEqual(len(list(right_split)), 3) + for i in range(10): + if i < 7: + self.assertEqual(list(left_split)[i], list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i - 7], list(dataset)[i]) + + def test_unbatched_tf_dataset_of_vectors(self): + vectors = np.ones(shape=(100, 16, 16, 3)) + dataset = tf.data.Dataset.from_tensor_slices(vectors) + + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=0.25 + ) + + self.assertAllEqual(np.array(list(left_split)).shape, (25, 16, 16, 3)) + self.assertAllEqual(np.array(list(right_split)).shape, (75, 16, 16, 3)) + + self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) + + dataset = [np.random.rand(10, 3, 3) for _ in range(5)] + dataset = tf.data.Dataset.from_tensor_slices(dataset) + + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=2 + ) + self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) + + def test_unbatched_tf_dataset_of_tuple_of_vectors(self): + # test with tuple of np arrays with same shape + X, Y = (np.random.rand(10, 32, 32, 1), np.random.rand(10, 32, 32, 1)) + dataset = tf.data.Dataset.from_tensor_slices((X, Y)) + + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=5 + ) + + self.assertEqual(len(list(left_split)), 5) + self.assertEqual(len(list(right_split)), 5) + self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) + + # test with tuple of np arrays with different shapes + X, Y = ( + np.random.rand(5, 3, 3), + np.random.rand( + 5, + ), + ) + dataset = tf.data.Dataset.from_tensor_slices((X, Y)) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=0.5 + ) + + self.assertEqual(len(list(left_split)), 2) + self.assertEqual(len(list(right_split)), 3) + self.assertEqual(np.array(list(left_split)[0][0]).shape, (3, 3)) + self.assertEqual(np.array(list(left_split)[0][1]).shape, ()) + + def test_unbatched_tf_dataset_of_dict_of_vectors(self): + # test with dict of np arrays of same shape + dict_samples = {"X": np.random.rand(10, 2), "Y": np.random.rand(10, 2)} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=2 + ) + self.assertEqual(len(list(left_split)), 2) + self.assertEqual(len(list(right_split)), 8) + for i in range(10): + if i < 2: + self.assertEqual(list(left_split)[i], list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i - 2], list(dataset)[i]) + + # test with dict of np arrays with different shapes + dict_samples = { + "images": np.random.rand(10, 16, 16, 3), + "labels": np.random.rand( + 10, + ), + } + dataset = 
tf.data.Dataset.from_tensor_slices(dict_samples) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=0.3 + ) + self.assertEqual(len(list(left_split)), 3) + self.assertEqual(len(list(right_split)), 7) + for i in range(10): + if i < 3: + self.assertEqual(list(left_split)[i], list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i - 3], list(dataset)[i]) + + # test with dict of text arrays + txt_feature = ["abb", "bb", "cc", "d", "e", "f", "g", "h", "i", "j"] + dict_samples = { + "txt_feature": txt_feature, + "label": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=0.45, right_size=0.55 + ) + self.assertEqual(len(list(left_split)), 4) + self.assertEqual(len(list(right_split)), 6) + for i in range(10): + if i < 4: + self.assertEqual(list(left_split)[i], list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i - 4], list(dataset)[i]) + + def test_list_dataset(self): + dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)] + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=5, right_size=5 + ) + self.assertEqual(len(left_split), len(right_split)) + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(left_split, tf.data.Dataset) + + dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)] + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=0.6, right_size=0.4 + ) + self.assertEqual(len(left_split), 6) + self.assertEqual(len(right_split), 4) + + def test_invalid_dataset(self): + with self.assertRaisesRegex( + TypeError, + "The `dataset` argument must be either a `tf.data.Dataset` " + "object or a list/tuple of arrays.", + ): + dataset_utils.split_dataset(dataset=None, left_size=5) + with self.assertRaisesRegex( + TypeError, + "The `dataset` argument must be either a `tf.data.Dataset` " + "object or a list/tuple of arrays.", + ): + dataset_utils.split_dataset(dataset=1, left_size=5) + with self.assertRaisesRegex( + TypeError, + "The `dataset` argument must be either a `tf.data.Dataset` " + "object or a list/tuple of arrays.", + ): + dataset_utils.split_dataset(dataset=float(1.2), left_size=5) + with self.assertRaisesRegex( + TypeError, + "The `dataset` argument must be either a `tf.data.Dataset` " + "object or a list/tuple of arrays.", + ): + dataset_utils.split_dataset(dataset=dict({}), left_size=5) + with self.assertRaisesRegex( + TypeError, + "The `dataset` argument must be either a `tf.data.Dataset` " + "object or a list/tuple of arrays.", + ): + dataset_utils.split_dataset(dataset=float("INF"), left_size=5) + + def test_valid_left_and_right_sizes(self): + dataset = np.array([1, 2, 3]) + splitted_dataset = dataset_utils.split_dataset(dataset, 1, 2) + self.assertLen(splitted_dataset, 2) + left_split, right_split = splitted_dataset + self.assertEqual(len(left_split), 1) + self.assertEqual(len(right_split), 2) + self.assertEqual(list(left_split), [1]) + self.assertEqual(list(right_split), [2, 3]) + + dataset = np.ones(shape=(200, 32)) + res = dataset_utils.split_dataset(dataset, left_size=150, right_size=50) + self.assertLen(res, 2) + self.assertIsInstance(res[0], tf.data.Dataset) + self.assertIsInstance(res[1], tf.data.Dataset) + + self.assertLen(res[0], 150) + self.assertLen(res[1], 50) + + dataset = np.ones(shape=(200, 32)) + res = dataset_utils.split_dataset(dataset, left_size=120) + self.assertLen(res, 2) + self.assertIsInstance(res[0], 
tf.data.Dataset) + self.assertIsInstance(res[1], tf.data.Dataset) + + self.assertLen(res[0], 120) + self.assertLen(res[1], 80) + + dataset = np.ones(shape=(10000, 16)) + res = dataset_utils.split_dataset(dataset, right_size=20) + self.assertLen(res, 2) + self.assertIsInstance(res[0], tf.data.Dataset) + self.assertIsInstance(res[1], tf.data.Dataset) + + self.assertLen(res[0], 9980) + self.assertLen(res[1], 20) + + dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + splitted_dataset = dataset_utils.split_dataset( + dataset, left_size=0.1, right_size=0.9 + ) + self.assertLen(splitted_dataset, 2) + left_split, right_split = splitted_dataset + self.assertEqual(len(left_split), 1) + self.assertEqual(len(right_split), 9) + self.assertEqual(list(left_split), [1]) + self.assertEqual(list(right_split), [2, 3, 4, 5, 6, 7, 8, 9, 10]) + + dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + splitted_dataset = dataset_utils.split_dataset( + dataset, left_size=2, right_size=5 + ) + self.assertLen(splitted_dataset, 2) + left_split, right_split = splitted_dataset + self.assertEqual(len(left_split), 2) + self.assertEqual(len(right_split), 5) + self.assertEqual(list(left_split), [1, 2]) + self.assertEqual(list(right_split), [6, 7, 8, 9, 10]) + + def test_float_left_and_right_sizes(self): + X = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]) + dataset = tf.data.Dataset.from_tensor_slices(X) + left_split, right_split = dataset_utils.split_dataset( + dataset, left_size=0.8, right_size=0.2 + ) + self.assertEqual(len(left_split), 2) + self.assertEqual(len(right_split), 1) + + def test_invalid_float_left_and_right_sizes(self): + expected_regex = ( + r"^(.*?(\bleft_size\b).*?(\bshould be\b)" + r".*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))" + ) + with self.assertRaisesRegexp(ValueError, expected_regex): + dataset = [ + np.ones(shape=(200, 32, 32)), + np.zeros(shape=(200, 32, 32)), + ] + dataset_utils.split_dataset(dataset, left_size=1.5, right_size=0.2) + + expected_regex = ( + r"^(.*?(\bright_size\b).*?(\bshould be\b)" + r".*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))" + ) + with self.assertRaisesRegex(ValueError, expected_regex): + dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] + dataset_utils.split_dataset(dataset, left_size=0.8, right_size=-0.8) + + def test_None_and_zero_left_and_right_size(self): + expected_regex = ( + r"^.*?(\bleft_size\b).*?(\bright_size\b).*?(\bmust " + r"be specified\b).*?(\bReceived: left_size=None and" + r" right_size=None\b)" + ) + + with self.assertRaisesRegex(ValueError, expected_regex): + dataset_utils.split_dataset( + dataset=np.array([1, 2, 3]), left_size=None + ) + with self.assertRaisesRegex(ValueError, expected_regex): + dataset_utils.split_dataset( + np.array([1, 2, 3]), left_size=None, right_size=None + ) + + expected_regex = ( + r"^.*?(\bleft_size\b).*?(\bshould be\b)" + r".*?(\bpositive\b).*?(\bsmaller than 3\b)" + ) + with self.assertRaisesRegex(ValueError, expected_regex): + dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=3) + + expected_regex = ( + "Both `left_size` and `right_size` are zero. " + "At least one of the split sizes must be non-zero." 
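In summary, the size-validation tests pin down the contract of `left_size`/`right_size`: at least one must be given, floats must lie strictly in (0, 1), integers must be positive and smaller than the dataset length, and string values are rejected. An illustrative sketch:

```python
import numpy as np
from keras.utils import dataset_utils

data = np.arange(10)

# Valid: fractional sizes inside (0, 1) covering the whole dataset.
left, right = dataset_utils.split_dataset(data, left_size=0.6, right_size=0.4)
assert len(left) == 6 and len(right) == 4

# Invalid (each raises): both sizes None, left_size=1.5,
# left_size=10 on a 10-sample dataset, or left_size="1".
```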
+ ) + with self.assertRaisesRegex(ValueError, expected_regex): + dataset_utils.split_dataset( + np.array([1, 2, 3]), left_size=0, right_size=0 + ) + + def test_invalid_left_and_right_size_types(self): + expected_regex = ( + r"^.*?(\bInvalid `left_size` and `right_size` Types" + r"\b).*?(\bExpected: integer or float or None\b)" + ) + with self.assertRaisesRegex(TypeError, expected_regex): + dataset_utils.split_dataset( + np.array([1, 2, 3]), left_size="1", right_size="1" + ) + + expected_regex = r"^.*?(\bInvalid `right_size` Type\b)" + with self.assertRaisesRegex(TypeError, expected_regex): + dataset_utils.split_dataset( + np.array([1, 2, 3]), left_size=0, right_size="1" + ) + + expected_regex = r"^.*?(\bInvalid `left_size` Type\b)" + with self.assertRaisesRegex(TypeError, expected_regex): + dataset_utils.split_dataset( + np.array([1, 2, 3]), left_size="100", right_size=None + ) + + expected_regex = r"^.*?(\bInvalid `right_size` Type\b)" + with self.assertRaisesRegex(TypeError, expected_regex): + dataset_utils.split_dataset(np.array([1, 2, 3]), right_size="1") + + expected_regex = r"^.*?(\bInvalid `right_size` Type\b)" + with self.assertRaisesRegex(TypeError, expected_regex): + dataset_utils.split_dataset( + np.array([1, 2, 3]), left_size=0.5, right_size="1" + ) + + def test_end_to_end(self): + x_train = np.random.random((10000, 28, 28)) + y_train = np.random.randint(0, 10, size=(10000,)) + + left_split, right_split = dataset_utils.split_dataset( + (x_train, y_train), left_size=0.8 + ) + + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(len(left_split), 8000) + self.assertEqual(len(right_split), 2000) - def test_numpy_array(self): - dataset = np.ones(shape=(200, 32)) - res = dataset_utils.split_dataset(dataset, left_size=0.8, right_size=0.2) - - self.assertLen(res, 2) - left_split, right_split = res - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(right_split, tf.data.Dataset) - - self.assertLen(left_split, 160) - self.assertLen(right_split, 40) - - self.assertAllEqual(dataset[:160], list(left_split)) - self.assertAllEqual(dataset[-40:], list(right_split)) +@test_utils.run_v2_only +class IndexDirectoryStructureTest(tf.test.TestCase): + def test_explicit_labels_and_unnested_files(self): - def test_list_of_numpy_arrays(self): - # test with list of np arrays with same shapes - dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] - res = dataset_utils.split_dataset(dataset, left_size=4) + # Get a unique temp directory + temp_dir = os.path.join( + self.get_temp_dir(), str(np.random.randint(1e6)) + ) + os.mkdir(temp_dir) + self.addCleanup(shutil.rmtree, temp_dir) - self.assertLen(res, 2) - left_split, right_split = res + # Number of temp files, each of which + # will have its own explicit label + num_files = 10 - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(right_split, tf.data.Dataset) + explicit_labels = np.random.randint(0, 10, size=num_files).tolist() - self.assertEqual(np.array(list(left_split)).shape, (4, 2, 32)) - self.assertEqual(np.array(list(right_split)).shape, (196, 2, 32)) + # Save empty text files to root of temp directory + # (content is not important, only location) + for i in range(len(explicit_labels)): + with open(os.path.join(temp_dir, f"file{i}.txt"), "w"): + pass - # test with different shapes - dataset = [np.ones(shape=(5, 3)), np.ones(shape=(5,))] - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=0.3) 
+ file_paths, labels, class_names = dataset_utils.index_directory( + temp_dir, labels=explicit_labels, formats=".txt" + ) - self.assertEqual(np.array(list(left_split)).shape, (2, 2)) - self.assertEqual(np.array(list(right_split)).shape, (3, 2)) + # Files are found at the root of the temp directory, when + # `labels` are passed explicitly to `index_directory` and + # the number of returned and passed labels match + self.assertLen(file_paths, num_files) + self.assertLen(labels, num_files) - self.assertEqual(np.array(list(left_split)[0]).shape, (2,)) - self.assertEqual(np.array(list(left_split)[0][0]).shape, (3,)) - self.assertEqual(np.array(list(left_split)[0][1]).shape, ()) + # Class names are returned as a sorted list + expected_class_names = sorted(set(explicit_labels)) + self.assertEqual(expected_class_names, class_names) - self.assertEqual(np.array(list(right_split)[0]).shape, (2,)) - self.assertEqual(np.array(list(right_split)[0][0]).shape, (3,)) - self.assertEqual(np.array(list(right_split)[0][1]).shape, ()) - def test_dataset_with_invalid_shape(self): - with self.assertRaisesRegex( - ValueError, 'Received a list of NumPy arrays ' - 'with different lengths'): - dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] - dataset_utils.split_dataset(dataset, left_size=4) - - with self.assertRaisesRegex( - ValueError, 'Received a tuple of NumPy arrays ' - 'with different lengths'): - dataset = (np.ones(shape=(200, 32)), np.zeros(shape=(201, 32))) - dataset_utils.split_dataset(dataset, left_size=4) - - def test_tuple_of_numpy_arrays(self): - dataset = (np.random.rand(4, 3), np.random.rand(4, 3)) - left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2) - - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(right_split, tf.data.Dataset) - - self.assertEqual(len(left_split), 2) - self.assertEqual(len(right_split), 2) - - self.assertEqual(np.array(list(left_split)[0]).shape, (2, 3)) - self.assertEqual(np.array(list(left_split)[1]).shape, (2, 3)) - - # test with fractional size - dataset = (np.random.rand(5, 32, 32), np.random.rand(5, 32, 32)) - left_split, right_split = dataset_utils.split_dataset( - dataset, right_size=0.4) - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(right_split, tf.data.Dataset) - - self.assertEqual(np.array(list(left_split)).shape, (3, 2, 32, 32)) - self.assertEqual(np.array(list(right_split)).shape, (2, 2, 32, 32)) - - self.assertEqual(np.array(list(left_split))[0].shape, (2, 32, 32)) - self.assertEqual(np.array(list(left_split))[1].shape, (2, 32, 32)) - - self.assertEqual(np.array(list(right_split))[0].shape, (2, 32, 32)) - self.assertEqual(np.array(list(right_split))[1].shape, (2, 32, 32)) - - # test with tuple of np arrays with different shapes - dataset = (np.random.rand(5, 32, 32), np.random.rand(5,)) - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=2, right_size=3) - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(right_split, tf.data.Dataset) - - self.assertEqual(np.array(list(left_split)).shape, (2, 2)) - self.assertEqual(np.array(list(right_split)).shape, (3, 2)) - - self.assertEqual(np.array(list(left_split)[0]).shape, (2,)) - self.assertEqual(np.array(list(left_split)[0][0]).shape, (32, 32)) - self.assertEqual(np.array(list(left_split)[0][1]).shape, ()) - - self.assertEqual(np.array(list(right_split)[0]).shape, (2,)) - self.assertEqual(np.array(list(right_split)[0][0]).shape, (32, 32)) - 
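The new `IndexDirectoryStructureTest` exercises `index_directory` with explicit labels on a flat directory: files at the root are indexed, the label list is matched one-to-one against the files found, and `class_names` comes back as the sorted set of labels. A condensed sketch of the same usage (temp location hypothetical):

```python
import os
import tempfile
from keras.utils import dataset_utils

tmp = tempfile.mkdtemp()
labels = [3, 1, 2, 1, 0]
for i in range(len(labels)):
    open(os.path.join(tmp, f"file{i}.txt"), "w").close()

file_paths, out_labels, class_names = dataset_utils.index_directory(
    tmp, labels=labels, formats=".txt"
)
assert len(file_paths) == len(labels)
assert class_names == sorted(set(labels))  # [0, 1, 2, 3]
```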
self.assertEqual(np.array(list(right_split)[0][1]).shape, ()) - - def test_batched_tf_dataset_of_vectors(self): - vectors = np.ones(shape=(100, 32, 32, 1)) - dataset = tf.data.Dataset.from_tensor_slices(vectors) - dataset = dataset.batch(10) - left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2) - - # Ensure that the splits are batched - self.assertEqual(len(list(right_split)), 10) - - left_split, right_split = left_split.unbatch(), right_split.unbatch() - self.assertAllEqual(np.array(list(left_split)).shape, (2, 32, 32, 1)) - self.assertAllEqual(np.array(list(right_split)).shape, (98, 32, 32, 1)) - dataset = dataset.unbatch() - self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) - - def test_batched_tf_dataset_of_tuple_of_vectors(self): - tuple_of_vectors = (np.random.rand(10, 32, 32), np.random.rand(10, 32, 32)) - dataset = tf.data.Dataset.from_tensor_slices(tuple_of_vectors) - dataset = dataset.batch(2) - left_split, right_split = dataset_utils.split_dataset(dataset, left_size=4) - - # Ensure that the splits are batched - self.assertEqual(np.array(list(right_split)).shape, (3, 2, 2, 32, 32)) - self.assertEqual(np.array(list(left_split)).shape, (2, 2, 2, 32, 32)) - - left_split, right_split = left_split.unbatch(), right_split.unbatch() - self.assertAllEqual(np.array(list(left_split)).shape, (4, 2, 32, 32)) - self.assertAllEqual(np.array(list(right_split)).shape, (6, 2, 32, 32)) - - dataset = dataset.unbatch() - self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) - - def test_batched_tf_dataset_of_dict_of_vectors(self): - dict_samples = {'X': np.random.rand(10, 3), 'Y': np.random.rand(10, 3)} - dataset = tf.data.Dataset.from_tensor_slices(dict_samples) - dataset = dataset.batch(2) - left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2) - - self.assertAllEqual(np.array(list(left_split)).shape, (1,)) - self.assertAllEqual(np.array(list(right_split)).shape, (4,)) - - left_split, right_split = left_split.unbatch(), right_split.unbatch() - self.assertEqual(len(list(left_split)), 2) - self.assertEqual(len(list(right_split)), 8) - for i in range(10): - if i < 2: - self.assertEqual(list(left_split)[i], list(dataset.unbatch())[i]) - else: - self.assertEqual(list(right_split)[i - 2], list(dataset.unbatch())[i]) - - # test with dict of np arrays with different shapes - dict_samples = { - 'images': np.random.rand(10, 16, 16, 3), - 'labels': np.random.rand(10,) - } - dataset = tf.data.Dataset.from_tensor_slices(dict_samples) - dataset = dataset.batch(1) - left_split, right_split = dataset_utils.split_dataset( - dataset, right_size=0.3) - - self.assertAllEqual(np.array(list(left_split)).shape, (7,)) - self.assertAllEqual(np.array(list(right_split)).shape, (3,)) - - dataset = dataset.unbatch() - left_split, right_split = left_split.unbatch(), right_split.unbatch() - self.assertEqual(len(list(left_split)), 7) - self.assertEqual(len(list(right_split)), 3) - for i in range(10): - if i < 7: - self.assertEqual(list(left_split)[i], list(dataset)[i]) - else: - self.assertEqual(list(right_split)[i - 7], list(dataset)[i]) - - def test_unbatched_tf_dataset_of_vectors(self): - vectors = np.ones(shape=(100, 16, 16, 3)) - dataset = tf.data.Dataset.from_tensor_slices(vectors) - - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=0.25) - - self.assertAllEqual(np.array(list(left_split)).shape, (25, 16, 16, 3)) - self.assertAllEqual(np.array(list(right_split)).shape, (75, 16, 16, 3)) - - 
self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) - - dataset = [np.random.rand(10, 3, 3) for _ in range(5)] - dataset = tf.data.Dataset.from_tensor_slices(dataset) - - left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2) - self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) - - def test_unbatched_tf_dataset_of_tuple_of_vectors(self): - # test with tuple of np arrays with same shape - X, Y = (np.random.rand(10, 32, 32, 1), np.random.rand(10, 32, 32, 1)) - dataset = tf.data.Dataset.from_tensor_slices((X, Y)) - - left_split, right_split = dataset_utils.split_dataset(dataset, left_size=5) - - self.assertEqual(len(list(left_split)), 5) - self.assertEqual(len(list(right_split)), 5) - self.assertAllEqual(list(dataset), list(left_split) + list(right_split)) - - # test with tuple of np arrays with different shapes - X, Y = (np.random.rand(5, 3, 3), np.random.rand(5,)) - dataset = tf.data.Dataset.from_tensor_slices((X, Y)) - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=0.5) - - self.assertEqual(len(list(left_split)), 2) - self.assertEqual(len(list(right_split)), 3) - self.assertEqual(np.array(list(left_split)[0][0]).shape, (3, 3)) - self.assertEqual(np.array(list(left_split)[0][1]).shape, ()) - - def test_unbatched_tf_dataset_of_dict_of_vectors(self): - # test with dict of np arrays of same shape - dict_samples = {'X': np.random.rand(10, 2), 'Y': np.random.rand(10, 2)} - dataset = tf.data.Dataset.from_tensor_slices(dict_samples) - left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2) - self.assertEqual(len(list(left_split)), 2) - self.assertEqual(len(list(right_split)), 8) - for i in range(10): - if i < 2: - self.assertEqual(list(left_split)[i], list(dataset)[i]) - else: - self.assertEqual(list(right_split)[i - 2], list(dataset)[i]) - - # test with dict of np arrays with different shapes - dict_samples = { - 'images': np.random.rand(10, 16, 16, 3), - 'labels': np.random.rand(10,) - } - dataset = tf.data.Dataset.from_tensor_slices(dict_samples) - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=0.3) - self.assertEqual(len(list(left_split)), 3) - self.assertEqual(len(list(right_split)), 7) - for i in range(10): - if i < 3: - self.assertEqual(list(left_split)[i], list(dataset)[i]) - else: - self.assertEqual(list(right_split)[i - 3], list(dataset)[i]) - - # test with dict of text arrays - txt_feature = ['abb', 'bb', 'cc', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] - dict_samples = { - 'txt_feature': txt_feature, - 'label': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - } - dataset = tf.data.Dataset.from_tensor_slices(dict_samples) - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=0.45, right_size=0.55) - self.assertEqual(len(list(left_split)), 4) - self.assertEqual(len(list(right_split)), 6) - for i in range(10): - if i < 4: - self.assertEqual(list(left_split)[i], list(dataset)[i]) - else: - self.assertEqual(list(right_split)[i - 4], list(dataset)[i]) - - def test_list_dataset(self): - dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)] - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=5, right_size=5) - self.assertEqual(len(left_split), len(right_split)) - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(left_split, tf.data.Dataset) - - dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)] - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=0.6, 
right_size=0.4) - self.assertEqual(len(left_split), 6) - self.assertEqual(len(right_split), 4) - - def test_invalid_dataset(self): - with self.assertRaisesRegex( - TypeError, 'The `dataset` argument must be either a `tf.data.Dataset` ' - 'object or a list/tuple of arrays.'): - dataset_utils.split_dataset(dataset=None, left_size=5) - with self.assertRaisesRegex( - TypeError, 'The `dataset` argument must be either a `tf.data.Dataset` ' - 'object or a list/tuple of arrays.'): - dataset_utils.split_dataset(dataset=1, left_size=5) - with self.assertRaisesRegex( - TypeError, 'The `dataset` argument must be either a `tf.data.Dataset` ' - 'object or a list/tuple of arrays.'): - dataset_utils.split_dataset(dataset=float(1.2), left_size=5) - with self.assertRaisesRegex( - TypeError, 'The `dataset` argument must be either a `tf.data.Dataset` ' - 'object or a list/tuple of arrays.'): - dataset_utils.split_dataset(dataset=dict({}), left_size=5) - with self.assertRaisesRegex( - TypeError, 'The `dataset` argument must be either a `tf.data.Dataset` ' - 'object or a list/tuple of arrays.'): - dataset_utils.split_dataset(dataset=float('INF'), left_size=5) - - def test_valid_left_and_right_sizes(self): - dataset = np.array([1, 2, 3]) - splitted_dataset = dataset_utils.split_dataset(dataset, 1, 2) - self.assertLen(splitted_dataset, 2) - left_split, right_split = splitted_dataset - self.assertEqual(len(left_split), 1) - self.assertEqual(len(right_split), 2) - self.assertEqual(list(left_split), [1]) - self.assertEqual(list(right_split), [2, 3]) - - dataset = np.ones(shape=(200, 32)) - res = dataset_utils.split_dataset(dataset, left_size=150, right_size=50) - self.assertLen(res, 2) - self.assertIsInstance(res[0], tf.data.Dataset) - self.assertIsInstance(res[1], tf.data.Dataset) - - self.assertLen(res[0], 150) - self.assertLen(res[1], 50) - - dataset = np.ones(shape=(200, 32)) - res = dataset_utils.split_dataset(dataset, left_size=120) - self.assertLen(res, 2) - self.assertIsInstance(res[0], tf.data.Dataset) - self.assertIsInstance(res[1], tf.data.Dataset) - - self.assertLen(res[0], 120) - self.assertLen(res[1], 80) - - dataset = np.ones(shape=(10000, 16)) - res = dataset_utils.split_dataset(dataset, right_size=20) - self.assertLen(res, 2) - self.assertIsInstance(res[0], tf.data.Dataset) - self.assertIsInstance(res[1], tf.data.Dataset) - - self.assertLen(res[0], 9980) - self.assertLen(res[1], 20) - - dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - splitted_dataset = dataset_utils.split_dataset( - dataset, left_size=0.1, right_size=0.9) - self.assertLen(splitted_dataset, 2) - left_split, right_split = splitted_dataset - self.assertEqual(len(left_split), 1) - self.assertEqual(len(right_split), 9) - self.assertEqual(list(left_split), [1]) - self.assertEqual(list(right_split), [2, 3, 4, 5, 6, 7, 8, 9, 10]) - - dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - splitted_dataset = dataset_utils.split_dataset( - dataset, left_size=2, right_size=5) - self.assertLen(splitted_dataset, 2) - left_split, right_split = splitted_dataset - self.assertEqual(len(left_split), 2) - self.assertEqual(len(right_split), 5) - self.assertEqual(list(left_split), [1, 2]) - self.assertEqual(list(right_split), [6, 7, 8, 9, 10]) - - def test_float_left_and_right_sizes(self): - X = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]) - dataset = tf.data.Dataset.from_tensor_slices(X) - left_split, right_split = dataset_utils.split_dataset( - dataset, left_size=0.8, right_size=0.2) - self.assertEqual(len(left_split), 2) - 
self.assertEqual(len(right_split), 1) - - def test_invalid_float_left_and_right_sizes(self): - expected_regex = (r'^(.*?(\bleft_size\b).*?(\bshould be\b)' - r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))') - with self.assertRaisesRegexp(ValueError, expected_regex): - dataset = [np.ones(shape=(200, 32, 32)), np.zeros(shape=(200, 32, 32))] - dataset_utils.split_dataset(dataset, left_size=1.5, right_size=0.2) - - expected_regex = (r'^(.*?(\bright_size\b).*?(\bshould be\b)' - r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))') - with self.assertRaisesRegex(ValueError, expected_regex): - dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] - dataset_utils.split_dataset(dataset, left_size=0.8, right_size=-0.8) - - def test_None_and_zero_left_and_right_size(self): - expected_regex = (r'^.*?(\bleft_size\b).*?(\bright_size\b).*?(\bmust ' - r'be specified\b).*?(\bReceived: left_size=None and' - r' right_size=None\b)') - - with self.assertRaisesRegex(ValueError, expected_regex): - dataset_utils.split_dataset(dataset=np.array([1, 2, 3]), left_size=None) - with self.assertRaisesRegex(ValueError, expected_regex): - dataset_utils.split_dataset( - np.array([1, 2, 3]), left_size=None, right_size=None) - - expected_regex = (r'^.*?(\bleft_size\b).*?(\bshould be\b)' - r'.*?(\bpositive\b).*?(\bsmaller than 3\b)') - with self.assertRaisesRegex(ValueError, expected_regex): - dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=3) - - expected_regex = ('Both `left_size` and `right_size` are zero. ' - 'At least one of the split sizes must be non-zero.') - with self.assertRaisesRegex(ValueError, expected_regex): - dataset_utils.split_dataset( - np.array([1, 2, 3]), left_size=0, right_size=0) - - def test_invalid_left_and_right_size_types(self): - expected_regex = (r'^.*?(\bInvalid `left_size` and `right_size` Types' - r'\b).*?(\bExpected: integer or float or None\b)') - with self.assertRaisesRegex(TypeError, expected_regex): - dataset_utils.split_dataset( - np.array([1, 2, 3]), left_size='1', right_size='1') - - expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') - with self.assertRaisesRegex(TypeError, expected_regex): - dataset_utils.split_dataset( - np.array([1, 2, 3]), left_size=0, right_size='1') - - expected_regex = (r'^.*?(\bInvalid `left_size` Type\b)') - with self.assertRaisesRegex(TypeError, expected_regex): - dataset_utils.split_dataset( - np.array([1, 2, 3]), left_size='100', right_size=None) - - expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') - with self.assertRaisesRegex(TypeError, expected_regex): - dataset_utils.split_dataset(np.array([1, 2, 3]), right_size='1') - - expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') - with self.assertRaisesRegex(TypeError, expected_regex): - dataset_utils.split_dataset( - np.array([1, 2, 3]), left_size=0.5, right_size='1') - - def test_end_to_end(self): - x_train = np.random.random((10000, 28, 28)) - y_train = np.random.randint(0, 10, size=(10000,)) - - left_split, right_split = dataset_utils.split_dataset( - (x_train, y_train), left_size=0.8) - - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(right_split, tf.data.Dataset) - - self.assertEqual(len(left_split), 8000) - self.assertEqual(len(right_split), 2000) - - -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/feature_space.py b/keras/utils/feature_space.py new file mode 100644 index 000000000000..e52e158dab05 --- /dev/null +++ b/keras/utils/feature_space.py @@ -0,0 +1,772 @@ +# 
Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""FeatureSpace structured data preprocessing & encoding utility.""" + +import tensorflow.compat.v2 as tf + +from keras import backend +from keras.engine import base_layer +from keras.saving import saving_lib +from keras.saving import serialization_lib +from keras.utils.generic_utils import LazyLoader + +# isort: off +from tensorflow.python.util.tf_export import keras_export + +layers = LazyLoader("layers", globals(), "keras.layers") + + +class Cross: + def __init__(self, feature_names, crossing_dim, output_mode="one_hot"): + if output_mode not in {"int", "one_hot"}: + raise ValueError( + "Invalid value for argument `output_mode`. " + "Expected one of {'int', 'one_hot'}. " + f"Received: output_mode={output_mode}" + ) + self.feature_names = tuple(feature_names) + self.crossing_dim = crossing_dim + self.output_mode = output_mode + + @property + def name(self): + return "_X_".join(self.feature_names) + + def get_config(self): + return { + "feature_names": self.feature_names, + "crossing_dim": self.crossing_dim, + "output_mode": self.output_mode, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +class Feature: + def __init__(self, dtype, preprocessor, output_mode): + if output_mode not in {"int", "one_hot", "float"}: + raise ValueError( + "Invalid value for argument `output_mode`. " + "Expected one of {'int', 'one_hot', 'float'}. " + f"Received: output_mode={output_mode}" + ) + self.dtype = dtype + if isinstance(preprocessor, dict): + preprocessor = serialization_lib.deserialize_keras_object( + preprocessor + ) + self.preprocessor = preprocessor + self.output_mode = output_mode + + def get_config(self): + return { + "dtype": self.dtype, + "preprocessor": serialization_lib.serialize_keras_object( + self.preprocessor + ), + "output_mode": self.output_mode, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +@keras_export("keras.utils.FeatureSpace", v1=[]) +class FeatureSpace(base_layer.Layer): + """One-stop utility for preprocessing and encoding structured data. + + Arguments: + feature_names: Dict mapping the names of your features to their + type specification, e.g. `{"my_feature": "integer_categorical"}` + or `{"my_feature": FeatureSpace.integer_categorical()}`. + For a complete list of all supported types, see + "Available feature types" paragraph below. + output_mode: One of `"concat"` or `"dict"`. In concat mode, all + features get concatenated together into a single vector. + In dict mode, the FeatureSpace returns a dict of individually + encoded features (with the same keys as the input dict keys). + crosses: List of features to be crossed together, e.g. + `crosses=[("feature_1", "feature_2")]`. The features will be + "crossed" by hashing their combined value into + a fixed-length vector. 
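The `Cross` and `Feature` helpers above are lightweight, config-serializable value objects rather than layers; a `Cross` derives its name by joining the crossed feature names with `_X_`. A quick sketch of the round-trip contract they implement (assuming the classes as defined in this file):

```python
cross = Cross(("country", "device"), crossing_dim=64)
assert cross.name == "country_X_device"

# get_config()/from_config() round-trip restores an equivalent object.
config = cross.get_config()
restored = Cross.from_config(config)
assert restored.feature_names == ("country", "device")
assert restored.crossing_dim == 64
assert restored.output_mode == "one_hot"
```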
+ crossing_dim: Default vector size for hashing crossed features. + Defaults to `32`. + hashing_dim: Default vector size for hashing features of type + `"integer_hashed"` and `"string_hashed"`. Defaults to `32`. + num_discretization_bins: Default number of bins to be used for + discretizing features of type `"float_discretized"`. + Defaults to `32`. + + **Available feature types:** + + Note that all features can be referred to by their string name, + e.g. `"integer_categorical"`. When using the string name, the default + argument values are used. + + ```python + # Plain float values. + FeatureSpace.float(name=None) + + # Float values to be preprocessed via featurewise standardization + # (i.e. via a `keras.layers.Normalization` layer). + FeatureSpace.float_normalized(name=None) + + # Float values to be preprocessed via linear rescaling + # (i.e. via a `keras.layers.Rescaling` layer). + FeatureSpace.float_rescaled(scale=1., offset=0., name=None) + + # Float values to be discretized. By default, the discrete + # representation will then be one-hot encoded. + FeatureSpace.float_discretized( + num_bins, bin_boundaries=None, output_mode="one_hot", name=None) + + # Integer values to be indexed. By default, the discrete + # representation will then be one-hot encoded. + FeatureSpace.integer_categorical( + max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None) + + # String values to be indexed. By default, the discrete + # representation will then be one-hot encoded. + FeatureSpace.string_categorical( + max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None) + + # Integer values to be hashed into a fixed number of bins. + # By default, the discrete representation will then be one-hot encoded. + FeatureSpace.integer_hashed(num_bins, output_mode="one_hot", name=None) + + # String values to be hashed into a fixed number of bins. + # By default, the discrete representation will then be one-hot encoded. + FeatureSpace.string_hashed(num_bins, output_mode="one_hot", name=None) + ``` + + Examples: + + **Basic usage with a dict of input data:** + + ```python + raw_data = { + "float_values": [0.0, 0.1, 0.2, 0.3], + "string_values": ["zero", "one", "two", "three"], + "int_values": [0, 1, 2, 3], + } + dataset = tf.data.Dataset.from_tensor_slices(raw_data) + + feature_space = FeatureSpace( + features={ + "float_values": "float_normalized", + "string_values": "string_categorical", + "int_values": "integer_categorical", + }, + crosses=[("string_values", "int_values")], + output_mode="concat", + ) + # Before you start using the FeatureSpace, + # you must `adapt()` it on some data. + feature_space.adapt(dataset) + + # You can call the FeatureSpace on a dict of data (batched or unbatched). 
+ output_vector = feature_space(raw_data) + ``` + + **Basic usage with `tf.data`:** + + ```python + # Unlabeled data + preprocessed_ds = unlabeled_dataset.map(feature_space) + + # Labeled data + preprocessed_ds = labeled_dataset.map(lambda x, y: (feature_space(x), y)) + ``` + + **Basic usage with the Keras Functional API:** + + ```python + # Retrieve a dict of Keras Input objects + inputs = feature_space.get_inputs() + # Retrieve the corresponding encoded Keras tensors + encoded_features = feature_space.get_encoded_features() + # Build a Functional model + outputs = keras.layers.Dense(1, activation="sigmoid")(encoded_features) + model = keras.Model(inputs, outputs) + ``` + + **Customizing each feature or feature cross:** + + ```python + feature_space = FeatureSpace( + features={ + "float_values": FeatureSpace.float_normalized(), + "string_values": FeatureSpace.string_categorical(max_tokens=10), + "int_values": FeatureSpace.integer_categorical(max_tokens=10), + }, + crosses=[ + FeatureSpace.cross(("string_values", "int_values"), crossing_dim=32) + ], + output_mode="concat", + ) + ``` + + **Returning a dict of integer-encoded features:** + + ```python + feature_space = FeatureSpace( + features={ + "string_values": FeatureSpace.string_categorical(output_mode="int"), + "int_values": FeatureSpace.integer_categorical(output_mode="int"), + }, + crosses=[ + FeatureSpace.cross( + feature_names=("string_values", "int_values"), + crossing_dim=32, + output_mode="int", + ) + ], + output_mode="dict", + ) + ``` + + **Specifying your own Keras preprocessing layer:** + + ```python + # Let's say that one of the features is a short text paragraph that + # we want to encode as a vector (one vector per paragraph) via TF-IDF. + data = { + "text": ["1st string", "2nd string", "3rd string"], + } + + # There's a Keras layer for this: TextVectorization. + custom_layer = layers.TextVectorization(output_mode="tf_idf") + + # We can use FeatureSpace.feature to create a custom feature + # that will use our preprocessing layer. + feature_space = FeatureSpace( + features={ + "text": FeatureSpace.feature( + preprocessor=custom_layer, dtype="string", output_mode="float" + ), + }, + output_mode="concat", + ) + feature_space.adapt(tf.data.Dataset.from_tensor_slices(data)) + output_vector = feature_space(data) + ``` + + **Retrieving the underlying Keras preprocessing layers:** + + ```python + # The preprocessing layer of each feature is available in `.preprocessors`. + preprocessing_layer = feature_space.preprocessors["feature1"] + + # The crossing layer of each feature cross is available in `.crossers`. + # It's an instance of keras.layers.HashedCrossing.
+ crossing_layer = feature_space.crossers["feature1_X_feature2"] + ``` + + **Saving and reloading a FeatureSpace:** + + ```python + feature_space.save("myfeaturespace.keras") + reloaded_feature_space = keras.models.load_model("myfeaturespace.keras") + ``` + """ + + @classmethod + def cross(cls, feature_names, crossing_dim, output_mode="one_hot"): + return Cross(feature_names, crossing_dim, output_mode=output_mode) + + @classmethod + def feature(cls, dtype, preprocessor, output_mode): + return Feature(dtype, preprocessor, output_mode) + + @classmethod + def float(cls, name=None): + from keras.layers.core import identity + + name = name or backend.unique_object_name("float") + preprocessor = identity.Identity( + dtype="float32", name=f"{name}_preprocessor" + ) + return Feature( + dtype="float32", preprocessor=preprocessor, output_mode="float" + ) + + @classmethod + def float_rescaled(cls, scale=1.0, offset=0.0, name=None): + name = name or backend.unique_object_name("float_rescaled") + preprocessor = layers.Rescaling( + scale=scale, offset=offset, name=f"{name}_preprocessor" + ) + return Feature( + dtype="float32", preprocessor=preprocessor, output_mode="float" + ) + + @classmethod + def float_normalized(cls, name=None): + name = name or backend.unique_object_name("float_normalized") + preprocessor = layers.Normalization( + axis=-1, name=f"{name}_preprocessor" + ) + return Feature( + dtype="float32", preprocessor=preprocessor, output_mode="float" + ) + + @classmethod + def float_discretized( + cls, num_bins, bin_boundaries=None, output_mode="one_hot", name=None + ): + name = name or backend.unique_object_name("float_discretized") + preprocessor = layers.Discretization( + num_bins=num_bins, + bin_boundaries=bin_boundaries, + name=f"{name}_preprocessor", + ) + return Feature( + dtype="float32", preprocessor=preprocessor, output_mode=output_mode + ) + + @classmethod + def integer_categorical( + cls, + max_tokens=None, + num_oov_indices=1, + output_mode="one_hot", + name=None, + ): + name = name or backend.unique_object_name("integer_categorical") + preprocessor = layers.IntegerLookup( + name=f"{name}_preprocessor", + max_tokens=max_tokens, + num_oov_indices=num_oov_indices, + ) + return Feature( + dtype="int64", preprocessor=preprocessor, output_mode=output_mode + ) + + @classmethod + def string_categorical( + cls, + max_tokens=None, + num_oov_indices=1, + output_mode="one_hot", + name=None, + ): + name = name or backend.unique_object_name("string_categorical") + preprocessor = layers.StringLookup( + name=f"{name}_preprocessor", + max_tokens=max_tokens, + num_oov_indices=num_oov_indices, + ) + return Feature( + dtype="string", preprocessor=preprocessor, output_mode=output_mode + ) + + @classmethod + def string_hashed(cls, num_bins, output_mode="one_hot", name=None): + name = name or backend.unique_object_name("string_hashed") + preprocessor = layers.Hashing( + name=f"{name}_preprocessor", num_bins=num_bins + ) + return Feature( + dtype="string", preprocessor=preprocessor, output_mode=output_mode + ) + + @classmethod + def integer_hashed(cls, num_bins, output_mode="one_hot", name=None): + name = name or backend.unique_object_name("integer_hashed") + preprocessor = layers.Hashing( + name=f"{name}_preprocessor", num_bins=num_bins + ) + return Feature( + dtype="int64", preprocessor=preprocessor, output_mode=output_mode + ) + + def __init__( + self, + features, + output_mode="concat", + crosses=None, + crossing_dim=32, + hashing_dim=32, + num_discretization_bins=32, + ): + if not features: + raise 
ValueError("The `features` argument cannot be None or empty.") + self.crossing_dim = crossing_dim + self.hashing_dim = hashing_dim + self.num_discretization_bins = num_discretization_bins + self.features = { + name: self._standardize_feature(name, value) + for name, value in features.items() + } + self.crosses = [] + if crosses: + feature_set = set(features.keys()) + for cross in crosses: + if isinstance(cross, dict): + cross = serialization_lib.deserialize_keras_object(cross) + if isinstance(cross, Cross): + self.crosses.append(cross) + else: + if not crossing_dim: + raise ValueError( + "When specifying `crosses`, the argument " + "`crossing_dim` " + "(dimensionality of the crossing space) " + "should be specified as well." + ) + for key in cross: + if key not in feature_set: + raise ValueError( + "All features referenced " + "in the `crosses` argument " + "should be present in the `features` dict. " + f"Received unknown features: {cross}" + ) + self.crosses.append(Cross(cross, crossing_dim=crossing_dim)) + self.crosses_by_name = {cross.name: cross for cross in self.crosses} + + if output_mode not in {"dict", "concat"}: + raise ValueError( + "Invalid value for argument `output_mode`. " + "Expected one of {'dict', 'concat'}. " + f"Received: output_mode={output_mode}" + ) + self.output_mode = output_mode + + self.inputs = { + name: self._feature_to_input(name, value) + for name, value in self.features.items() + } + self.preprocessors = { + name: value.preprocessor for name, value in self.features.items() + } + self.encoded_features = None + self.crossers = { + cross.name: self._cross_to_crosser(cross) for cross in self.crosses + } + self.one_hot_encoders = {} + self.built = False + self._is_adapted = False + self.concat = None + self._preprocessed_features_names = None + self._crossed_features_names = None + + def _feature_to_input(self, name, feature): + return layers.Input(shape=(1,), dtype=feature.dtype, name=name) + + def _standardize_feature(self, name, feature): + if isinstance(feature, Feature): + return feature + + if isinstance(feature, dict): + return serialization_lib.deserialize_keras_object(feature) + + if feature == "float": + return self.float(name=name) + elif feature == "float_normalized": + return self.float_normalized(name=name) + elif feature == "float_rescaled": + return self.float_rescaled(name=name) + elif feature == "float_discretized": + return self.float_discretized( + name=name, num_bins=self.num_discretization_bins + ) + elif feature == "integer_categorical": + return self.integer_categorical(name=name) + elif feature == "string_categorical": + return self.string_categorical(name=name) + elif feature == "integer_hashed": + return self.integer_hashed(self.hashing_dim, name=name) + elif feature == "string_hashed": + return self.string_hashed(self.hashing_dim, name=name) + else: + raise ValueError(f"Invalid feature type: {feature}") + + def _cross_to_crosser(self, cross): + return layers.HashedCrossing(cross.crossing_dim, name=cross.name) + + def _list_adaptable_preprocessors(self): + adaptable_preprocessors = [] + for name in self.features.keys(): + preprocessor = self.preprocessors[name] + # Special case: a Normalization layer with preset mean/variance. + # Not adaptable. 
+ if isinstance(preprocessor, layers.Normalization): + if preprocessor.input_mean is not None: + continue + if hasattr(preprocessor, "adapt"): + adaptable_preprocessors.append(name) + return adaptable_preprocessors + + def adapt(self, dataset): + if not isinstance(dataset, tf.data.Dataset): + raise ValueError( + "`adapt()` can only be called on a tf.data.Dataset. " + f"Received instead: {dataset} (of type {type(dataset)})" + ) + + for name in self._list_adaptable_preprocessors(): + # Call adapt() on each individual adaptable layer. + + # TODO: consider rewriting this to instead iterate on the + # dataset once, split each batch into individual features, + # and call the layer's `_adapt_function` on each batch + # to simulate the behavior of adapt() in a more performant fashion. + + feature_dataset = dataset.map(lambda x: x[name]) + preprocessor = self.preprocessors[name] + # TODO: consider adding an adapt progress bar. + # Sample 1 element to check the rank + for x in feature_dataset.take(1): + pass + if x.shape.rank == 0: + # The dataset yields unbatched scalars; batch it. + feature_dataset = feature_dataset.batch(32) + if x.shape.rank in {0, 1}: + # If the rank is 1, add a dimension + # so we can reduce on axis=-1. + # Note: if rank was previously 0, it is now 1. + feature_dataset = feature_dataset.map( + lambda x: tf.expand_dims(x, -1) + ) + preprocessor.adapt(feature_dataset) + self._is_adapted = True + self.get_encoded_features() # Finish building the layer + self.built = True + + def get_inputs(self): + self._check_if_built() + return self.inputs + + def get_encoded_features(self): + self._check_if_adapted() + + if self.encoded_features is None: + preprocessed_features = self._preprocess_features(self.inputs) + crossed_features = self._cross_features(preprocessed_features) + merged_features = self._merge_features( + preprocessed_features, crossed_features + ) + self.encoded_features = merged_features + return self.encoded_features + + def _preprocess_features(self, features): + return { + name: self.preprocessors[name](features[name]) + for name in features.keys() + } + + def _cross_features(self, features): + all_outputs = {} + for cross in self.crosses: + inputs = [features[name] for name in cross.feature_names] + outputs = self.crossers[cross.name](inputs) + all_outputs[cross.name] = outputs + return all_outputs + + def _merge_features(self, preprocessed_features, crossed_features): + if not self._preprocessed_features_names: + self._preprocessed_features_names = sorted( + preprocessed_features.keys() + ) + self._crossed_features_names = sorted(crossed_features.keys()) + + all_names = ( + self._preprocessed_features_names + self._crossed_features_names + ) + all_features = [ + preprocessed_features[name] + for name in self._preprocessed_features_names + ] + [crossed_features[name] for name in self._crossed_features_names] + + if self.output_mode == "dict": + output_dict = {} + else: + features_to_concat = [] + + if self.built: + # Fast mode. 
+ for name, feature in zip(all_names, all_features): + encoder = self.one_hot_encoders.get(name, None) + if encoder: + feature = encoder(feature) + if self.output_mode == "dict": + output_dict[name] = feature + else: + features_to_concat.append(feature) + if self.output_mode == "dict": + return output_dict + else: + return self.concat(features_to_concat) + + # If the object isn't built, + # we create the encoder and concat layers below + all_specs = [ + self.features[name] for name in self._preprocessed_features_names + ] + [ + self.crosses_by_name[name] for name in self._crossed_features_names + ] + for name, feature, spec in zip(all_names, all_features, all_specs): + dtype = feature.dtype.name + + if spec.output_mode == "one_hot": + preprocessor = self.preprocessors.get( + name + ) or self.crossers.get(name) + cardinality = None + if not feature.dtype.name.startswith("int"): + raise ValueError( + f"Feature '{name}' has `output_mode='one_hot'`. " + "Thus its preprocessor should return an int64 dtype. " + f"Instead it returns a {dtype} dtype." + ) + + if isinstance( + preprocessor, (layers.IntegerLookup, layers.StringLookup) + ): + cardinality = preprocessor.vocabulary_size() + elif isinstance(preprocessor, layers.CategoryEncoding): + cardinality = preprocessor.num_tokens + elif isinstance(preprocessor, layers.Discretization): + cardinality = preprocessor.num_bins + elif isinstance( + preprocessor, (layers.HashedCrossing, layers.Hashing) + ): + cardinality = preprocessor.num_bins + else: + raise ValueError( + f"Feature '{name}' has `output_mode='one_hot'`. " + "However it isn't a standard feature and the " + "dimensionality of its output space is not known, " + "thus it cannot be one-hot encoded. " + "Try using `output_mode='int'`." + ) + if cardinality is not None: + encoder = layers.CategoryEncoding( + num_tokens=cardinality, output_mode="multi_hot" + ) + self.one_hot_encoders[name] = encoder + feature = encoder(feature) + + if self.output_mode == "concat": + dtype = feature.dtype.name + if dtype.startswith("int") or dtype == "string": + raise ValueError( + f"Cannot concatenate features because feature '{name}' " + f"has not been encoded (it has dtype {dtype}). " + "Consider using `output_mode='dict'`." + ) + features_to_concat.append(feature) + else: + output_dict[name] = feature + + if self.output_mode == "concat": + self.concat = layers.Concatenate(axis=-1) + return self.concat(features_to_concat) + else: + return output_dict + + def _check_if_adapted(self): + if not self._is_adapted: + if not self._list_adaptable_preprocessors(): + self._is_adapted = True + else: + raise ValueError( + "You need to call `.adapt(dataset)` on the FeatureSpace " + "before you can start using it." + ) + + def _check_if_built(self): + if not self.built: + self._check_if_adapted() + # Finishes building + self.get_encoded_features() + self.built = True + + def __call__(self, data): + self._check_if_built() + if not isinstance(data, dict): + raise ValueError( + "A FeatureSpace can only be called with a dict. 
" + f"Received: data={data} (of type {type(data)}" + ) + + data = {key: tf.convert_to_tensor(value) for key, value in data.items()} + rebatched = False + for name, x in data.items(): + if x.shape.rank == 0: + data[name] = tf.reshape(x, [1, 1]) + rebatched = True + elif x.shape.rank == 1: + data[name] = tf.expand_dims(x, -1) + + preprocessed_data = self._preprocess_features(data) + crossed_data = self._cross_features(preprocessed_data) + merged_data = self._merge_features(preprocessed_data, crossed_data) + if rebatched: + if self.output_mode == "concat": + assert merged_data.shape[0] == 1 + return tf.squeeze(merged_data, axis=0) + else: + for name, x in merged_data.items(): + if x.shape.rank == 2 and x.shape[0] == 1: + merged_data[name] = tf.squeeze(x, axis=0) + return merged_data + + def get_config(self): + return { + "features": serialization_lib.serialize_keras_object(self.features), + "output_mode": self.output_mode, + "crosses": serialization_lib.serialize_keras_object(self.crosses), + "crossing_dim": self.crossing_dim, + "hashing_dim": self.hashing_dim, + "num_discretization_bins": self.num_discretization_bins, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + def get_build_config(self): + return { + name: feature.preprocessor.get_build_config() + for name, feature in self.features.items() + } + + def build_from_config(self, config): + for name in config.keys(): + self.features[name].preprocessor.build_from_config(config[name]) + self._is_adapted = True + + def save(self, filepath): + """Save the `FeatureSpace` instance to a `.keras` file. + + You can reload it via `keras.models.load_model()`: + + ```python + feature_space.save("myfeaturespace.keras") + reloaded_feature_space = keras.models.load_model("myfeaturespace.keras") + ``` + """ + saving_lib.save_model(self, filepath) + + def save_own_variables(self, store): + return + + def load_own_variables(self, store): + return diff --git a/keras/utils/feature_space_test.py b/keras/utils/feature_space_test.py new file mode 100644 index 000000000000..ee3a8770290c --- /dev/null +++ b/keras/utils/feature_space_test.py @@ -0,0 +1,400 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for FeatureSpace utility.""" + +import os + +import tensorflow.compat.v2 as tf + +import keras +from keras import layers +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import feature_space + + +@test_utils.run_v2_only +class FeatureSpaceTest(test_combinations.TestCase): + def _get_train_data_dict( + self, as_dataset=False, as_tf_tensors=False, as_labeled_dataset=False + ): + data = { + "float_1": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], + "float_2": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], + "float_3": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], + "string_1": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], + "string_2": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], + "int_1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + "int_2": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + "int_3": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + } + if as_dataset: + return tf.data.Dataset.from_tensor_slices(data) + elif as_tf_tensors: + return tf.nest.map_structure(tf.convert_to_tensor, data) + elif as_labeled_dataset: + labels = [0, 1, 0, 1, 0, 0, 1, 0, 1, 1] + return tf.data.Dataset.from_tensor_slices((data, labels)) + return data + + def test_basic_usage(self): + fs = feature_space.FeatureSpace( + features={ + "float_1": "float", + "float_2": "float_normalized", + "float_3": "float_discretized", + "string_1": "string_categorical", + "string_2": "string_hashed", + "int_1": "integer_categorical", + "int_2": "integer_hashed", + "int_3": "integer_categorical", + }, + crosses=[("float_3", "string_1"), ("string_2", "int_2")], + output_mode="concat", + ) + # Test unbatched adapt + fs.adapt(self._get_train_data_dict(as_dataset=True)) + # Test batched adapt + fs.adapt(self._get_train_data_dict(as_dataset=True).batch(4)) + + # Test unbatched call on raw data + data = { + key: value[0] for key, value in self._get_train_data_dict().items() + } + out = fs(data) + self.assertEqual(out.shape.as_list(), [195]) + + # Test unbatched call on TF tensors + data = self._get_train_data_dict(as_tf_tensors=True) + data = {key: value[0] for key, value in data.items()} + out = fs(data) + self.assertEqual(out.shape.as_list(), [195]) + + # Test batched call on raw data + out = fs(self._get_train_data_dict()) + self.assertEqual(out.shape.as_list(), [10, 195]) + + # Test batched call on TF tensors + out = fs(self._get_train_data_dict(as_tf_tensors=True)) + self.assertEqual(out.shape.as_list(), [10, 195]) + + def test_output_mode_dict(self): + fs = feature_space.FeatureSpace( + features={ + "float_1": "float", + "float_2": "float_normalized", + "float_3": "float_discretized", + "string_1": "string_categorical", + "string_2": "string_hashed", + "int_1": "integer_categorical", + "int_2": "integer_hashed", + "int_3": "integer_categorical", + }, + crosses=[("float_3", "string_1"), ("string_2", "int_2")], + output_mode="dict", + ) + fs.adapt(self._get_train_data_dict(as_dataset=True)) + + # Test unbatched call on raw data + data = { + key: value[0] for key, value in self._get_train_data_dict().items() + } + out = fs(data) + self.assertIsInstance(out, dict) + self.assertLen(out, 10) + self.assertEqual(out["string_1"].shape.as_list(), [11]) + self.assertEqual(out["int_2"].shape.as_list(), [32]) + self.assertEqual(out["string_2_X_int_2"].shape.as_list(), [32]) + + # Test batched call on raw data + out = fs(self._get_train_data_dict()) + self.assertIsInstance(out, dict) + 
self.assertLen(out, 10) + self.assertEqual(out["string_1"].shape.as_list(), [10, 11]) + self.assertEqual(out["int_2"].shape.as_list(), [10, 32]) + self.assertEqual(out["string_2_X_int_2"].shape.as_list(), [10, 32]) + + # Test batched call on TF tensors + out = fs(self._get_train_data_dict(as_tf_tensors=True)) + self.assertIsInstance(out, dict) + self.assertLen(out, 10) + self.assertEqual(out["string_1"].shape.as_list(), [10, 11]) + self.assertEqual(out["int_2"].shape.as_list(), [10, 32]) + self.assertEqual(out["string_2_X_int_2"].shape.as_list(), [10, 32]) + + def test_output_mode_dict_of_ints(self): + cls = feature_space.FeatureSpace + fs = feature_space.FeatureSpace( + features={ + "float_1": "float", + "float_2": "float_normalized", + "float_3": "float_discretized", + "string_1": cls.string_categorical(output_mode="int"), + "string_2": cls.string_hashed(num_bins=32, output_mode="int"), + "int_1": cls.integer_categorical(output_mode="int"), + "int_2": cls.integer_hashed(num_bins=32, output_mode="int"), + "int_3": cls.integer_categorical(output_mode="int"), + }, + crosses=[ + cls.cross( + ("float_3", "string_1"), output_mode="int", crossing_dim=32 + ), + cls.cross( + ("string_2", "int_2"), output_mode="int", crossing_dim=32 + ), + ], + output_mode="dict", + ) + fs.adapt(self._get_train_data_dict(as_dataset=True)) + data = { + key: value[0] for key, value in self._get_train_data_dict().items() + } + out = fs(data) + self.assertIsInstance(out, dict) + self.assertLen(out, 10) + self.assertEqual(out["string_1"].shape.as_list(), [1]) + self.assertEqual(out["string_1"].dtype.name, "int64") + self.assertEqual(out["int_2"].shape.as_list(), [1]) + self.assertEqual(out["int_2"].dtype.name, "int64") + self.assertEqual(out["string_2_X_int_2"].shape.as_list(), [1]) + self.assertEqual(out["string_2_X_int_2"].dtype.name, "int64") + + def test_functional_api_sync_processing(self): + fs = feature_space.FeatureSpace( + features={ + "float_1": "float", + "float_2": "float_normalized", + "float_3": "float_discretized", + "string_1": "string_categorical", + "string_2": "string_hashed", + "int_1": "integer_categorical", + "int_2": "integer_hashed", + "int_3": "integer_categorical", + }, + crosses=[("float_3", "string_1"), ("string_2", "int_2")], + output_mode="concat", + ) + fs.adapt(self._get_train_data_dict(as_dataset=True)) + inputs = fs.get_inputs() + features = fs.get_encoded_features() + outputs = layers.Dense(1)(features) + model = keras.Model(inputs=inputs, outputs=outputs) + model.compile("adam", "mse") + ds = self._get_train_data_dict(as_labeled_dataset=True) + model.fit(ds.batch(4)) + model.evaluate(ds.batch(4)) + ds = self._get_train_data_dict(as_dataset=True) + model.predict(ds.batch(4)) + + def test_tf_data_async_processing(self): + fs = feature_space.FeatureSpace( + features={ + "float_1": "float", + "float_2": "float_normalized", + "float_3": "float_discretized", + "string_1": "string_categorical", + "string_2": "string_hashed", + "int_1": "integer_categorical", + "int_2": "integer_hashed", + "int_3": "integer_categorical", + }, + crosses=[("float_3", "string_1"), ("string_2", "int_2")], + output_mode="concat", + ) + fs.adapt(self._get_train_data_dict(as_dataset=True)) + features = fs.get_encoded_features() + outputs = layers.Dense(1)(features) + model = keras.Model(inputs=features, outputs=outputs) + model.compile("adam", "mse") + ds = self._get_train_data_dict(as_labeled_dataset=True) + # Try map before batch + ds = ds.map(lambda x, y: (fs(x), y)) + model.fit(ds.batch(4)) + # Try map after 
batch + ds = self._get_train_data_dict(as_labeled_dataset=True) + ds = ds.batch(4) + ds = ds.map(lambda x, y: (fs(x), y)) + model.evaluate(ds) + ds = self._get_train_data_dict(as_dataset=True) + ds = ds.map(fs) + model.predict(ds.batch(4)) + + def test_advanced_usage(self): + cls = feature_space.FeatureSpace + fs = feature_space.FeatureSpace( + features={ + "float_1": cls.float(), + "float_2": cls.float_normalized(), + "float_3": cls.float_discretized(num_bins=3), + "string_1": cls.string_categorical(max_tokens=5), + "string_2": cls.string_hashed(num_bins=32), + "int_1": cls.integer_categorical( + max_tokens=5, num_oov_indices=2 + ), + "int_2": cls.integer_hashed(num_bins=32), + "int_3": cls.integer_categorical(max_tokens=5), + }, + crosses=[ + cls.cross(("float_3", "string_1"), crossing_dim=32), + cls.cross(("string_2", "int_2"), crossing_dim=32), + ], + output_mode="concat", + ) + fs.adapt(self._get_train_data_dict(as_dataset=True)) + data = { + key: value[0] for key, value in self._get_train_data_dict().items() + } + out = fs(data) + self.assertEqual(out.shape.as_list(), [148]) + + def test_manual_kpl(self): + data = { + "text": ["1st string", "2nd string", "3rd string"], + } + cls = feature_space.FeatureSpace + + # Test with a tf-idf TextVectorization layer + tv = layers.TextVectorization(output_mode="tf_idf") + fs = feature_space.FeatureSpace( + features={ + "text": cls.feature( + preprocessor=tv, dtype="string", output_mode="float" + ), + }, + output_mode="concat", + ) + fs.adapt(tf.data.Dataset.from_tensor_slices(data)) + out = fs(data) + self.assertEqual(out.shape.as_list(), [3, 5]) + + def test_no_adapt(self): + data = { + "int_1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + } + fs = feature_space.FeatureSpace( + { + "int_1": "integer_hashed", + }, + output_mode="concat", + ) + out = fs(data) + self.assertEqual(out.shape.as_list(), [10, 32]) + + def test_saving(self): + cls = feature_space.FeatureSpace + fs = feature_space.FeatureSpace( + features={ + "float_1": cls.float(), + "float_2": cls.float_normalized(), + "float_3": cls.float_discretized(num_bins=3), + "string_1": cls.string_categorical(max_tokens=5), + "string_2": cls.string_hashed(num_bins=32), + "int_1": cls.integer_categorical( + max_tokens=5, num_oov_indices=2 + ), + "int_2": cls.integer_hashed(num_bins=32), + "int_3": cls.integer_categorical(max_tokens=5), + }, + crosses=[ + cls.cross(("float_3", "string_1"), crossing_dim=32), + cls.cross(("string_2", "int_2"), crossing_dim=32), + ], + output_mode="concat", + ) + fs.adapt(self._get_train_data_dict(as_dataset=True)) + data = { + key: value[0] for key, value in self._get_train_data_dict().items() + } + ref_out = fs(data) + + temp_filepath = os.path.join(self.get_temp_dir(), "fs.keras") + fs.save(temp_filepath) + fs = keras.models.load_model(temp_filepath) + + # Save again immediately after loading to test idempotency + temp_filepath = os.path.join(self.get_temp_dir(), "fs2.keras") + fs.save(temp_filepath) + + # Test correctness of the first saved FS + out = fs(data) + self.assertAllClose(out, ref_out) + + inputs = fs.get_inputs() + outputs = fs.get_encoded_features() + model = keras.Model(inputs=inputs, outputs=outputs) + ds = self._get_train_data_dict(as_dataset=True) + out = model.predict(ds.batch(4)) + self.assertAllClose(out[0], ref_out) + + # Test correctness of the re-saved FS + fs = keras.models.load_model(temp_filepath) + out = fs(data) + self.assertAllClose(out, ref_out) + + def test_errors(self): + # Test no features + with self.assertRaisesRegex(ValueError, "cannot be 
None or empty"): + feature_space.FeatureSpace(features={}) + # Test no crossing dim + with self.assertRaisesRegex(ValueError, "`crossing_dim`"): + feature_space.FeatureSpace( + features={ + "f1": "integer_categorical", + "f2": "integer_categorical", + }, + crosses=[("f1", "f2")], + crossing_dim=None, + ) + # Test wrong cross feature name + with self.assertRaisesRegex(ValueError, "should be present in "): + feature_space.FeatureSpace( + features={ + "f1": "integer_categorical", + "f2": "integer_categorical", + }, + crosses=[("f1", "unknown")], + crossing_dim=32, + ) + # Test wrong output mode + with self.assertRaisesRegex(ValueError, "for argument `output_mode`"): + feature_space.FeatureSpace( + features={ + "f1": "integer_categorical", + "f2": "integer_categorical", + }, + output_mode="unknown", + ) + # Test call before adapt + with self.assertRaisesRegex(ValueError, "You need to call `.adapt"): + fs = feature_space.FeatureSpace( + features={ + "f1": "integer_categorical", + "f2": "integer_categorical", + } + ) + fs({"f1": [0], "f2": [0]}) + # Test get_encoded_features before adapt + with self.assertRaisesRegex(ValueError, "You need to call `.adapt"): + fs = feature_space.FeatureSpace( + features={ + "f1": "integer_categorical", + "f2": "integer_categorical", + } + ) + fs.get_encoded_features() + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/generic_utils.py b/keras/utils/generic_utils.py index 44d2a95e9e08..ba58673eec43 100644 --- a/keras/utils/generic_utils.py +++ b/keras/utils/generic_utils.py @@ -14,8 +14,6 @@ # ============================================================================== """Python utilities required by Keras.""" -import tensorflow.compat.v2 as tf - import binascii import codecs import importlib @@ -23,1222 +21,537 @@ import os import re import sys -import threading import time import types as python_types -import warnings -import weakref import numpy as np +import tensorflow.compat.v2 as tf from keras.utils import io_utils -from keras.utils import tf_contextlib from keras.utils import tf_inspect -from tensorflow.python.util.tf_export import keras_export - -_GLOBAL_CUSTOM_OBJECTS = {} -_GLOBAL_CUSTOM_NAMES = {} - -# Flag that determines whether to skip the NotImplementedError when calling -# get_config in custom models and layers. This is only enabled when saving to -# SavedModel, when the config isn't required. -_SKIP_FAILED_SERIALIZATION = False -# If a layer does not have a defined config, then the returned config will be a -# dictionary with the below key. -_LAYER_UNDEFINED_CONFIG_KEY = 'layer was saved without config' - - -@keras_export('keras.utils.custom_object_scope', # pylint: disable=g-classes-have-attributes - 'keras.utils.CustomObjectScope') -class CustomObjectScope: - """Exposes custom classes/functions to Keras deserialization internals. - - Under a scope `with custom_object_scope(objects_dict)`, Keras methods such - as `tf.keras.models.load_model` or `tf.keras.models.model_from_config` - will be able to deserialize any custom object referenced by a - saved config (e.g. a custom layer or metric). - - Example: - - Consider a custom regularizer `my_regularizer`: - - ```python - layer = Dense(3, kernel_regularizer=my_regularizer) - config = layer.get_config() # Config contains a reference to `my_regularizer` - ... - # Later: - with custom_object_scope({'my_regularizer': my_regularizer}): - layer = Dense.from_config(config) - ``` - - Args: - *args: Dictionary or dictionaries of `{name: object}` pairs. 
- """ - - def __init__(self, *args): - self.custom_objects = args - self.backup = None - - def __enter__(self): - self.backup = _GLOBAL_CUSTOM_OBJECTS.copy() - for objects in self.custom_objects: - _GLOBAL_CUSTOM_OBJECTS.update(objects) - return self - - def __exit__(self, *args, **kwargs): - _GLOBAL_CUSTOM_OBJECTS.clear() - _GLOBAL_CUSTOM_OBJECTS.update(self.backup) - - -@keras_export('keras.utils.get_custom_objects') -def get_custom_objects(): - """Retrieves a live reference to the global dictionary of custom objects. - - Updating and clearing custom objects using `custom_object_scope` - is preferred, but `get_custom_objects` can - be used to directly access the current collection of custom objects. - - Example: - - ```python - get_custom_objects().clear() - get_custom_objects()['MyObject'] = MyObject - ``` - - Returns: - Global dictionary of names to classes (`_GLOBAL_CUSTOM_OBJECTS`). - """ - return _GLOBAL_CUSTOM_OBJECTS - - -# Store a unique, per-object ID for shared objects. -# -# We store a unique ID for each object so that we may, at loading time, -# re-create the network properly. Without this ID, we would have no way of -# determining whether a config is a description of a new object that -# should be created or is merely a reference to an already-created object. -SHARED_OBJECT_KEY = 'shared_object_id' - - -SHARED_OBJECT_DISABLED = threading.local() -SHARED_OBJECT_LOADING = threading.local() -SHARED_OBJECT_SAVING = threading.local() +# isort: off +from tensorflow.python.util.tf_export import keras_export -# Attributes on the threadlocal variable must be set per-thread, thus we -# cannot initialize these globally. Instead, we have accessor functions with -# default values. -def _shared_object_disabled(): - """Get whether shared object handling is disabled in a threadsafe manner.""" - return getattr(SHARED_OBJECT_DISABLED, 'disabled', False) - - -def _shared_object_loading_scope(): - """Get the current shared object saving scope in a threadsafe manner.""" - return getattr(SHARED_OBJECT_LOADING, 'scope', NoopLoadingScope()) - - -def _shared_object_saving_scope(): - """Get the current shared object saving scope in a threadsafe manner.""" - return getattr(SHARED_OBJECT_SAVING, 'scope', None) - - -class DisableSharedObjectScope: - """A context manager for disabling handling of shared objects. - - Disables shared object handling for both saving and loading. - - Created primarily for use with `clone_model`, which does extra surgery that - is incompatible with shared objects. - """ - - def __enter__(self): - SHARED_OBJECT_DISABLED.disabled = True - self._orig_loading_scope = _shared_object_loading_scope() - self._orig_saving_scope = _shared_object_saving_scope() - - def __exit__(self, *args, **kwargs): - SHARED_OBJECT_DISABLED.disabled = False - SHARED_OBJECT_LOADING.scope = self._orig_loading_scope - SHARED_OBJECT_SAVING.scope = self._orig_saving_scope - - -class NoopLoadingScope: - """The default shared object loading scope. It does nothing. - - Created to simplify serialization code that doesn't care about shared objects - (e.g. when serializing a single object). - """ - - def get(self, unused_object_id): - return None - - def set(self, object_id, obj): - pass - - -class SharedObjectLoadingScope: - """A context manager for keeping track of loaded objects. - - During the deserialization process, we may come across objects that are - shared across multiple layers. 
In order to accurately restore the network - structure to its original state, `SharedObjectLoadingScope` allows us to - re-use shared objects rather than cloning them. - """ - - def __enter__(self): - if _shared_object_disabled(): - return NoopLoadingScope() - - global SHARED_OBJECT_LOADING - SHARED_OBJECT_LOADING.scope = self - self._obj_ids_to_obj = {} - return self - def get(self, object_id): - """Given a shared object ID, returns a previously instantiated object. +def func_dump(func): + """Serializes a user defined function. Args: - object_id: shared object ID to use when attempting to find already-loaded - object. + func: the function to serialize. Returns: - The object, if we've seen this ID before. Else, `None`. + A tuple `(code, defaults, closure)`. """ - # Explicitly check for `None` internally to make external calling code a - # bit cleaner. - if object_id is None: - return - return self._obj_ids_to_obj.get(object_id) - - def set(self, object_id, obj): - """Stores an instantiated object for future lookup and sharing.""" - if object_id is None: - return - self._obj_ids_to_obj[object_id] = obj - - def __exit__(self, *args, **kwargs): - global SHARED_OBJECT_LOADING - SHARED_OBJECT_LOADING.scope = NoopLoadingScope() - - -class SharedObjectConfig(dict): - """A configuration container that keeps track of references. - - `SharedObjectConfig` will automatically attach a shared object ID to any - configs which are referenced more than once, allowing for proper shared - object reconstruction at load time. - - In most cases, it would be more proper to subclass something like - `collections.UserDict` or `collections.Mapping` rather than `dict` directly. - Unfortunately, python's json encoder does not support `Mapping`s. This is - important functionality to retain, since we are dealing with serialization. - - We should be safe to subclass `dict` here, since we aren't actually - overriding any core methods, only augmenting with a new one for reference - counting. - """ - - def __init__(self, base_config, object_id, **kwargs): - self.ref_count = 1 - self.object_id = object_id - super().__init__(base_config, **kwargs) - - def increment_ref_count(self): - # As soon as we've seen the object more than once, we want to attach the - # shared object ID. This allows us to only attach the shared object ID when - # it's strictly necessary, making backwards compatibility breakage less - # likely. - if self.ref_count == 1: - self[SHARED_OBJECT_KEY] = self.object_id - self.ref_count += 1 - - -class SharedObjectSavingScope: - """Keeps track of shared object configs when serializing.""" - - def __enter__(self): - if _shared_object_disabled(): - return None - - global SHARED_OBJECT_SAVING - - # Serialization can happen at a number of layers for a number of reasons. - # We may end up with a case where we're opening a saving scope within - # another saving scope. In that case, we'd like to use the outermost scope - # available and ignore inner scopes, since there is not (yet) a reasonable - # use case for having these nested and distinct. 
- if _shared_object_saving_scope() is not None: - self._passthrough = True - return _shared_object_saving_scope() + if os.name == "nt": + raw_code = marshal.dumps(func.__code__).replace(b"\\", b"/") + code = codecs.encode(raw_code, "base64").decode("ascii") + else: + raw_code = marshal.dumps(func.__code__) + code = codecs.encode(raw_code, "base64").decode("ascii") + defaults = func.__defaults__ + if func.__closure__: + closure = tuple(c.cell_contents for c in func.__closure__) else: - self._passthrough = False + closure = None + return code, defaults, closure - SHARED_OBJECT_SAVING.scope = self - self._shared_objects_config = weakref.WeakKeyDictionary() - self._next_id = 0 - return self - def get_config(self, obj): - """Gets a `SharedObjectConfig` if one has already been seen for `obj`. +def func_load(code, defaults=None, closure=None, globs=None): + """Deserializes a user defined function. Args: - obj: The object for which to retrieve the `SharedObjectConfig`. + code: bytecode of the function. + defaults: defaults of the function. + closure: closure of the function. + globs: dictionary of global objects. Returns: - The SharedObjectConfig for a given object, if already seen. Else, - `None`. + A function object. """ - try: - shared_object_config = self._shared_objects_config[obj] - except (TypeError, KeyError): - # If the object is unhashable (e.g. a subclass of `AbstractBaseClass` - # that has not overridden `__hash__`), a `TypeError` will be thrown. - # We'll just continue on without shared object support. - return None - shared_object_config.increment_ref_count() - return shared_object_config - - def create_config(self, base_config, obj): - """Create a new SharedObjectConfig for a given object.""" - shared_object_config = SharedObjectConfig(base_config, self._next_id) - self._next_id += 1 - try: - self._shared_objects_config[obj] = shared_object_config - except TypeError: - # If the object is unhashable (e.g. a subclass of `AbstractBaseClass` - # that has not overridden `__hash__`), a `TypeError` will be thrown. - # We'll just continue on without shared object support. - pass - return shared_object_config - - def __exit__(self, *args, **kwargs): - if not getattr(self, '_passthrough', False): - global SHARED_OBJECT_SAVING - SHARED_OBJECT_SAVING.scope = None - - -def serialize_keras_class_and_config( - cls_name, cls_config, obj=None, shared_object_id=None): - """Returns the serialization of the class with the given config.""" - base_config = {'class_name': cls_name, 'config': cls_config} - - # We call `serialize_keras_class_and_config` for some branches of the load - # path. In that case, we may already have a shared object ID we'd like to - # retain. - if shared_object_id is not None: - base_config[SHARED_OBJECT_KEY] = shared_object_id - - # If we have an active `SharedObjectSavingScope`, check whether we've already - # serialized this config. If so, just use that config. This will store an - # extra ID field in the config, allowing us to re-create the shared object - # relationship at load time. - if _shared_object_saving_scope() is not None and obj is not None: - shared_object_config = _shared_object_saving_scope().get_config(obj) - if shared_object_config is None: - return _shared_object_saving_scope().create_config(base_config, obj) - return shared_object_config - - return base_config - - -@keras_export('keras.utils.register_keras_serializable') -def register_keras_serializable(package='Custom', name=None): - """Registers an object with the Keras serialization framework. 
- - This decorator injects the decorated class or function into the Keras custom - object dictionary, so that it can be serialized and deserialized without - needing an entry in the user-provided custom object dict. It also injects a - function that Keras will call to get the object's serializable string key. - - Note that to be serialized and deserialized, classes must implement the - `get_config()` method. Functions do not have this requirement. - - The object will be registered under the key 'package>name' where `name`, - defaults to the object name if not passed. - - Example: - - ```python - # Note that `'my_package'` is used as the `package` argument here, and since - # the `name` argument is not provided, `'MyDense'` is used as the `name`. - @keras.utils.register_keras_serializable('my_package') - class MyDense(keras.layers.Dense): - pass - - assert keras.utils.get_registered_object('my_package>MyDense') == MyDense - assert keras.utils.get_registered_name(MyDense) == 'my_package>MyDense' - ``` - - Args: - package: The package that this class belongs to. This is used for the `key` - (which is 'package>name') to idenfify the class. Note that this is the - first argument passed into the decorator. - name: The name to serialize this class under in this package. If not - provided or `None`, the class' name will be used (note that this is the - case when the decorator is used with only one argument, which becomes the - `package`). - - Returns: - A decorator that registers the decorated class with the passed names. - """ - - def decorator(arg): - """Registers a class with the Keras serialization framework.""" - class_name = name if name is not None else arg.__name__ - registered_name = package + '>' + class_name - - if tf_inspect.isclass(arg) and not hasattr(arg, 'get_config'): - raise ValueError( - 'Cannot register a class that does not have a get_config() method.') - - if registered_name in _GLOBAL_CUSTOM_OBJECTS: - raise ValueError( - f'{registered_name} has already been registered to ' - f'{_GLOBAL_CUSTOM_OBJECTS[registered_name]}') - - if arg in _GLOBAL_CUSTOM_NAMES: - raise ValueError( - f'{arg} has already been registered to {_GLOBAL_CUSTOM_NAMES[arg]}') - _GLOBAL_CUSTOM_OBJECTS[registered_name] = arg - _GLOBAL_CUSTOM_NAMES[arg] = registered_name - - return arg - - return decorator - - -@keras_export('keras.utils.get_registered_name') -def get_registered_name(obj): - """Returns the name registered to an object within the Keras framework. - - This function is part of the Keras serialization and deserialization - framework. It maps objects to the string names associated with those objects - for serialization/deserialization. - - Args: - obj: The object to look up. - - Returns: - The name associated with the object, or the default Python name if the - object is not registered. - """ - if obj in _GLOBAL_CUSTOM_NAMES: - return _GLOBAL_CUSTOM_NAMES[obj] - else: - return obj.__name__ - - -@tf_contextlib.contextmanager -def skip_failed_serialization(): - global _SKIP_FAILED_SERIALIZATION - prev = _SKIP_FAILED_SERIALIZATION - try: - _SKIP_FAILED_SERIALIZATION = True - yield - finally: - _SKIP_FAILED_SERIALIZATION = prev - - -@keras_export('keras.utils.get_registered_object') -def get_registered_object(name, custom_objects=None, module_objects=None): - """Returns the class associated with `name` if it is registered with Keras. - - This function is part of the Keras serialization and deserialization - framework. 
It maps strings to the objects associated with them for - serialization/deserialization. - - Example: - ``` - def from_config(cls, config, custom_objects=None): - if 'my_custom_object_name' in config: - config['hidden_cls'] = tf.keras.utils.get_registered_object( - config['my_custom_object_name'], custom_objects=custom_objects) - ``` - - Args: - name: The name to look up. - custom_objects: A dictionary of custom objects to look the name up in. - Generally, custom_objects is provided by the user. - module_objects: A dictionary of custom objects to look the name up in. - Generally, module_objects is provided by midlevel library implementers. - - Returns: - An instantiable class associated with 'name', or None if no such class - exists. - """ - if name in _GLOBAL_CUSTOM_OBJECTS: - return _GLOBAL_CUSTOM_OBJECTS[name] - elif custom_objects and name in custom_objects: - return custom_objects[name] - elif module_objects and name in module_objects: - return module_objects[name] - return None - - -# pylint: disable=g-bad-exception-name -class CustomMaskWarning(Warning): - pass -# pylint: enable=g-bad-exception-name - - -@keras_export('keras.utils.serialize_keras_object') -def serialize_keras_object(instance): - """Serialize a Keras object into a JSON-compatible representation. - - Calls to `serialize_keras_object` while underneath the - `SharedObjectSavingScope` context manager will cause any objects re-used - across multiple layers to be saved with a special shared object ID. This - allows the network to be re-created properly during deserialization. - - Args: - instance: The object to serialize. - - Returns: - A dict-like, JSON-compatible representation of the object's config. - """ - _, instance = tf.__internal__.decorator.unwrap(instance) - if instance is None: - return None - - # pylint: disable=protected-access - # - # For v1 layers, checking supports_masking is not enough. We have to also - # check whether compute_mask has been overridden. - supports_masking = (getattr(instance, 'supports_masking', False) - or (hasattr(instance, 'compute_mask') - and not is_default(instance.compute_mask))) - if supports_masking and is_default(instance.get_config): - warnings.warn( - 'Custom mask layers require a config and must override ' - 'get_config. When loading, the custom mask layer must be ' - 'passed to the custom_objects argument.', - category=CustomMaskWarning, - stacklevel=2) - # pylint: enable=protected-access - - if hasattr(instance, 'get_config'): - name = get_registered_name(instance.__class__) - try: - config = instance.get_config() - except NotImplementedError as e: - if _SKIP_FAILED_SERIALIZATION: - return serialize_keras_class_and_config( - name, {_LAYER_UNDEFINED_CONFIG_KEY: True}) - raise e - serialization_config = {} - for key, item in config.items(): - if isinstance(item, str): - serialization_config[key] = item - continue - - # Any object of a different type needs to be converted to string or dict - # for serialization (e.g. 
custom functions, custom classes) - try: - serialized_item = serialize_keras_object(item) - if isinstance(serialized_item, dict) and not isinstance(item, dict): - serialized_item['__passive_serialization__'] = True - serialization_config[key] = serialized_item - except ValueError: - serialization_config[key] = item - - name = get_registered_name(instance.__class__) - return serialize_keras_class_and_config( - name, serialization_config, instance) - if hasattr(instance, '__name__'): - return get_registered_name(instance) - raise ValueError(f'Cannot serialize {instance} since it doesn\'t implement ' - '`get_config()`, and also doesn\t have `__name__`') - - -def get_custom_objects_by_name(item, custom_objects=None): - """Returns the item if it is in either local or global custom objects.""" - if item in _GLOBAL_CUSTOM_OBJECTS: - return _GLOBAL_CUSTOM_OBJECTS[item] - elif custom_objects and item in custom_objects: - return custom_objects[item] - return None - - -def class_and_config_for_serialized_keras_object( - config, - module_objects=None, - custom_objects=None, - printable_module_name='object'): - """Returns the class name and config for a serialized keras object.""" - if (not isinstance(config, dict) - or 'class_name' not in config - or 'config' not in config): - raise ValueError( - f'Improper config format for {config}. ' - 'Expecting python dict contains `class_name` and `config` as keys') - - class_name = config['class_name'] - cls = get_registered_object(class_name, custom_objects, module_objects) - if cls is None: - raise ValueError( - f'Unknown {printable_module_name}: {class_name}. Please ensure this ' - 'object is passed to the `custom_objects` argument. See ' - 'https://www.tensorflow.org/guide/keras/save_and_serialize' - '#registering_the_custom_object for details.') - - cls_config = config['config'] - # Check if `cls_config` is a list. If it is a list, return the class and the - # associated class configs for recursively deserialization. This case will - # happen on the old version of sequential model (e.g. `keras_version` == - # "2.0.6"), which is serialized in a different structure, for example - # "{'class_name': 'Sequential', - # 'config': [{'class_name': 'Embedding', 'config': ...}, {}, ...]}". - if isinstance(cls_config, list): - return (cls, cls_config) - - deserialized_objects = {} - for key, item in cls_config.items(): - if key == 'name': - # Assume that the value of 'name' is a string that should not be - # deserialized as a function. This avoids the corner case where - # cls_config['name'] has an identical name to a custom function and - # gets converted into that function. - deserialized_objects[key] = item - elif isinstance(item, dict) and '__passive_serialization__' in item: - deserialized_objects[key] = deserialize_keras_object( - item, - module_objects=module_objects, - custom_objects=custom_objects, - printable_module_name='config_item') - # TODO(momernick): Should this also have 'module_objects'? - elif (isinstance(item, str) and - tf_inspect.isfunction(get_registered_object(item, custom_objects))): - # Handle custom functions here. When saving functions, we only save the - # function's name as a string. If we find a matching string in the custom - # objects during deserialization, we convert the string back to the - # original function. - # Note that a potential issue is that a string field could have a naming - # conflict with a custom function name, but this should be a rare case. 
- # This issue does not occur if a string field has a naming conflict with - # a custom object, since the config of an object will always be a dict. - deserialized_objects[key] = get_registered_object(item, custom_objects) - for key, item in deserialized_objects.items(): - cls_config[key] = deserialized_objects[key] - - return (cls, cls_config) - - -@keras_export('keras.utils.deserialize_keras_object') -def deserialize_keras_object(identifier, - module_objects=None, - custom_objects=None, - printable_module_name='object'): - """Turns the serialized form of a Keras object back into an actual object. - - This function is for mid-level library implementers rather than end users. - - Importantly, this utility requires you to provide the dict of `module_objects` - to use for looking up the object config; this is not populated by default. - If you need a deserialization utility that has preexisting knowledge of - built-in Keras objects, use e.g. `keras.layers.deserialize(config)`, - `keras.metrics.deserialize(config)`, etc. - - Calling `deserialize_keras_object` while underneath the - `SharedObjectLoadingScope` context manager will cause any already-seen shared - objects to be returned as-is rather than creating a new object. - - Args: - identifier: the serialized form of the object. - module_objects: A dictionary of built-in objects to look the name up in. - Generally, `module_objects` is provided by midlevel library implementers. - custom_objects: A dictionary of custom objects to look the name up in. - Generally, `custom_objects` is provided by the end user. - printable_module_name: A human-readable string representing the type of the - object. Printed in case of exception. - - Returns: - The deserialized object. - - Example: - - A mid-level library implementer might want to implement a utility for - retrieving an object from its config, as such: - - ```python - def deserialize(config, custom_objects=None): - return deserialize_keras_object( - identifier, - module_objects=globals(), - custom_objects=custom_objects, - name="MyObjectType", - ) - ``` - - This is how e.g. `keras.layers.deserialize()` is implemented. - """ - if identifier is None: - return None - - if isinstance(identifier, dict): - # In this case we are dealing with a Keras config dictionary. - config = identifier - (cls, cls_config) = class_and_config_for_serialized_keras_object( - config, module_objects, custom_objects, printable_module_name) - - # If this object has already been loaded (i.e. it's shared between multiple - # objects), return the already-loaded object. - shared_object_id = config.get(SHARED_OBJECT_KEY) - shared_object = _shared_object_loading_scope().get(shared_object_id) # pylint: disable=assignment-from-none - if shared_object is not None: - return shared_object - - if hasattr(cls, 'from_config'): - arg_spec = tf_inspect.getfullargspec(cls.from_config) - custom_objects = custom_objects or {} - - if 'custom_objects' in arg_spec.args: - deserialized_obj = cls.from_config( - cls_config, - custom_objects=dict( - list(_GLOBAL_CUSTOM_OBJECTS.items()) + - list(custom_objects.items()))) - else: - with CustomObjectScope(custom_objects): - deserialized_obj = cls.from_config(cls_config) - else: - # Then `cls` may be a function returning a class. - # in this case by convention `config` holds - # the kwargs of the function. - custom_objects = custom_objects or {} - with CustomObjectScope(custom_objects): - deserialized_obj = cls(**cls_config) - - # Add object to shared objects, in case we find it referenced again. 
- _shared_object_loading_scope().set(shared_object_id, deserialized_obj) - - return deserialized_obj - - elif isinstance(identifier, str): - object_name = identifier - if custom_objects and object_name in custom_objects: - obj = custom_objects.get(object_name) - elif object_name in _GLOBAL_CUSTOM_OBJECTS: - obj = _GLOBAL_CUSTOM_OBJECTS[object_name] - else: - obj = module_objects.get(object_name) - if obj is None: - raise ValueError( - f'Unknown {printable_module_name}: {object_name}. Please ensure ' - 'this object is passed to the `custom_objects` argument. See ' - 'https://www.tensorflow.org/guide/keras/save_and_serialize' - '#registering_the_custom_object for details.') - - # Classes passed by name are instantiated with no args, functions are - # returned as-is. - if tf_inspect.isclass(obj): - return obj() - return obj - elif tf_inspect.isfunction(identifier): - # If a function has already been deserialized, return as is. - return identifier - else: - raise ValueError( - f'Could not interpret serialized {printable_module_name}: {identifier}') + if isinstance(code, (tuple, list)): # unpack previous dump + code, defaults, closure = code + if isinstance(defaults, list): + defaults = tuple(defaults) + def ensure_value_to_cell(value): + """Ensures that a value is converted to a python cell object. -def func_dump(func): - """Serializes a user defined function. - - Args: - func: the function to serialize. - - Returns: - A tuple `(code, defaults, closure)`. - """ - if os.name == 'nt': - raw_code = marshal.dumps(func.__code__).replace(b'\\', b'/') - code = codecs.encode(raw_code, 'base64').decode('ascii') - else: - raw_code = marshal.dumps(func.__code__) - code = codecs.encode(raw_code, 'base64').decode('ascii') - defaults = func.__defaults__ - if func.__closure__: - closure = tuple(c.cell_contents for c in func.__closure__) - else: - closure = None - return code, defaults, closure + Args: + value: Any value that needs to be casted to the cell type + Returns: + A value wrapped as a cell object (see function "func_load") + """ -def func_load(code, defaults=None, closure=None, globs=None): - """Deserializes a user defined function. + def dummy_fn(): + + value # just access it so it gets captured in .__closure__ + + cell_value = dummy_fn.__closure__[0] + if not isinstance(value, type(cell_value)): + return cell_value + return value - Args: - code: bytecode of the function. - defaults: defaults of the function. - closure: closure of the function. - globs: dictionary of global objects. + if closure is not None: + closure = tuple(ensure_value_to_cell(_) for _ in closure) + try: + raw_code = codecs.decode(code.encode("ascii"), "base64") + except (UnicodeEncodeError, binascii.Error): + raw_code = code.encode("raw_unicode_escape") + code = marshal.loads(raw_code) + if globs is None: + globs = globals() + return python_types.FunctionType( + code, globs, name=code.co_name, argdefs=defaults, closure=closure + ) - Returns: - A function object. - """ - if isinstance(code, (tuple, list)): # unpack previous dump - code, defaults, closure = code - if isinstance(defaults, list): - defaults = tuple(defaults) - def ensure_value_to_cell(value): - """Ensures that a value is converted to a python cell object. +def has_arg(fn, name, accept_all=False): + """Checks if a callable accepts a given keyword argument. Args: - value: Any value that needs to be casted to the cell type + fn: Callable to inspect. + name: Check if `fn` can be called with `name` as a keyword argument. 
+ accept_all: What to return if there is no parameter called `name` but + the function accepts a `**kwargs` argument. Returns: - A value wrapped as a cell object (see function "func_load") + bool, whether `fn` accepts a `name` keyword argument. """ + arg_spec = tf_inspect.getfullargspec(fn) + if accept_all and arg_spec.varkw is not None: + return True + return name in arg_spec.args or name in arg_spec.kwonlyargs - def dummy_fn(): - # pylint: disable=pointless-statement - value # just access it so it gets captured in .__closure__ - - cell_value = dummy_fn.__closure__[0] - if not isinstance(value, type(cell_value)): - return cell_value - return value - - if closure is not None: - closure = tuple(ensure_value_to_cell(_) for _ in closure) - try: - raw_code = codecs.decode(code.encode('ascii'), 'base64') - except (UnicodeEncodeError, binascii.Error): - raw_code = code.encode('raw_unicode_escape') - code = marshal.loads(raw_code) - if globs is None: - globs = globals() - return python_types.FunctionType( - code, globs, name=code.co_name, argdefs=defaults, closure=closure) - -def has_arg(fn, name, accept_all=False): - """Checks if a callable accepts a given keyword argument. - - Args: - fn: Callable to inspect. - name: Check if `fn` can be called with `name` as a keyword argument. - accept_all: What to return if there is no parameter called `name` but the - function accepts a `**kwargs` argument. - - Returns: - bool, whether `fn` accepts a `name` keyword argument. - """ - arg_spec = tf_inspect.getfullargspec(fn) - if accept_all and arg_spec.varkw is not None: - return True - return name in arg_spec.args or name in arg_spec.kwonlyargs - - -@keras_export('keras.utils.Progbar') +@keras_export("keras.utils.Progbar") class Progbar: - """Displays a progress bar. - - Args: - target: Total number of steps expected, None if unknown. - width: Progress bar width on screen. - verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) - stateful_metrics: Iterable of string names of metrics that should *not* be - averaged over time. Metrics in this list will be displayed as-is. All - others will be averaged by the progbar before display. - interval: Minimum visual progress update interval (in seconds). - unit_name: Display name for step counts (usually "step" or "sample"). - """ - - def __init__(self, - target, - width=30, - verbose=1, - interval=0.05, - stateful_metrics=None, - unit_name='step'): - self.target = target - self.width = width - self.verbose = verbose - self.interval = interval - self.unit_name = unit_name - if stateful_metrics: - self.stateful_metrics = set(stateful_metrics) - else: - self.stateful_metrics = set() - - self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and - sys.stdout.isatty()) or - 'ipykernel' in sys.modules or - 'posix' in sys.modules or - 'PYCHARM_HOSTED' in os.environ) - self._total_width = 0 - self._seen_so_far = 0 - # We use a dict + list to avoid garbage collection - # issues found in OrderedDict - self._values = {} - self._values_order = [] - self._start = time.time() - self._last_update = 0 - self._time_at_epoch_start = self._start - self._time_at_epoch_end = None - self._time_after_first_step = None - - def update(self, current, values=None, finalize=None): - """Updates the progress bar. + """Displays a progress bar. Args: - current: Index of current step. - values: List of tuples: `(name, value_for_last_step)`. If `name` is in - `stateful_metrics`, `value_for_last_step` will be displayed as-is. - Else, an average of the metric over time will be displayed. 
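The reformatted `func_dump`/`func_load` pair round-trips a user-defined function through marshalled bytecode, and `has_arg` inspects a callable's signature. A small standalone sketch of their behavior (the function `scale` is a hypothetical example, not from the PR):

```python
from keras.utils import generic_utils

def scale(x, factor=2):
    return x * factor

# Round-trip: func_dump marshals the bytecode plus defaults/closure;
# func_load rebuilds a FunctionType from them. Marshalled bytecode is
# Python-version specific, so only load what you dumped yourself.
code, defaults, closure = generic_utils.func_dump(scale)
restored = generic_utils.func_load(code, defaults, closure)
assert restored(3) == 6

# has_arg checks whether a callable accepts a given keyword argument.
assert generic_utils.has_arg(scale, "factor")
assert not generic_utils.has_arg(scale, "missing", accept_all=True)
```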
- finalize: Whether this is the last update for the progress bar. If - `None`, defaults to `current >= self.target`. + target: Total number of steps expected, None if unknown. + width: Progress bar width on screen. + verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + stateful_metrics: Iterable of string names of metrics that should *not* + be averaged over time. Metrics in this list will be displayed as-is. + All others will be averaged by the progbar before display. + interval: Minimum visual progress update interval (in seconds). + unit_name: Display name for step counts (usually "step" or "sample"). """ - if finalize is None: - if self.target is None: - finalize = False - else: - finalize = current >= self.target - - values = values or [] - for k, v in values: - if k not in self._values_order: - self._values_order.append(k) - if k not in self.stateful_metrics: - # In the case that progress bar doesn't have a target value in the first - # epoch, both on_batch_end and on_epoch_end will be called, which will - # cause 'current' and 'self._seen_so_far' to have the same value. Force - # the minimal value to 1 here, otherwise stateful_metric will be 0s. - value_base = max(current - self._seen_so_far, 1) - if k not in self._values: - self._values[k] = [v * value_base, value_base] + + def __init__( + self, + target, + width=30, + verbose=1, + interval=0.05, + stateful_metrics=None, + unit_name="step", + ): + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + self.unit_name = unit_name + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) else: - self._values[k][0] += v * value_base - self._values[k][1] += value_base - else: - # Stateful metrics output a numeric value. This representation - # means "take an average from a single value" but keeps the - # numeric formatting. - self._values[k] = [v, 1] - self._seen_so_far = current - - message = '' - now = time.time() - info = ' - %.0fs' % (now - self._start) - if current == self.target: - self._time_at_epoch_end = now - if self.verbose == 1: - if now - self._last_update < self.interval and not finalize: - return - - prev_total_width = self._total_width - if self._dynamic_display: - message += '\b' * prev_total_width - message += '\r' - else: - message += '\n' - - if self.target is not None: - numdigits = int(np.log10(self.target)) + 1 - bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target) - prog = float(current) / self.target - prog_width = int(self.width * prog) - if prog_width > 0: - bar += ('=' * (prog_width - 1)) - if current < self.target: - bar += '>' - else: - bar += '=' - bar += ('.' 
* (self.width - prog_width)) - bar += ']' - else: - bar = '%7d/Unknown' % current - - self._total_width = len(bar) - message += bar - - time_per_unit = self._estimate_step_duration(current, now) - - if self.target is None or finalize: - info += self._format_time(time_per_unit, self.unit_name) - else: - eta = time_per_unit * (self.target - current) - if eta > 3600: - eta_format = '%d:%02d:%02d' % (eta // 3600, - (eta % 3600) // 60, eta % 60) - elif eta > 60: - eta_format = '%d:%02d' % (eta // 60, eta % 60) + self.stateful_metrics = set() + + self._dynamic_display = ( + (hasattr(sys.stdout, "isatty") and sys.stdout.isatty()) + or "ipykernel" in sys.modules + or "posix" in sys.modules + or "PYCHARM_HOSTED" in os.environ + ) + self._total_width = 0 + self._seen_so_far = 0 + # We use a dict + list to avoid garbage collection + # issues found in OrderedDict + self._values = {} + self._values_order = [] + self._start = time.time() + self._last_update = 0 + self._time_at_epoch_start = self._start + self._time_at_epoch_end = None + self._time_after_first_step = None + + def update(self, current, values=None, finalize=None): + """Updates the progress bar. + + Args: + current: Index of current step. + values: List of tuples: `(name, value_for_last_step)`. If `name` is + in `stateful_metrics`, `value_for_last_step` will be displayed + as-is. Else, an average of the metric over time will be + displayed. + finalize: Whether this is the last update for the progress bar. If + `None`, uses `current >= self.target`. Defaults to `None`. + """ + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + + values = values or [] + for k, v in values: + if k not in self._values_order: + self._values_order.append(k) + if k not in self.stateful_metrics: + # In the case that progress bar doesn't have a target value in + # the first epoch, both on_batch_end and on_epoch_end will be + # called, which will cause 'current' and 'self._seen_so_far' to + # have the same value. Force the minimal value to 1 here, + # otherwise stateful_metric will be 0s. + value_base = max(current - self._seen_so_far, 1) + if k not in self._values: + self._values[k] = [v * value_base, value_base] + else: + self._values[k][0] += v * value_base + self._values[k][1] += value_base + else: + # Stateful metrics output a numeric value. This representation + # means "take an average from a single value" but keeps the + # numeric formatting. + self._values[k] = [v, 1] + self._seen_so_far = current + + message = "" + now = time.time() + info = f" - {now - self._start:.0f}s" + if current == self.target: + self._time_at_epoch_end = now + if self.verbose == 1: + if now - self._last_update < self.interval and not finalize: + return + + prev_total_width = self._total_width + if self._dynamic_display: + message += "\b" * prev_total_width + message += "\r" + else: + message += "\n" + + if self.target is not None: + numdigits = int(np.log10(self.target)) + 1 + bar = ("%" + str(numdigits) + "d/%d [") % (current, self.target) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += "=" * (prog_width - 1) + if current < self.target: + bar += ">" + else: + bar += "=" + bar += "." 
* (self.width - prog_width) + bar += "]" + else: + bar = "%7d/Unknown" % current + + self._total_width = len(bar) + message += bar + + time_per_unit = self._estimate_step_duration(current, now) + + if self.target is None or finalize: + info += self._format_time(time_per_unit, self.unit_name) + else: + eta = time_per_unit * (self.target - current) + if eta > 3600: + eta_format = "%d:%02d:%02d" % ( + eta // 3600, + (eta % 3600) // 60, + eta % 60, + ) + elif eta > 60: + eta_format = "%d:%02d" % (eta // 60, eta % 60) + else: + eta_format = "%ds" % eta + + info = f" - ETA: {eta_format}" + + for k in self._values_order: + info += f" - {k}:" + if isinstance(self._values[k], list): + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1]) + ) + if abs(avg) > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + else: + info += f" {self._values[k]}" + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += " " * (prev_total_width - self._total_width) + + if finalize: + info += "\n" + + message += info + io_utils.print_msg(message, line_break=False) + message = "" + + elif self.verbose == 2: + if finalize: + numdigits = int(np.log10(self.target)) + 1 + count = ("%" + str(numdigits) + "d/%d") % (current, self.target) + info = count + info + for k in self._values_order: + info += f" - {k}:" + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1]) + ) + if avg > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + if self._time_at_epoch_end: + time_per_epoch = ( + self._time_at_epoch_end - self._time_at_epoch_start + ) + avg_time_per_step = time_per_epoch / self.target + self._time_at_epoch_start = now + self._time_at_epoch_end = None + info += " -" + self._format_time(time_per_epoch, "epoch") + info += " -" + self._format_time( + avg_time_per_step, self.unit_name + ) + info += "\n" + message += info + io_utils.print_msg(message, line_break=False) + message = "" + + self._last_update = now + + def add(self, n, values=None): + self.update(self._seen_so_far + n, values) + + def _format_time(self, time_per_unit, unit_name): + """format a given duration to display to the user. + + Given the duration, this function formats it in either milliseconds + or seconds and displays the unit (i.e. ms/step or s/epoch) + Args: + time_per_unit: the duration to display + unit_name: the name of the unit to display + Returns: + a string with the correctly formatted duration and units + """ + formatted = "" + if time_per_unit >= 1 or time_per_unit == 0: + formatted += f" {time_per_unit:.0f}s/{unit_name}" + elif time_per_unit >= 1e-3: + formatted += f" {time_per_unit * 1000.0:.0f}ms/{unit_name}" else: - eta_format = '%ds' % eta - - info = ' - ETA: %s' % eta_format - - for k in self._values_order: - info += ' - %s:' % k - if isinstance(self._values[k], list): - avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) - if abs(avg) > 1e-3: - info += ' %.4f' % avg - else: - info += ' %.4e' % avg + formatted += f" {time_per_unit * 1000000.0:.0f}us/{unit_name}" + return formatted + + def _estimate_step_duration(self, current, now): + """Estimate the duration of a single step. + + Given the step number `current` and the corresponding time `now` this + function returns an estimate for how long a single step takes. If this + is called before one step has been completed (i.e. `current == 0`) then + zero is given as an estimate. 
The duration estimate ignores the duration + of the (assumed to be non-representative) first step for estimates when + more steps are available (i.e. `current>1`). + + Args: + current: Index of current step. + now: The current time. + + Returns: Estimate of the duration of a single step. + """ + if current: + # there are a few special scenarios here: + # 1) somebody is calling the progress bar without ever supplying + # step 1 + # 2) somebody is calling the progress bar and supplies step one + # multiple times, e.g. as part of a finalizing call + # in these cases, we just fall back to the simple calculation + if self._time_after_first_step is not None and current > 1: + time_per_unit = (now - self._time_after_first_step) / ( + current - 1 + ) + else: + time_per_unit = (now - self._start) / current + + if current == 1: + self._time_after_first_step = now + return time_per_unit else: - info += ' %s' % self._values[k] - - self._total_width += len(info) - if prev_total_width > self._total_width: - info += (' ' * (prev_total_width - self._total_width)) - - if finalize: - info += '\n' - - message += info - io_utils.print_msg(message, line_break=False) - message = '' - - elif self.verbose == 2: - if finalize: - numdigits = int(np.log10(self.target)) + 1 - count = ('%' + str(numdigits) + 'd/%d') % (current, self.target) - info = count + info - for k in self._values_order: - info += ' - %s:' % k - avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) - if avg > 1e-3: - info += ' %.4f' % avg - else: - info += ' %.4e' % avg - if self._time_at_epoch_end: - time_per_epoch = self._time_at_epoch_end - self._time_at_epoch_start - avg_time_per_step = time_per_epoch / self.target - self._time_at_epoch_start = now - self._time_at_epoch_end = None - info += ' -' + self._format_time(time_per_epoch, 'epoch') - info += ' -' + self._format_time(avg_time_per_step, self.unit_name) - info += '\n' - message += info - io_utils.print_msg(message, line_break=False) - message = '' - - self._last_update = now - - def add(self, n, values=None): - self.update(self._seen_so_far + n, values) - - def _format_time(self, time_per_unit, unit_name): - """format a given duration to display to the user. - - Given the duration, this function formats it in either milliseconds - or seconds and displays the unit (i.e. ms/step or s/epoch) + return 0 + + def _update_stateful_metrics(self, stateful_metrics): + self.stateful_metrics = self.stateful_metrics.union(stateful_metrics) + + +def make_batches(size, batch_size): + """Returns a list of batch indices (tuples of indices). + Args: - time_per_unit: the duration to display - unit_name: the name of the unit to display + size: Integer, total size of the data to slice into batches. + batch_size: Integer, batch size. + Returns: - a string with the correctly formatted duration and units - """ - formatted = '' - if time_per_unit >= 1 or time_per_unit == 0: - formatted += ' %.0fs/%s' % (time_per_unit, unit_name) - elif time_per_unit >= 1e-3: - formatted += ' %.0fms/%s' % (time_per_unit * 1e3, unit_name) - else: - formatted += ' %.0fus/%s' % (time_per_unit * 1e6, unit_name) - return formatted - - def _estimate_step_duration(self, current, now): - """Estimate the duration of a single step. - - Given the step number `current` and the corresponding time `now` - this function returns an estimate for how long a single step - takes. If this is called before one step has been completed - (i.e. `current == 0`) then zero is given as an estimate. 
The duration - estimate ignores the duration of the (assumed to be non-representative) - first step for estimates when more steps are available (i.e. `current>1`). - Args: - current: Index of current step. - now: The current time. - Returns: Estimate of the duration of a single step. + A list of tuples of array indices. """ - if current: - # there are a few special scenarios here: - # 1) somebody is calling the progress bar without ever supplying step 1 - # 2) somebody is calling the progress bar and supplies step one multiple - # times, e.g. as part of a finalizing call - # in these cases, we just fall back to the simple calculation - if self._time_after_first_step is not None and current > 1: - time_per_unit = (now - self._time_after_first_step) / (current - 1) - else: - time_per_unit = (now - self._start) / current - - if current == 1: - self._time_after_first_step = now - return time_per_unit - else: - return 0 + num_batches = int(np.ceil(size / float(batch_size))) + return [ + (i * batch_size, min(size, (i + 1) * batch_size)) + for i in range(0, num_batches) + ] - def _update_stateful_metrics(self, stateful_metrics): - self.stateful_metrics = self.stateful_metrics.union(stateful_metrics) +def slice_arrays(arrays, start=None, stop=None): + """Slice an array or list of arrays. -def make_batches(size, batch_size): - """Returns a list of batch indices (tuples of indices). + This takes an array-like, or a list of + array-likes, and outputs: + - arrays[start:stop] if `arrays` is an array-like + - [x[start:stop] for x in arrays] if `arrays` is a list - Args: - size: Integer, total size of the data to slice into batches. - batch_size: Integer, batch size. + Can also work on list/array of indices: `slice_arrays(x, indices)` - Returns: - A list of tuples of array indices. - """ - num_batches = int(np.ceil(size / float(batch_size))) - return [(i * batch_size, min(size, (i + 1) * batch_size)) - for i in range(0, num_batches)] + Args: + arrays: Single array or list of arrays. + start: can be an integer index (start index) or a list/array of indices + stop: integer (stop index); should be None if `start` was a list. + Returns: + A slice of the array(s). -def slice_arrays(arrays, start=None, stop=None): - """Slice an array or list of arrays. - - This takes an array-like, or a list of - array-likes, and outputs: - - arrays[start:stop] if `arrays` is an array-like - - [x[start:stop] for x in arrays] if `arrays` is a list - - Can also work on list/array of indices: `slice_arrays(x, indices)` - - Args: - arrays: Single array or list of arrays. - start: can be an integer index (start index) or a list/array of indices - stop: integer (stop index); should be None if `start` was a list. - - Returns: - A slice of the array(s). - - Raises: - ValueError: If the value of start is a list and stop is not None. - """ - if arrays is None: - return [None] - if isinstance(start, list) and stop is not None: - raise ValueError('The stop argument has to be None if the value of start ' - f'is a list. 
Received start={start}, stop={stop}') - elif isinstance(arrays, list): - if hasattr(start, '__len__'): - # hdf5 datasets only support list objects as indices - if hasattr(start, 'shape'): - start = start.tolist() - return [None if x is None else x[start] for x in arrays] - return [ - None if x is None else - None if not hasattr(x, '__getitem__') else x[start:stop] for x in arrays - ] - else: - if hasattr(start, '__len__'): - if hasattr(start, 'shape'): - start = start.tolist() - return arrays[start] - if hasattr(start, '__getitem__'): - return arrays[start:stop] - return [None] + Raises: + ValueError: If the value of start is a list and stop is not None. + """ + if arrays is None: + return [None] + if isinstance(start, list) and stop is not None: + raise ValueError( + "The stop argument has to be None if the value of start " + f"is a list. Received start={start}, stop={stop}" + ) + elif isinstance(arrays, list): + if hasattr(start, "__len__"): + # hdf5 datasets only support list objects as indices + if hasattr(start, "shape"): + start = start.tolist() + return [None if x is None else x[start] for x in arrays] + return [ + None + if x is None + else None + if not hasattr(x, "__getitem__") + else x[start:stop] + for x in arrays + ] + else: + if hasattr(start, "__len__"): + if hasattr(start, "shape"): + start = start.tolist() + return arrays[start] + if hasattr(start, "__getitem__"): + return arrays[start:stop] + return [None] def to_list(x): - """Normalizes a list/tensor into a list. + """Normalizes a list/tensor into a list. - If a tensor is passed, we return - a list of size 1 containing the tensor. + If a tensor is passed, we return + a list of size 1 containing the tensor. - Args: - x: target object to be normalized. + Args: + x: target object to be normalized. - Returns: - A list. - """ - if isinstance(x, list): - return x - return [x] + Returns: + A list. + """ + if isinstance(x, list): + return x + return [x] def to_snake_case(name): - intermediate = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() - # If the class is private the name starts with "_" which is not secure - # for creating scopes. We prefix the name with "private" in this case. - if insecure[0] != '_': - return insecure - return 'private' + insecure + intermediate = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + insecure = re.sub("([a-z])([A-Z])", r"\1_\2", intermediate).lower() + # If the class is private the name starts with "_" which is not secure + # for creating scopes. We prefix the name with "private" in this case. + if insecure[0] != "_": + return insecure + return "private" + insecure def is_all_none(structure): - iterable = tf.nest.flatten(structure) - # We cannot use Python's `any` because the iterable may return Tensors. - for element in iterable: - if element is not None: - return False - return True + iterable = tf.nest.flatten(structure) + # We cannot use Python's `any` because the iterable may return Tensors. + for element in iterable: + if element is not None: + return False + return True def check_for_unexpected_keys(name, input_dict, expected_values): - unknown = set(input_dict.keys()).difference(expected_values) - if unknown: - raise ValueError( - f'Unknown entries in {name} dictionary: {list(unknown)}. 
Only expected ' - f'following keys: {expected_values}') - - -def validate_kwargs(kwargs, - allowed_kwargs, - error_message='Keyword argument not understood:'): - """Checks that all keyword arguments are in the set of allowed keys.""" - for kwarg in kwargs: - if kwarg not in allowed_kwargs: - raise TypeError(error_message, kwarg) + unknown = set(input_dict.keys()).difference(expected_values) + if unknown: + raise ValueError( + f"Unknown entries in {name} dictionary: {list(unknown)}. " + f"Only expected following keys: {expected_values}" + ) -def validate_config(config): - """Determines whether config appears to be a valid layer config.""" - return isinstance(config, dict) and _LAYER_UNDEFINED_CONFIG_KEY not in config +def validate_kwargs( + kwargs, allowed_kwargs, error_message="Keyword argument not understood:" +): + """Checks that all keyword arguments are in the set of allowed keys.""" + for kwarg in kwargs: + if kwarg not in allowed_kwargs: + raise TypeError(error_message, kwarg) def default(method): - """Decorates a method to detect overrides in subclasses.""" - method._is_default = True # pylint: disable=protected-access - return method + """Decorates a method to detect overrides in subclasses.""" + method._is_default = True + return method def is_default(method): - """Check if a method is decorated with the `default` wrapper.""" - return getattr(method, '_is_default', False) + """Check if a method is decorated with the `default` wrapper.""" + return getattr(method, "_is_default", False) def populate_dict_with_module_objects(target_dict, modules, obj_filter): - for module in modules: - for name in dir(module): - obj = getattr(module, name) - if obj_filter(obj): - target_dict[name] = obj + for module in modules: + for name in dir(module): + obj = getattr(module, name) + if obj_filter(obj): + target_dict[name] = obj class LazyLoader(python_types.ModuleType): - """Lazily import a module, mainly to avoid pulling in large dependencies.""" - - def __init__(self, local_name, parent_module_globals, name): - self._local_name = local_name - self._parent_module_globals = parent_module_globals - super().__init__(name) - - def _load(self): - """Load the module and insert it into the parent's globals.""" - # Import the target module and insert it into the parent's namespace - module = importlib.import_module(self.__name__) - self._parent_module_globals[self._local_name] = module - # Update this object's dict so that if someone keeps a reference to the - # LazyLoader, lookups are efficient (__getattr__ is only called on lookups - # that fail). - self.__dict__.update(module.__dict__) - return module - - def __getattr__(self, item): - module = self._load() - return getattr(module, item) - - -# Aliases - -custom_object_scope = CustomObjectScope # pylint: disable=invalid-name + """Lazily import a module, mainly to avoid pulling in large dependencies.""" + + def __init__(self, local_name, parent_module_globals, name): + self._local_name = local_name + self._parent_module_globals = parent_module_globals + super().__init__(name) + + def _load(self): + """Load the module and insert it into the parent's globals.""" + # Import the target module and insert it into the parent's namespace + module = importlib.import_module(self.__name__) + self._parent_module_globals[self._local_name] = module + # Update this object's dict so that if someone keeps a reference to the + # LazyLoader, lookups are efficient (__getattr__ is only called on + # lookups that fail). 
+ self.__dict__.update(module.__dict__) + return module + + def __getattr__(self, item): + module = self._load() + return getattr(module, item) diff --git a/keras/utils/generic_utils_test.py b/keras/utils/generic_utils_test.py index 90868b8e3d63..4ed6242bda61 100644 --- a/keras/utils/generic_utils_test.py +++ b/keras/utils/generic_utils_test.py @@ -15,504 +15,450 @@ """Tests for Keras generic Python utils.""" -from functools import partial import os import sys +from functools import partial + +import numpy as np +import tensorflow.compat.v2 as tf import keras +from keras.saving import serialization_lib +from keras.saving.legacy import serialization +from keras.testing_infra import test_utils from keras.utils import generic_utils from keras.utils import io_utils -import numpy as np -import tensorflow.compat.v2 as tf class SnakeCaseTest(tf.test.TestCase): - - def test_snake_case(self): - self.assertEqual(generic_utils.to_snake_case('SomeClass'), 'some_class') - self.assertEqual(generic_utils.to_snake_case('Conv2D'), 'conv2d') - self.assertEqual(generic_utils.to_snake_case('ConvLSTM2D'), 'conv_lstm2d') + def test_snake_case(self): + self.assertEqual(generic_utils.to_snake_case("SomeClass"), "some_class") + self.assertEqual(generic_utils.to_snake_case("Conv2D"), "conv2d") + self.assertEqual( + generic_utils.to_snake_case("ConvLSTM2D"), "conv_lstm2d" + ) class HasArgTest(tf.test.TestCase): - - def test_has_arg(self): - - def f_x(x): - return x - - def f_x_args(x, *args): - _ = args - return x - - def f_x_kwargs(x, **kwargs): - _ = kwargs - return x - - def f(a, b, c): - return a + b + c - - partial_f = partial(f, b=1) - - self.assertTrue(keras.utils.generic_utils.has_arg( - f_x, 'x', accept_all=False)) - self.assertFalse(keras.utils.generic_utils.has_arg( - f_x, 'y', accept_all=False)) - self.assertTrue(keras.utils.generic_utils.has_arg( - f_x_args, 'x', accept_all=False)) - self.assertFalse(keras.utils.generic_utils.has_arg( - f_x_args, 'y', accept_all=False)) - self.assertTrue(keras.utils.generic_utils.has_arg( - f_x_kwargs, 'x', accept_all=False)) - self.assertFalse(keras.utils.generic_utils.has_arg( - f_x_kwargs, 'y', accept_all=False)) - self.assertTrue(keras.utils.generic_utils.has_arg( - f_x_kwargs, 'y', accept_all=True)) - self.assertTrue( - keras.utils.generic_utils.has_arg(partial_f, 'c', accept_all=True)) - - -class TestCustomObjectScope(tf.test.TestCase): - - def test_custom_object_scope(self): - - def custom_fn(): - pass - - class CustomClass: - pass - - with keras.utils.generic_utils.custom_object_scope( - {'CustomClass': CustomClass, 'custom_fn': custom_fn}): - act = keras.activations.get('custom_fn') - self.assertEqual(act, custom_fn) - cl = keras.regularizers.get('CustomClass') - self.assertEqual(cl.__class__, CustomClass) + def test_has_arg(self): + def f_x(x): + return x + + def f_x_args(x, *args): + _ = args + return x + + def f_x_kwargs(x, **kwargs): + _ = kwargs + return x + + def f(a, b, c): + return a + b + c + + partial_f = partial(f, b=1) + + self.assertTrue( + keras.utils.generic_utils.has_arg(f_x, "x", accept_all=False) + ) + self.assertFalse( + keras.utils.generic_utils.has_arg(f_x, "y", accept_all=False) + ) + self.assertTrue( + keras.utils.generic_utils.has_arg(f_x_args, "x", accept_all=False) + ) + self.assertFalse( + keras.utils.generic_utils.has_arg(f_x_args, "y", accept_all=False) + ) + self.assertTrue( + keras.utils.generic_utils.has_arg(f_x_kwargs, "x", accept_all=False) + ) + self.assertFalse( + keras.utils.generic_utils.has_arg(f_x_kwargs, "y", 
accept_all=False) + ) + self.assertTrue( + keras.utils.generic_utils.has_arg(f_x_kwargs, "y", accept_all=True) + ) + self.assertTrue( + keras.utils.generic_utils.has_arg(partial_f, "c", accept_all=True) + ) class SerializeKerasObjectTest(tf.test.TestCase): - - def test_serialize_none(self): - serialized = keras.utils.generic_utils.serialize_keras_object(None) - self.assertEqual(serialized, None) - deserialized = keras.utils.generic_utils.deserialize_keras_object( - serialized) - self.assertEqual(deserialized, None) - - def test_serialize_custom_class_with_default_name(self): - - @keras.utils.generic_utils.register_keras_serializable() - class TestClass: - - def __init__(self, value): - self._value = value - - def get_config(self): - return {'value': self._value} - - serialized_name = 'Custom>TestClass' - inst = TestClass(value=10) - class_name = keras.utils.generic_utils._GLOBAL_CUSTOM_NAMES[TestClass] - self.assertEqual(serialized_name, class_name) - config = keras.utils.generic_utils.serialize_keras_object(inst) - self.assertEqual(class_name, config['class_name']) - new_inst = keras.utils.generic_utils.deserialize_keras_object(config) - self.assertIsNot(inst, new_inst) - self.assertIsInstance(new_inst, TestClass) - self.assertEqual(10, new_inst._value) - - # Make sure registering a new class with same name will fail. - with self.assertRaisesRegex(ValueError, '.*has already been registered.*'): - @keras.utils.generic_utils.register_keras_serializable() # pylint: disable=function-redefined - class TestClass: # pylint: disable=function-redefined - - def __init__(self, value): - self._value = value - - def get_config(self): - return {'value': self._value} - - def test_serialize_custom_class_with_custom_name(self): - - @keras.utils.generic_utils.register_keras_serializable( - 'TestPackage', 'CustomName') - class OtherTestClass: - - def __init__(self, val): - self._val = val - - def get_config(self): - return {'val': self._val} - - serialized_name = 'TestPackage>CustomName' - inst = OtherTestClass(val=5) - class_name = keras.utils.generic_utils._GLOBAL_CUSTOM_NAMES[OtherTestClass] - self.assertEqual(serialized_name, class_name) - fn_class_name = keras.utils.generic_utils.get_registered_name( - OtherTestClass) - self.assertEqual(fn_class_name, class_name) - - cls = keras.utils.generic_utils.get_registered_object(fn_class_name) - self.assertEqual(OtherTestClass, cls) - - config = keras.utils.generic_utils.serialize_keras_object(inst) - self.assertEqual(class_name, config['class_name']) - new_inst = keras.utils.generic_utils.deserialize_keras_object(config) - self.assertIsNot(inst, new_inst) - self.assertIsInstance(new_inst, OtherTestClass) - self.assertEqual(5, new_inst._val) - - def test_serialize_custom_function(self): - - @keras.utils.generic_utils.register_keras_serializable() - def my_fn(): - return 42 - - serialized_name = 'Custom>my_fn' - class_name = keras.utils.generic_utils._GLOBAL_CUSTOM_NAMES[my_fn] - self.assertEqual(serialized_name, class_name) - fn_class_name = keras.utils.generic_utils.get_registered_name(my_fn) - self.assertEqual(fn_class_name, class_name) - - config = keras.utils.generic_utils.serialize_keras_object(my_fn) - self.assertEqual(class_name, config) - fn = keras.utils.generic_utils.deserialize_keras_object(config) - self.assertEqual(42, fn()) - - fn_2 = keras.utils.generic_utils.get_registered_object(fn_class_name) - self.assertEqual(42, fn_2()) - - def test_serialize_custom_class_without_get_config_fails(self): - - with self.assertRaisesRegex( - ValueError, 'Cannot 
register a class that does ' - 'not have a get_config.*'): - - @keras.utils.generic_utils.register_keras_serializable( # pylint: disable=unused-variable - 'TestPackage', 'TestClass') - class TestClass: - - def __init__(self, value): - self._value = value - - def test_serializable_object(self): - - class SerializableInt(int): - """A serializable object to pass out of a test layer's config.""" - - def __new__(cls, value): - return int.__new__(cls, value) - - def get_config(self): - return {'value': int(self)} - - @classmethod - def from_config(cls, config): - return cls(**config) - - layer = keras.layers.Dense( - SerializableInt(3), - activation='relu', - kernel_initializer='ones', - bias_regularizer='l2') - config = keras.layers.serialize(layer) - new_layer = keras.layers.deserialize( - config, custom_objects={'SerializableInt': SerializableInt}) - self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) - self.assertEqual(new_layer.units.__class__, SerializableInt) - self.assertEqual(new_layer.units, 3) - - def test_nested_serializable_object(self): - class SerializableInt(int): - """A serializable object to pass out of a test layer's config.""" - - def __new__(cls, value): - return int.__new__(cls, value) - - def get_config(self): - return {'value': int(self)} - - @classmethod - def from_config(cls, config): - return cls(**config) - - class SerializableNestedInt(int): - """A serializable object containing another serializable object.""" - - def __new__(cls, value, int_obj): - obj = int.__new__(cls, value) - obj.int_obj = int_obj - return obj - - def get_config(self): - return {'value': int(self), 'int_obj': self.int_obj} - - @classmethod - def from_config(cls, config): - return cls(**config) - - nested_int = SerializableInt(4) - layer = keras.layers.Dense( - SerializableNestedInt(3, nested_int), - name='SerializableNestedInt', - activation='relu', - kernel_initializer='ones', - bias_regularizer='l2') - config = keras.layers.serialize(layer) - new_layer = keras.layers.deserialize( - config, - custom_objects={ - 'SerializableInt': SerializableInt, - 'SerializableNestedInt': SerializableNestedInt - }) - # Make sure the string field doesn't get convert to custom object, even - # they have same value. 
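
The blocks being removed above pinned down the registration contract: `register_keras_serializable` refuses classes without a `get_config` method, publishes the class under a "package>name" key, and rejects duplicate registrations. A minimal sketch of that contract as these removed tests exercised it (the package and class names below are illustrative, not from the diff):

import keras

@keras.utils.generic_utils.register_keras_serializable("DemoPkg", "Boxed")
class Boxed:
    def __init__(self, value):
        self.value = value

    def get_config(self):
        # Required: registering a class without get_config raises ValueError.
        return {"value": self.value}

# serialize_keras_object emits the registered "DemoPkg>Boxed" class_name;
# deserialize_keras_object resolves it back without any custom_objects.
config = keras.utils.generic_utils.serialize_keras_object(Boxed(7))
restored = keras.utils.generic_utils.deserialize_keras_object(config)
assert isinstance(restored, Boxed) and restored.value == 7
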
- self.assertEqual(new_layer.name, 'SerializableNestedInt') - self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) - self.assertEqual(new_layer.units.__class__, SerializableNestedInt) - self.assertEqual(new_layer.units, 3) - self.assertEqual(new_layer.units.int_obj.__class__, SerializableInt) - self.assertEqual(new_layer.units.int_obj, 4) - - def test_nested_serializable_fn(self): - - def serializable_fn(x): - """A serializable function to pass out of a test layer's config.""" - return x - - class SerializableNestedInt(int): - """A serializable object containing a serializable function.""" - - def __new__(cls, value, fn): - obj = int.__new__(cls, value) - obj.fn = fn - return obj - - def get_config(self): - return {'value': int(self), 'fn': self.fn} - - @classmethod - def from_config(cls, config): - return cls(**config) - - layer = keras.layers.Dense( - SerializableNestedInt(3, serializable_fn), - activation='relu', - kernel_initializer='ones', - bias_regularizer='l2') - config = keras.layers.serialize(layer) - new_layer = keras.layers.deserialize( - config, - custom_objects={ - 'serializable_fn': serializable_fn, - 'SerializableNestedInt': SerializableNestedInt - }) - self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L2) - self.assertIsInstance(new_layer.units, SerializableNestedInt) - self.assertEqual(new_layer.units, 3) - self.assertIs(new_layer.units.fn, serializable_fn) - - def test_serialize_type_object_initializer(self): - layer = keras.layers.Dense( - 1, - kernel_initializer=keras.initializers.ones, - bias_initializer=keras.initializers.zeros) - config = keras.layers.serialize(layer) - self.assertEqual(config['config']['bias_initializer']['class_name'], - 'Zeros') - self.assertEqual(config['config']['kernel_initializer']['class_name'], - 'Ones') - - def test_serializable_with_old_config(self): - # model config generated by tf-1.2.1 - old_model_config = { - 'class_name': - 'Sequential', - 'config': [{ - 'class_name': 'Dense', - 'config': { - 'name': 'dense_1', - 'trainable': True, - 'batch_input_shape': [None, 784], - 'dtype': 'float32', - 'units': 32, - 'activation': 'linear', - 'use_bias': True, - 'kernel_initializer': { - 'class_name': 'Ones', - 'config': { - 'dtype': 'float32' - } - }, - 'bias_initializer': { - 'class_name': 'Zeros', - 'config': { - 'dtype': 'float32' - } - }, - 'kernel_regularizer': None, - 'bias_regularizer': None, - 'activity_regularizer': None, - 'kernel_constraint': None, - 'bias_constraint': None - } - }] - } - old_model = keras.utils.generic_utils.deserialize_keras_object( - old_model_config, module_objects={'Sequential': keras.Sequential}) - new_model = keras.Sequential([ - keras.layers.Dense(32, input_dim=784, kernel_initializer='Ones'), - ]) - input_data = np.random.normal(2, 1, (5, 784)) - output = old_model.predict(input_data) - expected_output = new_model.predict(input_data) - self.assertAllEqual(output, expected_output) - - def test_deserialize_unknown_object(self): - - class CustomLayer(keras.layers.Layer): - pass - - layer = CustomLayer() - config = keras.utils.generic_utils.serialize_keras_object(layer) - with self.assertRaisesRegexp(ValueError, - 'passed to the `custom_objects` arg'): - keras.utils.generic_utils.deserialize_keras_object(config) - restored = keras.utils.generic_utils.deserialize_keras_object( - config, custom_objects={'CustomLayer': CustomLayer}) - 
self.assertIsInstance(restored, CustomLayer) + def test_serialize_none(self): + serialized = serialization_lib.serialize_keras_object(None) + self.assertEqual(serialized, None) + deserialized = serialization_lib.deserialize_keras_object(serialized) + self.assertEqual(deserialized, None) + + def test_serializable_object(self): + class SerializableInt(int): + """A serializable object to pass out of a test layer's config.""" + + def __new__(cls, value): + return int.__new__(cls, value) + + def get_config(self): + return {"value": int(self)} + + @classmethod + def from_config(cls, config): + return cls(**config) + + layer = keras.layers.Dense( + SerializableInt(3), + activation="relu", + kernel_initializer="ones", + bias_regularizer="l2", + ) + config = keras.layers.serialize(layer) + new_layer = keras.layers.deserialize( + config, custom_objects={"SerializableInt": SerializableInt} + ) + self.assertEqual(new_layer.activation, keras.activations.relu) + self.assertEqual( + new_layer.bias_regularizer.__class__, keras.regularizers.L2 + ) + self.assertEqual(new_layer.units.__class__, SerializableInt) + self.assertEqual(new_layer.units, 3) + + def test_nested_serializable_object(self): + class SerializableInt(int): + """A serializable object to pass out of a test layer's config.""" + + def __new__(cls, value): + return int.__new__(cls, value) + + def get_config(self): + return {"value": int(self)} + + @classmethod + def from_config(cls, config): + return cls(**config) + + class SerializableNestedInt(int): + """A serializable object containing another serializable object.""" + + def __new__(cls, value, int_obj): + obj = int.__new__(cls, value) + obj.int_obj = int_obj + return obj + + def get_config(self): + return {"value": int(self), "int_obj": self.int_obj} + + @classmethod + def from_config(cls, config): + return cls(**config) + + nested_int = SerializableInt(4) + layer = keras.layers.Dense( + SerializableNestedInt(3, nested_int), + name="SerializableNestedInt", + activation="relu", + kernel_initializer="ones", + bias_regularizer="l2", + ) + config = keras.layers.serialize(layer) + new_layer = keras.layers.deserialize( + config, + custom_objects={ + "SerializableInt": SerializableInt, + "SerializableNestedInt": SerializableNestedInt, + }, + ) + # Make sure the string field doesn't get convert to custom object, even + # they have same value. 
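
The pattern these tests rely on: any object implementing the `get_config`/`from_config` pair can ride through a layer config, provided `custom_objects` tells the deserializer how to resolve its name at load time. Distilled to its core (the `BoxedInt` name is illustrative):

import keras

class BoxedInt(int):
    def __new__(cls, value):
        return int.__new__(cls, value)

    def get_config(self):
        return {"value": int(self)}

    @classmethod
    def from_config(cls, config):
        return cls(**config)

layer = keras.layers.Dense(BoxedInt(8))
config = keras.layers.serialize(layer)
# Without the custom_objects mapping, BoxedInt cannot be resolved on load.
new_layer = keras.layers.deserialize(
    config, custom_objects={"BoxedInt": BoxedInt}
)
assert new_layer.units == 8
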
+ self.assertEqual(new_layer.name, "SerializableNestedInt") + self.assertEqual(new_layer.activation, keras.activations.relu) + self.assertEqual( + new_layer.bias_regularizer.__class__, keras.regularizers.L2 + ) + self.assertEqual(new_layer.units.__class__, SerializableNestedInt) + self.assertEqual(new_layer.units, 3) + self.assertEqual(new_layer.units.int_obj.__class__, SerializableInt) + self.assertEqual(new_layer.units.int_obj, 4) + + def test_nested_serializable_fn(self): + def serializable_fn(x): + """A serializable function to pass out of a test layer's config.""" + return x + + class SerializableNestedInt(int): + """A serializable object containing a serializable function.""" + + def __new__(cls, value, fn): + obj = int.__new__(cls, value) + obj.fn = fn + return obj + + def get_config(self): + return {"value": int(self), "fn": self.fn} + + @classmethod + def from_config(cls, config): + return cls(**config) + + layer = keras.layers.Dense( + SerializableNestedInt(3, serializable_fn), + activation="relu", + kernel_initializer="ones", + bias_regularizer="l2", + ) + config = keras.layers.serialize(layer) + new_layer = keras.layers.deserialize( + config, + custom_objects={ + "serializable_fn": serializable_fn, + "SerializableNestedInt": SerializableNestedInt, + }, + ) + self.assertEqual(new_layer.activation, keras.activations.relu) + self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L2) + self.assertIsInstance(new_layer.units, SerializableNestedInt) + self.assertEqual(new_layer.units, 3) + self.assertIs(new_layer.units.fn, serializable_fn) + + def test_serialize_type_object_initializer(self): + layer = keras.layers.Dense( + 1, + kernel_initializer=keras.initializers.ones, + bias_initializer=keras.initializers.zeros, + ) + config = keras.layers.serialize(layer) + self.assertEqual( + config["config"]["bias_initializer"]["class_name"], "Zeros" + ) + self.assertEqual( + config["config"]["kernel_initializer"]["class_name"], "Ones" + ) + + def test_serializable_with_old_config(self): + # model config generated by tf-1.2.1 + old_model_config = { + "class_name": "Sequential", + "config": [ + { + "class_name": "Dense", + "config": { + "name": "dense_1", + "trainable": True, + "batch_input_shape": [None, 784], + "dtype": "float32", + "units": 32, + "activation": "linear", + "use_bias": True, + "kernel_initializer": { + "class_name": "Ones", + "config": {"dtype": "float32"}, + }, + "bias_initializer": { + "class_name": "Zeros", + "config": {"dtype": "float32"}, + }, + "kernel_regularizer": None, + "bias_regularizer": None, + "activity_regularizer": None, + "kernel_constraint": None, + "bias_constraint": None, + }, + } + ], + } + old_model = serialization_lib.deserialize_keras_object( + old_model_config, module_objects={"Sequential": keras.Sequential} + ) + new_model = keras.Sequential( + [ + keras.layers.Dense( + 32, input_dim=784, kernel_initializer="Ones" + ), + ] + ) + input_data = np.random.normal(2, 1, (5, 784)) + output = old_model.predict(input_data) + expected_output = new_model.predict(input_data) + self.assertAllEqual(output, expected_output) + + def test_deserialize_unknown_object(self): + class CustomLayer(keras.layers.Layer): + pass + + layer = CustomLayer() + config = serialization_lib.serialize_keras_object(layer) + if tf.__internal__.tf2.enabled(): + with self.assertRaisesRegex( + TypeError, + "Could not locate class 'CustomLayer'. 
Make sure custom classes", # noqa: E501 + ): + serialization_lib.deserialize_keras_object(config) + else: + with self.assertRaisesRegex( + ValueError, "using a `keras.utils.custom_object_scope`" + ): + serialization.deserialize_keras_object(config) + restored = serialization_lib.deserialize_keras_object( + config, custom_objects={"CustomLayer": CustomLayer} + ) + self.assertIsInstance(restored, CustomLayer) class SliceArraysTest(tf.test.TestCase): - - def test_slice_arrays(self): - input_a = list([1, 2, 3]) - self.assertEqual( - keras.utils.generic_utils.slice_arrays(input_a, start=0), - [None, None, None]) - self.assertEqual( - keras.utils.generic_utils.slice_arrays(input_a, stop=3), - [None, None, None]) - self.assertEqual( - keras.utils.generic_utils.slice_arrays(input_a, start=0, stop=1), - [None, None, None]) + def test_slice_arrays(self): + input_a = list([1, 2, 3]) + self.assertEqual( + keras.utils.generic_utils.slice_arrays(input_a, start=0), + [None, None, None], + ) + self.assertEqual( + keras.utils.generic_utils.slice_arrays(input_a, stop=3), + [None, None, None], + ) + self.assertEqual( + keras.utils.generic_utils.slice_arrays(input_a, start=0, stop=1), + [None, None, None], + ) # object() alone isn't compatible with WeakKeyDictionary, which we use to # track shared configs. class MaybeSharedObject: - pass - + pass -class SharedObjectScopeTest(tf.test.TestCase): - def test_shared_object_saving_scope_single_object_doesnt_export_id(self): - with generic_utils.SharedObjectSavingScope() as scope: - single_object = MaybeSharedObject() - self.assertIsNone(scope.get_config(single_object)) - single_object_config = scope.create_config({}, single_object) - self.assertIsNotNone(single_object_config) - self.assertNotIn(generic_utils.SHARED_OBJECT_KEY, - single_object_config) - - def test_shared_object_saving_scope_shared_object_exports_id(self): - with generic_utils.SharedObjectSavingScope() as scope: - shared_object = MaybeSharedObject() - self.assertIsNone(scope.get_config(shared_object)) - scope.create_config({}, shared_object) - first_object_config = scope.get_config(shared_object) - second_object_config = scope.get_config(shared_object) - self.assertIn(generic_utils.SHARED_OBJECT_KEY, - first_object_config) - self.assertIn(generic_utils.SHARED_OBJECT_KEY, - second_object_config) - self.assertIs(first_object_config, second_object_config) - - def test_shared_object_loading_scope_noop(self): - # Test that, without a context manager scope, adding configs will do - # nothing. - obj_id = 1 - obj = MaybeSharedObject() - generic_utils._shared_object_loading_scope().set(obj_id, obj) - self.assertIsNone(generic_utils._shared_object_loading_scope().get(obj_id)) - - def test_shared_object_loading_scope_returns_shared_obj(self): - obj_id = 1 - obj = MaybeSharedObject() - with generic_utils.SharedObjectLoadingScope() as scope: - scope.set(obj_id, obj) - self.assertIs(scope.get(obj_id), obj) - - def test_nested_shared_object_saving_scopes(self): - my_obj = MaybeSharedObject() - with generic_utils.SharedObjectSavingScope() as scope_1: - scope_1.create_config({}, my_obj) - with generic_utils.SharedObjectSavingScope() as scope_2: - # Nesting saving scopes should return the original scope and should - # not clear any objects we're tracking. 
- self.assertIs(scope_1, scope_2) - self.assertIsNotNone(scope_2.get_config(my_obj)) - self.assertIsNotNone(scope_1.get_config(my_obj)) - self.assertIsNone(generic_utils._shared_object_saving_scope()) - - def test_custom_object_scope_correct_class(self): - train_step_message = 'This is my training step' - temp_dir = os.path.join(self.get_temp_dir(), 'my_model') - - class CustomModelX(keras.Model): - - def __init__(self, *args, **kwargs): +class CustomModelX(keras.Model): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.dense1 = keras.layers.Dense(1) + self.train_step_message = "This is my training step" - def call(self, inputs): + def call(self, inputs): return self.dense1(inputs) - def train_step(self, data): - tf.print(train_step_message) + def train_step(self, data): + tf.print(self.train_step_message) x, y = data with tf.GradientTape() as tape: - y_pred = self(x) - loss = self.compiled_loss(y, y_pred) + y_pred = self(x) + loss = self.compiled_loss(y, y_pred) gradients = tape.gradient(loss, self.trainable_variables) self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) return {} - def func_that_returns_one(self): + def func_that_returns_one(self): return 1 - subclassed_model = CustomModelX() - subclassed_model.compile(optimizer='adam', loss='mse') - - x = np.random.random((100, 32)) - y = np.random.random((100, 1)) - subclassed_model.fit(x, y, epochs=1) - subclassed_model.save(temp_dir, save_format='tf') - - with keras.utils.generic_utils.custom_object_scope( - {'CustomModelX': CustomModelX}): - loaded_model = keras.models.load_model(temp_dir) - io_utils.enable_interactive_logging() - # `tf.print` writes to stderr. - with self.captureWritesToStream(sys.stderr) as printed: - loaded_model.fit(x, y, epochs=1) - if tf.__internal__.tf2.enabled(): - # `tf.print` message is only available in stderr in TF2. Check that - # custom `train_step` is used. - self.assertRegex(printed.contents(), train_step_message) - - # Check that the custom class does get used. - self.assertIsInstance(loaded_model, CustomModelX) - # Check that the custom method is available. - self.assertEqual(loaded_model.func_that_returns_one(), 1) - - -if __name__ == '__main__': - tf.test.main() +class SharedObjectScopeTest(tf.test.TestCase): + def test_shared_object_saving_scope_single_object_doesnt_export_id(self): + with serialization.SharedObjectSavingScope() as scope: + single_object = MaybeSharedObject() + self.assertIsNone(scope.get_config(single_object)) + single_object_config = scope.create_config({}, single_object) + self.assertIsNotNone(single_object_config) + self.assertNotIn( + serialization.SHARED_OBJECT_KEY, single_object_config + ) + + def test_shared_object_saving_scope_shared_object_exports_id(self): + with serialization.SharedObjectSavingScope() as scope: + shared_object = MaybeSharedObject() + self.assertIsNone(scope.get_config(shared_object)) + scope.create_config({}, shared_object) + first_object_config = scope.get_config(shared_object) + second_object_config = scope.get_config(shared_object) + self.assertIn(serialization.SHARED_OBJECT_KEY, first_object_config) + self.assertIn(serialization.SHARED_OBJECT_KEY, second_object_config) + self.assertIs(first_object_config, second_object_config) + + def test_shared_object_loading_scope_noop(self): + # Test that, without a context manager scope, adding configs will do + # nothing. 
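
Taken together, the saving-scope tests above fix the dedup contract: the config created for an object's first sighting carries no marker, while configs handed out for repeat references to the same object carry a `SHARED_OBJECT_KEY` id. A compact sketch against the `keras.saving.legacy.serialization` module imported at the top of this test file (the `Probe` class is illustrative; a bare `object()` would not be weak-referenceable here, as the comment on `MaybeSharedObject` notes):

from keras.saving.legacy import serialization

class Probe:
    pass

obj = Probe()
with serialization.SharedObjectSavingScope() as scope:
    first = scope.create_config({}, obj)  # first sighting: no shared id
    again = scope.get_config(obj)  # repeat reference: id attached
    assert serialization.SHARED_OBJECT_KEY not in first
    assert serialization.SHARED_OBJECT_KEY in again
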
+ obj_id = 1 + obj = MaybeSharedObject() + serialization._shared_object_loading_scope().set(obj_id, obj) + self.assertIsNone( + serialization._shared_object_loading_scope().get(obj_id) + ) + + def test_shared_object_loading_scope_returns_shared_obj(self): + obj_id = 1 + obj = MaybeSharedObject() + with serialization.SharedObjectLoadingScope() as scope: + scope.set(obj_id, obj) + self.assertIs(scope.get(obj_id), obj) + + def test_nested_shared_object_saving_scopes(self): + my_obj = MaybeSharedObject() + with serialization.SharedObjectSavingScope() as scope_1: + scope_1.create_config({}, my_obj) + with serialization.SharedObjectSavingScope() as scope_2: + # Nesting saving scopes should return the original scope and + # should not clear any objects we're tracking. + self.assertIs(scope_1, scope_2) + self.assertIsNotNone(scope_2.get_config(my_obj)) + self.assertIsNotNone(scope_1.get_config(my_obj)) + self.assertIsNone(serialization._shared_object_saving_scope()) + + def test_custom_object_scope_correct_class_saved_model(self): + temp_dir = os.path.join(self.get_temp_dir(), "my_model") + + subclassed_model = CustomModelX() + subclassed_model.compile(optimizer="adam", loss="mse") + + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + subclassed_model.fit(x, y, epochs=1) + + subclassed_model.save(temp_dir, save_format="tf") + + with keras.utils.custom_object_scope({"CustomModelX": CustomModelX}): + loaded_model = keras.models.load_model(temp_dir) + + io_utils.enable_interactive_logging() + # `tf.print` writes to stderr. + with self.captureWritesToStream(sys.stderr) as printed: + loaded_model.fit(x, y, epochs=1) + if tf.__internal__.tf2.enabled(): + # `tf.print` message is only available in stderr in TF2. + # Check that custom `train_step` is used. + self.assertRegex(printed.contents(), "This is my training step") + + # Check that the custom class does get used. + self.assertIsInstance(loaded_model, CustomModelX) + # Check that the custom method is available. + self.assertEqual(loaded_model.func_that_returns_one(), 1) + + @test_utils.run_v2_only + def test_custom_object_scope_correct_class_keras_v3(self): + temp_dir = os.path.join(self.get_temp_dir(), "my_model.keras") + + subclassed_model = CustomModelX() + subclassed_model.compile(optimizer="adam", loss="mse") + + x = np.random.random((100, 32)) + y = np.random.random((100, 1)) + subclassed_model.fit(x, y, epochs=1) + + subclassed_model.save(temp_dir, save_format="keras_v3") + + with keras.utils.custom_object_scope({"CustomModelX": CustomModelX}): + loaded_model = keras.models.load_model(temp_dir) + + io_utils.enable_interactive_logging() + # `tf.print` writes to stderr. + with self.captureWritesToStream(sys.stderr) as printed: + loaded_model.fit(x, y, epochs=1) + if tf.__internal__.tf2.enabled(): + # `tf.print` message is only available in stderr in TF2. + # Check that custom `train_step` is used. + self.assertRegex(printed.contents(), "This is my training step") + + # Check that the custom class does get used. + self.assertIsInstance(loaded_model, CustomModelX) + # Check that the custom method is available. 
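
Both custom-object-scope tests above follow one pattern: train and save the subclassed model, then re-enter a `custom_object_scope` so `load_model` can resolve `CustomModelX`; the keras_v3 variant only swaps in a ".keras" path and `save_format="keras_v3"`. The essence of that round trip, assuming the `CustomModelX` class defined earlier in this file (the path is illustrative):

import numpy as np
import keras

model = CustomModelX()
model.compile(optimizer="adam", loss="mse")
model.fit(np.random.random((8, 32)), np.random.random((8, 1)), epochs=1)
model.save("/tmp/my_model", save_format="tf")

with keras.utils.custom_object_scope({"CustomModelX": CustomModelX}):
    # Resolves the saved class name back to the Python class in scope.
    loaded = keras.models.load_model("/tmp/my_model")
assert isinstance(loaded, CustomModelX)
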
+ self.assertEqual(loaded_model.func_that_returns_one(), 1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/image_dataset.py b/keras/utils/image_dataset.py index 96fe7d3dd01a..fef22dda4911 100644 --- a/keras/utils/image_dataset.py +++ b/keras/utils/image_dataset.py @@ -14,308 +14,365 @@ # ============================================================================== """Keras image dataset loading utilities.""" +import numpy as np import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes -import numpy as np from keras.utils import dataset_utils from keras.utils import image_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export -ALLOWLIST_FORMATS = ('.bmp', '.gif', '.jpeg', '.jpg', '.png') - - -@keras_export('keras.utils.image_dataset_from_directory', - 'keras.preprocessing.image_dataset_from_directory', - v1=[]) -def image_dataset_from_directory(directory, - labels='inferred', - label_mode='int', - class_names=None, - color_mode='rgb', - batch_size=32, - image_size=(256, 256), - shuffle=True, - seed=None, - validation_split=None, - subset=None, - interpolation='bilinear', - follow_links=False, - crop_to_aspect_ratio=False, - **kwargs): - """Generates a `tf.data.Dataset` from image files in a directory. - - If your directory structure is: - - ``` - main_directory/ - ...class_a/ - ......a_image_1.jpg - ......a_image_2.jpg - ...class_b/ - ......b_image_1.jpg - ......b_image_2.jpg - ``` - - Then calling `image_dataset_from_directory(main_directory, labels='inferred')` - will return a `tf.data.Dataset` that yields batches of images from - the subdirectories `class_a` and `class_b`, together with labels - 0 and 1 (0 corresponding to `class_a` and 1 corresponding to `class_b`). - - Supported image formats: jpeg, png, bmp, gif. - Animated gifs are truncated to the first frame. - - Args: - directory: Directory where the data is located. - If `labels` is "inferred", it should contain - subdirectories, each containing images for a class. - Otherwise, the directory structure is ignored. - labels: Either "inferred" - (labels are generated from the directory structure), - None (no labels), - or a list/tuple of integer labels of the same size as the number of - image files found in the directory. Labels should be sorted according - to the alphanumeric order of the image file paths - (obtained via `os.walk(directory)` in Python). - label_mode: String describing the encoding of `labels`. Options are: - - 'int': means that the labels are encoded as integers - (e.g. for `sparse_categorical_crossentropy` loss). - - 'categorical' means that the labels are - encoded as a categorical vector - (e.g. for `categorical_crossentropy` loss). - - 'binary' means that the labels (there can be only 2) - are encoded as `float32` scalars with values 0 or 1 - (e.g. for `binary_crossentropy`). - - None (no labels). - class_names: Only valid if "labels" is "inferred". This is the explicit - list of class names (must match names of subdirectories). Used - to control the order of the classes - (otherwise alphanumerical order is used). - color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb". - Whether the images will be converted to - have 1, 3, or 4 channels. - batch_size: Size of the batches of data. Default: 32. - If `None`, the data will not be batched - (the dataset will yield individual samples). - image_size: Size to resize images to after they are read from disk, - specified as `(height, width)`. Defaults to `(256, 256)`. 
- Since the pipeline processes batches of images that must all have - the same size, this must be provided. - shuffle: Whether to shuffle the data. Default: True. - If set to False, sorts the data in alphanumeric order. - seed: Optional random seed for shuffling and transformations. - validation_split: Optional float between 0 and 1, - fraction of data to reserve for validation. - subset: Subset of the data to return. - One of "training", "validation" or "both". - Only used if `validation_split` is set. - When `subset="both"`, the utility returns a tuple of two datasets - (the training and validation datasets respectively). - interpolation: String, the interpolation method used when resizing images. - Defaults to `bilinear`. Supports `bilinear`, `nearest`, `bicubic`, - `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`. - follow_links: Whether to visits subdirectories pointed to by symlinks. - Defaults to False. - crop_to_aspect_ratio: If True, resize the images without aspect - ratio distortion. When the original aspect ratio differs from the target - aspect ratio, the output image will be cropped so as to return the largest - possible window in the image (of size `image_size`) that matches - the target aspect ratio. By default (`crop_to_aspect_ratio=False`), - aspect ratio may not be preserved. - **kwargs: Legacy keyword arguments. - - Returns: +ALLOWLIST_FORMATS = (".bmp", ".gif", ".jpeg", ".jpg", ".png") + + +@keras_export( + "keras.utils.image_dataset_from_directory", + "keras.preprocessing.image_dataset_from_directory", + v1=[], +) +def image_dataset_from_directory( + directory, + labels="inferred", + label_mode="int", + class_names=None, + color_mode="rgb", + batch_size=32, + image_size=(256, 256), + shuffle=True, + seed=None, + validation_split=None, + subset=None, + interpolation="bilinear", + follow_links=False, + crop_to_aspect_ratio=False, + **kwargs, +): + """Generates a `tf.data.Dataset` from image files in a directory. + + If your directory structure is: + + ``` + main_directory/ + ...class_a/ + ......a_image_1.jpg + ......a_image_2.jpg + ...class_b/ + ......b_image_1.jpg + ......b_image_2.jpg + ``` + + Then calling `image_dataset_from_directory(main_directory, + labels='inferred')` will return a `tf.data.Dataset` that yields batches of + images from the subdirectories `class_a` and `class_b`, together with labels + 0 and 1 (0 corresponding to `class_a` and 1 corresponding to `class_b`). + + Supported image formats: `.jpeg`, `.jpg`, `.png`, `.bmp`, `.gif`. + Animated gifs are truncated to the first frame. + + Args: + directory: Directory where the data is located. + If `labels` is `"inferred"`, it should contain + subdirectories, each containing images for a class. + Otherwise, the directory structure is ignored. + labels: Either `"inferred"` + (labels are generated from the directory structure), + `None` (no labels), + or a list/tuple of integer labels of the same size as the number of + image files found in the directory. Labels should be sorted + according to the alphanumeric order of the image file paths + (obtained via `os.walk(directory)` in Python). + label_mode: String describing the encoding of `labels`. Options are: + - `"int"`: means that the labels are encoded as integers + (e.g. for `sparse_categorical_crossentropy` loss). + - `"categorical"` means that the labels are + encoded as a categorical vector + (e.g. for `categorical_crossentropy` loss). 
+ - `"binary"` means that the labels (there can be only 2) + are encoded as `float32` scalars with values 0 or 1 + (e.g. for `binary_crossentropy`). + - `None` (no labels). + class_names: Only valid if `labels` is `"inferred"`. + This is the explicit list of class names + (must match names of subdirectories). Used to control the order + of the classes (otherwise alphanumerical order is used). + color_mode: One of `"grayscale"`, `"rgb"`, `"rgba"`. + Defaults to `"rgb"`. Whether the images will be converted to + have 1, 3, or 4 channels. + batch_size: Size of the batches of data. + If `None`, the data will not be batched + (the dataset will yield individual samples). Defaults to 32. + image_size: Size to resize images to after they are read from disk, + specified as `(height, width)`. + Since the pipeline processes batches of images that must all have + the same size, this must be provided. Defaults to `(256, 256)`. + shuffle: Whether to shuffle the data. Defaults to `True`. + If set to `False`, sorts the data in alphanumeric order. + seed: Optional random seed for shuffling and transformations. + validation_split: Optional float between 0 and 1, + fraction of data to reserve for validation. + subset: Subset of the data to return. + One of `"training"`, `"validation"`, or `"both"`. + Only used if `validation_split` is set. + When `subset="both"`, the utility returns a tuple of two datasets + (the training and validation datasets respectively). + interpolation: String, the interpolation method used when + resizing images. Defaults to `"bilinear"`. + Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`, + `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`. + follow_links: Whether to visit subdirectories pointed to by symlinks. + Defaults to `False`. + crop_to_aspect_ratio: If `True`, resize the images without aspect + ratio distortion. When the original aspect ratio differs from the + target aspect ratio, the output image will be cropped so as to + return the largest possible window in the image + (of size `image_size`) that matches the target aspect ratio. By + default (`crop_to_aspect_ratio=False`), aspect ratio may not be + preserved. + **kwargs: Legacy keyword arguments. + + Returns: + A `tf.data.Dataset` object. - - If `label_mode` is None, it yields `float32` tensors of shape + + - If `label_mode` is `None`, it yields `float32` tensors of shape `(batch_size, image_size[0], image_size[1], num_channels)`, encoding images (see below for rules regarding `num_channels`). - - Otherwise, it yields a tuple `(images, labels)`, where `images` - has shape `(batch_size, image_size[0], image_size[1], num_channels)`, + - Otherwise, it yields a tuple `(images, labels)`, where `images` has + shape `(batch_size, image_size[0], image_size[1], num_channels)`, and `labels` follows the format described below. - Rules regarding labels format: - - if `label_mode` is `int`, the labels are an `int32` tensor of shape - `(batch_size,)`. - - if `label_mode` is `binary`, the labels are a `float32` tensor of - 1s and 0s of shape `(batch_size, 1)`. - - if `label_mode` is `categorical`, the labels are a `float32` tensor - of shape `(batch_size, num_classes)`, representing a one-hot - encoding of the class index. - - Rules regarding number of channels in the yielded images: - - if `color_mode` is `grayscale`, - there's 1 channel in the image tensors. - - if `color_mode` is `rgb`, - there are 3 channel in the image tensors. - - if `color_mode` is `rgba`, - there are 4 channel in the image tensors. 
- """ - if 'smart_resize' in kwargs: - crop_to_aspect_ratio = kwargs.pop('smart_resize') - if kwargs: - raise TypeError(f'Unknown keywords argument(s): {tuple(kwargs.keys())}') - if labels not in ('inferred', None): - if not isinstance(labels, (list, tuple)): - raise ValueError( - '`labels` argument should be a list/tuple of integer labels, of ' - 'the same size as the number of image files in the target ' - 'directory. If you wish to infer the labels from the subdirectory ' - 'names in the target directory, pass `labels="inferred"`. ' - 'If you wish to get a dataset that only contains images ' - f'(no labels), pass `labels=None`. Received: labels={labels}') - if class_names: - raise ValueError('You can only pass `class_names` if ' - f'`labels="inferred"`. Received: labels={labels}, and ' - f'class_names={class_names}') - if label_mode not in {'int', 'categorical', 'binary', None}: - raise ValueError( - '`label_mode` argument must be one of "int", "categorical", "binary", ' - f'or None. Received: label_mode={label_mode}') - if labels is None or label_mode is None: - labels = None - label_mode = None - if color_mode == 'rgb': - num_channels = 3 - elif color_mode == 'rgba': - num_channels = 4 - elif color_mode == 'grayscale': - num_channels = 1 - else: - raise ValueError( - '`color_mode` must be one of {"rgb", "rgba", "grayscale"}. ' - f'Received: color_mode={color_mode}') - interpolation = image_utils.get_interpolation(interpolation) - dataset_utils.check_validation_split_arg( - validation_split, subset, shuffle, seed) - - if seed is None: - seed = np.random.randint(1e6) - image_paths, labels, class_names = dataset_utils.index_directory( - directory, - labels, - formats=ALLOWLIST_FORMATS, - class_names=class_names, - shuffle=shuffle, - seed=seed, - follow_links=follow_links) - - if label_mode == 'binary' and len(class_names) != 2: - raise ValueError( - f'When passing `label_mode="binary"`, there must be exactly 2 ' - f'class_names. Received: class_names={class_names}') - - if subset == 'both': - image_paths_train, labels_train = dataset_utils.get_training_or_validation_split( - image_paths, labels, validation_split, 'training') - image_paths_val, labels_val = dataset_utils.get_training_or_validation_split( - image_paths, labels, validation_split, 'validation') - if not image_paths_train: - raise ValueError(f'No training images found in directory {directory}. ' - f'Allowed formats: {ALLOWLIST_FORMATS}') - if not image_paths_val: - raise ValueError(f'No validation images found in directory {directory}. 
' - f'Allowed formats: {ALLOWLIST_FORMATS}') - train_dataset = paths_and_labels_to_dataset( - image_paths=image_paths_train, - image_size=image_size, - num_channels=num_channels, - labels=labels_train, - label_mode=label_mode, - num_classes=len(class_names), - interpolation=interpolation, - crop_to_aspect_ratio=crop_to_aspect_ratio) - val_dataset = paths_and_labels_to_dataset( - image_paths=image_paths_val, - image_size=image_size, - num_channels=num_channels, - labels=labels_val, - label_mode=label_mode, - num_classes=len(class_names), - interpolation=interpolation, - crop_to_aspect_ratio=crop_to_aspect_ratio) - train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE) - val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE) - if batch_size is not None: - if shuffle: - # Shuffle locally at each iteration - train_dataset = train_dataset.shuffle( - buffer_size=batch_size * 8, seed=seed) - train_dataset = train_dataset.batch(batch_size) - val_dataset = val_dataset.batch(batch_size) + Rules regarding labels format: + + - if `label_mode` is `"int"`, the labels are an `int32` tensor of shape + `(batch_size,)`. + - if `label_mode` is `"binary"`, the labels are a `float32` tensor of + 1s and 0s of shape `(batch_size, 1)`. + - if `label_mode` is `"categorical"`, the labels are a `float32` tensor + of shape `(batch_size, num_classes)`, representing a one-hot + encoding of the class index. + + Rules regarding number of channels in the yielded images: + + - if `color_mode` is `"grayscale"`, + there's 1 channel in the image tensors. + - if `color_mode` is `"rgb"`, + there are 3 channels in the image tensors. + - if `color_mode` is `"rgba"`, + there are 4 channels in the image tensors. + """ + if "smart_resize" in kwargs: + crop_to_aspect_ratio = kwargs.pop("smart_resize") + if kwargs: + raise TypeError(f"Unknown keyword argument(s): {tuple(kwargs.keys())}") + if labels not in ("inferred", None): + if not isinstance(labels, (list, tuple)): + raise ValueError( + "`labels` argument should be a list/tuple of integer labels, " + "of the same size as the number of image files in the target " + "directory. If you wish to infer the labels from the " + "subdirectory " + 'names in the target directory, pass `labels="inferred"`. ' + "If you wish to get a dataset that only contains images " + f"(no labels), pass `labels=None`. Received: labels={labels}" + ) + if class_names: + raise ValueError( + "You can only pass `class_names` if " + f'`labels="inferred"`. Received: labels={labels}, and ' + f"class_names={class_names}" + ) + if label_mode not in {"int", "categorical", "binary", None}: + raise ValueError( + '`label_mode` argument must be one of "int", ' + '"categorical", "binary", ' + f"or None. Received: label_mode={label_mode}" + ) + if labels is None or label_mode is None: + labels = None + label_mode = None + if color_mode == "rgb": + num_channels = 3 + elif color_mode == "rgba": + num_channels = 4 + elif color_mode == "grayscale": + num_channels = 1 else: - if shuffle: - train_dataset = train_dataset.shuffle(buffer_size=1024, seed=seed) - - # Users may need to reference `class_names`. - train_dataset.class_names = class_names - val_dataset.class_names = class_names - # Include file paths for images as attribute.
- train_dataset.file_paths = image_paths_train - val_dataset.file_paths = image_paths_val - dataset = [train_dataset, val_dataset] - else: - image_paths, labels = dataset_utils.get_training_or_validation_split( - image_paths, labels, validation_split, subset) - if not image_paths: - raise ValueError(f'No images found in directory {directory}. ' - f'Allowed formats: {ALLOWLIST_FORMATS}') - - dataset = paths_and_labels_to_dataset( - image_paths=image_paths, - image_size=image_size, - num_channels=num_channels, - labels=labels, - label_mode=label_mode, - num_classes=len(class_names), - interpolation=interpolation, - crop_to_aspect_ratio=crop_to_aspect_ratio) - dataset = dataset.prefetch(tf.data.AUTOTUNE) - if batch_size is not None: - if shuffle: - # Shuffle locally at each iteration - dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) - dataset = dataset.batch(batch_size) + raise ValueError( + '`color_mode` must be one of {"rgb", "rgba", "grayscale"}. ' + f"Received: color_mode={color_mode}" + ) + interpolation = image_utils.get_interpolation(interpolation) + dataset_utils.check_validation_split_arg( + validation_split, subset, shuffle, seed + ) + + if seed is None: + seed = np.random.randint(1e6) + image_paths, labels, class_names = dataset_utils.index_directory( + directory, + labels, + formats=ALLOWLIST_FORMATS, + class_names=class_names, + shuffle=shuffle, + seed=seed, + follow_links=follow_links, + ) + + if label_mode == "binary" and len(class_names) != 2: + raise ValueError( + 'When passing `label_mode="binary"`, there must be exactly 2 ' + f"class_names. Received: class_names={class_names}" + ) + + if subset == "both": + ( + image_paths_train, + labels_train, + ) = dataset_utils.get_training_or_validation_split( + image_paths, labels, validation_split, "training" + ) + ( + image_paths_val, + labels_val, + ) = dataset_utils.get_training_or_validation_split( + image_paths, labels, validation_split, "validation" + ) + if not image_paths_train: + raise ValueError( + f"No training images found in directory {directory}. " + f"Allowed formats: {ALLOWLIST_FORMATS}" + ) + if not image_paths_val: + raise ValueError( + f"No validation images found in directory {directory}. " + f"Allowed formats: {ALLOWLIST_FORMATS}" + ) + train_dataset = paths_and_labels_to_dataset( + image_paths=image_paths_train, + image_size=image_size, + num_channels=num_channels, + labels=labels_train, + label_mode=label_mode, + num_classes=len(class_names), + interpolation=interpolation, + crop_to_aspect_ratio=crop_to_aspect_ratio, + ) + val_dataset = paths_and_labels_to_dataset( + image_paths=image_paths_val, + image_size=image_size, + num_channels=num_channels, + labels=labels_val, + label_mode=label_mode, + num_classes=len(class_names), + interpolation=interpolation, + crop_to_aspect_ratio=crop_to_aspect_ratio, + ) + + if batch_size is not None: + if shuffle: + # Shuffle locally at each iteration + train_dataset = train_dataset.shuffle( + buffer_size=batch_size * 8, seed=seed + ) + train_dataset = train_dataset.batch(batch_size) + val_dataset = val_dataset.batch(batch_size) + else: + if shuffle: + train_dataset = train_dataset.shuffle( + buffer_size=1024, seed=seed + ) + + train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE) + val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE) + + # Users may need to reference `class_names`. + train_dataset.class_names = class_names + val_dataset.class_names = class_names + + # Include file paths for images as attribute. 
+ train_dataset.file_paths = image_paths_train + val_dataset.file_paths = image_paths_val + dataset = [train_dataset, val_dataset] + else: + image_paths, labels = dataset_utils.get_training_or_validation_split( + image_paths, labels, validation_split, subset + ) + if not image_paths: + raise ValueError( + f"No images found in directory {directory}. " + f"Allowed formats: {ALLOWLIST_FORMATS}" + ) + + dataset = paths_and_labels_to_dataset( + image_paths=image_paths, + image_size=image_size, + num_channels=num_channels, + labels=labels, + label_mode=label_mode, + num_classes=len(class_names), + interpolation=interpolation, + crop_to_aspect_ratio=crop_to_aspect_ratio, + ) + + if batch_size is not None: + if shuffle: + # Shuffle locally at each iteration + dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) + dataset = dataset.batch(batch_size) + else: + if shuffle: + dataset = dataset.shuffle(buffer_size=1024, seed=seed) + + dataset = dataset.prefetch(tf.data.AUTOTUNE) + + # Users may need to reference `class_names`. + dataset.class_names = class_names + + # Include file paths for images as attribute. + dataset.file_paths = image_paths + return dataset + + +def paths_and_labels_to_dataset( + image_paths, + image_size, + num_channels, + labels, + label_mode, + num_classes, + interpolation, + crop_to_aspect_ratio=False, +): + """Constructs a dataset of images and labels.""" + # TODO(fchollet): consider making num_parallel_calls settable + path_ds = tf.data.Dataset.from_tensor_slices(image_paths) + args = (image_size, num_channels, interpolation, crop_to_aspect_ratio) + img_ds = path_ds.map( + lambda x: load_image(x, *args), num_parallel_calls=tf.data.AUTOTUNE + ) + if label_mode: + label_ds = dataset_utils.labels_to_dataset( + labels, label_mode, num_classes + ) + img_ds = tf.data.Dataset.zip((img_ds, label_ds)) + return img_ds + + +def load_image( + path, image_size, num_channels, interpolation, crop_to_aspect_ratio=False +): + """Load an image from a path and resize it.""" + img = tf.io.read_file(path) + img = tf.image.decode_image( + img, channels=num_channels, expand_animations=False + ) + if crop_to_aspect_ratio: + img = image_utils.smart_resize( + img, image_size, interpolation=interpolation + ) else: - if shuffle: - dataset = dataset.shuffle(buffer_size=1024, seed=seed) - - # Users may need to reference `class_names`. - dataset.class_names = class_names - # Include file paths for images as attribute. 
- dataset.file_paths = image_paths - return dataset - - -def paths_and_labels_to_dataset(image_paths, - image_size, - num_channels, - labels, - label_mode, - num_classes, - interpolation, - crop_to_aspect_ratio=False): - """Constructs a dataset of images and labels.""" - # TODO(fchollet): consider making num_parallel_calls settable - path_ds = tf.data.Dataset.from_tensor_slices(image_paths) - args = (image_size, num_channels, interpolation, crop_to_aspect_ratio) - img_ds = path_ds.map( - lambda x: load_image(x, *args), num_parallel_calls=tf.data.AUTOTUNE) - if label_mode: - label_ds = dataset_utils.labels_to_dataset(labels, label_mode, num_classes) - img_ds = tf.data.Dataset.zip((img_ds, label_ds)) - return img_ds - - -def load_image(path, image_size, num_channels, interpolation, - crop_to_aspect_ratio=False): - """Load an image from a path and resize it.""" - img = tf.io.read_file(path) - img = tf.image.decode_image( - img, channels=num_channels, expand_animations=False) - if crop_to_aspect_ratio: - img = image_utils.smart_resize(img, image_size, interpolation=interpolation) - else: - img = tf.image.resize(img, image_size, method=interpolation) - img.set_shape((image_size[0], image_size[1], num_channels)) - return img + img = tf.image.resize(img, image_size, method=interpolation) + img.set_shape((image_size[0], image_size[1], num_channels)) + return img diff --git a/keras/utils/image_dataset_test.py b/keras/utils/image_dataset_test.py index fa6f9f61fafa..cc4c26c2408b 100644 --- a/keras/utils/image_dataset_test.py +++ b/keras/utils/image_dataset_test.py @@ -14,368 +14,442 @@ # ============================================================================== """Tests for image_dataset.""" -import tensorflow.compat.v2 as tf - import os import shutil import numpy as np +import tensorflow.compat.v2 as tf + from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import image_dataset from keras.utils import image_utils try: - import PIL # pylint:disable=g-import-not-at-top + import PIL except ImportError: - PIL = None + PIL = None @test_utils.run_v2_only class ImageDatasetFromDirectoryTest(test_combinations.TestCase): - - def _get_images(self, count=16, color_mode='rgb'): - width = height = 24 - imgs = [] - for _ in range(count): - if color_mode == 'grayscale': - img = np.random.randint(0, 256, size=(height, width, 1)) - elif color_mode == 'rgba': - img = np.random.randint(0, 256, size=(height, width, 4)) - else: - img = np.random.randint(0, 256, size=(height, width, 3)) - img = image_utils.array_to_img(img) - imgs.append(img) - return imgs - - def _prepare_directory(self, - num_classes=2, - grayscale=False, - nested_dirs=False, - color_mode='rgb', - count=16): - # Get a unique temp directory - temp_dir = os.path.join(self.get_temp_dir(), str(np.random.randint(1e6))) - os.mkdir(temp_dir) - self.addCleanup(shutil.rmtree, temp_dir) - - # Generate paths to class subdirectories - paths = [] - for class_index in range(num_classes): - class_directory = 'class_%s' % (class_index,) - if nested_dirs: - class_paths = [ - class_directory, os.path.join(class_directory, 'subfolder_1'), - os.path.join(class_directory, 'subfolder_2'), os.path.join( - class_directory, 'subfolder_1', 'sub-subfolder') - ] - else: - class_paths = [class_directory] - for path in class_paths: - os.mkdir(os.path.join(temp_dir, path)) - paths += class_paths - - # Save images to the paths - i = 0 - for img in self._get_images(color_mode=color_mode, count=count): - path = paths[i % 
len(paths)] - if color_mode == 'rgb': - ext = 'jpg' - else: - ext = 'png' - filename = os.path.join(path, 'image_%s.%s' % (i, ext)) - img.save(os.path.join(temp_dir, filename)) - i += 1 - return temp_dir - - def test_image_dataset_from_directory_standalone(self): - # Test retrieving images without labels from a directory and its subdirs. - if PIL is None: - return # Skip test if PIL is not available. - - # Save a few extra images in the parent directory. - directory = self._prepare_directory(count=7, num_classes=2) - for i, img in enumerate(self._get_images(3)): - filename = 'image_%s.jpg' % (i,) - img.save(os.path.join(directory, filename)) - - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=5, image_size=(18, 18), labels=None) - batch = next(iter(dataset)) - # We return plain images - self.assertEqual(batch.shape, (5, 18, 18, 3)) - self.assertEqual(batch.dtype.name, 'float32') - # Count samples - batch_count = 0 - sample_count = 0 - for batch in dataset: - batch_count += 1 - sample_count += batch.shape[0] - self.assertEqual(batch_count, 2) - self.assertEqual(sample_count, 10) - - def test_image_dataset_from_directory_binary(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=2) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode='int') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 3)) - self.assertEqual(batch[0].dtype.name, 'float32') - self.assertEqual(batch[1].shape, (8,)) - self.assertEqual(batch[1].dtype.name, 'int32') - - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode='binary') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 3)) - self.assertEqual(batch[0].dtype.name, 'float32') - self.assertEqual(batch[1].shape, (8, 1)) - self.assertEqual(batch[1].dtype.name, 'float32') - - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode='categorical') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 3)) - self.assertEqual(batch[0].dtype.name, 'float32') - self.assertEqual(batch[1].shape, (8, 2)) - self.assertEqual(batch[1].dtype.name, 'float32') - - def test_static_shape_in_graph(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=2) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode='int') - test_case = self - - @tf.function - def symbolic_fn(ds): - for x, _ in ds.take(1): - test_case.assertListEqual(x.shape.as_list(), [None, 18, 18, 3]) - - symbolic_fn(dataset) - - def test_sample_count(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=4, count=15) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode=None) - sample_count = 0 - for batch in dataset: - sample_count += batch.shape[0] - self.assertEqual(sample_count, 15) - - def test_image_dataset_from_directory_multiclass(self): - if PIL is None: - return # Skip test if PIL is not available. 
- - directory = self._prepare_directory(num_classes=4, count=15) - - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode=None) - batch = next(iter(dataset)) - self.assertEqual(batch.shape, (8, 18, 18, 3)) - - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode=None) - sample_count = 0 - iterator = iter(dataset) - for batch in dataset: - sample_count += next(iterator).shape[0] - self.assertEqual(sample_count, 15) - - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode='int') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 3)) - self.assertEqual(batch[0].dtype.name, 'float32') - self.assertEqual(batch[1].shape, (8,)) - self.assertEqual(batch[1].dtype.name, 'int32') - - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode='categorical') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 3)) - self.assertEqual(batch[0].dtype.name, 'float32') - self.assertEqual(batch[1].shape, (8, 4)) - self.assertEqual(batch[1].dtype.name, 'float32') - - def test_image_dataset_from_directory_color_modes(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=4, color_mode='rgba') - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), color_mode='rgba') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 4)) - self.assertEqual(batch[0].dtype.name, 'float32') - - directory = self._prepare_directory(num_classes=4, color_mode='grayscale') - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), color_mode='grayscale') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 1)) - self.assertEqual(batch[0].dtype.name, 'float32') - - def test_image_dataset_from_directory_validation_split(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=2, count=10) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=10, image_size=(18, 18), - validation_split=0.2, subset='training', seed=1337) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 3)) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=10, image_size=(18, 18), - validation_split=0.2, subset='validation', seed=1337) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (2, 18, 18, 3)) - - train_dataset, val_dataset = image_dataset.image_dataset_from_directory( - directory, - batch_size=10, - image_size=(18, 18), - validation_split=0.2, - subset='both', - seed=1337) - batch = next(iter(train_dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8, 18, 18, 3)) - batch = next(iter(val_dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (2, 18, 18, 3)) - - def test_image_dataset_from_directory_manual_labels(self): - if PIL is None: - return # Skip test if PIL is not available. 
- - directory = self._prepare_directory(num_classes=2, count=2) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), - labels=[0, 1], shuffle=False) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertAllClose(batch[1], [0, 1]) - - def test_image_dataset_from_directory_follow_links(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=2, count=25, - nested_dirs=True) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=8, image_size=(18, 18), label_mode=None, - follow_links=True) - sample_count = 0 - for batch in dataset: - sample_count += batch.shape[0] - self.assertEqual(sample_count, 25) - - def test_image_dataset_from_directory_no_images(self): - directory = self._prepare_directory(num_classes=2, count=0) - with self.assertRaisesRegex(ValueError, 'No images found.'): - _ = image_dataset.image_dataset_from_directory(directory) - - def test_image_dataset_from_directory_crop_to_aspect_ratio(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=2, count=5) - dataset = image_dataset.image_dataset_from_directory( - directory, batch_size=5, image_size=(18, 18), crop_to_aspect_ratio=True) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (5, 18, 18, 3)) - - def test_image_dataset_from_directory_errors(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=3, count=5) - - with self.assertRaisesRegex(ValueError, '`labels` argument should be'): - _ = image_dataset.image_dataset_from_directory( - directory, labels='other') - - with self.assertRaisesRegex(ValueError, '`label_mode` argument must be'): - _ = image_dataset.image_dataset_from_directory( - directory, label_mode='other') - - with self.assertRaisesRegex(ValueError, '`color_mode` must be one of'): - _ = image_dataset.image_dataset_from_directory( - directory, color_mode='other') - - with self.assertRaisesRegex( - ValueError, 'only pass `class_names` if `labels="inferred"`'): - _ = image_dataset.image_dataset_from_directory( - directory, labels=[0, 0, 1, 1, 1], - class_names=['class_0', 'class_1', 'class_2']) - - with self.assertRaisesRegex( - ValueError, - 'Expected the lengths of `labels` to match the number of files'): - _ = image_dataset.image_dataset_from_directory( - directory, labels=[0, 0, 1, 1]) - - with self.assertRaisesRegex( - ValueError, '`class_names` passed did not match'): - _ = image_dataset.image_dataset_from_directory( - directory, class_names=['class_0', 'class_2']) - - with self.assertRaisesRegex(ValueError, 'there must be exactly 2'): - _ = image_dataset.image_dataset_from_directory( - directory, label_mode='binary') - - with self.assertRaisesRegex(ValueError, - '`validation_split` must be between 0 and 1'): - _ = image_dataset.image_dataset_from_directory( - directory, validation_split=2) - - with self.assertRaisesRegex( - ValueError, '`subset` must be either "training", ' - '"validation" or "both"'): - _ = image_dataset.image_dataset_from_directory( - directory, validation_split=0.2, subset='other') - - with self.assertRaisesRegex(ValueError, '`validation_split` must be set'): - _ = image_dataset.image_dataset_from_directory( - directory, validation_split=0, subset='training') - - with self.assertRaisesRegex(ValueError, 'must provide a `seed`'): - _ = 
image_dataset.image_dataset_from_directory( - directory, validation_split=0.2, subset='training') - - def test_image_dataset_from_directory_not_batched(self): - if PIL is None: - return # Skip test if PIL is not available. - - directory = self._prepare_directory(num_classes=2, count=2) - dataset = image_dataset.image_dataset_from_directory( - directory, - batch_size=None, - image_size=(18, 18), - label_mode=None, - shuffle=False) - sample = next(iter(dataset)) - self.assertEqual(len(sample.shape), 3) - -if __name__ == '__main__': - tf.test.main() + def _get_images(self, count=16, color_mode="rgb"): + width = height = 24 + imgs = [] + for _ in range(count): + if color_mode == "grayscale": + img = np.random.randint(0, 256, size=(height, width, 1)) + elif color_mode == "rgba": + img = np.random.randint(0, 256, size=(height, width, 4)) + else: + img = np.random.randint(0, 256, size=(height, width, 3)) + img = image_utils.array_to_img(img) + imgs.append(img) + return imgs + + def _prepare_directory( + self, + num_classes=2, + grayscale=False, + nested_dirs=False, + color_mode="rgb", + count=16, + ): + # Get a unique temp directory + temp_dir = os.path.join( + self.get_temp_dir(), str(np.random.randint(1e6)) + ) + os.mkdir(temp_dir) + self.addCleanup(shutil.rmtree, temp_dir) + + # Generate paths to class subdirectories + paths = [] + for class_index in range(num_classes): + class_directory = f"class_{class_index}" + if nested_dirs: + class_paths = [ + class_directory, + os.path.join(class_directory, "subfolder_1"), + os.path.join(class_directory, "subfolder_2"), + os.path.join( + class_directory, "subfolder_1", "sub-subfolder" + ), + ] + else: + class_paths = [class_directory] + for path in class_paths: + os.mkdir(os.path.join(temp_dir, path)) + paths += class_paths + + # Save images to the paths + i = 0 + for img in self._get_images(color_mode=color_mode, count=count): + path = paths[i % len(paths)] + if color_mode == "rgb": + ext = "jpg" + else: + ext = "png" + filename = os.path.join(path, f"image_{i}.{ext}") + img.save(os.path.join(temp_dir, filename)) + i += 1 + return temp_dir + + def test_image_dataset_from_directory_standalone(self): + # Test retrieving images without labels from a directory and its + # subdirs. + if PIL is None: + return # Skip test if PIL is not available. + + # Save a few extra images in the parent directory. + directory = self._prepare_directory(count=7, num_classes=2) + for i, img in enumerate(self._get_images(3)): + filename = f"image_{i}.jpg" + img.save(os.path.join(directory, filename)) + + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=5, image_size=(18, 18), labels=None + ) + batch = next(iter(dataset)) + # We return plain images + self.assertEqual(batch.shape, (5, 18, 18, 3)) + self.assertEqual(batch.dtype.name, "float32") + # Count samples + batch_count = 0 + sample_count = 0 + for batch in dataset: + batch_count += 1 + sample_count += batch.shape[0] + self.assertEqual(batch_count, 2) + self.assertEqual(sample_count, 10) + + def test_image_dataset_from_directory_binary(self): + if PIL is None: + return # Skip test if PIL is not available. 
+ + directory = self._prepare_directory(num_classes=2) + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode="int" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8,)) + self.assertEqual(batch[1].dtype.name, "int32") + + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode="binary" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8, 1)) + self.assertEqual(batch[1].dtype.name, "float32") + + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=8, + image_size=(18, 18), + label_mode="categorical", + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8, 2)) + self.assertEqual(batch[1].dtype.name, "float32") + + def test_static_shape_in_graph(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=2) + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode="int" + ) + test_case = self + + @tf.function + def symbolic_fn(ds): + for x, _ in ds.take(1): + test_case.assertListEqual(x.shape.as_list(), [None, 18, 18, 3]) + + symbolic_fn(dataset) + + def test_sample_count(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=4, count=15) + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode=None + ) + sample_count = 0 + for batch in dataset: + sample_count += batch.shape[0] + self.assertEqual(sample_count, 15) + + def test_image_dataset_from_directory_multiclass(self): + if PIL is None: + return # Skip test if PIL is not available. 
+ + directory = self._prepare_directory(num_classes=4, count=15) + + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode=None + ) + batch = next(iter(dataset)) + self.assertEqual(batch.shape, (8, 18, 18, 3)) + + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode=None + ) + sample_count = 0 + iterator = iter(dataset) + for batch in dataset: + sample_count += next(iterator).shape[0] + self.assertEqual(sample_count, 15) + + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode="int" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8,)) + self.assertEqual(batch[1].dtype.name, "int32") + + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=8, + image_size=(18, 18), + label_mode="categorical", + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + self.assertEqual(batch[0].dtype.name, "float32") + self.assertEqual(batch[1].shape, (8, 4)) + self.assertEqual(batch[1].dtype.name, "float32") + + def test_image_dataset_from_directory_color_modes(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=4, color_mode="rgba") + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), color_mode="rgba" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 4)) + self.assertEqual(batch[0].dtype.name, "float32") + + directory = self._prepare_directory( + num_classes=4, color_mode="grayscale" + ) + dataset = image_dataset.image_dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), color_mode="grayscale" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 1)) + self.assertEqual(batch[0].dtype.name, "float32") + + def test_image_dataset_from_directory_validation_split(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=2, count=10) + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=10, + image_size=(18, 18), + validation_split=0.2, + subset="training", + seed=1337, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=10, + image_size=(18, 18), + validation_split=0.2, + subset="validation", + seed=1337, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (2, 18, 18, 3)) + + train_dataset, val_dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=10, + image_size=(18, 18), + validation_split=0.2, + subset="both", + seed=1337, + ) + batch = next(iter(train_dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + batch = next(iter(val_dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (2, 18, 18, 3)) + + def test_image_dataset_from_directory_manual_labels(self): + if PIL is None: + return # Skip test if PIL is not available. 
+ + directory = self._prepare_directory(num_classes=2, count=2) + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=8, + image_size=(18, 18), + labels=[0, 1], + shuffle=False, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertAllClose(batch[1], [0, 1]) + + def test_image_dataset_from_directory_follow_links(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory( + num_classes=2, count=25, nested_dirs=True + ) + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=8, + image_size=(18, 18), + label_mode=None, + follow_links=True, + ) + sample_count = 0 + for batch in dataset: + sample_count += batch.shape[0] + self.assertEqual(sample_count, 25) + + def test_image_dataset_from_directory_no_images(self): + directory = self._prepare_directory(num_classes=2, count=0) + with self.assertRaisesRegex(ValueError, "No images found."): + _ = image_dataset.image_dataset_from_directory(directory) + + def test_image_dataset_from_directory_crop_to_aspect_ratio(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=2, count=5) + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=5, + image_size=(18, 18), + crop_to_aspect_ratio=True, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (5, 18, 18, 3)) + + def test_image_dataset_from_directory_errors(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=3, count=5) + + with self.assertRaisesRegex(ValueError, "`labels` argument should be"): + _ = image_dataset.image_dataset_from_directory( + directory, labels="other" + ) + + with self.assertRaisesRegex( + ValueError, "`label_mode` argument must be" + ): + _ = image_dataset.image_dataset_from_directory( + directory, label_mode="other" + ) + + with self.assertRaisesRegex(ValueError, "`color_mode` must be one of"): + _ = image_dataset.image_dataset_from_directory( + directory, color_mode="other" + ) + + with self.assertRaisesRegex( + ValueError, 'only pass `class_names` if `labels="inferred"`' + ): + _ = image_dataset.image_dataset_from_directory( + directory, + labels=[0, 0, 1, 1, 1], + class_names=["class_0", "class_1", "class_2"], + ) + + with self.assertRaisesRegex( + ValueError, + "Expected the lengths of `labels` to match the number of files", + ): + _ = image_dataset.image_dataset_from_directory( + directory, labels=[0, 0, 1, 1] + ) + + with self.assertRaisesRegex( + ValueError, "`class_names` passed did not match" + ): + _ = image_dataset.image_dataset_from_directory( + directory, class_names=["class_0", "class_2"] + ) + + with self.assertRaisesRegex(ValueError, "there must be exactly 2"): + _ = image_dataset.image_dataset_from_directory( + directory, label_mode="binary" + ) + + with self.assertRaisesRegex( + ValueError, "`validation_split` must be between 0 and 1" + ): + _ = image_dataset.image_dataset_from_directory( + directory, validation_split=2 + ) + + with self.assertRaisesRegex( + ValueError, + '`subset` must be either "training", "validation" or "both"', + ): + _ = image_dataset.image_dataset_from_directory( + directory, validation_split=0.2, subset="other" + ) + + with self.assertRaisesRegex( + ValueError, "`validation_split` must be set" + ): + _ = image_dataset.image_dataset_from_directory( + directory, validation_split=0, subset="training" + ) + + with 
self.assertRaisesRegex(ValueError, "must provide a `seed`"): + _ = image_dataset.image_dataset_from_directory( + directory, validation_split=0.2, subset="training" + ) + + def test_image_dataset_from_directory_not_batched(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=2, count=2) + dataset = image_dataset.image_dataset_from_directory( + directory, + batch_size=None, + image_size=(18, 18), + label_mode=None, + shuffle=False, + ) + sample = next(iter(dataset)) + self.assertEqual(len(sample.shape), 3) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/image_utils.py b/keras/utils/image_utils.py index 2385af3f7944..94f4ebc2e631 100644 --- a/keras/utils/image_utils.py +++ b/keras/utils/image_utils.py @@ -14,431 +14,467 @@ # ============================================================================== """Utilities related to image handling.""" -# pylint: disable=g-import-not-at-top import io import pathlib import warnings -from keras import backend import numpy as np import tensorflow.compat.v2 as tf + +from keras import backend + +# isort: off from tensorflow.python.util.tf_export import keras_export try: - from PIL import Image as pil_image + from PIL import Image as pil_image + + try: + pil_image_resampling = pil_image.Resampling + except AttributeError: + pil_image_resampling = pil_image except ImportError: - pil_image = None + pil_image = None + pil_image_resampling = None -if pil_image is not None: - _PIL_INTERPOLATION_METHODS = { - 'nearest': pil_image.NEAREST, - 'bilinear': pil_image.BILINEAR, - 'bicubic': pil_image.BICUBIC, - 'hamming': pil_image.HAMMING, - 'box': pil_image.BOX, - 'lanczos': pil_image.LANCZOS, - } +if pil_image_resampling is not None: + _PIL_INTERPOLATION_METHODS = { + "nearest": pil_image_resampling.NEAREST, + "bilinear": pil_image_resampling.BILINEAR, + "bicubic": pil_image_resampling.BICUBIC, + "hamming": pil_image_resampling.HAMMING, + "box": pil_image_resampling.BOX, + "lanczos": pil_image_resampling.LANCZOS, + } ResizeMethod = tf.image.ResizeMethod _TF_INTERPOLATION_METHODS = { - 'bilinear': ResizeMethod.BILINEAR, - 'nearest': ResizeMethod.NEAREST_NEIGHBOR, - 'bicubic': ResizeMethod.BICUBIC, - 'area': ResizeMethod.AREA, - 'lanczos3': ResizeMethod.LANCZOS3, - 'lanczos5': ResizeMethod.LANCZOS5, - 'gaussian': ResizeMethod.GAUSSIAN, - 'mitchellcubic': ResizeMethod.MITCHELLCUBIC + "bilinear": ResizeMethod.BILINEAR, + "nearest": ResizeMethod.NEAREST_NEIGHBOR, + "bicubic": ResizeMethod.BICUBIC, + "area": ResizeMethod.AREA, + "lanczos3": ResizeMethod.LANCZOS3, + "lanczos5": ResizeMethod.LANCZOS5, + "gaussian": ResizeMethod.GAUSSIAN, + "mitchellcubic": ResizeMethod.MITCHELLCUBIC, } -@keras_export('keras.preprocessing.image.smart_resize', v1=[]) -def smart_resize(x, size, interpolation='bilinear'): - """Resize images to a target size without aspect ratio distortion. - - Warning: `tf.keras.preprocessing.image.smart_resize` is not recommended for - new code. Prefer `tf.keras.layers.Resizing`, which provides the same - functionality as a preprocessing layer and adds `tf.RaggedTensor` support. See - the [preprocessing layer guide]( - https://www.tensorflow.org/guide/keras/preprocessing_layers) - for an overview of preprocessing layers. - - TensorFlow image datasets typically yield images that have each a different - size. However, these images need to be batched before they can be - processed by Keras layers. To be batched, images need to share the same height - and width. 
- - You could simply do: - - ```python - size = (200, 200) - ds = ds.map(lambda img: tf.image.resize(img, size)) - ``` - - However, if you do this, you distort the aspect ratio of your images, since - in general they do not all have the same aspect ratio as `size`. This is - fine in many cases, but not always (e.g. for GANs this can be a problem). - - Note that passing the argument `preserve_aspect_ratio=True` to `resize` - will preserve the aspect ratio, but at the cost of no longer respecting the - provided target size. Because `tf.image.resize` doesn't crop images, - your output images will still have different sizes. - - This calls for: - - ```python - size = (200, 200) - ds = ds.map(lambda img: smart_resize(img, size)) - ``` - - Your output images will actually be `(200, 200)`, and will not be distorted. - Instead, the parts of the image that do not fit within the target size - get cropped out. - - The resizing process is: - - 1. Take the largest centered crop of the image that has the same aspect ratio - as the target size. For instance, if `size=(200, 200)` and the input image has - size `(340, 500)`, we take a crop of `(340, 340)` centered along the width. - 2. Resize the cropped image to the target size. In the example above, - we resize the `(340, 340)` crop to `(200, 200)`. - - Args: - x: Input image or batch of images (as a tensor or NumPy array). Must be in - format `(height, width, channels)` or `(batch_size, height, width, - channels)`. - size: Tuple of `(height, width)` integer. Target size. - interpolation: String, interpolation to use for resizing. Defaults to - `'bilinear'`. Supports `bilinear`, `nearest`, `bicubic`, `area`, - `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`. - - Returns: - Array with shape `(size[0], size[1], channels)`. If the input image was a - NumPy array, the output is a NumPy array, and if it was a TF tensor, - the output is a TF tensor. - """ - if len(size) != 2: - raise ValueError('Expected `size` to be a tuple of 2 integers, ' - f'but got: {size}.') - img = tf.convert_to_tensor(x) - if img.shape.rank is not None: - if img.shape.rank < 3 or img.shape.rank > 4: - raise ValueError( - 'Expected an image array with shape `(height, width, channels)`, ' - 'or `(batch_size, height, width, channels)`, but ' - f'got input with incorrect rank, of shape {img.shape}.') - shape = tf.shape(img) - height, width = shape[-3], shape[-2] - target_height, target_width = size - if img.shape.rank is not None: - static_num_channels = img.shape[-1] - else: - static_num_channels = None - - crop_height = tf.cast( - tf.cast(width * target_height, 'float32') / target_width, 'int32') - crop_width = tf.cast( - tf.cast(height * target_width, 'float32') / target_height, 'int32') - - # Set back to input height / width if crop_height / crop_width is not smaller. 
- crop_height = tf.minimum(height, crop_height) - crop_width = tf.minimum(width, crop_width) - - crop_box_hstart = tf.cast( - tf.cast(height - crop_height, 'float32') / 2, 'int32') - crop_box_wstart = tf.cast(tf.cast(width - crop_width, 'float32') / 2, 'int32') - - if img.shape.rank == 4: - crop_box_start = tf.stack([0, crop_box_hstart, crop_box_wstart, 0]) - crop_box_size = tf.stack([-1, crop_height, crop_width, -1]) - else: - crop_box_start = tf.stack([crop_box_hstart, crop_box_wstart, 0]) - crop_box_size = tf.stack([crop_height, crop_width, -1]) - - img = tf.slice(img, crop_box_start, crop_box_size) - img = tf.image.resize(images=img, size=size, method=interpolation) - # Apparent bug in resize_images_v2 may cause shape to be lost - if img.shape.rank is not None: +@keras_export("keras.preprocessing.image.smart_resize", v1=[]) +def smart_resize(x, size, interpolation="bilinear"): + """Resize images to a target size without aspect ratio distortion. + + Warning: `tf.keras.preprocessing.image.smart_resize` is not recommended for + new code. Prefer `tf.keras.layers.Resizing`, which provides the same + functionality as a preprocessing layer and adds `tf.RaggedTensor` support. + See the [preprocessing layer guide]( + https://www.tensorflow.org/guide/keras/preprocessing_layers) + for an overview of preprocessing layers. + + TensorFlow image datasets typically yield images that each have a different + size. However, these images need to be batched before they can be + processed by Keras layers. To be batched, images need to share the same + height and width. + + You could simply do: + + ```python + size = (200, 200) + ds = ds.map(lambda img: tf.image.resize(img, size)) + ``` + + However, if you do this, you distort the aspect ratio of your images, since + in general they do not all have the same aspect ratio as `size`. This is + fine in many cases, but not always (e.g. for GANs this can be a problem). + + Note that passing the argument `preserve_aspect_ratio=True` to `resize` + will preserve the aspect ratio, but at the cost of no longer respecting the + provided target size. Because `tf.image.resize` doesn't crop images, + your output images will still have different sizes. + + This calls for: + + ```python + size = (200, 200) + ds = ds.map(lambda img: smart_resize(img, size)) + ``` + + Your output images will actually be `(200, 200)`, and will not be distorted. + Instead, the parts of the image that do not fit within the target size + get cropped out. + + The resizing process is: + + 1. Take the largest centered crop of the image that has the same aspect + ratio as the target size. For instance, if `size=(200, 200)` and the input + image has size `(340, 500)`, we take a crop of `(340, 340)` centered along + the width. + 2. Resize the cropped image to the target size. In the example above, + we resize the `(340, 340)` crop to `(200, 200)`. + + Args: + x: Input image or batch of images (as a tensor or NumPy array). Must be in + format `(height, width, channels)` or `(batch_size, height, width, + channels)`. + size: Tuple of `(height, width)` integers. Target size. + interpolation: String, interpolation to use for resizing. Supports + `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`, + `gaussian`, `mitchellcubic`. Defaults to `'bilinear'`. + + Returns: + Array with shape `(size[0], size[1], channels)`. If the input image was a + NumPy array, the output is a NumPy array, and if it was a TF tensor, + the output is a TF tensor.
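+
+    For instance, a minimal illustration of the behavior described above
+    (assuming `np` and `tf` are imported; the shapes follow from the
+    cropping rules):
+
+    ```python
+    img = np.random.random((340, 500, 3))  # NumPy image, height 340, width 500
+    out = tf.keras.preprocessing.image.smart_resize(img, size=(200, 200))
+    assert isinstance(out, np.ndarray)  # NumPy in, NumPy out
+    assert out.shape == (200, 200, 3)  # cropped to (340, 340), then resized
+    ```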
+ """ + if len(size) != 2: + raise ValueError( + f"Expected `size` to be a tuple of 2 integers, but got: {size}." + ) + img = tf.convert_to_tensor(x) + if img.shape.rank is not None: + if img.shape.rank < 3 or img.shape.rank > 4: + raise ValueError( + "Expected an image array with shape `(height, width, " + "channels)`, or `(batch_size, height, width, channels)`, but " + f"got input with incorrect rank, of shape {img.shape}." + ) + shape = tf.shape(img) + height, width = shape[-3], shape[-2] + target_height, target_width = size + if img.shape.rank is not None: + static_num_channels = img.shape[-1] + else: + static_num_channels = None + + crop_height = tf.cast( + tf.cast(width * target_height, "float32") / target_width, "int32" + ) + crop_width = tf.cast( + tf.cast(height * target_width, "float32") / target_height, "int32" + ) + + # Set back to input height / width if crop_height / crop_width is not + # smaller. + crop_height = tf.minimum(height, crop_height) + crop_width = tf.minimum(width, crop_width) + + crop_box_hstart = tf.cast( + tf.cast(height - crop_height, "float32") / 2, "int32" + ) + crop_box_wstart = tf.cast( + tf.cast(width - crop_width, "float32") / 2, "int32" + ) + if img.shape.rank == 4: - img.set_shape((None, None, None, static_num_channels)) - if img.shape.rank == 3: - img.set_shape((None, None, static_num_channels)) - if isinstance(x, np.ndarray): - return img.numpy() - return img + crop_box_start = tf.stack([0, crop_box_hstart, crop_box_wstart, 0]) + crop_box_size = tf.stack([-1, crop_height, crop_width, -1]) + else: + crop_box_start = tf.stack([crop_box_hstart, crop_box_wstart, 0]) + crop_box_size = tf.stack([crop_height, crop_width, -1]) + + img = tf.slice(img, crop_box_start, crop_box_size) + img = tf.image.resize(images=img, size=size, method=interpolation) + # Apparent bug in resize_images_v2 may cause shape to be lost + if img.shape.rank is not None: + if img.shape.rank == 4: + img.set_shape((None, None, None, static_num_channels)) + if img.shape.rank == 3: + img.set_shape((None, None, static_num_channels)) + if isinstance(x, np.ndarray): + return img.numpy() + return img def get_interpolation(interpolation): - interpolation = interpolation.lower() - if interpolation not in _TF_INTERPOLATION_METHODS: - raise NotImplementedError( - 'Value not recognized for `interpolation`: {}. Supported values ' - 'are: {}'.format(interpolation, _TF_INTERPOLATION_METHODS.keys())) - return _TF_INTERPOLATION_METHODS[interpolation] + interpolation = interpolation.lower() + if interpolation not in _TF_INTERPOLATION_METHODS: + raise NotImplementedError( + "Value not recognized for `interpolation`: {}. Supported values " + "are: {}".format(interpolation, _TF_INTERPOLATION_METHODS.keys()) + ) + return _TF_INTERPOLATION_METHODS[interpolation] + + +@keras_export( + "keras.utils.array_to_img", "keras.preprocessing.image.array_to_img" +) +def array_to_img(x, data_format=None, scale=True, dtype=None): + """Converts a 3D Numpy array to a PIL Image instance. + + Usage: + + ```python + from PIL import Image + img = np.random.random(size=(100, 100, 3)) + pil_img = tf.keras.utils.array_to_img(img) + ``` + + + Args: + x: Input data, in any form that can be converted to a Numpy array. + data_format: Image data format, can be either `"channels_first"` or + `"channels_last"`. None means the global + setting `tf.keras.backend.image_data_format()` is used (unless you + changed it, it uses `"channels_last"`). Defaults to `None`. 
+ scale: Whether to rescale the image such that minimum and maximum values + are 0 and 255 respectively. Defaults to `True`. + dtype: Dtype to use. None means the global setting + `tf.keras.backend.floatx()` is used (unless you changed it, it + uses `"float32"`). Defaults to `None`. + + Returns: + A PIL Image instance. + + Raises: + ImportError: if PIL is not available. + ValueError: if invalid `x` or `data_format` is passed. + """ + + if data_format is None: + data_format = backend.image_data_format() + if dtype is None: + dtype = backend.floatx() + if pil_image is None: + raise ImportError( + "Could not import PIL.Image. " + "The use of `array_to_img` requires PIL." + ) + x = np.asarray(x, dtype=dtype) + if x.ndim != 3: + raise ValueError( + "Expected image array to have rank 3 (single image). " + f"Got array with shape: {x.shape}" + ) + + if data_format not in {"channels_first", "channels_last"}: + raise ValueError(f"Invalid data_format: {data_format}") + + # Original Numpy array x has format (height, width, channel) + # or (channel, height, width) + # but target PIL image has format (width, height, channel) + if data_format == "channels_first": + x = x.transpose(1, 2, 0) + if scale: + x = x - np.min(x) + x_max = np.max(x) + if x_max != 0: + x /= x_max + x *= 255 + if x.shape[2] == 4: + # RGBA + return pil_image.fromarray(x.astype("uint8"), "RGBA") + elif x.shape[2] == 3: + # RGB + return pil_image.fromarray(x.astype("uint8"), "RGB") + elif x.shape[2] == 1: + # grayscale + if np.max(x) > 255: + # 32-bit signed integer grayscale image. PIL mode "I" + return pil_image.fromarray(x[:, :, 0].astype("int32"), "I") + return pil_image.fromarray(x[:, :, 0].astype("uint8"), "L") + else: + raise ValueError(f"Unsupported channel number: {x.shape[2]}") -@keras_export('keras.utils.array_to_img', - 'keras.preprocessing.image.array_to_img') -def array_to_img(x, data_format=None, scale=True, dtype=None): - """Converts a 3D Numpy array to a PIL Image instance. - - Usage: - - ```python - from PIL import Image - img = np.random.random(size=(100, 100, 3)) - pil_img = tf.keras.preprocessing.image.array_to_img(img) - ``` - - - Args: - x: Input data, in any form that can be converted to a Numpy array. - data_format: Image data format, can be either `"channels_first"` or - `"channels_last"`. Defaults to `None`, in which case the global setting - `tf.keras.backend.image_data_format()` is used (unless you changed it, - it defaults to `"channels_last"`). - scale: Whether to rescale the image such that minimum and maximum values - are 0 and 255 respectively. Defaults to `True`. - dtype: Dtype to use. Default to `None`, in which case the global setting - `tf.keras.backend.floatx()` is used (unless you changed it, it defaults - to `"float32"`) - - Returns: - A PIL Image instance. - - Raises: - ImportError: if PIL is not available. - ValueError: if invalid `x` or `data_format` is passed. - """ - - if data_format is None: - data_format = backend.image_data_format() - if dtype is None: - dtype = backend.floatx() - if pil_image is None: - raise ImportError('Could not import PIL.Image. ' - 'The use of `array_to_img` requires PIL.') - x = np.asarray(x, dtype=dtype) - if x.ndim != 3: - raise ValueError('Expected image array to have rank 3 (single image).
' - f'Got array with shape: {x.shape}') - - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError(f'Invalid data_format: {data_format}') - - # Original Numpy array x has format (height, width, channel) - # or (channel, height, width) - # but target PIL image has format (width, height, channel) - if data_format == 'channels_first': - x = x.transpose(1, 2, 0) - if scale: - x = x - np.min(x) - x_max = np.max(x) - if x_max != 0: - x /= x_max - x *= 255 - if x.shape[2] == 4: - # RGBA - return pil_image.fromarray(x.astype('uint8'), 'RGBA') - elif x.shape[2] == 3: - # RGB - return pil_image.fromarray(x.astype('uint8'), 'RGB') - elif x.shape[2] == 1: - # grayscale - if np.max(x) > 255: - # 32-bit signed integer grayscale image. PIL mode "I" - return pil_image.fromarray(x[:, :, 0].astype('int32'), 'I') - return pil_image.fromarray(x[:, :, 0].astype('uint8'), 'L') - else: - raise ValueError(f'Unsupported channel number: {x.shape[2]}') - - -@keras_export('keras.utils.img_to_array', - 'keras.preprocessing.image.img_to_array') +@keras_export( + "keras.utils.img_to_array", "keras.preprocessing.image.img_to_array" +) def img_to_array(img, data_format=None, dtype=None): - """Converts a PIL Image instance to a Numpy array. - - Usage: - - ```python - from PIL import Image - img_data = np.random.random(size=(100, 100, 3)) - img = tf.keras.preprocessing.image.array_to_img(img_data) - array = tf.keras.preprocessing.image.img_to_array(img) - ``` - - - Args: - img: Input PIL Image instance. - data_format: Image data format, can be either `"channels_first"` or - `"channels_last"`. Defaults to `None`, in which case the global setting - `tf.keras.backend.image_data_format()` is used (unless you changed it, - it defaults to `"channels_last"`). - dtype: Dtype to use. Default to `None`, in which case the global setting - `tf.keras.backend.floatx()` is used (unless you changed it, it defaults - to `"float32"`). - - Returns: - A 3D Numpy array. - - Raises: - ValueError: if invalid `img` or `data_format` is passed. - """ - - if data_format is None: - data_format = backend.image_data_format() - if dtype is None: - dtype = backend.floatx() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError(f'Unknown data_format: {data_format}') - # Numpy array x has format (height, width, channel) - # or (channel, height, width) - # but original PIL image has format (width, height, channel) - x = np.asarray(img, dtype=dtype) - if len(x.shape) == 3: - if data_format == 'channels_first': - x = x.transpose(2, 0, 1) - elif len(x.shape) == 2: - if data_format == 'channels_first': - x = x.reshape((1, x.shape[0], x.shape[1])) + """Converts a PIL Image instance to a Numpy array. + + Usage: + + ```python + from PIL import Image + img_data = np.random.random(size=(100, 100, 3)) + img = tf.keras.utils.array_to_img(img_data) + array = tf.keras.utils.img_to_array(img) + ``` + + + Args: + img: Input PIL Image instance. + data_format: Image data format, can be either `"channels_first"` or + `"channels_last"`. None means the global + setting `tf.keras.backend.image_data_format()` is used (unless you + changed it, it uses `"channels_last"`). Defaults to `None`. + dtype: Dtype to use. None means the global setting + `tf.keras.backend.floatx()` is used (unless you changed it, it + uses `"float32"`). Defaults to `None`. + + Returns: + A 3D Numpy array. + + Raises: + ValueError: if invalid `img` or `data_format` is passed.
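+
+    A minimal sketch of the `data_format` argument (illustrative only;
+    assumes `np` and `tf` are imported):
+
+    ```python
+    img = tf.keras.utils.array_to_img(np.zeros((100, 100, 3)))
+    arr = tf.keras.utils.img_to_array(img, data_format="channels_first")
+    assert arr.shape == (3, 100, 100)  # channels moved to the leading axis
+    ```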
+ """ + + if data_format is None: + data_format = backend.image_data_format() + if dtype is None: + dtype = backend.floatx() + if data_format not in {"channels_first", "channels_last"}: + raise ValueError(f"Unknown data_format: {data_format}") + # Numpy array x has format (height, width, channel) + # or (channel, height, width) + # but original PIL image has format (width, height, channel) + x = np.asarray(img, dtype=dtype) + if len(x.shape) == 3: + if data_format == "channels_first": + x = x.transpose(2, 0, 1) + elif len(x.shape) == 2: + if data_format == "channels_first": + x = x.reshape((1, x.shape[0], x.shape[1])) + else: + x = x.reshape((x.shape[0], x.shape[1], 1)) else: - x = x.reshape((x.shape[0], x.shape[1], 1)) - else: - raise ValueError(f'Unsupported image shape: {x.shape}') - return x + raise ValueError(f"Unsupported image shape: {x.shape}") + return x -@keras_export('keras.utils.save_img', 'keras.preprocessing.image.save_img') +@keras_export("keras.utils.save_img", "keras.preprocessing.image.save_img") def save_img(path, x, data_format=None, file_format=None, scale=True, **kwargs): - """Saves an image stored as a Numpy array to a path or file object. - - Args: - path: Path or file object. - x: Numpy array. - data_format: Image data format, either `"channels_first"` or - `"channels_last"`. - file_format: Optional file format override. If omitted, the format to use - is determined from the filename extension. If a file object was used - instead of a filename, this parameter should always be used. - scale: Whether to rescale image values to be within `[0, 255]`. - **kwargs: Additional keyword arguments passed to `PIL.Image.save()`. - """ - if data_format is None: - data_format = backend.image_data_format() - img = array_to_img(x, data_format=data_format, scale=scale) - if img.mode == 'RGBA' and (file_format == 'jpg' or file_format == 'jpeg'): - warnings.warn('The JPG format does not support ' - 'RGBA images, converting to RGB.') - img = img.convert('RGB') - img.save(path, format=file_format, **kwargs) - - -@keras_export('keras.utils.load_img', 'keras.preprocessing.image.load_img') -def load_img(path, - grayscale=False, - color_mode='rgb', - target_size=None, - interpolation='nearest', - keep_aspect_ratio=False): - """Loads an image into PIL format. - - Usage: - - ``` - image = tf.keras.preprocessing.image.load_img(image_path) - input_arr = tf.keras.preprocessing.image.img_to_array(image) - input_arr = np.array([input_arr]) # Convert single image to a batch. - predictions = model.predict(input_arr) - ``` - - Args: - path: Path to image file. - grayscale: DEPRECATED use `color_mode="grayscale"`. - color_mode: One of `"grayscale"`, `"rgb"`, `"rgba"`. Default: `"rgb"`. - The desired image format. - target_size: Either `None` (default to original size) or tuple of ints - `(img_height, img_width)`. - interpolation: Interpolation method used to resample the image if the - target size is different from that of the loaded image. Supported - methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL version - 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL - version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also - supported. By default, `"nearest"` is used. - keep_aspect_ratio: Boolean, whether to resize images to a target - size without aspect ratio distortion. The image is cropped in - the center with target aspect ratio before resizing. - - Returns: - A PIL Image instance. - - Raises: - ImportError: if PIL is not available. 
- ValueError: if interpolation method is not supported. - """ - if grayscale: - warnings.warn('grayscale is deprecated. Please use ' - 'color_mode = "grayscale"') - color_mode = 'grayscale' - if pil_image is None: - raise ImportError('Could not import PIL.Image. ' - 'The use of `load_img` requires PIL.') - if isinstance(path, io.BytesIO): - img = pil_image.open(path) - elif isinstance(path, (pathlib.Path, bytes, str)): - if isinstance(path, pathlib.Path): - path = str(path.resolve()) - with open(path, 'rb') as f: - img = pil_image.open(io.BytesIO(f.read())) - else: - raise TypeError('path should be path-like or io.BytesIO' - ', not {}'.format(type(path))) - - if color_mode == 'grayscale': - # if image is not already an 8-bit, 16-bit or 32-bit grayscale image - # convert it to an 8-bit grayscale image. - if img.mode not in ('L', 'I;16', 'I'): - img = img.convert('L') - elif color_mode == 'rgba': - if img.mode != 'RGBA': - img = img.convert('RGBA') - elif color_mode == 'rgb': - if img.mode != 'RGB': - img = img.convert('RGB') - else: - raise ValueError('color_mode must be "grayscale", "rgb", or "rgba"') - if target_size is not None: - width_height_tuple = (target_size[1], target_size[0]) - if img.size != width_height_tuple: - if interpolation not in _PIL_INTERPOLATION_METHODS: - raise ValueError('Invalid interpolation method {} specified. Supported ' - 'methods are {}'.format( - interpolation, - ', '.join(_PIL_INTERPOLATION_METHODS.keys()))) - resample = _PIL_INTERPOLATION_METHODS[interpolation] - - if keep_aspect_ratio: - width, height = img.size - target_width, target_height = width_height_tuple - - crop_height = (width * target_height) // target_width - crop_width = (height * target_width) // target_height - - # Set back to input height / width - # if crop_height / crop_width is not smaller. - crop_height = min(height, crop_height) - crop_width = min(width, crop_width) - - crop_box_hstart = (height - crop_height) // 2 - crop_box_wstart = (width - crop_width) // 2 - crop_box_wend = crop_box_wstart + crop_width - crop_box_hend = crop_box_hstart + crop_height - crop_box = [ - crop_box_wstart, crop_box_hstart, crop_box_wend, crop_box_hend - ] - img = img.resize(width_height_tuple, resample, box=crop_box) - else: - img = img.resize(width_height_tuple, resample) - return img + """Saves an image stored as a Numpy array to a path or file object. + + Args: + path: Path or file object. + x: Numpy array. + data_format: Image data format, either `"channels_first"` or + `"channels_last"`. + file_format: Optional file format override. If omitted, the format to + use is determined from the filename extension. If a file object was + used instead of a filename, this parameter should always be used. + scale: Whether to rescale image values to be within `[0, 255]`. + **kwargs: Additional keyword arguments passed to `PIL.Image.save()`. + """ + if data_format is None: + data_format = backend.image_data_format() + img = array_to_img(x, data_format=data_format, scale=scale) + if img.mode == "RGBA" and (file_format == "jpg" or file_format == "jpeg"): + warnings.warn( + "The JPG format does not support RGBA images, converting to RGB." + ) + img = img.convert("RGB") + img.save(path, format=file_format, **kwargs) + + +@keras_export("keras.utils.load_img", "keras.preprocessing.image.load_img") +def load_img( + path, + grayscale=False, + color_mode="rgb", + target_size=None, + interpolation="nearest", + keep_aspect_ratio=False, +): + """Loads an image into PIL format. 
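The reformatted `save_img` keeps its one behavioral branch: JPEG has no alpha channel, so RGBA input is converted to RGB with a warning. A short sketch (output file names are arbitrary):

```python
import numpy as np
import tensorflow as tf
from PIL import Image

rgba = np.random.randint(0, 256, size=(64, 64, 4)).astype("float32")

tf.keras.utils.save_img("out.png", rgba)  # PNG keeps the alpha channel
tf.keras.utils.save_img("out.jpg", rgba, file_format="jpeg")  # warns, drops it

print(Image.open("out.png").mode)  # RGBA
print(Image.open("out.jpg").mode)  # RGB
```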
+ + Usage: + + ```python + image = tf.keras.utils.load_img(image_path) + input_arr = tf.keras.utils.img_to_array(image) + input_arr = np.array([input_arr]) # Convert single image to a batch. + predictions = model.predict(input_arr) + ``` + + Args: + path: Path to image file. + grayscale: DEPRECATED use `color_mode="grayscale"`. + color_mode: One of `"grayscale"`, `"rgb"`, `"rgba"`. Default: `"rgb"`. + The desired image format. + target_size: Either `None` (default to original size) or tuple of ints + `(img_height, img_width)`. + interpolation: Interpolation method used to resample the image if the + target size is different from that of the loaded image. Supported + methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL version + 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL + version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also + supported. By default, `"nearest"` is used. + keep_aspect_ratio: Boolean, whether to resize images to a target + size without aspect ratio distortion. The image is cropped in + the center with target aspect ratio before resizing. + + Returns: + A PIL Image instance. + + Raises: + ImportError: if PIL is not available. + ValueError: if interpolation method is not supported. + """ + if grayscale: + warnings.warn( + 'grayscale is deprecated. Please use color_mode = "grayscale"' + ) + color_mode = "grayscale" + if pil_image is None: + raise ImportError( + "Could not import PIL.Image. The use of `load_img` requires PIL." + ) + if isinstance(path, io.BytesIO): + img = pil_image.open(path) + elif isinstance(path, (pathlib.Path, bytes, str)): + if isinstance(path, pathlib.Path): + path = str(path.resolve()) + with open(path, "rb") as f: + img = pil_image.open(io.BytesIO(f.read())) + else: + raise TypeError( + f"path should be path-like or io.BytesIO, not {type(path)}" + ) + + if color_mode == "grayscale": + # if image is not already an 8-bit, 16-bit or 32-bit grayscale image + # convert it to an 8-bit grayscale image. + if img.mode not in ("L", "I;16", "I"): + img = img.convert("L") + elif color_mode == "rgba": + if img.mode != "RGBA": + img = img.convert("RGBA") + elif color_mode == "rgb": + if img.mode != "RGB": + img = img.convert("RGB") + else: + raise ValueError('color_mode must be "grayscale", "rgb", or "rgba"') + if target_size is not None: + width_height_tuple = (target_size[1], target_size[0]) + if img.size != width_height_tuple: + if interpolation not in _PIL_INTERPOLATION_METHODS: + raise ValueError( + "Invalid interpolation method {} specified. Supported " + "methods are {}".format( + interpolation, + ", ".join(_PIL_INTERPOLATION_METHODS.keys()), + ) + ) + resample = _PIL_INTERPOLATION_METHODS[interpolation] + + if keep_aspect_ratio: + width, height = img.size + target_width, target_height = width_height_tuple + + crop_height = (width * target_height) // target_width + crop_width = (height * target_width) // target_height + + # Set back to input height / width + # if crop_height / crop_width is not smaller. 
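The `keep_aspect_ratio` arithmetic this hunk completes below selects the largest centered crop with the target aspect ratio, then resizes it. The integer math can be checked in isolation; the following is a standalone re-derivation of the same formulas, not the library function itself:

```python
def center_crop_box(width, height, target_width, target_height):
    # Candidate crop extents; exactly one of the two can exceed the image,
    # and the min() clamp restores it to the image bound.
    crop_height = min(height, (width * target_height) // target_width)
    crop_width = min(width, (height * target_width) // target_height)
    left = (width - crop_width) // 2
    top = (height - crop_height) // 2
    return (left, top, left + crop_width, top + crop_height)

# A 100x50 (width x height) image cropped for a square 25x25 target:
print(center_crop_box(100, 50, 25, 25))  # (25, 0, 75, 50): a centered 50x50 box
```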
+ crop_height = min(height, crop_height) + crop_width = min(width, crop_width) + + crop_box_hstart = (height - crop_height) // 2 + crop_box_wstart = (width - crop_width) // 2 + crop_box_wend = crop_box_wstart + crop_width + crop_box_hend = crop_box_hstart + crop_height + crop_box = [ + crop_box_wstart, + crop_box_hstart, + crop_box_wend, + crop_box_hend, + ] + img = img.resize(width_height_tuple, resample, box=crop_box) + else: + img = img.resize(width_height_tuple, resample) + return img diff --git a/keras/utils/image_utils_test.py b/keras/utils/image_utils_test.py index ff88e939a3e3..07e103c00390 100644 --- a/keras/utils/image_utils_test.py +++ b/keras/utils/image_utils_test.py @@ -18,420 +18,486 @@ import os import pathlib +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import image_utils -import numpy as np -import tensorflow.compat.v2 as tf @test_utils.run_v2_only class TestImageUtils(test_combinations.TestCase): - - def test_smart_resize(self): - test_input = np.random.random((20, 40, 3)) - output = image_utils.smart_resize(test_input, size=(50, 50)) - self.assertIsInstance(output, np.ndarray) - self.assertListEqual(list(output.shape), [50, 50, 3]) - output = image_utils.smart_resize(test_input, size=(10, 10)) - self.assertListEqual(list(output.shape), [10, 10, 3]) - output = image_utils.smart_resize(test_input, size=(100, 50)) - self.assertListEqual(list(output.shape), [100, 50, 3]) - output = image_utils.smart_resize(test_input, size=(5, 15)) - self.assertListEqual(list(output.shape), [5, 15, 3]) - - @parameterized.named_parameters(('size1', (50, 50)), ('size2', (10, 10)), - ('size3', (100, 50)), ('size4', (5, 15))) - def test_smart_resize_tf_dataset(self, size): - test_input_np = np.random.random((2, 20, 40, 3)) - test_ds = tf.data.Dataset.from_tensor_slices(test_input_np) - - resize = lambda img: image_utils.smart_resize(img, size=size) - test_ds = test_ds.map(resize) - for sample in test_ds.as_numpy_iterator(): - self.assertIsInstance(sample, np.ndarray) - self.assertListEqual(list(sample.shape), [size[0], size[1], 3]) - - def test_smart_resize_batch(self): - img = np.random.random((2, 20, 40, 3)) - out = image_utils.smart_resize(img, size=(20, 20)) - self.assertListEqual(list(out.shape), [2, 20, 20, 3]) - self.assertAllClose(out, img[:, :, 10:-10, :]) - - def test_smart_resize_errors(self): - with self.assertRaisesRegex(ValueError, 'a tuple of 2 integers'): - image_utils.smart_resize(np.random.random((20, 20, 2)), size=(10, 5, 3)) - with self.assertRaisesRegex(ValueError, 'incorrect rank'): - image_utils.smart_resize(np.random.random((2, 4)), size=(10, 5)) - with self.assertRaisesRegex(ValueError, 'incorrect rank'): - image_utils.smart_resize(np.random.random((2, 4, 4, 5, 3)), size=(10, 5)) + def test_smart_resize(self): + test_input = np.random.random((20, 40, 3)) + output = image_utils.smart_resize(test_input, size=(50, 50)) + self.assertIsInstance(output, np.ndarray) + self.assertListEqual(list(output.shape), [50, 50, 3]) + output = image_utils.smart_resize(test_input, size=(10, 10)) + self.assertListEqual(list(output.shape), [10, 10, 3]) + output = image_utils.smart_resize(test_input, size=(100, 50)) + self.assertListEqual(list(output.shape), [100, 50, 3]) + output = image_utils.smart_resize(test_input, size=(5, 15)) + self.assertListEqual(list(output.shape), [5, 15, 3]) + + @parameterized.named_parameters( + ("size1", 
(50, 50)), + ("size2", (10, 10)), + ("size3", (100, 50)), + ("size4", (5, 15)), + ) + def test_smart_resize_tf_dataset(self, size): + test_input_np = np.random.random((2, 20, 40, 3)) + test_ds = tf.data.Dataset.from_tensor_slices(test_input_np) + + resize = lambda img: image_utils.smart_resize(img, size=size) + test_ds = test_ds.map(resize) + for sample in test_ds.as_numpy_iterator(): + self.assertIsInstance(sample, np.ndarray) + self.assertListEqual(list(sample.shape), [size[0], size[1], 3]) + + def test_smart_resize_batch(self): + img = np.random.random((2, 20, 40, 3)) + out = image_utils.smart_resize(img, size=(20, 20)) + self.assertListEqual(list(out.shape), [2, 20, 20, 3]) + self.assertAllClose(out, img[:, :, 10:-10, :]) + + def test_smart_resize_errors(self): + with self.assertRaisesRegex(ValueError, "a tuple of 2 integers"): + image_utils.smart_resize( + np.random.random((20, 20, 2)), size=(10, 5, 3) + ) + with self.assertRaisesRegex(ValueError, "incorrect rank"): + image_utils.smart_resize(np.random.random((2, 4)), size=(10, 5)) + with self.assertRaisesRegex(ValueError, "incorrect rank"): + image_utils.smart_resize( + np.random.random((2, 4, 4, 5, 3)), size=(10, 5) + ) @test_utils.run_v2_only class TestImageLoading(test_combinations.TestCase): - - def test_load_img(self): - tmpdir = self.create_tempdir() - filename_rgb = os.path.join(tmpdir.full_path, 'rgb_utils.png') - filename_rgba = os.path.join(tmpdir.full_path, 'rgba_utils.png') - filename_grayscale_8bit = os.path.join(tmpdir.full_path, - 'grayscale_8bit_utils.png') - filename_grayscale_16bit = os.path.join(tmpdir.full_path, - 'grayscale_16bit_utils.tiff') - filename_grayscale_32bit = os.path.join(tmpdir.full_path, - 'grayscale_32bit_utils.tiff') - - original_rgb_array = np.array( - 255 * np.random.rand(100, 100, 3), dtype=np.uint8) - original_rgb = image_utils.array_to_img(original_rgb_array, scale=False) - original_rgb.save(filename_rgb) - - original_rgba_array = np.array( - 255 * np.random.rand(100, 100, 4), dtype=np.uint8) - original_rgba = image_utils.array_to_img(original_rgba_array, scale=False) - original_rgba.save(filename_rgba) - - original_grayscale_8bit_array = np.array( - 255 * np.random.rand(100, 100, 1), dtype=np.uint8) - original_grayscale_8bit = image_utils.array_to_img( - original_grayscale_8bit_array, scale=False) - original_grayscale_8bit.save(filename_grayscale_8bit) - - original_grayscale_16bit_array = np.array( - np.random.randint(-2147483648, 2147483647, (100, 100, 1)), - dtype=np.int16) - original_grayscale_16bit = image_utils.array_to_img( - original_grayscale_16bit_array, scale=False, dtype='int16') - original_grayscale_16bit.save(filename_grayscale_16bit) - - original_grayscale_32bit_array = np.array( - np.random.randint(-2147483648, 2147483647, (100, 100, 1)), - dtype=np.int32) - original_grayscale_32bit = image_utils.array_to_img( - original_grayscale_32bit_array, scale=False, dtype='int32') - original_grayscale_32bit.save(filename_grayscale_32bit) - - # Test that loaded image is exactly equal to original. 
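The `smart_resize` cases above all reduce to one contract: crop to the target aspect ratio, then resize, so the output shape is exact with no letterboxing or distortion. A minimal usage sketch mirroring the `tf.data` test (at this point in history the function is exported as `tf.keras.preprocessing.image.smart_resize`):

```python
import numpy as np
import tensorflow as tf

images = np.random.random((8, 20, 40, 3)).astype("float32")
ds = tf.data.Dataset.from_tensor_slices(images)
ds = ds.map(
    lambda img: tf.keras.preprocessing.image.smart_resize(img, (50, 50))
)
for sample in ds.take(1):
    print(sample.shape)  # (50, 50, 3), regardless of the 20x40 input
```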
- - loaded_im = image_utils.load_img(filename_rgb) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, original_rgb_array.shape) - self.assertAllClose(loaded_im_array, original_rgb_array) - - loaded_im = image_utils.load_img(filename_rgba, color_mode='rgba') - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, original_rgba_array.shape) - self.assertAllClose(loaded_im_array, original_rgba_array) - - loaded_im = image_utils.load_img(filename_rgb, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual( - loaded_im_array.shape, - (original_rgb_array.shape[0], original_rgb_array.shape[1], 1)) - - loaded_im = image_utils.load_img( - filename_grayscale_8bit, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, original_grayscale_8bit_array.shape) - self.assertAllClose(loaded_im_array, original_grayscale_8bit_array) - - loaded_im = image_utils.load_img( - filename_grayscale_16bit, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int16') - self.assertEqual(loaded_im_array.shape, - original_grayscale_16bit_array.shape) - self.assertAllClose(loaded_im_array, original_grayscale_16bit_array) - # test casting int16 image to float32 - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertAllClose(loaded_im_array, original_grayscale_16bit_array) - - loaded_im = image_utils.load_img( - filename_grayscale_32bit, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int32') - self.assertEqual(loaded_im_array.shape, - original_grayscale_32bit_array.shape) - self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) - # test casting int32 image to float32 - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) - - # Test that nothing is changed when target size is equal to original. 
- - loaded_im = image_utils.load_img(filename_rgb, target_size=(100, 100)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, original_rgb_array.shape) - self.assertAllClose(loaded_im_array, original_rgb_array) - - loaded_im = image_utils.load_img( - filename_rgba, color_mode='rgba', target_size=(100, 100)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, original_rgba_array.shape) - self.assertAllClose(loaded_im_array, original_rgba_array) - - loaded_im = image_utils.load_img( - filename_rgb, color_mode='grayscale', target_size=(100, 100)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual( - loaded_im_array.shape, - (original_rgba_array.shape[0], original_rgba_array.shape[1], 1)) - - loaded_im = image_utils.load_img( - filename_grayscale_8bit, color_mode='grayscale', target_size=(100, 100)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, original_grayscale_8bit_array.shape) - self.assertAllClose(loaded_im_array, original_grayscale_8bit_array) - - loaded_im = image_utils.load_img( - filename_grayscale_16bit, - color_mode='grayscale', - target_size=(100, 100)) - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int16') - self.assertEqual(loaded_im_array.shape, - original_grayscale_16bit_array.shape) - self.assertAllClose(loaded_im_array, original_grayscale_16bit_array) - - loaded_im = image_utils.load_img( - filename_grayscale_32bit, - color_mode='grayscale', - target_size=(100, 100)) - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int32') - self.assertEqual(loaded_im_array.shape, - original_grayscale_32bit_array.shape) - self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) - - # Test down-sampling with bilinear interpolation. - - loaded_im = image_utils.load_img(filename_rgb, target_size=(25, 25)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, (25, 25, 3)) - - loaded_im = image_utils.load_img( - filename_rgba, color_mode='rgba', target_size=(25, 25)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, (25, 25, 4)) - - loaded_im = image_utils.load_img( - filename_rgb, color_mode='grayscale', target_size=(25, 25)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, (25, 25, 1)) - - loaded_im = image_utils.load_img( - filename_grayscale_8bit, color_mode='grayscale', target_size=(25, 25)) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, (25, 25, 1)) - - loaded_im = image_utils.load_img( - filename_grayscale_16bit, color_mode='grayscale', target_size=(25, 25)) - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int16') - self.assertEqual(loaded_im_array.shape, (25, 25, 1)) - - loaded_im = image_utils.load_img( - filename_grayscale_32bit, color_mode='grayscale', target_size=(25, 25)) - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int32') - self.assertEqual(loaded_im_array.shape, (25, 25, 1)) - - # Test down-sampling with nearest neighbor interpolation. 
- - loaded_im_nearest = image_utils.load_img( - filename_rgb, target_size=(25, 25), interpolation='nearest') - loaded_im_array_nearest = image_utils.img_to_array(loaded_im_nearest) - self.assertEqual(loaded_im_array_nearest.shape, (25, 25, 3)) - self.assertTrue(np.any(loaded_im_array_nearest != loaded_im_array)) - - loaded_im_nearest = image_utils.load_img( - filename_rgba, - color_mode='rgba', - target_size=(25, 25), - interpolation='nearest') - loaded_im_array_nearest = image_utils.img_to_array(loaded_im_nearest) - self.assertEqual(loaded_im_array_nearest.shape, (25, 25, 4)) - self.assertTrue(np.any(loaded_im_array_nearest != loaded_im_array)) - - loaded_im = image_utils.load_img( - filename_grayscale_8bit, - color_mode='grayscale', - target_size=(25, 25), - interpolation='nearest') - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, (25, 25, 1)) - - loaded_im = image_utils.load_img( - filename_grayscale_16bit, - color_mode='grayscale', - target_size=(25, 25), - interpolation='nearest') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int16') - self.assertEqual(loaded_im_array.shape, (25, 25, 1)) - - loaded_im = image_utils.load_img( - filename_grayscale_32bit, - color_mode='grayscale', - target_size=(25, 25), - interpolation='nearest') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype='int32') - self.assertEqual(loaded_im_array.shape, (25, 25, 1)) - - # Test different path type - with open(filename_grayscale_32bit, 'rb') as f: - path_ = io.BytesIO(f.read()) # io.Bytesio - loaded_im = image_utils.load_img(path_, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32) - self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) - - path_ = filename_grayscale_32bit # str - loaded_im = image_utils.load_img(path_, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32) - self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) - - path_ = filename_grayscale_32bit.encode() # bytes - loaded_im = image_utils.load_img(path_, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32) - self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) - - path_ = pathlib.Path( - os.path.join(tmpdir.full_path, 'grayscale_32bit_utils.tiff')) - loaded_im = image_utils.load_img(path_, color_mode='grayscale') - loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32) - self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) - - # Check that exception is raised if interpolation not supported. 
- - loaded_im = image_utils.load_img(filename_rgb, interpolation='unsupported') - with self.assertRaises(ValueError): - loaded_im = image_utils.load_img( - filename_rgb, target_size=(25, 25), interpolation='unsupported') - - # Check that the aspect ratio of a square is the same - - filename_red_square = os.path.join(tmpdir.full_path, 'red_square_utils.png') - arr = np.zeros((50, 100, 3), dtype=np.uint8) # rectangle image 100x50 - arr[20:30, 45:55, 0] = 255 # red square 10x10 - red_square_array = np.array(arr) - red_square = image_utils.array_to_img(red_square_array, scale=False) - red_square.save(filename_red_square) - - loaded_im = image_utils.load_img( - filename_red_square, target_size=(25, 25), keep_aspect_ratio=True) - loaded_im_array = image_utils.img_to_array(loaded_im) - self.assertEqual(loaded_im_array.shape, (25, 25, 3)) - - red_channel_arr = loaded_im_array[:, :, 0].astype(np.bool) - square_width = np.sum(np.sum(red_channel_arr, axis=0)) - square_height = np.sum(np.sum(red_channel_arr, axis=1)) - aspect_ratio_result = square_width / square_height - - # original square had 1:1 ratio - self.assertNear(aspect_ratio_result, 1.0, 0.01) - - def test_array_to_img_and_img_to_array(self): - height, width = 10, 8 - - # Test the data format - # Test RGB 3D - x = np.random.random((3, height, width)) - img = image_utils.array_to_img(x, data_format='channels_first') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_first') - self.assertEqual(x.shape, (3, height, width)) - - # Test RGBA 3D - x = np.random.random((4, height, width)) - img = image_utils.array_to_img(x, data_format='channels_first') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_first') - self.assertEqual(x.shape, (4, height, width)) - - # Test 2D - x = np.random.random((1, height, width)) - img = image_utils.array_to_img(x, data_format='channels_first') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_first') - self.assertEqual(x.shape, (1, height, width)) - - # grayscale 32-bit signed integer - x = np.array( - np.random.randint(-2147483648, 2147483647, (1, height, width)), - dtype=np.int32) - img = image_utils.array_to_img(x, data_format='channels_first') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_first') - self.assertEqual(x.shape, (1, height, width)) - - # Test tf data format - # Test RGB 3D - x = np.random.random((height, width, 3)) - img = image_utils.array_to_img(x, data_format='channels_last') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_last') - self.assertEqual(x.shape, (height, width, 3)) - - # Test RGBA 3D - x = np.random.random((height, width, 4)) - img = image_utils.array_to_img(x, data_format='channels_last') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_last') - self.assertEqual(x.shape, (height, width, 4)) - - # Test 2D - x = np.random.random((height, width, 1)) - img = image_utils.array_to_img(x, data_format='channels_last') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_last') - self.assertEqual(x.shape, (height, width, 1)) - - # grayscale 16-bit signed integer - x = np.array( - np.random.randint(-2147483648, 2147483647, (height, width, 1)), - dtype=np.int16) - img = image_utils.array_to_img(x, 
data_format='channels_last') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_last') - self.assertEqual(x.shape, (height, width, 1)) - - # grayscale 32-bit signed integer - x = np.array( - np.random.randint(-2147483648, 2147483647, (height, width, 1)), - dtype=np.int32) - img = image_utils.array_to_img(x, data_format='channels_last') - self.assertEqual(img.size, (width, height)) - - x = image_utils.img_to_array(img, data_format='channels_last') - self.assertEqual(x.shape, (height, width, 1)) - - # Test invalid use case - with self.assertRaises(ValueError): - x = np.random.random((height, width)) # not 3D - img = image_utils.array_to_img(x, data_format='channels_first') - - with self.assertRaises(ValueError): - x = np.random.random((height, width, 3)) - # unknown data_format - img = image_utils.array_to_img(x, data_format='channels') - - with self.assertRaises(ValueError): - # neither RGB, RGBA, or gray-scale - x = np.random.random((height, width, 5)) - img = image_utils.array_to_img(x, data_format='channels_last') - - with self.assertRaises(ValueError): - x = np.random.random((height, width, 3)) - # unknown data_format - img = image_utils.img_to_array(x, data_format='channels') - - with self.assertRaises(ValueError): - # neither RGB, RGBA, or gray-scale - x = np.random.random((height, width, 5, 3)) - img = image_utils.img_to_array(x, data_format='channels_last') - - -if __name__ == '__main__': - tf.test.main() + def test_load_img(self): + tmpdir = self.create_tempdir() + filename_rgb = os.path.join(tmpdir.full_path, "rgb_utils.png") + filename_rgba = os.path.join(tmpdir.full_path, "rgba_utils.png") + filename_grayscale_8bit = os.path.join( + tmpdir.full_path, "grayscale_8bit_utils.png" + ) + filename_grayscale_16bit = os.path.join( + tmpdir.full_path, "grayscale_16bit_utils.tiff" + ) + filename_grayscale_32bit = os.path.join( + tmpdir.full_path, "grayscale_32bit_utils.tiff" + ) + + original_rgb_array = np.array( + 255 * np.random.rand(100, 100, 3), dtype=np.uint8 + ) + original_rgb = image_utils.array_to_img(original_rgb_array, scale=False) + original_rgb.save(filename_rgb) + + original_rgba_array = np.array( + 255 * np.random.rand(100, 100, 4), dtype=np.uint8 + ) + original_rgba = image_utils.array_to_img( + original_rgba_array, scale=False + ) + original_rgba.save(filename_rgba) + + original_grayscale_8bit_array = np.array( + 255 * np.random.rand(100, 100, 1), dtype=np.uint8 + ) + original_grayscale_8bit = image_utils.array_to_img( + original_grayscale_8bit_array, scale=False + ) + original_grayscale_8bit.save(filename_grayscale_8bit) + + original_grayscale_16bit_array = np.array( + np.random.randint(-2147483648, 2147483647, (100, 100, 1)), + dtype=np.int16, + ) + original_grayscale_16bit = image_utils.array_to_img( + original_grayscale_16bit_array, scale=False, dtype="int16" + ) + original_grayscale_16bit.save(filename_grayscale_16bit) + + original_grayscale_32bit_array = np.array( + np.random.randint(-2147483648, 2147483647, (100, 100, 1)), + dtype=np.int32, + ) + original_grayscale_32bit = image_utils.array_to_img( + original_grayscale_32bit_array, scale=False, dtype="int32" + ) + original_grayscale_32bit.save(filename_grayscale_32bit) + + # Test that loaded image is exactly equal to original. 
+ + loaded_im = image_utils.load_img(filename_rgb) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, original_rgb_array.shape) + self.assertAllClose(loaded_im_array, original_rgb_array) + + loaded_im = image_utils.load_img(filename_rgba, color_mode="rgba") + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, original_rgba_array.shape) + self.assertAllClose(loaded_im_array, original_rgba_array) + + loaded_im = image_utils.load_img(filename_rgb, color_mode="grayscale") + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual( + loaded_im_array.shape, + (original_rgb_array.shape[0], original_rgb_array.shape[1], 1), + ) + + loaded_im = image_utils.load_img( + filename_grayscale_8bit, color_mode="grayscale" + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual( + loaded_im_array.shape, original_grayscale_8bit_array.shape + ) + self.assertAllClose(loaded_im_array, original_grayscale_8bit_array) + + loaded_im = image_utils.load_img( + filename_grayscale_16bit, color_mode="grayscale" + ) + loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int16") + self.assertEqual( + loaded_im_array.shape, original_grayscale_16bit_array.shape + ) + self.assertAllClose(loaded_im_array, original_grayscale_16bit_array) + # test casting int16 image to float32 + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertAllClose(loaded_im_array, original_grayscale_16bit_array) + + loaded_im = image_utils.load_img( + filename_grayscale_32bit, color_mode="grayscale" + ) + loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int32") + self.assertEqual( + loaded_im_array.shape, original_grayscale_32bit_array.shape + ) + self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) + # test casting int32 image to float32 + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) + + # Test that nothing is changed when target size is equal to original. 
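The grayscale assertions above only hold because the dtype is threaded through both conversions; with the default float32, high-magnitude 32-bit pixel values would lose precision. A condensed sketch of the int32 round trip (file name arbitrary):

```python
import numpy as np
import tensorflow as tf

x = np.random.randint(-2147483648, 2147483647, (100, 100, 1)).astype(np.int32)
img = tf.keras.utils.array_to_img(x, scale=False, dtype="int32")
img.save("gray32.tiff")  # PIL mode "I" stores 32-bit signed pixels

loaded = tf.keras.utils.load_img("gray32.tiff", color_mode="grayscale")
arr = tf.keras.utils.img_to_array(loaded, dtype="int32")
np.testing.assert_allclose(arr, x)
```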
+ + loaded_im = image_utils.load_img(filename_rgb, target_size=(100, 100)) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, original_rgb_array.shape) + self.assertAllClose(loaded_im_array, original_rgb_array) + + loaded_im = image_utils.load_img( + filename_rgba, color_mode="rgba", target_size=(100, 100) + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, original_rgba_array.shape) + self.assertAllClose(loaded_im_array, original_rgba_array) + + loaded_im = image_utils.load_img( + filename_rgb, color_mode="grayscale", target_size=(100, 100) + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual( + loaded_im_array.shape, + (original_rgba_array.shape[0], original_rgba_array.shape[1], 1), + ) + + loaded_im = image_utils.load_img( + filename_grayscale_8bit, + color_mode="grayscale", + target_size=(100, 100), + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual( + loaded_im_array.shape, original_grayscale_8bit_array.shape + ) + self.assertAllClose(loaded_im_array, original_grayscale_8bit_array) + + loaded_im = image_utils.load_img( + filename_grayscale_16bit, + color_mode="grayscale", + target_size=(100, 100), + ) + loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int16") + self.assertEqual( + loaded_im_array.shape, original_grayscale_16bit_array.shape + ) + self.assertAllClose(loaded_im_array, original_grayscale_16bit_array) + + loaded_im = image_utils.load_img( + filename_grayscale_32bit, + color_mode="grayscale", + target_size=(100, 100), + ) + loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int32") + self.assertEqual( + loaded_im_array.shape, original_grayscale_32bit_array.shape + ) + self.assertAllClose(loaded_im_array, original_grayscale_32bit_array) + + # Test down-sampling with bilinear interpolation. + + loaded_im = image_utils.load_img(filename_rgb, target_size=(25, 25)) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, (25, 25, 3)) + + loaded_im = image_utils.load_img( + filename_rgba, color_mode="rgba", target_size=(25, 25) + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, (25, 25, 4)) + + loaded_im = image_utils.load_img( + filename_rgb, color_mode="grayscale", target_size=(25, 25) + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, (25, 25, 1)) + + loaded_im = image_utils.load_img( + filename_grayscale_8bit, + color_mode="grayscale", + target_size=(25, 25), + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, (25, 25, 1)) + + loaded_im = image_utils.load_img( + filename_grayscale_16bit, + color_mode="grayscale", + target_size=(25, 25), + ) + loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int16") + self.assertEqual(loaded_im_array.shape, (25, 25, 1)) + + loaded_im = image_utils.load_img( + filename_grayscale_32bit, + color_mode="grayscale", + target_size=(25, 25), + ) + loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int32") + self.assertEqual(loaded_im_array.shape, (25, 25, 1)) + + # Test down-sampling with nearest neighbor interpolation. 
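Down-sampling with different resampling filters produces different pixels, which is what the nearest-neighbor block below relies on when comparing against the bilinear result. A small illustration (the file name is borrowed from the RGB fixture above; any natural, noisy image works):

```python
import numpy as np
import tensorflow as tf

bilinear = tf.keras.utils.img_to_array(
    tf.keras.utils.load_img(
        "rgb_utils.png", target_size=(25, 25), interpolation="bilinear"
    )
)
nearest = tf.keras.utils.img_to_array(
    tf.keras.utils.load_img("rgb_utils.png", target_size=(25, 25))
)  # interpolation defaults to "nearest"
print(np.any(bilinear != nearest))  # True for a noisy 100x100 source
```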
+
+        loaded_im_nearest = image_utils.load_img(
+            filename_rgb, target_size=(25, 25), interpolation="nearest"
+        )
+        loaded_im_array_nearest = image_utils.img_to_array(loaded_im_nearest)
+        self.assertEqual(loaded_im_array_nearest.shape, (25, 25, 3))
+        self.assertTrue(np.any(loaded_im_array_nearest != loaded_im_array))
+
+        loaded_im_nearest = image_utils.load_img(
+            filename_rgba,
+            color_mode="rgba",
+            target_size=(25, 25),
+            interpolation="nearest",
+        )
+        loaded_im_array_nearest = image_utils.img_to_array(loaded_im_nearest)
+        self.assertEqual(loaded_im_array_nearest.shape, (25, 25, 4))
+        self.assertTrue(np.any(loaded_im_array_nearest != loaded_im_array))
+
+        loaded_im = image_utils.load_img(
+            filename_grayscale_8bit,
+            color_mode="grayscale",
+            target_size=(25, 25),
+            interpolation="nearest",
+        )
+        loaded_im_array = image_utils.img_to_array(loaded_im)
+        self.assertEqual(loaded_im_array.shape, (25, 25, 1))
+
+        loaded_im = image_utils.load_img(
+            filename_grayscale_16bit,
+            color_mode="grayscale",
+            target_size=(25, 25),
+            interpolation="nearest",
+        )
+        loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int16")
+        self.assertEqual(loaded_im_array.shape, (25, 25, 1))
+
+        loaded_im = image_utils.load_img(
+            filename_grayscale_32bit,
+            color_mode="grayscale",
+            target_size=(25, 25),
+            interpolation="nearest",
+        )
+        loaded_im_array = image_utils.img_to_array(loaded_im, dtype="int32")
+        self.assertEqual(loaded_im_array.shape, (25, 25, 1))
+
+        # Test different path type
+        with open(filename_grayscale_32bit, "rb") as f:
+            path_ = io.BytesIO(f.read())  # io.BytesIO
+        loaded_im = image_utils.load_img(path_, color_mode="grayscale")
+        loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32)
+        self.assertAllClose(loaded_im_array, original_grayscale_32bit_array)
+
+        path_ = filename_grayscale_32bit  # str
+        loaded_im = image_utils.load_img(path_, color_mode="grayscale")
+        loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32)
+        self.assertAllClose(loaded_im_array, original_grayscale_32bit_array)
+
+        path_ = filename_grayscale_32bit.encode()  # bytes
+        loaded_im = image_utils.load_img(path_, color_mode="grayscale")
+        loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32)
+        self.assertAllClose(loaded_im_array, original_grayscale_32bit_array)
+
+        path_ = pathlib.Path(
+            os.path.join(tmpdir.full_path, "grayscale_32bit_utils.tiff")
+        )
+        loaded_im = image_utils.load_img(path_, color_mode="grayscale")
+        loaded_im_array = image_utils.img_to_array(loaded_im, dtype=np.int32)
+        self.assertAllClose(loaded_im_array, original_grayscale_32bit_array)
+
+        # Check that exception is raised if interpolation not supported.
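`load_img` deliberately accepts every common path flavor, as the block above exercises; anything else raises a `TypeError`. A compact sketch (file name borrowed from the fixture above):

```python
import io
import pathlib
import tensorflow as tf

p = "grayscale_32bit_utils.tiff"
with open(p, "rb") as f:
    buf = io.BytesIO(f.read())

# str, bytes, os.PathLike, and in-memory file objects all work.
for path in (p, p.encode(), pathlib.Path(p), buf):
    img = tf.keras.utils.load_img(path, color_mode="grayscale")
    print(type(path).__name__, img.size)
```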
+ + loaded_im = image_utils.load_img( + filename_rgb, interpolation="unsupported" + ) + with self.assertRaises(ValueError): + loaded_im = image_utils.load_img( + filename_rgb, target_size=(25, 25), interpolation="unsupported" + ) + + # Check that the aspect ratio of a square is the same + + filename_red_square = os.path.join( + tmpdir.full_path, "red_square_utils.png" + ) + arr = np.zeros((50, 100, 3), dtype=np.uint8) # rectangle image 100x50 + arr[20:30, 45:55, 0] = 255 # red square 10x10 + red_square_array = np.array(arr) + red_square = image_utils.array_to_img(red_square_array, scale=False) + red_square.save(filename_red_square) + + loaded_im = image_utils.load_img( + filename_red_square, target_size=(25, 25), keep_aspect_ratio=True + ) + loaded_im_array = image_utils.img_to_array(loaded_im) + self.assertEqual(loaded_im_array.shape, (25, 25, 3)) + + red_channel_arr = loaded_im_array[:, :, 0].astype(bool) + square_width = np.sum(np.sum(red_channel_arr, axis=0)) + square_height = np.sum(np.sum(red_channel_arr, axis=1)) + aspect_ratio_result = square_width / square_height + + # original square had 1:1 ratio + self.assertNear(aspect_ratio_result, 1.0, 0.01) + + def test_array_to_img_and_img_to_array(self): + height, width = 10, 8 + + # Test the data format + # Test RGB 3D + x = np.random.random((3, height, width)) + img = image_utils.array_to_img(x, data_format="channels_first") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_first") + self.assertEqual(x.shape, (3, height, width)) + + # Test RGBA 3D + x = np.random.random((4, height, width)) + img = image_utils.array_to_img(x, data_format="channels_first") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_first") + self.assertEqual(x.shape, (4, height, width)) + + # Test 2D + x = np.random.random((1, height, width)) + img = image_utils.array_to_img(x, data_format="channels_first") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_first") + self.assertEqual(x.shape, (1, height, width)) + + # grayscale 32-bit signed integer + x = np.array( + np.random.randint(-2147483648, 2147483647, (1, height, width)), + dtype=np.int32, + ) + img = image_utils.array_to_img(x, data_format="channels_first") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_first") + self.assertEqual(x.shape, (1, height, width)) + + # Test tf data format + # Test RGB 3D + x = np.random.random((height, width, 3)) + img = image_utils.array_to_img(x, data_format="channels_last") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_last") + self.assertEqual(x.shape, (height, width, 3)) + + # Test RGBA 3D + x = np.random.random((height, width, 4)) + img = image_utils.array_to_img(x, data_format="channels_last") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_last") + self.assertEqual(x.shape, (height, width, 4)) + + # Test 2D + x = np.random.random((height, width, 1)) + img = image_utils.array_to_img(x, data_format="channels_last") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_last") + self.assertEqual(x.shape, (height, width, 1)) + + # grayscale 16-bit signed integer + x = np.array( + np.random.randint(-2147483648, 2147483647, (height, width, 1)), + dtype=np.int16, + ) + img = 
image_utils.array_to_img(x, data_format="channels_last") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_last") + self.assertEqual(x.shape, (height, width, 1)) + + # grayscale 32-bit signed integer + x = np.array( + np.random.randint(-2147483648, 2147483647, (height, width, 1)), + dtype=np.int32, + ) + img = image_utils.array_to_img(x, data_format="channels_last") + self.assertEqual(img.size, (width, height)) + + x = image_utils.img_to_array(img, data_format="channels_last") + self.assertEqual(x.shape, (height, width, 1)) + + # Test invalid use case + with self.assertRaises(ValueError): + x = np.random.random((height, width)) # not 3D + img = image_utils.array_to_img(x, data_format="channels_first") + + with self.assertRaises(ValueError): + x = np.random.random((height, width, 3)) + # unknown data_format + img = image_utils.array_to_img(x, data_format="channels") + + with self.assertRaises(ValueError): + # neither RGB, RGBA, or gray-scale + x = np.random.random((height, width, 5)) + img = image_utils.array_to_img(x, data_format="channels_last") + + with self.assertRaises(ValueError): + x = np.random.random((height, width, 3)) + # unknown data_format + img = image_utils.img_to_array(x, data_format="channels") + + with self.assertRaises(ValueError): + # neither RGB, RGBA, or gray-scale + x = np.random.random((height, width, 5, 3)) + img = image_utils.img_to_array(x, data_format="channels_last") + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/io_utils.py b/keras/utils/io_utils.py index 3f3e0173dd33..461ac8a18686 100644 --- a/keras/utils/io_utils.py +++ b/keras/utils/io_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=g-import-not-at-top + """Utilities related to disk I/O.""" import os @@ -20,102 +20,109 @@ import threading from absl import logging + from keras.utils import keras_logging +# isort: off from tensorflow.python.util.tf_export import keras_export - INTERACTIVE_LOGGING = threading.local() INTERACTIVE_LOGGING.enable = keras_logging.INTERACTIVE_LOGGING_DEFAULT -@keras_export('keras.utils.enable_interactive_logging') +@keras_export("keras.utils.enable_interactive_logging") def enable_interactive_logging(): - """Turn on interactive logging. + """Turn on interactive logging. - When interactive logging is enabled, Keras displays logs via stdout. - This provides the best experience when using Keras in an interactive - environment such as a shell or a notebook. - """ - INTERACTIVE_LOGGING.enable = True + When interactive logging is enabled, Keras displays logs via stdout. + This provides the best experience when using Keras in an interactive + environment such as a shell or a notebook. + """ + INTERACTIVE_LOGGING.enable = True -@keras_export('keras.utils.disable_interactive_logging') +@keras_export("keras.utils.disable_interactive_logging") def disable_interactive_logging(): - """Turn off interactive logging. + """Turn off interactive logging. - When interactive logging is disabled, Keras sends logs to `absl.logging`. - This is the best option when using Keras in a non-interactive - way, such as running a training or inference job on a server. - """ - INTERACTIVE_LOGGING.enable = False + When interactive logging is disabled, Keras sends logs to `absl.logging`. 
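The pair of toggles being reflowed here is the public switch between stdout and `absl.logging`. A usage sketch, assuming a TensorFlow release where these utils are exported (2.8+):

```python
import tensorflow as tf

# Batch job: route Keras messages (progress bars, prompts) to absl.logging.
tf.keras.utils.disable_interactive_logging()
assert not tf.keras.utils.is_interactive_logging_enabled()

# Notebook / shell: print directly to stdout again.
tf.keras.utils.enable_interactive_logging()
assert tf.keras.utils.is_interactive_logging_enabled()
```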
+ This is the best option when using Keras in a non-interactive + way, such as running a training or inference job on a server. + """ + INTERACTIVE_LOGGING.enable = False -@keras_export('keras.utils.is_interactive_logging_enabled') +@keras_export("keras.utils.is_interactive_logging_enabled") def is_interactive_logging_enabled(): - """Check if interactive logging is enabled. + """Check if interactive logging is enabled. - To switch between writing logs to stdout and `absl.logging`, you may use - `keras.utils.enable_interactive_logging()` and - `keras.utils.disable_interactie_logging()`. + To switch between writing logs to stdout and `absl.logging`, you may use + `keras.utils.enable_interactive_logging()` and + `keras.utils.disable_interactive_logging()`. - Returns: - Boolean (True if interactive logging is enabled and False otherwise). - """ - # Use `getattr` in case `INTERACTIVE_LOGGING` - # does not have the `enable` attribute. - return getattr(INTERACTIVE_LOGGING, 'enable', - keras_logging.INTERACTIVE_LOGGING_DEFAULT) + Returns: + Boolean (True if interactive logging is enabled and False otherwise). + """ + # Use `getattr` in case `INTERACTIVE_LOGGING` + # does not have the `enable` attribute. + return getattr( + INTERACTIVE_LOGGING, "enable", keras_logging.INTERACTIVE_LOGGING_DEFAULT + ) +@logging.skip_log_prefix def print_msg(message, line_break=True): - """Print the message to absl logging or stdout.""" - if is_interactive_logging_enabled(): - if line_break: - sys.stdout.write(message + '\n') + """Print the message to absl logging or stdout.""" + if is_interactive_logging_enabled(): + if line_break: + sys.stdout.write(message + "\n") + else: + sys.stdout.write(message) + sys.stdout.flush() else: - sys.stdout.write(message) - sys.stdout.flush() - else: - logging.info(message) + logging.info(message) def path_to_string(path): - """Convert `PathLike` objects to their string representation. + """Convert `PathLike` objects to their string representation. - If given a non-string typed path object, converts it to its string - representation. + If given a non-string typed path object, converts it to its string + representation. - If the object passed to `path` is not among the above, then it is - returned unchanged. This allows e.g. passthrough of file objects - through this function. + If the object passed to `path` is not among the above, then it is + returned unchanged. This allows e.g. passthrough of file objects + through this function. - Args: - path: `PathLike` object that represents a path + Args: + path: `PathLike` object that represents a path - Returns: - A string representation of the path argument, if Python support exists. - """ - if isinstance(path, os.PathLike): - return os.fspath(path) - return path + Returns: + A string representation of the path argument, if Python support exists. + """ + if isinstance(path, os.PathLike): + return os.fspath(path) + return path def ask_to_proceed_with_overwrite(filepath): - """Produces a prompt asking about overwriting a file. - - Args: - filepath: the path to the file to be overwritten. - - Returns: - True if we can proceed with overwrite, False otherwise. - """ - overwrite = input('[WARNING] %s already exists - overwrite? 
' - '[y/n]' % (filepath)).strip().lower() - while overwrite not in ('y', 'n'): - overwrite = input('Enter "y" (overwrite) or "n" ' - '(cancel).').strip().lower() - if overwrite == 'n': - return False - print_msg('[TIP] Next time specify overwrite=True!') - return True + """Produces a prompt asking about overwriting a file. + + Args: + filepath: the path to the file to be overwritten. + + Returns: + True if we can proceed with overwrite, False otherwise. + """ + overwrite = ( + input(f"[WARNING] {filepath} already exists - overwrite? [y/n]") + .strip() + .lower() + ) + while overwrite not in ("y", "n"): + overwrite = ( + input('Enter "y" (overwrite) or "n" (cancel).').strip().lower() + ) + if overwrite == "n": + return False + print_msg("[TIP] Next time specify overwrite=True!") + return True diff --git a/keras/utils/io_utils_test.py b/keras/utils/io_utils_test.py index a25cda6854f3..445bbaab76d8 100644 --- a/keras/utils/io_utils_test.py +++ b/keras/utils/io_utils_test.py @@ -15,69 +15,74 @@ """Tests for io_utils.""" import builtins -from pathlib import Path import sys +from pathlib import Path + +import tensorflow.compat.v2 as tf from keras.testing_infra import test_combinations from keras.utils import io_utils -import tensorflow.compat.v2 as tf class TestIOUtils(test_combinations.TestCase): - - def test_ask_to_proceed_with_overwrite(self): - with tf.compat.v1.test.mock.patch.object(builtins, 'input') as mock_log: - mock_log.return_value = 'y' - self.assertTrue(io_utils.ask_to_proceed_with_overwrite('/tmp/not_exists')) - - mock_log.return_value = 'n' - self.assertFalse( - io_utils.ask_to_proceed_with_overwrite('/tmp/not_exists')) - - mock_log.side_effect = ['m', 'y'] - self.assertTrue(io_utils.ask_to_proceed_with_overwrite('/tmp/not_exists')) - - mock_log.side_effect = ['m', 'n'] - self.assertFalse( - io_utils.ask_to_proceed_with_overwrite('/tmp/not_exists')) - - def test_path_to_string(self): - - class PathLikeDummy: - - def __fspath__(self): - return 'dummypath' - - dummy = object() - # conversion of PathLike - self.assertEqual(io_utils.path_to_string(Path('path')), 'path') - self.assertEqual(io_utils.path_to_string(PathLikeDummy()), 'dummypath') - - # pass-through, works for all versions of python - self.assertEqual(io_utils.path_to_string('path'), 'path') - self.assertIs(io_utils.path_to_string(dummy), dummy) - - def test_print_msg(self): - enabled = io_utils.is_interactive_logging_enabled() - - io_utils.disable_interactive_logging() - self.assertFalse(io_utils.is_interactive_logging_enabled()) - - with self.assertLogs(level='INFO') as logged: - io_utils.print_msg('Testing Message') - self.assertIn('Testing Message', logged.output[0]) - - io_utils.enable_interactive_logging() - self.assertTrue(io_utils.is_interactive_logging_enabled()) - - with self.captureWritesToStream(sys.stdout) as printed: - io_utils.print_msg('Testing Message') - self.assertEqual('Testing Message\n', printed.contents()) - - if enabled: - io_utils.enable_interactive_logging() - else: - io_utils.disable_interactive_logging() - -if __name__ == '__main__': - tf.test.main() + def test_ask_to_proceed_with_overwrite(self): + with tf.compat.v1.test.mock.patch.object(builtins, "input") as mock_log: + mock_log.return_value = "y" + self.assertTrue( + io_utils.ask_to_proceed_with_overwrite("/tmp/not_exists") + ) + + mock_log.return_value = "n" + self.assertFalse( + io_utils.ask_to_proceed_with_overwrite("/tmp/not_exists") + ) + + mock_log.side_effect = ["m", "y"] + self.assertTrue( + 
io_utils.ask_to_proceed_with_overwrite("/tmp/not_exists") + ) + + mock_log.side_effect = ["m", "n"] + self.assertFalse( + io_utils.ask_to_proceed_with_overwrite("/tmp/not_exists") + ) + + def test_path_to_string(self): + class PathLikeDummy: + def __fspath__(self): + return "dummypath" + + dummy = object() + # conversion of PathLike + self.assertEqual(io_utils.path_to_string(Path("path")), "path") + self.assertEqual(io_utils.path_to_string(PathLikeDummy()), "dummypath") + + # pass-through, works for all versions of python + self.assertEqual(io_utils.path_to_string("path"), "path") + self.assertIs(io_utils.path_to_string(dummy), dummy) + + def test_print_msg(self): + enabled = io_utils.is_interactive_logging_enabled() + + io_utils.disable_interactive_logging() + self.assertFalse(io_utils.is_interactive_logging_enabled()) + + with self.assertLogs(level="INFO") as logged: + io_utils.print_msg("Testing Message") + self.assertIn("Testing Message", logged.output[0]) + + io_utils.enable_interactive_logging() + self.assertTrue(io_utils.is_interactive_logging_enabled()) + + with self.captureWritesToStream(sys.stdout) as printed: + io_utils.print_msg("Testing Message") + self.assertEqual("Testing Message\n", printed.contents()) + + if enabled: + io_utils.enable_interactive_logging() + else: + io_utils.disable_interactive_logging() + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/kernelized_utils.py b/keras/utils/kernelized_utils.py index 75b20fd11227..22fee770824d 100644 --- a/keras/utils/kernelized_utils.py +++ b/keras/utils/kernelized_utils.py @@ -18,95 +18,95 @@ def _to_matrix(u): - """If input tensor is a vector (i.e., has rank 1), converts it to matrix.""" - u_rank = len(u.shape) - if u_rank not in [1, 2]: - raise ValueError('The input tensor should have rank 1 or 2. ' - f'Received rank: {u_rank}') - if u_rank == 1: - return tf.expand_dims(u, 0) - return u + """If input tensor is a vector (i.e., has rank 1), converts it to matrix.""" + u_rank = len(u.shape) + if u_rank not in [1, 2]: + raise ValueError( + f"The input tensor should have rank 1 or 2. Received rank: {u_rank}" + ) + if u_rank == 1: + return tf.expand_dims(u, 0) + return u def _align_matrices(x, y): - """Aligns x and y tensors to allow computations over pairs of their rows.""" - x_matrix = _to_matrix(x) - y_matrix = _to_matrix(y) - x_shape = x_matrix.shape - y_shape = y_matrix.shape - if y_shape[1] != x_shape[1]: # dimensions do not match. - raise ValueError( - 'The outermost dimensions of the input tensors should match. ' - f'Received y = {y_shape[1]} vs x = {x_shape[1]}.') - - x_tile = tf.tile( - tf.expand_dims(x_matrix, 1), [1, y_shape[0], 1]) - y_tile = tf.tile( - tf.expand_dims(y_matrix, 0), [x_shape[0], 1, 1]) - return x_tile, y_tile + """Aligns x and y tensors to allow computations over pairs of their rows.""" + x_matrix = _to_matrix(x) + y_matrix = _to_matrix(y) + x_shape = x_matrix.shape + y_shape = y_matrix.shape + if y_shape[1] != x_shape[1]: # dimensions do not match. + raise ValueError( + "The outermost dimensions of the input tensors should match. " + f"Received y = {y_shape[1]} vs x = {x_shape[1]}." 
+    )
+
+    x_tile = tf.tile(tf.expand_dims(x_matrix, 1), [1, y_shape[0], 1])
+    y_tile = tf.tile(tf.expand_dims(y_matrix, 0), [x_shape[0], 1, 1])
+    return x_tile, y_tile
 
 
 def inner_product(u, v):
-  u = _to_matrix(u)
-  v = _to_matrix(v)
-  return tf.matmul(u, v, transpose_b=True)
+    u = _to_matrix(u)
+    v = _to_matrix(v)
+    return tf.matmul(u, v, transpose_b=True)
 
 
 def exact_gaussian_kernel(x, y, stddev):
-  r"""Computes exact Gaussian kernel value(s) for tensors x and y and stddev.
-
-  The Gaussian kernel for vectors u, v is defined as follows:
-      K(u, v) = exp(-||u-v||^2 / (2* stddev^2))
-  where the norm is the l2-norm. x, y can be either vectors or matrices. If they
-  are vectors, they must have the same dimension. If they are matrices, they
-  must have the same number of columns. In the latter case, the method returns
-  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
-  v is a row from y.
-
-  Args:
-    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
-    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
-    stddev: The width of the Gaussian kernel.
-
-  Returns:
-    A single value (scalar) with shape (1, 1) (if x, y are vectors) or a matrix
-    of shape (m, n) with entries K(u, v) (where K is the Gaussian kernel) for
-    all (u,v) pairs where u, v are rows from x and y respectively.
-
-  Raises:
-    ValueError: if the shapes of x, y are not compatible.
-  """
-  x_aligned, y_aligned = _align_matrices(x, y)
-  diff_squared_l2_norm = tf.reduce_sum(
-      tf.math.squared_difference(x_aligned, y_aligned), 2)
-  return tf.exp(-diff_squared_l2_norm / (2 * stddev * stddev))
+    r"""Computes exact Gaussian kernel value(s) for tensors x and y and stddev.
+
+    The Gaussian kernel for vectors u, v is defined as follows:
+        K(u, v) = exp(-||u-v||^2 / (2 * stddev^2))
+    where the norm is the l2-norm. x, y can be either vectors or matrices. If
+    they are vectors, they must have the same dimension. If they are matrices,
+    they must have the same number of columns. In the latter case, the method
+    returns (as a matrix) K(u, v) values for all pairs (u, v) where u is a row
+    from x and v is a row from y.
+
+    Args:
+      x: a tensor of rank 1 or 2. Its shape should be either [dim] or [m, dim].
+      y: a tensor of rank 1 or 2. Its shape should be either [dim] or [n, dim].
+      stddev: The width of the Gaussian kernel.
+
+    Returns:
+      A single value (scalar) with shape (1, 1) (if x, y are vectors) or a
+      matrix of shape (m, n) with entries K(u, v) (where K is the Gaussian
+      kernel) for all (u,v) pairs where u, v are rows from x and y respectively.
+
+    Raises:
+      ValueError: if the shapes of x, y are not compatible.
+    """
+    x_aligned, y_aligned = _align_matrices(x, y)
+    diff_squared_l2_norm = tf.reduce_sum(
+        tf.math.squared_difference(x_aligned, y_aligned), 2
+    )
+    return tf.exp(-diff_squared_l2_norm / (2 * stddev * stddev))
 
 
 def exact_laplacian_kernel(x, y, stddev):
-  r"""Computes exact Laplacian kernel value(s) for tensors x and y using stddev.
-
-  The Laplacian kernel for vectors u, v is defined as follows:
-      K(u, v) = exp(-||u-v|| / stddev)
-  where the norm is the l1-norm. x, y can be either vectors or matrices. If they
-  are vectors, they must have the same dimension. If they are matrices, they
-  must have the same number of columns. In the latter case, the method returns
-  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
-  v is a row from y.
-
-  Args:
-    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
-    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
-    stddev: The width of the Gaussian kernel.
-
-  Returns:
-    A single value (scalar) with shape (1, 1) if x, y are vectors or a matrix
-    of shape (m, n) with entries K(u, v) (where K is the Laplacian kernel) for
-    all (u,v) pairs where u, v are rows from x and y respectively.
-
-  Raises:
-    ValueError: if the shapes of x, y are not compatible.
-  """
-  x_aligned, y_aligned = _align_matrices(x, y)
-  diff_l1_norm = tf.reduce_sum(
-      tf.abs(tf.subtract(x_aligned, y_aligned)), 2)
-  return tf.exp(-diff_l1_norm / stddev)
+    r"""Computes exact Laplacian kernel value(s) for tensors x & y using stddev.
+
+    The Laplacian kernel for vectors u, v is defined as follows:
+        K(u, v) = exp(-||u-v|| / stddev)
+    where the norm is the l1-norm. x, y can be either vectors or matrices. If
+    they are vectors, they must have the same dimension. If they are matrices,
+    they must have the same number of columns. In the latter case, the method
+    returns (as a matrix) K(u, v) values for all pairs (u, v) where u is a row
+    from x and v is a row from y.
+
+    Args:
+      x: a tensor of rank 1 or 2. Its shape should be either [dim] or [m, dim].
+      y: a tensor of rank 1 or 2. Its shape should be either [dim] or [n, dim].
+      stddev: The width of the Laplacian kernel.
+
+    Returns:
+      A single value (scalar) with shape (1, 1) if x, y are vectors or a matrix
+      of shape (m, n) with entries K(u, v) (where K is the Laplacian kernel) for
+      all (u,v) pairs where u, v are rows from x and y respectively.
+
+    Raises:
+      ValueError: if the shapes of x, y are not compatible.
+    """
+    x_aligned, y_aligned = _align_matrices(x, y)
+    diff_l1_norm = tf.reduce_sum(tf.abs(tf.subtract(x_aligned, y_aligned)), 2)
+    return tf.exp(-diff_l1_norm / stddev)
diff --git a/keras/utils/kernelized_utils_test.py b/keras/utils/kernelized_utils_test.py
index 4985e6b7b8f3..cc562325eaf6 100644
--- a/keras/utils/kernelized_utils_test.py
+++ b/keras/utils/kernelized_utils_test.py
@@ -14,98 +14,114 @@
 # ==============================================================================
 """Tests for kernelized_utils.py."""
 
-import tensorflow.compat.v2 as tf
-
 import functools
 
+import tensorflow.compat.v2 as tf
 from absl.testing import parameterized
+
 from keras.utils import kernelized_utils
 
 
 def _exact_gaussian(stddev):
-  return functools.partial(
-      kernelized_utils.exact_gaussian_kernel, stddev=stddev)
+    return functools.partial(
+        kernelized_utils.exact_gaussian_kernel, stddev=stddev
+    )
 
 
 def _exact_laplacian(stddev):
-  return functools.partial(
-      kernelized_utils.exact_laplacian_kernel, stddev=stddev)
+    return functools.partial(
+        kernelized_utils.exact_laplacian_kernel, stddev=stddev
+    )
 
 
 class KernelizedUtilsTest(tf.test.TestCase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]),
-      ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]]))
-  def test_equal_vectors(self, exact_kernel_fn, expected_values):
-    """Identical vectors give exactly the identity kernel value."""
-    x = tf.constant([0.5, -0.5, -0.5, 0.5])
-    y = tf.constant([0.5, -0.5, -0.5, 0.5])
-    exact_kernel = exact_kernel_fn(x, y)
-    shape = exact_kernel.shape.as_list()
-    self.assertLen(shape, 2)
-    # x and y are identical and therefore K(x, y) will be precisely equal to
-    # the identity value of the kernel.
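The two kernels just reformatted can be reproduced with plain broadcasting, which does the same job as the `tf.tile` alignment in `_align_matrices`. A sketch that checks one Gaussian and one Laplacian entry numerically:

```python
import tensorflow as tf

x = tf.constant([[1.0, 2.0], [0.0, 0.0]])  # m = 2 rows
y = tf.constant([[1.0, 2.0]])              # n = 1 row
stddev = 2.0

# Pairwise squared-L2 distances via broadcasting -> shape (m, n).
sq = tf.reduce_sum(
    tf.math.squared_difference(x[:, None, :], y[None, :, :]), axis=2
)
print(tf.exp(-sq / (2.0 * stddev * stddev)).numpy())
# [[1.0], [exp(-5/8)]] ~ [[1.0], [0.535]]

# Pairwise L1 distances for the Laplacian kernel.
l1 = tf.reduce_sum(tf.abs(x[:, None, :] - y[None, :, :]), axis=2)
print(tf.exp(-l1 / stddev).numpy())
# [[1.0], [exp(-3/2)]] ~ [[1.0], [0.223]]
```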
- self.assertAllClose(expected_values, exact_kernel, atol=1e-6) - - @parameterized.named_parameters( - ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]), - ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]])) - def test_almost_identical_vectors(self, exact_kernel_fn, expected_values): - """Almost identical vectors give the identity kernel value.""" - x = tf.constant([1.0, 0.4, -2.1, -1.1]) - y = tf.constant([1.01, 0.39, -2.099, -1.101]) - exact_kernel = exact_kernel_fn(x, y) - shape = exact_kernel.shape.as_list() - self.assertLen(shape, 2) - # x and y are almost identical and therefore K(x, y) will be almost equal to - # the identity value of the kernel. - self.assertAllClose(expected_values, exact_kernel, atol=1e-3) - - @parameterized.named_parameters( - ('gaussian', _exact_gaussian(stddev=1.0), [[0.99], [0.977]]), - ('laplacian', _exact_laplacian(stddev=5.0), [[0.96], [0.94]])) - def test_similar_matrices(self, exact_kernel_fn, expected_values): - """Pairwise "close" vectors give high kernel values (similarity scores).""" - x = tf.constant([1.0, 3.4, -2.1, 0.9, 3.3, -2.0], shape=[2, 3]) - y = tf.constant([1.1, 3.35, -2.05]) - exact_kernel = exact_kernel_fn(x, y) - shape = exact_kernel.shape.as_list() - self.assertLen(shape, 2) - # The 2 rows of x are close to y. The pairwise kernel values (similarity - # scores) are somewhat close to the identity value of the kernel. - self.assertAllClose(expected_values, exact_kernel, atol=1e-2) - - @parameterized.named_parameters( - ('gaussian', _exact_gaussian(stddev=2.0), [[.997, .279], [.251, 1.], - [.164, 0.019]]), - ('laplacian', _exact_laplacian(stddev=2.0), [[.904, .128], [.116, 1.], - [.07, 0.027]])) - def test_matrices_varying_similarity(self, exact_kernel_fn, expected_values): - """Test matrices with row vectors of varying pairwise similarity.""" - x = tf.constant([1.0, 2., -2., 0.9, 3.3, -1.0], shape=[3, 2]) - y = tf.constant([1.1, 2.1, -2., 0.9], shape=[2, 2]) - exact_kernel = exact_kernel_fn(x, y) - - shape = exact_kernel.shape.as_list() - self.assertLen(shape, 2) - self.assertAllClose(expected_values, exact_kernel, atol=1e-2) - - @parameterized.named_parameters( - ('gaussian', _exact_gaussian(stddev=1.0), [[0.0]]), - ('laplacian', _exact_laplacian(stddev=1.0), [[0.0]])) - def test_completely_dissimilar_vectors(self, exact_kernel_fn, - expected_values): - """Very dissimilar vectors give very low similarity scores.""" - x = tf.constant([1.0, 3.4, -2.1, -5.1]) - y = tf.constant([0.5, 2.1, 1.0, 3.0]) - exact_kernel = exact_kernel_fn(x, y) - shape = exact_kernel.shape.as_list() - self.assertLen(shape, 2) - # x and y are very "far" from each other and so the corresponding kernel - # value will be very low. - self.assertAllClose(expected_values, exact_kernel, atol=1e-2) - - -if __name__ == '__main__': - tf.test.main() + @parameterized.named_parameters( + ("gaussian", _exact_gaussian(stddev=10.0), [[1.0]]), + ("laplacian", _exact_laplacian(stddev=50.0), [[1.0]]), + ) + def test_equal_vectors(self, exact_kernel_fn, expected_values): + """Identical vectors give exactly the identity kernel value.""" + x = tf.constant([0.5, -0.5, -0.5, 0.5]) + y = tf.constant([0.5, -0.5, -0.5, 0.5]) + exact_kernel = exact_kernel_fn(x, y) + shape = exact_kernel.shape.as_list() + self.assertLen(shape, 2) + # x and y are identical and therefore K(x, y) will be precisely equal to + # the identity value of the kernel. 
+ self.assertAllClose(expected_values, exact_kernel, atol=1e-6) + + @parameterized.named_parameters( + ("gaussian", _exact_gaussian(stddev=10.0), [[1.0]]), + ("laplacian", _exact_laplacian(stddev=50.0), [[1.0]]), + ) + def test_almost_identical_vectors(self, exact_kernel_fn, expected_values): + """Almost identical vectors give the identity kernel value.""" + x = tf.constant([1.0, 0.4, -2.1, -1.1]) + y = tf.constant([1.01, 0.39, -2.099, -1.101]) + exact_kernel = exact_kernel_fn(x, y) + shape = exact_kernel.shape.as_list() + self.assertLen(shape, 2) + # x and y are almost identical and therefore K(x, y) will be almost + # equal to the identity value of the kernel. + self.assertAllClose(expected_values, exact_kernel, atol=1e-3) + + @parameterized.named_parameters( + ("gaussian", _exact_gaussian(stddev=1.0), [[0.99], [0.977]]), + ("laplacian", _exact_laplacian(stddev=5.0), [[0.96], [0.94]]), + ) + def test_similar_matrices(self, exact_kernel_fn, expected_values): + """Pairwise "close" vectors give high kernel values (similarity + scores).""" + x = tf.constant([1.0, 3.4, -2.1, 0.9, 3.3, -2.0], shape=[2, 3]) + y = tf.constant([1.1, 3.35, -2.05]) + exact_kernel = exact_kernel_fn(x, y) + shape = exact_kernel.shape.as_list() + self.assertLen(shape, 2) + # The 2 rows of x are close to y. The pairwise kernel values (similarity + # scores) are somewhat close to the identity value of the kernel. + self.assertAllClose(expected_values, exact_kernel, atol=1e-2) + + @parameterized.named_parameters( + ( + "gaussian", + _exact_gaussian(stddev=2.0), + [[0.997, 0.279], [0.251, 1.0], [0.164, 0.019]], + ), + ( + "laplacian", + _exact_laplacian(stddev=2.0), + [[0.904, 0.128], [0.116, 1.0], [0.07, 0.027]], + ), + ) + def test_matrices_varying_similarity( + self, exact_kernel_fn, expected_values + ): + """Test matrices with row vectors of varying pairwise similarity.""" + x = tf.constant([1.0, 2.0, -2.0, 0.9, 3.3, -1.0], shape=[3, 2]) + y = tf.constant([1.1, 2.1, -2.0, 0.9], shape=[2, 2]) + exact_kernel = exact_kernel_fn(x, y) + + shape = exact_kernel.shape.as_list() + self.assertLen(shape, 2) + self.assertAllClose(expected_values, exact_kernel, atol=1e-2) + + @parameterized.named_parameters( + ("gaussian", _exact_gaussian(stddev=1.0), [[0.0]]), + ("laplacian", _exact_laplacian(stddev=1.0), [[0.0]]), + ) + def test_completely_dissimilar_vectors( + self, exact_kernel_fn, expected_values + ): + """Very dissimilar vectors give very low similarity scores.""" + x = tf.constant([1.0, 3.4, -2.1, -5.1]) + y = tf.constant([0.5, 2.1, 1.0, 3.0]) + exact_kernel = exact_kernel_fn(x, y) + shape = exact_kernel.shape.as_list() + self.assertLen(shape, 2) + # x and y are very "far" from each other and so the corresponding kernel + # value will be very low. 
+ self.assertAllClose(expected_values, exact_kernel, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/kpl_test_utils.py b/keras/utils/kpl_test_utils.py index 30232a842274..e96677f447fb 100644 --- a/keras/utils/kpl_test_utils.py +++ b/keras/utils/kpl_test_utils.py @@ -14,167 +14,192 @@ # ============================================================================== """Test related utilities for KPL + tf.distribute.""" -import tensorflow.compat.v2 as tf - import random import tempfile +import tensorflow.compat.v2 as tf + import keras from keras.layers.preprocessing import string_lookup class DistributeKplTestUtils(tf.test.TestCase): - """Utils for test of tf.distribute + KPL.""" - FEATURE_VOCAB = [ - "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong", - "wonder_woman" - ] - LABEL_VOCAB = ["yes", "no"] - - def define_kpls_for_training(self, use_adapt): - """Function that defines KPL used for unit tests of tf.distribute. - - Args: - use_adapt: if adapt will be called. False means there will be precomputed - statistics. - - Returns: - feature_mapper: a simple keras model with one keras StringLookup layer - which maps feature to index. - label_mapper: similar to feature_mapper, but maps label to index. - - """ - if use_adapt: - feature_lookup_layer = ( - string_lookup.StringLookup( - num_oov_indices=1)) - feature_lookup_layer.adapt(self.FEATURE_VOCAB) - label_lookup_layer = ( - string_lookup.StringLookup( - num_oov_indices=0, mask_token=None)) - label_lookup_layer.adapt(self.LABEL_VOCAB) - else: - feature_lookup_layer = ( - string_lookup.StringLookup( - vocabulary=self.FEATURE_VOCAB, num_oov_indices=1)) - label_lookup_layer = ( - string_lookup.StringLookup( - vocabulary=self.LABEL_VOCAB, num_oov_indices=0, mask_token=None)) - - raw_feature_input = keras.layers.Input( - shape=(3,), dtype=tf.string, name="feature", ragged=True) - feature_id_input = feature_lookup_layer(raw_feature_input) - feature_mapper = keras.Model({"features": raw_feature_input}, - feature_id_input) - - raw_label_input = keras.layers.Input( - shape=(1,), dtype=tf.string, name="label") - label_id_input = label_lookup_layer(raw_label_input) - label_mapper = keras.Model({"label": raw_label_input}, label_id_input) - - return feature_mapper, label_mapper - - def dataset_fn(self, feature_mapper, label_mapper): - """Function that generates dataset for test of tf.distribute + KPL. - - Args: - feature_mapper: a simple keras model with one keras StringLookup layer - which maps feature to index. - label_mapper: similar to feature_mapper, but maps label to index. - - Returns: - Generated dataset for test of tf.distribute + KPL. - - """ - - def feature_and_label_gen(): - # Generator of dataset. - while True: - features = random.sample(self.FEATURE_VOCAB, 3) - label = ["yes"] if self.FEATURE_VOCAB[0] in features else ["no"] - yield {"features": features, "label": label} - - raw_dataset = tf.data.Dataset.from_generator( - feature_and_label_gen, - output_signature={ - "features": tf.TensorSpec([3], tf.string), - "label": tf.TensorSpec([1], tf.string) - }).shuffle(100).batch(32) - - train_dataset = raw_dataset.map(lambda x: ( # pylint: disable=g-long-lambda - { - "features": feature_mapper(x["features"]) - }, label_mapper(x["label"]))) - return train_dataset - - def define_model(self): - """A simple model for test of tf.distribute + KPL.""" - # Create the model. The input needs to be compatible with KPLs. 
- model_input = keras.layers.Input( - shape=(3,), dtype=tf.int64, name="model_input") - - # input_dim includes a mask token and an oov token. - emb_output = keras.layers.Embedding( - input_dim=len(self.FEATURE_VOCAB) + 2, output_dim=20)( - model_input) - emb_output = tf.reduce_mean(emb_output, axis=1) - dense_output = keras.layers.Dense( - units=1, activation="sigmoid")( - emb_output) - model = keras.Model({"features": model_input}, dense_output) - return model - - def define_reverse_lookup_layer(self): - """Create string reverse lookup layer for serving.""" - - label_inverse_lookup_layer = string_lookup.StringLookup( - num_oov_indices=0, - mask_token=None, - vocabulary=self.LABEL_VOCAB, - invert=True) - return label_inverse_lookup_layer - - def create_serving_signature(self, model, feature_mapper, - label_inverse_lookup_layer): - """Create serving signature for the given model.""" - - @tf.function - def serve_fn(raw_features): - raw_features = tf.expand_dims(raw_features, axis=0) - transformed_features = model.feature_mapper(raw_features) - outputs = model(transformed_features) - outputs = tf.squeeze(outputs, axis=0) - outputs = tf.cast(tf.greater(outputs, 0.5), tf.int64) - decoded_outputs = model.label_inverse_lookup_layer(outputs) - return tf.squeeze(decoded_outputs, axis=0) - - model.feature_mapper = feature_mapper - model.label_inverse_lookup_layer = label_inverse_lookup_layer - # serving does NOT have batch dimension - return serve_fn.get_concrete_function( - tf.TensorSpec( - shape=(3), dtype=tf.string, name="example")) - - def test_save_load_serving_model(self, model, feature_mapper, - label_inverse_lookup_layer): - """Test save/load/serving model.""" - - serving_fn = self.create_serving_signature(model, feature_mapper, - label_inverse_lookup_layer) - - saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - model.save(saved_model_dir, save_format="tf", - signatures={"serving_default": serving_fn}) - - # Test the saved_model. - loaded_serving_fn = keras.saving.save.load_model( - saved_model_dir).signatures["serving_default"] - - # check the result w/ and w/o avenger. - prediction0 = loaded_serving_fn( - tf.constant(["avenger", "ironman", "avenger"]))["output_0"] - self.assertIn(prediction0.numpy().decode("UTF-8"), ("yes", "no")) - - prediction1 = loaded_serving_fn( - tf.constant(["ironman", "ironman", "unknown"]))["output_0"] - self.assertIn(prediction1.numpy().decode("UTF-8"), ("yes", "no")) + """Utils for test of tf.distribute + KPL.""" + + FEATURE_VOCAB = [ + "avenger", + "ironman", + "batman", + "hulk", + "spiderman", + "kingkong", + "wonder_woman", + ] + LABEL_VOCAB = ["yes", "no"] + + def define_kpls_for_training(self, use_adapt): + """Function that defines KPL used for unit tests of tf.distribute. + + Args: + use_adapt: if adapt will be called. False means there will be + precomputed statistics. + + Returns: + feature_mapper: a simple keras model with one keras StringLookup layer + which maps feature to index. + label_mapper: similar to feature_mapper, but maps label to index. 
+ + """ + if use_adapt: + feature_lookup_layer = string_lookup.StringLookup(num_oov_indices=1) + feature_lookup_layer.adapt(self.FEATURE_VOCAB) + label_lookup_layer = string_lookup.StringLookup( + num_oov_indices=0, mask_token=None + ) + label_lookup_layer.adapt(self.LABEL_VOCAB) + else: + feature_lookup_layer = string_lookup.StringLookup( + vocabulary=self.FEATURE_VOCAB, num_oov_indices=1 + ) + label_lookup_layer = string_lookup.StringLookup( + vocabulary=self.LABEL_VOCAB, num_oov_indices=0, mask_token=None + ) + + raw_feature_input = keras.layers.Input( + shape=(3,), dtype=tf.string, name="feature", ragged=True + ) + feature_id_input = feature_lookup_layer(raw_feature_input) + feature_mapper = keras.Model( + {"features": raw_feature_input}, feature_id_input + ) + + raw_label_input = keras.layers.Input( + shape=(1,), dtype=tf.string, name="label" + ) + label_id_input = label_lookup_layer(raw_label_input) + label_mapper = keras.Model({"label": raw_label_input}, label_id_input) + + return feature_mapper, label_mapper + + def dataset_fn(self, feature_mapper, label_mapper): + """Function that generates dataset for test of tf.distribute + KPL. + + Args: + feature_mapper: a simple keras model with one keras StringLookup layer + which maps feature to index. + label_mapper: similar to feature_mapper, but maps label to index. + + Returns: + Generated dataset for test of tf.distribute + KPL. + + """ + + def feature_and_label_gen(): + # Generator of dataset. + while True: + features = random.sample(self.FEATURE_VOCAB, 3) + label = ["yes"] if self.FEATURE_VOCAB[0] in features else ["no"] + yield {"features": features, "label": label} + + raw_dataset = ( + tf.data.Dataset.from_generator( + feature_and_label_gen, + output_signature={ + "features": tf.TensorSpec([3], tf.string), + "label": tf.TensorSpec([1], tf.string), + }, + ) + .shuffle(100) + .batch(32) + ) + + train_dataset = raw_dataset.map( + lambda x: ( + {"features": feature_mapper(x["features"])}, + label_mapper(x["label"]), + ) + ) + return train_dataset + + def define_model(self): + """A simple model for test of tf.distribute + KPL.""" + # Create the model. The input needs to be compatible with KPLs. + model_input = keras.layers.Input( + shape=(3,), dtype=tf.int64, name="model_input" + ) + + # input_dim includes a mask token and an oov token. 
+ emb_output = keras.layers.Embedding( + input_dim=len(self.FEATURE_VOCAB) + 2, output_dim=20 + )(model_input) + emb_output = tf.reduce_mean(emb_output, axis=1) + dense_output = keras.layers.Dense(units=1, activation="sigmoid")( + emb_output + ) + model = keras.Model({"features": model_input}, dense_output) + return model + + def define_reverse_lookup_layer(self): + """Create string reverse lookup layer for serving.""" + + label_inverse_lookup_layer = string_lookup.StringLookup( + num_oov_indices=0, + mask_token=None, + vocabulary=self.LABEL_VOCAB, + invert=True, + ) + return label_inverse_lookup_layer + + def create_serving_signature( + self, model, feature_mapper, label_inverse_lookup_layer + ): + """Create serving signature for the given model.""" + + @tf.function + def serve_fn(raw_features): + raw_features = tf.expand_dims(raw_features, axis=0) + transformed_features = model.feature_mapper(raw_features) + outputs = model(transformed_features) + outputs = tf.squeeze(outputs, axis=0) + outputs = tf.cast(tf.greater(outputs, 0.5), tf.int64) + decoded_outputs = model.label_inverse_lookup_layer(outputs) + return tf.squeeze(decoded_outputs, axis=0) + + model.feature_mapper = feature_mapper + model.label_inverse_lookup_layer = label_inverse_lookup_layer + # serving does NOT have batch dimension + return serve_fn.get_concrete_function( + tf.TensorSpec(shape=(3), dtype=tf.string, name="example") + ) + + def test_save_load_serving_model( + self, model, feature_mapper, label_inverse_lookup_layer + ): + """Test save/load/serving model.""" + + serving_fn = self.create_serving_signature( + model, feature_mapper, label_inverse_lookup_layer + ) + + saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) + model.save( + saved_model_dir, + save_format="tf", + signatures={"serving_default": serving_fn}, + ) + + # Test the saved_model. + loaded_serving_fn = keras.saving.legacy.save.load_model( + saved_model_dir + ).signatures["serving_default"] + + # check the result w/ and w/o avenger. + prediction0 = loaded_serving_fn( + tf.constant(["avenger", "ironman", "avenger"]) + )["output_0"] + self.assertIn(prediction0.numpy().decode("UTF-8"), ("yes", "no")) + + prediction1 = loaded_serving_fn( + tf.constant(["ironman", "ironman", "unknown"]) + )["output_0"] + self.assertIn(prediction1.numpy().decode("UTF-8"), ("yes", "no")) diff --git a/keras/utils/layer_utils.py b/keras/utils/layer_utils.py index df81f85b090f..c15434667043 100644 --- a/keras/utils/layer_utils.py +++ b/keras/utils/layer_utils.py @@ -12,699 +12,1101 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access + """Utilities related to layer/model functionality.""" import copy import functools +import re import weakref -from keras.utils import io_utils -from keras.utils import tf_inspect import numpy as np - import tensorflow.compat.v2 as tf + +from keras import initializers +from keras.utils import io_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.utils.get_source_inputs') +@keras_export("keras.utils.get_source_inputs") def get_source_inputs(tensor, layer=None, node_index=None): - """Returns the list of input tensors necessary to compute `tensor`. - - Output will always be a list of tensors - (potentially with 1 element). - - Args: - tensor: The tensor to start from. - layer: Origin layer of the tensor. 
Will be - determined via tensor._keras_history if not provided. - node_index: Origin node index of the tensor. - - Returns: - List of input tensors. - """ - if not hasattr(tensor, '_keras_history'): - return tensor - - if layer is None or node_index: - layer, node_index, _ = tensor._keras_history - if not layer._inbound_nodes: - return [tensor] - else: - node = layer._inbound_nodes[node_index] - if node.is_input: - # Reached an Input layer, stop recursion. - return tf.nest.flatten(node.input_tensors) + """Returns the list of input tensors necessary to compute `tensor`. + + Output will always be a list of tensors + (potentially with 1 element). + + Args: + tensor: The tensor to start from. + layer: Origin layer of the tensor. Will be + determined via tensor._keras_history if not provided. + node_index: Origin node index of the tensor. + + Returns: + List of input tensors. + """ + if not hasattr(tensor, "_keras_history"): + return tensor + + if layer is None or node_index: + layer, node_index, _ = tensor._keras_history + if not layer._inbound_nodes: + return [tensor] else: - source_tensors = [] - for layer, node_index, _, tensor in node.iterate_inbound(): - previous_sources = get_source_inputs(tensor, layer, node_index) - # Avoid input redundancy. - for x in previous_sources: - if all(x is not t for t in source_tensors): - source_tensors.append(x) - return source_tensors - - -def validate_string_arg(input_data, - allowable_strings, - layer_name, - arg_name, - allow_none=False, - allow_callables=False): - """Validates the correctness of a string-based arg.""" - if allow_none and input_data is None: - return - elif allow_callables and callable(input_data): - return - elif isinstance(input_data, str) and input_data in allowable_strings: - return - else: - allowed_args = '`None`, ' if allow_none else '' - allowed_args += 'a `Callable`, ' if allow_callables else '' - allowed_args += 'or one of the following values: %s' % (allowable_strings,) - if allow_callables: - callable_note = ( - f'If restoring a model and `{arg_name}` is a custom callable, ' - 'please ensure the callable is registered as a custom object. ' - 'See https://www.tensorflow.org/guide/keras/save_and_serialize' - '#registering_the_custom_object for details. ') + node = layer._inbound_nodes[node_index] + if node.is_input: + # Reached an Input layer, stop recursion. + return tf.nest.flatten(node.input_tensors) + else: + source_tensors = [] + for layer, node_index, _, tensor in node.iterate_inbound(): + previous_sources = get_source_inputs(tensor, layer, node_index) + # Avoid input redundancy. + for x in previous_sources: + if all(x is not t for t in source_tensors): + source_tensors.append(x) + return source_tensors + + +def validate_string_arg( + input_data, + allowable_strings, + layer_name, + arg_name, + allow_none=False, + allow_callables=False, +): + """Validates the correctness of a string-based arg.""" + if allow_none and input_data is None: + return + elif allow_callables and callable(input_data): + return + elif isinstance(input_data, str) and input_data in allowable_strings: + return else: - callable_note = '' - raise ValueError( - f'Unkown value for `{arg_name}` argument of layer {layer_name}. ' - f'{callable_note}Allowed values are: {allowed_args}. 
Received: '
-        f'{input_data}')
+        allowed_args = "`None`, " if allow_none else ""
+        allowed_args += "a `Callable`, " if allow_callables else ""
+        allowed_args += f"or one of the following values: {allowable_strings}"
+        if allow_callables:
+            callable_note = (
+                f"If restoring a model and `{arg_name}` is a custom callable, "
+                "please ensure the callable is registered as a custom object. "
+                "See https://www.tensorflow.org/guide/keras/save_and_serialize"
+                "#registering_the_custom_object for details. "
+            )
+        else:
+            callable_note = ""
+        raise ValueError(
+            f"Unknown value for `{arg_name}` argument of layer {layer_name}. "
+            f"{callable_note}Allowed values are: {allowed_args}. Received: "
+            f"{input_data}"
+        )
 
 
 def count_params(weights):
-  """Count the total number of scalars composing the weights.
-
-  Args:
-    weights: An iterable containing the weights on which to compute params
-
-  Returns:
-    The total number of scalars composing the weights
-  """
-  unique_weights = {id(w): w for w in weights}.values()
-  # Ignore TrackableWeightHandlers, which will not have a shape defined.
-  unique_weights = [w for w in unique_weights if hasattr(w, 'shape')]
-  weight_shapes = [w.shape.as_list() for w in unique_weights]
-  standardized_weight_shapes = [
-      [0 if w_i is None else w_i for w_i in w] for w in weight_shapes
-  ]
-  return int(sum(np.prod(p) for p in standardized_weight_shapes))
-
-
-def print_summary(model,
-                  line_length=None,
-                  positions=None,
-                  print_fn=None,
-                  expand_nested=False,
-                  show_trainable=False):
-  """Prints a summary of a model.
-
-  Args:
-    model: Keras model instance.
-    line_length: Total length of printed lines
-      (e.g. set this to adapt the display to different
-      terminal window sizes).
-    positions: Relative or absolute positions of log elements in each line.
-      If not provided, defaults to `[.33, .55, .67, 1.]`.
-    print_fn: Print function to use.
-      It will be called on each line of the summary.
-      You can set it to a custom function
-      in order to capture the string summary.
-      It defaults to `print` (prints to stdout).
-    expand_nested: Whether to expand the nested models.
-      If not provided, defaults to `False`.
-    show_trainable: Whether to show if a layer is trainable.
-      If not provided, defaults to `False`.
-  """
-  if print_fn is None:
-    print_fn = io_utils.print_msg
-
-  if model.__class__.__name__ == 'Sequential':
-    sequential_like = True
-  elif not model._is_graph_network:
-    # We treat subclassed models as a simple sequence of layers, for logging
-    # purposes.
-    sequential_like = True
-  else:
-    sequential_like = True
-    nodes_by_depth = model._nodes_by_depth.values()
-    nodes = []
-    for v in nodes_by_depth:
-      if (len(v) > 1) or (len(v) == 1 and
-                          len(tf.nest.flatten(v[0].keras_inputs)) > 1):
-        # if the model has multiple nodes
-        # or if the nodes have multiple inbound_layers
-        # the model is no longer sequential
-        sequential_like = False
-        break
-      nodes += v
-    if sequential_like:
-      # search for shared layers
-      for layer in model.layers:
-        flag = False
-        for node in layer._inbound_nodes:
-          if node in nodes:
-            if flag:
-              sequential_like = False
-              break
-            else:
-              flag = True
-        if not sequential_like:
-          break
-
-  if sequential_like:
-    line_length = line_length or 65
-    positions = positions or [.45, .85, 1.]
-    if positions[-1] <= 1:
-      positions = [int(line_length * p) for p in positions]
-    # header names for the different log elements
-    to_display = ['Layer (type)', 'Output Shape', 'Param #']
-  else:
-    line_length = line_length or 98
-    positions = positions or [.33, .55, .67, 1.]
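A usage sketch of the validator above (the layer and argument names below are purely illustrative):

```
from keras.utils import layer_utils

# Passes silently: the value is in the allowed set.
layer_utils.validate_string_arg(
    "causal",
    allowable_strings={"valid", "same", "causal"},
    layer_name="Conv1D",
    arg_name="padding",
)

# An out-of-set value such as "circular" would instead raise a
# ValueError listing the allowed values.
```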
-    if positions[-1] <= 1:
-      positions = [int(line_length * p) for p in positions]
-    # header names for the different log elements
-    to_display = ['Layer (type)', 'Output Shape', 'Param #', 'Connected to']
-    relevant_nodes = []
-    for v in model._nodes_by_depth.values():
-      relevant_nodes += v
-
-  if show_trainable:
-    line_length += 11
-    positions.append(line_length)
-    to_display.append('Trainable')
-
-  def print_row(fields, positions, nested_level=0):
-    left_to_print = [str(x) for x in fields]
-    while any(left_to_print):
-      line = ''
-      for col in range(len(left_to_print)):
-        if col > 0:
-          start_pos = positions[col - 1]
-        else:
-          start_pos = 0
-        end_pos = positions[col]
-        # Leave room for 2 spaces to delineate columns
-        # we don't need any if we are printing the last column
-        space = 2 if col != len(positions) - 1 else 0
-        cutoff = end_pos - start_pos - space
-        fit_into_line = left_to_print[col][:cutoff]
-        # For nicer formatting we line-break on seeing end of
-        # tuple/dict etc.
-        line_break_conditions = ('),', '},', '],', "',")
-        candidate_cutoffs = [
-            fit_into_line.find(x) + len(x)
-            for x in line_break_conditions
-            if fit_into_line.find(x) >= 0
-        ]
-        if candidate_cutoffs:
-          cutoff = min(candidate_cutoffs)
-          fit_into_line = fit_into_line[:cutoff]
-
-        if col == 0:
-          line += '|' * nested_level + ' '
-        line += fit_into_line
-        line += ' ' * space if space else ''
-        left_to_print[col] = left_to_print[col][cutoff:]
-
-        # Pad out to the next position
-        if nested_level:
-          line += ' ' * (positions[col] - len(line) - nested_level)
-        else:
-          line += ' ' * (positions[col] - len(line))
-      line += '|' * nested_level
-      print_fn(line)
+    """Count the total number of scalars composing the weights.
 
-  print_fn('Model: "{}"'.format(model.name))
-  print_fn('_' * line_length)
-  print_row(to_display, positions)
-  print_fn('=' * line_length)
+    Args:
+        weights: An iterable containing the weights on which to compute params
 
-  def print_layer_summary(layer, nested_level=0):
-    """Prints a summary for a single layer.
+    Returns:
+        The total number of scalars composing the weights
+    """
+    unique_weights = {id(w): w for w in weights}.values()
+    # Ignore TrackableWeightHandlers, which will not have a shape defined.
+    unique_weights = [w for w in unique_weights if hasattr(w, "shape")]
+    weight_shapes = [w.shape.as_list() for w in unique_weights]
+    standardized_weight_shapes = [
+        [0 if w_i is None else w_i for w_i in w] for w in weight_shapes
+    ]
+    return int(sum(np.prod(p) for p in standardized_weight_shapes))
+
+
+def weight_memory_size(weights):
+    """Calculate the memory footprint for weights based on their dtypes.
+
+    Args:
+        weights: An iterable containing the weights whose memory size to
+            compute.
+
+    Returns:
+        The total memory size (in Bytes) of the weights.
+    """
+    unique_weights = {id(w): w for w in weights}.values()
+
+    total_memory_size = 0
+    for w in unique_weights:
+        # Ignore TrackableWeightHandlers, which will not have a shape defined.
+        if not hasattr(w, "shape"):
+            continue
+        elif None in w.shape.as_list():
+            continue
+        weight_shape = np.prod(w.shape.as_list())
+        per_param_size = w.dtype.size
+        total_memory_size += weight_shape * per_param_size
+    return total_memory_size
+
+
+def dtensor_variable_summary(weights):
+    """Group and calculate DTensor based weights memory size.
+
+    Since DTensor weights can be sharded across multiple devices, the result
+    will be grouped by the layout/sharding spec for the variables, so that
+    the accurate per-device memory size can be calculated.
 
     Args:
-      layer: target layer.
-      nested_level: level of nesting of the layer inside its parent layer
-        (e.g. 0 for a top-level layer, 1 for a nested layer).
+        weights: An iterable containing the weights whose memory size to
+            compute.
+
+    Returns:
+        total_weight_count, total_memory_size and per_sharing_spec_result,
+        a dict with the normalized layout spec as key and a tuple of
+        (weight count, weight size) as value.
     """
-    try:
-      output_shape = layer.output_shape
-    except AttributeError:
-      output_shape = 'multiple'
-    except RuntimeError:  # output_shape unknown in Eager mode.
-      output_shape = '?'
-    name = layer.name
-    cls_name = layer.__class__.__name__
-    if not layer.built and not getattr(layer, '_is_graph_network', False):
-      # If a subclassed model has a layer that is not called in Model.call, the
-      # layer will not be built and we cannot call layer.count_params().
-      params = '0 (unused)'
+    unique_weights = {id(w): w for w in weights}.values()
+    total_weight_count = 0
+    total_memory_size = 0
+    per_sharing_spec_result = {}
+    for w in unique_weights:
+        # Ignore TrackableWeightHandlers, which will not have a shape defined.
+        if not hasattr(w, "shape"):
+            continue
+        if not isinstance(w, tf.experimental.dtensor.DVariable):
+            continue
+        layout = w.layout
+        # Remove any duplicated axes and sort the dimension names.
+        # 1D replicated and 2D replicated variables will still be fully
+        # replicated, and [batch, model] sharding will have the same memory
+        # footprint as the [model, batch] layout.
+        reduced_sharding_spec = list(sorted(set(layout.sharding_specs)))
+        if tf.experimental.dtensor.UNSHARDED in reduced_sharding_spec:
+            reduced_sharding_spec.remove(tf.experimental.dtensor.UNSHARDED)
+        reduced_sharding_spec = tuple(reduced_sharding_spec)  # For dict key
+        weight_count, memory_size = per_sharing_spec_result.get(
+            reduced_sharding_spec, (0, 0)
+        )
+        reduced_weight_shape = np.prod(w.shape.as_list())
+        per_param_size = w.dtype.size
+        weight_count += reduced_weight_shape
+        memory_size += reduced_weight_shape * per_param_size
+        per_sharing_spec_result[reduced_sharding_spec] = (
+            weight_count,
+            memory_size,
+        )
+        total_weight_count += reduced_weight_shape
+        total_memory_size += reduced_weight_shape * per_param_size
+    return total_weight_count, total_memory_size, per_sharing_spec_result
+
+
+def print_dtensor_variable_summary(model, print_fn, line_length):
+    if getattr(model, "_layout_map", None) is not None:
+        mesh = model._layout_map.get_default_mesh()
+    elif hasattr(model, "distribute_strategy") and hasattr(
+        model.distribute_strategy, "_mesh"
+    ):
+        mesh = model.distribute_strategy._mesh
     else:
-      params = layer.count_params()
-    fields = [name + ' (' + cls_name + ')', output_shape, params]
+        # Not running with DTensor
+        mesh = None
-    if show_trainable:
-      fields.append('Y' if layer.trainable else 'N')
+    if mesh:
+        (
+            total_weight_count,
+            total_memory_size,
+            per_sharing_spec_result,
+        ) = dtensor_variable_summary(model.weights)
+        total_per_device_memory_size = 0
+        for sharding_spec in sorted(per_sharing_spec_result.keys()):
+            count, memory_size = per_sharing_spec_result[sharding_spec]
+            if len(sharding_spec) == 0:
+                print_fn(
+                    f"{count} / {total_weight_count} params "
+                    f"({readable_memory_size(memory_size)}) "
+                    "are fully replicated"
+                )
+                per_device_size = memory_size
+            else:
+                sharding_factor = np.prod(
+                    [mesh.dim_size(s) for s in sharding_spec]
+                )
+                per_device_size = memory_size / sharding_factor
+                print_fn(
+                    f"{count} / {total_weight_count} params "
+                    f"({readable_memory_size(memory_size)}) are sharded based "
+                    f"on spec '{sharding_spec}' and across {sharding_factor} "
+                    f"devices."
+                )
+                total_per_device_memory_size += per_device_size
+            print_fn(
+                "Overall per device memory usage: "
+                f"{readable_memory_size(total_per_device_memory_size)}"
+            )
+            print_fn(
+                "Overall sharding factor: {:.2f}".format(
+                    total_memory_size / total_per_device_memory_size
+                )
+            )
+            print_fn("_" * line_length)
+
+
+def readable_memory_size(weight_memory_size):
+    """Convert the weight memory size (Bytes) to a readable string."""
+    units = ["Byte", "KB", "MB", "GB", "TB", "PB"]
+    scale = 1024
+    for unit in units:
+        if weight_memory_size / scale < 1:
+            return "{:.2f} {}".format(weight_memory_size, unit)
+        else:
+            weight_memory_size /= scale
+    return "{:.2f} {}".format(weight_memory_size, units[-1])
+
+
+def get_layer_index_bound_by_layer_name(model, layer_range=None):
+    """Get the layer indexes from the model based on layer names.
+
+    The layer indexes can be used to slice the model into sub models for
+    display.
 
     Args:
-      layer: target layer.
-      nested_level: level of nesting of the layer inside its parent layer
-        (e.g. 0 for a top-level layer, 1 for a nested layer).
+        model: `tf.keras.Model` instance.
+        layer_range: a list or tuple of 2 strings, the starting layer name and
+            ending layer name (both inclusive) for the result. All layers will
+            be included when `None` is provided.
+
+    Returns:
+        The index bounds of the layers matched by `layer_range`, as
+        [first_layer_index, last_layer_index + 1].
     """
-    try:
-      output_shape = layer.output_shape
-    except AttributeError:
-      output_shape = 'multiple'
-    connections = []
-    for node in layer._inbound_nodes:
-      if relevant_nodes and node not in relevant_nodes:
-        # node is not part of the current network
-        continue
-
-      for inbound_layer, node_index, tensor_index, _ in node.iterate_inbound():
-        connections.append('{}[{}][{}]'.format(inbound_layer.name, node_index,
-                                               tensor_index))
-
-    name = layer.name
-    cls_name = layer.__class__.__name__
-    fields = [
-        name + ' (' + cls_name + ')', output_shape,
-        layer.count_params(), connections
+    if layer_range is not None:
+        if len(layer_range) != 2:
+            raise ValueError(
+                "layer_range must be a list or tuple of length 2. Received: "
+                f"layer_range = {layer_range} of length {len(layer_range)}"
+            )
+        if not isinstance(layer_range[0], str) or not isinstance(
+            layer_range[1], str
+        ):
+            raise ValueError(
+                "layer_range should contain string type only. "
+                f"Received: {layer_range}"
+            )
+    else:
+        return [0, len(model.layers)]
+
+    lower_index = [
+        idx
+        for idx, layer in enumerate(model.layers)
+        if re.match(layer_range[0], layer.name)
+    ]
+    upper_index = [
+        idx
+        for idx, layer in enumerate(model.layers)
+        if re.match(layer_range[1], layer.name)
     ]
 
-    if show_trainable:
-      fields.append('Y' if layer.trainable else 'N')
+    if not lower_index or not upper_index:
+        raise ValueError(
+            "Passed layer_names do not match the layer names in the model. "
+            f"Received: {layer_range}"
+        )
+
+    if min(lower_index) > max(upper_index):
+        return [min(upper_index), max(lower_index) + 1]
+    return [min(lower_index), max(upper_index) + 1]
+
 
-    print_row(fields, positions, nested_level)
 
+def print_summary(
+    model,
+    line_length=None,
+    positions=None,
+    print_fn=None,
+    expand_nested=False,
+    show_trainable=False,
+    layer_range=None,
+):
+    """Prints a summary of a model.
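Taken together, `weight_memory_size` and `readable_memory_size` turn a variable list into the human-readable totals printed in the summary footer: element count times `dtype.size`, then repeated division by 1024. A quick sketch of the arithmetic (eager TF assumed; the variable shape is illustrative):

```
import tensorflow.compat.v2 as tf
from keras.utils import layer_utils

w = tf.Variable(tf.zeros([512, 512], dtype=tf.float32))
size = layer_utils.weight_memory_size([w])
print(size)                                    # 512 * 512 * 4 = 1048576
print(layer_utils.readable_memory_size(size))  # "1.00 MB"
```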
+ + Args: + model: Keras model instance. + line_length: Total length of printed lines + (e.g. set this to adapt the display to different + terminal window sizes). + positions: Relative or absolute positions of log elements in each line. + If not provided, defaults to `[0.3, 0.6, 0.70, 1.]`. + print_fn: Print function to use. + It will be called on each line of the summary. + You can set it to a custom function + in order to capture the string summary. + When `None`, uses `print` (prints to stdout). + Defaults to `None`. + expand_nested: Whether to expand the nested models. + Defaults to `False`. + show_trainable: Whether to show if a layer is trainable. + Defaults to `False`. + layer_range: List or tuple containing two strings, + the starting layer name and ending layer name (both inclusive), + indicating the range of layers to be printed in the summary. The + strings could also be regexes instead of an exact name. In this + case, the starting layer will be the first layer that matches + `layer_range[0]` and the ending layer will be the last element that + matches `layer_range[1]`. By default (`None`) all + layers in the model are included in the summary. + """ + if print_fn is None: + print_fn = io_utils.print_msg + + if model.__class__.__name__ == "Sequential": + sequential_like = True + elif not model._is_graph_network: + # We treat subclassed models as a simple sequence of layers, for logging + # purposes. + sequential_like = True + else: + sequential_like = True + nodes_by_depth = model._nodes_by_depth.values() + nodes = [] + for v in nodes_by_depth: + if (len(v) > 1) or ( + len(v) == 1 and len(tf.nest.flatten(v[0].keras_inputs)) > 1 + ): + # if the model has multiple nodes + # or if the nodes have multiple inbound_layers + # the model is no longer sequential + sequential_like = False + break + nodes += v + if sequential_like: + # search for shared layers + for layer in model.layers: + flag = False + for node in layer._inbound_nodes: + if node in nodes: + if flag: + sequential_like = False + break + else: + flag = True + if not sequential_like: + break - def print_layer(layer, nested_level=0, is_nested_last=False): if sequential_like: - print_layer_summary(layer, nested_level) + line_length = line_length or 65 + positions = positions or [0.45, 0.85, 1.0] + if positions[-1] <= 1: + positions = [int(line_length * p) for p in positions] + # header names for the different log elements + to_display = ["Layer (type)", "Output Shape", "Param #"] else: - print_layer_summary_with_connections(layer, nested_level) - - if expand_nested and hasattr(layer, 'layers') and layer.layers: - print_fn('|' * (nested_level + 1) + '¯' * - (line_length - 2 * nested_level - 2) + '|' * (nested_level + 1)) - - nested_layer = layer.layers - is_nested_last = False - for i in range(len(nested_layer)): - if i == len(nested_layer) - 1: - is_nested_last = True - print_layer(nested_layer[i], nested_level + 1, is_nested_last) - - print_fn('|' * nested_level + '¯' * (line_length - 2 * nested_level) + - '|' * nested_level) - - if not is_nested_last: - print_fn('|' * nested_level + ' ' * (line_length - 2 * nested_level) + - '|' * nested_level) - - layers = model.layers - for layer in layers: - print_layer(layer) - print_fn('=' * line_length) - - if hasattr(model, '_collected_trainable_weights'): - trainable_count = count_params(model._collected_trainable_weights) - else: - trainable_count = count_params(model.trainable_weights) - - non_trainable_count = count_params(model.non_trainable_weights) - - print_fn('Total params: 
{:,}'.format(trainable_count + non_trainable_count)) - print_fn('Trainable params: {:,}'.format(trainable_count)) - print_fn('Non-trainable params: {:,}'.format(non_trainable_count)) - print_fn('_' * line_length) - - -def convert_dense_weights_data_format(dense, - previous_feature_map_shape, - target_data_format='channels_first'): - """Utility useful when changing a convnet's `data_format`. - - When porting the weights of a convnet from one data format to the other, - if the convnet includes a `Flatten` layer - (applied to the last convolutional feature map) - followed by a `Dense` layer, the weights of that `Dense` layer - should be updated to reflect the new dimension ordering. - - Args: - dense: The target `Dense` layer. - previous_feature_map_shape: A shape tuple of 3 integers, - e.g. `(512, 7, 7)`. The shape of the convolutional - feature map right before the `Flatten` layer that - came before the target `Dense` layer. - target_data_format: One of "channels_last", "channels_first". - Set it "channels_last" - if converting a "channels_first" model to "channels_last", - or reciprocally. - """ - assert target_data_format in {'channels_last', 'channels_first'} - kernel, bias = dense.get_weights() - for i in range(kernel.shape[1]): - if target_data_format == 'channels_first': - c, h, w = previous_feature_map_shape - original_fm_shape = (h, w, c) - ki = kernel[:, i].reshape(original_fm_shape) - ki = np.transpose(ki, (2, 0, 1)) # last -> first + line_length = line_length or 98 + positions = positions or [0.3, 0.6, 0.70, 1.0] + if positions[-1] <= 1: + positions = [int(line_length * p) for p in positions] + # header names for the different log elements + to_display = ["Layer (type)", "Output Shape", "Param #", "Connected to"] + relevant_nodes = [] + for v in model._nodes_by_depth.values(): + relevant_nodes += v + + if show_trainable: + line_length += 11 + positions.append(line_length) + to_display.append("Trainable") + + layer_range = get_layer_index_bound_by_layer_name(model, layer_range) + + def print_row(fields, positions, nested_level=0): + left_to_print = [str(x) for x in fields] + while any(left_to_print): + line = "" + for col in range(len(left_to_print)): + if col > 0: + start_pos = positions[col - 1] + else: + start_pos = 0 + end_pos = positions[col] + # Leave room for 2 spaces to delineate columns + # we don't need any if we are printing the last column + space = 2 if col != len(positions) - 1 else 0 + cutoff = end_pos - start_pos - space + # Except for last col, offset by one to align the start of col + if col != len(positions) - 1: + cutoff -= 1 + if col == 0: + cutoff -= nested_level + fit_into_line = left_to_print[col][:cutoff] + # For nicer formatting we line-break on seeing end of + # tuple/dict etc. 
+ line_break_conditions = ("),", "},", "],", "',") + candidate_cutoffs = [ + fit_into_line.find(x) + len(x) + for x in line_break_conditions + if fit_into_line.find(x) >= 0 + ] + if candidate_cutoffs: + cutoff = min(candidate_cutoffs) + fit_into_line = fit_into_line[:cutoff] + + if col == 0: + line += "|" * nested_level + " " + line += fit_into_line + line += " " * space if space else "" + left_to_print[col] = left_to_print[col][cutoff:] + + # Pad out to the next position + # Make space for nested_level for last column + if nested_level and col == len(positions) - 1: + line += " " * (positions[col] - len(line) - nested_level) + else: + line += " " * (positions[col] - len(line)) + line += "|" * nested_level + print_fn(line) + + print_fn(f'Model: "{model.name}"') + print_fn("_" * line_length) + print_row(to_display, positions) + print_fn("=" * line_length) + + def print_layer_summary(layer, nested_level=0): + """Prints a summary for a single layer. + + Args: + layer: target layer. + nested_level: level of nesting of the layer inside its parent layer + (e.g. 0 for a top-level layer, 1 for a nested layer). + """ + try: + output_shape = layer.output_shape + except AttributeError: + output_shape = "multiple" + except RuntimeError: # output_shape unknown in Eager mode. + output_shape = "?" + name = layer.name + cls_name = layer.__class__.__name__ + if not layer.built and not getattr(layer, "_is_graph_network", False): + # If a subclassed model has a layer that is not called in + # Model.call, the layer will not be built and we cannot call + # layer.count_params(). + params = "0 (unused)" + else: + params = layer.count_params() + fields = [name + " (" + cls_name + ")", output_shape, params] + + if show_trainable: + fields.append("Y" if layer.trainable else "N") + + print_row(fields, positions, nested_level) + + def print_layer_summary_with_connections(layer, nested_level=0): + """Prints a summary for a single layer (including its connections). + + Args: + layer: target layer. + nested_level: level of nesting of the layer inside its parent layer + (e.g. 0 for a top-level layer, 1 for a nested layer). 
+ """ + try: + output_shape = layer.output_shape + except AttributeError: + output_shape = "multiple" + connections = [] + for node in layer._inbound_nodes: + if relevant_nodes and node not in relevant_nodes: + # node is not part of the current network + continue + + for ( + inbound_layer, + node_index, + tensor_index, + _, + ) in node.iterate_inbound(): + connections.append( + f"{inbound_layer.name}[{node_index}][{tensor_index}]" + ) + + name = layer.name + cls_name = layer.__class__.__name__ + fields = [ + name + " (" + cls_name + ")", + output_shape, + layer.count_params(), + connections, + ] + + if show_trainable: + fields.append("Y" if layer.trainable else "N") + + print_row(fields, positions, nested_level) + + def print_layer(layer, nested_level=0, is_nested_last=False): + if sequential_like: + print_layer_summary(layer, nested_level) + else: + print_layer_summary_with_connections(layer, nested_level) + + if expand_nested and hasattr(layer, "layers") and layer.layers: + print_fn( + "|" * (nested_level + 1) + + "¯" * (line_length - 2 * nested_level - 2) + + "|" * (nested_level + 1) + ) + + nested_layer = layer.layers + is_nested_last = False + for i in range(len(nested_layer)): + if i == len(nested_layer) - 1: + is_nested_last = True + print_layer(nested_layer[i], nested_level + 1, is_nested_last) + + print_fn( + "|" * nested_level + + "¯" * (line_length - 2 * nested_level) + + "|" * nested_level + ) + + if not is_nested_last: + print_fn( + "|" * nested_level + + " " * (line_length - 2 * nested_level) + + "|" * nested_level + ) + + for layer in model.layers[layer_range[0] : layer_range[1]]: + print_layer(layer) + print_fn("=" * line_length) + + if hasattr(model, "_collected_trainable_weights"): + trainable_count = count_params(model._collected_trainable_weights) + trainable_memory_size = weight_memory_size( + model._collected_trainable_weights + ) else: - h, w, c = previous_feature_map_shape - original_fm_shape = (c, h, w) - ki = kernel[:, i].reshape(original_fm_shape) - ki = np.transpose(ki, (1, 2, 0)) # first -> last - kernel[:, i] = np.reshape(ki, (np.prod(previous_feature_map_shape),)) - dense.set_weights([kernel, bias]) + trainable_count = count_params(model.trainable_weights) + trainable_memory_size = weight_memory_size(model.trainable_weights) + + non_trainable_count = count_params(model.non_trainable_weights) + non_trainable_memory_size = weight_memory_size(model.non_trainable_weights) + + total_memory_size = trainable_memory_size + non_trainable_memory_size + + print_fn( + f"Total params: {trainable_count + non_trainable_count} " + f"({readable_memory_size(total_memory_size)})" + ) + print_fn( + f"Trainable params: {trainable_count} " + f"({readable_memory_size(trainable_memory_size)})" + ) + print_fn( + f"Non-trainable params: {non_trainable_count} " + f"({readable_memory_size(non_trainable_memory_size)})" + ) + print_fn("_" * line_length) + + print_dtensor_variable_summary(model, print_fn, line_length) + + +def convert_dense_weights_data_format( + dense, previous_feature_map_shape, target_data_format="channels_first" +): + """Utility useful when changing a convnet's `data_format`. + + When porting the weights of a convnet from one data format to the other, + if the convnet includes a `Flatten` layer + (applied to the last convolutional feature map) + followed by a `Dense` layer, the weights of that `Dense` layer + should be updated to reflect the new dimension ordering. + + Args: + dense: The target `Dense` layer. 
+ previous_feature_map_shape: A shape tuple of 3 integers, + e.g. `(512, 7, 7)`. The shape of the convolutional + feature map right before the `Flatten` layer that + came before the target `Dense` layer. + target_data_format: One of "channels_last", "channels_first". + Set it "channels_last" + if converting a "channels_first" model to "channels_last", + or reciprocally. + """ + assert target_data_format in {"channels_last", "channels_first"} + kernel, bias = dense.get_weights() + for i in range(kernel.shape[1]): + if target_data_format == "channels_first": + c, h, w = previous_feature_map_shape + original_fm_shape = (h, w, c) + ki = kernel[:, i].reshape(original_fm_shape) + ki = np.transpose(ki, (2, 0, 1)) # last -> first + else: + h, w, c = previous_feature_map_shape + original_fm_shape = (c, h, w) + ki = kernel[:, i].reshape(original_fm_shape) + ki = np.transpose(ki, (1, 2, 0)) # first -> last + kernel[:, i] = np.reshape(ki, (np.prod(previous_feature_map_shape),)) + dense.set_weights([kernel, bias]) def is_builtin_layer(layer): - if not getattr(layer, '_keras_api_names', None): - return False + if not getattr(layer, "_keras_api_names", None): + return False - # Subclasses of `Layer` that are not exported inherit the export name - # of the base layer class. - return (layer._keras_api_names != ('keras.layers.Layer',) and - layer._keras_api_names_v1 != ('keras.layers.Layer',)) + # Subclasses of `Layer` that are not exported inherit the export name + # of the base layer class. + return layer._keras_api_names != ( + "keras.layers.Layer", + ) and layer._keras_api_names_v1 != ("keras.layers.Layer",) def cached_per_instance(f): - """Lightweight decorator for caching lazily constructed properties. - - When to use: - This decorator provides simple caching with minimal overhead. It is designed - for properties which are expensive to compute and static over the life of a - class instance, and provides no mechanism for cache invalidation. Thus it is - best suited for lazily exposing derived properties of other static data. + """Lightweight decorator for caching lazily constructed properties. + + When to use: + This decorator provides simple caching with minimal overhead. It is designed + for properties which are expensive to compute and static over the life of a + class instance, and provides no mechanism for cache invalidation. Thus it is + best suited for lazily exposing derived properties of other static data. + + For classes with custom getattr / setattr behavior (such as trackable + objects), storing cache results as object attributes is not performant. + Instead, a specialized cache can significantly reduce property lookup + overhead. (While still allowing the decorated property to be lazily + computed.) Consider the following class: + + ``` + class MyClass: + def __setattr__(self, key, value): + # Some expensive class specific code + # ... + # ... + + super(MyClass, self).__setattr__(key, value) + + @property + def thing(self): + # `thing` is expensive to compute (and may not even be requested), so we + # want to lazily compute it and then cache it. + output = getattr(self, '_thing', None) + if output is None: + self._thing = output = compute_thing(self) + return output + ``` - For classes with custom getattr / setattr behavior (such as trackable - objects), storing cache results as object attributes is not performant. - Instead, a specialized cache can significantly reduce property lookup - overhead. (While still allowing the decorated property to be lazily computed.) 
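The per-column reshape/transpose in `convert_dense_weights_data_format` is easiest to see on a toy feature map. A NumPy sketch of the `channels_first` direction (shapes are illustrative):

```
import numpy as np

c, h, w = 2, 2, 2                      # previous_feature_map_shape
col = np.arange(c * h * w)             # one column of the Dense kernel
ki = col.reshape((h, w, c))            # stored in channels_last order
ki = np.transpose(ki, (2, 0, 1))       # reorder to channels_first
col_converted = ki.reshape(c * h * w)  # written back into the kernel
```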
- Consider the following class: + It's also worth noting that ANY overriding of __setattr__, even something as + simple as: + ``` + def __setattr__(self, key, value): + super(MyClass, self).__setattr__(key, value) + ``` - ``` - class MyClass: - def __setattr__(self, key, value): - # Some expensive class specific code - # ... - # ... + Slows down attribute assignment by nearly 10x. - super(MyClass, self).__setattr__(key, value) + By contrast, replacing the definition of `thing` with the following + sidesteps the expensive __setattr__ altogether: + ''' @property + @tracking.cached_per_instance def thing(self): # `thing` is expensive to compute (and may not even be requested), so we # want to lazily compute it and then cache it. - output = getattr(self, '_thing', None) - if output is None: - self._thing = output = compute_thing(self) - return output - ``` - - It's also worth noting that ANY overriding of __setattr__, even something as - simple as: - ``` - def __setattr__(self, key, value): - super(MyClass, self).__setattr__(key, value) - ``` - - Slows down attribute assignment by nearly 10x. - - By contrast, replacing the definition of `thing` with the following sidesteps - the expensive __setattr__ altogether: - - ''' - @property - @tracking.cached_per_instance - def thing(self): - # `thing` is expensive to compute (and may not even be requested), so we - # want to lazily compute it and then cache it. - return compute_thing(self) - ''' - - Performance: - The overhead for this decorator is ~0.4 us / call. A much lower overhead - implementation (~0.085 us / call) can be achieved by using a custom dict type: - - ``` - def dict_based_cache(f): - class Cache(dict): - __slots__ = () - def __missing__(self, key): - self[key] = output = f(key) - return output - - return property(Cache().__getitem__) - ``` + return compute_thing(self) + ''' + + Performance: + The overhead for this decorator is ~0.4 us / call. A much lower overhead + implementation (~0.085 us / call) can be achieved by using a custom dict + type: + + ``` + def dict_based_cache(f): + class Cache(dict): + __slots__ = () + def __missing__(self, key): + self[key] = output = f(key) + return output + + return property(Cache().__getitem__) + ``` + + However, that implementation holds class instances as keys, and as a result + blocks garbage collection. (And modifying it to use weakref's as keys raises + the lookup overhead to ~0.4 us) As a result, the WeakKeyDictionary + implementation below turns out to be more prudent. - However, that implementation holds class instances as keys, and as a result - blocks garbage collection. (And modifying it to use weakref's as keys raises - the lookup overhead to ~0.4 us) As a result, the WeakKeyDictionary - implementation below turns out to be more prudent. - - Args: - f: The function to cache. + Args: + f: The function to cache. - Returns: - f decorated with simple caching behavior. - """ + Returns: + f decorated with simple caching behavior. 
+ """ - cache = weakref.WeakKeyDictionary() + cache = weakref.WeakKeyDictionary() - @functools.wraps(f) - def wrapped(item): - output = cache.get(item) - if output is None: - cache[item] = output = f(item) - return output + @functools.wraps(f) + def wrapped(item): + output = cache.get(item) + if output is None: + cache[item] = output = f(item) + return output - wrapped.cache = cache - return wrapped + wrapped.cache = cache + return wrapped def filter_empty_layer_containers(layer_list): - """Filter out empty Layer-like containers and uniquify.""" - # TODO(b/130381733): Make this an attribute in base_layer.Layer. - existing = set() - to_visit = layer_list[::-1] - while to_visit: - obj = to_visit.pop() - if id(obj) in existing: - continue - existing.add(id(obj)) - if hasattr(obj, '_is_layer') and not isinstance(obj, type): - yield obj - else: - sub_layers = getattr(obj, 'layers', None) or [] + """Filter out empty Layer-like containers and uniquify.""" + # TODO(b/130381733): Make this an attribute in base_layer.Layer. + existing = set() + to_visit = layer_list[::-1] + while to_visit: + obj = to_visit.pop() + if id(obj) in existing: + continue + existing.add(id(obj)) + if hasattr(obj, "_is_layer") and not isinstance(obj, type): + yield obj + else: + sub_layers = getattr(obj, "layers", None) or [] - # Trackable data structures will not show up in ".layers" lists, but - # the layers they contain will. - to_visit.extend(sub_layers[::-1]) + # Trackable data structures will not show up in ".layers" lists, but + # the layers they contain will. + to_visit.extend(sub_layers[::-1]) class CallFunctionSpec: - """Caches the spec and provides utilities for handling call function args.""" + """Caches the spec and provides utilities for handling call function + args.""" + + def __init__(self, full_argspec): + """Initialies a `CallFunctionSpec`. + + Args: + full_argspec: the FullArgSpec of a call function of a layer. + """ + self._full_argspec = full_argspec + + self._arg_names = list(self._full_argspec.args) + # Scrub `self` that appears if a decorator was applied. + if self._arg_names and self._arg_names[0] == "self": + self._arg_names = self._arg_names[1:] + self._arg_names += self._full_argspec.kwonlyargs or [] + + call_accepts_kwargs = self._full_argspec.varkw is not None + self._expects_training_arg = ( + "training" in self._arg_names or call_accepts_kwargs + ) + self._expects_mask_arg = ( + "mask" in self._arg_names or call_accepts_kwargs + ) + + call_fn_defaults = self._full_argspec.defaults or [] + defaults = dict() + # The call arg defaults are an n-tuple of the last n elements of the + # args list. (n = # of elements that have a default argument) + for i in range(-1 * len(call_fn_defaults), 0): + defaults[self._arg_names[i]] = call_fn_defaults[i] + # The default training arg will be any (non-None) default specified in + # the method signature, or None if no value is specified. + defaults.update(self._full_argspec.kwonlydefaults or {}) + self._default_training_arg = defaults.get("training") - def __init__(self, full_argspec): - """Initialies a `CallFunctionSpec`. + @property + def full_argspec(self): + """Returns the FullArgSpec of the call function.""" + return self._full_argspec - Args: - full_argspec: the FullArgSpec of a call function of a layer. - """ - self._full_argspec = full_argspec - - self._arg_names = list(self._full_argspec.args) - # Scrub `self` that appears if a decorator was applied. 
- if self._arg_names and self._arg_names[0] == 'self': - self._arg_names = self._arg_names[1:] - self._arg_names += self._full_argspec.kwonlyargs or [] - - call_accepts_kwargs = self._full_argspec.varkw is not None - self._expects_training_arg = ('training' in self._arg_names or - call_accepts_kwargs) - self._expects_mask_arg = 'mask' in self._arg_names or call_accepts_kwargs - - call_fn_defaults = self._full_argspec.defaults or [] - defaults = dict() - # The call arg defaults are an n-tuple of the last n elements of the args - # list. (n = # of elements that have a default argument) - for i in range(-1 * len(call_fn_defaults), 0): - defaults[self._arg_names[i]] = call_fn_defaults[i] - # The default training arg will be any (non-None) default specified in the - # method signature, or None if no value is specified. - defaults.update(self._full_argspec.kwonlydefaults or {}) - self._default_training_arg = defaults.get('training') - - @property - def full_argspec(self): - """Returns the FullArgSpec of the call function.""" - return self._full_argspec - - @property - def arg_names(self): - """List of names of args and kwonlyargs.""" - # `arg_names` is not accurate if the layer has variable positional args. - return self._arg_names - - @arg_names.setter - def arg_names(self, value): - self._arg_names = value - - @property - @cached_per_instance - def arg_positions(self): - """Returns a dict mapping arg names to their index positions.""" - # `arg_positions` is not accurate if the layer has variable positional args. - call_fn_arg_positions = dict() - for pos, arg in enumerate(self._arg_names): - call_fn_arg_positions[arg] = pos - return call_fn_arg_positions - - @property - def expects_training_arg(self): - """Whether the call function uses 'training' as a parameter.""" - return self._expects_training_arg - - @expects_training_arg.setter - def expects_training_arg(self, value): - self._expects_training_arg = value - - @property - def expects_mask_arg(self): - """Whether the call function uses `mask` as a parameter.""" - return self._expects_mask_arg - - @expects_mask_arg.setter - def expects_mask_arg(self, value): - self._expects_mask_arg = value - - @property - def default_training_arg(self): - """The default value given to the "training" argument.""" - return self._default_training_arg - - def arg_was_passed(self, arg_name, args, kwargs, inputs_in_args=False): - """Returns true if argument is present in `args` or `kwargs`. + @property + def arg_names(self): + """List of names of args and kwonlyargs.""" + # `arg_names` is not accurate if the layer has variable positional args. + return self._arg_names - Args: - arg_name: String name of the argument to find. - args: Tuple of args passed to the call function. - kwargs: Dictionary of kwargs passed to the call function. - inputs_in_args: Whether the input argument (the first argument in the call - function) is included in `args`. Defaults to `False`. + @arg_names.setter + def arg_names(self, value): + self._arg_names = value - Returns: - True if argument with `arg_name` is present in `args` or `kwargs`. - """ - # Performance optimization: do no work in most common case. - if not args and not kwargs: - return False + @property + @cached_per_instance + def arg_positions(self): + """Returns a dict mapping arg names to their index positions.""" + # `arg_positions` is not accurate if the layer has variable positional + # args. 
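+        # Built lazily and cached per instance (see `cached_per_instance`
+        # above); note the cached mapping is not refreshed if `arg_names`
+        # is later reassigned.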
+ call_fn_arg_positions = dict() + for pos, arg in enumerate(self._arg_names): + call_fn_arg_positions[arg] = pos + return call_fn_arg_positions - if arg_name in kwargs: - return True - call_fn_args = self._arg_names - if not inputs_in_args: - # Ignore `inputs` arg. - call_fn_args = call_fn_args[1:] - return arg_name in dict(zip(call_fn_args, args)) + @property + def expects_training_arg(self): + """Whether the call function uses 'training' as a parameter.""" + return self._expects_training_arg - def get_arg_value(self, arg_name, args, kwargs, inputs_in_args=False): - """Retrieves the value for the argument with name `arg_name`. + @expects_training_arg.setter + def expects_training_arg(self, value): + self._expects_training_arg = value - Args: - arg_name: String name of the argument to find. - args: Tuple of args passed to the call function. - kwargs: Dictionary of kwargs passed to the call function. - inputs_in_args: Whether the input argument (the first argument in the call - function) is included in `args`. Defaults to `False`. + @property + def expects_mask_arg(self): + """Whether the call function uses `mask` as a parameter.""" + return self._expects_mask_arg - Returns: - The value of the argument with name `arg_name`, extracted from `args` or - `kwargs`. + @expects_mask_arg.setter + def expects_mask_arg(self, value): + self._expects_mask_arg = value - Raises: - KeyError if the value of `arg_name` cannot be found. - """ - if arg_name in kwargs: - return kwargs[arg_name] - call_fn_args = self._arg_names - if not inputs_in_args: - # Ignore `inputs` arg. - call_fn_args = call_fn_args[1:] - args_dict = dict(zip(call_fn_args, args)) - return args_dict[arg_name] - - def set_arg_value(self, - arg_name, - new_value, - args, - kwargs, - inputs_in_args=False, - pop_kwarg_if_none=False): - """Sets the value of an argument into the given args/kwargs. + @property + def default_training_arg(self): + """The default value given to the "training" argument.""" + return self._default_training_arg + + def arg_was_passed(self, arg_name, args, kwargs, inputs_in_args=False): + """Returns true if argument is present in `args` or `kwargs`. + + Args: + arg_name: String name of the argument to find. + args: Tuple of args passed to the call function. + kwargs: Dictionary of kwargs passed to the call function. + inputs_in_args: Whether the input argument (the first argument in the + call function) is included in `args`. Defaults to `False`. + + Returns: + True if argument with `arg_name` is present in `args` or `kwargs`. + """ + # Performance optimization: do no work in most common case. + if not args and not kwargs: + return False + + if arg_name in kwargs: + return True + call_fn_args = self._arg_names + if not inputs_in_args: + # Ignore `inputs` arg. + call_fn_args = call_fn_args[1:] + return arg_name in dict(zip(call_fn_args, args)) + + def get_arg_value(self, arg_name, args, kwargs, inputs_in_args=False): + """Retrieves the value for the argument with name `arg_name`. + + Args: + arg_name: String name of the argument to find. + args: Tuple of args passed to the call function. + kwargs: Dictionary of kwargs passed to the call function. + inputs_in_args: Whether the input argument (the first argument in the + call function) is included in `args`. Defaults to `False`. + + Returns: + The value of the argument with name `arg_name`, extracted from `args` + or `kwargs`. + + Raises: + KeyError if the value of `arg_name` cannot be found. 
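+
+        For example (an illustrative sketch; `MyLayer` stands in for any
+        layer whose call signature is `call(self, inputs, training=None)`):
+
+        ```
+        import inspect
+
+        spec = CallFunctionSpec(inspect.getfullargspec(MyLayer.call))
+        spec.arg_was_passed("training", args=(), kwargs={"training": True})
+        # -> True
+        spec.get_arg_value("training", args=(), kwargs={"training": True})
+        # -> True
+        ```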
+ """ + if arg_name in kwargs: + return kwargs[arg_name] + call_fn_args = self._arg_names + if not inputs_in_args: + # Ignore `inputs` arg. + call_fn_args = call_fn_args[1:] + args_dict = dict(zip(call_fn_args, args)) + return args_dict[arg_name] + + def set_arg_value( + self, + arg_name, + new_value, + args, + kwargs, + inputs_in_args=False, + pop_kwarg_if_none=False, + ): + """Sets the value of an argument into the given args/kwargs. + + Args: + arg_name: String name of the argument to find. + new_value: New value to give to the argument. + args: Tuple of args passed to the call function. + kwargs: Dictionary of kwargs passed to the call function. + inputs_in_args: Whether the input argument (the first argument in the + call function) is included in `args`. Defaults to `False`. + pop_kwarg_if_none: If the new value is `None`, and this is `True`, + then the argument is deleted from `kwargs`. + + Returns: + The updated `(args, kwargs)`. + """ + if self.full_argspec.varargs: + try: + arg_pos = self.full_argspec.args.index(arg_name) + if self.full_argspec.args[0] == "self": + arg_pos -= 1 + except ValueError: + arg_pos = None + else: + arg_pos = self.arg_positions.get(arg_name, None) + + if arg_pos is not None: + if not inputs_in_args: + # Ignore `inputs` arg. + arg_pos = arg_pos - 1 + if len(args) > arg_pos: + args = list(args) + args[arg_pos] = new_value + return tuple(args), kwargs + if new_value is None and pop_kwarg_if_none: + kwargs.pop(arg_name, None) + else: + kwargs[arg_name] = new_value + return args, kwargs + + def split_out_first_arg(self, args, kwargs): + """Splits (args, kwargs) into (inputs, args, kwargs).""" + # Grab the argument corresponding to the first argument in the + # layer's `call` method spec. This will either be the first positional + # argument, or it will be provided as a keyword argument. + if args: + inputs = args[0] + args = args[1:] + elif self._arg_names[0] in kwargs: + kwargs = copy.copy(kwargs) + inputs = kwargs.pop(self._arg_names[0]) + else: + raise ValueError( + "The first argument to `Layer.call` must always be passed." + ) + return inputs, args, kwargs + + +@keras_export("keras.utils.warmstart_embedding_matrix") +def warmstart_embedding_matrix( + base_vocabulary, + new_vocabulary, + base_embeddings, + new_embeddings_initializer="uniform", +): + """Warm start embedding matrix with changing vocab. + + This util can be used to warmstart the embedding layer matrix when + vocabulary changes between previously saved checkpoint and model. + Vocabulary change could mean, the size of the new vocab is different or the + vocabulary is reshuffled or new vocabulary has been added to old vocabulary. + If the vocabulary size changes, size of the embedding layer matrix also + changes. This util remaps the old vocabulary embeddings to the new embedding + layer matrix. + + Example: + Here is an example that demonstrates how to use the + `warmstart_embedding_matrix` util. + >>> import keras + >>> vocab_base = tf.convert_to_tensor(["unk", "a", "b", "c"]) + >>> vocab_new = tf.convert_to_tensor( + ... ["unk", "unk", "a", "b", "c", "d", "e"]) + >>> vectorized_vocab_base = np.random.rand(vocab_base.shape[0], 3) + >>> vectorized_vocab_new = np.random.rand(vocab_new.shape[0], 3) + >>> warmstarted_embedding_matrix = warmstart_embedding_matrix( + ... base_vocabulary=vocab_base, + ... new_vocabulary=vocab_new, + ... base_embeddings=vectorized_vocab_base, + ... new_embeddings_initializer=keras.initializers.Constant( + ... 
vectorized_vocab_new)) + + Here is an example that demonstrates how to get vocabulary and embedding + weights from layers, use the `warmstart_embedding_matrix` util to remap the + layer embeddings and continue with model training. + ``` + # get old and new vocabulary by using layer.get_vocabulary() + # for example assume TextVectorization layer is used + base_vocabulary = old_text_vectorization_layer.get_vocabulary() + new_vocabulary = new_text_vectorization_layer.get_vocabulary() + # get previous embedding layer weights + embedding_weights_base = model.get_layer('embedding').get_weights()[0] + warmstarted_embedding = keras.utils.warmstart_embedding_matrix( + base_vocabulary, + new_vocabulary, + base_embeddings=embedding_weights_base, + new_embeddings_initializer="uniform") + updated_embedding_variable = tf.Variable(warmstarted_embedding) + + # update embedding layer weights + model.layers[1].embeddings = updated_embedding_variable + model.fit(..) + # continue with model training + + ``` Args: - arg_name: String name of the argument to find. - new_value: New value to give to the argument. - args: Tuple of args passed to the call function. - kwargs: Dictionary of kwargs passed to the call function. - inputs_in_args: Whether the input argument (the first argument in the call - function) is included in `args`. Defaults to `False`. - pop_kwarg_if_none: If the new value is `None`, and this is `True`, then - the argument is deleted from `kwargs`. + base_vocabulary: The list of vocabulary terms that + the preexisting embedding matrix `base_embeddings` represents. + It can be either a 1D array/tensor or a tuple/list of vocabulary + terms (strings), or a path to a vocabulary text file. If passing a + file path, the file should contain one line per term in the + vocabulary. + new_vocabulary: The list of vocabulary terms for the new vocabulary + (same format as above). + base_embeddings: NumPy array or tensor representing the preexisting + embedding matrix. + new_embeddings_initializer: Initializer for embedding vectors for + previously unseen terms to be added to the new embedding matrix (see + `keras.initializers`). new_embedding matrix + needs to be specified with "constant" initializer. + matrix. None means "uniform". Default value is None. Returns: - The updated `(args, kwargs)`. + tf.tensor of remapped embedding layer matrix + """ - if self.full_argspec.varargs: - try: - arg_pos = self.full_argspec.args.index(arg_name) - if self.full_argspec.args[0] == 'self': - arg_pos -= 1 - except ValueError: - arg_pos = None - else: - arg_pos = self.arg_positions.get(arg_name, None) - - if arg_pos is not None: - if not inputs_in_args: - # Ignore `inputs` arg. - arg_pos = arg_pos - 1 - if len(args) > arg_pos: - args = list(args) - args[arg_pos] = new_value - return tuple(args), kwargs - if new_value is None and pop_kwarg_if_none: - kwargs.pop(arg_name, None) - else: - kwargs[arg_name] = new_value - return args, kwargs - - def split_out_first_arg(self, args, kwargs): - """Splits (args, kwargs) into (inputs, args, kwargs).""" - # Grab the argument corresponding to the first argument in the - # layer's `call` method spec. This will either be the first positional - # argument, or it will be provided as a keyword argument. 
-    if args:
-      inputs = args[0]
-      args = args[1:]
-    elif self._arg_names[0] in kwargs:
-      kwargs = copy.copy(kwargs)
-      inputs = kwargs.pop(self._arg_names[0])
+    # convert vocab to list
+    base_vocabulary = convert_vocab_to_list(base_vocabulary)
+    new_vocabulary = convert_vocab_to_list(new_vocabulary)
+
+    # Initialize the new embedding layer matrix
+    new_embeddings_initializer = initializers.get(new_embeddings_initializer)
+    new_embedding = new_embeddings_initializer(
+        shape=(len(new_vocabulary), base_embeddings.shape[1]),
+        dtype=base_embeddings.dtype,
+    )
+
+    # create mapping dict {vocab:index}
+    base_vocabulary_dict = dict(
+        zip(base_vocabulary, range(len(base_vocabulary)))
+    )
+
+    indices_base_vocabulary = []
+    indices_new_vocabulary = []
+    for index, key in enumerate(new_vocabulary):
+        if key in base_vocabulary_dict:
+            indices_base_vocabulary.append(base_vocabulary_dict[key])
+            indices_new_vocabulary.append(int(index))
+
+    # update embedding matrix
+    if indices_base_vocabulary:
+        values_to_update = tf.gather(base_embeddings, indices_base_vocabulary)
+        new_embedding = tf.tensor_scatter_nd_update(
+            new_embedding,
+            tf.expand_dims(indices_new_vocabulary, axis=1),
+            values_to_update,
+        )
+    return new_embedding
+
+
+def convert_vocab_to_list(vocab):
+    """Convert input vocabulary to list."""
+    vocab_list = []
+    if tf.is_tensor(vocab):
+        vocab_list = list(vocab.numpy())
+    elif isinstance(vocab, (np.ndarray, tuple, list)):
+        vocab_list = list(vocab)
+    elif isinstance(vocab, str):
+        if not tf.io.gfile.exists(vocab):
+            raise ValueError(f"Vocabulary file {vocab} does not exist.")
+        with tf.io.gfile.GFile(vocab, "r") as vocabulary_file:
+            vocab_list = vocabulary_file.read().splitlines()
     else:
-      raise ValueError(
-          'The first argument to `Layer.call` must always be passed.')
-    return inputs, args, kwargs
+        raise ValueError(
+            "Vocabulary is expected to be either a NumPy array, "
+            "list, 1D tensor or a vocabulary text file. Instead type "
+            f"{type(vocab)} was received."
+        )
+    if len(vocab_list) == 0:
+        raise ValueError(
+            "Vocabulary is expected to be either a NumPy array, "
+            "list, 1D tensor or a vocabulary text file with at least one token."
+            " Received 0 instead."
+        )
+    return vocab_list
diff --git a/keras/utils/layer_utils_test.py b/keras/utils/layer_utils_test.py
index a4e8ce2000b8..7fd128a9bea9 100644
--- a/keras/utils/layer_utils_test.py
+++ b/keras/utils/layer_utils_test.py
@@ -14,476 +14,951 @@
 # ==============================================================================
 """Tests for layer_utils."""

-import keras
-import tensorflow.compat.v2 as tf
-
 import collections
 import contextlib
+import io
 import multiprocessing.dummy
 import os
 import pickle
 import shutil
 import sys
+import tempfile
 import time
 import timeit

 import numpy as np
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
+
+import keras
+from keras import backend
+from keras import layers
+from keras.dtensor import dtensor_api as dtensor
+from keras.dtensor import layout_map as layout_map_lib
+from keras.dtensor import test_util
+from keras.testing_infra import test_utils
 from keras.utils import io_utils
 from keras.utils import layer_utils
-
+from keras.utils import tf_utils

 _PICKLEABLE_CALL_COUNT = collections.Counter()


 class MyPickleableObject(tf.__internal__.tracking.AutoTrackable):
-  """Needed for InterfaceTests.test_property_cache_serialization.
-
-  This class must be at the top level. This is a constraint of pickle,
-  unrelated to `cached_per_instance`.
- """ - - @property - @layer_utils.cached_per_instance - def my_id(self): - _PICKLEABLE_CALL_COUNT[self] += 1 - return id(self) - - -class LayerUtilsTest(tf.test.TestCase): - - def test_print_summary(self): - model = keras.Sequential() - model.add( - keras.layers.Conv2D( - filters=2, kernel_size=(2, 3), input_shape=(3, 5, 5), name='conv')) - model.add(keras.layers.Flatten(name='flat')) - model.add(keras.layers.Dense(5, name='dense')) - - file_name = 'model_1.txt' - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - fpath = os.path.join(temp_dir, file_name) - writer = open(fpath, 'w') - - def print_to_file(text): - print(text, file=writer) - - try: - layer_utils.print_summary(model, print_fn=print_to_file) - self.assertTrue(tf.io.gfile.exists(fpath)) - writer.close() - reader = open(fpath, 'r') - lines = reader.readlines() - reader.close() - self.assertEqual(len(lines), 15) - except ImportError: - pass - - def test_print_summary_without_print_fn(self): - model = keras.Sequential([ - keras.layers.Dense(5, input_shape=(10,), name='dense')]) - io_utils.enable_interactive_logging() - with self.captureWritesToStream(sys.stdout) as printed: - layer_utils.print_summary(model) - self.assertIn('dense (Dense)', printed.contents()) - - def test_print_summary_expand_nested(self): - shape = (None, None, 3) - - def make_model(): - x = inputs = keras.Input(shape) - x = keras.layers.Conv2D(3, 1)(x) - x = keras.layers.BatchNormalization()(x) - return keras.Model(inputs, x) - - x = inner_inputs = keras.Input(shape) - x = make_model()(x) - inner_model = keras.Model(inner_inputs, x) - - inputs = keras.Input(shape) - model = keras.Model(inputs, inner_model(inputs)) - - file_name = 'model_2.txt' - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - fpath = os.path.join(temp_dir, file_name) - writer = open(fpath, 'w') - - def print_to_file(text): - print(text, file=writer) - - try: - layer_utils.print_summary( - model, print_fn=print_to_file, expand_nested=True) - self.assertTrue(tf.io.gfile.exists(fpath)) - writer.close() - reader = open(fpath, 'r') - lines = reader.readlines() - reader.close() - check_str = ( - 'Model: "model_2"\n' - '_________________________________________________________________\n' - ' Layer (type) Output Shape Param # \n' - '=================================================================\n' - ' input_3 (InputLayer) [(None, None, None, 3)] 0 \n' - ' \n' - ' model_1 (Functional) (None, None, None, 3) 24 \n' - '|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n' - '| input_1 (InputLayer) [(None, None, None, 3)] 0 |\n' - '| |\n' - '| model (Functional) (None, None, None, 3) 24 |\n' - '||¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯||\n' - '|| input_2 (InputLayer) [(None, None, None, 3)] 0 ||\n' - '|| ||\n' - '|| conv2d (Conv2D) (None, None, None, 3) 12 ||\n' - '|| ||\n' - '|| batch_normalization (BatchN (None, None, None, 3) 12 ||\n' - '|| ormalization) ||\n' - '|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n' - '¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯\n' - '=================================================================\n' - 'Total params: 24\n' - 'Trainable params: 18\n' - 'Non-trainable params: 6\n' - '_________________________________________________________________\n') - - fin_str = '' - for line in lines: - fin_str += line - - self.assertIn(fin_str, check_str) - self.assertEqual(len(lines), 25) - except ImportError: - 
pass - - def test_summary_subclass_model_expand_nested(self): - - class Sequential(keras.Model): - - def __init__(self, *args): - super().__init__() - self.module_list = list(args) if args else [] - - def call(self, x): - for module in self.module_list: - x = module(x) - return x - - class Block(keras.Model): - - def __init__(self): - super().__init__() - self.module = Sequential( - keras.layers.Dense(10), - keras.layers.Dense(10), - ) - - def call(self, input_tensor): - x = self.module(input_tensor) - return x - - class Base(keras.Model): - - def __init__(self): - super().__init__() - self.module = Sequential(Block(), Block()) - - def call(self, input_tensor): - x = self.module(input_tensor) - y = self.module(x) - return x, y - - class Network(keras.Model): - - def __init__(self): - super().__init__() - self.child = Base() - - def call(self, inputs): - return self.child(inputs) - - net = Network() - inputs = keras.Input(shape=(10,)) - outputs = net(inputs) - model = keras.models.Model(inputs=inputs, outputs=outputs) - - file_name = 'model_3.txt' - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - fpath = os.path.join(temp_dir, file_name) - writer = open(fpath, 'w') - - def print_to_file(text): - print(text, file=writer) - - try: - layer_utils.print_summary( - model, line_length=120, print_fn=print_to_file, expand_nested=True) - self.assertTrue(tf.io.gfile.exists(fpath)) - writer.close() - reader = open(fpath, 'r') - lines = reader.readlines() - reader.close() - # The output content are slightly different for the input shapes between - # v1 and v2. - if tf.__internal__.tf2.enabled(): - self.assertEqual(len(lines), 39) - else: - self.assertEqual(len(lines), 40) - except ImportError: - pass - - def test_print_summary_show_trainable(self): - model = keras.Sequential(name='trainable') - untrained = keras.layers.Conv2D( - filters=2, kernel_size=(2, 3), input_shape=(3, 5, 5), name='conv') - model.add(untrained) - model.add(keras.layers.Flatten(name='flat')) - model.add(keras.layers.Dense(5, name='dense')) - - untrained.trainable = False - - file_name = 'model_4.txt' - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - fpath = os.path.join(temp_dir, file_name) - writer = open(fpath, 'w') - - def print_to_file(text): - print(text, file=writer) - - try: - layer_utils.print_summary( - model, print_fn=print_to_file, show_trainable=True) - self.assertTrue(tf.io.gfile.exists(fpath)) - writer.close() - reader = open(fpath, 'r') - lines = reader.readlines() - reader.close() - check_str = ( - 'Model: ' - '"trainable"\n____________________________________________________________________________\n' - ' Layer (type) Output Shape Param # ' - 'Trainable ' - '\n============================================================================\n' - ' conv (Conv2D) (None, 2, 3, 2) 62 N' - ' \n' - ' ' - '\n flat (Flatten) (None, 12) 0 ' - 'Y \n' - ' ' - '\n dense (Dense) (None, 5) 65 ' - 'Y \n' - ' ' - '\n============================================================================\nTotal' - ' params: 127\nTrainable params: 65\nNon-trainable params: ' - '62\n____________________________________________________________________________\n' - '____________________________________________________________________________\n' - ) - - fin_str = '' - for line in lines: - fin_str += line - - self.assertIn(fin_str, check_str) - self.assertEqual(len(lines), 15) - except ImportError: - pass - - def 
test_print_summary_expand_nested_show_trainable(self): - shape = (None, None, 3) - - def make_model(): - x = inputs = keras.Input(shape, name='input2') - untrainable = keras.layers.Conv2D(3, 1) - untrainable.trainable = False - x = untrainable(x) - x = keras.layers.BatchNormalization()(x) - return keras.Model(inputs, x) - - x = inner_inputs = keras.Input(shape, name='input1') - x = make_model()(x) - inner_model = keras.Model(inner_inputs, x) - - inputs = keras.Input(shape, name='input3') - model = keras.Model(inputs, inner_model(inputs)) - - file_name = 'model_6.txt' - temp_dir = self.get_temp_dir() - self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) - fpath = os.path.join(temp_dir, file_name) - writer = open(fpath, 'w') - - def print_to_file(text): - print(text, file=writer) - - try: - layer_utils.print_summary( - model, - print_fn=print_to_file, - expand_nested=True, - show_trainable=True) - self.assertTrue(tf.io.gfile.exists(fpath)) - writer.close() - reader = open(fpath, 'r') - lines = reader.readlines() - reader.close() - check_str = ( - 'Model: ' - '"model_2"\n____________________________________________________________________________\n' - ' Layer (type) Output Shape Param # ' - 'Trainable ' - '\n============================================================================\n' - ' input3 (InputLayer) [(None, None, None, 3)] 0 Y' - ' \n' - ' ' - '\n model_1 (Functional) (None, None, None, 3) 24 ' - 'Y ' - '\n|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n|' - ' input1 (InputLayer) [(None, None, None, 3)] 0 Y' - ' |\n|' - ' ' - '|\n| model (Functional) (None, None, None, 3) 24 ' - 'Y ' - '|\n||¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯||\n||' - ' input2 (InputLayer) [(None, None, None, 3)] 0 Y' - ' ||\n||' - ' ' - '||\n|| conv2d (Conv2D) (None, None, None, 3) 12 ' - 'N ||\n||' - ' ' - '||\n|| batch_normalization (BatchN (None, None, None, 3) 12 ' - 'Y ||\n|| ormalization)' - ' ' - '||\n|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯\n============================================================================\nTotal' - ' params: 24\nTrainable params: 6\nNon-trainable params: ' - '18\n____________________________________________________________________________\n' - '____________________________________________________________________________\n' - ) - - fin_str = '' - for line in lines: - fin_str += line - - self.assertIn(fin_str, check_str) - self.assertEqual(len(lines), 25) - except ImportError: - pass - - def test_property_cache(self): - test_counter = collections.Counter() - - class MyObject(tf.__internal__.tracking.AutoTrackable): - - def __init__(self): - super().__init__() - self._frozen = True - - def __setattr__(self, key, value): - """Enforce that cache does not set attribute on MyObject.""" - if getattr(self, '_frozen', False): - raise ValueError('Cannot mutate when frozen.') - return super().__setattr__(key, value) - - @property - @layer_utils.cached_per_instance - def test_property(self): - test_counter[id(self)] += 1 + """Needed for InterfaceTests.test_property_cache_serialization. + + This class must be at the top level. This is a constraint of pickle, + unrelated to `cached_per_instance`. 
+ """ + + @property + @layer_utils.cached_per_instance + def my_id(self): + _PICKLEABLE_CALL_COUNT[self] += 1 return id(self) - first_object = MyObject() - second_object = MyObject() - - # Make sure the objects return the correct values - self.assertEqual(first_object.test_property, id(first_object)) - self.assertEqual(second_object.test_property, id(second_object)) - - # Make sure the cache does not share across objects - self.assertNotEqual(first_object.test_property, second_object.test_property) - - # Check again (Now the values should be cached.) - self.assertEqual(first_object.test_property, id(first_object)) - self.assertEqual(second_object.test_property, id(second_object)) - - # Count the function calls to make sure the cache is actually being used. - self.assertAllEqual(tuple(test_counter.values()), (1, 1)) - - def test_property_cache_threaded(self): - call_count = collections.Counter() - - class MyObject(tf.__internal__.tracking.AutoTrackable): - - @property - @layer_utils.cached_per_instance - def test_property(self): - # Random sleeps to ensure that the execution thread changes - # mid-computation. - call_count['test_property'] += 1 - time.sleep(np.random.random() + 1.) - - # Use a RandomState which is seeded off the instance's id (the mod is - # because numpy limits the range of seeds) to ensure that an instance - # returns the same value in different threads, but different instances - # return different values. - return int(np.random.RandomState(id(self) % (2 ** 31)).randint(2 ** 16)) - - def get_test_property(self, _): - """Function provided to .map for threading test.""" - return self.test_property - - # Test that multiple threads return the same value. This requires that - # the underlying function is repeatable, as cached_property makes no attempt - # to prioritize the first call. - test_obj = MyObject() - with contextlib.closing(multiprocessing.dummy.Pool(32)) as pool: - # Intentionally make a large pool (even when there are only a small number - # of cpus) to ensure that the runtime switches threads. - results = pool.map(test_obj.get_test_property, range(64)) - self.assertEqual(len(set(results)), 1) - - # Make sure we actually are testing threaded behavior. - self.assertGreater(call_count['test_property'], 1) - - # Make sure new threads still cache hit. - with contextlib.closing(multiprocessing.dummy.Pool(2)) as pool: - start_time = timeit.default_timer() # Don't time pool instantiation. - results = pool.map(test_obj.get_test_property, range(4)) - total_time = timeit.default_timer() - start_time - - # Note(taylorrobie): The reason that it is safe to time a unit test is that - # a cache hit will be << 1 second, and a cache miss is - # guaranteed to be >= 1 second. Empirically confirmed by - # 100,000 runs with no flakes. - self.assertLess(total_time, 0.95) - - def test_property_cache_serialization(self): - # Reset call count. .keys() must be wrapped in a list, because otherwise we - # would mutate the iterator while iterating. - for k in list(_PICKLEABLE_CALL_COUNT.keys()): - _PICKLEABLE_CALL_COUNT.pop(k) - - first_instance = MyPickleableObject() - self.assertEqual(id(first_instance), first_instance.my_id) - - # Test that we can pickle and un-pickle - second_instance = pickle.loads(pickle.dumps(first_instance)) - - self.assertEqual(id(second_instance), second_instance.my_id) - self.assertNotEqual(first_instance.my_id, second_instance.my_id) - - # Make sure de-serialized object uses the cache. 
-    self.assertEqual(_PICKLEABLE_CALL_COUNT[second_instance], 1)
-
-    # Make sure the decorator cache is not being serialized with the object.
-    expected_size = len(pickle.dumps(second_instance))
-    for _ in range(5):
-      # Add some more entries to the cache.
-      _ = MyPickleableObject().my_id
-    self.assertEqual(len(_PICKLEABLE_CALL_COUNT), 7)
-    size_check_instance = MyPickleableObject()
-    _ = size_check_instance.my_id
-    self.assertEqual(expected_size, len(pickle.dumps(size_check_instance)))
-
-
-if __name__ == '__main__':
-  tf.test.main()
+
+class LayerUtilsTest(tf.test.TestCase, parameterized.TestCase):
+    def setUp(self):
+        super().setUp()
+        # Reset the UIDs so that all layer/model IDs always start with 1.
+        # This helps keep nondeterministic IDs out of model.summary().
+        backend.reset_uids()
+
+    def test_print_summary(self):
+        model = keras.Sequential()
+        model.add(
+            keras.layers.Conv2D(
+                filters=2,
+                kernel_size=(2, 3),
+                input_shape=(3, 5, 5),
+                name="conv",
+            )
+        )
+        model.add(keras.layers.Flatten(name="flat"))
+        model.add(keras.layers.Dense(5, name="dense"))
+
+        file_name = "model_1.txt"
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+        fpath = os.path.join(temp_dir, file_name)
+        writer = open(fpath, "w")
+
+        def print_to_file(text):
+            print(text, file=writer)
+
+        try:
+            layer_utils.print_summary(model, print_fn=print_to_file)
+            self.assertTrue(tf.io.gfile.exists(fpath))
+            writer.close()
+            with open(fpath, "r") as reader:
+                lines = reader.readlines()
+            self.assertEqual(len(lines), 15)
+        except ImportError:
+            pass
+
+    def test_print_summary_without_print_fn(self):
+        model = keras.Sequential(
+            [keras.layers.Dense(5, input_shape=(10,), name="dense")]
+        )
+        io_utils.enable_interactive_logging()
+        with self.captureWritesToStream(sys.stdout) as printed:
+            layer_utils.print_summary(model)
+        self.assertIn("dense (Dense)", printed.contents())
+
+    def test_print_summary_format_long_names(self):
+        shape = (8, 8, 3)
+
+        model = keras.Sequential(
+            [
+                keras.Input(shape),
+                keras.layers.Conv2D(4, 3, name="Really-Long-name-test"),
+                keras.layers.Conv2D(4, 3, name="Another-long-name-test"),
+                keras.layers.Flatten(),
+                keras.layers.Dense(2, name="long-name-test-output"),
+            ]
+        )
+        file_name = "sequential.txt"
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+        fpath = os.path.join(temp_dir, file_name)
+        writer = open(fpath, "w")
+
+        def print_to_file(text):
+            print(text, file=writer)
+
+        layer_utils.print_summary(model, print_fn=print_to_file)
+        self.assertTrue(tf.io.gfile.exists(fpath))
+        writer.close()
+        reader = open(fpath, "r")
+        lines = reader.readlines()
+        reader.close()
+        check_str = (
+            'Model: "sequential"\n'
+            "_________________________________________________________________\n"  # noqa: E501
+            " Layer (type) Output Shape Param # \n"  # noqa: E501
+            "=================================================================\n"  # noqa: E501
+            " Really-Long-name-test (Con (None, 6, 6, 4) 112 \n"  # noqa: E501
+            " v2D) \n"  # noqa: E501
+            " \n"  # noqa: E501
+            " Another-long-name-test (Co (None, 4, 4, 4) 148 \n"  # noqa: E501
+            " nv2D) \n"  # noqa: E501
+            " \n"  # noqa: E501
+            " flatten (Flatten) (None, 64) 0 \n"  # noqa: E501
+            " \n"  # noqa: E501
+            " long-name-test-output (Den (None, 2) 130 \n"  # noqa: E501
+            " se) \n"  # noqa: E501
+            " \n"  # noqa: E501
+            "=================================================================\n"  # noqa: E501
+            "Total params: 390 (1.52 KB)\n"
+            "Trainable params: 390 (1.52 KB)\n"
+ "Non-trainable params: 0 (0.00 Byte)\n" + "_________________________________________________________________\n" # noqa: E501 + ) + fin_str = "".join(lines) + self.assertIn(fin_str, check_str) + self.assertEqual(len(lines), 20) + + def test_print_summary_expand_nested(self): + shape = (None, None, 3) + + def make_model(): + x = inputs = keras.Input(shape) + x = keras.layers.Conv2D(3, 1)(x) + x = keras.layers.BatchNormalization()(x) + return keras.Model(inputs, x) + + x = inner_inputs = keras.Input(shape) + x = make_model()(x) + inner_model = keras.Model(inner_inputs, x) + + inputs = keras.Input(shape) + model = keras.Model(inputs, inner_model(inputs)) + + file_name = "model_2.txt" + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + fpath = os.path.join(temp_dir, file_name) + writer = open(fpath, "w") + + def print_to_file(text): + print(text, file=writer) + + try: + layer_utils.print_summary( + model, print_fn=print_to_file, expand_nested=True + ) + self.assertTrue(tf.io.gfile.exists(fpath)) + writer.close() + reader = open(fpath, "r") + lines = reader.readlines() + reader.close() + check_str = ( + 'Model: "model_2"\n' + "_________________________________________________________________\n" # noqa: E501 + " Layer (type) Output Shape Param # \n" # noqa: E501 + "=================================================================\n" # noqa: E501 + " input_3 (InputLayer) [(None, None, None, 3)] 0 \n" # noqa: E501 + " \n" # noqa: E501 + " model_1 (Functional) (None, None, None, 3) 24 \n" # noqa: E501 + "|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n" # noqa: E501 + "| input_1 (InputLayer) [(None, None, None, 3)] 0 |\n" # noqa: E501 + "| |\n" # noqa: E501 + "| model (Functional) (None, None, None, 3) 24 |\n" # noqa: E501 + "||¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯||\n" # noqa: E501 + "|| input_2 (InputLayer) [(None, None, None, 3)] 0 ||\n" # noqa: E501 + "|| ||\n" # noqa: E501 + "|| conv2d (Conv2D) (None, None, None, 3) 12 ||\n" # noqa: E501 + "|| ||\n" # noqa: E501 + "|| batch_normalization (Bat (None, None, None, 3) 12 ||\n" # noqa: E501 + "|| chNormalization) ||\n" # noqa: E501 + "|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n" # noqa: E501 + "¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯\n" # noqa: E501 + "=================================================================\n" # noqa: E501 + "Total params: 24 (96.00 Byte)\n" + "Trainable params: 18 (72.00 Byte)\n" + "Non-trainable params: 6 (24.00 Byte)\n" + "_________________________________________________________________\n" # noqa: E501 + ) + + fin_str = "".join(lines) + + self.assertIn(fin_str, check_str) + self.assertEqual(len(lines), 25) + except ImportError: + pass + + def test_summary_subclass_model_expand_nested(self): + class Sequential(keras.Model): + def __init__(self, *args): + super().__init__() + self.module_list = list(args) if args else [] + + def call(self, x): + for module in self.module_list: + x = module(x) + return x + + class Block(keras.Model): + def __init__(self): + super().__init__() + self.module = Sequential( + keras.layers.Dense(10), + keras.layers.Dense(10), + ) + + def call(self, input_tensor): + x = self.module(input_tensor) + return x + + class Base(keras.Model): + def __init__(self): + super().__init__() + self.module = Sequential(Block(), Block()) + + def call(self, input_tensor): + x = self.module(input_tensor) + y = self.module(x) + return x, y + + class Network(keras.Model): + 
def __init__(self):
+                super().__init__()
+                self.child = Base()
+
+            def call(self, inputs):
+                return self.child(inputs)
+
+        net = Network()
+        inputs = keras.Input(shape=(10,))
+        outputs = net(inputs)
+        model = keras.models.Model(inputs=inputs, outputs=outputs)
+
+        file_name = "model_3.txt"
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+        fpath = os.path.join(temp_dir, file_name)
+        writer = open(fpath, "w")
+
+        def print_to_file(text):
+            print(text, file=writer)
+
+        try:
+            layer_utils.print_summary(
+                model,
+                line_length=120,
+                print_fn=print_to_file,
+                expand_nested=True,
+            )
+            self.assertTrue(tf.io.gfile.exists(fpath))
+            writer.close()
+            with open(fpath, "r") as reader:
+                lines = reader.readlines()
+            # The output contents are slightly different for the input shapes
+            # between v1 and v2.
+            if tf.__internal__.tf2.enabled():
+                self.assertEqual(len(lines), 39)
+            else:
+                self.assertEqual(len(lines), 40)
+        except ImportError:
+            pass
+
+    def test_print_summary_show_trainable(self):
+        model = keras.Sequential(name="trainable")
+        untrained = keras.layers.Conv2D(
+            filters=2, kernel_size=(2, 3), input_shape=(3, 5, 5), name="conv"
+        )
+        model.add(untrained)
+        model.add(keras.layers.Flatten(name="flat"))
+        model.add(keras.layers.Dense(5, name="dense"))
+
+        untrained.trainable = False
+
+        file_name = "model_4.txt"
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+        fpath = os.path.join(temp_dir, file_name)
+        writer = open(fpath, "w")
+
+        def print_to_file(text):
+            print(text, file=writer)
+
+        try:
+            layer_utils.print_summary(
+                model, print_fn=print_to_file, show_trainable=True
+            )
+            self.assertTrue(tf.io.gfile.exists(fpath))
+            writer.close()
+            with open(fpath, "r") as reader:
+                lines = reader.readlines()
+            check_str = (
+                'Model: "trainable"\n'
+                "____________________________________________________________________________\n"  # noqa: E501
+                " Layer (type) Output Shape Param # Trainable \n"  # noqa: E501
+                "============================================================================\n"  # noqa: E501
+                " conv (Conv2D) (None, 2, 3, 2) 62 N \n"  # noqa: E501
+                " \n"  # noqa: E501
+                " flat (Flatten) (None, 12) 0 Y \n"  # noqa: E501
+                " \n"  # noqa: E501
+                " dense (Dense) (None, 5) 65 Y \n"  # noqa: E501
+                " \n"  # noqa: E501
+                "============================================================================\n"  # noqa: E501
+                "Total params: 127 (508.00 Byte)\n"
+                "Trainable params: 65 (260.00 Byte)\n"
+                "Non-trainable params: 62 (248.00 Byte)\n"
+                "____________________________________________________________________________\n"  # noqa: E501
+                "____________________________________________________________________________\n"  # noqa: E501
+            )
+
+            fin_str = "".join(lines)
+
+            self.assertIn(fin_str, check_str)
+            self.assertEqual(len(lines), 15)
+        except ImportError:
+            pass
+
+    def test_print_summary_expand_nested_show_trainable(self):
+        shape = (None, None, 3)
+
+        def make_model():
+            x = inputs = keras.Input(shape, name="input2")
+            untrainable = keras.layers.Conv2D(3, 1)
+            untrainable.trainable = False
+            x = untrainable(x)
+            x = keras.layers.BatchNormalization()(x)
+            return keras.Model(inputs, x)
+
+        x = inner_inputs = keras.Input(shape, name="input1")
+        x = make_model()(x)
+        inner_model = keras.Model(inner_inputs, x)
+
+        inputs = keras.Input(shape, name="input3")
+        model = keras.Model(inputs, inner_model(inputs))
+
+        file_name = "model_6.txt"
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+        fpath = os.path.join(temp_dir, file_name)
+        writer = open(fpath, "w")
+
+        def print_to_file(text):
+            print(text, file=writer)
+
+        try:
+            layer_utils.print_summary(
+                model,
+                print_fn=print_to_file,
+                expand_nested=True,
+                show_trainable=True,
+            )
+            self.assertTrue(tf.io.gfile.exists(fpath))
+            writer.close()
+            with open(fpath, "r") as reader:
+                lines = reader.readlines()
+            check_str = (
+                'Model: "model_2"\n'
+                "____________________________________________________________________________\n"  # noqa: E501
+                " Layer (type) Output Shape Param # Trainable \n"  # noqa: E501
+                "============================================================================\n"  # noqa: E501
+                " input3 (InputLayer) [(None, None, None, 3)] 0 Y \n"  # noqa: E501
+                " \n"  # noqa: E501
+                " model_1 (Functional) (None, None, None, 3) 24 Y \n"  # noqa: E501
+                "|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n"  # noqa: E501
+                "| input1 (InputLayer) [(None, None, None, 3)] 0 Y |\n"  # noqa: E501
+                "| |\n"  # noqa: E501
+                "| model (Functional) (None, None, None, 3) 24 Y |\n"  # noqa: E501
+                "||¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯||\n"  # noqa: E501
+                "|| input2 (InputLayer) [(None, None, None, 3)] 0 Y ||\n"  # noqa: E501
+                "|| ||\n"  # noqa: E501
+                "|| conv2d (Conv2D) (None, None, None, 3) 12 N ||\n"  # noqa: E501
+                "|| ||\n"  # noqa: E501
+                "|| batch_normalization (Bat (None, None, None, 3) 12 Y ||\n"  # noqa: E501
+                "|| chNormalization) ||\n"  # noqa: E501
+                "|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n"  # noqa: E501
+                "¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯\n"  # noqa: E501
+                "============================================================================\n"  # noqa: E501
+                "Total params: 24 (96.00 Byte)\n"
+                "Trainable params: 6 (24.00 Byte)\n"
+                "Non-trainable params: 18 (72.00 Byte)\n"
+                "____________________________________________________________________________\n"  # noqa: E501
+            )
+
+            fin_str = "".join(lines)
+
+            self.assertIn(fin_str, check_str)
+            self.assertEqual(len(lines), 25)
+        except ImportError:
+            pass
+
+    def test_print_summary_layer_range(self):
+        model = keras.Sequential()
+        model.add(
+            keras.layers.Conv2D(
+                filters=2,
+                kernel_size=(2, 3),
+                input_shape=(3, 5, 5),
+                name="conv",
+            )
+        )
+        model.add(keras.layers.Flatten(name="flat"))
+        model.add(keras.layers.Dense(5, name="dense"))
+
+        file_name = "model_7.txt"
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+        fpath = os.path.join(temp_dir, file_name)
+        writer = open(fpath, "w")
+
+        def print_to_file(text):
+            print(text, file=writer)
+
+        try:
+            layer_utils.print_summary(
+                model, print_fn=print_to_file, layer_range=["conv", "flat"]
+            )
+            self.assertTrue(tf.io.gfile.exists(fpath))
+            writer.close()
+            with open(fpath, "r") as reader:
+                lines = reader.readlines()
+            # The expected length with no layer filter is 15; we filter out
+            # 2 lines by excluding the layer 'dense'.
+            self.assertEqual(len(lines), 15 - 2)
+        except ImportError:
+            pass
+
+    def test_print_summary_layer_range_with_expand_nested(self):
+        shape = (None, None, 3)
+
+        def make_model():
+            x = inputs = keras.Input(shape, name="input_2")
+            x = keras.layers.Conv2D(3, 1)(x)
+            x = keras.layers.BatchNormalization()(x)
+            return keras.Model(inputs, x, name="2nd_inner")
+
+        x = inner_inputs = keras.Input(shape, name="input_1")
+        x = make_model()(x)
+        inner_model = keras.Model(inner_inputs, x, name="1st_inner")
+
+        inputs =
keras.Input(shape, name="input_3") + model = keras.Model(inputs, inner_model(inputs)) + + file_name = "model_8.txt" + temp_dir = self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True) + fpath = os.path.join(temp_dir, file_name) + writer = open(fpath, "w") + + def print_to_file(text): + print(text, file=writer) + + try: + layer_utils.print_summary( + model, + print_fn=print_to_file, + expand_nested=True, + layer_range=["1st_inner", "1st_inner"], + ) + layer_utils.print_summary( + model, + expand_nested=True, + layer_range=["1st_inner", "1st_inner"], + ) + self.assertTrue(tf.io.gfile.exists(fpath)) + writer.close() + with open(fpath, "r") as reader: + lines = reader.readlines() + check_str = ( + 'Model: "model"\n' + "_________________________________________________________________\n" # noqa: E501 + " Layer (type) Output Shape Param # \n" # noqa: E501 + "=================================================================\n" # noqa: E501 + " 1st_inner (Functional) (None, None, None, 3) 24 \n" # noqa: E501 + "|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n" # noqa: E501 + "| input_1 (InputLayer) [(None, None, None, 3)] 0 |\n" # noqa: E501 + "| |\n" # noqa: E501 + "| 2nd_inner (Functional) (None, None, None, 3) 24 |\n" # noqa: E501 + "||¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯||\n" # noqa: E501 + "|| input_2 (InputLayer) [(None, None, None, 3)] 0 ||\n" # noqa: E501 + "|| ||\n" # noqa: E501 + "|| conv2d (Conv2D) (None, None, None, 3) 12 ||\n" # noqa: E501 + "|| ||\n" # noqa: E501 + "|| batch_normalization (Bat (None, None, None, 3) 12 ||\n" # noqa: E501 + "|| chNormalization) ||\n" # noqa: E501 + "|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|\n" # noqa: E501 + "¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯\n" # noqa: E501 + "=================================================================\n" # noqa: E501 + "Total params: 24 (96.00 Byte)\n" + "Trainable params: 18 (72.00 Byte)\n" + "Non-trainable params: 6 (24.00 Byte)\n" + "_________________________________________________________________\n" # noqa: E501 + ) + + check_lines = check_str.split("\n")[ + :-1 + ] # Removing final empty string which is not a line + + fin_str = "".join(lines) + self.assertIn(fin_str, check_str) + self.assertEqual(len(lines), len(check_lines)) + except ImportError: + pass + + def test_weight_memory_size(self): + v1 = tf.Variable(tf.zeros(shape=(1, 2), dtype=tf.float32)) + v2 = tf.Variable(tf.zeros(shape=(2, 3), dtype=tf.float64)) + v3 = tf.Variable(tf.zeros(shape=(4, 5), dtype=tf.int16)) + v4 = tf.Variable(tf.zeros(shape=(6,), dtype=tf.uint8)) + + weights = [v1, v1, v2, v3, v4] + weight_memory_size = layer_utils.weight_memory_size(weights) + expected_memory_size = 1 * 2 * 4 + 2 * 3 * 8 + 4 * 5 * 2 + 6 * 1 + self.assertEqual(weight_memory_size, expected_memory_size) + + @parameterized.parameters( + (0, "0.00 Byte"), + (1000, "1000.00 Byte"), + (1024, "1.00 KB"), + (1024 * 2 - 1, "2.00 KB"), + (1024 * 2 + 1, "2.00 KB"), + (1024**2 + 1, "1.00 MB"), + (1024**3 - 1, "1024.00 MB"), + (1024**3, "1.00 GB"), + (1024**4, "1.00 TB"), + (1024**5, "1.00 PB"), + (1024**5 * 1.41415, "1.41 PB"), + ) + def test_readable_weight_memory_size(self, size, expected_result): + result = layer_utils.readable_memory_size(size) + self.assertEqual(result, expected_result) + + def test_property_cache(self): + test_counter = collections.Counter() + + class MyObject(tf.__internal__.tracking.AutoTrackable): + def __init__(self): + 
super().__init__() + self._frozen = True + + def __setattr__(self, key, value): + """Enforce that cache does not set attribute on MyObject.""" + if getattr(self, "_frozen", False): + raise ValueError("Cannot mutate when frozen.") + return super().__setattr__(key, value) + + @property + @layer_utils.cached_per_instance + def test_property(self): + test_counter[id(self)] += 1 + return id(self) + + first_object = MyObject() + second_object = MyObject() + + # Make sure the objects return the correct values + self.assertEqual(first_object.test_property, id(first_object)) + self.assertEqual(second_object.test_property, id(second_object)) + + # Make sure the cache does not share across objects + self.assertNotEqual( + first_object.test_property, second_object.test_property + ) + + # Check again (Now the values should be cached.) + self.assertEqual(first_object.test_property, id(first_object)) + self.assertEqual(second_object.test_property, id(second_object)) + + # Count the function calls to make sure the cache is actually being + # used. + self.assertAllEqual(tuple(test_counter.values()), (1, 1)) + + def test_property_cache_threaded(self): + call_count = collections.Counter() + + class MyObject(tf.__internal__.tracking.AutoTrackable): + @property + @layer_utils.cached_per_instance + def test_property(self): + # Random sleeps to ensure that the execution thread changes + # mid-computation. + call_count["test_property"] += 1 + time.sleep(np.random.random() + 1.0) + + # Use a RandomState which is seeded off the instance's id (the + # mod is because numpy limits the range of seeds) to ensure that + # an instance returns the same value in different threads, but + # different instances return different values. + return int( + np.random.RandomState(id(self) % (2**31)).randint(2**16) + ) + + def get_test_property(self, _): + """Function provided to .map for threading test.""" + return self.test_property + + # Test that multiple threads return the same value. This requires that + # the underlying function is repeatable, as cached_property makes no + # attempt to prioritize the first call. + test_obj = MyObject() + with contextlib.closing(multiprocessing.dummy.Pool(32)) as pool: + # Intentionally make a large pool (even when there are only a small + # number of cpus) to ensure that the runtime switches threads. + results = pool.map(test_obj.get_test_property, range(64)) + self.assertEqual(len(set(results)), 1) + + # Make sure we actually are testing threaded behavior. + self.assertGreater(call_count["test_property"], 1) + + # Make sure new threads still cache hit. + with contextlib.closing(multiprocessing.dummy.Pool(2)) as pool: + start_time = ( + timeit.default_timer() + ) # Don't time pool instantiation. + results = pool.map(test_obj.get_test_property, range(4)) + total_time = timeit.default_timer() - start_time + + # Note(taylorrobie): The reason that it is safe to time a unit test is + # that a cache hit will be << 1 second, and a cache miss is guaranteed + # to be >= 1 second. Empirically confirmed by 100,000 runs with no + # flakes. + self.assertLess(total_time, 0.95) + + def test_property_cache_serialization(self): + # Reset call count. .keys() must be wrapped in a list, because otherwise + # we would mutate the iterator while iterating. 
+ for k in list(_PICKLEABLE_CALL_COUNT.keys()): + _PICKLEABLE_CALL_COUNT.pop(k) + + first_instance = MyPickleableObject() + self.assertEqual(id(first_instance), first_instance.my_id) + + # Test that we can pickle and un-pickle + second_instance = pickle.loads(pickle.dumps(first_instance)) + + self.assertEqual(id(second_instance), second_instance.my_id) + self.assertNotEqual(first_instance.my_id, second_instance.my_id) + + # Make sure de-serialized object uses the cache. + self.assertEqual(_PICKLEABLE_CALL_COUNT[second_instance], 1) + + # Make sure the decorator cache is not being serialized with the object. + expected_size = len(pickle.dumps(second_instance)) + for _ in range(5): + # Add some more entries to the cache. + _ = MyPickleableObject().my_id + self.assertEqual(len(_PICKLEABLE_CALL_COUNT), 7) + size_check_instance = MyPickleableObject() + _ = size_check_instance.my_id + self.assertEqual(expected_size, len(pickle.dumps(size_check_instance))) + + def test_warmstart_embedding_matrix_with_list(self): + vocab_base = ["unk", "a", "b", "c"] + vocab_new = ["unk", "unk", "a", "b", "c", "d", "e"] + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + vectorized_vocab_new = np.random.rand(len(vocab_new), 3) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base, + new_vocabulary=vocab_new, + base_embeddings=vectorized_vocab_base, + new_embeddings_initializer=keras.initializers.Constant( + vectorized_vocab_new + ), + ) + self.assertAllEqual( + warmstarted_embedding_matrix[2], + vectorized_vocab_base[1], + ) + + def test_warmstart_embedding_matrix_with_nparray(self): + vocab_base = np.array(["unk", "a", "b", "c"]) + vocab_new = np.array(["unk", "unk", "a", "b", "c", "d", "e"]) + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + vectorized_vocab_new = np.random.rand(len(vocab_new), 3) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base, + new_vocabulary=vocab_new, + base_embeddings=vectorized_vocab_base, + new_embeddings_initializer=keras.initializers.Constant( + vectorized_vocab_new + ), + ) + self.assertAllEqual( + warmstarted_embedding_matrix[2], + vectorized_vocab_base[1], + ) + + @test_utils.run_v2_only + def test_warmstart_embedding_matrix_with_tensor(self): + vocab_base = tf.convert_to_tensor(["unk", "a", "b", "c"]) + vocab_new = tf.convert_to_tensor( + ["unk", "unk", "a", "b", "c", "d", "e"] + ) + vectorized_vocab_base = np.random.rand(vocab_base.shape[0], 3) + vectorized_vocab_new = np.random.rand(vocab_new.shape[0], 3) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base, + new_vocabulary=vocab_new, + base_embeddings=vectorized_vocab_base, + new_embeddings_initializer=keras.initializers.Constant( + vectorized_vocab_new + ), + ) + self.assertAllEqual( + warmstarted_embedding_matrix[2], + vectorized_vocab_base[1], + ) + + def test_warmstart_embedding_matrix_with_file_name(self): + def _write_list_to_file(filename, content_list): + with tf.io.gfile.GFile(filename, "w") as output_file: + for line in content_list: + output_file.write(line + "\n") + + vocab_base = ["UNK", "a", "b", "c"] + vocab_base_file = tempfile.mktemp(".tsv") + _write_list_to_file(vocab_base_file, vocab_base) + vocab_new = ["UNK", "UNK", "a", "b", "c", "d", "e"] + vocab_new_file = tempfile.mktemp(".tsv") + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + vectorized_vocab_new = np.random.rand(len(vocab_new), 3) + 
_write_list_to_file(vocab_new_file, vocab_new) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base_file, + new_vocabulary=vocab_new_file, + base_embeddings=vectorized_vocab_base, + new_embeddings_initializer=keras.initializers.Constant( + vectorized_vocab_new + ), + ) + self.assertAllEqual( + warmstarted_embedding_matrix[3], + vectorized_vocab_base[2], + ) + + def test_warmstart_default_initialization(self): + def _write_list_to_file(filename, content_list): + with tf.io.gfile.GFile(filename, "w") as output_file: + for line in content_list: + output_file.write(line + "\n") + + vocab_base = ["UNK", "a", "b", "c"] + vocab_base_file = tempfile.mktemp(".tsv") + _write_list_to_file(vocab_base_file, vocab_base) + vocab_new = ["UNK", "UNK", "a", "b", "c", "d", "e"] + vocab_new_file = tempfile.mktemp(".tsv") + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + _write_list_to_file(vocab_new_file, vocab_new) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base_file, + new_vocabulary=vocab_new_file, + base_embeddings=vectorized_vocab_base, + ) + self.assertAllEqual( + warmstarted_embedding_matrix[3], + vectorized_vocab_base[2], + ) + + def test_warmstart_default_value(self): + vocab_base = np.array(["unk", "a", "b", "c"]) + vocab_new = np.array(["unk", "unk", "a", "b", "c", "d", "e"]) + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base, + new_vocabulary=vocab_new, + base_embeddings=vectorized_vocab_base, + ) + self.assertAllEqual( + warmstarted_embedding_matrix[2], + vectorized_vocab_base[1], + ) + + def test_warmstart_with_randomuniform_initializer(self): + vocab_base = np.array(["unk", "a", "b", "c"]) + vocab_new = np.array(["unk", "unk", "a", "b", "c", "d", "e"]) + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base, + new_vocabulary=vocab_new, + base_embeddings=vectorized_vocab_base, + new_embeddings_initializer="RandomUniform", + ) + self.assertAllEqual( + warmstarted_embedding_matrix[2], + vectorized_vocab_base[1], + ) + + def test_warmstart_with_nothing_in_common(self): + vocab_base = np.array(["unk", "a", "b", "c"]) + vocab_new = np.array(["d", "e", "f", "g", "h"]) + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + vectorized_vocab_new = np.random.rand(len(vocab_new), 3) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base, + new_vocabulary=vocab_new, + base_embeddings=vectorized_vocab_base, + new_embeddings_initializer=keras.initializers.Constant( + vectorized_vocab_new + ), + ) + self.assertAllEqual( + warmstarted_embedding_matrix, + vectorized_vocab_new, + ) + + def test_warmstart_with_new_vocab_smaller(self): + vocab_base = np.array(["unk", "a", "b", "c"]) + vocab_new = np.array(["d", "e", "f", "a"]) + vectorized_vocab_base = np.random.rand(len(vocab_base), 3) + warmstarted_embedding_matrix = layer_utils.warmstart_embedding_matrix( + base_vocabulary=vocab_base, + new_vocabulary=vocab_new, + base_embeddings=vectorized_vocab_base, + new_embeddings_initializer="uniform", + ) + self.assertAllEqual( + warmstarted_embedding_matrix[3], + vectorized_vocab_base[1], + ) + + +@test_utils.run_v2_only +class DTensorVariableSummaryTest(test_util.DTensorBaseTest): + def setUp(self): + super().setUp() + 
backend.reset_uids() + backend.enable_tf_random_generator() + tf_utils.set_random_seed(1337) + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + "CPU": dtensor.Mesh( + ["batch", "model"], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), "CPU"), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + self.replicated_2d = dtensor.Layout.replicated(self.mesh, rank=2) + self.replicated_1d = dtensor.Layout.replicated(self.mesh, rank=1) + self.sharded_2d = dtensor.Layout(["model", "batch"], self.mesh) + self.sharded_1d = dtensor.Layout(["model"], self.mesh) + + def test_model_summary(self): + layout_map = layout_map_lib.LayoutMap(mesh=self.mesh) + layout_map["d1.kernel"] = self.replicated_2d + layout_map["d1.bias"] = self.replicated_1d + layout_map["d2.kernel"] = self.sharded_2d + layout_map["d2.bias"] = self.sharded_1d + + with layout_map.scope(): + inputs = layers.Input((10,), batch_size=10) + x = layers.Dense(20, name="d1")(inputs) + x = layers.Dropout(0.1)(x) + output = layers.Dense(30, name="d2")(x) + + model = keras.Model(inputs, output) + + # For dtype = float32, following value are expected from memory stats + expected_result = {} + replicated_var_count = 10 * 20 + 20 # For d1 kernel and bias + model_batch_shard_var_count = 30 * 20 # For d2 kernel + model_shard_var_count = 30 # For d2 bias + expected_result[()] = (replicated_var_count, replicated_var_count * 4) + expected_result[("batch", "model")] = ( + model_batch_shard_var_count, + model_batch_shard_var_count * 4, + ) + expected_result[("model",)] = ( + model_shard_var_count, + model_shard_var_count * 4, + ) + + expected_total_weight_count = ( + replicated_var_count + + model_batch_shard_var_count + + model_shard_var_count + ) + expected_total_memory_size = expected_total_weight_count * 4 + + ( + total_weight_count, + total_memory_size, + per_sharing_spec_result, + ) = layer_utils.dtensor_variable_summary(model.weights) + + self.assertEqual(total_weight_count, expected_total_weight_count) + self.assertEqual(total_memory_size, expected_total_memory_size) + self.assertDictEqual(per_sharing_spec_result, expected_result) + + output_buffer = io.StringIO() + + def print_to_buffer(content): + output_buffer.write(content) + + model.summary(print_fn=print_to_buffer) + + self.assertRegex( + output_buffer.getvalue(), + f"{replicated_var_count} / {expected_total_weight_count} params " + ".* are fully replicated", + ) + self.assertRegex( + output_buffer.getvalue(), + f"{model_batch_shard_var_count} / {expected_total_weight_count} " + r"params .* are sharded based on spec .*batch.*model" + r".* across 4 devices", + ) + self.assertRegex( + output_buffer.getvalue(), + f"{model_shard_var_count} / {expected_total_weight_count} " + r"params .* are sharded based on spec .*model" + r".* across 2 devices", + ) + self.assertIn( + "Overall per device memory usage: 1.50 KB", output_buffer.getvalue() + ) + self.assertIn("Overall sharding factor: 2.21", output_buffer.getvalue()) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/optimizers/optimizer_v2/__init__.py b/keras/utils/legacy/__init__.py similarity index 69% rename from keras/optimizers/optimizer_v2/__init__.py rename to keras/utils/legacy/__init__.py index 78cb171abbaf..d4dd953bea8f 100644 --- a/keras/optimizers/optimizer_v2/__init__.py +++ b/keras/utils/legacy/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
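For context on the layer_utils tests above, this is roughly how `warmstart_embedding_matrix` is called on its own. A minimal sketch; the vocabularies, embedding width, and assertions are illustrative only and not part of the change:

import numpy as np

from keras.utils import layer_utils

base_vocab = ["unk", "a", "b", "c"]
new_vocab = ["unk", "a", "b", "c", "d", "e"]
base_embeddings = np.random.rand(len(base_vocab), 3)

# Rows for tokens shared by both vocabularies are copied over from
# `base_embeddings`; rows for the new tokens "d" and "e" are drawn from
# `new_embeddings_initializer` (default "uniform").
matrix = layer_utils.warmstart_embedding_matrix(
    base_vocabulary=base_vocab,
    new_vocabulary=new_vocab,
    base_embeddings=base_embeddings,
    new_embeddings_initializer="uniform",
)
assert matrix.shape == (len(new_vocab), 3)
np.testing.assert_allclose(matrix[1], base_embeddings[1])  # row for "a"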
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,3 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""Legacy public Keras utilities.""" + +# isort: off + +# Serialization related +from keras.saving.legacy.serialization import deserialize_keras_object +from keras.saving.legacy.serialization import serialize_keras_object diff --git a/keras/utils/losses_utils.py b/keras/utils/losses_utils.py index ab99e2115793..28a450bce298 100644 --- a/keras/utils/losses_utils.py +++ b/keras/utils/losses_utils.py @@ -12,360 +12,423 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access + """Utilities related to loss functions.""" import tensorflow.compat.v2 as tf + from keras import backend from keras.engine import keras_tensor from keras.utils import tf_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.losses.Reduction', v1=[]) +@keras_export("keras.losses.Reduction", v1=[]) class ReductionV2: - """Types of loss reduction. - - Contains the following values: - - * `AUTO`: Indicates that the reduction option will be determined by the usage - context. For almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When - used with `tf.distribute.Strategy`, outside of built-in training loops such - as `tf.keras` `compile` and `fit`, we expect reduction value to be - `SUM` or `NONE`. Using `AUTO` in that case will raise an error. - * `NONE`: No **additional** reduction is applied to the output of the wrapped - loss function. When non-scalar losses are returned to Keras functions like - `fit`/`evaluate`, the unreduced vector loss is passed to the optimizer - but the reported loss will be a scalar value. - - Caution: **Verify the shape of the outputs when using** `Reduction.NONE`. - The builtin loss functions wrapped by the loss classes reduce - one dimension (`axis=-1`, or `axis` if specified by loss function). - `Reduction.NONE` just means that no **additional** reduction is applied by - the class wrapper. For categorical losses with an example input shape of - `[batch, W, H, n_classes]` the `n_classes` dimension is reduced. For - pointwise losses you must include a dummy axis so that `[batch, W, H, 1]` - is reduced to `[batch, W, H]`. Without the dummy axis `[batch, W, H]` - will be incorrectly reduced to `[batch, W]`. - - * `SUM`: Scalar sum of weighted losses. - * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses. - This reduction type is not supported when used with - `tf.distribute.Strategy` outside of built-in training loops like `tf.keras` - `compile`/`fit`. - - You can implement 'SUM_OVER_BATCH_SIZE' using global batch size like: - ``` - with strategy.scope(): - loss_obj = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.NONE) - .... - loss = tf.reduce_sum(loss_obj(labels, predictions)) * - (1. / global_batch_size) - ``` - - Please see the [custom training guide]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for more - details on this. 
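To make the reduction semantics documented here concrete, a small sketch using the public `tf.keras` API (values are illustrative):

import numpy as np
import tensorflow as tf

y_true = tf.constant([[0.0], [1.0]])
y_pred = tf.constant([[0.4], [0.6]])

# NONE keeps one (already axis-reduced) loss value per sample.
mse_none = tf.keras.losses.MeanSquaredError(
    reduction=tf.keras.losses.Reduction.NONE
)
per_sample = mse_none(y_true, y_pred)  # shape [2] -> [0.16, 0.16]

# SUM_OVER_BATCH_SIZE reduces the same values to their scalar mean.
mse_mean = tf.keras.losses.MeanSquaredError(
    reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE
)
scalar = mse_mean(y_true, y_pred)  # shape [] -> 0.16
np.testing.assert_allclose(scalar.numpy(), per_sample.numpy().mean())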
- """ - - AUTO = 'auto' - NONE = 'none' - SUM = 'sum' - SUM_OVER_BATCH_SIZE = 'sum_over_batch_size' - - @classmethod - def all(cls): - return (cls.AUTO, cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE) - - @classmethod - def validate(cls, key): - if key not in cls.all(): - raise ValueError( - f'Invalid Reduction Key: {key}. Expected keys are "{cls.all()}"') + """Types of loss reduction. + + Contains the following values: + + * `AUTO`: Indicates that the reduction option will be determined by the + usage context. For almost all cases this uses `SUM_OVER_BATCH_SIZE`. + When used with `tf.distribute.Strategy`, outside of built-in training + loops such as `tf.keras` `compile` and `fit`, we expect reduction + value to be `SUM` or `NONE`. Using `AUTO` in that case will raise an + error. + * `NONE`: No **additional** reduction is applied to the output of the + wrapped loss function. When non-scalar losses are returned to Keras + functions like `fit`/`evaluate`, the unreduced vector loss is passed to + the optimizer but the reported loss will be a scalar value. + + Caution: **Verify the shape of the outputs when using** `Reduction.NONE`. + The builtin loss functions wrapped by the loss classes reduce one + dimension (`axis=-1`, or `axis` if specified by loss function). + `Reduction.NONE` just means that no **additional** reduction is applied + by the class wrapper. For categorical losses with an example input shape + of `[batch, W, H, n_classes]` the `n_classes` dimension is reduced. For + pointwise losses you must include a dummy axis so that `[batch, W, H, 1]` + is reduced to `[batch, W, H]`. Without the dummy axis `[batch, W, H]` + will be incorrectly reduced to `[batch, W]`. + + * `SUM`: Scalar sum of weighted losses. + * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in + losses. This reduction type is not supported when used with + `tf.distribute.Strategy` outside of built-in training loops like + `tf.keras` `compile`/`fit`. + + You can implement 'SUM_OVER_BATCH_SIZE' using global batch size like: + ``` + with strategy.scope(): + loss_obj = tf.keras.losses.CategoricalCrossentropy( + reduction=tf.keras.losses.Reduction.NONE) + .... + loss = tf.reduce_sum(loss_obj(labels, predictions)) * + (1. / global_batch_size) + ``` + + Please see the [custom training guide]( + https://www.tensorflow.org/tutorials/distribute/custom_training) for more + details on this. + """ + + AUTO = "auto" + NONE = "none" + SUM = "sum" + SUM_OVER_BATCH_SIZE = "sum_over_batch_size" + + @classmethod + def all(cls): + return (cls.AUTO, cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE) + + @classmethod + def validate(cls, key): + if key not in cls.all(): + raise ValueError( + f'Invalid Reduction Key: {key}. Expected keys are "{cls.all()}"' + ) def remove_squeezable_dimensions( - labels, predictions, expected_rank_diff=0, name=None): - """Squeeze last dim if ranks differ from expected by exactly 1. - - In the common case where we expect shapes to match, `expected_rank_diff` - defaults to 0, and we squeeze the last dimension of the larger rank if they - differ by 1. - - But, for example, if `labels` contains class IDs and `predictions` contains 1 - probability per class, we expect `predictions` to have 1 more dimension than - `labels`, so `expected_rank_diff` would be 1. In this case, we'd squeeze - `labels` if `rank(predictions) - rank(labels) == 0`, and - `predictions` if `rank(predictions) - rank(labels) == 2`. - - This will use static shape if available. 
Otherwise, it will add graph - operations, which could result in a performance hit. - - Args: - labels: Label values, a `Tensor` whose dimensions match `predictions`. - predictions: Predicted values, a `Tensor` of arbitrary dimensions. - expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`. - name: Name of the op. - - Returns: - Tuple of `labels` and `predictions`, possibly with last dim squeezed. - """ - with backend.name_scope(name or 'remove_squeezable_dimensions'): - if not tf_utils.is_tensor_or_extension_type(predictions): - predictions = tf.convert_to_tensor(predictions) - if not tf_utils.is_tensor_or_extension_type(labels): - labels = tf.convert_to_tensor(labels) - predictions_shape = predictions.shape - predictions_rank = predictions_shape.ndims - labels_shape = labels.shape - labels_rank = labels_shape.ndims - if (labels_rank is not None) and (predictions_rank is not None): - # Use static rank. - rank_diff = predictions_rank - labels_rank - if (rank_diff == expected_rank_diff + 1 and - predictions_shape.dims[-1].is_compatible_with(1)): - predictions = tf.squeeze(predictions, [-1]) - elif (rank_diff == expected_rank_diff - 1 and - labels_shape.dims[-1].is_compatible_with(1)): - labels = tf.squeeze(labels, [-1]) - return labels, predictions - - # Use dynamic rank. - rank_diff = tf.rank(predictions) - tf.rank(labels) - if (predictions_rank is None) or ( - predictions_shape.dims[-1].is_compatible_with(1)): - predictions = tf.cond( - tf.equal(expected_rank_diff + 1, rank_diff), - lambda: tf.squeeze(predictions, [-1]), - lambda: predictions) - if (labels_rank is None) or ( - labels_shape.dims[-1].is_compatible_with(1)): - labels = tf.cond( - tf.equal(expected_rank_diff - 1, rank_diff), - lambda: tf.squeeze(labels, [-1]), - lambda: labels) - return labels, predictions + labels, predictions, expected_rank_diff=0, name=None +): + """Squeeze last dim if ranks differ from expected by exactly 1. + + In the common case where we expect shapes to match, `expected_rank_diff` + defaults to 0, and we squeeze the last dimension of the larger rank if they + differ by 1. + + But, for example, if `labels` contains class IDs and `predictions` contains + 1 probability per class, we expect `predictions` to have 1 more dimension + than `labels`, so `expected_rank_diff` would be 1. In this case, we'd + squeeze `labels` if `rank(predictions) - rank(labels) == 0`, and + `predictions` if `rank(predictions) - rank(labels) == 2`. + + This will use static shape if available. Otherwise, it will add graph + operations, which could result in a performance hit. + + Args: + labels: Label values, a `Tensor` whose dimensions match `predictions`. + predictions: Predicted values, a `Tensor` of arbitrary dimensions. + expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`. + name: Name of the op. + + Returns: + Tuple of `labels` and `predictions`, possibly with last dim squeezed. + """ + with backend.name_scope(name or "remove_squeezable_dimensions"): + if not tf_utils.is_tensor_or_extension_type(predictions): + predictions = tf.convert_to_tensor(predictions) + if not tf_utils.is_tensor_or_extension_type(labels): + labels = tf.convert_to_tensor(labels) + predictions_shape = predictions.shape + predictions_rank = predictions_shape.ndims + labels_shape = labels.shape + labels_rank = labels_shape.ndims + if (labels_rank is not None) and (predictions_rank is not None): + # Use static rank. 
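A quick sketch of the squeezing rules spelled out in the docstring above; shapes are illustrative and eager execution is assumed:

import tensorflow as tf

from keras.utils import losses_utils

labels = tf.zeros([8])     # rank 1
preds = tf.zeros([8, 1])   # rank 2 with a trailing dim of size 1
_, preds_out = losses_utils.remove_squeezable_dimensions(labels, preds)
assert preds_out.shape.ndims == 1  # trailing size-1 dim was squeezed

# With integer class IDs and per-class scores, a rank difference of 1 is
# expected, so nothing is squeezed.
labels = tf.constant([0, 1, 2])
preds = tf.random.uniform([3, 5])
_, preds_out = losses_utils.remove_squeezable_dimensions(
    labels, preds, expected_rank_diff=1
)
assert preds_out.shape.ndims == 2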
+ rank_diff = predictions_rank - labels_rank + if rank_diff == expected_rank_diff + 1 and predictions_shape.dims[ + -1 + ].is_compatible_with(1): + predictions = tf.squeeze(predictions, [-1]) + elif rank_diff == expected_rank_diff - 1 and labels_shape.dims[ + -1 + ].is_compatible_with(1): + labels = tf.squeeze(labels, [-1]) + return labels, predictions + + # Use dynamic rank. + rank_diff = tf.rank(predictions) - tf.rank(labels) + if (predictions_rank is None) or ( + predictions_shape.dims[-1].is_compatible_with(1) + ): + predictions = tf.cond( + tf.equal(expected_rank_diff + 1, rank_diff), + lambda: tf.squeeze(predictions, [-1]), + lambda: predictions, + ) + if (labels_rank is None) or ( + labels_shape.dims[-1].is_compatible_with(1) + ): + labels = tf.cond( + tf.equal(expected_rank_diff - 1, rank_diff), + lambda: tf.squeeze(labels, [-1]), + lambda: labels, + ) + return labels, predictions def squeeze_or_expand_dimensions(y_pred, y_true=None, sample_weight=None): - """Squeeze or expand last dimension if needed. - - 1. Squeezes last dim of `y_pred` or `y_true` if their rank differs by 1 - (using `remove_squeezable_dimensions`). - 2. Squeezes or expands last dim of `sample_weight` if its rank differs by 1 - from the new rank of `y_pred`. - If `sample_weight` is scalar, it is kept scalar. - - This will use static shape if available. Otherwise, it will add graph - operations, which could result in a performance hit. - - Args: - y_pred: Predicted values, a `Tensor` of arbitrary dimensions. - y_true: Optional label `Tensor` whose dimensions match `y_pred`. - sample_weight: Optional weight scalar or `Tensor` whose dimensions match - `y_pred`. - - Returns: - Tuple of `y_pred`, `y_true` and `sample_weight`. Each of them possibly has - the last dimension squeezed, - `sample_weight` could be extended by one dimension. - If `sample_weight` is None, (y_pred, y_true) is returned. - """ - y_pred_shape = y_pred.shape - y_pred_rank = y_pred_shape.ndims - if y_true is not None: - - # If sparse matrix is provided as `y_true`, the last dimension in `y_pred` - # may be > 1. Eg: y_true = [0, 1, 2] (shape=(3,)), - # y_pred = [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]] (shape=(3, 3)) - # In this case, we should not try to remove squeezable dimension. - y_true_shape = y_true.shape - y_true_rank = y_true_shape.ndims - if (y_true_rank is not None) and (y_pred_rank is not None): - # Use static rank for `y_true` and `y_pred`. - if (y_pred_rank - y_true_rank != 1) or y_pred_shape[-1] == 1: - y_true, y_pred = remove_squeezable_dimensions( - y_true, y_pred) - else: - # Use dynamic rank. - rank_diff = tf.rank(y_pred) - tf.rank(y_true) - squeeze_dims = lambda: remove_squeezable_dimensions( # pylint: disable=g-long-lambda - y_true, y_pred) - is_last_dim_1 = tf.equal(1, tf.shape(y_pred)[-1]) - maybe_squeeze_dims = lambda: tf.cond( # pylint: disable=g-long-lambda - is_last_dim_1, squeeze_dims, lambda: (y_true, y_pred)) - y_true, y_pred = tf.cond( - tf.equal(1, rank_diff), maybe_squeeze_dims, squeeze_dims) - - if sample_weight is None: - return y_pred, y_true - - weights_shape = sample_weight.shape - weights_rank = weights_shape.ndims - if weights_rank == 0: # If weights is scalar, do nothing. - return y_pred, y_true, sample_weight + """Squeeze or expand last dimension if needed. + + 1. Squeezes last dim of `y_pred` or `y_true` if their rank differs by 1 + (using `remove_squeezable_dimensions`). + 2. Squeezes or expands last dim of `sample_weight` if its rank differs by 1 + from the new rank of `y_pred`. 
+ If `sample_weight` is scalar, it is kept scalar. + + This will use static shape if available. Otherwise, it will add graph + operations, which could result in a performance hit. + + Args: + y_pred: Predicted values, a `Tensor` of arbitrary dimensions. + y_true: Optional label `Tensor` whose dimensions match `y_pred`. + sample_weight: Optional weight scalar or `Tensor` whose dimensions match + `y_pred`. + + Returns: + Tuple of `y_pred`, `y_true` and `sample_weight`. Each of them possibly has + the last dimension squeezed, + `sample_weight` could be extended by one dimension. + If `sample_weight` is None, (y_pred, y_true) is returned. + """ + y_pred_shape = y_pred.shape + y_pred_rank = y_pred_shape.ndims + if y_true is not None: + + # If sparse matrix is provided as `y_true`, the last dimension in + # `y_pred` may be > 1. Eg: y_true = [0, 1, 2] (shape=(3,)), y_pred = + # [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]] (shape=(3, 3)) In + # this case, we should not try to remove squeezable dimension. + y_true_shape = y_true.shape + y_true_rank = y_true_shape.ndims + if (y_true_rank is not None) and (y_pred_rank is not None): + # Use static rank for `y_true` and `y_pred`. + if (y_pred_rank - y_true_rank != 1) or y_pred_shape[-1] == 1: + y_true, y_pred = remove_squeezable_dimensions(y_true, y_pred) + else: + # Use dynamic rank. + rank_diff = tf.rank(y_pred) - tf.rank(y_true) + squeeze_dims = lambda: remove_squeezable_dimensions(y_true, y_pred) + is_last_dim_1 = tf.equal(1, tf.shape(y_pred)[-1]) + maybe_squeeze_dims = lambda: tf.cond( + is_last_dim_1, squeeze_dims, lambda: (y_true, y_pred) + ) + y_true, y_pred = tf.cond( + tf.equal(1, rank_diff), maybe_squeeze_dims, squeeze_dims + ) + + if sample_weight is None: + return y_pred, y_true + + weights_shape = sample_weight.shape + weights_rank = weights_shape.ndims + if weights_rank == 0: # If weights is scalar, do nothing. + return y_pred, y_true, sample_weight + + if (y_pred_rank is not None) and (weights_rank is not None): + # Use static rank. + if weights_rank - y_pred_rank == 1: + sample_weight = tf.squeeze(sample_weight, [-1]) + elif y_pred_rank - weights_rank == 1: + sample_weight = tf.expand_dims(sample_weight, [-1]) + return y_pred, y_true, sample_weight - if (y_pred_rank is not None) and (weights_rank is not None): - # Use static rank. - if weights_rank - y_pred_rank == 1: - sample_weight = tf.squeeze(sample_weight, [-1]) - elif y_pred_rank - weights_rank == 1: - sample_weight = tf.expand_dims(sample_weight, [-1]) + # Use dynamic rank. + weights_rank_tensor = tf.rank(sample_weight) + rank_diff = weights_rank_tensor - tf.rank(y_pred) + maybe_squeeze_weights = lambda: tf.squeeze(sample_weight, [-1]) + + def _maybe_expand_weights(): + expand_weights = lambda: tf.expand_dims(sample_weight, [-1]) + return tf.cond( + tf.equal(rank_diff, -1), expand_weights, lambda: sample_weight + ) + + def _maybe_adjust_weights(): + return tf.cond( + tf.equal(rank_diff, 1), maybe_squeeze_weights, _maybe_expand_weights + ) + + # squeeze or expand last dim of `sample_weight` if its rank differs by 1 + # from the new rank of `y_pred`. + sample_weight = tf.cond( + tf.equal(weights_rank_tensor, 0), + lambda: sample_weight, + _maybe_adjust_weights, + ) return y_pred, y_true, sample_weight - # Use dynamic rank. 
- weights_rank_tensor = tf.rank(sample_weight) - rank_diff = weights_rank_tensor - tf.rank(y_pred) - maybe_squeeze_weights = lambda: tf.squeeze(sample_weight, [-1]) - - def _maybe_expand_weights(): - expand_weights = lambda: tf.expand_dims(sample_weight, [-1]) - return tf.cond( - tf.equal(rank_diff, -1), expand_weights, lambda: sample_weight) - - def _maybe_adjust_weights(): - return tf.cond( - tf.equal(rank_diff, 1), maybe_squeeze_weights, - _maybe_expand_weights) - - # squeeze or expand last dim of `sample_weight` if its rank differs by 1 - # from the new rank of `y_pred`. - sample_weight = tf.cond( - tf.equal(weights_rank_tensor, 0), lambda: sample_weight, - _maybe_adjust_weights) - return y_pred, y_true, sample_weight - def _safe_mean(losses, num_present): - """Computes a safe mean of the losses. + """Computes a safe mean of the losses. - Args: - losses: `Tensor` whose elements contain individual loss measurements. - num_present: The number of measurable elements in `losses`. + Args: + losses: `Tensor` whose elements contain individual loss measurements. + num_present: The number of measurable elements in `losses`. - Returns: - A scalar representing the mean of `losses`. If `num_present` is zero, - then zero is returned. - """ - total_loss = tf.reduce_sum(losses) - return tf.math.divide_no_nan(total_loss, num_present, name='value') + Returns: + A scalar representing the mean of `losses`. If `num_present` is zero, + then zero is returned. + """ + total_loss = tf.reduce_sum(losses) + return tf.math.divide_no_nan(total_loss, num_present, name="value") def _num_elements(losses): - """Computes the number of elements in `losses` tensor.""" - with backend.name_scope('num_elements') as scope: - return tf.cast(tf.size(losses, name=scope), dtype=losses.dtype) - - -def reduce_weighted_loss(weighted_losses, - reduction=ReductionV2.SUM_OVER_BATCH_SIZE): - """Reduces the individual weighted loss measurements.""" - if reduction == ReductionV2.NONE: - loss = weighted_losses - else: - loss = tf.reduce_sum(weighted_losses) - if reduction == ReductionV2.SUM_OVER_BATCH_SIZE: - loss = _safe_mean(loss, _num_elements(weighted_losses)) - return loss - - -@keras_export('keras.__internal__.losses.compute_weighted_loss', v1=[]) -def compute_weighted_loss(losses, - sample_weight=None, - reduction=ReductionV2.SUM_OVER_BATCH_SIZE, - name=None): - """Computes the weighted loss. - - Args: - losses: `Tensor` of shape `[batch_size, d1, ... dN]`. - sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as - `losses`, or be broadcastable to `losses`. - reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss. - Default value is `SUM_OVER_BATCH_SIZE`. - name: Optional name for the op. - - Raises: - ValueError: If the shape of `sample_weight` is not compatible with `losses`. - - Returns: - Weighted loss `Tensor` of the same type as `losses`. If `reduction` is - `NONE`, this has the same shape as `losses`; otherwise, it is scalar. - """ - ReductionV2.validate(reduction) - - # If this function is called directly, then we just default 'AUTO' to - # 'SUM_OVER_BATCH_SIZE'. Eg. Canned estimator use cases. - if reduction == ReductionV2.AUTO: - reduction = ReductionV2.SUM_OVER_BATCH_SIZE - if sample_weight is None: - sample_weight = 1.0 - with backend.name_scope(name or 'weighted_loss'): - # Save the `reduction` argument for loss normalization when distributing - # to multiple replicas. Used only for estimator + v1 optimizer flow. 
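A small sketch of the reduction helpers defined here; `_safe_mean` relies on `tf.math.divide_no_nan`, so an empty loss vector reduces to 0 rather than NaN (values are illustrative):

import tensorflow as tf

from keras.utils import losses_utils

weighted = tf.constant([1.0, 2.0, 3.0, 6.0])

mean_loss = losses_utils.reduce_weighted_loss(weighted)
# SUM_OVER_BATCH_SIZE (the default): 12.0 / 4 = 3.0

sum_loss = losses_utils.reduce_weighted_loss(
    weighted, reduction=losses_utils.ReductionV2.SUM
)
# SUM: 12.0

empty_loss = losses_utils.reduce_weighted_loss(tf.zeros([0]))
# Empty input: divide_no_nan(0.0, 0.0) yields 0.0, not NaN.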
-    tf.compat.v1.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
-
-    if not isinstance(losses,
-                      (keras_tensor.KerasTensor, tf.RaggedTensor)):
-      losses = tf.convert_to_tensor(losses)
-
-    if not isinstance(sample_weight,
-                      (keras_tensor.KerasTensor, tf.RaggedTensor)):
-      sample_weight = tf.convert_to_tensor(sample_weight)
-
-    # Convert any non float dtypes to floats, to avoid it loss any precision for
-    # dtype like int or bool.
-    if not losses.dtype.is_floating:
-      input_dtype = losses.dtype
-      losses = tf.cast(losses, 'float32')
-      input_casted = True
+    """Computes the number of elements in `losses` tensor."""
+    with backend.name_scope("num_elements") as scope:
+        return tf.cast(tf.size(losses, name=scope), dtype=losses.dtype)
+
+
+def reduce_weighted_loss(
+    weighted_losses, reduction=ReductionV2.SUM_OVER_BATCH_SIZE
+):
+    """Reduces the individual weighted loss measurements."""
+    if reduction == ReductionV2.NONE:
+        loss = weighted_losses
     else:
-      input_casted = False
-    sample_weight = tf.cast(sample_weight, losses.dtype)
-    # Update dimensions of `sample_weight` to match with `losses` if possible.
-    losses, _, sample_weight = squeeze_or_expand_dimensions(  # pylint: disable=unbalanced-tuple-unpacking
-        losses, None, sample_weight)
-    weighted_losses = tf.multiply(losses, sample_weight)
-
-    # Apply reduction function to the individual weighted losses.
-    loss = reduce_weighted_loss(weighted_losses, reduction)
-    if input_casted:
-      # Convert the result back to the input type.
-      loss = tf.cast(loss, input_dtype)
+        loss = tf.reduce_sum(weighted_losses)
+    if reduction == ReductionV2.SUM_OVER_BATCH_SIZE:
+        loss = _safe_mean(loss, _num_elements(weighted_losses))
     return loss


+@keras_export("keras.__internal__.losses.compute_weighted_loss", v1=[])
+def compute_weighted_loss(
+    losses,
+    sample_weight=None,
+    reduction=ReductionV2.SUM_OVER_BATCH_SIZE,
+    name=None,
+):
+    """Computes the weighted loss.
+
+    Args:
+      losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
+      sample_weight: Optional `Tensor` whose rank is either 0, or the same rank
+        as `losses`, or be broadcastable to `losses`.
+      reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
+        loss. Default value is `SUM_OVER_BATCH_SIZE`.
+      name: Optional name for the op.
+
+    Raises:
+      ValueError: If the shape of `sample_weight` is not compatible with
+        `losses`.
+
+    Returns:
+      Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
+      `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
+    """
+    ReductionV2.validate(reduction)
+
+    # If this function is called directly, then we just default 'AUTO' to
+    # 'SUM_OVER_BATCH_SIZE'. Eg. Canned estimator use cases.
+    if reduction == ReductionV2.AUTO:
+        reduction = ReductionV2.SUM_OVER_BATCH_SIZE
+    if sample_weight is None:
+        sample_weight = 1.0
+    with backend.name_scope(name or "weighted_loss"):
+        # Save the `reduction` argument for loss normalization when distributing
+        # to multiple replicas. Used only for estimator + v1 optimizer flow.
+        tf.compat.v1.get_default_graph()._last_loss_reduction = reduction
+
+        if not isinstance(losses, (keras_tensor.KerasTensor, tf.RaggedTensor)):
+            losses = tf.convert_to_tensor(losses)
+
+        if not isinstance(
+            sample_weight, (keras_tensor.KerasTensor, tf.RaggedTensor)
+        ):
+            sample_weight = tf.convert_to_tensor(sample_weight)
+
+        # Convert any non-float dtypes to floats, to avoid losing any precision
+        # for dtypes like int or bool.
+ if not losses.dtype.is_floating: + input_dtype = losses.dtype + losses = tf.cast(losses, "float32") + input_casted = True + else: + input_casted = False + sample_weight = tf.cast(sample_weight, losses.dtype) + # Update dimensions of `sample_weight` to match with `losses` if + # possible. + ( + losses, + _, + sample_weight, + ) = squeeze_or_expand_dimensions(losses, None, sample_weight) + weighted_losses = tf.multiply(losses, sample_weight) + + # Apply reduction function to the individual weighted losses. + loss = reduce_weighted_loss(weighted_losses, reduction) + if input_casted: + # Convert the result back to the input type. + loss = tf.cast(loss, input_dtype) + return loss + + def scale_loss_for_distribution(loss_value): - """Scales and returns the given loss value by the number of replicas.""" - num_replicas = ( - tf.distribute.get_strategy().num_replicas_in_sync) - if num_replicas > 1: - loss_value *= (1. / num_replicas) - return loss_value + """Scales and returns the given loss value by the number of replicas.""" + num_replicas = tf.distribute.get_strategy().num_replicas_in_sync + if num_replicas > 1: + loss_value *= 1.0 / num_replicas + return loss_value def cast_losses_to_common_dtype(losses): - """Cast a list of losses to a common dtype. - - If any loss is floating-point, they will all be casted to the most-precise - floating-point loss. Otherwise the losses are not casted. We also skip casting - losses if there are any complex losses. - - Args: - losses: A list of losses. - - Returns: - `losses`, but they have been casted to a common dtype. - """ - highest_float = None - for loss in losses: - if loss.dtype.is_floating: - if highest_float is None or loss.dtype.size > highest_float.size: - highest_float = loss.dtype - elif {loss.dtype, highest_float} == {'bfloat16', 'float16'}: - highest_float = 'float32' - if loss.dtype.is_complex: - return losses # If we find any complex losses, do not cast any losses - if highest_float: - losses = [tf.cast(loss, highest_float) for loss in losses] - return losses + """Cast a list of losses to a common dtype. + + If any loss is floating-point, they will all be casted to the most-precise + floating-point loss. Otherwise the losses are not casted. We also skip + casting losses if there are any complex losses. + + Args: + losses: A list of losses. + + Returns: + `losses`, but they have been casted to a common dtype. 
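A short sketch of the dtype promotion this helper performs (dtypes are illustrative):

import tensorflow as tf

from keras.utils import losses_utils

losses = [
    tf.constant(1.0, dtype=tf.float16),
    tf.constant(2.0, dtype=tf.float32),
]
out = losses_utils.cast_losses_to_common_dtype(losses)
# The float16 loss is promoted to float32, the most precise float present.
assert [loss.dtype for loss in out] == [tf.float32, tf.float32]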
+ """ + highest_float = None + for loss in losses: + if loss.dtype.is_floating: + if highest_float is None or loss.dtype.size > highest_float.size: + highest_float = loss.dtype + elif {loss.dtype, highest_float} == {"bfloat16", "float16"}: + highest_float = "float32" + if loss.dtype.is_complex: + return ( + losses # If we find any complex losses, do not cast any losses + ) + if highest_float: + losses = [tf.cast(loss, highest_float) for loss in losses] + return losses + + +def get_mask(y_p): + """Returns Keras mask from tensor.""" + return getattr(y_p, "_keras_mask", None) + + +def apply_mask(y_p, sw, mask): + """Applies any mask on predictions to sample weights.""" + if mask is not None: + mask = tf.cast(mask, y_p.dtype) + if sw is not None: + sw = tf.cast(sw, mask.dtype) + mask, _, sw = squeeze_or_expand_dimensions(mask, sample_weight=sw) + sw *= mask + else: + sw = mask + return sw + + +def apply_valid_mask(losses, sw, mask, reduction): + """Redistribute sample weights considering only valid entries.""" + if mask is not None: + mask = tf.cast(mask, losses.dtype) + + if reduction in (ReductionV2.AUTO, ReductionV2.SUM_OVER_BATCH_SIZE): + # Valid entries have weight `total/valid`, while invalid ones + # have 0. When summed over batch, they will be reduced to: + # + # mean(loss * sample_weight * total / valid) + # = sum(loss * sample_weight * total / valid) / total + # = sum(loss * sample_weight) / total * total / valid + # = sum(loss * sample_weight) / valid + + total = tf.cast(tf.size(mask), losses.dtype) + valid = tf.reduce_sum(mask) + mask *= total / valid + + return apply_mask(losses, sw, mask) diff --git a/keras/utils/losses_utils_test.py b/keras/utils/losses_utils_test.py index 0dfa21dfc750..03c531bf1db0 100644 --- a/keras/utils/losses_utils_test.py +++ b/keras/utils/losses_utils_test.py @@ -14,64 +14,69 @@ # ============================================================================== """Tests for losses_utils.""" +import tensorflow.compat.v2 as tf + from keras.testing_infra import test_combinations from keras.utils import losses_utils -import tensorflow.compat.v2 as tf -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class RemoveSqueezableTest(tf.test.TestCase): - """Test remove_squeezable_dimensions""" + """Test remove_squeezable_dimensions""" - def test_ragged_3d_same_shape(self): - """ shape (2, (sequence={1, 2}), 3)""" - x = tf.ragged.constant([[[1, 2, 3]], [[4, 5, 6], [7, 8, 9]]]) - rank = x.shape.ndims - x_p, _ = losses_utils.remove_squeezable_dimensions(x, x) - self.assertEqual(x_p.shape.ndims, rank) + def test_ragged_3d_same_shape(self): + """shape (2, (sequence={1, 2}), 3)""" + x = tf.ragged.constant([[[1, 2, 3]], [[4, 5, 6], [7, 8, 9]]]) + rank = x.shape.ndims + x_p, _ = losses_utils.remove_squeezable_dimensions(x, x) + self.assertEqual(x_p.shape.ndims, rank) - def test_ragged_3d_4d_squeezable(self): - """ shapes: + def test_ragged_3d_4d_squeezable(self): + """shapes: x: (2, (sequence={1, 2}), 3) y: (2, (sequence={1, 2}), 3, 1) - """ - x = tf.ragged.constant([[[1, 2, 3]], [[4, 5, 6], [7, 8, 9]]]) - y = tf.expand_dims(x, axis=-1) - self.assertEqual(x.shape.ndims, 3) - self.assertEqual(y.shape.ndims, 4) - _, y_p = losses_utils.remove_squeezable_dimensions(x, y) - y_p.shape.assert_is_compatible_with(x.shape) - self.assertEqual(y_p.shape.ndims, 3) + """ + x = tf.ragged.constant([[[1, 2, 3]], [[4, 5, 6], [7, 8, 9]]]) + y = tf.expand_dims(x, axis=-1) + 
self.assertEqual(x.shape.ndims, 3) + self.assertEqual(y.shape.ndims, 4) + _, y_p = losses_utils.remove_squeezable_dimensions(x, y) + y_p.shape.assert_is_compatible_with(x.shape) + self.assertEqual(y_p.shape.ndims, 3) - x_p, _ = losses_utils.remove_squeezable_dimensions(y, x) - x_p.shape.assert_is_compatible_with(x.shape) - self.assertEqual(x_p.shape.ndims, 3) + x_p, _ = losses_utils.remove_squeezable_dimensions(y, x) + x_p.shape.assert_is_compatible_with(x.shape) + self.assertEqual(x_p.shape.ndims, 3) - def test_dense_2d_3d_squeezable(self): - x = tf.constant([[1, 2], [3, 4]]) - y = tf.constant([[[1], [2]], [[3], [4]]]) - _, y_p = losses_utils.remove_squeezable_dimensions(x, y) - y_p.shape.assert_is_compatible_with(x.shape) - self.assertEqual(y_p.shape.ndims, x.shape.ndims) - x_p, _ = losses_utils.remove_squeezable_dimensions(y, x) - x_p.shape.assert_is_compatible_with(x.shape) + def test_dense_2d_3d_squeezable(self): + x = tf.constant([[1, 2], [3, 4]]) + y = tf.constant([[[1], [2]], [[3], [4]]]) + _, y_p = losses_utils.remove_squeezable_dimensions(x, y) + y_p.shape.assert_is_compatible_with(x.shape) + self.assertEqual(y_p.shape.ndims, x.shape.ndims) + x_p, _ = losses_utils.remove_squeezable_dimensions(y, x) + x_p.shape.assert_is_compatible_with(x.shape) class RemoveSqueezableTestGraphOnly(tf.test.TestCase): - """Test remove_squeezable_dimensions (graph-mode only).""" + """Test remove_squeezable_dimensions (graph-mode only).""" - def test_placeholder(self): - """Test dynamic rank tensors.""" - with tf.Graph().as_default(): - x = tf.compat.v1.placeholder_with_default([1., 2., 3.], shape=None) - y = tf.compat.v1.placeholder_with_default([[1.], [2.], [3.]], shape=None) - _, y_p = losses_utils.remove_squeezable_dimensions(x, y) - y_p.shape.assert_is_compatible_with(x.shape) - self.assertAllEqual(tf.shape(x), tf.shape(y_p)) - x_p, _ = losses_utils.remove_squeezable_dimensions(y, x) - x_p.shape.assert_is_compatible_with(x.shape) + def test_placeholder(self): + """Test dynamic rank tensors.""" + with tf.Graph().as_default(): + x = tf.compat.v1.placeholder_with_default( + [1.0, 2.0, 3.0], shape=None + ) + y = tf.compat.v1.placeholder_with_default( + [[1.0], [2.0], [3.0]], shape=None + ) + _, y_p = losses_utils.remove_squeezable_dimensions(x, y) + y_p.shape.assert_is_compatible_with(x.shape) + self.assertAllEqual(tf.shape(x), tf.shape(y_p)) + x_p, _ = losses_utils.remove_squeezable_dimensions(y, x) + x_p.shape.assert_is_compatible_with(x.shape) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/metrics_utils.py b/keras/utils/metrics_utils.py index 18a191709a37..0edd82d703de 100644 --- a/keras/utils/metrics_utils.py +++ b/keras/utils/metrics_utils.py @@ -12,251 +12,277 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access + """Utils related to keras metrics.""" -from enum import Enum import functools import weakref +from enum import Enum + +import numpy as np +import tensorflow.compat.v2 as tf + from keras import backend from keras.utils import losses_utils from keras.utils import tf_utils from keras.utils.generic_utils import to_list -import numpy as np -import tensorflow.compat.v2 as tf NEG_INF = -1e10 class Reduction(Enum): - """Types of metrics reduction. + """Types of metrics reduction. 
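Relatedly, the new `apply_valid_mask` helper added to losses_utils above rescales sample weights by total/valid so that a later SUM_OVER_BATCH_SIZE mean ignores masked-out entries. A sketch with illustrative values:

import tensorflow as tf

from keras.utils import losses_utils

losses = tf.constant([1.0, 2.0, 3.0, 4.0])
sw = tf.constant([1.0, 1.0, 1.0, 1.0])
mask = tf.constant([1.0, 1.0, 0.0, 0.0])  # last two entries are invalid

sw = losses_utils.apply_valid_mask(
    losses, sw, mask, losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
)
# Valid weights become total/valid = 4/2 = 2, invalid ones become 0, so the
# mean over all four entries equals the mean over the two valid ones:
loss = losses_utils.compute_weighted_loss(losses, sw)  # (1*2 + 2*2) / 4 = 1.5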
-  Contains the following values:
+    Contains the following values:

-  * `SUM`: Scalar sum of weighted values.
-  * `SUM_OVER_BATCH_SIZE`: Scalar sum of weighted values divided by
-      number of elements.
-  * `WEIGHTED_MEAN`: Scalar sum of weighted values divided by sum of weights.
-  """
-  SUM = 'sum'
-  SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
-  WEIGHTED_MEAN = 'weighted_mean'
+    * `SUM`: Scalar sum of weighted values.
+    * `SUM_OVER_BATCH_SIZE`: Scalar sum of weighted values divided by
+        number of elements.
+    * `WEIGHTED_MEAN`: Scalar sum of weighted values divided by sum of weights.
+    """
+    SUM = "sum"
+    SUM_OVER_BATCH_SIZE = "sum_over_batch_size"
+    WEIGHTED_MEAN = "weighted_mean"


-def update_state_wrapper(update_state_fn):
-  """Decorator to wrap metric `update_state()` with `add_update()`.
-  Args:
-    update_state_fn: function that accumulates metric statistics.
-
-  Returns:
-    Decorated function that wraps `update_state_fn()` with `add_update()`.
-  """
-
-  def decorated(metric_obj, *args, **kwargs):
-    """Decorated function with `add_update()`."""
-    strategy = tf.distribute.get_strategy()
-
-    for weight in metric_obj.weights:
-      if (backend.is_tpu_strategy(strategy) and
-          not strategy.extended.variable_created_in_scope(weight)
-          and not tf.distribute.in_cross_replica_context()):
-        raise ValueError(
-            'Trying to run metric.update_state in replica context when '
-            'the metric was not created in TPUStrategy scope. '
-            'Make sure the keras Metric is created in TPUstrategy scope. ')
-
-    with tf_utils.graph_context_for_symbolic_tensors(*args, **kwargs):
-      update_op = update_state_fn(*args, **kwargs)
-      if update_op is not None:  # update_op will be None in eager execution.
-        metric_obj.add_update(update_op)
-      return update_op
-
-  return tf.__internal__.decorator.make_decorator(update_state_fn, decorated)
+def update_state_wrapper(update_state_fn):
+    """Decorator to wrap metric `update_state()` with `add_update()`.
+
+    Args:
+      update_state_fn: function that accumulates metric statistics.
+
+    Returns:
+      Decorated function that wraps `update_state_fn()` with `add_update()`.
+    """
+
+    def decorated(metric_obj, *args, **kwargs):
+        """Decorated function with `add_update()`."""
+        strategy = tf.distribute.get_strategy()
+
+        for weight in metric_obj.weights:
+            if (
+                backend.is_tpu_strategy(strategy)
+                and not strategy.extended.variable_created_in_scope(weight)
+                and not tf.distribute.in_cross_replica_context()
+            ):
+                raise ValueError(
+                    "Trying to run metric.update_state in replica context when "
+                    "the metric was not created in TPUStrategy scope. "
+                    "Make sure the keras Metric is created in TPUstrategy "
+                    "scope. "
+                )
+
+        with tf_utils.graph_context_for_symbolic_tensors(*args, **kwargs):
+            update_op = update_state_fn(*args, **kwargs)
+            if update_op is not None:
+                # update_op will be None in eager execution.
+                metric_obj.add_update(update_op)
+            return update_op
+
+    return tf.__internal__.decorator.make_decorator(update_state_fn, decorated)


 def result_wrapper(result_fn):
-  """Decorator to wrap metric `result()` function in `merge_call()`.
-
-  Result computation is an idempotent operation that simply calculates the
-  metric value using the state variables.
-
-  If metric state variables are distributed across replicas/devices and
-  `result()` is requested from the context of one device - This function wraps
-  `result()` in a distribution strategy `merge_call()`. With this,
-  the metric state variables will be aggregated across devices.
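Both wrappers are applied by the `Metric` base class, so an ordinary subclass picks them up automatically. A minimal sketch (the metric itself is made up for illustration):

import tensorflow as tf

class SumOfSquares(tf.keras.metrics.Metric):
    # `Metric` wraps `update_state` with `update_state_wrapper` and `result`
    # with `result_wrapper` behind the scenes; subclasses only supply the raw
    # state updates and the state-to-value computation.
    def __init__(self, name="sum_of_squares", **kwargs):
        super().__init__(name=name, **kwargs)
        self.total = self.add_weight(name="total", initializer="zeros")

    def update_state(self, values, sample_weight=None):
        values = tf.cast(values, self.dtype)
        self.total.assign_add(tf.reduce_sum(tf.square(values)))

    def result(self):
        return self.total

m = SumOfSquares()
m.update_state([1.0, 2.0])
assert float(m.result()) == 5.0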
- - Args: - result_fn: function that computes the metric result. - - Returns: - Decorated function that wraps `result_fn()` in distribution strategy - `merge_call()`. - """ - - def decorated(metric_obj, *args): - """Decorated function with merge_call.""" - replica_context = tf.distribute.get_replica_context() - - # The purpose of using `merge_call` to call `result()` is to trigger cross - # replica aggregation of metric state variables (SyncOnReadVariable). After - # we introduced `variable_sync_on_read_context`, in principle there is no - # need to use `merge_call` here. However the branch still exists because: - # - # 1. Keras V1 training code sometimes assumes `result_t` is the same tensor - # across replicas (achieved by `merge_call`). With - # `variable_sync_on_read_context` each replica gets their own tensors - # residing on replica's device, thus breaking the assumption. - # 2. Keras c/fit creates a tf.function (a.k.a, train_function) that returns - # the metric values of the first replica. With - # `variable_sync_on_read_context` since each replica gets their own - # tensors, the metric result tensors on the non-first replicas are not in - # the return value of train_function, making TF graph optimizer prune the - # branch that computes and aggregates those metric results. As a result, - # if NCCL is used to do the aggregation, the program will hang because - # NCCL ops are only launched on the non-pruned first replica. - # - # We condition on strategy_supports_no_merge_call() since we know if it is - # True, the program uses `jit_compile` to compile replica fn, meaning it is - # not V1 training (hence #1 is okay), and no pruning will happen as - # compiled functions are not inlined (hence #2 is okay). - if (replica_context is None or - tf.__internal__.distribute.strategy_supports_no_merge_call()): - with tf.__internal__.distribute.variable_sync_on_read_context(): - raw_result = result_fn(*args) - # Results need to be wrapped in a `tf.identity` op to ensure - # correct execution order. - if isinstance(raw_result, - (tf.Tensor, tf.Variable, float, int)): - result_t = tf.identity(raw_result) - elif isinstance(raw_result, dict): - result_t = { - key: tf.identity(value) - for key, value in raw_result.items() - } + """Decorator to wrap metric `result()` function in `merge_call()`. + + Result computation is an idempotent operation that simply calculates the + metric value using the state variables. + + If metric state variables are distributed across replicas/devices and + `result()` is requested from the context of one device - This function wraps + `result()` in a distribution strategy `merge_call()`. With this, + the metric state variables will be aggregated across devices. + + Args: + result_fn: function that computes the metric result. + + Returns: + Decorated function that wraps `result_fn()` in distribution strategy + `merge_call()`. + """ + + def decorated(metric_obj, *args): + """Decorated function with merge_call.""" + replica_context = tf.distribute.get_replica_context() + + # The purpose of using `merge_call` to call `result()` is to trigger + # cross replica aggregation of metric state variables + # (SyncOnReadVariable). After we introduced + # `variable_sync_on_read_context`, in principle there is no need to use + # `merge_call` here. However the branch still exists because: + # + # 1. Keras V1 training code sometimes assumes `result_t` is the same + # tensor across replicas (achieved by `merge_call`). 
With
+        # `variable_sync_on_read_context` each replica gets their own tensors
+        # residing on replica's device, thus breaking the assumption.
+        # 2. Keras compile/fit creates a tf.function (a.k.a, train_function)
+        # that returns the metric values of the first replica. With
+        # `variable_sync_on_read_context` since each replica gets their own
+        # tensors, the metric result tensors on the non-first replicas are
+        # not in the return value of train_function, making TF graph
+        # optimizer prune the branch that computes and aggregates those
+        # metric results. As a result, if NCCL is used to do the aggregation,
+        # the program will hang because NCCL ops are only launched on the
+        # non-pruned first replica.
+        #
+        # We condition on strategy_supports_no_merge_call() since we know if it
+        # is True, the program uses `jit_compile` to compile replica fn, meaning
+        # it is not V1 training (hence #1 is okay), and no pruning will happen
+        # as compiled functions are not inlined (hence #2 is okay).
+        if (
+            replica_context is None
+            or tf.__internal__.distribute.strategy_supports_no_merge_call()
+        ):
+            with tf.__internal__.distribute.variable_sync_on_read_context():
+                raw_result = result_fn(*args)
+                # Results need to be wrapped in a `tf.identity` op to ensure
+                # correct execution order.
+                if isinstance(raw_result, (tf.Tensor, tf.Variable, float, int)):
+                    result_t = tf.identity(raw_result)
+                elif isinstance(raw_result, dict):
+                    result_t = {
+                        key: tf.identity(value)
+                        for key, value in raw_result.items()
+                    }
+                else:
+                    try:
+                        result_t = tf.identity(raw_result)
+                    except (ValueError, TypeError):
+                        raise RuntimeError(
+                            "The output of `metric.result()` can only be a "
+                            "single Tensor/Variable, or a dict of "
+                            "Tensors/Variables. "
+                            f"For metric {metric_obj.name}, "
+                            f"got result {raw_result}."
+                        )
        else:
-      try:
-        result_t = tf.identity(raw_result)
-      except (ValueError, TypeError):
-        raise RuntimeError(
-            'The output of `metric.result()` can only be a single '
-            'Tensor/Variable, or a dict of Tensors/Variables. '
-            f'For metric {metric_obj.name}, got result {raw_result}.')
-    else:
-      # TODO(psv): Test distribution of metrics using different distribution
-      # strategies.
-
-      # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
-      # with distribution object as the first parameter. We create a wrapper
-      # here so that the result function need not have that parameter.
-      def merge_fn_wrapper(distribution, merge_fn, *args):
-        # We will get `PerReplica` merge function. Taking the first one as all
-        # are identical copies of the function that we had passed below.
-        result = distribution.experimental_local_results(merge_fn)[0](*args)
-
-        # Wrapping result in identity so that control dependency between
-        # update_op from `update_state` and result works in case result returns
-        # a tensor.
-        return tf.identity(result)
-
-      # Wrapping result in merge_call. merge_call is used when we want to leave
-      # replica mode and compute a value in cross replica mode.
-      result_t = replica_context.merge_call(
-          merge_fn_wrapper, args=(result_fn,) + args)
-
-      # We are saving the result op here to be used in train/test execution
-      # functions. This basically gives the result op that was generated with a
-      # control dep to the updates for these workflows.
-      metric_obj._call_result = result_t
-      return result_t
-
-  return tf.__internal__.decorator.make_decorator(result_fn, decorated)
+            # TODO(psv): Test distribution of metrics using different
+            # distribution strategies.
+
+            # Creating a wrapper for merge_fn. merge_call invokes the given
+            # merge_fn with distribution object as the first parameter. We
+            # create a wrapper here so that the result function need not have
+            # that parameter.
+            def merge_fn_wrapper(distribution, merge_fn, *args):
+                # We will get `PerReplica` merge function. Taking the first one
+                # as all are identical copies of the function that we had passed
+                # below.
+                result = distribution.experimental_local_results(merge_fn)[0](
+                    *args
+                )
+
+                # Wrapping result in identity so that control dependency between
+                # update_op from `update_state` and result works in case result
+                # returns a tensor.
+                return tf.identity(result)
+
+            # Wrapping result in merge_call. merge_call is used when we want to
+            # leave replica mode and compute a value in cross replica mode.
+            result_t = replica_context.merge_call(
+                merge_fn_wrapper, args=(result_fn,) + args
+            )
+
+        # We are saving the result op here to be used in train/test execution
+        # functions. This basically gives the result op that was generated with
+        # a control dep to the updates for these workflows.
+        metric_obj._call_result = result_t
+        return result_t
+
+    return tf.__internal__.decorator.make_decorator(result_fn, decorated)


 def weakmethod(method):
-  """Creates a weak reference to the bound method."""
+    """Creates a weak reference to the bound method."""

-  cls = method.im_class
-  func = method.im_func
-  instance_ref = weakref.ref(method.im_self)
+    cls = method.im_class
+    func = method.im_func
+    instance_ref = weakref.ref(method.im_self)

-  @functools.wraps(method)
-  def inner(*args, **kwargs):
-    return func.__get__(instance_ref(), cls)(*args, **kwargs)
+    @functools.wraps(method)
+    def inner(*args, **kwargs):
+        return func.__get__(instance_ref(), cls)(*args, **kwargs)

-  del method
-  return inner
+    del method
+    return inner


 def assert_thresholds_range(thresholds):
-  if thresholds is not None:
-    invalid_thresholds = [t for t in thresholds if t is None or t < 0 or t > 1]
-    if invalid_thresholds:
-      raise ValueError(
-          f'Threshold values must be in [0, 1]. Received: {invalid_thresholds}')
+    if thresholds is not None:
+        invalid_thresholds = [
+            t for t in thresholds if t is None or t < 0 or t > 1
+        ]
+        if invalid_thresholds:
+            raise ValueError(
+                "Threshold values must be in [0, 1]. "
+                f"Received: {invalid_thresholds}"
+            )


 def parse_init_thresholds(thresholds, default_threshold=0.5):
-  if thresholds is not None:
-    assert_thresholds_range(to_list(thresholds))
-  thresholds = to_list(default_threshold if thresholds is None else thresholds)
-  return thresholds
+    if thresholds is not None:
+        assert_thresholds_range(to_list(thresholds))
+    thresholds = to_list(
+        default_threshold if thresholds is None else thresholds
+    )
+    return thresholds


 class ConfusionMatrix(Enum):
-  TRUE_POSITIVES = 'tp'
-  FALSE_POSITIVES = 'fp'
-  TRUE_NEGATIVES = 'tn'
-  FALSE_NEGATIVES = 'fn'
+    TRUE_POSITIVES = "tp"
+    FALSE_POSITIVES = "fp"
+    TRUE_NEGATIVES = "tn"
+    FALSE_NEGATIVES = "fn"


 class AUCCurve(Enum):
-  """Type of AUC Curve (ROC or PR)."""
-  ROC = 'ROC'
-  PR = 'PR'
-
-  @staticmethod
-  def from_str(key):
-    if key in ('pr', 'PR'):
-      return AUCCurve.PR
-    elif key in ('roc', 'ROC'):
-      return AUCCurve.ROC
-    else:
-      raise ValueError(
-          f'Invalid AUC curve value: "{key}". 
' - 'Expected values are ["PR", "ROC"]') + """Type of AUC Curve (ROC or PR).""" + + ROC = "ROC" + PR = "PR" + + @staticmethod + def from_str(key): + if key in ("pr", "PR"): + return AUCCurve.PR + elif key in ("roc", "ROC"): + return AUCCurve.ROC + else: + raise ValueError( + f'Invalid AUC curve value: "{key}". ' + 'Expected values are ["PR", "ROC"]' + ) class AUCSummationMethod(Enum): - """Type of AUC summation method. - - https://en.wikipedia.org/wiki/Riemann_sum) - - Contains the following values: - * 'interpolation': Applies mid-point summation scheme for `ROC` curve. For - `PR` curve, interpolates (true/false) positives but not the ratio that is - precision (see Davis & Goadrich 2006 for details). - * 'minoring': Applies left summation for increasing intervals and right - summation for decreasing intervals. - * 'majoring': Applies right summation for increasing intervals and left - summation for decreasing intervals. - """ - INTERPOLATION = 'interpolation' - MAJORING = 'majoring' - MINORING = 'minoring' - - @staticmethod - def from_str(key): - if key in ('interpolation', 'Interpolation'): - return AUCSummationMethod.INTERPOLATION - elif key in ('majoring', 'Majoring'): - return AUCSummationMethod.MAJORING - elif key in ('minoring', 'Minoring'): - return AUCSummationMethod.MINORING - else: - raise ValueError( - f'Invalid AUC summation method value: "{key}". ' - 'Expected values are ["interpolation", "majoring", "minoring"]') + """Type of AUC summation method. + + https://en.wikipedia.org/wiki/Riemann_sum) + + Contains the following values: + * 'interpolation': Applies mid-point summation scheme for `ROC` curve. For + `PR` curve, interpolates (true/false) positives but not the ratio that is + precision (see Davis & Goadrich 2006 for details). + * 'minoring': Applies left summation for increasing intervals and right + summation for decreasing intervals. + * 'majoring': Applies right summation for increasing intervals and left + summation for decreasing intervals. + """ + + INTERPOLATION = "interpolation" + MAJORING = "majoring" + MINORING = "minoring" + + @staticmethod + def from_str(key): + if key in ("interpolation", "Interpolation"): + return AUCSummationMethod.INTERPOLATION + elif key in ("majoring", "Majoring"): + return AUCSummationMethod.MAJORING + elif key in ("minoring", "Minoring"): + return AUCSummationMethod.MINORING + else: + raise ValueError( + f'Invalid AUC summation method value: "{key}". ' + 'Expected values are ["interpolation", "majoring", "minoring"]' + ) def _update_confusion_matrix_variables_optimized( @@ -267,659 +293,722 @@ def _update_confusion_matrix_variables_optimized( multi_label=False, sample_weights=None, label_weights=None, - thresholds_with_epsilon=False): - """Update confusion matrix variables with memory efficient alternative. - - Note that the thresholds need to be evenly distributed within the list, eg, - the diff between consecutive elements are the same. - - To compute TP/FP/TN/FN, we are measuring a binary classifier - C(t) = (predictions >= t) - at each threshold 't'. So we have - TP(t) = sum( C(t) * true_labels ) - FP(t) = sum( C(t) * false_labels ) - - But, computing C(t) requires computation for each t. To make it fast, - observe that C(t) is a cumulative integral, and so if we have - thresholds = [t_0, ..., t_{n-1}]; t_0 < ... 
< t_{n-1} - where n = num_thresholds, and if we can compute the bucket function - B(i) = Sum( (predictions == t), t_i <= t < t{i+1} ) - then we get - C(t_i) = sum( B(j), j >= i ) - which is the reversed cumulative sum in tf.cumsum(). - - We can compute B(i) efficiently by taking advantage of the fact that - our thresholds are evenly distributed, in that - width = 1.0 / (num_thresholds - 1) - thresholds = [0.0, 1*width, 2*width, 3*width, ..., 1.0] - Given a prediction value p, we can map it to its bucket by - bucket_index(p) = floor( p * (num_thresholds - 1) ) - so we can use tf.math.unsorted_segment_sum() to update the buckets in one - pass. - - Consider following example: - y_true = [0, 0, 1, 1] - y_pred = [0.1, 0.5, 0.3, 0.9] - thresholds = [0.0, 0.5, 1.0] - num_buckets = 2 # [0.0, 1.0], (1.0, 2.0] - bucket_index(y_pred) = tf.math.floor(y_pred * num_buckets) - = tf.math.floor([0.2, 1.0, 0.6, 1.8]) - = [0, 0, 0, 1] - # The meaning of this bucket is that if any of the label is true, - # then 1 will be added to the corresponding bucket with the index. - # Eg, if the label for 0.2 is true, then 1 will be added to bucket 0. If the - # label for 1.8 is true, then 1 will be added to bucket 1. - # - # Note the second item "1.0" is floored to 0, since the value need to be - # strictly larger than the bucket lower bound. - # In the implementation, we use tf.math.ceil() - 1 to achieve this. - tp_bucket_value = tf.math.unsorted_segment_sum(true_labels, bucket_indices, - num_segments=num_thresholds) - = [1, 1, 0] - # For [1, 1, 0] here, it means there is 1 true value contributed by bucket 0, - # and 1 value contributed by bucket 1. When we aggregate them to together, - # the result become [a + b + c, b + c, c], since large thresholds will always - # contribute to the value for smaller thresholds. - true_positive = tf.math.cumsum(tp_bucket_value, reverse=True) - = [2, 1, 0] - - This implementation exhibits a run time and space complexity of O(T + N), - where T is the number of thresholds and N is the size of predictions. - Metrics that rely on standard implementation instead exhibit a complexity of - O(T * N). - - Args: - variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys - and corresponding variables to update as values. - y_true: A floating point `Tensor` whose shape matches `y_pred`. Will be cast - to `bool`. - y_pred: A floating point `Tensor` of arbitrary shape and whose values are in - the range `[0, 1]`. - thresholds: A sorted floating point `Tensor` with value in `[0, 1]`. - It need to be evenly distributed (the diff between each element need to be - the same). - multi_label: Optional boolean indicating whether multidimensional - prediction/labels should be treated as multilabel responses, or flattened - into a single label. When True, the valus of `variables_to_update` must - have a second dimension equal to the number of labels in y_true and - y_pred, and those tensors must not be RaggedTensors. - sample_weights: Optional `Tensor` whose rank is either 0, or the same rank - as `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions - must be either `1`, or the same as the corresponding `y_true` dimension). - label_weights: Optional tensor of non-negative weights for multilabel - data. The weights are applied when calculating TP, FP, FN, and TN without - explicit multilabel handling (i.e. when the data is to be flattened). 
- thresholds_with_epsilon: Optional boolean indicating whether the leading and - tailing thresholds has any epsilon added for floating point imprecisions. - It will change how we handle the leading and tailing bucket. - - Returns: - Update op. - """ - num_thresholds = thresholds.shape.as_list()[0] - - if sample_weights is None: - sample_weights = 1.0 - else: - sample_weights = tf.__internal__.ops.broadcast_weights( - tf.cast(sample_weights, dtype=y_pred.dtype), y_pred) - if not multi_label: - sample_weights = tf.reshape(sample_weights, [-1]) - if label_weights is None: - label_weights = 1.0 - else: - label_weights = tf.expand_dims(label_weights, 0) - label_weights = tf.__internal__.ops.broadcast_weights(label_weights, - y_pred) + thresholds_with_epsilon=False, +): + """Update confusion matrix variables with memory efficient alternative. + + Note that the thresholds need to be evenly distributed within the list, + e.g., the difference between consecutive elements is the same. + + To compute TP/FP/TN/FN, we are measuring a binary classifier + C(t) = (predictions >= t) + at each threshold 't'. So we have + TP(t) = sum( C(t) * true_labels ) + FP(t) = sum( C(t) * false_labels ) + + But, computing C(t) requires computation for each t. To make it fast, + observe that C(t) is a cumulative integral, and so if we have + thresholds = [t_0, ..., t_{n-1}]; t_0 < ... < t_{n-1} + where n = num_thresholds, and if we can compute the bucket function + B(i) = Sum( (predictions == t), t_i <= t < t{i+1} ) + then we get + C(t_i) = sum( B(j), j >= i ) + which is the reversed cumulative sum in tf.cumsum(). + + We can compute B(i) efficiently by taking advantage of the fact that + our thresholds are evenly distributed, in that + width = 1.0 / (num_thresholds - 1) + thresholds = [0.0, 1*width, 2*width, 3*width, ..., 1.0] + Given a prediction value p, we can map it to its bucket by + bucket_index(p) = floor( p * (num_thresholds - 1) ) + so we can use tf.math.unsorted_segment_sum() to update the buckets in one + pass. + + Consider the following example: + y_true = [0, 0, 1, 1] + y_pred = [0.1, 0.5, 0.3, 0.9] + thresholds = [0.0, 0.5, 1.0] + num_buckets = 2 # [0.0, 1.0], (1.0, 2.0] + bucket_index(y_pred) = tf.math.floor(y_pred * num_buckets) + = tf.math.floor([0.2, 1.0, 0.6, 1.8]) + = [0, 0, 0, 1] + # The meaning of this bucket is that if any of the labels is true, + # then 1 will be added to the corresponding bucket with the index. + # E.g., if the label for 0.2 is true, then 1 will be added to bucket 0. If + # the label for 1.8 is true, then 1 will be added to bucket 1. + # + # Note the second item "1.0" is floored to 0, since the value needs to be + # strictly larger than the bucket lower bound. + # In the implementation, we use tf.math.ceil() - 1 to achieve this. + tp_bucket_value = tf.math.unsorted_segment_sum(true_labels, bucket_indices, + num_segments=num_thresholds) + = [1, 1, 0] + # For [1, 1, 0] here, it means there is 1 true value contributed by bucket + # 0, and 1 value contributed by bucket 1. When we aggregate them + # together, the result becomes [a + b + c, b + c, c], since larger + # thresholds will always contribute to the value for smaller thresholds. + true_positive = tf.math.cumsum(tp_bucket_value, reverse=True) + = [2, 1, 0] + + This implementation exhibits a run time and space complexity of O(T + N), + where T is the number of thresholds and N is the size of predictions. + Metrics that rely on the standard implementation instead exhibit a + complexity of O(T * N).
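# The following is an illustrative, hedged sketch (not part of the patch
# above): it reproduces the docstring's toy example in eager-mode
# TensorFlow 2 to show the one-pass bucketing plus reversed-cumsum trick.
# All names and values here are invented for illustration.
import tensorflow as tf

y_true = tf.constant([0.0, 0.0, 1.0, 1.0])
y_pred = tf.constant([0.1, 0.5, 0.3, 0.9])
num_thresholds = 3  # thresholds = [0.0, 0.5, 1.0]

# ceil(p * (n - 1)) - 1 assigns each prediction to the bucket whose lower
# bound it strictly exceeds (so 0.5 lands in bucket 0, not bucket 1).
bucket_indices = tf.cast(
    tf.math.ceil(y_pred * (num_thresholds - 1)) - 1, tf.int32
)
# One pass fills the per-bucket counts of true labels...
tp_bucket = tf.math.unsorted_segment_sum(
    data=y_true, segment_ids=bucket_indices, num_segments=num_thresholds
)
# ...and one reversed cumulative sum yields TP at every threshold: O(T + N).
tp = tf.cumsum(tp_bucket, reverse=True)
print(tp.numpy())  # [2. 1. 0.] == TP at thresholds 0.0, 0.5, 1.0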
+ + Args: + variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys + and corresponding variables to update as values. + y_true: A floating point `Tensor` whose shape matches `y_pred`. Will be + cast to `bool`. + y_pred: A floating point `Tensor` of arbitrary shape and whose values are + in the range `[0, 1]`. + thresholds: A sorted floating point `Tensor` with values in `[0, 1]`. + It needs to be evenly distributed (the difference between consecutive + elements must be the same). + multi_label: Optional boolean indicating whether multidimensional + prediction/labels should be treated as multilabel responses, or + flattened into a single label. When True, the values of + `variables_to_update` must have a second dimension equal to the number + of labels in y_true and y_pred, and those tensors must not be + RaggedTensors. + sample_weights: Optional `Tensor` whose rank is either 0, or the same rank + as `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions + must be either `1`, or the same as the corresponding `y_true` + dimension). + label_weights: Optional tensor of non-negative weights for multilabel + data. The weights are applied when calculating TP, FP, FN, and TN + without explicit multilabel handling (i.e. when the data is to be + flattened). + thresholds_with_epsilon: Optional boolean indicating whether the leading + and trailing thresholds have any epsilon added for floating point + imprecision. It will change how we handle the leading and trailing + buckets. + + Returns: + Update op. + """ + num_thresholds = thresholds.shape.as_list()[0] + + if sample_weights is None: + sample_weights = 1.0 + else: + sample_weights = tf.__internal__.ops.broadcast_weights( + tf.cast(sample_weights, dtype=y_pred.dtype), y_pred + ) + if not multi_label: + sample_weights = tf.reshape(sample_weights, [-1]) + if label_weights is None: + label_weights = 1.0 + else: + label_weights = tf.expand_dims(label_weights, 0) + label_weights = tf.__internal__.ops.broadcast_weights( + label_weights, y_pred + ) + if not multi_label: + label_weights = tf.reshape(label_weights, [-1]) + weights = tf.cast(tf.multiply(sample_weights, label_weights), y_true.dtype) + + # We shouldn't need this, but in case there are prediction values that + # are out of the range [0.0, 1.0] + y_pred = tf.clip_by_value(y_pred, clip_value_min=0.0, clip_value_max=1.0) + + y_true = tf.cast(tf.cast(y_true, tf.bool), y_true.dtype) if not multi_label: - label_weights = tf.reshape(label_weights, [-1]) - weights = tf.multiply(sample_weights, label_weights) - - # We shouldn't need this, but in case there are predict value that is out of - # the range of [0.0, 1.0] - y_pred = tf.clip_by_value(y_pred, - clip_value_min=0.0, clip_value_max=1.0) - - y_true = tf.cast(tf.cast(y_true, tf.bool), y_true.dtype) - if not multi_label: - y_true = tf.reshape(y_true, [-1]) - y_pred = tf.reshape(y_pred, [-1]) - - true_labels = tf.multiply(y_true, weights) - false_labels = tf.multiply((1.0 - y_true), weights) - - # Compute the bucket indices for each prediction value. - # Since the predict value has to be strictly greater than the thresholds, - # eg, buckets like [0, 0.5], (0.5, 1], and 0.5 belongs to first bucket. - # We have to use math.ceil(val) - 1 for the bucket. - bucket_indices = tf.math.ceil(y_pred * (num_thresholds - 1)) - 1 - - if thresholds_with_epsilon: - # In this case, the first bucket should actually take into account since - # the any prediction between [0.0, 1.0] should be larger than the first - # threshold.
We change the bucket value from -1 to 0. - bucket_indices = tf.nn.relu(bucket_indices) - - bucket_indices = tf.cast(bucket_indices, tf.int32) - - if multi_label: - # We need to run bucket segment sum for each of the label class. In the - # multi_label case, the rank of the label is 2. We first transpose it so - # that the label dim becomes the first and we can parallel run though them. - true_labels = tf.transpose(true_labels) - false_labels = tf.transpose(false_labels) - bucket_indices = tf.transpose(bucket_indices) - - def gather_bucket(label_and_bucket_index): - label, bucket_index = label_and_bucket_index[0], label_and_bucket_index[1] - return tf.math.unsorted_segment_sum( - data=label, segment_ids=bucket_index, num_segments=num_thresholds) - tp_bucket_v = tf.vectorized_map( - gather_bucket, (true_labels, bucket_indices)) - fp_bucket_v = tf.vectorized_map( - gather_bucket, (false_labels, bucket_indices)) - tp = tf.transpose( - tf.cumsum(tp_bucket_v, reverse=True, axis=1)) - fp = tf.transpose( - tf.cumsum(fp_bucket_v, reverse=True, axis=1)) - else: - tp_bucket_v = tf.math.unsorted_segment_sum( - data=true_labels, segment_ids=bucket_indices, - num_segments=num_thresholds) - fp_bucket_v = tf.math.unsorted_segment_sum( - data=false_labels, segment_ids=bucket_indices, - num_segments=num_thresholds) - tp = tf.cumsum(tp_bucket_v, reverse=True) - fp = tf.cumsum(fp_bucket_v, reverse=True) - - # fn = sum(true_labels) - tp - # tn = sum(false_labels) - fp - if (ConfusionMatrix.TRUE_NEGATIVES in variables_to_update or - ConfusionMatrix.FALSE_NEGATIVES in variables_to_update): + y_true = tf.reshape(y_true, [-1]) + y_pred = tf.reshape(y_pred, [-1]) + + true_labels = tf.multiply(y_true, weights) + false_labels = tf.multiply((1.0 - y_true), weights) + + # Compute the bucket indices for each prediction value. + # Since the prediction value has to be strictly greater than the + # thresholds, e.g., with buckets like [0, 0.5], (0.5, 1], the value 0.5 + # belongs to the first bucket. + # We have to use math.ceil(val) - 1 for the bucket. + bucket_indices = tf.math.ceil(y_pred * (num_thresholds - 1)) - 1 + + if thresholds_with_epsilon: + # In this case, the first bucket should actually be taken into + # account, since any prediction in [0.0, 1.0] should be larger than + # the first threshold. We change the bucket value from -1 to 0. + bucket_indices = tf.nn.relu(bucket_indices) + + bucket_indices = tf.cast(bucket_indices, tf.int32) + if multi_label: - total_true_labels = tf.reduce_sum(true_labels, axis=1) - total_false_labels = tf.reduce_sum(false_labels, axis=1) + # We need to run the bucket segment sum for each of the label classes. + # In the multi_label case, the rank of the label is 2. We first + # transpose it so that the label dim becomes the first and we can run + # through them in parallel.
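# An illustrative, hedged sketch (not part of the patch above) of the
# multi_label branch: transpose so the label dimension leads, then
# vectorize the per-label segment sum. Shapes and values are invented.
import tensorflow as tf

num_thresholds = 3
true_labels = tf.constant([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # [N=3, L=2]
bucket_indices = tf.constant([[0, 1], [2, 0], [1, 1]])           # [N=3, L=2]

def per_label(args):
    labels, buckets = args  # one label class at a time, each of shape [N]
    return tf.math.unsorted_segment_sum(
        data=labels, segment_ids=buckets, num_segments=num_thresholds
    )

tp_bucket_v = tf.vectorized_map(
    per_label, (tf.transpose(true_labels), tf.transpose(bucket_indices))
)  # shape [L, T]: per-label bucket counts
tp = tf.transpose(tf.cumsum(tp_bucket_v, reverse=True, axis=1))  # [T, L]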
+ true_labels = tf.transpose(true_labels) + false_labels = tf.transpose(false_labels) + bucket_indices = tf.transpose(bucket_indices) + + def gather_bucket(label_and_bucket_index): + label, bucket_index = ( + label_and_bucket_index[0], + label_and_bucket_index[1], + ) + return tf.math.unsorted_segment_sum( + data=label, + segment_ids=bucket_index, + num_segments=num_thresholds, + ) + + tp_bucket_v = tf.vectorized_map( + gather_bucket, (true_labels, bucket_indices), warn=False + ) + fp_bucket_v = tf.vectorized_map( + gather_bucket, (false_labels, bucket_indices), warn=False + ) + tp = tf.transpose(tf.cumsum(tp_bucket_v, reverse=True, axis=1)) + fp = tf.transpose(tf.cumsum(fp_bucket_v, reverse=True, axis=1)) else: - total_true_labels = tf.reduce_sum(true_labels) - total_false_labels = tf.reduce_sum(false_labels) - - update_ops = [] - if ConfusionMatrix.TRUE_POSITIVES in variables_to_update: - variable = variables_to_update[ConfusionMatrix.TRUE_POSITIVES] - update_ops.append(variable.assign_add(tp)) - if ConfusionMatrix.FALSE_POSITIVES in variables_to_update: - variable = variables_to_update[ConfusionMatrix.FALSE_POSITIVES] - update_ops.append(variable.assign_add(fp)) - if ConfusionMatrix.TRUE_NEGATIVES in variables_to_update: - variable = variables_to_update[ConfusionMatrix.TRUE_NEGATIVES] - tn = total_false_labels - fp - update_ops.append(variable.assign_add(tn)) - if ConfusionMatrix.FALSE_NEGATIVES in variables_to_update: - variable = variables_to_update[ConfusionMatrix.FALSE_NEGATIVES] - fn = total_true_labels - tp - update_ops.append(variable.assign_add(fn)) - return tf.group(update_ops) + tp_bucket_v = tf.math.unsorted_segment_sum( + data=true_labels, + segment_ids=bucket_indices, + num_segments=num_thresholds, + ) + fp_bucket_v = tf.math.unsorted_segment_sum( + data=false_labels, + segment_ids=bucket_indices, + num_segments=num_thresholds, + ) + tp = tf.cumsum(tp_bucket_v, reverse=True) + fp = tf.cumsum(fp_bucket_v, reverse=True) + + # fn = sum(true_labels) - tp + # tn = sum(false_labels) - fp + if ( + ConfusionMatrix.TRUE_NEGATIVES in variables_to_update + or ConfusionMatrix.FALSE_NEGATIVES in variables_to_update + ): + if multi_label: + total_true_labels = tf.reduce_sum(true_labels, axis=1) + total_false_labels = tf.reduce_sum(false_labels, axis=1) + else: + total_true_labels = tf.reduce_sum(true_labels) + total_false_labels = tf.reduce_sum(false_labels) + + update_ops = [] + if ConfusionMatrix.TRUE_POSITIVES in variables_to_update: + variable = variables_to_update[ConfusionMatrix.TRUE_POSITIVES] + update_ops.append(variable.assign_add(tp)) + if ConfusionMatrix.FALSE_POSITIVES in variables_to_update: + variable = variables_to_update[ConfusionMatrix.FALSE_POSITIVES] + update_ops.append(variable.assign_add(fp)) + if ConfusionMatrix.TRUE_NEGATIVES in variables_to_update: + variable = variables_to_update[ConfusionMatrix.TRUE_NEGATIVES] + tn = total_false_labels - fp + update_ops.append(variable.assign_add(tn)) + if ConfusionMatrix.FALSE_NEGATIVES in variables_to_update: + variable = variables_to_update[ConfusionMatrix.FALSE_NEGATIVES] + fn = total_true_labels - tp + update_ops.append(variable.assign_add(fn)) + return tf.group(update_ops) def is_evenly_distributed_thresholds(thresholds): - """Check if the thresholds list is evenly distributed. - - We could leverage evenly distributed thresholds to use less memory when - calculate metrcis like AUC where each individual threshold need to be - evaluated. 
- - Args: - thresholds: A python list or tuple, or 1D numpy array whose value is ranged - in [0, 1]. - - Returns: - boolean, whether the values in the inputs are evenly distributed. - """ - # Check the list value and see if it is evenly distributed. - num_thresholds = len(thresholds) - if num_thresholds < 3: - return False - even_thresholds = np.arange(num_thresholds, - dtype=np.float32) / (num_thresholds - 1) - return np.allclose(thresholds, even_thresholds, atol=backend.epsilon()) - - -def update_confusion_matrix_variables(variables_to_update, - y_true, - y_pred, - thresholds, - top_k=None, - class_id=None, - sample_weight=None, - multi_label=False, - label_weights=None, - thresholds_distributed_evenly=False): - """Returns op to update the given confusion matrix variables. - - For every pair of values in y_true and y_pred: - - true_positive: y_true == True and y_pred > thresholds - false_negatives: y_true == True and y_pred <= thresholds - true_negatives: y_true == False and y_pred <= thresholds - false_positive: y_true == False and y_pred > thresholds - - The results will be weighted and added together. When multiple thresholds are - provided, we will repeat the same for every threshold. - - For estimation of these metrics over a stream of data, the function creates an - `update_op` operation that updates the given variables. - - If `sample_weight` is `None`, weights default to 1. - Use weights of 0 to mask values. - - Args: - variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys - and corresponding variables to update as values. - y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`. - y_pred: A floating point `Tensor` of arbitrary shape and whose values are in - the range `[0, 1]`. - thresholds: A float value, float tensor, python list, or tuple of float - thresholds in `[0, 1]`, or NEG_INF (used when top_k is set). - top_k: Optional int, indicates that the positive labels should be limited to - the top k predictions. - class_id: Optional int, limits the prediction and labels to the class - specified by this argument. - sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as - `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must - be either `1`, or the same as the corresponding `y_true` dimension). - multi_label: Optional boolean indicating whether multidimensional - prediction/labels should be treated as multilabel responses, or flattened - into a single label. When True, the valus of `variables_to_update` must - have a second dimension equal to the number of labels in y_true and - y_pred, and those tensors must not be RaggedTensors. - label_weights: (optional) tensor of non-negative weights for multilabel - data. The weights are applied when calculating TP, FP, FN, and TN without - explicit multilabel handling (i.e. when the data is to be flattened). - thresholds_distributed_evenly: Boolean, whether the thresholds are evenly - distributed within the list. An optimized method will be used if this is - the case. See _update_confusion_matrix_variables_optimized() for more - details. - - Returns: - Update op. - - Raises: - ValueError: If `y_pred` and `y_true` have mismatched shapes, or if - `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if - `variables_to_update` contains invalid keys. 
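# An illustrative, hedged usage sketch (not part of the patch above) of
# what this helper computes, with invented toy values: TP(t) counts
# positives predicted strictly above t, FP(t) counts negatives above t.
import tensorflow as tf
from keras.utils import metrics_utils

tp = tf.Variable(tf.zeros(3))
fp = tf.Variable(tf.zeros(3))
metrics_utils.update_confusion_matrix_variables(
    {
        metrics_utils.ConfusionMatrix.TRUE_POSITIVES: tp,
        metrics_utils.ConfusionMatrix.FALSE_POSITIVES: fp,
    },
    y_true=tf.constant([0.0, 0.0, 1.0, 1.0]),
    y_pred=tf.constant([0.1, 0.5, 0.3, 0.9]),
    thresholds=[0.0, 0.5, 1.0],
)
print(tp.numpy(), fp.numpy())  # expected [2. 1. 0.] and [2. 0. 0.]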
- """ - if multi_label and label_weights is not None: - raise ValueError('`label_weights` for multilabel data should be handled ' - 'outside of `update_confusion_matrix_variables` when ' - '`multi_label` is True.') - if variables_to_update is None: - return - if not any( - key for key in variables_to_update if key in list(ConfusionMatrix)): - raise ValueError( - 'Please provide at least one valid confusion matrix ' - 'variable to update. Valid variable key options are: ' - f'"{list(ConfusionMatrix)}". Received: "{variables_to_update.keys()}"') - - variable_dtype = list(variables_to_update.values())[0].dtype - - y_true = tf.cast(y_true, dtype=variable_dtype) - y_pred = tf.cast(y_pred, dtype=variable_dtype) - - if thresholds_distributed_evenly: - # Check whether the thresholds has any leading or tailing epsilon added - # for floating point imprecision. The leading and tailing threshold will be - # handled bit differently as the corner case. - # At this point, thresholds should be a list/array with more than 2 items, - # and ranged between [0, 1]. See is_evenly_distributed_thresholds() for more - # details. - thresholds_with_epsilon = thresholds[0] < 0.0 or thresholds[-1] > 1.0 - - thresholds = tf.convert_to_tensor( - thresholds, dtype=variable_dtype) - num_thresholds = thresholds.shape.as_list()[0] - - if multi_label: - one_thresh = tf.equal( - tf.cast(1, dtype=tf.int32), - tf.rank(thresholds), - name='one_set_of_thresholds_cond') - else: - [y_pred, - y_true], _ = ragged_assert_compatible_and_get_flat_values([y_pred, y_true], - sample_weight) - one_thresh = tf.cast(True, dtype=tf.bool) - - invalid_keys = [ - key for key in variables_to_update if key not in list(ConfusionMatrix) - ] - if invalid_keys: - raise ValueError( - f'Invalid keys: "{invalid_keys}". ' - f'Valid variable key options are: "{list(ConfusionMatrix)}"') - - if sample_weight is None: - y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( - y_pred, y_true) - else: - sample_weight = tf.cast(sample_weight, dtype=variable_dtype) - y_pred, y_true, sample_weight = ( - losses_utils.squeeze_or_expand_dimensions( - y_pred, y_true, sample_weight=sample_weight)) - y_pred.shape.assert_is_compatible_with(y_true.shape) - - if top_k is not None: - y_pred = _filter_top_k(y_pred, top_k) - if class_id is not None: - y_true = y_true[..., class_id] - y_pred = y_pred[..., class_id] - - if thresholds_distributed_evenly: - return _update_confusion_matrix_variables_optimized( - variables_to_update, y_true, y_pred, thresholds, - multi_label=multi_label, sample_weights=sample_weight, - label_weights=label_weights, - thresholds_with_epsilon=thresholds_with_epsilon) - - pred_shape = tf.shape(y_pred) - num_predictions = pred_shape[0] - if y_pred.shape.ndims == 1: - num_labels = 1 - else: - num_labels = tf.math.reduce_prod(pred_shape[1:], axis=0) - thresh_label_tile = tf.where(one_thresh, num_labels, - tf.ones([], dtype=tf.int32)) - - # Reshape predictions and labels, adding a dim for thresholding. - if multi_label: - predictions_extra_dim = tf.expand_dims(y_pred, 0) - labels_extra_dim = tf.expand_dims( - tf.cast(y_true, dtype=tf.bool), 0) - else: - # Flatten predictions and labels when not multilabel. - predictions_extra_dim = tf.reshape(y_pred, [1, -1]) - labels_extra_dim = tf.reshape( - tf.cast(y_true, dtype=tf.bool), [1, -1]) - - # Tile the thresholds for every prediction. 
- if multi_label: - thresh_pretile_shape = [num_thresholds, 1, -1] - thresh_tiles = [1, num_predictions, thresh_label_tile] - data_tiles = [num_thresholds, 1, 1] - else: - thresh_pretile_shape = [num_thresholds, -1] - thresh_tiles = [1, num_predictions * num_labels] - data_tiles = [num_thresholds, 1] - - thresh_tiled = tf.tile( - tf.reshape(thresholds, thresh_pretile_shape), - tf.stack(thresh_tiles)) - - # Tile the predictions for every threshold. - preds_tiled = tf.tile(predictions_extra_dim, data_tiles) - - # Compare predictions and threshold. - pred_is_pos = tf.greater(preds_tiled, thresh_tiled) - - # Tile labels by number of thresholds - label_is_pos = tf.tile(labels_extra_dim, data_tiles) - - if sample_weight is not None: - sample_weight = tf.__internal__.ops.broadcast_weights( - tf.cast(sample_weight, dtype=variable_dtype), y_pred) - weights_tiled = tf.tile( - tf.reshape(sample_weight, thresh_tiles), data_tiles) - else: - weights_tiled = None - - if label_weights is not None and not multi_label: - label_weights = tf.expand_dims(label_weights, 0) - label_weights = tf.__internal__.ops.broadcast_weights(label_weights, - y_pred) - label_weights_tiled = tf.tile( - tf.reshape(label_weights, thresh_tiles), data_tiles) - if weights_tiled is None: - weights_tiled = label_weights_tiled + """Check if the thresholds list is evenly distributed. + + We could leverage evenly distributed thresholds to use less memory when + calculating metrics like AUC, where each individual threshold needs to be + evaluated. + + Args: + thresholds: A python list or tuple, or 1D numpy array whose values are + in the range [0, 1]. + + Returns: + boolean, whether the values in the inputs are evenly distributed. + """ + # Check the list value and see if it is evenly distributed. + num_thresholds = len(thresholds) + if num_thresholds < 3: + return False + even_thresholds = np.arange(num_thresholds, dtype=np.float32) / ( + num_thresholds - 1 + ) + return np.allclose(thresholds, even_thresholds, atol=backend.epsilon()) + + +def update_confusion_matrix_variables( + variables_to_update, + y_true, + y_pred, + thresholds, + top_k=None, + class_id=None, + sample_weight=None, + multi_label=False, + label_weights=None, + thresholds_distributed_evenly=False, +): + """Returns op to update the given confusion matrix variables. + + For every pair of values in y_true and y_pred: + + true_positive: y_true == True and y_pred > thresholds + false_negatives: y_true == True and y_pred <= thresholds + true_negatives: y_true == False and y_pred <= thresholds + false_positive: y_true == False and y_pred > thresholds + + The results will be weighted and added together. When multiple thresholds + are provided, we will repeat the same for every threshold. + + For estimation of these metrics over a stream of data, the function creates + an `update_op` operation that updates the given variables. + + If `sample_weight` is `None`, weights default to 1. + Use weights of 0 to mask values. + + Args: + variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys + and corresponding variables to update as values. + y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`. + y_pred: A floating point `Tensor` of arbitrary shape and whose values are + in the range `[0, 1]`. + thresholds: A float value, float tensor, python list, or tuple of float + thresholds in `[0, 1]`, or NEG_INF (used when top_k is set). + top_k: Optional int, indicates that the positive labels should be limited + to the top k predictions.
+ class_id: Optional int, limits the prediction and labels to the class + specified by this argument. + sample_weight: Optional `Tensor` whose rank is either 0, or the same rank + as `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions + must be either `1`, or the same as the corresponding `y_true` + dimension). + multi_label: Optional boolean indicating whether multidimensional + prediction/labels should be treated as multilabel responses, or + flattened into a single label. When True, the values of + `variables_to_update` must have a second dimension equal to the number + of labels in y_true and y_pred, and those tensors must not be + RaggedTensors. + label_weights: (optional) tensor of non-negative weights for multilabel + data. The weights are applied when calculating TP, FP, FN, and TN + without explicit multilabel handling (i.e. when the data is to be + flattened). + thresholds_distributed_evenly: Boolean, whether the thresholds are evenly + distributed within the list. An optimized method will be used if this is + the case. See _update_confusion_matrix_variables_optimized() for more + details. + + Returns: + Update op. + + Raises: + ValueError: If `y_pred` and `y_true` have mismatched shapes, or if + `sample_weight` is not `None` and its shape doesn't match `y_pred`, or + if `variables_to_update` contains invalid keys. + """ + if multi_label and label_weights is not None: + raise ValueError( + "`label_weights` for multilabel data should be handled " + "outside of `update_confusion_matrix_variables` when " + "`multi_label` is True." + ) + if variables_to_update is None: + return + if not any( + key for key in variables_to_update if key in list(ConfusionMatrix) + ): + raise ValueError( + "Please provide at least one valid confusion matrix " + "variable to update. Valid variable key options are: " + f'"{list(ConfusionMatrix)}". ' + f'Received: "{variables_to_update.keys()}"' + ) + + variable_dtype = list(variables_to_update.values())[0].dtype + + y_true = tf.cast(y_true, dtype=variable_dtype) + y_pred = tf.cast(y_pred, dtype=variable_dtype) + + if thresholds_distributed_evenly: + # Check whether the thresholds have any leading or trailing epsilon + # added for floating point imprecision. The leading and trailing + # thresholds will be handled a bit differently as corner cases. At this + # point, thresholds should be a list/array with more than 2 items, with + # values in [0, 1]. See is_evenly_distributed_thresholds() for more + # details. + thresholds_with_epsilon = thresholds[0] < 0.0 or thresholds[-1] > 1.0 + + thresholds = tf.convert_to_tensor(thresholds, dtype=variable_dtype) + num_thresholds = thresholds.shape.as_list()[0] + + if multi_label: + one_thresh = tf.equal( + tf.cast(1, dtype=tf.int32), + tf.rank(thresholds), + name="one_set_of_thresholds_cond", + ) + else: + [y_pred, y_true], _ = ragged_assert_compatible_and_get_flat_values( + [y_pred, y_true], sample_weight + ) + one_thresh = tf.cast(True, dtype=tf.bool) + + invalid_keys = [ + key for key in variables_to_update if key not in list(ConfusionMatrix) + ] + if invalid_keys: + raise ValueError( + f'Invalid keys: "{invalid_keys}".
' + f'Valid variable key options are: "{list(ConfusionMatrix)}"' + ) + + if sample_weight is None: + y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true + ) + else: + sample_weight = tf.cast(sample_weight, dtype=variable_dtype) + ( + y_pred, + y_true, + sample_weight, + ) = losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true, sample_weight=sample_weight + ) + y_pred.shape.assert_is_compatible_with(y_true.shape) + + if top_k is not None: + y_pred = _filter_top_k(y_pred, top_k) + if class_id is not None: + # Preserve dimension to match with sample_weight + y_true = y_true[..., class_id, None] + y_pred = y_pred[..., class_id, None] + + if thresholds_distributed_evenly: + return _update_confusion_matrix_variables_optimized( + variables_to_update, + y_true, + y_pred, + thresholds, + multi_label=multi_label, + sample_weights=sample_weight, + label_weights=label_weights, + thresholds_with_epsilon=thresholds_with_epsilon, + ) + + pred_shape = tf.shape(y_pred) + num_predictions = pred_shape[0] + if y_pred.shape.ndims == 1: + num_labels = 1 else: - weights_tiled = tf.multiply(weights_tiled, label_weights_tiled) + num_labels = tf.math.reduce_prod(pred_shape[1:], axis=0) + thresh_label_tile = tf.where( + one_thresh, num_labels, tf.ones([], dtype=tf.int32) + ) - update_ops = [] + # Reshape predictions and labels, adding a dim for thresholding. + if multi_label: + predictions_extra_dim = tf.expand_dims(y_pred, 0) + labels_extra_dim = tf.expand_dims(tf.cast(y_true, dtype=tf.bool), 0) + else: + # Flatten predictions and labels when not multilabel. + predictions_extra_dim = tf.reshape(y_pred, [1, -1]) + labels_extra_dim = tf.reshape(tf.cast(y_true, dtype=tf.bool), [1, -1]) - def weighted_assign_add(label, pred, weights, var): - label_and_pred = tf.cast( - tf.logical_and(label, pred), dtype=var.dtype) - if weights is not None: - label_and_pred *= tf.cast(weights, dtype=var.dtype) - return var.assign_add(tf.reduce_sum(label_and_pred, 1)) + # Tile the thresholds for every prediction. + if multi_label: + thresh_pretile_shape = [num_thresholds, 1, -1] + thresh_tiles = [1, num_predictions, thresh_label_tile] + data_tiles = [num_thresholds, 1, 1] + else: + thresh_pretile_shape = [num_thresholds, -1] + thresh_tiles = [1, num_predictions * num_labels] + data_tiles = [num_thresholds, 1] + + thresh_tiled = tf.tile( + tf.reshape(thresholds, thresh_pretile_shape), tf.stack(thresh_tiles) + ) + + # Tile the predictions for every threshold. + preds_tiled = tf.tile(predictions_extra_dim, data_tiles) + + # Compare predictions and threshold. 
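# An illustrative, hedged sketch (not part of the patch above) of the
# fallback O(T * N) path: thresholds are tiled against the flattened
# predictions so one elementwise greater() covers every pair. The values
# here are invented for illustration.
import tensorflow as tf

y_pred = tf.constant([0.1, 0.5, 0.3, 0.9])  # N = 4 flattened predictions
thresholds = tf.constant([0.0, 0.5, 1.0])   # T = 3

preds_tiled = tf.tile(tf.reshape(y_pred, [1, -1]), [3, 1])       # [T, N]
thresh_tiled = tf.tile(tf.reshape(thresholds, [3, -1]), [1, 4])  # [T, N]
pred_is_pos = tf.greater(preds_tiled, thresh_tiled)
# Row t of pred_is_pos holds (y_pred > thresholds[t]) for all predictions.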
+ pred_is_pos = tf.greater(preds_tiled, thresh_tiled) + + # Tile labels by number of thresholds + label_is_pos = tf.tile(labels_extra_dim, data_tiles) + + if sample_weight is not None: + sample_weight = tf.__internal__.ops.broadcast_weights( + tf.cast(sample_weight, dtype=variable_dtype), y_pred + ) + weights_tiled = tf.tile( + tf.reshape(sample_weight, thresh_tiles), data_tiles + ) + else: + weights_tiled = None + + if label_weights is not None and not multi_label: + label_weights = tf.expand_dims(label_weights, 0) + label_weights = tf.__internal__.ops.broadcast_weights( + label_weights, y_pred + ) + label_weights_tiled = tf.tile( + tf.reshape(label_weights, thresh_tiles), data_tiles + ) + if weights_tiled is None: + weights_tiled = label_weights_tiled + else: + weights_tiled = tf.multiply(weights_tiled, label_weights_tiled) + + update_ops = [] - loop_vars = { - ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos), - } - update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update - update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update - update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update + def weighted_assign_add(label, pred, weights, var): + label_and_pred = tf.cast(tf.logical_and(label, pred), dtype=var.dtype) + if weights is not None: + label_and_pred *= tf.cast(weights, dtype=var.dtype) + return var.assign_add(tf.reduce_sum(label_and_pred, 1)) - if update_fn or update_tn: - pred_is_neg = tf.logical_not(pred_is_pos) - loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg) + loop_vars = { + ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos), + } + update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update + update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update + update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update - if update_fp or update_tn: - label_is_neg = tf.logical_not(label_is_pos) - loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos) - if update_tn: - loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg) + if update_fn or update_tn: + pred_is_neg = tf.logical_not(pred_is_pos) + loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg) - for matrix_cond, (label, pred) in loop_vars.items(): + if update_fp or update_tn: + label_is_neg = tf.logical_not(label_is_pos) + loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos) + if update_tn: + loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = ( + label_is_neg, + pred_is_neg, + ) - if matrix_cond in variables_to_update: - update_ops.append( - weighted_assign_add(label, pred, weights_tiled, - variables_to_update[matrix_cond])) + for matrix_cond, (label, pred) in loop_vars.items(): - return tf.group(update_ops) + if matrix_cond in variables_to_update: + update_ops.append( + weighted_assign_add( + label, pred, weights_tiled, variables_to_update[matrix_cond] + ) + ) + + return tf.group(update_ops) def _filter_top_k(x, k): - """Filters top-k values in the last dim of x and set the rest to NEG_INF. + """Filters top-k values in the last dim of x and set the rest to NEG_INF. - Used for computing top-k prediction values in dense labels (which has the same - shape as predictions) for recall and precision top-k metrics. + Used for computing top-k prediction values in dense labels (which has the + same shape as predictions) for recall and precision top-k metrics. - Args: - x: tensor with any dimensions. - k: the number of values to keep. + Args: + x: tensor with any dimensions. 
+ k: the number of values to keep. - Returns: - tensor with same shape and dtype as x. - """ - _, top_k_idx = tf.math.top_k(x, k, sorted=False) - top_k_mask = tf.reduce_sum( - tf.one_hot(top_k_idx, tf.shape(x)[-1], axis=-1), axis=-2) - return x * top_k_mask + NEG_INF * (1 - top_k_mask) + Returns: + tensor with same shape and dtype as x. + """ + _, top_k_idx = tf.math.top_k(x, k, sorted=False) + top_k_mask = tf.reduce_sum( + tf.one_hot(top_k_idx, tf.shape(x)[-1], axis=-1), axis=-2 + ) + return x * top_k_mask + NEG_INF * (1 - top_k_mask) def ragged_assert_compatible_and_get_flat_values(values, mask=None): - """If ragged, it checks the compatibility and then returns the flat_values. - - Note: If two tensors are dense, it does not check their compatibility. - Note: Although two ragged tensors with different ragged ranks could have - identical overall rank and dimension sizes and hence be compatible, - we do not support those cases. - Args: - values: A list of potentially ragged tensor of the same ragged_rank. - mask: A potentially ragged tensor of the same ragged_rank as elements in - Values. - - Returns: - A tuple in which the first element is the list of tensors and the second - is the mask tensor. ([Values], mask). Mask and the element in Values - are equal to the flat_values of the input arguments (if they were ragged). - """ - if isinstance(values, list): - is_all_ragged = \ - all(isinstance(rt, tf.RaggedTensor) for rt in values) - is_any_ragged = \ - any(isinstance(rt, tf.RaggedTensor) for rt in values) - else: - is_all_ragged = isinstance(values, tf.RaggedTensor) - is_any_ragged = is_all_ragged - if (is_all_ragged and - ((mask is None) or isinstance(mask, tf.RaggedTensor))): - to_be_stripped = False - if not isinstance(values, list): - values = [values] - to_be_stripped = True - - # NOTE: we leave the flat_values compatibility to - # tf.TensorShape `assert_is_compatible_with` - # check if both dynamic dimensions are equal and then use the flat_values. - nested_row_split_list = [rt.nested_row_splits for rt in values] - assertion_list = _assert_splits_match(nested_row_split_list) - - # if both are ragged sample_weights also should be ragged with same dims. - if isinstance(mask, tf.RaggedTensor): - assertion_list_for_mask = _assert_splits_match( - [nested_row_split_list[0], mask.nested_row_splits]) - with tf.control_dependencies(assertion_list_for_mask): - mask = tf.expand_dims(mask.flat_values, -1) - - # values has at least 1 element. - flat_values = [] - for value in values: - with tf.control_dependencies(assertion_list): - flat_values.append(tf.expand_dims(value.flat_values, -1)) - - values = flat_values[0] if to_be_stripped else flat_values - - elif is_any_ragged: - raise TypeError('Some of the inputs are not tf.RaggedTensor. ' - f'Input received: {values}') - # values are empty or value are not ragged and mask is ragged. - elif isinstance(mask, tf.RaggedTensor): - raise TypeError('Ragged mask is not allowed with non-ragged inputs. ' - f'Input received: {values}, mask received: {mask}') - - return values, mask + """If ragged, it checks the compatibility and then returns the flat_values. + + Note: If two tensors are dense, it does not check their compatibility. + Note: Although two ragged tensors with different ragged ranks could have + identical overall rank and dimension sizes and hence be compatible, + we do not support those cases. + Args: + values: A list of potentially ragged tensor of the same ragged_rank. 
+ mask: A potentially ragged tensor of the same ragged_rank as elements in + Values. + + Returns: + A tuple in which the first element is the list of tensors and the second + is the mask tensor. ([Values], mask). Mask and the element in Values + are equal to the flat_values of the input arguments (if they were + ragged). + """ + if isinstance(values, list): + is_all_ragged = all(isinstance(rt, tf.RaggedTensor) for rt in values) + is_any_ragged = any(isinstance(rt, tf.RaggedTensor) for rt in values) + else: + is_all_ragged = isinstance(values, tf.RaggedTensor) + is_any_ragged = is_all_ragged + if is_all_ragged and ((mask is None) or isinstance(mask, tf.RaggedTensor)): + to_be_stripped = False + if not isinstance(values, list): + values = [values] + to_be_stripped = True + + # NOTE: we leave the flat_values compatibility to + # tf.TensorShape `assert_is_compatible_with` check if both dynamic + # dimensions are equal and then use the flat_values. + nested_row_split_list = [rt.nested_row_splits for rt in values] + assertion_list = _assert_splits_match(nested_row_split_list) + + # if both are ragged sample_weights also should be ragged with same + # dims. + if isinstance(mask, tf.RaggedTensor): + assertion_list_for_mask = _assert_splits_match( + [nested_row_split_list[0], mask.nested_row_splits] + ) + with tf.control_dependencies(assertion_list_for_mask): + mask = tf.expand_dims(mask.flat_values, -1) + + # values has at least 1 element. + flat_values = [] + for value in values: + with tf.control_dependencies(assertion_list): + flat_values.append(tf.expand_dims(value.flat_values, -1)) + + values = flat_values[0] if to_be_stripped else flat_values + + elif is_any_ragged: + raise TypeError( + "Some of the inputs are not tf.RaggedTensor. " + f"Input received: {values}" + ) + # values are empty or value are not ragged and mask is ragged. + elif isinstance(mask, tf.RaggedTensor): + raise TypeError( + "Ragged mask is not allowed with non-ragged inputs. " + f"Input received: {values}, mask received: {mask}" + ) + + return values, mask def _assert_splits_match(nested_splits_lists): - """Checks that the given splits lists are identical. - - Performs static tests to ensure that the given splits lists are identical, - and returns a list of control dependency op tensors that check that they are - fully identical. - - Args: - nested_splits_lists: A list of nested_splits_lists, where each split_list is - a list of `splits` tensors from a `RaggedTensor`, ordered from outermost - ragged dimension to innermost ragged dimension. - - Returns: - A list of control dependency op tensors. - Raises: - ValueError: If the splits are not identical. - """ - error_msg = ('Inputs must have identical ragged splits. ' - f'Input received: {nested_splits_lists}') - for splits_list in nested_splits_lists: - if len(splits_list) != len(nested_splits_lists[0]): - raise ValueError(error_msg) - return [ - tf.debugging.assert_equal(s1, s2, message=error_msg) # pylint: disable=g-complex-comprehension - for splits_list in nested_splits_lists[1:] - for (s1, s2) in zip(nested_splits_lists[0], splits_list) - ] + """Checks that the given splits lists are identical. + + Performs static tests to ensure that the given splits lists are identical, + and returns a list of control dependency op tensors that check that they are + fully identical. 
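# An illustrative, hedged example (not part of the patch above) of the
# ragged helper: compatible ragged inputs are reduced to dense flat values
# with a trailing unit dimension added.
import tensorflow as tf
from keras.utils import metrics_utils

x = tf.ragged.constant([[1.0, 2.0], [3.0]])
y = tf.ragged.constant([[0.1, 0.2], [0.3]])
[xf, yf], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values(
    [x, y]
)
print(xf.shape, yf.shape)  # both dense, shape (3, 1)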
+ + Args: + nested_splits_lists: A list of nested_splits_lists, where each split_list + is a list of `splits` tensors from a `RaggedTensor`, ordered from + outermost ragged dimension to innermost ragged dimension. + + Returns: + A list of control dependency op tensors. + Raises: + ValueError: If the splits are not identical. + """ + error_msg = ( + "Inputs must have identical ragged splits. " + f"Input received: {nested_splits_lists}" + ) + for splits_list in nested_splits_lists: + if len(splits_list) != len(nested_splits_lists[0]): + raise ValueError(error_msg) + return [ + tf.debugging.assert_equal(s1, s2, message=error_msg) + for splits_list in nested_splits_lists[1:] + for (s1, s2) in zip(nested_splits_lists[0], splits_list) + ] def binary_matches(y_true, y_pred, threshold=0.5): - """Creates int Tensor, 1 for label-prediction match, 0 for mismatch. + """Creates int Tensor, 1 for label-prediction match, 0 for mismatch. - Args: - y_true: Ground truth values, of shape (batch_size, d0, .. dN). - y_pred: The predicted values, of shape (batch_size, d0, .. dN). - threshold: (Optional) Float representing the threshold for deciding whether - prediction values are 1 or 0. + Args: + y_true: Ground truth values, of shape (batch_size, d0, .. dN). + y_pred: The predicted values, of shape (batch_size, d0, .. dN). + threshold: (Optional) Float representing the threshold for deciding + whether prediction values are 1 or 0. - Returns: - Binary matches, of shape (batch_size, d0, .. dN). - """ - y_pred = tf.convert_to_tensor(y_pred) - threshold = tf.cast(threshold, y_pred.dtype) - y_pred = tf.cast(y_pred > threshold, y_pred.dtype) - return tf.cast(tf.equal(y_true, y_pred), backend.floatx()) + Returns: + Binary matches, of shape (batch_size, d0, .. dN). + """ + y_pred = tf.convert_to_tensor(y_pred) + threshold = tf.cast(threshold, y_pred.dtype) + y_pred = tf.cast(y_pred > threshold, y_pred.dtype) + return tf.cast(tf.equal(y_true, y_pred), backend.floatx()) def sparse_categorical_matches(y_true, y_pred): - """Creates float Tensor, 1.0 for label-prediction match, 0.0 for mismatch. - - You can provide logits of classes as `y_pred`, since argmax of - logits and probabilities are same. - - Args: - y_true: Integer ground truth values. - y_pred: The prediction values. - - Returns: - Match tensor: 1.0 for label-prediction match, 0.0 for mismatch. - """ - reshape_matches = False - y_pred = tf.convert_to_tensor(y_pred) - y_true = tf.convert_to_tensor(y_true) - y_true_org_shape = tf.shape(y_true) - y_pred_rank = y_pred.shape.ndims - y_true_rank = y_true.shape.ndims - - # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,) - if (y_true_rank is not None) and (y_pred_rank is not None) and (len( - backend.int_shape(y_true)) == len(backend.int_shape(y_pred))): - y_true = tf.squeeze(y_true, [-1]) - reshape_matches = True - y_pred = tf.math.argmax(y_pred, axis=-1) - - # If the predicted output and actual output types don't match, force cast them - # to match. - if backend.dtype(y_pred) != backend.dtype(y_true): - y_pred = tf.cast(y_pred, backend.dtype(y_true)) - matches = tf.cast(tf.equal(y_true, y_pred), backend.floatx()) - if reshape_matches: - matches = tf.reshape(matches, shape=y_true_org_shape) - return matches + """Creates float Tensor, 1.0 for label-prediction match, 0.0 for mismatch. + + You can provide logits of classes as `y_pred`, since argmax of + logits and probabilities are same. + + Args: + y_true: Integer ground truth values. + y_pred: The prediction values. 
+ + Returns: + Match tensor: 1.0 for label-prediction match, 0.0 for mismatch. + """ + reshape_matches = False + y_pred = tf.convert_to_tensor(y_pred) + y_true = tf.convert_to_tensor(y_true) + y_true_org_shape = tf.shape(y_true) + y_pred_rank = y_pred.shape.ndims + y_true_rank = y_true.shape.ndims + + # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,) + if ( + (y_true_rank is not None) + and (y_pred_rank is not None) + and (len(backend.int_shape(y_true)) == len(backend.int_shape(y_pred))) + ): + y_true = tf.squeeze(y_true, [-1]) + reshape_matches = True + y_pred = tf.math.argmax(y_pred, axis=-1) + + # If the predicted output and actual output types don't match, force cast + # them to match. + if backend.dtype(y_pred) != backend.dtype(y_true): + y_pred = tf.cast(y_pred, backend.dtype(y_true)) + matches = tf.cast(tf.equal(y_true, y_pred), backend.floatx()) + if reshape_matches: + matches = tf.reshape(matches, shape=y_true_org_shape) + return matches def sparse_top_k_categorical_matches(y_true, y_pred, k=5): - """Creates float Tensor, 1.0 for label-TopK_prediction match, 0.0 for mismatch. - - Args: - y_true: tensor of true targets. - y_pred: tensor of predicted targets. - k: (Optional) Number of top elements to look at for computing accuracy. - Defaults to 5. - - Returns: - Match tensor: 1.0 for label-prediction match, 0.0 for mismatch. - """ - reshape_matches = False - y_true = tf.convert_to_tensor(y_true) - y_pred = tf.convert_to_tensor(y_pred) - y_true_rank = y_true.shape.ndims - y_pred_rank = y_pred.shape.ndims - y_true_org_shape = tf.shape(y_true) - - # Flatten y_pred to (batch_size, num_samples) and y_true to (num_samples,) - if (y_true_rank is not None) and (y_pred_rank is not None): - if y_pred_rank > 2: - y_pred = tf.reshape(y_pred, [-1, y_pred.shape[-1]]) - if y_true_rank > 1: - reshape_matches = True - y_true = tf.reshape(y_true, [-1]) - - matches = tf.cast( - tf.math.in_top_k( - predictions=y_pred, targets=tf.cast(y_true, 'int32'), k=k), - dtype=backend.floatx()) - - # returned matches is expected to have same shape as y_true input - if reshape_matches: - return tf.reshape(matches, shape=y_true_org_shape) - - return matches + """Creates float Tensor, 1.0 for label-TopK_prediction match, 0.0 for + mismatch. + + Args: + y_true: tensor of true targets. + y_pred: tensor of predicted targets. + k: (Optional) Number of top elements to look at for computing accuracy. + Defaults to `5`. + + Returns: + Match tensor: 1.0 for label-prediction match, 0.0 for mismatch. 
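# An illustrative, hedged example (not part of the patch above) of the
# top-k matching: a label counts as a match when it appears among the k
# highest scores of its row. Values are invented for illustration.
import tensorflow as tf

y_true = tf.constant([1, 0])                              # integer labels
y_pred = tf.constant([[0.2, 0.3, 0.5], [0.6, 0.3, 0.1]])  # class scores
matches = tf.cast(
    tf.math.in_top_k(predictions=y_pred, targets=y_true, k=2),
    tf.float32,
)
print(matches.numpy())  # [1. 1.]: label 1 is in row 0's top 2, label 0 in row 1's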
+ """ + reshape_matches = False + y_true = tf.convert_to_tensor(y_true) + y_pred = tf.convert_to_tensor(y_pred) + y_true_rank = y_true.shape.ndims + y_pred_rank = y_pred.shape.ndims + y_true_org_shape = tf.shape(y_true) + + # Flatten y_pred to (batch_size, num_samples) and y_true to (num_samples,) + if (y_true_rank is not None) and (y_pred_rank is not None): + if y_pred_rank > 2: + y_pred = tf.reshape(y_pred, [-1, y_pred.shape[-1]]) + if y_true_rank > 1: + reshape_matches = True + y_true = tf.reshape(y_true, [-1]) + + matches = tf.cast( + tf.math.in_top_k( + predictions=y_pred, targets=tf.cast(y_true, "int32"), k=k + ), + dtype=backend.floatx(), + ) + + # returned matches is expected to have same shape as y_true input + if reshape_matches: + return tf.reshape(matches, shape=y_true_org_shape) + + return matches diff --git a/keras/utils/metrics_utils_test.py b/keras/utils/metrics_utils_test.py index 42284a06a953..e099781b4fb7 100644 --- a/keras/utils/metrics_utils_test.py +++ b/keras/utils/metrics_utils_test.py @@ -14,422 +14,535 @@ # ============================================================================== """Tests for metrics_utils.""" +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized from keras import backend from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils from keras.utils import metrics_utils -import numpy as np -import tensorflow.compat.v2 as tf - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class RaggedSizeOpTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters([ - { - 'x_list': [1], - 'y_list': [2] - }, - { - 'x_list': [1, 2], - 'y_list': [2, 3] - }, - { - 'x_list': [1, 2, 4], - 'y_list': [2, 3, 5] - }, - { - 'x_list': [[1, 2], [3, 4]], - 'y_list': [[2, 3], [5, 6]] - }, - ]) - def test_passing_dense_tensors(self, x_list, y_list): - x = tf.constant(x_list) - y = tf.constant(y_list) - [x, - y], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x, y]) - x.shape.assert_is_compatible_with(y.shape) - - @parameterized.parameters([ - { - 'x_list': [1], - }, - { - 'x_list': [1, 2], - }, - { - 'x_list': [1, 2, 4], - }, - { - 'x_list': [[1, 2], [3, 4]], - }, - ]) - def test_passing_one_dense_tensor(self, x_list): - x = tf.constant(x_list) - [x], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x]) - - @parameterized.parameters([ - { - 'x_list': [1], - 'y_list': [2] - }, - { - 'x_list': [1, 2], - 'y_list': [2, 3] - }, - { - 'x_list': [1, 2, 4], - 'y_list': [2, 3, 5] - }, - { - 'x_list': [[1, 2], [3, 4]], - 'y_list': [[2, 3], [5, 6]] - }, - { - 'x_list': [[1, 2], [3, 4], [1]], - 'y_list': [[2, 3], [5, 6], [3]] - }, - { - 'x_list': [[1, 2], [], [1]], - 'y_list': [[2, 3], [], [3]] - }, - ]) - def test_passing_both_ragged(self, x_list, y_list): - x = tf.ragged.constant(x_list) - y = tf.ragged.constant(y_list) - [x, - y], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x, y]) - x.shape.assert_is_compatible_with(y.shape) - - @parameterized.parameters([ - { - 'x_list': [1], - }, - { - 'x_list': [1, 2], - }, - { - 'x_list': [1, 2, 4], - }, - { - 'x_list': [[1, 2], [3, 4]], - }, - { - 'x_list': [[1, 2], [3, 4], [1]], - }, - { - 'x_list': [[1, 2], [], [1]], - }, - ]) - def test_passing_one_ragged(self, x_list): - x = tf.ragged.constant(x_list) - [x], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x]) - - 
@parameterized.parameters([ - { - 'x_list': [1], - 'y_list': [2], - 'mask_list': [0] - }, - { - 'x_list': [1, 2], - 'y_list': [2, 3], - 'mask_list': [0, 1] - }, - { - 'x_list': [1, 2, 4], - 'y_list': [2, 3, 5], - 'mask_list': [1, 1, 1] - }, - { - 'x_list': [[1, 2], [3, 4]], - 'y_list': [[2, 3], [5, 6]], - 'mask_list': [[1, 1], [0, 1]] - }, - { - 'x_list': [[1, 2], [3, 4], [1]], - 'y_list': [[2, 3], [5, 6], [3]], - 'mask_list': [[1, 1], [0, 0], [1]] - }, - { - 'x_list': [[1, 2], [], [1]], - 'y_list': [[2, 3], [], [3]], - 'mask_list': [[1, 1], [], [0]] - }, - ]) - def test_passing_both_ragged_with_mask(self, x_list, y_list, mask_list): - x = tf.ragged.constant(x_list) - y = tf.ragged.constant(y_list) - mask = tf.ragged.constant(mask_list) - [x, y], mask = \ - metrics_utils.ragged_assert_compatible_and_get_flat_values([x, y], mask) - x.shape.assert_is_compatible_with(y.shape) - y.shape.assert_is_compatible_with(mask.shape) - - @parameterized.parameters([ - { - 'x_list': [1], - 'mask_list': [0] - }, - { - 'x_list': [1, 2], - 'mask_list': [0, 1] - }, - { - 'x_list': [1, 2, 4], - 'mask_list': [1, 1, 1] - }, - { - 'x_list': [[1, 2], [3, 4]], - 'mask_list': [[1, 1], [0, 1]] - }, - { - 'x_list': [[1, 2], [3, 4], [1]], - 'mask_list': [[1, 1], [0, 0], [1]] - }, - { - 'x_list': [[1, 2], [], [1]], - 'mask_list': [[1, 1], [], [0]] - }, - ]) - def test_passing_one_ragged_with_mask(self, x_list, mask_list): - x = tf.ragged.constant(x_list) - mask = tf.ragged.constant(mask_list) - [x], mask = \ - metrics_utils.ragged_assert_compatible_and_get_flat_values([x], mask) - x.shape.assert_is_compatible_with(mask.shape) - - @parameterized.parameters([ - { - 'x_list': [[[1, 3]]], - 'y_list': [[2, 3]] - }, - ]) - def test_failing_different_ragged_and_dense_ranks(self, x_list, y_list): - x = tf.ragged.constant(x_list) - y = tf.ragged.constant(y_list) - with self.assertRaises(ValueError): # pylint: disable=g-error-prone-assert-raises - [x, y - ], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x, y]) - - @parameterized.parameters([ - { - 'x_list': [[[1, 3]]], - 'y_list': [[[2, 3]]], - 'mask_list': [[0, 1]] - }, - ]) - def test_failing_different_mask_ranks(self, x_list, y_list, mask_list): - x = tf.ragged.constant(x_list) - y = tf.ragged.constant(y_list) - mask = tf.ragged.constant(mask_list) - with self.assertRaises(ValueError): # pylint: disable=g-error-prone-assert-raises - [x, y - ], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x, y], - mask) - - # we do not support such cases that ragged_ranks are different but overall - # dimension shapes and sizes are identical due to adding too much performance - # overheads to the overall use cases. 
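# An illustrative, hedged sketch (not part of the patch) of the unsupported
# case described above: wrapping a dense tensor in one ragged dimension
# yields a different ragged_rank (and different nested_row_splits) than a
# fully ragged constant, even when the overall dimension sizes agree.
import tensorflow as tf

dt = tf.constant([[[1, 2]]])
x = tf.RaggedTensor.from_row_splits(dt, row_splits=[0, 1])  # ragged_rank 1
y = tf.ragged.constant([[[[1, 2]]]])                        # ragged_rank 3
print(x.ragged_rank, y.ragged_rank)  # 1 3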
- def test_failing_different_ragged_ranks(self): - dt = tf.constant([[[1, 2]]]) - # adding a ragged dimension - x = tf.RaggedTensor.from_row_splits(dt, row_splits=[0, 1]) - y = tf.ragged.constant([[[[1, 2]]]]) - with self.assertRaises(ValueError): # pylint: disable=g-error-prone-assert-raises - [x, y], _ = \ - metrics_utils.ragged_assert_compatible_and_get_flat_values([x, y]) - - -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) + @parameterized.parameters( + [ + {"x_list": [1], "y_list": [2]}, + {"x_list": [1, 2], "y_list": [2, 3]}, + {"x_list": [1, 2, 4], "y_list": [2, 3, 5]}, + {"x_list": [[1, 2], [3, 4]], "y_list": [[2, 3], [5, 6]]}, + ] + ) + def test_passing_dense_tensors(self, x_list, y_list): + x = tf.constant(x_list) + y = tf.constant(y_list) + [x, y], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values( + [x, y] + ) + x.shape.assert_is_compatible_with(y.shape) + + @parameterized.parameters( + [ + { + "x_list": [1], + }, + { + "x_list": [1, 2], + }, + { + "x_list": [1, 2, 4], + }, + { + "x_list": [[1, 2], [3, 4]], + }, + ] + ) + def test_passing_one_dense_tensor(self, x_list): + x = tf.constant(x_list) + [x], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x]) + + @parameterized.parameters( + [ + {"x_list": [1], "y_list": [2]}, + {"x_list": [1, 2], "y_list": [2, 3]}, + {"x_list": [1, 2, 4], "y_list": [2, 3, 5]}, + {"x_list": [[1, 2], [3, 4]], "y_list": [[2, 3], [5, 6]]}, + {"x_list": [[1, 2], [3, 4], [1]], "y_list": [[2, 3], [5, 6], [3]]}, + {"x_list": [[1, 2], [], [1]], "y_list": [[2, 3], [], [3]]}, + ] + ) + def test_passing_both_ragged(self, x_list, y_list): + x = tf.ragged.constant(x_list) + y = tf.ragged.constant(y_list) + [x, y], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values( + [x, y] + ) + x.shape.assert_is_compatible_with(y.shape) + + @parameterized.parameters( + [ + { + "x_list": [1], + }, + { + "x_list": [1, 2], + }, + { + "x_list": [1, 2, 4], + }, + { + "x_list": [[1, 2], [3, 4]], + }, + { + "x_list": [[1, 2], [3, 4], [1]], + }, + { + "x_list": [[1, 2], [], [1]], + }, + ] + ) + def test_passing_one_ragged(self, x_list): + x = tf.ragged.constant(x_list) + [x], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values([x]) + + @parameterized.parameters( + [ + {"x_list": [1], "y_list": [2], "mask_list": [0]}, + {"x_list": [1, 2], "y_list": [2, 3], "mask_list": [0, 1]}, + {"x_list": [1, 2, 4], "y_list": [2, 3, 5], "mask_list": [1, 1, 1]}, + { + "x_list": [[1, 2], [3, 4]], + "y_list": [[2, 3], [5, 6]], + "mask_list": [[1, 1], [0, 1]], + }, + { + "x_list": [[1, 2], [3, 4], [1]], + "y_list": [[2, 3], [5, 6], [3]], + "mask_list": [[1, 1], [0, 0], [1]], + }, + { + "x_list": [[1, 2], [], [1]], + "y_list": [[2, 3], [], [3]], + "mask_list": [[1, 1], [], [0]], + }, + ] + ) + def test_passing_both_ragged_with_mask(self, x_list, y_list, mask_list): + x = tf.ragged.constant(x_list) + y = tf.ragged.constant(y_list) + mask = tf.ragged.constant(mask_list) + [ + x, + y, + ], mask = metrics_utils.ragged_assert_compatible_and_get_flat_values( + [x, y], mask + ) + x.shape.assert_is_compatible_with(y.shape) + y.shape.assert_is_compatible_with(mask.shape) + + @parameterized.parameters( + [ + {"x_list": [1], "mask_list": [0]}, + {"x_list": [1, 2], "mask_list": [0, 1]}, + {"x_list": [1, 2, 4], "mask_list": [1, 1, 1]}, + {"x_list": [[1, 2], [3, 4]], "mask_list": [[1, 1], [0, 1]]}, + { + "x_list": [[1, 2], [3, 4], [1]], + "mask_list": [[1, 1], [0, 0], [1]], + }, + {"x_list": [[1, 2], [], [1]], "mask_list": [[1, 1], [], 
[0]]}, + ] + ) + def test_passing_one_ragged_with_mask(self, x_list, mask_list): + x = tf.ragged.constant(x_list) + mask = tf.ragged.constant(mask_list) + [x], mask = metrics_utils.ragged_assert_compatible_and_get_flat_values( + [x], mask + ) + x.shape.assert_is_compatible_with(mask.shape) + + @parameterized.parameters( + [ + {"x_list": [[[1, 3]]], "y_list": [[2, 3]]}, + ] + ) + def test_failing_different_ragged_and_dense_ranks(self, x_list, y_list): + x = tf.ragged.constant(x_list) + y = tf.ragged.constant(y_list) + with self.assertRaises(ValueError): + [ + x, + y, + ], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values( + [x, y] + ) + + @parameterized.parameters( + [ + {"x_list": [[[1, 3]]], "y_list": [[[2, 3]]], "mask_list": [[0, 1]]}, + ] + ) + def test_failing_different_mask_ranks(self, x_list, y_list, mask_list): + x = tf.ragged.constant(x_list) + y = tf.ragged.constant(y_list) + mask = tf.ragged.constant(mask_list) + with self.assertRaises(ValueError): + [ + x, + y, + ], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values( + [x, y], mask + ) + + # we do not support such cases that ragged_ranks are different but overall + # dimension shapes and sizes are identical due to adding too much + # performance overheads to the overall use cases. + def test_failing_different_ragged_ranks(self): + dt = tf.constant([[[1, 2]]]) + # adding a ragged dimension + x = tf.RaggedTensor.from_row_splits(dt, row_splits=[0, 1]) + y = tf.ragged.constant([[[[1, 2]]]]) + with self.assertRaises(ValueError): + [ + x, + y, + ], _ = metrics_utils.ragged_assert_compatible_and_get_flat_values( + [x, y] + ) + + +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class FilterTopKTest(tf.test.TestCase, parameterized.TestCase): - - def test_one_dimensional(self): - x = tf.constant([.3, .1, .2, -.5, 42.]) - top_1 = self.evaluate(metrics_utils._filter_top_k(x=x, k=1)) - top_2 = self.evaluate(metrics_utils._filter_top_k(x=x, k=2)) - top_3 = self.evaluate(metrics_utils._filter_top_k(x=x, k=3)) - - self.assertAllClose(top_1, [ - metrics_utils.NEG_INF, metrics_utils.NEG_INF, metrics_utils.NEG_INF, - metrics_utils.NEG_INF, 42. - ]) - self.assertAllClose(top_2, [ - .3, metrics_utils.NEG_INF, metrics_utils.NEG_INF, metrics_utils.NEG_INF, - 42. - ]) - self.assertAllClose( - top_3, [.3, metrics_utils.NEG_INF, .2, metrics_utils.NEG_INF, 42.]) - - def test_three_dimensional(self): - x = tf.constant([[[.3, .1, .2], [-.3, -.2, -.1]], - [[5., .2, 42.], [-.3, -.6, -.99]]]) - top_2 = self.evaluate(metrics_utils._filter_top_k(x=x, k=2)) - - self.assertAllClose( - top_2, - [[[.3, metrics_utils.NEG_INF, .2], [metrics_utils.NEG_INF, -.2, -.1]], - [[5., metrics_utils.NEG_INF, 42.], [-.3, -.6, metrics_utils.NEG_INF]]]) - - def test_handles_dynamic_shapes(self): - # See b/150281686. # GOOGLE_INTERNAL - - def _identity(x): - return x - - def _filter_top_k(x): - # This loses the static shape. - x = tf.numpy_function(_identity, (x,), tf.float32) - - return metrics_utils._filter_top_k(x=x, k=2) - - x = tf.constant([.3, .1, .2, -.5, 42.]) - top_2 = self.evaluate(_filter_top_k(x)) - self.assertAllClose(top_2, [ - .3, metrics_utils.NEG_INF, metrics_utils.NEG_INF, metrics_utils.NEG_INF, - 42. 
- ]) + def test_one_dimensional(self): + x = tf.constant([0.3, 0.1, 0.2, -0.5, 42.0]) + top_1 = self.evaluate(metrics_utils._filter_top_k(x=x, k=1)) + top_2 = self.evaluate(metrics_utils._filter_top_k(x=x, k=2)) + top_3 = self.evaluate(metrics_utils._filter_top_k(x=x, k=3)) + + self.assertAllClose( + top_1, + [ + metrics_utils.NEG_INF, + metrics_utils.NEG_INF, + metrics_utils.NEG_INF, + metrics_utils.NEG_INF, + 42.0, + ], + ) + self.assertAllClose( + top_2, + [ + 0.3, + metrics_utils.NEG_INF, + metrics_utils.NEG_INF, + metrics_utils.NEG_INF, + 42.0, + ], + ) + self.assertAllClose( + top_3, + [0.3, metrics_utils.NEG_INF, 0.2, metrics_utils.NEG_INF, 42.0], + ) + + def test_three_dimensional(self): + x = tf.constant( + [ + [[0.3, 0.1, 0.2], [-0.3, -0.2, -0.1]], + [[5.0, 0.2, 42.0], [-0.3, -0.6, -0.99]], + ] + ) + top_2 = self.evaluate(metrics_utils._filter_top_k(x=x, k=2)) + + self.assertAllClose( + top_2, + [ + [ + [0.3, metrics_utils.NEG_INF, 0.2], + [metrics_utils.NEG_INF, -0.2, -0.1], + ], + [ + [5.0, metrics_utils.NEG_INF, 42.0], + [-0.3, -0.6, metrics_utils.NEG_INF], + ], + ], + ) + + def test_handles_dynamic_shapes(self): + # See b/150281686. # GOOGLE_INTERNAL + + def _identity(x): + return x + + def _filter_top_k(x): + # This loses the static shape. + x = tf.numpy_function(_identity, (x,), tf.float32) + + return metrics_utils._filter_top_k(x=x, k=2) + + x = tf.constant([0.3, 0.1, 0.2, -0.5, 42.0]) + top_2 = self.evaluate(_filter_top_k(x)) + self.assertAllClose( + top_2, + [ + 0.3, + metrics_utils.NEG_INF, + metrics_utils.NEG_INF, + metrics_utils.NEG_INF, + 42.0, + ], + ) class MatchesMethodsTest(tf.test.TestCase, parameterized.TestCase): - - def test_sparse_categorical_matches(self): - matches_method = metrics_utils.sparse_categorical_matches - - # Test return tensor is type float - y_true = tf.constant(np.random.randint(0, 7, (6,))) - y_pred = tf.constant(np.random.random((6, 7))) - self.assertEqual(matches_method(y_true, y_pred).dtype, backend.floatx()) - - # Tests that resulting Tensor always has same shape as y_true. 
Tests from - # 1 dim to 4 dims - dims = [] - for _ in range(4): - dims.append(np.random.randint(1, 7)) - y_true = tf.constant(np.random.randint(0, 7, dims)) - y_pred = tf.constant(np.random.random(dims + [3])) - self.assertEqual( - matches_method(y_true, y_pred).shape, y_true.shape) - - # Test correctness if the shape of y_true is (num_samples,) - y_true = tf.constant([1., 0., 0., 0.]) - y_pred = tf.constant([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]]) - self.assertAllEqual( - matches_method(y_true, y_pred), [0., 1., 1., 1.]) - - # Test correctness if the shape of y_true is (num_samples, 1) - y_true = tf.constant([[1.], [0.], [0.], [0.]]) - y_pred = tf.constant([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]]) - self.assertAllEqual( - matches_method(y_true, y_pred), [[0.], [1.], [1.], [1.]]) - - # Test correctness if the shape of y_true is (batch_size, seq_length) and - # y_pred is (batch_size, seq_length, num_classes) - y_pred = tf.constant([[[0.2, 0.3, 0.1], [0.1, 0.2, 0.7]], - [[0.3, 0.2, 0.1], [0.7, 0.2, 0.1]]]) - y_true = tf.constant([[1, 0], [1, 0]]) - self.assertAllEqual( - matches_method(y_true, y_pred), [[1., 0.], [0., 1.]]) - - def test_sparse_top_k_categorical_matches(self): - matches_method = metrics_utils.sparse_top_k_categorical_matches - - # Test return tensor is type float - y_true = tf.constant(np.random.randint(0, 7, (6,))) - y_pred = tf.constant(np.random.random((6, 7)), dtype=tf.float32) - self.assertEqual( - matches_method(y_true, y_pred, 1).dtype, backend.floatx()) - - # Tests that resulting Tensor always has same shape as y_true. Tests from - # 1 dim to 4 dims - dims = [] - for _ in range(4): - dims.append(np.random.randint(1, 7)) - y_true = tf.constant(np.random.randint(0, 7, dims)) - y_pred = tf.constant(np.random.random(dims + [3]), dtype=tf.float32) - self.assertEqual( - matches_method(y_true, y_pred, 1).shape, y_true.shape) - - # Test correctness if the shape of y_true is (num_samples,) for k = 1,2,3 - y_true = tf.constant([1., 0., 0., 0.]) - y_pred = tf.constant([[0.7, 0.2, 0.1], [0.5, 0.3, 0.2], [0.6, 0.3, 0.1], - [0.0, 0.1, 0.9]]) - self.assertAllEqual( - matches_method(y_true, y_pred, 1), [0., 1., 1., 0.]) - self.assertAllEqual( - matches_method(y_true, y_pred, 2), [1., 1., 1., 0.]) - self.assertAllEqual( - matches_method(y_true, y_pred, 3), [1., 1., 1., 1.]) - - # Test correctness if the shape of y_true is (num_samples, 1) - # for k = 1,2,3 - y_true = tf.constant([[1.], [0.], [0.], [0.]]) - y_pred = tf.constant([[0.7, 0.2, 0.1], [0.5, 0.3, 0.2], [0.6, 0.3, 0.1], - [0.0, 0.1, 0.9]]) - self.assertAllEqual( - matches_method(y_true, y_pred, 1), - [[0.], [1.], [1.], [0.]]) - self.assertAllEqual( - matches_method(y_true, y_pred, 2), - [[1.], [1.], [1.], [0.]]) - self.assertAllEqual( - matches_method(y_true, y_pred, 3), - [[1.], [1.], [1.], [1.]]) - - # Test correctness if the shape of y_true is (batch_size, seq_length) and - # y_pred is (batch_size, seq_length, num_classes) for k = 1,2,3 - y_pred = tf.constant([[[0.2, 0.3, 0.1], [0.1, 0.2, 0.7]], - [[0.3, 0.2, 0.1], [0.7, 0.2, 0.1]]]) - y_true = tf.constant([[1, 0], [1, 0]]) - self.assertAllEqual( - matches_method(y_true, y_pred, 1), [[1., 0.], [0., 1.]]) - self.assertAllEqual( - matches_method(y_true, y_pred, 2), [[1., 0.], [1., 1.]]) - self.assertAllEqual( - matches_method(y_true, y_pred, 3), [[1., 1.], [1., 1.]]) - - def test_binary_matches(self): - matches_method = metrics_utils.binary_matches - - # Test return tensor is type float - y_true = tf.constant(np.random.random((6, 7))) - y_pred = 
tf.constant(np.random.random((6, 7))) - self.assertEqual( - matches_method(y_true, y_pred, .5).dtype, - backend.floatx()) - - # Tests that resulting Tensor always has same shape as y_true. Tests from - # 1 dim to 4 dims. - dims = [] - for _ in range(4): - dims.append(np.random.randint(1, 7)) - y_true = y_pred = tf.constant(np.random.random(dims)) - self.assertEqual( - matches_method(y_true, y_pred, 0.).shape, y_true.shape) - - # Testing for correctness shape (num_samples, 1) - y_true = tf.constant([[1.], [0.], [1.], [1.]]) - y_pred = tf.constant([[.75], [.2], [.2], [.75]]) - self.assertAllEqual( - matches_method(y_true, y_pred, .5), - [[1.], [1.], [0.], [1.]]) - - # Testing for correctness shape (num_samples,) - y_true = tf.constant([1., 0., 1., 1.]) - y_pred = tf.constant([.75, .2, .2, .75]) - self.assertAllEqual( - matches_method(y_true, y_pred, .5), [1., 1., 0., 1.]) - - # Testing for correctness batches of sequences - # shape (num_samples, seq_len) - y_true = tf.constant([[1., 0.], [0., 1.], [1., 0.], [1., 0.]]) - y_pred = tf.constant([[.75, .2], [.2, .75], [.2, .75], [.75, .2]]) - self.assertAllEqual( - matches_method(y_true, y_pred, .5), - [[1., 1.], [1., 1.], [0., 0.], [1., 1.]]) - - -if __name__ == '__main__': - tf.test.main() + def test_sparse_categorical_matches(self): + matches_method = metrics_utils.sparse_categorical_matches + + # Test return tensor is type float + y_true = tf.constant(np.random.randint(0, 7, (6,))) + y_pred = tf.constant(np.random.random((6, 7))) + self.assertEqual(matches_method(y_true, y_pred).dtype, backend.floatx()) + + # Tests that resulting Tensor always has same shape as y_true. Tests + # from 1 dim to 4 dims + dims = [] + for _ in range(4): + dims.append(np.random.randint(1, 7)) + y_true = tf.constant(np.random.randint(0, 7, dims)) + y_pred = tf.constant(np.random.random(dims + [3])) + self.assertEqual(matches_method(y_true, y_pred).shape, y_true.shape) + + # Test correctness if the shape of y_true is (num_samples,) + y_true = tf.constant([1.0, 0.0, 0.0, 0.0]) + y_pred = tf.constant([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]]) + self.assertAllEqual( + matches_method(y_true, y_pred), [0.0, 1.0, 1.0, 1.0] + ) + + # Test correctness if the shape of y_true is (num_samples, 1) + y_true = tf.constant([[1.0], [0.0], [0.0], [0.0]]) + y_pred = tf.constant([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]]) + self.assertAllEqual( + matches_method(y_true, y_pred), [[0.0], [1.0], [1.0], [1.0]] + ) + + # Test correctness if the shape of y_true is (batch_size, seq_length) + # and y_pred is (batch_size, seq_length, num_classes) + y_pred = tf.constant( + [ + [[0.2, 0.3, 0.1], [0.1, 0.2, 0.7]], + [[0.3, 0.2, 0.1], [0.7, 0.2, 0.1]], + ] + ) + y_true = tf.constant([[1, 0], [1, 0]]) + self.assertAllEqual( + matches_method(y_true, y_pred), [[1.0, 0.0], [0.0, 1.0]] + ) + + def test_sparse_top_k_categorical_matches(self): + matches_method = metrics_utils.sparse_top_k_categorical_matches + + # Test return tensor is type float + y_true = tf.constant(np.random.randint(0, 7, (6,))) + y_pred = tf.constant(np.random.random((6, 7)), dtype=tf.float32) + self.assertEqual( + matches_method(y_true, y_pred, 1).dtype, backend.floatx() + ) + + # Tests that resulting Tensor always has same shape as y_true. 
Tests + # from 1 dim to 4 dims + dims = [] + for _ in range(4): + dims.append(np.random.randint(1, 7)) + y_true = tf.constant(np.random.randint(0, 7, dims)) + y_pred = tf.constant(np.random.random(dims + [3]), dtype=tf.float32) + self.assertEqual( + matches_method(y_true, y_pred, 1).shape, y_true.shape + ) + + # Test correctness if the shape of y_true is (num_samples,) for k = + # 1,2,3 + y_true = tf.constant([1.0, 0.0, 0.0, 0.0]) + y_pred = tf.constant( + [[0.7, 0.2, 0.1], [0.5, 0.3, 0.2], [0.6, 0.3, 0.1], [0.0, 0.1, 0.9]] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 1), [0.0, 1.0, 1.0, 0.0] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 2), [1.0, 1.0, 1.0, 0.0] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 3), [1.0, 1.0, 1.0, 1.0] + ) + + # Test correctness if the shape of y_true is (num_samples, 1) + # for k = 1,2,3 + y_true = tf.constant([[1.0], [0.0], [0.0], [0.0]]) + y_pred = tf.constant( + [[0.7, 0.2, 0.1], [0.5, 0.3, 0.2], [0.6, 0.3, 0.1], [0.0, 0.1, 0.9]] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 1), [[0.0], [1.0], [1.0], [0.0]] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 2), [[1.0], [1.0], [1.0], [0.0]] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 3), [[1.0], [1.0], [1.0], [1.0]] + ) + + # Test correctness if the shape of y_true is (batch_size, seq_length) + # and y_pred is (batch_size, seq_length, num_classes) for k = 1,2,3 + y_pred = tf.constant( + [ + [[0.2, 0.3, 0.1], [0.1, 0.2, 0.7]], + [[0.3, 0.2, 0.1], [0.7, 0.2, 0.1]], + ] + ) + y_true = tf.constant([[1, 0], [1, 0]]) + self.assertAllEqual( + matches_method(y_true, y_pred, 1), [[1.0, 0.0], [0.0, 1.0]] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 2), [[1.0, 0.0], [1.0, 1.0]] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 3), [[1.0, 1.0], [1.0, 1.0]] + ) + + def test_binary_matches(self): + matches_method = metrics_utils.binary_matches + + # Test return tensor is type float + y_true = tf.constant(np.random.random((6, 7))) + y_pred = tf.constant(np.random.random((6, 7))) + self.assertEqual( + matches_method(y_true, y_pred, 0.5).dtype, backend.floatx() + ) + + # Tests that resulting Tensor always has same shape as y_true. Tests + # from 1 dim to 4 dims. 
+ dims = [] + for _ in range(4): + dims.append(np.random.randint(1, 7)) + y_true = y_pred = tf.constant(np.random.random(dims)) + self.assertEqual( + matches_method(y_true, y_pred, 0.0).shape, y_true.shape + ) + + # Testing for correctness shape (num_samples, 1) + y_true = tf.constant([[1.0], [0.0], [1.0], [1.0]]) + y_pred = tf.constant([[0.75], [0.2], [0.2], [0.75]]) + self.assertAllEqual( + matches_method(y_true, y_pred, 0.5), [[1.0], [1.0], [0.0], [1.0]] + ) + + # Testing for correctness shape (num_samples,) + y_true = tf.constant([1.0, 0.0, 1.0, 1.0]) + y_pred = tf.constant([0.75, 0.2, 0.2, 0.75]) + self.assertAllEqual( + matches_method(y_true, y_pred, 0.5), [1.0, 1.0, 0.0, 1.0] + ) + + # Testing for correctness batches of sequences + # shape (num_samples, seq_len) + y_true = tf.constant([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]]) + y_pred = tf.constant( + [[0.75, 0.2], [0.2, 0.75], [0.2, 0.75], [0.75, 0.2]] + ) + self.assertAllEqual( + matches_method(y_true, y_pred, 0.5), + [[1.0, 1.0], [1.0, 1.0], [0.0, 0.0], [1.0, 1.0]], + ) + + +@test_utils.run_v2_only +class UpdateConfusionMatrixVarTest(tf.test.TestCase, parameterized.TestCase): + def setUp(self): + self.tp = metrics_utils.ConfusionMatrix.TRUE_POSITIVES + self.tn = metrics_utils.ConfusionMatrix.TRUE_NEGATIVES + self.fp = metrics_utils.ConfusionMatrix.FALSE_POSITIVES + self.fn = metrics_utils.ConfusionMatrix.FALSE_NEGATIVES + self.variables_to_update = { + self.tp: tf.Variable([0], dtype=tf.float32), + self.tn: tf.Variable([0], dtype=tf.float32), + self.fp: tf.Variable([0], dtype=tf.float32), + self.fn: tf.Variable([0], dtype=tf.float32), + } + + def test_without_sample_weight(self): + y_true = tf.constant([[1, 1, 0], [0, 0, 1]]) + y_pred = tf.constant([[0.8, 0.7, 0.1], [0.1, 0.6, 0.4]]) + thresholds = [0.5] + + metrics_utils.update_confusion_matrix_variables( + variables_to_update=self.variables_to_update, + y_true=y_true, + y_pred=y_pred, + thresholds=thresholds, + ) + self.assertEqual(self.variables_to_update[self.tp].numpy()[0], 2) + self.assertEqual(self.variables_to_update[self.tn].numpy()[0], 2) + self.assertEqual(self.variables_to_update[self.fp].numpy()[0], 1) + self.assertEqual(self.variables_to_update[self.fn].numpy()[0], 1) + + def test_with_sample_weight(self): + y_true = tf.constant([[1, 1, 0], [0, 0, 1]]) + y_pred = tf.constant([[0.8, 0.7, 0.1], [0.1, 0.6, 0.4]]) + thresholds = [0.5] + sample_weight = [2, 1] + + metrics_utils.update_confusion_matrix_variables( + variables_to_update=self.variables_to_update, + y_true=y_true, + y_pred=y_pred, + thresholds=thresholds, + sample_weight=sample_weight, + ) + self.assertEqual(self.variables_to_update[self.tp].numpy()[0], 4) + self.assertEqual(self.variables_to_update[self.tn].numpy()[0], 3) + self.assertEqual(self.variables_to_update[self.fp].numpy()[0], 1) + self.assertEqual(self.variables_to_update[self.fn].numpy()[0], 1) + + def test_with_class_id(self): + y_true = tf.constant([[1, 1, 0], [0, 0, 1]]) + y_pred = tf.constant([[0.8, 0.7, 0.1], [0.1, 0.6, 0.4]]) + thresholds = [0.5] + class_id = 2 + + metrics_utils.update_confusion_matrix_variables( + variables_to_update=self.variables_to_update, + y_true=y_true, + y_pred=y_pred, + thresholds=thresholds, + class_id=class_id, + ) + self.assertEqual(self.variables_to_update[self.tp].numpy()[0], 0) + self.assertEqual(self.variables_to_update[self.tn].numpy()[0], 1) + self.assertEqual(self.variables_to_update[self.fp].numpy()[0], 0) + self.assertEqual(self.variables_to_update[self.fn].numpy()[0], 1) + + def 
test_with_sample_weight_and_classid(self): + y_true = tf.constant([[1, 1, 0], [0, 0, 1]]) + y_pred = tf.constant([[0.8, 0.7, 0.1], [0.1, 0.6, 0.4]]) + thresholds = [0.5] + sample_weight = [2, 1] + class_id = 2 + + metrics_utils.update_confusion_matrix_variables( + variables_to_update=self.variables_to_update, + y_true=y_true, + y_pred=y_pred, + thresholds=thresholds, + sample_weight=sample_weight, + class_id=class_id, + ) + self.assertEqual(self.variables_to_update[self.tp].numpy()[0], 0) + self.assertEqual(self.variables_to_update[self.tn].numpy()[0], 2) + self.assertEqual(self.variables_to_update[self.fp].numpy()[0], 0) + self.assertEqual(self.variables_to_update[self.fn].numpy()[0], 1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/mode_keys.py b/keras/utils/mode_keys.py index 38881970937b..7ba5a17585ec 100644 --- a/keras/utils/mode_keys.py +++ b/keras/utils/mode_keys.py @@ -14,6 +14,7 @@ # ============================================================================== """Keras model mode constants.""" -# pylint: disable=unused-import -from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys as ModeKeys -# pylint: enable=unused-import +# isort: off +from tensorflow.python.saved_model.model_utils.mode_keys import ( # noqa: F401,E501 + KerasModeKeys as ModeKeys, +) diff --git a/keras/utils/np_utils.py b/keras/utils/np_utils.py index d2b7492fd0c0..60cad3fa6197 100644 --- a/keras/utils/np_utils.py +++ b/keras/utils/np_utils.py @@ -15,77 +15,128 @@ """Numpy-related utilities.""" import numpy as np + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.utils.to_categorical') -def to_categorical(y, num_classes=None, dtype='float32'): - """Converts a class vector (integers) to binary class matrix. +@keras_export("keras.utils.to_categorical") +def to_categorical(y, num_classes=None, dtype="float32"): + """Converts a class vector (integers) to binary class matrix. - E.g. for use with `categorical_crossentropy`. + E.g. for use with `categorical_crossentropy`. - Args: - y: Array-like with class values to be converted into a matrix - (integers from 0 to `num_classes - 1`). - num_classes: Total number of classes. If `None`, this would be inferred - as `max(y) + 1`. - dtype: The data type expected by the input. Default: `'float32'`. + Args: + y: Array-like with class values to be converted into a matrix + (integers from 0 to `num_classes - 1`). + num_classes: Total number of classes. If `None`, this would be inferred + as `max(y) + 1`. + dtype: The data type expected by the input. Default: `'float32'`. - Returns: - A binary matrix representation of the input. The class axis is placed - last. + Returns: + A binary matrix representation of the input as a NumPy array. The class + axis is placed last. - Example: + Example: - >>> a = tf.keras.utils.to_categorical([0, 1, 2, 3], num_classes=4) - >>> a = tf.constant(a, shape=[4, 4]) - >>> print(a) - tf.Tensor( + >>> a = tf.keras.utils.to_categorical([0, 1, 2, 3], num_classes=4) + >>> print(a) [[1. 0. 0. 0.] [0. 1. 0. 0.] [0. 0. 1. 0.] - [0. 0. 0. 1.]], shape=(4, 4), dtype=float32) - - >>> b = tf.constant([.9, .04, .03, .03, - ... .3, .45, .15, .13, - ... .04, .01, .94, .05, - ... .12, .21, .5, .17], - ... shape=[4, 4]) - >>> loss = tf.keras.backend.categorical_crossentropy(a, b) - >>> print(np.around(loss, 5)) - [0.10536 0.82807 0.1011 1.77196] - - >>> loss = tf.keras.backend.categorical_crossentropy(a, a) - >>> print(np.around(loss, 5)) - [0. 0. 0. 0.] 
- """ - y = np.array(y, dtype='int') - input_shape = y.shape - if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: - input_shape = tuple(input_shape[:-1]) - y = y.ravel() - if not num_classes: - num_classes = np.max(y) + 1 - n = y.shape[0] - categorical = np.zeros((n, num_classes), dtype=dtype) - categorical[np.arange(n), y] = 1 - output_shape = input_shape + (num_classes,) - categorical = np.reshape(categorical, output_shape) - return categorical - - -@keras_export('keras.utils.normalize') + [0. 0. 0. 1.]] + + >>> b = tf.constant([.9, .04, .03, .03, + ... .3, .45, .15, .13, + ... .04, .01, .94, .05, + ... .12, .21, .5, .17], + ... shape=[4, 4]) + >>> loss = tf.keras.backend.categorical_crossentropy(a, b) + >>> print(np.around(loss, 5)) + [0.10536 0.82807 0.1011 1.77196] + + >>> loss = tf.keras.backend.categorical_crossentropy(a, a) + >>> print(np.around(loss, 5)) + [0. 0. 0. 0.] + """ + y = np.array(y, dtype="int") + input_shape = y.shape + + # Shrink the last dimension if the shape is (..., 1). + if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: + input_shape = tuple(input_shape[:-1]) + + y = y.reshape(-1) + if not num_classes: + num_classes = np.max(y) + 1 + n = y.shape[0] + categorical = np.zeros((n, num_classes), dtype=dtype) + categorical[np.arange(n), y] = 1 + output_shape = input_shape + (num_classes,) + categorical = np.reshape(categorical, output_shape) + return categorical + + +@keras_export("keras.utils.to_ordinal") +def to_ordinal(y, num_classes=None, dtype="float32"): + """Converts a class vector (integers) to an ordinal regression matrix. + + This utility encodes class vector to ordinal regression/classification + matrix where each sample is indicated by a row and rank of that sample is + indicated by number of ones in that row. + + Args: + y: Array-like with class values to be converted into a matrix + (integers from 0 to `num_classes - 1`). + num_classes: Total number of classes. If `None`, this would be inferred + as `max(y) + 1`. + dtype: The data type expected by the input. Default: `'float32'`. + + Returns: + An ordinal regression matrix representation of the input as a NumPy + array. The class axis is placed last. + + Example: + + >>> a = tf.keras.utils.to_ordinal([0, 1, 2, 3], num_classes=4) + >>> print(a) + [[0. 0. 0.] + [1. 0. 0.] + [1. 1. 0.] + [1. 1. 1.]] + """ + y = np.array(y, dtype="int") + input_shape = y.shape + + # Shrink the last dimension if the shape is (..., 1). + if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: + input_shape = tuple(input_shape[:-1]) + + y = y.reshape(-1) + if not num_classes: + num_classes = np.max(y) + 1 + n = y.shape[0] + range_values = np.arange(num_classes - 1) + range_values = np.tile(np.expand_dims(range_values, 0), [n, 1]) + ordinal = np.zeros((n, num_classes - 1), dtype=dtype) + ordinal[range_values < np.expand_dims(y, -1)] = 1 + output_shape = input_shape + (num_classes - 1,) + ordinal = np.reshape(ordinal, output_shape) + return ordinal + + +@keras_export("keras.utils.normalize") def normalize(x, axis=-1, order=2): - """Normalizes a Numpy array. - - Args: - x: Numpy array to normalize. - axis: axis along which to normalize. - order: Normalization order (e.g. `order=2` for L2 norm). - - Returns: - A normalized copy of the array. - """ - l2 = np.atleast_1d(np.linalg.norm(x, order, axis)) - l2[l2 == 0] = 1 - return x / np.expand_dims(l2, axis) + """Normalizes a Numpy array. + + Args: + x: Numpy array to normalize. + axis: axis along which to normalize. 
+        order: Normalization order (e.g. `order=2` for L2 norm).
+
+    Returns:
+        A normalized copy of the array.
+    """
+    l2 = np.atleast_1d(np.linalg.norm(x, order, axis))
+    l2[l2 == 0] = 1
+    return x / np.expand_dims(l2, axis)
diff --git a/keras/utils/np_utils_test.py b/keras/utils/np_utils_test.py
index ff2a68a54741..d108e10dd61a 100644
--- a/keras/utils/np_utils_test.py
+++ b/keras/utils/np_utils_test.py
@@ -14,35 +14,71 @@
 # ==============================================================================
 """Tests for np_utils."""
 
-import tensorflow.compat.v2 as tf
-
 import numpy as np
+import tensorflow.compat.v2 as tf
+from absl.testing import parameterized
 
+from keras.testing_infra import test_combinations
 from keras.utils import np_utils
 
+NUM_CLASSES = 5
+
+
+class TestNPUtils(test_combinations.TestCase):
+    @parameterized.parameters(
+        [
+            ((1,), (1, NUM_CLASSES)),
+            ((3,), (3, NUM_CLASSES)),
+            ((4, 3), (4, 3, NUM_CLASSES)),
+            ((5, 4, 3), (5, 4, 3, NUM_CLASSES)),
+            ((3, 1), (3, NUM_CLASSES)),
+            ((3, 2, 1), (3, 2, NUM_CLASSES)),
+        ]
+    )
+    def test_to_categorical(self, shape, expected_shape):
+        label = np.random.randint(0, NUM_CLASSES, shape)
+        one_hot = np_utils.to_categorical(label, NUM_CLASSES)
+        # Check shape
+        self.assertEqual(one_hot.shape, expected_shape)
+        # Make sure there is only one 1 in a row
+        self.assertTrue(np.all(one_hot.sum(axis=-1) == 1))
+        # Get original labels back from one-hots
+        self.assertTrue(
+            np.all(np.argmax(one_hot, -1).reshape(label.shape) == label)
+        )
+
+    def test_to_categorical_without_num_classes(self):
+        label = [0, 2, 5]
+        one_hot = np_utils.to_categorical(label)
+        self.assertEqual(one_hot.shape, (3, 5 + 1))
+
+    @parameterized.parameters(
+        [
+            ((1,), (1, NUM_CLASSES - 1)),
+            ((3,), (3, NUM_CLASSES - 1)),
+            ((4, 3), (4, 3, NUM_CLASSES - 1)),
+            ((5, 4, 3), (5, 4, 3, NUM_CLASSES - 1)),
+            ((3, 1), (3, NUM_CLASSES - 1)),
+            ((3, 2, 1), (3, 2, NUM_CLASSES - 1)),
+        ]
+    )
+    def test_to_ordinal(self, shape, expected_shape):
+        label = np.random.randint(0, NUM_CLASSES, shape)
+        ordinal = np_utils.to_ordinal(label, NUM_CLASSES)
+        # Check shape
+        self.assertEqual(ordinal.shape, expected_shape)
+        # Make sure all the values are either 0 or 1
+        self.assertTrue(np.all(np.logical_or(ordinal == 0, ordinal == 1)))
+        # Get original labels back from ordinal matrix
+        self.assertTrue(
+            np.all(ordinal.cumprod(-1).sum(-1).reshape(label.shape) == label)
+        )
+
+    def test_to_ordinal_without_num_classes(self):
+        label = [0, 2, 5]
+        one_hot = np_utils.to_ordinal(label)
+        self.assertEqual(one_hot.shape, (3, 5))
+
 
-class TestNPUtils(tf.test.TestCase):
-
-  def test_to_categorical(self):
-    num_classes = 5
-    shapes = [(1,), (3,), (4, 3), (5, 4, 3), (3, 1), (3, 2, 1)]
-    expected_shapes = [(1, num_classes), (3, num_classes), (4, 3, num_classes),
-                       (5, 4, 3, num_classes), (3, num_classes),
-                       (3, 2, num_classes)]
-    labels = [np.random.randint(0, num_classes, shape) for shape in shapes]
-    one_hots = [
-        np_utils.to_categorical(label, num_classes) for label in labels]
-    for label, one_hot, expected_shape in zip(labels,
-                                              one_hots,
-                                              expected_shapes):
-      # Check shape
-      self.assertEqual(one_hot.shape, expected_shape)
-      # Make sure there is only one 1 in a row
-      self.assertTrue(np.all(one_hot.sum(axis=-1) == 1))
-      # Get original labels back from one hots
-      self.assertTrue(np.all(
-          np.argmax(one_hot, -1).reshape(label.shape) == label))
-
-
-if __name__ == '__main__':
-  tf.test.main()
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/utils/object_identity.py
b/keras/utils/object_identity.py index db5a313045b0..856e61820233 100644 --- a/keras/utils/object_identity.py +++ b/keras/utils/object_identity.py @@ -20,227 +20,234 @@ # LINT.IfChange class _ObjectIdentityWrapper: - """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped. + """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped. - Since __eq__ is based on object identity, it's safe to also define __hash__ - based on object ids. This lets us add unhashable types like trackable - _ListWrapper objects to object-identity collections. - """ + Since __eq__ is based on object identity, it's safe to also define __hash__ + based on object ids. This lets us add unhashable types like trackable + _ListWrapper objects to object-identity collections. + """ - __slots__ = ["_wrapped", "__weakref__"] + __slots__ = ["_wrapped", "__weakref__"] - def __init__(self, wrapped): - self._wrapped = wrapped + def __init__(self, wrapped): + self._wrapped = wrapped - @property - def unwrapped(self): - return self._wrapped + @property + def unwrapped(self): + return self._wrapped - def _assert_type(self, other): - if not isinstance(other, _ObjectIdentityWrapper): - raise TypeError( - "Cannot compare wrapped object with unwrapped object. " - f"Expect the object to be `_ObjectIdentityWrapper`. Got: {other}") + def _assert_type(self, other): + if not isinstance(other, _ObjectIdentityWrapper): + raise TypeError( + "Cannot compare wrapped object with unwrapped object. " + "Expect the object to be `_ObjectIdentityWrapper`. " + f"Got: {other}" + ) - def __lt__(self, other): - self._assert_type(other) - return id(self._wrapped) < id(other._wrapped) # pylint: disable=protected-access + def __lt__(self, other): + self._assert_type(other) + return id(self._wrapped) < id(other._wrapped) - def __gt__(self, other): - self._assert_type(other) - return id(self._wrapped) > id(other._wrapped) # pylint: disable=protected-access + def __gt__(self, other): + self._assert_type(other) + return id(self._wrapped) > id(other._wrapped) - def __eq__(self, other): - if other is None: - return False - self._assert_type(other) - return self._wrapped is other._wrapped # pylint: disable=protected-access + def __eq__(self, other): + if other is None: + return False + self._assert_type(other) + return self._wrapped is other._wrapped - def __ne__(self, other): - return not self.__eq__(other) + def __ne__(self, other): + return not self.__eq__(other) - def __hash__(self): - # Wrapper id() is also fine for weakrefs. In fact, we rely on - # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is - # weakref.ref(a) in _WeakObjectIdentityWrapper. - return id(self._wrapped) + def __hash__(self): + # Wrapper id() is also fine for weakrefs. In fact, we rely on + # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is + # weakref.ref(a) in _WeakObjectIdentityWrapper. + return id(self._wrapped) - def __repr__(self): - return "<{} wrapping {!r}>".format(type(self).__name__, self._wrapped) + def __repr__(self): + return f"<{type(self).__name__} wrapping {self._wrapped!r}>" class _WeakObjectIdentityWrapper(_ObjectIdentityWrapper): - __slots__ = () + __slots__ = () - def __init__(self, wrapped): - super().__init__(weakref.ref(wrapped)) + def __init__(self, wrapped): + super().__init__(weakref.ref(wrapped)) - @property - def unwrapped(self): - return self._wrapped() + @property + def unwrapped(self): + return self._wrapped() class Reference(_ObjectIdentityWrapper): - """Reference that refers an object. 
+ """Reference that refers an object. - ```python - x = [1] - y = [1] + ```python + x = [1] + y = [1] - x_ref1 = Reference(x) - x_ref2 = Reference(x) - y_ref2 = Reference(y) + x_ref1 = Reference(x) + x_ref2 = Reference(x) + y_ref2 = Reference(y) - print(x_ref1 == x_ref2) - ==> True + print(x_ref1 == x_ref2) + ==> True - print(x_ref1 == y) - ==> False - ``` - """ + print(x_ref1 == y) + ==> False + ``` + """ - __slots__ = () + __slots__ = () - # Disabling super class' unwrapped field. - unwrapped = property() + # Disabling super class' unwrapped field. + unwrapped = property() - def deref(self): - """Returns the referenced object. + def deref(self): + """Returns the referenced object. - ```python - x_ref = Reference(x) - print(x is x_ref.deref()) - ==> True - ``` - """ - return self._wrapped + ```python + x_ref = Reference(x) + print(x is x_ref.deref()) + ==> True + ``` + """ + return self._wrapped class ObjectIdentityDictionary(collections.abc.MutableMapping): - """A mutable mapping data structure which compares using "is". + """A mutable mapping data structure which compares using "is". - This is necessary because we have trackable objects (_ListWrapper) which - have behavior identical to built-in Python lists (including being unhashable - and comparing based on the equality of their contents by default). - """ + This is necessary because we have trackable objects (_ListWrapper) which + have behavior identical to built-in Python lists (including being unhashable + and comparing based on the equality of their contents by default). + """ - __slots__ = ["_storage"] + __slots__ = ["_storage"] - def __init__(self): - self._storage = {} + def __init__(self): + self._storage = {} - def _wrap_key(self, key): - return _ObjectIdentityWrapper(key) + def _wrap_key(self, key): + return _ObjectIdentityWrapper(key) - def __getitem__(self, key): - return self._storage[self._wrap_key(key)] + def __getitem__(self, key): + return self._storage[self._wrap_key(key)] - def __setitem__(self, key, value): - self._storage[self._wrap_key(key)] = value + def __setitem__(self, key, value): + self._storage[self._wrap_key(key)] = value - def __delitem__(self, key): - del self._storage[self._wrap_key(key)] + def __delitem__(self, key): + del self._storage[self._wrap_key(key)] - def __len__(self): - return len(self._storage) + def __len__(self): + return len(self._storage) - def __iter__(self): - for key in self._storage: - yield key.unwrapped + def __iter__(self): + for key in self._storage: + yield key.unwrapped - def __repr__(self): - return "ObjectIdentityDictionary(%s)" % repr(self._storage) + def __repr__(self): + return f"ObjectIdentityDictionary({repr(self._storage)})" class ObjectIdentityWeakKeyDictionary(ObjectIdentityDictionary): - """Like weakref.WeakKeyDictionary, but compares objects with "is".""" + """Like weakref.WeakKeyDictionary, but compares objects with "is".""" - __slots__ = ["__weakref__"] + __slots__ = ["__weakref__"] - def _wrap_key(self, key): - return _WeakObjectIdentityWrapper(key) + def _wrap_key(self, key): + return _WeakObjectIdentityWrapper(key) - def __len__(self): - # Iterate, discarding old weak refs - return len(list(self._storage)) + def __len__(self): + # Iterate, discarding old weak refs + return len(list(self._storage)) - def __iter__(self): - keys = self._storage.keys() - for key in keys: - unwrapped = key.unwrapped - if unwrapped is None: - del self[key] - else: - yield unwrapped + def __iter__(self): + keys = self._storage.keys() + for key in keys: + unwrapped = key.unwrapped + 
if unwrapped is None: + del self[key] + else: + yield unwrapped class ObjectIdentitySet(collections.abc.MutableSet): - """Like the built-in set, but compares objects with "is".""" + """Like the built-in set, but compares objects with "is".""" - __slots__ = ["_storage", "__weakref__"] + __slots__ = ["_storage", "__weakref__"] - def __init__(self, *args): - self._storage = set(self._wrap_key(obj) for obj in list(*args)) + def __init__(self, *args): + self._storage = set(self._wrap_key(obj) for obj in list(*args)) - @staticmethod - def _from_storage(storage): - result = ObjectIdentitySet() - result._storage = storage # pylint: disable=protected-access - return result + @staticmethod + def _from_storage(storage): + result = ObjectIdentitySet() + result._storage = storage + return result - def _wrap_key(self, key): - return _ObjectIdentityWrapper(key) + def _wrap_key(self, key): + return _ObjectIdentityWrapper(key) - def __contains__(self, key): - return self._wrap_key(key) in self._storage + def __contains__(self, key): + return self._wrap_key(key) in self._storage - def discard(self, key): - self._storage.discard(self._wrap_key(key)) + def discard(self, key): + self._storage.discard(self._wrap_key(key)) - def add(self, key): - self._storage.add(self._wrap_key(key)) + def add(self, key): + self._storage.add(self._wrap_key(key)) - def update(self, items): - self._storage.update([self._wrap_key(item) for item in items]) + def update(self, items): + self._storage.update([self._wrap_key(item) for item in items]) - def clear(self): - self._storage.clear() + def clear(self): + self._storage.clear() - def intersection(self, items): - return self._storage.intersection([self._wrap_key(item) for item in items]) + def intersection(self, items): + return self._storage.intersection( + [self._wrap_key(item) for item in items] + ) - def difference(self, items): - return ObjectIdentitySet._from_storage( - self._storage.difference([self._wrap_key(item) for item in items])) + def difference(self, items): + return ObjectIdentitySet._from_storage( + self._storage.difference([self._wrap_key(item) for item in items]) + ) - def __len__(self): - return len(self._storage) + def __len__(self): + return len(self._storage) - def __iter__(self): - keys = list(self._storage) - for key in keys: - yield key.unwrapped + def __iter__(self): + keys = list(self._storage) + for key in keys: + yield key.unwrapped class ObjectIdentityWeakSet(ObjectIdentitySet): - """Like weakref.WeakSet, but compares objects with "is".""" + """Like weakref.WeakSet, but compares objects with "is".""" + + __slots__ = () + + def _wrap_key(self, key): + return _WeakObjectIdentityWrapper(key) - __slots__ = () + def __len__(self): + # Iterate, discarding old weak refs + return len([_ for _ in self]) - def _wrap_key(self, key): - return _WeakObjectIdentityWrapper(key) + def __iter__(self): + keys = list(self._storage) + for key in keys: + unwrapped = key.unwrapped + if unwrapped is None: + self.discard(key) + else: + yield unwrapped - def __len__(self): - # Iterate, discarding old weak refs - return len([_ for _ in self]) - def __iter__(self): - keys = list(self._storage) - for key in keys: - unwrapped = key.unwrapped - if unwrapped is None: - self.discard(key) - else: - yield unwrapped # LINT.ThenChange(//tensorflow/python/util/object_identity.py) diff --git a/keras/utils/sidecar_evaluator.py b/keras/utils/sidecar_evaluator.py new file mode 100644 index 000000000000..82b3c1df04d5 --- /dev/null +++ b/keras/utils/sidecar_evaluator.py @@ -0,0 +1,432 @@ +# 
Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python module for evaluation loop."""
+
+import re
+
+import tensorflow as tf
+
+# isort: off
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
+from keras.callbacks import ModelCheckpoint
+from keras.optimizers import optimizer
+from tensorflow.python.util.tf_export import keras_export
+
+_PRINT_EVAL_STEP_EVERY_SEC = 60.0
+_ITERATIONS_UNINITIALIZED = -1
+_CHECKPOINT_TIMEOUT_SEC = 30
+
+
+def list_checkpoint_attributes(ckpt_dir_or_file):
+    """Lists all the attributes in a checkpoint.
+
+    Checkpoint keys are paths in a checkpoint graph, and the attribute is the
+    first element in the path. E.g. with a checkpoint key
+    "optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE", optimizer is the attribute.
+    The attribute is also used to save/restore a variable in a checkpoint,
+    e.g. tf.train.Checkpoint(optimizer=optimizer, model=model).
+
+    Args:
+        ckpt_dir_or_file: Directory with checkpoints file or path to
+            checkpoint.
+
+    Returns:
+        Set of attributes in a checkpoint.
+    """
+    reader = tf.train.load_checkpoint(ckpt_dir_or_file)
+    variable_map = reader.get_variable_to_shape_map()
+    return {name.split("/")[0] for name in variable_map.keys()}
+
+
+@keras_export("keras.utils.SidecarEvaluator", v1=[])
+class SidecarEvaluator:
+    """A class designed for a dedicated evaluator task.
+
+    `SidecarEvaluator` is expected to be run in a process on a separate
+    machine from the training cluster. It is meant to be used as a dedicated
+    evaluator, evaluating the metric results of a training cluster which has
+    one or more workers performing the training, and saving checkpoints.
+
+    The `SidecarEvaluator` API is compatible with both Custom Training Loop
+    (CTL), and Keras `Model.fit` to be used in the training cluster. Using
+    the model (with compiled metrics) provided at `__init__`,
+    `SidecarEvaluator` repeatedly performs evaluation "epochs" when it finds
+    a checkpoint that has not yet been used. Depending on the `steps`
+    argument, an eval epoch is evaluation over all eval data, or up to a
+    certain number of steps (batches). See examples below for how the
+    training program should save the checkpoints in order to be recognized
+    by `SidecarEvaluator`.
+
+    Since `SidecarEvaluator` uses `model.evaluate` under the hood for
+    evaluation, it also supports arbitrary Keras callbacks. That is, if one
+    or more callbacks are provided, their `on_test_batch_begin` and
+    `on_test_batch_end` methods are called at the start and end of a batch,
+    and their `on_test_begin` and `on_test_end` are called at the start and
+    end of an evaluation epoch. Note that `SidecarEvaluator` may skip some
+    checkpoints because it always picks up the latest checkpoint available,
+    and during an evaluation epoch, multiple checkpoints can be produced
+    from the training side.
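+
+    For intuition, the core of the evaluation loop can be sketched as below
+    (a minimal, illustrative sketch only, assuming a compiled `model` and an
+    eval `dataset`; the real loop further restores the optimizer iterations
+    and retries on partially written checkpoints):
+
+    ```python
+    checkpoint = tf.train.Checkpoint(model=model)
+    for ckpt_path in tf.train.checkpoints_iterator(
+        checkpoint_dir, timeout=30, timeout_fn=lambda: False
+    ):
+        # `expect_partial` because the checkpoint also tracks objects
+        # (e.g. the optimizer) that evaluation does not need.
+        checkpoint.restore(ckpt_path).expect_partial()
+        model.evaluate(dataset, verbose=2)
+    ```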
+
+    Example:
+    ```python
+    model = tf.keras.models.Sequential(...)
+    model.compile(metrics=tf.keras.metrics.SparseCategoricalAccuracy(
+        name="eval_metrics"))
+    data = tf.data.Dataset.from_tensor_slices(...)
+
+    tf.keras.utils.SidecarEvaluator(
+        model=model,
+        data=data,
+        # dir for training-saved checkpoint
+        checkpoint_dir='/tmp/checkpoint_dir',
+        steps=None,  # Eval until dataset is exhausted
+        max_evaluations=None,  # The evaluation needs to be stopped manually
+        callbacks=[tf.keras.callbacks.TensorBoard(log_dir='/tmp/log_dir')]
+    ).start()
+    ```
+
+    `SidecarEvaluator.start` writes a series of summary files which can be
+    visualized by TensorBoard (which provides a webpage link):
+
+    ```bash
+    $ tensorboard --logdir=/tmp/log_dir/validation
+    ...
+    TensorBoard 2.4.0a0 at http://host:port (Press CTRL+C to quit)
+    ```
+
+    If the training cluster uses a CTL, the `checkpoint_dir` should contain
+    checkpoints that track both `model` and `optimizer`, to fulfill
+    `SidecarEvaluator`'s expectation. This can be done by a
+    `tf.train.Checkpoint` and a `tf.train.CheckpointManager`:
+
+    ```python
+    # Same `checkpoint_dir` supplied to `SidecarEvaluator`.
+    checkpoint_dir = ...
+    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
+    checkpoint_manager = tf.train.CheckpointManager(
+        checkpoint, checkpoint_dir=..., max_to_keep=...)
+    checkpoint_manager.save()
+    ```
+
+    If the training cluster uses the Keras `Model.fit` API, a
+    `tf.keras.callbacks.ModelCheckpoint` should be used, with
+    `save_weights_only=True`, and the `filepath` should have 'ckpt-{epoch}'
+    appended:
+
+    ```python
+    # Same `checkpoint_dir` supplied to `SidecarEvaluator`.
+    checkpoint_dir = ...
+    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
+        filepath=os.path.join(checkpoint_dir, 'ckpt-{epoch}'),
+        save_weights_only=True)
+    model.fit(dataset, epochs, callbacks=[model_checkpoint])
+    ```
+    """
+
+    def __init__(
+        self,
+        model,
+        data,
+        checkpoint_dir,
+        steps=None,
+        max_evaluations=None,
+        callbacks=None,
+    ):
+        """Initializes a `SidecarEvaluator` object.
+
+        Args:
+            model: Model to use for evaluation. The model object used here
+                should be a `tf.keras.Model`, and should be the same as the
+                one that is used in training, where `tf.keras.Model`s are
+                checkpointed. The model should have one or more metrics
+                compiled before using `SidecarEvaluator`.
+            data: The input data for evaluation. `SidecarEvaluator` supports
+                all data types that Keras `model.evaluate` supports as the
+                input data `x`, such as a `tf.data.Dataset`.
+            checkpoint_dir: Directory where checkpoint files are saved.
+            steps: Number of steps to perform evaluation for, when evaluating
+                a single checkpoint file. If `None`, evaluation continues
+                until the dataset is exhausted. For a repeating evaluation
+                dataset, the user must specify `steps` to avoid an infinite
+                evaluation loop.
+            max_evaluations: Maximum number of checkpoint files to be
+                evaluated, for `SidecarEvaluator` to know when to stop. The
+                evaluator will stop after it evaluates a checkpoint filepath
+                ending with '-<max_evaluations>'. If using
+                `tf.train.CheckpointManager.save` for saving checkpoints, the
+                kth saved checkpoint has the filepath suffix '-<k>' (k=1 for
+                the first saved), and if checkpoints are saved every epoch
+                after training, the filepath saved at the kth epoch would end
+                with '-<k>'. Thus, if training runs for n epochs, and the
+                evaluator should end after the training finishes, use n for
+                this parameter. Note that this is not necessarily equal to
+                the number of total evaluations, since some checkpoints may
+                be skipped if evaluation is slower than checkpoint creation.
+                If `None`, `SidecarEvaluator` will evaluate indefinitely, and
+                the user must terminate the evaluator program themselves.
+            callbacks: List of `keras.callbacks.Callback` instances to apply
+                during evaluation. See
+                [callbacks](/api_docs/python/tf/keras/callbacks).
+        """
+        self.model = model
+        self.data = data
+        self.checkpoint_dir = checkpoint_dir
+        self._iterations = tf.Variable(
+            name="iterations",
+            initial_value=_ITERATIONS_UNINITIALIZED,
+            dtype=tf.int64,
+        )
+        self.max_evaluations = max_evaluations
+        self.steps = steps
+        self.callbacks = callbacks or []
+
+    def _timeout_fn(self):
+        logging.info(
+            "No checkpoints appear to be found after "
+            f"{_CHECKPOINT_TIMEOUT_SEC} seconds. "
+            "Please check if you are properly using a "
+            "`tf.train.Checkpoint/CheckpointManager` or "
+            "`tf.keras.callbacks.ModelCheckpoint(save_weights_only=True)` to "
+            "save checkpoints by the training. See "
+            "`tf.keras.SidecarEvaluator` doc for recommended flows "
+            "of saving checkpoints."
+        )
+        return False
+
+    def start(self):
+        """Starts the evaluation loop."""
+        if self.model.optimizer and isinstance(
+            self.model.optimizer, optimizer.Optimizer
+        ):
+            checkpoint = tf.train.Checkpoint(
+                model=self.model, optimizer=self.model.optimizer
+            )
+        else:
+            optimizer_checkpoint = tf.train.Checkpoint(iter=self._iterations)
+            checkpoint = tf.train.Checkpoint(
+                model=self.model, optimizer=optimizer_checkpoint
+            )
+        for latest_checkpoint in tf.train.checkpoints_iterator(
+            self.checkpoint_dir,
+            timeout=_CHECKPOINT_TIMEOUT_SEC,
+            timeout_fn=self._timeout_fn,
+        ):
+            try:
+                # `expect_partial` because the checkpoint can have other
+                # `Trackable`s such as `optimizer`.
+                checkpoint.restore(latest_checkpoint).expect_partial()
+                checkpoint_attributes = list_checkpoint_attributes(
+                    latest_checkpoint
+                )
+                # The checkpoint should contain model and optimizer for
+                # SidecarEvaluator to work. But the model weights saved by
+                # the ModelCheckpoint callback do not contain the model as an
+                # attribute. To make SidecarEvaluator work compatibly in this
+                # case, use model.load_weights to load the model's weights,
+                # while self._iterations is still restored from the
+                # checkpoint variable.
+                if "model" not in checkpoint_attributes:
+                    self.model.load_weights(latest_checkpoint)
+                # The model checkpoint might not include the optimizer in
+                # some cases, e.g. when using a custom training loop.
+                # Directly assign the iterations property to be used in
+                # callbacks.
+                if self.model.optimizer and not isinstance(
+                    self.model.optimizer,
+                    optimizer.Optimizer,
+                ):
+                    # The experimental optimizer automatically restores the
+                    # iteration value.
+                    self.model.optimizer.iterations.assign(self._iterations)
+            except (tf.errors.OpError,) as e:
+                if isinstance(e, tf.errors.UnavailableError):
+                    # With distributed training, worker preemption can result
+                    # in `UnavailableError`. Raise this to be handled outside
+                    # the evaluation loop.
+                    raise e
+
+                # A couple of errors can happen here when racing with the
+                # coordinator writing a checkpoint:
+                # 1) OpError: open failed for <file path>: No such file or
+                # directory
+                # 2) NotFoundError (subclass of OpError): Unsuccessful
+                # TensorSliceReader constructor.
+                # TODO(rchao): Remove this except block once b/150954027 is
+                # resolved.
+                logging.info(
+                    "SidecarEvaluator encountered an error when loading the "
+                    f"checkpoint at {latest_checkpoint}. Retrying. "
+                    f"Error: {e.__class__.__name__}: {e}"
+                )
+                continue
+            if (
+                self._iterations.numpy() == _ITERATIONS_UNINITIALIZED
+                and not isinstance(
+                    self.model.optimizer,
+                    optimizer.Optimizer,
+                )
+            ):
+                raise RuntimeError(
+                    "Variable `iterations` cannot be loaded from the "
+                    f"checkpoint file at {self.checkpoint_dir}. "
+                    "Please ensure `iterations` is "
+                    "included in the checkpoint saved during training."
+                )
+
+            logging.info(
+                "Evaluation starts: Model weights loaded from latest "
+                f"checkpoint file {latest_checkpoint}"
+            )
+            self.model.evaluate(
+                self.data, steps=self.steps, callbacks=self.callbacks, verbose=2
+            )
+
+            return_metrics = {}
+            for metric in self.model.metrics:
+                result = metric.result()
+                if isinstance(result, dict):
+                    return_metrics.update(result)
+                else:
+                    return_metrics[metric.name] = result
+
+            logging.info(
+                "End of evaluation. Metrics: %s",
+                " ".join(
+                    [
+                        f"{name}={value.numpy()}"
+                        for name, value in return_metrics.items()
+                    ]
+                ),
+            )
+
+            if self.max_evaluations and (
+                self.max_evaluations <= int(latest_checkpoint.split("-")[-1])
+            ):
+                # Exit the loop because we have evaluated the final checkpoint
+                # file.
+                logging.info(
+                    "Last checkpoint evaluated. SidecarEvaluator stops."
+                )
+                return
+
+
+@keras_export("keras.experimental.SidecarEvaluator", v1=[])
+@deprecation.deprecated_endpoints("keras.experimental.SidecarEvaluator")
+class SidecarEvaluatorExperimental(SidecarEvaluator):
+    """Deprecated. Please use `tf.keras.utils.SidecarEvaluator` instead.
+
+    Caution: `tf.keras.experimental.SidecarEvaluator` endpoint is
+    deprecated and will be removed in a future release. Please use
+    `tf.keras.utils.SidecarEvaluator`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        logging.warning(
+            "`tf.keras.experimental.SidecarEvaluator` endpoint is "
+            "deprecated and will be removed in a future release. Please use "
+            "`tf.keras.utils.SidecarEvaluator`."
+        )
+        super().__init__(*args, **kwargs)
+
+
+@keras_export("keras.callbacks.SidecarEvaluatorModelExport")
+class SidecarEvaluatorModelExport(ModelCheckpoint):
+    """Callback to save the best Keras model.
+
+    It expands the functionality of the existing ModelCheckpoint callback to
+    enable exporting the best models after evaluation with a validation
+    dataset.
+
+    When using the `SidecarEvaluatorModelExport` callback in conjunction with
+    `keras.utils.SidecarEvaluator`, users should provide the `filepath`,
+    which is the path for this callback to export the model or save weights
+    to, and `ckpt_filepath`, which is where the checkpoint is available to
+    extract the epoch number from. The callback will then export the model
+    that the evaluator deems the best (among the checkpoints saved by the
+    training counterpart) to the specified `filepath`. This callback is
+    intended to be used by SidecarEvaluator only.
+
+    Example:
+
+    ```python
+    model.compile(loss=..., optimizer=...,
+                  metrics=['accuracy'])
+    sidecar_evaluator = keras.utils.SidecarEvaluator(
+        model=model,
+        data=dataset,
+        checkpoint_dir=checkpoint_dir,
+        max_evaluations=1,
+        callbacks=[
+            SidecarEvaluatorModelExport(
+                export_filepath=os.path.join(checkpoint_dir,
+                                             'best_model_eval',
+                                             'best-model-{epoch:04d}'),
+                checkpoint_filepath=os.path.join(checkpoint_dir,
+                                                 'ckpt-{epoch:04d}'),
+                save_freq="eval",
+                save_weights_only=True,
+                monitor="loss",
+                mode="min",
+                verbose=1,
+            ),
+        ],
+    )
+    sidecar_evaluator.start()
+    # Model weights are saved if evaluator deems it's the best seen so far.
+    ```
+
+    Args:
+        export_filepath: Path where best models should be saved by this
+            `SidecarEvaluatorModelExport` callback. Epoch formatting options,
+            such as `os.path.join(best_model_dir, 'best-model-{epoch:04d}')`,
+            can be used to allow the saved model to preserve epoch
+            information in the file name. SidecarEvaluatorModelExport will
+            use the "training epoch" at which the checkpoint was saved by
+            training to fill the epoch placeholder in the path.
+        checkpoint_filepath: Path where checkpoints were saved by training.
+            This should be the same as what is provided to the `filepath`
+            argument of `ModelCheckpoint` on the training side, such as
+            `os.path.join(checkpoint_dir, 'ckpt-{epoch:04d}')`.
+    """
+
+    def __init__(self, export_filepath, checkpoint_filepath, **kwargs):
+        super().__init__(
+            filepath=export_filepath,
+            save_best_only=True,
+            **kwargs,
+        )
+
+        self._checkpoint_filepath = checkpoint_filepath
+
+    def on_test_begin(self, logs=None):
+        """Updates export_index to the latest checkpoint."""
+
+        most_recent_filepath = (
+            self._get_most_recently_modified_file_matching_pattern(
+                self._checkpoint_filepath
+            )
+        )
+        if most_recent_filepath is not None:
+            self.export_index = (
+                int(
+                    re.match(r".*ckpt-(?P<ckpt>\d+)", most_recent_filepath)[
+                        "ckpt"
+                    ]
+                )
+                - 1
+            )
+        else:
+            self.export_index = 0
+
+    def on_test_end(self, logs):
+        """Saves best model at the end of an evaluation epoch."""
+
+        self.epochs_since_last_save += 1
+        self._save_model(epoch=self.export_index, batch=None, logs=logs)
diff --git a/keras/utils/sidecar_evaluator_test.py b/keras/utils/sidecar_evaluator_test.py
new file mode 100644
index 000000000000..f336393470e3
--- /dev/null
+++ b/keras/utils/sidecar_evaluator_test.py
@@ -0,0 +1,460 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Test covering sidecar_evaluator.py.""" + +import enum +import os +import shutil +import threading +import time + +import numpy as np +import tensorflow.compat.v2 as tf +from absl.testing import parameterized + +import keras +from keras.optimizers import sgd +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import np_utils +from keras.utils import sidecar_evaluator as sidecar_evaluator_lib +from keras.utils.sidecar_evaluator import SidecarEvaluatorModelExport + +# isort: off +from tensorflow.python.platform import tf_logging as logging + +_BATCH_SIZE = 32 +TRAIN_SAMPLES = 20 +TEST_SAMPLES = 20 +INPUT_DIM = 3 +NUM_CLASSES = 2 +NUM_HIDDEN = 5 +BATCH_SIZE = 5 + + +class TestModel(keras.Model): + def __init__(self): + super().__init__(name="test_model") + self.dense = keras.layers.Dense(10) + + def call(self, inputs): + return self.dense(inputs) + + +class DictMetric(keras.metrics.MeanSquaredError): + def result(self): + res = super().result() + return {"mean_squared_error_1": res, "mean_squared_error_2": res} + + +class ModelType(enum.Enum): + SEQUENTIAL = "sequential" + SUBCLASS = "subclass" + + +def _test_model_builder(model_type: ModelType, compile_model, build_model): + if model_type == ModelType.SEQUENTIAL: + model = keras.Sequential([keras.layers.Dense(10)]) + elif model_type == ModelType.SUBCLASS: + model = TestModel() + + if compile_model: + model.compile( + sgd.SGD(), + loss="mse", + metrics=[keras.metrics.CategoricalAccuracy(), DictMetric()], + ) + if build_model: + model.build((None, 32)) + + return model + + +@test_utils.run_v2_only +class SidecarEvaluatorTest(tf.test.TestCase, parameterized.TestCase): + def assertSummaryEventsWritten(self, log_dir): + # Asserts summary files do get written when log_dir is provided. + summary_files = tf.io.gfile.listdir(log_dir) + self.assertNotEmpty( + summary_files, + "Summary should have been written and log_dir should not be empty.", + ) + + # Asserts the content of the summary file. + event_pb_written = False + event_tags = [] + for summary_file in summary_files: + for event_pb in tf.compat.v1.train.summary_iterator( + os.path.join(log_dir, summary_file) + ): + if event_pb.step > 0: + self.assertEqual(event_pb.step, 32) + event_tags.append(event_pb.summary.value[0].tag) + event_pb_written = True + self.assertCountEqual( + event_tags, + [ + "evaluation_categorical_accuracy_vs_iterations", + "evaluation_loss_vs_iterations", + "evaluation_mean_squared_error_1_vs_iterations", + "evaluation_mean_squared_error_2_vs_iterations", + ], + ) + + # Verifying at least one non-zeroth step is written to summary. + self.assertTrue(event_pb_written) + + def assertModelsSameVariables(self, model_a, model_b): + # Check both have the same number of variables. + self.assertEqual(len(model_a.variables), len(model_b.variables)) + + # Check variable values to be equal. 
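+        # (Comparing `.numpy()` values rather than the variable objects
+        # sidesteps object identity: the eval model is a distinct Python
+        # object whose weights were restored from the checkpoint.)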
+ for var_a, var_b in zip(model_a.variables, model_b.variables): + self.assertAllEqual(var_a.numpy(), var_b.numpy()) + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS], + ) + ) + def testIterationsNotSavedWillRaiseError(self, model_type): + model = _test_model_builder( + model_type=model_type, compile_model=False, build_model=True + ) + + checkpoint_dir = self.get_temp_dir() + checkpoint = tf.train.Checkpoint(model=model) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, checkpoint_dir, max_to_keep=2 + ) + checkpoint_manager.save() + + sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( + model, data=None, checkpoint_dir=checkpoint_dir + ) + with self.assertRaisesRegex( + RuntimeError, + "`iterations` cannot be loaded from the checkpoint file.", + ): + sidecar_evaluator.start() + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS], + ) + ) + def testModelNotBuiltRaiseError(self, model_type): + model = _test_model_builder( + model_type=model_type, compile_model=False, build_model=False + ) + + checkpoint_dir = self.get_temp_dir() + checkpoint = tf.train.Checkpoint(model=model) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, checkpoint_dir, max_to_keep=2 + ) + checkpoint_manager.save() + + sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( + model, data=None, checkpoint_dir=checkpoint_dir + ) + with self.assertRaisesRegex(AssertionError, "Nothing to load."): + sidecar_evaluator.start() + + @tf.__internal__.distribute.combinations.generate( + tf.__internal__.test.combinations.combine( + mode=["eager"], + model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS], + build_model=[True, False], + ) + ) + def testSidecarEvaluatorOutputsSummary(self, model_type, build_model): + # Create a model with synthetic data, and fit for one epoch. + model = _test_model_builder( + model_type=model_type, compile_model=True, build_model=False + ) + data = np.random.random((1000, 32)) + labels = np.random.random((1000, 10)) + dataset = tf.data.Dataset.from_tensor_slices((data, labels)) + dataset = dataset.batch(32) + model.fit(dataset, epochs=1) + + # Save a checkpoint. + checkpoint_dir = os.path.join(self.get_temp_dir(), "ckpt") + log_dir = os.path.join(self.get_temp_dir(), "summary") + logging.info( + "checkpoint_dir = %s, log_dir = %s", checkpoint_dir, log_dir + ) + checkpoint = tf.train.Checkpoint(model=model, optimizer=model.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, checkpoint_dir, max_to_keep=2 + ) + logging.info( + "Checkpoint manager saved to: %s", checkpoint_manager.save() + ) + self.assertNotEmpty( + tf.io.gfile.listdir(checkpoint_dir), + "Checkpoint should have been written and " + "checkpoint_dir should not be empty.", + ) + + # Create a new model used for evaluation. + eval_model = _test_model_builder( + model_type=model_type, compile_model=True, build_model=build_model + ) + # Have a sidecar_evaluator evaluate once. + sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator( + eval_model, + data=dataset, + checkpoint_dir=checkpoint_dir, + max_evaluations=1, + callbacks=[keras.callbacks.TensorBoard(log_dir=log_dir)], + ) + sidecar_evaluator.start() + # Eval model has been restored to the same state as the original model, + # so their weights should match. 
If not, restoration of the model didn't
+        # work.
+        self.assertModelsSameVariables(model, eval_model)
+
+        self.assertSummaryEventsWritten(os.path.join(log_dir, "validation"))
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            mode=["eager"],
+            model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS],
+            build_model=[True, False],
+        )
+    )
+    def testSidecarEvaluatorOutputsSummarySavedWithCallback(
+        self, model_type, build_model
+    ):
+        checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoints")
+        log_dir = os.path.join(self.get_temp_dir(), "summary")
+        # Create a model with synthetic data, and fit for one epoch.
+        model = _test_model_builder(
+            model_type=model_type, compile_model=True, build_model=False
+        )
+        data = np.random.random((1000, 32))
+        labels = np.random.random((1000, 10))
+        dataset = tf.data.Dataset.from_tensor_slices((data, labels))
+        dataset = dataset.batch(_BATCH_SIZE)
+        save_callback = keras.callbacks.ModelCheckpoint(
+            filepath=os.path.join(checkpoint_dir, "ckpt-{epoch}"),
+            save_weights_only=True,
+        )
+        model.fit(dataset, epochs=1, callbacks=[save_callback])
+        self.assertNotEmpty(
+            tf.io.gfile.listdir(checkpoint_dir),
+            "Checkpoint should have been written and "
+            "checkpoint_dir should not be empty.",
+        )
+
+        # Create a new model used for evaluation.
+        eval_model = _test_model_builder(
+            model_type=model_type, compile_model=True, build_model=build_model
+        )
+        # Have a sidecar_evaluator evaluate once.
+        sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator(
+            eval_model,
+            data=dataset,
+            checkpoint_dir=checkpoint_dir,
+            max_evaluations=1,
+            callbacks=[keras.callbacks.TensorBoard(log_dir=log_dir)],
+        )
+        with self.assertLogs() as cm:
+            sidecar_evaluator.start()
+
+        metrics_logging = [
+            line for line in cm.output if "End of evaluation" in line
+        ]
+        self.assertLen(metrics_logging, 1)
+        expected_logged_metrics = [
+            "loss",
+            "categorical_accuracy",
+            "mean_squared_error_1",
+            "mean_squared_error_2",
+        ]
+        for metric_name in expected_logged_metrics:
+            self.assertRegex(metrics_logging[0], f"{metric_name}=")
+
+        # Eval model has been restored to the same state as the original
+        # model, so their weights should match. If not, restoration of the
+        # model didn't work.
+        self.assertModelsSameVariables(model, eval_model)
+
+        # Check that the optimizer's `iterations` variable was restored.
+        self.assertEqual(
+            sidecar_evaluator.model.optimizer.iterations.numpy(), _BATCH_SIZE
+        )
+
+        self.assertSummaryEventsWritten(os.path.join(log_dir, "validation"))
+
+    @tf.__internal__.distribute.combinations.generate(
+        tf.__internal__.test.combinations.combine(
+            mode=["eager"],
+            model_type=[ModelType.SEQUENTIAL, ModelType.SUBCLASS],
+            build_model=[True, False],
+        )
+    )
+    def testTimeoutFunction(self, model_type, build_model):
+        checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoints")
+        # Create synthetic data; no model is trained or checkpointed here.
+        data = np.random.random((1000, 32))
+        labels = np.random.random((1000, 10))
+        dataset = tf.data.Dataset.from_tensor_slices((data, labels))
+        dataset = dataset.batch(_BATCH_SIZE)
+
+        # Create a new model used for evaluation.
+        eval_model = _test_model_builder(
+            model_type=model_type, compile_model=True, build_model=build_model
+        )
+        # Try to have a sidecar_evaluator evaluate once.
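+        # No checkpoint is ever written to `checkpoint_dir` in this test, so
+        # the evaluator below is expected to give up after its polling
+        # timeout and log that no checkpoints could be found.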
+        sidecar_evaluator = sidecar_evaluator_lib.SidecarEvaluator(
+            eval_model,
+            data=dataset,
+            checkpoint_dir=checkpoint_dir,
+            max_evaluations=1,
+        )
+        with self.assertLogs() as cm:
+            threading.Thread(
+                target=sidecar_evaluator.start, daemon=True
+            ).start()
+            time.sleep(50)
+
+        metrics_logging = [
+            line
+            for line in cm.output
+            if "No checkpoints appear to be found" in line
+        ]
+        self.assertGreaterEqual(len(metrics_logging), 1)
+
+    def testExperimentalDeprecatedMessage(self):
+        warning_messages = []
+
+        def warning(msg):
+            warning_messages.append(msg)
+
+        with tf.compat.v1.test.mock.patch.object(logging, "warning", warning):
+            sidecar_evaluator_lib.SidecarEvaluatorExperimental(None, None, None)
+
+        warning_msg = (
+            "`tf.keras.experimental.SidecarEvaluator` endpoint is deprecated"
+        )
+        self.assertIn(warning_msg, "\n".join(warning_messages))
+
+    @test_combinations.run_with_all_model_types
+    def test_best_model_exporter_with_sidecarevaluator(self):
+        temp_dir = self.get_temp_dir()
+        self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+        # Create a model with synthetic data, and fit for 20 epochs.
+        layers = [
+            keras.layers.Dense(
+                NUM_HIDDEN, input_dim=INPUT_DIM, activation="relu"
+            ),
+            keras.layers.Dense(NUM_CLASSES, activation="softmax"),
+        ]
+        model = test_utils.get_model_from_layers(layers, input_shape=(3,))
+        model.compile(
+            loss="categorical_crossentropy",
+            optimizer="rmsprop",
+            metrics=["acc"],
+        )
+
+        (x_train, y_train), (x_test, y_test) = test_utils.get_test_data(
+            train_samples=TRAIN_SAMPLES,
+            test_samples=TEST_SAMPLES,
+            input_shape=(INPUT_DIM,),
+            num_classes=NUM_CLASSES,
+        )
+        y_test = np_utils.to_categorical(y_test)
+        y_train = np_utils.to_categorical(y_train)
+
+        callbacks = [
+            keras.callbacks.ModelCheckpoint(
+                filepath=os.path.join(temp_dir, "ckpt", "ckpt-{epoch:04d}"),
+                monitor="loss",
+                save_best_only=True,
+                save_weights_only=True,
+                save_freq="epoch",
+                mode="min",
+            )
+        ]
+
+        model.fit(
+            x_train,
+            y_train,
+            batch_size=BATCH_SIZE,
+            validation_data=(x_test, y_test),
+            callbacks=callbacks,
+            epochs=20,
+            verbose=0,
+        )
+        self.assertNotEmpty(
+            tf.io.gfile.listdir(os.path.join(temp_dir, "ckpt")),
+            "Checkpoints should have been written and "
+            "checkpoint_dir should not be empty.",
+        )
+
+        # Have a sidecar_evaluator evaluate once.
+        dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+        dataset = dataset.batch(BATCH_SIZE)
+        sidecar_evaluator = keras.utils.SidecarEvaluator(
+            model=model,
+            data=dataset,
+            checkpoint_dir=os.path.join(temp_dir, "ckpt"),
+            max_evaluations=1,
+            callbacks=[
+                SidecarEvaluatorModelExport(
+                    export_filepath=os.path.join(
+                        temp_dir,
+                        "ckpt",
+                        "best_model_eval",
+                        "best-model-{epoch:04d}",
+                    ),
+                    checkpoint_filepath=os.path.join(
+                        temp_dir, "ckpt", "ckpt-{epoch:04d}"
+                    ),
+                    save_weights_only=False,
+                    monitor="loss",
+                    mode="min",
+                    verbose=1,
+                ),
+            ],
+        )
+        sidecar_evaluator.start()
+
+        # Asserts output directory exists.
+        self.assertTrue(
+            os.path.exists(os.path.join(temp_dir, "ckpt", "best_model_eval"))
+        )
+
+        # Asserts best model files do get written.
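+        # `tf.io.gfile.listdir` returns a list of file names; stringifying
+        # it lets a single regex scan every entry for a best-model export.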
+        self.assertRegex(
+            str(
+                tf.io.gfile.listdir(
+                    os.path.join(temp_dir, "ckpt", "best_model_eval")
+                )
+            ),
+            r"(.*best-model.*)+",
+        )
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/keras/utils/steps_per_execution_tuning.py b/keras/utils/steps_per_execution_tuning.py
new file mode 100644
index 000000000000..ade47a736da5
--- /dev/null
+++ b/keras/utils/steps_per_execution_tuning.py
@@ -0,0 +1,264 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Steps per execution autotuning for Keras engine."""
+
+import logging
+import threading
+import time
+
+import numpy as np
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export("keras.utils.StepsPerExecutionTuner")
+class StepsPerExecutionTuner:
+    """Steps per execution tuner class.
+
+    Args:
+        optimizer: The optimizer used for training/evaluation/prediction. Used
+            to measure iterations and global throughput
+            (`optimizer.iterations`/second).
+        spe_variable: A `tf.Variable` representing the `steps_per_execution`
+            variable used during training/evaluation/prediction. Must be
+            updatable with `spe_variable.assign`.
+        interval: Optional int, the number of seconds to wait between calls to
+            measure throughput and tune `spe_variable`. Defaults to 5.
+        change_spe_interval: Optional int, the number of throughput
+            measurements before tuning. Defaults to 10.
+        change_threshold: Optional float, the percent difference in throughput
+            required to trigger a `steps_per_execution` change. For example,
+            `0.1` triggers changes if throughput changes more than 10%.
+
+    Examples:
+
+    If you're using `model.compile` and `model.fit`, this functionality is
+    available at compile time with `steps_per_execution='auto'`:
+
+    ```python
+    model.compile(..., steps_per_execution='auto')
+    ```
+
+    Custom training loop usage:
+
+    ```python
+    # Get model
+    inputs = keras.Input(shape=(784,), name="digits")
+    x = layers.Dense(64, activation="relu", name="dense_1")(inputs)
+    x = layers.Dense(64, activation="relu", name="dense_2")(x)
+    outputs = layers.Dense(10, name="predictions")(x)
+    model = keras.Model(inputs=inputs, outputs=outputs)
+
+    # Instantiate an optimizer to train the model.
+    optimizer = keras.optimizers.SGD(learning_rate=1e-3)
+    # Instantiate a loss function.
+    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+
+    # Prepare the training dataset.
+    batch_size = 64
+    (x_train, y_train), (_, _) = keras.datasets.mnist.load_data()
+    x_train = np.reshape(x_train, (-1, 784))
+    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+    train_dataset = train_dataset.batch(batch_size)
+    iterator = iter(train_dataset)
+
+    # Create our steps per execution variable
+    steps_per_execution = tf.Variable(
+        1,
+        dtype="int64",
+        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
+    )
+
+    # Create the tuner
+    tuner = StepsPerExecutionTuner(
+        optimizer, steps_per_execution
+    )
+
+    # Create a step function that runs a single training step
+    @tf.function
+    def step_fn(iterator):
+        batch_data, labels = next(iterator)
+        with tf.GradientTape() as tape:
+            logits = model(batch_data, training=True)
+            loss_value = loss_fn(labels, logits)
+        grads = tape.gradient(loss_value, model.trainable_weights)
+        optimizer.apply_gradients(zip(grads, model.trainable_weights))

+    # We can now pack multiple execution steps into one call
+    @tf.function
+    def multi_step_train_fn(iterator, steps_per_execution):
+        for _ in tf.range(steps_per_execution):
+            step_fn(iterator)
+        return
+
+    steps_per_epoch = 100
+    epochs = 2
+
+    # Start the tuner before training
+    tuner.start()
+
+    # We can now call our multi step training with our data
+    for _ in range(epochs):
+        for _ in range(steps_per_epoch):
+            multi_step_train_fn(iterator, steps_per_execution)
+
+    # End the tuner after training
+    tuner.stop()
+    ```
+    """
+
+    def __init__(
+        self,
+        optimizer,
+        spe_variable,
+        interval=5,
+        change_spe_interval=10,
+        change_threshold=0.1,
+    ):
+        self.optimizer = optimizer
+        self._steps_per_execution = spe_variable
+        self.interval = interval
+        self.change_spe_interval = change_spe_interval
+        self.spe_change_threshold = change_threshold
+        self.steps_per_execution_stop_event = threading.Event()
+        self.thread = None
+
+    def start(self):
+        """Starts steps per execution tuning thread.
+
+        Returns a `threading.Thread` which will run every `self.interval`
+        seconds to measure throughput and tune steps per execution.
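+
+        If the tuning thread is already alive, calling `start` again is a
+        no-op that returns the existing thread.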
+ """ + if self.thread and self.thread.is_alive(): + return self.thread + self._begin_tuning() + self.thread = threading.Thread( + target=self._steps_per_execution_interval_call, daemon=True + ) # needed to shut down successfully + self.thread.start() + return self.thread + + @property + def steps_per_execution(self): + """Settable attribute representing`steps_per_execution` variable.""" + return self._steps_per_execution + + @steps_per_execution.setter + def steps_per_execution(self, value): + self._steps_per_execution.assign(value) + self.init_spe = value + + def _steps_per_execution_interval_call(self): + while not self.steps_per_execution_stop_event.is_set(): + self._measure_and_tune() + self.steps_per_execution_stop_event.wait(self.interval) + + def _begin_tuning(self): + self.start_time = time.time() + self.init_iterations = self.optimizer.iterations.numpy() + self.init_spe = self._steps_per_execution.numpy().item() + self.spe_last_logged = { + "iteration": self.init_iterations, + "time_secs": self.start_time, + } + self.rgsps = [] # rgsps = recent global steps per second + self.avg_rgsps = 0 + self.prev_avg_rgsps = 0 + self.spe_tune_last_action_add = True + self.spe_measurement_count = 0 + + def stop(self): + """Stops steps per execution tuning thread.""" + if not self.steps_per_execution_stop_event.is_set(): + self.steps_per_execution_stop_event.set() + + def _should_tune(self): + epoch_boundary = False + if self.rgsps[-1] == 0: + epoch_boundary = True + + return ( + self.spe_measurement_count % self.change_spe_interval == 0 + and self.rgsps + and not epoch_boundary + ) + + def _tune(self): + """Changes the steps per execution using the following algorithm. + + If there is more than a 10% increase in the throughput, then the last + recorded action is repeated (i.e. if increasing the SPE caused an + increase in throughput, it is increased again). If there is more than a + 10% decrease in the throughput, then the opposite of the last action is + performed (i.e. if increasing the SPE decreased the throughput, then the + SPE is decreased). 
+ """ + self.avg_rgsps = sum(self.rgsps) / len(self.rgsps) + fast_threshold = (1 + self.spe_change_threshold) * self.prev_avg_rgsps + slow_threshold = (1 - self.spe_change_threshold) * self.prev_avg_rgsps + + if self.spe_tune_last_action_add: + repeat_action_mult = 1.5 + opposite_action_mult = 0.5 + else: + repeat_action_mult = 0.5 + opposite_action_mult = 1.5 + + spe_variable = self._steps_per_execution + spe_limit = spe_variable.dtype.max / 1.5 + current_spe = spe_variable.numpy().item() + if self.avg_rgsps > fast_threshold: + # Note that our first iteration will always trigger this as our + # threshold should be 0 + new_spe = current_spe * repeat_action_mult + elif self.avg_rgsps < slow_threshold: + new_spe = current_spe * opposite_action_mult + self.spe_tune_last_action_add = not self.spe_tune_last_action_add + else: + new_spe = current_spe + + if current_spe >= spe_limit: + new_spe = current_spe + elif current_spe == 0: + new_spe = self.init_spe + + self._steps_per_execution.assign(np.round(new_spe)) + self.prev_avg_rgsps = self.avg_rgsps + + def _measure_and_tune(self): + self.spe_measurement_count += 1 + + cur_iteration = self.optimizer.iterations.numpy() + + cur_time_secs = time.time() + recent_gsps = (cur_iteration - self.spe_last_logged["iteration"]) / ( + cur_time_secs - self.spe_last_logged["time_secs"] + ) + + self.rgsps.append(recent_gsps) + if len(self.rgsps) > self.change_spe_interval: + self.rgsps.pop(0) + + if cur_iteration == 0: # No need to tune, we have no measurements + self.start_time = cur_time_secs + return + + self.spe_last_logged["iteration"] = cur_iteration + self.spe_last_logged["time_secs"] = cur_time_secs + + try: + if self._should_tune(): + self._tune() + except RuntimeError: + logging.exception("Steps per execution autotuner failed to run.") + return diff --git a/keras/utils/steps_per_execution_tuning_test.py b/keras/utils/steps_per_execution_tuning_test.py new file mode 100644 index 000000000000..163a20932376 --- /dev/null +++ b/keras/utils/steps_per_execution_tuning_test.py @@ -0,0 +1,140 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Tests for steps_per_execution_tuning."""
+
+import time
+
+import tensorflow.compat.v2 as tf
+
+from keras import Input
+from keras import Model
+from keras import losses
+from keras import optimizers
+from keras.layers import Dense
+from keras.testing_infra import test_combinations
+from keras.utils import steps_per_execution_tuning
+
+
+class MockOptimizer:
+    def __init__(self, iterations):
+        self.iterations = tf.Variable(iterations)
+
+
+@test_combinations.run_with_all_model_types
+@test_combinations.run_all_keras_modes(always_skip_v1=True)
+class StepsPerExecutionTuningTest(test_combinations.TestCase):
+    def test_variables(self):
+        spe_variable = tf.Variable(1)
+        tuner = steps_per_execution_tuning.StepsPerExecutionTuner(
+            MockOptimizer(5), spe_variable, 5, 50, 0.5
+        )
+        assert tuner.optimizer.iterations.numpy() == 5
+        assert tuner._steps_per_execution.numpy().item() == 1
+        assert tuner.interval == 5
+        assert tuner.change_spe_interval == 50
+        assert tuner.spe_change_threshold == 0.5
+        assert not tuner.steps_per_execution_stop_event.is_set()
+
+    def test_start_stop(self):
+        spe_variable = tf.Variable(1)
+        tuner = steps_per_execution_tuning.StepsPerExecutionTuner(
+            MockOptimizer(5), spe_variable, interval=0.2
+        )
+        tuner.start()
+        assert not tuner.steps_per_execution_stop_event.is_set()
+        assert tuner.start_time > 0
+        time.sleep(0.5)  # should be enough time for 2 measurements
+        tuner.stop()
+        assert tuner.steps_per_execution_stop_event.is_set()
+        assert tuner.spe_measurement_count > 0
+
+    def test_settable_steps_per_execution(self):
+        spe_variable = tf.Variable(1)
+        tuner = steps_per_execution_tuning.StepsPerExecutionTuner(
+            MockOptimizer(5), spe_variable, interval=0.2
+        )
+        tuner.start()
+        tuner.stop()
+        assert tuner.init_spe == 1
+        tuner.steps_per_execution = 5
+        assert spe_variable.numpy().item() == 5
+        assert tuner.init_spe == 5
+
+    def test_custom_training_loop(self):
+        dataset = _get_dataset()
+        iterator = iter(dataset)
+
+        inputs = Input(shape=(784,), name="digits")
+        x = Dense(64, activation="relu", name="dense_1")(inputs)
+        x = Dense(64, activation="relu", name="dense_2")(x)
+        outputs = Dense(10, name="predictions")(x)
+        model = Model(inputs=inputs, outputs=outputs)
+        optimizer = optimizers.SGD(learning_rate=1e-3)
+        loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)
+
+        # Create our steps per execution variable
+        steps_per_execution = tf.Variable(
+            1,
+            dtype="int64",
+            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
+        )
+
+        # Create the tuner
+        tuner = steps_per_execution_tuning.StepsPerExecutionTuner(
+            optimizer, steps_per_execution
+        )
+
+        # Create a step function that runs a single training step
+        @tf.function
+        def step_fn(iterator):
+            batch_data, labels = next(iterator)
+            with tf.GradientTape() as tape:
+                logits = model(batch_data, training=True)
+                loss_value = loss_fn(labels, logits)
+            grads = tape.gradient(loss_value, model.trainable_weights)
+            optimizer.apply_gradients(zip(grads, model.trainable_weights))
+
+        # We can now pack multiple execution steps into one call
+        @tf.function
+        def multi_step_train_fn(iterator, steps_per_execution):
+            for _ in tf.range(steps_per_execution):
+                step_fn(iterator)
+            return
+
+        steps_per_epoch = 10
+        epochs = 2
+
+        # Start the tuner before training
+        tuner.start()
+
+        for _ in range(epochs):
+            for _ in range(steps_per_epoch):
+                multi_step_train_fn(iterator, steps_per_execution)
+
+        # End the
tuner after training + tuner.stop() + + +def _get_dataset(): + inputs = tf.zeros((1000, 784), dtype=tf.float32) + targets = tf.zeros((1000,), dtype=tf.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.batch(10) + return dataset + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/text_dataset.py b/keras/utils/text_dataset.py index c7cec37b3c15..37ba1a94b10c 100644 --- a/keras/utils/text_dataset.py +++ b/keras/utils/text_dataset.py @@ -14,234 +14,272 @@ # ============================================================================== """Keras text dataset generation utilities.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np from keras.utils import dataset_utils + +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.utils.text_dataset_from_directory', - 'keras.preprocessing.text_dataset_from_directory', - v1=[]) -def text_dataset_from_directory(directory, - labels='inferred', - label_mode='int', - class_names=None, - batch_size=32, - max_length=None, - shuffle=True, - seed=None, - validation_split=None, - subset=None, - follow_links=False): - """Generates a `tf.data.Dataset` from text files in a directory. - - If your directory structure is: - - ``` - main_directory/ - ...class_a/ - ......a_text_1.txt - ......a_text_2.txt - ...class_b/ - ......b_text_1.txt - ......b_text_2.txt - ``` - - Then calling `text_dataset_from_directory(main_directory, labels='inferred')` - will return a `tf.data.Dataset` that yields batches of texts from - the subdirectories `class_a` and `class_b`, together with labels - 0 and 1 (0 corresponding to `class_a` and 1 corresponding to `class_b`). - - Only `.txt` files are supported at this time. - - Args: - directory: Directory where the data is located. - If `labels` is "inferred", it should contain - subdirectories, each containing text files for a class. - Otherwise, the directory structure is ignored. - labels: Either "inferred" - (labels are generated from the directory structure), - None (no labels), - or a list/tuple of integer labels of the same size as the number of - text files found in the directory. Labels should be sorted according - to the alphanumeric order of the text file paths - (obtained via `os.walk(directory)` in Python). - label_mode: String describing the encoding of `labels`. Options are: - - 'int': means that the labels are encoded as integers - (e.g. for `sparse_categorical_crossentropy` loss). - - 'categorical' means that the labels are - encoded as a categorical vector - (e.g. for `categorical_crossentropy` loss). - - 'binary' means that the labels (there can be only 2) - are encoded as `float32` scalars with values 0 or 1 - (e.g. for `binary_crossentropy`). - - None (no labels). - class_names: Only valid if "labels" is "inferred". This is the explicit - list of class names (must match names of subdirectories). Used - to control the order of the classes - (otherwise alphanumerical order is used). - batch_size: Size of the batches of data. Default: 32. - If `None`, the data will not be batched - (the dataset will yield individual samples). - max_length: Maximum size of a text string. Texts longer than this will - be truncated to `max_length`. - shuffle: Whether to shuffle the data. Default: True. - If set to False, sorts the data in alphanumeric order. - seed: Optional random seed for shuffling and transformations. 
-    validation_split: Optional float between 0 and 1,
-      fraction of data to reserve for validation.
-    subset: Subset of the data to return.
-      One of "training", "validation" or "both".
-      Only used if `validation_split` is set.
-      When `subset="both"`, the utility returns a tuple of two datasets
-      (the training and validation datasets respectively).
-    follow_links: Whether to visits subdirectories pointed to by symlinks.
-      Defaults to False.
-
-  Returns:
+@keras_export(
+    "keras.utils.text_dataset_from_directory",
+    "keras.preprocessing.text_dataset_from_directory",
+    v1=[],
+)
+def text_dataset_from_directory(
+    directory,
+    labels="inferred",
+    label_mode="int",
+    class_names=None,
+    batch_size=32,
+    max_length=None,
+    shuffle=True,
+    seed=None,
+    validation_split=None,
+    subset=None,
+    follow_links=False,
+):
+    """Generates a `tf.data.Dataset` from text files in a directory.
+
+    If your directory structure is:
+
+    ```
+    main_directory/
+    ...class_a/
+    ......a_text_1.txt
+    ......a_text_2.txt
+    ...class_b/
+    ......b_text_1.txt
+    ......b_text_2.txt
+    ```
+
+    Then calling `text_dataset_from_directory(main_directory,
+    labels='inferred')` will return a `tf.data.Dataset` that yields batches of
+    texts from the subdirectories `class_a` and `class_b`, together with labels
+    0 and 1 (0 corresponding to `class_a` and 1 corresponding to `class_b`).
+
+    Only `.txt` files are supported at this time.
+
+    Args:
+        directory: Directory where the data is located.
+            If `labels` is `"inferred"`, it should contain
+            subdirectories, each containing text files for a class.
+            Otherwise, the directory structure is ignored.
+        labels: Either `"inferred"`
+            (labels are generated from the directory structure),
+            `None` (no labels),
+            or a list/tuple of integer labels of the same size as the number of
+            text files found in the directory. Labels should be sorted
+            according to the alphanumeric order of the text file paths
+            (obtained via `os.walk(directory)` in Python).
+        label_mode: String describing the encoding of `labels`. Options are:
+            - `"int"`: means that the labels are encoded as integers
+                (e.g. for `sparse_categorical_crossentropy` loss).
+            - `"categorical"` means that the labels are
+                encoded as a categorical vector
+                (e.g. for `categorical_crossentropy` loss).
+            - `"binary"` means that the labels (there can be only 2)
+                are encoded as `float32` scalars with values 0 or 1
+                (e.g. for `binary_crossentropy`).
+            - `None` (no labels).
+        class_names: Only valid if `labels` is `"inferred"`.
+            This is the explicit list of class names
+            (must match names of subdirectories). Used to control the order
+            of the classes (otherwise alphanumerical order is used).
+        batch_size: Size of the batches of data. Defaults to 32.
+            If `None`, the data will not be batched
+            (the dataset will yield individual samples).
+        max_length: Maximum size of a text string. Texts longer than this will
+            be truncated to `max_length`.
+        shuffle: Whether to shuffle the data. Defaults to `True`.
+            If set to `False`, sorts the data in alphanumeric order.
+        seed: Optional random seed for shuffling and transformations.
+        validation_split: Optional float between 0 and 1,
+            fraction of data to reserve for validation.
+        subset: Subset of the data to return.
+            One of `"training"`, `"validation"` or `"both"`.
+            Only used if `validation_split` is set.
+            When `subset="both"`, the utility returns a tuple of two datasets
+            (the training and validation datasets respectively).
+        follow_links: Whether to visit subdirectories pointed to by symlinks.
+ Defaults to `False`. + + Returns: + A `tf.data.Dataset` object. - - If `label_mode` is None, it yields `string` tensors of shape + + - If `label_mode` is `None`, it yields `string` tensors of shape `(batch_size,)`, containing the contents of a batch of text files. - - Otherwise, it yields a tuple `(texts, labels)`, where `texts` + - Otherwise, it yields a tuple `(texts, labels)`, where `texts` has shape `(batch_size,)` and `labels` follows the format described below. - Rules regarding labels format: + Rules regarding labels format: + - if `label_mode` is `int`, the labels are an `int32` tensor of shape - `(batch_size,)`. + `(batch_size,)`. - if `label_mode` is `binary`, the labels are a `float32` tensor of - 1s and 0s of shape `(batch_size, 1)`. + 1s and 0s of shape `(batch_size, 1)`. - if `label_mode` is `categorical`, the labels are a `float32` tensor - of shape `(batch_size, num_classes)`, representing a one-hot - encoding of the class index. - """ - if labels not in ('inferred', None): - if not isinstance(labels, (list, tuple)): - raise ValueError( - '`labels` argument should be a list/tuple of integer labels, of ' - 'the same size as the number of text files in the target ' - 'directory. If you wish to infer the labels from the subdirectory ' - 'names in the target directory, pass `labels="inferred"`. ' - 'If you wish to get a dataset that only contains text samples ' - f'(no labels), pass `labels=None`. Received: labels={labels}') - if class_names: - raise ValueError('You can only pass `class_names` if ' - f'`labels="inferred"`. Received: labels={labels}, and ' - f'class_names={class_names}') - if label_mode not in {'int', 'categorical', 'binary', None}: - raise ValueError( - '`label_mode` argument must be one of "int", "categorical", "binary", ' - f'or None. Received: label_mode={label_mode}') - if labels is None or label_mode is None: - labels = None - label_mode = None - dataset_utils.check_validation_split_arg( - validation_split, subset, shuffle, seed) - - if seed is None: - seed = np.random.randint(1e6) - file_paths, labels, class_names = dataset_utils.index_directory( - directory, - labels, - formats=('.txt',), - class_names=class_names, - shuffle=shuffle, - seed=seed, - follow_links=follow_links) - - if label_mode == 'binary' and len(class_names) != 2: - raise ValueError( - f'When passing `label_mode="binary"`, there must be exactly 2 ' - f'class_names. Received: class_names={class_names}') - - if subset == 'both': - file_paths_train, labels_train = dataset_utils.get_training_or_validation_split( - file_paths, labels, validation_split, 'training') - file_paths_val, labels_val = dataset_utils.get_training_or_validation_split( - file_paths, labels, validation_split, 'validation') - if not file_paths_train: - raise ValueError( - f'No training text files found in directory {directory}. ' - f'Allowed format: .txt') - if not file_paths_val: - raise ValueError( - f'No validation text files found in directory {directory}. 
' - f'Allowed format: .txt') - train_dataset = paths_and_labels_to_dataset( - file_paths=file_paths_train, - labels=labels_train, - label_mode=label_mode, - num_classes=len(class_names), - max_length=max_length) - val_dataset = paths_and_labels_to_dataset( - file_paths=file_paths_val, - labels=labels_val, - label_mode=label_mode, - num_classes=len(class_names), - max_length=max_length) - - train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE) - val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE) - if batch_size is not None: - if shuffle: - # Shuffle locally at each iteration - train_dataset = train_dataset.shuffle( - buffer_size=batch_size * 8, seed=seed) - train_dataset = train_dataset.batch(batch_size) - val_dataset = val_dataset.batch(batch_size) - else: - if shuffle: - train_dataset = train_dataset.shuffle(buffer_size=1024, seed=seed) - # Users may need to reference `class_names`. - train_dataset.class_names = class_names - val_dataset.class_names = class_names - dataset = [train_dataset, val_dataset] - else: - file_paths, labels = dataset_utils.get_training_or_validation_split( - file_paths, labels, validation_split, subset) - if not file_paths: - raise ValueError(f'No text files found in directory {directory}. ' - f'Allowed format: .txt') - dataset = paths_and_labels_to_dataset( - file_paths=file_paths, - labels=labels, - label_mode=label_mode, - num_classes=len(class_names), - max_length=max_length) - dataset = dataset.prefetch(tf.data.AUTOTUNE) - if batch_size is not None: - if shuffle: - # Shuffle locally at each iteration - dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) - dataset = dataset.batch(batch_size) + of shape `(batch_size, num_classes)`, representing a one-hot + encoding of the class index. + """ + if labels not in ("inferred", None): + if not isinstance(labels, (list, tuple)): + raise ValueError( + "`labels` argument should be a list/tuple of integer labels, " + "of the same size as the number of text files in the target " + "directory. If you wish to infer the labels from the " + "subdirectory names in the target directory, " + 'pass `labels="inferred"`. ' + "If you wish to get a dataset that only contains text samples " + f"(no labels), pass `labels=None`. Received: labels={labels}" + ) + if class_names: + raise ValueError( + "You can only pass `class_names` if " + f'`labels="inferred"`. Received: labels={labels}, and ' + f"class_names={class_names}" + ) + if label_mode not in {"int", "categorical", "binary", None}: + raise ValueError( + '`label_mode` argument must be one of "int", ' + '"categorical", "binary", ' + f"or None. Received: label_mode={label_mode}" + ) + if labels is None or label_mode is None: + labels = None + label_mode = None + dataset_utils.check_validation_split_arg( + validation_split, subset, shuffle, seed + ) + + if seed is None: + seed = np.random.randint(1e6) + file_paths, labels, class_names = dataset_utils.index_directory( + directory, + labels, + formats=(".txt",), + class_names=class_names, + shuffle=shuffle, + seed=seed, + follow_links=follow_links, + ) + + if label_mode == "binary" and len(class_names) != 2: + raise ValueError( + 'When passing `label_mode="binary"`, there must be exactly 2 ' + f"class_names. 
Received: class_names={class_names}" + ) + + if subset == "both": + ( + file_paths_train, + labels_train, + ) = dataset_utils.get_training_or_validation_split( + file_paths, labels, validation_split, "training" + ) + ( + file_paths_val, + labels_val, + ) = dataset_utils.get_training_or_validation_split( + file_paths, labels, validation_split, "validation" + ) + if not file_paths_train: + raise ValueError( + f"No training text files found in directory {directory}. " + "Allowed format: .txt" + ) + if not file_paths_val: + raise ValueError( + f"No validation text files found in directory {directory}. " + "Allowed format: .txt" + ) + train_dataset = paths_and_labels_to_dataset( + file_paths=file_paths_train, + labels=labels_train, + label_mode=label_mode, + num_classes=len(class_names), + max_length=max_length, + ) + val_dataset = paths_and_labels_to_dataset( + file_paths=file_paths_val, + labels=labels_val, + label_mode=label_mode, + num_classes=len(class_names), + max_length=max_length, + ) + + train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE) + val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE) + if batch_size is not None: + if shuffle: + # Shuffle locally at each iteration + train_dataset = train_dataset.shuffle( + buffer_size=batch_size * 8, seed=seed + ) + train_dataset = train_dataset.batch(batch_size) + val_dataset = val_dataset.batch(batch_size) + else: + if shuffle: + train_dataset = train_dataset.shuffle( + buffer_size=1024, seed=seed + ) + # Users may need to reference `class_names`. + train_dataset.class_names = class_names + val_dataset.class_names = class_names + dataset = [train_dataset, val_dataset] else: - if shuffle: - dataset = dataset.shuffle(buffer_size=1024, seed=seed) - # Users may need to reference `class_names`. - dataset.class_names = class_names - return dataset - - -def paths_and_labels_to_dataset(file_paths, - labels, - label_mode, - num_classes, - max_length): - """Constructs a dataset of text strings and labels.""" - path_ds = tf.data.Dataset.from_tensor_slices(file_paths) - string_ds = path_ds.map( - lambda x: path_to_string_content(x, max_length), - num_parallel_calls=tf.data.AUTOTUNE) - if label_mode: - label_ds = dataset_utils.labels_to_dataset(labels, label_mode, num_classes) - string_ds = tf.data.Dataset.zip((string_ds, label_ds)) - return string_ds + file_paths, labels = dataset_utils.get_training_or_validation_split( + file_paths, labels, validation_split, subset + ) + if not file_paths: + raise ValueError( + f"No text files found in directory {directory}. " + "Allowed format: .txt" + ) + dataset = paths_and_labels_to_dataset( + file_paths=file_paths, + labels=labels, + label_mode=label_mode, + num_classes=len(class_names), + max_length=max_length, + ) + dataset = dataset.prefetch(tf.data.AUTOTUNE) + if batch_size is not None: + if shuffle: + # Shuffle locally at each iteration + dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) + dataset = dataset.batch(batch_size) + else: + if shuffle: + dataset = dataset.shuffle(buffer_size=1024, seed=seed) + # Users may need to reference `class_names`. 
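+        # (It is attached as a plain Python attribute, so it exists on this
+        # dataset object only and is not carried through further `tf.data`
+        # transformations.)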
+ dataset.class_names = class_names + return dataset + + +def paths_and_labels_to_dataset( + file_paths, labels, label_mode, num_classes, max_length +): + """Constructs a dataset of text strings and labels.""" + path_ds = tf.data.Dataset.from_tensor_slices(file_paths) + string_ds = path_ds.map( + lambda x: path_to_string_content(x, max_length), + num_parallel_calls=tf.data.AUTOTUNE, + ) + if label_mode: + label_ds = dataset_utils.labels_to_dataset( + labels, label_mode, num_classes + ) + string_ds = tf.data.Dataset.zip((string_ds, label_ds)) + return string_ds def path_to_string_content(path, max_length): - txt = tf.io.read_file(path) - if max_length is not None: - txt = tf.compat.v1.strings.substr(txt, 0, max_length) - return txt + txt = tf.io.read_file(path) + if max_length is not None: + txt = tf.compat.v1.strings.substr(txt, 0, max_length) + return txt diff --git a/keras/utils/text_dataset_test.py b/keras/utils/text_dataset_test.py index e050fae7c45c..532eb06cf848 100644 --- a/keras/utils/text_dataset_test.py +++ b/keras/utils/text_dataset_test.py @@ -14,12 +14,13 @@ # ============================================================================== """Tests for text_dataset.""" -import tensorflow.compat.v2 as tf - import os import random import shutil import string + +import tensorflow.compat.v2 as tf + from keras.testing_infra import test_combinations from keras.testing_infra import test_utils from keras.utils import text_dataset @@ -27,251 +28,297 @@ @test_utils.run_v2_only class TextDatasetFromDirectoryTest(test_combinations.TestCase): - - def _prepare_directory(self, - num_classes=2, - nested_dirs=False, - count=16, - length=20): - # Get a unique temp directory - temp_dir = os.path.join(self.get_temp_dir(), str(random.randint(0, 1e6))) - os.mkdir(temp_dir) - self.addCleanup(shutil.rmtree, temp_dir) - - # Generate paths to class subdirectories - paths = [] - for class_index in range(num_classes): - class_directory = 'class_%s' % (class_index,) - if nested_dirs: - class_paths = [ - class_directory, os.path.join(class_directory, 'subfolder_1'), - os.path.join(class_directory, 'subfolder_2'), os.path.join( - class_directory, 'subfolder_1', 'sub-subfolder') - ] - else: - class_paths = [class_directory] - for path in class_paths: - os.mkdir(os.path.join(temp_dir, path)) - paths += class_paths - - for i in range(count): - path = paths[i % len(paths)] - filename = os.path.join(path, 'text_%s.txt' % (i,)) - f = open(os.path.join(temp_dir, filename), 'w') - text = ''.join([random.choice(string.printable) for _ in range(length)]) - f.write(text) - f.close() - return temp_dir - - def test_text_dataset_from_directory_standalone(self): - # Test retrieving txt files without labels from a directory and its subdirs. - # Save a few extra files in the parent directory. 
- directory = self._prepare_directory(count=7, num_classes=2) - for i in range(3): - filename = 'text_%s.txt' % (i,) - f = open(os.path.join(directory, filename), 'w') - text = ''.join([random.choice(string.printable) for _ in range(20)]) - f.write(text) - f.close() - - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=5, label_mode=None, max_length=10) - batch = next(iter(dataset)) - # We just return the texts, no labels - self.assertEqual(batch.shape, (5,)) - self.assertEqual(batch.dtype.name, 'string') - # Count samples - batch_count = 0 - sample_count = 0 - for batch in dataset: - batch_count += 1 - sample_count += batch.shape[0] - self.assertEqual(batch_count, 2) - self.assertEqual(sample_count, 10) - - def test_text_dataset_from_directory_binary(self): - directory = self._prepare_directory(num_classes=2) - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode='int', max_length=10) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8,)) - self.assertEqual(batch[0].dtype.name, 'string') - self.assertEqual(len(batch[0].numpy()[0]), 10) # Test max_length - self.assertEqual(batch[1].shape, (8,)) - self.assertEqual(batch[1].dtype.name, 'int32') - - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode='binary') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8,)) - self.assertEqual(batch[0].dtype.name, 'string') - self.assertEqual(batch[1].shape, (8, 1)) - self.assertEqual(batch[1].dtype.name, 'float32') - - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode='categorical') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8,)) - self.assertEqual(batch[0].dtype.name, 'string') - self.assertEqual(batch[1].shape, (8, 2)) - self.assertEqual(batch[1].dtype.name, 'float32') - - def test_sample_count(self): - directory = self._prepare_directory(num_classes=4, count=15) - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode=None) - sample_count = 0 - for batch in dataset: - sample_count += batch.shape[0] - self.assertEqual(sample_count, 15) - - def test_text_dataset_from_directory_multiclass(self): - directory = self._prepare_directory(num_classes=4, count=15) - - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode=None) - batch = next(iter(dataset)) - self.assertEqual(batch.shape, (8,)) - - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode=None) - sample_count = 0 - iterator = iter(dataset) - for batch in dataset: - sample_count += next(iterator).shape[0] - self.assertEqual(sample_count, 15) - - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode='int') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8,)) - self.assertEqual(batch[0].dtype.name, 'string') - self.assertEqual(batch[1].shape, (8,)) - self.assertEqual(batch[1].dtype.name, 'int32') - - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode='categorical') - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8,)) - self.assertEqual(batch[0].dtype.name, 'string') - self.assertEqual(batch[1].shape, (8, 4)) - self.assertEqual(batch[1].dtype.name, 'float32') - - def 
test_text_dataset_from_directory_validation_split(self): - directory = self._prepare_directory(num_classes=2, count=10) - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=10, validation_split=0.2, subset='training', - seed=1337) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8,)) - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=10, validation_split=0.2, subset='validation', - seed=1337) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (2,)) - - train_dataset, val_dataset = text_dataset.text_dataset_from_directory( - directory, - batch_size=10, - validation_split=0.2, - subset='both', - seed=1337) - batch = next(iter(train_dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (8,)) - batch = next(iter(val_dataset)) - self.assertLen(batch, 2) - self.assertEqual(batch[0].shape, (2,)) - - def test_text_dataset_from_directory_manual_labels(self): - directory = self._prepare_directory(num_classes=2, count=2) - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, labels=[0, 1], shuffle=False) - batch = next(iter(dataset)) - self.assertLen(batch, 2) - self.assertAllClose(batch[1], [0, 1]) - - def test_text_dataset_from_directory_follow_links(self): - directory = self._prepare_directory(num_classes=2, count=25, - nested_dirs=True) - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=8, label_mode=None, follow_links=True) - sample_count = 0 - for batch in dataset: - sample_count += batch.shape[0] - self.assertEqual(sample_count, 25) - - def test_text_dataset_from_directory_no_files(self): - directory = self._prepare_directory(num_classes=2, count=0) - with self.assertRaisesRegex(ValueError, 'No text files found'): - _ = text_dataset.text_dataset_from_directory(directory) - - def test_text_dataset_from_directory_errors(self): - directory = self._prepare_directory(num_classes=3, count=5) - - with self.assertRaisesRegex(ValueError, '`labels` argument should be'): - _ = text_dataset.text_dataset_from_directory( - directory, labels='other') - - with self.assertRaisesRegex(ValueError, '`label_mode` argument must be'): - _ = text_dataset.text_dataset_from_directory( - directory, label_mode='other') - - with self.assertRaisesRegex( - ValueError, 'only pass `class_names` if `labels="inferred"`'): - _ = text_dataset.text_dataset_from_directory( - directory, labels=[0, 0, 1, 1, 1], - class_names=['class_0', 'class_1', 'class_2']) - - with self.assertRaisesRegex( - ValueError, - 'Expected the lengths of `labels` to match the number of files'): - _ = text_dataset.text_dataset_from_directory( - directory, labels=[0, 0, 1, 1]) - - with self.assertRaisesRegex( - ValueError, '`class_names` passed did not match'): - _ = text_dataset.text_dataset_from_directory( - directory, class_names=['class_0', 'class_2']) - - with self.assertRaisesRegex(ValueError, 'there must be exactly 2'): - _ = text_dataset.text_dataset_from_directory( - directory, label_mode='binary') - - with self.assertRaisesRegex(ValueError, - '`validation_split` must be between 0 and 1'): - _ = text_dataset.text_dataset_from_directory( - directory, validation_split=2) - - with self.assertRaisesRegex( - ValueError, '`subset` must be either "training", ' - '"validation" or "both"'): - _ = text_dataset.text_dataset_from_directory( - directory, validation_split=0.2, subset='other') - - with self.assertRaisesRegex(ValueError, '`validation_split` 
must be set'): - _ = text_dataset.text_dataset_from_directory( - directory, validation_split=0, subset='training') - - with self.assertRaisesRegex(ValueError, 'must provide a `seed`'): - _ = text_dataset.text_dataset_from_directory( - directory, validation_split=0.2, subset='training') - - def test_text_dataset_from_directory_not_batched(self): - directory = self._prepare_directory() - dataset = text_dataset.text_dataset_from_directory( - directory, batch_size=None, label_mode=None, follow_links=True) - - sample = next(iter(dataset)) - self.assertEqual(len(sample.shape), 0) - - -if __name__ == '__main__': - tf.test.main() + def _prepare_directory( + self, num_classes=2, nested_dirs=False, count=16, length=20 + ): + # Get a unique temp directory + temp_dir = os.path.join( + self.get_temp_dir(), str(random.randint(0, 1e6)) + ) + os.mkdir(temp_dir) + self.addCleanup(shutil.rmtree, temp_dir) + + # Generate paths to class subdirectories + paths = [] + for class_index in range(num_classes): + class_directory = f"class_{class_index}" + if nested_dirs: + class_paths = [ + class_directory, + os.path.join(class_directory, "subfolder_1"), + os.path.join(class_directory, "subfolder_2"), + os.path.join( + class_directory, "subfolder_1", "sub-subfolder" + ), + ] + else: + class_paths = [class_directory] + for path in class_paths: + os.mkdir(os.path.join(temp_dir, path)) + paths += class_paths + + for i in range(count): + path = paths[i % len(paths)] + filename = os.path.join(path, f"text_{i}.txt") + with open(os.path.join(temp_dir, filename), "w") as f: + text = "".join( + [random.choice(string.printable) for _ in range(length)] + ) + f.write(text) + return temp_dir + + def test_text_dataset_from_directory_standalone(self): + # Test retrieving txt files without labels from a directory and its + # subdirs. Save a few extra files in the parent directory. 
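+        # With `label_mode=None`, the directory structure is ignored, so the
+        # three extra top-level files are read along with the seven files in
+        # the class subdirectories (10 samples in total).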
+ directory = self._prepare_directory(count=7, num_classes=2) + for i in range(3): + filename = f"text_{i}.txt" + with open(os.path.join(directory, filename), "w") as f: + text = "".join( + [random.choice(string.printable) for _ in range(20)] + ) + f.write(text) + + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=5, label_mode=None, max_length=10 + ) + batch = next(iter(dataset)) + # We just return the texts, no labels + self.assertEqual(batch.shape, (5,)) + self.assertEqual(batch.dtype.name, "string") + # Count samples + batch_count = 0 + sample_count = 0 + for batch in dataset: + batch_count += 1 + sample_count += batch.shape[0] + self.assertEqual(batch_count, 2) + self.assertEqual(sample_count, 10) + + def test_text_dataset_from_directory_binary(self): + directory = self._prepare_directory(num_classes=2) + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode="int", max_length=10 + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8,)) + self.assertEqual(batch[0].dtype.name, "string") + self.assertEqual(len(batch[0].numpy()[0]), 10) # Test max_length + self.assertEqual(batch[1].shape, (8,)) + self.assertEqual(batch[1].dtype.name, "int32") + + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode="binary" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8,)) + self.assertEqual(batch[0].dtype.name, "string") + self.assertEqual(batch[1].shape, (8, 1)) + self.assertEqual(batch[1].dtype.name, "float32") + + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode="categorical" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8,)) + self.assertEqual(batch[0].dtype.name, "string") + self.assertEqual(batch[1].shape, (8, 2)) + self.assertEqual(batch[1].dtype.name, "float32") + + def test_sample_count(self): + directory = self._prepare_directory(num_classes=4, count=15) + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode=None + ) + sample_count = 0 + for batch in dataset: + sample_count += batch.shape[0] + self.assertEqual(sample_count, 15) + + def test_text_dataset_from_directory_multiclass(self): + directory = self._prepare_directory(num_classes=4, count=15) + + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode=None + ) + batch = next(iter(dataset)) + self.assertEqual(batch.shape, (8,)) + + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode=None + ) + sample_count = 0 + iterator = iter(dataset) + for batch in dataset: + sample_count += next(iterator).shape[0] + self.assertEqual(sample_count, 15) + + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode="int" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8,)) + self.assertEqual(batch[0].dtype.name, "string") + self.assertEqual(batch[1].shape, (8,)) + self.assertEqual(batch[1].dtype.name, "int32") + + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode="categorical" + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8,)) + self.assertEqual(batch[0].dtype.name, "string") + self.assertEqual(batch[1].shape, (8, 4)) + self.assertEqual(batch[1].dtype.name, "float32") + + def 
test_text_dataset_from_directory_validation_split(self): + directory = self._prepare_directory(num_classes=2, count=10) + dataset = text_dataset.text_dataset_from_directory( + directory, + batch_size=10, + validation_split=0.2, + subset="training", + seed=1337, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8,)) + dataset = text_dataset.text_dataset_from_directory( + directory, + batch_size=10, + validation_split=0.2, + subset="validation", + seed=1337, + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (2,)) + + train_dataset, val_dataset = text_dataset.text_dataset_from_directory( + directory, + batch_size=10, + validation_split=0.2, + subset="both", + seed=1337, + ) + batch = next(iter(train_dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8,)) + batch = next(iter(val_dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (2,)) + + def test_text_dataset_from_directory_manual_labels(self): + directory = self._prepare_directory(num_classes=2, count=2) + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, labels=[0, 1], shuffle=False + ) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertAllClose(batch[1], [0, 1]) + + def test_text_dataset_from_directory_follow_links(self): + directory = self._prepare_directory( + num_classes=2, count=25, nested_dirs=True + ) + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=8, label_mode=None, follow_links=True + ) + sample_count = 0 + for batch in dataset: + sample_count += batch.shape[0] + self.assertEqual(sample_count, 25) + + def test_text_dataset_from_directory_no_files(self): + directory = self._prepare_directory(num_classes=2, count=0) + with self.assertRaisesRegex(ValueError, "No text files found"): + _ = text_dataset.text_dataset_from_directory(directory) + + def test_text_dataset_from_directory_errors(self): + directory = self._prepare_directory(num_classes=3, count=5) + + with self.assertRaisesRegex(ValueError, "`labels` argument should be"): + _ = text_dataset.text_dataset_from_directory( + directory, labels="other" + ) + + with self.assertRaisesRegex( + ValueError, "`label_mode` argument must be" + ): + _ = text_dataset.text_dataset_from_directory( + directory, label_mode="other" + ) + + with self.assertRaisesRegex( + ValueError, 'only pass `class_names` if `labels="inferred"`' + ): + _ = text_dataset.text_dataset_from_directory( + directory, + labels=[0, 0, 1, 1, 1], + class_names=["class_0", "class_1", "class_2"], + ) + + with self.assertRaisesRegex( + ValueError, + "Expected the lengths of `labels` to match the number of files", + ): + _ = text_dataset.text_dataset_from_directory( + directory, labels=[0, 0, 1, 1] + ) + + with self.assertRaisesRegex( + ValueError, "`class_names` passed did not match" + ): + _ = text_dataset.text_dataset_from_directory( + directory, class_names=["class_0", "class_2"] + ) + + with self.assertRaisesRegex(ValueError, "there must be exactly 2"): + _ = text_dataset.text_dataset_from_directory( + directory, label_mode="binary" + ) + + with self.assertRaisesRegex( + ValueError, "`validation_split` must be between 0 and 1" + ): + _ = text_dataset.text_dataset_from_directory( + directory, validation_split=2 + ) + + with self.assertRaisesRegex( + ValueError, + '`subset` must be either "training", "validation" or "both"', + ): + _ = text_dataset.text_dataset_from_directory( + directory, validation_split=0.2, 
subset="other" + ) + + with self.assertRaisesRegex( + ValueError, "`validation_split` must be set" + ): + _ = text_dataset.text_dataset_from_directory( + directory, validation_split=0, subset="training" + ) + + with self.assertRaisesRegex(ValueError, "must provide a `seed`"): + _ = text_dataset.text_dataset_from_directory( + directory, validation_split=0.2, subset="training" + ) + + def test_text_dataset_from_directory_not_batched(self): + directory = self._prepare_directory() + dataset = text_dataset.text_dataset_from_directory( + directory, batch_size=None, label_mode=None, follow_links=True + ) + + sample = next(iter(dataset)) + self.assertEqual(len(sample.shape), 0) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/tf_contextlib.py b/keras/utils/tf_contextlib.py index 73103e7996ba..d988badaaf55 100644 --- a/keras/utils/tf_contextlib.py +++ b/keras/utils/tf_contextlib.py @@ -14,20 +14,22 @@ # ============================================================================== """TFDecorator-aware replacements for the contextlib module.""" -import tensorflow.compat.v2 as tf - import contextlib as _contextlib +import tensorflow.compat.v2 as tf + def contextmanager(target): - """A tf_decorator-aware wrapper for `contextlib.contextmanager`. + """A tf_decorator-aware wrapper for `contextlib.contextmanager`. - Usage is identical to `contextlib.contextmanager`. + Usage is identical to `contextlib.contextmanager`. - Args: - target: A callable to be wrapped in a contextmanager. - Returns: - A callable that can be used inside of a `with` statement. - """ - context_manager = _contextlib.contextmanager(target) - return tf.__internal__.decorator.make_decorator(target, context_manager, 'contextmanager') + Args: + target: A callable to be wrapped in a contextmanager. + Returns: + A callable that can be used inside of a `with` statement. + """ + context_manager = _contextlib.contextmanager(target) + return tf.__internal__.decorator.make_decorator( + target, context_manager, "contextmanager" + ) diff --git a/keras/utils/tf_inspect.py b/keras/utils/tf_inspect.py index c69ece159490..d9ea152cd278 100644 --- a/keras/utils/tf_inspect.py +++ b/keras/utils/tf_inspect.py @@ -13,390 +13,430 @@ # limitations under the License. 
# ============================================================================== """TFDecorator-aware replacements for the inspect module.""" -# pylint: disable=g-classes-have-attributes -import tensorflow.compat.v2 as tf - import collections import functools import inspect as _inspect -ArgSpec = _inspect.ArgSpec - +import tensorflow.compat.v2 as tf -if hasattr(_inspect, 'FullArgSpec'): - FullArgSpec = _inspect.FullArgSpec # pylint: disable=invalid-name +if hasattr(_inspect, "ArgSpec"): + ArgSpec = _inspect.ArgSpec +else: + ArgSpec = collections.namedtuple( + "ArgSpec", + [ + "args", + "varargs", + "keywords", + "defaults", + ], + ) + +if hasattr(_inspect, "FullArgSpec"): + FullArgSpec = _inspect.FullArgSpec else: - FullArgSpec = collections.namedtuple('FullArgSpec', [ - 'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults', - 'annotations' - ]) + FullArgSpec = collections.namedtuple( + "FullArgSpec", + [ + "args", + "varargs", + "varkw", + "defaults", + "kwonlyargs", + "kwonlydefaults", + "annotations", + ], + ) def _convert_maybe_argspec_to_fullargspec(argspec): - if isinstance(argspec, FullArgSpec): - return argspec - return FullArgSpec( - args=argspec.args, - varargs=argspec.varargs, - varkw=argspec.keywords, - defaults=argspec.defaults, - kwonlyargs=[], - kwonlydefaults=None, - annotations={}) + if isinstance(argspec, FullArgSpec): + return argspec + return FullArgSpec( + args=argspec.args, + varargs=argspec.varargs, + varkw=argspec.keywords, + defaults=argspec.defaults, + kwonlyargs=[], + kwonlydefaults=None, + annotations={}, + ) + + +if hasattr(_inspect, "getfullargspec"): + _getfullargspec = _inspect.getfullargspec + + def _getargspec(target): + """A python3 version of getargspec. + + Calls `getfullargspec` and assigns args, varargs, + varkw, and defaults to a python 2/3 compatible `ArgSpec`. + + The parameter name 'varkw' is changed to 'keywords' to fit the + `ArgSpec` struct. + + Args: + target: the target object to inspect. + + Returns: + An ArgSpec with args, varargs, keywords, and defaults parameters + from FullArgSpec. + """ + fullargspecs = getfullargspec(target) + argspecs = ArgSpec( + args=fullargspecs.args, + varargs=fullargspecs.varargs, + keywords=fullargspecs.varkw, + defaults=fullargspecs.defaults, + ) + return argspecs -if hasattr(_inspect, 'getfullargspec'): - _getfullargspec = _inspect.getfullargspec # pylint: disable=invalid-name +else: + _getargspec = _inspect.getargspec - def _getargspec(target): - """A python3 version of getargspec. + def _getfullargspec(target): + """A python2 version of getfullargspec. - Calls `getfullargspec` and assigns args, varargs, - varkw, and defaults to a python 2/3 compatible `ArgSpec`. + Args: + target: the target object to inspect. - The parameter name 'varkw' is changed to 'keywords' to fit the - `ArgSpec` struct. + Returns: + A FullArgSpec with empty kwonlyargs, kwonlydefaults and annotations. + """ + return _convert_maybe_argspec_to_fullargspec(getargspec(target)) - Args: - target: the target object to inspect. - Returns: - An ArgSpec with args, varargs, keywords, and defaults parameters - from FullArgSpec. 
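The `hasattr` guards above matter because `inspect.ArgSpec` and `inspect.getargspec` were removed in Python 3.11. A stdlib-only sketch of the same down-conversion the surrounding code performs (the namedtuple mirrors the fallback definition above):

```python
import collections
import inspect

ArgSpec = collections.namedtuple(
    "ArgSpec", ["args", "varargs", "keywords", "defaults"]
)

def demo(a, b=1, *rest, **extra):
    return a + b

full = inspect.getfullargspec(demo)
# FullArgSpec names the **kwargs slot 'varkw'; the legacy struct calls it
# 'keywords', hence the rename during conversion.
legacy = ArgSpec(full.args, full.varargs, full.varkw, full.defaults)
assert legacy == ArgSpec(["a", "b"], "rest", "extra", (1,))
```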
- """ - fullargspecs = getfullargspec(target) - argspecs = ArgSpec( - args=fullargspecs.args, - varargs=fullargspecs.varargs, - keywords=fullargspecs.varkw, - defaults=fullargspecs.defaults) - return argspecs -else: - _getargspec = _inspect.getargspec +def currentframe(): + """TFDecorator-aware replacement for inspect.currentframe.""" + return _inspect.stack()[1][0] + - def _getfullargspec(target): - """A python2 version of getfullargspec. +def getargspec(obj): + """TFDecorator-aware replacement for `inspect.getargspec`. + + Note: `getfullargspec` is recommended as the python 2/3 compatible + replacement for this function. Args: - target: the target object to inspect. + obj: A function, partial function, or callable object, possibly decorated. Returns: - A FullArgSpec with empty kwonlyargs, kwonlydefaults and annotations. + The `ArgSpec` that describes the signature of the outermost decorator that + changes the callable's signature, or the `ArgSpec` that describes + the object if not decorated. + + Raises: + ValueError: When callable's signature can not be expressed with + ArgSpec. + TypeError: For objects of unsupported types. """ - return _convert_maybe_argspec_to_fullargspec(getargspec(target)) - + if isinstance(obj, functools.partial): + return _get_argspec_for_partial(obj) + + decorators, target = tf.__internal__.decorator.unwrap(obj) + + spec = next( + ( + d.decorator_argspec + for d in decorators + if d.decorator_argspec is not None + ), + None, + ) + if spec: + return spec -def currentframe(): - """TFDecorator-aware replacement for inspect.currentframe.""" - return _inspect.stack()[1][0] - - -def getargspec(obj): - """TFDecorator-aware replacement for `inspect.getargspec`. - - Note: `getfullargspec` is recommended as the python 2/3 compatible - replacement for this function. - - Args: - obj: A function, partial function, or callable object, possibly decorated. - - Returns: - The `ArgSpec` that describes the signature of the outermost decorator that - changes the callable's signature, or the `ArgSpec` that describes - the object if not decorated. - - Raises: - ValueError: When callable's signature can not be expressed with - ArgSpec. - TypeError: For objects of unsupported types. - """ - if isinstance(obj, functools.partial): - return _get_argspec_for_partial(obj) - - decorators, target = tf.__internal__.decorator.unwrap(obj) - - spec = next((d.decorator_argspec - for d in decorators - if d.decorator_argspec is not None), None) - if spec: - return spec - - try: - # Python3 will handle most callables here (not partial). - return _getargspec(target) - except TypeError: - pass - - if isinstance(target, type): try: - return _getargspec(target.__init__) + # Python3 will handle most callables here (not partial). + return _getargspec(target) except TypeError: - pass + pass - try: - return _getargspec(target.__new__) - except TypeError: - pass + if isinstance(target, type): + try: + return _getargspec(target.__init__) + except TypeError: + pass + + try: + return _getargspec(target.__new__) + except TypeError: + pass - # The `type(target)` ensures that if a class is received we don't return - # the signature of its __call__ method. - return _getargspec(type(target).__call__) + # The `type(target)` ensures that if a class is received we don't return + # the signature of its __call__ method. + return _getargspec(type(target).__call__) def _get_argspec_for_partial(obj): - """Implements `getargspec` for `functools.partial` objects. 
- - Args: - obj: The `functools.partial` object - Returns: - An `inspect.ArgSpec` - Raises: - ValueError: When callable's signature can not be expressed with - ArgSpec. - """ - # When callable is a functools.partial object, we construct its ArgSpec with - # following strategy: - # - If callable partial contains default value for positional arguments (ie. - # object.args), then final ArgSpec doesn't contain those positional arguments. - # - If callable partial contains default value for keyword arguments (ie. - # object.keywords), then we merge them with wrapped target. Default values - # from callable partial takes precedence over those from wrapped target. - # - # However, there is a case where it is impossible to construct a valid - # ArgSpec. Python requires arguments that have no default values must be - # defined before those with default values. ArgSpec structure is only valid - # when this presumption holds true because default values are expressed as a - # tuple of values without keywords and they are always assumed to belong to - # last K arguments where K is number of default values present. - # - # Since functools.partial can give default value to any argument, this - # presumption may no longer hold in some cases. For example: - # - # def func(m, n): - # return 2 * m + n - # partialed = functools.partial(func, m=1) - # - # This example will result in m having a default value but n doesn't. This is - # usually not allowed in Python and can not be expressed in ArgSpec correctly. - # - # Thus, we must detect cases like this by finding first argument with default - # value and ensures all following arguments also have default values. When - # this is not true, a ValueError is raised. - - n_prune_args = len(obj.args) - partial_keywords = obj.keywords or {} - - args, varargs, keywords, defaults = getargspec(obj.func) - - # Pruning first n_prune_args arguments. - args = args[n_prune_args:] - - # Partial function may give default value to any argument, therefore length - # of default value list must be len(args) to allow each argument to - # potentially be given a default value. - no_default = object() - all_defaults = [no_default] * len(args) - - if defaults: - all_defaults[-len(defaults):] = defaults - - # Fill in default values provided by partial function in all_defaults. - for kw, default in partial_keywords.items(): - if kw in args: - idx = args.index(kw) - all_defaults[idx] = default - elif not keywords: - raise ValueError('Function does not have **kwargs parameter, but ' - 'contains an unknown partial keyword.') - - # Find first argument with default value set. - first_default = next( - (idx for idx, x in enumerate(all_defaults) if x is not no_default), None) - - # If no default values are found, return ArgSpec with defaults=None. - if first_default is None: - return ArgSpec(args, varargs, keywords, None) - - # Checks if all arguments have default value set after first one. - invalid_default_values = [ - args[i] for i, j in enumerate(all_defaults) - if j is no_default and i > first_default - ] - - if invalid_default_values: - raise ValueError(f'Some arguments {invalid_default_values} do not have ' - 'default value, but they are positioned after those with ' - 'default values. This can not be expressed with ArgSpec.') - - return ArgSpec(args, varargs, keywords, tuple(all_defaults[first_default:])) + """Implements `getargspec` for `functools.partial` objects. 
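The `m`/`n` case sketched in the comment above is easy to reproduce; `getargspec` rejects it because the trailing-defaults tuple of `ArgSpec` cannot encode a defaulted argument followed by a non-defaulted one:

```python
import functools

from keras.utils import tf_inspect

def func(m, n):
    return 2 * m + n

partialed = functools.partial(func, m=1)
try:
    tf_inspect.getargspec(partialed)
except ValueError as e:
    # "Some arguments ['n'] do not have default value, but they are
    # positioned after those with default values. ..."
    print(e)
```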
+ + Args: + obj: The `functools.partial` object + Returns: + An `inspect.ArgSpec` + Raises: + ValueError: When callable's signature can not be expressed with + ArgSpec. + """ + # When callable is a functools.partial object, we construct its ArgSpec with + # following strategy: + # - If callable partial contains default value for positional arguments (ie. + # object.args), then final ArgSpec doesn't contain those positional + # arguments. + # - If callable partial contains default value for keyword arguments (ie. + # object.keywords), then we merge them with wrapped target. Default values + # from callable partial takes precedence over those from wrapped target. + # + # However, there is a case where it is impossible to construct a valid + # ArgSpec. Python requires arguments that have no default values must be + # defined before those with default values. ArgSpec structure is only valid + # when this presumption holds true because default values are expressed as a + # tuple of values without keywords and they are always assumed to belong to + # last K arguments where K is number of default values present. + # + # Since functools.partial can give default value to any argument, this + # presumption may no longer hold in some cases. For example: + # + # def func(m, n): + # return 2 * m + n + # partialed = functools.partial(func, m=1) + # + # This example will result in m having a default value but n doesn't. This + # is usually not allowed in Python and can not be expressed in ArgSpec + # correctly. + # + # Thus, we must detect cases like this by finding first argument with + # default value and ensures all following arguments also have default + # values. When this is not true, a ValueError is raised. + + n_prune_args = len(obj.args) + partial_keywords = obj.keywords or {} + + args, varargs, keywords, defaults = getargspec(obj.func) + + # Pruning first n_prune_args arguments. + args = args[n_prune_args:] + + # Partial function may give default value to any argument, therefore length + # of default value list must be len(args) to allow each argument to + # potentially be given a default value. + no_default = object() + all_defaults = [no_default] * len(args) + + if defaults: + all_defaults[-len(defaults) :] = defaults + + # Fill in default values provided by partial function in all_defaults. + for kw, default in partial_keywords.items(): + if kw in args: + idx = args.index(kw) + all_defaults[idx] = default + elif not keywords: + raise ValueError( + "Function does not have **kwargs parameter, but " + "contains an unknown partial keyword." + ) + + # Find first argument with default value set. + first_default = next( + (idx for idx, x in enumerate(all_defaults) if x is not no_default), None + ) + + # If no default values are found, return ArgSpec with defaults=None. + if first_default is None: + return ArgSpec(args, varargs, keywords, None) + + # Checks if all arguments have default value set after first one. + invalid_default_values = [ + args[i] + for i, j in enumerate(all_defaults) + if j is no_default and i > first_default + ] + + if invalid_default_values: + raise ValueError( + f"Some arguments {invalid_default_values} do not have " + "default value, but they are positioned after those with " + "default values. This can not be expressed with ArgSpec." + ) + + return ArgSpec(args, varargs, keywords, tuple(all_defaults[first_default:])) def getfullargspec(obj): - """TFDecorator-aware replacement for `inspect.getfullargspec`. + """TFDecorator-aware replacement for `inspect.getfullargspec`. 
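For contrast, a representable partial: a pinned keyword is merged into the defaults tuple next to any defaults the wrapped function already had, exactly as the `all_defaults` bookkeeping above describes:

```python
import functools

from keras.utils import tf_inspect

def scale(x, factor, bias=0):
    return x * factor + bias

spec = tf_inspect.getargspec(functools.partial(scale, factor=3))
# `x` (no default) still precedes `factor` and `bias` (both defaulted),
# so the ArgSpec encoding holds.
assert spec.args == ["x", "factor", "bias"]
assert spec.defaults == (3, 0)
```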
- This wrapper emulates `inspect.getfullargspec` in[^)]* Python2. + This wrapper emulates `inspect.getfullargspec` in Python 2. - Args: - obj: A callable, possibly decorated. + Args: + obj: A callable, possibly decorated. - Returns: - The `FullArgSpec` that describes the signature of - the outermost decorator that changes the callable's signature. If the - callable is not decorated, `inspect.getfullargspec()` will be called - directly on the callable. - """ - decorators, target = tf.__internal__.decorator.unwrap(obj) + Returns: + The `FullArgSpec` that describes the signature of + the outermost decorator that changes the callable's signature. If the + callable is not decorated, `inspect.getfullargspec()` will be called + directly on the callable. + """ + decorators, target = tf.__internal__.decorator.unwrap(obj) - for d in decorators: - if d.decorator_argspec is not None: - return _convert_maybe_argspec_to_fullargspec(d.decorator_argspec) - return _getfullargspec(target) + for d in decorators: + if d.decorator_argspec is not None: + return _convert_maybe_argspec_to_fullargspec(d.decorator_argspec) + return _getfullargspec(target) def getcallargs(*func_and_positional, **named): - """TFDecorator-aware replacement for inspect.getcallargs. - - Args: - *func_and_positional: A callable, possibly decorated, followed by any - positional arguments that would be passed to `func`. - **named: The named argument dictionary that would be passed to `func`. - - Returns: - A dictionary mapping `func`'s named arguments to the values they would - receive if `func(*positional, **named)` were called. - - `getcallargs` will use the argspec from the outermost decorator that provides - it. If no attached decorators modify argspec, the final unwrapped target's - argspec will be used. - """ - func = func_and_positional[0] - positional = func_and_positional[1:] - argspec = getfullargspec(func) - call_args = named.copy() - this = getattr(func, 'im_self', None) or getattr(func, '__self__', None) - if ismethod(func) and this: - positional = (this,) + positional - remaining_positionals = [arg for arg in argspec.args if arg not in call_args] - call_args.update(dict(zip(remaining_positionals, positional))) - default_count = 0 if not argspec.defaults else len(argspec.defaults) - if default_count: - for arg, value in zip(argspec.args[-default_count:], argspec.defaults): - if arg not in call_args: - call_args[arg] = value - if argspec.kwonlydefaults is not None: - for k, v in argspec.kwonlydefaults.items(): - if k not in call_args: - call_args[k] = v - return call_args + """TFDecorator-aware replacement for inspect.getcallargs. + + Args: + *func_and_positional: A callable, possibly decorated, followed by any + positional arguments that would be passed to `func`. + **named: The named argument dictionary that would be passed to `func`. + + Returns: + A dictionary mapping `func`'s named arguments to the values they would + receive if `func(*positional, **named)` were called. + + `getcallargs` will use the argspec from the outermost decorator that + provides it. If no attached decorators modify argspec, the final unwrapped + target's argspec will be used.
+ """ + func = func_and_positional[0] + positional = func_and_positional[1:] + argspec = getfullargspec(func) + call_args = named.copy() + this = getattr(func, "im_self", None) or getattr(func, "__self__", None) + if ismethod(func) and this: + positional = (this,) + positional + remaining_positionals = [ + arg for arg in argspec.args if arg not in call_args + ] + call_args.update(dict(zip(remaining_positionals, positional))) + default_count = 0 if not argspec.defaults else len(argspec.defaults) + if default_count: + for arg, value in zip(argspec.args[-default_count:], argspec.defaults): + if arg not in call_args: + call_args[arg] = value + if argspec.kwonlydefaults is not None: + for k, v in argspec.kwonlydefaults.items(): + if k not in call_args: + call_args[k] = v + return call_args def getframeinfo(*args, **kwargs): - return _inspect.getframeinfo(*args, **kwargs) + return _inspect.getframeinfo(*args, **kwargs) def getdoc(obj): - """TFDecorator-aware replacement for inspect.getdoc. + """TFDecorator-aware replacement for inspect.getdoc. - Args: - obj: An object, possibly decorated. + Args: + obj: An object, possibly decorated. - Returns: - The docstring associated with the object. + Returns: + The docstring associated with the object. - The outermost-decorated object is intended to have the most complete - documentation, so the decorated parameter is not unwrapped. - """ - return _inspect.getdoc(obj) + The outermost-decorated object is intended to have the most complete + documentation, so the decorated parameter is not unwrapped. + """ + return _inspect.getdoc(obj) def getfile(obj): - """TFDecorator-aware replacement for inspect.getfile.""" - unwrapped_object = tf.__internal__.decorator.unwrap(obj)[1] + """TFDecorator-aware replacement for inspect.getfile.""" + unwrapped_object = tf.__internal__.decorator.unwrap(obj)[1] - # Work around for the case when object is a stack frame - # and only .pyc files are used. In this case, getfile - # might return incorrect path. So, we get the path from f_globals - # instead. - if (hasattr(unwrapped_object, 'f_globals') and - '__file__' in unwrapped_object.f_globals): - return unwrapped_object.f_globals['__file__'] - return _inspect.getfile(unwrapped_object) + # Work around for the case when object is a stack frame + # and only .pyc files are used. In this case, getfile + # might return incorrect path. So, we get the path from f_globals + # instead. 
+ if ( + hasattr(unwrapped_object, "f_globals") + and "__file__" in unwrapped_object.f_globals + ): + return unwrapped_object.f_globals["__file__"] + return _inspect.getfile(unwrapped_object) def getmembers(obj, predicate=None): - """TFDecorator-aware replacement for inspect.getmembers.""" - return _inspect.getmembers(obj, predicate) + """TFDecorator-aware replacement for inspect.getmembers.""" + return _inspect.getmembers(obj, predicate) def getmodule(obj): - """TFDecorator-aware replacement for inspect.getmodule.""" - return _inspect.getmodule(obj) + """TFDecorator-aware replacement for inspect.getmodule.""" + return _inspect.getmodule(obj) def getmro(cls): - """TFDecorator-aware replacement for inspect.getmro.""" - return _inspect.getmro(cls) + """TFDecorator-aware replacement for inspect.getmro.""" + return _inspect.getmro(cls) def getsource(obj): - """TFDecorator-aware replacement for inspect.getsource.""" - return _inspect.getsource(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.getsource.""" + return _inspect.getsource(tf.__internal__.decorator.unwrap(obj)[1]) def getsourcefile(obj): - """TFDecorator-aware replacement for inspect.getsourcefile.""" - return _inspect.getsourcefile(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.getsourcefile.""" + return _inspect.getsourcefile(tf.__internal__.decorator.unwrap(obj)[1]) def getsourcelines(obj): - """TFDecorator-aware replacement for inspect.getsourcelines.""" - return _inspect.getsourcelines(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.getsourcelines.""" + return _inspect.getsourcelines(tf.__internal__.decorator.unwrap(obj)[1]) def isbuiltin(obj): - """TFDecorator-aware replacement for inspect.isbuiltin.""" - return _inspect.isbuiltin(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.isbuiltin.""" + return _inspect.isbuiltin(tf.__internal__.decorator.unwrap(obj)[1]) def isclass(obj): - """TFDecorator-aware replacement for inspect.isclass.""" - return _inspect.isclass(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.isclass.""" + return _inspect.isclass(tf.__internal__.decorator.unwrap(obj)[1]) def isfunction(obj): - """TFDecorator-aware replacement for inspect.isfunction.""" - return _inspect.isfunction(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.isfunction.""" + return _inspect.isfunction(tf.__internal__.decorator.unwrap(obj)[1]) def isframe(obj): - """TFDecorator-aware replacement for inspect.ismodule.""" - return _inspect.isframe(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.ismodule.""" + return _inspect.isframe(tf.__internal__.decorator.unwrap(obj)[1]) def isgenerator(obj): - """TFDecorator-aware replacement for inspect.isgenerator.""" - return _inspect.isgenerator(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.isgenerator.""" + return _inspect.isgenerator(tf.__internal__.decorator.unwrap(obj)[1]) def isgeneratorfunction(obj): - """TFDecorator-aware replacement for inspect.isgeneratorfunction.""" - return _inspect.isgeneratorfunction(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.isgeneratorfunction.""" + return _inspect.isgeneratorfunction( + tf.__internal__.decorator.unwrap(obj)[1] + ) def ismethod(obj): - """TFDecorator-aware replacement for 
inspect.ismethod.""" - return _inspect.ismethod(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.ismethod.""" + return _inspect.ismethod(tf.__internal__.decorator.unwrap(obj)[1]) def ismodule(obj): - """TFDecorator-aware replacement for inspect.ismodule.""" - return _inspect.ismodule(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.ismodule.""" + return _inspect.ismodule(tf.__internal__.decorator.unwrap(obj)[1]) def isroutine(obj): - """TFDecorator-aware replacement for inspect.isroutine.""" - return _inspect.isroutine(tf.__internal__.decorator.unwrap(obj)[1]) + """TFDecorator-aware replacement for inspect.isroutine.""" + return _inspect.isroutine(tf.__internal__.decorator.unwrap(obj)[1]) def stack(context=1): - """TFDecorator-aware replacement for inspect.stack.""" - return _inspect.stack(context)[1:] + """TFDecorator-aware replacement for inspect.stack.""" + return _inspect.stack(context)[1:] diff --git a/keras/utils/tf_utils.py b/keras/utils/tf_utils.py index f9e7d807ffc7..2ca549e0cdfe 100644 --- a/keras/utils/tf_utils.py +++ b/keras/utils/tf_utils.py @@ -15,607 +15,741 @@ """TensorFlow-related utilities.""" import collections +import contextlib import copy +import platform import random +import threading + +import numpy as np +import tensorflow.compat.v2 as tf +from absl import logging from keras import backend from keras.engine import keras_tensor from keras.utils import object_identity from keras.utils import tf_contextlib -import numpy as np - -import tensorflow.compat.v2 as tf - +# isort: off from tensorflow.python.framework import ops from tensorflow.python.util.tf_export import keras_export +from tensorflow.python import pywrap_tfe - -@keras_export('keras.utils.set_random_seed', v1=[]) +@keras_export("keras.utils.set_random_seed", v1=[]) def set_random_seed(seed): - """Sets all random seeds for the program (Python, NumPy, and TensorFlow). - - You can use this utility to make almost any Keras program fully deterministic. - Some limitations apply in cases where network communications are involved - (e.g. parameter server distribution), which creates additional sources of - randomness, or when certain non-deterministic cuDNN ops are involved. - - Calling this utility is equivalent to the following: - - ```python - import random - import numpy as np - import tensorflow as tf - random.seed(seed) - np.random.seed(seed) - tf.random.set_seed(seed) - ``` - - Arguments: - seed: Integer, the random seed to use. - """ - if not isinstance(seed, int): - raise ValueError( - 'Expected `seed` argument to be an integer. ' - f'Received: seed={seed} (of type {type(seed)})') - random.seed(seed) - np.random.seed(seed) - tf.random.set_seed(seed) - backend._SEED_GENERATOR.generator = random.Random(seed) # pylint:disable=protected-access + """Sets all random seeds for the program (Python, NumPy, and TensorFlow). + + You can use this utility to make almost any Keras program fully + deterministic. Some limitations apply in cases where network communications + are involved (e.g. parameter server distribution), which creates additional + sources of randomness, or when certain non-deterministic cuDNN ops are + involved. + + Calling this utility is equivalent to the following: + + ```python + import random + import numpy as np + import tensorflow as tf + random.seed(seed) + np.random.seed(seed) + tf.random.set_seed(seed) + ``` + + Arguments: + seed: Integer, the random seed to use. 
+ """ + if not isinstance(seed, int): + raise ValueError( + "Expected `seed` argument to be an integer. " + f"Received: seed={seed} (of type {type(seed)})" + ) + random.seed(seed) + np.random.seed(seed) + tf.random.set_seed(seed) + backend._SEED_GENERATOR.generator = random.Random(seed) + + +def get_random_seed(): + """Retrieve a seed value to seed a random generator. + + Returns: + the random seed as an integer. + """ + if getattr(backend._SEED_GENERATOR, "generator", None): + return backend._SEED_GENERATOR.generator.randint(1, 1e9) + else: + return random.randint(1, 1e9) def is_tensor_or_tensor_list(v): - v = tf.nest.flatten(v) - if v and isinstance(v[0], tf.Tensor): - return True - else: - return False + v = tf.nest.flatten(v) + if v and isinstance(v[0], tf.Tensor): + return True + else: + return False def get_reachable_from_inputs(inputs, targets=None): - """Returns the set of tensors/ops reachable from `inputs`. - - Stops if all targets have been found (target is optional). - - Only valid in Symbolic mode, not Eager mode. - - Args: - inputs: List of tensors. - targets: List of tensors. - - Returns: - A set of tensors reachable from the inputs (includes the inputs themselves). - """ - inputs = tf.nest.flatten(inputs, expand_composites=True) - reachable = object_identity.ObjectIdentitySet(inputs) - if targets: - remaining_targets = object_identity.ObjectIdentitySet(tf.nest.flatten(targets)) - queue = collections.deque(inputs) - - while queue: - x = queue.pop() - if isinstance(x, tuple(_user_convertible_tensor_types)): - # Can't find consumers of user-specific types. - continue - - if isinstance(x, tf.Operation): - outputs = x.outputs[:] or [] - outputs += x._control_outputs # pylint: disable=protected-access - elif isinstance(x, tf.Variable): - try: - outputs = [x.op] - except AttributeError: - # Variables can be created in an Eager context. - outputs = [] - elif tf.is_tensor(x): - outputs = x.consumers() - else: - raise TypeError( - f'Expected tf.Operation, tf.Variable, or tf.Tensor. Received: {x}') - - for y in outputs: - if y not in reachable: - reachable.add(y) - if targets: - remaining_targets.discard(y) - queue.appendleft(y) + """Returns the set of tensors/ops reachable from `inputs`. + + Stops if all targets have been found (target is optional). + + Only valid in Symbolic mode, not Eager mode. + + Args: + inputs: List of tensors. + targets: List of tensors. + + Returns: + A set of tensors reachable from the inputs (includes the inputs + themselves). + """ + inputs = tf.nest.flatten(inputs, expand_composites=True) + reachable = object_identity.ObjectIdentitySet(inputs) + if targets: + remaining_targets = object_identity.ObjectIdentitySet( + tf.nest.flatten(targets) + ) + queue = collections.deque(inputs) + + while queue: + x = queue.pop() + if isinstance(x, tuple(_user_convertible_tensor_types)): + # Can't find consumers of user-specific types. + continue + + if isinstance(x, tf.Operation): + outputs = x.outputs[:] or [] + outputs += x._control_outputs + elif isinstance(x, tf.Variable): + try: + outputs = [x.op] + except AttributeError: + # Variables can be created in an Eager context. + outputs = [] + elif tf.is_tensor(x): + outputs = x.consumers() + else: + raise TypeError( + "Expected tf.Operation, tf.Variable, or tf.Tensor. 
" + f"Received: {x}" + ) + + for y in outputs: + if y not in reachable: + reachable.add(y) + if targets: + remaining_targets.discard(y) + queue.appendleft(y) + + if targets and not remaining_targets: + return reachable + + return reachable - if targets and not remaining_targets: - return reachable - return reachable +# This function needs access to private functions of `nest`. -# This function needs access to private functions of `nest`. -# pylint: disable=protected-access def map_structure_with_atomic(is_atomic_fn, map_fn, nested): - """Maps the atomic elements of a nested structure. - - Args: - is_atomic_fn: A function that determines if an element of `nested` is - atomic. - map_fn: The function to apply to atomic elements of `nested`. - nested: A nested structure. - - Returns: - The nested structure, with atomic elements mapped according to `map_fn`. - - Raises: - ValueError: If an element that is neither atomic nor a sequence is - encountered. - """ - if is_atomic_fn(nested): - return map_fn(nested) - - # Recursively convert. - if not tf.nest.is_nested(nested): - raise ValueError( - f'Received non-atomic and non-sequence element: {nested} ' - f'of type {type(nested)}') - if tf.__internal__.nest.is_mapping(nested): - values = [nested[k] for k in sorted(nested.keys())] - elif tf.__internal__.nest.is_attrs(nested): - values = _astuple(nested) - else: - values = nested - mapped_values = [ - map_structure_with_atomic(is_atomic_fn, map_fn, ele) for ele in values - ] - return tf.__internal__.nest.sequence_like(nested, mapped_values) + """Maps the atomic elements of a nested structure. + + Args: + is_atomic_fn: A function that determines if an element of `nested` is + atomic. + map_fn: The function to apply to atomic elements of `nested`. + nested: A nested structure. + + Returns: + The nested structure, with atomic elements mapped according to `map_fn`. + + Raises: + ValueError: If an element that is neither atomic nor a sequence is + encountered. + """ + if is_atomic_fn(nested): + return map_fn(nested) + + # Recursively convert. + if not tf.nest.is_nested(nested): + raise ValueError( + f"Received non-atomic and non-sequence element: {nested} " + f"of type {type(nested)}" + ) + if tf.__internal__.nest.is_mapping(nested): + values = [nested[k] for k in sorted(nested.keys())] + elif tf.__internal__.nest.is_attrs(nested): + values = _astuple(nested) + else: + values = nested + mapped_values = [ + map_structure_with_atomic(is_atomic_fn, map_fn, ele) for ele in values + ] + return tf.__internal__.nest.sequence_like(nested, mapped_values) def get_shapes(tensors): - """Gets shapes from tensors.""" - return tf.nest.map_structure( - lambda x: x.shape if hasattr(x, 'shape') else None, tensors) - - -# pylint: enable=protected-access + """Gets shapes from tensors.""" + return tf.nest.map_structure( + lambda x: x.shape if hasattr(x, "shape") else None, tensors + ) def convert_shapes(input_shape, to_tuples=True): - """Converts nested shape representations to desired format. - - Performs: - - TensorShapes -> tuples if `to_tuples=True`. - tuples of int or None -> TensorShapes if `to_tuples=False`. - - Valid objects to be converted are: - - TensorShapes - - tuples with elements of type int or None. - - ints - - None - - Args: - input_shape: A nested structure of objects to be converted to TensorShapes. - to_tuples: If `True`, converts all TensorShape to tuples. Otherwise converts - all tuples representing shapes to TensorShapes. - - Returns: - Nested structure of shapes in desired format. 
- - Raises: - ValueError: when the input tensor shape can't be converted to tuples, eg - unknown tensor shape. - """ - - def _is_shape_component(value): - return value is None or isinstance(value, (int, tf.compat.v1.Dimension)) - - def _is_atomic_shape(input_shape): - # Ex: TensorShape or (None, 10, 32) or 5 or `None` - if _is_shape_component(input_shape): - return True - if isinstance(input_shape, tf.TensorShape): - return True - if (isinstance(input_shape, (tuple, list)) and - all(_is_shape_component(ele) for ele in input_shape)): - return True - return False - - def _convert_shape(input_shape): - input_shape = tf.TensorShape(input_shape) - if to_tuples: - input_shape = tuple(input_shape.as_list()) - return input_shape - - return map_structure_with_atomic(_is_atomic_shape, _convert_shape, - input_shape) + """Converts nested shape representations to desired format. + + Performs: + + TensorShapes -> tuples if `to_tuples=True`. + tuples of int or None -> TensorShapes if `to_tuples=False`. + + Valid objects to be converted are: + - TensorShapes + - tuples with elements of type int or None. + - ints + - None + + Args: + input_shape: A nested structure of objects to be converted to + TensorShapes. + to_tuples: If `True`, converts all TensorShape to tuples. Otherwise + converts all tuples representing shapes to TensorShapes. + + Returns: + Nested structure of shapes in desired format. + + Raises: + ValueError: when the input tensor shape can't be converted to tuples, eg + unknown tensor shape. + """ + + def _is_shape_component(value): + return value is None or isinstance(value, (int, tf.compat.v1.Dimension)) + + def _is_atomic_shape(input_shape): + # Ex: TensorShape or (None, 10, 32) or 5 or `None` + if _is_shape_component(input_shape): + return True + if isinstance(input_shape, tf.TensorShape): + return True + if isinstance(input_shape, (tuple, list)) and all( + _is_shape_component(ele) for ele in input_shape + ): + return True + return False + + def _convert_shape(input_shape): + input_shape = tf.TensorShape(input_shape) + if to_tuples: + input_shape = tuple(input_shape.as_list()) + return input_shape + + return map_structure_with_atomic( + _is_atomic_shape, _convert_shape, input_shape + ) def validate_axis(axis, input_shape): - """Validate an axis value and returns its standardized form. - - Args: - axis: Value to validate. Can be an integer or a list/tuple of integers. - Integers may be negative. - input_shape: Reference input shape that the axis/axes refer to. - - Returns: - Normalized form of `axis`, i.e. a list with all-positive values. - """ - input_shape = tf.TensorShape(input_shape) - rank = input_shape.rank - if not rank: - raise ValueError( - f'Input has undefined rank. Received: input_shape={input_shape}') - - # Convert axis to list and resolve negatives - if isinstance(axis, int): - axis = [axis] - else: - axis = list(axis) - for idx, x in enumerate(axis): - if x < 0: - axis[idx] = rank + x - - # Validate axes - for x in axis: - if x < 0 or x >= rank: - raise ValueError( - 'Invalid value for `axis` argument. ' - 'Expected 0 <= axis < inputs.rank (with ' - f'inputs.rank={rank}). Received: axis={tuple(axis)}') - if len(axis) != len(set(axis)): - raise ValueError(f'Duplicate axis: {tuple(axis)}') - return axis + """Validate an axis value and returns its standardized form. + + Args: + axis: Value to validate. Can be an integer or a list/tuple of integers. + Integers may be negative. + input_shape: Reference input shape that the axis/axes refer to. 
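The round trip that `convert_shapes` above provides, as a sketch:

```python
import tensorflow as tf

from keras.utils import tf_utils

shapes = {"x": tf.TensorShape([None, 32]), "y": (None, 10, 3)}
as_tuples = tf_utils.convert_shapes(shapes, to_tuples=True)
assert as_tuples == {"x": (None, 32), "y": (None, 10, 3)}
# And back: every shape-like leaf becomes a TensorShape again.
as_shapes = tf_utils.convert_shapes(as_tuples, to_tuples=False)
assert as_shapes["x"] == tf.TensorShape([None, 32])
```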
+ + Returns: + Normalized form of `axis`, i.e. a list with all-positive values. + """ + input_shape = tf.TensorShape(input_shape) + rank = input_shape.rank + if not rank: + raise ValueError( + f"Input has undefined rank. Received: input_shape={input_shape}" + ) + + # Convert axis to list and resolve negatives + if isinstance(axis, int): + axis = [axis] + else: + axis = list(axis) + for idx, x in enumerate(axis): + if x < 0: + axis[idx] = rank + x + + # Validate axes + for x in axis: + if x < 0 or x >= rank: + raise ValueError( + "Invalid value for `axis` argument. " + "Expected 0 <= axis < inputs.rank (with " + f"inputs.rank={rank}). Received: axis={tuple(axis)}" + ) + if len(axis) != len(set(axis)): + raise ValueError(f"Duplicate axis: {tuple(axis)}") + return axis class ListWrapper: - """A wrapper for lists to be treated as elements for `nest`.""" + """A wrapper for lists to be treated as elements for `nest`.""" - def __init__(self, list_to_wrap): - self._list = list_to_wrap + def __init__(self, list_to_wrap): + self._list = list_to_wrap - def as_list(self): - return self._list + def as_list(self): + return self._list def convert_inner_node_data(nested, wrap=False): - """Either wraps or unwraps innermost node data lists in `ListWrapper` objects. - - Args: - nested: A nested data structure. - wrap: If `True`, wrap innermost lists in `ListWrapper` objects. If `False`, - unwraps `ListWrapper` objects into lists. - - Returns: - Structure of same type as nested, with lists wrapped/unwrapped. - """ - - def _is_serialized_node_data(nested): - # Node data can be of form `[layer_name, node_id, tensor_id]` or - # `[layer_name, node_id, tensor_id, kwargs]`. - if (isinstance(nested, list) and (len(nested) in [3, 4]) and - isinstance(nested[0], str)): - return True - return False - - def _is_atomic_nested(nested): - """Returns `True` if `nested` is a list representing node data.""" - if isinstance(nested, ListWrapper): - return True - if _is_serialized_node_data(nested): - return True - return not tf.nest.is_nested(nested) - - def _convert_object_or_list(nested): - """Convert b/t `ListWrapper` object and list representations.""" - if wrap: - if isinstance(nested, ListWrapper): - return nested - if _is_serialized_node_data(nested): - return ListWrapper(nested) - return nested - else: - if isinstance(nested, ListWrapper): - return nested.as_list() - return nested - - return map_structure_with_atomic(_is_atomic_nested, _convert_object_or_list, - nested) + """Either wraps or unwraps innermost node data lists in `ListWrapper` + objects. + + Args: + nested: A nested data structure. + wrap: If `True`, wrap innermost lists in `ListWrapper` objects. If + `False`, unwraps `ListWrapper` objects into lists. + + Returns: + Structure of same type as nested, with lists wrapped/unwrapped. + """ + + def _is_serialized_node_data(nested): + # Node data can be of form `[layer_name, node_id, tensor_id]` or + # `[layer_name, node_id, tensor_id, kwargs]`. 
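`validate_axis` above in action, as a short sketch: negative axes resolve against the rank, while out-of-range or duplicate axes raise `ValueError`:

```python
from keras.utils import tf_utils

assert tf_utils.validate_axis(-1, (None, 28, 28, 3)) == [3]
assert tf_utils.validate_axis([1, -1], (None, 4, 8)) == [1, 2]
```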
+ if ( + isinstance(nested, list) + and (len(nested) in [3, 4]) + and isinstance(nested[0], str) + ): + return True + return False + + def _is_atomic_nested(nested): + """Returns `True` if `nested` is a list representing node data.""" + if isinstance(nested, ListWrapper): + return True + if _is_serialized_node_data(nested): + return True + return not tf.nest.is_nested(nested) + + def _convert_object_or_list(nested): + """Convert b/t `ListWrapper` object and list representations.""" + if wrap: + if isinstance(nested, ListWrapper): + return nested + if _is_serialized_node_data(nested): + return ListWrapper(nested) + return nested + else: + if isinstance(nested, ListWrapper): + return nested.as_list() + return nested + + return map_structure_with_atomic( + _is_atomic_nested, _convert_object_or_list, nested + ) def shape_type_conversion(fn): - """Decorator that handles tuple/TensorShape conversion. + """Decorator that handles tuple/TensorShape conversion. - Used in `compute_output_shape` and `build`. + Used in `compute_output_shape` and `build`. - Args: - fn: function to wrap. + Args: + fn: function to wrap. - Returns: - Wrapped function. - """ + Returns: + Wrapped function. + """ - def wrapper(instance, input_shape): - # Pass shapes as tuples to `fn` - # This preserves compatibility with external Keras. - if input_shape is not None: - input_shape = convert_shapes(input_shape, to_tuples=True) - output_shape = fn(instance, input_shape) - # Return shapes from `fn` as TensorShapes. - if output_shape is not None: - output_shape = convert_shapes(output_shape, to_tuples=False) - return output_shape + def wrapper(instance, input_shape): + # Pass shapes as tuples to `fn` + # This preserves compatibility with external Keras. + if input_shape is not None: + input_shape = convert_shapes(input_shape, to_tuples=True) + output_shape = fn(instance, input_shape) + # Return shapes from `fn` as TensorShapes. + if output_shape is not None: + output_shape = convert_shapes(output_shape, to_tuples=False) + return output_shape - return wrapper + return wrapper def are_all_symbolic_tensors(tensors): - return all(map(is_symbolic_tensor, tensors)) + return all(map(is_symbolic_tensor, tensors)) _user_convertible_tensor_types = set() def is_extension_type(tensor): - """Returns whether a tensor is of an ExtensionType. + """Returns whether a tensor is of an ExtensionType. - github.com/tensorflow/community/pull/269 - Currently it works by checking if `tensor` is a `CompositeTensor` instance, - but this will be changed to use an appropriate extensiontype protocol - check once ExtensionType is made public. + github.com/tensorflow/community/pull/269 + Currently it works by checking if `tensor` is a `CompositeTensor` instance, + but this will be changed to use an appropriate extensiontype protocol + check once ExtensionType is made public. - Args: - tensor: An object to test + Args: + tensor: An object to test - Returns: - True if the tensor is an extension type object, false if not. - """ - return isinstance(tensor, tf.__internal__.CompositeTensor) + Returns: + True if the tensor is an extension type object, false if not. + """ + return isinstance(tensor, tf.__internal__.CompositeTensor) def is_symbolic_tensor(tensor): - """Returns whether a tensor is symbolic (from a TF graph) or an eager tensor. - - A Variable can be seen as either: it is considered symbolic - when we are in a graph scope, and eager when we are in an eager scope. - - Args: - tensor: A tensor instance to test. 
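What `shape_type_conversion` above buys a layer author, sketched on a hypothetical `Doubler` class: the wrapped `compute_output_shape` works purely with tuples, while callers still receive `TensorShape`s:

```python
import tensorflow as tf

from keras.utils import tf_utils

class Doubler:
    @tf_utils.shape_type_conversion
    def compute_output_shape(self, input_shape):
        # Arrives as a plain tuple, e.g. (None, 8).
        return input_shape[:-1] + (2 * input_shape[-1],)

out = Doubler().compute_output_shape(tf.TensorShape([None, 8]))
assert out == tf.TensorShape([None, 16])  # returned as a TensorShape
```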
- - Returns: - True for symbolic tensors, False for eager tensors. - """ - if isinstance(tensor, tf.Tensor): - return hasattr(tensor, 'graph') - elif is_extension_type(tensor): - component_tensors = tf.nest.flatten(tensor, expand_composites=True) - return any(hasattr(t, 'graph') for t in component_tensors) - elif isinstance(tensor, tf.Variable): - # Variables that are output of a Keras Layer in Functional API mode - # should be considered symbolic. - # TODO(omalleyt): We need a better way to check this in order to - # enable `run_eagerly=True` for Models containing Layers that - # return Variables as outputs. - return (getattr(tensor, '_keras_history', False) or - not tf.executing_eagerly()) - elif isinstance(tensor, tuple(_user_convertible_tensor_types)): - tensor = ops.convert_to_tensor_or_composite(tensor) - return is_symbolic_tensor(tensor) - else: - return False - - -@keras_export('keras.__internal__.utils.register_symbolic_tensor_type', v1=[]) + """Returns whether a tensor is symbolic (from a TF graph) or an eager + tensor. + + A Variable can be seen as either: it is considered symbolic + when we are in a graph scope, and eager when we are in an eager scope. + + Args: + tensor: A tensor instance to test. + + Returns: + True for symbolic tensors, False for eager tensors. + """ + if isinstance(tensor, tf.Tensor): + return hasattr(tensor, "graph") + elif is_extension_type(tensor): + component_tensors = tf.nest.flatten(tensor, expand_composites=True) + return any(hasattr(t, "graph") for t in component_tensors) + elif isinstance(tensor, tf.Variable): + # Variables that are output of a Keras Layer in Functional API mode + # should be considered symbolic. + # TODO(omalleyt): We need a better way to check this in order to + # enable `run_eagerly=True` for Models containing Layers that + # return Variables as outputs. + return ( + getattr(tensor, "_keras_history", False) + or not tf.executing_eagerly() + ) + elif isinstance(tensor, tuple(_user_convertible_tensor_types)): + tensor = ops.convert_to_tensor_or_composite(tensor) + return is_symbolic_tensor(tensor) + else: + return False + + +@keras_export("keras.__internal__.utils.register_symbolic_tensor_type", v1=[]) def register_symbolic_tensor_type(cls): - """Allows users to specify types regarded as symbolic `Tensor`s. + """Allows users to specify types regarded as symbolic `Tensor`s. - Used in conjunction with `tf.register_tensor_conversion_function`, calling - `tf.keras.__internal__.utils.register_symbolic_tensor_type(cls)` - allows non-`Tensor` objects to be plumbed through Keras layers. + Used in conjunction with `tf.register_tensor_conversion_function`, calling + `tf.keras.__internal__.utils.register_symbolic_tensor_type(cls)` + allows non-`Tensor` objects to be plumbed through Keras layers. - Example: + Example: - ```python - # One-time setup. - class Foo: - def __init__(self, input_): - self._input = input_ - def value(self): - return tf.constant(42.) + ```python + # One-time setup. + class Foo: + def __init__(self, input_): + self._input = input_ + def value(self): + return tf.constant(42.) - tf.register_tensor_conversion_function( - Foo, lambda x, *args, **kwargs: x.value()) + tf.register_tensor_conversion_function( + Foo, lambda x, *args, **kwargs: x.value()) - tf.keras.__internal__.utils.register_symbolic_tensor_type(Foo) + tf.keras.__internal__.utils.register_symbolic_tensor_type(Foo) - # User-land. - layer = tf.keras.layers.Lambda(lambda input_: Foo(input_)) - ``` + # User-land. 
+ layer = tf.keras.layers.Lambda(lambda input_: Foo(input_)) + ``` - Args: - cls: A `class` type which shall be regarded as a symbolic `Tensor`. - """ - global _user_convertible_tensor_types - if cls not in _user_convertible_tensor_types: - keras_tensor.register_keras_tensor_specialization( - cls, keras_tensor.UserRegisteredTypeKerasTensor) - _user_convertible_tensor_types.add(cls) + Args: + cls: A `class` type which shall be regarded as a symbolic `Tensor`. + """ + global _user_convertible_tensor_types + if cls not in _user_convertible_tensor_types: + keras_tensor.register_keras_tensor_specialization( + cls, keras_tensor.UserRegisteredTypeKerasTensor + ) + _user_convertible_tensor_types.add(cls) def type_spec_from_value(value): - """Grab type_spec without converting array-likes to tensors.""" - if is_extension_type(value): - return value._type_spec # pylint: disable=protected-access - # Get a TensorSpec for array-like data without - # converting the data to a Tensor - if hasattr(value, 'shape') and hasattr(value, 'dtype'): - return tf.TensorSpec(value.shape, value.dtype) - else: - return tf.type_spec_from_value(value) + """Grab type_spec without converting array-likes to tensors.""" + if is_extension_type(value): + return value._type_spec + # Get a TensorSpec for array-like data without + # converting the data to a Tensor + if hasattr(value, "shape") and hasattr(value, "dtype"): + return tf.TensorSpec(value.shape, value.dtype) + else: + return tf.type_spec_from_value(value) def is_ragged(tensor): - """Returns true if `tensor` is a ragged tensor or ragged tensor value.""" - return isinstance( - tensor, - (tf.RaggedTensor, tf.compat.v1.ragged.RaggedTensorValue)) + """Returns true if `tensor` is a ragged tensor or ragged tensor value.""" + return isinstance( + tensor, (tf.RaggedTensor, tf.compat.v1.ragged.RaggedTensorValue) + ) def is_sparse(tensor): - """Returns true if `tensor` is a sparse tensor or sparse tensor value.""" - return isinstance( - tensor, - (tf.SparseTensor, tf.compat.v1.SparseTensorValue)) + """Returns true if `tensor` is a sparse tensor or sparse tensor value.""" + return isinstance(tensor, (tf.SparseTensor, tf.compat.v1.SparseTensorValue)) def is_tensor_or_variable(x): - return tf.is_tensor(x) or isinstance(x, tf.Variable) + return tf.is_tensor(x) or isinstance(x, tf.Variable) def is_tensor_or_extension_type(x): - """Returns true if 'x' is a TF-native type or an ExtensionType.""" - return tf.is_tensor(x) or is_extension_type(x) + """Returns true if 'x' is a TF-native type or an ExtensionType.""" + return tf.is_tensor(x) or is_extension_type(x) + + +def convert_variables_to_tensors(values): + """Converts `Variable`s in `values` to `Tensor`s. + + This is a Keras version of `convert_variables_to_tensors` in TensorFlow + variable_utils.py. + + If an object in `values` is an `ExtensionType` and it overrides its + `_convert_variables_to_tensors` method, its `ResourceVariable` components + will also be converted to `Tensor`s. Objects other than `ResourceVariable`s + in `values` will be returned unchanged. + + Args: + values: A nested structure of `ResourceVariable`s, or any other objects. + + Returns: + A new structure with `ResourceVariable`s in `values` converted to + `Tensor`s. 
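`type_spec_from_value` above, sketched on a NumPy array: anything carrying `shape` and `dtype` yields a `TensorSpec` directly, skipping the Tensor conversion:

```python
import numpy as np

from keras.utils import tf_utils

spec = tf_utils.type_spec_from_value(np.zeros((2, 3), dtype="float32"))
print(spec)  # TensorSpec(shape=(2, 3), dtype=tf.float32, name=None)
```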
+ """ + + def _convert_resource_variable_to_tensor(x): + if isinstance(x, tf.Variable): + return tf.convert_to_tensor(x) + elif is_extension_type(x): + return x._convert_variables_to_tensors() + else: + return x + + return tf.nest.map_structure(_convert_resource_variable_to_tensor, values) def assert_no_legacy_layers(layers): - """Prevent tf.layers.Layers from being used with Keras. + """Prevent tf.layers.Layers from being used with Keras. - Certain legacy layers inherit from their keras analogs; however they are - not supported with keras and can lead to subtle and hard to diagnose bugs. + Certain legacy layers inherit from their keras analogs; however they are + not supported with keras and can lead to subtle and hard to diagnose bugs. - Args: - layers: A list of layers to check + Args: + layers: A list of layers to check - Raises: - TypeError: If any elements of layers are tf.layers.Layers - """ + Raises: + TypeError: If any elements of layers are tf.layers.Layers + """ - # isinstance check for tf.layers.Layer introduces a circular dependency. - legacy_layers = [l for l in layers if getattr(l, '_is_legacy_layer', None)] - if legacy_layers: - layer_str = '\n'.join(' ' + str(l) for l in legacy_layers) - raise TypeError( - f'The following are legacy tf.layers.Layers:\n{layer_str}\n' - 'To use keras as a ' - 'framework (for instance using the Network, Model, or Sequential ' - 'classes), please use the tf.keras.layers implementation instead. ' - '(Or, if writing custom layers, subclass from tf.keras.layers rather ' - 'than tf.layers)') + # isinstance check for tf.layers.Layer introduces a circular dependency. + legacy_layers = [l for l in layers if getattr(l, "_is_legacy_layer", None)] + if legacy_layers: + layer_str = "\n".join(" " + str(l) for l in legacy_layers) + raise TypeError( + f"The following are legacy tf.layers.Layers:\n{layer_str}\n" + "To use keras as a " + "framework (for instance using the Network, Model, or Sequential " + "classes), please use the tf.keras.layers implementation instead. " + "(Or, if writing custom layers, subclass from tf.keras.layers " + "rather than tf.layers)" + ) @tf_contextlib.contextmanager def maybe_init_scope(layer): - """Open an `init_scope` if in V2 mode and using the keras graph. - - Args: - layer: The Layer/Model that is currently active. - - Yields: - None - """ - # Don't open an init_scope in V1 mode or when using legacy tf.layers. - if (tf.compat.v1.executing_eagerly_outside_functions() and - getattr(layer, '_keras_style', True)): - with tf.init_scope(): - yield - else: - yield + """Open an `init_scope` if in V2 mode and using the keras graph. + + Args: + layer: The Layer/Model that is currently active. + + Yields: + None + """ + # Don't open an init_scope in V1 mode, when using legacy tf.layers, or in a + # local-variable scope. + # The local-variable scope should ensure that created variables are local to + # the function being executed, rather than lifted out of the graph by + # `init_scope`. This way the variables are freely usable and mutable within + # the function, which enables a visitation guarantee for model evaluation, + # when the scope is applied to metric variable creation. 
+ if ( + tf.compat.v1.executing_eagerly_outside_functions() + and getattr(layer, "_keras_style", True) + and not in_local_vars_context() + ): + with tf.init_scope(): + yield + else: + yield @tf_contextlib.contextmanager def graph_context_for_symbolic_tensors(*args, **kwargs): - """Returns graph context manager if any of the inputs is a symbolic tensor.""" - if any(is_symbolic_tensor(v) for v in list(args) + list(kwargs.values())): - with backend.get_graph().as_default(): - yield - else: - yield + """Returns graph context manager if any of the inputs is a symbolic + tensor.""" + if any(is_symbolic_tensor(v) for v in list(args) + list(kwargs.values())): + with backend.get_graph().as_default(): + yield + else: + yield def dataset_is_infinite(dataset): - """True if the passed dataset is infinite.""" - if tf.compat.v1.executing_eagerly_outside_functions(): - return tf.equal( - tf.data.experimental.cardinality(dataset), tf.data.experimental.INFINITE_CARDINALITY) - else: - dataset_size = backend.get_session().run( - tf.data.experimental.cardinality(dataset)) - return dataset_size == tf.data.experimental.INFINITE_CARDINALITY + """True if the passed dataset is infinite.""" + if tf.compat.v1.executing_eagerly_outside_functions(): + return tf.equal( + tf.data.experimental.cardinality(dataset), + tf.data.experimental.INFINITE_CARDINALITY, + ) + else: + dataset_size = backend.get_session().run( + tf.data.experimental.cardinality(dataset) + ) + return dataset_size == tf.data.experimental.INFINITE_CARDINALITY def get_tensor_spec(t, dynamic_batch=False, name=None): - """Returns a `TensorSpec` given a single `Tensor` or `TensorSpec`.""" - # pylint: disable=protected-access - if isinstance(t, tf.TypeSpec): - spec = t - elif is_extension_type(t): - # TODO(b/148821952): Should these specs have a name attr? - spec = t._type_spec - elif (hasattr(t, '_keras_history') and - hasattr(t._keras_history[0], '_type_spec')): - return t._keras_history[0]._type_spec - elif isinstance(t, keras_tensor.KerasTensor): - spec = t.type_spec - elif hasattr(t, 'shape') and hasattr(t, 'dtype'): - spec = tf.TensorSpec(shape=t.shape, dtype=t.dtype, name=name) - else: - return None # Allow non-Tensors to pass through. - # pylint: enable=protected-access - - if not dynamic_batch: - return spec - - shape = spec.shape - if shape.rank is None or shape.rank == 0: - return spec - - shape_list = shape.as_list() - shape_list[0] = None - # TODO(b/203201161) Remove this deepcopy one type_spec_with_shape has been - # updated to not mutate spec. - spec = copy.deepcopy(spec) - return keras_tensor.type_spec_with_shape(spec, tf.TensorShape(shape_list)) + """Returns a `TensorSpec` given a single `Tensor` or `TensorSpec`.""" + + if isinstance(t, tf.TypeSpec): + spec = t + elif is_extension_type(t): + # TODO(b/148821952): Should these specs have a name attr? + spec = t._type_spec + elif hasattr(t, "_keras_history") and hasattr( + t._keras_history[0], "_type_spec" + ): + return t._keras_history[0]._type_spec + elif isinstance(t, keras_tensor.KerasTensor): + spec = t.type_spec + elif hasattr(t, "shape") and hasattr(t, "dtype"): + spec = tf.TensorSpec(shape=t.shape, dtype=t.dtype, name=name) + else: + return None # Allow non-Tensors to pass through. + + if not dynamic_batch: + return spec + + shape = spec.shape + if shape.rank is None or shape.rank == 0: + return spec + + shape_list = shape.as_list() + shape_list[0] = None + # TODO(b/203201161) Remove this deepcopy one type_spec_with_shape has been + # updated to not mutate spec. 
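`dataset_is_infinite` above, sketched in eager mode (where it returns a boolean Tensor comparing the dataset's cardinality to `INFINITE_CARDINALITY`):

```python
import tensorflow as tf

from keras.utils import tf_utils

finite = tf.data.Dataset.range(5)
assert not bool(tf_utils.dataset_is_infinite(finite))
assert bool(tf_utils.dataset_is_infinite(finite.repeat()))
```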
+ spec = copy.deepcopy(spec) + return keras_tensor.type_spec_with_shape(spec, tf.TensorShape(shape_list)) def sync_to_numpy_or_python_type(tensors): - """Syncs and converts a structure of `Tensor`s to `NumPy` arrays or Python scalar types. + """Syncs and converts a structure of `Tensor`s to `NumPy` arrays or Python + scalar types. + + For each tensor, it calls `tensor.numpy()`. If the result is a scalar value, + it converts it to a Python type, such as a float or int, by calling + `result.item()`. + + Numpy scalars are converted, as Python types are often more convenient to + deal with. This is especially useful for bfloat16 Numpy scalars, which don't + support as many operations as other Numpy values. + + Async strategies (such as `TPUStrategy` and `ParameterServerStrategy`) are + forced to + sync during this process. + + Args: + tensors: A structure of tensors. + + Returns: + `tensors`, but scalar tensors are converted to Python types and non-scalar + tensors are converted to Numpy arrays. + """ + if isinstance(tensors, tf.distribute.experimental.coordinator.RemoteValue): + tensors = tensors.fetch() + if isinstance(tensors, list) and isinstance( + tensors[0], tf.distribute.experimental.coordinator.RemoteValue + ): + tensors = tf.nest.map_structure(lambda t: t.fetch(), tensors) + + def _to_single_numpy_or_python_type(t): + # Don't turn ragged or sparse tensors to NumPy. + if isinstance(t, tf.Tensor): + t = t.numpy() + # Strings, ragged and sparse tensors don't have .item(). Return them + # as-is. + if not isinstance(t, (np.ndarray, np.generic)): + return t + return t.item() if np.ndim(t) == 0 else t + + return tf.nest.map_structure(_to_single_numpy_or_python_type, tensors) - For each tensor, it calls `tensor.numpy()`. If the result is a scalar value, - it converts it to a Python type, such as a float or int, by calling - `result.item()`. - Numpy scalars are converted, as Python types are often more convenient to deal - with. This is especially useful for bfloat16 Numpy scalars, which don't - support as many operations as other Numpy values. +def _astuple(attrs): + """Converts the given attrs to tuple non-recursively.""" + cls = type(attrs) + fields = getattr(cls, "__attrs_attrs__", None) + if fields is None: + raise ValueError(f"{cls} is not an attrs-decorated class.") + values = [] + for field in fields: + values.append(getattr(attrs, field.name)) + return tuple(values) + + +def can_jit_compile(warn=False): + """Returns True if TensorFlow XLA is available for the platform.""" + if platform.system() == "Darwin" and "arm" in platform.processor().lower(): + if warn: + logging.warning( + "XLA (`jit_compile`) is not yet supported on Apple M1/M2 ARM " + "processors. Falling back to `jit_compile=False`." + ) + return False + if pywrap_tfe.TF_ListPluggablePhysicalDevices(): + if warn: + logging.warning( + "XLA (`jit_compile`) is not supported on your system. " + "Falling back to `jit_compile=False`." + ) + return False + return True - Async strategies (such as `TPUStrategy` and `ParameterServerStrategy`) are - forced to - sync during this process. - Args: - tensors: A structure of tensors. +_metric_local_vars_scope = threading.local() - Returns: - `tensors`, but scalar tensors are converted to Python types and non-scalar - tensors are converted to Numpy arrays. - """ - if isinstance(tensors, tf.distribute.experimental.coordinator.RemoteValue): - tensors = tensors.fetch() - def _to_single_numpy_or_python_type(t): - # Don't turn ragged or sparse tensors to NumPy. 
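The conversion rules of `sync_to_numpy_or_python_type` above, as a sketch: scalar tensors come back as Python scalars, non-scalar tensors as NumPy arrays:

```python
import tensorflow as tf

from keras.utils import tf_utils

results = {"loss": tf.constant(0.25), "preds": tf.constant([[1.0, 2.0]])}
out = tf_utils.sync_to_numpy_or_python_type(results)
assert isinstance(out["loss"], float)  # scalar -> Python float
assert out["preds"].shape == (1, 2)  # non-scalar -> NumPy array
```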
- if isinstance(t, tf.Tensor): - t = t.numpy() - # Strings, ragged and sparse tensors don't have .item(). Return them as-is. - if not isinstance(t, (np.ndarray, np.generic)): - return t - return t.item() if np.ndim(t) == 0 else t +def get_metric_local_vars_scope(): + try: + return _metric_local_vars_scope.current + except AttributeError: + return None - return tf.nest.map_structure(_to_single_numpy_or_python_type, tensors) +def in_local_vars_context(): + ctx = get_metric_local_vars_scope() + return ctx is not None -def _astuple(attrs): - """Converts the given attrs to tuple non-recursively.""" - cls = type(attrs) - fields = getattr(cls, '__attrs_attrs__', None) - if fields is None: - raise ValueError(f'{cls} is not an attrs-decorated class.') - values = [] - for field in fields: - values.append(getattr(attrs, field.name)) - return tuple(values) + +@contextlib.contextmanager +def with_metric_local_vars_scope(): + previous_scope = getattr(_metric_local_vars_scope, "current", None) + _metric_local_vars_scope.current = MetricLocalVarsScope() + yield + _metric_local_vars_scope.current = previous_scope + + +class MetricLocalVarsScope: + """Turn on local variable creation for Metrics. + + No functionality is needed here, it just exists to modulate Metric's + variable creation.""" diff --git a/keras/utils/tf_utils_test.py b/keras/utils/tf_utils_test.py index e02e3922f95b..023cd123f040 100644 --- a/keras/utils/tf_utils_test.py +++ b/keras/utils/tf_utils_test.py @@ -14,342 +14,475 @@ # ============================================================================== """Tests for Keras TF utils.""" +from unittest.mock import MagicMock +from unittest.mock import patch + +import numpy as np +import tensorflow.compat.v2 as tf from absl.testing import parameterized + import keras from keras.testing_infra import test_combinations from keras.utils import tf_utils -import numpy as np -import tensorflow.compat.v2 as tf try: - import attr # pylint:disable=g-import-not-at-top + import attr except ImportError: - attr = None + attr = None -@test_combinations.generate(test_combinations.combine(mode=['graph', 'eager'])) +@test_combinations.generate(test_combinations.combine(mode=["graph", "eager"])) class TestIsSymbolicTensor(tf.test.TestCase, parameterized.TestCase): - - def test_default_behavior(self): - if tf.executing_eagerly(): - self.assertFalse(tf_utils.is_symbolic_tensor( - tf.Variable(name='blah', initial_value=0.))) - self.assertFalse( - tf_utils.is_symbolic_tensor( - tf.convert_to_tensor(0.))) - self.assertFalse(tf_utils.is_symbolic_tensor( - tf.SparseTensor( - indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))) - else: - self.assertTrue(tf_utils.is_symbolic_tensor( - tf.Variable(name='blah', initial_value=0.))) - self.assertTrue( - tf_utils.is_symbolic_tensor( - tf.convert_to_tensor(0.))) - self.assertTrue(tf_utils.is_symbolic_tensor( - tf.SparseTensor( - indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))) - - def test_works_with_registered(self): - - class CustomClass: - - def value(self): - return tf.convert_to_tensor(42.) 
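The metric local-variable scope defined above is a plain thread-local flag; a minimal sketch of how it toggles (assuming the `tf_utils` names introduced in this patch):

```python
from keras.utils import tf_utils

assert not tf_utils.in_local_vars_context()
with tf_utils.with_metric_local_vars_scope():
    # `maybe_init_scope` checks this flag and skips `tf.init_scope`, so
    # metric variable creation stays in the local (inner) context.
    assert tf_utils.in_local_vars_context()
# The previous scope (here: none) is restored on exit.
assert not tf_utils.in_local_vars_context()
```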
- - tf.register_tensor_conversion_function( - CustomClass, lambda value, **_: value.value()) - - tf_utils.register_symbolic_tensor_type(CustomClass) - - if tf.executing_eagerly(): - self.assertFalse(tf_utils.is_symbolic_tensor( - tf.Variable(name='blah', initial_value=0.))) - self.assertFalse( - tf_utils.is_symbolic_tensor( - tf.convert_to_tensor(0.))) - self.assertFalse(tf_utils.is_symbolic_tensor( - tf.SparseTensor( - indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))) - self.assertFalse(tf_utils.is_symbolic_tensor(CustomClass())) - else: - self.assertTrue(tf_utils.is_symbolic_tensor( - tf.Variable(name='blah', initial_value=0.))) - self.assertTrue( - tf_utils.is_symbolic_tensor( - tf.convert_to_tensor(0.))) - self.assertTrue(tf_utils.is_symbolic_tensor( - tf.SparseTensor( - indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))) - self.assertTrue(tf_utils.is_symbolic_tensor(CustomClass())) - - def test_enables_nontensor_plumbing(self): - if tf.executing_eagerly(): - self.skipTest('`compile` functionality changed.') - # Setup. - - class Foo: - - def __init__(self, input_): - self._input = input_ - self.value = tf.convert_to_tensor([[42.]]) - - @property - def dtype(self): - return self.value.dtype - - tf.register_tensor_conversion_function( - Foo, lambda x, *args, **kwargs: x.value) - tf_utils.register_symbolic_tensor_type(Foo) - - class PlumbingLayer(keras.layers.Lambda): - - def __init__(self, fn, **kwargs): - def _fn(*fargs, **fkwargs): - d = fn(*fargs, **fkwargs) - x = tf.convert_to_tensor(d) - d.shape = x.shape - d.get_shape = x.get_shape - return d, x - super().__init__(_fn, **kwargs) - self._enter_dunder_call = False - - def __call__(self, inputs, *args, **kwargs): - self._enter_dunder_call = True - d, _ = super().__call__(inputs, *args, **kwargs) - self._enter_dunder_call = False - return d - - def call(self, inputs, *args, **kwargs): - d, v = super().call(inputs, *args, **kwargs) - if self._enter_dunder_call: - return d, v - return d - - # User-land. - model = keras.Sequential([ - keras.layers.InputLayer((1,)), - PlumbingLayer(Foo), # Makes a `Foo` object. - ]) - # Let's ensure Keras graph history is preserved by composing the models. - model = keras.Model(model.inputs, model(model.outputs)) - # Now we instantiate the model and verify we have a `Foo` object, not a - # `Tensor`. - y = model(tf.convert_to_tensor([[7.]])) - self.assertIsInstance(y, Foo) - # Confirm that (custom) loss sees `Foo` instance, not Tensor. - obtained_prediction_box = [None] - def custom_loss(y_obs, y_pred): - del y_obs - obtained_prediction_box[0] = y_pred - return y_pred - # Apparently `compile` calls the loss function enough to trigger the - # side-effect. 
- model.compile('SGD', loss=custom_loss) - self.assertIsInstance(obtained_prediction_box[0], Foo) + def test_default_behavior(self): + if tf.executing_eagerly(): + self.assertFalse( + tf_utils.is_symbolic_tensor( + tf.Variable(name="blah", initial_value=0.0) + ) + ) + self.assertFalse( + tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.0)) + ) + self.assertFalse( + tf_utils.is_symbolic_tensor( + tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=[1, 2], + dense_shape=[3, 4], + ) + ) + ) + else: + self.assertTrue( + tf_utils.is_symbolic_tensor( + tf.Variable(name="blah", initial_value=0.0) + ) + ) + self.assertTrue( + tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.0)) + ) + self.assertTrue( + tf_utils.is_symbolic_tensor( + tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=[1, 2], + dense_shape=[3, 4], + ) + ) + ) + + def test_works_with_registered(self): + class CustomClass: + def value(self): + return tf.convert_to_tensor(42.0) + + tf.register_tensor_conversion_function( + CustomClass, lambda value, **_: value.value() + ) + + tf_utils.register_symbolic_tensor_type(CustomClass) + + if tf.executing_eagerly(): + self.assertFalse( + tf_utils.is_symbolic_tensor( + tf.Variable(name="blah", initial_value=0.0) + ) + ) + self.assertFalse( + tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.0)) + ) + self.assertFalse( + tf_utils.is_symbolic_tensor( + tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=[1, 2], + dense_shape=[3, 4], + ) + ) + ) + self.assertFalse(tf_utils.is_symbolic_tensor(CustomClass())) + else: + self.assertTrue( + tf_utils.is_symbolic_tensor( + tf.Variable(name="blah", initial_value=0.0) + ) + ) + self.assertTrue( + tf_utils.is_symbolic_tensor(tf.convert_to_tensor(0.0)) + ) + self.assertTrue( + tf_utils.is_symbolic_tensor( + tf.SparseTensor( + indices=[[0, 0], [1, 2]], + values=[1, 2], + dense_shape=[3, 4], + ) + ) + ) + self.assertTrue(tf_utils.is_symbolic_tensor(CustomClass())) + + def test_enables_nontensor_plumbing(self): + if tf.executing_eagerly(): + self.skipTest("`compile` functionality changed.") + # Setup. + + class Foo: + def __init__(self, input_): + self._input = input_ + self.value = tf.convert_to_tensor([[42.0]]) + + @property + def dtype(self): + return self.value.dtype + + tf.register_tensor_conversion_function( + Foo, lambda x, *args, **kwargs: x.value + ) + tf_utils.register_symbolic_tensor_type(Foo) + + class PlumbingLayer(keras.layers.Lambda): + def __init__(self, fn, **kwargs): + def _fn(*fargs, **fkwargs): + d = fn(*fargs, **fkwargs) + x = tf.convert_to_tensor(d) + d.shape = x.shape + d.get_shape = x.get_shape + return d, x + + super().__init__(_fn, **kwargs) + self._enter_dunder_call = False + + def __call__(self, inputs, *args, **kwargs): + self._enter_dunder_call = True + d, _ = super().__call__(inputs, *args, **kwargs) + self._enter_dunder_call = False + return d + + def call(self, inputs, *args, **kwargs): + d, v = super().call(inputs, *args, **kwargs) + if self._enter_dunder_call: + return d, v + return d + + # User-land. + model = keras.Sequential( + [ + keras.layers.InputLayer((1,)), + PlumbingLayer(Foo), # Makes a `Foo` object. + ] + ) + # Let's ensure Keras graph history is preserved by composing the models. + model = keras.Model(model.inputs, model(model.outputs)) + # Now we instantiate the model and verify we have a `Foo` object, not a + # `Tensor`. + y = model(tf.convert_to_tensor([[7.0]])) + self.assertIsInstance(y, Foo) + # Confirm that (custom) loss sees `Foo` instance, not Tensor. 
+ obtained_prediction_box = [None] + + def custom_loss(y_obs, y_pred): + del y_obs + obtained_prediction_box[0] = y_pred + return y_pred + + # Apparently `compile` calls the loss function enough to trigger the + # side-effect. + model.compile("SGD", loss=custom_loss) + self.assertIsInstance(obtained_prediction_box[0], Foo) class ConvertInnerNodeDataTest(tf.test.TestCase): - - def test_convert_inner_node_data(self): - data = tf_utils.convert_inner_node_data((tf_utils.ListWrapper(['l', 2, 3]), - tf_utils.ListWrapper(['l', 5, 6]))) - self.assertEqual(data, (['l', 2, 3], ['l', 5, 6])) - - data = tf_utils.convert_inner_node_data(((['l', 2, 3], ['l', 5, 6])), - wrap=True) - self.assertTrue(all(isinstance(ele, tf_utils.ListWrapper) for ele in data)) + def test_convert_inner_node_data(self): + data = tf_utils.convert_inner_node_data( + ( + tf_utils.ListWrapper(["l", 2, 3]), + tf_utils.ListWrapper(["l", 5, 6]), + ) + ) + self.assertEqual(data, (["l", 2, 3], ["l", 5, 6])) + + data = tf_utils.convert_inner_node_data( + ((["l", 2, 3], ["l", 5, 6])), wrap=True + ) + self.assertTrue( + all(isinstance(ele, tf_utils.ListWrapper) for ele in data) + ) class AttrsTest(tf.test.TestCase): + def test_map_structure_with_atomic_accept_attr(self): + if attr is None: + self.skipTest("attr module is unavailable.") - def test_map_structure_with_atomic_accept_attr(self): - if attr is None: - self.skipTest('attr module is unavailable.') + @attr.s(frozen=True) + class Foo: - @attr.s(frozen=True) - class Foo: + bar = attr.ib() - bar = attr.ib() - - self.assertEqual( - Foo(2), - tf_utils.map_structure_with_atomic( - is_atomic_fn=lambda x: isinstance(x, int), - map_fn=lambda x: x + 1, - nested=Foo(1))) + self.assertEqual( + Foo(2), + tf_utils.map_structure_with_atomic( + is_atomic_fn=lambda x: isinstance(x, int), + map_fn=lambda x: x + 1, + nested=Foo(1), + ), + ) class TestIsRagged(tf.test.TestCase): + def test_is_ragged_return_true_for_ragged_tensor(self): + tensor = tf.RaggedTensor.from_row_splits( + values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8] + ) + self.assertTrue(tf_utils.is_ragged(tensor)) - def test_is_ragged_return_true_for_ragged_tensor(self): - tensor = tf.RaggedTensor.from_row_splits( - values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8]) - self.assertTrue(tf_utils.is_ragged(tensor)) - - def test_is_ragged_return_false_for_list(self): - tensor = [1., 2., 3.] - self.assertFalse(tf_utils.is_ragged(tensor)) + def test_is_ragged_return_false_for_list(self): + tensor = [1.0, 2.0, 3.0] + self.assertFalse(tf_utils.is_ragged(tensor)) class TestIsSparse(tf.test.TestCase): + def test_is_sparse_return_true_for_sparse_tensor(self): + tensor = tf.SparseTensor( + indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4] + ) + self.assertTrue(tf_utils.is_sparse(tensor)) - def test_is_sparse_return_true_for_sparse_tensor(self): - tensor = tf.SparseTensor( - indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) - self.assertTrue(tf_utils.is_sparse(tensor)) - - def test_is_sparse_return_true_for_sparse_tensor_value(self): - tensor = tf.compat.v1.SparseTensorValue( - indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) - self.assertTrue(tf_utils.is_sparse(tensor)) + def test_is_sparse_return_true_for_sparse_tensor_value(self): + tensor = tf.compat.v1.SparseTensorValue( + indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4] + ) + self.assertTrue(tf_utils.is_sparse(tensor)) - def test_is_sparse_return_false_for_list(self): - tensor = [1., 2., 3.] 
- self.assertFalse(tf_utils.is_sparse(tensor)) + def test_is_sparse_return_false_for_list(self): + tensor = [1.0, 2.0, 3.0] + self.assertFalse(tf_utils.is_sparse(tensor)) class TestIsExtensionType(tf.test.TestCase): + def test_is_extension_type_return_true_for_ragged_tensor(self): + self.assertTrue( + tf_utils.is_extension_type(tf.ragged.constant([[1, 2], [3]])) + ) - def test_is_extension_type_return_true_for_ragged_tensor(self): - self.assertTrue(tf_utils.is_extension_type( - tf.ragged.constant([[1, 2], [3]]))) + def test_is_extension_type_return_true_for_sparse_tensor(self): + self.assertTrue( + tf_utils.is_extension_type(tf.sparse.from_dense([[1, 2], [3, 4]])) + ) - def test_is_extension_type_return_true_for_sparse_tensor(self): - self.assertTrue(tf_utils.is_extension_type( - tf.sparse.from_dense([[1, 2], [3, 4]]))) + def test_is_extension_type_return_false_for_dense_tensor(self): + self.assertFalse( + tf_utils.is_extension_type(tf.constant([[1, 2], [3, 4]])) + ) - def test_is_extension_type_return_false_for_dense_tensor(self): - self.assertFalse(tf_utils.is_extension_type( - tf.constant([[1, 2], [3, 4]]))) - - def test_is_extension_type_return_false_for_list(self): - tensor = [1., 2., 3.] - self.assertFalse(tf_utils.is_extension_type(tensor)) + def test_is_extension_type_return_false_for_list(self): + tensor = [1.0, 2.0, 3.0] + self.assertFalse(tf_utils.is_extension_type(tensor)) class TestIsTensorOrExtensionType(tf.test.TestCase): - - def test_is_tensor_or_extension_type_return_true_for_ragged_tensor(self): - self.assertTrue(tf_utils.is_tensor_or_extension_type( - tf.ragged.constant([[1, 2], [3]]))) - - def test_is_tensor_or_extension_type_return_true_for_sparse_tensor(self): - self.assertTrue(tf_utils.is_tensor_or_extension_type( - tf.sparse.from_dense([[1, 2], [3, 4]]))) - - def test_is_tensor_or_extension_type_return_true_for_dense_tensor(self): - self.assertTrue(tf_utils.is_tensor_or_extension_type( - tf.constant([[1, 2], [3, 4]]))) - - def test_is_tensor_or_extension_type_return_true_for_custom_ext_types(self): - class DummyExtensionType(tf.experimental.ExtensionType): - ... - self.assertTrue(tf_utils.is_tensor_or_extension_type(DummyExtensionType())) - - def test_is_tensor_or_extension_type_return_false_for_list(self): - self.assertFalse(tf_utils.is_tensor_or_extension_type([1., 2., 3.])) + def test_is_tensor_or_extension_type_return_true_for_ragged_tensor(self): + self.assertTrue( + tf_utils.is_tensor_or_extension_type( + tf.ragged.constant([[1, 2], [3]]) + ) + ) + + def test_is_tensor_or_extension_type_return_true_for_sparse_tensor(self): + self.assertTrue( + tf_utils.is_tensor_or_extension_type( + tf.sparse.from_dense([[1, 2], [3, 4]]) + ) + ) + + def test_is_tensor_or_extension_type_return_true_for_dense_tensor(self): + self.assertTrue( + tf_utils.is_tensor_or_extension_type(tf.constant([[1, 2], [3, 4]])) + ) + + def test_is_tensor_or_extension_type_return_true_for_custom_ext_types(self): + class DummyExtensionType(tf.experimental.ExtensionType): + ... 
+ + self.assertTrue( + tf_utils.is_tensor_or_extension_type(DummyExtensionType()) + ) + + def test_is_tensor_or_extension_type_return_false_for_list(self): + self.assertFalse(tf_utils.is_tensor_or_extension_type([1.0, 2.0, 3.0])) + + +@test_combinations.generate(test_combinations.combine(mode=["eager"])) +class TestConvertVariablesToTensors(tf.test.TestCase): + def test_convert_variables_to_tensors(self): + x = tf.Variable([1.0]) + result = tf_utils.convert_variables_to_tensors(x) + self.assertIsInstance(result, tf.Tensor) + self.assertAllEqual(result, [1.0]) + + def test_convert_variables_in_list_to_tensors(self): + x = [tf.Variable([1.0]), tf.constant([2.0])] + result = tf_utils.convert_variables_to_tensors(x) + self.assertLen(result, 2) + self.assertIsInstance(result[0], tf.Tensor) + self.assertAllEqual(result[0], [1.0]) + self.assertIs(result[1], x[1]) + + def test_convert_variables_in_composite_tensor_to_tensors(self): + class Spec(tf.TypeSpec): + value_type = property(lambda self: CompositeVariable) + + def _serialize(self): + pass + + def _component_specs(self): + pass + + def _to_components(self, value): + return value.variables + + def _from_components(self, variable_list): + return CompositeVariable(variable_list) + + class CompositeVariable(tf.__internal__.CompositeTensor): + def __init__(self, variable_list): + self.variables = variable_list + + @property + def _type_spec(self): + return Spec() + + def _convert_variables_to_tensors(self): + self.variables = tf.nest.map_structure( + tf_utils.convert_variables_to_tensors, self.variables + ) + return self + + cv = CompositeVariable([tf.Variable([1.0])]) + self.assertIsInstance(cv.variables[0], tf.Variable) + result = tf_utils.convert_variables_to_tensors(cv) + self.assertLen(result.variables, 1) + self.assertIsInstance(result.variables[0], tf.Tensor) + self.assertAllEqual(result.variables[0], [1.0]) class TestRandomSeedSetting(tf.test.TestCase): - - def test_seeds(self): - if not tf.__internal__.tf2.enabled(): - self.skipTest('set_random_seed() is only expected to work in tf2.') - def get_model_output(): - model = keras.Sequential([ - keras.layers.Dense(10), - keras.layers.Dropout(0.5), - keras.layers.Dense(10), - ]) - x = np.random.random((32, 10)).astype('float32') - ds = tf.data.Dataset.from_tensor_slices(x).shuffle(32).batch(16) - return model.predict(ds) - - tf_utils.set_random_seed(42) - y1 = get_model_output() - tf_utils.set_random_seed(42) - y2 = get_model_output() - self.assertAllClose(y1, y2, atol=1e-6) + def test_seeds(self): + if not tf.__internal__.tf2.enabled(): + self.skipTest("set_random_seed() is only expected to work in tf2.") + + def get_model_output(): + model = keras.Sequential( + [ + keras.layers.Dense(10), + keras.layers.Dropout(0.5), + keras.layers.Dense(10), + ] + ) + x = np.random.random((32, 10)).astype("float32") + ds = tf.data.Dataset.from_tensor_slices(x).shuffle(32).batch(16) + return model.predict(ds) + + tf_utils.set_random_seed(42) + y1 = get_model_output() + tf_utils.set_random_seed(42) + y2 = get_model_output() + self.assertAllClose(y1, y2, atol=1e-6) class CustomTypeSpec(tf.TypeSpec): - """Stubbed-out custom type spec, for testing.""" + """Stubbed-out custom type spec, for testing.""" - def __init__(self, shape, dtype): - self.shape = tf.TensorShape(shape) - self.dtype = tf.dtypes.as_dtype(dtype) + def __init__(self, shape, dtype): + self.shape = tf.TensorShape(shape) + self.dtype = tf.dtypes.as_dtype(dtype) - def with_shape(self, new_shape): - return CustomTypeSpec(new_shape, self.dtype) + def 
with_shape(self, new_shape): + return CustomTypeSpec(new_shape, self.dtype) - # Stub implementations for all the TypeSpec methods: - value_type = None - _to_components = lambda self, value: None - _from_components = lambda self, components: None - _component_specs = property(lambda self: None) - _serialize = lambda self: (self.shape, self.dtype) + # Stub implementations for all the TypeSpec methods: + value_type = None + _to_components = lambda self, value: None + _from_components = lambda self, components: None + _component_specs = property(lambda self: None) + _serialize = lambda self: (self.shape, self.dtype) class TestGetTensorSpec(parameterized.TestCase): - - @parameterized.parameters([ - (lambda: tf.constant([[1, 2]]), [1, 2]), - (tf.TensorSpec([8, 3], tf.int32), [8, 3]), - (tf.TensorSpec([8], tf.int32), [8]), - (tf.TensorSpec([], tf.int32), []), - (tf.TensorSpec(None, tf.int32), None), - (tf.RaggedTensorSpec([8, 3], tf.int32), [8, 3]), - (tf.SparseTensorSpec([8, 3], tf.int32), [8, 3]), - ]) - def test_without_dynamic_batch(self, t, expected_shape): - if callable(t): - t = t() - result = tf_utils.get_tensor_spec(t) - self.assertTrue(result.is_compatible_with(t)) - if expected_shape is None: - self.assertIsNone(result.shape.rank) - else: - self.assertEqual(result.shape.as_list(), expected_shape) - - @parameterized.parameters([ - (lambda: tf.constant([[1, 2]]), [None, 2]), - (tf.TensorSpec([8, 3], tf.int32), [None, 3]), - (tf.TensorSpec([8], tf.int32), [None]), - (tf.TensorSpec([], tf.int32), []), - (tf.TensorSpec(None, tf.int32), None), - (tf.RaggedTensorSpec([8, 3], tf.int32), [None, 3]), - (tf.SparseTensorSpec([8, 3], tf.int32), [None, 3]), - ]) - def test_with_dynamic_batch(self, t, expected_shape): - if callable(t): - t = t() - result = tf_utils.get_tensor_spec(t, True) - self.assertTrue(result.is_compatible_with(t)) - if expected_shape is None: - self.assertIsNone(result.shape.rank) - else: - self.assertEqual(result.shape.as_list(), expected_shape) - - def test_with_keras_tensor_with_ragged_spec(self): - t = keras.engine.keras_tensor.KerasTensor( - tf.RaggedTensorSpec(shape=(None, None, 1))) - self.assertIsInstance(tf_utils.get_tensor_spec(t), tf.RaggedTensorSpec) + @parameterized.parameters( + [ + (lambda: tf.constant([[1, 2]]), [1, 2]), + (tf.TensorSpec([8, 3], tf.int32), [8, 3]), + (tf.TensorSpec([8], tf.int32), [8]), + (tf.TensorSpec([], tf.int32), []), + (tf.TensorSpec(None, tf.int32), None), + (tf.RaggedTensorSpec([8, 3], tf.int32), [8, 3]), + (tf.SparseTensorSpec([8, 3], tf.int32), [8, 3]), + ] + ) + def test_without_dynamic_batch(self, t, expected_shape): + if callable(t): + t = t() + result = tf_utils.get_tensor_spec(t) + self.assertTrue(result.is_compatible_with(t)) + if expected_shape is None: + self.assertIsNone(result.shape.rank) + else: + self.assertEqual(result.shape.as_list(), expected_shape) + + @parameterized.parameters( + [ + (lambda: tf.constant([[1, 2]]), [None, 2]), + (tf.TensorSpec([8, 3], tf.int32), [None, 3]), + (tf.TensorSpec([8], tf.int32), [None]), + (tf.TensorSpec([], tf.int32), []), + (tf.TensorSpec(None, tf.int32), None), + (tf.RaggedTensorSpec([8, 3], tf.int32), [None, 3]), + (tf.SparseTensorSpec([8, 3], tf.int32), [None, 3]), + ] + ) + def test_with_dynamic_batch(self, t, expected_shape): + if callable(t): + t = t() + result = tf_utils.get_tensor_spec(t, True) + self.assertTrue(result.is_compatible_with(t)) + if expected_shape is None: + self.assertIsNone(result.shape.rank) + else: + self.assertEqual(result.shape.as_list(), expected_shape) + + def 
test_with_keras_tensor_with_ragged_spec(self): + t = keras.engine.keras_tensor.KerasTensor( + tf.RaggedTensorSpec(shape=(None, None, 1)) + ) + self.assertIsInstance(tf_utils.get_tensor_spec(t), tf.RaggedTensorSpec) class TestSyncToNumpyOrPythonType(parameterized.TestCase): + @parameterized.parameters( + [ + (0.5,), + (b"string value",), + ] + ) + def test_types(self, value): + if not tf.executing_eagerly(): + self.skipTest("`sync_to_numpy_or_python_type` only works in eager") + tensor = tf.constant(value) + + self.assertEqual(tf_utils.sync_to_numpy_or_python_type(tensor), value) + + +class TestCanJitCompile(tf.test.TestCase): + def test_darwin_arm_xla(self): + with patch("platform.processor", MagicMock(return_value="arm")): + with patch("platform.system", MagicMock(return_value="Darwin")): + self.assertFalse(tf_utils.can_jit_compile()) - @parameterized.parameters([ - (0.5,), - (b'string value',), - ]) - def test_types(self, value): - if not tf.executing_eagerly(): - self.skipTest('`sync_to_numpy_or_python_type` only works in eager') - tensor = tf.constant(value) + def test_linux_xla(self): + with patch("platform.system", MagicMock(return_value="Linux")): + self.assertTrue(tf_utils.can_jit_compile()) - self.assertEqual(tf_utils.sync_to_numpy_or_python_type( - tensor), value) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/timed_threads.py b/keras/utils/timed_threads.py new file mode 100644 index 000000000000..794fd243c42b --- /dev/null +++ b/keras/utils/timed_threads.py @@ -0,0 +1,148 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Thread utilities.""" + +import abc +import threading + +from absl import logging +from tensorflow.python.util.tf_export import keras_export + + +@keras_export("keras.utils.TimedThread", v1=[]) +class TimedThread: + """Time-based interval Threads. + + Runs a timed thread every x seconds. It can be used to run a threaded + function alongside model training or any other snippet of code. + + Args: + interval: The interval, in seconds, to wait between calls to the + `on_interval` function. + **kwargs: additional args that are passed to `threading.Thread`. By + default, `Thread` is started as a `daemon` thread unless + overridden by the user in `kwargs`. + + Examples: + + ```python + class TimedLogIterations(keras.utils.TimedThread): + def __init__(self, model, interval): + self.model = model + super().__init__(interval) + + def on_interval(self): + # Logs Optimizer iterations every x seconds + try: + opt_iterations = self.model.optimizer.iterations.numpy() + print(f"Optimizer Iterations: {opt_iterations}") + except Exception as e: + print(str(e)) # To prevent thread from getting killed + + # `start` and `stop` the `TimedThread` manually. If the `on_interval` call + # requires access to `model` or other objects, override `__init__` method.
+ # Wrap it in a `try-except` to handle exceptions and `stop` the thread run. + timed_logs = TimedLogIterations(model=model, interval=5) + timed_logs.start() + try: + model.fit(...) + finally: + timed_logs.stop() + + # Alternatively, run the `TimedThread` in a context manager + with TimedLogIterations(model=model, interval=5): + model.fit(...) + + # If the timed thread instance needs access to callback events, + # subclass both `TimedThread` and `Callback`. Note that when calling + # `super`, it will have to be called for each parent class if both of them + # have the method that needs to be run. Also, note that `Callback` has + # access to `model` as an attribute and need not be explicitly provided. + class LogThreadCallback( + keras.utils.TimedThread, keras.callbacks.Callback + ): + def __init__(self, interval): + self._epoch = 0 + keras.utils.TimedThread.__init__(self, interval) + keras.callbacks.Callback.__init__(self) + + def on_interval(self): + if self._epoch: + opt_iter = self.model.optimizer.iterations.numpy() + logging.info(f"Epoch: {self._epoch}, Opt Iteration: {opt_iter}") + + def on_epoch_begin(self, epoch, logs=None): + self._epoch = epoch + + with LogThreadCallback(interval=5) as thread_callback: + # It's required to also pass `thread_callback` to the `callbacks` arg + # of `model.fit` to be triggered on callback events. + model.fit(..., callbacks=[thread_callback]) + ``` + """ + + def __init__(self, interval, **kwargs): + self.interval = interval + self.daemon = kwargs.pop("daemon", True) + self.thread_kwargs = kwargs + self.thread = None + self.thread_stop_event = None + + def _call_on_interval(self): + # Runs indefinitely once thread is started + while not self.thread_stop_event.is_set(): + self.on_interval() + self.thread_stop_event.wait(self.interval) + + def start(self): + """Creates and starts the thread run.""" + if self.thread and self.thread.is_alive(): + logging.warning("Thread is already running.") + return + self.thread = threading.Thread( + target=self._call_on_interval, + daemon=self.daemon, + **self.thread_kwargs + ) + self.thread_stop_event = threading.Event() + self.thread.start() + + def stop(self): + """Stops the thread run.""" + if self.thread_stop_event: + self.thread_stop_event.set() + + def is_alive(self): + """Returns True if thread is running. Otherwise returns False.""" + if self.thread: + return self.thread.is_alive() + return False + + def __enter__(self): + # Starts the thread in context manager + self.start() + return self + + def __exit__(self, *args, **kwargs): + # Stops the thread run. + self.stop() + + @abc.abstractmethod + def on_interval(self): + """User-defined behavior that is called in the thread.""" + raise NotImplementedError( + "Runs every x interval seconds. Needs to be " + "implemented in subclasses of `TimedThread`" + ) diff --git a/keras/utils/timed_threads_test.py b/keras/utils/timed_threads_test.py new file mode 100644 index 000000000000..011603feb268 --- /dev/null +++ b/keras/utils/timed_threads_test.py @@ -0,0 +1,119 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for timed_threads.""" + +import time + +import tensorflow.compat.v2 as tf +from absl import logging + +import keras +from keras.testing_infra import test_combinations +from keras.testing_infra import test_utils +from keras.utils import timed_threads + + +@test_utils.run_v2_only +class TimedThreadTest(test_combinations.TestCase): + def test_timed_thread_run(self): + class LogThread(timed_threads.TimedThread): + def on_interval(self): + logging.info("Thread Run") + + log_thread = LogThread(interval=0.1) + with self.assertLogs(level="INFO") as logs: + log_thread.start() + time.sleep(1) + self.assertTrue(log_thread.is_alive()) + log_thread.stop() + self.assertIn("INFO:absl:Thread Run", logs.output) + time.sleep(0.1) + self.assertFalse(log_thread.is_alive()) + + def test_timed_thread_restart(self): + # Verify that thread can be started and stopped multiple times. + class LogThread(timed_threads.TimedThread): + def on_interval(self): + logging.info("Thread Run") + + log_thread = LogThread(interval=0.1) + for _ in range(2): + self.assertFalse(log_thread.is_alive()) + with self.assertLogs(level="INFO") as logs: + log_thread.start() + time.sleep(1) + self.assertTrue(log_thread.is_alive()) + log_thread.stop() + self.assertIn("INFO:absl:Thread Run", logs.output) + time.sleep(0.1) + self.assertFalse(log_thread.is_alive()) + + def test_timed_thread_running_warning(self): + # Verify thread start warning if it's already running + class LogThread(timed_threads.TimedThread): + def on_interval(self): + logging.info("Thread Run") + + log_thread = LogThread(interval=0.1) + self.assertFalse(log_thread.is_alive()) + with self.assertLogs(level="INFO") as logs: + log_thread.start() + time.sleep(1) + self.assertTrue(log_thread.is_alive()) + self.assertIn("INFO:absl:Thread Run", logs.output) + with self.assertLogs(level="WARNING") as logs: + log_thread.start() + self.assertIn( + "WARNING:absl:Thread is already running.", logs.output + ) + self.assertTrue(log_thread.is_alive()) + log_thread.stop() + time.sleep(0.1) + self.assertFalse(log_thread.is_alive()) + + def test_timed_thread_callback_model_fit(self): + class LogThreadCallback( + timed_threads.TimedThread, keras.callbacks.Callback + ): + def __init__(self, interval): + self._epoch = 0 + timed_threads.TimedThread.__init__(self, interval=interval) + keras.callbacks.Callback.__init__(self) + + def on_interval(self): + if self._epoch: + # Verify that `model` is accessible.
+ _ = self.model.optimizer.iterations.numpy() + logging.info(f"Thread Run Epoch: {self._epoch}") + + def on_epoch_begin(self, epoch, logs=None): + self._epoch = epoch + time.sleep(1) + + x = tf.random.normal((32, 2)) + y = tf.ones((32, 1), dtype=tf.float32) + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile(loss="mse") + with self.assertLogs(level="INFO") as logs, LogThreadCallback( + interval=0.1 + ) as log_thread_callback: + self.assertIsNone(log_thread_callback.model) + model.fit(x, y, epochs=2, callbacks=[log_thread_callback]) + self.assertIsNotNone(log_thread_callback.model) + self.assertIn("INFO:absl:Thread Run Epoch: 1", logs.output) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/timeseries_dataset.py b/keras/utils/timeseries_dataset.py index 92fa2eb08588..c81dc18ef32c 100644 --- a/keras/utils/timeseries_dataset.py +++ b/keras/utils/timeseries_dataset.py @@ -14,16 +14,18 @@ # ============================================================================== """Keras timeseries dataset utilities.""" +import numpy as np import tensorflow.compat.v2 as tf -# pylint: disable=g-classes-have-attributes -import numpy as np +# isort: off from tensorflow.python.util.tf_export import keras_export -@keras_export('keras.utils.timeseries_dataset_from_array', - 'keras.preprocessing.timeseries_dataset_from_array', - v1=[]) +@keras_export( + "keras.utils.timeseries_dataset_from_array", + "keras.preprocessing.timeseries_dataset_from_array", + v1=[], +) def timeseries_dataset_from_array( data, targets, @@ -34,212 +36,242 @@ def timeseries_dataset_from_array( shuffle=False, seed=None, start_index=None, - end_index=None): - """Creates a dataset of sliding windows over a timeseries provided as array. - - This function takes in a sequence of data-points gathered at - equal intervals, along with time series parameters such as - length of the sequences/windows, spacing between two sequence/windows, etc., - to produce batches of timeseries inputs and targets. - - Args: - data: Numpy array or eager tensor - containing consecutive data points (timesteps). - Axis 0 is expected to be the time dimension. - targets: Targets corresponding to timesteps in `data`. - `targets[i]` should be the target - corresponding to the window that starts at index `i` - (see example 2 below). - Pass None if you don't have target data (in this case the dataset will - only yield the input data). - sequence_length: Length of the output sequences (in number of timesteps). - sequence_stride: Period between successive output sequences. - For stride `s`, output samples would - start at index `data[i]`, `data[i + s]`, `data[i + 2 * s]`, etc. - sampling_rate: Period between successive individual timesteps - within sequences. For rate `r`, timesteps - `data[i], data[i + r], ... data[i + sequence_length]` - are used for creating a sample sequence. - batch_size: Number of timeseries samples in each batch - (except maybe the last one). If `None`, the data will not be batched - (the dataset will yield individual samples). - shuffle: Whether to shuffle output samples, - or instead draw them in chronological order. - seed: Optional int; random seed for shuffling. - start_index: Optional int; data points earlier (exclusive) - than `start_index` will not be used - in the output sequences. This is useful to reserve part of the - data for test or validation. - end_index: Optional int; data points later (exclusive) than `end_index` - will not be used in the output sequences. 
- This is useful to reserve part of the data for test or validation. - - Returns: - A tf.data.Dataset instance. If `targets` was passed, the dataset yields + end_index=None, +): + """Creates a dataset of sliding windows over a timeseries provided as array. + + This function takes in a sequence of data-points gathered at + equal intervals, along with time series parameters such as + length of the sequences/windows, spacing between two sequence/windows, etc., + to produce batches of timeseries inputs and targets. + + Args: + data: Numpy array or eager tensor + containing consecutive data points (timesteps). + Axis 0 is expected to be the time dimension. + targets: Targets corresponding to timesteps in `data`. + `targets[i]` should be the target + corresponding to the window that starts at index `i` + (see example 2 below). + Pass `None` if you don't have target data (in this case the dataset + will only yield the input data). + sequence_length: Length of the output sequences + (in number of timesteps). + sequence_stride: Period between successive output sequences. + For stride `s`, output samples would + start at index `data[i]`, `data[i + s]`, `data[i + 2 * s]`, etc. + sampling_rate: Period between successive individual timesteps + within sequences. For rate `r`, timesteps + `data[i], data[i + r], ... data[i + sequence_length]` + are used for creating a sample sequence. + batch_size: Number of timeseries samples in each batch + (except maybe the last one). If `None`, the data will not be batched + (the dataset will yield individual samples). + shuffle: Whether to shuffle output samples, + or instead draw them in chronological order. + seed: Optional int; random seed for shuffling. + start_index: Optional int; data points earlier (exclusive) + than `start_index` will not be used + in the output sequences. This is useful to reserve part of the + data for test or validation. + end_index: Optional int; data points later (exclusive) than `end_index` + will not be used in the output sequences. + This is useful to reserve part of the data for test or validation. + + Returns: + + A `tf.data.Dataset` instance. If `targets` was passed, the dataset yields tuple `(batch_of_sequences, batch_of_targets)`. If not, the dataset yields only `batch_of_sequences`. - Example 1: - - Consider indices `[0, 1, ... 99]`. - With `sequence_length=10, sampling_rate=2, sequence_stride=3`, - `shuffle=False`, the dataset will yield batches of sequences - composed of the following indices: - - ``` - First sequence: [0 2 4 6 8 10 12 14 16 18] - Second sequence: [3 5 7 9 11 13 15 17 19 21] - Third sequence: [6 8 10 12 14 16 18 20 22 24] - ... - Last sequence: [78 80 82 84 86 88 90 92 94 96] - ``` - - In this case the last 3 data points are discarded since no full sequence - can be generated to include them (the next sequence would have started - at index 81, and thus its last step would have gone over 99). - - Example 2: Temporal regression. - - Consider an array `data` of scalar values, of shape `(steps,)`. 
- To generate a dataset that uses the past 10 - timesteps to predict the next timestep, you would use: - - ```python - input_data = data[:-10] - targets = data[10:] - dataset = tf.keras.preprocessing.timeseries_dataset_from_array( - input_data, targets, sequence_length=10) - for batch in dataset: - inputs, targets = batch - assert np.array_equal(inputs[0], data[:10]) # First sequence: steps [0-9] - assert np.array_equal(targets[0], data[10]) # Corresponding target: step 10 - break - ``` - - Example 3: Temporal regression for many-to-many architectures. - - Consider two arrays of scalar values `X` and `Y`, - both of shape `(100,)`. The resulting dataset should consist samples with - 20 timestamps each. The samples should not overlap. - To generate a dataset that uses the current timestamp - to predict the corresponding target timestep, you would use: - - ```python - X = np.arange(100) - Y = X*2 - - sample_length = 20 - input_dataset = tf.keras.preprocessing.timeseries_dataset_from_array( - X, None, sequence_length=sample_length, sequence_stride=sample_length) - target_dataset = tf.keras.preprocessing.timeseries_dataset_from_array( - Y, None, sequence_length=sample_length, sequence_stride=sample_length) - - for batch in zip(input_dataset, target_dataset): - inputs, targets = batch - assert np.array_equal(inputs[0], X[:sample_length]) - - # second sample equals output timestamps 20-40 - assert np.array_equal(targets[1], Y[sample_length:2*sample_length]) - break - ``` - """ - if start_index: - if start_index < 0: - raise ValueError(f'`start_index` must be 0 or greater. Received: ' - f'start_index={start_index}') - if start_index >= len(data): - raise ValueError(f'`start_index` must be lower than the length of the ' - f'data. Received: start_index={start_index}, for data ' - f'of length {len(data)}') - if end_index: - if start_index and end_index <= start_index: - raise ValueError(f'`end_index` must be higher than `start_index`. ' - f'Received: start_index={start_index}, and ' - f'end_index={end_index} ') - if end_index >= len(data): - raise ValueError(f'`end_index` must be lower than the length of the ' - f'data. Received: end_index={end_index}, for data of ' - f'length {len(data)}') - if end_index <= 0: - raise ValueError('`end_index` must be higher than 0. ' - f'Received: end_index={end_index}') - - # Validate strides - if sampling_rate <= 0: - raise ValueError(f'`sampling_rate` must be higher than 0. Received: ' - f'sampling_rate={sampling_rate}') - if sampling_rate >= len(data): - raise ValueError(f'`sampling_rate` must be lower than the length of the ' - f'data. Received: sampling_rate={sampling_rate}, for data ' - f'of length {len(data)}') - if sequence_stride <= 0: - raise ValueError(f'`sequence_stride` must be higher than 0. Received: ' - f'sequence_stride={sequence_stride}') - if sequence_stride >= len(data): - raise ValueError(f'`sequence_stride` must be lower than the length of the ' - f'data. Received: sequence_stride={sequence_stride}, for ' - f'data of length {len(data)}') - - if start_index is None: - start_index = 0 - if end_index is None: - end_index = len(data) - - # Determine the lowest dtype to store start positions (to lower memory usage). 
- num_seqs = end_index - start_index - (sequence_length * sampling_rate) + 1 - if targets is not None: - num_seqs = min(num_seqs, len(targets)) - if num_seqs < 2147483647: - index_dtype = 'int32' - else: - index_dtype = 'int64' - - # Generate start positions - start_positions = np.arange(0, num_seqs, sequence_stride, dtype=index_dtype) - if shuffle: - if seed is None: - seed = np.random.randint(1e6) - rng = np.random.RandomState(seed) - rng.shuffle(start_positions) - - sequence_length = tf.cast(sequence_length, dtype=index_dtype) - sampling_rate = tf.cast(sampling_rate, dtype=index_dtype) - - positions_ds = tf.data.Dataset.from_tensors(start_positions).repeat() - - # For each initial window position, generates indices of the window elements - indices = tf.data.Dataset.zip( - (tf.data.Dataset.range(len(start_positions)), positions_ds)).map( - lambda i, positions: tf.range( # pylint: disable=g-long-lambda - positions[i], - positions[i] + sequence_length * sampling_rate, - sampling_rate), - num_parallel_calls=tf.data.AUTOTUNE) - - dataset = sequences_from_indices(data, indices, start_index, end_index) - if targets is not None: + Example 1: + + Consider indices `[0, 1, ... 98]`. + With `sequence_length=10, sampling_rate=2, sequence_stride=3`, + `shuffle=False`, the dataset will yield batches of sequences + composed of the following indices: + + ``` + First sequence: [0 2 4 6 8 10 12 14 16 18] + Second sequence: [3 5 7 9 11 13 15 17 19 21] + Third sequence: [6 8 10 12 14 16 18 20 22 24] + ... + Last sequence: [78 80 82 84 86 88 90 92 94 96] + ``` + + In this case the last 2 data points are discarded since no full sequence + can be generated to include them (the next sequence would have started + at index 81, and thus its last step would have gone over 98). + + Example 2: Temporal regression. + + Consider an array `data` of scalar values, of shape `(steps,)`. + To generate a dataset that uses the past 10 + timesteps to predict the next timestep, you would use: + + ```python + input_data = data[:-10] + targets = data[10:] + dataset = tf.keras.utils.timeseries_dataset_from_array( + input_data, targets, sequence_length=10) + for batch in dataset: + inputs, targets = batch + assert np.array_equal(inputs[0], data[:10]) # First sequence: steps [0-9] + # Corresponding target: step 10 + assert np.array_equal(targets[0], data[10]) + break + ``` + + Example 3: Temporal regression for many-to-many architectures. + + Consider two arrays of scalar values `X` and `Y`, + both of shape `(100,)`. The resulting dataset should consist samples with + 20 timestamps each. The samples should not overlap. + To generate a dataset that uses the current timestamp + to predict the corresponding target timestep, you would use: + + ```python + X = np.arange(100) + Y = X*2 + + sample_length = 20 + input_dataset = tf.keras.utils.timeseries_dataset_from_array( + X, None, sequence_length=sample_length, sequence_stride=sample_length) + target_dataset = tf.keras.utils.timeseries_dataset_from_array( + Y, None, sequence_length=sample_length, sequence_stride=sample_length) + + for batch in zip(input_dataset, target_dataset): + inputs, targets = batch + assert np.array_equal(inputs[0], X[:sample_length]) + + # second sample equals output timestamps 20-40 + assert np.array_equal(targets[1], Y[sample_length:2*sample_length]) + break + ``` + """ + if start_index: + if start_index < 0: + raise ValueError( + "`start_index` must be 0 or greater. 
Received: " + f"start_index={start_index}" + ) + if start_index >= len(data): + raise ValueError( + "`start_index` must be lower than the length of the " + f"data. Received: start_index={start_index}, for data " + f"of length {len(data)}" + ) + if end_index: + if start_index and end_index <= start_index: + raise ValueError( + "`end_index` must be higher than `start_index`. " + f"Received: start_index={start_index}, and " + f"end_index={end_index} " + ) + if end_index >= len(data): + raise ValueError( + "`end_index` must be lower than the length of the " + f"data. Received: end_index={end_index}, for data of " + f"length {len(data)}" + ) + if end_index <= 0: + raise ValueError( + "`end_index` must be higher than 0. " + f"Received: end_index={end_index}" + ) + + # Validate strides + if sampling_rate <= 0: + raise ValueError( + "`sampling_rate` must be higher than 0. Received: " + f"sampling_rate={sampling_rate}" + ) + if sampling_rate >= len(data): + raise ValueError( + "`sampling_rate` must be lower than the length of the " + f"data. Received: sampling_rate={sampling_rate}, for data " + f"of length {len(data)}" + ) + if sequence_stride <= 0: + raise ValueError( + "`sequence_stride` must be higher than 0. Received: " + f"sequence_stride={sequence_stride}" + ) + if sequence_stride >= len(data): + raise ValueError( + "`sequence_stride` must be lower than the length of the " + f"data. Received: sequence_stride={sequence_stride}, for " + f"data of length {len(data)}" + ) + + if start_index is None: + start_index = 0 + if end_index is None: + end_index = len(data) + + # Determine the lowest dtype to store start positions (to lower memory + # usage). + num_seqs = end_index - start_index - (sequence_length - 1) * sampling_rate + if targets is not None: + num_seqs = min(num_seqs, len(targets)) + if num_seqs < 2147483647: + index_dtype = "int32" + else: + index_dtype = "int64" + + # Generate start positions + start_positions = np.arange(0, num_seqs, sequence_stride, dtype=index_dtype) + if shuffle: + if seed is None: + seed = np.random.randint(1e6) + rng = np.random.RandomState(seed) + rng.shuffle(start_positions) + + sequence_length = tf.cast(sequence_length, dtype=index_dtype) + sampling_rate = tf.cast(sampling_rate, dtype=index_dtype) + + positions_ds = tf.data.Dataset.from_tensors(start_positions).repeat() + + # For each initial window position, generates indices of the window elements indices = tf.data.Dataset.zip( - (tf.data.Dataset.range(len(start_positions)), positions_ds)).map( + (tf.data.Dataset.range(len(start_positions)), positions_ds) + ).map( + lambda i, positions: tf.range( + positions[i], + positions[i] + sequence_length * sampling_rate, + sampling_rate, + ), + num_parallel_calls=tf.data.AUTOTUNE, + ) + + dataset = sequences_from_indices(data, indices, start_index, end_index) + if targets is not None: + indices = tf.data.Dataset.zip( + (tf.data.Dataset.range(len(start_positions)), positions_ds) + ).map( lambda i, positions: positions[i], - num_parallel_calls=tf.data.AUTOTUNE) - target_ds = sequences_from_indices( - targets, indices, start_index, end_index) - dataset = tf.data.Dataset.zip((dataset, target_ds)) - dataset = dataset.prefetch(tf.data.AUTOTUNE) - if batch_size is not None: - if shuffle: - # Shuffle locally at each iteration - dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) - dataset = dataset.batch(batch_size) - else: - if shuffle: - dataset = dataset.shuffle(buffer_size=1024, seed=seed) - return dataset + num_parallel_calls=tf.data.AUTOTUNE, + ) + 
target_ds = sequences_from_indices( + targets, indices, start_index, end_index + ) + dataset = tf.data.Dataset.zip((dataset, target_ds)) + dataset = dataset.prefetch(tf.data.AUTOTUNE) + if batch_size is not None: + if shuffle: + # Shuffle locally at each iteration + dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) + dataset = dataset.batch(batch_size) + else: + if shuffle: + dataset = dataset.shuffle(buffer_size=1024, seed=seed) + return dataset def sequences_from_indices(array, indices_ds, start_index, end_index): - dataset = tf.data.Dataset.from_tensors(array[start_index : end_index]) - dataset = tf.data.Dataset.zip((dataset.repeat(), indices_ds)).map( - lambda steps, inds: tf.gather(steps, inds), # pylint: disable=unnecessary-lambda - num_parallel_calls=tf.data.AUTOTUNE) - return dataset + dataset = tf.data.Dataset.from_tensors(array[start_index:end_index]) + dataset = tf.data.Dataset.zip((dataset.repeat(), indices_ds)).map( + lambda steps, inds: tf.gather(steps, inds), + num_parallel_calls=tf.data.AUTOTUNE, + ) + return dataset diff --git a/keras/utils/timeseries_dataset_test.py b/keras/utils/timeseries_dataset_test.py index cda8db59c343..77f6acd33d3a 100644 --- a/keras/utils/timeseries_dataset_test.py +++ b/keras/utils/timeseries_dataset_test.py @@ -14,175 +14,213 @@ # ============================================================================== """Tests for timeseries_dataset.""" +import numpy as np import tensorflow.compat.v2 as tf -import numpy as np from keras.testing_infra import test_utils from keras.utils import timeseries_dataset @test_utils.run_v2_only class TimeseriesDatasetTest(tf.test.TestCase): + def test_basics(self): + # Test ordering, targets, sequence length, batch size + data = np.arange(100) + targets = data * 2 + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, targets, sequence_length=9, batch_size=5 + ) + # Expect 19 batches + for i, batch in enumerate(dataset): + self.assertLen(batch, 2) + inputs, targets = batch + if i < 18: + self.assertEqual(inputs.shape, (5, 9)) + if i == 18: + # Last batch: size 2 + self.assertEqual(inputs.shape, (2, 9)) + # Check target values + self.assertAllClose(targets, inputs[:, 0] * 2) + for j in range(min(5, len(inputs))): + # Check each sample in the batch + self.assertAllClose( + inputs[j], np.arange(i * 5 + j, i * 5 + j + 9) + ) - def test_basics(self): - # Test ordering, targets, sequence length, batch size - data = np.arange(100) - targets = data * 2 - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, targets, sequence_length=9, batch_size=5) - # Expect 19 batches - for i, batch in enumerate(dataset): - self.assertLen(batch, 2) - inputs, targets = batch - if i < 18: - self.assertEqual(inputs.shape, (5, 9)) - if i == 18: - # Last batch: size 2 - self.assertEqual(inputs.shape, (2, 9)) - # Check target values - self.assertAllClose(targets, inputs[:, 0] * 2) - for j in range(min(5, len(inputs))): - # Check each sample in the batch - self.assertAllClose(inputs[j], np.arange(i * 5 + j, i * 5 + j + 9)) + def test_timeseries_regression(self): + # Test simple timeseries regression use case + data = np.arange(10) + offset = 3 + targets = data[offset:] + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, targets, sequence_length=offset, batch_size=1 + ) + i = 0 + for batch in dataset: + self.assertLen(batch, 2) + inputs, targets = batch + self.assertEqual(inputs.shape, (1, 3)) + # Check values + self.assertAllClose(targets[0], data[offset + i]) + 
self.assertAllClose(inputs[0], data[i : i + offset]) + i += 1 + self.assertEqual(i, 7) # Expect 7 batches - def test_timeseries_regression(self): - # Test simple timeseries regression use case - data = np.arange(10) - offset = 3 - targets = data[offset:] - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, targets, sequence_length=offset, batch_size=1) - i = 0 - for batch in dataset: - self.assertLen(batch, 2) - inputs, targets = batch - self.assertEqual(inputs.shape, (1, 3)) - # Check values - self.assertAllClose(targets[0], data[offset + i]) - self.assertAllClose(inputs[0], data[i : i + offset]) - i += 1 - self.assertEqual(i, 7) # Expect 7 batches + def test_no_targets(self): + data = np.arange(50) + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, None, sequence_length=10, batch_size=5 + ) + # Expect 9 batches + i = None + for i, batch in enumerate(dataset): + if i < 8: + self.assertEqual(batch.shape, (5, 10)) + elif i == 8: + self.assertEqual(batch.shape, (1, 10)) + for j in range(min(5, len(batch))): + # Check each sample in the batch + self.assertAllClose( + batch[j], np.arange(i * 5 + j, i * 5 + j + 10) + ) + self.assertEqual(i, 8) - def test_no_targets(self): - data = np.arange(50) - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, None, sequence_length=10, batch_size=5) - # Expect 9 batches - i = None - for i, batch in enumerate(dataset): - if i < 8: - self.assertEqual(batch.shape, (5, 10)) - elif i == 8: - self.assertEqual(batch.shape, (1, 10)) - for j in range(min(5, len(batch))): - # Check each sample in the batch - self.assertAllClose(batch[j], np.arange(i * 5 + j, i * 5 + j + 10)) - self.assertEqual(i, 8) + def test_shuffle(self): + # Test cross-epoch random order and seed determinism + data = np.arange(10) + targets = data * 2 + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, + targets, + sequence_length=5, + batch_size=1, + shuffle=True, + seed=123, + ) + first_seq = None + for x, y in dataset.take(1): + self.assertNotAllClose(x, np.arange(0, 5)) + self.assertAllClose(x[:, 0] * 2, y) + first_seq = x + # Check that a new iteration with the same dataset yields different + # results + for x, _ in dataset.take(1): + self.assertNotAllClose(x, first_seq) + # Check determinism with same seed + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, + targets, + sequence_length=5, + batch_size=1, + shuffle=True, + seed=123, + ) + for x, _ in dataset.take(1): + self.assertAllClose(x, first_seq) - def test_shuffle(self): - # Test cross-epoch random order and seed determinism - data = np.arange(10) - targets = data * 2 - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, targets, sequence_length=5, batch_size=1, shuffle=True, seed=123) - first_seq = None - for x, y in dataset.take(1): - self.assertNotAllClose(x, np.arange(0, 5)) - self.assertAllClose(x[:, 0] * 2, y) - first_seq = x - # Check that a new iteration with the same dataset yields different results - for x, _ in dataset.take(1): - self.assertNotAllClose(x, first_seq) - # Check determism with same seed - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, targets, sequence_length=5, batch_size=1, shuffle=True, seed=123) - for x, _ in dataset.take(1): - self.assertAllClose(x, first_seq) + def test_sampling_rate(self): + data = np.arange(100) + targets = data * 2 + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, targets, sequence_length=9, batch_size=5, sampling_rate=2 + ) + for i, batch in
enumerate(dataset): + self.assertLen(batch, 2) + inputs, targets = batch + if i < 16: + self.assertEqual(inputs.shape, (5, 9)) + if i == 16: + # Last batch: size 4 + self.assertEqual(inputs.shape, (4, 9)) + # Check target values + self.assertAllClose(inputs[:, 0] * 2, targets) + for j in range(min(5, len(inputs))): + # Check each sample in the batch + start_index = i * 5 + j + end_index = start_index + 9 * 2 + self.assertAllClose( + inputs[j], np.arange(start_index, end_index, 2) + ) - def test_sampling_rate(self): - data = np.arange(100) - targets = data * 2 - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, targets, sequence_length=9, batch_size=5, sampling_rate=2) - for i, batch in enumerate(dataset): - self.assertLen(batch, 2) - inputs, targets = batch - if i < 16: - self.assertEqual(inputs.shape, (5, 9)) - if i == 16: - # Last batch: size 3 - self.assertEqual(inputs.shape, (3, 9)) - # Check target values - self.assertAllClose(inputs[:, 0] * 2, targets) - for j in range(min(5, len(inputs))): - # Check each sample in the batch - start_index = i * 5 + j - end_index = start_index + 9 * 2 - self.assertAllClose(inputs[j], np.arange(start_index, end_index, 2)) + def test_sequence_stride(self): + data = np.arange(100) + targets = data * 2 + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, targets, sequence_length=9, batch_size=5, sequence_stride=3 + ) + for i, batch in enumerate(dataset): + self.assertLen(batch, 2) + inputs, targets = batch + if i < 6: + self.assertEqual(inputs.shape, (5, 9)) + if i == 6: + # Last batch: size 1 + self.assertEqual(inputs.shape, (1, 9)) + # Check target values + self.assertAllClose(inputs[:, 0] * 2, targets) + for j in range(min(5, len(inputs))): + # Check each sample in the batch + start_index = i * 5 * 3 + j * 3 + end_index = start_index + 9 + self.assertAllClose( + inputs[j], np.arange(start_index, end_index) + ) - def test_sequence_stride(self): - data = np.arange(100) - targets = data * 2 - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, targets, sequence_length=9, batch_size=5, sequence_stride=3) - for i, batch in enumerate(dataset): - self.assertLen(batch, 2) - inputs, targets = batch - if i < 6: - self.assertEqual(inputs.shape, (5, 9)) - if i == 6: - # Last batch: size 1 - self.assertEqual(inputs.shape, (1, 9)) - # Check target values - self.assertAllClose(inputs[:, 0] * 2, targets) - for j in range(min(5, len(inputs))): - # Check each sample in the batch - start_index = i * 5 * 3 + j * 3 - end_index = start_index + 9 - self.assertAllClose(inputs[j], - np.arange(start_index, end_index)) + def test_start_and_end_index(self): + data = np.arange(100) + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, + None, + sequence_length=9, + batch_size=5, + sequence_stride=3, + sampling_rate=2, + start_index=10, + end_index=90, + ) + for batch in dataset: + self.assertAllLess(batch[0], 90) + self.assertAllGreater(batch[0], 9) - def test_start_and_end_index(self): - data = np.arange(100) - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, None, - sequence_length=9, batch_size=5, sequence_stride=3, sampling_rate=2, - start_index=10, end_index=90) - for batch in dataset: - self.assertAllLess(batch[0], 90) - self.assertAllGreater(batch[0], 9) + def test_errors(self): + # bad start index + with self.assertRaisesRegex(ValueError, "`start_index` must be "): + _ = timeseries_dataset.timeseries_dataset_from_array( + np.arange(10), None, 3, start_index=-1 + ) + with 
self.assertRaisesRegex(ValueError, "`start_index` must be "): + _ = timeseries_dataset.timeseries_dataset_from_array( + np.arange(10), None, 3, start_index=11 + ) + # bad end index + with self.assertRaisesRegex(ValueError, "`end_index` must be "): + _ = timeseries_dataset.timeseries_dataset_from_array( + np.arange(10), None, 3, end_index=-1 + ) + with self.assertRaisesRegex(ValueError, "`end_index` must be "): + _ = timeseries_dataset.timeseries_dataset_from_array( + np.arange(10), None, 3, end_index=11 + ) + # bad sampling_rate + with self.assertRaisesRegex(ValueError, "`sampling_rate` must be "): + _ = timeseries_dataset.timeseries_dataset_from_array( + np.arange(10), None, 3, sampling_rate=0 + ) + # bad sequence stride + with self.assertRaisesRegex(ValueError, "`sequence_stride` must be "): + _ = timeseries_dataset.timeseries_dataset_from_array( + np.arange(10), None, 3, sequence_stride=0 + ) - def test_errors(self): - # bad start index - with self.assertRaisesRegex(ValueError, '`start_index` must be '): - _ = timeseries_dataset.timeseries_dataset_from_array( - np.arange(10), None, 3, start_index=-1) - with self.assertRaisesRegex(ValueError, '`start_index` must be '): - _ = timeseries_dataset.timeseries_dataset_from_array( - np.arange(10), None, 3, start_index=11) - # bad end index - with self.assertRaisesRegex(ValueError, '`end_index` must be '): - _ = timeseries_dataset.timeseries_dataset_from_array( - np.arange(10), None, 3, end_index=-1) - with self.assertRaisesRegex(ValueError, '`end_index` must be '): - _ = timeseries_dataset.timeseries_dataset_from_array( - np.arange(10), None, 3, end_index=11) - # bad sampling_rate - with self.assertRaisesRegex(ValueError, '`sampling_rate` must be '): - _ = timeseries_dataset.timeseries_dataset_from_array( - np.arange(10), None, 3, sampling_rate=0) - # bad sequence stride - with self.assertRaisesRegex(ValueError, '`sequence_stride` must be '): - _ = timeseries_dataset.timeseries_dataset_from_array( - np.arange(10), None, 3, sequence_stride=0) + def test_not_batched(self): + data = np.arange(100) - def test_not_batched(self): - data = np.arange(100) + dataset = timeseries_dataset.timeseries_dataset_from_array( + data, None, sequence_length=9, batch_size=None, shuffle=True + ) + sample = next(iter(dataset)) + self.assertEqual(len(sample.shape), 1) - dataset = timeseries_dataset.timeseries_dataset_from_array( - data, None, sequence_length=9, batch_size=None, shuffle=True) - sample = next(iter(dataset)) - self.assertEqual(len(sample.shape), 1) -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/traceback_utils.py b/keras/utils/traceback_utils.py index 31bc1e5f12a3..6cbc804319e7 100644 --- a/keras/utils/traceback_utils.py +++ b/keras/utils/traceback_utils.py @@ -19,142 +19,150 @@ import sys import traceback import types -import tensorflow.compat.v2 as tf +import tensorflow.compat.v2 as tf _EXCLUDED_PATHS = ( - os.path.abspath(os.path.join(__file__, '..', '..')), - os.path.join('tensorflow', 'python'), + os.path.abspath(os.path.join(__file__, "..", "..")), + os.path.join("tensorflow", "python"), ) def include_frame(fname): - for exclusion in _EXCLUDED_PATHS: - if exclusion in fname: - return False - return True + for exclusion in _EXCLUDED_PATHS: + if exclusion in fname: + return False + return True def _process_traceback_frames(tb): - """Iterate through traceback frames and return a new, filtered traceback.""" - last_tb = None - tb_list = list(traceback.walk_tb(tb)) - for f, line_no in 
reversed(tb_list): - if include_frame(f.f_code.co_filename): - last_tb = types.TracebackType(last_tb, f, f.f_lasti, line_no) - if last_tb is None and tb_list: - # If no frames were kept during filtering, create a new traceback - # from the outermost function. - f, line_no = tb_list[-1] - last_tb = types.TracebackType(last_tb, f, f.f_lasti, line_no) - return last_tb + """Iterate through traceback frames and return a new, filtered traceback.""" + last_tb = None + tb_list = list(traceback.walk_tb(tb)) + for f, line_no in reversed(tb_list): + if include_frame(f.f_code.co_filename): + last_tb = types.TracebackType(last_tb, f, f.f_lasti, line_no) + if last_tb is None and tb_list: + # If no frames were kept during filtering, create a new traceback + # from the outermost function. + f, line_no = tb_list[-1] + last_tb = types.TracebackType(last_tb, f, f.f_lasti, line_no) + return last_tb def filter_traceback(fn): - """Filter out Keras-internal stack trace frames in exceptions raised by fn.""" - if sys.version_info.major != 3 or sys.version_info.minor < 7: - return fn + """Filter out Keras-internal stack trace frames in exceptions raised by + fn.""" + if sys.version_info.major != 3 or sys.version_info.minor < 7: + return fn - def error_handler(*args, **kwargs): - if not tf.debugging.is_traceback_filtering_enabled(): - return fn(*args, **kwargs) + def error_handler(*args, **kwargs): + if not tf.debugging.is_traceback_filtering_enabled(): + return fn(*args, **kwargs) - filtered_tb = None - try: - return fn(*args, **kwargs) - except Exception as e: # pylint: disable=broad-except - filtered_tb = _process_traceback_frames(e.__traceback__) - # To get the full stack trace, call: - # `tf.debugging.disable_traceback_filtering()` - raise e.with_traceback(filtered_tb) from None - finally: - del filtered_tb + filtered_tb = None + try: + return fn(*args, **kwargs) + except Exception as e: + filtered_tb = _process_traceback_frames(e.__traceback__) + # To get the full stack trace, call: + # `tf.debugging.disable_traceback_filtering()` + raise e.with_traceback(filtered_tb) from None + finally: + del filtered_tb - return tf.__internal__.decorator.make_decorator(fn, error_handler) + return tf.__internal__.decorator.make_decorator(fn, error_handler) def inject_argument_info_in_traceback(fn, object_name=None): - """Add information about call argument values to an error message. - - Arguments: - fn: Function to wrap. Exceptions raised by the this function will be - re-raised with additional information added to the error message, - displaying the values of the different arguments that the function - was called with. - object_name: String, display name of the class/function being called, - e.g. `'layer "layer_name" (LayerClass)'`. - - Returns: - A wrapped version of `fn`. 
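An editorial aside on the function reformatted above: `_process_traceback_frames` rebuilds a traceback by hand, chaining `types.TracebackType` objects (constructible since Python 3.7) from the innermost kept frame outward. Below is a minimal, self-contained sketch of the same idea; the names `filter_tb` and `keep_frame` are mine, not from the diff:

```python
import traceback
import types


def filter_tb(tb, keep_frame):
    """Rebuild `tb`, keeping only frames whose filename passes `keep_frame`."""
    new_tb = None
    frames = list(traceback.walk_tb(tb))
    # Iterate innermost-first so each node can point at the one built before.
    for frame, lineno in reversed(frames):
        if keep_frame(frame.f_code.co_filename):
            new_tb = types.TracebackType(new_tb, frame, frame.f_lasti, lineno)
    # Never return an empty traceback: fall back to the outermost frame.
    if new_tb is None and frames:
        frame, lineno = frames[-1]
        new_tb = types.TracebackType(new_tb, frame, frame.f_lasti, lineno)
    return new_tb


try:
    raise ValueError("boom")
except ValueError as e:
    # Drop frames from site-packages, mirroring the _EXCLUDED_PATHS idea.
    tb = filter_tb(e.__traceback__, lambda fname: "site-packages" not in fname)
    raise e.with_traceback(tb) from None
```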
- """ - def error_handler(*args, **kwargs): - signature = None - bound_signature = None - try: - return fn(*args, **kwargs) - except Exception as e: # pylint: disable=broad-except - if hasattr(e, '_keras_call_info_injected'): - # Only inject info for the innermost failing call - raise e - signature = inspect.signature(fn) - try: - # The first argument is `self`, so filter it out - bound_signature = signature.bind(*args, **kwargs) - except TypeError: - # Likely unbindable arguments - raise e - - # Add argument context - arguments_context = [] - for arg in list(signature.parameters.values()): - if arg.name in bound_signature.arguments: - value = tf.nest.map_structure( - format_argument_value, bound_signature.arguments[arg.name]) - else: - value = arg.default - arguments_context.append(f' • {arg.name}={value}') - - if arguments_context: - arguments_context = '\n'.join(arguments_context) - # Get original error message and append information to it. - if isinstance(e, tf.errors.OpError): - message = e.message - elif e.args: - # Canonically, the 1st argument in an exception is the error message. - # This works for all built-in Python exceptions. - message = e.args[0] - else: - message = '' - display_name = f'{object_name if object_name else fn.__name__}' - message = ( - f'Exception encountered when calling {display_name}.\n\n' - f'{message}\n\n' - f'Call arguments received by {display_name}:\n' - f'{arguments_context}') - - # Reraise exception, with added context - if isinstance(e, tf.errors.OpError): - new_e = e.__class__(e.node_def, e.op, message, e.error_code) - else: - try: - # For standard exceptions such as ValueError, TypeError, etc. - new_e = e.__class__(message) - except TypeError: - # For any custom error that doesn't have a standard signature. - new_e = RuntimeError(message) - new_e._keras_call_info_injected = True # pylint: disable=protected-access - else: - new_e = e - raise new_e.with_traceback(e.__traceback__) from None - finally: - del signature - del bound_signature - return tf.__internal__.decorator.make_decorator(fn, error_handler) + """Add information about call argument values to an error message. + + Arguments: + fn: Function to wrap. Exceptions raised by the this function will be + re-raised with additional information added to the error message, + displaying the values of the different arguments that the function + was called with. + object_name: String, display name of the class/function being called, + e.g. `'layer "layer_name" (LayerClass)'`. + + Returns: + A wrapped version of `fn`. + """ + + def error_handler(*args, **kwargs): + signature = None + bound_signature = None + try: + return fn(*args, **kwargs) + except Exception as e: + if hasattr(e, "_keras_call_info_injected"): + # Only inject info for the innermost failing call + raise e + signature = inspect.signature(fn) + try: + # The first argument is `self`, so filter it out + bound_signature = signature.bind(*args, **kwargs) + except TypeError: + # Likely unbindable arguments + raise e + + # Add argument context + arguments_context = [] + for arg in list(signature.parameters.values()): + if arg.name in bound_signature.arguments: + value = tf.nest.map_structure( + format_argument_value, + bound_signature.arguments[arg.name], + ) + else: + value = arg.default + arguments_context.append(f" • {arg.name}={value}") + + if arguments_context: + arguments_context = "\n".join(arguments_context) + # Get original error message and append information to it. 
+ if isinstance(e, tf.errors.OpError): + message = e.message + elif e.args: + # Canonically, the 1st argument in an exception is the error + # message. This works for all built-in Python exceptions. + message = e.args[0] + else: + message = "" + display_name = f"{object_name if object_name else fn.__name__}" + message = ( + f"Exception encountered when calling {display_name}.\n\n" + f"{message}\n\n" + f"Call arguments received by {display_name}:\n" + f"{arguments_context}" + ) + + # Reraise exception, with added context + if isinstance(e, tf.errors.OpError): + new_e = e.__class__(e.node_def, e.op, message, e.error_code) + else: + try: + # For standard exceptions such as ValueError, TypeError, + # etc. + new_e = e.__class__(message) + except TypeError: + # For any custom error that doesn't have a standard + # signature. + new_e = RuntimeError(message) + new_e._keras_call_info_injected = True + else: + new_e = e + raise new_e.with_traceback(e.__traceback__) from None + finally: + del signature + del bound_signature + + return tf.__internal__.decorator.make_decorator(fn, error_handler) def format_argument_value(value): - if isinstance(value, tf.Tensor): - # Simplified representation for eager / graph tensors - # to keep messages readable - return f'tf.Tensor(shape={value.shape}, dtype={value.dtype.name})' - return repr(value) + if isinstance(value, tf.Tensor): + # Simplified representation for eager / graph tensors + # to keep messages readable + return f"tf.Tensor(shape={value.shape}, dtype={value.dtype.name})" + return repr(value) diff --git a/keras/utils/traceback_utils_test.py b/keras/utils/traceback_utils_test.py index cb7cd449c71e..cb223f38b313 100644 --- a/keras/utils/traceback_utils_test.py +++ b/keras/utils/traceback_utils_test.py @@ -14,180 +14,176 @@ # ============================================================================== """Tests for traceback_utils.""" +import tensorflow.compat.v2 as tf + from keras import layers from keras.utils import traceback_utils -import tensorflow.compat.v2 as tf class TracebackUtilsTest(tf.test.TestCase): - - def test_info_injection_basics(self): - def error_fn(arg_1, arg_2, keyword_arg_1=None, keyword_arg_2=None): - raise ValueError('Original message') - - with self.assertRaises(ValueError) as e: - traceback_utils.inject_argument_info_in_traceback( - error_fn, 'ObjName')(1, 2, keyword_arg_1=3, keyword_arg_2=4) - self.assertIn('Original message', str(e.exception)) - self.assertIn('Exception encountered when calling ObjName', - str(e.exception)) - self.assertIn('Call arguments received', str(e.exception)) - self.assertIn('arg_1=1', str(e.exception)) - self.assertIn('arg_2=2', str(e.exception)) - self.assertIn('keyword_arg_1=3', str(e.exception)) - self.assertIn('keyword_arg_2=4', str(e.exception)) - - with self.assertRaises(ValueError) as e: - traceback_utils.inject_argument_info_in_traceback( - error_fn)(1, 2, keyword_arg_1=3, keyword_arg_2=4) - self.assertIn('Exception encountered when calling error_fn', - str(e.exception)) - - def test_info_injection_no_args(self): - def error_fn(): - raise ValueError('Original message') - - with self.assertRaises(ValueError) as e: - traceback_utils.inject_argument_info_in_traceback(error_fn)() - self.assertEqual(str(e.exception).count('Call arguments received'), 0) - - def test_info_injection_unbindable(self): - def error_fn(arg_1, keyword_arg_1=1): - return arg_1 + keyword_arg_1 - - with self.assertRaises(TypeError) as e: - traceback_utils.inject_argument_info_in_traceback(error_fn)() - self.assertIn('missing 
1 required positional argument', str(e.exception)) - - def test_info_injection_nested(self): - def inner_fn(arg_1): - raise ValueError('Original message') - - def outer_fn(arg_1): - return inner_fn(arg_1) - - with self.assertRaises(ValueError) as e: - traceback_utils.inject_argument_info_in_traceback( - outer_fn)(1) - self.assertEqual(str(e.exception).count('Call arguments received'), 1) - - def test_info_injection_tf_op_error(self): - def error_fn(arg_1, keyword_arg_1=1): - return arg_1 + keyword_arg_1 + tf.zeros((2, 3)) - - with self.assertRaises(tf.errors.InvalidArgumentError) as e: - traceback_utils.inject_argument_info_in_traceback(error_fn)( - tf.zeros((3, 3))) - self.assertIn('Incompatible shapes', str(e.exception)) - self.assertIn('Call arguments received', str(e.exception)) + def test_info_injection_basics(self): + def error_fn(arg_1, arg_2, keyword_arg_1=None, keyword_arg_2=None): + raise ValueError("Original message") + + with self.assertRaises(ValueError) as e: + traceback_utils.inject_argument_info_in_traceback( + error_fn, "ObjName" + )(1, 2, keyword_arg_1=3, keyword_arg_2=4) + self.assertIn("Original message", str(e.exception)) + self.assertIn( + "Exception encountered when calling ObjName", str(e.exception) + ) + self.assertIn("Call arguments received", str(e.exception)) + self.assertIn("arg_1=1", str(e.exception)) + self.assertIn("arg_2=2", str(e.exception)) + self.assertIn("keyword_arg_1=3", str(e.exception)) + self.assertIn("keyword_arg_2=4", str(e.exception)) + + with self.assertRaises(ValueError) as e: + traceback_utils.inject_argument_info_in_traceback(error_fn)( + 1, 2, keyword_arg_1=3, keyword_arg_2=4 + ) + self.assertIn( + "Exception encountered when calling error_fn", str(e.exception) + ) + + def test_info_injection_no_args(self): + def error_fn(): + raise ValueError("Original message") + + with self.assertRaises(ValueError) as e: + traceback_utils.inject_argument_info_in_traceback(error_fn)() + self.assertEqual(str(e.exception).count("Call arguments received"), 0) + + def test_info_injection_unbindable(self): + def error_fn(arg_1, keyword_arg_1=1): + return arg_1 + keyword_arg_1 + + with self.assertRaises(TypeError) as e: + traceback_utils.inject_argument_info_in_traceback(error_fn)() + self.assertIn( + "missing 1 required positional argument", str(e.exception) + ) + + def test_info_injection_nested(self): + def inner_fn(arg_1): + raise ValueError("Original message") + + def outer_fn(arg_1): + return inner_fn(arg_1) + + with self.assertRaises(ValueError) as e: + traceback_utils.inject_argument_info_in_traceback(outer_fn)(1) + self.assertEqual(str(e.exception).count("Call arguments received"), 1) + + def test_info_injection_tf_op_error(self): + def error_fn(arg_1, keyword_arg_1=1): + return arg_1 + keyword_arg_1 + tf.zeros((2, 3)) + + with self.assertRaises(tf.errors.InvalidArgumentError) as e: + traceback_utils.inject_argument_info_in_traceback(error_fn)( + tf.zeros((3, 3)) + ) + self.assertIn("Incompatible shapes", str(e.exception)) + self.assertIn("Call arguments received", str(e.exception)) class LayerCallInfoInjectionTest(tf.test.TestCase): - - def assert_info_injected(self, fn): - tf.debugging.enable_traceback_filtering() - try: - fn() - except Exception as e: # pylint: disable=broad-except - # Info should be injected exactly once. 
- self.assertEqual(str(e).count('Call arguments received'), 1) # pylint: disable=g-assert-in-except - - def test_custom_layer_call_nested(self): - - class InnerLayer(layers.Layer): - - def call(self, inputs, training=False, mask=None): - return inputs + tf.zeros((3, 4)) - - class OuterLayer(layers.Layer): - - def __init__(self): - super().__init__() - self.inner = InnerLayer() - - def call(self, inputs, training=True): - return self.inner(inputs) - - def fn(): - layer = OuterLayer() - layer(tf.zeros((3, 5)), training=False) - - self.assert_info_injected(fn) - - def test_custom_layer_call_eager_dense_input(self): - - class MyLayer(layers.Layer): - - def call(self, inputs, training=False, mask=None): - return inputs + tf.zeros((3, 4)) - - def fn(): - layer = MyLayer() - layer(tf.zeros((3, 5)), training=False) - - self.assert_info_injected(fn) - - def test_custom_layer_call_eager_sparse_input(self): - - class MyLayer(layers.Layer): - - def call(self, inputs, training=False, mask=None): - return inputs + tf.zeros((3, 4)) - - def fn(): - layer = MyLayer() - layer( - tf.SparseTensor(indices=[[0, 0]], values=[1], dense_shape=[3, 5]), - training=False) - - self.assert_info_injected(fn) - - def test_custom_layer_call_eager_ragged_input(self): - - class MyLayer(layers.Layer): - - def call(self, inputs, training=False, mask=None): - return inputs + tf.zeros((3, 4)) - - def fn(): - layer = MyLayer() - layer(tf.ragged.constant([[0, 0, 0], [0, 0]]), training=False) - - self.assert_info_injected(fn) - - def test_custom_layer_call_symbolic(self): - - class MyLayer(layers.Layer): - - def call(self, inputs, training=False, mask=None): - return inputs + tf.zeros((3, 4)) - - def fn(): - layer = MyLayer() - layer(layers.Input((3, 5)), training=False) - - self.assert_info_injected(fn) - - def test_custom_layer_call_unbindable(self): - - class MyLayer(layers.Layer): - - def __init__(self): - super().__init__() - self.input_spec = layers.InputSpec(shape=(3, 4)) - - def call(self, inputs, training=False, mask=None): - return inputs + tf.zeros((3, 4)) - - def fn(): - layer = MyLayer() - layer(bad=True, arguments=True) - - with self.assertRaisesRegex( - ValueError, 'The first argument to `Layer.call` must always'): - fn() - - -if __name__ == '__main__': - if tf.__internal__.tf2.enabled(): - tf.test.main() + def assert_info_injected(self, fn): + tf.debugging.enable_traceback_filtering() + try: + fn() + except Exception as e: + # Info should be injected exactly once. 
+ self.assertEqual(str(e).count("Call arguments received"), 1) + + def test_custom_layer_call_nested(self): + class InnerLayer(layers.Layer): + def call(self, inputs, training=False, mask=None): + return inputs + tf.zeros((3, 4)) + + class OuterLayer(layers.Layer): + def __init__(self): + super().__init__() + self.inner = InnerLayer() + + def call(self, inputs, training=True): + return self.inner(inputs) + + def fn(): + layer = OuterLayer() + layer(tf.zeros((3, 5)), training=False) + + self.assert_info_injected(fn) + + def test_custom_layer_call_eager_dense_input(self): + class MyLayer(layers.Layer): + def call(self, inputs, training=False, mask=None): + return inputs + tf.zeros((3, 4)) + + def fn(): + layer = MyLayer() + layer(tf.zeros((3, 5)), training=False) + + self.assert_info_injected(fn) + + def test_custom_layer_call_eager_sparse_input(self): + class MyLayer(layers.Layer): + def call(self, inputs, training=False, mask=None): + return inputs + tf.zeros((3, 4)) + + def fn(): + layer = MyLayer() + layer( + tf.SparseTensor( + indices=[[0, 0]], values=[1], dense_shape=[3, 5] + ), + training=False, + ) + + self.assert_info_injected(fn) + + def test_custom_layer_call_eager_ragged_input(self): + class MyLayer(layers.Layer): + def call(self, inputs, training=False, mask=None): + return inputs + tf.zeros((3, 4)) + + def fn(): + layer = MyLayer() + layer(tf.ragged.constant([[0, 0, 0], [0, 0]]), training=False) + + self.assert_info_injected(fn) + + def test_custom_layer_call_symbolic(self): + class MyLayer(layers.Layer): + def call(self, inputs, training=False, mask=None): + return inputs + tf.zeros((3, 4)) + + def fn(): + layer = MyLayer() + layer(layers.Input((3, 5)), training=False) + + self.assert_info_injected(fn) + + def test_custom_layer_call_unbindable(self): + class MyLayer(layers.Layer): + def __init__(self): + super().__init__() + self.input_spec = layers.InputSpec(shape=(3, 4)) + + def call(self, inputs, training=False, mask=None): + return inputs + tf.zeros((3, 4)) + + def fn(): + layer = MyLayer() + layer(bad=True, arguments=True) + + with self.assertRaisesRegex( + ValueError, "The first argument to `Layer.call` must always" + ): + fn() + + +if __name__ == "__main__": + if tf.__internal__.tf2.enabled(): + tf.test.main() diff --git a/keras/utils/version_utils.py b/keras/utils/version_utils.py index f17107877487..ba73509210b1 100644 --- a/keras/utils/version_utils.py +++ b/keras/utils/version_utils.py @@ -12,121 +12,119 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access + """Utilities for Keras classes with v1 and v2 versions.""" import tensorflow.compat.v2 as tf + from keras.utils.generic_utils import LazyLoader # TODO(b/134426265): Switch back to single-quotes once the issue # with copybara is fixed. 
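Context for the `LazyLoader` calls reformatted in the next hunk: the pattern defers a module import until first attribute access, which is how Keras sidesteps circular imports between its `engine`, `callbacks`, and `utils` modules. A rough sketch of such a loader, modeled on the pattern rather than copied from `keras.utils.generic_utils`:

```python
import importlib
import types


class LazyLoader(types.ModuleType):
    """Import the module named `name` lazily, on first attribute access."""

    def __init__(self, local_name, parent_globals, name):
        self._local_name = local_name
        self._parent_globals = parent_globals
        super().__init__(name)

    def _load(self):
        module = importlib.import_module(self.__name__)
        # Replace the lazy stub in the caller's namespace so later
        # lookups hit the real module directly.
        self._parent_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item):
        return getattr(self._load(), item)


json_mod = LazyLoader("json_mod", globals(), "json")
print(json_mod.dumps({"lazy": True}))  # the import happens here
```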
-# pylint: disable=g-inconsistent-quotes -training = LazyLoader( - "training", globals(), - "keras.engine.training") -training_v1 = LazyLoader( - "training_v1", globals(), - "keras.engine.training_v1") -base_layer = LazyLoader( - "base_layer", globals(), - "keras.engine.base_layer") -base_layer_v1 = LazyLoader( - "base_layer_v1", globals(), - "keras.engine.base_layer_v1") -callbacks = LazyLoader( - "callbacks", globals(), - "keras.callbacks") -callbacks_v1 = LazyLoader( - "callbacks_v1", globals(), - "keras.callbacks_v1") - -# pylint: enable=g-inconsistent-quotes +training = LazyLoader("training", globals(), "keras.engine.training") +training_v1 = LazyLoader("training_v1", globals(), "keras.engine.training_v1") +base_layer = LazyLoader("base_layer", globals(), "keras.engine.base_layer") +base_layer_v1 = LazyLoader( + "base_layer_v1", globals(), "keras.engine.base_layer_v1" +) +callbacks = LazyLoader("callbacks", globals(), "keras.callbacks") +callbacks_v1 = LazyLoader("callbacks_v1", globals(), "keras.callbacks_v1") class ModelVersionSelector: - """Chooses between Keras v1 and v2 Model class.""" + """Chooses between Keras v1 and v2 Model class.""" - def __new__(cls, *args, **kwargs): # pylint: disable=unused-argument - use_v2 = should_use_v2() - cls = swap_class(cls, training.Model, training_v1.Model, use_v2) # pylint: disable=self-cls-assignment - return super(ModelVersionSelector, cls).__new__(cls) + def __new__(cls, *args, **kwargs): + use_v2 = should_use_v2() + cls = swap_class(cls, training.Model, training_v1.Model, use_v2) + return super(ModelVersionSelector, cls).__new__(cls) class LayerVersionSelector: - """Chooses between Keras v1 and v2 Layer class.""" + """Chooses between Keras v1 and v2 Layer class.""" - def __new__(cls, *args, **kwargs): # pylint: disable=unused-argument - use_v2 = should_use_v2() - cls = swap_class(cls, base_layer.Layer, base_layer_v1.Layer, use_v2) # pylint: disable=self-cls-assignment - return super(LayerVersionSelector, cls).__new__(cls) + def __new__(cls, *args, **kwargs): + use_v2 = should_use_v2() + cls = swap_class(cls, base_layer.Layer, base_layer_v1.Layer, use_v2) + return super(LayerVersionSelector, cls).__new__(cls) class TensorBoardVersionSelector: - """Chooses between Keras v1 and v2 TensorBoard callback class.""" - - def __new__(cls, *args, **kwargs): # pylint: disable=unused-argument - use_v2 = should_use_v2() - start_cls = cls - cls = swap_class(start_cls, callbacks.TensorBoard, callbacks_v1.TensorBoard, - use_v2) - if start_cls == callbacks_v1.TensorBoard and cls == callbacks.TensorBoard: - # Since the v2 class is not a subclass of the v1 class, __init__ has to - # be called manually. - return cls(*args, **kwargs) - return super(TensorBoardVersionSelector, cls).__new__(cls) + """Chooses between Keras v1 and v2 TensorBoard callback class.""" + + def __new__(cls, *args, **kwargs): + use_v2 = should_use_v2() + start_cls = cls + cls = swap_class( + start_cls, callbacks.TensorBoard, callbacks_v1.TensorBoard, use_v2 + ) + if ( + start_cls == callbacks_v1.TensorBoard + and cls == callbacks.TensorBoard + ): + # Since the v2 class is not a subclass of the v1 class, __init__ has + # to be called manually. + return cls(*args, **kwargs) + return super(TensorBoardVersionSelector, cls).__new__(cls) def should_use_v2(): - """Determine if v1 or v2 version should be used.""" - if tf.executing_eagerly(): - return True - elif tf.compat.v1.executing_eagerly_outside_functions(): - # Check for a v1 `wrap_function` FuncGraph. 
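All of the `*VersionSelector` classes above share one trick: `__new__` decides which concrete class to instantiate, based on the current execution mode. A toy illustration of that dispatch under the stated assumption that TF2 eager mode is active; the `_ThingV1`/`_ThingV2` names are invented:

```python
import tensorflow.compat.v2 as tf


class _ThingV1:
    kind = "v1"


class _ThingV2:
    kind = "v2"


class ThingVersionSelector:
    # Pick the concrete class at instantiation time, the way
    # ModelVersionSelector and LayerVersionSelector do above.
    def __new__(cls, *args, **kwargs):
        chosen = _ThingV2 if tf.executing_eagerly() else _ThingV1
        return super().__new__(chosen)


thing = ThingVersionSelector()
print(type(thing).__name__, thing.kind)  # "_ThingV2 v2" under eager execution
```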
- # Code inside a `wrap_function` is treated like v1 code. - graph = tf.compat.v1.get_default_graph() - if (getattr(graph, "name", False) and - graph.name.startswith("wrapped_function")): - return False - return True - else: - return False + """Determine if v1 or v2 version should be used.""" + if tf.executing_eagerly(): + return True + elif tf.compat.v1.executing_eagerly_outside_functions(): + # Check for a v1 `wrap_function` FuncGraph. + # Code inside a `wrap_function` is treated like v1 code. + graph = tf.compat.v1.get_default_graph() + if getattr(graph, "name", False) and graph.name.startswith( + "wrapped_function" + ): + return False + return True + else: + return False def swap_class(cls, v2_cls, v1_cls, use_v2): - """Swaps in v2_cls or v1_cls depending on graph mode.""" - if cls == object: + """Swaps in v2_cls or v1_cls depending on graph mode.""" + if cls == object: + return cls + if cls in (v2_cls, v1_cls): + return v2_cls if use_v2 else v1_cls + + # Recursively search superclasses to swap in the right Keras class. + new_bases = [] + for base in cls.__bases__: + if ( + use_v2 + and issubclass(base, v1_cls) + # `v1_cls` often extends `v2_cls`, so it may still call `swap_class` + # even if it doesn't need to. That being said, it may be the safest + # not to over optimize this logic for the sake of correctness, + # especially if we swap v1 & v2 classes that don't extend each + # other, or when the inheritance order is different. + or (not use_v2 and issubclass(base, v2_cls)) + ): + new_base = swap_class(base, v2_cls, v1_cls, use_v2) + else: + new_base = base + new_bases.append(new_base) + cls.__bases__ = tuple(new_bases) return cls - if cls in (v2_cls, v1_cls): - return v2_cls if use_v2 else v1_cls - - # Recursively search superclasses to swap in the right Keras class. - new_bases = [] - for base in cls.__bases__: - if ((use_v2 and issubclass(base, v1_cls) - # `v1_cls` often extends `v2_cls`, so it may still call `swap_class` - # even if it doesn't need to. That being said, it may be the safest - # not to over optimize this logic for the sake of correctness, - # especially if we swap v1 & v2 classes that don't extend each other, - # or when the inheritance order is different. - or (not use_v2 and issubclass(base, v2_cls)))): - new_base = swap_class(base, v2_cls, v1_cls, use_v2) - else: - new_base = base - new_bases.append(new_base) - cls.__bases__ = tuple(new_bases) - return cls def disallow_legacy_graph(cls_name, method_name): - if not tf.compat.v1.executing_eagerly_outside_functions(): - error_msg = ( - f"Calling `{cls_name}.{method_name}` in graph mode is not supported " - f"when the `{cls_name}` instance was constructed with eager mode " - f"enabled. Please construct your `{cls_name}` instance in graph mode or" - f" call `{cls_name}.{method_name}` with eager mode enabled.") - raise ValueError(error_msg) + if not tf.compat.v1.executing_eagerly_outside_functions(): + error_msg = ( + f"Calling `{cls_name}.{method_name}` in graph mode is not " + f"supported when the `{cls_name}` instance was constructed with " + f"eager mode enabled. Please construct your `{cls_name}` instance " + f"in graph mode or call `{cls_name}.{method_name}` with " + "eager mode enabled." 
+ ) + raise ValueError(error_msg) def is_v1_layer_or_model(obj): - return isinstance(obj, (base_layer_v1.Layer, training_v1.Model)) + return isinstance(obj, (base_layer_v1.Layer, training_v1.Model)) diff --git a/keras/utils/version_utils_test.py b/keras/utils/version_utils_test.py index 176debee170f..6c73cda93a26 100644 --- a/keras/utils/version_utils_test.py +++ b/keras/utils/version_utils_test.py @@ -16,166 +16,160 @@ import abc +import numpy as np +import tensorflow.compat.v2 as tf + import keras from keras.engine import base_layer from keras.engine import base_layer_v1 from keras.engine import training from keras.engine import training_v1 from keras.testing_infra import test_combinations -import numpy as np -import tensorflow.compat.v2 as tf @test_combinations.run_all_keras_modes class SplitUtilsTest(test_combinations.TestCase): - - def _check_model_class(self, model_class): - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertEqual(model_class, training.Model) - else: - self.assertEqual(model_class, training_v1.Model) - - def _check_layer_class(self, layer): - if tf.compat.v1.executing_eagerly_outside_functions(): - self.assertIsInstance(layer, base_layer.Layer) - self.assertNotIsInstance(layer, base_layer_v1.Layer) - else: - self.assertIsInstance(layer, base_layer_v1.Layer) - - def test_functional_model(self): - inputs = keras.Input(10) - outputs = keras.layers.Dense(1)(inputs) - model = keras.Model(inputs, outputs) - self._check_model_class(model.__class__.__bases__[0]) - self._check_layer_class(model) - - def test_subclass_model_with_functional_init(self): - inputs = keras.Input(10) - outputs = keras.layers.Dense(1)(inputs) - - class MyModel(keras.Model): - pass - - model = MyModel(inputs, outputs) - model_class = model.__class__.__bases__[0].__bases__[0] - self._check_model_class(model_class) - self._check_layer_class(model) - - def test_subclass_model_with_functional_init_interleaved_v1_functional(self): - with tf.Graph().as_default(): - inputs = keras.Input(10) - outputs = keras.layers.Dense(1)(inputs) - _ = keras.Model(inputs, outputs) - - inputs = keras.Input(10) - outputs = keras.layers.Dense(1)(inputs) - - class MyModel(keras.Model): - pass - - model = MyModel(inputs, outputs) - model_class = model.__class__.__bases__[0].__bases__[0] - self._check_model_class(model_class) - self._check_layer_class(model) - - def test_sequential_model(self): - model = keras.Sequential([keras.layers.Dense(1)]) - model_class = model.__class__.__bases__[0].__bases__[0] - self._check_model_class(model_class) - self._check_layer_class(model) - - def test_subclass_model(self): - - class MyModel(keras.Model): - - def call(self, x): - return 2 * x - - model = MyModel() - model_class = model.__class__.__bases__[0] - self._check_model_class(model_class) - self._check_layer_class(model) - - def test_layer(self): - class IdentityLayer(base_layer.Layer): - """A layer that returns it's input. - - Useful for testing a layer without a variable. 
- """ - - def call(self, inputs): - return inputs - - layer = IdentityLayer() - self._check_layer_class(layer) - - def test_multiple_subclass_model(self): - - class Model1(keras.Model): - pass - - class Model2(Model1): - - def call(self, x): - return 2 * x - - model = Model2() - model_class = model.__class__.__bases__[0].__bases__[0] - self._check_model_class(model_class) - self._check_layer_class(model) - - def test_user_provided_metaclass(self): - - class AbstractModel(keras.Model, metaclass=abc.ABCMeta): - - @abc.abstractmethod - def call(self, inputs): - """Calls the model.""" - - class MyModel(AbstractModel): - - def call(self, inputs): - return 2 * inputs - - with self.assertRaisesRegex(TypeError, 'instantiate abstract class'): - AbstractModel() # pylint: disable=abstract-class-instantiated - - model = MyModel() - model_class = model.__class__.__bases__[0].__bases__[0] - self._check_model_class(model_class) - self._check_layer_class(model) - - def test_multiple_inheritance(self): - - class Return2: - - def return_2(self): - return 2 - - class MyModel(keras.Model, Return2): - - def call(self, x): - return self.return_2() * x - - model = MyModel() - bases = model.__class__.__bases__ - self._check_model_class(bases[0]) - self.assertEqual(bases[1], Return2) - self.assertEqual(model.return_2(), 2) - self._check_layer_class(model) - - def test_fit_error(self): - if not tf.compat.v1.executing_eagerly_outside_functions(): - # Error only appears on the v2 class. - return - - model = keras.Sequential([keras.layers.Dense(1)]) - model.compile('sgd', 'mse') - x, y = np.ones((10, 10)), np.ones((10, 1)) - with tf.compat.v1.get_default_graph().as_default(): - with self.assertRaisesRegex( - ValueError, 'instance was constructed with eager mode enabled'): - model.fit(x, y, batch_size=2) - -if __name__ == '__main__': - tf.test.main() + def _check_model_class(self, model_class): + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertEqual(model_class, training.Model) + else: + self.assertEqual(model_class, training_v1.Model) + + def _check_layer_class(self, layer): + if tf.compat.v1.executing_eagerly_outside_functions(): + self.assertIsInstance(layer, base_layer.Layer) + self.assertNotIsInstance(layer, base_layer_v1.Layer) + else: + self.assertIsInstance(layer, base_layer_v1.Layer) + + def test_functional_model(self): + inputs = keras.Input(10) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + self._check_model_class(model.__class__.__bases__[0]) + self._check_layer_class(model) + + def test_subclass_model_with_functional_init(self): + inputs = keras.Input(10) + outputs = keras.layers.Dense(1)(inputs) + + class MyModel(keras.Model): + pass + + model = MyModel(inputs, outputs) + model_class = model.__class__.__bases__[0].__bases__[0] + self._check_model_class(model_class) + self._check_layer_class(model) + + def test_subclass_model_with_functional_init_interleaved_v1_functional( + self, + ): + with tf.Graph().as_default(): + inputs = keras.Input(10) + outputs = keras.layers.Dense(1)(inputs) + _ = keras.Model(inputs, outputs) + + inputs = keras.Input(10) + outputs = keras.layers.Dense(1)(inputs) + + class MyModel(keras.Model): + pass + + model = MyModel(inputs, outputs) + model_class = model.__class__.__bases__[0].__bases__[0] + self._check_model_class(model_class) + self._check_layer_class(model) + + def test_sequential_model(self): + model = keras.Sequential([keras.layers.Dense(1)]) + model_class = model.__class__.__bases__[0].__bases__[0] + 
self._check_model_class(model_class) + self._check_layer_class(model) + + def test_subclass_model(self): + class MyModel(keras.Model): + def call(self, x): + return 2 * x + + model = MyModel() + model_class = model.__class__.__bases__[0] + self._check_model_class(model_class) + self._check_layer_class(model) + + def test_layer(self): + class IdentityLayer(base_layer.Layer): + """A layer that returns its input. + + Useful for testing a layer without a variable. + """ + + def call(self, inputs): + return inputs + + layer = IdentityLayer() + self._check_layer_class(layer) + + def test_multiple_subclass_model(self): + class Model1(keras.Model): + pass + + class Model2(Model1): + def call(self, x): + return 2 * x + + model = Model2() + model_class = model.__class__.__bases__[0].__bases__[0] + self._check_model_class(model_class) + self._check_layer_class(model) + + def test_user_provided_metaclass(self): + class AbstractModel(keras.Model, metaclass=abc.ABCMeta): + @abc.abstractmethod + def call(self, inputs): + """Calls the model.""" + + class MyModel(AbstractModel): + def call(self, inputs): + return 2 * inputs + + with self.assertRaisesRegex(TypeError, "instantiate abstract class"): + AbstractModel() + + model = MyModel() + model_class = model.__class__.__bases__[0].__bases__[0] + self._check_model_class(model_class) + self._check_layer_class(model) + + def test_multiple_inheritance(self): + class Return2: + def return_2(self): + return 2 + + class MyModel(keras.Model, Return2): + def call(self, x): + return self.return_2() * x + + model = MyModel() + bases = model.__class__.__bases__ + self._check_model_class(bases[0]) + self.assertEqual(bases[1], Return2) + self.assertEqual(model.return_2(), 2) + self._check_layer_class(model) + + def test_fit_error(self): + if not tf.compat.v1.executing_eagerly_outside_functions(): + # Error only appears on the v2 class. + return + + model = keras.Sequential([keras.layers.Dense(1)]) + model.compile("sgd", "mse") + x, y = np.ones((10, 10)), np.ones((10, 1)) + with tf.compat.v1.get_default_graph().as_default(): + with self.assertRaisesRegex( + ValueError, "instance was constructed with eager mode enabled" + ): + model.fit(x, y, batch_size=2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/keras/utils/vis_utils.py b/keras/utils/vis_utils.py index accf546f1bf6..7cb0115992b2 100644 --- a/keras/utils/vis_utils.py +++ b/keras/utils/vis_utils.py @@ -12,448 +12,486 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# pylint: disable=protected-access -# pylint: disable=g-import-not-at-top -"""Utilities related to model visualization.""" -import tensorflow.compat.v2 as tf + +"""Utilities related to model visualization.""" import os import sys -import re + +import tensorflow.compat.v2 as tf from keras.utils import io_utils -from tensorflow.python.util.tf_export import keras_export +from keras.utils import layer_utils +# isort: off +from tensorflow.python.util.tf_export import keras_export try: - # pydot-ng is a fork of pydot that is better maintained. - import pydot_ng as pydot + # pydot-ng is a fork of pydot that is better maintained. + import pydot_ng as pydot except ImportError: - # pydotplus is an improved version of pydot - try: - import pydotplus as pydot - except ImportError: - # Fall back on pydot if necessary.
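An aside on the import block being reshuffled here: it is a three-level fallback (`pydot_ng`, then `pydotplus`, then plain `pydot`). The same cascade can be expressed as a loop; a compact sketch with a hypothetical helper name:

```python
import importlib


def import_first_available(*names):
    # Try each candidate in order and return the first that imports;
    # mirrors the pydot_ng -> pydotplus -> pydot fallback in the diff.
    for name in names:
        try:
            return importlib.import_module(name)
        except ImportError:
            continue
    return None


pydot = import_first_available("pydot_ng", "pydotplus", "pydot")
print("pydot available:", pydot is not None)
```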
+ # pydotplus is an improved version of pydot try: - import pydot + import pydotplus as pydot except ImportError: - pydot = None + # Fall back on pydot if necessary. + try: + import pydot + except ImportError: + pydot = None def check_pydot(): - """Returns True if PyDot is available.""" - return pydot is not None + """Returns True if PyDot is available.""" + return pydot is not None def check_graphviz(): - """Returns True if both PyDot and Graphviz are available.""" - if not check_pydot(): - return False - try: - # Attempt to create an image of a blank graph - # to check the pydot/graphviz installation. - pydot.Dot.create(pydot.Dot()) - return True - except (OSError, pydot.InvocationException): - return False + """Returns True if both PyDot and Graphviz are available.""" + if not check_pydot(): + return False + try: + # Attempt to create an image of a blank graph + # to check the pydot/graphviz installation. + pydot.Dot.create(pydot.Dot()) + return True + except (OSError, pydot.InvocationException): + return False def is_wrapped_model(layer): - from keras.engine import functional - from keras.layers import Wrapper - return (isinstance(layer, Wrapper) and - isinstance(layer.layer, functional.Functional)) + from keras.engine import functional + from keras.layers import Wrapper + + return isinstance(layer, Wrapper) and isinstance( + layer.layer, functional.Functional + ) def add_edge(dot, src, dst): - if not dot.get_edge(src, dst): - dot.add_edge(pydot.Edge(src, dst)) - - -def get_layer_index_bound_by_layer_name(model, layer_names): - """Return specific range of layers to plot, mainly for sub-graph plot models. - - Args: - model: tf.keras.Model - layer_names: unique name of layer of the model, type(str) - - Returns: - return the index value of layer based on its unique name (layer_names) - """ - lower_index = [] - upper_index = [] - for idx, layer in enumerate(model.layers): - if re.match(layer_names[0], layer.name): - lower_index.append(idx) - if re.match(layer_names[1], layer.name): - upper_index.append(idx) - if not lower_index or not upper_index: - raise ValueError( - 'Passed layer_names does not match to layers in the model. ' - f'Recieved: {layer_names}') - if min(lower_index) > max(upper_index): - return [min(upper_index), max(lower_index)] - return [min(lower_index), max(upper_index)] - - -@keras_export('keras.utils.model_to_dot') -def model_to_dot(model, - show_shapes=False, - show_dtype=False, - show_layer_names=True, - rankdir='TB', - expand_nested=False, - dpi=96, - subgraph=False, - layer_range=None, - show_layer_activations=False): - """Convert a Keras model to dot format. - - Args: - model: A Keras model instance. - show_shapes: whether to display shape information. - show_dtype: whether to display layer dtypes. - show_layer_names: whether to display layer names. - rankdir: `rankdir` argument passed to PyDot, - a string specifying the format of the plot: - 'TB' creates a vertical plot; - 'LR' creates a horizontal plot. - expand_nested: whether to expand nested models into clusters. - dpi: Dots per inch. - subgraph: whether to return a `pydot.Cluster` instance. 
- layer_range: input of `list` containing two `str` items, which is the + if not dot.get_edge(src, dst): + dot.add_edge(pydot.Edge(src, dst)) + + +@keras_export("keras.utils.model_to_dot") +def model_to_dot( + model, + show_shapes=False, + show_dtype=False, + show_layer_names=True, + rankdir="TB", + expand_nested=False, + dpi=96, + subgraph=False, + layer_range=None, + show_layer_activations=False, + show_trainable=False, +): + """Convert a Keras model to dot format. + + Args: + model: A Keras model instance. + show_shapes: whether to display shape information. + show_dtype: whether to display layer dtypes. + show_layer_names: whether to display layer names. + rankdir: `rankdir` argument passed to PyDot, + a string specifying the format of the plot: + 'TB' creates a vertical plot; + 'LR' creates a horizontal plot. + expand_nested: whether to expand nested models into clusters. + dpi: Dots per inch. + subgraph: whether to return a `pydot.Cluster` instance. + layer_range: input of `list` containing two `str` items, which is the + starting layer name and ending layer name (both inclusive) indicating + the range of layers for which the `pydot.Dot` will be generated. It + also accepts regex patterns instead of exact name. In such case, start + predicate will be the first element it matches to `layer_range[0]` + and the end predicate will be the last element it matches to + `layer_range[1]`. By default `None` which considers all layers of + model. Note that you must pass a range such that the resultant subgraph + is complete. + show_layer_activations: Display layer activations (only for layers that + have an `activation` property). + show_trainable: whether to display if a layer is trainable. Displays 'T' + when the layer is trainable and 'NT' when it is not trainable. + + Returns: + A `pydot.Dot` instance representing the Keras model or + a `pydot.Cluster` instance representing nested model if + `subgraph=True`. + + Raises: + ValueError: if `model_to_dot` is called before the model is built. + ImportError: if pydot is not available. + """ + + if not model.built: + raise ValueError( + "This model has not yet been built. " + "Build the model first by calling `build()` or by calling " + "the model on a batch of data." + ) + + from keras.engine import functional + from keras.engine import sequential + from keras.layers import Wrapper + + if not check_pydot(): + raise ImportError( + "You must install pydot (`pip install pydot`) for " + "model_to_dot to work." + ) + + if subgraph: + dot = pydot.Cluster(style="dashed", graph_name=model.name) + dot.set("label", model.name) + dot.set("labeljust", "l") + else: + dot = pydot.Dot() + dot.set("rankdir", rankdir) + dot.set("concentrate", True) + dot.set("dpi", dpi) + dot.set_node_defaults(shape="record") + + if layer_range is not None: + if len(layer_range) != 2: + raise ValueError( + "layer_range must be of shape (2,). Received: " + f"layer_range = {layer_range} of length {len(layer_range)}" + ) + if not isinstance(layer_range[0], str) or not isinstance( + layer_range[1], str + ): + raise ValueError( + "layer_range should contain string type only. " + f"Received: {layer_range}" + ) + layer_range = layer_utils.get_layer_index_bound_by_layer_name( + model, layer_range + ) + if layer_range[0] < 0 or layer_range[1] > len(model.layers): + raise ValueError( + "Both values in layer_range should be in range (0, " + f"{len(model.layers)}). 
Received: {layer_range}" + ) + + sub_n_first_node = {} + sub_n_last_node = {} + sub_w_first_node = {} + sub_w_last_node = {} + + layers = model.layers + if not model._is_graph_network: + node = pydot.Node(str(id(model)), label=model.name) + dot.add_node(node) + return dot + elif isinstance(model, sequential.Sequential): + if not model.built: + model.build() + layers = super(sequential.Sequential, model).layers + + # Create graph nodes. + for i, layer in enumerate(layers): + if (layer_range) and (i < layer_range[0] or i >= layer_range[1]): + continue + + layer_id = str(id(layer)) + + # Append a wrapped layer's label to node's label, if it exists. + layer_name = layer.name + class_name = layer.__class__.__name__ + + if isinstance(layer, Wrapper): + if expand_nested and isinstance(layer.layer, functional.Functional): + submodel_wrapper = model_to_dot( + layer.layer, + show_shapes, + show_dtype, + show_layer_names, + rankdir, + expand_nested, + subgraph=True, + show_layer_activations=show_layer_activations, + show_trainable=show_trainable, + ) + # sub_w : submodel_wrapper + sub_w_nodes = submodel_wrapper.get_nodes() + sub_w_first_node[layer.layer.name] = sub_w_nodes[0] + sub_w_last_node[layer.layer.name] = sub_w_nodes[-1] + dot.add_subgraph(submodel_wrapper) + else: + layer_name = f"{layer_name}({layer.layer.name})" + child_class_name = layer.layer.__class__.__name__ + class_name = f"{class_name}({child_class_name})" + + if expand_nested and isinstance(layer, functional.Functional): + submodel_not_wrapper = model_to_dot( + layer, + show_shapes, + show_dtype, + show_layer_names, + rankdir, + expand_nested, + subgraph=True, + show_layer_activations=show_layer_activations, + show_trainable=show_trainable, + ) + # sub_n : submodel_not_wrapper + sub_n_nodes = submodel_not_wrapper.get_nodes() + sub_n_first_node[layer.name] = sub_n_nodes[0] + sub_n_last_node[layer.name] = sub_n_nodes[-1] + dot.add_subgraph(submodel_not_wrapper) + + # Create node's label. + label = class_name + + # Rebuild the label as a table including the layer's activation. + if ( + show_layer_activations + and hasattr(layer, "activation") + and layer.activation is not None + ): + if hasattr(layer.activation, "name"): + activation_name = layer.activation.name + elif hasattr(layer.activation, "__name__"): + activation_name = layer.activation.__name__ + else: + activation_name = str(layer.activation) + label = "{%s|%s}" % (label, activation_name) + + # Rebuild the label as a table including the layer's name. + if show_layer_names: + label = f"{layer_name}|{label}" + + # Rebuild the label as a table including the layer's dtype. + if show_dtype: + + def format_dtype(dtype): + if dtype is None: + return "?" + else: + return str(dtype) + + label = f"{label}|{format_dtype(layer.dtype)}" + + # Rebuild the label as a table including input/output shapes. + if show_shapes: + + def format_shape(shape): + return ( + str(shape) + .replace(str(None), "None") + .replace("{", r"\{") + .replace("}", r"\}") + ) + + try: + outputlabels = format_shape(layer.output_shape) + except AttributeError: + outputlabels = "?" + if hasattr(layer, "input_shape"): + inputlabels = format_shape(layer.input_shape) + elif hasattr(layer, "input_shapes"): + inputlabels = ", ".join( + [format_shape(ishape) for ishape in layer.input_shapes] + ) + else: + inputlabels = "?" 
+ label = "{%s}|{input:|output:}|{{%s}|{%s}}" % ( + label, + inputlabels, + outputlabels, + ) + + # Rebuild the label as a table including trainable status + if show_trainable: + label = f"{'T' if layer.trainable else 'NT'}|{label}" + + if not expand_nested or not isinstance(layer, functional.Functional): + node = pydot.Node(layer_id, label=label) + dot.add_node(node) + + # Connect nodes with edges. + for i, layer in enumerate(layers): + if (layer_range) and (i <= layer_range[0] or i >= layer_range[1]): + continue + layer_id = str(id(layer)) + for i, node in enumerate(layer._inbound_nodes): + node_key = layer.name + "_ib-" + str(i) + if node_key in model._network_nodes: + for inbound_layer in tf.nest.flatten(node.inbound_layers): + inbound_layer_id = str(id(inbound_layer)) + if not expand_nested: + assert dot.get_node(inbound_layer_id) + assert dot.get_node(layer_id) + add_edge(dot, inbound_layer_id, layer_id) + else: + # if inbound_layer is not Model or wrapped Model + if not isinstance( + inbound_layer, functional.Functional + ) and not is_wrapped_model(inbound_layer): + # if current layer is not Model or wrapped Model + if not isinstance( + layer, functional.Functional + ) and not is_wrapped_model(layer): + assert dot.get_node(inbound_layer_id) + assert dot.get_node(layer_id) + add_edge(dot, inbound_layer_id, layer_id) + # if current layer is Model + elif isinstance(layer, functional.Functional): + add_edge( + dot, + inbound_layer_id, + sub_n_first_node[layer.name].get_name(), + ) + # if current layer is wrapped Model + elif is_wrapped_model(layer): + add_edge(dot, inbound_layer_id, layer_id) + name = sub_w_first_node[ + layer.layer.name + ].get_name() + add_edge(dot, layer_id, name) + # if inbound_layer is Model + elif isinstance(inbound_layer, functional.Functional): + name = sub_n_last_node[ + inbound_layer.name + ].get_name() + if isinstance(layer, functional.Functional): + output_name = sub_n_first_node[ + layer.name + ].get_name() + add_edge(dot, name, output_name) + else: + add_edge(dot, name, layer_id) + # if inbound_layer is wrapped Model + elif is_wrapped_model(inbound_layer): + inbound_layer_name = inbound_layer.layer.name + add_edge( + dot, + sub_w_last_node[inbound_layer_name].get_name(), + layer_id, + ) + return dot + + +@keras_export("keras.utils.plot_model") +def plot_model( + model, + to_file="model.png", + show_shapes=False, + show_dtype=False, + show_layer_names=True, + rankdir="TB", + expand_nested=False, + dpi=96, + layer_range=None, + show_layer_activations=False, + show_trainable=False, +): + """Converts a Keras model to dot format and save to a file. + + Example: + + ```python + input = tf.keras.Input(shape=(100,), dtype='int32', name='input') + x = tf.keras.layers.Embedding( + output_dim=512, input_dim=10000, input_length=100)(input) + x = tf.keras.layers.LSTM(32)(x) + x = tf.keras.layers.Dense(64, activation='relu')(x) + x = tf.keras.layers.Dense(64, activation='relu')(x) + x = tf.keras.layers.Dense(64, activation='relu')(x) + output = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(x) + model = tf.keras.Model(inputs=[input], outputs=[output]) + dot_img_file = '/tmp/model_1.png' + tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True) + ``` + + Args: + model: A Keras model instance + to_file: File name of the plot image. + show_shapes: whether to display shape information. + show_dtype: whether to display layer dtypes. + show_layer_names: whether to display layer names. 
+ rankdir: `rankdir` argument passed to PyDot, + a string specifying the format of the plot: 'TB' creates a vertical + plot; 'LR' creates a horizontal plot. + expand_nested: Whether to expand nested models into clusters. + dpi: Dots per inch. + layer_range: input of `list` containing two `str` items, which is the starting layer name and ending layer name (both inclusive) indicating - the range of layers for which the `pydot.Dot` will be generated. It - also accepts regex patterns instead of exact name. In such case, start - predicate will be the first element it matches to `layer_range[0]` - and the end predicate will be the last element it matches to - `layer_range[1]`. By default `None` which considers all layers of - model. Note that you must pass range such that the resultant subgraph - must be complete. - show_layer_activations: Display layer activations (only for layers that + the range of layers for which the plot will be generated. It also + accepts regex patterns instead of exact name. In such case, start + predicate will be the first element it matches to `layer_range[0]` and + the end predicate will be the last element it matches to + `layer_range[1]`. By default `None` which considers all layers of model. + Note that you must pass a range such that the resultant subgraph is + complete. + show_layer_activations: Display layer activations (only for layers that have an `activation` property). + show_trainable: whether to display if a layer is trainable. Displays 'T' + when the layer is trainable and 'NT' when it is not trainable. + + Raises: + ImportError: if graphviz or pydot are not available. + ValueError: if `plot_model` is called before the model is built. + + Returns: + A Jupyter notebook Image object if Jupyter is installed. + This enables in-line display of the model plots in notebooks. + """ - Returns: - A `pydot.Dot` instance representing the Keras model or - a `pydot.Cluster` instance representing nested model if - `subgraph=True`. - - Raises: - ValueError: if `model_to_dot` is called before the model is built. - ImportError: if pydot is not available. - """ - - if not model.built: - raise ValueError('This model has not yet been built. ' - 'Build the model first by calling `build()` or by calling ' - 'the model on a batch of data.') - - from keras.layers import Wrapper - from keras.engine import sequential - from keras.engine import functional - - if not check_pydot(): - raise ImportError('You must install pydot (`pip install pydot`) for ' - 'model_to_dot to work.') - - if subgraph: - dot = pydot.Cluster(style='dashed', graph_name=model.name) - dot.set('label', model.name) - dot.set('labeljust', 'l') - else: - dot = pydot.Dot() - dot.set('rankdir', rankdir) - dot.set('concentrate', True) - dot.set('dpi', dpi) - dot.set_node_defaults(shape='record') - - if layer_range is not None: - if len(layer_range) != 2: - raise ValueError( - 'layer_range must be of shape (2,). Received: ' - f'layer_range = {layer_range} of length {len(layer_range)}') - if (not isinstance(layer_range[0], str) or - not isinstance(layer_range[1], str)): - raise ValueError( - 'layer_range should contain string type only. ' - f'Received: {layer_range}') - layer_range = get_layer_index_bound_by_layer_name(model, layer_range) - if layer_range[0] < 0 or layer_range[1] > len(model.layers): - raise ValueError('Both values in layer_range should be in range (0, ' - f'{len(model.layers)}. 
Received: {layer_range}') - - sub_n_first_node = {} - sub_n_last_node = {} - sub_w_first_node = {} - sub_w_last_node = {} - - layers = model.layers - if not model._is_graph_network: - node = pydot.Node(str(id(model)), label=model.name) - dot.add_node(node) - return dot - elif isinstance(model, sequential.Sequential): if not model.built: - model.build() - layers = super(sequential.Sequential, model).layers - - # Create graph nodes. - for i, layer in enumerate(layers): - if (layer_range) and (i < layer_range[0] or i > layer_range[1]): - continue - - layer_id = str(id(layer)) - - # Append a wrapped layer's label to node's label, if it exists. - layer_name = layer.name - class_name = layer.__class__.__name__ - - if isinstance(layer, Wrapper): - if expand_nested and isinstance(layer.layer, - functional.Functional): - submodel_wrapper = model_to_dot( - layer.layer, - show_shapes, - show_dtype, - show_layer_names, - rankdir, - expand_nested, - subgraph=True) - # sub_w : submodel_wrapper - sub_w_nodes = submodel_wrapper.get_nodes() - sub_w_first_node[layer.layer.name] = sub_w_nodes[0] - sub_w_last_node[layer.layer.name] = sub_w_nodes[-1] - dot.add_subgraph(submodel_wrapper) - else: - layer_name = '{}({})'.format(layer_name, layer.layer.name) - child_class_name = layer.layer.__class__.__name__ - class_name = '{}({})'.format(class_name, child_class_name) - - if expand_nested and isinstance(layer, functional.Functional): - submodel_not_wrapper = model_to_dot( - layer, - show_shapes, - show_dtype, - show_layer_names, - rankdir, - expand_nested, - subgraph=True) - # sub_n : submodel_not_wrapper - sub_n_nodes = submodel_not_wrapper.get_nodes() - sub_n_first_node[layer.name] = sub_n_nodes[0] - sub_n_last_node[layer.name] = sub_n_nodes[-1] - dot.add_subgraph(submodel_not_wrapper) - - # Create node's label. - label = class_name - - # Rebuild the label as a table including the layer's activation. - if (show_layer_activations and hasattr(layer, 'activation') and - layer.activation is not None): - if hasattr(layer.activation, 'name'): - activation_name = layer.activation.name - elif hasattr(layer.activation, '__name__'): - activation_name = layer.activation.__name__ - else: - activation_name = str(layer.activation) - label = '{%s|%s}' % (label, activation_name) - - # Rebuild the label as a table including the layer's name. - if show_layer_names: - label = '%s|%s' % (layer_name, label) - - # Rebuild the label as a table including the layer's dtype. - if show_dtype: - - def format_dtype(dtype): - if dtype is None: - return '?' + raise ValueError( + "This model has not yet been built. " + "Build the model first by calling `build()` or by calling " + "the model on a batch of data." + ) + + if not check_graphviz(): + message = ( + "You must install pydot (`pip install pydot`) " + "and install graphviz " + "(see instructions at https://graphviz.gitlab.io/download/) " + "for plot_model to work." + ) + if "IPython.core.magics.namespace" in sys.modules: + # We don't raise an exception here in order to avoid crashing + # notebook tests where graphviz is not available. + io_utils.print_msg(message) + return else: - return str(dtype) - - label = '%s|%s' % (label, format_dtype(layer.dtype)) - - # Rebuild the label as a table including input/output shapes. - if show_shapes: - - def format_shape(shape): - return str(shape).replace(str(None), 'None') - - try: - outputlabels = format_shape(layer.output_shape) - except AttributeError: - outputlabels = '?' 
- if hasattr(layer, 'input_shape'): - inputlabels = format_shape(layer.input_shape) - elif hasattr(layer, 'input_shapes'): - inputlabels = ', '.join( - [format_shape(ishape) for ishape in layer.input_shapes]) - else: - inputlabels = '?' - label = '{%s}|{input:|output:}|{{%s}|{%s}}' % (label, inputlabels, - outputlabels) - if not expand_nested or not isinstance( - layer, functional.Functional): - node = pydot.Node(layer_id, label=label) - dot.add_node(node) - - # Connect nodes with edges. - for i, layer in enumerate(layers): - if (layer_range) and (i <= layer_range[0] or i > layer_range[1]): - continue - layer_id = str(id(layer)) - for i, node in enumerate(layer._inbound_nodes): - node_key = layer.name + '_ib-' + str(i) - if node_key in model._network_nodes: - for inbound_layer in tf.nest.flatten(node.inbound_layers): - inbound_layer_id = str(id(inbound_layer)) - if not expand_nested: - assert dot.get_node(inbound_layer_id) - assert dot.get_node(layer_id) - add_edge(dot, inbound_layer_id, layer_id) - else: - # if inbound_layer is not Model or wrapped Model - if (not isinstance(inbound_layer, - functional.Functional) and - not is_wrapped_model(inbound_layer)): - # if current layer is not Model or wrapped Model - if (not isinstance(layer, functional.Functional) and - not is_wrapped_model(layer)): - assert dot.get_node(inbound_layer_id) - assert dot.get_node(layer_id) - add_edge(dot, inbound_layer_id, layer_id) - # if current layer is Model - elif isinstance(layer, functional.Functional): - add_edge(dot, inbound_layer_id, - sub_n_first_node[layer.name].get_name()) - # if current layer is wrapped Model - elif is_wrapped_model(layer): - add_edge(dot, inbound_layer_id, layer_id) - name = sub_w_first_node[layer.layer.name].get_name() - add_edge(dot, layer_id, name) - # if inbound_layer is Model - elif isinstance(inbound_layer, functional.Functional): - name = sub_n_last_node[inbound_layer.name].get_name() - if isinstance(layer, functional.Functional): - output_name = sub_n_first_node[layer.name].get_name() - add_edge(dot, name, output_name) - else: - add_edge(dot, name, layer_id) - # if inbound_layer is wrapped Model - elif is_wrapped_model(inbound_layer): - inbound_layer_name = inbound_layer.layer.name - add_edge(dot, - sub_w_last_node[inbound_layer_name].get_name(), - layer_id) - return dot - - -@keras_export('keras.utils.plot_model') -def plot_model(model, - to_file='model.png', - show_shapes=False, - show_dtype=False, - show_layer_names=True, - rankdir='TB', - expand_nested=False, - dpi=96, - layer_range=None, - show_layer_activations=False): - """Converts a Keras model to dot format and save to a file. - - Example: - - ```python - input = tf.keras.Input(shape=(100,), dtype='int32', name='input') - x = tf.keras.layers.Embedding( - output_dim=512, input_dim=10000, input_length=100)(input) - x = tf.keras.layers.LSTM(32)(x) - x = tf.keras.layers.Dense(64, activation='relu')(x) - x = tf.keras.layers.Dense(64, activation='relu')(x) - x = tf.keras.layers.Dense(64, activation='relu')(x) - output = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(x) - model = tf.keras.Model(inputs=[input], outputs=[output]) - dot_img_file = '/tmp/model_1.png' - tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True) - ``` - - Args: - model: A Keras model instance - to_file: File name of the plot image. - show_shapes: whether to display shape information. - show_dtype: whether to display layer dtypes. - show_layer_names: whether to display layer names. 
- rankdir: `rankdir` argument passed to PyDot, - a string specifying the format of the plot: 'TB' creates a vertical - plot; 'LR' creates a horizontal plot. - expand_nested: Whether to expand nested models into clusters. - dpi: Dots per inch. - layer_range: input of `list` containing two `str` items, which is the - starting layer name and ending layer name (both inclusive) indicating the - range of layers for which the plot will be generated. It also accepts - regex patterns instead of exact name. In such case, start predicate will - be the first element it matches to `layer_range[0]` and the end predicate - will be the last element it matches to `layer_range[1]`. By default `None` - which considers all layers of model. Note that you must pass range such - that the resultant subgraph must be complete. - show_layer_activations: Display layer activations (only for layers that - have an `activation` property). - - Raises: - ImportError: if graphviz or pydot are not available. - ValueError: if `plot_model` is called before the model is built. - - Returns: - A Jupyter notebook Image object if Jupyter is installed. - This enables in-line display of the model plots in notebooks. - """ - - if not model.built: - raise ValueError('This model has not yet been built. ' - 'Build the model first by calling `build()` or by calling ' - 'the model on a batch of data.') - - if not check_graphviz(): - message = ( - 'You must install pydot (`pip install pydot`) ' - 'and install graphviz ' - '(see instructions at https://graphviz.gitlab.io/download/) ' - 'for plot_model to work.') - if 'IPython.core.magics.namespace' in sys.modules: - # We don't raise an exception here in order to avoid crashing notebook - # tests where graphviz is not available. - io_utils.print_msg(message) - return + raise ImportError(message) + + dot = model_to_dot( + model, + show_shapes=show_shapes, + show_dtype=show_dtype, + show_layer_names=show_layer_names, + rankdir=rankdir, + expand_nested=expand_nested, + dpi=dpi, + layer_range=layer_range, + show_layer_activations=show_layer_activations, + show_trainable=show_trainable, + ) + to_file = io_utils.path_to_string(to_file) + if dot is None: + return + _, extension = os.path.splitext(to_file) + if not extension: + extension = "png" else: - raise ImportError(message) - - dot = model_to_dot( - model, - show_shapes=show_shapes, - show_dtype=show_dtype, - show_layer_names=show_layer_names, - rankdir=rankdir, - expand_nested=expand_nested, - dpi=dpi, - layer_range=layer_range, - show_layer_activations=show_layer_activations) - to_file = io_utils.path_to_string(to_file) - if dot is None: - return - _, extension = os.path.splitext(to_file) - if not extension: - extension = 'png' - else: - extension = extension[1:] - # Save image to disk. - dot.write(to_file, format=extension) - # Return the image as a Jupyter Image object, to be displayed in-line. - # Note that we cannot easily detect whether the code is running in a - # notebook, and thus we always return the Image if Jupyter is available. - if extension != 'pdf': - try: - from IPython import display - return display.Image(filename=to_file) - except ImportError: - pass + extension = extension[1:] + # Save image to disk. + dot.write(to_file, format=extension) + # Return the image as a Jupyter Image object, to be displayed in-line. + # Note that we cannot easily detect whether the code is running in a + # notebook, and thus we always return the Image if Jupyter is available. 
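+    # (PDF is skipped below: `IPython.display.Image` renders raster formats
+    # such as PNG, but not PDF.)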
+ if extension != "pdf": + try: + from IPython import display + + return display.Image(filename=to_file) + except ImportError: + pass diff --git a/keras/utils/vis_utils_test.py b/keras/utils/vis_utils_test.py index 185b83ef0e89..1665c8b0268d 100644 --- a/keras/utils/vis_utils_test.py +++ b/keras/utils/vis_utils_test.py @@ -15,228 +15,303 @@ """Tests for Keras Vis utils.""" import tensorflow.compat.v2 as tf - from absl.testing import parameterized import keras from keras.applications import efficientnet +from keras.utils import layer_utils from keras.utils import vis_utils class ModelToDotFormatTest(tf.test.TestCase, parameterized.TestCase): + def test_plot_model_cnn(self): + model = keras.Sequential() + model.add( + keras.layers.Conv2D( + filters=2, + kernel_size=(2, 3), + input_shape=(3, 5, 5), + name="conv", + ) + ) + model.add(keras.layers.Flatten(name="flat")) + model.add(keras.layers.Dense(5, name="dense")) + dot_img_file = "model_1.png" + try: + vis_utils.plot_model( + model, to_file=dot_img_file, show_shapes=True, show_dtype=True + ) + self.assertTrue(tf.io.gfile.exists(dot_img_file)) + tf.io.gfile.remove(dot_img_file) + except ImportError: + pass + + def test_plot_model_with_wrapped_layers_and_models(self): + inputs = keras.Input(shape=(None, 3)) + lstm = keras.layers.LSTM(6, return_sequences=True, name="lstm") + x = lstm(inputs) + # Add layer inside a Wrapper + bilstm = keras.layers.Bidirectional( + keras.layers.LSTM(16, return_sequences=True, name="bilstm") + ) + x = bilstm(x) + # Add model inside a Wrapper + submodel = keras.Sequential( + [keras.layers.Dense(32, name="dense", input_shape=(None, 32))] + ) + wrapped_dense = keras.layers.TimeDistributed(submodel) + x = wrapped_dense(x) + # Add shared submodel + outputs = submodel(x) + model = keras.Model(inputs, outputs) + dot_img_file = "model_2.png" + try: + vis_utils.plot_model( + model, + to_file=dot_img_file, + show_shapes=True, + show_dtype=True, + expand_nested=True, + ) + self.assertTrue(tf.io.gfile.exists(dot_img_file)) + tf.io.gfile.remove(dot_img_file) + except ImportError: + pass + + def test_plot_model_with_add_loss(self): + inputs = keras.Input(shape=(None, 3)) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + model.add_loss(tf.reduce_mean(outputs)) + dot_img_file = "model_3.png" + try: + vis_utils.plot_model( + model, + to_file=dot_img_file, + show_shapes=True, + show_dtype=True, + expand_nested=True, + ) + self.assertTrue(tf.io.gfile.exists(dot_img_file)) + tf.io.gfile.remove(dot_img_file) + except ImportError: + pass + + model = keras.Sequential( + [keras.Input(shape=(None, 3)), keras.layers.Dense(1)] + ) + model.add_loss(tf.reduce_mean(model.output)) + dot_img_file = "model_4.png" + try: + vis_utils.plot_model( + model, + to_file=dot_img_file, + show_shapes=True, + show_dtype=True, + expand_nested=True, + ) + self.assertTrue(tf.io.gfile.exists(dot_img_file)) + tf.io.gfile.remove(dot_img_file) + except ImportError: + pass + + @parameterized.parameters( + {"show_shapes": False, "show_dtype": False}, + {"show_shapes": False, "show_dtype": True}, + {"show_shapes": True, "show_dtype": False}, + {"show_shapes": True, "show_dtype": True}, + ) + def test_plot_model_cnn_with_activations(self, show_shapes, show_dtype): + model = keras.Sequential() + model.add( + keras.layers.Conv2D( + filters=2, + kernel_size=2, + input_shape=(9, 9, 3), + activation="relu", + ) + ) + model.add( + keras.layers.Conv2D( + filters=4, kernel_size=2, strides=(2, 2), activation="relu" + ) + ) + 
model.add(keras.layers.Flatten(name="flat")) + model.add(keras.layers.Dense(5, name="head", activation="softmax")) + dot_img_file = "model_5.png" + try: + vis_utils.plot_model( + model, + to_file=dot_img_file, + show_shapes=show_shapes, + show_dtype=show_dtype, + show_layer_activations=True, + ) + self.assertTrue(tf.io.gfile.exists(dot_img_file)) + tf.io.gfile.remove(dot_img_file) + except ImportError: + pass + + @parameterized.parameters( + {"layer_range": ["block1a_project_conv", "block1a_activation"]}, + {"layer_range": ["block1a_activation", "block1a_project_conv"]}, + {"layer_range": [r"block*", "block2a_se_excite"]}, + {"layer_range": [r"block\da_activation", r"block\da_project_bn"]}, + ) + def test_dot_layer_range(self, layer_range): + model = efficientnet.EfficientNetB0(weights=None) + layer_ids_from_model = get_layer_ids_from_model(model, layer_range) + try: + dot = vis_utils.model_to_dot(model, layer_range=layer_range) + dot_edges = dot.get_edges() + layer_ids_from_dot = get_layer_ids_from_dot(dot_edges) + self.assertAllEqual( + sorted(layer_ids_from_model), sorted(layer_ids_from_dot) + ) + except ImportError: + pass + + @parameterized.parameters( + {"layer_range": ["block1a_project_conv", "block1a_activation"]}, + {"layer_range": ["block1a_activation", "block1a_project_conv"]}, + {"layer_range": [r"block*", "block2a_se_excite"]}, + {"layer_range": [r"block\da_activation", r"block\da_project_bn"]}, + ) + def test_plot_layer_range(self, layer_range): + model = efficientnet.EfficientNetB0(weights=None) + effnet_subplot = "model_effnet.png" + try: + vis_utils.plot_model( + model, to_file=effnet_subplot, layer_range=layer_range + ) + self.assertTrue(tf.io.gfile.exists(effnet_subplot)) + except ImportError: + pass + finally: + if tf.io.gfile.exists(effnet_subplot): + tf.io.gfile.remove(effnet_subplot) - def test_plot_model_cnn(self): - model = keras.Sequential() - model.add( - keras.layers.Conv2D( - filters=2, kernel_size=(2, 3), input_shape=(3, 5, 5), name='conv')) - model.add(keras.layers.Flatten(name='flat')) - model.add(keras.layers.Dense(5, name='dense')) - dot_img_file = 'model_1.png' - try: - vis_utils.plot_model( - model, to_file=dot_img_file, show_shapes=True, show_dtype=True) - self.assertTrue(tf.io.gfile.exists(dot_img_file)) - tf.io.gfile.remove(dot_img_file) - except ImportError: - pass - - def test_plot_model_with_wrapped_layers_and_models(self): - inputs = keras.Input(shape=(None, 3)) - lstm = keras.layers.LSTM(6, return_sequences=True, name='lstm') - x = lstm(inputs) - # Add layer inside a Wrapper - bilstm = keras.layers.Bidirectional( - keras.layers.LSTM(16, return_sequences=True, name='bilstm')) - x = bilstm(x) - # Add model inside a Wrapper - submodel = keras.Sequential( - [keras.layers.Dense(32, name='dense', input_shape=(None, 32))] + @parameterized.parameters( + {"layer_range": ["block1a_se_squeeze", "block2a_project_conv"]}, + {"layer_range": [r"block\da_se_reshape", r"block*"]}, ) - wrapped_dense = keras.layers.TimeDistributed(submodel) - x = wrapped_dense(x) - # Add shared submodel - outputs = submodel(x) - model = keras.Model(inputs, outputs) - dot_img_file = 'model_2.png' - try: - vis_utils.plot_model( - model, - to_file=dot_img_file, - show_shapes=True, - show_dtype=True, - expand_nested=True) - self.assertTrue(tf.io.gfile.exists(dot_img_file)) - tf.io.gfile.remove(dot_img_file) - except ImportError: - pass - - def test_plot_model_with_add_loss(self): - inputs = keras.Input(shape=(None, 3)) - outputs = keras.layers.Dense(1)(inputs) - model = 
keras.Model(inputs, outputs) - model.add_loss(tf.reduce_mean(outputs)) - dot_img_file = 'model_3.png' - try: - vis_utils.plot_model( - model, - to_file=dot_img_file, - show_shapes=True, - show_dtype=True, - expand_nested=True) - self.assertTrue(tf.io.gfile.exists(dot_img_file)) - tf.io.gfile.remove(dot_img_file) - except ImportError: - pass - - model = keras.Sequential([ - keras.Input(shape=(None, 3)), keras.layers.Dense(1)]) - model.add_loss(tf.reduce_mean(model.output)) - dot_img_file = 'model_4.png' - try: - vis_utils.plot_model( - model, - to_file=dot_img_file, - show_shapes=True, - show_dtype=True, - expand_nested=True) - self.assertTrue(tf.io.gfile.exists(dot_img_file)) - tf.io.gfile.remove(dot_img_file) - except ImportError: - pass - - @parameterized.parameters({ - 'show_shapes': False, - 'show_dtype': False - }, { - 'show_shapes': False, - 'show_dtype': True - }, { - 'show_shapes': True, - 'show_dtype': False - }, { - 'show_shapes': True, - 'show_dtype': True - }) - def test_plot_model_cnn_with_activations(self, show_shapes, show_dtype): - model = keras.Sequential() - model.add( - keras.layers.Conv2D( - filters=2, kernel_size=2, input_shape=(9, 9, 3), activation='relu')) - model.add( - keras.layers.Conv2D( - filters=4, kernel_size=2, strides=(2, 2), activation='relu')) - model.add(keras.layers.Flatten(name='flat')) - model.add(keras.layers.Dense(5, name='head', activation='softmax')) - dot_img_file = 'model_5.png' - try: - vis_utils.plot_model( - model, - to_file=dot_img_file, - show_shapes=show_shapes, - show_dtype=show_dtype, - show_layer_activations=True) - self.assertTrue(tf.io.gfile.exists(dot_img_file)) - tf.io.gfile.remove(dot_img_file) - except ImportError: - pass - - @parameterized.parameters( - {'layer_range': ['block1a_project_conv', 'block1a_activation']}, - {'layer_range': ['block1a_activation', 'block1a_project_conv']}, - {'layer_range': [r'block*', 'block2a_se_excite']}, - {'layer_range': [r'block\da_activation', r'block\da_project_bn']}) - def test_dot_layer_range(self, layer_range): - model = efficientnet.EfficientNetB0(weights=None) - layer_ids_from_model = get_layer_ids_from_model(model, layer_range) - try: - dot = vis_utils.model_to_dot(model, layer_range=layer_range) - dot_edges = dot.get_edges() - layer_ids_from_dot = get_layer_ids_from_dot(dot_edges) - self.assertAllEqual( - sorted(layer_ids_from_model), sorted(layer_ids_from_dot)) - except ImportError: - pass - - @parameterized.parameters( - {'layer_range': ['block1a_project_conv', 'block1a_activation']}, - {'layer_range': ['block1a_activation', 'block1a_project_conv']}, - {'layer_range': [r'block*', 'block2a_se_excite']}, - {'layer_range': [r'block\da_activation', r'block\da_project_bn']}) - def test_plot_layer_range(self, layer_range): - model = efficientnet.EfficientNetB0(weights=None) - effnet_subplot = 'model_effnet.png' - try: - vis_utils.plot_model( - model, to_file=effnet_subplot, layer_range=layer_range) - self.assertTrue(tf.io.gfile.exists(effnet_subplot)) - except ImportError: - pass - finally: - if tf.io.gfile.exists(effnet_subplot): - tf.io.gfile.remove(effnet_subplot) - - @parameterized.parameters( - {'layer_range': ['block1a_se_squeeze', 'block2a_project_conv']}, - {'layer_range': [r'block\da_se_reshape', r'block*']}) - def test_layer_range_assertion_fail(self, layer_range): - model = efficientnet.EfficientNetB0(weights=None) - try: - with self.assertRaises(AssertionError): - vis_utils.model_to_dot(model, layer_range=layer_range) - with self.assertRaises(AssertionError): - 
vis_utils.plot_model(model, layer_range=layer_range) - except ImportError: - pass - - @parameterized.parameters( - {'layer_range': ['block1a_activation']}, - {'layer_range': []}, - {'layer_range': ['input', 'block1a_activation', 'block1a_project_conv']}, - {'layer_range': [9, 'block1a_activation']}, - {'layer_range': [29, 9]}, - {'layer_range': ['block8a_se_reshape', 'block*']}) - def test_layer_range_value_fail(self, layer_range): - model = efficientnet.EfficientNetB0(weights=None) - try: - with self.assertRaises(ValueError): - vis_utils.model_to_dot(model, layer_range=layer_range) - with self.assertRaises(ValueError): - vis_utils.plot_model(model, layer_range=layer_range) - except ImportError: - pass - - def test_model_with_tf_op(self): - # Test fix for a bug in which inputs to a TFOp layer past the 1st one - # were not connected in the Keras model plot. - a = keras.Input((2,)) - b = keras.Input((2,)) - model = keras.Model(inputs=[a, b], outputs=a + b) - try: - dot = vis_utils.model_to_dot(model) - self.assertLen(dot.get_edges(), 2) # This model has 2 edges. - except ImportError: - pass + def test_layer_range_assertion_fail(self, layer_range): + model = efficientnet.EfficientNetB0(weights=None) + try: + with self.assertRaises(AssertionError): + vis_utils.model_to_dot(model, layer_range=layer_range) + with self.assertRaises(AssertionError): + vis_utils.plot_model(model, layer_range=layer_range) + except ImportError: + pass + + @parameterized.parameters( + {"layer_range": ["block1a_activation"]}, + {"layer_range": []}, + { + "layer_range": [ + "input", + "block1a_activation", + "block1a_project_conv", + ] + }, + {"layer_range": [9, "block1a_activation"]}, + {"layer_range": [29, 9]}, + {"layer_range": ["block8a_se_reshape", "block*"]}, + ) + def test_layer_range_value_fail(self, layer_range): + model = efficientnet.EfficientNetB0(weights=None) + try: + with self.assertRaises(ValueError): + vis_utils.model_to_dot(model, layer_range=layer_range) + with self.assertRaises(ValueError): + vis_utils.plot_model(model, layer_range=layer_range) + except ImportError: + pass + + def test_model_with_tf_op(self): + # Test fix for a bug in which inputs to a TFOp layer past the 1st one + # were not connected in the Keras model plot. + a = keras.Input((2,)) + b = keras.Input((2,)) + model = keras.Model(inputs=[a, b], outputs=a + b) + try: + dot = vis_utils.model_to_dot(model) + self.assertLen(dot.get_edges(), 2) # This model has 2 edges. 
+ except ImportError: + pass + + def test_model_with_brackets_in_shape(self): + # Test fix for a bug in which plotting the model shapes fails if + # any labels contain brackets + class DictLayer(keras.layers.Layer): + def call(self, inputs) -> tf.Tensor: + tensor_input, dict_input = inputs + return tf.concat(list(dict_input.values()), axis=1) + + inputs = { + "a": keras.Input(name="a", shape=(1), dtype=tf.float32), + "b": keras.Input(name="b", shape=(1), dtype=tf.float32), + } + outputs = DictLayer()((inputs["a"], inputs)) + model = keras.Model( + inputs=inputs, + outputs=outputs, + ) + try: + vis_utils.model_to_dot( + model, show_shapes=True, show_dtype=True, show_layer_names=True + ) + except ImportError: + pass + + def test_plot_model_with_show_trainable(self): + model = keras.Sequential(name="trainable") + + untrained = keras.layers.Conv2D( + filters=2, kernel_size=(2, 3), input_shape=(3, 5, 5), name="conv" + ) + model.add(untrained) + model.add(keras.layers.Flatten(name="flat")) + model.add(keras.layers.Dense(5, name="dense")) + + # Should display as Non Trainable + untrained.trainable = False + + dot_img_file = "model_trainable.png" + try: + vis_utils.plot_model( + model, + to_file=dot_img_file, + show_shapes=True, + show_dtype=True, + show_trainable=True, + ) + self.assertTrue(tf.io.gfile.exists(dot_img_file)) + tf.io.gfile.remove(dot_img_file) + except ImportError: + pass def get_layer_ids_from_model(model, layer_range): - layer_range = vis_utils.get_layer_index_bound_by_layer_name( - model, layer_range) - layer_ids_from_model = [] - for i, layer in enumerate(model.layers): - if i >= layer_range[0] and i <= layer_range[1]: - layer_ids_from_model.append(str(id(layer))) - return layer_ids_from_model + layer_range = layer_utils.get_layer_index_bound_by_layer_name( + model, layer_range + ) + layer_ids_from_model = [ + str(id(layer)) + for layer in model.layers[layer_range[0] : layer_range[1]] + ] + return layer_ids_from_model def get_layer_ids_from_dot(dot_edges): - layer_ids_from_dot = [] - for edge in dot_edges: - for pt in edge.obj_dict['points']: - if pt not in layer_ids_from_dot: - layer_ids_from_dot.append(pt) - return layer_ids_from_dot + layer_ids_from_dot = [] + for edge in dot_edges: + for pt in edge.obj_dict["points"]: + if pt not in layer_ids_from_dot: + layer_ids_from_dot.append(pt) + return layer_ids_from_dot -if __name__ == '__main__': - tf.test.main() +if __name__ == "__main__": + tf.test.main() diff --git a/keras/wrappers/BUILD b/keras/wrappers/BUILD deleted file mode 100644 index c76c1cfcfb94..000000000000 --- a/keras/wrappers/BUILD +++ /dev/null @@ -1,40 +0,0 @@ -# Description: -# Contains the Keras wrapper API (internal TensorFlow version). 
- -load("@org_keras//keras:keras.bzl", "tf_py_test") - -package( - default_visibility = ["//keras:friends"], - licenses = ["notice"], -) - -py_library( - name = "wrappers", - srcs = [ - "__init__.py", - "scikit_learn.py", - ], - srcs_version = "PY3", - deps = [ - "//:expect_numpy_installed", - "//:expect_tensorflow_installed", - "//keras:engine", - "//keras:losses", - "//keras/utils:generic_utils", - ], -) - -tf_py_test( - name = "scikit_learn_test", - size = "small", - srcs = ["scikit_learn_test.py"], - python_version = "PY3", - tags = ["notsan"], - deps = [ - ":wrappers", - "//:expect_numpy_installed", - "//:expect_tensorflow_installed", - "//keras/testing_infra:test_utils", - "//keras/utils:np_utils", - ], -) diff --git a/keras/wrappers/scikit_learn.py b/keras/wrappers/scikit_learn.py deleted file mode 100644 index 348ccdd14ecb..000000000000 --- a/keras/wrappers/scikit_learn.py +++ /dev/null @@ -1,386 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Wrapper for using the Scikit-Learn API with Keras models.""" -# pylint: disable=g-classes-have-attributes - -import copy -import types -import warnings - -import numpy as np - -from keras import losses -from keras.models import Sequential -from keras.utils.generic_utils import has_arg -from keras.utils.np_utils import to_categorical -from tensorflow.python.util.tf_export import keras_export -from tensorflow.tools.docs import doc_controls - - -class BaseWrapper: - """Base class for the Keras scikit-learn wrapper. - - Warning: This class should not be used directly. - Use descendant classes instead. - - Args: - build_fn: callable function or class instance - **sk_params: model parameters & fitting parameters - - The `build_fn` should construct, compile and return a Keras model, which - will then be used to fit/predict. One of the following - three values could be passed to `build_fn`: - 1. A function - 2. An instance of a class that implements the `__call__` method - 3. None. This means you implement a class that inherits from either - `KerasClassifier` or `KerasRegressor`. The `__call__` method of the - present class will then be treated as the default `build_fn`. - - `sk_params` takes both model parameters and fitting parameters. Legal model - parameters are the arguments of `build_fn`. Note that like all other - estimators in scikit-learn, `build_fn` should provide default values for - its arguments, so that you could create the estimator without passing any - values to `sk_params`. - - `sk_params` could also accept parameters for calling `fit`, `predict`, - `predict_proba`, and `score` methods (e.g., `epochs`, `batch_size`). - fitting (predicting) parameters are selected in the following order: - - 1. Values passed to the dictionary arguments of - `fit`, `predict`, `predict_proba`, and `score` methods - 2. Values passed to `sk_params` - 3. 
The default values of the `keras.models.Sequential` - `fit`, `predict` methods. - - When using scikit-learn's `grid_search` API, legal tunable parameters are - those you could pass to `sk_params`, including fitting parameters. - In other words, you could use `grid_search` to search for the best - `batch_size` or `epochs` as well as the model parameters. - """ - - def __init__(self, build_fn=None, **sk_params): - self.build_fn = build_fn - self.sk_params = sk_params - self.check_params(sk_params) - - def check_params(self, params): - """Checks for user typos in `params`. - - Args: - params: dictionary; the parameters to be checked - - Raises: - ValueError: if any member of `params` is not a valid argument. - """ - legal_params_fns = [ - Sequential.fit, Sequential.predict, Sequential.evaluate - ] - if self.build_fn is None: - legal_params_fns.append(self.__call__) - elif (not isinstance(self.build_fn, types.FunctionType) and - not isinstance(self.build_fn, types.MethodType)): - legal_params_fns.append(self.build_fn.__call__) - else: - legal_params_fns.append(self.build_fn) - - for params_name in params: - for fn in legal_params_fns: - if has_arg(fn, params_name): - break - else: - if params_name != 'nb_epoch': - raise ValueError('{} is not a legal parameter'.format(params_name)) - - def get_params(self, **params): # pylint: disable=unused-argument - """Gets parameters for this estimator. - - Args: - **params: ignored (exists for API compatibility). - - Returns: - Dictionary of parameter names mapped to their values. - """ - res = self.sk_params.copy() - res.update({'build_fn': self.build_fn}) - return res - - def set_params(self, **params): - """Sets the parameters of this estimator. - - Args: - **params: Dictionary of parameter names mapped to their values. - - Returns: - self - """ - self.check_params(params) - self.sk_params.update(params) - return self - - def fit(self, x, y, **kwargs): - """Constructs a new model with `build_fn` & fit the model to `(x, y)`. - - Args: - x : array-like, shape `(n_samples, n_features)` - Training samples where `n_samples` is the number of samples - and `n_features` is the number of features. - y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)` - True labels for `x`. - **kwargs: dictionary arguments - Legal arguments are the arguments of `Sequential.fit` - - Returns: - history : object - details about the training history at each epoch. - """ - if self.build_fn is None: - self.model = self.__call__(**self.filter_sk_params(self.__call__)) - elif (not isinstance(self.build_fn, types.FunctionType) and - not isinstance(self.build_fn, types.MethodType)): - self.model = self.build_fn( - **self.filter_sk_params(self.build_fn.__call__)) - else: - self.model = self.build_fn(**self.filter_sk_params(self.build_fn)) - - if (losses.is_categorical_crossentropy(self.model.loss) and - len(y.shape) != 2): - y = to_categorical(y) - - fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit)) - fit_args.update(kwargs) - - history = self.model.fit(x, y, **fit_args) - - return history - - def filter_sk_params(self, fn, override=None): - """Filters `sk_params` and returns those in `fn`'s arguments. - - Args: - fn : arbitrary function - override: dictionary, values to override `sk_params` - - Returns: - res : dictionary containing variables - in both `sk_params` and `fn`'s arguments. 
- """ - override = override or {} - res = {} - for name, value in self.sk_params.items(): - if has_arg(fn, name): - res.update({name: value}) - res.update(override) - return res - - -@keras_export('keras.wrappers.scikit_learn.KerasClassifier') -@doc_controls.do_not_generate_docs -class KerasClassifier(BaseWrapper): - """Implementation of the scikit-learn classifier API for Keras. - - DEPRECATED. Use [Sci-Keras](https://github.com/adriangb/scikeras) instead. - See https://www.adriangb.com/scikeras/stable/migration.html - for help migrating. - """ - - def __init__(self, build_fn=None, **sk_params): - warnings.warn( - 'KerasClassifier is deprecated, ' - 'use Sci-Keras (https://github.com/adriangb/scikeras) instead. ' - 'See https://www.adriangb.com/scikeras/stable/migration.html ' - 'for help migrating.', - DeprecationWarning, - stacklevel=2) - super().__init__(build_fn, **sk_params) - - def fit(self, x, y, **kwargs): - """Constructs a new model with `build_fn` & fit the model to `(x, y)`. - - Args: - x : array-like, shape `(n_samples, n_features)` - Training samples where `n_samples` is the number of samples - and `n_features` is the number of features. - y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)` - True labels for `x`. - **kwargs: dictionary arguments - Legal arguments are the arguments of `Sequential.fit` - - Returns: - history : object - details about the training history at each epoch. - - Raises: - ValueError: In case of invalid shape for `y` argument. - """ - y = np.array(y) - if len(y.shape) == 2 and y.shape[1] > 1: - self.classes_ = np.arange(y.shape[1]) - elif (len(y.shape) == 2 and y.shape[1] == 1) or len(y.shape) == 1: - self.classes_ = np.unique(y) - y = np.searchsorted(self.classes_, y) - else: - raise ValueError('Invalid shape for y: ' + str(y.shape)) - self.n_classes_ = len(self.classes_) - return super().fit(x, y, **kwargs) - - def predict(self, x, **kwargs): - """Returns the class predictions for the given test data. - - Args: - x: array-like, shape `(n_samples, n_features)` - Test samples where `n_samples` is the number of samples - and `n_features` is the number of features. - **kwargs: dictionary arguments - Legal arguments are the arguments - of `Sequential.predict`. - - Returns: - preds: array-like, shape `(n_samples,)` - Class predictions. - """ - proba = self.model.predict(x, **kwargs) - if proba.shape[-1] > 1: - classes = proba.argmax(axis=-1) - else: - classes = (proba > 0.5).astype('int32') - return self.classes_[classes] - - def predict_proba(self, x, **kwargs): - """Returns class probability estimates for the given test data. - - Args: - x: array-like, shape `(n_samples, n_features)` - Test samples where `n_samples` is the number of samples - and `n_features` is the number of features. - **kwargs: dictionary arguments - Legal arguments are the arguments - of `Sequential.predict`. - - Returns: - proba: array-like, shape `(n_samples, n_outputs)` - Class probability estimates. - In the case of binary classification, - to match the scikit-learn API, - will return an array of shape `(n_samples, 2)` - (instead of `(n_sample, 1)` as in Keras). - """ - probs = self.model.predict(x, **kwargs) - - # check if binary classification - if probs.shape[1] == 1: - # first column is probability of class 0 and second is of class 1 - probs = np.hstack([1 - probs, probs]) - return probs - - def score(self, x, y, **kwargs): - """Returns the mean accuracy on the given test data and labels. 
- - Args: - x: array-like, shape `(n_samples, n_features)` - Test samples where `n_samples` is the number of samples - and `n_features` is the number of features. - y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)` - True labels for `x`. - **kwargs: dictionary arguments - Legal arguments are the arguments of `Sequential.evaluate`. - - Returns: - score: float - Mean accuracy of predictions on `x` wrt. `y`. - - Raises: - ValueError: If the underlying model isn't configured to - compute accuracy. You should pass `metrics=["accuracy"]` to - the `.compile()` method of the model. - """ - y = np.searchsorted(self.classes_, y) - kwargs = self.filter_sk_params(Sequential.evaluate, kwargs) - - loss_name = self.model.loss - if hasattr(loss_name, '__name__'): - loss_name = loss_name.__name__ - if loss_name == 'categorical_crossentropy' and len(y.shape) != 2: - y = to_categorical(y) - - outputs = self.model.evaluate(x, y, **kwargs) - if not isinstance(outputs, list): - outputs = [outputs] - for name, output in zip(self.model.metrics_names, outputs): - if name in ['accuracy', 'acc']: - return output - raise ValueError('The model is not configured to compute accuracy. ' - 'You should pass `metrics=["accuracy"]` to ' - 'the `model.compile()` method.') - - -@keras_export('keras.wrappers.scikit_learn.KerasRegressor') -@doc_controls.do_not_generate_docs -class KerasRegressor(BaseWrapper): - """Implementation of the scikit-learn regressor API for Keras. - - DEPRECATED. Use [Sci-Keras](https://github.com/adriangb/scikeras) instead. - See https://www.adriangb.com/scikeras/stable/migration.html - for help migrating. - """ - - @doc_controls.do_not_doc_inheritable - def __init__(self, build_fn=None, **sk_params): - warnings.warn( - 'KerasRegressor is deprecated, ' - 'use Sci-Keras (https://github.com/adriangb/scikeras) instead. ' - 'See https://www.adriangb.com/scikeras/stable/migration.html ' - 'for help migrating.', - DeprecationWarning, - stacklevel=2) - super().__init__(build_fn, **sk_params) - - def predict(self, x, **kwargs): - """Returns predictions for the given test data. - - Args: - x: array-like, shape `(n_samples, n_features)` - Test samples where `n_samples` is the number of samples - and `n_features` is the number of features. - **kwargs: dictionary arguments - Legal arguments are the arguments of `Sequential.predict`. - - Returns: - preds: array-like, shape `(n_samples,)` - Predictions. - """ - kwargs = self.filter_sk_params(Sequential.predict, kwargs) - return np.squeeze(self.model.predict(x, **kwargs)) - - def score(self, x, y, **kwargs): - """Returns the mean loss on the given test data and labels. - - Args: - x: array-like, shape `(n_samples, n_features)` - Test samples where `n_samples` is the number of samples - and `n_features` is the number of features. - y: array-like, shape `(n_samples,)` - True labels for `x`. - **kwargs: dictionary arguments - Legal arguments are the arguments of `Sequential.evaluate`. - - Returns: - score: float - Mean accuracy of predictions on `x` wrt. `y`. - """ - kwargs = self.filter_sk_params(Sequential.evaluate, kwargs) - loss = self.model.evaluate(x, y, **kwargs) - if isinstance(loss, list): - return -loss[0] - return -loss diff --git a/keras/wrappers/scikit_learn_test.py b/keras/wrappers/scikit_learn_test.py deleted file mode 100644 index d00e9df8da34..000000000000 --- a/keras/wrappers/scikit_learn_test.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Scikit-learn API wrapper.""" - -import warnings - -import tensorflow.compat.v2 as tf - -import numpy as np - -import keras -from keras.testing_infra import test_utils -from keras.wrappers import scikit_learn - -INPUT_DIM = 5 -HIDDEN_DIM = 5 -TRAIN_SAMPLES = 10 -TEST_SAMPLES = 5 -NUM_CLASSES = 2 -BATCH_SIZE = 5 -EPOCHS = 1 - - -def build_fn_clf(hidden_dim): - model = keras.models.Sequential() - model.add(keras.layers.Dense(INPUT_DIM, input_shape=(INPUT_DIM,))) - model.add(keras.layers.Activation('relu')) - model.add(keras.layers.Dense(hidden_dim)) - model.add(keras.layers.Activation('relu')) - model.add(keras.layers.Dense(NUM_CLASSES)) - model.add(keras.layers.Activation('softmax')) - model.compile( - optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy']) - return model - - -def assert_classification_works(clf): - np.random.seed(42) - (x_train, y_train), (x_test, _) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - - clf.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS) - - score = clf.score(x_train, y_train, batch_size=BATCH_SIZE) - assert np.isscalar(score) and np.isfinite(score) - - preds = clf.predict(x_test, batch_size=BATCH_SIZE) - assert preds.shape == (TEST_SAMPLES,) - for prediction in np.unique(preds): - assert prediction in range(NUM_CLASSES) - - proba = clf.predict_proba(x_test, batch_size=BATCH_SIZE) - assert proba.shape == (TEST_SAMPLES, NUM_CLASSES) - assert np.allclose(np.sum(proba, axis=1), np.ones(TEST_SAMPLES)) - - -def build_fn_reg(hidden_dim): - model = keras.models.Sequential() - model.add(keras.layers.Dense(INPUT_DIM, input_shape=(INPUT_DIM,))) - model.add(keras.layers.Activation('relu')) - model.add(keras.layers.Dense(hidden_dim)) - model.add(keras.layers.Activation('relu')) - model.add(keras.layers.Dense(1)) - model.add(keras.layers.Activation('linear')) - model.compile( - optimizer='sgd', loss='mean_absolute_error', metrics=['accuracy']) - return model - - -def assert_regression_works(reg): - np.random.seed(42) - (x_train, y_train), (x_test, _) = test_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) - - reg.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS) - - score = reg.score(x_train, y_train, batch_size=BATCH_SIZE) - assert np.isscalar(score) and np.isfinite(score) - - preds = reg.predict(x_test, batch_size=BATCH_SIZE) - assert preds.shape == (TEST_SAMPLES,) - - -class ScikitLearnAPIWrapperTest(tf.test.TestCase): - - def test_classify_build_fn(self): - with self.cached_session(): - clf = scikit_learn.KerasClassifier( - build_fn=build_fn_clf, - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - epochs=EPOCHS) - - assert_classification_works(clf) - - def test_classify_class_build_fn(self): - - class 
ClassBuildFnClf: - - def __call__(self, hidden_dim): - return build_fn_clf(hidden_dim) - - with self.cached_session(): - clf = scikit_learn.KerasClassifier( - build_fn=ClassBuildFnClf(), - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - epochs=EPOCHS) - - assert_classification_works(clf) - - def test_classify_inherit_class_build_fn(self): - - class InheritClassBuildFnClf(scikit_learn.KerasClassifier): - - def __call__(self, hidden_dim): - return build_fn_clf(hidden_dim) - - with self.cached_session(): - clf = InheritClassBuildFnClf( - build_fn=None, - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - epochs=EPOCHS) - - assert_classification_works(clf) - - def test_regression_build_fn(self): - with self.cached_session(): - reg = scikit_learn.KerasRegressor( - build_fn=build_fn_reg, - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - epochs=EPOCHS) - - assert_regression_works(reg) - - def test_regression_class_build_fn(self): - - class ClassBuildFnReg: - - def __call__(self, hidden_dim): - return build_fn_reg(hidden_dim) - - with self.cached_session(): - reg = scikit_learn.KerasRegressor( - build_fn=ClassBuildFnReg(), - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - epochs=EPOCHS) - - assert_regression_works(reg) - - def test_regression_inherit_class_build_fn(self): - - class InheritClassBuildFnReg(scikit_learn.KerasRegressor): - - def __call__(self, hidden_dim): - return build_fn_reg(hidden_dim) - - with self.cached_session(): - reg = InheritClassBuildFnReg( - build_fn=None, - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - epochs=EPOCHS) - - assert_regression_works(reg) - - def test_regressor_deprecated(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - scikit_learn.KerasRegressor(build_fn_reg) - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert 'KerasRegressor is deprecated' in str(w[-1].message) - - def test_classifier_deprecated(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - scikit_learn.KerasClassifier(build_fn_clf) - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert 'KerasClassifier is deprecated' in str(w[-1].message) - - -if __name__ == '__main__': - tf.test.main() diff --git a/oss_setup.py b/oss_setup.py new file mode 100644 index 000000000000..07db3105ccbf --- /dev/null +++ b/oss_setup.py @@ -0,0 +1,92 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Setup script for the Keras pip package.""" + +import os + +import setuptools + +DESCRIPTION = """Keras is a deep learning API written in Python, +running on top of the machine learning platform TensorFlow. + +It was developed with a focus on enabling fast experimentation and +providing a delightful developer experience. +The purpose of Keras is to give an *unfair advantage* to any developer +looking to ship ML-powered apps. 
+ +Keras is: + +- **Simple** -- but not simplistic. Keras reduces developer *cognitive load* + to free you to focus on the parts of the problem that really matter. + Keras focuses on ease of use, debugging speed, code elegance & conciseness, + maintainability, and deployability (via TFServing, TFLite, TF.js). +- **Flexible** -- Keras adopts the principle of *progressive disclosure of + complexity*: simple workflows should be quick and easy, while arbitrarily + advanced workflows should be *possible* via a clear path that builds upon + what you've already learned. +- **Powerful** -- Keras provides industry-strength performance and + scalability: it is used by organizations and companies including NASA, + YouTube, and Waymo. That's right -- your YouTube recommendations are + powered by Keras, and so is the world's most advanced driverless vehicle. +""" + +with open(os.path.abspath(__file__)) as f: + contents = f.read() + if contents.count("{PACKAGE}") > 1 or contents.count("{VERSION}") > 1: + raise ValueError( + "You must fill the 'PACKAGE' and 'VERSION' " + "tags before running setup.py. If you are trying to " + "build a fresh package, you should be using " + "`pip_build.py` instead of `setup.py`." + ) + +setuptools.setup( + name="{{PACKAGE}}", + # Version strings with `-` characters are semver compatible, + # but incompatible with pip. For pip, we will remove all `-` characters. + version="{{VERSION}}", + description="Deep learning for humans.", + long_description=DESCRIPTION, + url="https://keras.io/", + download_url="https://github.com/keras-team/keras/tags", + author="Keras team", + author_email="keras-users@googlegroups.com", + packages=setuptools.find_packages(), + install_requires=[], + # Supported Python versions + python_requires=">=3.8", + # PyPI package information. + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + ], + license="Apache 2.0", + keywords=["keras", "tensorflow", "machine learning", "deep learning"], +) diff --git a/pip_build.py b/pip_build.py new file mode 100644 index 000000000000..6c09c1ccb7a3 --- /dev/null +++ b/pip_build.py @@ -0,0 +1,518 @@ +"""Build the Keras pip package. + +The steps are as follows: + +0. Run bazel build in the Keras root directory to obtain protobuf Python files. +1. Create a temporary build directory (e.g. `/tmp/keras_build`) +2. Copy the Keras codebase to it (to `/tmp/keras_build/keras/src`) + and rewrite internal imports so that they refer to `keras.src` rather than + just `keras`. +3. Also copy `setup.py` to the build directory. +4. List and import every file in the codebase (in `/tmp/keras_build/keras/src`), + so we can inspect the symbols the codebase contains. +5. 
Use the annotations left by the `keras_export` decorator to filter the + symbols that should be exported, as well as their export path (default one + and v1 one). +6. Use this information to generate `__init__.py` files in + `/tmp/keras_build/keras/`. +7. Run the setup script to write out build artifacts to `/tmp/keras_build/dist`. +8. Copy the artifacts out. This is what should be uploaded to PyPI. + +This script borrows heavily from Namex (https://github.com/fchollet/namex). + +Notes: + +* This script should be run on the Keras codebase as obtained from GitHub + (OSS-facing), not the Google-internal one. The files are expected to be + already converted to their public form. +* This script only targets Linux x86-64. It could be adapted to MacOS + relatively easily by changing requirements.txt and the bazel build script. +* This script should be run from an environment that has all Keras dependencies + installed. Note that their specific version is not important; the only + thing that matters is that we should be able to import the Keras codebase + in its current state (so we can perform step 4). If you install the + dependencies used by the latest TF-nightly you should be good. +""" + +import argparse +import datetime +import glob +import importlib +import inspect +import os +import pathlib +import shutil +import subprocess +import sys +import tempfile + +PACKAGE_NAME = "keras" +DIST_DIRNAME = "dist" +SRC_DIRNAME = "src" +TMP_BUILD_DIRNAME = "keras_build" +TMP_TEST_DIRNAME = "keras_test" +VERBOSE = True +INIT_FILE_HEADER = """AUTOGENERATED. DO NOT EDIT.""" +# These are symbols that have export issues and that we skip for now. +SYMBOLS_TO_SKIP = ["layer_test"] + + +def copy_keras_codebase(source_dir, target_dir): + disallowed = [ + "tools", + "integration_test", + ] + + def ignore(path, names): + to_ignore = [] + for name in names: + if name.endswith("_test.py"): + to_ignore.append(name) + elif name in disallowed: + to_ignore.append(name) + return to_ignore + + shutil.copytree(source_dir, target_dir, ignore=ignore) + + +def convert_keras_imports(src_directory): + def _convert_line(line): + if "import keras.protobuf" in line or "from keras.protobuf" in line: + return line + # Imports starting from `PACKAGE_NAME`. + if line.strip() == f"import {PACKAGE_NAME}": + line = line.replace( + f"import {PACKAGE_NAME}", + f"import {PACKAGE_NAME}.{SRC_DIRNAME} as {PACKAGE_NAME}", + ) + return line + + line = line.replace( + f"import {PACKAGE_NAME}.", + f"import {PACKAGE_NAME}.{SRC_DIRNAME}.", + ) + line = line.replace( + f"from {PACKAGE_NAME}.", + f"from {PACKAGE_NAME}.{SRC_DIRNAME}.", + ) + line = line.replace( + f"from {PACKAGE_NAME} import", + f"from {PACKAGE_NAME}.{SRC_DIRNAME} import", + ) + # A way to catch LazyLoader calls. Hacky. 
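+        # e.g. `LazyLoader("foo", globals(), "keras.foo")` becomes
+        # `LazyLoader("foo", globals(), "keras.src.foo")` (names here are
+        # illustrative).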
+ line = line.replace('globals(), "keras.', 'globals(), "keras.src.') + return line + + for root, _, files in os.walk(src_directory): + for fname in files: + if fname.endswith(".py") and not fname.endswith("_pb2.py"): + fpath = os.path.join(root, fname) + if VERBOSE: + print(f"...processing {fpath}") + with open(fpath) as f: + contents = f.read() + lines = contents.split("\n") + in_string = False + new_lines = [] + for line in lines: + if line.strip().startswith('"""') or line.strip().endswith( + '"""' + ): + if line.count('"') % 2 == 1: + in_string = not in_string + else: + line = _convert_line(line) + new_lines.append(line) + + with open(fpath, "w") as f: + f.write("\n".join(new_lines) + "\n") + + +def generate_keras_api_files(package_directory, src_directory): + if VERBOSE: + print("# Compiling codebase entry points.") + + codebase_walk_entry_points = [] + for root, _, files in os.walk(src_directory): + for fname in files: + parts = root.split("/") + parts = parts[parts.index("keras") :] + base_entry_point = ".".join(parts) + if fname == "__init__.py": + codebase_walk_entry_points.append(base_entry_point) + elif fname.endswith(".py") and not fname.endswith("_test.py"): + module_name = fname[:-3] + codebase_walk_entry_points.append( + base_entry_point + "." + module_name + ) + + # Import all Python modules found in the code directory. + modules = [] + sys.path.insert(0, os.getcwd()) + for entry_point in codebase_walk_entry_points: + if VERBOSE: + print(f"Load entry point: {entry_point}") + mod = importlib.import_module(entry_point, package=".") + modules.append(mod) + + if VERBOSE: + print("# Compiling list of symbols to export.") + + # Populate list of all symbols to register. + all_symbols = set() + processed = set() + from tensorflow.python.util import tf_decorator + + for module in modules: + for name in dir(module): + if name in SYMBOLS_TO_SKIP: + continue + symbol = getattr(module, name) + + # Get the real symbol behind any TF decorator + try: + _, symbol = tf_decorator.unwrap(symbol) + except ModuleNotFoundError: + # unwrap will not work on a ModuleSpec (which can't be + # an API symbol anyway) + continue + + # Skip if already seen + if id(symbol) in processed: + continue + processed.add(id(symbol)) + + try: + if not hasattr(symbol, "_keras_api_names"): + continue + except: # noqa: E722 + if VERBOSE: + print( + f"[!] Could not inspect symbol '{name}' from {module}." + ) + continue + # If the symbol is a non-registered subclass of + # a registered symbol, skip it. + skip = False + + def has_same_metadata(a, b): + if ( + hasattr(a, "_keras_api_names") + and hasattr(b, "_keras_api_names") + and a._keras_api_names == b._keras_api_names + and a._keras_api_names_v1 == b._keras_api_names_v1 + ): + return True + return False + + try: + classes = inspect.getmro(symbol) + if len(classes) >= 2: + parents = classes[1:] + for p in parents: + if has_same_metadata(p, symbol): + skip = True + except AttributeError: + # getmro will error out on a non-class + # (in which case there can be no subclassing issues). + pass + if not skip: + all_symbols.add(symbol) + + # Generate __init__ files content. 
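+    # (`init_files_content` maps a target directory such as `keras/utils` to
+    # the list of entries its `__init__.py` should contain; see
+    # `grab_symbol_metadata` below.)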
+ if VERBOSE: + print("# Processing export path data for each symbol.") + init_files_content = grab_symbol_metadata(all_symbols, is_v1=False) + init_files_content_v1 = grab_symbol_metadata(all_symbols, is_v1=True) + + if VERBOSE: + print("# Writing out API files.") + write_out_api_files( + init_files_content, + target_dir=pathlib.Path(package_directory).parent.resolve(), + ) + v1_path = os.path.join(package_directory, "api", "_v1") + v2_path = os.path.join(package_directory, "api", "_v2") + write_out_api_files( + init_files_content, + target_dir=v2_path, + root_offset=["api", "_v2", "keras"], + ) + write_out_api_files( + init_files_content_v1, + target_dir=v1_path, + root_offset=["api", "_v1", "keras"], + ) + # Add missing __init__ files in api dirs. + with open(os.path.join(package_directory, "api", "__init__.py"), "w"): + pass + with open(os.path.join(v1_path, "__init__.py"), "w"): + pass + with open(os.path.join(v2_path, "__init__.py"), "w"): + pass + + +def grab_symbol_metadata(all_symbols, is_v1=False): + # init_files_content is a dict mapping a directory path to a list of + # symbol metadata entries to populate the __init__ file for the directory. + # Each entry is a dict with keys 'symbol' and 'export_name'. + init_files_content = {} + for symbol in all_symbols: + if VERBOSE: + print(f"...processing symbol '{symbol.__name__}'") + if is_v1: + api_names = symbol._keras_api_names_v1 + else: + api_names = symbol._keras_api_names + for export_path in api_names: + export_modules = export_path.split(".") + export_name = export_modules[-1] + parent_path = os.path.join(*export_modules[:-1]) + if parent_path not in init_files_content: + init_files_content[parent_path] = [] + init_files_content[parent_path].append( + {"symbol": symbol, "export_name": export_name} + ) + for i in range(1, len(export_modules[:-1])): + intermediate_path = os.path.join(*export_modules[:i]) + if intermediate_path not in init_files_content: + init_files_content[intermediate_path] = [] + init_files_content[intermediate_path].append( + { + "module": export_modules[i], + "location": ".".join(export_modules[:i]), + } + ) + return init_files_content + + +def write_out_api_files(init_files_content, target_dir, root_offset=None): + # Go over init_files_content, make dirs, + # create __init__.py file, populate file with public symbol imports. 
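+    # For example, a symbol exported as `keras.utils.plot_model` yields a
+    # line of the form `from keras.src.utils.vis_utils import plot_model`
+    # in `keras/utils/__init__.py`.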
+ root_offset = root_offset or [] + for path, contents in init_files_content.items(): + os.makedirs(os.path.join(target_dir, path), exist_ok=True) + init_file_lines = [] + modules_included = set() + for symbol_metadata in contents: + if "symbol" in symbol_metadata: + symbol = symbol_metadata["symbol"] + name = symbol_metadata["export_name"] + if name == symbol.__name__: + init_file_lines.append( + f"from {symbol.__module__} import {symbol.__name__}" + ) + else: + init_file_lines.append( + f"from {symbol.__module__} " + f"import {symbol.__name__} as {name}" + ) + elif "module" in symbol_metadata: + if symbol_metadata["module"] not in modules_included: + parts = path.split("/") + parts = [parts[0]] + root_offset + parts[1:] + module_location = ".".join(parts) + init_file_lines.append( + f"from {module_location} " + f"import {symbol_metadata['module']}" + ) + modules_included.add(symbol_metadata["module"]) + + init_path = os.path.join(target_dir, path, "__init__.py") + if VERBOSE: + print(f"...writing {init_path}") + init_file_lines = sorted(init_file_lines) + with open(init_path, "w") as f: + contents = ( + f'"""{INIT_FILE_HEADER}"""\n\n' + + "\n".join(init_file_lines) + + "\n" + ) + f.write(contents) + + +def build_pip_package( + keras_root_directory, + build_directory, + package_directory, + src_directory, + dist_directory, + is_nightly=False, + rc=None, +): + # Build Keras with Bazel to get the protobuf .py files + os.chdir(keras_root_directory) + os.system(f"sh {os.path.join('keras', 'tools', 'bazel_build.sh')}") + os.chdir(build_directory) + + # Copy sources (`keras/` directory and setup files) to build directory + copy_keras_codebase( + os.path.join(keras_root_directory, "keras"), src_directory + ) + shutil.copy( + os.path.join(keras_root_directory, "oss_setup.py"), + os.path.join(build_directory, "setup.py"), + ) + + # Add blank __init__.py file at package root + # to make the package directory importable. + with open(os.path.join(package_directory, "__init__.py"), "w") as f: + pass + + # Move protobuf .py files to package root. + shutil.rmtree(os.path.join(src_directory, "protobuf")) + shutil.move( + os.path.join(keras_root_directory, "bazel-bin", "keras", "protobuf"), + package_directory, + ) + # Add blank __init__.py file in protobuf dir. + with open( + os.path.join(package_directory, "protobuf", "__init__.py"), "w" + ) as f: + pass + + # Convert imports from `keras.xyz` to `keras.src.xyz`. 
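+    # (After this step, the vendored sources under `keras/src` import each
+    # other via `keras.src.*`, leaving the top-level `keras.*` namespace to
+    # the generated API files.)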
+    convert_keras_imports(src_directory)
+
+    # Generate API __init__.py files in `keras/`
+    generate_keras_api_files(package_directory, src_directory)
+
+    # Make sure to export the __version__ string
+    version = getattr(
+        importlib.import_module("keras.src", package="."), "__version__"
+    )
+    if is_nightly:
+        date = datetime.datetime.now()
+        version += f".dev{date.strftime('%Y%m%d%H')}"
+    elif rc:
+        version += rc
+    with open(os.path.join(package_directory, "__init__.py")) as f:
+        init_contents = f.read()
+    with open(os.path.join(package_directory, "__init__.py"), "w") as f:
+        f.write(init_contents + "\n\n" + f'__version__ = "{version}"\n')
+
+    # Fill in the {{PACKAGE}} and {{VERSION}} placeholders in setup.py
+    if is_nightly:
+        package = PACKAGE_NAME + "-nightly"
+    else:
+        package = PACKAGE_NAME
+    with open(os.path.join(build_directory, "setup.py")) as f:
+        setup_contents = f.read()
+    with open(os.path.join(build_directory, "setup.py"), "w") as f:
+        setup_contents = setup_contents.replace("{{VERSION}}", version)
+        setup_contents = setup_contents.replace("{{PACKAGE}}", package)
+        f.write(setup_contents)
+
+    # Build the package
+    os.system("python3 -m build")
+
+    # Save the dist files generated by the build process
+    saved_filenames = []
+    for filename in glob.glob(os.path.join(build_directory, "dist", "*.*")):
+        if VERBOSE:
+            print(f"Saving build artifact {filename}")
+        shutil.copy(filename, dist_directory)
+        saved_filenames.append(filename)
+    if VERBOSE:
+        print(f"Saved artifacts to {dist_directory}")
+    return saved_filenames, version
+
+
+def test_wheel(wheel_path, expected_version, requirements_path):
+    test_directory = os.path.join(tempfile.gettempdir(), TMP_TEST_DIRNAME)
+    os.mkdir(test_directory)
+    os.chdir(test_directory)
+    symbols_to_check = [
+        "keras.layers",
+        "keras.Input",
+        "keras.__internal__",
+        "keras.experimental",
+    ]
+    checks = ";".join(symbols_to_check)
+    script = (
+        "#!/bin/bash\n"
+        "virtualenv kenv\n"
+        f"source {os.path.join('kenv', 'bin', 'activate')}\n"
+        f"pip3 install -r {requirements_path}\n"
+        f"pip3 install {wheel_path} --force-reinstall\n"
+        f"python3 -c 'import keras;{checks};print(keras.__version__)'\n"
+        f"python3 -c 'import tensorflow as tf;tf.compat.v1.layers.Dense'\n"
+    )
+    try:
+        # Check that the version is correct
+        output = subprocess.check_output(script.encode(), shell=True)
+        output = output.decode().rstrip().split("\n")[-1].strip()
+        if output != expected_version:
+            raise ValueError(
+                "Incorrect version; expected "
+                f"{expected_version} but received {output}"
+            )
+    finally:
+        shutil.rmtree(test_directory)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--nightly",
+        action="store_true",
+        help="Whether this is for the `keras-nightly` package.",
+    )
+    parser.add_argument(
+        "--RC",
+        type=str,
+        help="Version suffix for a release candidate, e.g. `rc0`.",
+    )
+    args = parser.parse_args()
+    is_nightly = args.nightly
+    rc = args.RC
+
+    build_directory = os.path.join(tempfile.gettempdir(), TMP_BUILD_DIRNAME)
+    keras_root_directory = pathlib.Path(__file__).parent.resolve()
+    dist_directory = os.path.join(keras_root_directory, DIST_DIRNAME)
+    package_directory = os.path.join(build_directory, PACKAGE_NAME)
+    src_directory = os.path.join(build_directory, PACKAGE_NAME, SRC_DIRNAME)
+    if VERBOSE:
+        print(
+            "Using:\n"
+            f"build_directory={build_directory}\n"
+            f"keras_root_directory={keras_root_directory}\n"
+            f"dist_directory={dist_directory}\n"
+            f"package_directory={package_directory}\n"
+            f"src_directory={src_directory}\n"
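+            # (the adjacent f-strings are joined by implicit string
+            # concatenation into a single multi-line message)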
f"is_nightly={is_nightly}\n" + f"rc={rc}" + ) + if os.path.exists(build_directory): + raise ValueError(f"Directory already exists: {build_directory}") + os.mkdir(build_directory) + os.mkdir(package_directory) + if not os.path.exists(dist_directory): + os.mkdir(dist_directory) + try: + saved_filenames, version = build_pip_package( + keras_root_directory, + build_directory, + package_directory, + src_directory, + dist_directory, + is_nightly, + rc, + ) + wheel_filename = [f for f in saved_filenames if f.endswith(".whl")][0] + if VERBOSE: + print("Testing wheel artifact.") + test_wheel( + wheel_path=os.path.join(dist_directory, wheel_filename), + expected_version=version, + requirements_path=os.path.join( + keras_root_directory, "requirements.txt" + ), + ) + if VERBOSE: + print("Test successful.") + finally: + # Clean up: remove the build directory (no longer needed) + if VERBOSE: + print(f"Deleting temp build directory at {build_directory}...") + shutil.rmtree(build_directory) diff --git a/requirements.txt b/requirements.txt index d311f9368af7..412ef5fb6a63 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,18 @@ # The rest of the packages are mostly used for testing purpose. pandas pydot -scipy ~= 1.5.2 +scipy ~= 1.9.2 +# Remove once both TensorFlow and Keras nightly builds pass. +# Temporarily enforce 3.20.3 version, as the only version which is compatible +# with both new and old protobuf stubs. This is needed to resolve +# Keras-TensorFlow circular dependency issue, when one of them gets a dependency +# incompatible with another one (protobuf in this specific case). +protobuf==3.20.3 tf-nightly portpicker pyyaml Pillow -numpy ~= 1.21.4 # Sync with the numpy version used in TF -pylint +numpy ~= 1.24.3 # Sync with the numpy version used in TF black==22.3.0 +isort==5.10.1 +flake8==4.0.1 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000000..2f53d6d6975b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,11 @@ +[isort] +force_single_line=True +known_first_party=keras +line_length=80 +profile=black + +[flake8] +# imported but unused in __init__.py, that's ok. +per-file-ignores=*__init__.py:F401 +ignore=E203,W503,W605,F632,E266,E731,E712,E741 +max-line-length=80 diff --git a/shell/format.sh b/shell/format.sh new file mode 100755 index 000000000000..234634b3727f --- /dev/null +++ b/shell/format.sh @@ -0,0 +1,4 @@ +#!/bin/bash +isort --sl keras +black --line-length 80 keras +flake8 keras diff --git a/shell/lint.sh b/shell/lint.sh new file mode 100755 index 000000000000..0f06e65ca391 --- /dev/null +++ b/shell/lint.sh @@ -0,0 +1,23 @@ +#!/bin/bash +isort --check --sl -c keras +if ! [ $? -eq 0 ] +then + echo "Please run \"sh shell/format.sh\" to format the code." + exit 1 +fi +echo "no issues with isort" +flake8 keras +if ! [ $? -eq 0 ] +then + echo "Please fix the code style issue." + exit 1 +fi +echo "no issues with flake8" +black --check --line-length 80 keras +if ! [ $? -eq 0 ] +then + echo "Please run \"sh shell/format.sh\" to format the code." + exit 1 +fi +echo "no issues with black" +echo "linting success!"